Diffstat (limited to 'llvm')
-rw-r--r--llvm/include/llvm-c/Comdat.h11
-rw-r--r--llvm/include/llvm-c/Core.h16
-rw-r--r--llvm/include/llvm-c/DebugInfo.h82
-rw-r--r--llvm/include/llvm-c/DisassemblerTypes.h10
-rw-r--r--llvm/include/llvm-c/Error.h11
-rw-r--r--llvm/include/llvm-c/ErrorHandling.h10
-rw-r--r--llvm/include/llvm-c/IRReader.h11
-rw-r--r--llvm/include/llvm-c/LLJIT.h11
-rw-r--r--llvm/include/llvm-c/Linker.h11
-rw-r--r--llvm/include/llvm-c/Orc.h54
-rw-r--r--llvm/include/llvm-c/OrcEE.h11
-rw-r--r--llvm/include/llvm-c/Support.h10
-rw-r--r--llvm/include/llvm-c/TargetMachine.h10
-rw-r--r--llvm/include/llvm-c/Transforms/PassBuilder.h13
-rw-r--r--llvm/include/llvm-c/lto.h12
-rw-r--r--llvm/include/llvm/ADT/APFloat.h4
-rw-r--r--llvm/include/llvm/ADT/APInt.h883
-rw-r--r--llvm/include/llvm/ADT/APSInt.h10
-rw-r--r--llvm/include/llvm/ADT/ArrayRef.h4
-rw-r--r--llvm/include/llvm/ADT/BitVector.h24
-rw-r--r--llvm/include/llvm/ADT/CombinationGenerator.h148
-rw-r--r--llvm/include/llvm/ADT/DenseMapInfo.h16
-rw-r--r--llvm/include/llvm/ADT/EquivalenceClasses.h33
-rw-r--r--llvm/include/llvm/ADT/FunctionExtras.h16
-rw-r--r--llvm/include/llvm/ADT/Hashing.h8
-rw-r--r--llvm/include/llvm/ADT/ImmutableList.h3
-rw-r--r--llvm/include/llvm/ADT/IntervalMap.h2
-rw-r--r--llvm/include/llvm/ADT/MapVector.h1
-rw-r--r--llvm/include/llvm/ADT/PointerIntPair.h4
-rw-r--r--llvm/include/llvm/ADT/PointerUnion.h31
-rw-r--r--llvm/include/llvm/ADT/STLExtras.h171
-rw-r--r--llvm/include/llvm/ADT/Sequence.h164
-rw-r--r--llvm/include/llvm/ADT/SetOperations.h9
-rw-r--r--llvm/include/llvm/ADT/SmallBitVector.h51
-rw-r--r--llvm/include/llvm/ADT/SmallVector.h15
-rw-r--r--llvm/include/llvm/ADT/StringExtras.h137
-rw-r--r--llvm/include/llvm/ADT/StringMap.h27
-rw-r--r--llvm/include/llvm/ADT/StringRef.h3
-rw-r--r--llvm/include/llvm/ADT/Triple.h158
-rw-r--r--llvm/include/llvm/ADT/TypeSwitch.h7
-rw-r--r--llvm/include/llvm/ADT/iterator.h49
-rw-r--r--llvm/include/llvm/Analysis/AliasAnalysis.h69
-rw-r--r--llvm/include/llvm/Analysis/AssumeBundleQueries.h9
-rw-r--r--llvm/include/llvm/Analysis/AssumptionCache.h10
-rw-r--r--llvm/include/llvm/Analysis/BasicAliasAnalysis.h82
-rw-r--r--llvm/include/llvm/Analysis/CGSCCPassManager.h101
-rw-r--r--llvm/include/llvm/Analysis/CaptureTracking.h25
-rw-r--r--llvm/include/llvm/Analysis/ConstantFolding.h30
-rw-r--r--llvm/include/llvm/Analysis/CostModel.h26
-rw-r--r--llvm/include/llvm/Analysis/Delinearization.h105
-rw-r--r--llvm/include/llvm/Analysis/HeatUtils.h7
-rw-r--r--llvm/include/llvm/Analysis/IRSimilarityIdentifier.h250
-rw-r--r--llvm/include/llvm/Analysis/IVDescriptors.h88
-rw-r--r--llvm/include/llvm/Analysis/IVUsers.h3
-rw-r--r--llvm/include/llvm/Analysis/InlineAdvisor.h38
-rw-r--r--llvm/include/llvm/Analysis/InlineCost.h3
-rw-r--r--llvm/include/llvm/Analysis/InlineOrder.h172
-rw-r--r--llvm/include/llvm/Analysis/InstructionSimplify.h2
-rw-r--r--llvm/include/llvm/Analysis/LazyCallGraph.h65
-rw-r--r--llvm/include/llvm/Analysis/LoopAccessAnalysis.h26
-rw-r--r--llvm/include/llvm/Analysis/LoopAnalysisManager.h1
-rw-r--r--llvm/include/llvm/Analysis/LoopInfo.h17
-rw-r--r--llvm/include/llvm/Analysis/LoopInfoImpl.h5
-rw-r--r--llvm/include/llvm/Analysis/LoopNestAnalysis.h22
-rw-r--r--llvm/include/llvm/Analysis/MLInlineAdvisor.h2
-rw-r--r--llvm/include/llvm/Analysis/MemorySSA.h30
-rw-r--r--llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h28
-rw-r--r--llvm/include/llvm/Analysis/ObjCARCUtil.h45
-rw-r--r--llvm/include/llvm/Analysis/ProfileSummaryInfo.h12
-rw-r--r--llvm/include/llvm/Analysis/ReplayInlineAdvisor.h53
-rw-r--r--llvm/include/llvm/Analysis/ScalarEvolution.h282
-rw-r--r--llvm/include/llvm/Analysis/StackLifetime.h2
-rw-r--r--llvm/include/llvm/Analysis/StackSafetyAnalysis.h8
-rw-r--r--llvm/include/llvm/Analysis/TargetLibraryInfo.h6
-rw-r--r--llvm/include/llvm/Analysis/TargetTransformInfo.h83
-rw-r--r--llvm/include/llvm/Analysis/TargetTransformInfoImpl.h131
-rw-r--r--llvm/include/llvm/Analysis/TypeMetadataUtils.h28
-rw-r--r--llvm/include/llvm/Analysis/Utils/TFUtils.h7
-rw-r--r--llvm/include/llvm/Analysis/ValueTracking.h36
-rw-r--r--llvm/include/llvm/Analysis/VectorUtils.h12
-rw-r--r--llvm/include/llvm/AsmParser/LLLexer.h4
-rw-r--r--llvm/include/llvm/AsmParser/LLParser.h27
-rw-r--r--llvm/include/llvm/AsmParser/LLToken.h8
-rw-r--r--llvm/include/llvm/BinaryFormat/Dwarf.def3
-rw-r--r--llvm/include/llvm/BinaryFormat/DynamicTags.def12
-rw-r--r--llvm/include/llvm/BinaryFormat/ELF.h23
-rw-r--r--llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV.def4
-rw-r--r--llvm/include/llvm/BinaryFormat/MachO.def2
-rw-r--r--llvm/include/llvm/BinaryFormat/Wasm.h41
-rw-r--r--llvm/include/llvm/BinaryFormat/WasmTraits.h18
-rw-r--r--llvm/include/llvm/BinaryFormat/XCOFF.h16
-rw-r--r--llvm/include/llvm/Bitcode/BitcodeAnalyzer.h2
-rw-r--r--llvm/include/llvm/Bitcode/BitcodeCommon.h8
-rw-r--r--llvm/include/llvm/Bitcode/LLVMBitCodes.h1
-rw-r--r--llvm/include/llvm/CodeGen/Analysis.h5
-rw-r--r--llvm/include/llvm/CodeGen/AsmPrinter.h7
-rw-r--r--llvm/include/llvm/CodeGen/BasicTTIImpl.h264
-rw-r--r--llvm/include/llvm/CodeGen/CodeGenCommonISel.h219
-rw-r--r--llvm/include/llvm/CodeGen/CommandFlags.h7
-rw-r--r--llvm/include/llvm/CodeGen/FunctionLoweringInfo.h1
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h14
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h131
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h37
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h38
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/LegacyLegalizerInfo.h2
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h184
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h17
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h29
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h165
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h49
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h8
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/Utils.h100
-rw-r--r--llvm/include/llvm/CodeGen/ISDOpcodes.h11
-rw-r--r--llvm/include/llvm/CodeGen/IndirectThunks.h2
-rw-r--r--llvm/include/llvm/CodeGen/LinkAllAsmWriterComponents.h3
-rw-r--r--llvm/include/llvm/CodeGen/LinkAllCodegenComponents.h3
-rw-r--r--llvm/include/llvm/CodeGen/LiveInterval.h10
-rw-r--r--llvm/include/llvm/CodeGen/LiveIntervalUnion.h29
-rw-r--r--llvm/include/llvm/CodeGen/LiveVariables.h6
-rw-r--r--llvm/include/llvm/CodeGen/LowLevelType.h3
-rw-r--r--llvm/include/llvm/CodeGen/MIRFSDiscriminator.h4
-rw-r--r--llvm/include/llvm/CodeGen/MIRFormatter.h7
-rw-r--r--llvm/include/llvm/CodeGen/MIRSampleProfile.h76
-rw-r--r--llvm/include/llvm/CodeGen/MIRYamlMapping.h2
-rw-r--r--llvm/include/llvm/CodeGen/MachineCombinerPattern.h13
-rw-r--r--llvm/include/llvm/CodeGen/MachineDominators.h16
-rw-r--r--llvm/include/llvm/CodeGen/MachineFrameInfo.h2
-rw-r--r--llvm/include/llvm/CodeGen/MachineFunction.h16
-rw-r--r--llvm/include/llvm/CodeGen/MachineInstr.h14
-rw-r--r--llvm/include/llvm/CodeGen/MachineMemOperand.h12
-rw-r--r--llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h6
-rw-r--r--llvm/include/llvm/CodeGen/MachineRegisterInfo.h41
-rw-r--r--llvm/include/llvm/CodeGen/MacroFusion.h14
-rw-r--r--llvm/include/llvm/CodeGen/Passes.h12
-rw-r--r--llvm/include/llvm/CodeGen/RegAllocCommon.h7
-rw-r--r--llvm/include/llvm/CodeGen/RegisterScavenging.h3
-rw-r--r--llvm/include/llvm/CodeGen/SelectionDAG.h91
-rw-r--r--llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h1
-rw-r--r--llvm/include/llvm/CodeGen/SelectionDAGNodes.h238
-rw-r--r--llvm/include/llvm/CodeGen/SwitchLoweringUtils.h8
-rw-r--r--llvm/include/llvm/CodeGen/TargetCallingConv.h8
-rw-r--r--llvm/include/llvm/CodeGen/TargetInstrInfo.h33
-rw-r--r--llvm/include/llvm/CodeGen/TargetLowering.h84
-rw-r--r--llvm/include/llvm/CodeGen/TargetPassConfig.h14
-rw-r--r--llvm/include/llvm/CodeGen/TargetRegisterInfo.h6
-rw-r--r--llvm/include/llvm/CodeGen/TargetSchedule.h1
-rw-r--r--llvm/include/llvm/CodeGen/ValueTypes.td3
-rw-r--r--llvm/include/llvm/DWARFLinker/DWARFLinker.h24
-rw-r--r--llvm/include/llvm/DebugInfo/CodeView/CVRecord.h1
-rw-r--r--llvm/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def2
-rw-r--r--llvm/include/llvm/DebugInfo/CodeView/TypeIndex.h1
-rw-r--r--llvm/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h21
-rw-r--r--llvm/include/llvm/DebugInfo/DWARF/DWARFAddressRange.h6
-rw-r--r--llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h28
-rw-r--r--llvm/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h32
-rw-r--r--llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h4
-rw-r--r--llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h7
-rw-r--r--llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h14
-rw-r--r--llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h40
-rw-r--r--llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h17
-rw-r--r--llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h30
-rw-r--r--llvm/include/llvm/DebugInfo/GSYM/StringTable.h1
-rw-r--r--llvm/include/llvm/DebugInfo/MSF/MSFCommon.h3
-rw-r--r--llvm/include/llvm/DebugInfo/MSF/MappedBlockStream.h20
-rw-r--r--llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleList.h4
-rw-r--r--llvm/include/llvm/DebugInfo/PDB/Native/HashTable.h5
-rw-r--r--llvm/include/llvm/DebugInfo/PDB/Native/NamedStreamMap.h1
-rw-r--r--llvm/include/llvm/DebugInfo/PDB/Native/NativeLineNumber.h1
-rw-r--r--llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeFunctionSig.h1
-rw-r--r--llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeVTShape.h1
-rw-r--r--llvm/include/llvm/DebugInfo/PDB/Native/PDBFileBuilder.h1
-rw-r--r--llvm/include/llvm/Demangle/Demangle.h14
-rw-r--r--llvm/include/llvm/Demangle/ItaniumDemangle.h1315
-rw-r--r--llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h93
-rw-r--r--llvm/include/llvm/Demangle/Utility.h51
-rw-r--r--llvm/include/llvm/ExecutionEngine/ExecutionEngine.h1
-rw-r--r--llvm/include/llvm/ExecutionEngine/JITLink/ELF_aarch64.h39
-rw-r--r--llvm/include/llvm/ExecutionEngine/JITLink/ELF_riscv.h2
-rw-r--r--llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h20
-rw-r--r--llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h98
-rw-r--r--llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h420
-rw-r--r--llvm/include/llvm/ExecutionEngine/JITLink/MachO_arm64.h2
-rw-r--r--llvm/include/llvm/ExecutionEngine/JITLink/MemoryFlags.h225
-rw-r--r--llvm/include/llvm/ExecutionEngine/JITLink/TableManager.h63
-rw-r--r--llvm/include/llvm/ExecutionEngine/JITLink/aarch64.h38
-rw-r--r--llvm/include/llvm/ExecutionEngine/JITLink/riscv.h14
-rw-r--r--llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h275
-rw-r--r--llvm/include/llvm/ExecutionEngine/MCJIT.h3
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/Core.h100
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/DebuggerSupportPlugin.h64
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h330
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h9
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/EPCEHFrameRegistrar.h9
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/EPCGenericDylibManager.h67
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.h97
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/EPCGenericMemoryAccess.h85
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h133
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/EPCIndirectionUtils.h6
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h272
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h34
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/LLVMSPSSerializers.h69
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/LookupAndRecordAddrs.h70
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h88
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h6
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/OrcRPCExecutorProcessControl.h436
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h925
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h386
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h464
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h138
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/Shared/FDRawByteChannel.h79
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h68
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/Shared/RPCUtils.h1659
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/Shared/RawByteChannel.h183
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/Shared/Serialization.h769
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h124
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.h235
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h286
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h124
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h140
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorBootstrapService.h36
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h2
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/OrcRPCTPCServer.h660
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h20
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.h64
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h70
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.h182
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/TaskDispatch.h131
-rw-r--r--llvm/include/llvm/ExecutionEngine/OrcMCJITReplacement.h37
-rw-r--r--llvm/include/llvm/ExecutionEngine/RuntimeDyld.h14
-rw-r--r--llvm/include/llvm/Frontend/OpenMP/OMP.td69
-rw-r--r--llvm/include/llvm/Frontend/OpenMP/OMPConstants.h8
-rw-r--r--llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h117
-rw-r--r--llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h430
-rw-r--r--llvm/include/llvm/Frontend/OpenMP/OMPKinds.def63
-rw-r--r--llvm/include/llvm/IR/AbstractCallSite.h2
-rw-r--r--llvm/include/llvm/IR/Argument.h2
-rw-r--r--llvm/include/llvm/IR/Assumptions.h22
-rw-r--r--llvm/include/llvm/IR/Attributes.h371
-rw-r--r--llvm/include/llvm/IR/Attributes.td3
-rw-r--r--llvm/include/llvm/IR/BasicBlock.h12
-rw-r--r--llvm/include/llvm/IR/Constant.h6
-rw-r--r--llvm/include/llvm/IR/ConstantRange.h40
-rw-r--r--llvm/include/llvm/IR/Constants.h15
-rw-r--r--llvm/include/llvm/IR/DIBuilder.h80
-rw-r--r--llvm/include/llvm/IR/DataLayout.h23
-rw-r--r--llvm/include/llvm/IR/DebugInfo.h2
-rw-r--r--llvm/include/llvm/IR/DebugInfoMetadata.h437
-rw-r--r--llvm/include/llvm/IR/DerivedTypes.h7
-rw-r--r--llvm/include/llvm/IR/DiagnosticInfo.h36
-rw-r--r--llvm/include/llvm/IR/DiagnosticPrinter.h2
-rw-r--r--llvm/include/llvm/IR/Dominators.h6
-rw-r--r--llvm/include/llvm/IR/FPEnv.h14
-rw-r--r--llvm/include/llvm/IR/Function.h252
-rw-r--r--llvm/include/llvm/IR/GCStrategy.h3
-rw-r--r--llvm/include/llvm/IR/GlobalAlias.h33
-rw-r--r--llvm/include/llvm/IR/GlobalIFunc.h44
-rw-r--r--llvm/include/llvm/IR/GlobalIndirectSymbol.h93
-rw-r--r--llvm/include/llvm/IR/GlobalObject.h7
-rw-r--r--llvm/include/llvm/IR/GlobalValue.h14
-rw-r--r--llvm/include/llvm/IR/IRBuilder.h32
-rw-r--r--llvm/include/llvm/IR/InstrTypes.h292
-rw-r--r--llvm/include/llvm/IR/Instruction.h18
-rw-r--r--llvm/include/llvm/IR/Instructions.h105
-rw-r--r--llvm/include/llvm/IR/IntrinsicInst.h22
-rw-r--r--llvm/include/llvm/IR/Intrinsics.h3
-rw-r--r--llvm/include/llvm/IR/Intrinsics.td158
-rw-r--r--llvm/include/llvm/IR/IntrinsicsAArch64.td85
-rw-r--r--llvm/include/llvm/IR/IntrinsicsAMDGPU.td52
-rw-r--r--llvm/include/llvm/IR/IntrinsicsBPF.td3
-rw-r--r--llvm/include/llvm/IR/IntrinsicsNVVM.td778
-rw-r--r--llvm/include/llvm/IR/IntrinsicsPowerPC.td60
-rw-r--r--llvm/include/llvm/IR/IntrinsicsRISCV.td177
-rw-r--r--llvm/include/llvm/IR/IntrinsicsSystemZ.td8
-rw-r--r--llvm/include/llvm/IR/IntrinsicsWebAssembly.td70
-rw-r--r--llvm/include/llvm/IR/IntrinsicsX86.td762
-rw-r--r--llvm/include/llvm/IR/LLVMContext.h4
-rw-r--r--llvm/include/llvm/IR/MatrixBuilder.h32
-rw-r--r--llvm/include/llvm/IR/Metadata.h35
-rw-r--r--llvm/include/llvm/IR/Module.h9
-rw-r--r--llvm/include/llvm/IR/ModuleSummaryIndex.h50
-rw-r--r--llvm/include/llvm/IR/Operator.h7
-rw-r--r--llvm/include/llvm/IR/OptBisect.h26
-rw-r--r--llvm/include/llvm/IR/PassManager.h81
-rw-r--r--llvm/include/llvm/IR/PassManagerInternal.h9
-rw-r--r--llvm/include/llvm/IR/PatternMatch.h131
-rw-r--r--llvm/include/llvm/IR/ProfileSummary.h38
-rw-r--r--llvm/include/llvm/IR/PseudoProbe.h4
-rw-r--r--llvm/include/llvm/IR/ReplaceConstant.h4
-rw-r--r--llvm/include/llvm/IR/RuntimeLibcalls.def4
-rw-r--r--llvm/include/llvm/IR/Type.h38
-rw-r--r--llvm/include/llvm/IR/VPIntrinsics.def141
-rw-r--r--llvm/include/llvm/IR/Value.h35
-rw-r--r--llvm/include/llvm/InitializePasses.h5
-rw-r--r--llvm/include/llvm/InterfaceStub/IFSHandler.h3
-rw-r--r--llvm/include/llvm/LTO/Caching.h38
-rw-r--r--llvm/include/llvm/LTO/Config.h3
-rw-r--r--llvm/include/llvm/LTO/LTO.h47
-rw-r--r--llvm/include/llvm/LTO/SummaryBasedOptimizations.h2
-rw-r--r--llvm/include/llvm/LTO/legacy/LTOCodeGenerator.h2
-rw-r--r--llvm/include/llvm/LTO/legacy/LTOModule.h4
-rw-r--r--llvm/include/llvm/LinkAllIR.h3
-rw-r--r--llvm/include/llvm/LinkAllPasses.h3
-rw-r--r--llvm/include/llvm/MC/MCAsmBackend.h10
-rw-r--r--llvm/include/llvm/MC/MCAsmInfoGOFF.h29
-rw-r--r--llvm/include/llvm/MC/MCContext.h2
-rw-r--r--llvm/include/llvm/MC/MCDwarf.h38
-rw-r--r--llvm/include/llvm/MC/MCELFObjectWriter.h2
-rw-r--r--llvm/include/llvm/MC/MCELFStreamer.h2
-rw-r--r--llvm/include/llvm/MC/MCExpr.h2
-rw-r--r--llvm/include/llvm/MC/MCFragment.h31
-rw-r--r--llvm/include/llvm/MC/MCInstrAnalysis.h11
-rw-r--r--llvm/include/llvm/MC/MCInstrDesc.h4
-rw-r--r--llvm/include/llvm/MC/MCObjectFileInfo.h4
-rw-r--r--llvm/include/llvm/MC/MCObjectStreamer.h6
-rw-r--r--llvm/include/llvm/MC/MCPseudoProbe.h292
-rw-r--r--llvm/include/llvm/MC/MCRegister.h1
-rw-r--r--llvm/include/llvm/MC/MCSchedule.h1
-rw-r--r--llvm/include/llvm/MC/MCStreamer.h10
-rw-r--r--llvm/include/llvm/MC/MCSymbolWasm.h12
-rw-r--r--llvm/include/llvm/MC/MCWasmStreamer.h5
-rw-r--r--llvm/include/llvm/MC/MCWinCOFFStreamer.h2
-rw-r--r--llvm/include/llvm/MC/TargetRegistry.h (renamed from llvm/include/llvm/Support/TargetRegistry.h)84
-rw-r--r--llvm/include/llvm/MCA/CustomBehaviour.h51
-rw-r--r--llvm/include/llvm/MCA/Instruction.h4
-rw-r--r--llvm/include/llvm/MCA/Stages/InOrderIssueStage.h5
-rw-r--r--llvm/include/llvm/MCA/View.h (renamed from llvm/tools/llvm-mca/Views/View.h)4
-rw-r--r--llvm/include/llvm/Object/ELF.h7
-rw-r--r--llvm/include/llvm/Object/ELFObjectFile.h17
-rw-r--r--llvm/include/llvm/Object/ELFTypes.h8
-rw-r--r--llvm/include/llvm/Object/Error.h4
-rw-r--r--llvm/include/llvm/Object/MachO.h3
-rw-r--r--llvm/include/llvm/Object/Wasm.h9
-rw-r--r--llvm/include/llvm/Object/XCOFFObjectFile.h153
-rw-r--r--llvm/include/llvm/ObjectYAML/MachOYAML.h1
-rw-r--r--llvm/include/llvm/ObjectYAML/WasmYAML.h38
-rw-r--r--llvm/include/llvm/ObjectYAML/XCOFFYAML.h54
-rw-r--r--llvm/include/llvm/Option/Arg.h5
-rw-r--r--llvm/include/llvm/Option/OptParser.td2
-rw-r--r--llvm/include/llvm/Option/OptTable.h13
-rw-r--r--llvm/include/llvm/Option/Option.h14
-rw-r--r--llvm/include/llvm/Passes/OptimizationLevel.h127
-rw-r--r--llvm/include/llvm/Passes/PassBuilder.h178
-rw-r--r--llvm/include/llvm/Passes/StandardInstrumentations.h217
-rw-r--r--llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h7
-rw-r--r--llvm/include/llvm/ProfileData/InstrProf.h18
-rw-r--r--llvm/include/llvm/ProfileData/InstrProfData.inc11
-rw-r--r--llvm/include/llvm/ProfileData/InstrProfReader.h18
-rw-r--r--llvm/include/llvm/ProfileData/ProfileCommon.h10
-rw-r--r--llvm/include/llvm/ProfileData/SampleProf.h376
-rw-r--r--llvm/include/llvm/ProfileData/SampleProfReader.h49
-rw-r--r--llvm/include/llvm/ProfileData/SampleProfWriter.h74
-rw-r--r--llvm/include/llvm/Support/AArch64TargetParser.def36
-rw-r--r--llvm/include/llvm/Support/ARMTargetParser.def18
-rw-r--r--llvm/include/llvm/Support/Allocator.h2
-rw-r--r--llvm/include/llvm/Support/AtomicOrdering.h10
-rw-r--r--llvm/include/llvm/Support/BinaryByteStream.h34
-rw-r--r--llvm/include/llvm/Support/BinaryItemStream.h14
-rw-r--r--llvm/include/llvm/Support/BinaryStream.h12
-rw-r--r--llvm/include/llvm/Support/BinaryStreamArray.h7
-rw-r--r--llvm/include/llvm/Support/BinaryStreamReader.h14
-rw-r--r--llvm/include/llvm/Support/BinaryStreamRef.h71
-rw-r--r--llvm/include/llvm/Support/BinaryStreamWriter.h14
-rw-r--r--llvm/include/llvm/Support/Caching.h71
-rw-r--r--llvm/include/llvm/Support/CommandLine.h64
-rw-r--r--llvm/include/llvm/Support/Compiler.h60
-rw-r--r--llvm/include/llvm/Support/CrashRecoveryContext.h3
-rw-r--r--llvm/include/llvm/Support/DOTGraphTraits.h5
-rw-r--r--llvm/include/llvm/Support/DataExtractor.h3
-rw-r--r--llvm/include/llvm/Support/Debug.h21
-rw-r--r--llvm/include/llvm/Support/DivisionByConstantInfo.h38
-rw-r--r--llvm/include/llvm/Support/Error.h37
-rw-r--r--llvm/include/llvm/Support/ErrorHandling.h26
-rw-r--r--llvm/include/llvm/Support/ExtensibleRTTI.h7
-rw-r--r--llvm/include/llvm/Support/FileSystem.h8
-rw-r--r--llvm/include/llvm/Support/FileSystem/UniqueID.h27
-rw-r--r--llvm/include/llvm/Support/FormatVariadic.h2
-rw-r--r--llvm/include/llvm/Support/GenericDomTreeConstruction.h4
-rw-r--r--llvm/include/llvm/Support/GraphWriter.h91
-rw-r--r--llvm/include/llvm/Support/HashBuilder.h438
-rw-r--r--llvm/include/llvm/Support/JSON.h46
-rw-r--r--llvm/include/llvm/Support/KnownBits.h21
-rw-r--r--llvm/include/llvm/Support/MD5.h37
-rw-r--r--llvm/include/llvm/Support/MSP430AttributeParser.h44
-rw-r--r--llvm/include/llvm/Support/MSP430Attributes.h44
-rw-r--r--llvm/include/llvm/Support/MachineValueType.h50
-rw-r--r--llvm/include/llvm/Support/Memory.h13
-rw-r--r--llvm/include/llvm/Support/PGOOptions.h65
-rw-r--r--llvm/include/llvm/Support/Parallel.h5
-rw-r--r--llvm/include/llvm/Support/Path.h67
-rw-r--r--llvm/include/llvm/Support/Process.h6
-rw-r--r--llvm/include/llvm/Support/RISCVISAInfo.h89
-rw-r--r--llvm/include/llvm/Support/RISCVTargetParser.def10
-rw-r--r--llvm/include/llvm/Support/Signposts.h43
-rw-r--r--llvm/include/llvm/Support/TargetOpcodes.def3
-rw-r--r--llvm/include/llvm/Support/TargetSelect.h12
-rw-r--r--llvm/include/llvm/Support/TypeSize.h8
-rw-r--r--llvm/include/llvm/Support/VersionTuple.h7
-rw-r--r--llvm/include/llvm/Support/VirtualFileSystem.h35
-rw-r--r--llvm/include/llvm/Support/Windows/WindowsSupport.h4
-rw-r--r--llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h8
-rw-r--r--llvm/include/llvm/Support/X86TargetParser.def135
-rw-r--r--llvm/include/llvm/Support/X86TargetParser.h4
-rw-r--r--llvm/include/llvm/Support/YAMLTraits.h2
-rw-r--r--llvm/include/llvm/Support/raw_ostream.h8
-rw-r--r--llvm/include/llvm/TableGen/DirectiveEmitter.h2
-rw-r--r--llvm/include/llvm/TableGen/Error.h22
-rw-r--r--llvm/include/llvm/TableGen/Record.h51
-rw-r--r--llvm/include/llvm/Target/GenericOpcodes.td12
-rw-r--r--llvm/include/llvm/Target/GlobalISel/Combine.td101
-rw-r--r--llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td2
-rw-r--r--llvm/include/llvm/Target/Target.td22
-rw-r--r--llvm/include/llvm/Target/TargetLoweringObjectFile.h9
-rw-r--r--llvm/include/llvm/Target/TargetMachine.h25
-rw-r--r--llvm/include/llvm/Target/TargetOptions.h34
-rw-r--r--llvm/include/llvm/Target/TargetSelectionDAG.td28
-rw-r--r--llvm/include/llvm/TextAPI/Architecture.h6
-rw-r--r--llvm/include/llvm/TextAPI/ArchitectureSet.h6
-rw-r--r--llvm/include/llvm/TextAPI/InterfaceFile.h8
-rw-r--r--llvm/include/llvm/TextAPI/PackedVersion.h6
-rw-r--r--llvm/include/llvm/TextAPI/Platform.h6
-rw-r--r--llvm/include/llvm/TextAPI/Symbol.h6
-rw-r--r--llvm/include/llvm/TextAPI/Target.h6
-rw-r--r--llvm/include/llvm/TextAPI/TextAPIReader.h6
-rw-r--r--llvm/include/llvm/TextAPI/TextAPIWriter.h6
-rw-r--r--llvm/include/llvm/Transforms/IPO/Attributor.h214
-rw-r--r--llvm/include/llvm/Transforms/IPO/FunctionAttrs.h8
-rw-r--r--llvm/include/llvm/Transforms/IPO/FunctionImport.h29
-rw-r--r--llvm/include/llvm/Transforms/IPO/IROutliner.h36
-rw-r--r--llvm/include/llvm/Transforms/IPO/Inliner.h16
-rw-r--r--llvm/include/llvm/Transforms/IPO/LoopExtractor.h2
-rw-r--r--llvm/include/llvm/Transforms/IPO/ModuleInliner.h51
-rw-r--r--llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h1
-rw-r--r--llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h13
-rw-r--r--llvm/include/llvm/Transforms/IPO/SampleContextTracker.h55
-rw-r--r--llvm/include/llvm/Transforms/InstCombine/InstCombine.h10
-rw-r--r--llvm/include/llvm/Transforms/InstCombine/InstCombiner.h50
-rw-r--r--llvm/include/llvm/Transforms/Instrumentation.h6
-rw-r--r--llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h48
-rw-r--r--llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h79
-rw-r--r--llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerOptions.h7
-rw-r--r--llvm/include/llvm/Transforms/Instrumentation/HWAddressSanitizer.h30
-rw-r--r--llvm/include/llvm/Transforms/Instrumentation/InstrOrderFile.h7
-rw-r--r--llvm/include/llvm/Transforms/Instrumentation/MemProfiler.h7
-rw-r--r--llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h17
-rw-r--r--llvm/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h8
-rw-r--r--llvm/include/llvm/Transforms/Scalar/EarlyCSE.h2
-rw-r--r--llvm/include/llvm/Transforms/Scalar/GVN.h15
-rw-r--r--llvm/include/llvm/Transforms/Scalar/JumpThreading.h8
-rw-r--r--llvm/include/llvm/Transforms/Scalar/LoopPassManager.h94
-rw-r--r--llvm/include/llvm/Transforms/Scalar/LoopUnrollPass.h2
-rw-r--r--llvm/include/llvm/Transforms/Scalar/LowerMatrixIntrinsics.h2
-rw-r--r--llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h9
-rw-r--r--llvm/include/llvm/Transforms/Scalar/MergedLoadStoreMotion.h2
-rw-r--r--llvm/include/llvm/Transforms/Scalar/SROA.h4
-rw-r--r--llvm/include/llvm/Transforms/Scalar/SimpleLoopUnswitch.h3
-rw-r--r--llvm/include/llvm/Transforms/Scalar/SimplifyCFG.h3
-rw-r--r--llvm/include/llvm/Transforms/Utils/ASanStackFrameLayout.h12
-rw-r--r--llvm/include/llvm/Transforms/Utils/AddDiscriminators.h1
-rw-r--r--llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h30
-rw-r--r--llvm/include/llvm/Transforms/Utils/BuildLibCalls.h10
-rw-r--r--llvm/include/llvm/Transforms/Utils/Cloning.h4
-rw-r--r--llvm/include/llvm/Transforms/Utils/CodeExtractor.h18
-rw-r--r--llvm/include/llvm/Transforms/Utils/CodeMoverUtils.h16
-rw-r--r--llvm/include/llvm/Transforms/Utils/EntryExitInstrumenter.h3
-rw-r--r--llvm/include/llvm/Transforms/Utils/FunctionImportUtils.h3
-rw-r--r--llvm/include/llvm/Transforms/Utils/GlobalStatus.h15
-rw-r--r--llvm/include/llvm/Transforms/Utils/InstructionWorklist.h (renamed from llvm/include/llvm/Transforms/InstCombine/InstCombineWorklist.h)29
-rw-r--r--llvm/include/llvm/Transforms/Utils/Local.h40
-rw-r--r--llvm/include/llvm/Transforms/Utils/LoopPeel.h4
-rw-r--r--llvm/include/llvm/Transforms/Utils/LoopUtils.h56
-rw-r--r--llvm/include/llvm/Transforms/Utils/MemoryOpRemark.h7
-rw-r--r--llvm/include/llvm/Transforms/Utils/PredicateInfo.h6
-rw-r--r--llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h4
-rw-r--r--llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h70
-rw-r--r--llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h9
-rw-r--r--llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h2
-rw-r--r--llvm/include/llvm/Transforms/Utils/UnrollLoop.h3
-rw-r--r--llvm/include/llvm/Transforms/Utils/ValueMapper.h11
-rw-r--r--llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h2
-rw-r--r--llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h2
-rw-r--r--llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h4
-rw-r--r--llvm/include/llvm/Transforms/Vectorize/VectorCombine.h10
-rw-r--r--llvm/include/llvm/WindowsManifest/WindowsManifestMerger.h3
-rw-r--r--llvm/include/llvm/module.modulemap18
-rw-r--r--llvm/lib/Analysis/AliasAnalysis.cpp56
-rw-r--r--llvm/lib/Analysis/AssumeBundleQueries.cpp4
-rw-r--r--llvm/lib/Analysis/AssumptionCache.cpp30
-rw-r--r--llvm/lib/Analysis/BasicAliasAnalysis.cpp622
-rw-r--r--llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp2
-rw-r--r--llvm/lib/Analysis/BranchProbabilityInfo.cpp2
-rw-r--r--llvm/lib/Analysis/CGSCCPassManager.cpp18
-rw-r--r--llvm/lib/Analysis/CaptureTracking.cpp93
-rw-r--r--llvm/lib/Analysis/CmpInstAnalysis.cpp8
-rw-r--r--llvm/lib/Analysis/CodeMetrics.cpp5
-rw-r--r--llvm/lib/Analysis/ConstantFolding.cpp377
-rw-r--r--llvm/lib/Analysis/CostModel.cpp22
-rw-r--r--llvm/lib/Analysis/Delinearization.cpp489
-rw-r--r--llvm/lib/Analysis/DemandedBits.cpp15
-rw-r--r--llvm/lib/Analysis/DependenceAnalysis.cpp37
-rw-r--r--llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp11
-rw-r--r--llvm/lib/Analysis/HeatUtils.cpp7
-rw-r--r--llvm/lib/Analysis/IRSimilarityIdentifier.cpp262
-rw-r--r--llvm/lib/Analysis/IVDescriptors.cpp205
-rw-r--r--llvm/lib/Analysis/IVUsers.cpp62
-rw-r--r--llvm/lib/Analysis/InlineAdvisor.cpp124
-rw-r--r--llvm/lib/Analysis/InlineCost.cpp168
-rw-r--r--llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp7
-rw-r--r--llvm/lib/Analysis/InstructionPrecedenceTracking.cpp14
-rw-r--r--llvm/lib/Analysis/InstructionSimplify.cpp299
-rw-r--r--llvm/lib/Analysis/LazyCallGraph.cpp26
-rw-r--r--llvm/lib/Analysis/LazyValueInfo.cpp113
-rw-r--r--llvm/lib/Analysis/Lint.cpp6
-rw-r--r--llvm/lib/Analysis/Loads.cpp13
-rw-r--r--llvm/lib/Analysis/LoopAccessAnalysis.cpp95
-rw-r--r--llvm/lib/Analysis/LoopCacheAnalysis.cpp18
-rw-r--r--llvm/lib/Analysis/LoopInfo.cpp12
-rw-r--r--llvm/lib/Analysis/LoopNestAnalysis.cpp173
-rw-r--r--llvm/lib/Analysis/MLInlineAdvisor.cpp3
-rw-r--r--llvm/lib/Analysis/MemoryBuiltins.cpp9
-rw-r--r--llvm/lib/Analysis/MemoryLocation.cpp35
-rw-r--r--llvm/lib/Analysis/MemorySSA.cpp207
-rw-r--r--llvm/lib/Analysis/MemorySSAUpdater.cpp53
-rw-r--r--llvm/lib/Analysis/ModuleSummaryAnalysis.cpp46
-rw-r--r--llvm/lib/Analysis/ObjCARCInstKind.cpp5
-rw-r--r--llvm/lib/Analysis/OverflowInstAnalysis.cpp2
-rw-r--r--llvm/lib/Analysis/PHITransAddr.cpp4
-rw-r--r--llvm/lib/Analysis/ProfileSummaryInfo.cpp16
-rw-r--r--llvm/lib/Analysis/ReplayInlineAdvisor.cpp106
-rw-r--r--llvm/lib/Analysis/ScalarEvolution.cpp1718
-rw-r--r--llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp12
-rw-r--r--llvm/lib/Analysis/StackLifetime.cpp18
-rw-r--r--llvm/lib/Analysis/StackSafetyAnalysis.cpp118
-rw-r--r--llvm/lib/Analysis/TFUtils.cpp83
-rw-r--r--llvm/lib/Analysis/TargetLibraryInfo.cpp300
-rw-r--r--llvm/lib/Analysis/TargetTransformInfo.cpp38
-rw-r--r--llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp30
-rw-r--r--llvm/lib/Analysis/TypeMetadataUtils.cpp63
-rw-r--r--llvm/lib/Analysis/ValueTracking.cpp248
-rw-r--r--llvm/lib/Analysis/VectorUtils.cpp115
-rw-r--r--llvm/lib/AsmParser/LLLexer.cpp14
-rw-r--r--llvm/lib/AsmParser/LLParser.cpp315
-rw-r--r--llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp7
-rw-r--r--llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp62
-rw-r--r--llvm/lib/Bitcode/Reader/BitcodeReader.cpp277
-rw-r--r--llvm/lib/Bitcode/Reader/MetadataLoader.cpp215
-rw-r--r--llvm/lib/Bitcode/Writer/BitcodeWriter.cpp77
-rw-r--r--llvm/lib/Bitcode/Writer/ValueEnumerator.cpp7
-rw-r--r--llvm/lib/CodeGen/Analysis.cpp34
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/ARMException.cpp1
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp129
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp172
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp179
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h13
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp10
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp4
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp27
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h2
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp77
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h8
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp80
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h32
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp46
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h3
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp3
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp7
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h3
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/WasmException.cpp29
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/WinException.cpp26
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/WinException.h4
-rw-r--r--llvm/lib/CodeGen/AtomicExpandPass.cpp27
-rw-r--r--llvm/lib/CodeGen/BasicBlockSections.cpp18
-rw-r--r--llvm/lib/CodeGen/BranchFolding.cpp19
-rw-r--r--llvm/lib/CodeGen/BranchRelaxation.cpp42
-rw-r--r--llvm/lib/CodeGen/BreakFalseDeps.cpp2
-rw-r--r--llvm/lib/CodeGen/CodeGenCommonISel.cpp169
-rw-r--r--llvm/lib/CodeGen/CodeGenPrepare.cpp92
-rw-r--r--llvm/lib/CodeGen/CommandFlags.cpp34
-rw-r--r--llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp25
-rw-r--r--llvm/lib/CodeGen/DeadMachineInstructionElim.cpp20
-rw-r--r--llvm/lib/CodeGen/DwarfEHPrepare.cpp74
-rw-r--r--llvm/lib/CodeGen/ExpandMemCmp.cpp4
-rw-r--r--llvm/lib/CodeGen/ExpandPostRAPseudos.cpp7
-rw-r--r--llvm/lib/CodeGen/ExpandVectorPredication.cpp138
-rw-r--r--llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp7
-rw-r--r--llvm/lib/CodeGen/GCMetadata.cpp25
-rw-r--r--llvm/lib/CodeGen/GCRootLowering.cpp13
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp26
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CallLowering.cpp54
-rw-r--r--llvm/lib/CodeGen/GlobalISel/Combiner.cpp13
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp1792
-rw-r--r--llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp27
-rw-r--r--llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp1
-rw-r--r--llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp366
-rw-r--r--llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp9
-rw-r--r--llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp16
-rw-r--r--llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp2
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp8
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp10
-rw-r--r--llvm/lib/CodeGen/GlobalISel/Legalizer.cpp18
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp851
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp3
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp669
-rw-r--r--llvm/lib/CodeGen/GlobalISel/Localizer.cpp5
-rw-r--r--llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp3
-rw-r--r--llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp22
-rw-r--r--llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp2
-rw-r--r--llvm/lib/CodeGen/GlobalISel/Utils.cpp403
-rw-r--r--llvm/lib/CodeGen/HardwareLoops.cpp33
-rw-r--r--llvm/lib/CodeGen/InlineSpiller.cpp98
-rw-r--r--llvm/lib/CodeGen/InterleavedAccessPass.cpp2
-rw-r--r--llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp14
-rw-r--r--llvm/lib/CodeGen/IntrinsicLowering.cpp3
-rw-r--r--llvm/lib/CodeGen/LLVMTargetMachine.cpp2
-rw-r--r--llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp2913
-rw-r--r--llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h1051
-rw-r--r--llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp58
-rw-r--r--llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h5
-rw-r--r--llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp247
-rw-r--r--llvm/lib/CodeGen/LiveDebugVariables.cpp19
-rw-r--r--llvm/lib/CodeGen/LiveInterval.cpp43
-rw-r--r--llvm/lib/CodeGen/LiveIntervalUnion.cpp23
-rw-r--r--llvm/lib/CodeGen/LiveIntervals.cpp60
-rw-r--r--llvm/lib/CodeGen/LivePhysRegs.cpp22
-rw-r--r--llvm/lib/CodeGen/LiveRangeEdit.cpp32
-rw-r--r--llvm/lib/CodeGen/LiveVariables.cpp86
-rw-r--r--llvm/lib/CodeGen/LoopTraversal.cpp3
-rw-r--r--llvm/lib/CodeGen/LowLevelType.cpp10
-rw-r--r--llvm/lib/CodeGen/MIRCanonicalizerPass.cpp8
-rw-r--r--llvm/lib/CodeGen/MIRParser/MILexer.cpp2
-rw-r--r--llvm/lib/CodeGen/MIRParser/MILexer.h1
-rw-r--r--llvm/lib/CodeGen/MIRParser/MIParser.cpp22
-rw-r--r--llvm/lib/CodeGen/MIRParser/MIRParser.cpp3
-rw-r--r--llvm/lib/CodeGen/MIRPrinter.cpp2
-rw-r--r--llvm/lib/CodeGen/MIRSampleProfile.cpp343
-rw-r--r--llvm/lib/CodeGen/MachineBasicBlock.cpp34
-rw-r--r--llvm/lib/CodeGen/MachineBlockPlacement.cpp4
-rw-r--r--llvm/lib/CodeGen/MachineCSE.cpp90
-rw-r--r--llvm/lib/CodeGen/MachineCopyPropagation.cpp76
-rw-r--r--llvm/lib/CodeGen/MachineDominators.cpp2
-rw-r--r--llvm/lib/CodeGen/MachineFunction.cpp47
-rw-r--r--llvm/lib/CodeGen/MachineInstr.cpp11
-rw-r--r--llvm/lib/CodeGen/MachineLICM.cpp38
-rw-r--r--llvm/lib/CodeGen/MachineLoopInfo.cpp8
-rw-r--r--llvm/lib/CodeGen/MachineOperand.cpp5
-rw-r--r--llvm/lib/CodeGen/MachineOutliner.cpp5
-rw-r--r--llvm/lib/CodeGen/MachinePipeliner.cpp5
-rw-r--r--llvm/lib/CodeGen/MachineRegisterInfo.cpp4
-rw-r--r--llvm/lib/CodeGen/MachineScheduler.cpp2
-rw-r--r--llvm/lib/CodeGen/MachineSink.cpp53
-rw-r--r--llvm/lib/CodeGen/MachineSizeOpts.cpp6
-rw-r--r--llvm/lib/CodeGen/MachineStripDebug.cpp21
-rw-r--r--llvm/lib/CodeGen/MachineVerifier.cpp167
-rw-r--r--llvm/lib/CodeGen/MacroFusion.cpp6
-rw-r--r--llvm/lib/CodeGen/ModuloSchedule.cpp101
-rw-r--r--llvm/lib/CodeGen/PHIElimination.cpp47
-rw-r--r--llvm/lib/CodeGen/PeepholeOptimizer.cpp2
-rw-r--r--llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp24
-rw-r--r--llvm/lib/CodeGen/PrologEpilogInserter.cpp23
-rw-r--r--llvm/lib/CodeGen/PseudoProbeInserter.cpp9
-rw-r--r--llvm/lib/CodeGen/RDFLiveness.cpp2
-rw-r--r--llvm/lib/CodeGen/ReachingDefAnalysis.cpp47
-rw-r--r--llvm/lib/CodeGen/RegAllocBasic.cpp4
-rw-r--r--llvm/lib/CodeGen/RegAllocEvictionAdvisor.h90
-rw-r--r--llvm/lib/CodeGen/RegAllocFast.cpp3
-rw-r--r--llvm/lib/CodeGen/RegAllocGreedy.cpp132
-rw-r--r--llvm/lib/CodeGen/RegisterCoalescer.cpp15
-rw-r--r--llvm/lib/CodeGen/RegisterScavenging.cpp15
-rw-r--r--llvm/lib/CodeGen/ReplaceWithVeclib.cpp4
-rw-r--r--llvm/lib/CodeGen/SafeStack.cpp28
-rw-r--r--llvm/lib/CodeGen/SafeStackLayout.cpp4
-rw-r--r--llvm/lib/CodeGen/SafeStackLayout.h15
-rw-r--r--llvm/lib/CodeGen/ScheduleDAG.cpp3
-rw-r--r--llvm/lib/CodeGen/ScheduleDAGInstrs.cpp9
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp959
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/FastISel.cpp28
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp23
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp52
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp17
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp505
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp3
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h32
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp10
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp80
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp422
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp4
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp9
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp924
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp29
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp426
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h203
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp35
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp182
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp481
-rw-r--r--llvm/lib/CodeGen/SplitKit.cpp48
-rw-r--r--llvm/lib/CodeGen/SplitKit.h10
-rw-r--r--llvm/lib/CodeGen/StackColoring.cpp2
-rw-r--r--llvm/lib/CodeGen/StackProtector.cpp30
-rw-r--r--llvm/lib/CodeGen/StackSlotColoring.cpp2
-rw-r--r--llvm/lib/CodeGen/SwitchLoweringUtils.cpp2
-rw-r--r--llvm/lib/CodeGen/TailDuplicator.cpp148
-rw-r--r--llvm/lib/CodeGen/TargetInstrInfo.cpp20
-rw-r--r--llvm/lib/CodeGen/TargetLoweringBase.cpp72
-rw-r--r--llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp47
-rw-r--r--llvm/lib/CodeGen/TargetPassConfig.cpp130
-rw-r--r--llvm/lib/CodeGen/TwoAddressInstructionPass.cpp256
-rw-r--r--llvm/lib/CodeGen/TypePromotion.cpp152
-rw-r--r--llvm/lib/CodeGen/ValueTypes.cpp2
-rw-r--r--llvm/lib/CodeGen/VirtRegMap.cpp25
-rw-r--r--llvm/lib/CodeGen/WasmEHPrepare.cpp4
-rw-r--r--llvm/lib/DWARFLinker/DWARFLinker.cpp1
-rw-r--r--llvm/lib/DWARFLinker/DWARFStreamer.cpp2
-rw-r--r--llvm/lib/DebugInfo/CodeView/ContinuationRecordBuilder.cpp2
-rw-r--r--llvm/lib/DebugInfo/CodeView/SimpleTypeSerializer.cpp2
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp42
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp4
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFContext.cpp45
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp28
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp11
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp11
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp33
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDie.cpp660
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp24
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp18
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp10
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp216
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp105
-rw-r--r--llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp30
-rw-r--r--llvm/lib/DebugInfo/GSYM/FileWriter.cpp7
-rw-r--r--llvm/lib/DebugInfo/GSYM/Range.cpp7
-rw-r--r--llvm/lib/DebugInfo/MSF/MappedBlockStream.cpp90
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp2
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp4
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp2
-rw-r--r--llvm/lib/DebugInfo/Symbolize/Symbolize.cpp24
-rw-r--r--llvm/lib/Demangle/DLangDemangle.cpp45
-rw-r--r--llvm/lib/Demangle/Demangle.cpp51
-rw-r--r--llvm/lib/Demangle/ItaniumDemangle.cpp70
-rw-r--r--llvm/lib/Demangle/MicrosoftDemangle.cpp100
-rw-r--r--llvm/lib/Demangle/MicrosoftDemangleNodes.cpp408
-rw-r--r--llvm/lib/Demangle/RustDemangle.cpp182
-rw-r--r--llvm/lib/ExecutionEngine/ExecutionEngine.cpp2
-rw-r--r--llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp3
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp13
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/ELF.cpp13
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h104
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp185
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp255
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp575
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/JITLink.cpp14
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp290
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h65
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp585
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/MachO.cpp7
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp180
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h47
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp68
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp214
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/MemoryFlags.cpp33
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/aarch64.cpp30
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/x86_64.cpp137
-rw-r--r--llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp3
-rw-r--r--llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp2
-rw-r--r--llvm/lib/ExecutionEngine/Orc/Core.cpp44
-rw-r--r--llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp135
-rw-r--r--llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp450
-rw-r--r--llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp818
-rw-r--r--llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp10
-rw-r--r--llvm/lib/ExecutionEngine/Orc/EPCEHFrameRegistrar.cpp13
-rw-r--r--llvm/lib/ExecutionEngine/Orc/EPCGenericDylibManager.cpp107
-rw-r--r--llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp184
-rw-r--r--llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp317
-rw-r--r--llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp146
-rw-r--r--llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp2
-rw-r--r--llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp72
-rw-r--r--llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp77
-rw-r--r--llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp2
-rw-r--r--llvm/lib/ExecutionEngine/Orc/LLJIT.cpp105
-rw-r--r--llvm/lib/ExecutionEngine/Orc/LookupAndRecordAddrs.cpp82
-rw-r--r--llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp398
-rw-r--r--llvm/lib/ExecutionEngine/Orc/Mangling.cpp173
-rw-r--r--llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp58
-rw-r--r--llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp55
-rw-r--r--llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp47
-rw-r--r--llvm/lib/ExecutionEngine/Orc/Shared/RPCError.cpp58
-rw-r--r--llvm/lib/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.cpp250
-rw-r--r--llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp406
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp36
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.cpp84
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.h36
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp71
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.cpp129
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp261
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.cpp293
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp48
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/JITSymbol.cpp2
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp6
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp43
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp32
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp372
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h12
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h3
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.h1
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h3
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h1
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h1
-rw-r--r--llvm/lib/ExecutionEngine/TargetSelect.cpp2
-rw-r--r--llvm/lib/FileCheck/FileCheck.cpp12
-rw-r--r--llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp714
-rw-r--r--llvm/lib/IR/AbstractCallSite.cpp2
-rw-r--r--llvm/lib/IR/AsmWriter.cpp644
-rw-r--r--llvm/lib/IR/Assumptions.cpp85
-rw-r--r--llvm/lib/IR/Attributes.cpp350
-rw-r--r--llvm/lib/IR/AutoUpgrade.cpp106
-rw-r--r--llvm/lib/IR/BasicBlock.cpp6
-rw-r--r--llvm/lib/IR/ConstantFold.cpp281
-rw-r--r--llvm/lib/IR/ConstantRange.cpp208
-rw-r--r--llvm/lib/IR/Constants.cpp137
-rw-r--r--llvm/lib/IR/Core.cpp38
-rw-r--r--llvm/lib/IR/DIBuilder.cpp200
-rw-r--r--llvm/lib/IR/DataLayout.cpp111
-rw-r--r--llvm/lib/IR/DebugInfo.cpp76
-rw-r--r--llvm/lib/IR/DebugInfoMetadata.cpp229
-rw-r--r--llvm/lib/IR/DiagnosticHandler.cpp2
-rw-r--r--llvm/lib/IR/DiagnosticInfo.cpp40
-rw-r--r--llvm/lib/IR/DiagnosticPrinter.cpp2
-rw-r--r--llvm/lib/IR/FPEnv.cpp10
-rw-r--r--llvm/lib/IR/Function.cpp229
-rw-r--r--llvm/lib/IR/GCStrategy.cpp18
-rw-r--r--llvm/lib/IR/Globals.cpp115
-rw-r--r--llvm/lib/IR/IRBuilder.cpp19
-rw-r--r--llvm/lib/IR/Instruction.cpp20
-rw-r--r--llvm/lib/IR/Instructions.cpp248
-rw-r--r--llvm/lib/IR/IntrinsicInst.cpp63
-rw-r--r--llvm/lib/IR/LLVMContext.cpp8
-rw-r--r--llvm/lib/IR/LLVMContextImpl.cpp25
-rw-r--r--llvm/lib/IR/LLVMContextImpl.h205
-rw-r--r--llvm/lib/IR/LegacyPassManager.cpp2
-rw-r--r--llvm/lib/IR/Mangler.cpp7
-rw-r--r--llvm/lib/IR/Metadata.cpp11
-rw-r--r--llvm/lib/IR/Module.cpp4
-rw-r--r--llvm/lib/IR/ModuleSummaryIndex.cpp21
-rw-r--r--llvm/lib/IR/Operator.cpp35
-rw-r--r--llvm/lib/IR/OptBisect.cpp18
-rw-r--r--llvm/lib/IR/PassManager.cpp15
-rw-r--r--llvm/lib/IR/ProfileSummary.cpp4
-rw-r--r--llvm/lib/IR/PseudoProbe.cpp8
-rw-r--r--llvm/lib/IR/ReplaceConstant.cpp52
-rw-r--r--llvm/lib/IR/Statepoint.cpp6
-rw-r--r--llvm/lib/IR/Type.cpp48
-rw-r--r--llvm/lib/IR/TypeFinder.cpp8
-rw-r--r--llvm/lib/IR/User.cpp2
-rw-r--r--llvm/lib/IR/Value.cpp27
-rw-r--r--llvm/lib/IR/Verifier.cpp328
-rw-r--r--llvm/lib/InterfaceStub/ELFObjHandler.cpp2
-rw-r--r--llvm/lib/InterfaceStub/IFSHandler.cpp12
-rw-r--r--llvm/lib/InterfaceStub/IFSStub.cpp4
-rw-r--r--llvm/lib/LTO/LTO.cpp56
-rw-r--r--llvm/lib/LTO/LTOBackend.cpp47
-rw-r--r--llvm/lib/LTO/LTOCodeGenerator.cpp9
-rw-r--r--llvm/lib/LTO/LTOModule.cpp15
-rw-r--r--llvm/lib/LTO/ThinLTOCodeGenerator.cpp51
-rw-r--r--llvm/lib/Linker/IRMover.cpp125
-rw-r--r--llvm/lib/Linker/LinkModules.cpp100
-rw-r--r--llvm/lib/MC/ConstantPools.cpp2
-rw-r--r--llvm/lib/MC/ELFObjectWriter.cpp3
-rw-r--r--llvm/lib/MC/MCAsmInfoGOFF.cpp27
-rw-r--r--llvm/lib/MC/MCAsmStreamer.cpp5
-rw-r--r--llvm/lib/MC/MCAssembler.cpp17
-rw-r--r--llvm/lib/MC/MCDisassembler/Disassembler.cpp2
-rw-r--r--llvm/lib/MC/MCDisassembler/MCRelocationInfo.cpp2
-rw-r--r--llvm/lib/MC/MCDwarf.cpp94
-rw-r--r--llvm/lib/MC/MCELFStreamer.cpp11
-rw-r--r--llvm/lib/MC/MCExpr.cpp4
-rw-r--r--llvm/lib/MC/MCFragment.cpp8
-rw-r--r--llvm/lib/MC/MCInstrAnalysis.cpp12
-rw-r--r--llvm/lib/MC/MCMachOStreamer.cpp2
-rw-r--r--llvm/lib/MC/MCObjectFileInfo.cpp15
-rw-r--r--llvm/lib/MC/MCObjectStreamer.cpp10
-rw-r--r--llvm/lib/MC/MCParser/AsmLexer.cpp3
-rw-r--r--llvm/lib/MC/MCParser/AsmParser.cpp18
-rw-r--r--llvm/lib/MC/MCParser/ELFAsmParser.cpp25
-rw-r--r--llvm/lib/MC/MCParser/GOFFAsmParser.cpp48
-rw-r--r--llvm/lib/MC/MCParser/MasmParser.cpp7
-rw-r--r--llvm/lib/MC/MCPseudoProbe.cpp392
-rw-r--r--llvm/lib/MC/MCSectionXCOFF.cpp6
-rw-r--r--llvm/lib/MC/MCStreamer.cpp7
-rw-r--r--llvm/lib/MC/MCWasmStreamer.cpp87
-rw-r--r--llvm/lib/MC/MCWin64EH.cpp6
-rw-r--r--llvm/lib/MC/MCWinCOFFStreamer.cpp9
-rw-r--r--llvm/lib/MC/MCXCOFFStreamer.cpp2
-rw-r--r--llvm/lib/MC/MachObjectWriter.cpp2
-rw-r--r--llvm/lib/MC/TargetRegistry.cpp (renamed from llvm/lib/Support/TargetRegistry.cpp)2
-rw-r--r--llvm/lib/MC/WasmObjectWriter.cpp79
-rw-r--r--llvm/lib/MC/XCOFFObjectWriter.cpp339
-rw-r--r--llvm/lib/MCA/Context.cpp5
-rw-r--r--llvm/lib/MCA/CustomBehaviour.cpp18
-rw-r--r--llvm/lib/MCA/HardwareUnits/RegisterFile.cpp13
-rw-r--r--llvm/lib/MCA/InstrBuilder.cpp2
-rw-r--r--llvm/lib/MCA/Stages/InOrderIssueStage.cpp28
-rw-r--r--llvm/lib/MCA/Stages/InstructionTables.cpp2
-rw-r--r--llvm/lib/MCA/View.cpp (renamed from llvm/tools/llvm-mca/Views/View.cpp)2
-rw-r--r--llvm/lib/Object/Archive.cpp2
-rw-r--r--llvm/lib/Object/COFFModuleDefinition.cpp5
-rw-r--r--llvm/lib/Object/ELF.cpp71
-rw-r--r--llvm/lib/Object/ELFObjectFile.cpp82
-rw-r--r--llvm/lib/Object/IRObjectFile.cpp2
-rw-r--r--llvm/lib/Object/IRSymtab.cpp29
-rw-r--r--llvm/lib/Object/MachOObjectFile.cpp46
-rw-r--r--llvm/lib/Object/ModuleSymbolTable.cpp7
-rw-r--r--llvm/lib/Object/Object.cpp9
-rw-r--r--llvm/lib/Object/ObjectFile.cpp15
-rw-r--r--llvm/lib/Object/RelocationResolver.cpp2
-rw-r--r--llvm/lib/Object/WasmObjectFile.cpp119
-rw-r--r--llvm/lib/Object/XCOFFObjectFile.cpp502
-rw-r--r--llvm/lib/ObjectYAML/COFFEmitter.cpp22
-rw-r--r--llvm/lib/ObjectYAML/COFFYAML.cpp36
-rw-r--r--llvm/lib/ObjectYAML/ELFEmitter.cpp9
-rw-r--r--llvm/lib/ObjectYAML/ELFYAML.cpp19
-rw-r--r--llvm/lib/ObjectYAML/MachOEmitter.cpp35
-rw-r--r--llvm/lib/ObjectYAML/MachOYAML.cpp27
-rw-r--r--llvm/lib/ObjectYAML/WasmEmitter.cpp43
-rw-r--r--llvm/lib/ObjectYAML/WasmYAML.cpp28
-rw-r--r--llvm/lib/ObjectYAML/XCOFFEmitter.cpp411
-rw-r--r--llvm/lib/ObjectYAML/XCOFFYAML.cpp43
-rw-r--r--llvm/lib/Option/OptTable.cpp64
-rw-r--r--llvm/lib/Option/Option.cpp47
-rw-r--r--llvm/lib/Passes/OptimizationLevel.cpp30
-rw-r--r--llvm/lib/Passes/PassBuilder.cpp1848
-rw-r--r--llvm/lib/Passes/PassBuilderPipelines.cpp1798
-rw-r--r--llvm/lib/Passes/PassRegistry.def113
-rw-r--r--llvm/lib/Passes/StandardInstrumentations.cpp1218
-rw-r--r--llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp3
-rw-r--r--llvm/lib/ProfileData/InstrProf.cpp109
-rw-r--r--llvm/lib/ProfileData/InstrProfReader.cpp107
-rw-r--r--llvm/lib/ProfileData/InstrProfWriter.cpp5
-rw-r--r--llvm/lib/ProfileData/ProfileSummaryBuilder.cpp18
-rw-r--r--llvm/lib/ProfileData/SampleProf.cpp73
-rw-r--r--llvm/lib/ProfileData/SampleProfReader.cpp262
-rw-r--r--llvm/lib/ProfileData/SampleProfWriter.cpp235
-rw-r--r--llvm/lib/Support/AArch64TargetParser.cpp10
-rw-r--r--llvm/lib/Support/APFixedPoint.cpp4
-rw-r--r--llvm/lib/Support/APFloat.cpp38
-rw-r--r--llvm/lib/Support/APInt.cpp455
-rw-r--r--llvm/lib/Support/ARMTargetParser.cpp14
-rw-r--r--llvm/lib/Support/BinaryStreamReader.cpp16
-rw-r--r--llvm/lib/Support/BinaryStreamRef.cpp31
-rw-r--r--llvm/lib/Support/BinaryStreamWriter.cpp6
-rw-r--r--llvm/lib/Support/Caching.cpp (renamed from llvm/lib/LTO/Caching.cpp)56
-rw-r--r--llvm/lib/Support/CommandLine.cpp13
-rw-r--r--llvm/lib/Support/CrashRecoveryContext.cpp3
-rw-r--r--llvm/lib/Support/DebugOptions.h2
-rw-r--r--llvm/lib/Support/DivisionByConstantInfo.cpp107
-rw-r--r--llvm/lib/Support/Error.cpp11
-rw-r--r--llvm/lib/Support/ErrorHandling.cpp17
-rw-r--r--llvm/lib/Support/ExtensibleRTTI.cpp7
-rw-r--r--llvm/lib/Support/FileUtilities.cpp6
-rw-r--r--llvm/lib/Support/GraphWriter.cpp10
-rw-r--r--llvm/lib/Support/Host.cpp32
-rw-r--r--llvm/lib/Support/JSON.cpp5
-rw-r--r--llvm/lib/Support/KnownBits.cpp15
-rw-r--r--llvm/lib/Support/LockFileManager.cpp2
-rw-r--r--llvm/lib/Support/MD5.cpp85
-rw-r--r--llvm/lib/Support/MSP430AttributeParser.cpp53
-rw-r--r--llvm/lib/Support/MSP430Attributes.cpp22
-rw-r--r--llvm/lib/Support/Parallel.cpp7
-rw-r--r--llvm/lib/Support/Path.cpp103
-rw-r--r--llvm/lib/Support/Process.cpp3
-rw-r--r--llvm/lib/Support/RISCVISAInfo.cpp718
-rw-r--r--llvm/lib/Support/Signposts.cpp32
-rw-r--r--llvm/lib/Support/SmallVector.cpp19
-rw-r--r--llvm/lib/Support/SpecialCaseList.cpp6
-rw-r--r--llvm/lib/Support/TimeProfiler.cpp8
-rw-r--r--llvm/lib/Support/Timer.cpp5
-rw-r--r--llvm/lib/Support/Triple.cpp92
-rw-r--r--llvm/lib/Support/Unix/Memory.inc8
-rw-r--r--llvm/lib/Support/Unix/Path.inc61
-rw-r--r--llvm/lib/Support/Unix/Process.inc3
-rw-r--r--llvm/lib/Support/Unix/Program.inc3
-rw-r--r--llvm/lib/Support/Unix/Unix.h5
-rw-r--r--llvm/lib/Support/VirtualFileSystem.cpp175
-rw-r--r--llvm/lib/Support/Windows/Path.inc38
-rw-r--r--llvm/lib/Support/Windows/Process.inc4
-rw-r--r--llvm/lib/Support/Windows/Program.inc1
-rw-r--r--llvm/lib/Support/X86TargetParser.cpp82
-rw-r--r--llvm/lib/Support/raw_ostream.cpp5
-rw-r--r--llvm/lib/TableGen/Main.cpp6
-rw-r--r--llvm/lib/TableGen/Record.cpp299
-rw-r--r--llvm/lib/TableGen/TGParser.cpp13
-rw-r--r--llvm/lib/TableGen/TGParser.h9
-rw-r--r--llvm/lib/Target/AArch64/AArch64.h2
-rw-r--r--llvm/lib/Target/AArch64/AArch64.td918
-rw-r--r--llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp3
-rw-r--r--llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp11
-rw-r--r--llvm/lib/Target/AArch64/AArch64CallingConvention.td102
-rw-r--r--llvm/lib/Target/AArch64/AArch64Combine.td12
-rw-r--r--llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp5
-rw-r--r--llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp17
-rw-r--r--llvm/lib/Target/AArch64/AArch64FastISel.cpp10
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.cpp86
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.h2
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp104
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp1254
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h37
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrFormats.td349
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp392
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.h6
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td227
-rw-r--r--llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp4
-rw-r--r--llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp293
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.td12
-rw-r--r--llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp7
-rw-r--r--llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td616
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedA53.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedA55.td7
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedA57.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedA64FX.td10
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedCyclone.td66
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedExynosM3.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedExynosM4.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedExynosM5.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedFalkor.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedKryo.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedTSV110.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedThunderX.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td8
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td8
-rw-r--r--llvm/lib/Target/AArch64/AArch64Schedule.td12
-rw-r--r--llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp6
-rw-r--r--llvm/lib/Target/AArch64/AArch64StackTagging.cpp10
-rw-r--r--llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp33
-rw-r--r--llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.cpp36
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.h66
-rw-r--r--llvm/lib/Target/AArch64/AArch64SystemOperands.td75
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetMachine.cpp48
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp479
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h36
-rw-r--r--llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp304
-rw-r--r--llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp373
-rw-r--r--llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp2
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp67
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp11
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp638
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp381
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h2
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp31
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp8
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp121
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp57
-rw-r--r--llvm/lib/Target/AArch64/GISel/select-saddo.mir158
-rw-r--r--llvm/lib/Target/AArch64/GISel/select-ssubo.mir158
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h1
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp13
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp46
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h2
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp13
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h3
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp6
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h1
-rw-r--r--llvm/lib/Target/AArch64/SMEInstrFormats.td302
-rw-r--r--llvm/lib/Target/AArch64/SVEInstrFormats.td178
-rw-r--r--llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp168
-rw-r--r--llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp2
-rw-r--r--llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h74
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.h53
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp301
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp90
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp402
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp72
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td39
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp93
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCombine.td16
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp382
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h26
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp95
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUFeatures.td1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGISel.td1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp28
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp468
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h256
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp393
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h22
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td35
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp76
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructions.td40
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp261
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp45
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp24
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp75
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp111
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h69
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp95
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp16
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp18
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp35
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp195
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp24
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp21
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp351
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp24
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp39
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp70
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h15
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp313
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h72
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp132
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h70
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp40
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp48
-rw-r--r--llvm/lib/Target/AMDGPU/BUFInstructions.td51
-rw-r--r--llvm/lib/Target/AMDGPU/DSInstructions.td23
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp93
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h5
-rw-r--r--llvm/lib/Target/AMDGPU/EvergreenInstructions.td68
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td46
-rw-r--r--llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp16
-rw-r--r--llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp112
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp69
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.h2
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h26
-rw-r--r--llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp361
-rw-r--r--llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h103
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h3
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp207
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h30
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h2
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h21
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h5
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp224
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.h48
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h44
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp16
-rw-r--r--llvm/lib/Target/AMDGPU/MIMGInstructions.td40
-rw-r--r--llvm/lib/Target/AMDGPU/R600.h50
-rw-r--r--llvm/lib/Target/AMDGPU/R600.td1
-rw-r--r--llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp184
-rw-r--r--llvm/lib/Target/AMDGPU/R600ISelLowering.cpp13
-rw-r--r--llvm/lib/Target/AMDGPU/R600ISelLowering.h3
-rw-r--r--llvm/lib/Target/AMDGPU/R600InstrInfo.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/R600InstrInfo.h6
-rw-r--r--llvm/lib/Target/AMDGPU/R600InstrInfo.td23
-rw-r--r--llvm/lib/Target/AMDGPU/R600Instructions.td22
-rw-r--r--llvm/lib/Target/AMDGPU/R600MCInstLower.cpp73
-rw-r--r--llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp34
-rw-r--r--llvm/lib/Target/AMDGPU/R600MachineScheduler.h2
-rw-r--r--llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/R600Packetizer.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/R600Processors.td4
-rw-r--r--llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/R600Subtarget.cpp46
-rw-r--r--llvm/lib/Target/AMDGPU/R600Subtarget.h1
-rw-r--r--llvm/lib/Target/AMDGPU/R600TargetMachine.cpp143
-rw-r--r--llvm/lib/Target/AMDGPU/R600TargetMachine.h48
-rw-r--r--llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp142
-rw-r--r--llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h69
-rw-r--r--llvm/lib/Target/AMDGPU/SIDefines.h130
-rw-r--r--llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp35
-rw-r--r--llvm/lib/Target/AMDGPU/SIFoldOperands.cpp24
-rw-r--r--llvm/lib/Target/AMDGPU/SIFrameLowering.cpp102
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp306
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.h3
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp37
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp17
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp716
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h40
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td61
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td154
-rw-r--r--llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp81
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp77
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp16
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp152
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h7
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineScheduler.h11
-rw-r--r--llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp38
-rw-r--r--llvm/lib/Target/AMDGPU/SIModeRegister.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIPostRABundler.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp264
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.h42
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td102
-rw-r--r--llvm/lib/Target/AMDGPU/SISchedule.td10
-rw-r--r--llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/SOPInstructions.td41
-rw-r--r--llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp24
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h10
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h3
-rw-r--r--llvm/lib/Target/AMDGPU/VOP1Instructions.td19
-rw-r--r--llvm/lib/Target/AMDGPU/VOP2Instructions.td60
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3Instructions.td66
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3PInstructions.td14
-rw-r--r--llvm/lib/Target/AMDGPU/VOPInstructions.td12
-rw-r--r--llvm/lib/Target/ARC/ARCAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/ARC/ARCExpandPseudos.cpp79
-rw-r--r--llvm/lib/Target/ARC/ARCISelLowering.cpp48
-rw-r--r--llvm/lib/Target/ARC/ARCISelLowering.h3
-rw-r--r--llvm/lib/Target/ARC/ARCInstrFormats.td96
-rw-r--r--llvm/lib/Target/ARC/ARCInstrInfo.cpp38
-rw-r--r--llvm/lib/Target/ARC/ARCInstrInfo.h8
-rw-r--r--llvm/lib/Target/ARC/ARCInstrInfo.td86
-rw-r--r--llvm/lib/Target/ARC/ARCOptAddrMode.cpp81
-rw-r--r--llvm/lib/Target/ARC/ARCRegisterInfo.cpp20
-rw-r--r--llvm/lib/Target/ARC/ARCRegisterInfo.h5
-rw-r--r--llvm/lib/Target/ARC/ARCRegisterInfo.td80
-rw-r--r--llvm/lib/Target/ARC/ARCSubtarget.cpp6
-rw-r--r--llvm/lib/Target/ARC/ARCSubtarget.h5
-rw-r--r--llvm/lib/Target/ARC/ARCTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp33
-rw-r--r--llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/ARC/TargetInfo/ARCTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/ARM/A15SDOptimizer.cpp8
-rw-r--r--llvm/lib/Target/ARM/ARM.td73
-rw-r--r--llvm/lib/Target/ARM/ARMAsmPrinter.cpp13
-rw-r--r--llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp208
-rw-r--r--llvm/lib/Target/ARM/ARMBaseInstrInfo.h11
-rw-r--r--llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp9
-rw-r--r--llvm/lib/Target/ARM/ARMBlockPlacement.cpp108
-rw-r--r--llvm/lib/Target/ARM/ARMCallLowering.cpp19
-rw-r--r--llvm/lib/Target/ARM/ARMConstantIslandPass.cpp25
-rw-r--r--llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp124
-rw-r--r--llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp41
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp1268
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.h5
-rw-r--r--llvm/lib/Target/ARM/ARMInstrCDE.td12
-rw-r--r--llvm/lib/Target/ARM/ARMInstrFormats.td10
-rw-r--r--llvm/lib/Target/ARM/ARMInstrInfo.cpp14
-rw-r--r--llvm/lib/Target/ARM/ARMInstrInfo.td27
-rw-r--r--llvm/lib/Target/ARM/ARMInstrMVE.td965
-rw-r--r--llvm/lib/Target/ARM/ARMInstrNEON.td58
-rw-r--r--llvm/lib/Target/ARM/ARMInstrThumb.td3
-rw-r--r--llvm/lib/Target/ARM/ARMInstrThumb2.td14
-rw-r--r--llvm/lib/Target/ARM/ARMInstrVFP.td22
-rw-r--r--llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp8
-rw-r--r--llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp133
-rw-r--r--llvm/lib/Target/ARM/ARMMCInstLower.cpp2
-rw-r--r--llvm/lib/Target/ARM/ARMRegisterInfo.td8
-rw-r--r--llvm/lib/Target/ARM/ARMSubtarget.cpp9
-rw-r--r--llvm/lib/Target/ARM/ARMSubtarget.h24
-rw-r--r--llvm/lib/Target/ARM/ARMTargetMachine.cpp4
-rw-r--r--llvm/lib/Target/ARM/ARMTargetObjectFile.cpp10
-rw-r--r--llvm/lib/Target/ARM/ARMTargetObjectFile.h5
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp104
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.h14
-rw-r--r--llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp29
-rw-r--r--llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp33
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp19
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h21
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h4
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h6
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h4
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h8
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp5
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp2
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp1
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp195
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp4
-rw-r--r--llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp83
-rw-r--r--llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp22
-rw-r--r--llvm/lib/Target/ARM/MVETailPredication.cpp20
-rw-r--r--llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/ARM/Thumb1FrameLowering.cpp10
-rw-r--r--llvm/lib/Target/ARM/Thumb1InstrInfo.cpp4
-rw-r--r--llvm/lib/Target/ARM/Thumb2InstrInfo.cpp22
-rw-r--r--llvm/lib/Target/AVR/AVR.h4
-rw-r--r--llvm/lib/Target/AVR/AVR.td11
-rw-r--r--llvm/lib/Target/AVR/AVRAsmPrinter.cpp58
-rw-r--r--llvm/lib/Target/AVR/AVRCallingConv.td10
-rw-r--r--llvm/lib/Target/AVR/AVRDevices.td794
-rw-r--r--llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp574
-rw-r--r--llvm/lib/Target/AVR/AVRFrameLowering.cpp32
-rw-r--r--llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp57
-rw-r--r--llvm/lib/Target/AVR/AVRISelLowering.cpp90
-rw-r--r--llvm/lib/Target/AVR/AVRISelLowering.h7
-rw-r--r--llvm/lib/Target/AVR/AVRInstrFormats.td301
-rw-r--r--llvm/lib/Target/AVR/AVRInstrInfo.cpp64
-rw-r--r--llvm/lib/Target/AVR/AVRInstrInfo.h10
-rw-r--r--llvm/lib/Target/AVR/AVRInstrInfo.td2923
-rw-r--r--llvm/lib/Target/AVR/AVRMCInstLower.cpp8
-rw-r--r--llvm/lib/Target/AVR/AVRMCInstLower.h1
-rw-r--r--llvm/lib/Target/AVR/AVRMachineFunctionInfo.h12
-rw-r--r--llvm/lib/Target/AVR/AVRRegisterInfo.cpp31
-rw-r--r--llvm/lib/Target/AVR/AVRRegisterInfo.h9
-rw-r--r--llvm/lib/Target/AVR/AVRRegisterInfo.td233
-rw-r--r--llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp30
-rw-r--r--llvm/lib/Target/AVR/AVRSubtarget.cpp2
-rw-r--r--llvm/lib/Target/AVR/AVRSubtarget.h17
-rw-r--r--llvm/lib/Target/AVR/AVRTargetMachine.cpp5
-rw-r--r--llvm/lib/Target/AVR/AVRTargetMachine.h7
-rw-r--r--llvm/lib/Target/AVR/AVRTargetObjectFile.cpp7
-rw-r--r--llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp46
-rw-r--r--llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp137
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp34
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h4
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp12
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp3
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h4
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp4
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h1
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp26
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h3
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp2
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp11
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h2
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp10
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp19
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h2
-rw-r--r--llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp5
-rw-r--r--llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp2
-rw-r--r--llvm/lib/Target/BPF/BPF.h9
-rw-r--r--llvm/lib/Target/BPF/BPFAdjustOpt.cpp62
-rw-r--r--llvm/lib/Target/BPF/BPFAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp45
-rw-r--r--llvm/lib/Target/BPF/BPFIRPeephole.cpp118
-rw-r--r--llvm/lib/Target/BPF/BPFISelLowering.cpp24
-rw-r--r--llvm/lib/Target/BPF/BPFISelLowering.h4
-rw-r--r--llvm/lib/Target/BPF/BPFMIChecking.cpp2
-rw-r--r--llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp12
-rw-r--r--llvm/lib/Target/BPF/BPFRegisterInfo.td2
-rw-r--r--llvm/lib/Target/BPF/BPFSubtarget.cpp2
-rw-r--r--llvm/lib/Target/BPF/BPFTargetMachine.cpp11
-rw-r--r--llvm/lib/Target/BPF/BPFTargetTransformInfo.h17
-rw-r--r--llvm/lib/Target/BPF/BTF.def2
-rw-r--r--llvm/lib/Target/BPF/BTF.h2
-rw-r--r--llvm/lib/Target/BPF/BTFDebug.cpp169
-rw-r--r--llvm/lib/Target/BPF/BTFDebug.h28
-rw-r--r--llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp2
-rw-r--r--llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp6
-rw-r--r--llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp873
-rw-r--r--llvm/lib/Target/CSKY/CSKY.h (renamed from llvm/include/llvm/ExecutionEngine/OrcV1Deprecation.h)17
-rw-r--r--llvm/lib/Target/CSKY/CSKY.td87
-rw-r--r--llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp58
-rw-r--r--llvm/lib/Target/CSKY/CSKYAsmPrinter.h40
-rw-r--r--llvm/lib/Target/CSKY/CSKYCallingConv.h63
-rw-r--r--llvm/lib/Target/CSKY/CSKYCallingConv.td82
-rw-r--r--llvm/lib/Target/CSKY/CSKYFrameLowering.cpp57
-rw-r--r--llvm/lib/Target/CSKY/CSKYFrameLowering.h38
-rw-r--r--llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp75
-rw-r--r--llvm/lib/Target/CSKY/CSKYISelLowering.cpp346
-rw-r--r--llvm/lib/Target/CSKY/CSKYISelLowering.h69
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrFormats.td221
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td219
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrInfo.cpp25
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrInfo.h36
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrInfo.td644
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td452
-rw-r--r--llvm/lib/Target/CSKY/CSKYMCInstLower.cpp117
-rw-r--r--llvm/lib/Target/CSKY/CSKYMCInstLower.h35
-rw-r--r--llvm/lib/Target/CSKY/CSKYMachineFunctionInfo.h62
-rw-r--r--llvm/lib/Target/CSKY/CSKYRegisterInfo.cpp95
-rw-r--r--llvm/lib/Target/CSKY/CSKYRegisterInfo.h45
-rw-r--r--llvm/lib/Target/CSKY/CSKYRegisterInfo.td15
-rw-r--r--llvm/lib/Target/CSKY/CSKYSubtarget.cpp74
-rw-r--r--llvm/lib/Target/CSKY/CSKYSubtarget.h120
-rw-r--r--llvm/lib/Target/CSKY/CSKYTargetMachine.cpp41
-rw-r--r--llvm/lib/Target/CSKY/CSKYTargetMachine.h8
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp49
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h3
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYBaseInfo.h70
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYFixupKinds.h27
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp102
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.h17
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp98
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h47
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp35
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.h9
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h3
-rw-r--r--llvm/lib/Target/CSKY/TargetInfo/CSKYTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp14
-rw-r--r--llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp81
-rw-r--r--llvm/lib/Target/Hexagon/HexagonArch.h6
-rw-r--r--llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp14
-rw-r--r--llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp35
-rw-r--r--llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp5
-rw-r--r--llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp30
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td24
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td38
-rw-r--r--llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp14
-rw-r--r--llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp14
-rw-r--r--llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp18
-rw-r--r--llvm/lib/Target/Hexagon/HexagonGenInsert.cpp5
-rw-r--r--llvm/lib/Target/Hexagon/HexagonGenMux.cpp28
-rw-r--r--llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp20
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp6
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelLowering.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp59
-rw-r--r--llvm/lib/Target/Hexagon/HexagonInstrFormats.td3
-rw-r--r--llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp16
-rw-r--r--llvm/lib/Target/Hexagon/HexagonInstrInfo.h3
-rw-r--r--llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp13
-rw-r--r--llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp14
-rw-r--r--llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp8
-rw-r--r--llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp13
-rw-r--r--llvm/lib/Target/Hexagon/HexagonPseudo.td8
-rw-r--r--llvm/lib/Target/Hexagon/HexagonRegisterInfo.td187
-rw-r--r--llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp16
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp9
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetStreamer.h1
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp3
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h16
-rw-r--r--llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp24
-rw-r--r--llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp13
-rw-r--r--llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp7
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp11
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp5
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp2
-rw-r--r--llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp6
-rw-r--r--llvm/lib/Target/Lanai/LanaiAluCode.h2
-rw-r--r--llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp4
-rw-r--r--llvm/lib/Target/Lanai/LanaiISelDAGToDAG.cpp4
-rw-r--r--llvm/lib/Target/Lanai/LanaiISelLowering.cpp16
-rw-r--r--llvm/lib/Target/Lanai/LanaiISelLowering.h5
-rw-r--r--llvm/lib/Target/Lanai/LanaiInstrInfo.cpp19
-rw-r--r--llvm/lib/Target/Lanai/LanaiInstrInfo.h6
-rw-r--r--llvm/lib/Target/Lanai/LanaiInstrInfo.td6
-rw-r--r--llvm/lib/Target/Lanai/LanaiTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h13
-rw-r--r--llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp6
-rw-r--r--llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp5
-rw-r--r--llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp232
-rw-r--r--llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp5
-rw-r--r--llvm/lib/Target/M68k/GISel/M68kCallLowering.cpp (renamed from llvm/lib/Target/M68k/GlSel/M68kCallLowering.cpp)4
-rw-r--r--llvm/lib/Target/M68k/GISel/M68kCallLowering.h (renamed from llvm/lib/Target/M68k/GlSel/M68kCallLowering.h)2
-rw-r--r--llvm/lib/Target/M68k/GISel/M68kInstructionSelector.cpp (renamed from llvm/lib/Target/M68k/GlSel/M68kInstructionSelector.cpp)0
-rw-r--r--llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp (renamed from llvm/lib/Target/M68k/GlSel/M68kLegalizerInfo.cpp)0
-rw-r--r--llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h (renamed from llvm/lib/Target/M68k/GlSel/M68kLegalizerInfo.h)0
-rw-r--r--llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp105
-rw-r--r--llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h (renamed from llvm/lib/Target/M68k/GlSel/M68kRegisterBankInfo.h)6
-rw-r--r--llvm/lib/Target/M68k/GISel/M68kRegisterBanks.td (renamed from llvm/lib/Target/M68k/GlSel/M68kRegisterBanks.td)2
-rw-r--r--llvm/lib/Target/M68k/GlSel/M68kRegisterBankInfo.cpp27
-rw-r--r--llvm/lib/Target/M68k/M68k.td2
-rw-r--r--llvm/lib/Target/M68k/M68kAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/M68k/M68kCallingConv.h22
-rw-r--r--llvm/lib/Target/M68k/M68kFrameLowering.cpp8
-rw-r--r--llvm/lib/Target/M68k/M68kISelLowering.cpp17
-rw-r--r--llvm/lib/Target/M68k/M68kInstrArithmetic.td95
-rw-r--r--llvm/lib/Target/M68k/M68kInstrCompiler.td10
-rw-r--r--llvm/lib/Target/M68k/M68kInstrFormats.td2
-rw-r--r--llvm/lib/Target/M68k/M68kInstrInfo.cpp2
-rw-r--r--llvm/lib/Target/M68k/M68kInstrInfo.h2
-rw-r--r--llvm/lib/Target/M68k/M68kInstrInfo.td74
-rw-r--r--llvm/lib/Target/M68k/M68kSubtarget.cpp8
-rw-r--r--llvm/lib/Target/M68k/M68kTargetMachine.cpp12
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp8
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp4
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp2
-rw-r--r--llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp4
-rw-r--r--llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp6
-rw-r--r--llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp19
-rw-r--r--llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/MSP430/MSP430FrameLowering.cpp5
-rw-r--r--llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp2
-rw-r--r--llvm/lib/Target/MSP430/MSP430ISelLowering.cpp4
-rw-r--r--llvm/lib/Target/MSP430/MSP430InstrInfo.cpp5
-rw-r--r--llvm/lib/Target/MSP430/MSP430Subtarget.cpp2
-rw-r--r--llvm/lib/Target/MSP430/MSP430TargetMachine.cpp4
-rw-r--r--llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp2
-rw-r--r--llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp2
-rw-r--r--llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp10
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp3
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h3
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp2
-rw-r--r--llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td2
-rw-r--r--llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td30
-rw-r--r--llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td73
-rw-r--r--llvm/lib/Target/Mips/MicroMipsInstrInfo.td61
-rw-r--r--llvm/lib/Target/Mips/Mips16HardFloat.cpp13
-rw-r--r--llvm/lib/Target/Mips/Mips16InstrInfo.td12
-rw-r--r--llvm/lib/Target/Mips/Mips32r6InstrInfo.td7
-rw-r--r--llvm/lib/Target/Mips/MipsAsmPrinter.cpp4
-rw-r--r--llvm/lib/Target/Mips/MipsCallLowering.cpp31
-rw-r--r--llvm/lib/Target/Mips/MipsDSPInstrInfo.td45
-rw-r--r--llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp12
-rw-r--r--llvm/lib/Target/Mips/MipsEVAInstrInfo.td9
-rw-r--r--llvm/lib/Target/Mips/MipsFastISel.cpp4
-rw-r--r--llvm/lib/Target/Mips/MipsISelLowering.cpp7
-rw-r--r--llvm/lib/Target/Mips/MipsInstructionSelector.cpp4
-rw-r--r--llvm/lib/Target/Mips/MipsMSAInstrInfo.td50
-rw-r--r--llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp9
-rw-r--r--llvm/lib/Target/Mips/MipsSEISelLowering.cpp4
-rw-r--r--llvm/lib/Target/Mips/MipsSEInstrInfo.cpp2
-rw-r--r--llvm/lib/Target/Mips/MipsSubtarget.cpp10
-rw-r--r--llvm/lib/Target/Mips/MipsTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp4
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTX.td6
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp168
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp11
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp23
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp666
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp25
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp5
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp5
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.td10
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXIntrinsics.td6101
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp5
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXPeephole.cpp25
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp1
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp22
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h1
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td12
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp1677
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp27
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetMachine.h4
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp7
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h11
-rw-r--r--llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h2
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp5
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp9
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h7
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp7
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.cpp11
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.h7
-rw-r--r--llvm/lib/Target/PowerPC/P10InstrResources.td2075
-rw-r--r--llvm/lib/Target/PowerPC/P9InstrResources.td13
-rw-r--r--llvm/lib/Target/PowerPC/PPC.td30
-rw-r--r--llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp13
-rw-r--r--llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp5
-rw-r--r--llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp30
-rw-r--r--llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp10
-rw-r--r--llvm/lib/Target/PowerPC/PPCFastISel.cpp19
-rw-r--r--llvm/lib/Target/PowerPC/PPCFrameLowering.cpp44
-rw-r--r--llvm/lib/Target/PowerPC/PPCFrameLowering.h22
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp157
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.cpp610
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.h26
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstr64Bit.td123
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrAltivec.td4
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrFormats.td8
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrHTM.td8
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.cpp33
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.h5
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.td95
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrPrefix.td446
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrVSX.td147
-rw-r--r--llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp1066
-rw-r--r--llvm/lib/Target/PowerPC/PPCMIPeephole.cpp80
-rw-r--r--llvm/lib/Target/PowerPC/PPCMacroFusion.cpp19
-rw-r--r--llvm/lib/Target/PowerPC/PPCMacroFusion.def37
-rw-r--r--llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp25
-rw-r--r--llvm/lib/Target/PowerPC/PPCRegisterInfo.h2
-rw-r--r--llvm/lib/Target/PowerPC/PPCSchedPredicates.td294
-rw-r--r--llvm/lib/Target/PowerPC/PPCSchedule.td3
-rw-r--r--llvm/lib/Target/PowerPC/PPCScheduleP10.td416
-rw-r--r--llvm/lib/Target/PowerPC/PPCScheduleP9.td2
-rw-r--r--llvm/lib/Target/PowerPC/PPCSubtarget.cpp7
-rw-r--r--llvm/lib/Target/PowerPC/PPCSubtarget.h10
-rw-r--r--llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp4
-rw-r--r--llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp6
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetMachine.h2
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp8
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h9
-rw-r--r--llvm/lib/Target/PowerPC/PPCVSXCopy.cpp7
-rw-r--r--llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp6
-rw-r--r--llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp309
-rw-r--r--llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp15
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp5
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h3
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp14
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h37
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp2
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCObjectFileInfo.cpp22
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCObjectFileInfo.h27
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp13
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp149
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp66
-rw-r--r--llvm/lib/Target/RISCV/RISCV.h3
-rw-r--r--llvm/lib/Target/RISCV/RISCV.td132
-rw-r--r--llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp8
-rw-r--r--llvm/lib/Target/RISCV/RISCVFrameLowering.cpp41
-rw-r--r--llvm/lib/Target/RISCV/RISCVFrameLowering.h2
-rw-r--r--llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp475
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp488
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h31
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp1906
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.h58
-rw-r--r--llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp424
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrFormats.td142
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.cpp389
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.h35
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.td170
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoC.td7
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoD.td12
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoF.td26
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoM.td26
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoV.td1199
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td714
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td108
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td220
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZb.td (renamed from llvm/lib/Target/RISCV/RISCVInstrInfoB.td)145
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td40
-rw-r--r--llvm/lib/Target/RISCV/RISCVMCInstLower.cpp23
-rw-r--r--llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp19
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.td110
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedRocket.td3
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSiFive7.td5
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedule.td1
-rw-r--r--llvm/lib/Target/RISCV/RISCVScheduleV.td820
-rw-r--r--llvm/lib/Target/RISCV/RISCVSubtarget.cpp33
-rw-r--r--llvm/lib/Target/RISCV/RISCVSubtarget.h18
-rw-r--r--llvm/lib/Target/RISCV/RISCVSystemOperands.td1
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetMachine.cpp6
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp11
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h53
-rw-r--r--llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp2
-rw-r--r--llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp2
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp5
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/Sparc/SparcAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/Sparc/SparcISelLowering.cpp14
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrInfo.cpp2
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrInfo.td34
-rw-r--r--llvm/lib/Target/Sparc/SparcSubtarget.cpp2
-rw-r--r--llvm/lib/Target/Sparc/SparcTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp51
-rw-r--r--llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp2
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp24
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h5
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp35
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp52
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h10
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp41
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h2
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp2
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp64
-rw-r--r--llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp33
-rw-r--r--llvm/lib/Target/SystemZ/SystemZAsmPrinter.h33
-rw-r--r--llvm/lib/Target/SystemZ/SystemZCallingConv.cpp4
-rw-r--r--llvm/lib/Target/SystemZ/SystemZCallingConv.h71
-rw-r--r--llvm/lib/Target/SystemZ/SystemZCallingConv.td45
-rw-r--r--llvm/lib/Target/SystemZ/SystemZElimCompare.cpp7
-rw-r--r--llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp341
-rw-r--r--llvm/lib/Target/SystemZ/SystemZFrameLowering.h56
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelLowering.cpp263
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelLowering.h13
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrFP.td7
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrFormats.td49
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp39
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrInfo.h11
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrInfo.td36
-rw-r--r--llvm/lib/Target/SystemZ/SystemZLongBranch.cpp18
-rw-r--r--llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp19
-rw-r--r--llvm/lib/Target/SystemZ/SystemZOperators.td21
-rw-r--r--llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp8
-rw-r--r--llvm/lib/Target/SystemZ/SystemZRegisterInfo.h21
-rw-r--r--llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp110
-rw-r--r--llvm/lib/Target/SystemZ/SystemZShortenInst.cpp3
-rw-r--r--llvm/lib/Target/SystemZ/SystemZSubtarget.cpp4
-rw-r--r--llvm/lib/Target/SystemZ/SystemZSubtarget.h19
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp11
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTargetStreamer.h55
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp7
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h9
-rw-r--r--llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/TargetMachine.cpp64
-rw-r--r--llvm/lib/Target/TargetMachineC.cpp2
-rw-r--r--llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp2
-rw-r--r--llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp2
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp5
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp1
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp2
-rw-r--r--llvm/lib/Target/VE/VEAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/VE/VEISelLowering.cpp12
-rw-r--r--llvm/lib/Target/VE/VEInstrInfo.cpp6
-rw-r--r--llvm/lib/Target/VE/VEInstrInfo.td52
-rw-r--r--llvm/lib/Target/VE/VESubtarget.cpp2
-rw-r--r--llvm/lib/Target/VE/VETargetMachine.cpp2
-rw-r--r--llvm/lib/Target/VE/VVPInstrPatternsVec.td7
-rw-r--r--llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp14
-rw-r--r--llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp52
-rw-r--r--llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h18
-rw-r--r--llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp7
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h3
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/WebAssembly/README.txt8
-rw-r--r--llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h23
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssembly.h7
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssembly.td3
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp50
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp55
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp14
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp14
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp50
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp8
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp54
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISD.def9
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp85
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp562
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h45
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td38
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td15
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td57
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td8
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td226
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td39
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp15
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp905
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp84
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp48
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyMCLowerPrePass.cpp51
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp5
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp13
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp3
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp4
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp6
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp100
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp3
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h6
-rw-r--r--llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp231
-rw-r--r--llvm/lib/Target/X86/AsmParser/X86Operand.h3
-rw-r--r--llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp38
-rw-r--r--llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h13
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp48
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp36
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h13
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp3
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp18
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp48
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp17
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp33
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp1
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp2
-rw-r--r--llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp2
-rw-r--r--llvm/lib/Target/X86/X86.h4
-rw-r--r--llvm/lib/Target/X86/X86.td743
-rw-r--r--llvm/lib/Target/X86/X86AsmPrinter.cpp5
-rw-r--r--llvm/lib/Target/X86/X86AsmPrinter.h19
-rw-r--r--llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp20
-rw-r--r--llvm/lib/Target/X86/X86CallLowering.cpp4
-rw-r--r--llvm/lib/Target/X86/X86CallingConv.td88
-rw-r--r--llvm/lib/Target/X86/X86CmovConversion.cpp8
-rw-r--r--llvm/lib/Target/X86/X86DynAllocaExpander.cpp (renamed from llvm/lib/Target/X86/X86WinAllocaExpander.cpp)58
-rw-r--r--llvm/lib/Target/X86/X86ExpandPseudo.cpp50
-rw-r--r--llvm/lib/Target/X86/X86FastISel.cpp32
-rw-r--r--llvm/lib/Target/X86/X86FastTileConfig.cpp4
-rw-r--r--llvm/lib/Target/X86/X86FixupLEAs.cpp3
-rw-r--r--llvm/lib/Target/X86/X86FlagsCopyLowering.cpp6
-rw-r--r--llvm/lib/Target/X86/X86FloatingPoint.cpp41
-rw-r--r--llvm/lib/Target/X86/X86FrameLowering.cpp131
-rw-r--r--llvm/lib/Target/X86/X86ISelDAGToDAG.cpp271
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp3561
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.h83
-rw-r--r--llvm/lib/Target/X86/X86IndirectBranchTracking.cpp2
-rw-r--r--llvm/lib/Target/X86/X86IndirectThunks.cpp2
-rw-r--r--llvm/lib/Target/X86/X86InsertWait.cpp21
-rw-r--r--llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp8
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX512.td2246
-rw-r--r--llvm/lib/Target/X86/X86InstrArithmetic.td49
-rw-r--r--llvm/lib/Target/X86/X86InstrCompiler.td48
-rw-r--r--llvm/lib/Target/X86/X86InstrControl.td36
-rw-r--r--llvm/lib/Target/X86/X86InstrFMA.td46
-rw-r--r--llvm/lib/Target/X86/X86InstrFMA3Info.cpp46
-rw-r--r--llvm/lib/Target/X86/X86InstrFPStack.td2
-rw-r--r--llvm/lib/Target/X86/X86InstrFoldTables.cpp475
-rw-r--r--llvm/lib/Target/X86/X86InstrFormats.td74
-rw-r--r--llvm/lib/Target/X86/X86InstrFragmentsSIMD.td103
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.cpp754
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.h40
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.td16
-rw-r--r--llvm/lib/Target/X86/X86InstrKL.td7
-rw-r--r--llvm/lib/Target/X86/X86InstrMPX.td77
-rw-r--r--llvm/lib/Target/X86/X86InstrSSE.td239
-rw-r--r--llvm/lib/Target/X86/X86InstrSystem.td15
-rw-r--r--llvm/lib/Target/X86/X86InstrVecCompiler.td96
-rw-r--r--llvm/lib/Target/X86/X86InstructionSelector.cpp4
-rw-r--r--llvm/lib/Target/X86/X86IntrinsicsInfo.h231
-rw-r--r--llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp3
-rw-r--r--llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp2
-rw-r--r--llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp15
-rw-r--r--llvm/lib/Target/X86/X86LowerAMXType.cpp503
-rw-r--r--llvm/lib/Target/X86/X86LowerTileCopy.cpp4
-rw-r--r--llvm/lib/Target/X86/X86MCInstLower.cpp265
-rw-r--r--llvm/lib/Target/X86/X86MachineFunctionInfo.h15
-rw-r--r--llvm/lib/Target/X86/X86OptimizeLEAs.cpp9
-rw-r--r--llvm/lib/Target/X86/X86PadShortFunction.cpp9
-rw-r--r--llvm/lib/Target/X86/X86PfmCounters.td20
-rw-r--r--llvm/lib/Target/X86/X86PreTileConfig.cpp5
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.cpp8
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.td27
-rw-r--r--llvm/lib/Target/X86/X86SchedBroadwell.td266
-rw-r--r--llvm/lib/Target/X86/X86SchedHaswell.td64
-rw-r--r--llvm/lib/Target/X86/X86SchedIceLake.td2636
-rw-r--r--llvm/lib/Target/X86/X86SchedSandyBridge.td9
-rw-r--r--llvm/lib/Target/X86/X86SchedSkylakeClient.td9
-rw-r--r--llvm/lib/Target/X86/X86SchedSkylakeServer.td9
-rw-r--r--llvm/lib/Target/X86/X86Schedule.td25
-rw-r--r--llvm/lib/Target/X86/X86ScheduleAtom.td112
-rw-r--r--llvm/lib/Target/X86/X86ScheduleBdVer2.td7
-rw-r--r--llvm/lib/Target/X86/X86ScheduleBtVer2.td5
-rw-r--r--llvm/lib/Target/X86/X86ScheduleSLM.td153
-rw-r--r--llvm/lib/Target/X86/X86ScheduleZnver1.td41
-rw-r--r--llvm/lib/Target/X86/X86ScheduleZnver2.td41
-rw-r--r--llvm/lib/Target/X86/X86ScheduleZnver3.td35
-rw-r--r--llvm/lib/Target/X86/X86SelectionDAGInfo.cpp7
-rw-r--r--llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp2
-rw-r--r--llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp8
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.cpp20
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.h47
-rw-r--r--llvm/lib/Target/X86/X86TargetMachine.cpp7
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp961
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.h115
-rw-r--r--llvm/lib/Target/X86/X86VZeroUpper.cpp6
-rw-r--r--llvm/lib/Target/X86/X86WinEHState.cpp2
-rw-r--r--llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp2
-rw-r--r--llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/XCore/XCoreAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/XCore/XCoreISelLowering.cpp12
-rw-r--r--llvm/lib/Target/XCore/XCoreInstrInfo.cpp2
-rw-r--r--llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp7
-rw-r--r--llvm/lib/Target/XCore/XCoreSubtarget.cpp2
-rw-r--r--llvm/lib/Target/XCore/XCoreTargetMachine.cpp4
-rw-r--r--llvm/lib/TextAPI/TextStub.cpp4
-rw-r--r--llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp1
-rw-r--r--llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp22
-rw-r--r--llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h38
-rw-r--r--llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp92
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroCleanup.cpp20
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroEarly.cpp3
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroFrame.cpp85
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroInstr.h2
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroSplit.cpp61
-rw-r--r--llvm/lib/Transforms/Coroutines/Coroutines.cpp14
-rw-r--r--llvm/lib/Transforms/IPO/AlwaysInliner.cpp12
-rw-r--r--llvm/lib/Transforms/IPO/ArgumentPromotion.cpp44
-rw-r--r--llvm/lib/Transforms/IPO/Attributor.cpp194
-rw-r--r--llvm/lib/Transforms/IPO/AttributorAttributes.cpp683
-rw-r--r--llvm/lib/Transforms/IPO/ConstantMerge.cpp44
-rw-r--r--llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp43
-rw-r--r--llvm/lib/Transforms/IPO/ExtractGV.cpp29
-rw-r--r--llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp4
-rw-r--r--llvm/lib/Transforms/IPO/FunctionAttrs.cpp440
-rw-r--r--llvm/lib/Transforms/IPO/FunctionImport.cpp149
-rw-r--r--llvm/lib/Transforms/IPO/FunctionSpecialization.cpp332
-rw-r--r--llvm/lib/Transforms/IPO/GlobalDCE.cpp14
-rw-r--r--llvm/lib/Transforms/IPO/GlobalOpt.cpp368
-rw-r--r--llvm/lib/Transforms/IPO/GlobalSplit.cpp5
-rw-r--r--llvm/lib/Transforms/IPO/IROutliner.cpp977
-rw-r--r--llvm/lib/Transforms/IPO/Inliner.cpp262
-rw-r--r--llvm/lib/Transforms/IPO/Internalize.cpp30
-rw-r--r--llvm/lib/Transforms/IPO/LoopExtractor.cpp10
-rw-r--r--llvm/lib/Transforms/IPO/LowerTypeTests.cpp79
-rw-r--r--llvm/lib/Transforms/IPO/MergeFunctions.cpp10
-rw-r--r--llvm/lib/Transforms/IPO/ModuleInliner.cpp354
-rw-r--r--llvm/lib/Transforms/IPO/OpenMPOpt.cpp719
-rw-r--r--llvm/lib/Transforms/IPO/PartialInlining.cpp10
-rw-r--r--llvm/lib/Transforms/IPO/PassManagerBuilder.cpp7
-rw-r--r--llvm/lib/Transforms/IPO/SCCP.cpp2
-rw-r--r--llvm/lib/Transforms/IPO/SampleContextTracker.cpp164
-rw-r--r--llvm/lib/Transforms/IPO/SampleProfile.cpp226
-rw-r--r--llvm/lib/Transforms/IPO/SampleProfileProbe.cpp4
-rw-r--r--llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp15
-rw-r--r--llvm/lib/Transforms/IPO/StripSymbols.cpp4
-rw-r--r--llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp38
-rw-r--r--llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp21
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp88
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp1103
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp538
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp165
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp616
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineInternal.h24
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp48
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp44
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp14
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp26
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp226
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp549
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp76
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp175
-rw-r--r--llvm/lib/Transforms/InstCombine/InstructionCombining.cpp384
-rw-r--r--llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp187
-rw-r--r--llvm/lib/Transforms/Instrumentation/CGProfile.cpp2
-rw-r--r--llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp8
-rw-r--r--llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp483
-rw-r--r--llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp10
-rw-r--r--llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp384
-rw-r--r--llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp7
-rw-r--r--llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp136
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemProfiler.cpp18
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp121
-rw-r--r--llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp19
-rw-r--r--llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp104
-rw-r--r--llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp16
-rw-r--r--llvm/lib/Transforms/ObjCARC/ObjCARC.cpp35
-rw-r--r--llvm/lib/Transforms/ObjCARC/ObjCARC.h8
-rw-r--r--llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp15
-rw-r--r--llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp41
-rw-r--r--llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp14
-rw-r--r--llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp13
-rw-r--r--llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h3
-rw-r--r--llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp8
-rw-r--r--llvm/lib/Transforms/Scalar/ADCE.cpp6
-rw-r--r--llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/BDCE.cpp14
-rw-r--r--llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp5
-rw-r--r--llvm/lib/Transforms/Scalar/ConstantHoisting.cpp7
-rw-r--r--llvm/lib/Transforms/Scalar/ConstraintElimination.cpp35
-rw-r--r--llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp93
-rw-r--r--llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp36
-rw-r--r--llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp1063
-rw-r--r--llvm/lib/Transforms/Scalar/DivRemPairs.cpp7
-rw-r--r--llvm/lib/Transforms/Scalar/EarlyCSE.cpp20
-rw-r--r--llvm/lib/Transforms/Scalar/Float2Int.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/GVN.cpp234
-rw-r--r--llvm/lib/Transforms/Scalar/GVNHoist.cpp14
-rw-r--r--llvm/lib/Transforms/Scalar/GuardWidening.cpp86
-rw-r--r--llvm/lib/Transforms/Scalar/IndVarSimplify.cpp211
-rw-r--r--llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp190
-rw-r--r--llvm/lib/Transforms/Scalar/JumpThreading.cpp64
-rw-r--r--llvm/lib/Transforms/Scalar/LICM.cpp369
-rw-r--r--llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp126
-rw-r--r--llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/LoopDeletion.cpp55
-rw-r--r--llvm/lib/Transforms/Scalar/LoopDistribute.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/LoopFlatten.cpp270
-rw-r--r--llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp419
-rw-r--r--llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp21
-rw-r--r--llvm/lib/Transforms/Scalar/LoopInterchange.cpp18
-rw-r--r--llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp12
-rw-r--r--llvm/lib/Transforms/Scalar/LoopPassManager.cpp42
-rw-r--r--llvm/lib/Transforms/Scalar/LoopPredication.cpp160
-rw-r--r--llvm/lib/Transforms/Scalar/LoopRotation.cpp15
-rw-r--r--llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp16
-rw-r--r--llvm/lib/Transforms/Scalar/LoopSink.cpp9
-rw-r--r--llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp180
-rw-r--r--llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp340
-rw-r--r--llvm/lib/Transforms/Scalar/LoopUnswitch.cpp22
-rw-r--r--llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp14
-rw-r--r--llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp7
-rw-r--r--llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp26
-rw-r--r--llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp622
-rw-r--r--llvm/lib/Transforms/Scalar/MergeICmps.cpp204
-rw-r--r--llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp9
-rw-r--r--llvm/lib/Transforms/Scalar/NaryReassociate.cpp114
-rw-r--r--llvm/lib/Transforms/Scalar/NewGVN.cpp11
-rw-r--r--llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/Reassociate.cpp19
-rw-r--r--llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp48
-rw-r--r--llvm/lib/Transforms/Scalar/SCCP.cpp17
-rw-r--r--llvm/lib/Transforms/Scalar/SROA.cpp206
-rw-r--r--llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp8
-rw-r--r--llvm/lib/Transforms/Scalar/Scalarizer.cpp18
-rw-r--r--llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp13
-rw-r--r--llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp120
-rw-r--r--llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp19
-rw-r--r--llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp14
-rw-r--r--llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp3
-rw-r--r--llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp34
-rw-r--r--llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp29
-rw-r--r--llvm/lib/Transforms/Utils/BasicBlockUtils.cpp23
-rw-r--r--llvm/lib/Transforms/Utils/BuildLibCalls.cpp33
-rw-r--r--llvm/lib/Transforms/Utils/CallPromotionUtils.cpp26
-rw-r--r--llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp1
-rw-r--r--llvm/lib/Transforms/Utils/CloneFunction.cpp10
-rw-r--r--llvm/lib/Transforms/Utils/CodeExtractor.cpp126
-rw-r--r--llvm/lib/Transforms/Utils/CodeMoverUtils.cpp71
-rw-r--r--llvm/lib/Transforms/Utils/Debugify.cpp10
-rw-r--r--llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp14
-rw-r--r--llvm/lib/Transforms/Utils/Evaluator.cpp12
-rw-r--r--llvm/lib/Transforms/Utils/FixIrreducible.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/FlattenCFG.cpp4
-rw-r--r--llvm/lib/Transforms/Utils/FunctionComparator.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/GlobalStatus.cpp10
-rw-r--r--llvm/lib/Transforms/Utils/InjectTLIMappings.cpp6
-rw-r--r--llvm/lib/Transforms/Utils/InlineFunction.cpp202
-rw-r--r--llvm/lib/Transforms/Utils/LCSSA.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/Local.cpp150
-rw-r--r--llvm/lib/Transforms/Utils/LoopPeel.cpp198
-rw-r--r--llvm/lib/Transforms/Utils/LoopRotationUtils.cpp24
-rw-r--r--llvm/lib/Transforms/Utils/LoopSimplify.cpp13
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnroll.cpp19
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp440
-rw-r--r--llvm/lib/Transforms/Utils/LoopUtils.cpp313
-rw-r--r--llvm/lib/Transforms/Utils/LoopVersioning.cpp16
-rw-r--r--llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/LowerSwitch.cpp10
-rw-r--r--llvm/lib/Transforms/Utils/ModuleUtils.cpp7
-rw-r--r--llvm/lib/Transforms/Utils/PredicateInfo.cpp46
-rw-r--r--llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp3
-rw-r--r--llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp4
-rw-r--r--llvm/lib/Transforms/Utils/SCCPSolver.cpp30
-rw-r--r--llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp6
-rw-r--r--llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp188
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyCFG.cpp275
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyIndVar.cpp3
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp144
-rw-r--r--llvm/lib/Transforms/Utils/SplitModule.cpp29
-rw-r--r--llvm/lib/Transforms/Utils/SymbolRewriter.cpp8
-rw-r--r--llvm/lib/Transforms/Utils/VNCoercion.cpp27
-rw-r--r--llvm/lib/Transforms/Utils/ValueMapper.cpp67
-rw-r--r--llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp108
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp8
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h39
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp947
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp2406
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.cpp50
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h76
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp104
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp30
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanVerifier.h7
-rw-r--r--llvm/lib/Transforms/Vectorize/VectorCombine.cpp290
-rw-r--r--llvm/lib/WindowsManifest/WindowsManifestMerger.cpp8
-rw-r--r--llvm/lib/XRay/InstrumentationMap.cpp8
-rw-r--r--llvm/tools/bugpoint/CrashDebugger.cpp24
-rw-r--r--llvm/tools/bugpoint/OptimizerDriver.cpp4
-rw-r--r--llvm/tools/bugpoint/ToolRunner.cpp12
-rw-r--r--llvm/tools/llc/llc.cpp39
-rw-r--r--llvm/tools/lli/ChildTarget/ChildTarget.cpp95
-rw-r--r--llvm/tools/lli/ForwardingMemoryManager.h (renamed from llvm/tools/lli/RemoteJITUtils.h)62
-rw-r--r--llvm/tools/lli/lli.cpp68
-rw-r--r--llvm/tools/llvm-ar/llvm-ar.cpp8
-rw-r--r--llvm/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp11
-rw-r--r--llvm/tools/llvm-cov/CodeCoverage.cpp12
-rw-r--r--llvm/tools/llvm-cov/CoverageExporterLcov.cpp2
-rw-r--r--llvm/tools/llvm-cov/CoverageFilters.cpp2
-rw-r--r--llvm/tools/llvm-cxxdump/Error.cpp1
-rw-r--r--llvm/tools/llvm-cxxdump/llvm-cxxdump.cpp4
-rw-r--r--llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp29
-rw-r--r--llvm/tools/llvm-diff/lib/DiffConsumer.cpp (renamed from llvm/tools/llvm-diff/DiffConsumer.cpp)6
-rw-r--r--llvm/tools/llvm-diff/lib/DiffConsumer.h (renamed from llvm/tools/llvm-diff/DiffConsumer.h)1
-rw-r--r--llvm/tools/llvm-diff/lib/DiffLog.cpp (renamed from llvm/tools/llvm-diff/DiffLog.cpp)0
-rw-r--r--llvm/tools/llvm-diff/lib/DiffLog.h (renamed from llvm/tools/llvm-diff/DiffLog.h)0
-rw-r--r--llvm/tools/llvm-diff/lib/DifferenceEngine.cpp (renamed from llvm/tools/llvm-diff/DifferenceEngine.cpp)0
-rw-r--r--llvm/tools/llvm-diff/lib/DifferenceEngine.h (renamed from llvm/tools/llvm-diff/DifferenceEngine.h)0
-rw-r--r--llvm/tools/llvm-diff/llvm-diff.cpp4
-rw-r--r--llvm/tools/llvm-dwarfdump/Statistics.cpp331
-rw-r--r--llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp23
-rw-r--r--llvm/tools/llvm-dwp/llvm-dwp.cpp2
-rw-r--r--llvm/tools/llvm-lto/llvm-lto.cpp44
-rw-r--r--llvm/tools/llvm-lto2/llvm-lto2.cpp12
-rw-r--r--llvm/tools/llvm-mc/Disassembler.cpp4
-rw-r--r--llvm/tools/llvm-mc/llvm-mc.cpp4
-rw-r--r--llvm/tools/llvm-mca/CodeRegionGenerator.cpp2
-rw-r--r--llvm/tools/llvm-mca/CodeRegionGenerator.h6
-rw-r--r--llvm/tools/llvm-mca/PipelinePrinter.cpp1
-rw-r--r--llvm/tools/llvm-mca/PipelinePrinter.h2
-rw-r--r--llvm/tools/llvm-mca/Views/DispatchStatistics.h2
-rw-r--r--llvm/tools/llvm-mca/Views/InstructionView.h2
-rw-r--r--llvm/tools/llvm-mca/Views/RegisterFileStatistics.h2
-rw-r--r--llvm/tools/llvm-mca/Views/RetireControlUnitStatistics.h2
-rw-r--r--llvm/tools/llvm-mca/Views/SchedulerStatistics.h2
-rw-r--r--llvm/tools/llvm-mca/Views/SummaryView.h8
-rw-r--r--llvm/tools/llvm-mca/Views/TimelineView.cpp19
-rw-r--r--llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.cpp33
-rw-r--r--llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.h57
-rw-r--r--llvm/tools/llvm-mca/llvm-mca.cpp98
-rw-r--r--llvm/tools/llvm-nm/llvm-nm.cpp115
-rw-r--r--llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp19
-rw-r--r--llvm/tools/llvm-objcopy/COFF/Object.cpp2
-rw-r--r--llvm/tools/llvm-objcopy/COFF/Writer.cpp18
-rw-r--r--llvm/tools/llvm-objcopy/CommonConfig.h22
-rw-r--r--llvm/tools/llvm-objcopy/ConfigManager.cpp124
-rw-r--r--llvm/tools/llvm-objcopy/ELF/ELFConfig.h10
-rw-r--r--llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp133
-rw-r--r--llvm/tools/llvm-objcopy/ELF/Object.cpp164
-rw-r--r--llvm/tools/llvm-objcopy/ELF/Object.h34
-rw-r--r--llvm/tools/llvm-objcopy/MachO/MachOConfig.h24
-rw-r--r--llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.cpp45
-rw-r--r--llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.h47
-rw-r--r--llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp63
-rw-r--r--llvm/tools/llvm-objcopy/MachO/MachOObjcopy.h3
-rw-r--r--llvm/tools/llvm-objcopy/MachO/MachOReader.cpp38
-rw-r--r--llvm/tools/llvm-objcopy/MachO/MachOReader.h2
-rw-r--r--llvm/tools/llvm-objcopy/MachO/MachOWriter.cpp195
-rw-r--r--llvm/tools/llvm-objcopy/MachO/MachOWriter.h9
-rw-r--r--llvm/tools/llvm-objcopy/MachO/Object.cpp20
-rw-r--r--llvm/tools/llvm-objcopy/MachO/Object.h18
-rw-r--r--llvm/tools/llvm-objcopy/ObjcopyOpts.td11
-rw-r--r--llvm/tools/llvm-objdump/COFFDump.cpp205
-rw-r--r--llvm/tools/llvm-objdump/COFFDump.h2
-rw-r--r--llvm/tools/llvm-objdump/ELFDump.cpp2
-rw-r--r--llvm/tools/llvm-objdump/MachODump.cpp10
-rw-r--r--llvm/tools/llvm-objdump/ObjdumpOpts.td58
-rw-r--r--llvm/tools/llvm-objdump/XCOFFDump.cpp18
-rw-r--r--llvm/tools/llvm-objdump/XCOFFDump.h4
-rw-r--r--llvm/tools/llvm-objdump/llvm-objdump.cpp142
-rw-r--r--llvm/tools/llvm-objdump/llvm-objdump.h16
-rw-r--r--llvm/tools/llvm-pdbutil/ExplainOutputStyle.cpp2
-rw-r--r--llvm/tools/llvm-pdbutil/LinePrinter.cpp30
-rw-r--r--llvm/tools/llvm-pdbutil/LinePrinter.h10
-rw-r--r--llvm/tools/llvm-profdata/llvm-profdata.cpp157
-rw-r--r--llvm/tools/llvm-readobj/ARMEHABIPrinter.h2
-rw-r--r--llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp42
-rw-r--r--llvm/tools/llvm-readobj/ARMWinEHPrinter.h3
-rw-r--r--llvm/tools/llvm-readobj/COFFDumper.cpp29
-rw-r--r--llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h3
-rw-r--r--llvm/tools/llvm-readobj/ELFDumper.cpp241
-rw-r--r--llvm/tools/llvm-readobj/MachODumper.cpp30
-rw-r--r--llvm/tools/llvm-readobj/ObjDumper.cpp18
-rw-r--r--llvm/tools/llvm-readobj/ObjDumper.h5
-rw-r--r--llvm/tools/llvm-readobj/Opts.td20
-rw-r--r--llvm/tools/llvm-readobj/WasmDumper.cpp6
-rw-r--r--llvm/tools/llvm-readobj/Win64EHDumper.cpp47
-rw-r--r--llvm/tools/llvm-readobj/XCOFFDumper.cpp309
-rw-r--r--llvm/tools/llvm-readobj/llvm-readobj.cpp19
-rw-r--r--llvm/tools/llvm-readobj/llvm-readobj.h2
-rw-r--r--llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp74
-rw-r--r--llvm/tools/llvm-stress/llvm-stress.cpp6
-rw-r--r--llvm/tools/llvm-strings/llvm-strings.cpp4
-rw-r--r--llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp2
-rw-r--r--llvm/tools/llvm-tli-checker/Opts.td16
-rw-r--r--llvm/tools/llvm-tli-checker/llvm-tli-checker.cpp357
-rw-r--r--llvm/tools/llvm-xray/xray-color-helper.cpp8
-rw-r--r--llvm/tools/llvm-xray/xray-converter.cpp14
-rw-r--r--llvm/tools/llvm-xray/xray-extract.cpp13
-rw-r--r--llvm/tools/opt/NewPMDriver.cpp64
-rw-r--r--llvm/tools/opt/opt.cpp79
-rw-r--r--llvm/utils/TableGen/AsmMatcherEmitter.cpp16
-rw-r--r--llvm/utils/TableGen/AsmWriterEmitter.cpp5
-rw-r--r--llvm/utils/TableGen/CodeEmitterGen.cpp4
-rw-r--r--llvm/utils/TableGen/CodeGenDAGPatterns.cpp31
-rw-r--r--llvm/utils/TableGen/CodeGenDAGPatterns.h7
-rw-r--r--llvm/utils/TableGen/CodeGenMapTable.cpp15
-rw-r--r--llvm/utils/TableGen/CodeGenRegisters.cpp23
-rw-r--r--llvm/utils/TableGen/CodeGenRegisters.h1
-rw-r--r--llvm/utils/TableGen/CodeGenTarget.cpp1
-rw-r--r--llvm/utils/TableGen/CompressInstEmitter.cpp (renamed from llvm/utils/TableGen/RISCVCompressInstEmitter.cpp)172
-rw-r--r--llvm/utils/TableGen/GlobalISelEmitter.cpp10
-rw-r--r--llvm/utils/TableGen/IntrinsicEmitter.cpp4
-rw-r--r--llvm/utils/TableGen/PredicateExpander.cpp2
-rw-r--r--llvm/utils/TableGen/RegisterInfoEmitter.cpp1
-rw-r--r--llvm/utils/TableGen/X86DisassemblerTables.cpp2
-rw-r--r--llvm/utils/TableGen/X86DisassemblerTables.h4
-rw-r--r--llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp4
-rw-r--r--llvm/utils/TableGen/X86FoldTablesEmitter.cpp6
-rw-r--r--llvm/utils/TableGen/X86RecognizableInstr.cpp27
-rw-r--r--llvm/utils/TableGen/X86RecognizableInstr.h3
2208 files changed, 127227 insertions, 62449 deletions
diff --git a/llvm/include/llvm-c/Comdat.h b/llvm/include/llvm-c/Comdat.h
index 81cde1107fa4..8002bc0581af 100644
--- a/llvm/include/llvm-c/Comdat.h
+++ b/llvm/include/llvm-c/Comdat.h
@@ -19,6 +19,13 @@
LLVM_C_EXTERN_C_BEGIN
+/**
+ * @defgroup LLVMCCoreComdat Comdats
+ * @ingroup LLVMCCore
+ *
+ * @{
+ */
+
typedef enum {
LLVMAnyComdatSelectionKind, ///< The linker may choose any COMDAT.
LLVMExactMatchComdatSelectionKind, ///< The data referenced by the COMDAT must
@@ -66,6 +73,10 @@ LLVMComdatSelectionKind LLVMGetComdatSelectionKind(LLVMComdatRef C);
*/
void LLVMSetComdatSelectionKind(LLVMComdatRef C, LLVMComdatSelectionKind Kind);
+/**
+ * @}
+ */
+
LLVM_C_EXTERN_C_END
#endif
diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h
index 1a5e763cfc60..d170eff17951 100644
--- a/llvm/include/llvm-c/Core.h
+++ b/llvm/include/llvm-c/Core.h
@@ -1580,10 +1580,10 @@ LLVMTypeRef LLVMX86AMXType(void);
macro(ConstantVector) \
macro(GlobalValue) \
macro(GlobalAlias) \
- macro(GlobalIFunc) \
macro(GlobalObject) \
macro(Function) \
macro(GlobalVariable) \
+ macro(GlobalIFunc) \
macro(UndefValue) \
macro(PoisonValue) \
macro(Instruction) \
@@ -3287,7 +3287,7 @@ void LLVMSetInstructionCallConv(LLVMValueRef Instr, unsigned CC);
*/
unsigned LLVMGetInstructionCallConv(LLVMValueRef Instr);
-void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index,
+void LLVMSetInstrParamAlignment(LLVMValueRef Instr, LLVMAttributeIndex Idx,
unsigned Align);
void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
@@ -3611,11 +3611,21 @@ void LLVMSetCurrentDebugLocation2(LLVMBuilderRef Builder, LLVMMetadataRef Loc);
* current debug location for the given builder. If the builder has no current
* debug location, this function is a no-op.
*
+ * @deprecated LLVMSetInstDebugLocation is deprecated in favor of the more general
+ * LLVMAddMetadataToInst.
+ *
* @see llvm::IRBuilder::SetInstDebugLocation()
*/
void LLVMSetInstDebugLocation(LLVMBuilderRef Builder, LLVMValueRef Inst);
/**
+ * Adds the metadata registered with the given builder to the given instruction.
+ *
+ * @see llvm::IRBuilder::AddMetadataToInst()
+ */
+void LLVMAddMetadataToInst(LLVMBuilderRef Builder, LLVMValueRef Inst);
+
+/**
 * Get the default floating-point math metadata for a given builder.
*
* @see llvm::IRBuilder::getDefaultFPMathTag()
@@ -4081,6 +4091,7 @@ void LLVMDisposeMemoryBuffer(LLVMMemoryBufferRef MemBuf);
/**
* @defgroup LLVMCCorePassRegistry Pass Registry
+ * @ingroup LLVMCCore
*
* @{
*/
@@ -4095,6 +4106,7 @@ LLVMPassRegistryRef LLVMGetGlobalPassRegistry(void);
/**
* @defgroup LLVMCCorePassManagers Pass Managers
+ * @ingroup LLVMCCore
*
* @{
*/
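A minimal usage sketch, not part of the patch, of the new LLVMAddMetadataToInst entry point declared above as the replacement for the now-deprecated LLVMSetInstDebugLocation; the builder and instruction handles are assumed to come from the caller:

    #include "llvm-c/Core.h"

    /* Attach the builder's registered metadata (including its current debug
     * location) to a freshly created instruction. */
    static void attachBuilderMetadata(LLVMBuilderRef Builder, LLVMValueRef Inst) {
      /* Previously: LLVMSetInstDebugLocation(Builder, Inst); */
      LLVMAddMetadataToInst(Builder, Inst);
    }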
diff --git a/llvm/include/llvm-c/DebugInfo.h b/llvm/include/llvm-c/DebugInfo.h
index 8c085807914b..d7fb898b60d2 100644
--- a/llvm/include/llvm-c/DebugInfo.h
+++ b/llvm/include/llvm-c/DebugInfo.h
@@ -22,6 +22,13 @@
LLVM_C_EXTERN_C_BEGIN
/**
+ * @defgroup LLVMCCoreDebugInfo Debug Information
+ * @ingroup LLVMCCore
+ *
+ * @{
+ */
+
+/**
* Debug info flags.
*/
typedef enum {
@@ -227,6 +234,13 @@ void LLVMDisposeDIBuilder(LLVMDIBuilderRef Builder);
void LLVMDIBuilderFinalize(LLVMDIBuilderRef Builder);
/**
+ * Finalize a specific subprogram.
+ * No new variables may be added to this subprogram afterwards.
+ */
+void LLVMDIBuilderFinalizeSubprogram(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef Subprogram);
+
+/**
* A CompileUnit provides an anchor for all debugging
* information generated during this instance of compilation.
* \param Lang Source programming language, eg.
@@ -389,48 +403,48 @@ LLVMDIBuilderCreateImportedModuleFromNamespace(LLVMDIBuilderRef Builder,
* \param ImportedEntity Previous imported entity to alias.
* \param File File where the declaration is located.
* \param Line Line number of the declaration.
+ * \param Elements Renamed elements.
+ * \param NumElements Number of renamed elements.
*/
-LLVMMetadataRef
-LLVMDIBuilderCreateImportedModuleFromAlias(LLVMDIBuilderRef Builder,
- LLVMMetadataRef Scope,
- LLVMMetadataRef ImportedEntity,
- LLVMMetadataRef File,
- unsigned Line);
+LLVMMetadataRef LLVMDIBuilderCreateImportedModuleFromAlias(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope,
+ LLVMMetadataRef ImportedEntity, LLVMMetadataRef File, unsigned Line,
+ LLVMMetadataRef *Elements, unsigned NumElements);
/**
* Create a descriptor for an imported module.
- * \param Builder The \c DIBuilder.
- * \param Scope The scope this module is imported into
- * \param M The module being imported here
- * \param File File where the declaration is located.
- * \param Line Line number of the declaration.
+ * \param Builder The \c DIBuilder.
+ * \param Scope The scope this module is imported into
+ * \param M The module being imported here
+ * \param File File where the declaration is located.
+ * \param Line Line number of the declaration.
+ * \param Elements Renamed elements.
+ * \param NumElements Number of renamed elements.
*/
-LLVMMetadataRef
-LLVMDIBuilderCreateImportedModuleFromModule(LLVMDIBuilderRef Builder,
- LLVMMetadataRef Scope,
- LLVMMetadataRef M,
- LLVMMetadataRef File,
- unsigned Line);
+LLVMMetadataRef LLVMDIBuilderCreateImportedModuleFromModule(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, LLVMMetadataRef M,
+ LLVMMetadataRef File, unsigned Line, LLVMMetadataRef *Elements,
+ unsigned NumElements);
/**
* Create a descriptor for an imported function, type, or variable. Suitable
* for e.g. FORTRAN-style USE declarations.
- * \param Builder The DIBuilder.
- * \param Scope The scope this module is imported into.
- * \param Decl The declaration (or definition) of a function, type,
- or variable.
- * \param File File where the declaration is located.
- * \param Line Line number of the declaration.
- * \param Name A name that uniquely identifies this imported declaration.
- * \param NameLen The length of the C string passed to \c Name.
+ * \param Builder The DIBuilder.
+ * \param Scope The scope this module is imported into.
+ * \param Decl The declaration (or definition) of a function, type,
+ or variable.
+ * \param File File where the declaration is located.
+ * \param Line Line number of the declaration.
+ * \param Name A name that uniquely identifies this imported
+ declaration.
+ * \param NameLen The length of the C string passed to \c Name.
+ * \param Elements Renamed elements.
+ * \param NumElements Number of renamed elements.
*/
-LLVMMetadataRef
-LLVMDIBuilderCreateImportedDeclaration(LLVMDIBuilderRef Builder,
- LLVMMetadataRef Scope,
- LLVMMetadataRef Decl,
- LLVMMetadataRef File,
- unsigned Line,
- const char *Name, size_t NameLen);
+LLVMMetadataRef LLVMDIBuilderCreateImportedDeclaration(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, LLVMMetadataRef Decl,
+ LLVMMetadataRef File, unsigned Line, const char *Name, size_t NameLen,
+ LLVMMetadataRef *Elements, unsigned NumElements);
/**
* Creates a new DebugLocation that describes a source location.
@@ -1360,6 +1374,10 @@ void LLVMInstructionSetDebugLoc(LLVMValueRef Inst, LLVMMetadataRef Loc);
*/
LLVMMetadataKind LLVMGetMetadataKind(LLVMMetadataRef Metadata);
+/**
+ * @}
+ */
+
LLVM_C_EXTERN_C_END
#endif
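A hedged sketch, not from the patch, of how existing callers adapt to the widened signatures above: the new Elements/NumElements parameters accept NULL and 0 when no elements are renamed, and a single subprogram can now be finalized on its own. DIB, Scope, Decl, File and SP are assumed to be valid handles:

    #include "llvm-c/DebugInfo.h"
    #include <stddef.h>

    static LLVMMetadataRef importPlainDecl(LLVMDIBuilderRef DIB,
                                           LLVMMetadataRef Scope,
                                           LLVMMetadataRef Decl,
                                           LLVMMetadataRef File, unsigned Line) {
      /* No renamed elements, so pass NULL/0 for the new trailing parameters. */
      return LLVMDIBuilderCreateImportedDeclaration(DIB, Scope, Decl, File, Line,
                                                    "decl", 4, NULL, 0);
    }

    static void finishOneSubprogram(LLVMDIBuilderRef DIB, LLVMMetadataRef SP) {
      /* New in this patch: finalize a single subprogram without finalizing
       * the whole DIBuilder. */
      LLVMDIBuilderFinalizeSubprogram(DIB, SP);
    }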
diff --git a/llvm/include/llvm-c/DisassemblerTypes.h b/llvm/include/llvm-c/DisassemblerTypes.h
index ae5c68227594..53baaef11033 100644
--- a/llvm/include/llvm-c/DisassemblerTypes.h
+++ b/llvm/include/llvm-c/DisassemblerTypes.h
@@ -18,6 +18,12 @@
#endif
/**
+ * @addtogroup LLVMCDisassembler
+ *
+ * @{
+ */
+
+/**
* An opaque reference to a disassembler context.
*/
typedef void *LLVMDisasmContextRef;
@@ -157,4 +163,8 @@ typedef const char *(*LLVMSymbolLookupCallback)(void *DisInfo,
/* The output reference is to a C++ symbol name. */
#define LLVMDisassembler_ReferenceType_DeMangled_Name 9
+/**
+ * @}
+ */
+
#endif
diff --git a/llvm/include/llvm-c/Error.h b/llvm/include/llvm-c/Error.h
index bc702ac7a1bf..c3baaf65186a 100644
--- a/llvm/include/llvm-c/Error.h
+++ b/llvm/include/llvm-c/Error.h
@@ -18,6 +18,13 @@
LLVM_C_EXTERN_C_BEGIN
+/**
+ * @defgroup LLVMCError Error Handling
+ * @ingroup LLVMC
+ *
+ * @{
+ */
+
#define LLVMErrorSuccess 0
/**
@@ -67,6 +74,10 @@ LLVMErrorTypeId LLVMGetStringErrorTypeId(void);
*/
LLVMErrorRef LLVMCreateStringError(const char *ErrMsg);
+/**
+ * @}
+ */
+
LLVM_C_EXTERN_C_END
#endif
diff --git a/llvm/include/llvm-c/ErrorHandling.h b/llvm/include/llvm-c/ErrorHandling.h
index 5ba099c209c0..d9b9f22752b8 100644
--- a/llvm/include/llvm-c/ErrorHandling.h
+++ b/llvm/include/llvm-c/ErrorHandling.h
@@ -18,6 +18,12 @@
LLVM_C_EXTERN_C_BEGIN
+/**
+ * @addtogroup LLVMCError
+ *
+ * @{
+ */
+
typedef void (*LLVMFatalErrorHandler)(const char *Reason);
/**
@@ -42,6 +48,10 @@ void LLVMResetFatalErrorHandler(void);
*/
void LLVMEnablePrettyStackTrace(void);
+/**
+ * @}
+ */
+
LLVM_C_EXTERN_C_END
#endif
diff --git a/llvm/include/llvm-c/IRReader.h b/llvm/include/llvm-c/IRReader.h
index 5a3f633c3d91..905b84fa5a86 100644
--- a/llvm/include/llvm-c/IRReader.h
+++ b/llvm/include/llvm-c/IRReader.h
@@ -20,6 +20,13 @@
LLVM_C_EXTERN_C_BEGIN
/**
+ * @defgroup LLVMCCoreIRReader IR Reader
+ * @ingroup LLVMCCore
+ *
+ * @{
+ */
+
+/**
* Read LLVM IR from a memory buffer and convert it into an in-memory Module
* object. Returns 0 on success.
* Optionally returns a human-readable description of any errors that
@@ -32,6 +39,10 @@ LLVMBool LLVMParseIRInContext(LLVMContextRef ContextRef,
LLVMMemoryBufferRef MemBuf, LLVMModuleRef *OutM,
char **OutMessage);
+/**
+ * @}
+ */
+
LLVM_C_EXTERN_C_END
#endif
diff --git a/llvm/include/llvm-c/LLJIT.h b/llvm/include/llvm-c/LLJIT.h
index f689ca0f1cf0..a06133aac4fb 100644
--- a/llvm/include/llvm-c/LLJIT.h
+++ b/llvm/include/llvm-c/LLJIT.h
@@ -32,6 +32,13 @@
LLVM_C_EXTERN_C_BEGIN
/**
+ * @defgroup LLVMCExecutionEngineLLJIT LLJIT
+ * @ingroup LLVMCExecutionEngine
+ *
+ * @{
+ */
+
+/**
* A function for constructing an ObjectLinkingLayer instance to be used
* by an LLJIT instance.
*
@@ -235,6 +242,10 @@ LLVMOrcIRTransformLayerRef LLVMOrcLLJITGetIRTransformLayer(LLVMOrcLLJITRef J);
*/
const char *LLVMOrcLLJITGetDataLayoutStr(LLVMOrcLLJITRef J);
+/**
+ * @}
+ */
+
LLVM_C_EXTERN_C_END
#endif /* LLVM_C_LLJIT_H */
diff --git a/llvm/include/llvm-c/Linker.h b/llvm/include/llvm-c/Linker.h
index 1ad9cc958753..acff5d5e2225 100644
--- a/llvm/include/llvm-c/Linker.h
+++ b/llvm/include/llvm-c/Linker.h
@@ -19,6 +19,13 @@
LLVM_C_EXTERN_C_BEGIN
+/**
+ * @defgroup LLVMCCoreLinker Linker
+ * @ingroup LLVMCCore
+ *
+ * @{
+ */
+
/* This enum is provided for backwards-compatibility only. It has no effect. */
typedef enum {
LLVMLinkerDestroySource = 0, /* This is the default behavior. */
@@ -35,4 +42,8 @@ LLVMBool LLVMLinkModules2(LLVMModuleRef Dest, LLVMModuleRef Src);
LLVM_C_EXTERN_C_END
+/**
+ * @}
+ */
+
#endif
diff --git a/llvm/include/llvm-c/Orc.h b/llvm/include/llvm-c/Orc.h
index 1790afbcecc7..e2f30b7cdf45 100644
--- a/llvm/include/llvm-c/Orc.h
+++ b/llvm/include/llvm-c/Orc.h
@@ -34,6 +34,13 @@
LLVM_C_EXTERN_C_BEGIN
/**
+ * @defgroup LLVMCExecutionEngineORC On-Request-Compilation
+ * @ingroup LLVMCExecutionEngine
+ *
+ * @{
+ */
+
+/**
* Represents an address in the executor process.
*/
typedef uint64_t LLVMOrcJITTargetAddress;
@@ -921,6 +928,49 @@ LLVMErrorRef LLVMOrcCreateDynamicLibrarySearchGeneratorForProcess(
LLVMOrcSymbolPredicate Filter, void *FilterCtx);
/**
+ * Get a dynamic library search generator that will reflect library symbols
+ * into the JITDylib. On success the resulting generator is owned by the
+ * client. Ownership is typically transferred by adding the instance to a
+ * JITDylib using LLVMOrcJITDylibAddGenerator.
+ *
+ * The GlobalPrefix argument specifies the character that appears on the front
+ * of linker-mangled symbols for the target platform (e.g. '_' on MachO).
+ * If non-null, this character will be stripped from the start of all symbol
+ * strings before passing the remaining substring to dlsym.
+ *
+ * The optional Filter and Ctx arguments can be used to supply a symbol name
+ * filter: Only symbols for which the filter returns true will be visible to
+ * JIT'd code. If the Filter argument is null then all library symbols will
+ * be visible to JIT'd code. Note that the symbol name passed to the Filter
+ * function is the full mangled symbol: The client is responsible for stripping
+ * the global prefix if present.
+ *
+ * THIS API IS EXPERIMENTAL AND LIKELY TO CHANGE IN THE NEAR FUTURE!
+ *
+ */
+LLVMErrorRef LLVMOrcCreateDynamicLibrarySearchGeneratorForPath(
+ LLVMOrcDefinitionGeneratorRef *Result, const char *FileName,
+ char GlobalPrefix, LLVMOrcSymbolPredicate Filter, void *FilterCtx);
+
+/**
+ * Get a static library search generator that will reflect static library
+ * symbols into the JITDylib. On success the resulting generator is owned by
+ * the client. Ownership is typically transferred by adding the instance to a
+ * JITDylib using LLVMOrcJITDylibAddGenerator.
+ *
+ * Calling with the optional TargetTriple argument will succeed if the file at
+ * the given path is a static library or a MachO universal binary containing a
+ * static library that is compatible with the given triple. Otherwise it will
+ * return an error.
+ *
+ * THIS API IS EXPERIMENTAL AND LIKELY TO CHANGE IN THE NEAR FUTURE!
+ *
+ */
+LLVMErrorRef LLVMOrcCreateStaticLibrarySearchGeneratorForPath(
+ LLVMOrcDefinitionGeneratorRef *Result, LLVMOrcObjectLayerRef ObjLayer,
+ const char *FileName, const char *TargetTriple);
+
+/**
* Create a ThreadSafeContext containing a new LLVMContext.
*
* Ownership of the underlying ThreadSafeContext data is shared: Clients
@@ -1133,6 +1183,10 @@ void LLVMOrcDisposeDumpObjects(LLVMOrcDumpObjectsRef DumpObjects);
LLVMErrorRef LLVMOrcDumpObjects_CallOperator(LLVMOrcDumpObjectsRef DumpObjects,
LLVMMemoryBufferRef *ObjBuffer);
+/**
+ * @}
+ */
+
LLVM_C_EXTERN_C_END
#endif /* LLVM_C_ORC_H */
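An illustrative sketch, not part of the patch, of the new path-based generator: it creates a generator for a shared library and hands ownership to a JITDylib via LLVMOrcJITDylibAddGenerator, reporting failures with the error helpers from llvm-c/Error.h. The dylib handle and library path are assumed to come from the caller:

    #include "llvm-c/Orc.h"
    #include <stdio.h>

    /* GlobalPrefix would be '_' on MachO and '\0' elsewhere; no symbol filter
     * is installed here, so all library symbols become visible to JIT'd code. */
    static int addLibraryToDylib(LLVMOrcJITDylibRef JD, const char *Path,
                                 char GlobalPrefix) {
      LLVMOrcDefinitionGeneratorRef Gen;
      LLVMErrorRef Err = LLVMOrcCreateDynamicLibrarySearchGeneratorForPath(
          &Gen, Path, GlobalPrefix, /* Filter */ NULL, /* FilterCtx */ NULL);
      if (Err) {
        char *Msg = LLVMGetErrorMessage(Err);
        fprintf(stderr, "generator creation failed: %s\n", Msg);
        LLVMDisposeErrorMessage(Msg);
        return 1;
      }
      LLVMOrcJITDylibAddGenerator(JD, Gen); /* JD now owns the generator. */
      return 0;
    }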
diff --git a/llvm/include/llvm-c/OrcEE.h b/llvm/include/llvm-c/OrcEE.h
index 2435e7421a42..e7ae0f5e6be2 100644
--- a/llvm/include/llvm-c/OrcEE.h
+++ b/llvm/include/llvm-c/OrcEE.h
@@ -33,6 +33,13 @@
LLVM_C_EXTERN_C_BEGIN
/**
+ * @defgroup LLVMCExecutionEngineORCEE ExecutionEngine-based ORC Utils
+ * @ingroup LLVMCExecutionEngine
+ *
+ * @{
+ */
+
+/**
* Create a RTDyldObjectLinkingLayer instance using the standard
* SectionMemoryManager for memory management.
*/
@@ -50,6 +57,10 @@ void LLVMOrcRTDyldObjectLinkingLayerRegisterJITEventListener(
LLVMOrcObjectLayerRef RTDyldObjLinkingLayer,
LLVMJITEventListenerRef Listener);
+/**
+ * @}
+ */
+
LLVM_C_EXTERN_C_END
#endif /* LLVM_C_ORCEE_H */
diff --git a/llvm/include/llvm-c/Support.h b/llvm/include/llvm-c/Support.h
index 866df32efa98..17657861b32b 100644
--- a/llvm/include/llvm-c/Support.h
+++ b/llvm/include/llvm-c/Support.h
@@ -21,6 +21,12 @@
LLVM_C_EXTERN_C_BEGIN
/**
+ * @addtogroup LLVMCCore
+ *
+ * @{
+ */
+
+/**
* This function permanently loads the dynamic library at the given path.
* It is safe to call this function multiple times for the same library.
*
@@ -57,6 +63,10 @@ void *LLVMSearchForAddressOfSymbol(const char *symbolName);
*/
void LLVMAddSymbol(const char *symbolName, void *symbolValue);
+/**
+ * @}
+ */
+
LLVM_C_EXTERN_C_END
#endif
diff --git a/llvm/include/llvm-c/TargetMachine.h b/llvm/include/llvm-c/TargetMachine.h
index f82edd948b59..23c8c63ff0b4 100644
--- a/llvm/include/llvm-c/TargetMachine.h
+++ b/llvm/include/llvm-c/TargetMachine.h
@@ -25,6 +25,12 @@
LLVM_C_EXTERN_C_BEGIN
+/**
+ * @addtogroup LLVMCTarget
+ *
+ * @{
+ */
+
typedef struct LLVMOpaqueTargetMachine *LLVMTargetMachineRef;
typedef struct LLVMTarget *LLVMTargetRef;
@@ -156,6 +162,10 @@ char* LLVMGetHostCPUFeatures(void);
/** Adds the target-specific analysis passes to the pass manager. */
void LLVMAddAnalysisPasses(LLVMTargetMachineRef T, LLVMPassManagerRef PM);
+/**
+ * @}
+ */
+
LLVM_C_EXTERN_C_END
#endif
diff --git a/llvm/include/llvm-c/Transforms/PassBuilder.h b/llvm/include/llvm-c/Transforms/PassBuilder.h
index 5635f10d6877..6d9f1b45c707 100644
--- a/llvm/include/llvm-c/Transforms/PassBuilder.h
+++ b/llvm/include/llvm-c/Transforms/PassBuilder.h
@@ -18,6 +18,13 @@
#include "llvm-c/TargetMachine.h"
#include "llvm-c/Types.h"
+/**
+ * @defgroup LLVMCCoreNewPM New Pass Manager
+ * @ingroup LLVMCCore
+ *
+ * @{
+ */
+
LLVM_C_EXTERN_C_BEGIN
/**
@@ -50,7 +57,7 @@ LLVMErrorRef LLVMRunPasses(LLVMModuleRef M, const char *Passes,
* responsible for it. The client should call LLVMDisposePassBuilderOptions
* to free the pass builder options.
*/
-LLVMPassBuilderOptionsRef LLVMCreatePassBuilderOptions();
+LLVMPassBuilderOptionsRef LLVMCreatePassBuilderOptions(void);
/**
* Toggle adding the VerifierPass for the PassBuilder, ensuring all functions
@@ -97,6 +104,10 @@ void LLVMPassBuilderOptionsSetMergeFunctions(LLVMPassBuilderOptionsRef Options,
*/
void LLVMDisposePassBuilderOptions(LLVMPassBuilderOptionsRef Options);
+/**
+ * @}
+ */
+
LLVM_C_EXTERN_C_END
#endif // LLVM_C_TRANSFORMS_PASSBUILDER_H
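A short sketch, not part of the patch, of the intended lifetime of the options object around the now-explicitly-void prototype above; it assumes the LLVMRunPasses declaration partially visible in this hunk also takes a target machine (which may be NULL) and the options handle:

    #include "llvm-c/Transforms/PassBuilder.h"

    /* Run a default -O2 pipeline over M and return any error to the caller.
     * The options object must be disposed by whoever created it. */
    static LLVMErrorRef runDefaultO2(LLVMModuleRef M, LLVMTargetMachineRef TM) {
      LLVMPassBuilderOptionsRef Opts = LLVMCreatePassBuilderOptions();
      LLVMErrorRef Err = LLVMRunPasses(M, "default<O2>", TM, Opts);
      LLVMDisposePassBuilderOptions(Opts);
      return Err;
    }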
diff --git a/llvm/include/llvm-c/lto.h b/llvm/include/llvm-c/lto.h
index f6fc8588f5f7..5ceb02224d2b 100644
--- a/llvm/include/llvm-c/lto.h
+++ b/llvm/include/llvm-c/lto.h
@@ -46,7 +46,7 @@ typedef bool lto_bool_t;
* @{
*/
-#define LTO_API_VERSION 28
+#define LTO_API_VERSION 29
/**
* \since prior to LTO_API_VERSION=3
@@ -313,6 +313,16 @@ extern lto_bool_t lto_module_get_macho_cputype(lto_module_t mod,
unsigned int *out_cpusubtype);
/**
+ * This function can be used by the linker to check if a given module has
+ * any constructor or destructor functions.
+ *
+ * Returns true if the module has either the @llvm.global_ctors or the
+ * @llvm.global_dtors symbol. Otherwise returns false.
+ *
+ * \since LTO_API_VERSION=29
+ */
+extern lto_bool_t lto_module_has_ctor_dtor(lto_module_t mod);
+/**
* Diagnostic severity.
*
* \since LTO_API_VERSION=7
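A linker-side sketch, not from the patch, of the new query; it assumes the long-standing lto_module_create/lto_module_dispose entry points for loading and releasing the module:

    #include "llvm-c/lto.h"

    /* Report whether the bitcode file at Path defines @llvm.global_ctors or
     * @llvm.global_dtors (lto_module_has_ctor_dtor is new in version 29). */
    static lto_bool_t fileHasCtorDtor(const char *Path) {
      lto_module_t M = lto_module_create(Path);
      if (!M)
        return 0;
      lto_bool_t Result = lto_module_has_ctor_dtor(M);
      lto_module_dispose(M);
      return Result;
    }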
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index f493a03b4b87..40e0e32c77a8 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -961,9 +961,7 @@ public:
/// Returns a float which is bitcasted from an all one value int.
///
/// \param Semantics - type float semantics
- /// \param BitWidth - Select float type
- static APFloat getAllOnesValue(const fltSemantics &Semantics,
- unsigned BitWidth);
+ static APFloat getAllOnesValue(const fltSemantics &Semantics);
/// Used to insert APFloat objects, or objects that contain APFloat objects,
/// into FoldingSets.
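The call-site impact of the dropped BitWidth parameter, sketched here as an assumption rather than taken from the patch; APFloat::IEEEsingle() names the semantics, which now fully determine the width:

    #include "llvm/ADT/APFloat.h"
    using llvm::APFloat;

    // Before: APFloat::getAllOnesValue(APFloat::IEEEsingle(), /*BitWidth=*/32);
    // After this change the semantics alone pick the width:
    APFloat AllOnesF32 = APFloat::getAllOnesValue(APFloat::IEEEsingle());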
diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h
index ff586f763e82..595cd94b6b8f 100644
--- a/llvm/include/llvm/ADT/APInt.h
+++ b/llvm/include/llvm/ADT/APInt.h
@@ -31,7 +31,7 @@ class raw_ostream;
template <typename T> class SmallVectorImpl;
template <typename T> class ArrayRef;
template <typename T> class Optional;
-template <typename T> struct DenseMapInfo;
+template <typename T, typename Enable> struct DenseMapInfo;
class APInt;
@@ -66,6 +66,11 @@ inline APInt operator-(APInt);
/// not.
/// * In general, the class tries to follow the style of computation that LLVM
/// uses in its IR. This simplifies its use for LLVM.
+/// * APInt supports zero-bit-width values, but operations that require bits
+/// are not defined on it (e.g. you cannot ask for the sign of a zero-bit
+/// integer). This means that operations like zero extension and logical
+///   shifts are defined, but sign extension and ashr are not. Zero-bit values
+/// compare and hash equal to themselves, and countLeadingZeros returns 0.
///
class LLVM_NODISCARD APInt {
public:
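A hedged sketch of the zero-bit-width behaviour described in the comment above, using the getZeroWidth(), isZero() and isAllOnes() helpers introduced further down in this patch:

    #include "llvm/ADT/APInt.h"
    #include <cassert>
    using llvm::APInt;

    void zeroWidthDemo() {
      APInt Empty = APInt::getZeroWidth();          // 0-bit value
      assert(Empty.isZero() && Empty.isAllOnes());  // both hold at width 0
      APInt Widened = Empty.zext(8);                // zero extension is defined
      (void)Widened;                                // sext()/ashr() are not defined here
    }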
@@ -87,176 +92,6 @@ public:
static constexpr WordType WORDTYPE_MAX = ~WordType(0);
-private:
- /// This union is used to store the integer value. When the
- /// integer bit-width <= 64, it uses VAL, otherwise it uses pVal.
- union {
- uint64_t VAL; ///< Used to store the <= 64 bits integer value.
- uint64_t *pVal; ///< Used to store the >64 bits integer value.
- } U;
-
- unsigned BitWidth; ///< The number of bits in this APInt.
-
- friend struct DenseMapInfo<APInt>;
-
- friend class APSInt;
-
- /// Fast internal constructor
- ///
- /// This constructor is used only internally for speed of construction of
- /// temporaries. It is unsafe for general use so it is not public.
- APInt(uint64_t *val, unsigned bits) : BitWidth(bits) {
- U.pVal = val;
- }
-
- /// Determine which word a bit is in.
- ///
- /// \returns the word position for the specified bit position.
- static unsigned whichWord(unsigned bitPosition) {
- return bitPosition / APINT_BITS_PER_WORD;
- }
-
- /// Determine which bit in a word a bit is in.
- ///
- /// \returns the bit position in a word for the specified bit position
- /// in the APInt.
- static unsigned whichBit(unsigned bitPosition) {
- return bitPosition % APINT_BITS_PER_WORD;
- }
-
- /// Get a single bit mask.
- ///
- /// \returns a uint64_t with only bit at "whichBit(bitPosition)" set
- /// This method generates and returns a uint64_t (word) mask for a single
- /// bit at a specific bit position. This is used to mask the bit in the
- /// corresponding word.
- static uint64_t maskBit(unsigned bitPosition) {
- return 1ULL << whichBit(bitPosition);
- }
-
- /// Clear unused high order bits
- ///
- /// This method is used internally to clear the top "N" bits in the high order
- /// word that are not used by the APInt. This is needed after the most
- /// significant word is assigned a value to ensure that those bits are
- /// zero'd out.
- APInt &clearUnusedBits() {
- // Compute how many bits are used in the final word
- unsigned WordBits = ((BitWidth-1) % APINT_BITS_PER_WORD) + 1;
-
- // Mask out the high bits.
- uint64_t mask = WORDTYPE_MAX >> (APINT_BITS_PER_WORD - WordBits);
- if (isSingleWord())
- U.VAL &= mask;
- else
- U.pVal[getNumWords() - 1] &= mask;
- return *this;
- }
-
- /// Get the word corresponding to a bit position
- /// \returns the corresponding word for the specified bit position.
- uint64_t getWord(unsigned bitPosition) const {
- return isSingleWord() ? U.VAL : U.pVal[whichWord(bitPosition)];
- }
-
- /// Utility method to change the bit width of this APInt to new bit width,
- /// allocating and/or deallocating as necessary. There is no guarantee on the
- /// value of any bits upon return. Caller should populate the bits after.
- void reallocate(unsigned NewBitWidth);
-
- /// Convert a char array into an APInt
- ///
- /// \param radix 2, 8, 10, 16, or 36
- /// Converts a string into a number. The string must be non-empty
- /// and well-formed as a number of the given base. The bit-width
- /// must be sufficient to hold the result.
- ///
- /// This is used by the constructors that take string arguments.
- ///
- /// StringRef::getAsInteger is superficially similar but (1) does
- /// not assume that the string is well-formed and (2) grows the
- /// result to hold the input.
- void fromString(unsigned numBits, StringRef str, uint8_t radix);
-
- /// An internal division function for dividing APInts.
- ///
- /// This is used by the toString method to divide by the radix. It simply
- /// provides a more convenient form of divide for internal use since KnuthDiv
- /// has specific constraints on its inputs. If those constraints are not met
- /// then it provides a simpler form of divide.
- static void divide(const WordType *LHS, unsigned lhsWords,
- const WordType *RHS, unsigned rhsWords, WordType *Quotient,
- WordType *Remainder);
-
- /// out-of-line slow case for inline constructor
- void initSlowCase(uint64_t val, bool isSigned);
-
- /// shared code between two array constructors
- void initFromArray(ArrayRef<uint64_t> array);
-
- /// out-of-line slow case for inline copy constructor
- void initSlowCase(const APInt &that);
-
- /// out-of-line slow case for shl
- void shlSlowCase(unsigned ShiftAmt);
-
- /// out-of-line slow case for lshr.
- void lshrSlowCase(unsigned ShiftAmt);
-
- /// out-of-line slow case for ashr.
- void ashrSlowCase(unsigned ShiftAmt);
-
- /// out-of-line slow case for operator=
- void AssignSlowCase(const APInt &RHS);
-
- /// out-of-line slow case for operator==
- bool EqualSlowCase(const APInt &RHS) const LLVM_READONLY;
-
- /// out-of-line slow case for countLeadingZeros
- unsigned countLeadingZerosSlowCase() const LLVM_READONLY;
-
- /// out-of-line slow case for countLeadingOnes.
- unsigned countLeadingOnesSlowCase() const LLVM_READONLY;
-
- /// out-of-line slow case for countTrailingZeros.
- unsigned countTrailingZerosSlowCase() const LLVM_READONLY;
-
- /// out-of-line slow case for countTrailingOnes
- unsigned countTrailingOnesSlowCase() const LLVM_READONLY;
-
- /// out-of-line slow case for countPopulation
- unsigned countPopulationSlowCase() const LLVM_READONLY;
-
- /// out-of-line slow case for intersects.
- bool intersectsSlowCase(const APInt &RHS) const LLVM_READONLY;
-
- /// out-of-line slow case for isSubsetOf.
- bool isSubsetOfSlowCase(const APInt &RHS) const LLVM_READONLY;
-
- /// out-of-line slow case for setBits.
- void setBitsSlowCase(unsigned loBit, unsigned hiBit);
-
- /// out-of-line slow case for flipAllBits.
- void flipAllBitsSlowCase();
-
- /// out-of-line slow case for operator&=.
- void AndAssignSlowCase(const APInt& RHS);
-
- /// out-of-line slow case for operator|=.
- void OrAssignSlowCase(const APInt& RHS);
-
- /// out-of-line slow case for operator^=.
- void XorAssignSlowCase(const APInt& RHS);
-
- /// Unsigned comparison. Returns -1, 0, or 1 if this APInt is less than, equal
- /// to, or greater than RHS.
- int compare(const APInt &RHS) const LLVM_READONLY;
-
- /// Signed comparison. Returns -1, 0, or 1 if this APInt is less than, equal
- /// to, or greater than RHS.
- int compareSigned(const APInt &RHS) const LLVM_READONLY;
-
-public:
/// \name Constructors
/// @{
@@ -272,7 +107,6 @@ public:
/// \param isSigned how to treat signedness of val
APInt(unsigned numBits, uint64_t val, bool isSigned = false)
: BitWidth(numBits) {
- assert(BitWidth && "bitwidth too small");
if (isSingleWord()) {
U.VAL = val;
clearUnusedBits();
@@ -312,7 +146,9 @@ public:
/// \param radix the radix to use for the conversion
APInt(unsigned numBits, StringRef str, uint8_t radix);
- /// Simply makes *this a copy of that.
+ /// Default constructor that creates an APInt with a 1-bit zero value.
+ explicit APInt() : BitWidth(1) { U.VAL = 0; }
+
/// Copy Constructor.
APInt(const APInt &that) : BitWidth(that.BitWidth) {
if (isSingleWord())
@@ -333,19 +169,131 @@ public:
delete[] U.pVal;
}
- /// Default constructor that creates an uninteresting APInt
- /// representing a 1-bit zero value.
+ /// @}
+ /// \name Value Generators
+ /// @{
+
+ /// Get the '0' value for the specified bit-width.
+ static APInt getZero(unsigned numBits) { return APInt(numBits, 0); }
+
+ /// NOTE: This is soft-deprecated. Please use `getZero()` instead.
+ static APInt getNullValue(unsigned numBits) { return getZero(numBits); }
+
+ /// Return an APInt zero bits wide.
+ static APInt getZeroWidth() { return getZero(0); }
+
+ /// Gets maximum unsigned value of APInt for specific bit width.
+ static APInt getMaxValue(unsigned numBits) { return getAllOnes(numBits); }
+
+ /// Gets maximum signed value of APInt for a specific bit width.
+ static APInt getSignedMaxValue(unsigned numBits) {
+ APInt API = getAllOnes(numBits);
+ API.clearBit(numBits - 1);
+ return API;
+ }
+
+ /// Gets minimum unsigned value of APInt for a specific bit width.
+ static APInt getMinValue(unsigned numBits) { return APInt(numBits, 0); }
+
+ /// Gets minimum signed value of APInt for a specific bit width.
+ static APInt getSignedMinValue(unsigned numBits) {
+ APInt API(numBits, 0);
+ API.setBit(numBits - 1);
+ return API;
+ }
+
+ /// Get the SignMask for a specific bit width.
///
- /// This is useful for object deserialization (pair this with the static
- /// method Read).
- explicit APInt() : BitWidth(1) { U.VAL = 0; }
+ /// This is just a wrapper function of getSignedMinValue(), and it helps code
+ /// readability when we want to get a SignMask.
+ static APInt getSignMask(unsigned BitWidth) {
+ return getSignedMinValue(BitWidth);
+ }
- /// Returns whether this instance allocated memory.
- bool needsCleanup() const { return !isSingleWord(); }
+ /// Return an APInt of a specified width with all bits set.
+ static APInt getAllOnes(unsigned numBits) {
+ return APInt(numBits, WORDTYPE_MAX, true);
+ }
- /// Used to insert APInt objects, or objects that contain APInt objects, into
- /// FoldingSets.
- void Profile(FoldingSetNodeID &id) const;
+ /// NOTE: This is soft-deprecated. Please use `getAllOnes()` instead.
+ static APInt getAllOnesValue(unsigned numBits) { return getAllOnes(numBits); }
+
+ /// Return an APInt with exactly one bit set in the result.
+ static APInt getOneBitSet(unsigned numBits, unsigned BitNo) {
+ APInt Res(numBits, 0);
+ Res.setBit(BitNo);
+ return Res;
+ }
+
+ /// Get a value with a block of bits set.
+ ///
+ /// Constructs an APInt value that has a contiguous range of bits set. The
+ /// bits from loBit (inclusive) to hiBit (exclusive) will be set. All other
+ /// bits will be zero. For example, with parameters(32, 0, 16) you would get
+ /// 0x0000FFFF. Please call getBitsSetWithWrap if \p loBit may be greater than
+ /// \p hiBit.
+ ///
+ /// \param numBits the intended bit width of the result
+ /// \param loBit the index of the lowest bit set.
+ /// \param hiBit the index of the highest bit set.
+ ///
+ /// \returns An APInt value with the requested bits set.
+ static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit) {
+ APInt Res(numBits, 0);
+ Res.setBits(loBit, hiBit);
+ return Res;
+ }
+
+ /// Wrap version of getBitsSet.
+  /// If \p hiBit is bigger than \p loBit, this is the same as getBitsSet.
+ /// If \p hiBit is not bigger than \p loBit, the set bits "wrap". For example,
+ /// with parameters (32, 28, 4), you would get 0xF000000F.
+ /// If \p hiBit is equal to \p loBit, you would get a result with all bits
+ /// set.
+ static APInt getBitsSetWithWrap(unsigned numBits, unsigned loBit,
+ unsigned hiBit) {
+ APInt Res(numBits, 0);
+ Res.setBitsWithWrap(loBit, hiBit);
+ return Res;
+ }
+
+ /// Constructs an APInt value that has a contiguous range of bits set. The
+ /// bits from loBit (inclusive) to numBits (exclusive) will be set. All other
+ /// bits will be zero. For example, with parameters(32, 12) you would get
+ /// 0xFFFFF000.
+ ///
+ /// \param numBits the intended bit width of the result
+ /// \param loBit the index of the lowest bit to set.
+ ///
+ /// \returns An APInt value with the requested bits set.
+ static APInt getBitsSetFrom(unsigned numBits, unsigned loBit) {
+ APInt Res(numBits, 0);
+ Res.setBitsFrom(loBit);
+ return Res;
+ }
+
+ /// Constructs an APInt value that has the top hiBitsSet bits set.
+ ///
+ /// \param numBits the bitwidth of the result
+ /// \param hiBitsSet the number of high-order bits set in the result.
+ static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet) {
+ APInt Res(numBits, 0);
+ Res.setHighBits(hiBitsSet);
+ return Res;
+ }
+
+ /// Constructs an APInt value that has the bottom loBitsSet bits set.
+ ///
+ /// \param numBits the bitwidth of the result
+ /// \param loBitsSet the number of low-order bits set in the result.
+ static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet) {
+ APInt Res(numBits, 0);
+ Res.setLowBits(loBitsSet);
+ return Res;
+ }
+
+ /// Return a value containing V broadcasted over NewLen bits.
+ static APInt getSplat(unsigned NewLen, const APInt &V);
/// @}
/// \name Value Tests
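A brief usage sketch, not part of the patch, of the relocated value generators above, contrasting the new names with their soft-deprecated spellings:

    #include "llvm/ADT/APInt.h"
    using llvm::APInt;

    void valueGeneratorDemo() {
      APInt Zero = APInt::getZero(32);           // replaces getNullValue(32)
      APInt Ones = APInt::getAllOnes(32);        // replaces getAllOnesValue(32)
      APInt Mask = APInt::getBitsSet(32, 0, 16); // 0x0000FFFF, as documented
      APInt Splat = APInt::getSplat(64, Mask);   // Mask broadcast over 64 bits
      (void)Zero; (void)Ones; (void)Splat;
    }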
@@ -373,7 +321,7 @@ public:
/// This tests the high bit of this APInt to determine if it is set.
///
/// \returns true if this APInt has its sign bit set, false otherwise.
- bool isSignBitSet() const { return (*this)[BitWidth-1]; }
+ bool isSignBitSet() const { return (*this)[BitWidth - 1]; }
/// Determine if sign bit of this APInt is clear.
///
@@ -388,50 +336,62 @@ public:
/// that 0 is not a positive value.
///
/// \returns true if this APInt is positive.
- bool isStrictlyPositive() const { return isNonNegative() && !isNullValue(); }
+ bool isStrictlyPositive() const { return isNonNegative() && !isZero(); }
/// Determine if this APInt Value is non-positive (<= 0).
///
/// \returns true if this APInt is non-positive.
bool isNonPositive() const { return !isStrictlyPositive(); }
- /// Determine if all bits are set
- ///
- /// This checks to see if the value has all bits of the APInt are set or not.
- bool isAllOnesValue() const {
+ /// Determine if all bits are set. This is true for zero-width values.
+ bool isAllOnes() const {
+ if (BitWidth == 0)
+ return true;
if (isSingleWord())
return U.VAL == WORDTYPE_MAX >> (APINT_BITS_PER_WORD - BitWidth);
return countTrailingOnesSlowCase() == BitWidth;
}
- /// Determine if all bits are clear
- ///
- /// This checks to see if the value has all bits of the APInt are clear or
- /// not.
- bool isNullValue() const { return !*this; }
+ /// NOTE: This is soft-deprecated. Please use `isAllOnes()` instead.
+ bool isAllOnesValue() const { return isAllOnes(); }
+
+ /// Determine if this value is zero, i.e. all bits are clear.
+ bool isZero() const {
+ if (isSingleWord())
+ return U.VAL == 0;
+ return countLeadingZerosSlowCase() == BitWidth;
+ }
+
+ /// NOTE: This is soft-deprecated. Please use `isZero()` instead.
+ bool isNullValue() const { return isZero(); }
/// Determine if this is a value of 1.
///
/// This checks to see if the value of this APInt is one.
- bool isOneValue() const {
+ bool isOne() const {
if (isSingleWord())
return U.VAL == 1;
return countLeadingZerosSlowCase() == BitWidth - 1;
}
+ /// NOTE: This is soft-deprecated. Please use `isOne()` instead.
+ bool isOneValue() const { return isOne(); }
+
/// Determine if this is the largest unsigned value.
///
/// This checks to see if the value of this APInt is the maximum unsigned
/// value for the APInt's bit width.
- bool isMaxValue() const { return isAllOnesValue(); }
+ bool isMaxValue() const { return isAllOnes(); }
/// Determine if this is the largest signed value.
///
/// This checks to see if the value of this APInt is the maximum signed
/// value for the APInt's bit width.
bool isMaxSignedValue() const {
- if (isSingleWord())
+ if (isSingleWord()) {
+ assert(BitWidth && "zero width values not allowed");
return U.VAL == ((WordType(1) << (BitWidth - 1)) - 1);
+ }
return !isNegative() && countTrailingOnesSlowCase() == BitWidth - 1;
}
@@ -439,39 +399,48 @@ public:
///
/// This checks to see if the value of this APInt is the minimum unsigned
/// value for the APInt's bit width.
- bool isMinValue() const { return isNullValue(); }
+ bool isMinValue() const { return isZero(); }
/// Determine if this is the smallest signed value.
///
/// This checks to see if the value of this APInt is the minimum signed
/// value for the APInt's bit width.
bool isMinSignedValue() const {
- if (isSingleWord())
+ if (isSingleWord()) {
+ assert(BitWidth && "zero width values not allowed");
return U.VAL == (WordType(1) << (BitWidth - 1));
+ }
return isNegative() && countTrailingZerosSlowCase() == BitWidth - 1;
}
/// Check if this APInt has an N-bits unsigned integer value.
- bool isIntN(unsigned N) const {
- assert(N && "N == 0 ???");
- return getActiveBits() <= N;
- }
+ bool isIntN(unsigned N) const { return getActiveBits() <= N; }
/// Check if this APInt has an N-bits signed integer value.
- bool isSignedIntN(unsigned N) const {
- assert(N && "N == 0 ???");
- return getMinSignedBits() <= N;
- }
+ bool isSignedIntN(unsigned N) const { return getMinSignedBits() <= N; }
/// Check if this APInt's value is a power of two greater than zero.
///
/// \returns true if the argument APInt value is a power of two > 0.
bool isPowerOf2() const {
- if (isSingleWord())
+ if (isSingleWord()) {
+ assert(BitWidth && "zero width values not allowed");
return isPowerOf2_64(U.VAL);
+ }
return countPopulationSlowCase() == 1;
}
+ /// Check if this APInt's negated value is a power of two greater than zero.
+ bool isNegatedPowerOf2() const {
+ assert(BitWidth && "zero width values not allowed");
+ if (isNonNegative())
+ return false;
+ // NegatedPowerOf2 - shifted mask in the top bits.
+ unsigned LO = countLeadingOnes();
+ unsigned TZ = countTrailingZeros();
+ return (LO + TZ) == BitWidth;
+ }
+
/// Check if the APInt's value is returned by getSignMask.
///
/// \returns true if this is the value returned by getSignMask.
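A small sketch, not from the patch, contrasting the renamed predicates with their soft-deprecated spellings and exercising the new isNegatedPowerOf2():

    #include "llvm/ADT/APInt.h"
    #include <cassert>
    using llvm::APInt;

    void predicateDemo() {
      APInt V(8, 0xF0);                      // -16 when read as a signed 8-bit value
      assert(!V.isZero() && !V.isOne() && !V.isAllOnes());
      assert(V.isNegatedPowerOf2());         // -(-16) == 16, a power of two
      assert(V.isNullValue() == V.isZero()); // old spelling still compiles
    }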
@@ -480,7 +449,7 @@ public:
/// Convert APInt to a boolean value.
///
/// This converts the APInt to a boolean value as a test against zero.
- bool getBoolValue() const { return !!*this; }
+ bool getBoolValue() const { return !isZero(); }
/// If this value is smaller than the specified limit, return it, otherwise
/// return the limit value. This causes the value to saturate to the limit.
@@ -527,152 +496,22 @@ public:
return (Ones + LeadZ + countTrailingZeros()) == BitWidth;
}
- /// @}
- /// \name Value Generators
- /// @{
-
- /// Gets maximum unsigned value of APInt for specific bit width.
- static APInt getMaxValue(unsigned numBits) {
- return getAllOnesValue(numBits);
- }
-
- /// Gets maximum signed value of APInt for a specific bit width.
- static APInt getSignedMaxValue(unsigned numBits) {
- APInt API = getAllOnesValue(numBits);
- API.clearBit(numBits - 1);
- return API;
- }
-
- /// Gets minimum unsigned value of APInt for a specific bit width.
- static APInt getMinValue(unsigned numBits) { return APInt(numBits, 0); }
-
- /// Gets minimum signed value of APInt for a specific bit width.
- static APInt getSignedMinValue(unsigned numBits) {
- APInt API(numBits, 0);
- API.setBit(numBits - 1);
- return API;
- }
-
- /// Get the SignMask for a specific bit width.
- ///
- /// This is just a wrapper function of getSignedMinValue(), and it helps code
- /// readability when we want to get a SignMask.
- static APInt getSignMask(unsigned BitWidth) {
- return getSignedMinValue(BitWidth);
- }
-
- /// Get the all-ones value.
- ///
- /// \returns the all-ones value for an APInt of the specified bit-width.
- static APInt getAllOnesValue(unsigned numBits) {
- return APInt(numBits, WORDTYPE_MAX, true);
- }
-
- /// Get the '0' value.
- ///
- /// \returns the '0' value for an APInt of the specified bit-width.
- static APInt getNullValue(unsigned numBits) { return APInt(numBits, 0); }
-
/// Compute an APInt containing numBits highbits from this APInt.
///
- /// Get an APInt with the same BitWidth as this APInt, just zero mask
- /// the low bits and right shift to the least significant bit.
+ /// Get an APInt with the same BitWidth as this APInt, just zero mask the low
+ /// bits and right shift to the least significant bit.
///
/// \returns the high "numBits" bits of this APInt.
APInt getHiBits(unsigned numBits) const;
/// Compute an APInt containing numBits lowbits from this APInt.
///
- /// Get an APInt with the same BitWidth as this APInt, just zero mask
- /// the high bits.
+ /// Get an APInt with the same BitWidth as this APInt, just zero mask the high
+ /// bits.
///
/// \returns the low "numBits" bits of this APInt.
APInt getLoBits(unsigned numBits) const;
- /// Return an APInt with exactly one bit set in the result.
- static APInt getOneBitSet(unsigned numBits, unsigned BitNo) {
- APInt Res(numBits, 0);
- Res.setBit(BitNo);
- return Res;
- }
-
- /// Get a value with a block of bits set.
- ///
- /// Constructs an APInt value that has a contiguous range of bits set. The
- /// bits from loBit (inclusive) to hiBit (exclusive) will be set. All other
- /// bits will be zero. For example, with parameters(32, 0, 16) you would get
- /// 0x0000FFFF. Please call getBitsSetWithWrap if \p loBit may be greater than
- /// \p hiBit.
- ///
- /// \param numBits the intended bit width of the result
- /// \param loBit the index of the lowest bit set.
- /// \param hiBit the index of the highest bit set.
- ///
- /// \returns An APInt value with the requested bits set.
- static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit) {
- assert(loBit <= hiBit && "loBit greater than hiBit");
- APInt Res(numBits, 0);
- Res.setBits(loBit, hiBit);
- return Res;
- }
-
- /// Wrap version of getBitsSet.
- /// If \p hiBit is bigger than \p loBit, this is same with getBitsSet.
- /// If \p hiBit is not bigger than \p loBit, the set bits "wrap". For example,
- /// with parameters (32, 28, 4), you would get 0xF000000F.
- /// If \p hiBit is equal to \p loBit, you would get a result with all bits
- /// set.
- static APInt getBitsSetWithWrap(unsigned numBits, unsigned loBit,
- unsigned hiBit) {
- APInt Res(numBits, 0);
- Res.setBitsWithWrap(loBit, hiBit);
- return Res;
- }
-
- /// Get a value with upper bits starting at loBit set.
- ///
- /// Constructs an APInt value that has a contiguous range of bits set. The
- /// bits from loBit (inclusive) to numBits (exclusive) will be set. All other
- /// bits will be zero. For example, with parameters(32, 12) you would get
- /// 0xFFFFF000.
- ///
- /// \param numBits the intended bit width of the result
- /// \param loBit the index of the lowest bit to set.
- ///
- /// \returns An APInt value with the requested bits set.
- static APInt getBitsSetFrom(unsigned numBits, unsigned loBit) {
- APInt Res(numBits, 0);
- Res.setBitsFrom(loBit);
- return Res;
- }
-
- /// Get a value with high bits set
- ///
- /// Constructs an APInt value that has the top hiBitsSet bits set.
- ///
- /// \param numBits the bitwidth of the result
- /// \param hiBitsSet the number of high-order bits set in the result.
- static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet) {
- APInt Res(numBits, 0);
- Res.setHighBits(hiBitsSet);
- return Res;
- }
-
- /// Get a value with low bits set
- ///
- /// Constructs an APInt value that has the bottom loBitsSet bits set.
- ///
- /// \param numBits the bitwidth of the result
- /// \param loBitsSet the number of low-order bits set in the result.
- static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet) {
- APInt Res(numBits, 0);
- Res.setLowBits(loBitsSet);
- return Res;
- }
-
- /// Return a value containing V broadcasted over NewLen bits.
- static APInt getSplat(unsigned NewLen, const APInt &V);
-
/// Determine if two APInts have the same value, after zero-extending
/// one of them (if needed!) to ensure that the bit-widths match.
static bool isSameValue(const APInt &I1, const APInt &I2) {
@@ -701,12 +540,10 @@ public:
/// \name Unary Operators
/// @{
- /// Postfix increment operator.
- ///
- /// Increments *this by 1.
+ /// Postfix increment operator. Increment *this by 1.
///
/// \returns a new APInt value representing the original value of *this.
- const APInt operator++(int) {
+ APInt operator++(int) {
APInt API(*this);
++(*this);
return API;
@@ -717,12 +554,10 @@ public:
/// \returns *this incremented by one
APInt &operator++();
- /// Postfix decrement operator.
- ///
- /// Decrements *this by 1.
+ /// Postfix decrement operator. Decrement *this by 1.
///
/// \returns a new APInt value representing the original value of *this.
- const APInt operator--(int) {
+ APInt operator--(int) {
APInt API(*this);
--(*this);
return API;
@@ -733,16 +568,9 @@ public:
/// \returns *this decremented by one.
APInt &operator--();
- /// Logical negation operator.
- ///
- /// Performs logical negation operation on this APInt.
- ///
- /// \returns true if *this is zero, false otherwise.
- bool operator!() const {
- if (isSingleWord())
- return U.VAL == 0;
- return countLeadingZerosSlowCase() == BitWidth;
- }
+  /// Logical negation operator. Returns true if *this is zero, as for normal
+  /// integers.
+ bool operator!() const { return isZero(); }
/// @}
/// \name Assignment Operators
@@ -752,14 +580,15 @@ public:
///
/// \returns *this after assignment of RHS.
APInt &operator=(const APInt &RHS) {
- // If the bitwidths are the same, we can avoid mucking with memory
+ // The common case (both source or dest being inline) doesn't require
+ // allocation or deallocation.
if (isSingleWord() && RHS.isSingleWord()) {
U.VAL = RHS.U.VAL;
BitWidth = RHS.BitWidth;
- return clearUnusedBits();
+ return *this;
}
- AssignSlowCase(RHS);
+ assignSlowCase(RHS);
return *this;
}
@@ -780,7 +609,6 @@ public:
BitWidth = that.BitWidth;
that.BitWidth = 0;
-
return *this;
}
@@ -812,7 +640,7 @@ public:
if (isSingleWord())
U.VAL &= RHS.U.VAL;
else
- AndAssignSlowCase(RHS);
+ andAssignSlowCase(RHS);
return *this;
}
@@ -827,7 +655,7 @@ public:
return *this;
}
U.pVal[0] &= RHS;
- memset(U.pVal+1, 0, (getNumWords() - 1) * APINT_WORD_SIZE);
+ memset(U.pVal + 1, 0, (getNumWords() - 1) * APINT_WORD_SIZE);
return *this;
}
@@ -842,7 +670,7 @@ public:
if (isSingleWord())
U.VAL |= RHS.U.VAL;
else
- OrAssignSlowCase(RHS);
+ orAssignSlowCase(RHS);
return *this;
}
@@ -871,7 +699,7 @@ public:
if (isSingleWord())
U.VAL ^= RHS.U.VAL;
else
- XorAssignSlowCase(RHS);
+ xorAssignSlowCase(RHS);
return *this;
}
@@ -1057,6 +885,17 @@ public:
/// Rotate right by rotateAmt.
APInt rotr(const APInt &rotateAmt) const;
+ /// Concatenate the bits from "NewLSB" onto the bottom of *this. This is
+ /// equivalent to:
+ /// (this->zext(NewWidth) << NewLSB.getBitWidth()) | NewLSB.zext(NewWidth)
+ APInt concat(const APInt &NewLSB) const {
+ // If the result will be small, then both the merged values are small.
+ unsigned NewWidth = getBitWidth() + NewLSB.getBitWidth();
+ if (NewWidth <= APINT_BITS_PER_WORD)
+ return APInt(NewWidth, (U.VAL << NewLSB.getBitWidth()) | NewLSB.U.VAL);
+ return concatSlowCase(NewLSB);
+ }
+
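  // For illustration, a sketch of the documented equivalence (values are chosen
  // for the example only):
  //   APInt Hi(4, 0xA), Lo(4, 0x5);
  //   APInt Joined = Hi.concat(Lo);    // 8 bits wide; Hi occupies bits [7:4].
  //   assert(Joined == APInt(8, 0xA5));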
/// Unsigned division operation.
///
/// Perform an unsigned divide operation on this APInt by RHS. Both this and
@@ -1151,7 +990,7 @@ public:
assert(BitWidth == RHS.BitWidth && "Comparison requires equal bit widths");
if (isSingleWord())
return U.VAL == RHS.U.VAL;
- return EqualSlowCase(RHS);
+ return equalSlowCase(RHS);
}
/// Equality operator.
@@ -1436,8 +1275,6 @@ public:
clearUnusedBits();
}
- /// Set a given bit to 1.
- ///
/// Set the given bit to 1 whose position is given as "bitPosition".
void setBit(unsigned BitPosition) {
assert(BitPosition < BitWidth && "BitPosition out of range");
@@ -1449,9 +1286,7 @@ public:
}
/// Set the sign bit to 1.
- void setSignBit() {
- setBit(BitWidth - 1);
- }
+ void setSignBit() { setBit(BitWidth - 1); }
/// Set a given bit to a given value.
void setBitVal(unsigned BitPosition, bool BitValue) {
@@ -1497,14 +1332,10 @@ public:
}
/// Set the top bits starting from loBit.
- void setBitsFrom(unsigned loBit) {
- return setBits(loBit, BitWidth);
- }
+ void setBitsFrom(unsigned loBit) { return setBits(loBit, BitWidth); }
/// Set the bottom loBits bits.
- void setLowBits(unsigned loBits) {
- return setBits(0, loBits);
- }
+ void setLowBits(unsigned loBits) { return setBits(0, loBits); }
/// Set the top hiBits bits.
void setHighBits(unsigned hiBits) {
@@ -1539,9 +1370,7 @@ public:
}
/// Set the sign bit to 0.
- void clearSignBit() {
- clearBit(BitWidth - 1);
- }
+ void clearSignBit() { clearBit(BitWidth - 1); }
/// Toggle every bit to its opposite value.
void flipAllBits() {
@@ -1629,8 +1458,10 @@ public:
/// uint64_t. The bitwidth must be <= 64 or the value must fit within a
/// uint64_t. Otherwise an assertion will result.
uint64_t getZExtValue() const {
- if (isSingleWord())
+ if (isSingleWord()) {
+ assert(BitWidth && "zero width values not allowed");
return U.VAL;
+ }
assert(getActiveBits() <= 64 && "Too many bits for uint64_t");
return U.pVal[0];
}
@@ -1678,8 +1509,11 @@ public:
/// \returns 0 if the high order bit is not set, otherwise returns the number
/// of 1 bits from the most significant to the least
unsigned countLeadingOnes() const {
- if (isSingleWord())
+ if (isSingleWord()) {
+ if (LLVM_UNLIKELY(BitWidth == 0))
+ return 0;
return llvm::countLeadingOnes(U.VAL << (APINT_BITS_PER_WORD - BitWidth));
+ }
return countLeadingOnesSlowCase();
}
@@ -1774,9 +1608,7 @@ public:
/// The conversion does not do a translation from integer to double, it just
/// re-interprets the bits as a double. Note that it is valid to do this on
/// any bit width. Exactly 64 bits will be translated.
- double bitsToDouble() const {
- return BitsToDouble(getWord(0));
- }
+ double bitsToDouble() const { return BitsToDouble(getWord(0)); }
/// Converts APInt bits to a float
///
@@ -1808,7 +1640,7 @@ public:
/// @{
/// \returns the floor log base 2 of this APInt.
- unsigned logBase2() const { return getActiveBits() - 1; }
+ unsigned logBase2() const { return getActiveBits() - 1; }
/// \returns the ceil log base 2 of this APInt.
unsigned ceilLogBase2() const {
@@ -1826,25 +1658,7 @@ public:
///
/// to get around any mathematical concerns resulting from
/// referencing 2 in a space where 2 does not exist.
- unsigned nearestLogBase2() const {
- // Special case when we have a bitwidth of 1. If VAL is 1, then we
- // get 0. If VAL is 0, we get WORDTYPE_MAX which gets truncated to
- // UINT32_MAX.
- if (BitWidth == 1)
- return U.VAL - 1;
-
- // Handle the zero case.
- if (isNullValue())
- return UINT32_MAX;
-
- // The non-zero case is handled by computing:
- //
- // nearestLogBase2(x) = logBase2(x) + x[logBase2(x)-1].
- //
- // where x[i] is referring to the value of the ith bit of x.
- unsigned lg = logBase2();
- return lg + unsigned((*this)[lg - 1]);
- }
+ unsigned nearestLogBase2() const;
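  // Worked example of the rounding described above (values chosen for
  // illustration only):
  //   APInt(8, 5).nearestLogBase2() == 2          // 5 is closer to 2^2 == 4.
  //   APInt(8, 7).nearestLogBase2() == 3          // 7 is closer to 2^3 == 8.
  //   APInt(8, 0).nearestLogBase2() == UINT32_MAX // the documented zero case.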
/// \returns the log base 2 of this APInt if it is an exact power of two, -1
/// otherwise
@@ -1854,12 +1668,12 @@ public:
return logBase2();
}
- /// Compute the square root
+ /// Compute the square root.
APInt sqrt() const;
- /// Get the absolute value;
- ///
- /// If *this is < 0 then return -(*this), otherwise *this;
+ /// Get the absolute value. If *this is < 0 then return -(*this), otherwise
+ /// *this. Note that the "most negative" signed number (e.g. -128 for 8 bit
+ /// wide APInt) is unchanged due to how negation works.
APInt abs() const {
if (isNegative())
return -(*this);
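  // For illustration (example values only):
  //   APInt(8, -5, /*isSigned=*/true).abs() == APInt(8, 5)
  //   APInt(8, 0x80).abs() == APInt(8, 0x80)   // -128 negates back to itself.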
@@ -1870,18 +1684,6 @@ public:
APInt multiplicativeInverse(const APInt &modulo) const;
/// @}
- /// \name Support for division by constant
- /// @{
-
- /// Calculate the magic number for signed division by a constant.
- struct ms;
- ms magic() const;
-
- /// Calculate the magic number for unsigned division by a constant.
- struct mu;
- mu magicu(unsigned LeadingZeros = 0) const;
-
- /// @}
/// \name Building-block Operations for APInt and APFloat
/// @{
@@ -1908,9 +1710,8 @@ public:
/// DST, of dstCOUNT parts, such that the bit srcLSB becomes the least
/// significant bit of DST. All high bits above srcBITS in DST are
/// zero-filled.
- static void tcExtract(WordType *, unsigned dstCount,
- const WordType *, unsigned srcBits,
- unsigned srcLSB);
+ static void tcExtract(WordType *, unsigned dstCount, const WordType *,
+ unsigned srcBits, unsigned srcLSB);
/// Set the given bit of a bignum. Zero-based.
static void tcSetBit(WordType *, unsigned bit);
@@ -1927,14 +1728,13 @@ public:
static void tcNegate(WordType *, unsigned);
/// DST += RHS + CARRY where CARRY is zero or one. Returns the carry flag.
- static WordType tcAdd(WordType *, const WordType *,
- WordType carry, unsigned);
+ static WordType tcAdd(WordType *, const WordType *, WordType carry, unsigned);
/// DST += RHS. Returns the carry flag.
static WordType tcAddPart(WordType *, WordType, unsigned);
/// DST -= RHS + CARRY where CARRY is zero or one. Returns the carry flag.
- static WordType tcSubtract(WordType *, const WordType *,
- WordType carry, unsigned);
+ static WordType tcSubtract(WordType *, const WordType *, WordType carry,
+ unsigned);
/// DST -= RHS. Returns the carry flag.
static WordType tcSubtractPart(WordType *, WordType, unsigned);
@@ -1950,8 +1750,7 @@ public:
/// otherwise overflow occurred and return one.
static int tcMultiplyPart(WordType *dst, const WordType *src,
WordType multiplier, WordType carry,
- unsigned srcParts, unsigned dstParts,
- bool add);
+ unsigned srcParts, unsigned dstParts, bool add);
/// DST = LHS * RHS, where DST has the same width as the operands and is
/// filled with the least significant parts of the result. Returns one if
@@ -1962,8 +1761,8 @@ public:
/// DST = LHS * RHS, where DST has width the sum of the widths of the
/// operands. No overflow occurs. DST must be disjoint from both operands.
- static void tcFullMultiply(WordType *, const WordType *,
- const WordType *, unsigned, unsigned);
+ static void tcFullMultiply(WordType *, const WordType *, const WordType *,
+ unsigned, unsigned);
/// If RHS is zero LHS and REMAINDER are left unchanged, return one.
/// Otherwise set LHS to LHS / RHS with the fractional part discarded, set
@@ -1974,9 +1773,8 @@ public:
/// SCRATCH is a bignum of the same size as the operands and result for use by
/// the routine; its contents need not be initialized and are destroyed. LHS,
/// REMAINDER and SCRATCH must be distinct.
- static int tcDivide(WordType *lhs, const WordType *rhs,
- WordType *remainder, WordType *scratch,
- unsigned parts);
+ static int tcDivide(WordType *lhs, const WordType *rhs, WordType *remainder,
+ WordType *scratch, unsigned parts);
/// Shift a bignum left Count bits. Shifted in bits are zero. There are no
/// restrictions on Count.
@@ -1986,12 +1784,6 @@ public:
/// restrictions on Count.
static void tcShiftRight(WordType *, unsigned Words, unsigned Count);
- /// The obvious AND, OR and XOR and complement operations.
- static void tcAnd(WordType *, const WordType *, unsigned);
- static void tcOr(WordType *, const WordType *, unsigned);
- static void tcXor(WordType *, const WordType *, unsigned);
- static void tcComplement(WordType *, unsigned);
-
/// Comparison (unsigned) of two bignums.
static int tcCompare(const WordType *, const WordType *, unsigned);
@@ -2005,26 +1797,185 @@ public:
return tcSubtractPart(dst, 1, parts);
}
- /// Set the least significant BITS and clear the rest.
- static void tcSetLeastSignificantBits(WordType *, unsigned, unsigned bits);
+ /// Used to insert APInt objects, or objects that contain APInt objects, into
+ /// FoldingSets.
+ void Profile(FoldingSetNodeID &id) const;
/// debug method
void dump() const;
- /// @}
-};
+ /// Returns whether this instance allocated memory.
+ bool needsCleanup() const { return !isSingleWord(); }
-/// Magic data for optimising signed division by a constant.
-struct APInt::ms {
- APInt m; ///< magic number
- unsigned s; ///< shift amount
-};
+private:
+ /// This union is used to store the integer value. When the
+ /// integer bit-width <= 64, it uses VAL, otherwise it uses pVal.
+ union {
+ uint64_t VAL; ///< Used to store the <= 64 bits integer value.
+ uint64_t *pVal; ///< Used to store the >64 bits integer value.
+ } U;
+
+ unsigned BitWidth; ///< The number of bits in this APInt.
+
+ friend struct DenseMapInfo<APInt, void>;
+ friend class APSInt;
+
+ /// This constructor is used only internally for speed of construction of
+ /// temporaries. It is unsafe since it takes ownership of the pointer, so it
+ /// is not public.
+ APInt(uint64_t *val, unsigned bits) : BitWidth(bits) { U.pVal = val; }
+
+ /// Determine which word a bit is in.
+ ///
+ /// \returns the word position for the specified bit position.
+ static unsigned whichWord(unsigned bitPosition) {
+ return bitPosition / APINT_BITS_PER_WORD;
+ }
+
+ /// Determine which bit in a word the specified bit position is in.
+ static unsigned whichBit(unsigned bitPosition) {
+ return bitPosition % APINT_BITS_PER_WORD;
+ }
+
+ /// Get a single bit mask.
+ ///
+ /// \returns a uint64_t with only bit at "whichBit(bitPosition)" set
+ /// This method generates and returns a uint64_t (word) mask for a single
+ /// bit at a specific bit position. This is used to mask the bit in the
+ /// corresponding word.
+ static uint64_t maskBit(unsigned bitPosition) {
+ return 1ULL << whichBit(bitPosition);
+ }
+
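  // For illustration, how a bit position maps onto 64-bit words (example value):
  //   whichWord(70) == 1, whichBit(70) == 6, maskBit(70) == 0x40,
  //   so bit 70 lives in U.pVal[1] and is tested as (U.pVal[1] & 0x40) != 0.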
+ /// Clear unused high order bits
+ ///
+ /// This method is used internally to clear the top "N" bits in the high order
+ /// word that are not used by the APInt. This is needed after the most
+ /// significant word is assigned a value to ensure that those bits are
+ /// zero'd out.
+ APInt &clearUnusedBits() {
+ // Compute how many bits are used in the final word.
+ unsigned WordBits = ((BitWidth - 1) % APINT_BITS_PER_WORD) + 1;
+
+ // Mask out the high bits.
+ uint64_t mask = WORDTYPE_MAX >> (APINT_BITS_PER_WORD - WordBits);
+ if (LLVM_UNLIKELY(BitWidth == 0))
+ mask = 0;
+
+ if (isSingleWord())
+ U.VAL &= mask;
+ else
+ U.pVal[getNumWords() - 1] &= mask;
+ return *this;
+ }
+
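  // For illustration (hypothetical values): with BitWidth == 4 in a single
  // 64-bit word, WordBits is 4 and mask is 0xF, so a stored word of 0x1F becomes
  // 0xF and the bits above the declared width cannot leak into later operations.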
+ /// Get the word corresponding to a bit position
+ /// \returns the corresponding word for the specified bit position.
+ uint64_t getWord(unsigned bitPosition) const {
+ return isSingleWord() ? U.VAL : U.pVal[whichWord(bitPosition)];
+ }
+
+ /// Utility method to change the bit width of this APInt to a new bit width,
+ /// allocating and/or deallocating as necessary. There is no guarantee on the
+ /// value of any bits upon return. Caller should populate the bits after.
+ void reallocate(unsigned NewBitWidth);
+
+ /// Convert a char array into an APInt
+ ///
+ /// \param radix 2, 8, 10, 16, or 36
+ /// Converts a string into a number. The string must be non-empty
+ /// and well-formed as a number of the given base. The bit-width
+ /// must be sufficient to hold the result.
+ ///
+ /// This is used by the constructors that take string arguments.
+ ///
+ /// StringRef::getAsInteger is superficially similar but (1) does
+ /// not assume that the string is well-formed and (2) grows the
+ /// result to hold the input.
+ void fromString(unsigned numBits, StringRef str, uint8_t radix);
+
+ /// An internal division function for dividing APInts.
+ ///
+ /// This is used by the toString method to divide by the radix. It simply
+ /// provides a more convenient form of divide for internal use since KnuthDiv
+ /// has specific constraints on its inputs. If those constraints are not met
+ /// then it provides a simpler form of divide.
+ static void divide(const WordType *LHS, unsigned lhsWords,
+ const WordType *RHS, unsigned rhsWords, WordType *Quotient,
+ WordType *Remainder);
+
+ /// out-of-line slow case for inline constructor
+ void initSlowCase(uint64_t val, bool isSigned);
+
+ /// shared code between two array constructors
+ void initFromArray(ArrayRef<uint64_t> array);
+
+ /// out-of-line slow case for inline copy constructor
+ void initSlowCase(const APInt &that);
+
+ /// out-of-line slow case for shl
+ void shlSlowCase(unsigned ShiftAmt);
+
+ /// out-of-line slow case for lshr.
+ void lshrSlowCase(unsigned ShiftAmt);
+
+ /// out-of-line slow case for ashr.
+ void ashrSlowCase(unsigned ShiftAmt);
+
+ /// out-of-line slow case for operator=
+ void assignSlowCase(const APInt &RHS);
+
+ /// out-of-line slow case for operator==
+ bool equalSlowCase(const APInt &RHS) const LLVM_READONLY;
+
+ /// out-of-line slow case for countLeadingZeros
+ unsigned countLeadingZerosSlowCase() const LLVM_READONLY;
+
+ /// out-of-line slow case for countLeadingOnes.
+ unsigned countLeadingOnesSlowCase() const LLVM_READONLY;
+
+ /// out-of-line slow case for countTrailingZeros.
+ unsigned countTrailingZerosSlowCase() const LLVM_READONLY;
+
+ /// out-of-line slow case for countTrailingOnes
+ unsigned countTrailingOnesSlowCase() const LLVM_READONLY;
+
+ /// out-of-line slow case for countPopulation
+ unsigned countPopulationSlowCase() const LLVM_READONLY;
+
+ /// out-of-line slow case for intersects.
+ bool intersectsSlowCase(const APInt &RHS) const LLVM_READONLY;
+
+ /// out-of-line slow case for isSubsetOf.
+ bool isSubsetOfSlowCase(const APInt &RHS) const LLVM_READONLY;
+
+ /// out-of-line slow case for setBits.
+ void setBitsSlowCase(unsigned loBit, unsigned hiBit);
+
+ /// out-of-line slow case for flipAllBits.
+ void flipAllBitsSlowCase();
-/// Magic data for optimising unsigned division by a constant.
-struct APInt::mu {
- APInt m; ///< magic number
- bool a; ///< add indicator
- unsigned s; ///< shift amount
+ /// out-of-line slow case for concat.
+ APInt concatSlowCase(const APInt &NewLSB) const;
+
+ /// out-of-line slow case for operator&=.
+ void andAssignSlowCase(const APInt &RHS);
+
+ /// out-of-line slow case for operator|=.
+ void orAssignSlowCase(const APInt &RHS);
+
+ /// out-of-line slow case for operator^=.
+ void xorAssignSlowCase(const APInt &RHS);
+
+ /// Unsigned comparison. Returns -1, 0, or 1 if this APInt is less than, equal
+ /// to, or greater than RHS.
+ int compare(const APInt &RHS) const LLVM_READONLY;
+
+ /// Signed comparison. Returns -1, 0, or 1 if this APInt is less than, equal
+ /// to, or greater than RHS.
+ int compareSigned(const APInt &RHS) const LLVM_READONLY;
+
+ /// @}
};
inline bool operator==(uint64_t V1, const APInt &V2) { return V2 == V1; }
@@ -2161,7 +2112,6 @@ inline APInt operator*(uint64_t LHS, APInt b) {
return b;
}
-
namespace APIntOps {
/// Determine the smaller of two APInts considered to be signed.
@@ -2277,7 +2227,16 @@ Optional<APInt> SolveQuadraticEquationWrap(APInt A, APInt B, APInt C,
Optional<unsigned> GetMostSignificantDifferentBit(const APInt &A,
const APInt &B);
-} // End of APIntOps namespace
+/// Splat/Merge neighboring bits to widen/narrow the bitmask represented
+/// by \param A to \param NewBitWidth bits.
+///
+/// e.g. ScaleBitMask(0b0101, 8) -> 0b00110011
+/// e.g. ScaleBitMask(0b00011011, 4) -> 0b0111
+/// A.getBitWidth() or NewBitWidth must be a whole multiple of the other.
+///
+/// TODO: Do we need a mode where all bits must be set when merging down?
+APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth);
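// For illustration, the documented scaling as it would be called (values are
// example choices):
//   APInt Mask(4, 0b0101);
//   APInt Wide = APIntOps::ScaleBitMask(Mask, 8);   // 0b00110011
//   APInt Back = APIntOps::ScaleBitMask(Wide, 4);   // 0b0101 again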
+} // namespace APIntOps
// See friend declaration above. This additional declaration is required in
// order to compile LLVM with IBM xlC compiler.
@@ -2292,7 +2251,7 @@ void StoreIntToMemory(const APInt &IntVal, uint8_t *Dst, unsigned StoreBytes);
void LoadIntFromMemory(APInt &IntVal, const uint8_t *Src, unsigned LoadBytes);
/// Provide DenseMapInfo for APInt.
-template <> struct DenseMapInfo<APInt> {
+template <> struct DenseMapInfo<APInt, void> {
static inline APInt getEmptyKey() {
APInt V(nullptr, 0);
V.U.VAL = 0;
diff --git a/llvm/include/llvm/ADT/APSInt.h b/llvm/include/llvm/ADT/APSInt.h
index 1509d472f131..c1cf3c546070 100644
--- a/llvm/include/llvm/ADT/APSInt.h
+++ b/llvm/include/llvm/ADT/APSInt.h
@@ -58,7 +58,7 @@ public:
/// that 0 is not a positive value.
///
/// \returns true if this APSInt is positive.
- bool isStrictlyPositive() const { return isNonNegative() && !isNullValue(); }
+ bool isStrictlyPositive() const { return isNonNegative() && !isZero(); }
APSInt &operator=(APInt RHS) {
// Retain our current sign.
@@ -344,17 +344,17 @@ inline raw_ostream &operator<<(raw_ostream &OS, const APSInt &I) {
}
/// Provide DenseMapInfo for APSInt, using the DenseMapInfo for APInt.
-template <> struct DenseMapInfo<APSInt> {
+template <> struct DenseMapInfo<APSInt, void> {
static inline APSInt getEmptyKey() {
- return APSInt(DenseMapInfo<APInt>::getEmptyKey());
+ return APSInt(DenseMapInfo<APInt, void>::getEmptyKey());
}
static inline APSInt getTombstoneKey() {
- return APSInt(DenseMapInfo<APInt>::getTombstoneKey());
+ return APSInt(DenseMapInfo<APInt, void>::getTombstoneKey());
}
static unsigned getHashValue(const APSInt &Key) {
- return DenseMapInfo<APInt>::getHashValue(Key);
+ return DenseMapInfo<APInt, void>::getHashValue(Key);
}
static bool isEqual(const APSInt &LHS, const APSInt &RHS) {
diff --git a/llvm/include/llvm/ADT/ArrayRef.h b/llvm/include/llvm/ADT/ArrayRef.h
index 2df49223c987..61f85cfc812b 100644
--- a/llvm/include/llvm/ADT/ArrayRef.h
+++ b/llvm/include/llvm/ADT/ArrayRef.h
@@ -26,8 +26,6 @@
namespace llvm {
- template<typename T> struct DenseMapInfo;
-
/// ArrayRef - Represent a constant reference to an array (0 or more elements
/// consecutively in memory), i.e. a start pointer and a length. It allows
/// various APIs to take consecutive elements easily and conveniently.
@@ -572,7 +570,7 @@ namespace llvm {
}
// Provide DenseMapInfo for ArrayRefs.
- template <typename T> struct DenseMapInfo<ArrayRef<T>> {
+ template <typename T> struct DenseMapInfo<ArrayRef<T>, void> {
static inline ArrayRef<T> getEmptyKey() {
return ArrayRef<T>(
reinterpret_cast<const T *>(~static_cast<uintptr_t>(0)), size_t(0));
diff --git a/llvm/include/llvm/ADT/BitVector.h b/llvm/include/llvm/ADT/BitVector.h
index 31d388073633..cd1964cbdd98 100644
--- a/llvm/include/llvm/ADT/BitVector.h
+++ b/llvm/include/llvm/ADT/BitVector.h
@@ -85,7 +85,7 @@ class BitVector {
unsigned Size; // Size of bitvector in bits.
public:
- typedef unsigned size_type;
+ using size_type = unsigned;
// Encapsulation of a single bit.
class reference {
@@ -536,8 +536,8 @@ public:
[&Arg](auto const &BV) { return Arg.size() == BV; }) &&
"consistent sizes");
Out.resize(Arg.size());
- for (size_t i = 0, e = Arg.Bits.size(); i != e; ++i)
- Out.Bits[i] = f(Arg.Bits[i], Args.Bits[i]...);
+ for (size_type I = 0, E = Arg.Bits.size(); I != E; ++I)
+ Out.Bits[I] = f(Arg.Bits[I], Args.Bits[I]...);
Out.clear_unused_bits();
return Out;
}
@@ -545,16 +545,16 @@ public:
BitVector &operator|=(const BitVector &RHS) {
if (size() < RHS.size())
resize(RHS.size());
- for (size_t i = 0, e = RHS.Bits.size(); i != e; ++i)
- Bits[i] |= RHS.Bits[i];
+ for (size_type I = 0, E = RHS.Bits.size(); I != E; ++I)
+ Bits[I] |= RHS.Bits[I];
return *this;
}
BitVector &operator^=(const BitVector &RHS) {
if (size() < RHS.size())
resize(RHS.size());
- for (size_t i = 0, e = RHS.Bits.size(); i != e; ++i)
- Bits[i] ^= RHS.Bits[i];
+ for (size_type I = 0, E = RHS.Bits.size(); I != E; ++I)
+ Bits[I] ^= RHS.Bits[I];
return *this;
}
@@ -808,11 +808,11 @@ private:
public:
/// Return the size (in bytes) of the bit vector.
- size_t getMemorySize() const { return Bits.size() * sizeof(BitWord); }
- size_t getBitCapacity() const { return Bits.size() * BITWORD_SIZE; }
+ size_type getMemorySize() const { return Bits.size() * sizeof(BitWord); }
+ size_type getBitCapacity() const { return Bits.size() * BITWORD_SIZE; }
};
-inline size_t capacity_in_bytes(const BitVector &X) {
+inline BitVector::size_type capacity_in_bytes(const BitVector &X) {
return X.getMemorySize();
}
@@ -824,8 +824,8 @@ template <> struct DenseMapInfo<BitVector> {
return V;
}
static unsigned getHashValue(const BitVector &V) {
- return DenseMapInfo<std::pair<unsigned, ArrayRef<uintptr_t>>>::getHashValue(
- std::make_pair(V.size(), V.getData()));
+ return DenseMapInfo<std::pair<BitVector::size_type, ArrayRef<uintptr_t>>>::
+ getHashValue(std::make_pair(V.size(), V.getData()));
}
static bool isEqual(const BitVector &LHS, const BitVector &RHS) {
if (LHS.isInvalid() || RHS.isInvalid())
diff --git a/llvm/include/llvm/ADT/CombinationGenerator.h b/llvm/include/llvm/ADT/CombinationGenerator.h
new file mode 100644
index 000000000000..ab6afd555726
--- /dev/null
+++ b/llvm/include/llvm/ADT/CombinationGenerator.h
@@ -0,0 +1,148 @@
+//===-- llvm/ADT/CombinationGenerator.h ------------------------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Combination generator.
+///
+/// Example: given input {{0, 1}, {2}, {3, 4}} it will produce the following
+/// combinations: {0, 2, 3}, {0, 2, 4}, {1, 2, 3}, {1, 2, 4}.
+///
+/// It is useful to think of the input as a vector-of-vectors, where the
+/// outer vector is the variable space and the inner vector is the choice space.
+/// The number of choices for each variable can be different.
+///
+/// As for the implementation, it is useful to think of this as a weird number,
+/// where each digit (==variable) may have a different base (==number of choices).
+/// Thus modelling 'produce next combination' is exactly analogous to
+/// incrementing a number: increment the lowest digit (pick the next choice for
+/// the variable), and if it wrapped to the beginning then increment the next
+/// digit.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_COMBINATIONGENERATOR_H
+#define LLVM_ADT_COMBINATIONGENERATOR_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include <cassert>
+#include <cstring>
+
+namespace llvm {
+
+template <typename choice_type, typename choices_storage_type,
+ int variable_smallsize>
+class CombinationGenerator {
+ template <typename T> struct WrappingIterator {
+ using value_type = T;
+
+ const ArrayRef<value_type> Range;
+ typename decltype(Range)::const_iterator Position;
+
+ // Rewind the tape, placing the position to again point at the beginning.
+ void rewind() { Position = Range.begin(); }
+
+ // Advance position forward, possibly wrapping to the beginning.
+ // Returns whether the wrap happened.
+ bool advance() {
+ ++Position;
+ bool Wrapped = Position == Range.end();
+ if (Wrapped)
+ rewind();
+ return Wrapped;
+ }
+
+ // Get the value at which we are currently pointing.
+ const value_type &operator*() const { return *Position; }
+
+ WrappingIterator(ArrayRef<value_type> Range_) : Range(Range_) {
+ assert(!Range.empty() && "The range must not be empty.");
+ rewind();
+ }
+ };
+
+ const ArrayRef<choices_storage_type> VariablesChoices;
+
+ void performGeneration(
+ const function_ref<bool(ArrayRef<choice_type>)> Callback) const {
+ SmallVector<WrappingIterator<choice_type>, variable_smallsize>
+ VariablesState;
+
+    // 'increment' of the whole VariablesState is defined identically to the
+ // increment of a number: starting from the least significant element,
+ // increment it, and if it wrapped, then propagate that carry by also
+ // incrementing next (more significant) element.
+ auto IncrementState =
+ [](MutableArrayRef<WrappingIterator<choice_type>> VariablesState)
+ -> bool {
+ for (WrappingIterator<choice_type> &Variable :
+ llvm::reverse(VariablesState)) {
+ bool Wrapped = Variable.advance();
+ if (!Wrapped)
+ return false; // There you go, next combination is ready.
+        // We have a carry - increment the more significant variable next.
+ }
+ return true; // MSB variable wrapped, no more unique combinations.
+ };
+
+ // Initialize the per-variable state to refer to the possible choices for
+ // that variable.
+ VariablesState.reserve(VariablesChoices.size());
+ for (ArrayRef<choice_type> VC : VariablesChoices)
+ VariablesState.emplace_back(VC);
+
+ // Temporary buffer to store each combination before performing Callback.
+ SmallVector<choice_type, variable_smallsize> CurrentCombination;
+ CurrentCombination.resize(VariablesState.size());
+
+ while (true) {
+ // Gather the currently-selected variable choices into a vector.
+ for (auto I : llvm::zip(VariablesState, CurrentCombination))
+ std::get<1>(I) = *std::get<0>(I);
+ // And pass the new combination into callback, as intended.
+ if (/*Abort=*/Callback(CurrentCombination))
+ return;
+ // And tick the state to next combination, which will be unique.
+ if (IncrementState(VariablesState))
+ return; // All combinations produced.
+ }
+ };
+
+public:
+ CombinationGenerator(ArrayRef<choices_storage_type> VariablesChoices_)
+ : VariablesChoices(VariablesChoices_) {
+#ifndef NDEBUG
+ assert(!VariablesChoices.empty() && "There should be some variables.");
+ llvm::for_each(VariablesChoices, [](ArrayRef<choice_type> VariableChoices) {
+ assert(!VariableChoices.empty() &&
+ "There must always be some choice, at least a placeholder one.");
+ });
+#endif
+ }
+
+ // How many combinations can we produce, max?
+ // This is at most how many times the callback will be called.
+ size_t numCombinations() const {
+ size_t NumVariants = 1;
+ for (ArrayRef<choice_type> VariableChoices : VariablesChoices)
+ NumVariants *= VariableChoices.size();
+ assert(NumVariants >= 1 &&
+ "We should always end up producing at least one combination");
+ return NumVariants;
+ }
+
+ // Actually perform exhaustive combination generation.
+ // Each result will be passed into the callback.
+ void generate(const function_ref<bool(ArrayRef<choice_type>)> Callback) {
+ performGeneration(Callback);
+ }
+};
+
+} // namespace llvm
+
+#endif
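// For illustration, a usage sketch of the generator above; the element type,
// storage type, and choices are assumptions made for the example:
//   std::vector<std::vector<int>> Choices = {{0, 1}, {2}, {3, 4}};
//   CombinationGenerator<int, std::vector<int>, /*variable_smallsize=*/4> G(Choices);
//   G.generate([](ArrayRef<int> Combination) {
//     // Receives {0,2,3}, {0,2,4}, {1,2,3}, {1,2,4} in turn.
//     return false; // Returning true would abort generation early.
//   });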
diff --git a/llvm/include/llvm/ADT/DenseMapInfo.h b/llvm/include/llvm/ADT/DenseMapInfo.h
index d276acbfa6a6..75b7371a3683 100644
--- a/llvm/include/llvm/ADT/DenseMapInfo.h
+++ b/llvm/include/llvm/ADT/DenseMapInfo.h
@@ -13,10 +13,10 @@
#ifndef LLVM_ADT_DENSEMAPINFO_H
#define LLVM_ADT_DENSEMAPINFO_H
-#include "llvm/ADT/Hashing.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
+#include <tuple>
#include <utility>
namespace llvm {
@@ -39,7 +39,12 @@ static inline unsigned combineHashValue(unsigned a, unsigned b) {
} // end namespace detail
-template<typename T>
+/// An information struct used to provide DenseMap with the various necessary
+/// components for a given value type `T`. `Enable` is an optional additional
+/// parameter that is used to support SFINAE (generally using std::enable_if_t)
+/// in derived DenseMapInfo specializations; in non-SFINAE use cases this should
+/// just be `void`.
+template<typename T, typename Enable = void>
struct DenseMapInfo {
//static inline T getEmptyKey();
//static inline T getTombstoneKey();
@@ -282,13 +287,6 @@ template <typename... Ts> struct DenseMapInfo<std::tuple<Ts...>> {
}
};
-template <> struct DenseMapInfo<hash_code> {
- static inline hash_code getEmptyKey() { return hash_code(-1); }
- static inline hash_code getTombstoneKey() { return hash_code(-2); }
- static unsigned getHashValue(hash_code val) { return val; }
- static bool isEqual(hash_code LHS, hash_code RHS) { return LHS == RHS; }
-};
-
} // end namespace llvm
#endif // LLVM_ADT_DENSEMAPINFO_H
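// For illustration, the kind of SFINAE-keyed specialization the new `Enable`
// parameter permits; this trait for enums with a fixed underlying type is a
// hypothetical example (assumes <type_traits>), not part of the header:
//   template <typename T>
//   struct DenseMapInfo<T, std::enable_if_t<std::is_enum<T>::value>> {
//     using U = std::underlying_type_t<T>;
//     static inline T getEmptyKey() {
//       return static_cast<T>(DenseMapInfo<U>::getEmptyKey());
//     }
//     static inline T getTombstoneKey() {
//       return static_cast<T>(DenseMapInfo<U>::getTombstoneKey());
//     }
//     static unsigned getHashValue(const T &V) {
//       return DenseMapInfo<U>::getHashValue(static_cast<U>(V));
//     }
//     static bool isEqual(const T &L, const T &R) { return L == R; }
//   };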
diff --git a/llvm/include/llvm/ADT/EquivalenceClasses.h b/llvm/include/llvm/ADT/EquivalenceClasses.h
index 273b00f99d5d..de6bb3bca7e3 100644
--- a/llvm/include/llvm/ADT/EquivalenceClasses.h
+++ b/llvm/include/llvm/ADT/EquivalenceClasses.h
@@ -30,7 +30,8 @@ namespace llvm {
///
/// This implementation is an efficient implementation that only stores one copy
/// of the element being indexed per entry in the set, and allows any arbitrary
-/// type to be indexed (as long as it can be ordered with operator<).
+/// type to be indexed (as long as it can be ordered with operator< or a
+/// comparator is provided).
///
/// Here is a simple example using integers:
///
@@ -54,7 +55,7 @@ namespace llvm {
/// 4
/// 5 1 2
///
-template <class ElemTy>
+template <class ElemTy, class Compare = std::less<ElemTy>>
class EquivalenceClasses {
/// ECValue - The EquivalenceClasses data structure is just a set of these.
/// Each of these represents a relation for a value. First it stores the
@@ -101,22 +102,40 @@ class EquivalenceClasses {
assert(RHS.isLeader() && RHS.getNext() == nullptr && "Not a singleton!");
}
- bool operator<(const ECValue &UFN) const { return Data < UFN.Data; }
-
bool isLeader() const { return (intptr_t)Next & 1; }
const ElemTy &getData() const { return Data; }
const ECValue *getNext() const {
return (ECValue*)((intptr_t)Next & ~(intptr_t)1);
}
+ };
+
+ /// A wrapper of the comparator, to be passed to the set.
+ struct ECValueComparator {
+ using is_transparent = void;
+
+ ECValueComparator() : compare(Compare()) {}
+
+ bool operator()(const ECValue &lhs, const ECValue &rhs) const {
+ return compare(lhs.Data, rhs.Data);
+ }
+
+ template <typename T>
+ bool operator()(const T &lhs, const ECValue &rhs) const {
+ return compare(lhs, rhs.Data);
+ }
+
+ template <typename T>
+ bool operator()(const ECValue &lhs, const T &rhs) const {
+ return compare(lhs.Data, rhs);
+ }
- template<typename T>
- bool operator<(const T &Val) const { return Data < Val; }
+ const Compare compare;
};
/// TheMapping - This implicitly provides a mapping from ElemTy values to the
/// ECValues, it just keeps the key as part of the value.
- std::set<ECValue> TheMapping;
+ std::set<ECValue, ECValueComparator> TheMapping;
public:
EquivalenceClasses() = default;
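  // For illustration, the new comparator parameter in use (types chosen for the
  // example only):
  //   EquivalenceClasses<int, std::greater<int>> EC;
  //   EC.unionSets(1, 2);
  //   EC.unionSets(2, 3);   // 1, 2 and 3 now share one equivalence class;
  //                         // the underlying set is ordered by std::greater.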
diff --git a/llvm/include/llvm/ADT/FunctionExtras.h b/llvm/include/llvm/ADT/FunctionExtras.h
index e67ef7377c88..5a37417ddde5 100644
--- a/llvm/include/llvm/ADT/FunctionExtras.h
+++ b/llvm/include/llvm/ADT/FunctionExtras.h
@@ -37,6 +37,7 @@
#include "llvm/ADT/STLForwardCompat.h"
#include "llvm/Support/MemAlloc.h"
#include "llvm/Support/type_traits.h"
+#include <cstring>
#include <memory>
#include <type_traits>
@@ -64,11 +65,16 @@ template <typename CallableT, typename ThisT>
using EnableUnlessSameType =
std::enable_if_t<!std::is_same<remove_cvref_t<CallableT>, ThisT>::value>;
template <typename CallableT, typename Ret, typename... Params>
-using EnableIfCallable =
- std::enable_if_t<std::is_void<Ret>::value ||
- std::is_convertible<decltype(std::declval<CallableT>()(
- std::declval<Params>()...)),
- Ret>::value>;
+using EnableIfCallable = std::enable_if_t<llvm::disjunction<
+ std::is_void<Ret>,
+ std::is_same<decltype(std::declval<CallableT>()(std::declval<Params>()...)),
+ Ret>,
+ std::is_same<const decltype(std::declval<CallableT>()(
+ std::declval<Params>()...)),
+ Ret>,
+ std::is_convertible<decltype(std::declval<CallableT>()(
+ std::declval<Params>()...)),
+ Ret>>::value>;
template <typename ReturnT, typename... ParamTs> class UniqueFunctionBase {
protected:
diff --git a/llvm/include/llvm/ADT/Hashing.h b/llvm/include/llvm/ADT/Hashing.h
index e296c1c53ebd..74a87a3d8dbb 100644
--- a/llvm/include/llvm/ADT/Hashing.h
+++ b/llvm/include/llvm/ADT/Hashing.h
@@ -56,6 +56,7 @@
#include <utility>
namespace llvm {
+template <typename T, typename Enable> struct DenseMapInfo;
/// An opaque object representing a hash code.
///
@@ -677,6 +678,13 @@ hash_code hash_value(const std::basic_string<T> &arg) {
return hash_combine_range(arg.begin(), arg.end());
}
+template <> struct DenseMapInfo<hash_code, void> {
+ static inline hash_code getEmptyKey() { return hash_code(-1); }
+ static inline hash_code getTombstoneKey() { return hash_code(-2); }
+ static unsigned getHashValue(hash_code val) { return val; }
+ static bool isEqual(hash_code LHS, hash_code RHS) { return LHS == RHS; }
+};
+
} // namespace llvm
#endif
diff --git a/llvm/include/llvm/ADT/ImmutableList.h b/llvm/include/llvm/ADT/ImmutableList.h
index c9ee494734e7..cf27c5a16d28 100644
--- a/llvm/include/llvm/ADT/ImmutableList.h
+++ b/llvm/include/llvm/ADT/ImmutableList.h
@@ -220,8 +220,7 @@ public:
// Partially-specialized Traits.
//===----------------------------------------------------------------------===//
-template<typename T> struct DenseMapInfo;
-template<typename T> struct DenseMapInfo<ImmutableList<T>> {
+template <typename T> struct DenseMapInfo<ImmutableList<T>, void> {
static inline ImmutableList<T> getEmptyKey() {
return reinterpret_cast<ImmutableListImpl<T>*>(-1);
}
diff --git a/llvm/include/llvm/ADT/IntervalMap.h b/llvm/include/llvm/ADT/IntervalMap.h
index 26a7ed0cd333..3c107a3622a9 100644
--- a/llvm/include/llvm/ADT/IntervalMap.h
+++ b/llvm/include/llvm/ADT/IntervalMap.h
@@ -1137,7 +1137,7 @@ public:
/// overlaps(a, b) - Return true if the intervals in this map overlap with the
/// interval [a;b].
- bool overlaps(KeyT a, KeyT b) {
+ bool overlaps(KeyT a, KeyT b) const {
assert(Traits::nonEmpty(a, b));
const_iterator I = find(a);
if (!I.valid())
diff --git a/llvm/include/llvm/ADT/MapVector.h b/llvm/include/llvm/ADT/MapVector.h
index 1de1124f4ea2..f9540999381a 100644
--- a/llvm/include/llvm/ADT/MapVector.h
+++ b/llvm/include/llvm/ADT/MapVector.h
@@ -43,6 +43,7 @@ class MapVector {
"The mapped_type of the specified Map must be an integral type");
public:
+ using key_type = KeyT;
using value_type = typename VectorType::value_type;
using size_type = typename VectorType::size_type;
diff --git a/llvm/include/llvm/ADT/PointerIntPair.h b/llvm/include/llvm/ADT/PointerIntPair.h
index cb8b202c48b7..393ace6b70fc 100644
--- a/llvm/include/llvm/ADT/PointerIntPair.h
+++ b/llvm/include/llvm/ADT/PointerIntPair.h
@@ -22,7 +22,7 @@
namespace llvm {
-template <typename T> struct DenseMapInfo;
+template <typename T, typename Enable> struct DenseMapInfo;
template <typename PointerT, unsigned IntBits, typename PtrTraits>
struct PointerIntPairInfo;
@@ -192,7 +192,7 @@ struct PointerIntPairInfo {
// Provide specialization of DenseMapInfo for PointerIntPair.
template <typename PointerTy, unsigned IntBits, typename IntType>
-struct DenseMapInfo<PointerIntPair<PointerTy, IntBits, IntType>> {
+struct DenseMapInfo<PointerIntPair<PointerTy, IntBits, IntType>, void> {
using Ty = PointerIntPair<PointerTy, IntBits, IntType>;
static Ty getEmptyKey() {
diff --git a/llvm/include/llvm/ADT/PointerUnion.h b/llvm/include/llvm/ADT/PointerUnion.h
index c39691061b72..0874f67db3fe 100644
--- a/llvm/include/llvm/ADT/PointerUnion.h
+++ b/llvm/include/llvm/ADT/PointerUnion.h
@@ -17,42 +17,13 @@
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/Support/PointerLikeTypeTraits.h"
+#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
namespace llvm {
-template <typename T> struct PointerUnionTypeSelectorReturn {
- using Return = T;
-};
-
-/// Get a type based on whether two types are the same or not.
-///
-/// For:
-///
-/// \code
-/// using Ret = typename PointerUnionTypeSelector<T1, T2, EQ, NE>::Return;
-/// \endcode
-///
-/// Ret will be EQ type if T1 is same as T2 or NE type otherwise.
-template <typename T1, typename T2, typename RET_EQ, typename RET_NE>
-struct PointerUnionTypeSelector {
- using Return = typename PointerUnionTypeSelectorReturn<RET_NE>::Return;
-};
-
-template <typename T, typename RET_EQ, typename RET_NE>
-struct PointerUnionTypeSelector<T, T, RET_EQ, RET_NE> {
- using Return = typename PointerUnionTypeSelectorReturn<RET_EQ>::Return;
-};
-
-template <typename T1, typename T2, typename RET_EQ, typename RET_NE>
-struct PointerUnionTypeSelectorReturn<
- PointerUnionTypeSelector<T1, T2, RET_EQ, RET_NE>> {
- using Return =
- typename PointerUnionTypeSelector<T1, T2, RET_EQ, RET_NE>::Return;
-};
-
namespace pointer_union_detail {
/// Determine the number of bits required to store integers with values < n.
/// This is ceil(log2(n)).
diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h
index eb001346b609..48f15b02283a 100644
--- a/llvm/include/llvm/ADT/STLExtras.h
+++ b/llvm/include/llvm/ADT/STLExtras.h
@@ -272,20 +272,24 @@ template <typename T> auto drop_begin(T &&RangeOrContainer, size_t N = 1) {
// be applied whenever operator* is invoked on the iterator.
template <typename ItTy, typename FuncTy,
- typename FuncReturnTy =
- decltype(std::declval<FuncTy>()(*std::declval<ItTy>()))>
+ typename ReferenceTy =
+ decltype(std::declval<FuncTy>()(*std::declval<ItTy>()))>
class mapped_iterator
: public iterator_adaptor_base<
- mapped_iterator<ItTy, FuncTy>, ItTy,
- typename std::iterator_traits<ItTy>::iterator_category,
- typename std::remove_reference<FuncReturnTy>::type> {
+ mapped_iterator<ItTy, FuncTy>, ItTy,
+ typename std::iterator_traits<ItTy>::iterator_category,
+ std::remove_reference_t<ReferenceTy>,
+ typename std::iterator_traits<ItTy>::difference_type,
+ std::remove_reference_t<ReferenceTy> *, ReferenceTy> {
public:
mapped_iterator(ItTy U, FuncTy F)
: mapped_iterator::iterator_adaptor_base(std::move(U)), F(std::move(F)) {}
ItTy getCurrent() { return this->I; }
- FuncReturnTy operator*() const { return F(*this->I); }
+ const FuncTy &getFunction() const { return F; }
+
+ ReferenceTy operator*() const { return F(*this->I); }
private:
FuncTy F;
@@ -303,6 +307,32 @@ auto map_range(ContainerTy &&C, FuncTy F) {
return make_range(map_iterator(C.begin(), F), map_iterator(C.end(), F));
}
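// For illustration (container and callable are example choices):
//   std::vector<int> V = {1, 2, 3};
//   int Sum = 0;
//   for (int Doubled : map_range(V, [](int X) { return X * 2; }))
//     Sum += Doubled;   // Sum becomes 12; no new container is materialized.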
+/// A base type for mapped iterators that is useful for building derived
+/// iterators that do not need/want to store the map function (as in
+/// mapped_iterator). These iterators must simply provide a `mapElement` method
+/// that defines how to map a value of the iterator to the provided reference
+/// type.
+template <typename DerivedT, typename ItTy, typename ReferenceTy>
+class mapped_iterator_base
+ : public iterator_adaptor_base<
+ DerivedT, ItTy,
+ typename std::iterator_traits<ItTy>::iterator_category,
+ std::remove_reference_t<ReferenceTy>,
+ typename std::iterator_traits<ItTy>::difference_type,
+ std::remove_reference_t<ReferenceTy> *, ReferenceTy> {
+public:
+ using BaseT = mapped_iterator_base;
+
+ mapped_iterator_base(ItTy U)
+ : mapped_iterator_base::iterator_adaptor_base(std::move(U)) {}
+
+ ItTy getCurrent() { return this->I; }
+
+ ReferenceTy operator*() const {
+ return static_cast<const DerivedT &>(*this).mapElement(*this->I);
+ }
+};
+
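// For illustration, a derived iterator that needs no stored callable; the name
// and element type are hypothetical:
//   struct FirstIter
//       : mapped_iterator_base<FirstIter,
//                              std::vector<std::pair<int, int>>::const_iterator,
//                              const int &> {
//     using BaseT::BaseT; // Reuse the wrapping constructor.
//     const int &mapElement(const std::pair<int, int> &P) const {
//       return P.first;
//     }
//   };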
/// Helper to determine if type T has a member called rbegin().
template <typename Ty> class has_rbegin_impl {
using yes = char[1];
@@ -371,12 +401,7 @@ class filter_iterator_base
typename std::common_type<
IterTag, typename std::iterator_traits<
WrappedIteratorT>::iterator_category>::type> {
- using BaseT = iterator_adaptor_base<
- filter_iterator_base<WrappedIteratorT, PredicateT, IterTag>,
- WrappedIteratorT,
- typename std::common_type<
- IterTag, typename std::iterator_traits<
- WrappedIteratorT>::iterator_category>::type>;
+ using BaseT = typename filter_iterator_base::iterator_adaptor_base;
protected:
WrappedIteratorT End;
@@ -411,12 +436,10 @@ template <typename WrappedIteratorT, typename PredicateT,
typename IterTag = std::forward_iterator_tag>
class filter_iterator_impl
: public filter_iterator_base<WrappedIteratorT, PredicateT, IterTag> {
- using BaseT = filter_iterator_base<WrappedIteratorT, PredicateT, IterTag>;
-
public:
filter_iterator_impl(WrappedIteratorT Begin, WrappedIteratorT End,
PredicateT Pred)
- : BaseT(Begin, End, Pred) {}
+ : filter_iterator_impl::filter_iterator_base(Begin, End, Pred) {}
};
/// Specialization of filter_iterator_base for bidirectional iteration.
@@ -425,8 +448,8 @@ class filter_iterator_impl<WrappedIteratorT, PredicateT,
std::bidirectional_iterator_tag>
: public filter_iterator_base<WrappedIteratorT, PredicateT,
std::bidirectional_iterator_tag> {
- using BaseT = filter_iterator_base<WrappedIteratorT, PredicateT,
- std::bidirectional_iterator_tag>;
+ using BaseT = typename filter_iterator_impl::filter_iterator_base;
+
void findPrevValid() {
while (!this->Pred(*this->I))
BaseT::operator--();
@@ -514,9 +537,7 @@ template <typename WrappedIteratorT>
class early_inc_iterator_impl
: public iterator_adaptor_base<early_inc_iterator_impl<WrappedIteratorT>,
WrappedIteratorT, std::input_iterator_tag> {
- using BaseT =
- iterator_adaptor_base<early_inc_iterator_impl<WrappedIteratorT>,
- WrappedIteratorT, std::input_iterator_tag>;
+ using BaseT = typename early_inc_iterator_impl::iterator_adaptor_base;
using PointerT = typename std::iterator_traits<WrappedIteratorT>::pointer;
@@ -630,12 +651,18 @@ protected:
return std::tuple<Iters...>(std::prev(std::get<Ns>(iterators))...);
}
+ template <size_t... Ns>
+ bool test_all_equals(const zip_common &other,
+ std::index_sequence<Ns...>) const {
+ return all_of(std::initializer_list<bool>{std::get<Ns>(this->iterators) ==
+ std::get<Ns>(other.iterators)...},
+ identity<bool>{});
+ }
+
public:
zip_common(Iters &&... ts) : iterators(std::forward<Iters>(ts)...) {}
- value_type operator*() { return deref(std::index_sequence_for<Iters...>{}); }
-
- const value_type operator*() const {
+ value_type operator*() const {
return deref(std::index_sequence_for<Iters...>{});
}
@@ -650,6 +677,11 @@ public:
iterators = tup_dec(std::index_sequence_for<Iters...>{});
return *reinterpret_cast<ZipType *>(this);
}
+
+ /// Return true if all the iterators match `other`'s iterators.
+ bool all_equals(zip_common &other) {
+ return test_all_equals(other, std::index_sequence_for<Iters...>{});
+ }
};
template <typename... Iters>
@@ -801,8 +833,6 @@ public:
: iterators(std::forward<Iters>(ts.first)...),
end_iterators(std::forward<Iters>(ts.second)...) {}
- value_type operator*() { return deref(std::index_sequence_for<Iters...>{}); }
-
value_type operator*() const {
return deref(std::index_sequence_for<Iters...>{});
}
@@ -1073,8 +1103,7 @@ template <typename DerivedT, typename BaseT, typename T,
typename PointerT = T *, typename ReferenceT = T &>
class indexed_accessor_range_base {
public:
- using RangeBaseT =
- indexed_accessor_range_base<DerivedT, BaseT, T, PointerT, ReferenceT>;
+ using RangeBaseT = indexed_accessor_range_base;
/// An iterator element of this range.
class iterator : public indexed_accessor_iterator<iterator, BaseT, T,
@@ -1087,8 +1116,7 @@ public:
private:
iterator(BaseT owner, ptrdiff_t curIndex)
- : indexed_accessor_iterator<iterator, BaseT, T, PointerT, ReferenceT>(
- owner, curIndex) {}
+ : iterator::indexed_accessor_iterator(owner, curIndex) {}
/// Allow access to the constructor.
friend indexed_accessor_range_base<DerivedT, BaseT, T, PointerT,
@@ -1234,20 +1262,39 @@ public:
}
};
+namespace detail {
+/// Return a reference to the first or second member of a reference. Otherwise,
+/// return a copy of the member of a temporary.
+///
+/// When passing a range whose iterators return values instead of references,
+/// the reference must be dropped from `decltype((elt.first))`, which will
+/// always be a reference, to avoid returning a reference to a temporary.
+template <typename EltTy, typename FirstTy> class first_or_second_type {
+public:
+ using type =
+ typename std::conditional_t<std::is_reference<EltTy>::value, FirstTy,
+ std::remove_reference_t<FirstTy>>;
+};
+} // end namespace detail
+
/// Given a container of pairs, return a range over the first elements.
template <typename ContainerTy> auto make_first_range(ContainerTy &&c) {
- return llvm::map_range(
- std::forward<ContainerTy>(c),
- [](decltype((*std::begin(c))) elt) -> decltype((elt.first)) {
- return elt.first;
- });
+ using EltTy = decltype((*std::begin(c)));
+ return llvm::map_range(std::forward<ContainerTy>(c),
+ [](EltTy elt) -> typename detail::first_or_second_type<
+ EltTy, decltype((elt.first))>::type {
+ return elt.first;
+ });
}
/// Given a container of pairs, return a range over the second elements.
template <typename ContainerTy> auto make_second_range(ContainerTy &&c) {
+ using EltTy = decltype((*std::begin(c)));
return llvm::map_range(
std::forward<ContainerTy>(c),
- [](decltype((*std::begin(c))) elt) -> decltype((elt.second)) {
+ [](EltTy elt) ->
+ typename detail::first_or_second_type<EltTy,
+ decltype((elt.second))>::type {
return elt.second;
});
}
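// For illustration (an example pair container, not taken from the header):
//   std::vector<std::pair<int, StringRef>> Pairs = {{1, "a"}, {2, "b"}};
//   for (int Key : make_first_range(Pairs)) { /* visits 1, then 2 */ }
//   for (StringRef Value : make_second_range(Pairs)) { /* visits "a", "b" */ }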
@@ -1260,7 +1307,7 @@ template <typename ContainerTy> auto make_second_range(ContainerTy &&c) {
/// compares less than the first component of another std::pair.
struct less_first {
template <typename T> bool operator()(const T &lhs, const T &rhs) const {
- return lhs.first < rhs.first;
+ return std::less<>()(lhs.first, rhs.first);
}
};
@@ -1268,7 +1315,7 @@ struct less_first {
/// compares less than the second component of another std::pair.
struct less_second {
template <typename T> bool operator()(const T &lhs, const T &rhs) const {
- return lhs.second < rhs.second;
+ return std::less<>()(lhs.second, rhs.second);
}
};
@@ -1877,8 +1924,7 @@ template <typename R> struct result_pair {
}
std::size_t index() const { return Index; }
- const value_reference value() const { return *Iter; }
- value_reference value() { return *Iter; }
+ value_reference value() const { return *Iter; }
private:
std::size_t Index = std::numeric_limits<std::size_t>::max();
@@ -1887,11 +1933,8 @@ private:
template <typename R>
class enumerator_iter
- : public iterator_facade_base<
- enumerator_iter<R>, std::forward_iterator_tag, result_pair<R>,
- typename std::iterator_traits<IterOfRange<R>>::difference_type,
- typename std::iterator_traits<IterOfRange<R>>::pointer,
- typename std::iterator_traits<IterOfRange<R>>::reference> {
+ : public iterator_facade_base<enumerator_iter<R>, std::forward_iterator_tag,
+ const result_pair<R>> {
using result_type = result_pair<R>;
public:
@@ -1901,7 +1944,6 @@ public:
enumerator_iter(std::size_t Index, IterOfRange<R> Iter)
: Result(Index, Iter) {}
- result_type &operator*() { return Result; }
const result_type &operator*() const { return Result; }
enumerator_iter &operator++() {
@@ -1986,6 +2028,45 @@ decltype(auto) apply_tuple(F &&f, Tuple &&t) {
Indices{});
}
+namespace detail {
+
+template <typename Predicate, typename... Args>
+bool all_of_zip_predicate_first(Predicate &&P, Args &&...args) {
+ auto z = zip(args...);
+ auto it = z.begin();
+ auto end = z.end();
+ while (it != end) {
+ if (!apply_tuple([&](auto &&...args) { return P(args...); }, *it))
+ return false;
+ ++it;
+ }
+ return it.all_equals(end);
+}
+
+// Just an adaptor to switch the order of arguments and have the predicate before
+// the zipped inputs.
+template <typename... ArgsThenPredicate, size_t... InputIndexes>
+bool all_of_zip_predicate_last(
+ std::tuple<ArgsThenPredicate...> argsThenPredicate,
+ std::index_sequence<InputIndexes...>) {
+ auto constexpr OutputIndex =
+ std::tuple_size<decltype(argsThenPredicate)>::value - 1;
+ return all_of_zip_predicate_first(std::get<OutputIndex>(argsThenPredicate),
+ std::get<InputIndexes>(argsThenPredicate)...);
+}
+
+} // end namespace detail
+
+/// Compare two zipped ranges using the provided predicate (as last argument).
+/// Return true if all elements satisfy the predicate and false otherwise.
+/// Return false if the zipped iterators aren't all at the end (size mismatch).
+template <typename... ArgsAndPredicate>
+bool all_of_zip(ArgsAndPredicate &&...argsAndPredicate) {
+ return detail::all_of_zip_predicate_last(
+ std::forward_as_tuple(argsAndPredicate...),
+ std::make_index_sequence<sizeof...(argsAndPredicate) - 1>{});
+}
+
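// For illustration (containers and predicate are example choices):
//   SmallVector<int> A = {1, 2, 3}, B = {1, 2, 3};
//   bool Same = all_of_zip(A, B, [](int L, int R) { return L == R; });
//   // Same is true; a size mismatch or any failing element yields false.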
/// Return true if the sequence [Begin, End) has exactly N items. Runs in O(N)
/// time. Not meant for use with random-access iterators.
/// Can optionally take a predicate to filter lazily some items.
diff --git a/llvm/include/llvm/ADT/Sequence.h b/llvm/include/llvm/ADT/Sequence.h
index 3e4bf0932222..fdbf397984d0 100644
--- a/llvm/include/llvm/ADT/Sequence.h
+++ b/llvm/include/llvm/ADT/Sequence.h
@@ -6,9 +6,74 @@
//
//===----------------------------------------------------------------------===//
/// \file
-/// This routine provides some synthesis utilities to produce sequences of
-/// values. The names are intentionally kept very short as they tend to occur
-/// in common and widely used contexts.
+/// Provides some synthesis utilities to produce sequences of values. The names
+/// are intentionally kept very short as they tend to occur in common and
+/// widely used contexts.
+///
+/// The `seq(A, B)` function produces a sequence of values from `A` up to
+/// (but not including) `B`, i.e., [`A`, `B`), that can be safely iterated over.
+/// `seq` supports both integral (e.g., `int`, `char`, `uint32_t`) and enum
+/// types. `seq_inclusive(A, B)` produces a sequence of values from `A` to `B`,
+/// including `B`.
+///
+/// Examples with integral types:
+/// ```
+/// for (int x : seq(0, 3))
+/// outs() << x << " ";
+/// ```
+///
+/// Prints: `0 1 2 `.
+///
+/// ```
+/// for (int x : seq_inclusive(0, 3))
+/// outs() << x << " ";
+/// ```
+///
+/// Prints: `0 1 2 3 `.
+///
+/// Similar to `seq` and `seq_inclusive`, the `enum_seq` and
+/// `enum_seq_inclusive` functions produce sequences of enum values that can be
+/// iterated over.
+/// To enable iteration with enum types, you need to either mark enums as safe
+/// to iterate on by specializing `enum_iteration_traits`, or opt into
+/// potentially unsafe iteration at every callsite by passing
+/// `force_iteration_on_noniterable_enum`.
+///
+/// Examples with enum types:
+/// ```
+/// namespace X {
+/// enum class MyEnum : unsigned {A = 0, B, C};
+/// } // namespace X
+///
+/// template <> struct enum_iteration_traits<X::MyEnum> {
+/// static constexpr bool is_iterable = true;
+/// };
+///
+/// class MyClass {
+/// public:
+/// enum Safe { D = 3, E, F };
+/// enum MaybeUnsafe { G = 1, H = 2, I = 4 };
+/// };
+///
+/// template <> struct enum_iteration_traits<MyClass::Safe> {
+/// static constexpr bool is_iterable = true;
+/// };
+/// ```
+///
+/// ```
+/// for (auto v : enum_seq(MyClass::Safe::D, MyClass::Safe::F))
+/// outs() << int(v) << " ";
+/// ```
+///
+/// Prints: `3 4 `.
+///
+/// ```
+/// for (auto v : enum_seq(MyClass::MaybeUnsafe::H, MyClass::MaybeUnsafe::I,
+/// force_iteration_on_noniterable_enum))
+/// outs() << int(v) << " ";
+/// ```
+///
+/// Prints: `2 3 `.
///
//===----------------------------------------------------------------------===//
@@ -18,12 +83,31 @@
#include <cassert> // assert
#include <iterator> // std::random_access_iterator_tag
#include <limits> // std::numeric_limits
-#include <type_traits> // std::underlying_type, std::is_enum
+#include <type_traits> // std::is_integral, std::is_enum, std::underlying_type,
+ // std::enable_if
#include "llvm/Support/MathExtras.h" // AddOverflow / SubOverflow
namespace llvm {
+// Enum traits that mark enums as safe or unsafe to iterate over.
+// By default, enum types are *not* considered safe for iteration.
+// To allow iteration for your enum type, provide a specialization with
+// `is_iterable` set to `true` in the `llvm` namespace.
+// Alternatively, you can pass the `force_iteration_on_noniterable_enum` tag
+// to `enum_seq` or `enum_seq_inclusive`.
+template <typename EnumT> struct enum_iteration_traits {
+ static constexpr bool is_iterable = false;
+};
+
+struct force_iteration_on_noniterable_enum_t {
+ explicit force_iteration_on_noniterable_enum_t() = default;
+};
+
+// TODO: Make this `inline` once we update to C++17 to avoid ODR violations.
+constexpr force_iteration_on_noniterable_enum_t
+ force_iteration_on_noniterable_enum;
+
namespace detail {
// Returns whether a value of type U can be represented with type T.
@@ -213,27 +297,81 @@ private:
iterator PastEndValue;
};
-/// Iterate over an integral/enum type from Begin up to - but not including -
-/// End.
-/// Note on enum iteration: `seq` will generate each consecutive value, even if
-/// no enumerator with that value exists.
+/// Iterate over an integral type from Begin up to - but not including - End.
/// Note: Begin and End values have to be within [INTMAX_MIN, INTMAX_MAX] for
/// forward iteration (resp. [INTMAX_MIN + 1, INTMAX_MAX] for reverse
/// iteration).
-template <typename T> auto seq(T Begin, T End) {
+template <typename T, typename = std::enable_if_t<std::is_integral<T>::value &&
+ !std::is_enum<T>::value>>
+auto seq(T Begin, T End) {
return iota_range<T>(Begin, End, false);
}
-/// Iterate over an integral/enum type from Begin to End inclusive.
-/// Note on enum iteration: `seq_inclusive` will generate each consecutive
-/// value, even if no enumerator with that value exists.
+/// Iterate over an integral type from Begin to End inclusive.
/// Note: Begin and End values have to be within [INTMAX_MIN, INTMAX_MAX - 1]
/// for forward iteration (resp. [INTMAX_MIN + 1, INTMAX_MAX - 1] for reverse
/// iteration).
-template <typename T> auto seq_inclusive(T Begin, T End) {
+template <typename T, typename = std::enable_if_t<std::is_integral<T>::value &&
+ !std::is_enum<T>::value>>
+auto seq_inclusive(T Begin, T End) {
return iota_range<T>(Begin, End, true);
}
+/// Iterate over an enum type from Begin up to - but not including - End.
+/// Note: `enum_seq` will generate each consecutive value, even if no
+/// enumerator with that value exists.
+/// Note: Begin and End values have to be within [INTMAX_MIN, INTMAX_MAX] for
+/// forward iteration (resp. [INTMAX_MIN + 1, INTMAX_MAX] for reverse
+/// iteration).
+template <typename EnumT,
+ typename = std::enable_if_t<std::is_enum<EnumT>::value>>
+auto enum_seq(EnumT Begin, EnumT End) {
+ static_assert(enum_iteration_traits<EnumT>::is_iterable,
+ "Enum type is not marked as iterable.");
+ return iota_range<EnumT>(Begin, End, false);
+}
+
+/// Iterate over an enum type from Begin up to - but not including - End, even
+/// when `EnumT` is not marked as safely iterable by `enum_iteration_traits`.
+/// Note: `enum_seq` will generate each consecutive value, even if no
+/// enumerator with that value exists.
+/// Note: Begin and End values have to be within [INTMAX_MIN, INTMAX_MAX] for
+/// forward iteration (resp. [INTMAX_MIN + 1, INTMAX_MAX] for reverse
+/// iteration).
+template <typename EnumT,
+ typename = std::enable_if_t<std::is_enum<EnumT>::value>>
+auto enum_seq(EnumT Begin, EnumT End, force_iteration_on_noniterable_enum_t) {
+ return iota_range<EnumT>(Begin, End, false);
+}
+
+/// Iterate over an enum type from Begin to End inclusive.
+/// Note: `enum_seq_inclusive` will generate each consecutive value, even if no
+/// enumerator with that value exists.
+/// Note: Begin and End values have to be within [INTMAX_MIN, INTMAX_MAX - 1]
+/// for forward iteration (resp. [INTMAX_MIN + 1, INTMAX_MAX - 1] for reverse
+/// iteration).
+template <typename EnumT,
+ typename = std::enable_if_t<std::is_enum<EnumT>::value>>
+auto enum_seq_inclusive(EnumT Begin, EnumT End) {
+ static_assert(enum_iteration_traits<EnumT>::is_iterable,
+ "Enum type is not marked as iterable.");
+ return iota_range<EnumT>(Begin, End, true);
+}
+
+/// Iterate over an enum type from Begin to End inclusive, even when `EnumT`
+/// is not marked as safely iterable by `enum_iteration_traits`.
+/// Note: `enum_seq_inclusive` will generate each consecutive value, even if no
+/// enumerator with that value exists.
+/// Note: Begin and End values have to be within [INTMAX_MIN, INTMAX_MAX - 1]
+/// for forward iteration (resp. [INTMAX_MIN + 1, INTMAX_MAX - 1] for reverse
+/// iteration).
+template <typename EnumT,
+ typename = std::enable_if_t<std::is_enum<EnumT>::value>>
+auto enum_seq_inclusive(EnumT Begin, EnumT End,
+ force_iteration_on_noniterable_enum_t) {
+ return iota_range<EnumT>(Begin, End, true);
+}
+
} // end namespace llvm
#endif // LLVM_ADT_SEQUENCE_H
diff --git a/llvm/include/llvm/ADT/SetOperations.h b/llvm/include/llvm/ADT/SetOperations.h
index 62f1d26dc1c2..3e30b6bb83d3 100644
--- a/llvm/include/llvm/ADT/SetOperations.h
+++ b/llvm/include/llvm/ADT/SetOperations.h
@@ -77,15 +77,6 @@ bool set_is_subset(const S1Ty &S1, const S2Ty &S2) {
return true;
}
-/// set_is_strict_subset(A, B) - Return true iff A in B and and A != B
-///
-template <class S1Ty, class S2Ty>
-bool set_is_strict_subset(const S1Ty &S1, const S2Ty &S2) {
- if (S1.size() >= S2.size())
- return false;
- return set_is_subset(S1, S2);
-}
-
} // End llvm namespace
#endif
diff --git a/llvm/include/llvm/ADT/SmallBitVector.h b/llvm/include/llvm/ADT/SmallBitVector.h
index f570bac23ad5..51ee5dbbce05 100644
--- a/llvm/include/llvm/ADT/SmallBitVector.h
+++ b/llvm/include/llvm/ADT/SmallBitVector.h
@@ -60,7 +60,7 @@ class SmallBitVector {
"Unsupported word size");
public:
- using size_type = unsigned;
+ using size_type = uintptr_t;
// Encapsulation of a single bit.
class reference {
@@ -96,7 +96,7 @@ private:
return reinterpret_cast<BitVector *>(X);
}
- void switchToSmall(uintptr_t NewSmallBits, size_t NewSize) {
+ void switchToSmall(uintptr_t NewSmallBits, size_type NewSize) {
X = 1;
setSmallSize(NewSize);
setSmallBits(NewSmallBits);
@@ -120,9 +120,11 @@ private:
}
// Return the size.
- size_t getSmallSize() const { return getSmallRawBits() >> SmallNumDataBits; }
+ size_type getSmallSize() const {
+ return getSmallRawBits() >> SmallNumDataBits;
+ }
- void setSmallSize(size_t Size) {
+ void setSmallSize(size_type Size) {
setSmallRawBits(getSmallBits() | (Size << SmallNumDataBits));
}
@@ -189,7 +191,7 @@ public:
}
/// Returns the number of bits in this bitvector.
- size_t size() const {
+ size_type size() const {
return isSmall() ? getSmallSize() : getPointer()->size();
}
@@ -336,8 +338,8 @@ public:
} else {
BitVector *BV = new BitVector(N, t);
uintptr_t OldBits = getSmallBits();
- for (size_t i = 0, e = getSmallSize(); i != e; ++i)
- (*BV)[i] = (OldBits >> i) & 1;
+ for (size_type I = 0, E = getSmallSize(); I != E; ++I)
+ (*BV)[I] = (OldBits >> I) & 1;
switchToLarge(BV);
}
}
@@ -346,11 +348,11 @@ public:
if (isSmall()) {
if (N > SmallNumDataBits) {
uintptr_t OldBits = getSmallRawBits();
- size_t SmallSize = getSmallSize();
+ size_type SmallSize = getSmallSize();
BitVector *BV = new BitVector(SmallSize);
- for (size_t i = 0; i < SmallSize; ++i)
- if ((OldBits >> i) & 1)
- BV->set(i);
+ for (size_type I = 0; I < SmallSize; ++I)
+ if ((OldBits >> I) & 1)
+ BV->set(I);
BV->reserve(N);
switchToLarge(BV);
}
@@ -491,8 +493,8 @@ public:
else if (!isSmall() && !RHS.isSmall())
return *getPointer() == *RHS.getPointer();
else {
- for (size_t i = 0, e = size(); i != e; ++i) {
- if ((*this)[i] != RHS[i])
+ for (size_type I = 0, E = size(); I != E; ++I) {
+ if ((*this)[I] != RHS[I])
return false;
}
return true;
@@ -512,11 +514,11 @@ public:
else if (!isSmall() && !RHS.isSmall())
getPointer()->operator&=(*RHS.getPointer());
else {
- size_t i, e;
- for (i = 0, e = std::min(size(), RHS.size()); i != e; ++i)
- (*this)[i] = test(i) && RHS.test(i);
- for (e = size(); i != e; ++i)
- reset(i);
+ size_type I, E;
+ for (I = 0, E = std::min(size(), RHS.size()); I != E; ++I)
+ (*this)[I] = test(I) && RHS.test(I);
+ for (E = size(); I != E; ++I)
+ reset(I);
}
return *this;
}
@@ -561,8 +563,8 @@ public:
else if (!isSmall() && !RHS.isSmall())
getPointer()->operator|=(*RHS.getPointer());
else {
- for (size_t i = 0, e = RHS.size(); i != e; ++i)
- (*this)[i] = test(i) || RHS.test(i);
+ for (size_type I = 0, E = RHS.size(); I != E; ++I)
+ (*this)[I] = test(I) || RHS.test(I);
}
return *this;
}
@@ -574,8 +576,8 @@ public:
else if (!isSmall() && !RHS.isSmall())
getPointer()->operator^=(*RHS.getPointer());
else {
- for (size_t i = 0, e = RHS.size(); i != e; ++i)
- (*this)[i] = test(i) != RHS.test(i);
+ for (size_type I = 0, E = RHS.size(); I != E; ++I)
+ (*this)[I] = test(I) != RHS.test(I);
}
return *this;
}
@@ -721,8 +723,9 @@ template <> struct DenseMapInfo<SmallBitVector> {
}
static unsigned getHashValue(const SmallBitVector &V) {
uintptr_t Store;
- return DenseMapInfo<std::pair<unsigned, ArrayRef<uintptr_t>>>::getHashValue(
- std::make_pair(V.size(), V.getData(Store)));
+ return DenseMapInfo<
+ std::pair<SmallBitVector::size_type, ArrayRef<uintptr_t>>>::
+ getHashValue(std::make_pair(V.size(), V.getData(Store)));
}
static bool isEqual(const SmallBitVector &LHS, const SmallBitVector &RHS) {
if (LHS.isInvalid() || RHS.isInvalid())
diff --git a/llvm/include/llvm/ADT/SmallVector.h b/llvm/include/llvm/ADT/SmallVector.h
index b8a11030fc33..0d13524f25ce 100644
--- a/llvm/include/llvm/ADT/SmallVector.h
+++ b/llvm/include/llvm/ADT/SmallVector.h
@@ -1239,13 +1239,22 @@ inline size_t capacity_in_bytes(const SmallVector<T, N> &X) {
return X.capacity_in_bytes();
}
+template <typename RangeType>
+using ValueTypeFromRangeType =
+ typename std::remove_const<typename std::remove_reference<
+ decltype(*std::begin(std::declval<RangeType &>()))>::type>::type;
+
/// Given a range of type R, iterate the entire range and return a
/// SmallVector with elements of the vector. This is useful, for example,
/// when you want to iterate a range and then sort the results.
template <unsigned Size, typename R>
-SmallVector<typename std::remove_const<typename std::remove_reference<
- decltype(*std::begin(std::declval<R &>()))>::type>::type,
- Size>
+SmallVector<ValueTypeFromRangeType<R>, Size> to_vector(R &&Range) {
+ return {std::begin(Range), std::end(Range)};
+}
+template <typename R>
+SmallVector<ValueTypeFromRangeType<R>,
+ CalculateSmallVectorDefaultInlinedElements<
+ ValueTypeFromRangeType<R>>::value>
to_vector(R &&Range) {
return {std::begin(Range), std::end(Range)};
}
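A short sketch of the two to_vector forms after this change; the array and the llvm::sort calls are illustrative only.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

void sortedCopies() {
  int Values[] = {3, 1, 2};
  // New overload: inline capacity picked from the element type.
  llvm::SmallVector<int> A = llvm::to_vector(Values);
  // Existing overload: inline capacity requested explicitly.
  llvm::SmallVector<int, 8> B = llvm::to_vector<8>(Values);
  llvm::sort(A);
  llvm::sort(B);
}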
diff --git a/llvm/include/llvm/ADT/StringExtras.h b/llvm/include/llvm/ADT/StringExtras.h
index 6bda25b85313..2ca672e7855b 100644
--- a/llvm/include/llvm/ADT/StringExtras.h
+++ b/llvm/include/llvm/ADT/StringExtras.h
@@ -67,22 +67,27 @@ inline ArrayRef<uint8_t> arrayRefFromStringRef(StringRef Input) {
///
/// If \p C is not a valid hex digit, -1U is returned.
inline unsigned hexDigitValue(char C) {
- struct HexTable {
- unsigned LUT[255] = {};
- constexpr HexTable() {
- // Default initialize everything to invalid.
- for (int i = 0; i < 255; ++i)
- LUT[i] = ~0U;
- // Initialize `0`-`9`.
- for (int i = 0; i < 10; ++i)
- LUT['0' + i] = i;
- // Initialize `A`-`F` and `a`-`f`.
- for (int i = 0; i < 6; ++i)
- LUT['A' + i] = LUT['a' + i] = 10 + i;
- }
+ /* clang-format off */
+ static const int16_t LUT[256] = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, // '0'..'9'
+ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 'A'..'F'
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 'a'..'f'
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};
- constexpr HexTable Table;
- return Table.LUT[static_cast<unsigned char>(C)];
+ /* clang-format on */
+ return LUT[static_cast<unsigned char>(C)];
}
/// Checks if character \p C is one of the 10 decimal digits.
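A quick sketch of the hexDigitValue contract that the lookup-table rewrite above preserves; the asserts are illustrative, not part of the patch.

#include "llvm/ADT/StringExtras.h"
#include <cassert>

void checkHexDigits() {
  assert(llvm::hexDigitValue('7') == 7);
  assert(llvm::hexDigitValue('a') == 10 && llvm::hexDigitValue('F') == 15);
  assert(llvm::hexDigitValue('g') == -1U); // non-hex characters map to -1U
}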
@@ -210,24 +215,31 @@ inline bool tryGetFromHex(StringRef Input, std::string &Output) {
if (Input.empty())
return true;
- Output.reserve((Input.size() + 1) / 2);
+ // If the input string is not properly aligned on 2 nibbles we pad out the
+ // front with a 0 prefix; e.g. `ABC` -> `0ABC`.
+ Output.resize((Input.size() + 1) / 2);
+ char *OutputPtr = const_cast<char *>(Output.data());
if (Input.size() % 2 == 1) {
uint8_t Hex = 0;
if (!tryGetHexFromNibbles('0', Input.front(), Hex))
return false;
-
- Output.push_back(Hex);
+ *OutputPtr++ = Hex;
Input = Input.drop_front();
}
- assert(Input.size() % 2 == 0);
- while (!Input.empty()) {
+ // Convert the nibble pairs (e.g. `9C`) into bytes (0x9C).
+ // With the padding above we know the input is aligned and the output expects
+ // exactly half as many bytes as nibbles in the input.
+ size_t InputSize = Input.size();
+ assert(InputSize % 2 == 0);
+ const char *InputPtr = Input.data();
+ for (size_t OutputIndex = 0; OutputIndex < InputSize / 2; ++OutputIndex) {
uint8_t Hex = 0;
- if (!tryGetHexFromNibbles(Input[0], Input[1], Hex))
+ if (!tryGetHexFromNibbles(InputPtr[OutputIndex * 2 + 0], // MSB
+ InputPtr[OutputIndex * 2 + 1], // LSB
+ Hex))
return false;
-
- Output.push_back(Hex);
- Input = Input.drop_front(2);
+ OutputPtr[OutputIndex] = Hex;
}
return true;
}
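A minimal sketch of the odd-length padding behaviour described in the comments above: a leading 0 nibble is implied, so "ABC" decodes as "0ABC" (values invented).

#include "llvm/ADT/StringExtras.h"
#include <cassert>
#include <cstdint>
#include <string>

void decodeHex() {
  std::string Bytes;
  bool OK = llvm::tryGetFromHex("ABC", Bytes);
  (void)OK;
  assert(OK && Bytes.size() == 2);
  assert(static_cast<uint8_t>(Bytes[0]) == 0x0A);
  assert(static_cast<uint8_t>(Bytes[1]) == 0xBC);
}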
@@ -501,6 +513,83 @@ public:
}
};
+/// A forward iterator over partitions of string over a separator.
+class SplittingIterator
+ : public iterator_facade_base<SplittingIterator, std::forward_iterator_tag,
+ StringRef> {
+ char SeparatorStorage;
+ StringRef Current;
+ StringRef Next;
+ StringRef Separator;
+
+public:
+ SplittingIterator(StringRef Str, StringRef Separator)
+ : Next(Str), Separator(Separator) {
+ ++*this;
+ }
+
+ SplittingIterator(StringRef Str, char Separator)
+ : SeparatorStorage(Separator), Next(Str),
+ Separator(&SeparatorStorage, 1) {
+ ++*this;
+ }
+
+ SplittingIterator(const SplittingIterator &R)
+ : SeparatorStorage(R.SeparatorStorage), Current(R.Current), Next(R.Next),
+ Separator(R.Separator) {
+ if (R.Separator.data() == &R.SeparatorStorage)
+ Separator = StringRef(&SeparatorStorage, 1);
+ }
+
+ SplittingIterator &operator=(const SplittingIterator &R) {
+ if (this == &R)
+ return *this;
+
+ SeparatorStorage = R.SeparatorStorage;
+ Current = R.Current;
+ Next = R.Next;
+ Separator = R.Separator;
+ if (R.Separator.data() == &R.SeparatorStorage)
+ Separator = StringRef(&SeparatorStorage, 1);
+ return *this;
+ }
+
+ bool operator==(const SplittingIterator &R) const {
+ assert(Separator == R.Separator);
+ return Current.data() == R.Current.data();
+ }
+
+ const StringRef &operator*() const { return Current; }
+
+ StringRef &operator*() { return Current; }
+
+ SplittingIterator &operator++() {
+ std::tie(Current, Next) = Next.split(Separator);
+ return *this;
+ }
+};
+
+/// Split the specified string over a separator and return a range-compatible
+/// iterable over its partitions. Used to permit conveniently iterating
+/// over separated strings like so:
+///
+/// \code
+/// for (StringRef x : llvm::split("foo,bar,baz", ","))
+/// ...;
+/// \endcode
+///
+/// Note that the passed string must remain valid throughout the lifetime
+/// of the iterators.
+inline iterator_range<SplittingIterator> split(StringRef Str,
+                                               StringRef Separator) {
+ return {SplittingIterator(Str, Separator),
+ SplittingIterator(StringRef(), Separator)};
+}
+
+inline iterator_range<SplittingIterator> split(StringRef Str, char Separator) {
+ return {SplittingIterator(Str, Separator),
+ SplittingIterator(StringRef(), Separator)};
+}
+
} // end namespace llvm
#endif // LLVM_ADT_STRINGEXTRAS_H
diff --git a/llvm/include/llvm/ADT/StringMap.h b/llvm/include/llvm/ADT/StringMap.h
index a82afc9a817c..669956d41e0c 100644
--- a/llvm/include/llvm/ADT/StringMap.h
+++ b/llvm/include/llvm/ADT/StringMap.h
@@ -126,9 +126,7 @@ public:
StringMap(std::initializer_list<std::pair<StringRef, ValueTy>> List)
: StringMapImpl(List.size(), static_cast<unsigned>(sizeof(MapEntryTy))) {
- for (const auto &P : List) {
- insert(P);
- }
+ insert(List);
}
StringMap(StringMap &&RHS)
@@ -297,6 +295,21 @@ public:
return try_emplace(KV.first, std::move(KV.second));
}
+ /// Inserts elements from the range [First, Last). If multiple elements in
+ /// the range have keys that compare equivalent, it is unspecified which
+ /// element is inserted.
+ template <typename InputIt> void insert(InputIt First, InputIt Last) {
+ for (InputIt It = First; It != Last; ++It)
+ insert(*It);
+ }
+
+ /// Inserts elements from the initializer list \p List. If multiple elements
+ /// in the range have keys that compare equivalent, it is unspecified which
+ /// element is inserted.
+ void insert(std::initializer_list<std::pair<StringRef, ValueTy>> List) {
+ insert(List.begin(), List.end());
+ }
+
/// Inserts an element or assigns to the current element if the key already
/// exists. The return type is the same as try_emplace.
template <typename V>
@@ -465,13 +478,7 @@ public:
explicit StringMapKeyIterator(StringMapConstIterator<ValueTy> Iter)
: base(std::move(Iter)) {}
- StringRef &operator*() {
- Key = this->wrapped()->getKey();
- return Key;
- }
-
-private:
- StringRef Key;
+ StringRef operator*() const { return this->wrapped()->getKey(); }
};
} // end namespace llvm
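A sketch of the bulk-insert overloads added above; the keys and values are made up for illustration.

#include "llvm/ADT/StringMap.h"
#include <utility>
#include <vector>

void populateCounts() {
  // The initializer_list constructor now funnels through insert(List).
  llvm::StringMap<int> Counts = {{"foo", 1}, {"bar", 2}};
  // Range insert.
  std::vector<std::pair<llvm::StringRef, int>> More = {{"baz", 3}, {"qux", 4}};
  Counts.insert(More.begin(), More.end());
  // Initializer-list insert.
  Counts.insert({{"quux", 5}, {"corge", 6}});
}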
diff --git a/llvm/include/llvm/ADT/StringRef.h b/llvm/include/llvm/ADT/StringRef.h
index 17e64f7f81bb..9f4b89218042 100644
--- a/llvm/include/llvm/ADT/StringRef.h
+++ b/llvm/include/llvm/ADT/StringRef.h
@@ -35,7 +35,6 @@ namespace llvm {
class APInt;
class hash_code;
template <typename T> class SmallVectorImpl;
- template <typename T> struct DenseMapInfo;
class StringRef;
/// Helper functions for StringRef::getAsInteger.
@@ -949,7 +948,7 @@ namespace llvm {
hash_code hash_value(StringRef S);
// Provide DenseMapInfo for StringRefs.
- template <> struct DenseMapInfo<StringRef> {
+ template <> struct DenseMapInfo<StringRef, void> {
static inline StringRef getEmptyKey() {
return StringRef(
reinterpret_cast<const char *>(~static_cast<uintptr_t>(0)), 0);
diff --git a/llvm/include/llvm/ADT/Triple.h b/llvm/include/llvm/ADT/Triple.h
index 76f3514050f0..2fd3047acbfd 100644
--- a/llvm/include/llvm/ADT/Triple.h
+++ b/llvm/include/llvm/ADT/Triple.h
@@ -93,6 +93,8 @@ public:
hsail64, // AMD HSAIL with 64-bit pointers
spir, // SPIR: standard portable IR for OpenCL 32-bit version
spir64, // SPIR: standard portable IR for OpenCL 64-bit version
+ spirv32, // SPIR-V with 32-bit pointers
+ spirv64, // SPIR-V with 64-bit pointers
kalimba, // Kalimba: generic kalimba
shave, // SHAVE: Movidius vector VLIW processors
lanai, // Lanai: Lanai 32-bit
@@ -106,6 +108,9 @@ public:
enum SubArchType {
NoSubArch,
+ ARMSubArch_v9_2a,
+ ARMSubArch_v9_1a,
+ ARMSubArch_v9,
ARMSubArch_v8_7a,
ARMSubArch_v8_6a,
ARMSubArch_v8_5a,
@@ -290,10 +295,10 @@ public:
/// @name Normalization
/// @{
- /// normalize - Turn an arbitrary machine specification into the canonical
- /// triple form (or something sensible that the Triple class understands if
- /// nothing better can reasonably be done). In particular, it handles the
- /// common case in which otherwise valid components are in the wrong order.
+ /// Turn an arbitrary machine specification into the canonical triple form (or
+ /// something sensible that the Triple class understands if nothing better can
+ /// reasonably be done). In particular, it handles the common case in which
+ /// otherwise valid components are in the wrong order.
static std::string normalize(StringRef Str);
/// Return the normalized form of this triple's string.
@@ -303,25 +308,24 @@ public:
/// @name Typed Component Access
/// @{
- /// getArch - Get the parsed architecture type of this triple.
+ /// Get the parsed architecture type of this triple.
ArchType getArch() const { return Arch; }
- /// getSubArch - get the parsed subarchitecture type for this triple.
+ /// Get the parsed subarchitecture type for this triple.
SubArchType getSubArch() const { return SubArch; }
- /// getVendor - Get the parsed vendor type of this triple.
+ /// Get the parsed vendor type of this triple.
VendorType getVendor() const { return Vendor; }
- /// getOS - Get the parsed operating system type of this triple.
+ /// Get the parsed operating system type of this triple.
OSType getOS() const { return OS; }
- /// hasEnvironment - Does this triple have the optional environment
- /// (fourth) component?
+ /// Does this triple have the optional environment (fourth) component?
bool hasEnvironment() const {
return getEnvironmentName() != "";
}
- /// getEnvironment - Get the parsed environment type of this triple.
+ /// Get the parsed environment type of this triple.
EnvironmentType getEnvironment() const { return Environment; }
/// Parse the version number from the OS name component of the
@@ -333,39 +337,39 @@ public:
void getEnvironmentVersion(unsigned &Major, unsigned &Minor,
unsigned &Micro) const;
- /// getFormat - Get the object format for this triple.
+ /// Get the object format for this triple.
ObjectFormatType getObjectFormat() const { return ObjectFormat; }
- /// getOSVersion - Parse the version number from the OS name component of the
- /// triple, if present.
+ /// Parse the version number from the OS name component of the triple, if
+ /// present.
///
/// For example, "fooos1.2.3" would return (1, 2, 3).
///
/// If an entry is not defined, it will be returned as 0.
void getOSVersion(unsigned &Major, unsigned &Minor, unsigned &Micro) const;
- /// getOSMajorVersion - Return just the major version number, this is
- /// specialized because it is a common query.
+ /// Return just the major version number; this is specialized because it is
+ /// a common query.
unsigned getOSMajorVersion() const {
unsigned Maj, Min, Micro;
getOSVersion(Maj, Min, Micro);
return Maj;
}
- /// getMacOSXVersion - Parse the version number as with getOSVersion and then
- /// translate generic "darwin" versions to the corresponding OS X versions.
- /// This may also be called with IOS triples but the OS X version number is
- /// just set to a constant 10.4.0 in that case. Returns true if successful.
+ /// Parse the version number as with getOSVersion and then translate generic
+ /// "darwin" versions to the corresponding OS X versions. This may also be
+ /// called with IOS triples but the OS X version number is just set to a
+ /// constant 10.4.0 in that case. Returns true if successful.
bool getMacOSXVersion(unsigned &Major, unsigned &Minor,
unsigned &Micro) const;
- /// getiOSVersion - Parse the version number as with getOSVersion. This should
- /// only be called with IOS or generic triples.
+ /// Parse the version number as with getOSVersion. This should only be called
+ /// with IOS or generic triples.
void getiOSVersion(unsigned &Major, unsigned &Minor,
unsigned &Micro) const;
- /// getWatchOSVersion - Parse the version number as with getOSVersion. This
- /// should only be called with WatchOS or generic triples.
+ /// Parse the version number as with getOSVersion. This should only be called
+ /// with WatchOS or generic triples.
void getWatchOSVersion(unsigned &Major, unsigned &Minor,
unsigned &Micro) const;
@@ -377,24 +381,24 @@ public:
const std::string &getTriple() const { return Data; }
- /// getArchName - Get the architecture (first) component of the
- /// triple.
+ /// Get the architecture (first) component of the triple.
StringRef getArchName() const;
- /// getVendorName - Get the vendor (second) component of the triple.
+ /// Get the architecture name based on Kind and SubArch.
+ StringRef getArchName(ArchType Kind, SubArchType SubArch = NoSubArch) const;
+
+ /// Get the vendor (second) component of the triple.
StringRef getVendorName() const;
- /// getOSName - Get the operating system (third) component of the
- /// triple.
+ /// Get the operating system (third) component of the triple.
StringRef getOSName() const;
- /// getEnvironmentName - Get the optional environment (fourth)
- /// component of the triple, or "" if empty.
+ /// Get the optional environment (fourth) component of the triple, or "" if
+ /// empty.
StringRef getEnvironmentName() const;
- /// getOSAndEnvironmentName - Get the operating system and optional
- /// environment components as a single string (separated by a '-'
- /// if the environment component is present).
+ /// Get the operating system and optional environment components as a single
+ /// string (separated by a '-' if the environment component is present).
StringRef getOSAndEnvironmentName() const;
/// @}
@@ -420,8 +424,8 @@ public:
/// Note that this tests for 16-bit pointer width, and nothing else.
bool isArch16Bit() const;
- /// isOSVersionLT - Helper function for doing comparisons against version
- /// numbers included in the target triple.
+ /// Helper function for doing comparisons against version numbers included in
+ /// the target triple.
bool isOSVersionLT(unsigned Major, unsigned Minor = 0,
unsigned Micro = 0) const {
unsigned LHS[3];
@@ -443,14 +447,13 @@ public:
return isOSVersionLT(RHS[0], RHS[1], RHS[2]);
}
- /// isMacOSXVersionLT - Comparison function for checking OS X version
- /// compatibility, which handles supporting skewed version numbering schemes
- /// used by the "darwin" triples.
+ /// Comparison function for checking OS X version compatibility, which handles
+ /// the skewed version numbering schemes used by the "darwin" triples.
bool isMacOSXVersionLT(unsigned Major, unsigned Minor = 0,
unsigned Micro = 0) const;
- /// isMacOSX - Is this a Mac OS X triple. For legacy reasons, we support both
- /// "darwin" and "osx" as OS X triples.
+ /// Is this a Mac OS X triple. For legacy reasons, we support both "darwin"
+ /// and "osx" as OS X triples.
bool isMacOSX() const {
return getOS() == Triple::Darwin || getOS() == Triple::MacOSX;
}
@@ -480,7 +483,7 @@ public:
bool isOSzOS() const { return getOS() == Triple::ZOS; }
- /// isOSDarwin - Is this a "Darwin" OS (macOS, iOS, tvOS or watchOS).
+ /// Is this a "Darwin" OS (macOS, iOS, tvOS or watchOS).
bool isOSDarwin() const {
return isMacOSX() || isiOS() || isWatchOS();
}
@@ -698,6 +701,11 @@ public:
return getArch() == Triple::spir || getArch() == Triple::spir64;
}
+ /// Tests whether the target is SPIR-V (32/64-bit).
+ bool isSPIRV() const {
+ return getArch() == Triple::spirv32 || getArch() == Triple::spirv64;
+ }
+
/// Tests whether the target is NVPTX (32- or 64-bit).
bool isNVPTX() const {
return getArch() == Triple::nvptx || getArch() == Triple::nvptx64;
@@ -720,6 +728,19 @@ public:
return getArch() == Triple::arm || getArch() == Triple::armeb;
}
+ /// Tests whether the target supports the EHABI exception
+ /// handling standard.
+ bool isTargetEHABICompatible() const {
+ return (isARM() || isThumb()) &&
+ (getEnvironment() == Triple::EABI ||
+ getEnvironment() == Triple::GNUEABI ||
+ getEnvironment() == Triple::MuslEABI ||
+ getEnvironment() == Triple::EABIHF ||
+ getEnvironment() == Triple::GNUEABIHF ||
+ getEnvironment() == Triple::MuslEABIHF || isAndroid()) &&
+ isOSBinFormatELF();
+ }
+
/// Tests whether the target is AArch64 (little and big endian).
bool isAArch64() const {
return getArch() == Triple::aarch64 || getArch() == Triple::aarch64_be ||
@@ -833,46 +854,38 @@ public:
/// @name Mutators
/// @{
- /// setArch - Set the architecture (first) component of the triple
- /// to a known type.
- void setArch(ArchType Kind);
+ /// Set the architecture (first) component of the triple to a known type.
+ void setArch(ArchType Kind, SubArchType SubArch = NoSubArch);
- /// setVendor - Set the vendor (second) component of the triple to a
- /// known type.
+ /// Set the vendor (second) component of the triple to a known type.
void setVendor(VendorType Kind);
- /// setOS - Set the operating system (third) component of the triple
- /// to a known type.
+ /// Set the operating system (third) component of the triple to a known type.
void setOS(OSType Kind);
- /// setEnvironment - Set the environment (fourth) component of the triple
- /// to a known type.
+ /// Set the environment (fourth) component of the triple to a known type.
void setEnvironment(EnvironmentType Kind);
- /// setObjectFormat - Set the object file format
+ /// Set the object file format.
void setObjectFormat(ObjectFormatType Kind);
- /// setTriple - Set all components to the new triple \p Str.
+ /// Set all components to the new triple \p Str.
void setTriple(const Twine &Str);
- /// setArchName - Set the architecture (first) component of the
- /// triple by name.
+ /// Set the architecture (first) component of the triple by name.
void setArchName(StringRef Str);
- /// setVendorName - Set the vendor (second) component of the triple
- /// by name.
+ /// Set the vendor (second) component of the triple by name.
void setVendorName(StringRef Str);
- /// setOSName - Set the operating system (third) component of the
- /// triple by name.
+ /// Set the operating system (third) component of the triple by name.
void setOSName(StringRef Str);
- /// setEnvironmentName - Set the optional environment (fourth)
- /// component of the triple by name.
+ /// Set the optional environment (fourth) component of the triple by name.
void setEnvironmentName(StringRef Str);
- /// setOSAndEnvironmentName - Set the operating system and optional
- /// environment components with a single string.
+ /// Set the operating system and optional environment components with a single
+ /// string.
void setOSAndEnvironmentName(StringRef Str);
/// @}
@@ -938,33 +951,30 @@ public:
/// @name Static helpers for IDs.
/// @{
- /// getArchTypeName - Get the canonical name for the \p Kind architecture.
+ /// Get the canonical name for the \p Kind architecture.
static StringRef getArchTypeName(ArchType Kind);
- /// getArchTypePrefix - Get the "prefix" canonical name for the \p Kind
- /// architecture. This is the prefix used by the architecture specific
- /// builtins, and is suitable for passing to \see
- /// Intrinsic::getIntrinsicForGCCBuiltin().
+ /// Get the "prefix" canonical name for the \p Kind architecture. This is the
+ /// prefix used by the architecture specific builtins, and is suitable for
+ /// passing to \see Intrinsic::getIntrinsicForGCCBuiltin().
///
/// \return - The architecture prefix, or 0 if none is defined.
static StringRef getArchTypePrefix(ArchType Kind);
- /// getVendorTypeName - Get the canonical name for the \p Kind vendor.
+ /// Get the canonical name for the \p Kind vendor.
static StringRef getVendorTypeName(VendorType Kind);
- /// getOSTypeName - Get the canonical name for the \p Kind operating system.
+ /// Get the canonical name for the \p Kind operating system.
static StringRef getOSTypeName(OSType Kind);
- /// getEnvironmentTypeName - Get the canonical name for the \p Kind
- /// environment.
+ /// Get the canonical name for the \p Kind environment.
static StringRef getEnvironmentTypeName(EnvironmentType Kind);
/// @}
/// @name Static helpers for converting alternate architecture names.
/// @{
- /// getArchTypeForLLVMName - The canonical type for the given LLVM
- /// architecture name (e.g., "x86").
+ /// The canonical type for the given LLVM architecture name (e.g., "x86").
static ArchType getArchTypeForLLVMName(StringRef Str);
/// @}
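A hedged sketch combining two of the Triple additions above: the spirv32/spirv64 architectures with isSPIRV(), and setArch() accepting an optional SubArchType. The helper function is invented and no particular triple string round-trip is asserted.

#include "llvm/ADT/Triple.h"

bool retargetToSPIRV(llvm::Triple &T) {
  // setArch now also takes a sub-architecture (defaults to NoSubArch).
  T.setArch(llvm::Triple::spirv64, llvm::Triple::NoSubArch);
  // New predicate covering both spirv32 and spirv64.
  return T.isSPIRV();
}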
diff --git a/llvm/include/llvm/ADT/TypeSwitch.h b/llvm/include/llvm/ADT/TypeSwitch.h
index 815b9a40afaf..3b7598f3251d 100644
--- a/llvm/include/llvm/ADT/TypeSwitch.h
+++ b/llvm/include/llvm/ADT/TypeSwitch.h
@@ -35,7 +35,12 @@ public:
/// Invoke a case on the derived class with multiple case types.
template <typename CaseT, typename CaseT2, typename... CaseTs,
typename CallableT>
- DerivedT &Case(CallableT &&caseFn) {
+ // This is marked always_inline and nodebug so it doesn't show up in stack
+ // traces at -O0 (or other optimization levels). Large TypeSwitches are
+ // common, are equivalent to a switch, and don't add any value to stack
+ // traces.
+ LLVM_ATTRIBUTE_ALWAYS_INLINE LLVM_ATTRIBUTE_NODEBUG DerivedT &
+ Case(CallableT &&caseFn) {
DerivedT &derived = static_cast<DerivedT &>(*this);
return derived.template Case<CaseT>(caseFn)
.template Case<CaseT2, CaseTs...>(caseFn);
diff --git a/llvm/include/llvm/ADT/iterator.h b/llvm/include/llvm/ADT/iterator.h
index b3c6608e9b6e..6f0c42fe08be 100644
--- a/llvm/include/llvm/ADT/iterator.h
+++ b/llvm/include/llvm/ADT/iterator.h
@@ -35,6 +35,21 @@ namespace llvm {
/// terms of addition of one. These aren't equivalent for all iterator
/// categories, and respecting that adds a lot of complexity for little gain.
///
+/// Iterators are expected to have const rules analogous to pointers, with a
+/// single, const-qualified operator*() that returns ReferenceT. This matches
+/// the second and third pointers in the following example:
+/// \code
+/// int Value;
+/// { int *I = &Value; } // ReferenceT 'int&'
+/// { int *const I = &Value; } // ReferenceT 'int&'; const
+/// { const int *I = &Value; } // ReferenceT 'const int&'
+/// { const int *const I = &Value; } // ReferenceT 'const int&'; const
+/// \endcode
+/// If an iterator facade returns a handle to its own state, then T (and
+/// PointerT and ReferenceT) should usually be const-qualified. Otherwise, if
+/// clients are expected to modify the handle itself, the field can be declared
+/// mutable or use const_cast.
+///
/// Classes wishing to use `iterator_facade_base` should implement the following
/// methods:
///
@@ -42,8 +57,7 @@ namespace llvm {
/// (All of the following methods)
/// - DerivedT &operator=(const DerivedT &R);
/// - bool operator==(const DerivedT &R) const;
-/// - const T &operator*() const;
-/// - T &operator*();
+/// - T &operator*() const;
/// - DerivedT &operator++();
///
/// Bidirectional Iterators:
@@ -95,6 +109,22 @@ protected:
operator ReferenceT() const { return *I; }
};
+ /// A proxy object for computing a pointer via indirecting a copy of a
+ /// reference. This is used in APIs which need to produce a pointer but for
+ /// which the reference might be a temporary. The proxy preserves the
+ /// reference internally and exposes the pointer via the arrow operator.
+ class PointerProxy {
+ friend iterator_facade_base;
+
+ ReferenceT R;
+
+ template <typename RefT>
+ PointerProxy(RefT &&R) : R(std::forward<RefT>(R)) {}
+
+ public:
+ PointerT operator->() const { return &R; }
+ };
+
public:
DerivedT operator+(DifferenceTypeT n) const {
static_assert(std::is_base_of<iterator_facade_base, DerivedT>::value,
@@ -172,19 +202,13 @@ public:
return !(static_cast<const DerivedT &>(*this) < RHS);
}
- PointerT operator->() { return &static_cast<DerivedT *>(this)->operator*(); }
- PointerT operator->() const {
- return &static_cast<const DerivedT *>(this)->operator*();
- }
- ReferenceProxy operator[](DifferenceTypeT n) {
- static_assert(IsRandomAccess,
- "Subscripting is only defined for random access iterators.");
- return ReferenceProxy(static_cast<DerivedT *>(this)->operator+(n));
+ PointerProxy operator->() const {
+ return static_cast<const DerivedT *>(this)->operator*();
}
ReferenceProxy operator[](DifferenceTypeT n) const {
static_assert(IsRandomAccess,
"Subscripting is only defined for random access iterators.");
- return ReferenceProxy(static_cast<const DerivedT *>(this)->operator+(n));
+ return static_cast<const DerivedT *>(this)->operator+(n);
}
};
@@ -330,8 +354,7 @@ public:
explicit pointer_iterator(WrappedIteratorT u)
: pointer_iterator::iterator_adaptor_base(std::move(u)) {}
- T &operator*() { return Ptr = &*this->I; }
- const T &operator*() const { return Ptr = &*this->I; }
+ T &operator*() const { return Ptr = &*this->I; }
};
template <typename RangeT, typename WrappedIteratorT =
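A minimal iterator written against the revised convention documented above (a single const-qualified operator*); the counting iterator itself is invented for illustration.

#include "llvm/ADT/iterator.h"
#include <iterator>

class CountingIterator
    : public llvm::iterator_facade_base<CountingIterator,
                                        std::forward_iterator_tag, const int> {
  int Value = 0;

public:
  CountingIterator() = default;
  explicit CountingIterator(int V) : Value(V) {}
  bool operator==(const CountingIterator &R) const { return Value == R.Value; }
  // The iterator hands out a reference to its own state, so T is
  // const-qualified and the single operator*() is const.
  const int &operator*() const { return Value; }
  CountingIterator &operator++() {
    ++Value;
    return *this;
  }
};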
diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h
index 7fec0feb09d5..2770a1a9b277 100644
--- a/llvm/include/llvm/Analysis/AliasAnalysis.h
+++ b/llvm/include/llvm/Analysis/AliasAnalysis.h
@@ -61,6 +61,7 @@ class DominatorTree;
class FenceInst;
class Function;
class InvokeInst;
+class LoopInfo;
class PreservedAnalyses;
class TargetLibraryInfo;
class Value;
@@ -378,6 +379,50 @@ createModRefInfo(const FunctionModRefBehavior FMRB) {
return ModRefInfo(FMRB & static_cast<int>(ModRefInfo::ModRef));
}
+/// Virtual base class for providers of capture information.
+struct CaptureInfo {
+ virtual ~CaptureInfo() = 0;
+ virtual bool isNotCapturedBeforeOrAt(const Value *Object,
+ const Instruction *I) = 0;
+};
+
+/// Context-free CaptureInfo provider, which computes and caches whether an
+/// object is captured in the function at all, but does not distinguish whether
+/// it was captured before or after the context instruction.
+class SimpleCaptureInfo final : public CaptureInfo {
+ SmallDenseMap<const Value *, bool, 8> IsCapturedCache;
+
+public:
+ bool isNotCapturedBeforeOrAt(const Value *Object,
+ const Instruction *I) override;
+};
+
+/// Context-sensitive CaptureInfo provider, which computes and caches the
+/// earliest common dominator closure of all captures. It provides a good
+/// approximation to a precise "captures before" analysis.
+class EarliestEscapeInfo final : public CaptureInfo {
+ DominatorTree &DT;
+ const LoopInfo &LI;
+
+ /// Map from identified local object to an instruction before which it does
+ /// not escape, or nullptr if it never escapes. The "earliest" instruction
+ /// may be a conservative approximation, e.g. the first instruction in the
+ /// function is always a legal choice.
+ DenseMap<const Value *, Instruction *> EarliestEscapes;
+
+ /// Reverse map from instruction to the objects it is the earliest escape for.
+ /// This is used for cache invalidation purposes.
+ DenseMap<Instruction *, TinyPtrVector<const Value *>> Inst2Obj;
+
+public:
+ EarliestEscapeInfo(DominatorTree &DT, const LoopInfo &LI) : DT(DT), LI(LI) {}
+
+ bool isNotCapturedBeforeOrAt(const Value *Object,
+ const Instruction *I) override;
+
+ void removeInstruction(Instruction *I);
+};
+
/// Reduced version of MemoryLocation that only stores a pointer and size.
/// Used for caching AATags independent BasicAA results.
struct AACacheLoc {
@@ -425,8 +470,7 @@ public:
using AliasCacheT = SmallDenseMap<LocPair, CacheEntry, 8>;
AliasCacheT AliasCache;
- using IsCapturedCacheT = SmallDenseMap<const Value *, bool, 8>;
- IsCapturedCacheT IsCapturedCache;
+ CaptureInfo *CI;
/// Query depth used to distinguish recursive queries.
unsigned Depth = 0;
@@ -439,18 +483,26 @@ public:
/// assumption is disproven.
SmallVector<AAQueryInfo::LocPair, 4> AssumptionBasedResults;
- AAQueryInfo() : AliasCache(), IsCapturedCache() {}
+ AAQueryInfo(CaptureInfo *CI) : CI(CI) {}
/// Create a new AAQueryInfo based on this one, but with the cache cleared.
/// This is used for recursive queries across phis, where cache results may
/// not be valid.
AAQueryInfo withEmptyCache() {
- AAQueryInfo NewAAQI;
+ AAQueryInfo NewAAQI(CI);
NewAAQI.Depth = Depth;
return NewAAQI;
}
};
+/// AAQueryInfo that uses SimpleCaptureInfo.
+class SimpleAAQueryInfo : public AAQueryInfo {
+ SimpleCaptureInfo CI;
+
+public:
+ SimpleAAQueryInfo() : AAQueryInfo(&CI) {}
+};
+
class BatchAAResults;
class AAResults {
@@ -770,7 +822,7 @@ public:
/// helpers above.
ModRefInfo getModRefInfo(const Instruction *I,
const Optional<MemoryLocation> &OptLoc) {
- AAQueryInfo AAQIP;
+ SimpleAAQueryInfo AAQIP;
return getModRefInfo(I, OptLoc, AAQIP);
}
@@ -797,7 +849,7 @@ public:
ModRefInfo callCapturesBefore(const Instruction *I,
const MemoryLocation &MemLoc,
DominatorTree *DT) {
- AAQueryInfo AAQIP;
+ SimpleAAQueryInfo AAQIP;
return callCapturesBefore(I, MemLoc, DT, AAQIP);
}
@@ -896,9 +948,12 @@ private:
class BatchAAResults {
AAResults &AA;
AAQueryInfo AAQI;
+ SimpleCaptureInfo SimpleCI;
public:
- BatchAAResults(AAResults &AAR) : AA(AAR), AAQI() {}
+ BatchAAResults(AAResults &AAR) : AA(AAR), AAQI(&SimpleCI) {}
+ BatchAAResults(AAResults &AAR, CaptureInfo *CI) : AA(AAR), AAQI(CI) {}
+
AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB) {
return AA.alias(LocA, LocB, AAQI);
}
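A rough sketch of wiring the new CaptureInfo providers into batched alias queries; the analyses and memory locations are assumed to come from the surrounding pass.

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"

void batchedQueries(llvm::AAResults &AA, llvm::DominatorTree &DT,
                    const llvm::LoopInfo &LI, const llvm::MemoryLocation &LocA,
                    const llvm::MemoryLocation &LocB) {
  // Default behaviour: context-free capture information (SimpleCaptureInfo).
  llvm::BatchAAResults Simple(AA);
  (void)Simple.alias(LocA, LocB);

  // Context-sensitive "captures before" approximation.
  llvm::EarliestEscapeInfo EI(DT, LI);
  llvm::BatchAAResults Precise(AA, &EI);
  (void)Precise.alias(LocA, LocB);
}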
diff --git a/llvm/include/llvm/Analysis/AssumeBundleQueries.h b/llvm/include/llvm/Analysis/AssumeBundleQueries.h
index 49c0cd89a4db..77da19110246 100644
--- a/llvm/include/llvm/Analysis/AssumeBundleQueries.h
+++ b/llvm/include/llvm/Analysis/AssumeBundleQueries.h
@@ -20,7 +20,6 @@
#include "llvm/ADT/DenseMap.h"
namespace llvm {
-class IntrinsicInst;
class AssumptionCache;
class DominatorTree;
@@ -70,15 +69,15 @@ template<> struct DenseMapInfo<Attribute::AttrKind> {
using RetainedKnowledgeKey = std::pair<Value *, Attribute::AttrKind>;
struct MinMax {
- unsigned Min;
- unsigned Max;
+ uint64_t Min;
+ uint64_t Max;
};
/// A mapping from intrinsics (=`llvm.assume` calls) to a value range
/// (=knowledge) that is encoded in them. How the value range is interpreted
/// depends on the RetainedKnowledgeKey that was used to get this out of the
/// RetainedKnowledgeMap.
-using Assume2KnowledgeMap = DenseMap<IntrinsicInst *, MinMax>;
+using Assume2KnowledgeMap = DenseMap<AssumeInst *, MinMax>;
using RetainedKnowledgeMap =
DenseMap<RetainedKnowledgeKey, Assume2KnowledgeMap>;
@@ -100,7 +99,7 @@ void fillMapFromAssume(AssumeInst &Assume, RetainedKnowledgeMap &Result);
/// - ArgValue will be 4.
struct RetainedKnowledge {
Attribute::AttrKind AttrKind = Attribute::None;
- unsigned ArgValue = 0;
+ uint64_t ArgValue = 0;
Value *WasOn = nullptr;
bool operator==(RetainedKnowledge Other) const {
return AttrKind == Other.AttrKind && WasOn == Other.WasOn &&
diff --git a/llvm/include/llvm/Analysis/AssumptionCache.h b/llvm/include/llvm/Analysis/AssumptionCache.h
index 51d04bd8cf02..12dd9b04c932 100644
--- a/llvm/include/llvm/Analysis/AssumptionCache.h
+++ b/llvm/include/llvm/Analysis/AssumptionCache.h
@@ -29,6 +29,7 @@ namespace llvm {
class AssumeInst;
class Function;
class raw_ostream;
+class TargetTransformInfo;
class Value;
/// A cache of \@llvm.assume calls within a function.
@@ -59,6 +60,8 @@ private:
/// We track this to lazily populate our assumptions.
Function &F;
+ TargetTransformInfo *TTI;
+
/// Vector of weak value handles to calls of the \@llvm.assume
/// intrinsic.
SmallVector<ResultElem, 4> AssumeHandles;
@@ -103,7 +106,8 @@ private:
public:
/// Construct an AssumptionCache from a function by scanning all of
/// its instructions.
- AssumptionCache(Function &F) : F(F) {}
+ AssumptionCache(Function &F, TargetTransformInfo *TTI = nullptr)
+ : F(F), TTI(TTI) {}
/// This cache is designed to be self-updating and so it should never be
/// invalidated.
@@ -174,9 +178,7 @@ class AssumptionAnalysis : public AnalysisInfoMixin<AssumptionAnalysis> {
public:
using Result = AssumptionCache;
- AssumptionCache run(Function &F, FunctionAnalysisManager &) {
- return AssumptionCache(F);
- }
+ AssumptionCache run(Function &F, FunctionAnalysisManager &);
};
/// Printer pass for the \c AssumptionAnalysis results.
diff --git a/llvm/include/llvm/Analysis/BasicAliasAnalysis.h b/llvm/include/llvm/Analysis/BasicAliasAnalysis.h
index 991c0cbb642a..ed9d1ba4c5a7 100644
--- a/llvm/include/llvm/Analysis/BasicAliasAnalysis.h
+++ b/llvm/include/llvm/Analysis/BasicAliasAnalysis.h
@@ -13,10 +13,8 @@
#ifndef LLVM_ANALYSIS_BASICALIASANALYSIS_H
#define LLVM_ANALYSIS_BASICALIASANALYSIS_H
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
@@ -28,7 +26,6 @@
namespace llvm {
struct AAMDNodes;
-class APInt;
class AssumptionCache;
class BasicBlock;
class DataLayout;
@@ -98,71 +95,7 @@ public:
FunctionModRefBehavior getModRefBehavior(const Function *Fn);
private:
- // A linear transformation of a Value; this class represents ZExt(SExt(V,
- // SExtBits), ZExtBits) * Scale + Offset.
- struct VariableGEPIndex {
- // An opaque Value - we can't decompose this further.
- const Value *V;
-
- // We need to track what extensions we've done as we consider the same Value
- // with different extensions as different variables in a GEP's linear
- // expression;
- // e.g.: if V == -1, then sext(x) != zext(x).
- unsigned ZExtBits;
- unsigned SExtBits;
-
- APInt Scale;
-
- // Context instruction to use when querying information about this index.
- const Instruction *CxtI;
-
- /// True if all operations in this expression are NSW.
- bool IsNSW;
-
- void dump() const {
- print(dbgs());
- dbgs() << "\n";
- }
- void print(raw_ostream &OS) const {
- OS << "(V=" << V->getName()
- << ", zextbits=" << ZExtBits
- << ", sextbits=" << SExtBits
- << ", scale=" << Scale << ")";
- }
- };
-
- // Represents the internal structure of a GEP, decomposed into a base pointer,
- // constant offsets, and variable scaled indices.
- struct DecomposedGEP {
- // Base pointer of the GEP
- const Value *Base;
- // Total constant offset from base.
- APInt Offset;
- // Scaled variable (non-constant) indices.
- SmallVector<VariableGEPIndex, 4> VarIndices;
- // Is GEP index scale compile-time constant.
- bool HasCompileTimeConstantScale;
- // Are all operations inbounds GEPs or non-indexing operations?
- // (None iff expression doesn't involve any geps)
- Optional<bool> InBounds;
-
- void dump() const {
- print(dbgs());
- dbgs() << "\n";
- }
- void print(raw_ostream &OS) const {
- OS << "(DecomposedGEP Base=" << Base->getName()
- << ", Offset=" << Offset
- << ", VarIndices=[";
- for (size_t i = 0; i < VarIndices.size(); i++) {
- if (i != 0)
- OS << ", ";
- VarIndices[i].print(OS);
- }
- OS << "], HasCompileTimeConstantScale=" << HasCompileTimeConstantScale
- << ")";
- }
- };
+ struct DecomposedGEP;
/// Tracks phi nodes we have visited.
///
@@ -187,10 +120,6 @@ private:
DecomposeGEPExpression(const Value *V, const DataLayout &DL,
AssumptionCache *AC, DominatorTree *DT);
- static bool isGEPBaseAtNegativeOffset(const GEPOperator *GEPOp,
- const DecomposedGEP &DecompGEP, const DecomposedGEP &DecompObject,
- LocationSize ObjectAccessSize);
-
/// A Heuristic for aliasGEP that searches for a constant offset
/// between the variables.
///
@@ -200,15 +129,14 @@ private:
/// However, we know that, for all %x, zext(%x) != zext(%x + 1), even if
/// the addition overflows.
bool
- constantOffsetHeuristic(const SmallVectorImpl<VariableGEPIndex> &VarIndices,
- LocationSize V1Size, LocationSize V2Size,
- const APInt &BaseOffset, AssumptionCache *AC,
+ constantOffsetHeuristic(const DecomposedGEP &GEP, LocationSize V1Size,
+ LocationSize V2Size, AssumptionCache *AC,
DominatorTree *DT);
bool isValueEqualInPotentialCycles(const Value *V1, const Value *V2);
- void GetIndexDifference(SmallVectorImpl<VariableGEPIndex> &Dest,
- const SmallVectorImpl<VariableGEPIndex> &Src);
+ void subtractDecomposedGEPs(DecomposedGEP &DestGEP,
+ const DecomposedGEP &SrcGEP);
AliasResult aliasGEP(const GEPOperator *V1, LocationSize V1Size,
const Value *V2, LocationSize V2Size,
diff --git a/llvm/include/llvm/Analysis/CGSCCPassManager.h b/llvm/include/llvm/Analysis/CGSCCPassManager.h
index e361cccef960..7cf172dc1dd1 100644
--- a/llvm/include/llvm/Analysis/CGSCCPassManager.h
+++ b/llvm/include/llvm/Analysis/CGSCCPassManager.h
@@ -20,7 +20,7 @@
/// A secondary more general goal is to be able to isolate optimization on
/// unrelated parts of the IR module. This is useful to ensure our
/// optimizations are principled and don't miss oportunities where refinement
-/// of one part of the module influence transformations in another part of the
+/// of one part of the module influences transformations in another part of the
/// module. But this is also useful if we want to parallelize the optimizations
/// across common large module graph shapes which tend to be very wide and have
/// large regions of unrelated cliques.
@@ -161,6 +161,12 @@ struct RequireAnalysisPass<AnalysisT, LazyCallGraph::SCC, CGSCCAnalysisManager,
(void)AM.template getResult<AnalysisT>(C, CG);
return PreservedAnalyses::all();
}
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ auto ClassName = AnalysisT::name();
+ auto PassName = MapClassName2PassName(ClassName);
+ OS << "require<" << PassName << ">";
+ }
};
/// A proxy from a \c CGSCCAnalysisManager to a \c Module.
@@ -215,7 +221,7 @@ using ModuleAnalysisManagerCGSCCProxy =
LazyCallGraph &>;
/// Support structure for SCC passes to communicate updates the call graph back
-/// to the CGSCC pass manager infrsatructure.
+/// to the CGSCC pass manager infrastructure.
///
/// The CGSCC pass manager runs SCC passes which are allowed to update the call
/// graph and SCC structures. This means the structure the pass manager works
@@ -274,22 +280,22 @@ struct CGSCCUpdateResult {
/// If non-null, the updated current \c RefSCC being processed.
///
- /// This is set when a graph refinement takes place an the "current" point in
- /// the graph moves "down" or earlier in the post-order walk. This will often
- /// cause the "current" RefSCC to be a newly created RefSCC object and the
- /// old one to be added to the above worklist. When that happens, this
+ /// This is set when a graph refinement takes place and the "current" point
+ /// in the graph moves "down" or earlier in the post-order walk. This will
+ /// often cause the "current" RefSCC to be a newly created RefSCC object and
+ /// the old one to be added to the above worklist. When that happens, this
/// pointer is non-null and can be used to continue processing the "top" of
/// the post-order walk.
LazyCallGraph::RefSCC *UpdatedRC;
/// If non-null, the updated current \c SCC being processed.
///
- /// This is set when a graph refinement takes place an the "current" point in
- /// the graph moves "down" or earlier in the post-order walk. This will often
- /// cause the "current" SCC to be a newly created SCC object and the old one
- /// to be added to the above worklist. When that happens, this pointer is
- /// non-null and can be used to continue processing the "top" of the
- /// post-order walk.
+ /// This is set when a graph refinement takes place and the "current" point
+ /// in the graph moves "down" or earlier in the post-order walk. This will
+ /// often cause the "current" SCC to be a newly created SCC object and the
+ /// old one to be added to the above worklist. When that happens, this
+ /// pointer is non-null and can be used to continue processing the "top" of
+ /// the post-order walk.
LazyCallGraph::SCC *UpdatedC;
/// Preserved analyses across SCCs.
@@ -298,7 +304,7 @@ struct CGSCCUpdateResult {
/// (changing both the CG structure and the function IR itself). However,
/// this means we need to take special care to correctly mark what analyses
/// are preserved *across* SCCs. We have to track this out-of-band here
- /// because within the main `PassManeger` infrastructure we need to mark
+ /// because within the main `PassManager` infrastructure we need to mark
/// everything within an SCC as preserved in order to avoid repeatedly
/// invalidating the same analyses as we unnest pass managers and adaptors.
/// So we track the cross-SCC version of the preserved analyses here from any
@@ -363,6 +369,13 @@ public:
/// Runs the CGSCC pass across every SCC in the module.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ OS << "cgscc(";
+ Pass->printPipeline(OS, MapClassName2PassName);
+ OS << ")";
+ }
+
static bool isRequired() { return true; }
private:
@@ -377,8 +390,11 @@ createModuleToPostOrderCGSCCPassAdaptor(CGSCCPassT &&Pass) {
using PassModelT = detail::PassModel<LazyCallGraph::SCC, CGSCCPassT,
PreservedAnalyses, CGSCCAnalysisManager,
LazyCallGraph &, CGSCCUpdateResult &>;
+ // Do not use make_unique: it causes too many template instantiations,
+ // which results in terrible compile times.
return ModuleToPostOrderCGSCCPassAdaptor(
- std::make_unique<PassModelT>(std::forward<CGSCCPassT>(Pass)));
+ std::unique_ptr<ModuleToPostOrderCGSCCPassAdaptor::PassConceptT>(
+ new PassModelT(std::forward<CGSCCPassT>(Pass))));
}
/// A proxy from a \c FunctionAnalysisManager to an \c SCC.
@@ -461,11 +477,14 @@ class CGSCCToFunctionPassAdaptor
public:
using PassConceptT = detail::PassConcept<Function, FunctionAnalysisManager>;
- explicit CGSCCToFunctionPassAdaptor(std::unique_ptr<PassConceptT> Pass)
- : Pass(std::move(Pass)) {}
+ explicit CGSCCToFunctionPassAdaptor(std::unique_ptr<PassConceptT> Pass,
+ bool EagerlyInvalidate, bool NoRerun)
+ : Pass(std::move(Pass)), EagerlyInvalidate(EagerlyInvalidate),
+ NoRerun(NoRerun) {}
CGSCCToFunctionPassAdaptor(CGSCCToFunctionPassAdaptor &&Arg)
- : Pass(std::move(Arg.Pass)) {}
+ : Pass(std::move(Arg.Pass)), EagerlyInvalidate(Arg.EagerlyInvalidate),
+ NoRerun(Arg.NoRerun) {}
friend void swap(CGSCCToFunctionPassAdaptor &LHS,
CGSCCToFunctionPassAdaptor &RHS) {
@@ -481,24 +500,56 @@ public:
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM,
LazyCallGraph &CG, CGSCCUpdateResult &UR);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ OS << "function";
+ if (EagerlyInvalidate)
+ OS << "<eager-inv>";
+ OS << "(";
+ Pass->printPipeline(OS, MapClassName2PassName);
+ OS << ")";
+ }
+
static bool isRequired() { return true; }
private:
std::unique_ptr<PassConceptT> Pass;
+ bool EagerlyInvalidate;
+ bool NoRerun;
};
/// A function to deduce a function pass type and wrap it in the
/// templated adaptor.
template <typename FunctionPassT>
CGSCCToFunctionPassAdaptor
-createCGSCCToFunctionPassAdaptor(FunctionPassT &&Pass) {
+createCGSCCToFunctionPassAdaptor(FunctionPassT &&Pass,
+ bool EagerlyInvalidate = false,
+ bool NoRerun = false) {
using PassModelT =
detail::PassModel<Function, FunctionPassT, PreservedAnalyses,
FunctionAnalysisManager>;
+ // Do not use make_unique: it causes too many template instantiations,
+ // which results in terrible compile times.
return CGSCCToFunctionPassAdaptor(
- std::make_unique<PassModelT>(std::forward<FunctionPassT>(Pass)));
+ std::unique_ptr<CGSCCToFunctionPassAdaptor::PassConceptT>(
+ new PassModelT(std::forward<FunctionPassT>(Pass))),
+ EagerlyInvalidate, NoRerun);
}
+// A marker to determine if function passes should be run on a function within a
+// CGSCCToFunctionPassAdaptor. This is used to prevent running an expensive
+// function pass (manager) on a function multiple times if SCC mutations cause a
+// function to be visited multiple times and the function is not modified by
+// other SCC passes.
+class ShouldNotRunFunctionPassesAnalysis
+ : public AnalysisInfoMixin<ShouldNotRunFunctionPassesAnalysis> {
+public:
+ static AnalysisKey Key;
+ struct Result {};
+
+ Result run(Function &F, FunctionAnalysisManager &FAM) { return Result(); }
+};
+
/// A helper that repeats an SCC pass each time an indirect call is refined to
/// a direct call by that pass.
///
@@ -528,6 +579,13 @@ public:
PreservedAnalyses run(LazyCallGraph::SCC &InitialC, CGSCCAnalysisManager &AM,
LazyCallGraph &CG, CGSCCUpdateResult &UR);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ OS << "devirt<" << MaxIterations << ">(";
+ Pass->printPipeline(OS, MapClassName2PassName);
+ OS << ")";
+ }
+
private:
std::unique_ptr<PassConceptT> Pass;
int MaxIterations;
@@ -541,8 +599,11 @@ DevirtSCCRepeatedPass createDevirtSCCRepeatedPass(CGSCCPassT &&Pass,
using PassModelT = detail::PassModel<LazyCallGraph::SCC, CGSCCPassT,
PreservedAnalyses, CGSCCAnalysisManager,
LazyCallGraph &, CGSCCUpdateResult &>;
+ // Do not use make_unique: it causes too many template instantiations,
+ // which results in terrible compile times.
return DevirtSCCRepeatedPass(
- std::make_unique<PassModelT>(std::forward<CGSCCPassT>(Pass)),
+ std::unique_ptr<DevirtSCCRepeatedPass::PassConceptT>(
+ new PassModelT(std::forward<CGSCCPassT>(Pass))),
MaxIterations);
}
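A hedged sketch of how the printPipeline additions render a nested pipeline as text; InstCombinePass is just one example of a function pass, and the exact output depends on the name-mapping callback.

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"

void printNestedPipeline() {
  auto Adaptor = llvm::createModuleToPostOrderCGSCCPassAdaptor(
      llvm::createCGSCCToFunctionPassAdaptor(llvm::InstCombinePass(),
                                             /*EagerlyInvalidate=*/true));
  // With an identity name mapping this prints roughly:
  //   cgscc(function<eager-inv>(InstCombinePass))
  Adaptor.printPipeline(llvm::errs(),
                        [](llvm::StringRef Name) { return Name; });
}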
diff --git a/llvm/include/llvm/Analysis/CaptureTracking.h b/llvm/include/llvm/Analysis/CaptureTracking.h
index 9da5f18e944b..50d12db7a1c3 100644
--- a/llvm/include/llvm/Analysis/CaptureTracking.h
+++ b/llvm/include/llvm/Analysis/CaptureTracking.h
@@ -22,6 +22,8 @@ namespace llvm {
class DataLayout;
class Instruction;
class DominatorTree;
+ class LoopInfo;
+ class Function;
/// getDefaultMaxUsesToExploreForCaptureTracking - Return default value of
/// the maximal number of uses to explore before giving up. It is used by
@@ -55,10 +57,25 @@ namespace llvm {
/// MaxUsesToExplore specifies how many uses the analysis should explore for
/// one value before giving up due to "too many uses". If MaxUsesToExplore
/// is zero, a default value is assumed.
- bool PointerMayBeCapturedBefore(
- const Value *V, bool ReturnCaptures, bool StoreCaptures,
- const Instruction *I, const DominatorTree *DT, bool IncludeI = false,
- unsigned MaxUsesToExplore = 0);
+ bool PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures,
+ bool StoreCaptures, const Instruction *I,
+ const DominatorTree *DT,
+ bool IncludeI = false,
+ unsigned MaxUsesToExplore = 0,
+ const LoopInfo *LI = nullptr);
+
+ // Returns the 'earliest' instruction that captures \p V in \p F. An instruction
+ // A is considered earlier than instruction B if A dominates B. If two escapes
+ // do not dominate each other, the terminator of the common dominator is
+ // chosen. If not all uses can be analyzed, the earliest escape is set to
+ // the first instruction in the function entry block. If \p V does not escape,
+ // nullptr is returned. Note that the caller of the function has to ensure
+ // that the instruction the result value is compared against is not in a
+ // cycle.
+ Instruction *FindEarliestCapture(const Value *V, Function &F,
+ bool ReturnCaptures, bool StoreCaptures,
+ const DominatorTree &DT,
+ unsigned MaxUsesToExplore = 0);
/// This callback is used in conjunction with PointerMayBeCaptured. In
/// addition to the interface here, you'll need to provide your own getters
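A minimal sketch of the new FindEarliestCapture entry point; the object, function and dominator tree are assumed to be provided by the caller.

#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"

const llvm::Instruction *earliestEscape(const llvm::Value *Obj,
                                        llvm::Function &F,
                                        const llvm::DominatorTree &DT) {
  // nullptr means Obj does not escape from F at all.
  return llvm::FindEarliestCapture(Obj, F, /*ReturnCaptures=*/false,
                                   /*StoreCaptures=*/true, DT);
}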
diff --git a/llvm/include/llvm/Analysis/ConstantFolding.h b/llvm/include/llvm/Analysis/ConstantFolding.h
index 62742fdf9a91..45fb879f0c1f 100644
--- a/llvm/include/llvm/Analysis/ConstantFolding.h
+++ b/llvm/include/llvm/Analysis/ConstantFolding.h
@@ -128,10 +128,25 @@ Constant *ConstantFoldExtractElementInstruction(Constant *Val, Constant *Idx);
Constant *ConstantFoldShuffleVectorInstruction(Constant *V1, Constant *V2,
ArrayRef<int> Mask);
-/// ConstantFoldLoadFromConstPtr - Return the value that a load from C would
-/// produce if it is constant and determinable. If this is not determinable,
-/// return null.
-Constant *ConstantFoldLoadFromConstPtr(Constant *C, Type *Ty, const DataLayout &DL);
+/// Extract value of C at the given Offset reinterpreted as Ty. If bits past
+/// the end of C are accessed, they are assumed to be poison.
+Constant *ConstantFoldLoadFromConst(Constant *C, Type *Ty, const APInt &Offset,
+ const DataLayout &DL);
+
+/// Extract value of C reinterpreted as Ty. Same as previous API with zero
+/// offset.
+Constant *ConstantFoldLoadFromConst(Constant *C, Type *Ty,
+ const DataLayout &DL);
+
+/// Return the value that a load from C with offset Offset would produce if it
+/// is constant and determinable. If this is not determinable, return null.
+Constant *ConstantFoldLoadFromConstPtr(Constant *C, Type *Ty, APInt Offset,
+ const DataLayout &DL);
+
+/// Return the value that a load from C would produce if it is constant and
+/// determinable. If this is not determinable, return null.
+Constant *ConstantFoldLoadFromConstPtr(Constant *C, Type *Ty,
+ const DataLayout &DL);
/// ConstantFoldLoadThroughGEPConstantExpr - Given a constant and a
/// getelementptr constantexpr, return the constant value being addressed by the
@@ -140,13 +155,6 @@ Constant *ConstantFoldLoadThroughGEPConstantExpr(Constant *C, ConstantExpr *CE,
Type *Ty,
const DataLayout &DL);
-/// ConstantFoldLoadThroughGEPIndices - Given a constant and getelementptr
-/// indices (with an *implied* zero pointer index that is not in the list),
-/// return the constant value being addressed by a virtual load, or null if
-/// something is funny and we can't decide.
-Constant *ConstantFoldLoadThroughGEPIndices(Constant *C,
- ArrayRef<Constant *> Indices);
-
/// canConstantFoldCallTo - Return true if it's even possible to fold a call to
/// the specified function.
bool canConstantFoldCallTo(const CallBase *Call, const Function *F);
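A rough sketch of the offset-based constant-load folding API added above; the 64-bit offset width and the i32 result type are assumptions made for illustration.

#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/DataLayout.h"

llvm::Constant *loadI32AtOffset8(llvm::Constant *Init, llvm::Type *Int32Ty,
                                 const llvm::DataLayout &DL) {
  // Read the bytes at offset 8 of the constant initializer, reinterpreted as
  // i32; returns null if the value is not determinable.
  llvm::APInt Offset(64, 8);
  return llvm::ConstantFoldLoadFromConst(Init, Int32Ty, Offset, DL);
}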
diff --git a/llvm/include/llvm/Analysis/CostModel.h b/llvm/include/llvm/Analysis/CostModel.h
new file mode 100644
index 000000000000..649168050cec
--- /dev/null
+++ b/llvm/include/llvm/Analysis/CostModel.h
@@ -0,0 +1,26 @@
+//===- CostModel.h - --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_COSTMODEL_H
+#define LLVM_ANALYSIS_COSTMODEL_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+/// Printer pass for cost modeling results.
+class CostModelPrinterPass : public PassInfoMixin<CostModelPrinterPass> {
+ raw_ostream &OS;
+
+public:
+ explicit CostModelPrinterPass(raw_ostream &OS) : OS(OS) {}
+
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+} // end namespace llvm
+
+#endif // LLVM_ANALYSIS_COSTMODEL_H
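A hedged sketch of reaching the new printer pass from a FunctionPassManager; in a real pipeline it would typically be requested by name (assumed to be "print<cost-model>"), which is not shown here.

#include "llvm/Analysis/CostModel.h"
#include "llvm/Support/raw_ostream.h"

void addCostModelPrinter(llvm::FunctionPassManager &FPM) {
  // Dumps cost estimates for each function the manager is run on.
  FPM.addPass(llvm::CostModelPrinterPass(llvm::errs()));
}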
diff --git a/llvm/include/llvm/Analysis/Delinearization.h b/llvm/include/llvm/Analysis/Delinearization.h
index 2658b6bbc80c..6e942530f253 100644
--- a/llvm/include/llvm/Analysis/Delinearization.h
+++ b/llvm/include/llvm/Analysis/Delinearization.h
@@ -16,10 +16,115 @@
#ifndef LLVM_ANALYSIS_DELINEARIZATION_H
#define LLVM_ANALYSIS_DELINEARIZATION_H
+#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
+class GetElementPtrInst;
+class ScalarEvolution;
+class SCEV;
+
+/// Compute the array dimensions Sizes from the set of Terms extracted from
+/// the memory access function of this SCEVAddRecExpr (second step of
+/// delinearization).
+void findArrayDimensions(ScalarEvolution &SE,
+ SmallVectorImpl<const SCEV *> &Terms,
+ SmallVectorImpl<const SCEV *> &Sizes,
+ const SCEV *ElementSize);
+
+/// Collect parametric terms occurring in step expressions (first step of
+/// delinearization).
+void collectParametricTerms(ScalarEvolution &SE, const SCEV *Expr,
+ SmallVectorImpl<const SCEV *> &Terms);
+
+/// Return in Subscripts the access functions for each dimension in Sizes
+/// (third step of delinearization).
+void computeAccessFunctions(ScalarEvolution &SE, const SCEV *Expr,
+ SmallVectorImpl<const SCEV *> &Subscripts,
+ SmallVectorImpl<const SCEV *> &Sizes);
+/// Split this SCEVAddRecExpr into two vectors of SCEVs representing the
+/// subscripts and sizes of an array access.
+///
+/// The delinearization is a 3 step process: the first two steps compute the
+/// sizes of each subscript and the third step computes the access functions
+/// for the delinearized array:
+///
+/// 1. Find the terms in the step functions
+/// 2. Compute the array size
+/// 3. Compute the access function: divide the SCEV by the array size
+/// starting with the innermost dimensions found in step 2. The Quotient
+/// is the SCEV to be divided in the next step of the recursion. The
+/// Remainder is the subscript of the innermost dimension. Loop over all
+/// array dimensions computed in step 2.
+///
+/// To compute a uniform array size for several memory accesses to the same
+/// object, one can collect in step 1 all the step terms for all the memory
+/// accesses, and compute in step 2 a unique array shape. This guarantees
+/// that the array shape will be the same across all memory accesses.
+///
+/// FIXME: We could derive the result of steps 1 and 2 from a description of
+/// the array shape given in metadata.
+///
+/// Example:
+///
+/// A[][n][m]
+///
+/// for i
+/// for j
+/// for k
+/// A[j+k][2i][5i] =
+///
+/// The initial SCEV:
+///
+/// A[{{{0,+,2*m+5}_i, +, n*m}_j, +, n*m}_k]
+///
+/// 1. Find the different terms in the step functions:
+/// -> [2*m, 5, n*m, n*m]
+///
+/// 2. Compute the array size: sort and unique them
+/// -> [n*m, 2*m, 5]
+/// find the GCD of all the terms = 1
+/// divide by the GCD and erase constant terms
+/// -> [n*m, 2*m]
+/// GCD = m
+/// divide by GCD -> [n, 2]
+/// remove constant terms
+/// -> [n]
+/// size of the array is A[unknown][n][m]
+///
+/// 3. Compute the access function
+/// a. Divide {{{0,+,2*m+5}_i, +, n*m}_j, +, n*m}_k by the innermost size m
+/// Quotient: {{{0,+,2}_i, +, n}_j, +, n}_k
+/// Remainder: {{{0,+,5}_i, +, 0}_j, +, 0}_k
+/// The remainder is the subscript of the innermost array dimension: [5i].
+///
+/// b. Divide Quotient: {{{0,+,2}_i, +, n}_j, +, n}_k by next outer size n
+/// Quotient: {{{0,+,0}_i, +, 1}_j, +, 1}_k
+/// Remainder: {{{0,+,2}_i, +, 0}_j, +, 0}_k
+/// The Remainder is the subscript of the next array dimension: [2i].
+///
+/// The subscript of the outermost dimension is the Quotient: [j+k].
+///
+/// Overall, we have: A[][n][m], and the access function: A[j+k][2i][5i].
+void delinearize(ScalarEvolution &SE, const SCEV *Expr,
+ SmallVectorImpl<const SCEV *> &Subscripts,
+ SmallVectorImpl<const SCEV *> &Sizes, const SCEV *ElementSize);
+
+/// Gathers the individual index expressions from a GEP instruction.
+///
+/// This function optimistically assumes the GEP references into a fixed size
+/// array. If this is actually true, this function returns a list of array
+/// subscript expressions in \p Subscripts and a list of integers describing
+/// the size of the individual array dimensions in \p Sizes. Both lists have
+/// either equal length or the size list is one element shorter in case there
+/// is no known size available for the outermost array dimension. Returns true
+/// if successful and false otherwise.
+bool getIndexExpressionsFromGEP(ScalarEvolution &SE,
+ const GetElementPtrInst *GEP,
+ SmallVectorImpl<const SCEV *> &Subscripts,
+ SmallVectorImpl<int> &Sizes);
+
struct DelinearizationPrinterPass
: public PassInfoMixin<DelinearizationPrinterPass> {
explicit DelinearizationPrinterPass(raw_ostream &OS);
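(A short sketch of the free-function interface declared above, under the assumption that AccessFn is the SCEV of a memory access and ElementSize the SCEV of its element size; both names are placeholders.)

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/Delinearization.h"
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

static bool delinearizeAccess(ScalarEvolution &SE, const SCEV *AccessFn,
                              const SCEV *ElementSize) {
  SmallVector<const SCEV *, 4> Subscripts, Sizes;
  // Runs all three steps: collect parametric terms, derive the dimension
  // sizes, then divide out the sizes to recover one subscript per dimension.
  delinearize(SE, AccessFn, Subscripts, Sizes, ElementSize);
  // Empty result lists mean the access could not be delinearized.
  return !Subscripts.empty() && !Sizes.empty();
}
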
diff --git a/llvm/include/llvm/Analysis/HeatUtils.h b/llvm/include/llvm/Analysis/HeatUtils.h
index b665e211c6ac..9ecbbaf318da 100644
--- a/llvm/include/llvm/Analysis/HeatUtils.h
+++ b/llvm/include/llvm/Analysis/HeatUtils.h
@@ -1,9 +1,8 @@
//===-- HeatUtils.h - Utility for printing heat colors ----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h
index b623b9ca58d8..51c5c620230b 100644
--- a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h
+++ b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h
@@ -110,7 +110,8 @@ enum InstrType { Legal, Illegal, Invisible };
/// by \ref isSameOperationAs.
/// TODO: Handle GetElementPtrInsts, as some of the operands have to be the
/// exact same, and some do not.
-struct IRInstructionData : ilist_node<IRInstructionData> {
+struct IRInstructionData
+ : ilist_node<IRInstructionData, ilist_sentinel_tracking<true>> {
/// The source Instruction that is being wrapped.
Instruction *Inst = nullptr;
@@ -127,12 +128,41 @@ struct IRInstructionData : ilist_node<IRInstructionData> {
/// to a less than form. It is None otherwise.
Optional<CmpInst::Predicate> RevisedPredicate;
+ /// This structure holds the distances of how far "ahead of" or "behind" the
+ /// target blocks of a branch, or the incoming blocks of a phi node, are.
+ /// If the value is negative, it means that the block was registered before
+ /// the block of this instruction in terms of blocks in the function.
+ /// Code Example:
+ /// \code
+ /// block_1:
+ /// br i1 %0, label %block_2, label %block_3
+ /// block_2:
+ /// br i1 %1, label %block_1, label %block_2
+ /// block_3:
+ /// br i1 %2, label %block_2, label %block_1
+ /// ; Replacing the labels with relative values, this becomes:
+ /// block_1:
+ /// br i1 %0, distance 1, distance 2
+ /// block_2:
+ /// br i1 %1, distance -1, distance 0
+ /// block_3:
+ /// br i1 %2, distance -1, distance -2
+ /// \endcode
+ /// Taking block_2 as our example, block_1 is "behind" block_2, and block_2 is
+ /// "ahead" of block_2.
+ SmallVector<int, 4> RelativeBlockLocations;
+
/// Gather the information that is difficult to gather for an Instruction, or
/// is changed. i.e. the operands of an Instruction and the Types of those
/// operands. This extra information allows for similarity matching to make
/// assertions that allow for more flexibility when checking for whether an
/// Instruction performs the same operation.
IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL);
+ IRInstructionData(IRInstructionDataList &IDL);
+
+ /// Fills data structures for IRInstructionData when it is constructed from a
+ /// reference or a pointer.
+ void initializeInstruction();
/// Get the predicate that the compare instruction is using for hashing the
/// instruction. the IRInstructionData must be wrapping a CmpInst.
@@ -145,6 +175,16 @@ struct IRInstructionData : ilist_node<IRInstructionData> {
/// \return the consistent comparison predicate.
static CmpInst::Predicate predicateForConsistency(CmpInst *CI);
+ /// For an IRInstructionData containing a branch, finds the
+ /// relative distances from the source basic block to the target by taking
+ /// the difference of the number assigned to the current basic block and the
+ /// target basic block of the branch.
+ ///
+ /// \param BasicBlockToInteger - The mapping of basic blocks to their location
+ /// in the module.
+ void
+ setBranchSuccessors(DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger);
+
/// Hashes \p Value based on its opcode, types, and operand types.
/// Two IRInstructionData instances produce the same hash when they perform
/// the same operation.
@@ -198,7 +238,8 @@ struct IRInstructionData : ilist_node<IRInstructionData> {
IRInstructionDataList *IDL = nullptr;
};
-struct IRInstructionDataList : simple_ilist<IRInstructionData> {};
+struct IRInstructionDataList
+ : simple_ilist<IRInstructionData, ilist_sentinel_tracking<true>> {};
/// Compare one IRInstructionData class to another IRInstructionData class for
/// whether they are performing the same operation, and can be mapped to the
@@ -288,6 +329,10 @@ struct IRInstructionMapper {
DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>
InstructionIntegerMap;
+ /// A mapping for a basic block in a module to its assigned number/location
+ /// in the module.
+ DenseMap<BasicBlock *, unsigned> BasicBlockToInteger;
+
/// Set if we added an illegal number in the previous step.
/// Since each illegal number is unique, we only need one of them between
/// each range of legal numbers. This lets us make sure we don't add more
@@ -322,6 +367,14 @@ struct IRInstructionMapper {
IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality,
IRInstructionDataList &IDL);
+ /// Get an empty allocated IRInstructionData struct using the
+ /// InstDataAllocator.
+ ///
+ /// \param IDL - The InstructionDataList that the IRInstructionData is
+ /// inserted into.
+ /// \returns An allocated IRInstructionData struct.
+ IRInstructionData *allocateIRInstructionData(IRInstructionDataList &IDL);
+
/// Get an allocated IRInstructionDataList object using the IDLAllocator.
///
/// \returns An allocated IRInstructionDataList object.
@@ -329,6 +382,24 @@ struct IRInstructionMapper {
IRInstructionDataList *IDL = nullptr;
+ /// Assigns values to all the basic blocks in function \p F starting from
+ /// integer \p BBNumber.
+ ///
+ /// \param F - The function containing the basic blocks to assign numbers to.
+ /// \param BBNumber - The number to start from.
+ void initializeForBBs(Function &F, unsigned &BBNumber) {
+ for (BasicBlock &BB : F)
+ BasicBlockToInteger.insert(std::make_pair(&BB, BBNumber++));
+ }
+
+ /// Assigns values to all the basic blocks in Module \p M.
+ /// \param M - The module containing the basic blocks to assign numbers to.
+ void initializeForBBs(Module &M) {
+ unsigned BBNumber = 0;
+ for (Function &F : M)
+ initializeForBBs(F, BBNumber);
+ }
+
/// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers
/// determined by \p InstrType. Two Instructions are mapped to the same value
/// if they are close as defined by the InstructionData class above.
@@ -386,7 +457,11 @@ struct IRInstructionMapper {
InstructionClassification() {}
// TODO: Determine a scheme to resolve when the label is similar enough.
- InstrType visitBranchInst(BranchInst &BI) { return Illegal; }
+ InstrType visitBranchInst(BranchInst &BI) {
+ if (EnableBranches)
+ return Legal;
+ return Illegal;
+ }
// TODO: Determine a scheme to resolve when the labels are similar enough.
InstrType visitPHINode(PHINode &PN) { return Illegal; }
// TODO: Handle allocas.
@@ -419,6 +494,10 @@ struct IRInstructionMapper {
// TODO: Handle interblock similarity.
InstrType visitTerminator(Instruction &I) { return Illegal; }
InstrType visitInstruction(Instruction &I) { return Legal; }
+
+ // The flag variable that lets the classifier know whether we should
+ // allow branches to be checked for similarity.
+ bool EnableBranches = false;
};
/// Maps an Instruction to a member of InstrType.
@@ -488,6 +567,12 @@ private:
DenseMap<Value *, unsigned> ValueToNumber;
/// Stores the mapping of the number to the value assigned this number.
DenseMap<unsigned, Value *> NumberToValue;
+ /// Stores the mapping of a value's number to canonical numbering in the
+ /// candidate's respective similarity group.
+ DenseMap<unsigned, unsigned> NumberToCanonNum;
+ /// Stores the mapping of canonical number in the candidate's respective
+ /// similarity group to a value number.
+ DenseMap<unsigned, unsigned> CanonNumToNumber;
/// @}
public:
@@ -506,13 +591,27 @@ public:
static bool isSimilar(const IRSimilarityCandidate &A,
const IRSimilarityCandidate &B);
- /// \param A - The first IRInstructionCandidate to compare.
- /// \param B - The second IRInstructionCandidate to compare.
+ /// \param [in] A - The first IRInstructionCandidate to compare.
+ /// \param [in] B - The second IRInstructionCandidate to compare.
/// \returns True when every IRInstructionData in \p A is structurally similar
/// to \p B.
static bool compareStructure(const IRSimilarityCandidate &A,
const IRSimilarityCandidate &B);
+ /// \param [in] A - The first IRInstructionCandidate to compare.
+ /// \param [in] B - The second IRInstructionCandidate to compare.
+ /// \param [in,out] ValueNumberMappingA - A mapping of value numbers from
+ /// candidate \p A to candidate \p B.
+ /// \param [in,out] ValueNumberMappingB - A mapping of value numbers from
+ /// candidate \p B to candidate \p A.
+ /// \returns True when every IRInstructionData in \p A is structurally similar
+ /// to \p B.
+ static bool
+ compareStructure(const IRSimilarityCandidate &A,
+ const IRSimilarityCandidate &B,
+ DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA,
+ DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB);
+
struct OperandMapping {
/// The IRSimilarityCandidate that holds the instruction the OperVals were
/// pulled from.
@@ -526,6 +625,21 @@ public:
DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMapping;
};
+ /// A helper struct to hold the candidate, for a branch instruction, the
+ /// relative location of a label, and the label itself. This is mostly to
+ /// group the values together before passing them as a bundle to a function.
+ struct RelativeLocMapping {
+ /// The IRSimilarityCandidate that holds the instruction the relative
+ /// location was pulled from.
+ const IRSimilarityCandidate &IRSC;
+
+ /// The relative location to be analyzed.
+ int RelativeLocation;
+
+ /// The corresponding value.
+ Value *OperVal;
+ };
+
/// Compare the operands in \p A and \p B and check that the current mapping
/// of global value numbers from \p A to \p B and \p B to \p A is consistent.
///
@@ -549,6 +663,94 @@ public:
static bool compareCommutativeOperandMapping(OperandMapping A,
OperandMapping B);
+ /// Compare the relative locations in \p A and \p B and check that the
+ /// distances match if both locations are contained in the region, and that
+ /// the branches both point outside the region if they do not.
+ /// Example Region:
+ /// \code
+ /// entry:
+ /// br i1 %0, label %block_1, label %block_3
+ /// block_0:
+ /// br i1 %0, label %block_1, label %block_2
+ /// block_1:
+ /// br i1 %0, label %block_2, label %block_3
+ /// block_2:
+ /// br i1 %1, label %block_1, label %block_4
+ /// block_3:
+ /// br i1 %2, label %block_2, label %block_5
+ /// \endcode
+ /// If we compare the branches in block_0 and block_1 the relative values are
+ /// 1 and 2 for both, so we consider this a match.
+ ///
+ /// If we compare the branches in entry and block_0 the relative values are
+ /// 2 and 3, and 1 and 2 respectively. Since these are not the same we do not
+ /// consider them a match.
+ ///
+ /// If we compare the branches in block_1 and block_2 the relative values are
+ /// 1 and 2, and -1 and None respectively. As a result we do not consider
+ /// these to be the same.
+ ///
+ /// If we compare the branches in block_2 and block_3 the relative values are
+ /// -1 and None for both. We do consider these to be a match.
+ ///
+ /// \param A - The first IRInstructionCandidate, relative location value,
+ /// and incoming block.
+ /// \param B - The second IRInstructionCandidate, relative location value,
+ /// and incoming block.
+ /// \returns true if the relative locations match.
+ static bool checkRelativeLocations(RelativeLocMapping A,
+ RelativeLocMapping B);
+
+ /// Create a mapping from the value numbering to a different separate set of
+ /// numbers. This will serve as a guide for relating one candidate to another.
+ /// The canonical number gives us the ability to identify which global value
+ /// number in one candidate relates to the global value number in the other.
+ ///
+ /// \param [in, out] CurrCand - The IRSimilarityCandidate to create a
+ /// canonical numbering for.
+ static void createCanonicalMappingFor(IRSimilarityCandidate &CurrCand);
+
+ /// Create a mapping for the value numbering of the calling
+ /// IRSimilarityCandidate, to a different separate set of numbers, based on
+ /// the canonical ordering in \p SourceCand. These are defined based on the
+ /// found mappings in \p ToSourceMapping and \p FromSourceMapping. Both of
+ /// these relationships should have the same information, just in opposite
+ /// directions.
+ ///
+ /// \param [in, out] SourceCand - The IRSimilarityCandidate to create a
+ /// canonical numbering from.
+ /// \param ToSourceMapping - The mapping of value numbers from this candidate
+ /// to \p SourceCand.
+ /// \param FromSourceMapping - The mapping of value numbers from \p SourceCand
+ /// to this candidate.
+ void createCanonicalRelationFrom(
+ IRSimilarityCandidate &SourceCand,
+ DenseMap<unsigned, DenseSet<unsigned>> &ToSourceMapping,
+ DenseMap<unsigned, DenseSet<unsigned>> &FromSourceMapping);
+
+ /// \param [in,out] BBSet - The set to track the basic blocks.
+ void getBasicBlocks(DenseSet<BasicBlock *> &BBSet) const {
+ for (IRInstructionData &ID : *this) {
+ BasicBlock *BB = ID.Inst->getParent();
+ if (BBSet.contains(BB))
+ continue;
+ BBSet.insert(BB);
+ }
+ }
+
+ /// \param [in,out] BBSet - The set to track the basic blocks.
+ /// \param [in,out] BBList - A list in order of use to track the basic blocks.
+ void getBasicBlocks(DenseSet<BasicBlock *> &BBSet,
+ SmallVector<BasicBlock *> &BBList) const {
+ for (IRInstructionData &ID : *this) {
+ BasicBlock *BB = ID.Inst->getParent();
+ if (BBSet.contains(BB))
+ continue;
+ BBSet.insert(BB);
+ BBList.push_back(BB);
+ }
+ }
+
/// Compare the start and end indices of the two IRSimilarityCandidates for
/// whether they overlap. If the start instruction of one
/// IRSimilarityCandidate is less than the end instruction of the other, and
@@ -611,6 +813,32 @@ public:
return VNIt->second;
}
+ /// Find the canonical number from the global value number \p N stored in the
+ /// candidate.
+ ///
+ /// \param N - The global value number to find the canonical number for.
+ /// \returns An optional containing the value, and None if it could not be
+ /// found.
+ Optional<unsigned> getCanonicalNum(unsigned N) {
+ DenseMap<unsigned, unsigned>::iterator NCIt = NumberToCanonNum.find(N);
+ if (NCIt == NumberToCanonNum.end())
+ return None;
+ return NCIt->second;
+ }
+
+ /// Find the global value number from the canonical number \p N stored in the
+ /// candidate.
+ ///
+ /// \param N - The canonical number to find the global value number for.
+ /// \returns An optional containing the value, and None if it could not be
+ /// found.
+ Optional<unsigned> fromCanonicalNum(unsigned N) {
+ DenseMap<unsigned, unsigned>::iterator CNIt = CanonNumToNumber.find(N);
+ if (CNIt == CanonNumToNumber.end())
+ return None;
+ return CNIt->second;
+ }
+
/// \param RHS - The IRSimilarityCandidate to compare against
/// \returns true if the IRSimilarityCandidate occurs after the
/// IRSimilarityCandidate in the program.
@@ -623,6 +851,9 @@ public:
iterator end() const { return std::next(iterator(back())); }
};
+typedef DenseMap<IRSimilarityCandidate *,
+ DenseMap<unsigned, DenseSet<unsigned>>>
+ CandidateGVNMapping;
typedef std::vector<IRSimilarityCandidate> SimilarityGroup;
typedef std::vector<SimilarityGroup> SimilarityGroupList;
@@ -651,8 +882,9 @@ typedef std::vector<SimilarityGroup> SimilarityGroupList;
/// analyzing the module.
class IRSimilarityIdentifier {
public:
- IRSimilarityIdentifier()
- : Mapper(&InstDataAllocator, &InstDataListAllocator) {}
+ IRSimilarityIdentifier(bool MatchBranches = true)
+ : Mapper(&InstDataAllocator, &InstDataListAllocator),
+ EnableBranches(MatchBranches) {}
private:
/// Map the instructions in the module to unsigned integers, using mapping
@@ -728,6 +960,10 @@ private:
/// instance of IRInstructionData.
IRInstructionMapper Mapper;
+ /// The flag variable that marks whether we should check branches for
+ /// similarity, or only look within basic blocks.
+ bool EnableBranches = true;
+
/// The SimilarityGroups found with the most recent run of \ref
/// findSimilarity. None if there is no recent run.
Optional<SimilarityGroupList> SimilarityCandidates;
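(A hedged end-to-end sketch, assuming the existing findSimilarity(Module &) entry point, showing the new branch-aware mode and the basic-block helper added above; the function name is illustrative.)

#include "llvm/ADT/DenseSet.h"
#include "llvm/Analysis/IRSimilarityIdentifier.h"
#include "llvm/IR/Module.h"
using namespace llvm;
using namespace llvm::IRSimilarity;

static void collectSimilarRegions(Module &M) {
  // MatchBranches enables the new cross-block matching.
  IRSimilarityIdentifier Identifier(/*MatchBranches=*/true);
  for (SimilarityGroup &Group : Identifier.findSimilarity(M))
    for (IRSimilarityCandidate &Cand : Group) {
      DenseSet<BasicBlock *> BBs;
      Cand.getBasicBlocks(BBs); // basic blocks spanned by the candidate
    }
}
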
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 82e1b14960bd..c26dbc457949 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -36,20 +36,24 @@ class DominatorTree;
/// These are the kinds of recurrences that we support.
enum class RecurKind {
- None, ///< Not a recurrence.
- Add, ///< Sum of integers.
- Mul, ///< Product of integers.
- Or, ///< Bitwise or logical OR of integers.
- And, ///< Bitwise or logical AND of integers.
- Xor, ///< Bitwise or logical XOR of integers.
- SMin, ///< Signed integer min implemented in terms of select(cmp()).
- SMax, ///< Signed integer max implemented in terms of select(cmp()).
- UMin, ///< Unisgned integer min implemented in terms of select(cmp()).
- UMax, ///< Unsigned integer max implemented in terms of select(cmp()).
- FAdd, ///< Sum of floats.
- FMul, ///< Product of floats.
- FMin, ///< FP min implemented in terms of select(cmp()).
- FMax ///< FP max implemented in terms of select(cmp()).
+ None, ///< Not a recurrence.
+ Add, ///< Sum of integers.
+ Mul, ///< Product of integers.
+ Or, ///< Bitwise or logical OR of integers.
+ And, ///< Bitwise or logical AND of integers.
+ Xor, ///< Bitwise or logical XOR of integers.
+ SMin, ///< Signed integer min implemented in terms of select(cmp()).
+ SMax, ///< Signed integer max implemented in terms of select(cmp()).
+ UMin, ///< Unsigned integer min implemented in terms of select(cmp()).
+ UMax, ///< Unsigned integer max implemented in terms of select(cmp()).
+ FAdd, ///< Sum of floats.
+ FMul, ///< Product of floats.
+ FMin, ///< FP min implemented in terms of select(cmp()).
+ FMax, ///< FP max implemented in terms of select(cmp()).
+ SelectICmp, ///< Integer select(icmp(),x,y) where one of (x,y) is loop
+ ///< invariant
+ SelectFCmp ///< Integer select(fcmp(),x,y) where one of (x,y) is loop
+ ///< invariant
};
/// The RecurrenceDescriptor is used to identify recurrences variables in a
@@ -112,12 +116,14 @@ public:
};
/// Returns a struct describing if the instruction 'I' can be a recurrence
- /// variable of type 'Kind'. If the recurrence is a min/max pattern of
- /// select(icmp()) this function advances the instruction pointer 'I' from the
- /// compare instruction to the select instruction and stores this pointer in
- /// 'PatternLastInst' member of the returned struct.
- static InstDesc isRecurrenceInstr(Instruction *I, RecurKind Kind,
- InstDesc &Prev, FastMathFlags FMF);
+ /// variable of type 'Kind' for a Loop \p L and reduction PHI \p Phi.
+ /// If the recurrence is a min/max pattern of select(icmp()) this function
+ /// advances the instruction pointer 'I' from the compare instruction to the
+ /// select instruction and stores this pointer in 'PatternLastInst' member of
+ /// the returned struct.
+ static InstDesc isRecurrenceInstr(Loop *L, PHINode *Phi, Instruction *I,
+ RecurKind Kind, InstDesc &Prev,
+ FastMathFlags FuncFMF);
/// Returns true if instruction I has multiple uses in Insts
static bool hasMultipleUsesOf(Instruction *I,
@@ -127,20 +133,29 @@ public:
/// Returns true if all uses of the instruction I is within the Set.
static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl<Instruction *> &Set);
- /// Returns a struct describing if the instruction is a
- /// Select(ICmp(X, Y), X, Y) instruction pattern corresponding to a min(X, Y)
- /// or max(X, Y). \p Prev specifies the description of an already processed
- /// select instruction, so its corresponding cmp can be matched to it.
- static InstDesc isMinMaxSelectCmpPattern(Instruction *I,
- const InstDesc &Prev);
+ /// Returns a struct describing if the instruction is a llvm.(s/u)(min/max),
+ /// llvm.minnum/maxnum or a Select(ICmp(X, Y), X, Y) pair of instructions
+ /// corresponding to a min(X, Y) or max(X, Y), matching the recurrence kind \p
+ /// Kind. \p Prev specifies the description of an already processed select
+ /// instruction, so its corresponding cmp can be matched to it.
+ static InstDesc isMinMaxPattern(Instruction *I, RecurKind Kind,
+ const InstDesc &Prev);
+
+ /// Returns a struct describing whether the instruction is either a
+ /// Select(ICmp(A, B), X, Y), or
+ /// Select(FCmp(A, B), X, Y)
+ /// where one of (X, Y) is a loop invariant integer and the other is a PHI
+ /// value. \p Prev specifies the description of an already processed select
+ /// instruction, so its corresponding cmp can be matched to it.
+ static InstDesc isSelectCmpPattern(Loop *Loop, PHINode *OrigPhi,
+ Instruction *I, InstDesc &Prev);
/// Returns a struct describing if the instruction is a
/// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern.
static InstDesc isConditionalRdxPattern(RecurKind Kind, Instruction *I);
/// Returns identity corresponding to the RecurrenceKind.
- static Constant *getRecurrenceIdentity(RecurKind K, Type *Tp,
- FastMathFlags FMF);
+ Value *getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF);
/// Returns the opcode corresponding to the RecurrenceKind.
static unsigned getOpcode(RecurKind Kind);
@@ -150,7 +165,7 @@ public:
/// non-null, the minimal bit width needed to compute the reduction will be
/// computed.
static bool AddReductionVar(PHINode *Phi, RecurKind Kind, Loop *TheLoop,
- FastMathFlags FMF,
+ FastMathFlags FuncFMF,
RecurrenceDescriptor &RedDes,
DemandedBits *DB = nullptr,
AssumptionCache *AC = nullptr,
@@ -220,6 +235,12 @@ public:
return isIntMinMaxRecurrenceKind(Kind) || isFPMinMaxRecurrenceKind(Kind);
}
+ /// Returns true if the recurrence kind is of the form
+ /// select(cmp(),x,y) where one of (x,y) is loop invariant.
+ static bool isSelectCmpRecurrenceKind(RecurKind Kind) {
+ return Kind == RecurKind::SelectICmp || Kind == RecurKind::SelectFCmp;
+ }
+
/// Returns the type of the recurrence. This type can be narrower than the
/// actual type of the Phi if the recurrence has been type-promoted.
Type *getRecurrenceType() const { return RecurrenceType; }
@@ -329,6 +350,11 @@ public:
: Instruction::BinaryOpsEnd;
}
+ Type *getElementType() const {
+ assert(IK == IK_PtrInduction && "Only pointer induction has element type");
+ return ElementType;
+ }
+
/// Returns a reference to the type cast instructions in the induction
/// update chain, that are redundant when guarded with a runtime
/// SCEV overflow check.
@@ -340,6 +366,7 @@ private:
/// Private constructor - used by \c isInductionPHI.
InductionDescriptor(Value *Start, InductionKind K, const SCEV *Step,
BinaryOperator *InductionBinOp = nullptr,
+ Type *ElementType = nullptr,
SmallVectorImpl<Instruction *> *Casts = nullptr);
/// Start value.
@@ -350,6 +377,9 @@ private:
const SCEV *Step = nullptr;
// Instruction that advances induction variable.
BinaryOperator *InductionBinOp = nullptr;
+ // Element type for pointer induction variables.
+ // TODO: This can be dropped once support for typed pointers is removed.
+ Type *ElementType = nullptr;
// Instructions used for type-casts of the induction variable,
// that are redundant when guarded with a runtime SCEV overflow check.
SmallVector<Instruction *, 2> RedundantCasts;
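(For the new SelectICmp/SelectFCmp kinds, a minimal sketch, assuming the existing isReductionPHI entry point; the helper name is made up for the example.)

#include "llvm/Analysis/IVDescriptors.h"
using namespace llvm;

// True if Phi is a reduction of the form select(cmp(), x, y) with one of
// (x, y) loop invariant, as described by the new recurrence kinds.
static bool isSelectCmpReduction(PHINode *Phi, Loop *L) {
  RecurrenceDescriptor RD;
  if (!RecurrenceDescriptor::isReductionPHI(Phi, L, RD))
    return false;
  return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
      RD.getRecurrenceKind());
}
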
diff --git a/llvm/include/llvm/Analysis/IVUsers.h b/llvm/include/llvm/Analysis/IVUsers.h
index f8ea3bcca229..e2026a4d5875 100644
--- a/llvm/include/llvm/Analysis/IVUsers.h
+++ b/llvm/include/llvm/Analysis/IVUsers.h
@@ -157,9 +157,6 @@ public:
/// dump - This method is used for debugging.
void dump() const;
-
-protected:
- bool AddUsersImpl(Instruction *I, SmallPtrSetImpl<Loop*> &SimpleLoopNests);
};
Pass *createIVUsersPass();
diff --git a/llvm/include/llvm/Analysis/InlineAdvisor.h b/llvm/include/llvm/Analysis/InlineAdvisor.h
index c27aaf0db8f2..9f9bc3a5e71b 100644
--- a/llvm/include/llvm/Analysis/InlineAdvisor.h
+++ b/llvm/include/llvm/Analysis/InlineAdvisor.h
@@ -22,6 +22,7 @@ class CallBase;
class Function;
class Module;
class OptimizationRemarkEmitter;
+struct ReplayInlinerSettings;
/// There are 3 scenarios we can use the InlineAdvisor:
/// - Default - use manual heuristics.
@@ -143,7 +144,11 @@ public:
/// be up-to-date wrt previous inlining decisions. \p MandatoryOnly indicates
/// only mandatory (always-inline) call sites should be recommended - this
/// allows the InlineAdvisor to track such inlinings.
- /// Returns an InlineAdvice with the inlining recommendation.
+ /// Returns:
+ /// - An InlineAdvice with the inlining recommendation.
+ /// - Null when no recommendation is made (https://reviews.llvm.org/D110658).
+ /// TODO: Consider removing the Null return scenario by incorporating the
+ /// SampleProfile inliner into an InlineAdvisor.
std::unique_ptr<InlineAdvice> getAdvice(CallBase &CB,
bool MandatoryOnly = false);
@@ -157,6 +162,12 @@ public:
/// to prepare for a partial update.
virtual void onPassExit() {}
+ /// Called when the module is invalidated. We let the advisor implementation
+ /// decide what to refresh - in the case of the development mode
+ /// implementation, for example, we wouldn't want to delete the whole object
+ /// and need to re-load the model evaluator.
+ virtual void onModuleInvalidated() {}
+
protected:
InlineAdvisor(Module &M, FunctionAnalysisManager &FAM);
virtual std::unique_ptr<InlineAdvice> getAdviceImpl(CallBase &CB) = 0;
@@ -219,15 +230,18 @@ public:
InlineAdvisorAnalysis() = default;
struct Result {
Result(Module &M, ModuleAnalysisManager &MAM) : M(M), MAM(MAM) {}
- bool invalidate(Module &, const PreservedAnalyses &,
+ bool invalidate(Module &, const PreservedAnalyses &PA,
ModuleAnalysisManager::Invalidator &) {
- // InlineAdvisor must be preserved across analysis invalidations.
- return false;
+ if (Advisor && !PA.areAllPreserved())
+ Advisor->onModuleInvalidated();
+ // Check whether the analysis has been explicitly invalidated. Otherwise,
+ // it's stateless and remains preserved.
+ auto PAC = PA.getChecker<InlineAdvisorAnalysis>();
+ return !PAC.preservedWhenStateless();
}
bool tryCreate(InlineParams Params, InliningAdvisorMode Mode,
- StringRef ReplayFile);
+ const ReplayInlinerSettings &ReplaySettings);
InlineAdvisor *getAdvisor() const { return Advisor.get(); }
- void clear() { Advisor.reset(); }
private:
Module &M;
@@ -263,12 +277,16 @@ shouldInline(CallBase &CB, function_ref<InlineCost(CallBase &CB)> GetInlineCost,
/// Emit ORE message.
void emitInlinedInto(OptimizationRemarkEmitter &ORE, DebugLoc DLoc,
const BasicBlock *Block, const Function &Callee,
- const Function &Caller, const InlineCost &IC,
- bool ForProfileContext = false,
+ const Function &Caller, bool IsMandatory,
+ function_ref<void(OptimizationRemark &)> ExtraContext = {},
const char *PassName = nullptr);
-/// get call site location as string
-std::string getCallSiteLocation(DebugLoc DLoc);
+/// Emit ORE message based in cost (default heuristic).
+void emitInlinedIntoBasedOnCost(OptimizationRemarkEmitter &ORE, DebugLoc DLoc,
+ const BasicBlock *Block, const Function &Callee,
+ const Function &Caller, const InlineCost &IC,
+ bool ForProfileContext = false,
+ const char *PassName = nullptr);
/// Add location info to ORE message.
void addLocationToRemarks(OptimizationRemark &Remark, DebugLoc DLoc);
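(A minimal consumer-side sketch of the null-return contract documented above; the inlining step itself is elided and the helper name is an example.)

#include "llvm/Analysis/InlineAdvisor.h"
#include <memory>
using namespace llvm;

static bool tryInline(InlineAdvisor &Advisor, CallBase &CB) {
  std::unique_ptr<InlineAdvice> Advice = Advisor.getAdvice(CB);
  if (!Advice) // no recommendation was made for this call site
    return false;
  if (!Advice->isInliningRecommended()) {
    Advice->recordUnattemptedInlining();
    return false;
  }
  // ... perform the actual inlining here ...
  Advice->recordInlining();
  return true;
}
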
diff --git a/llvm/include/llvm/Analysis/InlineCost.h b/llvm/include/llvm/Analysis/InlineCost.h
index 4e1b28d4633f..b22841343b1a 100644
--- a/llvm/include/llvm/Analysis/InlineCost.h
+++ b/llvm/include/llvm/Analysis/InlineCost.h
@@ -213,6 +213,9 @@ struct InlineParams {
/// Indicate whether we should allow inline deferral.
Optional<bool> EnableDeferral = true;
+
+ /// Indicate whether we allow inlining for recursive calls.
+ Optional<bool> AllowRecursiveCall = false;
};
/// Generate the parameters to tune the inline cost analysis based only on the
diff --git a/llvm/include/llvm/Analysis/InlineOrder.h b/llvm/include/llvm/Analysis/InlineOrder.h
new file mode 100644
index 000000000000..def3192356f4
--- /dev/null
+++ b/llvm/include/llvm/Analysis/InlineOrder.h
@@ -0,0 +1,172 @@
+//===- InlineOrder.h - Inlining order abstraction ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+#ifndef LLVM_ANALYSIS_INLINEORDER_H
+#define LLVM_ANALYSIS_INLINEORDER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include <algorithm>
+#include <utility>
+
+namespace llvm {
+class CallBase;
+class Function;
+class Module;
+
+template <typename T> class InlineOrder {
+public:
+ using reference = T &;
+ using const_reference = const T &;
+
+ virtual ~InlineOrder() {}
+
+ virtual size_t size() = 0;
+
+ virtual void push(const T &Elt) = 0;
+
+ virtual T pop() = 0;
+
+ virtual const_reference front() = 0;
+
+ virtual void erase_if(function_ref<bool(T)> Pred) = 0;
+
+ bool empty() { return !size(); }
+};
+
+template <typename T, typename Container = SmallVector<T, 16>>
+class DefaultInlineOrder : public InlineOrder<T> {
+ using reference = T &;
+ using const_reference = const T &;
+
+public:
+ size_t size() override { return Calls.size() - FirstIndex; }
+
+ void push(const T &Elt) override { Calls.push_back(Elt); }
+
+ T pop() override {
+ assert(size() > 0);
+ return Calls[FirstIndex++];
+ }
+
+ const_reference front() override {
+ assert(size() > 0);
+ return Calls[FirstIndex];
+ }
+
+ void erase_if(function_ref<bool(T)> Pred) override {
+ Calls.erase(std::remove_if(Calls.begin() + FirstIndex, Calls.end(), Pred),
+ Calls.end());
+ }
+
+private:
+ Container Calls;
+ size_t FirstIndex = 0;
+};
+
+class InlineSizePriority {
+public:
+ InlineSizePriority(int Size) : Size(Size) {}
+
+ static bool isMoreDesirable(const InlineSizePriority &S1,
+ const InlineSizePriority &S2) {
+ return S1.Size < S2.Size;
+ }
+
+ static InlineSizePriority evaluate(CallBase *CB) {
+ Function *Callee = CB->getCalledFunction();
+ return InlineSizePriority(Callee->getInstructionCount());
+ }
+
+ int Size;
+};
+
+template <typename PriorityT>
+class PriorityInlineOrder : public InlineOrder<std::pair<CallBase *, int>> {
+ using T = std::pair<CallBase *, int>;
+ using HeapT = std::pair<CallBase *, PriorityT>;
+ using reference = T &;
+ using const_reference = const T &;
+
+ static bool cmp(const HeapT &P1, const HeapT &P2) {
+ return PriorityT::isMoreDesirable(P2.second, P1.second);
+ }
+
+ // A call site could become less desirable for inlining because of the size
+ // growth from prior inlining into the callee. This method is used to lazily
+ // update the desirability of a call site if it's decreasing. It is only
+ // called on pop() or front(), not every time the desirability changes. When
+ // the desirability of the front call site decreases, an updated one would be
+ // pushed right back into the heap. For simplicity, those cases where
+ // the desirability of a call site increases are ignored here.
+ void adjust() {
+ bool Changed = false;
+ do {
+ CallBase *CB = Heap.front().first;
+ const PriorityT PreviousGoodness = Heap.front().second;
+ const PriorityT CurrentGoodness = PriorityT::evaluate(CB);
+ Changed = PriorityT::isMoreDesirable(PreviousGoodness, CurrentGoodness);
+ if (Changed) {
+ std::pop_heap(Heap.begin(), Heap.end(), cmp);
+ Heap.pop_back();
+ Heap.push_back({CB, CurrentGoodness});
+ std::push_heap(Heap.begin(), Heap.end(), cmp);
+ }
+ } while (Changed);
+ }
+
+public:
+ size_t size() override { return Heap.size(); }
+
+ void push(const T &Elt) override {
+ CallBase *CB = Elt.first;
+ const int InlineHistoryID = Elt.second;
+ const PriorityT Goodness = PriorityT::evaluate(CB);
+
+ Heap.push_back({CB, Goodness});
+ std::push_heap(Heap.begin(), Heap.end(), cmp);
+ InlineHistoryMap[CB] = InlineHistoryID;
+ }
+
+ T pop() override {
+ assert(size() > 0);
+ adjust();
+
+ CallBase *CB = Heap.front().first;
+ T Result = std::make_pair(CB, InlineHistoryMap[CB]);
+ InlineHistoryMap.erase(CB);
+ std::pop_heap(Heap.begin(), Heap.end(), cmp);
+ Heap.pop_back();
+ return Result;
+ }
+
+ const_reference front() override {
+ assert(size() > 0);
+ adjust();
+
+ CallBase *CB = Heap.front().first;
+ return *InlineHistoryMap.find(CB);
+ }
+
+ void erase_if(function_ref<bool(T)> Pred) override {
+ auto PredWrapper = [=](HeapT P) -> bool {
+ return Pred(std::make_pair(P.first, 0));
+ };
+ llvm::erase_if(Heap, PredWrapper);
+ std::make_heap(Heap.begin(), Heap.end(), cmp);
+ }
+
+private:
+ SmallVector<HeapT, 16> Heap;
+ DenseMap<CallBase *, int> InlineHistoryMap;
+};
+} // namespace llvm
+#endif // LLVM_ANALYSIS_INLINEORDER_H
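(A usage sketch of the queue abstraction above; the worklist contents are assumed to be direct calls, since InlineSizePriority dereferences the callee, and the history id of -1 is just a placeholder.)

#include "llvm/Analysis/InlineOrder.h"
using namespace llvm;

static void drainInOrder(SmallVectorImpl<CallBase *> &Candidates) {
  // Smaller callees are considered more desirable and are popped first.
  PriorityInlineOrder<InlineSizePriority> Calls;
  for (CallBase *CB : Candidates)
    Calls.push({CB, /*InlineHistoryID=*/-1});
  while (!Calls.empty()) {
    std::pair<CallBase *, int> P = Calls.pop();
    // ... attempt to inline P.first, pushing any new call sites back in ...
  }
}
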
diff --git a/llvm/include/llvm/Analysis/InstructionSimplify.h b/llvm/include/llvm/Analysis/InstructionSimplify.h
index efaf1847276b..f0f8e4bc9175 100644
--- a/llvm/include/llvm/Analysis/InstructionSimplify.h
+++ b/llvm/include/llvm/Analysis/InstructionSimplify.h
@@ -248,7 +248,7 @@ Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
const SimplifyQuery &Q);
/// Given operands for a GetElementPtrInst, fold the result or return null.
-Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
+Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops, bool InBounds,
const SimplifyQuery &Q);
/// Given operands for an InsertValueInst, fold the result or return null.
diff --git a/llvm/include/llvm/Analysis/LazyCallGraph.h b/llvm/include/llvm/Analysis/LazyCallGraph.h
index ca276d2f3cf8..0580f4d7b226 100644
--- a/llvm/include/llvm/Analysis/LazyCallGraph.h
+++ b/llvm/include/llvm/Analysis/LazyCallGraph.h
@@ -145,7 +145,7 @@ public:
/// around but clear them.
explicit operator bool() const;
- /// Returnss the \c Kind of the edge.
+ /// Returns the \c Kind of the edge.
Kind getKind() const;
/// Test whether the edge represents a direct call to a function.
@@ -307,9 +307,9 @@ public:
/// A node in the call graph.
///
- /// This represents a single node. It's primary roles are to cache the list of
- /// callees, de-duplicate and provide fast testing of whether a function is
- /// a callee, and facilitate iteration of child nodes in the graph.
+ /// This represents a single node. Its primary roles are to cache the list of
+ /// callees, de-duplicate and provide fast testing of whether a function is a
+ /// callee, and facilitate iteration of child nodes in the graph.
///
/// The node works much like an optional in order to lazily populate the
/// edges of each node. Until populated, there are no edges. Once populated,
@@ -392,7 +392,7 @@ public:
/// Internal helper to directly replace the function with a new one.
///
- /// This is used to facilitate tranfsormations which need to replace the
+ /// This is used to facilitate transformations which need to replace the
/// formal Function object but directly move the body and users from one to
/// the other.
void replaceFunction(Function &NewF);
@@ -419,7 +419,7 @@ public:
/// outer structure. SCCs do not support mutation of the call graph, that
/// must be done through the containing \c RefSCC in order to fully reason
/// about the ordering and connections of the graph.
- class SCC {
+ class LLVM_EXTERNAL_VISIBILITY SCC {
friend class LazyCallGraph;
friend class LazyCallGraph::Node;
@@ -435,7 +435,7 @@ public:
Nodes.clear();
}
- /// Print a short descrtiption useful for debugging or logging.
+ /// Print a short description useful for debugging or logging.
///
/// We print the function names in the SCC wrapped in '()'s and skipping
/// the middle functions if there are a large number.
@@ -467,9 +467,10 @@ public:
/// Verify invariants about the SCC.
///
/// This will attempt to validate all of the basic invariants within an
- /// SCC, but not that it is a strongly connected componet per-se. Primarily
- /// useful while building and updating the graph to check that basic
- /// properties are in place rather than having inexplicable crashes later.
+ /// SCC, but not that it is a strongly connected component per se.
+ /// Primarily useful while building and updating the graph to check that
+ /// basic properties are in place rather than having inexplicable crashes
+ /// later.
void verify();
#endif
@@ -511,7 +512,7 @@ public:
/// Provide a short name by printing this SCC to a std::string.
///
- /// This copes with the fact that we don't have a name per-se for an SCC
+ /// This copes with the fact that we don't have a name per se for an SCC
/// while still making the use of this in debugging and logging useful.
std::string getName() const {
std::string Name;
@@ -644,7 +645,7 @@ public:
/// Provide a short name by printing this RefSCC to a std::string.
///
- /// This copes with the fact that we don't have a name per-se for an RefSCC
+ /// This copes with the fact that we don't have a name per se for a RefSCC
/// while still making the use of this in debugging and logging useful.
std::string getName() const {
std::string Name;
@@ -1085,47 +1086,9 @@ public:
/// updates that set with every constant visited.
///
/// For each defined function, calls \p Callback with that function.
- template <typename CallbackT>
static void visitReferences(SmallVectorImpl<Constant *> &Worklist,
SmallPtrSetImpl<Constant *> &Visited,
- CallbackT Callback) {
- while (!Worklist.empty()) {
- Constant *C = Worklist.pop_back_val();
-
- if (Function *F = dyn_cast<Function>(C)) {
- if (!F->isDeclaration())
- Callback(*F);
- continue;
- }
-
- // The blockaddress constant expression is a weird special case, we can't
- // generically walk its operands the way we do for all other constants.
- if (BlockAddress *BA = dyn_cast<BlockAddress>(C)) {
- // If we've already visited the function referred to by the block
- // address, we don't need to revisit it.
- if (Visited.count(BA->getFunction()))
- continue;
-
- // If all of the blockaddress' users are instructions within the
- // referred to function, we don't need to insert a cycle.
- if (llvm::all_of(BA->users(), [&](User *U) {
- if (Instruction *I = dyn_cast<Instruction>(U))
- return I->getFunction() == BA->getFunction();
- return false;
- }))
- continue;
-
- // Otherwise we should go visit the referred to function.
- Visited.insert(BA->getFunction());
- Worklist.push_back(BA->getFunction());
- continue;
- }
-
- for (Value *Op : C->operand_values())
- if (Visited.insert(cast<Constant>(Op)).second)
- Worklist.push_back(cast<Constant>(Op));
- }
- }
+ function_ref<void(Function &)> Callback);
///@}
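(The walk itself is unchanged; only the callback type becomes a function_ref and the body moves out of line. A sketch of a call site follows; the names are illustrative.)

#include "llvm/Analysis/LazyCallGraph.h"
using namespace llvm;

// Collect every defined function reachable through the constants seeded into
// Worklist, reusing Visited to avoid revisiting constants.
static void collectReferencedFunctions(SmallVectorImpl<Constant *> &Worklist,
                                       SmallPtrSetImpl<Constant *> &Visited,
                                       SmallVectorImpl<Function *> &Out) {
  LazyCallGraph::visitReferences(Worklist, Visited,
                                 [&Out](Function &F) { Out.push_back(&F); });
}
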
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 0a0ef1536caf..2b4edfac61fc 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -177,21 +177,11 @@ public:
/// Register the location (instructions are given increasing numbers)
/// of a write access.
- void addAccess(StoreInst *SI) {
- Value *Ptr = SI->getPointerOperand();
- Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
- InstMap.push_back(SI);
- ++AccessIdx;
- }
+ void addAccess(StoreInst *SI);
/// Register the location (instructions are given increasing numbers)
/// of a write access.
- void addAccess(LoadInst *LI) {
- Value *Ptr = LI->getPointerOperand();
- Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
- InstMap.push_back(LI);
- ++AccessIdx;
- }
+ void addAccess(LoadInst *LI);
/// Check whether the dependencies between the accesses are safe.
///
@@ -664,15 +654,14 @@ Value *stripIntegerCast(Value *V);
/// If necessary this method will version the stride of the pointer according
/// to \p PtrToStride and therefore add further predicates to \p PSE.
///
-/// If \p OrigPtr is not null, use it to look up the stride value instead of \p
-/// Ptr. \p PtrToStride provides the mapping between the pointer value and its
+/// \p PtrToStride provides the mapping between the pointer value and its
/// stride as collected by LoopVectorizationLegality::collectStridedAccess.
const SCEV *replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
const ValueToValueMap &PtrToStride,
- Value *Ptr, Value *OrigPtr = nullptr);
+ Value *Ptr);
-/// If the pointer has a constant stride return it in units of its
-/// element size. Otherwise return zero.
+/// If the pointer has a constant stride return it in units of the access type
+/// size. Otherwise return zero.
///
/// Ensure that it does not wrap in the address space, assuming the predicate
/// associated with \p PSE is true.
@@ -681,7 +670,8 @@ const SCEV *replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
/// to \p PtrToStride and therefore add further predicates to \p PSE.
/// The \p Assume parameter indicates if we are allowed to make additional
/// run-time assumptions.
-int64_t getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp,
+int64_t getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
+ const Loop *Lp,
const ValueToValueMap &StridesMap = ValueToValueMap(),
bool Assume = false, bool ShouldCheckWrap = true);
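(A sketch of the updated query: callers now name the access type explicitly rather than relying on the pointer's pointee type. The helper name and the unit-stride check are illustrative.)

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static bool isUnitStrideLoad(LoadInst *LI, const Loop *L,
                             PredicatedScalarEvolution &PSE) {
  Value *Ptr = LI->getPointerOperand();
  // The returned stride is in units of the access type's size; 0 means the
  // stride is not a known constant.
  return getPtrStride(PSE, LI->getType(), Ptr, L) == 1;
}
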
diff --git a/llvm/include/llvm/Analysis/LoopAnalysisManager.h b/llvm/include/llvm/Analysis/LoopAnalysisManager.h
index 92db1d67fc4e..bc8a1e74e447 100644
--- a/llvm/include/llvm/Analysis/LoopAnalysisManager.h
+++ b/llvm/include/llvm/Analysis/LoopAnalysisManager.h
@@ -58,6 +58,7 @@ struct LoopStandardAnalysisResults {
TargetLibraryInfo &TLI;
TargetTransformInfo &TTI;
BlockFrequencyInfo *BFI;
+ BranchProbabilityInfo *BPI;
MemorySSA *MSSA;
};
diff --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h
index 164ec50e47bc..15c9d911ab80 100644
--- a/llvm/include/llvm/Analysis/LoopInfo.h
+++ b/llvm/include/llvm/Analysis/LoopInfo.h
@@ -527,7 +527,7 @@ extern template class LoopBase<BasicBlock, Loop>;
/// Represents a single loop in the control flow graph. Note that not all SCCs
/// in the CFG are necessarily loops.
-class Loop : public LoopBase<BasicBlock, Loop> {
+class LLVM_EXTERNAL_VISIBILITY Loop : public LoopBase<BasicBlock, Loop> {
public:
/// A range representing the start and end location of a loop.
class LocRange {
@@ -950,7 +950,7 @@ public:
///
/// Note that because loops form a forest of trees, preorder is equivalent to
/// reverse postorder.
- SmallVector<LoopT *, 4> getLoopsInPreorder();
+ SmallVector<LoopT *, 4> getLoopsInPreorder() const;
/// Return all of the loops in the function in preorder across the loop
/// nests, with siblings in *reverse* program order.
@@ -960,7 +960,7 @@ public:
///
/// Also note that this is *not* a reverse preorder. Only the siblings are in
/// reverse program order.
- SmallVector<LoopT *, 4> getLoopsInReverseSiblingPreorder();
+ SmallVector<LoopT *, 4> getLoopsInReverseSiblingPreorder() const;
/// Return the inner most loop that BB lives in. If a basic block is in no
/// loop (for example the entry node), null is returned.
@@ -1213,6 +1213,13 @@ public:
};
+/// Enable verification of loop info.
+///
+/// The flag enables checks which are expensive and are disabled by default
+/// unless the `EXPENSIVE_CHECKS` macro is defined. The `-verify-loop-info`
+/// flag allows the checks to be enabled selectively without re-compilation.
+extern bool VerifyLoopInfo;
+
// Allow clients to walk the list of nested loops...
template <> struct GraphTraits<const Loop *> {
typedef const Loop *NodeRef;
@@ -1305,6 +1312,10 @@ bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name);
llvm::Optional<int>
getOptionalIntLoopAttribute(const Loop *TheLoop, StringRef Name);
+/// Find named metadata for a loop with an integer value. Return \p Default if
+/// not set.
+int getIntLoopAttribute(const Loop *TheLoop, StringRef Name, int Default = 0);
+
/// Find string metadata for loop
///
/// If it has a value (e.g. {"llvm.distribute", 1} return the value as an
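(A short sketch of the new integer helper; the metadata name is only an example of a loop attribute a client might query.)

#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;

// Returns the requested unroll count, or 0 when the loop carries no
// "llvm.loop.unroll.count" metadata.
static int getUnrollCountHint(const Loop *L) {
  return getIntLoopAttribute(L, "llvm.loop.unroll.count", /*Default=*/0);
}
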
diff --git a/llvm/include/llvm/Analysis/LoopInfoImpl.h b/llvm/include/llvm/Analysis/LoopInfoImpl.h
index 2cc9afb7c2cd..b8b8330d0fe1 100644
--- a/llvm/include/llvm/Analysis/LoopInfoImpl.h
+++ b/llvm/include/llvm/Analysis/LoopInfoImpl.h
@@ -574,7 +574,8 @@ void LoopInfoBase<BlockT, LoopT>::analyze(const DomTreeBase<BlockT> &DomTree) {
}
template <class BlockT, class LoopT>
-SmallVector<LoopT *, 4> LoopInfoBase<BlockT, LoopT>::getLoopsInPreorder() {
+SmallVector<LoopT *, 4>
+LoopInfoBase<BlockT, LoopT>::getLoopsInPreorder() const {
SmallVector<LoopT *, 4> PreOrderLoops, PreOrderWorklist;
// The outer-most loop actually goes into the result in the same relative
// order as we walk it. But LoopInfo stores the top level loops in reverse
@@ -592,7 +593,7 @@ SmallVector<LoopT *, 4> LoopInfoBase<BlockT, LoopT>::getLoopsInPreorder() {
template <class BlockT, class LoopT>
SmallVector<LoopT *, 4>
-LoopInfoBase<BlockT, LoopT>::getLoopsInReverseSiblingPreorder() {
+LoopInfoBase<BlockT, LoopT>::getLoopsInReverseSiblingPreorder() const {
SmallVector<LoopT *, 4> PreOrderLoops, PreOrderWorklist;
// The outer-most loop actually goes into the result in the same relative
// order as we walk it. LoopInfo stores the top level loops in reverse
diff --git a/llvm/include/llvm/Analysis/LoopNestAnalysis.h b/llvm/include/llvm/Analysis/LoopNestAnalysis.h
index 9a749a1c8eae..3d4a064cf7e3 100644
--- a/llvm/include/llvm/Analysis/LoopNestAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopNestAnalysis.h
@@ -21,11 +21,14 @@
namespace llvm {
using LoopVectorTy = SmallVector<Loop *, 8>;
+
class LPMUpdater;
/// This class represents a loop nest and can be used to query its properties.
-class LoopNest {
+class LLVM_EXTERNAL_VISIBILITY LoopNest {
public:
+ using InstrVectorTy = SmallVector<const Instruction *>;
+
/// Construct a loop nest rooted by loop \p Root.
LoopNest(Loop &Root, ScalarEvolution &SE);
@@ -48,6 +51,12 @@ public:
static bool arePerfectlyNested(const Loop &OuterLoop, const Loop &InnerLoop,
ScalarEvolution &SE);
+ /// Return a vector of instructions that prevent the LoopNest given
+ /// by loops \p OuterLoop and \p InnerLoop from being perfect.
+ static InstrVectorTy getInterveningInstructions(const Loop &OuterLoop,
+ const Loop &InnerLoop,
+ ScalarEvolution &SE);
+
/// Return the maximum nesting depth of the loop nest rooted by loop \p Root.
/// For example given the loop nest:
/// \code
@@ -150,6 +159,17 @@ public:
protected:
const unsigned MaxPerfectDepth; // maximum perfect nesting depth level.
LoopVectorTy Loops; // the loops in the nest (in breadth first order).
+
+private:
+ enum LoopNestEnum {
+ PerfectLoopNest,
+ ImperfectLoopNest,
+ InvalidLoopStructure,
+ OuterLoopLowerBoundUnknown
+ };
+ static LoopNestEnum analyzeLoopNestForPerfectNest(const Loop &OuterLoop,
+ const Loop &InnerLoop,
+ ScalarEvolution &SE);
};
raw_ostream &operator<<(raw_ostream &, const LoopNest &);
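(A hedged sketch combining the existing arePerfectlyNested query with the new getInterveningInstructions helper to explain why a two-deep nest is imperfect; the function name is an example.)

#include "llvm/Analysis/LoopNestAnalysis.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static void explainImperfection(const Loop &OuterLoop, const Loop &InnerLoop,
                                ScalarEvolution &SE, raw_ostream &OS) {
  if (LoopNest::arePerfectlyNested(OuterLoop, InnerLoop, SE))
    return;
  // Each reported instruction sits between the two loops and blocks perfect
  // nesting.
  for (const Instruction *I :
       LoopNest::getInterveningInstructions(OuterLoop, InnerLoop, SE))
    OS << *I << "\n";
}
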
diff --git a/llvm/include/llvm/Analysis/MLInlineAdvisor.h b/llvm/include/llvm/Analysis/MLInlineAdvisor.h
index 54edbb823263..a218561e61c7 100644
--- a/llvm/include/llvm/Analysis/MLInlineAdvisor.h
+++ b/llvm/include/llvm/Analysis/MLInlineAdvisor.h
@@ -38,6 +38,7 @@ public:
bool isForcedToStop() const { return ForceStop; }
int64_t getLocalCalls(Function &F);
const MLModelRunner &getModelRunner() const { return *ModelRunner.get(); }
+ void onModuleInvalidated() override { Invalid = true; }
protected:
std::unique_ptr<InlineAdvice> getAdviceImpl(CallBase &CB) override;
@@ -55,6 +56,7 @@ protected:
private:
int64_t getModuleIRSize() const;
+ bool Invalid = true;
std::unique_ptr<CallGraph> CG;
int64_t NodeCount = 0;
diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h
index f40b99968fd3..48aeef371e3d 100644
--- a/llvm/include/llvm/Analysis/MemorySSA.h
+++ b/llvm/include/llvm/Analysis/MemorySSA.h
@@ -106,9 +106,6 @@
namespace llvm {
-/// Enables memory ssa as a dependency for loop passes.
-extern cl::opt<bool> EnableMSSALoopDependency;
-
class AllocaInst;
class Function;
class Instruction;
@@ -786,21 +783,22 @@ public:
/// dominates Use \p B.
bool dominates(const MemoryAccess *A, const Use &B) const;
+ enum class VerificationLevel { Fast, Full };
/// Verify that MemorySSA is self consistent (IE definitions dominate
/// all uses, uses appear in the right places). This is used by unit tests.
- void verifyMemorySSA() const;
+ void verifyMemorySSA(VerificationLevel = VerificationLevel::Fast) const;
/// Used in various insertion functions to specify whether we are talking
/// about the beginning or end of a block.
enum InsertionPlace { Beginning, End, BeforeTerminator };
protected:
- // Used by Memory SSA annotater, dumpers, and wrapper pass
- friend class MemorySSAAnnotatedWriter;
+ // Used by Memory SSA dumpers and wrapper pass
friend class MemorySSAPrinterLegacyPass;
friend class MemorySSAUpdater;
- void verifyOrderingDominationAndDefUses(Function &F) const;
+ void verifyOrderingDominationAndDefUses(
+ Function &F, VerificationLevel = VerificationLevel::Fast) const;
void verifyDominationNumbers(const Function &F) const;
void verifyPrevDefInPhis(Function &F) const;
@@ -898,6 +896,13 @@ private:
unsigned NextID;
};
+/// Enables verification of MemorySSA.
+///
+/// The checks which this flag enables are expensive and disabled by default
+/// unless `EXPENSIVE_CHECKS` is defined. The flag `-verify-memoryssa` can be
+/// used to selectively enable the verification without re-compilation.
+extern bool VerifyMemorySSA;
+
// Internal MemorySSA utils, for use by MemorySSA classes and walkers
class MemorySSAUtil {
protected:
@@ -956,6 +961,17 @@ public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
+/// Printer pass for \c MemorySSA via the walker.
+class MemorySSAWalkerPrinterPass
+ : public PassInfoMixin<MemorySSAWalkerPrinterPass> {
+ raw_ostream &OS;
+
+public:
+ explicit MemorySSAWalkerPrinterPass(raw_ostream &OS) : OS(OS) {}
+
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
/// Verifier pass for \c MemorySSA.
struct MemorySSAVerifierPass : PassInfoMixin<MemorySSAVerifierPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
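(A sketch of the two verification levels; how the exhaustive check is gated here is only an example, not the in-tree policy.)

#include "llvm/Analysis/MemorySSA.h"
using namespace llvm;

static void checkMemorySSA(MemorySSA &MSSA) {
  // The cheap walk is the default; the exhaustive one is opt-in.
  MSSA.verifyMemorySSA(); // VerificationLevel::Fast
  if (VerifyMemorySSA)    // -verify-memoryssa or an EXPENSIVE_CHECKS build
    MSSA.verifyMemorySSA(MemorySSA::VerificationLevel::Full);
}
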
diff --git a/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h b/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h
index 62bdade95d96..17062ab907a6 100644
--- a/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h
+++ b/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h
@@ -78,14 +78,17 @@ inline const Value *GetUnderlyingObjCPtr(const Value *V) {
}
/// A wrapper for GetUnderlyingObjCPtr used for results memoization.
-inline const Value *
-GetUnderlyingObjCPtrCached(const Value *V,
- DenseMap<const Value *, WeakTrackingVH> &Cache) {
- if (auto InCache = Cache.lookup(V))
- return InCache;
+inline const Value *GetUnderlyingObjCPtrCached(
+ const Value *V,
+ DenseMap<const Value *, std::pair<WeakVH, WeakTrackingVH>> &Cache) {
+ // The entry is invalid if either value handle is null.
+ auto InCache = Cache.lookup(V);
+ if (InCache.first && InCache.second)
+ return InCache.second;
const Value *Computed = GetUnderlyingObjCPtr(V);
- Cache[V] = const_cast<Value *>(Computed);
+ Cache[V] =
+ std::make_pair(const_cast<Value *>(V), const_cast<Value *>(Computed));
return Computed;
}
@@ -168,8 +171,8 @@ bool IsPotentialRetainableObjPtr(const Value *Op, AAResults &AA);
/// Helper for GetARCInstKind. Determines what kind of construct CS
/// is.
inline ARCInstKind GetCallSiteClass(const CallBase &CB) {
- for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I)
- if (IsPotentialRetainableObjPtr(*I))
+ for (const Use &U : CB.args())
+ if (IsPotentialRetainableObjPtr(U))
return CB.onlyReadsMemory() ? ARCInstKind::User : ARCInstKind::CallOrUser;
return CB.onlyReadsMemory() ? ARCInstKind::None : ARCInstKind::Call;
@@ -204,11 +207,10 @@ inline bool IsObjCIdentifiedObject(const Value *V) {
return true;
StringRef Section = GV->getSection();
- if (Section.find("__message_refs") != StringRef::npos ||
- Section.find("__objc_classrefs") != StringRef::npos ||
- Section.find("__objc_superrefs") != StringRef::npos ||
- Section.find("__objc_methname") != StringRef::npos ||
- Section.find("__cstring") != StringRef::npos)
+ if (Section.contains("__message_refs") ||
+ Section.contains("__objc_classrefs") ||
+ Section.contains("__objc_superrefs") ||
+ Section.contains("__objc_methname") || Section.contains("__cstring"))
return true;
}
}
diff --git a/llvm/include/llvm/Analysis/ObjCARCUtil.h b/llvm/include/llvm/Analysis/ObjCARCUtil.h
index 2566bfbcf61c..362dd6c29992 100644
--- a/llvm/include/llvm/Analysis/ObjCARCUtil.h
+++ b/llvm/include/llvm/Analysis/ObjCARCUtil.h
@@ -11,9 +11,11 @@
///
//===----------------------------------------------------------------------===//
-#ifndef LLVM_IR_OBJCARCUTIL_H
-#define LLVM_IR_OBJCARCUTIL_H
+#ifndef LLVM_ANALYSIS_OBJCARCUTIL_H
+#define LLVM_ANALYSIS_OBJCARCUTIL_H
+#include "llvm/Analysis/ObjCARCInstKind.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/LLVMContext.h"
@@ -24,13 +26,6 @@ inline const char *getRVMarkerModuleFlagStr() {
return "clang.arc.retainAutoreleasedReturnValueMarker";
}
-enum AttachedCallOperandBundle : unsigned { RVOB_Retain, RVOB_Claim };
-
-inline AttachedCallOperandBundle
-getAttachedCallOperandBundleEnum(bool IsRetain) {
- return IsRetain ? RVOB_Retain : RVOB_Claim;
-}
-
inline bool hasAttachedCallOpBundle(const CallBase *CB) {
// Ignore the bundle if the return type is void. Global optimization passes
// can turn the called function's return type to void. That should happen only
@@ -43,14 +38,32 @@ inline bool hasAttachedCallOpBundle(const CallBase *CB) {
.hasValue();
}
-inline bool hasAttachedCallOpBundle(const CallBase *CB, bool IsRetain) {
- assert(hasAttachedCallOpBundle(CB) &&
- "call doesn't have operand bundle clang_arc_attachedcall");
+/// This function returns operand bundle clang_arc_attachedcall's argument,
+/// which is the address of the ARC runtime function.
+inline Optional<Function *> getAttachedARCFunction(const CallBase *CB) {
auto B = CB->getOperandBundle(LLVMContext::OB_clang_arc_attachedcall);
- if (!B.hasValue())
- return false;
- return cast<ConstantInt>(B->Inputs[0])->getZExtValue() ==
- getAttachedCallOperandBundleEnum(IsRetain);
+ if (!B.hasValue() || B->Inputs.size() == 0)
+ return None;
+
+ return cast<Function>(B->Inputs[0]);
+}
+
+/// Check whether the function is retainRV/claimRV.
+inline bool isRetainOrClaimRV(ARCInstKind Kind) {
+ return Kind == ARCInstKind::RetainRV || Kind == ARCInstKind::ClaimRV;
+}
+
+/// This function returns the ARCInstKind of the function attached to operand
+/// bundle clang_arc_attachedcall. It returns None if the call doesn't have the
+/// operand bundle or the operand is null. Otherwise it returns either RetainRV
+/// or ClaimRV.
+inline ARCInstKind getAttachedARCFunctionKind(const CallBase *CB) {
+ Optional<Function *> Fn = getAttachedARCFunction(CB);
+ if (!Fn.hasValue())
+ return ARCInstKind::None;
+ auto FnClass = GetFunctionClass(*Fn);
+ assert(isRetainOrClaimRV(FnClass) && "unexpected ARC runtime function");
+ return FnClass;
}
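A small usage sketch for the helpers above (CB is an assumed `const CallBase *`; illustrative only):

    // Classify the runtime function carried by the clang_arc_attachedcall bundle, if any.
    ARCInstKind Kind = getAttachedARCFunctionKind(CB);
    if (Kind != ARCInstKind::None) {
      bool IsRetain = (Kind == ARCInstKind::RetainRV);   // otherwise it is ClaimRV
      (void)IsRetain;
    }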
} // end namespace objcarc
diff --git a/llvm/include/llvm/Analysis/ProfileSummaryInfo.h b/llvm/include/llvm/Analysis/ProfileSummaryInfo.h
index c95404d96f4e..886800d8a0f5 100644
--- a/llvm/include/llvm/Analysis/ProfileSummaryInfo.h
+++ b/llvm/include/llvm/Analysis/ProfileSummaryInfo.h
@@ -134,9 +134,13 @@ public:
bool isColdCount(uint64_t C) const;
/// Returns true if count \p C is considered hot with regard to a given
/// hot percentile cutoff value.
+ /// PercentileCutoff is encoded as a 6 digit decimal fixed point number, where
+ /// the first two digits are the whole part. E.g. 995000 for 99.5 percentile.
bool isHotCountNthPercentile(int PercentileCutoff, uint64_t C) const;
/// Returns true if count \p C is considered cold with regard to a given
/// cold percentile cutoff value.
+ /// PercentileCutoff is encoded as a 6 digit decimal fixed point number, where
+ /// the first two digits are the whole part. E.g. 995000 for 99.5 percentile.
bool isColdCountNthPercentile(int PercentileCutoff, uint64_t C) const;
/// Returns true if BasicBlock \p BB is considered hot.
bool isHotBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) const;
@@ -144,10 +148,14 @@ public:
bool isColdBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) const;
/// Returns true if BasicBlock \p BB is considered hot with regard to a given
/// hot percentile cutoff value.
+ /// PercentileCutoff is encoded as a 6 digit decimal fixed point number, where
+ /// the first two digits are the whole part. E.g. 995000 for 99.5 percentile.
bool isHotBlockNthPercentile(int PercentileCutoff, const BasicBlock *BB,
BlockFrequencyInfo *BFI) const;
/// Returns true if BasicBlock \p BB is considered cold with regard to a given
/// cold percentile cutoff value.
+ /// PercentileCutoff is encoded as a 6 digit decimal fixed point number, where
+ /// the first two digits are the whole part. E.g. 995000 for 99.5 percentile.
bool isColdBlockNthPercentile(int PercentileCutoff, const BasicBlock *BB,
BlockFrequencyInfo *BFI) const;
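The cutoff encoding described in the comments above can be illustrated with a small hypothetical helper (not part of the API): multiply the percentile by 10000 to get the 6-digit fixed-point value.

    // 99.5 -> 995000, 90 -> 900000, 99.9999 -> 999999
    static int encodePercentileCutoff(double Percent) {
      return static_cast<int>(Percent * 10000.0);
    }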
/// Returns true if the call site \p CB is considered hot.
@@ -162,11 +170,11 @@ public:
uint64_t getOrCompColdCountThreshold() const;
/// Returns HotCountThreshold if set.
uint64_t getHotCountThreshold() const {
- return HotCountThreshold ? HotCountThreshold.getValue() : 0;
+ return HotCountThreshold.getValueOr(0);
}
/// Returns ColdCountThreshold if set.
uint64_t getColdCountThreshold() const {
- return ColdCountThreshold ? ColdCountThreshold.getValue() : 0;
+ return ColdCountThreshold.getValueOr(0);
}
private:
diff --git a/llvm/include/llvm/Analysis/ReplayInlineAdvisor.h b/llvm/include/llvm/Analysis/ReplayInlineAdvisor.h
index 3018bcc241d8..a0eb9af62205 100644
--- a/llvm/include/llvm/Analysis/ReplayInlineAdvisor.h
+++ b/llvm/include/llvm/Analysis/ReplayInlineAdvisor.h
@@ -20,6 +20,46 @@ class Function;
class Module;
class OptimizationRemarkEmitter;
+struct CallSiteFormat {
+ enum class Format : int {
+ Line,
+ LineColumn,
+ LineDiscriminator,
+ LineColumnDiscriminator
+ };
+
+ bool outputColumn() const {
+ return OutputFormat == Format::LineColumn ||
+ OutputFormat == Format::LineColumnDiscriminator;
+ }
+
+ bool outputDiscriminator() const {
+ return OutputFormat == Format::LineDiscriminator ||
+ OutputFormat == Format::LineColumnDiscriminator;
+ }
+
+ Format OutputFormat;
+};
+
+/// Replay Inliner Setup
+struct ReplayInlinerSettings {
+ enum class Scope : int { Function, Module };
+ enum class Fallback : int { Original, AlwaysInline, NeverInline };
+
+ StringRef ReplayFile;
+ Scope ReplayScope;
+ Fallback ReplayFallback;
+ CallSiteFormat ReplayFormat;
+};
+
+/// Get call site location as a string with the given format
+std::string formatCallSiteLocation(DebugLoc DLoc, const CallSiteFormat &Format);
+
+std::unique_ptr<InlineAdvisor> getReplayInlineAdvisor(
+ Module &M, FunctionAnalysisManager &FAM, LLVMContext &Context,
+ std::unique_ptr<InlineAdvisor> OriginalAdvisor,
+ const ReplayInlinerSettings &ReplaySettings, bool EmitRemarks);
+
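A hedged setup sketch for the replay machinery declared above; the file name is a placeholder and the chosen enum values are arbitrary:

    ReplayInlinerSettings Settings;
    Settings.ReplayFile = "inline-remarks.yaml";                      // placeholder path
    Settings.ReplayScope = ReplayInlinerSettings::Scope::Function;    // replay only annotated callers
    Settings.ReplayFallback = ReplayInlinerSettings::Fallback::Original;
    Settings.ReplayFormat.OutputFormat = CallSiteFormat::Format::LineColumn;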
/// Replay inline advisor that uses optimization remarks from inlining of
/// previous build to guide current inlining. This is useful for inliner tuning.
class ReplayInlineAdvisor : public InlineAdvisor {
@@ -27,15 +67,24 @@ public:
ReplayInlineAdvisor(Module &M, FunctionAnalysisManager &FAM,
LLVMContext &Context,
std::unique_ptr<InlineAdvisor> OriginalAdvisor,
- StringRef RemarksFile, bool EmitRemarks);
+ const ReplayInlinerSettings &ReplaySettings,
+ bool EmitRemarks);
std::unique_ptr<InlineAdvice> getAdviceImpl(CallBase &CB) override;
bool areReplayRemarksLoaded() const { return HasReplayRemarks; }
private:
- StringSet<> InlineSitesFromRemarks;
+ bool hasInlineAdvice(Function &F) const {
+ return (ReplaySettings.ReplayScope ==
+ ReplayInlinerSettings::Scope::Module) ||
+ CallersToReplay.contains(F.getName());
+ }
std::unique_ptr<InlineAdvisor> OriginalAdvisor;
bool HasReplayRemarks = false;
+ const ReplayInlinerSettings ReplaySettings;
bool EmitRemarks = false;
+
+ StringMap<bool> InlineSitesFromRemarks;
+ StringSet<> CallersToReplay;
};
} // namespace llvm
#endif // LLVM_ANALYSIS_REPLAYINLINEADVISOR_H
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index ae9c73fede96..a2260688e3d6 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -25,7 +25,6 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/FoldingSet.h"
-#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/SetVector.h"
@@ -112,6 +111,24 @@ public:
/// Note that NUW and NSW are also valid properties of a recurrence, and
/// either implies NW. For convenience, NW will be set for a recurrence
/// whenever either NUW or NSW are set.
+ ///
+ /// We require that the flag on a SCEV apply to the entire scope in which
+  /// that SCEV is defined. A SCEV's scope is the set of locations dominated by
+ /// a defining location, which is in turn described by the following rules:
+ /// * A SCEVUnknown is at the point of definition of the Value.
+ /// * A SCEVConstant is defined at all points.
+ /// * A SCEVAddRec is defined starting with the header of the associated
+ /// loop.
+  ///   * All other SCEVs are defined at the earliest point all operands are
+ /// defined.
+ ///
+ /// The above rules describe a maximally hoisted form (without regards to
+ /// potential control dependence). A SCEV is defined anywhere a
+ /// corresponding instruction could be defined in said maximally hoisted
+ /// form. Note that SCEVUDivExpr (currently the only expression type which
+ /// can trap) can be defined per these rules in regions where it would trap
+ /// at runtime. A SCEV being defined does not require the existence of any
+ /// instruction within the defined scope.
enum NoWrapFlags {
FlagAnyWrap = 0, // No guarantee.
FlagNW = (1 << 0), // No self-wrap.
@@ -472,6 +489,10 @@ public:
clearFlags(SCEV::NoWrapFlags Flags, SCEV::NoWrapFlags OffFlags) {
return (SCEV::NoWrapFlags)(Flags & ~OffFlags);
}
+ LLVM_NODISCARD static bool hasFlags(SCEV::NoWrapFlags Flags,
+ SCEV::NoWrapFlags TestFlags) {
+ return TestFlags == maskFlags(Flags, TestFlags);
+  }
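A brief usage sketch of the new hasFlags helper (AR is an assumed `const SCEVAddRecExpr *`):

    if (ScalarEvolution::hasFlags(AR->getNoWrapFlags(), SCEV::FlagNUW)) {
      // Every flag in the test mask (here just NUW) is guaranteed to be set.
    }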
ScalarEvolution(Function &F, TargetLibraryInfo &TLI, AssumptionCache &AC,
DominatorTree &DT, LoopInfo &LI);
@@ -498,13 +519,26 @@ public:
// Returns a wider type among {Ty1, Ty2}.
Type *getWiderType(Type *Ty1, Type *Ty2) const;
+ /// Return true if there exists a point in the program at which both
+ /// A and B could be operands to the same instruction.
+ /// SCEV expressions are generally assumed to correspond to instructions
+  /// which could exist in IR.  In general, this requires that there exists
+ /// a use point in the program where all operands dominate the use.
+ ///
+ /// Example:
+ /// loop {
+ /// if
+ /// loop { v1 = load @global1; }
+ /// else
+ /// loop { v2 = load @global2; }
+ /// }
+  ///  No SCEV with operands v1 and v2 can exist in this program.
+ bool instructionCouldExistWitthOperands(const SCEV *A, const SCEV *B);
+
/// Return true if the SCEV is a scAddRecExpr or it contains
/// scAddRecExpr. The result will be cached in HasRecMap.
bool containsAddRecurrence(const SCEV *S);
- /// Erase Value from ValueExprMap and ExprValueMap.
- void eraseValueFromMap(Value *V);
-
/// Is operation \p BinOp between \p LHS and \p RHS provably does not have
/// a signed/unsigned overflow (\p Signed)?
bool willNotOverflow(Instruction::BinaryOps BinOp, bool Signed,
@@ -516,6 +550,12 @@ public:
std::pair<SCEV::NoWrapFlags, bool /*Deduced*/>
getStrengthenedNoWrapFlagsFromBinOp(const OverflowingBinaryOperator *OBO);
+ /// Notify this ScalarEvolution that \p User directly uses SCEVs in \p Ops.
+ void registerUser(const SCEV *User, ArrayRef<const SCEV *> Ops);
+
+ /// Return true if the SCEV expression contains an undef value.
+ bool containsUndefs(const SCEV *S) const;
+
/// Return a SCEV expression for the full generality of the specified
/// expression.
const SCEV *getSCEV(Value *V);
@@ -700,6 +740,9 @@ public:
/// cases do exist.
const SCEV *getPointerBase(const SCEV *V);
+ /// Compute an expression equivalent to S - getPointerBase(S).
+ const SCEV *removePointerBase(const SCEV *S);
+
/// Return a SCEV expression for the specified value at the specified scope
/// in the program. The L value specifies a loop nest to evaluate the
/// expression at, where null is the top-level or a specified loop is
@@ -735,9 +778,13 @@ public:
/// Convert from an "exit count" (i.e. "backedge taken count") to a "trip
/// count". A "trip count" is the number of times the header of the loop
/// will execute if an exit is taken after the specified number of backedges
- /// have been taken. (e.g. TripCount = ExitCount + 1) A zero result
- /// must be interpreted as a loop having an unknown trip count.
- const SCEV *getTripCountFromExitCount(const SCEV *ExitCount);
+ /// have been taken. (e.g. TripCount = ExitCount + 1). Note that the
+ /// expression can overflow if ExitCount = UINT_MAX. \p Extend controls
+ /// how potential overflow is handled. If true, a wider result type is
+  /// returned (ex: EC = 255 (i8), TC = 256 (i9)).  If false, the result
+  /// wraps with 2s-complement semantics (ex: EC = 255 (i8), TC = 0 (i8)).
+ const SCEV *getTripCountFromExitCount(const SCEV *ExitCount,
+ bool Extend = true);
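To make the overflow remark above concrete, a hedged APInt illustration of the two behaviours:

    APInt EC(8, 255);                   // ExitCount in i8
    APInt WrappedTC = EC + 1;           // Extend = false: wraps around to 0 in i8
    APInt ExtendedTC = EC.zext(9) + 1;  // Extend = true: 256 fits in i9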
/// Returns the exact trip count of the loop if we can compute it, and
/// the result is a small constant. '0' is used to represent an unknown
@@ -762,6 +809,13 @@ public:
/// Returns 0 if the trip count is unknown or not constant.
unsigned getSmallConstantMaxTripCount(const Loop *L);
+  /// Returns the upper bound of the loop trip count inferred from array size.
+  /// The bound follows from the fact that accessing bytes outside the
+  /// statically allocated size of the array is immediate UB.
+  /// Returns SCEVCouldNotCompute if the trip count could not be inferred
+  /// from array accesses.
+ const SCEV *getConstantMaxTripCountFromArray(const Loop *L);
+
/// Returns the largest constant divisor of the trip count as a normal
/// unsigned value, if possible. This means that the actual trip count is
/// always a multiple of the returned value. Returns 1 if the trip count is
@@ -988,14 +1042,13 @@ public:
/// Test if the given expression is known to satisfy the condition described
/// by Pred, LHS, and RHS in the given Context.
bool isKnownPredicateAt(ICmpInst::Predicate Pred, const SCEV *LHS,
- const SCEV *RHS, const Instruction *Context);
+ const SCEV *RHS, const Instruction *CtxI);
/// Check whether the condition described by Pred, LHS, and RHS is true or
/// false in the given \p Context. If we know it, return the evaluation of
/// this condition. If neither is proved, return None.
Optional<bool> evaluatePredicateAt(ICmpInst::Predicate Pred, const SCEV *LHS,
- const SCEV *RHS,
- const Instruction *Context);
+ const SCEV *RHS, const Instruction *CtxI);
/// Test if the condition described by Pred, LHS, RHS is known to be true on
/// every iteration of the loop of the recurrency LHS.
@@ -1045,7 +1098,7 @@ public:
getLoopInvariantExitCondDuringFirstIterations(ICmpInst::Predicate Pred,
const SCEV *LHS,
const SCEV *RHS, const Loop *L,
- const Instruction *Context,
+ const Instruction *CtxI,
const SCEV *MaxIter);
/// Simplify LHS and RHS in a comparison with predicate Pred. Return true
@@ -1092,110 +1145,11 @@ public:
/// Return the size of an element read or written by Inst.
const SCEV *getElementSize(Instruction *Inst);
- /// Compute the array dimensions Sizes from the set of Terms extracted from
- /// the memory access function of this SCEVAddRecExpr (second step of
- /// delinearization).
- void findArrayDimensions(SmallVectorImpl<const SCEV *> &Terms,
- SmallVectorImpl<const SCEV *> &Sizes,
- const SCEV *ElementSize);
-
void print(raw_ostream &OS) const;
void verify() const;
bool invalidate(Function &F, const PreservedAnalyses &PA,
FunctionAnalysisManager::Invalidator &Inv);
- /// Collect parametric terms occurring in step expressions (first step of
- /// delinearization).
- void collectParametricTerms(const SCEV *Expr,
- SmallVectorImpl<const SCEV *> &Terms);
-
- /// Return in Subscripts the access functions for each dimension in Sizes
- /// (third step of delinearization).
- void computeAccessFunctions(const SCEV *Expr,
- SmallVectorImpl<const SCEV *> &Subscripts,
- SmallVectorImpl<const SCEV *> &Sizes);
-
- /// Gathers the individual index expressions from a GEP instruction.
- ///
- /// This function optimistically assumes the GEP references into a fixed size
- /// array. If this is actually true, this function returns a list of array
- /// subscript expressions in \p Subscripts and a list of integers describing
- /// the size of the individual array dimensions in \p Sizes. Both lists have
- /// either equal length or the size list is one element shorter in case there
- /// is no known size available for the outermost array dimension. Returns true
- /// if successful and false otherwise.
- bool getIndexExpressionsFromGEP(const GetElementPtrInst *GEP,
- SmallVectorImpl<const SCEV *> &Subscripts,
- SmallVectorImpl<int> &Sizes);
-
- /// Split this SCEVAddRecExpr into two vectors of SCEVs representing the
- /// subscripts and sizes of an array access.
- ///
- /// The delinearization is a 3 step process: the first two steps compute the
- /// sizes of each subscript and the third step computes the access functions
- /// for the delinearized array:
- ///
- /// 1. Find the terms in the step functions
- /// 2. Compute the array size
- /// 3. Compute the access function: divide the SCEV by the array size
- /// starting with the innermost dimensions found in step 2. The Quotient
- /// is the SCEV to be divided in the next step of the recursion. The
- /// Remainder is the subscript of the innermost dimension. Loop over all
- /// array dimensions computed in step 2.
- ///
- /// To compute a uniform array size for several memory accesses to the same
- /// object, one can collect in step 1 all the step terms for all the memory
- /// accesses, and compute in step 2 a unique array shape. This guarantees
- /// that the array shape will be the same across all memory accesses.
- ///
- /// FIXME: We could derive the result of steps 1 and 2 from a description of
- /// the array shape given in metadata.
- ///
- /// Example:
- ///
- /// A[][n][m]
- ///
- /// for i
- /// for j
- /// for k
- /// A[j+k][2i][5i] =
- ///
- /// The initial SCEV:
- ///
- /// A[{{{0,+,2*m+5}_i, +, n*m}_j, +, n*m}_k]
- ///
- /// 1. Find the different terms in the step functions:
- /// -> [2*m, 5, n*m, n*m]
- ///
- /// 2. Compute the array size: sort and unique them
- /// -> [n*m, 2*m, 5]
- /// find the GCD of all the terms = 1
- /// divide by the GCD and erase constant terms
- /// -> [n*m, 2*m]
- /// GCD = m
- /// divide by GCD -> [n, 2]
- /// remove constant terms
- /// -> [n]
- /// size of the array is A[unknown][n][m]
- ///
- /// 3. Compute the access function
- /// a. Divide {{{0,+,2*m+5}_i, +, n*m}_j, +, n*m}_k by the innermost size m
- /// Quotient: {{{0,+,2}_i, +, n}_j, +, n}_k
- /// Remainder: {{{0,+,5}_i, +, 0}_j, +, 0}_k
- /// The remainder is the subscript of the innermost array dimension: [5i].
- ///
- /// b. Divide Quotient: {{{0,+,2}_i, +, n}_j, +, n}_k by next outer size n
- /// Quotient: {{{0,+,0}_i, +, 1}_j, +, 1}_k
- /// Remainder: {{{0,+,2}_i, +, 0}_j, +, 0}_k
- /// The Remainder is the subscript of the next array dimension: [2i].
- ///
- /// The subscript of the outermost dimension is the Quotient: [j+k].
- ///
- /// Overall, we have: A[][n][m], and the access function: A[j+k][2i][5i].
- void delinearize(const SCEV *Expr, SmallVectorImpl<const SCEV *> &Subscripts,
- SmallVectorImpl<const SCEV *> &Sizes,
- const SCEV *ElementSize);
-
/// Return the DataLayout associated with the module this SCEV instance is
/// operating on.
const DataLayout &getDataLayout() const {
@@ -1234,6 +1188,18 @@ public:
/// Try to apply information from loop guards for \p L to \p Expr.
const SCEV *applyLoopGuards(const SCEV *Expr, const Loop *L);
+ /// Return true if the loop has no abnormal exits. That is, if the loop
+ /// is not infinite, it must exit through an explicit edge in the CFG.
+ /// (As opposed to either a) throwing out of the function or b) entering a
+ /// well defined infinite loop in some callee.)
+ bool loopHasNoAbnormalExits(const Loop *L) {
+ return getLoopProperties(L).HasNoAbnormalExits;
+ }
+
+ /// Return true if this loop is finite by assumption. That is,
+ /// to be infinite, it must also be undefined.
+ bool loopIsFiniteByAssumption(const Loop *L);
+
private:
/// A CallbackVH to arrange for ScalarEvolution to be notified whenever a
/// Value is deleted.
@@ -1532,15 +1498,15 @@ private:
LoopDispositions;
struct LoopProperties {
- /// Set to true if the loop contains no instruction that can have side
- /// effects (i.e. via throwing an exception, volatile or atomic access).
- bool HasNoAbnormalExits;
-
/// Set to true if the loop contains no instruction that can abnormally exit
/// the loop (i.e. via throwing an exception, by terminating the thread
/// cleanly or by infinite looping in a called function). Strictly
/// speaking, the last one is not leaving the loop, but is identical to
/// leaving the loop for reasoning about undefined behavior.
+ bool HasNoAbnormalExits;
+
+ /// Set to true if the loop contains no instruction that can have side
+ /// effects (i.e. via throwing an exception, volatile or atomic access).
bool HasNoSideEffects;
};
@@ -1554,14 +1520,6 @@ private:
return getLoopProperties(L).HasNoSideEffects;
}
- bool loopHasNoAbnormalExits(const Loop *L) {
- return getLoopProperties(L).HasNoAbnormalExits;
- }
-
- /// Return true if this loop is finite by assumption. That is,
- /// to be infinite, it must also be undefined.
- bool loopIsFiniteByAssumption(const Loop *L);
-
/// Compute a LoopDisposition value.
LoopDisposition computeLoopDisposition(const SCEV *S, const Loop *L);
@@ -1574,6 +1532,9 @@ private:
/// Compute a BlockDisposition value.
BlockDisposition computeBlockDisposition(const SCEV *S, const BasicBlock *BB);
+  /// Stores all SCEVs that use a given SCEV as a direct operand.
+ DenseMap<const SCEV *, SmallPtrSet<const SCEV *, 8> > SCEVUsers;
+
/// Memoized results from getRange
DenseMap<const SCEV *, ConstantRange> UnsignedRanges;
@@ -1600,22 +1561,22 @@ private:
/// copied if its needed for longer.
const ConstantRange &getRangeRef(const SCEV *S, RangeSignHint Hint);
- /// Determines the range for the affine SCEVAddRecExpr {\p Start,+,\p Stop}.
+ /// Determines the range for the affine SCEVAddRecExpr {\p Start,+,\p Step}.
/// Helper for \c getRange.
- ConstantRange getRangeForAffineAR(const SCEV *Start, const SCEV *Stop,
+ ConstantRange getRangeForAffineAR(const SCEV *Start, const SCEV *Step,
const SCEV *MaxBECount, unsigned BitWidth);
/// Determines the range for the affine non-self-wrapping SCEVAddRecExpr {\p
- /// Start,+,\p Stop}<nw>.
+ /// Start,+,\p Step}<nw>.
ConstantRange getRangeForAffineNoSelfWrappingAR(const SCEVAddRecExpr *AddRec,
const SCEV *MaxBECount,
unsigned BitWidth,
RangeSignHint SignHint);
/// Try to compute a range for the affine SCEVAddRecExpr {\p Start,+,\p
- /// Stop} by "factoring out" a ternary expression from the add recurrence.
+ /// Step} by "factoring out" a ternary expression from the add recurrence.
/// Helper called by \c getRange.
- ConstantRange getRangeViaFactoring(const SCEV *Start, const SCEV *Stop,
+ ConstantRange getRangeViaFactoring(const SCEV *Start, const SCEV *Step,
const SCEV *MaxBECount, unsigned BitWidth);
/// If the unknown expression U corresponds to a simple recurrence, return
@@ -1761,12 +1722,6 @@ private:
BasicBlock *ExitingBB,
bool IsSubExpr);
- /// Given an exit condition of 'icmp op load X, cst', try to see if we can
- /// compute the backedge-taken count.
- ExitLimit computeLoadConstantCompareExitLimit(LoadInst *LI, Constant *RHS,
- const Loop *L,
- ICmpInst::Predicate p);
-
/// Compute the exit limit of a loop that is controlled by a
/// "(IV >> 1) != 0" type comparison. We cannot compute the exact trip
/// count in these cases (since SCEV has no way of expressing them), but we
@@ -1839,7 +1794,7 @@ private:
const SCEV *RHS,
ICmpInst::Predicate FoundPred,
const SCEV *FoundLHS, const SCEV *FoundRHS,
- const Instruction *Context);
+ const Instruction *CtxI);
/// Test whether the condition described by Pred, LHS, and RHS is true
/// whenever the condition described by FoundPred, FoundLHS, FoundRHS is
@@ -1914,7 +1869,7 @@ private:
const SCEV *LHS, const SCEV *RHS,
const SCEV *FoundLHS,
const SCEV *FoundRHS,
- const Instruction *Context);
+ const Instruction *CtxI);
/// Test whether the condition described by Pred, LHS, and RHS is true
/// whenever the condition described by Pred, FoundLHS, and FoundRHS is
@@ -1956,12 +1911,18 @@ private:
bool splitBinaryAdd(const SCEV *Expr, const SCEV *&L, const SCEV *&R,
SCEV::NoWrapFlags &Flags);
- /// Drop memoized information computed for S.
- void forgetMemoizedResults(const SCEV *S);
+ /// Drop memoized information for all \p SCEVs.
+ void forgetMemoizedResults(ArrayRef<const SCEV *> SCEVs);
+
+ /// Helper for forgetMemoizedResults.
+ void forgetMemoizedResultsImpl(const SCEV *S);
/// Return an existing SCEV for V if there is one, otherwise return nullptr.
const SCEV *getExistingSCEV(Value *V);
+ /// Erase Value from ValueExprMap and ExprValueMap.
+ void eraseValueFromMap(Value *V);
+
/// Return false iff given SCEV contains a SCEVUnknown with NULL value-
/// pointer.
bool checkValidity(const SCEV *S) const;
@@ -1995,6 +1956,27 @@ private:
/// would trigger undefined behavior on overflow.
SCEV::NoWrapFlags getNoWrapFlagsFromUB(const Value *V);
+ /// Return a scope which provides an upper bound on the defining scope of
+ /// 'S'. Specifically, return the first instruction in said bounding scope.
+ /// Return nullptr if the scope is trivial (function entry).
+ /// (See scope definition rules associated with flag discussion above)
+ const Instruction *getNonTrivialDefiningScopeBound(const SCEV *S);
+
+ /// Return a scope which provides an upper bound on the defining scope for
+ /// a SCEV with the operands in Ops. The outparam Precise is set if the
+ /// bound found is a precise bound (i.e. must be the defining scope.)
+ const Instruction *getDefiningScopeBound(ArrayRef<const SCEV *> Ops,
+ bool &Precise);
+
+ /// Wrapper around the above for cases which don't care if the bound
+ /// is precise.
+ const Instruction *getDefiningScopeBound(ArrayRef<const SCEV *> Ops);
+
+ /// Given two instructions in the same function, return true if we can
+ /// prove B must execute given A executes.
+ bool isGuaranteedToTransferExecutionTo(const Instruction *A,
+ const Instruction *B);
+
/// Return true if the SCEV corresponding to \p I is never poison. Proving
/// this is more complex than proving that just \p I is never poison, since
/// SCEV commons expressions across control flow, and you can have cases
@@ -2036,8 +2018,11 @@ private:
/// permitted by Start, End, and Stride. This is for loops of the form
/// {Start, +, Stride} LT End.
///
- /// Precondition: the induction variable is known to be positive. We *don't*
- /// assert these preconditions so please be careful.
+ /// Preconditions:
+ /// * the induction variable is known to be positive.
+ /// * the induction variable is assumed not to overflow (i.e. either it
+ /// actually doesn't, or we'd have to immediately execute UB)
+ /// We *don't* assert these preconditions so please be careful.
const SCEV *computeMaxBECountForLT(const SCEV *Start, const SCEV *Stride,
const SCEV *End, unsigned BitWidth,
bool IsSigned);
@@ -2072,31 +2057,20 @@ private:
/// an add rec on said loop.
void getUsedLoops(const SCEV *S, SmallPtrSetImpl<const Loop *> &LoopsUsed);
- /// Find all of the loops transitively used in \p S, and update \c LoopUsers
- /// accordingly.
- void addToLoopUseLists(const SCEV *S);
-
/// Try to match the pattern generated by getURemExpr(A, B). If successful,
/// Assign A and B to LHS and RHS, respectively.
bool matchURem(const SCEV *Expr, const SCEV *&LHS, const SCEV *&RHS);
/// Look for a SCEV expression with type `SCEVType` and operands `Ops` in
- /// `UniqueSCEVs`.
- ///
- /// The first component of the returned tuple is the SCEV if found and null
- /// otherwise. The second component is the `FoldingSetNodeID` that was
- /// constructed to look up the SCEV and the third component is the insertion
- /// point.
- std::tuple<SCEV *, FoldingSetNodeID, void *>
- findExistingSCEVInCache(SCEVTypes SCEVType, ArrayRef<const SCEV *> Ops);
+ /// `UniqueSCEVs`. Return if found, else nullptr.
+ SCEV *findExistingSCEVInCache(SCEVTypes SCEVType, ArrayRef<const SCEV *> Ops);
FoldingSet<SCEV> UniqueSCEVs;
FoldingSet<SCEVPredicate> UniquePreds;
BumpPtrAllocator SCEVAllocator;
- /// This maps loops to a list of SCEV expressions that (transitively) use said
- /// loop.
- DenseMap<const Loop *, SmallVector<const SCEV *, 4>> LoopUsers;
+ /// This maps loops to a list of addrecs that directly use said loop.
+ DenseMap<const Loop *, SmallVector<const SCEVAddRecExpr *, 4>> LoopUsers;
/// Cache tentative mappings from UnknownSCEVs in a Loop, to a SCEV expression
/// they can be rewritten into under certain predicates.
diff --git a/llvm/include/llvm/Analysis/StackLifetime.h b/llvm/include/llvm/Analysis/StackLifetime.h
index df342a9533ee..239aec4e258b 100644
--- a/llvm/include/llvm/Analysis/StackLifetime.h
+++ b/llvm/include/llvm/Analysis/StackLifetime.h
@@ -191,6 +191,8 @@ public:
StackLifetimePrinterPass(raw_ostream &OS, StackLifetime::LivenessType Type)
: Type(Type), OS(OS) {}
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
};
} // end namespace llvm
diff --git a/llvm/include/llvm/Analysis/StackSafetyAnalysis.h b/llvm/include/llvm/Analysis/StackSafetyAnalysis.h
index 59c1e3e3bd56..751735f3e59f 100644
--- a/llvm/include/llvm/Analysis/StackSafetyAnalysis.h
+++ b/llvm/include/llvm/Analysis/StackSafetyAnalysis.h
@@ -75,7 +75,15 @@ public:
StackSafetyGlobalInfo &operator=(StackSafetyGlobalInfo &&);
~StackSafetyGlobalInfo();
+ // Whether we can prove that all accesses to this Alloca are in-range and
+ // during its lifetime.
bool isSafe(const AllocaInst &AI) const;
+
+  // Returns true if the instruction can be proven to perform only two kinds
+  // of memory accesses:
+  //  (1) in-bounds accesses to live stack locations, or
+  //  (2) accesses to non-stack locations.
+ bool stackAccessIsSafe(const Instruction &I) const;
void print(raw_ostream &O) const;
void dump() const;
};
diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
index 22bfeda0efd0..6e3e1380535e 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
@@ -76,7 +76,7 @@ class TargetLibraryInfoImpl {
/// Return true if the function type FTy is valid for the library function
/// F, regardless of whether the function is available.
bool isValidProtoForLibFunc(const FunctionType &FTy, LibFunc F,
- const DataLayout *DL) const;
+ const Module &M) const;
public:
/// List of known vector-functions libraries.
@@ -115,6 +115,8 @@ public:
///
/// If it is one of the known library functions, return true and set F to the
/// corresponding value.
+ ///
+ /// FDecl is assumed to have a parent Module when using this function.
bool getLibFunc(const Function &FDecl, LibFunc &F) const;
/// Forces a function to be marked as unavailable.
@@ -238,7 +240,7 @@ public:
else {
// Disable individual libc/libm calls in TargetLibraryInfo.
LibFunc LF;
- AttributeSet FnAttrs = (*F)->getAttributes().getFnAttributes();
+ AttributeSet FnAttrs = (*F)->getAttributes().getFnAttrs();
for (const Attribute &Attr : FnAttrs) {
if (!Attr.isStringAttribute())
continue;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 628058142e48..170d6b8f35ff 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -21,7 +21,6 @@
#ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
#define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
-#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
@@ -31,6 +30,7 @@
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/InstructionCost.h"
#include <functional>
+#include <utility>
namespace llvm {
@@ -47,12 +47,14 @@ class ExtractElementInst;
class Function;
class GlobalValue;
class InstCombiner;
+class OptimizationRemarkEmitter;
class IntrinsicInst;
class LoadInst;
class LoopAccessInfo;
class Loop;
class LoopInfo;
class ProfileSummaryInfo;
+class RecurrenceDescriptor;
class SCEV;
class ScalarEvolution;
class StoreInst;
@@ -97,7 +99,7 @@ struct HardwareLoopInfo {
Loop *L = nullptr;
BasicBlock *ExitBlock = nullptr;
BranchInst *ExitBranch = nullptr;
- const SCEV *TripCount = nullptr;
+ const SCEV *ExitCount = nullptr;
IntegerType *CountType = nullptr;
Value *LoopDecrement = nullptr; // Decrement the loop counter by this
// value in every iteration.
@@ -382,8 +384,15 @@ public:
bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const;
+ /// Return true if globals in this address space can have initializers other
+ /// than `undef`.
+ bool canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const;
+
unsigned getAssumedAddrSpace(const Value *V) const;
+ std::pair<const Value *, unsigned>
+ getPredicatedAddrSpace(const Value *V) const;
+
/// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p
/// NewV, which has a different address space. This should happen for every
/// operand index that collectFlatAddressOperands returned for the intrinsic.
@@ -506,7 +515,8 @@ public:
/// transformation. The caller will initialize UP with the current
/// target-independent defaults.
void getUnrollingPreferences(Loop *L, ScalarEvolution &,
- UnrollingPreferences &UP) const;
+ UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE) const;
/// Query the target whether it would be profitable to convert the given loop
/// into a hardware loop.
@@ -660,6 +670,9 @@ public:
/// Return true if the target supports masked expand load.
bool isLegalMaskedExpandLoad(Type *DataType) const;
+ /// Return true if we should be enabling ordered reductions for the target.
+ bool enableOrderedReductions() const;
+
/// Return true if the target has a unified operation to calculate division
/// and remainder. If so, the additional implicit multiplication and
/// subtraction required to calculate a remainder from division are free. This
@@ -907,6 +920,9 @@ public:
/// architectural maximum vector length, and None otherwise.
Optional<unsigned> getMaxVScale() const;
+ /// \return the value of vscale to tune the cost model for.
+ Optional<unsigned> getVScaleForTuning() const;
+
/// \return True if the vectorization factor should be chosen to
/// make the vector of the smallest element type match the size of a
/// vector register. For wider element types, this could result in
@@ -1094,8 +1110,8 @@ public:
/// is using a compare with the specified predicate as condition. When vector
/// types are passed, \p VecPred must be used for all lanes.
InstructionCost
- getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy = nullptr,
- CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE,
+ getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
const Instruction *I = nullptr) const;
@@ -1104,6 +1120,16 @@ public:
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
unsigned Index = -1) const;
+  /// \return The cost of a replication shuffle that replicates \p VF elements
+  /// of type \p EltTy \p ReplicationFactor times.
+ ///
+ /// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is:
+ /// <0,0,0,1,1,1,2,2,2,3,3,3>
+ InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
+ int VF,
+ const APInt &DemandedDstElts,
+ TTI::TargetCostKind CostKind);
+
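The replication mask from the comment above can be built as follows (a sketch, not part of the interface):

    // ReplicationFactor = 3, VF = 4  ->  <0,0,0,1,1,1,2,2,2,3,3,3>
    SmallVector<int, 16> Mask;
    for (int Elt = 0; Elt < 4; ++Elt)
      for (int Rep = 0; Rep < 3; ++Rep)
        Mask.push_back(Elt);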
/// \return The cost of Load and Store instructions.
InstructionCost
getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
@@ -1452,13 +1478,18 @@ public:
virtual bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const = 0;
virtual bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const = 0;
+ virtual bool
+ canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const = 0;
virtual unsigned getAssumedAddrSpace(const Value *V) const = 0;
+ virtual std::pair<const Value *, unsigned>
+ getPredicatedAddrSpace(const Value *V) const = 0;
virtual Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
Value *OldV,
Value *NewV) const = 0;
virtual bool isLoweredToCall(const Function *F) = 0;
virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &,
- UnrollingPreferences &UP) = 0;
+ UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE) = 0;
virtual void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
PeelingPreferences &PP) = 0;
virtual bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
@@ -1505,6 +1536,7 @@ public:
virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0;
virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0;
+ virtual bool enableOrderedReductions() = 0;
virtual bool hasDivRemOp(Type *DataType, bool IsSigned) = 0;
virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0;
virtual bool prefersVectorizedAddressing() = 0;
@@ -1563,6 +1595,7 @@ public:
virtual TypeSize getRegisterBitWidth(RegisterKind K) const = 0;
virtual unsigned getMinVectorRegisterBitWidth() const = 0;
virtual Optional<unsigned> getMaxVScale() const = 0;
+ virtual Optional<unsigned> getVScaleForTuning() const = 0;
virtual bool shouldMaximizeVectorBandwidth() const = 0;
virtual ElementCount getMinimumVF(unsigned ElemWidth,
bool IsScalable) const = 0;
@@ -1623,6 +1656,12 @@ public:
const Instruction *I) = 0;
virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
unsigned Index) = 0;
+
+ virtual InstructionCost
+ getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
+ const APInt &DemandedDstElts,
+ TTI::TargetCostKind CostKind) = 0;
+
virtual InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
Align Alignment,
unsigned AddressSpace,
@@ -1730,8 +1769,8 @@ public:
InstructionCost
getGEPCost(Type *PointeeType, const Value *Ptr,
ArrayRef<const Value *> Operands,
- enum TargetTransformInfo::TargetCostKind CostKind) override {
- return Impl.getGEPCost(PointeeType, Ptr, Operands);
+ TargetTransformInfo::TargetCostKind CostKind) override {
+ return Impl.getGEPCost(PointeeType, Ptr, Operands, CostKind);
}
unsigned getInliningThresholdMultiplier() override {
return Impl.getInliningThresholdMultiplier();
@@ -1775,10 +1814,20 @@ public:
return Impl.isNoopAddrSpaceCast(FromAS, ToAS);
}
+ bool
+ canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const override {
+ return Impl.canHaveNonUndefGlobalInitializerInAddressSpace(AS);
+ }
+
unsigned getAssumedAddrSpace(const Value *V) const override {
return Impl.getAssumedAddrSpace(V);
}
+ std::pair<const Value *, unsigned>
+ getPredicatedAddrSpace(const Value *V) const override {
+ return Impl.getPredicatedAddrSpace(V);
+ }
+
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const override {
return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
@@ -1788,8 +1837,9 @@ public:
return Impl.isLoweredToCall(F);
}
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- UnrollingPreferences &UP) override {
- return Impl.getUnrollingPreferences(L, SE, UP);
+ UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE) override {
+ return Impl.getUnrollingPreferences(L, SE, UP, ORE);
}
void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
PeelingPreferences &PP) override {
@@ -1886,6 +1936,9 @@ public:
bool isLegalMaskedExpandLoad(Type *DataType) override {
return Impl.isLegalMaskedExpandLoad(DataType);
}
+ bool enableOrderedReductions() override {
+ return Impl.enableOrderedReductions();
+ }
bool hasDivRemOp(Type *DataType, bool IsSigned) override {
return Impl.hasDivRemOp(DataType, IsSigned);
}
@@ -2015,6 +2068,9 @@ public:
Optional<unsigned> getMaxVScale() const override {
return Impl.getMaxVScale();
}
+ Optional<unsigned> getVScaleForTuning() const override {
+ return Impl.getVScaleForTuning();
+ }
bool shouldMaximizeVectorBandwidth() const override {
return Impl.shouldMaximizeVectorBandwidth();
}
@@ -2115,6 +2171,13 @@ public:
unsigned Index) override {
return Impl.getVectorInstrCost(Opcode, Val, Index);
}
+ InstructionCost
+ getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
+ const APInt &DemandedDstElts,
+ TTI::TargetCostKind CostKind) override {
+ return Impl.getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
+ DemandedDstElts, CostKind);
+ }
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
unsigned AddressSpace,
TTI::TargetCostKind CostKind,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index c07a33c9f155..05ef2495475f 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -24,6 +24,7 @@
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
+#include <utility>
using namespace llvm::PatternMatch;
@@ -47,10 +48,9 @@ public:
const DataLayout &getDataLayout() const { return DL; }
- InstructionCost
- getGEPCost(Type *PointeeType, const Value *Ptr,
- ArrayRef<const Value *> Operands,
- TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) const {
+ InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
+ ArrayRef<const Value *> Operands,
+ TTI::TargetCostKind CostKind) const {
// In the basic model, we just assume that all-constant GEPs will be folded
// into their uses via addressing modes.
for (unsigned Idx = 0, Size = Operands.size(); Idx != Size; ++Idx)
@@ -105,9 +105,17 @@ public:
}
bool isNoopAddrSpaceCast(unsigned, unsigned) const { return false; }
+ bool canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const {
+ return AS == 0;
+  }
unsigned getAssumedAddrSpace(const Value *V) const { return -1; }
+ std::pair<const Value *, unsigned>
+ getPredicatedAddrSpace(const Value *V) const {
+ return std::make_pair(nullptr, -1);
+ }
+
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const {
return nullptr;
@@ -187,7 +195,8 @@ public:
}
void getUnrollingPreferences(Loop *, ScalarEvolution &,
- TTI::UnrollingPreferences &) const {}
+ TTI::UnrollingPreferences &,
+ OptimizationRemarkEmitter *) const {}
void getPeelingPreferences(Loop *, ScalarEvolution &,
TTI::PeelingPreferences &) const {}
@@ -262,6 +271,8 @@ public:
bool isLegalMaskedExpandLoad(Type *DataType) const { return false; }
+ bool enableOrderedReductions() const { return false; }
+
bool hasDivRemOp(Type *DataType, bool IsSigned) const { return false; }
bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const {
@@ -394,6 +405,7 @@ public:
unsigned getMinVectorRegisterBitWidth() const { return 128; }
Optional<unsigned> getMaxVScale() const { return None; }
+ Optional<unsigned> getVScaleForTuning() const { return None; }
bool shouldMaximizeVectorBandwidth() const { return false; }
@@ -539,6 +551,12 @@ public:
return 1;
}
+ unsigned getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
+ const APInt &DemandedDstElts,
+ TTI::TargetCostKind CostKind) {
+ return 1;
+ }
+
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
unsigned AddressSpace,
TTI::TargetCostKind CostKind,
@@ -614,7 +632,8 @@ public:
return 1;
}
- unsigned getNumberOfParts(Type *Tp) const { return 0; }
+ // Assume that we have a register of the right size for the type.
+ unsigned getNumberOfParts(Type *Tp) const { return 1; }
InstructionCost getAddressComputationCost(Type *Tp, ScalarEvolution *,
const SCEV *) const {
@@ -632,9 +651,10 @@ public:
return 1;
}
- InstructionCost getExtendedAddReductionCost(
- bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const {
+ InstructionCost
+ getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy,
+ VectorType *Ty,
+ TTI::TargetCostKind CostKind) const {
return 1;
}
@@ -856,10 +876,9 @@ protected:
public:
using BaseT::getGEPCost;
- InstructionCost
- getGEPCost(Type *PointeeType, const Value *Ptr,
- ArrayRef<const Value *> Operands,
- TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) {
+ InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
+ ArrayRef<const Value *> Operands,
+ TTI::TargetCostKind CostKind) {
assert(PointeeType && Ptr && "can't get GEPCost of nullptr");
assert(cast<PointerType>(Ptr->getType()->getScalarType())
->isOpaqueOrPointeeTypeMatches(PointeeType) &&
@@ -964,10 +983,10 @@ public:
return TTI::TCC_Free;
break;
case Instruction::GetElementPtr: {
- const GEPOperator *GEP = cast<GEPOperator>(U);
+ const auto *GEP = cast<GEPOperator>(U);
return TargetTTI->getGEPCost(GEP->getSourceElementType(),
GEP->getPointerOperand(),
- Operands.drop_front());
+ Operands.drop_front(), CostKind);
}
case Instruction::Add:
case Instruction::FAdd:
@@ -1063,58 +1082,94 @@ public:
auto *IE = dyn_cast<InsertElementInst>(U);
if (!IE)
return TTI::TCC_Basic; // FIXME
- auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
- unsigned Idx = CI ? CI->getZExtValue() : -1;
+ unsigned Idx = -1;
+ if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
+ if (CI->getValue().getActiveBits() <= 32)
+ Idx = CI->getZExtValue();
return TargetTTI->getVectorInstrCost(Opcode, Ty, Idx);
}
case Instruction::ShuffleVector: {
auto *Shuffle = dyn_cast<ShuffleVectorInst>(U);
if (!Shuffle)
return TTI::TCC_Basic; // FIXME
+
auto *VecTy = cast<VectorType>(U->getType());
auto *VecSrcTy = cast<VectorType>(U->getOperand(0)->getType());
+ int NumSubElts, SubIndex;
+
+ if (Shuffle->changesLength()) {
+ // Treat a 'subvector widening' as a free shuffle.
+ if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding())
+ return 0;
+
+ if (Shuffle->isExtractSubvectorMask(SubIndex))
+ return TargetTTI->getShuffleCost(TTI::SK_ExtractSubvector, VecSrcTy,
+ Shuffle->getShuffleMask(), SubIndex,
+ VecTy);
+
+ if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex))
+ return TargetTTI->getShuffleCost(
+ TTI::SK_InsertSubvector, VecTy, Shuffle->getShuffleMask(),
+ SubIndex,
+ FixedVectorType::get(VecTy->getScalarType(), NumSubElts));
+
+ int ReplicationFactor, VF;
+ if (Shuffle->isReplicationMask(ReplicationFactor, VF)) {
+ APInt DemandedDstElts =
+ APInt::getNullValue(Shuffle->getShuffleMask().size());
+ for (auto I : enumerate(Shuffle->getShuffleMask())) {
+ if (I.value() != UndefMaskElem)
+ DemandedDstElts.setBit(I.index());
+ }
+ return TargetTTI->getReplicationShuffleCost(
+ VecSrcTy->getElementType(), ReplicationFactor, VF,
+ DemandedDstElts, CostKind);
+ }
- // TODO: Identify and add costs for insert subvector, etc.
- int SubIndex;
- if (Shuffle->isExtractSubvectorMask(SubIndex))
- return TargetTTI->getShuffleCost(TTI::SK_ExtractSubvector, VecSrcTy,
- Shuffle->getShuffleMask(), SubIndex,
- VecTy);
- else if (Shuffle->changesLength())
return CostKind == TTI::TCK_RecipThroughput ? -1 : 1;
- else if (Shuffle->isIdentity())
+ }
+
+ if (Shuffle->isIdentity())
return 0;
- else if (Shuffle->isReverse())
+
+ if (Shuffle->isReverse())
return TargetTTI->getShuffleCost(TTI::SK_Reverse, VecTy,
Shuffle->getShuffleMask(), 0, nullptr);
- else if (Shuffle->isSelect())
+
+ if (Shuffle->isSelect())
return TargetTTI->getShuffleCost(TTI::SK_Select, VecTy,
Shuffle->getShuffleMask(), 0, nullptr);
- else if (Shuffle->isTranspose())
+
+ if (Shuffle->isTranspose())
return TargetTTI->getShuffleCost(TTI::SK_Transpose, VecTy,
Shuffle->getShuffleMask(), 0, nullptr);
- else if (Shuffle->isZeroEltSplat())
+
+ if (Shuffle->isZeroEltSplat())
return TargetTTI->getShuffleCost(TTI::SK_Broadcast, VecTy,
Shuffle->getShuffleMask(), 0, nullptr);
- else if (Shuffle->isSingleSource())
+
+ if (Shuffle->isSingleSource())
return TargetTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy,
Shuffle->getShuffleMask(), 0, nullptr);
+ if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex))
+ return TargetTTI->getShuffleCost(
+ TTI::SK_InsertSubvector, VecTy, Shuffle->getShuffleMask(), SubIndex,
+ FixedVectorType::get(VecTy->getScalarType(), NumSubElts));
+
return TargetTTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy,
Shuffle->getShuffleMask(), 0, nullptr);
}
case Instruction::ExtractElement: {
- unsigned Idx = -1;
auto *EEI = dyn_cast<ExtractElementInst>(U);
if (!EEI)
return TTI::TCC_Basic; // FIXME
-
- auto *CI = dyn_cast<ConstantInt>(EEI->getOperand(1));
- if (CI)
- Idx = CI->getZExtValue();
-
- return TargetTTI->getVectorInstrCost(Opcode, U->getOperand(0)->getType(),
- Idx);
+ unsigned Idx = -1;
+ if (auto *CI = dyn_cast<ConstantInt>(EEI->getOperand(1)))
+ if (CI->getValue().getActiveBits() <= 32)
+ Idx = CI->getZExtValue();
+ Type *DstTy = U->getOperand(0)->getType();
+ return TargetTTI->getVectorInstrCost(Opcode, DstTy, Idx);
}
}
// By default, just classify everything as 'basic'.
diff --git a/llvm/include/llvm/Analysis/TypeMetadataUtils.h b/llvm/include/llvm/Analysis/TypeMetadataUtils.h
index 3f7603142900..074c40942b06 100644
--- a/llvm/include/llvm/Analysis/TypeMetadataUtils.h
+++ b/llvm/include/llvm/Analysis/TypeMetadataUtils.h
@@ -22,6 +22,7 @@ namespace llvm {
class CallBase;
class CallInst;
class Constant;
+class Function;
class DominatorTree;
class Instruction;
class Module;
@@ -56,7 +57,30 @@ void findDevirtualizableCallsForTypeCheckedLoad(
SmallVectorImpl<Instruction *> &Preds, bool &HasNonCallUses,
const CallInst *CI, DominatorTree &DT);
-Constant *getPointerAtOffset(Constant *I, uint64_t Offset, Module &M);
-}
+/// Processes a Constant recursively looking into elements of arrays, structs
+/// and expressions to find a trivial pointer element that is located at the
+/// given offset (relative to the beginning of the whole outer Constant).
+///
+/// Used for example from GlobalDCE to find an entry in a C++ vtable that
+/// matches a vcall offset.
+///
+/// To support Swift vtables, getPointerAtOffset can see through "relative
+/// pointers", i.e. (sub-)expressions of the form of:
+///
+/// @symbol = ... {
+/// i32 trunc (i64 sub (
+/// i64 ptrtoint (<type> @target to i64), i64 ptrtoint (... @symbol to i64)
+/// ) to i32)
+/// }
+///
+/// For such (sub-)expressions, getPointerAtOffset returns the @target pointer.
+Constant *getPointerAtOffset(Constant *I, uint64_t Offset, Module &M,
+ Constant *TopLevelGlobal = nullptr);
+
+/// Finds the same "relative pointer" pattern as described above, where the
+/// target is `F`, and replaces the entire pattern with a constant zero.
+void replaceRelativePointerUsersWithZero(Function *F);
+
+} // namespace llvm
#endif
diff --git a/llvm/include/llvm/Analysis/Utils/TFUtils.h b/llvm/include/llvm/Analysis/Utils/TFUtils.h
index 47ee23e06000..1f6be0e60eb9 100644
--- a/llvm/include/llvm/Analysis/Utils/TFUtils.h
+++ b/llvm/include/llvm/Analysis/Utils/TFUtils.h
@@ -104,6 +104,9 @@ Optional<TensorSpec> getTensorSpecFromJSON(LLVMContext &Ctx,
struct LoggedFeatureSpec {
TensorSpec Spec;
Optional<std::string> LoggingName;
+ const std::string &getLoggingName() const {
+ return LoggingName ? *LoggingName : Spec.name();
+ }
};
/// Load the output specs. If SpecFileOverride is not empty, that path is used.
@@ -170,7 +173,9 @@ public:
// we can consider using bytes.
char *addEntryAndGetFloatOrInt64Buffer(size_t FeatureID);
- void print(raw_ostream &OS);
+ // Flush the content of the log to the stream, clearing the stored data in the
+ // process.
+ void flush(raw_ostream &OS);
private:
std::vector<LoggedFeatureSpec> FeatureSpecs;
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 90ec742f18e6..b4f38a3e976f 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -203,6 +203,15 @@ constexpr unsigned MaxAnalysisRecursionDepth = 6;
const DominatorTree *DT = nullptr,
bool UseInstrInfo = true);
+ /// Get the minimum bit size for this Value \p Op as a signed integer.
+ /// i.e. x == sext(trunc(x to MinSignedBits) to bitwidth(x)).
+ /// Similar to the APInt::getMinSignedBits function.
+ unsigned ComputeMinSignedBits(const Value *Op, const DataLayout &DL,
+ unsigned Depth = 0,
+ AssumptionCache *AC = nullptr,
+ const Instruction *CxtI = nullptr,
+ const DominatorTree *DT = nullptr);
+
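A hedged numeric illustration of the property stated in the comment above, using APInt directly:

    APInt X(32, -3, /*isSigned=*/true);
    unsigned Bits = X.getMinSignedBits();   // 3: the i3 value 0b101 sign-extends back to -3
    assert(X.trunc(Bits).sext(32) == X);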
/// This function computes the integer multiple of Base that equals V. If
/// successful, it returns true and returns the multiple in Multiple. If
/// unsuccessful, it returns false. Also, if V can be simplified to an
@@ -549,6 +558,7 @@ constexpr unsigned MaxAnalysisRecursionDepth = 6;
ConstantRange computeConstantRange(const Value *V, bool UseInstrInfo = true,
AssumptionCache *AC = nullptr,
const Instruction *CtxI = nullptr,
+ const DominatorTree *DT = nullptr,
unsigned Depth = 0);
/// Return true if this function can prove that the instruction I will
@@ -573,6 +583,18 @@ constexpr unsigned MaxAnalysisRecursionDepth = 6;
/// instruction variant of this function.
bool isGuaranteedToTransferExecutionToSuccessor(const BasicBlock *BB);
+ /// Return true if every instruction in the range (Begin, End) is
+ /// guaranteed to transfer execution to its static successor. \p ScanLimit
+ /// bounds the search to avoid scanning huge blocks.
+ bool isGuaranteedToTransferExecutionToSuccessor(
+ BasicBlock::const_iterator Begin, BasicBlock::const_iterator End,
+ unsigned ScanLimit = 32);
+
+ /// Same as previous, but with range expressed via iterator_range.
+ bool isGuaranteedToTransferExecutionToSuccessor(
+ iterator_range<BasicBlock::const_iterator> Range,
+ unsigned ScanLimit = 32);
+
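A sketch of the range form above (BB is an assumed `const BasicBlock &`; illustrative only):

    // Does every instruction in BB transfer execution to its static successor?
    bool Transfers = isGuaranteedToTransferExecutionToSuccessor(
        make_range(BB.begin(), BB.end()), /*ScanLimit=*/32);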
/// Return true if this function can prove that the instruction I
/// is executed for every iteration of the loop L.
///
@@ -624,10 +646,16 @@ constexpr unsigned MaxAnalysisRecursionDepth = 6;
/// true. If Op raises immediate UB but never creates poison or undef
/// (e.g. sdiv I, 0), canCreatePoison returns false.
///
+  /// \p ConsiderFlags controls whether poison-producing flags on the
+  /// instruction are considered.  This can be used to see if the instruction
+  /// could still introduce undef or poison even without the poison-generating
+  /// flags that might be present on it (i.e. could the result of
+  /// Op->dropPoisonGeneratingFlags() still create poison or undef).
+ ///
/// canCreatePoison returns true if Op can create poison from non-poison
/// operands.
- bool canCreateUndefOrPoison(const Operator *Op);
- bool canCreatePoison(const Operator *Op);
+ bool canCreateUndefOrPoison(const Operator *Op, bool ConsiderFlags = true);
+ bool canCreatePoison(const Operator *Op, bool ConsiderFlags = true);
/// Return true if V is poison given that ValAssumedPoison is already poison.
/// For example, if ValAssumedPoison is `icmp X, 10` and V is `icmp X, 5`,
@@ -744,6 +772,10 @@ constexpr unsigned MaxAnalysisRecursionDepth = 6;
/// minimum/maximum flavor.
CmpInst::Predicate getInverseMinMaxPred(SelectPatternFlavor SPF);
+ /// Return the minimum or maximum constant value for the specified integer
+ /// min/max flavor and type.
+ APInt getMinMaxLimit(SelectPatternFlavor SPF, unsigned BitWidth);
+
/// Check if the values in \p VL are select instructions that can be converted
/// to a min or max (vector) intrinsic. Returns the intrinsic ID, if such a
/// conversion is possible, together with a bool indicating whether all select
diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index c890216c9e01..24e2318de48b 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -533,6 +533,12 @@ llvm::SmallVector<int, 16> createStrideMask(unsigned Start, unsigned Stride,
llvm::SmallVector<int, 16>
createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs);
+/// Given a shuffle mask for a binary shuffle, create the equivalent shuffle
+/// mask assuming both operands are identical. This assumes that the unary
+/// shuffle will use elements from operand 0 (operand 1 will be unused).
+llvm::SmallVector<int, 16> createUnaryMask(ArrayRef<int> Mask,
+ unsigned NumElts);
+
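For illustration only, assuming (as the comment suggests) that mask indices referring to the second operand are folded back into the first:

    // Binary mask <0, 5, 2, 7> over two 4-element operands becomes <0, 1, 2, 3>
    // once both operands are known to be the same value.
    SmallVector<int, 16> Unary = createUnaryMask({0, 5, 2, 7}, /*NumElts=*/4);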
/// Concatenate a list of vectors.
///
/// This function generates code that concatenates the vectors in \p Vecs into a
@@ -686,10 +692,8 @@ public:
if (getMember(getFactor() - 1))
return false;
- // We have a group with gaps. It therefore cannot be a group of stores,
- // and it can't be a reversed access, because such groups get invalidated.
- assert(!getMember(0)->mayWriteToMemory() &&
- "Group should have been invalidated");
+ // We have a group with gaps. It therefore can't be a reversed access,
+ // because such groups get invalidated (TODO).
assert(!isReverse() && "Group should have been invalidated");
// This is a group of loads, with gaps, and without a last-member
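
An illustrative use of the createUnaryMask helper added above, assuming the remapping its comment implies (mask indices that point into operand 1 fold back into the same lane of operand 0):

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"

// Fold a two-operand shuffle mask into a one-operand mask when both shuffle
// inputs are known to be the same 4-element value.
static llvm::SmallVector<int, 16> exampleUnaryMask() {
  llvm::SmallVector<int, 16> BinaryMask = {0, 5, 2, 7}; // lanes 5, 7 read op1
  return llvm::createUnaryMask(BinaryMask, /*NumElts=*/4); // expect {0,1,2,3}
}
```
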
diff --git a/llvm/include/llvm/AsmParser/LLLexer.h b/llvm/include/llvm/AsmParser/LLLexer.h
index c97d9781c33b..c30165e4a97b 100644
--- a/llvm/include/llvm/AsmParser/LLLexer.h
+++ b/llvm/include/llvm/AsmParser/LLLexer.h
@@ -10,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_ASMPARSER_LLLEXER_H
-#define LLVM_LIB_ASMPARSER_LLLEXER_H
+#ifndef LLVM_ASMPARSER_LLLEXER_H
+#define LLVM_ASMPARSER_LLLEXER_H
#include "LLToken.h"
#include "llvm/ADT/APFloat.h"
diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h
index 70db9218fa3d..d621c232378c 100644
--- a/llvm/include/llvm/AsmParser/LLParser.h
+++ b/llvm/include/llvm/AsmParser/LLParser.h
@@ -10,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_ASMPARSER_LLPARSER_H
-#define LLVM_LIB_ASMPARSER_LLPARSER_H
+#ifndef LLVM_ASMPARSER_LLPARSER_H
+#define LLVM_ASMPARSER_LLPARSER_H
#include "LLLexer.h"
#include "llvm/ADT/Optional.h"
@@ -172,9 +172,8 @@ namespace llvm {
/// getGlobalVal - Get a value with the specified name or ID, creating a
/// forward reference record if needed. This can return null if the value
/// exists but does not have the right type.
- GlobalValue *getGlobalVal(const std::string &N, Type *Ty, LocTy Loc,
- bool IsCall);
- GlobalValue *getGlobalVal(unsigned ID, Type *Ty, LocTy Loc, bool IsCall);
+ GlobalValue *getGlobalVal(const std::string &N, Type *Ty, LocTy Loc);
+ GlobalValue *getGlobalVal(unsigned ID, Type *Ty, LocTy Loc);
/// Get a Comdat with the specified name, creating a forward reference
/// record if needed.
@@ -270,7 +269,6 @@ namespace llvm {
bool parseOptionalCommaAlign(MaybeAlign &Alignment, bool &AteExtraComma);
bool parseOptionalCommaAddrSpace(unsigned &AddrSpace, LocTy &Loc,
bool &AteExtraComma);
- bool parseOptionalCommaInAlloca(bool &IsInAlloca);
bool parseAllocSizeArguments(unsigned &BaseSizeArg,
Optional<unsigned> &HowManyArg);
bool parseVScaleRangeArguments(unsigned &MinValue, unsigned &MaxValue);
@@ -306,11 +304,10 @@ namespace llvm {
unsigned DLLStorageClass, bool DSOLocal,
GlobalVariable::ThreadLocalMode TLM,
GlobalVariable::UnnamedAddr UnnamedAddr);
- bool parseIndirectSymbol(const std::string &Name, LocTy NameLoc,
- unsigned L, unsigned Visibility,
- unsigned DLLStorageClass, bool DSOLocal,
- GlobalVariable::ThreadLocalMode TLM,
- GlobalVariable::UnnamedAddr UnnamedAddr);
+ bool parseAliasOrIFunc(const std::string &Name, LocTy NameLoc, unsigned L,
+ unsigned Visibility, unsigned DLLStorageClass,
+ bool DSOLocal, GlobalVariable::ThreadLocalMode TLM,
+ GlobalVariable::UnnamedAddr UnnamedAddr);
bool parseComdat();
bool parseStandaloneMetadata();
bool parseNamedMetadata();
@@ -424,8 +421,8 @@ namespace llvm {
/// GetVal - Get a value with the specified name or ID, creating a
/// forward reference record if needed. This can return null if the value
/// exists but does not have the right type.
- Value *getVal(const std::string &Name, Type *Ty, LocTy Loc, bool IsCall);
- Value *getVal(unsigned ID, Type *Ty, LocTy Loc, bool IsCall);
+ Value *getVal(const std::string &Name, Type *Ty, LocTy Loc);
+ Value *getVal(unsigned ID, Type *Ty, LocTy Loc);
/// setInstName - After an instruction is parsed and inserted into its
/// basic block, this installs its name.
@@ -447,10 +444,10 @@ namespace llvm {
};
bool convertValIDToValue(Type *Ty, ValID &ID, Value *&V,
- PerFunctionState *PFS, bool IsCall);
+ PerFunctionState *PFS);
Value *checkValidVariableType(LocTy Loc, const Twine &Name, Type *Ty,
- Value *Val, bool IsCall);
+ Value *Val);
bool parseConstantValue(Type *Ty, Constant *&C);
bool parseValue(Type *Ty, Value *&V, PerFunctionState *PFS);
diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h
index aa49c68fe924..f8ca054863ac 100644
--- a/llvm/include/llvm/AsmParser/LLToken.h
+++ b/llvm/include/llvm/AsmParser/LLToken.h
@@ -10,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_ASMPARSER_LLTOKEN_H
-#define LLVM_LIB_ASMPARSER_LLTOKEN_H
+#ifndef LLVM_ASMPARSER_LLTOKEN_H
+#define LLVM_ASMPARSER_LLTOKEN_H
namespace llvm {
namespace lltok {
@@ -190,6 +190,7 @@ enum Kind {
kw_convergent,
kw_dereferenceable,
kw_dereferenceable_or_null,
+ kw_disable_sanitizer_instrumentation,
kw_elementtype,
kw_inaccessiblememonly,
kw_inaccessiblemem_or_argmemonly,
@@ -403,6 +404,9 @@ enum Kind {
kw_returnDoesNotAlias,
kw_noInline,
kw_alwaysInline,
+ kw_noUnwind,
+ kw_mayThrow,
+ kw_hasUnknownCall,
kw_calls,
kw_callee,
kw_params,
diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.def b/llvm/include/llvm/BinaryFormat/Dwarf.def
index 34f124b5779a..61f3f27ebb47 100644
--- a/llvm/include/llvm/BinaryFormat/Dwarf.def
+++ b/llvm/include/llvm/BinaryFormat/Dwarf.def
@@ -248,6 +248,9 @@ HANDLE_DW_TAG(0x5103, ALTIUM_rev_carry_type, 0, ALTIUM, DW_KIND_NONE)
// M16 __rom qualifier
HANDLE_DW_TAG(0x5111, ALTIUM_rom, 0, ALTIUM, DW_KIND_NONE)
+// LLVM
+HANDLE_DW_TAG(0x6000, LLVM_annotation, 0, LLVM, DW_KIND_NONE)
+
// Green Hills.
HANDLE_DW_TAG(0x8004, GHS_namespace, 0, GHS, DW_KIND_NONE)
HANDLE_DW_TAG(0x8005, GHS_using_namespace, 0, GHS, DW_KIND_NONE)
diff --git a/llvm/include/llvm/BinaryFormat/DynamicTags.def b/llvm/include/llvm/BinaryFormat/DynamicTags.def
index c08f8a53bdb5..814d8b113ec4 100644
--- a/llvm/include/llvm/BinaryFormat/DynamicTags.def
+++ b/llvm/include/llvm/BinaryFormat/DynamicTags.def
@@ -31,6 +31,11 @@
#define PPC64_DYNAMIC_TAG_DEFINED
#endif
+#ifndef RISCV_DYNAMIC_TAG
+#define RISCV_DYNAMIC_TAG(name, value) DYNAMIC_TAG(name, value)
+#define RISCV_DYNAMIC_TAG_DEFINED
+#endif
+
#ifndef DYNAMIC_TAG_MARKER
#define DYNAMIC_TAG_MARKER(name, value) DYNAMIC_TAG(name, value)
#define DYNAMIC_TAG_MARKER_DEFINED
@@ -213,6 +218,9 @@ PPC_DYNAMIC_TAG(PPC_OPT, 0x70000001) // Has TLS optimization.
PPC64_DYNAMIC_TAG(PPC64_GLINK, 0x70000000) // Address of 32 bytes before the
// first glink lazy resolver stub.
+// RISC-V specific dynamic array tags.
+RISCV_DYNAMIC_TAG(RISCV_VARIANT_CC, 0x70000001)
+
// Sun machine-independent extensions.
DYNAMIC_TAG(AUXILIARY, 0x7FFFFFFD) // Shared object to load before self
DYNAMIC_TAG(USED, 0x7FFFFFFE) // Same as DT_NEEDED
@@ -243,3 +251,7 @@ DYNAMIC_TAG(FILTER, 0x7FFFFFFF) // Shared object to get values from
#undef PPC64_DYNAMIC_TAG
#undef PPC64_DYNAMIC_TAG_DEFINED
#endif
+#ifdef RISCV_DYNAMIC_TAG_DEFINED
+#undef RISCV_DYNAMIC_TAG
+#undef RISCV_DYNAMIC_TAG_DEFINED
+#endif
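
Because DynamicTags.def is an X-macro file, a consumer can pick out only the new RISC-V tag by overriding RISCV_DYNAMIC_TAG before including it. A hypothetical helper, not part of the patch:

```cpp
#include <cstdint>

#include "llvm/ADT/StringRef.h"

// Expand only the RISC-V specific tags out of the .def file by overriding
// RISCV_DYNAMIC_TAG and stubbing out the generic DYNAMIC_TAG macro.
static llvm::StringRef riscvDynamicTagName(uint64_t Tag) {
  switch (Tag) {
#define DYNAMIC_TAG(name, value) // ignore everything that is not RISC-V
#define RISCV_DYNAMIC_TAG(name, value)                                         \
  case value:                                                                  \
    return "DT_" #name;
#include "llvm/BinaryFormat/DynamicTags.def"
#undef RISCV_DYNAMIC_TAG
#undef DYNAMIC_TAG
  default:
    return "<unknown>";
  }
}
```
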
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 6148f968cdba..a270fd399aeb 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -660,6 +660,12 @@ enum {
#include "ELFRelocs/RISCV.def"
};
+enum {
+  // Symbol may follow a different calling convention than the standard
+  // calling convention.
+ STO_RISCV_VARIANT_CC = 0x80
+};
+
// ELF Relocation types for S390/zSeries
enum {
#include "ELFRelocs/SystemZ.def"
@@ -1596,6 +1602,16 @@ enum {
NT_FREEBSD_PROCSTAT_AUXV = 16,
};
+// OpenBSD core note types.
+enum {
+ NT_OPENBSD_PROCINFO = 10,
+ NT_OPENBSD_AUXV = 11,
+ NT_OPENBSD_REGS = 20,
+ NT_OPENBSD_FPREGS = 21,
+ NT_OPENBSD_XFPREGS = 22,
+ NT_OPENBSD_WCOOKIE = 23,
+};
+
// AMDGPU-specific section indices.
enum {
SHN_AMDGPU_LDS = 0xff00, // Variable in LDS; symbol encoded like SHN_COMMON
@@ -1618,6 +1634,13 @@ enum {
NT_AMDGPU_METADATA = 32
};
+// LLVMOMPOFFLOAD specific notes.
+enum : unsigned {
+ NT_LLVM_OPENMP_OFFLOAD_VERSION = 1,
+ NT_LLVM_OPENMP_OFFLOAD_PRODUCER = 2,
+ NT_LLVM_OPENMP_OFFLOAD_PRODUCER_VERSION = 3
+};
+
enum {
GNU_ABI_TAG_LINUX = 0,
GNU_ABI_TAG_HURD = 1,
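
A minimal sketch of how a reader would test the new RISC-V st_other bit (the helper name is ours):

```cpp
#include <cstdint>

#include "llvm/BinaryFormat/ELF.h"

// A RISC-V symbol opts into a non-standard calling convention through a bit
// in its st_other field.
static bool usesVariantCC(uint8_t StOther) {
  return (StOther & llvm::ELF::STO_RISCV_VARIANT_CC) != 0;
}
```
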
diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV.def
index 9f2f0540bcbd..454450950444 100644
--- a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV.def
+++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV.def
@@ -46,10 +46,6 @@ ELF_RELOC(R_RISCV_ALIGN, 43)
ELF_RELOC(R_RISCV_RVC_BRANCH, 44)
ELF_RELOC(R_RISCV_RVC_JUMP, 45)
ELF_RELOC(R_RISCV_RVC_LUI, 46)
-ELF_RELOC(R_RISCV_GPREL_I, 47)
-ELF_RELOC(R_RISCV_GPREL_S, 48)
-ELF_RELOC(R_RISCV_TPREL_I, 49)
-ELF_RELOC(R_RISCV_TPREL_S, 50)
ELF_RELOC(R_RISCV_RELAX, 51)
ELF_RELOC(R_RISCV_SUB6, 52)
ELF_RELOC(R_RISCV_SET6, 53)
diff --git a/llvm/include/llvm/BinaryFormat/MachO.def b/llvm/include/llvm/BinaryFormat/MachO.def
index 76dcc58ba048..f68ecefa6c9e 100644
--- a/llvm/include/llvm/BinaryFormat/MachO.def
+++ b/llvm/include/llvm/BinaryFormat/MachO.def
@@ -74,6 +74,8 @@ HANDLE_LOAD_COMMAND(LC_VERSION_MIN_TVOS, 0x0000002Fu, version_min_command)
HANDLE_LOAD_COMMAND(LC_VERSION_MIN_WATCHOS, 0x00000030u, version_min_command)
HANDLE_LOAD_COMMAND(LC_NOTE, 0x00000031u, note_command)
HANDLE_LOAD_COMMAND(LC_BUILD_VERSION, 0x00000032u, build_version_command)
+HANDLE_LOAD_COMMAND(LC_DYLD_EXPORTS_TRIE, 0x80000033u, linkedit_data_command)
+HANDLE_LOAD_COMMAND(LC_DYLD_CHAINED_FIXUPS, 0x80000034u, linkedit_data_command)
#endif
diff --git a/llvm/include/llvm/BinaryFormat/Wasm.h b/llvm/include/llvm/BinaryFormat/Wasm.h
index c38e64928521..0bc8c4e167d8 100644
--- a/llvm/include/llvm/BinaryFormat/Wasm.h
+++ b/llvm/include/llvm/BinaryFormat/Wasm.h
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
//
// This file defines manifest constants for the wasm object file format.
-// See: https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md
+// See: https://github.com/WebAssembly/design/blob/main/BinaryEncoding.md
//
//===----------------------------------------------------------------------===//
@@ -36,12 +36,25 @@ struct WasmObjectHeader {
uint32_t Version;
};
+struct WasmDylinkImportInfo {
+ StringRef Module;
+ StringRef Field;
+ uint32_t Flags;
+};
+
+struct WasmDylinkExportInfo {
+ StringRef Name;
+ uint32_t Flags;
+};
+
struct WasmDylinkInfo {
uint32_t MemorySize; // Memory size in bytes
uint32_t MemoryAlignment; // P2 alignment of memory
uint32_t TableSize; // Table size in elements
uint32_t TableAlignment; // P2 alignment of table
std::vector<StringRef> Needed; // Shared library dependencies
+ std::vector<WasmDylinkImportInfo> ImportInfo;
+ std::vector<WasmDylinkExportInfo> ExportInfo;
};
struct WasmProducerInfo {
@@ -101,15 +114,9 @@ struct WasmGlobal {
StringRef SymbolName; // from the "linking" section
};
-struct WasmTagType {
- // Kind of tag. Currently only WASM_TAG_ATTRIBUTE_EXCEPTION is possible.
- uint8_t Attribute;
- uint32_t SigIndex;
-};
-
struct WasmTag {
uint32_t Index;
- WasmTagType Type;
+ uint32_t SigIndex;
StringRef SymbolName; // from the "linking" section
};
@@ -122,7 +129,6 @@ struct WasmImport {
WasmGlobalType Global;
WasmTableType Table;
WasmLimits Memory;
- WasmTagType Tag;
};
};
@@ -133,6 +139,7 @@ struct WasmLocalDecl {
struct WasmFunction {
uint32_t Index;
+ uint32_t SigIndex;
std::vector<WasmLocalDecl> Locals;
ArrayRef<uint8_t> Body;
uint32_t CodeSectionOffset;
@@ -284,11 +291,14 @@ enum : unsigned {
// Opcodes used in synthetic functions.
enum : unsigned {
- WASM_OPCODE_IF = 0x04,
- WASM_OPCODE_ELSE = 0x05,
+ WASM_OPCODE_BLOCK = 0x02,
+ WASM_OPCODE_BR = 0x0c,
+ WASM_OPCODE_BR_TABLE = 0x0e,
+ WASM_OPCODE_RETURN = 0x0f,
WASM_OPCODE_DROP = 0x1a,
WASM_OPCODE_MISC_PREFIX = 0xfc,
WASM_OPCODE_MEMORY_INIT = 0x08,
+ WASM_OPCODE_MEMORY_FILL = 0x0b,
WASM_OPCODE_DATA_DROP = 0x09,
WASM_OPCODE_ATOMICS_PREFIX = 0xfe,
WASM_OPCODE_ATOMIC_NOTIFY = 0x00,
@@ -339,6 +349,14 @@ enum : unsigned {
WASM_SYMBOL_TABLE = 0x8,
};
+// Kind codes used in the custom "dylink" section
+enum : unsigned {
+ WASM_DYLINK_MEM_INFO = 0x1,
+ WASM_DYLINK_NEEDED = 0x2,
+ WASM_DYLINK_EXPORT_INFO = 0x3,
+ WASM_DYLINK_IMPORT_INFO = 0x4,
+};
+
// Kind codes used in the custom "linking" section in the WASM_COMDAT_INFO
enum : unsigned {
WASM_COMDAT_DATA = 0x0,
@@ -379,6 +397,7 @@ const unsigned WASM_SYMBOL_UNDEFINED = 0x10;
const unsigned WASM_SYMBOL_EXPORTED = 0x20;
const unsigned WASM_SYMBOL_EXPLICIT_NAME = 0x40;
const unsigned WASM_SYMBOL_NO_STRIP = 0x80;
+const unsigned WASM_SYMBOL_TLS = 0x100;
#define WASM_RELOC(name, value) name = value,
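
A small sketch of consuming the extended dylink information, using only the fields added above (the dumper itself is hypothetical):

```cpp
#include "llvm/BinaryFormat/Wasm.h"
#include "llvm/Support/raw_ostream.h"

// Walk the per-symbol import/export records that the extended dylink
// representation now carries next to the memory/table fields.
static void dumpDylinkInfo(const llvm::wasm::WasmDylinkInfo &Info) {
  for (const llvm::wasm::WasmDylinkImportInfo &Imp : Info.ImportInfo)
    llvm::outs() << "import " << Imp.Module << "." << Imp.Field
                 << " flags=" << Imp.Flags << "\n";
  for (const llvm::wasm::WasmDylinkExportInfo &Exp : Info.ExportInfo)
    llvm::outs() << "export " << Exp.Name << " flags=" << Exp.Flags << "\n";
}
```
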
diff --git a/llvm/include/llvm/BinaryFormat/WasmTraits.h b/llvm/include/llvm/BinaryFormat/WasmTraits.h
index 930ee690bcc0..bef9dd3291ca 100644
--- a/llvm/include/llvm/BinaryFormat/WasmTraits.h
+++ b/llvm/include/llvm/BinaryFormat/WasmTraits.h
@@ -18,10 +18,8 @@
namespace llvm {
-template <typename T> struct DenseMapInfo;
-
// Traits for using WasmSignature in a DenseMap.
-template <> struct DenseMapInfo<wasm::WasmSignature> {
+template <> struct DenseMapInfo<wasm::WasmSignature, void> {
static wasm::WasmSignature getEmptyKey() {
wasm::WasmSignature Sig;
Sig.State = wasm::WasmSignature::Empty;
@@ -47,7 +45,7 @@ template <> struct DenseMapInfo<wasm::WasmSignature> {
};
// Traits for using WasmGlobalType in a DenseMap
-template <> struct DenseMapInfo<wasm::WasmGlobalType> {
+template <> struct DenseMapInfo<wasm::WasmGlobalType, void> {
static wasm::WasmGlobalType getEmptyKey() {
return wasm::WasmGlobalType{1, true};
}
@@ -64,7 +62,7 @@ template <> struct DenseMapInfo<wasm::WasmGlobalType> {
};
// Traits for using WasmLimits in a DenseMap
-template <> struct DenseMapInfo<wasm::WasmLimits> {
+template <> struct DenseMapInfo<wasm::WasmLimits, void> {
static wasm::WasmLimits getEmptyKey() {
return wasm::WasmLimits{0xff, 0xff, 0xff};
}
@@ -86,19 +84,19 @@ template <> struct DenseMapInfo<wasm::WasmLimits> {
};
// Traits for using WasmTableType in a DenseMap
-template <> struct DenseMapInfo<wasm::WasmTableType> {
+template <> struct DenseMapInfo<wasm::WasmTableType, void> {
static wasm::WasmTableType getEmptyKey() {
- return wasm::WasmTableType{0,
- DenseMapInfo<wasm::WasmLimits>::getEmptyKey()};
+ return wasm::WasmTableType{
+ 0, DenseMapInfo<wasm::WasmLimits, void>::getEmptyKey()};
}
static wasm::WasmTableType getTombstoneKey() {
return wasm::WasmTableType{
- 1, DenseMapInfo<wasm::WasmLimits>::getTombstoneKey()};
+ 1, DenseMapInfo<wasm::WasmLimits, void>::getTombstoneKey()};
}
static unsigned getHashValue(const wasm::WasmTableType &TableType) {
return hash_combine(
TableType.ElemType,
- DenseMapInfo<wasm::WasmLimits>::getHashValue(TableType.Limits));
+ DenseMapInfo<wasm::WasmLimits, void>::getHashValue(TableType.Limits));
}
static bool isEqual(const wasm::WasmTableType &LHS,
const wasm::WasmTableType &RHS) {
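
The extra `void` template parameter is the default SFINAE slot DenseMapInfo now carries; DenseMap users are unaffected and still pick up these specializations. A sketch, assuming a caller that interns signatures:

```cpp
#include "llvm/ADT/DenseMap.h"
#include "llvm/BinaryFormat/WasmTraits.h"

// Map signatures to assigned type indices; the lookup resolves to the
// DenseMapInfo<wasm::WasmSignature, void> specialization above.
static unsigned getOrAssignSigIndex(
    llvm::DenseMap<llvm::wasm::WasmSignature, unsigned> &SigIndices,
    const llvm::wasm::WasmSignature &Sig) {
  return SigIndices.try_emplace(Sig, SigIndices.size()).first->second;
}
```
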
diff --git a/llvm/include/llvm/BinaryFormat/XCOFF.h b/llvm/include/llvm/BinaryFormat/XCOFF.h
index 8a42d26f3f4a..cffd8618f1e3 100644
--- a/llvm/include/llvm/BinaryFormat/XCOFF.h
+++ b/llvm/include/llvm/BinaryFormat/XCOFF.h
@@ -28,9 +28,14 @@ namespace XCOFF {
constexpr size_t FileNamePadSize = 6;
constexpr size_t NameSize = 8;
constexpr size_t FileHeaderSize32 = 20;
+constexpr size_t FileHeaderSize64 = 24;
+constexpr size_t AuxFileHeaderSize32 = 72;
+constexpr size_t AuxFileHeaderSize64 = 110;
constexpr size_t SectionHeaderSize32 = 40;
+constexpr size_t SectionHeaderSize64 = 72;
constexpr size_t SymbolTableEntrySize = 18;
constexpr size_t RelocationSerializationSize32 = 10;
+constexpr size_t RelocationSerializationSize64 = 14;
constexpr uint16_t RelocOverflow = 65535;
constexpr uint8_t AllocRegNo = 31;
@@ -38,6 +43,17 @@ enum ReservedSectionNum : int16_t { N_DEBUG = -2, N_ABS = -1, N_UNDEF = 0 };
enum MagicNumber : uint16_t { XCOFF32 = 0x01DF, XCOFF64 = 0x01F7 };
+// This field only exists in the XCOFF64 definition.
+enum AuxHeaderFlags64 : uint16_t {
+ SHR_SYMTAB = 0x8000, ///< At exec time, create shared symbol table for program
+ ///< (main program only).
+ FORK_POLICY = 0x4000, ///< Forktree policy specified (main program only).
+ FORK_COR = 0x2000 ///< If _AOUT_FORK_POLICY is set, specify copy-on-reference
+                         ///< if this bit is set. Specify copy-on-write otherwise.
+ ///< If _AOUT_FORK_POLICY is 0, this bit is reserved for
+ ///< future use and should be set to 0.
+};
+
// x_smclas field of x_csect from system header: /usr/include/syms.h
/// Storage Mapping Class definitions.
enum StorageMappingClass : uint8_t {
diff --git a/llvm/include/llvm/Bitcode/BitcodeAnalyzer.h b/llvm/include/llvm/Bitcode/BitcodeAnalyzer.h
index de828be3bf1b..f6fc284da33f 100644
--- a/llvm/include/llvm/Bitcode/BitcodeAnalyzer.h
+++ b/llvm/include/llvm/Bitcode/BitcodeAnalyzer.h
@@ -42,6 +42,8 @@ struct BCDumpOptions {
bool Symbolic = false;
/// Print binary blobs using hex escapes.
bool ShowBinaryBlobs = false;
+ /// Print BLOCKINFO block details.
+ bool DumpBlockinfo = false;
BCDumpOptions(raw_ostream &OS) : OS(OS) {}
};
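
A minimal sketch of enabling the new option when configuring the dumper (the surrounding analyzer driver code is assumed to exist):

```cpp
#include "llvm/Bitcode/BitcodeAnalyzer.h"
#include "llvm/Support/raw_ostream.h"

// Build dump options that also print BLOCKINFO block details.
static llvm::BCDumpOptions makeVerboseDumpOptions() {
  llvm::BCDumpOptions Opts(llvm::outs());
  Opts.Symbolic = true;      // pre-existing option
  Opts.DumpBlockinfo = true; // new in this patch
  return Opts;
}
```
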
diff --git a/llvm/include/llvm/Bitcode/BitcodeCommon.h b/llvm/include/llvm/Bitcode/BitcodeCommon.h
index 6a3e74550bc4..22d1872fe49c 100644
--- a/llvm/include/llvm/Bitcode/BitcodeCommon.h
+++ b/llvm/include/llvm/Bitcode/BitcodeCommon.h
@@ -19,10 +19,14 @@
namespace llvm {
struct AllocaPackedValues {
- using Align = Bitfield::Element<unsigned, 0, 5>;
- using UsedWithInAlloca = Bitfield::Element<bool, Align::NextBit, 1>;
+ // We increased the number of bits needed to represent alignment to be more
+ // than 5, but to preserve backward compatibility we store the upper bits
+ // separately.
+ using AlignLower = Bitfield::Element<unsigned, 0, 5>;
+ using UsedWithInAlloca = Bitfield::Element<bool, AlignLower::NextBit, 1>;
using ExplicitType = Bitfield::Element<bool, UsedWithInAlloca::NextBit, 1>;
using SwiftError = Bitfield::Element<bool, ExplicitType::NextBit, 1>;
+ using AlignUpper = Bitfield::Element<unsigned, SwiftError::NextBit, 3>;
};
} // namespace llvm
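
With the alignment exponent now split across two fields, a reader has to stitch the pieces back together. A sketch of the assumed decoding (the helper name is ours; the writer side mirrors it):

```cpp
#include <cstdint>

#include "llvm/ADT/Bitfields.h"
#include "llvm/Bitcode/BitcodeCommon.h"

// Reassemble the alignment exponent from the low 5 bits plus the 3 bits
// stored above SwiftError.
static unsigned decodeAllocaAlignExponent(uint64_t Packed) {
  using APV = llvm::AllocaPackedValues;
  return llvm::Bitfield::get<APV::AlignLower>(Packed) |
         (llvm::Bitfield::get<APV::AlignUpper>(Packed)
          << APV::AlignLower::Bits);
}
```
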
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 28870afb2fcb..04eb2739cbd5 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -671,6 +671,7 @@ enum AttributeKindCodes {
ATTR_KIND_SWIFT_ASYNC = 75,
ATTR_KIND_NO_SANITIZE_COVERAGE = 76,
ATTR_KIND_ELEMENTTYPE = 77,
+ ATTR_KIND_DISABLE_SANITIZER_INSTRUMENTATION = 78,
};
enum ComdatSelectionKindCodes {
diff --git a/llvm/include/llvm/CodeGen/Analysis.h b/llvm/include/llvm/CodeGen/Analysis.h
index bdfb416d9bd9..60442326d6c7 100644
--- a/llvm/include/llvm/CodeGen/Analysis.h
+++ b/llvm/include/llvm/CodeGen/Analysis.h
@@ -104,9 +104,12 @@ ISD::CondCode getFCmpCodeWithoutNaN(ISD::CondCode CC);
/// getICmpCondCode - Return the ISD condition code corresponding to
/// the given LLVM IR integer condition code.
-///
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred);
+/// getICmpCondCode - Return the LLVM IR integer condition code
+/// corresponding to the given ISD integer condition code.
+ICmpInst::Predicate getICmpCondCode(ISD::CondCode Pred);
+
/// Test if the given instruction is in a position to be optimized
/// with a tail-call. This roughly means that it's in a block with
/// a return and there's nothing that needs to be scheduled
diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
index 5dea86e67d64..d7d3692877de 100644
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -41,7 +41,6 @@ class DIEAbbrev;
class DwarfDebug;
class GCMetadataPrinter;
class GCStrategy;
-class GlobalIndirectSymbol;
class GlobalObject;
class GlobalValue;
class GlobalVariable;
@@ -708,7 +707,7 @@ public:
/// ${:comment}. Targets can override this to add support for their own
/// strange codes.
virtual void PrintSpecial(const MachineInstr *MI, raw_ostream &OS,
- const char *Code) const;
+ StringRef Code) const;
/// Print the MachineOperand as a symbol. Targets with complex handling of
/// symbol references should override the base implementation.
@@ -795,8 +794,8 @@ private:
void emitModuleCommandLines(Module &M);
GCMetadataPrinter *GetOrCreateGCPrinter(GCStrategy &S);
- /// Emit GlobalAlias or GlobalIFunc.
- void emitGlobalIndirectSymbol(Module &M, const GlobalIndirectSymbol &GIS);
+ void emitGlobalAlias(Module &M, const GlobalAlias &GA);
+ void emitGlobalIFunc(Module &M, const GlobalIFunc &GI);
/// This method decides whether the specified basic block requires a label.
bool shouldEmitLabelForBasicBlock(const MachineBasicBlock &MBB) const;
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index e3b834ec42c3..324b7dcfb3ac 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -22,6 +22,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/CodeGen/ISDOpcodes.h"
@@ -282,6 +283,11 @@ public:
return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
}
+ std::pair<const Value *, unsigned>
+ getPredicatedAddrSpace(const Value *V) const {
+ return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
+ }
+
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const {
return nullptr;
@@ -363,8 +369,9 @@ public:
}
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
- ArrayRef<const Value *> Operands) {
- return BaseT::getGEPCost(PointeeType, Ptr, Operands);
+ ArrayRef<const Value *> Operands,
+ TTI::TargetCostKind CostKind) {
+ return BaseT::getGEPCost(PointeeType, Ptr, Operands, CostKind);
}
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
@@ -484,7 +491,8 @@ public:
int getInlinerVectorBonusPercent() { return 150; }
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP) {
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE) {
// This unrolling functionality is target independent, but to provide some
// motivation for its intended use, for x86:
@@ -526,6 +534,15 @@ public:
continue;
}
+ if (ORE) {
+ ORE->emit([&]() {
+ return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
+ L->getHeader())
+ << "advising against unrolling the loop because it "
+ "contains a "
+ << ore::NV("Call", &I);
+ });
+ }
return;
}
}
@@ -653,6 +670,7 @@ public:
}
Optional<unsigned> getMaxVScale() const { return None; }
+ Optional<unsigned> getVScaleForTuning() const { return None; }
/// Estimate the overhead of scalarizing an instruction. Insert and Extract
/// are set if the demanded result elements need to be inserted and/or
@@ -686,7 +704,7 @@ public:
bool Extract) {
auto *Ty = cast<FixedVectorType>(InTy);
- APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements());
+ APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}
@@ -737,8 +755,7 @@ public:
unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
InstructionCost getArithmeticInstrCost(
- unsigned Opcode, Type *Ty,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -1102,6 +1119,39 @@ public:
return LT.first;
}
+ InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
+ int VF,
+ const APInt &DemandedDstElts,
+ TTI::TargetCostKind CostKind) {
+ assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
+ "Unexpected size of DemandedDstElts.");
+
+ InstructionCost Cost;
+
+ auto *SrcVT = FixedVectorType::get(EltTy, VF);
+ auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
+
+ // The Mask shuffling cost is extract all the elements of the Mask
+ // and insert each of them Factor times into the wide vector:
+ //
+ // E.g. an interleaved group with factor 3:
+ // %mask = icmp ult <8 x i32> %vec1, %vec2
+ // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
+ // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
+ // The cost is estimated as extract all mask elements from the <8xi1> mask
+ // vector and insert them factor times into the <24xi1> shuffled mask
+ // vector.
+ APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
+ Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
+ /*Insert*/ false,
+ /*Extract*/ true);
+ Cost +=
+ thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
+ /*Insert*/ true, /*Extract*/ false);
+
+ return Cost;
+ }
+
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
MaybeAlign Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind,
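
A hedged example of querying the new replication-shuffle hook, assuming the matching TargetTransformInfo entry point added alongside this implementation and a TTI/LLVMContext available at the call site:

```cpp
#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Type.h"

// Cost of replicating each lane of an <8 x i1> mask three times, with all
// 24 destination lanes demanded.
static llvm::InstructionCost
replicationCost(const llvm::TargetTransformInfo &TTI, llvm::LLVMContext &Ctx) {
  llvm::APInt DemandedDstElts = llvm::APInt::getAllOnes(3 * 8);
  return TTI.getReplicationShuffleCost(
      llvm::Type::getInt1Ty(Ctx), /*ReplicationFactor=*/3, /*VF=*/8,
      DemandedDstElts, llvm::TargetTransformInfo::TCK_RecipThroughput);
}
```
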
@@ -1201,9 +1251,9 @@ public:
// used (those corresponding to elements [0:1] and [8:9] of the unlegalized
// type). The other loads are unused.
//
- // We only scale the cost of loads since interleaved store groups aren't
- // allowed to have gaps.
- if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
+ // TODO: Note that legalization can turn masked loads/stores into unmasked
+ // (legalized) loads/stores. This can be reflected in the cost.
+ if (Cost.isValid() && VecTySize > VecTyLTSize) {
// The number of loads of a legal type it will take to represent a load
// of the unlegalized vector type.
unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
@@ -1220,10 +1270,24 @@ public:
// Scale the cost of the load by the fraction of legal instructions that
// will be used.
- Cost *= UsedInsts.count() / NumLegalInsts;
+ Cost = divideCeil(UsedInsts.count() * Cost.getValue().getValue(),
+ NumLegalInsts);
}
// Then plus the cost of interleave operation.
+ assert(Indices.size() <= Factor &&
+ "Interleaved memory op has too many members");
+
+ const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
+ const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
+
+ APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
+ for (unsigned Index : Indices) {
+ assert(Index < Factor && "Invalid index for interleaved memory op");
+ for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
+ DemandedLoadStoreElts.setBit(Index + Elm * Factor);
+ }
+
if (Opcode == Instruction::Load) {
// The interleave cost is similar to extract sub vectors' elements
// from the wide vector, and insert them into sub vectors.
@@ -1233,79 +1297,56 @@ public:
// %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
// The cost is estimated as extract elements at 0, 2, 4, 6 from the
// <8 x i32> vector and insert them into a <4 x i32> vector.
-
- assert(Indices.size() <= Factor &&
- "Interleaved memory op has too many members");
-
- for (unsigned Index : Indices) {
- assert(Index < Factor && "Invalid index for interleaved memory op");
-
- // Extract elements from loaded vector for each sub vector.
- for (unsigned i = 0; i < NumSubElts; i++)
- Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
- Index + i * Factor);
- }
-
- InstructionCost InsSubCost = 0;
- for (unsigned i = 0; i < NumSubElts; i++)
- InsSubCost +=
- thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, i);
-
+ InstructionCost InsSubCost =
+ thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
+ /*Insert*/ true, /*Extract*/ false);
Cost += Indices.size() * InsSubCost;
+ Cost +=
+ thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
+ /*Insert*/ false, /*Extract*/ true);
} else {
- // The interleave cost is extract all elements from sub vectors, and
+ // The interleave cost is extract elements from sub vectors, and
// insert them into the wide vector.
//
- // E.g. An interleaved store of factor 2:
- // %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
- // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
- // The cost is estimated as extract all elements from both <4 x i32>
- // vectors and insert into the <8 x i32> vector.
-
- InstructionCost ExtSubCost = 0;
- for (unsigned i = 0; i < NumSubElts; i++)
- ExtSubCost +=
- thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
- Cost += ExtSubCost * Factor;
-
- for (unsigned i = 0; i < NumElts; i++)
- Cost += static_cast<T *>(this)
- ->getVectorInstrCost(Instruction::InsertElement, VT, i);
+ // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
+ // (using VF=4):
+ // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
+ // %gaps.mask = <true, true, false, true, true, false,
+ // true, true, false, true, true, false>
+ // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
+ // i32 Align, <12 x i1> %gaps.mask
+ // The cost is estimated as extract all elements (of actual members,
+ // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
+ // i32> vector.
+ InstructionCost ExtSubCost =
+ thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
+ /*Insert*/ false, /*Extract*/ true);
+ Cost += ExtSubCost * Indices.size();
+ Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
+ /*Insert*/ true,
+ /*Extract*/ false);
}
if (!UseMaskForCond)
return Cost;
Type *I8Type = Type::getInt8Ty(VT->getContext());
- auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
- SubVT = FixedVectorType::get(I8Type, NumSubElts);
-
- // The Mask shuffling cost is extract all the elements of the Mask
- // and insert each of them Factor times into the wide vector:
- //
- // E.g. an interleaved group with factor 3:
- // %mask = icmp ult <8 x i32> %vec1, %vec2
- // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
- // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
- // The cost is estimated as extract all mask elements from the <8xi1> mask
- // vector and insert them factor times into the <24xi1> shuffled mask
- // vector.
- for (unsigned i = 0; i < NumSubElts; i++)
- Cost +=
- thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
- for (unsigned i = 0; i < NumElts; i++)
- Cost +=
- thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);
+ Cost += thisT()->getReplicationShuffleCost(
+ I8Type, Factor, NumSubElts,
+ UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
+ CostKind);
// The Gaps mask is invariant and created outside the loop, therefore the
// cost of creating it is not accounted for here. However if we have both
// a MaskForGaps and some other mask that guards the execution of the
// memory access, we need to account for the cost of And-ing the two masks
// inside the loop.
- if (UseMaskForGaps)
+ if (UseMaskForGaps) {
+ auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
CostKind);
+ }
return Cost;
}
@@ -1460,10 +1501,10 @@ public:
Type *CondTy = RetTy->getWithNewBitWidth(1);
Cost +=
thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ CmpInst::ICMP_EQ, CostKind);
Cost +=
thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ CmpInst::ICMP_EQ, CostKind);
}
return Cost;
}
@@ -1689,26 +1730,34 @@ public:
return thisT()->getMinMaxReductionCost(
VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
/*IsUnsigned=*/true, CostKind);
- case Intrinsic::abs:
+ case Intrinsic::abs: {
+ // abs(X) = select(icmp(X,0),X,sub(0,X))
+ Type *CondTy = RetTy->getWithNewBitWidth(1);
+ CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
+ InstructionCost Cost = 0;
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
+ Pred, CostKind);
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
+ Pred, CostKind);
+ // TODO: Should we add an OperandValueProperties::OP_Zero property?
+ Cost += thisT()->getArithmeticInstrCost(
+ BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
+ return Cost;
+ }
case Intrinsic::smax:
case Intrinsic::smin:
case Intrinsic::umax:
case Intrinsic::umin: {
- // abs(X) = select(icmp(X,0),X,sub(0,X))
// minmax(X,Y) = select(icmp(X,Y),X,Y)
Type *CondTy = RetTy->getWithNewBitWidth(1);
+ bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
+ CmpInst::Predicate Pred =
+ IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
InstructionCost Cost = 0;
- // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code.
- Cost +=
- thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
- Cost +=
- thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
- // TODO: Should we add an OperandValueProperties::OP_Zero property?
- if (IID == Intrinsic::abs)
- Cost += thisT()->getArithmeticInstrCost(
- BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
+ Pred, CostKind);
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
+ Pred, CostKind);
return Cost;
}
case Intrinsic::sadd_sat:
@@ -1719,6 +1768,7 @@ public:
Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
? Intrinsic::sadd_with_overflow
: Intrinsic::ssub_with_overflow;
+ CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
// SatMax -> Overflow && SumDiff < 0
// SatMin -> Overflow && SumDiff >= 0
@@ -1726,12 +1776,10 @@ public:
IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
nullptr, ScalarizationCostPassed);
Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
- Cost +=
- thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
- Cost += 2 * thisT()->getCmpSelInstrCost(
- BinaryOperator::Select, RetTy, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
+ Pred, CostKind);
+ Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
+ CondTy, Pred, CostKind);
return Cost;
}
case Intrinsic::uadd_sat:
@@ -1784,23 +1832,16 @@ public:
? BinaryOperator::Add
: BinaryOperator::Sub;
- // LHSSign -> LHS >= 0
- // RHSSign -> RHS >= 0
- // SumSign -> Sum >= 0
- //
// Add:
- // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
+ // Overflow -> (Result < LHS) ^ (RHS < 0)
// Sub:
- // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
+ // Overflow -> (Result < LHS) ^ (RHS > 0)
InstructionCost Cost = 0;
Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
- Cost += 3 * thisT()->getCmpSelInstrCost(
- Instruction::ICmp, SumTy, OverflowTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
Cost += 2 * thisT()->getCmpSelInstrCost(
- Instruction::Select, OverflowTy, OverflowTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
- Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy,
+ Instruction::ICmp, SumTy, OverflowTy,
+ CmpInst::ICMP_SGT, CostKind);
+ Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
CostKind);
return Cost;
}
@@ -1811,12 +1852,15 @@ public:
unsigned Opcode = IID == Intrinsic::uadd_with_overflow
? BinaryOperator::Add
: BinaryOperator::Sub;
+ CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
+ ? CmpInst::ICMP_ULT
+ : CmpInst::ICMP_UGT;
InstructionCost Cost = 0;
Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
Cost +=
thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ Pred, CostKind);
return Cost;
}
case Intrinsic::smul_with_overflow:
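
The overflow checks whose costs are modelled in the two hunks above reduce to simple comparisons. A sketch of the identities for 32-bit scalars with wrapping arithmetic (illustration only, not code from the patch):

```cpp
#include <cstdint>

static bool saddOverflows(int32_t L, int32_t R) {
  int32_t Sum = static_cast<int32_t>(static_cast<uint32_t>(L) +
                                     static_cast<uint32_t>(R)); // wrapping add
  return (Sum < L) ^ (R < 0); // Overflow -> (Result < LHS) ^ (RHS < 0)
}

static bool uaddOverflows(uint32_t L, uint32_t R) {
  return L + R < L; // Overflow -> Result < LHS (the ICMP_ULT case above)
}
```
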
@@ -1825,9 +1869,9 @@ public:
Type *OverflowTy = RetTy->getContainedType(1);
unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
+ bool IsSigned = IID == Intrinsic::smul_with_overflow;
- unsigned ExtOp =
- IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
+ unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
TTI::CastContextHint CCH = TTI::CastContextHint::None;
InstructionCost Cost = 0;
@@ -1836,18 +1880,17 @@ public:
thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
CCH, CostKind);
- Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy,
+ Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy,
CostKind, TTI::OK_AnyValue,
TTI::OK_UniformConstantValue);
- if (IID == Intrinsic::smul_with_overflow)
+ if (IsSigned)
Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
CostKind, TTI::OK_AnyValue,
TTI::OK_UniformConstantValue);
- Cost +=
- thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ Cost += thisT()->getCmpSelInstrCost(
+ BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
return Cost;
}
case Intrinsic::ctpop:
@@ -1974,16 +2017,16 @@ public:
/// \param RetTy Return value types.
/// \param Tys Argument types.
/// \returns The cost of Call instruction.
- InstructionCost
- getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
- TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) {
+ InstructionCost getCallInstrCost(Function *F, Type *RetTy,
+ ArrayRef<Type *> Tys,
+ TTI::TargetCostKind CostKind) {
return 10;
}
unsigned getNumberOfParts(Type *Tp) {
std::pair<InstructionCost, MVT> LT =
getTLI()->getTypeLegalizationCost(DL, Tp);
- return *LT.first.getValue();
+ return LT.first.isValid() ? *LT.first.getValue() : 0;
}
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
@@ -2060,7 +2103,8 @@ public:
// By default reductions need one shuffle per reduction level.
ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
- ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty);
+ ArithCost +=
+ NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
return ShuffleCost + ArithCost +
thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
}
diff --git a/llvm/include/llvm/CodeGen/CodeGenCommonISel.h b/llvm/include/llvm/CodeGen/CodeGenCommonISel.h
new file mode 100644
index 000000000000..270f935b6738
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/CodeGenCommonISel.h
@@ -0,0 +1,219 @@
+//===- CodeGenCommonISel.h - Common code between ISels ---------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares common utilities that are shared between SelectionDAG and
+// GlobalISel frameworks.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_CODEGENCOMMONISEL_H
+#define LLVM_CODEGEN_CODEGENCOMMONISEL_H
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include <cassert>
+namespace llvm {
+
+class BasicBlock;
+class MachineBasicBlock;
+/// Encapsulates all of the information needed to generate a stack protector
+/// check, and signals to isel when initialized that one needs to be generated.
+///
+/// *NOTE* The following is a high level documentation of SelectionDAG Stack
+/// Protector Generation. This is now also ported be shared with GlobalISel,
+/// but without any significant changes.
+///
+/// High Level Overview of ISel Stack Protector Generation:
+///
+/// Previously, the "stack protector" IR pass handled stack protector
+/// generation. This necessitated splitting basic blocks at the IR level to
+/// create the success/failure basic blocks in the tail of the basic block in
+/// question. As a result of this, calls that would have qualified for the
+/// sibling call optimization were no longer eligible for optimization since
+/// said calls were no longer right in the "tail position" (i.e. the immediate
+/// predecessor of a ReturnInst instruction).
+///
+/// Since the sibling call optimization causes the callee to reuse the caller's
+/// stack, if we could delay the generation of the stack protector check until
+/// later in CodeGen after the sibling call decision was made, we get both the
+/// tail call optimization and the stack protector check!
+///
+/// A few goals in solving this problem were:
+///
+/// 1. Preserve the architecture independence of stack protector generation.
+///
+/// 2. Preserve the normal IR level stack protector check for platforms like
+/// OpenBSD for which we support platform-specific stack protector
+/// generation.
+///
+/// The main problem that guided the present solution is that one can not
+/// solve this problem in an architecture independent manner at the IR level
+/// only. This is because:
+///
+/// 1. The decision on whether or not to perform a sibling call on certain
+/// platforms (for instance i386) requires lower level information
+/// related to available registers that can not be known at the IR level.
+///
+/// 2. Even if the previous point were not true, the decision on whether to
+/// perform a tail call is done in LowerCallTo in SelectionDAG (or
+/// CallLowering in GlobalISel) which occurs after the Stack Protector
+/// Pass. As a result, one would need to put the relevant callinst into the
+/// stack protector check success basic block (where the return inst is
+/// placed) and then move it back later at ISel/MI time before the
+/// stack protector check if the tail call optimization failed. The MI
+/// level option was nixed immediately since it would require
+/// platform-specific pattern matching. The ISel level option was
+/// nixed because SelectionDAG only processes one IR level basic block at a
+/// time implying one could not create a DAG Combine to move the callinst.
+///
+/// To get around this problem:
+///
+/// 1. Since SelectionDAG can only process one block at a time, we can generate
+/// multiple machine basic blocks for one IR level basic block.
+/// This is how we handle bit tests and switches.
+///
+/// 2. At the MI level, tail calls are represented via a special return
+/// MIInst called "tcreturn". Thus if we know the basic block in which we
+/// wish to insert the stack protector check, we get the correct behavior
+/// by always inserting the stack protector check right before the return
+/// statement. This is a "magical transformation" since no matter where
+/// the stack protector check intrinsic is, we always insert the stack
+/// protector check code at the end of the BB.
+///
+/// Given the aforementioned constraints, the following solution was devised:
+///
+/// 1. On platforms that do not support ISel stack protector check
+/// generation, allow for the normal IR level stack protector check
+/// generation to continue.
+///
+/// 2. On platforms that do support ISel stack protector check
+/// generation:
+///
+/// a. Use the IR level stack protector pass to decide if a stack
+/// protector is required/which BB we insert the stack protector check
+/// in by reusing the logic already therein.
+///
+/// b. After we finish selecting the basic block, we produce the validation
+/// code with one of these techniques:
+/// 1) with a call to a guard check function
+/// 2) with inlined instrumentation
+///
+/// 1) We insert a call to the check function before the terminator.
+///
+/// 2) We first find a splice point in the parent basic block
+/// before the terminator and then splice the terminator of said basic
+/// block into the success basic block. Then we code-gen a new tail for
+/// the parent basic block consisting of the two loads, the comparison,
+/// and finally two branches to the success/failure basic blocks. We
+/// conclude by code-gening the failure basic block if we have not
+/// code-gened it already (all stack protector checks we generate in
+/// the same function, use the same failure basic block).
+class StackProtectorDescriptor {
+public:
+ StackProtectorDescriptor() = default;
+
+ /// Returns true if all fields of the stack protector descriptor are
+ /// initialized implying that we should/are ready to emit a stack protector.
+ bool shouldEmitStackProtector() const {
+ return ParentMBB && SuccessMBB && FailureMBB;
+ }
+
+ bool shouldEmitFunctionBasedCheckStackProtector() const {
+ return ParentMBB && !SuccessMBB && !FailureMBB;
+ }
+
+ /// Initialize the stack protector descriptor structure for a new basic
+ /// block.
+ void initialize(const BasicBlock *BB, MachineBasicBlock *MBB,
+ bool FunctionBasedInstrumentation) {
+ // Make sure we are not initialized yet.
+ assert(!shouldEmitStackProtector() && "Stack Protector Descriptor is "
+ "already initialized!");
+ ParentMBB = MBB;
+ if (!FunctionBasedInstrumentation) {
+ SuccessMBB = addSuccessorMBB(BB, MBB, /* IsLikely */ true);
+ FailureMBB = addSuccessorMBB(BB, MBB, /* IsLikely */ false, FailureMBB);
+ }
+ }
+
+ /// Reset state that changes when we handle different basic blocks.
+ ///
+ /// This currently includes:
+ ///
+ /// 1. The specific basic block we are generating a
+ /// stack protector for (ParentMBB).
+ ///
+ /// 2. The successor machine basic block that will contain the tail of
+ /// parent mbb after we create the stack protector check (SuccessMBB). This
+ /// BB is visited only on stack protector check success.
+ void resetPerBBState() {
+ ParentMBB = nullptr;
+ SuccessMBB = nullptr;
+ }
+
+ /// Reset state that only changes when we switch functions.
+ ///
+ /// This currently includes:
+ ///
+ /// 1. FailureMBB since we reuse the failure code path for all stack
+ /// protector checks created in an individual function.
+ ///
+ /// 2.The guard variable since the guard variable we are checking against is
+ /// always the same.
+ void resetPerFunctionState() { FailureMBB = nullptr; }
+
+ MachineBasicBlock *getParentMBB() { return ParentMBB; }
+ MachineBasicBlock *getSuccessMBB() { return SuccessMBB; }
+ MachineBasicBlock *getFailureMBB() { return FailureMBB; }
+
+private:
+ /// The basic block for which we are generating the stack protector.
+ ///
+ /// As a result of stack protector generation, we will splice the
+ /// terminators of this basic block into the successor mbb SuccessMBB and
+ /// replace it with a compare/branch to the successor mbbs
+ /// SuccessMBB/FailureMBB depending on whether or not the stack protector
+ /// was violated.
+ MachineBasicBlock *ParentMBB = nullptr;
+
+ /// A basic block visited on stack protector check success that contains the
+ /// terminators of ParentMBB.
+ MachineBasicBlock *SuccessMBB = nullptr;
+
+ /// This basic block visited on stack protector check failure that will
+ /// contain a call to __stack_chk_fail().
+ MachineBasicBlock *FailureMBB = nullptr;
+
+ /// Add a successor machine basic block to ParentMBB. If the successor mbb
+ /// has not been created yet (i.e. if SuccMBB = 0), then the machine basic
+ /// block will be created. Assign a large weight if IsLikely is true.
+ MachineBasicBlock *addSuccessorMBB(const BasicBlock *BB,
+ MachineBasicBlock *ParentMBB,
+ bool IsLikely,
+ MachineBasicBlock *SuccMBB = nullptr);
+};
+
+/// Find the split point at which to splice the end of BB into its success stack
+/// protector check machine basic block.
+///
+/// On many platforms, due to ABI constraints, terminators, even before register
+/// allocation, use physical registers. This creates an issue for us since
+/// physical registers at this point can not travel across basic
+/// blocks. Luckily, selectiondag always moves physical registers into vregs
+/// when they enter functions and moves them through a sequence of copies back
+/// into the physical registers right before the terminator creating a
+/// ``Terminator Sequence''. This function is searching for the beginning of the
+/// terminator sequence so that we can ensure that we splice off not just the
+/// terminator, but additionally the copies that move the vregs into the
+/// physical registers.
+MachineBasicBlock::iterator
+findSplitPointForStackProtector(MachineBasicBlock *BB,
+ const TargetInstrInfo &TII);
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_CODEGENCOMMONISEL_H
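
A sketch of the per-block lifecycle described in the comments above, with a hypothetical selector driving the descriptor:

```cpp
#include "llvm/CodeGen/CodeGenCommonISel.h"

// Initialize the descriptor for a protected block, emit the check against the
// success/failure blocks, then clear the per-block state. FailureMBB is
// reused until the next resetPerFunctionState().
static void emitCheckForBlock(llvm::StackProtectorDescriptor &SPD,
                              const llvm::BasicBlock *BB,
                              llvm::MachineBasicBlock *MBB) {
  SPD.initialize(BB, MBB, /*FunctionBasedInstrumentation=*/false);
  if (SPD.shouldEmitStackProtector()) {
    // ... build the guard load, compare, and conditional branch to
    // SPD.getSuccessMBB() / SPD.getFailureMBB() here ...
  }
  SPD.resetPerBBState();
}
```
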
diff --git a/llvm/include/llvm/CodeGen/CommandFlags.h b/llvm/include/llvm/CodeGen/CommandFlags.h
index 5a4351756297..ed3cd54df272 100644
--- a/llvm/include/llvm/CodeGen/CommandFlags.h
+++ b/llvm/include/llvm/CodeGen/CommandFlags.h
@@ -48,7 +48,6 @@ Optional<CodeModel::Model> getExplicitCodeModel();
llvm::ExceptionHandling getExceptionModel();
-CodeGenFileType getFileType();
Optional<CodeGenFileType> getExplicitFileType();
CodeGenFileType getFileType();
@@ -74,6 +73,8 @@ llvm::FloatABI::ABIType getFloatABIForCalls();
llvm::FPOpFusion::FPOpFusionMode getFuseFPOps();
+SwiftAsyncFramePointerMode getSwiftAsyncFramePointer();
+
bool getDontPlaceZerosInBSS();
bool getEnableGuaranteedTailCallOpt();
@@ -128,8 +129,6 @@ bool getEnableMachineFunctionSplitter();
bool getEnableDebugEntryValues();
-bool getPseudoProbeForProfiling();
-
bool getValueTrackingVariableLocations();
bool getForceDwarfFrameSection();
@@ -138,6 +137,8 @@ bool getXRayOmitFunctionIndex();
bool getDebugStrictDwarf();
+unsigned getAlignLoops();
+
/// Create this object with static storage to register codegen-related command
/// line options.
struct RegisterCodeGenFlags {
diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
index b6bde0249f88..524730d53694 100644
--- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
+++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -17,7 +17,6 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/IndexedMap.h"
-#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/ISDOpcodes.h"
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
index 6bdaddd9c6f5..9c878d4b087b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
@@ -116,6 +116,9 @@ public:
/// vreg that the swifterror should be copied into after the call.
Register SwiftErrorVReg;
+ /// Original IR callsite corresponding to this call, if available.
+ const CallBase *CB = nullptr;
+
MDNode *KnownCallees = nullptr;
/// True if the call must be tail call optimized.
@@ -259,7 +262,7 @@ public:
/// handle the appropriate COPY (either to or from) and mark any
/// relevant uses/defines as needed.
virtual void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign &VA) = 0;
+ CCValAssign VA) = 0;
/// The specified value has been assigned to a stack
/// location. Load or store it there, with appropriate extension
@@ -279,11 +282,14 @@ public:
}
/// Handle custom values, which may be passed into one or more of \p VAs.
+  /// If the handler wants the assignments to be delayed until after
+ /// mem loc assignments, then it sets \p Thunk to the thunk to do the
+ /// assignment.
/// \return The number of \p VAs that have been assigned after the first
/// one, and which should therefore be skipped from further
/// processing.
- virtual unsigned assignCustomValue(ArgInfo &Arg,
- ArrayRef<CCValAssign> VAs) {
+ virtual unsigned assignCustomValue(ArgInfo &Arg, ArrayRef<CCValAssign> VAs,
+ std::function<void()> *Thunk = nullptr) {
// This is not a pure virtual method because not all targets need to worry
// about custom values.
llvm_unreachable("Custom values not supported");
@@ -315,7 +321,7 @@ public:
/// Provides a default implementation for argument handling.
void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign &VA) override;
+ CCValAssign VA) override;
};
/// Base class for ValueHandlers used for arguments passed to a function call,
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 56459b68dce0..ff4ad4b72636 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -36,7 +36,10 @@ class GISelKnownBits;
class MachineDominatorTree;
class LegalizerInfo;
struct LegalityQuery;
+class RegisterBank;
+class RegisterBankInfo;
class TargetLowering;
+class TargetRegisterInfo;
struct PreferredTuple {
LLT Ty; // The result type of the extend.
@@ -54,6 +57,7 @@ struct IndexedLoadStoreMatchInfo {
struct PtrAddChain {
int64_t Imm;
Register Base;
+ const RegisterBank *Bank;
};
struct RegisterImmPair {
@@ -68,6 +72,16 @@ struct ShiftOfShiftedLogic {
uint64_t ValSum;
};
+using BuildFnTy = std::function<void(MachineIRBuilder &)>;
+
+struct MergeTruncStoresInfo {
+ SmallVector<GStore *> FoundStores;
+ GStore *LowestIdxStore = nullptr;
+ Register WideSrcVal;
+ bool NeedBSwap = false;
+ bool NeedRotate = false;
+};
+
using OperandBuildSteps =
SmallVector<std::function<void(MachineInstrBuilder &)>, 4>;
struct InstructionBuildSteps {
@@ -95,6 +109,8 @@ protected:
GISelKnownBits *KB;
MachineDominatorTree *MDT;
const LegalizerInfo *LI;
+ const RegisterBankInfo *RBI;
+ const TargetRegisterInfo *TRI;
public:
CombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B,
@@ -120,6 +136,22 @@ public:
void replaceRegOpWith(MachineRegisterInfo &MRI, MachineOperand &FromRegOp,
Register ToReg) const;
+ /// Replace the opcode in instruction with a new opcode and inform the
+ /// observer of the changes.
+ void replaceOpcodeWith(MachineInstr &FromMI, unsigned ToOpcode) const;
+
+ /// Get the register bank of \p Reg.
+ /// If Reg has not been assigned a register, a register class,
+ /// or a register bank, then this returns nullptr.
+ ///
+ /// \pre Reg.isValid()
+ const RegisterBank *getRegBank(Register Reg) const;
+
+ /// Set the register bank of \p Reg.
+ /// Does nothing if the RegBank is null.
+ /// This is the counterpart to getRegBank.
+ void setRegBank(Register Reg, const RegisterBank *RegBank);
+
/// If \p MI is COPY, try to combine it.
/// Returns true if MI changed.
bool tryCombineCopy(MachineInstr &MI);
@@ -144,6 +176,9 @@ public:
bool matchCombineExtendingLoads(MachineInstr &MI, PreferredTuple &MatchInfo);
void applyCombineExtendingLoads(MachineInstr &MI, PreferredTuple &MatchInfo);
+ /// Match (and (load x), mask) -> zextload x
+ bool matchCombineLoadWithAndMask(MachineInstr &MI, BuildFnTy &MatchInfo);
+
/// Combine \p MI into a pre-indexed or post-indexed load/store operation if
/// legal and the surrounding code makes it useful.
bool tryCombineIndexedLoadStore(MachineInstr &MI);
@@ -341,6 +376,9 @@ public:
bool matchCombineFAbsOfFAbs(MachineInstr &MI, Register &Src);
void applyCombineFAbsOfFAbs(MachineInstr &MI, Register &Src);
+ /// Transform fabs(fneg(x)) to fabs(x).
+ bool matchCombineFAbsOfFNeg(MachineInstr &MI, BuildFnTy &MatchInfo);
+
/// Transform trunc ([asz]ext x) to x or ([asz]ext x) or (trunc x).
bool matchCombineTruncOfExt(MachineInstr &MI,
std::pair<Register, unsigned> &MatchInfo);
@@ -445,7 +483,7 @@ public:
/// Fold and(and(x, C1), C2) -> C1&C2 ? and(x, C1&C2) : 0
bool matchOverlappingAnd(MachineInstr &MI,
- std::function<void(MachineIRBuilder &)> &MatchInfo);
+ BuildFnTy &MatchInfo);
/// \return true if \p MI is a G_AND instruction whose operands are x and y
/// where x & y == x or x & y == y. (E.g., one of operands is all-ones value.)
@@ -501,8 +539,10 @@ public:
///
/// And check if the tree can be replaced with a M-bit load + possibly a
/// bswap.
- bool matchLoadOrCombine(MachineInstr &MI,
- std::function<void(MachineIRBuilder &)> &MatchInfo);
+ bool matchLoadOrCombine(MachineInstr &MI, BuildFnTy &MatchInfo);
+
+ bool matchTruncStoreMerge(MachineInstr &MI, MergeTruncStoresInfo &MatchInfo);
+ void applyTruncStoreMerge(MachineInstr &MI, MergeTruncStoresInfo &MatchInfo);
bool matchExtendThroughPhis(MachineInstr &MI, MachineInstr *&ExtMI);
void applyExtendThroughPhis(MachineInstr &MI, MachineInstr *&ExtMI);
@@ -519,12 +559,10 @@ public:
/// Use a function which takes in a MachineIRBuilder to perform a combine.
/// By default, it erases the instruction \p MI from the function.
- void applyBuildFn(MachineInstr &MI,
- std::function<void(MachineIRBuilder &)> &MatchInfo);
+ void applyBuildFn(MachineInstr &MI, BuildFnTy &MatchInfo);
/// Use a function which takes in a MachineIRBuilder to perform a combine.
/// This variant does not erase \p MI after calling the build function.
- void applyBuildFnNoErase(MachineInstr &MI,
- std::function<void(MachineIRBuilder &)> &MatchInfo);
+ void applyBuildFnNoErase(MachineInstr &MI, BuildFnTy &MatchInfo);
bool matchFunnelShiftToRotate(MachineInstr &MI);
void applyFunnelShiftToRotate(MachineInstr &MI);
@@ -535,21 +573,57 @@ public:
/// or false constant based off of KnownBits information.
bool matchICmpToTrueFalseKnownBits(MachineInstr &MI, int64_t &MatchInfo);
- bool matchBitfieldExtractFromSExtInReg(
- MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo);
- /// Match: and (lshr x, cst), mask -> ubfx x, cst, width
- bool matchBitfieldExtractFromAnd(
- MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo);
+ /// \returns true if a G_ICMP \p MI can be replaced with its LHS based off of
+ /// KnownBits information.
+ bool
+ matchICmpToLHSKnownBits(MachineInstr &MI,
+ BuildFnTy &MatchInfo);
+
+ /// \returns true if (and (or x, c1), c2) can be replaced with (and x, c2)
+ bool matchAndOrDisjointMask(MachineInstr &MI, BuildFnTy &MatchInfo);
+ bool matchBitfieldExtractFromSExtInReg(MachineInstr &MI,
+ BuildFnTy &MatchInfo);
+ /// Match: and (lshr x, cst), mask -> ubfx x, cst, width
+ bool matchBitfieldExtractFromAnd(MachineInstr &MI, BuildFnTy &MatchInfo);
+
+ /// Match: shr (shl x, n), k -> sbfx/ubfx x, pos, width
+ bool matchBitfieldExtractFromShr(MachineInstr &MI, BuildFnTy &MatchInfo);
+
+ /// Match: shr (and x, n), k -> ubfx x, pos, width
+ bool matchBitfieldExtractFromShrAnd(MachineInstr &MI, BuildFnTy &MatchInfo);
+
+ // Helpers for reassociation:
+ bool matchReassocConstantInnerRHS(GPtrAdd &MI, MachineInstr *RHS,
+ BuildFnTy &MatchInfo);
+ bool matchReassocFoldConstantsInSubTree(GPtrAdd &MI, MachineInstr *LHS,
+ MachineInstr *RHS,
+ BuildFnTy &MatchInfo);
+ bool matchReassocConstantInnerLHS(GPtrAdd &MI, MachineInstr *LHS,
+ MachineInstr *RHS, BuildFnTy &MatchInfo);
/// Reassociate pointer calculations with G_ADD involved, to allow better
/// addressing mode usage.
- bool matchReassocPtrAdd(MachineInstr &MI,
- std::function<void(MachineIRBuilder &)> &MatchInfo);
-
+ bool matchReassocPtrAdd(MachineInstr &MI, BuildFnTy &MatchInfo);
/// Do constant folding when opportunities are exposed after MIR building.
bool matchConstantFold(MachineInstr &MI, APInt &MatchInfo);
+ /// \returns true if it is possible to narrow the width of a scalar binop
+ /// feeding a G_AND instruction \p MI.
+ bool matchNarrowBinopFeedingAnd(MachineInstr &MI, BuildFnTy &MatchInfo);
+
+ /// Given an G_UDIV \p MI expressing a divide by constant, return an
+ /// expression that implements it by multiplying by a magic number.
+ /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
+ MachineInstr *buildUDivUsingMul(MachineInstr &MI);
+ /// Combine G_UDIV by constant into a multiply by magic constant.
+ bool matchUDivByConst(MachineInstr &MI);
+ void applyUDivByConst(MachineInstr &MI);
+
+ // G_UMULH x, (1 << c)) -> x >> (bitwidth - c)
+ bool matchUMulHToLShr(MachineInstr &MI);
+ void applyUMulHToLShr(MachineInstr &MI);
+
/// Try to transform \p MI by using all of the above
/// combine functions. Returns true if changed.
bool tryCombine(MachineInstr &MI);
@@ -560,20 +634,21 @@ public:
/// and rename: s/bool tryEmit/void emit/
bool tryEmitMemcpyInline(MachineInstr &MI);
-private:
- // Memcpy family optimization helpers.
- bool tryEmitMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
- uint64_t KnownLen, Align DstAlign, Align SrcAlign,
- bool IsVolatile);
- bool optimizeMemcpy(MachineInstr &MI, Register Dst, Register Src,
- uint64_t KnownLen, uint64_t Limit, Align DstAlign,
- Align SrcAlign, bool IsVolatile);
- bool optimizeMemmove(MachineInstr &MI, Register Dst, Register Src,
- uint64_t KnownLen, Align DstAlign, Align SrcAlign,
- bool IsVolatile);
- bool optimizeMemset(MachineInstr &MI, Register Dst, Register Val,
- uint64_t KnownLen, Align DstAlign, bool IsVolatile);
+ /// Match:
+ /// (G_UMULO x, 2) -> (G_UADDO x, x)
+ /// (G_SMULO x, 2) -> (G_SADDO x, x)
+ bool matchMulOBy2(MachineInstr &MI, BuildFnTy &MatchInfo);
+ /// Transform (fadd x, fneg(y)) -> (fsub x, y)
+ /// (fadd fneg(x), y) -> (fsub y, x)
+ /// (fsub x, fneg(y)) -> (fadd x, y)
+ /// (fmul fneg(x), fneg(y)) -> (fmul x, y)
+ /// (fdiv fneg(x), fneg(y)) -> (fdiv x, y)
+ /// (fmad fneg(x), fneg(y), z) -> (fmad x, y, z)
+ /// (fma fneg(x), fneg(y), z) -> (fma x, y, z)
+ bool matchRedundantNegOperands(MachineInstr &MI, BuildFnTy &MatchInfo);
+
+private:
/// Given a non-indexed load or store instruction \p MI, find an offset that
/// can be usefully and legally folded into it as a post-indexing operation.
///
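Most of the matchers touched above now take the BuildFnTy alias instead of spelling out std::function<void(MachineIRBuilder &)>. A minimal sketch of how such a match/apply pair is driven (not part of the patch; matchCombineFAbsOfFNeg stands in for any of the new matchers, and BuildFnTy is assumed to be the std::function alias this header introduces):

  #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"

  static bool combineFAbsOfFNeg(llvm::CombinerHelper &Helper,
                                llvm::MachineInstr &MI) {
    llvm::BuildFnTy MatchInfo; // the lambda that performs the rewrite
    if (!Helper.matchCombineFAbsOfFNeg(MI, MatchInfo))
      return false;
    // applyBuildFn runs MatchInfo and erases MI by default.
    Helper.applyBuildFn(MI, MatchInfo);
    return true;
  }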
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index 1162134b2ad2..7103656365b1 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -57,9 +57,9 @@ public:
bool isUnordered() const { return getMMO().isUnordered(); }
/// Returns the size in bytes of the memory access.
- uint64_t getMemSize() { return getMMO().getSize(); }
+ uint64_t getMemSize() const { return getMMO().getSize(); }
/// Returns the size in bits of the memory access.
- uint64_t getMemSizeInBits() { return getMMO().getSizeInBits(); }
+ uint64_t getMemSizeInBits() const { return getMMO().getSizeInBits(); }
static bool classof(const MachineInstr *MI) {
switch (MI->getOpcode()) {
@@ -195,6 +195,37 @@ public:
}
};
+/// Represents a G_PTR_ADD.
+class GPtrAdd : public GenericMachineInstr {
+public:
+ Register getBaseReg() const { return getReg(1); }
+ Register getOffsetReg() const { return getReg(2); }
+
+ static bool classof(const MachineInstr *MI) {
+ return MI->getOpcode() == TargetOpcode::G_PTR_ADD;
+ }
+};
+
+/// Represents a G_IMPLICIT_DEF.
+class GImplicitDef : public GenericMachineInstr {
+public:
+ static bool classof(const MachineInstr *MI) {
+ return MI->getOpcode() == TargetOpcode::G_IMPLICIT_DEF;
+ }
+};
+
+/// Represents a G_SELECT.
+class GSelect : public GenericMachineInstr {
+public:
+ Register getCondReg() const { return getReg(1); }
+ Register getTrueReg() const { return getReg(2); }
+ Register getFalseReg() const { return getReg(3); }
+
+ static bool classof(const MachineInstr *MI) {
+ return MI->getOpcode() == TargetOpcode::G_SELECT;
+ }
+};
+
} // namespace llvm
-#endif // LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H \ No newline at end of file
+#endif // LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H
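A minimal sketch of the intent behind the new wrapper classes (not part of the patch): combines can dyn_cast to GPtrAdd or GSelect instead of checking raw opcodes and hard-coded operand indices.

  #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
  using namespace llvm;

  // Returns true if MI is a G_PTR_ADD whose base register is Base.
  static bool isPtrAddWithBase(MachineInstr &MI, Register Base) {
    if (auto *PtrAdd = dyn_cast<GPtrAdd>(&MI))
      return PtrAdd->getBaseReg() == Base; // operand 1 of G_PTR_ADD
    return false;
  }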
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 8eab8a5846a7..ebe16cd4f58c 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -20,6 +20,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CodeGenCommonISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -466,9 +467,8 @@ private:
bool translateSIToFP(const User &U, MachineIRBuilder &MIRBuilder) {
return translateCast(TargetOpcode::G_SITOFP, U, MIRBuilder);
}
- bool translateUnreachable(const User &U, MachineIRBuilder &MIRBuilder) {
- return true;
- }
+ bool translateUnreachable(const User &U, MachineIRBuilder &MIRBuilder);
+
bool translateSExt(const User &U, MachineIRBuilder &MIRBuilder) {
return translateCast(TargetOpcode::G_SEXT, U, MIRBuilder);
}
@@ -586,6 +586,8 @@ private:
/// stop translating such blocks early.
bool HasTailCall = false;
+ StackProtectorDescriptor SPDescriptor;
+
/// Switch analysis and optimization.
class GISelSwitchLowering : public SwitchCG::SwitchLowering {
public:
@@ -614,8 +616,34 @@ private:
// * Clear the different maps.
void finalizeFunction();
- // Handle emitting jump tables for each basic block.
- void finalizeBasicBlock();
+ // Processing steps done per block. E.g. emitting jump tables, stack
+ // protectors etc. Returns true if no errors, false if there was a problem
+ // that caused an abort.
+ bool finalizeBasicBlock(const BasicBlock &BB, MachineBasicBlock &MBB);
+
+ /// Codegen a new tail for a stack protector check ParentMBB which has had its
+ /// tail spliced into a stack protector check success bb.
+ ///
+ /// For a high level explanation of how this fits into the stack protector
+ /// generation see the comment on the declaration of class
+ /// StackProtectorDescriptor.
+ ///
+ /// \return true if there were no problems.
+ bool emitSPDescriptorParent(StackProtectorDescriptor &SPD,
+ MachineBasicBlock *ParentBB);
+
+ /// Codegen the failure basic block for a stack protector check.
+ ///
+ /// A failure stack protector machine basic block consists simply of a call to
+ /// __stack_chk_fail().
+ ///
+ /// For a high level explanation of how this fits into the stack protector
+ /// generation see the comment on the declaration of class
+ /// StackProtectorDescriptor.
+ ///
+ /// \return true if there were no problems.
+ bool emitSPDescriptorFailure(StackProtectorDescriptor &SPD,
+ MachineBasicBlock *FailureBB);
/// Get the VRegs that represent \p Val.
/// Non-aggregate types have just one corresponding VReg and the list can be
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegacyLegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegacyLegalizerInfo.h
index b1f2103da309..f6704df3f49d 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegacyLegalizerInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegacyLegalizerInfo.h
@@ -478,4 +478,4 @@ private:
} // end namespace llvm
-#endif // define LLVM_CODEGEN_GLOBALISEL_LEGACYLEGALIZERINFO_H
+#endif // LLVM_CODEGEN_GLOBALISEL_LEGACYLEGALIZERINFO_H
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index 44a48927d35a..8a603de2f91d 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -15,6 +15,7 @@
#define LLVM_CODEGEN_GLOBALISEL_LEGALIZATIONARTIFACTCOMBINER_H
#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
@@ -22,6 +23,7 @@
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Register.h"
#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "legalizer"
@@ -52,7 +54,8 @@ public:
bool tryCombineAnyExt(MachineInstr &MI,
SmallVectorImpl<MachineInstr *> &DeadInsts,
- SmallVectorImpl<Register> &UpdatedDefs) {
+ SmallVectorImpl<Register> &UpdatedDefs,
+ GISelObserverWrapper &Observer) {
assert(MI.getOpcode() == TargetOpcode::G_ANYEXT);
Builder.setInstrAndDebugLoc(MI);
@@ -63,7 +66,11 @@ public:
Register TruncSrc;
if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc)))) {
LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;);
- Builder.buildAnyExtOrTrunc(DstReg, TruncSrc);
+ if (MRI.getType(DstReg) == MRI.getType(TruncSrc))
+ replaceRegOrBuildCopy(DstReg, TruncSrc, MRI, Builder, UpdatedDefs,
+ Observer);
+ else
+ Builder.buildAnyExtOrTrunc(DstReg, TruncSrc);
UpdatedDefs.push_back(DstReg);
markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts);
return true;
@@ -120,12 +127,14 @@ public:
return false;
LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;);
LLT SrcTy = MRI.getType(SrcReg);
- APInt MaskVal = APInt::getAllOnesValue(SrcTy.getScalarSizeInBits());
+ APInt MaskVal = APInt::getAllOnes(SrcTy.getScalarSizeInBits());
auto Mask = Builder.buildConstant(
DstTy, MaskVal.zext(DstTy.getScalarSizeInBits()));
- auto Extended = SextSrc ? Builder.buildSExtOrTrunc(DstTy, SextSrc) :
- Builder.buildAnyExtOrTrunc(DstTy, TruncSrc);
- Builder.buildAnd(DstReg, Extended, Mask);
+ if (SextSrc && (DstTy != MRI.getType(SextSrc)))
+ SextSrc = Builder.buildSExtOrTrunc(DstTy, SextSrc).getReg(0);
+ if (TruncSrc && (DstTy != MRI.getType(TruncSrc)))
+ TruncSrc = Builder.buildAnyExtOrTrunc(DstTy, TruncSrc).getReg(0);
+ Builder.buildAnd(DstReg, SextSrc ? SextSrc : TruncSrc, Mask);
markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts);
return true;
}
@@ -176,9 +185,9 @@ public:
LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;);
LLT SrcTy = MRI.getType(SrcReg);
uint64_t SizeInBits = SrcTy.getScalarSizeInBits();
- Builder.buildInstr(
- TargetOpcode::G_SEXT_INREG, {DstReg},
- {Builder.buildAnyExtOrTrunc(DstTy, TruncSrc), SizeInBits});
+ if (DstTy != MRI.getType(TruncSrc))
+ TruncSrc = Builder.buildAnyExtOrTrunc(DstTy, TruncSrc).getReg(0);
+ Builder.buildSExtInReg(DstReg, TruncSrc, SizeInBits);
markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts);
return true;
}
@@ -544,12 +553,14 @@ public:
MachineIRBuilder &MIB;
const LegalizerInfo &LI;
- private:
+ // Stores the best register found in the current query so far.
+ Register CurrentBest = Register();
+
/// Given an concat_vector op \p Concat and a start bit and size, try to
/// find the origin of the value defined by that start position and size.
///
- /// \returns A register if a value can be found, otherwise an empty
- /// Register.
+ /// \returns a register with the requested size, or the current best
+ /// register found during the current query.
Register findValueFromConcat(GConcatVectors &Concat, unsigned StartBit,
unsigned Size) {
assert(Size > 0);
@@ -566,22 +577,22 @@ public:
// FIXME: we might be able return multiple sources? Or create an
// appropriate concat to make it fit.
if (InRegOffset + Size > SrcSize)
- return Register();
+ return CurrentBest;
- // If the bits exactly cover a single source, then return the operand as
- // our value reg.
Register SrcReg = Concat.getReg(StartSrcIdx);
- if (InRegOffset == 0 && Size == SrcSize)
- return SrcReg; // A source operand matches exactly.
+ if (InRegOffset == 0 && Size == SrcSize) {
+ CurrentBest = SrcReg;
+ return findValueFromDefImpl(SrcReg, 0, Size);
+ }
- return findValueFromDef(SrcReg, InRegOffset, Size);
+ return findValueFromDefImpl(SrcReg, InRegOffset, Size);
}
/// Given an build_vector op \p BV and a start bit and size, try to find
/// the origin of the value defined by that start position and size.
///
- /// \returns A register if a value can be found, otherwise an empty
- /// Register.
+ /// \returns a register with the requested size, or the current best
+ /// register found during the current query.
Register findValueFromBuildVector(GBuildVector &BV, unsigned StartBit,
unsigned Size) {
assert(Size > 0);
@@ -596,17 +607,21 @@ public:
unsigned InRegOffset = StartBit % SrcSize;
if (InRegOffset != 0)
- return Register(); // Give up, bits don't start at a scalar source.
+ return CurrentBest; // Give up, bits don't start at a scalar source.
if (Size < SrcSize)
- return Register(); // Scalar source is too large for requested bits.
+ return CurrentBest; // Scalar source is too large for requested bits.
// If the bits cover multiple sources evenly, then create a new
// build_vector to synthesize the required size, if that's been requested.
if (Size > SrcSize) {
if (Size % SrcSize > 0)
- return Register(); // Isn't covered exactly by sources.
+ return CurrentBest; // Isn't covered exactly by sources.
unsigned NumSrcsUsed = Size / SrcSize;
+ // If we're requesting all of the sources, just return this def.
+ if (NumSrcsUsed == BV.getNumSources())
+ return BV.getReg(0);
+
LLT SrcTy = MRI.getType(Src1Reg);
LLT NewBVTy = LLT::fixed_vector(NumSrcsUsed, SrcTy);
@@ -614,7 +629,7 @@ public:
LegalizeActionStep ActionStep =
LI.getAction({TargetOpcode::G_BUILD_VECTOR, {NewBVTy, SrcTy}});
if (ActionStep.Action != LegalizeActions::Legal)
- return Register();
+ return CurrentBest;
SmallVector<Register> NewSrcs;
for (unsigned SrcIdx = StartSrcIdx; SrcIdx < StartSrcIdx + NumSrcsUsed;
@@ -630,8 +645,8 @@ public:
/// Given an G_INSERT op \p MI and a start bit and size, try to find
/// the origin of the value defined by that start position and size.
///
- /// \returns A register if a value can be found, otherwise an empty
- /// Register.
+ /// \returns a register with the requested size, or the current best
+ /// register found during the current query.
Register findValueFromInsert(MachineInstr &MI, unsigned StartBit,
unsigned Size) {
assert(MI.getOpcode() == TargetOpcode::G_INSERT);
@@ -685,28 +700,25 @@ public:
if (EndBit <= InsertOffset || InsertedEndBit <= StartBit) {
SrcRegToUse = ContainerSrcReg;
NewStartBit = StartBit;
- return findValueFromDef(SrcRegToUse, NewStartBit, Size);
+ return findValueFromDefImpl(SrcRegToUse, NewStartBit, Size);
}
if (InsertOffset <= StartBit && EndBit <= InsertedEndBit) {
SrcRegToUse = InsertedReg;
NewStartBit = StartBit - InsertOffset;
- return findValueFromDef(SrcRegToUse, NewStartBit, Size);
+ if (NewStartBit == 0 &&
+ Size == MRI.getType(SrcRegToUse).getSizeInBits())
+ CurrentBest = SrcRegToUse;
+ return findValueFromDefImpl(SrcRegToUse, NewStartBit, Size);
}
// The bit range spans both the inserted and container regions.
return Register();
}
- public:
- ArtifactValueFinder(MachineRegisterInfo &Mri, MachineIRBuilder &Builder,
- const LegalizerInfo &Info)
- : MRI(Mri), MIB(Builder), LI(Info) {}
-
- /// Try to find a source of the value defined in the def \p DefReg, starting
- /// at position \p StartBit with size \p Size.
- /// \returns an empty Register if no value could be found, or \p DefReg if
- /// if that was the best we could do.
- Register findValueFromDef(Register DefReg, unsigned StartBit,
- unsigned Size) {
+ /// Internal implementation for findValueFromDef(). findValueFromDef()
+ /// initializes some data like the CurrentBest register, which this method
+ /// and its callees rely upon.
+ Register findValueFromDefImpl(Register DefReg, unsigned StartBit,
+ unsigned Size) {
MachineInstr *Def = getDefIgnoringCopies(DefReg, MRI);
// If the instruction has a single def, then simply delegate the search.
// For unmerge however with multiple defs, we need to compute the offset
@@ -724,7 +736,7 @@ public:
}
Register SrcReg = Def->getOperand(Def->getNumOperands() - 1).getReg();
Register SrcOriginReg =
- findValueFromDef(SrcReg, StartBit + DefStartBit, Size);
+ findValueFromDefImpl(SrcReg, StartBit + DefStartBit, Size);
if (SrcOriginReg)
return SrcOriginReg;
// Failed to find a further value. If the StartBit and Size perfectly
@@ -732,7 +744,7 @@ public:
// nothing.
if (StartBit == 0 && Size == DefSize)
return DefReg;
- return Register();
+ return CurrentBest;
}
case TargetOpcode::G_BUILD_VECTOR:
return findValueFromBuildVector(cast<GBuildVector>(*Def), StartBit,
@@ -740,41 +752,48 @@ public:
case TargetOpcode::G_INSERT:
return findValueFromInsert(*Def, StartBit, Size);
default:
- return Register();
+ return CurrentBest;
}
}
- };
- bool tryCombineUnmergeValues(GUnmerge &MI,
- SmallVectorImpl<MachineInstr *> &DeadInsts,
- SmallVectorImpl<Register> &UpdatedDefs,
- GISelChangeObserver &Observer) {
- unsigned NumDefs = MI.getNumDefs();
- Register SrcReg = MI.getSourceReg();
- MachineInstr *SrcDef = getDefIgnoringCopies(SrcReg, MRI);
- if (!SrcDef)
- return false;
-
- LLT OpTy = MRI.getType(SrcReg);
- LLT DestTy = MRI.getType(MI.getReg(0));
- unsigned SrcDefIdx = getDefIndex(*SrcDef, SrcReg);
+ public:
+ ArtifactValueFinder(MachineRegisterInfo &Mri, MachineIRBuilder &Builder,
+ const LegalizerInfo &Info)
+ : MRI(Mri), MIB(Builder), LI(Info) {}
- Builder.setInstrAndDebugLoc(MI);
+ /// Try to find a source of the value defined in the def \p DefReg, starting
+ /// at position \p StartBit with size \p Size.
+ /// \returns a register with the requested size, or an empty Register if no
+ /// better value could be found.
+ Register findValueFromDef(Register DefReg, unsigned StartBit,
+ unsigned Size) {
+ CurrentBest = Register();
+ Register FoundReg = findValueFromDefImpl(DefReg, StartBit, Size);
+ return FoundReg != DefReg ? FoundReg : Register();
+ }
- auto tryCombineViaValueFinder = [&]() {
- ArtifactValueFinder ValueFinder(MRI, Builder, LI);
+ /// Try to combine the defs of an unmerge \p MI by attempting to find
+ /// values that provides the bits for each def reg.
+ /// \returns true if all the defs of the unmerge have been made dead.
+ bool tryCombineUnmergeDefs(GUnmerge &MI, GISelChangeObserver &Observer,
+ SmallVectorImpl<Register> &UpdatedDefs) {
+ unsigned NumDefs = MI.getNumDefs();
+ LLT DestTy = MRI.getType(MI.getReg(0));
SmallBitVector DeadDefs(NumDefs);
for (unsigned DefIdx = 0; DefIdx < NumDefs; ++DefIdx) {
Register DefReg = MI.getReg(DefIdx);
- Register FoundVal =
- ValueFinder.findValueFromDef(DefReg, 0, DestTy.getSizeInBits());
- if (!FoundVal || FoundVal == DefReg)
+ if (MRI.use_nodbg_empty(DefReg)) {
+ DeadDefs[DefIdx] = true;
+ continue;
+ }
+ Register FoundVal = findValueFromDef(DefReg, 0, DestTy.getSizeInBits());
+ if (!FoundVal)
continue;
if (MRI.getType(FoundVal) != DestTy)
continue;
- replaceRegOrBuildCopy(DefReg, FoundVal, MRI, Builder, UpdatedDefs,
+ replaceRegOrBuildCopy(DefReg, FoundVal, MRI, MIB, UpdatedDefs,
Observer);
// We only want to replace the uses, not the def of the old reg.
Observer.changingInstr(MI);
@@ -782,12 +801,31 @@ public:
Observer.changedInstr(MI);
DeadDefs[DefIdx] = true;
}
- if (DeadDefs.all()) {
- markInstAndDefDead(MI, *SrcDef, DeadInsts, SrcDefIdx);
- return true;
- }
+ return DeadDefs.all();
+ }
+ };
+
+ bool tryCombineUnmergeValues(GUnmerge &MI,
+ SmallVectorImpl<MachineInstr *> &DeadInsts,
+ SmallVectorImpl<Register> &UpdatedDefs,
+ GISelChangeObserver &Observer) {
+ unsigned NumDefs = MI.getNumDefs();
+ Register SrcReg = MI.getSourceReg();
+ MachineInstr *SrcDef = getDefIgnoringCopies(SrcReg, MRI);
+ if (!SrcDef)
return false;
- };
+
+ LLT OpTy = MRI.getType(SrcReg);
+ LLT DestTy = MRI.getType(MI.getReg(0));
+ unsigned SrcDefIdx = getDefIndex(*SrcDef, SrcReg);
+
+ Builder.setInstrAndDebugLoc(MI);
+
+ ArtifactValueFinder Finder(MRI, Builder, LI);
+ if (Finder.tryCombineUnmergeDefs(MI, Observer, UpdatedDefs)) {
+ markInstAndDefDead(MI, *SrcDef, DeadInsts, SrcDefIdx);
+ return true;
+ }
if (auto *SrcUnmerge = dyn_cast<GUnmerge>(SrcDef)) {
// %0:_(<4 x s16>) = G_FOO
@@ -813,7 +851,7 @@ public:
return false;
break;
default:
- return tryCombineViaValueFinder();
+ return false;
}
auto NewUnmerge = Builder.buildUnmerge(DestTy, SrcUnmergeSrc);
@@ -845,11 +883,7 @@ public:
ConvertOp, OpTy, DestTy)) {
// We might have a chance to combine later by trying to combine
// unmerge(cast) first
- if (tryFoldUnmergeCast(MI, *SrcDef, DeadInsts, UpdatedDefs))
- return true;
-
- // Try using the value finder.
- return tryCombineViaValueFinder();
+ return tryFoldUnmergeCast(MI, *SrcDef, DeadInsts, UpdatedDefs);
}
const unsigned NumMergeRegs = MergeI->getNumOperands() - 1;
@@ -1042,7 +1076,7 @@ public:
default:
return false;
case TargetOpcode::G_ANYEXT:
- Changed = tryCombineAnyExt(MI, DeadInsts, UpdatedDefs);
+ Changed = tryCombineAnyExt(MI, DeadInsts, UpdatedDefs, WrapperObserver);
break;
case TargetOpcode::G_ZEXT:
Changed = tryCombineZExt(MI, DeadInsts, UpdatedDefs, WrapperObserver);
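A minimal sketch of the reworked ArtifactValueFinder contract (not part of the patch; written as if it ran inside LegalizationArtifactCombiner, where MRI, Builder and LI are in scope): findValueFromDef() now returns an empty Register when nothing better than the queried def itself was found.

  Register lookThroughArtifacts(Register DefReg) {
    ArtifactValueFinder Finder(MRI, Builder, LI);
    Register Found = Finder.findValueFromDef(
        DefReg, /*StartBit=*/0, MRI.getType(DefReg).getSizeInBits());
    // Empty means "no better value"; fall back to the original def.
    return Found.isValid() ? Found : DefReg;
  }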
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 67141f3a6326..74615c73741a 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -256,6 +256,20 @@ private:
LLT SrcTy, LLT NarrowTy,
unsigned ScalarOpc);
+ // Memcpy family legalization helpers.
+ LegalizeResult lowerMemset(MachineInstr &MI, Register Dst, Register Val,
+ uint64_t KnownLen, Align Alignment,
+ bool IsVolatile);
+ LegalizeResult lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
+ uint64_t KnownLen, Align DstAlign,
+ Align SrcAlign, bool IsVolatile);
+ LegalizeResult lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
+ uint64_t KnownLen, uint64_t Limit, Align DstAlign,
+ Align SrcAlign, bool IsVolatile);
+ LegalizeResult lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
+ uint64_t KnownLen, Align DstAlign, Align SrcAlign,
+ bool IsVolatile);
+
public:
/// Return the alignment to use for a stack temporary object with the given
/// type.
@@ -402,6 +416,9 @@ public:
LegalizeResult lowerDIVREM(MachineInstr &MI);
LegalizeResult lowerAbsToAddXor(MachineInstr &MI);
LegalizeResult lowerAbsToMaxNeg(MachineInstr &MI);
+ LegalizeResult lowerVectorReduction(MachineInstr &MI);
+ LegalizeResult lowerMemcpyInline(MachineInstr &MI);
+ LegalizeResult lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen = 0);
};
/// Helper function that creates a libcall to the given \p Name using the given
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index 4fdfabbfb161..68c14240ebc7 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -15,8 +15,6 @@
#define LLVM_CODEGEN_GLOBALISEL_LEGALIZERINFO_H
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallVector.h"
@@ -113,6 +111,14 @@ struct LegalityQuery {
LLT MemoryTy;
uint64_t AlignInBits;
AtomicOrdering Ordering;
+
+ MemDesc() = default;
+ MemDesc(LLT MemoryTy, uint64_t AlignInBits, AtomicOrdering Ordering)
+ : MemoryTy(MemoryTy), AlignInBits(AlignInBits), Ordering(Ordering) {}
+ MemDesc(const MachineMemOperand &MMO)
+ : MemoryTy(MMO.getMemoryType()),
+ AlignInBits(MMO.getAlign().value() * 8),
+ Ordering(MMO.getSuccessOrdering()) {}
};
/// Operations which require memory can use this to place requirements on the
@@ -293,6 +299,10 @@ LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size);
/// type that's wider than the given size.
LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size);
+/// True iff the specified type index is a scalar whose size is not a multiple
+/// of Size.
+LegalityPredicate sizeNotMultipleOf(unsigned TypeIdx, unsigned Size);
+
/// True iff the specified type index is a scalar whose size is not a power of
/// 2.
LegalityPredicate sizeNotPow2(unsigned TypeIdx);
@@ -348,6 +358,11 @@ LegalizeMutation changeElementSizeTo(unsigned TypeIdx, unsigned FromTypeIdx);
/// next power of 2.
LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min = 0);
+/// Widen the scalar type or vector element type for the given type index to
+/// next multiple of \p Size.
+LegalizeMutation widenScalarOrEltToNextMultipleOf(unsigned TypeIdx,
+ unsigned Size);
+
/// Add more elements to the type for the given type index to the next power of
/// 2.
LegalizeMutation moreElementsToNextPow2(unsigned TypeIdx, unsigned Min = 0);
@@ -828,6 +843,16 @@ public:
LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx, MinSize));
}
+ /// Widen the scalar to the next multiple of Size. No effect if the
+ /// type is not a scalar or is a multiple of Size.
+ LegalizeRuleSet &widenScalarToNextMultipleOf(unsigned TypeIdx,
+ unsigned Size) {
+ using namespace LegalityPredicates;
+ return actionIf(
+ LegalizeAction::WidenScalar, sizeNotMultipleOf(typeIdx(TypeIdx), Size),
+ LegalizeMutations::widenScalarOrEltToNextMultipleOf(TypeIdx, Size));
+ }
+
/// Widen the scalar or vector element type to the next power of two that is
/// at least MinSize. No effect if the scalar size is a power of two.
LegalizeRuleSet &widenScalarOrEltToNextPow2(unsigned TypeIdx,
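A minimal sketch of how a target might use the new rule (not part of the patch; assumed to run inside a target's LegalizerInfo constructor, with s32/s64 being LLT::scalar(32) and LLT::scalar(64)):

  const LLT s32 = LLT::scalar(32);
  const LLT s64 = LLT::scalar(64);
  getActionDefinitionsBuilder(TargetOpcode::G_MUL)
      .legalFor({s32, s64})
      // Round odd-sized scalars up to the next multiple of 32 bits first.
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, s32, s64);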
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h b/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h
new file mode 100644
index 000000000000..29575f386d7a
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h
@@ -0,0 +1,165 @@
+//== llvm/CodeGen/GlobalISel/LoadStoreOpt.h - LoadStoreOpt -------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// This is an optimization pass for GlobalISel generic memory operations.
+/// Specifically, it focuses on merging stores and loads to consecutive
+/// addresses.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_GLOBALISEL_LOADSTOREOPT_H
+#define LLVM_CODEGEN_GLOBALISEL_LOADSTOREOPT_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+
+namespace llvm {
+// Forward declarations.
+class MachineRegisterInfo;
+class TargetTransformInfo;
+namespace GISelAddressing {
+/// Helper struct to store a base, index and offset that forms an address
+struct BaseIndexOffset {
+ Register BaseReg;
+ Register IndexReg;
+ int64_t Offset = 0;
+ bool IsIndexSignExt = false;
+};
+
+/// Returns a BaseIndexOffset which describes the pointer in \p Ptr.
+BaseIndexOffset getPointerInfo(Register Ptr, MachineRegisterInfo &MRI);
+
+/// Compute whether or not a memory access at \p MI1 aliases with an access at
+/// \p MI2. \returns true if either alias/no-alias is known. Sets \p IsAlias
+/// accordingly.
+bool aliasIsKnownForLoadStore(const MachineInstr &MI1, const MachineInstr &MI2,
+ bool &IsAlias, MachineRegisterInfo &MRI);
+
+/// Returns true if the instruction \p MI may alias \p Other.
+/// This function uses multiple strategies to detect aliasing, whereas
+/// aliasIsKnownForLoadStore just looks at the addresses of load/stores and is
+/// tries to reason about base/index/offsets.
+bool instMayAlias(const MachineInstr &MI, const MachineInstr &Other,
+ MachineRegisterInfo &MRI, AliasAnalysis *AA);
+} // namespace GISelAddressing
+
+using namespace GISelAddressing;
+
+class LoadStoreOpt : public MachineFunctionPass {
+public:
+ static char ID;
+
+private:
+ /// An input function to decide if the pass should run or not
+ /// on the given MachineFunction.
+ std::function<bool(const MachineFunction &)> DoNotRunPass;
+
+ MachineRegisterInfo *MRI;
+ const TargetLowering *TLI;
+ MachineFunction *MF;
+ AliasAnalysis *AA;
+ const LegalizerInfo *LI;
+
+ MachineIRBuilder Builder;
+
+ /// Initialize the field members using \p MF.
+ void init(MachineFunction &MF);
+
+ class StoreMergeCandidate {
+ public:
+ // The base pointer used as the base for all stores in this candidate.
+ Register BasePtr;
+ // Our algorithm is very simple at the moment. We assume that in instruction
+ // order stores are writing to incrementing consecutive addresses. So when
+ // we walk the block in reverse order, the next eligible store must write to
+ // an offset one store width lower than CurrentLowestOffset.
+ uint64_t CurrentLowestOffset;
+ SmallVector<GStore *> Stores;
+ // A vector of MachineInstr/unsigned pairs to denote potential aliases that
+ // need to be checked before the candidate is considered safe to merge. The
+ // unsigned value is an index into the Stores vector. The indexed store is
+ // the highest-indexed store that has already been checked to not have an
+ // alias with the instruction. We record this so we don't have to repeat
+ // alias checks that have already been done, only those with stores added
+ // after the potential alias is recorded.
+ SmallVector<std::pair<MachineInstr *, unsigned>> PotentialAliases;
+
+ void addPotentialAlias(MachineInstr &MI);
+
+ /// Reset this candidate back to an empty one.
+ void reset() {
+ Stores.clear();
+ PotentialAliases.clear();
+ CurrentLowestOffset = 0;
+ BasePtr = Register();
+ }
+ };
+
+ bool isLegalOrBeforeLegalizer(const LegalityQuery &Query,
+ MachineFunction &MF) const;
+ /// If the given store is valid to be a member of the candidate, add it and
+ /// return true. Otherwise, returns false.
+ bool addStoreToCandidate(GStore &MI, StoreMergeCandidate &C);
+ /// Returns true if the instruction \p MI would potentially alias with any
+ /// stores in the candidate \p C.
+ bool operationAliasesWithCandidate(MachineInstr &MI, StoreMergeCandidate &C);
+ /// Merges the stores in the given vector into a wide store.
+ /// \returns true if at least some of the stores were merged.
+ /// This may decide not to merge stores if heuristics predict it will not be
+ /// worth it.
+ bool mergeStores(SmallVectorImpl<GStore *> &StoresToMerge);
+ /// Perform a merge of all the stores in \p Stores into a single store.
+ /// Erases the old stores from the block when finished.
+ /// \returns true if merging was done. It may fail to perform a merge if
+ /// there are issues with materializing legal wide values.
+ bool doSingleStoreMerge(SmallVectorImpl<GStore *> &Stores);
+ bool processMergeCandidate(StoreMergeCandidate &C);
+ bool mergeBlockStores(MachineBasicBlock &MBB);
+ bool mergeFunctionStores(MachineFunction &MF);
+
+ /// Initialize some target-specific data structures for the store merging
+ /// optimization. \p AddrSpace indicates which address space to use when
+ /// probing the legalizer info for legal stores.
+ void initializeStoreMergeTargetInfo(unsigned AddrSpace = 0);
+ /// A map between address space numbers and a bitvector of supported stores
+ /// sizes. Each bit in the bitvector represents whether a store size of
+ /// that bit's value is legal. E.g. if bit 64 is set, then 64 bit scalar
+ /// stores are legal.
+ DenseMap<unsigned, BitVector> LegalStoreSizes;
+ bool IsPreLegalizer;
+ /// Contains instructions to be erased at the end of a block scan.
+ SmallSet<MachineInstr *, 16> InstsToErase;
+
+public:
+ LoadStoreOpt();
+ LoadStoreOpt(std::function<bool(const MachineFunction &)>);
+
+ StringRef getPassName() const override { return "LoadStoreOpt"; }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties()
+ .set(MachineFunctionProperties::Property::IsSSA);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // End namespace llvm.
+
+#endif
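A minimal sketch of how a target could schedule the new pass (not part of the patch; the hook name addGISelStoreMerging is hypothetical, and the predicate is assumed to mean "skip the pass when it returns true", matching the DoNotRunPass member documented above):

  #include "llvm/CodeGen/GlobalISel/LoadStoreOpt.h"
  using namespace llvm;

  void MyTargetPassConfig::addGISelStoreMerging() {
    addPass(new LoadStoreOpt([](const MachineFunction &MF) {
      // Ask the pass not to run at -O0.
      return MF.getTarget().getOptLevel() == CodeGenOpt::None;
    }));
  }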
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
index 4c6b47ab9bc8..e813d030eec3 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -63,7 +63,7 @@ struct ConstantMatch {
int64_t &CR;
ConstantMatch(int64_t &C) : CR(C) {}
bool match(const MachineRegisterInfo &MRI, Register Reg) {
- if (auto MaybeCst = getConstantVRegSExtVal(Reg, MRI)) {
+ if (auto MaybeCst = getIConstantVRegSExtVal(Reg, MRI)) {
CR = *MaybeCst;
return true;
}
@@ -73,21 +73,46 @@ struct ConstantMatch {
inline ConstantMatch m_ICst(int64_t &Cst) { return ConstantMatch(Cst); }
-struct ICstRegMatch {
- Register &CR;
- ICstRegMatch(Register &C) : CR(C) {}
+struct GCstAndRegMatch {
+ Optional<ValueAndVReg> &ValReg;
+ GCstAndRegMatch(Optional<ValueAndVReg> &ValReg) : ValReg(ValReg) {}
bool match(const MachineRegisterInfo &MRI, Register Reg) {
- if (auto MaybeCst = getConstantVRegValWithLookThrough(
- Reg, MRI, /*LookThroughInstrs*/ true,
- /*HandleFConstants*/ false)) {
- CR = MaybeCst->VReg;
- return true;
- }
- return false;
+ ValReg = getIConstantVRegValWithLookThrough(Reg, MRI);
+ return ValReg ? true : false;
}
};
-inline ICstRegMatch m_ICst(Register &Reg) { return ICstRegMatch(Reg); }
+inline GCstAndRegMatch m_GCst(Optional<ValueAndVReg> &ValReg) {
+ return GCstAndRegMatch(ValReg);
+}
+
+struct GFCstAndRegMatch {
+ Optional<FPValueAndVReg> &FPValReg;
+ GFCstAndRegMatch(Optional<FPValueAndVReg> &FPValReg) : FPValReg(FPValReg) {}
+ bool match(const MachineRegisterInfo &MRI, Register Reg) {
+ FPValReg = getFConstantVRegValWithLookThrough(Reg, MRI);
+ return FPValReg ? true : false;
+ }
+};
+
+inline GFCstAndRegMatch m_GFCst(Optional<FPValueAndVReg> &FPValReg) {
+ return GFCstAndRegMatch(FPValReg);
+}
+
+struct GFCstOrSplatGFCstMatch {
+ Optional<FPValueAndVReg> &FPValReg;
+ GFCstOrSplatGFCstMatch(Optional<FPValueAndVReg> &FPValReg)
+ : FPValReg(FPValReg) {}
+ bool match(const MachineRegisterInfo &MRI, Register Reg) {
+ return (FPValReg = getFConstantSplat(Reg, MRI)) ||
+ (FPValReg = getFConstantVRegValWithLookThrough(Reg, MRI));
+ }
+};
+
+inline GFCstOrSplatGFCstMatch
+m_GFCstOrSplat(Optional<FPValueAndVReg> &FPValReg) {
+ return GFCstOrSplatGFCstMatch(FPValReg);
+}
/// Matcher for a specific constant value.
struct SpecificConstantMatch {
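A minimal sketch of the new FP-constant matchers (not part of the patch): m_GFCstOrSplat accepts either a scalar G_FCONSTANT or a splat G_BUILD_VECTOR of one.

  #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
  using namespace llvm;
  using namespace MIPatternMatch;

  // Returns true if Reg is (a splat of) the FP constant 1.0.
  static bool isFPOne(Register Reg, const MachineRegisterInfo &MRI) {
    Optional<FPValueAndVReg> FCst;
    if (!mi_match(Reg, MRI, m_GFCstOrSplat(FCst)))
      return false;
    return FCst->Value.isExactlyValue(1.0);
  }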
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index 9b652d8e16bc..069f71b54328 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1537,6 +1537,14 @@ public:
return buildInstr(TargetOpcode::G_XOR, {Dst}, {Src0, NegOne});
}
+ /// Build and insert integer negation
+ /// \p Zero = G_CONSTANT 0
+ /// \p Res = G_SUB Zero, \p Op0
+ MachineInstrBuilder buildNeg(const DstOp &Dst, const SrcOp &Src0) {
+ auto Zero = buildConstant(Dst.getLLTTy(*getMRI()), 0);
+ return buildInstr(TargetOpcode::G_SUB, {Dst}, {Zero, Src0});
+ }
+
/// Build and insert \p Res = G_CTPOP \p Op0, \p Src0
MachineInstrBuilder buildCTPOP(const DstOp &Dst, const SrcOp &Src0) {
return buildInstr(TargetOpcode::G_CTPOP, {Dst}, {Src0});
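A minimal sketch of the new helper in use (not part of the patch): buildNeg() emits a G_CONSTANT 0 followed by a G_SUB and composes with the existing builders.

  #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
  using namespace llvm;

  // Emits (0 - X) + Y, i.e. Y - X written via the new negation helper.
  static void emitNegThenAdd(MachineIRBuilder &MIB, Register Res, Register X,
                             Register Y) {
    auto Neg = MIB.buildNeg(MIB.getMRI()->getType(X), X);
    MIB.buildAdd(Res, Neg, Y);
  }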
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
index 818475a48abb..86545b976b8d 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -14,6 +14,9 @@
#ifndef LLVM_CODEGEN_GLOBALISEL_UTILS_H
#define LLVM_CODEGEN_GLOBALISEL_UTILS_H
+#include "GISelWorkList.h"
+#include "LostDebugLocObserver.h"
+#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/Register.h"
@@ -44,6 +47,7 @@ class TargetRegisterClass;
class ConstantInt;
class ConstantFP;
class APFloat;
+class MachineIRBuilder;
// Convenience macros for dealing with vector reduction opcodes.
#define GISEL_VECREDUCE_CASES_ALL \
@@ -162,13 +166,12 @@ void reportGISelWarning(MachineFunction &MF, const TargetPassConfig &TPC,
MachineOptimizationRemarkMissed &R);
/// If \p VReg is defined by a G_CONSTANT, return the corresponding value.
-Optional<APInt> getConstantVRegVal(Register VReg,
- const MachineRegisterInfo &MRI);
+Optional<APInt> getIConstantVRegVal(Register VReg,
+ const MachineRegisterInfo &MRI);
-/// If \p VReg is defined by a G_CONSTANT fits in int64_t
-/// returns it.
-Optional<int64_t> getConstantVRegSExtVal(Register VReg,
- const MachineRegisterInfo &MRI);
+/// If \p VReg is defined by a G_CONSTANT fits in int64_t returns it.
+Optional<int64_t> getIConstantVRegSExtVal(Register VReg,
+ const MachineRegisterInfo &MRI);
/// Simple struct used to hold a constant integer value and a virtual
/// register.
@@ -176,22 +179,32 @@ struct ValueAndVReg {
APInt Value;
Register VReg;
};
-/// If \p VReg is defined by a statically evaluable chain of
-/// instructions rooted on a G_F/CONSTANT (\p LookThroughInstrs == true)
-/// and that constant fits in int64_t, returns its value as well as the
-/// virtual register defined by this G_F/CONSTANT.
-/// When \p LookThroughInstrs == false this function behaves like
-/// getConstantVRegVal.
-/// When \p HandleFConstants == false the function bails on G_FCONSTANTs.
-/// When \p LookThroughAnyExt == true the function treats G_ANYEXT same as
-/// G_SEXT.
+
+/// If \p VReg is defined by a statically evaluable chain of instructions rooted
+/// on a G_CONSTANT returns its APInt value and def register.
Optional<ValueAndVReg>
-getConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI,
- bool LookThroughInstrs = true,
- bool HandleFConstants = true,
- bool LookThroughAnyExt = false);
-const ConstantInt *getConstantIntVRegVal(Register VReg,
- const MachineRegisterInfo &MRI);
+getIConstantVRegValWithLookThrough(Register VReg,
+ const MachineRegisterInfo &MRI,
+ bool LookThroughInstrs = true);
+
+/// If \p VReg is defined by a statically evaluable chain of instructions rooted
+/// on a G_CONSTANT or G_FCONSTANT returns its value as APInt and def register.
+Optional<ValueAndVReg> getAnyConstantVRegValWithLookThrough(
+ Register VReg, const MachineRegisterInfo &MRI,
+ bool LookThroughInstrs = true, bool LookThroughAnyExt = false);
+
+struct FPValueAndVReg {
+ APFloat Value;
+ Register VReg;
+};
+
+/// If \p VReg is defined by a statically evaluable chain of instructions rooted
+/// on a G_FCONSTANT returns its APFloat value and def register.
+Optional<FPValueAndVReg>
+getFConstantVRegValWithLookThrough(Register VReg,
+ const MachineRegisterInfo &MRI,
+ bool LookThroughInstrs = true);
+
const ConstantFP* getConstantFPVRegVal(Register VReg,
const MachineRegisterInfo &MRI);
@@ -254,6 +267,14 @@ Optional<APFloat> ConstantFoldFPBinOp(unsigned Opcode, const Register Op1,
const Register Op2,
const MachineRegisterInfo &MRI);
+/// Tries to constant fold a vector binop with sources \p Op1 and \p Op2.
+/// If successful, returns the G_BUILD_VECTOR representing the folded vector
+/// constant. \p MIB should have an insertion point already set to create new
+/// G_CONSTANT instructions as needed.
+Optional<MachineInstr *>
+ConstantFoldVectorBinop(unsigned Opcode, const Register Op1, const Register Op2,
+ const MachineRegisterInfo &MRI, MachineIRBuilder &MIB);
+
Optional<APInt> ConstantFoldExtOp(unsigned Opcode, const Register Op1,
uint64_t Imm, const MachineRegisterInfo &MRI);
@@ -261,6 +282,11 @@ Optional<APFloat> ConstantFoldIntToFloat(unsigned Opcode, LLT DstTy,
Register Src,
const MachineRegisterInfo &MRI);
+/// Tries to constant fold a G_CTLZ operation on \p Src. If \p Src is a vector
+/// then it tries to do an element-wise constant fold.
+Optional<SmallVector<unsigned>>
+ConstantFoldCTLZ(Register Src, const MachineRegisterInfo &MRI);
+
/// Test if the given value is known to have exactly one bit set. This differs
/// from computeKnownBits in that it doesn't necessarily determine which bit is
/// set.
@@ -346,15 +372,23 @@ Optional<int> getSplatIndex(MachineInstr &MI);
Optional<int64_t> getBuildVectorConstantSplat(const MachineInstr &MI,
const MachineRegisterInfo &MRI);
+/// Returns a floating point scalar constant of a build vector splat if it
+/// exists. When \p AllowUndef == true some elements can be undef but not all.
+Optional<FPValueAndVReg> getFConstantSplat(Register VReg,
+ const MachineRegisterInfo &MRI,
+ bool AllowUndef = true);
+
/// Return true if the specified instruction is a G_BUILD_VECTOR or
/// G_BUILD_VECTOR_TRUNC where all of the elements are 0 or undef.
bool isBuildVectorAllZeros(const MachineInstr &MI,
- const MachineRegisterInfo &MRI);
+ const MachineRegisterInfo &MRI,
+ bool AllowUndef = false);
/// Return true if the specified instruction is a G_BUILD_VECTOR or
/// G_BUILD_VECTOR_TRUNC where all of the elements are ~0 or undef.
bool isBuildVectorAllOnes(const MachineInstr &MI,
- const MachineRegisterInfo &MRI);
+ const MachineRegisterInfo &MRI,
+ bool AllowUndef = false);
/// \returns a value when \p MI is a vector splat. The splat can be either a
/// Register or a constant.
@@ -378,6 +412,17 @@ bool isBuildVectorAllOnes(const MachineInstr &MI,
Optional<RegOrConstant> getVectorSplat(const MachineInstr &MI,
const MachineRegisterInfo &MRI);
+/// Determines if \p MI defines a constant integer or a build vector of
+/// constant integers. Treats undef values as constants.
+bool isConstantOrConstantVector(MachineInstr &MI,
+ const MachineRegisterInfo &MRI);
+
+/// Determines if \p MI defines a constant integer or a splat vector of
+/// constant integers.
+/// \returns the scalar constant or None.
+Optional<APInt> isConstantOrConstantSplatVector(MachineInstr &MI,
+ const MachineRegisterInfo &MRI);
+
/// Attempt to match a unary predicate against a scalar/splat constant or every
/// element of a constant G_BUILD_VECTOR. If \p ConstVal is null, the source
/// value was undef.
@@ -398,5 +443,14 @@ int64_t getICmpTrueVal(const TargetLowering &TLI, bool IsVector, bool IsFP);
bool shouldOptForSize(const MachineBasicBlock &MBB, ProfileSummaryInfo *PSI,
BlockFrequencyInfo *BFI);
+using SmallInstListTy = GISelWorkList<4>;
+void saveUsesAndErase(MachineInstr &MI, MachineRegisterInfo &MRI,
+ LostDebugLocObserver *LocObserver,
+ SmallInstListTy &DeadInstChain);
+void eraseInstrs(ArrayRef<MachineInstr *> DeadInstrs, MachineRegisterInfo &MRI,
+ LostDebugLocObserver *LocObserver = nullptr);
+void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI,
+ LostDebugLocObserver *LocObserver = nullptr);
+
} // End namespace llvm.
#endif
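A minimal sketch of the renamed constant helpers (not part of the patch): the integer-only lookup no longer takes a HandleFConstants flag, and FP constants go through getFConstantVRegValWithLookThrough() instead.

  #include "llvm/CodeGen/GlobalISel/Utils.h"
  using namespace llvm;

  // Returns true if Reg is (transitively) defined by a power-of-two G_CONSTANT.
  static bool isPowerOfTwoIConstant(Register Reg,
                                    const MachineRegisterInfo &MRI) {
    if (auto ValAndVReg = getIConstantVRegValWithLookThrough(Reg, MRI))
      return ValAndVReg->Value.isPowerOf2();
    return false;
  }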
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 6803f4d76cf0..fd106f55a43d 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1260,6 +1260,11 @@ static const int FIRST_TARGET_STRICTFP_OPCODE = BUILTIN_OP_END + 400;
/// be used with SelectionDAG::getMemIntrinsicNode.
static const int FIRST_TARGET_MEMORY_OPCODE = BUILTIN_OP_END + 500;
+/// Whether this is bitwise logic opcode.
+inline bool isBitwiseLogicOp(unsigned Opcode) {
+ return Opcode == ISD::AND || Opcode == ISD::OR || Opcode == ISD::XOR;
+}
+
/// Get underlying scalar opcode for VECREDUCE opcode.
/// For example ISD::AND for ISD::VECREDUCE_AND.
NodeType getVecReduceBaseOpcode(unsigned VecReduceOpcode);
@@ -1267,6 +1272,12 @@ NodeType getVecReduceBaseOpcode(unsigned VecReduceOpcode);
/// Whether this is a vector-predicated Opcode.
bool isVPOpcode(unsigned Opcode);
+/// Whether this is a vector-predicated binary operation opcode.
+bool isVPBinaryOp(unsigned Opcode);
+
+/// Whether this is a vector-predicated reduction opcode.
+bool isVPReduction(unsigned Opcode);
+
/// The operand position of the vector mask.
Optional<unsigned> getVPMaskIdx(unsigned Opcode);
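A minimal sketch of the new predicate (not part of the patch): it replaces the usual three-way opcode comparison in DAG combines.

  #include "llvm/CodeGen/ISDOpcodes.h"
  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;

  // Returns true if N is a bitwise logic node fed by another bitwise logic node.
  static bool logicOpFeedsLogicOp(const SDNode *N) {
    return ISD::isBitwiseLogicOp(N->getOpcode()) &&
           ISD::isBitwiseLogicOp(N->getOperand(0).getOpcode());
  }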
diff --git a/llvm/include/llvm/CodeGen/IndirectThunks.h b/llvm/include/llvm/CodeGen/IndirectThunks.h
index 74973f38bc79..90f9912f0ee0 100644
--- a/llvm/include/llvm/CodeGen/IndirectThunks.h
+++ b/llvm/include/llvm/CodeGen/IndirectThunks.h
@@ -62,7 +62,7 @@ void ThunkInserter<Derived>::createThunkFunction(MachineModuleInfo &MMI,
AttrBuilder B;
B.addAttribute(llvm::Attribute::NoUnwind);
B.addAttribute(llvm::Attribute::Naked);
- F->addAttributes(llvm::AttributeList::FunctionIndex, B);
+ F->addFnAttrs(B);
// Populate our function a bit so that we can verify.
BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
diff --git a/llvm/include/llvm/CodeGen/LinkAllAsmWriterComponents.h b/llvm/include/llvm/CodeGen/LinkAllAsmWriterComponents.h
index 81b0025fdddc..c22f9d49f374 100644
--- a/llvm/include/llvm/CodeGen/LinkAllAsmWriterComponents.h
+++ b/llvm/include/llvm/CodeGen/LinkAllAsmWriterComponents.h
@@ -24,6 +24,9 @@ namespace {
// delete it all as dead code, even with whole program optimization,
// yet is effectively a NO-OP. As the compiler isn't smart enough
// to know that getenv() never returns -1, this will do the job.
+ // This is so that globals in the translation units where these functions
+ // are defined are forced to be initialized, populating various
+ // registries.
if (std::getenv("bar") != (char*) -1)
return;
diff --git a/llvm/include/llvm/CodeGen/LinkAllCodegenComponents.h b/llvm/include/llvm/CodeGen/LinkAllCodegenComponents.h
index 1b13ff53ac85..d615a5db4504 100644
--- a/llvm/include/llvm/CodeGen/LinkAllCodegenComponents.h
+++ b/llvm/include/llvm/CodeGen/LinkAllCodegenComponents.h
@@ -27,6 +27,9 @@ namespace {
// delete it all as dead code, even with whole program optimization,
// yet is effectively a NO-OP. As the compiler isn't smart enough
// to know that getenv() never returns -1, this will do the job.
+ // This is so that globals in the translation units where these functions
+ // are defined are forced to be initialized, populating various
+ // registries.
if (std::getenv("bar") != (char*) -1)
return;
diff --git a/llvm/include/llvm/CodeGen/LiveInterval.h b/llvm/include/llvm/CodeGen/LiveInterval.h
index c2b158ac1b7f..923a45821dd4 100644
--- a/llvm/include/llvm/CodeGen/LiveInterval.h
+++ b/llvm/include/llvm/CodeGen/LiveInterval.h
@@ -521,11 +521,11 @@ namespace llvm {
removeSegment(S.start, S.end, RemoveDeadValNo);
}
- /// Remove segment pointed to by iterator @p I from this range. This does
- /// not remove dead value numbers.
- iterator removeSegment(iterator I) {
- return segments.erase(I);
- }
+ /// Remove segment pointed to by iterator @p I from this range.
+ iterator removeSegment(iterator I, bool RemoveDeadValNo = false);
+
+ /// Mark \p ValNo for deletion if no segments in this range use it.
+ void removeValNoIfDead(VNInfo *ValNo);
/// Query Liveness at Idx.
/// The sub-instruction slot of Idx doesn't matter, only the instruction
diff --git a/llvm/include/llvm/CodeGen/LiveIntervalUnion.h b/llvm/include/llvm/CodeGen/LiveIntervalUnion.h
index 4ebe0f2dcfd8..3b6a4a379d72 100644
--- a/llvm/include/llvm/CodeGen/LiveIntervalUnion.h
+++ b/llvm/include/llvm/CodeGen/LiveIntervalUnion.h
@@ -114,12 +114,19 @@ public:
const LiveRange *LR = nullptr;
LiveRange::const_iterator LRI; ///< current position in LR
ConstSegmentIter LiveUnionI; ///< current position in LiveUnion
- Optional<SmallVector<LiveInterval *, 4>> InterferingVRegs;
+ SmallVector<LiveInterval *, 4> InterferingVRegs;
bool CheckedFirstInterference = false;
bool SeenAllInterferences = false;
unsigned Tag = 0;
unsigned UserTag = 0;
+ // Count the virtual registers in this union that interfere with this
+ // query's live virtual register, up to maxInterferingRegs.
+ unsigned collectInterferingVRegs(unsigned MaxInterferingRegs);
+
+ // Was this virtual register visited during collectInterferingVRegs?
+ bool isSeenInterference(LiveInterval *VirtReg) const;
+
public:
Query() = default;
Query(const LiveRange &LR, const LiveIntervalUnion &LIU)
@@ -131,7 +138,7 @@ public:
const LiveIntervalUnion &NewLiveUnion) {
LiveUnion = &NewLiveUnion;
LR = &NewLR;
- InterferingVRegs = None;
+ InterferingVRegs.clear();
CheckedFirstInterference = false;
SeenAllInterferences = false;
Tag = NewLiveUnion.getTag();
@@ -151,20 +158,12 @@ public:
// Does this live virtual register interfere with the union?
bool checkInterference() { return collectInterferingVRegs(1); }
- // Count the virtual registers in this union that interfere with this
- // query's live virtual register, up to maxInterferingRegs.
- unsigned collectInterferingVRegs(
- unsigned MaxInterferingRegs = std::numeric_limits<unsigned>::max());
-
- // Was this virtual register visited during collectInterferingVRegs?
- bool isSeenInterference(LiveInterval *VirtReg) const;
-
- // Did collectInterferingVRegs collect all interferences?
- bool seenAllInterferences() const { return SeenAllInterferences; }
-
// Vector generated by collectInterferingVRegs.
- const SmallVectorImpl<LiveInterval*> &interferingVRegs() const {
- return *InterferingVRegs;
+ const SmallVectorImpl<LiveInterval *> &interferingVRegs(
+ unsigned MaxInterferingRegs = std::numeric_limits<unsigned>::max()) {
+ if (!SeenAllInterferences || MaxInterferingRegs < InterferingVRegs.size())
+ collectInterferingVRegs(MaxInterferingRegs);
+ return InterferingVRegs;
}
};
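A minimal sketch of the updated Query API (not part of the patch): interferingVRegs() now collects lazily up to the requested limit, so callers no longer invoke collectInterferingVRegs() themselves.

  #include "llvm/CodeGen/LiveIntervalUnion.h"
  using namespace llvm;

  // Counts up to Limit interfering virtual registers for the query.
  static unsigned countInterferences(LiveIntervalUnion::Query &Q,
                                     unsigned Limit) {
    return Q.interferingVRegs(Limit).size();
  }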
diff --git a/llvm/include/llvm/CodeGen/LiveVariables.h b/llvm/include/llvm/CodeGen/LiveVariables.h
index 9b0667bbbeb0..dee316677b25 100644
--- a/llvm/include/llvm/CodeGen/LiveVariables.h
+++ b/llvm/include/llvm/CodeGen/LiveVariables.h
@@ -188,6 +188,12 @@ public:
//===--------------------------------------------------------------------===//
// API to update live variable information
+ /// Recompute liveness from scratch for a virtual register \p Reg that is
+ /// known to have a single def that dominates all uses. This can be useful
+ /// after removing some uses of \p Reg. It is not necessary for the whole
+ /// machine function to be in SSA form.
+ void recomputeForSingleDefVirtReg(Register Reg);
+
/// replaceKillInstruction - Update register kill info by replacing a kill
/// instruction with a new one.
void replaceKillInstruction(Register Reg, MachineInstr &OldMI,
diff --git a/llvm/include/llvm/CodeGen/LowLevelType.h b/llvm/include/llvm/CodeGen/LowLevelType.h
index 40985e16b37a..922f93d2e598 100644
--- a/llvm/include/llvm/CodeGen/LowLevelType.h
+++ b/llvm/include/llvm/CodeGen/LowLevelType.h
@@ -16,8 +16,8 @@
#ifndef LLVM_CODEGEN_LOWLEVELTYPE_H
#define LLVM_CODEGEN_LOWLEVELTYPE_H
+#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/Support/LowLevelTypeImpl.h"
-#include "llvm/Support/MachineValueType.h"
namespace llvm {
@@ -31,6 +31,7 @@ LLT getLLTForType(Type &Ty, const DataLayout &DL);
/// Get a rough equivalent of an MVT for a given LLT. MVT can't distinguish
/// pointers, so these will convert to a plain integer.
MVT getMVTForLLT(LLT Ty);
+EVT getApproximateEVTForLLT(LLT Ty, const DataLayout &DL, LLVMContext &Ctx);
/// Get a rough equivalent of an LLT for a given MVT. LLT does not yet support
/// scalarable vector types, and will assert if used.
diff --git a/llvm/include/llvm/CodeGen/MIRFSDiscriminator.h b/llvm/include/llvm/CodeGen/MIRFSDiscriminator.h
index 6137411b6dba..deb6b37a9bcf 100644
--- a/llvm/include/llvm/CodeGen/MIRFSDiscriminator.h
+++ b/llvm/include/llvm/CodeGen/MIRFSDiscriminator.h
@@ -57,6 +57,10 @@ public:
assert(LowBit < HighBit && "HighBit needs to be greater than Lowbit");
}
+ StringRef getPassName() const override {
+ return "Add FS discriminators in MIR";
+ }
+
/// getNumFSBBs() - Return the number of machine BBs that have FS samples.
unsigned getNumFSBBs();
diff --git a/llvm/include/llvm/CodeGen/MIRFormatter.h b/llvm/include/llvm/CodeGen/MIRFormatter.h
index 9cb92091db50..12c90600f6df 100644
--- a/llvm/include/llvm/CodeGen/MIRFormatter.h
+++ b/llvm/include/llvm/CodeGen/MIRFormatter.h
@@ -1,9 +1,8 @@
//===-- llvm/CodeGen/MIRFormatter.h -----------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/llvm/include/llvm/CodeGen/MIRSampleProfile.h b/llvm/include/llvm/CodeGen/MIRSampleProfile.h
new file mode 100644
index 000000000000..2503524ccfdf
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/MIRSampleProfile.h
@@ -0,0 +1,76 @@
+//===----- MIRSampleProfile.h: SampleFDO Support in MIR ---*- c++ -*-------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the supporting functions for the machine-level Sample FDO
+// loader. This is used in Flow Sensitive SampleFDO.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_MIRSAMPLEPROFILE_H
+#define LLVM_CODEGEN_MIRSAMPLEPROFILE_H
+
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/ProfileData/SampleProfReader.h"
+
+#include <cassert>
+
+namespace llvm {
+
+using namespace sampleprof;
+
+class MIRProfileLoader;
+class MIRProfileLoaderPass : public MachineFunctionPass {
+ MachineFunction *MF;
+ std::string ProfileFileName;
+ FSDiscriminatorPass P;
+ unsigned LowBit;
+ unsigned HighBit;
+
+public:
+ static char ID;
+ /// FS bits will only use the '1' bits in the Mask.
+ MIRProfileLoaderPass(std::string FileName = "",
+ std::string RemappingFileName = "",
+ FSDiscriminatorPass P = FSDiscriminatorPass::Pass1);
+
+ /// getMachineFunction - Return the last machine function computed.
+ const MachineFunction *getMachineFunction() const { return MF; }
+
+ StringRef getPassName() const override { return "SampleFDO loader in MIR"; }
+
+private:
+ void init(MachineFunction &MF);
+ bool runOnMachineFunction(MachineFunction &) override;
+ bool doInitialization(Module &M) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ std::unique_ptr<MIRProfileLoader> MIRSampleLoader;
+ /// Hold the information of the basic block frequency.
+ MachineBlockFrequencyInfo *MBFI;
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_MIRSAMPLEPROFILE_H
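For orientation, a minimal sketch (not part of this patch) of building the new loader through the createMIRProfileLoaderPass() factory that this series declares in llvm/CodeGen/Passes.h further down; the profile file name is a placeholder.

#include "llvm/CodeGen/Passes.h"

// Construct the flow-sensitive SampleFDO loader for the first FS pass point.
// "perf.fs-afdo" is a hypothetical profile path; the FSDiscriminatorPass enum
// is assumed to be visible through Passes.h.
llvm::FunctionPass *buildFSProfileLoader() {
  return llvm::createMIRProfileLoaderPass(
      "perf.fs-afdo", /*RemappingFile=*/"",
      llvm::sampleprof::FSDiscriminatorPass::Pass1);
}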
diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
index e7428e7ad260..b6d7c2487126 100644
--- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
@@ -694,6 +694,7 @@ struct MachineFunction {
// Register information
bool TracksRegLiveness = false;
bool HasWinCFI = false;
+ bool FailsVerification = false;
std::vector<VirtualRegisterDefinition> VirtualRegisters;
std::vector<MachineFunctionLiveIn> LiveIns;
Optional<std::vector<FlowStringValue>> CalleeSavedRegisters;
@@ -722,6 +723,7 @@ template <> struct MappingTraits<MachineFunction> {
YamlIO.mapOptional("failedISel", MF.FailedISel, false);
YamlIO.mapOptional("tracksRegLiveness", MF.TracksRegLiveness, false);
YamlIO.mapOptional("hasWinCFI", MF.HasWinCFI, false);
+ YamlIO.mapOptional("failsVerification", MF.FailsVerification, false);
YamlIO.mapOptional("registers", MF.VirtualRegisters,
std::vector<VirtualRegisterDefinition>());
YamlIO.mapOptional("liveins", MF.LiveIns,
diff --git a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
index ac0cc70744d1..67544779f34c 100644
--- a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
+++ b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
@@ -153,7 +153,18 @@ enum class MachineCombinerPattern {
FMLSv4f32_OP1,
FMLSv4f32_OP2,
FMLSv4i32_indexed_OP1,
- FMLSv4i32_indexed_OP2
+ FMLSv4i32_indexed_OP2,
+
+ FMULv2i32_indexed_OP1,
+ FMULv2i32_indexed_OP2,
+ FMULv2i64_indexed_OP1,
+ FMULv2i64_indexed_OP2,
+ FMULv4i16_indexed_OP1,
+ FMULv4i16_indexed_OP2,
+ FMULv4i32_indexed_OP1,
+ FMULv4i32_indexed_OP2,
+ FMULv8i16_indexed_OP1,
+ FMULv8i16_indexed_OP2,
};
} // end namespace llvm
diff --git a/llvm/include/llvm/CodeGen/MachineDominators.h b/llvm/include/llvm/CodeGen/MachineDominators.h
index 46bf73cdd7b6..f749e9ff7e0a 100644
--- a/llvm/include/llvm/CodeGen/MachineDominators.h
+++ b/llvm/include/llvm/CodeGen/MachineDominators.h
@@ -36,6 +36,7 @@ extern template class DomTreeNodeBase<MachineBasicBlock>;
extern template class DominatorTreeBase<MachineBasicBlock, false>; // DomTree
extern template class DominatorTreeBase<MachineBasicBlock, true>; // PostDomTree
+using MachineDomTree = DomTreeBase<MachineBasicBlock>;
using MachineDomTreeNode = DomTreeNodeBase<MachineBasicBlock>;
//===-------------------------------------
@@ -43,8 +44,6 @@ using MachineDomTreeNode = DomTreeNodeBase<MachineBasicBlock>;
/// compute a normal dominator tree.
///
class MachineDominatorTree : public MachineFunctionPass {
- using DomTreeT = DomTreeBase<MachineBasicBlock>;
-
/// Helper structure used to hold all the basic blocks
/// involved in the split of a critical edge.
struct CriticalEdge {
@@ -67,7 +66,7 @@ class MachineDominatorTree : public MachineFunctionPass {
mutable SmallSet<MachineBasicBlock *, 32> NewBBs;
/// The DominatorTreeBase that is used to compute a normal dominator tree.
- std::unique_ptr<DomTreeT> DT;
+ std::unique_ptr<MachineDomTree> DT;
/// Apply all the recorded critical edges to the DT.
/// This updates the underlying DT information in a way that uses
@@ -84,8 +83,9 @@ public:
calculate(MF);
}
- DomTreeT &getBase() {
- if (!DT) DT.reset(new DomTreeT());
+ MachineDomTree &getBase() {
+ if (!DT)
+ DT.reset(new MachineDomTree());
applySplitCriticalEdges();
return *DT;
}
@@ -112,6 +112,12 @@ public:
return DT->dominates(A, B);
}
+ void getDescendants(MachineBasicBlock *A,
+ SmallVectorImpl<MachineBasicBlock *> &Result) {
+ applySplitCriticalEdges();
+ DT->getDescendants(A, Result);
+ }
+
bool dominates(const MachineBasicBlock *A, const MachineBasicBlock *B) const {
applySplitCriticalEdges();
return DT->dominates(A, B);
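A short usage sketch for the new getDescendants() wrapper (illustrative client code, assuming MDT and MBB belong to the same machine function):

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineDominators.h"

// Count the blocks dominated by MBB; getDescendants() reports MBB itself too.
static unsigned countDominated(llvm::MachineDominatorTree &MDT,
                               llvm::MachineBasicBlock *MBB) {
  llvm::SmallVector<llvm::MachineBasicBlock *, 8> Dominated;
  MDT.getDescendants(MBB, Dominated);
  return Dominated.size();
}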
diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
index 28a59703dc60..5df468102a8a 100644
--- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
@@ -342,6 +342,8 @@ public:
: StackAlignment(assumeAligned(StackAlignment)),
StackRealignable(StackRealignable), ForcedRealign(ForcedRealign) {}
+ MachineFrameInfo(const MachineFrameInfo &) = delete;
+
/// Return true if there are any stack objects in this function.
bool hasStackObjects() const { return !Objects.empty(); }
diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h
index 786fe908f68f..dcbd19ac6b5a 100644
--- a/llvm/include/llvm/CodeGen/MachineFunction.h
+++ b/llvm/include/llvm/CodeGen/MachineFunction.h
@@ -149,6 +149,9 @@ public:
// all sizes attached to them have been eliminated.
// TiedOpsRewritten: The twoaddressinstruction pass will set this flag, it
// means that tied-def have been rewritten to meet the RegConstraint.
+ // FailsVerification: Means that the function is not expected to pass machine
+ // verification. This can be set by passes that introduce known problems that
+ // have not been fixed yet.
enum class Property : unsigned {
IsSSA,
NoPHIs,
@@ -159,7 +162,8 @@ public:
RegBankSelected,
Selected,
TiedOpsRewritten,
- LastProperty = TiedOpsRewritten,
+ FailsVerification,
+ LastProperty = FailsVerification,
};
bool hasProperty(Property P) const {
@@ -227,7 +231,7 @@ struct LandingPadInfo {
: LandingPadBlock(MBB) {}
};
-class MachineFunction {
+class LLVM_EXTERNAL_VISIBILITY MachineFunction {
Function &F;
const LLVMTargetMachine &Target;
const TargetSubtargetInfo *STI;
@@ -536,6 +540,14 @@ public:
/// (or DBG_PHI).
void finalizeDebugInstrRefs();
+ /// Returns true if the function's variable locations should be tracked with
+ /// instruction referencing.
+ bool useDebugInstrRef() const;
+
+ /// A reserved operand number representing the instructions memory operand,
+ /// for instructions that have a stack spill fused into them.
+ const static unsigned int DebugOperandMemNumber;
+
MachineFunction(Function &F, const LLVMTargetMachine &Target,
const TargetSubtargetInfo &STI, unsigned FunctionNum,
MachineModuleInfo &MMI);
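A sketch of how a pass would use the new property (illustrative client code, not from this patch):

#include "llvm/CodeGen/MachineFunction.h"

// A pass that knowingly leaves MIR the verifier would reject can record that
// fact so verification is relaxed for this function.
void markFailsVerification(llvm::MachineFunction &MF) {
  MF.getProperties().set(
      llvm::MachineFunctionProperties::Property::FailsVerification);
}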
diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
index 757907f6d887..0ac934e208b6 100644
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -517,7 +517,7 @@ public:
SmallSet<Register, 4> getUsedDebugRegs() const {
assert(isDebugValue() && "not a DBG_VALUE*");
SmallSet<Register, 4> UsedRegs;
- for (auto MO : debug_operands())
+ for (const auto &MO : debug_operands())
if (MO.isReg() && MO.getReg())
UsedRegs.insert(MO.getReg());
return UsedRegs;
@@ -1331,6 +1331,7 @@ public:
case TargetOpcode::LIFETIME_START:
case TargetOpcode::LIFETIME_END:
case TargetOpcode::PSEUDO_PROBE:
+ case TargetOpcode::ARITH_FENCE:
return true;
}
}
@@ -1859,17 +1860,6 @@ public:
}
}
- PseudoProbeAttributes getPseudoProbeAttribute() const {
- assert(isPseudoProbe() && "Must be a pseudo probe instruction");
- return (PseudoProbeAttributes)getOperand(3).getImm();
- }
-
- void addPseudoProbeAttribute(PseudoProbeAttributes Attr) {
- assert(isPseudoProbe() && "Must be a pseudo probe instruction");
- MachineOperand &AttrOperand = getOperand(3);
- AttrOperand.setImm(AttrOperand.getImm() | (uint32_t)Attr);
- }
-
private:
/// If this instruction is embedded into a MachineFunction, return the
/// MachineRegisterInfo object for the current function, otherwise
diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h
index 07b8e5ebcc1d..00080b171974 100644
--- a/llvm/include/llvm/CodeGen/MachineMemOperand.h
+++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h
@@ -282,17 +282,7 @@ public:
/// success and failure orderings for an atomic operation. (For operations
/// other than cmpxchg, this is equivalent to getSuccessOrdering().)
AtomicOrdering getMergedOrdering() const {
- AtomicOrdering Ordering = getSuccessOrdering();
- AtomicOrdering FailureOrdering = getFailureOrdering();
- if (FailureOrdering == AtomicOrdering::SequentiallyConsistent)
- return AtomicOrdering::SequentiallyConsistent;
- if (FailureOrdering == AtomicOrdering::Acquire) {
- if (Ordering == AtomicOrdering::Monotonic)
- return AtomicOrdering::Acquire;
- if (Ordering == AtomicOrdering::Release)
- return AtomicOrdering::AcquireRelease;
- }
- return Ordering;
+ return getMergedAtomicOrdering(getSuccessOrdering(), getFailureOrdering());
}
bool isLoad() const { return FlagVals & MOLoad; }
diff --git a/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h b/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h
index 8cc5909c40b7..285b858c96cb 100644
--- a/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h
+++ b/llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h
@@ -118,6 +118,12 @@ public:
: DiagnosticInfoMIROptimization(DK_MachineOptimizationRemarkAnalysis,
PassName, RemarkName, Loc, MBB) {}
+ MachineOptimizationRemarkAnalysis(const char *PassName, StringRef RemarkName,
+ const MachineInstr *MI)
+ : DiagnosticInfoMIROptimization(DK_MachineOptimizationRemarkAnalysis,
+ PassName, RemarkName, MI->getDebugLoc(),
+ MI->getParent()) {}
+
static bool classof(const DiagnosticInfo *DI) {
return DI->getKind() == DK_MachineOptimizationRemarkAnalysis;
}
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index ca3dd992bbd5..dbabfe5f0f32 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -821,7 +821,7 @@ public:
/// deleted during LiveDebugVariables analysis.
void markUsesInDebugValueAsUndef(Register Reg) const;
- /// updateDbgUsersToReg - Update a collection of DBG_VALUE instructions
+ /// updateDbgUsersToReg - Update a collection of debug instructions
/// to refer to the designated register.
void updateDbgUsersToReg(MCRegister OldReg, MCRegister NewReg,
ArrayRef<MachineInstr *> Users) const {
@@ -829,21 +829,34 @@ public:
for (MCRegUnitIterator RUI(OldReg, getTargetRegisterInfo()); RUI.isValid();
++RUI)
OldRegUnits.insert(*RUI);
- for (MachineInstr *MI : Users) {
- assert(MI->isDebugValue());
- for (auto &Op : MI->debug_operands()) {
- if (Op.isReg()) {
- for (MCRegUnitIterator RUI(OldReg, getTargetRegisterInfo());
- RUI.isValid(); ++RUI) {
- if (OldRegUnits.contains(*RUI)) {
- Op.setReg(NewReg);
- break;
- }
+
+ // If this operand is a register, check whether it overlaps with OldReg.
+ // If it does, replace with NewReg.
+ auto UpdateOp = [this, &NewReg, &OldReg, &OldRegUnits](MachineOperand &Op) {
+ if (Op.isReg()) {
+ for (MCRegUnitIterator RUI(OldReg, getTargetRegisterInfo());
+ RUI.isValid(); ++RUI) {
+ if (OldRegUnits.contains(*RUI)) {
+ Op.setReg(NewReg);
+ break;
}
}
}
- assert(MI->hasDebugOperandForReg(NewReg) &&
- "Expected debug value to have some overlap with OldReg");
+ };
+
+ // Iterate through (possibly several) operands to DBG_VALUEs and update
+ // each. For DBG_PHIs, only one operand will be present.
+ for (MachineInstr *MI : Users) {
+ if (MI->isDebugValue()) {
+ for (auto &Op : MI->debug_operands())
+ UpdateOp(Op);
+ assert(MI->hasDebugOperandForReg(NewReg) &&
+ "Expected debug value to have some overlap with OldReg");
+ } else if (MI->isDebugPHI()) {
+ UpdateOp(MI->getOperand(0));
+ } else {
+ llvm_unreachable("Non-DBG_VALUE, Non-DBG_PHI debug instr updated");
+ }
}
}
@@ -964,7 +977,7 @@ public:
MCRegister getLiveInPhysReg(Register VReg) const;
/// getLiveInVirtReg - If PReg is a live-in physical register, return the
- /// corresponding live-in physical register.
+ /// corresponding live-in virtual register.
Register getLiveInVirtReg(MCRegister PReg) const;
/// EmitLiveInCopies - Emit copies to initialize livein virtual registers
diff --git a/llvm/include/llvm/CodeGen/MacroFusion.h b/llvm/include/llvm/CodeGen/MacroFusion.h
index 3a140fe63fde..ea2c7a5faae3 100644
--- a/llvm/include/llvm/CodeGen/MacroFusion.h
+++ b/llvm/include/llvm/CodeGen/MacroFusion.h
@@ -23,6 +23,8 @@ class MachineInstr;
class ScheduleDAGMutation;
class TargetInstrInfo;
class TargetSubtargetInfo;
+class ScheduleDAGInstrs;
+class SUnit;
/// Check if the instr pair, FirstMI and SecondMI, should be fused
/// together. Given SecondMI, when FirstMI is unspecified, then check if
@@ -32,6 +34,18 @@ using ShouldSchedulePredTy = std::function<bool(const TargetInstrInfo &TII,
const MachineInstr *FirstMI,
const MachineInstr &SecondMI)>;
+/// Checks if the number of cluster edges between SU and its predecessors is
+/// less than FuseLimit
+bool hasLessThanNumFused(const SUnit &SU, unsigned FuseLimit);
+
+/// Create an artificial edge between FirstSU and SecondSU.
+/// Make data dependencies from the FirstSU also dependent on the SecondSU to
+/// prevent them from being scheduled between the FirstSU and the SecondSU
+/// and vice-versa.
+/// Fusing more than 2 instructions is not currently supported.
+bool fuseInstructionPair(ScheduleDAGInstrs &DAG, SUnit &FirstSU,
+ SUnit &SecondSU);
+
/// Create a DAG scheduling mutation to pair instructions back to back
/// for instructions that benefit according to the target-specific
/// shouldScheduleAdjacent predicate function.
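Illustrative pairing helper built on the two newly exported functions; which unit the budget is checked against is a target choice, and FuseLimit is a hypothetical parameter:

#include "llvm/CodeGen/MacroFusion.h"

static bool tryFusePair(llvm::ScheduleDAGInstrs &DAG, llvm::SUnit &FirstSU,
                        llvm::SUnit &SecondSU, unsigned FuseLimit) {
  // Respect the cluster-edge budget before adding another fused pair.
  if (!llvm::hasLessThanNumFused(SecondSU, FuseLimit))
    return false;
  return llvm::fuseInstructionPair(DAG, FirstSU, SecondSU);
}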
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index da1bab718948..d5ad12fadfa0 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -37,6 +37,10 @@ class raw_ostream;
// List of target independent CodeGen pass IDs.
namespace llvm {
+
+  /// AtomicExpandPass - At IR level this pass replaces atomic instructions with
+  /// __atomic_* library calls, or target-specific instructions which implement the
+ /// same semantics in a way which better fits the target backend.
FunctionPass *createAtomicExpandPass();
/// createUnreachableBlockEliminationPass - The LLVM code generator does not
@@ -171,6 +175,9 @@ namespace llvm {
/// This pass adds flow sensitive discriminators.
extern char &MIRAddFSDiscriminatorsID;
+ /// This pass reads flow sensitive profile.
+ extern char &MIRProfileLoaderPassID;
+
/// FastRegisterAllocation Pass - This pass register allocates as fast as
/// possible. It is best suited for debug code where live ranges are short.
///
@@ -513,6 +520,11 @@ namespace llvm {
FunctionPass *
createMIRAddFSDiscriminatorsPass(sampleprof::FSDiscriminatorPass P);
+ /// Read Flow Sensitive Profile.
+ FunctionPass *createMIRProfileLoaderPass(std::string File,
+ std::string RemappingFile,
+ sampleprof::FSDiscriminatorPass P);
+
/// Creates MIR Debugify pass. \see MachineDebugify.cpp
ModulePass *createDebugifyMachineModulePass();
diff --git a/llvm/include/llvm/CodeGen/RegAllocCommon.h b/llvm/include/llvm/CodeGen/RegAllocCommon.h
index 39b77d919370..757ca8e112ee 100644
--- a/llvm/include/llvm/CodeGen/RegAllocCommon.h
+++ b/llvm/include/llvm/CodeGen/RegAllocCommon.h
@@ -1,9 +1,8 @@
//===- RegAllocCommon.h - Utilities shared between allocators ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/CodeGen/RegisterScavenging.h b/llvm/include/llvm/CodeGen/RegisterScavenging.h
index 4f48ea2dc8e8..218e05f6eb6b 100644
--- a/llvm/include/llvm/CodeGen/RegisterScavenging.h
+++ b/llvm/include/llvm/CodeGen/RegisterScavenging.h
@@ -211,9 +211,6 @@ private:
/// Initialize RegisterScavenger.
void init(MachineBasicBlock &MBB);
- /// Mark live-in registers of basic block as used.
- void setLiveInsUsed(const MachineBasicBlock &MBB);
-
/// Spill a register after position \p After and reload it before position
/// \p UseMI.
ScavengedInfo &spill(Register Reg, const TargetRegisterClass &RC, int SPAdj,
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 948a4763b872..5a3f4e9a23ff 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -531,7 +531,7 @@ public:
}
#ifndef NDEBUG
- void VerifyDAGDiverence();
+ void VerifyDAGDivergence();
#endif
/// This iterates over the nodes in the SelectionDAG, folding
@@ -621,8 +621,8 @@ public:
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget = false,
bool IsOpaque = false) {
- return getConstant(APInt::getAllOnesValue(VT.getScalarSizeInBits()), DL,
- VT, IsTarget, IsOpaque);
+ return getConstant(APInt::getAllOnes(VT.getScalarSizeInBits()), DL, VT,
+ IsTarget, IsOpaque);
}
SDValue getConstant(const ConstantInt &Val, const SDLoc &DL, EVT VT,
@@ -1307,6 +1307,74 @@ public:
SDValue getIndexedStore(SDValue OrigStore, const SDLoc &dl, SDValue Base,
SDValue Offset, ISD::MemIndexedMode AM);
+ SDValue getLoadVP(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT,
+ const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue Offset,
+ SDValue Mask, SDValue EVL, MachinePointerInfo PtrInfo,
+ EVT MemVT, Align Alignment,
+ MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo,
+ const MDNode *Ranges = nullptr, bool IsExpanding = false);
+ inline SDValue
+ getLoadVP(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT,
+ const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue Offset,
+ SDValue Mask, SDValue EVL, MachinePointerInfo PtrInfo, EVT MemVT,
+ MaybeAlign Alignment = MaybeAlign(),
+ MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
+ const AAMDNodes &AAInfo = AAMDNodes(),
+ const MDNode *Ranges = nullptr, bool IsExpanding = false) {
+ // Ensures that codegen never sees a None Alignment.
+ return getLoadVP(AM, ExtType, VT, dl, Chain, Ptr, Offset, Mask, EVL,
+ PtrInfo, MemVT, Alignment.getValueOr(getEVTAlign(MemVT)),
+ MMOFlags, AAInfo, Ranges, IsExpanding);
+ }
+ SDValue getLoadVP(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT,
+ const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue Offset,
+ SDValue Mask, SDValue EVL, EVT MemVT,
+ MachineMemOperand *MMO, bool IsExpanding = false);
+ SDValue getLoadVP(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
+ SDValue Mask, SDValue EVL, MachinePointerInfo PtrInfo,
+ MaybeAlign Alignment, MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo, const MDNode *Ranges = nullptr,
+ bool IsExpanding = false);
+ SDValue getLoadVP(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
+ SDValue Mask, SDValue EVL, MachineMemOperand *MMO,
+ bool IsExpanding = false);
+ SDValue getExtLoadVP(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT,
+ SDValue Chain, SDValue Ptr, SDValue Mask, SDValue EVL,
+ MachinePointerInfo PtrInfo, EVT MemVT,
+ MaybeAlign Alignment, MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo, bool IsExpanding = false);
+ SDValue getExtLoadVP(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT,
+ SDValue Chain, SDValue Ptr, SDValue Mask, SDValue EVL,
+ EVT MemVT, MachineMemOperand *MMO,
+ bool IsExpanding = false);
+ SDValue getIndexedLoadVP(SDValue OrigLoad, const SDLoc &dl, SDValue Base,
+ SDValue Offset, ISD::MemIndexedMode AM);
+ SDValue getStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr,
+ SDValue Mask, SDValue EVL, MachinePointerInfo PtrInfo,
+ Align Alignment, MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo = AAMDNodes(),
+ bool IsCompressing = false);
+ SDValue getStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr,
+ SDValue Mask, SDValue EVL, MachineMemOperand *MMO,
+ bool IsCompressing = false);
+ SDValue getTruncStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val,
+ SDValue Ptr, SDValue Mask, SDValue EVL,
+ MachinePointerInfo PtrInfo, EVT SVT, Align Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo, bool IsCompressing = false);
+ SDValue getTruncStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val,
+ SDValue Ptr, SDValue Mask, SDValue EVL, EVT SVT,
+ MachineMemOperand *MMO, bool IsCompressing = false);
+ SDValue getIndexedStoreVP(SDValue OrigStore, const SDLoc &dl, SDValue Base,
+ SDValue Offset, ISD::MemIndexedMode AM);
+
+ SDValue getGatherVP(SDVTList VTs, EVT VT, const SDLoc &dl,
+ ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
+ ISD::MemIndexType IndexType);
+ SDValue getScatterVP(SDVTList VTs, EVT VT, const SDLoc &dl,
+ ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
+ ISD::MemIndexType IndexType);
+
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base,
SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT,
MachineMemOperand *MMO, ISD::MemIndexedMode AM,
@@ -1664,10 +1732,6 @@ public:
SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT,
ArrayRef<SDValue> Ops);
- SDValue FoldConstantVectorArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT,
- ArrayRef<SDValue> Ops,
- const SDNodeFlags Flags = SDNodeFlags());
-
/// Fold floating-point operations with 2 operands when both operands are
/// constants and/or undefined.
SDValue foldConstantFPMath(unsigned Opcode, const SDLoc &DL, EVT VT,
@@ -1769,6 +1833,19 @@ public:
unsigned ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
unsigned Depth = 0) const;
+ /// Get the minimum bit size for this Value \p Op as a signed integer.
+ /// i.e. x == sext(trunc(x to MinSignedBits) to bitwidth(x)).
+ /// Similar to the APInt::getMinSignedBits function.
+ /// Helper wrapper to ComputeNumSignBits.
+ unsigned ComputeMinSignedBits(SDValue Op, unsigned Depth = 0) const;
+
+ /// Get the minimum bit size for this Value \p Op as a signed integer.
+ /// i.e. x == sext(trunc(x to MinSignedBits) to bitwidth(x)).
+ /// Similar to the APInt::getMinSignedBits function.
+ /// Helper wrapper to ComputeNumSignBits.
+ unsigned ComputeMinSignedBits(SDValue Op, const APInt &DemandedElts,
+ unsigned Depth = 0) const;
+
/// Return true if this function can prove that \p Op is never poison
/// and, if \p PoisonOnly is false, does not have undef bits.
bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly = false,
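The new helper mirrors APInt::getMinSignedBits; as a sketch of the documented relationship (an assumption spelled out here, not code from this patch), it should agree with deriving the value from ComputeNumSignBits:

#include "llvm/CodeGen/SelectionDAG.h"

static unsigned minSignedBitsViaSignBits(const llvm::SelectionDAG &DAG,
                                         llvm::SDValue Op) {
  // MinSignedBits == BitWidth - NumSignBits + 1 on the scalar element type.
  return Op.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(Op) + 1;
}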
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h b/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
index 4ee58333495b..6a3d76be0ed6 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
@@ -50,6 +50,7 @@ public:
SDValue getIndex() { return Index; }
SDValue getIndex() const { return Index; }
bool hasValidOffset() const { return Offset.hasValue(); }
+ int64_t getOffset() const { return *Offset; }
// Returns true if `Other` and `*this` are both some offset from the same base
// pointer. In that case, `Off` is set to the offset between `*this` and
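Usage note for the new accessor (illustrative): it dereferences the underlying Optional, so callers should check hasValidOffset() first.

#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"

// Return the decomposed constant offset, or 0 when none is known.
static int64_t offsetOrZero(const llvm::BaseIndexOffset &B) {
  return B.hasValidOffset() ? B.getOffset() : 0;
}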
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index deeca98af3f3..2855e1f1e587 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -58,7 +58,6 @@ namespace llvm {
class APInt;
class Constant;
-template <typename T> struct DenseMapInfo;
class GlobalValue;
class MachineBasicBlock;
class MachineConstantPoolValue;
@@ -509,15 +508,19 @@ BEGIN_TWO_BYTE_PACK()
class LSBaseSDNodeBitfields {
friend class LSBaseSDNode;
+ friend class VPLoadStoreSDNode;
friend class MaskedLoadStoreSDNode;
friend class MaskedGatherScatterSDNode;
+ friend class VPGatherScatterSDNode;
uint16_t : NumMemSDNodeBits;
// This storage is shared between disparate class hierarchies to hold an
// enumeration specific to the class hierarchy in use.
// LSBaseSDNode => enum ISD::MemIndexedMode
+ // VPLoadStoreBaseSDNode => enum ISD::MemIndexedMode
// MaskedLoadStoreBaseSDNode => enum ISD::MemIndexedMode
+ // VPGatherScatterSDNode => enum ISD::MemIndexType
// MaskedGatherScatterSDNode => enum ISD::MemIndexType
uint16_t AddressingMode : 3;
};
@@ -525,8 +528,10 @@ BEGIN_TWO_BYTE_PACK()
class LoadSDNodeBitfields {
friend class LoadSDNode;
+ friend class VPLoadSDNode;
friend class MaskedLoadSDNode;
friend class MaskedGatherSDNode;
+ friend class VPGatherSDNode;
uint16_t : NumLSBaseSDNodeBits;
@@ -536,8 +541,10 @@ BEGIN_TWO_BYTE_PACK()
class StoreSDNodeBitfields {
friend class StoreSDNode;
+ friend class VPStoreSDNode;
friend class MaskedStoreSDNode;
friend class MaskedScatterSDNode;
+ friend class VPScatterSDNode;
uint16_t : NumLSBaseSDNodeBits;
@@ -1353,7 +1360,9 @@ public:
const SDValue &getBasePtr() const {
switch (getOpcode()) {
case ISD::STORE:
+ case ISD::VP_STORE:
case ISD::MSTORE:
+ case ISD::VP_SCATTER:
return getOperand(2);
case ISD::MGATHER:
case ISD::MSCATTER:
@@ -1393,6 +1402,10 @@ public:
case ISD::MSTORE:
case ISD::MGATHER:
case ISD::MSCATTER:
+ case ISD::VP_LOAD:
+ case ISD::VP_STORE:
+ case ISD::VP_GATHER:
+ case ISD::VP_SCATTER:
return true;
default:
return N->isMemIntrinsic() || N->isTargetMemoryOpcode();
@@ -1563,8 +1576,12 @@ public:
Align getAlignValue() const { return Value->getAlignValue(); }
bool isOne() const { return Value->isOne(); }
- bool isNullValue() const { return Value->isZero(); }
- bool isAllOnesValue() const { return Value->isMinusOne(); }
+ bool isZero() const { return Value->isZero(); }
+ // NOTE: This is soft-deprecated. Please use `isZero()` instead.
+ bool isNullValue() const { return isZero(); }
+ bool isAllOnes() const { return Value->isMinusOne(); }
+ // NOTE: This is soft-deprecated. Please use `isAllOnes()` instead.
+ bool isAllOnesValue() const { return isAllOnes(); }
bool isMaxSignedValue() const { return Value->isMaxValue(true); }
bool isMinSignedValue() const { return Value->isMinValue(true); }
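A small sketch of the renamed predicates in client code; the *Value() spellings remain only as soft-deprecated aliases:

#include "llvm/CodeGen/SelectionDAGNodes.h"

static bool isZeroOrAllOnesConstant(const llvm::SDNode *N) {
  if (const auto *C = llvm::dyn_cast<llvm::ConstantSDNode>(N))
    return C->isZero() || C->isAllOnes();
  return false;
}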
@@ -2031,8 +2048,25 @@ public:
int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements,
uint32_t BitWidth) const;
+ /// Extract the raw bit data from a build vector of Undef, Constant or
+ /// ConstantFP node elements. Each raw bit element will be \p
+ /// DstEltSizeInBits wide, undef elements are treated as zero, and entirely
+ /// undefined elements are flagged in \p UndefElements.
+ bool getConstantRawBits(bool IsLittleEndian, unsigned DstEltSizeInBits,
+ SmallVectorImpl<APInt> &RawBitElements,
+ BitVector &UndefElements) const;
+
bool isConstant() const;
+ /// Recast bit data \p SrcBitElements to \p DstEltSizeInBits wide elements.
+ /// Undef elements are treated as zero, and entirely undefined elements are
+ /// flagged in \p DstUndefElements.
+ static void recastRawBits(bool IsLittleEndian, unsigned DstEltSizeInBits,
+ SmallVectorImpl<APInt> &DstBitElements,
+ ArrayRef<APInt> SrcBitElements,
+ BitVector &DstUndefElements,
+ const BitVector &SrcUndefElements);
+
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::BUILD_VECTOR;
}
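Illustrative caller for the new raw-bit extraction (the 32-bit lane width is an arbitrary choice for the example):

#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"

// Extract 32-bit raw lanes from a BUILD_VECTOR; lanes that are entirely undef
// are reported through Undefs rather than through the bit values.
static bool extractRawLanes(const llvm::BuildVectorSDNode *BV,
                            llvm::SmallVectorImpl<llvm::APInt> &Bits,
                            llvm::BitVector &Undefs) {
  return BV->getConstantRawBits(/*IsLittleEndian=*/true,
                                /*DstEltSizeInBits=*/32, Bits, Undefs);
}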
@@ -2318,6 +2352,116 @@ public:
}
};
+/// This base class is used to represent VP_LOAD and VP_STORE nodes
+class VPLoadStoreSDNode : public MemSDNode {
+public:
+ friend class SelectionDAG;
+
+ VPLoadStoreSDNode(ISD::NodeType NodeTy, unsigned Order, const DebugLoc &dl,
+ SDVTList VTs, ISD::MemIndexedMode AM, EVT MemVT,
+ MachineMemOperand *MMO)
+ : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {
+ LSBaseSDNodeBits.AddressingMode = AM;
+ assert(getAddressingMode() == AM && "Value truncated");
+ }
+
+ // VPLoadSDNode (Chain, Ptr, Offset, Mask, EVL)
+ // VPStoreSDNode (Chain, Data, Ptr, Offset, Mask, EVL)
+ // Mask is a vector of i1 elements;
+ // the type of EVL is TLI.getVPExplicitVectorLengthTy().
+ const SDValue &getOffset() const {
+ return getOperand(getOpcode() == ISD::VP_LOAD ? 2 : 3);
+ }
+ const SDValue &getBasePtr() const {
+ return getOperand(getOpcode() == ISD::VP_LOAD ? 1 : 2);
+ }
+ const SDValue &getMask() const {
+ return getOperand(getOpcode() == ISD::VP_LOAD ? 3 : 4);
+ }
+ const SDValue &getVectorLength() const {
+ return getOperand(getOpcode() == ISD::VP_LOAD ? 4 : 5);
+ }
+
+ /// Return the addressing mode for this load or store:
+ /// unindexed, pre-inc, pre-dec, post-inc, or post-dec.
+ ISD::MemIndexedMode getAddressingMode() const {
+ return static_cast<ISD::MemIndexedMode>(LSBaseSDNodeBits.AddressingMode);
+ }
+
+ /// Return true if this is a pre/post inc/dec load/store.
+ bool isIndexed() const { return getAddressingMode() != ISD::UNINDEXED; }
+
+ /// Return true if this is NOT a pre/post inc/dec load/store.
+ bool isUnindexed() const { return getAddressingMode() == ISD::UNINDEXED; }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == ISD::VP_LOAD || N->getOpcode() == ISD::VP_STORE;
+ }
+};
+
+/// This class is used to represent a VP_LOAD node
+class VPLoadSDNode : public VPLoadStoreSDNode {
+public:
+ friend class SelectionDAG;
+
+ VPLoadSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
+ ISD::MemIndexedMode AM, ISD::LoadExtType ETy, bool isExpanding,
+ EVT MemVT, MachineMemOperand *MMO)
+ : VPLoadStoreSDNode(ISD::VP_LOAD, Order, dl, VTs, AM, MemVT, MMO) {
+ LoadSDNodeBits.ExtTy = ETy;
+ LoadSDNodeBits.IsExpanding = isExpanding;
+ }
+
+ ISD::LoadExtType getExtensionType() const {
+ return static_cast<ISD::LoadExtType>(LoadSDNodeBits.ExtTy);
+ }
+
+ const SDValue &getBasePtr() const { return getOperand(1); }
+ const SDValue &getOffset() const { return getOperand(2); }
+ const SDValue &getMask() const { return getOperand(3); }
+ const SDValue &getVectorLength() const { return getOperand(4); }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == ISD::VP_LOAD;
+ }
+ bool isExpandingLoad() const { return LoadSDNodeBits.IsExpanding; }
+};
+
+/// This class is used to represent a VP_STORE node
+class VPStoreSDNode : public VPLoadStoreSDNode {
+public:
+ friend class SelectionDAG;
+
+ VPStoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
+ ISD::MemIndexedMode AM, bool isTrunc, bool isCompressing,
+ EVT MemVT, MachineMemOperand *MMO)
+ : VPLoadStoreSDNode(ISD::VP_STORE, Order, dl, VTs, AM, MemVT, MMO) {
+ StoreSDNodeBits.IsTruncating = isTrunc;
+ StoreSDNodeBits.IsCompressing = isCompressing;
+ }
+
+ /// Return true if this is a truncating store.
+ /// For integers this is the same as doing a TRUNCATE and storing the result.
+ /// For floats, it is the same as doing an FP_ROUND and storing the result.
+ bool isTruncatingStore() const { return StoreSDNodeBits.IsTruncating; }
+
+ /// Returns true if the op does a compression to the vector before storing.
+ /// The node contiguously stores the active elements (integers or floats)
+ /// in src (those with their respective bit set in writemask k) to unaligned
+ /// memory at base_addr.
+ bool isCompressingStore() const { return StoreSDNodeBits.IsCompressing; }
+
+ const SDValue &getValue() const { return getOperand(1); }
+ const SDValue &getBasePtr() const { return getOperand(2); }
+ const SDValue &getOffset() const { return getOperand(3); }
+ const SDValue &getMask() const { return getOperand(4); }
+ const SDValue &getVectorLength() const { return getOperand(5); }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == ISD::VP_STORE;
+ }
+};
+
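A sketch of client code (assumed to include llvm/CodeGen/SelectionDAGNodes.h) dispatching over the new VP memory nodes:

// Return the mask operand of a VP load/store, or an empty SDValue otherwise.
static llvm::SDValue getVPMemMask(const llvm::SDNode *N) {
  if (const auto *Ld = llvm::dyn_cast<llvm::VPLoadSDNode>(N))
    return Ld->getMask();
  if (const auto *St = llvm::dyn_cast<llvm::VPStoreSDNode>(N))
    return St->getMask();
  return llvm::SDValue();
}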
/// This base class is used to represent MLOAD and MSTORE nodes
class MaskedLoadStoreSDNode : public MemSDNode {
public:
@@ -2424,6 +2568,94 @@ public:
};
/// This is a base class used to represent
+/// VP_GATHER and VP_SCATTER nodes
+///
+class VPGatherScatterSDNode : public MemSDNode {
+public:
+ friend class SelectionDAG;
+
+ VPGatherScatterSDNode(ISD::NodeType NodeTy, unsigned Order,
+ const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+ MachineMemOperand *MMO, ISD::MemIndexType IndexType)
+ : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {
+ LSBaseSDNodeBits.AddressingMode = IndexType;
+ assert(getIndexType() == IndexType && "Value truncated");
+ }
+
+ /// How is Index applied to BasePtr when computing addresses.
+ ISD::MemIndexType getIndexType() const {
+ return static_cast<ISD::MemIndexType>(LSBaseSDNodeBits.AddressingMode);
+ }
+ bool isIndexScaled() const {
+ return (getIndexType() == ISD::SIGNED_SCALED) ||
+ (getIndexType() == ISD::UNSIGNED_SCALED);
+ }
+ bool isIndexSigned() const {
+ return (getIndexType() == ISD::SIGNED_SCALED) ||
+ (getIndexType() == ISD::SIGNED_UNSCALED);
+ }
+
+  // Operand layout for the two nodes:
+ // VPGatherSDNode (Chain, base, index, scale, mask, vlen)
+ // VPScatterSDNode (Chain, value, base, index, scale, mask, vlen)
+ // Mask is a vector of i1 elements
+ const SDValue &getBasePtr() const {
+ return getOperand((getOpcode() == ISD::VP_GATHER) ? 1 : 2);
+ }
+ const SDValue &getIndex() const {
+ return getOperand((getOpcode() == ISD::VP_GATHER) ? 2 : 3);
+ }
+ const SDValue &getScale() const {
+ return getOperand((getOpcode() == ISD::VP_GATHER) ? 3 : 4);
+ }
+ const SDValue &getMask() const {
+ return getOperand((getOpcode() == ISD::VP_GATHER) ? 4 : 5);
+ }
+ const SDValue &getVectorLength() const {
+ return getOperand((getOpcode() == ISD::VP_GATHER) ? 5 : 6);
+ }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == ISD::VP_GATHER ||
+ N->getOpcode() == ISD::VP_SCATTER;
+ }
+};
+
+/// This class is used to represent a VP_GATHER node
+///
+class VPGatherSDNode : public VPGatherScatterSDNode {
+public:
+ friend class SelectionDAG;
+
+ VPGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+ MachineMemOperand *MMO, ISD::MemIndexType IndexType)
+ : VPGatherScatterSDNode(ISD::VP_GATHER, Order, dl, VTs, MemVT, MMO,
+ IndexType) {}
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == ISD::VP_GATHER;
+ }
+};
+
+/// This class is used to represent a VP_SCATTER node
+///
+class VPScatterSDNode : public VPGatherScatterSDNode {
+public:
+ friend class SelectionDAG;
+
+ VPScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+ MachineMemOperand *MMO, ISD::MemIndexType IndexType)
+ : VPGatherScatterSDNode(ISD::VP_SCATTER, Order, dl, VTs, MemVT, MMO,
+ IndexType) {}
+
+ const SDValue &getValue() const { return getOperand(1); }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == ISD::VP_SCATTER;
+ }
+};
+
+/// This is a base class used to represent
/// MGATHER and MSCATTER nodes
///
class MaskedGatherScatterSDNode : public MemSDNode {
diff --git a/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h b/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h
index 51f1d7d6fd21..bc22d7789856 100644
--- a/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h
+++ b/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h
@@ -183,12 +183,12 @@ struct JumpTableHeader {
const Value *SValue;
MachineBasicBlock *HeaderBB;
bool Emitted;
- bool OmitRangeCheck;
+ bool FallthroughUnreachable;
JumpTableHeader(APInt F, APInt L, const Value *SV, MachineBasicBlock *H,
bool E = false)
: First(std::move(F)), Last(std::move(L)), SValue(SV), HeaderBB(H),
- Emitted(E), OmitRangeCheck(false) {}
+ Emitted(E), FallthroughUnreachable(false) {}
};
using JumpTableBlock = std::pair<JumpTableHeader, JumpTable>;
@@ -218,14 +218,14 @@ struct BitTestBlock {
BitTestInfo Cases;
BranchProbability Prob;
BranchProbability DefaultProb;
- bool OmitRangeCheck;
+ bool FallthroughUnreachable;
BitTestBlock(APInt F, APInt R, const Value *SV, unsigned Rg, MVT RgVT, bool E,
bool CR, MachineBasicBlock *P, MachineBasicBlock *D,
BitTestInfo C, BranchProbability Pr)
: First(std::move(F)), Range(std::move(R)), SValue(SV), Reg(Rg),
RegVT(RgVT), Emitted(E), ContiguousRange(CR), Parent(P), Default(D),
- Cases(std::move(C)), Prob(Pr), OmitRangeCheck(false) {}
+ Cases(std::move(C)), Prob(Pr), FallthroughUnreachable(false) {}
};
/// Return the range of values within a range.
diff --git a/llvm/include/llvm/CodeGen/TargetCallingConv.h b/llvm/include/llvm/CodeGen/TargetCallingConv.h
index 29e644898f6b..7713dd0800c0 100644
--- a/llvm/include/llvm/CodeGen/TargetCallingConv.h
+++ b/llvm/include/llvm/CodeGen/TargetCallingConv.h
@@ -247,11 +247,11 @@ namespace ISD {
unsigned PartOffset;
OutputArg() = default;
- OutputArg(ArgFlagsTy flags, EVT vt, EVT argvt, bool isfixed,
+ OutputArg(ArgFlagsTy flags, MVT vt, EVT argvt, bool isfixed,
unsigned origIdx, unsigned partOffs)
- : Flags(flags), IsFixed(isfixed), OrigArgIndex(origIdx),
- PartOffset(partOffs) {
- VT = vt.getSimpleVT();
+ : Flags(flags), IsFixed(isfixed), OrigArgIndex(origIdx),
+ PartOffset(partOffs) {
+ VT = vt;
ArgVT = argvt;
}
};
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 05d0591f1e5d..8bc730a3eda5 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -411,9 +411,12 @@ public:
/// This method returns a null pointer if the transformation cannot be
/// performed, otherwise it returns the last new instruction.
///
- virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineInstr &MI,
- LiveVariables *LV) const {
+ /// If \p LIS is not nullptr, the LiveIntervals info should be updated for
+ /// replacing \p MI with new instructions, even though this function does not
+ /// remove MI.
+ virtual MachineInstr *convertToThreeAddress(MachineInstr &MI,
+ LiveVariables *LV,
+ LiveIntervals *LIS) const {
return nullptr;
}
@@ -583,15 +586,14 @@ public:
}
/// Insert an unconditional indirect branch at the end of \p MBB to \p
- /// NewDestBB. \p BrOffset indicates the offset of \p NewDestBB relative to
+  /// NewDestBB. Optionally, insert code to restore clobbered registers in \p
+  /// RestoreBB. \p BrOffset indicates the offset of \p NewDestBB relative to
/// the offset of the position to insert the new branch.
- ///
- /// \returns The number of bytes added to the block.
- virtual unsigned insertIndirectBranch(MachineBasicBlock &MBB,
- MachineBasicBlock &NewDestBB,
- const DebugLoc &DL,
- int64_t BrOffset = 0,
- RegScavenger *RS = nullptr) const {
+ virtual void insertIndirectBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock &NewDestBB,
+ MachineBasicBlock &RestoreBB,
+ const DebugLoc &DL, int64_t BrOffset = 0,
+ RegScavenger *RS = nullptr) const {
llvm_unreachable("target did not implement");
}
@@ -1537,7 +1539,8 @@ public:
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
virtual bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
- Register &SrcReg2, int &Mask, int &Value) const {
+ Register &SrcReg2, int64_t &Mask,
+ int64_t &Value) const {
return false;
}
@@ -1545,7 +1548,8 @@ public:
/// into something more efficient. E.g., on ARM most instructions can set the
/// flags register, obviating the need for a separate CMP.
virtual bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
- Register SrcReg2, int Mask, int Value,
+ Register SrcReg2, int64_t Mask,
+ int64_t Value,
const MachineRegisterInfo *MRI) const {
return false;
}
@@ -1624,9 +1628,6 @@ public:
unsigned defaultDefLatency(const MCSchedModel &SchedModel,
const MachineInstr &DefMI) const;
- int computeDefOperandLatency(const InstrItineraryData *ItinData,
- const MachineInstr &DefMI) const;
-
/// Return true if this opcode has high latency to its result.
virtual bool isHighLatencyDef(int opc) const { return false; }
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 692dc4d7d4cf..87f5168ec48f 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -30,6 +30,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/LowLevelType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -371,10 +372,18 @@ public:
return getPointerTy(DL);
}
- /// EVT is not used in-tree, but is used by out-of-tree target.
- /// A documentation for this function would be nice...
+ /// Return the type to use for a scalar shift opcode, given the shifted amount
+ /// type. Targets should return a legal type if the input type is legal.
+ /// Targets can return a type that is too small if the input type is illegal.
virtual MVT getScalarShiftAmountTy(const DataLayout &, EVT) const;
+ /// Returns the type for the shift amount of a shift opcode. For vectors,
+ /// returns the input type. For scalars, behavior depends on \p LegalTypes. If
+ /// \p LegalTypes is true, calls getScalarShiftAmountTy, otherwise uses
+ /// pointer type. If getScalarShiftAmountTy or pointer type cannot represent
+ /// all possible shift amounts, returns MVT::i32. In general, \p LegalTypes
+ /// should be set to true for calls during type legalization and after type
+ /// legalization has been completed.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL,
bool LegalTypes = true) const;
@@ -591,7 +600,7 @@ public:
/// Returns if it's reasonable to merge stores to MemVT size.
virtual bool canMergeStoresTo(unsigned AS, EVT MemVT,
- const SelectionDAG &DAG) const {
+ const MachineFunction &MF) const {
return true;
}
@@ -1396,6 +1405,11 @@ public:
return NVT;
}
+ virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty,
+ bool AllowUnknown = false) const {
+ return getValueType(DL, Ty, AllowUnknown);
+ }
+
/// Return the EVT corresponding to this LLVM type. This is fixed by the LLVM
/// operations except for the pointer size. If AllowUnknown is true, this
/// will return MVT::Other for types with no EVT counterpart (e.g. structs),
@@ -1448,7 +1462,7 @@ public:
/// Return the desired alignment for ByVal or InAlloca aggregate function
/// arguments in the caller parameter area. This is the actual alignment, not
/// its logarithm.
- virtual unsigned getByValTypeAlignment(Type *Ty, const DataLayout &DL) const;
+ virtual uint64_t getByValTypeAlignment(Type *Ty, const DataLayout &DL) const;
/// Return the type of registers that this ValueType will eventually require.
MVT getRegisterType(MVT VT) const {
@@ -1763,9 +1777,7 @@ public:
Align getPrefFunctionAlignment() const { return PrefFunctionAlignment; }
/// Return the preferred loop alignment.
- virtual Align getPrefLoopAlignment(MachineLoop *ML = nullptr) const {
- return PrefLoopAlignment;
- }
+ virtual Align getPrefLoopAlignment(MachineLoop *ML = nullptr) const;
/// Should loops be aligned even when the function is marked OptSize (but not
/// MinSize).
@@ -2077,6 +2089,20 @@ public:
return false;
}
+ /// Return true if it may be profitable to transform
+ /// (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
+ /// This may not be true if c1 and c2 can be represented as immediates but
+ /// c1*c2 cannot, for example.
+ /// The target should check if c1, c2 and c1*c2 can be represented as
+  /// immediates, or have to be materialized into registers. If the target is
+  /// unsure about some cases, it can return true by default and let the
+  /// DAGCombiner decide.
+ /// AddNode is (add x, c1), and ConstNode is c2.
+ virtual bool isMulAddWithConstProfitable(const SDValue &AddNode,
+ const SDValue &ConstNode) const {
+ return true;
+ }
+
/// Return true if it is more correct/profitable to use strict FP_TO_INT
/// conversion operations - canonicalizing the FP source value instead of
/// converting all cases and then selecting based on value.
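A worked instance of the fold this hook guards (illustrative constants):

//   (mul (add x, 3), 5)  -->  (add (mul x, 5), 15)
// A target would return false here if 3 and 5 fit its immediate encodings but
// 15 would have to be materialized into a register.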
@@ -2177,8 +2203,7 @@ protected:
/// Indicate that the specified operation does not work with the specified
/// type and indicate what to do about it. Note that VT may refer to either
/// the type of a result or that of an operand of Op.
- void setOperationAction(unsigned Op, MVT VT,
- LegalizeAction Action) {
+ void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action) {
assert(Op < array_lengthof(OpActions[0]) && "Table isn't big enough!");
OpActions[(unsigned)VT.SimpleTy][Op] = Action;
}
@@ -2197,8 +2222,7 @@ protected:
/// Indicate that the specified truncating store does not work with the
/// specified type and indicate what to do about it.
- void setTruncStoreAction(MVT ValVT, MVT MemVT,
- LegalizeAction Action) {
+ void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action) {
assert(ValVT.isValid() && MemVT.isValid() && "Table isn't big enough!");
TruncStoreActions[(unsigned)ValVT.SimpleTy][MemVT.SimpleTy] = Action;
}
@@ -2506,8 +2530,11 @@ public:
return false;
}
- virtual bool isTruncateFree(EVT FromVT, EVT ToVT) const {
- return false;
+ virtual bool isTruncateFree(EVT FromVT, EVT ToVT) const { return false; }
+ virtual bool isTruncateFree(LLT FromTy, LLT ToTy, const DataLayout &DL,
+ LLVMContext &Ctx) const {
+ return isTruncateFree(getApproximateEVTForLLT(FromTy, DL, Ctx),
+ getApproximateEVTForLLT(ToTy, DL, Ctx));
}
virtual bool isProfitableToHoist(Instruction *I) const { return true; }
@@ -2583,8 +2610,11 @@ public:
return false;
}
- virtual bool isZExtFree(EVT FromTy, EVT ToTy) const {
- return false;
+ virtual bool isZExtFree(EVT FromTy, EVT ToTy) const { return false; }
+ virtual bool isZExtFree(LLT FromTy, LLT ToTy, const DataLayout &DL,
+ LLVMContext &Ctx) const {
+ return isZExtFree(getApproximateEVTForLLT(FromTy, DL, Ctx),
+ getApproximateEVTForLLT(ToTy, DL, Ctx));
}
/// Return true if sign-extension from FromTy to ToTy is cheaper than
@@ -3807,7 +3837,7 @@ public:
RetSExt = Call.hasRetAttr(Attribute::SExt);
RetZExt = Call.hasRetAttr(Attribute::ZExt);
NoMerge = Call.hasFnAttr(Attribute::NoMerge);
-
+
Callee = Target;
CallConv = Call.getCallingConv();
@@ -4424,33 +4454,29 @@ public:
/// Expand CTPOP nodes. Expands vector/scalar CTPOP nodes,
/// vector nodes can only succeed if all operations are legal/custom.
/// \param N Node to expand
- /// \param Result output after conversion
- /// \returns True, if the expansion was successful, false otherwise
- bool expandCTPOP(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+ /// \returns The expansion result or SDValue() if it fails.
+ SDValue expandCTPOP(SDNode *N, SelectionDAG &DAG) const;
/// Expand CTLZ/CTLZ_ZERO_UNDEF nodes. Expands vector/scalar CTLZ nodes,
/// vector nodes can only succeed if all operations are legal/custom.
/// \param N Node to expand
- /// \param Result output after conversion
- /// \returns True, if the expansion was successful, false otherwise
- bool expandCTLZ(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+ /// \returns The expansion result or SDValue() if it fails.
+ SDValue expandCTLZ(SDNode *N, SelectionDAG &DAG) const;
/// Expand CTTZ/CTTZ_ZERO_UNDEF nodes. Expands vector/scalar CTTZ nodes,
/// vector nodes can only succeed if all operations are legal/custom.
/// \param N Node to expand
- /// \param Result output after conversion
- /// \returns True, if the expansion was successful, false otherwise
- bool expandCTTZ(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+ /// \returns The expansion result or SDValue() if it fails.
+ SDValue expandCTTZ(SDNode *N, SelectionDAG &DAG) const;
/// Expand ABS nodes. Expands vector/scalar ABS nodes,
/// vector nodes can only succeed if all operations are legal/custom.
/// (ABS x) -> (XOR (ADD x, (SRA x, type_size)), (SRA x, type_size))
/// \param N Node to expand
- /// \param Result output after conversion
/// \param IsNegative indicate negated abs
- /// \returns True, if the expansion was successful, false otherwise
- bool expandABS(SDNode *N, SDValue &Result, SelectionDAG &DAG,
- bool IsNegative = false) const;
+ /// \returns The expansion result or SDValue() if it fails.
+ SDValue expandABS(SDNode *N, SelectionDAG &DAG,
+ bool IsNegative = false) const;
/// Expand BSWAP nodes. Expands scalar/vector BSWAP nodes with i16/i32/i64
/// scalar types. Returns SDValue() if expand fails.
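A sketch of the call-site pattern the new signatures imply: the expansion is returned directly and an empty SDValue() signals failure, replacing the bool plus out-parameter form.

#include "llvm/CodeGen/TargetLowering.h"

static llvm::SDValue tryExpandCTPOP(const llvm::TargetLowering &TLI,
                                    llvm::SDNode *N, llvm::SelectionDAG &DAG) {
  if (llvm::SDValue Res = TLI.expandCTPOP(N, DAG))
    return Res;
  return llvm::SDValue(); // caller falls back to default legalization
}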
diff --git a/llvm/include/llvm/CodeGen/TargetPassConfig.h b/llvm/include/llvm/CodeGen/TargetPassConfig.h
index 11138039a3c5..9b13b61fc9de 100644
--- a/llvm/include/llvm/CodeGen/TargetPassConfig.h
+++ b/llvm/include/llvm/CodeGen/TargetPassConfig.h
@@ -187,8 +187,7 @@ public:
void substitutePass(AnalysisID StandardID, IdentifyingPassPtr TargetID);
/// Insert InsertedPassID pass after TargetPassID pass.
- void insertPass(AnalysisID TargetPassID, IdentifyingPassPtr InsertedPassID,
- bool VerifyAfter = true);
+ void insertPass(AnalysisID TargetPassID, IdentifyingPassPtr InsertedPassID);
/// Allow the target to enable a specific standard pass by default.
void enablePass(AnalysisID PassID) { substitutePass(PassID, PassID); }
@@ -323,8 +322,7 @@ public:
/// Add standard passes after a pass that has just been added. For example,
/// the MachineVerifier if it is enabled.
- void addMachinePostPasses(const std::string &Banner, bool AllowVerify = true,
- bool AllowStrip = true);
+ void addMachinePostPasses(const std::string &Banner);
/// Check whether or not GlobalISel should abort on error.
/// When this is disabled, GlobalISel will fall back on SDISel instead of
@@ -449,16 +447,12 @@ protected:
/// Add a CodeGen pass at this point in the pipeline after checking overrides.
/// Return the pass that was added, or zero if no pass was added.
- /// @p verifyAfter if true and adding a machine function pass add an extra
- /// machine verification pass afterwards.
- AnalysisID addPass(AnalysisID PassID, bool verifyAfter = true);
+ AnalysisID addPass(AnalysisID PassID);
/// Add a pass to the PassManager if that pass is supposed to be run, as
/// determined by the StartAfter and StopAfter options. Takes ownership of the
/// pass.
- /// @p verifyAfter if true and adding a machine function pass add an extra
- /// machine verification pass afterwards.
- void addPass(Pass *P, bool verifyAfter = true);
+ void addPass(Pass *P);
/// addMachinePasses helper to create the target-selected or overriden
/// regalloc pass.
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 92ce5b737090..8483d078ca74 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -57,6 +57,8 @@ public:
/// Classes with a higher priority value are assigned first by register
/// allocators using a greedy heuristic. The value is in the range [0,63].
const uint8_t AllocationPriority;
+ /// Configurable target specific flags.
+ const uint8_t TSFlags;
/// Whether the class supports two (or more) disjunct subregister indices.
const bool HasDisjunctSubRegs;
/// Whether a combination of subregisters can cover every register in the
@@ -871,10 +873,6 @@ public:
/// (3) Bottom-up allocation is no longer guaranteed to optimally color.
virtual bool reverseLocalAssignment() const { return false; }
- /// Add the allocation priority to global and split ranges as well as the
- /// local ranges when registers are added to the queue.
- virtual bool addAllocPriorityToGlobalRanges() const { return false; }
-
/// Allow the target to override the cost of using a callee-saved register for
/// the first time. Default value of 0 means we will use a callee-saved
/// register if it is available.
diff --git a/llvm/include/llvm/CodeGen/TargetSchedule.h b/llvm/include/llvm/CodeGen/TargetSchedule.h
index aa6b82e14aa6..049ede89ab46 100644
--- a/llvm/include/llvm/CodeGen/TargetSchedule.h
+++ b/llvm/include/llvm/CodeGen/TargetSchedule.h
@@ -15,7 +15,6 @@
#ifndef LLVM_CODEGEN_TARGETSCHEDULE_H
#define LLVM_CODEGEN_TARGETSCHEDULE_H
-#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Config/llvm-config.h"
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index 0e88e705e16b..7f989e08e9bf 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -216,6 +216,7 @@ def untyped : ValueType<8, 174>; // Produces an untyped value
def funcref : ValueType<0, 175>; // WebAssembly's funcref type
def externref : ValueType<0, 176>; // WebAssembly's externref type
def x86amx : ValueType<8192, 177>; // X86 AMX value
+def i64x8 : ValueType<512, 178>; // 8 Consecutive GPRs (AArch64)
def token : ValueType<0, 248>; // TokenTy
@@ -243,7 +244,7 @@ def Any : ValueType<0, 255>;
/// This class is for targets that want to use pointer types in patterns
/// with the GlobalISelEmitter. Targets must define their own pointer
/// derived from this class. The scalar argument should be an
-/// integer type with the same bit size as the ponter.
+/// integer type with the same bit size as the pointer.
/// e.g. def p0 : PtrValueType <i64, 0>;
class PtrValueType <ValueType scalar, int addrspace> :
diff --git a/llvm/include/llvm/DWARFLinker/DWARFLinker.h b/llvm/include/llvm/DWARFLinker/DWARFLinker.h
index 7b89c9f66f86..1c6d0b1ead86 100644
--- a/llvm/include/llvm/DWARFLinker/DWARFLinker.h
+++ b/llvm/include/llvm/DWARFLinker/DWARFLinker.h
@@ -80,7 +80,7 @@ public:
CompileUnit::DIEInfo &Info) = 0;
/// Apply the valid relocations to the buffer \p Data, taking into
- /// account that Data is at \p BaseOffset in the debug_info section.
+ /// account that Data is at \p BaseOffset in the .debug_info section.
///
/// \returns true whether any reloc has been applied.
virtual bool applyValidRelocs(MutableArrayRef<char> Data, uint64_t BaseOffset,
@@ -109,7 +109,7 @@ public:
/// Emit section named SecName with data SecData.
virtual void emitSectionContents(StringRef SecData, StringRef SecName) = 0;
- /// Emit the abbreviation table \p Abbrevs to the debug_abbrev section.
+ /// Emit the abbreviation table \p Abbrevs to the .debug_abbrev section.
virtual void
emitAbbrevs(const std::vector<std::unique_ptr<DIEAbbrev>> &Abbrevs,
unsigned DwarfVersion) = 0;
@@ -137,7 +137,7 @@ public:
virtual void
emitAppleTypes(AccelTable<AppleAccelTableStaticTypeData> &Table) = 0;
- /// Emit debug_ranges for \p FuncRange by translating the
+ /// Emit .debug_ranges for \p FuncRange by translating the
/// original \p Entries.
virtual void emitRangesEntries(
int64_t UnitPcOffset, uint64_t OrigLowPc,
@@ -145,17 +145,17 @@ public:
const std::vector<DWARFDebugRangeList::RangeListEntry> &Entries,
unsigned AddressSize) = 0;
- /// Emit debug_aranges entries for \p Unit and if \p DoRangesSection is true,
- /// also emit the debug_ranges entries for the DW_TAG_compile_unit's
+ /// Emit .debug_aranges entries for \p Unit and if \p DoRangesSection is true,
+ /// also emit the .debug_ranges entries for the DW_TAG_compile_unit's
/// DW_AT_ranges attribute.
virtual void emitUnitRangesEntries(CompileUnit &Unit,
bool DoRangesSection) = 0;
- /// Copy the debug_line over to the updated binary while unobfuscating the
+ /// Copy the .debug_line over to the updated binary while unobfuscating the
/// file names and directories.
virtual void translateLineTable(DataExtractor LineData, uint64_t Offset) = 0;
- /// Emit the line table described in \p Rows into the debug_line section.
+ /// Emit the line table described in \p Rows into the .debug_line section.
virtual void emitLineTableForUnit(MCDwarfLineTableParams Params,
StringRef PrologueBytes,
unsigned MinInstLength,
@@ -175,7 +175,7 @@ public:
virtual void emitFDE(uint32_t CIEOffset, uint32_t AddreSize, uint32_t Address,
StringRef Bytes) = 0;
- /// Emit the debug_loc contribution for \p Unit by copying the entries from
+ /// Emit the .debug_loc contribution for \p Unit by copying the entries from
/// \p Dwarf and offsetting them. Update the location attributes to point to
/// the new entries.
virtual void emitLocationsForUnit(
@@ -184,7 +184,7 @@ public:
ProcessExpr) = 0;
/// Emit the compilation unit header for \p Unit in the
- /// debug_info section.
+ /// .debug_info section.
///
/// As a side effect, this also switches the current Dwarf version
/// of the MC layer to the one of U.getOrigUnit().
@@ -695,7 +695,7 @@ private:
/// Assign an abbreviation number to \p Abbrev
void assignAbbrev(DIEAbbrev &Abbrev);
- /// Compute and emit debug_ranges section for \p Unit, and
+ /// Compute and emit .debug_ranges section for \p Unit, and
/// patch the attributes referencing it.
void patchRangesForUnit(const CompileUnit &Unit, DWARFContext &Dwarf,
const DWARFFile &File) const;
@@ -706,7 +706,7 @@ private:
/// Extract the line tables from the original dwarf, extract the relevant
/// parts according to the linked function ranges and emit the result in the
- /// debug_line section.
+ /// .debug_line section.
void patchLineTableForUnit(CompileUnit &Unit, DWARFContext &OrigDwarf,
const DWARFFile &File);
@@ -753,7 +753,7 @@ private:
StringMap<uint32_t> EmittedCIEs;
/// Offset of the last CIE that has been emitted in the output
- /// debug_frame section.
+ /// .debug_frame section.
uint32_t LastCIEOffset = 0;
/// Apple accelerator tables.
diff --git a/llvm/include/llvm/DebugInfo/CodeView/CVRecord.h b/llvm/include/llvm/DebugInfo/CodeView/CVRecord.h
index 18392e3608e7..99de8ebef812 100644
--- a/llvm/include/llvm/DebugInfo/CodeView/CVRecord.h
+++ b/llvm/include/llvm/DebugInfo/CodeView/CVRecord.h
@@ -10,7 +10,6 @@
#define LLVM_DEBUGINFO_CODEVIEW_CVRECORD_H
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/Optional.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/DebugInfo/CodeView/CodeViewError.h"
#include "llvm/DebugInfo/CodeView/RecordSerialization.h"
diff --git a/llvm/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def b/llvm/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def
index 48ea7e52c172..4cee3abdde87 100644
--- a/llvm/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def
+++ b/llvm/include/llvm/DebugInfo/CodeView/CodeViewRegisters.def
@@ -373,7 +373,7 @@ CV_REGISTER(AMD64_K7, 765)
CV_REGISTER(ARM_NOREG, 0)
-// General purpose 32-bit integer regisers
+// General purpose 32-bit integer registers
CV_REGISTER(ARM_R0, 10)
CV_REGISTER(ARM_R1, 11)
diff --git a/llvm/include/llvm/DebugInfo/CodeView/TypeIndex.h b/llvm/include/llvm/DebugInfo/CodeView/TypeIndex.h
index bdc6cf46509b..226a436c0930 100644
--- a/llvm/include/llvm/DebugInfo/CodeView/TypeIndex.h
+++ b/llvm/include/llvm/DebugInfo/CodeView/TypeIndex.h
@@ -18,6 +18,7 @@
namespace llvm {
class ScopedPrinter;
+class StringRef;
namespace codeview {
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h
index dcb26f12b13e..cdf3f60f88be 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h
@@ -144,6 +144,27 @@ public:
const dwarf::Attribute Attr,
const DWARFUnit &U) const;
+ /// Compute an offset from a DIE specified by DIE offset and attribute index.
+ ///
+ /// \param AttrIndex an index of DWARF attribute.
+ /// \param DIEOffset the DIE offset that points to the ULEB128 abbreviation
+ /// code in the .debug_info data.
+ /// \param U the DWARFUnit that contains the DIE.
+ /// \returns an offset of the attribute.
+ uint64_t getAttributeOffsetFromIndex(uint32_t AttrIndex, uint64_t DIEOffset,
+ const DWARFUnit &U) const;
+
+ /// Extract a DWARF form value from a DIE specified by attribute index and
+ /// its offset.
+ ///
+ /// \param AttrIndex an index of DWARF attribute.
+ /// \param Offset offset of the attribute.
+ /// \param U the DWARFUnit that contains the DIE.
+ /// \returns Optional DWARF form value if the attribute was extracted.
+ Optional<DWARFFormValue>
+ getAttributeValueFromOffset(uint32_t AttrIndex, uint64_t Offset,
+ const DWARFUnit &U) const;
+
bool extract(DataExtractor Data, uint64_t* OffsetPtr);
void dump(raw_ostream &OS) const;
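A minimal sketch of how the two accessors added above might be combined: first compute the attribute's offset from its index, then decode the form value stored there. The helper name readAttr and its surrounding includes are illustrative; only the two member functions shown in the hunk are taken from the header.

    #include "llvm/ADT/Optional.h"
    #include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
    #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
    #include "llvm/DebugInfo/DWARF/DWARFUnit.h"
    #include <cstdint>

    // Resolve one attribute of a DIE in two steps, as the new API allows.
    static llvm::Optional<llvm::DWARFFormValue>
    readAttr(const llvm::DWARFAbbreviationDeclaration &Abbrev, uint32_t AttrIndex,
             uint64_t DIEOffset, const llvm::DWARFUnit &U) {
      // Step 1: skip the preceding attributes to find this attribute's offset.
      uint64_t Offset = Abbrev.getAttributeOffsetFromIndex(AttrIndex, DIEOffset, U);
      // Step 2: extract the form value stored at that offset.
      return Abbrev.getAttributeValueFromOffset(AttrIndex, Offset, U);
    }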
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFAddressRange.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFAddressRange.h
index 154f7893aa17..537a03ec11fc 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFAddressRange.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFAddressRange.h
@@ -39,6 +39,8 @@ struct DWARFAddressRange {
/// Returns true if [LowPC, HighPC) intersects with [RHS.LowPC, RHS.HighPC).
bool intersects(const DWARFAddressRange &RHS) const {
assert(valid() && RHS.valid());
+ if (SectionIndex != RHS.SectionIndex)
+ return false;
// Empty ranges can't intersect.
if (LowPC == HighPC || RHS.LowPC == RHS.HighPC)
return false;
@@ -69,12 +71,12 @@ struct DWARFAddressRange {
inline bool operator<(const DWARFAddressRange &LHS,
const DWARFAddressRange &RHS) {
- return std::tie(LHS.LowPC, LHS.HighPC) < std::tie(RHS.LowPC, RHS.HighPC);
+ return std::tie(LHS.SectionIndex, LHS.LowPC, LHS.HighPC) < std::tie(RHS.SectionIndex, RHS.LowPC, RHS.HighPC);
}
inline bool operator==(const DWARFAddressRange &LHS,
const DWARFAddressRange &RHS) {
- return std::tie(LHS.LowPC, LHS.HighPC) == std::tie(RHS.LowPC, RHS.HighPC);
+ return std::tie(LHS.SectionIndex, LHS.LowPC, LHS.HighPC) == std::tie(RHS.SectionIndex, RHS.LowPC, RHS.HighPC);
}
raw_ostream &operator<<(raw_ostream &OS, const DWARFAddressRange &R);
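A hedged illustration of folding SectionIndex into the comparisons above: two ranges with identical [LowPC, HighPC) but different section indices now compare unequal and never intersect. It assumes the three-argument constructor this struct provides for unit tests.

    #include "llvm/DebugInfo/DWARF/DWARFAddressRange.h"
    #include <cassert>

    void sectionAwareRanges() {
      llvm::DWARFAddressRange A(/*LowPC=*/0x1000, /*HighPC=*/0x2000, /*SectionIndex=*/1);
      llvm::DWARFAddressRange B(/*LowPC=*/0x1000, /*HighPC=*/0x2000, /*SectionIndex=*/2);
      assert(!(A == B) && "same PC range, different sections: not equal");
      assert(!A.intersects(B) && "ranges in different sections never intersect");
    }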
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h
index 75b2280658f1..902973ff5722 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h
@@ -243,6 +243,7 @@ public:
}
DWARFCompileUnit *getDWOCompileUnitForHash(uint64_t Hash);
+ DWARFTypeUnit *getTypeUnitForHash(uint16_t Version, uint64_t Hash, bool IsDWO);
/// Return the compile unit that includes an offset (relative to .debug_info).
DWARFCompileUnit *getCompileUnitForOffset(uint64_t Offset);
@@ -373,8 +374,24 @@ public:
return {2, 4, 8};
}
static bool isAddressSizeSupported(unsigned AddressSize) {
- return llvm::any_of(getSupportedAddressSizes(),
- [=](auto Elem) { return Elem == AddressSize; });
+ return llvm::is_contained(getSupportedAddressSizes(), AddressSize);
+ }
+ template <typename... Ts>
+ static Error checkAddressSizeSupported(unsigned AddressSize,
+ std::error_code EC, char const *Fmt,
+ const Ts &...Vals) {
+ if (isAddressSizeSupported(AddressSize))
+ return Error::success();
+ std::string Buffer;
+ raw_string_ostream Stream(Buffer);
+ Stream << format(Fmt, Vals...)
+ << " has unsupported address size: " << AddressSize
+ << " (supported are ";
+ ListSeparator LS;
+ for (unsigned Size : DWARFContext::getSupportedAddressSizes())
+ Stream << LS << Size;
+ Stream << ')';
+ return make_error<StringError>(Stream.str(), EC);
}
std::shared_ptr<DWARFContext> getDWOContext(StringRef AbsolutePath);
@@ -387,9 +404,12 @@ public:
function_ref<void(Error)> getWarningHandler() { return WarningHandler; }
+ enum class ProcessDebugRelocations { Process, Ignore };
+
static std::unique_ptr<DWARFContext>
- create(const object::ObjectFile &Obj, const LoadedObjectInfo *L = nullptr,
- std::string DWPName = "",
+ create(const object::ObjectFile &Obj,
+ ProcessDebugRelocations RelocAction = ProcessDebugRelocations::Process,
+ const LoadedObjectInfo *L = nullptr, std::string DWPName = "",
std::function<void(Error)> RecoverableErrorHandler =
WithColor::defaultErrorHandler,
std::function<void(Error)> WarningHandler =
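A short sketch of calling the reworked factory and the new address-size check. The relocation policy now comes directly after the object file, ahead of the LoadedObjectInfo pointer, so skipping relocation processing is a one-argument change; checkAddressSizeSupported() wraps the old predicate in a formatted Error. The format string and error code below are illustrative, not taken from the header.

    #include "llvm/DebugInfo/DWARF/DWARFContext.h"
    #include "llvm/Object/ObjectFile.h"
    #include "llvm/Support/Errc.h"

    // Build a context without applying debug relocations.
    std::unique_ptr<llvm::DWARFContext>
    makeUnrelocatedContext(const llvm::object::ObjectFile &Obj) {
      return llvm::DWARFContext::create(
          Obj, llvm::DWARFContext::ProcessDebugRelocations::Ignore);
    }

    // Reject an unsupported address size with a formatted error message.
    llvm::Error checkUnitAddressSize(unsigned AddrSize, uint64_t UnitOffset) {
      return llvm::DWARFContext::checkAddressSizeSupported(
          AddrSize, llvm::errc::invalid_argument,
          "compile unit at offset 0x%8.8llx", (unsigned long long)UnitOffset);
    }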
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h
index 0bfe9f376f46..c4370cb54113 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h
@@ -24,9 +24,11 @@ class DWARFDebugInfoEntry {
/// Offset within the .debug_info of the start of this entry.
uint64_t Offset = 0;
- /// The integer depth of this DIE within the compile unit DIEs where the
- /// compile/type unit DIE has a depth of zero.
- uint32_t Depth = 0;
+ /// Index of the parent DIE. UINT32_MAX if there is no parent.
+ uint32_t ParentIdx = UINT32_MAX;
+
+ /// Index of the sibling DIE. Zero if there is no sibling.
+ uint32_t SiblingIdx = 0;
const DWARFAbbreviationDeclaration *AbbrevDecl = nullptr;
@@ -36,15 +38,31 @@ public:
/// Extracts a debug info entry, which is a child of a given unit,
/// starting at a given offset. If DIE can't be extracted, returns false and
/// doesn't change OffsetPtr.
- bool extractFast(const DWARFUnit &U, uint64_t *OffsetPtr);
-
/// High performance extraction should use this call.
bool extractFast(const DWARFUnit &U, uint64_t *OffsetPtr,
const DWARFDataExtractor &DebugInfoData, uint64_t UEndOffset,
- uint32_t Depth);
+ uint32_t ParentIdx);
uint64_t getOffset() const { return Offset; }
- uint32_t getDepth() const { return Depth; }
+
+ /// Returns the index of the parent DIE.
+ Optional<uint32_t> getParentIdx() const {
+ if (ParentIdx == UINT32_MAX)
+ return None;
+
+ return ParentIdx;
+ }
+
+ /// Returns the index of the sibling DIE.
+ Optional<uint32_t> getSiblingIdx() const {
+ if (SiblingIdx == 0)
+ return None;
+
+ return SiblingIdx;
+ }
+
+ /// Set index of sibling.
+ void setSiblingIdx(uint32_t Idx) { SiblingIdx = Idx; }
dwarf::Tag getTag() const {
return AbbrevDecl ? AbbrevDecl->getTag() : dwarf::DW_TAG_null;
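A minimal sketch of walking the new index-based links. The flat DieArray vector and the way indices map into it are assumptions about the unit's internal storage; only getSiblingIdx() comes from the header above.

    #include "llvm/ADT/Optional.h"
    #include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
    #include <vector>

    // Count how many siblings follow the DIE at index Idx in a flat DIE array.
    uint32_t countFollowingSiblings(
        const std::vector<llvm::DWARFDebugInfoEntry> &DieArray, uint32_t Idx) {
      uint32_t N = 0;
      // getSiblingIdx() returns None once the chain ends (stored index 0).
      while (llvm::Optional<uint32_t> Next = DieArray[Idx].getSiblingIdx()) {
        Idx = *Next;
        ++N;
      }
      return N;
    }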
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
index d1d65372740b..ee15b6d4112d 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
@@ -110,10 +110,6 @@ public:
/// Length of the prologue in bytes.
uint64_t getLength() const;
- int32_t getMaxLineIncrementForSpecialOpcode() const {
- return LineBase + (int8_t)LineRange - 1;
- }
-
/// Get DWARF-version aware access to the file name entry at the provided
/// index.
const llvm::DWARFDebugLine::FileNameEntry &
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
index 2f72c642a2d5..0d9f37c5610b 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
@@ -49,12 +49,7 @@ public:
/// 2. An address, which defines the appropriate base address for
/// use in interpreting the beginning and ending address offsets of
/// subsequent entries of the location list.
- bool isBaseAddressSelectionEntry(uint8_t AddressSize) const {
- assert(AddressSize == 4 || AddressSize == 8);
- if (AddressSize == 4)
- return StartAddress == -1U;
- return StartAddress == -1ULL;
- }
+ bool isBaseAddressSelectionEntry(uint8_t AddressSize) const;
};
private:
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h
index 1903bab5e73f..8f93ebc4ebc0 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDie.h
@@ -182,6 +182,8 @@ public:
DWARFDie getAttributeValueAsReferencedDie(dwarf::Attribute Attr) const;
DWARFDie getAttributeValueAsReferencedDie(const DWARFFormValue &V) const;
+ DWARFDie resolveTypeUnitReference() const;
+
/// Extract the range base attribute from this DIE as absolute section offset.
///
/// This is a utility function that checks for either the DW_AT_rnglists_base
@@ -220,16 +222,6 @@ public:
/// information is available.
Expected<DWARFAddressRangesVector> getAddressRanges() const;
- /// Get all address ranges for any DW_TAG_subprogram DIEs in this DIE or any
- /// of its children.
- ///
- /// Get the hi/low PC range if both attributes are available or exrtracts the
- /// non-contiguous address ranges from the DW_AT_ranges attribute for this DIE
- /// and all children.
- ///
- /// \param Ranges the addres range vector to fill in.
- void collectChildrenAddressRanges(DWARFAddressRangesVector &Ranges) const;
-
bool addressRangeContainsAddress(const uint64_t Address) const;
Expected<DWARFLocationExpressionsVector>
@@ -246,6 +238,8 @@ public:
/// for ShortName if LinkageName is not found.
/// Returns null if no name is found.
const char *getName(DINameKind Kind) const;
+ void getFullName(raw_string_ostream &,
+ std::string *OriginalFullName = nullptr) const;
/// Return the DIE short name resolving DW_AT_specification or
/// DW_AT_abstract_origin references if necessary. Returns null if no name
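A small, hedged example of the stream-based getFullName() added above. The wrapper name is made up; OriginalFullName is optional and its exact semantics are defined in the implementation file, so the comment only restates the signature.

    #include "llvm/DebugInfo/DWARF/DWARFDie.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>

    // Render the full (qualified) name of a DIE into a std::string.
    std::string fullName(const llvm::DWARFDie &Die) {
      std::string Buf;
      llvm::raw_string_ostream OS(Buf);
      Die.getFullName(OS); // OriginalFullName defaults to nullptr.
      return OS.str();
    }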
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h
index 794e859bfe72..b694eeacfd9d 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h
@@ -86,24 +86,30 @@ public:
uint64_t OperandEndOffsets[2];
public:
- Description &getDescription() { return Desc; }
- uint8_t getCode() { return Opcode; }
- uint64_t getRawOperand(unsigned Idx) { return Operands[Idx]; }
- uint64_t getOperandEndOffset(unsigned Idx) { return OperandEndOffsets[Idx]; }
- uint64_t getEndOffset() { return EndOffset; }
- bool extract(DataExtractor Data, uint8_t AddressSize, uint64_t Offset,
- Optional<dwarf::DwarfFormat> Format);
- bool isError() { return Error; }
+ const Description &getDescription() const { return Desc; }
+ uint8_t getCode() const { return Opcode; }
+ uint64_t getRawOperand(unsigned Idx) const { return Operands[Idx]; }
+ uint64_t getOperandEndOffset(unsigned Idx) const {
+ return OperandEndOffsets[Idx];
+ }
+ uint64_t getEndOffset() const { return EndOffset; }
+ bool isError() const { return Error; }
bool print(raw_ostream &OS, DIDumpOptions DumpOpts,
const DWARFExpression *Expr, const MCRegisterInfo *RegInfo,
- DWARFUnit *U, bool isEH);
- bool verify(DWARFUnit *U);
+ DWARFUnit *U, bool isEH) const;
+
+ /// Verify \p Op. Does not affect the return of \a isError().
+ static bool verify(const Operation &Op, DWARFUnit *U);
+
+ private:
+ bool extract(DataExtractor Data, uint8_t AddressSize, uint64_t Offset,
+ Optional<dwarf::DwarfFormat> Format);
};
/// An iterator to go through the expression operations.
class iterator
: public iterator_facade_base<iterator, std::forward_iterator_tag,
- Operation> {
+ const Operation> {
friend class DWARFExpression;
const DWARFExpression *Expr;
uint64_t Offset;
@@ -116,19 +122,17 @@ public:
}
public:
- class Operation &operator++() {
+ iterator &operator++() {
Offset = Op.isError() ? Expr->Data.getData().size() : Op.EndOffset;
Op.Error =
Offset >= Expr->Data.getData().size() ||
!Op.extract(Expr->Data, Expr->AddressSize, Offset, Expr->Format);
- return Op;
+ return *this;
}
- class Operation &operator*() {
- return Op;
- }
+ const Operation &operator*() const { return Op; }
- iterator skipBytes(uint64_t Add) {
+ iterator skipBytes(uint64_t Add) const {
return iterator(Expr, Op.EndOffset + Add);
}
@@ -159,6 +163,8 @@ public:
bool operator==(const DWARFExpression &RHS) const;
+ StringRef getData() const { return Data.getData(); }
+
private:
DataExtractor Data;
uint8_t AddressSize;
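A hedged sketch of iterating an expression with the const-qualified Operation interface shown above; verify() is now a static helper that takes the operation explicitly. The loop assumes DWARFExpression's usual begin()/end() iteration.

    #include "llvm/DebugInfo/DWARF/DWARFExpression.h"

    // Check that every operation of an expression decodes and verifies cleanly.
    bool allOperationsValid(const llvm::DWARFExpression &Expr, llvm::DWARFUnit *U) {
      for (const llvm::DWARFExpression::Operation &Op : Expr) {
        if (Op.isError())
          return false;
        if (!llvm::DWARFExpression::Operation::verify(Op, U))
          return false;
      }
      return true;
    }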
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
index 43be024f0d37..3c051c3ea018 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
@@ -102,10 +102,6 @@ public:
return extractValue(Data, OffsetPtr, FormParams, nullptr, U);
}
- bool isInlinedCStr() const {
- return Value.data != nullptr && Value.data == (const uint8_t *)Value.cstr;
- }
-
/// getAsFoo functions below return the extracted value as Foo if only
/// DWARFFormValue has form class is suitable for representing Foo.
Optional<uint64_t> getAsReference() const;
@@ -123,6 +119,19 @@ public:
Optional<ArrayRef<uint8_t>> getAsBlock() const;
Optional<uint64_t> getAsCStringOffset() const;
Optional<uint64_t> getAsReferenceUVal() const;
+ /// Correctly extract any file paths from a form value.
+ ///
+ /// These values can come from DW_AT_decl_file or DW_AT_call_file
+ /// attributes. We need to use the file index in the correct DWARFUnit's line
+ /// table prologue, and each DWARFFormValue has the DWARFUnit the form value
+ /// was extracted from.
+ ///
+ /// \param Kind The kind of path to extract.
+ ///
+ /// \returns A valid string value on success, or llvm::None if the form class
+ /// is not FC_Constant, or if the file index is not valid.
+ Optional<std::string>
+ getAsFile(DILineInfoSpecifier::FileLineInfoKind Kind) const;
/// Skip a form's value in \p DebugInfoData at the offset specified by
/// \p OffsetPtr.
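A short sketch of the new getAsFile(): because the form value remembers its DWARFUnit, resolving a DW_AT_decl_file index to a path needs only the desired path kind. The wrapper function is illustrative.

    #include "llvm/ADT/Optional.h"
    #include "llvm/DebugInfo/DIContext.h"
    #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
    #include <string>

    // Turn a DW_AT_decl_file / DW_AT_call_file value into an absolute path.
    llvm::Optional<std::string> declFilePath(const llvm::DWARFFormValue &V) {
      return V.getAsFile(
          llvm::DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath);
    }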
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
index 93d7e2b563fd..d471b80c7fe1 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
@@ -49,8 +49,6 @@ public:
DieRangeInfo(std::vector<DWARFAddressRange> Ranges)
: Ranges(std::move(Ranges)) {}
- typedef std::vector<DWARFAddressRange>::const_iterator
- address_range_iterator;
typedef std::set<DieRangeInfo>::const_iterator die_range_info_iterator;
/// Inserts the address range. If the range overlaps with an existing
@@ -62,16 +60,6 @@ public:
/// children address ranges must all be contained in.
Optional<DWARFAddressRange> insert(const DWARFAddressRange &R);
- /// Finds an address range in the sorted vector of ranges.
- address_range_iterator findRange(const DWARFAddressRange &R) const {
- auto Begin = Ranges.begin();
- auto End = Ranges.end();
- auto Iter = std::upper_bound(Begin, End, R);
- if (Iter != Begin)
- --Iter;
- return Iter;
- }
-
/// Inserts the address range info. If any of its ranges overlaps with a
/// range in an existing range info, the range info is *not* added and an
/// iterator to the overlapping range info.
@@ -91,14 +79,11 @@ private:
raw_ostream &OS;
DWARFContext &DCtx;
DIDumpOptions DumpOpts;
- /// A map that tracks all references (converted absolute references) so we
- /// can verify each reference points to a valid DIE and not an offset that
- /// lies between to valid DIEs.
- std::map<uint64_t, std::set<uint64_t>> ReferenceToDIEOffsets;
uint32_t NumDebugLineErrors = 0;
// Used to relax some checks that do not currently work portably
bool IsObjectFile;
bool IsMachOObject;
+ using ReferenceMap = std::map<uint64_t, std::set<uint64_t>>;
raw_ostream &error() const;
raw_ostream &warn() const;
@@ -140,6 +125,7 @@ private:
bool verifyUnitHeader(const DWARFDataExtractor DebugInfoData,
uint64_t *Offset, unsigned UnitIndex, uint8_t &UnitType,
bool &isUnitDWARF64);
+ bool verifyName(const DWARFDie &Die);
/// Verifies the header of a unit in a .debug_info or .debug_types section.
///
@@ -156,7 +142,9 @@ private:
/// \param Unit The DWARF Unit to verify.
///
/// \returns The number of errors that occurred during verification.
- unsigned verifyUnitContents(DWARFUnit &Unit);
+ unsigned verifyUnitContents(DWARFUnit &Unit,
+ ReferenceMap &UnitLocalReferences,
+ ReferenceMap &CrossUnitReferences);
/// Verifies the unit headers and contents in a .debug_info or .debug_types
/// section.
@@ -208,7 +196,9 @@ private:
///
/// \returns NumErrors The number of errors occurred during verification of
/// attributes' forms in a unit
- unsigned verifyDebugInfoForm(const DWARFDie &Die, DWARFAttribute &AttrValue);
+ unsigned verifyDebugInfoForm(const DWARFDie &Die, DWARFAttribute &AttrValue,
+ ReferenceMap &UnitLocalReferences,
+ ReferenceMap &CrossUnitReferences);
/// Verifies the all valid references that were found when iterating through
/// all of the DIE attributes.
@@ -220,7 +210,9 @@ private:
///
/// \returns NumErrors The number of errors occurred during verification of
/// references for the .debug_info and .debug_types sections
- unsigned verifyDebugInfoReferences();
+ unsigned verifyDebugInfoReferences(
+ const ReferenceMap &,
+ llvm::function_ref<DWARFUnit *(uint64_t)> GetUnitForDieOffset);
/// Verify the DW_AT_stmt_list encoding and value and ensure that no
/// compile units that have the same DW_AT_stmt_list value.
diff --git a/llvm/include/llvm/DebugInfo/GSYM/StringTable.h b/llvm/include/llvm/DebugInfo/GSYM/StringTable.h
index f7f800d01647..045c9e3f3ebd 100644
--- a/llvm/include/llvm/DebugInfo/GSYM/StringTable.h
+++ b/llvm/include/llvm/DebugInfo/GSYM/StringTable.h
@@ -9,7 +9,6 @@
#ifndef LLVM_DEBUGINFO_GSYM_STRINGTABLE_H
#define LLVM_DEBUGINFO_GSYM_STRINGTABLE_H
-#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/DebugInfo/GSYM/Range.h"
#include <stdint.h>
diff --git a/llvm/include/llvm/DebugInfo/MSF/MSFCommon.h b/llvm/include/llvm/DebugInfo/MSF/MSFCommon.h
index 83331b14b8af..a922839a999d 100644
--- a/llvm/include/llvm/DebugInfo/MSF/MSFCommon.h
+++ b/llvm/include/llvm/DebugInfo/MSF/MSFCommon.h
@@ -93,6 +93,9 @@ inline bool isValidBlockSize(uint32_t Size) {
case 1024:
case 2048:
case 4096:
+ case 8192:
+ case 16384:
+ case 32768:
return true;
}
return false;
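A one-line check, under the obvious assumption that callers gate large-block MSF files on this predicate: 8 KiB, 16 KiB, and 32 KiB blocks are now accepted alongside the original 512-4096 range.

    #include "llvm/DebugInfo/MSF/MSFCommon.h"

    // Newly valid since the switch above gained the 8K/16K/32K cases.
    bool supportsLargeBlocks() { return llvm::msf::isValidBlockSize(32768); }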
diff --git a/llvm/include/llvm/DebugInfo/MSF/MappedBlockStream.h b/llvm/include/llvm/DebugInfo/MSF/MappedBlockStream.h
index 473c89e8106f..296a4840b779 100644
--- a/llvm/include/llvm/DebugInfo/MSF/MappedBlockStream.h
+++ b/llvm/include/llvm/DebugInfo/MSF/MappedBlockStream.h
@@ -58,12 +58,12 @@ public:
return support::little;
}
- Error readBytes(uint32_t Offset, uint32_t Size,
+ Error readBytes(uint64_t Offset, uint64_t Size,
ArrayRef<uint8_t> &Buffer) override;
- Error readLongestContiguousChunk(uint32_t Offset,
+ Error readLongestContiguousChunk(uint64_t Offset,
ArrayRef<uint8_t> &Buffer) override;
- uint32_t getLength() override;
+ uint64_t getLength() override;
BumpPtrAllocator &getAllocator() { return Allocator; }
@@ -79,10 +79,10 @@ protected:
private:
const MSFStreamLayout &getStreamLayout() const { return StreamLayout; }
- void fixCacheAfterWrite(uint32_t Offset, ArrayRef<uint8_t> Data) const;
+ void fixCacheAfterWrite(uint64_t Offset, ArrayRef<uint8_t> Data) const;
- Error readBytes(uint32_t Offset, MutableArrayRef<uint8_t> Buffer);
- bool tryReadContiguously(uint32_t Offset, uint32_t Size,
+ Error readBytes(uint64_t Offset, MutableArrayRef<uint8_t> Buffer);
+ bool tryReadContiguously(uint64_t Offset, uint64_t Size,
ArrayRef<uint8_t> &Buffer);
const uint32_t BlockSize;
@@ -125,13 +125,13 @@ public:
return support::little;
}
- Error readBytes(uint32_t Offset, uint32_t Size,
+ Error readBytes(uint64_t Offset, uint64_t Size,
ArrayRef<uint8_t> &Buffer) override;
- Error readLongestContiguousChunk(uint32_t Offset,
+ Error readLongestContiguousChunk(uint64_t Offset,
ArrayRef<uint8_t> &Buffer) override;
- uint32_t getLength() override;
+ uint64_t getLength() override;
- Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Buffer) override;
+ Error writeBytes(uint64_t Offset, ArrayRef<uint8_t> Buffer) override;
Error commit() override;
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleList.h b/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleList.h
index 5fb13ad30e91..de5b46f21672 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleList.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleList.h
@@ -31,9 +31,7 @@ struct FileInfoSubstreamHeader;
class DbiModuleSourceFilesIterator
: public iterator_facade_base<DbiModuleSourceFilesIterator,
std::random_access_iterator_tag, StringRef> {
- using BaseType =
- iterator_facade_base<DbiModuleSourceFilesIterator,
- std::random_access_iterator_tag, StringRef>;
+ using BaseType = typename DbiModuleSourceFilesIterator::iterator_facade_base;
public:
DbiModuleSourceFilesIterator(const DbiModuleList &Modules, uint32_t Modi,
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/HashTable.h b/llvm/include/llvm/DebugInfo/PDB/Native/HashTable.h
index 95c0a89551ed..474bd796b2b3 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/HashTable.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/HashTable.h
@@ -38,6 +38,7 @@ class HashTableIterator
: public iterator_facade_base<HashTableIterator<ValueT>,
std::forward_iterator_tag,
const std::pair<uint32_t, ValueT>> {
+ using BaseT = typename HashTableIterator::iterator_facade_base;
friend HashTable<ValueT>;
HashTableIterator(const HashTable<ValueT> &Map, uint32_t Index,
@@ -76,9 +77,7 @@ public:
// Implement postfix op++ in terms of prefix op++ by using the superclass
// implementation.
- using iterator_facade_base<HashTableIterator<ValueT>,
- std::forward_iterator_tag,
- const std::pair<uint32_t, ValueT>>::operator++;
+ using BaseT::operator++;
HashTableIterator &operator++() {
while (Index < Map->Buckets.size()) {
++Index;
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NamedStreamMap.h b/llvm/include/llvm/DebugInfo/PDB/Native/NamedStreamMap.h
index 1df059ffa9fd..f110e90b3f90 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NamedStreamMap.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NamedStreamMap.h
@@ -9,7 +9,6 @@
#ifndef LLVM_DEBUGINFO_PDB_NATIVE_NAMEDSTREAMMAP_H
#define LLVM_DEBUGINFO_PDB_NATIVE_NAMEDSTREAMMAP_H
-#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeLineNumber.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeLineNumber.h
index 5dedc70f11ba..be0ddf0a063a 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeLineNumber.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeLineNumber.h
@@ -9,7 +9,6 @@
#ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVELINENUMBER_H
#define LLVM_DEBUGINFO_PDB_NATIVE_NATIVELINENUMBER_H
-#include "llvm/ADT/Optional.h"
#include "llvm/DebugInfo/CodeView/Line.h"
#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeFunctionSig.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeFunctionSig.h
index 8f1834d0a2c2..90b5d8068959 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeFunctionSig.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeFunctionSig.h
@@ -9,7 +9,6 @@
#ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPEFUNCTIONSIG_H
#define LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPEFUNCTIONSIG_H
-#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeVTShape.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeVTShape.h
index 4ae8f1471781..21995ca665c1 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeVTShape.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeVTShape.h
@@ -9,7 +9,6 @@
#ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPEVTSHAPE_H
#define LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPEVTSHAPE_H
-#include "llvm/ADT/Optional.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/PDBFileBuilder.h b/llvm/include/llvm/DebugInfo/PDB/Native/PDBFileBuilder.h
index 3c414e7a9005..004d005280d4 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/PDBFileBuilder.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/PDBFileBuilder.h
@@ -10,7 +10,6 @@
#define LLVM_DEBUGINFO_PDB_NATIVE_PDBFILEBUILDER_H
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h"
#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
diff --git a/llvm/include/llvm/Demangle/Demangle.h b/llvm/include/llvm/Demangle/Demangle.h
index c396a1dc5dd3..3150e049320b 100644
--- a/llvm/include/llvm/Demangle/Demangle.h
+++ b/llvm/include/llvm/Demangle/Demangle.h
@@ -31,7 +31,6 @@ enum : int {
char *itaniumDemangle(const char *mangled_name, char *buf, size_t *n,
int *status);
-
enum MSDemangleFlags {
MSDF_None = 0,
MSDF_DumpBackrefs = 1 << 0,
@@ -39,6 +38,7 @@ enum MSDemangleFlags {
MSDF_NoCallingConvention = 1 << 2,
MSDF_NoReturnType = 1 << 3,
MSDF_NoMemberType = 1 << 4,
+ MSDF_NoVariableType = 1 << 5,
};
/// Demangles the Microsoft symbol pointed at by mangled_name and returns it.
@@ -53,13 +53,16 @@ enum MSDemangleFlags {
/// receives the size of the demangled string on output if n_buf is not nullptr.
/// status receives one of the demangle_ enum entries above if it's not nullptr.
/// Flags controls various details of the demangled representation.
-char *microsoftDemangle(const char *mangled_name, size_t *n_read,
- char *buf, size_t *n_buf,
- int *status, MSDemangleFlags Flags = MSDF_None);
+char *microsoftDemangle(const char *mangled_name, size_t *n_read, char *buf,
+ size_t *n_buf, int *status,
+ MSDemangleFlags Flags = MSDF_None);
// Demangles a Rust v0 mangled symbol. The API follows that of __cxa_demangle.
char *rustDemangle(const char *MangledName, char *Buf, size_t *N, int *Status);
+// Demangles a D mangled symbol.
+char *dlangDemangle(const char *MangledName);
+
/// Attempt to demangle a string using different demangling schemes.
/// The function uses heuristics to determine which demangling scheme to use.
/// \param MangledName - reference to string to demangle.
@@ -67,6 +70,8 @@ char *rustDemangle(const char *MangledName, char *Buf, size_t *N, int *Status);
/// demangling occurred.
std::string demangle(const std::string &MangledName);
+bool nonMicrosoftDemangle(const char *MangledName, std::string &Result);
+
/// "Partial" demangler. This supports demangling a string into an AST
/// (typically an intermediate stage in itaniumDemangle) and querying certain
/// properties or partially printing the demangled name.
@@ -118,6 +123,7 @@ struct ItaniumPartialDemangler {
bool isSpecialName() const;
~ItaniumPartialDemangler();
+
private:
void *RootNode;
void *Context;
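A hedged sketch of the new free functions: nonMicrosoftDemangle() reports success through its bool result and fills Result in place, so it composes easily with a fallback. Which schemes it tries (presumably Itanium, Rust, and D, given the declarations above) is an assumption, and the printing helper is made up.

    #include "llvm/Demangle/Demangle.h"
    #include <cstdio>
    #include <string>

    // Print a demangled name if one of the non-Microsoft schemes matches,
    // otherwise print the mangled name unchanged.
    void printDemangled(const char *Mangled) {
      std::string Result;
      if (llvm::nonMicrosoftDemangle(Mangled, Result))
        std::printf("%s\n", Result.c_str());
      else
        std::printf("%s\n", Mangled);
    }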
diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h
index 9163b713d118..86f5c992b63d 100644
--- a/llvm/include/llvm/Demangle/ItaniumDemangle.h
+++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h
@@ -57,6 +57,7 @@
X(LocalName) \
X(VectorType) \
X(PixelVectorType) \
+ X(BinaryFPType) \
X(SyntheticTemplateParamName) \
X(TypeTemplateParamDecl) \
X(NonTypeTemplateParamDecl) \
@@ -109,6 +110,126 @@
DEMANGLE_NAMESPACE_BEGIN
+template <class T, size_t N> class PODSmallVector {
+ static_assert(std::is_pod<T>::value,
+ "T is required to be a plain old data type");
+
+ T *First = nullptr;
+ T *Last = nullptr;
+ T *Cap = nullptr;
+ T Inline[N] = {0};
+
+ bool isInline() const { return First == Inline; }
+
+ void clearInline() {
+ First = Inline;
+ Last = Inline;
+ Cap = Inline + N;
+ }
+
+ void reserve(size_t NewCap) {
+ size_t S = size();
+ if (isInline()) {
+ auto *Tmp = static_cast<T *>(std::malloc(NewCap * sizeof(T)));
+ if (Tmp == nullptr)
+ std::terminate();
+ std::copy(First, Last, Tmp);
+ First = Tmp;
+ } else {
+ First = static_cast<T *>(std::realloc(First, NewCap * sizeof(T)));
+ if (First == nullptr)
+ std::terminate();
+ }
+ Last = First + S;
+ Cap = First + NewCap;
+ }
+
+public:
+ PODSmallVector() : First(Inline), Last(First), Cap(Inline + N) {}
+
+ PODSmallVector(const PODSmallVector &) = delete;
+ PODSmallVector &operator=(const PODSmallVector &) = delete;
+
+ PODSmallVector(PODSmallVector &&Other) : PODSmallVector() {
+ if (Other.isInline()) {
+ std::copy(Other.begin(), Other.end(), First);
+ Last = First + Other.size();
+ Other.clear();
+ return;
+ }
+
+ First = Other.First;
+ Last = Other.Last;
+ Cap = Other.Cap;
+ Other.clearInline();
+ }
+
+ PODSmallVector &operator=(PODSmallVector &&Other) {
+ if (Other.isInline()) {
+ if (!isInline()) {
+ std::free(First);
+ clearInline();
+ }
+ std::copy(Other.begin(), Other.end(), First);
+ Last = First + Other.size();
+ Other.clear();
+ return *this;
+ }
+
+ if (isInline()) {
+ First = Other.First;
+ Last = Other.Last;
+ Cap = Other.Cap;
+ Other.clearInline();
+ return *this;
+ }
+
+ std::swap(First, Other.First);
+ std::swap(Last, Other.Last);
+ std::swap(Cap, Other.Cap);
+ Other.clear();
+ return *this;
+ }
+
+ // NOLINTNEXTLINE(readability-identifier-naming)
+ void push_back(const T &Elem) {
+ if (Last == Cap)
+ reserve(size() * 2);
+ *Last++ = Elem;
+ }
+
+ // NOLINTNEXTLINE(readability-identifier-naming)
+ void pop_back() {
+ assert(Last != First && "Popping empty vector!");
+ --Last;
+ }
+
+ void dropBack(size_t Index) {
+ assert(Index <= size() && "dropBack() can't expand!");
+ Last = First + Index;
+ }
+
+ T *begin() { return First; }
+ T *end() { return Last; }
+
+ bool empty() const { return First == Last; }
+ size_t size() const { return static_cast<size_t>(Last - First); }
+ T &back() {
+ assert(Last != First && "Calling back() on empty vector!");
+ return *(Last - 1);
+ }
+ T &operator[](size_t Index) {
+ assert(Index < size() && "Invalid access!");
+ return *(begin() + Index);
+ }
+ void clear() { Last = First; }
+
+ ~PODSmallVector() {
+ if (!isInline())
+ std::free(First);
+ }
+};
+
// Base class of all AST nodes. The AST is built by the parser, then is
// traversed by the printLeft/Right functions to produce a demangled string.
class Node {
@@ -155,50 +276,48 @@ public:
// would construct an equivalent node.
//template<typename Fn> void match(Fn F) const;
- bool hasRHSComponent(OutputStream &S) const {
+ bool hasRHSComponent(OutputBuffer &OB) const {
if (RHSComponentCache != Cache::Unknown)
return RHSComponentCache == Cache::Yes;
- return hasRHSComponentSlow(S);
+ return hasRHSComponentSlow(OB);
}
- bool hasArray(OutputStream &S) const {
+ bool hasArray(OutputBuffer &OB) const {
if (ArrayCache != Cache::Unknown)
return ArrayCache == Cache::Yes;
- return hasArraySlow(S);
+ return hasArraySlow(OB);
}
- bool hasFunction(OutputStream &S) const {
+ bool hasFunction(OutputBuffer &OB) const {
if (FunctionCache != Cache::Unknown)
return FunctionCache == Cache::Yes;
- return hasFunctionSlow(S);
+ return hasFunctionSlow(OB);
}
Kind getKind() const { return K; }
- virtual bool hasRHSComponentSlow(OutputStream &) const { return false; }
- virtual bool hasArraySlow(OutputStream &) const { return false; }
- virtual bool hasFunctionSlow(OutputStream &) const { return false; }
+ virtual bool hasRHSComponentSlow(OutputBuffer &) const { return false; }
+ virtual bool hasArraySlow(OutputBuffer &) const { return false; }
+ virtual bool hasFunctionSlow(OutputBuffer &) const { return false; }
// Dig through "glue" nodes like ParameterPack and ForwardTemplateReference to
// get at a node that actually represents some concrete syntax.
- virtual const Node *getSyntaxNode(OutputStream &) const {
- return this;
- }
+ virtual const Node *getSyntaxNode(OutputBuffer &) const { return this; }
- void print(OutputStream &S) const {
- printLeft(S);
+ void print(OutputBuffer &OB) const {
+ printLeft(OB);
if (RHSComponentCache != Cache::No)
- printRight(S);
+ printRight(OB);
}
- // Print the "left" side of this Node into OutputStream.
- virtual void printLeft(OutputStream &) const = 0;
+ // Print the "left" side of this Node into OutputBuffer.
+ virtual void printLeft(OutputBuffer &) const = 0;
// Print the "right". This distinction is necessary to represent C++ types
// that appear on the RHS of their subtype, such as arrays or functions.
// Since most types don't have such a component, provide a default
// implementation.
- virtual void printRight(OutputStream &) const {}
+ virtual void printRight(OutputBuffer &) const {}
virtual StringView getBaseName() const { return StringView(); }
@@ -227,19 +346,19 @@ public:
Node *operator[](size_t Idx) const { return Elements[Idx]; }
- void printWithComma(OutputStream &S) const {
+ void printWithComma(OutputBuffer &OB) const {
bool FirstElement = true;
for (size_t Idx = 0; Idx != NumElements; ++Idx) {
- size_t BeforeComma = S.getCurrentPosition();
+ size_t BeforeComma = OB.getCurrentPosition();
if (!FirstElement)
- S += ", ";
- size_t AfterComma = S.getCurrentPosition();
- Elements[Idx]->print(S);
+ OB += ", ";
+ size_t AfterComma = OB.getCurrentPosition();
+ Elements[Idx]->print(OB);
// Elements[Idx] is an empty parameter pack expansion, we should erase the
// comma we just printed.
- if (AfterComma == S.getCurrentPosition()) {
- S.setCurrentPosition(BeforeComma);
+ if (AfterComma == OB.getCurrentPosition()) {
+ OB.setCurrentPosition(BeforeComma);
continue;
}
@@ -254,9 +373,7 @@ struct NodeArrayNode : Node {
template<typename Fn> void match(Fn F) const { F(Array); }
- void printLeft(OutputStream &S) const override {
- Array.printWithComma(S);
- }
+ void printLeft(OutputBuffer &OB) const override { Array.printWithComma(OB); }
};
class DotSuffix final : public Node {
@@ -269,11 +386,11 @@ public:
template<typename Fn> void match(Fn F) const { F(Prefix, Suffix); }
- void printLeft(OutputStream &s) const override {
- Prefix->print(s);
- s += " (";
- s += Suffix;
- s += ")";
+ void printLeft(OutputBuffer &OB) const override {
+ Prefix->print(OB);
+ OB += " (";
+ OB += Suffix;
+ OB += ")";
}
};
@@ -288,12 +405,12 @@ public:
template <typename Fn> void match(Fn F) const { F(Ty, Ext, TA); }
- void printLeft(OutputStream &S) const override {
- Ty->print(S);
- S += " ";
- S += Ext;
+ void printLeft(OutputBuffer &OB) const override {
+ Ty->print(OB);
+ OB += " ";
+ OB += Ext;
if (TA != nullptr)
- TA->print(S);
+ TA->print(OB);
}
};
@@ -319,13 +436,13 @@ protected:
const Qualifiers Quals;
const Node *Child;
- void printQuals(OutputStream &S) const {
+ void printQuals(OutputBuffer &OB) const {
if (Quals & QualConst)
- S += " const";
+ OB += " const";
if (Quals & QualVolatile)
- S += " volatile";
+ OB += " volatile";
if (Quals & QualRestrict)
- S += " restrict";
+ OB += " restrict";
}
public:
@@ -336,22 +453,22 @@ public:
template<typename Fn> void match(Fn F) const { F(Child, Quals); }
- bool hasRHSComponentSlow(OutputStream &S) const override {
- return Child->hasRHSComponent(S);
+ bool hasRHSComponentSlow(OutputBuffer &OB) const override {
+ return Child->hasRHSComponent(OB);
}
- bool hasArraySlow(OutputStream &S) const override {
- return Child->hasArray(S);
+ bool hasArraySlow(OutputBuffer &OB) const override {
+ return Child->hasArray(OB);
}
- bool hasFunctionSlow(OutputStream &S) const override {
- return Child->hasFunction(S);
+ bool hasFunctionSlow(OutputBuffer &OB) const override {
+ return Child->hasFunction(OB);
}
- void printLeft(OutputStream &S) const override {
- Child->printLeft(S);
- printQuals(S);
+ void printLeft(OutputBuffer &OB) const override {
+ Child->printLeft(OB);
+ printQuals(OB);
}
- void printRight(OutputStream &S) const override { Child->printRight(S); }
+ void printRight(OutputBuffer &OB) const override { Child->printRight(OB); }
};
class ConversionOperatorType final : public Node {
@@ -363,9 +480,9 @@ public:
template<typename Fn> void match(Fn F) const { F(Ty); }
- void printLeft(OutputStream &S) const override {
- S += "operator ";
- Ty->print(S);
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "operator ";
+ Ty->print(OB);
}
};
@@ -379,9 +496,9 @@ public:
template<typename Fn> void match(Fn F) const { F(Ty, Postfix); }
- void printLeft(OutputStream &s) const override {
- Ty->printLeft(s);
- s += Postfix;
+ void printLeft(OutputBuffer &OB) const override {
+ Ty->printLeft(OB);
+ OB += Postfix;
}
};
@@ -396,7 +513,7 @@ public:
StringView getName() const { return Name; }
StringView getBaseName() const override { return Name; }
- void printLeft(OutputStream &s) const override { s += Name; }
+ void printLeft(OutputBuffer &OB) const override { OB += Name; }
};
class ElaboratedTypeSpefType : public Node {
@@ -408,10 +525,10 @@ public:
template<typename Fn> void match(Fn F) const { F(Kind, Child); }
- void printLeft(OutputStream &S) const override {
- S += Kind;
- S += ' ';
- Child->print(S);
+ void printLeft(OutputBuffer &OB) const override {
+ OB += Kind;
+ OB += ' ';
+ Child->print(OB);
}
};
@@ -426,11 +543,11 @@ struct AbiTagAttr : Node {
template<typename Fn> void match(Fn F) const { F(Base, Tag); }
- void printLeft(OutputStream &S) const override {
- Base->printLeft(S);
- S += "[abi:";
- S += Tag;
- S += "]";
+ void printLeft(OutputBuffer &OB) const override {
+ Base->printLeft(OB);
+ OB += "[abi:";
+ OB += Tag;
+ OB += "]";
}
};
@@ -442,10 +559,10 @@ public:
template<typename Fn> void match(Fn F) const { F(Conditions); }
- void printLeft(OutputStream &S) const override {
- S += " [enable_if:";
- Conditions.printWithComma(S);
- S += ']';
+ void printLeft(OutputBuffer &OB) const override {
+ OB += " [enable_if:";
+ Conditions.printWithComma(OB);
+ OB += ']';
}
};
@@ -466,11 +583,11 @@ public:
static_cast<const NameType *>(Ty)->getName() == "objc_object";
}
- void printLeft(OutputStream &S) const override {
- Ty->print(S);
- S += "<";
- S += Protocol;
- S += ">";
+ void printLeft(OutputBuffer &OB) const override {
+ Ty->print(OB);
+ OB += "<";
+ OB += Protocol;
+ OB += ">";
}
};
@@ -484,34 +601,34 @@ public:
template<typename Fn> void match(Fn F) const { F(Pointee); }
- bool hasRHSComponentSlow(OutputStream &S) const override {
- return Pointee->hasRHSComponent(S);
+ bool hasRHSComponentSlow(OutputBuffer &OB) const override {
+ return Pointee->hasRHSComponent(OB);
}
- void printLeft(OutputStream &s) const override {
+ void printLeft(OutputBuffer &OB) const override {
// We rewrite objc_object<SomeProtocol>* into id<SomeProtocol>.
if (Pointee->getKind() != KObjCProtoName ||
!static_cast<const ObjCProtoName *>(Pointee)->isObjCObject()) {
- Pointee->printLeft(s);
- if (Pointee->hasArray(s))
- s += " ";
- if (Pointee->hasArray(s) || Pointee->hasFunction(s))
- s += "(";
- s += "*";
+ Pointee->printLeft(OB);
+ if (Pointee->hasArray(OB))
+ OB += " ";
+ if (Pointee->hasArray(OB) || Pointee->hasFunction(OB))
+ OB += "(";
+ OB += "*";
} else {
const auto *objcProto = static_cast<const ObjCProtoName *>(Pointee);
- s += "id<";
- s += objcProto->Protocol;
- s += ">";
+ OB += "id<";
+ OB += objcProto->Protocol;
+ OB += ">";
}
}
- void printRight(OutputStream &s) const override {
+ void printRight(OutputBuffer &OB) const override {
if (Pointee->getKind() != KObjCProtoName ||
!static_cast<const ObjCProtoName *>(Pointee)->isObjCObject()) {
- if (Pointee->hasArray(s) || Pointee->hasFunction(s))
- s += ")";
- Pointee->printRight(s);
+ if (Pointee->hasArray(OB) || Pointee->hasFunction(OB))
+ OB += ")";
+ Pointee->printRight(OB);
}
}
};
@@ -531,15 +648,30 @@ class ReferenceType : public Node {
// Dig through any refs to refs, collapsing the ReferenceTypes as we go. The
// rule here is rvalue ref to rvalue ref collapses to a rvalue ref, and any
// other combination collapses to a lvalue ref.
- std::pair<ReferenceKind, const Node *> collapse(OutputStream &S) const {
+ //
+ // A combination of a TemplateForwardReference and a back-ref Substitution
+ // from an ill-formed string may have created a cycle; use cycle detection to
+ // avoid looping forever.
+ std::pair<ReferenceKind, const Node *> collapse(OutputBuffer &OB) const {
auto SoFar = std::make_pair(RK, Pointee);
+ // Track the chain of nodes for Floyd's 'tortoise and hare'
+ // cycle-detection algorithm, since getSyntaxNode(OB) is impure
+ PODSmallVector<const Node *, 8> Prev;
for (;;) {
- const Node *SN = SoFar.second->getSyntaxNode(S);
+ const Node *SN = SoFar.second->getSyntaxNode(OB);
if (SN->getKind() != KReferenceType)
break;
auto *RT = static_cast<const ReferenceType *>(SN);
SoFar.second = RT->Pointee;
SoFar.first = std::min(SoFar.first, RT->RK);
+
+ // The middle of Prev is the 'slow' pointer moving at half speed
+ Prev.push_back(SoFar.second);
+ if (Prev.size() > 1 && SoFar.second == Prev[(Prev.size() - 1) / 2]) {
+ // Cycle detected
+ SoFar.second = nullptr;
+ break;
+ }
}
return SoFar;
}
@@ -551,31 +683,35 @@ public:
template<typename Fn> void match(Fn F) const { F(Pointee, RK); }
- bool hasRHSComponentSlow(OutputStream &S) const override {
- return Pointee->hasRHSComponent(S);
+ bool hasRHSComponentSlow(OutputBuffer &OB) const override {
+ return Pointee->hasRHSComponent(OB);
}
- void printLeft(OutputStream &s) const override {
+ void printLeft(OutputBuffer &OB) const override {
if (Printing)
return;
SwapAndRestore<bool> SavePrinting(Printing, true);
- std::pair<ReferenceKind, const Node *> Collapsed = collapse(s);
- Collapsed.second->printLeft(s);
- if (Collapsed.second->hasArray(s))
- s += " ";
- if (Collapsed.second->hasArray(s) || Collapsed.second->hasFunction(s))
- s += "(";
+ std::pair<ReferenceKind, const Node *> Collapsed = collapse(OB);
+ if (!Collapsed.second)
+ return;
+ Collapsed.second->printLeft(OB);
+ if (Collapsed.second->hasArray(OB))
+ OB += " ";
+ if (Collapsed.second->hasArray(OB) || Collapsed.second->hasFunction(OB))
+ OB += "(";
- s += (Collapsed.first == ReferenceKind::LValue ? "&" : "&&");
+ OB += (Collapsed.first == ReferenceKind::LValue ? "&" : "&&");
}
- void printRight(OutputStream &s) const override {
+ void printRight(OutputBuffer &OB) const override {
if (Printing)
return;
SwapAndRestore<bool> SavePrinting(Printing, true);
- std::pair<ReferenceKind, const Node *> Collapsed = collapse(s);
- if (Collapsed.second->hasArray(s) || Collapsed.second->hasFunction(s))
- s += ")";
- Collapsed.second->printRight(s);
+ std::pair<ReferenceKind, const Node *> Collapsed = collapse(OB);
+ if (!Collapsed.second)
+ return;
+ if (Collapsed.second->hasArray(OB) || Collapsed.second->hasFunction(OB))
+ OB += ")";
+ Collapsed.second->printRight(OB);
}
};
@@ -590,24 +726,24 @@ public:
template<typename Fn> void match(Fn F) const { F(ClassType, MemberType); }
- bool hasRHSComponentSlow(OutputStream &S) const override {
- return MemberType->hasRHSComponent(S);
+ bool hasRHSComponentSlow(OutputBuffer &OB) const override {
+ return MemberType->hasRHSComponent(OB);
}
- void printLeft(OutputStream &s) const override {
- MemberType->printLeft(s);
- if (MemberType->hasArray(s) || MemberType->hasFunction(s))
- s += "(";
+ void printLeft(OutputBuffer &OB) const override {
+ MemberType->printLeft(OB);
+ if (MemberType->hasArray(OB) || MemberType->hasFunction(OB))
+ OB += "(";
else
- s += " ";
- ClassType->print(s);
- s += "::*";
+ OB += " ";
+ ClassType->print(OB);
+ OB += "::*";
}
- void printRight(OutputStream &s) const override {
- if (MemberType->hasArray(s) || MemberType->hasFunction(s))
- s += ")";
- MemberType->printRight(s);
+ void printRight(OutputBuffer &OB) const override {
+ if (MemberType->hasArray(OB) || MemberType->hasFunction(OB))
+ OB += ")";
+ MemberType->printRight(OB);
}
};
@@ -624,19 +760,19 @@ public:
template<typename Fn> void match(Fn F) const { F(Base, Dimension); }
- bool hasRHSComponentSlow(OutputStream &) const override { return true; }
- bool hasArraySlow(OutputStream &) const override { return true; }
+ bool hasRHSComponentSlow(OutputBuffer &) const override { return true; }
+ bool hasArraySlow(OutputBuffer &) const override { return true; }
- void printLeft(OutputStream &S) const override { Base->printLeft(S); }
+ void printLeft(OutputBuffer &OB) const override { Base->printLeft(OB); }
- void printRight(OutputStream &S) const override {
- if (S.back() != ']')
- S += " ";
- S += "[";
+ void printRight(OutputBuffer &OB) const override {
+ if (OB.back() != ']')
+ OB += " ";
+ OB += "[";
if (Dimension)
- Dimension->print(S);
- S += "]";
- Base->printRight(S);
+ Dimension->print(OB);
+ OB += "]";
+ Base->printRight(OB);
}
};
@@ -660,8 +796,8 @@ public:
F(Ret, Params, CVQuals, RefQual, ExceptionSpec);
}
- bool hasRHSComponentSlow(OutputStream &) const override { return true; }
- bool hasFunctionSlow(OutputStream &) const override { return true; }
+ bool hasRHSComponentSlow(OutputBuffer &) const override { return true; }
+ bool hasFunctionSlow(OutputBuffer &) const override { return true; }
// Handle C++'s ... quirky decl grammar by using the left & right
// distinction. Consider:
@@ -670,32 +806,32 @@ public:
// that takes a char and returns an int. If we're trying to print f, start
// by printing out the return types's left, then print our parameters, then
// finally print right of the return type.
- void printLeft(OutputStream &S) const override {
- Ret->printLeft(S);
- S += " ";
+ void printLeft(OutputBuffer &OB) const override {
+ Ret->printLeft(OB);
+ OB += " ";
}
- void printRight(OutputStream &S) const override {
- S += "(";
- Params.printWithComma(S);
- S += ")";
- Ret->printRight(S);
+ void printRight(OutputBuffer &OB) const override {
+ OB += "(";
+ Params.printWithComma(OB);
+ OB += ")";
+ Ret->printRight(OB);
if (CVQuals & QualConst)
- S += " const";
+ OB += " const";
if (CVQuals & QualVolatile)
- S += " volatile";
+ OB += " volatile";
if (CVQuals & QualRestrict)
- S += " restrict";
+ OB += " restrict";
if (RefQual == FrefQualLValue)
- S += " &";
+ OB += " &";
else if (RefQual == FrefQualRValue)
- S += " &&";
+ OB += " &&";
if (ExceptionSpec != nullptr) {
- S += ' ';
- ExceptionSpec->print(S);
+ OB += ' ';
+ ExceptionSpec->print(OB);
}
}
};
@@ -707,10 +843,10 @@ public:
template<typename Fn> void match(Fn F) const { F(E); }
- void printLeft(OutputStream &S) const override {
- S += "noexcept(";
- E->print(S);
- S += ")";
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "noexcept(";
+ E->print(OB);
+ OB += ")";
}
};
@@ -722,10 +858,10 @@ public:
template<typename Fn> void match(Fn F) const { F(Types); }
- void printLeft(OutputStream &S) const override {
- S += "throw(";
- Types.printWithComma(S);
- S += ')';
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "throw(";
+ Types.printWithComma(OB);
+ OB += ')';
}
};
@@ -756,41 +892,41 @@ public:
NodeArray getParams() const { return Params; }
const Node *getReturnType() const { return Ret; }
- bool hasRHSComponentSlow(OutputStream &) const override { return true; }
- bool hasFunctionSlow(OutputStream &) const override { return true; }
+ bool hasRHSComponentSlow(OutputBuffer &) const override { return true; }
+ bool hasFunctionSlow(OutputBuffer &) const override { return true; }
const Node *getName() const { return Name; }
- void printLeft(OutputStream &S) const override {
+ void printLeft(OutputBuffer &OB) const override {
if (Ret) {
- Ret->printLeft(S);
- if (!Ret->hasRHSComponent(S))
- S += " ";
+ Ret->printLeft(OB);
+ if (!Ret->hasRHSComponent(OB))
+ OB += " ";
}
- Name->print(S);
+ Name->print(OB);
}
- void printRight(OutputStream &S) const override {
- S += "(";
- Params.printWithComma(S);
- S += ")";
+ void printRight(OutputBuffer &OB) const override {
+ OB += "(";
+ Params.printWithComma(OB);
+ OB += ")";
if (Ret)
- Ret->printRight(S);
+ Ret->printRight(OB);
if (CVQuals & QualConst)
- S += " const";
+ OB += " const";
if (CVQuals & QualVolatile)
- S += " volatile";
+ OB += " volatile";
if (CVQuals & QualRestrict)
- S += " restrict";
+ OB += " restrict";
if (RefQual == FrefQualLValue)
- S += " &";
+ OB += " &";
else if (RefQual == FrefQualRValue)
- S += " &&";
+ OB += " &&";
if (Attrs != nullptr)
- Attrs->print(S);
+ Attrs->print(OB);
}
};
@@ -803,9 +939,9 @@ public:
template<typename Fn> void match(Fn F) const { F(OpName); }
- void printLeft(OutputStream &S) const override {
- S += "operator\"\" ";
- OpName->print(S);
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "operator\"\" ";
+ OpName->print(OB);
}
};
@@ -819,9 +955,9 @@ public:
template<typename Fn> void match(Fn F) const { F(Special, Child); }
- void printLeft(OutputStream &S) const override {
- S += Special;
- Child->print(S);
+ void printLeft(OutputBuffer &OB) const override {
+ OB += Special;
+ Child->print(OB);
}
};
@@ -836,11 +972,11 @@ public:
template<typename Fn> void match(Fn F) const { F(FirstType, SecondType); }
- void printLeft(OutputStream &S) const override {
- S += "construction vtable for ";
- FirstType->print(S);
- S += "-in-";
- SecondType->print(S);
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "construction vtable for ";
+ FirstType->print(OB);
+ OB += "-in-";
+ SecondType->print(OB);
}
};
@@ -855,10 +991,10 @@ struct NestedName : Node {
StringView getBaseName() const override { return Name->getBaseName(); }
- void printLeft(OutputStream &S) const override {
- Qual->print(S);
- S += "::";
- Name->print(S);
+ void printLeft(OutputBuffer &OB) const override {
+ Qual->print(OB);
+ OB += "::";
+ Name->print(OB);
}
};
@@ -871,10 +1007,10 @@ struct LocalName : Node {
template<typename Fn> void match(Fn F) const { F(Encoding, Entity); }
- void printLeft(OutputStream &S) const override {
- Encoding->print(S);
- S += "::";
- Entity->print(S);
+ void printLeft(OutputBuffer &OB) const override {
+ Encoding->print(OB);
+ OB += "::";
+ Entity->print(OB);
}
};
@@ -891,10 +1027,10 @@ public:
StringView getBaseName() const override { return Name->getBaseName(); }
- void printLeft(OutputStream &S) const override {
- Qualifier->print(S);
- S += "::";
- Name->print(S);
+ void printLeft(OutputBuffer &OB) const override {
+ Qualifier->print(OB);
+ OB += "::";
+ Name->print(OB);
}
};
@@ -909,12 +1045,12 @@ public:
template<typename Fn> void match(Fn F) const { F(BaseType, Dimension); }
- void printLeft(OutputStream &S) const override {
- BaseType->print(S);
- S += " vector[";
+ void printLeft(OutputBuffer &OB) const override {
+ BaseType->print(OB);
+ OB += " vector[";
if (Dimension)
- Dimension->print(S);
- S += "]";
+ Dimension->print(OB);
+ OB += "]";
}
};
@@ -927,11 +1063,26 @@ public:
template<typename Fn> void match(Fn F) const { F(Dimension); }
- void printLeft(OutputStream &S) const override {
+ void printLeft(OutputBuffer &OB) const override {
// FIXME: This should demangle as "vector pixel".
- S += "pixel vector[";
- Dimension->print(S);
- S += "]";
+ OB += "pixel vector[";
+ Dimension->print(OB);
+ OB += "]";
+ }
+};
+
+class BinaryFPType final : public Node {
+ const Node *Dimension;
+
+public:
+ BinaryFPType(const Node *Dimension_)
+ : Node(KBinaryFPType), Dimension(Dimension_) {}
+
+ template<typename Fn> void match(Fn F) const { F(Dimension); }
+
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "_Float";
+ Dimension->print(OB);
}
};
@@ -953,20 +1104,20 @@ public:
template<typename Fn> void match(Fn F) const { F(Kind, Index); }
- void printLeft(OutputStream &S) const override {
+ void printLeft(OutputBuffer &OB) const override {
switch (Kind) {
case TemplateParamKind::Type:
- S += "$T";
+ OB += "$T";
break;
case TemplateParamKind::NonType:
- S += "$N";
+ OB += "$N";
break;
case TemplateParamKind::Template:
- S += "$TT";
+ OB += "$TT";
break;
}
if (Index > 0)
- S << Index - 1;
+ OB << Index - 1;
}
};
@@ -980,13 +1131,9 @@ public:
template<typename Fn> void match(Fn F) const { F(Name); }
- void printLeft(OutputStream &S) const override {
- S += "typename ";
- }
+ void printLeft(OutputBuffer &OB) const override { OB += "typename "; }
- void printRight(OutputStream &S) const override {
- Name->print(S);
- }
+ void printRight(OutputBuffer &OB) const override { Name->print(OB); }
};
/// A non-type template parameter declaration, 'int N'.
@@ -1000,15 +1147,15 @@ public:
template<typename Fn> void match(Fn F) const { F(Name, Type); }
- void printLeft(OutputStream &S) const override {
- Type->printLeft(S);
- if (!Type->hasRHSComponent(S))
- S += " ";
+ void printLeft(OutputBuffer &OB) const override {
+ Type->printLeft(OB);
+ if (!Type->hasRHSComponent(OB))
+ OB += " ";
}
- void printRight(OutputStream &S) const override {
- Name->print(S);
- Type->printRight(S);
+ void printRight(OutputBuffer &OB) const override {
+ Name->print(OB);
+ Type->printRight(OB);
}
};
@@ -1025,15 +1172,13 @@ public:
template<typename Fn> void match(Fn F) const { F(Name, Params); }
- void printLeft(OutputStream &S) const override {
- S += "template<";
- Params.printWithComma(S);
- S += "> typename ";
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "template<";
+ Params.printWithComma(OB);
+ OB += "> typename ";
}
- void printRight(OutputStream &S) const override {
- Name->print(S);
- }
+ void printRight(OutputBuffer &OB) const override { Name->print(OB); }
};
/// A template parameter pack declaration, 'typename ...T'.
@@ -1046,14 +1191,12 @@ public:
template<typename Fn> void match(Fn F) const { F(Param); }
- void printLeft(OutputStream &S) const override {
- Param->printLeft(S);
- S += "...";
+ void printLeft(OutputBuffer &OB) const override {
+ Param->printLeft(OB);
+ OB += "...";
}
- void printRight(OutputStream &S) const override {
- Param->printRight(S);
- }
+ void printRight(OutputBuffer &OB) const override { Param->printRight(OB); }
};
/// An unexpanded parameter pack (either in the expression or type context). If
@@ -1067,11 +1210,11 @@ public:
class ParameterPack final : public Node {
NodeArray Data;
- // Setup OutputStream for a pack expansion unless we're already expanding one.
- void initializePackExpansion(OutputStream &S) const {
- if (S.CurrentPackMax == std::numeric_limits<unsigned>::max()) {
- S.CurrentPackMax = static_cast<unsigned>(Data.size());
- S.CurrentPackIndex = 0;
+ // Setup OutputBuffer for a pack expansion unless we're already expanding one.
+ void initializePackExpansion(OutputBuffer &OB) const {
+ if (OB.CurrentPackMax == std::numeric_limits<unsigned>::max()) {
+ OB.CurrentPackMax = static_cast<unsigned>(Data.size());
+ OB.CurrentPackIndex = 0;
}
}
@@ -1094,38 +1237,38 @@ public:
template<typename Fn> void match(Fn F) const { F(Data); }
- bool hasRHSComponentSlow(OutputStream &S) const override {
- initializePackExpansion(S);
- size_t Idx = S.CurrentPackIndex;
- return Idx < Data.size() && Data[Idx]->hasRHSComponent(S);
+ bool hasRHSComponentSlow(OutputBuffer &OB) const override {
+ initializePackExpansion(OB);
+ size_t Idx = OB.CurrentPackIndex;
+ return Idx < Data.size() && Data[Idx]->hasRHSComponent(OB);
}
- bool hasArraySlow(OutputStream &S) const override {
- initializePackExpansion(S);
- size_t Idx = S.CurrentPackIndex;
- return Idx < Data.size() && Data[Idx]->hasArray(S);
+ bool hasArraySlow(OutputBuffer &OB) const override {
+ initializePackExpansion(OB);
+ size_t Idx = OB.CurrentPackIndex;
+ return Idx < Data.size() && Data[Idx]->hasArray(OB);
}
- bool hasFunctionSlow(OutputStream &S) const override {
- initializePackExpansion(S);
- size_t Idx = S.CurrentPackIndex;
- return Idx < Data.size() && Data[Idx]->hasFunction(S);
+ bool hasFunctionSlow(OutputBuffer &OB) const override {
+ initializePackExpansion(OB);
+ size_t Idx = OB.CurrentPackIndex;
+ return Idx < Data.size() && Data[Idx]->hasFunction(OB);
}
- const Node *getSyntaxNode(OutputStream &S) const override {
- initializePackExpansion(S);
- size_t Idx = S.CurrentPackIndex;
- return Idx < Data.size() ? Data[Idx]->getSyntaxNode(S) : this;
+ const Node *getSyntaxNode(OutputBuffer &OB) const override {
+ initializePackExpansion(OB);
+ size_t Idx = OB.CurrentPackIndex;
+ return Idx < Data.size() ? Data[Idx]->getSyntaxNode(OB) : this;
}
- void printLeft(OutputStream &S) const override {
- initializePackExpansion(S);
- size_t Idx = S.CurrentPackIndex;
+ void printLeft(OutputBuffer &OB) const override {
+ initializePackExpansion(OB);
+ size_t Idx = OB.CurrentPackIndex;
if (Idx < Data.size())
- Data[Idx]->printLeft(S);
+ Data[Idx]->printLeft(OB);
}
- void printRight(OutputStream &S) const override {
- initializePackExpansion(S);
- size_t Idx = S.CurrentPackIndex;
+ void printRight(OutputBuffer &OB) const override {
+ initializePackExpansion(OB);
+ size_t Idx = OB.CurrentPackIndex;
if (Idx < Data.size())
- Data[Idx]->printRight(S);
+ Data[Idx]->printRight(OB);
}
};
@@ -1144,8 +1287,8 @@ public:
NodeArray getElements() const { return Elements; }
- void printLeft(OutputStream &S) const override {
- Elements.printWithComma(S);
+ void printLeft(OutputBuffer &OB) const override {
+ Elements.printWithComma(OB);
}
};
@@ -1162,35 +1305,35 @@ public:
const Node *getChild() const { return Child; }
- void printLeft(OutputStream &S) const override {
+ void printLeft(OutputBuffer &OB) const override {
constexpr unsigned Max = std::numeric_limits<unsigned>::max();
- SwapAndRestore<unsigned> SavePackIdx(S.CurrentPackIndex, Max);
- SwapAndRestore<unsigned> SavePackMax(S.CurrentPackMax, Max);
- size_t StreamPos = S.getCurrentPosition();
+ SwapAndRestore<unsigned> SavePackIdx(OB.CurrentPackIndex, Max);
+ SwapAndRestore<unsigned> SavePackMax(OB.CurrentPackMax, Max);
+ size_t StreamPos = OB.getCurrentPosition();
// Print the first element in the pack. If Child contains a ParameterPack,
// it will set up OB.CurrentPackMax and print the first element.
- Child->print(S);
+ Child->print(OB);
// No ParameterPack was found in Child. This can occur if we've found a pack
// expansion on a <function-param>.
- if (S.CurrentPackMax == Max) {
- S += "...";
+ if (OB.CurrentPackMax == Max) {
+ OB += "...";
return;
}
// We found a ParameterPack, but it has no elements. Erase whatever we may
// have printed.
- if (S.CurrentPackMax == 0) {
- S.setCurrentPosition(StreamPos);
+ if (OB.CurrentPackMax == 0) {
+ OB.setCurrentPosition(StreamPos);
return;
}
// Else, iterate through the rest of the elements in the pack.
- for (unsigned I = 1, E = S.CurrentPackMax; I < E; ++I) {
- S += ", ";
- S.CurrentPackIndex = I;
- Child->print(S);
+ for (unsigned I = 1, E = OB.CurrentPackMax; I < E; ++I) {
+ OB += ", ";
+ OB.CurrentPackIndex = I;
+ Child->print(OB);
}
}
};
@@ -1205,12 +1348,12 @@ public:
NodeArray getParams() { return Params; }
- void printLeft(OutputStream &S) const override {
- S += "<";
- Params.printWithComma(S);
- if (S.back() == '>')
- S += " ";
- S += ">";
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "<";
+ Params.printWithComma(OB);
+ if (OB.back() == '>')
+ OB += " ";
+ OB += ">";
}
};
@@ -1252,42 +1395,42 @@ struct ForwardTemplateReference : Node {
// special handling.
template<typename Fn> void match(Fn F) const = delete;
- bool hasRHSComponentSlow(OutputStream &S) const override {
+ bool hasRHSComponentSlow(OutputBuffer &OB) const override {
if (Printing)
return false;
SwapAndRestore<bool> SavePrinting(Printing, true);
- return Ref->hasRHSComponent(S);
+ return Ref->hasRHSComponent(OB);
}
- bool hasArraySlow(OutputStream &S) const override {
+ bool hasArraySlow(OutputBuffer &OB) const override {
if (Printing)
return false;
SwapAndRestore<bool> SavePrinting(Printing, true);
- return Ref->hasArray(S);
+ return Ref->hasArray(OB);
}
- bool hasFunctionSlow(OutputStream &S) const override {
+ bool hasFunctionSlow(OutputBuffer &OB) const override {
if (Printing)
return false;
SwapAndRestore<bool> SavePrinting(Printing, true);
- return Ref->hasFunction(S);
+ return Ref->hasFunction(OB);
}
- const Node *getSyntaxNode(OutputStream &S) const override {
+ const Node *getSyntaxNode(OutputBuffer &OB) const override {
if (Printing)
return this;
SwapAndRestore<bool> SavePrinting(Printing, true);
- return Ref->getSyntaxNode(S);
+ return Ref->getSyntaxNode(OB);
}
- void printLeft(OutputStream &S) const override {
+ void printLeft(OutputBuffer &OB) const override {
if (Printing)
return;
SwapAndRestore<bool> SavePrinting(Printing, true);
- Ref->printLeft(S);
+ Ref->printLeft(OB);
}
- void printRight(OutputStream &S) const override {
+ void printRight(OutputBuffer &OB) const override {
if (Printing)
return;
SwapAndRestore<bool> SavePrinting(Printing, true);
- Ref->printRight(S);
+ Ref->printRight(OB);
}
};
@@ -1303,9 +1446,9 @@ struct NameWithTemplateArgs : Node {
StringView getBaseName() const override { return Name->getBaseName(); }
- void printLeft(OutputStream &S) const override {
- Name->print(S);
- TemplateArgs->print(S);
+ void printLeft(OutputBuffer &OB) const override {
+ Name->print(OB);
+ TemplateArgs->print(OB);
}
};
@@ -1320,9 +1463,9 @@ public:
StringView getBaseName() const override { return Child->getBaseName(); }
- void printLeft(OutputStream &S) const override {
- S += "::";
- Child->print(S);
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "::";
+ Child->print(OB);
}
};
@@ -1335,9 +1478,9 @@ struct StdQualifiedName : Node {
StringView getBaseName() const override { return Child->getBaseName(); }
- void printLeft(OutputStream &S) const override {
- S += "std::";
- Child->print(S);
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "std::";
+ Child->print(OB);
}
};
@@ -1377,26 +1520,26 @@ public:
DEMANGLE_UNREACHABLE;
}
- void printLeft(OutputStream &S) const override {
+ void printLeft(OutputBuffer &OB) const override {
switch (SSK) {
case SpecialSubKind::allocator:
- S += "std::allocator";
+ OB += "std::allocator";
break;
case SpecialSubKind::basic_string:
- S += "std::basic_string";
+ OB += "std::basic_string";
break;
case SpecialSubKind::string:
- S += "std::basic_string<char, std::char_traits<char>, "
- "std::allocator<char> >";
+ OB += "std::basic_string<char, std::char_traits<char>, "
+ "std::allocator<char> >";
break;
case SpecialSubKind::istream:
- S += "std::basic_istream<char, std::char_traits<char> >";
+ OB += "std::basic_istream<char, std::char_traits<char> >";
break;
case SpecialSubKind::ostream:
- S += "std::basic_ostream<char, std::char_traits<char> >";
+ OB += "std::basic_ostream<char, std::char_traits<char> >";
break;
case SpecialSubKind::iostream:
- S += "std::basic_iostream<char, std::char_traits<char> >";
+ OB += "std::basic_iostream<char, std::char_traits<char> >";
break;
}
}
@@ -1429,25 +1572,25 @@ public:
DEMANGLE_UNREACHABLE;
}
- void printLeft(OutputStream &S) const override {
+ void printLeft(OutputBuffer &OB) const override {
switch (SSK) {
case SpecialSubKind::allocator:
- S += "std::allocator";
+ OB += "std::allocator";
break;
case SpecialSubKind::basic_string:
- S += "std::basic_string";
+ OB += "std::basic_string";
break;
case SpecialSubKind::string:
- S += "std::string";
+ OB += "std::string";
break;
case SpecialSubKind::istream:
- S += "std::istream";
+ OB += "std::istream";
break;
case SpecialSubKind::ostream:
- S += "std::ostream";
+ OB += "std::ostream";
break;
case SpecialSubKind::iostream:
- S += "std::iostream";
+ OB += "std::iostream";
break;
}
}
@@ -1465,10 +1608,10 @@ public:
template<typename Fn> void match(Fn F) const { F(Basename, IsDtor, Variant); }
- void printLeft(OutputStream &S) const override {
+ void printLeft(OutputBuffer &OB) const override {
if (IsDtor)
- S += "~";
- S += Basename->getBaseName();
+ OB += "~";
+ OB += Basename->getBaseName();
}
};
@@ -1480,9 +1623,9 @@ public:
template<typename Fn> void match(Fn F) const { F(Base); }
- void printLeft(OutputStream &S) const override {
- S += "~";
- Base->printLeft(S);
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "~";
+ Base->printLeft(OB);
}
};
@@ -1494,10 +1637,10 @@ public:
template<typename Fn> void match(Fn F) const { F(Count); }
- void printLeft(OutputStream &S) const override {
- S += "'unnamed";
- S += Count;
- S += "\'";
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "'unnamed";
+ OB += Count;
+ OB += "\'";
}
};
@@ -1516,22 +1659,22 @@ public:
F(TemplateParams, Params, Count);
}
- void printDeclarator(OutputStream &S) const {
+ void printDeclarator(OutputBuffer &OB) const {
if (!TemplateParams.empty()) {
- S += "<";
- TemplateParams.printWithComma(S);
- S += ">";
+ OB += "<";
+ TemplateParams.printWithComma(OB);
+ OB += ">";
}
- S += "(";
- Params.printWithComma(S);
- S += ")";
+ OB += "(";
+ Params.printWithComma(OB);
+ OB += ")";
}
- void printLeft(OutputStream &S) const override {
- S += "\'lambda";
- S += Count;
- S += "\'";
- printDeclarator(S);
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "\'lambda";
+ OB += Count;
+ OB += "\'";
+ printDeclarator(OB);
}
};
@@ -1543,10 +1686,10 @@ public:
template<typename Fn> void match(Fn F) const { F(Bindings); }
- void printLeft(OutputStream &S) const override {
- S += '[';
- Bindings.printWithComma(S);
- S += ']';
+ void printLeft(OutputBuffer &OB) const override {
+ OB += '[';
+ Bindings.printWithComma(OB);
+ OB += ']';
}
};
@@ -1564,22 +1707,22 @@ public:
template<typename Fn> void match(Fn F) const { F(LHS, InfixOperator, RHS); }
- void printLeft(OutputStream &S) const override {
+ void printLeft(OutputBuffer &OB) const override {
// If the operator is '>', this might be a template argument expression, so
// we need to disambiguate with parens.
if (InfixOperator == ">")
- S += "(";
+ OB += "(";
- S += "(";
- LHS->print(S);
- S += ") ";
- S += InfixOperator;
- S += " (";
- RHS->print(S);
- S += ")";
+ OB += "(";
+ LHS->print(OB);
+ OB += ") ";
+ OB += InfixOperator;
+ OB += " (";
+ RHS->print(OB);
+ OB += ")";
if (InfixOperator == ">")
- S += ")";
+ OB += ")";
}
};
@@ -1593,12 +1736,12 @@ public:
template<typename Fn> void match(Fn F) const { F(Op1, Op2); }
- void printLeft(OutputStream &S) const override {
- S += "(";
- Op1->print(S);
- S += ")[";
- Op2->print(S);
- S += "]";
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "(";
+ Op1->print(OB);
+ OB += ")[";
+ Op2->print(OB);
+ OB += "]";
}
};
@@ -1612,11 +1755,11 @@ public:
template<typename Fn> void match(Fn F) const { F(Child, Operator); }
- void printLeft(OutputStream &S) const override {
- S += "(";
- Child->print(S);
- S += ")";
- S += Operator;
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "(";
+ Child->print(OB);
+ OB += ")";
+ OB += Operator;
}
};
@@ -1631,14 +1774,14 @@ public:
template<typename Fn> void match(Fn F) const { F(Cond, Then, Else); }
- void printLeft(OutputStream &S) const override {
- S += "(";
- Cond->print(S);
- S += ") ? (";
- Then->print(S);
- S += ") : (";
- Else->print(S);
- S += ")";
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "(";
+ Cond->print(OB);
+ OB += ") ? (";
+ Then->print(OB);
+ OB += ") : (";
+ Else->print(OB);
+ OB += ")";
}
};
@@ -1653,10 +1796,10 @@ public:
template<typename Fn> void match(Fn F) const { F(LHS, Kind, RHS); }
- void printLeft(OutputStream &S) const override {
- LHS->print(S);
- S += Kind;
- RHS->print(S);
+ void printLeft(OutputBuffer &OB) const override {
+ LHS->print(OB);
+ OB += Kind;
+ RHS->print(OB);
}
};
@@ -1677,20 +1820,20 @@ public:
F(Type, SubExpr, Offset, UnionSelectors, OnePastTheEnd);
}
- void printLeft(OutputStream &S) const override {
- SubExpr->print(S);
- S += ".<";
- Type->print(S);
- S += " at offset ";
+ void printLeft(OutputBuffer &OB) const override {
+ SubExpr->print(OB);
+ OB += ".<";
+ Type->print(OB);
+ OB += " at offset ";
if (Offset.empty()) {
- S += "0";
+ OB += "0";
} else if (Offset[0] == 'n') {
- S += "-";
- S += Offset.dropFront();
+ OB += "-";
+ OB += Offset.dropFront();
} else {
- S += Offset;
+ OB += Offset;
}
- S += ">";
+ OB += ">";
}
};
@@ -1706,10 +1849,10 @@ public:
template<typename Fn> void match(Fn F) const { F(Prefix, Infix, Postfix); }
- void printLeft(OutputStream &S) const override {
- S += Prefix;
- Infix->print(S);
- S += Postfix;
+ void printLeft(OutputBuffer &OB) const override {
+ OB += Prefix;
+ Infix->print(OB);
+ OB += Postfix;
}
};
@@ -1725,13 +1868,13 @@ public:
template<typename Fn> void match(Fn F) const { F(CastKind, To, From); }
- void printLeft(OutputStream &S) const override {
- S += CastKind;
- S += "<";
- To->printLeft(S);
- S += ">(";
- From->printLeft(S);
- S += ")";
+ void printLeft(OutputBuffer &OB) const override {
+ OB += CastKind;
+ OB += "<";
+ To->printLeft(OB);
+ OB += ">(";
+ From->printLeft(OB);
+ OB += ")";
}
};
@@ -1744,11 +1887,11 @@ public:
template<typename Fn> void match(Fn F) const { F(Pack); }
- void printLeft(OutputStream &S) const override {
- S += "sizeof...(";
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "sizeof...(";
ParameterPackExpansion PPE(Pack);
- PPE.printLeft(S);
- S += ")";
+ PPE.printLeft(OB);
+ OB += ")";
}
};
@@ -1762,11 +1905,11 @@ public:
template<typename Fn> void match(Fn F) const { F(Callee, Args); }
- void printLeft(OutputStream &S) const override {
- Callee->print(S);
- S += "(";
- Args.printWithComma(S);
- S += ")";
+ void printLeft(OutputBuffer &OB) const override {
+ Callee->print(OB);
+ OB += "(";
+ Args.printWithComma(OB);
+ OB += ")";
}
};
@@ -1787,25 +1930,24 @@ public:
F(ExprList, Type, InitList, IsGlobal, IsArray);
}
- void printLeft(OutputStream &S) const override {
+ void printLeft(OutputBuffer &OB) const override {
if (IsGlobal)
- S += "::operator ";
- S += "new";
+ OB += "::operator ";
+ OB += "new";
if (IsArray)
- S += "[]";
- S += ' ';
+ OB += "[]";
+ OB += ' ';
if (!ExprList.empty()) {
- S += "(";
- ExprList.printWithComma(S);
- S += ")";
+ OB += "(";
+ ExprList.printWithComma(OB);
+ OB += ")";
}
- Type->print(S);
+ Type->print(OB);
if (!InitList.empty()) {
- S += "(";
- InitList.printWithComma(S);
- S += ")";
+ OB += "(";
+ InitList.printWithComma(OB);
+ OB += ")";
}
-
}
};
@@ -1820,13 +1962,13 @@ public:
template<typename Fn> void match(Fn F) const { F(Op, IsGlobal, IsArray); }
- void printLeft(OutputStream &S) const override {
+ void printLeft(OutputBuffer &OB) const override {
if (IsGlobal)
- S += "::";
- S += "delete";
+ OB += "::";
+ OB += "delete";
if (IsArray)
- S += "[] ";
- Op->print(S);
+ OB += "[] ";
+ Op->print(OB);
}
};
@@ -1840,11 +1982,11 @@ public:
template<typename Fn> void match(Fn F) const { F(Prefix, Child); }
- void printLeft(OutputStream &S) const override {
- S += Prefix;
- S += "(";
- Child->print(S);
- S += ")";
+ void printLeft(OutputBuffer &OB) const override {
+ OB += Prefix;
+ OB += "(";
+ Child->print(OB);
+ OB += ")";
}
};
@@ -1856,9 +1998,9 @@ public:
template<typename Fn> void match(Fn F) const { F(Number); }
- void printLeft(OutputStream &S) const override {
- S += "fp";
- S += Number;
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "fp";
+ OB += Number;
}
};
@@ -1872,12 +2014,12 @@ public:
template<typename Fn> void match(Fn F) const { F(Type, Expressions); }
- void printLeft(OutputStream &S) const override {
- S += "(";
- Type->print(S);
- S += ")(";
- Expressions.printWithComma(S);
- S += ")";
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "(";
+ Type->print(OB);
+ OB += ")(";
+ Expressions.printWithComma(OB);
+ OB += ")";
}
};
@@ -1894,12 +2036,12 @@ public:
template<typename Fn> void match(Fn F) const { F(Type, SubExpr, Offset); }
- void printLeft(OutputStream &S) const override {
- S += "(";
- Type->print(S);
- S += ")(";
- SubExpr->print(S);
- S += ")";
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "(";
+ Type->print(OB);
+ OB += ")(";
+ SubExpr->print(OB);
+ OB += ")";
}
};
@@ -1912,12 +2054,12 @@ public:
template<typename Fn> void match(Fn F) const { F(Ty, Inits); }
- void printLeft(OutputStream &S) const override {
+ void printLeft(OutputBuffer &OB) const override {
if (Ty)
- Ty->print(S);
- S += '{';
- Inits.printWithComma(S);
- S += '}';
+ Ty->print(OB);
+ OB += '{';
+ Inits.printWithComma(OB);
+ OB += '}';
}
};
@@ -1931,18 +2073,18 @@ public:
template<typename Fn> void match(Fn F) const { F(Elem, Init, IsArray); }
- void printLeft(OutputStream &S) const override {
+ void printLeft(OutputBuffer &OB) const override {
if (IsArray) {
- S += '[';
- Elem->print(S);
- S += ']';
+ OB += '[';
+ Elem->print(OB);
+ OB += ']';
} else {
- S += '.';
- Elem->print(S);
+ OB += '.';
+ Elem->print(OB);
}
if (Init->getKind() != KBracedExpr && Init->getKind() != KBracedRangeExpr)
- S += " = ";
- Init->print(S);
+ OB += " = ";
+ Init->print(OB);
}
};
@@ -1956,15 +2098,15 @@ public:
template<typename Fn> void match(Fn F) const { F(First, Last, Init); }
- void printLeft(OutputStream &S) const override {
- S += '[';
- First->print(S);
- S += " ... ";
- Last->print(S);
- S += ']';
+ void printLeft(OutputBuffer &OB) const override {
+ OB += '[';
+ First->print(OB);
+ OB += " ... ";
+ Last->print(OB);
+ OB += ']';
if (Init->getKind() != KBracedExpr && Init->getKind() != KBracedRangeExpr)
- S += " = ";
- Init->print(S);
+ OB += " = ";
+ Init->print(OB);
}
};
@@ -1983,43 +2125,43 @@ public:
F(IsLeftFold, OperatorName, Pack, Init);
}
- void printLeft(OutputStream &S) const override {
+ void printLeft(OutputBuffer &OB) const override {
auto PrintPack = [&] {
- S += '(';
- ParameterPackExpansion(Pack).print(S);
- S += ')';
+ OB += '(';
+ ParameterPackExpansion(Pack).print(OB);
+ OB += ')';
};
- S += '(';
+ OB += '(';
if (IsLeftFold) {
// init op ... op pack
if (Init != nullptr) {
- Init->print(S);
- S += ' ';
- S += OperatorName;
- S += ' ';
+ Init->print(OB);
+ OB += ' ';
+ OB += OperatorName;
+ OB += ' ';
}
// ... op pack
- S += "... ";
- S += OperatorName;
- S += ' ';
+ OB += "... ";
+ OB += OperatorName;
+ OB += ' ';
PrintPack();
} else { // !IsLeftFold
// pack op ...
PrintPack();
- S += ' ';
- S += OperatorName;
- S += " ...";
+ OB += ' ';
+ OB += OperatorName;
+ OB += " ...";
// pack op ... op init
if (Init != nullptr) {
- S += ' ';
- S += OperatorName;
- S += ' ';
- Init->print(S);
+ OB += ' ';
+ OB += OperatorName;
+ OB += ' ';
+ Init->print(OB);
}
}
- S += ')';
+ OB += ')';
}
};
@@ -2031,9 +2173,9 @@ public:
template<typename Fn> void match(Fn F) const { F(Op); }
- void printLeft(OutputStream &S) const override {
- S += "throw ";
- Op->print(S);
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "throw ";
+ Op->print(OB);
}
};
@@ -2045,8 +2187,8 @@ public:
template<typename Fn> void match(Fn F) const { F(Value); }
- void printLeft(OutputStream &S) const override {
- S += Value ? StringView("true") : StringView("false");
+ void printLeft(OutputBuffer &OB) const override {
+ OB += Value ? StringView("true") : StringView("false");
}
};
@@ -2058,10 +2200,10 @@ public:
template<typename Fn> void match(Fn F) const { F(Type); }
- void printLeft(OutputStream &S) const override {
- S += "\"<";
- Type->print(S);
- S += ">\"";
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "\"<";
+ Type->print(OB);
+ OB += ">\"";
}
};
@@ -2073,11 +2215,11 @@ public:
template<typename Fn> void match(Fn F) const { F(Type); }
- void printLeft(OutputStream &S) const override {
- S += "[]";
+ void printLeft(OutputBuffer &OB) const override {
+ OB += "[]";
if (Type->getKind() == KClosureTypeName)
- static_cast<const ClosureTypeName *>(Type)->printDeclarator(S);
- S += "{...}";
+ static_cast<const ClosureTypeName *>(Type)->printDeclarator(OB);
+ OB += "{...}";
}
};
@@ -2092,15 +2234,15 @@ public:
template<typename Fn> void match(Fn F) const { F(Ty, Integer); }
- void printLeft(OutputStream &S) const override {
- S << "(";
- Ty->print(S);
- S << ")";
+ void printLeft(OutputBuffer &OB) const override {
+ OB << "(";
+ Ty->print(OB);
+ OB << ")";
if (Integer[0] == 'n')
- S << "-" << Integer.dropFront(1);
+ OB << "-" << Integer.dropFront(1);
else
- S << Integer;
+ OB << Integer;
}
};
@@ -2114,21 +2256,21 @@ public:
template<typename Fn> void match(Fn F) const { F(Type, Value); }
- void printLeft(OutputStream &S) const override {
+ void printLeft(OutputBuffer &OB) const override {
if (Type.size() > 3) {
- S += "(";
- S += Type;
- S += ")";
+ OB += "(";
+ OB += Type;
+ OB += ")";
}
if (Value[0] == 'n') {
- S += "-";
- S += Value.dropFront(1);
+ OB += "-";
+ OB += Value.dropFront(1);
} else
- S += Value;
+ OB += Value;
if (Type.size() <= 3)
- S += Type;
+ OB += Type;
}
};
@@ -2158,7 +2300,7 @@ public:
template<typename Fn> void match(Fn F) const { F(Contents); }
- void printLeft(OutputStream &s) const override {
+ void printLeft(OutputBuffer &OB) const override {
const char *first = Contents.begin();
const char *last = Contents.end() + 1;
@@ -2184,7 +2326,7 @@ public:
#endif
char num[FloatData<Float>::max_demangled_size] = {0};
int n = snprintf(num, sizeof(num), FloatData<Float>::spec, value);
- s += StringView(num, num + n);
+ OB += StringView(num, num + n);
}
}
};
@@ -2217,125 +2359,6 @@ FOR_EACH_NODE_KIND(SPECIALIZATION)
#undef FOR_EACH_NODE_KIND
-template <class T, size_t N>
-class PODSmallVector {
- static_assert(std::is_pod<T>::value,
- "T is required to be a plain old data type");
-
- T* First = nullptr;
- T* Last = nullptr;
- T* Cap = nullptr;
- T Inline[N] = {0};
-
- bool isInline() const { return First == Inline; }
-
- void clearInline() {
- First = Inline;
- Last = Inline;
- Cap = Inline + N;
- }
-
- void reserve(size_t NewCap) {
- size_t S = size();
- if (isInline()) {
- auto* Tmp = static_cast<T*>(std::malloc(NewCap * sizeof(T)));
- if (Tmp == nullptr)
- std::terminate();
- std::copy(First, Last, Tmp);
- First = Tmp;
- } else {
- First = static_cast<T*>(std::realloc(First, NewCap * sizeof(T)));
- if (First == nullptr)
- std::terminate();
- }
- Last = First + S;
- Cap = First + NewCap;
- }
-
-public:
- PODSmallVector() : First(Inline), Last(First), Cap(Inline + N) {}
-
- PODSmallVector(const PODSmallVector&) = delete;
- PODSmallVector& operator=(const PODSmallVector&) = delete;
-
- PODSmallVector(PODSmallVector&& Other) : PODSmallVector() {
- if (Other.isInline()) {
- std::copy(Other.begin(), Other.end(), First);
- Last = First + Other.size();
- Other.clear();
- return;
- }
-
- First = Other.First;
- Last = Other.Last;
- Cap = Other.Cap;
- Other.clearInline();
- }
-
- PODSmallVector& operator=(PODSmallVector&& Other) {
- if (Other.isInline()) {
- if (!isInline()) {
- std::free(First);
- clearInline();
- }
- std::copy(Other.begin(), Other.end(), First);
- Last = First + Other.size();
- Other.clear();
- return *this;
- }
-
- if (isInline()) {
- First = Other.First;
- Last = Other.Last;
- Cap = Other.Cap;
- Other.clearInline();
- return *this;
- }
-
- std::swap(First, Other.First);
- std::swap(Last, Other.Last);
- std::swap(Cap, Other.Cap);
- Other.clear();
- return *this;
- }
-
- void push_back(const T& Elem) {
- if (Last == Cap)
- reserve(size() * 2);
- *Last++ = Elem;
- }
-
- void pop_back() {
- assert(Last != First && "Popping empty vector!");
- --Last;
- }
-
- void dropBack(size_t Index) {
- assert(Index <= size() && "dropBack() can't expand!");
- Last = First + Index;
- }
-
- T* begin() { return First; }
- T* end() { return Last; }
-
- bool empty() const { return First == Last; }
- size_t size() const { return static_cast<size_t>(Last - First); }
- T& back() {
- assert(Last != First && "Calling back() on empty vector!");
- return *(Last - 1);
- }
- T& operator[](size_t Index) {
- assert(Index < size() && "Invalid access!");
- return *(begin() + Index);
- }
- void clear() { Last = First; }
-
- ~PODSmallVector() {
- if (!isInline())
- std::free(First);
- }
-};
-
template <typename Derived, typename Alloc> struct AbstractManglingParser {
const char *First;
const char *Last;
@@ -3884,6 +3907,16 @@ Node *AbstractManglingParser<Derived, Alloc>::parseType() {
case 'h':
First += 2;
return make<NameType>("half");
+ // ::= DF <number> _ # ISO/IEC TS 18661 binary floating point (N bits)
+ case 'F': {
+ First += 2;
+ Node *DimensionNumber = make<NameType>(parseNumber());
+ if (!DimensionNumber)
+ return nullptr;
+ if (!consumeIf('_'))
+ return nullptr;
+ return make<BinaryFPType>(DimensionNumber);
+ }
// ::= Di # char32_t
case 'i':
First += 2;
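Aside (illustrative, not part of the patch): with this addition, an encoding such as "DF16_" inside a mangled name parses into a BinaryFPType node whose dimension is the NameType "16". How that node is printed is decided by BinaryFPType::printLeft, which lies outside this hunk.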
diff --git a/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h b/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h
index 77446e9b0f07..46daa3885a06 100644
--- a/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h
+++ b/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h
@@ -21,11 +21,11 @@
namespace llvm {
namespace itanium_demangle {
-class OutputStream;
+class OutputBuffer;
}
}
-using llvm::itanium_demangle::OutputStream;
+using llvm::itanium_demangle::OutputBuffer;
using llvm::itanium_demangle::StringView;
namespace llvm {
@@ -80,6 +80,7 @@ enum OutputFlags {
OF_NoAccessSpecifier = 4,
OF_NoMemberType = 8,
OF_NoReturnType = 16,
+ OF_NoVariableType = 32,
};
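Aside (illustrative, not part of the patch): a sketch of how the new flag composes with the existing OutputFlags bits when rendering a node; Sym stands in for a SymbolNode obtained from the Microsoft demangler.

    #include "llvm/Demangle/MicrosoftDemangleNodes.h"
    #include <string>

    using namespace llvm::ms_demangle;

    std::string renderForDiagnostics(SymbolNode *Sym) {
      // Suppress the return type and, with the new bit, the variable type.
      auto Flags = OutputFlags(OF_NoReturnType | OF_NoVariableType);
      return Sym->toString(Flags);
    }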
// Types
@@ -261,7 +262,7 @@ struct Node {
NodeKind kind() const { return Kind; }
- virtual void output(OutputStream &OS, OutputFlags Flags) const = 0;
+ virtual void output(OutputBuffer &OB, OutputFlags Flags) const = 0;
std::string toString(OutputFlags Flags = OF_Default) const;
@@ -300,12 +301,12 @@ struct SpecialTableSymbolNode;
struct TypeNode : public Node {
explicit TypeNode(NodeKind K) : Node(K) {}
- virtual void outputPre(OutputStream &OS, OutputFlags Flags) const = 0;
- virtual void outputPost(OutputStream &OS, OutputFlags Flags) const = 0;
+ virtual void outputPre(OutputBuffer &OB, OutputFlags Flags) const = 0;
+ virtual void outputPost(OutputBuffer &OB, OutputFlags Flags) const = 0;
- void output(OutputStream &OS, OutputFlags Flags) const override {
- outputPre(OS, Flags);
- outputPost(OS, Flags);
+ void output(OutputBuffer &OB, OutputFlags Flags) const override {
+ outputPre(OB, Flags);
+ outputPost(OB, Flags);
}
Qualifiers Quals = Q_None;
@@ -315,8 +316,8 @@ struct PrimitiveTypeNode : public TypeNode {
explicit PrimitiveTypeNode(PrimitiveKind K)
: TypeNode(NodeKind::PrimitiveType), PrimKind(K) {}
- void outputPre(OutputStream &OS, OutputFlags Flags) const override;
- void outputPost(OutputStream &OS, OutputFlags Flags) const override {}
+ void outputPre(OutputBuffer &OB, OutputFlags Flags) const override;
+ void outputPost(OutputBuffer &OB, OutputFlags Flags) const override {}
PrimitiveKind PrimKind;
};
@@ -325,8 +326,8 @@ struct FunctionSignatureNode : public TypeNode {
explicit FunctionSignatureNode(NodeKind K) : TypeNode(K) {}
FunctionSignatureNode() : TypeNode(NodeKind::FunctionSignature) {}
- void outputPre(OutputStream &OS, OutputFlags Flags) const override;
- void outputPost(OutputStream &OS, OutputFlags Flags) const override;
+ void outputPre(OutputBuffer &OB, OutputFlags Flags) const override;
+ void outputPost(OutputBuffer &OB, OutputFlags Flags) const override;
// Valid if this FunctionTypeNode is the Pointee of a PointerType or
// MemberPointerType.
@@ -359,13 +360,13 @@ struct IdentifierNode : public Node {
NodeArrayNode *TemplateParams = nullptr;
protected:
- void outputTemplateParameters(OutputStream &OS, OutputFlags Flags) const;
+ void outputTemplateParameters(OutputBuffer &OB, OutputFlags Flags) const;
};
struct VcallThunkIdentifierNode : public IdentifierNode {
VcallThunkIdentifierNode() : IdentifierNode(NodeKind::VcallThunkIdentifier) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
uint64_t OffsetInVTable = 0;
};
@@ -374,7 +375,7 @@ struct DynamicStructorIdentifierNode : public IdentifierNode {
DynamicStructorIdentifierNode()
: IdentifierNode(NodeKind::DynamicStructorIdentifier) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
VariableSymbolNode *Variable = nullptr;
QualifiedNameNode *Name = nullptr;
@@ -384,7 +385,7 @@ struct DynamicStructorIdentifierNode : public IdentifierNode {
struct NamedIdentifierNode : public IdentifierNode {
NamedIdentifierNode() : IdentifierNode(NodeKind::NamedIdentifier) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
StringView Name;
};
@@ -394,7 +395,7 @@ struct IntrinsicFunctionIdentifierNode : public IdentifierNode {
: IdentifierNode(NodeKind::IntrinsicFunctionIdentifier),
Operator(Operator) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
IntrinsicFunctionKind Operator;
};
@@ -403,7 +404,7 @@ struct LiteralOperatorIdentifierNode : public IdentifierNode {
LiteralOperatorIdentifierNode()
: IdentifierNode(NodeKind::LiteralOperatorIdentifier) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
StringView Name;
};
@@ -412,7 +413,7 @@ struct LocalStaticGuardIdentifierNode : public IdentifierNode {
LocalStaticGuardIdentifierNode()
: IdentifierNode(NodeKind::LocalStaticGuardIdentifier) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
bool IsThread = false;
uint32_t ScopeIndex = 0;
@@ -422,7 +423,7 @@ struct ConversionOperatorIdentifierNode : public IdentifierNode {
ConversionOperatorIdentifierNode()
: IdentifierNode(NodeKind::ConversionOperatorIdentifier) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
// The type that this operator converts to.
TypeNode *TargetType = nullptr;
@@ -434,7 +435,7 @@ struct StructorIdentifierNode : public IdentifierNode {
: IdentifierNode(NodeKind::StructorIdentifier),
IsDestructor(IsDestructor) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
// The name of the class that this is a structor of.
IdentifierNode *Class = nullptr;
@@ -444,8 +445,8 @@ struct StructorIdentifierNode : public IdentifierNode {
struct ThunkSignatureNode : public FunctionSignatureNode {
ThunkSignatureNode() : FunctionSignatureNode(NodeKind::ThunkSignature) {}
- void outputPre(OutputStream &OS, OutputFlags Flags) const override;
- void outputPost(OutputStream &OS, OutputFlags Flags) const override;
+ void outputPre(OutputBuffer &OB, OutputFlags Flags) const override;
+ void outputPost(OutputBuffer &OB, OutputFlags Flags) const override;
struct ThisAdjustor {
uint32_t StaticOffset = 0;
@@ -459,8 +460,8 @@ struct ThunkSignatureNode : public FunctionSignatureNode {
struct PointerTypeNode : public TypeNode {
PointerTypeNode() : TypeNode(NodeKind::PointerType) {}
- void outputPre(OutputStream &OS, OutputFlags Flags) const override;
- void outputPost(OutputStream &OS, OutputFlags Flags) const override;
+ void outputPre(OutputBuffer &OB, OutputFlags Flags) const override;
+ void outputPost(OutputBuffer &OB, OutputFlags Flags) const override;
// Is this a pointer, reference, or rvalue-reference?
PointerAffinity Affinity = PointerAffinity::None;
@@ -476,8 +477,8 @@ struct PointerTypeNode : public TypeNode {
struct TagTypeNode : public TypeNode {
explicit TagTypeNode(TagKind Tag) : TypeNode(NodeKind::TagType), Tag(Tag) {}
- void outputPre(OutputStream &OS, OutputFlags Flags) const override;
- void outputPost(OutputStream &OS, OutputFlags Flags) const override;
+ void outputPre(OutputBuffer &OB, OutputFlags Flags) const override;
+ void outputPost(OutputBuffer &OB, OutputFlags Flags) const override;
QualifiedNameNode *QualifiedName = nullptr;
TagKind Tag;
@@ -486,11 +487,11 @@ struct TagTypeNode : public TypeNode {
struct ArrayTypeNode : public TypeNode {
ArrayTypeNode() : TypeNode(NodeKind::ArrayType) {}
- void outputPre(OutputStream &OS, OutputFlags Flags) const override;
- void outputPost(OutputStream &OS, OutputFlags Flags) const override;
+ void outputPre(OutputBuffer &OB, OutputFlags Flags) const override;
+ void outputPost(OutputBuffer &OB, OutputFlags Flags) const override;
- void outputDimensionsImpl(OutputStream &OS, OutputFlags Flags) const;
- void outputOneDimension(OutputStream &OS, OutputFlags Flags, Node *N) const;
+ void outputDimensionsImpl(OutputBuffer &OB, OutputFlags Flags) const;
+ void outputOneDimension(OutputBuffer &OB, OutputFlags Flags, Node *N) const;
// A list of array dimensions. e.g. [3,4,5] in `int Foo[3][4][5]`
NodeArrayNode *Dimensions = nullptr;
@@ -501,14 +502,14 @@ struct ArrayTypeNode : public TypeNode {
struct IntrinsicNode : public TypeNode {
IntrinsicNode() : TypeNode(NodeKind::IntrinsicType) {}
- void output(OutputStream &OS, OutputFlags Flags) const override {}
+ void output(OutputBuffer &OB, OutputFlags Flags) const override {}
};
struct CustomTypeNode : public TypeNode {
CustomTypeNode() : TypeNode(NodeKind::Custom) {}
- void outputPre(OutputStream &OS, OutputFlags Flags) const override;
- void outputPost(OutputStream &OS, OutputFlags Flags) const override;
+ void outputPre(OutputBuffer &OB, OutputFlags Flags) const override;
+ void outputPost(OutputBuffer &OB, OutputFlags Flags) const override;
IdentifierNode *Identifier = nullptr;
};
@@ -516,9 +517,9 @@ struct CustomTypeNode : public TypeNode {
struct NodeArrayNode : public Node {
NodeArrayNode() : Node(NodeKind::NodeArray) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
- void output(OutputStream &OS, OutputFlags Flags, StringView Separator) const;
+ void output(OutputBuffer &OB, OutputFlags Flags, StringView Separator) const;
Node **Nodes = nullptr;
size_t Count = 0;
@@ -527,7 +528,7 @@ struct NodeArrayNode : public Node {
struct QualifiedNameNode : public Node {
QualifiedNameNode() : Node(NodeKind::QualifiedName) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
NodeArrayNode *Components = nullptr;
@@ -541,7 +542,7 @@ struct TemplateParameterReferenceNode : public Node {
TemplateParameterReferenceNode()
: Node(NodeKind::TemplateParameterReference) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
SymbolNode *Symbol = nullptr;
@@ -556,7 +557,7 @@ struct IntegerLiteralNode : public Node {
IntegerLiteralNode(uint64_t Value, bool IsNegative)
: Node(NodeKind::IntegerLiteral), Value(Value), IsNegative(IsNegative) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
uint64_t Value = 0;
bool IsNegative = false;
@@ -566,7 +567,7 @@ struct RttiBaseClassDescriptorNode : public IdentifierNode {
RttiBaseClassDescriptorNode()
: IdentifierNode(NodeKind::RttiBaseClassDescriptor) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
uint32_t NVOffset = 0;
int32_t VBPtrOffset = 0;
@@ -576,7 +577,7 @@ struct RttiBaseClassDescriptorNode : public IdentifierNode {
struct SymbolNode : public Node {
explicit SymbolNode(NodeKind K) : Node(K) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
QualifiedNameNode *Name = nullptr;
};
@@ -584,7 +585,7 @@ struct SpecialTableSymbolNode : public SymbolNode {
explicit SpecialTableSymbolNode()
: SymbolNode(NodeKind::SpecialTableSymbol) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
QualifiedNameNode *TargetName = nullptr;
Qualifiers Quals = Qualifiers::Q_None;
};
@@ -593,7 +594,7 @@ struct LocalStaticGuardVariableNode : public SymbolNode {
LocalStaticGuardVariableNode()
: SymbolNode(NodeKind::LocalStaticGuardVariable) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
bool IsVisible = false;
};
@@ -601,7 +602,7 @@ struct LocalStaticGuardVariableNode : public SymbolNode {
struct EncodedStringLiteralNode : public SymbolNode {
EncodedStringLiteralNode() : SymbolNode(NodeKind::EncodedStringLiteral) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
StringView DecodedString;
bool IsTruncated = false;
@@ -611,7 +612,7 @@ struct EncodedStringLiteralNode : public SymbolNode {
struct VariableSymbolNode : public SymbolNode {
VariableSymbolNode() : SymbolNode(NodeKind::VariableSymbol) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
StorageClass SC = StorageClass::None;
TypeNode *Type = nullptr;
@@ -620,7 +621,7 @@ struct VariableSymbolNode : public SymbolNode {
struct FunctionSymbolNode : public SymbolNode {
FunctionSymbolNode() : SymbolNode(NodeKind::FunctionSymbol) {}
- void output(OutputStream &OS, OutputFlags Flags) const override;
+ void output(OutputBuffer &OB, OutputFlags Flags) const override;
FunctionSignatureNode *Signature = nullptr;
};
diff --git a/llvm/include/llvm/Demangle/Utility.h b/llvm/include/llvm/Demangle/Utility.h
index 04ff65a35aed..4fea9351a4bf 100644
--- a/llvm/include/llvm/Demangle/Utility.h
+++ b/llvm/include/llvm/Demangle/Utility.h
@@ -24,7 +24,7 @@ DEMANGLE_NAMESPACE_BEGIN
// Stream that AST nodes write their string representation into after the AST
// has been parsed.
-class OutputStream {
+class OutputBuffer {
char *Buffer = nullptr;
size_t CurrentPosition = 0;
size_t BufferCapacity = 0;
@@ -63,9 +63,9 @@ class OutputStream {
}
public:
- OutputStream(char *StartBuf, size_t Size)
+ OutputBuffer(char *StartBuf, size_t Size)
: Buffer(StartBuf), CurrentPosition(0), BufferCapacity(Size) {}
- OutputStream() = default;
+ OutputBuffer() = default;
void reset(char *Buffer_, size_t BufferCapacity_) {
CurrentPosition = 0;
Buffer = Buffer_;
@@ -77,7 +77,7 @@ public:
unsigned CurrentPackIndex = std::numeric_limits<unsigned>::max();
unsigned CurrentPackMax = std::numeric_limits<unsigned>::max();
- OutputStream &operator+=(StringView R) {
+ OutputBuffer &operator+=(StringView R) {
size_t Size = R.size();
if (Size == 0)
return *this;
@@ -87,17 +87,28 @@ public:
return *this;
}
- OutputStream &operator+=(char C) {
+ OutputBuffer &operator+=(char C) {
grow(1);
Buffer[CurrentPosition++] = C;
return *this;
}
- OutputStream &operator<<(StringView R) { return (*this += R); }
+ OutputBuffer &operator<<(StringView R) { return (*this += R); }
- OutputStream &operator<<(char C) { return (*this += C); }
+ OutputBuffer prepend(StringView R) {
+ size_t Size = R.size();
+
+ grow(Size);
+ std::memmove(Buffer + Size, Buffer, CurrentPosition);
+ std::memcpy(Buffer, R.begin(), Size);
+ CurrentPosition += Size;
- OutputStream &operator<<(long long N) {
+ return *this;
+ }
+
+ OutputBuffer &operator<<(char C) { return (*this += C); }
+
+ OutputBuffer &operator<<(long long N) {
if (N < 0)
writeUnsigned(static_cast<unsigned long long>(-N), true);
else
@@ -105,27 +116,37 @@ public:
return *this;
}
- OutputStream &operator<<(unsigned long long N) {
+ OutputBuffer &operator<<(unsigned long long N) {
writeUnsigned(N, false);
return *this;
}
- OutputStream &operator<<(long N) {
+ OutputBuffer &operator<<(long N) {
return this->operator<<(static_cast<long long>(N));
}
- OutputStream &operator<<(unsigned long N) {
+ OutputBuffer &operator<<(unsigned long N) {
return this->operator<<(static_cast<unsigned long long>(N));
}
- OutputStream &operator<<(int N) {
+ OutputBuffer &operator<<(int N) {
return this->operator<<(static_cast<long long>(N));
}
- OutputStream &operator<<(unsigned int N) {
+ OutputBuffer &operator<<(unsigned int N) {
return this->operator<<(static_cast<unsigned long long>(N));
}
+ void insert(size_t Pos, const char *S, size_t N) {
+ assert(Pos <= CurrentPosition);
+ if (N == 0)
+ return;
+ grow(N);
+ std::memmove(Buffer + Pos + N, Buffer + Pos, CurrentPosition - Pos);
+ std::memcpy(Buffer + Pos, S, N);
+ CurrentPosition += N;
+ }
+
size_t getCurrentPosition() const { return CurrentPosition; }
void setCurrentPosition(size_t NewPos) { CurrentPosition = NewPos; }
@@ -171,7 +192,7 @@ public:
SwapAndRestore &operator=(const SwapAndRestore &) = delete;
};
-inline bool initializeOutputStream(char *Buf, size_t *N, OutputStream &S,
+inline bool initializeOutputBuffer(char *Buf, size_t *N, OutputBuffer &OB,
size_t InitSize) {
size_t BufferSize;
if (Buf == nullptr) {
@@ -182,7 +203,7 @@ inline bool initializeOutputStream(char *Buf, size_t *N, OutputStream &S,
} else
BufferSize = *N;
- S.reset(Buf, BufferSize);
+ OB.reset(Buf, BufferSize);
return true;
}
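Aside (illustrative, not part of the diff): a minimal sketch of driving the renamed OutputBuffer and the new insert()/prepend() helpers shown above. The getBuffer() accessor used at the end is assumed from the existing class (it falls outside this hunk), and all other names are placeholders.

    #include "llvm/Demangle/Utility.h"
    #include <cstdlib>

    using namespace llvm::itanium_demangle;

    void outputBufferSketch() {
      OutputBuffer OB;
      size_t Size = 0;
      if (!initializeOutputBuffer(nullptr, &Size, OB, 128))
        return;                  // malloc failed
      OB << "foo(";              // operator<<(StringView)
      OB << 42;                  // integer overloads route through writeUnsigned
      OB += ')';                 // operator+=(char)
      OB.insert(0, "ns::", 4);   // splice "ns::" in front of "foo(42)"
      OB.prepend("::");          // same idea, always at position 0
      OB += '\0';
      std::free(OB.getBuffer()); // caller owns the (possibly realloc'd) buffer
    }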
diff --git a/llvm/include/llvm/ExecutionEngine/ExecutionEngine.h b/llvm/include/llvm/ExecutionEngine/ExecutionEngine.h
index 2e386518f0bf..43c91fb5f988 100644
--- a/llvm/include/llvm/ExecutionEngine/ExecutionEngine.h
+++ b/llvm/include/llvm/ExecutionEngine/ExecutionEngine.h
@@ -21,7 +21,6 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ExecutionEngine/JITSymbol.h"
-#include "llvm/ExecutionEngine/OrcV1Deprecation.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Module.h"
#include "llvm/Object/Binary.h"
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_aarch64.h b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_aarch64.h
new file mode 100644
index 000000000000..50eb598139ea
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_aarch64.h
@@ -0,0 +1,39 @@
+//===--- ELF_aarch64.h - JIT link functions for ELF/aarch64 --*- C++ -*----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+// jit-link functions for ELF/aarch64.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_JITLINK_ELF_AARCH64_H
+#define LLVM_EXECUTIONENGINE_JITLINK_ELF_AARCH64_H
+
+#include "llvm/ExecutionEngine/JITLink/JITLink.h"
+
+namespace llvm {
+namespace jitlink {
+
+/// Create a LinkGraph from an ELF/aarch64 relocatable object
+///
+/// Note: The graph does not take ownership of the underlying buffer, nor copy
+/// its contents. The caller is responsible for ensuring that the object buffer
+/// outlives the graph.
+Expected<std::unique_ptr<LinkGraph>>
+createLinkGraphFromELFObject_aarch64(MemoryBufferRef ObjectBuffer);
+
+/// jit-link the given object buffer, which must be an ELF aarch64 relocatable
+/// object file.
+void link_ELF_aarch64(std::unique_ptr<LinkGraph> G,
+ std::unique_ptr<JITLinkContext> Ctx);
+
+} // end namespace jitlink
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_JITLINK_ELF_AARCH64_H
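Aside (illustrative, not part of the patch): a sketch of building a LinkGraph from an aarch64 relocatable object with the new entry point; the file path is a placeholder, and dump(raw_ostream &) is the method declared in JITLink.h in this same patch.

    #include "llvm/ExecutionEngine/JITLink/ELF_aarch64.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Support/MemoryBuffer.h"
    #include "llvm/Support/raw_ostream.h"

    llvm::Error inspectObject(llvm::StringRef Path) {
      auto BufOrErr = llvm::MemoryBuffer::getFile(Path);
      if (!BufOrErr)
        return llvm::errorCodeToError(BufOrErr.getError());
      // The graph only references the buffer, so keep it alive while G is used.
      auto G = llvm::jitlink::createLinkGraphFromELFObject_aarch64(
          (*BufOrErr)->getMemBufferRef());
      if (!G)
        return G.takeError();
      (*G)->dump(llvm::outs());
      return llvm::Error::success();
    }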
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_riscv.h b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_riscv.h
index 1339ab51cbb9..5a8b186a2c3e 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_riscv.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_riscv.h
@@ -35,4 +35,4 @@ void link_ELF_riscv(std::unique_ptr<LinkGraph> G,
} // end namespace jitlink
} // end namespace llvm
-#endif // LLVM_EXECUTIONENGINE_JITLINK_ELF_RISCV64_H
+#endif // LLVM_EXECUTIONENGINE_JITLINK_ELF_RISCV_H
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h
index d8ed953363e6..f5fa9e96c594 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h
@@ -21,29 +21,17 @@ namespace jitlink {
namespace ELF_x86_64_Edges {
enum ELFX86RelocationKind : Edge::Kind {
Branch32 = Edge::FirstRelocation,
- Branch32ToStub,
- Pointer32,
+ Pointer32Signed,
Pointer64,
- Pointer64Anon,
PCRel32,
- PCRel64,
- PCRel32Minus1,
- PCRel32Minus2,
- PCRel32Minus4,
- PCRel32Anon,
- PCRel32Minus1Anon,
- PCRel32Minus2Anon,
- PCRel32Minus4Anon,
PCRel32GOTLoad,
- PCRel32GOT,
+ PCRel32GOTLoadRelaxable,
+ PCRel32REXGOTLoadRelaxable,
+ PCRel32TLV,
PCRel64GOT,
GOTOFF64,
GOT64,
- PCRel32TLV,
- Delta32,
Delta64,
- NegDelta32,
- NegDelta64,
};
} // end namespace ELF_x86_64_Edges
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h
index 6162a675ec12..83d85953fce6 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h
@@ -13,19 +13,19 @@
#ifndef LLVM_EXECUTIONENGINE_JITLINK_JITLINK_H
#define LLVM_EXECUTIONENGINE_JITLINK_JITLINK_H
-#include "JITLinkMemoryManager.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h"
+#include "llvm/ExecutionEngine/JITLink/MemoryFlags.h"
#include "llvm/ExecutionEngine/JITSymbol.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/Memory.h"
#include "llvm/Support/MemoryBuffer.h"
#include <map>
@@ -225,7 +225,7 @@ public:
/// Get the content for this block. Block must not be a zero-fill block.
ArrayRef<char> getContent() const {
- assert(Data && "Section does not contain content");
+ assert(Data && "Block does not contain content");
return ArrayRef<char>(Data, Size);
}
@@ -233,6 +233,7 @@ public:
/// Caller is responsible for ensuring the underlying bytes are not
/// deallocated while pointed to by this block.
void setContent(ArrayRef<char> Content) {
+ assert(Content.data() && "Setting null content");
Data = Content.data();
Size = Content.size();
ContentMutable = false;
@@ -251,6 +252,7 @@ public:
/// to call this on a block with immutable content -- consider using
/// getMutableContent instead.
MutableArrayRef<char> getAlreadyMutableContent() {
+ assert(Data && "Block does not contain content");
assert(ContentMutable && "Content is not mutable");
return MutableArrayRef<char>(const_cast<char *>(Data), Size);
}
@@ -260,6 +262,7 @@ public:
/// The caller is responsible for ensuring that the memory pointed to by
/// MutableContent is not deallocated while pointed to by this block.
void setMutableContent(MutableArrayRef<char> MutableContent) {
+ assert(MutableContent.data() && "Setting null content");
Data = MutableContent.data();
Size = MutableContent.size();
ContentMutable = true;
@@ -295,6 +298,7 @@ public:
/// Add an edge to this block.
void addEdge(Edge::Kind K, Edge::OffsetT Offset, Symbol &Target,
Edge::AddendT Addend) {
+ assert(!isZeroFill() && "Adding edge to zero-fill block?");
Edges.push_back(Edge(K, Offset, Target, Addend));
}
@@ -339,6 +343,12 @@ private:
std::vector<Edge> Edges;
};
+// Align a JITTargetAddress to conform with block alignment requirements.
+inline JITTargetAddress alignToBlock(JITTargetAddress Addr, Block &B) {
+ uint64_t Delta = (B.getAlignmentOffset() - Addr) % B.getAlignment();
+ return Addr + Delta;
+}
+
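Aside (illustrative, not part of the patch): a worked example of the helper above, assuming the usual unsigned JITTargetAddress arithmetic.

    // For a block with getAlignment() == 16 and getAlignmentOffset() == 4,
    // aligning the address 0x1003 gives Delta = (4 - 0x1003) % 16 == 1 in
    // unsigned arithmetic, so alignToBlock(0x1003, B) returns 0x1004 -- the
    // smallest address >= 0x1003 that is congruent to 4 modulo 16.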
/// Describes symbol linkage. This can be used to resolve definition
/// clashes.
enum class Linkage : uint8_t {
@@ -640,8 +650,7 @@ class Section {
friend class LinkGraph;
private:
- Section(StringRef Name, sys::Memory::ProtectionFlags Prot,
- SectionOrdinal SecOrdinal)
+ Section(StringRef Name, MemProt Prot, SectionOrdinal SecOrdinal)
: Name(Name), Prot(Prot), SecOrdinal(SecOrdinal) {}
using SymbolSet = DenseSet<Symbol *>;
@@ -666,12 +675,16 @@ public:
StringRef getName() const { return Name; }
/// Returns the protection flags for this section.
- sys::Memory::ProtectionFlags getProtectionFlags() const { return Prot; }
+ MemProt getMemProt() const { return Prot; }
/// Set the protection flags for this section.
- void setProtectionFlags(sys::Memory::ProtectionFlags Prot) {
- this->Prot = Prot;
- }
+ void setMemProt(MemProt Prot) { this->Prot = Prot; }
+
+ /// Get the deallocation policy for this section.
+ MemDeallocPolicy getMemDeallocPolicy() const { return MDP; }
+
+ /// Set the deallocation policy for this section.
+ void setMemDeallocPolicy(MemDeallocPolicy MDP) { this->MDP = MDP; }
/// Returns the ordinal for this section.
SectionOrdinal getOrdinal() const { return SecOrdinal; }
@@ -686,6 +699,7 @@ public:
return make_range(Blocks.begin(), Blocks.end());
}
+ /// Returns the number of blocks in this section.
BlockSet::size_type blocks_size() const { return Blocks.size(); }
/// Returns an iterator over the symbols defined in this section.
@@ -734,7 +748,8 @@ private:
}
StringRef Name;
- sys::Memory::ProtectionFlags Prot;
+ MemProt Prot;
+ MemDeallocPolicy MDP = MemDeallocPolicy::Standard;
SectionOrdinal SecOrdinal = 0;
BlockSet Blocks;
SymbolSet Symbols;
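Aside (illustrative, not part of the patch): a sketch of creating and configuring a section with the new MemProt / MemDeallocPolicy accessors; the section name is a placeholder and the MemProt enumerators are assumed to be the Read/Write/Exec bits declared in MemoryFlags.h.

    #include "llvm/ExecutionEngine/JITLink/JITLink.h"

    using namespace llvm::jitlink;

    void addTextSection(LinkGraph &G) {
      // Read + execute protection, expressed with MemProt's bitmask operators.
      Section &Text = G.createSection("__text", MemProt::Read | MemProt::Exec);
      // The default MemDeallocPolicy::Standard keeps the memory alive until the
      // allocation is deallocated; memory needed only while finalize-time
      // allocation actions run would use the other policy via setMemDeallocPolicy.
      (void)Text;
    }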
@@ -916,6 +931,11 @@ public:
: Name(std::move(Name)), TT(TT), PointerSize(PointerSize),
Endianness(Endianness), GetEdgeKindName(std::move(GetEdgeKindName)) {}
+ LinkGraph(const LinkGraph &) = delete;
+ LinkGraph &operator=(const LinkGraph &) = delete;
+ LinkGraph(LinkGraph &&) = delete;
+ LinkGraph &operator=(LinkGraph &&) = delete;
+
/// Returns the name of this graph (usually the name of the original
/// underlying MemoryBuffer).
const std::string &getName() const { return Name; }
@@ -962,7 +982,7 @@ public:
}
/// Create a section with the given name, protection flags, and alignment.
- Section &createSection(StringRef Name, sys::Memory::ProtectionFlags Prot) {
+ Section &createSection(StringRef Name, MemProt Prot) {
assert(llvm::find_if(Sections,
[&](std::unique_ptr<Section> &Sec) {
return Sec->getName() == Name;
@@ -1100,10 +1120,10 @@ public:
Symbol &addDefinedSymbol(Block &Content, JITTargetAddress Offset,
StringRef Name, JITTargetAddress Size, Linkage L,
Scope S, bool IsCallable, bool IsLive) {
- assert(llvm::count_if(defined_symbols(),
- [&](const Symbol *Sym) {
- return Sym->getName() == Name;
- }) == 0 &&
+ assert((S == Scope::Local || llvm::count_if(defined_symbols(),
+ [&](const Symbol *Sym) {
+ return Sym->getName() == Name;
+ }) == 0) &&
"Duplicate defined symbol");
auto &Sym =
Symbol::constructNamedDef(Allocator.Allocate<Symbol>(), Content, Offset,
@@ -1237,6 +1257,7 @@ public:
void transferDefinedSymbol(Symbol &Sym, Block &DestBlock,
JITTargetAddress NewOffset,
Optional<JITTargetAddress> ExplicitNewSize) {
+ auto &OldSection = Sym.getBlock().getSection();
Sym.setBlock(DestBlock);
Sym.setOffset(NewOffset);
if (ExplicitNewSize)
@@ -1246,6 +1267,10 @@ public:
if (Sym.getSize() > RemainingBlockSize)
Sym.setSize(RemainingBlockSize);
}
+ if (&DestBlock.getSection() != &OldSection) {
+ OldSection.removeSymbol(Sym);
+ DestBlock.getSection().addSymbol(Sym);
+ }
}
/// Transfers the given Block and all Symbols pointing to it to the given
@@ -1280,6 +1305,8 @@ public:
bool PreserveSrcSection = false) {
if (&DstSection == &SrcSection)
return;
+ for (auto *B : SrcSection.blocks())
+ B->setSection(DstSection);
SrcSection.transferContentTo(DstSection);
if (!PreserveSrcSection)
removeSection(SrcSection);
@@ -1345,6 +1372,13 @@ public:
Sections.erase(I);
}
+ /// Accessor for the AllocActions object for this graph. This can be used to
+ /// register allocation action calls prior to finalization.
+ ///
+ /// Accessing this object after finalization will result in undefined
+ /// behavior.
+ JITLinkMemoryManager::AllocActions &allocActions() { return AAs; }
+
/// Dump the graph.
void dump(raw_ostream &OS);
@@ -1361,6 +1395,7 @@ private:
SectionList Sections;
ExternalSymbolSet ExternalSymbols;
ExternalSymbolSet AbsoluteSymbols;
+ JITLinkMemoryManager::AllocActions AAs;
};
inline MutableArrayRef<char> Block::getMutableContent(LinkGraph &G) {
@@ -1650,8 +1685,7 @@ public:
/// finalized (i.e. emitted to memory and memory permissions set). If all of
/// this objects dependencies have also been finalized then the code is ready
/// to run.
- virtual void
- notifyFinalized(std::unique_ptr<JITLinkMemoryManager::Allocation> A) = 0;
+ virtual void notifyFinalized(JITLinkMemoryManager::FinalizedAlloc Alloc) = 0;
/// Called by JITLink prior to linking to determine whether default passes for
/// the target should be added. The default implementation returns true.
@@ -1683,6 +1717,36 @@ Error markAllSymbolsLive(LinkGraph &G);
Error makeTargetOutOfRangeError(const LinkGraph &G, const Block &B,
const Edge &E);
+/// Base case for edge-visitors where the visitor-list is empty.
+inline void visitEdge(LinkGraph &G, Block *B, Edge &E) {}
+
+/// Applies the first visitor in the list to the given edge. If the visitor's
+/// visitEdge method returns true then we return immediately, otherwise we
+/// apply the next visitor.
+template <typename VisitorT, typename... VisitorTs>
+void visitEdge(LinkGraph &G, Block *B, Edge &E, VisitorT &&V,
+ VisitorTs &&...Vs) {
+ if (!V.visitEdge(G, B, E))
+ visitEdge(G, B, E, std::forward<VisitorTs>(Vs)...);
+}
+
+/// For each edge in the given graph, apply a list of visitors to the edge,
+/// stopping when the first visitor's visitEdge method returns true.
+///
+/// Only visits edges that were in the graph at call time: if any visitor
+/// adds new edges those will not be visited. Visitors are not allowed to
+/// remove edges (though they can change their kind, target, and addend).
+template <typename... VisitorTs>
+void visitExistingEdges(LinkGraph &G, VisitorTs &&...Vs) {
+ // We may add new blocks during this process, but we don't want to iterate
+ // over them, so build a worklist.
+ std::vector<Block *> Worklist(G.blocks().begin(), G.blocks().end());
+
+ for (auto *B : Worklist)
+ for (auto &E : B->edges())
+ visitEdge(G, B, E, std::forward<VisitorTs>(Vs)...);
+}
+
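Aside (illustrative, not part of the patch): per the comments above, a visitor is anything exposing bool visitEdge(LinkGraph &, Block *, Edge &); returning true stops the remaining visitors for that edge. The edge kind used here is a placeholder chosen by the caller.

    #include "llvm/ExecutionEngine/JITLink/JITLink.h"

    using namespace llvm::jitlink;

    struct CountEdgesOfKind {
      Edge::Kind Kind = Edge::FirstRelocation;
      size_t Count = 0;
      bool visitEdge(LinkGraph &G, Block *B, Edge &E) {
        if (E.getKind() != Kind)
          return false; // not handled; later visitors still see this edge
        ++Count;
        return true;    // handled; skip any remaining visitors
      }
    };

    // Usage:
    //   CountEdgesOfKind Counter;
    //   Counter.Kind = SomeTargetEdgeKind; // e.g. an ELF_x86_64_Edges value
    //   visitExistingEdges(G, Counter);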
/// Create a LinkGraph from the given object buffer.
///
/// Note: The graph does not take ownership of the underlying buffer, nor copy
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h
index cee7d6b09c48..62c271dfc0b2 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h
@@ -13,106 +13,416 @@
#ifndef LLVM_EXECUTIONENGINE_JITLINK_JITLINKMEMORYMANAGER_H
#define LLVM_EXECUTIONENGINE_JITLINK_JITLINKMEMORYMANAGER_H
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ExecutionEngine/JITLink/JITLinkDylib.h"
+#include "llvm/ExecutionEngine/JITLink/MemoryFlags.h"
#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/Support/Allocator.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MSVCErrorWorkarounds.h"
#include "llvm/Support/Memory.h"
+#include "llvm/Support/RecyclingAllocator.h"
#include <cstdint>
#include <future>
+#include <mutex>
namespace llvm {
namespace jitlink {
+class Block;
+class LinkGraph;
+class Section;
+
/// Manages allocations of JIT memory.
///
/// Instances of this class may be accessed concurrently from multiple threads
/// and their implementations should include any necessary synchronization.
class JITLinkMemoryManager {
public:
- using ProtectionFlags = sys::Memory::ProtectionFlags;
+ /// Represents a call to a graph-memory-management support function in the
+ /// executor.
+ ///
+ /// Support functions are called as:
+ ///
+ /// auto *Result =
+ /// ((char*(*)(const void*, size_t))FnAddr)(
+ /// (const void*)CtxAddr, (size_t)CtxSize)
+ ///
+ /// A null result is interpreted as success.
+ ///
+ /// A non-null result is interpreted as a heap-allocated string containing
+ /// an error message to report to the allocator (the allocator's
+ /// executor-side implementation code is responsible for freeing the error
+ /// string).
+ struct AllocActionCall {
+ JITTargetAddress FnAddr = 0;
+ JITTargetAddress CtxAddr = 0;
+ JITTargetAddress CtxSize = 0;
+ };
+
+ /// A pair of AllocActionCalls, one to be run at finalization time, one to be
+ /// run at deallocation time.
+ ///
+ /// AllocActionCallPairs should be constructed for paired operations (e.g.
+ /// __register_ehframe and __deregister_ehframe for eh-frame registration).
+ /// See comments for AllocActions for execution ordering.
+ ///
+ /// For unpaired operations one or the other member can be left unused, as
+ /// AllocActionCalls with an FnAddr of zero will be skipped.
+ struct AllocActionCallPair {
+ AllocActionCall Finalize;
+ AllocActionCall Dealloc;
+ };
+
+ /// A vector of allocation actions to be run for this allocation.
+ ///
+ /// Finalize allocations will be run in order at finalize time. Dealloc
+ /// actions will be run in reverse order at deallocation time.
+ using AllocActions = std::vector<AllocActionCallPair>;
+
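Aside (illustrative, not part of the patch): an executor-side support function written against the calling convention documented above; the function name and its behaviour are placeholders. A null return reports success; a non-null return must be a heap-allocated error string, which the executor-side allocator code frees.

    #include <cstdlib>
    #include <cstring>

    extern "C" char *myRegisterSection(const void *CtxAddr, size_t CtxSize) {
      if (CtxSize == 0) {
        // Error path: hand back a heap-allocated, null-terminated message.
        const char Msg[] = "myRegisterSection: empty context";
        char *Err = static_cast<char *>(std::malloc(sizeof(Msg)));
        if (Err)
          std::memcpy(Err, Msg, sizeof(Msg));
        return Err;
      }
      // ... act on the CtxSize bytes at CtxAddr (e.g. register an eh-frame) ...
      return nullptr; // success
    }

    // An AllocActionCall would then carry this function's executor address in
    // FnAddr and the address/size of its context blob in CtxAddr/CtxSize.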
+ /// Represents a finalized allocation.
+ ///
+ /// Finalized allocations must be passed to the
+ /// JITLinkMemoryManager::deallocate method prior to being destroyed.
+ ///
+ /// The interpretation of the Address associated with the finalized allocation
+ /// is up to the memory manager implementation. Common options are using the
+ /// base address of the allocation, or the address of a memory management
+ /// object that tracks the allocation.
+ class FinalizedAlloc {
+ friend class JITLinkMemoryManager;
- class SegmentRequest {
public:
- SegmentRequest() = default;
- SegmentRequest(uint64_t Alignment, size_t ContentSize,
- uint64_t ZeroFillSize)
- : Alignment(Alignment), ContentSize(ContentSize),
- ZeroFillSize(ZeroFillSize) {
- assert(isPowerOf2_32(Alignment) && "Alignment must be power of 2");
+ static constexpr JITTargetAddress InvalidAddr = ~JITTargetAddress(0);
+
+ FinalizedAlloc() = default;
+ explicit FinalizedAlloc(JITTargetAddress A) : A(A) {
+ assert(A != 0 && "Explicitly creating an invalid allocation?");
+ }
+ FinalizedAlloc(const FinalizedAlloc &) = delete;
+ FinalizedAlloc(FinalizedAlloc &&Other) : A(Other.A) {
+ Other.A = InvalidAddr;
+ }
+ FinalizedAlloc &operator=(const FinalizedAlloc &) = delete;
+ FinalizedAlloc &operator=(FinalizedAlloc &&Other) {
+ assert(A == InvalidAddr &&
+ "Cannot overwrite active finalized allocation");
+ std::swap(A, Other.A);
+ return *this;
+ }
+ ~FinalizedAlloc() {
+ assert(A == InvalidAddr && "Finalized allocation was not deallocated");
+ }
+
+    /// FinalizedAllocs convert to false for default-constructed values, and
+    /// to true otherwise. Default-constructed allocs need not be deallocated.
+ explicit operator bool() const { return A != InvalidAddr; }
+
+ /// Returns the address associated with this finalized allocation.
+ /// The allocation is unmodified.
+ JITTargetAddress getAddress() const { return A; }
+
+ /// Returns the address associated with this finalized allocation and
+ /// resets this object to the default state.
+ /// This should only be used by allocators when deallocating memory.
+ JITTargetAddress release() {
+ JITTargetAddress Tmp = A;
+ A = InvalidAddr;
+ return Tmp;
}
- uint64_t getAlignment() const { return Alignment; }
- size_t getContentSize() const { return ContentSize; }
- uint64_t getZeroFillSize() const { return ZeroFillSize; }
+
private:
- uint64_t Alignment = 0;
- size_t ContentSize = 0;
- uint64_t ZeroFillSize = 0;
+ JITTargetAddress A = InvalidAddr;
};
- using SegmentsRequestMap = DenseMap<unsigned, SegmentRequest>;
-
- /// Represents an allocation created by the memory manager.
+ /// Represents an allocation which has not been finalized yet.
///
- /// An allocation object is responsible for allocating and owning jit-linker
- /// working and target memory, and for transfering from working to target
- /// memory.
+ /// InFlightAllocs manage both executor memory allocations and working
+ /// memory allocations.
///
- class Allocation {
+ /// On finalization, the InFlightAlloc should transfer the content of
+ /// working memory into executor memory, apply memory protections, and
+ /// run any finalization functions.
+ ///
+ /// Working memory should be kept alive at least until one of the following
+ /// happens: (1) the InFlightAlloc instance is destroyed, (2) the
+ /// InFlightAlloc is abandoned, (3) finalized target memory is destroyed.
+ ///
+ /// If abandon is called then working memory and executor memory should both
+ /// be freed.
+ class InFlightAlloc {
public:
- using FinalizeContinuation = std::function<void(Error)>;
-
- virtual ~Allocation();
+ using OnFinalizedFunction = unique_function<void(Expected<FinalizedAlloc>)>;
+ using OnAbandonedFunction = unique_function<void(Error)>;
- /// Should return the address of linker working memory for the segment with
- /// the given protection flags.
- virtual MutableArrayRef<char> getWorkingMemory(ProtectionFlags Seg) = 0;
+ virtual ~InFlightAlloc();
- /// Should return the final address in the target process where the segment
- /// will reside.
- virtual JITTargetAddress getTargetMemory(ProtectionFlags Seg) = 0;
+ /// Called prior to finalization if the allocation should be abandoned.
+ virtual void abandon(OnAbandonedFunction OnAbandoned) = 0;
- /// Should transfer from working memory to target memory, and release
- /// working memory.
- virtual void finalizeAsync(FinalizeContinuation OnFinalize) = 0;
+ /// Called to transfer working memory to the target and apply finalization.
+ virtual void finalize(OnFinalizedFunction OnFinalized) = 0;
- /// Calls finalizeAsync and waits for completion.
- Error finalize() {
- std::promise<MSVCPError> FinalizeResultP;
+ /// Synchronous convenience version of finalize.
+ Expected<FinalizedAlloc> finalize() {
+ std::promise<MSVCPExpected<FinalizedAlloc>> FinalizeResultP;
auto FinalizeResultF = FinalizeResultP.get_future();
- finalizeAsync(
- [&](Error Err) { FinalizeResultP.set_value(std::move(Err)); });
+ finalize([&](Expected<FinalizedAlloc> Result) {
+ FinalizeResultP.set_value(std::move(Result));
+ });
return FinalizeResultF.get();
}
-
- /// Should deallocate target memory.
- virtual Error deallocate() = 0;
};
+ /// Typedef for the argument to be passed to OnAllocatedFunction.
+ using AllocResult = Expected<std::unique_ptr<InFlightAlloc>>;
+
+ /// Called when allocation has been completed.
+ using OnAllocatedFunction = unique_function<void(AllocResult)>;
+
+ /// Called when deallocation has completed.
+ using OnDeallocatedFunction = unique_function<void(Error)>;
+
virtual ~JITLinkMemoryManager();
- /// Create an Allocation object.
+ /// Start the allocation process.
///
- /// The JD argument represents the target JITLinkDylib, and can be used by
- /// JITLinkMemoryManager implementers to manage per-dylib allocation pools
- /// (e.g. one pre-reserved address space slab per dylib to ensure that all
- /// allocations for the dylib are within a certain range). The JD argument
- /// may be null (representing an allocation not associated with any
- /// JITDylib.
+ /// If the initial allocation is successful then the OnAllocated function will
+  /// be called with a std::unique_ptr<InFlightAlloc> value. If the allocation
+ /// is unsuccessful then the OnAllocated function will be called with an
+ /// Error.
+ virtual void allocate(const JITLinkDylib *JD, LinkGraph &G,
+ OnAllocatedFunction OnAllocated) = 0;
+
+ /// Convenience function for blocking allocation.
+ AllocResult allocate(const JITLinkDylib *JD, LinkGraph &G) {
+ std::promise<MSVCPExpected<std::unique_ptr<InFlightAlloc>>> AllocResultP;
+ auto AllocResultF = AllocResultP.get_future();
+ allocate(JD, G, [&](AllocResult Alloc) {
+ AllocResultP.set_value(std::move(Alloc));
+ });
+ return AllocResultF.get();
+ }
+
+ /// Deallocate a list of allocation objects.
///
- /// The request argument describes the segment sizes and permisssions being
- /// requested.
- virtual Expected<std::unique_ptr<Allocation>>
- allocate(const JITLinkDylib *JD, const SegmentsRequestMap &Request) = 0;
+ /// Dealloc actions will be run in reverse order (from the end of the vector
+ /// to the start).
+ virtual void deallocate(std::vector<FinalizedAlloc> Allocs,
+ OnDeallocatedFunction OnDeallocated) = 0;
+
+ /// Convenience function for deallocation of a single alloc.
+ void deallocate(FinalizedAlloc Alloc, OnDeallocatedFunction OnDeallocated) {
+ std::vector<FinalizedAlloc> Allocs;
+ Allocs.push_back(std::move(Alloc));
+ deallocate(std::move(Allocs), std::move(OnDeallocated));
+ }
+
+ /// Convenience function for blocking deallocation.
+ Error deallocate(std::vector<FinalizedAlloc> Allocs) {
+ std::promise<MSVCPError> DeallocResultP;
+ auto DeallocResultF = DeallocResultP.get_future();
+ deallocate(std::move(Allocs),
+ [&](Error Err) { DeallocResultP.set_value(std::move(Err)); });
+ return DeallocResultF.get();
+ }
+
+ /// Convenience function for blocking deallocation of a single alloc.
+ Error deallocate(FinalizedAlloc Alloc) {
+ std::vector<FinalizedAlloc> Allocs;
+ Allocs.push_back(std::move(Alloc));
+ return deallocate(std::move(Allocs));
+ }
+};
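A minimal sketch of the blocking convenience flow declared above, assuming a JITLinkMemoryManager, JITLinkDylib and fully-built LinkGraph already exist (error handling abbreviated):

#include "llvm/ExecutionEngine/JITLink/JITLink.h"
#include "llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h"

using namespace llvm;
using namespace llvm::jitlink;

static Error runBlockingAllocationFlow(JITLinkMemoryManager &MemMgr,
                                       const JITLinkDylib *JD, LinkGraph &G) {
  auto Alloc = MemMgr.allocate(JD, G); // blocking allocate
  if (!Alloc)
    return Alloc.takeError();
  // ... copy block content into the allocation's working memory here ...
  auto FA = (*Alloc)->finalize();      // blocking finalize
  if (!FA)
    return FA.takeError();
  // Finalized allocations must be handed back before destruction.
  return MemMgr.deallocate(std::move(*FA));
}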
+
+/// BasicLayout simplifies the implementation of JITLinkMemoryManagers.
+///
+/// BasicLayout groups Sections into Segments based on their memory protection
+/// and deallocation policies. JITLinkMemoryManagers can construct a BasicLayout
+/// from a Graph, and then assign working memory and addresses to each of the
+/// Segments. These addresses will be mapped back onto the Graph blocks in
+/// the apply method.
+class BasicLayout {
+public:
+ /// The Alignment, ContentSize and ZeroFillSize of each segment will be
+ /// pre-filled from the Graph. Clients must set the Addr and WorkingMem fields
+ /// prior to calling apply.
+ //
+ // FIXME: The C++98 initializer is an attempt to work around compile failures
+ // due to http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1397.
+ // We should be able to switch this back to member initialization once that
+ // issue is fixed.
+ class Segment {
+ friend class BasicLayout;
+
+ public:
+ Segment()
+ : ContentSize(0), ZeroFillSize(0), Addr(0), WorkingMem(nullptr),
+ NextWorkingMemOffset(0) {}
+ Align Alignment;
+ size_t ContentSize;
+ uint64_t ZeroFillSize;
+ JITTargetAddress Addr;
+ char *WorkingMem = nullptr;
+
+ private:
+ size_t NextWorkingMemOffset;
+ std::vector<Block *> ContentBlocks, ZeroFillBlocks;
+ };
+
+ /// A convenience class that further groups segments based on memory
+ /// deallocation policy. This allows clients to make two slab allocations:
+ /// one for all standard segments, and one for all finalize segments.
+ struct ContiguousPageBasedLayoutSizes {
+ uint64_t StandardSegs = 0;
+ uint64_t FinalizeSegs = 0;
+
+ uint64_t total() const { return StandardSegs + FinalizeSegs; }
+ };
+
+private:
+ using SegmentMap = AllocGroupSmallMap<Segment>;
+
+public:
+ BasicLayout(LinkGraph &G);
+
+ /// Return a reference to the graph this allocation was created from.
+ LinkGraph &getGraph() { return G; }
+
+  /// Returns the total number of bytes required to allocate all segments
+  /// (with each segment padded out to page size) for all standard segments,
+  /// and all finalize segments.
+ ///
+ /// This is a convenience function for the common case where the segments will
+ /// be allocated contiguously.
+ ///
+ /// This function will return an error if any segment has an alignment that
+ /// is higher than a page.
+ Expected<ContiguousPageBasedLayoutSizes>
+ getContiguousPageBasedLayoutSizes(uint64_t PageSize);
+
+ /// Returns an iterator over the segments of the layout.
+ iterator_range<SegmentMap::iterator> segments() {
+ return {Segments.begin(), Segments.end()};
+ }
+
+ /// Apply the layout to the graph.
+ Error apply();
+
+ /// Returns a reference to the AllocActions in the graph.
+ /// This convenience function saves callers from having to #include
+ /// LinkGraph.h if all they need are allocation actions.
+ JITLinkMemoryManager::AllocActions &graphAllocActions();
+
+private:
+ LinkGraph &G;
+ SegmentMap Segments;
+};
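A hedged sketch of how a JITLinkMemoryManager implementation might drive BasicLayout, following the comments above. reserveSlab is a hypothetical stand-in for the implementation's own reservation logic, and per-segment alignment handling is elided; alignTo and pointerToJITTargetAddress come from llvm/Support/MathExtras.h and llvm/ExecutionEngine/JITSymbol.h respectively:

static Error layoutIntoSlab(jitlink::LinkGraph &G, uint64_t PageSize,
                            char *(*reserveSlab)(uint64_t NumBytes)) {
  jitlink::BasicLayout BL(G);

  auto Sizes = BL.getContiguousPageBasedLayoutSizes(PageSize);
  if (!Sizes)
    return Sizes.takeError();

  // Hypothetical: reserve one contiguous, page-aligned slab for everything.
  char *SlabBase = reserveSlab(Sizes->total());

  uint64_t Offset = 0;
  for (auto &KV : BL.segments()) {
    auto &Seg = KV.second;
    Seg.WorkingMem = SlabBase + Offset;
    Seg.Addr = pointerToJITTargetAddress(SlabBase + Offset);
    Offset += alignTo(Seg.ContentSize + Seg.ZeroFillSize, PageSize);
  }

  // Map the assigned addresses and working memory back onto the graph blocks.
  return BL.apply();
}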
+
+/// A utility class for making simple allocations using JITLinkMemoryManager.
+///
+/// SimpleSegmentAlloc takes a mapping of AllocGroups to Segments and uses
+/// this to create a LinkGraph with one Section (containing one Block) per
+/// Segment. Clients can obtain a pointer to the working memory and executor
+/// address of that block using the Segment's AllocGroup. Once memory has been
+/// populated, clients can call finalize to finalize the memory.
+class SimpleSegmentAlloc {
+public:
+ /// Describes a segment to be allocated.
+ struct Segment {
+ Segment() = default;
+ Segment(size_t ContentSize, Align ContentAlign)
+ : ContentSize(ContentSize), ContentAlign(ContentAlign) {}
+
+ size_t ContentSize = 0;
+ Align ContentAlign;
+ };
+
+ /// Describes the segment working memory and executor address.
+ struct SegmentInfo {
+ JITTargetAddress Addr = 0;
+ MutableArrayRef<char> WorkingMem;
+ };
+
+ using SegmentMap = AllocGroupSmallMap<Segment>;
+
+ using OnCreatedFunction = unique_function<void(Expected<SimpleSegmentAlloc>)>;
+
+ using OnFinalizedFunction =
+ JITLinkMemoryManager::InFlightAlloc::OnFinalizedFunction;
+
+ static void Create(JITLinkMemoryManager &MemMgr, const JITLinkDylib *JD,
+ SegmentMap Segments, OnCreatedFunction OnCreated);
+
+ static Expected<SimpleSegmentAlloc> Create(JITLinkMemoryManager &MemMgr,
+ const JITLinkDylib *JD,
+ SegmentMap Segments);
+
+ SimpleSegmentAlloc(SimpleSegmentAlloc &&);
+ SimpleSegmentAlloc &operator=(SimpleSegmentAlloc &&);
+ ~SimpleSegmentAlloc();
+
+ /// Returns the SegmentInfo for the given group.
+ SegmentInfo getSegInfo(AllocGroup AG);
+
+ /// Finalize all groups (async version).
+ void finalize(OnFinalizedFunction OnFinalized) {
+ Alloc->finalize(std::move(OnFinalized));
+ }
+
+ /// Finalize all groups.
+ Expected<JITLinkMemoryManager::FinalizedAlloc> finalize() {
+ return Alloc->finalize();
+ }
+
+private:
+ SimpleSegmentAlloc(
+ std::unique_ptr<LinkGraph> G, AllocGroupSmallMap<Block *> ContentBlocks,
+ std::unique_ptr<JITLinkMemoryManager::InFlightAlloc> Alloc);
+
+ std::unique_ptr<LinkGraph> G;
+ AllocGroupSmallMap<Block *> ContentBlocks;
+ std::unique_ptr<JITLinkMemoryManager::InFlightAlloc> Alloc;
};
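A minimal sketch of the usage pattern described above: one read/exec segment and one read-only segment, populated through the blocking Create/finalize path. MemMgr is assumed to be an existing JITLinkMemoryManager, and the returned FinalizedAlloc must later be passed to MemMgr.deallocate:

#include <cstring>

static Expected<jitlink::JITLinkMemoryManager::FinalizedAlloc>
emitTwoSegments(jitlink::JITLinkMemoryManager &MemMgr, ArrayRef<char> Code,
                ArrayRef<char> ROData) {
  using namespace jitlink;
  AllocGroup CodeGroup(MemProt::Read | MemProt::Exec);
  AllocGroup RODataGroup(MemProt::Read);

  auto Alloc = SimpleSegmentAlloc::Create(
      MemMgr, /*JD=*/nullptr,
      {{CodeGroup, SimpleSegmentAlloc::Segment(Code.size(), Align(16))},
       {RODataGroup, SimpleSegmentAlloc::Segment(ROData.size(), Align(8))}});
  if (!Alloc)
    return Alloc.takeError();

  // Copy content into the working memory of each segment.
  auto CodeSeg = Alloc->getSegInfo(CodeGroup);
  memcpy(CodeSeg.WorkingMem.data(), Code.data(), Code.size());
  auto RODataSeg = Alloc->getSegInfo(RODataGroup);
  memcpy(RODataSeg.WorkingMem.data(), ROData.data(), ROData.size());

  return Alloc->finalize(); // blocking finalize, yields a FinalizedAlloc
}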
/// A JITLinkMemoryManager that allocates in-process memory.
class InProcessMemoryManager : public JITLinkMemoryManager {
public:
- Expected<std::unique_ptr<Allocation>>
- allocate(const JITLinkDylib *JD, const SegmentsRequestMap &Request) override;
+ class IPInFlightAlloc;
+
+ /// Attempts to auto-detect the host page size.
+ static Expected<std::unique_ptr<InProcessMemoryManager>> Create();
+
+ /// Create an instance using the given page size.
+ InProcessMemoryManager(uint64_t PageSize) : PageSize(PageSize) {}
+
+ void allocate(const JITLinkDylib *JD, LinkGraph &G,
+ OnAllocatedFunction OnAllocated) override;
+
+ // Use overloads from base class.
+ using JITLinkMemoryManager::allocate;
+
+ void deallocate(std::vector<FinalizedAlloc> Alloc,
+ OnDeallocatedFunction OnDeallocated) override;
+
+ // Use overloads from base class.
+ using JITLinkMemoryManager::deallocate;
+
+private:
+ // FIXME: Use an in-place array instead of a vector for DeallocActions.
+ // There shouldn't need to be a heap alloc for this.
+ struct FinalizedAllocInfo {
+ sys::MemoryBlock StandardSegments;
+ std::vector<AllocActionCall> DeallocActions;
+ };
+
+ FinalizedAlloc
+ createFinalizedAlloc(sys::MemoryBlock StandardSegments,
+ std::vector<AllocActionCall> DeallocActions);
+
+ uint64_t PageSize;
+ std::mutex FinalizedAllocsMutex;
+ RecyclingAllocator<BumpPtrAllocator, FinalizedAllocInfo> FinalizedAllocInfos;
};
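A small sketch of constructing the in-process manager via the auto-detecting factory declared above and returning it through the base interface:

static Expected<std::unique_ptr<jitlink::JITLinkMemoryManager>>
createDefaultMemMgr() {
  auto MemMgr = jitlink::InProcessMemoryManager::Create();
  if (!MemMgr)
    return MemMgr.takeError();
  return std::move(*MemMgr); // unique_ptr<InProcessMemoryManager> upcasts
}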
} // end namespace jitlink
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/MachO_arm64.h b/llvm/include/llvm/ExecutionEngine/JITLink/MachO_arm64.h
index ecbc93e1467d..aee14c0d1fe5 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/MachO_arm64.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/MachO_arm64.h
@@ -29,6 +29,8 @@ enum MachOARM64RelocationKind : Edge::Kind {
PageOffset12,
GOTPage21,
GOTPageOffset12,
+ TLVPage21,
+ TLVPageOffset12,
PointerToGOT,
PairedAddend,
LDRLiteral19,
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/MemoryFlags.h b/llvm/include/llvm/ExecutionEngine/JITLink/MemoryFlags.h
new file mode 100644
index 000000000000..8fdce93ebc56
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/MemoryFlags.h
@@ -0,0 +1,225 @@
+//===-------- MemoryFlags.h - Memory allocation flags -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines types and operations related to memory protection and allocation
+// lifetimes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_JITLINK_MEMORYFLAGS_H
+#define LLVM_EXECUTIONENGINE_JITLINK_MEMORYFLAGS_H
+
+#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/Support/Memory.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+namespace jitlink {
+
+/// Describes Read/Write/Exec permissions for memory.
+enum class MemProt {
+ None = 0,
+ Read = 1U << 0,
+ Write = 1U << 1,
+ Exec = 1U << 2,
+ LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ Exec)
+};
+
+/// Print a MemProt as an RWX triple.
+raw_ostream &operator<<(raw_ostream &OS, MemProt MP);
+
+/// Convert a MemProt value to a corresponding sys::Memory::ProtectionFlags
+/// value.
+inline sys::Memory::ProtectionFlags toSysMemoryProtectionFlags(MemProt MP) {
+ std::underlying_type_t<sys::Memory::ProtectionFlags> PF = 0;
+ if ((MP & MemProt::Read) != MemProt::None)
+ PF |= sys::Memory::MF_READ;
+ if ((MP & MemProt::Write) != MemProt::None)
+ PF |= sys::Memory::MF_WRITE;
+ if ((MP & MemProt::Exec) != MemProt::None)
+ PF |= sys::Memory::MF_EXEC;
+ return static_cast<sys::Memory::ProtectionFlags>(PF);
+}
+
+/// Convert a sys::Memory::ProtectionFlags value to a corresponding MemProt
+/// value.
+inline MemProt fromSysMemoryProtectionFlags(sys::Memory::ProtectionFlags PF) {
+ MemProt MP = MemProt::None;
+ if (PF & sys::Memory::MF_READ)
+ MP |= MemProt::Read;
+ if (PF & sys::Memory::MF_WRITE)
+ MP |= MemProt::Write;
+ if (PF & sys::Memory::MF_EXEC)
+    MP |= MemProt::Exec;
+ return MP;
+}
+
+/// Describes a memory deallocation policy for memory to be allocated by a
+/// JITLinkMemoryManager.
+///
+/// All memory allocated by a call to JITLinkMemoryManager::allocate should be
+/// deallocated if a call is made to
+/// JITLinkMemoryManager::InFlightAlloc::abandon. The policies below apply
+/// to finalized allocations.
+enum class MemDeallocPolicy {
+ /// Standard memory should be deallocated when the deallocate method is called
+ /// for the finalized allocation.
+ Standard,
+
+ /// Finalize memory should be overwritten and then deallocated after all
+ /// finalization functions have been run.
+ Finalize
+};
+
+/// Print a MemDeallocPolicy.
+raw_ostream &operator<<(raw_ostream &OS, MemDeallocPolicy MDP);
+
+/// A pair of memory protections and allocation policies.
+///
+/// Optimized for use as a small map key.
+class AllocGroup {
+ friend struct llvm::DenseMapInfo<AllocGroup>;
+
+ using underlying_type = uint8_t;
+ static constexpr unsigned BitsForProt = 3;
+ static constexpr unsigned BitsForDeallocPolicy = 1;
+ static constexpr unsigned MaxIdentifiers =
+ 1U << (BitsForProt + BitsForDeallocPolicy);
+
+public:
+ static constexpr unsigned NumGroups = MaxIdentifiers;
+
+ /// Create a default AllocGroup. No memory protections, standard
+ /// deallocation policy.
+ AllocGroup() = default;
+
+  /// Create an AllocGroup from a MemProt only -- uses
+  /// MemDeallocPolicy::Standard.
+ AllocGroup(MemProt MP) : Id(static_cast<underlying_type>(MP)) {}
+
+  /// Create an AllocGroup from a MemProt and a MemDeallocPolicy.
+ AllocGroup(MemProt MP, MemDeallocPolicy MDP)
+ : Id(static_cast<underlying_type>(MP) |
+ (static_cast<underlying_type>(MDP) << BitsForProt)) {}
+
+ /// Returns the MemProt for this group.
+ MemProt getMemProt() const {
+ return static_cast<MemProt>(Id & ((1U << BitsForProt) - 1));
+ }
+
+  /// Returns the MemDeallocPolicy for this group.
+ MemDeallocPolicy getMemDeallocPolicy() const {
+ return static_cast<MemDeallocPolicy>(Id >> BitsForProt);
+ }
+
+ friend bool operator==(const AllocGroup &LHS, const AllocGroup &RHS) {
+ return LHS.Id == RHS.Id;
+ }
+
+ friend bool operator!=(const AllocGroup &LHS, const AllocGroup &RHS) {
+ return !(LHS == RHS);
+ }
+
+ friend bool operator<(const AllocGroup &LHS, const AllocGroup &RHS) {
+ return LHS.Id < RHS.Id;
+ }
+
+private:
+ AllocGroup(underlying_type RawId) : Id(RawId) {}
+ underlying_type Id = 0;
+};
+
+/// A specialized small-map for AllocGroups.
+///
+/// Iteration order is guaranteed to match key ordering.
+template <typename T> class AllocGroupSmallMap {
+private:
+ using ElemT = std::pair<AllocGroup, T>;
+ using VectorTy = SmallVector<ElemT, 4>;
+
+ static bool compareKey(const ElemT &E, const AllocGroup &G) {
+ return E.first < G;
+ }
+
+public:
+ using iterator = typename VectorTy::iterator;
+
+ AllocGroupSmallMap() = default;
+ AllocGroupSmallMap(std::initializer_list<std::pair<AllocGroup, T>> Inits) {
+ Elems.reserve(Inits.size());
+ for (const auto &E : Inits)
+ Elems.push_back(E);
+ llvm::sort(Elems, [](const ElemT &LHS, const ElemT &RHS) {
+ return LHS.first < RHS.first;
+ });
+ }
+
+ iterator begin() { return Elems.begin(); }
+ iterator end() { return Elems.end(); }
+ iterator find(AllocGroup G) {
+ auto I = lower_bound(Elems, G, compareKey);
+    return (I != end() && I->first == G) ? I : end();
+ }
+
+ bool empty() const { return Elems.empty(); }
+ size_t size() const { return Elems.size(); }
+
+ T &operator[](AllocGroup G) {
+ auto I = lower_bound(Elems, G, compareKey);
+ if (I == Elems.end() || I->first != G)
+ I = Elems.insert(I, std::make_pair(G, T()));
+ return I->second;
+ }
+
+private:
+ VectorTy Elems;
+};
+
+/// Print an AllocGroup.
+raw_ostream &operator<<(raw_ostream &OS, AllocGroup AG);
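A brief sketch of how these types combine, per the comments above (the sizes are illustrative only):

static void demoAllocGroups() {
  using namespace llvm;
  using namespace llvm::jitlink;

  AllocGroup RX(MemProt::Read | MemProt::Exec);
  AllocGroup RWFinalize(MemProt::Read | MemProt::Write,
                        MemDeallocPolicy::Finalize);

  AllocGroupSmallMap<uint64_t> SegSizes;
  SegSizes[RX] = 0x1000;        // operator[] inserts, keeping keys ordered
  SegSizes[RWFinalize] = 0x200;

  for (auto &KV : SegSizes)     // iteration order matches key order
    errs() << KV.first << ": " << KV.second << "\n";
}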
+
+} // end namespace jitlink
+
+template <> struct DenseMapInfo<jitlink::MemProt> {
+ static inline jitlink::MemProt getEmptyKey() {
+ return jitlink::MemProt(~uint8_t(0));
+ }
+ static inline jitlink::MemProt getTombstoneKey() {
+ return jitlink::MemProt(~uint8_t(0) - 1);
+ }
+ static unsigned getHashValue(const jitlink::MemProt &Val) {
+ using UT = std::underlying_type_t<jitlink::MemProt>;
+ return DenseMapInfo<UT>::getHashValue(static_cast<UT>(Val));
+ }
+ static bool isEqual(const jitlink::MemProt &LHS,
+ const jitlink::MemProt &RHS) {
+ return LHS == RHS;
+ }
+};
+
+template <> struct DenseMapInfo<jitlink::AllocGroup> {
+ static inline jitlink::AllocGroup getEmptyKey() {
+ return jitlink::AllocGroup(~uint8_t(0));
+ }
+ static inline jitlink::AllocGroup getTombstoneKey() {
+ return jitlink::AllocGroup(~uint8_t(0) - 1);
+ }
+ static unsigned getHashValue(const jitlink::AllocGroup &Val) {
+ return DenseMapInfo<jitlink::AllocGroup::underlying_type>::getHashValue(
+ Val.Id);
+ }
+ static bool isEqual(const jitlink::AllocGroup &LHS,
+ const jitlink::AllocGroup &RHS) {
+ return LHS == RHS;
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_JITLINK_MEMORYFLAGS_H
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/TableManager.h b/llvm/include/llvm/ExecutionEngine/JITLink/TableManager.h
new file mode 100644
index 000000000000..c20f62d515ec
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/TableManager.h
@@ -0,0 +1,63 @@
+//===---------------------- TableManager.h ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Fixes up edges that need a table entry to reference the target symbol.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_JITLINK_TABLEMANAGER_H
+#define LLVM_EXECUTIONENGINE_JITLINK_TABLEMANAGER_H
+
+#include "llvm/ExecutionEngine/JITLink/JITLink.h"
+#include "llvm/Support/Debug.h"
+
+namespace llvm {
+namespace jitlink {
+
+/// A CRTP base for tables that are built on demand, e.g. Global Offset Tables
+/// and Procedure Linkage Tables.
+/// The getEntryForTarget function returns the table entry corresponding to the
+/// given target, calling down to the implementation class to build an entry if
+/// one does not already exist.
+template <typename TableManagerImplT> class TableManager {
+public:
+  /// Return the table entry for the given target, constructing a new entry
+  /// (via the implementation's createEntry) in graph G if one does not
+  /// already exist.
+ Symbol &getEntryForTarget(LinkGraph &G, Symbol &Target) {
+ assert(Target.hasName() && "Edge cannot point to anonymous target");
+
+ auto EntryI = Entries.find(Target.getName());
+
+ // Build the entry if it doesn't exist.
+ if (EntryI == Entries.end()) {
+ auto &Entry = impl().createEntry(G, Target);
+ DEBUG_WITH_TYPE("jitlink", {
+        dbgs() << "  Created " << impl().getSectionName() << " entry for "
+ << Target.getName() << ": " << Entry << "\n";
+ });
+ EntryI = Entries.insert(std::make_pair(Target.getName(), &Entry)).first;
+ }
+
+ assert(EntryI != Entries.end() && "Could not get entry symbol");
+ DEBUG_WITH_TYPE("jitlink", {
+ dbgs() << " Using " << impl().getSectionName() << " entry "
+ << *EntryI->second << "\n";
+ });
+ return *EntryI->second;
+ }
+
+private:
+ TableManagerImplT &impl() { return static_cast<TableManagerImplT &>(*this); }
+ DenseMap<StringRef, Symbol *> Entries;
+};
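A hedged sketch of the CRTP contract: the implementation class provides createEntry (and, by the convention used later in this patch for the x86-64 GOT and PLT managers, a getSectionName and a visitEdge pass hook). buildEntryBlockFor is a hypothetical helper, not part of this patch:

// Hypothetical helper that creates an entry block in Sec reaching Target and
// returns an anonymous symbol covering it.
Symbol &buildEntryBlockFor(LinkGraph &G, Section &Sec, Symbol &Target);

class MyStubTableManager : public TableManager<MyStubTableManager> {
public:
  static StringRef getSectionName() { return "$__MY_STUBS"; }

  // Called by getEntryForTarget when no entry exists for Target yet.
  Symbol &createEntry(LinkGraph &G, Symbol &Target) {
    if (!StubsSection)
      StubsSection =
          &G.createSection(getSectionName(), MemProt::Read | MemProt::Exec);
    return buildEntryBlockFor(G, *StubsSection, Target);
  }

private:
  Section *StubsSection = nullptr;
};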
+
+} // namespace jitlink
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/aarch64.h b/llvm/include/llvm/ExecutionEngine/JITLink/aarch64.h
new file mode 100644
index 000000000000..994ce783b058
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/aarch64.h
@@ -0,0 +1,38 @@
+//=== aarch64.h - Generic JITLink aarch64 edge kinds, utilities -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Generic utilities for graphs representing aarch64 objects.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_JITLINK_AARCH64_H
+#define LLVM_EXECUTIONENGINE_JITLINK_AARCH64_H
+
+#include "llvm/ExecutionEngine/JITLink/JITLink.h"
+
+namespace llvm {
+namespace jitlink {
+namespace aarch64 {
+
+/// Represents aarch64 fixups.
+enum EdgeKind_aarch64 : Edge::Kind {
+
+ /// Set a CALL immediate field to bits [27:2] of X = Target - Fixup + Addend
+ R_AARCH64_CALL26 = Edge::FirstRelocation,
+
+};
+
+/// Returns a string name for the given aarch64 edge. For debugging purposes
+/// only
+const char *getEdgeKindName(Edge::Kind K);
+
+} // namespace aarch64
+} // namespace jitlink
+} // namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_JITLINK_AARCH64_H
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h b/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h
index a4509f3888a4..b8d08d88c1c9 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h
@@ -70,7 +70,19 @@ enum EdgeKind_riscv : Edge::Kind {
///
/// Fixup expression:
/// Fixup <- (Target - Fixup + Addend)
- R_RISCV_CALL
+ R_RISCV_CALL,
+
+ /// PC relative GOT offset
+ ///
+ /// Fixup expression:
+ /// Fixup <- (GOT - Fixup + Addend) >> 12
+ R_RISCV_GOT_HI20,
+
+ /// PC relative call by PLT
+ ///
+ /// Fixup expression:
+ /// Fixup <- (Target - Fixup + Addend)
+ R_RISCV_CALL_PLT
};
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h
index 006d983537e9..3130ea381534 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h
@@ -14,6 +14,7 @@
#define LLVM_EXECUTIONENGINE_JITLINK_X86_64_H
#include "llvm/ExecutionEngine/JITLink/JITLink.h"
+#include "llvm/ExecutionEngine/JITLink/TableManager.h"
#include <limits>
@@ -42,6 +43,16 @@ enum EdgeKind_x86_64 : Edge::Kind {
///
Pointer32,
+ /// A signed 32-bit pointer value relocation
+ ///
+ /// Fixup expression:
+ /// Fixup <- Target + Addend : int32
+ ///
+ /// Errors:
+  ///   - The target must reside in the signed 32-bit range
+  ///     ([-2**31, 2**31 - 1]) of the address space, otherwise an
+  ///     out-of-range error will be returned.
+ Pointer32Signed,
+
/// A 64-bit delta.
///
/// Delta from the fixup to the target.
@@ -85,6 +96,18 @@ enum EdgeKind_x86_64 : Edge::Kind {
/// an out-of-range error will be returned.
NegDelta32,
+ /// A 64-bit GOT delta.
+ ///
+ /// Delta from the global offset table to the target
+ ///
+ /// Fixup expression:
+ /// Fixup <- Target - GOTSymbol + Addend : int64
+ ///
+ /// Errors:
+  ///   - *ASSERTION* A null GOTSymbol (i.e. the GOT section symbol has not
+  ///     been defined) will trigger an assertion.
+ Delta64FromGOT,
+
/// A 32-bit PC-relative branch.
///
/// Represents a PC-relative call or branch to a target. This can be used to
@@ -120,7 +143,7 @@ enum EdgeKind_x86_64 : Edge::Kind {
/// This edge kind has the same fixup expression as BranchPCRel32, but further
/// identifies the call/branch as being to a pointer jump stub. For edges of
/// this kind the jump stub should not be bypassed (use
- /// BranchPCRel32ToPtrJumpStubRelaxable for that), but the pointer location
+ /// BranchPCRel32ToPtrJumpStubBypassable for that), but the pointer location
/// target may be recorded to allow manipulation at runtime.
///
/// Fixup expression:
@@ -136,7 +159,8 @@ enum EdgeKind_x86_64 : Edge::Kind {
///
/// The edge kind has the same fixup expression as BranchPCRel32ToPtrJumpStub,
/// but identifies the call/branch as being to a pointer jump stub that may be
- /// bypassed if the ultimate target is within range of the fixup location.
+ /// bypassed with a direct jump to the ultimate target if the ultimate target
+ /// is within range of the fixup location.
///
/// Fixup expression:
/// Fixup <- Target - Fixup + Addend - 4: int32
@@ -145,7 +169,7 @@ enum EdgeKind_x86_64 : Edge::Kind {
/// - The result of the fixup expression must fit into an int32, otherwise
/// an out-of-range error will be returned.
///
- BranchPCRel32ToPtrJumpStubRelaxable,
+ BranchPCRel32ToPtrJumpStubBypassable,
/// A GOT entry getter/constructor, transformed to Delta32 pointing at the GOT
/// entry for the original target.
@@ -167,7 +191,62 @@ enum EdgeKind_x86_64 : Edge::Kind {
///
RequestGOTAndTransformToDelta32,
- /// A PC-relative reference to a GOT entry, relaxable if GOT entry target
+ /// A GOT entry getter/constructor, transformed to Delta64 pointing at the GOT
+ /// entry for the original target.
+ ///
+ /// Indicates that this edge should be transformed into a Delta64 targeting
+ /// the GOT entry for the edge's current target, maintaining the same addend.
+ /// A GOT entry for the target should be created if one does not already
+ /// exist.
+ ///
+ /// Edges of this kind are usually handled by a GOT builder pass inserted by
+ /// default.
+ ///
+ /// Fixup expression:
+ /// NONE
+ ///
+ /// Errors:
+ /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup
+ /// phase will result in an assert/unreachable during the fixup phase.
+ ///
+ RequestGOTAndTransformToDelta64,
+
+  /// A getter/constructor for the offset of a GOT entry within the GOT,
+  /// transformed to Delta64FromGOT pointing at the GOT entry for the
+  /// original target.
+  ///
+  /// Indicates that this edge should be transformed into a Delta64FromGOT
+  /// targeting the GOT entry for the edge's current target, maintaining the
+  /// same addend. A GOT entry for the target should be created if one does
+  /// not already exist.
+  ///
+  /// Edges of this kind are usually handled by a GOT builder pass inserted by
+  /// default.
+ ///
+ /// Fixup expression:
+ /// NONE
+ ///
+ /// Errors:
+ /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup
+ /// phase will result in an assert/unreachable during the fixup phase
+ RequestGOTAndTransformToDelta64FromGOT,
+
+ /// A PC-relative load of a GOT entry, relaxable if GOT entry target is
+ /// in-range of the fixup
+ ///
+ /// TODO: Explain the optimization
+ ///
+ /// Fixup expression
+ /// Fixup <- Target - (Fixup + 4) + Addend : int32
+ ///
+ /// Errors:
+ /// - The result of the fixup expression must fit into an int32, otherwise
+ /// an out-of-range error will be returned.
+ //
+ PCRel32GOTLoadRelaxable,
+
+ /// A PC-relative REX load of a GOT entry, relaxable if GOT entry target
/// is in-range of the fixup.
///
/// If the GOT entry target is in-range of the fixup then the load from the
@@ -180,17 +259,39 @@ enum EdgeKind_x86_64 : Edge::Kind {
/// - The result of the fixup expression must fit into an int32, otherwise
/// an out-of-range error will be returned.
///
- PCRel32GOTLoadRelaxable,
+ PCRel32GOTLoadREXRelaxable,
- /// A GOT entry getter/constructor, transformed to PCRel32ToGOTLoadRelaxable
- /// pointing at the GOT entry for the original target.
+ /// A GOT entry getter/constructor, transformed to
+ /// PCRel32ToGOTLoadREXRelaxable pointing at the GOT entry for the original
+ /// target.
///
- /// Indicates that this edge should be transformed into a
- /// PC32ToGOTLoadRelaxable targeting the GOT entry for the edge's current
- /// target, maintaining the same addend. A GOT entry for the target should be
- /// created if one does not already exist.
+ /// Indicates that this edge should be lowered to a PC32ToGOTLoadREXRelaxable
+ /// targeting the GOT entry for the edge's current target, maintaining the
+ /// same addend. A GOT entry for the target should be created if one does not
+ /// already exist.
///
- /// Edges of this kind are usually handled by a GOT builder pass inserted by
+ /// Edges of this kind are usually lowered by a GOT builder pass inserted by
+ /// default.
+ ///
+ /// Fixup expression:
+ /// NONE
+ ///
+ /// Errors:
+ /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup
+ /// phase will result in an assert/unreachable during the fixup phase.
+ ///
+ RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable,
+
+ /// A GOT entry getter/constructor, transformed to
+ /// PCRel32ToGOTLoadRelaxable pointing at the GOT entry for the original
+ /// target.
+ ///
+ /// Indicates that this edge should be lowered to a PC32ToGOTLoadRelaxable
+ /// targeting the GOT entry for the edge's current target, maintaining the
+ /// same addend. A GOT entry for the target should be created if one does not
+ /// already exist.
+ ///
+ /// Edges of this kind are usually lowered by a GOT builder pass inserted by
/// default.
///
/// Fixup expression:
@@ -202,10 +303,10 @@ enum EdgeKind_x86_64 : Edge::Kind {
///
RequestGOTAndTransformToPCRel32GOTLoadRelaxable,
- /// A PC-relative reference to a Thread Local Variable Pointer (TLVP) entry,
+ /// A PC-relative REX load of a Thread Local Variable Pointer (TLVP) entry,
/// relaxable if the TLVP entry target is in-range of the fixup.
///
- /// If the TLVP entry target is in-range of the fixup then the load frmo the
+ /// If the TLVP entry target is in-range of the fixup then the load from the
/// TLVP may be replaced with a direct memory address calculation.
///
/// The target of this edge must be a thread local variable entry of the form
@@ -222,15 +323,18 @@ enum EdgeKind_x86_64 : Edge::Kind {
/// - The target must be either external, or a TLV entry of the required
/// form, otherwise a malformed TLV entry error will be returned.
///
- PCRel32TLVPLoadRelaxable,
+ PCRel32TLVPLoadREXRelaxable,
+
+ /// TODO: Explain the generic edge kind
+ RequestTLSDescInGOTAndTransformToDelta32,
/// A TLVP entry getter/constructor, transformed to
- /// Delta32ToTLVPLoadRelaxable.
+ /// Delta32ToTLVPLoadREXRelaxable.
///
/// Indicates that this edge should be transformed into a
- /// Delta32ToTLVPLoadRelaxable targeting the TLVP entry for the edge's current
- /// target. A TLVP entry for the target should be created if one does not
- /// already exist.
+ /// Delta32ToTLVPLoadREXRelaxable targeting the TLVP entry for the edge's
+ /// current target. A TLVP entry for the target should be created if one does
+ /// not already exist.
///
/// Fixup expression:
/// NONE
@@ -239,7 +343,7 @@ enum EdgeKind_x86_64 : Edge::Kind {
/// - *ASSERTION* Failure to handle edges of this kind prior to the fixup
/// phase will result in an assert/unreachable during the fixup phase.
///
- RequestTLVPAndTransformToPCRel32TLVPLoadRelaxable
+ RequestTLVPAndTransformToPCRel32TLVPLoadREXRelaxable
};
/// Returns a string name for the given x86-64 edge. For debugging purposes
@@ -258,7 +362,8 @@ inline bool isInRangeForImmS32(int64_t Value) {
}
/// Apply fixup expression for edge to block content.
-inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E) {
+inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E,
+ const Symbol *GOTSymbol) {
using namespace support;
char *BlockWorkingMem = B.getAlreadyMutableContent().data();
@@ -281,12 +386,21 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E) {
return makeTargetOutOfRangeError(G, B, E);
break;
}
+ case Pointer32Signed: {
+ int64_t Value = E.getTarget().getAddress() + E.getAddend();
+ if (LLVM_LIKELY(isInRangeForImmS32(Value)))
+ *(little32_t *)FixupPtr = Value;
+ else
+ return makeTargetOutOfRangeError(G, B, E);
+ break;
+ }
case BranchPCRel32:
case BranchPCRel32ToPtrJumpStub:
- case BranchPCRel32ToPtrJumpStubRelaxable:
+ case BranchPCRel32ToPtrJumpStubBypassable:
case PCRel32GOTLoadRelaxable:
- case PCRel32TLVPLoadRelaxable: {
+ case PCRel32GOTLoadREXRelaxable:
+ case PCRel32TLVPLoadREXRelaxable: {
int64_t Value =
E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend();
if (LLVM_LIKELY(isInRangeForImmS32(Value)))
@@ -325,6 +439,13 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E) {
return makeTargetOutOfRangeError(G, B, E);
break;
}
+ case Delta64FromGOT: {
+ assert(GOTSymbol && "No GOT section symbol");
+ int64_t Value =
+ E.getTarget().getAddress() - GOTSymbol->getAddress() + E.getAddend();
+ *(little64_t *)FixupPtr = Value;
+ break;
+ }
default: {
// If you hit this you should check that *constructor and other non-fixup
@@ -395,6 +516,114 @@ inline Symbol &createAnonymousPointerJumpStub(LinkGraph &G,
false);
}
+/// Global Offset Table Builder.
+class GOTTableManager : public TableManager<GOTTableManager> {
+public:
+ static StringRef getSectionName() { return "$__GOT"; }
+
+ bool visitEdge(LinkGraph &G, Block *B, Edge &E) {
+ Edge::Kind KindToSet = Edge::Invalid;
+ switch (E.getKind()) {
+ case x86_64::Delta64FromGOT: {
+      // We need to make sure that the GOT section exists, but we don't
+      // otherwise need to fix up this edge.
+ getGOTSection(G);
+ return false;
+ }
+ case x86_64::RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable:
+ KindToSet = x86_64::PCRel32GOTLoadREXRelaxable;
+ break;
+ case x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable:
+ KindToSet = x86_64::PCRel32GOTLoadRelaxable;
+ break;
+ case x86_64::RequestGOTAndTransformToDelta64:
+ KindToSet = x86_64::Delta64;
+ break;
+ case x86_64::RequestGOTAndTransformToDelta64FromGOT:
+ KindToSet = x86_64::Delta64FromGOT;
+ break;
+ case x86_64::RequestGOTAndTransformToDelta32:
+ KindToSet = x86_64::Delta32;
+ break;
+ default:
+ return false;
+ }
+ assert(KindToSet != Edge::Invalid &&
+ "Fell through switch, but no new kind to set");
+ DEBUG_WITH_TYPE("jitlink", {
+ dbgs() << " Fixing " << G.getEdgeKindName(E.getKind()) << " edge at "
+ << formatv("{0:x}", B->getFixupAddress(E)) << " ("
+ << formatv("{0:x}", B->getAddress()) << " + "
+ << formatv("{0:x}", E.getOffset()) << ")\n";
+ });
+ E.setKind(KindToSet);
+ E.setTarget(getEntryForTarget(G, E.getTarget()));
+ return true;
+ }
+
+ Symbol &createEntry(LinkGraph &G, Symbol &Target) {
+ return createAnonymousPointer(G, getGOTSection(G), &Target);
+ }
+
+private:
+ Section &getGOTSection(LinkGraph &G) {
+ if (!GOTSection)
+ GOTSection = &G.createSection(getSectionName(), MemProt::Read);
+ return *GOTSection;
+ }
+
+ Section *GOTSection = nullptr;
+};
+
+/// Procedure Linkage Table Builder.
+class PLTTableManager : public TableManager<PLTTableManager> {
+public:
+ PLTTableManager(GOTTableManager &GOT) : GOT(GOT) {}
+
+ static StringRef getSectionName() { return "$__STUBS"; }
+
+ bool visitEdge(LinkGraph &G, Block *B, Edge &E) {
+ if (E.getKind() == x86_64::BranchPCRel32 && !E.getTarget().isDefined()) {
+ DEBUG_WITH_TYPE("jitlink", {
+ dbgs() << " Fixing " << G.getEdgeKindName(E.getKind()) << " edge at "
+ << formatv("{0:x}", B->getFixupAddress(E)) << " ("
+ << formatv("{0:x}", B->getAddress()) << " + "
+ << formatv("{0:x}", E.getOffset()) << ")\n";
+ });
+      // Set the edge kind to BranchPCRel32ToPtrJumpStubBypassable so that it
+      // can be optimized to a direct branch when the target is in-range.
+ E.setKind(x86_64::BranchPCRel32ToPtrJumpStubBypassable);
+ E.setTarget(getEntryForTarget(G, E.getTarget()));
+ return true;
+ }
+ return false;
+ }
+
+ Symbol &createEntry(LinkGraph &G, Symbol &Target) {
+ return createAnonymousPointerJumpStub(G, getStubsSection(G),
+ GOT.getEntryForTarget(G, Target));
+ }
+
+public:
+ Section &getStubsSection(LinkGraph &G) {
+ if (!PLTSection)
+ PLTSection =
+ &G.createSection(getSectionName(), MemProt::Read | MemProt::Exec);
+ return *PLTSection;
+ }
+
+ GOTTableManager &GOT;
+ Section *PLTSection = nullptr;
+};
+
+/// Optimize the GOT and stub relocations if the edge target address is in
+/// range:
+/// 1. PCRel32GOTLoadRelaxable: if the target is in range, replace the GOT
+///    load with a lea.
+/// 2. BranchPCRel32ToPtrJumpStubBypassable: if the target is in range,
+///    replace the indirect jump via the PLT stub with a direct jump to the
+///    target.
+Error optimizeGOTAndStubAccesses(LinkGraph &G);
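A hedged sketch of wiring the two table managers above into a link-graph pass, assuming the visitExistingEdges helper from JITLink.h (whose edge-visiting loop appears earlier in this patch), followed by the relaxation pass after layout. The free-function wrappers are illustrative only:

static Error buildTables_x86_64(LinkGraph &G) {
  x86_64::GOTTableManager GOT;
  x86_64::PLTTableManager PLT(GOT);
  // Routes every existing edge through GOT.visitEdge and PLT.visitEdge,
  // creating GOT entries and PLT stubs on demand.
  visitExistingEdges(G, GOT, PLT);
  return Error::success();
}

// After layout, edges whose targets turned out to be in range can be relaxed:
static Error relaxTables_x86_64(LinkGraph &G) {
  return x86_64::optimizeGOTAndStubAccesses(G);
}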
+
} // namespace x86_64
} // end namespace jitlink
} // end namespace llvm
diff --git a/llvm/include/llvm/ExecutionEngine/MCJIT.h b/llvm/include/llvm/ExecutionEngine/MCJIT.h
index 8253bf98963b..adce98f380c5 100644
--- a/llvm/include/llvm/ExecutionEngine/MCJIT.h
+++ b/llvm/include/llvm/ExecutionEngine/MCJIT.h
@@ -26,6 +26,9 @@ namespace {
// delete it all as dead code, even with whole program optimization,
// yet is effectively a NO-OP. As the compiler isn't smart enough
// to know that getenv() never returns -1, this will do the job.
+ // This is so that globals in the translation units where these functions
+ // are defined are forced to be initialized, populating various
+ // registries.
if (std::getenv("bar") != (char*) -1)
return;
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/llvm/include/llvm/ExecutionEngine/Orc/Core.h
index e832d8d57dfa..5cac65b49a05 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Core.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Core.h
@@ -21,7 +21,7 @@
#include "llvm/ExecutionEngine/JITSymbol.h"
#include "llvm/ExecutionEngine/Orc/ExecutorProcessControl.h"
#include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h"
-#include "llvm/ExecutionEngine/OrcV1Deprecation.h"
+#include "llvm/ExecutionEngine/Orc/TaskDispatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ExtensibleRTTI.h"
@@ -434,13 +434,16 @@ class SymbolsNotFound : public ErrorInfo<SymbolsNotFound> {
public:
static char ID;
- SymbolsNotFound(SymbolNameSet Symbols);
- SymbolsNotFound(SymbolNameVector Symbols);
+ SymbolsNotFound(std::shared_ptr<SymbolStringPool> SSP, SymbolNameSet Symbols);
+ SymbolsNotFound(std::shared_ptr<SymbolStringPool> SSP,
+ SymbolNameVector Symbols);
std::error_code convertToErrorCode() const override;
void log(raw_ostream &OS) const override;
+ std::shared_ptr<SymbolStringPool> getSymbolStringPool() { return SSP; }
const SymbolNameVector &getSymbols() const { return Symbols; }
private:
+ std::shared_ptr<SymbolStringPool> SSP;
SymbolNameVector Symbols;
};
@@ -449,12 +452,15 @@ class SymbolsCouldNotBeRemoved : public ErrorInfo<SymbolsCouldNotBeRemoved> {
public:
static char ID;
- SymbolsCouldNotBeRemoved(SymbolNameSet Symbols);
+ SymbolsCouldNotBeRemoved(std::shared_ptr<SymbolStringPool> SSP,
+ SymbolNameSet Symbols);
std::error_code convertToErrorCode() const override;
void log(raw_ostream &OS) const override;
+ std::shared_ptr<SymbolStringPool> getSymbolStringPool() { return SSP; }
const SymbolNameSet &getSymbols() const { return Symbols; }
private:
+ std::shared_ptr<SymbolStringPool> SSP;
SymbolNameSet Symbols;
};
@@ -466,13 +472,17 @@ class MissingSymbolDefinitions : public ErrorInfo<MissingSymbolDefinitions> {
public:
static char ID;
- MissingSymbolDefinitions(std::string ModuleName, SymbolNameVector Symbols)
- : ModuleName(std::move(ModuleName)), Symbols(std::move(Symbols)) {}
+ MissingSymbolDefinitions(std::shared_ptr<SymbolStringPool> SSP,
+ std::string ModuleName, SymbolNameVector Symbols)
+ : SSP(std::move(SSP)), ModuleName(std::move(ModuleName)),
+ Symbols(std::move(Symbols)) {}
std::error_code convertToErrorCode() const override;
void log(raw_ostream &OS) const override;
+ std::shared_ptr<SymbolStringPool> getSymbolStringPool() { return SSP; }
const std::string &getModuleName() const { return ModuleName; }
const SymbolNameVector &getSymbols() const { return Symbols; }
private:
+ std::shared_ptr<SymbolStringPool> SSP;
std::string ModuleName;
SymbolNameVector Symbols;
};
@@ -485,13 +495,17 @@ class UnexpectedSymbolDefinitions : public ErrorInfo<UnexpectedSymbolDefinitions
public:
static char ID;
- UnexpectedSymbolDefinitions(std::string ModuleName, SymbolNameVector Symbols)
- : ModuleName(std::move(ModuleName)), Symbols(std::move(Symbols)) {}
+ UnexpectedSymbolDefinitions(std::shared_ptr<SymbolStringPool> SSP,
+ std::string ModuleName, SymbolNameVector Symbols)
+ : SSP(std::move(SSP)), ModuleName(std::move(ModuleName)),
+ Symbols(std::move(Symbols)) {}
std::error_code convertToErrorCode() const override;
void log(raw_ostream &OS) const override;
+ std::shared_ptr<SymbolStringPool> getSymbolStringPool() { return SSP; }
const std::string &getModuleName() const { return ModuleName; }
const SymbolNameVector &getSymbols() const { return Symbols; }
private:
+ std::shared_ptr<SymbolStringPool> SSP;
std::string ModuleName;
SymbolNameVector Symbols;
};
@@ -1241,21 +1255,6 @@ public:
const DenseMap<JITDylib *, SymbolLookupSet> &InitSyms);
};
-/// Represents an abstract task for ORC to run.
-class Task : public RTTIExtends<Task, RTTIRoot> {
-public:
- static char ID;
-
- /// Description of the task to be performed. Used for logging.
- virtual void printDescription(raw_ostream &OS) = 0;
-
- /// Run the task.
- virtual void run() = 0;
-
-private:
- void anchor() override;
-};
-
/// A materialization task.
class MaterializationTask : public RTTIExtends<MaterializationTask, Task> {
public:
@@ -1285,13 +1284,16 @@ public:
/// For reporting errors.
using ErrorReporter = std::function<void(Error)>;
+ /// Send a result to the remote.
+ using SendResultFunction = unique_function<void(shared::WrapperFunctionResult)>;
+
/// For dispatching ORC tasks (typically materialization tasks).
using DispatchTaskFunction = unique_function<void(std::unique_ptr<Task> T)>;
/// An asynchronous wrapper-function callable from the executor via
/// jit-dispatch.
using JITDispatchHandlerFunction = unique_function<void(
- ExecutorProcessControl::SendResultFunction SendResult,
+ SendResultFunction SendResult,
const char *ArgData, size_t ArgSize)>;
/// A map associating tag names with asynchronous wrapper function
@@ -1303,13 +1305,19 @@ public:
/// object.
ExecutionSession(std::unique_ptr<ExecutorProcessControl> EPC);
- /// End the session. Closes all JITDylibs.
+ /// End the session. Closes all JITDylibs and disconnects from the
+ /// executor.
Error endSession();
/// Get the ExecutorProcessControl object associated with this
/// ExecutionSession.
ExecutorProcessControl &getExecutorProcessControl() { return *EPC; }
+ /// Get the SymbolStringPool for this instance.
+ std::shared_ptr<SymbolStringPool> getSymbolStringPool() {
+ return EPC->getSymbolStringPool();
+ }
+
/// Add a symbol name to the SymbolStringPool and return a pointer to it.
SymbolStringPtr intern(StringRef SymName) { return EPC->intern(SymName); }
@@ -1462,10 +1470,9 @@ public:
/// \endcode{.cpp}
///
/// The given OnComplete function will be called to return the result.
- void callWrapperAsync(ExecutorProcessControl::SendResultFunction OnComplete,
- JITTargetAddress WrapperFnAddr,
- ArrayRef<char> ArgBuffer) {
- EPC->callWrapperAsync(std::move(OnComplete), WrapperFnAddr, ArgBuffer);
+ template <typename... ArgTs>
+ void callWrapperAsync(ArgTs &&... Args) {
+ EPC->callWrapperAsync(std::forward<ArgTs>(Args)...);
}
/// Run a wrapper function in the executor. The wrapper function should be
@@ -1474,30 +1481,18 @@ public:
/// \code{.cpp}
/// CWrapperFunctionResult fn(uint8_t *Data, uint64_t Size);
/// \endcode{.cpp}
- shared::WrapperFunctionResult callWrapper(JITTargetAddress WrapperFnAddr,
+ shared::WrapperFunctionResult callWrapper(ExecutorAddr WrapperFnAddr,
ArrayRef<char> ArgBuffer) {
- std::promise<shared::WrapperFunctionResult> RP;
- auto RF = RP.get_future();
- callWrapperAsync(
- [&](shared::WrapperFunctionResult R) { RP.set_value(std::move(R)); },
- WrapperFnAddr, ArgBuffer);
- return RF.get();
+ return EPC->callWrapper(WrapperFnAddr, ArgBuffer);
}
/// Run a wrapper function using SPS to serialize the arguments and
/// deserialize the results.
template <typename SPSSignature, typename SendResultT, typename... ArgTs>
- void callSPSWrapperAsync(SendResultT &&SendResult,
- JITTargetAddress WrapperFnAddr,
+ void callSPSWrapperAsync(ExecutorAddr WrapperFnAddr, SendResultT &&SendResult,
const ArgTs &...Args) {
- shared::WrapperFunction<SPSSignature>::callAsync(
- [this,
- WrapperFnAddr](ExecutorProcessControl::SendResultFunction SendResult,
- const char *ArgData, size_t ArgSize) {
- callWrapperAsync(std::move(SendResult), WrapperFnAddr,
- ArrayRef<char>(ArgData, ArgSize));
- },
- std::move(SendResult), Args...);
+ EPC->callSPSWrapperAsync<SPSSignature, SendResultT, ArgTs...>(
+ WrapperFnAddr, std::forward<SendResultT>(SendResult), Args...);
}
/// Run a wrapper function using SPS to serialize the arguments and
@@ -1506,13 +1501,10 @@ public:
/// If SPSSignature is a non-void function signature then the second argument
/// (the first in the Args list) should be a reference to a return value.
template <typename SPSSignature, typename... WrapperCallArgTs>
- Error callSPSWrapper(JITTargetAddress WrapperFnAddr,
+ Error callSPSWrapper(ExecutorAddr WrapperFnAddr,
WrapperCallArgTs &&...WrapperCallArgs) {
- return shared::WrapperFunction<SPSSignature>::call(
- [this, WrapperFnAddr](const char *ArgData, size_t ArgSize) {
- return callWrapper(WrapperFnAddr, ArrayRef<char>(ArgData, ArgSize));
- },
- std::forward<WrapperCallArgTs>(WrapperCallArgs)...);
+ return EPC->callSPSWrapper<SPSSignature, WrapperCallArgTs...>(
+ WrapperFnAddr, std::forward<WrapperCallArgTs>(WrapperCallArgs)...);
}
/// Wrap a handler that takes concrete argument types (and a sender for a
@@ -1525,7 +1517,7 @@ public:
template <typename SPSSignature, typename HandlerT>
static JITDispatchHandlerFunction wrapAsyncWithSPS(HandlerT &&H) {
return [H = std::forward<HandlerT>(H)](
- ExecutorProcessControl::SendResultFunction SendResult,
+ SendResultFunction SendResult,
const char *ArgData, size_t ArgSize) mutable {
shared::WrapperFunction<SPSSignature>::handleAsync(ArgData, ArgSize, H,
std::move(SendResult));
@@ -1564,7 +1556,7 @@ public:
/// This should be called by the ExecutorProcessControl instance in response
/// to incoming jit-dispatch requests from the executor.
void
- runJITDispatchHandler(ExecutorProcessControl::SendResultFunction SendResult,
+ runJITDispatchHandler(SendResultFunction SendResult,
JITTargetAddress HandlerFnTagAddr,
ArrayRef<char> ArgBuffer);
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/DebuggerSupportPlugin.h b/llvm/include/llvm/ExecutionEngine/Orc/DebuggerSupportPlugin.h
new file mode 100644
index 000000000000..af092b3287d3
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/DebuggerSupportPlugin.h
@@ -0,0 +1,64 @@
+//===-- DebuggerSupportPlugin.h -- Utils for debugger support ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Generates debug objects and registers them using the jit-loader-gdb protocol.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_DEBUGGERSUPPORT_H
+#define LLVM_EXECUTIONENGINE_ORC_DEBUGGERSUPPORT_H
+
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h"
+#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+
+namespace llvm {
+namespace orc {
+
+/// For each object containing debug info, installs JITLink passes to synthesize
+/// a debug object and then register it via the GDB JIT-registration interface.
+///
+/// Currently MachO only. For ELF use DebugObjectManagerPlugin. These two
+/// plugins will be merged in the near future.
+class GDBJITDebugInfoRegistrationPlugin : public ObjectLinkingLayer::Plugin {
+public:
+ class DebugSectionSynthesizer {
+ public:
+ virtual ~DebugSectionSynthesizer() {}
+ virtual Error startSynthesis() = 0;
+ virtual Error completeSynthesisAndRegister() = 0;
+ };
+
+ static Expected<std::unique_ptr<GDBJITDebugInfoRegistrationPlugin>>
+ Create(ExecutionSession &ES, JITDylib &ProcessJD, const Triple &TT);
+
+ GDBJITDebugInfoRegistrationPlugin(ExecutorAddr RegisterActionAddr)
+ : RegisterActionAddr(RegisterActionAddr) {}
+
+ Error notifyFailed(MaterializationResponsibility &MR) override;
+ Error notifyRemovingResources(ResourceKey K) override;
+
+ void notifyTransferringResources(ResourceKey DstKey,
+ ResourceKey SrcKey) override;
+
+ void modifyPassConfig(MaterializationResponsibility &MR,
+ jitlink::LinkGraph &LG,
+ jitlink::PassConfiguration &PassConfig) override;
+
+private:
+ void modifyPassConfigForMachO(MaterializationResponsibility &MR,
+ jitlink::LinkGraph &LG,
+ jitlink::PassConfiguration &PassConfig);
+
+ ExecutorAddr RegisterActionAddr;
+};
+
+} // namespace orc
+} // namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_DEBUGGERSUPPORT_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h
new file mode 100644
index 000000000000..20da3e3b89eb
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h
@@ -0,0 +1,330 @@
+//===-- ELFNixPlatform.h -- Utilities for executing ELF in Orc --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Linux/BSD support for executing JIT'd ELF in Orc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_ELFNIXPLATFORM_H
+#define LLVM_EXECUTIONENGINE_ORC_ELFNIXPLATFORM_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/ExecutorProcessControl.h"
+#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
+
+#include <future>
+#include <thread>
+#include <vector>
+
+namespace llvm {
+namespace orc {
+
+struct ELFPerObjectSectionsToRegister {
+ ExecutorAddrRange EHFrameSection;
+ ExecutorAddrRange ThreadDataSection;
+};
+
+struct ELFNixJITDylibInitializers {
+ using SectionList = std::vector<ExecutorAddrRange>;
+
+ ELFNixJITDylibInitializers(std::string Name, ExecutorAddr DSOHandleAddress)
+ : Name(std::move(Name)), DSOHandleAddress(std::move(DSOHandleAddress)) {}
+
+ std::string Name;
+ ExecutorAddr DSOHandleAddress;
+
+ StringMap<SectionList> InitSections;
+};
+
+class ELFNixJITDylibDeinitializers {};
+
+using ELFNixJITDylibInitializerSequence =
+ std::vector<ELFNixJITDylibInitializers>;
+
+using ELFNixJITDylibDeinitializerSequence =
+ std::vector<ELFNixJITDylibDeinitializers>;
+
+/// Mediates between ELFNix initialization and ExecutionSession state.
+class ELFNixPlatform : public Platform {
+public:
+  /// Try to create an ELFNixPlatform instance, adding the ORC runtime to the
+ /// given JITDylib.
+ ///
+ /// The ORC runtime requires access to a number of symbols in
+  /// libc++. It is up to the caller to ensure that the required
+ /// symbols can be referenced by code added to PlatformJD. The
+ /// standard way to achieve this is to first attach dynamic library
+ /// search generators for either the given process, or for the
+ /// specific required libraries, to PlatformJD, then to create the
+ /// platform instance:
+ ///
+ /// \code{.cpp}
+ /// auto &PlatformJD = ES.createBareJITDylib("stdlib");
+ /// PlatformJD.addGenerator(
+ /// ExitOnErr(EPCDynamicLibrarySearchGenerator
+ /// ::GetForTargetProcess(EPC)));
+ /// ES.setPlatform(
+ /// ExitOnErr(ELFNixPlatform::Create(ES, ObjLayer, EPC, PlatformJD,
+ /// "/path/to/orc/runtime")));
+ /// \endcode
+ ///
+ /// Alternatively, these symbols could be added to another JITDylib that
+ /// PlatformJD links against.
+ ///
+ /// Clients are also responsible for ensuring that any JIT'd code that
+ /// depends on runtime functions (including any code using TLV or static
+ /// destructors) can reference the runtime symbols. This is usually achieved
+ /// by linking any JITDylibs containing regular code against
+ /// PlatformJD.
+ ///
+ /// By default, ELFNixPlatform will add the set of aliases returned by the
+ /// standardPlatformAliases function. This includes both required aliases
+ /// (e.g. __cxa_atexit -> __orc_rt_elf_cxa_atexit for static destructor
+ /// support), and optional aliases that provide JIT versions of common
+ /// functions (e.g. dlopen -> __orc_rt_elf_jit_dlopen). Clients can
+ /// override these defaults by passing a non-None value for the
+ /// RuntimeAliases function, in which case the client is responsible for
+ /// setting up all aliases (including the required ones).
+ static Expected<std::unique_ptr<ELFNixPlatform>>
+ Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer,
+ JITDylib &PlatformJD, const char *OrcRuntimePath,
+ Optional<SymbolAliasMap> RuntimeAliases = None);
+
+ ExecutionSession &getExecutionSession() const { return ES; }
+ ObjectLinkingLayer &getObjectLinkingLayer() const { return ObjLinkingLayer; }
+
+ Error setupJITDylib(JITDylib &JD) override;
+ Error notifyAdding(ResourceTracker &RT,
+ const MaterializationUnit &MU) override;
+ Error notifyRemoving(ResourceTracker &RT) override;
+
+ /// Returns an AliasMap containing the default aliases for the ELFNixPlatform.
+ /// This can be modified by clients when constructing the platform to add
+ /// or remove aliases.
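+ ///
+ /// A minimal usage sketch (the names, path, and error handling below are
+ /// illustrative only, not part of this API):
+ ///
+ /// \code{.cpp}
+ ///   auto Aliases = ELFNixPlatform::standardPlatformAliases(ES);
+ ///   // ... client-specific additions/removals on Aliases here ...
+ ///   ES.setPlatform(
+ ///     ExitOnErr(ELFNixPlatform::Create(ES, ObjLayer, PlatformJD,
+ ///                                      "/path/to/orc/runtime",
+ ///                                      std::move(Aliases))));
+ /// \endcode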
+ static SymbolAliasMap standardPlatformAliases(ExecutionSession &ES);
+
+ /// Returns the array of required CXX aliases.
+ static ArrayRef<std::pair<const char *, const char *>> requiredCXXAliases();
+
+ /// Returns the array of standard runtime utility aliases for ELF.
+ static ArrayRef<std::pair<const char *, const char *>>
+ standardRuntimeUtilityAliases();
+
+ /// Returns true if the given section name is an initializer section.
+ static bool isInitializerSection(StringRef SecName);
+
+private:
+ // The ELFNixPlatformPlugin scans/modifies LinkGraphs to support ELF
+ // platform features including initializers, exceptions, TLV, and language
+ // runtime registration.
+ class ELFNixPlatformPlugin : public ObjectLinkingLayer::Plugin {
+ public:
+ ELFNixPlatformPlugin(ELFNixPlatform &MP) : MP(MP) {}
+
+ void modifyPassConfig(MaterializationResponsibility &MR,
+ jitlink::LinkGraph &G,
+ jitlink::PassConfiguration &Config) override;
+
+ SyntheticSymbolDependenciesMap
+ getSyntheticSymbolDependencies(MaterializationResponsibility &MR) override;
+
+ // FIXME: We should be tentatively tracking scraped sections and discarding
+ // if the MR fails.
+ Error notifyFailed(MaterializationResponsibility &MR) override {
+ return Error::success();
+ }
+
+ Error notifyRemovingResources(ResourceKey K) override {
+ return Error::success();
+ }
+
+ void notifyTransferringResources(ResourceKey DstKey,
+ ResourceKey SrcKey) override {}
+
+ private:
+ using InitSymbolDepMap =
+ DenseMap<MaterializationResponsibility *, JITLinkSymbolSet>;
+
+ void addInitializerSupportPasses(MaterializationResponsibility &MR,
+ jitlink::PassConfiguration &Config);
+
+ void addDSOHandleSupportPasses(MaterializationResponsibility &MR,
+ jitlink::PassConfiguration &Config);
+
+ void addEHAndTLVSupportPasses(MaterializationResponsibility &MR,
+ jitlink::PassConfiguration &Config);
+
+ Error preserveInitSections(jitlink::LinkGraph &G,
+ MaterializationResponsibility &MR);
+
+ Error registerInitSections(jitlink::LinkGraph &G, JITDylib &JD);
+
+ Error fixTLVSectionsAndEdges(jitlink::LinkGraph &G, JITDylib &JD);
+
+ std::mutex PluginMutex;
+ ELFNixPlatform &MP;
+ InitSymbolDepMap InitSymbolDeps;
+ };
+
+ using SendInitializerSequenceFn =
+ unique_function<void(Expected<ELFNixJITDylibInitializerSequence>)>;
+
+ using SendDeinitializerSequenceFn =
+ unique_function<void(Expected<ELFNixJITDylibDeinitializerSequence>)>;
+
+ using SendSymbolAddressFn = unique_function<void(Expected<ExecutorAddr>)>;
+
+ static bool supportedTarget(const Triple &TT);
+
+ ELFNixPlatform(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer,
+ JITDylib &PlatformJD,
+ std::unique_ptr<DefinitionGenerator> OrcRuntimeGenerator,
+ Error &Err);
+
+ // Associate ELFNixPlatform JIT-side runtime support functions with handlers.
+ Error associateRuntimeSupportFunctions(JITDylib &PlatformJD);
+
+ void getInitializersBuildSequencePhase(SendInitializerSequenceFn SendResult,
+ JITDylib &JD,
+ std::vector<JITDylibSP> DFSLinkOrder);
+
+ void getInitializersLookupPhase(SendInitializerSequenceFn SendResult,
+ JITDylib &JD);
+
+ void rt_getInitializers(SendInitializerSequenceFn SendResult,
+ StringRef JDName);
+
+ void rt_getDeinitializers(SendDeinitializerSequenceFn SendResult,
+ ExecutorAddr Handle);
+
+ void rt_lookupSymbol(SendSymbolAddressFn SendResult, ExecutorAddr Handle,
+ StringRef SymbolName);
+
+ // Records the addresses of runtime symbols used by the platform.
+ Error bootstrapELFNixRuntime(JITDylib &PlatformJD);
+
+ Error registerInitInfo(JITDylib &JD,
+ ArrayRef<jitlink::Section *> InitSections);
+
+ Error registerPerObjectSections(const ELFPerObjectSectionsToRegister &POSR);
+
+ Expected<uint64_t> createPThreadKey();
+
+ ExecutionSession &ES;
+ ObjectLinkingLayer &ObjLinkingLayer;
+
+ SymbolStringPtr DSOHandleSymbol;
+ std::atomic<bool> RuntimeBootstrapped{false};
+
+ ExecutorAddr orc_rt_elfnix_platform_bootstrap;
+ ExecutorAddr orc_rt_elfnix_platform_shutdown;
+ ExecutorAddr orc_rt_elfnix_register_object_sections;
+ ExecutorAddr orc_rt_elfnix_create_pthread_key;
+
+ DenseMap<JITDylib *, SymbolLookupSet> RegisteredInitSymbols;
+
+ // InitSeqs gets its own mutex to avoid locking the whole session when
+ // aggregating data from JITLink.
+ std::mutex PlatformMutex;
+ DenseMap<JITDylib *, ELFNixJITDylibInitializers> InitSeqs;
+ std::vector<ELFPerObjectSectionsToRegister> BootstrapPOSRs;
+
+ DenseMap<JITTargetAddress, JITDylib *> HandleAddrToJITDylib;
+ DenseMap<JITDylib *, uint64_t> JITDylibToPThreadKey;
+};
+
+namespace shared {
+
+using SPSELFPerObjectSectionsToRegister =
+ SPSTuple<SPSExecutorAddrRange, SPSExecutorAddrRange>;
+
+template <>
+class SPSSerializationTraits<SPSELFPerObjectSectionsToRegister,
+ ELFPerObjectSectionsToRegister> {
+
+public:
+ static size_t size(const ELFPerObjectSectionsToRegister &MOPOSR) {
+ return SPSELFPerObjectSectionsToRegister::AsArgList::size(
+ MOPOSR.EHFrameSection, MOPOSR.ThreadDataSection);
+ }
+
+ static bool serialize(SPSOutputBuffer &OB,
+ const ELFPerObjectSectionsToRegister &MOPOSR) {
+ return SPSELFPerObjectSectionsToRegister::AsArgList::serialize(
+ OB, MOPOSR.EHFrameSection, MOPOSR.ThreadDataSection);
+ }
+
+ static bool deserialize(SPSInputBuffer &IB,
+ ELFPerObjectSectionsToRegister &MOPOSR) {
+ return SPSELFPerObjectSectionsToRegister::AsArgList::deserialize(
+ IB, MOPOSR.EHFrameSection, MOPOSR.ThreadDataSection);
+ }
+};
+
+using SPSNamedExecutorAddrRangeSequenceMap =
+ SPSSequence<SPSTuple<SPSString, SPSExecutorAddrRangeSequence>>;
+
+using SPSELFNixJITDylibInitializers =
+ SPSTuple<SPSString, SPSExecutorAddr, SPSNamedExecutorAddrRangeSequenceMap>;
+
+using SPSELFNixJITDylibInitializerSequence =
+ SPSSequence<SPSELFNixJITDylibInitializers>;
+
+/// Serialization traits for ELFNixJITDylibInitializers.
+template <>
+class SPSSerializationTraits<SPSELFNixJITDylibInitializers,
+ ELFNixJITDylibInitializers> {
+public:
+ static size_t size(const ELFNixJITDylibInitializers &MOJDIs) {
+ return SPSELFNixJITDylibInitializers::AsArgList::size(
+ MOJDIs.Name, MOJDIs.DSOHandleAddress, MOJDIs.InitSections);
+ }
+
+ static bool serialize(SPSOutputBuffer &OB,
+ const ELFNixJITDylibInitializers &MOJDIs) {
+ return SPSELFNixJITDylibInitializers::AsArgList::serialize(
+ OB, MOJDIs.Name, MOJDIs.DSOHandleAddress, MOJDIs.InitSections);
+ }
+
+ static bool deserialize(SPSInputBuffer &IB,
+ ELFNixJITDylibInitializers &MOJDIs) {
+ return SPSELFNixJITDylibInitializers::AsArgList::deserialize(
+ IB, MOJDIs.Name, MOJDIs.DSOHandleAddress, MOJDIs.InitSections);
+ }
+};
+
+using SPSELFJITDylibDeinitializers = SPSEmpty;
+
+using SPSELFJITDylibDeinitializerSequence =
+ SPSSequence<SPSELFJITDylibDeinitializers>;
+
+template <>
+class SPSSerializationTraits<SPSELFJITDylibDeinitializers,
+ ELFNixJITDylibDeinitializers> {
+public:
+ static size_t size(const ELFNixJITDylibDeinitializers &MOJDDs) { return 0; }
+
+ static bool serialize(SPSOutputBuffer &OB,
+ const ELFNixJITDylibDeinitializers &MOJDDs) {
+ return true;
+ }
+
+ static bool deserialize(SPSInputBuffer &IB,
+ ELFNixJITDylibDeinitializers &MOJDDs) {
+ MOJDDs = ELFNixJITDylibDeinitializers();
+ return true;
+ }
+};
+
+} // end namespace shared
+} // end namespace orc
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_ELFNIXPLATFORM_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h
index 410a202b3296..940d0d28ae83 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h
@@ -14,6 +14,7 @@
#define LLVM_EXECUTIONENGINE_ORC_EPCDEBUGOBJECTREGISTRAR_H
#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
#include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/Memory.h"
@@ -32,7 +33,7 @@ class ExecutionSession;
/// Abstract interface for registering debug objects in the executor process.
class DebugObjectRegistrar {
public:
- virtual Error registerDebugObject(sys::MemoryBlock) = 0;
+ virtual Error registerDebugObject(ExecutorAddrRange TargetMem) = 0;
virtual ~DebugObjectRegistrar() {}
};
@@ -40,14 +41,14 @@ public:
/// executor process.
class EPCDebugObjectRegistrar : public DebugObjectRegistrar {
public:
- EPCDebugObjectRegistrar(ExecutionSession &ES, JITTargetAddress RegisterFn)
+ EPCDebugObjectRegistrar(ExecutionSession &ES, ExecutorAddr RegisterFn)
: ES(ES), RegisterFn(RegisterFn) {}
- Error registerDebugObject(sys::MemoryBlock TargetMem) override;
+ Error registerDebugObject(ExecutorAddrRange TargetMem) override;
private:
ExecutionSession &ES;
- JITTargetAddress RegisterFn;
+ ExecutorAddr RegisterFn;
};
/// Create a ExecutorProcessControl-based DebugObjectRegistrar that emits debug
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCEHFrameRegistrar.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCEHFrameRegistrar.h
index 8cd6e9319a28..6d113a7bdf1a 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/EPCEHFrameRegistrar.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/EPCEHFrameRegistrar.h
@@ -14,6 +14,7 @@
#define LLVM_EXECUTIONENGINE_ORC_EPCEHFRAMEREGISTRAR_H
#include "llvm/ExecutionEngine/JITLink/EHFrameSupport.h"
+#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
namespace llvm {
namespace orc {
@@ -33,8 +34,8 @@ public:
/// Create a EPCEHFrameRegistrar with the given ExecutorProcessControl
/// object and registration/deregistration function addresses.
EPCEHFrameRegistrar(ExecutionSession &ES,
- JITTargetAddress RegisterEHFrameWrapperFnAddr,
- JITTargetAddress DeregisterEHFRameWrapperFnAddr)
+ ExecutorAddr RegisterEHFrameWrapperFnAddr,
+ ExecutorAddr DeregisterEHFRameWrapperFnAddr)
: ES(ES), RegisterEHFrameWrapperFnAddr(RegisterEHFrameWrapperFnAddr),
DeregisterEHFrameWrapperFnAddr(DeregisterEHFRameWrapperFnAddr) {}
@@ -45,8 +46,8 @@ public:
private:
ExecutionSession &ES;
- JITTargetAddress RegisterEHFrameWrapperFnAddr;
- JITTargetAddress DeregisterEHFrameWrapperFnAddr;
+ ExecutorAddr RegisterEHFrameWrapperFnAddr;
+ ExecutorAddr DeregisterEHFrameWrapperFnAddr;
};
} // end namespace orc
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericDylibManager.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericDylibManager.h
new file mode 100644
index 000000000000..02e580c86f54
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericDylibManager.h
@@ -0,0 +1,67 @@
+//===- EPCGenericDylibManager.h -- Generic EPC Dylib management -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements dylib loading and searching by making calls to
+// ExecutorProcessControl::callWrapper.
+//
+// This simplifies the implementation of new ExecutorProcessControl instances,
+// as this implementation will always work (at the cost of some performance
+// overhead for the calls).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_EPCGENERICDYLIBMANAGER_H
+#define LLVM_EXECUTIONENGINE_ORC_EPCGENERICDYLIBMANAGER_H
+
+#include "llvm/ExecutionEngine/Orc/ExecutorProcessControl.h"
+#include "llvm/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.h"
+
+namespace llvm {
+namespace orc {
+
+class SymbolLookupSet;
+
+class EPCGenericDylibManager {
+public:
+ /// Function addresses for dylib management.
+ struct SymbolAddrs {
+ ExecutorAddr Instance;
+ ExecutorAddr Open;
+ ExecutorAddr Lookup;
+ };
+
+ /// Create an EPCGenericDylibManager instance using the default bootstrap
+ /// symbol names to find the required function addresses in the executor.
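+ ///
+ /// A usage sketch (the library path, mode value, and error handling below
+ /// are illustrative assumptions):
+ ///
+ /// \code{.cpp}
+ ///   auto DylibMgr = cantFail(
+ ///       EPCGenericDylibManager::CreateWithDefaultBootstrapSymbols(EPC));
+ ///   auto H = cantFail(DylibMgr.open("/usr/lib/libm.so", 0));
+ ///   SymbolLookupSet Syms;
+ ///   Syms.add(EPC.getSymbolStringPool()->intern("sin"));
+ ///   auto Addrs = cantFail(DylibMgr.lookup(H, Syms));
+ /// \endcode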
+ static Expected<EPCGenericDylibManager>
+ CreateWithDefaultBootstrapSymbols(ExecutorProcessControl &EPC);
+
+ /// Create an EPCGenericDylibManager instance from a given set of
+ /// function addrs.
+ EPCGenericDylibManager(ExecutorProcessControl &EPC, SymbolAddrs SAs)
+ : EPC(EPC), SAs(SAs) {}
+
+ /// Loads the dylib at the given path.
+ Expected<tpctypes::DylibHandle> open(StringRef Path, uint64_t Mode);
+
+ /// Looks up symbols within the given dylib.
+ Expected<std::vector<ExecutorAddr>> lookup(tpctypes::DylibHandle H,
+ const SymbolLookupSet &Lookup);
+
+ /// Looks up symbols within the given dylib.
+ Expected<std::vector<ExecutorAddr>>
+ lookup(tpctypes::DylibHandle H, const RemoteSymbolLookupSet &Lookup);
+
+private:
+ ExecutorProcessControl &EPC;
+ SymbolAddrs SAs;
+};
+
+} // end namespace orc
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_EPCGENERICDYLIBMANAGER_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.h
new file mode 100644
index 000000000000..b9825f17ec17
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.h
@@ -0,0 +1,97 @@
+//===- EPCGenericJITLinkMemoryManager.h - EPC-based mem manager -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements JITLinkMemoryManager by making remote calls via
+// ExecutorProcessControl::callWrapperAsync.
+//
+// This simplifies the implementation of new ExecutorProcessControl instances,
+// as this implementation will always work (at the cost of some performance
+// overhead for the calls).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_EPCGENERICJITLINKMEMORYMANAGER_H
+#define LLVM_EXECUTIONENGINE_ORC_EPCGENERICJITLINKMEMORYMANAGER_H
+
+#include "llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
+
+namespace llvm {
+namespace orc {
+
+class EPCGenericJITLinkMemoryManager : public jitlink::JITLinkMemoryManager {
+public:
+ /// Function addresses for memory access.
+ struct SymbolAddrs {
+ ExecutorAddr Allocator;
+ ExecutorAddr Reserve;
+ ExecutorAddr Finalize;
+ ExecutorAddr Deallocate;
+ };
+
+ /// Create an EPCGenericJITLinkMemoryManager instance from a given set of
+ /// function addrs.
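+ ///
+ /// A construction sketch; the bootstrap symbol names below are placeholders
+ /// chosen for illustration, not the names registered by the ORC runtime:
+ ///
+ /// \code{.cpp}
+ ///   EPCGenericJITLinkMemoryManager::SymbolAddrs SAs;
+ ///   if (auto Err = EPC.getBootstrapSymbols(
+ ///           {{SAs.Allocator, "__example_allocator_instance"},
+ ///            {SAs.Reserve, "__example_reserve"},
+ ///            {SAs.Finalize, "__example_finalize"},
+ ///            {SAs.Deallocate, "__example_deallocate"}}))
+ ///     return std::move(Err);
+ ///   auto MemMgr =
+ ///       std::make_unique<EPCGenericJITLinkMemoryManager>(EPC, SAs);
+ /// \endcode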
+ EPCGenericJITLinkMemoryManager(ExecutorProcessControl &EPC, SymbolAddrs SAs)
+ : EPC(EPC), SAs(SAs) {}
+
+ void allocate(const jitlink::JITLinkDylib *JD, jitlink::LinkGraph &G,
+ OnAllocatedFunction OnAllocated) override;
+
+ // Use overloads from base class.
+ using JITLinkMemoryManager::allocate;
+
+ void deallocate(std::vector<FinalizedAlloc> Allocs,
+ OnDeallocatedFunction OnDeallocated) override;
+
+ // Use overloads from base class.
+ using JITLinkMemoryManager::deallocate;
+
+private:
+ class InFlightAlloc;
+
+ void completeAllocation(ExecutorAddr AllocAddr, jitlink::BasicLayout BL,
+ OnAllocatedFunction OnAllocated);
+
+ ExecutorProcessControl &EPC;
+ SymbolAddrs SAs;
+};
+
+namespace shared {
+
+/// FIXME: This specialization should be moved into TargetProcessControlTypes.h
+/// (or wherever those types get merged to) once ORC depends on JITLink.
+template <>
+class SPSSerializationTraits<SPSExecutorAddr,
+ jitlink::JITLinkMemoryManager::FinalizedAlloc> {
+public:
+ static size_t size(const jitlink::JITLinkMemoryManager::FinalizedAlloc &FA) {
+ return SPSArgList<SPSExecutorAddr>::size(ExecutorAddr(FA.getAddress()));
+ }
+
+ static bool
+ serialize(SPSOutputBuffer &OB,
+ const jitlink::JITLinkMemoryManager::FinalizedAlloc &FA) {
+ return SPSArgList<SPSExecutorAddr>::serialize(
+ OB, ExecutorAddr(FA.getAddress()));
+ }
+
+ static bool deserialize(SPSInputBuffer &IB,
+ jitlink::JITLinkMemoryManager::FinalizedAlloc &FA) {
+ ExecutorAddr A;
+ if (!SPSArgList<SPSExecutorAddr>::deserialize(IB, A))
+ return false;
+ FA = jitlink::JITLinkMemoryManager::FinalizedAlloc(A.getValue());
+ return true;
+ }
+};
+
+} // end namespace shared
+} // end namespace orc
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_EPCGENERICJITLINKMEMORYMANAGER_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericMemoryAccess.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericMemoryAccess.h
new file mode 100644
index 000000000000..8c1d457d06ab
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericMemoryAccess.h
@@ -0,0 +1,85 @@
+//===- EPCGenericMemoryAccess.h - Generic EPC MemoryAccess impl -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements ExecutorProcessControl::MemoryAccess by making calls to
+// ExecutorProcessControl::callWrapperAsync.
+//
+// This simplifies the implementation of new ExecutorProcessControl instances,
+// as this implementation will always work (at the cost of some performance
+// overhead for the calls).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_EPCGENERICMEMORYACCESS_H
+#define LLVM_EXECUTIONENGINE_ORC_EPCGENERICMEMORYACCESS_H
+
+#include "llvm/ExecutionEngine/Orc/Core.h"
+
+namespace llvm {
+namespace orc {
+
+class EPCGenericMemoryAccess : public ExecutorProcessControl::MemoryAccess {
+public:
+ /// Function addresses for memory access.
+ struct FuncAddrs {
+ ExecutorAddr WriteUInt8s;
+ ExecutorAddr WriteUInt16s;
+ ExecutorAddr WriteUInt32s;
+ ExecutorAddr WriteUInt64s;
+ ExecutorAddr WriteBuffers;
+ };
+
+ /// Create an EPCGenericMemoryAccess instance from a given set of
+ /// function addrs.
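+ ///
+ /// A minimal sketch, assuming FAs has already been populated (e.g. via
+ /// ExecutorProcessControl::getBootstrapSymbols):
+ ///
+ /// \code{.cpp}
+ ///   EPCGenericMemoryAccess MemAccess(EPC, FAs);
+ ///   // The blocking helpers inherited from MemoryAccess (writeUInt8s,
+ ///   // writeBuffers, etc.) can then be used alongside the *Async methods.
+ /// \endcode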
+ EPCGenericMemoryAccess(ExecutorProcessControl &EPC, FuncAddrs FAs)
+ : EPC(EPC), FAs(FAs) {}
+
+ void writeUInt8sAsync(ArrayRef<tpctypes::UInt8Write> Ws,
+ WriteResultFn OnWriteComplete) override {
+ using namespace shared;
+ EPC.callSPSWrapperAsync<void(SPSSequence<SPSMemoryAccessUInt8Write>)>(
+ FAs.WriteUInt8s, std::move(OnWriteComplete), Ws);
+ }
+
+ void writeUInt16sAsync(ArrayRef<tpctypes::UInt16Write> Ws,
+ WriteResultFn OnWriteComplete) override {
+ using namespace shared;
+ EPC.callSPSWrapperAsync<void(SPSSequence<SPSMemoryAccessUInt16Write>)>(
+ FAs.WriteUInt16s, std::move(OnWriteComplete), Ws);
+ }
+
+ void writeUInt32sAsync(ArrayRef<tpctypes::UInt32Write> Ws,
+ WriteResultFn OnWriteComplete) override {
+ using namespace shared;
+ EPC.callSPSWrapperAsync<void(SPSSequence<SPSMemoryAccessUInt32Write>)>(
+ FAs.WriteUInt32s, std::move(OnWriteComplete), Ws);
+ }
+
+ void writeUInt64sAsync(ArrayRef<tpctypes::UInt64Write> Ws,
+ WriteResultFn OnWriteComplete) override {
+ using namespace shared;
+ EPC.callSPSWrapperAsync<void(SPSSequence<SPSMemoryAccessUInt64Write>)>(
+ FAs.WriteUInt64s, std::move(OnWriteComplete), Ws);
+ }
+
+ void writeBuffersAsync(ArrayRef<tpctypes::BufferWrite> Ws,
+ WriteResultFn OnWriteComplete) override {
+ using namespace shared;
+ EPC.callSPSWrapperAsync<void(SPSSequence<SPSMemoryAccessBufferWrite>)>(
+ FAs.WriteBuffers, std::move(OnWriteComplete), Ws);
+ }
+
+private:
+ ExecutorProcessControl &EPC;
+ FuncAddrs FAs;
+};
+
+} // end namespace orc
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_EPCGENERICMEMORYACCESS_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h
new file mode 100644
index 000000000000..b6fdfb92ced3
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h
@@ -0,0 +1,133 @@
+//===---- EPCGenericRTDyldMemoryManager.h - EPC-based MemMgr ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines a RuntimeDyld::MemoryManager that uses EPC and the ORC runtime
+// bootstrap functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_EPCGENERICRTDYLDMEMORYMANAGER_H
+#define LLVM_EXECUTIONENGINE_ORC_EPCGENERICRTDYLDMEMORYMANAGER_H
+
+#include "llvm/ExecutionEngine/Orc/ExecutorProcessControl.h"
+#include "llvm/ExecutionEngine/RuntimeDyld.h"
+
+#define DEBUG_TYPE "orc"
+
+namespace llvm {
+namespace orc {
+
+/// Remote-mapped RuntimeDyld-compatible memory manager.
+class EPCGenericRTDyldMemoryManager : public RuntimeDyld::MemoryManager {
+public:
+ /// Symbol addresses for memory access.
+ struct SymbolAddrs {
+ ExecutorAddr Instance;
+ ExecutorAddr Reserve;
+ ExecutorAddr Finalize;
+ ExecutorAddr Deallocate;
+ ExecutorAddr RegisterEHFrame;
+ ExecutorAddr DeregisterEHFrame;
+ };
+
+ /// Create an EPCGenericRTDyldMemoryManager using the given EPC, looking up
+ /// the default symbol names in the bootstrap symbol set.
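+ ///
+ /// A usage sketch (error handling elided; Resolver stands in for an assumed
+ /// JITSymbolResolver implementation):
+ ///
+ /// \code{.cpp}
+ ///   auto MemMgr = cantFail(
+ ///       EPCGenericRTDyldMemoryManager::CreateWithDefaultBootstrapSymbols(
+ ///           EPC));
+ ///   RuntimeDyld RTDyld(*MemMgr, Resolver);
+ /// \endcode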
+ static Expected<std::unique_ptr<EPCGenericRTDyldMemoryManager>>
+ CreateWithDefaultBootstrapSymbols(ExecutorProcessControl &EPC);
+
+ /// Create an EPCGenericRTDyldMemoryManager using the given EPC and symbol
+ /// addrs.
+ EPCGenericRTDyldMemoryManager(ExecutorProcessControl &EPC, SymbolAddrs SAs);
+
+ EPCGenericRTDyldMemoryManager(const EPCGenericRTDyldMemoryManager &) = delete;
+ EPCGenericRTDyldMemoryManager &
+ operator=(const EPCGenericRTDyldMemoryManager &) = delete;
+ EPCGenericRTDyldMemoryManager(EPCGenericRTDyldMemoryManager &&) = delete;
+ EPCGenericRTDyldMemoryManager &
+ operator=(EPCGenericRTDyldMemoryManager &&) = delete;
+ ~EPCGenericRTDyldMemoryManager();
+
+ uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
+ unsigned SectionID,
+ StringRef SectionName) override;
+
+ uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
+ unsigned SectionID, StringRef SectionName,
+ bool IsReadOnly) override;
+
+ void reserveAllocationSpace(uintptr_t CodeSize, uint32_t CodeAlign,
+ uintptr_t RODataSize, uint32_t RODataAlign,
+ uintptr_t RWDataSize,
+ uint32_t RWDataAlign) override;
+
+ bool needsToReserveAllocationSpace() override;
+
+ void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) override;
+
+ void deregisterEHFrames() override;
+
+ void notifyObjectLoaded(RuntimeDyld &Dyld,
+ const object::ObjectFile &Obj) override;
+
+ bool finalizeMemory(std::string *ErrMsg = nullptr) override;
+
+private:
+ struct Alloc {
+ public:
+ Alloc(uint64_t Size, unsigned Align)
+ : Size(Size), Align(Align),
+ Contents(std::make_unique<uint8_t[]>(Size + Align - 1)) {}
+
+ uint64_t Size;
+ unsigned Align;
+ std::unique_ptr<uint8_t[]> Contents;
+ ExecutorAddr RemoteAddr;
+ };
+
+ struct EHFrame {
+ ExecutorAddr Addr;
+ uint64_t Size;
+ };
+
+ // Group of section allocations to be allocated together in the executor. The
+ // RemoteCode address will stand in as the id of the group for deallocation
+ // purposes.
+ struct AllocGroup {
+ AllocGroup() = default;
+ AllocGroup(const AllocGroup &) = delete;
+ AllocGroup &operator=(const AllocGroup &) = delete;
+ AllocGroup(AllocGroup &&) = default;
+ AllocGroup &operator=(AllocGroup &&) = default;
+
+ ExecutorAddrRange RemoteCode;
+ ExecutorAddrRange RemoteROData;
+ ExecutorAddrRange RemoteRWData;
+ std::vector<EHFrame> UnfinalizedEHFrames;
+ std::vector<Alloc> CodeAllocs, RODataAllocs, RWDataAllocs;
+ };
+
+ // Maps all allocations in Allocs to aligned blocks
+ void mapAllocsToRemoteAddrs(RuntimeDyld &Dyld, std::vector<Alloc> &Allocs,
+ ExecutorAddr NextAddr);
+
+ ExecutorProcessControl &EPC;
+ SymbolAddrs SAs;
+
+ std::mutex M;
+ std::vector<AllocGroup> Unmapped;
+ std::vector<AllocGroup> Unfinalized;
+ std::vector<ExecutorAddr> FinalizedAllocs;
+ std::string ErrMsg;
+};
+
+} // end namespace orc
+} // end namespace llvm
+
+#undef DEBUG_TYPE
+
+#endif // LLVM_EXECUTIONENGINE_ORC_EPCGENERICRTDYLDMEMORYMANAGER_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCIndirectionUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCIndirectionUtils.h
index 64f16d507c97..92de5882bafe 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/EPCIndirectionUtils.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/EPCIndirectionUtils.h
@@ -126,7 +126,7 @@ public:
}
private:
- using Allocation = jitlink::JITLinkMemoryManager::Allocation;
+ using FinalizedAlloc = jitlink::JITLinkMemoryManager::FinalizedAlloc;
struct IndirectStubInfo {
IndirectStubInfo() = default;
@@ -149,12 +149,12 @@ private:
ExecutorProcessControl &EPC;
std::unique_ptr<ABISupport> ABI;
JITTargetAddress ResolverBlockAddr;
- std::unique_ptr<jitlink::JITLinkMemoryManager::Allocation> ResolverBlock;
+ FinalizedAlloc ResolverBlock;
std::unique_ptr<TrampolinePool> TP;
std::unique_ptr<LazyCallThroughManager> LCTM;
std::vector<IndirectStubInfo> AvailableIndirectStubs;
- std::vector<std::unique_ptr<Allocation>> IndirectStubAllocs;
+ std::vector<FinalizedAlloc> IndirectStubAllocs;
};
/// This will call writeResolver on the given EPCIndirectionUtils instance
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h b/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h
index d540d0cd0608..105dac8e8d04 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h
@@ -13,7 +13,6 @@
#ifndef LLVM_EXECUTIONENGINE_ORC_EXECUTORPROCESSCONTROL_H
#define LLVM_EXECUTIONENGINE_ORC_EXECUTORPROCESSCONTROL_H
-#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h"
@@ -21,6 +20,7 @@
#include "llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h"
#include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h"
#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
+#include "llvm/ExecutionEngine/Orc/TaskDispatch.h"
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/MSVCErrorWorkarounds.h"
@@ -37,11 +37,65 @@ class SymbolLookupSet;
/// ExecutorProcessControl supports interaction with a JIT target process.
class ExecutorProcessControl {
friend class ExecutionSession;
-
public:
- /// Sender to return the result of a WrapperFunction executed in the JIT.
- using SendResultFunction =
- unique_function<void(shared::WrapperFunctionResult)>;
+
+ /// A handler for incoming WrapperFunctionResults -- either return values from
+ /// callWrapper* calls, or incoming JIT-dispatch requests.
+ ///
+ /// IncomingWFRHandlers are constructible from
+ /// unique_function<void(shared::WrapperFunctionResult)>s using the
+ /// the RunInPlace or RunAsTask utilities below.
+ class IncomingWFRHandler {
+ friend class ExecutorProcessControl;
+ public:
+ IncomingWFRHandler() = default;
+ explicit operator bool() const { return !!H; }
+ void operator()(shared::WrapperFunctionResult WFR) { H(std::move(WFR)); }
+ private:
+ template <typename FnT> IncomingWFRHandler(FnT &&Fn)
+ : H(std::forward<FnT>(Fn)) {}
+
+ unique_function<void(shared::WrapperFunctionResult)> H;
+ };
+
+ /// Constructs an IncomingWFRHandler from a function object that is callable
+ /// as void(shared::WrapperFunctionResult). The function object will be called
+ /// directly. This should be used with care as it may block listener threads
+ /// in remote EPCs. It is only suitable for simple tasks (e.g. setting a
+ /// future), or for performing some quick analysis before dispatching "real"
+ /// work as a Task.
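+ ///
+ /// For example, given an ExecutorProcessControl &EPC, a caller can block on
+ /// the result via a promise (WrapperFnAddr and ArgBuffer are assumed to be
+ /// in scope; this mirrors the blocking callWrapper implementation below):
+ ///
+ /// \code{.cpp}
+ ///   std::promise<shared::WrapperFunctionResult> RP;
+ ///   auto RF = RP.get_future();
+ ///   EPC.callWrapperAsync(
+ ///       RunInPlace(), WrapperFnAddr,
+ ///       [&](shared::WrapperFunctionResult R) { RP.set_value(std::move(R)); },
+ ///       ArgBuffer);
+ ///   auto Result = RF.get();
+ /// \endcode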
+ class RunInPlace {
+ public:
+ template <typename FnT>
+ IncomingWFRHandler operator()(FnT &&Fn) {
+ return IncomingWFRHandler(std::forward<FnT>(Fn));
+ }
+ };
+
+ /// Constructs an IncomingWFRHandler from a function object by creating a new
+ /// function object that dispatches the original using a TaskDispatcher,
+ /// wrapping the original as a GenericNamedTask.
+ ///
+ /// This is the default approach for running WFR handlers.
+ class RunAsTask {
+ public:
+ RunAsTask(TaskDispatcher &D) : D(D) {}
+
+ template <typename FnT>
+ IncomingWFRHandler operator()(FnT &&Fn) {
+ return IncomingWFRHandler(
+ [&D = this->D, Fn = std::move(Fn)]
+ (shared::WrapperFunctionResult WFR) mutable {
+ D.dispatch(
+ makeGenericNamedTask(
+ [Fn = std::move(Fn), WFR = std::move(WFR)]() mutable {
+ Fn(std::move(WFR));
+ }, "WFR handler task"));
+ });
+ }
+ private:
+ TaskDispatcher &D;
+ };
/// APIs for manipulating memory in the target process.
class MemoryAccess {
@@ -51,53 +105,58 @@ public:
virtual ~MemoryAccess();
- virtual void writeUInt8s(ArrayRef<tpctypes::UInt8Write> Ws,
- WriteResultFn OnWriteComplete) = 0;
+ virtual void writeUInt8sAsync(ArrayRef<tpctypes::UInt8Write> Ws,
+ WriteResultFn OnWriteComplete) = 0;
- virtual void writeUInt16s(ArrayRef<tpctypes::UInt16Write> Ws,
- WriteResultFn OnWriteComplete) = 0;
+ virtual void writeUInt16sAsync(ArrayRef<tpctypes::UInt16Write> Ws,
+ WriteResultFn OnWriteComplete) = 0;
- virtual void writeUInt32s(ArrayRef<tpctypes::UInt32Write> Ws,
- WriteResultFn OnWriteComplete) = 0;
+ virtual void writeUInt32sAsync(ArrayRef<tpctypes::UInt32Write> Ws,
+ WriteResultFn OnWriteComplete) = 0;
- virtual void writeUInt64s(ArrayRef<tpctypes::UInt64Write> Ws,
- WriteResultFn OnWriteComplete) = 0;
+ virtual void writeUInt64sAsync(ArrayRef<tpctypes::UInt64Write> Ws,
+ WriteResultFn OnWriteComplete) = 0;
- virtual void writeBuffers(ArrayRef<tpctypes::BufferWrite> Ws,
- WriteResultFn OnWriteComplete) = 0;
+ virtual void writeBuffersAsync(ArrayRef<tpctypes::BufferWrite> Ws,
+ WriteResultFn OnWriteComplete) = 0;
Error writeUInt8s(ArrayRef<tpctypes::UInt8Write> Ws) {
std::promise<MSVCPError> ResultP;
auto ResultF = ResultP.get_future();
- writeUInt8s(Ws, [&](Error Err) { ResultP.set_value(std::move(Err)); });
+ writeUInt8sAsync(Ws,
+ [&](Error Err) { ResultP.set_value(std::move(Err)); });
return ResultF.get();
}
Error writeUInt16s(ArrayRef<tpctypes::UInt16Write> Ws) {
std::promise<MSVCPError> ResultP;
auto ResultF = ResultP.get_future();
- writeUInt16s(Ws, [&](Error Err) { ResultP.set_value(std::move(Err)); });
+ writeUInt16sAsync(Ws,
+ [&](Error Err) { ResultP.set_value(std::move(Err)); });
return ResultF.get();
}
Error writeUInt32s(ArrayRef<tpctypes::UInt32Write> Ws) {
std::promise<MSVCPError> ResultP;
auto ResultF = ResultP.get_future();
- writeUInt32s(Ws, [&](Error Err) { ResultP.set_value(std::move(Err)); });
+ writeUInt32sAsync(Ws,
+ [&](Error Err) { ResultP.set_value(std::move(Err)); });
return ResultF.get();
}
Error writeUInt64s(ArrayRef<tpctypes::UInt64Write> Ws) {
std::promise<MSVCPError> ResultP;
auto ResultF = ResultP.get_future();
- writeUInt64s(Ws, [&](Error Err) { ResultP.set_value(std::move(Err)); });
+ writeUInt64sAsync(Ws,
+ [&](Error Err) { ResultP.set_value(std::move(Err)); });
return ResultF.get();
}
Error writeBuffers(ArrayRef<tpctypes::BufferWrite> Ws) {
std::promise<MSVCPError> ResultP;
auto ResultF = ResultP.get_future();
- writeBuffers(Ws, [&](Error Err) { ResultP.set_value(std::move(Err)); });
+ writeBuffersAsync(Ws,
+ [&](Error Err) { ResultP.set_value(std::move(Err)); });
return ResultF.get();
}
};
@@ -113,10 +172,14 @@ public:
/// Contains the address of the dispatch function and context that the ORC
/// runtime can use to call functions in the JIT.
struct JITDispatchInfo {
- ExecutorAddress JITDispatchFunctionAddress;
- ExecutorAddress JITDispatchContextAddress;
+ ExecutorAddr JITDispatchFunction;
+ ExecutorAddr JITDispatchContext;
};
+ ExecutorProcessControl(std::shared_ptr<SymbolStringPool> SSP,
+ std::unique_ptr<TaskDispatcher> D)
+ : SSP(std::move(SSP)), D(std::move(D)) {}
+
virtual ~ExecutorProcessControl();
/// Return the ExecutionSession associated with this instance.
@@ -132,6 +195,8 @@ public:
/// Return a shared pointer to the SymbolStringPool for this instance.
std::shared_ptr<SymbolStringPool> getSymbolStringPool() const { return SSP; }
+ TaskDispatcher &getDispatcher() { return *D; }
+
/// Return the Triple for the target process.
const Triple &getTargetTriple() const { return TargetTriple; }
@@ -153,6 +218,29 @@ public:
return *MemMgr;
}
+ /// Returns the bootstrap symbol map.
+ const StringMap<ExecutorAddr> &getBootstrapSymbolsMap() const {
+ return BootstrapSymbols;
+ }
+
+ /// For each (ExecutorAddr&, StringRef) pair, looks up the string in the
+ /// bootstrap symbols map and writes its address to the ExecutorAddr if
+ /// found. If any symbol is not found then the function returns an error.
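+ ///
+ /// For illustration (the symbol names below are hypothetical):
+ ///
+ /// \code{.cpp}
+ ///   ExecutorAddr Reserve, Finalize;
+ ///   if (auto Err = EPC.getBootstrapSymbols(
+ ///           {{Reserve, "__example_reserve"},
+ ///            {Finalize, "__example_finalize"}}))
+ ///     return Err;
+ /// \endcode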
+ Error getBootstrapSymbols(
+ ArrayRef<std::pair<ExecutorAddr &, StringRef>> Pairs) const {
+ for (auto &KV : Pairs) {
+ auto I = BootstrapSymbols.find(KV.second);
+ if (I == BootstrapSymbols.end())
+ return make_error<StringError>("Symbol \"" + KV.second +
+ "\" not found "
+ "in bootstrap symbols map",
+ inconvertibleErrorCode());
+
+ KV.first = I->second;
+ }
+ return Error::success();
+ }
+
/// Load the dynamic library at the given path and return a handle to it.
/// If LibraryPath is null this function will return the global handle for
/// the target process.
@@ -163,44 +251,119 @@ public:
/// The result of the lookup is a 2-dimensional array of target addresses
/// that correspond to the lookup order. If a required symbol is not
/// found then this method will return an error. If a weakly referenced
- /// symbol is not found then it be assigned a '0' value in the result.
- /// that correspond to the lookup order.
+ /// symbol is not found then it will be assigned a '0' value.
virtual Expected<std::vector<tpctypes::LookupResult>>
lookupSymbols(ArrayRef<LookupRequest> Request) = 0;
/// Run function with a main-like signature.
- virtual Expected<int32_t> runAsMain(JITTargetAddress MainFnAddr,
+ virtual Expected<int32_t> runAsMain(ExecutorAddr MainFnAddr,
ArrayRef<std::string> Args) = 0;
- /// Run a wrapper function in the executor.
+ /// Run a wrapper function in the executor. The given WFRHandler will be
+ /// called on the result when it is returned.
///
/// The wrapper function should be callable as:
///
/// \code{.cpp}
/// CWrapperFunctionResult fn(uint8_t *Data, uint64_t Size);
/// \endcode{.cpp}
- ///
- /// The given OnComplete function will be called to return the result.
- virtual void callWrapperAsync(SendResultFunction OnComplete,
- JITTargetAddress WrapperFnAddr,
+ virtual void callWrapperAsync(ExecutorAddr WrapperFnAddr,
+ IncomingWFRHandler OnComplete,
ArrayRef<char> ArgBuffer) = 0;
+ /// Run a wrapper function in the executor using the given Runner to dispatch
+ /// OnComplete when the result is ready.
+ template <typename RunPolicyT, typename FnT>
+ void callWrapperAsync(RunPolicyT &&Runner, ExecutorAddr WrapperFnAddr,
+ FnT &&OnComplete, ArrayRef<char> ArgBuffer) {
+ callWrapperAsync(
+ WrapperFnAddr, Runner(std::forward<FnT>(OnComplete)), ArgBuffer);
+ }
+
+ /// Run a wrapper function in the executor. OnComplete will be dispatched
+ /// as a GenericNamedTask using this instance's TaskDispatch object.
+ template <typename FnT>
+ void callWrapperAsync(ExecutorAddr WrapperFnAddr, FnT &&OnComplete,
+ ArrayRef<char> ArgBuffer) {
+ callWrapperAsync(RunAsTask(*D), WrapperFnAddr,
+ std::forward<FnT>(OnComplete), ArgBuffer);
+ }
+
+ /// Run a wrapper function in the executor. The wrapper function should be
+ /// callable as:
+ ///
+ /// \code{.cpp}
+ /// CWrapperFunctionResult fn(uint8_t *Data, uint64_t Size);
+ /// \endcode
+ shared::WrapperFunctionResult callWrapper(ExecutorAddr WrapperFnAddr,
+ ArrayRef<char> ArgBuffer) {
+ std::promise<shared::WrapperFunctionResult> RP;
+ auto RF = RP.get_future();
+ callWrapperAsync(
+ RunInPlace(), WrapperFnAddr,
+ [&](shared::WrapperFunctionResult R) {
+ RP.set_value(std::move(R));
+ }, ArgBuffer);
+ return RF.get();
+ }
+
+ /// Run a wrapper function using SPS to serialize the arguments and
+ /// deserialize the results.
+ template <typename SPSSignature, typename RunPolicyT, typename SendResultT,
+ typename... ArgTs>
+ void callSPSWrapperAsync(RunPolicyT &&Runner, ExecutorAddr WrapperFnAddr,
+ SendResultT &&SendResult, const ArgTs &...Args) {
+ shared::WrapperFunction<SPSSignature>::callAsync(
+ [this, WrapperFnAddr, Runner = std::move(Runner)]
+ (auto &&SendResult, const char *ArgData, size_t ArgSize) mutable {
+ this->callWrapperAsync(std::move(Runner), WrapperFnAddr,
+ std::move(SendResult),
+ ArrayRef<char>(ArgData, ArgSize));
+ },
+ std::forward<SendResultT>(SendResult), Args...);
+ }
+
+ /// Run a wrapper function using SPS to serialize the arguments and
+ /// deserialize the results.
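+ ///
+ /// For example, calling a wrapper with a void SPS signature (the function
+ /// address and argument range are assumed to be in scope; the handler
+ /// receives an Error describing the outcome):
+ ///
+ /// \code{.cpp}
+ ///   EPC.callSPSWrapperAsync<void(shared::SPSExecutorAddrRange)>(
+ ///       RegisterFnAddr,
+ ///       [](Error Err) { cantFail(std::move(Err)); },
+ ///       SectionRange);
+ /// \endcode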
+ template <typename SPSSignature, typename SendResultT, typename... ArgTs>
+ void callSPSWrapperAsync(ExecutorAddr WrapperFnAddr, SendResultT &&SendResult,
+ const ArgTs &...Args) {
+ callSPSWrapperAsync<SPSSignature>(RunAsTask(*D), WrapperFnAddr,
+ std::forward<SendResultT>(SendResult),
+ Args...);
+ }
+
+ /// Run a wrapper function using SPS to serialize the arguments and
+ /// deserialize the results.
+ ///
+ /// If SPSSignature is a non-void function signature then the second argument
+ /// (the first in the Args list) should be a reference to a return value.
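+ ///
+ /// For example, for a wrapper with SPS signature int32_t(int32_t) (the
+ /// function address FnAddr is an assumed value already in scope):
+ ///
+ /// \code{.cpp}
+ ///   int32_t Result = 0;
+ ///   if (auto Err = EPC.callSPSWrapper<int32_t(int32_t)>(FnAddr, Result, 42))
+ ///     return Err;
+ ///   // Result now holds the deserialized return value.
+ /// \endcode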
+ template <typename SPSSignature, typename... WrapperCallArgTs>
+ Error callSPSWrapper(ExecutorAddr WrapperFnAddr,
+ WrapperCallArgTs &&...WrapperCallArgs) {
+ return shared::WrapperFunction<SPSSignature>::call(
+ [this, WrapperFnAddr](const char *ArgData, size_t ArgSize) {
+ return callWrapper(WrapperFnAddr, ArrayRef<char>(ArgData, ArgSize));
+ },
+ std::forward<WrapperCallArgTs>(WrapperCallArgs)...);
+ }
+
/// Disconnect from the target process.
///
/// This should be called after the JIT session is shut down.
virtual Error disconnect() = 0;
protected:
- ExecutorProcessControl(std::shared_ptr<SymbolStringPool> SSP)
- : SSP(std::move(SSP)) {}
std::shared_ptr<SymbolStringPool> SSP;
+ std::unique_ptr<TaskDispatcher> D;
ExecutionSession *ES = nullptr;
Triple TargetTriple;
unsigned PageSize = 0;
JITDispatchInfo JDI;
MemoryAccess *MemAccess = nullptr;
jitlink::JITLinkMemoryManager *MemMgr = nullptr;
+ StringMap<ExecutorAddr> BootstrapSymbols;
};
/// A ExecutorProcessControl instance that asserts if any of its methods are
@@ -210,9 +373,12 @@ class UnsupportedExecutorProcessControl : public ExecutorProcessControl {
public:
UnsupportedExecutorProcessControl(
std::shared_ptr<SymbolStringPool> SSP = nullptr,
+ std::unique_ptr<TaskDispatcher> D = nullptr,
const std::string &TT = "", unsigned PageSize = 0)
: ExecutorProcessControl(SSP ? std::move(SSP)
- : std::make_shared<SymbolStringPool>()) {
+ : std::make_shared<SymbolStringPool>(),
+ D ? std::move(D)
+ : std::make_unique<InPlaceTaskDispatcher>()) {
this->TargetTriple = Triple(TT);
this->PageSize = PageSize;
}
@@ -226,13 +392,13 @@ public:
llvm_unreachable("Unsupported");
}
- Expected<int32_t> runAsMain(JITTargetAddress MainFnAddr,
+ Expected<int32_t> runAsMain(ExecutorAddr MainFnAddr,
ArrayRef<std::string> Args) override {
llvm_unreachable("Unsupported");
}
- void callWrapperAsync(SendResultFunction OnComplete,
- JITTargetAddress WrapperFnAddr,
+ void callWrapperAsync(ExecutorAddr WrapperFnAddr,
+ IncomingWFRHandler OnComplete,
ArrayRef<char> ArgBuffer) override {
llvm_unreachable("Unsupported");
}
@@ -246,8 +412,9 @@ class SelfExecutorProcessControl
private ExecutorProcessControl::MemoryAccess {
public:
SelfExecutorProcessControl(
- std::shared_ptr<SymbolStringPool> SSP, Triple TargetTriple,
- unsigned PageSize, std::unique_ptr<jitlink::JITLinkMemoryManager> MemMgr);
+ std::shared_ptr<SymbolStringPool> SSP, std::unique_ptr<TaskDispatcher> D,
+ Triple TargetTriple, unsigned PageSize,
+ std::unique_ptr<jitlink::JITLinkMemoryManager> MemMgr);
/// Create a SelfExecutorProcessControl with the given symbol string pool and
/// memory manager.
@@ -256,6 +423,7 @@ public:
/// be created and used by default.
static Expected<std::unique_ptr<SelfExecutorProcessControl>>
Create(std::shared_ptr<SymbolStringPool> SSP = nullptr,
+ std::unique_ptr<TaskDispatcher> D = nullptr,
std::unique_ptr<jitlink::JITLinkMemoryManager> MemMgr = nullptr);
Expected<tpctypes::DylibHandle> loadDylib(const char *DylibPath) override;
@@ -263,32 +431,32 @@ public:
Expected<std::vector<tpctypes::LookupResult>>
lookupSymbols(ArrayRef<LookupRequest> Request) override;
- Expected<int32_t> runAsMain(JITTargetAddress MainFnAddr,
+ Expected<int32_t> runAsMain(ExecutorAddr MainFnAddr,
ArrayRef<std::string> Args) override;
- void callWrapperAsync(SendResultFunction OnComplete,
- JITTargetAddress WrapperFnAddr,
+ void callWrapperAsync(ExecutorAddr WrapperFnAddr,
+ IncomingWFRHandler OnComplete,
ArrayRef<char> ArgBuffer) override;
Error disconnect() override;
private:
- void writeUInt8s(ArrayRef<tpctypes::UInt8Write> Ws,
- WriteResultFn OnWriteComplete) override;
+ void writeUInt8sAsync(ArrayRef<tpctypes::UInt8Write> Ws,
+ WriteResultFn OnWriteComplete) override;
- void writeUInt16s(ArrayRef<tpctypes::UInt16Write> Ws,
- WriteResultFn OnWriteComplete) override;
+ void writeUInt16sAsync(ArrayRef<tpctypes::UInt16Write> Ws,
+ WriteResultFn OnWriteComplete) override;
- void writeUInt32s(ArrayRef<tpctypes::UInt32Write> Ws,
- WriteResultFn OnWriteComplete) override;
+ void writeUInt32sAsync(ArrayRef<tpctypes::UInt32Write> Ws,
+ WriteResultFn OnWriteComplete) override;
- void writeUInt64s(ArrayRef<tpctypes::UInt64Write> Ws,
- WriteResultFn OnWriteComplete) override;
+ void writeUInt64sAsync(ArrayRef<tpctypes::UInt64Write> Ws,
+ WriteResultFn OnWriteComplete) override;
- void writeBuffers(ArrayRef<tpctypes::BufferWrite> Ws,
- WriteResultFn OnWriteComplete) override;
+ void writeBuffersAsync(ArrayRef<tpctypes::BufferWrite> Ws,
+ WriteResultFn OnWriteComplete) override;
- static shared::detail::CWrapperFunctionResult
+ static shared::CWrapperFunctionResult
jitDispatchViaWrapperFunctionManager(void *Ctx, const void *FnTag,
const char *Data, size_t Size);
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h
index 78e3ceef50e2..4d6d46595fc3 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h
@@ -45,6 +45,13 @@ class PointerType;
class Triple;
class Twine;
class Value;
+class MCDisassembler;
+class MCInstrAnalysis;
+
+namespace jitlink {
+class LinkGraph;
+class Symbol;
+} // namespace jitlink
namespace orc {
@@ -557,6 +564,33 @@ GlobalAlias *cloneGlobalAliasDecl(Module &Dst, const GlobalAlias &OrigA,
void cloneModuleFlagsMetadata(Module &Dst, const Module &Src,
ValueToValueMapTy &VMap);
+/// Introduce relocations to \p Sym in its own definition if there are any
+/// pointers formed via PC-relative address that do not already have a
+/// relocation.
+///
+/// This is useful when introducing indirection via a stub function at link time
+/// without compiler support. If a function pointer is formed without a
+/// relocation, e.g. in the definition of \c foo
+///
+/// \code
+/// _foo:
+/// leaq -7(%rip), %rax # form pointer to _foo without relocation
+/// _bar:
+/// leaq (%rip), %rax # uses X86_64_RELOC_SIGNED to '_foo'
+/// \endcode
+///
+/// the pointer to \c _foo computed by \c _foo and \c _bar may differ if we
+/// introduce a stub for _foo. If the pointer is used as a key, this may be
+/// observable to the program. This pass will attempt to introduce the missing
+/// "self-relocation" on the leaq instruction.
+///
+/// This is based on disassembly and should be considered "best effort". It may
+/// silently fail to add relocations.
+Error addFunctionPointerRelocationsToCurrentSymbol(jitlink::Symbol &Sym,
+ jitlink::LinkGraph &G,
+ MCDisassembler &Disassembler,
+ MCInstrAnalysis &MIA);
+
} // end namespace orc
} // end namespace llvm
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/LLVMSPSSerializers.h b/llvm/include/llvm/ExecutionEngine/Orc/LLVMSPSSerializers.h
deleted file mode 100644
index f3d616deae8f..000000000000
--- a/llvm/include/llvm/ExecutionEngine/Orc/LLVMSPSSerializers.h
+++ /dev/null
@@ -1,69 +0,0 @@
-//===-- LLVMSPSSerializers.h - SPS serialization for LLVM types -*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// SPS Serialization for common LLVM types.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_EXECUTIONENGINE_ORC_LLVMSPSSERIALIZERS_H
-#define LLVM_EXECUTIONENGINE_ORC_LLVMSPSSERIALIZERS_H
-
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h"
-
-namespace llvm {
-namespace orc {
-namespace shared {
-
-template <typename SPSValueT, typename ValueT>
-class SPSSerializationTraits<SPSSequence<SPSTuple<SPSString, SPSValueT>>,
- StringMap<ValueT>> {
-public:
- static size_t size(const StringMap<ValueT> &M) {
- size_t Sz = SPSArgList<uint64_t>::size(static_cast<uint64_t>(M.size()));
- for (auto &E : M)
- Sz += SPSArgList<SPSString, SPSValueT>::size(E.first(), E.second);
- return Sz;
- }
-
- static bool serialize(SPSOutputBuffer &OB, const StringMap<ValueT> &M) {
- if (!SPSArgList<uint64_t>::serialize(OB, static_cast<uint64_t>(M.size())))
- return false;
-
- for (auto &E : M)
- if (!SPSArgList<SPSString, SPSValueT>::serialize(OB, E.first(), E.second))
- return false;
-
- return true;
- }
-
- static bool deserialize(SPSInputBuffer &IB, StringMap<ValueT> &M) {
- uint64_t Size;
- assert(M.empty() && "M already contains elements");
-
- if (!SPSArgList<uint64_t>::deserialize(IB, Size))
- return false;
-
- while (Size--) {
- StringRef S;
- ValueT V;
- if (!SPSArgList<SPSString, SPSValueT>::deserialize(IB, S, V))
- return false;
- if (!M.insert(std::make_pair(S, V)).second)
- return false;
- }
-
- return true;
- }
-};
-
-} // end namespace shared
-} // end namespace orc
-} // end namespace llvm
-
-#endif // LLVM_EXECUTIONENGINE_ORC_LLVMSPSSERIALIZERS_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/LookupAndRecordAddrs.h b/llvm/include/llvm/ExecutionEngine/Orc/LookupAndRecordAddrs.h
new file mode 100644
index 000000000000..a598405ee4f6
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/LookupAndRecordAddrs.h
@@ -0,0 +1,70 @@
+//===-- LookupAndRecordAddrs.h - Symbol lookup support utility --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Record the addresses of a set of symbols into ExecutorAddr objects.
+//
+// This can be used to avoid repeated lookup (via ExecutionSession::lookup) of
+// the given symbols.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_LOOKUPANDRECORDADDRS_H
+#define LLVM_EXECUTIONENGINE_ORC_LOOKUPANDRECORDADDRS_H
+
+#include "llvm/ADT/FunctionExtras.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
+
+#include <vector>
+
+namespace llvm {
+namespace orc {
+
+/// Record addresses of the given symbols in the given ExecutorAddrs.
+///
+/// Useful for making permanent records of symbol addresses to call or
+/// access in the executor (e.g. runtime support functions in Platform
+/// subclasses).
+///
+/// By default the symbols are looked up using
+/// SymbolLookupFlags::RequiredSymbol, and an error will be generated if any of
+/// the requested symbols are not defined.
+///
+/// If SymbolLookupFlags::WeaklyReferencedSymbol is used then any missing
+/// symbols will have their corresponding address objects set to zero, and
+/// this function will never generate an error (the caller will need to check
+/// addresses before using them).
+///
+/// Asynchronous version.
+void lookupAndRecordAddrs(
+ unique_function<void(Error)> OnRecorded, ExecutionSession &ES, LookupKind K,
+ const JITDylibSearchOrder &SearchOrder,
+ std::vector<std::pair<SymbolStringPtr, ExecutorAddr *>> Pairs,
+ SymbolLookupFlags LookupFlags = SymbolLookupFlags::RequiredSymbol);
+
+/// Record addresses of the given symbols in the given ExecutorAddrs.
+///
+/// Blocking version.
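+///
+/// A sketch of recording two runtime symbols (the symbol names below are
+/// illustrative placeholders):
+///
+/// \code{.cpp}
+///   ExecutorAddr Bootstrap, Shutdown;
+///   if (auto Err = lookupAndRecordAddrs(
+///           ES, LookupKind::Static, makeJITDylibSearchOrder(&PlatformJD),
+///           {{ES.intern("__example_bootstrap"), &Bootstrap},
+///            {ES.intern("__example_shutdown"), &Shutdown}}))
+///     return Err;
+/// \endcode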
+Error lookupAndRecordAddrs(
+ ExecutionSession &ES, LookupKind K, const JITDylibSearchOrder &SearchOrder,
+ std::vector<std::pair<SymbolStringPtr, ExecutorAddr *>> Pairs,
+ SymbolLookupFlags LookupFlags = SymbolLookupFlags::RequiredSymbol);
+
+/// Record addresses of given symbols in the given ExecutorAddrs.
+///
+/// ExecutorProcessControl lookup version. Lookups are always implicitly
+/// weak.
+Error lookupAndRecordAddrs(
+ ExecutorProcessControl &EPC, tpctypes::DylibHandle H,
+ std::vector<std::pair<SymbolStringPtr, ExecutorAddr *>> Pairs,
+ SymbolLookupFlags LookupFlags = SymbolLookupFlags::RequiredSymbol);
+
+} // End namespace orc
+} // End namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_LOOKUPANDRECORDADDRS_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
index f77dfd208413..d7b5e2eda6ee 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
@@ -16,7 +16,6 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ExecutionEngine/Orc/Core.h"
#include "llvm/ExecutionEngine/Orc/ExecutorProcessControl.h"
-#include "llvm/ExecutionEngine/Orc/LLVMSPSSerializers.h"
#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
@@ -27,22 +26,16 @@
namespace llvm {
namespace orc {
-struct MachOPerObjectSectionsToRegister {
- ExecutorAddressRange EHFrameSection;
- ExecutorAddressRange ThreadDataSection;
-};
-
struct MachOJITDylibInitializers {
- using SectionList = std::vector<ExecutorAddressRange>;
+ using SectionList = std::vector<ExecutorAddrRange>;
- MachOJITDylibInitializers(std::string Name,
- ExecutorAddress MachOHeaderAddress)
+ MachOJITDylibInitializers(std::string Name, ExecutorAddr MachOHeaderAddress)
: Name(std::move(Name)),
MachOHeaderAddress(std::move(MachOHeaderAddress)) {}
std::string Name;
- ExecutorAddress MachOHeaderAddress;
- ExecutorAddress ObjCImageInfoAddress;
+ ExecutorAddr MachOHeaderAddress;
+ ExecutorAddr ObjCImageInfoAddress;
StringMap<SectionList> InitSections;
};
@@ -155,15 +148,12 @@ private:
using InitSymbolDepMap =
DenseMap<MaterializationResponsibility *, JITLinkSymbolSet>;
- void addInitializerSupportPasses(MaterializationResponsibility &MR,
- jitlink::PassConfiguration &Config);
-
- void addMachOHeaderSupportPasses(MaterializationResponsibility &MR,
- jitlink::PassConfiguration &Config);
-
void addEHAndTLVSupportPasses(MaterializationResponsibility &MR,
jitlink::PassConfiguration &Config);
+ Error associateJITDylibHeaderSymbol(jitlink::LinkGraph &G,
+ MaterializationResponsibility &MR);
+
Error preserveInitSections(jitlink::LinkGraph &G,
MaterializationResponsibility &MR);
@@ -174,6 +164,10 @@ private:
Error fixTLVSectionsAndEdges(jitlink::LinkGraph &G, JITDylib &JD);
+ Error registerEHAndTLVSections(jitlink::LinkGraph &G);
+
+ Error registerEHSectionsPhase1(jitlink::LinkGraph &G);
+
std::mutex PluginMutex;
MachOPlatform &MP;
DenseMap<JITDylib *, std::pair<uint32_t, uint32_t>> ObjCImageInfos;
@@ -186,7 +180,7 @@ private:
using SendDeinitializerSequenceFn =
unique_function<void(Expected<MachOJITDylibDeinitializerSequence>)>;
- using SendSymbolAddressFn = unique_function<void(Expected<ExecutorAddress>)>;
+ using SendSymbolAddressFn = unique_function<void(Expected<ExecutorAddr>)>;
static bool supportedTarget(const Triple &TT);
@@ -209,31 +203,34 @@ private:
StringRef JDName);
void rt_getDeinitializers(SendDeinitializerSequenceFn SendResult,
- ExecutorAddress Handle);
+ ExecutorAddr Handle);
- void rt_lookupSymbol(SendSymbolAddressFn SendResult, ExecutorAddress Handle,
+ void rt_lookupSymbol(SendSymbolAddressFn SendResult, ExecutorAddr Handle,
StringRef SymbolName);
// Records the addresses of runtime symbols used by the platform.
Error bootstrapMachORuntime(JITDylib &PlatformJD);
- Error registerInitInfo(JITDylib &JD, ExecutorAddress ObjCImageInfoAddr,
+ Error registerInitInfo(JITDylib &JD, ExecutorAddr ObjCImageInfoAddr,
ArrayRef<jitlink::Section *> InitSections);
- Error registerPerObjectSections(const MachOPerObjectSectionsToRegister &POSR);
-
Expected<uint64_t> createPThreadKey();
+ enum PlatformState { BootstrapPhase1, BootstrapPhase2, Initialized };
+
ExecutionSession &ES;
ObjectLinkingLayer &ObjLinkingLayer;
SymbolStringPtr MachOHeaderStartSymbol;
- std::atomic<bool> RuntimeBootstrapped{false};
+ std::atomic<PlatformState> State{BootstrapPhase1};
- ExecutorAddress orc_rt_macho_platform_bootstrap;
- ExecutorAddress orc_rt_macho_platform_shutdown;
- ExecutorAddress orc_rt_macho_register_object_sections;
- ExecutorAddress orc_rt_macho_create_pthread_key;
+ ExecutorAddr orc_rt_macho_platform_bootstrap;
+ ExecutorAddr orc_rt_macho_platform_shutdown;
+ ExecutorAddr orc_rt_macho_register_ehframe_section;
+ ExecutorAddr orc_rt_macho_deregister_ehframe_section;
+ ExecutorAddr orc_rt_macho_register_thread_data_section;
+ ExecutorAddr orc_rt_macho_deregister_thread_data_section;
+ ExecutorAddr orc_rt_macho_create_pthread_key;
DenseMap<JITDylib *, SymbolLookupSet> RegisteredInitSymbols;
@@ -241,7 +238,6 @@ private:
// aggregating data from the jitlink.
std::mutex PlatformMutex;
DenseMap<JITDylib *, MachOJITDylibInitializers> InitSeqs;
- std::vector<MachOPerObjectSectionsToRegister> BootstrapPOSRs;
DenseMap<JITTargetAddress, JITDylib *> HeaderAddrToJITDylib;
DenseMap<JITDylib *, uint64_t> JITDylibToPThreadKey;
@@ -249,38 +245,12 @@ private:
namespace shared {
-using SPSMachOPerObjectSectionsToRegister =
- SPSTuple<SPSExecutorAddressRange, SPSExecutorAddressRange>;
-
-template <>
-class SPSSerializationTraits<SPSMachOPerObjectSectionsToRegister,
- MachOPerObjectSectionsToRegister> {
-
-public:
- static size_t size(const MachOPerObjectSectionsToRegister &MOPOSR) {
- return SPSMachOPerObjectSectionsToRegister::AsArgList::size(
- MOPOSR.EHFrameSection, MOPOSR.ThreadDataSection);
- }
-
- static bool serialize(SPSOutputBuffer &OB,
- const MachOPerObjectSectionsToRegister &MOPOSR) {
- return SPSMachOPerObjectSectionsToRegister::AsArgList::serialize(
- OB, MOPOSR.EHFrameSection, MOPOSR.ThreadDataSection);
- }
-
- static bool deserialize(SPSInputBuffer &IB,
- MachOPerObjectSectionsToRegister &MOPOSR) {
- return SPSMachOPerObjectSectionsToRegister::AsArgList::deserialize(
- IB, MOPOSR.EHFrameSection, MOPOSR.ThreadDataSection);
- }
-};
-
-using SPSNamedExecutorAddressRangeSequenceMap =
- SPSSequence<SPSTuple<SPSString, SPSExecutorAddressRangeSequence>>;
+using SPSNamedExecutorAddrRangeSequenceMap =
+ SPSSequence<SPSTuple<SPSString, SPSExecutorAddrRangeSequence>>;
using SPSMachOJITDylibInitializers =
- SPSTuple<SPSString, SPSExecutorAddress, SPSExecutorAddress,
- SPSNamedExecutorAddressRangeSequenceMap>;
+ SPSTuple<SPSString, SPSExecutorAddr, SPSExecutorAddr,
+ SPSNamedExecutorAddrRangeSequenceMap>;
using SPSMachOJITDylibInitializerSequence =
SPSSequence<SPSMachOJITDylibInitializers>;
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h
index 5632118eee4e..109922a46e26 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h
@@ -184,13 +184,13 @@ public:
}
private:
- using AllocPtr = std::unique_ptr<jitlink::JITLinkMemoryManager::Allocation>;
+ using FinalizedAlloc = jitlink::JITLinkMemoryManager::FinalizedAlloc;
void modifyPassConfig(MaterializationResponsibility &MR,
jitlink::LinkGraph &G,
jitlink::PassConfiguration &PassConfig);
void notifyLoaded(MaterializationResponsibility &MR);
- Error notifyEmitted(MaterializationResponsibility &MR, AllocPtr Alloc);
+ Error notifyEmitted(MaterializationResponsibility &MR, FinalizedAlloc FA);
Error handleRemoveResources(ResourceKey K) override;
void handleTransferResources(ResourceKey DstKey, ResourceKey SrcKey) override;
@@ -201,7 +201,7 @@ private:
bool OverrideObjectFlags = false;
bool AutoClaimObjectSymbols = false;
ReturnObjectBufferFunction ReturnObjectBuffer;
- DenseMap<ResourceKey, std::vector<AllocPtr>> Allocs;
+ DenseMap<ResourceKey, std::vector<FinalizedAlloc>> Allocs;
std::vector<std::unique_ptr<Plugin>> Plugins;
};
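The AllocPtr to FinalizedAlloc change above keeps the same bookkeeping shape: allocation handles grouped per resource key, handed back or transferred wholesale when resources are removed or re-keyed. A minimal sketch of that bookkeeping with standard containers follows; Handle, AllocTracker, and ResourceKey are stand-ins rather than the ObjectLinkingLayer types, and treating the handle as move-only is an assumption of the sketch, not something this diff states.

#include <cstdint>
#include <unordered_map>
#include <utility>
#include <vector>

// Illustrative stand-in for a move-only finalized-allocation handle.
struct Handle {
  explicit Handle(uint64_t Addr) : Addr(Addr) {}
  Handle(Handle &&) = default;
  Handle &operator=(Handle &&) = default;
  Handle(const Handle &) = delete;
  uint64_t Addr;
};

using ResourceKey = uintptr_t;

class AllocTracker {
public:
  void record(ResourceKey K, Handle H) { Allocs[K].push_back(std::move(H)); }

  // In the spirit of handleTransferResources: fold SrcKey's handles into
  // DstKey and forget SrcKey.
  void transfer(ResourceKey DstKey, ResourceKey SrcKey) {
    auto I = Allocs.find(SrcKey);
    if (I == Allocs.end())
      return;
    std::vector<Handle> Src = std::move(I->second);
    Allocs.erase(I);
    auto &Dst = Allocs[DstKey];
    for (auto &H : Src)
      Dst.push_back(std::move(H));
  }

  // In the spirit of handleRemoveResources: hand back everything owned by K.
  std::vector<Handle> take(ResourceKey K) {
    std::vector<Handle> Out;
    auto I = Allocs.find(K);
    if (I != Allocs.end()) {
      Out = std::move(I->second);
      Allocs.erase(I);
    }
    return Out;
  }

private:
  std::unordered_map<ResourceKey, std::vector<Handle>> Allocs;
};

Grouping handles per key keeps removal and ownership transfer cheap: both operate on whole vectors rather than on individual allocations.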
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/OrcRPCExecutorProcessControl.h b/llvm/include/llvm/ExecutionEngine/Orc/OrcRPCExecutorProcessControl.h
deleted file mode 100644
index 4310ba9ce9e0..000000000000
--- a/llvm/include/llvm/ExecutionEngine/Orc/OrcRPCExecutorProcessControl.h
+++ /dev/null
@@ -1,436 +0,0 @@
-//===-- OrcRPCExecutorProcessControl.h - Remote target control --*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Executor control via ORC RPC.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_EXECUTIONENGINE_ORC_ORCRPCEXECUTORPROCESSCONTROL_H
-#define LLVM_EXECUTIONENGINE_ORC_ORCRPCEXECUTORPROCESSCONTROL_H
-
-#include "llvm/ExecutionEngine/Orc/ExecutorProcessControl.h"
-#include "llvm/ExecutionEngine/Orc/Shared/RPCUtils.h"
-#include "llvm/ExecutionEngine/Orc/Shared/RawByteChannel.h"
-#include "llvm/ExecutionEngine/Orc/TargetProcess/OrcRPCTPCServer.h"
-#include "llvm/Support/MSVCErrorWorkarounds.h"
-
-namespace llvm {
-namespace orc {
-
-/// JITLinkMemoryManager implementation for a process connected via an ORC RPC
-/// endpoint.
-template <typename OrcRPCEPCImplT>
-class OrcRPCEPCJITLinkMemoryManager : public jitlink::JITLinkMemoryManager {
-private:
- struct HostAlloc {
- std::unique_ptr<char[]> Mem;
- uint64_t Size;
- };
-
- struct TargetAlloc {
- JITTargetAddress Address = 0;
- uint64_t AllocatedSize = 0;
- };
-
- using HostAllocMap = DenseMap<int, HostAlloc>;
- using TargetAllocMap = DenseMap<int, TargetAlloc>;
-
-public:
- class OrcRPCAllocation : public Allocation {
- public:
- OrcRPCAllocation(OrcRPCEPCJITLinkMemoryManager<OrcRPCEPCImplT> &Parent,
- HostAllocMap HostAllocs, TargetAllocMap TargetAllocs)
- : Parent(Parent), HostAllocs(std::move(HostAllocs)),
- TargetAllocs(std::move(TargetAllocs)) {
- assert(HostAllocs.size() == TargetAllocs.size() &&
- "HostAllocs size should match TargetAllocs");
- }
-
- ~OrcRPCAllocation() override {
- assert(TargetAllocs.empty() && "failed to deallocate");
- }
-
- MutableArrayRef<char> getWorkingMemory(ProtectionFlags Seg) override {
- auto I = HostAllocs.find(Seg);
- assert(I != HostAllocs.end() && "No host allocation for segment");
- auto &HA = I->second;
- return {HA.Mem.get(), static_cast<size_t>(HA.Size)};
- }
-
- JITTargetAddress getTargetMemory(ProtectionFlags Seg) override {
- auto I = TargetAllocs.find(Seg);
- assert(I != TargetAllocs.end() && "No target allocation for segment");
- return I->second.Address;
- }
-
- void finalizeAsync(FinalizeContinuation OnFinalize) override {
-
- std::vector<tpctypes::BufferWrite> BufferWrites;
- orcrpctpc::ReleaseOrFinalizeMemRequest FMR;
-
- for (auto &KV : HostAllocs) {
- assert(TargetAllocs.count(KV.first) &&
- "No target allocation for buffer");
- auto &HA = KV.second;
- auto &TA = TargetAllocs[KV.first];
- BufferWrites.push_back({TA.Address, StringRef(HA.Mem.get(), HA.Size)});
- FMR.push_back({orcrpctpc::toWireProtectionFlags(
- static_cast<sys::Memory::ProtectionFlags>(KV.first)),
- TA.Address, TA.AllocatedSize});
- }
-
- DEBUG_WITH_TYPE("orc", {
- dbgs() << "finalizeAsync " << (void *)this << ":\n";
- auto FMRI = FMR.begin();
- for (auto &B : BufferWrites) {
- auto Prot = FMRI->Prot;
- ++FMRI;
- dbgs() << " Writing " << formatv("{0:x16}", B.Buffer.size())
- << " bytes to " << ((Prot & orcrpctpc::WPF_Read) ? 'R' : '-')
- << ((Prot & orcrpctpc::WPF_Write) ? 'W' : '-')
- << ((Prot & orcrpctpc::WPF_Exec) ? 'X' : '-')
- << " segment: local " << (const void *)B.Buffer.data()
- << " -> target " << formatv("{0:x16}", B.Address) << "\n";
- }
- });
- if (auto Err =
- Parent.Parent.getMemoryAccess().writeBuffers(BufferWrites)) {
- OnFinalize(std::move(Err));
- return;
- }
-
- DEBUG_WITH_TYPE("orc", dbgs() << " Applying permissions...\n");
- if (auto Err =
- Parent.getEndpoint().template callAsync<orcrpctpc::FinalizeMem>(
- [OF = std::move(OnFinalize)](Error Err2) {
- // FIXME: Dispatch to work queue.
- std::thread([OF = std::move(OF),
- Err3 = std::move(Err2)]() mutable {
- DEBUG_WITH_TYPE(
- "orc", { dbgs() << " finalizeAsync complete\n"; });
- OF(std::move(Err3));
- }).detach();
- return Error::success();
- },
- FMR)) {
- DEBUG_WITH_TYPE("orc", dbgs() << " failed.\n");
- Parent.getEndpoint().abandonPendingResponses();
- Parent.reportError(std::move(Err));
- }
- DEBUG_WITH_TYPE("orc", {
- dbgs() << "Leaving finalizeAsync (finalization may continue in "
- "background)\n";
- });
- }
-
- Error deallocate() override {
- orcrpctpc::ReleaseOrFinalizeMemRequest RMR;
- for (auto &KV : TargetAllocs)
- RMR.push_back({orcrpctpc::toWireProtectionFlags(
- static_cast<sys::Memory::ProtectionFlags>(KV.first)),
- KV.second.Address, KV.second.AllocatedSize});
- TargetAllocs.clear();
-
- return Parent.getEndpoint().template callB<orcrpctpc::ReleaseMem>(RMR);
- }
-
- private:
- OrcRPCEPCJITLinkMemoryManager<OrcRPCEPCImplT> &Parent;
- HostAllocMap HostAllocs;
- TargetAllocMap TargetAllocs;
- };
-
- OrcRPCEPCJITLinkMemoryManager(OrcRPCEPCImplT &Parent) : Parent(Parent) {}
-
- Expected<std::unique_ptr<Allocation>>
- allocate(const jitlink::JITLinkDylib *JD,
- const SegmentsRequestMap &Request) override {
- orcrpctpc::ReserveMemRequest RMR;
- HostAllocMap HostAllocs;
-
- for (auto &KV : Request) {
- assert(KV.second.getContentSize() <= std::numeric_limits<size_t>::max() &&
- "Content size is out-of-range for host");
-
- RMR.push_back({orcrpctpc::toWireProtectionFlags(
- static_cast<sys::Memory::ProtectionFlags>(KV.first)),
- KV.second.getContentSize() + KV.second.getZeroFillSize(),
- KV.second.getAlignment()});
- HostAllocs[KV.first] = {
- std::make_unique<char[]>(KV.second.getContentSize()),
- KV.second.getContentSize()};
- }
-
- DEBUG_WITH_TYPE("orc", {
- dbgs() << "Orc remote memmgr got request:\n";
- for (auto &KV : Request)
- dbgs() << " permissions: "
- << ((KV.first & sys::Memory::MF_READ) ? 'R' : '-')
- << ((KV.first & sys::Memory::MF_WRITE) ? 'W' : '-')
- << ((KV.first & sys::Memory::MF_EXEC) ? 'X' : '-')
- << ", content size: "
- << formatv("{0:x16}", KV.second.getContentSize())
- << " + zero-fill-size: "
- << formatv("{0:x16}", KV.second.getZeroFillSize())
- << ", align: " << KV.second.getAlignment() << "\n";
- });
-
- // FIXME: LLVM RPC needs to be fixed to support alt
- // serialization/deserialization on return types. For now just
- // translate from std::map to DenseMap manually.
- auto TmpTargetAllocs =
- Parent.getEndpoint().template callB<orcrpctpc::ReserveMem>(RMR);
- if (!TmpTargetAllocs)
- return TmpTargetAllocs.takeError();
-
- if (TmpTargetAllocs->size() != RMR.size())
- return make_error<StringError>(
- "Number of target allocations does not match request",
- inconvertibleErrorCode());
-
- TargetAllocMap TargetAllocs;
- for (auto &E : *TmpTargetAllocs)
- TargetAllocs[orcrpctpc::fromWireProtectionFlags(E.Prot)] = {
- E.Address, E.AllocatedSize};
-
- DEBUG_WITH_TYPE("orc", {
- auto HAI = HostAllocs.begin();
- for (auto &KV : TargetAllocs)
- dbgs() << " permissions: "
- << ((KV.first & sys::Memory::MF_READ) ? 'R' : '-')
- << ((KV.first & sys::Memory::MF_WRITE) ? 'W' : '-')
- << ((KV.first & sys::Memory::MF_EXEC) ? 'X' : '-')
- << " assigned local " << (void *)HAI->second.Mem.get()
- << ", target " << formatv("{0:x16}", KV.second.Address) << "\n";
- });
-
- return std::make_unique<OrcRPCAllocation>(*this, std::move(HostAllocs),
- std::move(TargetAllocs));
- }
-
-private:
- void reportError(Error Err) { Parent.reportError(std::move(Err)); }
-
- decltype(std::declval<OrcRPCEPCImplT>().getEndpoint()) getEndpoint() {
- return Parent.getEndpoint();
- }
-
- OrcRPCEPCImplT &Parent;
-};
-
-/// ExecutorProcessControl::MemoryAccess implementation for a process connected
-/// via an ORC RPC endpoint.
-template <typename OrcRPCEPCImplT>
-class OrcRPCEPCMemoryAccess : public ExecutorProcessControl::MemoryAccess {
-public:
- OrcRPCEPCMemoryAccess(OrcRPCEPCImplT &Parent) : Parent(Parent) {}
-
- void writeUInt8s(ArrayRef<tpctypes::UInt8Write> Ws,
- WriteResultFn OnWriteComplete) override {
- writeViaRPC<orcrpctpc::WriteUInt8s>(Ws, std::move(OnWriteComplete));
- }
-
- void writeUInt16s(ArrayRef<tpctypes::UInt16Write> Ws,
- WriteResultFn OnWriteComplete) override {
- writeViaRPC<orcrpctpc::WriteUInt16s>(Ws, std::move(OnWriteComplete));
- }
-
- void writeUInt32s(ArrayRef<tpctypes::UInt32Write> Ws,
- WriteResultFn OnWriteComplete) override {
- writeViaRPC<orcrpctpc::WriteUInt32s>(Ws, std::move(OnWriteComplete));
- }
-
- void writeUInt64s(ArrayRef<tpctypes::UInt64Write> Ws,
- WriteResultFn OnWriteComplete) override {
- writeViaRPC<orcrpctpc::WriteUInt64s>(Ws, std::move(OnWriteComplete));
- }
-
- void writeBuffers(ArrayRef<tpctypes::BufferWrite> Ws,
- WriteResultFn OnWriteComplete) override {
- writeViaRPC<orcrpctpc::WriteBuffers>(Ws, std::move(OnWriteComplete));
- }
-
-private:
- template <typename WriteRPCFunction, typename WriteElementT>
- void writeViaRPC(ArrayRef<WriteElementT> Ws, WriteResultFn OnWriteComplete) {
- if (auto Err = Parent.getEndpoint().template callAsync<WriteRPCFunction>(
- [OWC = std::move(OnWriteComplete)](Error Err2) mutable -> Error {
- OWC(std::move(Err2));
- return Error::success();
- },
- Ws)) {
- Parent.reportError(std::move(Err));
- Parent.getEndpoint().abandonPendingResponses();
- }
- }
-
- OrcRPCEPCImplT &Parent;
-};
-
-// ExecutorProcessControl for a process connected via an ORC RPC Endpoint.
-template <typename RPCEndpointT>
-class OrcRPCExecutorProcessControlBase : public ExecutorProcessControl {
-public:
- using ErrorReporter = unique_function<void(Error)>;
-
- using OnCloseConnectionFunction = unique_function<Error(Error)>;
-
- OrcRPCExecutorProcessControlBase(std::shared_ptr<SymbolStringPool> SSP,
- RPCEndpointT &EP, ErrorReporter ReportError)
- : ExecutorProcessControl(std::move(SSP)),
- ReportError(std::move(ReportError)), EP(EP) {
- using ThisT = OrcRPCExecutorProcessControlBase<RPCEndpointT>;
- EP.template addAsyncHandler<orcrpctpc::RunWrapper>(*this,
- &ThisT::runWrapperInJIT);
- }
-
- void reportError(Error Err) { ReportError(std::move(Err)); }
-
- RPCEndpointT &getEndpoint() { return EP; }
-
- Expected<tpctypes::DylibHandle> loadDylib(const char *DylibPath) override {
- DEBUG_WITH_TYPE("orc", {
- dbgs() << "Loading dylib \"" << (DylibPath ? DylibPath : "") << "\" ";
- if (!DylibPath)
- dbgs() << "(process symbols)";
- dbgs() << "\n";
- });
- if (!DylibPath)
- DylibPath = "";
- auto H = EP.template callB<orcrpctpc::LoadDylib>(DylibPath);
- DEBUG_WITH_TYPE("orc", {
- if (H)
- dbgs() << " got handle " << formatv("{0:x16}", *H) << "\n";
- else
- dbgs() << " error, unable to load\n";
- });
- return H;
- }
-
- Expected<std::vector<tpctypes::LookupResult>>
- lookupSymbols(ArrayRef<LookupRequest> Request) override {
- std::vector<orcrpctpc::RemoteLookupRequest> RR;
- for (auto &E : Request) {
- RR.push_back({});
- RR.back().first = E.Handle;
- for (auto &KV : E.Symbols)
- RR.back().second.push_back(
- {(*KV.first).str(),
- KV.second == SymbolLookupFlags::WeaklyReferencedSymbol});
- }
- DEBUG_WITH_TYPE("orc", {
- dbgs() << "Compound lookup:\n";
- for (auto &R : Request) {
- dbgs() << " In " << formatv("{0:x16}", R.Handle) << ": {";
- bool First = true;
- for (auto &KV : R.Symbols) {
- dbgs() << (First ? "" : ",") << " " << *KV.first;
- First = false;
- }
- dbgs() << " }\n";
- }
- });
- return EP.template callB<orcrpctpc::LookupSymbols>(RR);
- }
-
- Expected<int32_t> runAsMain(JITTargetAddress MainFnAddr,
- ArrayRef<std::string> Args) override {
- DEBUG_WITH_TYPE("orc", {
- dbgs() << "Running as main: " << formatv("{0:x16}", MainFnAddr)
- << ", args = [";
- for (unsigned I = 0; I != Args.size(); ++I)
- dbgs() << (I ? "," : "") << " \"" << Args[I] << "\"";
- dbgs() << "]\n";
- });
- auto Result = EP.template callB<orcrpctpc::RunMain>(MainFnAddr, Args);
- DEBUG_WITH_TYPE("orc", {
- dbgs() << " call to " << formatv("{0:x16}", MainFnAddr);
- if (Result)
- dbgs() << " returned result " << *Result << "\n";
- else
- dbgs() << " failed\n";
- });
- return Result;
- }
-
- void callWrapperAsync(SendResultFunction OnComplete,
- JITTargetAddress WrapperFnAddr,
- ArrayRef<char> ArgBuffer) override {
- DEBUG_WITH_TYPE("orc", {
- dbgs() << "Running as wrapper function "
- << formatv("{0:x16}", WrapperFnAddr) << " with "
- << formatv("{0:x16}", ArgBuffer.size()) << " argument buffer\n";
- });
- auto Result = EP.template callB<orcrpctpc::RunWrapper>(
- WrapperFnAddr,
- ArrayRef<uint8_t>(reinterpret_cast<const uint8_t *>(ArgBuffer.data()),
- ArgBuffer.size()));
-
- if (!Result)
- OnComplete(shared::WrapperFunctionResult::createOutOfBandError(
- toString(Result.takeError())));
- OnComplete(std::move(*Result));
- }
-
- Error closeConnection(OnCloseConnectionFunction OnCloseConnection) {
- DEBUG_WITH_TYPE("orc", dbgs() << "Closing connection to remote\n");
- return EP.template callAsync<orcrpctpc::CloseConnection>(
- std::move(OnCloseConnection));
- }
-
- Error closeConnectionAndWait() {
- std::promise<MSVCPError> P;
- auto F = P.get_future();
- if (auto Err = closeConnection([&](Error Err2) -> Error {
- P.set_value(std::move(Err2));
- return Error::success();
- })) {
- EP.abandonAllPendingResponses();
- return joinErrors(std::move(Err), F.get());
- }
- return F.get();
- }
-
-protected:
- /// Subclasses must call this during construction to initialize the
- /// TargetTriple and PageSize members.
- Error initializeORCRPCEPCBase() {
- if (auto EPI = EP.template callB<orcrpctpc::GetExecutorProcessInfo>()) {
- this->TargetTriple = Triple(EPI->Triple);
- this->PageSize = PageSize;
- this->JDI = {ExecutorAddress(EPI->DispatchFuncAddr),
- ExecutorAddress(EPI->DispatchCtxAddr)};
- return Error::success();
- } else
- return EPI.takeError();
- }
-
-private:
- Error runWrapperInJIT(
- std::function<Error(Expected<shared::WrapperFunctionResult>)> SendResult,
- JITTargetAddress FunctionTag, std::vector<uint8_t> ArgBuffer) {
-
- getExecutionSession().runJITDispatchHandler(
- [this, SendResult = std::move(SendResult)](
- Expected<shared::WrapperFunctionResult> R) {
- if (auto Err = SendResult(std::move(R)))
- ReportError(std::move(Err));
- },
- FunctionTag,
- {reinterpret_cast<const char *>(ArgBuffer.data()), ArgBuffer.size()});
- return Error::success();
- }
-
- ErrorReporter ReportError;
- RPCEndpointT &EP;
-};
-
-} // end namespace orc
-} // end namespace llvm
-
-#endif // LLVM_EXECUTIONENGINE_ORC_ORCRPCEXECUTORPROCESSCONTROL_H
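One small pattern worth noting from the header deleted above: closeConnectionAndWait turns an asynchronous close into a blocking call by parking the result in a promise and waiting on the matching future. A reduced version using only standard types follows; the error type is simplified to a string, and closeAsync is a made-up placeholder for the RPC call.

#include <cassert>
#include <functional>
#include <future>
#include <string>
#include <utility>

using Status = std::string; // empty string means success; stands in for Error
using OnClosed = std::function<void(Status)>;

// Placeholder for an API that closes a connection asynchronously and reports
// the outcome through a callback. Here it simply succeeds at once.
void closeAsync(OnClosed Callback) { Callback(Status()); }

// Block until the asynchronous close completes, mirroring the shape of the
// deleted closeConnectionAndWait (which used std::promise<MSVCPError>).
Status closeAndWait() {
  std::promise<Status> P;
  auto F = P.get_future();
  closeAsync([&P](Status S) { P.set_value(std::move(S)); });
  return F.get();
}

int main() {
  assert(closeAndWait().empty());
  return 0;
}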
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h b/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h
deleted file mode 100644
index 3d139740d677..000000000000
--- a/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h
+++ /dev/null
@@ -1,925 +0,0 @@
-//===- OrcRemoteTargetClient.h - Orc Remote-target Client -------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the OrcRemoteTargetClient class and helpers. This class
-// can be used to communicate over an RawByteChannel with an
-// OrcRemoteTargetServer instance to support remote-JITing.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_EXECUTIONENGINE_ORC_ORCREMOTETARGETCLIENT_H
-#define LLVM_EXECUTIONENGINE_ORC_ORCREMOTETARGETCLIENT_H
-
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ExecutionEngine/JITSymbol.h"
-#include "llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h"
-#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h"
-#include "llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h"
-#include "llvm/ExecutionEngine/RuntimeDyld.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/Memory.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-#define DEBUG_TYPE "orc-remote"
-
-namespace llvm {
-namespace orc {
-namespace remote {
-
-/// This class provides utilities (including memory manager, indirect stubs
-/// manager, and compile callback manager types) that support remote JITing
-/// in ORC.
-///
-/// Each of the utility classes talks to a JIT server (an instance of the
-/// OrcRemoteTargetServer class) via an RPC system (see RPCUtils.h) to carry out
-/// its actions.
-class OrcRemoteTargetClient
- : public shared::SingleThreadedRPCEndpoint<shared::RawByteChannel> {
-public:
- /// Remote-mapped RuntimeDyld-compatible memory manager.
- class RemoteRTDyldMemoryManager : public RuntimeDyld::MemoryManager {
- friend class OrcRemoteTargetClient;
-
- public:
- ~RemoteRTDyldMemoryManager() {
- Client.destroyRemoteAllocator(Id);
- LLVM_DEBUG(dbgs() << "Destroyed remote allocator " << Id << "\n");
- }
-
- RemoteRTDyldMemoryManager(const RemoteRTDyldMemoryManager &) = delete;
- RemoteRTDyldMemoryManager &
- operator=(const RemoteRTDyldMemoryManager &) = delete;
- RemoteRTDyldMemoryManager(RemoteRTDyldMemoryManager &&) = default;
- RemoteRTDyldMemoryManager &operator=(RemoteRTDyldMemoryManager &&) = delete;
-
- uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
- unsigned SectionID,
- StringRef SectionName) override {
- Unmapped.back().CodeAllocs.emplace_back(Size, Alignment);
- uint8_t *Alloc = reinterpret_cast<uint8_t *>(
- Unmapped.back().CodeAllocs.back().getLocalAddress());
- LLVM_DEBUG(dbgs() << "Allocator " << Id << " allocated code for "
- << SectionName << ": " << Alloc << " (" << Size
- << " bytes, alignment " << Alignment << ")\n");
- return Alloc;
- }
-
- uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
- unsigned SectionID, StringRef SectionName,
- bool IsReadOnly) override {
- if (IsReadOnly) {
- Unmapped.back().RODataAllocs.emplace_back(Size, Alignment);
- uint8_t *Alloc = reinterpret_cast<uint8_t *>(
- Unmapped.back().RODataAllocs.back().getLocalAddress());
- LLVM_DEBUG(dbgs() << "Allocator " << Id << " allocated ro-data for "
- << SectionName << ": " << Alloc << " (" << Size
- << " bytes, alignment " << Alignment << ")\n");
- return Alloc;
- } // else...
-
- Unmapped.back().RWDataAllocs.emplace_back(Size, Alignment);
- uint8_t *Alloc = reinterpret_cast<uint8_t *>(
- Unmapped.back().RWDataAllocs.back().getLocalAddress());
- LLVM_DEBUG(dbgs() << "Allocator " << Id << " allocated rw-data for "
- << SectionName << ": " << Alloc << " (" << Size
- << " bytes, alignment " << Alignment << ")\n");
- return Alloc;
- }
-
- void reserveAllocationSpace(uintptr_t CodeSize, uint32_t CodeAlign,
- uintptr_t RODataSize, uint32_t RODataAlign,
- uintptr_t RWDataSize,
- uint32_t RWDataAlign) override {
- Unmapped.push_back(ObjectAllocs());
-
- LLVM_DEBUG(dbgs() << "Allocator " << Id << " reserved:\n");
-
- if (CodeSize != 0) {
- Unmapped.back().RemoteCodeAddr =
- Client.reserveMem(Id, CodeSize, CodeAlign);
-
- LLVM_DEBUG(
- dbgs() << " code: "
- << format("0x%016" PRIx64, Unmapped.back().RemoteCodeAddr)
- << " (" << CodeSize << " bytes, alignment " << CodeAlign
- << ")\n");
- }
-
- if (RODataSize != 0) {
- Unmapped.back().RemoteRODataAddr =
- Client.reserveMem(Id, RODataSize, RODataAlign);
-
- LLVM_DEBUG(
- dbgs() << " ro-data: "
- << format("0x%016" PRIx64, Unmapped.back().RemoteRODataAddr)
- << " (" << RODataSize << " bytes, alignment " << RODataAlign
- << ")\n");
- }
-
- if (RWDataSize != 0) {
- Unmapped.back().RemoteRWDataAddr =
- Client.reserveMem(Id, RWDataSize, RWDataAlign);
-
- LLVM_DEBUG(
- dbgs() << " rw-data: "
- << format("0x%016" PRIx64, Unmapped.back().RemoteRWDataAddr)
- << " (" << RWDataSize << " bytes, alignment " << RWDataAlign
- << ")\n");
- }
- }
-
- bool needsToReserveAllocationSpace() override { return true; }
-
- void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr,
- size_t Size) override {
- UnfinalizedEHFrames.push_back({LoadAddr, Size});
- }
-
- void deregisterEHFrames() override {
- for (auto &Frame : RegisteredEHFrames) {
- // FIXME: Add error poll.
- Client.deregisterEHFrames(Frame.Addr, Frame.Size);
- }
- }
-
- void notifyObjectLoaded(RuntimeDyld &Dyld,
- const object::ObjectFile &Obj) override {
- LLVM_DEBUG(dbgs() << "Allocator " << Id << " applied mappings:\n");
- for (auto &ObjAllocs : Unmapped) {
- mapAllocsToRemoteAddrs(Dyld, ObjAllocs.CodeAllocs,
- ObjAllocs.RemoteCodeAddr);
- mapAllocsToRemoteAddrs(Dyld, ObjAllocs.RODataAllocs,
- ObjAllocs.RemoteRODataAddr);
- mapAllocsToRemoteAddrs(Dyld, ObjAllocs.RWDataAllocs,
- ObjAllocs.RemoteRWDataAddr);
- Unfinalized.push_back(std::move(ObjAllocs));
- }
- Unmapped.clear();
- }
-
- bool finalizeMemory(std::string *ErrMsg = nullptr) override {
- LLVM_DEBUG(dbgs() << "Allocator " << Id << " finalizing:\n");
-
- for (auto &ObjAllocs : Unfinalized) {
- if (copyAndProtect(ObjAllocs.CodeAllocs, ObjAllocs.RemoteCodeAddr,
- sys::Memory::MF_READ | sys::Memory::MF_EXEC))
- return true;
-
- if (copyAndProtect(ObjAllocs.RODataAllocs, ObjAllocs.RemoteRODataAddr,
- sys::Memory::MF_READ))
- return true;
-
- if (copyAndProtect(ObjAllocs.RWDataAllocs, ObjAllocs.RemoteRWDataAddr,
- sys::Memory::MF_READ | sys::Memory::MF_WRITE))
- return true;
- }
- Unfinalized.clear();
-
- for (auto &EHFrame : UnfinalizedEHFrames) {
- if (auto Err = Client.registerEHFrames(EHFrame.Addr, EHFrame.Size)) {
- // FIXME: Replace this once finalizeMemory can return an Error.
- handleAllErrors(std::move(Err), [&](ErrorInfoBase &EIB) {
- if (ErrMsg) {
- raw_string_ostream ErrOut(*ErrMsg);
- EIB.log(ErrOut);
- }
- });
- return false;
- }
- }
- RegisteredEHFrames = std::move(UnfinalizedEHFrames);
- UnfinalizedEHFrames = {};
-
- return false;
- }
-
- private:
- class Alloc {
- public:
- Alloc(uint64_t Size, unsigned Align)
- : Size(Size), Align(Align), Contents(new char[Size + Align - 1]) {}
-
- Alloc(const Alloc &) = delete;
- Alloc &operator=(const Alloc &) = delete;
- Alloc(Alloc &&) = default;
- Alloc &operator=(Alloc &&) = default;
-
- uint64_t getSize() const { return Size; }
-
- unsigned getAlign() const { return Align; }
-
- char *getLocalAddress() const {
- uintptr_t LocalAddr = reinterpret_cast<uintptr_t>(Contents.get());
- LocalAddr = alignTo(LocalAddr, Align);
- return reinterpret_cast<char *>(LocalAddr);
- }
-
- void setRemoteAddress(JITTargetAddress RemoteAddr) {
- this->RemoteAddr = RemoteAddr;
- }
-
- JITTargetAddress getRemoteAddress() const { return RemoteAddr; }
-
- private:
- uint64_t Size;
- unsigned Align;
- std::unique_ptr<char[]> Contents;
- JITTargetAddress RemoteAddr = 0;
- };
-
- struct ObjectAllocs {
- ObjectAllocs() = default;
- ObjectAllocs(const ObjectAllocs &) = delete;
- ObjectAllocs &operator=(const ObjectAllocs &) = delete;
- ObjectAllocs(ObjectAllocs &&) = default;
- ObjectAllocs &operator=(ObjectAllocs &&) = default;
-
- JITTargetAddress RemoteCodeAddr = 0;
- JITTargetAddress RemoteRODataAddr = 0;
- JITTargetAddress RemoteRWDataAddr = 0;
- std::vector<Alloc> CodeAllocs, RODataAllocs, RWDataAllocs;
- };
-
- RemoteRTDyldMemoryManager(OrcRemoteTargetClient &Client,
- ResourceIdMgr::ResourceId Id)
- : Client(Client), Id(Id) {
- LLVM_DEBUG(dbgs() << "Created remote allocator " << Id << "\n");
- }
-
- // Maps all allocations in Allocs to aligned blocks
- void mapAllocsToRemoteAddrs(RuntimeDyld &Dyld, std::vector<Alloc> &Allocs,
- JITTargetAddress NextAddr) {
- for (auto &Alloc : Allocs) {
- NextAddr = alignTo(NextAddr, Alloc.getAlign());
- Dyld.mapSectionAddress(Alloc.getLocalAddress(), NextAddr);
- LLVM_DEBUG(
- dbgs() << " " << static_cast<void *>(Alloc.getLocalAddress())
- << " -> " << format("0x%016" PRIx64, NextAddr) << "\n");
- Alloc.setRemoteAddress(NextAddr);
-
- // Only advance NextAddr if it was non-null to begin with,
- // otherwise leave it as null.
- if (NextAddr)
- NextAddr += Alloc.getSize();
- }
- }
-
- // Copies data for each alloc in the list, then set permissions on the
- // segment.
- bool copyAndProtect(const std::vector<Alloc> &Allocs,
- JITTargetAddress RemoteSegmentAddr,
- unsigned Permissions) {
- if (RemoteSegmentAddr) {
- assert(!Allocs.empty() && "No sections in allocated segment");
-
- for (auto &Alloc : Allocs) {
- LLVM_DEBUG(dbgs() << " copying section: "
- << static_cast<void *>(Alloc.getLocalAddress())
- << " -> "
- << format("0x%016" PRIx64, Alloc.getRemoteAddress())
- << " (" << Alloc.getSize() << " bytes)\n";);
-
- if (Client.writeMem(Alloc.getRemoteAddress(), Alloc.getLocalAddress(),
- Alloc.getSize()))
- return true;
- }
-
- LLVM_DEBUG(dbgs() << " setting "
- << (Permissions & sys::Memory::MF_READ ? 'R' : '-')
- << (Permissions & sys::Memory::MF_WRITE ? 'W' : '-')
- << (Permissions & sys::Memory::MF_EXEC ? 'X' : '-')
- << " permissions on block: "
- << format("0x%016" PRIx64, RemoteSegmentAddr)
- << "\n");
- if (Client.setProtections(Id, RemoteSegmentAddr, Permissions))
- return true;
- }
- return false;
- }
-
- OrcRemoteTargetClient &Client;
- ResourceIdMgr::ResourceId Id;
- std::vector<ObjectAllocs> Unmapped;
- std::vector<ObjectAllocs> Unfinalized;
-
- struct EHFrame {
- JITTargetAddress Addr;
- uint64_t Size;
- };
- std::vector<EHFrame> UnfinalizedEHFrames;
- std::vector<EHFrame> RegisteredEHFrames;
- };
-
- class RPCMMAlloc : public jitlink::JITLinkMemoryManager::Allocation {
- using AllocationMap = DenseMap<unsigned, sys::MemoryBlock>;
- using FinalizeContinuation =
- jitlink::JITLinkMemoryManager::Allocation::FinalizeContinuation;
- using ProtectionFlags = sys::Memory::ProtectionFlags;
- using SegmentsRequestMap =
- DenseMap<unsigned, jitlink::JITLinkMemoryManager::SegmentRequest>;
-
- RPCMMAlloc(OrcRemoteTargetClient &Client, ResourceIdMgr::ResourceId Id)
- : Client(Client), Id(Id) {}
-
- public:
- static Expected<std::unique_ptr<RPCMMAlloc>>
- Create(OrcRemoteTargetClient &Client, ResourceIdMgr::ResourceId Id,
- const SegmentsRequestMap &Request) {
- auto *MM = new RPCMMAlloc(Client, Id);
-
- if (Error Err = MM->allocateHostBlocks(Request))
- return std::move(Err);
-
- if (Error Err = MM->allocateTargetBlocks())
- return std::move(Err);
-
- return std::unique_ptr<RPCMMAlloc>(MM);
- }
-
- MutableArrayRef<char> getWorkingMemory(ProtectionFlags Seg) override {
- assert(HostSegBlocks.count(Seg) && "No allocation for segment");
- return {static_cast<char *>(HostSegBlocks[Seg].base()),
- HostSegBlocks[Seg].allocatedSize()};
- }
-
- JITTargetAddress getTargetMemory(ProtectionFlags Seg) override {
- assert(TargetSegBlocks.count(Seg) && "No allocation for segment");
- return pointerToJITTargetAddress(TargetSegBlocks[Seg].base());
- }
-
- void finalizeAsync(FinalizeContinuation OnFinalize) override {
- // Host allocations (working memory) remain ReadWrite.
- OnFinalize(copyAndProtect());
- }
-
- Error deallocate() override {
- // TODO: Cannot release target allocation. RPCAPI has no function
- // symmetric to reserveMem(). Add RPC call like freeMem()?
- return errorCodeToError(sys::Memory::releaseMappedMemory(HostAllocation));
- }
-
- private:
- OrcRemoteTargetClient &Client;
- ResourceIdMgr::ResourceId Id;
- AllocationMap HostSegBlocks;
- AllocationMap TargetSegBlocks;
- JITTargetAddress TargetSegmentAddr;
- sys::MemoryBlock HostAllocation;
-
- Error allocateHostBlocks(const SegmentsRequestMap &Request) {
- unsigned TargetPageSize = Client.getPageSize();
-
- if (!isPowerOf2_64(static_cast<uint64_t>(TargetPageSize)))
- return make_error<StringError>("Host page size is not a power of 2",
- inconvertibleErrorCode());
-
- auto TotalSize = calcTotalAllocSize(Request, TargetPageSize);
- if (!TotalSize)
- return TotalSize.takeError();
-
- // Allocate one slab to cover all the segments.
- const sys::Memory::ProtectionFlags ReadWrite =
- static_cast<sys::Memory::ProtectionFlags>(sys::Memory::MF_READ |
- sys::Memory::MF_WRITE);
- std::error_code EC;
- HostAllocation =
- sys::Memory::allocateMappedMemory(*TotalSize, nullptr, ReadWrite, EC);
- if (EC)
- return errorCodeToError(EC);
-
- char *SlabAddr = static_cast<char *>(HostAllocation.base());
-#ifndef NDEBUG
- char *SlabAddrEnd = SlabAddr + HostAllocation.allocatedSize();
-#endif
-
- // Allocate segment memory from the slab.
- for (auto &KV : Request) {
- const auto &Seg = KV.second;
-
- uint64_t SegmentSize = Seg.getContentSize() + Seg.getZeroFillSize();
- uint64_t AlignedSegmentSize = alignTo(SegmentSize, TargetPageSize);
-
- // Zero out zero-fill memory.
- char *ZeroFillBegin = SlabAddr + Seg.getContentSize();
- memset(ZeroFillBegin, 0, Seg.getZeroFillSize());
-
- // Record the block for this segment.
- HostSegBlocks[KV.first] =
- sys::MemoryBlock(SlabAddr, AlignedSegmentSize);
-
- SlabAddr += AlignedSegmentSize;
- assert(SlabAddr <= SlabAddrEnd && "Out of range");
- }
-
- return Error::success();
- }
-
- Error allocateTargetBlocks() {
- // Reserve memory for all blocks on the target. We need as much space on
- // the target as we allocated on the host.
- TargetSegmentAddr = Client.reserveMem(Id, HostAllocation.allocatedSize(),
- Client.getPageSize());
- if (!TargetSegmentAddr)
- return make_error<StringError>("Failed to reserve memory on the target",
- inconvertibleErrorCode());
-
- // Map memory blocks into the allocation, that match the host allocation.
- JITTargetAddress TargetAllocAddr = TargetSegmentAddr;
- for (const auto &KV : HostSegBlocks) {
- size_t TargetAllocSize = KV.second.allocatedSize();
-
- TargetSegBlocks[KV.first] =
- sys::MemoryBlock(jitTargetAddressToPointer<void *>(TargetAllocAddr),
- TargetAllocSize);
-
- TargetAllocAddr += TargetAllocSize;
- assert(TargetAllocAddr - TargetSegmentAddr <=
- HostAllocation.allocatedSize() &&
- "Out of range on target");
- }
-
- return Error::success();
- }
-
- Error copyAndProtect() {
- unsigned Permissions = 0u;
-
- // Copy segments one by one.
- for (auto &KV : TargetSegBlocks) {
- Permissions |= KV.first;
-
- const sys::MemoryBlock &TargetBlock = KV.second;
- const sys::MemoryBlock &HostBlock = HostSegBlocks.lookup(KV.first);
-
- size_t TargetAllocSize = TargetBlock.allocatedSize();
- auto TargetAllocAddr = pointerToJITTargetAddress(TargetBlock.base());
- auto *HostAllocBegin = static_cast<const char *>(HostBlock.base());
-
- bool CopyErr =
- Client.writeMem(TargetAllocAddr, HostAllocBegin, TargetAllocSize);
- if (CopyErr)
- return createStringError(inconvertibleErrorCode(),
- "Failed to copy %d segment to the target",
- KV.first);
- }
-
- // Set permission flags for all segments at once.
- bool ProtectErr =
- Client.setProtections(Id, TargetSegmentAddr, Permissions);
- if (ProtectErr)
- return createStringError(inconvertibleErrorCode(),
- "Failed to apply permissions for %d segment "
- "on the target",
- Permissions);
- return Error::success();
- }
-
- static Expected<size_t>
- calcTotalAllocSize(const SegmentsRequestMap &Request,
- unsigned TargetPageSize) {
- size_t TotalSize = 0;
- for (const auto &KV : Request) {
- const auto &Seg = KV.second;
-
- if (Seg.getAlignment() > TargetPageSize)
- return make_error<StringError>("Cannot request alignment higher than "
- "page alignment on target",
- inconvertibleErrorCode());
-
- TotalSize = alignTo(TotalSize, TargetPageSize);
- TotalSize += Seg.getContentSize();
- TotalSize += Seg.getZeroFillSize();
- }
-
- return TotalSize;
- }
- };
-
- class RemoteJITLinkMemoryManager : public jitlink::JITLinkMemoryManager {
- public:
- RemoteJITLinkMemoryManager(OrcRemoteTargetClient &Client,
- ResourceIdMgr::ResourceId Id)
- : Client(Client), Id(Id) {}
-
- RemoteJITLinkMemoryManager(const RemoteJITLinkMemoryManager &) = delete;
- RemoteJITLinkMemoryManager(RemoteJITLinkMemoryManager &&) = default;
-
- RemoteJITLinkMemoryManager &
- operator=(const RemoteJITLinkMemoryManager &) = delete;
- RemoteJITLinkMemoryManager &
- operator=(RemoteJITLinkMemoryManager &&) = delete;
-
- ~RemoteJITLinkMemoryManager() {
- Client.destroyRemoteAllocator(Id);
- LLVM_DEBUG(dbgs() << "Destroyed remote allocator " << Id << "\n");
- }
-
- Expected<std::unique_ptr<Allocation>>
- allocate(const jitlink::JITLinkDylib *JD,
- const SegmentsRequestMap &Request) override {
- return RPCMMAlloc::Create(Client, Id, Request);
- }
-
- private:
- OrcRemoteTargetClient &Client;
- ResourceIdMgr::ResourceId Id;
- };
-
- /// Remote indirect stubs manager.
- class RemoteIndirectStubsManager : public IndirectStubsManager {
- public:
- RemoteIndirectStubsManager(OrcRemoteTargetClient &Client,
- ResourceIdMgr::ResourceId Id)
- : Client(Client), Id(Id) {}
-
- ~RemoteIndirectStubsManager() override {
- Client.destroyIndirectStubsManager(Id);
- }
-
- Error createStub(StringRef StubName, JITTargetAddress StubAddr,
- JITSymbolFlags StubFlags) override {
- if (auto Err = reserveStubs(1))
- return Err;
-
- return createStubInternal(StubName, StubAddr, StubFlags);
- }
-
- Error createStubs(const StubInitsMap &StubInits) override {
- if (auto Err = reserveStubs(StubInits.size()))
- return Err;
-
- for (auto &Entry : StubInits)
- if (auto Err = createStubInternal(Entry.first(), Entry.second.first,
- Entry.second.second))
- return Err;
-
- return Error::success();
- }
-
- JITEvaluatedSymbol findStub(StringRef Name, bool ExportedStubsOnly) override {
- auto I = StubIndexes.find(Name);
- if (I == StubIndexes.end())
- return nullptr;
- auto Key = I->second.first;
- auto Flags = I->second.second;
- auto StubSymbol = JITEvaluatedSymbol(getStubAddr(Key), Flags);
- if (ExportedStubsOnly && !StubSymbol.getFlags().isExported())
- return nullptr;
- return StubSymbol;
- }
-
- JITEvaluatedSymbol findPointer(StringRef Name) override {
- auto I = StubIndexes.find(Name);
- if (I == StubIndexes.end())
- return nullptr;
- auto Key = I->second.first;
- auto Flags = I->second.second;
- return JITEvaluatedSymbol(getPtrAddr(Key), Flags);
- }
-
- Error updatePointer(StringRef Name, JITTargetAddress NewAddr) override {
- auto I = StubIndexes.find(Name);
- assert(I != StubIndexes.end() && "No stub pointer for symbol");
- auto Key = I->second.first;
- return Client.writePointer(getPtrAddr(Key), NewAddr);
- }
-
- private:
- struct RemoteIndirectStubsInfo {
- JITTargetAddress StubBase;
- JITTargetAddress PtrBase;
- unsigned NumStubs;
- };
-
- using StubKey = std::pair<uint16_t, uint16_t>;
-
- Error reserveStubs(unsigned NumStubs) {
- if (NumStubs <= FreeStubs.size())
- return Error::success();
-
- unsigned NewStubsRequired = NumStubs - FreeStubs.size();
- JITTargetAddress StubBase;
- JITTargetAddress PtrBase;
- unsigned NumStubsEmitted;
-
- if (auto StubInfoOrErr = Client.emitIndirectStubs(Id, NewStubsRequired))
- std::tie(StubBase, PtrBase, NumStubsEmitted) = *StubInfoOrErr;
- else
- return StubInfoOrErr.takeError();
-
- unsigned NewBlockId = RemoteIndirectStubsInfos.size();
- RemoteIndirectStubsInfos.push_back({StubBase, PtrBase, NumStubsEmitted});
-
- for (unsigned I = 0; I < NumStubsEmitted; ++I)
- FreeStubs.push_back(std::make_pair(NewBlockId, I));
-
- return Error::success();
- }
-
- Error createStubInternal(StringRef StubName, JITTargetAddress InitAddr,
- JITSymbolFlags StubFlags) {
- auto Key = FreeStubs.back();
- FreeStubs.pop_back();
- StubIndexes[StubName] = std::make_pair(Key, StubFlags);
- return Client.writePointer(getPtrAddr(Key), InitAddr);
- }
-
- JITTargetAddress getStubAddr(StubKey K) {
- assert(RemoteIndirectStubsInfos[K.first].StubBase != 0 &&
- "Missing stub address");
- return RemoteIndirectStubsInfos[K.first].StubBase +
- K.second * Client.getIndirectStubSize();
- }
-
- JITTargetAddress getPtrAddr(StubKey K) {
- assert(RemoteIndirectStubsInfos[K.first].PtrBase != 0 &&
- "Missing pointer address");
- return RemoteIndirectStubsInfos[K.first].PtrBase +
- K.second * Client.getPointerSize();
- }
-
- OrcRemoteTargetClient &Client;
- ResourceIdMgr::ResourceId Id;
- std::vector<RemoteIndirectStubsInfo> RemoteIndirectStubsInfos;
- std::vector<StubKey> FreeStubs;
- StringMap<std::pair<StubKey, JITSymbolFlags>> StubIndexes;
- };
-
- class RemoteTrampolinePool : public TrampolinePool {
- public:
- RemoteTrampolinePool(OrcRemoteTargetClient &Client) : Client(Client) {}
-
- private:
- Error grow() override {
- JITTargetAddress BlockAddr = 0;
- uint32_t NumTrampolines = 0;
- if (auto TrampolineInfoOrErr = Client.emitTrampolineBlock())
- std::tie(BlockAddr, NumTrampolines) = *TrampolineInfoOrErr;
- else
- return TrampolineInfoOrErr.takeError();
-
- uint32_t TrampolineSize = Client.getTrampolineSize();
- for (unsigned I = 0; I < NumTrampolines; ++I)
- AvailableTrampolines.push_back(BlockAddr + (I * TrampolineSize));
-
- return Error::success();
- }
-
- OrcRemoteTargetClient &Client;
- };
-
- /// Remote compile callback manager.
- class RemoteCompileCallbackManager : public JITCompileCallbackManager {
- public:
- RemoteCompileCallbackManager(OrcRemoteTargetClient &Client,
- ExecutionSession &ES,
- JITTargetAddress ErrorHandlerAddress)
- : JITCompileCallbackManager(
- std::make_unique<RemoteTrampolinePool>(Client), ES,
- ErrorHandlerAddress) {}
- };
-
- /// Create an OrcRemoteTargetClient.
- /// Channel is the ChannelT instance to communicate on. It is assumed that
- /// the channel is ready to be read from and written to.
- static Expected<std::unique_ptr<OrcRemoteTargetClient>>
- Create(shared::RawByteChannel &Channel, ExecutionSession &ES) {
- Error Err = Error::success();
- auto Client = std::unique_ptr<OrcRemoteTargetClient>(
- new OrcRemoteTargetClient(Channel, ES, Err));
- if (Err)
- return std::move(Err);
- return std::move(Client);
- }
-
- /// Call the int(void) function at the given address in the target and return
- /// its result.
- Expected<int> callIntVoid(JITTargetAddress Addr) {
- LLVM_DEBUG(dbgs() << "Calling int(*)(void) "
- << format("0x%016" PRIx64, Addr) << "\n");
- return callB<exec::CallIntVoid>(Addr);
- }
-
- /// Call the int(int) function at the given address in the target and return
- /// its result.
- Expected<int> callIntInt(JITTargetAddress Addr, int Arg) {
- LLVM_DEBUG(dbgs() << "Calling int(*)(int) " << format("0x%016" PRIx64, Addr)
- << "\n");
- return callB<exec::CallIntInt>(Addr, Arg);
- }
-
- /// Call the int(int, char*[]) function at the given address in the target and
- /// return its result.
- Expected<int> callMain(JITTargetAddress Addr,
- const std::vector<std::string> &Args) {
- LLVM_DEBUG(dbgs() << "Calling int(*)(int, char*[]) "
- << format("0x%016" PRIx64, Addr) << "\n");
- return callB<exec::CallMain>(Addr, Args);
- }
-
- /// Call the void() function at the given address in the target and wait for
- /// it to finish.
- Error callVoidVoid(JITTargetAddress Addr) {
- LLVM_DEBUG(dbgs() << "Calling void(*)(void) "
- << format("0x%016" PRIx64, Addr) << "\n");
- return callB<exec::CallVoidVoid>(Addr);
- }
-
- /// Create an RCMemoryManager which will allocate its memory on the remote
- /// target.
- Expected<std::unique_ptr<RemoteRTDyldMemoryManager>>
- createRemoteMemoryManager() {
- auto Id = AllocatorIds.getNext();
- if (auto Err = callB<mem::CreateRemoteAllocator>(Id))
- return std::move(Err);
- return std::unique_ptr<RemoteRTDyldMemoryManager>(
- new RemoteRTDyldMemoryManager(*this, Id));
- }
-
- /// Create a JITLink-compatible memory manager which will allocate working
- /// memory on the host and target memory on the remote target.
- Expected<std::unique_ptr<RemoteJITLinkMemoryManager>>
- createRemoteJITLinkMemoryManager() {
- auto Id = AllocatorIds.getNext();
- if (auto Err = callB<mem::CreateRemoteAllocator>(Id))
- return std::move(Err);
- LLVM_DEBUG(dbgs() << "Created remote allocator " << Id << "\n");
- return std::unique_ptr<RemoteJITLinkMemoryManager>(
- new RemoteJITLinkMemoryManager(*this, Id));
- }
-
- /// Create an RCIndirectStubsManager that will allocate stubs on the remote
- /// target.
- Expected<std::unique_ptr<RemoteIndirectStubsManager>>
- createIndirectStubsManager() {
- auto Id = IndirectStubOwnerIds.getNext();
- if (auto Err = callB<stubs::CreateIndirectStubsOwner>(Id))
- return std::move(Err);
- return std::make_unique<RemoteIndirectStubsManager>(*this, Id);
- }
-
- Expected<RemoteCompileCallbackManager &>
- enableCompileCallbacks(JITTargetAddress ErrorHandlerAddress) {
- assert(!CallbackManager && "CallbackManager already obtained");
-
- // Emit the resolver block on the JIT server.
- if (auto Err = callB<stubs::EmitResolverBlock>())
- return std::move(Err);
-
- // Create the callback manager.
- CallbackManager.emplace(*this, ES, ErrorHandlerAddress);
- RemoteCompileCallbackManager &Mgr = *CallbackManager;
- return Mgr;
- }
-
- /// Search for symbols in the remote process. Note: This should be used by
- /// symbol resolvers *after* they've searched the local symbol table in the
- /// JIT stack.
- Expected<JITTargetAddress> getSymbolAddress(StringRef Name) {
- return callB<utils::GetSymbolAddress>(Name);
- }
-
- /// Get the triple for the remote target.
- const std::string &getTargetTriple() const { return RemoteTargetTriple; }
-
- Error terminateSession() { return callB<utils::TerminateSession>(); }
-
-private:
- OrcRemoteTargetClient(shared::RawByteChannel &Channel, ExecutionSession &ES,
- Error &Err)
- : shared::SingleThreadedRPCEndpoint<shared::RawByteChannel>(Channel,
- true),
- ES(ES) {
- ErrorAsOutParameter EAO(&Err);
-
- addHandler<utils::RequestCompile>(
- [this](JITTargetAddress Addr) -> JITTargetAddress {
- if (CallbackManager)
- return CallbackManager->executeCompileCallback(Addr);
- return 0;
- });
-
- if (auto RIOrErr = callB<utils::GetRemoteInfo>()) {
- std::tie(RemoteTargetTriple, RemotePointerSize, RemotePageSize,
- RemoteTrampolineSize, RemoteIndirectStubSize) = *RIOrErr;
- Err = Error::success();
- } else
- Err = RIOrErr.takeError();
- }
-
- void deregisterEHFrames(JITTargetAddress Addr, uint32_t Size) {
- if (auto Err = callB<eh::RegisterEHFrames>(Addr, Size))
- ES.reportError(std::move(Err));
- }
-
- void destroyRemoteAllocator(ResourceIdMgr::ResourceId Id) {
- if (auto Err = callB<mem::DestroyRemoteAllocator>(Id)) {
- // FIXME: This will be triggered by a removeModuleSet call: Propagate
- // error return up through that.
- llvm_unreachable("Failed to destroy remote allocator.");
- AllocatorIds.release(Id);
- }
- }
-
- void destroyIndirectStubsManager(ResourceIdMgr::ResourceId Id) {
- IndirectStubOwnerIds.release(Id);
- if (auto Err = callB<stubs::DestroyIndirectStubsOwner>(Id))
- ES.reportError(std::move(Err));
- }
-
- Expected<std::tuple<JITTargetAddress, JITTargetAddress, uint32_t>>
- emitIndirectStubs(ResourceIdMgr::ResourceId Id, uint32_t NumStubsRequired) {
- return callB<stubs::EmitIndirectStubs>(Id, NumStubsRequired);
- }
-
- Expected<std::tuple<JITTargetAddress, uint32_t>> emitTrampolineBlock() {
- return callB<stubs::EmitTrampolineBlock>();
- }
-
- uint32_t getIndirectStubSize() const { return RemoteIndirectStubSize; }
- uint32_t getPageSize() const { return RemotePageSize; }
- uint32_t getPointerSize() const { return RemotePointerSize; }
-
- uint32_t getTrampolineSize() const { return RemoteTrampolineSize; }
-
- Expected<std::vector<uint8_t>> readMem(char *Dst, JITTargetAddress Src,
- uint64_t Size) {
- return callB<mem::ReadMem>(Src, Size);
- }
-
- Error registerEHFrames(JITTargetAddress &RAddr, uint32_t Size) {
- // FIXME: Duplicate error and report it via ReportError too?
- return callB<eh::RegisterEHFrames>(RAddr, Size);
- }
-
- JITTargetAddress reserveMem(ResourceIdMgr::ResourceId Id, uint64_t Size,
- uint32_t Align) {
- if (auto AddrOrErr = callB<mem::ReserveMem>(Id, Size, Align))
- return *AddrOrErr;
- else {
- ES.reportError(AddrOrErr.takeError());
- return 0;
- }
- }
-
- bool setProtections(ResourceIdMgr::ResourceId Id,
- JITTargetAddress RemoteSegAddr, unsigned ProtFlags) {
- if (auto Err = callB<mem::SetProtections>(Id, RemoteSegAddr, ProtFlags)) {
- ES.reportError(std::move(Err));
- return true;
- } else
- return false;
- }
-
- bool writeMem(JITTargetAddress Addr, const char *Src, uint64_t Size) {
- if (auto Err = callB<mem::WriteMem>(DirectBufferWriter(Src, Addr, Size))) {
- ES.reportError(std::move(Err));
- return true;
- } else
- return false;
- }
-
- Error writePointer(JITTargetAddress Addr, JITTargetAddress PtrVal) {
- return callB<mem::WritePtr>(Addr, PtrVal);
- }
-
- static Error doNothing() { return Error::success(); }
-
- ExecutionSession &ES;
- std::function<void(Error)> ReportError;
- std::string RemoteTargetTriple;
- uint32_t RemotePointerSize = 0;
- uint32_t RemotePageSize = 0;
- uint32_t RemoteTrampolineSize = 0;
- uint32_t RemoteIndirectStubSize = 0;
- ResourceIdMgr AllocatorIds, IndirectStubOwnerIds;
- Optional<RemoteCompileCallbackManager> CallbackManager;
-};
-
-} // end namespace remote
-} // end namespace orc
-} // end namespace llvm
-
-#undef DEBUG_TYPE
-
-#endif // LLVM_EXECUTIONENGINE_ORC_ORCREMOTETARGETCLIENT_H
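The indirect-stub bookkeeping in the client removed above boils down to address arithmetic: a stub is identified by a (block, index) pair and resolved as block base plus index times entry size, once for the stub and once for its pointer slot. A self-contained sketch of that lookup follows; the sizes and base addresses in main are invented for the example, not values obtained from any target.

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using TargetAddress = uint64_t;
using StubKey = std::pair<uint16_t, uint16_t>; // (block index, entry index)

struct StubBlock {
  TargetAddress StubBase; // address of the first stub in the block
  TargetAddress PtrBase;  // address of the first pointer slot in the block
  unsigned NumStubs;
};

// Mirrors the arithmetic of the deleted getStubAddr/getPtrAddr helpers.
TargetAddress stubAddr(const std::vector<StubBlock> &Blocks, StubKey K,
                       uint32_t StubSize) {
  return Blocks[K.first].StubBase + K.second * StubSize;
}

TargetAddress ptrAddr(const std::vector<StubBlock> &Blocks, StubKey K,
                      uint32_t PtrSize) {
  return Blocks[K.first].PtrBase + K.second * PtrSize;
}

int main() {
  // Hypothetical layout: one block of four stubs at 0x1000, with the
  // corresponding pointer slots at 0x2000.
  std::vector<StubBlock> Blocks = {{0x1000, 0x2000, 4}};
  assert(stubAddr(Blocks, {0, 2}, 16) == 0x1020); // third stub
  assert(ptrAddr(Blocks, {0, 2}, 8) == 0x2010);   // its pointer slot
  return 0;
}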
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h b/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h
deleted file mode 100644
index 367bfb369191..000000000000
--- a/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h
+++ /dev/null
@@ -1,386 +0,0 @@
-//===- OrcRemoteTargetRPCAPI.h - Orc Remote-target RPC API ------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the Orc remote-target RPC API. It should not be used
-// directly, but is used by the RemoteTargetClient and RemoteTargetServer
-// classes.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_EXECUTIONENGINE_ORC_ORCREMOTETARGETRPCAPI_H
-#define LLVM_EXECUTIONENGINE_ORC_ORCREMOTETARGETRPCAPI_H
-
-#include "llvm/ExecutionEngine/JITSymbol.h"
-#include "llvm/ExecutionEngine/Orc/Shared/RPCUtils.h"
-#include "llvm/ExecutionEngine/Orc/Shared/RawByteChannel.h"
-
-namespace llvm {
-namespace orc {
-
-namespace remote {
-
-/// Template error for missing resources.
-template <typename ResourceIdT>
-class ResourceNotFound
- : public ErrorInfo<ResourceNotFound<ResourceIdT>> {
-public:
- static char ID;
-
- ResourceNotFound(ResourceIdT ResourceId,
- std::string ResourceDescription = "")
- : ResourceId(std::move(ResourceId)),
- ResourceDescription(std::move(ResourceDescription)) {}
-
- std::error_code convertToErrorCode() const override {
- return orcError(OrcErrorCode::UnknownResourceHandle);
- }
-
- void log(raw_ostream &OS) const override {
- OS << (ResourceDescription.empty()
- ? "Remote resource with id "
- : ResourceDescription)
- << " " << ResourceId << " not found";
- }
-
-private:
- ResourceIdT ResourceId;
- std::string ResourceDescription;
-};
-
-template <typename ResourceIdT>
-char ResourceNotFound<ResourceIdT>::ID = 0;
-
-class DirectBufferWriter {
-public:
- DirectBufferWriter() = default;
- DirectBufferWriter(const char *Src, JITTargetAddress Dst, uint64_t Size)
- : Src(Src), Dst(Dst), Size(Size) {}
-
- const char *getSrc() const { return Src; }
- JITTargetAddress getDst() const { return Dst; }
- uint64_t getSize() const { return Size; }
-
-private:
- const char *Src;
- JITTargetAddress Dst;
- uint64_t Size;
-};
-
-} // end namespace remote
-
-namespace shared {
-
-template <> class SerializationTypeName<JITSymbolFlags> {
-public:
- static const char *getName() { return "JITSymbolFlags"; }
-};
-
-template <typename ChannelT>
-class SerializationTraits<ChannelT, JITSymbolFlags> {
-public:
-
- static Error serialize(ChannelT &C, const JITSymbolFlags &Flags) {
- return serializeSeq(C, Flags.getRawFlagsValue(), Flags.getTargetFlags());
- }
-
- static Error deserialize(ChannelT &C, JITSymbolFlags &Flags) {
- JITSymbolFlags::UnderlyingType JITFlags;
- JITSymbolFlags::TargetFlagsType TargetFlags;
- if (auto Err = deserializeSeq(C, JITFlags, TargetFlags))
- return Err;
- Flags = JITSymbolFlags(static_cast<JITSymbolFlags::FlagNames>(JITFlags),
- TargetFlags);
- return Error::success();
- }
-};
-
-template <> class SerializationTypeName<remote::DirectBufferWriter> {
-public:
- static const char *getName() { return "DirectBufferWriter"; }
-};
-
-template <typename ChannelT>
-class SerializationTraits<
- ChannelT, remote::DirectBufferWriter, remote::DirectBufferWriter,
- std::enable_if_t<std::is_base_of<RawByteChannel, ChannelT>::value>> {
-public:
- static Error serialize(ChannelT &C, const remote::DirectBufferWriter &DBW) {
- if (auto EC = serializeSeq(C, DBW.getDst()))
- return EC;
- if (auto EC = serializeSeq(C, DBW.getSize()))
- return EC;
- return C.appendBytes(DBW.getSrc(), DBW.getSize());
- }
-
- static Error deserialize(ChannelT &C, remote::DirectBufferWriter &DBW) {
- JITTargetAddress Dst;
- if (auto EC = deserializeSeq(C, Dst))
- return EC;
- uint64_t Size;
- if (auto EC = deserializeSeq(C, Size))
- return EC;
- char *Addr = reinterpret_cast<char *>(static_cast<uintptr_t>(Dst));
-
- DBW = remote::DirectBufferWriter(nullptr, Dst, Size);
-
- return C.readBytes(Addr, Size);
- }
-};
-
-} // end namespace shared
-
-namespace remote {
-
-class ResourceIdMgr {
-public:
- using ResourceId = uint64_t;
- static const ResourceId InvalidId = ~0U;
-
- ResourceIdMgr() = default;
- explicit ResourceIdMgr(ResourceId FirstValidId)
- : NextId(std::move(FirstValidId)) {}
-
- ResourceId getNext() {
- if (!FreeIds.empty()) {
- ResourceId I = FreeIds.back();
- FreeIds.pop_back();
- return I;
- }
- assert(NextId + 1 != ~0ULL && "All ids allocated");
- return NextId++;
- }
-
- void release(ResourceId I) { FreeIds.push_back(I); }
-
-private:
- ResourceId NextId = 1;
- std::vector<ResourceId> FreeIds;
-};
-
-/// Registers EH frames on the remote.
-namespace eh {
-
- /// Registers EH frames on the remote.
-class RegisterEHFrames
- : public shared::RPCFunction<RegisterEHFrames,
- void(JITTargetAddress Addr, uint32_t Size)> {
-public:
- static const char *getName() { return "RegisterEHFrames"; }
-};
-
- /// Deregisters EH frames on the remote.
-class DeregisterEHFrames
- : public shared::RPCFunction<DeregisterEHFrames,
- void(JITTargetAddress Addr, uint32_t Size)> {
-public:
- static const char *getName() { return "DeregisterEHFrames"; }
-};
-
-} // end namespace eh
-
-/// RPC functions for executing remote code.
-namespace exec {
-
- /// Call an 'int32_t()'-type function on the remote, returns the called
- /// function's return value.
-class CallIntVoid
- : public shared::RPCFunction<CallIntVoid, int32_t(JITTargetAddress Addr)> {
-public:
- static const char *getName() { return "CallIntVoid"; }
-};
-
- /// Call an 'int32_t(int32_t)'-type function on the remote, returns the called
- /// function's return value.
-class CallIntInt
- : public shared::RPCFunction<CallIntInt,
- int32_t(JITTargetAddress Addr, int)> {
-public:
- static const char *getName() { return "CallIntInt"; }
-};
-
- /// Call an 'int32_t(int32_t, char**)'-type function on the remote, returns the
- /// called function's return value.
-class CallMain
- : public shared::RPCFunction<CallMain,
- int32_t(JITTargetAddress Addr,
- std::vector<std::string> Args)> {
-public:
- static const char *getName() { return "CallMain"; }
-};
-
- /// Calls a 'void()'-type function on the remote, returns when the called
- /// function completes.
-class CallVoidVoid
- : public shared::RPCFunction<CallVoidVoid, void(JITTargetAddress FnAddr)> {
-public:
- static const char *getName() { return "CallVoidVoid"; }
-};
-
-} // end namespace exec
-
-/// RPC functions for remote memory management / inspection / modification.
-namespace mem {
-
- /// Creates a memory allocator on the remote.
-class CreateRemoteAllocator
- : public shared::RPCFunction<CreateRemoteAllocator,
- void(ResourceIdMgr::ResourceId AllocatorID)> {
-public:
- static const char *getName() { return "CreateRemoteAllocator"; }
-};
-
- /// Destroys a remote allocator, freeing any memory allocated by it.
-class DestroyRemoteAllocator
- : public shared::RPCFunction<DestroyRemoteAllocator,
- void(ResourceIdMgr::ResourceId AllocatorID)> {
-public:
- static const char *getName() { return "DestroyRemoteAllocator"; }
-};
-
- /// Read a remote memory block.
-class ReadMem
- : public shared::RPCFunction<
- ReadMem, std::vector<uint8_t>(JITTargetAddress Src, uint64_t Size)> {
-public:
- static const char *getName() { return "ReadMem"; }
-};
-
- /// Reserve a block of memory on the remote via the given allocator.
-class ReserveMem
- : public shared::RPCFunction<
- ReserveMem, JITTargetAddress(ResourceIdMgr::ResourceId AllocID,
- uint64_t Size, uint32_t Align)> {
-public:
- static const char *getName() { return "ReserveMem"; }
-};
-
- /// Set the memory protection on a memory block.
-class SetProtections
- : public shared::RPCFunction<
- SetProtections, void(ResourceIdMgr::ResourceId AllocID,
- JITTargetAddress Dst, uint32_t ProtFlags)> {
-public:
- static const char *getName() { return "SetProtections"; }
-};
-
- /// Write to a remote memory block.
-class WriteMem
- : public shared::RPCFunction<WriteMem,
- void(remote::DirectBufferWriter DB)> {
-public:
- static const char *getName() { return "WriteMem"; }
-};
-
- /// Write to a remote pointer.
-class WritePtr
- : public shared::RPCFunction<WritePtr, void(JITTargetAddress Dst,
- JITTargetAddress Val)> {
-public:
- static const char *getName() { return "WritePtr"; }
-};
-
-} // end namespace mem
-
-/// RPC functions for remote stub and trampoline management.
-namespace stubs {
-
- /// Creates an indirect stub owner on the remote.
-class CreateIndirectStubsOwner
- : public shared::RPCFunction<CreateIndirectStubsOwner,
- void(ResourceIdMgr::ResourceId StubOwnerID)> {
-public:
- static const char *getName() { return "CreateIndirectStubsOwner"; }
-};
-
- /// RPC function for destroying an indirect stubs owner.
-class DestroyIndirectStubsOwner
- : public shared::RPCFunction<DestroyIndirectStubsOwner,
- void(ResourceIdMgr::ResourceId StubsOwnerID)> {
-public:
- static const char *getName() { return "DestroyIndirectStubsOwner"; }
-};
-
- /// EmitIndirectStubs result is (StubsBase, PtrsBase, NumStubsEmitted).
-class EmitIndirectStubs
- : public shared::RPCFunction<
- EmitIndirectStubs,
- std::tuple<JITTargetAddress, JITTargetAddress, uint32_t>(
- ResourceIdMgr::ResourceId StubsOwnerID,
- uint32_t NumStubsRequired)> {
-public:
- static const char *getName() { return "EmitIndirectStubs"; }
-};
-
- /// RPC function to emit the resolver block and return its address.
-class EmitResolverBlock
- : public shared::RPCFunction<EmitResolverBlock, void()> {
-public:
- static const char *getName() { return "EmitResolverBlock"; }
-};
-
- /// EmitTrampolineBlock result is (BlockAddr, NumTrampolines).
-class EmitTrampolineBlock
- : public shared::RPCFunction<EmitTrampolineBlock,
- std::tuple<JITTargetAddress, uint32_t>()> {
-public:
- static const char *getName() { return "EmitTrampolineBlock"; }
-};
-
-} // end namespace stubs
-
-/// Miscellaneous RPC functions for dealing with remotes.
-namespace utils {
-
- /// GetRemoteInfo result is (Triple, PointerSize, PageSize, TrampolineSize,
- /// IndirectStubsSize).
-class GetRemoteInfo
- : public shared::RPCFunction<
- GetRemoteInfo,
- std::tuple<std::string, uint32_t, uint32_t, uint32_t, uint32_t>()> {
-public:
- static const char *getName() { return "GetRemoteInfo"; }
-};
-
- /// Get the address of a remote symbol.
-class GetSymbolAddress
- : public shared::RPCFunction<GetSymbolAddress,
- JITTargetAddress(std::string SymbolName)> {
-public:
- static const char *getName() { return "GetSymbolAddress"; }
-};
-
- /// Request that the host execute a compile callback.
-class RequestCompile
- : public shared::RPCFunction<
- RequestCompile, JITTargetAddress(JITTargetAddress TrampolineAddr)> {
-public:
- static const char *getName() { return "RequestCompile"; }
-};
-
- /// Notify the remote and terminate the session.
-class TerminateSession : public shared::RPCFunction<TerminateSession, void()> {
-public:
- static const char *getName() { return "TerminateSession"; }
-};
-
-} // namespace utils
-
-class OrcRemoteTargetRPCAPI
- : public shared::SingleThreadedRPCEndpoint<shared::RawByteChannel> {
-public:
- // FIXME: Remove constructors once MSVC supports synthesizing move-ops.
- OrcRemoteTargetRPCAPI(shared::RawByteChannel &C)
- : shared::SingleThreadedRPCEndpoint<shared::RawByteChannel>(C, true) {}
-};
-
-} // end namespace remote
-
-} // end namespace orc
-} // end namespace llvm
-
-#endif // LLVM_EXECUTIONENGINE_ORC_ORCREMOTETARGETRPCAPI_H
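
For orientation (not part of the patch itself): every operation in the header deleted above follows the same pattern, a class deriving from shared::RPCFunction with the wire signature as a template argument and a getName() string used during function-id negotiation. A minimal sketch of that pattern, using a hypothetical AddInts function and assuming the RPC machinery this patch removes elsewhere (RPCUtils.h) is still available:

// Sketch only. AddInts is hypothetical; RPCFunction, callB and addHandler come
// from the RPC utilities that this patch deletes.
#include "llvm/ExecutionEngine/Orc/Shared/RPCUtils.h"

namespace example {

class AddInts
    : public llvm::orc::shared::RPCFunction<AddInts, int32_t(int32_t, int32_t)> {
public:
  static const char *getName() { return "AddInts"; }
};

} // end namespace example

// A client endpoint would issue a blocking call:
//   Expected<int32_t> Sum = Client.callB<example::AddInts>(2, 3);
// and a server would install a handler for it (as the server header below does
// in its constructor):
//   addHandler<example::AddInts>([](int32_t A, int32_t B) { return A + B; });
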
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h b/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h
deleted file mode 100644
index ce9bf064303d..000000000000
--- a/llvm/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h
+++ /dev/null
@@ -1,464 +0,0 @@
-//===- OrcRemoteTargetServer.h - Orc Remote-target Server -------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the OrcRemoteTargetServer class. It can be used to build a
-// JIT server that can execute code sent from an OrcRemoteTargetClient.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_EXECUTIONENGINE_ORC_ORCREMOTETARGETSERVER_H
-#define LLVM_EXECUTIONENGINE_ORC_ORCREMOTETARGETSERVER_H
-
-#include "llvm/ExecutionEngine/JITSymbol.h"
-#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h"
-#include "llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h"
-#include "llvm/ExecutionEngine/Orc/Shared/OrcError.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/Host.h"
-#include "llvm/Support/Memory.h"
-#include "llvm/Support/Process.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <functional>
-#include <map>
-#include <memory>
-#include <string>
-#include <system_error>
-#include <tuple>
-#include <type_traits>
-#include <vector>
-
-#define DEBUG_TYPE "orc-remote"
-
-namespace llvm {
-namespace orc {
-namespace remote {
-
-template <typename ChannelT, typename TargetT>
-class OrcRemoteTargetServer
- : public shared::SingleThreadedRPCEndpoint<shared::RawByteChannel> {
-public:
- using SymbolLookupFtor =
- std::function<JITTargetAddress(const std::string &Name)>;
-
- using EHFrameRegistrationFtor =
- std::function<void(uint8_t *Addr, uint32_t Size)>;
-
- OrcRemoteTargetServer(ChannelT &Channel, SymbolLookupFtor SymbolLookup,
- EHFrameRegistrationFtor EHFramesRegister,
- EHFrameRegistrationFtor EHFramesDeregister)
- : shared::SingleThreadedRPCEndpoint<shared::RawByteChannel>(Channel,
- true),
- SymbolLookup(std::move(SymbolLookup)),
- EHFramesRegister(std::move(EHFramesRegister)),
- EHFramesDeregister(std::move(EHFramesDeregister)) {
- using ThisT = std::remove_reference_t<decltype(*this)>;
- addHandler<exec::CallIntVoid>(*this, &ThisT::handleCallIntVoid);
- addHandler<exec::CallIntInt>(*this, &ThisT::handleCallIntInt);
- addHandler<exec::CallMain>(*this, &ThisT::handleCallMain);
- addHandler<exec::CallVoidVoid>(*this, &ThisT::handleCallVoidVoid);
- addHandler<mem::CreateRemoteAllocator>(*this,
- &ThisT::handleCreateRemoteAllocator);
- addHandler<mem::DestroyRemoteAllocator>(
- *this, &ThisT::handleDestroyRemoteAllocator);
- addHandler<mem::ReadMem>(*this, &ThisT::handleReadMem);
- addHandler<mem::ReserveMem>(*this, &ThisT::handleReserveMem);
- addHandler<mem::SetProtections>(*this, &ThisT::handleSetProtections);
- addHandler<mem::WriteMem>(*this, &ThisT::handleWriteMem);
- addHandler<mem::WritePtr>(*this, &ThisT::handleWritePtr);
- addHandler<eh::RegisterEHFrames>(*this, &ThisT::handleRegisterEHFrames);
- addHandler<eh::DeregisterEHFrames>(*this, &ThisT::handleDeregisterEHFrames);
- addHandler<stubs::CreateIndirectStubsOwner>(
- *this, &ThisT::handleCreateIndirectStubsOwner);
- addHandler<stubs::DestroyIndirectStubsOwner>(
- *this, &ThisT::handleDestroyIndirectStubsOwner);
- addHandler<stubs::EmitIndirectStubs>(*this,
- &ThisT::handleEmitIndirectStubs);
- addHandler<stubs::EmitResolverBlock>(*this,
- &ThisT::handleEmitResolverBlock);
- addHandler<stubs::EmitTrampolineBlock>(*this,
- &ThisT::handleEmitTrampolineBlock);
- addHandler<utils::GetSymbolAddress>(*this, &ThisT::handleGetSymbolAddress);
- addHandler<utils::GetRemoteInfo>(*this, &ThisT::handleGetRemoteInfo);
- addHandler<utils::TerminateSession>(*this, &ThisT::handleTerminateSession);
- }
-
- // FIXME: Remove move/copy ops once MSVC supports synthesizing move ops.
- OrcRemoteTargetServer(const OrcRemoteTargetServer &) = delete;
- OrcRemoteTargetServer &operator=(const OrcRemoteTargetServer &) = delete;
-
- OrcRemoteTargetServer(OrcRemoteTargetServer &&Other) = default;
- OrcRemoteTargetServer &operator=(OrcRemoteTargetServer &&) = delete;
-
- Expected<JITTargetAddress> requestCompile(JITTargetAddress TrampolineAddr) {
- return callB<utils::RequestCompile>(TrampolineAddr);
- }
-
- bool receivedTerminate() const { return TerminateFlag; }
-
-private:
- struct Allocator {
- Allocator() = default;
- Allocator(Allocator &&Other) : Allocs(std::move(Other.Allocs)) {}
-
- Allocator &operator=(Allocator &&Other) {
- Allocs = std::move(Other.Allocs);
- return *this;
- }
-
- ~Allocator() {
- for (auto &Alloc : Allocs)
- sys::Memory::releaseMappedMemory(Alloc.second);
- }
-
- Error allocate(void *&Addr, size_t Size, uint32_t Align) {
- std::error_code EC;
- sys::MemoryBlock MB = sys::Memory::allocateMappedMemory(
- Size, nullptr, sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC);
- if (EC)
- return errorCodeToError(EC);
-
- Addr = MB.base();
- assert(Allocs.find(MB.base()) == Allocs.end() && "Duplicate alloc");
- Allocs[MB.base()] = std::move(MB);
- return Error::success();
- }
-
- Error setProtections(void *block, unsigned Flags) {
- auto I = Allocs.find(block);
- if (I == Allocs.end())
-        return errorCodeToError(
-            orcError(OrcErrorCode::RemoteMProtectAddrUnrecognized));
- return errorCodeToError(
- sys::Memory::protectMappedMemory(I->second, Flags));
- }
-
- private:
- std::map<void *, sys::MemoryBlock> Allocs;
- };
-
- static Error doNothing() { return Error::success(); }
-
- static JITTargetAddress reenter(void *JITTargetAddr, void *TrampolineAddr) {
- auto T = static_cast<OrcRemoteTargetServer *>(JITTargetAddr);
- auto AddrOrErr = T->requestCompile(static_cast<JITTargetAddress>(
- reinterpret_cast<uintptr_t>(TrampolineAddr)));
- // FIXME: Allow customizable failure substitution functions.
- assert(AddrOrErr && "Compile request failed");
- return *AddrOrErr;
- }
-
- Expected<int32_t> handleCallIntVoid(JITTargetAddress Addr) {
- using IntVoidFnTy = int (*)();
-
- IntVoidFnTy Fn =
- reinterpret_cast<IntVoidFnTy>(static_cast<uintptr_t>(Addr));
-
- LLVM_DEBUG(dbgs() << " Calling " << format("0x%016x", Addr) << "\n");
- int Result = Fn();
- LLVM_DEBUG(dbgs() << " Result = " << Result << "\n");
-
- return Result;
- }
-
- Expected<int32_t> handleCallIntInt(JITTargetAddress Addr, int Arg) {
- using IntIntFnTy = int (*)(int);
-
- IntIntFnTy Fn = reinterpret_cast<IntIntFnTy>(static_cast<uintptr_t>(Addr));
-
- LLVM_DEBUG(dbgs() << " Calling " << format("0x%016x", Addr)
- << " with argument " << Arg << "\n");
- int Result = Fn(Arg);
- LLVM_DEBUG(dbgs() << " Result = " << Result << "\n");
-
- return Result;
- }
-
- Expected<int32_t> handleCallMain(JITTargetAddress Addr,
- std::vector<std::string> Args) {
- using MainFnTy = int (*)(int, const char *[]);
-
- MainFnTy Fn = reinterpret_cast<MainFnTy>(static_cast<uintptr_t>(Addr));
- int ArgC = Args.size() + 1;
- int Idx = 1;
- std::unique_ptr<const char *[]> ArgV(new const char *[ArgC + 1]);
- ArgV[0] = "<jit process>";
- for (auto &Arg : Args)
- ArgV[Idx++] = Arg.c_str();
- ArgV[ArgC] = 0;
- LLVM_DEBUG(for (int Idx = 0; Idx < ArgC; ++Idx) {
- llvm::dbgs() << "Arg " << Idx << ": " << ArgV[Idx] << "\n";
- });
-
- LLVM_DEBUG(dbgs() << " Calling " << format("0x%016x", Addr) << "\n");
- int Result = Fn(ArgC, ArgV.get());
- LLVM_DEBUG(dbgs() << " Result = " << Result << "\n");
-
- return Result;
- }
-
- Error handleCallVoidVoid(JITTargetAddress Addr) {
- using VoidVoidFnTy = void (*)();
-
- VoidVoidFnTy Fn =
- reinterpret_cast<VoidVoidFnTy>(static_cast<uintptr_t>(Addr));
-
- LLVM_DEBUG(dbgs() << " Calling " << format("0x%016x", Addr) << "\n");
- Fn();
- LLVM_DEBUG(dbgs() << " Complete.\n");
-
- return Error::success();
- }
-
- Error handleCreateRemoteAllocator(ResourceIdMgr::ResourceId Id) {
- auto I = Allocators.find(Id);
- if (I != Allocators.end())
- return errorCodeToError(
- orcError(OrcErrorCode::RemoteAllocatorIdAlreadyInUse));
- LLVM_DEBUG(dbgs() << " Created allocator " << Id << "\n");
- Allocators[Id] = Allocator();
- return Error::success();
- }
-
- Error handleCreateIndirectStubsOwner(ResourceIdMgr::ResourceId Id) {
- auto I = IndirectStubsOwners.find(Id);
- if (I != IndirectStubsOwners.end())
- return errorCodeToError(
- orcError(OrcErrorCode::RemoteIndirectStubsOwnerIdAlreadyInUse));
- LLVM_DEBUG(dbgs() << " Create indirect stubs owner " << Id << "\n");
- IndirectStubsOwners[Id] = ISBlockOwnerList();
- return Error::success();
- }
-
- Error handleDeregisterEHFrames(JITTargetAddress TAddr, uint32_t Size) {
- uint8_t *Addr = reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(TAddr));
-    LLVM_DEBUG(dbgs() << "  Deregistering EH frames at "
- << format("0x%016x", TAddr) << ", Size = " << Size
- << " bytes\n");
- EHFramesDeregister(Addr, Size);
- return Error::success();
- }
-
- Error handleDestroyRemoteAllocator(ResourceIdMgr::ResourceId Id) {
- auto I = Allocators.find(Id);
- if (I == Allocators.end())
- return errorCodeToError(
- orcError(OrcErrorCode::RemoteAllocatorDoesNotExist));
- Allocators.erase(I);
- LLVM_DEBUG(dbgs() << " Destroyed allocator " << Id << "\n");
- return Error::success();
- }
-
- Error handleDestroyIndirectStubsOwner(ResourceIdMgr::ResourceId Id) {
- auto I = IndirectStubsOwners.find(Id);
- if (I == IndirectStubsOwners.end())
- return errorCodeToError(
- orcError(OrcErrorCode::RemoteIndirectStubsOwnerDoesNotExist));
- IndirectStubsOwners.erase(I);
- return Error::success();
- }
-
- Expected<std::tuple<JITTargetAddress, JITTargetAddress, uint32_t>>
- handleEmitIndirectStubs(ResourceIdMgr::ResourceId Id,
- uint32_t NumStubsRequired) {
- LLVM_DEBUG(dbgs() << " ISMgr " << Id << " request " << NumStubsRequired
- << " stubs.\n");
-
- auto StubOwnerItr = IndirectStubsOwners.find(Id);
- if (StubOwnerItr == IndirectStubsOwners.end())
- return errorCodeToError(
- orcError(OrcErrorCode::RemoteIndirectStubsOwnerDoesNotExist));
-
- auto IS = LocalIndirectStubsInfo<TargetT>::create(
- NumStubsRequired, sys::Process::getPageSizeEstimate());
- if (!IS)
- return IS.takeError();
-
- JITTargetAddress StubsBase = pointerToJITTargetAddress(IS->getStub(0));
- JITTargetAddress PtrsBase = pointerToJITTargetAddress(IS->getPtr(0));
- uint32_t NumStubsEmitted = IS->getNumStubs();
-
- auto &BlockList = StubOwnerItr->second;
- BlockList.push_back(std::move(*IS));
-
- return std::make_tuple(StubsBase, PtrsBase, NumStubsEmitted);
- }
-
- Error handleEmitResolverBlock() {
- std::error_code EC;
- ResolverBlock = sys::OwningMemoryBlock(sys::Memory::allocateMappedMemory(
- TargetT::ResolverCodeSize, nullptr,
- sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC));
- if (EC)
- return errorCodeToError(EC);
-
- TargetT::writeResolverCode(static_cast<char *>(ResolverBlock.base()),
- pointerToJITTargetAddress(ResolverBlock.base()),
- pointerToJITTargetAddress(&reenter),
- pointerToJITTargetAddress(this));
-
- return errorCodeToError(sys::Memory::protectMappedMemory(
- ResolverBlock.getMemoryBlock(),
- sys::Memory::MF_READ | sys::Memory::MF_EXEC));
- }
-
- Expected<std::tuple<JITTargetAddress, uint32_t>> handleEmitTrampolineBlock() {
- std::error_code EC;
- auto TrampolineBlock =
- sys::OwningMemoryBlock(sys::Memory::allocateMappedMemory(
- sys::Process::getPageSizeEstimate(), nullptr,
- sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC));
- if (EC)
- return errorCodeToError(EC);
-
- uint32_t NumTrampolines =
- (sys::Process::getPageSizeEstimate() - TargetT::PointerSize) /
- TargetT::TrampolineSize;
-
- char *TrampolineMem = static_cast<char *>(TrampolineBlock.base());
- TargetT::writeTrampolines(
- TrampolineMem, pointerToJITTargetAddress(TrampolineMem),
- pointerToJITTargetAddress(ResolverBlock.base()), NumTrampolines);
-
- EC = sys::Memory::protectMappedMemory(TrampolineBlock.getMemoryBlock(),
- sys::Memory::MF_READ |
- sys::Memory::MF_EXEC);
-
- TrampolineBlocks.push_back(std::move(TrampolineBlock));
-
- return std::make_tuple(pointerToJITTargetAddress(TrampolineMem),
- NumTrampolines);
- }
-
- Expected<JITTargetAddress> handleGetSymbolAddress(const std::string &Name) {
- JITTargetAddress Addr = SymbolLookup(Name);
- LLVM_DEBUG(dbgs() << " Symbol '" << Name
- << "' = " << format("0x%016x", Addr) << "\n");
- return Addr;
- }
-
- Expected<std::tuple<std::string, uint32_t, uint32_t, uint32_t, uint32_t>>
- handleGetRemoteInfo() {
- std::string ProcessTriple = sys::getProcessTriple();
- uint32_t PointerSize = TargetT::PointerSize;
- uint32_t PageSize = sys::Process::getPageSizeEstimate();
- uint32_t TrampolineSize = TargetT::TrampolineSize;
- uint32_t IndirectStubSize = TargetT::StubSize;
- LLVM_DEBUG(dbgs() << " Remote info:\n"
- << " triple = '" << ProcessTriple << "'\n"
- << " pointer size = " << PointerSize << "\n"
- << " page size = " << PageSize << "\n"
- << " trampoline size = " << TrampolineSize << "\n"
- << " indirect stub size = " << IndirectStubSize
- << "\n");
- return std::make_tuple(ProcessTriple, PointerSize, PageSize, TrampolineSize,
- IndirectStubSize);
- }
-
- Expected<std::vector<uint8_t>> handleReadMem(JITTargetAddress RSrc,
- uint64_t Size) {
- uint8_t *Src = reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(RSrc));
-
- LLVM_DEBUG(dbgs() << " Reading " << Size << " bytes from "
- << format("0x%016x", RSrc) << "\n");
-
- std::vector<uint8_t> Buffer;
- Buffer.resize(Size);
- for (uint8_t *P = Src; Size != 0; --Size)
- Buffer.push_back(*P++);
-
- return Buffer;
- }
-
- Error handleRegisterEHFrames(JITTargetAddress TAddr, uint32_t Size) {
- uint8_t *Addr = reinterpret_cast<uint8_t *>(static_cast<uintptr_t>(TAddr));
- LLVM_DEBUG(dbgs() << " Registering EH frames at "
- << format("0x%016x", TAddr) << ", Size = " << Size
- << " bytes\n");
- EHFramesRegister(Addr, Size);
- return Error::success();
- }
-
- Expected<JITTargetAddress> handleReserveMem(ResourceIdMgr::ResourceId Id,
- uint64_t Size, uint32_t Align) {
- auto I = Allocators.find(Id);
- if (I == Allocators.end())
- return errorCodeToError(
- orcError(OrcErrorCode::RemoteAllocatorDoesNotExist));
- auto &Allocator = I->second;
- void *LocalAllocAddr = nullptr;
- if (auto Err = Allocator.allocate(LocalAllocAddr, Size, Align))
- return std::move(Err);
-
- LLVM_DEBUG(dbgs() << " Allocator " << Id << " reserved " << LocalAllocAddr
- << " (" << Size << " bytes, alignment " << Align
- << ")\n");
-
- JITTargetAddress AllocAddr = static_cast<JITTargetAddress>(
- reinterpret_cast<uintptr_t>(LocalAllocAddr));
-
- return AllocAddr;
- }
-
- Error handleSetProtections(ResourceIdMgr::ResourceId Id,
- JITTargetAddress Addr, uint32_t Flags) {
- auto I = Allocators.find(Id);
- if (I == Allocators.end())
- return errorCodeToError(
- orcError(OrcErrorCode::RemoteAllocatorDoesNotExist));
- auto &Allocator = I->second;
- void *LocalAddr = reinterpret_cast<void *>(static_cast<uintptr_t>(Addr));
- LLVM_DEBUG(dbgs() << " Allocator " << Id << " set permissions on "
- << LocalAddr << " to "
- << (Flags & sys::Memory::MF_READ ? 'R' : '-')
- << (Flags & sys::Memory::MF_WRITE ? 'W' : '-')
- << (Flags & sys::Memory::MF_EXEC ? 'X' : '-') << "\n");
- return Allocator.setProtections(LocalAddr, Flags);
- }
-
- Error handleTerminateSession() {
- TerminateFlag = true;
- return Error::success();
- }
-
- Error handleWriteMem(DirectBufferWriter DBW) {
- LLVM_DEBUG(dbgs() << " Writing " << DBW.getSize() << " bytes to "
- << format("0x%016x", DBW.getDst()) << "\n");
- return Error::success();
- }
-
- Error handleWritePtr(JITTargetAddress Addr, JITTargetAddress PtrVal) {
- LLVM_DEBUG(dbgs() << " Writing pointer *" << format("0x%016x", Addr)
- << " = " << format("0x%016x", PtrVal) << "\n");
- uintptr_t *Ptr =
- reinterpret_cast<uintptr_t *>(static_cast<uintptr_t>(Addr));
- *Ptr = static_cast<uintptr_t>(PtrVal);
- return Error::success();
- }
-
- SymbolLookupFtor SymbolLookup;
- EHFrameRegistrationFtor EHFramesRegister, EHFramesDeregister;
- std::map<ResourceIdMgr::ResourceId, Allocator> Allocators;
- using ISBlockOwnerList = std::vector<LocalIndirectStubsInfo<TargetT>>;
- std::map<ResourceIdMgr::ResourceId, ISBlockOwnerList> IndirectStubsOwners;
- sys::OwningMemoryBlock ResolverBlock;
- std::vector<sys::OwningMemoryBlock> TrampolineBlocks;
- bool TerminateFlag = false;
-};
-
-} // end namespace remote
-} // end namespace orc
-} // end namespace llvm
-
-#undef DEBUG_TYPE
-
-#endif // LLVM_EXECUTIONENGINE_ORC_ORCREMOTETARGETSERVER_H
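
For orientation (again, not part of the patch): the server deleted above binds each RPC function to a handleXXX member in its constructor and exposes receivedTerminate() so a host program can run a simple dispatch loop. A sketch of such a loop, assuming the FDRawByteChannel and RPC endpoint removed elsewhere in this patch (handleOne() is the endpoint's receive-and-dispatch primitive) and using OrcX86_64_SysV as a stand-in ABI support class; serve() is a made-up name and error handling is minimal:

// Sketch only.
#include "llvm/ExecutionEngine/Orc/OrcABISupport.h"
#include "llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h"
#include "llvm/ExecutionEngine/Orc/Shared/FDRawByteChannel.h"

llvm::Error serve(int InFD, int OutFD) {
  using namespace llvm;
  using namespace llvm::orc;

  shared::FDRawByteChannel Channel(InFD, OutFD);
  remote::OrcRemoteTargetServer<shared::FDRawByteChannel, OrcX86_64_SysV>
      Server(
          Channel,
          [](const std::string &Name) -> JITTargetAddress {
            return 0; // Executor-side symbol lookup would go here.
          },
          [](uint8_t *Addr, uint32_t Size) { /* register EH frames */ },
          [](uint8_t *Addr, uint32_t Size) { /* deregister EH frames */ });

  // Serve requests until the client sends TerminateSession.
  while (!Server.receivedTerminate())
    if (auto Err = Server.handleOne())
      return Err;
  return Error::success();
}
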
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h
index 78a6623d7594..3c0b2b9edd52 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h
@@ -34,25 +34,26 @@ private:
};
/// Represents an address in the executor process.
-class ExecutorAddress {
+class ExecutorAddr {
public:
- ExecutorAddress() = default;
- explicit ExecutorAddress(uint64_t Addr) : Addr(Addr) {}
+ ExecutorAddr() = default;
- /// Create an ExecutorAddress from the given pointer.
+ /// Create an ExecutorAddr from the given value.
+ explicit ExecutorAddr(uint64_t Addr) : Addr(Addr) {}
+
+ /// Create an ExecutorAddr from the given pointer.
/// Warning: This should only be used when JITing in-process.
- template <typename T> static ExecutorAddress fromPtr(T *Value) {
- return ExecutorAddress(
+ template <typename T> static ExecutorAddr fromPtr(T *Value) {
+ return ExecutorAddr(
static_cast<uint64_t>(reinterpret_cast<uintptr_t>(Value)));
}
- /// Cast this ExecutorAddress to a pointer of the given type.
- /// Warning: This should only be esude when JITing in-process.
+ /// Cast this ExecutorAddr to a pointer of the given type.
+ /// Warning: This should only be used when JITing in-process.
template <typename T> T toPtr() const {
static_assert(std::is_pointer<T>::value, "T must be a pointer type");
uintptr_t IntPtr = static_cast<uintptr_t>(Addr);
- assert(IntPtr == Addr &&
- "JITTargetAddress value out of range for uintptr_t");
+ assert(IntPtr == Addr && "ExecutorAddr value out of range for uintptr_t");
return reinterpret_cast<T>(IntPtr);
}
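
Aside (not part of the patch): the rename to ExecutorAddr keeps the pointer-bridging helpers shown above. A tiny illustrative round trip, only valid when JITing in-process as the warning in the header notes; roundTripExample is a made-up name:

// Illustrative only.
#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
#include <cstdint>

void roundTripExample() {
  int X = 42;
  auto A = llvm::orc::ExecutorAddr::fromPtr(&X); // wrap &X as a 64-bit address
  int *P = A.toPtr<int *>();   // recover the pointer; asserts if the value
                               // does not fit in uintptr_t
  uint64_t Raw = A.getValue(); // raw 64-bit representation for the wire
  (void)P;
  (void)Raw;
}
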
@@ -62,53 +63,47 @@ public:
explicit operator bool() const { return Addr != 0; }
- friend bool operator==(const ExecutorAddress &LHS,
- const ExecutorAddress &RHS) {
+ friend bool operator==(const ExecutorAddr &LHS, const ExecutorAddr &RHS) {
return LHS.Addr == RHS.Addr;
}
- friend bool operator!=(const ExecutorAddress &LHS,
- const ExecutorAddress &RHS) {
+ friend bool operator!=(const ExecutorAddr &LHS, const ExecutorAddr &RHS) {
return LHS.Addr != RHS.Addr;
}
- friend bool operator<(const ExecutorAddress &LHS,
- const ExecutorAddress &RHS) {
+ friend bool operator<(const ExecutorAddr &LHS, const ExecutorAddr &RHS) {
return LHS.Addr < RHS.Addr;
}
- friend bool operator<=(const ExecutorAddress &LHS,
- const ExecutorAddress &RHS) {
+ friend bool operator<=(const ExecutorAddr &LHS, const ExecutorAddr &RHS) {
return LHS.Addr <= RHS.Addr;
}
- friend bool operator>(const ExecutorAddress &LHS,
- const ExecutorAddress &RHS) {
+ friend bool operator>(const ExecutorAddr &LHS, const ExecutorAddr &RHS) {
return LHS.Addr > RHS.Addr;
}
- friend bool operator>=(const ExecutorAddress &LHS,
- const ExecutorAddress &RHS) {
+ friend bool operator>=(const ExecutorAddr &LHS, const ExecutorAddr &RHS) {
return LHS.Addr >= RHS.Addr;
}
- ExecutorAddress &operator++() {
+ ExecutorAddr &operator++() {
++Addr;
return *this;
}
- ExecutorAddress &operator--() {
+ ExecutorAddr &operator--() {
--Addr;
return *this;
}
- ExecutorAddress operator++(int) { return ExecutorAddress(Addr++); }
- ExecutorAddress operator--(int) { return ExecutorAddress(Addr++); }
+ ExecutorAddr operator++(int) { return ExecutorAddr(Addr++); }
+ ExecutorAddr operator--(int) { return ExecutorAddr(Addr--); }
- ExecutorAddress &operator+=(const ExecutorAddrDiff Delta) {
+ ExecutorAddr &operator+=(const ExecutorAddrDiff Delta) {
Addr += Delta.getValue();
return *this;
}
- ExecutorAddress &operator-=(const ExecutorAddrDiff Delta) {
+ ExecutorAddr &operator-=(const ExecutorAddrDiff Delta) {
Addr -= Delta.getValue();
return *this;
}
@@ -118,83 +113,98 @@ private:
};
/// Subtracting two addresses yields an offset.
-inline ExecutorAddrDiff operator-(const ExecutorAddress &LHS,
- const ExecutorAddress &RHS) {
+inline ExecutorAddrDiff operator-(const ExecutorAddr &LHS,
+ const ExecutorAddr &RHS) {
return ExecutorAddrDiff(LHS.getValue() - RHS.getValue());
}
/// Adding an offset and an address yields an address.
-inline ExecutorAddress operator+(const ExecutorAddress &LHS,
- const ExecutorAddrDiff &RHS) {
- return ExecutorAddress(LHS.getValue() + RHS.getValue());
+inline ExecutorAddr operator+(const ExecutorAddr &LHS,
+ const ExecutorAddrDiff &RHS) {
+ return ExecutorAddr(LHS.getValue() + RHS.getValue());
}
/// Adding an address and an offset yields an address.
-inline ExecutorAddress operator+(const ExecutorAddrDiff &LHS,
- const ExecutorAddress &RHS) {
- return ExecutorAddress(LHS.getValue() + RHS.getValue());
+inline ExecutorAddr operator+(const ExecutorAddrDiff &LHS,
+ const ExecutorAddr &RHS) {
+ return ExecutorAddr(LHS.getValue() + RHS.getValue());
}
/// Represents an address range in the executor process.
-struct ExecutorAddressRange {
- ExecutorAddressRange() = default;
- ExecutorAddressRange(ExecutorAddress StartAddress, ExecutorAddress EndAddress)
- : StartAddress(StartAddress), EndAddress(EndAddress) {}
+struct ExecutorAddrRange {
+ ExecutorAddrRange() = default;
+ ExecutorAddrRange(ExecutorAddr Start, ExecutorAddr End)
+ : Start(Start), End(End) {}
+ ExecutorAddrRange(ExecutorAddr Start, ExecutorAddrDiff Size)
+ : Start(Start), End(Start + Size) {}
- bool empty() const { return StartAddress == EndAddress; }
- ExecutorAddrDiff size() const { return EndAddress - StartAddress; }
+ bool empty() const { return Start == End; }
+ ExecutorAddrDiff size() const { return End - Start; }
- ExecutorAddress StartAddress;
- ExecutorAddress EndAddress;
+ friend bool operator==(const ExecutorAddrRange &LHS,
+ const ExecutorAddrRange &RHS) {
+ return LHS.Start == RHS.Start && LHS.End == RHS.End;
+ }
+ friend bool operator!=(const ExecutorAddrRange &LHS,
+ const ExecutorAddrRange &RHS) {
+ return !(LHS == RHS);
+ }
+ bool contains(ExecutorAddr Addr) const { return Start <= Addr && Addr < End; }
+  bool overlaps(const ExecutorAddrRange &Other) const {
+ return !(Other.End <= Start || End <= Other.Start);
+ }
+
+ ExecutorAddr Start;
+ ExecutorAddr End;
};
namespace shared {
-/// SPS serializatior for ExecutorAddress.
-template <> class SPSSerializationTraits<SPSExecutorAddress, ExecutorAddress> {
+class SPSExecutorAddr {};
+
+/// SPS serialization traits for ExecutorAddr.
+template <> class SPSSerializationTraits<SPSExecutorAddr, ExecutorAddr> {
public:
- static size_t size(const ExecutorAddress &EA) {
+ static size_t size(const ExecutorAddr &EA) {
return SPSArgList<uint64_t>::size(EA.getValue());
}
- static bool serialize(SPSOutputBuffer &BOB, const ExecutorAddress &EA) {
+ static bool serialize(SPSOutputBuffer &BOB, const ExecutorAddr &EA) {
return SPSArgList<uint64_t>::serialize(BOB, EA.getValue());
}
- static bool deserialize(SPSInputBuffer &BIB, ExecutorAddress &EA) {
+ static bool deserialize(SPSInputBuffer &BIB, ExecutorAddr &EA) {
uint64_t Tmp;
if (!SPSArgList<uint64_t>::deserialize(BIB, Tmp))
return false;
- EA = ExecutorAddress(Tmp);
+ EA = ExecutorAddr(Tmp);
return true;
}
};
-using SPSExecutorAddressRange =
- SPSTuple<SPSExecutorAddress, SPSExecutorAddress>;
+using SPSExecutorAddrRange = SPSTuple<SPSExecutorAddr, SPSExecutorAddr>;
/// Serialization traits for address ranges.
template <>
-class SPSSerializationTraits<SPSExecutorAddressRange, ExecutorAddressRange> {
+class SPSSerializationTraits<SPSExecutorAddrRange, ExecutorAddrRange> {
public:
- static size_t size(const ExecutorAddressRange &Value) {
- return SPSArgList<SPSExecutorAddress, SPSExecutorAddress>::size(
- Value.StartAddress, Value.EndAddress);
+ static size_t size(const ExecutorAddrRange &Value) {
+ return SPSArgList<SPSExecutorAddr, SPSExecutorAddr>::size(Value.Start,
+ Value.End);
}
- static bool serialize(SPSOutputBuffer &BOB,
- const ExecutorAddressRange &Value) {
- return SPSArgList<SPSExecutorAddress, SPSExecutorAddress>::serialize(
- BOB, Value.StartAddress, Value.EndAddress);
+ static bool serialize(SPSOutputBuffer &BOB, const ExecutorAddrRange &Value) {
+ return SPSArgList<SPSExecutorAddr, SPSExecutorAddr>::serialize(
+ BOB, Value.Start, Value.End);
}
- static bool deserialize(SPSInputBuffer &BIB, ExecutorAddressRange &Value) {
- return SPSArgList<SPSExecutorAddress, SPSExecutorAddress>::deserialize(
- BIB, Value.StartAddress, Value.EndAddress);
+ static bool deserialize(SPSInputBuffer &BIB, ExecutorAddrRange &Value) {
+ return SPSArgList<SPSExecutorAddr, SPSExecutorAddr>::deserialize(
+ BIB, Value.Start, Value.End);
}
};
-using SPSExecutorAddressRangeSequence = SPSSequence<SPSExecutorAddressRange>;
+using SPSExecutorAddrRangeSequence = SPSSequence<SPSExecutorAddrRange>;
} // End namespace shared.
} // End namespace orc.
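
Aside (not part of the patch): besides the rename, the hunks above give ExecutorAddrRange a size-based constructor plus contains() and overlaps() queries. A small illustrative use, with arbitrary addresses and a made-up function name:

// Illustrative only.
#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
#include <cstdint>

void rangeExample() {
  using namespace llvm::orc;

  ExecutorAddrRange Text(ExecutorAddr(0x1000), ExecutorAddrDiff(0x800));
  ExecutorAddrRange Data(ExecutorAddr(0x1400), ExecutorAddr(0x2000));

  bool InText = Text.contains(ExecutorAddr(0x17ff)); // true: 0x1000 <= a < 0x1800
  bool Clash = Text.overlaps(Data);                  // true: [0x1400, 0x1800) is shared
  uint64_t Bytes = Text.size().getValue();           // 0x800
  (void)InText;
  (void)Clash;
  (void)Bytes;
}
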
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/FDRawByteChannel.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/FDRawByteChannel.h
deleted file mode 100644
index 3f96fe3da49d..000000000000
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/FDRawByteChannel.h
+++ /dev/null
@@ -1,79 +0,0 @@
-//===- FDRawByteChannel.h - File descriptor based byte-channel -*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// File descriptor based RawByteChannel.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_FDRAWBYTECHANNEL_H
-#define LLVM_EXECUTIONENGINE_ORC_SHARED_FDRAWBYTECHANNEL_H
-
-#include "llvm/ExecutionEngine/Orc/Shared/RawByteChannel.h"
-
-#if !defined(_MSC_VER) && !defined(__MINGW32__)
-#include <unistd.h>
-#else
-#include <io.h>
-#endif
-
-namespace llvm {
-namespace orc {
-namespace shared {
-
-/// Serialization channel that reads from and writes to file descriptors.
-class FDRawByteChannel final : public RawByteChannel {
-public:
- FDRawByteChannel(int InFD, int OutFD) : InFD(InFD), OutFD(OutFD) {}
-
- llvm::Error readBytes(char *Dst, unsigned Size) override {
- assert(Dst && "Attempt to read into null.");
- ssize_t Completed = 0;
- while (Completed < static_cast<ssize_t>(Size)) {
- ssize_t Read = ::read(InFD, Dst + Completed, Size - Completed);
- if (Read <= 0) {
- auto ErrNo = errno;
- if (ErrNo == EAGAIN || ErrNo == EINTR)
- continue;
- else
- return llvm::errorCodeToError(
- std::error_code(errno, std::generic_category()));
- }
- Completed += Read;
- }
- return llvm::Error::success();
- }
-
- llvm::Error appendBytes(const char *Src, unsigned Size) override {
- assert(Src && "Attempt to append from null.");
- ssize_t Completed = 0;
- while (Completed < static_cast<ssize_t>(Size)) {
- ssize_t Written = ::write(OutFD, Src + Completed, Size - Completed);
- if (Written < 0) {
- auto ErrNo = errno;
- if (ErrNo == EAGAIN || ErrNo == EINTR)
- continue;
- else
- return llvm::errorCodeToError(
- std::error_code(errno, std::generic_category()));
- }
- Completed += Written;
- }
- return llvm::Error::success();
- }
-
- llvm::Error send() override { return llvm::Error::success(); }
-
-private:
- int InFD, OutFD;
-};
-
-} // namespace shared
-} // namespace orc
-} // namespace llvm
-
-#endif // LLVM_EXECUTIONENGINE_ORC_SHARED_FDRAWBYTECHANNEL_H
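
Aside (not part of the patch): the essential behaviour of the deleted channel is its retry discipline, loop until the full byte count has been transferred, retry on EAGAIN/EINTR, and surface any other errno as an llvm::Error. A standalone sketch of the read side (POSIX only; readAll is a made-up name, and unlike the original it treats end-of-file as an error rather than retrying):

// Sketch of the retry loop the deleted readBytes() implemented (POSIX only).
#include "llvm/Support/Error.h"
#include <cerrno>
#include <system_error>
#include <unistd.h>

static llvm::Error readAll(int FD, char *Dst, unsigned Size) {
  ssize_t Completed = 0;
  while (Completed < static_cast<ssize_t>(Size)) {
    ssize_t Read = ::read(FD, Dst + Completed, Size - Completed);
    if (Read == 0) // end of file before Size bytes arrived
      return llvm::errorCodeToError(
          std::error_code(EPIPE, std::generic_category()));
    if (Read < 0) {
      int ErrNo = errno;
      if (ErrNo == EAGAIN || ErrNo == EINTR)
        continue; // transient; retry the read
      return llvm::errorCodeToError(
          std::error_code(ErrNo, std::generic_category()));
    }
    Completed += Read;
  }
  return llvm::Error::success();
}
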
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h
new file mode 100644
index 000000000000..3ef43f33d84c
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h
@@ -0,0 +1,68 @@
+//===---- OrcRTBridge.h -- Utils for interacting with orc-rt ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Declares types and symbol names provided by the ORC runtime.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_ORCRTBRIDGE_H
+#define LLVM_EXECUTIONENGINE_ORC_SHARED_ORCRTBRIDGE_H
+
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
+#include "llvm/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.h"
+#include "llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h"
+
+namespace llvm {
+namespace orc {
+namespace rt {
+
+extern const char *SimpleExecutorDylibManagerInstanceName;
+extern const char *SimpleExecutorDylibManagerOpenWrapperName;
+extern const char *SimpleExecutorDylibManagerLookupWrapperName;
+
+extern const char *SimpleExecutorMemoryManagerInstanceName;
+extern const char *SimpleExecutorMemoryManagerReserveWrapperName;
+extern const char *SimpleExecutorMemoryManagerFinalizeWrapperName;
+extern const char *SimpleExecutorMemoryManagerDeallocateWrapperName;
+
+extern const char *MemoryWriteUInt8sWrapperName;
+extern const char *MemoryWriteUInt16sWrapperName;
+extern const char *MemoryWriteUInt32sWrapperName;
+extern const char *MemoryWriteUInt64sWrapperName;
+extern const char *MemoryWriteBuffersWrapperName;
+
+extern const char *RegisterEHFrameSectionCustomDirectWrapperName;
+extern const char *DeregisterEHFrameSectionCustomDirectWrapperName;
+
+extern const char *RunAsMainWrapperName;
+
+using SPSSimpleExecutorDylibManagerOpenSignature =
+ shared::SPSExpected<uint64_t>(shared::SPSExecutorAddr, shared::SPSString,
+ uint64_t);
+
+using SPSSimpleExecutorDylibManagerLookupSignature =
+ shared::SPSExpected<shared::SPSSequence<shared::SPSExecutorAddr>>(
+ shared::SPSExecutorAddr, uint64_t, shared::SPSRemoteSymbolLookupSet);
+
+using SPSSimpleExecutorMemoryManagerReserveSignature =
+ shared::SPSExpected<shared::SPSExecutorAddr>(shared::SPSExecutorAddr,
+ uint64_t);
+using SPSSimpleExecutorMemoryManagerFinalizeSignature =
+ shared::SPSError(shared::SPSExecutorAddr, shared::SPSFinalizeRequest);
+using SPSSimpleExecutorMemoryManagerDeallocateSignature = shared::SPSError(
+ shared::SPSExecutorAddr, shared::SPSSequence<shared::SPSExecutorAddr>);
+
+using SPSRunAsMainSignature = int64_t(shared::SPSExecutorAddr,
+ shared::SPSSequence<shared::SPSString>);
+
+} // end namespace rt
+} // end namespace orc
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_SHARED_ORCRTBRIDGE_H
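
A note on the new header (not part of the patch): it only declares the names and SPS wire signatures that the controller and the executor-side orc-rt must agree on; the string values themselves live in the corresponding .cpp. One way a controller-side component might gather these bootstrap symbols is a simple name-to-address table, sketched below with placeholder (null) addresses and a made-up function name; how the real addresses are discovered is outside this header:

// Sketch only: keying executor addresses by the wrapper-function names
// declared above. The ExecutorAddr() values are placeholders.
#include "llvm/ADT/StringMap.h"
#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"

llvm::StringMap<llvm::orc::ExecutorAddr> collectBootstrapSymbols() {
  using namespace llvm::orc;

  llvm::StringMap<ExecutorAddr> Syms;
  Syms[rt::SimpleExecutorMemoryManagerInstanceName] = ExecutorAddr();
  Syms[rt::SimpleExecutorMemoryManagerReserveWrapperName] = ExecutorAddr();
  Syms[rt::SimpleExecutorMemoryManagerFinalizeWrapperName] = ExecutorAddr();
  Syms[rt::RunAsMainWrapperName] = ExecutorAddr();
  // A caller would then invoke, e.g., the reserve wrapper with arguments
  // serialized per SPSSimpleExecutorMemoryManagerReserveSignature.
  return Syms;
}
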
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/RPCUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/RPCUtils.h
deleted file mode 100644
index 1ff47ce42758..000000000000
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/RPCUtils.h
+++ /dev/null
@@ -1,1659 +0,0 @@
-//===- RPCUtils.h - Utilities for building RPC APIs -------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Utilities to support construction of simple RPC APIs.
-//
-// The RPC utilities aim for ease of use (minimal conceptual overhead) for C++
-// programmers, high performance, low memory overhead, and efficient use of the
-// communications channel.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_RPCUTILS_H
-#define LLVM_EXECUTIONENGINE_ORC_SHARED_RPCUTILS_H
-
-#include <map>
-#include <thread>
-#include <vector>
-
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ExecutionEngine/Orc/Shared/OrcError.h"
-#include "llvm/ExecutionEngine/Orc/Shared/Serialization.h"
-#include "llvm/Support/MSVCErrorWorkarounds.h"
-
-#include <future>
-
-namespace llvm {
-namespace orc {
-namespace shared {
-
-/// Base class of all fatal RPC errors (those that necessarily result in the
-/// termination of the RPC session).
-class RPCFatalError : public ErrorInfo<RPCFatalError> {
-public:
- static char ID;
-};
-
-/// ConnectionClosed is returned from RPC operations if the RPC connection
-/// has already been closed due to either an error or graceful disconnection.
-class ConnectionClosed : public ErrorInfo<ConnectionClosed> {
-public:
- static char ID;
- std::error_code convertToErrorCode() const override;
- void log(raw_ostream &OS) const override;
-};
-
-/// BadFunctionCall is returned from handleOne when the remote makes a call with
-/// an unrecognized function id.
-///
-/// This error is fatal because Orc RPC needs to know how to parse a function
-/// call to know where the next call starts, and if it doesn't recognize the
-/// function id it cannot parse the call.
-template <typename FnIdT, typename SeqNoT>
-class BadFunctionCall
- : public ErrorInfo<BadFunctionCall<FnIdT, SeqNoT>, RPCFatalError> {
-public:
- static char ID;
-
- BadFunctionCall(FnIdT FnId, SeqNoT SeqNo)
- : FnId(std::move(FnId)), SeqNo(std::move(SeqNo)) {}
-
- std::error_code convertToErrorCode() const override {
- return orcError(OrcErrorCode::UnexpectedRPCCall);
- }
-
- void log(raw_ostream &OS) const override {
- OS << "Call to invalid RPC function id '" << FnId
- << "' with "
- "sequence number "
- << SeqNo;
- }
-
-private:
- FnIdT FnId;
- SeqNoT SeqNo;
-};
-
-template <typename FnIdT, typename SeqNoT>
-char BadFunctionCall<FnIdT, SeqNoT>::ID = 0;
-
-/// InvalidSequenceNumberForResponse is returned from handleOne when a response
-/// call arrives with a sequence number that doesn't correspond to any in-flight
-/// function call.
-///
-/// This error is fatal because Orc RPC needs to know how to parse the rest of
-/// the response call to know where the next call starts, and if it doesn't have
-/// a result parser for this sequence number it can't do that.
-template <typename SeqNoT>
-class InvalidSequenceNumberForResponse
- : public ErrorInfo<InvalidSequenceNumberForResponse<SeqNoT>,
- RPCFatalError> {
-public:
- static char ID;
-
- InvalidSequenceNumberForResponse(SeqNoT SeqNo) : SeqNo(std::move(SeqNo)) {}
-
- std::error_code convertToErrorCode() const override {
- return orcError(OrcErrorCode::UnexpectedRPCCall);
- };
-
- void log(raw_ostream &OS) const override {
- OS << "Response has unknown sequence number " << SeqNo;
- }
-
-private:
- SeqNoT SeqNo;
-};
-
-template <typename SeqNoT>
-char InvalidSequenceNumberForResponse<SeqNoT>::ID = 0;
-
-/// This non-fatal error will be passed to asynchronous result handlers in place
-/// of a result if the connection goes down before a result returns, or if the
-/// function to be called cannot be negotiated with the remote.
-class ResponseAbandoned : public ErrorInfo<ResponseAbandoned> {
-public:
- static char ID;
-
- std::error_code convertToErrorCode() const override;
- void log(raw_ostream &OS) const override;
-};
-
-/// This error is returned if the remote does not have a handler installed for
-/// the given RPC function.
-class CouldNotNegotiate : public ErrorInfo<CouldNotNegotiate> {
-public:
- static char ID;
-
- CouldNotNegotiate(std::string Signature);
- std::error_code convertToErrorCode() const override;
- void log(raw_ostream &OS) const override;
- const std::string &getSignature() const { return Signature; }
-
-private:
- std::string Signature;
-};
-
-template <typename DerivedFunc, typename FnT> class RPCFunction;
-
-// RPC Function class.
-// DerivedFunc should be a user defined class with a static 'getName()' method
-// returning a const char* representing the function's name.
-template <typename DerivedFunc, typename RetT, typename... ArgTs>
-class RPCFunction<DerivedFunc, RetT(ArgTs...)> {
-public:
- /// User defined function type.
- using Type = RetT(ArgTs...);
-
- /// Return type.
- using ReturnType = RetT;
-
- /// Returns the full function prototype as a string.
- static const char *getPrototype() {
- static std::string Name = [] {
- std::string Name;
- raw_string_ostream(Name)
- << SerializationTypeName<RetT>::getName() << " "
- << DerivedFunc::getName() << "("
- << SerializationTypeNameSequence<ArgTs...>() << ")";
- return Name;
- }();
- return Name.data();
- }
-};
-
-/// Allocates RPC function ids during autonegotiation.
-/// Specializations of this class must provide four members:
-///
-/// static T getInvalidId():
-/// Should return a reserved id that will be used to represent missing
-/// functions during autonegotiation.
-///
-/// static T getResponseId():
-/// Should return a reserved id that will be used to send function responses
-/// (return values).
-///
-/// static T getNegotiateId():
-/// Should return a reserved id for the negotiate function, which will be used
-/// to negotiate ids for user defined functions.
-///
-/// template <typename Func> T allocate():
-/// Allocate a unique id for function Func.
-template <typename T, typename = void> class RPCFunctionIdAllocator;
-
-/// This specialization of RPCFunctionIdAllocator provides a default
-/// implementation for integral types.
-template <typename T>
-class RPCFunctionIdAllocator<T, std::enable_if_t<std::is_integral<T>::value>> {
-public:
- static T getInvalidId() { return T(0); }
- static T getResponseId() { return T(1); }
- static T getNegotiateId() { return T(2); }
-
- template <typename Func> T allocate() { return NextId++; }
-
-private:
- T NextId = 3;
-};
-
-namespace detail {
-
-/// Provides a typedef for a tuple containing the decayed argument types.
-template <typename T> class RPCFunctionArgsTuple;
-
-template <typename RetT, typename... ArgTs>
-class RPCFunctionArgsTuple<RetT(ArgTs...)> {
-public:
- using Type = std::tuple<std::decay_t<std::remove_reference_t<ArgTs>>...>;
-};
-
-// ResultTraits provides typedefs and utilities specific to the return type
-// of functions.
-template <typename RetT> class ResultTraits {
-public:
- // The return type wrapped in llvm::Expected.
- using ErrorReturnType = Expected<RetT>;
-
-#ifdef _MSC_VER
- // The ErrorReturnType wrapped in a std::promise.
- using ReturnPromiseType = std::promise<MSVCPExpected<RetT>>;
-
- // The ErrorReturnType wrapped in a std::future.
- using ReturnFutureType = std::future<MSVCPExpected<RetT>>;
-#else
- // The ErrorReturnType wrapped in a std::promise.
- using ReturnPromiseType = std::promise<ErrorReturnType>;
-
- // The ErrorReturnType wrapped in a std::future.
- using ReturnFutureType = std::future<ErrorReturnType>;
-#endif
-
- // Create a 'blank' value of the ErrorReturnType, ready and safe to
- // overwrite.
- static ErrorReturnType createBlankErrorReturnValue() {
- return ErrorReturnType(RetT());
- }
-
- // Consume an abandoned ErrorReturnType.
- static void consumeAbandoned(ErrorReturnType RetOrErr) {
- consumeError(RetOrErr.takeError());
- }
-
- static ErrorReturnType returnError(Error Err) { return std::move(Err); }
-};
-
-// ResultTraits specialization for void functions.
-template <> class ResultTraits<void> {
-public:
- // For void functions, ErrorReturnType is llvm::Error.
- using ErrorReturnType = Error;
-
-#ifdef _MSC_VER
- // The ErrorReturnType wrapped in a std::promise.
- using ReturnPromiseType = std::promise<MSVCPError>;
-
- // The ErrorReturnType wrapped in a std::future.
- using ReturnFutureType = std::future<MSVCPError>;
-#else
- // The ErrorReturnType wrapped in a std::promise.
- using ReturnPromiseType = std::promise<ErrorReturnType>;
-
- // The ErrorReturnType wrapped in a std::future.
- using ReturnFutureType = std::future<ErrorReturnType>;
-#endif
-
- // Create a 'blank' value of the ErrorReturnType, ready and safe to
- // overwrite.
- static ErrorReturnType createBlankErrorReturnValue() {
- return ErrorReturnType::success();
- }
-
- // Consume an abandoned ErrorReturnType.
- static void consumeAbandoned(ErrorReturnType Err) {
- consumeError(std::move(Err));
- }
-
- static ErrorReturnType returnError(Error Err) { return Err; }
-};
-
-// ResultTraits<Error> is equivalent to ResultTraits<void>. This allows
-// handlers for void RPC functions to return either void (in which case they
-// implicitly succeed) or Error (in which case their error return is
-// propagated). See usage in HandlerTraits::runHandlerHelper.
-template <> class ResultTraits<Error> : public ResultTraits<void> {};
-
-// ResultTraits<Expected<T>> is equivalent to ResultTraits<T>. This allows
-// handlers for RPC functions returning a T to return either a T (in which
-// case they implicitly succeed) or Expected<T> (in which case their error
-// return is propagated). See usage in HandlerTraits::runHandlerHelper.
-template <typename RetT>
-class ResultTraits<Expected<RetT>> : public ResultTraits<RetT> {};
-
-// Determines whether an RPC function's defined error return type supports
-// error return value.
-template <typename T> class SupportsErrorReturn {
-public:
- static const bool value = false;
-};
-
-template <> class SupportsErrorReturn<Error> {
-public:
- static const bool value = true;
-};
-
-template <typename T> class SupportsErrorReturn<Expected<T>> {
-public:
- static const bool value = true;
-};
-
-// RespondHelper packages return values based on whether or not the declared
-// RPC function return type supports error returns.
-template <bool FuncSupportsErrorReturn> class RespondHelper;
-
-// RespondHelper specialization for functions that support error returns.
-template <> class RespondHelper<true> {
-public:
- // Send Expected<T>.
- template <typename WireRetT, typename HandlerRetT, typename ChannelT,
- typename FunctionIdT, typename SequenceNumberT>
- static Error sendResult(ChannelT &C, const FunctionIdT &ResponseId,
- SequenceNumberT SeqNo,
- Expected<HandlerRetT> ResultOrErr) {
- if (!ResultOrErr && ResultOrErr.template errorIsA<RPCFatalError>())
- return ResultOrErr.takeError();
-
- // Open the response message.
- if (auto Err = C.startSendMessage(ResponseId, SeqNo))
- return Err;
-
- // Serialize the result.
- if (auto Err =
- SerializationTraits<ChannelT, WireRetT, Expected<HandlerRetT>>::
- serialize(C, std::move(ResultOrErr)))
- return Err;
-
- // Close the response message.
- if (auto Err = C.endSendMessage())
- return Err;
- return C.send();
- }
-
- template <typename ChannelT, typename FunctionIdT, typename SequenceNumberT>
- static Error sendResult(ChannelT &C, const FunctionIdT &ResponseId,
- SequenceNumberT SeqNo, Error Err) {
- if (Err && Err.isA<RPCFatalError>())
- return Err;
- if (auto Err2 = C.startSendMessage(ResponseId, SeqNo))
- return Err2;
- if (auto Err2 = serializeSeq(C, std::move(Err)))
- return Err2;
- if (auto Err2 = C.endSendMessage())
- return Err2;
- return C.send();
- }
-};
-
-// RespondHelper specialization for functions that do not support error returns.
-template <> class RespondHelper<false> {
-public:
- template <typename WireRetT, typename HandlerRetT, typename ChannelT,
- typename FunctionIdT, typename SequenceNumberT>
- static Error sendResult(ChannelT &C, const FunctionIdT &ResponseId,
- SequenceNumberT SeqNo,
- Expected<HandlerRetT> ResultOrErr) {
- if (auto Err = ResultOrErr.takeError())
- return Err;
-
- // Open the response message.
- if (auto Err = C.startSendMessage(ResponseId, SeqNo))
- return Err;
-
- // Serialize the result.
- if (auto Err =
- SerializationTraits<ChannelT, WireRetT, HandlerRetT>::serialize(
- C, *ResultOrErr))
- return Err;
-
- // End the response message.
- if (auto Err = C.endSendMessage())
- return Err;
-
- return C.send();
- }
-
- template <typename ChannelT, typename FunctionIdT, typename SequenceNumberT>
- static Error sendResult(ChannelT &C, const FunctionIdT &ResponseId,
- SequenceNumberT SeqNo, Error Err) {
- if (Err)
- return Err;
- if (auto Err2 = C.startSendMessage(ResponseId, SeqNo))
- return Err2;
- if (auto Err2 = C.endSendMessage())
- return Err2;
- return C.send();
- }
-};
-
-// Send a response of the given wire return type (WireRetT) over the
-// channel, with the given sequence number.
-template <typename WireRetT, typename HandlerRetT, typename ChannelT,
- typename FunctionIdT, typename SequenceNumberT>
-Error respond(ChannelT &C, const FunctionIdT &ResponseId, SequenceNumberT SeqNo,
- Expected<HandlerRetT> ResultOrErr) {
- return RespondHelper<SupportsErrorReturn<WireRetT>::value>::
- template sendResult<WireRetT>(C, ResponseId, SeqNo,
- std::move(ResultOrErr));
-}
-
-// Send an empty response message on the given channel to indicate that
-// the handler ran.
-template <typename WireRetT, typename ChannelT, typename FunctionIdT,
- typename SequenceNumberT>
-Error respond(ChannelT &C, const FunctionIdT &ResponseId, SequenceNumberT SeqNo,
- Error Err) {
- return RespondHelper<SupportsErrorReturn<WireRetT>::value>::sendResult(
- C, ResponseId, SeqNo, std::move(Err));
-}
-
-// Converts a given type to the equivalent error return type.
-template <typename T> class WrappedHandlerReturn {
-public:
- using Type = Expected<T>;
-};
-
-template <typename T> class WrappedHandlerReturn<Expected<T>> {
-public:
- using Type = Expected<T>;
-};
-
-template <> class WrappedHandlerReturn<void> {
-public:
- using Type = Error;
-};
-
-template <> class WrappedHandlerReturn<Error> {
-public:
- using Type = Error;
-};
-
-template <> class WrappedHandlerReturn<ErrorSuccess> {
-public:
- using Type = Error;
-};
-
-// Traits class that strips the response function from the list of handler
-// arguments.
-template <typename FnT> class AsyncHandlerTraits;
-
-template <typename ResultT, typename... ArgTs>
-class AsyncHandlerTraits<Error(std::function<Error(Expected<ResultT>)>,
- ArgTs...)> {
-public:
- using Type = Error(ArgTs...);
- using ResultType = Expected<ResultT>;
-};
-
-template <typename... ArgTs>
-class AsyncHandlerTraits<Error(std::function<Error(Error)>, ArgTs...)> {
-public:
- using Type = Error(ArgTs...);
- using ResultType = Error;
-};
-
-template <typename... ArgTs>
-class AsyncHandlerTraits<ErrorSuccess(std::function<Error(Error)>, ArgTs...)> {
-public:
- using Type = Error(ArgTs...);
- using ResultType = Error;
-};
-
-template <typename... ArgTs>
-class AsyncHandlerTraits<void(std::function<Error(Error)>, ArgTs...)> {
-public:
- using Type = Error(ArgTs...);
- using ResultType = Error;
-};
-
-template <typename ResponseHandlerT, typename... ArgTs>
-class AsyncHandlerTraits<Error(ResponseHandlerT, ArgTs...)>
- : public AsyncHandlerTraits<Error(std::decay_t<ResponseHandlerT>,
- ArgTs...)> {};
-
-// This template class provides utilities related to RPC function handlers.
-// The base case applies to non-function types (the template class is
-// specialized for function types) and inherits from the appropriate
-// specialization for the given non-function type's call operator.
-template <typename HandlerT>
-class HandlerTraits
- : public HandlerTraits<
- decltype(&std::remove_reference<HandlerT>::type::operator())> {};
-
-// Traits for handlers with a given function type.
-template <typename RetT, typename... ArgTs>
-class HandlerTraits<RetT(ArgTs...)> {
-public:
- // Function type of the handler.
- using Type = RetT(ArgTs...);
-
- // Return type of the handler.
- using ReturnType = RetT;
-
- // Call the given handler with the given arguments.
- template <typename HandlerT, typename... TArgTs>
- static typename WrappedHandlerReturn<RetT>::Type
- unpackAndRun(HandlerT &Handler, std::tuple<TArgTs...> &Args) {
- return unpackAndRunHelper(Handler, Args,
- std::index_sequence_for<TArgTs...>());
- }
-
- // Call the given handler with the given arguments.
- template <typename HandlerT, typename ResponderT, typename... TArgTs>
- static Error unpackAndRunAsync(HandlerT &Handler, ResponderT &Responder,
- std::tuple<TArgTs...> &Args) {
- return unpackAndRunAsyncHelper(Handler, Responder, Args,
- std::index_sequence_for<TArgTs...>());
- }
-
- // Call the given handler with the given arguments.
- template <typename HandlerT>
- static std::enable_if_t<
- std::is_void<typename HandlerTraits<HandlerT>::ReturnType>::value, Error>
- run(HandlerT &Handler, ArgTs &&...Args) {
- Handler(std::move(Args)...);
- return Error::success();
- }
-
- template <typename HandlerT, typename... TArgTs>
- static std::enable_if_t<
- !std::is_void<typename HandlerTraits<HandlerT>::ReturnType>::value,
- typename HandlerTraits<HandlerT>::ReturnType>
- run(HandlerT &Handler, TArgTs... Args) {
- return Handler(std::move(Args)...);
- }
-
- // Serialize arguments to the channel.
- template <typename ChannelT, typename... CArgTs>
- static Error serializeArgs(ChannelT &C, const CArgTs... CArgs) {
- return SequenceSerialization<ChannelT, ArgTs...>::serialize(C, CArgs...);
- }
-
- // Deserialize arguments from the channel.
- template <typename ChannelT, typename... CArgTs>
- static Error deserializeArgs(ChannelT &C, std::tuple<CArgTs...> &Args) {
- return deserializeArgsHelper(C, Args, std::index_sequence_for<CArgTs...>());
- }
-
-private:
- template <typename ChannelT, typename... CArgTs, size_t... Indexes>
- static Error deserializeArgsHelper(ChannelT &C, std::tuple<CArgTs...> &Args,
- std::index_sequence<Indexes...> _) {
- return SequenceSerialization<ChannelT, ArgTs...>::deserialize(
- C, std::get<Indexes>(Args)...);
- }
-
- template <typename HandlerT, typename ArgTuple, size_t... Indexes>
- static typename WrappedHandlerReturn<
- typename HandlerTraits<HandlerT>::ReturnType>::Type
- unpackAndRunHelper(HandlerT &Handler, ArgTuple &Args,
- std::index_sequence<Indexes...>) {
- return run(Handler, std::move(std::get<Indexes>(Args))...);
- }
-
- template <typename HandlerT, typename ResponderT, typename ArgTuple,
- size_t... Indexes>
- static typename WrappedHandlerReturn<
- typename HandlerTraits<HandlerT>::ReturnType>::Type
- unpackAndRunAsyncHelper(HandlerT &Handler, ResponderT &Responder,
- ArgTuple &Args, std::index_sequence<Indexes...>) {
- return run(Handler, Responder, std::move(std::get<Indexes>(Args))...);
- }
-};
-
-// Handler traits for free functions.
-template <typename RetT, typename... ArgTs>
-class HandlerTraits<RetT (*)(ArgTs...)> : public HandlerTraits<RetT(ArgTs...)> {
-};
-
-// Handler traits for class methods (especially call operators for lambdas).
-template <typename Class, typename RetT, typename... ArgTs>
-class HandlerTraits<RetT (Class::*)(ArgTs...)>
- : public HandlerTraits<RetT(ArgTs...)> {};
-
-// Handler traits for const class methods (especially call operators for
-// lambdas).
-template <typename Class, typename RetT, typename... ArgTs>
-class HandlerTraits<RetT (Class::*)(ArgTs...) const>
- : public HandlerTraits<RetT(ArgTs...)> {};
-
-// Utility to peel the Expected wrapper off a response handler error type.
-template <typename HandlerT> class ResponseHandlerArg;
-
-template <typename ArgT> class ResponseHandlerArg<Error(Expected<ArgT>)> {
-public:
- using ArgType = Expected<ArgT>;
- using UnwrappedArgType = ArgT;
-};
-
-template <typename ArgT>
-class ResponseHandlerArg<ErrorSuccess(Expected<ArgT>)> {
-public:
- using ArgType = Expected<ArgT>;
- using UnwrappedArgType = ArgT;
-};
-
-template <> class ResponseHandlerArg<Error(Error)> {
-public:
- using ArgType = Error;
-};
-
-template <> class ResponseHandlerArg<ErrorSuccess(Error)> {
-public:
- using ArgType = Error;
-};
-
-// ResponseHandler represents a handler for a not-yet-received function call
-// result.
-template <typename ChannelT> class ResponseHandler {
-public:
- virtual ~ResponseHandler() {}
-
- // Reads the function result off the wire and acts on it. The meaning of
- // "act" will depend on how this method is implemented in any given
- // ResponseHandler subclass but could, for example, mean running a
- // user-specified handler or setting a promise value.
- virtual Error handleResponse(ChannelT &C) = 0;
-
- // Abandons this outstanding result.
- virtual void abandon() = 0;
-
- // Create an error instance representing an abandoned response.
- static Error createAbandonedResponseError() {
- return make_error<ResponseAbandoned>();
- }
-};
-
-// ResponseHandler subclass for RPC functions with non-void returns.
-template <typename ChannelT, typename FuncRetT, typename HandlerT>
-class ResponseHandlerImpl : public ResponseHandler<ChannelT> {
-public:
- ResponseHandlerImpl(HandlerT Handler) : Handler(std::move(Handler)) {}
-
- // Handle the result by deserializing it from the channel then passing it
- // to the user defined handler.
- Error handleResponse(ChannelT &C) override {
- using UnwrappedArgType = typename ResponseHandlerArg<
- typename HandlerTraits<HandlerT>::Type>::UnwrappedArgType;
- UnwrappedArgType Result;
- if (auto Err =
- SerializationTraits<ChannelT, FuncRetT,
- UnwrappedArgType>::deserialize(C, Result))
- return Err;
- if (auto Err = C.endReceiveMessage())
- return Err;
- return Handler(std::move(Result));
- }
-
- // Abandon this response by calling the handler with an 'abandoned response'
- // error.
- void abandon() override {
- if (auto Err = Handler(this->createAbandonedResponseError())) {
- // Handlers should not fail when passed an abandoned response error.
- report_fatal_error(std::move(Err));
- }
- }
-
-private:
- HandlerT Handler;
-};
-
-// ResponseHandler subclass for RPC functions with void returns.
-template <typename ChannelT, typename HandlerT>
-class ResponseHandlerImpl<ChannelT, void, HandlerT>
- : public ResponseHandler<ChannelT> {
-public:
- ResponseHandlerImpl(HandlerT Handler) : Handler(std::move(Handler)) {}
-
- // Handle the result (no actual value, just a notification that the function
- // has completed on the remote end) by calling the user-defined handler with
- // Error::success().
- Error handleResponse(ChannelT &C) override {
- if (auto Err = C.endReceiveMessage())
- return Err;
- return Handler(Error::success());
- }
-
- // Abandon this response by calling the handler with an 'abandoned response'
- // error.
- void abandon() override {
- if (auto Err = Handler(this->createAbandonedResponseError())) {
- // Handlers should not fail when passed an abandoned response error.
- report_fatal_error(std::move(Err));
- }
- }
-
-private:
- HandlerT Handler;
-};
-
-template <typename ChannelT, typename FuncRetT, typename HandlerT>
-class ResponseHandlerImpl<ChannelT, Expected<FuncRetT>, HandlerT>
- : public ResponseHandler<ChannelT> {
-public:
- ResponseHandlerImpl(HandlerT Handler) : Handler(std::move(Handler)) {}
-
- // Handle the result by deserializing it from the channel then passing it
- // to the user defined handler.
- Error handleResponse(ChannelT &C) override {
- using HandlerArgType = typename ResponseHandlerArg<
- typename HandlerTraits<HandlerT>::Type>::ArgType;
- HandlerArgType Result((typename HandlerArgType::value_type()));
-
- if (auto Err = SerializationTraits<ChannelT, Expected<FuncRetT>,
- HandlerArgType>::deserialize(C, Result))
- return Err;
- if (auto Err = C.endReceiveMessage())
- return Err;
- return Handler(std::move(Result));
- }
-
- // Abandon this response by calling the handler with an 'abandoned response'
- // error.
- void abandon() override {
- if (auto Err = Handler(this->createAbandonedResponseError())) {
- // Handlers should not fail when passed an abandoned response error.
- report_fatal_error(std::move(Err));
- }
- }
-
-private:
- HandlerT Handler;
-};
-
-template <typename ChannelT, typename HandlerT>
-class ResponseHandlerImpl<ChannelT, Error, HandlerT>
- : public ResponseHandler<ChannelT> {
-public:
- ResponseHandlerImpl(HandlerT Handler) : Handler(std::move(Handler)) {}
-
- // Handle the result by deserializing it from the channel then passing it
- // to the user defined handler.
- Error handleResponse(ChannelT &C) override {
- Error Result = Error::success();
- if (auto Err = SerializationTraits<ChannelT, Error, Error>::deserialize(
- C, Result)) {
- consumeError(std::move(Result));
- return Err;
- }
- if (auto Err = C.endReceiveMessage()) {
- consumeError(std::move(Result));
- return Err;
- }
- return Handler(std::move(Result));
- }
-
- // Abandon this response by calling the handler with an 'abandoned response'
- // error.
- void abandon() override {
- if (auto Err = Handler(this->createAbandonedResponseError())) {
- // Handlers should not fail when passed an abandoned response error.
- report_fatal_error(std::move(Err));
- }
- }
-
-private:
- HandlerT Handler;
-};
-
-// Create a ResponseHandler from a given user handler.
-template <typename ChannelT, typename FuncRetT, typename HandlerT>
-std::unique_ptr<ResponseHandler<ChannelT>> createResponseHandler(HandlerT H) {
- return std::make_unique<ResponseHandlerImpl<ChannelT, FuncRetT, HandlerT>>(
- std::move(H));
-}
-
-// Helper for wrapping member functions up as functors. This is useful for
-// installing methods as result handlers.
-template <typename ClassT, typename RetT, typename... ArgTs>
-class MemberFnWrapper {
-public:
- using MethodT = RetT (ClassT::*)(ArgTs...);
- MemberFnWrapper(ClassT &Instance, MethodT Method)
- : Instance(Instance), Method(Method) {}
- RetT operator()(ArgTs &&...Args) {
- return (Instance.*Method)(std::move(Args)...);
- }
-
-private:
- ClassT &Instance;
- MethodT Method;
-};
-
-// Helper that provides a Functor for deserializing arguments.
-template <typename... ArgTs> class ReadArgs {
-public:
- Error operator()() { return Error::success(); }
-};
-
-template <typename ArgT, typename... ArgTs>
-class ReadArgs<ArgT, ArgTs...> : public ReadArgs<ArgTs...> {
-public:
- ReadArgs(ArgT &Arg, ArgTs &...Args) : ReadArgs<ArgTs...>(Args...), Arg(Arg) {}
-
- Error operator()(ArgT &ArgVal, ArgTs &...ArgVals) {
- this->Arg = std::move(ArgVal);
- return ReadArgs<ArgTs...>::operator()(ArgVals...);
- }
-
-private:
- ArgT &Arg;
-};
-
-// Manage sequence numbers.
-template <typename SequenceNumberT> class SequenceNumberManager {
-public:
- // Reset, making all sequence numbers available.
- void reset() {
- std::lock_guard<std::mutex> Lock(SeqNoLock);
- NextSequenceNumber = 0;
- FreeSequenceNumbers.clear();
- }
-
- // Get the next available sequence number. Will re-use numbers that have
- // been released.
- SequenceNumberT getSequenceNumber() {
- std::lock_guard<std::mutex> Lock(SeqNoLock);
- if (FreeSequenceNumbers.empty())
- return NextSequenceNumber++;
- auto SequenceNumber = FreeSequenceNumbers.back();
- FreeSequenceNumbers.pop_back();
- return SequenceNumber;
- }
-
- // Release a sequence number, making it available for re-use.
- void releaseSequenceNumber(SequenceNumberT SequenceNumber) {
- std::lock_guard<std::mutex> Lock(SeqNoLock);
- FreeSequenceNumbers.push_back(SequenceNumber);
- }
-
-private:
- std::mutex SeqNoLock;
- SequenceNumberT NextSequenceNumber = 0;
- std::vector<SequenceNumberT> FreeSequenceNumbers;
-};
-
-// Checks that predicate P holds for each corresponding pair of type arguments
-// from T1 and T2 tuple.
-template <template <class, class> class P, typename T1Tuple, typename T2Tuple>
-class RPCArgTypeCheckHelper;
-
-template <template <class, class> class P>
-class RPCArgTypeCheckHelper<P, std::tuple<>, std::tuple<>> {
-public:
- static const bool value = true;
-};
-
-template <template <class, class> class P, typename T, typename... Ts,
- typename U, typename... Us>
-class RPCArgTypeCheckHelper<P, std::tuple<T, Ts...>, std::tuple<U, Us...>> {
-public:
- static const bool value =
- P<T, U>::value &&
- RPCArgTypeCheckHelper<P, std::tuple<Ts...>, std::tuple<Us...>>::value;
-};
-
-template <template <class, class> class P, typename T1Sig, typename T2Sig>
-class RPCArgTypeCheck {
-public:
- using T1Tuple = typename RPCFunctionArgsTuple<T1Sig>::Type;
- using T2Tuple = typename RPCFunctionArgsTuple<T2Sig>::Type;
-
- static_assert(std::tuple_size<T1Tuple>::value >=
- std::tuple_size<T2Tuple>::value,
- "Too many arguments to RPC call");
- static_assert(std::tuple_size<T1Tuple>::value <=
- std::tuple_size<T2Tuple>::value,
- "Too few arguments to RPC call");
-
- static const bool value = RPCArgTypeCheckHelper<P, T1Tuple, T2Tuple>::value;
-};
-
-template <typename ChannelT, typename WireT, typename ConcreteT>
-class CanSerialize {
-private:
- using S = SerializationTraits<ChannelT, WireT, ConcreteT>;
-
- template <typename T>
- static std::true_type check(
- std::enable_if_t<std::is_same<decltype(T::serialize(
- std::declval<ChannelT &>(),
- std::declval<const ConcreteT &>())),
- Error>::value,
- void *>);
-
- template <typename> static std::false_type check(...);
-
-public:
- static const bool value = decltype(check<S>(0))::value;
-};
-
-template <typename ChannelT, typename WireT, typename ConcreteT>
-class CanDeserialize {
-private:
- using S = SerializationTraits<ChannelT, WireT, ConcreteT>;
-
- template <typename T>
- static std::true_type
- check(std::enable_if_t<
- std::is_same<decltype(T::deserialize(std::declval<ChannelT &>(),
- std::declval<ConcreteT &>())),
- Error>::value,
- void *>);
-
- template <typename> static std::false_type check(...);
-
-public:
- static const bool value = decltype(check<S>(0))::value;
-};
-
-/// Contains primitive utilities for defining, calling and handling calls to
-/// remote procedures. ChannelT is a bidirectional stream conforming to the
-/// RPCChannel interface (see RPCChannel.h), FunctionIdT is a procedure
-/// identifier type that must be serializable on ChannelT, and SequenceNumberT
-/// is an integral type that will be used to number in-flight function calls.
-///
-/// These utilities support the construction of very primitive RPC utilities.
-/// Their intent is to ensure correct serialization and deserialization of
-/// procedure arguments, and to keep the client and server's view of the API in
-/// sync.
-template <typename ImplT, typename ChannelT, typename FunctionIdT,
- typename SequenceNumberT>
-class RPCEndpointBase {
-protected:
- class OrcRPCInvalid : public RPCFunction<OrcRPCInvalid, void()> {
- public:
- static const char *getName() { return "__orc_rpc$invalid"; }
- };
-
- class OrcRPCResponse : public RPCFunction<OrcRPCResponse, void()> {
- public:
- static const char *getName() { return "__orc_rpc$response"; }
- };
-
- class OrcRPCNegotiate
- : public RPCFunction<OrcRPCNegotiate, FunctionIdT(std::string)> {
- public:
- static const char *getName() { return "__orc_rpc$negotiate"; }
- };
-
- // Helper predicate for testing for the presence of SerializationTraits
- // serializers.
- template <typename WireT, typename ConcreteT>
- class CanSerializeCheck : detail::CanSerialize<ChannelT, WireT, ConcreteT> {
- public:
- using detail::CanSerialize<ChannelT, WireT, ConcreteT>::value;
-
- static_assert(value, "Missing serializer for argument (Can't serialize the "
- "first template type argument of CanSerializeCheck "
- "from the second)");
- };
-
- // Helper predicate for testing for the presence of SerializationTraits
- // deserializers.
- template <typename WireT, typename ConcreteT>
- class CanDeserializeCheck
- : detail::CanDeserialize<ChannelT, WireT, ConcreteT> {
- public:
- using detail::CanDeserialize<ChannelT, WireT, ConcreteT>::value;
-
- static_assert(value, "Missing deserializer for argument (Can't deserialize "
- "the second template type argument of "
- "CanDeserializeCheck from the first)");
- };
-
-public:
- /// Construct an RPC instance on a channel.
- RPCEndpointBase(ChannelT &C, bool LazyAutoNegotiation)
- : C(C), LazyAutoNegotiation(LazyAutoNegotiation) {
- // Hold ResponseId in a special variable, since we expect Response to be
- // called relatively frequently, and want to avoid the map lookup.
- ResponseId = FnIdAllocator.getResponseId();
- RemoteFunctionIds[OrcRPCResponse::getPrototype()] = ResponseId;
-
- // Register the negotiate function id and handler.
- auto NegotiateId = FnIdAllocator.getNegotiateId();
- RemoteFunctionIds[OrcRPCNegotiate::getPrototype()] = NegotiateId;
- Handlers[NegotiateId] = wrapHandler<OrcRPCNegotiate>(
- [this](const std::string &Name) { return handleNegotiate(Name); });
- }
-
- /// Negotiate a function id for Func with the other end of the channel.
- template <typename Func> Error negotiateFunction(bool Retry = false) {
- return getRemoteFunctionId<Func>(true, Retry).takeError();
- }
-
- /// Append a call to Func; does not call send on the channel.
- /// The first argument specifies a user-defined handler to be run when the
- /// function returns. The handler should take an Expected<Func::ReturnType>,
- /// or an Error (if Func::ReturnType is void). The handler will be called
- /// with an error if the return value is abandoned due to a channel error.
- template <typename Func, typename HandlerT, typename... ArgTs>
- Error appendCallAsync(HandlerT Handler, const ArgTs &...Args) {
-
- static_assert(
- detail::RPCArgTypeCheck<CanSerializeCheck, typename Func::Type,
- void(ArgTs...)>::value,
- "");
-
- // Look up the function ID.
- FunctionIdT FnId;
- if (auto FnIdOrErr = getRemoteFunctionId<Func>(LazyAutoNegotiation, false))
- FnId = *FnIdOrErr;
- else {
- // Negotiation failed. Notify the handler then return the negotiate-failed
- // error.
- cantFail(Handler(make_error<ResponseAbandoned>()));
- return FnIdOrErr.takeError();
- }
-
- SequenceNumberT SeqNo; // initialized in locked scope below.
- {
- // Lock the pending responses map and sequence number manager.
- std::lock_guard<std::mutex> Lock(ResponsesMutex);
-
- // Allocate a sequence number.
- SeqNo = SequenceNumberMgr.getSequenceNumber();
- assert(!PendingResponses.count(SeqNo) &&
- "Sequence number already allocated");
-
- // Install the user handler.
- PendingResponses[SeqNo] =
- detail::createResponseHandler<ChannelT, typename Func::ReturnType>(
- std::move(Handler));
- }
-
- // Open the function call message.
- if (auto Err = C.startSendMessage(FnId, SeqNo)) {
- abandonPendingResponses();
- return Err;
- }
-
- // Serialize the call arguments.
- if (auto Err = detail::HandlerTraits<typename Func::Type>::serializeArgs(
- C, Args...)) {
- abandonPendingResponses();
- return Err;
- }
-
- // Close the function call message.
- if (auto Err = C.endSendMessage()) {
- abandonPendingResponses();
- return Err;
- }
-
- return Error::success();
- }
-
- Error sendAppendedCalls() { return C.send(); };
-
- template <typename Func, typename HandlerT, typename... ArgTs>
- Error callAsync(HandlerT Handler, const ArgTs &...Args) {
- if (auto Err = appendCallAsync<Func>(std::move(Handler), Args...))
- return Err;
- return C.send();
- }
-
- /// Handle one incoming call.
- Error handleOne() {
- FunctionIdT FnId;
- SequenceNumberT SeqNo;
- if (auto Err = C.startReceiveMessage(FnId, SeqNo)) {
- abandonPendingResponses();
- return Err;
- }
- if (FnId == ResponseId)
- return handleResponse(SeqNo);
- auto I = Handlers.find(FnId);
- if (I != Handlers.end())
- return I->second(C, SeqNo);
-
- // else: No handler found. Report error to client?
- return make_error<BadFunctionCall<FunctionIdT, SequenceNumberT>>(FnId,
- SeqNo);
- }
-
- /// Helper for handling setter procedures - this method returns a functor that
- /// sets the variables referred to by Args... to values deserialized from the
- /// channel.
- /// E.g.
- ///
- /// typedef Function<0, bool, int> Func1;
- ///
- /// ...
- /// bool B;
- /// int I;
- /// if (auto Err = expect<Func1>(Channel, readArgs(B, I)))
- /// /* Handle Args */ ;
- ///
- template <typename... ArgTs>
- static detail::ReadArgs<ArgTs...> readArgs(ArgTs &...Args) {
- return detail::ReadArgs<ArgTs...>(Args...);
- }
-
- /// Abandon all outstanding result handlers.
- ///
- /// This will call all currently registered result handlers to receive an
- /// "abandoned" error as their argument. This is used internally by the RPC
- /// in error situations, but can also be called directly by clients who are
- /// disconnecting from the remote and don't or can't expect responses to their
- /// outstanding calls. (Especially for outstanding blocking calls, calling
- /// this function may be necessary to avoid dead threads).
- void abandonPendingResponses() {
- // Lock the pending responses map and sequence number manager.
- std::lock_guard<std::mutex> Lock(ResponsesMutex);
-
- for (auto &KV : PendingResponses)
- KV.second->abandon();
- PendingResponses.clear();
- SequenceNumberMgr.reset();
- }
-
- /// Remove the handler for the given function.
- /// A handler must currently be registered for this function.
- template <typename Func> void removeHandler() {
- auto IdItr = LocalFunctionIds.find(Func::getPrototype());
- assert(IdItr != LocalFunctionIds.end() &&
- "Function does not have a registered handler");
- auto HandlerItr = Handlers.find(IdItr->second);
- assert(HandlerItr != Handlers.end() &&
- "Function does not have a registered handler");
- Handlers.erase(HandlerItr);
- }
-
- /// Clear all handlers.
- void clearHandlers() { Handlers.clear(); }
-
-protected:
- FunctionIdT getInvalidFunctionId() const {
- return FnIdAllocator.getInvalidId();
- }
-
- /// Add the given handler to the handler map and make it available for
- /// autonegotiation and execution.
- template <typename Func, typename HandlerT>
- void addHandlerImpl(HandlerT Handler) {
-
- static_assert(detail::RPCArgTypeCheck<
- CanDeserializeCheck, typename Func::Type,
- typename detail::HandlerTraits<HandlerT>::Type>::value,
- "");
-
- FunctionIdT NewFnId = FnIdAllocator.template allocate<Func>();
- LocalFunctionIds[Func::getPrototype()] = NewFnId;
- Handlers[NewFnId] = wrapHandler<Func>(std::move(Handler));
- }
-
- template <typename Func, typename HandlerT>
- void addAsyncHandlerImpl(HandlerT Handler) {
-
- static_assert(
- detail::RPCArgTypeCheck<
- CanDeserializeCheck, typename Func::Type,
- typename detail::AsyncHandlerTraits<
- typename detail::HandlerTraits<HandlerT>::Type>::Type>::value,
- "");
-
- FunctionIdT NewFnId = FnIdAllocator.template allocate<Func>();
- LocalFunctionIds[Func::getPrototype()] = NewFnId;
- Handlers[NewFnId] = wrapAsyncHandler<Func>(std::move(Handler));
- }
-
- Error handleResponse(SequenceNumberT SeqNo) {
- using Handler = typename decltype(PendingResponses)::mapped_type;
- Handler PRHandler;
-
- {
- // Lock the pending responses map and sequence number manager.
- std::unique_lock<std::mutex> Lock(ResponsesMutex);
- auto I = PendingResponses.find(SeqNo);
-
- if (I != PendingResponses.end()) {
- PRHandler = std::move(I->second);
- PendingResponses.erase(I);
- SequenceNumberMgr.releaseSequenceNumber(SeqNo);
- } else {
- // Unlock the pending results map to prevent recursive lock.
- Lock.unlock();
- abandonPendingResponses();
- return make_error<InvalidSequenceNumberForResponse<SequenceNumberT>>(
- SeqNo);
- }
- }
-
- assert(PRHandler &&
- "If we didn't find a response handler we should have bailed out");
-
- if (auto Err = PRHandler->handleResponse(C)) {
- abandonPendingResponses();
- return Err;
- }
-
- return Error::success();
- }
-
- FunctionIdT handleNegotiate(const std::string &Name) {
- auto I = LocalFunctionIds.find(Name);
- if (I == LocalFunctionIds.end())
- return getInvalidFunctionId();
- return I->second;
- }
-
- // Find the remote FunctionId for the given function.
- template <typename Func>
- Expected<FunctionIdT> getRemoteFunctionId(bool NegotiateIfNotInMap,
- bool NegotiateIfInvalid) {
- bool DoNegotiate;
-
- // Check if we already have a function id...
- auto I = RemoteFunctionIds.find(Func::getPrototype());
- if (I != RemoteFunctionIds.end()) {
- // If it's valid there's nothing left to do.
- if (I->second != getInvalidFunctionId())
- return I->second;
- DoNegotiate = NegotiateIfInvalid;
- } else
- DoNegotiate = NegotiateIfNotInMap;
-
- // We don't have a function id for Func yet, but we're allowed to try to
- // negotiate one.
- if (DoNegotiate) {
- auto &Impl = static_cast<ImplT &>(*this);
- if (auto RemoteIdOrErr =
- Impl.template callB<OrcRPCNegotiate>(Func::getPrototype())) {
- RemoteFunctionIds[Func::getPrototype()] = *RemoteIdOrErr;
- if (*RemoteIdOrErr == getInvalidFunctionId())
- return make_error<CouldNotNegotiate>(Func::getPrototype());
- return *RemoteIdOrErr;
- } else
- return RemoteIdOrErr.takeError();
- }
-
- // No key was available in the map and we weren't allowed to try to
- // negotiate one, so return an unknown function error.
- return make_error<CouldNotNegotiate>(Func::getPrototype());
- }
-
- using WrappedHandlerFn = std::function<Error(ChannelT &, SequenceNumberT)>;
-
- // Wrap the given user handler in the necessary argument-deserialization code,
- // result-serialization code, and call to the launch policy (if present).
- template <typename Func, typename HandlerT>
- WrappedHandlerFn wrapHandler(HandlerT Handler) {
- return [this, Handler](ChannelT &Channel,
- SequenceNumberT SeqNo) mutable -> Error {
- // Start by deserializing the arguments.
- using ArgsTuple = typename detail::RPCFunctionArgsTuple<
- typename detail::HandlerTraits<HandlerT>::Type>::Type;
- auto Args = std::make_shared<ArgsTuple>();
-
- if (auto Err =
- detail::HandlerTraits<typename Func::Type>::deserializeArgs(
- Channel, *Args))
- return Err;
-
- // GCC 4.7 and 4.8 incorrectly issue a -Wunused-but-set-variable warning
- // for RPCArgs. Void cast RPCArgs to work around this for now.
- // FIXME: Remove this workaround once we can assume a working GCC version.
- (void)Args;
-
- // End receive message, unlocking the channel for reading.
- if (auto Err = Channel.endReceiveMessage())
- return Err;
-
- using HTraits = detail::HandlerTraits<HandlerT>;
- using FuncReturn = typename Func::ReturnType;
- return detail::respond<FuncReturn>(Channel, ResponseId, SeqNo,
- HTraits::unpackAndRun(Handler, *Args));
- };
- }
-
- // Wrap the given user handler in the necessary argument-deserialization code,
- // result-serialization code, and call to the launch policy (if present).
- template <typename Func, typename HandlerT>
- WrappedHandlerFn wrapAsyncHandler(HandlerT Handler) {
- return [this, Handler](ChannelT &Channel,
- SequenceNumberT SeqNo) mutable -> Error {
- // Start by deserializing the arguments.
- using AHTraits = detail::AsyncHandlerTraits<
- typename detail::HandlerTraits<HandlerT>::Type>;
- using ArgsTuple =
- typename detail::RPCFunctionArgsTuple<typename AHTraits::Type>::Type;
- auto Args = std::make_shared<ArgsTuple>();
-
- if (auto Err =
- detail::HandlerTraits<typename Func::Type>::deserializeArgs(
- Channel, *Args))
- return Err;
-
- // GCC 4.7 and 4.8 incorrectly issue a -Wunused-but-set-variable warning
- // for RPCArgs. Void cast RPCArgs to work around this for now.
- // FIXME: Remove this workaround once we can assume a working GCC version.
- (void)Args;
-
- // End receive message, unlocking the channel for reading.
- if (auto Err = Channel.endReceiveMessage())
- return Err;
-
- using HTraits = detail::HandlerTraits<HandlerT>;
- using FuncReturn = typename Func::ReturnType;
- auto Responder = [this,
- SeqNo](typename AHTraits::ResultType RetVal) -> Error {
- return detail::respond<FuncReturn>(C, ResponseId, SeqNo,
- std::move(RetVal));
- };
-
- return HTraits::unpackAndRunAsync(Handler, Responder, *Args);
- };
- }
-
- ChannelT &C;
-
- bool LazyAutoNegotiation;
-
- RPCFunctionIdAllocator<FunctionIdT> FnIdAllocator;
-
- FunctionIdT ResponseId;
- std::map<std::string, FunctionIdT> LocalFunctionIds;
- std::map<const char *, FunctionIdT> RemoteFunctionIds;
-
- std::map<FunctionIdT, WrappedHandlerFn> Handlers;
-
- std::mutex ResponsesMutex;
- detail::SequenceNumberManager<SequenceNumberT> SequenceNumberMgr;
- std::map<SequenceNumberT, std::unique_ptr<detail::ResponseHandler<ChannelT>>>
- PendingResponses;
-};
-
-} // end namespace detail
-
-template <typename ChannelT, typename FunctionIdT = uint32_t,
- typename SequenceNumberT = uint32_t>
-class MultiThreadedRPCEndpoint
- : public detail::RPCEndpointBase<
- MultiThreadedRPCEndpoint<ChannelT, FunctionIdT, SequenceNumberT>,
- ChannelT, FunctionIdT, SequenceNumberT> {
-private:
- using BaseClass = detail::RPCEndpointBase<
- MultiThreadedRPCEndpoint<ChannelT, FunctionIdT, SequenceNumberT>,
- ChannelT, FunctionIdT, SequenceNumberT>;
-
-public:
- MultiThreadedRPCEndpoint(ChannelT &C, bool LazyAutoNegotiation)
- : BaseClass(C, LazyAutoNegotiation) {}
-
- /// Add a handler for the given RPC function.
- /// This installs the given handler functor for the given RPCFunction, and
- /// makes the RPC function available for negotiation/calling from the remote.
- template <typename Func, typename HandlerT>
- void addHandler(HandlerT Handler) {
- return this->template addHandlerImpl<Func>(std::move(Handler));
- }
-
- /// Add a class-method as a handler.
- template <typename Func, typename ClassT, typename RetT, typename... ArgTs>
- void addHandler(ClassT &Object, RetT (ClassT::*Method)(ArgTs...)) {
- addHandler<Func>(
- detail::MemberFnWrapper<ClassT, RetT, ArgTs...>(Object, Method));
- }
-
- template <typename Func, typename HandlerT>
- void addAsyncHandler(HandlerT Handler) {
- return this->template addAsyncHandlerImpl<Func>(std::move(Handler));
- }
-
- /// Add a class-method as a handler.
- template <typename Func, typename ClassT, typename RetT, typename... ArgTs>
- void addAsyncHandler(ClassT &Object, RetT (ClassT::*Method)(ArgTs...)) {
- addAsyncHandler<Func>(
- detail::MemberFnWrapper<ClassT, RetT, ArgTs...>(Object, Method));
- }
-
- /// Return type for non-blocking call primitives.
- template <typename Func>
- using NonBlockingCallResult = typename detail::ResultTraits<
- typename Func::ReturnType>::ReturnFutureType;
-
- /// Call Func on Channel C. Does not block and does not call send on the
- /// channel. Returns a future that will receive the result, or an
- /// abandonment error if the channel fails before the response arrives.
- ///
- /// The call is only appended to the channel's outgoing buffer; call
- /// sendAppendedCalls (or use callNB, which sends automatically) to flush it.
- template <typename Func, typename... ArgTs>
- Expected<NonBlockingCallResult<Func>> appendCallNB(const ArgTs &...Args) {
- using RTraits = detail::ResultTraits<typename Func::ReturnType>;
- using ErrorReturn = typename RTraits::ErrorReturnType;
- using ErrorReturnPromise = typename RTraits::ReturnPromiseType;
-
- ErrorReturnPromise Promise;
- auto FutureResult = Promise.get_future();
-
- if (auto Err = this->template appendCallAsync<Func>(
- [Promise = std::move(Promise)](ErrorReturn RetOrErr) mutable {
- Promise.set_value(std::move(RetOrErr));
- return Error::success();
- },
- Args...)) {
- RTraits::consumeAbandoned(FutureResult.get());
- return std::move(Err);
- }
- return std::move(FutureResult);
- }
-
- /// The same as appendCallNB, except that it calls C.send() to flush the
- /// channel after serializing the call.
- template <typename Func, typename... ArgTs>
- Expected<NonBlockingCallResult<Func>> callNB(const ArgTs &...Args) {
- auto Result = appendCallNB<Func>(Args...);
- if (!Result)
- return Result;
- if (auto Err = this->C.send()) {
- this->abandonPendingResponses();
- detail::ResultTraits<typename Func::ReturnType>::consumeAbandoned(
- std::move(Result->get()));
- return std::move(Err);
- }
- return Result;
- }
-
- /// Call Func on Channel C. Blocks waiting for a result. Returns an Error
- /// for void functions or an Expected<T> for functions returning a T.
- ///
- /// This function is for use in threaded code where another thread is
- /// handling responses and incoming calls.
- template <typename Func, typename... ArgTs,
- typename AltRetT = typename Func::ReturnType>
- typename detail::ResultTraits<AltRetT>::ErrorReturnType
- callB(const ArgTs &...Args) {
- if (auto FutureResOrErr = callNB<Func>(Args...))
- return FutureResOrErr->get();
- else
- return FutureResOrErr.takeError();
- }
-
- /// Handle incoming RPC calls.
- Error handlerLoop() {
- while (true)
- if (auto Err = this->handleOne())
- return Err;
- return Error::success();
- }
-};
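// A minimal sketch of how the multi-threaded endpoint above is typically
// wired up. "QueueChannel", "Add", and runAddExample are hypothetical
// stand-ins (any RawByteChannel-derived channel pair and any RPCFunction
// will do); the endpoint members used (addHandler, handlerLoop, callB,
// callAsync) are the ones declared in this class. Assumes <thread> and
// <cassert> are available.
class Add : public RPCFunction<Add, int32_t(int32_t, int32_t)> {
public:
  static const char *getName() { return "Add"; }
};

inline void runAddExample(QueueChannel &ServerChan, QueueChannel &ClientChan) {
  MultiThreadedRPCEndpoint<QueueChannel> Server(ServerChan, true);
  MultiThreadedRPCEndpoint<QueueChannel> Client(ClientChan, true);

  // Install a handler on the server. Handlers may return the plain result
  // type; response serialization is generated from the Func signature.
  Server.addHandler<Add>([](int32_t X, int32_t Y) { return X + Y; });

  // Service incoming messages on both ends: calls on the server, responses
  // (including negotiate replies) on the client.
  std::thread ServerLoop([&]() { consumeError(Server.handlerLoop()); });
  std::thread ClientLoop([&]() { consumeError(Client.handlerLoop()); });

  // Blocking call: returns Expected<int32_t> once the response arrives.
  Expected<int32_t> Sum = Client.callB<Add>(2, 3);
  if (!Sum)
    consumeError(Sum.takeError());
  else
    assert(*Sum == 5 && "unexpected result");

  // Asynchronous call: the handler takes Expected<int32_t> and returns
  // Error, per the appendCallAsync contract documented above.
  cantFail(Client.callAsync<Add>(
      [](Expected<int32_t> R) -> Error {
        if (!R)
          return R.takeError();
        return Error::success();
      },
      40, 2));

  // Sketch only: real code would close the channels to unblock the handler
  // loops and then join the threads.
  ServerLoop.detach();
  ClientLoop.detach();
}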
-
-template <typename ChannelT, typename FunctionIdT = uint32_t,
- typename SequenceNumberT = uint32_t>
-class SingleThreadedRPCEndpoint
- : public detail::RPCEndpointBase<
- SingleThreadedRPCEndpoint<ChannelT, FunctionIdT, SequenceNumberT>,
- ChannelT, FunctionIdT, SequenceNumberT> {
-private:
- using BaseClass = detail::RPCEndpointBase<
- SingleThreadedRPCEndpoint<ChannelT, FunctionIdT, SequenceNumberT>,
- ChannelT, FunctionIdT, SequenceNumberT>;
-
-public:
- SingleThreadedRPCEndpoint(ChannelT &C, bool LazyAutoNegotiation)
- : BaseClass(C, LazyAutoNegotiation) {}
-
- template <typename Func, typename HandlerT>
- void addHandler(HandlerT Handler) {
- return this->template addHandlerImpl<Func>(std::move(Handler));
- }
-
- template <typename Func, typename ClassT, typename RetT, typename... ArgTs>
- void addHandler(ClassT &Object, RetT (ClassT::*Method)(ArgTs...)) {
- addHandler<Func>(
- detail::MemberFnWrapper<ClassT, RetT, ArgTs...>(Object, Method));
- }
-
- template <typename Func, typename HandlerT>
- void addAsyncHandler(HandlerT Handler) {
- return this->template addAsyncHandlerImpl<Func>(std::move(Handler));
- }
-
- /// Add a class-method as a handler.
- template <typename Func, typename ClassT, typename RetT, typename... ArgTs>
- void addAsyncHandler(ClassT &Object, RetT (ClassT::*Method)(ArgTs...)) {
- addAsyncHandler<Func>(
- detail::MemberFnWrapper<ClassT, RetT, ArgTs...>(Object, Method));
- }
-
- template <typename Func, typename... ArgTs,
- typename AltRetT = typename Func::ReturnType>
- typename detail::ResultTraits<AltRetT>::ErrorReturnType
- callB(const ArgTs &...Args) {
- bool ReceivedResponse = false;
- using AltRetTraits = detail::ResultTraits<AltRetT>;
- using ResultType = typename AltRetTraits::ErrorReturnType;
- ResultType Result = AltRetTraits::createBlankErrorReturnValue();
-
- // We have to 'Check' result (which we know is in a success state at this
- // point) so that it can be overwritten in the async handler.
- (void)!!Result;
-
- if (Error Err = this->template appendCallAsync<Func>(
- [&](ResultType R) {
- Result = std::move(R);
- ReceivedResponse = true;
- return Error::success();
- },
- Args...)) {
- AltRetTraits::consumeAbandoned(std::move(Result));
- return AltRetTraits::returnError(std::move(Err));
- }
-
- if (Error Err = this->C.send()) {
- AltRetTraits::consumeAbandoned(std::move(Result));
- return AltRetTraits::returnError(std::move(Err));
- }
-
- while (!ReceivedResponse) {
- if (Error Err = this->handleOne()) {
- AltRetTraits::consumeAbandoned(std::move(Result));
- return AltRetTraits::returnError(std::move(Err));
- }
- }
-
- return Result;
- }
-};
-
-/// Asynchronous dispatch for a function on an RPC endpoint.
-template <typename RPCClass, typename Func> class RPCAsyncDispatch {
-public:
- RPCAsyncDispatch(RPCClass &Endpoint) : Endpoint(Endpoint) {}
-
- template <typename HandlerT, typename... ArgTs>
- Error operator()(HandlerT Handler, const ArgTs &...Args) const {
- return Endpoint.template appendCallAsync<Func>(std::move(Handler), Args...);
- }
-
-private:
- RPCClass &Endpoint;
-};
-
-/// Construct an asynchronous dispatcher from an RPC endpoint and a Func.
-template <typename Func, typename RPCEndpointT>
-RPCAsyncDispatch<RPCEndpointT, Func> rpcAsyncDispatch(RPCEndpointT &Endpoint) {
- return RPCAsyncDispatch<RPCEndpointT, Func>(Endpoint);
-}
-
-/// Allows a set of asynchronous calls to be dispatched, and then
-/// waited on as a group.
-class ParallelCallGroup {
-public:
- ParallelCallGroup() = default;
- ParallelCallGroup(const ParallelCallGroup &) = delete;
- ParallelCallGroup &operator=(const ParallelCallGroup &) = delete;
-
- /// Make an asynchronous call.
- template <typename AsyncDispatcher, typename HandlerT, typename... ArgTs>
- Error call(const AsyncDispatcher &AsyncDispatch, HandlerT Handler,
- const ArgTs &...Args) {
- // Increment the count of outstanding calls. This has to happen before
- // we invoke the call, as the handler may (depending on scheduling)
- // be run immediately on another thread, and we don't want the decrement
- // in the wrapped handler below to run before the increment.
- {
- std::unique_lock<std::mutex> Lock(M);
- ++NumOutstandingCalls;
- }
-
- // Wrap the user handler in a lambda that will decrement the
- // outstanding calls count, then poke the condition variable.
- using ArgType = typename detail::ResponseHandlerArg<
- typename detail::HandlerTraits<HandlerT>::Type>::ArgType;
- auto WrappedHandler = [this, Handler = std::move(Handler)](ArgType Arg) {
- auto Err = Handler(std::move(Arg));
- std::unique_lock<std::mutex> Lock(M);
- --NumOutstandingCalls;
- CV.notify_all();
- return Err;
- };
-
- return AsyncDispatch(std::move(WrappedHandler), Args...);
- }
-
- /// Blocks until all calls have been completed and their return value
- /// handlers run.
- void wait() {
- std::unique_lock<std::mutex> Lock(M);
- while (NumOutstandingCalls > 0)
- CV.wait(Lock);
- }
-
-private:
- std::mutex M;
- std::condition_variable CV;
- uint32_t NumOutstandingCalls = 0;
-};
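// A sketch of batching asynchronous calls through ParallelCallGroup. "Add"
// is the hypothetical RPC function from the sketch above; the endpoint type
// is left generic. Assumes another thread is servicing the endpoint's
// handlerLoop so that responses (which run the wrapped handlers) arrive.
template <typename RPCEndpointT>
Error addManyInParallel(RPCEndpointT &EP, std::vector<int32_t> &Results) {
  ParallelCallGroup PCG;
  Results.resize(3);

  for (int32_t I = 0; I != 3; ++I) {
    // Each wrapped handler stores its result and decrements the group's
    // outstanding-call count when it runs.
    if (auto Err = PCG.call(rpcAsyncDispatch<Add>(EP),
                            [&Results, I](Expected<int32_t> R) -> Error {
                              if (!R)
                                return R.takeError();
                              Results[I] = *R;
                              return Error::success();
                            },
                            I, I))
      return Err;
  }

  // Flush the appended calls, then block until every handler has run.
  if (auto Err = EP.sendAppendedCalls())
    return Err;
  PCG.wait();
  return Error::success();
}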
-
-/// Convenience class for grouping RPCFunctions into APIs that can be
-/// negotiated as a block.
-///
-template <typename... Funcs> class APICalls {
-public:
- /// Test whether this API contains Function F.
- template <typename F> class Contains {
- public:
- static const bool value = false;
- };
-
- /// Negotiate all functions in this API.
- template <typename RPCEndpoint> static Error negotiate(RPCEndpoint &R) {
- return Error::success();
- }
-};
-
-template <typename Func, typename... Funcs> class APICalls<Func, Funcs...> {
-public:
- template <typename F> class Contains {
- public:
- static const bool value = std::is_same<F, Func>::value |
- APICalls<Funcs...>::template Contains<F>::value;
- };
-
- template <typename RPCEndpoint> static Error negotiate(RPCEndpoint &R) {
- if (auto Err = R.template negotiateFunction<Func>())
- return Err;
- return APICalls<Funcs...>::negotiate(R);
- }
-};
-
-template <typename... InnerFuncs, typename... Funcs>
-class APICalls<APICalls<InnerFuncs...>, Funcs...> {
-public:
- template <typename F> class Contains {
- public:
- static const bool value =
- APICalls<InnerFuncs...>::template Contains<F>::value |
- APICalls<Funcs...>::template Contains<F>::value;
- };
-
- template <typename RPCEndpoint> static Error negotiate(RPCEndpoint &R) {
- if (auto Err = APICalls<InnerFuncs...>::negotiate(R))
- return Err;
- return APICalls<Funcs...>::negotiate(R);
- }
-};
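// A sketch of grouping functions into an API that can be negotiated as a
// block. "Add" is the hypothetical function from the earlier sketch and
// "Sub" is another hypothetical one; negotiate() simply calls
// negotiateFunction for each member, so the same response-servicing
// requirements as for blocking calls apply.
class Sub : public RPCFunction<Sub, int32_t(int32_t, int32_t)> {
public:
  static const char *getName() { return "Sub"; }
};

using ArithmeticAPI = APICalls<Add, Sub>;

template <typename RPCEndpointT>
Error negotiateArithmetic(RPCEndpointT &EP) {
  // Contains<F> can back static_asserts documenting what an API provides.
  static_assert(ArithmeticAPI::Contains<Add>::value, "Add must be in the API");
  // Resolve remote function ids for every member up front rather than
  // lazily on first call.
  return ArithmeticAPI::negotiate(EP);
}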
-
-} // end namespace shared
-} // end namespace orc
-} // end namespace llvm
-
-#endif // LLVM_EXECUTIONENGINE_ORC_SHARED_RPCUTILS_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/RawByteChannel.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/RawByteChannel.h
deleted file mode 100644
index 2ee471939251..000000000000
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/RawByteChannel.h
+++ /dev/null
@@ -1,183 +0,0 @@
-//===- RawByteChannel.h -----------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_RAWBYTECHANNEL_H
-#define LLVM_EXECUTIONENGINE_ORC_SHARED_RAWBYTECHANNEL_H
-
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ExecutionEngine/Orc/Shared/Serialization.h"
-#include "llvm/Support/Endian.h"
-#include "llvm/Support/Error.h"
-#include <cstdint>
-#include <mutex>
-#include <string>
-#include <type_traits>
-
-namespace llvm {
-namespace orc {
-namespace shared {
-
-/// Interface for byte-streams to be used with ORC Serialization.
-class RawByteChannel {
-public:
- virtual ~RawByteChannel() = default;
-
- /// Read Size bytes from the stream into *Dst.
- virtual Error readBytes(char *Dst, unsigned Size) = 0;
-
- /// Append Size bytes from *Src to the stream.
- virtual Error appendBytes(const char *Src, unsigned Size) = 0;
-
- /// Flush the stream if possible.
- virtual Error send() = 0;
-
- /// Notify the channel that we're starting a message send.
- /// Locks the channel for writing.
- template <typename FunctionIdT, typename SequenceIdT>
- Error startSendMessage(const FunctionIdT &FnId, const SequenceIdT &SeqNo) {
- writeLock.lock();
- if (auto Err = serializeSeq(*this, FnId, SeqNo)) {
- writeLock.unlock();
- return Err;
- }
- return Error::success();
- }
-
- /// Notify the channel that we're ending a message send.
- /// Unlocks the channel for writing.
- Error endSendMessage() {
- writeLock.unlock();
- return Error::success();
- }
-
- /// Notify the channel that we're starting a message receive.
- /// Locks the channel for reading.
- template <typename FunctionIdT, typename SequenceNumberT>
- Error startReceiveMessage(FunctionIdT &FnId, SequenceNumberT &SeqNo) {
- readLock.lock();
- if (auto Err = deserializeSeq(*this, FnId, SeqNo)) {
- readLock.unlock();
- return Err;
- }
- return Error::success();
- }
-
- /// Notify the channel that we're ending a message receive.
- /// Unlocks the channel for reading.
- Error endReceiveMessage() {
- readLock.unlock();
- return Error::success();
- }
-
- /// Get the lock for stream reading.
- std::mutex &getReadLock() { return readLock; }
-
- /// Get the lock for stream writing.
- std::mutex &getWriteLock() { return writeLock; }
-
-private:
- std::mutex readLock, writeLock;
-};
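// A sketch of a concrete channel: an in-memory buffer implementing the three
// primitives RawByteChannel requires. "InMemoryChannel" is hypothetical and
// single-threaded; a real channel would wrap a socket, pipe, or queue and
// block in readBytes until data arrives. Assumes <vector> and <cstring>.
class InMemoryChannel : public RawByteChannel {
public:
  Error readBytes(char *Dst, unsigned Size) override {
    if (Buffer.size() - ReadIdx < Size)
      return make_error<StringError>("short read on InMemoryChannel",
                                     inconvertibleErrorCode());
    std::memcpy(Dst, Buffer.data() + ReadIdx, Size);
    ReadIdx += Size;
    return Error::success();
  }

  Error appendBytes(const char *Src, unsigned Size) override {
    Buffer.insert(Buffer.end(), Src, Src + Size);
    return Error::success();
  }

  // Nothing to flush for an in-memory buffer.
  Error send() override { return Error::success(); }

private:
  std::vector<char> Buffer;
  size_t ReadIdx = 0;
};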
-
-template <typename ChannelT, typename T>
-class SerializationTraits<
- ChannelT, T, T,
- std::enable_if_t<
- std::is_base_of<RawByteChannel, ChannelT>::value &&
- (std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value ||
- std::is_same<T, uint16_t>::value || std::is_same<T, int16_t>::value ||
- std::is_same<T, uint32_t>::value || std::is_same<T, int32_t>::value ||
- std::is_same<T, uint64_t>::value || std::is_same<T, int64_t>::value ||
- std::is_same<T, char>::value)>> {
-public:
- static Error serialize(ChannelT &C, T V) {
- support::endian::byte_swap<T, support::big>(V);
- return C.appendBytes(reinterpret_cast<const char *>(&V), sizeof(T));
- };
-
- static Error deserialize(ChannelT &C, T &V) {
- if (auto Err = C.readBytes(reinterpret_cast<char *>(&V), sizeof(T)))
- return Err;
- support::endian::byte_swap<T, support::big>(V);
- return Error::success();
- };
-};
-
-template <typename ChannelT>
-class SerializationTraits<
- ChannelT, bool, bool,
- std::enable_if_t<std::is_base_of<RawByteChannel, ChannelT>::value>> {
-public:
- static Error serialize(ChannelT &C, bool V) {
- uint8_t Tmp = V ? 1 : 0;
- if (auto Err = C.appendBytes(reinterpret_cast<const char *>(&Tmp), 1))
- return Err;
- return Error::success();
- }
-
- static Error deserialize(ChannelT &C, bool &V) {
- uint8_t Tmp = 0;
- if (auto Err = C.readBytes(reinterpret_cast<char *>(&Tmp), 1))
- return Err;
- V = Tmp != 0;
- return Error::success();
- }
-};
-
-template <typename ChannelT>
-class SerializationTraits<
- ChannelT, std::string, StringRef,
- std::enable_if_t<std::is_base_of<RawByteChannel, ChannelT>::value>> {
-public:
- /// Serialize a StringRef as a std::string on the channel.
- static Error serialize(RawByteChannel &C, StringRef S) {
- if (auto Err = serializeSeq(C, static_cast<uint64_t>(S.size())))
- return Err;
- return C.appendBytes((const char *)S.data(), S.size());
- }
-};
-
-template <typename ChannelT, typename T>
-class SerializationTraits<
- ChannelT, std::string, T,
- std::enable_if_t<std::is_base_of<RawByteChannel, ChannelT>::value &&
- (std::is_same<T, const char *>::value ||
- std::is_same<T, char *>::value)>> {
-public:
- static Error serialize(RawByteChannel &C, const char *S) {
- return SerializationTraits<ChannelT, std::string, StringRef>::serialize(C,
- S);
- }
-};
-
-template <typename ChannelT>
-class SerializationTraits<
- ChannelT, std::string, std::string,
- std::enable_if_t<std::is_base_of<RawByteChannel, ChannelT>::value>> {
-public:
- /// Serialize a std::string on the channel.
- static Error serialize(RawByteChannel &C, const std::string &S) {
- return SerializationTraits<ChannelT, std::string, StringRef>::serialize(C,
- S);
- }
-
- /// Deserialize a std::string from the channel.
- static Error deserialize(RawByteChannel &C, std::string &S) {
- uint64_t Count = 0;
- if (auto Err = deserializeSeq(C, Count))
- return Err;
- S.resize(Count);
- return C.readBytes(&S[0], Count);
- }
-};
-
-} // end namespace shared
-} // end namespace orc
-} // end namespace llvm
-
-#endif // LLVM_EXECUTIONENGINE_ORC_SHARED_RAWBYTECHANNEL_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/Serialization.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/Serialization.h
deleted file mode 100644
index 0ea483ba2abb..000000000000
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/Serialization.h
+++ /dev/null
@@ -1,769 +0,0 @@
-//===- Serialization.h ------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_SERIALIZATION_H
-#define LLVM_EXECUTIONENGINE_ORC_SHARED_SERIALIZATION_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ExecutionEngine/Orc/Shared/OrcError.h"
-#include "llvm/Support/thread.h"
-#include <map>
-#include <mutex>
-#include <set>
-#include <sstream>
-#include <string>
-#include <vector>
-
-namespace llvm {
-namespace orc {
-namespace shared {
-
-template <typename T> class SerializationTypeName;
-
-/// SerializationTypeNameSequence is a utility for rendering sequences of
-/// types to a string by rendering each type, separated by ", ".
-template <typename... ArgTs> class SerializationTypeNameSequence {};
-
-/// Render an empty SerializationTypeNameSequence to an ostream.
-template <typename OStream>
-OStream &operator<<(OStream &OS, const SerializationTypeNameSequence<> &V) {
- return OS;
-}
-
-/// Render a SerializationTypeNameSequence of a single type to an ostream.
-template <typename OStream, typename ArgT>
-OStream &operator<<(OStream &OS, const SerializationTypeNameSequence<ArgT> &V) {
- OS << SerializationTypeName<ArgT>::getName();
- return OS;
-}
-
-/// Render a SerializationTypeNameSequence of more than one type to an ostream.
-template <typename OStream, typename ArgT1, typename ArgT2, typename... ArgTs>
-OStream &
-operator<<(OStream &OS,
- const SerializationTypeNameSequence<ArgT1, ArgT2, ArgTs...> &V) {
- OS << SerializationTypeName<ArgT1>::getName() << ", "
- << SerializationTypeNameSequence<ArgT2, ArgTs...>();
- return OS;
-}
-
-template <> class SerializationTypeName<void> {
-public:
- static const char *getName() { return "void"; }
-};
-
-template <> class SerializationTypeName<int8_t> {
-public:
- static const char *getName() { return "int8_t"; }
-};
-
-template <> class SerializationTypeName<uint8_t> {
-public:
- static const char *getName() { return "uint8_t"; }
-};
-
-template <> class SerializationTypeName<int16_t> {
-public:
- static const char *getName() { return "int16_t"; }
-};
-
-template <> class SerializationTypeName<uint16_t> {
-public:
- static const char *getName() { return "uint16_t"; }
-};
-
-template <> class SerializationTypeName<int32_t> {
-public:
- static const char *getName() { return "int32_t"; }
-};
-
-template <> class SerializationTypeName<uint32_t> {
-public:
- static const char *getName() { return "uint32_t"; }
-};
-
-template <> class SerializationTypeName<int64_t> {
-public:
- static const char *getName() { return "int64_t"; }
-};
-
-template <> class SerializationTypeName<uint64_t> {
-public:
- static const char *getName() { return "uint64_t"; }
-};
-
-template <> class SerializationTypeName<bool> {
-public:
- static const char *getName() { return "bool"; }
-};
-
-template <> class SerializationTypeName<std::string> {
-public:
- static const char *getName() { return "std::string"; }
-};
-
-template <> class SerializationTypeName<Error> {
-public:
- static const char *getName() { return "Error"; }
-};
-
-template <typename T> class SerializationTypeName<Expected<T>> {
-public:
- static const char *getName() {
- static std::string Name = [] {
- std::string Name;
- raw_string_ostream(Name)
- << "Expected<" << SerializationTypeNameSequence<T>() << ">";
- return Name;
- }();
- return Name.data();
- }
-};
-
-template <typename T1, typename T2>
-class SerializationTypeName<std::pair<T1, T2>> {
-public:
- static const char *getName() {
- static std::string Name = [] {
- std::string Name;
- raw_string_ostream(Name)
- << "std::pair<" << SerializationTypeNameSequence<T1, T2>() << ">";
- return Name;
- }();
- return Name.data();
- }
-};
-
-template <typename... ArgTs> class SerializationTypeName<std::tuple<ArgTs...>> {
-public:
- static const char *getName() {
- static std::string Name = [] {
- std::string Name;
- raw_string_ostream(Name)
- << "std::tuple<" << SerializationTypeNameSequence<ArgTs...>() << ">";
- return Name;
- }();
- return Name.data();
- }
-};
-
-template <typename T> class SerializationTypeName<Optional<T>> {
-public:
- static const char *getName() {
- static std::string Name = [] {
- std::string Name;
- raw_string_ostream(Name)
- << "Optional<" << SerializationTypeName<T>::getName() << ">";
- return Name;
- }();
- return Name.data();
- }
-};
-
-template <typename T> class SerializationTypeName<std::vector<T>> {
-public:
- static const char *getName() {
- static std::string Name = [] {
- std::string Name;
- raw_string_ostream(Name)
- << "std::vector<" << SerializationTypeName<T>::getName() << ">";
- return Name;
- }();
- return Name.data();
- }
-};
-
-template <typename T> class SerializationTypeName<std::set<T>> {
-public:
- static const char *getName() {
- static std::string Name = [] {
- std::string Name;
- raw_string_ostream(Name)
- << "std::set<" << SerializationTypeName<T>::getName() << ">";
- return Name;
- }();
- return Name.data();
- }
-};
-
-template <typename K, typename V> class SerializationTypeName<std::map<K, V>> {
-public:
- static const char *getName() {
- static std::string Name = [] {
- std::string Name;
- raw_string_ostream(Name)
- << "std::map<" << SerializationTypeNameSequence<K, V>() << ">";
- return Name;
- }();
- return Name.data();
- }
-};
-
-/// The SerializationTraits<ChannelT, T> class describes how to serialize and
-/// deserialize an instance of type T to/from an abstract channel of type
-/// ChannelT. It also provides a representation of the type's name via the
-/// getName method.
-///
-/// Specializations of this class should provide the following functions:
-///
-/// @code{.cpp}
-///
-/// static const char* getName();
-/// static Error serialize(ChannelT&, const T&);
-/// static Error deserialize(ChannelT&, T&);
-///
-/// @endcode
-///
-/// The third argument of SerializationTraits is intended to support SFINAE.
-/// E.g.:
-///
-/// @code{.cpp}
-///
-/// class MyVirtualChannel { ... };
-///
-/// template <typename DerivedChannelT>
-/// class SerializationTraits<DerivedChannelT, bool,
-/// std::enable_if_t<
-/// std::is_base_of<MyVirtualChannel, DerivedChannelT>::value
-/// >> {
-/// public:
-/// static const char* getName() { ... };
-/// }
-///
-/// @endcode
-template <typename ChannelT, typename WireType,
- typename ConcreteType = WireType, typename = void>
-class SerializationTraits;
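// A sketch of a user-supplied specialization for a hypothetical "Point"
// type, following the contract described above. It assumes ChannelT can
// already serialize int32_t (e.g. any RawByteChannel-derived channel) and
// leans on serializeSeq/deserializeSeq, declared later in this header; in
// real code the specialization would live in the user's own headers.
struct Point {
  int32_t X = 0;
  int32_t Y = 0;
};

template <typename ChannelT>
class SerializationTraits<ChannelT, Point> {
public:
  static const char *getName() { return "Point"; }

  static Error serialize(ChannelT &C, const Point &P) {
    // Wire format: X then Y, via the primitive integer serializers.
    return serializeSeq(C, P.X, P.Y);
  }

  static Error deserialize(ChannelT &C, Point &P) {
    return deserializeSeq(C, P.X, P.Y);
  }
};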
-
-template <typename ChannelT> class SequenceTraits {
-public:
- static Error emitSeparator(ChannelT &C) { return Error::success(); }
- static Error consumeSeparator(ChannelT &C) { return Error::success(); }
-};
-
-/// Utility class for serializing sequences of values of varying types.
-/// Specializations of this class contain 'serialize' and 'deserialize' methods
-/// for the given channel. The ArgTs... list will determine the "over-the-wire"
-/// types to be serialized. The serialize and deserialize methods take a list
-/// CArgTs... ("caller arg types") which must be the same length as ArgTs...,
-/// but may be different types from ArgTs, provided that for each CArgT there
-/// is a SerializationTraits specialization
-/// SerializationTraits<ChannelT, ArgT, CArgT> with methods that can serialize
-/// the caller argument to the over-the-wire value.
-template <typename ChannelT, typename... ArgTs> class SequenceSerialization;
-
-template <typename ChannelT> class SequenceSerialization<ChannelT> {
-public:
- static Error serialize(ChannelT &C) { return Error::success(); }
- static Error deserialize(ChannelT &C) { return Error::success(); }
-};
-
-template <typename ChannelT, typename ArgT>
-class SequenceSerialization<ChannelT, ArgT> {
-public:
- template <typename CArgT> static Error serialize(ChannelT &C, CArgT &&CArg) {
- return SerializationTraits<ChannelT, ArgT, std::decay_t<CArgT>>::serialize(
- C, std::forward<CArgT>(CArg));
- }
-
- template <typename CArgT> static Error deserialize(ChannelT &C, CArgT &CArg) {
- return SerializationTraits<ChannelT, ArgT, CArgT>::deserialize(C, CArg);
- }
-};
-
-template <typename ChannelT, typename ArgT, typename... ArgTs>
-class SequenceSerialization<ChannelT, ArgT, ArgTs...> {
-public:
- template <typename CArgT, typename... CArgTs>
- static Error serialize(ChannelT &C, CArgT &&CArg, CArgTs &&...CArgs) {
- if (auto Err =
- SerializationTraits<ChannelT, ArgT, std::decay_t<CArgT>>::serialize(
- C, std::forward<CArgT>(CArg)))
- return Err;
- if (auto Err = SequenceTraits<ChannelT>::emitSeparator(C))
- return Err;
- return SequenceSerialization<ChannelT, ArgTs...>::serialize(
- C, std::forward<CArgTs>(CArgs)...);
- }
-
- template <typename CArgT, typename... CArgTs>
- static Error deserialize(ChannelT &C, CArgT &CArg, CArgTs &...CArgs) {
- if (auto Err =
- SerializationTraits<ChannelT, ArgT, CArgT>::deserialize(C, CArg))
- return Err;
- if (auto Err = SequenceTraits<ChannelT>::consumeSeparator(C))
- return Err;
- return SequenceSerialization<ChannelT, ArgTs...>::deserialize(C, CArgs...);
- }
-};
-
-template <typename ChannelT, typename... ArgTs>
-Error serializeSeq(ChannelT &C, ArgTs &&...Args) {
- return SequenceSerialization<ChannelT, std::decay_t<ArgTs>...>::serialize(
- C, std::forward<ArgTs>(Args)...);
-}
-
-template <typename ChannelT, typename... ArgTs>
-Error deserializeSeq(ChannelT &C, ArgTs &...Args) {
- return SequenceSerialization<ChannelT, ArgTs...>::deserialize(C, Args...);
-}
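// A sketch of serializeSeq/deserializeSeq round-tripping a mixed sequence of
// values. ChannelT is assumed to provide the primitive SerializationTraits
// used here (any RawByteChannel-derived channel qualifies), and the read is
// assumed to see the bytes written by the preceding writes.
template <typename ChannelT> Error roundTripExample(ChannelT &C) {
  // Writes a uint32_t, a bool and a std::string, each via its
  // SerializationTraits specialization, separated per SequenceTraits.
  if (auto Err = serializeSeq(C, uint32_t(42), true, std::string("hello")))
    return Err;

  uint32_t N = 0;
  bool Flag = false;
  std::string S;
  // Reads the values back in the same order and with the same wire types.
  if (auto Err = deserializeSeq(C, N, Flag, S))
    return Err;

  if (N != 42 || !Flag || S != "hello")
    return make_error<StringError>("round-trip mismatch",
                                   inconvertibleErrorCode());
  return Error::success();
}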
-
-template <typename ChannelT> class SerializationTraits<ChannelT, Error> {
-public:
- using WrappedErrorSerializer =
- std::function<Error(ChannelT &C, const ErrorInfoBase &)>;
-
- using WrappedErrorDeserializer =
- std::function<Error(ChannelT &C, Error &Err)>;
-
- template <typename ErrorInfoT, typename SerializeFtor,
- typename DeserializeFtor>
- static void registerErrorType(std::string Name, SerializeFtor Serialize,
- DeserializeFtor Deserialize) {
- assert(!Name.empty() &&
- "The empty string is reserved for the Success value");
-
- const std::string *KeyName = nullptr;
- {
- // We're abusing the stability of std::map here: We take a reference to
- // the key of the deserializers map to save us from duplicating the string
- // in the serializer. This should be changed to use a stringpool if we
- // switch to a map type that may move keys in memory.
- std::lock_guard<std::recursive_mutex> Lock(DeserializersMutex);
- auto I = Deserializers.insert(
- Deserializers.begin(),
- std::make_pair(std::move(Name), std::move(Deserialize)));
- KeyName = &I->first;
- }
-
- {
- assert(KeyName != nullptr && "No keyname pointer");
- std::lock_guard<std::recursive_mutex> Lock(SerializersMutex);
- Serializers[ErrorInfoT::classID()] =
- [KeyName, Serialize = std::move(Serialize)](
- ChannelT &C, const ErrorInfoBase &EIB) -> Error {
- assert(EIB.dynamicClassID() == ErrorInfoT::classID() &&
- "Serializer called for wrong error type");
- if (auto Err = serializeSeq(C, *KeyName))
- return Err;
- return Serialize(C, static_cast<const ErrorInfoT &>(EIB));
- };
- }
- }
-
- static Error serialize(ChannelT &C, Error &&Err) {
- std::lock_guard<std::recursive_mutex> Lock(SerializersMutex);
-
- if (!Err)
- return serializeSeq(C, std::string());
-
- return handleErrors(std::move(Err), [&C](const ErrorInfoBase &EIB) {
- auto SI = Serializers.find(EIB.dynamicClassID());
- if (SI == Serializers.end())
- return serializeAsStringError(C, EIB);
- return (SI->second)(C, EIB);
- });
- }
-
- static Error deserialize(ChannelT &C, Error &Err) {
- std::lock_guard<std::recursive_mutex> Lock(DeserializersMutex);
-
- std::string Key;
- if (auto Err = deserializeSeq(C, Key))
- return Err;
-
- if (Key.empty()) {
- ErrorAsOutParameter EAO(&Err);
- Err = Error::success();
- return Error::success();
- }
-
- auto DI = Deserializers.find(Key);
- assert(DI != Deserializers.end() && "No deserializer for error type");
- return (DI->second)(C, Err);
- }
-
-private:
- static Error serializeAsStringError(ChannelT &C, const ErrorInfoBase &EIB) {
- std::string ErrMsg;
- {
- raw_string_ostream ErrMsgStream(ErrMsg);
- EIB.log(ErrMsgStream);
- }
- return serialize(C, make_error<StringError>(std::move(ErrMsg),
- inconvertibleErrorCode()));
- }
-
- static std::recursive_mutex SerializersMutex;
- static std::recursive_mutex DeserializersMutex;
- static std::map<const void *, WrappedErrorSerializer> Serializers;
- static std::map<std::string, WrappedErrorDeserializer> Deserializers;
-};
-
-template <typename ChannelT>
-std::recursive_mutex SerializationTraits<ChannelT, Error>::SerializersMutex;
-
-template <typename ChannelT>
-std::recursive_mutex SerializationTraits<ChannelT, Error>::DeserializersMutex;
-
-template <typename ChannelT>
-std::map<const void *,
- typename SerializationTraits<ChannelT, Error>::WrappedErrorSerializer>
- SerializationTraits<ChannelT, Error>::Serializers;
-
-template <typename ChannelT>
-std::map<std::string, typename SerializationTraits<
- ChannelT, Error>::WrappedErrorDeserializer>
- SerializationTraits<ChannelT, Error>::Deserializers;
-
-/// Registers a serializer and deserializer for the given error type on the
-/// given channel type.
-template <typename ChannelT, typename ErrorInfoT, typename SerializeFtor,
- typename DeserializeFtor>
-void registerErrorSerialization(std::string Name, SerializeFtor &&Serialize,
- DeserializeFtor &&Deserialize) {
- SerializationTraits<ChannelT, Error>::template registerErrorType<ErrorInfoT>(
- std::move(Name), std::forward<SerializeFtor>(Serialize),
- std::forward<DeserializeFtor>(Deserialize));
-}
-
-/// Registers serialization/deserialization for StringError.
-template <typename ChannelT> void registerStringError() {
- static bool AlreadyRegistered = false;
- if (!AlreadyRegistered) {
- registerErrorSerialization<ChannelT, StringError>(
- "StringError",
- [](ChannelT &C, const StringError &SE) {
- return serializeSeq(C, SE.getMessage());
- },
- [](ChannelT &C, Error &Err) -> Error {
- ErrorAsOutParameter EAO(&Err);
- std::string Msg;
- if (auto E2 = deserializeSeq(C, Msg))
- return E2;
- Err = make_error<StringError>(
- std::move(Msg),
- orcError(OrcErrorCode::UnknownErrorCodeFromRemote));
- return Error::success();
- });
- AlreadyRegistered = true;
- }
-}
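// A sketch of registering serialization for a hypothetical custom error
// type, mirroring the registerStringError pattern above. MyToolError is not
// an LLVM type; it follows the usual ErrorInfo conventions so that its
// payload (an int code) can be shipped over the wire and reconstructed on
// the other side. Assumes raw_ostream is available for log().
class MyToolError : public ErrorInfo<MyToolError> {
public:
  static char ID; // Must be defined in exactly one translation unit.

  MyToolError(int32_t Code) : Code(Code) {}
  int32_t getCode() const { return Code; }

  void log(raw_ostream &OS) const override { OS << "MyToolError " << Code; }
  std::error_code convertToErrorCode() const override {
    return inconvertibleErrorCode();
  }

private:
  int32_t Code;
};

template <typename ChannelT> void registerMyToolError() {
  registerErrorSerialization<ChannelT, MyToolError>(
      "MyToolError",
      [](ChannelT &C, const MyToolError &E) {
        return serializeSeq(C, E.getCode());
      },
      [](ChannelT &C, Error &Err) -> Error {
        ErrorAsOutParameter EAO(&Err);
        int32_t Code = 0;
        if (auto E2 = deserializeSeq(C, Code))
          return E2;
        Err = make_error<MyToolError>(Code);
        return Error::success();
      });
}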
-
-/// SerializationTraits for Expected<T1> from an Expected<T2>.
-template <typename ChannelT, typename T1, typename T2>
-class SerializationTraits<ChannelT, Expected<T1>, Expected<T2>> {
-public:
- static Error serialize(ChannelT &C, Expected<T2> &&ValOrErr) {
- if (ValOrErr) {
- if (auto Err = serializeSeq(C, true))
- return Err;
- return SerializationTraits<ChannelT, T1, T2>::serialize(C, *ValOrErr);
- }
- if (auto Err = serializeSeq(C, false))
- return Err;
- return serializeSeq(C, ValOrErr.takeError());
- }
-
- static Error deserialize(ChannelT &C, Expected<T2> &ValOrErr) {
- ExpectedAsOutParameter<T2> EAO(&ValOrErr);
- bool HasValue;
- if (auto Err = deserializeSeq(C, HasValue))
- return Err;
- if (HasValue)
- return SerializationTraits<ChannelT, T1, T2>::deserialize(C, *ValOrErr);
- Error Err = Error::success();
- if (auto E2 = deserializeSeq(C, Err))
- return E2;
- ValOrErr = std::move(Err);
- return Error::success();
- }
-};
-
-/// SerializationTraits for Expected<T1> from a T2.
-template <typename ChannelT, typename T1, typename T2>
-class SerializationTraits<ChannelT, Expected<T1>, T2> {
-public:
- static Error serialize(ChannelT &C, T2 &&Val) {
- return serializeSeq(C, Expected<T2>(std::forward<T2>(Val)));
- }
-};
-
-/// SerializationTraits for Expected<T1> from an Error.
-template <typename ChannelT, typename T>
-class SerializationTraits<ChannelT, Expected<T>, Error> {
-public:
- static Error serialize(ChannelT &C, Error &&Err) {
- return serializeSeq(C, Expected<T>(std::move(Err)));
- }
-};
-
-/// SerializationTraits default specialization for std::pair.
-template <typename ChannelT, typename T1, typename T2, typename T3, typename T4>
-class SerializationTraits<ChannelT, std::pair<T1, T2>, std::pair<T3, T4>> {
-public:
- static Error serialize(ChannelT &C, const std::pair<T3, T4> &V) {
- if (auto Err = SerializationTraits<ChannelT, T1, T3>::serialize(C, V.first))
- return Err;
- return SerializationTraits<ChannelT, T2, T4>::serialize(C, V.second);
- }
-
- static Error deserialize(ChannelT &C, std::pair<T3, T4> &V) {
- if (auto Err =
- SerializationTraits<ChannelT, T1, T3>::deserialize(C, V.first))
- return Err;
- return SerializationTraits<ChannelT, T2, T4>::deserialize(C, V.second);
- }
-};
-
-/// SerializationTraits default specialization for std::tuple.
-template <typename ChannelT, typename... ArgTs>
-class SerializationTraits<ChannelT, std::tuple<ArgTs...>> {
-public:
- /// RPC channel serialization for std::tuple.
- static Error serialize(ChannelT &C, const std::tuple<ArgTs...> &V) {
- return serializeTupleHelper(C, V, std::index_sequence_for<ArgTs...>());
- }
-
- /// RPC channel deserialization for std::tuple.
- static Error deserialize(ChannelT &C, std::tuple<ArgTs...> &V) {
- return deserializeTupleHelper(C, V, std::index_sequence_for<ArgTs...>());
- }
-
-private:
- // Serialization helper for std::tuple.
- template <size_t... Is>
- static Error serializeTupleHelper(ChannelT &C, const std::tuple<ArgTs...> &V,
- std::index_sequence<Is...> _) {
- return serializeSeq(C, std::get<Is>(V)...);
- }
-
- // Deserialization helper for std::tuple.
- template <size_t... Is>
- static Error deserializeTupleHelper(ChannelT &C, std::tuple<ArgTs...> &V,
- std::index_sequence<Is...> _) {
- return deserializeSeq(C, std::get<Is>(V)...);
- }
-};
-
-template <typename ChannelT, typename T>
-class SerializationTraits<ChannelT, Optional<T>> {
-public:
- /// Serialize an Optional<T>.
- static Error serialize(ChannelT &C, const Optional<T> &O) {
- if (auto Err = serializeSeq(C, O != None))
- return Err;
- if (O)
- if (auto Err = serializeSeq(C, *O))
- return Err;
- return Error::success();
- }
-
- /// Deserialize an Optional<T>.
- static Error deserialize(ChannelT &C, Optional<T> &O) {
- bool HasValue = false;
- if (auto Err = deserializeSeq(C, HasValue))
- return Err;
- if (HasValue) {
- O.emplace();
- if (auto Err = deserializeSeq(C, *O))
- return Err;
- }
- return Error::success();
- };
-};
-
-/// SerializationTraits default specialization for std::vector.
-template <typename ChannelT, typename T>
-class SerializationTraits<ChannelT, std::vector<T>> {
-public:
- /// Serialize a std::vector<T> from std::vector<T>.
- static Error serialize(ChannelT &C, const std::vector<T> &V) {
- if (auto Err = serializeSeq(C, static_cast<uint64_t>(V.size())))
- return Err;
-
- for (const auto &E : V)
- if (auto Err = serializeSeq(C, E))
- return Err;
-
- return Error::success();
- }
-
- /// Deserialize a std::vector<T> to a std::vector<T>.
- static Error deserialize(ChannelT &C, std::vector<T> &V) {
- assert(V.empty() &&
- "Expected default-constructed vector to deserialize into");
-
- uint64_t Count = 0;
- if (auto Err = deserializeSeq(C, Count))
- return Err;
-
- V.resize(Count);
- for (auto &E : V)
- if (auto Err = deserializeSeq(C, E))
- return Err;
-
- return Error::success();
- }
-};
-
-/// Enable vector serialization from an ArrayRef.
-template <typename ChannelT, typename T>
-class SerializationTraits<ChannelT, std::vector<T>, ArrayRef<T>> {
-public:
- static Error serialize(ChannelT &C, ArrayRef<T> V) {
- if (auto Err = serializeSeq(C, static_cast<uint64_t>(V.size())))
- return Err;
-
- for (const auto &E : V)
- if (auto Err = serializeSeq(C, E))
- return Err;
-
- return Error::success();
- }
-};
-
-template <typename ChannelT, typename T, typename T2>
-class SerializationTraits<ChannelT, std::set<T>, std::set<T2>> {
-public:
- /// Serialize a std::set<T> from std::set<T2>.
- static Error serialize(ChannelT &C, const std::set<T2> &S) {
- if (auto Err = serializeSeq(C, static_cast<uint64_t>(S.size())))
- return Err;
-
- for (const auto &E : S)
- if (auto Err = SerializationTraits<ChannelT, T, T2>::serialize(C, E))
- return Err;
-
- return Error::success();
- }
-
- /// Deserialize a std::set<T> to a std::set<T>.
- static Error deserialize(ChannelT &C, std::set<T2> &S) {
- assert(S.empty() && "Expected default-constructed set to deserialize into");
-
- uint64_t Count = 0;
- if (auto Err = deserializeSeq(C, Count))
- return Err;
-
- while (Count-- != 0) {
- T2 Val;
- if (auto Err = SerializationTraits<ChannelT, T, T2>::deserialize(C, Val))
- return Err;
-
- auto Added = S.insert(Val).second;
- if (!Added)
- return make_error<StringError>("Duplicate element in deserialized set",
- orcError(OrcErrorCode::UnknownORCError));
- }
-
- return Error::success();
- }
-};
-
-template <typename ChannelT, typename K, typename V, typename K2, typename V2>
-class SerializationTraits<ChannelT, std::map<K, V>, std::map<K2, V2>> {
-public:
- /// Serialize a std::map<K, V> from std::map<K2, V2>.
- static Error serialize(ChannelT &C, const std::map<K2, V2> &M) {
- if (auto Err = serializeSeq(C, static_cast<uint64_t>(M.size())))
- return Err;
-
- for (const auto &E : M) {
- if (auto Err =
- SerializationTraits<ChannelT, K, K2>::serialize(C, E.first))
- return Err;
- if (auto Err =
- SerializationTraits<ChannelT, V, V2>::serialize(C, E.second))
- return Err;
- }
-
- return Error::success();
- }
-
- /// Deserialize a std::map<K, V> to a std::map<K, V>.
- static Error deserialize(ChannelT &C, std::map<K2, V2> &M) {
- assert(M.empty() && "Expected default-constructed map to deserialize into");
-
- uint64_t Count = 0;
- if (auto Err = deserializeSeq(C, Count))
- return Err;
-
- while (Count-- != 0) {
- std::pair<K2, V2> Val;
- if (auto Err =
- SerializationTraits<ChannelT, K, K2>::deserialize(C, Val.first))
- return Err;
-
- if (auto Err =
- SerializationTraits<ChannelT, V, V2>::deserialize(C, Val.second))
- return Err;
-
- auto Added = M.insert(Val).second;
- if (!Added)
- return make_error<StringError>("Duplicate element in deserialized map",
- orcError(OrcErrorCode::UnknownORCError));
- }
-
- return Error::success();
- }
-};
-
-template <typename ChannelT, typename K, typename V, typename K2, typename V2>
-class SerializationTraits<ChannelT, std::map<K, V>, DenseMap<K2, V2>> {
-public:
- /// Serialize a std::map<K, V> from DenseMap<K2, V2>.
- static Error serialize(ChannelT &C, const DenseMap<K2, V2> &M) {
- if (auto Err = serializeSeq(C, static_cast<uint64_t>(M.size())))
- return Err;
-
- for (auto &E : M) {
- if (auto Err =
- SerializationTraits<ChannelT, K, K2>::serialize(C, E.first))
- return Err;
-
- if (auto Err =
- SerializationTraits<ChannelT, V, V2>::serialize(C, E.second))
- return Err;
- }
-
- return Error::success();
- }
-
- /// Deserialize to a DenseMap<K2, V2>.
- static Error deserialize(ChannelT &C, DenseMap<K2, V2> &M) {
- assert(M.empty() && "Expected default-constructed map to deserialize into");
-
- uint64_t Count = 0;
- if (auto Err = deserializeSeq(C, Count))
- return Err;
-
- while (Count-- != 0) {
- std::pair<K2, V2> Val;
- if (auto Err =
- SerializationTraits<ChannelT, K, K2>::deserialize(C, Val.first))
- return Err;
-
- if (auto Err =
- SerializationTraits<ChannelT, V, V2>::deserialize(C, Val.second))
- return Err;
-
- auto Added = M.insert(Val).second;
- if (!Added)
- return make_error<StringError>("Duplicate element in deserialized map",
- orcError(OrcErrorCode::UnknownORCError));
- }
-
- return Error::success();
- }
-};
-
-} // namespace shared
-} // end namespace orc
-} // end namespace llvm
-
-#endif // LLVM_EXECUTIONENGINE_ORC_SHARED_SERIALIZATION_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h
index 854f1098d5af..9ac13a493e9d 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h
@@ -33,10 +33,12 @@
#define LLVM_EXECUTIONENGINE_ORC_SHARED_SIMPLEPACKEDSERIALIZATION_H
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/SwapByteOrder.h"
+#include <limits>
#include <string>
#include <tuple>
#include <type_traits>
@@ -193,13 +195,6 @@ template <typename SPSElementTagT> class SPSSequence;
/// SPS tag type for strings, which are equivalent to sequences of chars.
using SPSString = SPSSequence<char>;
-/// SPS tag type for executor addresses.
-class SPSExecutorAddress {};
-
-template <>
-class SPSSerializationTraits<SPSExecutorAddress, uint64_t>
- : public SPSSerializationTraits<uint64_t, uint64_t> {};
-
/// SPS tag type for maps.
///
/// SPS maps are just sequences of (Key, Value) tuples.
@@ -289,6 +284,40 @@ public:
}
};
+/// Trivial ArrayRef<T> -> SPSSequence<SPSElementTagT> serialization.
+template <typename SPSElementTagT, typename T>
+class TrivialSPSSequenceSerialization<SPSElementTagT, ArrayRef<T>> {
+public:
+ static constexpr bool available = true;
+};
+
+/// Specialized SPSSequence<char> -> ArrayRef<char> serialization.
+///
+/// On deserialize, points directly into the input buffer.
+template <> class SPSSerializationTraits<SPSSequence<char>, ArrayRef<char>> {
+public:
+ static size_t size(const ArrayRef<char> &A) {
+ return SPSArgList<uint64_t>::size(static_cast<uint64_t>(A.size())) +
+ A.size();
+ }
+
+ static bool serialize(SPSOutputBuffer &OB, const ArrayRef<char> &A) {
+ if (!SPSArgList<uint64_t>::serialize(OB, static_cast<uint64_t>(A.size())))
+ return false;
+ return OB.write(A.data(), A.size());
+ }
+
+ static bool deserialize(SPSInputBuffer &IB, ArrayRef<char> &A) {
+ uint64_t Size;
+ if (!SPSArgList<uint64_t>::deserialize(IB, Size))
+ return false;
+ if (Size > std::numeric_limits<size_t>::max())
+ return false;
+ A = {IB.data(), static_cast<size_t>(Size)};
+ return IB.skip(Size);
+ }
+};
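+
+// Example (illustrative sketch; buffer and variable names are arbitrary):
+// round-tripping an ArrayRef<char> through SPS. Note that on deserialization
+// the resulting ArrayRef points directly into the input buffer, so the buffer
+// must outlive it.
+//
+// @code{.cpp}
+//   ArrayRef<char> In("hello", 5);
+//   std::vector<char> Blob(SPSArgList<SPSSequence<char>>::size(In));
+//   SPSOutputBuffer OB(Blob.data(), Blob.size());
+//   bool SerOK = SPSArgList<SPSSequence<char>>::serialize(OB, In);
+//
+//   SPSInputBuffer IB(Blob.data(), Blob.size());
+//   ArrayRef<char> Out;
+//   bool DeserOK = SPSArgList<SPSSequence<char>>::deserialize(IB, Out);
+// @endcode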
+
/// 'Trivial' sequence serialization: Sequence is serialized as a uint64_t size
followed by a for-each loop over the elements of the sequence to serialize
/// each of them.
@@ -330,6 +359,44 @@ public:
}
};
+/// SPSTuple serialization for std::tuple.
+template <typename... SPSTagTs, typename... Ts>
+class SPSSerializationTraits<SPSTuple<SPSTagTs...>, std::tuple<Ts...>> {
+private:
+ using TupleArgList = typename SPSTuple<SPSTagTs...>::AsArgList;
+ using ArgIndices = std::make_index_sequence<sizeof...(Ts)>;
+
+ template <std::size_t... I>
+ static size_t size(const std::tuple<Ts...> &T, std::index_sequence<I...>) {
+ return TupleArgList::size(std::get<I>(T)...);
+ }
+
+ template <std::size_t... I>
+ static bool serialize(SPSOutputBuffer &OB, const std::tuple<Ts...> &T,
+ std::index_sequence<I...>) {
+ return TupleArgList::serialize(OB, std::get<I>(T)...);
+ }
+
+ template <std::size_t... I>
+ static bool deserialize(SPSInputBuffer &IB, std::tuple<Ts...> &T,
+ std::index_sequence<I...>) {
+ return TupleArgList::deserialize(IB, std::get<I>(T)...);
+ }
+
+public:
+ static size_t size(const std::tuple<Ts...> &T) {
+ return size(T, ArgIndices{});
+ }
+
+ static bool serialize(SPSOutputBuffer &OB, const std::tuple<Ts...> &T) {
+ return serialize(OB, T, ArgIndices{});
+ }
+
+ static bool deserialize(SPSInputBuffer &IB, std::tuple<Ts...> &T) {
+ return deserialize(IB, T, ArgIndices{});
+ }
+};
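+
+// Example (illustrative sketch; the SPS tag and variable names are chosen for
+// the example only): serializing a std::tuple using the specialization above.
+//
+// @code{.cpp}
+//   using SPSSig = SPSTuple<SPSString, uint64_t>;
+//   std::tuple<std::string, uint64_t> T("foo", 42);
+//   std::vector<char> Blob(SPSArgList<SPSSig>::size(T));
+//   SPSOutputBuffer OB(Blob.data(), Blob.size());
+//   bool SerOK = SPSArgList<SPSSig>::serialize(OB, T);
+// @endcode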
+
/// SPSTuple serialization for std::pair.
template <typename SPSTagT1, typename SPSTagT2, typename T1, typename T2>
class SPSSerializationTraits<SPSTuple<SPSTagT1, SPSTagT2>, std::pair<T1, T2>> {
@@ -380,6 +447,49 @@ public:
}
};
+/// Serialization for StringMap<ValueT>s.
+template <typename SPSValueT, typename ValueT>
+class SPSSerializationTraits<SPSSequence<SPSTuple<SPSString, SPSValueT>>,
+ StringMap<ValueT>> {
+public:
+ static size_t size(const StringMap<ValueT> &M) {
+ size_t Sz = SPSArgList<uint64_t>::size(static_cast<uint64_t>(M.size()));
+ for (auto &E : M)
+ Sz += SPSArgList<SPSString, SPSValueT>::size(E.first(), E.second);
+ return Sz;
+ }
+
+ static bool serialize(SPSOutputBuffer &OB, const StringMap<ValueT> &M) {
+ if (!SPSArgList<uint64_t>::serialize(OB, static_cast<uint64_t>(M.size())))
+ return false;
+
+ for (auto &E : M)
+ if (!SPSArgList<SPSString, SPSValueT>::serialize(OB, E.first(), E.second))
+ return false;
+
+ return true;
+ }
+
+ static bool deserialize(SPSInputBuffer &IB, StringMap<ValueT> &M) {
+ uint64_t Size;
+ assert(M.empty() && "M already contains elements");
+
+ if (!SPSArgList<uint64_t>::deserialize(IB, Size))
+ return false;
+
+ while (Size--) {
+ StringRef S;
+ ValueT V;
+ if (!SPSArgList<SPSString, SPSValueT>::deserialize(IB, S, V))
+ return false;
+ if (!M.insert(std::make_pair(S, V)).second)
+ return false;
+ }
+
+ return true;
+ }
+};
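+
+// Example (illustrative sketch; uint64_t stands in for any SPS-serializable
+// value type, names are arbitrary): serializing a StringMap via the
+// specialization above.
+//
+// @code{.cpp}
+//   StringMap<uint64_t> M;
+//   M["foo"] = 1;
+//   using SPSTag = SPSSequence<SPSTuple<SPSString, uint64_t>>;
+//   std::vector<char> Blob(SPSArgList<SPSTag>::size(M));
+//   SPSOutputBuffer OB(Blob.data(), Blob.size());
+//   bool SerOK = SPSArgList<SPSTag>::serialize(OB, M);
+// @endcode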
+
/// SPS tag type for errors.
class SPSError;
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.h
new file mode 100644
index 000000000000..9e074ed1f931
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.h
@@ -0,0 +1,235 @@
+//===--- SimpleRemoteEPCUtils.h - Utils for Simple Remote EPC ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Message definitions and other utilities for SimpleRemoteEPC and
+// SimpleRemoteEPCServer.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_SIMPLEREMOTEEPCUTILS_H
+#define LLVM_EXECUTIONENGINE_ORC_SHARED_SIMPLEREMOTEEPCUTILS_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
+#include "llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h"
+#include "llvm/Support/Error.h"
+
+#include <atomic>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+namespace llvm {
+namespace orc {
+
+namespace SimpleRemoteEPCDefaultBootstrapSymbolNames {
+extern const char *ExecutorSessionObjectName;
+extern const char *DispatchFnName;
+} // end namespace SimpleRemoteEPCDefaultBootstrapSymbolNames
+
+enum class SimpleRemoteEPCOpcode : uint8_t {
+ Setup,
+ Hangup,
+ Result,
+ CallWrapper,
+ LastOpC = CallWrapper
+};
+
+struct SimpleRemoteEPCExecutorInfo {
+ std::string TargetTriple;
+ uint64_t PageSize;
+ StringMap<ExecutorAddr> BootstrapSymbols;
+};
+
+using SimpleRemoteEPCArgBytesVector = SmallVector<char, 128>;
+
+class SimpleRemoteEPCTransportClient {
+public:
+ enum HandleMessageAction { ContinueSession, EndSession };
+
+ virtual ~SimpleRemoteEPCTransportClient();
+
+ /// Handle receipt of a message.
+ ///
+ /// Returns an Error if the message cannot be handled, 'EndSession' if the
+ /// client will not accept any further messages, and 'ContinueSession'
+ /// otherwise.
+ virtual Expected<HandleMessageAction>
+ handleMessage(SimpleRemoteEPCOpcode OpC, uint64_t SeqNo, ExecutorAddr TagAddr,
+ SimpleRemoteEPCArgBytesVector ArgBytes) = 0;
+
+ /// Handle a disconnection from the underlying transport. No further messages
+ /// should be sent to handleMessage after this is called.
+ /// Err may contain an Error value indicating unexpected disconnection. This
+ /// allows clients to log such errors, but no attempt should be made at
+ /// recovery (which should be handled inside the transport class, if it is
+ /// supported at all).
+ virtual void handleDisconnect(Error Err) = 0;
+};
+
+class SimpleRemoteEPCTransport {
+public:
+ virtual ~SimpleRemoteEPCTransport();
+
+ /// Called during setup of the client to indicate that the client is ready
+ /// to receive messages.
+ ///
+ /// Transport objects should not access the client until this method is
+ /// called.
+ virtual Error start() = 0;
+
+ /// Send a SimpleRemoteEPC message.
+ ///
+ /// This function may be called concurrently. Subclasses should implement
+ /// locking if required for the underlying transport.
+ virtual Error sendMessage(SimpleRemoteEPCOpcode OpC, uint64_t SeqNo,
+ ExecutorAddr TagAddr, ArrayRef<char> ArgBytes) = 0;
+
+ /// Trigger disconnection from the transport. The implementation should
+ /// respond by calling handleDisconnect on the client once disconnection
+ /// is complete. May be called more than once and from different threads.
+ virtual void disconnect() = 0;
+};
+
+/// Uses read/write on file descriptors for transport.
+class FDSimpleRemoteEPCTransport : public SimpleRemoteEPCTransport {
+public:
+ /// Create a FDSimpleRemoteEPCTransport using the given FDs for
+ /// reading (InFD) and writing (OutFD).
+ static Expected<std::unique_ptr<FDSimpleRemoteEPCTransport>>
+ Create(SimpleRemoteEPCTransportClient &C, int InFD, int OutFD);
+
+ /// Create a FDSimpleRemoteEPCTransport using the given FD for both
+ /// reading and writing.
+ static Expected<std::unique_ptr<FDSimpleRemoteEPCTransport>>
+ Create(SimpleRemoteEPCTransportClient &C, int FD) {
+ return Create(C, FD, FD);
+ }
+
+ ~FDSimpleRemoteEPCTransport() override;
+
+ Error start() override;
+
+ Error sendMessage(SimpleRemoteEPCOpcode OpC, uint64_t SeqNo,
+ ExecutorAddr TagAddr, ArrayRef<char> ArgBytes) override;
+
+ void disconnect() override;
+
+private:
+ FDSimpleRemoteEPCTransport(SimpleRemoteEPCTransportClient &C, int InFD,
+ int OutFD)
+ : C(C), InFD(InFD), OutFD(OutFD) {}
+
+ Error readBytes(char *Dst, size_t Size, bool *IsEOF = nullptr);
+ int writeBytes(const char *Src, size_t Size);
+ void listenLoop();
+
+ std::mutex M;
+ SimpleRemoteEPCTransportClient &C;
+ std::thread ListenerThread;
+ int InFD, OutFD;
+ std::atomic<bool> Disconnected{false};
+};
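+
+// Example (illustrative sketch): creating a transport over a pair of file
+// descriptors for a client `EPC` implementing SimpleRemoteEPCTransportClient.
+// The FD values and the client object are assumptions for the example only.
+//
+// @code{.cpp}
+//   int InFD = 0, OutFD = 1; // e.g. ends of a pipe pair
+//   auto T = FDSimpleRemoteEPCTransport::Create(EPC, InFD, OutFD);
+//   if (!T)
+//     logAllUnhandledErrors(T.takeError(), errs(), "transport: ");
+// @endcode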
+
+struct RemoteSymbolLookupSetElement {
+ std::string Name;
+ bool Required;
+};
+
+using RemoteSymbolLookupSet = std::vector<RemoteSymbolLookupSetElement>;
+
+struct RemoteSymbolLookup {
+ uint64_t H;
+ RemoteSymbolLookupSet Symbols;
+};
+
+namespace shared {
+
+using SPSRemoteSymbolLookupSetElement = SPSTuple<SPSString, bool>;
+
+using SPSRemoteSymbolLookupSet = SPSSequence<SPSRemoteSymbolLookupSetElement>;
+
+using SPSRemoteSymbolLookup = SPSTuple<uint64_t, SPSRemoteSymbolLookupSet>;
+
+/// Tuple containing target triple, page size, and bootstrap symbols.
+using SPSSimpleRemoteEPCExecutorInfo =
+ SPSTuple<SPSString, uint64_t,
+ SPSSequence<SPSTuple<SPSString, SPSExecutorAddr>>>;
+
+template <>
+class SPSSerializationTraits<SPSRemoteSymbolLookupSetElement,
+ RemoteSymbolLookupSetElement> {
+public:
+ static size_t size(const RemoteSymbolLookupSetElement &V) {
+ return SPSArgList<SPSString, bool>::size(V.Name, V.Required);
+ }
+
+ static size_t serialize(SPSOutputBuffer &OB,
+ const RemoteSymbolLookupSetElement &V) {
+ return SPSArgList<SPSString, bool>::serialize(OB, V.Name, V.Required);
+ }
+
+ static size_t deserialize(SPSInputBuffer &IB,
+ RemoteSymbolLookupSetElement &V) {
+ return SPSArgList<SPSString, bool>::deserialize(IB, V.Name, V.Required);
+ }
+};
+
+template <>
+class SPSSerializationTraits<SPSRemoteSymbolLookup, RemoteSymbolLookup> {
+public:
+ static size_t size(const RemoteSymbolLookup &V) {
+ return SPSArgList<uint64_t, SPSRemoteSymbolLookupSet>::size(V.H, V.Symbols);
+ }
+
+ static size_t serialize(SPSOutputBuffer &OB, const RemoteSymbolLookup &V) {
+ return SPSArgList<uint64_t, SPSRemoteSymbolLookupSet>::serialize(OB, V.H,
+ V.Symbols);
+ }
+
+ static size_t deserialize(SPSInputBuffer &IB, RemoteSymbolLookup &V) {
+ return SPSArgList<uint64_t, SPSRemoteSymbolLookupSet>::deserialize(
+ IB, V.H, V.Symbols);
+ }
+};
+
+template <>
+class SPSSerializationTraits<SPSSimpleRemoteEPCExecutorInfo,
+ SimpleRemoteEPCExecutorInfo> {
+public:
+ static size_t size(const SimpleRemoteEPCExecutorInfo &SI) {
+ return SPSSimpleRemoteEPCExecutorInfo::AsArgList::size(
+ SI.TargetTriple, SI.PageSize, SI.BootstrapSymbols);
+ }
+
+ static bool serialize(SPSOutputBuffer &OB,
+ const SimpleRemoteEPCExecutorInfo &SI) {
+ return SPSSimpleRemoteEPCExecutorInfo::AsArgList::serialize(
+ OB, SI.TargetTriple, SI.PageSize, SI.BootstrapSymbols);
+ }
+
+ static bool deserialize(SPSInputBuffer &IB, SimpleRemoteEPCExecutorInfo &SI) {
+ return SPSSimpleRemoteEPCExecutorInfo::AsArgList::deserialize(
+ IB, SI.TargetTriple, SI.PageSize, SI.BootstrapSymbols);
+ }
+};
+
+using SPSLoadDylibSignature = SPSExpected<SPSExecutorAddr>(SPSExecutorAddr,
+ SPSString, uint64_t);
+
+using SPSLookupSymbolsSignature =
+ SPSExpected<SPSSequence<SPSSequence<SPSExecutorAddr>>>(
+ SPSExecutorAddr, SPSSequence<SPSRemoteSymbolLookup>);
+
+} // end namespace shared
+} // end namespace orc
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_SHARED_SIMPLEREMOTEEPCUTILS_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h
index a44bcd4c8064..0e8b7e7d345a 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h
@@ -17,6 +17,10 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
+#include "llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h"
+#include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h"
+#include "llvm/Support/Memory.h"
#include <vector>
@@ -24,12 +28,108 @@ namespace llvm {
namespace orc {
namespace tpctypes {
+enum WireProtectionFlags : uint8_t {
+ WPF_None = 0,
+ WPF_Read = 1U << 0,
+ WPF_Write = 1U << 1,
+ WPF_Exec = 1U << 2,
+ LLVM_MARK_AS_BITMASK_ENUM(WPF_Exec)
+};
+
+/// Convert from sys::Memory::ProtectionFlags
+inline WireProtectionFlags
+toWireProtectionFlags(sys::Memory::ProtectionFlags PF) {
+ WireProtectionFlags WPF = WPF_None;
+ if (PF & sys::Memory::MF_READ)
+ WPF |= WPF_Read;
+ if (PF & sys::Memory::MF_WRITE)
+ WPF |= WPF_Write;
+ if (PF & sys::Memory::MF_EXEC)
+ WPF |= WPF_Exec;
+ return WPF;
+}
+
+inline sys::Memory::ProtectionFlags
+fromWireProtectionFlags(WireProtectionFlags WPF) {
+ int PF = 0;
+ if (WPF & WPF_Read)
+ PF |= sys::Memory::MF_READ;
+ if (WPF & WPF_Write)
+ PF |= sys::Memory::MF_WRITE;
+ if (WPF & WPF_Exec)
+ PF |= sys::Memory::MF_EXEC;
+ return static_cast<sys::Memory::ProtectionFlags>(PF);
+}
+
+inline std::string getWireProtectionFlagsStr(WireProtectionFlags WPF) {
+ std::string Result;
+ Result += (WPF & WPF_Read) ? 'R' : '-';
+ Result += (WPF & WPF_Write) ? 'W' : '-';
+ Result += (WPF & WPF_Exec) ? 'X' : '-';
+ return Result;
+}
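+
+// Example (sketch): converting protection flags to and from the wire format
+// using the helpers above.
+//
+// @code{.cpp}
+//   auto PF = static_cast<sys::Memory::ProtectionFlags>(
+//       sys::Memory::MF_READ | sys::Memory::MF_EXEC);
+//   WireProtectionFlags WPF = toWireProtectionFlags(PF);
+//   std::string S = getWireProtectionFlagsStr(WPF); // "R-X"
+//   sys::Memory::ProtectionFlags Back = fromWireProtectionFlags(WPF);
+// @endcode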
+
+struct WrapperFunctionCall {
+ ExecutorAddr Func;
+ ExecutorAddrRange ArgData;
+
+ WrapperFunctionCall() = default;
+ WrapperFunctionCall(ExecutorAddr Func, ExecutorAddr ArgData,
+ ExecutorAddrDiff ArgSize)
+ : Func(Func), ArgData(ArgData, ArgSize) {}
+ WrapperFunctionCall(ExecutorAddr Func, ExecutorAddrRange ArgData)
+ : Func(Func), ArgData(ArgData) {}
+
+ shared::WrapperFunctionResult run() {
+ using FnTy =
+ shared::CWrapperFunctionResult(const char *ArgData, size_t ArgSize);
+ return shared::WrapperFunctionResult(
+ Func.toPtr<FnTy *>()(ArgData.Start.toPtr<const char *>(),
+ static_cast<size_t>(ArgData.size().getValue())));
+ }
+
+ /// Run call and deserialize result using SPS.
+ template <typename SPSRetT, typename RetT> Error runWithSPSRet(RetT &RetVal) {
+ auto WFR = run();
+ if (const char *ErrMsg = WFR.getOutOfBandError())
+ return make_error<StringError>(ErrMsg, inconvertibleErrorCode());
+ shared::SPSInputBuffer IB(WFR.data(), WFR.size());
+ if (!shared::SPSSerializationTraits<SPSRetT, RetT>::deserialize(IB, RetVal))
+ return make_error<StringError>("Could not deserialize result from "
+ "serialized wrapper function call",
+ inconvertibleErrorCode());
+ return Error::success();
+ }
+
+ /// Overload for SPS functions returning void.
+ Error runWithSPSRet() {
+ shared::SPSEmpty E;
+ return runWithSPSRet<shared::SPSEmpty>(E);
+ }
+};
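+
+// Example (illustrative sketch): running a wrapper function in-process and
+// deserializing an int32_t result via SPS. FnAddr, ArgAddr and ArgSize are
+// assumed to describe a valid wrapper function and its serialized arguments.
+//
+// @code{.cpp}
+//   WrapperFunctionCall WFC(FnAddr, ArgAddr, ArgSize);
+//   int32_t Result = 0;
+//   if (Error Err = WFC.runWithSPSRet<int32_t>(Result))
+//     logAllUnhandledErrors(std::move(Err), errs(), "wrapper call: ");
+// @endcode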
+
+struct AllocationActionsPair {
+ WrapperFunctionCall Finalize;
+ WrapperFunctionCall Deallocate;
+};
+
+struct SegFinalizeRequest {
+ WireProtectionFlags Prot;
+ ExecutorAddr Addr;
+ uint64_t Size;
+ ArrayRef<char> Content;
+};
+
+struct FinalizeRequest {
+ std::vector<SegFinalizeRequest> Segments;
+ std::vector<AllocationActionsPair> Actions;
+};
+
template <typename T> struct UIntWrite {
UIntWrite() = default;
- UIntWrite(JITTargetAddress Address, T Value)
- : Address(Address), Value(Value) {}
+ UIntWrite(ExecutorAddr Addr, T Value) : Addr(Addr), Value(Value) {}
- JITTargetAddress Address = 0;
+ ExecutorAddr Addr;
T Value = 0;
};
@@ -49,10 +149,10 @@ using UInt64Write = UIntWrite<uint64_t>;
/// For use with TargetProcessControl::MemoryAccess objects.
struct BufferWrite {
BufferWrite() = default;
- BufferWrite(JITTargetAddress Address, StringRef Buffer)
- : Address(Address), Buffer(Buffer) {}
+ BufferWrite(ExecutorAddr Addr, StringRef Buffer)
+ : Addr(Addr), Buffer(Buffer) {}
- JITTargetAddress Address = 0;
+ ExecutorAddr Addr;
StringRef Buffer;
};
@@ -62,6 +162,180 @@ using DylibHandle = JITTargetAddress;
using LookupResult = std::vector<JITTargetAddress>;
} // end namespace tpctypes
+
+namespace shared {
+
+class SPSMemoryProtectionFlags {};
+
+using SPSWrapperFunctionCall = SPSTuple<SPSExecutorAddr, SPSExecutorAddrRange>;
+
+using SPSSegFinalizeRequest =
+ SPSTuple<SPSMemoryProtectionFlags, SPSExecutorAddr, uint64_t,
+ SPSSequence<char>>;
+
+using SPSAllocationActionsPair =
+ SPSTuple<SPSWrapperFunctionCall, SPSWrapperFunctionCall>;
+
+using SPSFinalizeRequest = SPSTuple<SPSSequence<SPSSegFinalizeRequest>,
+ SPSSequence<SPSAllocationActionsPair>>;
+
+template <typename T>
+using SPSMemoryAccessUIntWrite = SPSTuple<SPSExecutorAddr, T>;
+
+using SPSMemoryAccessUInt8Write = SPSMemoryAccessUIntWrite<uint8_t>;
+using SPSMemoryAccessUInt16Write = SPSMemoryAccessUIntWrite<uint16_t>;
+using SPSMemoryAccessUInt32Write = SPSMemoryAccessUIntWrite<uint32_t>;
+using SPSMemoryAccessUInt64Write = SPSMemoryAccessUIntWrite<uint64_t>;
+
+using SPSMemoryAccessBufferWrite = SPSTuple<SPSExecutorAddr, SPSSequence<char>>;
+
+template <>
+class SPSSerializationTraits<SPSMemoryProtectionFlags,
+ tpctypes::WireProtectionFlags> {
+public:
+ static size_t size(const tpctypes::WireProtectionFlags &WPF) {
+ return SPSArgList<uint8_t>::size(static_cast<uint8_t>(WPF));
+ }
+
+ static bool serialize(SPSOutputBuffer &OB,
+ const tpctypes::WireProtectionFlags &WPF) {
+ return SPSArgList<uint8_t>::serialize(OB, static_cast<uint8_t>(WPF));
+ }
+
+ static bool deserialize(SPSInputBuffer &IB,
+ tpctypes::WireProtectionFlags &WPF) {
+ uint8_t Val;
+ if (!SPSArgList<uint8_t>::deserialize(IB, Val))
+ return false;
+ WPF = static_cast<tpctypes::WireProtectionFlags>(Val);
+ return true;
+ }
+};
+
+template <>
+class SPSSerializationTraits<SPSWrapperFunctionCall,
+ tpctypes::WrapperFunctionCall> {
+ using AL = SPSWrapperFunctionCall::AsArgList;
+
+public:
+ static size_t size(const tpctypes::WrapperFunctionCall &WFC) {
+ return AL::size(WFC.Func, WFC.ArgData);
+ }
+
+ static bool serialize(SPSOutputBuffer &OB,
+ const tpctypes::WrapperFunctionCall &WFC) {
+ return AL::serialize(OB, WFC.Func, WFC.ArgData);
+ }
+
+ static bool deserialize(SPSInputBuffer &IB,
+ tpctypes::WrapperFunctionCall &WFC) {
+ return AL::deserialize(IB, WFC.Func, WFC.ArgData);
+ }
+};
+
+template <>
+class SPSSerializationTraits<SPSAllocationActionsPair,
+ tpctypes::AllocationActionsPair> {
+ using AL = SPSAllocationActionsPair::AsArgList;
+
+public:
+ static size_t size(const tpctypes::AllocationActionsPair &AAP) {
+ return AL::size(AAP.Finalize, AAP.Deallocate);
+ }
+
+ static bool serialize(SPSOutputBuffer &OB,
+ const tpctypes::AllocationActionsPair &AAP) {
+ return AL::serialize(OB, AAP.Finalize, AAP.Deallocate);
+ }
+
+ static bool deserialize(SPSInputBuffer &IB,
+ tpctypes::AllocationActionsPair &AAP) {
+ return AL::deserialize(IB, AAP.Finalize, AAP.Deallocate);
+ }
+};
+
+template <>
+class SPSSerializationTraits<SPSSegFinalizeRequest,
+ tpctypes::SegFinalizeRequest> {
+ using SFRAL = SPSSegFinalizeRequest::AsArgList;
+
+public:
+ static size_t size(const tpctypes::SegFinalizeRequest &SFR) {
+ return SFRAL::size(SFR.Prot, SFR.Addr, SFR.Size, SFR.Content);
+ }
+
+ static bool serialize(SPSOutputBuffer &OB,
+ const tpctypes::SegFinalizeRequest &SFR) {
+ return SFRAL::serialize(OB, SFR.Prot, SFR.Addr, SFR.Size, SFR.Content);
+ }
+
+ static bool deserialize(SPSInputBuffer &IB,
+ tpctypes::SegFinalizeRequest &SFR) {
+ return SFRAL::deserialize(IB, SFR.Prot, SFR.Addr, SFR.Size, SFR.Content);
+ }
+};
+
+template <>
+class SPSSerializationTraits<SPSFinalizeRequest, tpctypes::FinalizeRequest> {
+ using FRAL = SPSFinalizeRequest::AsArgList;
+
+public:
+ static size_t size(const tpctypes::FinalizeRequest &FR) {
+ return FRAL::size(FR.Segments, FR.Actions);
+ }
+
+ static bool serialize(SPSOutputBuffer &OB,
+ const tpctypes::FinalizeRequest &FR) {
+ return FRAL::serialize(OB, FR.Segments, FR.Actions);
+ }
+
+ static bool deserialize(SPSInputBuffer &IB, tpctypes::FinalizeRequest &FR) {
+ return FRAL::deserialize(IB, FR.Segments, FR.Actions);
+ }
+};
+
+template <typename T>
+class SPSSerializationTraits<SPSMemoryAccessUIntWrite<T>,
+ tpctypes::UIntWrite<T>> {
+public:
+ static size_t size(const tpctypes::UIntWrite<T> &W) {
+ return SPSTuple<SPSExecutorAddr, T>::AsArgList::size(W.Addr, W.Value);
+ }
+
+ static bool serialize(SPSOutputBuffer &OB, const tpctypes::UIntWrite<T> &W) {
+ return SPSTuple<SPSExecutorAddr, T>::AsArgList::serialize(OB, W.Addr,
+ W.Value);
+ }
+
+ static bool deserialize(SPSInputBuffer &IB, tpctypes::UIntWrite<T> &W) {
+ return SPSTuple<SPSExecutorAddr, T>::AsArgList::deserialize(IB, W.Addr,
+ W.Value);
+ }
+};
+
+template <>
+class SPSSerializationTraits<SPSMemoryAccessBufferWrite,
+ tpctypes::BufferWrite> {
+public:
+ static size_t size(const tpctypes::BufferWrite &W) {
+ return SPSTuple<SPSExecutorAddr, SPSSequence<char>>::AsArgList::size(
+ W.Addr, W.Buffer);
+ }
+
+ static bool serialize(SPSOutputBuffer &OB, const tpctypes::BufferWrite &W) {
+ return SPSTuple<SPSExecutorAddr, SPSSequence<char>>::AsArgList::serialize(
+ OB, W.Addr, W.Buffer);
+ }
+
+ static bool deserialize(SPSInputBuffer &IB, tpctypes::BufferWrite &W) {
+ return SPSTuple<SPSExecutorAddr,
+ SPSSequence<char>>::AsArgList::deserialize(IB, W.Addr,
+ W.Buffer);
+ }
+};
+
+} // end namespace shared
} // end namespace orc
} // end namespace llvm
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h
index 2f14a1c76332..bf841b1f706b 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h
@@ -10,9 +10,10 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_EXECUTIONENGINE_ORC_WRAPPERFUNCTIONUTILS_H
-#define LLVM_EXECUTIONENGINE_ORC_WRAPPERFUNCTIONUTILS_H
+#ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_WRAPPERFUNCTIONUTILS_H
+#define LLVM_EXECUTIONENGINE_ORC_SHARED_WRAPPERFUNCTIONUTILS_H
+#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
#include "llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h"
#include "llvm/Support/Error.h"
@@ -22,24 +23,18 @@ namespace llvm {
namespace orc {
namespace shared {
-namespace detail {
-
-// DO NOT USE DIRECTLY.
// Must be kept in-sync with compiler-rt/lib/orc/c-api.h.
union CWrapperFunctionResultDataUnion {
char *ValuePtr;
char Value[sizeof(ValuePtr)];
};
-// DO NOT USE DIRECTLY.
// Must be kept in-sync with compiler-rt/lib/orc/c-api.h.
typedef struct {
CWrapperFunctionResultDataUnion Data;
size_t Size;
} CWrapperFunctionResult;
-} // end namespace detail
-
/// C++ wrapper function result: Same as CWrapperFunctionResult but
/// auto-releases memory.
class WrapperFunctionResult {
@@ -48,11 +43,11 @@ public:
WrapperFunctionResult() { init(R); }
/// Create a WrapperFunctionResult by taking ownership of a
- /// detail::CWrapperFunctionResult.
+ /// CWrapperFunctionResult.
///
/// Warning: This should only be used by clients writing wrapper-function
/// caller utilities (like TargetProcessControl).
- WrapperFunctionResult(detail::CWrapperFunctionResult R) : R(R) {
+ WrapperFunctionResult(CWrapperFunctionResult R) : R(R) {
// Reset R.
init(R);
}
@@ -77,18 +72,25 @@ public:
free(R.Data.ValuePtr);
}
- /// Release ownership of the contained detail::CWrapperFunctionResult.
+ /// Release ownership of the contained CWrapperFunctionResult.
/// Warning: Do not use -- this method will be removed in the future. It only
/// exists to temporarily support some code that will eventually be moved to
/// the ORC runtime.
- detail::CWrapperFunctionResult release() {
- detail::CWrapperFunctionResult Tmp;
+ CWrapperFunctionResult release() {
+ CWrapperFunctionResult Tmp;
init(Tmp);
std::swap(R, Tmp);
return Tmp;
}
/// Get a pointer to the data contained in this instance.
+ char *data() {
+ assert((R.Size != 0 || R.Data.ValuePtr == nullptr) &&
+ "Cannot get data for out-of-band error value");
+ return R.Size > sizeof(R.Data.Value) ? R.Data.ValuePtr : R.Data.Value;
+ }
+
+ /// Get a const pointer to the data contained in this instance.
const char *data() const {
assert((R.Size != 0 || R.Data.ValuePtr == nullptr) &&
"Cannot get data for out-of-band error value");
@@ -108,24 +110,19 @@ public:
/// Create a WrapperFunctionResult with the given size.
- static char *allocate(WrapperFunctionResult &WFR, size_t Size) {
+ static WrapperFunctionResult allocate(size_t Size) {
// Reset.
- WFR = WrapperFunctionResult();
+ WrapperFunctionResult WFR;
WFR.R.Size = Size;
- char *DataPtr;
- if (WFR.R.Size > sizeof(WFR.R.Data.Value)) {
- DataPtr = (char *)malloc(WFR.R.Size);
- WFR.R.Data.ValuePtr = DataPtr;
- } else
- DataPtr = WFR.R.Data.Value;
- return DataPtr;
+ if (WFR.R.Size > sizeof(WFR.R.Data.Value))
+ WFR.R.Data.ValuePtr = (char *)malloc(WFR.R.Size);
+ return WFR;
}
/// Copy from the given char range.
static WrapperFunctionResult copyFrom(const char *Source, size_t Size) {
- WrapperFunctionResult WFR;
- char *DataPtr = allocate(WFR, Size);
- memcpy(DataPtr, Source, Size);
+ auto WFR = allocate(Size);
+ memcpy(WFR.data(), Source, Size);
return WFR;
}
@@ -161,12 +158,12 @@ public:
}
private:
- static void init(detail::CWrapperFunctionResult &R) {
+ static void init(CWrapperFunctionResult &R) {
R.Data.ValuePtr = nullptr;
R.Size = 0;
}
- detail::CWrapperFunctionResult R;
+ CWrapperFunctionResult R;
};
namespace detail {
@@ -174,10 +171,8 @@ namespace detail {
template <typename SPSArgListT, typename... ArgTs>
WrapperFunctionResult
serializeViaSPSToWrapperFunctionResult(const ArgTs &...Args) {
- WrapperFunctionResult Result;
- char *DataPtr =
- WrapperFunctionResult::allocate(Result, SPSArgListT::size(Args...));
- SPSOutputBuffer OB(DataPtr, Result.size());
+ auto Result = WrapperFunctionResult::allocate(SPSArgListT::size(Args...));
+ SPSOutputBuffer OB(Result.data(), Result.size());
if (!SPSArgListT::serialize(OB, Args...))
return WrapperFunctionResult::createOutOfBandError(
"Error serializing arguments to blob in call");
@@ -315,6 +310,7 @@ private:
static void callAsync(HandlerT &&H,
SerializeAndSendResultT &&SerializeAndSendResult,
ArgTupleT Args, std::index_sequence<I...>) {
+ (void)Args; // Silence a buggy GCC warning.
return std::forward<HandlerT>(H)(std::move(SerializeAndSendResult),
std::move(std::get<I>(Args))...);
}
@@ -486,10 +482,16 @@ public:
}
auto SendSerializedResult = [SDR = std::move(SendDeserializedResult)](
- WrapperFunctionResult R) {
+ WrapperFunctionResult R) mutable {
RetT RetVal = detail::ResultDeserializer<SPSRetTagT, RetT>::makeValue();
detail::ResultDeserializer<SPSRetTagT, RetT>::makeSafe(RetVal);
+ if (auto *ErrMsg = R.getOutOfBandError()) {
+ SDR(make_error<StringError>(ErrMsg, inconvertibleErrorCode()),
+ std::move(RetVal));
+ return;
+ }
+
SPSInputBuffer IB(R.data(), R.size());
if (auto Err = detail::ResultDeserializer<SPSRetTagT, RetT>::deserialize(
RetVal, R.data(), R.size()))
@@ -547,12 +549,68 @@ public:
return WrapperFunction<SPSEmpty(SPSTagTs...)>::call(Caller, BE, Args...);
}
+ template <typename AsyncCallerFn, typename SendDeserializedResultFn,
+ typename... ArgTs>
+ static void callAsync(AsyncCallerFn &&Caller,
+ SendDeserializedResultFn &&SendDeserializedResult,
+ const ArgTs &...Args) {
+ WrapperFunction<SPSEmpty(SPSTagTs...)>::callAsync(
+ std::forward<AsyncCallerFn>(Caller),
+ [SDR = std::move(SendDeserializedResult)](Error SerializeErr,
+ SPSEmpty E) mutable {
+ SDR(std::move(SerializeErr));
+ },
+ Args...);
+ }
+
using WrapperFunction<SPSEmpty(SPSTagTs...)>::handle;
using WrapperFunction<SPSEmpty(SPSTagTs...)>::handleAsync;
};
+/// A function object that takes an ExecutorAddr as its first argument,
+/// casts that address to a ClassT*, then calls the given method on that
+/// pointer passing in the remaining function arguments. This utility
+/// removes some of the boilerplate from writing wrappers for method calls.
+///
+/// @code{.cpp}
+/// class MyClass {
+/// public:
+/// void myMethod(uint32_t, bool) { ... }
+/// };
+///
+/// // SPS Method signature -- note MyClass object address as first argument.
+/// using SPSMyMethodWrapperSignature =
+/// SPSTuple<SPSExecutorAddr, uint32_t, bool>;
+///
+/// WrapperFunctionResult
+/// myMethodCallWrapper(const char *ArgData, size_t ArgSize) {
+/// return WrapperFunction<SPSMyMethodWrapperSignature>::handle(
+/// ArgData, ArgSize, makeMethodWrapperHandler(&MyClass::myMethod));
+/// }
+/// @endcode
+///
+template <typename RetT, typename ClassT, typename... ArgTs>
+class MethodWrapperHandler {
+public:
+ using MethodT = RetT (ClassT::*)(ArgTs...);
+ MethodWrapperHandler(MethodT M) : M(M) {}
+ RetT operator()(ExecutorAddr ObjAddr, ArgTs &...Args) {
+ return (ObjAddr.toPtr<ClassT*>()->*M)(std::forward<ArgTs>(Args)...);
+ }
+
+private:
+ MethodT M;
+};
+
+/// Create a MethodWrapperHandler object from the given method pointer.
+template <typename RetT, typename ClassT, typename... ArgTs>
+MethodWrapperHandler<RetT, ClassT, ArgTs...>
+makeMethodWrapperHandler(RetT (ClassT::*Method)(ArgTs...)) {
+ return MethodWrapperHandler<RetT, ClassT, ArgTs...>(Method);
+}
+
} // end namespace shared
} // end namespace orc
} // end namespace llvm
-#endif // LLVM_EXECUTIONENGINE_ORC_WRAPPERFUNCTIONUTILS_H
+#endif // LLVM_EXECUTIONENGINE_ORC_SHARED_WRAPPERFUNCTIONUTILS_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h b/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h
new file mode 100644
index 000000000000..bd72e4535325
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h
@@ -0,0 +1,140 @@
+//===---- SimpleRemoteEPC.h - Simple remote executor control ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Simple remote executor process control.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_SIMPLEREMOTEEPC_H
+#define LLVM_EXECUTIONENGINE_ORC_SIMPLEREMOTEEPC_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/FunctionExtras.h"
+#include "llvm/ExecutionEngine/Orc/EPCGenericDylibManager.h"
+#include "llvm/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.h"
+#include "llvm/ExecutionEngine/Orc/EPCGenericMemoryAccess.h"
+#include "llvm/ExecutionEngine/Orc/ExecutorProcessControl.h"
+#include "llvm/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MSVCErrorWorkarounds.h"
+
+#include <condition_variable>
+#include <future>
+#include <mutex>
+
+namespace llvm {
+namespace orc {
+
+class SimpleRemoteEPC : public ExecutorProcessControl,
+ public SimpleRemoteEPCTransportClient {
+public:
+ /// A setup object containing callbacks to construct a memory manager and
+ /// memory access object. Both are optional. If not specified,
+ /// EPCGenericJITLinkMemoryManager and EPCGenericMemoryAccess will be used.
+ struct Setup {
+ using CreateMemoryManagerFn =
+ Expected<std::unique_ptr<jitlink::JITLinkMemoryManager>>(
+ SimpleRemoteEPC &);
+ using CreateMemoryAccessFn =
+ Expected<std::unique_ptr<MemoryAccess>>(SimpleRemoteEPC &);
+
+ unique_function<CreateMemoryManagerFn> CreateMemoryManager;
+ unique_function<CreateMemoryAccessFn> CreateMemoryAccess;
+ };
+
+ /// Create a SimpleRemoteEPC using the given transport type and args.
+ template <typename TransportT, typename... TransportTCtorArgTs>
+ static Expected<std::unique_ptr<SimpleRemoteEPC>>
+ Create(std::unique_ptr<TaskDispatcher> D, Setup S,
+ TransportTCtorArgTs &&...TransportTCtorArgs) {
+ std::unique_ptr<SimpleRemoteEPC> SREPC(
+ new SimpleRemoteEPC(std::make_shared<SymbolStringPool>(),
+ std::move(D)));
+ auto T = TransportT::Create(
+ *SREPC, std::forward<TransportTCtorArgTs>(TransportTCtorArgs)...);
+ if (!T)
+ return T.takeError();
+ SREPC->T = std::move(*T);
+ if (auto Err = SREPC->setup(std::move(S)))
+ return joinErrors(std::move(Err), SREPC->disconnect());
+ return std::move(SREPC);
+ }
+
+ SimpleRemoteEPC(const SimpleRemoteEPC &) = delete;
+ SimpleRemoteEPC &operator=(const SimpleRemoteEPC &) = delete;
+ SimpleRemoteEPC(SimpleRemoteEPC &&) = delete;
+ SimpleRemoteEPC &operator=(SimpleRemoteEPC &&) = delete;
+ ~SimpleRemoteEPC();
+
+ Expected<tpctypes::DylibHandle> loadDylib(const char *DylibPath) override;
+
+ Expected<std::vector<tpctypes::LookupResult>>
+ lookupSymbols(ArrayRef<LookupRequest> Request) override;
+
+ Expected<int32_t> runAsMain(ExecutorAddr MainFnAddr,
+ ArrayRef<std::string> Args) override;
+
+ void callWrapperAsync(ExecutorAddr WrapperFnAddr,
+ IncomingWFRHandler OnComplete,
+ ArrayRef<char> ArgBuffer) override;
+
+ Error disconnect() override;
+
+ Expected<HandleMessageAction>
+ handleMessage(SimpleRemoteEPCOpcode OpC, uint64_t SeqNo, ExecutorAddr TagAddr,
+ SimpleRemoteEPCArgBytesVector ArgBytes) override;
+
+ void handleDisconnect(Error Err) override;
+
+private:
+ SimpleRemoteEPC(std::shared_ptr<SymbolStringPool> SSP,
+ std::unique_ptr<TaskDispatcher> D)
+ : ExecutorProcessControl(std::move(SSP), std::move(D)) {}
+
+ static Expected<std::unique_ptr<jitlink::JITLinkMemoryManager>>
+ createDefaultMemoryManager(SimpleRemoteEPC &SREPC);
+ static Expected<std::unique_ptr<MemoryAccess>>
+ createDefaultMemoryAccess(SimpleRemoteEPC &SREPC);
+
+ Error sendMessage(SimpleRemoteEPCOpcode OpC, uint64_t SeqNo,
+ ExecutorAddr TagAddr, ArrayRef<char> ArgBytes);
+
+ Error handleSetup(uint64_t SeqNo, ExecutorAddr TagAddr,
+ SimpleRemoteEPCArgBytesVector ArgBytes);
+ Error setup(Setup S);
+
+ Error handleResult(uint64_t SeqNo, ExecutorAddr TagAddr,
+ SimpleRemoteEPCArgBytesVector ArgBytes);
+ void handleCallWrapper(uint64_t RemoteSeqNo, ExecutorAddr TagAddr,
+ SimpleRemoteEPCArgBytesVector ArgBytes);
+ Error handleHangup(SimpleRemoteEPCArgBytesVector ArgBytes);
+
+ uint64_t getNextSeqNo() { return NextSeqNo++; }
+ void releaseSeqNo(uint64_t SeqNo) {}
+
+ using PendingCallWrapperResultsMap =
+ DenseMap<uint64_t, IncomingWFRHandler>;
+
+ std::mutex SimpleRemoteEPCMutex;
+ std::condition_variable DisconnectCV;
+ bool Disconnected = false;
+ Error DisconnectErr = Error::success();
+
+ std::unique_ptr<SimpleRemoteEPCTransport> T;
+ std::unique_ptr<jitlink::JITLinkMemoryManager> OwnedMemMgr;
+ std::unique_ptr<MemoryAccess> OwnedMemAccess;
+
+ std::unique_ptr<EPCGenericDylibManager> DylibMgr;
+ ExecutorAddr RunAsMainAddr;
+
+ uint64_t NextSeqNo = 0;
+ PendingCallWrapperResultsMap PendingCallWrapperResults;
+};
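+
+// Example (illustrative sketch): connecting to an executor over a pair of
+// pipe file descriptors. `D` (a TaskDispatcher instance) and the FD values
+// are assumptions for the example; passing a default-constructed Setup
+// selects the default memory manager and memory access implementations.
+//
+// @code{.cpp}
+//   auto EPC = SimpleRemoteEPC::Create<FDSimpleRemoteEPCTransport>(
+//       std::move(D), SimpleRemoteEPC::Setup(), FromExecutorFD, ToExecutorFD);
+//   if (!EPC)
+//     return EPC.takeError();
+// @endcode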
+
+} // end namespace orc
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_SIMPLEREMOTEEPC_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorBootstrapService.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorBootstrapService.h
new file mode 100644
index 000000000000..32c127634b25
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorBootstrapService.h
@@ -0,0 +1,36 @@
+//===- ExecutorBootstrapService.h - Bootstrap symbol provider --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Provides a service by supplying some set of bootstrap symbols.
+//
+// FIXME: The functionality in this file should be moved to the ORC runtime.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_EXECUTORBOOTSTRAPSERVICE_H
+#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_EXECUTORBOOTSTRAPSERVICE_H
+
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
+
+namespace llvm {
+namespace orc {
+
+class ExecutorBootstrapService {
+public:
+ virtual ~ExecutorBootstrapService();
+
+ virtual void
+ addBootstrapSymbols(StringMap<ExecutorAddr> &BootstrapSymbols) = 0;
+ virtual Error shutdown() = 0;
+};
+
+} // end namespace orc
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_EXECUTORBOOTSTRAPSERVICE_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h
index 3fad98b5f178..cfb951178da6 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h
@@ -16,7 +16,7 @@
#include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h"
#include <cstdint>
-extern "C" llvm::orc::shared::detail::CWrapperFunctionResult
+extern "C" llvm::orc::shared::CWrapperFunctionResult
llvm_orc_registerJITLoaderGDBWrapper(const char *Data, uint64_t Size);
#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_JITLOADERGDB_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/OrcRPCTPCServer.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/OrcRPCTPCServer.h
deleted file mode 100644
index 96e4341fce68..000000000000
--- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/OrcRPCTPCServer.h
+++ /dev/null
@@ -1,660 +0,0 @@
-//===-- OrcRPCTPCServer.h -- OrcRPCTargetProcessControl Server --*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// OrcRPCTargetProcessControl server class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_ORCRPCTPCSERVER_H
-#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_ORCRPCTPCSERVER_H
-
-#include "llvm/ADT/BitmaskEnum.h"
-#include "llvm/ExecutionEngine/Orc/Shared/RPCUtils.h"
-#include "llvm/ExecutionEngine/Orc/Shared/RawByteChannel.h"
-#include "llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h"
-#include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h"
-#include "llvm/ExecutionEngine/Orc/TargetProcess/TargetExecutionUtils.h"
-#include "llvm/Support/DynamicLibrary.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/Host.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/Memory.h"
-#include "llvm/Support/Process.h"
-
-#include <atomic>
-
-namespace llvm {
-namespace orc {
-
-namespace orcrpctpc {
-
-enum WireProtectionFlags : uint8_t {
- WPF_None = 0,
- WPF_Read = 1U << 0,
- WPF_Write = 1U << 1,
- WPF_Exec = 1U << 2,
- LLVM_MARK_AS_BITMASK_ENUM(WPF_Exec)
-};
-
-struct ExecutorProcessInfo {
- std::string Triple;
- unsigned PageSize;
- JITTargetAddress DispatchFuncAddr;
- JITTargetAddress DispatchCtxAddr;
-};
-
-/// Convert from sys::Memory::ProtectionFlags
-inline WireProtectionFlags
-toWireProtectionFlags(sys::Memory::ProtectionFlags PF) {
- WireProtectionFlags WPF = WPF_None;
- if (PF & sys::Memory::MF_READ)
- WPF |= WPF_Read;
- if (PF & sys::Memory::MF_WRITE)
- WPF |= WPF_Write;
- if (PF & sys::Memory::MF_EXEC)
- WPF |= WPF_Exec;
- return WPF;
-}
-
-inline sys::Memory::ProtectionFlags
-fromWireProtectionFlags(WireProtectionFlags WPF) {
- int PF = 0;
- if (WPF & WPF_Read)
- PF |= sys::Memory::MF_READ;
- if (WPF & WPF_Write)
- PF |= sys::Memory::MF_WRITE;
- if (WPF & WPF_Exec)
- PF |= sys::Memory::MF_EXEC;
- return static_cast<sys::Memory::ProtectionFlags>(PF);
-}
-
-struct ReserveMemRequestElement {
- WireProtectionFlags Prot = WPF_None;
- uint64_t Size = 0;
- uint64_t Alignment = 0;
-};
-
-using ReserveMemRequest = std::vector<ReserveMemRequestElement>;
-
-struct ReserveMemResultElement {
- WireProtectionFlags Prot = WPF_None;
- JITTargetAddress Address = 0;
- uint64_t AllocatedSize = 0;
-};
-
-using ReserveMemResult = std::vector<ReserveMemResultElement>;
-
-struct ReleaseOrFinalizeMemRequestElement {
- WireProtectionFlags Prot = WPF_None;
- JITTargetAddress Address = 0;
- uint64_t Size = 0;
-};
-
-using ReleaseOrFinalizeMemRequest =
- std::vector<ReleaseOrFinalizeMemRequestElement>;
-
-} // end namespace orcrpctpc
-
-namespace shared {
-
-template <> class SerializationTypeName<WrapperFunctionResult> {
-public:
- static const char *getName() { return "WrapperFunctionResult"; }
-};
-
-template <typename ChannelT>
-class SerializationTraits<
- ChannelT, WrapperFunctionResult, WrapperFunctionResult,
- std::enable_if_t<std::is_base_of<RawByteChannel, ChannelT>::value>> {
-public:
- static Error serialize(ChannelT &C, const WrapperFunctionResult &E) {
- if (auto Err = serializeSeq(C, static_cast<uint64_t>(E.size())))
- return Err;
- if (E.size() == 0)
- return Error::success();
- return C.appendBytes(E.data(), E.size());
- }
-
- static Error deserialize(ChannelT &C, WrapperFunctionResult &E) {
- uint64_t Size;
- if (auto Err = deserializeSeq(C, Size))
- return Err;
-
- WrapperFunctionResult Tmp;
- char *Data = WrapperFunctionResult::allocate(Tmp, Size);
-
- if (auto Err = C.readBytes(Data, Size))
- return Err;
-
- E = std::move(Tmp);
-
- return Error::success();
- }
-};
-
-template <> class SerializationTypeName<tpctypes::UInt8Write> {
-public:
- static const char *getName() { return "UInt8Write"; }
-};
-
-template <> class SerializationTypeName<tpctypes::UInt16Write> {
-public:
- static const char *getName() { return "UInt16Write"; }
-};
-
-template <> class SerializationTypeName<tpctypes::UInt32Write> {
-public:
- static const char *getName() { return "UInt32Write"; }
-};
-
-template <> class SerializationTypeName<tpctypes::UInt64Write> {
-public:
- static const char *getName() { return "UInt64Write"; }
-};
-
-template <> class SerializationTypeName<tpctypes::BufferWrite> {
-public:
- static const char *getName() { return "BufferWrite"; }
-};
-
-template <> class SerializationTypeName<orcrpctpc::ReserveMemRequestElement> {
-public:
- static const char *getName() { return "ReserveMemRequestElement"; }
-};
-
-template <> class SerializationTypeName<orcrpctpc::ReserveMemResultElement> {
-public:
- static const char *getName() { return "ReserveMemResultElement"; }
-};
-
-template <>
-class SerializationTypeName<orcrpctpc::ReleaseOrFinalizeMemRequestElement> {
-public:
- static const char *getName() { return "ReleaseOrFinalizeMemRequestElement"; }
-};
-
-template <> class SerializationTypeName<orcrpctpc::ExecutorProcessInfo> {
-public:
- static const char *getName() { return "ExecutorProcessInfo"; }
-};
-
-template <typename ChannelT, typename WriteT>
-class SerializationTraits<
- ChannelT, WriteT, WriteT,
- std::enable_if_t<std::is_same<WriteT, tpctypes::UInt8Write>::value ||
- std::is_same<WriteT, tpctypes::UInt16Write>::value ||
- std::is_same<WriteT, tpctypes::UInt32Write>::value ||
- std::is_same<WriteT, tpctypes::UInt64Write>::value>> {
-public:
- static Error serialize(ChannelT &C, const WriteT &W) {
- return serializeSeq(C, W.Address, W.Value);
- }
- static Error deserialize(ChannelT &C, WriteT &W) {
- return deserializeSeq(C, W.Address, W.Value);
- }
-};
-
-template <typename ChannelT>
-class SerializationTraits<
- ChannelT, tpctypes::BufferWrite, tpctypes::BufferWrite,
- std::enable_if_t<std::is_base_of<RawByteChannel, ChannelT>::value>> {
-public:
- static Error serialize(ChannelT &C, const tpctypes::BufferWrite &W) {
- uint64_t Size = W.Buffer.size();
- if (auto Err = serializeSeq(C, W.Address, Size))
- return Err;
-
- return C.appendBytes(W.Buffer.data(), Size);
- }
- static Error deserialize(ChannelT &C, tpctypes::BufferWrite &W) {
- JITTargetAddress Address;
- uint64_t Size;
-
- if (auto Err = deserializeSeq(C, Address, Size))
- return Err;
-
- char *Buffer = jitTargetAddressToPointer<char *>(Address);
-
- if (auto Err = C.readBytes(Buffer, Size))
- return Err;
-
- W = {Address, StringRef(Buffer, Size)};
- return Error::success();
- }
-};
-
-template <typename ChannelT>
-class SerializationTraits<ChannelT, orcrpctpc::ReserveMemRequestElement> {
-public:
- static Error serialize(ChannelT &C,
- const orcrpctpc::ReserveMemRequestElement &E) {
- return serializeSeq(C, static_cast<uint8_t>(E.Prot), E.Size, E.Alignment);
- }
-
- static Error deserialize(ChannelT &C,
- orcrpctpc::ReserveMemRequestElement &E) {
- return deserializeSeq(C, *reinterpret_cast<uint8_t *>(&E.Prot), E.Size,
- E.Alignment);
- }
-};
-
-template <typename ChannelT>
-class SerializationTraits<ChannelT, orcrpctpc::ReserveMemResultElement> {
-public:
- static Error serialize(ChannelT &C,
- const orcrpctpc::ReserveMemResultElement &E) {
- return serializeSeq(C, static_cast<uint8_t>(E.Prot), E.Address,
- E.AllocatedSize);
- }
-
- static Error deserialize(ChannelT &C, orcrpctpc::ReserveMemResultElement &E) {
- return deserializeSeq(C, *reinterpret_cast<uint8_t *>(&E.Prot), E.Address,
- E.AllocatedSize);
- }
-};
-
-template <typename ChannelT>
-class SerializationTraits<ChannelT,
- orcrpctpc::ReleaseOrFinalizeMemRequestElement> {
-public:
- static Error
- serialize(ChannelT &C,
- const orcrpctpc::ReleaseOrFinalizeMemRequestElement &E) {
- return serializeSeq(C, static_cast<uint8_t>(E.Prot), E.Address, E.Size);
- }
-
- static Error deserialize(ChannelT &C,
- orcrpctpc::ReleaseOrFinalizeMemRequestElement &E) {
- return deserializeSeq(C, *reinterpret_cast<uint8_t *>(&E.Prot), E.Address,
- E.Size);
- }
-};
-
-template <typename ChannelT>
-class SerializationTraits<ChannelT, orcrpctpc::ExecutorProcessInfo> {
-public:
- static Error serialize(ChannelT &C,
- const orcrpctpc::ExecutorProcessInfo &EPI) {
- return serializeSeq(C, EPI.Triple, EPI.PageSize, EPI.DispatchFuncAddr,
- EPI.DispatchCtxAddr);
- }
-
- static Error deserialize(ChannelT &C, orcrpctpc::ExecutorProcessInfo &EPI) {
- return deserializeSeq(C, EPI.Triple, EPI.PageSize, EPI.DispatchFuncAddr,
- EPI.DispatchCtxAddr);
- }
-};
-
-} // end namespace shared
-
-namespace orcrpctpc {
-
-using RemoteSymbolLookupSet = std::vector<std::pair<std::string, bool>>;
-using RemoteLookupRequest =
- std::pair<tpctypes::DylibHandle, RemoteSymbolLookupSet>;
-
-class GetExecutorProcessInfo
- : public shared::RPCFunction<GetExecutorProcessInfo,
- orcrpctpc::ExecutorProcessInfo()> {
-public:
- static const char *getName() { return "GetJITDispatchInfo"; }
-};
-
-class ReserveMem
- : public shared::RPCFunction<ReserveMem, Expected<ReserveMemResult>(
- ReserveMemRequest)> {
-public:
- static const char *getName() { return "ReserveMem"; }
-};
-
-class FinalizeMem
- : public shared::RPCFunction<FinalizeMem,
- Error(ReleaseOrFinalizeMemRequest)> {
-public:
- static const char *getName() { return "FinalizeMem"; }
-};
-
-class ReleaseMem
- : public shared::RPCFunction<ReleaseMem,
- Error(ReleaseOrFinalizeMemRequest)> {
-public:
- static const char *getName() { return "ReleaseMem"; }
-};
-
-class WriteUInt8s
- : public shared::RPCFunction<WriteUInt8s,
- Error(std::vector<tpctypes::UInt8Write>)> {
-public:
- static const char *getName() { return "WriteUInt8s"; }
-};
-
-class WriteUInt16s
- : public shared::RPCFunction<WriteUInt16s,
- Error(std::vector<tpctypes::UInt16Write>)> {
-public:
- static const char *getName() { return "WriteUInt16s"; }
-};
-
-class WriteUInt32s
- : public shared::RPCFunction<WriteUInt32s,
- Error(std::vector<tpctypes::UInt32Write>)> {
-public:
- static const char *getName() { return "WriteUInt32s"; }
-};
-
-class WriteUInt64s
- : public shared::RPCFunction<WriteUInt64s,
- Error(std::vector<tpctypes::UInt64Write>)> {
-public:
- static const char *getName() { return "WriteUInt64s"; }
-};
-
-class WriteBuffers
- : public shared::RPCFunction<WriteBuffers,
- Error(std::vector<tpctypes::BufferWrite>)> {
-public:
- static const char *getName() { return "WriteBuffers"; }
-};
-
-class LoadDylib
- : public shared::RPCFunction<LoadDylib, Expected<tpctypes::DylibHandle>(
- std::string DylibPath)> {
-public:
- static const char *getName() { return "LoadDylib"; }
-};
-
-class LookupSymbols
- : public shared::RPCFunction<LookupSymbols,
- Expected<std::vector<tpctypes::LookupResult>>(
- std::vector<RemoteLookupRequest>)> {
-public:
- static const char *getName() { return "LookupSymbols"; }
-};
-
-class RunMain
- : public shared::RPCFunction<RunMain,
- int64_t(JITTargetAddress MainAddr,
- std::vector<std::string> Args)> {
-public:
- static const char *getName() { return "RunMain"; }
-};
-
-class RunWrapper
- : public shared::RPCFunction<RunWrapper,
- shared::WrapperFunctionResult(
- JITTargetAddress, std::vector<uint8_t>)> {
-public:
- static const char *getName() { return "RunWrapper"; }
-};
-
-class CloseConnection : public shared::RPCFunction<CloseConnection, void()> {
-public:
- static const char *getName() { return "CloseConnection"; }
-};
-
-} // end namespace orcrpctpc
-
-/// TargetProcessControl for a process connected via an ORC RPC Endpoint.
-template <typename RPCEndpointT> class OrcRPCTPCServer {
-private:
- using ThisT = OrcRPCTPCServer<RPCEndpointT>;
-
-public:
- /// Create an OrcRPCTPCServer from the given endpoint.
- OrcRPCTPCServer(RPCEndpointT &EP) : EP(EP) {
-
- TripleStr = sys::getProcessTriple();
- PageSize = sys::Process::getPageSizeEstimate();
-
- EP.template addHandler<orcrpctpc::GetExecutorProcessInfo>(
- *this, &ThisT::getExecutorProcessInfo);
- EP.template addHandler<orcrpctpc::ReserveMem>(*this, &ThisT::reserveMemory);
- EP.template addHandler<orcrpctpc::FinalizeMem>(*this,
- &ThisT::finalizeMemory);
- EP.template addHandler<orcrpctpc::ReleaseMem>(*this, &ThisT::releaseMemory);
-
- EP.template addHandler<orcrpctpc::WriteUInt8s>(
- handleWriteUInt<tpctypes::UInt8Write>);
- EP.template addHandler<orcrpctpc::WriteUInt16s>(
- handleWriteUInt<tpctypes::UInt16Write>);
- EP.template addHandler<orcrpctpc::WriteUInt32s>(
- handleWriteUInt<tpctypes::UInt32Write>);
- EP.template addHandler<orcrpctpc::WriteUInt64s>(
- handleWriteUInt<tpctypes::UInt64Write>);
- EP.template addHandler<orcrpctpc::WriteBuffers>(handleWriteBuffer);
-
- EP.template addHandler<orcrpctpc::LoadDylib>(*this, &ThisT::loadDylib);
- EP.template addHandler<orcrpctpc::LookupSymbols>(*this,
- &ThisT::lookupSymbols);
-
- EP.template addHandler<orcrpctpc::RunMain>(*this, &ThisT::runMain);
- EP.template addHandler<orcrpctpc::RunWrapper>(*this, &ThisT::runWrapper);
-
- EP.template addHandler<orcrpctpc::CloseConnection>(*this,
- &ThisT::closeConnection);
- }
-
- /// Set the ProgramName to be used as the first argv element when running
- /// functions via runAsMain.
- void setProgramName(Optional<std::string> ProgramName = None) {
- this->ProgramName = std::move(ProgramName);
- }
-
- /// Get the RPC endpoint for this server.
- RPCEndpointT &getEndpoint() { return EP; }
-
- /// Run the server loop.
- Error run() {
- while (!Finished) {
- if (auto Err = EP.handleOne())
- return Err;
- }
- return Error::success();
- }
-
- Expected<shared::WrapperFunctionResult>
- runWrapperInJIT(JITTargetAddress FunctionId, ArrayRef<char> ArgBuffer) {
- return EP.template callB<orcrpctpc::RunWrapper>(
- FunctionId,
- ArrayRef<uint8_t>(reinterpret_cast<const uint8_t *>(ArgBuffer.data()),
- ArgBuffer.size()));
- }
-
-private:
- static shared::detail::CWrapperFunctionResult
- jitDispatchViaOrcRPCTPCServer(void *Ctx, const void *FnTag, const char *Data,
- size_t Size) {
- assert(Ctx && "Attempt to dispatch with null context ptr");
- auto R = static_cast<ThisT *>(Ctx)->runWrapperInJIT(
- pointerToJITTargetAddress(FnTag), {Data, Size});
- if (!R) {
- auto ErrMsg = toString(R.takeError());
- return shared::WrapperFunctionResult::createOutOfBandError(ErrMsg.data())
- .release();
- }
- return R->release();
- }
-
- orcrpctpc::ExecutorProcessInfo getExecutorProcessInfo() {
- return {TripleStr, static_cast<uint32_t>(PageSize),
- pointerToJITTargetAddress(jitDispatchViaOrcRPCTPCServer),
- pointerToJITTargetAddress(this)};
- }
-
- template <typename WriteT>
- static void handleWriteUInt(const std::vector<WriteT> &Ws) {
- using ValueT = decltype(std::declval<WriteT>().Value);
- for (auto &W : Ws)
- *jitTargetAddressToPointer<ValueT *>(W.Address) = W.Value;
- }
-
- std::string getProtStr(orcrpctpc::WireProtectionFlags WPF) {
- std::string Result;
- Result += (WPF & orcrpctpc::WPF_Read) ? 'R' : '-';
- Result += (WPF & orcrpctpc::WPF_Write) ? 'W' : '-';
- Result += (WPF & orcrpctpc::WPF_Exec) ? 'X' : '-';
- return Result;
- }
-
- static void handleWriteBuffer(const std::vector<tpctypes::BufferWrite> &Ws) {
- for (auto &W : Ws) {
- memcpy(jitTargetAddressToPointer<char *>(W.Address), W.Buffer.data(),
- W.Buffer.size());
- }
- }
-
- Expected<orcrpctpc::ReserveMemResult>
- reserveMemory(const orcrpctpc::ReserveMemRequest &Request) {
- orcrpctpc::ReserveMemResult Allocs;
- auto PF = sys::Memory::MF_READ | sys::Memory::MF_WRITE;
-
- uint64_t TotalSize = 0;
-
- for (const auto &E : Request) {
- uint64_t Size = alignTo(E.Size, PageSize);
- uint16_t Align = E.Alignment;
-
- if ((Align > PageSize) || (PageSize % Align))
- return make_error<StringError>(
- "Page alignmen does not satisfy requested alignment",
- inconvertibleErrorCode());
-
- TotalSize += Size;
- }
-
- // Allocate memory slab.
- std::error_code EC;
- auto MB = sys::Memory::allocateMappedMemory(TotalSize, nullptr, PF, EC);
- if (EC)
- return make_error<StringError>("Unable to allocate memory: " +
- EC.message(),
- inconvertibleErrorCode());
-
- // Zero-fill the whole thing.
- memset(MB.base(), 0, MB.allocatedSize());
-
- // Carve up sections to return.
- uint64_t SectionBase = 0;
- for (const auto &E : Request) {
- uint64_t SectionSize = alignTo(E.Size, PageSize);
- Allocs.push_back({E.Prot,
- pointerToJITTargetAddress(MB.base()) + SectionBase,
- SectionSize});
- SectionBase += SectionSize;
- }
-
- return Allocs;
- }
-
- Error finalizeMemory(const orcrpctpc::ReleaseOrFinalizeMemRequest &FMR) {
- for (const auto &E : FMR) {
- sys::MemoryBlock MB(jitTargetAddressToPointer<void *>(E.Address), E.Size);
-
- auto PF = orcrpctpc::fromWireProtectionFlags(E.Prot);
- if (auto EC =
- sys::Memory::protectMappedMemory(MB, static_cast<unsigned>(PF)))
- return make_error<StringError>("error protecting memory: " +
- EC.message(),
- inconvertibleErrorCode());
- }
- return Error::success();
- }
-
- Error releaseMemory(const orcrpctpc::ReleaseOrFinalizeMemRequest &RMR) {
- for (const auto &E : RMR) {
- sys::MemoryBlock MB(jitTargetAddressToPointer<void *>(E.Address), E.Size);
-
- if (auto EC = sys::Memory::releaseMappedMemory(MB))
- return make_error<StringError>("error release memory: " + EC.message(),
- inconvertibleErrorCode());
- }
- return Error::success();
- }
-
- Expected<tpctypes::DylibHandle> loadDylib(const std::string &Path) {
- std::string ErrMsg;
- const char *DLPath = !Path.empty() ? Path.c_str() : nullptr;
- auto DL = sys::DynamicLibrary::getPermanentLibrary(DLPath, &ErrMsg);
- if (!DL.isValid())
- return make_error<StringError>(std::move(ErrMsg),
- inconvertibleErrorCode());
-
- tpctypes::DylibHandle H = Dylibs.size();
- Dylibs[H] = std::move(DL);
- return H;
- }
-
- Expected<std::vector<tpctypes::LookupResult>>
- lookupSymbols(const std::vector<orcrpctpc::RemoteLookupRequest> &Request) {
- std::vector<tpctypes::LookupResult> Result;
-
- for (const auto &E : Request) {
- auto I = Dylibs.find(E.first);
- if (I == Dylibs.end())
- return make_error<StringError>("Unrecognized handle",
- inconvertibleErrorCode());
- auto &DL = I->second;
- Result.push_back({});
-
- for (const auto &KV : E.second) {
- auto &SymString = KV.first;
- bool WeakReference = KV.second;
-
- const char *Sym = SymString.c_str();
-#ifdef __APPLE__
- if (*Sym == '_')
- ++Sym;
-#endif
-
- void *Addr = DL.getAddressOfSymbol(Sym);
- if (!Addr && !WeakReference)
- return make_error<StringError>(Twine("Missing definition for ") + Sym,
- inconvertibleErrorCode());
-
- Result.back().push_back(pointerToJITTargetAddress(Addr));
- }
- }
-
- return Result;
- }
-
- int64_t runMain(JITTargetAddress MainFnAddr,
- const std::vector<std::string> &Args) {
- Optional<StringRef> ProgramNameOverride;
- if (ProgramName)
- ProgramNameOverride = *ProgramName;
-
- return runAsMain(
- jitTargetAddressToFunction<int (*)(int, char *[])>(MainFnAddr), Args,
- ProgramNameOverride);
- }
-
- shared::WrapperFunctionResult
- runWrapper(JITTargetAddress WrapperFnAddr,
- const std::vector<uint8_t> &ArgBuffer) {
- using WrapperFnTy = shared::detail::CWrapperFunctionResult (*)(
- const char *Data, uint64_t Size);
- auto *WrapperFn = jitTargetAddressToFunction<WrapperFnTy>(WrapperFnAddr);
- return WrapperFn(reinterpret_cast<const char *>(ArgBuffer.data()),
- ArgBuffer.size());
- }
-
- void closeConnection() { Finished = true; }
-
- std::string TripleStr;
- uint64_t PageSize = 0;
- Optional<std::string> ProgramName;
- RPCEndpointT &EP;
- std::atomic<bool> Finished{false};
- DenseMap<tpctypes::DylibHandle, sys::DynamicLibrary> Dylibs;
-};
-
-} // end namespace orc
-} // end namespace llvm
-
-#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_ORCRPCTPCSERVER_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h
index 3b4aabb90371..735aa53e41fd 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h
@@ -33,10 +33,26 @@ Error deregisterEHFrameSection(const void *EHFrameSectionAddr,
} // end namespace orc
} // end namespace llvm
-extern "C" llvm::orc::shared::detail::CWrapperFunctionResult
+/// An eh-frame registration utility suitable for use as a support function
+/// call. This function expects the direct address and size of the eh-frame
+/// section to register as its arguments (it does not treat its arguments as
+/// pointers to an SPS-serialized arg buffer).
+extern "C" llvm::orc::shared::CWrapperFunctionResult
+llvm_orc_registerEHFrameSectionCustomDirectWrapper(
+ const char *EHFrameSectionAddr, uint64_t Size);
+
+/// An eh-frame deregistration utility suitable for use as a support function
+/// call. This function expects the direct address and size of the eh-frame
+/// section to deregister as its arguments (it does not treat its arguments as
+/// pointers to an SPS-serialized arg buffer).
+extern "C" llvm::orc::shared::CWrapperFunctionResult
+llvm_orc_deregisterEHFrameSectionCustomDirectWrapper(
+ const char *EHFrameSectionAddr, uint64_t Size);
+
+extern "C" llvm::orc::shared::CWrapperFunctionResult
llvm_orc_registerEHFrameSectionWrapper(const char *Data, uint64_t Size);
-extern "C" llvm::orc::shared::detail::CWrapperFunctionResult
+extern "C" llvm::orc::shared::CWrapperFunctionResult
llvm_orc_deregisterEHFrameSectionWrapper(const char *Data, uint64_t Size);
#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_REGISTEREHFRAMES_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.h
new file mode 100644
index 000000000000..cbab234f8a2d
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.h
@@ -0,0 +1,64 @@
+//===--------------- SimpleExecutorDylibManager.h ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A simple dynamic library management class. Allows dynamic libraries to be
+// loaded and searched.
+//
+// FIXME: The functionality in this file should be moved to the ORC runtime.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_SIMPLEEXECUTORDYLIBMANAGER_H
+#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_SIMPLEEXECUTORDYLIBMANAGER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
+#include "llvm/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.h"
+#include "llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h"
+#include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/ExecutorBootstrapService.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/Error.h"
+
+#include <mutex>
+
+namespace llvm {
+namespace orc {
+namespace rt_bootstrap {
+
+/// Simple page-based allocator.
+class SimpleExecutorDylibManager : public ExecutorBootstrapService {
+public:
+ virtual ~SimpleExecutorDylibManager();
+
+ Expected<tpctypes::DylibHandle> open(const std::string &Path, uint64_t Mode);
+ Expected<std::vector<ExecutorAddr>> lookup(tpctypes::DylibHandle H,
+ const RemoteSymbolLookupSet &L);
+
+ Error shutdown() override;
+ void addBootstrapSymbols(StringMap<ExecutorAddr> &M) override;
+
+private:
+ using DylibsMap = DenseMap<uint64_t, sys::DynamicLibrary>;
+
+ static llvm::orc::shared::CWrapperFunctionResult
+ openWrapper(const char *ArgData, size_t ArgSize);
+
+ static llvm::orc::shared::CWrapperFunctionResult
+ lookupWrapper(const char *ArgData, size_t ArgSize);
+
+ std::mutex M;
+ uint64_t NextId = 0;
+ DylibsMap Dylibs;
+};
+
+} // end namespace rt_bootstrap
+} // end namespace orc
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_SIMPLEEXECUTORDYLIBMANAGER_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h
new file mode 100644
index 000000000000..6858f6d4db6e
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h
@@ -0,0 +1,70 @@
+//===---------------- SimpleExecutorMemoryManager.h -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A simple allocator class suitable for basic remote-JIT use.
+//
+// FIXME: The functionality in this file should be moved to the ORC runtime.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_SIMPLEEXECUTORMEMORYMANAGER_H
+#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_SIMPLEEXECUTORMEMORYMANAGER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
+#include "llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h"
+#include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/ExecutorBootstrapService.h"
+#include "llvm/Support/Error.h"
+
+#include <mutex>
+
+namespace llvm {
+namespace orc {
+namespace rt_bootstrap {
+
+/// Simple page-based allocator.
+class SimpleExecutorMemoryManager : public ExecutorBootstrapService {
+public:
+ virtual ~SimpleExecutorMemoryManager();
+
+ Expected<ExecutorAddr> allocate(uint64_t Size);
+ Error finalize(tpctypes::FinalizeRequest &FR);
+ Error deallocate(const std::vector<ExecutorAddr> &Bases);
+
+ Error shutdown() override;
+ void addBootstrapSymbols(StringMap<ExecutorAddr> &M) override;
+
+private:
+ struct Allocation {
+ size_t Size = 0;
+ std::vector<tpctypes::WrapperFunctionCall> DeallocationActions;
+ };
+
+ using AllocationsMap = DenseMap<void *, Allocation>;
+
+ Error deallocateImpl(void *Base, Allocation &A);
+
+ static llvm::orc::shared::CWrapperFunctionResult
+ reserveWrapper(const char *ArgData, size_t ArgSize);
+
+ static llvm::orc::shared::CWrapperFunctionResult
+ finalizeWrapper(const char *ArgData, size_t ArgSize);
+
+ static llvm::orc::shared::CWrapperFunctionResult
+ deallocateWrapper(const char *ArgData, size_t ArgSize);
+
+ std::mutex M;
+ AllocationsMap Allocations;
+};
+
+} // end namespace rt_bootstrap
+} // end namespace orc
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_SIMPLEEXECUTORMEMORYMANAGER_H
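
The executor-side lifecycle this class supports is: allocate a block, finalize it (copying content and applying protections via a tpctypes::FinalizeRequest), and later deallocate it by base address, with shutdown() presumably cleaning up anything still live. A minimal sketch of that flow against the declarations above, with the finalize step elided (illustrative only, not code from this patch):

#include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h"

using namespace llvm;
using namespace llvm::orc;

// Allocate one page in the executor process, then release it again.
static Error allocateAndRelease() {
  rt_bootstrap::SimpleExecutorMemoryManager MemMgr;

  Expected<ExecutorAddr> Base = MemMgr.allocate(4096);
  if (!Base)
    return Base.takeError();

  // A tpctypes::FinalizeRequest would normally be applied here to copy in
  // content and set final memory protections; this sketch goes straight to
  // deallocation.
  if (Error Err = MemMgr.deallocate({*Base}))
    return Err;

  // shutdown() is expected to release any allocations that are still live.
  return MemMgr.shutdown();
}
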
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.h
new file mode 100644
index 000000000000..afd3d39dbb53
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.h
@@ -0,0 +1,182 @@
+//===---- SimpleRemoteEPCServer.h - EPC over abstract channel ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// EPC over simple abstract channel.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_SIMPLEREMOTEEPCSERVER_H
+#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_SIMPLEREMOTEEPCSERVER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/FunctionExtras.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.h"
+#include "llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h"
+#include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/ExecutorBootstrapService.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/Error.h"
+
+#include <condition_variable>
+#include <future>
+#include <memory>
+#include <mutex>
+
+namespace llvm {
+namespace orc {
+
+/// A simple EPC server implementation.
+class SimpleRemoteEPCServer : public SimpleRemoteEPCTransportClient {
+public:
+ using ReportErrorFunction = unique_function<void(Error)>;
+
+ /// Dispatches calls to runWrapper.
+ class Dispatcher {
+ public:
+ virtual ~Dispatcher();
+ virtual void dispatch(unique_function<void()> Work) = 0;
+ virtual void shutdown() = 0;
+ };
+
+#if LLVM_ENABLE_THREADS
+ class ThreadDispatcher : public Dispatcher {
+ public:
+ void dispatch(unique_function<void()> Work) override;
+ void shutdown() override;
+
+ private:
+ std::mutex DispatchMutex;
+ bool Running = true;
+ size_t Outstanding = 0;
+ std::condition_variable OutstandingCV;
+ };
+#endif
+
+ class Setup {
+ friend class SimpleRemoteEPCServer;
+
+ public:
+ SimpleRemoteEPCServer &server() { return S; }
+ StringMap<ExecutorAddr> &bootstrapSymbols() { return BootstrapSymbols; }
+ std::vector<std::unique_ptr<ExecutorBootstrapService>> &services() {
+ return Services;
+ }
+ void setDispatcher(std::unique_ptr<Dispatcher> D) { S.D = std::move(D); }
+ void setErrorReporter(unique_function<void(Error)> ReportError) {
+ S.ReportError = std::move(ReportError);
+ }
+
+ private:
+ Setup(SimpleRemoteEPCServer &S) : S(S) {}
+ SimpleRemoteEPCServer &S;
+ StringMap<ExecutorAddr> BootstrapSymbols;
+ std::vector<std::unique_ptr<ExecutorBootstrapService>> Services;
+ };
+
+ static StringMap<ExecutorAddr> defaultBootstrapSymbols();
+
+ template <typename TransportT, typename... TransportTCtorArgTs>
+ static Expected<std::unique_ptr<SimpleRemoteEPCServer>>
+ Create(unique_function<Error(Setup &S)> SetupFunction,
+ TransportTCtorArgTs &&...TransportTCtorArgs) {
+ auto Server = std::make_unique<SimpleRemoteEPCServer>();
+ Setup S(*Server);
+ if (auto Err = SetupFunction(S))
+ return std::move(Err);
+
+    // Set ReportError up-front so that it can be used if the construction
+ // process fails.
+ if (!Server->ReportError)
+ Server->ReportError = [](Error Err) {
+ logAllUnhandledErrors(std::move(Err), errs(), "SimpleRemoteEPCServer ");
+ };
+
+ // Attempt to create transport.
+ auto T = TransportT::Create(
+ *Server, std::forward<TransportTCtorArgTs>(TransportTCtorArgs)...);
+ if (!T)
+ return T.takeError();
+ Server->T = std::move(*T);
+ if (auto Err = Server->T->start())
+ return std::move(Err);
+
+ // If transport creation succeeds then start up services.
+ Server->Services = std::move(S.services());
+ Server->Services.push_back(
+ std::make_unique<rt_bootstrap::SimpleExecutorDylibManager>());
+ for (auto &Service : Server->Services)
+ Service->addBootstrapSymbols(S.bootstrapSymbols());
+
+ if (auto Err = Server->sendSetupMessage(std::move(S.BootstrapSymbols)))
+ return std::move(Err);
+ return std::move(Server);
+ }
+
+ /// Set an error reporter for this server.
+ void setErrorReporter(ReportErrorFunction ReportError) {
+ this->ReportError = std::move(ReportError);
+ }
+
+ /// Call to handle an incoming message.
+ ///
+  /// Returns 'Disconnect' if the message is a 'detach' message from the remote
+  /// and 'Continue' otherwise. If the server has moved to an error state,
+ /// returns an error, which should be reported and treated as a 'Disconnect'.
+ Expected<HandleMessageAction>
+ handleMessage(SimpleRemoteEPCOpcode OpC, uint64_t SeqNo, ExecutorAddr TagAddr,
+ SimpleRemoteEPCArgBytesVector ArgBytes) override;
+
+ Error waitForDisconnect();
+
+ void handleDisconnect(Error Err) override;
+
+private:
+ Error sendMessage(SimpleRemoteEPCOpcode OpC, uint64_t SeqNo,
+ ExecutorAddr TagAddr, ArrayRef<char> ArgBytes);
+
+ Error sendSetupMessage(StringMap<ExecutorAddr> BootstrapSymbols);
+
+ Error handleResult(uint64_t SeqNo, ExecutorAddr TagAddr,
+ SimpleRemoteEPCArgBytesVector ArgBytes);
+ void handleCallWrapper(uint64_t RemoteSeqNo, ExecutorAddr TagAddr,
+ SimpleRemoteEPCArgBytesVector ArgBytes);
+
+ shared::WrapperFunctionResult
+ doJITDispatch(const void *FnTag, const char *ArgData, size_t ArgSize);
+
+ static shared::CWrapperFunctionResult jitDispatchEntry(void *DispatchCtx,
+ const void *FnTag,
+ const char *ArgData,
+ size_t ArgSize);
+
+ uint64_t getNextSeqNo() { return NextSeqNo++; }
+ void releaseSeqNo(uint64_t) {}
+
+ using PendingJITDispatchResultsMap =
+ DenseMap<uint64_t, std::promise<shared::WrapperFunctionResult> *>;
+
+ std::mutex ServerStateMutex;
+ std::condition_variable ShutdownCV;
+ enum { ServerRunning, ServerShuttingDown, ServerShutDown } RunState;
+ Error ShutdownErr = Error::success();
+ std::unique_ptr<SimpleRemoteEPCTransport> T;
+ std::unique_ptr<Dispatcher> D;
+ std::vector<std::unique_ptr<ExecutorBootstrapService>> Services;
+ ReportErrorFunction ReportError;
+
+ uint64_t NextSeqNo = 0;
+ PendingJITDispatchResultsMap PendingJITDispatchResults;
+ std::vector<sys::DynamicLibrary> Dylibs;
+};
+
+} // end namespace orc
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_SIMPLEREMOTEEPCSERVER_H
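
To make the intended wiring concrete, here is a sketch of executor-side startup (not part of the patch). TransportT is a hypothetical placeholder: per the Create() body above it only needs a static Create(SimpleRemoteEPCTransportClient &, ...) factory returning an Expected pointer convertible to std::unique_ptr<SimpleRemoteEPCTransport>; the file-descriptor arguments are purely illustrative.

#include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.h"

using namespace llvm;
using namespace llvm::orc;

// TransportT is a hypothetical concrete SimpleRemoteEPCTransport (e.g. one
// built over a pipe); its Create factory must accept the trailing arguments
// forwarded by SimpleRemoteEPCServer::Create.
template <typename TransportT>
Error runExecutorEndpoint(int InFD, int OutFD) {
  auto Server = SimpleRemoteEPCServer::Create<TransportT>(
      [](SimpleRemoteEPCServer::Setup &S) -> Error {
        // Publish the default bootstrap symbols to the controller.
        S.bootstrapSymbols() = SimpleRemoteEPCServer::defaultBootstrapSymbols();
#if LLVM_ENABLE_THREADS
        // Handle incoming wrapper-function calls on separate threads.
        S.setDispatcher(
            std::make_unique<SimpleRemoteEPCServer::ThreadDispatcher>());
#endif
        return Error::success();
      },
      InFD, OutFD);
  if (!Server)
    return Server.takeError();

  // Serve requests until the controller detaches or an error occurs.
  return (*Server)->waitForDisconnect();
}
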
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TaskDispatch.h b/llvm/include/llvm/ExecutionEngine/Orc/TaskDispatch.h
new file mode 100644
index 000000000000..c57264e59655
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TaskDispatch.h
@@ -0,0 +1,131 @@
+//===--------- TaskDispatch.h - ORC task dispatch utils ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Task and TaskDispatch classes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_TASKDISPATCH_H
+#define LLVM_EXECUTIONENGINE_ORC_TASKDISPATCH_H
+
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ExtensibleRTTI.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <cassert>
+#include <string>
+
+#if LLVM_ENABLE_THREADS
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+#endif
+
+namespace llvm {
+namespace orc {
+
+/// Represents an abstract task for ORC to run.
+class Task : public RTTIExtends<Task, RTTIRoot> {
+public:
+ static char ID;
+
+ virtual ~Task() {}
+
+ /// Description of the task to be performed. Used for logging.
+ virtual void printDescription(raw_ostream &OS) = 0;
+
+ /// Run the task.
+ virtual void run() = 0;
+
+private:
+ void anchor() override;
+};
+
+/// Base class for generic tasks.
+class GenericNamedTask : public RTTIExtends<GenericNamedTask, Task> {
+public:
+ static char ID;
+ static const char *DefaultDescription;
+};
+
+/// Generic task implementation.
+template <typename FnT> class GenericNamedTaskImpl : public GenericNamedTask {
+public:
+ GenericNamedTaskImpl(FnT &&Fn, std::string DescBuffer)
+ : Fn(std::forward<FnT>(Fn)), Desc(DescBuffer.c_str()),
+ DescBuffer(std::move(DescBuffer)) {}
+ GenericNamedTaskImpl(FnT &&Fn, const char *Desc)
+ : Fn(std::forward<FnT>(Fn)), Desc(Desc) {
+ assert(Desc && "Description cannot be null");
+ }
+ void printDescription(raw_ostream &OS) override { OS << Desc; }
+ void run() override { Fn(); }
+
+private:
+ FnT Fn;
+ const char *Desc;
+ std::string DescBuffer;
+};
+
+/// Create a generic named task from a std::string description.
+template <typename FnT>
+std::unique_ptr<GenericNamedTask> makeGenericNamedTask(FnT &&Fn,
+ std::string Desc) {
+ return std::make_unique<GenericNamedTaskImpl<FnT>>(std::forward<FnT>(Fn),
+ std::move(Desc));
+}
+
+/// Create a generic named task from a const char * description.
+template <typename FnT>
+std::unique_ptr<GenericNamedTask>
+makeGenericNamedTask(FnT &&Fn, const char *Desc = nullptr) {
+ if (!Desc)
+ Desc = GenericNamedTask::DefaultDescription;
+ return std::make_unique<GenericNamedTaskImpl<FnT>>(std::forward<FnT>(Fn),
+ Desc);
+}
+
+/// Abstract base for classes that dispatch ORC Tasks.
+class TaskDispatcher {
+public:
+ virtual ~TaskDispatcher();
+
+ /// Run the given task.
+ virtual void dispatch(std::unique_ptr<Task> T) = 0;
+
+ /// Called by ExecutionSession. Waits until all tasks have completed.
+ virtual void shutdown() = 0;
+};
+
+/// Runs all tasks on the current thread.
+class InPlaceTaskDispatcher : public TaskDispatcher {
+public:
+ void dispatch(std::unique_ptr<Task> T) override;
+ void shutdown() override;
+};
+
+#if LLVM_ENABLE_THREADS
+
+class DynamicThreadPoolTaskDispatcher : public TaskDispatcher {
+public:
+ void dispatch(std::unique_ptr<Task> T) override;
+ void shutdown() override;
+
+private:
+ std::mutex DispatchMutex;
+ bool Running = true;
+ size_t Outstanding = 0;
+ std::condition_variable OutstandingCV;
+};
+
+#endif // LLVM_ENABLE_THREADS
+
+} // End namespace orc
+} // End namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_TASKDISPATCH_H
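
A short usage sketch for these utilities (illustrative, not from the patch): wrap a callable in a GenericNamedTask and hand it to a dispatcher. InPlaceTaskDispatcher runs the task synchronously on the calling thread; the threaded dispatcher's shutdown() is expected to block until outstanding tasks drain.

#include "llvm/ExecutionEngine/Orc/TaskDispatch.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::orc;

int main() {
  InPlaceTaskDispatcher Dispatcher;

  // makeGenericNamedTask attaches a description that can be printed via
  // printDescription(), e.g. for logging which task is being dispatched.
  Dispatcher.dispatch(makeGenericNamedTask(
      [] { outs() << "running an ORC task\n"; }, "example task"));

  // Waits until all dispatched tasks have completed (trivially true here).
  Dispatcher.shutdown();
  return 0;
}
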
diff --git a/llvm/include/llvm/ExecutionEngine/OrcMCJITReplacement.h b/llvm/include/llvm/ExecutionEngine/OrcMCJITReplacement.h
deleted file mode 100644
index 6cca1933f39f..000000000000
--- a/llvm/include/llvm/ExecutionEngine/OrcMCJITReplacement.h
+++ /dev/null
@@ -1,37 +0,0 @@
-//===---- OrcMCJITReplacement.h - Orc-based MCJIT replacement ---*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file forces OrcMCJITReplacement to link in on certain operating systems.
-// (Windows).
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_EXECUTIONENGINE_ORCMCJITREPLACEMENT_H
-#define LLVM_EXECUTIONENGINE_ORCMCJITREPLACEMENT_H
-
-#include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include <cstdlib>
-
-extern "C" void LLVMLinkInOrcMCJITReplacement();
-
-namespace {
- struct ForceOrcMCJITReplacementLinking {
- ForceOrcMCJITReplacementLinking() {
- // We must reference OrcMCJITReplacement in such a way that compilers will
- // not delete it all as dead code, even with whole program optimization,
- // yet is effectively a NO-OP. As the compiler isn't smart enough to know
- // that getenv() never returns -1, this will do the job.
- if (std::getenv("bar") != (char*) -1)
- return;
-
- LLVMLinkInOrcMCJITReplacement();
- }
- } ForceOrcMCJITReplacementLinking;
-}
-
-#endif
diff --git a/llvm/include/llvm/ExecutionEngine/RuntimeDyld.h b/llvm/include/llvm/ExecutionEngine/RuntimeDyld.h
index 128c9967a596..c434b45077a3 100644
--- a/llvm/include/llvm/ExecutionEngine/RuntimeDyld.h
+++ b/llvm/include/llvm/ExecutionEngine/RuntimeDyld.h
@@ -112,6 +112,20 @@ public:
StringRef SectionName,
bool IsReadOnly) = 0;
+ /// An allocated TLS section
+ struct TLSSection {
+ /// The pointer to the initialization image
+ uint8_t *InitializationImage;
+ /// The TLS offset
+ intptr_t Offset;
+ };
+
+ /// Allocate a memory block of (at least) the given size to be used for
+ /// thread-local storage (TLS).
+ virtual TLSSection allocateTLSSection(uintptr_t Size, unsigned Alignment,
+ unsigned SectionID,
+ StringRef SectionName);
+
/// Inform the memory manager about the total amount of memory required to
/// allocate all sections to be loaded:
/// \p CodeSize - the total size of all code sections
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 3dc6194c7830..5ee379b7fcad 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -144,6 +144,26 @@ def OMPC_Schedule : Clause<"schedule"> {
];
}
+def OMP_MEMORY_ORDER_SeqCst : ClauseVal<"seq_cst", 1, 1> {}
+def OMP_MEMORY_ORDER_AcqRel : ClauseVal<"acq_rel", 2, 1> {}
+def OMP_MEMORY_ORDER_Acquire : ClauseVal<"acquire", 3, 1> {}
+def OMP_MEMORY_ORDER_Release : ClauseVal<"release", 4, 1> {}
+def OMP_MEMORY_ORDER_Relaxed : ClauseVal<"relaxed", 5, 1> {}
+def OMP_MEMORY_ORDER_Default : ClauseVal<"default", 6, 0> {
+ let isDefault = 1;
+}
+def OMPC_MemoryOrder : Clause<"memory_order"> {
+ let enumClauseValue = "MemoryOrderKind";
+ let allowedClauseValues = [
+ OMP_MEMORY_ORDER_SeqCst,
+ OMP_MEMORY_ORDER_AcqRel,
+ OMP_MEMORY_ORDER_Acquire,
+ OMP_MEMORY_ORDER_Release,
+ OMP_MEMORY_ORDER_Relaxed,
+ OMP_MEMORY_ORDER_Default
+ ];
+}
+
def OMPC_Ordered : Clause<"ordered"> {
let clangClass = "OMPOrderedClause";
let flangClass = "ScalarIntConstantExpr";
@@ -261,13 +281,17 @@ def OMPC_Allocate : Clause<"allocate"> {
}
def OMPC_NonTemporal : Clause<"nontemporal"> {
let clangClass = "OMPNontemporalClause";
+ let flangClass = "Name";
+ let isValueList = true;
}
-def OMP_ORDER_concurrent : ClauseVal<"default",2,0> { let isDefault = 1; }
+def OMP_ORDER_concurrent : ClauseVal<"concurrent",1,1> {}
+def OMP_ORDER_unknown : ClauseVal<"unknown",2,0> { let isDefault = 1; }
def OMPC_Order : Clause<"order"> {
let clangClass = "OMPOrderClause";
let enumClauseValue = "OrderKind";
let allowedClauseValues = [
+ OMP_ORDER_unknown,
OMP_ORDER_concurrent
];
}
@@ -312,6 +336,8 @@ def OMPC_Uniform : Clause<"uniform"> {
}
def OMPC_DeviceType : Clause<"device_type"> {}
def OMPC_Match : Clause<"match"> {}
+def OMPC_AdjustArgs : Clause<"adjust_args"> { }
+def OMPC_AppendArgs : Clause<"append_args"> { }
def OMPC_Depobj : Clause<"depobj"> {
let clangClass = "OMPDepobjClause";
let isImplicit = true;
@@ -337,6 +363,14 @@ def OMPC_Filter : Clause<"filter"> {
let clangClass = "OMPFilterClause";
let flangClass = "ScalarIntExpr";
}
+def OMPC_Align : Clause<"align"> {
+ let clangClass = "OMPAlignClause";
+}
+def OMPC_When : Clause<"when"> {}
+
+def OMPC_Bind : Clause<"bind"> {
+ let clangClass = "OMPBindClause";
+}
//===----------------------------------------------------------------------===//
// Definition of OpenMP directives
@@ -473,8 +507,8 @@ def OMP_TaskWait : Directive<"taskwait"> {
}
def OMP_TaskGroup : Directive<"taskgroup"> {
let allowedClauses = [
- VersionedClause<OMPC_TaskReduction>,
- VersionedClause<OMPC_Allocate>
+ VersionedClause<OMPC_TaskReduction, 50>,
+ VersionedClause<OMPC_Allocate, 50>
];
}
def OMP_Flush : Directive<"flush"> {
@@ -489,10 +523,12 @@ def OMP_Flush : Directive<"flush"> {
}
def OMP_Ordered : Directive<"ordered"> {
let allowedClauses = [
- VersionedClause<OMPC_Threads>,
- VersionedClause<OMPC_Simd>,
VersionedClause<OMPC_Depend>
];
+ let allowedOnceClauses = [
+ VersionedClause<OMPC_Threads>,
+ VersionedClause<OMPC_Simd>
+ ];
}
def OMP_Atomic : Directive<"atomic"> {
let allowedClauses = [
@@ -1506,13 +1542,18 @@ def OMP_TargetTeamsDistributeSimd :
}
def OMP_Allocate : Directive<"allocate"> {
let allowedOnceClauses = [
- VersionedClause<OMPC_Allocator>
+ VersionedClause<OMPC_Allocator>,
+ VersionedClause<OMPC_Align, 51>
];
}
def OMP_DeclareVariant : Directive<"declare variant"> {
let allowedClauses = [
VersionedClause<OMPC_Match>
];
+ let allowedExclusiveClauses = [
+ VersionedClause<OMPC_AdjustArgs, 51>,
+ VersionedClause<OMPC_AppendArgs, 51>
+ ];
}
def OMP_MasterTaskloop : Directive<"master taskloop"> {
let allowedClauses = [
@@ -1699,6 +1740,22 @@ def OMP_masked : Directive<"masked"> {
VersionedClause<OMPC_Filter>
];
}
+def OMP_loop : Directive<"loop"> {
+ let allowedClauses = [
+ VersionedClause<OMPC_LastPrivate>,
+ VersionedClause<OMPC_Private>,
+ VersionedClause<OMPC_Reduction>,
+ ];
+ let allowedOnceClauses = [
+ VersionedClause<OMPC_Bind, 50>,
+ VersionedClause<OMPC_Collapse>,
+ VersionedClause<OMPC_Order>,
+ ];
+}
+def OMP_Metadirective : Directive<"metadirective"> {
+ let allowedClauses = [VersionedClause<OMPC_When>];
+ let allowedOnceClauses = [VersionedClause<OMPC_Default>];
+}
def OMP_Unknown : Directive<"unknown"> {
let isDefault = true;
}
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
index d174cc8992dd..2fec3e7e4230 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -128,6 +128,14 @@ enum class OMPScheduleType {
LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierMask)
};
+enum OMPTgtExecModeFlags : int8_t {
+ OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
+ OMP_TGT_EXEC_MODE_SPMD = 1 << 1,
+ OMP_TGT_EXEC_MODE_GENERIC_SPMD =
+ OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD,
+ LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ OMP_TGT_EXEC_MODE_GENERIC_SPMD)
+};
+
} // end namespace omp
} // end namespace llvm
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
index 0b6aed1e9e12..89f5de229b3b 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
@@ -29,100 +29,89 @@ namespace omp {
///
/// Example usage in clang:
/// const unsigned slot_size =
-/// ctx.GetTargetInfo().getGridValue(llvm::omp::GVIDX::GV_Warp_Size);
+/// ctx.GetTargetInfo().getGridValue().GV_Warp_Size;
///
/// Example usage in libomptarget/deviceRTLs:
/// #include "llvm/Frontend/OpenMP/OMPGridValues.h"
/// #ifdef __AMDGPU__
-/// #define GRIDVAL AMDGPUGpuGridValues
+/// #define GRIDVAL AMDGPUGridValues
/// #else
-/// #define GRIDVAL NVPTXGpuGridValues
+/// #define GRIDVAL NVPTXGridValues
/// #endif
/// ... Then use this reference for GV_Warp_Size in the deviceRTL source.
-/// llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size]
+/// llvm::omp::GRIDVAL().GV_Warp_Size
///
/// Example usage in libomptarget hsa plugin:
/// #include "llvm/Frontend/OpenMP/OMPGridValues.h"
-/// #define GRIDVAL AMDGPUGpuGridValues
+/// #define GRIDVAL AMDGPUGridValues
/// ... Then use this reference to access GV_Warp_Size in the hsa plugin.
-/// llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size]
+/// llvm::omp::GRIDVAL().GV_Warp_Size
///
/// Example usage in libomptarget cuda plugin:
/// #include "llvm/Frontend/OpenMP/OMPGridValues.h"
-/// #define GRIDVAL NVPTXGpuGridValues
+/// #define GRIDVAL NVPTXGridValues
/// ... Then use this reference to access GV_Warp_Size in the cuda plugin.
-/// llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size]
+/// llvm::omp::GRIDVAL().GV_Warp_Size
///
-enum GVIDX {
- /// The maximum number of workers in a kernel.
- /// (THREAD_ABSOLUTE_LIMIT) - (GV_Warp_Size), might be issue for blockDim.z
- GV_Threads,
+
+struct GV {
/// The size reserved for data in a shared memory slot.
- GV_Slot_Size,
+ const unsigned GV_Slot_Size;
/// The default value of maximum number of threads in a worker warp.
- GV_Warp_Size,
- /// Alternate warp size for some AMDGCN architectures. Same as GV_Warp_Size
- /// for NVPTX.
- GV_Warp_Size_32,
- /// The number of bits required to represent the max number of threads in warp
- GV_Warp_Size_Log2,
- /// GV_Warp_Size * GV_Slot_Size,
- GV_Warp_Slot_Size,
+ const unsigned GV_Warp_Size;
+
+ constexpr unsigned warpSlotSize() const {
+ return GV_Warp_Size * GV_Slot_Size;
+ }
+
/// the maximum number of teams.
- GV_Max_Teams,
- /// Global Memory Alignment
- GV_Mem_Align,
- /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2))
- GV_Warp_Size_Log2_Mask,
+ const unsigned GV_Max_Teams;
// An alternative to the heavy data sharing infrastructure that uses global
// memory is one that uses device __shared__ memory. The amount of such space
// (in bytes) reserved by the OpenMP runtime is noted here.
- GV_SimpleBufferSize,
+ const unsigned GV_SimpleBufferSize;
// The absolute maximum team size for a working group
- GV_Max_WG_Size,
+ const unsigned GV_Max_WG_Size;
// The default maximum team size for a working group
- GV_Default_WG_Size,
- // This is GV_Max_WG_Size / GV_WarpSize. 32 for NVPTX and 16 for AMDGCN.
- GV_Max_Warp_Number,
- /// The slot size that should be reserved for a working warp.
- /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2))
- GV_Warp_Size_Log2_MaskL
+ const unsigned GV_Default_WG_Size;
+
+ constexpr unsigned maxWarpNumber() const {
+ return GV_Max_WG_Size / GV_Warp_Size;
+ }
};
/// For AMDGPU GPUs
-static constexpr unsigned AMDGPUGpuGridValues[] = {
- 448, // GV_Threads
- 256, // GV_Slot_Size
- 64, // GV_Warp_Size
- 32, // GV_Warp_Size_32
- 6, // GV_Warp_Size_Log2
- 64 * 256, // GV_Warp_Slot_Size
- 128, // GV_Max_Teams
- 256, // GV_Mem_Align
- 63, // GV_Warp_Size_Log2_Mask
- 896, // GV_SimpleBufferSize
- 1024, // GV_Max_WG_Size,
- 256, // GV_Defaut_WG_Size
- 1024 / 64, // GV_Max_WG_Size / GV_WarpSize
- 63 // GV_Warp_Size_Log2_MaskL
+static constexpr GV AMDGPUGridValues64 = {
+ 256, // GV_Slot_Size
+ 64, // GV_Warp_Size
+ 128, // GV_Max_Teams
+ 896, // GV_SimpleBufferSize
+ 1024, // GV_Max_WG_Size,
+ 256, // GV_Default_WG_Size
};
+static constexpr GV AMDGPUGridValues32 = {
+ 256, // GV_Slot_Size
+ 32, // GV_Warp_Size
+ 128, // GV_Max_Teams
+ 896, // GV_SimpleBufferSize
+ 1024, // GV_Max_WG_Size,
+ 256, // GV_Default_WG_Size
+};
+
+template <unsigned wavesize> constexpr const GV &getAMDGPUGridValues() {
+ static_assert(wavesize == 32 || wavesize == 64, "");
+ return wavesize == 32 ? AMDGPUGridValues32 : AMDGPUGridValues64;
+}
+
/// For Nvidia GPUs
-static constexpr unsigned NVPTXGpuGridValues[] = {
- 992, // GV_Threads
- 256, // GV_Slot_Size
- 32, // GV_Warp_Size
- 32, // GV_Warp_Size_32
- 5, // GV_Warp_Size_Log2
- 32 * 256, // GV_Warp_Slot_Size
- 1024, // GV_Max_Teams
- 256, // GV_Mem_Align
- (~0u >> (32 - 5)), // GV_Warp_Size_Log2_Mask
- 896, // GV_SimpleBufferSize
- 1024, // GV_Max_WG_Size
- 128, // GV_Defaut_WG_Size
- 1024 / 32, // GV_Max_WG_Size / GV_WarpSize
- 31 // GV_Warp_Size_Log2_MaskL
+static constexpr GV NVPTXGridValues = {
+ 256, // GV_Slot_Size
+ 32, // GV_Warp_Size
+ 1024, // GV_Max_Teams
+ 896, // GV_SimpleBufferSize
+ 1024, // GV_Max_WG_Size
+ 128, // GV_Default_WG_Size
};
} // namespace omp
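
Since the refactor replaces the enum-indexed arrays with a struct of named fields, derived quantities that used to be separate table entries (GV_Warp_Slot_Size, GV_Max_Warp_Number) become constexpr helpers. A few illustrative compile-time checks against the values defined above (not part of the patch):

#include "llvm/Frontend/OpenMP/OMPGridValues.h"

// Named-field access replaces the old GVIDX-based lookups.
static_assert(llvm::omp::NVPTXGridValues.GV_Warp_Size == 32, "");
static_assert(llvm::omp::getAMDGPUGridValues<64>().GV_Warp_Size == 64, "");

// Derived values are computed from the named fields instead of being stored.
static_assert(llvm::omp::NVPTXGridValues.warpSlotSize() == 32 * 256, "");
static_assert(llvm::omp::getAMDGPUGridValues<64>().maxWarpNumber() ==
                  1024 / 64, "");
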
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 8144f1527a06..563e0eed1762 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -257,18 +257,17 @@ public:
///
/// * Sign of the step and the comparison operator might disagree:
///
- /// for (int i = 0; i < 42; --i)
+ /// for (int i = 0; i < 42; i -= 1u)
///
//
/// \param Loc The insert and source location description.
/// \param BodyGenCB Callback that will generate the loop body code.
/// \param Start Value of the loop counter for the first iterations.
- /// \param Stop Loop counter values past this will stop the the
- /// iterations.
+ /// \param Stop Loop counter values past this will stop the loop.
/// \param Step Loop counter increment after each iteration; negative
- /// means counting down. \param IsSigned Whether Start, Stop
- /// and Stop are signed integers.
- /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
+ /// means counting down.
+ /// \param IsSigned Whether Start, Stop and Step are signed integers.
+ /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
/// counter.
/// \param ComputeIP Insertion point for instructions computing the trip
/// count. Can be used to ensure the trip count is available
@@ -335,7 +334,7 @@ public:
/// has a trip count of 0). This is permitted by the OpenMP specification.
///
/// \param DL Debug location for instructions added for collapsing,
- /// such as instructions to compute derive the input loop's
+ /// such as instructions to compute/derive the input loop's
/// induction variables.
/// \param Loops Loops in the loop nest to collapse. Loops are specified
/// from outermost-to-innermost and every control flow of a
@@ -358,8 +357,16 @@ public:
/// the current thread, updates the relevant instructions in the canonical
/// loop and calls to an OpenMP runtime finalization function after the loop.
///
- /// \param Loc The source location description, the insertion location
- /// is not used.
+ /// TODO: Workshare loops with static scheduling may contain up to two loops
+ /// that fulfill the requirements of an OpenMP canonical loop. One for
+ /// iterating over all iterations of a chunk and another one for iterating
+ /// over all chunks that are executed on the same thread. Returning
+ /// CanonicalLoopInfo objects representing them may eventually be useful for
+ /// the apply clause planned in OpenMP 6.0, but currently whether these are
+ /// canonical loops is irrelevant.
+ ///
+ /// \param DL Debug location for instructions added for the
+ /// workshare-loop construct itself.
/// \param CLI A descriptor of the canonical loop to workshare.
/// \param AllocaIP An insertion point for Alloca instructions usable in the
/// preheader of the loop.
@@ -368,12 +375,11 @@ public:
/// \param Chunk The size of loop chunk considered as a unit when
/// scheduling. If \p nullptr, defaults to 1.
///
- /// \returns Updated CanonicalLoopInfo.
- CanonicalLoopInfo *createStaticWorkshareLoop(const LocationDescription &Loc,
- CanonicalLoopInfo *CLI,
- InsertPointTy AllocaIP,
- bool NeedsBarrier,
- Value *Chunk = nullptr);
+ /// \returns Point where to insert code after the workshare construct.
+ InsertPointTy applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
+ InsertPointTy AllocaIP,
+ bool NeedsBarrier,
+ Value *Chunk = nullptr);
/// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
///
@@ -382,8 +388,9 @@ public:
/// turn it into a workshare loop. In particular, it calls to an OpenMP
/// runtime function in the preheader to obtain, and then in each iteration
/// to update the loop counter.
- /// \param Loc The source location description, the insertion location
- /// is not used.
+ ///
+ /// \param DL Debug location for instructions added for the
+ /// workshare-loop construct itself.
/// \param CLI A descriptor of the canonical loop to workshare.
/// \param AllocaIP An insertion point for Alloca instructions usable in the
/// preheader of the loop.
@@ -393,13 +400,12 @@ public:
/// \param Chunk The size of loop chunk considered as a unit when
/// scheduling. If \p nullptr, defaults to 1.
///
- /// \returns Point where to insert code after the loop.
- InsertPointTy createDynamicWorkshareLoop(const LocationDescription &Loc,
- CanonicalLoopInfo *CLI,
- InsertPointTy AllocaIP,
- omp::OMPScheduleType SchedType,
- bool NeedsBarrier,
- Value *Chunk = nullptr);
+ /// \returns Point where to insert code after the workshare construct.
+ InsertPointTy applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
+ InsertPointTy AllocaIP,
+ omp::OMPScheduleType SchedType,
+ bool NeedsBarrier,
+ Value *Chunk = nullptr);
/// Modifies the canonical loop to be a workshare loop.
///
@@ -410,19 +416,17 @@ public:
/// the current thread, updates the relevant instructions in the canonical
/// loop and calls to an OpenMP runtime finalization function after the loop.
///
- /// \param Loc The source location description, the insertion location
- /// is not used.
+ /// \param DL Debug location for instructions added for the
+ /// workshare-loop construct itself.
/// \param CLI A descriptor of the canonical loop to workshare.
/// \param AllocaIP An insertion point for Alloca instructions usable in the
/// preheader of the loop.
/// \param NeedsBarrier Indicates whether a barrier must be inserted after
/// the loop.
///
- /// \returns Updated CanonicalLoopInfo.
- CanonicalLoopInfo *createWorkshareLoop(const LocationDescription &Loc,
- CanonicalLoopInfo *CLI,
- InsertPointTy AllocaIP,
- bool NeedsBarrier);
+ /// \returns Point where to insert code after the workshare construct.
+ InsertPointTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
+ InsertPointTy AllocaIP, bool NeedsBarrier);
/// Tile a loop nest.
///
@@ -471,6 +475,48 @@ public:
tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
ArrayRef<Value *> TileSizes);
+ /// Fully unroll a loop.
+ ///
+ /// Instead of unrolling the loop immediately (and duplicating its body
+ /// instructions), it is deferred to LLVM's LoopUnrollPass by adding loop
+ /// metadata.
+ ///
+ /// \param DL Debug location for instructions added by unrolling.
+ /// \param Loop The loop to unroll. The loop will be invalidated.
+ void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop);
+
+ /// Fully or partially unroll a loop. How the loop is unrolled is determined
+ /// using LLVM's LoopUnrollPass.
+ ///
+ /// \param DL Debug location for instructions added by unrolling.
+ /// \param Loop The loop to unroll. The loop will be invalidated.
+ void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop);
+
+ /// Partially unroll a loop.
+ ///
+ /// The CanonicalLoopInfo of the unrolled loop for use with chained
+ /// loop-associated directive can be requested using \p UnrolledCLI. Not
+ /// needing the CanonicalLoopInfo allows more efficient code generation by
+ /// deferring the actual unrolling to the LoopUnrollPass using loop metadata.
+ /// A loop-associated directive applied to the unrolled loop needs to know the
+ /// new trip count which means that if using a heuristically determined unroll
+ /// factor (\p Factor == 0), that factor must be computed immediately. We are
+ /// using the same logic as the LoopUnrollPass to derived the unroll factor,
+ /// but which assumes that some canonicalization has taken place (e.g.
+ /// Mem2Reg, LICM, GVN, Inlining, etc.). That is, the heuristic will perform
+ /// better when the unrolled loop's CanonicalLoopInfo is not needed.
+ ///
+ /// \param DL Debug location for instructions added by unrolling.
+ /// \param Loop The loop to unroll. The loop will be invalidated.
+ /// \param Factor The factor to unroll the loop by. A factor of 0
+ /// indicates that a heuristic should be used to determine
+ /// the unroll-factor.
+ /// \param UnrolledCLI If non-null, receives the CanonicalLoopInfo of the
+ /// partially unrolled loop. Otherwise, uses loop metadata
+ /// to defer unrolling to the LoopUnrollPass.
+ void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor,
+ CanonicalLoopInfo **UnrolledCLI);
+
/// Generator for '#omp flush'
///
/// \param Loc The location where the flush directive was encountered
@@ -486,6 +532,115 @@ public:
/// \param Loc The location where the taskyield directive was encountered.
void createTaskyield(const LocationDescription &Loc);
+ /// Functions used to generate reductions. Such functions take two Values
+ /// representing LHS and RHS of the reduction, respectively, and a reference
+ /// to the value that is updated to refer to the reduction result.
+ using ReductionGenTy =
+ function_ref<InsertPointTy(InsertPointTy, Value *, Value *, Value *&)>;
+
+ /// Functions used to generate atomic reductions. Such functions take two
+ /// Values representing pointers to LHS and RHS of the reduction. They are
+ /// expected to atomically update the LHS to the reduced value.
+ using AtomicReductionGenTy =
+ function_ref<InsertPointTy(InsertPointTy, Value *, Value *)>;
+
+ /// Information about an OpenMP reduction.
+ struct ReductionInfo {
+ ReductionInfo(Value *Variable, Value *PrivateVariable,
+ ReductionGenTy ReductionGen,
+ AtomicReductionGenTy AtomicReductionGen)
+ : Variable(Variable), PrivateVariable(PrivateVariable),
+ ReductionGen(ReductionGen), AtomicReductionGen(AtomicReductionGen) {}
+
+ /// Returns the type of the element being reduced.
+ Type *getElementType() const {
+ return Variable->getType()->getPointerElementType();
+ }
+
+ /// Reduction variable of pointer type.
+ Value *Variable;
+
+ /// Thread-private partial reduction variable.
+ Value *PrivateVariable;
+
+ /// Callback for generating the reduction body. The IR produced by this will
+ /// be used to combine two values in a thread-safe context, e.g., under
+ /// lock or within the same thread, and therefore need not be atomic.
+ ReductionGenTy ReductionGen;
+
+ /// Callback for generating the atomic reduction body, may be null. The IR
+ /// produced by this will be used to atomically combine two values during
+ /// reduction. If null, the implementation will use the non-atomic version
+ /// along with the appropriate synchronization mechanisms.
+ AtomicReductionGenTy AtomicReductionGen;
+ };
+
+ // TODO: provide atomic and non-atomic reduction generators for reduction
+ // operators defined by the OpenMP specification.
+
+ /// Generator for '#omp reduction'.
+ ///
+ /// Emits the IR instructing the runtime to perform the specific kind of
+ /// reductions. Expects reduction variables to have been privatized and
+ /// initialized to reduction-neutral values separately. Emits the calls to
+ /// runtime functions as well as the reduction function and the basic blocks
+ /// performing the reduction atomically and non-atomically.
+ ///
+ /// The code emitted for the following:
+ ///
+ /// \code
+ /// type var_1;
+ /// type var_2;
+ /// #pragma omp <directive> reduction(reduction-op:var_1,var_2)
+ /// /* body */;
+ /// \endcode
+ ///
+ /// corresponds to the following sketch.
+ ///
+ /// \code
+ /// void _outlined_par() {
+ /// // N is the number of different reductions.
+ /// void *red_array[] = {privatized_var_1, privatized_var_2, ...};
+ /// switch(__kmpc_reduce(..., N, /*size of data in red array*/, red_array,
+ /// _omp_reduction_func,
+ /// _gomp_critical_user.reduction.var)) {
+ /// case 1: {
+ /// var_1 = var_1 <reduction-op> privatized_var_1;
+ /// var_2 = var_2 <reduction-op> privatized_var_2;
+ /// // ...
+ /// __kmpc_end_reduce(...);
+ /// break;
+ /// }
+ /// case 2: {
+ /// _Atomic<ReductionOp>(var_1, privatized_var_1);
+ /// _Atomic<ReductionOp>(var_2, privatized_var_2);
+ /// // ...
+ /// break;
+ /// }
+ /// default: break;
+ /// }
+ /// }
+ ///
+ /// void _omp_reduction_func(void **lhs, void **rhs) {
+ /// *(type *)lhs[0] = *(type *)lhs[0] <reduction-op> *(type *)rhs[0];
+ /// *(type *)lhs[1] = *(type *)lhs[1] <reduction-op> *(type *)rhs[1];
+ /// // ...
+ /// }
+ /// \endcode
+ ///
+ /// \param Loc The location where the reduction was
+  ///                            encountered. Must be within the associated
+ /// directive and after the last local access to the
+ /// reduction variables.
+ /// \param AllocaIP An insertion point suitable for allocas usable
+ /// in reductions.
+ /// \param ReductionInfos A list of info on each reduction variable.
+ /// \param IsNoWait A flag set if the reduction is marked as nowait.
+ InsertPointTy createReductions(const LocationDescription &Loc,
+ InsertPointTy AllocaIP,
+ ArrayRef<ReductionInfo> ReductionInfos,
+ bool IsNoWait = false);
+
///}
/// Return the insertion point used by the underlying IRBuilder.
@@ -515,6 +670,10 @@ public:
Constant *getOrCreateSrcLocStr(StringRef FunctionName, StringRef FileName,
unsigned Line, unsigned Column);
+ /// Return the (LLVM-IR) string describing the DebugLoc \p DL. Use \p F as
+ /// fallback if \p DL does not specify the function name.
+ Constant *getOrCreateSrcLocStr(DebugLoc DL, Function *F = nullptr);
+
/// Return the (LLVM-IR) string describing the source location \p Loc.
Constant *getOrCreateSrcLocStr(const LocationDescription &Loc);
@@ -524,8 +683,8 @@ public:
omp::IdentFlag Flags = omp::IdentFlag(0),
unsigned Reserve2Flags = 0);
- // Get the type corresponding to __kmpc_impl_lanemask_t from the deviceRTL
- Type *getLanemaskType();
+  /// Create a global flag \p Name in the module with initial value \p Value.
+ GlobalValue *createGlobalFlag(unsigned Value, StringRef Name);
/// Generate control flow and cleanup for cancellation.
///
@@ -651,11 +810,11 @@ public:
/// \param Loc The source location description.
/// \param MapperFunc Function to be called.
/// \param SrcLocInfo Source location information global.
- /// \param MaptypesArgs
- /// \param MapnamesArg
+  /// \param MaptypesArg The map types array argument.
+  /// \param MapnamesArg The map names array argument.
/// \param MapperAllocas The AllocaInst used for the call.
/// \param DeviceID Device ID for the call.
- /// \param TotalNbOperand Number of operand in the call.
+ /// \param NumOperands Number of operands in the call.
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc,
Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg,
struct MapperAllocas &MapperAllocas, int64_t DeviceID,
@@ -705,7 +864,7 @@ public:
/// \param BodyGenCB Callback that will generate the region code.
/// \param FiniCB Callback to finialize variable copies.
///
- /// \returns The insertion position *after* the master.
+ /// \returns The insertion position *after* the masked.
InsertPointTy createMasked(const LocationDescription &Loc,
BodyGenCallbackTy BodyGenCB,
FinalizeCallbackTy FiniCB, Value *Filter);
@@ -718,12 +877,41 @@ public:
/// \param CriticalName name of the lock used by the critical directive
/// \param HintInst Hint Instruction for hint clause associated with critical
///
- /// \returns The insertion position *after* the master.
+ /// \returns The insertion position *after* the critical.
InsertPointTy createCritical(const LocationDescription &Loc,
BodyGenCallbackTy BodyGenCB,
FinalizeCallbackTy FiniCB,
StringRef CriticalName, Value *HintInst);
+ /// Generator for '#omp ordered depend (source | sink)'
+ ///
+ /// \param Loc The insert and source location description.
+ /// \param AllocaIP The insertion point to be used for alloca instructions.
+ /// \param NumLoops The number of loops in depend clause.
+  /// \param StoreValues The values to be stored in the dependence vector.
+  /// \param Name The name of the alloca instruction.
+ /// \param IsDependSource If true, depend source; otherwise, depend sink.
+ ///
+ /// \return The insertion position *after* the ordered.
+ InsertPointTy createOrderedDepend(const LocationDescription &Loc,
+ InsertPointTy AllocaIP, unsigned NumLoops,
+ ArrayRef<llvm::Value *> StoreValues,
+ const Twine &Name, bool IsDependSource);
+
+ /// Generator for '#omp ordered [threads | simd]'
+ ///
+ /// \param Loc The insert and source location description.
+ /// \param BodyGenCB Callback that will generate the region code.
+ /// \param FiniCB Callback to finalize variable copies.
+  /// \param IsThreads If true, the construct has a threads clause or no
+  ///                   clause; otherwise, it has a simd clause.
+ ///
+ /// \returns The insertion position *after* the ordered.
+ InsertPointTy createOrderedThreadsSimd(const LocationDescription &Loc,
+ BodyGenCallbackTy BodyGenCB,
+ FinalizeCallbackTy FiniCB,
+ bool IsThreads);
+
/// Generator for '#omp sections'
///
/// \param Loc The insert and source location description.
@@ -816,14 +1004,16 @@ public:
/// \param Loc The insert and source location description.
/// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not.
/// \param RequiresFullRuntime Indicate if a full device runtime is necessary.
- InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD, bool RequiresFullRuntime);
+ InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD,
+ bool RequiresFullRuntime);
/// Create a runtime call for kmpc_target_deinit
///
/// \param Loc The insert and source location description.
/// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not.
/// \param RequiresFullRuntime Indicate if a full device runtime is necessary.
- void createTargetDeinit(const LocationDescription &Loc, bool IsSPMD, bool RequiresFullRuntime);
+ void createTargetDeinit(const LocationDescription &Loc, bool IsSPMD,
+ bool RequiresFullRuntime);
///}
@@ -1121,7 +1311,25 @@ public:
/// The control-flow structure is standardized for easy consumption by
/// directives associated with loops. For instance, the worksharing-loop
/// construct may change this control flow such that each loop iteration is
-/// executed on only one thread.
+/// executed on only one thread. The constraints of a canonical loop in brief
+/// are:
+///
+/// * The number of loop iterations must have been computed before entering the
+/// loop.
+///
+/// * Has an (unsigned) logical induction variable that starts at zero and
+/// increments by one.
+///
+/// * The loop's CFG itself has no side-effects. The OpenMP specification
+/// itself allows side-effects, but the order in which they happen, including
+/// how often or whether at all, is unspecified. We expect that the frontend
+/// will emit those side-effect instructions somewhere (e.g. before the loop)
+/// such that the CanonicalLoopInfo itself can be side-effect free.
+///
+/// Keep in mind that CanonicalLoopInfo is meant to only describe a repeated
+/// execution of a loop body that satisfies these constraints. It does NOT
+/// represent arbitrary SESE regions that happen to contain a loop. Do not use
+/// CanonicalLoopInfo for such purposes.
///
/// The control flow can be described as follows:
///
@@ -1141,73 +1349,149 @@ public:
/// |
/// After
///
-/// Code in the header, condition block, latch and exit block must not have any
-/// side-effect. The body block is the single entry point into the loop body,
-/// which may contain arbitrary control flow as long as all control paths
-/// eventually branch to the latch block.
+/// The loop is thought to start at PreheaderIP (at the Preheader's terminator,
+/// inclusive) and end at AfterIP (at the After's first instruction, exclusive).
+/// That is, instructions in the Preheader and After blocks (except the
+/// Preheader's terminator) are out of CanonicalLoopInfo's control and may have
+/// side-effects. Typically, the Preheader is used to compute the loop's trip
+/// count. The instructions from BodyIP (at the Body block's first instruction,
+/// excluding) until the Latch are also considered outside CanonicalLoopInfo's
+/// control and thus can have side-effects. The body block is the single entry
+/// point into the loop body, which may contain arbitrary control flow as long
+/// as all control paths eventually branch to the Latch block.
+///
+/// TODO: Consider adding another standardized BasicBlock between Body CFG and
+/// Latch to guarantee that there is only a single edge to the latch. It would
+/// make loop transformations easier by removing the need to consider multiple
+/// predecessors of the latch (see redirectAllPredecessorsTo) and would give us
+/// an equivalent to PreheaderIP, AfterIP and BodyIP for inserting code that
+/// executes after each body iteration.
+///
+/// There must be no loop-carried dependencies through llvm::Values. This is
+/// equivalent to the Latch having no PHINode and the Header's only PHINode
+/// being for the induction variable.
+///
+/// All code in Header, Cond, Latch and Exit (plus the terminator of the
+/// Preheader) is CanonicalLoopInfo's responsibility and its build-up is checked
+/// by assertOK(). It is expected not to be modified unless explicitly
+/// modifying the CanonicalLoopInfo through a method that applies an OpenMP
+/// loop-associated construct such as applyWorkshareLoop, tileLoops, unrollLoop,
+/// etc. These methods usually invalidate the CanonicalLoopInfo and re-use its
+/// basic blocks. After invalidation, the CanonicalLoopInfo must not be used
+/// anymore as its underlying control flow may not exist anymore.
+/// Loop-transformation methods such as tileLoops, collapseLoops and unrollLoop
+/// may also return a new CanonicalLoopInfo that can be passed to other
+/// loop-associated construct implementing methods. These loop-transforming
+/// methods may either create a new CanonicalLoopInfo usually using
+/// createLoopSkeleton and invalidate the input CanonicalLoopInfo, or reuse and
+/// modify one of the input CanonicalLoopInfo and return it as representing the
+/// modified loop. What is done is an implementation detail of
+/// transformation-implementing method and callers should always assume that the
+/// CanonicalLoopInfo passed to it is invalidated and a new object is returned.
+/// Returned CanonicalLoopInfo have the same structure and guarantees as the one
+/// created by createCanonicalLoop, such that transforming methods do not have
+/// to special case where the CanonicalLoopInfo originated from.
+///
+/// Generally, methods consuming CanonicalLoopInfo do not need an
+/// OpenMPIRBuilder::InsertPointTy as argument, but use the locations of the
+/// CanonicalLoopInfo to insert new or modify existing instructions. Unless
+/// documented otherwise, methods consuming CanonicalLoopInfo do not invalidate
+/// any InsertPoint that is outside CanonicalLoopInfo's control. Specifically,
+/// any InsertPoint in the Preheader, After or Body block can still be used
+/// after
+/// calling such a method.
///
-/// Defined outside OpenMPIRBuilder because one cannot forward-declare nested
-/// classes.
+/// TODO: Provide mechanisms for exception handling and cancellation points.
+///
+/// Defined outside OpenMPIRBuilder because nested classes cannot be
+/// forward-declared, e.g. to avoid having to include the entire OMPIRBuilder.h.
class CanonicalLoopInfo {
friend class OpenMPIRBuilder;
private:
- /// Whether this object currently represents a loop.
- bool IsValid = false;
-
- BasicBlock *Preheader;
- BasicBlock *Header;
- BasicBlock *Cond;
- BasicBlock *Body;
- BasicBlock *Latch;
- BasicBlock *Exit;
- BasicBlock *After;
+ BasicBlock *Preheader = nullptr;
+ BasicBlock *Header = nullptr;
+ BasicBlock *Cond = nullptr;
+ BasicBlock *Body = nullptr;
+ BasicBlock *Latch = nullptr;
+ BasicBlock *Exit = nullptr;
+ BasicBlock *After = nullptr;
/// Add the control blocks of this loop to \p BBs.
///
  /// This does not include any block from the body, not even the one returned
/// by getBody().
+ ///
+ /// FIXME: This currently includes the Preheader and After blocks even though
+ /// their content is (mostly) not under CanonicalLoopInfo's control.
+  /// Re-evaluate whether this makes sense.
void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs);
public:
+ /// Returns whether this object currently represents the IR of a loop. If
+ /// returning false, it may have been consumed by a loop transformation or not
+  /// been initialized. Do not use it in this case.
+ bool isValid() const { return Header; }
+
/// The preheader ensures that there is only a single edge entering the loop.
  /// Code that must be executed before any loop iteration can be emitted here,
/// such as computing the loop trip count and begin lifetime markers. Code in
/// the preheader is not considered part of the canonical loop.
- BasicBlock *getPreheader() const { return Preheader; }
+ BasicBlock *getPreheader() const {
+ assert(isValid() && "Requires a valid canonical loop");
+ return Preheader;
+ }
/// The header is the entry for each iteration. In the canonical control flow,
/// it only contains the PHINode for the induction variable.
- BasicBlock *getHeader() const { return Header; }
+ BasicBlock *getHeader() const {
+ assert(isValid() && "Requires a valid canonical loop");
+ return Header;
+ }
/// The condition block computes whether there is another loop iteration. If
/// yes, branches to the body; otherwise to the exit block.
- BasicBlock *getCond() const { return Cond; }
+ BasicBlock *getCond() const {
+ assert(isValid() && "Requires a valid canonical loop");
+ return Cond;
+ }
/// The body block is the single entry for a loop iteration and not controlled
/// by CanonicalLoopInfo. It can contain arbitrary control flow but must
/// eventually branch to the \p Latch block.
- BasicBlock *getBody() const { return Body; }
+ BasicBlock *getBody() const {
+ assert(isValid() && "Requires a valid canonical loop");
+ return Body;
+ }
/// Reaching the latch indicates the end of the loop body code. In the
/// canonical control flow, it only contains the increment of the induction
/// variable.
- BasicBlock *getLatch() const { return Latch; }
+ BasicBlock *getLatch() const {
+ assert(isValid() && "Requires a valid canonical loop");
+ return Latch;
+ }
/// Reaching the exit indicates no more iterations are being executed.
- BasicBlock *getExit() const { return Exit; }
+ BasicBlock *getExit() const {
+ assert(isValid() && "Requires a valid canonical loop");
+ return Exit;
+ }
/// The after block is intended for clean-up code such as lifetime end
  /// markers. It is separate from the exit block to ensure that, analogous to
  /// the preheader, it has just a single entry edge and is free from PHI
  /// nodes should there be multiple loop exits (such as from break
/// statements/cancellations).
- BasicBlock *getAfter() const { return After; }
+ BasicBlock *getAfter() const {
+ assert(isValid() && "Requires a valid canonical loop");
+ return After;
+ }
/// Returns the llvm::Value containing the number of loop iterations. It must
/// be valid in the preheader and always interpreted as an unsigned integer of
/// any bit-width.
Value *getTripCount() const {
+ assert(isValid() && "Requires a valid canonical loop");
Instruction *CmpI = &Cond->front();
assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
return CmpI->getOperand(1);
@@ -1216,33 +1500,47 @@ public:
/// Returns the instruction representing the current logical induction
/// variable. Always unsigned, always starting at 0 with an increment of one.
Instruction *getIndVar() const {
+ assert(isValid() && "Requires a valid canonical loop");
Instruction *IndVarPHI = &Header->front();
assert(isa<PHINode>(IndVarPHI) && "First inst must be the IV PHI");
return IndVarPHI;
}
/// Return the type of the induction variable (and the trip count).
- Type *getIndVarType() const { return getIndVar()->getType(); }
+ Type *getIndVarType() const {
+ assert(isValid() && "Requires a valid canonical loop");
+ return getIndVar()->getType();
+ }
/// Return the insertion point for user code before the loop.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const {
+ assert(isValid() && "Requires a valid canonical loop");
return {Preheader, std::prev(Preheader->end())};
};
/// Return the insertion point for user code in the body.
OpenMPIRBuilder::InsertPointTy getBodyIP() const {
+ assert(isValid() && "Requires a valid canonical loop");
return {Body, Body->begin()};
};
/// Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const {
+ assert(isValid() && "Requires a valid canonical loop");
return {After, After->begin()};
};
- Function *getFunction() const { return Header->getParent(); }
+ Function *getFunction() const {
+ assert(isValid() && "Requires a valid canonical loop");
+ return Header->getParent();
+ }
/// Consistency self-check.
void assertOK() const;
+
+ /// Invalidate this loop. That is, the underlying IR does not fulfill the
+ /// requirements of an OpenMP canonical loop anymore.
+ void invalidate();
};
} // end namespace llvm
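
// Hedged usage sketch (not part of the patch): emit per-iteration user code
// into a canonical loop using only the accessors declared above. CLI is
// assumed to come from somewhere like OpenMPIRBuilder::createCanonicalLoop;
// Builder and ArrayBase (assumed to be an i32*) are assumed to exist as well.
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/IRBuilder.h"
#include <cassert>

static void emitLoopBody(llvm::IRBuilder<> &Builder,
                         llvm::CanonicalLoopInfo *CLI,
                         llvm::Value *ArrayBase) {
  using namespace llvm;
  assert(CLI->isValid() && "loop must not have been consumed yet");

  // Code emitted at BodyIP runs once per iteration; it is outside
  // CanonicalLoopInfo's control, so side-effects are fine here.
  Builder.restoreIP(CLI->getBodyIP());
  Value *IV = CLI->getIndVar(); // unsigned logical IV: 0, 1, 2, ...
  Value *Elem = Builder.CreateGEP(Builder.getInt32Ty(), ArrayBase, IV);
  Builder.CreateStore(Builder.getInt32(0), Elem);

  // Code that must run after the last iteration belongs at AfterIP.
  Builder.restoreIP(CLI->getAfterIP());
}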
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index eb673b199fc4..8e4f7568fb9c 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -39,7 +39,6 @@ __OMP_TYPE(Int32Ptr)
__OMP_TYPE(Int64Ptr)
OMP_TYPE(SizeTy, M.getDataLayout().getIntPtrType(Ctx))
-OMP_TYPE(LanemaskTy, getLanemaskType())
#define __OMP_PTR_TYPE(NAME, BASE) OMP_TYPE(NAME, BASE->getPointerTo())
@@ -272,6 +271,15 @@ __OMP_RTL(__kmpc_for_static_init_8, false, Void, IdentPtr, Int32, Int32,
__OMP_RTL(__kmpc_for_static_init_8u, false, Void, IdentPtr, Int32, Int32,
Int32Ptr, Int64Ptr, Int64Ptr, Int64Ptr, Int64, Int64)
__OMP_RTL(__kmpc_for_static_fini, false, Void, IdentPtr, Int32)
+__OMP_RTL(__kmpc_distribute_static_init_4, false, Void, IdentPtr, Int32, Int32,
+ Int32Ptr, Int32Ptr, Int32Ptr, Int32Ptr, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_init_4u, false, Void, IdentPtr, Int32, Int32,
+ Int32Ptr, Int32Ptr, Int32Ptr, Int32Ptr, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_init_8, false, Void, IdentPtr, Int32, Int32,
+ Int32Ptr, Int64Ptr, Int64Ptr, Int64Ptr, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_init_8u, false, Void, IdentPtr, Int32, Int32,
+ Int32Ptr, Int64Ptr, Int64Ptr, Int64Ptr, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_fini, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_dist_dispatch_init_4, false, Void, IdentPtr, Int32, Int32,
Int32Ptr, Int32, Int32, Int32, Int32)
__OMP_RTL(__kmpc_dist_dispatch_init_4u, false, Void, IdentPtr, Int32, Int32,
@@ -415,8 +423,8 @@ __OMP_RTL(__kmpc_task_allow_completion_event, false, VoidPtr, IdentPtr,
/* Int */ Int32, /* kmp_task_t */ VoidPtr)
/// OpenMP Device runtime functions
-__OMP_RTL(__kmpc_target_init, false, Int32, IdentPtr, Int1, Int1, Int1)
-__OMP_RTL(__kmpc_target_deinit, false, Void, IdentPtr, Int1, Int1)
+__OMP_RTL(__kmpc_target_init, false, Int32, IdentPtr, Int8, Int1, Int1)
+__OMP_RTL(__kmpc_target_deinit, false, Void, IdentPtr, Int8, Int1)
__OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
@@ -442,9 +450,12 @@ __OMP_RTL(__kmpc_get_shared_variables, false, Void, VoidPtrPtrPtr)
__OMP_RTL(__kmpc_parallel_level, false, Int8, )
__OMP_RTL(__kmpc_is_spmd_exec_mode, false, Int8, )
__OMP_RTL(__kmpc_barrier_simple_spmd, false, Void, IdentPtr, Int32)
+__OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
-__OMP_RTL(__kmpc_warp_active_thread_mask, false, LanemaskTy,)
-__OMP_RTL(__kmpc_syncwarp, false, Void, LanemaskTy)
+__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
+__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
+
+__OMP_RTL(__kmpc_get_warp_size, false, Int32, )
__OMP_RTL(__kmpc_is_generic_main_thread_id, false, Int8, Int32)
@@ -510,6 +521,11 @@ __OMP_ATTRS_SET(NoCaptureAttrs,
? AttributeSet(EnumAttr(NoCapture))
: AttributeSet(EnumAttr(NoCapture)))
+__OMP_ATTRS_SET(AlwaysInlineAttrs,
+ OptimisticAttributes
+ ? AttributeSet(EnumAttr(AlwaysInline))
+ : AttributeSet(EnumAttr(AlwaysInline)))
+
#if 0
__OMP_ATTRS_SET(InaccessibleOnlyAttrs,
OptimisticAttributes
@@ -535,6 +551,11 @@ __OMP_ATTRS_SET(ReadOnlyPtrAttrs,
EnumAttr(NoCapture))
: AttributeSet())
+__OMP_ATTRS_SET(DeviceAllocAttrs,
+ OptimisticAttributes
+ ? AttributeSet(EnumAttr(NoUnwind), EnumAttr(NoSync))
+ : AttributeSet(EnumAttr(NoUnwind), EnumAttr(NoSync)))
+
#if 0
__OMP_ATTRS_SET(WriteOnlyPtrAttrs,
OptimisticAttributes
@@ -575,6 +596,8 @@ __OMP_RTL_ATTRS(__kmpc_barrier, BarrierAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs))
__OMP_RTL_ATTRS(__kmpc_barrier_simple_spmd, BarrierAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs))
+__OMP_RTL_ATTRS(__kmpc_barrier_simple_generic, BarrierAttrs, AttributeSet(),
+ ParamAttrs(ReadOnlyPtrAttrs))
__OMP_RTL_ATTRS(__kmpc_warp_active_thread_mask, BarrierAttrs, AttributeSet(),
ParamAttrs())
__OMP_RTL_ATTRS(__kmpc_syncwarp, BarrierAttrs, AttributeSet(), ParamAttrs())
@@ -703,6 +726,28 @@ __OMP_RTL_ATTRS(__kmpc_for_static_init_8u, GetterArgWriteAttrs, AttributeSet(),
AttributeSet(), AttributeSet()))
__OMP_RTL_ATTRS(__kmpc_for_static_fini, InaccessibleArgOnlyAttrs,
AttributeSet(), ParamAttrs(ReadOnlyPtrAttrs))
+__OMP_RTL_ATTRS(__kmpc_distribute_static_init_4, GetterArgWriteAttrs,
+ AttributeSet(),
+ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+ ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs,
+ AttributeSet(), AttributeSet()))
+__OMP_RTL_ATTRS(__kmpc_distribute_static_init_4u, GetterArgWriteAttrs,
+ AttributeSet(),
+ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+ ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs,
+ AttributeSet(), AttributeSet()))
+__OMP_RTL_ATTRS(__kmpc_distribute_static_init_8, GetterArgWriteAttrs,
+ AttributeSet(),
+ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+ ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs,
+ AttributeSet(), AttributeSet()))
+__OMP_RTL_ATTRS(__kmpc_distribute_static_init_8u, GetterArgWriteAttrs,
+ AttributeSet(),
+ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+ ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs,
+ AttributeSet(), AttributeSet()))
+__OMP_RTL_ATTRS(__kmpc_distribute_static_fini, InaccessibleArgOnlyAttrs,
+ AttributeSet(), ParamAttrs(ReadOnlyPtrAttrs))
__OMP_RTL_ATTRS(__kmpc_dist_dispatch_init_4, GetterArgWriteAttrs,
AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
@@ -854,9 +899,9 @@ __OMP_RTL_ATTRS(__kmpc_doacross_wait, BarrierAttrs, AttributeSet(),
__OMP_RTL_ATTRS(__kmpc_doacross_fini, BarrierAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs))
-__OMP_RTL_ATTRS(__kmpc_alloc_shared, DefaultAttrs, ReturnPtrAttrs,
+__OMP_RTL_ATTRS(__kmpc_alloc_shared, DeviceAllocAttrs, ReturnPtrAttrs,
ParamAttrs())
-__OMP_RTL_ATTRS(__kmpc_free_shared, AllocAttrs, AttributeSet(),
+__OMP_RTL_ATTRS(__kmpc_free_shared, DeviceAllocAttrs, AttributeSet(),
ParamAttrs(NoCaptureAttrs))
__OMP_RTL_ATTRS(__kmpc_alloc, DefaultAttrs, ReturnPtrAttrs, ParamAttrs())
@@ -897,6 +942,9 @@ __OMP_RTL_ATTRS(__tgt_push_mapper_component, ForkAttrs, AttributeSet(),
__OMP_RTL_ATTRS(__kmpc_task_allow_completion_event, DefaultAttrs,
ReturnPtrAttrs, ParamAttrs(ReadOnlyPtrAttrs))
+__OMP_RTL_ATTRS(__kmpc_parallel_51, AlwaysInlineAttrs, AttributeSet(),
+ ParamAttrs())
+
#undef __OMP_RTL_ATTRS
#undef OMP_RTL_ATTRS
#undef AttributeSet
@@ -920,6 +968,7 @@ __OMP_RTL_ATTRS(__kmpc_task_allow_completion_event, DefaultAttrs,
OMP_IDENT_FLAG(OMP_IDENT_FLAG_##Name, #Name, Value)
__OMP_IDENT_FLAG(KMPC, 0x02)
+__OMP_IDENT_FLAG(ATOMIC_REDUCE, 0x10)
__OMP_IDENT_FLAG(BARRIER_EXPL, 0x20)
__OMP_IDENT_FLAG(BARRIER_IMPL, 0x0040)
__OMP_IDENT_FLAG(BARRIER_IMPL_MASK, 0x01C0)
diff --git a/llvm/include/llvm/IR/AbstractCallSite.h b/llvm/include/llvm/IR/AbstractCallSite.h
index e8cf05001542..31df4c75b6e7 100644
--- a/llvm/include/llvm/IR/AbstractCallSite.h
+++ b/llvm/include/llvm/IR/AbstractCallSite.h
@@ -153,7 +153,7 @@ public:
/// Return the number of parameters of the callee.
unsigned getNumArgOperands() const {
if (isDirectCall())
- return CB->getNumArgOperands();
+ return CB->arg_size();
// Subtract 1 for the callee encoding.
return CI.ParameterEncoding.size() - 1;
}
diff --git a/llvm/include/llvm/IR/Argument.h b/llvm/include/llvm/IR/Argument.h
index dcf658f439b4..396ab6a9d01d 100644
--- a/llvm/include/llvm/IR/Argument.h
+++ b/llvm/include/llvm/IR/Argument.h
@@ -97,7 +97,7 @@ public:
/// If this is a byval or inalloca argument, return its alignment.
/// FIXME: Remove this function once transition to Align is over.
/// Use getParamAlign() instead.
- unsigned getParamAlignment() const;
+ uint64_t getParamAlignment() const;
/// If this is a byval or inalloca argument, return its alignment.
MaybeAlign getParamAlign() const;
diff --git a/llvm/include/llvm/IR/Assumptions.h b/llvm/include/llvm/IR/Assumptions.h
index f64616c25d87..08e6c8b6f1e0 100644
--- a/llvm/include/llvm/IR/Assumptions.h
+++ b/llvm/include/llvm/IR/Assumptions.h
@@ -15,12 +15,14 @@
#ifndef LLVM_IR_ASSUMPTIONS_H
#define LLVM_IR_ASSUMPTIONS_H
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
namespace llvm {
class Function;
+class CallBase;
/// The key we use for assumption attributes.
constexpr StringRef AssumptionAttrKey = "llvm.assume";
@@ -43,7 +45,25 @@ private:
};
/// Return true if \p F has the assumption \p AssumptionStr attached.
-bool hasAssumption(Function &F, const KnownAssumptionString &AssumptionStr);
+bool hasAssumption(const Function &F,
+ const KnownAssumptionString &AssumptionStr);
+
+/// Return true if \p CB or the callee has the assumption \p AssumptionStr
+/// attached.
+bool hasAssumption(const CallBase &CB,
+ const KnownAssumptionString &AssumptionStr);
+
+/// Return the set of all assumptions for the function \p F.
+DenseSet<StringRef> getAssumptions(const Function &F);
+
+/// Return the set of all assumptions for the call \p CB.
+DenseSet<StringRef> getAssumptions(const CallBase &CB);
+
+/// Appends the set of assumptions \p Assumptions to \p F.
+bool addAssumptions(Function &F, const DenseSet<StringRef> &Assumptions);
+
+/// Appends the set of assumptions \p Assumptions to \p CB.
+bool addAssumptions(CallBase &CB, const DenseSet<StringRef> &Assumptions);
} // namespace llvm
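
// Hedged sketch (not part of the patch): a purely mechanical use of the new
// overloads above. Caller and Callee are assumed to be existing functions;
// whether copying assumptions between them is semantically legal is left to
// the caller of this helper.
#include "llvm/IR/Assumptions.h"
#include "llvm/IR/Function.h"

static bool copyAssumptions(llvm::Function &Caller,
                            const llvm::Function &Callee) {
  // Collect every "llvm.assume" string attached to the callee ...
  llvm::DenseSet<llvm::StringRef> CalleeAssumptions =
      llvm::getAssumptions(Callee);
  // ... and append it to the caller; returns true if anything changed.
  return llvm::addAssumptions(Caller, CalleeAssumptions);
}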
diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h
index d7bd3edb3d4c..282be640d8be 100644
--- a/llvm/include/llvm/IR/Attributes.h
+++ b/llvm/include/llvm/IR/Attributes.h
@@ -37,7 +37,6 @@ class AttrBuilder;
class AttributeImpl;
class AttributeListImpl;
class AttributeSetNode;
-template<typename T> struct DenseMapInfo;
class FoldingSetNodeID;
class Function;
class LLVMContext;
@@ -78,6 +77,7 @@ public:
TombstoneKey, ///< Use as Tombstone key for DenseMap of AttrKind
};
+ static const unsigned NumIntAttrKinds = LastIntAttr - FirstIntAttr + 1;
static const unsigned NumTypeAttrKinds = LastTypeAttr - FirstTypeAttr + 1;
static bool isEnumAttrKind(AttrKind Kind) {
@@ -265,7 +265,7 @@ inline Attribute unwrap(LLVMAttributeRef Attr) {
/// and removing string or integer attributes involves a FoldingSet lookup.
class AttributeSet {
friend AttributeListImpl;
- template <typename Ty> friend struct DenseMapInfo;
+ template <typename Ty, typename Enable> friend struct DenseMapInfo;
// TODO: Extract AvailableAttrs from AttributeSetNode and store them here.
// This will allow an efficient implementation of addAttribute and
@@ -366,7 +366,7 @@ public:
//===----------------------------------------------------------------------===//
/// \class
/// Provide DenseMapInfo for AttributeSet.
-template <> struct DenseMapInfo<AttributeSet> {
+template <> struct DenseMapInfo<AttributeSet, void> {
static AttributeSet getEmptyKey() {
auto Val = static_cast<uintptr_t>(-1);
Val <<= PointerLikeTypeTraits<void *>::NumLowBitsAvailable;
@@ -408,7 +408,7 @@ private:
friend class AttributeListImpl;
friend class AttributeSet;
friend class AttributeSetNode;
- template <typename Ty> friend struct DenseMapInfo;
+ template <typename Ty, typename Enable> friend struct DenseMapInfo;
/// The attributes that we are managing. This can be null to represent
/// the empty attributes list.
@@ -432,8 +432,8 @@ private:
static AttributeList getImpl(LLVMContext &C, ArrayRef<AttributeSet> AttrSets);
- AttributeList setAttributes(LLVMContext &C, unsigned Index,
- AttributeSet Attrs) const;
+ AttributeList setAttributesAtIndex(LLVMContext &C, unsigned Index,
+ AttributeSet Attrs) const;
public:
AttributeList() = default;
@@ -454,32 +454,84 @@ public:
static AttributeList get(LLVMContext &C, unsigned Index,
const AttrBuilder &B);
+ // TODO: remove non-AtIndex versions of these methods.
/// Add an attribute to the attribute set at the given index.
/// Returns a new list because attribute lists are immutable.
- LLVM_NODISCARD AttributeList addAttribute(LLVMContext &C, unsigned Index,
- Attribute::AttrKind Kind) const;
+ LLVM_NODISCARD AttributeList addAttributeAtIndex(
+ LLVMContext &C, unsigned Index, Attribute::AttrKind Kind) const;
/// Add an attribute to the attribute set at the given index.
/// Returns a new list because attribute lists are immutable.
LLVM_NODISCARD AttributeList
- addAttribute(LLVMContext &C, unsigned Index, StringRef Kind,
- StringRef Value = StringRef()) const;
+ addAttributeAtIndex(LLVMContext &C, unsigned Index, StringRef Kind,
+ StringRef Value = StringRef()) const;
/// Add an attribute to the attribute set at the given index.
/// Returns a new list because attribute lists are immutable.
- LLVM_NODISCARD AttributeList addAttribute(LLVMContext &C, unsigned Index,
- Attribute A) const;
+ LLVM_NODISCARD AttributeList addAttributeAtIndex(LLVMContext &C,
+ unsigned Index,
+ Attribute A) const;
/// Add attributes to the attribute set at the given index.
/// Returns a new list because attribute lists are immutable.
- LLVM_NODISCARD AttributeList addAttributes(LLVMContext &C, unsigned Index,
- const AttrBuilder &B) const;
+ LLVM_NODISCARD AttributeList addAttributesAtIndex(LLVMContext &C,
+ unsigned Index,
+ const AttrBuilder &B) const;
+
+ /// Add a function attribute to the list. Returns a new list because
+ /// attribute lists are immutable.
+ LLVM_NODISCARD AttributeList addFnAttribute(LLVMContext &C,
+ Attribute::AttrKind Kind) const {
+ return addAttributeAtIndex(C, FunctionIndex, Kind);
+ }
+
+ /// Add a function attribute to the list. Returns a new list because
+ /// attribute lists are immutable.
+ LLVM_NODISCARD AttributeList addFnAttribute(LLVMContext &C,
+ Attribute Attr) const {
+ return addAttributeAtIndex(C, FunctionIndex, Attr);
+ }
+
+ /// Add a function attribute to the list. Returns a new list because
+ /// attribute lists are immutable.
+ LLVM_NODISCARD AttributeList addFnAttribute(
+ LLVMContext &C, StringRef Kind, StringRef Value = StringRef()) const {
+ return addAttributeAtIndex(C, FunctionIndex, Kind, Value);
+ }
+
+  /// Add function attributes to the list. Returns a new list because
+ /// attribute lists are immutable.
+ LLVM_NODISCARD AttributeList addFnAttributes(LLVMContext &C,
+ const AttrBuilder &B) const {
+ return addAttributesAtIndex(C, FunctionIndex, B);
+ }
+
+ /// Add a return value attribute to the list. Returns a new list because
+ /// attribute lists are immutable.
+ LLVM_NODISCARD AttributeList addRetAttribute(LLVMContext &C,
+ Attribute::AttrKind Kind) const {
+ return addAttributeAtIndex(C, ReturnIndex, Kind);
+ }
+
+ /// Add a return value attribute to the list. Returns a new list because
+ /// attribute lists are immutable.
+ LLVM_NODISCARD AttributeList addRetAttribute(LLVMContext &C,
+ Attribute Attr) const {
+ return addAttributeAtIndex(C, ReturnIndex, Attr);
+ }
+
+ /// Add a return value attribute to the list. Returns a new list because
+ /// attribute lists are immutable.
+ LLVM_NODISCARD AttributeList addRetAttributes(LLVMContext &C,
+ const AttrBuilder &B) const {
+ return addAttributesAtIndex(C, ReturnIndex, B);
+ }
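
// Hedged sketch (not part of the patch): the Fn/Ret helpers above replace
// explicit addAttributeAtIndex(C, FunctionIndex/ReturnIndex, ...) calls. F is
// assumed to be an existing function whose return type is a pointer.
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"

static void markNoUnwindWithNonNullRet(llvm::Function &F) {
  llvm::LLVMContext &Ctx = F.getContext();
  llvm::AttributeList AL = F.getAttributes();
  AL = AL.addFnAttribute(Ctx, llvm::Attribute::NoUnwind); // function index
  AL = AL.addRetAttribute(Ctx, llvm::Attribute::NonNull); // return index
  F.setAttributes(AL);
}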
/// Add an argument attribute to the list. Returns a new list because
/// attribute lists are immutable.
LLVM_NODISCARD AttributeList addParamAttribute(
LLVMContext &C, unsigned ArgNo, Attribute::AttrKind Kind) const {
- return addAttribute(C, ArgNo + FirstArgIndex, Kind);
+ return addAttributeAtIndex(C, ArgNo + FirstArgIndex, Kind);
}
/// Add an argument attribute to the list. Returns a new list because
@@ -487,7 +539,7 @@ public:
LLVM_NODISCARD AttributeList
addParamAttribute(LLVMContext &C, unsigned ArgNo, StringRef Kind,
StringRef Value = StringRef()) const {
- return addAttribute(C, ArgNo + FirstArgIndex, Kind, Value);
+ return addAttributeAtIndex(C, ArgNo + FirstArgIndex, Kind, Value);
}
/// Add an attribute to the attribute list at the given arg indices. Returns a
@@ -501,34 +553,87 @@ public:
LLVM_NODISCARD AttributeList addParamAttributes(LLVMContext &C,
unsigned ArgNo,
const AttrBuilder &B) const {
- return addAttributes(C, ArgNo + FirstArgIndex, B);
+ return addAttributesAtIndex(C, ArgNo + FirstArgIndex, B);
}
/// Remove the specified attribute at the specified index from this
/// attribute list. Returns a new list because attribute lists are immutable.
- LLVM_NODISCARD AttributeList removeAttribute(LLVMContext &C, unsigned Index,
- Attribute::AttrKind Kind) const;
+ LLVM_NODISCARD AttributeList removeAttributeAtIndex(
+ LLVMContext &C, unsigned Index, Attribute::AttrKind Kind) const;
/// Remove the specified attribute at the specified index from this
/// attribute list. Returns a new list because attribute lists are immutable.
+ LLVM_NODISCARD AttributeList removeAttributeAtIndex(LLVMContext &C,
+ unsigned Index,
+ StringRef Kind) const;
LLVM_NODISCARD AttributeList removeAttribute(LLVMContext &C, unsigned Index,
- StringRef Kind) const;
+ StringRef Kind) const {
+ return removeAttributeAtIndex(C, Index, Kind);
+ }
/// Remove the specified attributes at the specified index from this
/// attribute list. Returns a new list because attribute lists are immutable.
- LLVM_NODISCARD AttributeList removeAttributes(
+ LLVM_NODISCARD AttributeList removeAttributesAtIndex(
LLVMContext &C, unsigned Index, const AttrBuilder &AttrsToRemove) const;
/// Remove all attributes at the specified index from this
/// attribute list. Returns a new list because attribute lists are immutable.
- LLVM_NODISCARD AttributeList removeAttributes(LLVMContext &C,
- unsigned Index) const;
+ LLVM_NODISCARD AttributeList removeAttributesAtIndex(LLVMContext &C,
+ unsigned Index) const;
+
+ /// Remove the specified attribute at the function index from this
+ /// attribute list. Returns a new list because attribute lists are immutable.
+ LLVM_NODISCARD AttributeList
+ removeFnAttribute(LLVMContext &C, Attribute::AttrKind Kind) const {
+ return removeAttributeAtIndex(C, FunctionIndex, Kind);
+ }
+
+ /// Remove the specified attribute at the function index from this
+ /// attribute list. Returns a new list because attribute lists are immutable.
+ LLVM_NODISCARD AttributeList removeFnAttribute(LLVMContext &C,
+ StringRef Kind) const {
+ return removeAttributeAtIndex(C, FunctionIndex, Kind);
+ }
+
+ /// Remove the specified attribute at the function index from this
+ /// attribute list. Returns a new list because attribute lists are immutable.
+ LLVM_NODISCARD AttributeList
+ removeFnAttributes(LLVMContext &C, const AttrBuilder &AttrsToRemove) const {
+ return removeAttributesAtIndex(C, FunctionIndex, AttrsToRemove);
+ }
+
+ /// Remove the attributes at the function index from this
+ /// attribute list. Returns a new list because attribute lists are immutable.
+ LLVM_NODISCARD AttributeList removeFnAttributes(LLVMContext &C) const {
+ return removeAttributesAtIndex(C, FunctionIndex);
+ }
+
+ /// Remove the specified attribute at the return value index from this
+ /// attribute list. Returns a new list because attribute lists are immutable.
+ LLVM_NODISCARD AttributeList
+ removeRetAttribute(LLVMContext &C, Attribute::AttrKind Kind) const {
+ return removeAttributeAtIndex(C, ReturnIndex, Kind);
+ }
+
+ /// Remove the specified attribute at the return value index from this
+ /// attribute list. Returns a new list because attribute lists are immutable.
+ LLVM_NODISCARD AttributeList removeRetAttribute(LLVMContext &C,
+ StringRef Kind) const {
+ return removeAttributeAtIndex(C, ReturnIndex, Kind);
+ }
+
+ /// Remove the specified attribute at the return value index from this
+ /// attribute list. Returns a new list because attribute lists are immutable.
+ LLVM_NODISCARD AttributeList
+ removeRetAttributes(LLVMContext &C, const AttrBuilder &AttrsToRemove) const {
+ return removeAttributesAtIndex(C, ReturnIndex, AttrsToRemove);
+ }
/// Remove the specified attribute at the specified arg index from this
/// attribute list. Returns a new list because attribute lists are immutable.
LLVM_NODISCARD AttributeList removeParamAttribute(
LLVMContext &C, unsigned ArgNo, Attribute::AttrKind Kind) const {
- return removeAttribute(C, ArgNo + FirstArgIndex, Kind);
+ return removeAttributeAtIndex(C, ArgNo + FirstArgIndex, Kind);
}
/// Remove the specified attribute at the specified arg index from this
@@ -536,80 +641,55 @@ public:
LLVM_NODISCARD AttributeList removeParamAttribute(LLVMContext &C,
unsigned ArgNo,
StringRef Kind) const {
- return removeAttribute(C, ArgNo + FirstArgIndex, Kind);
+ return removeAttributeAtIndex(C, ArgNo + FirstArgIndex, Kind);
}
/// Remove the specified attribute at the specified arg index from this
/// attribute list. Returns a new list because attribute lists are immutable.
LLVM_NODISCARD AttributeList removeParamAttributes(
LLVMContext &C, unsigned ArgNo, const AttrBuilder &AttrsToRemove) const {
- return removeAttributes(C, ArgNo + FirstArgIndex, AttrsToRemove);
+ return removeAttributesAtIndex(C, ArgNo + FirstArgIndex, AttrsToRemove);
}
/// Remove all attributes at the specified arg index from this
/// attribute list. Returns a new list because attribute lists are immutable.
LLVM_NODISCARD AttributeList removeParamAttributes(LLVMContext &C,
unsigned ArgNo) const {
- return removeAttributes(C, ArgNo + FirstArgIndex);
+ return removeAttributesAtIndex(C, ArgNo + FirstArgIndex);
}
  /// Replace the type contained by attribute \p AttrKind at index \p ArgNo with
/// \p ReplacementTy, preserving all other attributes.
- LLVM_NODISCARD AttributeList replaceAttributeType(LLVMContext &C,
- unsigned ArgNo,
- Attribute::AttrKind Kind,
- Type *ReplacementTy) const {
- Attribute Attr = getAttribute(ArgNo, Kind);
- auto Attrs = removeAttribute(C, ArgNo, Kind);
- return Attrs.addAttribute(C, ArgNo, Attr.getWithNewType(C, ReplacementTy));
+ LLVM_NODISCARD AttributeList replaceAttributeTypeAtIndex(
+ LLVMContext &C, unsigned ArgNo, Attribute::AttrKind Kind,
+ Type *ReplacementTy) const {
+ Attribute Attr = getAttributeAtIndex(ArgNo, Kind);
+ auto Attrs = removeAttributeAtIndex(C, ArgNo, Kind);
+ return Attrs.addAttributeAtIndex(C, ArgNo,
+ Attr.getWithNewType(C, ReplacementTy));
}
/// \brief Add the dereferenceable attribute to the attribute set at the given
/// index. Returns a new list because attribute lists are immutable.
- LLVM_NODISCARD AttributeList addDereferenceableAttr(LLVMContext &C,
- unsigned Index,
- uint64_t Bytes) const;
+ LLVM_NODISCARD AttributeList addDereferenceableRetAttr(LLVMContext &C,
+ uint64_t Bytes) const;
/// \brief Add the dereferenceable attribute to the attribute set at the given
/// arg index. Returns a new list because attribute lists are immutable.
LLVM_NODISCARD AttributeList addDereferenceableParamAttr(
- LLVMContext &C, unsigned ArgNo, uint64_t Bytes) const {
- return addDereferenceableAttr(C, ArgNo + FirstArgIndex, Bytes);
- }
-
- /// Add the dereferenceable_or_null attribute to the attribute set at
- /// the given index. Returns a new list because attribute lists are immutable.
- LLVM_NODISCARD AttributeList addDereferenceableOrNullAttr(
- LLVMContext &C, unsigned Index, uint64_t Bytes) const;
+ LLVMContext &C, unsigned ArgNo, uint64_t Bytes) const;
/// Add the dereferenceable_or_null attribute to the attribute set at
/// the given arg index. Returns a new list because attribute lists are
/// immutable.
LLVM_NODISCARD AttributeList addDereferenceableOrNullParamAttr(
- LLVMContext &C, unsigned ArgNo, uint64_t Bytes) const {
- return addDereferenceableOrNullAttr(C, ArgNo + FirstArgIndex, Bytes);
- }
-
- /// Add the allocsize attribute to the attribute set at the given index.
- /// Returns a new list because attribute lists are immutable.
- LLVM_NODISCARD AttributeList
- addAllocSizeAttr(LLVMContext &C, unsigned Index, unsigned ElemSizeArg,
- const Optional<unsigned> &NumElemsArg);
+ LLVMContext &C, unsigned ArgNo, uint64_t Bytes) const;
/// Add the allocsize attribute to the attribute set at the given arg index.
/// Returns a new list because attribute lists are immutable.
LLVM_NODISCARD AttributeList
addAllocSizeParamAttr(LLVMContext &C, unsigned ArgNo, unsigned ElemSizeArg,
- const Optional<unsigned> &NumElemsArg) {
- return addAllocSizeAttr(C, ArgNo + FirstArgIndex, ElemSizeArg, NumElemsArg);
- }
-
- /// Add the vscale_range attribute to the attribute set at the given index.
- /// Returns a new list because attribute lists are immutable.
- LLVM_NODISCARD AttributeList addVScaleRangeAttr(LLVMContext &C,
- unsigned Index,
- unsigned MinValue,
- unsigned MaxValue);
+ const Optional<unsigned> &NumElemsArg);
//===--------------------------------------------------------------------===//
// AttributeList Accessors
@@ -620,48 +700,59 @@ public:
/// The attributes for the argument or parameter at the given index are
/// returned.
- AttributeSet getParamAttributes(unsigned ArgNo) const;
+ AttributeSet getParamAttrs(unsigned ArgNo) const;
/// The attributes for the ret value are returned.
- AttributeSet getRetAttributes() const;
+ AttributeSet getRetAttrs() const;
/// The function attributes are returned.
- AttributeSet getFnAttributes() const;
+ AttributeSet getFnAttrs() const;
/// Return true if the attribute exists at the given index.
- bool hasAttribute(unsigned Index, Attribute::AttrKind Kind) const;
+ bool hasAttributeAtIndex(unsigned Index, Attribute::AttrKind Kind) const;
/// Return true if the attribute exists at the given index.
- bool hasAttribute(unsigned Index, StringRef Kind) const;
+ bool hasAttributeAtIndex(unsigned Index, StringRef Kind) const;
/// Return true if attribute exists at the given index.
- bool hasAttributes(unsigned Index) const;
+ bool hasAttributesAtIndex(unsigned Index) const;
/// Return true if the attribute exists for the given argument
bool hasParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const {
- return hasAttribute(ArgNo + FirstArgIndex, Kind);
+ return hasAttributeAtIndex(ArgNo + FirstArgIndex, Kind);
}
/// Return true if the attribute exists for the given argument
bool hasParamAttr(unsigned ArgNo, StringRef Kind) const {
- return hasAttribute(ArgNo + FirstArgIndex, Kind);
+ return hasAttributeAtIndex(ArgNo + FirstArgIndex, Kind);
}
/// Return true if attributes exists for the given argument
bool hasParamAttrs(unsigned ArgNo) const {
- return hasAttributes(ArgNo + FirstArgIndex);
+ return hasAttributesAtIndex(ArgNo + FirstArgIndex);
+ }
+
+ /// Return true if the attribute exists for the return value.
+ bool hasRetAttr(Attribute::AttrKind Kind) const {
+ return hasAttributeAtIndex(ReturnIndex, Kind);
}
- /// Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but
- /// may be faster.
- bool hasFnAttribute(Attribute::AttrKind Kind) const;
+ /// Return true if the attribute exists for the return value.
+ bool hasRetAttr(StringRef Kind) const {
+ return hasAttributeAtIndex(ReturnIndex, Kind);
+ }
+
+ /// Return true if attributes exist for the return value.
+ bool hasRetAttrs() const { return hasAttributesAtIndex(ReturnIndex); }
+
+ /// Return true if the attribute exists for the function.
+ bool hasFnAttr(Attribute::AttrKind Kind) const;
- /// Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but
- /// may be faster.
- bool hasFnAttribute(StringRef Kind) const;
+ /// Return true if the attribute exists for the function.
+ bool hasFnAttr(StringRef Kind) const;
- /// Equivalent to hasAttribute(ArgNo + FirstArgIndex, Kind).
- bool hasParamAttribute(unsigned ArgNo, Attribute::AttrKind Kind) const;
+  /// Return true if attributes exist for the function.
+ bool hasFnAttrs() const { return hasAttributesAtIndex(FunctionIndex); }
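
// Hedged sketch (not part of the patch): querying with the renamed helpers
// instead of hasFnAttribute and manual ReturnIndex checks. AL is assumed to be
// an existing AttributeList.
static bool isInlinableWithNonNullRet(const llvm::AttributeList &AL) {
  return !AL.hasFnAttr(llvm::Attribute::NoInline) &&
         AL.hasRetAttr(llvm::Attribute::NonNull);
}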
/// Return true if the specified attribute is set for at least one
/// parameter or for the return value. If Index is not nullptr, the index
@@ -670,19 +761,29 @@ public:
unsigned *Index = nullptr) const;
/// Return the attribute object that exists at the given index.
- Attribute getAttribute(unsigned Index, Attribute::AttrKind Kind) const;
+ Attribute getAttributeAtIndex(unsigned Index, Attribute::AttrKind Kind) const;
/// Return the attribute object that exists at the given index.
- Attribute getAttribute(unsigned Index, StringRef Kind) const;
+ Attribute getAttributeAtIndex(unsigned Index, StringRef Kind) const;
/// Return the attribute object that exists at the arg index.
Attribute getParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const {
- return getAttribute(ArgNo + FirstArgIndex, Kind);
+ return getAttributeAtIndex(ArgNo + FirstArgIndex, Kind);
}
/// Return the attribute object that exists at the given index.
Attribute getParamAttr(unsigned ArgNo, StringRef Kind) const {
- return getAttribute(ArgNo + FirstArgIndex, Kind);
+ return getAttributeAtIndex(ArgNo + FirstArgIndex, Kind);
+ }
+
+ /// Return the attribute object that exists for the function.
+ Attribute getFnAttr(Attribute::AttrKind Kind) const {
+ return getAttributeAtIndex(FunctionIndex, Kind);
+ }
+
+ /// Return the attribute object that exists for the function.
+ Attribute getFnAttr(StringRef Kind) const {
+ return getAttributeAtIndex(FunctionIndex, Kind);
}
/// Return the alignment of the return value.
@@ -712,34 +813,26 @@ public:
/// Return the elementtype type for the specified function parameter.
Type *getParamElementType(unsigned ArgNo) const;
- /// Get the stack alignment.
- MaybeAlign getStackAlignment(unsigned Index) const;
+ /// Get the stack alignment of the function.
+ MaybeAlign getFnStackAlignment() const;
- /// Get the number of dereferenceable bytes (or zero if unknown).
- uint64_t getDereferenceableBytes(unsigned Index) const;
+ /// Get the stack alignment of the return value.
+ MaybeAlign getRetStackAlignment() const;
- /// Get the number of dereferenceable bytes (or zero if unknown) of an
- /// arg.
- uint64_t getParamDereferenceableBytes(unsigned ArgNo) const {
- return getDereferenceableBytes(ArgNo + FirstArgIndex);
- }
+ /// Get the number of dereferenceable bytes (or zero if unknown) of the return
+ /// value.
+ uint64_t getRetDereferenceableBytes() const;
- /// Get the number of dereferenceable_or_null bytes (or zero if
- /// unknown).
- uint64_t getDereferenceableOrNullBytes(unsigned Index) const;
+ /// Get the number of dereferenceable bytes (or zero if unknown) of an arg.
+ uint64_t getParamDereferenceableBytes(unsigned Index) const;
- /// Get the number of dereferenceable_or_null bytes (or zero if
- /// unknown) of an arg.
- uint64_t getParamDereferenceableOrNullBytes(unsigned ArgNo) const {
- return getDereferenceableOrNullBytes(ArgNo + FirstArgIndex);
- }
+ /// Get the number of dereferenceable_or_null bytes (or zero if unknown) of
+ /// the return value.
+ uint64_t getRetDereferenceableOrNullBytes() const;
- /// Get the allocsize argument numbers (or pair(0, 0) if unknown).
- std::pair<unsigned, Optional<unsigned>>
- getAllocSizeArgs(unsigned Index) const;
-
- /// Get the vscale_range argument numbers (or pair(0, 0) if unknown).
- std::pair<unsigned, unsigned> getVScaleRangeArgs(unsigned Index) const;
+ /// Get the number of dereferenceable_or_null bytes (or zero if unknown) of an
+ /// arg.
+ uint64_t getParamDereferenceableOrNullBytes(unsigned ArgNo) const;
/// Return the attributes at the index as a string.
std::string getAsString(unsigned Index, bool InAttrGrp = false) const;
@@ -758,9 +851,32 @@ public:
unsigned getNumAttrSets() const;
- /// Use these to iterate over the valid attribute indices.
- unsigned index_begin() const { return AttributeList::FunctionIndex; }
- unsigned index_end() const { return getNumAttrSets() - 1; }
+ // Implementation of indexes(). Produces iterators that wrap an index. Mostly
+ // to hide the awkwardness of unsigned wrapping when iterating over valid
+ // indexes.
+ struct index_iterator {
+ unsigned NumAttrSets;
+ index_iterator(int NumAttrSets) : NumAttrSets(NumAttrSets) {}
+ struct int_wrapper {
+ int_wrapper(unsigned i) : i(i) {}
+ unsigned i;
+ unsigned operator*() { return i; }
+ bool operator!=(const int_wrapper &Other) { return i != Other.i; }
+ int_wrapper &operator++() {
+ // This is expected to undergo unsigned wrapping since FunctionIndex is
+ // ~0 and that's where we start.
+ ++i;
+ return *this;
+ }
+ };
+
+ int_wrapper begin() { return int_wrapper(AttributeList::FunctionIndex); }
+
+ int_wrapper end() { return int_wrapper(NumAttrSets - 1); }
+ };
+
+ /// Use this to iterate over the valid attribute indexes.
+ index_iterator indexes() const { return index_iterator(getNumAttrSets()); }
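
// Hedged sketch (not part of the patch): iterate every valid attribute index
// (function, return value, then each parameter) without writing the unsigned
// wrap-around by hand. AL is assumed to be an existing AttributeList.
#include "llvm/Support/raw_ostream.h"

static void dumpAttributeIndexes(const llvm::AttributeList &AL) {
  for (unsigned Idx : AL.indexes())
    if (AL.hasAttributesAtIndex(Idx))
      llvm::errs() << "index " << Idx << ": " << AL.getAsString(Idx) << "\n";
}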
/// operator==/!= - Provide equality predicates.
bool operator==(const AttributeList &RHS) const { return pImpl == RHS.pImpl; }
@@ -782,7 +898,7 @@ public:
//===----------------------------------------------------------------------===//
/// \class
/// Provide DenseMapInfo for AttributeList.
-template <> struct DenseMapInfo<AttributeList> {
+template <> struct DenseMapInfo<AttributeList, void> {
static AttributeList getEmptyKey() {
auto Val = static_cast<uintptr_t>(-1);
Val <<= PointerLikeTypeTraits<void*>::NumLowBitsAvailable;
@@ -814,14 +930,10 @@ template <> struct DenseMapInfo<AttributeList> {
class AttrBuilder {
std::bitset<Attribute::EndAttrKinds> Attrs;
std::map<SmallString<32>, SmallString<32>, std::less<>> TargetDepAttrs;
- MaybeAlign Alignment;
- MaybeAlign StackAlignment;
- uint64_t DerefBytes = 0;
- uint64_t DerefOrNullBytes = 0;
- uint64_t AllocSizeArgs = 0;
- uint64_t VScaleRangeArgs = 0;
+ std::array<uint64_t, Attribute::NumIntAttrKinds> IntAttrs = {};
std::array<Type *, Attribute::NumTypeAttrKinds> TypeAttrs = {};
+ Optional<unsigned> kindToIntIndex(Attribute::AttrKind Kind) const;
Optional<unsigned> kindToTypeIndex(Attribute::AttrKind Kind) const;
public:
@@ -891,19 +1003,31 @@ public:
/// Return true if the builder has an alignment attribute.
bool hasAlignmentAttr() const;
+ /// Return raw (possibly packed/encoded) value of integer attribute or 0 if
+ /// not set.
+ uint64_t getRawIntAttr(Attribute::AttrKind Kind) const;
+
/// Retrieve the alignment attribute, if it exists.
- MaybeAlign getAlignment() const { return Alignment; }
+ MaybeAlign getAlignment() const {
+ return MaybeAlign(getRawIntAttr(Attribute::Alignment));
+ }
/// Retrieve the stack alignment attribute, if it exists.
- MaybeAlign getStackAlignment() const { return StackAlignment; }
+ MaybeAlign getStackAlignment() const {
+ return MaybeAlign(getRawIntAttr(Attribute::StackAlignment));
+ }
/// Retrieve the number of dereferenceable bytes, if the
/// dereferenceable attribute exists (zero is returned otherwise).
- uint64_t getDereferenceableBytes() const { return DerefBytes; }
+ uint64_t getDereferenceableBytes() const {
+ return getRawIntAttr(Attribute::Dereferenceable);
+ }
/// Retrieve the number of dereferenceable_or_null bytes, if the
/// dereferenceable_or_null attribute exists (zero is returned otherwise).
- uint64_t getDereferenceableOrNullBytes() const { return DerefOrNullBytes; }
+ uint64_t getDereferenceableOrNullBytes() const {
+ return getRawIntAttr(Attribute::DereferenceableOrNull);
+ }
/// Retrieve type for the given type attribute.
Type *getTypeAttr(Attribute::AttrKind Kind) const;
@@ -933,6 +1057,9 @@ public:
/// it doesn't exist, pair(0, 0) is returned.
std::pair<unsigned, unsigned> getVScaleRangeArgs() const;
+ /// Add integer attribute with raw value (packed/encoded if necessary).
+ AttrBuilder &addRawIntAttr(Attribute::AttrKind Kind, uint64_t Value);
+
/// This turns an alignment into the form used internally in Attribute.
/// This call has no effect if Align is not set.
AttrBuilder &addAlignmentAttr(MaybeAlign Align);
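
// Hedged sketch (not part of the patch): the integer attributes now share one
// uniform raw storage slot per kind, and the typed getters above decode it.
#include "llvm/IR/Attributes.h"
#include "llvm/Support/Alignment.h"

static void intAttrRoundTrip() {
  llvm::AttrBuilder B;
  B.addAlignmentAttr(llvm::Align(16));
  B.addDereferenceableAttr(32);

  llvm::MaybeAlign A = B.getAlignment();        // decoded view
  uint64_t Bytes = B.getDereferenceableBytes(); // decoded view: 32
  uint64_t Raw = B.getRawIntAttr(llvm::Attribute::Alignment); // raw storage
  (void)A; (void)Bytes; (void)Raw;
}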
diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td
index 99b474161df7..de25b51a6292 100644
--- a/llvm/include/llvm/IR/Attributes.td
+++ b/llvm/include/llvm/IR/Attributes.td
@@ -86,6 +86,9 @@ def Dereferenceable : IntAttr<"dereferenceable", [ParamAttr, RetAttr]>;
def DereferenceableOrNull : IntAttr<"dereferenceable_or_null",
[ParamAttr, RetAttr]>;
+/// Do not instrument function with sanitizers.
+def DisableSanitizerInstrumentation: EnumAttr<"disable_sanitizer_instrumentation", [FnAttr]>;
+
/// Provide pointer element type to intrinsic.
def ElementType : TypeAttr<"elementtype", [ParamAttr]>;
diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h
index 0af4ec4ef138..184ddfc01c29 100644
--- a/llvm/include/llvm/IR/BasicBlock.h
+++ b/llvm/include/llvm/IR/BasicBlock.h
@@ -167,8 +167,8 @@ public:
/// Returns a pointer to the first instruction in this block that is not a
/// PHINode or a debug intrinsic, or any pseudo operation if \c SkipPseudoOp
/// is true.
- const Instruction *getFirstNonPHIOrDbg(bool SkipPseudoOp = false) const;
- Instruction *getFirstNonPHIOrDbg(bool SkipPseudoOp = false) {
+ const Instruction *getFirstNonPHIOrDbg(bool SkipPseudoOp = true) const;
+ Instruction *getFirstNonPHIOrDbg(bool SkipPseudoOp = true) {
return const_cast<Instruction *>(
static_cast<const BasicBlock *>(this)->getFirstNonPHIOrDbg(
SkipPseudoOp));
@@ -178,8 +178,8 @@ public:
/// PHINode, a debug intrinsic, or a lifetime intrinsic, or any pseudo
/// operation if \c SkipPseudoOp is true.
const Instruction *
- getFirstNonPHIOrDbgOrLifetime(bool SkipPseudoOp = false) const;
- Instruction *getFirstNonPHIOrDbgOrLifetime(bool SkipPseudoOp = false) {
+ getFirstNonPHIOrDbgOrLifetime(bool SkipPseudoOp = true) const;
+ Instruction *getFirstNonPHIOrDbgOrLifetime(bool SkipPseudoOp = true) {
return const_cast<Instruction *>(
static_cast<const BasicBlock *>(this)->getFirstNonPHIOrDbgOrLifetime(
SkipPseudoOp));
@@ -200,14 +200,14 @@ public:
/// SkipPseudoOp is true.
iterator_range<filter_iterator<BasicBlock::const_iterator,
std::function<bool(const Instruction &)>>>
- instructionsWithoutDebug(bool SkipPseudoOp = false) const;
+ instructionsWithoutDebug(bool SkipPseudoOp = true) const;
/// Return an iterator range over the instructions in the block, skipping any
  /// debug instructions. Skip any pseudo operations as well if \c
/// SkipPseudoOp is true.
iterator_range<
filter_iterator<BasicBlock::iterator, std::function<bool(Instruction &)>>>
- instructionsWithoutDebug(bool SkipPseudoOp = false);
+ instructionsWithoutDebug(bool SkipPseudoOp = true);
/// Return the size of the basic block ignoring debug instructions
filter_iterator<BasicBlock::const_iterator,
diff --git a/llvm/include/llvm/IR/Constant.h b/llvm/include/llvm/IR/Constant.h
index 4e2022b36e30..c8999b71f3d1 100644
--- a/llvm/include/llvm/IR/Constant.h
+++ b/llvm/include/llvm/IR/Constant.h
@@ -198,6 +198,12 @@ public:
/// hanging off of the globals.
void removeDeadConstantUsers() const;
+ /// Return true if the constant has exactly one live use.
+ ///
+ /// This returns the same result as calling Value::hasOneUse after
+ /// Constant::removeDeadConstantUsers, but doesn't remove dead constants.
+ bool hasOneLiveUse() const;
+
const Constant *stripPointerCasts() const {
return cast<Constant>(Value::stripPointerCasts());
}
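
// Hedged sketch (not part of the patch): GV is assumed to be an existing
// global. hasOneLiveUse() answers the same question as calling
// removeDeadConstantUsers() followed by hasOneUse(), but leaves the dead
// constant users in place.
#include "llvm/IR/GlobalVariable.h"

static bool hasSingleRealUser(const llvm::GlobalVariable &GV) {
  return GV.hasOneLiveUse();
}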
diff --git a/llvm/include/llvm/IR/ConstantRange.h b/llvm/include/llvm/IR/ConstantRange.h
index 44b8c395c89e..fea4d0da1d0d 100644
--- a/llvm/include/llvm/IR/ConstantRange.h
+++ b/llvm/include/llvm/IR/ConstantRange.h
@@ -128,6 +128,28 @@ public:
/// NOTE: false does not mean that inverse predicate holds!
bool icmp(CmpInst::Predicate Pred, const ConstantRange &Other) const;
+ /// Return true iff CR1 ult CR2 is equivalent to CR1 slt CR2.
+ /// Does not depend on strictness/direction of the predicate.
+ static bool
+ areInsensitiveToSignednessOfICmpPredicate(const ConstantRange &CR1,
+ const ConstantRange &CR2);
+
+ /// Return true iff CR1 ult CR2 is equivalent to CR1 sge CR2.
+ /// Does not depend on strictness/direction of the predicate.
+ static bool
+ areInsensitiveToSignednessOfInvertedICmpPredicate(const ConstantRange &CR1,
+ const ConstantRange &CR2);
+
+  /// If the comparison between the constant ranges \p CR1 and \p CR2
+ /// is insensitive to the signedness of the comparison predicate,
+ /// return a predicate equivalent to \p Pred, with flipped signedness
+ /// (i.e. unsigned instead of signed or vice versa), and maybe inverted,
+ /// otherwise returns CmpInst::Predicate::BAD_ICMP_PREDICATE.
+ static CmpInst::Predicate
+ getEquivalentPredWithFlippedSignedness(CmpInst::Predicate Pred,
+ const ConstantRange &CR1,
+ const ConstantRange &CR2);
+
/// Produce the largest range containing all X such that "X BinOp Y" is
/// guaranteed not to wrap (overflow) for *all* Y in Other. However, there may
/// be *some* Y in Other for which additional X not contained in the result
@@ -167,6 +189,11 @@ public:
/// successful.
bool getEquivalentICmp(CmpInst::Predicate &Pred, APInt &RHS) const;
+ /// Set up \p Pred, \p RHS and \p Offset such that (V + Offset) Pred RHS
+ /// is true iff V is in the range. Prefers using Offset == 0 if possible.
+ void
+ getEquivalentICmp(CmpInst::Predicate &Pred, APInt &RHS, APInt &Offset) const;
+
/// Return the lower value for this range.
const APInt &getLower() const { return Lower; }
@@ -305,6 +332,14 @@ public:
ConstantRange unionWith(const ConstantRange &CR,
PreferredRangeType Type = Smallest) const;
+ /// Intersect the two ranges and return the result if it can be represented
+ /// exactly, otherwise return None.
+ Optional<ConstantRange> exactIntersectWith(const ConstantRange &CR) const;
+
+ /// Union the two ranges and return the result if it can be represented
+ /// exactly, otherwise return None.
+ Optional<ConstantRange> exactUnionWith(const ConstantRange &CR) const;
+
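// Hedged sketch (not part of the patch): combine ranges only when the result
// is exactly representable; the 8-bit constants are purely illustrative.
#include "llvm/ADT/Optional.h"
#include "llvm/IR/ConstantRange.h"

static void exactSetOps() {
  using namespace llvm;
  ConstantRange A(APInt(8, 0), APInt(8, 10)); // [0, 10)
  ConstantRange B(APInt(8, 5), APInt(8, 20)); // [5, 20)

  // [0, 20) is a single contiguous range, so the union is exact here.
  Optional<ConstantRange> U = A.exactUnionWith(B);
  // [5, 10) likewise, so the intersection is exact as well.
  Optional<ConstantRange> I = A.exactIntersectWith(B);
  (void)U; (void)I;
}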
/// Return a new range representing the possible values resulting
/// from an application of the specified cast operator to this range. \p
/// BitWidth is the target bitwidth of the cast. For casts which don't
@@ -383,6 +418,11 @@ public:
/// treating both this and \p Other as unsigned ranges.
ConstantRange multiply(const ConstantRange &Other) const;
+ /// Return range of possible values for a signed multiplication of this and
+ /// \p Other. However, if overflow is possible always return a full range
+ /// rather than trying to determine a more precise result.
+ ConstantRange smul_fast(const ConstantRange &Other) const;
+
/// Return a new range representing the possible values resulting
/// from a signed maximum of a value in this range and a value in \p Other.
ConstantRange smax(const ConstantRange &Other) const;
diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h
index 1f716a45b70f..71414d95d9a3 100644
--- a/llvm/include/llvm/IR/Constants.h
+++ b/llvm/include/llvm/IR/Constants.h
@@ -191,19 +191,19 @@ public:
/// This is just a convenience method to make client code smaller for a
  /// common case. It also correctly performs the comparison without the
/// potential for an assertion from getZExtValue().
- bool isZero() const { return Val.isNullValue(); }
+ bool isZero() const { return Val.isZero(); }
/// This is just a convenience method to make client code smaller for a
/// common case. It also correctly performs the comparison without the
/// potential for an assertion from getZExtValue().
/// Determine if the value is one.
- bool isOne() const { return Val.isOneValue(); }
+ bool isOne() const { return Val.isOne(); }
/// This function will return true iff every bit in this constant is set
/// to true.
/// @returns true iff this constant's bits are all set to true.
/// Determine if the value is all ones.
- bool isMinusOne() const { return Val.isAllOnesValue(); }
+ bool isMinusOne() const { return Val.isAllOnes(); }
/// This function will return true iff this constant represents the largest
/// value that may be represented by the constant's type.
@@ -1287,10 +1287,6 @@ public:
/// Return a string representation for an opcode.
const char *getOpcodeName() const;
- /// Return a constant expression identical to this one, but with the specified
- /// operand set to the specified value.
- Constant *getWithOperandReplaced(unsigned OpNo, Constant *Op) const;
-
/// This returns the current constant expression with the operands replaced
/// with the specified values. The specified array must have the same number
/// of operands as our current one.
@@ -1312,13 +1308,14 @@ public:
Type *SrcTy = nullptr) const;
/// Returns an Instruction which implements the same operation as this
- /// ConstantExpr. The instruction is not linked to any basic block.
+ /// ConstantExpr. If \p InsertBefore is not null, the new instruction is
+ /// inserted before it, otherwise it is not inserted into any basic block.
///
/// A better approach to this could be to have a constructor for Instruction
/// which would take a ConstantExpr parameter, but that would have spread
/// implementation details of ConstantExpr outside of Constants.cpp, which
/// would make it harder to remove ConstantExprs altogether.
- Instruction *getAsInstruction() const;
+ Instruction *getAsInstruction(Instruction *InsertBefore = nullptr) const;
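
// Hedged sketch (not part of the patch): CE and InsertPt are assumed to be
// existing, valid pointers; passing nullptr keeps the previous behaviour of
// returning an unlinked instruction.
static llvm::Instruction *expandConstantExpr(llvm::ConstantExpr *CE,
                                             llvm::Instruction *InsertPt) {
  return CE->getAsInstruction(InsertPt); // already inserted before InsertPt
}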
/// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const Value *V) {
diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h
index 23ac47ca4d81..61c6dd885980 100644
--- a/llvm/include/llvm/IR/DIBuilder.h
+++ b/llvm/include/llvm/IR/DIBuilder.h
@@ -181,7 +181,7 @@ namespace llvm {
DIFile *File);
/// Create a single enumerator value.
- DIEnumerator *createEnumerator(StringRef Name, APSInt Value);
+ DIEnumerator *createEnumerator(StringRef Name, const APSInt &Value);
DIEnumerator *createEnumerator(StringRef Name, uint64_t Val,
bool IsUnsigned = false);
@@ -219,11 +219,12 @@ namespace llvm {
/// \param AlignInBits Alignment. (optional)
/// \param DWARFAddressSpace DWARF address space. (optional)
/// \param Name Pointer type name. (optional)
- DIDerivedType *createPointerType(DIType *PointeeTy, uint64_t SizeInBits,
- uint32_t AlignInBits = 0,
- Optional<unsigned> DWARFAddressSpace =
- None,
- StringRef Name = "");
+ /// \param Annotations Member annotations.
+ DIDerivedType *
+ createPointerType(DIType *PointeeTy, uint64_t SizeInBits,
+ uint32_t AlignInBits = 0,
+ Optional<unsigned> DWARFAddressSpace = None,
+ StringRef Name = "", DINodeArray Annotations = nullptr);
/// Create debugging information entry for a pointer to member.
/// \param PointeeTy Type pointed to by this pointer.
@@ -250,9 +251,11 @@ namespace llvm {
/// \param LineNo Line number.
/// \param Context The surrounding context for the typedef.
/// \param AlignInBits Alignment. (optional)
+ /// \param Annotations Annotations. (optional)
DIDerivedType *createTypedef(DIType *Ty, StringRef Name, DIFile *File,
unsigned LineNo, DIScope *Context,
- uint32_t AlignInBits = 0);
+ uint32_t AlignInBits = 0,
+ DINodeArray Annotations = nullptr);
/// Create debugging information entry for a 'friend'.
DIDerivedType *createFriend(DIType *Ty, DIType *FriendTy);
@@ -279,12 +282,13 @@ namespace llvm {
/// \param OffsetInBits Member offset.
/// \param Flags Flags to encode member attribute, e.g. private
/// \param Ty Parent type.
+ /// \param Annotations Member annotations.
DIDerivedType *createMemberType(DIScope *Scope, StringRef Name,
DIFile *File, unsigned LineNo,
- uint64_t SizeInBits,
- uint32_t AlignInBits,
+ uint64_t SizeInBits, uint32_t AlignInBits,
uint64_t OffsetInBits,
- DINode::DIFlags Flags, DIType *Ty);
+ DINode::DIFlags Flags, DIType *Ty,
+ DINodeArray Annotations = nullptr);
/// Create debugging information entry for a variant. A variant
/// normally should be a member of a variant part.
@@ -317,10 +321,14 @@ namespace llvm {
/// \param StorageOffsetInBits Member storage offset.
/// \param Flags Flags to encode member attribute.
/// \param Ty Parent type.
- DIDerivedType *createBitFieldMemberType(
- DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNo,
- uint64_t SizeInBits, uint64_t OffsetInBits,
- uint64_t StorageOffsetInBits, DINode::DIFlags Flags, DIType *Ty);
+ /// \param Annotations Member annotations.
+ DIDerivedType *createBitFieldMemberType(DIScope *Scope, StringRef Name,
+ DIFile *File, unsigned LineNo,
+ uint64_t SizeInBits,
+ uint64_t OffsetInBits,
+ uint64_t StorageOffsetInBits,
+ DINode::DIFlags Flags, DIType *Ty,
+ DINodeArray Annotations = nullptr);
/// Create debugging information entry for a
/// C++ static data member.
@@ -586,7 +594,7 @@ namespace llvm {
unsigned Tag, StringRef Name, DIScope *Scope, DIFile *F, unsigned Line,
unsigned RuntimeLang = 0, uint64_t SizeInBits = 0,
uint32_t AlignInBits = 0, DINode::DIFlags Flags = DINode::FlagFwdDecl,
- StringRef UniqueIdentifier = "");
+ StringRef UniqueIdentifier = "", DINodeArray Annotations = nullptr);
/// Retain DIScope* in a module even if it is not referenced
/// through debug info anchors.
@@ -636,7 +644,8 @@ namespace llvm {
DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *File,
unsigned LineNo, DIType *Ty, bool IsLocalToUnit, bool isDefined = true,
DIExpression *Expr = nullptr, MDNode *Decl = nullptr,
- MDTuple *TemplateParams = nullptr, uint32_t AlignInBits = 0);
+ MDTuple *TemplateParams = nullptr, uint32_t AlignInBits = 0,
+ DINodeArray Annotations = nullptr);
/// Identical to createGlobalVariable
/// except that the resulting DbgNode is temporary and meant to be RAUWed.
@@ -682,7 +691,8 @@ namespace llvm {
createParameterVariable(DIScope *Scope, StringRef Name, unsigned ArgNo,
DIFile *File, unsigned LineNo, DIType *Ty,
bool AlwaysPreserve = false,
- DINode::DIFlags Flags = DINode::FlagZero);
+ DINode::DIFlags Flags = DINode::FlagZero,
+ DINodeArray Annotations = nullptr);
/// Create a new descriptor for the specified
/// variable which has a complex address expression for its address.
@@ -711,6 +721,7 @@ namespace llvm {
/// \param SPFlags Additional flags specific to subprograms.
/// \param TParams Function template parameters.
/// \param ThrownTypes Exception types this function may throw.
+ /// \param Annotations Attribute Annotations.
DISubprogram *
createFunction(DIScope *Scope, StringRef Name, StringRef LinkageName,
DIFile *File, unsigned LineNo, DISubroutineType *Ty,
@@ -718,7 +729,8 @@ namespace llvm {
DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagZero,
DITemplateParameterArray TParams = nullptr,
DISubprogram *Decl = nullptr,
- DITypeArray ThrownTypes = nullptr);
+ DITypeArray ThrownTypes = nullptr,
+ DINodeArray Annotations = nullptr);
/// Identical to createFunction,
/// except that the resulting DbgNode is meant to be RAUWed.
@@ -818,29 +830,35 @@ namespace llvm {
unsigned Line, unsigned Col);
/// Create a descriptor for an imported module.
- /// \param Context The scope this module is imported into
- /// \param NS The namespace being imported here.
- /// \param File File where the declaration is located.
- /// \param Line Line number of the declaration.
+ /// \param Context The scope this module is imported into.
+ /// \param NS The namespace being imported here.
+ /// \param File File where the declaration is located.
+ /// \param Line Line number of the declaration.
+ /// \param Elements Renamed elements.
DIImportedEntity *createImportedModule(DIScope *Context, DINamespace *NS,
- DIFile *File, unsigned Line);
+ DIFile *File, unsigned Line,
+ DINodeArray Elements = nullptr);
/// Create a descriptor for an imported module.
/// \param Context The scope this module is imported into.
/// \param NS An aliased namespace.
/// \param File File where the declaration is located.
/// \param Line Line number of the declaration.
+ /// \param Elements Renamed elements.
DIImportedEntity *createImportedModule(DIScope *Context,
DIImportedEntity *NS, DIFile *File,
- unsigned Line);
+ unsigned Line,
+ DINodeArray Elements = nullptr);
/// Create a descriptor for an imported module.
- /// \param Context The scope this module is imported into.
- /// \param M The module being imported here
- /// \param File File where the declaration is located.
- /// \param Line Line number of the declaration.
+ /// \param Context The scope this module is imported into.
+ /// \param M The module being imported here.
+ /// \param File File where the declaration is located.
+ /// \param Line Line number of the declaration.
+ /// \param Elements Renamed elements.
DIImportedEntity *createImportedModule(DIScope *Context, DIModule *M,
- DIFile *File, unsigned Line);
+ DIFile *File, unsigned Line,
+ DINodeArray Elements = nullptr);
/// Create a descriptor for an imported function.
/// \param Context The scope this module is imported into.
@@ -848,9 +866,11 @@ namespace llvm {
/// variable.
/// \param File File where the declaration is located.
/// \param Line Line number of the declaration.
+ /// \param Elements Renamed elements.
DIImportedEntity *createImportedDeclaration(DIScope *Context, DINode *Decl,
DIFile *File, unsigned Line,
- StringRef Name = "");
+ StringRef Name = "",
+ DINodeArray Elements = nullptr);
/// Insert a new llvm.dbg.declare intrinsic call.
/// \param Storage llvm::Value of the variable
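
The DIBuilder hunks above thread an optional DINodeArray of annotations through the type, variable, function and import-creation APIs. Below is a minimal sketch of passing one through the new createMemberType overload; emitAnnotatedMember, the "field" name and the "btf_decl_tag"/Tag strings are illustrative assumptions, and the two-string tuple layout is just one plausible encoding, not something the interface mandates.

#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

// Hypothetical helper: emit one struct member that carries an annotation.
static DIDerivedType *emitAnnotatedMember(DIBuilder &DIB, LLVMContext &Ctx,
                                          DIScope *Scope, DIFile *File,
                                          DIType *MemberTy, StringRef Tag) {
  // One annotation element: a (name, value) tuple of strings. The element
  // format is frontend-defined; this particular layout is an assumption.
  Metadata *Pair[] = {MDString::get(Ctx, "btf_decl_tag"),
                      MDString::get(Ctx, Tag)};
  DINodeArray Annotations = DIB.getOrCreateArray({MDNode::get(Ctx, Pair)});
  return DIB.createMemberType(Scope, "field", File, /*LineNo=*/1,
                              /*SizeInBits=*/32, /*AlignInBits=*/32,
                              /*OffsetInBits=*/0, DINode::FlagZero, MemberTy,
                              Annotations);
}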
diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h
index 300f73c12df0..46acd403bef1 100644
--- a/llvm/include/llvm/IR/DataLayout.h
+++ b/llvm/include/llvm/IR/DataLayout.h
@@ -19,6 +19,7 @@
#ifndef LLVM_IR_DATALAYOUT_H
#define LLVM_IR_DATALAYOUT_H
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
@@ -135,6 +136,7 @@ private:
MM_MachO,
MM_WinCOFF,
MM_WinCOFFX86,
+ MM_GOFF,
MM_Mips,
MM_XCOFF
};
@@ -316,6 +318,7 @@ public:
switch (ManglingMode) {
case MM_None:
case MM_ELF:
+ case MM_GOFF:
case MM_Mips:
case MM_WinCOFF:
case MM_XCOFF:
@@ -334,6 +337,8 @@ public:
case MM_ELF:
case MM_WinCOFF:
return ".L";
+ case MM_GOFF:
+ return "@";
case MM_Mips:
return "$";
case MM_MachO:
@@ -372,8 +377,8 @@ public:
/// the backends/clients are updated.
unsigned getPointerSize(unsigned AS = 0) const;
- /// Returns the maximum pointer size over all address spaces.
- unsigned getMaxPointerSize() const;
+ /// Returns the maximum index size over all address spaces.
+ unsigned getMaxIndexSize() const;
// Index size used for address calculation.
unsigned getIndexSize(unsigned AS) const;
@@ -405,9 +410,9 @@ public:
return getPointerSize(AS) * 8;
}
- /// Returns the maximum pointer size over all address spaces.
- unsigned getMaxPointerSizeInBits() const {
- return getMaxPointerSize() * 8;
+ /// Returns the maximum index size over all address spaces.
+ unsigned getMaxIndexSizeInBits() const {
+ return getMaxIndexSize() * 8;
}
/// Size in bits of index used for address calculation in getelementptr.
@@ -514,7 +519,7 @@ public:
/// Returns the minimum ABI-required alignment for the specified type.
/// FIXME: Deprecate this function once migration to Align is over.
- unsigned getABITypeAlignment(Type *Ty) const;
+ uint64_t getABITypeAlignment(Type *Ty) const;
/// Returns the minimum ABI-required alignment for the specified type.
Align getABITypeAlign(Type *Ty) const;
@@ -537,7 +542,7 @@ public:
///
/// This is always at least as good as the ABI alignment.
/// FIXME: Deprecate this function once migration to Align is over.
- unsigned getPrefTypeAlignment(Type *Ty) const;
+ uint64_t getPrefTypeAlignment(Type *Ty) const;
/// Returns the preferred stack/global alignment for the specified
/// type.
@@ -579,6 +584,10 @@ public:
/// This is used to implement getelementptr.
int64_t getIndexedOffsetInType(Type *ElemTy, ArrayRef<Value *> Indices) const;
+ /// Get GEP indices to access Offset inside ElemTy. ElemTy is updated to be
+ /// the result element type and Offset to be the residual offset.
+ SmallVector<APInt> getGEPIndicesForOffset(Type *&ElemTy, APInt &Offset) const;
+
/// Returns a StructLayout object, indicating the alignment of the
/// struct, its size, and the offsets of its fields.
///
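
Besides the GOFF mangling mode and the max-pointer-size to max-index-size rename, the DataLayout hunk adds getGEPIndicesForOffset. The sketch below shows the intended use under the assumption that the caller wants a structured inbounds GEP for a plain byte offset; gepAtOffset and its parameters are invented for illustration, and the residual offset is only asserted on rather than handled.

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"
#include <cassert>

using namespace llvm;

// Hypothetical helper: convert a byte offset into GEP indices.
static Value *gepAtOffset(IRBuilder<> &B, const DataLayout &DL, Value *Base,
                          Type *SrcTy, uint64_t ByteOffset) {
  APInt Offset(DL.getIndexTypeSizeInBits(Base->getType()), ByteOffset);
  Type *ResultTy = SrcTy; // updated in place to the element type reached
  SmallVector<APInt> Indices = DL.getGEPIndicesForOffset(ResultTy, Offset);
  assert(Offset == 0 && "residual offset not handled in this sketch");

  SmallVector<Value *> IdxValues;
  for (const APInt &Idx : Indices)
    IdxValues.push_back(B.getInt(Idx));
  return B.CreateInBoundsGEP(SrcTy, Base, IdxValues);
}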
diff --git a/llvm/include/llvm/IR/DebugInfo.h b/llvm/include/llvm/IR/DebugInfo.h
index eba422a9fde6..730c69d0c622 100644
--- a/llvm/include/llvm/IR/DebugInfo.h
+++ b/llvm/include/llvm/IR/DebugInfo.h
@@ -106,8 +106,6 @@ public:
void reset();
private:
- void InitializeTypeMap(const Module &M);
-
void processCompileUnit(DICompileUnit *CU);
void processScope(DIScope *Scope);
void processType(DIType *DT);
diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h
index 20a032f04909..c04f07c534af 100644
--- a/llvm/include/llvm/IR/DebugInfoMetadata.h
+++ b/llvm/include/llvm/IR/DebugInfoMetadata.h
@@ -256,11 +256,13 @@ class GenericDINode : public DINode {
public:
unsigned getHash() const { return SubclassData32; }
- DEFINE_MDNODE_GET(GenericDINode, (unsigned Tag, StringRef Header,
- ArrayRef<Metadata *> DwarfOps),
+ DEFINE_MDNODE_GET(GenericDINode,
+ (unsigned Tag, StringRef Header,
+ ArrayRef<Metadata *> DwarfOps),
(Tag, Header, DwarfOps))
- DEFINE_MDNODE_GET(GenericDINode, (unsigned Tag, MDString *Header,
- ArrayRef<Metadata *> DwarfOps),
+ DEFINE_MDNODE_GET(GenericDINode,
+ (unsigned Tag, MDString *Header,
+ ArrayRef<Metadata *> DwarfOps),
(Tag, Header, DwarfOps))
/// Return a (temporary) clone of this.
@@ -324,7 +326,7 @@ public:
DEFINE_MDNODE_GET(DISubrange, (int64_t Count, int64_t LowerBound = 0),
(Count, LowerBound))
- DEFINE_MDNODE_GET(DISubrange, (Metadata *CountNode, int64_t LowerBound = 0),
+ DEFINE_MDNODE_GET(DISubrange, (Metadata * CountNode, int64_t LowerBound = 0),
(CountNode, LowerBound))
DEFINE_MDNODE_GET(DISubrange,
@@ -334,9 +336,7 @@ public:
TempDISubrange clone() const { return cloneImpl(); }
- Metadata *getRawCountNode() const {
- return getOperand(0).get();
- }
+ Metadata *getRawCountNode() const { return getOperand(0).get(); }
Metadata *getRawLowerBound() const { return getOperand(1).get(); }
@@ -548,14 +548,13 @@ public:
};
/// A single checksum, represented by a \a Kind and a \a Value (a string).
- template <typename T>
- struct ChecksumInfo {
+ template <typename T> struct ChecksumInfo {
/// The kind of checksum which \a Value encodes.
ChecksumKind Kind;
/// The string value of the checksum.
T Value;
- ChecksumInfo(ChecksumKind Kind, T Value) : Kind(Kind), Value(Value) { }
+ ChecksumInfo(ChecksumKind Kind, T Value) : Kind(Kind), Value(Value) {}
~ChecksumInfo() = default;
bool operator==(const ChecksumInfo<T> &X) const {
return Kind == X.Kind && Value == X.Value;
@@ -578,15 +577,17 @@ private:
static DIFile *getImpl(LLVMContext &Context, StringRef Filename,
StringRef Directory,
Optional<ChecksumInfo<StringRef>> CS,
- Optional<StringRef> Source,
- StorageType Storage, bool ShouldCreate = true) {
+ Optional<StringRef> Source, StorageType Storage,
+ bool ShouldCreate = true) {
Optional<ChecksumInfo<MDString *>> MDChecksum;
if (CS)
MDChecksum.emplace(CS->Kind, getCanonicalMDString(Context, CS->Value));
- return getImpl(Context, getCanonicalMDString(Context, Filename),
- getCanonicalMDString(Context, Directory), MDChecksum,
- Source ? Optional<MDString *>(getCanonicalMDString(Context, *Source)) : None,
- Storage, ShouldCreate);
+ return getImpl(
+ Context, getCanonicalMDString(Context, Filename),
+ getCanonicalMDString(Context, Directory), MDChecksum,
+ Source ? Optional<MDString *>(getCanonicalMDString(Context, *Source))
+ : None,
+ Storage, ShouldCreate);
}
static DIFile *getImpl(LLVMContext &Context, MDString *Filename,
MDString *Directory,
@@ -600,13 +601,15 @@ private:
}
public:
- DEFINE_MDNODE_GET(DIFile, (StringRef Filename, StringRef Directory,
- Optional<ChecksumInfo<StringRef>> CS = None,
- Optional<StringRef> Source = None),
+ DEFINE_MDNODE_GET(DIFile,
+ (StringRef Filename, StringRef Directory,
+ Optional<ChecksumInfo<StringRef>> CS = None,
+ Optional<StringRef> Source = None),
(Filename, Directory, CS, Source))
- DEFINE_MDNODE_GET(DIFile, (MDString * Filename, MDString *Directory,
- Optional<ChecksumInfo<MDString *>> CS = None,
- Optional<MDString *> Source = None),
+ DEFINE_MDNODE_GET(DIFile,
+ (MDString * Filename, MDString *Directory,
+ Optional<ChecksumInfo<MDString *>> CS = None,
+ Optional<MDString *> Source = None),
(Filename, Directory, CS, Source))
TempDIFile clone() const { return cloneImpl(); }
@@ -707,7 +710,6 @@ public:
DIScope *getScope() const { return cast_or_null<DIScope>(getRawScope()); }
StringRef getName() const { return getStringOperand(2); }
-
Metadata *getRawScope() const { return getOperand(1); }
MDString *getRawName() const { return getOperandAs<MDString>(2); }
@@ -936,47 +938,48 @@ class DIDerivedType : public DIType {
unsigned Line, DIScope *Scope, DIType *BaseType, uint64_t SizeInBits,
uint32_t AlignInBits, uint64_t OffsetInBits,
Optional<unsigned> DWARFAddressSpace, DIFlags Flags,
- Metadata *ExtraData, StorageType Storage, bool ShouldCreate = true) {
+ Metadata *ExtraData, DINodeArray Annotations, StorageType Storage,
+ bool ShouldCreate = true) {
return getImpl(Context, Tag, getCanonicalMDString(Context, Name), File,
Line, Scope, BaseType, SizeInBits, AlignInBits, OffsetInBits,
- DWARFAddressSpace, Flags, ExtraData, Storage, ShouldCreate);
- }
- static DIDerivedType *getImpl(LLVMContext &Context, unsigned Tag,
- MDString *Name, Metadata *File, unsigned Line,
- Metadata *Scope, Metadata *BaseType,
- uint64_t SizeInBits, uint32_t AlignInBits,
- uint64_t OffsetInBits,
- Optional<unsigned> DWARFAddressSpace,
- DIFlags Flags, Metadata *ExtraData,
- StorageType Storage, bool ShouldCreate = true);
+ DWARFAddressSpace, Flags, ExtraData, Annotations.get(),
+ Storage, ShouldCreate);
+ }
+ static DIDerivedType *
+ getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File,
+ unsigned Line, Metadata *Scope, Metadata *BaseType,
+ uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits,
+ Optional<unsigned> DWARFAddressSpace, DIFlags Flags,
+ Metadata *ExtraData, Metadata *Annotations, StorageType Storage,
+ bool ShouldCreate = true);
TempDIDerivedType cloneImpl() const {
- return getTemporary(getContext(), getTag(), getName(), getFile(), getLine(),
- getScope(), getBaseType(), getSizeInBits(),
- getAlignInBits(), getOffsetInBits(),
- getDWARFAddressSpace(), getFlags(), getExtraData());
+ return getTemporary(
+ getContext(), getTag(), getName(), getFile(), getLine(), getScope(),
+ getBaseType(), getSizeInBits(), getAlignInBits(), getOffsetInBits(),
+ getDWARFAddressSpace(), getFlags(), getExtraData(), getAnnotations());
}
public:
- DEFINE_MDNODE_GET(DIDerivedType,
- (unsigned Tag, MDString *Name, Metadata *File,
- unsigned Line, Metadata *Scope, Metadata *BaseType,
- uint64_t SizeInBits, uint32_t AlignInBits,
- uint64_t OffsetInBits,
- Optional<unsigned> DWARFAddressSpace, DIFlags Flags,
- Metadata *ExtraData = nullptr),
- (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
- AlignInBits, OffsetInBits, DWARFAddressSpace, Flags,
- ExtraData))
+ DEFINE_MDNODE_GET(
+ DIDerivedType,
+ (unsigned Tag, MDString *Name, Metadata *File, unsigned Line,
+ Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
+ uint32_t AlignInBits, uint64_t OffsetInBits,
+ Optional<unsigned> DWARFAddressSpace, DIFlags Flags,
+ Metadata *ExtraData = nullptr, Metadata *Annotations = nullptr),
+ (Tag, Name, File, Line, Scope, BaseType, SizeInBits, AlignInBits,
+ OffsetInBits, DWARFAddressSpace, Flags, ExtraData, Annotations))
DEFINE_MDNODE_GET(DIDerivedType,
(unsigned Tag, StringRef Name, DIFile *File, unsigned Line,
DIScope *Scope, DIType *BaseType, uint64_t SizeInBits,
uint32_t AlignInBits, uint64_t OffsetInBits,
Optional<unsigned> DWARFAddressSpace, DIFlags Flags,
- Metadata *ExtraData = nullptr),
+ Metadata *ExtraData = nullptr,
+ DINodeArray Annotations = nullptr),
(Tag, Name, File, Line, Scope, BaseType, SizeInBits,
AlignInBits, OffsetInBits, DWARFAddressSpace, Flags,
- ExtraData))
+ ExtraData, Annotations))
TempDIDerivedType clone() const { return cloneImpl(); }
@@ -999,6 +1002,12 @@ public:
Metadata *getExtraData() const { return getRawExtraData(); }
Metadata *getRawExtraData() const { return getOperand(4); }
+ /// Get annotations associated with this derived type.
+ DINodeArray getAnnotations() const {
+ return cast_or_null<MDTuple>(getRawAnnotations());
+ }
+ Metadata *getRawAnnotations() const { return getOperand(5); }
+
/// Get casted version of extra data.
/// @{
DIType *getClassType() const {
@@ -1065,8 +1074,8 @@ class DICompositeType : public DIType {
/// Change fields in place.
void mutate(unsigned Tag, unsigned Line, unsigned RuntimeLang,
- uint64_t SizeInBits, uint32_t AlignInBits,
- uint64_t OffsetInBits, DIFlags Flags) {
+ uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits,
+ DIFlags Flags) {
assert(isDistinct() && "Only distinct nodes can mutate");
assert(getRawIdentifier() && "Only ODR-uniqued nodes should mutate");
this->RuntimeLang = RuntimeLang;
@@ -1081,13 +1090,14 @@ class DICompositeType : public DIType {
DITemplateParameterArray TemplateParams, StringRef Identifier,
DIDerivedType *Discriminator, Metadata *DataLocation,
Metadata *Associated, Metadata *Allocated, Metadata *Rank,
- StorageType Storage, bool ShouldCreate = true) {
+ DINodeArray Annotations, StorageType Storage,
+ bool ShouldCreate = true) {
return getImpl(
Context, Tag, getCanonicalMDString(Context, Name), File, Line, Scope,
BaseType, SizeInBits, AlignInBits, OffsetInBits, Flags, Elements.get(),
RuntimeLang, VTableHolder, TemplateParams.get(),
getCanonicalMDString(Context, Identifier), Discriminator, DataLocation,
- Associated, Allocated, Rank, Storage, ShouldCreate);
+ Associated, Allocated, Rank, Annotations.get(), Storage, ShouldCreate);
}
static DICompositeType *
getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File,
@@ -1097,16 +1107,16 @@ class DICompositeType : public DIType {
Metadata *VTableHolder, Metadata *TemplateParams,
MDString *Identifier, Metadata *Discriminator, Metadata *DataLocation,
Metadata *Associated, Metadata *Allocated, Metadata *Rank,
- StorageType Storage, bool ShouldCreate = true);
+ Metadata *Annotations, StorageType Storage, bool ShouldCreate = true);
TempDICompositeType cloneImpl() const {
- return getTemporary(getContext(), getTag(), getName(), getFile(), getLine(),
- getScope(), getBaseType(), getSizeInBits(),
- getAlignInBits(), getOffsetInBits(), getFlags(),
- getElements(), getRuntimeLang(), getVTableHolder(),
- getTemplateParams(), getIdentifier(),
- getDiscriminator(), getRawDataLocation(),
- getRawAssociated(), getRawAllocated(), getRawRank());
+ return getTemporary(
+ getContext(), getTag(), getName(), getFile(), getLine(), getScope(),
+ getBaseType(), getSizeInBits(), getAlignInBits(), getOffsetInBits(),
+ getFlags(), getElements(), getRuntimeLang(), getVTableHolder(),
+ getTemplateParams(), getIdentifier(), getDiscriminator(),
+ getRawDataLocation(), getRawAssociated(), getRawAllocated(),
+ getRawRank(), getAnnotations());
}
public:
@@ -1119,10 +1129,12 @@ public:
DITemplateParameterArray TemplateParams = nullptr,
StringRef Identifier = "", DIDerivedType *Discriminator = nullptr,
Metadata *DataLocation = nullptr, Metadata *Associated = nullptr,
- Metadata *Allocated = nullptr, Metadata *Rank = nullptr),
+ Metadata *Allocated = nullptr, Metadata *Rank = nullptr,
+ DINodeArray Annotations = nullptr),
(Tag, Name, File, Line, Scope, BaseType, SizeInBits, AlignInBits,
OffsetInBits, Flags, Elements, RuntimeLang, VTableHolder, TemplateParams,
- Identifier, Discriminator, DataLocation, Associated, Allocated, Rank))
+ Identifier, Discriminator, DataLocation, Associated, Allocated, Rank,
+ Annotations))
DEFINE_MDNODE_GET(
DICompositeType,
(unsigned Tag, MDString *Name, Metadata *File, unsigned Line,
@@ -1132,10 +1144,11 @@ public:
Metadata *TemplateParams = nullptr, MDString *Identifier = nullptr,
Metadata *Discriminator = nullptr, Metadata *DataLocation = nullptr,
Metadata *Associated = nullptr, Metadata *Allocated = nullptr,
- Metadata *Rank = nullptr),
+ Metadata *Rank = nullptr, Metadata *Annotations = nullptr),
(Tag, Name, File, Line, Scope, BaseType, SizeInBits, AlignInBits,
OffsetInBits, Flags, Elements, RuntimeLang, VTableHolder, TemplateParams,
- Identifier, Discriminator, DataLocation, Associated, Allocated, Rank))
+ Identifier, Discriminator, DataLocation, Associated, Allocated, Rank,
+ Annotations))
TempDICompositeType clone() const { return cloneImpl(); }
@@ -1154,7 +1167,7 @@ public:
unsigned RuntimeLang, Metadata *VTableHolder,
Metadata *TemplateParams, Metadata *Discriminator,
Metadata *DataLocation, Metadata *Associated, Metadata *Allocated,
- Metadata *Rank);
+ Metadata *Rank, Metadata *Annotations);
static DICompositeType *getODRTypeIfExists(LLVMContext &Context,
MDString &Identifier);
@@ -1175,7 +1188,7 @@ public:
unsigned RuntimeLang, Metadata *VTableHolder,
Metadata *TemplateParams, Metadata *Discriminator,
Metadata *DataLocation, Metadata *Associated,
- Metadata *Allocated, Metadata *Rank);
+ Metadata *Allocated, Metadata *Rank, Metadata *Annotations);
DIType *getBaseType() const { return cast_or_null<DIType>(getRawBaseType()); }
DINodeArray getElements() const {
@@ -1196,7 +1209,9 @@ public:
Metadata *getRawTemplateParams() const { return getOperand(6); }
MDString *getRawIdentifier() const { return getOperandAs<MDString>(7); }
Metadata *getRawDiscriminator() const { return getOperand(8); }
- DIDerivedType *getDiscriminator() const { return getOperandAs<DIDerivedType>(8); }
+ DIDerivedType *getDiscriminator() const {
+ return getOperandAs<DIDerivedType>(8);
+ }
Metadata *getRawDataLocation() const { return getOperand(9); }
DIVariable *getDataLocation() const {
return dyn_cast_or_null<DIVariable>(getRawDataLocation());
@@ -1228,6 +1243,11 @@ public:
return dyn_cast_or_null<DIExpression>(getRawRank());
}
+ Metadata *getRawAnnotations() const { return getOperand(13); }
+ DINodeArray getAnnotations() const {
+ return cast_or_null<MDTuple>(getRawAnnotations());
+ }
+
/// Replace operands.
///
/// If this \a isUniqued() and not \a isResolved(), on a uniquing collision
@@ -1507,9 +1527,7 @@ public:
void replaceEnumTypes(DICompositeTypeArray N) {
replaceOperandWith(4, N.get());
}
- void replaceRetainedTypes(DITypeArray N) {
- replaceOperandWith(5, N.get());
- }
+ void replaceRetainedTypes(DITypeArray N) { replaceOperandWith(5, N.get()); }
void replaceGlobalVariables(DIGlobalVariableExpressionArray N) {
replaceOperandWith(6, N.get());
}
@@ -1691,7 +1709,8 @@ public:
/// base discriminator is set in the new DILocation, the other encoded values
/// are elided.
/// If the discriminator cannot be encoded, the function returns None.
- inline Optional<const DILocation *> cloneWithBaseDiscriminator(unsigned BD) const;
+ inline Optional<const DILocation *>
+ cloneWithBaseDiscriminator(unsigned BD) const;
/// Returns the duplication factor stored in the discriminator, or 1 if no
/// duplication factor (or 0) is encoded.
@@ -1707,7 +1726,8 @@ public:
/// duplication factor encoded in the discriminator. The current duplication
/// factor is as defined by getDuplicationFactor().
/// Returns None if encoding failed.
- inline Optional<const DILocation *> cloneByMultiplyingDuplicationFactor(unsigned DF) const;
+ inline Optional<const DILocation *>
+ cloneByMultiplyingDuplicationFactor(unsigned DF) const;
/// When two instructions are combined into a single instruction we also
/// need to combine the original locations into a single location.
@@ -1730,8 +1750,8 @@ public:
/// This function applies getMergedLocation() repeatedly left-to-right.
///
/// \p Locs: The locations to be merged.
- static
- const DILocation *getMergedLocations(ArrayRef<const DILocation *> Locs);
+ static const DILocation *
+ getMergedLocations(ArrayRef<const DILocation *> Locs);
/// Return the masked discriminator value for an input discriminator value D
/// (i.e. zero out the (B+1)-th and above bits for D, where B is 0-based).
@@ -1755,13 +1775,18 @@ public:
/// Raw encoding of the discriminator. APIs such as cloneWithDuplicationFactor
/// have certain special case behavior (e.g. treating empty duplication factor
/// as the value '1').
- /// This API, in conjunction with cloneWithDiscriminator, may be used to encode
- /// the raw values provided. \p BD: base discriminator \p DF: duplication factor
+ /// This API, in conjunction with cloneWithDiscriminator, may be used to
+ /// encode the raw values provided.
+ ///
+ /// \p BD: base discriminator
+ /// \p DF: duplication factor
/// \p CI: copy index
+ ///
/// The return is None if the values cannot be encoded in 32 bits - for
- /// example, values for BD or DF larger than 12 bits. Otherwise, the return
- /// is the encoded value.
- static Optional<unsigned> encodeDiscriminator(unsigned BD, unsigned DF, unsigned CI);
+ /// example, values for BD or DF larger than 12 bits. Otherwise, the return is
+ /// the encoded value.
+ static Optional<unsigned> encodeDiscriminator(unsigned BD, unsigned DF,
+ unsigned CI);
/// Raw decoder for values in an encoded discriminator D.
static void decodeDiscriminator(unsigned D, unsigned &BD, unsigned &DF,
@@ -1781,11 +1806,10 @@ public:
/// Returns the copy identifier for a given encoded discriminator \p D.
static unsigned getCopyIdentifierFromDiscriminator(unsigned D) {
- return getUnsignedFromPrefixEncoding(getNextComponentInDiscriminator(
- getNextComponentInDiscriminator(D)));
+ return getUnsignedFromPrefixEncoding(
+ getNextComponentInDiscriminator(getNextComponentInDiscriminator(D)));
}
-
Metadata *getRawScope() const { return getOperand(0); }
Metadata *getRawInlinedAt() const {
if (getNumOperands() == 2)
@@ -1839,10 +1863,10 @@ public:
unsigned Virtuality = SPFlagNonvirtual,
bool IsMainSubprogram = false) {
// We're assuming virtuality is the low-order field.
- static_assert(
- int(SPFlagVirtual) == int(dwarf::DW_VIRTUALITY_virtual) &&
- int(SPFlagPureVirtual) == int(dwarf::DW_VIRTUALITY_pure_virtual),
- "Virtuality constant mismatch");
+ static_assert(int(SPFlagVirtual) == int(dwarf::DW_VIRTUALITY_virtual) &&
+ int(SPFlagPureVirtual) ==
+ int(dwarf::DW_VIRTUALITY_pure_virtual),
+ "Virtuality constant mismatch");
return static_cast<DISPFlags>(
(Virtuality & SPFlagVirtuality) |
(IsLocalToUnit ? SPFlagLocalToUnit : SPFlagZero) |
@@ -1874,23 +1898,23 @@ private:
DISPFlags SPFlags, DICompileUnit *Unit,
DITemplateParameterArray TemplateParams, DISubprogram *Declaration,
DINodeArray RetainedNodes, DITypeArray ThrownTypes,
- StorageType Storage, bool ShouldCreate = true) {
+ DINodeArray Annotations, StorageType Storage,
+ bool ShouldCreate = true) {
return getImpl(Context, Scope, getCanonicalMDString(Context, Name),
getCanonicalMDString(Context, LinkageName), File, Line, Type,
ScopeLine, ContainingType, VirtualIndex, ThisAdjustment,
Flags, SPFlags, Unit, TemplateParams.get(), Declaration,
- RetainedNodes.get(), ThrownTypes.get(), Storage,
- ShouldCreate);
+ RetainedNodes.get(), ThrownTypes.get(), Annotations.get(),
+ Storage, ShouldCreate);
}
- static DISubprogram *getImpl(LLVMContext &Context, Metadata *Scope,
- MDString *Name, MDString *LinkageName,
- Metadata *File, unsigned Line, Metadata *Type,
- unsigned ScopeLine, Metadata *ContainingType,
- unsigned VirtualIndex, int ThisAdjustment,
- DIFlags Flags, DISPFlags SPFlags, Metadata *Unit,
- Metadata *TemplateParams, Metadata *Declaration,
- Metadata *RetainedNodes, Metadata *ThrownTypes,
- StorageType Storage, bool ShouldCreate = true);
+ static DISubprogram *
+ getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name,
+ MDString *LinkageName, Metadata *File, unsigned Line, Metadata *Type,
+ unsigned ScopeLine, Metadata *ContainingType, unsigned VirtualIndex,
+ int ThisAdjustment, DIFlags Flags, DISPFlags SPFlags, Metadata *Unit,
+ Metadata *TemplateParams, Metadata *Declaration,
+ Metadata *RetainedNodes, Metadata *ThrownTypes, Metadata *Annotations,
+ StorageType Storage, bool ShouldCreate = true);
TempDISubprogram cloneImpl() const {
return getTemporary(getContext(), getScope(), getName(), getLinkageName(),
@@ -1898,7 +1922,7 @@ private:
getContainingType(), getVirtualIndex(),
getThisAdjustment(), getFlags(), getSPFlags(),
getUnit(), getTemplateParams(), getDeclaration(),
- getRetainedNodes(), getThrownTypes());
+ getRetainedNodes(), getThrownTypes(), getAnnotations());
}
public:
@@ -1910,10 +1934,10 @@ public:
DIFlags Flags, DISPFlags SPFlags, DICompileUnit *Unit,
DITemplateParameterArray TemplateParams = nullptr,
DISubprogram *Declaration = nullptr, DINodeArray RetainedNodes = nullptr,
- DITypeArray ThrownTypes = nullptr),
+ DITypeArray ThrownTypes = nullptr, DINodeArray Annotations = nullptr),
(Scope, Name, LinkageName, File, Line, Type, ScopeLine, ContainingType,
VirtualIndex, ThisAdjustment, Flags, SPFlags, Unit, TemplateParams,
- Declaration, RetainedNodes, ThrownTypes))
+ Declaration, RetainedNodes, ThrownTypes, Annotations))
DEFINE_MDNODE_GET(
DISubprogram,
@@ -1922,10 +1946,11 @@ public:
Metadata *ContainingType, unsigned VirtualIndex, int ThisAdjustment,
DIFlags Flags, DISPFlags SPFlags, Metadata *Unit,
Metadata *TemplateParams = nullptr, Metadata *Declaration = nullptr,
- Metadata *RetainedNodes = nullptr, Metadata *ThrownTypes = nullptr),
+ Metadata *RetainedNodes = nullptr, Metadata *ThrownTypes = nullptr,
+ Metadata *Annotations = nullptr),
(Scope, Name, LinkageName, File, Line, Type, ScopeLine, ContainingType,
VirtualIndex, ThisAdjustment, Flags, SPFlags, Unit, TemplateParams,
- Declaration, RetainedNodes, ThrownTypes))
+ Declaration, RetainedNodes, ThrownTypes, Annotations))
TempDISubprogram clone() const { return cloneImpl(); }
@@ -1942,7 +1967,10 @@ public:
unsigned getVirtualIndex() const { return VirtualIndex; }
int getThisAdjustment() const { return ThisAdjustment; }
unsigned getScopeLine() const { return ScopeLine; }
- void setScopeLine(unsigned L) { assert(isDistinct()); ScopeLine = L; }
+ void setScopeLine(unsigned L) {
+ assert(isDistinct());
+ ScopeLine = L;
+ }
DIFlags getFlags() const { return Flags; }
DISPFlags getSPFlags() const { return SPFlags; }
bool isLocalToUnit() const { return getSPFlags() & SPFlagLocalToUnit; }
@@ -2028,6 +2056,9 @@ public:
DITypeArray getThrownTypes() const {
return cast_or_null<MDTuple>(getRawThrownTypes());
}
+ DINodeArray getAnnotations() const {
+ return cast_or_null<MDTuple>(getRawAnnotations());
+ }
Metadata *getRawScope() const { return getOperand(1); }
MDString *getRawName() const { return getOperandAs<MDString>(2); }
@@ -2045,6 +2076,9 @@ public:
Metadata *getRawThrownTypes() const {
return getNumOperands() > 10 ? getOperandAs<Metadata>(10) : nullptr;
}
+ Metadata *getRawAnnotations() const {
+ return getNumOperands() > 11 ? getOperandAs<Metadata>(11) : nullptr;
+ }
void replaceRawLinkageName(MDString *LinkageName) {
replaceOperandWith(3, LinkageName);
@@ -2112,11 +2146,13 @@ class DILexicalBlock : public DILexicalBlockBase {
}
public:
- DEFINE_MDNODE_GET(DILexicalBlock, (DILocalScope * Scope, DIFile *File,
- unsigned Line, unsigned Column),
+ DEFINE_MDNODE_GET(DILexicalBlock,
+ (DILocalScope * Scope, DIFile *File, unsigned Line,
+ unsigned Column),
(Scope, File, Line, Column))
- DEFINE_MDNODE_GET(DILexicalBlock, (Metadata * Scope, Metadata *File,
- unsigned Line, unsigned Column),
+ DEFINE_MDNODE_GET(DILexicalBlock,
+ (Metadata * Scope, Metadata *File, unsigned Line,
+ unsigned Column),
(Scope, File, Line, Column))
TempDILexicalBlock clone() const { return cloneImpl(); }
@@ -2161,8 +2197,9 @@ class DILexicalBlockFile : public DILexicalBlockBase {
}
public:
- DEFINE_MDNODE_GET(DILexicalBlockFile, (DILocalScope * Scope, DIFile *File,
- unsigned Discriminator),
+ DEFINE_MDNODE_GET(DILexicalBlockFile,
+ (DILocalScope * Scope, DIFile *File,
+ unsigned Discriminator),
(Scope, File, Discriminator))
DEFINE_MDNODE_GET(DILexicalBlockFile,
(Metadata * Scope, Metadata *File, unsigned Discriminator),
@@ -2212,7 +2249,8 @@ unsigned DILocation::getCopyIdentifier() const {
return getCopyIdentifierFromDiscriminator(getDiscriminator());
}
-Optional<const DILocation *> DILocation::cloneWithBaseDiscriminator(unsigned D) const {
+Optional<const DILocation *>
+DILocation::cloneWithBaseDiscriminator(unsigned D) const {
unsigned BD, DF, CI;
if (EnableFSDiscriminator) {
@@ -2230,7 +2268,8 @@ Optional<const DILocation *> DILocation::cloneWithBaseDiscriminator(unsigned D)
return None;
}
-Optional<const DILocation *> DILocation::cloneByMultiplyingDuplicationFactor(unsigned DF) const {
+Optional<const DILocation *>
+DILocation::cloneByMultiplyingDuplicationFactor(unsigned DF) const {
assert(!EnableFSDiscriminator && "FSDiscriminator should not call this.");
DF *= getDuplicationFactor();
@@ -2274,10 +2313,10 @@ class DINamespace : public DIScope {
public:
DEFINE_MDNODE_GET(DINamespace,
- (DIScope *Scope, StringRef Name, bool ExportSymbols),
+ (DIScope * Scope, StringRef Name, bool ExportSymbols),
(Scope, Name, ExportSymbols))
DEFINE_MDNODE_GET(DINamespace,
- (Metadata *Scope, MDString *Name, bool ExportSymbols),
+ (Metadata * Scope, MDString *Name, bool ExportSymbols),
(Scope, Name, ExportSymbols))
TempDINamespace clone() const { return cloneImpl(); }
@@ -2426,7 +2465,7 @@ public:
(StringRef Name, DIType *Type, bool IsDefault),
(Name, Type, IsDefault))
DEFINE_MDNODE_GET(DITemplateTypeParameter,
- (MDString *Name, Metadata *Type, bool IsDefault),
+ (MDString * Name, Metadata *Type, bool IsDefault),
(Name, Type, IsDefault))
TempDITemplateTypeParameter clone() const { return cloneImpl(); }
@@ -2819,7 +2858,8 @@ public:
/// \param OffsetInBits Offset of the piece in bits.
/// \param SizeInBits Size of the piece in bits.
/// \return Creating a fragment expression may fail if \c Expr
- /// contains arithmetic operations that would be truncated.
+ /// contains arithmetic operations that would be
+ /// truncated.
static Optional<DIExpression *>
createFragmentExpression(const DIExpression *Expr, unsigned OffsetInBits,
unsigned SizeInBits);
@@ -2876,6 +2916,12 @@ public:
return getNumElements() > 0 &&
getElement(0) == dwarf::DW_OP_LLVM_entry_value;
}
+
+ /// Try to shorten an expression with an initial constant operand.
+ /// Returns a new expression and constant on success, or the original
+ /// expression and constant on failure.
+ std::pair<DIExpression *, const ConstantInt *>
+ constantFold(const ConstantInt *CI);
};
inline bool operator==(const DIExpression::FragmentInfo &A,
@@ -2927,46 +2973,47 @@ class DIGlobalVariable : public DIVariable {
StringRef LinkageName, DIFile *File, unsigned Line, DIType *Type,
bool IsLocalToUnit, bool IsDefinition,
DIDerivedType *StaticDataMemberDeclaration, MDTuple *TemplateParams,
- uint32_t AlignInBits, StorageType Storage, bool ShouldCreate = true) {
+ uint32_t AlignInBits, DINodeArray Annotations, StorageType Storage,
+ bool ShouldCreate = true) {
return getImpl(Context, Scope, getCanonicalMDString(Context, Name),
getCanonicalMDString(Context, LinkageName), File, Line, Type,
IsLocalToUnit, IsDefinition, StaticDataMemberDeclaration,
- cast_or_null<Metadata>(TemplateParams), AlignInBits, Storage,
- ShouldCreate);
+ cast_or_null<Metadata>(TemplateParams), AlignInBits,
+ Annotations.get(), Storage, ShouldCreate);
}
static DIGlobalVariable *
getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name,
MDString *LinkageName, Metadata *File, unsigned Line, Metadata *Type,
bool IsLocalToUnit, bool IsDefinition,
Metadata *StaticDataMemberDeclaration, Metadata *TemplateParams,
- uint32_t AlignInBits, StorageType Storage, bool ShouldCreate = true);
+ uint32_t AlignInBits, Metadata *Annotations, StorageType Storage,
+ bool ShouldCreate = true);
TempDIGlobalVariable cloneImpl() const {
return getTemporary(getContext(), getScope(), getName(), getLinkageName(),
getFile(), getLine(), getType(), isLocalToUnit(),
isDefinition(), getStaticDataMemberDeclaration(),
- getTemplateParams(), getAlignInBits());
+ getTemplateParams(), getAlignInBits(),
+ getAnnotations());
}
public:
- DEFINE_MDNODE_GET(DIGlobalVariable,
- (DIScope * Scope, StringRef Name, StringRef LinkageName,
- DIFile *File, unsigned Line, DIType *Type,
- bool IsLocalToUnit, bool IsDefinition,
- DIDerivedType *StaticDataMemberDeclaration,
- MDTuple *TemplateParams, uint32_t AlignInBits),
- (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit,
- IsDefinition, StaticDataMemberDeclaration, TemplateParams,
- AlignInBits))
- DEFINE_MDNODE_GET(DIGlobalVariable,
- (Metadata * Scope, MDString *Name, MDString *LinkageName,
- Metadata *File, unsigned Line, Metadata *Type,
- bool IsLocalToUnit, bool IsDefinition,
- Metadata *StaticDataMemberDeclaration,
- Metadata *TemplateParams, uint32_t AlignInBits),
- (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit,
- IsDefinition, StaticDataMemberDeclaration, TemplateParams,
- AlignInBits))
+ DEFINE_MDNODE_GET(
+ DIGlobalVariable,
+ (DIScope * Scope, StringRef Name, StringRef LinkageName, DIFile *File,
+ unsigned Line, DIType *Type, bool IsLocalToUnit, bool IsDefinition,
+ DIDerivedType *StaticDataMemberDeclaration, MDTuple *TemplateParams,
+ uint32_t AlignInBits, DINodeArray Annotations),
+ (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, IsDefinition,
+ StaticDataMemberDeclaration, TemplateParams, AlignInBits, Annotations))
+ DEFINE_MDNODE_GET(
+ DIGlobalVariable,
+ (Metadata * Scope, MDString *Name, MDString *LinkageName, Metadata *File,
+ unsigned Line, Metadata *Type, bool IsLocalToUnit, bool IsDefinition,
+ Metadata *StaticDataMemberDeclaration, Metadata *TemplateParams,
+ uint32_t AlignInBits, Metadata *Annotations),
+ (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, IsDefinition,
+ StaticDataMemberDeclaration, TemplateParams, AlignInBits, Annotations))
TempDIGlobalVariable clone() const { return cloneImpl(); }
@@ -2977,11 +3024,15 @@ public:
DIDerivedType *getStaticDataMemberDeclaration() const {
return cast_or_null<DIDerivedType>(getRawStaticDataMemberDeclaration());
}
+ DINodeArray getAnnotations() const {
+ return cast_or_null<MDTuple>(getRawAnnotations());
+ }
MDString *getRawLinkageName() const { return getOperandAs<MDString>(5); }
Metadata *getRawStaticDataMemberDeclaration() const { return getOperand(6); }
Metadata *getRawTemplateParams() const { return getOperand(7); }
MDTuple *getTemplateParams() const { return getOperandAs<MDTuple>(7); }
+ Metadata *getRawAnnotations() const { return getOperand(8); }
static bool classof(const Metadata *MD) {
return MD->getMetadataID() == DIGlobalVariableKind;
@@ -2997,20 +3048,20 @@ class DICommonBlock : public DIScope {
DICommonBlock(LLVMContext &Context, StorageType Storage, unsigned LineNo,
ArrayRef<Metadata *> Ops)
: DIScope(Context, DICommonBlockKind, Storage, dwarf::DW_TAG_common_block,
- Ops), LineNo(LineNo) {}
+ Ops),
+ LineNo(LineNo) {}
static DICommonBlock *getImpl(LLVMContext &Context, DIScope *Scope,
DIGlobalVariable *Decl, StringRef Name,
DIFile *File, unsigned LineNo,
- StorageType Storage,
- bool ShouldCreate = true) {
+ StorageType Storage, bool ShouldCreate = true) {
return getImpl(Context, Scope, Decl, getCanonicalMDString(Context, Name),
File, LineNo, Storage, ShouldCreate);
}
static DICommonBlock *getImpl(LLVMContext &Context, Metadata *Scope,
Metadata *Decl, MDString *Name, Metadata *File,
- unsigned LineNo,
- StorageType Storage, bool ShouldCreate = true);
+ unsigned LineNo, StorageType Storage,
+ bool ShouldCreate = true);
TempDICommonBlock cloneImpl() const {
return getTemporary(getContext(), getScope(), getDecl(), getName(),
@@ -3019,11 +3070,11 @@ class DICommonBlock : public DIScope {
public:
DEFINE_MDNODE_GET(DICommonBlock,
- (DIScope *Scope, DIGlobalVariable *Decl, StringRef Name,
+ (DIScope * Scope, DIGlobalVariable *Decl, StringRef Name,
DIFile *File, unsigned LineNo),
(Scope, Decl, Name, File, LineNo))
DEFINE_MDNODE_GET(DICommonBlock,
- (Metadata *Scope, Metadata *Decl, MDString *Name,
+ (Metadata * Scope, Metadata *Decl, MDString *Name,
Metadata *File, unsigned LineNo),
(Scope, Decl, Name, File, LineNo))
@@ -3069,34 +3120,39 @@ class DILocalVariable : public DIVariable {
static DILocalVariable *getImpl(LLVMContext &Context, DIScope *Scope,
StringRef Name, DIFile *File, unsigned Line,
DIType *Type, unsigned Arg, DIFlags Flags,
- uint32_t AlignInBits, StorageType Storage,
+ uint32_t AlignInBits, DINodeArray Annotations,
+ StorageType Storage,
bool ShouldCreate = true) {
return getImpl(Context, Scope, getCanonicalMDString(Context, Name), File,
- Line, Type, Arg, Flags, AlignInBits, Storage, ShouldCreate);
+ Line, Type, Arg, Flags, AlignInBits, Annotations.get(),
+ Storage, ShouldCreate);
}
static DILocalVariable *getImpl(LLVMContext &Context, Metadata *Scope,
MDString *Name, Metadata *File, unsigned Line,
Metadata *Type, unsigned Arg, DIFlags Flags,
- uint32_t AlignInBits, StorageType Storage,
+ uint32_t AlignInBits, Metadata *Annotations,
+ StorageType Storage,
bool ShouldCreate = true);
TempDILocalVariable cloneImpl() const {
return getTemporary(getContext(), getScope(), getName(), getFile(),
getLine(), getType(), getArg(), getFlags(),
- getAlignInBits());
+ getAlignInBits(), getAnnotations());
}
public:
DEFINE_MDNODE_GET(DILocalVariable,
(DILocalScope * Scope, StringRef Name, DIFile *File,
unsigned Line, DIType *Type, unsigned Arg, DIFlags Flags,
- uint32_t AlignInBits),
- (Scope, Name, File, Line, Type, Arg, Flags, AlignInBits))
+ uint32_t AlignInBits, DINodeArray Annotations),
+ (Scope, Name, File, Line, Type, Arg, Flags, AlignInBits,
+ Annotations))
DEFINE_MDNODE_GET(DILocalVariable,
(Metadata * Scope, MDString *Name, Metadata *File,
- unsigned Line, Metadata *Type, unsigned Arg,
- DIFlags Flags, uint32_t AlignInBits),
- (Scope, Name, File, Line, Type, Arg, Flags, AlignInBits))
+ unsigned Line, Metadata *Type, unsigned Arg, DIFlags Flags,
+ uint32_t AlignInBits, Metadata *Annotations),
+ (Scope, Name, File, Line, Type, Arg, Flags, AlignInBits,
+ Annotations))
TempDILocalVariable clone() const { return cloneImpl(); }
@@ -3111,6 +3167,11 @@ public:
unsigned getArg() const { return Arg; }
DIFlags getFlags() const { return Flags; }
+ DINodeArray getAnnotations() const {
+ return cast_or_null<MDTuple>(getRawAnnotations());
+ }
+ Metadata *getRawAnnotations() const { return getOperand(4); }
+
bool isArtificial() const { return getFlags() & FlagArtificial; }
bool isObjectPointer() const { return getFlags() & FlagObjectPointer; }
@@ -3141,16 +3202,14 @@ class DILabel : public DINode {
: DINode(C, DILabelKind, Storage, dwarf::DW_TAG_label, Ops), Line(Line) {}
~DILabel() = default;
- static DILabel *getImpl(LLVMContext &Context, DIScope *Scope,
- StringRef Name, DIFile *File, unsigned Line,
- StorageType Storage,
+ static DILabel *getImpl(LLVMContext &Context, DIScope *Scope, StringRef Name,
+ DIFile *File, unsigned Line, StorageType Storage,
bool ShouldCreate = true) {
return getImpl(Context, Scope, getCanonicalMDString(Context, Name), File,
Line, Storage, ShouldCreate);
}
- static DILabel *getImpl(LLVMContext &Context, Metadata *Scope,
- MDString *Name, Metadata *File, unsigned Line,
- StorageType Storage,
+ static DILabel *getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name,
+ Metadata *File, unsigned Line, StorageType Storage,
bool ShouldCreate = true);
TempDILabel cloneImpl() const {
@@ -3295,31 +3354,33 @@ class DIImportedEntity : public DINode {
static DIImportedEntity *getImpl(LLVMContext &Context, unsigned Tag,
DIScope *Scope, DINode *Entity, DIFile *File,
unsigned Line, StringRef Name,
- StorageType Storage,
+ DINodeArray Elements, StorageType Storage,
bool ShouldCreate = true) {
return getImpl(Context, Tag, Scope, Entity, File, Line,
- getCanonicalMDString(Context, Name), Storage, ShouldCreate);
+ getCanonicalMDString(Context, Name), Elements.get(), Storage,
+ ShouldCreate);
}
- static DIImportedEntity *getImpl(LLVMContext &Context, unsigned Tag,
- Metadata *Scope, Metadata *Entity,
- Metadata *File, unsigned Line,
- MDString *Name, StorageType Storage,
- bool ShouldCreate = true);
+ static DIImportedEntity *
+ getImpl(LLVMContext &Context, unsigned Tag, Metadata *Scope, Metadata *Entity,
+ Metadata *File, unsigned Line, MDString *Name, Metadata *Elements,
+ StorageType Storage, bool ShouldCreate = true);
TempDIImportedEntity cloneImpl() const {
return getTemporary(getContext(), getTag(), getScope(), getEntity(),
- getFile(), getLine(), getName());
+ getFile(), getLine(), getName(), getElements());
}
public:
DEFINE_MDNODE_GET(DIImportedEntity,
(unsigned Tag, DIScope *Scope, DINode *Entity, DIFile *File,
- unsigned Line, StringRef Name = ""),
- (Tag, Scope, Entity, File, Line, Name))
+ unsigned Line, StringRef Name = "",
+ DINodeArray Elements = nullptr),
+ (Tag, Scope, Entity, File, Line, Name, Elements))
DEFINE_MDNODE_GET(DIImportedEntity,
(unsigned Tag, Metadata *Scope, Metadata *Entity,
- Metadata *File, unsigned Line, MDString *Name),
- (Tag, Scope, Entity, File, Line, Name))
+ Metadata *File, unsigned Line, MDString *Name,
+ Metadata *Elements = nullptr),
+ (Tag, Scope, Entity, File, Line, Name, Elements))
TempDIImportedEntity clone() const { return cloneImpl(); }
@@ -3328,11 +3389,15 @@ public:
DINode *getEntity() const { return cast_or_null<DINode>(getRawEntity()); }
StringRef getName() const { return getStringOperand(2); }
DIFile *getFile() const { return cast_or_null<DIFile>(getRawFile()); }
+ DINodeArray getElements() const {
+ return cast_or_null<MDTuple>(getRawElements());
+ }
Metadata *getRawScope() const { return getOperand(0); }
Metadata *getRawEntity() const { return getOperand(1); }
MDString *getRawName() const { return getOperandAs<MDString>(2); }
Metadata *getRawFile() const { return getOperand(3); }
+ Metadata *getRawElements() const { return getOperand(4); }
static bool classof(const Metadata *MD) {
return MD->getMetadataID() == DIImportedEntityKind;
@@ -3457,11 +3522,13 @@ class DIMacro : public DIMacroNode {
}
public:
- DEFINE_MDNODE_GET(DIMacro, (unsigned MIType, unsigned Line, StringRef Name,
- StringRef Value = ""),
+ DEFINE_MDNODE_GET(DIMacro,
+ (unsigned MIType, unsigned Line, StringRef Name,
+ StringRef Value = ""),
(MIType, Line, Name, Value))
- DEFINE_MDNODE_GET(DIMacro, (unsigned MIType, unsigned Line, MDString *Name,
- MDString *Value),
+ DEFINE_MDNODE_GET(DIMacro,
+ (unsigned MIType, unsigned Line, MDString *Name,
+ MDString *Value),
(MIType, Line, Name, Value))
TempDIMacro clone() const { return cloneImpl(); }
@@ -3508,11 +3575,13 @@ class DIMacroFile : public DIMacroNode {
}
public:
- DEFINE_MDNODE_GET(DIMacroFile, (unsigned MIType, unsigned Line, DIFile *File,
- DIMacroNodeArray Elements),
+ DEFINE_MDNODE_GET(DIMacroFile,
+ (unsigned MIType, unsigned Line, DIFile *File,
+ DIMacroNodeArray Elements),
(MIType, Line, File, Elements))
- DEFINE_MDNODE_GET(DIMacroFile, (unsigned MIType, unsigned Line,
- Metadata *File, Metadata *Elements),
+ DEFINE_MDNODE_GET(DIMacroFile,
+ (unsigned MIType, unsigned Line, Metadata *File,
+ Metadata *Elements),
(MIType, Line, File, Elements))
TempDIMacroFile clone() const { return cloneImpl(); }
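
On the metadata side, the annotations land as a trailing operand with a getAnnotations()/getRawAnnotations() pair on DIDerivedType, DICompositeType, DISubprogram, DIGlobalVariable and DILocalVariable, and DIImportedEntity gains an analogous Elements operand. A small sketch that dumps annotations from a DISubprogram; printAnnotations is invented, and the tuple-of-MDStrings element shape is an assumption mirroring the builder sketch earlier, with any other shape simply skipped.

#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hypothetical helper: print every string found in a subprogram's annotations.
static void printAnnotations(const DISubprogram *SP, raw_ostream &OS) {
  DINodeArray Annotations = SP->getAnnotations();
  if (!Annotations)
    return; // operand absent (e.g. metadata built without annotations)
  for (const MDOperand &Op : Annotations->operands())
    if (const auto *Tuple = dyn_cast<MDTuple>(Op))
      for (const MDOperand &Elt : Tuple->operands())
        if (const auto *Str = dyn_cast<MDString>(Elt))
          OS << Str->getString() << ' ';
  OS << '\n';
}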
diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h
index b68a912b5f70..8a1b26e699e3 100644
--- a/llvm/include/llvm/IR/DerivedTypes.h
+++ b/llvm/include/llvm/IR/DerivedTypes.h
@@ -49,10 +49,11 @@ public:
/// This enum is just used to hold constants we need for IntegerType.
enum {
MIN_INT_BITS = 1, ///< Minimum number of bits that can be specified
- MAX_INT_BITS = (1<<24)-1 ///< Maximum number of bits that can be specified
+ MAX_INT_BITS = (1<<23) ///< Maximum number of bits that can be specified
///< Note that bit width is stored in the Type class's SubclassData field
- ///< which has 24 bits. This yields a maximum bit width of 16,777,215
- ///< bits.
+ ///< which has 24 bits. SelectionDAG type legalization can require a
+ ///< power of 2 IntegerType, so limit to the largest representable power
+ ///< of 2, 8388608.
};
/// This static method is the primary way of constructing an IntegerType.
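
Since the ceiling drops from (1<<24)-1 to 1<<23 bits, code that builds arbitrary-width integers may want an explicit range check before calling IntegerType::get. A trivial, hypothetical guard (getIntTypeOrNull is not an LLVM API):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

// Hypothetical guard around the tightened width limit (now 8,388,608 bits).
static IntegerType *getIntTypeOrNull(LLVMContext &Ctx, unsigned Bits) {
  if (Bits < IntegerType::MIN_INT_BITS || Bits > IntegerType::MAX_INT_BITS)
    return nullptr; // outside the supported range
  return IntegerType::get(Ctx, Bits);
}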
diff --git a/llvm/include/llvm/IR/DiagnosticInfo.h b/llvm/include/llvm/IR/DiagnosticInfo.h
index 5064f4f4edf7..73b0be43e136 100644
--- a/llvm/include/llvm/IR/DiagnosticInfo.h
+++ b/llvm/include/llvm/IR/DiagnosticInfo.h
@@ -33,6 +33,7 @@ namespace llvm {
// Forward declarations.
class DiagnosticPrinter;
+class CallInst;
class Function;
class Instruction;
class InstructionCost;
@@ -79,6 +80,7 @@ enum DiagnosticKind {
DK_PGOProfile,
DK_Unsupported,
DK_SrcMgr,
+ DK_DontCall,
DK_FirstPluginKind // Must be last value to work with
// getNextAvailablePluginDiagnosticKind
};
@@ -194,10 +196,9 @@ public:
/// \p Fn The function to which this diagnostic applies.
/// \p ResourceSize The computed size of the consumed resource.
DiagnosticInfoResourceLimit(const Function &Fn, const char *ResourceName,
- uint64_t ResourceSize,
+ uint64_t ResourceSize, uint64_t ResourceLimit,
DiagnosticSeverity Severity = DS_Warning,
- DiagnosticKind Kind = DK_ResourceLimit,
- uint64_t ResourceLimit = 0)
+ DiagnosticKind Kind = DK_ResourceLimit)
: DiagnosticInfo(Kind, Severity), Fn(Fn), ResourceName(ResourceName),
ResourceSize(ResourceSize), ResourceLimit(ResourceLimit) {}
@@ -218,10 +219,10 @@ class DiagnosticInfoStackSize : public DiagnosticInfoResourceLimit {
void anchor() override;
public:
DiagnosticInfoStackSize(const Function &Fn, uint64_t StackSize,
- DiagnosticSeverity Severity = DS_Warning,
- uint64_t StackLimit = 0)
- : DiagnosticInfoResourceLimit(Fn, "stack frame size", StackSize, Severity,
- DK_StackSize, StackLimit) {}
+ uint64_t StackLimit,
+ DiagnosticSeverity Severity = DS_Warning)
+ : DiagnosticInfoResourceLimit(Fn, "stack frame size", StackSize,
+ StackLimit, Severity, DK_StackSize) {}
uint64_t getStackSize() const { return getResourceSize(); }
uint64_t getStackLimit() const { return getResourceLimit(); }
@@ -1070,6 +1071,27 @@ public:
}
};
+void diagnoseDontCall(const CallInst &CI);
+
+class DiagnosticInfoDontCall : public DiagnosticInfo {
+ StringRef CalleeName;
+ StringRef Note;
+ unsigned LocCookie;
+
+public:
+ DiagnosticInfoDontCall(StringRef CalleeName, StringRef Note,
+ DiagnosticSeverity DS, unsigned LocCookie)
+ : DiagnosticInfo(DK_DontCall, DS), CalleeName(CalleeName), Note(Note),
+ LocCookie(LocCookie) {}
+ StringRef getFunctionName() const { return CalleeName; }
+ StringRef getNote() const { return Note; }
+ unsigned getLocCookie() const { return LocCookie; }
+ void print(DiagnosticPrinter &DP) const override;
+ static bool classof(const DiagnosticInfo *DI) {
+ return DI->getKind() == DK_DontCall;
+ }
+};
+
} // end namespace llvm
#endif // LLVM_IR_DIAGNOSTICINFO_H
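
The new DK_DontCall kind, raised through diagnoseDontCall above, routes must-not-call reports through the regular diagnostic machinery. Below is a sketch of a handler that recognizes it; handleDiagnostic is a stand-in for whatever callback a client installs, and the message wording is invented.

#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hypothetical diagnostic callback that special-cases the new kind.
static void handleDiagnostic(const DiagnosticInfo &DI, void * /*Context*/) {
  if (const auto *DC = dyn_cast<DiagnosticInfoDontCall>(&DI)) {
    errs() << "call to '" << DC->getFunctionName() << "' was flagged";
    if (!DC->getNote().empty())
      errs() << ": " << DC->getNote();
    errs() << '\n';
  }
}

Registering it with LLVMContext::setDiagnosticHandlerCallBack(handleDiagnostic) before running codegen would route DK_DontCall reports through this path.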
diff --git a/llvm/include/llvm/IR/DiagnosticPrinter.h b/llvm/include/llvm/IR/DiagnosticPrinter.h
index 102932ceefa5..2df6fc3dfe73 100644
--- a/llvm/include/llvm/IR/DiagnosticPrinter.h
+++ b/llvm/include/llvm/IR/DiagnosticPrinter.h
@@ -1,4 +1,4 @@
-//===- llvm/Support/DiagnosticPrinter.h - Diagnostic Printer ----*- C++ -*-===//
+//===- llvm/IR/DiagnosticPrinter.h - Diagnostic Printer ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/include/llvm/IR/Dominators.h b/llvm/include/llvm/IR/Dominators.h
index 4d140c3ad0f2..475355af5647 100644
--- a/llvm/include/llvm/IR/Dominators.h
+++ b/llvm/include/llvm/IR/Dominators.h
@@ -277,6 +277,12 @@ struct DominatorTreeVerifierPass : PassInfoMixin<DominatorTreeVerifierPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
+/// Enables verification of dominator trees.
+///
+/// This check is expensive and is disabled by default. `-verify-dom-info`
+/// allows selectively enabling the check without needing to recompile.
+extern bool VerifyDomInfo;
+
/// Legacy analysis pass which computes a \c DominatorTree.
class DominatorTreeWrapperPass : public FunctionPass {
DominatorTree DT;
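
With the VerifyDomInfo declaration exported, code outside the analysis can consult whether -verify-dom-info was requested. A sketch of the intended pattern after a CFG update; maybeVerifyDomTree is an invented name, and DominatorTree::verify() is the existing recompute-and-compare check the flag gates.

#include "llvm/IR/Dominators.h"
#include "llvm/Support/ErrorHandling.h"

using namespace llvm;

// Hypothetical post-update check, active only under -verify-dom-info.
static void maybeVerifyDomTree(const DominatorTree &DT) {
  if (VerifyDomInfo && !DT.verify())
    report_fatal_error("dominator tree is out of date after CFG update");
}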
diff --git a/llvm/include/llvm/IR/FPEnv.h b/llvm/include/llvm/IR/FPEnv.h
index 621540000b5c..bf435ec6d109 100644
--- a/llvm/include/llvm/IR/FPEnv.h
+++ b/llvm/include/llvm/IR/FPEnv.h
@@ -39,24 +39,30 @@ enum ExceptionBehavior : uint8_t {
/// Returns a valid RoundingMode enumerator when given a string
/// that is valid as input in constrained intrinsic rounding mode
/// metadata.
-Optional<RoundingMode> StrToRoundingMode(StringRef);
+Optional<RoundingMode> convertStrToRoundingMode(StringRef);
/// For any RoundingMode enumerator, returns a string valid as input in
/// constrained intrinsic rounding mode metadata.
-Optional<StringRef> RoundingModeToStr(RoundingMode);
+Optional<StringRef> convertRoundingModeToStr(RoundingMode);
/// Returns a valid ExceptionBehavior enumerator when given a string
/// valid as input in constrained intrinsic exception behavior metadata.
-Optional<fp::ExceptionBehavior> StrToExceptionBehavior(StringRef);
+Optional<fp::ExceptionBehavior> convertStrToExceptionBehavior(StringRef);
/// For any ExceptionBehavior enumerator, returns a string valid as
/// input in constrained intrinsic exception behavior metadata.
-Optional<StringRef> ExceptionBehaviorToStr(fp::ExceptionBehavior);
+Optional<StringRef> convertExceptionBehaviorToStr(fp::ExceptionBehavior);
/// Returns true if the exception handling behavior and rounding mode
/// match what is used in the default floating point environment.
inline bool isDefaultFPEnvironment(fp::ExceptionBehavior EB, RoundingMode RM) {
return EB == fp::ebIgnore && RM == RoundingMode::NearestTiesToEven;
}
+
+/// Returns true if the rounding mode \p RM may be equal to \p QRM at compile
+/// time or at run time.
+inline bool canRoundingModeBe(RoundingMode RM, RoundingMode QRM) {
+ return RM == QRM || RM == RoundingMode::Dynamic;
+}
}
#endif
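
The string/enum helpers gain a convert prefix, and canRoundingModeBe answers whether a possibly Dynamic mode can resolve to a given mode at run time. A small sketch combining both; mightRoundToNearest is an invented name.

#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/FPEnv.h"

using namespace llvm;

// Hypothetical query over a constrained-intrinsic rounding-mode string.
static bool mightRoundToNearest(StringRef MDText) {
  Optional<RoundingMode> RM = convertStrToRoundingMode(MDText);
  if (!RM)
    return false; // not a recognized metadata string
  // Dynamic answers true as well, since the mode is only known at run time.
  return canRoundingModeBe(*RM, RoundingMode::NearestTiesToEven);
}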
diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h
index e0094e2afff2..669418eacbb0 100644
--- a/llvm/include/llvm/IR/Function.h
+++ b/llvm/include/llvm/IR/Function.h
@@ -48,6 +48,7 @@ typedef unsigned ID;
class AssemblyAnnotationWriter;
class Constant;
+struct DenormalMode;
class DISubprogram;
class LLVMContext;
class Module;
@@ -58,7 +59,8 @@ class User;
class BranchProbabilityInfo;
class BlockFrequencyInfo;
-class Function : public GlobalObject, public ilist_node<Function> {
+class LLVM_EXTERNAL_VISIBILITY Function : public GlobalObject,
+ public ilist_node<Function> {
public:
using BasicBlockListType = SymbolTableList<BasicBlock>;
@@ -245,72 +247,22 @@ public:
setValueSubclassData((getSubclassDataFromValue() & 0xc00f) | (ID << 4));
}
- /// Return the attribute list for this Function.
- AttributeList getAttributes() const { return AttributeSets; }
-
- /// Set the attribute list for this Function.
- void setAttributes(AttributeList Attrs) { AttributeSets = Attrs; }
-
- /// Add function attributes to this function.
- void addFnAttr(Attribute::AttrKind Kind) {
- addAttribute(AttributeList::FunctionIndex, Kind);
- }
-
- /// Add function attributes to this function.
- void addFnAttr(StringRef Kind, StringRef Val = StringRef()) {
- addAttribute(AttributeList::FunctionIndex,
- Attribute::get(getContext(), Kind, Val));
- }
-
- /// Add function attributes to this function.
- void addFnAttr(Attribute Attr) {
- addAttribute(AttributeList::FunctionIndex, Attr);
- }
-
- /// Remove function attributes from this function.
- void removeFnAttr(Attribute::AttrKind Kind) {
- removeAttribute(AttributeList::FunctionIndex, Kind);
- }
-
- /// Remove function attribute from this function.
- void removeFnAttr(StringRef Kind) {
- setAttributes(getAttributes().removeAttribute(
- getContext(), AttributeList::FunctionIndex, Kind));
- }
-
- /// A function will have the "coroutine.presplit" attribute if it's
- /// a coroutine and has not gone through full CoroSplit pass.
- bool isPresplitCoroutine() const {
- return hasFnAttribute("coroutine.presplit");
- }
-
- enum ProfileCountType { PCT_Invalid, PCT_Real, PCT_Synthetic };
+ enum ProfileCountType { PCT_Real, PCT_Synthetic };
/// Class to represent profile counts.
///
/// This class represents both real and synthetic profile counts.
class ProfileCount {
private:
- uint64_t Count;
- ProfileCountType PCT;
- static ProfileCount Invalid;
+ uint64_t Count = 0;
+ ProfileCountType PCT = PCT_Real;
public:
- ProfileCount() : Count(-1), PCT(PCT_Invalid) {}
ProfileCount(uint64_t Count, ProfileCountType PCT)
: Count(Count), PCT(PCT) {}
- bool hasValue() const { return PCT != PCT_Invalid; }
uint64_t getCount() const { return Count; }
ProfileCountType getType() const { return PCT; }
bool isSynthetic() const { return PCT == PCT_Synthetic; }
- explicit operator bool() { return hasValue(); }
- bool operator!() const { return !hasValue(); }
- // Update the count retaining the same profile count type.
- ProfileCount &setCount(uint64_t C) {
- Count = C;
- return *this;
- }
- static ProfileCount getInvalid() { return ProfileCount(-1, PCT_Invalid); }
};
/// Set the entry count for this function.
@@ -330,7 +282,7 @@ public:
///
/// Entry count is the number of times the function was executed.
/// When AllowSynthetic is false, only pgo_data will be returned.
- ProfileCount getEntryCount(bool AllowSynthetic = false) const;
+ Optional<ProfileCount> getEntryCount(bool AllowSynthetic = false) const;
/// Return true if the function is annotated with profile data.
///
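
ProfileCount loses its invalid state, so getEntryCount now returns Optional<ProfileCount> instead of a sentinel value. A sketch of the updated caller pattern; getEntryCountOrZero is an invented helper.

#include "llvm/ADT/Optional.h"
#include "llvm/IR/Function.h"

using namespace llvm;

// Hypothetical caller updated for the Optional-based return value.
static uint64_t getEntryCountOrZero(const Function &F) {
  if (Optional<Function::ProfileCount> PC = F.getEntryCount())
    return PC->getCount();
  return 0; // no real or synthetic entry count attached
}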
@@ -351,43 +303,6 @@ public:
/// Get the section prefix for this function.
Optional<StringRef> getSectionPrefix() const;
- /// Return true if the function has the attribute.
- bool hasFnAttribute(Attribute::AttrKind Kind) const {
- return AttributeSets.hasFnAttribute(Kind);
- }
-
- /// Return true if the function has the attribute.
- bool hasFnAttribute(StringRef Kind) const {
- return AttributeSets.hasFnAttribute(Kind);
- }
-
- /// Return the attribute for the given attribute kind.
- Attribute getFnAttribute(Attribute::AttrKind Kind) const {
- return getAttribute(AttributeList::FunctionIndex, Kind);
- }
-
- /// Return the attribute for the given attribute kind.
- Attribute getFnAttribute(StringRef Kind) const {
- return getAttribute(AttributeList::FunctionIndex, Kind);
- }
-
- /// Return the stack alignment for the function.
- unsigned getFnStackAlignment() const {
- if (!hasFnAttribute(Attribute::StackAlignment))
- return 0;
- if (const auto MA =
- AttributeSets.getStackAlignment(AttributeList::FunctionIndex))
- return MA->value();
- return 0;
- }
-
- /// Return the stack alignment for the function.
- MaybeAlign getFnStackAlign() const {
- if (!hasFnAttribute(Attribute::StackAlignment))
- return None;
- return AttributeSets.getStackAlignment(AttributeList::FunctionIndex);
- }
-
/// hasGC/getGC/setGC/clearGC - The name of the garbage collection algorithm
/// to use during code generation.
bool hasGC() const {
@@ -397,17 +312,36 @@ public:
void setGC(std::string Str);
void clearGC();
- /// Returns true if the function has ssp, sspstrong, or sspreq fn attrs.
- bool hasStackProtectorFnAttr() const;
+ /// Return the attribute list for this Function.
+ AttributeList getAttributes() const { return AttributeSets; }
- /// adds the attribute to the list of attributes.
- void addAttribute(unsigned i, Attribute::AttrKind Kind);
+ /// Set the attribute list for this Function.
+ void setAttributes(AttributeList Attrs) { AttributeSets = Attrs; }
+ // TODO: remove non-AtIndex versions of these methods.
/// adds the attribute to the list of attributes.
- void addAttribute(unsigned i, Attribute Attr);
+ void addAttributeAtIndex(unsigned i, Attribute Attr);
+
+ /// Add function attributes to this function.
+ void addFnAttr(Attribute::AttrKind Kind);
+
+ /// Add function attributes to this function.
+ void addFnAttr(StringRef Kind, StringRef Val = StringRef());
+
+ /// Add function attributes to this function.
+ void addFnAttr(Attribute Attr);
+
+ /// Add function attributes to this function.
+ void addFnAttrs(const AttrBuilder &Attrs);
- /// adds the attributes to the list of attributes.
- void addAttributes(unsigned i, const AttrBuilder &Attrs);
+ /// Add return value attributes to this function.
+ void addRetAttr(Attribute::AttrKind Kind);
+
+ /// Add return value attributes to this function.
+ void addRetAttr(Attribute Attr);
+
+ /// Add return value attributes to this function.
+ void addRetAttrs(const AttrBuilder &Attrs);
/// adds the attribute to the list of attributes for the given arg.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind);
@@ -419,13 +353,27 @@ public:
void addParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs);
/// removes the attribute from the list of attributes.
- void removeAttribute(unsigned i, Attribute::AttrKind Kind);
+ void removeAttributeAtIndex(unsigned i, Attribute::AttrKind Kind);
/// removes the attribute from the list of attributes.
- void removeAttribute(unsigned i, StringRef Kind);
+ void removeAttributeAtIndex(unsigned i, StringRef Kind);
+
+ /// Remove function attributes from this function.
+ void removeFnAttr(Attribute::AttrKind Kind);
+
+ /// Remove function attribute from this function.
+ void removeFnAttr(StringRef Kind);
+
+ void removeFnAttrs(const AttrBuilder &Attrs);
- /// removes the attributes from the list of attributes.
- void removeAttributes(unsigned i, const AttrBuilder &Attrs);
+ /// removes the attribute from the return value list of attributes.
+ void removeRetAttr(Attribute::AttrKind Kind);
+
+ /// removes the attribute from the return value list of attributes.
+ void removeRetAttr(StringRef Kind);
+
+ /// removes the attributes from the return value list of attributes.
+ void removeRetAttrs(const AttrBuilder &Attrs);
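The Fn/Ret/Param entry points above replace raw AttributeList indices on Function. A short sketch of the new surface; the helper name markNoInlineNonNull and the particular attributes chosen are illustrative:

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/Function.h"
    using namespace llvm;

    // Illustrative: adjust attributes through the per-position API rather
    // than addAttribute(AttributeList::FunctionIndex, ...).
    void markNoInlineNonNull(Function &F) {
      F.addFnAttr(Attribute::NoInline);          // function-level enum attribute
      F.addFnAttr("frame-pointer", "all");       // function-level string attribute
      F.addRetAttr(Attribute::NonNull);          // return-value attribute
      if (F.hasFnAttribute(Attribute::OptimizeNone))
        F.removeFnAttr(Attribute::OptimizeNone); // removal mirrors addition
    }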
/// removes the attribute from the list of attributes.
void removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind);
@@ -436,54 +384,57 @@ public:
/// removes the attribute from the list of attributes.
void removeParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs);
- /// removes noundef and other attributes that imply undefined behavior if a
- /// `undef` or `poison` value is passed from the list of attributes.
- void removeParamUndefImplyingAttrs(unsigned ArgNo);
+ /// Return true if the function has the attribute.
+ bool hasFnAttribute(Attribute::AttrKind Kind) const;
- /// check if an attributes is in the list of attributes.
- bool hasAttribute(unsigned i, Attribute::AttrKind Kind) const {
- return getAttributes().hasAttribute(i, Kind);
- }
+ /// Return true if the function has the attribute.
+ bool hasFnAttribute(StringRef Kind) const;
- /// check if an attributes is in the list of attributes.
- bool hasParamAttribute(unsigned ArgNo, Attribute::AttrKind Kind) const {
- return getAttributes().hasParamAttribute(ArgNo, Kind);
- }
+ /// check if an attribute is in the list of attributes for the return value.
+ bool hasRetAttribute(Attribute::AttrKind Kind) const;
- /// gets the specified attribute from the list of attributes.
- Attribute getParamAttribute(unsigned ArgNo, Attribute::AttrKind Kind) const {
- return getAttributes().getParamAttr(ArgNo, Kind);
- }
+ /// check if an attribute is in the list of attributes.
+ bool hasParamAttribute(unsigned ArgNo, Attribute::AttrKind Kind) const;
/// gets the attribute from the list of attributes.
- Attribute getAttribute(unsigned i, Attribute::AttrKind Kind) const {
- return AttributeSets.getAttribute(i, Kind);
- }
+ Attribute getAttributeAtIndex(unsigned i, Attribute::AttrKind Kind) const;
/// gets the attribute from the list of attributes.
- Attribute getAttribute(unsigned i, StringRef Kind) const {
- return AttributeSets.getAttribute(i, Kind);
+ Attribute getAttributeAtIndex(unsigned i, StringRef Kind) const;
+
+ /// Return the attribute for the given attribute kind.
+ Attribute getFnAttribute(Attribute::AttrKind Kind) const;
+
+ /// Return the attribute for the given attribute kind.
+ Attribute getFnAttribute(StringRef Kind) const;
+
+ /// gets the specified attribute from the list of attributes.
+ Attribute getParamAttribute(unsigned ArgNo, Attribute::AttrKind Kind) const;
+
+ /// removes noundef and other attributes that imply undefined behavior if a
+ /// `undef` or `poison` value is passed from the list of attributes.
+ void removeParamUndefImplyingAttrs(unsigned ArgNo);
+
+ /// Return the stack alignment for the function.
+ MaybeAlign getFnStackAlign() const {
+ return AttributeSets.getFnStackAlignment();
}
- /// adds the dereferenceable attribute to the list of attributes.
- void addDereferenceableAttr(unsigned i, uint64_t Bytes);
+ /// Returns true if the function has ssp, sspstrong, or sspreq fn attrs.
+ bool hasStackProtectorFnAttr() const;
/// adds the dereferenceable attribute to the list of attributes for
/// the given arg.
void addDereferenceableParamAttr(unsigned ArgNo, uint64_t Bytes);
/// adds the dereferenceable_or_null attribute to the list of
- /// attributes.
- void addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes);
-
- /// adds the dereferenceable_or_null attribute to the list of
/// attributes for the given arg.
void addDereferenceableOrNullParamAttr(unsigned ArgNo, uint64_t Bytes);
/// Extract the alignment for a call or parameter (0=unknown).
/// FIXME: Remove this function once transition to Align is over.
/// Use getParamAlign() instead.
- unsigned getParamAlignment(unsigned ArgNo) const {
+ uint64_t getParamAlignment(unsigned ArgNo) const {
if (const auto MA = getParamAlign(ArgNo))
return MA->value();
return 0;
@@ -517,11 +468,9 @@ public:
return AttributeSets.getParamByRefType(ArgNo);
}
- /// Extract the number of dereferenceable bytes for a call or
- /// parameter (0=unknown).
- /// @param i AttributeList index, referring to a return value or argument.
- uint64_t getDereferenceableBytes(unsigned i) const {
- return AttributeSets.getDereferenceableBytes(i);
+ /// Extract the preallocated type for a parameter.
+ Type *getParamPreallocatedType(unsigned ArgNo) const {
+ return AttributeSets.getParamPreallocatedType(ArgNo);
}
/// Extract the number of dereferenceable bytes for a parameter.
@@ -530,13 +479,6 @@ public:
return AttributeSets.getParamDereferenceableBytes(ArgNo);
}
- /// Extract the number of dereferenceable_or_null bytes for a call or
- /// parameter (0=unknown).
- /// @param i AttributeList index, referring to a return value or argument.
- uint64_t getDereferenceableOrNullBytes(unsigned i) const {
- return AttributeSets.getDereferenceableOrNullBytes(i);
- }
-
/// Extract the number of dereferenceable_or_null bytes for a
/// parameter.
/// @param ArgNo AttributeList ArgNo, referring to an argument.
@@ -544,6 +486,12 @@ public:
return AttributeSets.getParamDereferenceableOrNullBytes(ArgNo);
}
+ /// A function will have the "coroutine.presplit" attribute if it's
+ /// a coroutine and has not yet gone through the full CoroSplit pass.
+ bool isPresplitCoroutine() const {
+ return hasFnAttribute("coroutine.presplit");
+ }
+
/// Determine if the function does not access memory.
bool doesNotAccessMemory() const {
return hasFnAttribute(Attribute::ReadNone);
@@ -692,19 +640,16 @@ public:
/// Determine if the function returns a structure through first
/// or second pointer argument.
bool hasStructRetAttr() const {
- return AttributeSets.hasParamAttribute(0, Attribute::StructRet) ||
- AttributeSets.hasParamAttribute(1, Attribute::StructRet);
+ return AttributeSets.hasParamAttr(0, Attribute::StructRet) ||
+ AttributeSets.hasParamAttr(1, Attribute::StructRet);
}
/// Determine if the parameter or return value is marked with NoAlias
/// attribute.
bool returnDoesNotAlias() const {
- return AttributeSets.hasAttribute(AttributeList::ReturnIndex,
- Attribute::NoAlias);
- }
- void setReturnDoesNotAlias() {
- addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
+ return AttributeSets.hasRetAttr(Attribute::NoAlias);
}
+ void setReturnDoesNotAlias() { addRetAttr(Attribute::NoAlias); }
/// Do not optimize this function (-O0).
bool hasOptNone() const { return hasFnAttribute(Attribute::OptimizeNone); }
@@ -904,13 +849,14 @@ public:
/// hasAddressTaken - returns true if there are any uses of this function
/// other than direct calls or invokes to it, or blockaddress expressions.
/// Optionally passes back an offending user for diagnostic purposes,
- /// ignores callback uses, assume like pointer annotation calls, and
- /// references in llvm.used and llvm.compiler.used variables.
- ///
+ /// ignores callback uses, assume-like pointer annotation calls, references in
+ /// llvm.used and llvm.compiler.used variables, and operand bundle
+ /// "clang.arc.attachedcall".
bool hasAddressTaken(const User ** = nullptr,
bool IgnoreCallbackUses = false,
bool IgnoreAssumeLikeCalls = true,
- bool IngoreLLVMUsed = false) const;
+ bool IngoreLLVMUsed = false,
+ bool IgnoreARCAttachedCall = false) const;
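A sketch of the widened hasAddressTaken() query above; the helper name isOnlyCalledDirectly and the particular flag settings are illustrative:

    #include "llvm/IR/Function.h"
    using namespace llvm;

    // Illustrative: treat llvm.used references and "clang.arc.attachedcall"
    // bundle uses as not taking the address of F.
    bool isOnlyCalledDirectly(const Function &F) {
      const User *Offender = nullptr;
      return !F.hasAddressTaken(&Offender,
                                /*IgnoreCallbackUses=*/false,
                                /*IgnoreAssumeLikeCalls=*/true,
                                /*IngoreLLVMUsed=*/true,
                                /*IgnoreARCAttachedCall=*/true);
    }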
/// isDefTriviallyDead - Return true if it is trivially safe to remove
/// this function definition from the module (because it isn't externally
diff --git a/llvm/include/llvm/IR/GCStrategy.h b/llvm/include/llvm/IR/GCStrategy.h
index a69958d596c6..4fa8e3a8dcf4 100644
--- a/llvm/include/llvm/IR/GCStrategy.h
+++ b/llvm/include/llvm/IR/GCStrategy.h
@@ -131,6 +131,9 @@ public:
/// GCMetadataPrinterRegistery as well.
using GCRegistry = Registry<GCStrategy>;
+/// Lookup the GCStrategy object associated with the given gc name.
+std::unique_ptr<GCStrategy> getGCStrategy(const StringRef Name);
+
} // end namespace llvm
#endif // LLVM_IR_GCSTRATEGY_H
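A small sketch of the new by-name lookup; the wrapper name lookupStrategyFor is illustrative, and the handling of an unregistered name is assumed to be done by getGCStrategy itself:

    #include <memory>
    #include "llvm/IR/Function.h"
    #include "llvm/IR/GCStrategy.h"
    using namespace llvm;

    // Illustrative: fetch the GCStrategy registered under F's gc name.
    std::unique_ptr<GCStrategy> lookupStrategyFor(const Function &F) {
      if (!F.hasGC())
        return nullptr;
      // Assumption: an unknown strategy name is diagnosed by getGCStrategy.
      return getGCStrategy(F.getGC());
    }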
diff --git a/llvm/include/llvm/IR/GlobalAlias.h b/llvm/include/llvm/IR/GlobalAlias.h
index f2d9b9676ec9..01134448a8fa 100644
--- a/llvm/include/llvm/IR/GlobalAlias.h
+++ b/llvm/include/llvm/IR/GlobalAlias.h
@@ -15,7 +15,8 @@
#define LLVM_IR_GLOBALALIAS_H
#include "llvm/ADT/ilist_node.h"
-#include "llvm/IR/GlobalIndirectSymbol.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/Value.h"
namespace llvm {
@@ -24,8 +25,7 @@ class Twine;
class Module;
template <typename ValueSubClass> class SymbolTableListTraits;
-class GlobalAlias : public GlobalIndirectSymbol,
- public ilist_node<GlobalAlias> {
+class GlobalAlias : public GlobalValue, public ilist_node<GlobalAlias> {
friend class SymbolTableListTraits<GlobalAlias>;
GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage,
@@ -58,6 +58,17 @@ public:
// Linkage, Type, Parent and AddressSpace taken from the Aliasee.
static GlobalAlias *create(const Twine &Name, GlobalValue *Aliasee);
+ // allocate space for exactly one operand
+ void *operator new(size_t S) { return User::operator new(S, 1); }
+ void operator delete(void *Ptr) { User::operator delete(Ptr); }
+
+ /// Provide fast operand accessors
+ DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant);
+
+ void copyAttributesFrom(const GlobalAlias *Src) {
+ GlobalValue::copyAttributesFrom(Src);
+ }
+
/// removeFromParent - This method unlinks 'this' from the containing module,
/// but does not delete it.
///
@@ -71,10 +82,14 @@ public:
/// These methods retrieve and set alias target.
void setAliasee(Constant *Aliasee);
const Constant *getAliasee() const {
- return getIndirectSymbol();
+ return static_cast<Constant *>(Op<0>().get());
}
- Constant *getAliasee() {
- return getIndirectSymbol();
+ Constant *getAliasee() { return static_cast<Constant *>(Op<0>().get()); }
+
+ const GlobalObject *getAliaseeObject() const;
+ GlobalObject *getAliaseeObject() {
+ return const_cast<GlobalObject *>(
+ static_cast<const GlobalAlias *>(this)->getAliaseeObject());
}
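A brief sketch of resolving an alias through the renamed accessor; the wrapper name resolveAlias is illustrative:

    #include "llvm/IR/GlobalAlias.h"
    #include "llvm/IR/GlobalObject.h"
    using namespace llvm;

    // Illustrative: getAliaseeObject() replaces the old getBaseObject() /
    // getIndirectSymbol() pair; it may return null when the aliasee does
    // not strip back to a single GlobalObject.
    const GlobalObject *resolveAlias(const GlobalAlias &GA) {
      return GA.getAliaseeObject();
    }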
static bool isValidLinkage(LinkageTypes L) {
@@ -88,6 +103,12 @@ public:
}
};
+template <>
+struct OperandTraits<GlobalAlias>
+ : public FixedNumOperandTraits<GlobalAlias, 1> {};
+
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(GlobalAlias, Constant)
+
} // end namespace llvm
#endif // LLVM_IR_GLOBALALIAS_H
diff --git a/llvm/include/llvm/IR/GlobalIFunc.h b/llvm/include/llvm/IR/GlobalIFunc.h
index ddd29c8a4a19..10088ee2fff4 100644
--- a/llvm/include/llvm/IR/GlobalIFunc.h
+++ b/llvm/include/llvm/IR/GlobalIFunc.h
@@ -18,7 +18,9 @@
#define LLVM_IR_GLOBALIFUNC_H
#include "llvm/ADT/ilist_node.h"
-#include "llvm/IR/GlobalIndirectSymbol.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/Value.h"
namespace llvm {
@@ -29,8 +31,7 @@ class Module;
// Traits class for using GlobalIFunc in symbol table in Module.
template <typename ValueSubClass> class SymbolTableListTraits;
-class GlobalIFunc final : public GlobalIndirectSymbol,
- public ilist_node<GlobalIFunc> {
+class GlobalIFunc final : public GlobalObject, public ilist_node<GlobalIFunc> {
friend class SymbolTableListTraits<GlobalIFunc>;
GlobalIFunc(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage,
@@ -46,6 +47,17 @@ public:
LinkageTypes Linkage, const Twine &Name,
Constant *Resolver, Module *Parent);
+ // allocate space for exactly one operand
+ void *operator new(size_t S) { return User::operator new(S, 1); }
+ void operator delete(void *Ptr) { User::operator delete(Ptr); }
+
+ /// Provide fast operand accessors
+ DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant);
+
+ void copyAttributesFrom(const GlobalIFunc *Src) {
+ GlobalObject::copyAttributesFrom(Src);
+ }
+
/// This method unlinks 'this' from the containing module, but does not
/// delete it.
void removeFromParent();
@@ -54,14 +66,22 @@ public:
void eraseFromParent();
/// These methods retrieve and set ifunc resolver function.
- void setResolver(Constant *Resolver) {
- setIndirectSymbol(Resolver);
- }
+ void setResolver(Constant *Resolver) { Op<0>().set(Resolver); }
const Constant *getResolver() const {
- return getIndirectSymbol();
+ return static_cast<Constant *>(Op<0>().get());
}
- Constant *getResolver() {
- return getIndirectSymbol();
+ Constant *getResolver() { return static_cast<Constant *>(Op<0>().get()); }
+
+ // Return the resolver function after peeling off potential ConstantExpr
+ // indirection.
+ const Function *getResolverFunction() const;
+ Function *getResolverFunction() {
+ return const_cast<Function *>(
+ static_cast<const GlobalIFunc *>(this)->getResolverFunction());
+ }
+
+ static FunctionType *getResolverFunctionType(Type *IFuncValTy) {
+ return FunctionType::get(IFuncValTy->getPointerTo(), false);
}
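A minimal sketch of the new resolver helpers; the wrapper names are illustrative:

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/GlobalIFunc.h"
    using namespace llvm;

    // Illustrative: the resolver function behind an ifunc, looking through
    // any ConstantExpr casts wrapping it.
    Function *resolverOf(GlobalIFunc &IF) { return IF.getResolverFunction(); }

    // Illustrative: the type every resolver for IF must have, i.e. a
    // zero-argument function returning a pointer to IF's value type.
    FunctionType *expectedResolverType(GlobalIFunc &IF) {
      return GlobalIFunc::getResolverFunctionType(IF.getValueType());
    }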
// Methods for support type inquiry through isa, cast, and dyn_cast:
@@ -70,6 +90,12 @@ public:
}
};
+template <>
+struct OperandTraits<GlobalIFunc>
+ : public FixedNumOperandTraits<GlobalIFunc, 1> {};
+
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(GlobalIFunc, Constant)
+
} // end namespace llvm
#endif // LLVM_IR_GLOBALIFUNC_H
diff --git a/llvm/include/llvm/IR/GlobalIndirectSymbol.h b/llvm/include/llvm/IR/GlobalIndirectSymbol.h
deleted file mode 100644
index e45c7529885d..000000000000
--- a/llvm/include/llvm/IR/GlobalIndirectSymbol.h
+++ /dev/null
@@ -1,93 +0,0 @@
-//===- llvm/GlobalIndirectSymbol.h - GlobalIndirectSymbol class -*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the declaration of the GlobalIndirectSymbol class, which
-// is a base class for GlobalAlias and GlobalIFunc. It contains all common code
-// for aliases and ifuncs.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_IR_GLOBALINDIRECTSYMBOL_H
-#define LLVM_IR_GLOBALINDIRECTSYMBOL_H
-
-#include "llvm/IR/GlobalObject.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/OperandTraits.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include <cstddef>
-
-namespace llvm {
-
-class GlobalIndirectSymbol : public GlobalValue {
-protected:
- GlobalIndirectSymbol(Type *Ty, ValueTy VTy, unsigned AddressSpace,
- LinkageTypes Linkage, const Twine &Name, Constant *Symbol);
-
-public:
- GlobalIndirectSymbol(const GlobalIndirectSymbol &) = delete;
- GlobalIndirectSymbol &operator=(const GlobalIndirectSymbol &) = delete;
-
- // allocate space for exactly one operand
- void *operator new(size_t S) { return User::operator new(S, 1); }
- void operator delete(void *Ptr) { User::operator delete(Ptr); }
-
- /// Provide fast operand accessors
- DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant);
-
- void copyAttributesFrom(const GlobalValue *Src) {
- GlobalValue::copyAttributesFrom(Src);
- }
-
- /// These methods set and retrieve indirect symbol.
- void setIndirectSymbol(Constant *Symbol) {
- setOperand(0, Symbol);
- }
- const Constant *getIndirectSymbol() const {
- return getOperand(0);
- }
- Constant *getIndirectSymbol() {
- return const_cast<Constant *>(
- static_cast<const GlobalIndirectSymbol *>(this)->getIndirectSymbol());
- }
-
- const GlobalObject *getBaseObject() const;
- GlobalObject *getBaseObject() {
- return const_cast<GlobalObject *>(
- static_cast<const GlobalIndirectSymbol *>(this)->getBaseObject());
- }
-
- const GlobalObject *getBaseObject(const DataLayout &DL, APInt &Offset) const {
- return dyn_cast<GlobalObject>(
- getIndirectSymbol()->stripAndAccumulateInBoundsConstantOffsets(DL,
- Offset));
- }
- GlobalObject *getBaseObject(const DataLayout &DL, APInt &Offset) {
- return const_cast<GlobalObject *>(
- static_cast<const GlobalIndirectSymbol *>(this)
- ->getBaseObject(DL, Offset));
- }
-
- // Methods for support type inquiry through isa, cast, and dyn_cast:
- static bool classof(const Value *V) {
- return V->getValueID() == Value::GlobalAliasVal ||
- V->getValueID() == Value::GlobalIFuncVal;
- }
-};
-
-template <>
-struct OperandTraits<GlobalIndirectSymbol> :
- public FixedNumOperandTraits<GlobalIndirectSymbol, 1> {
-};
-
-DEFINE_TRANSPARENT_OPERAND_ACCESSORS(GlobalIndirectSymbol, Constant)
-
-} // end namespace llvm
-
-#endif // LLVM_IR_GLOBALINDIRECTSYMBOL_H
diff --git a/llvm/include/llvm/IR/GlobalObject.h b/llvm/include/llvm/IR/GlobalObject.h
index 341fbec66080..e15cf718bb10 100644
--- a/llvm/include/llvm/IR/GlobalObject.h
+++ b/llvm/include/llvm/IR/GlobalObject.h
@@ -51,7 +51,7 @@ protected:
Comdat *ObjComdat;
enum {
- LastAlignmentBit = 4,
+ LastAlignmentBit = 5,
HasSectionHashEntryBit,
GlobalObjectBits,
@@ -68,7 +68,7 @@ public:
GlobalObject(const GlobalObject &) = delete;
/// FIXME: Remove this function once transition to Align is over.
- unsigned getAlignment() const {
+ uint64_t getAlignment() const {
MaybeAlign Align = getAlign();
return Align ? Align->value() : 0;
}
@@ -153,7 +153,8 @@ public:
// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const Value *V) {
return V->getValueID() == Value::FunctionVal ||
- V->getValueID() == Value::GlobalVariableVal;
+ V->getValueID() == Value::GlobalVariableVal ||
+ V->getValueID() == Value::GlobalIFuncVal;
}
private:
diff --git a/llvm/include/llvm/IR/GlobalValue.h b/llvm/include/llvm/IR/GlobalValue.h
index cf704d1f2374..1818f2a8f3cc 100644
--- a/llvm/include/llvm/IR/GlobalValue.h
+++ b/llvm/include/llvm/IR/GlobalValue.h
@@ -302,11 +302,14 @@ public:
static bool isAvailableExternallyLinkage(LinkageTypes Linkage) {
return Linkage == AvailableExternallyLinkage;
}
+ static bool isLinkOnceAnyLinkage(LinkageTypes Linkage) {
+ return Linkage == LinkOnceAnyLinkage;
+ }
static bool isLinkOnceODRLinkage(LinkageTypes Linkage) {
return Linkage == LinkOnceODRLinkage;
}
static bool isLinkOnceLinkage(LinkageTypes Linkage) {
- return Linkage == LinkOnceAnyLinkage || Linkage == LinkOnceODRLinkage;
+ return isLinkOnceAnyLinkage(Linkage) || isLinkOnceODRLinkage(Linkage);
}
static bool isWeakAnyLinkage(LinkageTypes Linkage) {
return Linkage == WeakAnyLinkage;
@@ -433,6 +436,9 @@ public:
return isAvailableExternallyLinkage(getLinkage());
}
bool hasLinkOnceLinkage() const { return isLinkOnceLinkage(getLinkage()); }
+ bool hasLinkOnceAnyLinkage() const {
+ return isLinkOnceAnyLinkage(getLinkage());
+ }
bool hasLinkOnceODRLinkage() const {
return isLinkOnceODRLinkage(getLinkage());
}
@@ -548,10 +554,10 @@ public:
return !(isDeclarationForLinker() || isWeakForLinker());
}
- const GlobalObject *getBaseObject() const;
- GlobalObject *getBaseObject() {
+ const GlobalObject *getAliaseeObject() const;
+ GlobalObject *getAliaseeObject() {
return const_cast<GlobalObject *>(
- static_cast<const GlobalValue *>(this)->getBaseObject());
+ static_cast<const GlobalValue *>(this)->getAliaseeObject());
}
/// Returns whether this is a reference to an absolute symbol.
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 8998ad0f94a9..b4e099e4ec20 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -316,7 +316,7 @@ public:
/// Set the exception handling to be used with constrained floating point
void setDefaultConstrainedExcept(fp::ExceptionBehavior NewExcept) {
#ifndef NDEBUG
- Optional<StringRef> ExceptStr = ExceptionBehaviorToStr(NewExcept);
+ Optional<StringRef> ExceptStr = convertExceptionBehaviorToStr(NewExcept);
assert(ExceptStr.hasValue() && "Garbage strict exception behavior!");
#endif
DefaultConstrainedExcept = NewExcept;
@@ -325,7 +325,7 @@ public:
/// Set the rounding mode handling to be used with constrained floating point
void setDefaultConstrainedRounding(RoundingMode NewRounding) {
#ifndef NDEBUG
- Optional<StringRef> RoundingStr = RoundingModeToStr(NewRounding);
+ Optional<StringRef> RoundingStr = convertRoundingModeToStr(NewRounding);
assert(RoundingStr.hasValue() && "Garbage strict rounding mode!");
#endif
DefaultConstrainedRounding = NewRounding;
@@ -351,7 +351,7 @@ public:
}
void setConstrainedFPCallAttr(CallBase *I) {
- I->addAttribute(AttributeList::FunctionIndex, Attribute::StrictFP);
+ I->addFnAttr(Attribute::StrictFP);
}
void setDefaultOperandBundles(ArrayRef<OperandBundleDef> OpBundles) {
@@ -697,12 +697,16 @@ public:
MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr,
MDNode *NoAliasTag = nullptr);
- /// Create a vector fadd reduction intrinsic of the source vector.
- /// The first parameter is a scalar accumulator value for ordered reductions.
+ /// Create a sequential vector fadd reduction intrinsic of the source vector.
+ /// The first parameter is a scalar accumulator value. An unordered reduction
+ /// can be created by adding the reassoc fast-math flag to the resulting
+ /// sequential reduction.
CallInst *CreateFAddReduce(Value *Acc, Value *Src);
- /// Create a vector fmul reduction intrinsic of the source vector.
- /// The first parameter is a scalar accumulator value for ordered reductions.
+ /// Create a sequential vector fmul reduction intrinsic of the source vector.
+ /// The first parameter is a scalar accumulator value. An unordered reduction
+ /// can be created by adding the reassoc fast-math flag to the resulting
+ /// sequential reduction.
CallInst *CreateFMulReduce(Value *Acc, Value *Src);
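As the reworded comments say, the emitted reduction is sequential, and an unordered form is obtained by tagging the call with the reassoc fast-math flag. A sketch, assuming Acc is a scalar float and Src a matching float vector; the helper name buildFAddReduction is illustrative:

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Illustrative: build an ordered or reassociable fadd reduction of Src
    // seeded with Acc.
    Value *buildFAddReduction(IRBuilder<> &B, Value *Acc, Value *Src,
                              bool Unordered) {
      CallInst *Red = B.CreateFAddReduce(Acc, Src);
      if (Unordered) {
        // 'reassoc' relaxes the sequential semantics of the intrinsic call.
        FastMathFlags FMF = Red->getFastMathFlags();
        FMF.setAllowReassoc();
        Red->setFastMathFlags(FMF);
      }
      return Red;
    }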
/// Create a vector int add reduction intrinsic of the source vector.
@@ -1172,7 +1176,7 @@ private:
if (Rounding.hasValue())
UseRounding = Rounding.getValue();
- Optional<StringRef> RoundingStr = RoundingModeToStr(UseRounding);
+ Optional<StringRef> RoundingStr = convertRoundingModeToStr(UseRounding);
assert(RoundingStr.hasValue() && "Garbage strict rounding mode!");
auto *RoundingMDS = MDString::get(Context, RoundingStr.getValue());
@@ -1185,7 +1189,7 @@ private:
if (Except.hasValue())
UseExcept = Except.getValue();
- Optional<StringRef> ExceptStr = ExceptionBehaviorToStr(UseExcept);
+ Optional<StringRef> ExceptStr = convertExceptionBehaviorToStr(UseExcept);
assert(ExceptStr.hasValue() && "Garbage strict exception behavior!");
auto *ExceptMDS = MDString::get(Context, ExceptStr.getValue());
@@ -2448,6 +2452,16 @@ public:
return CreateExtractElement(Vec, getInt64(Idx), Name);
}
+ Value *CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx,
+ const Twine &Name = "") {
+ return CreateInsertElement(PoisonValue::get(VecTy), NewElt, Idx, Name);
+ }
+
+ Value *CreateInsertElement(Type *VecTy, Value *NewElt, uint64_t Idx,
+ const Twine &Name = "") {
+ return CreateInsertElement(PoisonValue::get(VecTy), NewElt, Idx, Name);
+ }
+
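The two overloads above start from a poison base vector instead of an explicit first operand. A short sketch, with an assumed <4 x float> element type; the name seedLaneZero is chosen for illustration:

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Illustrative: begin a <4 x float> value from poison and fill lane 0,
    // leaving the remaining lanes for later inserts or a splat shuffle.
    Value *seedLaneZero(IRBuilder<> &B, Value *Scalar) {
      Type *VecTy = FixedVectorType::get(B.getFloatTy(), 4);
      return B.CreateInsertElement(VecTy, Scalar, uint64_t(0), "vec.seed");
    }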
Value *CreateInsertElement(Value *Vec, Value *NewElt, Value *Idx,
const Twine &Name = "") {
if (auto *VC = dyn_cast<Constant>(Vec))
diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
index ef2c279ed455..143a87f4997d 100644
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -19,6 +19,7 @@
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
@@ -755,6 +756,20 @@ public:
using PredicateField =
Bitfield::Element<Predicate, 0, 6, LAST_ICMP_PREDICATE>;
+ /// Returns the sequence of all FCmp predicates.
+ static auto FCmpPredicates() {
+ return enum_seq_inclusive(Predicate::FIRST_FCMP_PREDICATE,
+ Predicate::LAST_FCMP_PREDICATE,
+ force_iteration_on_noniterable_enum);
+ }
+
+ /// Returns the sequence of all ICmp predicates.
+ static auto ICmpPredicates() {
+ return enum_seq_inclusive(Predicate::FIRST_ICMP_PREDICATE,
+ Predicate::LAST_ICMP_PREDICATE,
+ force_iteration_on_noniterable_enum);
+ }
+
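The two helpers return iterable sequences built with enum_seq_inclusive, so predicate loops no longer need manual integer casts over the enum range. A sketch; the function name dumpICmpPredicates is illustrative:

    #include "llvm/IR/InstrTypes.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Illustrative: walk every integer comparison predicate by name.
    void dumpICmpPredicates() {
      for (CmpInst::Predicate Pred : CmpInst::ICmpPredicates())
        errs() << CmpInst::getPredicateName(Pred) << "\n";
    }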
protected:
CmpInst(Type *ty, Instruction::OtherOps op, Predicate pred,
Value *LHS, Value *RHS, const Twine &Name = "",
@@ -1325,33 +1340,23 @@ public:
bool arg_empty() const { return arg_end() == arg_begin(); }
unsigned arg_size() const { return arg_end() - arg_begin(); }
- // Legacy API names that duplicate the above and will be removed once users
- // are migrated.
- iterator_range<User::op_iterator> arg_operands() {
- return make_range(arg_begin(), arg_end());
- }
- iterator_range<User::const_op_iterator> arg_operands() const {
- return make_range(arg_begin(), arg_end());
- }
- unsigned getNumArgOperands() const { return arg_size(); }
-
Value *getArgOperand(unsigned i) const {
- assert(i < getNumArgOperands() && "Out of bounds!");
+ assert(i < arg_size() && "Out of bounds!");
return getOperand(i);
}
void setArgOperand(unsigned i, Value *v) {
- assert(i < getNumArgOperands() && "Out of bounds!");
+ assert(i < arg_size() && "Out of bounds!");
setOperand(i, v);
}
/// Wrappers for getting the \c Use of a call argument.
const Use &getArgOperandUse(unsigned i) const {
- assert(i < getNumArgOperands() && "Out of bounds!");
+ assert(i < arg_size() && "Out of bounds!");
return User::getOperandUse(i);
}
Use &getArgOperandUse(unsigned i) {
- assert(i < getNumArgOperands() && "Out of bounds!");
+ assert(i < arg_size() && "Out of bounds!");
return User::getOperandUse(i);
}
@@ -1485,92 +1490,104 @@ public:
/// the attribute is allowed for the call.
bool hasFnAttr(StringRef Kind) const { return hasFnAttrImpl(Kind); }
+ // TODO: remove non-AtIndex versions of these methods.
/// adds the attribute to the list of attributes.
- void addAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addAttribute(getContext(), i, Kind);
- setAttributes(PAL);
+ void addAttributeAtIndex(unsigned i, Attribute::AttrKind Kind) {
+ Attrs = Attrs.addAttributeAtIndex(getContext(), i, Kind);
}
/// adds the attribute to the list of attributes.
- void addAttribute(unsigned i, Attribute Attr) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addAttribute(getContext(), i, Attr);
- setAttributes(PAL);
+ void addAttributeAtIndex(unsigned i, Attribute Attr) {
+ Attrs = Attrs.addAttributeAtIndex(getContext(), i, Attr);
+ }
+
+ /// Adds the attribute to the function.
+ void addFnAttr(Attribute::AttrKind Kind) {
+ Attrs = Attrs.addFnAttribute(getContext(), Kind);
+ }
+
+ /// Adds the attribute to the function.
+ void addFnAttr(Attribute Attr) {
+ Attrs = Attrs.addFnAttribute(getContext(), Attr);
+ }
+
+ /// Adds the attribute to the return value.
+ void addRetAttr(Attribute::AttrKind Kind) {
+ Attrs = Attrs.addRetAttribute(getContext(), Kind);
+ }
+
+ /// Adds the attribute to the return value.
+ void addRetAttr(Attribute Attr) {
+ Attrs = Attrs.addRetAttribute(getContext(), Attr);
}
/// Adds the attribute to the indicated argument
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
- assert(ArgNo < getNumArgOperands() && "Out of bounds");
- AttributeList PAL = getAttributes();
- PAL = PAL.addParamAttribute(getContext(), ArgNo, Kind);
- setAttributes(PAL);
+ assert(ArgNo < arg_size() && "Out of bounds");
+ Attrs = Attrs.addParamAttribute(getContext(), ArgNo, Kind);
}
/// Adds the attribute to the indicated argument
void addParamAttr(unsigned ArgNo, Attribute Attr) {
- assert(ArgNo < getNumArgOperands() && "Out of bounds");
- AttributeList PAL = getAttributes();
- PAL = PAL.addParamAttribute(getContext(), ArgNo, Attr);
- setAttributes(PAL);
+ assert(ArgNo < arg_size() && "Out of bounds");
+ Attrs = Attrs.addParamAttribute(getContext(), ArgNo, Attr);
}
/// removes the attribute from the list of attributes.
- void removeAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeList PAL = getAttributes();
- PAL = PAL.removeAttribute(getContext(), i, Kind);
- setAttributes(PAL);
+ void removeAttributeAtIndex(unsigned i, Attribute::AttrKind Kind) {
+ Attrs = Attrs.removeAttributeAtIndex(getContext(), i, Kind);
}
/// removes the attribute from the list of attributes.
- void removeAttribute(unsigned i, StringRef Kind) {
- AttributeList PAL = getAttributes();
- PAL = PAL.removeAttribute(getContext(), i, Kind);
- setAttributes(PAL);
+ void removeAttributeAtIndex(unsigned i, StringRef Kind) {
+ Attrs = Attrs.removeAttributeAtIndex(getContext(), i, Kind);
+ }
+
+ /// Removes the attributes from the function
+ void removeFnAttrs(const AttrBuilder &AttrsToRemove) {
+ Attrs = Attrs.removeFnAttributes(getContext(), AttrsToRemove);
+ }
+
+ /// Removes the attribute from the function
+ void removeFnAttr(Attribute::AttrKind Kind) {
+ Attrs = Attrs.removeFnAttribute(getContext(), Kind);
}
- void removeAttributes(unsigned i, const AttrBuilder &Attrs) {
- AttributeList PAL = getAttributes();
- PAL = PAL.removeAttributes(getContext(), i, Attrs);
- setAttributes(PAL);
+ /// Removes the attribute from the return value
+ void removeRetAttr(Attribute::AttrKind Kind) {
+ Attrs = Attrs.removeRetAttribute(getContext(), Kind);
+ }
+
+ /// Removes the attributes from the return value
+ void removeRetAttrs(const AttrBuilder &AttrsToRemove) {
+ Attrs = Attrs.removeRetAttributes(getContext(), AttrsToRemove);
}
/// Removes the attribute from the given argument
void removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
- assert(ArgNo < getNumArgOperands() && "Out of bounds");
- AttributeList PAL = getAttributes();
- PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind);
- setAttributes(PAL);
+ assert(ArgNo < arg_size() && "Out of bounds");
+ Attrs = Attrs.removeParamAttribute(getContext(), ArgNo, Kind);
}
/// Removes the attribute from the given argument
void removeParamAttr(unsigned ArgNo, StringRef Kind) {
- assert(ArgNo < getNumArgOperands() && "Out of bounds");
- AttributeList PAL = getAttributes();
- PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind);
- setAttributes(PAL);
+ assert(ArgNo < arg_size() && "Out of bounds");
+ Attrs = Attrs.removeParamAttribute(getContext(), ArgNo, Kind);
}
/// Removes the attributes from the given argument
- void removeParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs) {
- AttributeList PAL = getAttributes();
- PAL = PAL.removeParamAttributes(getContext(), ArgNo, Attrs);
- setAttributes(PAL);
+ void removeParamAttrs(unsigned ArgNo, const AttrBuilder &AttrsToRemove) {
+ Attrs = Attrs.removeParamAttributes(getContext(), ArgNo, AttrsToRemove);
}
/// adds the dereferenceable attribute to the list of attributes.
- void addDereferenceableAttr(unsigned i, uint64_t Bytes) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
- setAttributes(PAL);
+ void addDereferenceableParamAttr(unsigned i, uint64_t Bytes) {
+ Attrs = Attrs.addDereferenceableParamAttr(getContext(), i, Bytes);
}
- /// adds the dereferenceable_or_null attribute to the list of
- /// attributes.
- void addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes);
- setAttributes(PAL);
+ /// adds the dereferenceable attribute to the list of attributes.
+ void addDereferenceableRetAttr(uint64_t Bytes) {
+ Attrs = Attrs.addDereferenceableRetAttr(getContext(), Bytes);
}
/// Determine whether the return value has the given attribute.
@@ -1584,24 +1601,34 @@ public:
bool paramHasAttr(unsigned ArgNo, Attribute::AttrKind Kind) const;
/// Get the attribute of a given kind at a position.
- Attribute getAttribute(unsigned i, Attribute::AttrKind Kind) const {
- return getAttributes().getAttribute(i, Kind);
+ Attribute getAttributeAtIndex(unsigned i, Attribute::AttrKind Kind) const {
+ return getAttributes().getAttributeAtIndex(i, Kind);
}
/// Get the attribute of a given kind at a position.
- Attribute getAttribute(unsigned i, StringRef Kind) const {
- return getAttributes().getAttribute(i, Kind);
+ Attribute getAttributeAtIndex(unsigned i, StringRef Kind) const {
+ return getAttributes().getAttributeAtIndex(i, Kind);
+ }
+
+ /// Get the attribute of a given kind for the function.
+ Attribute getFnAttr(StringRef Kind) const {
+ return getAttributes().getFnAttr(Kind);
+ }
+
+ /// Get the attribute of a given kind for the function.
+ Attribute getFnAttr(Attribute::AttrKind Kind) const {
+ return getAttributes().getFnAttr(Kind);
}
/// Get the attribute of a given kind from a given arg
Attribute getParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const {
- assert(ArgNo < getNumArgOperands() && "Out of bounds");
+ assert(ArgNo < arg_size() && "Out of bounds");
return getAttributes().getParamAttr(ArgNo, Kind);
}
/// Get the attribute of a given kind from a given arg
Attribute getParamAttr(unsigned ArgNo, StringRef Kind) const {
- assert(ArgNo < getNumArgOperands() && "Out of bounds");
+ assert(ArgNo < arg_size() && "Out of bounds");
return getAttributes().getParamAttr(ArgNo, Kind);
}
@@ -1609,42 +1636,35 @@ public:
/// A.
///
/// Data operands include call arguments and values used in operand bundles,
- /// but does not include the callee operand. This routine dispatches to the
- /// underlying AttributeList or the OperandBundleUser as appropriate.
+ /// but does not include the callee operand.
///
/// The index \p i is interpreted as
///
- /// \p i == Attribute::ReturnIndex -> the return value
- /// \p i in [1, arg_size + 1) -> argument number (\p i - 1)
- /// \p i in [arg_size + 1, data_operand_size + 1) -> bundle operand at index
- /// (\p i - 1) in the operand list.
+ /// \p i in [0, arg_size) -> argument number (\p i)
+ /// \p i in [arg_size, data_operand_size) -> bundle operand at index
+ /// (\p i) in the operand list.
bool dataOperandHasImpliedAttr(unsigned i, Attribute::AttrKind Kind) const {
 // Data operand indices are zero-based: arguments come first, then bundle
 // operands.
- assert(i < (getNumArgOperands() + getNumTotalBundleOperands() + 1) &&
+ assert(i < arg_size() + getNumTotalBundleOperands() &&
"Data operand index out of bounds!");
// The attribute A can either be directly specified, if the operand in
// question is a call argument; or be indirectly implied by the kind of its
// containing operand bundle, if the operand is a bundle operand.
- if (i == AttributeList::ReturnIndex)
- return hasRetAttr(Kind);
-
- // FIXME: Avoid these i - 1 calculations and update the API to use
- // zero-based indices.
- if (i < (getNumArgOperands() + 1))
- return paramHasAttr(i - 1, Kind);
+ if (i < arg_size())
+ return paramHasAttr(i, Kind);
- assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) &&
+ assert(hasOperandBundles() && i >= getBundleOperandsStartIndex() &&
"Must be either a call argument or an operand bundle!");
- return bundleOperandHasAttr(i - 1, Kind);
+ return bundleOperandHasAttr(i, Kind);
}
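With the hunk above, data-operand indices are zero-based and no longer reserve slot 0 for the return value, so argument numbers pass through unchanged. A sketch; the wrapper name argIsNoCapture is illustrative:

    #include "llvm/IR/InstrTypes.h"
    using namespace llvm;

    // Illustrative: is the ArgNo'th call argument known not to be captured,
    // either via a parameter attribute or its operand bundle?
    bool argIsNoCapture(const CallBase &CB, unsigned ArgNo) {
      // Arguments occupy [0, arg_size()); bundle operands follow them.
      return CB.dataOperandHasImpliedAttr(ArgNo, Attribute::NoCapture);
    }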
/// Determine whether this data operand is not captured.
// FIXME: Once this API is no longer duplicated in `CallSite`, rename this to
// better indicate that this may return a conservative answer.
bool doesNotCapture(unsigned OpNo) const {
- return dataOperandHasImpliedAttr(OpNo + 1, Attribute::NoCapture);
+ return dataOperandHasImpliedAttr(OpNo, Attribute::NoCapture);
}
/// Determine whether this argument is passed by value.
@@ -1685,21 +1705,21 @@ public:
// FIXME: Once this API is no longer duplicated in `CallSite`, rename this to
// better indicate that this may return a conservative answer.
bool doesNotAccessMemory(unsigned OpNo) const {
- return dataOperandHasImpliedAttr(OpNo + 1, Attribute::ReadNone);
+ return dataOperandHasImpliedAttr(OpNo, Attribute::ReadNone);
}
// FIXME: Once this API is no longer duplicated in `CallSite`, rename this to
// better indicate that this may return a conservative answer.
bool onlyReadsMemory(unsigned OpNo) const {
- return dataOperandHasImpliedAttr(OpNo + 1, Attribute::ReadOnly) ||
- dataOperandHasImpliedAttr(OpNo + 1, Attribute::ReadNone);
+ return dataOperandHasImpliedAttr(OpNo, Attribute::ReadOnly) ||
+ dataOperandHasImpliedAttr(OpNo, Attribute::ReadNone);
}
// FIXME: Once this API is no longer duplicated in `CallSite`, rename this to
// better indicate that this may return a conservative answer.
bool doesNotReadMemory(unsigned OpNo) const {
- return dataOperandHasImpliedAttr(OpNo + 1, Attribute::WriteOnly) ||
- dataOperandHasImpliedAttr(OpNo + 1, Attribute::ReadNone);
+ return dataOperandHasImpliedAttr(OpNo, Attribute::WriteOnly) ||
+ dataOperandHasImpliedAttr(OpNo, Attribute::ReadNone);
}
/// Extract the alignment of the return value.
@@ -1743,14 +1763,26 @@ public:
/// Extract the number of dereferenceable bytes for a call or
/// parameter (0=unknown).
- uint64_t getDereferenceableBytes(unsigned i) const {
- return Attrs.getDereferenceableBytes(i);
+ uint64_t getRetDereferenceableBytes() const {
+ return Attrs.getRetDereferenceableBytes();
+ }
+
+ /// Extract the number of dereferenceable bytes for a call or
+ /// parameter (0=unknown).
+ uint64_t getParamDereferenceableBytes(unsigned i) const {
+ return Attrs.getParamDereferenceableBytes(i);
}
- /// Extract the number of dereferenceable_or_null bytes for a call or
+ /// Extract the number of dereferenceable_or_null bytes for a call
+ /// (0=unknown).
+ uint64_t getRetDereferenceableOrNullBytes() const {
+ return Attrs.getRetDereferenceableOrNullBytes();
+ }
+
+ /// Extract the number of dereferenceable_or_null bytes for a
/// parameter (0=unknown).
- uint64_t getDereferenceableOrNullBytes(unsigned i) const {
- return Attrs.getDereferenceableOrNullBytes(i);
+ uint64_t getParamDereferenceableOrNullBytes(unsigned i) const {
+ return Attrs.getParamDereferenceableOrNullBytes(i);
}
/// Return true if the return value is known to be not null.
@@ -1760,7 +1792,7 @@ public:
/// Determine if the return value is marked with NoAlias attribute.
bool returnDoesNotAlias() const {
- return Attrs.hasAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
+ return Attrs.hasRetAttr(Attribute::NoAlias);
}
/// If one of the arguments has the 'returned' attribute, returns its
@@ -1779,40 +1811,30 @@ public:
/// Return true if the call should not be inlined.
bool isNoInline() const { return hasFnAttr(Attribute::NoInline); }
- void setIsNoInline() {
- addAttribute(AttributeList::FunctionIndex, Attribute::NoInline);
- }
+ void setIsNoInline() { addFnAttr(Attribute::NoInline); }
/// Determine if the call does not access memory.
bool doesNotAccessMemory() const { return hasFnAttr(Attribute::ReadNone); }
- void setDoesNotAccessMemory() {
- addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
- }
+ void setDoesNotAccessMemory() { addFnAttr(Attribute::ReadNone); }
/// Determine if the call does not access or only reads memory.
bool onlyReadsMemory() const {
return doesNotAccessMemory() || hasFnAttr(Attribute::ReadOnly);
}
- void setOnlyReadsMemory() {
- addAttribute(AttributeList::FunctionIndex, Attribute::ReadOnly);
- }
+ void setOnlyReadsMemory() { addFnAttr(Attribute::ReadOnly); }
/// Determine if the call does not access or only writes memory.
bool doesNotReadMemory() const {
return doesNotAccessMemory() || hasFnAttr(Attribute::WriteOnly);
}
- void setDoesNotReadMemory() {
- addAttribute(AttributeList::FunctionIndex, Attribute::WriteOnly);
- }
+ void setDoesNotReadMemory() { addFnAttr(Attribute::WriteOnly); }
 /// Determine if the call can access memory only using pointers based
/// on its arguments.
bool onlyAccessesArgMemory() const {
return hasFnAttr(Attribute::ArgMemOnly);
}
- void setOnlyAccessesArgMemory() {
- addAttribute(AttributeList::FunctionIndex, Attribute::ArgMemOnly);
- }
+ void setOnlyAccessesArgMemory() { addFnAttr(Attribute::ArgMemOnly); }
/// Determine if the function may only access memory that is
/// inaccessible from the IR.
@@ -1820,7 +1842,7 @@ public:
return hasFnAttr(Attribute::InaccessibleMemOnly);
}
void setOnlyAccessesInaccessibleMemory() {
- addAttribute(AttributeList::FunctionIndex, Attribute::InaccessibleMemOnly);
+ addFnAttr(Attribute::InaccessibleMemOnly);
}
/// Determine if the function may only access memory that is
@@ -1829,49 +1851,36 @@ public:
return hasFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
}
void setOnlyAccessesInaccessibleMemOrArgMem() {
- addAttribute(AttributeList::FunctionIndex,
- Attribute::InaccessibleMemOrArgMemOnly);
+ addFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
}
/// Determine if the call cannot return.
bool doesNotReturn() const { return hasFnAttr(Attribute::NoReturn); }
- void setDoesNotReturn() {
- addAttribute(AttributeList::FunctionIndex, Attribute::NoReturn);
- }
+ void setDoesNotReturn() { addFnAttr(Attribute::NoReturn); }
/// Determine if the call should not perform indirect branch tracking.
bool doesNoCfCheck() const { return hasFnAttr(Attribute::NoCfCheck); }
/// Determine if the call cannot unwind.
bool doesNotThrow() const { return hasFnAttr(Attribute::NoUnwind); }
- void setDoesNotThrow() {
- addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
- }
+ void setDoesNotThrow() { addFnAttr(Attribute::NoUnwind); }
/// Determine if the invoke cannot be duplicated.
bool cannotDuplicate() const { return hasFnAttr(Attribute::NoDuplicate); }
- void setCannotDuplicate() {
- addAttribute(AttributeList::FunctionIndex, Attribute::NoDuplicate);
- }
+ void setCannotDuplicate() { addFnAttr(Attribute::NoDuplicate); }
/// Determine if the call cannot be tail merged.
bool cannotMerge() const { return hasFnAttr(Attribute::NoMerge); }
- void setCannotMerge() {
- addAttribute(AttributeList::FunctionIndex, Attribute::NoMerge);
- }
+ void setCannotMerge() { addFnAttr(Attribute::NoMerge); }
/// Determine if the invoke is convergent
bool isConvergent() const { return hasFnAttr(Attribute::Convergent); }
- void setConvergent() {
- addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
- }
- void setNotConvergent() {
- removeAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
- }
+ void setConvergent() { addFnAttr(Attribute::Convergent); }
+ void setNotConvergent() { removeFnAttr(Attribute::Convergent); }
/// Determine if the call returns a structure through first
/// pointer argument.
bool hasStructRetAttr() const {
- if (getNumArgOperands() == 0)
+ if (arg_empty())
return false;
// Be friendly and also check the callee.
@@ -1918,6 +1927,13 @@ public:
Idx < getBundleOperandsEndIndex();
}
+ /// Return true if the operand at index \p Idx is a bundle operand that has
+ /// tag ID \p ID.
+ bool isOperandBundleOfType(uint32_t ID, unsigned Idx) const {
+ return isBundleOperand(Idx) &&
+ getOperandBundleForOperand(Idx).getTagID() == ID;
+ }
+
/// Returns true if the use is a bundle operand.
bool isBundleOperand(const Use *U) const {
assert(this == U->getUser() &&
@@ -2258,7 +2274,7 @@ private:
bool hasFnAttrOnCalledFunction(StringRef Kind) const;
template <typename AttrKind> bool hasFnAttrImpl(AttrKind Kind) const {
- if (Attrs.hasFnAttribute(Kind))
+ if (Attrs.hasFnAttr(Kind))
return true;
// Operand bundles override attributes on the called function, but don't
@@ -2272,12 +2288,12 @@ private:
/// Determine whether the return value has the given attribute. Supports
/// Attribute::AttrKind and StringRef as \p AttrKind types.
template <typename AttrKind> bool hasRetAttrImpl(AttrKind Kind) const {
- if (Attrs.hasAttribute(AttributeList::ReturnIndex, Kind))
+ if (Attrs.hasRetAttr(Kind))
return true;
// Look at the callee, if available.
if (const Function *F = getCalledFunction())
- return F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Kind);
+ return F->getAttributes().hasRetAttr(Kind);
return false;
}
};
diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index deb85cf277fe..9878082ffffa 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -59,11 +59,11 @@ protected:
 // Template alias so that all Instructions storing an alignment use the same
 // definition.
// Valid alignments are powers of two from 2^0 to 2^MaxAlignmentExponent =
- // 2^29. We store them as Log2(Alignment), so we need 5 bits to encode the 30
+ // 2^32. We store them as Log2(Alignment), so we need 6 bits to encode the 33
// possible values.
template <unsigned Offset>
using AlignmentBitfieldElementT =
- typename Bitfield::Element<unsigned, Offset, 5,
+ typename Bitfield::Element<unsigned, Offset, 6,
Value::MaxAlignmentExponent>;
template <unsigned Offset>
@@ -307,11 +307,6 @@ public:
Value::getAllMetadata(MDs);
}
- /// Fills the AAMDNodes structure with AA metadata from this instruction.
- /// When Merge is true, the existing AA metadata is merged with that from this
- /// instruction providing the most-general result.
- void getAAMetadata(AAMDNodes &N, bool Merge = false) const;
-
/// Set the metadata of the specified kind to the specified node. This updates
/// or replaces metadata if already present, or removes it if Node is null.
void setMetadata(unsigned KindID, MDNode *Node);
@@ -352,7 +347,10 @@ public:
/// to the existing node.
void addAnnotationMetadata(StringRef Annotation);
- /// Sets the metadata on this instruction from the AAMDNodes structure.
+ /// Returns the AA metadata for this instruction.
+ AAMDNodes getAAMetadata() const;
+
+ /// Sets the AA metadata on this instruction from the AAMDNodes structure.
void setAAMetadata(const AAMDNodes &N);
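getAAMetadata() now returns the AAMDNodes by value rather than filling an out-parameter, and the merging variant is gone. A minimal sketch of copying AA metadata between instructions; the helper name copyAAMetadata is illustrative:

    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/Metadata.h"
    using namespace llvm;

    // Illustrative: propagate tbaa/scope/noalias metadata from From to To.
    void copyAAMetadata(const Instruction &From, Instruction &To) {
      AAMDNodes AANodes = From.getAAMetadata(); // returned by value now
      To.setAAMetadata(AANodes);
    }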
/// Retrieve the raw weight values of a conditional branch or select.
@@ -389,6 +387,10 @@ public:
/// Determine whether the no signed wrap flag is set.
bool hasNoSignedWrap() const;
+ /// Return true if this operator has flags which may cause this instruction
+ /// to evaluate to poison despite having non-poison inputs.
+ bool hasPoisonGeneratingFlags() const;
+
/// Drops flags that may cause this instruction to evaluate to poison despite
/// having non-poison inputs.
void dropPoisonGeneratingFlags();
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index 0c43a56daa33..6d32a898b668 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -126,7 +126,7 @@ public:
}
// FIXME: Remove this one transition to Align is over.
- unsigned getAlignment() const { return getAlign().value(); }
+ uint64_t getAlignment() const { return getAlign().value(); }
/// Return true if this alloca is in the entry block of the function and is a
/// constant size. If so, the code generator will fold it into the
@@ -217,7 +217,7 @@ public:
/// Return the alignment of the access that is being performed.
/// FIXME: Remove this function once transition to Align is over.
/// Use getAlign() instead.
- unsigned getAlignment() const { return getAlign().value(); }
+ uint64_t getAlignment() const { return getAlign().value(); }
/// Return the alignment of the access that is being performed.
Align getAlign() const {
@@ -348,7 +348,7 @@ public:
/// Return the alignment of the access that is being performed
/// FIXME: Remove this function once transition to Align is over.
/// Use getAlign() instead.
- unsigned getAlignment() const { return getAlign().value(); }
+ uint64_t getAlignment() const { return getAlign().value(); }
Align getAlign() const {
return Align(1ULL << (getSubclassData<AlignmentField>()));
@@ -1339,6 +1339,10 @@ public:
return P == ICMP_SLE || P == ICMP_ULE;
}
+ /// Returns the sequence of all ICmp predicates.
+ ///
+ static auto predicates() { return ICmpPredicates(); }
+
/// Exchange the two operands to this instruction in such a way that it does
/// not modify the semantics of the instruction. The predicate value may be
/// changed to retain the same result if the predicate is order dependent
@@ -1349,6 +1353,10 @@ public:
Op<0>().swap(Op<1>());
}
+ /// Return result of `LHS Pred RHS` comparison.
+ static bool compare(const APInt &LHS, const APInt &RHS,
+ ICmpInst::Predicate Pred);
+
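The static compare() helper evaluates an integer predicate directly on APInt constants. A sketch; the wrapper name isSignedLess is illustrative:

    #include "llvm/ADT/APInt.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Illustrative: fold `L slt R` for two 32-bit signed constants.
    bool isSignedLess(int32_t L, int32_t R) {
      APInt LHS(32, L, /*isSigned=*/true);
      APInt RHS(32, R, /*isSigned=*/true);
      return ICmpInst::compare(LHS, RHS, ICmpInst::ICMP_SLT);
    }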
// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::ICmp;
@@ -1457,6 +1465,10 @@ public:
Op<0>().swap(Op<1>());
}
+ /// Returns the sequence of all FCmp predicates.
+ ///
+ static auto predicates() { return FCmpPredicates(); }
+
/// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::FCmp;
@@ -1685,9 +1697,7 @@ public:
/// Return true if the call can return twice
bool canReturnTwice() const { return hasFnAttr(Attribute::ReturnsTwice); }
- void setCanReturnTwice() {
- addAttribute(AttributeList::FunctionIndex, Attribute::ReturnsTwice);
- }
+ void setCanReturnTwice() { addFnAttr(Attribute::ReturnsTwice); }
// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const Instruction *I) {
@@ -2019,6 +2029,14 @@ protected:
ShuffleVectorInst *cloneImpl() const;
public:
+ ShuffleVectorInst(Value *V1, Value *Mask, const Twine &NameStr = "",
+ Instruction *InsertBefore = nullptr);
+ ShuffleVectorInst(Value *V1, Value *Mask, const Twine &NameStr,
+ BasicBlock *InsertAtEnd);
+ ShuffleVectorInst(Value *V1, ArrayRef<int> Mask, const Twine &NameStr = "",
+ Instruction *InsertBefore = nullptr);
+ ShuffleVectorInst(Value *V1, ArrayRef<int> Mask, const Twine &NameStr,
+ BasicBlock *InsertAtEnd);
ShuffleVectorInst(Value *V1, Value *V2, Value *Mask,
const Twine &NameStr = "",
Instruction *InsertBefor = nullptr);
@@ -2306,6 +2324,57 @@ public:
return isExtractSubvectorMask(ShuffleMask, NumSrcElts, Index);
}
+ /// Return true if this shuffle mask is an insert subvector mask.
+ /// A valid insert subvector mask inserts the lowest elements of a second
+ /// source operand into an in-place first source operand.
+ /// Both the subvector width and the insertion index are returned.
+ static bool isInsertSubvectorMask(ArrayRef<int> Mask, int NumSrcElts,
+ int &NumSubElts, int &Index);
+ static bool isInsertSubvectorMask(const Constant *Mask, int NumSrcElts,
+ int &NumSubElts, int &Index) {
+ assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
+ // Not possible to express a shuffle mask for a scalable vector for this
+ // case.
+ if (isa<ScalableVectorType>(Mask->getType()))
+ return false;
+ SmallVector<int, 16> MaskAsInts;
+ getShuffleMask(Mask, MaskAsInts);
+ return isInsertSubvectorMask(MaskAsInts, NumSrcElts, NumSubElts, Index);
+ }
+
+ /// Return true if this shuffle mask is an insert subvector mask.
+ bool isInsertSubvectorMask(int &NumSubElts, int &Index) const {
+ // Not possible to express a shuffle mask for a scalable vector for this
+ // case.
+ if (isa<ScalableVectorType>(getType()))
+ return false;
+
+ int NumSrcElts =
+ cast<FixedVectorType>(Op<0>()->getType())->getNumElements();
+ return isInsertSubvectorMask(ShuffleMask, NumSrcElts, NumSubElts, Index);
+ }
+
+ /// Return true if this shuffle mask replicates each of the \p VF elements
+ /// in a vector \p ReplicationFactor times.
+ /// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is:
+ /// <0,0,0,1,1,1,2,2,2,3,3,3>
+ static bool isReplicationMask(ArrayRef<int> Mask, int &ReplicationFactor,
+ int &VF);
+ static bool isReplicationMask(const Constant *Mask, int &ReplicationFactor,
+ int &VF) {
+ assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
+ // Not possible to express a shuffle mask for a scalable vector for this
+ // case.
+ if (isa<ScalableVectorType>(Mask->getType()))
+ return false;
+ SmallVector<int, 16> MaskAsInts;
+ getShuffleMask(Mask, MaskAsInts);
+ return isReplicationMask(MaskAsInts, ReplicationFactor, VF);
+ }
+
+ /// Return true if this shuffle mask is a replication mask.
+ bool isReplicationMask(int &ReplicationFactor, int &VF) const;
+
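A short sketch of querying the replication-mask predicate, matching the <0,0,0,1,1,1,2,2,2,3,3,3> example in the comment above; the helper name describeShuffle is illustrative:

    #include "llvm/IR/Instructions.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Illustrative: report whether Shuf merely replicates each input lane.
    void describeShuffle(const ShuffleVectorInst &Shuf) {
      int ReplicationFactor = 0, VF = 0;
      if (Shuf.isReplicationMask(ReplicationFactor, VF))
        errs() << "replicates " << VF << " lanes " << ReplicationFactor
               << " times\n";
    }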
/// Change values in a shuffle permute mask assuming the two vector operands
/// of length InVecNumElts have swapped position.
static void commuteShuffleMask(MutableArrayRef<int> Mask,
@@ -3281,14 +3350,14 @@ public:
CaseHandle(SwitchInst *SI, ptrdiff_t Index) : CaseHandleImpl(SI, Index) {}
/// Sets the new value for current case.
- void setValue(ConstantInt *V) {
+ void setValue(ConstantInt *V) const {
assert((unsigned)Index < SI->getNumCases() &&
"Index out the number of cases.");
SI->setOperand(2 + Index*2, reinterpret_cast<Value*>(V));
}
/// Sets the new successor for current case.
- void setSuccessor(BasicBlock *S) {
+ void setSuccessor(BasicBlock *S) const {
SI->setSuccessor(getSuccessorIndex(), S);
}
};
@@ -3297,7 +3366,7 @@ public:
class CaseIteratorImpl
: public iterator_facade_base<CaseIteratorImpl<CaseHandleT>,
std::random_access_iterator_tag,
- CaseHandleT> {
+ const CaseHandleT> {
using SwitchInstT = typename CaseHandleT::SwitchInstType;
CaseHandleT Case;
@@ -3356,7 +3425,6 @@ public:
assert(Case.SI == RHS.Case.SI && "Incompatible operators.");
return Case.Index < RHS.Case.Index;
}
- CaseHandleT &operator*() { return Case; }
const CaseHandleT &operator*() const { return Case; }
};
@@ -3446,15 +3514,12 @@ public:
/// default case iterator to indicate that it is handled by the default
/// handler.
CaseIt findCaseValue(const ConstantInt *C) {
- CaseIt I = llvm::find_if(
- cases(), [C](CaseHandle &Case) { return Case.getCaseValue() == C; });
- if (I != case_end())
- return I;
-
- return case_default();
+ return CaseIt(
+ this,
+ const_cast<const SwitchInst *>(this)->findCaseValue(C)->getCaseIndex());
}
ConstCaseIt findCaseValue(const ConstantInt *C) const {
- ConstCaseIt I = llvm::find_if(cases(), [C](ConstCaseHandle &Case) {
+ ConstCaseIt I = llvm::find_if(cases(), [C](const ConstCaseHandle &Case) {
return Case.getCaseValue() == C;
});
if (I != case_end())
@@ -4069,14 +4134,12 @@ public:
///
Value *getIndirectDestLabel(unsigned i) const {
assert(i < getNumIndirectDests() && "Out of bounds!");
- return getOperand(i + getNumArgOperands() + getNumTotalBundleOperands() +
- 1);
+ return getOperand(i + arg_size() + getNumTotalBundleOperands() + 1);
}
Value *getIndirectDestLabelUse(unsigned i) const {
assert(i < getNumIndirectDests() && "Out of bounds!");
- return getOperandUse(i + getNumArgOperands() + getNumTotalBundleOperands() +
- 1);
+ return getOperandUse(i + arg_size() + getNumTotalBundleOperands() + 1);
}
// Return the destination basic blocks...
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 6b42cb949050..d186029db8cf 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -448,6 +448,28 @@ public:
static Optional<unsigned> getFunctionalOpcodeForVP(Intrinsic::ID ID);
};
+/// This represents vector predication reduction intrinsics.
+class VPReductionIntrinsic : public VPIntrinsic {
+public:
+ static bool isVPReduction(Intrinsic::ID ID);
+
+ unsigned getStartParamPos() const;
+ unsigned getVectorParamPos() const;
+
+ static Optional<unsigned> getStartParamPos(Intrinsic::ID ID);
+ static Optional<unsigned> getVectorParamPos(Intrinsic::ID ID);
+
+ /// Methods for support type inquiry through isa, cast, and dyn_cast:
+ /// @{
+ static bool classof(const IntrinsicInst *I) {
+ return VPReductionIntrinsic::isVPReduction(I->getIntrinsicID());
+ }
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+ /// @}
+};
+
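A sketch of dispatching on the new class and pulling out its operands; the helper name getVPReductionOperands is illustrative:

    #include "llvm/IR/IntrinsicInst.h"
    using namespace llvm;

    // Illustrative: extract the start value and vector operand of a
    // vector-predicated reduction call, if I is one.
    bool getVPReductionOperands(const Instruction &I, Value *&Start,
                                Value *&Vec) {
      if (const auto *VPRed = dyn_cast<VPReductionIntrinsic>(&I)) {
        Start = VPRed->getArgOperand(VPRed->getStartParamPos());
        Vec = VPRed->getArgOperand(VPRed->getVectorParamPos());
        return true;
      }
      return false;
    }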
/// This is the common base class for constrained floating point intrinsics.
class ConstrainedFPIntrinsic : public IntrinsicInst {
public:
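To illustrate the new VPReductionIntrinsic class above, a short hedged sketch of how a pass might query it; only the class and its accessors come from this patch, while the surrounding helper is hypothetical:

#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

// Pick out the scalar start value and the vector operand of a VP reduction.
static void inspectVPReduction(Instruction &I) {
  if (auto *VPR = dyn_cast<VPReductionIntrinsic>(&I)) {
    Value *Start = VPR->getArgOperand(VPR->getStartParamPos());
    Value *Vec = VPR->getArgOperand(VPR->getVectorParamPos());
    (void)Start;
    (void)Vec;
  }
}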
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index 80a2f5a8cd3e..2ff48380ac28 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -140,7 +140,8 @@ namespace Intrinsic {
Subdivide2Argument,
Subdivide4Argument,
VecOfBitcastsToInt,
- AMX
+ AMX,
+ PPCQuad,
} Kind;
union {
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 28fcc13266b1..637e6d8f6cf5 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -312,6 +312,8 @@ def llvm_v1i128_ty : LLVMType<v1i128>; // 1 x i128
def llvm_v2f16_ty : LLVMType<v2f16>; // 2 x half (__fp16)
def llvm_v4f16_ty : LLVMType<v4f16>; // 4 x half (__fp16)
def llvm_v8f16_ty : LLVMType<v8f16>; // 8 x half (__fp16)
+def llvm_v16f16_ty : LLVMType<v16f16>; // 16 x half (__fp16)
+def llvm_v32f16_ty : LLVMType<v32f16>; // 32 x half (__fp16)
def llvm_v2bf16_ty : LLVMType<v2bf16>; // 2 x bfloat (__bf16)
def llvm_v4bf16_ty : LLVMType<v4bf16>; // 4 x bfloat (__bf16)
def llvm_v8bf16_ty : LLVMType<v8bf16>; // 8 x bfloat (__bf16)
@@ -1329,10 +1331,10 @@ def int_donothing : DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrWillReturn]>;
def int_sideeffect : DefaultAttrsIntrinsic<[], [], [IntrInaccessibleMemOnly, IntrWillReturn]>;
// The pseudoprobe intrinsic works as a placeholder for the block it probes.
-// Like the sideeffect intrinsic defined above, this intrinsic is treated by the
-// optimizer as having opaque side effects so that it won't be get rid of or moved
+// Like the sideeffect intrinsic defined above, this intrinsic is treated by the
+// optimizer as having opaque side effects so that it won't be removed or moved
// out of the block it probes.
-def int_pseudoprobe : Intrinsic<[], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
+def int_pseudoprobe : DefaultAttrsIntrinsic<[], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
[IntrInaccessibleMemOnly, IntrWillReturn]>;
// Arithmetic fence intrinsic.
@@ -1497,12 +1499,96 @@ let IntrProperties =
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
llvm_i32_ty]>;
}
+// Shuffles.
+def int_vp_select : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+ [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ llvm_i32_ty]>;
+
+// Reductions
+let IntrProperties = [IntrSpeculatable, IntrNoMem, IntrNoSync, IntrWillReturn] in {
+ def int_vp_reduce_fadd : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+ [LLVMVectorElementType<0>,
+ llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
+ def int_vp_reduce_fmul : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+ [LLVMVectorElementType<0>,
+ llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
+ def int_vp_reduce_add : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+ [LLVMVectorElementType<0>,
+ llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
+ def int_vp_reduce_mul : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+ [LLVMVectorElementType<0>,
+ llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
+ def int_vp_reduce_and : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+ [LLVMVectorElementType<0>,
+ llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
+ def int_vp_reduce_or : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+ [LLVMVectorElementType<0>,
+ llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
+ def int_vp_reduce_xor : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+ [LLVMVectorElementType<0>,
+ llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
+ def int_vp_reduce_smax : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+ [LLVMVectorElementType<0>,
+ llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
+ def int_vp_reduce_smin : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+ [LLVMVectorElementType<0>,
+ llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
+ def int_vp_reduce_umax : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+ [LLVMVectorElementType<0>,
+ llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
+ def int_vp_reduce_umin : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+ [LLVMVectorElementType<0>,
+ llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
+ def int_vp_reduce_fmax : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+ [LLVMVectorElementType<0>,
+ llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
+ def int_vp_reduce_fmin : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+ [LLVMVectorElementType<0>,
+ llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty]>;
+}
def int_get_active_lane_mask:
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[llvm_anyint_ty, LLVMMatchType<1>],
[IntrNoMem, IntrNoSync, IntrWillReturn]>;
+def int_experimental_vp_splice:
+ DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ llvm_i32_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+
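As a concrete illustration of the VP reduction signatures defined above (scalar start value, vector, mask, i32 explicit vector length), a hedged IRBuilder sketch; the operand values are assumed to exist and to have matching types:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

// Emit llvm.vp.reduce.add, overloaded on the vector operand's type.
static Value *emitVPReduceAdd(IRBuilder<> &B, Module *M, Value *Start,
                              Value *Vec, Value *Mask, Value *EVL) {
  Function *Decl = Intrinsic::getDeclaration(M, Intrinsic::vp_reduce_add,
                                             {Vec->getType()});
  return B.CreateCall(Decl, {Start, Vec, Mask, EVL});
}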
//===-------------------------- Masked Intrinsics -------------------------===//
//
def int_masked_load:
@@ -1558,12 +1644,15 @@ def int_icall_branch_funnel : DefaultAttrsIntrinsic<[], [llvm_vararg_ty], []>;
def int_load_relative: DefaultAttrsIntrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_anyint_ty],
[IntrReadMem, IntrArgMemOnly]>;
+def int_asan_check_memaccess :
+ Intrinsic<[],[llvm_ptr_ty, llvm_i32_ty], [ImmArg<ArgIndex<1>>]>;
+
def int_hwasan_check_memaccess :
Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty],
- [IntrInaccessibleMemOnly, ImmArg<ArgIndex<2>>]>;
+ [ImmArg<ArgIndex<2>>]>;
def int_hwasan_check_memaccess_shortgranules :
Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty],
- [IntrInaccessibleMemOnly, ImmArg<ArgIndex<2>>]>;
+ [ImmArg<ArgIndex<2>>]>;
// Xray intrinsics
//===----------------------------------------------------------------------===//
@@ -1658,7 +1747,7 @@ def int_matrix_multiply
def int_matrix_column_major_load
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMPointerToElt<0>, llvm_i64_ty, llvm_i1_ty,
+ [LLVMPointerToElt<0>, llvm_anyint_ty, llvm_i1_ty,
llvm_i32_ty, llvm_i32_ty],
[IntrNoSync, IntrWillReturn, IntrArgMemOnly, IntrReadMem,
NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>,
@@ -1667,7 +1756,7 @@ def int_matrix_column_major_load
def int_matrix_column_major_store
: DefaultAttrsIntrinsic<[],
[llvm_anyvector_ty, LLVMPointerToElt<0>,
- llvm_i64_ty, llvm_i1_ty, llvm_i32_ty, llvm_i32_ty],
+ llvm_anyint_ty, llvm_i1_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoSync, IntrWillReturn, IntrArgMemOnly, IntrWriteMem,
WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
@@ -1761,6 +1850,61 @@ def int_experimental_vector_splice : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<2>>]>;
+
+//===----------------- Pointer Authentication Intrinsics ------------------===//
+//
+
+// Sign an unauthenticated pointer using the specified key and discriminator,
+// passed in that order.
+// Returns the first argument, with some known bits replaced with a signature.
+def int_ptrauth_sign : Intrinsic<[llvm_i64_ty],
+ [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+
+// Authenticate a signed pointer, using the specified key and discriminator.
+// Returns the first argument, with the signature bits removed.
+// The signature must be valid.
+def int_ptrauth_auth : Intrinsic<[llvm_i64_ty],
+ [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
+ [IntrNoMem,ImmArg<ArgIndex<1>>]>;
+
+// Authenticate a signed pointer and resign it.
+// The second (key) and third (discriminator) arguments specify the signing
+// schema used for authenticating.
+// The fourth and fifth arguments specify the schema used for signing.
+// The signature must be valid.
+// This is a combined form of @llvm.ptrauth.sign and @llvm.ptrauth.auth, with
+// an additional integrity guarantee on the intermediate value.
+def int_ptrauth_resign : Intrinsic<[llvm_i64_ty],
+ [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty,
+ llvm_i32_ty, llvm_i64_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>,
+ ImmArg<ArgIndex<3>>]>;
+
+// Strip the embedded signature out of a signed pointer.
+// The second argument specifies the key.
+// This behaves like @llvm.ptrauth.auth, but doesn't require the signature to
+// be valid.
+def int_ptrauth_strip : Intrinsic<[llvm_i64_ty],
+ [llvm_i64_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+
+// Blend a small integer discriminator with an address discriminator, producing
+// a new discriminator value.
+def int_ptrauth_blend : Intrinsic<[llvm_i64_ty],
+ [llvm_i64_ty, llvm_i64_ty],
+ [IntrNoMem]>;
+
+// Compute the signature of a value, using a given discriminator.
+// This differs from @llvm.ptrauth.sign in that it doesn't embed the computed
+// signature in the pointer, but instead returns the signature as a value.
+// That allows it to be used to sign non-pointer data: in that sense, it is
+// generic. There is no generic @llvm.ptrauth.auth: instead, the signature
+// can be computed using @llvm.ptrauth.sign_generic, and compared with icmp.
+def int_ptrauth_sign_generic : Intrinsic<[llvm_i64_ty],
+ [llvm_i64_ty, llvm_i64_ty],
+ [IntrNoMem]>;
+
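To make the intended use of the pointer-authentication intrinsics above concrete, a hedged C++ sketch that signs an i64 value and then strips the signature again; the key and discriminator constants are placeholders, not a recommended schema:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

// Sign PtrAsI64 with key 0 and discriminator 0, then strip the signature.
static Value *signAndStrip(IRBuilder<> &B, Module *M, Value *PtrAsI64) {
  Value *Key = B.getInt32(0);   // immediate key operand (ImmArg)
  Value *Disc = B.getInt64(0);  // discriminator operand
  Function *Sign = Intrinsic::getDeclaration(M, Intrinsic::ptrauth_sign);
  Function *Strip = Intrinsic::getDeclaration(M, Intrinsic::ptrauth_strip);
  Value *Signed = B.CreateCall(Sign, {PtrAsI64, Key, Disc});
  return B.CreateCall(Strip, {Signed, Key});
}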
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 87e0f83f85b7..c586af45f34d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -962,6 +962,25 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
LLVMPointerToElt<0>],
[IntrReadMem, IntrArgMemOnly]>;
+ class AdvSIMD_2Vec_PredLoad_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+ [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ LLVMPointerToElt<0>],
+ [IntrReadMem, IntrArgMemOnly]>;
+
+ class AdvSIMD_3Vec_PredLoad_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>],
+ [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ LLVMPointerToElt<0>],
+ [IntrReadMem, IntrArgMemOnly]>;
+
+ class AdvSIMD_4Vec_PredLoad_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>],
+ [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ LLVMPointerToElt<0>],
+ [IntrReadMem, IntrArgMemOnly]>;
+
class AdvSIMD_1Vec_PredLoad_WriteFFR_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
@@ -1365,7 +1384,7 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
// This class of intrinsics is not intended to be useful within LLVM IR but
// is instead here to support some of the more rigid parts of the ACLE.
- class Builtin_SVCVT<string name, LLVMType OUT, LLVMType PRED, LLVMType IN>
+ class Builtin_SVCVT<LLVMType OUT, LLVMType PRED, LLVMType IN>
: DefaultAttrsIntrinsic<[OUT], [OUT, PRED, IN], [IntrNoMem]>;
}
@@ -1535,6 +1554,10 @@ def int_aarch64_sve_ld2 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
def int_aarch64_sve_ld3 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
def int_aarch64_sve_ld4 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
+def int_aarch64_sve_ld2_sret : AdvSIMD_2Vec_PredLoad_Intrinsic;
+def int_aarch64_sve_ld3_sret : AdvSIMD_3Vec_PredLoad_Intrinsic;
+def int_aarch64_sve_ld4_sret : AdvSIMD_4Vec_PredLoad_Intrinsic;
+
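A hedged sketch of how the new structured-return SVE loads above might be emitted; the element type, predicate, and base pointer are assumptions, and the call returns the two-vector struct described by AdvSIMD_2Vec_PredLoad_Intrinsic:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include <utility>
using namespace llvm;

// Emit llvm.aarch64.sve.ld2.sret for <vscale x 4 x i32> and unpack the two
// result vectors from the returned struct.
static std::pair<Value *, Value *> emitLd2Sret(IRBuilder<> &B, Module *M,
                                               Value *Pred, Value *BasePtr) {
  auto *VecTy = ScalableVectorType::get(B.getInt32Ty(), 4);
  Function *Ld2 =
      Intrinsic::getDeclaration(M, Intrinsic::aarch64_sve_ld2_sret, {VecTy});
  Value *Ret = B.CreateCall(Ld2, {Pred, BasePtr});
  return {B.CreateExtractValue(Ret, 0), B.CreateExtractValue(Ret, 1)};
}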
def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredLoad_WriteFFR_Intrinsic;
def int_aarch64_sve_ldff1 : AdvSIMD_1Vec_PredLoad_WriteFFR_Intrinsic;
@@ -1957,44 +1980,44 @@ def int_aarch64_sve_fcmpgt : AdvSIMD_SVE_Compare_Intrinsic;
def int_aarch64_sve_fcmpne : AdvSIMD_SVE_Compare_Intrinsic;
def int_aarch64_sve_fcmpuo : AdvSIMD_SVE_Compare_Intrinsic;
-def int_aarch64_sve_fcvtzs_i32f16 : Builtin_SVCVT<"svcvt_s32_f16_m", llvm_nxv4i32_ty, llvm_nxv4i1_ty, llvm_nxv8f16_ty>;
-def int_aarch64_sve_fcvtzs_i32f64 : Builtin_SVCVT<"svcvt_s32_f64_m", llvm_nxv4i32_ty, llvm_nxv2i1_ty, llvm_nxv2f64_ty>;
-def int_aarch64_sve_fcvtzs_i64f16 : Builtin_SVCVT<"svcvt_s64_f16_m", llvm_nxv2i64_ty, llvm_nxv2i1_ty, llvm_nxv8f16_ty>;
-def int_aarch64_sve_fcvtzs_i64f32 : Builtin_SVCVT<"svcvt_s64_f32_m", llvm_nxv2i64_ty, llvm_nxv2i1_ty, llvm_nxv4f32_ty>;
+def int_aarch64_sve_fcvtzs_i32f16 : Builtin_SVCVT<llvm_nxv4i32_ty, llvm_nxv4i1_ty, llvm_nxv8f16_ty>;
+def int_aarch64_sve_fcvtzs_i32f64 : Builtin_SVCVT<llvm_nxv4i32_ty, llvm_nxv2i1_ty, llvm_nxv2f64_ty>;
+def int_aarch64_sve_fcvtzs_i64f16 : Builtin_SVCVT<llvm_nxv2i64_ty, llvm_nxv2i1_ty, llvm_nxv8f16_ty>;
+def int_aarch64_sve_fcvtzs_i64f32 : Builtin_SVCVT<llvm_nxv2i64_ty, llvm_nxv2i1_ty, llvm_nxv4f32_ty>;
-def int_aarch64_sve_fcvt_bf16f32 : Builtin_SVCVT<"svcvt_bf16_f32_m", llvm_nxv8bf16_ty, llvm_nxv8i1_ty, llvm_nxv4f32_ty>;
-def int_aarch64_sve_fcvtnt_bf16f32 : Builtin_SVCVT<"svcvtnt_bf16_f32_m", llvm_nxv8bf16_ty, llvm_nxv8i1_ty, llvm_nxv4f32_ty>;
+def int_aarch64_sve_fcvt_bf16f32 : Builtin_SVCVT<llvm_nxv8bf16_ty, llvm_nxv8i1_ty, llvm_nxv4f32_ty>;
+def int_aarch64_sve_fcvtnt_bf16f32 : Builtin_SVCVT<llvm_nxv8bf16_ty, llvm_nxv8i1_ty, llvm_nxv4f32_ty>;
-def int_aarch64_sve_fcvtzu_i32f16 : Builtin_SVCVT<"svcvt_u32_f16_m", llvm_nxv4i32_ty, llvm_nxv4i1_ty, llvm_nxv8f16_ty>;
-def int_aarch64_sve_fcvtzu_i32f64 : Builtin_SVCVT<"svcvt_u32_f64_m", llvm_nxv4i32_ty, llvm_nxv2i1_ty, llvm_nxv2f64_ty>;
-def int_aarch64_sve_fcvtzu_i64f16 : Builtin_SVCVT<"svcvt_u64_f16_m", llvm_nxv2i64_ty, llvm_nxv2i1_ty, llvm_nxv8f16_ty>;
-def int_aarch64_sve_fcvtzu_i64f32 : Builtin_SVCVT<"svcvt_u64_f32_m", llvm_nxv2i64_ty, llvm_nxv2i1_ty, llvm_nxv4f32_ty>;
+def int_aarch64_sve_fcvtzu_i32f16 : Builtin_SVCVT<llvm_nxv4i32_ty, llvm_nxv4i1_ty, llvm_nxv8f16_ty>;
+def int_aarch64_sve_fcvtzu_i32f64 : Builtin_SVCVT<llvm_nxv4i32_ty, llvm_nxv2i1_ty, llvm_nxv2f64_ty>;
+def int_aarch64_sve_fcvtzu_i64f16 : Builtin_SVCVT<llvm_nxv2i64_ty, llvm_nxv2i1_ty, llvm_nxv8f16_ty>;
+def int_aarch64_sve_fcvtzu_i64f32 : Builtin_SVCVT<llvm_nxv2i64_ty, llvm_nxv2i1_ty, llvm_nxv4f32_ty>;
-def int_aarch64_sve_fcvt_f16f32 : Builtin_SVCVT<"svcvt_f16_f32_m", llvm_nxv8f16_ty, llvm_nxv4i1_ty, llvm_nxv4f32_ty>;
-def int_aarch64_sve_fcvt_f16f64 : Builtin_SVCVT<"svcvt_f16_f64_m", llvm_nxv8f16_ty, llvm_nxv2i1_ty, llvm_nxv2f64_ty>;
-def int_aarch64_sve_fcvt_f32f64 : Builtin_SVCVT<"svcvt_f32_f64_m", llvm_nxv4f32_ty, llvm_nxv2i1_ty, llvm_nxv2f64_ty>;
+def int_aarch64_sve_fcvt_f16f32 : Builtin_SVCVT<llvm_nxv8f16_ty, llvm_nxv4i1_ty, llvm_nxv4f32_ty>;
+def int_aarch64_sve_fcvt_f16f64 : Builtin_SVCVT<llvm_nxv8f16_ty, llvm_nxv2i1_ty, llvm_nxv2f64_ty>;
+def int_aarch64_sve_fcvt_f32f64 : Builtin_SVCVT<llvm_nxv4f32_ty, llvm_nxv2i1_ty, llvm_nxv2f64_ty>;
-def int_aarch64_sve_fcvt_f32f16 : Builtin_SVCVT<"svcvt_f32_f16_m", llvm_nxv4f32_ty, llvm_nxv4i1_ty, llvm_nxv8f16_ty>;
-def int_aarch64_sve_fcvt_f64f16 : Builtin_SVCVT<"svcvt_f64_f16_m", llvm_nxv2f64_ty, llvm_nxv2i1_ty, llvm_nxv8f16_ty>;
-def int_aarch64_sve_fcvt_f64f32 : Builtin_SVCVT<"svcvt_f64_f32_m", llvm_nxv2f64_ty, llvm_nxv2i1_ty, llvm_nxv4f32_ty>;
+def int_aarch64_sve_fcvt_f32f16 : Builtin_SVCVT<llvm_nxv4f32_ty, llvm_nxv4i1_ty, llvm_nxv8f16_ty>;
+def int_aarch64_sve_fcvt_f64f16 : Builtin_SVCVT<llvm_nxv2f64_ty, llvm_nxv2i1_ty, llvm_nxv8f16_ty>;
+def int_aarch64_sve_fcvt_f64f32 : Builtin_SVCVT<llvm_nxv2f64_ty, llvm_nxv2i1_ty, llvm_nxv4f32_ty>;
-def int_aarch64_sve_fcvtlt_f32f16 : Builtin_SVCVT<"svcvtlt_f32_f16_m", llvm_nxv4f32_ty, llvm_nxv4i1_ty, llvm_nxv8f16_ty>;
-def int_aarch64_sve_fcvtlt_f64f32 : Builtin_SVCVT<"svcvtlt_f64_f32_m", llvm_nxv2f64_ty, llvm_nxv2i1_ty, llvm_nxv4f32_ty>;
-def int_aarch64_sve_fcvtnt_f16f32 : Builtin_SVCVT<"svcvtnt_f16_f32_m", llvm_nxv8f16_ty, llvm_nxv4i1_ty, llvm_nxv4f32_ty>;
-def int_aarch64_sve_fcvtnt_f32f64 : Builtin_SVCVT<"svcvtnt_f32_f64_m", llvm_nxv4f32_ty, llvm_nxv2i1_ty, llvm_nxv2f64_ty>;
+def int_aarch64_sve_fcvtlt_f32f16 : Builtin_SVCVT<llvm_nxv4f32_ty, llvm_nxv4i1_ty, llvm_nxv8f16_ty>;
+def int_aarch64_sve_fcvtlt_f64f32 : Builtin_SVCVT<llvm_nxv2f64_ty, llvm_nxv2i1_ty, llvm_nxv4f32_ty>;
+def int_aarch64_sve_fcvtnt_f16f32 : Builtin_SVCVT<llvm_nxv8f16_ty, llvm_nxv4i1_ty, llvm_nxv4f32_ty>;
+def int_aarch64_sve_fcvtnt_f32f64 : Builtin_SVCVT<llvm_nxv4f32_ty, llvm_nxv2i1_ty, llvm_nxv2f64_ty>;
-def int_aarch64_sve_fcvtx_f32f64 : Builtin_SVCVT<"svcvtx_f32_f64_m", llvm_nxv4f32_ty, llvm_nxv2i1_ty, llvm_nxv2f64_ty>;
-def int_aarch64_sve_fcvtxnt_f32f64 : Builtin_SVCVT<"svcvtxnt_f32_f64_m", llvm_nxv4f32_ty, llvm_nxv2i1_ty, llvm_nxv2f64_ty>;
+def int_aarch64_sve_fcvtx_f32f64 : Builtin_SVCVT<llvm_nxv4f32_ty, llvm_nxv2i1_ty, llvm_nxv2f64_ty>;
+def int_aarch64_sve_fcvtxnt_f32f64 : Builtin_SVCVT<llvm_nxv4f32_ty, llvm_nxv2i1_ty, llvm_nxv2f64_ty>;
-def int_aarch64_sve_scvtf_f16i32 : Builtin_SVCVT<"svcvt_f16_s32_m", llvm_nxv8f16_ty, llvm_nxv4i1_ty, llvm_nxv4i32_ty>;
-def int_aarch64_sve_scvtf_f16i64 : Builtin_SVCVT<"svcvt_f16_s64_m", llvm_nxv8f16_ty, llvm_nxv2i1_ty, llvm_nxv2i64_ty>;
-def int_aarch64_sve_scvtf_f32i64 : Builtin_SVCVT<"svcvt_f32_s64_m", llvm_nxv4f32_ty, llvm_nxv2i1_ty, llvm_nxv2i64_ty>;
-def int_aarch64_sve_scvtf_f64i32 : Builtin_SVCVT<"svcvt_f64_s32_m", llvm_nxv2f64_ty, llvm_nxv2i1_ty, llvm_nxv4i32_ty>;
+def int_aarch64_sve_scvtf_f16i32 : Builtin_SVCVT<llvm_nxv8f16_ty, llvm_nxv4i1_ty, llvm_nxv4i32_ty>;
+def int_aarch64_sve_scvtf_f16i64 : Builtin_SVCVT<llvm_nxv8f16_ty, llvm_nxv2i1_ty, llvm_nxv2i64_ty>;
+def int_aarch64_sve_scvtf_f32i64 : Builtin_SVCVT<llvm_nxv4f32_ty, llvm_nxv2i1_ty, llvm_nxv2i64_ty>;
+def int_aarch64_sve_scvtf_f64i32 : Builtin_SVCVT<llvm_nxv2f64_ty, llvm_nxv2i1_ty, llvm_nxv4i32_ty>;
-def int_aarch64_sve_ucvtf_f16i32 : Builtin_SVCVT<"svcvt_f16_u32_m", llvm_nxv8f16_ty, llvm_nxv4i1_ty, llvm_nxv4i32_ty>;
-def int_aarch64_sve_ucvtf_f16i64 : Builtin_SVCVT<"svcvt_f16_u64_m", llvm_nxv8f16_ty, llvm_nxv2i1_ty, llvm_nxv2i64_ty>;
-def int_aarch64_sve_ucvtf_f32i64 : Builtin_SVCVT<"svcvt_f32_u64_m", llvm_nxv4f32_ty, llvm_nxv2i1_ty, llvm_nxv2i64_ty>;
-def int_aarch64_sve_ucvtf_f64i32 : Builtin_SVCVT<"svcvt_f64_u32_m", llvm_nxv2f64_ty, llvm_nxv2i1_ty, llvm_nxv4i32_ty>;
+def int_aarch64_sve_ucvtf_f16i32 : Builtin_SVCVT<llvm_nxv8f16_ty, llvm_nxv4i1_ty, llvm_nxv4i32_ty>;
+def int_aarch64_sve_ucvtf_f16i64 : Builtin_SVCVT<llvm_nxv8f16_ty, llvm_nxv2i1_ty, llvm_nxv2i64_ty>;
+def int_aarch64_sve_ucvtf_f32i64 : Builtin_SVCVT<llvm_nxv4f32_ty, llvm_nxv2i1_ty, llvm_nxv2i64_ty>;
+def int_aarch64_sve_ucvtf_f64i32 : Builtin_SVCVT<llvm_nxv2f64_ty, llvm_nxv2i1_ty, llvm_nxv4i32_ty>;
//
// Predicate creation
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 46a7aeb39c9a..0a44670de76e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -684,7 +684,14 @@ class AMDGPUDimAtomicProfile<string opmod,
let IsAtomic = true;
}
-class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim> : AMDGPUDimProfile<"GET_RESINFO", dim> {
+class AMDGPUDimAtomicFloatProfile<string opmod, AMDGPUDimProps dim,
+ list<AMDGPUArg> dataargs>
+ : AMDGPUDimAtomicProfile<opmod, dim, dataargs> {
+ let RetTypes = [llvm_anyfloat_ty];
+}
+
+class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim>
+ : AMDGPUDimProfile<"GET_RESINFO", dim> {
let RetTypes = [llvm_anyfloat_ty];
let DataArgs = [];
let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];
@@ -860,17 +867,24 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
// atomic intrinsics
//////////////////////////////////////////////////////////////////////////
defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
- multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs> {
- foreach dim = AMDGPUDims.All in {
- def !strconcat(NAME, "_", dim.Name)
- : AMDGPUImageDimIntrinsic<
- AMDGPUDimAtomicProfile<opmod, dim, dataargs>,
- [], [SDNPMemOperand]>;
- }
+ multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs,
+ int isFloat = 0> {
+ foreach dim = AMDGPUDims.All in {
+ def !strconcat(NAME, "_", dim.Name): AMDGPUImageDimIntrinsic<
+ !if (isFloat, AMDGPUDimAtomicFloatProfile<opmod, dim, dataargs>,
+ AMDGPUDimAtomicProfile<opmod, dim, dataargs>),
+ [], [SDNPMemOperand]>;
+ }
+ }
+
+ multiclass AMDGPUImageDimAtomic<string opmod, int isFloat = 0> {
+ defm ""
+ : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">],
+ isFloat>;
}
- multiclass AMDGPUImageDimAtomic<string opmod> {
- defm "" : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">]>;
+ multiclass AMDGPUImageDimFloatAtomic<string opmod> {
+ defm "" : AMDGPUImageDimAtomic<opmod, 1 /*isFloat*/>;
}
defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">;
@@ -878,8 +892,10 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">;
defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">;
defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">;
+ defm int_amdgcn_image_atomic_fmin : AMDGPUImageDimFloatAtomic<"ATOMIC_FMIN">;
defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">;
defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">;
+ defm int_amdgcn_image_atomic_fmax : AMDGPUImageDimFloatAtomic<"ATOMIC_FMAX">;
defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">;
defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">;
defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">;
@@ -1015,8 +1031,10 @@ def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
@@ -1036,10 +1054,6 @@ def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
-// gfx90a intrinsics
-def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
-def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
-
class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
!if(NoRtn, [], [data_ty]),
[!if(NoRtn, data_ty, LLVMMatchType<0>), // vdata(VGPR)
@@ -1521,6 +1535,16 @@ def int_amdgcn_mul_u24 : Intrinsic<[llvm_i32_ty],
[IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;
+def int_amdgcn_mulhi_i24 : Intrinsic<[llvm_i32_ty],
+ [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable, IntrWillReturn]
+>;
+
+def int_amdgcn_mulhi_u24 : Intrinsic<[llvm_i32_ty],
+ [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable, IntrWillReturn]
+>;
+
// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
//
// bar_val is the total number of waves that will wait on this
diff --git a/llvm/include/llvm/IR/IntrinsicsBPF.td b/llvm/include/llvm/IR/IntrinsicsBPF.td
index 4b4dd94b1599..a6bd6f841aab 100644
--- a/llvm/include/llvm/IR/IntrinsicsBPF.td
+++ b/llvm/include/llvm/IR/IntrinsicsBPF.td
@@ -34,4 +34,7 @@ let TargetPrefix = "bpf" in { // All intrinsics start with "llvm.bpf."
[IntrNoMem]>;
def int_bpf_passthrough : GCCBuiltin<"__builtin_bpf_passthrough">,
Intrinsic<[llvm_any_ty], [llvm_i32_ty, llvm_any_ty], [IntrNoMem]>;
+ def int_bpf_compare : GCCBuiltin<"__builtin_bpf_compare">,
+ Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_anyint_ty, llvm_anyint_ty],
+ [IntrNoMem]>;
}
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index cc43d23bec1c..6f55d1ef730e 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -43,7 +43,7 @@ def llvm_shared_i64ptr_ty : LLVMQualPointerType<llvm_i64_ty, 3>; // (shared)i64*
// Helper class that represents a 'fragment' of an NVPTX *MMA instruction.
// Geom: m<M>n<N>k<K>. E.g. m8n32k16
-// Frag: [abcd]
+// Frag: [a|b|c|d] ([x1|x2|x4] for ldmatrix)
// PtxEltType: PTX type for the element.
class WMMA_REGS<string Geom, string Frag, string PtxEltType> {
string geom = Geom;
@@ -190,6 +190,11 @@ class WMMA_REGS<string Geom, string Frag, string PtxEltType> {
!eq(gft,"m16n8k256:b:b1") : !listsplat(llvm_i32_ty, 2),
!eq(gft,"m16n8k256:c:s32") : !listsplat(llvm_i32_ty, 4),
!eq(gft,"m16n8k256:d:s32") : !listsplat(llvm_i32_ty, 4),
+
+ // ldmatrix b16 -> s32 @ m8n8
+ !eq(gft,"m8n8:x1:b16") : !listsplat(llvm_i32_ty, 1),
+ !eq(gft,"m8n8:x2:b16") : !listsplat(llvm_i32_ty, 2),
+ !eq(gft,"m8n8:x4:b16") : !listsplat(llvm_i32_ty, 4),
);
}
@@ -256,6 +261,17 @@ class MMA_NAME<string ALayout, string BLayout, int Satfinite, string b1op,
!subst("llvm.", "int_", llvm));
}
+class LDMATRIX_NAME<WMMA_REGS Frag, int Trans> {
+ string intr = "llvm.nvvm.ldmatrix.sync.aligned"
+ # "." # Frag.geom
+ # "." # Frag.frag
+ # !if(Trans, ".trans", "")
+ # "." # Frag.ptx_elt_type
+ ;
+ string record = !subst(".", "_",
+ !subst("llvm.", "int_", intr));
+}
+
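For example, following the concatenation above, the x2 fragment at geometry m8n8 with element type b16 and Trans = 0 expands to the intrinsic name "llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.b16" and the record name "int_nvvm_ldmatrix_sync_aligned_m8n8_x2_b16"; with Trans set, ".trans" is inserted before the element type.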
// Generates list of 4-tuples of WMMA_REGS representing a valid MMA op.
// Geom: list of supported geometries.
// TypeN: PTX type of the corresponding fragment's element.
@@ -286,9 +302,19 @@ class MMA_LDST_OPS<list<string> Geom, list<string> Frags, list<string> Types> {
list<string> ops = !foreach(x, ret, x.gft);
}
-// Creates list of valid combinations of fragments. This is the master list that
+class LDMATRIX_OPS<list<string> Geom, list<string> Frags, list<string> Types> {
+ list<WMMA_REGS> ret =
+ !foldl([]<WMMA_REGS>, Geom, t1, geom, !listconcat(t1,
+ !foldl([]<WMMA_REGS>, Frags, t2, frag, !listconcat(t2,
+ !foldl([]<WMMA_REGS>, Types, t3, type, !listconcat(t3,
+ [WMMA_REGS<geom, frag, type>]))))));
+ // Debugging aid for readable representation of the list above.
+ list<string> ops = !foreach(x, ret, x.gft);
+}
+
+// Creates list of valid combinations of fragments. This is the main list that
// drives generation of corresponding intrinsics and instructions.
-class NVVM_MMA_OPS<int _ = 0> {
+class NVVM_MMA_OPS {
list<list<WMMA_REGS>> tf32_wmma_ops = MMA_OPS<
["m16n16k8"],
["tf32"], [], ["f32"], []>.ret;
@@ -370,11 +396,14 @@ class NVVM_MMA_OPS<int _ = 0> {
// Separate A/B/C fragments (loads) from D (stores).
list<WMMA_REGS> all_ld_ops = !filter(op, all_ldst_ops, !ne(op.frag, "d"));
list<WMMA_REGS> all_st_ops = !filter(op, all_ldst_ops, !eq(op.frag, "d"));
+
+ list<WMMA_REGS> ldmatrix_b16_ops = LDMATRIX_OPS<
+ ["m8n8"], ["x1", "x2", "x4"], ["b16"]>.ret;
+ list<WMMA_REGS> all_ldmatrix_ops = ldmatrix_b16_ops;
}
def NVVM_MMA_OPS : NVVM_MMA_OPS;
-
// Returns true if this combination of fragment and layout for WMMA load/store
// ops is supported; false otherwise.
// E.g.
@@ -489,6 +518,23 @@ class NVVM_MMA_SUPPORTED<list<WMMA_REGS> frags, string layout_a, string layout_b
);
}
+// Returns true if the fragment is valid for ldmatrix ops;
+// false otherwise.
+// E.g.
+// if NVVM_LDMATRIX_SUPPORTED<...>.ret then
+// def : FOO<>; // The record will only be defined for supported ops.
+//
+class NVVM_LDMATRIX_SUPPORTED<WMMA_REGS frag> {
+ string g = frag.geom;
+ string t = frag.ptx_elt_type;
+
+ bit ret = !cond(
+ // Only currently support m8n8 and b16
+ !and(!eq(g, "m8n8"), !eq(t, "b16")): true,
+ true: false
+ );
+}
+
class SHFL_INFO<bit sync, string mode, string type, bit return_pred> {
string Suffix = !if(sync, "sync_", "")
# mode # "_"
@@ -511,7 +557,7 @@ class SHFL_INFO<bit sync, string mode, string type, bit return_pred> {
let TargetPrefix = "nvvm" in {
def int_nvvm_prmt : GCCBuiltin<"__nvvm_prmt">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, Commutative]>;
//
@@ -519,150 +565,150 @@ let TargetPrefix = "nvvm" in {
//
def int_nvvm_fmin_f : GCCBuiltin<"__nvvm_fmin_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_fmin_ftz_f : GCCBuiltin<"__nvvm_fmin_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_fmax_f : GCCBuiltin<"__nvvm_fmax_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty]
- , [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty]
+ , [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_fmax_ftz_f : GCCBuiltin<"__nvvm_fmax_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_fmin_d : GCCBuiltin<"__nvvm_fmin_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_fmax_d : GCCBuiltin<"__nvvm_fmax_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
//
// Multiplication
//
def int_nvvm_mulhi_i : GCCBuiltin<"__nvvm_mulhi_i">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_mulhi_ui : GCCBuiltin<"__nvvm_mulhi_ui">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_mulhi_ll : GCCBuiltin<"__nvvm_mulhi_ll">,
- Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_mulhi_ull : GCCBuiltin<"__nvvm_mulhi_ull">,
- Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_mul_rn_ftz_f : GCCBuiltin<"__nvvm_mul_rn_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_mul_rn_f : GCCBuiltin<"__nvvm_mul_rn_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_mul_rz_ftz_f : GCCBuiltin<"__nvvm_mul_rz_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_mul_rz_f : GCCBuiltin<"__nvvm_mul_rz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_mul_rm_ftz_f : GCCBuiltin<"__nvvm_mul_rm_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_mul_rm_f : GCCBuiltin<"__nvvm_mul_rm_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_mul_rp_ftz_f : GCCBuiltin<"__nvvm_mul_rp_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_mul_rp_f : GCCBuiltin<"__nvvm_mul_rp_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_mul_rn_d : GCCBuiltin<"__nvvm_mul_rn_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_mul_rz_d : GCCBuiltin<"__nvvm_mul_rz_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_mul_rm_d : GCCBuiltin<"__nvvm_mul_rm_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_mul_rp_d : GCCBuiltin<"__nvvm_mul_rp_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_mul24_i : GCCBuiltin<"__nvvm_mul24_i">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_mul24_ui : GCCBuiltin<"__nvvm_mul24_ui">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
//
// Div
//
def int_nvvm_div_approx_ftz_f : GCCBuiltin<"__nvvm_div_approx_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem]>;
def int_nvvm_div_approx_f : GCCBuiltin<"__nvvm_div_approx_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem]>;
def int_nvvm_div_rn_ftz_f : GCCBuiltin<"__nvvm_div_rn_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem]>;
def int_nvvm_div_rn_f : GCCBuiltin<"__nvvm_div_rn_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem]>;
def int_nvvm_div_rz_ftz_f : GCCBuiltin<"__nvvm_div_rz_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem]>;
def int_nvvm_div_rz_f : GCCBuiltin<"__nvvm_div_rz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem]>;
def int_nvvm_div_rm_ftz_f : GCCBuiltin<"__nvvm_div_rm_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem]>;
def int_nvvm_div_rm_f : GCCBuiltin<"__nvvm_div_rm_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem]>;
def int_nvvm_div_rp_ftz_f : GCCBuiltin<"__nvvm_div_rp_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem]>;
def int_nvvm_div_rp_f : GCCBuiltin<"__nvvm_div_rp_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem]>;
def int_nvvm_div_rn_d : GCCBuiltin<"__nvvm_div_rn_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem]>;
def int_nvvm_div_rz_d : GCCBuiltin<"__nvvm_div_rz_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem]>;
def int_nvvm_div_rm_d : GCCBuiltin<"__nvvm_div_rm_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem]>;
def int_nvvm_div_rp_d : GCCBuiltin<"__nvvm_div_rp_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem]>;
//
// Sad
//
def int_nvvm_sad_i : GCCBuiltin<"__nvvm_sad_i">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, Commutative]>;
def int_nvvm_sad_ui : GCCBuiltin<"__nvvm_sad_ui">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, Commutative]>;
//
@@ -670,493 +716,493 @@ let TargetPrefix = "nvvm" in {
//
def int_nvvm_floor_ftz_f : GCCBuiltin<"__nvvm_floor_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_floor_f : GCCBuiltin<"__nvvm_floor_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_floor_d : GCCBuiltin<"__nvvm_floor_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ceil_ftz_f : GCCBuiltin<"__nvvm_ceil_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ceil_f : GCCBuiltin<"__nvvm_ceil_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ceil_d : GCCBuiltin<"__nvvm_ceil_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
//
// Abs
//
def int_nvvm_fabs_ftz_f : GCCBuiltin<"__nvvm_fabs_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_fabs_f : GCCBuiltin<"__nvvm_fabs_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_fabs_d : GCCBuiltin<"__nvvm_fabs_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
//
// Round
//
def int_nvvm_round_ftz_f : GCCBuiltin<"__nvvm_round_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_round_f : GCCBuiltin<"__nvvm_round_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_round_d : GCCBuiltin<"__nvvm_round_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
//
// Trunc
//
def int_nvvm_trunc_ftz_f : GCCBuiltin<"__nvvm_trunc_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_trunc_f : GCCBuiltin<"__nvvm_trunc_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_trunc_d : GCCBuiltin<"__nvvm_trunc_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
//
// Saturate
//
def int_nvvm_saturate_ftz_f : GCCBuiltin<"__nvvm_saturate_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_saturate_f : GCCBuiltin<"__nvvm_saturate_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_saturate_d : GCCBuiltin<"__nvvm_saturate_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
//
// Exp2 Log2
//
def int_nvvm_ex2_approx_ftz_f : GCCBuiltin<"__nvvm_ex2_approx_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_ex2_approx_f : GCCBuiltin<"__nvvm_ex2_approx_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_ex2_approx_d : GCCBuiltin<"__nvvm_ex2_approx_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
def int_nvvm_lg2_approx_ftz_f : GCCBuiltin<"__nvvm_lg2_approx_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_lg2_approx_f : GCCBuiltin<"__nvvm_lg2_approx_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_lg2_approx_d : GCCBuiltin<"__nvvm_lg2_approx_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
//
// Sin Cos
//
def int_nvvm_sin_approx_ftz_f : GCCBuiltin<"__nvvm_sin_approx_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_sin_approx_f : GCCBuiltin<"__nvvm_sin_approx_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_cos_approx_ftz_f : GCCBuiltin<"__nvvm_cos_approx_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_cos_approx_f : GCCBuiltin<"__nvvm_cos_approx_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
//
// Fma
//
def int_nvvm_fma_rn_ftz_f : GCCBuiltin<"__nvvm_fma_rn_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_fma_rn_f : GCCBuiltin<"__nvvm_fma_rn_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_fma_rz_ftz_f : GCCBuiltin<"__nvvm_fma_rz_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_fma_rz_f : GCCBuiltin<"__nvvm_fma_rz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_fma_rm_ftz_f : GCCBuiltin<"__nvvm_fma_rm_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_fma_rm_f : GCCBuiltin<"__nvvm_fma_rm_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_fma_rp_ftz_f : GCCBuiltin<"__nvvm_fma_rp_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_fma_rp_f : GCCBuiltin<"__nvvm_fma_rp_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_fma_rn_d : GCCBuiltin<"__nvvm_fma_rn_d">,
- Intrinsic<[llvm_double_ty],
+ DefaultAttrsIntrinsic<[llvm_double_ty],
[llvm_double_ty, llvm_double_ty, llvm_double_ty],
- [IntrNoMem, Commutative]>;
+ [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_fma_rz_d : GCCBuiltin<"__nvvm_fma_rz_d">,
- Intrinsic<[llvm_double_ty],
+ DefaultAttrsIntrinsic<[llvm_double_ty],
[llvm_double_ty, llvm_double_ty, llvm_double_ty],
- [IntrNoMem, Commutative]>;
+ [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_fma_rm_d : GCCBuiltin<"__nvvm_fma_rm_d">,
- Intrinsic<[llvm_double_ty],
+ DefaultAttrsIntrinsic<[llvm_double_ty],
[llvm_double_ty, llvm_double_ty, llvm_double_ty],
- [IntrNoMem, Commutative]>;
+ [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_fma_rp_d : GCCBuiltin<"__nvvm_fma_rp_d">,
- Intrinsic<[llvm_double_ty],
+ DefaultAttrsIntrinsic<[llvm_double_ty],
[llvm_double_ty, llvm_double_ty, llvm_double_ty],
- [IntrNoMem, Commutative]>;
+ [IntrNoMem, IntrSpeculatable]>;
//
// Rcp
//
def int_nvvm_rcp_rn_ftz_f : GCCBuiltin<"__nvvm_rcp_rn_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_rcp_rn_f : GCCBuiltin<"__nvvm_rcp_rn_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_rcp_rz_ftz_f : GCCBuiltin<"__nvvm_rcp_rz_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_rcp_rz_f : GCCBuiltin<"__nvvm_rcp_rz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_rcp_rm_ftz_f : GCCBuiltin<"__nvvm_rcp_rm_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_rcp_rm_f : GCCBuiltin<"__nvvm_rcp_rm_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_rcp_rp_ftz_f : GCCBuiltin<"__nvvm_rcp_rp_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_rcp_rp_f : GCCBuiltin<"__nvvm_rcp_rp_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_rcp_rn_d : GCCBuiltin<"__nvvm_rcp_rn_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
def int_nvvm_rcp_rz_d : GCCBuiltin<"__nvvm_rcp_rz_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
def int_nvvm_rcp_rm_d : GCCBuiltin<"__nvvm_rcp_rm_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
def int_nvvm_rcp_rp_d : GCCBuiltin<"__nvvm_rcp_rp_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
def int_nvvm_rcp_approx_ftz_d : GCCBuiltin<"__nvvm_rcp_approx_ftz_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
//
// Sqrt
//
def int_nvvm_sqrt_f : GCCBuiltin<"__nvvm_sqrt_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_sqrt_rn_ftz_f : GCCBuiltin<"__nvvm_sqrt_rn_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_sqrt_rn_f : GCCBuiltin<"__nvvm_sqrt_rn_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_sqrt_rz_ftz_f : GCCBuiltin<"__nvvm_sqrt_rz_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_sqrt_rz_f : GCCBuiltin<"__nvvm_sqrt_rz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_sqrt_rm_ftz_f : GCCBuiltin<"__nvvm_sqrt_rm_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_sqrt_rm_f : GCCBuiltin<"__nvvm_sqrt_rm_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_sqrt_rp_ftz_f : GCCBuiltin<"__nvvm_sqrt_rp_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_sqrt_rp_f : GCCBuiltin<"__nvvm_sqrt_rp_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_sqrt_approx_ftz_f : GCCBuiltin<"__nvvm_sqrt_approx_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_sqrt_approx_f : GCCBuiltin<"__nvvm_sqrt_approx_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_sqrt_rn_d : GCCBuiltin<"__nvvm_sqrt_rn_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
def int_nvvm_sqrt_rz_d : GCCBuiltin<"__nvvm_sqrt_rz_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
def int_nvvm_sqrt_rm_d : GCCBuiltin<"__nvvm_sqrt_rm_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
def int_nvvm_sqrt_rp_d : GCCBuiltin<"__nvvm_sqrt_rp_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
//
// Rsqrt
//
def int_nvvm_rsqrt_approx_ftz_f : GCCBuiltin<"__nvvm_rsqrt_approx_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_rsqrt_approx_f : GCCBuiltin<"__nvvm_rsqrt_approx_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_rsqrt_approx_d : GCCBuiltin<"__nvvm_rsqrt_approx_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
//
// Add
//
def int_nvvm_add_rn_ftz_f : GCCBuiltin<"__nvvm_add_rn_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_add_rn_f : GCCBuiltin<"__nvvm_add_rn_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_add_rz_ftz_f : GCCBuiltin<"__nvvm_add_rz_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_add_rz_f : GCCBuiltin<"__nvvm_add_rz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_add_rm_ftz_f : GCCBuiltin<"__nvvm_add_rm_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_add_rm_f : GCCBuiltin<"__nvvm_add_rm_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_add_rp_ftz_f : GCCBuiltin<"__nvvm_add_rp_ftz_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_add_rp_f : GCCBuiltin<"__nvvm_add_rp_f">,
- Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_add_rn_d : GCCBuiltin<"__nvvm_add_rn_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_add_rz_d : GCCBuiltin<"__nvvm_add_rz_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_add_rm_d : GCCBuiltin<"__nvvm_add_rm_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_add_rp_d : GCCBuiltin<"__nvvm_add_rp_d">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
//
// Convert
//
def int_nvvm_d2f_rn_ftz : GCCBuiltin<"__nvvm_d2f_rn_ftz">,
- Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2f_rn : GCCBuiltin<"__nvvm_d2f_rn">,
- Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2f_rz_ftz : GCCBuiltin<"__nvvm_d2f_rz_ftz">,
- Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2f_rz : GCCBuiltin<"__nvvm_d2f_rz">,
- Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2f_rm_ftz : GCCBuiltin<"__nvvm_d2f_rm_ftz">,
- Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2f_rm : GCCBuiltin<"__nvvm_d2f_rm">,
- Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2f_rp_ftz : GCCBuiltin<"__nvvm_d2f_rp_ftz">,
- Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2f_rp : GCCBuiltin<"__nvvm_d2f_rp">,
- Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2i_rn : GCCBuiltin<"__nvvm_d2i_rn">,
- Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2i_rz : GCCBuiltin<"__nvvm_d2i_rz">,
- Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2i_rm : GCCBuiltin<"__nvvm_d2i_rm">,
- Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2i_rp : GCCBuiltin<"__nvvm_d2i_rp">,
- Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2ui_rn : GCCBuiltin<"__nvvm_d2ui_rn">,
- Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2ui_rz : GCCBuiltin<"__nvvm_d2ui_rz">,
- Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2ui_rm : GCCBuiltin<"__nvvm_d2ui_rm">,
- Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2ui_rp : GCCBuiltin<"__nvvm_d2ui_rp">,
- Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_i2d_rn : GCCBuiltin<"__nvvm_i2d_rn">,
- Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_i2d_rz : GCCBuiltin<"__nvvm_i2d_rz">,
- Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_i2d_rm : GCCBuiltin<"__nvvm_i2d_rm">,
- Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_i2d_rp : GCCBuiltin<"__nvvm_i2d_rp">,
- Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ui2d_rn : GCCBuiltin<"__nvvm_ui2d_rn">,
- Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ui2d_rz : GCCBuiltin<"__nvvm_ui2d_rz">,
- Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ui2d_rm : GCCBuiltin<"__nvvm_ui2d_rm">,
- Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ui2d_rp : GCCBuiltin<"__nvvm_ui2d_rp">,
- Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2i_rn_ftz : GCCBuiltin<"__nvvm_f2i_rn_ftz">,
- Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2i_rn : GCCBuiltin<"__nvvm_f2i_rn">,
- Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2i_rz_ftz : GCCBuiltin<"__nvvm_f2i_rz_ftz">,
- Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2i_rz : GCCBuiltin<"__nvvm_f2i_rz">,
- Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2i_rm_ftz : GCCBuiltin<"__nvvm_f2i_rm_ftz">,
- Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2i_rm : GCCBuiltin<"__nvvm_f2i_rm">,
- Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2i_rp_ftz : GCCBuiltin<"__nvvm_f2i_rp_ftz">,
- Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2i_rp : GCCBuiltin<"__nvvm_f2i_rp">,
- Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ui_rn_ftz : GCCBuiltin<"__nvvm_f2ui_rn_ftz">,
- Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ui_rn : GCCBuiltin<"__nvvm_f2ui_rn">,
- Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ui_rz_ftz : GCCBuiltin<"__nvvm_f2ui_rz_ftz">,
- Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ui_rz : GCCBuiltin<"__nvvm_f2ui_rz">,
- Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ui_rm_ftz : GCCBuiltin<"__nvvm_f2ui_rm_ftz">,
- Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ui_rm : GCCBuiltin<"__nvvm_f2ui_rm">,
- Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ui_rp_ftz : GCCBuiltin<"__nvvm_f2ui_rp_ftz">,
- Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ui_rp : GCCBuiltin<"__nvvm_f2ui_rp">,
- Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_i2f_rn : GCCBuiltin<"__nvvm_i2f_rn">,
- Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_i2f_rz : GCCBuiltin<"__nvvm_i2f_rz">,
- Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_i2f_rm : GCCBuiltin<"__nvvm_i2f_rm">,
- Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_i2f_rp : GCCBuiltin<"__nvvm_i2f_rp">,
- Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ui2f_rn : GCCBuiltin<"__nvvm_ui2f_rn">,
- Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ui2f_rz : GCCBuiltin<"__nvvm_ui2f_rz">,
- Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ui2f_rm : GCCBuiltin<"__nvvm_ui2f_rm">,
- Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ui2f_rp : GCCBuiltin<"__nvvm_ui2f_rp">,
- Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_lohi_i2d : GCCBuiltin<"__nvvm_lohi_i2d">,
- Intrinsic<[llvm_double_ty], [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, Commutative]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
def int_nvvm_d2i_lo : GCCBuiltin<"__nvvm_d2i_lo">,
- Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2i_hi : GCCBuiltin<"__nvvm_d2i_hi">,
- Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ll_rn_ftz : GCCBuiltin<"__nvvm_f2ll_rn_ftz">,
- Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ll_rn : GCCBuiltin<"__nvvm_f2ll_rn">,
- Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ll_rz_ftz : GCCBuiltin<"__nvvm_f2ll_rz_ftz">,
- Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ll_rz : GCCBuiltin<"__nvvm_f2ll_rz">,
- Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ll_rm_ftz : GCCBuiltin<"__nvvm_f2ll_rm_ftz">,
- Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ll_rm : GCCBuiltin<"__nvvm_f2ll_rm">,
- Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ll_rp_ftz : GCCBuiltin<"__nvvm_f2ll_rp_ftz">,
- Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ll_rp : GCCBuiltin<"__nvvm_f2ll_rp">,
- Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ull_rn_ftz : GCCBuiltin<"__nvvm_f2ull_rn_ftz">,
- Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ull_rn : GCCBuiltin<"__nvvm_f2ull_rn">,
- Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ull_rz_ftz : GCCBuiltin<"__nvvm_f2ull_rz_ftz">,
- Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ull_rz : GCCBuiltin<"__nvvm_f2ull_rz">,
- Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ull_rm_ftz : GCCBuiltin<"__nvvm_f2ull_rm_ftz">,
- Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ull_rm : GCCBuiltin<"__nvvm_f2ull_rm">,
- Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ull_rp_ftz : GCCBuiltin<"__nvvm_f2ull_rp_ftz">,
- Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2ull_rp : GCCBuiltin<"__nvvm_f2ull_rp">,
- Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2ll_rn : GCCBuiltin<"__nvvm_d2ll_rn">,
- Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2ll_rz : GCCBuiltin<"__nvvm_d2ll_rz">,
- Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2ll_rm : GCCBuiltin<"__nvvm_d2ll_rm">,
- Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2ll_rp : GCCBuiltin<"__nvvm_d2ll_rp">,
- Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2ull_rn : GCCBuiltin<"__nvvm_d2ull_rn">,
- Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2ull_rz : GCCBuiltin<"__nvvm_d2ull_rz">,
- Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2ull_rm : GCCBuiltin<"__nvvm_d2ull_rm">,
- Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_d2ull_rp : GCCBuiltin<"__nvvm_d2ull_rp">,
- Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ll2f_rn : GCCBuiltin<"__nvvm_ll2f_rn">,
- Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ll2f_rz : GCCBuiltin<"__nvvm_ll2f_rz">,
- Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ll2f_rm : GCCBuiltin<"__nvvm_ll2f_rm">,
- Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ll2f_rp : GCCBuiltin<"__nvvm_ll2f_rp">,
- Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ull2f_rn : GCCBuiltin<"__nvvm_ull2f_rn">,
- Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ull2f_rz : GCCBuiltin<"__nvvm_ull2f_rz">,
- Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ull2f_rm : GCCBuiltin<"__nvvm_ull2f_rm">,
- Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ull2f_rp : GCCBuiltin<"__nvvm_ull2f_rp">,
- Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ll2d_rn : GCCBuiltin<"__nvvm_ll2d_rn">,
- Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ll2d_rz : GCCBuiltin<"__nvvm_ll2d_rz">,
- Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ll2d_rm : GCCBuiltin<"__nvvm_ll2d_rm">,
- Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ll2d_rp : GCCBuiltin<"__nvvm_ll2d_rp">,
- Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ull2d_rn : GCCBuiltin<"__nvvm_ull2d_rn">,
- Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ull2d_rz : GCCBuiltin<"__nvvm_ull2d_rz">,
- Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ull2d_rm : GCCBuiltin<"__nvvm_ull2d_rm">,
- Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_ull2d_rp : GCCBuiltin<"__nvvm_ull2d_rp">,
- Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2h_rn_ftz : GCCBuiltin<"__nvvm_f2h_rn_ftz">,
- Intrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_f2h_rn : GCCBuiltin<"__nvvm_f2h_rn">,
- Intrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
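// The convert intrinsics above follow the PTX cvt suffix scheme: rn rounds to
// nearest even, rz toward zero, rm toward -inf, rp toward +inf, and ftz
// flushes subnormals to zero. As an illustrative sketch (standard name
// mangling assumed), __nvvm_d2i_rz lowers to:
//   %i = call i32 @llvm.nvvm.d2i.rz(double %d)
// Tagging these DefaultAttrsIntrinsic with IntrSpeculatable records that they
// are pure, non-trapping value conversions, so they may be speculated/hoisted.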
//
// Bitcast
//
def int_nvvm_bitcast_f2i : GCCBuiltin<"__nvvm_bitcast_f2i">,
- Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_bitcast_i2f : GCCBuiltin<"__nvvm_bitcast_i2f">,
- Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_bitcast_ll2d : GCCBuiltin<"__nvvm_bitcast_ll2d">,
- Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_bitcast_d2ll : GCCBuiltin<"__nvvm_bitcast_d2ll">,
- Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
// FNS
def int_nvvm_fns : GCCBuiltin<"__nvvm_fns">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem]>;
// Atomics not available as llvm intrinsics.
@@ -1385,37 +1431,37 @@ def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty],
// - This complements the llvm bitcast, which can be used to cast one type
// of pointer to another type of pointer, while the address space remains
// the same.
-def int_nvvm_ptr_local_to_gen: Intrinsic<[llvm_anyptr_ty],
- [llvm_anyptr_ty], [IntrNoMem],
+def int_nvvm_ptr_local_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty],
+ [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.ptr.local.to.gen">;
-def int_nvvm_ptr_shared_to_gen: Intrinsic<[llvm_anyptr_ty],
- [llvm_anyptr_ty], [IntrNoMem],
+def int_nvvm_ptr_shared_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty],
+ [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.ptr.shared.to.gen">;
-def int_nvvm_ptr_global_to_gen: Intrinsic<[llvm_anyptr_ty],
- [llvm_anyptr_ty], [IntrNoMem],
+def int_nvvm_ptr_global_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty],
+ [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.ptr.global.to.gen">;
-def int_nvvm_ptr_constant_to_gen: Intrinsic<[llvm_anyptr_ty],
- [llvm_anyptr_ty], [IntrNoMem],
+def int_nvvm_ptr_constant_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty],
+ [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.ptr.constant.to.gen">;
-def int_nvvm_ptr_gen_to_global: Intrinsic<[llvm_anyptr_ty],
- [llvm_anyptr_ty], [IntrNoMem],
+def int_nvvm_ptr_gen_to_global: DefaultAttrsIntrinsic<[llvm_anyptr_ty],
+ [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.ptr.gen.to.global">;
-def int_nvvm_ptr_gen_to_shared: Intrinsic<[llvm_anyptr_ty],
- [llvm_anyptr_ty], [IntrNoMem],
+def int_nvvm_ptr_gen_to_shared: DefaultAttrsIntrinsic<[llvm_anyptr_ty],
+ [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.ptr.gen.to.shared">;
-def int_nvvm_ptr_gen_to_local: Intrinsic<[llvm_anyptr_ty],
- [llvm_anyptr_ty], [IntrNoMem],
+def int_nvvm_ptr_gen_to_local: DefaultAttrsIntrinsic<[llvm_anyptr_ty],
+ [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.ptr.gen.to.local">;
-def int_nvvm_ptr_gen_to_constant: Intrinsic<[llvm_anyptr_ty],
- [llvm_anyptr_ty], [IntrNoMem],
+def int_nvvm_ptr_gen_to_constant: DefaultAttrsIntrinsic<[llvm_anyptr_ty],
+ [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.ptr.gen.to.constant">;
// Used in nvvm internally to help address space opt and ptx code generation
// This is for params that are passed to kernel functions by pointer by-val.
def int_nvvm_ptr_gen_to_param: Intrinsic<[llvm_anyptr_ty],
[llvm_anyptr_ty],
- [IntrNoMem],
+ [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.ptr.gen.to.param">;
// Move intrinsics, used in nvvm internally
@@ -1453,149 +1499,149 @@ def int_nvvm_reflect :
// isspacep.{const, global, local, shared}
def int_nvvm_isspacep_const
- : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.isspacep.const">,
GCCBuiltin<"__nvvm_isspacep_const">;
def int_nvvm_isspacep_global
- : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.isspacep.global">,
GCCBuiltin<"__nvvm_isspacep_global">;
def int_nvvm_isspacep_local
- : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.isspacep.local">,
GCCBuiltin<"__nvvm_isspacep_local">;
def int_nvvm_isspacep_shared
- : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.isspacep.shared">,
GCCBuiltin<"__nvvm_isspacep_shared">;
// Environment register read
def int_nvvm_read_ptx_sreg_envreg0
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg0">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg0">;
def int_nvvm_read_ptx_sreg_envreg1
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg1">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg1">;
def int_nvvm_read_ptx_sreg_envreg2
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg2">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg2">;
def int_nvvm_read_ptx_sreg_envreg3
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg3">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg3">;
def int_nvvm_read_ptx_sreg_envreg4
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg4">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg4">;
def int_nvvm_read_ptx_sreg_envreg5
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg5">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg5">;
def int_nvvm_read_ptx_sreg_envreg6
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg6">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg6">;
def int_nvvm_read_ptx_sreg_envreg7
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg7">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg7">;
def int_nvvm_read_ptx_sreg_envreg8
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg8">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg8">;
def int_nvvm_read_ptx_sreg_envreg9
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg9">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg9">;
def int_nvvm_read_ptx_sreg_envreg10
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg10">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg10">;
def int_nvvm_read_ptx_sreg_envreg11
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg11">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg11">;
def int_nvvm_read_ptx_sreg_envreg12
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg12">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg12">;
def int_nvvm_read_ptx_sreg_envreg13
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg13">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg13">;
def int_nvvm_read_ptx_sreg_envreg14
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg14">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg14">;
def int_nvvm_read_ptx_sreg_envreg15
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg15">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg15">;
def int_nvvm_read_ptx_sreg_envreg16
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg16">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg16">;
def int_nvvm_read_ptx_sreg_envreg17
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg17">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg17">;
def int_nvvm_read_ptx_sreg_envreg18
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg18">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg18">;
def int_nvvm_read_ptx_sreg_envreg19
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg19">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg19">;
def int_nvvm_read_ptx_sreg_envreg20
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg20">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg20">;
def int_nvvm_read_ptx_sreg_envreg21
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg21">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg21">;
def int_nvvm_read_ptx_sreg_envreg22
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg22">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg22">;
def int_nvvm_read_ptx_sreg_envreg23
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg23">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg23">;
def int_nvvm_read_ptx_sreg_envreg24
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg24">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg24">;
def int_nvvm_read_ptx_sreg_envreg25
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg25">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg25">;
def int_nvvm_read_ptx_sreg_envreg26
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg26">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg26">;
def int_nvvm_read_ptx_sreg_envreg27
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg27">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg27">;
def int_nvvm_read_ptx_sreg_envreg28
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg28">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg28">;
def int_nvvm_read_ptx_sreg_envreg29
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg29">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg29">;
def int_nvvm_read_ptx_sreg_envreg30
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg30">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg30">;
def int_nvvm_read_ptx_sreg_envreg31
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable],
"llvm.nvvm.read.ptx.sreg.envreg31">,
GCCBuiltin<"__nvvm_read_ptx_sreg_envreg31">;
@@ -4200,49 +4246,49 @@ def int_nvvm_sust_p_3d_v4i32_trap
def int_nvvm_rotate_b32
- : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem], "llvm.nvvm.rotate.b32">,
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.b32">,
GCCBuiltin<"__nvvm_rotate_b32">;
def int_nvvm_rotate_b64
- :Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty],
- [IntrNoMem], "llvm.nvvm.rotate.b64">,
+ : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.b64">,
GCCBuiltin<"__nvvm_rotate_b64">;
def int_nvvm_rotate_right_b64
- : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty],
- [IntrNoMem], "llvm.nvvm.rotate.right.b64">,
+ : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.right.b64">,
GCCBuiltin<"__nvvm_rotate_right_b64">;
def int_nvvm_swap_lo_hi_b64
- : Intrinsic<[llvm_i64_ty], [llvm_i64_ty],
- [IntrNoMem], "llvm.nvvm.swap.lo.hi.b64">,
+ : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty],
+ [IntrNoMem, IntrSpeculatable], "llvm.nvvm.swap.lo.hi.b64">,
GCCBuiltin<"__nvvm_swap_lo_hi_b64">;
// Accessing special registers.
multiclass PTXReadSRegIntrinsic_v4i32<string regname> {
// FIXME: Do we need the 128-bit integer type version?
-// def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem]>;
+// def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem, IntrSpeculatable]>;
// FIXME: Enable this once v4i32 support is enabled in back-end.
-// def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem]>;
+// def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
- def _x : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ def _x : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>,
GCCBuiltin<"__nvvm_read_ptx_sreg_" # regname # "_x">;
- def _y : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ def _y : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>,
GCCBuiltin<"__nvvm_read_ptx_sreg_" # regname # "_y">;
- def _z : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ def _z : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>,
GCCBuiltin<"__nvvm_read_ptx_sreg_" # regname # "_z">;
- def _w : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ def _w : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>,
GCCBuiltin<"__nvvm_read_ptx_sreg_" # regname # "_w">;
}
class PTXReadSRegIntrinsic_r32<string name>
- : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>,
GCCBuiltin<"__nvvm_read_ptx_sreg_" # name>;
class PTXReadSRegIntrinsic_r64<string name>
- : Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>,
+ : DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>,
GCCBuiltin<"__nvvm_read_ptx_sreg_" # name>;
// Intrinsics to read registers with non-constant values. E.g. the values that
@@ -4519,4 +4565,20 @@ foreach layout_a = ["row", "col"] in {
} // layout_b
} // layout_a
+// LDMATRIX
+class NVVM_LDMATRIX<WMMA_REGS Frag, int Transposed>
+ : Intrinsic<Frag.regs, [llvm_anyptr_ty],
+ [IntrReadMem, IntrArgMemOnly, ReadOnly<ArgIndex<0>>,
+ NoCapture<ArgIndex<0>>],
+ LDMATRIX_NAME<Frag, Transposed>.intr>;
+
+foreach transposed = [0, 1] in {
+ foreach frag = NVVM_MMA_OPS.all_ldmatrix_ops in {
+ if NVVM_LDMATRIX_SUPPORTED<frag>.ret then {
+ def LDMATRIX_NAME<frag, transposed>.record
+ : NVVM_LDMATRIX<frag, transposed>;
+ }
+ }
+}
+
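// The foreach above emits one ldmatrix intrinsic per supported
// fragment/transposed pair, with the name taken from
// LDMATRIX_NAME<frag, transposed>.intr. The attribute set (IntrReadMem,
// IntrArgMemOnly, ReadOnly/NoCapture on the pointer) states that each load
// only reads through its pointer argument, so unrelated stores may be
// reordered across it.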
} // let TargetPrefix = "nvvm"
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index 92d3bdea37ed..8290342c0d51 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -31,10 +31,12 @@ let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.".
// Get content from current FPSCR register
def int_ppc_readflm : GCCBuiltin<"__builtin_readflm">,
- Intrinsic<[llvm_double_ty], [], [IntrNoMem]>;
+ Intrinsic<[llvm_double_ty], [],
+ [IntrNoMerge, IntrHasSideEffects]>;
// Set FPSCR register, and return previous content
def int_ppc_setflm : GCCBuiltin<"__builtin_setflm">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], []>;
+ Intrinsic<[llvm_double_ty], [llvm_double_ty],
+ [IntrHasSideEffects]>;
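// Illustrative consequence of the attribute change: readflm and setflm access
// the FPSCR, whose status bits change as FP instructions execute, so identical
// reads must not be merged:
//   %a = call double @llvm.ppc.readflm()
//   ... floating-point code that may set FPSCR status bits ...
//   %b = call double @llvm.ppc.readflm()   ; kept distinct by IntrNoMerge
// The old [IntrNoMem] attribute would have allowed CSE of the two calls.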
// Intrinsics for [double]word extended forms of divide instructions
def int_ppc_divwe : GCCBuiltin<"__builtin_divwe">,
@@ -50,6 +52,15 @@ let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.".
Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
[IntrNoMem]>;
+ def int_ppc_unpack_longdouble : GCCBuiltin<"__builtin_unpack_longdouble">,
+ Intrinsic<[llvm_double_ty],
+ [llvm_ppcf128_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+ def int_ppc_pack_longdouble : GCCBuiltin<"__builtin_pack_longdouble">,
+ Intrinsic<[llvm_ppcf128_ty],
+ [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem]>;
+
// Generate a random number
def int_ppc_darn : GCCBuiltin<"__builtin_darn">,
Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>;
@@ -1042,6 +1053,9 @@ let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.".
def int_ppc_altivec_vbpermq : GCCBuiltin<"__builtin_altivec_vbpermq">,
Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
+ def int_ppc_altivec_vbpermd : GCCBuiltin<"__builtin_altivec_vbpermd">,
+ Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v16i8_ty],
+ [IntrNoMem]>;
}
def int_ppc_altivec_vexptefp : PowerPC_Vec_FF_Intrinsic<"vexptefp">;
@@ -1626,8 +1640,7 @@ let TargetPrefix = "ppc" in {
Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
// load
def int_ppc_load2r
- : GCCBuiltin<"__builtin_ppc_load2r">,
- Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>;
+ : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>;
def int_ppc_load4r
: GCCBuiltin<"__builtin_ppc_load4r">,
Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>;
@@ -1706,7 +1719,10 @@ let TargetPrefix = "ppc" in {
def int_ppc_fres
: GCCBuiltin<"__builtin_ppc_fres">,
Intrinsic <[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
-
+ def int_ppc_addex
+ : GCCBuiltin<"__builtin_ppc_addex">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty],
+ [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<2>>]>;
def int_ppc_fsel : GCCBuiltin<"__builtin_ppc_fsel">,
Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty,
llvm_double_ty], [IntrNoMem]>;
@@ -1717,6 +1733,33 @@ let TargetPrefix = "ppc" in {
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
def int_ppc_frsqrtes : GCCBuiltin<"__builtin_ppc_frsqrtes">,
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_ppc_compare_exp_uo : GCCBuiltin<"__builtin_ppc_compare_exp_uo">,
+ Intrinsic<[llvm_i32_ty],
+ [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem]>;
+ def int_ppc_compare_exp_lt : GCCBuiltin<"__builtin_ppc_compare_exp_lt">,
+ Intrinsic<[llvm_i32_ty],
+ [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem]>;
+ def int_ppc_compare_exp_gt : GCCBuiltin<"__builtin_ppc_compare_exp_gt">,
+ Intrinsic<[llvm_i32_ty],
+ [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem]>;
+ def int_ppc_compare_exp_eq : GCCBuiltin<"__builtin_ppc_compare_exp_eq">,
+ Intrinsic<[llvm_i32_ty],
+ [llvm_double_ty, llvm_double_ty],
+ [IntrNoMem]>;
+ def int_ppc_test_data_class_d : Intrinsic<[llvm_i32_ty],
+ [llvm_double_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+ def int_ppc_test_data_class_f : Intrinsic<[llvm_i32_ty],
+ [llvm_float_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+
+ def int_ppc_convert_f128_to_ppcf128
+ : Intrinsic<[llvm_ppcf128_ty], [llvm_f128_ty], [IntrNoMem]>;
+ def int_ppc_convert_ppcf128_to_f128
+ : Intrinsic<[llvm_f128_ty], [llvm_ppcf128_ty], [IntrNoMem]>;
}
//===----------------------------------------------------------------------===//
@@ -1738,4 +1781,11 @@ let TargetPrefix = "ppc" in {
llvm_i64_ty, llvm_i64_ty,
llvm_i64_ty, llvm_i64_ty],
[IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
+ def int_ppc_atomic_load_i128 :
+ Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+ [llvm_ptr_ty],
+ [IntrArgMemOnly, IntrReadMem, NoCapture<ArgIndex<0>>]>;
+ def int_ppc_atomic_store_i128 :
+ Intrinsic<[], [llvm_i64_ty, llvm_i64_ty, llvm_ptr_ty],
+ [IntrArgMemOnly, IntrWriteMem, NoCapture<ArgIndex<2>>]>;
}
diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td
index a46709bf09d1..3ceb347e97bf 100644
--- a/llvm/include/llvm/IR/IntrinsicsRISCV.td
+++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td
@@ -159,16 +159,17 @@ let TargetPrefix = "riscv" in {
[NoCapture<ArgIndex<0>>]>,
RISCVVIntrinsic;
// For unit stride load with mask
- // Input: (maskedoff, pointer, mask, vl)
+ // Input: (maskedoff, pointer, mask, vl, ta)
class RISCVUSLoadMask
: Intrinsic<[llvm_anyvector_ty ],
[LLVMMatchType<0>,
LLVMPointerType<LLVMMatchType<0>>,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
- llvm_anyint_ty],
- [NoCapture<ArgIndex<1>>, IntrReadMem]>, RISCVVIntrinsic;
+ llvm_anyint_ty, LLVMMatchType<1>],
+ [NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<4>>, IntrReadMem]>,
+ RISCVVIntrinsic;
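// Illustrative call with the new trailing tail-policy operand; the overload
// mangling below is assumed, and the operand must be an immediate (here 1,
// i.e. tail agnostic):
//   %v = call <vscale x 2 x i32> @llvm.riscv.vle.mask.nxv2i32.i64(
//            <vscale x 2 x i32> %maskedoff, <vscale x 2 x i32>* %ptr,
//            <vscale x 2 x i1> %mask, i64 %vl, i64 1)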
// For unit stride fault-only-first load with mask
- // Input: (maskedoff, pointer, mask, vl)
+ // Input: (maskedoff, pointer, mask, vl, ta)
// Output: (data, vl)
// NOTE: We model this with default memory properties since we model writing
// VL as a side effect. IntrReadMem, IntrHasSideEffects does not work.
@@ -177,8 +178,8 @@ let TargetPrefix = "riscv" in {
[LLVMMatchType<0>,
LLVMPointerType<LLVMMatchType<0>>,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
- LLVMMatchType<1>],
- [NoCapture<ArgIndex<1>>]>, RISCVVIntrinsic;
+ LLVMMatchType<1>, LLVMMatchType<1>],
+ [NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<4>>]>, RISCVVIntrinsic;
// For strided load
// Input: (pointer, stride, vl)
class RISCVSLoad
@@ -187,13 +188,15 @@ let TargetPrefix = "riscv" in {
llvm_anyint_ty, LLVMMatchType<1>],
[NoCapture<ArgIndex<0>>, IntrReadMem]>, RISCVVIntrinsic;
// For strided load with mask
- // Input: (maskedoff, pointer, stride, mask, vl)
+ // Input: (maskedoff, pointer, stride, mask, vl, ta)
class RISCVSLoadMask
: Intrinsic<[llvm_anyvector_ty ],
[LLVMMatchType<0>,
LLVMPointerType<LLVMMatchType<0>>, llvm_anyint_ty,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<1>],
- [NoCapture<ArgIndex<1>>, IntrReadMem]>, RISCVVIntrinsic;
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<1>,
+ LLVMMatchType<1>],
+ [NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<5>>, IntrReadMem]>,
+ RISCVVIntrinsic;
// For indexed load
// Input: (pointer, index, vl)
class RISCVILoad
@@ -202,13 +205,15 @@ let TargetPrefix = "riscv" in {
llvm_anyvector_ty, llvm_anyint_ty],
[NoCapture<ArgIndex<0>>, IntrReadMem]>, RISCVVIntrinsic;
// For indexed load with mask
- // Input: (maskedoff, pointer, index, mask, vl)
+ // Input: (maskedoff, pointer, index, mask, vl, ta)
class RISCVILoadMask
: Intrinsic<[llvm_anyvector_ty ],
[LLVMMatchType<0>,
LLVMPointerType<LLVMMatchType<0>>, llvm_anyvector_ty,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty],
- [NoCapture<ArgIndex<1>>, IntrReadMem]>, RISCVVIntrinsic;
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty,
+ LLVMMatchType<2>],
+ [NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<5>>, IntrReadMem]>,
+ RISCVVIntrinsic;
// For unit stride store
// Input: (vector_in, pointer, vl)
class RISCVUSStore
@@ -265,10 +270,16 @@ let TargetPrefix = "riscv" in {
[LLVMMatchType<0>, llvm_anyint_ty],
[IntrNoMem]>, RISCVVIntrinsic;
// For destination vector type is the same as first source vector (with mask).
- // Input: (vector_in, mask, vl)
+ // Input: (vector_in, mask, vl, ta)
class RISCVUnaryAAMask
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty,
+ LLVMMatchType<1>],
+ [ImmArg<ArgIndex<4>>, IntrNoMem]>, RISCVVIntrinsic;
+ class RISCVUnaryAAMaskNoTA
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty],
[IntrNoMem]>, RISCVVIntrinsic;
// For destination vector type is the same as first and second source vector.
@@ -284,12 +295,13 @@ let TargetPrefix = "riscv" in {
[LLVMMatchType<0>, LLVMVectorOfBitcastsToInt<0>, llvm_anyint_ty],
[IntrNoMem]>, RISCVVIntrinsic;
// For destination vector type is the same as first and second source vector.
- // Input: (vector_in, vector_in, int_vector_in, vl)
+ // Input: (vector_in, vector_in, int_vector_in, vl, ta)
class RISCVRGatherVVMask
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMVectorOfBitcastsToInt<0>,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty],
- [IntrNoMem]>, RISCVVIntrinsic;
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty,
+ LLVMMatchType<1>],
+ [ImmArg<ArgIndex<5>>, IntrNoMem]>, RISCVVIntrinsic;
// Input: (vector_in, int16_vector_in, vl)
class RISCVRGatherEI16VVNoMask
: Intrinsic<[llvm_anyvector_ty],
@@ -297,13 +309,14 @@ let TargetPrefix = "riscv" in {
llvm_anyint_ty],
[IntrNoMem]>, RISCVVIntrinsic;
// For destination vector type is the same as first and second source vector.
- // Input: (vector_in, vector_in, int16_vector_in, vl)
+ // Input: (vector_in, vector_in, int16_vector_in, vl, ta)
class RISCVRGatherEI16VVMask
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>,
LLVMScalarOrSameVectorWidth<0, llvm_i16_ty>,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty],
- [IntrNoMem]>, RISCVVIntrinsic;
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty,
+ LLVMMatchType<1>],
+ [ImmArg<ArgIndex<5>>, IntrNoMem]>, RISCVVIntrinsic;
// For destination vector type is the same as first source vector, and the
// second operand is XLen.
// Input: (vector_in, xlen_in, vl)
@@ -314,12 +327,13 @@ let TargetPrefix = "riscv" in {
}
// For destination vector type is the same as first source vector (with mask).
// Second operand is XLen.
- // Input: (maskedoff, vector_in, xlen_in, mask, vl)
+ // Input: (maskedoff, vector_in, xlen_in, mask, vl, ta)
class RISCVGatherVXMask
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyint_ty,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<1>],
- [IntrNoMem]>, RISCVVIntrinsic {
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<1>,
+ LLVMMatchType<1>],
+ [ImmArg<ArgIndex<5>>, IntrNoMem]>, RISCVVIntrinsic {
}
// For destination vector type is the same as first source vector.
// Input: (vector_in, vector_in/scalar_in, vl)
@@ -330,12 +344,13 @@ let TargetPrefix = "riscv" in {
let SplatOperand = 2;
}
// For destination vector type is the same as first source vector (with mask).
- // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl)
+ // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, ta)
class RISCVBinaryAAXMask
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty],
- [IntrNoMem]>, RISCVVIntrinsic {
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty,
+ LLVMMatchType<2>],
+ [ImmArg<ArgIndex<5>>, IntrNoMem]>, RISCVVIntrinsic {
let SplatOperand = 3;
}
// For destination vector type is the same as first source vector. The
@@ -347,12 +362,13 @@ let TargetPrefix = "riscv" in {
[IntrNoMem]>, RISCVVIntrinsic;
// For destination vector type is the same as first source vector (with mask).
// The second source operand must match the destination type or be an XLen scalar.
- // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl)
+ // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, ta)
class RISCVBinaryAAShiftMask
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty],
- [IntrNoMem]>, RISCVVIntrinsic;
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty,
+ LLVMMatchType<2>],
+ [ImmArg<ArgIndex<5>>, IntrNoMem]>, RISCVVIntrinsic;
// For destination vector type is NOT the same as first source vector.
// Input: (vector_in, vector_in/scalar_in, vl)
class RISCVBinaryABXNoMask
@@ -362,12 +378,13 @@ let TargetPrefix = "riscv" in {
let SplatOperand = 2;
}
// For destination vector type is NOT the same as first source vector (with mask).
- // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl)
+ // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, ta)
class RISCVBinaryABXMask
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, llvm_anyvector_ty, llvm_any_ty,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty],
- [IntrNoMem]>, RISCVVIntrinsic {
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty,
+ LLVMMatchType<3>],
+ [ImmArg<ArgIndex<5>>, IntrNoMem]>, RISCVVIntrinsic {
let SplatOperand = 3;
}
// For destination vector type is NOT the same as first source vector. The
@@ -379,12 +396,13 @@ let TargetPrefix = "riscv" in {
[IntrNoMem]>, RISCVVIntrinsic;
// For destination vector type is NOT the same as first source vector (with mask).
// The second source operand must match the destination type or be an XLen scalar.
- // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl)
+ // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, ta)
class RISCVBinaryABShiftMask
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, llvm_anyvector_ty, llvm_any_ty,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty],
- [IntrNoMem]>, RISCVVIntrinsic;
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty,
+ LLVMMatchType<3>],
+ [ImmArg<ArgIndex<5>>, IntrNoMem]>, RISCVVIntrinsic;
// For binary operations with V0 as input.
// Input: (vector_in, vector_in/scalar_in, V0, vl)
class RISCVBinaryWithV0
@@ -461,12 +479,13 @@ let TargetPrefix = "riscv" in {
}
// For Saturating binary operations with mask.
// The destination vector type is the same as first source vector.
- // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl)
+ // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, ta)
class RISCVSaturatingBinaryAAXMask
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty],
- [IntrNoMem, IntrHasSideEffects]>, RISCVVIntrinsic {
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty,
+ LLVMMatchType<2>],
+ [ImmArg<ArgIndex<5>>, IntrNoMem, IntrHasSideEffects]>, RISCVVIntrinsic {
let SplatOperand = 3;
}
// For Saturating binary operations.
@@ -480,12 +499,13 @@ let TargetPrefix = "riscv" in {
// For Saturating binary operations with mask.
// The destination vector type is the same as first source vector.
// The second source operand matches the destination type or is an XLen scalar.
- // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl)
+ // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, ta)
class RISCVSaturatingBinaryAAShiftMask
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty],
- [IntrNoMem, IntrHasSideEffects]>, RISCVVIntrinsic;
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty,
+ LLVMMatchType<2>],
+ [ImmArg<ArgIndex<5>>, IntrNoMem, IntrHasSideEffects]>, RISCVVIntrinsic;
// For Saturating binary operations.
// The destination vector type is NOT the same as first source vector.
// The second source operand matches the destination type or is an XLen scalar.
@@ -497,12 +517,13 @@ let TargetPrefix = "riscv" in {
// For Saturating binary operations with mask.
// The destination vector type is NOT the same as first source vector (with mask).
// The second source operand matches the destination type or is an XLen scalar.
- // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl)
+ // Input: (maskedoff, vector_in, vector_in/scalar_in, mask, vl, ta)
class RISCVSaturatingBinaryABShiftMask
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, llvm_anyvector_ty, llvm_any_ty,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty],
- [IntrNoMem, IntrHasSideEffects]>, RISCVVIntrinsic;
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty,
+ LLVMMatchType<3>],
+ [ImmArg<ArgIndex<5>>, IntrNoMem, IntrHasSideEffects]>, RISCVVIntrinsic;
class RISCVTernaryAAAXNoMask
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyint_ty,
@@ -579,13 +600,13 @@ let TargetPrefix = "riscv" in {
[llvm_anyvector_ty, llvm_anyint_ty],
[IntrNoMem]>, RISCVVIntrinsic;
// For destination vector type is NOT the same as source vector (with mask).
- // Input: (maskedoff, vector_in, mask, vl)
+ // Input: (maskedoff, vector_in, mask, vl, ta)
class RISCVUnaryABMask
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, llvm_anyvector_ty,
LLVMScalarOrSameVectorWidth<1, llvm_i1_ty>,
- llvm_anyint_ty],
- [IntrNoMem]>, RISCVVIntrinsic;
+ llvm_anyint_ty, LLVMMatchType<2>],
+ [ImmArg<ArgIndex<4>>, IntrNoMem]>, RISCVVIntrinsic;
// For unary operations with the same vector type in/out without mask
// Output: (vector)
// Input: (vector_in, vl)
@@ -614,12 +635,13 @@ let TargetPrefix = "riscv" in {
[llvm_anyvector_ty, llvm_anyint_ty],
[IntrNoMem]>, RISCVVIntrinsic;
// For Conversion unary operations with mask.
- // Input: (maskedoff, vector_in, mask, vl)
+ // Input: (maskedoff, vector_in, mask, vl, ta)
class RISCVConversionMask
: Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, llvm_anyvector_ty,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty],
- [IntrNoMem]>, RISCVVIntrinsic;
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty,
+ LLVMMatchType<2>],
+ [ImmArg<ArgIndex<4>>, IntrNoMem]>, RISCVVIntrinsic;
// For atomic operations without mask
// Input: (base, index, value, vl)
class RISCVAMONoMask
@@ -643,15 +665,16 @@ let TargetPrefix = "riscv" in {
[LLVMPointerToElt<0>, llvm_anyint_ty],
[NoCapture<ArgIndex<0>>, IntrReadMem]>, RISCVVIntrinsic;
// For unit stride segment load with mask
- // Input: (maskedoff, pointer, mask, vl)
+ // Input: (maskedoff, pointer, mask, vl, ta)
class RISCVUSSegLoadMask<int nf>
: Intrinsic<!listconcat([llvm_anyvector_ty], !listsplat(LLVMMatchType<0>,
!add(nf, -1))),
!listconcat(!listsplat(LLVMMatchType<0>, nf),
[LLVMPointerToElt<0>,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
- llvm_anyint_ty]),
- [NoCapture<ArgIndex<nf>>, IntrReadMem]>, RISCVVIntrinsic;
+ llvm_anyint_ty, LLVMMatchType<1>]),
+ [ImmArg<ArgIndex<!add(nf, 3)>>, NoCapture<ArgIndex<nf>>, IntrReadMem]>,
+ RISCVVIntrinsic;
// For unit stride fault-only-first segment load
// Input: (pointer, vl)
@@ -664,7 +687,7 @@ let TargetPrefix = "riscv" in {
[LLVMPointerToElt<0>, LLVMMatchType<1>],
[NoCapture<ArgIndex<0>>]>, RISCVVIntrinsic;
// For unit stride fault-only-first segment load with mask
- // Input: (maskedoff, pointer, mask, vl)
+ // Input: (maskedoff, pointer, mask, vl, ta)
// Output: (data, vl)
// NOTE: We model this with default memory properties since we model writing
// VL as a side effect. IntrReadMem, IntrHasSideEffects does not work.
@@ -674,8 +697,9 @@ let TargetPrefix = "riscv" in {
!listconcat(!listsplat(LLVMMatchType<0>, nf),
[LLVMPointerToElt<0>,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
- LLVMMatchType<1>]),
- [NoCapture<ArgIndex<nf>>]>, RISCVVIntrinsic;
+ LLVMMatchType<1>, LLVMMatchType<1>]),
+ [ImmArg<ArgIndex<!add(nf, 3)>>, NoCapture<ArgIndex<nf>>]>,
+ RISCVVIntrinsic;
// For stride segment load
// Input: (pointer, offset, vl)
@@ -685,7 +709,7 @@ let TargetPrefix = "riscv" in {
[LLVMPointerToElt<0>, llvm_anyint_ty, LLVMMatchType<1>],
[NoCapture<ArgIndex<0>>, IntrReadMem]>, RISCVVIntrinsic;
// For stride segment load with mask
- // Input: (maskedoff, pointer, offset, mask, vl)
+ // Input: (maskedoff, pointer, offset, mask, vl, ta)
class RISCVSSegLoadMask<int nf>
: Intrinsic<!listconcat([llvm_anyvector_ty], !listsplat(LLVMMatchType<0>,
!add(nf, -1))),
@@ -693,8 +717,9 @@ let TargetPrefix = "riscv" in {
[LLVMPointerToElt<0>,
llvm_anyint_ty,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
- LLVMMatchType<1>]),
- [NoCapture<ArgIndex<nf>>, IntrReadMem]>, RISCVVIntrinsic;
+ LLVMMatchType<1>, LLVMMatchType<1>]),
+ [ImmArg<ArgIndex<!add(nf, 4)>>, NoCapture<ArgIndex<nf>>, IntrReadMem]>,
+ RISCVVIntrinsic;
// For indexed segment load
// Input: (pointer, index, vl)
@@ -704,7 +729,7 @@ let TargetPrefix = "riscv" in {
[LLVMPointerToElt<0>, llvm_anyvector_ty, llvm_anyint_ty],
[NoCapture<ArgIndex<0>>, IntrReadMem]>, RISCVVIntrinsic;
// For indexed segment load with mask
- // Input: (maskedoff, pointer, index, mask, vl)
+ // Input: (maskedoff, pointer, index, mask, vl, ta)
class RISCVISegLoadMask<int nf>
: Intrinsic<!listconcat([llvm_anyvector_ty], !listsplat(LLVMMatchType<0>,
!add(nf, -1))),
@@ -712,8 +737,9 @@ let TargetPrefix = "riscv" in {
[LLVMPointerToElt<0>,
llvm_anyvector_ty,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
- llvm_anyint_ty]),
- [NoCapture<ArgIndex<nf>>, IntrReadMem]>, RISCVVIntrinsic;
+ llvm_anyint_ty, LLVMMatchType<2>]),
+ [ImmArg<ArgIndex<!add(nf, 4)>>, NoCapture<ArgIndex<nf>>, IntrReadMem]>,
+ RISCVVIntrinsic;
// For unit stride segment store
// Input: (value, pointer, vl)
@@ -947,8 +973,8 @@ let TargetPrefix = "riscv" in {
defm vsoxei : RISCVIStore;
defm vsuxei : RISCVIStore;
- def int_riscv_vle1 : RISCVUSLoad;
- def int_riscv_vse1 : RISCVUSStore;
+ def int_riscv_vlm : RISCVUSLoad;
+ def int_riscv_vsm : RISCVUSStore;
defm vamoswap : RISCVAMO;
defm vamoadd : RISCVAMO;
@@ -1049,7 +1075,7 @@ let TargetPrefix = "riscv" in {
defm vssubu : RISCVSaturatingBinaryAAX;
defm vssub : RISCVSaturatingBinaryAAX;
- def int_riscv_vmerge : RISCVBinaryWithV0;
+ defm vmerge : RISCVBinaryWithV0;
def int_riscv_vmv_v_v : Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, llvm_anyint_ty],
@@ -1124,7 +1150,7 @@ let TargetPrefix = "riscv" in {
defm vrgather_vx : RISCVRGatherVX;
defm vrgatherei16_vv : RISCVRGatherEI16VV;
- def "int_riscv_vcompress" : RISCVUnaryAAMask;
+ def "int_riscv_vcompress" : RISCVUnaryAAMaskNoTA;
defm vaaddu : RISCVSaturatingBinaryAAX;
defm vaadd : RISCVSaturatingBinaryAAX;
@@ -1159,25 +1185,25 @@ let TargetPrefix = "riscv" in {
defm vwredsum : RISCVReduction;
defm vfredosum : RISCVReduction;
- defm vfredsum : RISCVReduction;
+ defm vfredusum : RISCVReduction;
defm vfredmin : RISCVReduction;
defm vfredmax : RISCVReduction;
- defm vfwredsum : RISCVReduction;
+ defm vfwredusum : RISCVReduction;
defm vfwredosum : RISCVReduction;
def int_riscv_vmand: RISCVBinaryAAANoMask;
def int_riscv_vmnand: RISCVBinaryAAANoMask;
- def int_riscv_vmandnot: RISCVBinaryAAANoMask;
+ def int_riscv_vmandn: RISCVBinaryAAANoMask;
def int_riscv_vmxor: RISCVBinaryAAANoMask;
def int_riscv_vmor: RISCVBinaryAAANoMask;
def int_riscv_vmnor: RISCVBinaryAAANoMask;
- def int_riscv_vmornot: RISCVBinaryAAANoMask;
+ def int_riscv_vmorn: RISCVBinaryAAANoMask;
def int_riscv_vmxnor: RISCVBinaryAAANoMask;
def int_riscv_vmclr : RISCVNullaryIntrinsic;
def int_riscv_vmset : RISCVNullaryIntrinsic;
- defm vpopc : RISCVMaskUnarySOut;
+ defm vcpop : RISCVMaskUnarySOut;
defm vfirst : RISCVMaskUnarySOut;
defm vmsbf : RISCVMaskUnaryMOut;
defm vmsof : RISCVMaskUnaryMOut;
@@ -1245,4 +1271,15 @@ let TargetPrefix = "riscv" in {
defm vsuxseg # nf : RISCVISegStore<nf>;
}
+ // Strided loads/stores for fixed vectors.
+ def int_riscv_masked_strided_load
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, llvm_anyptr_ty,
+ llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [NoCapture<ArgIndex<1>>, IntrReadMem]>;
+ def int_riscv_masked_strided_store
+ : Intrinsic<[],
+ [llvm_anyvector_ty, llvm_anyptr_ty,
+ llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [NoCapture<ArgIndex<1>>, IntrWriteMem]>;
} // TargetPrefix = "riscv"
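
The two fixed-vector strided intrinsics added above are overloaded on the result vector, the pointer, and the stride integer. A minimal caller-side sketch, assuming the generated Intrinsic::riscv_masked_strided_load ID and the usual getDeclaration/IRBuilder flow; the helper name and operand values are illustrative, not part of the patch:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsRISCV.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Sketch: emit riscv.masked.strided.load for a fixed-width vector.
// Operand order follows the TableGen declaration above: passthru, pointer,
// stride, mask; the overload types are the result vector, pointer and
// stride integer types.
static Value *emitMaskedStridedLoad(IRBuilder<> &B, Module &M, Value *PassThru,
                                    Value *Ptr, Value *Stride, Value *Mask) {
  Function *Fn = Intrinsic::getDeclaration(
      &M, Intrinsic::riscv_masked_strided_load,
      {PassThru->getType(), Ptr->getType(), Stride->getType()});
  return B.CreateCall(Fn, {PassThru, Ptr, Stride, Mask});
}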
diff --git a/llvm/include/llvm/IR/IntrinsicsSystemZ.td b/llvm/include/llvm/IR/IntrinsicsSystemZ.td
index 81435e98bea0..a149b571072c 100644
--- a/llvm/include/llvm/IR/IntrinsicsSystemZ.td
+++ b/llvm/include/llvm/IR/IntrinsicsSystemZ.td
@@ -144,7 +144,7 @@ multiclass SystemZBinaryCCBHF {
def fs : SystemZBinaryCC<llvm_v4i32_ty>;
}
-multiclass SystemZCompareBHFG<string name> {
+multiclass SystemZCompareBHFG {
def bs : SystemZBinaryCC<llvm_v16i8_ty>;
def hs : SystemZBinaryCC<llvm_v8i16_ty>;
def fs : SystemZBinaryCC<llvm_v4i32_ty>;
@@ -341,9 +341,9 @@ let TargetPrefix = "s390" in {
def int_s390_vtm : SystemZBinaryConv<"vtm", llvm_i32_ty, llvm_v16i8_ty>;
- defm int_s390_vceq : SystemZCompareBHFG<"vceq">;
- defm int_s390_vch : SystemZCompareBHFG<"vch">;
- defm int_s390_vchl : SystemZCompareBHFG<"vchl">;
+ defm int_s390_vceq : SystemZCompareBHFG;
+ defm int_s390_vch : SystemZCompareBHFG;
+ defm int_s390_vchl : SystemZCompareBHFG;
defm int_s390_vfae : SystemZTernaryIntBHF<"vfae">;
defm int_s390_vfae : SystemZTernaryIntCCBHF;
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
index 11990554037d..6a8e6c797f85 100644
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -50,7 +50,8 @@ def int_wasm_trunc_saturate_unsigned : Intrinsic<[llvm_anyint_ty],
//===----------------------------------------------------------------------===//
// throw / rethrow
-// The immediate argument is an index to a tag, which is 0 for C++.
+// The first immediate argument is an index to a tag, which is 0 for C++
+// exception. The second argument is the thrown exception pointer.
def int_wasm_throw : Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty],
[Throws, IntrNoReturn, ImmArg<ArgIndex<0>>]>;
def int_wasm_rethrow : Intrinsic<[], [], [Throws, IntrNoReturn]>;
@@ -63,8 +64,9 @@ def int_wasm_get_ehselector : Intrinsic<[llvm_i32_ty], [llvm_token_ty],
[IntrHasSideEffects]>;
// wasm.catch returns the pointer to the exception object caught by wasm 'catch'
-// instruction. This returns a single pointer, which is sufficient for C++
-// support. The immediate argument is an index to for a tag, which is 0 for C++.
+// instruction. This returns a single pointer, which is the case for C++
+// exceptions. The immediate argument is an index to a tag, which is 0 for
+// C++ exceptions.
def int_wasm_catch : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty],
[IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
@@ -162,6 +164,15 @@ def int_wasm_q15mulr_sat_signed :
[llvm_v8i16_ty, llvm_v8i16_ty],
[IntrNoMem, IntrSpeculatable]>;
+def int_wasm_pmin :
+ Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem, IntrSpeculatable]>;
+def int_wasm_pmax :
+ Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem, IntrSpeculatable]>;
+
def int_wasm_extadd_pairwise_signed :
Intrinsic<[llvm_anyvector_ty],
[LLVMSubdivide2VectorType<0>],
@@ -172,6 +183,59 @@ def int_wasm_extadd_pairwise_unsigned :
[IntrNoMem, IntrSpeculatable]>;
//===----------------------------------------------------------------------===//
+// Relaxed SIMD intrinsics (experimental)
+//===----------------------------------------------------------------------===//
+
+def int_wasm_fma :
+ Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem, IntrSpeculatable]>;
+def int_wasm_fms :
+ Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem, IntrSpeculatable]>;
+
+def int_wasm_laneselect :
+ Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem, IntrSpeculatable]>;
+
+def int_wasm_relaxed_swizzle :
+ Intrinsic<[llvm_v16i8_ty],
+ [llvm_v16i8_ty, llvm_v16i8_ty],
+ [IntrNoMem, IntrSpeculatable]>;
+
+def int_wasm_relaxed_min :
+ Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem, IntrSpeculatable]>;
+def int_wasm_relaxed_max :
+ Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem, IntrSpeculatable]>;
+
+def int_wasm_relaxed_trunc_signed:
+ Intrinsic<[llvm_v4i32_ty],
+ [llvm_v4f32_ty],
+ [IntrNoMem, IntrSpeculatable]>;
+
+def int_wasm_relaxed_trunc_unsigned:
+ Intrinsic<[llvm_v4i32_ty],
+ [llvm_v4f32_ty],
+ [IntrNoMem, IntrSpeculatable]>;
+
+def int_wasm_relaxed_trunc_zero_signed:
+ Intrinsic<[llvm_v4i32_ty],
+ [llvm_v2f64_ty],
+ [IntrNoMem, IntrSpeculatable]>;
+
+def int_wasm_relaxed_trunc_zero_unsigned:
+ Intrinsic<[llvm_v4i32_ty],
+ [llvm_v2f64_ty],
+ [IntrNoMem, IntrSpeculatable]>;
+
+
+//===----------------------------------------------------------------------===//
// Thread-local storage intrinsics
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 5848356b5b1a..8de737a1c7a5 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -792,7 +792,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_sse41_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw128">,
Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty,llvm_i8_ty],
- [IntrNoMem, Commutative, ImmArg<ArgIndex<2>>]>;
+ [IntrNoMem, ImmArg<ArgIndex<2>>]>;
}
// Test instruction with bitwise comparison.
@@ -1779,7 +1779,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
llvm_v32i8_ty], [IntrNoMem]>;
def int_x86_avx2_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw256">,
Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
- llvm_i8_ty], [IntrNoMem, Commutative, ImmArg<ArgIndex<2>>]>;
+ llvm_i8_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
}
//===----------------------------------------------------------------------===//
@@ -5093,6 +5093,10 @@ let TargetPrefix = "x86" in {
[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
llvm_x86amx_ty, llvm_x86amx_ty,
llvm_x86amx_ty], []>;
+ def int_x86_cast_vector_to_tile:
+ Intrinsic<[llvm_x86amx_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+ def int_x86_cast_tile_to_vector:
+ Intrinsic<[llvm_anyvector_ty], [llvm_x86amx_ty], [IntrNoMem]>;
}
//===----------------------------------------------------------------------===//
@@ -5108,3 +5112,757 @@ let TargetPrefix = "x86" in {
def int_x86_senduipi : GCCBuiltin<"__builtin_ia32_senduipi">,
Intrinsic<[], [llvm_i64_ty], []>;
}
+
+//===----------------------------------------------------------------------===//
+// avx512_fp16: vaddph
+let TargetPrefix = "x86" in {
+ def int_x86_avx512fp16_add_ph_512
+ : GCCBuiltin<"__builtin_ia32_addph512">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<2>> ]>;
+ def int_x86_avx512fp16_sub_ph_512
+ : GCCBuiltin<"__builtin_ia32_subph512">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<2>> ]>;
+ def int_x86_avx512fp16_mul_ph_512
+ : GCCBuiltin<"__builtin_ia32_mulph512">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<2>> ]>;
+ def int_x86_avx512fp16_div_ph_512
+ : GCCBuiltin<"__builtin_ia32_divph512">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<2>> ]>;
+ def int_x86_avx512fp16_max_ph_128
+ : GCCBuiltin<"__builtin_ia32_maxph128">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_max_ph_256
+ : GCCBuiltin<"__builtin_ia32_maxph256">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f16_ty, llvm_v16f16_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_max_ph_512
+ : GCCBuiltin<"__builtin_ia32_maxph512">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<2>> ]>;
+ def int_x86_avx512fp16_min_ph_128
+ : GCCBuiltin<"__builtin_ia32_minph128">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_min_ph_256
+ : GCCBuiltin<"__builtin_ia32_minph256">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f16_ty, llvm_v16f16_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_min_ph_512
+ : GCCBuiltin<"__builtin_ia32_minph512">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<2>> ]>;
+
+ def int_x86_avx512fp16_mask_cmp_ph_512
+ : Intrinsic<[ llvm_v32i1_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty, llvm_v32i1_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_cmp_ph_256
+ : Intrinsic<[ llvm_v16i1_ty ],
+ [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_i32_ty, llvm_v16i1_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<2>> ]>;
+ def int_x86_avx512fp16_mask_cmp_ph_128
+ : Intrinsic<[ llvm_v8i1_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i32_ty, llvm_v8i1_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<2>> ]>;
+
+ def int_x86_avx512fp16_mask_add_sh_round
+ : GCCBuiltin<"__builtin_ia32_addsh_round_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_sub_sh_round
+ : GCCBuiltin<"__builtin_ia32_subsh_round_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_mul_sh_round
+ : GCCBuiltin<"__builtin_ia32_mulsh_round_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_div_sh_round
+ : GCCBuiltin<"__builtin_ia32_divsh_round_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_min_sh_round
+ : GCCBuiltin<"__builtin_ia32_minsh_round_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_max_sh_round
+ : GCCBuiltin<"__builtin_ia32_maxsh_round_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_cmp_sh
+ : GCCBuiltin<"__builtin_ia32_cmpsh_mask">,
+ Intrinsic<[ llvm_i8_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i32_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_vcomi_sh
+ : GCCBuiltin<"__builtin_ia32_vcomish">,
+ Intrinsic<[ llvm_i32_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i32_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>> ]>;
+
+ def int_x86_avx512fp16_mask_vcvtph2psx_128
+ : GCCBuiltin<"__builtin_ia32_vcvtph2psx128_mask">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v8f16_ty, llvm_v4f32_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtph2psx_256
+ : GCCBuiltin<"__builtin_ia32_vcvtph2psx256_mask">,
+ Intrinsic<[ llvm_v8f32_ty ],
+ [ llvm_v8f16_ty, llvm_v8f32_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtph2psx_512
+ : GCCBuiltin<"__builtin_ia32_vcvtph2psx512_mask">,
+ Intrinsic<[ llvm_v16f32_ty ],
+ [ llvm_v16f16_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_mask_vcvtps2phx_128
+ : GCCBuiltin<"__builtin_ia32_vcvtps2phx128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v4f32_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtps2phx_256
+ : GCCBuiltin<"__builtin_ia32_vcvtps2phx256_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f32_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtps2phx_512
+ : GCCBuiltin<"__builtin_ia32_vcvtps2phx512_mask">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f32_ty, llvm_v16f16_ty, llvm_i16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_mask_vcvtpd2ph_128
+ : GCCBuiltin<"__builtin_ia32_vcvtpd2ph128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v2f64_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtpd2ph_256
+ : GCCBuiltin<"__builtin_ia32_vcvtpd2ph256_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v4f64_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtpd2ph_512
+ : GCCBuiltin<"__builtin_ia32_vcvtpd2ph512_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f64_ty, llvm_v8f16_ty, llvm_i8_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_mask_vcvtph2pd_128
+ : GCCBuiltin<"__builtin_ia32_vcvtph2pd128_mask">,
+ Intrinsic<[ llvm_v2f64_ty ],
+ [ llvm_v8f16_ty, llvm_v2f64_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtph2pd_256
+ : GCCBuiltin<"__builtin_ia32_vcvtph2pd256_mask">,
+ Intrinsic<[ llvm_v4f64_ty ],
+ [ llvm_v8f16_ty, llvm_v4f64_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtph2pd_512
+ : GCCBuiltin<"__builtin_ia32_vcvtph2pd512_mask">,
+ Intrinsic<[ llvm_v8f64_ty ],
+ [ llvm_v8f16_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_mask_vcvtsh2ss_round
+ : GCCBuiltin<"__builtin_ia32_vcvtsh2ss_round_mask">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v8f16_ty, llvm_v4f32_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_vcvtss2sh_round
+ : GCCBuiltin<"__builtin_ia32_vcvtss2sh_round_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v4f32_ty, llvm_v8f16_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_vcvtsd2sh_round
+ : GCCBuiltin<"__builtin_ia32_vcvtsd2sh_round_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v2f64_ty, llvm_v8f16_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_vcvtsh2sd_round
+ : GCCBuiltin<"__builtin_ia32_vcvtsh2sd_round_mask">,
+ Intrinsic<[ llvm_v2f64_ty ],
+ [ llvm_v2f64_ty, llvm_v8f16_ty, llvm_v2f64_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+
+ def int_x86_avx512fp16_mask_vcvtph2w_128
+ : GCCBuiltin<"__builtin_ia32_vcvtph2w128_mask">,
+ Intrinsic<[ llvm_v8i16_ty ],
+ [ llvm_v8f16_ty, llvm_v8i16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtph2w_256
+ : GCCBuiltin<"__builtin_ia32_vcvtph2w256_mask">,
+ Intrinsic<[ llvm_v16i16_ty ],
+ [ llvm_v16f16_ty, llvm_v16i16_ty, llvm_i16_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtph2w_512
+ : GCCBuiltin<"__builtin_ia32_vcvtph2w512_mask">,
+ Intrinsic<[ llvm_v32i16_ty ],
+ [ llvm_v32f16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_mask_vcvttph2w_128
+ : GCCBuiltin<"__builtin_ia32_vcvttph2w128_mask">,
+ Intrinsic<[ llvm_v8i16_ty ],
+ [ llvm_v8f16_ty, llvm_v8i16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvttph2w_256
+ : GCCBuiltin<"__builtin_ia32_vcvttph2w256_mask">,
+ Intrinsic<[ llvm_v16i16_ty ],
+ [ llvm_v16f16_ty, llvm_v16i16_ty, llvm_i16_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvttph2w_512
+ : GCCBuiltin<"__builtin_ia32_vcvttph2w512_mask">,
+ Intrinsic<[ llvm_v32i16_ty ],
+ [ llvm_v32f16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_mask_vcvtph2uw_128
+ : GCCBuiltin<"__builtin_ia32_vcvtph2uw128_mask">,
+ Intrinsic<[ llvm_v8i16_ty ],
+ [ llvm_v8f16_ty, llvm_v8i16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtph2uw_256
+ : GCCBuiltin<"__builtin_ia32_vcvtph2uw256_mask">,
+ Intrinsic<[ llvm_v16i16_ty ],
+ [ llvm_v16f16_ty, llvm_v16i16_ty, llvm_i16_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtph2uw_512
+ : GCCBuiltin<"__builtin_ia32_vcvtph2uw512_mask">,
+ Intrinsic<[ llvm_v32i16_ty ],
+ [ llvm_v32f16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_mask_vcvttph2uw_128
+ : GCCBuiltin<"__builtin_ia32_vcvttph2uw128_mask">,
+ Intrinsic<[ llvm_v8i16_ty ],
+ [ llvm_v8f16_ty, llvm_v8i16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvttph2uw_256
+ : GCCBuiltin<"__builtin_ia32_vcvttph2uw256_mask">,
+ Intrinsic<[ llvm_v16i16_ty ],
+ [ llvm_v16f16_ty, llvm_v16i16_ty, llvm_i16_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvttph2uw_512
+ : GCCBuiltin<"__builtin_ia32_vcvttph2uw512_mask">,
+ Intrinsic<[ llvm_v32i16_ty ],
+ [ llvm_v32f16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+
+ def int_x86_avx512fp16_mask_vcvtph2dq_128
+ : GCCBuiltin<"__builtin_ia32_vcvtph2dq128_mask">,
+ Intrinsic<[ llvm_v4i32_ty ],
+ [ llvm_v8f16_ty, llvm_v4i32_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtph2dq_256
+ : GCCBuiltin<"__builtin_ia32_vcvtph2dq256_mask">,
+ Intrinsic<[ llvm_v8i32_ty ],
+ [ llvm_v8f16_ty, llvm_v8i32_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtph2dq_512
+ : GCCBuiltin<"__builtin_ia32_vcvtph2dq512_mask">,
+ Intrinsic<[ llvm_v16i32_ty ],
+ [ llvm_v16f16_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_mask_vcvtph2udq_128
+ : GCCBuiltin<"__builtin_ia32_vcvtph2udq128_mask">,
+ Intrinsic<[ llvm_v4i32_ty ],
+ [ llvm_v8f16_ty, llvm_v4i32_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtph2udq_256
+ : GCCBuiltin<"__builtin_ia32_vcvtph2udq256_mask">,
+ Intrinsic<[ llvm_v8i32_ty ],
+ [ llvm_v8f16_ty, llvm_v8i32_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtph2udq_512
+ : GCCBuiltin<"__builtin_ia32_vcvtph2udq512_mask">,
+ Intrinsic<[ llvm_v16i32_ty ],
+ [ llvm_v16f16_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_mask_vcvtdq2ph_128
+ : GCCBuiltin<"__builtin_ia32_vcvtdq2ph128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v4i32_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtudq2ph_128
+ : GCCBuiltin<"__builtin_ia32_vcvtudq2ph128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v4i32_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvttph2dq_128
+ : GCCBuiltin<"__builtin_ia32_vcvttph2dq128_mask">,
+ Intrinsic<[ llvm_v4i32_ty ],
+ [ llvm_v8f16_ty, llvm_v4i32_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvttph2dq_256
+ : GCCBuiltin<"__builtin_ia32_vcvttph2dq256_mask">,
+ Intrinsic<[ llvm_v8i32_ty ],
+ [ llvm_v8f16_ty, llvm_v8i32_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvttph2dq_512
+ : GCCBuiltin<"__builtin_ia32_vcvttph2dq512_mask">,
+ Intrinsic<[ llvm_v16i32_ty ],
+ [ llvm_v16f16_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_mask_vcvttph2udq_128
+ : GCCBuiltin<"__builtin_ia32_vcvttph2udq128_mask">,
+ Intrinsic<[ llvm_v4i32_ty ],
+ [ llvm_v8f16_ty, llvm_v4i32_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvttph2udq_256
+ : GCCBuiltin<"__builtin_ia32_vcvttph2udq256_mask">,
+ Intrinsic<[ llvm_v8i32_ty ],
+ [ llvm_v8f16_ty, llvm_v8i32_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvttph2udq_512
+ : GCCBuiltin<"__builtin_ia32_vcvttph2udq512_mask">,
+ Intrinsic<[ llvm_v16i32_ty ],
+ [ llvm_v16f16_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+
+ def int_x86_avx512fp16_mask_vcvtqq2ph_128
+ : GCCBuiltin<"__builtin_ia32_vcvtqq2ph128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v2i64_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtqq2ph_256
+ : GCCBuiltin<"__builtin_ia32_vcvtqq2ph256_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v4i64_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtph2qq_128
+ : GCCBuiltin<"__builtin_ia32_vcvtph2qq128_mask">,
+ Intrinsic<[ llvm_v2i64_ty ],
+ [ llvm_v8f16_ty, llvm_v2i64_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtph2qq_256
+ : GCCBuiltin<"__builtin_ia32_vcvtph2qq256_mask">,
+ Intrinsic<[ llvm_v4i64_ty ],
+ [ llvm_v8f16_ty, llvm_v4i64_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtph2qq_512
+ : GCCBuiltin<"__builtin_ia32_vcvtph2qq512_mask">,
+ Intrinsic<[ llvm_v8i64_ty ],
+ [ llvm_v8f16_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_mask_vcvtuqq2ph_128
+ : GCCBuiltin<"__builtin_ia32_vcvtuqq2ph128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v2i64_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtuqq2ph_256
+ : GCCBuiltin<"__builtin_ia32_vcvtuqq2ph256_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v4i64_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtph2uqq_128
+ : GCCBuiltin<"__builtin_ia32_vcvtph2uqq128_mask">,
+ Intrinsic<[ llvm_v2i64_ty ],
+ [ llvm_v8f16_ty, llvm_v2i64_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtph2uqq_256
+ : GCCBuiltin<"__builtin_ia32_vcvtph2uqq256_mask">,
+ Intrinsic<[ llvm_v4i64_ty ],
+ [ llvm_v8f16_ty, llvm_v4i64_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvtph2uqq_512
+ : GCCBuiltin<"__builtin_ia32_vcvtph2uqq512_mask">,
+ Intrinsic<[ llvm_v8i64_ty ],
+ [ llvm_v8f16_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_mask_vcvttph2qq_128
+ : GCCBuiltin<"__builtin_ia32_vcvttph2qq128_mask">,
+ Intrinsic<[ llvm_v2i64_ty ],
+ [ llvm_v8f16_ty, llvm_v2i64_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvttph2qq_256
+ : GCCBuiltin<"__builtin_ia32_vcvttph2qq256_mask">,
+ Intrinsic<[ llvm_v4i64_ty ],
+ [ llvm_v8f16_ty, llvm_v4i64_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvttph2qq_512
+ : GCCBuiltin<"__builtin_ia32_vcvttph2qq512_mask">,
+ Intrinsic<[ llvm_v8i64_ty ],
+ [ llvm_v8f16_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_mask_vcvttph2uqq_128
+ : GCCBuiltin<"__builtin_ia32_vcvttph2uqq128_mask">,
+ Intrinsic<[ llvm_v2i64_ty ],
+ [ llvm_v8f16_ty, llvm_v2i64_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvttph2uqq_256
+ : GCCBuiltin<"__builtin_ia32_vcvttph2uqq256_mask">,
+ Intrinsic<[ llvm_v4i64_ty ],
+ [ llvm_v8f16_ty, llvm_v4i64_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vcvttph2uqq_512
+ : GCCBuiltin<"__builtin_ia32_vcvttph2uqq512_mask">,
+ Intrinsic<[ llvm_v8i64_ty ],
+ [ llvm_v8f16_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+
+ def int_x86_avx512fp16_vcvtsh2si32
+ : GCCBuiltin<"__builtin_ia32_vcvtsh2si32">,
+ Intrinsic<[ llvm_i32_ty ], [ llvm_v8f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_vcvtsh2usi32
+ : GCCBuiltin<"__builtin_ia32_vcvtsh2usi32">,
+ Intrinsic<[ llvm_i32_ty ], [ llvm_v8f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_vcvtsh2si64
+ : GCCBuiltin<"__builtin_ia32_vcvtsh2si64">,
+ Intrinsic<[ llvm_i64_ty ], [ llvm_v8f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_vcvtsh2usi64
+ : GCCBuiltin<"__builtin_ia32_vcvtsh2usi64">,
+ Intrinsic<[ llvm_i64_ty ], [ llvm_v8f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_vcvtusi2sh
+ : GCCBuiltin<"__builtin_ia32_vcvtusi2sh">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_i32_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<2>> ]>;
+ def int_x86_avx512fp16_vcvtusi642sh
+ : GCCBuiltin<"__builtin_ia32_vcvtusi642sh">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_i64_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<2>> ]>;
+ def int_x86_avx512fp16_vcvtsi2sh
+ : GCCBuiltin<"__builtin_ia32_vcvtsi2sh">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_i32_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<2>> ]>;
+ def int_x86_avx512fp16_vcvtsi642sh
+ : GCCBuiltin<"__builtin_ia32_vcvtsi642sh">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_i64_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<2>> ]>;
+ def int_x86_avx512fp16_vcvttsh2si32
+ : GCCBuiltin<"__builtin_ia32_vcvttsh2si32">,
+ Intrinsic<[ llvm_i32_ty ], [ llvm_v8f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_vcvttsh2si64
+ : GCCBuiltin<"__builtin_ia32_vcvttsh2si64">,
+ Intrinsic<[ llvm_i64_ty ], [ llvm_v8f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_vcvttsh2usi32
+ : GCCBuiltin<"__builtin_ia32_vcvttsh2usi32">,
+ Intrinsic<[ llvm_i32_ty ], [ llvm_v8f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_vcvttsh2usi64
+ : GCCBuiltin<"__builtin_ia32_vcvttsh2usi64">,
+ Intrinsic<[ llvm_i64_ty ], [ llvm_v8f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+
+ def int_x86_avx512fp16_sqrt_ph_512
+ : Intrinsic<[ llvm_v32f16_ty ], [ llvm_v32f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_mask_sqrt_sh
+ : Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_rsqrt_ph_128
+ : GCCBuiltin<"__builtin_ia32_rsqrtph128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_rsqrt_ph_256
+ : GCCBuiltin<"__builtin_ia32_rsqrtph256_mask">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_i16_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_rsqrt_ph_512
+ : GCCBuiltin<"__builtin_ia32_rsqrtph512_mask">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_rsqrt_sh
+ : GCCBuiltin<"__builtin_ia32_rsqrtsh_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_rcp_ph_128
+ : GCCBuiltin<"__builtin_ia32_rcpph128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_rcp_ph_256
+ : GCCBuiltin<"__builtin_ia32_rcpph256_mask">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_i16_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_rcp_ph_512
+ : GCCBuiltin<"__builtin_ia32_rcpph512_mask">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_rcp_sh
+ : GCCBuiltin<"__builtin_ia32_rcpsh_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_reduce_ph_128
+ : GCCBuiltin<"__builtin_ia32_reduceph128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_i32_ty, llvm_v8f16_ty, llvm_i8_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_mask_reduce_ph_256
+ : GCCBuiltin<"__builtin_ia32_reduceph256_mask">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f16_ty, llvm_i32_ty, llvm_v16f16_ty, llvm_i16_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_mask_reduce_ph_512
+ : GCCBuiltin<"__builtin_ia32_reduceph512_mask">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_i32_ty, llvm_v32f16_ty, llvm_i32_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_reduce_sh
+ : GCCBuiltin<"__builtin_ia32_reducesh_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty,
+ llvm_i32_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>> ]>;
+ def int_x86_avx512fp16_fpclass_ph_128
+ : Intrinsic<[ llvm_v8i1_ty ], [ llvm_v8f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_fpclass_ph_256
+ : Intrinsic<[ llvm_v16i1_ty ], [ llvm_v16f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_fpclass_ph_512
+ : Intrinsic<[ llvm_v32i1_ty ], [ llvm_v32f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_mask_fpclass_sh
+ : GCCBuiltin<"__builtin_ia32_fpclasssh_mask">,
+ Intrinsic<[ llvm_i8_ty ], [ llvm_v8f16_ty, llvm_i32_ty, llvm_i8_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_mask_getexp_ph_128
+ : GCCBuiltin<"__builtin_ia32_getexpph128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_getexp_ph_256
+ : GCCBuiltin<"__builtin_ia32_getexpph256_mask">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_i16_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_getexp_ph_512
+ : GCCBuiltin<"__builtin_ia32_getexpph512_mask">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_mask_getexp_sh
+ : GCCBuiltin<"__builtin_ia32_getexpsh128_round_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_getmant_ph_128
+ : GCCBuiltin<"__builtin_ia32_getmantph128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_i32_ty, llvm_v8f16_ty, llvm_i8_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_mask_getmant_ph_256
+ : GCCBuiltin<"__builtin_ia32_getmantph256_mask">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f16_ty, llvm_i32_ty, llvm_v16f16_ty, llvm_i16_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_mask_getmant_ph_512
+ : GCCBuiltin<"__builtin_ia32_getmantph512_mask">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_i32_ty, llvm_v32f16_ty, llvm_i32_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_getmant_sh
+ : GCCBuiltin<"__builtin_ia32_getmantsh_round_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i32_ty, llvm_v8f16_ty,
+ llvm_i8_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>> ]>;
+ def int_x86_avx512fp16_mask_rndscale_ph_128
+ : GCCBuiltin<"__builtin_ia32_rndscaleph_128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_i32_ty, llvm_v8f16_ty, llvm_i8_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_mask_rndscale_ph_256
+ : GCCBuiltin<"__builtin_ia32_rndscaleph_256_mask">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f16_ty, llvm_i32_ty, llvm_v16f16_ty, llvm_i16_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+ def int_x86_avx512fp16_mask_rndscale_ph_512
+ : GCCBuiltin<"__builtin_ia32_rndscaleph_mask">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_i32_ty, llvm_v32f16_ty, llvm_i32_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_rndscale_sh
+ : GCCBuiltin<"__builtin_ia32_rndscalesh_round_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty,
+ llvm_i32_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>> ]>;
+ def int_x86_avx512fp16_mask_scalef_ph_128
+ : GCCBuiltin<"__builtin_ia32_scalefph128_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_scalef_ph_256
+ : GCCBuiltin<"__builtin_ia32_scalefph256_mask">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_v16f16_ty, llvm_i16_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_scalef_ph_512
+ : GCCBuiltin<"__builtin_ia32_scalefph512_mask">,
+ Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_scalef_sh
+ : GCCBuiltin<"__builtin_ia32_scalefsh_round_mask">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+
+ def int_x86_avx512fp16_vfmadd_ph_512
+ : Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_vfmaddsub_ph_128
+ : GCCBuiltin<"__builtin_ia32_vfmaddsubph">,
+ Intrinsic<[ llvm_v8f16_ty ],
+ [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_vfmaddsub_ph_256
+ : GCCBuiltin<"__builtin_ia32_vfmaddsubph256">,
+ Intrinsic<[ llvm_v16f16_ty ],
+ [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_v16f16_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_vfmaddsub_ph_512
+ : Intrinsic<[ llvm_v32f16_ty ],
+ [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+ def int_x86_avx512fp16_vfmadd_f16
+ : Intrinsic<[ llvm_half_ty ],
+ [ llvm_half_ty, llvm_half_ty, llvm_half_ty, llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+
+ def int_x86_avx512fp16_mask_vfcmadd_cph_128
+ : GCCBuiltin<"__builtin_ia32_vfcmaddcph128_mask">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_maskz_vfcmadd_cph_128
+ : GCCBuiltin<"__builtin_ia32_vfcmaddcph128_maskz">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vfcmadd_cph_256
+ : GCCBuiltin<"__builtin_ia32_vfcmaddcph256_mask">,
+ Intrinsic<[ llvm_v8f32_ty ],
+ [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_maskz_vfcmadd_cph_256
+ : GCCBuiltin<"__builtin_ia32_vfcmaddcph256_maskz">,
+ Intrinsic<[ llvm_v8f32_ty ],
+ [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vfcmadd_cph_512
+ : GCCBuiltin<"__builtin_ia32_vfcmaddcph512_mask3">,
+ Intrinsic<[ llvm_v16f32_ty ],
+ [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_maskz_vfcmadd_cph_512
+ : GCCBuiltin<"__builtin_ia32_vfcmaddcph512_maskz">,
+ Intrinsic<[ llvm_v16f32_ty ],
+ [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_vfmadd_cph_128
+ : GCCBuiltin<"__builtin_ia32_vfmaddcph128_mask">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_maskz_vfmadd_cph_128
+ : GCCBuiltin<"__builtin_ia32_vfmaddcph128_maskz">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vfmadd_cph_256
+ : GCCBuiltin<"__builtin_ia32_vfmaddcph256_mask">,
+ Intrinsic<[ llvm_v8f32_ty ],
+ [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_maskz_vfmadd_cph_256
+ : GCCBuiltin<"__builtin_ia32_vfmaddcph256_maskz">,
+ Intrinsic<[ llvm_v8f32_ty ],
+ [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vfmadd_cph_512
+ : GCCBuiltin<"__builtin_ia32_vfmaddcph512_mask3">,
+ Intrinsic<[ llvm_v16f32_ty ],
+ [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_maskz_vfmadd_cph_512
+ : GCCBuiltin<"__builtin_ia32_vfmaddcph512_maskz">,
+ Intrinsic<[ llvm_v16f32_ty ],
+ [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_vfmadd_csh
+ : GCCBuiltin<"__builtin_ia32_vfmaddcsh_mask">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_maskz_vfmadd_csh
+ : GCCBuiltin<"__builtin_ia32_vfmaddcsh_maskz">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_vfcmadd_csh
+ : GCCBuiltin<"__builtin_ia32_vfcmaddcsh_mask">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_maskz_vfcmadd_csh
+ : GCCBuiltin<"__builtin_ia32_vfcmaddcsh_maskz">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_vfmul_cph_128
+ : GCCBuiltin<"__builtin_ia32_vfmulcph128_mask">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vfcmul_cph_128
+ : GCCBuiltin<"__builtin_ia32_vfcmulcph128_mask">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vfmul_cph_256
+ : GCCBuiltin<"__builtin_ia32_vfmulcph256_mask">,
+ Intrinsic<[ llvm_v8f32_ty ],
+ [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vfcmul_cph_256
+ : GCCBuiltin<"__builtin_ia32_vfcmulcph256_mask">,
+ Intrinsic<[ llvm_v8f32_ty ],
+ [ llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty ],
+ [ IntrNoMem ]>;
+ def int_x86_avx512fp16_mask_vfmul_cph_512
+ : GCCBuiltin<"__builtin_ia32_vfmulcph512_mask">,
+ Intrinsic<[ llvm_v16f32_ty ],
+ [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_vfcmul_cph_512
+ : GCCBuiltin<"__builtin_ia32_vfcmulcph512_mask">,
+ Intrinsic<[ llvm_v16f32_ty ],
+ [ llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_vfmul_csh
+ : GCCBuiltin<"__builtin_ia32_vfmulcsh_mask">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+ def int_x86_avx512fp16_mask_vfcmul_csh
+ : GCCBuiltin<"__builtin_ia32_vfcmulcsh_mask">,
+ Intrinsic<[ llvm_v4f32_ty ],
+ [ llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+ llvm_i32_ty ],
+ [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+}
diff --git a/llvm/include/llvm/IR/LLVMContext.h b/llvm/include/llvm/IR/LLVMContext.h
index bc605f108340..1c902ebce5ad 100644
--- a/llvm/include/llvm/IR/LLVMContext.h
+++ b/llvm/include/llvm/IR/LLVMContext.h
@@ -305,6 +305,10 @@ public:
/// LLVMContext is used by compilation.
void setOptPassGate(OptPassGate&);
+ /// Enable opaque pointers. Can only be called before creating the first
+ /// pointer type.
+ void enableOpaquePointers() const;
+
/// Whether typed pointers are supported. If false, all pointers are opaque.
bool supportsTypedPointers() const;
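
A minimal usage sketch for the new hook, based only on the declarations shown here; the key constraint is ordering, since the call must happen before the context creates its first pointer type:

#include "llvm/IR/LLVMContext.h"

using namespace llvm;

// Sketch: opt a context into opaque pointers before building any IR in it.
static void setUpContext(LLVMContext &Ctx) {
  Ctx.enableOpaquePointers();   // must precede the first pointer type
  // supportsTypedPointers() should now report false for this context.
}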
diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h
index b14127df2182..6cc5797269e2 100644
--- a/llvm/include/llvm/IR/MatrixBuilder.h
+++ b/llvm/include/llvm/IR/MatrixBuilder.h
@@ -74,7 +74,7 @@ public:
Value *Ops[] = {DataPtr, Stride, B.getInt1(IsVolatile), B.getInt32(Rows),
B.getInt32(Columns)};
- Type *OverloadedTypes[] = {RetType};
+ Type *OverloadedTypes[] = {RetType, Stride->getType()};
Function *TheFn = Intrinsic::getDeclaration(
getModule(), Intrinsic::matrix_column_major_load, OverloadedTypes);
@@ -82,7 +82,7 @@ public:
CallInst *Call = B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name);
Attribute AlignAttr =
Attribute::getWithAlignment(Call->getContext(), Alignment);
- Call->addAttribute(1, AlignAttr);
+ Call->addParamAttr(0, AlignAttr);
return Call;
}
@@ -97,7 +97,7 @@ public:
Value *Ops[] = {Matrix, Ptr,
Stride, B.getInt1(IsVolatile),
B.getInt32(Rows), B.getInt32(Columns)};
- Type *OverloadedTypes[] = {Matrix->getType()};
+ Type *OverloadedTypes[] = {Matrix->getType(), Stride->getType()};
Function *TheFn = Intrinsic::getDeclaration(
getModule(), Intrinsic::matrix_column_major_store, OverloadedTypes);
@@ -105,7 +105,7 @@ public:
CallInst *Call = B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name);
Attribute AlignAttr =
Attribute::getWithAlignment(Call->getContext(), Alignment);
- Call->addAttribute(2, AlignAttr);
+ Call->addParamAttr(1, AlignAttr);
return Call;
}
@@ -231,9 +231,23 @@ public:
: (IsUnsigned ? B.CreateUDiv(LHS, RHS) : B.CreateSDiv(LHS, RHS));
}
- /// Extracts the element at (\p RowIdx, \p ColumnIdx) from \p Matrix.
- Value *CreateExtractElement(Value *Matrix, Value *RowIdx, Value *ColumnIdx,
- unsigned NumRows, Twine const &Name = "") {
+ /// Create an assumption that \p Idx is less than \p NumElements.
+ void CreateIndexAssumption(Value *Idx, unsigned NumElements,
+ Twine const &Name = "") {
+
+ Value *NumElts =
+ B.getIntN(Idx->getType()->getScalarSizeInBits(), NumElements);
+ auto *Cmp = B.CreateICmpULT(Idx, NumElts);
+ if (auto *ConstCond = dyn_cast<ConstantInt>(Cmp))
+ assert(ConstCond->isOne() && "Index must be valid!");
+ else
+ B.CreateAssumption(Cmp);
+ }
+
+ /// Compute the index to access the element at (\p RowIdx, \p ColumnIdx) from
+ /// a matrix with \p NumRows embedded in a vector.
+ Value *CreateIndex(Value *RowIdx, Value *ColumnIdx, unsigned NumRows,
+ Twine const &Name = "") {
unsigned MaxWidth = std::max(RowIdx->getType()->getScalarSizeInBits(),
ColumnIdx->getType()->getScalarSizeInBits());
@@ -241,9 +255,7 @@ public:
RowIdx = B.CreateZExt(RowIdx, IntTy);
ColumnIdx = B.CreateZExt(ColumnIdx, IntTy);
Value *NumRowsV = B.getIntN(MaxWidth, NumRows);
- return B.CreateExtractElement(
- Matrix, B.CreateAdd(B.CreateMul(ColumnIdx, NumRowsV), RowIdx),
- "matext");
+ return B.CreateAdd(B.CreateMul(ColumnIdx, NumRowsV), RowIdx);
}
};
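
With the extract folded out of the builder, an element read is now composed from an index computation plus a plain extractelement, and callers can emit the bounds assumption themselves. A sketch of the intended call pattern, assuming the template form of MatrixBuilder at this revision and illustrative value names:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MatrixBuilder.h"

using namespace llvm;

// Sketch: read element (Row, Col) of a Rows x Cols matrix embedded in a
// flat vector. CreateIndex builds Col * Rows + Row (column-major layout);
// CreateIndexAssumption records that the flattened index stays in bounds.
static Value *loadMatrixElement(IRBuilder<> &B, Value *MatrixVec, Value *Row,
                                Value *Col, unsigned Rows, unsigned Cols) {
  MatrixBuilder<IRBuilder<>> MB(B);
  Value *Idx = MB.CreateIndex(Row, Col, Rows);
  MB.CreateIndexAssumption(Idx, Rows * Cols);
  return B.CreateExtractElement(MatrixVec, Idx, "matext");
}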
diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h
index c5840564454e..26d70b4db2d5 100644
--- a/llvm/include/llvm/IR/Metadata.h
+++ b/llvm/include/llvm/IR/Metadata.h
@@ -707,6 +707,15 @@ struct AAMDNodes {
Result.NoAlias = NoAlias;
return Result;
}
+
+ /// Given two sets of AAMDNodes applying to potentially different locations,
+ /// determine the best AAMDNodes that apply to both.
+ AAMDNodes merge(const AAMDNodes &Other) const;
+
+ /// Determine the best AAMDNodes after concatenating two different locations
+ /// together. Different from `merge`, where different locations should
+ /// overlap each other, `concat` puts non-overlapping locations together.
+ AAMDNodes concat(const AAMDNodes &Other) const;
};
// Specialize DenseMapInfo for AAMDNodes.
@@ -897,6 +906,7 @@ struct TempMDNodeDeleter {
class MDNode : public Metadata {
friend class ReplaceableMetadataImpl;
friend class LLVMContextImpl;
+ friend class DIArgList;
unsigned NumOperands;
unsigned NumUnresolved;
@@ -1028,6 +1038,31 @@ public:
return cast<T>(N.release()->replaceWithDistinctImpl());
}
+ /// Print in tree shape.
+ ///
+ /// Prints definition of \c this in tree shape.
+ ///
+ /// If \c M is provided, metadata nodes will be numbered canonically;
+ /// otherwise, pointer addresses are substituted.
+ /// @{
+ void printTree(raw_ostream &OS, const Module *M = nullptr) const;
+ void printTree(raw_ostream &OS, ModuleSlotTracker &MST,
+ const Module *M = nullptr) const;
+ /// @}
+
+ /// User-friendly dump in tree shape.
+ ///
+ /// If \c M is provided, metadata nodes will be numbered canonically;
+ /// otherwise, pointer addresses are substituted.
+ ///
+ /// Note: this uses an explicit overload instead of default arguments so that
+ /// the nullptr version is easy to call from a debugger.
+ ///
+ /// @{
+ void dumpTree() const;
+ void dumpTree(const Module *M) const;
+ /// @}
+
private:
MDNode *replaceWithPermanentImpl();
MDNode *replaceWithUniquedImpl();
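
A small sketch of how the new tree printer might be used while debugging a pass; the metadata kind chosen here is just an example:

#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Sketch: print an instruction's !alias.scope metadata as a tree, with
// nodes numbered canonically because the module is passed in.
static void debugPrintAliasScope(const Instruction &I) {
  if (const MDNode *Scope = I.getMetadata(LLVMContext::MD_alias_scope))
    Scope->printTree(errs(), I.getModule());
}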
diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h
index 81e29d9b86e8..bd3a196c7181 100644
--- a/llvm/include/llvm/IR/Module.h
+++ b/llvm/include/llvm/IR/Module.h
@@ -64,9 +64,9 @@ class VersionTuple;
/// constant references to global variables in the module. When a global
/// variable is destroyed, it should have no entries in the GlobalValueRefMap.
/// The main container class for the LLVM Intermediate Representation.
-class Module {
-/// @name Types And Enumerations
-/// @{
+class LLVM_EXTERNAL_VISIBILITY Module {
+ /// @name Types And Enumerations
+ /// @{
public:
/// The type for the list of global variables.
using GlobalListType = SymbolTableList<GlobalVariable>;
@@ -324,6 +324,9 @@ public:
/// name is not found.
GlobalValue *getNamedValue(StringRef Name) const;
+ /// Return the number of global values in the module.
+ unsigned getNumNamedValues() const;
+
/// Return a unique non-zero ID for the specified metadata kind. This ID is
/// uniqued across modules in the current LLVMContext.
unsigned getMDKindID(StringRef Name) const;
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index 4b84f6b0408d..e00b78d45c63 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -572,6 +572,50 @@ public:
unsigned NoInline : 1;
// Indicate if function should be always inlined.
unsigned AlwaysInline : 1;
+ // Indicate if function never raises an exception. Can be modified during
+ // thinlink function attribute propagation
+ unsigned NoUnwind : 1;
+ // Indicate if function contains instructions that mayThrow
+ unsigned MayThrow : 1;
+
+ // If there are calls to unknown targets (e.g. indirect)
+ unsigned HasUnknownCall : 1;
+
+ FFlags &operator&=(const FFlags &RHS) {
+ this->ReadNone &= RHS.ReadNone;
+ this->ReadOnly &= RHS.ReadOnly;
+ this->NoRecurse &= RHS.NoRecurse;
+ this->ReturnDoesNotAlias &= RHS.ReturnDoesNotAlias;
+ this->NoInline &= RHS.NoInline;
+ this->AlwaysInline &= RHS.AlwaysInline;
+ this->NoUnwind &= RHS.NoUnwind;
+ this->MayThrow &= RHS.MayThrow;
+ this->HasUnknownCall &= RHS.HasUnknownCall;
+ return *this;
+ }
+
+ bool anyFlagSet() {
+ return this->ReadNone | this->ReadOnly | this->NoRecurse |
+ this->ReturnDoesNotAlias | this->NoInline | this->AlwaysInline |
+ this->NoUnwind | this->MayThrow | this->HasUnknownCall;
+ }
+
+ operator std::string() {
+ std::string Output;
+ raw_string_ostream OS(Output);
+ OS << "funcFlags: (";
+ OS << "readNone: " << this->ReadNone;
+ OS << ", readOnly: " << this->ReadOnly;
+ OS << ", noRecurse: " << this->NoRecurse;
+ OS << ", returnDoesNotAlias: " << this->ReturnDoesNotAlias;
+ OS << ", noInline: " << this->NoInline;
+ OS << ", alwaysInline: " << this->AlwaysInline;
+ OS << ", noUnwind: " << this->NoUnwind;
+ OS << ", mayThrow: " << this->MayThrow;
+ OS << ", hasUnknownCall: " << this->HasUnknownCall;
+ OS << ")";
+ return OS.str();
+ }
};
/// Describes the uses of a parameter by the function.
@@ -688,6 +732,10 @@ public:
/// Get function summary flags.
FFlags fflags() const { return FunFlags; }
+ void setNoRecurse() { FunFlags.NoRecurse = true; }
+
+ void setNoUnwind() { FunFlags.NoUnwind = true; }
+
/// Get the instruction count recorded for this function.
unsigned instCount() const { return InstCount; }
@@ -700,6 +748,8 @@ public:
/// Return the list of <CalleeValueInfo, CalleeInfo> pairs.
ArrayRef<EdgeTy> calls() const { return CallGraphEdgeList; }
+ std::vector<EdgeTy> &mutableCalls() { return CallGraphEdgeList; }
+
void addCall(EdgeTy E) { CallGraphEdgeList.push_back(E); }
/// Returns the list of type identifiers used by this function in
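
The new operator&= is meant for intersecting flags across call edges during the thin-link attribute propagation mentioned in the comments above; a hedged sketch of that pattern (the helper name is illustrative):

#include "llvm/IR/ModuleSummaryIndex.h"

using namespace llvm;

// Sketch: fold a callee's flags into an accumulated set. Each flag survives
// only if it holds for both summaries; anyFlagSet() tells callers when no
// further propagation is worthwhile.
static FunctionSummary::FFlags
intersectCalleeFlags(FunctionSummary::FFlags Acc,
                     const FunctionSummary &Callee) {
  Acc &= Callee.fflags();
  return Acc;
}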
diff --git a/llvm/include/llvm/IR/Operator.h b/llvm/include/llvm/IR/Operator.h
index d0bce742cc96..b83d83f0d0ab 100644
--- a/llvm/include/llvm/IR/Operator.h
+++ b/llvm/include/llvm/IR/Operator.h
@@ -59,6 +59,10 @@ public:
static bool classof(const Value *V) {
return isa<Instruction>(V) || isa<ConstantExpr>(V);
}
+
+ /// Return true if this operator has flags which may cause this operator
+ /// to evaluate to poison despite having non-poison inputs.
+ bool hasPoisonGeneratingFlags() const;
};
/// Utility class for integer operators which may exhibit overflow - Add, Sub,
@@ -243,6 +247,9 @@ public:
void operator|=(const FastMathFlags &OtherFlags) {
Flags |= OtherFlags.Flags;
}
+ bool operator!=(const FastMathFlags &OtherFlags) const {
+ return Flags != OtherFlags.Flags;
+ }
};
/// Utility class for floating point operations which can have
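
A typical consumer of the new query is speculation or hoisting code that must strip flags such as nsw/nuw/exact before moving an instruction; a sketch, with Instruction::dropPoisonGeneratingFlags assumed as the companion API rather than part of this patch:

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Operator.h"

using namespace llvm;

// Sketch: before hoisting I past a guard, drop any flags that could turn
// non-poison inputs into a poison result.
static void prepareForHoist(Instruction &I) {
  if (cast<Operator>(&I)->hasPoisonGeneratingFlags())
    I.dropPoisonGeneratingFlags();
}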
diff --git a/llvm/include/llvm/IR/OptBisect.h b/llvm/include/llvm/IR/OptBisect.h
index 6c2a1b01d897..63fd98073b51 100644
--- a/llvm/include/llvm/IR/OptBisect.h
+++ b/llvm/include/llvm/IR/OptBisect.h
@@ -16,6 +16,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ManagedStatic.h"
+#include <limits>
namespace llvm {
@@ -43,14 +44,12 @@ public:
/// optimization-related problems.
class OptBisect : public OptPassGate {
public:
- /// Default constructor, initializes the OptBisect state based on the
- /// -opt-bisect-limit command line argument.
- ///
- /// By default, bisection is disabled.
- ///
+ /// Default constructor. Initializes the state to "disabled". The bisection
+ /// will be enabled by the cl::opt call-back when the command line option
+ /// is processed.
/// Clients should not instantiate this class directly. All access should go
/// through LLVMContext.
- OptBisect();
+ OptBisect() = default;
virtual ~OptBisect() = default;
@@ -60,7 +59,14 @@ public:
bool shouldRunPass(const Pass *P, StringRef IRDescription) override;
/// isEnabled() should return true before calling shouldRunPass().
- bool isEnabled() const override { return BisectEnabled; }
+ bool isEnabled() const override { return BisectLimit != Disabled; }
+
+ /// Set the new optimization limit and reset the counter. Passing
+ /// OptBisect::Disabled disables the limiting.
+ void setLimit(int Limit) {
+ BisectLimit = Limit;
+ LastBisectNum = 0;
+ }
/// Checks the bisect limit to determine if the specified pass should run.
///
@@ -75,9 +81,11 @@ public:
/// instance, function passes should call FunctionPass::skipFunction().
bool checkPass(const StringRef PassName, const StringRef TargetDesc);
+ static const int Disabled = std::numeric_limits<int>::max();
+
private:
- bool BisectEnabled = false;
- unsigned LastBisectNum = 0;
+ int BisectLimit = Disabled;
+ int LastBisectNum = 0;
};
/// Singleton instance of the OptBisect class, so multiple pass managers don't
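
Since the limit is now settable at runtime, a tool can install its own gate on a context instead of relying solely on the -opt-bisect-limit option; a minimal sketch combining setLimit with the setOptPassGate hook shown earlier in this diff:

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/OptBisect.h"

using namespace llvm;

// Sketch: cap optional passes at N executions for IR built in Ctx.
// Passing OptBisect::Disabled through setLimit turns the gating back off.
static void limitOptionalPasses(LLVMContext &Ctx, OptBisect &Gate, int N) {
  Gate.setLimit(N);
  Ctx.setOptPassGate(Gate);
}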
diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h
index 8e592bfb0c78..e88d2233daba 100644
--- a/llvm/include/llvm/IR/PassManager.h
+++ b/llvm/include/llvm/IR/PassManager.h
@@ -377,10 +377,16 @@ template <typename DerivedT> struct PassInfoMixin {
static_assert(std::is_base_of<PassInfoMixin, DerivedT>::value,
"Must pass the derived type as the template argument!");
StringRef Name = getTypeName<DerivedT>();
- if (Name.startswith("llvm::"))
- Name = Name.drop_front(strlen("llvm::"));
+ Name.consume_front("llvm::");
return Name;
}
+
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ StringRef ClassName = DerivedT::name();
+ auto PassName = MapClassName2PassName(ClassName);
+ OS << PassName;
+ }
};
/// A CRTP mix-in that provides informational APIs needed for analysis passes.
@@ -480,6 +486,16 @@ public:
return *this;
}
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ for (unsigned Idx = 0, Size = Passes.size(); Idx != Size; ++Idx) {
+ auto *P = Passes[Idx].get();
+ P->printPipeline(OS, MapClassName2PassName);
+ if (Idx + 1 < Size)
+ OS << ",";
+ }
+ }
+
/// Run all of the passes in this manager over the given unit of IR.
/// ExtraArgs are passed to each pass.
PreservedAnalyses run(IRUnitT &IR, AnalysisManagerT &AM,
@@ -520,12 +536,6 @@ public:
// Finally, intersect the preserved analyses to compute the aggregate
// preserved set for this pass manager.
PA.intersect(std::move(PassPA));
-
- // FIXME: Historically, the pass managers all called the LLVM context's
- // yield function here. We don't have a generic way to acquire the
- // context and it isn't yet clear what the right pattern is for yielding
- // in the new pass manager so it is currently omitted.
- //IR.getContext().yield();
}
// Invalidation was handled after each pass in the above loop for the
@@ -538,13 +548,16 @@ public:
}
template <typename PassT>
- std::enable_if_t<!std::is_same<PassT, PassManager>::value>
- addPass(PassT &&Pass) {
+ LLVM_ATTRIBUTE_MINSIZE
+ std::enable_if_t<!std::is_same<PassT, PassManager>::value>
+ addPass(PassT &&Pass) {
using PassModelT =
detail::PassModel<IRUnitT, PassT, PreservedAnalyses, AnalysisManagerT,
ExtraArgTs...>;
-
- Passes.emplace_back(new PassModelT(std::forward<PassT>(Pass)));
+ // Do not use make_unique or emplace_back, they cause too many template
+ // instantiations, causing terrible compile times.
+ Passes.push_back(std::unique_ptr<PassConceptT>(
+ new PassModelT(std::forward<PassT>(Pass))));
}
/// When adding a pass manager pass that has the same type as this pass
@@ -553,10 +566,11 @@ public:
/// implementation complexity and avoid potential invalidation issues that may
/// happen with nested pass managers of the same type.
template <typename PassT>
- std::enable_if_t<std::is_same<PassT, PassManager>::value>
- addPass(PassT &&Pass) {
+ LLVM_ATTRIBUTE_MINSIZE
+ std::enable_if_t<std::is_same<PassT, PassManager>::value>
+ addPass(PassT &&Pass) {
for (auto &P : Pass.Passes)
- Passes.emplace_back(std::move(P));
+ Passes.push_back(std::move(P));
}
/// Returns if the pass manager contains any passes.
@@ -1190,29 +1204,37 @@ class ModuleToFunctionPassAdaptor
public:
using PassConceptT = detail::PassConcept<Function, FunctionAnalysisManager>;
- explicit ModuleToFunctionPassAdaptor(std::unique_ptr<PassConceptT> Pass)
- : Pass(std::move(Pass)) {}
+ explicit ModuleToFunctionPassAdaptor(std::unique_ptr<PassConceptT> Pass,
+ bool EagerlyInvalidate)
+ : Pass(std::move(Pass)), EagerlyInvalidate(EagerlyInvalidate) {}
/// Runs the function pass across every function in the module.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
static bool isRequired() { return true; }
private:
std::unique_ptr<PassConceptT> Pass;
+ bool EagerlyInvalidate;
};
/// A function to deduce a function pass type and wrap it in the
/// templated adaptor.
template <typename FunctionPassT>
ModuleToFunctionPassAdaptor
-createModuleToFunctionPassAdaptor(FunctionPassT &&Pass) {
+createModuleToFunctionPassAdaptor(FunctionPassT &&Pass,
+ bool EagerlyInvalidate = false) {
using PassModelT =
detail::PassModel<Function, FunctionPassT, PreservedAnalyses,
FunctionAnalysisManager>;
-
+ // Do not use make_unique; it causes too many template instantiations,
+ // leading to terrible compile times.
return ModuleToFunctionPassAdaptor(
- std::make_unique<PassModelT>(std::forward<FunctionPassT>(Pass)));
+ std::unique_ptr<ModuleToFunctionPassAdaptor::PassConceptT>(
+ new PassModelT(std::forward<FunctionPassT>(Pass))),
+ EagerlyInvalidate);
}
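A hedged usage sketch of the new EagerlyInvalidate parameter; the no-op function pass and the buildPipeline helper are hypothetical, and the flag's exact effect lives in the adaptor's run() implementation rather than in this header.

#include "llvm/IR/PassManager.h"

namespace {
// Hypothetical trivial function pass used only for this sketch.
struct NoopFunctionPass : llvm::PassInfoMixin<NoopFunctionPass> {
  llvm::PreservedAnalyses run(llvm::Function &, llvm::FunctionAnalysisManager &) {
    return llvm::PreservedAnalyses::all();
  }
};
} // namespace

void buildPipeline(llvm::ModulePassManager &MPM) {
  // Run the function pass over every function in the module; the new flag asks
  // the adaptor to invalidate function analyses eagerly after each function.
  MPM.addPass(llvm::createModuleToFunctionPassAdaptor(
      NoopFunctionPass(), /*EagerlyInvalidate=*/true));
}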
/// A utility pass template to force an analysis result to be available.
@@ -1243,6 +1265,12 @@ struct RequireAnalysisPass
return PreservedAnalyses::all();
}
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ auto ClassName = AnalysisT::name();
+ auto PassName = MapClassName2PassName(ClassName);
+ OS << "require<" << PassName << ">";
+ }
static bool isRequired() { return true; }
};
@@ -1263,6 +1291,12 @@ struct InvalidateAnalysisPass
PA.abandon<AnalysisT>();
return PA;
}
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ auto ClassName = AnalysisT::name();
+ auto PassName = MapClassName2PassName(ClassName);
+ OS << "invalidate<" << PassName << ">";
+ }
};
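Illustrative note (not from the patch): these overrides emit the same require<...>/invalidate<...> syntax that the textual pipeline parser accepts; the analysis name below is only an example.

// Hedged sketch of the emitted text, assuming an analysis whose mapped
// pipeline name is "globals-aa":
//   RequireAnalysisPass<...>    prints  require<globals-aa>
//   InvalidateAnalysisPass<...> prints  invalidate<globals-aa>
// e.g. usable as: opt -passes='require<globals-aa>,invalidate<globals-aa>'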
/// A utility pass that does nothing, but preserves no analyses.
@@ -1312,6 +1346,13 @@ public:
return PA;
}
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ OS << "repeat<" << Count << ">(";
+ P.printPipeline(OS, MapClassName2PassName);
+ OS << ")";
+ }
+
private:
int Count;
PassT P;
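Illustrative note (not from the patch): the repeating wrapper composes with whatever it wraps; the pass name and count below are only examples.

// Hedged sketch: a repeated pass with Count == 2 wrapping a pass whose mapped
// pipeline name is "instcombine" prints itself as
//   repeat<2>(instcombine)
// i.e. it emits "repeat<Count>(", delegates to the wrapped pass's
// printPipeline, and closes with ")".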
diff --git a/llvm/include/llvm/IR/PassManagerInternal.h b/llvm/include/llvm/IR/PassManagerInternal.h
index 8f42e69f3063..29b55a8172e6 100644
--- a/llvm/include/llvm/IR/PassManagerInternal.h
+++ b/llvm/include/llvm/IR/PassManagerInternal.h
@@ -46,6 +46,9 @@ struct PassConcept {
virtual PreservedAnalyses run(IRUnitT &IR, AnalysisManagerT &AM,
ExtraArgTs... ExtraArgs) = 0;
+ virtual void
+ printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName) = 0;
/// Polymorphic method to access the name of a pass.
virtual StringRef name() const = 0;
@@ -85,6 +88,12 @@ struct PassModel : PassConcept<IRUnitT, AnalysisManagerT, ExtraArgTs...> {
return Pass.run(IR, AM, ExtraArgs...);
}
+ void printPipeline(
+ raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName) override {
+ Pass.printPipeline(OS, MapClassName2PassName);
+ }
+
StringRef name() const override { return PassT::name(); }
template <typename T>
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index cbd429f84ee4..b858733530e3 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -438,7 +438,7 @@ inline cst_pred_ty<is_any_apint> m_AnyIntegralConstant() {
}
struct is_all_ones {
- bool isValue(const APInt &C) { return C.isAllOnesValue(); }
+ bool isValue(const APInt &C) { return C.isAllOnes(); }
};
/// Match an integer or vector with all bits set.
/// For vectors, this includes constants with undefined elements.
@@ -506,7 +506,7 @@ inline cst_pred_ty<is_nonpositive> m_NonPositive() {
inline api_pred_ty<is_nonpositive> m_NonPositive(const APInt *&V) { return V; }
struct is_one {
- bool isValue(const APInt &C) { return C.isOneValue(); }
+ bool isValue(const APInt &C) { return C.isOne(); }
};
/// Match an integer 1 or a vector with all elements equal to 1.
/// For vectors, this includes constants with undefined elements.
@@ -515,7 +515,7 @@ inline cst_pred_ty<is_one> m_One() {
}
struct is_zero_int {
- bool isValue(const APInt &C) { return C.isNullValue(); }
+ bool isValue(const APInt &C) { return C.isZero(); }
};
/// Match an integer 0 or a vector with all elements equal to 0.
/// For vectors, this includes constants with undefined elements.
@@ -549,7 +549,7 @@ inline api_pred_ty<is_power2> m_Power2(const APInt *&V) {
}
struct is_negated_power2 {
- bool isValue(const APInt &C) { return (-C).isPowerOf2(); }
+ bool isValue(const APInt &C) { return C.isNegatedPowerOf2(); }
};
/// Match an integer or vector negated power-of-2.
/// For vectors, this includes constants with undefined elements.
@@ -593,32 +593,7 @@ inline cst_pred_ty<is_lowbit_mask> m_LowBitMask() {
struct icmp_pred_with_threshold {
ICmpInst::Predicate Pred;
const APInt *Thr;
- bool isValue(const APInt &C) {
- switch (Pred) {
- case ICmpInst::Predicate::ICMP_EQ:
- return C.eq(*Thr);
- case ICmpInst::Predicate::ICMP_NE:
- return C.ne(*Thr);
- case ICmpInst::Predicate::ICMP_UGT:
- return C.ugt(*Thr);
- case ICmpInst::Predicate::ICMP_UGE:
- return C.uge(*Thr);
- case ICmpInst::Predicate::ICMP_ULT:
- return C.ult(*Thr);
- case ICmpInst::Predicate::ICMP_ULE:
- return C.ule(*Thr);
- case ICmpInst::Predicate::ICMP_SGT:
- return C.sgt(*Thr);
- case ICmpInst::Predicate::ICMP_SGE:
- return C.sge(*Thr);
- case ICmpInst::Predicate::ICMP_SLT:
- return C.slt(*Thr);
- case ICmpInst::Predicate::ICMP_SLE:
- return C.sle(*Thr);
- default:
- llvm_unreachable("Unhandled ICmp predicate");
- }
- }
+ bool isValue(const APInt &C) { return ICmpInst::compare(C, *Thr, Pred); }
};
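The hand-written predicate switch above is replaced by a single call to ICmpInst::compare; a minimal hedged sketch of that helper, using made-up constants:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/Instructions.h"

// Hedged sketch: ICmpInst::compare evaluates an integer predicate directly on
// two APInts, which is what the matcher now defers to.
bool sketchCompare() {
  llvm::APInt Five(32, 5), Three(32, 3);
  return llvm::ICmpInst::compare(Five, Three, llvm::ICmpInst::ICMP_SGT); // true
}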
/// Match an integer or vector with every element comparing 'pred' (eq/ne/...)
/// to Threshold. For vectors, this includes constants with undefined elements.
@@ -988,20 +963,22 @@ struct BinaryOp_match {
// The LHS is always matched first.
BinaryOp_match(const LHS_t &LHS, const RHS_t &RHS) : L(LHS), R(RHS) {}
- template <typename OpTy> bool match(OpTy *V) {
- if (V->getValueID() == Value::InstructionVal + Opcode) {
+ template <typename OpTy> inline bool match(unsigned Opc, OpTy *V) {
+ if (V->getValueID() == Value::InstructionVal + Opc) {
auto *I = cast<BinaryOperator>(V);
return (L.match(I->getOperand(0)) && R.match(I->getOperand(1))) ||
(Commutable && L.match(I->getOperand(1)) &&
R.match(I->getOperand(0)));
}
if (auto *CE = dyn_cast<ConstantExpr>(V))
- return CE->getOpcode() == Opcode &&
+ return CE->getOpcode() == Opc &&
((L.match(CE->getOperand(0)) && R.match(CE->getOperand(1))) ||
(Commutable && L.match(CE->getOperand(1)) &&
R.match(CE->getOperand(0))));
return false;
}
+
+ template <typename OpTy> bool match(OpTy *V) { return match(Opcode, V); }
};
template <typename LHS, typename RHS>
@@ -1246,6 +1223,26 @@ m_NUWShl(const LHS &L, const RHS &R) {
L, R);
}
+template <typename LHS_t, typename RHS_t, bool Commutable = false>
+struct SpecificBinaryOp_match
+ : public BinaryOp_match<LHS_t, RHS_t, 0, Commutable> {
+ unsigned Opcode;
+
+ SpecificBinaryOp_match(unsigned Opcode, const LHS_t &LHS, const RHS_t &RHS)
+ : BinaryOp_match<LHS_t, RHS_t, 0, Commutable>(LHS, RHS), Opcode(Opcode) {}
+
+ template <typename OpTy> bool match(OpTy *V) {
+ return BinaryOp_match<LHS_t, RHS_t, 0, Commutable>::match(Opcode, V);
+ }
+};
+
+/// Matches a binary operator with the specified (runtime) opcode.
+template <typename LHS, typename RHS>
+inline SpecificBinaryOp_match<LHS, RHS> m_BinOp(unsigned Opcode, const LHS &L,
+ const RHS &R) {
+ return SpecificBinaryOp_match<LHS, RHS>(Opcode, L, R);
+}
+
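A hedged usage sketch of the new opcode-parameterized matcher; the function and the bound values are placeholders:

#include "llvm/IR/PatternMatch.h"

// Hedged sketch: match a binary operator whose opcode is only known at run
// time, binding both operands.
bool matchesOpcode(llvm::Value *V, unsigned Opc) {
  using namespace llvm::PatternMatch;
  llvm::Value *A, *B;
  return match(V, m_BinOp(Opc, m_Value(A), m_Value(B)));
}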
//===----------------------------------------------------------------------===//
// Class that matches a group of binary opcodes.
//
@@ -2223,6 +2220,13 @@ m_c_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R) {
R);
}
+/// Matches a binary operator with the specified (runtime) opcode, with LHS
+/// and RHS in either order.
+template <typename LHS, typename RHS>
+inline SpecificBinaryOp_match<LHS, RHS, true>
+m_c_BinOp(unsigned Opcode, const LHS &L, const RHS &R) {
+ return SpecificBinaryOp_match<LHS, RHS, true>(Opcode, L, R);
+}
+
/// Matches a Add with LHS and RHS in either order.
template <typename LHS, typename RHS>
inline BinaryOp_match<LHS, RHS, Instruction::Add, true> m_c_Add(const LHS &L,
@@ -2456,7 +2460,7 @@ inline VScaleVal_match m_VScale(const DataLayout &DL) {
return VScaleVal_match(DL);
}
-template <typename LHS, typename RHS, unsigned Opcode>
+template <typename LHS, typename RHS, unsigned Opcode, bool Commutable = false>
struct LogicalOp_match {
LHS L;
RHS R;
@@ -2464,27 +2468,32 @@ struct LogicalOp_match {
LogicalOp_match(const LHS &L, const RHS &R) : L(L), R(R) {}
template <typename T> bool match(T *V) {
- if (auto *I = dyn_cast<Instruction>(V)) {
- if (!I->getType()->isIntOrIntVectorTy(1))
- return false;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I || !I->getType()->isIntOrIntVectorTy(1))
+ return false;
- if (I->getOpcode() == Opcode && L.match(I->getOperand(0)) &&
- R.match(I->getOperand(1)))
- return true;
+ if (I->getOpcode() == Opcode) {
+ auto *Op0 = I->getOperand(0);
+ auto *Op1 = I->getOperand(1);
+ return (L.match(Op0) && R.match(Op1)) ||
+ (Commutable && L.match(Op1) && R.match(Op0));
+ }
- if (auto *SI = dyn_cast<SelectInst>(I)) {
- if (Opcode == Instruction::And) {
- if (const auto *C = dyn_cast<Constant>(SI->getFalseValue()))
- if (C->isNullValue() && L.match(SI->getCondition()) &&
- R.match(SI->getTrueValue()))
- return true;
- } else {
- assert(Opcode == Instruction::Or);
- if (const auto *C = dyn_cast<Constant>(SI->getTrueValue()))
- if (C->isOneValue() && L.match(SI->getCondition()) &&
- R.match(SI->getFalseValue()))
- return true;
- }
+ if (auto *Select = dyn_cast<SelectInst>(I)) {
+ auto *Cond = Select->getCondition();
+ auto *TVal = Select->getTrueValue();
+ auto *FVal = Select->getFalseValue();
+ if (Opcode == Instruction::And) {
+ auto *C = dyn_cast<Constant>(FVal);
+ if (C && C->isNullValue())
+ return (L.match(Cond) && R.match(TVal)) ||
+ (Commutable && L.match(TVal) && R.match(Cond));
+ } else {
+ assert(Opcode == Instruction::Or);
+ auto *C = dyn_cast<Constant>(TVal);
+ if (C && C->isOneValue())
+ return (L.match(Cond) && R.match(FVal)) ||
+ (Commutable && L.match(FVal) && R.match(Cond));
}
}
@@ -2503,6 +2512,13 @@ m_LogicalAnd(const LHS &L, const RHS &R) {
/// Matches L && R where L and R are arbitrary values.
inline auto m_LogicalAnd() { return m_LogicalAnd(m_Value(), m_Value()); }
+/// Matches L && R with LHS and RHS in either order.
+template <typename LHS, typename RHS>
+inline LogicalOp_match<LHS, RHS, Instruction::And, true>
+m_c_LogicalAnd(const LHS &L, const RHS &R) {
+ return LogicalOp_match<LHS, RHS, Instruction::And, true>(L, R);
+}
+
/// Matches L || R either in the form of L | R or L ? true : R.
/// Note that the latter form is poison-blocking.
template <typename LHS, typename RHS>
@@ -2512,8 +2528,13 @@ m_LogicalOr(const LHS &L, const RHS &R) {
}
/// Matches L || R where L and R are arbitrary values.
-inline auto m_LogicalOr() {
- return m_LogicalOr(m_Value(), m_Value());
+inline auto m_LogicalOr() { return m_LogicalOr(m_Value(), m_Value()); }
+
+/// Matches L || R with LHS and RHS in either order.
+template <typename LHS, typename RHS>
+inline LogicalOp_match<LHS, RHS, Instruction::Or, true>
+m_c_LogicalOr(const LHS &L, const RHS &R) {
+ return LogicalOp_match<LHS, RHS, Instruction::Or, true>(L, R);
}
} // end namespace PatternMatch
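A hedged sketch of the new commutative logical matchers; the helper and its arguments are placeholders:

#include "llvm/IR/PatternMatch.h"

// Hedged sketch: the commutative form accepts the operands in either order,
// for both the plain i1 'and' and the poison-blocking select form.
bool usesAsLogicalAndOperand(llvm::Value *V, llvm::Value *X) {
  using namespace llvm::PatternMatch;
  // Matches 'and i1 X, Y', 'and i1 Y, X', 'select i1 X, i1 Y, i1 false', or
  // 'select i1 Y, i1 X, i1 false' for any Y.
  return match(V, m_c_LogicalAnd(m_Specific(X), m_Value()));
}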
diff --git a/llvm/include/llvm/IR/ProfileSummary.h b/llvm/include/llvm/IR/ProfileSummary.h
index 889568e7946b..4bb6bb8d4a40 100644
--- a/llvm/include/llvm/IR/ProfileSummary.h
+++ b/llvm/include/llvm/IR/ProfileSummary.h
@@ -31,9 +31,9 @@ class raw_ostream;
// number of counts needed to reach this target and the minimum among these
// counts.
struct ProfileSummaryEntry {
- uint32_t Cutoff; ///< The required percentile of counts.
- uint64_t MinCount; ///< The minimum count for this percentile.
- uint64_t NumCounts; ///< Number of counts >= the minimum count.
+ const uint32_t Cutoff; ///< The required percentile of counts.
+ const uint64_t MinCount; ///< The minimum count for this percentile.
+ const uint64_t NumCounts; ///< Number of counts >= the minimum count.
ProfileSummaryEntry(uint32_t TheCutoff, uint64_t TheMinCount,
uint64_t TheNumCounts)
@@ -48,9 +48,9 @@ public:
private:
const Kind PSK;
- SummaryEntryVector DetailedSummary;
- uint64_t TotalCount, MaxCount, MaxInternalCount, MaxFunctionCount;
- uint32_t NumCounts, NumFunctions;
+ const SummaryEntryVector DetailedSummary;
+ const uint64_t TotalCount, MaxCount, MaxInternalCount, MaxFunctionCount;
+ const uint32_t NumCounts, NumFunctions;
/// If 'Partial' is false, it means the profile being used to optimize
/// a target is collected from the same target.
/// If 'Partial' is true, it means the profile is for common/shared
@@ -61,14 +61,14 @@ private:
/// of the program being built to the number of profile counters in the
/// partial sample profile. When 'Partial' is false, it is undefined. This is
/// currently only available under thin LTO mode.
- double PartialProfileRatio = 0;
+ double PartialProfileRatio = 0.0;
/// Return detailed summary as metadata.
Metadata *getDetailedSummaryMD(LLVMContext &Context);
public:
static const int Scale = 1000000;
- ProfileSummary(Kind K, SummaryEntryVector DetailedSummary,
+ ProfileSummary(Kind K, const SummaryEntryVector &DetailedSummary,
uint64_t TotalCount, uint64_t MaxCount,
uint64_t MaxInternalCount, uint64_t MaxFunctionCount,
uint32_t NumCounts, uint32_t NumFunctions,
@@ -85,22 +85,22 @@ public:
bool AddPartialProfileRatioField = true);
/// Construct profile summary from metadata.
static ProfileSummary *getFromMD(Metadata *MD);
- SummaryEntryVector &getDetailedSummary() { return DetailedSummary; }
- uint32_t getNumFunctions() { return NumFunctions; }
- uint64_t getMaxFunctionCount() { return MaxFunctionCount; }
- uint32_t getNumCounts() { return NumCounts; }
- uint64_t getTotalCount() { return TotalCount; }
- uint64_t getMaxCount() { return MaxCount; }
- uint64_t getMaxInternalCount() { return MaxInternalCount; }
+ const SummaryEntryVector &getDetailedSummary() { return DetailedSummary; }
+ uint32_t getNumFunctions() const { return NumFunctions; }
+ uint64_t getMaxFunctionCount() const { return MaxFunctionCount; }
+ uint32_t getNumCounts() const { return NumCounts; }
+ uint64_t getTotalCount() const { return TotalCount; }
+ uint64_t getMaxCount() const { return MaxCount; }
+ uint64_t getMaxInternalCount() const { return MaxInternalCount; }
void setPartialProfile(bool PP) { Partial = PP; }
- bool isPartialProfile() { return Partial; }
- double getPartialProfileRatio() { return PartialProfileRatio; }
+ bool isPartialProfile() const { return Partial; }
+ double getPartialProfileRatio() const { return PartialProfileRatio; }
void setPartialProfileRatio(double R) {
assert(isPartialProfile() && "Unexpected when not partial profile");
PartialProfileRatio = R;
}
- void printSummary(raw_ostream &OS);
- void printDetailedSummary(raw_ostream &OS);
+ void printSummary(raw_ostream &OS) const;
+ void printDetailedSummary(raw_ostream &OS) const;
};
} // end namespace llvm
diff --git a/llvm/include/llvm/IR/PseudoProbe.h b/llvm/include/llvm/IR/PseudoProbe.h
index 53100f049910..51ba7e675efe 100644
--- a/llvm/include/llvm/IR/PseudoProbe.h
+++ b/llvm/include/llvm/IR/PseudoProbe.h
@@ -27,10 +27,6 @@ constexpr const char *PseudoProbeDescMetadataName = "llvm.pseudo_probe_desc";
enum class PseudoProbeType { Block = 0, IndirectCall, DirectCall };
-enum class PseudoProbeAttributes {
- Reserved = 0x1, // Reserved for future use.
-};
-
// The saturated distribution factor representing 100% for block probes.
constexpr static uint64_t PseudoProbeFullDistributionFactor =
std::numeric_limits<uint64_t>::max();
diff --git a/llvm/include/llvm/IR/ReplaceConstant.h b/llvm/include/llvm/IR/ReplaceConstant.h
index 4d95143a4bd2..5ad1d0a6f920 100644
--- a/llvm/include/llvm/IR/ReplaceConstant.h
+++ b/llvm/include/llvm/IR/ReplaceConstant.h
@@ -21,10 +21,6 @@
namespace llvm {
-/// Create a replacement instruction for constant expression \p CE and insert
-/// it before \p Instr.
-Instruction *createReplacementInstr(ConstantExpr *CE, Instruction *Instr);
-
/// The given instruction \p I contains the given constant expression \p CE as one
/// of its operands, possibly nested within constant expression trees. Convert
/// all reachable paths from constant expression operands of \p I to \p CE into
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def
index c73172612b1e..62d67308114f 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.def
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.def
@@ -287,6 +287,7 @@ HANDLE_LIBCALL(FPEXT_F80_F128, "__extendxftf2")
HANDLE_LIBCALL(FPEXT_F64_F128, "__extenddftf2")
HANDLE_LIBCALL(FPEXT_F32_F128, "__extendsftf2")
HANDLE_LIBCALL(FPEXT_F16_F128, "__extendhftf2")
+HANDLE_LIBCALL(FPEXT_F16_F80, "__extendhfxf2")
HANDLE_LIBCALL(FPEXT_F32_F64, "__extendsfdf2")
HANDLE_LIBCALL(FPEXT_F16_F64, "__extendhfdf2")
HANDLE_LIBCALL(FPEXT_F16_F32, "__gnu_h2f_ieee")
@@ -375,6 +376,8 @@ HANDLE_LIBCALL(UINTTOFP_I128_F64, "__floatuntidf")
HANDLE_LIBCALL(UINTTOFP_I128_F80, "__floatuntixf")
HANDLE_LIBCALL(UINTTOFP_I128_F128, "__floatuntitf")
HANDLE_LIBCALL(UINTTOFP_I128_PPCF128, "__floatuntitf")
+HANDLE_LIBCALL(CONVERT_F128_PPCF128, "__extendkftf2")
+HANDLE_LIBCALL(CONVERT_PPCF128_F128, "__trunctfkf2")
// Comparison
HANDLE_LIBCALL(OEQ_F32, "__eqsf2")
@@ -431,6 +434,7 @@ HANDLE_LIBCALL(MEMSET_ELEMENT_UNORDERED_ATOMIC_16, "__llvm_memset_element_unorde
// Exception handling
HANDLE_LIBCALL(UNWIND_RESUME, "_Unwind_Resume")
+HANDLE_LIBCALL(CXA_END_CLEANUP, "__cxa_end_cleanup")
// Note: there are two sets of atomics libcalls; see
// <https://llvm.org/docs/Atomics.html> for more info on the
diff --git a/llvm/include/llvm/IR/Type.h b/llvm/include/llvm/IR/Type.h
index 430bc34a47e7..47431adc6fac 100644
--- a/llvm/include/llvm/IR/Type.h
+++ b/llvm/include/llvm/IR/Type.h
@@ -14,7 +14,6 @@
#ifndef LLVM_IR_TYPE_H
#define LLVM_IR_TYPE_H
-#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/CBindingWrapping.h"
@@ -29,6 +28,7 @@
namespace llvm {
class IntegerType;
+struct fltSemantics;
class LLVMContext;
class PointerType;
class raw_ostream;
@@ -166,18 +166,7 @@ public:
getTypeID() == PPC_FP128TyID;
}
- const fltSemantics &getFltSemantics() const {
- switch (getTypeID()) {
- case HalfTyID: return APFloat::IEEEhalf();
- case BFloatTyID: return APFloat::BFloat();
- case FloatTyID: return APFloat::IEEEsingle();
- case DoubleTyID: return APFloat::IEEEdouble();
- case X86_FP80TyID: return APFloat::x87DoubleExtended();
- case FP128TyID: return APFloat::IEEEquad();
- case PPC_FP128TyID: return APFloat::PPCDoubleDouble();
- default: llvm_unreachable("Invalid floating type");
- }
- }
+ const fltSemantics &getFltSemantics() const;
/// Return true if this is X86 MMX.
bool isX86_MMXTy() const { return getTypeID() == X86_MMXTyID; }
@@ -312,7 +301,7 @@ public:
/// Return whether the type is IEEE compatible, as defined by the eponymous
/// method in APFloat.
- bool isIEEE() const { return APFloat::getZero(getFltSemantics()).isIEEE(); }
+ bool isIEEE() const;
/// If this is a vector type, return the element type, otherwise return
/// 'this'.
@@ -443,26 +432,7 @@ public:
}
llvm_unreachable("Unsupported type in Type::getScalarTy");
}
- static Type *getFloatingPointTy(LLVMContext &C, const fltSemantics &S) {
- Type *Ty;
- if (&S == &APFloat::IEEEhalf())
- Ty = Type::getHalfTy(C);
- else if (&S == &APFloat::BFloat())
- Ty = Type::getBFloatTy(C);
- else if (&S == &APFloat::IEEEsingle())
- Ty = Type::getFloatTy(C);
- else if (&S == &APFloat::IEEEdouble())
- Ty = Type::getDoubleTy(C);
- else if (&S == &APFloat::x87DoubleExtended())
- Ty = Type::getX86_FP80Ty(C);
- else if (&S == &APFloat::IEEEquad())
- Ty = Type::getFP128Ty(C);
- else {
- assert(&S == &APFloat::PPCDoubleDouble() && "Unknown FP format");
- Ty = Type::getPPC_FP128Ty(C);
- }
- return Ty;
- }
+ static Type *getFloatingPointTy(LLVMContext &C, const fltSemantics &S);
//===--------------------------------------------------------------------===//
// Convenience methods for getting pointer types with one of the above builtin
diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
index 92e2cd3a2783..361d6357b303 100644
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -111,6 +111,21 @@ END_REGISTER_VP_SDNODE(SDOPC)
#define HANDLE_VP_IS_MEMOP(VPID, POINTERPOS, DATAPOS)
#endif
+// Map this VP reduction intrinsic to its reduction operand positions.
+#ifndef HANDLE_VP_REDUCTION
+#define HANDLE_VP_REDUCTION(ID, STARTPOS, VECTORPOS)
+#endif
+
+// A property to infer VP binary-op SDNode opcodes automatically.
+#ifndef PROPERTY_VP_BINARYOP_SDNODE
+#define PROPERTY_VP_BINARYOP_SDNODE(ID)
+#endif
+
+// A property to infer VP reduction SDNode opcodes automatically.
+#ifndef PROPERTY_VP_REDUCTION_SDNODE
+#define PROPERTY_VP_REDUCTION_SDNODE(ID)
+#endif
+
/// } Property Macros
///// Integer Arithmetic {
@@ -122,6 +137,7 @@ END_REGISTER_VP_SDNODE(SDOPC)
#define HELPER_REGISTER_BINARY_INT_VP(INTRIN, SDOPC, OPC) \
BEGIN_REGISTER_VP(INTRIN, 2, 3, SDOPC, -1) \
HANDLE_VP_TO_OPC(OPC) \
+PROPERTY_VP_BINARYOP_SDNODE(SDOPC) \
END_REGISTER_VP(INTRIN, SDOPC)
@@ -181,6 +197,7 @@ HELPER_REGISTER_BINARY_INT_VP(vp_xor, VP_XOR, Xor)
BEGIN_REGISTER_VP(vp_##OPSUFFIX, 2, 3, SDOPC, -1) \
HANDLE_VP_TO_OPC(OPC) \
HANDLE_VP_TO_CONSTRAINEDFP(1, 1, experimental_constrained_##OPSUFFIX) \
+ PROPERTY_VP_BINARYOP_SDNODE(SDOPC) \
END_REGISTER_VP(vp_##OPSUFFIX, SDOPC)
// llvm.vp.fadd(x,y,mask,vlen)
@@ -204,33 +221,146 @@ HELPER_REGISTER_BINARY_FP_VP(frem, VP_FREM, FRem)
///// Memory Operations {
// llvm.vp.store(ptr,val,mask,vlen)
-BEGIN_REGISTER_VP(vp_store, 2, 3, VP_STORE, 0)
+BEGIN_REGISTER_VP_INTRINSIC(vp_store, 2, 3)
+// chain = VP_STORE chain,val,base,offset,mask,evl
+BEGIN_REGISTER_VP_SDNODE(VP_STORE, 0, vp_store, 4, 5)
HANDLE_VP_TO_OPC(Store)
HANDLE_VP_TO_INTRIN(masked_store)
HANDLE_VP_IS_MEMOP(vp_store, 1, 0)
END_REGISTER_VP(vp_store, VP_STORE)
// llvm.vp.scatter(ptr,val,mask,vlen)
-BEGIN_REGISTER_VP(vp_scatter, 2, 3, VP_SCATTER, 0)
+BEGIN_REGISTER_VP_INTRINSIC(vp_scatter, 2, 3)
+// chain = VP_SCATTER chain,val,base,indices,scale,mask,evl
+BEGIN_REGISTER_VP_SDNODE(VP_SCATTER, -1, vp_scatter, 5, 6)
HANDLE_VP_TO_INTRIN(masked_scatter)
HANDLE_VP_IS_MEMOP(vp_scatter, 1, 0)
END_REGISTER_VP(vp_scatter, VP_SCATTER)
// llvm.vp.load(ptr,mask,vlen)
-BEGIN_REGISTER_VP(vp_load, 1, 2, VP_LOAD, -1)
+BEGIN_REGISTER_VP_INTRINSIC(vp_load, 1, 2)
+// val,chain = VP_LOAD chain,base,offset,mask,evl
+BEGIN_REGISTER_VP_SDNODE(VP_LOAD, -1, vp_load, 3, 4)
HANDLE_VP_TO_OPC(Load)
HANDLE_VP_TO_INTRIN(masked_load)
HANDLE_VP_IS_MEMOP(vp_load, 0, None)
END_REGISTER_VP(vp_load, VP_LOAD)
// llvm.vp.gather(ptr,mask,vlen)
-BEGIN_REGISTER_VP(vp_gather, 1, 2, VP_GATHER, -1)
+BEGIN_REGISTER_VP_INTRINSIC(vp_gather, 1, 2)
+// val,chain = VP_GATHER chain,base,indices,scale,mask,evl
+BEGIN_REGISTER_VP_SDNODE(VP_GATHER, -1, vp_gather, 4, 5)
HANDLE_VP_TO_INTRIN(masked_gather)
HANDLE_VP_IS_MEMOP(vp_gather, 0, None)
END_REGISTER_VP(vp_gather, VP_GATHER)
///// } Memory Operations
+///// Reductions {
+
+// Specialized helper macro for VP reductions (%start, %x, %mask, %evl).
+#ifdef HELPER_REGISTER_REDUCTION_VP
+#error "The internal helper macro HELPER_REGISTER_REDUCTION_VP is already defined!"
+#endif
+#define HELPER_REGISTER_REDUCTION_VP(VPINTRIN, SDOPC, INTRIN) \
+BEGIN_REGISTER_VP(VPINTRIN, 2, 3, SDOPC, -1) \
+HANDLE_VP_TO_INTRIN(INTRIN) \
+HANDLE_VP_REDUCTION(VPINTRIN, 0, 1) \
+PROPERTY_VP_REDUCTION_SDNODE(SDOPC) \
+END_REGISTER_VP(VPINTRIN, SDOPC)
+
+// llvm.vp.reduce.add(start,x,mask,vlen)
+HELPER_REGISTER_REDUCTION_VP(vp_reduce_add, VP_REDUCE_ADD,
+ experimental_vector_reduce_add)
+
+// llvm.vp.reduce.mul(start,x,mask,vlen)
+HELPER_REGISTER_REDUCTION_VP(vp_reduce_mul, VP_REDUCE_MUL,
+ experimental_vector_reduce_mul)
+
+// llvm.vp.reduce.and(start,x,mask,vlen)
+HELPER_REGISTER_REDUCTION_VP(vp_reduce_and, VP_REDUCE_AND,
+ experimental_vector_reduce_and)
+
+// llvm.vp.reduce.or(start,x,mask,vlen)
+HELPER_REGISTER_REDUCTION_VP(vp_reduce_or, VP_REDUCE_OR,
+ experimental_vector_reduce_or)
+
+// llvm.vp.reduce.xor(start,x,mask,vlen)
+HELPER_REGISTER_REDUCTION_VP(vp_reduce_xor, VP_REDUCE_XOR,
+ experimental_vector_reduce_xor)
+
+// llvm.vp.reduce.smax(start,x,mask,vlen)
+HELPER_REGISTER_REDUCTION_VP(vp_reduce_smax, VP_REDUCE_SMAX,
+ experimental_vector_reduce_smax)
+
+// llvm.vp.reduce.smin(start,x,mask,vlen)
+HELPER_REGISTER_REDUCTION_VP(vp_reduce_smin, VP_REDUCE_SMIN,
+ experimental_vector_reduce_smin)
+
+// llvm.vp.reduce.umax(start,x,mask,vlen)
+HELPER_REGISTER_REDUCTION_VP(vp_reduce_umax, VP_REDUCE_UMAX,
+ experimental_vector_reduce_umax)
+
+// llvm.vp.reduce.umin(start,x,mask,vlen)
+HELPER_REGISTER_REDUCTION_VP(vp_reduce_umin, VP_REDUCE_UMIN,
+ experimental_vector_reduce_umin)
+
+// llvm.vp.reduce.fmax(start,x,mask,vlen)
+HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmax, VP_REDUCE_FMAX,
+ experimental_vector_reduce_fmax)
+
+// llvm.vp.reduce.fmin(start,x,mask,vlen)
+HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmin, VP_REDUCE_FMIN,
+ experimental_vector_reduce_fmin)
+
+#undef HELPER_REGISTER_REDUCTION_VP
+
+// Specialized helper macro for VP reductions as above but with two forms:
+// sequential and reassociative. These manifest as the presence of 'reassoc'
+// fast-math flags in the IR and as two distinct ISD opcodes in the
+// SelectionDAG.
+#ifdef HELPER_REGISTER_REDUCTION_SEQ_VP
+#error "The internal helper macro HELPER_REGISTER_REDUCTION_SEQ_VP is already defined!"
+#endif
+#define HELPER_REGISTER_REDUCTION_SEQ_VP(VPINTRIN, SDOPC, SEQ_SDOPC, INTRIN) \
+BEGIN_REGISTER_VP_INTRINSIC(VPINTRIN, 2, 3) \
+BEGIN_REGISTER_VP_SDNODE(SDOPC, -1, VPINTRIN, 2, 3) \
+END_REGISTER_VP_SDNODE(SDOPC) \
+BEGIN_REGISTER_VP_SDNODE(SEQ_SDOPC, -1, VPINTRIN, 2, 3) \
+END_REGISTER_VP_SDNODE(SEQ_SDOPC) \
+HANDLE_VP_TO_INTRIN(INTRIN) \
+HANDLE_VP_REDUCTION(VPINTRIN, 0, 1) \
+PROPERTY_VP_REDUCTION_SDNODE(SDOPC) \
+PROPERTY_VP_REDUCTION_SDNODE(SEQ_SDOPC) \
+END_REGISTER_VP_INTRINSIC(VPINTRIN)
+
+// llvm.vp.reduce.fadd(start,x,mask,vlen)
+HELPER_REGISTER_REDUCTION_SEQ_VP(vp_reduce_fadd, VP_REDUCE_FADD,
+ VP_REDUCE_SEQ_FADD,
+ experimental_vector_reduce_fadd)
+
+// llvm.vp.reduce.fmul(start,x,mask,vlen)
+HELPER_REGISTER_REDUCTION_SEQ_VP(vp_reduce_fmul, VP_REDUCE_FMUL,
+ VP_REDUCE_SEQ_FMUL,
+ experimental_vector_reduce_fmul)
+
+#undef HELPER_REGISTER_REDUCTION_SEQ_VP
+
+///// } Reduction
+
+///// Shuffles {
+
+// llvm.vp.select(mask,on_true,on_false,vlen)
+BEGIN_REGISTER_VP_INTRINSIC(vp_select, 0, 3)
+// BEGIN_REGISTER_VP_SDNODE(VP_SELECT, -1, vp_select, 0, 4)
+// END_REGISTER_CASES(vp_select, VP_SELECT)
+END_REGISTER_VP_INTRINSIC(vp_select)
+
+BEGIN_REGISTER_VP(experimental_vp_splice, 3, 5,
+ EXPERIMENTAL_VP_SPLICE, -1)
+END_REGISTER_VP(experimental_vp_splice, EXPERIMENTAL_VP_SPLICE)
+
+///// } Shuffles
#undef BEGIN_REGISTER_VP
#undef BEGIN_REGISTER_VP_INTRINSIC
@@ -242,3 +372,6 @@ END_REGISTER_VP(vp_gather, VP_GATHER)
#undef HANDLE_VP_TO_CONSTRAINEDFP
#undef HANDLE_VP_TO_INTRIN
#undef HANDLE_VP_IS_MEMOP
+#undef HANDLE_VP_REDUCTION
+#undef PROPERTY_VP_BINARYOP_SDNODE
+#undef PROPERTY_VP_REDUCTION_SDNODE
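A hedged sketch of how a consumer is expected to use the new property macro via the usual .def X-macro include; the table name is hypothetical:

#include "llvm/ADT/StringRef.h"
#include <vector>

// Hedged sketch: collect the names of every SDNode tagged as a VP binary op.
// All other macros in the .def default to no-ops, so only the
// PROPERTY_VP_BINARYOP_SDNODE expansions contribute entries.
static const std::vector<llvm::StringRef> VPBinaryOpSDNodes = {
#define PROPERTY_VP_BINARYOP_SDNODE(SDOPC) #SDOPC,
#include "llvm/IR/VPIntrinsics.def"
};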
diff --git a/llvm/include/llvm/IR/Value.h b/llvm/include/llvm/IR/Value.h
index 2ad1c9e8c300..fc2ed00d770f 100644
--- a/llvm/include/llvm/IR/Value.h
+++ b/llvm/include/llvm/IR/Value.h
@@ -37,7 +37,6 @@ class DataLayout;
class Function;
class GlobalAlias;
class GlobalIFunc;
-class GlobalIndirectSymbol;
class GlobalObject;
class GlobalValue;
class GlobalVariable;
@@ -454,14 +453,18 @@ public:
/// Return the single use of this value that cannot be dropped, if there is
/// exactly one such use.
- ///
- /// This is specialized because it is a common request and does not require
- /// traversing the whole use list.
Use *getSingleUndroppableUse();
const Use *getSingleUndroppableUse() const {
return const_cast<Value *>(this)->getSingleUndroppableUse();
}
+ /// Return the unique user of this value that cannot be dropped, if there is
+ /// exactly one such user (that user can have multiple uses of this value).
+ User *getUniqueUndroppableUser();
+ const User *getUniqueUndroppableUser() const {
+ return const_cast<Value *>(this)->getUniqueUndroppableUser();
+ }
+
/// Return true if this value has the given number of undroppable uses.
///
/// This is specialized because it is a common request and does not require
@@ -690,6 +693,9 @@ public:
/// If \p AllowNonInbounds is true, offsets in GEPs are stripped and
/// accumulated even if the GEP is not "inbounds".
///
+ /// If \p AllowInvariantGroup is true then this method also looks through
+ /// strip.invariant.group and launder.invariant.group intrinsics.
+ ///
/// If \p ExternalAnalysis is provided, it will be used to calculate an offset
/// when an operand of the GEP is not constant.
/// For example, for a value \p ExternalAnalysis might try to calculate a
@@ -705,13 +711,15 @@ public:
/// is unchanged.
const Value *stripAndAccumulateConstantOffsets(
const DataLayout &DL, APInt &Offset, bool AllowNonInbounds,
+ bool AllowInvariantGroup = false,
function_ref<bool(Value &Value, APInt &Offset)> ExternalAnalysis =
nullptr) const;
Value *stripAndAccumulateConstantOffsets(const DataLayout &DL, APInt &Offset,
- bool AllowNonInbounds) {
+ bool AllowNonInbounds,
+ bool AllowInvariantGroup = false) {
return const_cast<Value *>(
static_cast<const Value *>(this)->stripAndAccumulateConstantOffsets(
- DL, Offset, AllowNonInbounds));
+ DL, Offset, AllowNonInbounds, AllowInvariantGroup));
}
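A hedged usage sketch of the new AllowInvariantGroup flag; the helper name and its parameters are placeholders:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Value.h"

// Hedged sketch: accumulate constant GEP offsets into Offset while also
// looking through llvm.strip.invariant.group / llvm.launder.invariant.group.
const llvm::Value *stripWithInvariantGroup(const llvm::Value *Ptr,
                                           const llvm::DataLayout &DL) {
  llvm::APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
  return Ptr->stripAndAccumulateConstantOffsets(
      DL, Offset, /*AllowNonInbounds=*/true, /*AllowInvariantGroup=*/true);
}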
/// This is a wrapper around stripAndAccumulateConstantOffsets with the
@@ -781,8 +789,8 @@ public:
///
/// This is the greatest alignment value supported by load, store, and alloca
/// instructions, and global values.
- static const unsigned MaxAlignmentExponent = 29;
- static const unsigned MaximumAlignment = 1u << MaxAlignmentExponent;
+ static constexpr unsigned MaxAlignmentExponent = 32;
+ static constexpr uint64_t MaximumAlignment = 1ULL << MaxAlignmentExponent;
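With the exponent raised to 32, the maximum alignment becomes 2^32 bytes, which no longer fits in the previous 32-bit unsigned constant; a small hedged check:

#include "llvm/IR/Value.h"
#include <cstdint>

// Hedged sketch: the new limit is 4 GiB and therefore needs a 64-bit type.
static_assert(llvm::Value::MaximumAlignment == (uint64_t)1 << 32,
              "maximum alignment is 2^32 bytes");
static_assert(llvm::Value::MaximumAlignment > UINT32_MAX,
              "not representable as a 32-bit unsigned value");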
/// Mutate the type of this Value to be of the specified type.
///
@@ -1012,21 +1020,16 @@ template <> struct isa_impl<GlobalIFunc, Value> {
}
};
-template <> struct isa_impl<GlobalIndirectSymbol, Value> {
- static inline bool doit(const Value &Val) {
- return isa<GlobalAlias>(Val) || isa<GlobalIFunc>(Val);
- }
-};
-
template <> struct isa_impl<GlobalValue, Value> {
static inline bool doit(const Value &Val) {
- return isa<GlobalObject>(Val) || isa<GlobalIndirectSymbol>(Val);
+ return isa<GlobalObject>(Val) || isa<GlobalAlias>(Val);
}
};
template <> struct isa_impl<GlobalObject, Value> {
static inline bool doit(const Value &Val) {
- return isa<GlobalVariable>(Val) || isa<Function>(Val);
+ return isa<GlobalVariable>(Val) || isa<Function>(Val) ||
+ isa<GlobalIFunc>(Val);
}
};
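A hedged sketch of the visible effect of dropping GlobalIndirectSymbol from the class hierarchy; the function and its argument are placeholders:

#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalIFunc.h"

// Hedged sketch: an ifunc now classifies as a GlobalObject, while aliases
// remain the only non-object GlobalValues.
bool classifiesAsObject(const llvm::GlobalIFunc *IF) {
  return llvm::isa<llvm::GlobalObject>(IF) &&
         llvm::isa<llvm::GlobalValue>(IF) && !llvm::isa<llvm::GlobalAlias>(IF);
}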
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 365240de321a..845d7dcdebd2 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -64,6 +64,7 @@ void initializeAAEvalLegacyPassPass(PassRegistry&);
void initializeAAResultsWrapperPassPass(PassRegistry&);
void initializeADCELegacyPassPass(PassRegistry&);
void initializeAddDiscriminatorsLegacyPassPass(PassRegistry&);
+void initializeAddFSDiscriminatorsPass(PassRegistry &);
void initializeModuleAddressSanitizerLegacyPassPass(PassRegistry &);
void initializeASanGlobalsMetadataWrapperPassPass(PassRegistry &);
void initializeAddressSanitizerLegacyPassPass(PassRegistry &);
@@ -183,6 +184,7 @@ void initializeGlobalSplitPass(PassRegistry&);
void initializeGlobalsAAWrapperPassPass(PassRegistry&);
void initializeGuardWideningLegacyPassPass(PassRegistry&);
void initializeHardwareLoopsPass(PassRegistry&);
+void initializeMIRProfileLoaderPassPass(PassRegistry &);
void initializeMemProfilerLegacyPassPass(PassRegistry &);
void initializeHotColdSplittingLegacyPassPass(PassRegistry&);
void initializeHWAddressSanitizerLegacyPassPass(PassRegistry &);
@@ -234,7 +236,8 @@ void initializeLiveIntervalsPass(PassRegistry&);
void initializeLiveRangeShrinkPass(PassRegistry&);
void initializeLiveRegMatrixPass(PassRegistry&);
void initializeLiveStacksPass(PassRegistry&);
-void initializeLiveVariablesPass(PassRegistry&);
+void initializeLiveVariablesPass(PassRegistry &);
+void initializeLoadStoreOptPass(PassRegistry &);
void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry&);
void initializeLoaderPassPass(PassRegistry&);
void initializeLocalStackSlotPassPass(PassRegistry&);
diff --git a/llvm/include/llvm/InterfaceStub/IFSHandler.h b/llvm/include/llvm/InterfaceStub/IFSHandler.h
index de627492366f..6ae6a421318e 100644
--- a/llvm/include/llvm/InterfaceStub/IFSHandler.h
+++ b/llvm/include/llvm/InterfaceStub/IFSHandler.h
@@ -51,6 +51,9 @@ Error validateIFSTarget(IFSStub &Stub, bool ParseTriple);
void stripIFSTarget(IFSStub &Stub, bool StripTriple, bool StripArch,
bool StripEndianness, bool StripBitWidth);
+/// Strips undefined symbols from the IFS symbol table.
+void stripIFSUndefinedSymbols(IFSStub &Stub);
+
/// Parse llvm triple string into a IFSTarget struct.
IFSTarget parseTriple(StringRef TripleStr);
diff --git a/llvm/include/llvm/LTO/Caching.h b/llvm/include/llvm/LTO/Caching.h
deleted file mode 100644
index 43b978328b74..000000000000
--- a/llvm/include/llvm/LTO/Caching.h
+++ /dev/null
@@ -1,38 +0,0 @@
-//===- Caching.h - LLVM Link Time Optimizer Configuration -----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the localCache function, which allows clients to add a
-// filesystem cache to ThinLTO.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LTO_CACHING_H
-#define LLVM_LTO_CACHING_H
-
-#include "llvm/LTO/LTO.h"
-
-namespace llvm {
-namespace lto {
-
-/// This type defines the callback to add a pre-existing native object file
-/// (e.g. in a cache).
-///
-/// Buffer callbacks must be thread safe.
-using AddBufferFn =
- std::function<void(unsigned Task, std::unique_ptr<MemoryBuffer> MB)>;
-
-/// Create a local file system cache which uses the given cache directory and
-/// file callback. This function also creates the cache directory if it does not
-/// already exist.
-Expected<NativeObjectCache> localCache(StringRef CacheDirectoryPath,
- AddBufferFn AddBuffer);
-
-} // namespace lto
-} // namespace llvm
-
-#endif
diff --git a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h
index 5fd3c9f408f3..eb793d62907e 100644
--- a/llvm/include/llvm/LTO/Config.h
+++ b/llvm/include/llvm/LTO/Config.h
@@ -70,6 +70,9 @@ struct Config {
/// Run PGO context sensitive IR instrumentation.
bool RunCSIRInstr = false;
+ /// Turn on/off the warning about a hash mismatch in the PGO profile data.
+ bool PGOWarnMismatch = true;
+
/// Asserts whether we can assume whole program visibility during the LTO
/// link.
bool HasWholeProgramVisibility = false;
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index ea1dea2d6f42..d2b0fef1ca47 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -21,8 +21,10 @@
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/LTO/Config.h"
#include "llvm/Object/IRSymtab.h"
+#include "llvm/Support/Caching.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/thread.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
#include "llvm/Transforms/IPO/FunctionImport.h"
namespace llvm {
@@ -38,7 +40,7 @@ class ToolOutputFile;
/// Resolve linkage for prevailing symbols in the \p Index. Linkage changes
/// recorded in the index and the ThinLTO backends must apply the changes to
-/// the module via thinLTOResolvePrevailingInModule.
+/// the module via thinLTOFinalizeInModule.
///
/// This is done for correctness (if value exported, ensure we always
/// emit a copy), and compile-time optimization (allow drop of duplicates).
@@ -186,47 +188,13 @@ private:
}
};
-/// This class wraps an output stream for a native object. Most clients should
-/// just be able to return an instance of this base class from the stream
-/// callback, but if a client needs to perform some action after the stream is
-/// written to, that can be done by deriving from this class and overriding the
-/// destructor.
-class NativeObjectStream {
-public:
- NativeObjectStream(std::unique_ptr<raw_pwrite_stream> OS) : OS(std::move(OS)) {}
- std::unique_ptr<raw_pwrite_stream> OS;
- virtual ~NativeObjectStream() = default;
-};
-
-/// This type defines the callback to add a native object that is generated on
-/// the fly.
-///
-/// Stream callbacks must be thread safe.
-using AddStreamFn =
- std::function<std::unique_ptr<NativeObjectStream>(unsigned Task)>;
-
-/// This is the type of a native object cache. To request an item from the
-/// cache, pass a unique string as the Key. For hits, the cached file will be
-/// added to the link and this function will return AddStreamFn(). For misses,
-/// the cache will return a stream callback which must be called at most once to
-/// produce content for the stream. The native object stream produced by the
-/// stream callback will add the file to the link after the stream is written
-/// to.
-///
-/// Clients generally look like this:
-///
-/// if (AddStreamFn AddStream = Cache(Task, Key))
-/// ProduceContent(AddStream);
-using NativeObjectCache =
- std::function<AddStreamFn(unsigned Task, StringRef Key)>;
-
/// A ThinBackend defines what happens after the thin-link phase during ThinLTO.
/// The details of this type definition aren't important; clients can only
/// create a ThinBackend using one of the create*ThinBackend() functions below.
using ThinBackend = std::function<std::unique_ptr<ThinBackendProc>(
const Config &C, ModuleSummaryIndex &CombinedIndex,
StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
- AddStreamFn AddStream, NativeObjectCache Cache)>;
+ AddStreamFn AddStream, FileCache Cache)>;
/// This ThinBackend runs the individual backend jobs in-process.
/// The default value means to use one job per hardware core (not hyper-thread).
@@ -299,7 +267,7 @@ public:
///
/// The client will receive at most one callback (via either AddStream or
/// Cache) for each task identifier.
- Error run(AddStreamFn AddStream, NativeObjectCache Cache = nullptr);
+ Error run(AddStreamFn AddStream, FileCache Cache = nullptr);
/// Static method that returns a list of libcall symbols that can be generated
/// by LTO but might not be visible from bitcode symbol table.
@@ -431,7 +399,7 @@ private:
const SymbolResolution *&ResI, const SymbolResolution *ResE);
Error runRegularLTO(AddStreamFn AddStream);
- Error runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
+ Error runThinLTO(AddStreamFn AddStream, FileCache Cache,
const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols);
Error checkPartiallySplit();
@@ -444,6 +412,9 @@ private:
// Identify symbols exported dynamically, and that therefore could be
// referenced by a shared library not visible to the linker.
DenseSet<GlobalValue::GUID> DynamicExportSymbols;
+
+ // Diagnostic optimization remarks file
+ std::unique_ptr<ToolOutputFile> DiagnosticOutputFile;
};
/// The resolution for a symbol. The linker must provide a SymbolResolution for
diff --git a/llvm/include/llvm/LTO/SummaryBasedOptimizations.h b/llvm/include/llvm/LTO/SummaryBasedOptimizations.h
index 6697c821a5ea..508ab2587ac5 100644
--- a/llvm/include/llvm/LTO/SummaryBasedOptimizations.h
+++ b/llvm/include/llvm/LTO/SummaryBasedOptimizations.h
@@ -10,6 +10,8 @@
#define LLVM_LTO_SUMMARYBASEDOPTIMIZATIONS_H
namespace llvm {
class ModuleSummaryIndex;
+
+/// Compute synthetic function entry counts.
void computeSyntheticCounts(ModuleSummaryIndex &Index);
} // namespace llvm
diff --git a/llvm/include/llvm/LTO/legacy/LTOCodeGenerator.h b/llvm/include/llvm/LTO/legacy/LTOCodeGenerator.h
index 31688e43e174..333f483f29c5 100644
--- a/llvm/include/llvm/LTO/legacy/LTOCodeGenerator.h
+++ b/llvm/include/llvm/LTO/legacy/LTOCodeGenerator.h
@@ -176,7 +176,7 @@ struct LTOCodeGenerator {
/// created using the \p AddStream callback. Returns true on success.
///
/// Calls \a verifyMergedModuleOnce().
- bool compileOptimized(lto::AddStreamFn AddStream, unsigned ParallelismLevel);
+ bool compileOptimized(AddStreamFn AddStream, unsigned ParallelismLevel);
/// Enable the Freestanding mode: indicate that the optimizer should not
/// assume builtins are present on the target.
diff --git a/llvm/include/llvm/LTO/legacy/LTOModule.h b/llvm/include/llvm/LTO/legacy/LTOModule.h
index 2a25dab58ada..01e63db4bab3 100644
--- a/llvm/include/llvm/LTO/legacy/LTOModule.h
+++ b/llvm/include/llvm/LTO/legacy/LTOModule.h
@@ -167,6 +167,10 @@ public:
Expected<uint32_t> getMachOCPUSubType() const;
+ /// Returns true if the module has either the @llvm.global_ctors or the
+ /// @llvm.global_dtors symbol. Otherwise returns false.
+ bool hasCtorDtor() const;
+
private:
/// Parse metadata from the module
// FIXME: it only parses "llvm.linker.options" metadata at the moment
diff --git a/llvm/include/llvm/LinkAllIR.h b/llvm/include/llvm/LinkAllIR.h
index 4b0aabeee701..ceed784d557d 100644
--- a/llvm/include/llvm/LinkAllIR.h
+++ b/llvm/include/llvm/LinkAllIR.h
@@ -38,6 +38,9 @@ namespace {
// delete it all as dead code, even with whole program optimization,
// yet is effectively a NO-OP. As the compiler isn't smart enough
// to know that getenv() never returns -1, this will do the job.
+ // This is so that globals in the translation units where these functions
+ // are defined are forced to be initialized, populating various
+ // registries.
if (std::getenv("bar") != (char*) -1)
return;
llvm::LLVMContext Context;
diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h
index 45978828a8ce..c8b9aaeed76a 100644
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -64,6 +64,9 @@ namespace {
// delete it all as dead code, even with whole program optimization,
// yet is effectively a NO-OP. As the compiler isn't smart enough
// to know that getenv() never returns -1, this will do the job.
+ // This is so that globals in the translation units where these functions
+ // are defined are forced to be initialized, populating various
+ // registries.
if (std::getenv("bar") != (char*) -1)
return;
diff --git a/llvm/include/llvm/MC/MCAsmBackend.h b/llvm/include/llvm/MC/MCAsmBackend.h
index 08739d51f751..bb57c3453d10 100644
--- a/llvm/include/llvm/MC/MCAsmBackend.h
+++ b/llvm/include/llvm/MC/MCAsmBackend.h
@@ -55,7 +55,8 @@ public:
/// Give the target a chance to manipulate state related to instruction
/// alignment (e.g. padding for optimization), instruction relaxability, etc.
/// before and after actually emitting the instruction.
- virtual void emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst) {}
+ virtual void emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst,
+ const MCSubtargetInfo &STI) {}
virtual void emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst) {}
/// lifetime management
@@ -185,13 +186,16 @@ public:
/// Returns the maximum size of a nop in bytes on this target.
///
- virtual unsigned getMaximumNopSize() const { return 0; }
+ virtual unsigned getMaximumNopSize(const MCSubtargetInfo &STI) const {
+ return 0;
+ }
/// Write an (optimal) nop sequence of Count bytes to the given output. If the
/// target cannot generate such a sequence, it should return false.
///
/// \return - True on success.
- virtual bool writeNopData(raw_ostream &OS, uint64_t Count) const = 0;
+ virtual bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const = 0;
/// Give backend an opportunity to finish layout after relaxation
virtual void finishLayout(MCAssembler const &Asm,
diff --git a/llvm/include/llvm/MC/MCAsmInfoGOFF.h b/llvm/include/llvm/MC/MCAsmInfoGOFF.h
new file mode 100644
index 000000000000..1f3b26311b37
--- /dev/null
+++ b/llvm/include/llvm/MC/MCAsmInfoGOFF.h
@@ -0,0 +1,29 @@
+//===- MCAsmInfoGOFF.h - GOFF Asm Info Fields -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines certain target specific asm properties for GOFF (z/OS)
+/// based targets.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MC_MCASMINFOGOFF_H
+#define LLVM_MC_MCASMINFOGOFF_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+class MCAsmInfoGOFF : public MCAsmInfo {
+ virtual void anchor();
+
+protected:
+ MCAsmInfoGOFF();
+};
+} // end namespace llvm
+
+#endif // LLVM_MC_MCASMINFOGOFF_H
diff --git a/llvm/include/llvm/MC/MCContext.h b/llvm/include/llvm/MC/MCContext.h
index 877b2dc4ac92..bde750759a0b 100644
--- a/llvm/include/llvm/MC/MCContext.h
+++ b/llvm/include/llvm/MC/MCContext.h
@@ -817,7 +817,7 @@ namespace llvm {
// Unrecoverable error has occurred. Display the best diagnostic we can
// and bail via exit(1). For now, most MC backend errors are unrecoverable.
// FIXME: We should really do something about that.
- LLVM_ATTRIBUTE_NORETURN void reportFatalError(SMLoc L, const Twine &Msg);
+ [[noreturn]] void reportFatalError(SMLoc L, const Twine &Msg);
const MCAsmMacro *lookupMacro(StringRef Name) {
StringMap<MCAsmMacro>::iterator I = MacroMap.find(Name);
diff --git a/llvm/include/llvm/MC/MCDwarf.h b/llvm/include/llvm/MC/MCDwarf.h
index 23efdc70609b..7e72d56f3097 100644
--- a/llvm/include/llvm/MC/MCDwarf.h
+++ b/llvm/include/llvm/MC/MCDwarf.h
@@ -20,6 +20,7 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCSection.h"
+#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MD5.h"
#include <cassert>
@@ -34,7 +35,6 @@ namespace llvm {
template <typename T> class ArrayRef;
class MCAsmBackend;
class MCContext;
-class MCDwarfLineStr;
class MCObjectStreamer;
class MCStreamer;
class MCSymbol;
@@ -47,6 +47,24 @@ namespace mcdwarf {
MCSymbol *emitListsTableHeaderStart(MCStreamer &S);
} // namespace mcdwarf
+/// Manage the .debug_line_str section contents, if we use it.
+class MCDwarfLineStr {
+ MCSymbol *LineStrLabel = nullptr;
+ StringTableBuilder LineStrings{StringTableBuilder::DWARF};
+ bool UseRelocs = false;
+
+public:
+ /// Construct an instance that can emit .debug_line_str (for use in a normal
+ /// v5 line table).
+ explicit MCDwarfLineStr(MCContext &Ctx);
+
+ /// Emit a reference to the string.
+ void emitRef(MCStreamer *MCOS, StringRef Path);
+
+ /// Emit the .debug_line_str section if appropriate.
+ void emitSection(MCStreamer *MCOS);
+};
+
/// Instances of this class represent the name of the dwarf .file directive and
/// its associated dwarf file number in the MC file. MCDwarfFile's are created
/// and uniqued by the MCContext class. In Dwarf 4 file numbers start from 1;
@@ -170,6 +188,15 @@ public:
MCSymbol *getLabel() const { return Label; }
+ // This indicates the line entry is synthesized for an end entry.
+ bool IsEndEntry = false;
+
+ // Override the label with the given EndLabel.
+ void setEndLabel(MCSymbol *EndLabel) {
+ Label = EndLabel;
+ IsEndEntry = true;
+ }
+
// This is called when an instruction is assembled into the specified
// section; if there is information from the last .loc directive that has not
// yet had a line entry made for it, one is made here.
@@ -187,6 +214,10 @@ public:
MCLineDivisions[Sec].push_back(LineEntry);
}
+ // Add an end entry by cloning the last entry, if one exists, for the section
+ // the given EndLabel belongs to. The label is replaced by the given EndLabel.
+ void addEndEntry(MCSymbol *EndLabel);
+
using MCDwarfLineEntryCollection = std::vector<MCDwarfLineEntry>;
using iterator = MCDwarfLineEntryCollection::iterator;
using const_iterator = MCDwarfLineEntryCollection::const_iterator;
@@ -317,6 +348,11 @@ public:
void emitCU(MCStreamer *MCOS, MCDwarfLineTableParams Params,
Optional<MCDwarfLineStr> &LineStr) const;
+ // This emits a single line table associated with a given Section.
+ static void
+ emitOne(MCStreamer *MCOS, MCSection *Section,
+ const MCLineSection::MCDwarfLineEntryCollection &LineEntries);
+
Expected<unsigned> tryGetFile(StringRef &Directory, StringRef &FileName,
Optional<MD5::MD5Result> Checksum,
Optional<StringRef> Source,
diff --git a/llvm/include/llvm/MC/MCELFObjectWriter.h b/llvm/include/llvm/MC/MCELFObjectWriter.h
index 9f4b8de7947b..fa17759bc21a 100644
--- a/llvm/include/llvm/MC/MCELFObjectWriter.h
+++ b/llvm/include/llvm/MC/MCELFObjectWriter.h
@@ -78,6 +78,8 @@ public:
case Triple::PS4:
case Triple::FreeBSD:
return ELF::ELFOSABI_FREEBSD;
+ case Triple::Solaris:
+ return ELF::ELFOSABI_SOLARIS;
default:
return ELF::ELFOSABI_NONE;
}
diff --git a/llvm/include/llvm/MC/MCELFStreamer.h b/llvm/include/llvm/MC/MCELFStreamer.h
index 8c1e22a14702..8f2b176862c8 100644
--- a/llvm/include/llvm/MC/MCELFStreamer.h
+++ b/llvm/include/llvm/MC/MCELFStreamer.h
@@ -39,7 +39,7 @@ public:
/// \name MCStreamer Interface
/// @{
- void InitSections(bool NoExecStack) override;
+ void initSections(bool NoExecStack, const MCSubtargetInfo &STI) override;
void changeSection(MCSection *Section, const MCExpr *Subsection) override;
void emitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
void emitLabelAtPos(MCSymbol *Symbol, SMLoc Loc, MCFragment *F,
diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h
index 38cca2413e1e..bf1f32bb91ba 100644
--- a/llvm/include/llvm/MC/MCExpr.h
+++ b/llvm/include/llvm/MC/MCExpr.h
@@ -200,6 +200,7 @@ public:
VK_GOTREL,
VK_PCREL,
VK_GOTPCREL,
+ VK_GOTPCREL_NORELAX,
VK_GOTTPOFF,
VK_INDNTPOFF,
VK_NTPOFF,
@@ -328,6 +329,7 @@ public:
VK_WASM_TLSREL, // Memory address relative to __tls_base
VK_WASM_MBREL, // Memory address relative to __memory_base
VK_WASM_TBREL, // Table index relative to __table_base
+ VK_WASM_GOT_TLS, // Wasm global index of TLS symbol.
VK_AMDGPU_GOTPCREL32_LO, // symbol@gotpcrel32@lo
VK_AMDGPU_GOTPCREL32_HI, // symbol@gotpcrel32@hi
diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h
index f3a785fb09b7..736fdd992063 100644
--- a/llvm/include/llvm/MC/MCFragment.h
+++ b/llvm/include/llvm/MC/MCFragment.h
@@ -311,6 +311,9 @@ class MCAlignFragment : public MCFragment {
/// cannot be satisfied in this width then this fragment is ignored.
unsigned MaxBytesToEmit;
+ /// When emitting Nops some subtargets have specific nop encodings.
+ const MCSubtargetInfo *STI;
+
public:
MCAlignFragment(unsigned Alignment, int64_t Value, unsigned ValueSize,
unsigned MaxBytesToEmit, MCSection *Sec = nullptr)
@@ -326,7 +329,12 @@ public:
unsigned getMaxBytesToEmit() const { return MaxBytesToEmit; }
bool hasEmitNops() const { return EmitNops; }
- void setEmitNops(bool Value) { EmitNops = Value; }
+ void setEmitNops(bool Value, const MCSubtargetInfo *STI) {
+ EmitNops = Value;
+ this->STI = STI;
+ }
+
+ const MCSubtargetInfo *getSubtargetInfo() const { return STI; }
static bool classof(const MCFragment *F) {
return F->getKind() == MCFragment::FT_Align;
@@ -369,17 +377,22 @@ class MCNopsFragment : public MCFragment {
/// Source location of the directive that this fragment was created for.
SMLoc Loc;
+ /// When emitting Nops some subtargets have specific nop encodings.
+ const MCSubtargetInfo &STI;
+
public:
MCNopsFragment(int64_t NumBytes, int64_t ControlledNopLength, SMLoc L,
- MCSection *Sec = nullptr)
+ const MCSubtargetInfo &STI, MCSection *Sec = nullptr)
: MCFragment(FT_Nops, false, Sec), Size(NumBytes),
- ControlledNopLength(ControlledNopLength), Loc(L) {}
+ ControlledNopLength(ControlledNopLength), Loc(L), STI(STI) {}
int64_t getNumBytes() const { return Size; }
int64_t getControlledNopLength() const { return ControlledNopLength; }
SMLoc getLoc() const { return Loc; }
+ const MCSubtargetInfo *getSubtargetInfo() const { return &STI; }
+
static bool classof(const MCFragment *F) {
return F->getKind() == MCFragment::FT_Nops;
}
@@ -572,10 +585,14 @@ class MCBoundaryAlignFragment : public MCFragment {
/// is not meaningful before that.
uint64_t Size = 0;
+ /// When emitting Nops some subtargets have specific nop encodings.
+ const MCSubtargetInfo &STI;
+
public:
- MCBoundaryAlignFragment(Align AlignBoundary, MCSection *Sec = nullptr)
- : MCFragment(FT_BoundaryAlign, false, Sec), AlignBoundary(AlignBoundary) {
- }
+ MCBoundaryAlignFragment(Align AlignBoundary, const MCSubtargetInfo &STI,
+ MCSection *Sec = nullptr)
+ : MCFragment(FT_BoundaryAlign, false, Sec), AlignBoundary(AlignBoundary),
+ STI(STI) {}
uint64_t getSize() const { return Size; }
void setSize(uint64_t Value) { Size = Value; }
@@ -589,6 +606,8 @@ public:
LastFragment = F;
}
+ const MCSubtargetInfo *getSubtargetInfo() const { return &STI; }
+
static bool classof(const MCFragment *F) {
return F->getKind() == MCFragment::FT_BoundaryAlign;
}
diff --git a/llvm/include/llvm/MC/MCInstrAnalysis.h b/llvm/include/llvm/MC/MCInstrAnalysis.h
index 898ca47b13b8..632a7d8f820e 100644
--- a/llvm/include/llvm/MC/MCInstrAnalysis.h
+++ b/llvm/include/llvm/MC/MCInstrAnalysis.h
@@ -154,9 +154,14 @@ public:
/// Given an instruction, tries to get the address of a memory operand. Returns
/// the address on success.
- virtual Optional<uint64_t> evaluateMemoryOperandAddress(const MCInst &Inst,
- uint64_t Addr,
- uint64_t Size) const;
+ virtual Optional<uint64_t>
+ evaluateMemoryOperandAddress(const MCInst &Inst, const MCSubtargetInfo *STI,
+ uint64_t Addr, uint64_t Size) const;
+
+ /// Given an instruction with a memory operand that could require relocation,
+ /// returns the offset within the instruction of that relocation.
+ virtual Optional<uint64_t>
+ getMemoryOperandRelocationOffset(const MCInst &Inst, uint64_t Size) const;
/// Returns (PLT virtual address, GOT virtual address) pairs for PLT entries.
virtual std::vector<std::pair<uint64_t, uint64_t>>
diff --git a/llvm/include/llvm/MC/MCInstrDesc.h b/llvm/include/llvm/MC/MCInstrDesc.h
index 0e6b677098e8..e8ffd29170e6 100644
--- a/llvm/include/llvm/MC/MCInstrDesc.h
+++ b/llvm/include/llvm/MC/MCInstrDesc.h
@@ -76,7 +76,7 @@ enum OperandType {
OPERAND_FIRST_TARGET = 13,
};
-}
+} // namespace MCOI
/// This holds information about one operand of a machine instruction,
/// indicating the register class for register operands, etc.
@@ -185,7 +185,7 @@ enum Flag {
VariadicOpsAreDefs,
Authenticated,
};
-}
+} // namespace MCID
/// Describe properties that are true of each instruction in the target
/// description file. This captures information about side effects, register
diff --git a/llvm/include/llvm/MC/MCObjectFileInfo.h b/llvm/include/llvm/MC/MCObjectFileInfo.h
index 8ae86ef2a574..ba7450ac64f1 100644
--- a/llvm/include/llvm/MC/MCObjectFileInfo.h
+++ b/llvm/include/llvm/MC/MCObjectFileInfo.h
@@ -225,10 +225,13 @@ protected:
// XCOFF specific sections
MCSection *TOCBaseSection = nullptr;
+ MCSection *ReadOnly8Section = nullptr;
+ MCSection *ReadOnly16Section = nullptr;
public:
void initMCObjectFileInfo(MCContext &MCCtx, bool PIC,
bool LargeCodeModel = false);
+ virtual ~MCObjectFileInfo();
MCContext &getContext() const { return *Ctx; }
bool getSupportsWeakOmittedEHFrame() const {
@@ -251,6 +254,7 @@ public:
return CompactUnwindDwarfEHFrameOnly;
}
+ virtual unsigned getTextSectionAlignment() const { return 4; }
MCSection *getTextSection() const { return TextSection; }
MCSection *getDataSection() const { return DataSection; }
MCSection *getBSSSection() const { return BSSSection; }
diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h
index dcdee2b5774b..9d6416e4a18d 100644
--- a/llvm/include/llvm/MC/MCObjectStreamer.h
+++ b/llvm/include/llvm/MC/MCObjectStreamer.h
@@ -137,7 +137,7 @@ public:
void emitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
unsigned ValueSize = 1,
unsigned MaxBytesToEmit = 0) override;
- void emitCodeAlignment(unsigned ByteAlignment,
+ void emitCodeAlignment(unsigned ByteAlignment, const MCSubtargetInfo *STI,
unsigned MaxBytesToEmit = 0) override;
void emitValueToOffset(const MCExpr *Offset, unsigned char Value,
SMLoc Loc) override;
@@ -181,8 +181,8 @@ public:
SMLoc Loc = SMLoc()) override;
void emitFill(const MCExpr &NumValues, int64_t Size, int64_t Expr,
SMLoc Loc = SMLoc()) override;
- void emitNops(int64_t NumBytes, int64_t ControlledNopLength,
- SMLoc Loc) override;
+ void emitNops(int64_t NumBytes, int64_t ControlledNopLength, SMLoc Loc,
+ const MCSubtargetInfo &STI) override;
void emitFileDirective(StringRef Filename) override;
void emitFileDirective(StringRef Filename, StringRef CompilerVerion,
StringRef TimeStamp, StringRef Description) override;
diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h
index 1c6926b9a9e6..abc9705f0851 100644
--- a/llvm/include/llvm/MC/MCPseudoProbe.h
+++ b/llvm/include/llvm/MC/MCPseudoProbe.h
@@ -44,17 +44,26 @@
#ifndef LLVM_MC_MCPSEUDOPROBE_H
#define LLVM_MC_MCPSEUDOPROBE_H
-#include "llvm/ADT/MapVector.h"
-#include "llvm/MC/MCSection.h"
-#include <functional>
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/PseudoProbe.h"
+#include "llvm/Support/ErrorOr.h"
+#include <list>
#include <map>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <unordered_map>
#include <vector>
namespace llvm {
+class MCSection;
class MCStreamer;
class MCSymbol;
class MCObjectStreamer;
+class raw_ostream;
enum class MCPseudoProbeFlag {
// If set, indicates that the probe is encoded as an address delta
@@ -62,69 +71,211 @@ enum class MCPseudoProbeFlag {
AddressDelta = 0x1,
};
+// Function descriptor decoded from .pseudo_probe_desc section
+struct MCPseudoProbeFuncDesc {
+ uint64_t FuncGUID = 0;
+ uint64_t FuncHash = 0;
+ std::string FuncName;
+
+ MCPseudoProbeFuncDesc(uint64_t GUID, uint64_t Hash, StringRef Name)
+ : FuncGUID(GUID), FuncHash(Hash), FuncName(Name){};
+
+ void print(raw_ostream &OS);
+};
+
+class MCPseudoProbe;
+class MCDecodedPseudoProbe;
+
+// An inline frame has the form <Guid, ProbeID>
+using InlineSite = std::tuple<uint64_t, uint32_t>;
+using MCPseudoProbeInlineStack = SmallVector<InlineSite, 8>;
+// GUID to PseudoProbeFuncDesc map
+using GUIDProbeFunctionMap =
+ std::unordered_map<uint64_t, MCPseudoProbeFuncDesc>;
+// Address to pseudo probes map.
+using AddressProbesMap =
+ std::unordered_map<uint64_t, std::list<MCDecodedPseudoProbe>>;
+
+class MCPseudoProbeInlineTree;
+class MCDecodedPseudoProbeInlineTree;
+
+class MCPseudoProbeBase {
+protected:
+ uint64_t Guid;
+ uint64_t Index;
+ uint8_t Attributes;
+ uint8_t Type;
+ // The value should be equal to PseudoProbeReservedId::Last + 1 which is
+ // defined in SampleProfileProbe.h. The header file is not included here to
+ // reduce the dependency from MC to IPO.
+ const static uint32_t PseudoProbeFirstId = 1;
+
+public:
+ MCPseudoProbeBase(uint64_t G, uint64_t I, uint64_t At, uint8_t T)
+ : Guid(G), Index(I), Attributes(At), Type(T) {}
+
+ bool isEntry() const { return Index == PseudoProbeFirstId; }
+
+ uint64_t getGuid() const { return Guid; }
+
+ uint64_t getIndex() const { return Index; }
+
+ uint8_t getAttributes() const { return Attributes; }
+
+ uint8_t getType() const { return Type; }
+
+ bool isBlock() const {
+ return Type == static_cast<uint8_t>(PseudoProbeType::Block);
+ }
+
+ bool isIndirectCall() const {
+ return Type == static_cast<uint8_t>(PseudoProbeType::IndirectCall);
+ }
+
+ bool isDirectCall() const {
+ return Type == static_cast<uint8_t>(PseudoProbeType::DirectCall);
+ }
+
+ bool isCall() const { return isIndirectCall() || isDirectCall(); }
+
+ void setAttributes(uint8_t Attr) { Attributes = Attr; }
+};
+
/// Instances of this class represent a pseudo probe instance for a pseudo probe
/// table entry, which is created while a machine instruction is assembled and
/// uses an address from a temporary label created at the current address in the
/// current section.
-class MCPseudoProbe {
+class MCPseudoProbe : public MCPseudoProbeBase {
MCSymbol *Label;
- uint64_t Guid;
- uint64_t Index;
- uint8_t Type;
- uint8_t Attributes;
public:
MCPseudoProbe(MCSymbol *Label, uint64_t Guid, uint64_t Index, uint64_t Type,
uint64_t Attributes)
- : Label(Label), Guid(Guid), Index(Index), Type(Type),
- Attributes(Attributes) {
+ : MCPseudoProbeBase(Guid, Index, Attributes, Type), Label(Label) {
assert(Type <= 0xFF && "Probe type too big to encode, exceeding 2^8");
assert(Attributes <= 0xFF &&
"Probe attributes too big to encode, exceeding 2^16");
}
MCSymbol *getLabel() const { return Label; }
+ void emit(MCObjectStreamer *MCOS, const MCPseudoProbe *LastProbe) const;
+};
- uint64_t getGuid() const { return Guid; }
+// Represents a callsite with caller function name and probe id
+using MCPseduoProbeFrameLocation = std::pair<StringRef, uint32_t>;
- uint64_t getIndex() const { return Index; }
+class MCDecodedPseudoProbe : public MCPseudoProbeBase {
+ uint64_t Address;
+ MCDecodedPseudoProbeInlineTree *InlineTree;
- uint8_t getType() const { return Type; }
+public:
+ MCDecodedPseudoProbe(uint64_t Ad, uint64_t G, uint32_t I, PseudoProbeType K,
+ uint8_t At, MCDecodedPseudoProbeInlineTree *Tree)
+ : MCPseudoProbeBase(G, I, At, static_cast<uint8_t>(K)), Address(Ad),
+ InlineTree(Tree){};
- uint8_t getAttributes() const { return Attributes; }
+ uint64_t getAddress() const { return Address; }
- void emit(MCObjectStreamer *MCOS, const MCPseudoProbe *LastProbe) const;
+ void setAddress(uint64_t Addr) { Address = Addr; }
+
+ MCDecodedPseudoProbeInlineTree *getInlineTreeNode() const {
+ return InlineTree;
+ }
+
+ // Get the inlined context by traversing the current inline tree backwards;
+ // each tree node has its InlineSite, which is taken as the context.
+ // \p ContextStack is populated in root-to-leaf order.
+ void
+ getInlineContext(SmallVectorImpl<MCPseduoProbeFrameLocation> &ContextStack,
+ const GUIDProbeFunctionMap &GUID2FuncMAP) const;
+
+ // Helper function to get the string from context stack
+ std::string
+ getInlineContextStr(const GUIDProbeFunctionMap &GUID2FuncMAP) const;
+
+ // Print pseudo probe while disassembling
+ void print(raw_ostream &OS, const GUIDProbeFunctionMap &GUID2FuncMAP,
+ bool ShowName) const;
};
-// An inline frame has the form <Guid, ProbeID>
-using InlineSite = std::tuple<uint64_t, uint32_t>;
-using MCPseudoProbeInlineStack = SmallVector<InlineSite, 8>;
+template <typename ProbeType, typename DerivedProbeInlineTreeType>
+class MCPseudoProbeInlineTreeBase {
+ struct InlineSiteHash {
+ uint64_t operator()(const InlineSite &Site) const {
+ return std::get<0>(Site) ^ std::get<1>(Site);
+ }
+ };
-// A Tri-tree based data structure to group probes by inline stack.
-// A tree is allocated for a standalone .text section. A fake
-// instance is created as the root of a tree.
-// A real instance of this class is created for each function, either an
-// unlined function that has code in .text section or an inlined function.
-class MCPseudoProbeInlineTree {
- uint64_t Guid;
+protected:
+ // Track children (e.g. inlinees) of current context
+ using InlinedProbeTreeMap = std::unordered_map<
+ InlineSite, std::unique_ptr<DerivedProbeInlineTreeType>, InlineSiteHash>;
+ InlinedProbeTreeMap Children;
// Set of probes that come with the function.
- std::vector<MCPseudoProbe> Probes;
- // Use std::map for a deterministic output.
- std::map<InlineSite, MCPseudoProbeInlineTree *> Inlinees;
+ std::vector<ProbeType> Probes;
+ MCPseudoProbeInlineTreeBase() {
+ static_assert(std::is_base_of<MCPseudoProbeInlineTreeBase,
+ DerivedProbeInlineTreeType>::value,
+ "DerivedProbeInlineTreeType must be subclass of "
+ "MCPseudoProbeInlineTreeBase");
+ }
+
+public:
+ uint64_t Guid = 0;
// Root node has a GUID 0.
- bool isRoot() { return Guid == 0; }
- MCPseudoProbeInlineTree *getOrAddNode(InlineSite Site);
+ bool isRoot() const { return Guid == 0; }
+ InlinedProbeTreeMap &getChildren() { return Children; }
+ const InlinedProbeTreeMap &getChildren() const { return Children; }
+ std::vector<ProbeType> &getProbes() { return Probes; }
+ void addProbes(ProbeType Probe) { Probes.push_back(Probe); }
+ // Caller node of the inline site
+ MCPseudoProbeInlineTreeBase<ProbeType, DerivedProbeInlineTreeType> *Parent;
+ DerivedProbeInlineTreeType *getOrAddNode(const InlineSite &Site) {
+ auto Ret = Children.emplace(
+ Site, std::make_unique<DerivedProbeInlineTreeType>(Site));
+ Ret.first->second->Parent = this;
+ return Ret.first->second.get();
+ };
+};
+// A trie-based data structure to group probes by inline stack.
+// A tree is allocated for a standalone .text section. A fake
+// instance is created as the root of a tree.
+// A real instance of this class is created for each function, either a
+// non-inlined function that has code in the .text section or an inlined
+// function.
+class MCPseudoProbeInlineTree
+ : public MCPseudoProbeInlineTreeBase<MCPseudoProbe,
+ MCPseudoProbeInlineTree> {
public:
MCPseudoProbeInlineTree() = default;
- MCPseudoProbeInlineTree(uint64_t Guid) : Guid(Guid) {}
- ~MCPseudoProbeInlineTree();
+ MCPseudoProbeInlineTree(uint64_t Guid) { this->Guid = Guid; }
+ MCPseudoProbeInlineTree(const InlineSite &Site) {
+ this->Guid = std::get<0>(Site);
+ }
+
+ // MCPseudoProbeInlineTree method based on Inlinees
void addPseudoProbe(const MCPseudoProbe &Probe,
const MCPseudoProbeInlineStack &InlineStack);
void emit(MCObjectStreamer *MCOS, const MCPseudoProbe *&LastProbe);
};
+// inline tree node for the decoded pseudo probe
+class MCDecodedPseudoProbeInlineTree
+ : public MCPseudoProbeInlineTreeBase<MCDecodedPseudoProbe *,
+ MCDecodedPseudoProbeInlineTree> {
+public:
+ InlineSite ISite;
+ // Used for decoding
+ uint32_t ChildrenToProcess = 0;
+
+ MCDecodedPseudoProbeInlineTree(){};
+ MCDecodedPseudoProbeInlineTree(const InlineSite &Site) : ISite(Site){};
+
+ // Return false if it's a dummy inline site
+ bool hasInlineSite() const { return std::get<0>(ISite) != 0; }
+};
+
/// Instances of this class represent the pseudo probes inserted into a compile
/// unit.
class MCPseudoProbeSection {
@@ -172,6 +323,83 @@ public:
static int DdgPrintIndent;
#endif
};
+
+class MCPseudoProbeDecoder {
+ // GUID to PseudoProbeFuncDesc map.
+ GUIDProbeFunctionMap GUID2FuncDescMap;
+
+ // Address to probes map.
+ AddressProbesMap Address2ProbesMap;
+
+ // The dummy root of the inline trie; all outlined functions are direct
+ // children of the dummy root, and all inlined functions are children of
+ // their inliner. So the relation would be like:
+ // DummyRoot --> OutlinedFunc --> InlinedFunc1 --> InlinedFunc2
+ MCDecodedPseudoProbeInlineTree DummyInlineRoot;
+
+ /// Points to the current location in the buffer.
+ const uint8_t *Data = nullptr;
+
+ /// Points to the end of the buffer.
+ const uint8_t *End = nullptr;
+
+ // Decoding helper function
+ template <typename T> ErrorOr<T> readUnencodedNumber();
+ template <typename T> ErrorOr<T> readUnsignedNumber();
+ template <typename T> ErrorOr<T> readSignedNumber();
+ ErrorOr<StringRef> readString(uint32_t Size);
+
+public:
+ // Decode pseudo_probe_desc section to build GUID to PseudoProbeFuncDesc map.
+ bool buildGUID2FuncDescMap(const uint8_t *Start, std::size_t Size);
+
+ // Decode pseudo_probe section to build address to probes map.
+ bool buildAddress2ProbeMap(const uint8_t *Start, std::size_t Size);
+
+ // Print pseudo_probe_desc section info
+ void printGUID2FuncDescMap(raw_ostream &OS);
+
+ // Print pseudo_probe section info, used along with show-disassembly
+ void printProbeForAddress(raw_ostream &OS, uint64_t Address);
+
+ // do printProbeForAddress for all addresses
+ void printProbesForAllAddresses(raw_ostream &OS);
+
+ // Look up the probe of a call for the input address
+ const MCDecodedPseudoProbe *getCallProbeForAddr(uint64_t Address) const;
+
+ const MCPseudoProbeFuncDesc *getFuncDescForGUID(uint64_t GUID) const;
+
+ // Helper function to populate one probe's inline stack into
+ // \p InlineContextStack.
+ // Current leaf location info will be added if IncludeLeaf is true
+ // Example:
+ // Current probe(bar:3) inlined at foo:2 then inlined at main:1
+ // IncludeLeaf = true, Output: [main:1, foo:2, bar:3]
+ // IncludeLeaf = false, Output: [main:1, foo:2]
+ void getInlineContextForProbe(
+ const MCDecodedPseudoProbe *Probe,
+ SmallVectorImpl<MCPseduoProbeFrameLocation> &InlineContextStack,
+ bool IncludeLeaf) const;
+
+ const AddressProbesMap &getAddress2ProbesMap() const {
+ return Address2ProbesMap;
+ }
+
+ AddressProbesMap &getAddress2ProbesMap() { return Address2ProbesMap; }
+
+ const GUIDProbeFunctionMap &getGUID2FuncDescMap() const {
+ return GUID2FuncDescMap;
+ }
+
+ const MCPseudoProbeFuncDesc *
+ getInlinerDescForProbe(const MCDecodedPseudoProbe *Probe) const;
+
+ const MCDecodedPseudoProbeInlineTree &getDummyInlineRoot() const {
+ return DummyInlineRoot;
+ }
+};
+
} // end namespace llvm
#endif // LLVM_MC_MCPSEUDOPROBE_H
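
To make the decoder-side additions concrete, here is a hedged usage sketch. It assumes the raw bytes of the .pseudo_probe_desc and .pseudo_probe sections have already been extracted elsewhere, and it only calls methods declared in the header above:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/MC/MCPseudoProbe.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  void dumpProbeContext(const uint8_t *DescStart, size_t DescSize,
                        const uint8_t *ProbeStart, size_t ProbeSize,
                        uint64_t Address) {
    MCPseudoProbeDecoder Decoder;
    // Build the two maps from the raw section contents.
    if (!Decoder.buildGUID2FuncDescMap(DescStart, DescSize) ||
        !Decoder.buildAddress2ProbeMap(ProbeStart, ProbeSize))
      return;

    if (const MCDecodedPseudoProbe *Probe =
            Decoder.getCallProbeForAddr(Address)) {
      // Spelling of the frame-location type follows the header above.
      SmallVector<MCPseduoProbeFrameLocation, 4> Context;
      // IncludeLeaf = true gives e.g. [main:1, foo:2, bar:3].
      Decoder.getInlineContextForProbe(Probe, Context, /*IncludeLeaf=*/true);
      for (const auto &Frame : Context)
        errs() << Frame.first << ":" << Frame.second << "\n";
    }
  }
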
diff --git a/llvm/include/llvm/MC/MCRegister.h b/llvm/include/llvm/MC/MCRegister.h
index 72507b7d8ee4..1e8c747785eb 100644
--- a/llvm/include/llvm/MC/MCRegister.h
+++ b/llvm/include/llvm/MC/MCRegister.h
@@ -10,6 +10,7 @@
#define LLVM_MC_MCREGISTER_H
#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/Hashing.h"
#include <cassert>
#include <limits>
diff --git a/llvm/include/llvm/MC/MCSchedule.h b/llvm/include/llvm/MC/MCSchedule.h
index acfbfd387ff3..6dffc158af50 100644
--- a/llvm/include/llvm/MC/MCSchedule.h
+++ b/llvm/include/llvm/MC/MCSchedule.h
@@ -14,7 +14,6 @@
#ifndef LLVM_MC_MCSCHEDULE_H
#define LLVM_MC_MCSCHEDULE_H
-#include "llvm/ADT/Optional.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/Support/DataTypes.h"
#include <cassert>
diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h
index fd326ff18712..e00f50f617fa 100644
--- a/llvm/include/llvm/MC/MCStreamer.h
+++ b/llvm/include/llvm/MC/MCStreamer.h
@@ -123,6 +123,8 @@ public:
/// This is used to emit bytes in \p Data as sequence of .byte directives.
virtual void emitRawBytes(StringRef Data);
+ virtual void emitConstantPools();
+
virtual void finish();
};
@@ -165,7 +167,7 @@ public:
virtual void emitThumbSet(MCSymbol *Symbol, const MCExpr *Value);
- void finish() override;
+ void emitConstantPools() override;
/// Reset any state between object emissions, i.e. the equivalent of
/// MCStreamer's reset method.
@@ -445,7 +447,7 @@ public:
}
/// Create the default sections and set the initial one.
- virtual void InitSections(bool NoExecStack);
+ virtual void initSections(bool NoExecStack, const MCSubtargetInfo &STI);
MCSymbol *endSection(MCSection *Section);
@@ -797,7 +799,7 @@ public:
SMLoc Loc = SMLoc());
virtual void emitNops(int64_t NumBytes, int64_t ControlledNopLength,
- SMLoc Loc);
+ SMLoc Loc, const MCSubtargetInfo& STI);
/// Emit NumBytes worth of zeros.
/// This function properly handles data in virtual sections.
@@ -831,10 +833,12 @@ public:
///
/// \param ByteAlignment - The alignment to reach. This must be a power of
/// two on some targets.
+ /// \param STI - The MCSubtargetInfo in operation when padding is emitted.
/// \param MaxBytesToEmit - The maximum numbers of bytes to emit, or 0. If
/// the alignment cannot be reached in this many bytes, no bytes are
/// emitted.
virtual void emitCodeAlignment(unsigned ByteAlignment,
+ const MCSubtargetInfo *STI,
unsigned MaxBytesToEmit = 0);
/// Emit some number of copies of \p Value until the byte offset \p
diff --git a/llvm/include/llvm/MC/MCSymbolWasm.h b/llvm/include/llvm/MC/MCSymbolWasm.h
index 852ab678e616..5a4852e0e895 100644
--- a/llvm/include/llvm/MC/MCSymbolWasm.h
+++ b/llvm/include/llvm/MC/MCSymbolWasm.h
@@ -27,7 +27,6 @@ class MCSymbolWasm : public MCSymbol {
wasm::WasmSignature *Signature = nullptr;
Optional<wasm::WasmGlobalType> GlobalType;
Optional<wasm::WasmTableType> TableType;
- Optional<wasm::WasmTagType> TagType;
/// An expression describing how to calculate the size of a symbol. If a
/// symbol has no size this field will be NULL.
@@ -67,6 +66,11 @@ public:
modifyFlags(wasm::WASM_SYMBOL_NO_STRIP, wasm::WASM_SYMBOL_NO_STRIP);
}
+ bool isTLS() const { return getFlags() & wasm::WASM_SYMBOL_TLS; }
+ void setTLS() const {
+ modifyFlags(wasm::WASM_SYMBOL_TLS, wasm::WASM_SYMBOL_TLS);
+ }
+
bool isWeak() const { return IsWeak; }
void setWeak(bool isWeak) { IsWeak = isWeak; }
@@ -142,12 +146,6 @@ public:
wasm::WasmLimits Limits = {wasm::WASM_LIMITS_FLAG_NONE, 0, 0};
setTableType({uint8_t(VT), Limits});
}
-
- const wasm::WasmTagType &getTagType() const {
- assert(TagType.hasValue());
- return TagType.getValue();
- }
- void setTagType(wasm::WasmTagType ET) { TagType = ET; }
};
} // end namespace llvm
diff --git a/llvm/include/llvm/MC/MCWasmStreamer.h b/llvm/include/llvm/MC/MCWasmStreamer.h
index 6651f071f799..818f59e5ab3e 100644
--- a/llvm/include/llvm/MC/MCWasmStreamer.h
+++ b/llvm/include/llvm/MC/MCWasmStreamer.h
@@ -41,6 +41,9 @@ public:
/// @{
void changeSection(MCSection *Section, const MCExpr *Subsection) override;
+ void emitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
+ void emitLabelAtPos(MCSymbol *Symbol, SMLoc Loc, MCFragment *F,
+ uint64_t Offset) override;
void emitAssemblerFlag(MCAssemblerFlag Flag) override;
void emitThumbFunc(MCSymbol *Func) override;
void emitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) override;
@@ -68,6 +71,8 @@ private:
void emitInstToFragment(const MCInst &Inst, const MCSubtargetInfo &) override;
void emitInstToData(const MCInst &Inst, const MCSubtargetInfo &) override;
+ void fixSymbolsInTLSFixups(const MCExpr *expr);
+
/// Merge the content of the fragment \p EF into the fragment \p DF.
void mergeFragment(MCDataFragment *, MCDataFragment *);
diff --git a/llvm/include/llvm/MC/MCWinCOFFStreamer.h b/llvm/include/llvm/MC/MCWinCOFFStreamer.h
index 53b2ef0bd96e..af1ed6faf753 100644
--- a/llvm/include/llvm/MC/MCWinCOFFStreamer.h
+++ b/llvm/include/llvm/MC/MCWinCOFFStreamer.h
@@ -39,7 +39,7 @@ public:
/// \name MCStreamer interface
/// \{
- void InitSections(bool NoExecStack) override;
+ void initSections(bool NoExecStack, const MCSubtargetInfo &STI) override;
void emitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
void emitAssemblerFlag(MCAssemblerFlag Flag) override;
void emitThumbFunc(MCSymbol *Func) override;
diff --git a/llvm/include/llvm/Support/TargetRegistry.h b/llvm/include/llvm/MC/TargetRegistry.h
index e661ae26cb4e..da9a9269edbf 100644
--- a/llvm/include/llvm/Support/TargetRegistry.h
+++ b/llvm/include/llvm/MC/TargetRegistry.h
@@ -1,4 +1,4 @@
-//===- Support/TargetRegistry.h - Target Registration -----------*- C++ -*-===//
+//===- MC/TargetRegistry.h - Target Registration ----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -15,8 +15,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_SUPPORT_TARGETREGISTRY_H
-#define LLVM_SUPPORT_TARGETREGISTRY_H
+#ifndef LLVM_MC_TARGETREGISTRY_H
+#define LLVM_MC_TARGETREGISTRY_H
#include "llvm-c/DisassemblerTypes.h"
#include "llvm/ADT/Optional.h"
@@ -59,6 +59,11 @@ class raw_ostream;
class raw_pwrite_stream;
class TargetMachine;
class TargetOptions;
+namespace mca {
+class CustomBehaviour;
+class InstrPostProcess;
+class SourceMgr;
+} // namespace mca
MCStreamer *createNullStreamer(MCContext &Ctx);
// Takes ownership of \p TAB and \p CE.
@@ -114,6 +119,13 @@ MCSymbolizer *createMCSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo,
void *DisInfo, MCContext *Ctx,
std::unique_ptr<MCRelocationInfo> &&RelInfo);
+mca::CustomBehaviour *createCustomBehaviour(const MCSubtargetInfo &STI,
+ const mca::SourceMgr &SrcMgr,
+ const MCInstrInfo &MCII);
+
+mca::InstrPostProcess *createInstrPostProcess(const MCSubtargetInfo &STI,
+ const MCInstrInfo &MCII);
+
/// Target - Wrapper for Target specific information.
///
/// For registration purposes, this is a POD type so that targets can be
@@ -206,6 +218,15 @@ public:
LLVMSymbolLookupCallback SymbolLookUp, void *DisInfo, MCContext *Ctx,
std::unique_ptr<MCRelocationInfo> &&RelInfo);
+ using CustomBehaviourCtorTy =
+ mca::CustomBehaviour *(*)(const MCSubtargetInfo &STI,
+ const mca::SourceMgr &SrcMgr,
+ const MCInstrInfo &MCII);
+
+ using InstrPostProcessCtorTy =
+ mca::InstrPostProcess *(*)(const MCSubtargetInfo &STI,
+ const MCInstrInfo &MCII);
+
private:
/// Next - The next registered target in the linked list, maintained by the
/// TargetRegistry.
@@ -305,6 +326,14 @@ private:
/// MCSymbolizer, if registered (default = llvm::createMCSymbolizer)
MCSymbolizerCtorTy MCSymbolizerCtorFn = nullptr;
+ /// CustomBehaviourCtorFn - Construction function for this target's
+ /// CustomBehaviour, if registered (default = nullptr).
+ CustomBehaviourCtorTy CustomBehaviourCtorFn = nullptr;
+
+ /// InstrPostProcessCtorFn - Construction function for this target's
+ /// InstrPostProcess, if registered (default = nullptr).
+ InstrPostProcessCtorTy InstrPostProcessCtorFn = nullptr;
+
public:
Target() = default;
@@ -623,6 +652,25 @@ public:
std::move(RelInfo));
}
+ /// createCustomBehaviour - Create a target specific CustomBehaviour.
+ /// This class is used by llvm-mca and requires backend functionality.
+ mca::CustomBehaviour *createCustomBehaviour(const MCSubtargetInfo &STI,
+ const mca::SourceMgr &SrcMgr,
+ const MCInstrInfo &MCII) const {
+ if (CustomBehaviourCtorFn)
+ return CustomBehaviourCtorFn(STI, SrcMgr, MCII);
+ return nullptr;
+ }
+
+ /// createInstrPostProcess - Create a target specific InstrPostProcess.
+ /// This class is used by llvm-mca and requires backend functionality.
+ mca::InstrPostProcess *createInstrPostProcess(const MCSubtargetInfo &STI,
+ const MCInstrInfo &MCII) const {
+ if (InstrPostProcessCtorFn)
+ return InstrPostProcessCtorFn(STI, MCII);
+ return nullptr;
+ }
+
/// @}
};
@@ -959,6 +1007,34 @@ struct TargetRegistry {
T.MCSymbolizerCtorFn = Fn;
}
+ /// RegisterCustomBehaviour - Register a CustomBehaviour
+ /// implementation for the given target.
+ ///
+ /// Clients are responsible for ensuring that registration doesn't occur
+ /// while another thread is attempting to access the registry. Typically
+ /// this is done by initializing all targets at program startup.
+ ///
+ /// @param T - The target being registered.
+ /// @param Fn - A function to construct a CustomBehaviour for the target.
+ static void RegisterCustomBehaviour(Target &T,
+ Target::CustomBehaviourCtorTy Fn) {
+ T.CustomBehaviourCtorFn = Fn;
+ }
+
+ /// RegisterInstrPostProcess - Register an InstrPostProcess
+ /// implementation for the given target.
+ ///
+ /// Clients are responsible for ensuring that registration doesn't occur
+ /// while another thread is attempting to access the registry. Typically
+ /// this is done by initializing all targets at program startup.
+ ///
+ /// @param T - The target being registered.
+ /// @param Fn - A function to construct an InstrPostProcess for the target.
+ static void RegisterInstrPostProcess(Target &T,
+ Target::InstrPostProcessCtorTy Fn) {
+ T.InstrPostProcessCtorFn = Fn;
+ }
+
/// @}
};
@@ -1294,4 +1370,4 @@ private:
} // end namespace llvm
-#endif // LLVM_SUPPORT_TARGETREGISTRY_H
+#endif // LLVM_MC_TARGETREGISTRY_H
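
A hedged sketch of how a backend would use the two new hooks; everything prefixed "MyTarget" is illustrative, the factories return base-class instances only to keep the sketch short, and only the registration and creation entry points come from the declarations above. In practice the registration call would live in the target's LLVMInitialize<Target>TargetMC(), and llvm-mca goes through Target::createCustomBehaviour()/createInstrPostProcess(), which fall back to nullptr when nothing is registered.

  #include "llvm/MC/TargetRegistry.h"
  #include "llvm/MCA/CustomBehaviour.h"

  using namespace llvm;

  // Illustrative factories; a real target would return its own subclasses.
  static mca::CustomBehaviour *
  createMyTargetCustomBehaviour(const MCSubtargetInfo &STI,
                                const mca::SourceMgr &SrcMgr,
                                const MCInstrInfo &MCII) {
    return new mca::CustomBehaviour(STI, SrcMgr, MCII);
  }

  static mca::InstrPostProcess *
  createMyTargetInstrPostProcess(const MCSubtargetInfo &STI,
                                 const MCInstrInfo &MCII) {
    return new mca::InstrPostProcess(STI, MCII);
  }

  void registerMyTargetMCA(Target &T) {
    TargetRegistry::RegisterCustomBehaviour(T, createMyTargetCustomBehaviour);
    TargetRegistry::RegisterInstrPostProcess(T, createMyTargetInstrPostProcess);
  }
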
diff --git a/llvm/include/llvm/MCA/CustomBehaviour.h b/llvm/include/llvm/MCA/CustomBehaviour.h
index 655a9c49c599..395b07cf722b 100644
--- a/llvm/include/llvm/MCA/CustomBehaviour.h
+++ b/llvm/include/llvm/MCA/CustomBehaviour.h
@@ -22,6 +22,7 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MCA/SourceMgr.h"
+#include "llvm/MCA/View.h"
namespace llvm {
namespace mca {
@@ -55,29 +56,53 @@ public:
class CustomBehaviour {
protected:
const MCSubtargetInfo &STI;
- const SourceMgr &SrcMgr;
+ const mca::SourceMgr &SrcMgr;
const MCInstrInfo &MCII;
public:
- CustomBehaviour(const MCSubtargetInfo &STI, const SourceMgr &SrcMgr,
+ CustomBehaviour(const MCSubtargetInfo &STI, const mca::SourceMgr &SrcMgr,
const MCInstrInfo &MCII)
: STI(STI), SrcMgr(SrcMgr), MCII(MCII) {}
virtual ~CustomBehaviour();
- // Before the llvm-mca pipeline dispatches an instruction, it first checks
- // for any register or resource dependencies / hazards. If it doesn't find
- // any, this method will be invoked to determine if there are any custom
- // hazards that the instruction needs to wait for.
- // The return value of this method is the number of cycles that the
- // instruction needs to wait for.
- // It's safe to underestimate the number of cycles to wait for since these
- // checks will be invoked again before the intruction gets dispatched.
- // However, it's not safe (accurate) to overestimate the number of cycles
- // to wait for since the instruction will wait for AT LEAST that number of
- // cycles before attempting to be dispatched again.
+ /// Before the llvm-mca pipeline dispatches an instruction, it first checks
+ /// for any register or resource dependencies / hazards. If it doesn't find
+ /// any, this method will be invoked to determine if there are any custom
+ /// hazards that the instruction needs to wait for.
+ /// The return value of this method is the number of cycles that the
+ /// instruction needs to wait for.
+ /// It's safe to underestimate the number of cycles to wait for since these
+ /// checks will be invoked again before the instruction gets dispatched.
+ /// However, it's not safe (accurate) to overestimate the number of cycles
+ /// to wait for since the instruction will wait for AT LEAST that number of
+ /// cycles before attempting to be dispatched again.
virtual unsigned checkCustomHazard(ArrayRef<InstRef> IssuedInst,
const InstRef &IR);
+
+ // Functions that target CBs can override to return a list of target-specific
+ // Views that need to live within /lib/Target/ so that they can benefit from
+ // the target CB or from backend functionality that is not already exposed
+ // through MC-layer classes. Keep in mind how these functions are used: they
+ // are called from within llvm-mca.cpp, and each returned unique_ptr<View> is
+ // passed into PipelinePrinter::addView(), which std::moves the View into its
+ // own vector of Views. So any CB that overrides these functions must not rely
+ // on the current address or reference of the View unique_ptrs. If the CB and
+ // a View do need to communicate with each other, consider giving the View a
+ // reference or pointer to the CB when the View is constructed; the View can
+ // then query the CB for information when it needs it.
+ /// Return a vector of Views that will be added before all other Views.
+ virtual std::vector<std::unique_ptr<View>>
+ getStartViews(llvm::MCInstPrinter &IP, llvm::ArrayRef<llvm::MCInst> Insts);
+ /// Return a vector of Views that will be added after the InstructionInfoView.
+ virtual std::vector<std::unique_ptr<View>>
+ getPostInstrInfoViews(llvm::MCInstPrinter &IP,
+ llvm::ArrayRef<llvm::MCInst> Insts);
+ /// Return a vector of Views that will be added after all other Views.
+ virtual std::vector<std::unique_ptr<View>>
+ getEndViews(llvm::MCInstPrinter &IP, llvm::ArrayRef<llvm::MCInst> Insts);
};
} // namespace mca
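
As a hedged illustration of the comments above, a target CustomBehaviour might look like the sketch below; the class name is hypothetical, only the base-class signatures come from this header, and the View hook returns an empty vector just to show the ownership convention.

  #include "llvm/MCA/CustomBehaviour.h"
  #include <memory>
  #include <vector>

  using namespace llvm;

  class MyTargetCustomBehaviour : public mca::CustomBehaviour {
  public:
    using mca::CustomBehaviour::CustomBehaviour; // (STI, SrcMgr, MCII)

    // Report no extra stall cycles; underestimating is safe because the check
    // runs again before the instruction is dispatched.
    unsigned checkCustomHazard(ArrayRef<mca::InstRef> IssuedInst,
                               const mca::InstRef &IR) override {
      return 0;
    }

    // Views returned here are std::moved into PipelinePrinter's own vector,
    // so the CB must not hold on to their addresses afterwards.
    std::vector<std::unique_ptr<mca::View>>
    getStartViews(MCInstPrinter &IP, ArrayRef<MCInst> Insts) override {
      return {};
    }
  };
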
diff --git a/llvm/include/llvm/MCA/Instruction.h b/llvm/include/llvm/MCA/Instruction.h
index 988cddcbe013..3eb32186d551 100644
--- a/llvm/include/llvm/MCA/Instruction.h
+++ b/llvm/include/llvm/MCA/Instruction.h
@@ -46,7 +46,7 @@ class MCAOperand {
kSFPImmediate, ///< Single-floating-point immediate operand.
kDFPImmediate, ///< Double-Floating-point immediate operand.
};
- MCAOperandType Kind = kInvalid;
+ MCAOperandType Kind;
union {
unsigned RegVal;
@@ -62,7 +62,7 @@ class MCAOperand {
unsigned Index;
public:
- MCAOperand() : FPImmVal(0) {}
+ MCAOperand() : Kind(kInvalid), FPImmVal(), Index() {}
bool isValid() const { return Kind != kInvalid; }
bool isReg() const { return Kind == kRegister; }
diff --git a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
index b7006e761647..42f386a13d85 100644
--- a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
+++ b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
@@ -21,6 +21,7 @@
namespace llvm {
namespace mca {
+class LSUnit;
class RegisterFile;
struct StallInfo {
@@ -29,6 +30,7 @@ struct StallInfo {
REGISTER_DEPS,
DISPATCH,
DELAY,
+ LOAD_STORE,
CUSTOM_STALL
};
@@ -54,6 +56,7 @@ class InOrderIssueStage final : public Stage {
RegisterFile &PRF;
ResourceManager RM;
CustomBehaviour &CB;
+ LSUnit &LSU;
/// Instructions that were issued, but not executed yet.
SmallVector<InstRef, 4> IssuedInst;
@@ -110,7 +113,7 @@ class InOrderIssueStage final : public Stage {
public:
InOrderIssueStage(const MCSubtargetInfo &STI, RegisterFile &PRF,
- CustomBehaviour &CB);
+ CustomBehaviour &CB, LSUnit &LSU);
unsigned getIssueWidth() const;
bool isAvailable(const InstRef &) const override;
diff --git a/llvm/tools/llvm-mca/Views/View.h b/llvm/include/llvm/MCA/View.h
index c604733d4ec9..ff8fc1ceb3f1 100644
--- a/llvm/tools/llvm-mca/Views/View.h
+++ b/llvm/include/llvm/MCA/View.h
@@ -12,8 +12,8 @@
///
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TOOLS_LLVM_MCA_VIEW_H
-#define LLVM_TOOLS_LLVM_MCA_VIEW_H
+#ifndef LLVM_MCA_VIEW_H
+#define LLVM_MCA_VIEW_H
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MCA/HWEventListener.h"
diff --git a/llvm/include/llvm/Object/ELF.h b/llvm/include/llvm/Object/ELF.h
index c5f966891bd0..37f23c435ae1 100644
--- a/llvm/include/llvm/Object/ELF.h
+++ b/llvm/include/llvm/Object/ELF.h
@@ -81,10 +81,6 @@ getElfArchType(StringRef Object) {
(uint8_t)Object[ELF::EI_DATA]);
}
-static inline Error createError(const Twine &Err) {
- return make_error<StringError>(Err, object_error::parse_failed);
-}
-
enum PPCInstrMasks : uint64_t {
PADDI_R12_NO_DISP = 0x0610000039800000,
ADDIS_R12_TO_R2_NO_DISP = 0x3D820000,
@@ -392,8 +388,7 @@ public:
Expected<ArrayRef<T>> getSectionContentsAsArray(const Elf_Shdr &Sec) const;
Expected<ArrayRef<uint8_t>> getSectionContents(const Elf_Shdr &Sec) const;
Expected<ArrayRef<uint8_t>> getSegmentContents(const Elf_Phdr &Phdr) const;
- Expected<std::vector<Elf_BBAddrMap>>
- decodeBBAddrMap(const Elf_Shdr &Sec) const;
+ Expected<std::vector<BBAddrMap>> decodeBBAddrMap(const Elf_Shdr &Sec) const;
};
using ELF32LEFile = ELFFile<ELF32LE>;
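
decodeBBAddrMap() now returns the plain BBAddrMap struct (see the ELFTypes.h hunk further down) instead of a per-ELFT type. A hedged caller sketch, assuming an ELFFile<ELF64LE> and the SHT_LLVM_BB_ADDR_MAP section header have been obtained elsewhere:

  #include "llvm/Object/ELF.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;
  using namespace llvm::object;

  // BBAddrMap no longer depends on ELFT, so the loop body is shared across
  // 32/64-bit and endianness variants.
  Error printBBAddrMapFunctions(const ELFFile<ELF64LE> &Obj,
                                const ELF64LE::Shdr &Sec, raw_ostream &OS) {
    Expected<std::vector<BBAddrMap>> MapsOrErr = Obj.decodeBBAddrMap(Sec);
    if (!MapsOrErr)
      return MapsOrErr.takeError();
    for (const BBAddrMap &Map : *MapsOrErr)
      OS << "function at address " << Map.Addr << "\n";
    return Error::success();
  }
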
diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h
index c87a09f86fae..716b94d92d03 100644
--- a/llvm/include/llvm/Object/ELFObjectFile.h
+++ b/llvm/include/llvm/Object/ELFObjectFile.h
@@ -96,6 +96,10 @@ public:
std::vector<std::pair<Optional<DataRefImpl>, uint64_t>>
getPltAddresses() const;
+
+ /// Returns a vector containing a symbol version for each dynamic symbol.
+ /// Returns an empty vector if version sections do not exist.
+ Expected<std::vector<VersionEntry>> readDynsymVersions() const;
};
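
A hedged sketch of the new hook; it assumes VersionEntry carries a Name string and an IsVerDef flag as defined in Object/ELF.h at this revision, and that the object file has been loaded elsewhere:

  #include "llvm/Object/ELFObjectFile.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;
  using namespace llvm::object;

  // Print one line per dynamic symbol; the vector is empty when the file has
  // no version sections.
  Error printDynsymVersions(const ELFObjectFileBase &Obj, raw_ostream &OS) {
    Expected<std::vector<VersionEntry>> VersOrErr = Obj.readDynsymVersions();
    if (!VersOrErr)
      return VersOrErr.takeError();
    for (const VersionEntry &Ver : *VersOrErr)
      OS << Ver.Name << (Ver.IsVerDef ? " (verdef)" : " (verneed)") << "\n";
    return Error::success();
  }
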
class ELFSectionRef : public SectionRef {
@@ -407,7 +411,8 @@ public:
const Elf_Shdr *getRelSection(DataRefImpl Rel) const {
auto RelSecOrErr = EF.getSection(Rel.d.a);
if (!RelSecOrErr)
- report_fatal_error(errorToErrorCode(RelSecOrErr.takeError()).message());
+ report_fatal_error(
+ Twine(errorToErrorCode(RelSecOrErr.takeError()).message()));
return *RelSecOrErr;
}
@@ -728,7 +733,8 @@ Expected<uint32_t> ELFObjectFile<ELFT>::getSymbolFlags(DataRefImpl Sym) const {
} else if (EF.getHeader().e_machine == ELF::EM_ARM) {
if (Expected<StringRef> NameOrErr = getSymbolName(Sym)) {
StringRef Name = *NameOrErr;
- if (Name.startswith("$d") || Name.startswith("$t") ||
+ // TODO Investigate why empty name symbols need to be marked.
+ if (Name.empty() || Name.startswith("$d") || Name.startswith("$t") ||
Name.startswith("$a"))
Result |= SymbolRef::SF_FormatSpecific;
} else {
@@ -966,7 +972,8 @@ ELFObjectFile<ELFT>::section_rel_end(DataRefImpl Sec) const {
// Error check sh_link here so that getRelocationSymbol can just use it.
auto SymSecOrErr = EF.getSection(RelSec->sh_link);
if (!SymSecOrErr)
- report_fatal_error(errorToErrorCode(SymSecOrErr.takeError()).message());
+ report_fatal_error(
+ Twine(errorToErrorCode(SymSecOrErr.takeError()).message()));
RelData.d.b += S->sh_size / S->sh_entsize;
return relocation_iterator(RelocationRef(RelData, this));
@@ -1055,7 +1062,7 @@ ELFObjectFile<ELFT>::getRel(DataRefImpl Rel) const {
assert(getRelSection(Rel)->sh_type == ELF::SHT_REL);
auto Ret = EF.template getEntry<Elf_Rel>(Rel.d.a, Rel.d.b);
if (!Ret)
- report_fatal_error(errorToErrorCode(Ret.takeError()).message());
+ report_fatal_error(Twine(errorToErrorCode(Ret.takeError()).message()));
return *Ret;
}
@@ -1065,7 +1072,7 @@ ELFObjectFile<ELFT>::getRela(DataRefImpl Rela) const {
assert(getRelSection(Rela)->sh_type == ELF::SHT_RELA);
auto Ret = EF.template getEntry<Elf_Rela>(Rela.d.a, Rela.d.b);
if (!Ret)
- report_fatal_error(errorToErrorCode(Ret.takeError()).message());
+ report_fatal_error(Twine(errorToErrorCode(Ret.takeError()).message()));
return *Ret;
}
diff --git a/llvm/include/llvm/Object/ELFTypes.h b/llvm/include/llvm/Object/ELFTypes.h
index 54ebd751d8d2..e59a63d93989 100644
--- a/llvm/include/llvm/Object/ELFTypes.h
+++ b/llvm/include/llvm/Object/ELFTypes.h
@@ -44,7 +44,6 @@ template <class ELFT> struct Elf_Nhdr_Impl;
template <class ELFT> class Elf_Note_Impl;
template <class ELFT> class Elf_Note_Iterator_Impl;
template <class ELFT> struct Elf_CGProfile_Impl;
-template <class ELFT> struct Elf_BBAddrMap_Impl;
template <endianness E, bool Is64> struct ELFType {
private:
@@ -76,7 +75,6 @@ public:
using Note = Elf_Note_Impl<ELFType<E, Is64>>;
using NoteIterator = Elf_Note_Iterator_Impl<ELFType<E, Is64>>;
using CGProfile = Elf_CGProfile_Impl<ELFType<E, Is64>>;
- using BBAddrMap = Elf_BBAddrMap_Impl<ELFType<E, Is64>>;
using DynRange = ArrayRef<Dyn>;
using ShdrRange = ArrayRef<Shdr>;
using SymRange = ArrayRef<Sym>;
@@ -131,7 +129,6 @@ using ELF64BE = ELFType<support::big, true>;
using Elf_Note = typename ELFT::Note; \
using Elf_Note_Iterator = typename ELFT::NoteIterator; \
using Elf_CGProfile = typename ELFT::CGProfile; \
- using Elf_BBAddrMap = typename ELFT::BBAddrMap; \
using Elf_Dyn_Range = typename ELFT::DynRange; \
using Elf_Shdr_Range = typename ELFT::ShdrRange; \
using Elf_Sym_Range = typename ELFT::SymRange; \
@@ -797,9 +794,8 @@ template <class ELFT> struct Elf_Mips_ABIFlags {
};
// Struct representing the BBAddrMap for one function.
-template <class ELFT> struct Elf_BBAddrMap_Impl {
- LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
- uintX_t Addr; // Function address
+struct BBAddrMap {
+ uint64_t Addr; // Function address
// Struct representing the BBAddrMap information for one basic block.
struct BBEntry {
uint32_t Offset; // Offset of basic block relative to function start.
diff --git a/llvm/include/llvm/Object/Error.h b/llvm/include/llvm/Object/Error.h
index 07744188444a..1fc1f6603a36 100644
--- a/llvm/include/llvm/Object/Error.h
+++ b/llvm/include/llvm/Object/Error.h
@@ -82,6 +82,10 @@ private:
/// error() function needs to called on the llvm::Error.
Error isNotObjectErrorInvalidFileType(llvm::Error Err);
+inline Error createError(const Twine &Err) {
+ return make_error<StringError>(Err, object_error::parse_failed);
+}
+
} // end namespace object.
} // end namespace llvm.
diff --git a/llvm/include/llvm/Object/MachO.h b/llvm/include/llvm/Object/MachO.h
index d2ad12e98deb..ca5d63e4074f 100644
--- a/llvm/include/llvm/Object/MachO.h
+++ b/llvm/include/llvm/Object/MachO.h
@@ -311,6 +311,9 @@ public:
bool isSectionBitcode(DataRefImpl Sec) const override;
bool isDebugSection(DataRefImpl Sec) const override;
+ /// Return the raw contents of an entire segment.
+ ArrayRef<uint8_t> getSegmentContents(StringRef SegmentName) const;
+
/// When dsymutil generates the companion file, it strips all unnecessary
/// sections (e.g. everything in the _TEXT segment) by omitting their body
/// and setting the offset in their corresponding load command to zero.
diff --git a/llvm/include/llvm/Object/Wasm.h b/llvm/include/llvm/Object/Wasm.h
index 2cea950fcf25..e4802c087b8b 100644
--- a/llvm/include/llvm/Object/Wasm.h
+++ b/llvm/include/llvm/Object/Wasm.h
@@ -9,7 +9,7 @@
// This file declares the WasmObjectFile class, which implements the ObjectFile
// interface for Wasm files.
//
-// See: https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md
+// See: https://github.com/WebAssembly/design/blob/main/BinaryEncoding.md
//
//===----------------------------------------------------------------------===//
@@ -37,15 +37,13 @@ public:
WasmSymbol(const wasm::WasmSymbolInfo &Info,
const wasm::WasmGlobalType *GlobalType,
const wasm::WasmTableType *TableType,
- const wasm::WasmTagType *TagType,
const wasm::WasmSignature *Signature)
: Info(Info), GlobalType(GlobalType), TableType(TableType),
- TagType(TagType), Signature(Signature) {}
+ Signature(Signature) {}
const wasm::WasmSymbolInfo &Info;
const wasm::WasmGlobalType *GlobalType;
const wasm::WasmTableType *TableType;
- const wasm::WasmTagType *TagType;
const wasm::WasmSignature *Signature;
bool isTypeFunction() const {
@@ -138,7 +136,6 @@ public:
return TargetFeatures;
}
ArrayRef<wasm::WasmSignature> types() const { return Signatures; }
- ArrayRef<uint32_t> functionTypes() const { return FunctionTypes; }
ArrayRef<wasm::WasmImport> imports() const { return Imports; }
ArrayRef<wasm::WasmTable> tables() const { return Tables; }
ArrayRef<wasm::WasmLimits> memories() const { return Memories; }
@@ -260,6 +257,7 @@ private:
// Custom section types
Error parseDylinkSection(ReadContext &Ctx);
+ Error parseDylink0Section(ReadContext &Ctx);
Error parseNameSection(ReadContext &Ctx);
Error parseLinkingSection(ReadContext &Ctx);
Error parseLinkingSectionSymtab(ReadContext &Ctx);
@@ -274,7 +272,6 @@ private:
wasm::WasmProducerInfo ProducerInfo;
std::vector<wasm::WasmFeatureEntry> TargetFeatures;
std::vector<wasm::WasmSignature> Signatures;
- std::vector<uint32_t> FunctionTypes;
std::vector<wasm::WasmTable> Tables;
std::vector<wasm::WasmLimits> Memories;
std::vector<wasm::WasmGlobal> Globals;
diff --git a/llvm/include/llvm/Object/XCOFFObjectFile.h b/llvm/include/llvm/Object/XCOFFObjectFile.h
index 7d024fbc3eae..94136afc45ea 100644
--- a/llvm/include/llvm/Object/XCOFFObjectFile.h
+++ b/llvm/include/llvm/Object/XCOFFObjectFile.h
@@ -51,6 +51,101 @@ struct XCOFFFileHeader64 {
support::ubig32_t NumberOfSymTableEntries;
};
+template <typename T> struct XCOFFAuxiliaryHeader {
+ static constexpr uint8_t AuxiHeaderFlagMask = 0xF0;
+ static constexpr uint8_t AuxiHeaderTDataAlignmentMask = 0x0F;
+
+public:
+ uint8_t getFlag() const {
+ return static_cast<const T *>(this)->FlagAndTDataAlignment &
+ AuxiHeaderFlagMask;
+ }
+ uint8_t getTDataAlignment() const {
+ return static_cast<const T *>(this)->FlagAndTDataAlignment &
+ AuxiHeaderTDataAlignmentMask;
+ }
+};
+
+struct XCOFFAuxiliaryHeader32 : XCOFFAuxiliaryHeader<XCOFFAuxiliaryHeader32> {
+ support::ubig16_t
+ AuxMagic; ///< If the value of the o_vstamp field is greater than 1, the
+ ///< o_mflags field is reserved for future use and it should
+ ///< contain 0. Otherwise, this field is not used.
+ support::ubig16_t
+ Version; ///< The valid values are 1 and 2. When the o_vstamp field is 2
+ ///< in an XCOFF32 file, the new interpretation of the n_type
+ ///< field in the symbol table entry is used.
+ support::ubig32_t TextSize;
+ support::ubig32_t InitDataSize;
+ support::ubig32_t BssDataSize;
+ support::ubig32_t EntryPointAddr;
+ support::ubig32_t TextStartAddr;
+ support::ubig32_t DataStartAddr;
+ support::ubig32_t TOCAnchorAddr;
+ support::ubig16_t SecNumOfEntryPoint;
+ support::ubig16_t SecNumOfText;
+ support::ubig16_t SecNumOfData;
+ support::ubig16_t SecNumOfTOC;
+ support::ubig16_t SecNumOfLoader;
+ support::ubig16_t SecNumOfBSS;
+ support::ubig16_t MaxAlignOfText;
+ support::ubig16_t MaxAlignOfData;
+ support::ubig16_t ModuleType;
+ uint8_t CpuFlag;
+ uint8_t CpuType;
+ support::ubig32_t MaxStackSize; ///< If the value is 0, the system default
+ ///< maximum stack size is used.
+ support::ubig32_t MaxDataSize; ///< If the value is 0, the system default
+ ///< maximum data size is used.
+ support::ubig32_t
+ ReservedForDebugger; ///< This field should contain 0. When a loaded
+ ///< program is being debugged, the memory image of
+ ///< this field may be modified by a debugger to
+ ///< insert a trap instruction.
+ uint8_t TextPageSize; ///< Specifies the size of pages for the exec text. The
+ ///< default value is 0 (system-selected page size).
+ uint8_t DataPageSize; ///< Specifies the size of pages for the exec data. The
+ ///< default value is 0 (system-selected page size).
+ uint8_t StackPageSize; ///< Specifies the size of pages for the stack. The
+ ///< default value is 0 (system-selected page size).
+ uint8_t FlagAndTDataAlignment;
+ support::ubig16_t SecNumOfTData;
+ support::ubig16_t SecNumOfTBSS;
+};
+
+struct XCOFFAuxiliaryHeader64 : XCOFFAuxiliaryHeader<XCOFFAuxiliaryHeader32> {
+ support::ubig16_t AuxMagic;
+ support::ubig16_t Version;
+ support::ubig32_t ReservedForDebugger;
+ support::ubig64_t TextStartAddr;
+ support::ubig64_t DataStartAddr;
+ support::ubig64_t TOCAnchorAddr;
+ support::ubig16_t SecNumOfEntryPoint;
+ support::ubig16_t SecNumOfText;
+ support::ubig16_t SecNumOfData;
+ support::ubig16_t SecNumOfTOC;
+ support::ubig16_t SecNumOfLoader;
+ support::ubig16_t SecNumOfBSS;
+ support::ubig16_t MaxAlignOfText;
+ support::ubig16_t MaxAlignOfData;
+ support::ubig16_t ModuleType;
+ uint8_t CpuFlag;
+ uint8_t CpuType;
+ uint8_t TextPageSize;
+ uint8_t DataPageSize;
+ uint8_t StackPageSize;
+ uint8_t FlagAndTDataAlignment;
+ support::ubig64_t TextSize;
+ support::ubig64_t InitDataSize;
+ support::ubig64_t BssDataSize;
+ support::ubig64_t EntryPointAddr;
+ support::ubig64_t MaxStackSize;
+ support::ubig64_t MaxDataSize;
+ support::ubig16_t SecNumOfTData;
+ support::ubig16_t SecNumOfTBSS;
+ support::ubig16_t XCOFF64Flag;
+};
+
template <typename T> struct XCOFFSectionHeader {
// Least significant 3 bits are reserved.
static constexpr unsigned SectionFlagsReservedMask = 0x7;
@@ -97,6 +192,31 @@ struct XCOFFSectionHeader64 : XCOFFSectionHeader<XCOFFSectionHeader64> {
char Padding[4];
};
+struct LoaderSectionHeader32 {
+ support::ubig32_t Version;
+ support::ubig32_t NumberOfSymTabEnt;
+ support::ubig32_t NumberOfRelTabEnt;
+ support::ubig32_t LengthOfImpidStrTbl;
+ support::ubig32_t NumberOfImpid;
+ support::big32_t OffsetToImpid;
+ support::ubig32_t LengthOfStrTbl;
+ support::big32_t OffsetToStrTbl;
+};
+
+struct LoaderSectionHeader64 {
+ support::ubig32_t Version;
+ support::ubig32_t NumberOfSymTabEnt;
+ support::ubig32_t NumberOfRelTabEnt;
+ support::ubig32_t LengthOfImpidStrTbl;
+ support::ubig32_t NumberOfImpid;
+ support::ubig32_t LengthOfStrTbl;
+ support::big64_t OffsetToImpid;
+ support::big64_t OffsetToStrTbl;
+ support::big64_t OffsetToSymTbl;
+ char Padding[16];
+ support::big32_t OffsetToRelEnt;
+};
+
struct XCOFFStringTable {
uint32_t Size;
const char *Data;
@@ -228,7 +348,7 @@ struct XCOFFSectAuxEntForStat {
uint8_t Pad[10];
}; // 32-bit XCOFF file only.
-struct XCOFFRelocation32 {
+template <typename AddressType> struct XCOFFRelocation {
// Masks for packing/unpacking the r_rsize field of relocations.
// The msb is used to indicate if the bits being relocated are signed or
@@ -244,7 +364,7 @@ struct XCOFFRelocation32 {
static constexpr uint8_t XR_BIASED_LENGTH_MASK = 0x3f;
public:
- support::ubig32_t VirtualAddress;
+ AddressType VirtualAddress;
support::ubig32_t SymbolIndex;
// Packed field, see XR_* masks for details of packing.
@@ -260,11 +380,18 @@ public:
uint8_t getRelocatedLength() const;
};
+extern template struct XCOFFRelocation<llvm::support::ubig32_t>;
+extern template struct XCOFFRelocation<llvm::support::ubig64_t>;
+
+struct XCOFFRelocation32 : XCOFFRelocation<llvm::support::ubig32_t> {};
+struct XCOFFRelocation64 : XCOFFRelocation<llvm::support::ubig64_t> {};
+
class XCOFFSymbolRef;
class XCOFFObjectFile : public ObjectFile {
private:
const void *FileHeader = nullptr;
+ const void *AuxiliaryHeader = nullptr;
const void *SectionHeaderTable = nullptr;
const void *SymbolTblPtr = nullptr;
@@ -275,6 +402,7 @@ private:
const XCOFFSectionHeader32 *sectionHeaderTable32() const;
const XCOFFSectionHeader64 *sectionHeaderTable64() const;
+ template <typename T> const T *sectionHeaderTable() const;
size_t getFileHeaderSize() const;
size_t getSectionHeaderSize() const;
@@ -283,6 +411,7 @@ private:
const XCOFFSectionHeader64 *toSection64(DataRefImpl Ref) const;
uintptr_t getSectionHeaderTableAddress() const;
uintptr_t getEndOfSymbolTableAddress() const;
+ Expected<uintptr_t> getLoaderSectionAddress() const;
// This returns a pointer to the start of the storage for the name field of
// the 32-bit or 64-bit SectionHeader struct. This string is *not* necessarily
@@ -322,6 +451,7 @@ public:
Expected<StringRef> getSymbolName(DataRefImpl Symb) const override;
Expected<uint64_t> getSymbolAddress(DataRefImpl Symb) const override;
uint64_t getSymbolValueImpl(DataRefImpl Symb) const override;
+ uint32_t getSymbolAlignment(DataRefImpl Symb) const override;
uint64_t getCommonSymbolSizeImpl(DataRefImpl Symb) const override;
Expected<SymbolRef::Type> getSymbolType(DataRefImpl Symb) const override;
Expected<section_iterator> getSymbolSection(DataRefImpl Symb) const override;
@@ -368,6 +498,9 @@ public:
// Below here is the non-inherited interface.
bool is64Bit() const;
+ const XCOFFAuxiliaryHeader32 *auxiliaryHeader32() const;
+ const XCOFFAuxiliaryHeader64 *auxiliaryHeader64() const;
+
const void *getPointerToSymbolTable() const { return SymbolTblPtr; }
Expected<StringRef> getSymbolSectionName(XCOFFSymbolRef Ref) const;
@@ -398,6 +531,11 @@ public:
uint32_t getNumberOfSymbolTableEntries() const;
uint32_t getSymbolIndex(uintptr_t SymEntPtr) const;
+ uint64_t getSymbolSize(DataRefImpl Symb) const;
+ uintptr_t getSymbolByIndex(uint32_t Idx) const {
+ return reinterpret_cast<uintptr_t>(SymbolTblPtr) +
+ XCOFF::SymbolTableEntrySize * Idx;
+ }
uintptr_t getSymbolEntryAddressByIndex(uint32_t SymbolTableIndex) const;
Expected<StringRef> getSymbolNameByIndex(uint32_t SymbolTableIndex) const;
@@ -415,11 +553,15 @@ public:
void checkSymbolEntryPointer(uintptr_t SymbolEntPtr) const;
// Relocation-related interfaces.
+ template <typename T>
Expected<uint32_t>
- getLogicalNumberOfRelocationEntries(const XCOFFSectionHeader32 &Sec) const;
+ getNumberOfRelocationEntries(const XCOFFSectionHeader<T> &Sec) const;
- Expected<ArrayRef<XCOFFRelocation32>>
- relocations(const XCOFFSectionHeader32 &) const;
+ template <typename Shdr, typename Reloc>
+ Expected<ArrayRef<Reloc>> relocations(const Shdr &Sec) const;
+
+ // Loader section related interfaces.
+ Expected<StringRef> getImportFileTable() const;
// This function returns string table entry.
Expected<StringRef> getStringTableEntry(uint32_t Offset) const;
@@ -572,6 +714,7 @@ class XCOFFTracebackTable {
Optional<uint8_t> ExtensionTable;
XCOFFTracebackTable(const uint8_t *Ptr, uint64_t &Size, Error &Err);
+
public:
/// Parse an XCOFF Traceback Table from \a Ptr with \a Size bytes.
/// Returns an XCOFFTracebackTable upon successful parsing, otherwise an
diff --git a/llvm/include/llvm/ObjectYAML/MachOYAML.h b/llvm/include/llvm/ObjectYAML/MachOYAML.h
index 5d1d3ee23594..ee89f4eac61f 100644
--- a/llvm/include/llvm/ObjectYAML/MachOYAML.h
+++ b/llvm/include/llvm/ObjectYAML/MachOYAML.h
@@ -131,6 +131,7 @@ struct Object {
std::vector<LoadCommand> LoadCommands;
std::vector<Section> Sections;
LinkEditData LinkEdit;
+ Optional<llvm::yaml::BinaryRef> RawLinkEditSegment;
DWARFYAML::Data DWARF;
};
diff --git a/llvm/include/llvm/ObjectYAML/WasmYAML.h b/llvm/include/llvm/ObjectYAML/WasmYAML.h
index 661e06fba8bd..e3a1ba0d58a6 100644
--- a/llvm/include/llvm/ObjectYAML/WasmYAML.h
+++ b/llvm/include/llvm/ObjectYAML/WasmYAML.h
@@ -77,12 +77,6 @@ struct Global {
wasm::WasmInitExpr InitExpr;
};
-struct Tag {
- uint32_t Index;
- uint32_t Attribute;
- uint32_t SigIndex;
-};
-
struct Import {
StringRef Module;
StringRef Field;
@@ -92,7 +86,7 @@ struct Import {
Global GlobalImport;
Table TableImport;
Limits Memory;
- Tag TagImport;
+ uint32_t TagIndex;
};
};
@@ -199,12 +193,23 @@ struct CustomSection : Section {
yaml::BinaryRef Payload;
};
+struct DylinkImportInfo {
+ StringRef Module;
+ StringRef Field;
+ SymbolFlags Flags;
+};
+
+struct DylinkExportInfo {
+ StringRef Name;
+ SymbolFlags Flags;
+};
+
struct DylinkSection : CustomSection {
- DylinkSection() : CustomSection("dylink") {}
+ DylinkSection() : CustomSection("dylink.0") {}
static bool classof(const Section *S) {
auto C = dyn_cast<CustomSection>(S);
- return C && C->Name == "dylink";
+ return C && C->Name == "dylink.0";
}
uint32_t MemorySize;
@@ -212,6 +217,8 @@ struct DylinkSection : CustomSection {
uint32_t TableSize;
uint32_t TableAlignment;
std::vector<StringRef> Needed;
+ std::vector<DylinkImportInfo> ImportInfo;
+ std::vector<DylinkExportInfo> ExportInfo;
};
struct NameSection : CustomSection {
@@ -323,7 +330,7 @@ struct TagSection : Section {
return S->Type == wasm::WASM_SEC_TAG;
}
- std::vector<Tag> Tags;
+ std::vector<uint32_t> TagTypes;
};
struct GlobalSection : Section {
@@ -425,7 +432,8 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::SymbolInfo)
LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::InitFunction)
LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::ComdatEntry)
LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::Comdat)
-LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::Tag)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::DylinkImportInfo)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::DylinkExportInfo)
namespace llvm {
namespace yaml {
@@ -570,8 +578,12 @@ template <> struct ScalarEnumerationTraits<WasmYAML::RelocType> {
static void enumeration(IO &IO, WasmYAML::RelocType &Kind);
};
-template <> struct MappingTraits<WasmYAML::Tag> {
- static void mapping(IO &IO, WasmYAML::Tag &Tag);
+template <> struct MappingTraits<WasmYAML::DylinkImportInfo> {
+ static void mapping(IO &IO, WasmYAML::DylinkImportInfo &Info);
+};
+
+template <> struct MappingTraits<WasmYAML::DylinkExportInfo> {
+ static void mapping(IO &IO, WasmYAML::DylinkExportInfo &Info);
};
} // end namespace yaml
diff --git a/llvm/include/llvm/ObjectYAML/XCOFFYAML.h b/llvm/include/llvm/ObjectYAML/XCOFFYAML.h
index 2630175642c4..aa1bc396f134 100644
--- a/llvm/include/llvm/ObjectYAML/XCOFFYAML.h
+++ b/llvm/include/llvm/ObjectYAML/XCOFFYAML.h
@@ -24,11 +24,43 @@ struct FileHeader {
uint16_t NumberOfSections;
int32_t TimeStamp;
llvm::yaml::Hex64 SymbolTableOffset;
- uint32_t NumberOfSymTableEntries;
+ int32_t NumberOfSymTableEntries;
uint16_t AuxHeaderSize;
llvm::yaml::Hex16 Flags;
};
+struct AuxiliaryHeader {
+ Optional<llvm::yaml::Hex16> Magic;
+ Optional<llvm::yaml::Hex16> Version;
+ Optional<llvm::yaml::Hex64> TextStartAddr;
+ Optional<llvm::yaml::Hex64> DataStartAddr;
+ Optional<llvm::yaml::Hex64> TOCAnchorAddr;
+ Optional<uint16_t> SecNumOfEntryPoint;
+ Optional<uint16_t> SecNumOfText;
+ Optional<uint16_t> SecNumOfData;
+ Optional<uint16_t> SecNumOfTOC;
+ Optional<uint16_t> SecNumOfLoader;
+ Optional<uint16_t> SecNumOfBSS;
+ Optional<llvm::yaml::Hex16> MaxAlignOfText;
+ Optional<llvm::yaml::Hex16> MaxAlignOfData;
+ Optional<llvm::yaml::Hex16> ModuleType;
+ Optional<llvm::yaml::Hex8> CpuFlag;
+ Optional<llvm::yaml::Hex8> CpuType;
+ Optional<llvm::yaml::Hex8> TextPageSize;
+ Optional<llvm::yaml::Hex8> DataPageSize;
+ Optional<llvm::yaml::Hex8> StackPageSize;
+ Optional<llvm::yaml::Hex8> FlagAndTDataAlignment;
+ Optional<llvm::yaml::Hex64> TextSize;
+ Optional<llvm::yaml::Hex64> InitDataSize;
+ Optional<llvm::yaml::Hex64> BssDataSize;
+ Optional<llvm::yaml::Hex64> EntryPointAddr;
+ Optional<llvm::yaml::Hex64> MaxStackSize;
+ Optional<llvm::yaml::Hex64> MaxDataSize;
+ Optional<uint16_t> SecNumOfTData;
+ Optional<uint16_t> SecNumOfTBSS;
+ Optional<llvm::yaml::Hex16> Flag;
+};
+
struct Relocation {
llvm::yaml::Hex64 VirtualAddress;
llvm::yaml::Hex64 SymbolIndex;
@@ -53,16 +85,27 @@ struct Section {
struct Symbol {
StringRef SymbolName;
llvm::yaml::Hex64 Value; // Symbol value; storage class-dependent.
- StringRef SectionName;
+ Optional<StringRef> SectionName;
+ Optional<uint16_t> SectionIndex;
llvm::yaml::Hex16 Type;
XCOFF::StorageClass StorageClass;
uint8_t NumberOfAuxEntries;
};
+struct StringTable {
+ Optional<uint32_t> ContentSize; // The total size of the string table.
+ Optional<uint32_t> Length; // The value of the length field for the first
+ // 4 bytes of the table.
+ Optional<std::vector<StringRef>> Strings;
+ Optional<yaml::BinaryRef> RawContent;
+};
+
struct Object {
FileHeader Header;
+ Optional<AuxiliaryHeader> AuxHeader;
std::vector<Section> Sections;
std::vector<Symbol> Symbols;
+ StringTable StrTbl;
Object();
};
} // namespace XCOFFYAML
@@ -87,6 +130,9 @@ template <> struct MappingTraits<XCOFFYAML::FileHeader> {
static void mapping(IO &IO, XCOFFYAML::FileHeader &H);
};
+template <> struct MappingTraits<XCOFFYAML::AuxiliaryHeader> {
+ static void mapping(IO &IO, XCOFFYAML::AuxiliaryHeader &AuxHdr);
+};
template <> struct MappingTraits<XCOFFYAML::Symbol> {
static void mapping(IO &IO, XCOFFYAML::Symbol &S);
@@ -100,6 +146,10 @@ template <> struct MappingTraits<XCOFFYAML::Section> {
static void mapping(IO &IO, XCOFFYAML::Section &Sec);
};
+template <> struct MappingTraits<XCOFFYAML::StringTable> {
+ static void mapping(IO &IO, XCOFFYAML::StringTable &Str);
+};
+
template <> struct MappingTraits<XCOFFYAML::Object> {
static void mapping(IO &IO, XCOFFYAML::Object &Obj);
};
diff --git a/llvm/include/llvm/Option/Arg.h b/llvm/include/llvm/Option/Arg.h
index 22e2bcf06a6e..4be254ccdab4 100644
--- a/llvm/include/llvm/Option/Arg.h
+++ b/llvm/include/llvm/Option/Arg.h
@@ -118,10 +118,7 @@ public:
const SmallVectorImpl<const char *> &getValues() const { return Values; }
bool containsValue(StringRef Value) const {
- for (unsigned i = 0, e = getNumValues(); i != e; ++i)
- if (Values[i] == Value)
- return true;
- return false;
+ return llvm::is_contained(Values, Value);
}
/// Append the argument onto the given array as strings.
diff --git a/llvm/include/llvm/Option/OptParser.td b/llvm/include/llvm/Option/OptParser.td
index 96014b505d0f..9c73f478db5e 100644
--- a/llvm/include/llvm/Option/OptParser.td
+++ b/llvm/include/llvm/Option/OptParser.td
@@ -214,7 +214,7 @@ class MarshallingInfoBitfieldFlag<KeyPathAndMacro kpm, code value>
}
// Implementation detail of BoolOption.
-class MarshallingInfoBooleanFlag<KeyPathAndMacro kpm, code defaultvalue, code value, code name,
+class MarshallingInfoBooleanFlag<KeyPathAndMacro kpm, code defaultvalue, code value,
code other_value, code other_name>
: MarshallingInfoFlag<kpm, defaultvalue> {
code Normalizer = "makeBooleanOptionNormalizer("#value#", "#other_value#", OPT_"#other_name#")";
diff --git a/llvm/include/llvm/Option/OptTable.h b/llvm/include/llvm/Option/OptTable.h
index ca2013ee6f04..07d9870f71b3 100644
--- a/llvm/include/llvm/Option/OptTable.h
+++ b/llvm/include/llvm/Option/OptTable.h
@@ -64,8 +64,8 @@ private:
bool GroupedShortOptions = false;
const char *EnvVar = nullptr;
- unsigned TheInputOptionID = 0;
- unsigned TheUnknownOptionID = 0;
+ unsigned InputOptionID = 0;
+ unsigned UnknownOptionID = 0;
/// The index of the first option which can be parsed (i.e., is not a
/// special option like 'input' or 'unknown', and is not an option group).
@@ -83,7 +83,8 @@ private:
return OptionInfos[id - 1];
}
- Arg *parseOneArgGrouped(InputArgList &Args, unsigned &Index) const;
+ std::unique_ptr<Arg> parseOneArgGrouped(InputArgList &Args,
+ unsigned &Index) const;
protected:
OptTable(ArrayRef<Info> OptionInfos, bool IgnoreCase = false);
@@ -199,9 +200,9 @@ public:
/// \return The parsed argument, or 0 if the argument is missing values
/// (in which case Index still points at the conceptual next argument string
/// to parse).
- Arg *ParseOneArg(const ArgList &Args, unsigned &Index,
- unsigned FlagsToInclude = 0,
- unsigned FlagsToExclude = 0) const;
+ std::unique_ptr<Arg> ParseOneArg(const ArgList &Args, unsigned &Index,
+ unsigned FlagsToInclude = 0,
+ unsigned FlagsToExclude = 0) const;
/// Parse a list of arguments into an InputArgList.
///
diff --git a/llvm/include/llvm/Option/Option.h b/llvm/include/llvm/Option/Option.h
index 196cf656355d..106f6863fca1 100644
--- a/llvm/include/llvm/Option/Option.h
+++ b/llvm/include/llvm/Option/Option.h
@@ -205,9 +205,9 @@ public:
/// always be false.
bool matches(OptSpecifier ID) const;
- /// accept - Potentially accept the current argument, returning a
- /// new Arg instance, or 0 if the option does not accept this
- /// argument (or the argument is missing values).
+ /// Potentially accept the current argument, returning a new Arg instance,
+ /// or 0 if the option does not accept this argument (or the argument is
+ /// missing values).
///
/// If the option accepts the current argument, accept() sets
/// Index to the position where argument parsing should resume
@@ -217,12 +217,12 @@ public:
/// underlying storage to represent a Joined argument.
/// \p GroupedShortOption If true, we are handling the fallback case of
/// parsing a prefix of the current argument as a short option.
- Arg *accept(const ArgList &Args, StringRef CurArg, bool GroupedShortOption,
- unsigned &Index) const;
+ std::unique_ptr<Arg> accept(const ArgList &Args, StringRef CurArg,
+ bool GroupedShortOption, unsigned &Index) const;
private:
- Arg *acceptInternal(const ArgList &Args, StringRef CurArg,
- unsigned &Index) const;
+ std::unique_ptr<Arg> acceptInternal(const ArgList &Args, StringRef CurArg,
+ unsigned &Index) const;
public:
void print(raw_ostream &O) const;
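
With ParseOneArg() and accept() now returning std::unique_ptr<Arg>, callers own each parsed argument explicitly. A hedged caller-side sketch (the function is illustrative; an OptTable and an argument list built elsewhere are assumed):

  #include "llvm/Option/Arg.h"
  #include "llvm/Option/ArgList.h"
  #include "llvm/Option/OptTable.h"
  #include <memory>

  using namespace llvm::opt;

  // Walk the raw argument strings; each parsed Arg is owned by the returned
  // unique_ptr and is released automatically at the end of the iteration.
  void walkArgs(const OptTable &Opts, const ArgList &Args) {
    unsigned Index = 0;
    while (Index < Args.getNumInputArgStrings()) {
      std::unique_ptr<Arg> A = Opts.ParseOneArg(Args, Index);
      if (!A)
        break; // missing values; Index still points at the next argument
    }
  }
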
diff --git a/llvm/include/llvm/Passes/OptimizationLevel.h b/llvm/include/llvm/Passes/OptimizationLevel.h
new file mode 100644
index 000000000000..d2c3fde4935f
--- /dev/null
+++ b/llvm/include/llvm/Passes/OptimizationLevel.h
@@ -0,0 +1,127 @@
+//===-------- LLVM-provided High-Level Optimization levels -*- C++ -*------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This header enumerates the LLVM-provided high-level optimization levels.
+/// Each level has a specific goal and rationale.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_PASSES_OPTIMIZATIONLEVEL_H
+#define LLVM_PASSES_OPTIMIZATIONLEVEL_H
+
+#include <assert.h>
+
+namespace llvm {
+
+class OptimizationLevel final {
+ unsigned SpeedLevel = 2;
+ unsigned SizeLevel = 0;
+ OptimizationLevel(unsigned SpeedLevel, unsigned SizeLevel)
+ : SpeedLevel(SpeedLevel), SizeLevel(SizeLevel) {
+ // Check that only valid combinations are passed.
+ assert(SpeedLevel <= 3 &&
+ "Optimization level for speed should be 0, 1, 2, or 3");
+ assert(SizeLevel <= 2 &&
+ "Optimization level for size should be 0, 1, or 2");
+ assert((SizeLevel == 0 || SpeedLevel == 2) &&
+ "Optimize for size should be encoded with speedup level == 2");
+ }
+
+public:
+ OptimizationLevel() = default;
+ /// Disable as many optimizations as possible. This doesn't completely
+ /// disable the optimizer in all cases, for example always_inline functions
+ /// can be required to be inlined for correctness.
+ static const OptimizationLevel O0;
+
+ /// Optimize quickly without destroying debuggability.
+ ///
+ /// This level is tuned to produce a result from the optimizer as quickly
+ /// as possible and to avoid destroying debuggability. This tends to result
+ /// in a very good development mode where the compiled code will be
+ /// immediately executed as part of testing. As a consequence, where
+ /// possible, we would like to produce efficient-to-execute code, but not
+ /// if it significantly slows down compilation or would prevent even basic
+ /// debugging of the resulting binary.
+ ///
+ /// As an example, complex loop transformations such as versioning,
+ /// vectorization, or fusion don't make sense here due to the degree to
+ /// which the executed code differs from the source code, and the compile
+ /// time cost.
+ static const OptimizationLevel O1;
+ /// Optimize for fast execution as much as possible without triggering
+ /// significant incremental compile time or code size growth.
+ ///
+ /// The key idea is that optimizations at this level should "pay for
+ /// themselves". So if an optimization increases compile time by 5% or
+ /// increases code size by 5% for a particular benchmark, that benchmark
+ /// should also be one which sees a 5% runtime improvement. If the compile
+ /// time or code size penalties happen on average across a diverse range of
+ /// LLVM users' benchmarks, then the improvements should as well.
+ ///
+ /// And no matter what, the compile time needs to not grow superlinearly
+ /// with the size of input to LLVM so that users can control the runtime of
+ /// the optimizer in this mode.
+ ///
+ /// This is expected to be a good default optimization level for the vast
+ /// majority of users.
+ static const OptimizationLevel O2;
+ /// Optimize for fast execution as much as possible.
+ ///
+ /// This mode is significantly more aggressive in trading off compile time
+ /// and code size to get execution time improvements. The core idea is that
+ /// this mode should include any optimization that helps execution time on
+ /// balance across a diverse collection of benchmarks, even if it increases
+ /// code size or compile time for some benchmarks without corresponding
+ /// improvements to execution time.
+ ///
+ /// Despite being willing to trade more compile time off to get improved
+ /// execution time, this mode still tries to avoid superlinear growth in
+ /// order to make even significantly slower compile times at least scale
+ /// reasonably. This does not preclude very substantial constant factor
+ /// costs though.
+ static const OptimizationLevel O3;
+ /// Similar to \c O2 but tries to optimize for small code size instead of
+ /// fast execution without triggering significant incremental execution
+ /// time slowdowns.
+ ///
+ /// The logic here is exactly the same as \c O2, but with code size and
+ /// execution time metrics swapped.
+ ///
+ /// A consequence of the different core goal is that this should in general
+ /// produce substantially smaller executables that still run in
+ /// a reasonable amount of time.
+ static const OptimizationLevel Os;
+ /// A very specialized mode that will optimize for code size at any and all
+ /// costs.
+ ///
+ /// This is useful primarily when there are absolute size limitations and
+ /// any effort taken to reduce the size is worth it regardless of the
+ /// execution time impact. You should expect this level to produce rather
+ /// slow, but very small, code.
+ static const OptimizationLevel Oz;
+
+ bool isOptimizingForSpeed() const { return SizeLevel == 0 && SpeedLevel > 0; }
+
+ bool isOptimizingForSize() const { return SizeLevel > 0; }
+
+ bool operator==(const OptimizationLevel &Other) const {
+ return SizeLevel == Other.SizeLevel && SpeedLevel == Other.SpeedLevel;
+ }
+ bool operator!=(const OptimizationLevel &Other) const {
+ return SizeLevel != Other.SizeLevel || SpeedLevel != Other.SpeedLevel;
+ }
+
+ unsigned getSpeedupLevel() const { return SpeedLevel; }
+
+ unsigned getSizeLevel() const { return SizeLevel; }
+};
+} // namespace llvm
+
+#endif
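
A short usage sketch of the new header: the predefined levels and query methods are exactly those declared above, and the helper itself is only illustrative.

  #include "llvm/Passes/OptimizationLevel.h"

  using llvm::OptimizationLevel;

  // Map an optimization level onto a human-readable description.
  static const char *describe(OptimizationLevel Level) {
    if (Level == OptimizationLevel::O0)
      return "optimizations mostly disabled";
    if (Level.isOptimizingForSize())
      return Level.getSizeLevel() == 2 ? "minimum size (Oz)" : "small size (Os)";
    return Level.getSpeedupLevel() == 3 ? "maximum speed (O3)"
                                        : "balanced speed (O1/O2)";
  }
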
diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index fae3e2cd2e0b..7c7883e98183 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -18,9 +18,12 @@
#include "llvm/ADT/Optional.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/Passes/OptimizationLevel.h"
#include "llvm/Support/Error.h"
+#include "llvm/Support/PGOOptions.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO/Inliner.h"
+#include "llvm/Transforms/IPO/ModuleInliner.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include <vector>
@@ -31,57 +34,6 @@ class AAManager;
class TargetMachine;
class ModuleSummaryIndex;
-/// A struct capturing PGO tunables.
-struct PGOOptions {
- enum PGOAction { NoAction, IRInstr, IRUse, SampleUse };
- enum CSPGOAction { NoCSAction, CSIRInstr, CSIRUse };
- PGOOptions(std::string ProfileFile = "", std::string CSProfileGenFile = "",
- std::string ProfileRemappingFile = "", PGOAction Action = NoAction,
- CSPGOAction CSAction = NoCSAction,
- bool DebugInfoForProfiling = false,
- bool PseudoProbeForProfiling = false)
- : ProfileFile(ProfileFile), CSProfileGenFile(CSProfileGenFile),
- ProfileRemappingFile(ProfileRemappingFile), Action(Action),
- CSAction(CSAction), DebugInfoForProfiling(DebugInfoForProfiling ||
- (Action == SampleUse &&
- !PseudoProbeForProfiling)),
- PseudoProbeForProfiling(PseudoProbeForProfiling) {
- // Note, we do allow ProfileFile.empty() for Action=IRUse LTO can
- // callback with IRUse action without ProfileFile.
-
- // If there is a CSAction, PGOAction cannot be IRInstr or SampleUse.
- assert(this->CSAction == NoCSAction ||
- (this->Action != IRInstr && this->Action != SampleUse));
-
- // For CSIRInstr, CSProfileGenFile also needs to be nonempty.
- assert(this->CSAction != CSIRInstr || !this->CSProfileGenFile.empty());
-
- // If CSAction is CSIRUse, PGOAction needs to be IRUse as they share
- // a profile.
- assert(this->CSAction != CSIRUse || this->Action == IRUse);
-
- // If neither Action nor CSAction, DebugInfoForProfiling or
- // PseudoProbeForProfiling needs to be true.
- assert(this->Action != NoAction || this->CSAction != NoCSAction ||
- this->DebugInfoForProfiling || this->PseudoProbeForProfiling);
-
- // Pseudo probe emission does not work with -fdebug-info-for-profiling since
- // they both use the discriminator field of debug lines but for different
- // purposes.
- if (this->DebugInfoForProfiling && this->PseudoProbeForProfiling) {
- report_fatal_error(
- "Pseudo probes cannot be used with -debug-info-for-profiling", false);
- }
- }
- std::string ProfileFile;
- std::string CSProfileGenFile;
- std::string ProfileRemappingFile;
- PGOAction Action;
- CSPGOAction CSAction;
- bool DebugInfoForProfiling;
- bool PseudoProbeForProfiling;
-};
-
/// Tunable parameters for passes in the default pipelines.
class PipelineTuningOptions {
public:
@@ -122,6 +74,15 @@ public:
/// Tuning option to enable/disable function merging. Its default value is
/// false.
bool MergeFunctions;
+
+ // Experimental option to eagerly invalidate more analyses. This has the
+ // potential to decrease max memory usage in exchange for more compile time.
+ // This may affect codegen due to either passes using analyses only when
+ // cached, or invalidating and recalculating an analysis that was
+ // stale/imprecise but still valid. Currently this invalidates all function
+ // analyses after various module->function or cgscc->function adaptors in the
+ // default pipelines.
+ bool EagerlyInvalidateAnalyses;
};
/// This class provides access to building LLVM's passes.
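
A hedged sketch of enabling the new tuning knob when constructing a PassBuilder; everything other than the field itself is ordinary new-pass-manager setup, and the function is illustrative only.

  #include "llvm/Passes/PassBuilder.h"

  using namespace llvm;

  // Construct a PassBuilder that eagerly invalidates function analyses,
  // trading extra compile time for lower peak memory usage.
  static void makeEagerPassBuilder() {
    PipelineTuningOptions PTO;
    PTO.EagerlyInvalidateAnalyses = true;
    PassBuilder PB(/*TM=*/nullptr, PTO);
    (void)PB;
  }
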
@@ -150,116 +111,6 @@ public:
std::vector<PipelineElement> InnerPipeline;
};
- /// LLVM-provided high-level optimization levels.
- ///
- /// This enumerates the LLVM-provided high-level optimization levels. Each
- /// level has a specific goal and rationale.
- class OptimizationLevel final {
- unsigned SpeedLevel = 2;
- unsigned SizeLevel = 0;
- OptimizationLevel(unsigned SpeedLevel, unsigned SizeLevel)
- : SpeedLevel(SpeedLevel), SizeLevel(SizeLevel) {
- // Check that only valid combinations are passed.
- assert(SpeedLevel <= 3 &&
- "Optimization level for speed should be 0, 1, 2, or 3");
- assert(SizeLevel <= 2 &&
- "Optimization level for size should be 0, 1, or 2");
- assert((SizeLevel == 0 || SpeedLevel == 2) &&
- "Optimize for size should be encoded with speedup level == 2");
- }
-
- public:
- OptimizationLevel() = default;
- /// Disable as many optimizations as possible. This doesn't completely
- /// disable the optimizer in all cases, for example always_inline functions
- /// can be required to be inlined for correctness.
- static const OptimizationLevel O0;
-
- /// Optimize quickly without destroying debuggability.
- ///
- /// This level is tuned to produce a result from the optimizer as quickly
- /// as possible and to avoid destroying debuggability. This tends to result
- /// in a very good development mode where the compiled code will be
- /// immediately executed as part of testing. As a consequence, where
- /// possible, we would like to produce efficient-to-execute code, but not
- /// if it significantly slows down compilation or would prevent even basic
- /// debugging of the resulting binary.
- ///
- /// As an example, complex loop transformations such as versioning,
- /// vectorization, or fusion don't make sense here due to the degree to
- /// which the executed code differs from the source code, and the compile
- /// time cost.
- static const OptimizationLevel O1;
- /// Optimize for fast execution as much as possible without triggering
- /// significant incremental compile time or code size growth.
- ///
- /// The key idea is that optimizations at this level should "pay for
- /// themselves". So if an optimization increases compile time by 5% or
- /// increases code size by 5% for a particular benchmark, that benchmark
- /// should also be one which sees a 5% runtime improvement. If the compile
- /// time or code size penalties happen on average across a diverse range of
- /// LLVM users' benchmarks, then the improvements should as well.
- ///
- /// And no matter what, the compile time needs to not grow superlinearly
- /// with the size of input to LLVM so that users can control the runtime of
- /// the optimizer in this mode.
- ///
- /// This is expected to be a good default optimization level for the vast
- /// majority of users.
- static const OptimizationLevel O2;
- /// Optimize for fast execution as much as possible.
- ///
- /// This mode is significantly more aggressive in trading off compile time
- /// and code size to get execution time improvements. The core idea is that
- /// this mode should include any optimization that helps execution time on
- /// balance across a diverse collection of benchmarks, even if it increases
- /// code size or compile time for some benchmarks without corresponding
- /// improvements to execution time.
- ///
- /// Despite being willing to trade more compile time off to get improved
- /// execution time, this mode still tries to avoid superlinear growth in
- /// order to make even significantly slower compile times at least scale
- /// reasonably. This does not preclude very substantial constant factor
- /// costs though.
- static const OptimizationLevel O3;
- /// Similar to \c O2 but tries to optimize for small code size instead of
- /// fast execution without triggering significant incremental execution
- /// time slowdowns.
- ///
- /// The logic here is exactly the same as \c O2, but with code size and
- /// execution time metrics swapped.
- ///
- /// A consequence of the different core goal is that this should in general
- /// produce substantially smaller executables that still run in
- /// a reasonable amount of time.
- static const OptimizationLevel Os;
- /// A very specialized mode that will optimize for code size at any and all
- /// costs.
- ///
- /// This is useful primarily when there are absolute size limitations and
- /// any effort taken to reduce the size is worth it regardless of the
- /// execution time impact. You should expect this level to produce rather
- /// slow, but very small, code.
- static const OptimizationLevel Oz;
-
- bool isOptimizingForSpeed() const {
- return SizeLevel == 0 && SpeedLevel > 0;
- }
-
- bool isOptimizingForSize() const { return SizeLevel > 0; }
-
- bool operator==(const OptimizationLevel &Other) const {
- return SizeLevel == Other.SizeLevel && SpeedLevel == Other.SpeedLevel;
- }
- bool operator!=(const OptimizationLevel &Other) const {
- return SizeLevel != Other.SizeLevel || SpeedLevel != Other.SpeedLevel;
- }
-
- unsigned getSpeedupLevel() const { return SpeedLevel; }
-
- unsigned getSizeLevel() const { return SizeLevel; }
- };
-
explicit PassBuilder(TargetMachine *TM = nullptr,
PipelineTuningOptions PTO = PipelineTuningOptions(),
Optional<PGOOptions> PGOOpt = None,
@@ -346,6 +197,11 @@ public:
ModuleInlinerWrapperPass buildInlinerPipeline(OptimizationLevel Level,
ThinOrFullLTOPhase Phase);
+ /// Construct the module pipeline that performs inlining with the
+ /// module inliner pass.
+ ModuleInlinerPass buildModuleInlinerPipeline(OptimizationLevel Level,
+ ThinOrFullLTOPhase Phase);
+
/// Construct the core LLVM module optimization pipeline.
///
/// This pipeline focuses on optimizing the execution speed of the IR. It
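
With OptimizationLevel split into its own header and PGOOptions moved to llvm/Support/PGOOptions.h, a typical pipeline setup stays the same apart from the spelling of the level type. A minimal sketch using the usual new-pass-manager registration boilerplate (nothing below beyond the declarations above is specific to this patch):

  #include "llvm/IR/Module.h"
  #include "llvm/Passes/PassBuilder.h"

  using namespace llvm;

  // Build and run the default O2 pipeline over a module.
  static void runDefaultO2(Module &M) {
    LoopAnalysisManager LAM;
    FunctionAnalysisManager FAM;
    CGSCCAnalysisManager CGAM;
    ModuleAnalysisManager MAM;

    PassBuilder PB;
    PB.registerModuleAnalyses(MAM);
    PB.registerCGSCCAnalyses(CGAM);
    PB.registerFunctionAnalyses(FAM);
    PB.registerLoopAnalyses(LAM);
    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

    ModulePassManager MPM =
        PB.buildPerModuleDefaultPipeline(OptimizationLevel::O2);
    MPM.run(M, MAM);
  }
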
diff --git a/llvm/include/llvm/Passes/StandardInstrumentations.h b/llvm/include/llvm/Passes/StandardInstrumentations.h
index 2f573585e766..6cab4ce7d138 100644
--- a/llvm/include/llvm/Passes/StandardInstrumentations.h
+++ b/llvm/include/llvm/Passes/StandardInstrumentations.h
@@ -215,8 +215,6 @@ protected:
virtual void handleFiltered(StringRef PassID, std::string &Name) = 0;
// Called when an ignored pass is encountered.
virtual void handleIgnored(StringRef PassID, std::string &Name) = 0;
- // Called to compare the before and after representations of the IR.
- virtual bool same(const IRUnitT &Before, const IRUnitT &After) = 0;
// Stack of IRs before passes.
std::vector<IRUnitT> BeforeStack;
@@ -269,50 +267,47 @@ protected:
void handleAfter(StringRef PassID, std::string &Name,
const std::string &Before, const std::string &After,
Any) override;
- // Called to compare the before and after representations of the IR.
- bool same(const std::string &Before, const std::string &After) override;
};
-// The following classes hold a representation of the IR for a change
-// reporter that uses string comparisons of the basic blocks
-// that are created using print (ie, similar to dump()).
-// These classes respect the filtering of passes and functions using
-// -filter-passes and -filter-print-funcs.
-//
// Information that needs to be saved for a basic block in order to compare
// before and after the pass to determine if it was changed by a pass.
-class ChangedBlockData {
+template <typename T> class BlockDataT {
public:
- ChangedBlockData(const BasicBlock &B);
-
- bool operator==(const ChangedBlockData &That) const {
- return Body == That.Body;
- }
- bool operator!=(const ChangedBlockData &That) const {
- return Body != That.Body;
+ BlockDataT(const BasicBlock &B) : Label(B.getName().str()), Data(B) {
+ raw_string_ostream SS(Body);
+ B.print(SS, nullptr, true, true);
}
+ bool operator==(const BlockDataT &That) const { return Body == That.Body; }
+ bool operator!=(const BlockDataT &That) const { return Body != That.Body; }
+
// Return the label of the represented basic block.
StringRef getLabel() const { return Label; }
// Return the string representation of the basic block.
StringRef getBody() const { return Body; }
+ // Return the associated data
+ const T &getData() const { return Data; }
+
protected:
std::string Label;
std::string Body;
+
+ // Extra data associated with a basic block
+ T Data;
};
-template <typename IRData> class OrderedChangedData {
+template <typename T> class OrderedChangedData {
public:
// Return the names in the order they were saved
std::vector<std::string> &getOrder() { return Order; }
const std::vector<std::string> &getOrder() const { return Order; }
// Return a map of names to saved representations
- StringMap<IRData> &getData() { return Data; }
- const StringMap<IRData> &getData() const { return Data; }
+ StringMap<T> &getData() { return Data; }
+ const StringMap<T> &getData() const { return Data; }
- bool operator==(const OrderedChangedData<IRData> &That) const {
+ bool operator==(const OrderedChangedData<T> &That) const {
return Data == That.getData();
}
@@ -321,55 +316,64 @@ public:
// with ones that are only in \p Before interspersed based on where they
// occur in \p Before. This is used to present the output in an order
// based on how the data is ordered in LLVM.
- static void
- report(const OrderedChangedData &Before, const OrderedChangedData &After,
- function_ref<void(const IRData *, const IRData *)> HandlePair);
+ static void report(const OrderedChangedData &Before,
+ const OrderedChangedData &After,
+ function_ref<void(const T *, const T *)> HandlePair);
protected:
std::vector<std::string> Order;
- StringMap<IRData> Data;
+ StringMap<T> Data;
+};
+
+// No extra information is needed for the patch-style change reporter.
+class EmptyData {
+public:
+ EmptyData(const BasicBlock &) {}
};
// The data saved for comparing functions.
-using ChangedFuncData = OrderedChangedData<ChangedBlockData>;
+template <typename T>
+class FuncDataT : public OrderedChangedData<BlockDataT<T>> {
+public:
+ FuncDataT(std::string S) : EntryBlockName(S) {}
+
+ // Return the name of the entry block
+ std::string getEntryBlockName() const { return EntryBlockName; }
+
+protected:
+ std::string EntryBlockName;
+};
-// A map of names to the saved data.
-using ChangedIRData = OrderedChangedData<ChangedFuncData>;
+// The data saved for comparing IRs.
+template <typename T>
+class IRDataT : public OrderedChangedData<FuncDataT<T>> {};
-// A class that compares two IRs and does a diff between them. The
-// added lines are prefixed with a '+', the removed lines are prefixed
-// with a '-' and unchanged lines are prefixed with a space (to have
-// things line up).
-class ChangedIRComparer {
+// Abstract template base class for a class that compares two IRs. The
+// class is created with the 2 IRs to compare and then compare is called.
+// The static function analyzeIR is used to build up the IR representation.
+template <typename T> class IRComparer {
public:
- ChangedIRComparer(raw_ostream &OS, const ChangedIRData &Before,
- const ChangedIRData &After, bool ColourMode)
- : Before(Before), After(After), Out(OS), UseColour(ColourMode) {}
+ IRComparer(const IRDataT<T> &Before, const IRDataT<T> &After)
+ : Before(Before), After(After) {}
- // Compare the 2 IRs.
- void compare(Any IR, StringRef Prefix, StringRef PassID, StringRef Name);
+ // Compare the 2 IRs. \p CompareFunc is called to handle the comparison of
+ // a function. When \p InModule is set, the function is being handled as
+ // part of comparing a module.
+ void compare(
+ bool CompareModule,
+ std::function<void(bool InModule, unsigned Minor,
+ const FuncDataT<T> &Before, const FuncDataT<T> &After)>
+ CompareFunc);
// Analyze \p IR and build the IR representation in \p Data.
- static void analyzeIR(Any IR, ChangedIRData &Data);
+ static void analyzeIR(Any IR, IRDataT<T> &Data);
protected:
- // Return the module when that is the appropriate level of
- // comparison for \p IR.
- static const Module *getModuleForComparison(Any IR);
-
// Generate the data for \p F into \p Data.
- static bool generateFunctionData(ChangedIRData &Data, const Function &F);
+ static bool generateFunctionData(IRDataT<T> &Data, const Function &F);
- // Called to handle the compare of a function. When \p InModule is set,
- // this function is being handled as part of comparing a module.
- void handleFunctionCompare(StringRef Name, StringRef Prefix, StringRef PassID,
- bool InModule, const ChangedFuncData &Before,
- const ChangedFuncData &After);
-
- const ChangedIRData &Before;
- const ChangedIRData &After;
- raw_ostream &Out;
- bool UseColour;
+ const IRDataT<T> &Before;
+ const IRDataT<T> &After;
};
// A change printer that prints out in-line differences in the basic
@@ -378,25 +382,28 @@ protected:
// and added, respectively. Changes to the IR that do not affect basic
// blocks are not reported as having changed the IR. The option
// -print-module-scope does not affect this change reporter.
-class InLineChangePrinter : public TextChangeReporter<ChangedIRData> {
+class InLineChangePrinter : public TextChangeReporter<IRDataT<EmptyData>> {
public:
InLineChangePrinter(bool VerboseMode, bool ColourMode)
- : TextChangeReporter<ChangedIRData>(VerboseMode), UseColour(ColourMode) {}
+ : TextChangeReporter<IRDataT<EmptyData>>(VerboseMode),
+ UseColour(ColourMode) {}
~InLineChangePrinter() override;
void registerCallbacks(PassInstrumentationCallbacks &PIC);
protected:
// Create a representation of the IR.
virtual void generateIRRepresentation(Any IR, StringRef PassID,
- ChangedIRData &Output) override;
+ IRDataT<EmptyData> &Output) override;
// Called when an interesting IR has changed.
virtual void handleAfter(StringRef PassID, std::string &Name,
- const ChangedIRData &Before,
- const ChangedIRData &After, Any) override;
- // Called to compare the before and after representations of the IR.
- virtual bool same(const ChangedIRData &Before,
- const ChangedIRData &After) override;
+ const IRDataT<EmptyData> &Before,
+ const IRDataT<EmptyData> &After, Any) override;
+
+ void handleFunctionCompare(StringRef Name, StringRef Prefix, StringRef PassID,
+ StringRef Divider, bool InModule, unsigned Minor,
+ const FuncDataT<EmptyData> &Before,
+ const FuncDataT<EmptyData> &After);
bool UseColour;
};
@@ -409,6 +416,81 @@ public:
void registerCallbacks(PassInstrumentationCallbacks &PIC);
};
+// Class that holds transitions between basic blocks. The transitions
+// are contained in a map of values to names of basic blocks.
+class DCData {
+public:
+ // Fill the map with the transitions from basic block \p B.
+ DCData(const BasicBlock &B);
+
+ // Return an iterator to the names of the successor blocks.
+ StringMap<std::string>::const_iterator begin() const {
+ return Successors.begin();
+ }
+ StringMap<std::string>::const_iterator end() const {
+ return Successors.end();
+ }
+
+ // Return the label of the basic block reached on a transition on \p S.
+ const StringRef getSuccessorLabel(StringRef S) const {
+ assert(Successors.count(S) == 1 && "Expected to find successor.");
+ return Successors.find(S)->getValue();
+ }
+
+protected:
+ // Add a transition to \p Succ on \p Label
+ void addSuccessorLabel(StringRef Succ, StringRef Label) {
+ std::pair<std::string, std::string> SS{Succ.str(), Label.str()};
+ Successors.insert(SS);
+ }
+
+ StringMap<std::string> Successors;
+};
+
+// A change reporter that builds a website with links to pdf files showing
+// dot control flow graphs with changed instructions shown in colour.
+class DotCfgChangeReporter : public ChangeReporter<IRDataT<DCData>> {
+public:
+ DotCfgChangeReporter(bool Verbose);
+ ~DotCfgChangeReporter() override;
+ void registerCallbacks(PassInstrumentationCallbacks &PIC);
+
+protected:
+ // Initialize the HTML file and output the header.
+ bool initializeHTML();
+
+ // Called on the first IR processed.
+ void handleInitialIR(Any IR) override;
+ // Called before and after a pass to get the representation of the IR.
+ void generateIRRepresentation(Any IR, StringRef PassID,
+ IRDataT<DCData> &Output) override;
+ // Called when the pass is not interesting.
+ void omitAfter(StringRef PassID, std::string &Name) override;
+ // Called when an interesting IR has changed.
+ void handleAfter(StringRef PassID, std::string &Name,
+ const IRDataT<DCData> &Before, const IRDataT<DCData> &After,
+ Any) override;
+ // Called when an interesting pass is invalidated.
+ void handleInvalidated(StringRef PassID) override;
+ // Called when the IR or pass is not interesting.
+ void handleFiltered(StringRef PassID, std::string &Name) override;
+ // Called when an ignored pass is encountered.
+ void handleIgnored(StringRef PassID, std::string &Name) override;
+
+ // Generate the pdf file into \p Dir / \p PDFFileName using \p DotFile as
+ // input and return the html <a> tag with \p Text as the content.
+ static std::string genHTML(StringRef Text, StringRef DotFile,
+ StringRef PDFFileName);
+
+ void handleFunctionCompare(StringRef Name, StringRef Prefix, StringRef PassID,
+ StringRef Divider, bool InModule, unsigned Minor,
+ const FuncDataT<DCData> &Before,
+ const FuncDataT<DCData> &After);
+
+ unsigned N = 0;
+ std::unique_ptr<raw_fd_ostream> HTML;
+};
+
/// This class provides an interface to register all the standard pass
/// instrumentations and manages their state (if any).
class StandardInstrumentations {
@@ -421,6 +503,7 @@ class StandardInstrumentations {
IRChangedPrinter PrintChangedIR;
PseudoProbeVerifier PseudoProbeVerification;
InLineChangePrinter PrintChangedDiff;
+ DotCfgChangeReporter WebsiteChangeReporter;
VerifyInstrumentation Verify;
bool VerifyEach;
@@ -440,8 +523,12 @@ public:
extern template class ChangeReporter<std::string>;
extern template class TextChangeReporter<std::string>;
-extern template class ChangeReporter<ChangedIRData>;
-extern template class TextChangeReporter<ChangedIRData>;
+extern template class BlockDataT<EmptyData>;
+extern template class FuncDataT<EmptyData>;
+extern template class IRDataT<EmptyData>;
+extern template class ChangeReporter<IRDataT<EmptyData>>;
+extern template class TextChangeReporter<IRDataT<EmptyData>>;
+extern template class IRComparer<EmptyData>;
} // namespace llvm
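
A small sketch tying the new templates together: BlockDataT<DCData> captures both the printed body of a block and its successor transitions, as declared above; the printing loop itself is illustrative.

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/Passes/StandardInstrumentations.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  // Dump the CFG transitions recorded for a single basic block.
  static void dumpTransitions(const BasicBlock &BB) {
    BlockDataT<DCData> BD(BB);
    for (const auto &Entry : BD.getData())
      errs() << BD.getLabel() << " -[" << Entry.getKey() << "]-> "
             << BD.getData().getSuccessorLabel(Entry.getKey()) << "\n";
  }
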
diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index 8f336c13af61..d3a5d44ce8dd 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -693,8 +693,9 @@ public:
/// An iterator over the \c LineCoverageStats objects for lines described by
/// a \c CoverageData instance.
class LineCoverageIterator
- : public iterator_facade_base<
- LineCoverageIterator, std::forward_iterator_tag, LineCoverageStats> {
+ : public iterator_facade_base<LineCoverageIterator,
+ std::forward_iterator_tag,
+ const LineCoverageStats> {
public:
LineCoverageIterator(const CoverageData &CD)
: LineCoverageIterator(CD, CD.begin()->Line) {}
@@ -711,8 +712,6 @@ public:
const LineCoverageStats &operator*() const { return Stats; }
- LineCoverageStats &operator*() { return Stats; }
-
LineCoverageIterator &operator++();
LineCoverageIterator getEnd() const {
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 08a934e6985f..4395c2abb33e 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -205,9 +205,9 @@ StringRef getFuncNameWithoutPrefix(StringRef PGOFuncName,
StringRef FileName = "<unknown>");
/// Given a vector of strings (function PGO names) \c NameStrs, the
-/// method generates a combined string \c Result thatis ready to be
+/// method generates a combined string \c Result that is ready to be
/// serialized. The \c Result string is comprised of three fields:
-/// The first field is the legnth of the uncompressed strings, and the
+/// The first field is the length of the uncompressed strings, and the
/// the second field is the length of the zlib-compressed string.
/// Both fields are encoded in ULEB128. If \c doCompress is false, the
/// third field is the uncompressed strings; otherwise it is the
@@ -308,7 +308,8 @@ inline std::error_code make_error_code(instrprof_error E) {
class InstrProfError : public ErrorInfo<InstrProfError> {
public:
- InstrProfError(instrprof_error Err) : Err(Err) {
+ InstrProfError(instrprof_error Err, const Twine &ErrStr = Twine())
+ : Err(Err), Msg(ErrStr.str()) {
assert(Err != instrprof_error::success && "Not an error");
}
@@ -321,6 +322,7 @@ public:
}
instrprof_error get() const { return Err; }
+ const std::string &getMessage() const { return Msg; }
/// Consume an Error and return the raw enum value contained within it. The
/// Error must either be a success value, or contain a single InstrProfError.
@@ -337,6 +339,7 @@ public:
private:
instrprof_error Err;
+ std::string Msg;
};
class SoftInstrProfErrors {
@@ -474,7 +477,8 @@ public:
/// is used by the raw and text profile readers.
Error addFuncName(StringRef FuncName) {
if (FuncName.empty())
- return make_error<InstrProfError>(instrprof_error::malformed);
+ return make_error<InstrProfError>(instrprof_error::malformed,
+ "function name is empty");
auto Ins = NameTab.insert(FuncName);
if (Ins.second) {
MD5NameMap.push_back(std::make_pair(
@@ -1104,6 +1108,8 @@ namespace RawInstrProf {
// Version 5: Bit 60 of FuncHash is reserved for the flag for the context
// sensitive records.
// Version 6: Added binary id.
+// Version 7: Reorder binary id and include version in signature.
+// Version 8: Use relative counter pointer.
const uint64_t Version = INSTR_PROF_RAW_VERSION;
template <class IntPtrT> inline uint64_t getMagic();
@@ -1142,8 +1148,8 @@ void getMemOPSizeRangeFromOption(StringRef Str, int64_t &RangeStart,
// Create a COMDAT variable INSTR_PROF_RAW_VERSION_VAR to make the runtime
// aware this is an ir_level profile so it can set the version flag.
-void createIRLevelProfileFlagVar(Module &M, bool IsCS,
- bool InstrEntryBBEnabled);
+GlobalVariable *createIRLevelProfileFlagVar(Module &M, bool IsCS,
+ bool InstrEntryBBEnabled);
// Create the variable for the profile file name.
void createProfileFileNameVar(Module &M, StringRef InstrProfileOutput);
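
The new Twine parameter lets a producer attach context to an error and a consumer read it back. A minimal sketch mirroring the addFuncName change above; the function is illustrative.

  #include "llvm/ProfileData/InstrProf.h"
  #include "llvm/Support/Error.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  // Create a malformed-profile error with a message, then consume it and
  // recover both the enum value and the attached text.
  static void reportMalformed() {
    Error E = make_error<InstrProfError>(instrprof_error::malformed,
                                         "function name is empty");
    handleAllErrors(std::move(E), [](const InstrProfError &IPE) {
      errs() << "error " << static_cast<int>(IPE.get()) << ": "
             << IPE.getMessage() << "\n";
    });
  }
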
diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc
index 08a642469627..008b8dde5820 100644
--- a/llvm/include/llvm/ProfileData/InstrProfData.inc
+++ b/llvm/include/llvm/ProfileData/InstrProfData.inc
@@ -75,9 +75,7 @@ INSTR_PROF_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), NameRef, \
INSTR_PROF_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), FuncHash, \
ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \
Inc->getHash()->getZExtValue()))
-INSTR_PROF_DATA(const IntPtrT, llvm::Type::getInt64PtrTy(Ctx), CounterPtr, \
- ConstantExpr::getBitCast(CounterPtr, \
- llvm::Type::getInt64PtrTy(Ctx)))
+INSTR_PROF_DATA(const IntPtrT, IntPtrTy, CounterPtr, RelativeCounterPtr)
/* This is used to map function pointers for the indirect call targets to
* function name hashes during the conversion from raw to merged profile
* data.
@@ -129,15 +127,16 @@ INSTR_PROF_VALUE_NODE(PtrToNodeT, llvm::Type::getInt8PtrTy(Ctx), Next, \
#endif
INSTR_PROF_RAW_HEADER(uint64_t, Magic, __llvm_profile_get_magic())
INSTR_PROF_RAW_HEADER(uint64_t, Version, __llvm_profile_get_version())
+INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL))
INSTR_PROF_RAW_HEADER(uint64_t, DataSize, DataSize)
INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesBeforeCounters, PaddingBytesBeforeCounters)
INSTR_PROF_RAW_HEADER(uint64_t, CountersSize, CountersSize)
INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesAfterCounters, PaddingBytesAfterCounters)
INSTR_PROF_RAW_HEADER(uint64_t, NamesSize, NamesSize)
-INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin)
+INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta,
+ (uintptr_t)CountersBegin - (uintptr_t)DataBegin)
INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin)
INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last)
-INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL))
#undef INSTR_PROF_RAW_HEADER
/* INSTR_PROF_RAW_HEADER end */
@@ -646,7 +645,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
(uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129
/* Raw profile format version (start from 1). */
-#define INSTR_PROF_RAW_VERSION 6
+#define INSTR_PROF_RAW_VERSION 8
/* Indexed profile format version (start from 1). */
#define INSTR_PROF_INDEX_VERSION 7
/* Coverage mapping format version (start from 0). */
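
Version 8 stores CountersDelta as a section-relative value instead of an absolute address; the arithmetic is exactly the expression in the macro above, written out here as an illustrative helper (the helper name is an assumption, not part of the patch).

  #include <cstdint>

  // CountersDelta is the distance from the start of the data section to the
  // start of the counter section, so no absolute pointer is serialized.
  static uint64_t computeCountersDelta(const void *DataBegin,
                                       const void *CountersBegin) {
    return static_cast<uint64_t>(reinterpret_cast<uintptr_t>(CountersBegin) -
                                 reinterpret_cast<uintptr_t>(DataBegin));
  }
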
diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h
index 501c6f011d53..b62d4ff044a3 100644
--- a/llvm/include/llvm/ProfileData/InstrProfReader.h
+++ b/llvm/include/llvm/ProfileData/InstrProfReader.h
@@ -71,6 +71,7 @@ public:
/// format. Provides an iterator over NamedInstrProfRecords.
class InstrProfReader {
instrprof_error LastError = instrprof_error::success;
+ std::string LastErrorMsg;
public:
InstrProfReader() = default;
@@ -114,14 +115,21 @@ protected:
std::unique_ptr<InstrProfSymtab> Symtab;
/// Set the current error and return same.
- Error error(instrprof_error Err) {
+ Error error(instrprof_error Err, const std::string &ErrMsg = "") {
LastError = Err;
+ LastErrorMsg = ErrMsg;
if (Err == instrprof_error::success)
return Error::success();
- return make_error<InstrProfError>(Err);
+ return make_error<InstrProfError>(Err, ErrMsg);
}
- Error error(Error &&E) { return error(InstrProfError::take(std::move(E))); }
+ Error error(Error &&E) {
+ handleAllErrors(std::move(E), [&](const InstrProfError &IPE) {
+ LastError = IPE.get();
+ LastErrorMsg = IPE.getMessage();
+ });
+ return make_error<InstrProfError>(LastError, LastErrorMsg);
+ }
/// Clear the current error and return a successful one.
Error success() { return error(instrprof_error::success); }
@@ -136,7 +144,7 @@ public:
/// Get the current error.
Error getError() {
if (hasError())
- return make_error<InstrProfError>(LastError);
+ return make_error<InstrProfError>(LastError, LastErrorMsg);
return Error::success();
}
@@ -197,7 +205,7 @@ public:
/// Reader for the raw instrprof binary format from runtime.
///
-/// This format is a raw memory dump of the instrumentation-baed profiling data
+/// This format is a raw memory dump of the instrumentation-based profiling data
/// from the runtime. It has no index.
///
/// Templated on the unsigned type whose size matches pointers on the platform
diff --git a/llvm/include/llvm/ProfileData/ProfileCommon.h b/llvm/include/llvm/ProfileData/ProfileCommon.h
index f2d9ccc45fdc..ad92af22d92e 100644
--- a/llvm/include/llvm/ProfileData/ProfileCommon.h
+++ b/llvm/include/llvm/ProfileData/ProfileCommon.h
@@ -66,9 +66,9 @@ public:
/// Find the summary entry for a desired percentile of counts.
static const ProfileSummaryEntry &
- getEntryForPercentile(SummaryEntryVector &DS, uint64_t Percentile);
- static uint64_t getHotCountThreshold(SummaryEntryVector &DS);
- static uint64_t getColdCountThreshold(SummaryEntryVector &DS);
+ getEntryForPercentile(const SummaryEntryVector &DS, uint64_t Percentile);
+ static uint64_t getHotCountThreshold(const SummaryEntryVector &DS);
+ static uint64_t getColdCountThreshold(const SummaryEntryVector &DS);
};
class InstrProfSummaryBuilder final : public ProfileSummaryBuilder {
@@ -92,8 +92,8 @@ public:
void addRecord(const sampleprof::FunctionSamples &FS,
bool isCallsiteSample = false);
- std::unique_ptr<ProfileSummary> computeSummaryForProfiles(
- const StringMap<sampleprof::FunctionSamples> &Profiles);
+ std::unique_ptr<ProfileSummary>
+ computeSummaryForProfiles(const sampleprof::SampleProfileMap &Profiles);
std::unique_ptr<ProfileSummary> getSummary();
};
diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h
index 2f71bbc6bbbe..7ac9eccf8ac2 100644
--- a/llvm/include/llvm/ProfileData/SampleProf.h
+++ b/llvm/include/llvm/ProfileData/SampleProf.h
@@ -29,10 +29,13 @@
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cstdint>
+#include <list>
#include <map>
#include <set>
+#include <sstream>
#include <string>
#include <system_error>
+#include <unordered_map>
#include <utility>
namespace llvm {
@@ -104,10 +107,10 @@ static inline uint64_t SPMagic(SampleProfileFormat Format = SPF_Binary) {
/// current Format uses MD5 to represent the string.
static inline StringRef getRepInFormat(StringRef Name, bool UseMD5,
std::string &GUIDBuf) {
- if (Name.empty())
+ if (Name.empty() || !UseMD5)
return Name;
GUIDBuf = std::to_string(Function::getGUID(Name));
- return UseMD5 ? StringRef(GUIDBuf) : Name;
+ return GUIDBuf;
}
static inline uint64_t SPVersion() { return 103; }
@@ -122,13 +125,14 @@ enum SecType {
SecProfileSymbolList = 3,
SecFuncOffsetTable = 4,
SecFuncMetadata = 5,
+ SecCSNameTable = 6,
// marker for the first type of profile.
SecFuncProfileFirst = 32,
SecLBRProfile = SecFuncProfileFirst
};
static inline std::string getSecName(SecType Type) {
- switch (Type) {
+ switch ((int)Type) { // Avoid -Wcovered-switch-default
case SecInValid:
return "InvalidSection";
case SecProfSummary:
@@ -141,10 +145,13 @@ static inline std::string getSecName(SecType Type) {
return "FuncOffsetTableSection";
case SecFuncMetadata:
return "FunctionMetadata";
+ case SecCSNameTable:
+ return "CSNameTableSection";
case SecLBRProfile:
return "LBRProfileSection";
+ default:
+ return "UnknownSection";
}
- llvm_unreachable("A SecType has no name for output");
}
// Entry type of section header table used by SampleProfileExtBinaryBaseReader
@@ -202,6 +209,13 @@ enum class SecFuncMetadataFlags : uint32_t {
SecFlagHasAttribute = (1 << 1)
};
+enum class SecFuncOffsetFlags : uint32_t {
+ SecFlagInvalid = 0,
+ // Store function offsets in an order of contexts. The order ensures that
+ // callee contexts of a given context laid out next to it.
+ SecFlagOrdered = (1 << 0),
+};
+
// Verify section specific flag is used for the correct section.
template <class SecFlagType>
static inline void verifySecFlag(SecType Type, SecFlagType Flag) {
@@ -222,6 +236,8 @@ static inline void verifySecFlag(SecType Type, SecFlagType Flag) {
IsFlagLegal = std::is_same<SecFuncMetadataFlags, SecFlagType>();
break;
default:
+ case SecFuncOffsetTable:
+ IsFlagLegal = std::is_same<SecFuncOffsetFlags, SecFlagType>();
break;
}
if (!IsFlagLegal)
@@ -396,54 +412,123 @@ enum ContextAttributeMask {
ContextShouldBeInlined = 0x2, // Leaf of context should be inlined
};
+// Represents a context frame with function name and line location
+struct SampleContextFrame {
+ StringRef FuncName;
+ LineLocation Location;
+
+ SampleContextFrame() : Location(0, 0) {}
+
+ SampleContextFrame(StringRef FuncName, LineLocation Location)
+ : FuncName(FuncName), Location(Location) {}
+
+ bool operator==(const SampleContextFrame &That) const {
+ return Location == That.Location && FuncName == That.FuncName;
+ }
+
+ bool operator!=(const SampleContextFrame &That) const {
+ return !(*this == That);
+ }
+
+ std::string toString(bool OutputLineLocation) const {
+ std::ostringstream OContextStr;
+ OContextStr << FuncName.str();
+ if (OutputLineLocation) {
+ OContextStr << ":" << Location.LineOffset;
+ if (Location.Discriminator)
+ OContextStr << "." << Location.Discriminator;
+ }
+ return OContextStr.str();
+ }
+};
+
+static inline hash_code hash_value(const SampleContextFrame &arg) {
+ return hash_combine(arg.FuncName, arg.Location.LineOffset,
+ arg.Location.Discriminator);
+}
+
+using SampleContextFrameVector = SmallVector<SampleContextFrame, 10>;
+using SampleContextFrames = ArrayRef<SampleContextFrame>;
+
+struct SampleContextFrameHash {
+ uint64_t operator()(const SampleContextFrameVector &S) const {
+ return hash_combine_range(S.begin(), S.end());
+ }
+};
+
// Sample context for FunctionSamples. It consists of the calling context,
// the function name and context state. Internally sample context is represented
-// using StringRef, which is also the input for constructing a `SampleContext`.
+// using ArrayRef, which is also the input for constructing a `SampleContext`.
// It can accept and represent both full context string as well as context-less
// function name.
-// Example of full context string (note the wrapping `[]`):
-// `[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]`
-// Example of context-less function name (same as AutoFDO):
-// `_Z8funcLeafi`
+// For a CS profile, a full context vector can look like:
+// `main:3 _Z5funcAi:1 _Z8funcLeafi`
+// For a base CS profile without calling context, the context vector should only
+// contain the leaf frame name.
+// For a non-CS profile, the context vector should be empty.
class SampleContext {
public:
SampleContext() : State(UnknownContext), Attributes(ContextNone) {}
- SampleContext(StringRef ContextStr, ContextStateMask CState = UnknownContext)
- : Attributes(ContextNone) {
- setContext(ContextStr, CState);
- }
- // Promote context by removing top frames (represented by `ContextStrToRemove`).
- // Note that with string representation of context, the promotion is effectively
- // a substr operation with `ContextStrToRemove` removed from left.
- void promoteOnPath(StringRef ContextStrToRemove) {
- assert(FullContext.startswith(ContextStrToRemove));
+ SampleContext(StringRef Name)
+ : Name(Name), State(UnknownContext), Attributes(ContextNone) {}
- // Remove leading context and frame separator " @ ".
- FullContext = FullContext.substr(ContextStrToRemove.size() + 3);
- CallingContext = CallingContext.substr(ContextStrToRemove.size() + 3);
+ SampleContext(SampleContextFrames Context,
+ ContextStateMask CState = RawContext)
+ : Attributes(ContextNone) {
+ assert(!Context.empty() && "Context is empty");
+ setContext(Context, CState);
}
- // Split the top context frame (left-most substr) from context.
- static std::pair<StringRef, StringRef>
- splitContextString(StringRef ContextStr) {
- return ContextStr.split(" @ ");
+ // Given a context string, decode and populate internal state such as the
+ // function name, calling context and context state. Example of input
+ // `ContextStr`: `[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]`
+ SampleContext(StringRef ContextStr,
+ std::list<SampleContextFrameVector> &CSNameTable,
+ ContextStateMask CState = RawContext)
+ : Attributes(ContextNone) {
+ assert(!ContextStr.empty());
+ // Note that `[]` wrapped input indicates a full context string, otherwise
+ // it's treated as context-less function name only.
+ bool HasContext = ContextStr.startswith("[");
+ if (!HasContext) {
+ State = UnknownContext;
+ Name = ContextStr;
+ } else {
+ CSNameTable.emplace_back();
+ SampleContextFrameVector &Context = CSNameTable.back();
+ createCtxVectorFromStr(ContextStr, Context);
+ setContext(Context, CState);
+ }
}
- // Reconstruct a new context with the last k frames, return the context-less
- // name if K = 1
- StringRef getContextWithLastKFrames(uint32_t K) {
- if (K == 1)
- return getNameWithoutContext();
-
- size_t I = FullContext.size();
- while (K--) {
- I = FullContext.find_last_of(" @ ", I);
- if (I == StringRef::npos)
- return FullContext;
- I -= 2;
+ /// Create a context vector from a given context string and save it in
+ /// `Context`.
+ static void createCtxVectorFromStr(StringRef ContextStr,
+ SampleContextFrameVector &Context) {
+ // Remove encapsulating '[' and ']' if any
+ ContextStr = ContextStr.substr(1, ContextStr.size() - 2);
+ StringRef ContextRemain = ContextStr;
+ StringRef ChildContext;
+ StringRef CalleeName;
+ while (!ContextRemain.empty()) {
+ auto ContextSplit = ContextRemain.split(" @ ");
+ ChildContext = ContextSplit.first;
+ ContextRemain = ContextSplit.second;
+ LineLocation CallSiteLoc(0, 0);
+ decodeContextString(ChildContext, CalleeName, CallSiteLoc);
+ Context.emplace_back(CalleeName, CallSiteLoc);
}
- return FullContext.slice(I + 3, StringRef::npos);
+ }
+
+ // Promote the context by removing the top `ContextFramesToRemove` frames.
+ // Note that with the array representation of the context, the promotion is
+ // effectively a slice operation that drops the first
+ // `ContextFramesToRemove` elements from the left.
+ void promoteOnPath(uint32_t ContextFramesToRemove) {
+ assert(ContextFramesToRemove <= FullContext.size() &&
+ "Cannot remove more than the whole context");
+ FullContext = FullContext.drop_front(ContextFramesToRemove);
}
// Decode context string for a frame to get function name and location.
@@ -469,7 +554,7 @@ public:
}
}
- operator StringRef() const { return FullContext; }
+ operator SampleContextFrames() const { return FullContext; }
bool hasAttribute(ContextAttributeMask A) { return Attributes & (uint32_t)A; }
void setAttribute(ContextAttributeMask A) { Attributes |= (uint32_t)A; }
uint32_t getAllAttributes() { return Attributes; }
@@ -478,60 +563,114 @@ public:
void setState(ContextStateMask S) { State |= (uint32_t)S; }
void clearState(ContextStateMask S) { State &= (uint32_t)~S; }
bool hasContext() const { return State != UnknownContext; }
- bool isBaseContext() const { return CallingContext.empty(); }
- StringRef getNameWithoutContext() const { return Name; }
- StringRef getCallingContext() const { return CallingContext; }
- StringRef getNameWithContext() const { return FullContext; }
-
-private:
- // Give a context string, decode and populate internal states like
- // Function name, Calling context and context state. Example of input
- // `ContextStr`: `[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]`
- void setContext(StringRef ContextStr, ContextStateMask CState) {
- assert(!ContextStr.empty());
- // Note that `[]` wrapped input indicates a full context string, otherwise
- // it's treated as context-less function name only.
- bool HasContext = ContextStr.startswith("[");
- if (!HasContext && CState == UnknownContext) {
- State = UnknownContext;
- Name = FullContext = ContextStr;
- } else {
- // Assume raw context profile if unspecified
- if (CState == UnknownContext)
- State = RawContext;
- else
- State = CState;
-
- // Remove encapsulating '[' and ']' if any
- if (HasContext)
- FullContext = ContextStr.substr(1, ContextStr.size() - 2);
- else
- FullContext = ContextStr;
-
- // Caller is to the left of callee in context string
- auto NameContext = FullContext.rsplit(" @ ");
- if (NameContext.second.empty()) {
- Name = NameContext.first;
- CallingContext = NameContext.second;
- } else {
- Name = NameContext.second;
- CallingContext = NameContext.first;
+ bool isBaseContext() const { return FullContext.size() == 1; }
+ StringRef getName() const { return Name; }
+ SampleContextFrames getContextFrames() const { return FullContext; }
+
+ static std::string getContextString(SampleContextFrames Context,
+ bool IncludeLeafLineLocation = false) {
+ std::ostringstream OContextStr;
+ for (uint32_t I = 0; I < Context.size(); I++) {
+ if (OContextStr.str().size()) {
+ OContextStr << " @ ";
}
+ OContextStr << Context[I].toString(I != Context.size() - 1 ||
+ IncludeLeafLineLocation);
}
+ return OContextStr.str();
+ }
+
+ std::string toString() const {
+ if (!hasContext())
+ return Name.str();
+ return getContextString(FullContext, false);
+ }
+
+ uint64_t getHashCode() const {
+ return hasContext() ? hash_value(getContextFrames())
+ : hash_value(getName());
+ }
+
+ /// Set the name of the function.
+ void setName(StringRef FunctionName) {
+ assert(FullContext.empty() &&
+ "setName should only be called for non-CS profile");
+ Name = FunctionName;
+ }
+
+ void setContext(SampleContextFrames Context,
+ ContextStateMask CState = RawContext) {
+ assert(CState != UnknownContext);
+ FullContext = Context;
+ Name = Context.back().FuncName;
+ State = CState;
+ }
+
+ bool operator==(const SampleContext &That) const {
+ return State == That.State && Name == That.Name &&
+ FullContext == That.FullContext;
+ }
+
+ bool operator!=(const SampleContext &That) const { return !(*this == That); }
+
+ bool operator<(const SampleContext &That) const {
+ if (State != That.State)
+ return State < That.State;
+
+ if (!hasContext()) {
+ return (Name.compare(That.Name)) == -1;
+ }
+
+ uint64_t I = 0;
+ while (I < std::min(FullContext.size(), That.FullContext.size())) {
+ auto &Context1 = FullContext[I];
+ auto &Context2 = That.FullContext[I];
+ auto V = Context1.FuncName.compare(Context2.FuncName);
+ if (V)
+ return V == -1;
+ if (Context1.Location != Context2.Location)
+ return Context1.Location < Context2.Location;
+ I++;
+ }
+
+ return FullContext.size() < That.FullContext.size();
+ }
+
+ struct Hash {
+ uint64_t operator()(const SampleContext &Context) const {
+ return Context.getHashCode();
+ }
+ };
+
+ bool IsPrefixOf(const SampleContext &That) const {
+ auto ThisContext = FullContext;
+ auto ThatContext = That.FullContext;
+ if (ThatContext.size() < ThisContext.size())
+ return false;
+ ThatContext = ThatContext.take_front(ThisContext.size());
+ // Compare Leaf frame first
+ if (ThisContext.back().FuncName != ThatContext.back().FuncName)
+ return false;
+ // Compare leading context
+ return ThisContext.drop_back() == ThatContext.drop_back();
}
- // Full context string including calling context and leaf function name
- StringRef FullContext;
- // Function name for the associated sample profile
+private:
+ /// Mangled name of the function.
StringRef Name;
- // Calling context (leaf function excluded) for the associated sample profile
- StringRef CallingContext;
+ // Full context including calling context and leaf function name
+ SampleContextFrames FullContext;
// State of the associated sample profile
uint32_t State;
// Attribute of the associated sample profile
uint32_t Attributes;
};
+static inline hash_code hash_value(const SampleContext &arg) {
+ return arg.hasContext() ? hash_value(arg.getContextFrames())
+ : hash_value(arg.getName());
+}
+
class FunctionSamples;
class SampleProfileReaderItaniumRemapper;
@@ -592,6 +731,20 @@ public:
return BodySamples[LineLocation(Index, 0)].merge(S, Weight);
}
+ // Accumulate all body samples to set total samples.
+ void updateTotalSamples() {
+ setTotalSamples(0);
+ for (const auto &I : BodySamples)
+ addTotalSamples(I.second.getSamples());
+
+ for (auto &I : CallsiteSamples) {
+ for (auto &CS : I.second) {
+ CS.second.updateTotalSamples();
+ addTotalSamples(CS.second.getTotalSamples());
+ }
+ }
+ }
+
/// Return the number of samples collected at the given location.
/// Each location is specified by \p LineOffset and \p Discriminator.
/// If the location is not found in profile, return error.
@@ -709,10 +862,9 @@ public:
/// Optionally scale samples by \p Weight.
sampleprof_error merge(const FunctionSamples &Other, uint64_t Weight = 1) {
sampleprof_error Result = sampleprof_error::success;
- Name = Other.getName();
if (!GUIDToFuncNameMap)
GUIDToFuncNameMap = Other.GUIDToFuncNameMap;
- if (Context.getNameWithContext().empty())
+ if (Context.getName().empty())
Context = Other.getContext();
if (FunctionHash == 0) {
// Set the function hash code for the target profile.
@@ -758,7 +910,7 @@ public:
};
if (isDeclaration(SymbolMap.lookup(getFuncName()))) {
// Add to the import list only when it's defined out of module.
- S.insert(getGUID(Name));
+ S.insert(getGUID(getName()));
}
// Import hot CallTargets, which may not be available in IR because full
// profile annotation cannot be done until backend compilation in ThinLTO.
@@ -775,18 +927,13 @@ public:
}
/// Set the name of the function.
- void setName(StringRef FunctionName) { Name = FunctionName; }
+ void setName(StringRef FunctionName) { Context.setName(FunctionName); }
/// Return the function name.
- StringRef getName() const { return Name; }
-
- /// Return function name with context.
- StringRef getNameWithContext() const {
- return FunctionSamples::ProfileIsCS ? Context.getNameWithContext() : Name;
- }
+ StringRef getName() const { return Context.getName(); }
/// Return the original function name.
- StringRef getFuncName() const { return getFuncName(Name); }
+ StringRef getFuncName() const { return getFuncName(getName()); }
void setFunctionHash(uint64_t Hash) { FunctionHash = Hash; }
@@ -913,9 +1060,6 @@ public:
void findAllNames(DenseSet<StringRef> &NameSet) const;
private:
- /// Mangled name of the function.
- StringRef Name;
-
/// CFG hash value for the function.
uint64_t FunctionHash = 0;
@@ -961,6 +1105,14 @@ private:
raw_ostream &operator<<(raw_ostream &OS, const FunctionSamples &FS);
+using SampleProfileMap =
+ std::unordered_map<SampleContext, FunctionSamples, SampleContext::Hash>;
+
+using NameFunctionSamples = std::pair<SampleContext, const FunctionSamples *>;
+
+void sortFuncProfiles(const SampleProfileMap &ProfileMap,
+ std::vector<NameFunctionSamples> &SortedProfiles);
+
/// Sort a LocationT->SampleT map by LocationT.
///
/// It produces a sorted list of <LocationT, SampleT> records by ascending
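
A minimal sketch of consuming the new map type; sortFuncProfiles is the declaration just above, and the accumulation loop is illustrative.

  #include "llvm/ProfileData/SampleProf.h"
  #include <vector>

  using namespace llvm::sampleprof;

  // Sum the samples of every context-keyed profile, then produce a
  // deterministic ordering for output.
  static uint64_t totalSamples(const SampleProfileMap &Profiles) {
    uint64_t Total = 0;
    for (const auto &Entry : Profiles)
      Total += Entry.second.getTotalSamples();

    std::vector<NameFunctionSamples> Sorted;
    sortFuncProfiles(Profiles, Sorted);
    return Total;
  }
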
@@ -989,18 +1141,24 @@ private:
/// sure ProfileMap's key is consistent with FunctionSample's name/context.
class SampleContextTrimmer {
public:
- SampleContextTrimmer(StringMap<FunctionSamples> &Profiles)
- : ProfileMap(Profiles){};
- // Trim and merge cold context profile when requested.
+ SampleContextTrimmer(SampleProfileMap &Profiles) : ProfileMap(Profiles){};
+ // Trim and merge cold context profiles when requested. TrimBaseProfileOnly
+ // is only effective when TrimColdContext is true. On top of
+ // TrimColdContext, TrimBaseProfileOnly can be used to specify whether to
+ // trim all cold profiles or only cold base profiles. Trimming base profiles
+ // only is mainly to honor the preinliner decision. Note that when
+ // MergeColdContext is true, the preinliner decision is not honored anyway,
+ // so TrimBaseProfileOnly will be ignored.
void trimAndMergeColdContextProfiles(uint64_t ColdCountThreshold,
bool TrimColdContext,
bool MergeColdContext,
- uint32_t ColdContextFrameLength);
+ uint32_t ColdContextFrameLength,
+ bool TrimBaseProfileOnly);
// Canonicalize context profile name and attributes.
void canonicalizeContextProfiles();
private:
- StringMap<FunctionSamples> &ProfileMap;
+ SampleProfileMap &ProfileMap;
};
/// ProfileSymbolList records the list of function symbols shown up
@@ -1045,6 +1203,22 @@ private:
};
} // end namespace sampleprof
+
+using namespace sampleprof;
+// Provide DenseMapInfo for SampleContext.
+template <> struct DenseMapInfo<SampleContext> {
+ static inline SampleContext getEmptyKey() { return SampleContext(); }
+
+ static inline SampleContext getTombstoneKey() { return SampleContext("@"); }
+
+ static unsigned getHashValue(const SampleContext &Val) {
+ return Val.getHashCode();
+ }
+
+ static bool isEqual(const SampleContext &LHS, const SampleContext &RHS) {
+ return LHS == RHS;
+ }
+};
} // end namespace llvm
#endif // LLVM_PROFILEDATA_SAMPLEPROF_H
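
The DenseMapInfo specialization above is what allows SampleContext to key a DenseMap directly, which the reader's FuncOffsetTable below relies on; a small illustrative sketch (the offset value is made up).

  #include "llvm/ADT/DenseMap.h"
  #include "llvm/ProfileData/SampleProf.h"

  using namespace llvm;
  using namespace llvm::sampleprof;

  // Map each context to a file offset, mirroring the reader's FuncOffsetTable.
  static void demoOffsetTable() {
    DenseMap<SampleContext, uint64_t> FuncOffsetTable;
    FuncOffsetTable[SampleContext("_Z8funcLeafi")] = 128;
    auto It = FuncOffsetTable.find(SampleContext("_Z8funcLeafi"));
    (void)It;
  }
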
diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h
index 2d5925bdb2b4..e6d31f1b9098 100644
--- a/llvm/include/llvm/ProfileData/SampleProfReader.h
+++ b/llvm/include/llvm/ProfileData/SampleProfReader.h
@@ -242,9 +242,11 @@
#include "llvm/Support/SymbolRemappingReader.h"
#include <algorithm>
#include <cstdint>
+#include <list>
#include <memory>
#include <string>
#include <system_error>
+#include <unordered_set>
#include <vector>
namespace llvm {
@@ -380,8 +382,8 @@ public:
/// The implementaion to read sample profiles from the associated file.
virtual std::error_code readImpl() = 0;
- /// Print the profile for \p FName on stream \p OS.
- void dumpFunctionProfile(StringRef FName, raw_ostream &OS = dbgs());
+ /// Print the profile for \p FContext on stream \p OS.
+ void dumpFunctionProfile(SampleContext FContext, raw_ostream &OS = dbgs());
/// Collect functions with definitions in Module M. For reader which
/// support loading function profiles on demand, return true when the
@@ -407,6 +409,13 @@ public:
std::string FGUID;
StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
CanonName = getRepInFormat(CanonName, useMD5(), FGUID);
+ auto It = Profiles.find(CanonName);
+ if (It != Profiles.end())
+ return &It->second;
+ if (!FGUID.empty()) {
+ assert(useMD5() && "New name should only be generated for md5 profile");
+ CanonName = *MD5NameBuffer.insert(FGUID).first;
+ }
return &Profiles[CanonName];
}
@@ -429,7 +438,7 @@ public:
}
/// Return all the profiles.
- StringMap<FunctionSamples> &getProfiles() { return Profiles; }
+ SampleProfileMap &getProfiles() { return Profiles; }
/// Report a parse error message.
void reportError(int64_t LineNumber, const Twine &Msg) const {
@@ -495,7 +504,7 @@ protected:
/// The profile of every function executed at runtime is collected
/// in the structure FunctionSamples. This maps function objects
/// to their corresponding profiles.
- StringMap<FunctionSamples> Profiles;
+ SampleProfileMap Profiles;
/// LLVM context used to emit diagnostics.
LLVMContext &Ctx;
@@ -503,6 +512,10 @@ protected:
/// Memory buffer holding the profile file.
std::unique_ptr<MemoryBuffer> Buffer;
+ /// Extra name buffer holding names created on demand.
+ /// This should only be needed for md5 profiles.
+ std::unordered_set<std::string> MD5NameBuffer;
+
/// Profile summary information.
std::unique_ptr<ProfileSummary> Summary;
@@ -555,6 +568,11 @@ public:
/// Return true if \p Buffer is in the format supported by this class.
static bool hasFormat(const MemoryBuffer &Buffer);
+
+private:
+ /// CSNameTable is used to save full context vectors. This serves as an
+ /// underlying immutable buffer for all clients.
+ std::list<SampleContextFrameVector> CSNameTable;
};
class SampleProfileReaderBinary : public SampleProfileReader {
@@ -626,6 +644,7 @@ protected:
/// Read a string indirectly via the name table.
virtual ErrorOr<StringRef> readStringFromTable();
+ virtual ErrorOr<SampleContext> readSampleContextFromTable();
private:
std::error_code readSummaryEntry(std::vector<ProfileSummaryEntry> &Entries);
@@ -683,6 +702,7 @@ protected:
std::error_code readFuncProfiles();
std::error_code readMD5NameTable();
std::error_code readNameTableSec(bool IsMD5);
+ std::error_code readCSNameTableSec();
std::error_code readProfileSymbolList();
virtual std::error_code readHeader() override;
@@ -692,12 +712,19 @@ protected:
// placeholder for subclasses to dispatch their own section readers.
virtual std::error_code readCustomSection(const SecHdrTableEntry &Entry) = 0;
virtual ErrorOr<StringRef> readStringFromTable() override;
+ virtual ErrorOr<SampleContext> readSampleContextFromTable() override;
+ ErrorOr<SampleContextFrames> readContextFromTable();
std::unique_ptr<ProfileSymbolList> ProfSymList;
- /// The table mapping from function name to the offset of its FunctionSample
- /// towards file start.
- DenseMap<StringRef, uint64_t> FuncOffsetTable;
+ /// The table mapping from function context to the offset of its
+ /// FunctionSample towards file start.
+ DenseMap<SampleContext, uint64_t> FuncOffsetTable;
+
+ /// Function offset mapping ordered by contexts.
+ std::unique_ptr<std::vector<std::pair<SampleContext, uint64_t>>>
+ OrderedFuncOffsets;
+
/// The set containing the functions to use when compiling a module.
DenseSet<StringRef> FuncsToUse;
@@ -716,10 +743,16 @@ protected:
/// the lifetime of MD5StringBuf is not shorter than that of NameTable.
std::unique_ptr<std::vector<std::string>> MD5StringBuf;
+ /// CSNameTable is used to save full context vectors. This serves as an
+ /// underlying immutable buffer for all clients.
+ std::unique_ptr<const std::vector<SampleContextFrameVector>> CSNameTable;
+
/// If SkipFlatProf is true, skip the sections with
/// SecFlagFlat flag.
bool SkipFlatProf = false;
+ bool FuncOffsetsOrdered = false;
+
public:
SampleProfileReaderExtBinaryBase(std::unique_ptr<MemoryBuffer> B,
LLVMContext &C, SampleProfileFormat Format)
@@ -753,6 +786,8 @@ private:
virtual std::error_code verifySPMagic(uint64_t Magic) override;
virtual std::error_code
readCustomSection(const SecHdrTableEntry &Entry) override {
+ // Update the data reader pointer to the end of the section.
+ Data = End;
return sampleprof_error::success;
};
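A hedged sketch of a reader client after this change: getProfiles() now hands back a SampleProfileMap keyed by SampleContext rather than a StringMap keyed by name, and dumpFunctionProfile takes a SampleContext. The create() factory and read() call are the long-standing reader entry points; error handling is abbreviated.

#include "llvm/IR/LLVMContext.h"
#include "llvm/ProfileData/SampleProfReader.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::sampleprof;

static void dumpAllProfiles(const std::string &Path) {
  LLVMContext Ctx;
  auto ReaderOrErr = SampleProfileReader::create(Path, Ctx);
  if (std::error_code EC = ReaderOrErr.getError())
    return;
  std::unique_ptr<SampleProfileReader> Reader = std::move(ReaderOrErr.get());
  if (std::error_code EC = Reader->read())
    return;
  // Keys are SampleContexts now, not bare function names.
  for (auto &Entry : Reader->getProfiles())
    Reader->dumpFunctionProfile(Entry.first, errs());
}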
diff --git a/llvm/include/llvm/ProfileData/SampleProfWriter.h b/llvm/include/llvm/ProfileData/SampleProfWriter.h
index 107f7a730a3c..773beac24ebc 100644
--- a/llvm/include/llvm/ProfileData/SampleProfWriter.h
+++ b/llvm/include/llvm/ProfileData/SampleProfWriter.h
@@ -52,7 +52,7 @@ public:
/// Write all the sample profiles in the given map of samples.
///
/// \returns status code of the file update operation.
- virtual std::error_code write(const StringMap<FunctionSamples> &ProfileMap);
+ virtual std::error_code write(const SampleProfileMap &ProfileMap);
raw_ostream &getOutputStream() { return *OutputStream; }
@@ -78,12 +78,10 @@ protected:
: OutputStream(std::move(OS)) {}
/// Write a file header for the profile file.
- virtual std::error_code
- writeHeader(const StringMap<FunctionSamples> &ProfileMap) = 0;
+ virtual std::error_code writeHeader(const SampleProfileMap &ProfileMap) = 0;
// Write function profiles to the profile file.
- virtual std::error_code
- writeFuncProfiles(const StringMap<FunctionSamples> &ProfileMap);
+ virtual std::error_code writeFuncProfiles(const SampleProfileMap &ProfileMap);
/// Output stream where to emit the profile to.
std::unique_ptr<raw_ostream> OutputStream;
@@ -92,7 +90,7 @@ protected:
std::unique_ptr<ProfileSummary> Summary;
/// Compute summary for this profile.
- void computeSummary(const StringMap<FunctionSamples> &ProfileMap);
+ void computeSummary(const SampleProfileMap &ProfileMap);
/// Profile format.
SampleProfileFormat Format = SPF_None;
@@ -107,8 +105,7 @@ protected:
SampleProfileWriterText(std::unique_ptr<raw_ostream> &OS)
: SampleProfileWriter(OS), Indent(0) {}
- std::error_code
- writeHeader(const StringMap<FunctionSamples> &ProfileMap) override {
+ std::error_code writeHeader(const SampleProfileMap &ProfileMap) override {
return sampleprof_error::success;
}
@@ -132,19 +129,22 @@ public:
virtual std::error_code writeSample(const FunctionSamples &S) override;
protected:
+ virtual MapVector<StringRef, uint32_t> &getNameTable() { return NameTable; }
virtual std::error_code writeMagicIdent(SampleProfileFormat Format);
virtual std::error_code writeNameTable();
virtual std::error_code
- writeHeader(const StringMap<FunctionSamples> &ProfileMap) override;
+ writeHeader(const SampleProfileMap &ProfileMap) override;
std::error_code writeSummary();
- std::error_code writeNameIdx(StringRef FName, bool IsContextName = false);
+ virtual std::error_code writeContextIdx(const SampleContext &Context);
+ std::error_code writeNameIdx(StringRef FName);
std::error_code writeBody(const FunctionSamples &S);
- inline void stablizeNameTable(std::set<StringRef> &V);
+ inline void stablizeNameTable(MapVector<StringRef, uint32_t> &NameTable,
+ std::set<StringRef> &V);
MapVector<StringRef, uint32_t> NameTable;
- std::unordered_set<std::string> BracketedContextStr;
- void addName(StringRef FName, bool IsContextName = false);
+ void addName(StringRef FName);
+ virtual void addContext(const SampleContext &Context);
void addNames(const FunctionSamples &S);
private:
@@ -168,6 +168,7 @@ const std::array<SmallVector<SecHdrTableEntry, 8>, NumOfLayout>
// DefaultLayout
SmallVector<SecHdrTableEntry, 8>({{SecProfSummary, 0, 0, 0, 0},
{SecNameTable, 0, 0, 0, 0},
+ {SecCSNameTable, 0, 0, 0, 0},
{SecFuncOffsetTable, 0, 0, 0, 0},
{SecLBRProfile, 0, 0, 0, 0},
{SecProfileSymbolList, 0, 0, 0, 0},
@@ -190,8 +191,7 @@ const std::array<SmallVector<SecHdrTableEntry, 8>, NumOfLayout>
class SampleProfileWriterExtBinaryBase : public SampleProfileWriterBinary {
using SampleProfileWriterBinary::SampleProfileWriterBinary;
public:
- virtual std::error_code
- write(const StringMap<FunctionSamples> &ProfileMap) override;
+ virtual std::error_code write(const SampleProfileMap &ProfileMap) override;
virtual void setToCompressAllSections() override;
void setToCompressSection(SecType Type);
@@ -246,29 +246,32 @@ protected:
addSecFlag(SectionHdrLayout[SectionIdx], Flag);
}
+ virtual void addContext(const SampleContext &Context) override;
+
// placeholder for subclasses to dispatch their own section writers.
virtual std::error_code writeCustomSection(SecType Type) = 0;
// Verify the SecLayout is supported by the format.
virtual void verifySecLayout(SectionLayout SL) = 0;
// specify the order to write sections.
- virtual std::error_code
- writeSections(const StringMap<FunctionSamples> &ProfileMap) = 0;
+ virtual std::error_code writeSections(const SampleProfileMap &ProfileMap) = 0;
// Dispatch section writer for each section. \p LayoutIdx is the sequence
// number indicating where the section is located in SectionHdrLayout.
- virtual std::error_code
- writeOneSection(SecType Type, uint32_t LayoutIdx,
- const StringMap<FunctionSamples> &ProfileMap);
+ virtual std::error_code writeOneSection(SecType Type, uint32_t LayoutIdx,
+ const SampleProfileMap &ProfileMap);
// Helper function to write name table.
virtual std::error_code writeNameTable() override;
+ virtual std::error_code
+ writeContextIdx(const SampleContext &Context) override;
+ std::error_code writeCSNameIdx(const SampleContext &Context);
+ std::error_code writeCSNameTableSection();
- std::error_code writeFuncMetadata(const StringMap<FunctionSamples> &Profiles);
+ std::error_code writeFuncMetadata(const SampleProfileMap &Profiles);
// Functions to write various kinds of sections.
- std::error_code
- writeNameTableSection(const StringMap<FunctionSamples> &ProfileMap);
+ std::error_code writeNameTableSection(const SampleProfileMap &ProfileMap);
std::error_code writeFuncOffsetTable();
std::error_code writeProfileSymbolListSection();
@@ -289,7 +292,7 @@ private:
void allocSecHdrTable();
std::error_code writeSecHdrTable();
virtual std::error_code
- writeHeader(const StringMap<FunctionSamples> &ProfileMap) override;
+ writeHeader(const SampleProfileMap &ProfileMap) override;
std::error_code compressAndOutput();
// We will swap the raw_ostream held by LocalBufStream and that
@@ -312,12 +315,16 @@ private:
// be read.
std::vector<SecHdrTableEntry> SecHdrTable;
- // FuncOffsetTable maps function name to its profile offset in SecLBRProfile
- // section. It is used to load function profile on demand.
- MapVector<StringRef, uint64_t> FuncOffsetTable;
+ // FuncOffsetTable maps function context to its profile offset in
+ // SecLBRProfile section. It is used to load function profile on demand.
+ MapVector<SampleContext, uint64_t> FuncOffsetTable;
// Whether to use MD5 to represent string.
bool UseMD5 = false;
+ /// CSNameTable maps function context to its offset in SecCSNameTable section.
+ /// The offset will be used everywhere where the context is referenced.
+ MapVector<SampleContext, uint32_t> CSNameTable;
+
ProfileSymbolList *ProfSymList = nullptr;
};
@@ -327,13 +334,11 @@ public:
: SampleProfileWriterExtBinaryBase(OS) {}
private:
- std::error_code
- writeDefaultLayout(const StringMap<FunctionSamples> &ProfileMap);
- std::error_code
- writeCtxSplitLayout(const StringMap<FunctionSamples> &ProfileMap);
+ std::error_code writeDefaultLayout(const SampleProfileMap &ProfileMap);
+ std::error_code writeCtxSplitLayout(const SampleProfileMap &ProfileMap);
virtual std::error_code
- writeSections(const StringMap<FunctionSamples> &ProfileMap) override;
+ writeSections(const SampleProfileMap &ProfileMap) override;
virtual std::error_code writeCustomSection(SecType Type) override {
return sampleprof_error::success;
@@ -380,8 +385,7 @@ class SampleProfileWriterCompactBinary : public SampleProfileWriterBinary {
public:
virtual std::error_code writeSample(const FunctionSamples &S) override;
- virtual std::error_code
- write(const StringMap<FunctionSamples> &ProfileMap) override;
+ virtual std::error_code write(const SampleProfileMap &ProfileMap) override;
protected:
/// The table mapping from function name to the offset of its FunctionSample
@@ -392,7 +396,7 @@ protected:
uint64_t TableOffset;
virtual std::error_code writeNameTable() override;
virtual std::error_code
- writeHeader(const StringMap<FunctionSamples> &ProfileMap) override;
+ writeHeader(const SampleProfileMap &ProfileMap) override;
std::error_code writeFuncOffsetTable();
};
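A matching sketch for the writer side: write() now consumes a SampleProfileMap. The create(Filename, Format) factory is the existing writer entry point; SPF_Ext_Binary is just one possible format choice for the example.

#include "llvm/ProfileData/SampleProfWriter.h"

using namespace llvm;
using namespace llvm::sampleprof;

static std::error_code emitProfiles(StringRef OutPath,
                                    const SampleProfileMap &ProfileMap) {
  auto WriterOrErr = SampleProfileWriter::create(OutPath, SPF_Ext_Binary);
  if (std::error_code EC = WriterOrErr.getError())
    return EC;
  // The whole map, keyed by context, goes through the new overloads.
  return WriterOrErr.get()->write(ProfileMap);
}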
diff --git a/llvm/include/llvm/Support/AArch64TargetParser.def b/llvm/include/llvm/Support/AArch64TargetParser.def
index ae2fc673c54e..b3cfb71601f1 100644
--- a/llvm/include/llvm/Support/AArch64TargetParser.def
+++ b/llvm/include/llvm/Support/AArch64TargetParser.def
@@ -58,6 +58,24 @@ AARCH64_ARCH("armv8.7-a", ARMV8_7A, "8.7-A", "v8.7a",
AArch64::AEK_RDM | AArch64::AEK_RCPC | AArch64::AEK_DOTPROD |
AArch64::AEK_SM4 | AArch64::AEK_SHA3 | AArch64::AEK_BF16 |
AArch64::AEK_SHA2 | AArch64::AEK_AES | AArch64::AEK_I8MM))
+AARCH64_ARCH("armv9-a", ARMV9A, "9-A", "v9a",
+ ARMBuildAttrs::CPUArch::v8_A, FK_NEON_FP_ARMV8,
+ (AArch64::AEK_CRC | AArch64::AEK_FP |
+ AArch64::AEK_SIMD | AArch64::AEK_RAS | AArch64::AEK_LSE |
+ AArch64::AEK_RDM | AArch64::AEK_RCPC | AArch64::AEK_DOTPROD |
+ AArch64::AEK_SVE2))
+AARCH64_ARCH("armv9.1-a", ARMV9_1A, "9.1-A", "v9.1a",
+ ARMBuildAttrs::CPUArch::v8_A, FK_NEON_FP_ARMV8,
+ (AArch64::AEK_CRC | AArch64::AEK_FP |
+ AArch64::AEK_SIMD | AArch64::AEK_RAS | AArch64::AEK_LSE |
+ AArch64::AEK_RDM | AArch64::AEK_RCPC | AArch64::AEK_DOTPROD |
+ AArch64::AEK_SVE2))
+AARCH64_ARCH("armv9.2-a", ARMV9_2A, "9.2-A", "v9.2a",
+ ARMBuildAttrs::CPUArch::v8_A, FK_NEON_FP_ARMV8,
+ (AArch64::AEK_CRC | AArch64::AEK_FP |
+ AArch64::AEK_SIMD | AArch64::AEK_RAS | AArch64::AEK_LSE |
+ AArch64::AEK_RDM | AArch64::AEK_RCPC | AArch64::AEK_DOTPROD |
+ AArch64::AEK_SVE2))
// For v8-R, we do not enable crypto and align with GCC that enables a more
// minimal set of optional architecture extensions.
AARCH64_ARCH("armv8-r", ARMV8R, "8-R", "v8r",
@@ -126,6 +144,11 @@ AARCH64_CPU_NAME("cortex-a53", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, true,
(AArch64::AEK_CRC))
AARCH64_CPU_NAME("cortex-a55", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_FP16 | AArch64::AEK_DOTPROD | AArch64::AEK_RCPC))
+AARCH64_CPU_NAME("cortex-a510", ARMV9A, FK_NEON_FP_ARMV8, false,
+ (AArch64::AEK_BF16 | AArch64::AEK_I8MM | AArch64::AEK_SB |
+ AArch64::AEK_PAUTH | AArch64::AEK_MTE | AArch64::AEK_SSBS |
+ AArch64::AEK_SVE | AArch64::AEK_SVE2 | AArch64::AEK_SVE2BITPERM |
+ AArch64::AEK_FP16FML))
AARCH64_CPU_NAME("cortex-a57", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_CRC))
AARCH64_CPU_NAME("cortex-a65", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
@@ -155,11 +178,20 @@ AARCH64_CPU_NAME("cortex-a78", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
AARCH64_CPU_NAME("cortex-a78c", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_FP16 | AArch64::AEK_DOTPROD | AArch64::AEK_RCPC |
AArch64::AEK_SSBS))
+AARCH64_CPU_NAME("cortex-a710", ARMV9A, FK_NEON_FP_ARMV8, false,
+ (AArch64::AEK_MTE | AArch64::AEK_PAUTH | AArch64::AEK_FLAGM |
+ AArch64::AEK_SB | AArch64::AEK_I8MM | AArch64::AEK_FP16FML |
+ AArch64::AEK_SVE2BITPERM | AArch64::AEK_BF16))
AARCH64_CPU_NAME("cortex-r82", ARMV8R, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_LSE))
AARCH64_CPU_NAME("cortex-x1", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_FP16 | AArch64::AEK_DOTPROD | AArch64::AEK_RCPC |
AArch64::AEK_SSBS))
+AARCH64_CPU_NAME("cortex-x2", ARMV9A, FK_NEON_FP_ARMV8, false,
+ (AArch64::AEK_MTE | AArch64::AEK_BF16 | AArch64::AEK_I8MM |
+ AArch64::AEK_PAUTH | AArch64::AEK_SSBS | AArch64::AEK_SB |
+ AArch64::AEK_SVE | AArch64::AEK_SVE2 | AArch64::AEK_SVE2BITPERM |
+ AArch64::AEK_FP16FML))
AARCH64_CPU_NAME("neoverse-e1", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_DOTPROD | AArch64::AEK_FP16 | AArch64::AEK_RAS |
AArch64::AEK_RCPC | AArch64::AEK_SSBS))
@@ -172,6 +204,10 @@ AARCH64_CPU_NAME("neoverse-n2", ARMV8_5A, FK_CRYPTO_NEON_FP_ARMV8, false,
AArch64::AEK_I8MM | AArch64::AEK_MTE | AArch64::AEK_RAS |
AArch64::AEK_RCPC | AArch64::AEK_SB | AArch64::AEK_SSBS |
AArch64::AEK_SVE | AArch64::AEK_SVE2 | AArch64::AEK_SVE2BITPERM))
+AARCH64_CPU_NAME("neoverse-512tvb", ARMV8_4A, FK_CRYPTO_NEON_FP_ARMV8, false,
+ (AArch64::AEK_RAS | AArch64::AEK_SVE | AArch64::AEK_SSBS |
+ AArch64::AEK_RCPC | AArch64::AEK_FP16 | AArch64::AEK_BF16 |
+ AArch64::AEK_DOTPROD))
AARCH64_CPU_NAME("neoverse-v1", ARMV8_4A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_RAS | AArch64::AEK_SVE | AArch64::AEK_SSBS |
AArch64::AEK_RCPC | AArch64::AEK_FP16 | AArch64::AEK_BF16 |
diff --git a/llvm/include/llvm/Support/ARMTargetParser.def b/llvm/include/llvm/Support/ARMTargetParser.def
index 14b169a6e111..fd08f3e6960c 100644
--- a/llvm/include/llvm/Support/ARMTargetParser.def
+++ b/llvm/include/llvm/Support/ARMTargetParser.def
@@ -122,6 +122,21 @@ ARM_ARCH("armv8.7-a", ARMV8_7A, "8.7-A", "v8.7a",
(ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS |
ARM::AEK_DOTPROD | ARM::AEK_BF16 | ARM::AEK_I8MM))
+ARM_ARCH("armv9-a", ARMV9A, "9-A", "v9a",
+ ARMBuildAttrs::CPUArch::v8_A, FK_NEON_FP_ARMV8,
+ (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
+ ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS |
+ ARM::AEK_DOTPROD))
+ARM_ARCH("armv9.1-a", ARMV9_1A, "9.1-A", "v9.1a",
+ ARMBuildAttrs::CPUArch::v8_A, FK_NEON_FP_ARMV8,
+ (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
+ ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS |
+ ARM::AEK_DOTPROD | ARM::AEK_BF16 | ARM::AEK_I8MM))
+ARM_ARCH("armv9.2-a", ARMV9_2A, "9.2-A", "v9.2a",
+ ARMBuildAttrs::CPUArch::v8_A, FK_NEON_FP_ARMV8,
+ (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
+ ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS |
+ ARM::AEK_DOTPROD | ARM::AEK_BF16 | ARM::AEK_I8MM))
ARM_ARCH("armv8-r", ARMV8R, "8-R", "v8r", ARMBuildAttrs::CPUArch::v8_R,
FK_NEON_FP_ARMV8,
(ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM | ARM::AEK_HWDIVTHUMB |
@@ -296,6 +311,9 @@ ARM_CPU_NAME("cortex-a78", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
(ARM::AEK_FP16 | ARM::AEK_DOTPROD))
ARM_CPU_NAME("cortex-a78c", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
ARM::AEK_FP16 | ARM::AEK_DOTPROD)
+ARM_CPU_NAME("cortex-a710", ARMV9A, FK_NEON_FP_ARMV8, false,
+ (ARM::AEK_DOTPROD | ARM::AEK_FP16FML | ARM::AEK_BF16 | ARM::AEK_SB |
+ ARM::AEK_I8MM))
ARM_CPU_NAME("cortex-x1", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
(ARM::AEK_FP16 | ARM::AEK_DOTPROD))
ARM_CPU_NAME("neoverse-n1", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
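The two .def files above are X-macro tables, so the new armv9 and Cortex entries become visible to any client that defines the table macro before including the file. A small illustrative sketch; the variadic macro simply ignores the fields this client does not care about:

#include "llvm/Support/raw_ostream.h"

// Print every AArch64 architecture name known to the .def table, which now
// includes "armv9-a", "armv9.1-a" and "armv9.2-a". The .def file #undefs
// AARCH64_ARCH itself after expansion.
static void printAArch64ArchNames() {
#define AARCH64_ARCH(NAME, ...) llvm::outs() << NAME << "\n";
#include "llvm/Support/AArch64TargetParser.def"
}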
diff --git a/llvm/include/llvm/Support/Allocator.h b/llvm/include/llvm/Support/Allocator.h
index 245432debce6..9e8ce4e36197 100644
--- a/llvm/include/llvm/Support/Allocator.h
+++ b/llvm/include/llvm/Support/Allocator.h
@@ -277,7 +277,7 @@ public:
size_t TotalMemory = 0;
for (auto I = Slabs.begin(), E = Slabs.end(); I != E; ++I)
TotalMemory += computeSlabSize(std::distance(Slabs.begin(), I));
- for (auto &PtrAndSize : CustomSizedSlabs)
+ for (const auto &PtrAndSize : CustomSizedSlabs)
TotalMemory += PtrAndSize.second;
return TotalMemory;
}
diff --git a/llvm/include/llvm/Support/AtomicOrdering.h b/llvm/include/llvm/Support/AtomicOrdering.h
index 27ca825cef46..1a0d108300bc 100644
--- a/llvm/include/llvm/Support/AtomicOrdering.h
+++ b/llvm/include/llvm/Support/AtomicOrdering.h
@@ -133,6 +133,16 @@ inline bool isReleaseOrStronger(AtomicOrdering AO) {
return isAtLeastOrStrongerThan(AO, AtomicOrdering::Release);
}
+/// Return a single atomic ordering that is at least as strong as both the \p AO
+/// and \p Other orderings for an atomic operation.
+inline AtomicOrdering getMergedAtomicOrdering(AtomicOrdering AO,
+ AtomicOrdering Other) {
+ if ((AO == AtomicOrdering::Acquire && Other == AtomicOrdering::Release) ||
+ (AO == AtomicOrdering::Release && Other == AtomicOrdering::Acquire))
+ return AtomicOrdering::AcquireRelease;
+ return isStrongerThan(AO, Other) ? AO : Other;
+}
+
inline AtomicOrderingCABI toCABI(AtomicOrdering AO) {
static const AtomicOrderingCABI lookup[8] = {
/* NotAtomic */ AtomicOrderingCABI::relaxed,
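A small usage sketch (not part of the patch) for the new getMergedAtomicOrdering helper, e.g. when folding the orderings of two atomic accesses into one operation:

#include "llvm/Support/AtomicOrdering.h"
#include <cassert>

using namespace llvm;

static AtomicOrdering mergedLoadStoreOrdering() {
  AtomicOrdering A = AtomicOrdering::Acquire;
  AtomicOrdering B = AtomicOrdering::Release;
  AtomicOrdering Merged = getMergedAtomicOrdering(A, B);
  assert(Merged == AtomicOrdering::AcquireRelease &&
         "acquire + release fold to acq_rel");
  return Merged;
}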
diff --git a/llvm/include/llvm/Support/BinaryByteStream.h b/llvm/include/llvm/Support/BinaryByteStream.h
index ca5bb5abecfc..7d8b6d2dc43d 100644
--- a/llvm/include/llvm/Support/BinaryByteStream.h
+++ b/llvm/include/llvm/Support/BinaryByteStream.h
@@ -38,7 +38,7 @@ public:
llvm::support::endianness getEndian() const override { return Endian; }
- Error readBytes(uint32_t Offset, uint32_t Size,
+ Error readBytes(uint64_t Offset, uint64_t Size,
ArrayRef<uint8_t> &Buffer) override {
if (auto EC = checkOffsetForRead(Offset, Size))
return EC;
@@ -46,7 +46,7 @@ public:
return Error::success();
}
- Error readLongestContiguousChunk(uint32_t Offset,
+ Error readLongestContiguousChunk(uint64_t Offset,
ArrayRef<uint8_t> &Buffer) override {
if (auto EC = checkOffsetForRead(Offset, 1))
return EC;
@@ -54,7 +54,7 @@ public:
return Error::success();
}
- uint32_t getLength() override { return Data.size(); }
+ uint64_t getLength() override { return Data.size(); }
ArrayRef<uint8_t> data() const { return Data; }
@@ -97,19 +97,19 @@ public:
return ImmutableStream.getEndian();
}
- Error readBytes(uint32_t Offset, uint32_t Size,
+ Error readBytes(uint64_t Offset, uint64_t Size,
ArrayRef<uint8_t> &Buffer) override {
return ImmutableStream.readBytes(Offset, Size, Buffer);
}
- Error readLongestContiguousChunk(uint32_t Offset,
+ Error readLongestContiguousChunk(uint64_t Offset,
ArrayRef<uint8_t> &Buffer) override {
return ImmutableStream.readLongestContiguousChunk(Offset, Buffer);
}
- uint32_t getLength() override { return ImmutableStream.getLength(); }
+ uint64_t getLength() override { return ImmutableStream.getLength(); }
- Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Buffer) override {
+ Error writeBytes(uint64_t Offset, ArrayRef<uint8_t> Buffer) override {
if (Buffer.empty())
return Error::success();
@@ -145,7 +145,7 @@ public:
llvm::support::endianness getEndian() const override { return Endian; }
- Error readBytes(uint32_t Offset, uint32_t Size,
+ Error readBytes(uint64_t Offset, uint64_t Size,
ArrayRef<uint8_t> &Buffer) override {
if (auto EC = checkOffsetForWrite(Offset, Buffer.size()))
return EC;
@@ -154,11 +154,11 @@ public:
return Error::success();
}
- void insert(uint32_t Offset, ArrayRef<uint8_t> Bytes) {
+ void insert(uint64_t Offset, ArrayRef<uint8_t> Bytes) {
Data.insert(Data.begin() + Offset, Bytes.begin(), Bytes.end());
}
- Error readLongestContiguousChunk(uint32_t Offset,
+ Error readLongestContiguousChunk(uint64_t Offset,
ArrayRef<uint8_t> &Buffer) override {
if (auto EC = checkOffsetForWrite(Offset, 1))
return EC;
@@ -167,9 +167,9 @@ public:
return Error::success();
}
- uint32_t getLength() override { return Data.size(); }
+ uint64_t getLength() override { return Data.size(); }
- Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Buffer) override {
+ Error writeBytes(uint64_t Offset, ArrayRef<uint8_t> Buffer) override {
if (Buffer.empty())
return Error::success();
@@ -182,7 +182,7 @@ public:
if (Offset > getLength())
return make_error<BinaryStreamError>(stream_error_code::invalid_offset);
- uint32_t RequiredSize = Offset + Buffer.size();
+ uint64_t RequiredSize = Offset + Buffer.size();
if (RequiredSize > Data.size())
Data.resize(RequiredSize);
@@ -240,19 +240,19 @@ public:
return Impl.getEndian();
}
- Error readBytes(uint32_t Offset, uint32_t Size,
+ Error readBytes(uint64_t Offset, uint64_t Size,
ArrayRef<uint8_t> &Buffer) override {
return Impl.readBytes(Offset, Size, Buffer);
}
- Error readLongestContiguousChunk(uint32_t Offset,
+ Error readLongestContiguousChunk(uint64_t Offset,
ArrayRef<uint8_t> &Buffer) override {
return Impl.readLongestContiguousChunk(Offset, Buffer);
}
- uint32_t getLength() override { return Impl.getLength(); }
+ uint64_t getLength() override { return Impl.getLength(); }
- Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Data) override {
+ Error writeBytes(uint64_t Offset, ArrayRef<uint8_t> Data) override {
return Impl.writeBytes(Offset, Data);
}
diff --git a/llvm/include/llvm/Support/BinaryItemStream.h b/llvm/include/llvm/Support/BinaryItemStream.h
index 4d27013ce368..eb512bf4721a 100644
--- a/llvm/include/llvm/Support/BinaryItemStream.h
+++ b/llvm/include/llvm/Support/BinaryItemStream.h
@@ -38,7 +38,7 @@ public:
llvm::support::endianness getEndian() const override { return Endian; }
- Error readBytes(uint32_t Offset, uint32_t Size,
+ Error readBytes(uint64_t Offset, uint64_t Size,
ArrayRef<uint8_t> &Buffer) override {
auto ExpectedIndex = translateOffsetIndex(Offset);
if (!ExpectedIndex)
@@ -52,7 +52,7 @@ public:
return Error::success();
}
- Error readLongestContiguousChunk(uint32_t Offset,
+ Error readLongestContiguousChunk(uint64_t Offset,
ArrayRef<uint8_t> &Buffer) override {
auto ExpectedIndex = translateOffsetIndex(Offset);
if (!ExpectedIndex)
@@ -66,7 +66,7 @@ public:
computeItemOffsets();
}
- uint32_t getLength() override {
+ uint64_t getLength() override {
return ItemEndOffsets.empty() ? 0 : ItemEndOffsets.back();
}
@@ -74,16 +74,16 @@ private:
void computeItemOffsets() {
ItemEndOffsets.clear();
ItemEndOffsets.reserve(Items.size());
- uint32_t CurrentOffset = 0;
+ uint64_t CurrentOffset = 0;
for (const auto &Item : Items) {
- uint32_t Len = Traits::length(Item);
+ uint64_t Len = Traits::length(Item);
assert(Len > 0 && "no empty items");
CurrentOffset += Len;
ItemEndOffsets.push_back(CurrentOffset);
}
}
- Expected<uint32_t> translateOffsetIndex(uint32_t Offset) {
+ Expected<uint32_t> translateOffsetIndex(uint64_t Offset) {
// Make sure the offset is somewhere in our items array.
if (Offset >= getLength())
return make_error<BinaryStreamError>(stream_error_code::stream_too_short);
@@ -98,7 +98,7 @@ private:
ArrayRef<T> Items;
// Sorted vector of offsets to accelerate lookup.
- std::vector<uint32_t> ItemEndOffsets;
+ std::vector<uint64_t> ItemEndOffsets;
};
} // end namespace llvm
diff --git a/llvm/include/llvm/Support/BinaryStream.h b/llvm/include/llvm/Support/BinaryStream.h
index fcf4398550ee..e87129d8c201 100644
--- a/llvm/include/llvm/Support/BinaryStream.h
+++ b/llvm/include/llvm/Support/BinaryStream.h
@@ -41,22 +41,22 @@ public:
/// Given an offset into the stream and a number of bytes, attempt to
/// read the bytes and set the output ArrayRef to point to data owned by the
/// stream.
- virtual Error readBytes(uint32_t Offset, uint32_t Size,
+ virtual Error readBytes(uint64_t Offset, uint64_t Size,
ArrayRef<uint8_t> &Buffer) = 0;
/// Given an offset into the stream, read as much as possible without
/// copying any data.
- virtual Error readLongestContiguousChunk(uint32_t Offset,
+ virtual Error readLongestContiguousChunk(uint64_t Offset,
ArrayRef<uint8_t> &Buffer) = 0;
/// Return the number of bytes of data in this stream.
- virtual uint32_t getLength() = 0;
+ virtual uint64_t getLength() = 0;
/// Return the properties of this stream.
virtual BinaryStreamFlags getFlags() const { return BSF_None; }
protected:
- Error checkOffsetForRead(uint32_t Offset, uint32_t DataSize) {
+ Error checkOffsetForRead(uint64_t Offset, uint64_t DataSize) {
if (Offset > getLength())
return make_error<BinaryStreamError>(stream_error_code::invalid_offset);
if (getLength() < DataSize + Offset)
@@ -77,7 +77,7 @@ public:
/// Attempt to write the given bytes into the stream at the desired
/// offset. This will always necessitate a copy. Cannot shrink or grow the
/// stream, only writes into existing allocated space.
- virtual Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Data) = 0;
+ virtual Error writeBytes(uint64_t Offset, ArrayRef<uint8_t> Data) = 0;
/// For buffered streams, commits changes to the backing store.
virtual Error commit() = 0;
@@ -86,7 +86,7 @@ public:
BinaryStreamFlags getFlags() const override { return BSF_Write; }
protected:
- Error checkOffsetForWrite(uint32_t Offset, uint32_t DataSize) {
+ Error checkOffsetForWrite(uint64_t Offset, uint64_t DataSize) {
if (!(getFlags() & BSF_Append))
return checkOffsetForRead(Offset, DataSize);
diff --git a/llvm/include/llvm/Support/BinaryStreamArray.h b/llvm/include/llvm/Support/BinaryStreamArray.h
index 148ab85169f2..85d29be26ca9 100644
--- a/llvm/include/llvm/Support/BinaryStreamArray.h
+++ b/llvm/include/llvm/Support/BinaryStreamArray.h
@@ -153,7 +153,7 @@ private:
template <typename ValueType, typename Extractor>
class VarStreamArrayIterator
: public iterator_facade_base<VarStreamArrayIterator<ValueType, Extractor>,
- std::forward_iterator_tag, ValueType> {
+ std::forward_iterator_tag, const ValueType> {
typedef VarStreamArrayIterator<ValueType, Extractor> IterType;
typedef VarStreamArray<ValueType, Extractor> ArrayType;
@@ -197,11 +197,6 @@ public:
return ThisValue;
}
- ValueType &operator*() {
- assert(Array && !HasError);
- return ThisValue;
- }
-
IterType &operator+=(unsigned N) {
for (unsigned I = 0; I < N; ++I) {
// We are done with the current record, discard it so that we are
diff --git a/llvm/include/llvm/Support/BinaryStreamReader.h b/llvm/include/llvm/Support/BinaryStreamReader.h
index 9ad98a89aaf9..29b4b09b848c 100644
--- a/llvm/include/llvm/Support/BinaryStreamReader.h
+++ b/llvm/include/llvm/Support/BinaryStreamReader.h
@@ -251,16 +251,16 @@ public:
}
bool empty() const { return bytesRemaining() == 0; }
- void setOffset(uint32_t Off) { Offset = Off; }
- uint32_t getOffset() const { return Offset; }
- uint32_t getLength() const { return Stream.getLength(); }
- uint32_t bytesRemaining() const { return getLength() - getOffset(); }
+ void setOffset(uint64_t Off) { Offset = Off; }
+ uint64_t getOffset() const { return Offset; }
+ uint64_t getLength() const { return Stream.getLength(); }
+ uint64_t bytesRemaining() const { return getLength() - getOffset(); }
/// Advance the stream's offset by \p Amount bytes.
///
/// \returns a success error code if at least \p Amount bytes remain in the
/// stream, otherwise returns an appropriate error code.
- Error skip(uint32_t Amount);
+ Error skip(uint64_t Amount);
/// Examine the next byte of the underlying stream without advancing the
/// stream's offset. If the stream is empty the behavior is undefined.
@@ -271,11 +271,11 @@ public:
Error padToAlignment(uint32_t Align);
std::pair<BinaryStreamReader, BinaryStreamReader>
- split(uint32_t Offset) const;
+ split(uint64_t Offset) const;
private:
BinaryStreamRef Stream;
- uint32_t Offset = 0;
+ uint64_t Offset = 0;
};
} // namespace llvm
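A hedged sketch of a reader client under the widened interface: offsets and lengths are now uint64_t end to end, so very large streams no longer truncate. readInteger() and the ArrayRef constructor are pre-existing reader facilities.

#include "llvm/Support/BinaryStreamReader.h"

using namespace llvm;

static Error readHeaderWord(ArrayRef<uint8_t> Bytes, uint32_t &Word) {
  BinaryStreamReader Reader(Bytes, support::little);
  if (Error E = Reader.readInteger(Word))
    return E;
  // bytesRemaining(), getOffset() and getLength() all return uint64_t now.
  uint64_t Remaining = Reader.bytesRemaining();
  (void)Remaining;
  return Error::success();
}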
diff --git a/llvm/include/llvm/Support/BinaryStreamRef.h b/llvm/include/llvm/Support/BinaryStreamRef.h
index ba4c3873586d..e0aaab82ffab 100644
--- a/llvm/include/llvm/Support/BinaryStreamRef.h
+++ b/llvm/include/llvm/Support/BinaryStreamRef.h
@@ -30,12 +30,12 @@ protected:
Length = BorrowedImpl.getLength();
}
- BinaryStreamRefBase(std::shared_ptr<StreamType> SharedImpl, uint32_t Offset,
- Optional<uint32_t> Length)
+ BinaryStreamRefBase(std::shared_ptr<StreamType> SharedImpl, uint64_t Offset,
+ Optional<uint64_t> Length)
: SharedImpl(SharedImpl), BorrowedImpl(SharedImpl.get()),
ViewOffset(Offset), Length(Length) {}
- BinaryStreamRefBase(StreamType &BorrowedImpl, uint32_t Offset,
- Optional<uint32_t> Length)
+ BinaryStreamRefBase(StreamType &BorrowedImpl, uint64_t Offset,
+ Optional<uint64_t> Length)
: BorrowedImpl(&BorrowedImpl), ViewOffset(Offset), Length(Length) {}
BinaryStreamRefBase(const BinaryStreamRefBase &Other) = default;
BinaryStreamRefBase &operator=(const BinaryStreamRefBase &Other) = default;
@@ -48,7 +48,7 @@ public:
return BorrowedImpl->getEndian();
}
- uint32_t getLength() const {
+ uint64_t getLength() const {
if (Length.hasValue())
return *Length;
@@ -58,7 +58,7 @@ public:
/// Return a new BinaryStreamRef with the first \p N elements removed. If
/// this BinaryStreamRef is length-tracking, then the resulting one will be
/// too.
- RefType drop_front(uint32_t N) const {
+ RefType drop_front(uint64_t N) const {
if (!BorrowedImpl)
return RefType();
@@ -76,7 +76,7 @@ public:
/// Return a new BinaryStreamRef with the last \p N elements removed. If
/// this BinaryStreamRef is length-tracking and \p N is greater than 0, then
/// this BinaryStreamRef will no longer length-track.
- RefType drop_back(uint32_t N) const {
+ RefType drop_back(uint64_t N) const {
if (!BorrowedImpl)
return RefType();
@@ -96,26 +96,26 @@ public:
}
/// Return a new BinaryStreamRef with only the first \p N elements remaining.
- RefType keep_front(uint32_t N) const {
+ RefType keep_front(uint64_t N) const {
assert(N <= getLength());
return drop_back(getLength() - N);
}
/// Return a new BinaryStreamRef with only the last \p N elements remaining.
- RefType keep_back(uint32_t N) const {
+ RefType keep_back(uint64_t N) const {
assert(N <= getLength());
return drop_front(getLength() - N);
}
/// Return a new BinaryStreamRef with the first and last \p N elements
/// removed.
- RefType drop_symmetric(uint32_t N) const {
+ RefType drop_symmetric(uint64_t N) const {
return drop_front(N).drop_back(N);
}
/// Return a new BinaryStreamRef with the first \p Offset elements removed,
/// and retaining exactly \p Len elements.
- RefType slice(uint32_t Offset, uint32_t Len) const {
+ RefType slice(uint64_t Offset, uint64_t Len) const {
return drop_front(Offset).keep_front(Len);
}
@@ -132,7 +132,7 @@ public:
}
protected:
- Error checkOffsetForRead(uint32_t Offset, uint32_t DataSize) const {
+ Error checkOffsetForRead(uint64_t Offset, uint64_t DataSize) const {
if (Offset > getLength())
return make_error<BinaryStreamError>(stream_error_code::invalid_offset);
if (getLength() < DataSize + Offset)
@@ -142,8 +142,8 @@ protected:
std::shared_ptr<StreamType> SharedImpl;
StreamType *BorrowedImpl = nullptr;
- uint32_t ViewOffset = 0;
- Optional<uint32_t> Length;
+ uint64_t ViewOffset = 0;
+ Optional<uint64_t> Length;
};
/// BinaryStreamRef is to BinaryStream what ArrayRef is to an Array. It
@@ -157,15 +157,15 @@ class BinaryStreamRef
: public BinaryStreamRefBase<BinaryStreamRef, BinaryStream> {
friend BinaryStreamRefBase<BinaryStreamRef, BinaryStream>;
friend class WritableBinaryStreamRef;
- BinaryStreamRef(std::shared_ptr<BinaryStream> Impl, uint32_t ViewOffset,
- Optional<uint32_t> Length)
+ BinaryStreamRef(std::shared_ptr<BinaryStream> Impl, uint64_t ViewOffset,
+ Optional<uint64_t> Length)
: BinaryStreamRefBase(Impl, ViewOffset, Length) {}
public:
BinaryStreamRef() = default;
BinaryStreamRef(BinaryStream &Stream);
- BinaryStreamRef(BinaryStream &Stream, uint32_t Offset,
- Optional<uint32_t> Length);
+ BinaryStreamRef(BinaryStream &Stream, uint64_t Offset,
+ Optional<uint64_t> Length);
explicit BinaryStreamRef(ArrayRef<uint8_t> Data,
llvm::support::endianness Endian);
explicit BinaryStreamRef(StringRef Data, llvm::support::endianness Endian);
@@ -176,8 +176,8 @@ public:
BinaryStreamRef &operator=(BinaryStreamRef &&Other) = default;
// Use BinaryStreamRef.slice() instead.
- BinaryStreamRef(BinaryStreamRef &S, uint32_t Offset,
- uint32_t Length) = delete;
+ BinaryStreamRef(BinaryStreamRef &S, uint64_t Offset,
+ uint64_t Length) = delete;
/// Given an Offset into this StreamRef and a Size, return a reference to a
/// buffer owned by the stream.
@@ -185,7 +185,7 @@ public:
/// \returns a success error code if the entire range of data is within the
/// bounds of this BinaryStreamRef's view and the implementation could read
/// the data, and an appropriate error code otherwise.
- Error readBytes(uint32_t Offset, uint32_t Size,
+ Error readBytes(uint64_t Offset, uint64_t Size,
ArrayRef<uint8_t> &Buffer) const;
/// Given an Offset into this BinaryStreamRef, return a reference to the
@@ -193,29 +193,28 @@ public:
///
/// \returns a success error code if implementation could read the data,
/// and an appropriate error code otherwise.
- Error readLongestContiguousChunk(uint32_t Offset,
+ Error readLongestContiguousChunk(uint64_t Offset,
ArrayRef<uint8_t> &Buffer) const;
};
struct BinarySubstreamRef {
- uint32_t Offset = 0; // Offset in the parent stream
+ uint64_t Offset = 0; // Offset in the parent stream
BinaryStreamRef StreamData; // Stream Data
- BinarySubstreamRef slice(uint32_t Off, uint32_t Size) const {
+ BinarySubstreamRef slice(uint64_t Off, uint64_t Size) const {
BinaryStreamRef SubSub = StreamData.slice(Off, Size);
return {Off + Offset, SubSub};
}
- BinarySubstreamRef drop_front(uint32_t N) const {
+ BinarySubstreamRef drop_front(uint64_t N) const {
return slice(N, size() - N);
}
- BinarySubstreamRef keep_front(uint32_t N) const { return slice(0, N); }
+ BinarySubstreamRef keep_front(uint64_t N) const { return slice(0, N); }
- std::pair<BinarySubstreamRef, BinarySubstreamRef>
- split(uint32_t Off) const {
+ std::pair<BinarySubstreamRef, BinarySubstreamRef> split(uint64_t Off) const {
return std::make_pair(keep_front(Off), drop_front(Off));
}
- uint32_t size() const { return StreamData.getLength(); }
+ uint64_t size() const { return StreamData.getLength(); }
bool empty() const { return size() == 0; }
};
@@ -224,10 +223,10 @@ class WritableBinaryStreamRef
WritableBinaryStream> {
friend BinaryStreamRefBase<WritableBinaryStreamRef, WritableBinaryStream>;
WritableBinaryStreamRef(std::shared_ptr<WritableBinaryStream> Impl,
- uint32_t ViewOffset, Optional<uint32_t> Length)
+ uint64_t ViewOffset, Optional<uint64_t> Length)
: BinaryStreamRefBase(Impl, ViewOffset, Length) {}
- Error checkOffsetForWrite(uint32_t Offset, uint32_t DataSize) const {
+ Error checkOffsetForWrite(uint64_t Offset, uint64_t DataSize) const {
if (!(BorrowedImpl->getFlags() & BSF_Append))
return checkOffsetForRead(Offset, DataSize);
@@ -239,8 +238,8 @@ class WritableBinaryStreamRef
public:
WritableBinaryStreamRef() = default;
WritableBinaryStreamRef(WritableBinaryStream &Stream);
- WritableBinaryStreamRef(WritableBinaryStream &Stream, uint32_t Offset,
- Optional<uint32_t> Length);
+ WritableBinaryStreamRef(WritableBinaryStream &Stream, uint64_t Offset,
+ Optional<uint64_t> Length);
explicit WritableBinaryStreamRef(MutableArrayRef<uint8_t> Data,
llvm::support::endianness Endian);
WritableBinaryStreamRef(const WritableBinaryStreamRef &Other) = default;
@@ -251,8 +250,8 @@ public:
WritableBinaryStreamRef &operator=(WritableBinaryStreamRef &&Other) = default;
// Use WritableBinaryStreamRef.slice() instead.
- WritableBinaryStreamRef(WritableBinaryStreamRef &S, uint32_t Offset,
- uint32_t Length) = delete;
+ WritableBinaryStreamRef(WritableBinaryStreamRef &S, uint64_t Offset,
+ uint64_t Length) = delete;
/// Given an Offset into this WritableBinaryStreamRef and some input data,
/// writes the data to the underlying stream.
@@ -260,7 +259,7 @@ public:
/// \returns a success error code if the data could fit within the underlying
/// stream at the specified location and the implementation could write the
/// data, and an appropriate error code otherwise.
- Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Data) const;
+ Error writeBytes(uint64_t Offset, ArrayRef<uint8_t> Data) const;
/// Convert this WritableBinaryStreamRef to a read-only BinaryStreamRef.
operator BinaryStreamRef() const;
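Illustrative only: the slicing helpers on BinaryStreamRef keep their shape but now take and return 64-bit quantities.

#include "llvm/Support/BinaryStreamRef.h"

using namespace llvm;

// Return a view that drops the first and last quarter of the stream; all of
// the offset arithmetic stays in uint64_t.
static BinaryStreamRef middleView(ArrayRef<uint8_t> Data) {
  BinaryStreamRef Ref(Data, support::little);
  uint64_t Len = Ref.getLength();
  return Ref.slice(Len / 4, Len / 2);
}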
diff --git a/llvm/include/llvm/Support/BinaryStreamWriter.h b/llvm/include/llvm/Support/BinaryStreamWriter.h
index ceba792e6b26..3054f4ac7ef0 100644
--- a/llvm/include/llvm/Support/BinaryStreamWriter.h
+++ b/llvm/include/llvm/Support/BinaryStreamWriter.h
@@ -124,7 +124,7 @@ public:
///
/// \returns a success error code if the data was successfully written,
/// otherwise returns an appropriate error code.
- Error writeStreamRef(BinaryStreamRef Ref, uint32_t Size);
+ Error writeStreamRef(BinaryStreamRef Ref, uint64_t Size);
/// Writes the object \p Obj to the underlying stream, as if by using memcpy.
/// It is up to the caller to ensure that type of \p Obj can be safely copied
@@ -178,17 +178,17 @@ public:
}
/// Splits the Writer into two Writers at a given offset.
- std::pair<BinaryStreamWriter, BinaryStreamWriter> split(uint32_t Off) const;
+ std::pair<BinaryStreamWriter, BinaryStreamWriter> split(uint64_t Off) const;
- void setOffset(uint32_t Off) { Offset = Off; }
- uint32_t getOffset() const { return Offset; }
- uint32_t getLength() const { return Stream.getLength(); }
- uint32_t bytesRemaining() const { return getLength() - getOffset(); }
+ void setOffset(uint64_t Off) { Offset = Off; }
+ uint64_t getOffset() const { return Offset; }
+ uint64_t getLength() const { return Stream.getLength(); }
+ uint64_t bytesRemaining() const { return getLength() - getOffset(); }
Error padToAlignment(uint32_t Align);
protected:
WritableBinaryStreamRef Stream;
- uint32_t Offset = 0;
+ uint64_t Offset = 0;
};
} // end namespace llvm
diff --git a/llvm/include/llvm/Support/Caching.h b/llvm/include/llvm/Support/Caching.h
new file mode 100644
index 000000000000..1e5fea17f708
--- /dev/null
+++ b/llvm/include/llvm/Support/Caching.h
@@ -0,0 +1,71 @@
+//===- Caching.h - LLVM Local File Cache ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the CachedFileStream and the localCache function, which
+// simplifies caching files on the local filesystem in a directory whose
+// contents are managed by a CachePruningPolicy.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_CACHING_H
+#define LLVM_SUPPORT_CACHING_H
+
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+
+class MemoryBuffer;
+
+/// This class wraps an output stream for a file. Most clients should just be
+/// able to return an instance of this base class from the stream callback, but
+/// if a client needs to perform some action after the stream is written to,
+/// that can be done by deriving from this class and overriding the destructor.
+class CachedFileStream {
+public:
+ CachedFileStream(std::unique_ptr<raw_pwrite_stream> OS) : OS(std::move(OS)) {}
+ std::unique_ptr<raw_pwrite_stream> OS;
+ virtual ~CachedFileStream() = default;
+};
+
+/// This type defines the callback to add a file that is generated on the fly.
+///
+/// Stream callbacks must be thread safe.
+using AddStreamFn =
+ std::function<Expected<std::unique_ptr<CachedFileStream>>(unsigned Task)>;
+
+/// This is the type of a file cache. To request an item from the cache, pass a
+/// unique string as the Key. For hits, the cached file will be added to the
+/// link and this function will return AddStreamFn(). For misses, the cache will
+/// return a stream callback which must be called at most once to produce
+/// content for the stream. The file stream produced by the stream callback will
+/// add the file to the link after the stream is written to.
+///
+/// Clients generally look like this:
+///
+/// if (AddStreamFn AddStream = Cache(Task, Key))
+/// ProduceContent(AddStream);
+using FileCache =
+ std::function<Expected<AddStreamFn>(unsigned Task, StringRef Key)>;
+
+/// This type defines the callback to add a pre-existing file (e.g. in a cache).
+///
+/// Buffer callbacks must be thread safe.
+using AddBufferFn =
+ std::function<void(unsigned Task, std::unique_ptr<MemoryBuffer> MB)>;
+
+/// Create a local file system cache which uses the given cache name, temporary
+/// file prefix, cache directory and file callback. This function also creates
+/// the cache directory if it does not already exist. The cache name appears in
+/// error messages for errors during caching. The temporary file prefix is used
+/// in the temporary file naming scheme used when writing files atomically.
+Expected<FileCache> localCache(Twine CacheNameRef, Twine TempFilePrefixRef,
+ Twine CacheDirectoryPathRef,
+ AddBufferFn AddBuffer);
+} // namespace llvm
+
+#endif
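A hedged client sketch for the new FileCache interface, expanding the abbreviated pattern in the header comment. The cache name, directory, and the content written to the stream are placeholders; Expected error handling is spelled out but kept minimal.

#include "llvm/Support/Caching.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static Error compileWithCache(unsigned Task, StringRef Key) {
  // Called for cache hits and after a miss's stream has been written.
  AddBufferFn AddBuffer = [](unsigned T, std::unique_ptr<MemoryBuffer> MB) {
    // Hand the file to the link here (placeholder).
  };
  Expected<FileCache> CacheOrErr =
      localCache("MyCache", "Tmp", "/tmp/my-cache", AddBuffer);
  if (!CacheOrErr)
    return CacheOrErr.takeError();
  Expected<AddStreamFn> AddStreamOrErr = (*CacheOrErr)(Task, Key);
  if (!AddStreamOrErr)
    return AddStreamOrErr.takeError();
  // A null callback means a cache hit; the cached buffer was already added.
  if (AddStreamFn AddStream = *AddStreamOrErr) {
    Expected<std::unique_ptr<CachedFileStream>> FileOrErr = AddStream(Task);
    if (!FileOrErr)
      return FileOrErr.takeError();
    *(*FileOrErr)->OS << "...object file bytes..."; // produce content
  }
  return Error::success();
}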
diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h
index 14d7e21f78b2..2ee02010ff1d 100644
--- a/llvm/include/llvm/Support/CommandLine.h
+++ b/llvm/include/llvm/Support/CommandLine.h
@@ -316,9 +316,7 @@ public:
}
bool isInAllSubCommands() const {
- return any_of(Subs, [](const SubCommand *SC) {
- return SC == &*AllSubCommands;
- });
+ return llvm::is_contained(Subs, &*AllSubCommands);
}
//-------------------------------------------------------------------------===
@@ -926,6 +924,9 @@ public:
//--------------------------------------------------
// parser<bool>
//
+
+extern template class basic_parser<bool>;
+
template <> class parser<bool> : public basic_parser<bool> {
public:
parser(Option &O) : basic_parser(O) {}
@@ -949,10 +950,11 @@ public:
void anchor() override;
};
-extern template class basic_parser<bool>;
-
//--------------------------------------------------
// parser<boolOrDefault>
+
+extern template class basic_parser<boolOrDefault>;
+
template <> class parser<boolOrDefault> : public basic_parser<boolOrDefault> {
public:
parser(Option &O) : basic_parser(O) {}
@@ -974,11 +976,12 @@ public:
void anchor() override;
};
-extern template class basic_parser<boolOrDefault>;
-
//--------------------------------------------------
// parser<int>
//
+
+extern template class basic_parser<int>;
+
template <> class parser<int> : public basic_parser<int> {
public:
parser(Option &O) : basic_parser(O) {}
@@ -996,11 +999,12 @@ public:
void anchor() override;
};
-extern template class basic_parser<int>;
-
//--------------------------------------------------
// parser<long>
//
+
+extern template class basic_parser<long>;
+
template <> class parser<long> final : public basic_parser<long> {
public:
parser(Option &O) : basic_parser(O) {}
@@ -1018,11 +1022,12 @@ public:
void anchor() override;
};
-extern template class basic_parser<long>;
-
//--------------------------------------------------
// parser<long long>
//
+
+extern template class basic_parser<long long>;
+
template <> class parser<long long> : public basic_parser<long long> {
public:
parser(Option &O) : basic_parser(O) {}
@@ -1040,11 +1045,12 @@ public:
void anchor() override;
};
-extern template class basic_parser<long long>;
-
//--------------------------------------------------
// parser<unsigned>
//
+
+extern template class basic_parser<unsigned>;
+
template <> class parser<unsigned> : public basic_parser<unsigned> {
public:
parser(Option &O) : basic_parser(O) {}
@@ -1062,11 +1068,12 @@ public:
void anchor() override;
};
-extern template class basic_parser<unsigned>;
-
//--------------------------------------------------
// parser<unsigned long>
//
+
+extern template class basic_parser<unsigned long>;
+
template <>
class parser<unsigned long> final : public basic_parser<unsigned long> {
public:
@@ -1085,11 +1092,12 @@ public:
void anchor() override;
};
-extern template class basic_parser<unsigned long>;
-
//--------------------------------------------------
// parser<unsigned long long>
//
+
+extern template class basic_parser<unsigned long long>;
+
template <>
class parser<unsigned long long> : public basic_parser<unsigned long long> {
public:
@@ -1109,11 +1117,12 @@ public:
void anchor() override;
};
-extern template class basic_parser<unsigned long long>;
-
//--------------------------------------------------
// parser<double>
//
+
+extern template class basic_parser<double>;
+
template <> class parser<double> : public basic_parser<double> {
public:
parser(Option &O) : basic_parser(O) {}
@@ -1131,11 +1140,12 @@ public:
void anchor() override;
};
-extern template class basic_parser<double>;
-
//--------------------------------------------------
// parser<float>
//
+
+extern template class basic_parser<float>;
+
template <> class parser<float> : public basic_parser<float> {
public:
parser(Option &O) : basic_parser(O) {}
@@ -1153,11 +1163,12 @@ public:
void anchor() override;
};
-extern template class basic_parser<float>;
-
//--------------------------------------------------
// parser<std::string>
//
+
+extern template class basic_parser<std::string>;
+
template <> class parser<std::string> : public basic_parser<std::string> {
public:
parser(Option &O) : basic_parser(O) {}
@@ -1178,11 +1189,12 @@ public:
void anchor() override;
};
-extern template class basic_parser<std::string>;
-
//--------------------------------------------------
// parser<char>
//
+
+extern template class basic_parser<char>;
+
template <> class parser<char> : public basic_parser<char> {
public:
parser(Option &O) : basic_parser(O) {}
@@ -1203,8 +1215,6 @@ public:
void anchor() override;
};
-extern template class basic_parser<char>;
-
//--------------------------------------------------
// PrintOptionDiff
//
diff --git a/llvm/include/llvm/Support/Compiler.h b/llvm/include/llvm/Support/Compiler.h
index 57052b596edb..c5318137ed3d 100644
--- a/llvm/include/llvm/Support/Compiler.h
+++ b/llvm/include/llvm/Support/Compiler.h
@@ -97,7 +97,7 @@
/// Sadly, this is separate from just rvalue reference support because GCC
/// and MSVC implemented this later than everything else. This appears to be
/// corrected in MSVC 2019 but not MSVC 2017.
-#if __has_feature(cxx_rvalue_references) || LLVM_GNUC_PREREQ(4, 8, 1) || \
+#if __has_feature(cxx_rvalue_references) || defined(__GNUC__) || \
LLVM_MSC_PREREQ(1920)
#define LLVM_HAS_RVALUE_REFERENCE_THIS 1
#else
@@ -123,8 +123,8 @@
/// LLVM_EXTERNAL_VISIBILITY - classes, functions, and variables marked with
/// this attribute will be made public and visible outside of any shared library
/// they are linked in to.
-#if (__has_attribute(visibility) || LLVM_GNUC_PREREQ(4, 0, 0)) && \
- !defined(__MINGW32__) && !defined(__CYGWIN__) && !defined(_WIN32)
+#if __has_attribute(visibility) && !defined(__MINGW32__) && \
+ !defined(__CYGWIN__) && !defined(_WIN32)
#define LLVM_LIBRARY_VISIBILITY __attribute__ ((visibility("hidden")))
#define LLVM_EXTERNAL_VISIBILITY __attribute__ ((visibility("default")))
#else
@@ -138,7 +138,7 @@
#define LLVM_PREFETCH(addr, rw, locality)
#endif
-#if __has_attribute(used) || LLVM_GNUC_PREREQ(3, 1, 0)
+#if __has_attribute(used)
#define LLVM_ATTRIBUTE_USED __attribute__((__used__))
#else
#define LLVM_ATTRIBUTE_USED
@@ -182,15 +182,15 @@
// more portable solution:
// (void)unused_var_name;
// Prefer cast-to-void wherever it is sufficient.
-#if __has_attribute(unused) || LLVM_GNUC_PREREQ(3, 1, 0)
+#if __has_attribute(unused)
#define LLVM_ATTRIBUTE_UNUSED __attribute__((__unused__))
#else
#define LLVM_ATTRIBUTE_UNUSED
#endif
// FIXME: Provide this for PE/COFF targets.
-#if (__has_attribute(weak) || LLVM_GNUC_PREREQ(4, 0, 0)) && \
- (!defined(__MINGW32__) && !defined(__CYGWIN__) && !defined(_WIN32))
+#if __has_attribute(weak) && !defined(__MINGW32__) && !defined(__CYGWIN__) && \
+ !defined(_WIN32)
#define LLVM_ATTRIBUTE_WEAK __attribute__((__weak__))
#else
#define LLVM_ATTRIBUTE_WEAK
@@ -212,7 +212,13 @@
#define LLVM_READONLY
#endif
-#if __has_builtin(__builtin_expect) || LLVM_GNUC_PREREQ(4, 0, 0)
+#if __has_attribute(minsize)
+#define LLVM_ATTRIBUTE_MINSIZE __attribute__((minsize))
+#else
+#define LLVM_ATTRIBUTE_MINSIZE
+#endif
+
+#if __has_builtin(__builtin_expect) || defined(__GNUC__)
#define LLVM_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
#define LLVM_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
#else
@@ -222,7 +228,7 @@
/// LLVM_ATTRIBUTE_NOINLINE - On compilers where we have a directive to do so,
/// mark a method "not for inlining".
-#if __has_attribute(noinline) || LLVM_GNUC_PREREQ(3, 4, 0)
+#if __has_attribute(noinline)
#define LLVM_ATTRIBUTE_NOINLINE __attribute__((noinline))
#elif defined(_MSC_VER)
#define LLVM_ATTRIBUTE_NOINLINE __declspec(noinline)
@@ -231,10 +237,8 @@
#endif
/// LLVM_ATTRIBUTE_ALWAYS_INLINE - On compilers where we have a directive to do
-/// so, mark a method "always inline" because it is performance sensitive. GCC
-/// 3.4 supported this but is buggy in various cases and produces unimplemented
-/// errors, just use it in GCC 4.0 and later.
-#if __has_attribute(always_inline) || LLVM_GNUC_PREREQ(4, 0, 0)
+/// so, mark a method "always inline" because it is performance sensitive.
+#if __has_attribute(always_inline)
#define LLVM_ATTRIBUTE_ALWAYS_INLINE inline __attribute__((always_inline))
#elif defined(_MSC_VER)
#define LLVM_ATTRIBUTE_ALWAYS_INLINE __forceinline
@@ -242,15 +246,16 @@
#define LLVM_ATTRIBUTE_ALWAYS_INLINE inline
#endif
-#ifdef __GNUC__
-#define LLVM_ATTRIBUTE_NORETURN __attribute__((noreturn))
-#elif defined(_MSC_VER)
-#define LLVM_ATTRIBUTE_NORETURN __declspec(noreturn)
+/// LLVM_ATTRIBUTE_NODEBUG - On compilers where we have a directive to do
+/// so, mark a method "no debug" because debug info makes the debugger
+/// experience worse.
+#if __has_attribute(nodebug)
+#define LLVM_ATTRIBUTE_NODEBUG __attribute__((nodebug))
#else
-#define LLVM_ATTRIBUTE_NORETURN
+#define LLVM_ATTRIBUTE_NODEBUG
#endif
-#if __has_attribute(returns_nonnull) || LLVM_GNUC_PREREQ(4, 9, 0)
+#if __has_attribute(returns_nonnull)
#define LLVM_ATTRIBUTE_RETURNS_NONNULL __attribute__((returns_nonnull))
#elif defined(_MSC_VER)
#define LLVM_ATTRIBUTE_RETURNS_NONNULL _Ret_notnull_
@@ -322,15 +327,17 @@
/// LLVM_BUILTIN_UNREACHABLE - On compilers which support it, expands
/// to an expression which states that it is undefined behavior for the
/// compiler to reach this point. Otherwise is not defined.
-#if __has_builtin(__builtin_unreachable) || LLVM_GNUC_PREREQ(4, 5, 0)
+#if __has_builtin(__builtin_unreachable) || defined(__GNUC__)
# define LLVM_BUILTIN_UNREACHABLE __builtin_unreachable()
#elif defined(_MSC_VER)
# define LLVM_BUILTIN_UNREACHABLE __assume(false)
+#else
+# define LLVM_BUILTIN_UNREACHABLE
#endif
/// LLVM_BUILTIN_TRAP - On compilers which support it, expands to an expression
/// which causes the program to exit abnormally.
-#if __has_builtin(__builtin_trap) || LLVM_GNUC_PREREQ(4, 3, 0)
+#if __has_builtin(__builtin_trap) || defined(__GNUC__)
# define LLVM_BUILTIN_TRAP __builtin_trap()
#elif defined(_MSC_VER)
// The __debugbreak intrinsic is supported by MSVC, does not require forward
@@ -361,7 +368,7 @@
/// \macro LLVM_ASSUME_ALIGNED
/// Returns a pointer with an assumed alignment.
-#if __has_builtin(__builtin_assume_aligned) || LLVM_GNUC_PREREQ(4, 7, 0)
+#if __has_builtin(__builtin_assume_aligned) || defined(__GNUC__)
# define LLVM_ASSUME_ALIGNED(p, a) __builtin_assume_aligned(p, a)
#elif defined(LLVM_BUILTIN_UNREACHABLE)
# define LLVM_ASSUME_ALIGNED(p, a) \
@@ -549,4 +556,13 @@ void AnnotateIgnoreWritesEnd(const char *file, int line);
#define LLVM_ENABLE_EXCEPTIONS 1
#endif
+/// \macro LLVM_NO_PROFILE_INSTRUMENT_FUNCTION
+/// Disable the profile instrument for a function.
+#if __has_attribute(no_profile_instrument_function)
+#define LLVM_NO_PROFILE_INSTRUMENT_FUNCTION \
+ __attribute__((no_profile_instrument_function))
+#else
+#define LLVM_NO_PROFILE_INSTRUMENT_FUNCTION
+#endif
+
#endif
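Illustrative usage of the two newly added attribute macros; on compilers lacking the underlying attributes both expand to nothing, so the declarations degrade gracefully. The function names are made up for the example.

#include "llvm/Support/Compiler.h"

// Prefer code size over speed for a cold helper when 'minsize' is available.
LLVM_ATTRIBUTE_MINSIZE static int coldSlowPath(int X) { return X * 37; }

// Keep a very hot hook out of -fprofile-generate counters when the
// 'no_profile_instrument_function' attribute is available.
LLVM_NO_PROFILE_INSTRUMENT_FUNCTION static void perEventHook() {}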
diff --git a/llvm/include/llvm/Support/CrashRecoveryContext.h b/llvm/include/llvm/Support/CrashRecoveryContext.h
index 498690655fd1..2604ccb38431 100644
--- a/llvm/include/llvm/Support/CrashRecoveryContext.h
+++ b/llvm/include/llvm/Support/CrashRecoveryContext.h
@@ -99,8 +99,7 @@ public:
/// Explicitly trigger a crash recovery in the current process, and
/// return failure from RunSafely(). This function does not return.
- LLVM_ATTRIBUTE_NORETURN
- void HandleExit(int RetCode);
+ [[noreturn]] void HandleExit(int RetCode);
/// Throw again a signal or an exception, after it was catched once by a
/// CrashRecoveryContext.
diff --git a/llvm/include/llvm/Support/DOTGraphTraits.h b/llvm/include/llvm/Support/DOTGraphTraits.h
index a73538fa1462..ffa9abe328c8 100644
--- a/llvm/include/llvm/Support/DOTGraphTraits.h
+++ b/llvm/include/llvm/Support/DOTGraphTraits.h
@@ -65,6 +65,11 @@ public:
return false;
}
+ // renderNodesUsingHTML - If this function returns true, nodes will be
+ // rendered using HTML-like labels, which allow colors and other styling in
+ // the nodes and the edge source labels.
+ static bool renderNodesUsingHTML() { return false; }
+
/// getNodeLabel - Given a node and a pointer to the top level graph, return
/// the label to print in the node.
template<typename GraphType>
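A hedged sketch of opting a graph printer into the new HTML-like rendering hook added above; MyGraph is a hypothetical client graph type and only the new hook is shown.

#include "llvm/Support/DOTGraphTraits.h"

struct MyGraph; // hypothetical client graph type

namespace llvm {
template <>
struct DOTGraphTraits<const MyGraph *> : public DefaultDOTGraphTraits {
  explicit DOTGraphTraits(bool IsSimple = false)
      : DefaultDOTGraphTraits(IsSimple) {}

  // Opt in to HTML-like labels so node labels may carry colors, tables, etc.
  static bool renderNodesUsingHTML() { return true; }
};
} // namespace llvm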
diff --git a/llvm/include/llvm/Support/DataExtractor.h b/llvm/include/llvm/Support/DataExtractor.h
index f9335c161563..f4f5905d4bcc 100644
--- a/llvm/include/llvm/Support/DataExtractor.h
+++ b/llvm/include/llvm/Support/DataExtractor.h
@@ -70,6 +70,9 @@ public:
/// the position of the Cursor before the first error was encountered.
uint64_t tell() const { return Offset; }
+ /// Set the cursor to the new offset. This does not impact the error state.
+ void seek(uint64_t NewOffSet) { Offset = NewOffSet; }
+
/// Return error contained inside this Cursor, if any. Clears the internal
/// Cursor state.
Error takeError() { return std::move(Err); }
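A small sketch of the new Cursor::seek() in terms of the existing DataExtractor API: jump past a region while the cursor keeps accumulating any error state.

#include "llvm/Support/DataExtractor.h"

using namespace llvm;

static uint32_t readTrailerWord(StringRef Bytes, uint64_t TrailerOffset) {
  DataExtractor DE(Bytes, /*IsLittleEndian=*/true, /*AddressSize=*/8);
  DataExtractor::Cursor C(0);
  uint32_t Magic = DE.getU32(C); // read the leading word
  (void)Magic;
  C.seek(TrailerOffset);         // jump straight to the trailer
  uint32_t Trailer = DE.getU32(C);
  if (Error E = C.takeError()) { // errors from either read surface here
    consumeError(std::move(E));
    return 0;
  }
  return Trailer;
}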
diff --git a/llvm/include/llvm/Support/Debug.h b/llvm/include/llvm/Support/Debug.h
index 64b730951bda..2ff978476c79 100644
--- a/llvm/include/llvm/Support/Debug.h
+++ b/llvm/include/llvm/Support/Debug.h
@@ -78,27 +78,6 @@ void setCurrentDebugTypes(const char **Types, unsigned Count);
///
extern bool DebugFlag;
-/// \name Verification flags.
-///
-/// These flags turns on/off that are expensive and are turned off by default,
-/// unless macro EXPENSIVE_CHECKS is defined. The flags allow selectively
-/// turning the checks on without need to recompile.
-/// \{
-
-/// Enables verification of dominator trees.
-///
-extern bool VerifyDomInfo;
-
-/// Enables verification of loop info.
-///
-extern bool VerifyLoopInfo;
-
-/// Enables verification of MemorySSA.
-///
-extern bool VerifyMemorySSA;
-
-///\}
-
/// EnableDebugBuffering - This defaults to false. If true, the debug
/// stream will install signal handlers to dump any buffered debug
/// output. It allows clients to selectively allow the debug stream
diff --git a/llvm/include/llvm/Support/DivisionByConstantInfo.h b/llvm/include/llvm/Support/DivisionByConstantInfo.h
new file mode 100644
index 000000000000..5bb326178c3e
--- /dev/null
+++ b/llvm/include/llvm/Support/DivisionByConstantInfo.h
@@ -0,0 +1,38 @@
+//== llvm/Support/DivisionByConstantInfo.h - division by constant -*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file implements support for optimizing divisions by a constant.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_DIVISON_BY_CONSTANT_INFO_H
+#define LLVM_SUPPORT_DIVISON_BY_CONSTANT_INFO_H
+
+#include "llvm/ADT/APInt.h"
+
+namespace llvm {
+
+/// Magic data for optimizing signed division by a constant.
+struct SignedDivisionByConstantInfo {
+ static SignedDivisionByConstantInfo get(const APInt &D);
+ APInt Magic; ///< magic number
+ unsigned ShiftAmount; ///< shift amount
+};
+
+/// Magic data for optimizing unsigned division by a constant.
+struct UnsignedDivisonByConstantInfo {
+ static UnsignedDivisonByConstantInfo get(const APInt &D,
+ unsigned LeadingZeros = 0);
+ APInt Magic; ///< magic number
+ bool IsAdd; ///< add indicator
+ unsigned ShiftAmount; ///< shift amount
+};
+
+} // namespace llvm
+
+#endif
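For context, this is how a client might query the magic constants declared above (a sketch, not part of the patch; the chosen divisor is arbitrary):

    #include "llvm/ADT/APInt.h"
    #include "llvm/Support/DivisionByConstantInfo.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      // Constants that let a backend rewrite sdiv-by-7 as mulhs + shifts.
      APInt Divisor(32, 7, /*isSigned=*/true);
      SignedDivisionByConstantInfo Info =
          SignedDivisionByConstantInfo::get(Divisor);
      outs() << "magic=" << Info.Magic << " shift=" << Info.ShiftAmount << "\n";
      return 0;
    }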
diff --git a/llvm/include/llvm/Support/Error.h b/llvm/include/llvm/Support/Error.h
index e8f340e452ef..e2002b89ada2 100644
--- a/llvm/include/llvm/Support/Error.h
+++ b/llvm/include/llvm/Support/Error.h
@@ -257,8 +257,7 @@ private:
// of debug prints can cause the function to be too large for inlining. So
// it's important that we define this function out of line so that it can't be
// inlined.
- LLVM_ATTRIBUTE_NORETURN
- void fatalUncheckedError() const;
+ [[noreturn]] void fatalUncheckedError() const;
#endif
void assertIsChecked() {
@@ -314,7 +313,7 @@ private:
}
friend raw_ostream &operator<<(raw_ostream &OS, const Error &E) {
- if (auto P = E.getPtr())
+ if (auto *P = E.getPtr())
P->log(OS);
else
OS << "success";
@@ -374,7 +373,7 @@ class ErrorList final : public ErrorInfo<ErrorList> {
public:
void log(raw_ostream &OS) const override {
OS << "Multiple errors:\n";
- for (auto &ErrPayload : Payloads) {
+ for (const auto &ErrPayload : Payloads) {
ErrPayload->log(OS);
OS << "\n";
}
@@ -578,6 +577,16 @@ public:
return const_cast<Expected<T> *>(this)->get();
}
+ /// Returns \a takeError() after moving the held T (if any) into \p Value.
+ template <class OtherT>
+ Error moveInto(OtherT &Value,
+ std::enable_if_t<std::is_assignable<OtherT &, T &&>::value> * =
+ nullptr) && {
+ if (*this)
+ Value = std::move(get());
+ return takeError();
+ }
+
/// Check that this Expected<T> is an error of type ErrT.
template <typename ErrT> bool errorIsA() const {
return HasError && (*getErrorStorage())->template isA<ErrT>();
@@ -688,9 +697,7 @@ private:
}
#if LLVM_ENABLE_ABI_BREAKING_CHECKS
- LLVM_ATTRIBUTE_NORETURN
- LLVM_ATTRIBUTE_NOINLINE
- void fatalUncheckedExpected() const {
+ [[noreturn]] LLVM_ATTRIBUTE_NOINLINE void fatalUncheckedExpected() const {
dbgs() << "Expected<T> must be checked before access or destruction.\n";
if (HasError) {
dbgs() << "Unchecked Expected<T> contained error:\n";
@@ -722,8 +729,7 @@ private:
/// Report a serious error, calling any installed error handler. See
/// ErrorHandling.h.
-LLVM_ATTRIBUTE_NORETURN void report_fatal_error(Error Err,
- bool gen_crash_diag = true);
+[[noreturn]] void report_fatal_error(Error Err, bool gen_crash_diag = true);
/// Report a fatal error if Err is a failure value.
///
@@ -1159,7 +1165,7 @@ protected:
/// It should only be used in this situation, and should never be used where a
/// sensible conversion to std::error_code is available, as attempts to convert
/// to/from this error will result in a fatal error. (i.e. it is a programmatic
-///error to try to convert such a value).
+/// error to try to convert such a value).
std::error_code inconvertibleErrorCode();
/// Helper for converting an std::error_code to a Error.
@@ -1263,13 +1269,20 @@ class FileError final : public ErrorInfo<FileError> {
public:
void log(raw_ostream &OS) const override {
- assert(Err && !FileName.empty() && "Trying to log after takeError().");
+ assert(Err && "Trying to log after takeError().");
OS << "'" << FileName << "': ";
if (Line.hasValue())
OS << "line " << Line.getValue() << ": ";
Err->log(OS);
}
+ std::string messageWithoutFileInfo() const {
+ std::string Msg;
+ raw_string_ostream OS(Msg);
+ Err->log(OS);
+ return OS.str();
+ }
+
StringRef getFileName() { return FileName; }
Error takeError() { return Error(std::move(Err)); }
@@ -1283,8 +1296,6 @@ private:
FileError(const Twine &F, Optional<size_t> LineNum,
std::unique_ptr<ErrorInfoBase> E) {
assert(E && "Cannot create FileError from Error success value.");
- assert(!F.isTriviallyEmpty() &&
- "The file name provided to FileError must not be empty.");
FileName = F.str();
Err = std::move(E);
Line = std::move(LineNum);
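The new `Expected<T>::moveInto()` collapses the usual take-value-or-propagate-error dance into one statement. A sketch (the `readName` producer is hypothetical):

    #include "llvm/Support/Error.h"
    #include <string>

    using namespace llvm;

    static Expected<std::string> readName(bool Fail) {
      if (Fail)
        return createStringError(inconvertibleErrorCode(), "no name available");
      return std::string("widget");
    }

    static Error useName() {
      std::string Name;
      // On success the value is moved into Name; either way the Error is
      // returned, so one `if` both assigns and propagates.
      if (Error E = readName(false).moveInto(Name))
        return E;
      // ... use Name ...
      return Error::success();
    }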
diff --git a/llvm/include/llvm/Support/ErrorHandling.h b/llvm/include/llvm/Support/ErrorHandling.h
index 0ec0242d569d..f980510d37f0 100644
--- a/llvm/include/llvm/Support/ErrorHandling.h
+++ b/llvm/include/llvm/Support/ErrorHandling.h
@@ -15,15 +15,14 @@
#define LLVM_SUPPORT_ERRORHANDLING_H
#include "llvm/Support/Compiler.h"
-#include <string>
namespace llvm {
-class StringRef;
+ class StringRef;
class Twine;
/// An error handler callback.
typedef void (*fatal_error_handler_t)(void *user_data,
- const std::string& reason,
+ const char *reason,
bool gen_crash_diag);
/// install_fatal_error_handler - Installs a new error handler to be used
@@ -68,14 +67,13 @@ class StringRef;
/// standard error, followed by a newline.
/// After the error handler is called this function will call abort(), it
/// does not return.
-LLVM_ATTRIBUTE_NORETURN void report_fatal_error(const char *reason,
- bool gen_crash_diag = true);
-LLVM_ATTRIBUTE_NORETURN void report_fatal_error(const std::string &reason,
- bool gen_crash_diag = true);
-LLVM_ATTRIBUTE_NORETURN void report_fatal_error(StringRef reason,
- bool gen_crash_diag = true);
-LLVM_ATTRIBUTE_NORETURN void report_fatal_error(const Twine &reason,
- bool gen_crash_diag = true);
+/// NOTE: The std::string variant was removed to avoid a <string> dependency.
+[[noreturn]] void report_fatal_error(const char *reason,
+ bool gen_crash_diag = true);
+[[noreturn]] void report_fatal_error(StringRef reason,
+ bool gen_crash_diag = true);
+[[noreturn]] void report_fatal_error(const Twine &reason,
+ bool gen_crash_diag = true);
/// Installs a new bad alloc error handler that should be used whenever a
/// bad alloc error, e.g. failing malloc/calloc, is encountered by LLVM.
@@ -113,13 +111,13 @@ void install_out_of_memory_new_handler();
/// If no error handler is installed (default), throws a bad_alloc exception
/// if LLVM is compiled with exception support. Otherwise prints the error
/// to standard error and calls abort().
-LLVM_ATTRIBUTE_NORETURN void report_bad_alloc_error(const char *Reason,
- bool GenCrashDiag = true);
+[[noreturn]] void report_bad_alloc_error(const char *Reason,
+ bool GenCrashDiag = true);
/// This function calls abort(), and prints the optional message to stderr.
/// Use the llvm_unreachable macro (that adds location info), instead of
/// calling this function directly.
-LLVM_ATTRIBUTE_NORETURN void
+[[noreturn]] void
llvm_unreachable_internal(const char *msg = nullptr, const char *file = nullptr,
unsigned line = 0);
}
diff --git a/llvm/include/llvm/Support/ExtensibleRTTI.h b/llvm/include/llvm/Support/ExtensibleRTTI.h
index 6b8510ce759f..21055247e932 100644
--- a/llvm/include/llvm/Support/ExtensibleRTTI.h
+++ b/llvm/include/llvm/Support/ExtensibleRTTI.h
@@ -1,9 +1,8 @@
//===-- llvm/Support/ExtensibleRTTI.h - ExtensibleRTTI support --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/llvm/include/llvm/Support/FileSystem.h b/llvm/include/llvm/Support/FileSystem.h
index 38779ef4a3af..1a049533b82b 100644
--- a/llvm/include/llvm/Support/FileSystem.h
+++ b/llvm/include/llvm/Support/FileSystem.h
@@ -772,7 +772,8 @@ enum OpenFlags : unsigned {
/// The file should be opened in append mode.
OF_Append = 4,
- /// Delete the file on close. Only makes a difference on windows.
+ /// The returned handle can be used for deleting the file. Only makes a
+ /// difference on Windows.
OF_Delete = 8,
/// When a child process is launched, this file should remain open in the
@@ -865,6 +866,11 @@ public:
// The open file descriptor.
int FD = -1;
+#ifdef _WIN32
+ // Whether we need to manually remove the file on close.
+ bool RemoveOnClose = false;
+#endif
+
// Keep this with the given name.
Error keep(const Twine &Name);
diff --git a/llvm/include/llvm/Support/FileSystem/UniqueID.h b/llvm/include/llvm/Support/FileSystem/UniqueID.h
index 229410c8292e..0d5367236e8d 100644
--- a/llvm/include/llvm/Support/FileSystem/UniqueID.h
+++ b/llvm/include/llvm/Support/FileSystem/UniqueID.h
@@ -14,7 +14,10 @@
#ifndef LLVM_SUPPORT_FILESYSTEM_UNIQUEID_H
#define LLVM_SUPPORT_FILESYSTEM_UNIQUEID_H
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/Hashing.h"
#include <cstdint>
+#include <utility>
namespace llvm {
namespace sys {
@@ -47,6 +50,30 @@ public:
} // end namespace fs
} // end namespace sys
+
+// Support UniqueIDs as DenseMap keys.
+template <> struct DenseMapInfo<llvm::sys::fs::UniqueID> {
+ static inline llvm::sys::fs::UniqueID getEmptyKey() {
+ auto EmptyKey = DenseMapInfo<std::pair<uint64_t, uint64_t>>::getEmptyKey();
+ return {EmptyKey.first, EmptyKey.second};
+ }
+
+ static inline llvm::sys::fs::UniqueID getTombstoneKey() {
+ auto TombstoneKey =
+ DenseMapInfo<std::pair<uint64_t, uint64_t>>::getTombstoneKey();
+ return {TombstoneKey.first, TombstoneKey.second};
+ }
+
+ static hash_code getHashValue(const llvm::sys::fs::UniqueID &Tag) {
+ return hash_value(std::make_pair(Tag.getDevice(), Tag.getFile()));
+ }
+
+ static bool isEqual(const llvm::sys::fs::UniqueID &LHS,
+ const llvm::sys::fs::UniqueID &RHS) {
+ return LHS == RHS;
+ }
+};
+
} // end namespace llvm
#endif // LLVM_SUPPORT_FILESYSTEM_UNIQUEID_H
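With the specialization above, `sys::fs::UniqueID` can be used directly as a DenseMap key, for example to deduplicate files reached through different paths (a sketch; the map's payload is arbitrary):

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/Support/FileSystem/UniqueID.h"

    using namespace llvm;

    // Returns true the first time a given (device, inode) pair is seen.
    static bool recordFile(DenseMap<sys::fs::UniqueID, unsigned> &Seen,
                           sys::fs::UniqueID ID) {
      return Seen.try_emplace(ID, Seen.size()).second;
    }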
diff --git a/llvm/include/llvm/Support/FormatVariadic.h b/llvm/include/llvm/Support/FormatVariadic.h
index 094b054f773f..89575f01b717 100644
--- a/llvm/include/llvm/Support/FormatVariadic.h
+++ b/llvm/include/llvm/Support/FormatVariadic.h
@@ -94,7 +94,7 @@ public:
continue;
}
- auto W = Adapters[R.Index];
+ auto *W = Adapters[R.Index];
FmtAlign Align(*W, R.Where, R.Align, R.Pad);
Align.format(S, R.Options);
diff --git a/llvm/include/llvm/Support/GenericDomTreeConstruction.h b/llvm/include/llvm/Support/GenericDomTreeConstruction.h
index d306ebe99bc1..e504a0eddeba 100644
--- a/llvm/include/llvm/Support/GenericDomTreeConstruction.h
+++ b/llvm/include/llvm/Support/GenericDomTreeConstruction.h
@@ -78,7 +78,7 @@ struct SemiNCAInfo {
using UpdateT = typename DomTreeT::UpdateType;
using UpdateKind = typename DomTreeT::UpdateKind;
struct BatchUpdateInfo {
- // Note: Updates inside PreViewCFG are aleady legalized.
+ // Note: Updates inside PreViewCFG are already legalized.
BatchUpdateInfo(GraphDiffT &PreViewCFG, GraphDiffT *PostViewCFG = nullptr)
: PreViewCFG(PreViewCFG), PostViewCFG(PostViewCFG),
NumLegalized(PreViewCFG.getNumLegalizedUpdates()) {}
@@ -430,7 +430,6 @@ struct SemiNCAInfo {
// is unreachable. This is because we are still going to only visit each
// unreachable node once, we may just visit it in two directions,
// depending on how lucky we get.
- SmallPtrSet<NodePtr, 4> ConnectToExitBlock;
for (const NodePtr I : nodes(DT.Parent)) {
if (SNCA.NodeToInfo.count(I) == 0) {
LLVM_DEBUG(dbgs()
@@ -457,7 +456,6 @@ struct SemiNCAInfo {
LLVM_DEBUG(dbgs() << "\t\t\tFound a new furthest away node "
<< "(non-trivial root): "
<< BlockNamePrinter(FurthestAway) << "\n");
- ConnectToExitBlock.insert(FurthestAway);
Roots.push_back(FurthestAway);
LLVM_DEBUG(dbgs() << "\t\t\tPrev DFSNum: " << Num << ", new DFSNum: "
<< NewNum << "\n\t\t\tRemoving DFS info\n");
diff --git a/llvm/include/llvm/Support/GraphWriter.h b/llvm/include/llvm/Support/GraphWriter.h
index b886bf45f474..11a31bf40160 100644
--- a/llvm/include/llvm/Support/GraphWriter.h
+++ b/llvm/include/llvm/Support/GraphWriter.h
@@ -66,6 +66,7 @@ template<typename GraphType>
class GraphWriter {
raw_ostream &O;
const GraphType &G;
+ bool RenderUsingHTML = false;
using DOTTraits = DOTGraphTraits<GraphType>;
using GTraits = GraphTraits<GraphType>;
@@ -86,6 +87,9 @@ class GraphWriter {
child_iterator EE = GTraits::child_end(Node);
bool hasEdgeSourceLabels = false;
+ if (RenderUsingHTML)
+ O << "</tr><tr>";
+
for (unsigned i = 0; EI != EE && i != 64; ++EI, ++i) {
std::string label = DTraits.getEdgeSourceLabel(Node, EI);
@@ -94,14 +98,22 @@ class GraphWriter {
hasEdgeSourceLabels = true;
- if (i)
- O << "|";
+ if (RenderUsingHTML)
+ O << "<td colspan=\"1\" port=\"s" << i << "\">" << label << "</td>";
+ else {
+ if (i)
+ O << "|";
- O << "<s" << i << ">" << DOT::EscapeString(label);
+ O << "<s" << i << ">" << DOT::EscapeString(label);
+ }
}
- if (EI != EE && hasEdgeSourceLabels)
- O << "|<s64>truncated...";
+ if (EI != EE && hasEdgeSourceLabels) {
+ if (RenderUsingHTML)
+ O << "<td colspan=\"1\" port=\"s64\">truncated...</td>";
+ else
+ O << "|<s64>truncated...";
+ }
return hasEdgeSourceLabels;
}
@@ -109,6 +121,7 @@ class GraphWriter {
public:
GraphWriter(raw_ostream &o, const GraphType &g, bool SN) : O(o), G(g) {
DTraits = DOTTraits(SN);
+ RenderUsingHTML = DTraits.renderNodesUsingHTML();
}
void writeGraph(const std::string &Title = "") {
@@ -163,12 +176,39 @@ public:
void writeNode(NodeRef Node) {
std::string NodeAttributes = DTraits.getNodeAttributes(Node, G);
- O << "\tNode" << static_cast<const void*>(Node) << " [shape=record,";
+ O << "\tNode" << static_cast<const void *>(Node) << " [shape=";
+ if (RenderUsingHTML)
+ O << "none,";
+ else
+ O << "record,";
+
if (!NodeAttributes.empty()) O << NodeAttributes << ",";
- O << "label=\"{";
+ O << "label=";
+
+ if (RenderUsingHTML) {
+ // Count the number of edges out of the node to determine how
+ // many columns to span (max 64)
+ unsigned ColSpan = 0;
+ child_iterator EI = GTraits::child_begin(Node);
+ child_iterator EE = GTraits::child_end(Node);
+ for (; EI != EE && ColSpan != 64; ++EI, ++ColSpan)
+ ;
+ if (ColSpan == 0)
+ ColSpan = 1;
+ // Include truncated messages when counting.
+ if (EI != EE)
+ ++ColSpan;
+ O << "<<table border=\"0\" cellborder=\"1\" cellspacing=\"0\""
+ << " cellpadding=\"0\"><tr><td align=\"text\" colspan=\"" << ColSpan
+ << "\">";
+ } else
+ O << "\"{";
if (!DTraits.renderGraphFromBottomUp()) {
- O << DOT::EscapeString(DTraits.getNodeLabel(Node, G));
+ if (RenderUsingHTML)
+ O << DTraits.getNodeLabel(Node, G) << "</td>";
+ else
+ O << DOT::EscapeString(DTraits.getNodeLabel(Node, G));
// If we should include the address of the node in the label, do so now.
std::string Id = DTraits.getNodeIdentifierLabel(Node, G);
@@ -185,15 +225,25 @@ public:
bool hasEdgeSourceLabels = getEdgeSourceLabels(EdgeSourceLabels, Node);
if (hasEdgeSourceLabels) {
- if (!DTraits.renderGraphFromBottomUp()) O << "|";
-
- O << "{" << EdgeSourceLabels.str() << "}";
-
- if (DTraits.renderGraphFromBottomUp()) O << "|";
+ if (!DTraits.renderGraphFromBottomUp())
+ if (!RenderUsingHTML)
+ O << "|";
+
+ if (RenderUsingHTML)
+ O << EdgeSourceLabels.str();
+ else
+ O << "{" << EdgeSourceLabels.str() << "}";
+
+ if (DTraits.renderGraphFromBottomUp())
+ if (!RenderUsingHTML)
+ O << "|";
}
if (DTraits.renderGraphFromBottomUp()) {
- O << DOT::EscapeString(DTraits.getNodeLabel(Node, G));
+ if (RenderUsingHTML)
+ O << DTraits.getNodeLabel(Node, G);
+ else
+ O << DOT::EscapeString(DTraits.getNodeLabel(Node, G));
// If we should include the address of the node in the label, do so now.
std::string Id = DTraits.getNodeIdentifierLabel(Node, G);
@@ -215,12 +265,17 @@ public:
<< DOT::EscapeString(DTraits.getEdgeDestLabel(Node, i));
}
- if (i != e)
- O << "|<d64>truncated...";
- O << "}";
+ if (RenderUsingHTML)
+ O << "<td colspan=\"1\">... truncated</td>";
+ else if (i != e)
+ O << "|<d64>truncated...}";
}
- O << "}\"];\n"; // Finish printing the "node" line
+ if (RenderUsingHTML)
+ O << "</tr></table>>";
+ else
+ O << "}\"";
+ O << "];\n"; // Finish printing the "node" line
// Output all of the edges now
child_iterator EI = GTraits::child_begin(Node);
diff --git a/llvm/include/llvm/Support/HashBuilder.h b/llvm/include/llvm/Support/HashBuilder.h
new file mode 100644
index 000000000000..bf93a0d22da7
--- /dev/null
+++ b/llvm/include/llvm/Support/HashBuilder.h
@@ -0,0 +1,438 @@
+//===- llvm/Support/HashBuilder.h - Convenient hashing interface-*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an interface for conveniently building hashes of
+// various data types, without relying on the underlying hasher type to know
+// about hashed data types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_HASHBUILDER_H
+#define LLVM_SUPPORT_HASHBUILDER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/type_traits.h"
+
+#include <iterator>
+#include <utility>
+
+namespace llvm {
+
+namespace hashbuilder_detail {
+/// Trait to indicate whether a type's bits can be hashed directly (after
+/// endianness correction).
+template <typename U>
+struct IsHashableData
+ : std::integral_constant<bool, is_integral_or_enum<U>::value> {};
+
+} // namespace hashbuilder_detail
+
+/// Declares the hasher member, and functions forwarding directly to the hasher.
+template <typename HasherT> class HashBuilderBase {
+public:
+ HasherT &getHasher() { return Hasher; }
+
+ /// Forward to `HasherT::update(ArrayRef<uint8_t>)`.
+ ///
+ /// This may not take the size of `Data` into account.
+ /// Users of this function should take care to respect endianness
+ /// constraints.
+ void update(ArrayRef<uint8_t> Data) { this->getHasher().update(Data); }
+
+ /// Forward to `HasherT::update(ArrayRef<uint8_t>)`.
+ ///
+ /// This may not take the size of `Data` into account.
+ /// Users of this function should take care to respect endianness
+ /// constraints.
+ void update(StringRef Data) {
+ update(makeArrayRef(reinterpret_cast<const uint8_t *>(Data.data()),
+ Data.size()));
+ }
+
+ /// Forward to `HasherT::final()` if available.
+ template <typename HasherT_ = HasherT> StringRef final() {
+ return this->getHasher().final();
+ }
+
+ /// Forward to `HasherT::result()` if available.
+ template <typename HasherT_ = HasherT> StringRef result() {
+ return this->getHasher().result();
+ }
+
+protected:
+ explicit HashBuilderBase(HasherT &Hasher) : Hasher(Hasher) {}
+
+ template <typename... ArgTypes>
+ explicit HashBuilderBase(ArgTypes &&...Args)
+ : OptionalHasher(in_place, std::forward<ArgTypes>(Args)...),
+ Hasher(*OptionalHasher) {}
+
+private:
+ Optional<HasherT> OptionalHasher;
+ HasherT &Hasher;
+};
+
+/// Implementation of the `HashBuilder` interface.
+///
+/// `support::endianness::native` is not supported. `HashBuilder` is
+/// expected to canonicalize `support::endianness::native` to one of
+/// `support::endianness::big` or `support::endianness::little`.
+template <typename HasherT, support::endianness Endianness>
+class HashBuilderImpl : public HashBuilderBase<HasherT> {
+ static_assert(Endianness != support::endianness::native,
+ "HashBuilder should canonicalize endianness");
+
+public:
+ explicit HashBuilderImpl(HasherT &Hasher)
+ : HashBuilderBase<HasherT>(Hasher) {}
+ template <typename... ArgTypes>
+ explicit HashBuilderImpl(ArgTypes &&...Args)
+ : HashBuilderBase<HasherT>(Args...) {}
+
+ /// Implement hashing for hashable data types, e.g. integral or enum values.
+ template <typename T>
+ std::enable_if_t<hashbuilder_detail::IsHashableData<T>::value,
+ HashBuilderImpl &>
+ add(T Value) {
+ return adjustForEndiannessAndAdd(Value);
+ }
+
+ /// Support hashing `ArrayRef`.
+ ///
+ /// `Value.size()` is taken into account to ensure cases like
+ /// ```
+ /// builder.add({1});
+ /// builder.add({2, 3});
+ /// ```
+ /// and
+ /// ```
+ /// builder.add({1, 2});
+ /// builder.add({3});
+ /// ```
+ /// do not collide.
+ template <typename T> HashBuilderImpl &add(ArrayRef<T> Value) {
+ // As of implementation time, simply calling `addRange(Value)` would also go
+ // through the `update` fast path. But that would rely on the implementation
+ // details of `ArrayRef::begin()` and `ArrayRef::end()`. Explicitly call
+ // `update` to guarantee the fast path.
+ add(Value.size());
+ if (hashbuilder_detail::IsHashableData<T>::value &&
+ Endianness == support::endian::system_endianness()) {
+ this->update(
+ makeArrayRef(reinterpret_cast<const uint8_t *>(Value.begin()),
+ Value.size() * sizeof(T)));
+ } else {
+ for (auto &V : Value)
+ add(V);
+ }
+ return *this;
+ }
+
+ /// Support hashing `StringRef`.
+ ///
+ /// `Value.size()` is taken into account to ensure cases like
+ /// ```
+ /// builder.add("a");
+ /// builder.add("bc");
+ /// ```
+ /// and
+ /// ```
+ /// builder.add("ab");
+ /// builder.add("c");
+ /// ```
+ /// do not collide.
+ HashBuilderImpl &add(StringRef Value) {
+ // As of implementation time, simply calling `addRange(Value)` would also go
+ // through `update`. But that would rely on the implementation of
+ // `StringRef::begin()` and `StringRef::end()`. Explicitly call `update` to
+ // guarantee the fast path.
+ add(Value.size());
+ this->update(makeArrayRef(reinterpret_cast<const uint8_t *>(Value.begin()),
+ Value.size()));
+ return *this;
+ }
+
+ template <typename T>
+ using HasAddHashT =
+ decltype(addHash(std::declval<HashBuilderImpl &>(), std::declval<T &>()));
+ /// Implement hashing for user-defined `struct`s.
+ ///
+ /// Any user-defined `struct` can participate in hashing via `HashBuilder` by
+ /// providing an `addHash` templated function.
+ ///
+ /// ```
+ /// template <typename HasherT, support::endianness Endianness>
+ /// void addHash(HashBuilder<HasherT, Endianness> &HBuilder,
+ /// const UserDefinedStruct &Value);
+ /// ```
+ ///
+ /// For example:
+ /// ```
+ /// struct SimpleStruct {
+ /// char c;
+ /// int i;
+ /// };
+ ///
+ /// template <typename HasherT, support::endianness Endianness>
+ /// void addHash(HashBuilderImpl<HasherT, Endianness> &HBuilder,
+ /// const SimpleStruct &Value) {
+ /// HBuilder.add(Value.c);
+ /// HBuilder.add(Value.i);
+ /// }
+ /// ```
+ ///
+ /// To avoid endianness issues, specializations of `addHash` should
+ /// generally rely on existing `add`, `addRange`, and `addRangeElements`
+ /// functions. If directly using `update`, an implementation must correctly
+ /// handle endianness.
+ ///
+ /// ```
+ /// struct __attribute__ ((packed)) StructWithFastHash {
+ /// int I;
+ /// char C;
+ ///
+ /// // If possible, we want to hash both `I` and `C` in a single
+ /// // `update` call for performance concerns.
+ /// template <typename HasherT, support::endianness Endianness>
+ /// friend void addHash(HashBuilderImpl<HasherT, Endianness> &HBuilder,
+ /// const StructWithFastHash &Value) {
+ /// if (Endianness == support::endian::system_endianness()) {
+ /// HBuilder.update(makeArrayRef(
+ /// reinterpret_cast<const uint8_t *>(&Value), sizeof(Value)));
+ /// } else {
+ /// // Rely on existing `add` methods to handle endianness.
+ /// HBuilder.add(Value.I);
+ /// HBuilder.add(Value.C);
+ /// }
+ /// }
+ /// };
+ /// ```
+ ///
+ /// To avoid collisions, specialization of `addHash` for variable-size
+ /// types must take the size into account.
+ ///
+ /// For example:
+ /// ```
+ /// struct CustomContainer {
+ /// private:
+ /// size_t Size;
+ /// int Elements[100];
+ ///
+ /// public:
+ /// CustomContainer(size_t Size) : Size(Size) {
+ /// for (size_t I = 0; I != Size; ++I)
+ /// Elements[I] = I;
+ /// }
+ /// template <typename HasherT, support::endianness Endianness>
+ /// friend void addHash(HashBuilderImpl<HasherT, Endianness> &HBuilder,
+ /// const CustomContainer &Value) {
+ /// if (Endianness == support::endian::system_endianness()) {
+ /// HBuilder.update(makeArrayRef(
+ /// reinterpret_cast<const uint8_t *>(&Value.Size),
+ /// sizeof(Value.Size) + Value.Size * sizeof(Value.Elements[0])));
+ /// } else {
+ /// // `addRange` will take care of encoding the size.
+ /// HBuilder.addRange(&Value.Elements[0], &Value.Elements[0] +
+ /// Value.Size);
+ /// }
+ /// }
+ /// };
+ /// ```
+ template <typename T>
+ std::enable_if_t<is_detected<HasAddHashT, T>::value &&
+ !hashbuilder_detail::IsHashableData<T>::value,
+ HashBuilderImpl &>
+ add(const T &Value) {
+ addHash(*this, Value);
+ return *this;
+ }
+
+ template <typename T1, typename T2>
+ HashBuilderImpl &add(const std::pair<T1, T2> &Value) {
+ add(Value.first);
+ add(Value.second);
+ return *this;
+ }
+
+ template <typename... Ts> HashBuilderImpl &add(const std::tuple<Ts...> &Arg) {
+ return addTupleHelper(Arg, typename std::index_sequence_for<Ts...>());
+ }
+
+ /// A convenience variadic helper.
+ /// It simply iterates over its arguments, in order.
+ /// ```
+ /// add(Arg1, Arg2);
+ /// ```
+ /// is equivalent to
+ /// ```
+ /// add(Arg1)
+ /// add(Arg2)
+ /// ```
+ template <typename T, typename... Ts>
+ typename std::enable_if<(sizeof...(Ts) >= 1), HashBuilderImpl &>::type
+ add(const T &FirstArg, const Ts &...Args) {
+ add(FirstArg);
+ add(Args...);
+ return *this;
+ }
+
+ template <typename ForwardIteratorT>
+ HashBuilderImpl &addRange(ForwardIteratorT First, ForwardIteratorT Last) {
+ add(std::distance(First, Last));
+ return addRangeElements(First, Last);
+ }
+
+ template <typename RangeT> HashBuilderImpl &addRange(const RangeT &Range) {
+ return addRange(adl_begin(Range), adl_end(Range));
+ }
+
+ template <typename ForwardIteratorT>
+ HashBuilderImpl &addRangeElements(ForwardIteratorT First,
+ ForwardIteratorT Last) {
+ return addRangeElementsImpl(
+ First, Last,
+ typename std::iterator_traits<ForwardIteratorT>::iterator_category());
+ }
+
+ template <typename RangeT>
+ HashBuilderImpl &addRangeElements(const RangeT &Range) {
+ return addRangeElements(adl_begin(Range), adl_end(Range));
+ }
+
+ template <typename T>
+ using HasByteSwapT = decltype(support::endian::byte_swap(
+ std::declval<T &>(), support::endianness::little));
+ /// Adjust `Value` for the target endianness and add it to the hash.
+ template <typename T>
+ std::enable_if_t<is_detected<HasByteSwapT, T>::value, HashBuilderImpl &>
+ adjustForEndiannessAndAdd(const T &Value) {
+ T SwappedValue = support::endian::byte_swap(Value, Endianness);
+ this->update(makeArrayRef(reinterpret_cast<const uint8_t *>(&SwappedValue),
+ sizeof(SwappedValue)));
+ return *this;
+ }
+
+private:
+ template <typename... Ts, std::size_t... Indices>
+ HashBuilderImpl &addTupleHelper(const std::tuple<Ts...> &Arg,
+ std::index_sequence<Indices...>) {
+ add(std::get<Indices>(Arg)...);
+ return *this;
+ }
+
+ // FIXME: Once available, specialize this function for `contiguous_iterator`s,
+ // and use it for `ArrayRef` and `StringRef`.
+ template <typename ForwardIteratorT>
+ HashBuilderImpl &addRangeElementsImpl(ForwardIteratorT First,
+ ForwardIteratorT Last,
+ std::forward_iterator_tag) {
+ for (auto It = First; It != Last; ++It)
+ add(*It);
+ return *this;
+ }
+
+ template <typename T>
+ std::enable_if_t<hashbuilder_detail::IsHashableData<T>::value &&
+ Endianness == support::endian::system_endianness(),
+ HashBuilderImpl &>
+ addRangeElementsImpl(T *First, T *Last, std::forward_iterator_tag) {
+ this->update(makeArrayRef(reinterpret_cast<const uint8_t *>(First),
+ (Last - First) * sizeof(T)));
+ return *this;
+ }
+};
+
+/// Interface to help hash various types through a hasher type.
+///
+/// Via provided specializations of `add`, `addRange`, and `addRangeElements`
+/// functions, various types (e.g. `ArrayRef`, `StringRef`, etc.) can be hashed
+/// without requiring any knowledge of hashed types from the hasher type.
+///
+/// The only method expected from the templated hasher type `HasherT` is:
+/// * void update(ArrayRef<uint8_t> Data)
+///
+/// Additionally, the following methods will be forwarded to the hasher type:
+/// * decltype(std::declval<HasherT &>().final()) final()
+/// * decltype(std::declval<HasherT &>().result()) result()
+///
+/// From a user point of view, the interface provides the following:
+/// * `template<typename T> add(const T &Value)`
+/// The `add` function implements hashing of various types.
+/// * `template <typename ItT> void addRange(ItT First, ItT Last)`
+/// The `addRange` function is designed to aid hashing a range of values.
+/// It explicitly adds the size of the range in the hash.
+/// * `template <typename ItT> void addRangeElements(ItT First, ItT Last)`
+/// The `addRangeElements` function is also designed to aid hashing a range of
+/// values. In contrast to `addRange`, it **ignores** the size of the range,
+/// behaving as if elements were added one at a time with `add`.
+///
+/// User-defined `struct` types can participate in this interface by providing
+/// an `addHash` templated function. See the associated template specialization
+/// for details.
+///
+/// This interface does not impose requirements on the hasher
+/// `update(ArrayRef<uint8_t> Data)` method. We want to avoid collisions for
+/// variable-size types; for example for
+/// ```
+/// builder.add({1});
+/// builder.add({2, 3});
+/// ```
+/// and
+/// ```
+/// builder.add({1, 2});
+/// builder.add({3});
+/// ```
+/// . Thus, specializations of `add` and `addHash` for variable-size types must
+/// not assume that the hasher type considers the size as part of the hash; they
+/// must explicitly add the size to the hash. See for example specializations
+/// for `ArrayRef` and `StringRef`.
+///
+/// Additionally, since types are eventually forwarded to the hasher's
+/// `void update(ArrayRef<uint8_t>)` method, endianness plays a role in the hash
+/// computation (for example when computing `add((int)123)`).
+/// Specifying a non-`native` `Endianness` template parameter allows computing a
+/// stable hash across platforms with different endianness.
+template <class HasherT, support::endianness Endianness>
+using HashBuilder =
+ HashBuilderImpl<HasherT, (Endianness == support::endianness::native
+ ? support::endian::system_endianness()
+ : Endianness)>;
+
+namespace hashbuilder_detail {
+class HashCodeHasher {
+public:
+ HashCodeHasher() : Code(0) {}
+ void update(ArrayRef<uint8_t> Data) {
+ hash_code DataCode = hash_value(Data);
+ Code = hash_combine(Code, DataCode);
+ }
+ hash_code Code;
+};
+
+using HashCodeHashBuilder = HashBuilder<hashbuilder_detail::HashCodeHasher,
+ support::endianness::native>;
+} // namespace hashbuilder_detail
+
+/// Provide a default implementation of `hash_value` when `addHash(const T &)`
+/// is supported.
+template <typename T>
+std::enable_if_t<
+ is_detected<hashbuilder_detail::HashCodeHashBuilder::HasAddHashT, T>::value,
+ hash_code>
+hash_value(const T &Value) {
+ hashbuilder_detail::HashCodeHashBuilder HBuilder;
+ HBuilder.add(Value);
+ return HBuilder.getHasher().Code;
+}
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_HASHBUILDER_H
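To make the interface concrete, here is a sketch of hashing a small user-defined struct with the MD5 hasher (which gains compatible `final()`/`result()` overloads elsewhere in this patch); `FileKey` and its fields are hypothetical:

    #include "llvm/ADT/SmallString.h"
    #include "llvm/Support/HashBuilder.h"
    #include "llvm/Support/MD5.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>

    using namespace llvm;

    struct FileKey {
      uint64_t Size;
      std::string Path;

      // Customization point: makes FileKey hashable through HashBuilder.
      template <typename HasherT, support::endianness Endianness>
      friend void addHash(HashBuilderImpl<HasherT, Endianness> &HBuilder,
                          const FileKey &K) {
        HBuilder.add(K.Size, StringRef(K.Path));
      }
    };

    int main() {
      // Fixing little-endian keeps the digest stable across hosts.
      HashBuilder<MD5, support::endianness::little> HBuilder;
      HBuilder.add(FileKey{4096, "a.o"});

      MD5::MD5Result Digest;
      HBuilder.getHasher().final(Digest);
      SmallString<32> Hex;
      MD5::stringifyResult(Digest, Hex);
      outs() << Hex << "\n";
      return 0;
    }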
diff --git a/llvm/include/llvm/Support/JSON.h b/llvm/include/llvm/Support/JSON.h
index c753cee60ec1..469f50be40e0 100644
--- a/llvm/include/llvm/Support/JSON.h
+++ b/llvm/include/llvm/Support/JSON.h
@@ -234,7 +234,7 @@ inline bool operator!=(const Array &L, const Array &R) { return !(L == R); }
/// Each Value is one of the JSON kinds:
/// null (nullptr_t)
/// boolean (bool)
-/// number (double or int64)
+/// number (double, int64 or uint64)
/// string (StringRef)
/// array (json::Array)
/// object (json::Object)
@@ -342,9 +342,20 @@ public:
Value(T B) : Type(T_Boolean) {
create<bool>(B);
}
- // Integers (except boolean). Must be non-narrowing convertible to int64_t.
+
+ // Unsigned 64-bit long integers.
+ template <typename T,
+ typename = std::enable_if_t<std::is_same<T, uint64_t>::value>,
+ bool = false, bool = false>
+ Value(T V) : Type(T_UINT64) {
+ create<uint64_t>(uint64_t{V});
+ }
+
+ // Integers (except boolean and uint64_t).
+ // Must be non-narrowing convertible to int64_t.
template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>,
- typename = std::enable_if_t<!std::is_same<T, bool>::value>>
+ typename = std::enable_if_t<!std::is_same<T, bool>::value>,
+ typename = std::enable_if_t<!std::is_same<T, uint64_t>::value>>
Value(T I) : Type(T_Integer) {
create<int64_t>(int64_t{I});
}
@@ -382,6 +393,7 @@ public:
return Boolean;
case T_Double:
case T_Integer:
+ case T_UINT64:
return Number;
case T_String:
case T_StringRef:
@@ -410,6 +422,8 @@ public:
return as<double>();
if (LLVM_LIKELY(Type == T_Integer))
return as<int64_t>();
+ if (LLVM_LIKELY(Type == T_UINT64))
+ return as<uint64_t>();
return llvm::None;
}
// Succeeds if the Value is a Number, and exactly representable as int64_t.
@@ -425,6 +439,16 @@ public:
}
return llvm::None;
}
+ llvm::Optional<uint64_t> getAsUINT64() const {
+ if (Type == T_UINT64)
+ return as<uint64_t>();
+ else if (Type == T_Integer) {
+ int64_t N = as<int64_t>();
+ if (N >= 0)
+ return as<uint64_t>();
+ }
+ return llvm::None;
+ }
llvm::Optional<llvm::StringRef> getAsString() const {
if (Type == T_String)
return llvm::StringRef(as<std::string>());
@@ -467,11 +491,12 @@ private:
friend class OStream;
- enum ValueType : char {
+ enum ValueType : char16_t {
T_Null,
T_Boolean,
T_Double,
T_Integer,
+ T_UINT64,
T_StringRef,
T_String,
T_Object,
@@ -479,8 +504,9 @@ private:
};
// All members mutable, see moveFrom().
mutable ValueType Type;
- mutable llvm::AlignedCharArrayUnion<bool, double, int64_t, llvm::StringRef,
- std::string, json::Array, json::Object>
+ mutable llvm::AlignedCharArrayUnion<bool, double, int64_t, uint64_t,
+ llvm::StringRef, std::string, json::Array,
+ json::Object>
Union;
friend bool operator==(const Value &, const Value &);
};
@@ -683,6 +709,14 @@ inline bool fromJSON(const Value &E, bool &Out, Path P) {
P.report("expected boolean");
return false;
}
+inline bool fromJSON(const Value &E, uint64_t &Out, Path P) {
+ if (auto S = E.getAsUINT64()) {
+ Out = *S;
+ return true;
+ }
+ P.report("expected uint64_t");
+ return false;
+}
inline bool fromJSON(const Value &E, std::nullptr_t &Out, Path P) {
if (auto S = E.getAsNull()) {
Out = *S;
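A short sketch of the new 64-bit unsigned support in action (values and names are arbitrary):

    #include "llvm/Support/JSON.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      // Values above INT64_MAX now round-trip through json::Value.
      uint64_t Big = 0x8000000000000001ULL;
      json::Value V = Big;
      if (Optional<uint64_t> U = V.getAsUINT64())
        outs() << "stored: " << *U << "\n";

      // fromJSON gains a matching overload for deserialization.
      uint64_t Out = 0;
      json::Path::Root Root;
      if (fromJSON(V, Out, Root))
        outs() << "parsed: " << Out << "\n";
      return 0;
    }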
diff --git a/llvm/include/llvm/Support/KnownBits.h b/llvm/include/llvm/Support/KnownBits.h
index cfec5796493f..1f32760a6fd1 100644
--- a/llvm/include/llvm/Support/KnownBits.h
+++ b/llvm/include/llvm/Support/KnownBits.h
@@ -60,7 +60,7 @@ public:
}
/// Returns true if we don't know any bits.
- bool isUnknown() const { return Zero.isNullValue() && One.isNullValue(); }
+ bool isUnknown() const { return Zero.isZero() && One.isZero(); }
/// Resets the known state of all bits.
void resetAll() {
@@ -71,13 +71,13 @@ public:
/// Returns true if value is all zero.
bool isZero() const {
assert(!hasConflict() && "KnownBits conflict!");
- return Zero.isAllOnesValue();
+ return Zero.isAllOnes();
}
/// Returns true if value is all one bits.
bool isAllOnes() const {
assert(!hasConflict() && "KnownBits conflict!");
- return One.isAllOnesValue();
+ return One.isAllOnes();
}
/// Make all bits known to be zero and discard any previous information.
@@ -99,10 +99,12 @@ public:
bool isNonNegative() const { return Zero.isSignBitSet(); }
/// Returns true if this value is known to be non-zero.
- bool isNonZero() const { return !One.isNullValue(); }
+ bool isNonZero() const { return !One.isZero(); }
/// Returns true if this value is known to be positive.
- bool isStrictlyPositive() const { return Zero.isSignBitSet() && !One.isNullValue(); }
+ bool isStrictlyPositive() const {
+ return Zero.isSignBitSet() && !One.isZero();
+ }
/// Make this value negative.
void makeNegative() {
@@ -280,6 +282,10 @@ public:
return getBitWidth() - Zero.countPopulation();
}
+ unsigned countMaxActiveBits() const {
+ return getBitWidth() - countMinLeadingZeros();
+ }
+
/// Create known bits from a known constant.
static KnownBits makeConstant(const APInt &C) {
return KnownBits(~C, C);
@@ -292,7 +298,7 @@ public:
/// Return true if LHS and RHS have no common bits set.
static bool haveNoCommonBitsSet(const KnownBits &LHS, const KnownBits &RHS) {
- return (LHS.Zero | RHS.Zero).isAllOnesValue();
+ return (LHS.Zero | RHS.Zero).isAllOnes();
}
/// Compute known bits resulting from adding LHS, RHS and a 1-bit Carry.
@@ -304,7 +310,8 @@ public:
KnownBits RHS);
/// Compute known bits resulting from multiplying LHS and RHS.
- static KnownBits mul(const KnownBits &LHS, const KnownBits &RHS);
+ static KnownBits mul(const KnownBits &LHS, const KnownBits &RHS,
+ bool SelfMultiply = false);
/// Compute known bits from sign-extended multiply-hi.
static KnownBits mulhs(const KnownBits &LHS, const KnownBits &RHS);
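A sketch of the two additions visible here, `countMaxActiveBits()` and the `SelfMultiply` hint to `mul()` (the sample constant is arbitrary):

    #include "llvm/ADT/APInt.h"
    #include "llvm/Support/KnownBits.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      // For a fully known constant, the maximum active bits are exact.
      KnownBits K = KnownBits::makeConstant(APInt(32, 42));
      outs() << "max active bits: " << K.countMaxActiveBits() << "\n"; // 6

      // SelfMultiply tells mul() both operands are the same value, which
      // lets it prove more bits of the square.
      KnownBits Square = KnownBits::mul(K, K, /*SelfMultiply=*/true);
      outs() << "known-zero bits of 42*42: " << Square.Zero.countPopulation()
             << "\n";
      return 0;
    }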
diff --git a/llvm/include/llvm/Support/MD5.h b/llvm/include/llvm/Support/MD5.h
index 3b2d5b974d0b..3b960cd4fd88 100644
--- a/llvm/include/llvm/Support/MD5.h
+++ b/llvm/include/llvm/Support/MD5.h
@@ -39,18 +39,6 @@ template <unsigned N> class SmallString;
template <typename T> class ArrayRef;
class MD5 {
- // Any 32-bit or wider unsigned integer data type will do.
- typedef uint32_t MD5_u32plus;
-
- MD5_u32plus a = 0x67452301;
- MD5_u32plus b = 0xefcdab89;
- MD5_u32plus c = 0x98badcfe;
- MD5_u32plus d = 0x10325476;
- MD5_u32plus hi = 0;
- MD5_u32plus lo = 0;
- uint8_t buffer[64];
- MD5_u32plus block[16];
-
public:
struct MD5Result {
std::array<uint8_t, 16> Bytes;
@@ -90,6 +78,14 @@ public:
/// Finishes off the hash and puts the result in result.
void final(MD5Result &Result);
+ /// Finishes off the hash, and returns a reference to the 16-byte hash data.
+ StringRef final();
+
+ /// Finishes off the hash, and returns a reference to the 16-byte hash data.
+ /// This is suitable for getting the MD5 at any time without invalidating the
+ /// internal state, so that more calls can be made into `update`.
+ StringRef result();
+
/// Translates the bytes in \p Res to a hex string that is
/// deposited into \p Str. The result will be of length 32.
static void stringifyResult(MD5Result &Result, SmallString<32> &Str);
@@ -98,6 +94,23 @@ public:
static std::array<uint8_t, 16> hash(ArrayRef<uint8_t> Data);
private:
+ // Any 32-bit or wider unsigned integer data type will do.
+ typedef uint32_t MD5_u32plus;
+
+ // Internal State
+ struct {
+ MD5_u32plus a = 0x67452301;
+ MD5_u32plus b = 0xefcdab89;
+ MD5_u32plus c = 0x98badcfe;
+ MD5_u32plus d = 0x10325476;
+ MD5_u32plus hi = 0;
+ MD5_u32plus lo = 0;
+ uint8_t buffer[64];
+ MD5_u32plus block[16];
+ } InternalState;
+
+ MD5Result Result;
+
const uint8_t *body(ArrayRef<uint8_t> Data);
};
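The new `StringRef`-returning overloads make one-shot digests shorter to write; a sketch (the input string is arbitrary, and `result()` could be used instead to peek at the digest without finalizing the stream):

    #include "llvm/ADT/StringExtras.h"
    #include "llvm/Support/MD5.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      MD5 Hash;
      Hash.update("hello world");
      // final() now hands back the 16 raw digest bytes directly.
      StringRef Digest = Hash.final();
      outs() << toHex(Digest, /*LowerCase=*/true) << "\n";
      return 0;
    }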
diff --git a/llvm/include/llvm/Support/MSP430AttributeParser.h b/llvm/include/llvm/Support/MSP430AttributeParser.h
new file mode 100644
index 000000000000..bc9b21494470
--- /dev/null
+++ b/llvm/include/llvm/Support/MSP430AttributeParser.h
@@ -0,0 +1,44 @@
+//===-- MSP430AttributeParser.h - MSP430 Attribute Parser -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains support routines for parsing MSP430 ELF build attributes.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_MSP430ATTRIBUTEPARSER_H
+#define LLVM_SUPPORT_MSP430ATTRIBUTEPARSER_H
+
+#include "llvm/Support/ELFAttributeParser.h"
+#include "llvm/Support/MSP430Attributes.h"
+
+namespace llvm {
+class MSP430AttributeParser : public ELFAttributeParser {
+ struct DisplayHandler {
+ MSP430Attrs::AttrType Attribute;
+ Error (MSP430AttributeParser::*Routine)(MSP430Attrs::AttrType);
+ };
+ static const std::array<DisplayHandler, 4> DisplayRoutines;
+
+ Error parseISA(MSP430Attrs::AttrType Tag);
+ Error parseCodeModel(MSP430Attrs::AttrType Tag);
+ Error parseDataModel(MSP430Attrs::AttrType Tag);
+ Error parseEnumSize(MSP430Attrs::AttrType Tag);
+
+ Error handler(uint64_t Tag, bool &Handled) override;
+
+public:
+ MSP430AttributeParser(ScopedPrinter *SW)
+ : ELFAttributeParser(SW, MSP430Attrs::getMSP430AttributeTags(),
+ "mspabi") {}
+ MSP430AttributeParser()
+ : ELFAttributeParser(MSP430Attrs::getMSP430AttributeTags(), "mspabi") {}
+};
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/Support/MSP430Attributes.h b/llvm/include/llvm/Support/MSP430Attributes.h
new file mode 100644
index 000000000000..fccd65e844c3
--- /dev/null
+++ b/llvm/include/llvm/Support/MSP430Attributes.h
@@ -0,0 +1,44 @@
+//===-- MSP430Attributes.h - MSP430 Attributes ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains enumerations for MSP430 ELF build attributes as
+/// defined in the MSP430 ELF psABI specification.
+///
+/// MSP430 ELF psABI specification
+///
+/// https://www.ti.com/lit/pdf/slaa534
+///
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_SUPPORT_MSP430ATTRIBUTES_H
+#define LLVM_SUPPORT_MSP430ATTRIBUTES_H
+
+#include "llvm/Support/ELFAttributes.h"
+
+namespace llvm {
+namespace MSP430Attrs {
+
+const TagNameMap &getMSP430AttributeTags();
+
+enum AttrType : unsigned {
+ // Attribute types in ELF/.MSP430.attributes.
+ TagISA = 4,
+ TagCodeModel = 6,
+ TagDataModel = 8,
+ TagEnumSize = 10
+};
+
+enum ISA { ISAMSP430 = 1, ISAMSP430X = 2 };
+enum CodeModel { CMSmall = 1, CMLarge = 2 };
+enum DataModel { DMSmall = 1, DMLarge = 2, DMRestricted = 3 };
+enum EnumSize { ESSmall = 1, ESInteger = 2, ESDontCare = 3 };
+
+} // namespace MSP430Attrs
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/Support/MachineValueType.h b/llvm/include/llvm/Support/MachineValueType.h
index 31f2d5a48183..ce10a4c58dfe 100644
--- a/llvm/include/llvm/Support/MachineValueType.h
+++ b/llvm/include/llvm/Support/MachineValueType.h
@@ -270,9 +270,10 @@ namespace llvm {
funcref = 175, // WebAssembly's funcref type
externref = 176, // WebAssembly's externref type
x86amx = 177, // This is an X86 AMX value
+ i64x8 = 178, // 8 Consecutive GPRs (AArch64)
FIRST_VALUETYPE = 1, // This is always the beginning of the list.
- LAST_VALUETYPE = x86amx, // This always remains at the end of the list.
+ LAST_VALUETYPE = i64x8, // This always remains at the end of the list.
VALUETYPE_SIZE = LAST_VALUETYPE + 1,
// This is the current maximum for LAST_VALUETYPE.
@@ -987,6 +988,7 @@ namespace llvm {
case nxv16f16:
case nxv8f32:
case nxv4f64: return TypeSize::Scalable(256);
+ case i64x8:
case v512i1:
case v64i8:
case v32i16:
@@ -1403,51 +1405,61 @@ namespace llvm {
/// SimpleValueType Iteration
/// @{
static auto all_valuetypes() {
- return seq_inclusive(MVT::FIRST_VALUETYPE, MVT::LAST_VALUETYPE);
+ return enum_seq_inclusive(MVT::FIRST_VALUETYPE, MVT::LAST_VALUETYPE,
+ force_iteration_on_noniterable_enum);
}
static auto integer_valuetypes() {
- return seq_inclusive(MVT::FIRST_INTEGER_VALUETYPE,
- MVT::LAST_INTEGER_VALUETYPE);
+ return enum_seq_inclusive(MVT::FIRST_INTEGER_VALUETYPE,
+ MVT::LAST_INTEGER_VALUETYPE,
+ force_iteration_on_noniterable_enum);
}
static auto fp_valuetypes() {
- return seq_inclusive(MVT::FIRST_FP_VALUETYPE, MVT::LAST_FP_VALUETYPE);
+ return enum_seq_inclusive(MVT::FIRST_FP_VALUETYPE, MVT::LAST_FP_VALUETYPE,
+ force_iteration_on_noniterable_enum);
}
static auto vector_valuetypes() {
- return seq_inclusive(MVT::FIRST_VECTOR_VALUETYPE,
- MVT::LAST_VECTOR_VALUETYPE);
+ return enum_seq_inclusive(MVT::FIRST_VECTOR_VALUETYPE,
+ MVT::LAST_VECTOR_VALUETYPE,
+ force_iteration_on_noniterable_enum);
}
static auto fixedlen_vector_valuetypes() {
- return seq_inclusive(MVT::FIRST_FIXEDLEN_VECTOR_VALUETYPE,
- MVT::LAST_FIXEDLEN_VECTOR_VALUETYPE);
+ return enum_seq_inclusive(MVT::FIRST_FIXEDLEN_VECTOR_VALUETYPE,
+ MVT::LAST_FIXEDLEN_VECTOR_VALUETYPE,
+ force_iteration_on_noniterable_enum);
}
static auto scalable_vector_valuetypes() {
- return seq_inclusive(MVT::FIRST_SCALABLE_VECTOR_VALUETYPE,
- MVT::LAST_SCALABLE_VECTOR_VALUETYPE);
+ return enum_seq_inclusive(MVT::FIRST_SCALABLE_VECTOR_VALUETYPE,
+ MVT::LAST_SCALABLE_VECTOR_VALUETYPE,
+ force_iteration_on_noniterable_enum);
}
static auto integer_fixedlen_vector_valuetypes() {
- return seq_inclusive(MVT::FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE,
- MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE);
+ return enum_seq_inclusive(MVT::FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE,
+ MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE,
+ force_iteration_on_noniterable_enum);
}
static auto fp_fixedlen_vector_valuetypes() {
- return seq_inclusive(MVT::FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE,
- MVT::LAST_FP_FIXEDLEN_VECTOR_VALUETYPE);
+ return enum_seq_inclusive(MVT::FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE,
+ MVT::LAST_FP_FIXEDLEN_VECTOR_VALUETYPE,
+ force_iteration_on_noniterable_enum);
}
static auto integer_scalable_vector_valuetypes() {
- return seq_inclusive(MVT::FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE,
- MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE);
+ return enum_seq_inclusive(MVT::FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE,
+ MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE,
+ force_iteration_on_noniterable_enum);
}
static auto fp_scalable_vector_valuetypes() {
- return seq_inclusive(MVT::FIRST_FP_SCALABLE_VECTOR_VALUETYPE,
- MVT::LAST_FP_SCALABLE_VECTOR_VALUETYPE);
+ return enum_seq_inclusive(MVT::FIRST_FP_SCALABLE_VECTOR_VALUETYPE,
+ MVT::LAST_FP_SCALABLE_VECTOR_VALUETYPE,
+ force_iteration_on_noniterable_enum);
}
/// @}
};
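Client code iterating these ranges is unchanged; only the helpers now go through `enum_seq_inclusive` with the explicit opt-in tag. An illustrative sketch:

    #include "llvm/Support/MachineValueType.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      unsigned Count = 0;
      for (MVT VT : MVT::fp_valuetypes()) {
        (void)VT;
        ++Count;
      }
      outs() << "floating-point MVTs: " << Count << "\n";
      return 0;
    }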
diff --git a/llvm/include/llvm/Support/Memory.h b/llvm/include/llvm/Support/Memory.h
index 31e0abbcdb61..d7d60371d315 100644
--- a/llvm/include/llvm/Support/Memory.h
+++ b/llvm/include/llvm/Support/Memory.h
@@ -37,7 +37,7 @@ namespace sys {
/// The size as it was allocated. This is always greater or equal to the
/// size that was originally requested.
size_t allocatedSize() const { return AllocatedSize; }
-
+
private:
void *Address; ///< Address of first byte of memory area
size_t AllocatedSize; ///< Size, in bytes of the memory area
@@ -148,13 +148,22 @@ namespace sys {
return *this;
}
~OwningMemoryBlock() {
- Memory::releaseMappedMemory(M);
+ if (M.base())
+ Memory::releaseMappedMemory(M);
}
void *base() const { return M.base(); }
/// The size as it was allocated. This is always greater or equal to the
/// size that was originally requested.
size_t allocatedSize() const { return M.allocatedSize(); }
MemoryBlock getMemoryBlock() const { return M; }
+ std::error_code release() {
+ std::error_code EC;
+ if (M.base()) {
+ EC = Memory::releaseMappedMemory(M);
+ M = MemoryBlock();
+ }
+ return EC;
+ }
private:
MemoryBlock M;
};
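The new `release()` frees the mapping eagerly and surfaces the error code that the destructor would otherwise swallow. A sketch (the page size and protection flags are arbitrary):

    #include "llvm/Support/Memory.h"

    using namespace llvm;

    static std::error_code useScratchPage() {
      std::error_code EC;
      sys::MemoryBlock MB = sys::Memory::allocateMappedMemory(
          4096, /*NearBlock=*/nullptr,
          sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC);
      if (EC)
        return EC;
      sys::OwningMemoryBlock Owned(MB);
      // ... write through Owned.base(), up to Owned.allocatedSize() bytes ...
      return Owned.release(); // explicit, error-checked unmap
    }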
diff --git a/llvm/include/llvm/Support/PGOOptions.h b/llvm/include/llvm/Support/PGOOptions.h
new file mode 100644
index 000000000000..2141e2159c0c
--- /dev/null
+++ b/llvm/include/llvm/Support/PGOOptions.h
@@ -0,0 +1,65 @@
+//===------ PGOOptions.h -- PGO option tunables ----------------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Define option tunables for PGO.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_PGOOPTIONS_H
+#define LLVM_SUPPORT_PGOOPTIONS_H
+
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+
+/// A struct capturing PGO tunables.
+struct PGOOptions {
+ enum PGOAction { NoAction, IRInstr, IRUse, SampleUse };
+ enum CSPGOAction { NoCSAction, CSIRInstr, CSIRUse };
+ PGOOptions(std::string ProfileFile = "", std::string CSProfileGenFile = "",
+ std::string ProfileRemappingFile = "", PGOAction Action = NoAction,
+ CSPGOAction CSAction = NoCSAction,
+ bool DebugInfoForProfiling = false,
+ bool PseudoProbeForProfiling = false)
+ : ProfileFile(ProfileFile), CSProfileGenFile(CSProfileGenFile),
+ ProfileRemappingFile(ProfileRemappingFile), Action(Action),
+ CSAction(CSAction), DebugInfoForProfiling(DebugInfoForProfiling ||
+ (Action == SampleUse &&
+ !PseudoProbeForProfiling)),
+ PseudoProbeForProfiling(PseudoProbeForProfiling) {
+ // Note: we do allow ProfileFile.empty() for Action=IRUse, since LTO may
+ // call back with the IRUse action without a ProfileFile.
+
+ // If there is a CSAction, PGOAction cannot be IRInstr or SampleUse.
+ assert(this->CSAction == NoCSAction ||
+ (this->Action != IRInstr && this->Action != SampleUse));
+
+ // For CSIRInstr, CSProfileGenFile also needs to be nonempty.
+ assert(this->CSAction != CSIRInstr || !this->CSProfileGenFile.empty());
+
+ // If CSAction is CSIRUse, PGOAction needs to be IRUse as they share
+ // a profile.
+ assert(this->CSAction != CSIRUse || this->Action == IRUse);
+
+ // If neither Action nor CSAction, DebugInfoForProfiling or
+ // PseudoProbeForProfiling needs to be true.
+ assert(this->Action != NoAction || this->CSAction != NoCSAction ||
+ this->DebugInfoForProfiling || this->PseudoProbeForProfiling);
+ }
+ std::string ProfileFile;
+ std::string CSProfileGenFile;
+ std::string ProfileRemappingFile;
+ PGOAction Action;
+ CSPGOAction CSAction;
+ bool DebugInfoForProfiling;
+ bool PseudoProbeForProfiling;
+};
+} // namespace llvm
+
+#endif
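A sketch of constructing a valid profile-use configuration; the asserts in the constructor enforce the combinations spelled out in the comments (the profile file name is arbitrary):

    #include "llvm/Support/PGOOptions.h"

    using namespace llvm;

    static PGOOptions makeIRUseOptions() {
      // IRUse with a plain profile; no context-sensitive PGO action.
      return PGOOptions("default.profdata", /*CSProfileGenFile=*/"",
                        /*ProfileRemappingFile=*/"", PGOOptions::IRUse,
                        PGOOptions::NoCSAction);
    }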
diff --git a/llvm/include/llvm/Support/Parallel.h b/llvm/include/llvm/Support/Parallel.h
index 28d171d45256..5c3b26d5754c 100644
--- a/llvm/include/llvm/Support/Parallel.h
+++ b/llvm/include/llvm/Support/Parallel.h
@@ -40,7 +40,10 @@ class Latch {
public:
explicit Latch(uint32_t Count = 0) : Count(Count) {}
- ~Latch() { sync(); }
+ ~Latch() {
+ // Ensure at least that sync() was called.
+ assert(Count == 0);
+ }
void inc() {
std::lock_guard<std::mutex> lock(Mutex);
diff --git a/llvm/include/llvm/Support/Path.h b/llvm/include/llvm/Support/Path.h
index af70e086a1b6..da5095714f48 100644
--- a/llvm/include/llvm/Support/Path.h
+++ b/llvm/include/llvm/Support/Path.h
@@ -25,7 +25,29 @@ namespace llvm {
namespace sys {
namespace path {
-enum class Style { windows, posix, native };
+enum class Style {
+ native,
+ posix,
+ windows_slash,
+ windows_backslash,
+ windows = windows_backslash, // deprecated
+};
+
+/// Check if \p S uses POSIX path rules.
+constexpr bool is_style_posix(Style S) {
+ if (S == Style::posix)
+ return true;
+ if (S != Style::native)
+ return false;
+#if defined(_WIN32)
+ return false;
+#else
+ return true;
+#endif
+}
+
+/// Check if \p S uses Windows path rules.
+constexpr bool is_style_windows(Style S) { return !is_style_posix(S); }
/// @name Lexical Component Iterator
/// @{
@@ -174,6 +196,21 @@ bool replace_path_prefix(SmallVectorImpl<char> &Path, StringRef OldPrefix,
StringRef NewPrefix,
Style style = Style::native);
+/// Remove redundant leading "./" pieces and consecutive separators.
+///
+/// @param path Input path.
+/// @result The cleaned-up \a path.
+StringRef remove_leading_dotslash(StringRef path, Style style = Style::native);
+
+/// In-place remove any './' and optionally '../' components from a path.
+///
+/// @param path processed path
+/// @param remove_dot_dot specify if '../' (except for leading "../") should be
+/// removed
+/// @result True if path was changed
+bool remove_dots(SmallVectorImpl<char> &path, bool remove_dot_dot = false,
+ Style style = Style::native);
+
/// Append to path.
///
/// @code
@@ -212,7 +249,7 @@ void append(SmallVectorImpl<char> &path, const_iterator begin,
/// Convert path to the native form. This is used to give paths to users and
/// operating system calls in the platform's normal way. For example, on Windows
-/// all '/' are converted to '\'.
+/// all '/' are converted to '\'. On Unix, all '\' are converted to '/'.
///
/// @param path A path that is transformed to native format.
/// @param result Holds the result of the transformation.
@@ -226,6 +263,17 @@ void native(const Twine &path, SmallVectorImpl<char> &result,
/// @param path A path that is transformed to native format.
void native(SmallVectorImpl<char> &path, Style style = Style::native);
+/// For Windows path styles, convert path to use the preferred path separators.
+/// For other styles, do nothing.
+///
+/// @param path A path that is transformed to preferred format.
+inline void make_preferred(SmallVectorImpl<char> &path,
+ Style style = Style::native) {
+ if (!is_style_windows(style))
+ return;
+ native(path, style);
+}
+
/// Replaces backslashes with slashes if Windows.
///
/// @param path processed path
@@ -499,21 +547,6 @@ bool is_absolute_gnu(const Twine &path, Style style = Style::native);
/// @result True if the path is relative, false if it is not.
bool is_relative(const Twine &path, Style style = Style::native);
-/// Remove redundant leading "./" pieces and consecutive separators.
-///
-/// @param path Input path.
-/// @result The cleaned-up \a path.
-StringRef remove_leading_dotslash(StringRef path, Style style = Style::native);
-
-/// In-place remove any './' and optionally '../' components from a path.
-///
-/// @param path processed path
-/// @param remove_dot_dot specify if '../' (except for leading "../") should be
-/// removed
-/// @result True if path was changed
-bool remove_dots(SmallVectorImpl<char> &path, bool remove_dot_dot = false,
- Style style = Style::native);
-
} // end namespace path
} // end namespace sys
} // end namespace llvm
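A sketch of the new style predicates together with the relocated lexical helpers (the sample path is arbitrary):

    #include "llvm/ADT/SmallString.h"
    #include "llvm/Support/Path.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      SmallString<128> P("subdir/./..//file.txt");
      // remove_dots() folds away "./" pieces and, when asked, "../" pieces.
      sys::path::remove_dots(P, /*remove_dot_dot=*/true,
                             sys::path::Style::posix);
      outs() << P << "\n"; // file.txt

      // make_preferred() is a no-op for POSIX styles; for Windows styles it
      // rewrites separators to the preferred form.
      if (sys::path::is_style_windows(sys::path::Style::native))
        sys::path::make_preferred(P);
      return 0;
    }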
diff --git a/llvm/include/llvm/Support/Process.h b/llvm/include/llvm/Support/Process.h
index 6687e5e7ff9a..ee03efeed9b2 100644
--- a/llvm/include/llvm/Support/Process.h
+++ b/llvm/include/llvm/Support/Process.h
@@ -214,12 +214,10 @@ public:
/// In that case, the control flow will resume after RunSafely(), like for a
/// crash, rather than exiting the current process.
/// Use \arg NoCleanup for calling _exit() instead of exit().
- LLVM_ATTRIBUTE_NORETURN
- static void Exit(int RetCode, bool NoCleanup = false);
+ [[noreturn]] static void Exit(int RetCode, bool NoCleanup = false);
private:
- LLVM_ATTRIBUTE_NORETURN
- static void ExitNoCleanup(int RetCode);
+ [[noreturn]] static void ExitNoCleanup(int RetCode);
};
}
diff --git a/llvm/include/llvm/Support/RISCVISAInfo.h b/llvm/include/llvm/Support/RISCVISAInfo.h
new file mode 100644
index 000000000000..7110de601123
--- /dev/null
+++ b/llvm/include/llvm/Support/RISCVISAInfo.h
@@ -0,0 +1,89 @@
+//===-- RISCVISAInfo.h - RISCV ISA Information ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_RISCVISAINFO_H
+#define LLVM_SUPPORT_RISCVISAINFO_H
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+
+#include <map>
+#include <string>
+#include <vector>
+
+namespace llvm {
+struct RISCVExtensionInfo {
+ std::string ExtName;
+ unsigned MajorVersion;
+ unsigned MinorVersion;
+};
+
+class RISCVISAInfo {
+public:
+ RISCVISAInfo(const RISCVISAInfo &) = delete;
+ RISCVISAInfo &operator=(const RISCVISAInfo &) = delete;
+
+ static bool compareExtension(const std::string &LHS, const std::string &RHS);
+
+ /// Helper class for OrderedExtensionMap.
+ struct ExtensionComparator {
+ bool operator()(const std::string &LHS, const std::string &RHS) const {
+ return compareExtension(LHS, RHS);
+ }
+ };
+
+ /// OrderedExtensionMap is a std::map specialized to keep entries in the
+ /// canonical order of extensions.
+ typedef std::map<std::string, RISCVExtensionInfo, ExtensionComparator>
+ OrderedExtensionMap;
+
+ /// Parse RISCV ISA info from arch string.
+ static llvm::Expected<std::unique_ptr<RISCVISAInfo>>
+ parseArchString(StringRef Arch, bool EnableExperimentalExtension,
+ bool ExperimentalExtensionVersionCheck = true);
+
+ /// Parse RISCV ISA info from feature vector.
+ static llvm::Expected<std::unique_ptr<RISCVISAInfo>>
+ parseFeatures(unsigned XLen, const std::vector<std::string> &Features);
+
+ /// Convert RISCV ISA info to a feature vector.
+ void toFeatures(std::vector<StringRef> &Features,
+ std::function<StringRef(const Twine &)> StrAlloc) const;
+
+ const OrderedExtensionMap &getExtensions() const { return Exts; }
+
+ unsigned getXLen() const { return XLen; }
+ unsigned getFLen() const { return FLen; }
+
+ bool hasExtension(StringRef Ext) const;
+ std::string toString() const;
+
+ static bool isSupportedExtensionFeature(StringRef Ext);
+ static bool isSupportedExtension(StringRef Ext);
+ static bool isSupportedExtension(StringRef Ext, unsigned MajorVersion,
+ unsigned MinorVersion);
+
+private:
+ RISCVISAInfo(unsigned XLen) : XLen(XLen), FLen(0) {}
+
+ unsigned XLen;
+ unsigned FLen;
+
+ OrderedExtensionMap Exts;
+
+ void addExtension(StringRef ExtName, unsigned MajorVersion,
+ unsigned MinorVersion);
+
+ void updateFLen();
+};
+
+} // namespace llvm
+
+#endif
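A hedged usage sketch for the new RISCVISAInfo API; the arch string and the error handling are illustrative, not taken from this patch:

#include "llvm/Support/RISCVISAInfo.h"
#include "llvm/Support/raw_ostream.h"

static void dumpISAInfo() {
  auto ParseResult = llvm::RISCVISAInfo::parseArchString(
      "rv64imafdc", /*EnableExperimentalExtension=*/false);
  if (!ParseResult) {
    // parseArchString reports malformed arch strings through llvm::Expected.
    llvm::errs() << llvm::toString(ParseResult.takeError()) << "\n";
    return;
  }
  const std::unique_ptr<llvm::RISCVISAInfo> &ISAInfo = *ParseResult;
  llvm::outs() << ISAInfo->toString() << " XLen=" << ISAInfo->getXLen() << "\n";
}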
diff --git a/llvm/include/llvm/Support/RISCVTargetParser.def b/llvm/include/llvm/Support/RISCVTargetParser.def
index 6a06f9258105..f658cdb91c6b 100644
--- a/llvm/include/llvm/Support/RISCVTargetParser.def
+++ b/llvm/include/llvm/Support/RISCVTargetParser.def
@@ -19,9 +19,17 @@ PROC(ROCKET_RV32, {"rocket-rv32"}, FK_NONE, {""})
PROC(ROCKET_RV64, {"rocket-rv64"}, FK_64BIT, {""})
PROC(SIFIVE_732, {"sifive-7-rv32"}, FK_NONE, {""})
PROC(SIFIVE_764, {"sifive-7-rv64"}, FK_64BIT, {""})
+PROC(SIFIVE_E20, {"sifive-e20"}, FK_NONE, {"rv32imc"})
+PROC(SIFIVE_E21, {"sifive-e21"}, FK_NONE, {"rv32imac"})
+PROC(SIFIVE_E24, {"sifive-e24"}, FK_NONE, {"rv32imafc"})
PROC(SIFIVE_E31, {"sifive-e31"}, FK_NONE, {"rv32imac"})
-PROC(SIFIVE_U54, {"sifive-u54"}, FK_64BIT, {"rv64gc"})
+PROC(SIFIVE_E34, {"sifive-e34"}, FK_NONE, {"rv32imafc"})
PROC(SIFIVE_E76, {"sifive-e76"}, FK_NONE, {"rv32imafc"})
+PROC(SIFIVE_S21, {"sifive-s21"}, FK_64BIT, {"rv64imac"})
+PROC(SIFIVE_S51, {"sifive-s51"}, FK_64BIT, {"rv64imac"})
+PROC(SIFIVE_S54, {"sifive-s54"}, FK_64BIT, {"rv64gc"})
+PROC(SIFIVE_S76, {"sifive-s76"}, FK_64BIT, {"rv64gc"})
+PROC(SIFIVE_U54, {"sifive-u54"}, FK_64BIT, {"rv64gc"})
PROC(SIFIVE_U74, {"sifive-u74"}, FK_64BIT, {"rv64gc"})
#undef PROC
diff --git a/llvm/include/llvm/Support/Signposts.h b/llvm/include/llvm/Support/Signposts.h
index bc6abba0a0e1..dabbba6f89d1 100644
--- a/llvm/include/llvm/Support/Signposts.h
+++ b/llvm/include/llvm/Support/Signposts.h
@@ -1,9 +1,8 @@
//===-- llvm/Support/Signposts.h - Interval debug annotations ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -18,17 +17,8 @@
#define LLVM_SUPPORT_SIGNPOSTS_H
#include "llvm/ADT/StringRef.h"
-#include "llvm/Config/llvm-config.h"
#include <memory>
-#if LLVM_SUPPORT_XCODE_SIGNPOSTS
-#include <Availability.h>
-#include <os/signpost.h>
-#endif
-
-#define SIGNPOSTS_AVAILABLE() \
- __builtin_available(macos 10.14, iOS 12, tvOS 12, watchOS 5, *)
-
namespace llvm {
class SignpostEmitterImpl;
@@ -45,33 +35,8 @@ public:
/// Begin a signposted interval for a given object.
void startInterval(const void *O, StringRef Name);
-
-#if LLVM_SUPPORT_XCODE_SIGNPOSTS
- os_log_t &getLogger() const;
- os_signpost_id_t getSignpostForObject(const void *O);
-#endif
-
- /// A macro to take advantage of the special format string handling
- /// in the os_signpost API. The format string substitution is
- /// deferred to the log consumer and done outside of the
- /// application.
-#if LLVM_SUPPORT_XCODE_SIGNPOSTS
-#define SIGNPOST_EMITTER_START_INTERVAL(SIGNPOST_EMITTER, O, ...) \
- do { \
- if ((SIGNPOST_EMITTER).isEnabled()) \
- if (SIGNPOSTS_AVAILABLE()) \
- os_signpost_interval_begin((SIGNPOST_EMITTER).getLogger(), \
- (SIGNPOST_EMITTER).getSignpostForObject(O), \
- "LLVM Timers", __VA_ARGS__); \
- } while (0)
-#else
-#define SIGNPOST_EMITTER_START_INTERVAL(SIGNPOST_EMITTER, O, ...) \
- do { \
- } while (0)
-#endif
-
/// End a signposted interval for a given object.
- void endInterval(const void *O);
+ void endInterval(const void *O, StringRef Name);
};
} // end namespace llvm
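With the name now required on endInterval() as well, a balanced pair looks roughly like this; the handle and interval name are illustrative, and the default-constructed emitter mirrors how llvm::Timer uses the class:

static llvm::SignpostEmitter Signposts;

static void runTimedRegion(const void *Handle) {
  Signposts.startInterval(Handle, "LLVM Timers");
  // ... the work being measured ...
  Signposts.endInterval(Handle, "LLVM Timers");
}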
diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index fbe0d1a55bfc..b34b885ddc35 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -652,6 +652,9 @@ HANDLE_TARGET_OPCODE(G_UMAX)
/// Generic integer absolute value.
HANDLE_TARGET_OPCODE(G_ABS)
+HANDLE_TARGET_OPCODE(G_LROUND)
+HANDLE_TARGET_OPCODE(G_LLROUND)
+
/// Generic BRANCH instruction. This is an unconditional branch.
HANDLE_TARGET_OPCODE(G_BR)
diff --git a/llvm/include/llvm/Support/TargetSelect.h b/llvm/include/llvm/Support/TargetSelect.h
index 9ffb84c4a570..e57614cea758 100644
--- a/llvm/include/llvm/Support/TargetSelect.h
+++ b/llvm/include/llvm/Support/TargetSelect.h
@@ -41,6 +41,10 @@ extern "C" {
#define LLVM_DISASSEMBLER(TargetName) \
void LLVMInitialize##TargetName##Disassembler();
#include "llvm/Config/Disassemblers.def"
+
+// Declare all of the available TargetMCA initialization functions.
+#define LLVM_TARGETMCA(TargetName) void LLVMInitialize##TargetName##TargetMCA();
+#include "llvm/Config/TargetMCAs.def"
}
namespace llvm {
@@ -159,6 +163,14 @@ namespace llvm {
return true;
#endif
}
+
+ /// InitializeAllTargetMCAs - The main program should call
+ /// this function to initialize the target CustomBehaviour and
+ /// InstrPostProcess classes.
+ inline void InitializeAllTargetMCAs() {
+#define LLVM_TARGETMCA(TargetName) LLVMInitialize##TargetName##TargetMCA();
+#include "llvm/Config/TargetMCAs.def"
+ }
}
#endif
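Sketch of a typical tool startup sequence; only the InitializeAllTargetMCAs() call is new here, the other initializers already exist:

int main(int argc, char **argv) {
  llvm::InitializeAllTargetInfos();
  llvm::InitializeAllTargetMCs();
  llvm::InitializeAllDisassemblers();
  // New: registers each target's CustomBehaviour/InstrPostProcess support.
  llvm::InitializeAllTargetMCAs();
  // ... run the MCA pipeline ...
  return 0;
}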
diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h
index 30bbbd7db8c9..7d1274735a37 100644
--- a/llvm/include/llvm/Support/TypeSize.h
+++ b/llvm/include/llvm/Support/TypeSize.h
@@ -229,7 +229,6 @@ public:
bool isZero() const { return !Value; }
bool isNonZero() const { return !isZero(); }
explicit operator bool() const { return isNonZero(); }
- ScalarTy getValue() const { return Value; }
ScalarTy getValue(unsigned Dim) const {
return Dim == UnivariateDim ? Value : 0;
}
@@ -250,7 +249,7 @@ public:
//===----------------------------------------------------------------------===//
// LinearPolySize - base class for fixed- or scalable sizes.
-// ^ ^
+// ^ ^
// | |
// | +----- ElementCount - Leaf class to represent an element count
// | (vscale x unsigned)
@@ -294,7 +293,7 @@ public:
static LeafTy getNull() { return get(0, false); }
/// Returns the minimum value this size can represent.
- ScalarTy getKnownMinValue() const { return this->getValue(); }
+ ScalarTy getKnownMinValue() const { return this->Value; }
/// Returns whether the size is scaled by a runtime quantity (vscale).
bool isScalable() const { return this->UnivariateDim == ScalableDim; }
/// A return value of true indicates we know at compile time that the number
@@ -500,8 +499,7 @@ inline raw_ostream &operator<<(raw_ostream &OS,
return OS;
}
-template <typename T> struct DenseMapInfo;
-template <> struct DenseMapInfo<ElementCount> {
+template <> struct DenseMapInfo<ElementCount, void> {
static inline ElementCount getEmptyKey() {
return ElementCount::getScalable(~0U);
}
diff --git a/llvm/include/llvm/Support/VersionTuple.h b/llvm/include/llvm/Support/VersionTuple.h
index a48ae0bf52bd..1a1072d228f1 100644
--- a/llvm/include/llvm/Support/VersionTuple.h
+++ b/llvm/include/llvm/Support/VersionTuple.h
@@ -17,6 +17,7 @@
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/Support/HashBuilder.h"
#include <string>
#include <tuple>
@@ -164,6 +165,12 @@ public:
return llvm::hash_combine(VT.Major, VT.Minor, VT.Subminor, VT.Build);
}
+ template <typename HasherT, llvm::support::endianness Endianness>
+ friend void addHash(HashBuilderImpl<HasherT, Endianness> &HBuilder,
+ const VersionTuple &VT) {
+ HBuilder.add(VT.Major, VT.Minor, VT.Subminor, VT.Build);
+ }
+
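A hedged sketch of how the new addHash friend is reached through llvm::HashBuilder; llvm::MD5 is assumed as the hasher type, and any hasher HashBuilder accepts should work the same way:

#include "llvm/Support/HashBuilder.h"
#include "llvm/Support/MD5.h"
#include "llvm/Support/VersionTuple.h"

static void addVersionToHash(
    llvm::HashBuilder<llvm::MD5, llvm::support::endianness::little> &HB,
    const llvm::VersionTuple &VT) {
  // Dispatches to the addHash friend above; Major, Minor, Subminor and Build
  // are all folded into the running hash.
  HB.add(VT);
}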
/// Retrieve a string representation of the version number.
std::string getAsString() const;
diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h
index 323e6719645d..10d2389ee079 100644
--- a/llvm/include/llvm/Support/VirtualFileSystem.h
+++ b/llvm/include/llvm/Support/VirtualFileSystem.h
@@ -121,6 +121,14 @@ public:
/// Closes the file.
virtual std::error_code close() = 0;
+
+ // Get the same file with a different path.
+ static ErrorOr<std::unique_ptr<File>>
+ getWithPath(ErrorOr<std::unique_ptr<File>> Result, const Twine &P);
+
+protected:
+ // Set the file's underlying path.
+ virtual void setPath(const Twine &Path) {}
};
/// A member of a directory, yielded by a directory_iterator.
@@ -596,6 +604,17 @@ class RedirectingFileSystemParser;
/// contain multiple path components (e.g. /path/to/file). However, any
/// directory in such a path that contains more than one child must be uniquely
/// represented by a 'directory' entry.
+///
+/// When the 'use-external-name' field is set, calls to \a vfs::File::status()
+/// give the external (remapped) filesystem name instead of the name the file
+/// was accessed by. This is an intentional leak through the \a
+/// RedirectingFileSystem abstraction layer. It enables clients to discover
+/// (and use) the external file location when communicating with users or tools
+/// that don't use the same VFS overlay.
+///
+/// FIXME: 'use-external-name' causes behaviour that's inconsistent with how
+/// "real" filesystems behave. Maybe there should be a separate channel for
+/// this information.
class RedirectingFileSystem : public vfs::FileSystem {
public:
enum EntryKind { EK_Directory, EK_DirectoryRemap, EK_File };
@@ -746,6 +765,12 @@ private:
/// with the given error code on a path associated with the provided Entry.
bool shouldFallBackToExternalFS(std::error_code EC, Entry *E = nullptr) const;
+ /// Get the File status, or error, from the underlying external file system.
+ /// This returns the status with the originally requested name, while looking
+ /// up the entry using the canonical path.
+ ErrorOr<Status> getExternalStatus(const Twine &CanonicalPath,
+ const Twine &OriginalPath) const;
+
// In a RedirectingFileSystem, keys can be specified in Posix or Windows
// style (or even a mixture of both), so this comparison helper allows
// slashes (representing a root) to match backslashes (and vice versa). Note
@@ -777,12 +802,7 @@ private:
/// Whether to perform case-sensitive comparisons.
///
/// Currently, case-insensitive matching only works correctly with ASCII.
- bool CaseSensitive =
-#ifdef _WIN32
- false;
-#else
- true;
-#endif
+ bool CaseSensitive = is_style_posix(sys::path::Style::native);
/// IsRelativeOverlay marks whether a ExternalContentsPrefixDir path must
/// be prefixed in every 'external-contents' when reading from YAML files.
@@ -808,7 +828,8 @@ private:
Entry *From) const;
/// Get the status for a path with the provided \c LookupResult.
- ErrorOr<Status> status(const Twine &Path, const LookupResult &Result);
+ ErrorOr<Status> status(const Twine &CanonicalPath, const Twine &OriginalPath,
+ const LookupResult &Result);
public:
/// Looks up \p Path in \c Roots and returns a LookupResult giving the
diff --git a/llvm/include/llvm/Support/Windows/WindowsSupport.h b/llvm/include/llvm/Support/Windows/WindowsSupport.h
index a45eeaba4ad5..917822678e97 100644
--- a/llvm/include/llvm/Support/Windows/WindowsSupport.h
+++ b/llvm/include/llvm/Support/Windows/WindowsSupport.h
@@ -68,10 +68,10 @@ llvm::VersionTuple GetWindowsOSVersion();
bool MakeErrMsg(std::string *ErrMsg, const std::string &prefix);
// Include GetLastError() in a fatal error message.
-LLVM_ATTRIBUTE_NORETURN inline void ReportLastErrorFatal(const char *Msg) {
+[[noreturn]] inline void ReportLastErrorFatal(const char *Msg) {
std::string ErrMsg;
MakeErrMsg(&ErrMsg, Msg);
- llvm::report_fatal_error(ErrMsg);
+ llvm::report_fatal_error(Twine(ErrMsg));
}
template <typename HandleTraits>
diff --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
index 757a3c0c8a71..aca717a9f6cb 100644
--- a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
+++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
@@ -31,6 +31,8 @@ namespace X86Disassembler {
#define XOP9_MAP_SYM x86DisassemblerXOP9Opcodes
#define XOPA_MAP_SYM x86DisassemblerXOPAOpcodes
#define THREEDNOW_MAP_SYM x86Disassembler3DNowOpcodes
+#define MAP5_SYM x86DisassemblerMap5Opcodes
+#define MAP6_SYM x86DisassemblerMap6Opcodes
#define INSTRUCTIONS_STR "x86DisassemblerInstrSpecifiers"
#define CONTEXTS_STR "x86DisassemblerContexts"
@@ -42,6 +44,8 @@ namespace X86Disassembler {
#define XOP9_MAP_STR "x86DisassemblerXOP9Opcodes"
#define XOPA_MAP_STR "x86DisassemblerXOPAOpcodes"
#define THREEDNOW_MAP_STR "x86Disassembler3DNowOpcodes"
+#define MAP5_STR "x86DisassemblerMap5Opcodes"
+#define MAP6_STR "x86DisassemblerMap6Opcodes"
// Attributes of an instruction that must be known before the opcode can be
// processed correctly. Most of these indicate the presence of particular
@@ -292,7 +296,9 @@ enum OpcodeType {
XOP8_MAP = 4,
XOP9_MAP = 5,
XOPA_MAP = 6,
- THREEDNOW_MAP = 7
+ THREEDNOW_MAP = 7,
+ MAP5 = 8,
+ MAP6 = 9
};
// The following structs are used for the hierarchical decode table. After
diff --git a/llvm/include/llvm/Support/X86TargetParser.def b/llvm/include/llvm/Support/X86TargetParser.def
index ffcc2238e3ce..4443d822d3e8 100644
--- a/llvm/include/llvm/Support/X86TargetParser.def
+++ b/llvm/include/llvm/Support/X86TargetParser.def
@@ -91,54 +91,59 @@ X86_CPU_SUBTYPE(AMDFAM19H_ZNVER3, "znver3")
X86_CPU_SUBTYPE(INTEL_COREI7_ROCKETLAKE, "rocketlake")
#undef X86_CPU_SUBTYPE
-
-// This macro is used for cpu types present in compiler-rt/libgcc.
+// This macro is used for CPU types present in compiler-rt/libgcc. The third
+// parameter, PRIORITY, is the priority used by 'target' attribute checking. Note
+// that not all are supported/prioritized by GCC, so synchronization with GCC's
+// implementation may require changing some existing values.
+//
+// We cannot just re-sort the list though because its order is dictated by the
+// order of bits in CodeGenFunction::GetX86CpuSupportsMask.
#ifndef X86_FEATURE_COMPAT
-#define X86_FEATURE_COMPAT(ENUM, STR) X86_FEATURE(ENUM, STR)
+#define X86_FEATURE_COMPAT(ENUM, STR, PRIORITY) X86_FEATURE(ENUM, STR)
#endif
#ifndef X86_FEATURE
#define X86_FEATURE(ENUM, STR)
#endif
-X86_FEATURE_COMPAT(CMOV, "cmov")
-X86_FEATURE_COMPAT(MMX, "mmx")
-X86_FEATURE_COMPAT(POPCNT, "popcnt")
-X86_FEATURE_COMPAT(SSE, "sse")
-X86_FEATURE_COMPAT(SSE2, "sse2")
-X86_FEATURE_COMPAT(SSE3, "sse3")
-X86_FEATURE_COMPAT(SSSE3, "ssse3")
-X86_FEATURE_COMPAT(SSE4_1, "sse4.1")
-X86_FEATURE_COMPAT(SSE4_2, "sse4.2")
-X86_FEATURE_COMPAT(AVX, "avx")
-X86_FEATURE_COMPAT(AVX2, "avx2")
-X86_FEATURE_COMPAT(SSE4_A, "sse4a")
-X86_FEATURE_COMPAT(FMA4, "fma4")
-X86_FEATURE_COMPAT(XOP, "xop")
-X86_FEATURE_COMPAT(FMA, "fma")
-X86_FEATURE_COMPAT(AVX512F, "avx512f")
-X86_FEATURE_COMPAT(BMI, "bmi")
-X86_FEATURE_COMPAT(BMI2, "bmi2")
-X86_FEATURE_COMPAT(AES, "aes")
-X86_FEATURE_COMPAT(PCLMUL, "pclmul")
-X86_FEATURE_COMPAT(AVX512VL, "avx512vl")
-X86_FEATURE_COMPAT(AVX512BW, "avx512bw")
-X86_FEATURE_COMPAT(AVX512DQ, "avx512dq")
-X86_FEATURE_COMPAT(AVX512CD, "avx512cd")
-X86_FEATURE_COMPAT(AVX512ER, "avx512er")
-X86_FEATURE_COMPAT(AVX512PF, "avx512pf")
-X86_FEATURE_COMPAT(AVX512VBMI, "avx512vbmi")
-X86_FEATURE_COMPAT(AVX512IFMA, "avx512ifma")
-X86_FEATURE_COMPAT(AVX5124VNNIW, "avx5124vnniw")
-X86_FEATURE_COMPAT(AVX5124FMAPS, "avx5124fmaps")
-X86_FEATURE_COMPAT(AVX512VPOPCNTDQ, "avx512vpopcntdq")
-X86_FEATURE_COMPAT(AVX512VBMI2, "avx512vbmi2")
-X86_FEATURE_COMPAT(GFNI, "gfni")
-X86_FEATURE_COMPAT(VPCLMULQDQ, "vpclmulqdq")
-X86_FEATURE_COMPAT(AVX512VNNI, "avx512vnni")
-X86_FEATURE_COMPAT(AVX512BITALG, "avx512bitalg")
-X86_FEATURE_COMPAT(AVX512BF16, "avx512bf16")
-X86_FEATURE_COMPAT(AVX512VP2INTERSECT, "avx512vp2intersect")
+X86_FEATURE_COMPAT(CMOV, "cmov", 0)
+X86_FEATURE_COMPAT(MMX, "mmx", 1)
+X86_FEATURE_COMPAT(POPCNT, "popcnt", 9)
+X86_FEATURE_COMPAT(SSE, "sse", 2)
+X86_FEATURE_COMPAT(SSE2, "sse2", 3)
+X86_FEATURE_COMPAT(SSE3, "sse3", 4)
+X86_FEATURE_COMPAT(SSSE3, "ssse3", 5)
+X86_FEATURE_COMPAT(SSE4_1, "sse4.1", 7)
+X86_FEATURE_COMPAT(SSE4_2, "sse4.2", 8)
+X86_FEATURE_COMPAT(AVX, "avx", 12)
+X86_FEATURE_COMPAT(AVX2, "avx2", 18)
+X86_FEATURE_COMPAT(SSE4_A, "sse4a", 6)
+X86_FEATURE_COMPAT(FMA4, "fma4", 14)
+X86_FEATURE_COMPAT(XOP, "xop", 15)
+X86_FEATURE_COMPAT(FMA, "fma", 16)
+X86_FEATURE_COMPAT(AVX512F, "avx512f", 19)
+X86_FEATURE_COMPAT(BMI, "bmi", 13)
+X86_FEATURE_COMPAT(BMI2, "bmi2", 17)
+X86_FEATURE_COMPAT(AES, "aes", 10)
+X86_FEATURE_COMPAT(PCLMUL, "pclmul", 11)
+X86_FEATURE_COMPAT(AVX512VL, "avx512vl", 20)
+X86_FEATURE_COMPAT(AVX512BW, "avx512bw", 21)
+X86_FEATURE_COMPAT(AVX512DQ, "avx512dq", 22)
+X86_FEATURE_COMPAT(AVX512CD, "avx512cd", 23)
+X86_FEATURE_COMPAT(AVX512ER, "avx512er", 24)
+X86_FEATURE_COMPAT(AVX512PF, "avx512pf", 25)
+X86_FEATURE_COMPAT(AVX512VBMI, "avx512vbmi", 26)
+X86_FEATURE_COMPAT(AVX512IFMA, "avx512ifma", 27)
+X86_FEATURE_COMPAT(AVX5124VNNIW, "avx5124vnniw", 28)
+X86_FEATURE_COMPAT(AVX5124FMAPS, "avx5124fmaps", 29)
+X86_FEATURE_COMPAT(AVX512VPOPCNTDQ, "avx512vpopcntdq", 30)
+X86_FEATURE_COMPAT(AVX512VBMI2, "avx512vbmi2", 31)
+X86_FEATURE_COMPAT(GFNI, "gfni", 32)
+X86_FEATURE_COMPAT(VPCLMULQDQ, "vpclmulqdq", 33)
+X86_FEATURE_COMPAT(AVX512VNNI, "avx512vnni", 34)
+X86_FEATURE_COMPAT(AVX512BITALG, "avx512bitalg", 35)
+X86_FEATURE_COMPAT(AVX512BF16, "avx512bf16", 36)
+X86_FEATURE_COMPAT(AVX512VP2INTERSECT, "avx512vp2intersect", 37)
// Features below here are not in libgcc/compiler-rt.
X86_FEATURE (3DNOW, "3dnow")
X86_FEATURE (3DNOWA, "3dnowa")
@@ -153,6 +158,7 @@ X86_FEATURE (CLWB, "clwb")
X86_FEATURE (CLZERO, "clzero")
X86_FEATURE (CMPXCHG16B, "cx16")
X86_FEATURE (CMPXCHG8B, "cx8")
+X86_FEATURE (CRC32, "crc32")
X86_FEATURE (ENQCMD, "enqcmd")
X86_FEATURE (F16C, "f16c")
X86_FEATURE (FSGSBASE, "fsgsbase")
@@ -193,6 +199,7 @@ X86_FEATURE (XSAVEC, "xsavec")
X86_FEATURE (XSAVEOPT, "xsaveopt")
X86_FEATURE (XSAVES, "xsaves")
X86_FEATURE (HRESET, "hreset")
+X86_FEATURE (AVX512FP16, "avx512fp16")
X86_FEATURE (AVXVNNI, "avxvnni")
// These features aren't really CPU features, but the frontend can set them.
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk")
@@ -202,3 +209,49 @@ X86_FEATURE (LVI_CFI, "lvi-cfi")
X86_FEATURE (LVI_LOAD_HARDENING, "lvi-load-hardening")
#undef X86_FEATURE_COMPAT
#undef X86_FEATURE
+
+#ifndef CPU_SPECIFIC
+#define CPU_SPECIFIC(NAME, MANGLING, FEATURES)
+#endif
+
+#ifndef CPU_SPECIFIC_ALIAS
+#define CPU_SPECIFIC_ALIAS(NEW_NAME, NAME)
+#endif
+
+CPU_SPECIFIC("generic", 'A', "")
+CPU_SPECIFIC("pentium", 'B', "")
+CPU_SPECIFIC("pentium_pro", 'C', "+cmov")
+CPU_SPECIFIC("pentium_mmx", 'D', "+mmx")
+CPU_SPECIFIC("pentium_ii", 'E', "+cmov,+mmx")
+CPU_SPECIFIC("pentium_iii", 'H', "+cmov,+mmx,+sse")
+CPU_SPECIFIC_ALIAS("pentium_iii_no_xmm_regs", "pentium_iii")
+CPU_SPECIFIC("pentium_4", 'J', "+cmov,+mmx,+sse,+sse2")
+CPU_SPECIFIC("pentium_m", 'K', "+cmov,+mmx,+sse,+sse2")
+CPU_SPECIFIC("pentium_4_sse3", 'L', "+cmov,+mmx,+sse,+sse2,+sse3")
+CPU_SPECIFIC("core_2_duo_ssse3", 'M', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3")
+CPU_SPECIFIC("core_2_duo_sse4_1", 'N', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1")
+CPU_SPECIFIC("atom", 'O', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+movbe")
+CPU_SPECIFIC("atom_sse4_2", 'c', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt")
+CPU_SPECIFIC("core_i7_sse4_2", 'P', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt")
+CPU_SPECIFIC("core_aes_pclmulqdq", 'Q', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt")
+CPU_SPECIFIC("atom_sse4_2_movbe", 'd', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt")
+CPU_SPECIFIC("goldmont", 'i', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt")
+CPU_SPECIFIC("sandybridge", 'R', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt,+avx")
+CPU_SPECIFIC_ALIAS("core_2nd_gen_avx", "sandybridge")
+CPU_SPECIFIC("ivybridge", 'S', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+popcnt,+f16c,+avx")
+CPU_SPECIFIC_ALIAS("core_3rd_gen_avx", "ivybridge")
+CPU_SPECIFIC("haswell", 'V', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2")
+CPU_SPECIFIC_ALIAS("core_4th_gen_avx", "haswell")
+CPU_SPECIFIC("core_4th_gen_avx_tsx", 'W', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2")
+CPU_SPECIFIC("broadwell", 'X', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+adx")
+CPU_SPECIFIC_ALIAS("core_5th_gen_avx", "broadwell")
+CPU_SPECIFIC("core_5th_gen_avx_tsx", 'Y', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+adx")
+CPU_SPECIFIC("knl", 'Z', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+avx512f,+adx,+avx512er,+avx512pf,+avx512cd")
+CPU_SPECIFIC_ALIAS("mic_avx512", "knl")
+CPU_SPECIFIC("skylake", 'b', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+adx,+mpx")
+CPU_SPECIFIC("skylake_avx512", 'a', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+avx512dq,+avx512f,+adx,+avx512cd,+avx512bw,+avx512vl,+clwb")
+CPU_SPECIFIC("cannonlake", 'e', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+avx512dq,+avx512f,+adx,+avx512ifma,+avx512cd,+avx512bw,+avx512vl,+avx512vbmi")
+CPU_SPECIFIC("knm", 'j', "+cmov,+mmx,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+movbe,+popcnt,+f16c,+avx,+fma,+bmi,+lzcnt,+avx2,+avx512f,+adx,+avx512er,+avx512pf,+avx512cd,+avx5124fmaps,+avx5124vnniw,+avx512vpopcntdq")
+
+#undef CPU_SPECIFIC_ALIAS
+#undef CPU_SPECIFIC
diff --git a/llvm/include/llvm/Support/X86TargetParser.h b/llvm/include/llvm/Support/X86TargetParser.h
index ed02066933a7..bfa3e23dbd9d 100644
--- a/llvm/include/llvm/Support/X86TargetParser.h
+++ b/llvm/include/llvm/Support/X86TargetParser.h
@@ -13,6 +13,7 @@
#ifndef LLVM_SUPPORT_X86TARGETPARSER_H
#define LLVM_SUPPORT_X86TARGETPARSER_H
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
@@ -154,6 +155,9 @@ void getFeaturesForCPU(StringRef CPU, SmallVectorImpl<StringRef> &Features);
void updateImpliedFeatures(StringRef Feature, bool Enabled,
StringMap<bool> &Features);
+uint64_t getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs);
+unsigned getFeaturePriority(ProcessorFeatures Feat);
+
} // namespace X86
} // namespace llvm
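Illustrative calls into the two new helpers; the feature strings mirror the X86_FEATURE_COMPAT entries in X86TargetParser.def, and FEATURE_AVX2 is assumed to be the matching ProcessorFeatures enumerator:

#include "llvm/Support/X86TargetParser.h"

static void queryX86Features() {
  uint64_t Mask = llvm::X86::getCpuSupportsMask({"avx2", "sse4.2"});
  unsigned Prio = llvm::X86::getFeaturePriority(llvm::X86::FEATURE_AVX2);
  (void)Mask;
  (void)Prio;
}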
diff --git a/llvm/include/llvm/Support/YAMLTraits.h b/llvm/include/llvm/Support/YAMLTraits.h
index 9ac9eb300983..bea232e6e000 100644
--- a/llvm/include/llvm/Support/YAMLTraits.h
+++ b/llvm/include/llvm/Support/YAMLTraits.h
@@ -1641,7 +1641,7 @@ void IO::processKeyWithDefault(const char *Key, Optional<T> &Val,
// usually None.
bool IsNone = false;
if (!outputting())
- if (auto *Node = dyn_cast<ScalarNode>(((Input *)this)->getCurrentNode()))
+ if (const auto *Node = dyn_cast<ScalarNode>(((Input *)this)->getCurrentNode()))
// We use rtrim to ignore possible white spaces that might exist when a
// comment is present on the same line.
IsNone = Node->getRawValue().rtrim(' ') == "<none>";
diff --git a/llvm/include/llvm/Support/raw_ostream.h b/llvm/include/llvm/Support/raw_ostream.h
index c669c2babad9..98c26ef0b1e5 100644
--- a/llvm/include/llvm/Support/raw_ostream.h
+++ b/llvm/include/llvm/Support/raw_ostream.h
@@ -330,6 +330,8 @@ public:
// changeColor() has no effect until enable_colors(true) is called.
virtual void enable_colors(bool enable) { ColorEnabled = enable; }
+ bool colors_enabled() const { return ColorEnabled; }
+
/// Tie this stream to the specified stream. Replaces any existing tied-to
/// stream. Specifying a nullptr unties the stream.
void tie(raw_ostream *TieTo) { TiedStream = TieTo; }
@@ -719,7 +721,11 @@ class buffer_unique_ostream : public raw_svector_ostream {
public:
buffer_unique_ostream(std::unique_ptr<raw_ostream> OS)
- : raw_svector_ostream(Buffer), OS(std::move(OS)) {}
+ : raw_svector_ostream(Buffer), OS(std::move(OS)) {
+ // Turn off buffering on OS, which we now own, to avoid allocating a buffer
+ // when the destructor writes only to be immediately flushed again.
+ this->OS->SetUnbuffered();
+ }
~buffer_unique_ostream() override { *OS << str(); }
};
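Sketch of a client checking the new colors_enabled() accessor before emitting color changes; assumes colors were turned on earlier via enable_colors():

static void emitColoredError(llvm::raw_ostream &OS) {
  if (OS.colors_enabled())
    OS.changeColor(llvm::raw_ostream::RED, /*Bold=*/true);
  OS << "error: something went wrong\n";
  if (OS.colors_enabled())
    OS.resetColor();
}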
diff --git a/llvm/include/llvm/TableGen/DirectiveEmitter.h b/llvm/include/llvm/TableGen/DirectiveEmitter.h
index 5c4a736eb107..d73b9ae49235 100644
--- a/llvm/include/llvm/TableGen/DirectiveEmitter.h
+++ b/llvm/include/llvm/TableGen/DirectiveEmitter.h
@@ -152,7 +152,7 @@ public:
}
return C;
});
- N.erase(std::remove(N.begin(), N.end(), '_'), N.end());
+ llvm::erase_value(N, '_');
return N;
}
diff --git a/llvm/include/llvm/TableGen/Error.h b/llvm/include/llvm/TableGen/Error.h
index a0e23aca211e..da0132b10f4f 100644
--- a/llvm/include/llvm/TableGen/Error.h
+++ b/llvm/include/llvm/TableGen/Error.h
@@ -22,13 +22,10 @@ namespace llvm {
void PrintNote(const Twine &Msg);
void PrintNote(ArrayRef<SMLoc> NoteLoc, const Twine &Msg);
-LLVM_ATTRIBUTE_NORETURN void PrintFatalNote(const Twine &Msg);
-LLVM_ATTRIBUTE_NORETURN void PrintFatalNote(ArrayRef<SMLoc> ErrorLoc,
- const Twine &Msg);
-LLVM_ATTRIBUTE_NORETURN void PrintFatalNote(const Record *Rec,
- const Twine &Msg);
-LLVM_ATTRIBUTE_NORETURN void PrintFatalNote(const RecordVal *RecVal,
- const Twine &Msg);
+[[noreturn]] void PrintFatalNote(const Twine &Msg);
+[[noreturn]] void PrintFatalNote(ArrayRef<SMLoc> ErrorLoc, const Twine &Msg);
+[[noreturn]] void PrintFatalNote(const Record *Rec, const Twine &Msg);
+[[noreturn]] void PrintFatalNote(const RecordVal *RecVal, const Twine &Msg);
void PrintWarning(const Twine &Msg);
void PrintWarning(ArrayRef<SMLoc> WarningLoc, const Twine &Msg);
@@ -40,13 +37,10 @@ void PrintError(const char *Loc, const Twine &Msg);
void PrintError(const Record *Rec, const Twine &Msg);
void PrintError(const RecordVal *RecVal, const Twine &Msg);
-LLVM_ATTRIBUTE_NORETURN void PrintFatalError(const Twine &Msg);
-LLVM_ATTRIBUTE_NORETURN void PrintFatalError(ArrayRef<SMLoc> ErrorLoc,
- const Twine &Msg);
-LLVM_ATTRIBUTE_NORETURN void PrintFatalError(const Record *Rec,
- const Twine &Msg);
-LLVM_ATTRIBUTE_NORETURN void PrintFatalError(const RecordVal *RecVal,
- const Twine &Msg);
+[[noreturn]] void PrintFatalError(const Twine &Msg);
+[[noreturn]] void PrintFatalError(ArrayRef<SMLoc> ErrorLoc, const Twine &Msg);
+[[noreturn]] void PrintFatalError(const Record *Rec, const Twine &Msg);
+[[noreturn]] void PrintFatalError(const RecordVal *RecVal, const Twine &Msg);
void CheckAssert(SMLoc Loc, Init *Condition, Init *Message);
diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h
index 713d9375448c..5869a5cf0423 100644
--- a/llvm/include/llvm/TableGen/Record.h
+++ b/llvm/include/llvm/TableGen/Record.h
@@ -39,6 +39,9 @@
#include <vector>
namespace llvm {
+namespace detail {
+struct RecordContext;
+} // namespace detail
class ListRecTy;
struct MultiClass;
@@ -100,7 +103,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const RecTy &Ty) {
/// 'bit' - Represent a single bit
class BitRecTy : public RecTy {
- static BitRecTy Shared;
+ friend detail::RecordContext;
BitRecTy() : RecTy(BitRecTyKind) {}
@@ -109,7 +112,7 @@ public:
return RT->getRecTyKind() == BitRecTyKind;
}
- static BitRecTy *get() { return &Shared; }
+ static BitRecTy *get();
std::string getAsString() const override { return "bit"; }
@@ -140,7 +143,7 @@ public:
/// 'int' - Represent an integer value of no particular size
class IntRecTy : public RecTy {
- static IntRecTy Shared;
+ friend detail::RecordContext;
IntRecTy() : RecTy(IntRecTyKind) {}
@@ -149,7 +152,7 @@ public:
return RT->getRecTyKind() == IntRecTyKind;
}
- static IntRecTy *get() { return &Shared; }
+ static IntRecTy *get();
std::string getAsString() const override { return "int"; }
@@ -158,7 +161,7 @@ public:
/// 'string' - Represent an string value
class StringRecTy : public RecTy {
- static StringRecTy Shared;
+ friend detail::RecordContext;
StringRecTy() : RecTy(StringRecTyKind) {}
@@ -167,7 +170,7 @@ public:
return RT->getRecTyKind() == StringRecTyKind;
}
- static StringRecTy *get() { return &Shared; }
+ static StringRecTy *get();
std::string getAsString() const override;
@@ -200,7 +203,7 @@ public:
/// 'dag' - Represent a dag fragment
class DagRecTy : public RecTy {
- static DagRecTy Shared;
+ friend detail::RecordContext;
DagRecTy() : RecTy(DagRecTyKind) {}
@@ -209,7 +212,7 @@ public:
return RT->getRecTyKind() == DagRecTyKind;
}
- static DagRecTy *get() { return &Shared; }
+ static DagRecTy *get();
std::string getAsString() const override;
};
@@ -221,6 +224,7 @@ public:
class RecordRecTy final : public RecTy, public FoldingSetNode,
public TrailingObjects<RecordRecTy, Record *> {
friend class Record;
+ friend detail::RecordContext;
unsigned NumClasses;
@@ -437,6 +441,8 @@ public:
/// '?' - Represents an uninitialized value.
class UnsetInit : public Init {
+ friend detail::RecordContext;
+
UnsetInit() : Init(IK_UnsetInit) {}
public:
@@ -468,9 +474,11 @@ public:
/// 'true'/'false' - Represent a concrete initializer for a bit.
class BitInit final : public TypedInit {
+ friend detail::RecordContext;
+
bool Value;
- explicit BitInit(bool V) : TypedInit(IK_BitInit, BitRecTy::get()), Value(V) {}
+ explicit BitInit(bool V, RecTy *T) : TypedInit(IK_BitInit, T), Value(V) {}
public:
BitInit(const BitInit &) = delete;
@@ -637,7 +645,7 @@ public:
}
StringRef getValue() const { return Value; }
- StringFormat getFormat() const { return Format; }
+ StringFormat getFormat() const { return Format; }
bool hasCodeFormat() const { return Format == SF_Code; }
Init *convertInitializerTo(RecTy *Ty) const override;
@@ -1414,6 +1422,7 @@ private:
SMLoc Loc; // Source location of definition of name.
PointerIntPair<RecTy *, 2, FieldKind> TyAndKind;
Init *Value;
+ bool IsUsed = false;
public:
RecordVal(Init *N, RecTy *T, FieldKind K);
@@ -1458,6 +1467,11 @@ public:
/// Set the value and source location of the field.
bool setValue(Init *V, SMLoc NewLoc);
+ /// Whether this value is used. Useful for reporting warnings, for example
+ /// when a template argument is unused.
+ void setUsed(bool Used) { IsUsed = Used; }
+ bool isUsed() const { return IsUsed; }
+
void dump() const;
/// Print the value to an output stream, possibly with a semicolon.
@@ -1483,8 +1497,6 @@ public:
};
private:
- static unsigned LastID;
-
Init *Name;
// Location where record was instantiated, followed by the location of
// multiclass prototypes used.
@@ -1515,8 +1527,8 @@ public:
// Constructs a record.
explicit Record(Init *N, ArrayRef<SMLoc> locs, RecordKeeper &records,
bool Anonymous = false, bool Class = false)
- : Name(N), Locs(locs.begin(), locs.end()), TrackedRecords(records),
- ID(LastID++), IsAnonymous(Anonymous), IsClass(Class) {
+ : Name(N), Locs(locs.begin(), locs.end()), TrackedRecords(records),
+ ID(getNewUID()), IsAnonymous(Anonymous), IsClass(Class) {
checkName();
}
@@ -1528,12 +1540,12 @@ public:
// ID number. Don't copy CorrespondingDefInit either, since it's owned by the
// original record. All other fields can be copied normally.
Record(const Record &O)
- : Name(O.Name), Locs(O.Locs), TemplateArgs(O.TemplateArgs),
- Values(O.Values), Assertions(O.Assertions), SuperClasses(O.SuperClasses),
- TrackedRecords(O.TrackedRecords), ID(LastID++),
- IsAnonymous(O.IsAnonymous), IsClass(O.IsClass) { }
+ : Name(O.Name), Locs(O.Locs), TemplateArgs(O.TemplateArgs),
+ Values(O.Values), Assertions(O.Assertions),
+ SuperClasses(O.SuperClasses), TrackedRecords(O.TrackedRecords),
+ ID(getNewUID()), IsAnonymous(O.IsAnonymous), IsClass(O.IsClass) {}
- static unsigned getNewUID() { return LastID++; }
+ static unsigned getNewUID();
unsigned getID() const { return ID; }
@@ -1632,6 +1644,7 @@ public:
}
void checkRecordAssertions();
+ void checkUnusedTemplateArgs();
bool isSubClassOf(const Record *R) const {
for (const auto &SCPair : SuperClasses)
diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td
index e3e1d5fc3c65..72c974834a2f 100644
--- a/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/llvm/include/llvm/Target/GenericOpcodes.td
@@ -225,6 +225,18 @@ def G_FREEZE : GenericInstruction {
let hasSideEffects = false;
}
+def G_LROUND: GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$src);
+ let hasSideEffects = false;
+}
+
+def G_LLROUND: GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$src);
+ let hasSideEffects = false;
+}
+
//------------------------------------------------------------------------------
// Binary ops.
//------------------------------------------------------------------------------
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index f35156d59849..e2d3dbdda88a 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -130,7 +130,13 @@ def extending_loads : GICombineRule<
(match (wip_match_opcode G_LOAD, G_SEXTLOAD, G_ZEXTLOAD):$root,
[{ return Helper.matchCombineExtendingLoads(*${root}, ${matchinfo}); }]),
(apply [{ Helper.applyCombineExtendingLoads(*${root}, ${matchinfo}); }])>;
-def combines_for_extload: GICombineGroup<[extending_loads]>;
+
+def load_and_mask : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_AND):$root,
+ [{ return Helper.matchCombineLoadWithAndMask(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+def combines_for_extload: GICombineGroup<[extending_loads, load_and_mask]>;
def sext_trunc_sextload : GICombineRule<
(defs root:$d),
@@ -197,6 +203,12 @@ def reduce_shl_of_extend : GICombineRule<
[{ return Helper.matchCombineShlOfExtend(*${mi}, ${matchinfo}); }]),
(apply [{ Helper.applyCombineShlOfExtend(*${mi}, ${matchinfo}); }])>;
+def narrow_binop_feeding_and : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_AND):$root,
+ [{ return Helper.matchNarrowBinopFeedingAnd(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>;
+
// [us]itofp(undef) = 0, because the result value is bounded.
def undef_to_fp_zero : GICombineRule<
(defs root:$root),
@@ -275,7 +287,7 @@ def select_constant_cmp: GICombineRule<
def right_identity_zero: GICombineRule<
(defs root:$root),
(match (wip_match_opcode G_SUB, G_ADD, G_OR, G_XOR, G_SHL, G_ASHR, G_LSHR,
- G_PTR_ADD):$root,
+ G_PTR_ADD, G_ROTL, G_ROTR):$root,
[{ return Helper.matchConstantOp(${root}->getOperand(2), 0); }]),
(apply [{ return Helper.replaceSingleDefInstWithOperand(*${root}, 1); }])
>;
@@ -507,6 +519,13 @@ def fabs_fabs_fold: GICombineRule<
(apply [{ return Helper.replaceSingleDefInstWithReg(*${root}, ${matchinfo}); }])
>;
+// Fold (fabs (fneg x)) -> (fabs x).
+def fabs_fneg_fold: GICombineRule <
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_FABS):$root,
+ [{ return Helper.matchCombineFAbsOfFNeg(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>;
+
// Fold (unmerge cst) -> cst1, cst2, ...
def unmerge_cst_matchinfo : GIDefMatchData<"SmallVector<APInt, 8>">;
def unmerge_cst : GICombineRule<
@@ -588,6 +607,14 @@ def load_or_combine : GICombineRule<
[{ return Helper.matchLoadOrCombine(*${root}, ${info}); }]),
(apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>;
+
+def truncstore_merge_matcdata : GIDefMatchData<"MergeTruncStoresInfo">;
+def truncstore_merge : GICombineRule<
+ (defs root:$root, truncstore_merge_matcdata:$info),
+ (match (wip_match_opcode G_STORE):$root,
+ [{ return Helper.matchTruncStoreMerge(*${root}, ${info}); }]),
+ (apply [{ Helper.applyTruncStoreMerge(*${root}, ${info}); }])>;
+
def extend_through_phis_matchdata: GIDefMatchData<"MachineInstr*">;
def extend_through_phis : GICombineRule<
(defs root:$root, extend_through_phis_matchdata:$matchinfo),
@@ -638,6 +665,18 @@ def icmp_to_true_false_known_bits : GICombineRule<
[{ return Helper.matchICmpToTrueFalseKnownBits(*${d}, ${matchinfo}); }]),
(apply [{ Helper.replaceInstWithConstant(*${d}, ${matchinfo}); }])>;
+def icmp_to_lhs_known_bits : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$info),
+ (match (wip_match_opcode G_ICMP):$root,
+ [{ return Helper.matchICmpToLHSKnownBits(*${root}, ${info}); }]),
+ (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>;
+
+def and_or_disjoint_mask : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$info),
+ (match (wip_match_opcode G_AND):$root,
+ [{ return Helper.matchAndOrDisjointMask(*${root}, ${info}); }]),
+ (apply [{ Helper.applyBuildFnNoErase(*${root}, ${info}); }])>;
+
def bitfield_extract_from_and : GICombineRule<
(defs root:$root, build_fn_matchinfo:$info),
(match (wip_match_opcode G_AND):$root,
@@ -652,8 +691,31 @@ def bitfield_extract_from_sext_inreg : GICombineRule<
[{ return Helper.matchBitfieldExtractFromSExtInReg(*${root}, ${info}); }]),
(apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>;
+def bitfield_extract_from_shr : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$info),
+ (match (wip_match_opcode G_ASHR, G_LSHR):$root,
+ [{ return Helper.matchBitfieldExtractFromShr(*${root}, ${info}); }]),
+ (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>;
+
+def bitfield_extract_from_shr_and : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$info),
+ (match (wip_match_opcode G_ASHR, G_LSHR):$root,
+ [{ return Helper.matchBitfieldExtractFromShrAnd(*${root}, ${info}); }]),
+ (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>;
+
def form_bitfield_extract : GICombineGroup<[bitfield_extract_from_sext_inreg,
- bitfield_extract_from_and]>;
+ bitfield_extract_from_and,
+ bitfield_extract_from_shr,
+ bitfield_extract_from_shr_and]>;
+
+def udiv_by_const : GICombineRule<
+ (defs root:$root),
+ (match (wip_match_opcode G_UDIV):$root,
+ [{ return Helper.matchUDivByConst(*${root}); }]),
+ (apply [{ Helper.applyUDivByConst(*${root}); }])>;
+
+def intdiv_combines : GICombineGroup<[udiv_by_const]>;
+
def reassoc_ptradd : GICombineRule<
(defs root:$root, build_fn_matchinfo:$matchinfo),
(match (wip_match_opcode G_PTR_ADD):$root,
@@ -669,6 +731,26 @@ def constant_fold : GICombineRule<
[{ return Helper.matchConstantFold(*${d}, ${matchinfo}); }]),
(apply [{ Helper.replaceInstWithConstant(*${d}, ${matchinfo}); }])>;
+def mulo_by_2: GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_UMULO, G_SMULO):$root,
+ [{ return Helper.matchMulOBy2(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>;
+
+def mulh_to_lshr : GICombineRule<
+ (defs root:$root),
+ (match (wip_match_opcode G_UMULH):$root,
+ [{ return Helper.matchUMulHToLShr(*${root}); }]),
+ (apply [{ Helper.applyUMulHToLShr(*${root}); }])>;
+
+def mulh_combines : GICombineGroup<[mulh_to_lshr]>;
+
+def redundant_neg_operands: GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMAD, G_FMA):$root,
+ [{ return Helper.matchRedundantNegOperands(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>;
+
// FIXME: These should use the custom predicate feature once it lands.
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
undef_to_negative_one,
@@ -685,13 +767,14 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero,
fneg_fneg_fold, right_identity_one]>;
def const_combines : GICombineGroup<[constant_fp_op, const_ptradd_to_i2p,
- overlapping_and]>;
+ overlapping_and, mulo_by_2]>;
def known_bits_simplifications : GICombineGroup<[
redundant_and, redundant_sext_inreg, redundant_or, urem_pow2_to_mask,
- zext_trunc_fold, icmp_to_true_false_known_bits]>;
+ zext_trunc_fold, icmp_to_true_false_known_bits, icmp_to_lhs_known_bits]>;
-def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend]>;
+def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend,
+ narrow_binop_feeding_and]>;
def phi_combines : GICombineGroup<[extend_through_phis]>;
@@ -713,8 +796,10 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
unmerge_zext_to_zext, merge_unmerge, trunc_ext_fold, trunc_shl,
const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine,
- div_rem_to_divrem, funnel_shift_combines, form_bitfield_extract,
- constant_fold]>;
+ truncstore_merge, div_rem_to_divrem, funnel_shift_combines,
+ form_bitfield_extract, constant_fold, fabs_fneg_fold,
+ intdiv_combines, mulh_combines, redundant_neg_operands,
+ and_or_disjoint_mask ]>;
// A combine group used to for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and
diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
index 8a5052401e9b..12eee24b578f 100644
--- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
+++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
@@ -144,6 +144,8 @@ def : GINodeEquiv<G_FMAXNUM_IEEE, fmaxnum_ieee>;
def : GINodeEquiv<G_READCYCLECOUNTER, readcyclecounter>;
def : GINodeEquiv<G_ROTR, rotr>;
def : GINodeEquiv<G_ROTL, rotl>;
+def : GINodeEquiv<G_LROUND, lround>;
+def : GINodeEquiv<G_LLROUND, llround>;
def : GINodeEquiv<G_STRICT_FADD, strict_fadd>;
def : GINodeEquiv<G_STRICT_FSUB, strict_fsub>;
diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td
index e9720d765167..7ae690b83770 100644
--- a/llvm/include/llvm/Target/Target.td
+++ b/llvm/include/llvm/Target/Target.td
@@ -306,6 +306,9 @@ class RegisterClass<string namespace, list<ValueType> regTypes, int alignment,
// the assembly matcher will provide a function to map from diagnostic types
// to message strings.
string DiagnosticString = "";
+
+ // Target-specific flags. This becomes the TSFlags field in TargetRegisterClass.
+ bits<8> TSFlags = 0;
}
// The memberList in a RegisterClass is a dag of set operations. TableGen
@@ -650,6 +653,25 @@ class Instruction : InstructionEncoding {
/// instruction selection predicates. FastISel cannot handle such cases, but
/// SelectionDAG can.
bit FastISelShouldIgnore = false;
+
+ /// HasPositionOrder: Tell TableGen to sort the instructions by record ID,
+ /// so that an instruction defined earlier is sorted earlier in the
+ /// assembly matching table.
+ bit HasPositionOrder = false;
+}
+
+/// Defines a Pat match between compressed and uncompressed instruction.
+/// The relationship and helper function generation are handled by
+/// CompressInstEmitter backend.
+class CompressPat<dag input, dag output, list<Predicate> predicates = []> {
+ /// Uncompressed instruction description.
+ dag Input = input;
+ /// Compressed instruction description.
+ dag Output = output;
+ /// Predicates that must be true for this to match.
+ list<Predicate> Predicates = predicates;
+ /// Duplicate the match when only the tied operand differs.
+ bit isCompressOnly = false;
}
/// Defines an additional encoding that disassembles to the given instruction
diff --git a/llvm/include/llvm/Target/TargetLoweringObjectFile.h b/llvm/include/llvm/Target/TargetLoweringObjectFile.h
index 93bfdd20e082..752032d3d04d 100644
--- a/llvm/include/llvm/Target/TargetLoweringObjectFile.h
+++ b/llvm/include/llvm/Target/TargetLoweringObjectFile.h
@@ -15,6 +15,7 @@
#define LLVM_TARGET_TARGETLOWERINGOBJECTFILE_H
#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCRegister.h"
#include <cstdint>
namespace llvm {
@@ -219,6 +220,14 @@ public:
return SupportDebugThreadLocalLocation;
}
+ /// Returns the register used as static base in RWPI variants.
+ virtual const MCRegister getStaticBase() const { return MCRegister::NoRegister; }
+
+ /// Get the target specific RWPI relocation.
+ virtual const MCExpr *getIndirectSymViaRWPI(const MCSymbol *Sym) const {
+ return nullptr;
+ }
+
/// Get the target specific PC relative GOT entry relocation
virtual const MCExpr *getIndirectSymViaGOTPCRel(const GlobalValue *GV,
const MCSymbol *Sym,
diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index dd17af4a642a..acfb265a9ff9 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -13,6 +13,7 @@
#ifndef LLVM_TARGET_TARGETMACHINE_H
#define LLVM_TARGET_TARGETMACHINE_H
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/IR/DataLayout.h"
@@ -20,9 +21,11 @@
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Error.h"
+#include "llvm/Support/PGOOptions.h"
#include "llvm/Target/CGPassBuilderOption.h"
#include "llvm/Target/TargetOptions.h"
#include <string>
+#include <utility>
namespace llvm {
@@ -110,6 +113,9 @@ protected: // Can only create subclasses.
unsigned RequireStructuredCFG : 1;
unsigned O0WantsFastISel : 1;
+ // PGO related tunables.
+ Optional<PGOOptions> PGOOption = None;
+
public:
const TargetOptions DefaultOptions;
mutable TargetOptions Options;
@@ -303,6 +309,9 @@ public:
return false;
}
+ void setPGOOption(Optional<PGOOptions> PGOOpt) { PGOOption = PGOOpt; }
+ const Optional<PGOOptions> &getPGOOption() const { return PGOOption; }
+
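A hedged sketch of the new PGO hook; how the Optional<PGOOptions> is built is a frontend concern and out of scope for this patch:

static void configurePGO(llvm::TargetMachine &TM,
                         llvm::Optional<llvm::PGOOptions> PGOOpt) {
  // Stash the frontend's PGO configuration on the TargetMachine.
  TM.setPGOOption(std::move(PGOOpt));
  // Later consumers read it back through the accessor.
  bool HasPGO = TM.getPGOOption().hasValue();
  (void)HasPGO;
}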
/// If the specified generic pointer could be assumed as a pointer to a
/// specific address space, return that address space.
///
@@ -311,6 +320,18 @@ public:
/// properties.
virtual unsigned getAssumedAddrSpace(const Value *V) const { return -1; }
+ /// If the specified predicate checks whether a generic pointer falls within
+ /// a specified address space, return that generic pointer and the address
+ /// space being queried.
+ ///
+ /// Such predicates could be specified in @llvm.assume intrinsics for the
+ /// optimizer to assume that the given generic pointer always falls within
+ /// the address space based on that predicate.
+ virtual std::pair<const Value *, unsigned>
+ getPredicatedAddrSpace(const Value *V) const {
+ return std::make_pair(nullptr, -1);
+ }
+
/// Get a \c TargetIRAnalysis appropriate for the target.
///
/// This is used to construct the new pass manager's target IR analysis pass,
@@ -464,6 +485,10 @@ public:
virtual bool useIPRA() const {
return false;
}
+
+ /// The default variant to use in unqualified `asm` instructions.
+ /// If this returns 0, `asm "$(foo$|bar$)"` will evaluate to `asm "foo"`.
+ virtual int unqualifiedInlineAsmVariant() const { return 0; }
};
/// Helper method for getting the code model, returning Default if
diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h
index e5bea9041479..912f6d1c153a 100644
--- a/llvm/include/llvm/Target/TargetOptions.h
+++ b/llvm/include/llvm/Target/TargetOptions.h
@@ -110,12 +110,23 @@ namespace llvm {
DisableWithDiag // Disable the abort but emit a diagnostic on failure.
};
+ /// Indicates when and how the Swift async frame pointer bit should be set.
+ enum class SwiftAsyncFramePointerMode {
+ /// Determine whether to set the bit statically or dynamically based
+ /// on the deployment target.
+ DeploymentBased,
+ /// Always set the bit.
+ Always,
+ /// Never set the bit.
+ Never,
+ };
+
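A hedged sketch of selecting the new mode; the SwiftAsyncFramePointer field it assigns is declared further down in this patch:

static llvm::TargetOptions makeOptions() {
  llvm::TargetOptions Opts;
  // Never set the Swift async frame pointer bit, regardless of deployment target.
  Opts.SwiftAsyncFramePointer = llvm::SwiftAsyncFramePointerMode::Never;
  return Opts;
}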
class TargetOptions {
public:
TargetOptions()
: UnsafeFPMath(false), NoInfsFPMath(false), NoNaNsFPMath(false),
NoTrappingFPMath(true), NoSignedZerosFPMath(false),
- EnableAIXExtendedAltivecABI(false),
+ ApproxFuncFPMath(false), EnableAIXExtendedAltivecABI(false),
HonorSignDependentRoundingFPMathOption(false), NoZerosInBSS(false),
GuaranteedTailCallOpt(false), StackSymbolOrdering(true),
EnableFastISel(false), EnableGlobalISel(false), UseInitArray(false),
@@ -129,7 +140,7 @@ namespace llvm {
EnableMachineFunctionSplitter(false), SupportsDefaultOutlining(false),
EmitAddrsig(false), EmitCallSiteInfo(false),
SupportsDebugEntryValues(false), EnableDebugEntryValues(false),
- PseudoProbeForProfiling(false), ValueTrackingVariableLocations(false),
+ ValueTrackingVariableLocations(false),
ForceDwarfFrameSection(false), XRayOmitFunctionIndex(false),
DebugStrictDwarf(false),
FPDenormalMode(DenormalMode::IEEE, DenormalMode::IEEE) {}
@@ -172,9 +183,15 @@ namespace llvm {
/// argument or result as insignificant.
unsigned NoSignedZerosFPMath : 1;
+ /// ApproxFuncFPMath - This flag is enabled when the
+ /// -enable-approx-func-fp-math option is specified on the command line. It
+ /// allows optimizations to substitute math functions with approximate
+ /// calculations.
+ unsigned ApproxFuncFPMath : 1;
+
/// EnableAIXExtendedAltivecABI - This flag returns true when -vec-extabi is
/// specified. The code generator is then able to use both volatile and
- /// nonvolitle vector regisers. When false, the code generator only uses
+ /// non-volatile vector registers. When false, the code generator only uses
/// volatile vector registers which is the default setting on AIX.
unsigned EnableAIXExtendedAltivecABI : 1;
@@ -219,6 +236,11 @@ namespace llvm {
/// selection fails to lower/select an instruction.
GlobalISelAbortMode GlobalISelAbort = GlobalISelAbortMode::Enable;
+ /// Control when and how the Swift async frame pointer bit should
+ /// be set.
+ SwiftAsyncFramePointerMode SwiftAsyncFramePointer =
+ SwiftAsyncFramePointerMode::Always;
+
/// UseInitArray - Use .init_array instead of .ctors for static
/// constructors.
unsigned UseInitArray : 1;
@@ -305,9 +327,6 @@ namespace llvm {
/// production.
bool ShouldEmitDebugEntryValues() const;
- /// Emit pseudo probes into the binary for sample profiling
- unsigned PseudoProbeForProfiling : 1;
-
// When set to true, use experimental new debug variable location tracking,
// which seeks to follow the values of variables rather than their location,
// post isel.
@@ -328,6 +347,9 @@ namespace llvm {
/// passed on the command line.
std::string StackUsageOutput;
+ /// If greater than 0, override TargetLoweringBase::PrefLoopAlignment.
+ unsigned LoopAlignment = 0;
+
/// FloatABIType - This setting is set when the -float-abi=xxx option is specified
/// on the command line. This setting may either be Default, Soft, or Hard.
/// Default selects the target's default behavior. Soft selects the ABI for
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 44ec2250a9c5..d8ef7c49a5f9 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -297,10 +297,6 @@ def SDTAtomicLoad : SDTypeProfile<1, 1, [
SDTCisInt<0>, SDTCisPtrTy<1>
]>;
-def SDTConvertOp : SDTypeProfile<1, 5, [ //cvtss, su, us, uu, ff, fs, fu, sf, su
- SDTCisVT<2, OtherVT>, SDTCisVT<3, OtherVT>, SDTCisPtrTy<4>, SDTCisPtrTy<5>
-]>;
-
class SDCallSeqStart<list<SDTypeConstraint> constraints> :
SDTypeProfile<0, 2, constraints>;
class SDCallSeqEnd<list<SDTypeConstraint> constraints> :
@@ -1050,6 +1046,10 @@ def extloadvi32 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
let IsLoad = true;
let ScalarMemoryVT = i32;
}
+def extloadvf16 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
+ let IsLoad = true;
+ let ScalarMemoryVT = f16;
+}
def extloadvf32 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
let IsLoad = true;
let ScalarMemoryVT = f32;
@@ -1472,7 +1472,7 @@ def any_fsetccs : PatFrags<(ops node:$lhs, node:$rhs, node:$pred),
[(strict_fsetccs node:$lhs, node:$rhs, node:$pred),
(setcc node:$lhs, node:$rhs, node:$pred)]>;
-multiclass binary_atomic_op_ord<SDNode atomic_op> {
+multiclass binary_atomic_op_ord {
def NAME#_monotonic : PatFrag<(ops node:$ptr, node:$val),
(!cast<SDPatternOperator>(NAME) node:$ptr, node:$val)> {
let IsAtomic = true;
@@ -1500,7 +1500,7 @@ multiclass binary_atomic_op_ord<SDNode atomic_op> {
}
}
-multiclass ternary_atomic_op_ord<SDNode atomic_op> {
+multiclass ternary_atomic_op_ord {
def NAME#_monotonic : PatFrag<(ops node:$ptr, node:$cmp, node:$val),
(!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val)> {
let IsAtomic = true;
@@ -1550,10 +1550,10 @@ multiclass binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
let MemoryVT = !if(IsInt, i64, f64);
}
- defm NAME#_8 : binary_atomic_op_ord<atomic_op>;
- defm NAME#_16 : binary_atomic_op_ord<atomic_op>;
- defm NAME#_32 : binary_atomic_op_ord<atomic_op>;
- defm NAME#_64 : binary_atomic_op_ord<atomic_op>;
+ defm NAME#_8 : binary_atomic_op_ord;
+ defm NAME#_16 : binary_atomic_op_ord;
+ defm NAME#_32 : binary_atomic_op_ord;
+ defm NAME#_64 : binary_atomic_op_ord;
}
multiclass ternary_atomic_op<SDNode atomic_op> {
@@ -1578,10 +1578,10 @@ multiclass ternary_atomic_op<SDNode atomic_op> {
let MemoryVT = i64;
}
- defm NAME#_8 : ternary_atomic_op_ord<atomic_op>;
- defm NAME#_16 : ternary_atomic_op_ord<atomic_op>;
- defm NAME#_32 : ternary_atomic_op_ord<atomic_op>;
- defm NAME#_64 : ternary_atomic_op_ord<atomic_op>;
+ defm NAME#_8 : ternary_atomic_op_ord;
+ defm NAME#_16 : ternary_atomic_op_ord;
+ defm NAME#_32 : ternary_atomic_op_ord;
+ defm NAME#_64 : ternary_atomic_op_ord;
}
defm atomic_load_add : binary_atomic_op<atomic_load_add>;
diff --git a/llvm/include/llvm/TextAPI/Architecture.h b/llvm/include/llvm/TextAPI/Architecture.h
index 3cd8a3a19e96..978359995074 100644
--- a/llvm/include/llvm/TextAPI/Architecture.h
+++ b/llvm/include/llvm/TextAPI/Architecture.h
@@ -10,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TEXTAPI_MACHO_ARCHITECTURE_H
-#define LLVM_TEXTAPI_MACHO_ARCHITECTURE_H
+#ifndef LLVM_TEXTAPI_ARCHITECTURE_H
+#define LLVM_TEXTAPI_ARCHITECTURE_H
#include <cstdint>
#include <utility>
@@ -54,4 +54,4 @@ raw_ostream &operator<<(raw_ostream &OS, Architecture Arch);
} // end namespace MachO.
} // end namespace llvm.
-#endif // LLVM_TEXTAPI_MACHO_ARCHITECTURE_H
+#endif // LLVM_TEXTAPI_ARCHITECTURE_H
diff --git a/llvm/include/llvm/TextAPI/ArchitectureSet.h b/llvm/include/llvm/TextAPI/ArchitectureSet.h
index e9b374e4f69f..f17cb74c9183 100644
--- a/llvm/include/llvm/TextAPI/ArchitectureSet.h
+++ b/llvm/include/llvm/TextAPI/ArchitectureSet.h
@@ -10,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TEXTAPI_MACHO_ARCHITECTURESET_H
-#define LLVM_TEXTAPI_MACHO_ARCHITECTURESET_H
+#ifndef LLVM_TEXTAPI_ARCHITECTURESET_H
+#define LLVM_TEXTAPI_ARCHITECTURESET_H
#include "llvm/TextAPI/Architecture.h"
#include <cstddef>
@@ -168,4 +168,4 @@ raw_ostream &operator<<(raw_ostream &OS, ArchitectureSet Set);
} // end namespace MachO.
} // end namespace llvm.
-#endif // LLVM_TEXTAPI_MACHO_ARCHITECTURESET_H
+#endif // LLVM_TEXTAPI_ARCHITECTURESET_H
diff --git a/llvm/include/llvm/TextAPI/InterfaceFile.h b/llvm/include/llvm/TextAPI/InterfaceFile.h
index d17c0c1c5b47..03a541454e1a 100644
--- a/llvm/include/llvm/TextAPI/InterfaceFile.h
+++ b/llvm/include/llvm/TextAPI/InterfaceFile.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TEXTAPI_MACHO_INTERFACEFILE_H
-#define LLVM_TEXTAPI_MACHO_INTERFACEFILE_H
+#ifndef LLVM_TEXTAPI_INTERFACEFILE_H
+#define LLVM_TEXTAPI_INTERFACEFILE_H
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/DenseMap.h"
@@ -445,7 +445,7 @@ bool operator==(const DenseMapBase<DerivedT, SymbolsMapKey, MachO::Symbol *,
KeyInfoT, BucketT> &RHS) {
if (LHS.size() != RHS.size())
return false;
- for (auto KV : LHS) {
+ for (const auto &KV : LHS) {
auto I = RHS.find(KV.first);
if (I == RHS.end() || *I->second != *KV.second)
return false;
@@ -456,4 +456,4 @@ bool operator==(const DenseMapBase<DerivedT, SymbolsMapKey, MachO::Symbol *,
} // end namespace MachO.
} // end namespace llvm.
-#endif // LLVM_TEXTAPI_MACHO_INTERFACEFILE_H
+#endif // LLVM_TEXTAPI_INTERFACEFILE_H
diff --git a/llvm/include/llvm/TextAPI/PackedVersion.h b/llvm/include/llvm/TextAPI/PackedVersion.h
index e3d2bd5ae2e5..24bec2ebe8fc 100644
--- a/llvm/include/llvm/TextAPI/PackedVersion.h
+++ b/llvm/include/llvm/TextAPI/PackedVersion.h
@@ -10,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TEXTAPI_MACHO_PACKEDVERSION_H
-#define LLVM_TEXTAPI_MACHO_PACKEDVERSION_H
+#ifndef LLVM_TEXTAPI_PACKEDVERSION_H
+#define LLVM_TEXTAPI_PACKEDVERSION_H
#include <cstdint>
#include <utility>
@@ -64,4 +64,4 @@ inline raw_ostream &operator<<(raw_ostream &OS, const PackedVersion &Version) {
} // end namespace MachO.
} // end namespace llvm.
-#endif // LLVM_TEXTAPI_MACHO_PACKEDVERSION_H
+#endif // LLVM_TEXTAPI_PACKEDVERSION_H
diff --git a/llvm/include/llvm/TextAPI/Platform.h b/llvm/include/llvm/TextAPI/Platform.h
index 3f052b7b8624..f7affc3ae980 100644
--- a/llvm/include/llvm/TextAPI/Platform.h
+++ b/llvm/include/llvm/TextAPI/Platform.h
@@ -9,8 +9,8 @@
// Defines the Platforms supported by Tapi and helpers.
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TEXTAPI_MACHO_PLATFORM_H
-#define LLVM_TEXTAPI_MACHO_PLATFORM_H
+#ifndef LLVM_TEXTAPI_PLATFORM_H
+#define LLVM_TEXTAPI_PLATFORM_H
#include "llvm/ADT/SmallSet.h"
#include "llvm/BinaryFormat/MachO.h"
@@ -46,4 +46,4 @@ std::string getOSAndEnvironmentName(PlatformKind Platform,
} // end namespace MachO.
} // end namespace llvm.
-#endif // LLVM_TEXTAPI_MACHO_PLATFORM_H
+#endif // LLVM_TEXTAPI_PLATFORM_H
diff --git a/llvm/include/llvm/TextAPI/Symbol.h b/llvm/include/llvm/TextAPI/Symbol.h
index 02f184d2502f..dfc84908bba2 100644
--- a/llvm/include/llvm/TextAPI/Symbol.h
+++ b/llvm/include/llvm/TextAPI/Symbol.h
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TEXTAPI_MACHO_SYMBOL_H
-#define LLVM_TEXTAPI_MACHO_SYMBOL_H
+#ifndef LLVM_TEXTAPI_SYMBOL_H
+#define LLVM_TEXTAPI_SYMBOL_H
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/StringRef.h"
@@ -132,4 +132,4 @@ private:
} // end namespace MachO.
} // end namespace llvm.
-#endif // LLVM_TEXTAPI_MACHO_SYMBOL_H
+#endif // LLVM_TEXTAPI_SYMBOL_H
diff --git a/llvm/include/llvm/TextAPI/Target.h b/llvm/include/llvm/TextAPI/Target.h
index 53f56a6ee7b0..c2588b9d5a21 100644
--- a/llvm/include/llvm/TextAPI/Target.h
+++ b/llvm/include/llvm/TextAPI/Target.h
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TEXTAPI_MACHO_TARGET_H
-#define LLVM_TEXTAPI_MACHO_TARGET_H
+#ifndef LLVM_TEXTAPI_TARGET_H
+#define LLVM_TEXTAPI_TARGET_H
#include "llvm/ADT/Triple.h"
#include "llvm/Support/Error.h"
@@ -67,4 +67,4 @@ raw_ostream &operator<<(raw_ostream &OS, const Target &Target);
} // namespace MachO
} // namespace llvm
-#endif // LLVM_TEXTAPI_MACHO_TARGET_H
+#endif // LLVM_TEXTAPI_TARGET_H
diff --git a/llvm/include/llvm/TextAPI/TextAPIReader.h b/llvm/include/llvm/TextAPI/TextAPIReader.h
index a403bab8465d..389335312a74 100644
--- a/llvm/include/llvm/TextAPI/TextAPIReader.h
+++ b/llvm/include/llvm/TextAPI/TextAPIReader.h
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TEXTAPI_MACHO_TEXTAPIREADER_H
-#define LLVM_TEXTAPI_MACHO_TEXTAPIREADER_H
+#ifndef LLVM_TEXTAPI_TEXTAPIREADER_H
+#define LLVM_TEXTAPI_TEXTAPIREADER_H
#include "llvm/Support/Error.h"
@@ -30,4 +30,4 @@ public:
} // end namespace MachO.
} // end namespace llvm.
-#endif // LLVM_TEXTAPI_MACHO_TEXTAPIREADER_H
+#endif // LLVM_TEXTAPI_TEXTAPIREADER_H
diff --git a/llvm/include/llvm/TextAPI/TextAPIWriter.h b/llvm/include/llvm/TextAPI/TextAPIWriter.h
index 763805168ae6..f9857a806f60 100644
--- a/llvm/include/llvm/TextAPI/TextAPIWriter.h
+++ b/llvm/include/llvm/TextAPI/TextAPIWriter.h
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TEXTAPI_MACHO_TEXTAPIWRITER_H
-#define LLVM_TEXTAPI_MACHO_TEXTAPIWRITER_H
+#ifndef LLVM_TEXTAPI_TEXTAPIWRITER_H
+#define LLVM_TEXTAPI_TEXTAPIWRITER_H
namespace llvm {
@@ -28,4 +28,4 @@ public:
} // end namespace MachO.
} // end namespace llvm.
-#endif // LLVM_TEXTAPI_MACHO_TEXTAPIWRITER_H
+#endif // LLVM_TEXTAPI_TEXTAPIWRITER_H
diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index c93b8adcc890..d4cbc9bd20b7 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -101,6 +101,7 @@
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/iterator.h"
@@ -591,7 +592,7 @@ struct IRPosition {
LLVMContext &Ctx = getAnchorValue().getContext();
for (Attribute::AttrKind AK : AKs)
- AttrList = AttrList.removeAttribute(Ctx, getAttrIdx(), AK);
+ AttrList = AttrList.removeAttributeAtIndex(Ctx, getAttrIdx(), AK);
if (CB)
CB->setAttributes(AttrList);
@@ -1150,8 +1151,6 @@ struct Attributor {
/// \param Allowed If not null, a set limiting the attribute opportunities.
/// \param DeleteFns Whether to delete functions.
/// \param RewriteSignatures Whether to rewrite function signatures.
- /// \param MaxFixedPointIterations Maximum number of iterations to run until
- /// fixpoint.
Attributor(SetVector<Function *> &Functions, InformationCache &InfoCache,
CallGraphUpdater &CGUpdater,
DenseSet<const char *> *Allowed = nullptr, bool DeleteFns = true,
@@ -1169,8 +1168,9 @@ struct Attributor {
/// \param CGUpdater Helper to update an underlying call graph.
/// \param Allowed If not null, a set limiting the attribute opportunities.
/// \param DeleteFns Whether to delete functions
- /// \param MaxFixedPointIterations Maximum number of iterations to run until
- /// fixpoint.
+ /// \param RewriteSignatures Whether to rewrite function signatures.
+ /// \param MaxFixpointIterations Maximum number of iterations to run until
+ /// fixpoint.
/// \param OREGetter A callback function that returns an ORE object from a
/// Function pointer.
/// \param PassName The name of the pass emitting remarks.
@@ -1855,6 +1855,10 @@ public:
///
static void createShallowWrapper(Function &F);
+  /// Returns true if the function \p F can be internalized, i.e. it has a
+ /// compatible linkage.
+ static bool isInternalizable(Function &F);
+
/// Make another copy of the function \p F such that the copied version has
/// internal linkage afterwards and can be analysed. Then we replace all uses
/// of the original function to the copied one
@@ -1870,6 +1874,22 @@ public:
/// null pointer.
static Function *internalizeFunction(Function &F, bool Force = false);
+ /// Make copies of each function in the set \p FnSet such that the copied
+ /// version has internal linkage afterwards and can be analysed. Then we
+ /// replace all uses of the original function to the copied one. The map
+ /// \p FnMap contains a mapping of functions to their internalized versions.
+ ///
+ /// Only non-locally linked functions that have `linkonce_odr` or `weak_odr`
+ /// linkage can be internalized because these linkages guarantee that other
+ /// definitions with the same name have the same semantics as this one.
+ ///
+ /// This version will internalize all the functions in the set \p FnSet at
+  /// once and then replace the uses. This prevents internalized functions from
+  /// being called by external functions when there is an internalized version
+  /// in the module.
+ static bool internalizeFunctions(SmallPtrSetImpl<Function *> &FnSet,
+ DenseMap<Function *, Function *> &FnMap);
+
/// Return the data layout associated with the anchor scope.
const DataLayout &getDataLayout() const { return InfoCache.DL; }
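For illustration, a minimal sketch of how a caller might drive the new batch internalization entry point declared above; the helper name, the candidate filter, and the follow-up comment are assumptions, not something the header prescribes.

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO/Attributor.h"

using namespace llvm;

// Collect every definition with an internalizable linkage and internalize the
// whole batch at once, so calls between the copies stay inside the module.
static void internalizeCandidates(Module &M) {
  SmallPtrSet<Function *, 16> FnSet;
  for (Function &F : M)
    if (!F.isDeclaration() && Attributor::isInternalizable(F))
      FnSet.insert(&F);

  DenseMap<Function *, Function *> InternalizedMap;
  if (Attributor::internalizeFunctions(FnSet, InternalizedMap)) {
    // InternalizedMap now maps each original function to its internal copy;
    // subsequent analysis would be seeded with the copies instead.
  }
}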
@@ -2492,6 +2512,139 @@ struct IntegerRangeState : public AbstractState {
return *this;
}
};
+
+/// Simple state for a set.
+///
+/// This represents a state containing a set of values. The interface supports
+/// modelling sets that contain all possible elements. The state's internal
+/// value is modified using union or intersection operations.
+template <typename BaseTy> struct SetState : public AbstractState {
+ /// A wrapper around a set that has semantics for handling unions and
+ /// intersections with a "universal" set that contains all elements.
+ struct SetContents {
+    /// Creates either a universal set (implicitly containing all elements) or
+    /// an empty set, depending on \p Universal.
+ SetContents(bool Universal) : Universal(Universal) {}
+
+ /// Creates a non-universal set with concrete values.
+ SetContents(const DenseSet<BaseTy> &Assumptions)
+ : Universal(false), Set(Assumptions) {}
+
+ SetContents(bool Universal, const DenseSet<BaseTy> &Assumptions)
+ : Universal(Universal), Set(Assumptions) {}
+
+ const DenseSet<BaseTy> &getSet() const { return Set; }
+
+ bool isUniversal() const { return Universal; }
+
+ bool empty() const { return Set.empty() && !Universal; }
+
+ /// Finds A := A ^ B where A or B could be the "Universal" set which
+ /// contains every possible attribute. Returns true if changes were made.
+ bool getIntersection(const SetContents &RHS) {
+ bool IsUniversal = Universal;
+ unsigned Size = Set.size();
+
+ // A := A ^ U = A
+ if (RHS.isUniversal())
+ return false;
+
+ // A := U ^ B = B
+ if (Universal)
+ Set = RHS.getSet();
+ else
+ set_intersect(Set, RHS.getSet());
+
+ Universal &= RHS.isUniversal();
+ return IsUniversal != Universal || Size != Set.size();
+ }
+
+ /// Finds A := A u B where A or B could be the "Universal" set which
+    /// contains every possible attribute. Returns true if changes were made.
+ bool getUnion(const SetContents &RHS) {
+ bool IsUniversal = Universal;
+ unsigned Size = Set.size();
+
+ // A := A u U = U = U u B
+ if (!RHS.isUniversal() && !Universal)
+ set_union(Set, RHS.getSet());
+
+ Universal |= RHS.isUniversal();
+ return IsUniversal != Universal || Size != Set.size();
+ }
+
+ private:
+ /// Indicates if this set is "universal", containing every possible element.
+ bool Universal;
+
+ /// The set of currently active assumptions.
+ DenseSet<BaseTy> Set;
+ };
+
+ SetState() : Known(false), Assumed(true), IsAtFixedpoint(false) {}
+
+ /// Initializes the known state with an initial set and initializes the
+ /// assumed state as universal.
+ SetState(const DenseSet<BaseTy> &Known)
+ : Known(Known), Assumed(true), IsAtFixedpoint(false) {}
+
+ /// See AbstractState::isValidState()
+ bool isValidState() const override { return !Assumed.empty(); }
+
+ /// See AbstractState::isAtFixpoint()
+ bool isAtFixpoint() const override { return IsAtFixedpoint; }
+
+ /// See AbstractState::indicateOptimisticFixpoint(...)
+ ChangeStatus indicateOptimisticFixpoint() override {
+ IsAtFixedpoint = true;
+ Known = Assumed;
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractState::indicatePessimisticFixpoint(...)
+ ChangeStatus indicatePessimisticFixpoint() override {
+ IsAtFixedpoint = true;
+ Assumed = Known;
+ return ChangeStatus::CHANGED;
+ }
+
+ /// Return the known state encoding.
+ const SetContents &getKnown() const { return Known; }
+
+ /// Return the assumed state encoding.
+ const SetContents &getAssumed() const { return Assumed; }
+
+ /// Returns if the set state contains the element.
+ bool setContains(const BaseTy &Elem) const {
+ return Assumed.getSet().contains(Elem) || Known.getSet().contains(Elem);
+ }
+
+ /// Performs the set intersection between this set and \p RHS. Returns true if
+ /// changes were made.
+ bool getIntersection(const SetContents &RHS) {
+ unsigned SizeBefore = Assumed.getSet().size();
+
+ // Get intersection and make sure that the known set is still a proper
+ // subset of the assumed set. A := K u (A ^ R).
+ Assumed.getIntersection(RHS);
+ Assumed.getUnion(Known);
+
+ return SizeBefore != Assumed.getSet().size();
+ }
+
+ /// Performs the set union between this set and \p RHS. Returns true if
+ /// changes were made.
+ bool getUnion(const SetContents &RHS) { return Assumed.getUnion(RHS); }
+
+private:
+ /// The set of values known for this state.
+ SetContents Known;
+
+ /// The set of assumed values for this state.
+ SetContents Assumed;
+
+ bool IsAtFixedpoint;
+};
+
/// Helper struct necessary as the modular build fails if the virtual method
/// IRAttribute::manifest is defined in the Attributor.cpp.
struct IRAttributeManifest {
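A short sketch of the known/assumed semantics of the SetState added above, using StringRef elements as the new assumption tracking does; the function name and the concrete strings are illustrative assumptions.

#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Transforms/IPO/Attributor.h"

using namespace llvm;

static void setStateExample() {
  // The known side starts from a concrete set; the assumed side starts as the
  // universal set (it may still contain anything).
  SetState<StringRef> S(DenseSet<StringRef>({StringRef("no-openmp")}));

  // Intersecting the assumed side with a concrete set drops universality:
  // Assumed := Known u (Assumed ^ RHS).
  DenseSet<StringRef> RHS({StringRef("no-openmp"), StringRef("no-exceptions")});
  S.getIntersection(SetState<StringRef>::SetContents(RHS));

  // Membership checks consult both the known and the assumed encoding.
  bool HasIt = S.setContains(StringRef("no-openmp")); // true
  (void)HasIt;
}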
@@ -3394,7 +3547,7 @@ struct AADereferenceable
};
using AAAlignmentStateType =
- IncIntegerState<uint32_t, Value::MaximumAlignment, 1>;
+ IncIntegerState<uint64_t, Value::MaximumAlignment, 1>;
/// An abstract interface for all align attributes.
struct AAAlign : public IRAttribute<
Attribute::Alignment,
@@ -3402,10 +3555,10 @@ struct AAAlign : public IRAttribute<
AAAlign(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}
/// Return assumed alignment.
- unsigned getAssumedAlign() const { return getAssumed(); }
+ uint64_t getAssumedAlign() const { return getAssumed(); }
/// Return known alignment.
- unsigned getKnownAlign() const { return getKnown(); }
+ uint64_t getKnownAlign() const { return getKnown(); }
/// See AbstractAttribute::getName()
const std::string getName() const override { return "AAAlign"; }
@@ -3776,7 +3929,7 @@ struct AAMemoryLocation
  /// Return true if we assume that the associated function has no observable
/// accesses.
bool isAssumedReadNone() const {
- return isAssumed(NO_LOCATIONS) | isAssumedStackOnly();
+ return isAssumed(NO_LOCATIONS) || isAssumedStackOnly();
}
  /// Return true if we know that the associated function has at most
@@ -3920,19 +4073,19 @@ struct AAValueConstantRange
static AAValueConstantRange &createForPosition(const IRPosition &IRP,
Attributor &A);
- /// Return an assumed range for the assocaited value a program point \p CtxI.
+  /// Return an assumed range for the associated value at a program point \p CtxI.
/// If \p I is nullptr, simply return an assumed range.
virtual ConstantRange
getAssumedConstantRange(Attributor &A,
const Instruction *CtxI = nullptr) const = 0;
- /// Return a known range for the assocaited value at a program point \p CtxI.
+ /// Return a known range for the associated value at a program point \p CtxI.
/// If \p I is nullptr, simply return a known range.
virtual ConstantRange
getKnownConstantRange(Attributor &A,
const Instruction *CtxI = nullptr) const = 0;
- /// Return an assumed constant for the assocaited value a program point \p
+  /// Return an assumed constant for the associated value at a program point \p
/// CtxI.
Optional<ConstantInt *>
getAssumedConstantInt(Attributor &A,
@@ -4435,6 +4588,9 @@ struct AAFunctionReachability
/// If the function represented by this possition can reach \p Fn.
virtual bool canReach(Attributor &A, Function *Fn) const = 0;
+ /// Can \p CB reach \p Fn
+ virtual bool canReach(Attributor &A, CallBase &CB, Function *Fn) const = 0;
+
/// Create an abstract attribute view for the position \p IRP.
static AAFunctionReachability &createForPosition(const IRPosition &IRP,
Attributor &A);
@@ -4587,6 +4743,40 @@ struct AAPointerInfo : public AbstractAttribute {
static const char ID;
};
+/// An abstract attribute for getting assumption information.
+struct AAAssumptionInfo
+ : public StateWrapper<SetState<StringRef>, AbstractAttribute,
+ DenseSet<StringRef>> {
+ using Base =
+ StateWrapper<SetState<StringRef>, AbstractAttribute, DenseSet<StringRef>>;
+
+ AAAssumptionInfo(const IRPosition &IRP, Attributor &A,
+ const DenseSet<StringRef> &Known)
+ : Base(IRP, Known) {}
+
+ /// Returns true if the assumption set contains the assumption \p Assumption.
+ virtual bool hasAssumption(const StringRef Assumption) const = 0;
+
+ /// Create an abstract attribute view for the position \p IRP.
+ static AAAssumptionInfo &createForPosition(const IRPosition &IRP,
+ Attributor &A);
+
+ /// See AbstractAttribute::getName()
+ const std::string getName() const override { return "AAAssumptionInfo"; }
+
+ /// See AbstractAttribute::getIdAddr()
+ const char *getIdAddr() const override { return &ID; }
+
+ /// This function should return true if the type of the \p AA is
+ /// AAAssumptionInfo
+ static bool classof(const AbstractAttribute *AA) {
+ return (AA->getIdAddr() == &ID);
+ }
+
+ /// Unique ID (due to the unique address)
+ static const char ID;
+};
+
raw_ostream &operator<<(raw_ostream &, const AAPointerInfo::Access &);
/// Run options, used by the pass manager.
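A hedged sketch of consuming the new AAAssumptionInfo interface once an instance has been obtained from an Attributor run; how the attribute is created and seeded is elided, and the helper name is an assumption.

#include "llvm/ADT/StringRef.h"
#include "llvm/Transforms/IPO/Attributor.h"

using namespace llvm;

// Returns true if the queried position is assumed (or known) to carry the
// named assumption string.
static bool positionAssumes(const AAAssumptionInfo &AA, StringRef Assumption) {
  return AA.hasAssumption(Assumption);
}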
diff --git a/llvm/include/llvm/Transforms/IPO/FunctionAttrs.h b/llvm/include/llvm/Transforms/IPO/FunctionAttrs.h
index ce61eea05c79..0b6734a3929d 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionAttrs.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionAttrs.h
@@ -17,6 +17,7 @@
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LazyCallGraph.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/IR/PassManager.h"
namespace llvm {
@@ -38,6 +39,13 @@ enum MemoryAccessKind {
/// Returns the memory access properties of this copy of the function.
MemoryAccessKind computeFunctionBodyMemoryAccess(Function &F, AAResults &AAR);
+/// Propagate function attributes for function summaries along the index's
+/// callgraph during the thin link.
+bool thinLTOPropagateFunctionAttrs(
+ ModuleSummaryIndex &Index,
+ function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
+ isPrevailing);
+
/// Computes function attributes in post-order over the call graph.
///
/// By operating in post-order, this pass computes precise attributes for
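A minimal sketch of invoking the new thin-link propagation declared above; the wrapper function and the always-prevailing callback are assumptions made for illustration (real drivers consult their symbol resolution), and the bool result is forwarded as declared.

#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Transforms/IPO/FunctionAttrs.h"

using namespace llvm;

static bool propagateAttrsAtThinLink(ModuleSummaryIndex &Index) {
  // Assuming everything prevails keeps the sketch self-contained.
  auto IsPrevailing = [](GlobalValue::GUID /*G*/,
                         const GlobalValueSummary * /*S*/) { return true; };
  return thinLTOPropagateFunctionAttrs(Index, IsPrevailing);
}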
diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
index aad938d48570..c5bafb89fcb5 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
@@ -167,16 +167,24 @@ void ComputeCrossModuleImportForModuleFromIndex(
FunctionImporter::ImportMapTy &ImportList);
/// PrevailingType enum used as a return type of callback passed
-/// to computeDeadSymbols. Yes and No values used when status explicitly
-/// set by symbols resolution, otherwise status is Unknown.
+/// to computeDeadSymbolsAndUpdateIndirectCalls. Yes and No values used when
+/// status explicitly set by symbols resolution, otherwise status is Unknown.
enum class PrevailingType { Yes, No, Unknown };
+/// Update call edges for indirect calls to local functions added from
+/// SamplePGO when needed. Normally this is done during
+/// computeDeadSymbolsAndUpdateIndirectCalls, but can be called standalone
+/// when that is not called (e.g. during testing).
+void updateIndirectCalls(ModuleSummaryIndex &Index);
+
/// Compute all the symbols that are "dead": i.e these that can't be reached
/// in the graph from any of the given symbols listed in
/// \p GUIDPreservedSymbols. Non-prevailing symbols are symbols without a
/// prevailing copy anywhere in IR and are normally dead, \p isPrevailing
/// predicate returns status of symbol.
-void computeDeadSymbols(
+/// Also update call edges for indirect calls to local functions added from
+/// SamplePGO when needed.
+void computeDeadSymbolsAndUpdateIndirectCalls(
ModuleSummaryIndex &Index,
const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing);
@@ -214,12 +222,15 @@ std::error_code EmitImportsFiles(
StringRef ModulePath, StringRef OutputFilename,
const std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex);
-/// Resolve prevailing symbol linkages and constrain visibility (1. CanAutoHide,
-/// 2. consider visibility from other definitions for ELF) in \p TheModule based
-/// on the information recorded in the summaries during global summary-based
-/// analysis.
-void thinLTOResolvePrevailingInModule(Module &TheModule,
- const GVSummaryMapTy &DefinedGlobals);
+/// Based on the information recorded in the summaries during global
+/// summary-based analysis:
+/// 1. Resolve prevailing symbol linkages and constrain visibility (CanAutoHide
+/// and consider visibility from other definitions for ELF) in \p TheModule
+/// 2. (optional) Apply propagated function attributes to \p TheModule if
+/// PropagateAttrs is true
+void thinLTOFinalizeInModule(Module &TheModule,
+ const GVSummaryMapTy &DefinedGlobals,
+ bool PropagateAttrs);
/// Internalize \p TheModule based on the information recorded in the summaries
/// during global summary-based analysis.
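A sketch of how a ThinLTO driver might call the renamed entry points above; the function, its parameters, and the Unknown-returning callback are illustrative assumptions.

#include "llvm/ADT/DenseSet.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Transforms/IPO/FunctionImport.h"

using namespace llvm;

static void finalizeThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
                                  const DenseSet<GlobalValue::GUID> &Preserved,
                                  const GVSummaryMapTy &DefinedGlobals) {
  // Dead-symbol computation now also fixes up indirect-call edges added from
  // SamplePGO; standalone callers can use updateIndirectCalls() instead.
  computeDeadSymbolsAndUpdateIndirectCalls(
      Index, Preserved,
      [](GlobalValue::GUID) { return PrevailingType::Unknown; });

  // PropagateAttrs applies the attributes propagated during the thin link.
  thinLTOFinalizeInModule(TheModule, DefinedGlobals, /*PropagateAttrs=*/true);
}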
diff --git a/llvm/include/llvm/Transforms/IPO/IROutliner.h b/llvm/include/llvm/Transforms/IPO/IROutliner.h
index 442a8ec1d2e2..110c0b4dcf16 100644
--- a/llvm/include/llvm/Transforms/IPO/IROutliner.h
+++ b/llvm/include/llvm/Transforms/IPO/IROutliner.h
@@ -86,6 +86,15 @@ struct OutlinableRegion {
DenseMap<unsigned, unsigned> ExtractedArgToAgg;
DenseMap<unsigned, unsigned> AggArgToExtracted;
+ /// Marks whether we need to change the order of the arguments when mapping
+ /// the old extracted function call to the new aggregate outlined function
+ /// call.
+ bool ChangedArgOrder = false;
+
+  /// Marks whether this region ends in a branch; in that case, special
+  /// handling is required for the following basic blocks.
+ bool EndsInBranch = false;
+
/// Mapping of the argument number in the deduplicated function
/// to a given constant, which is used when creating the arguments to the call
/// to the newly created deduplicated function. This is handled separately
@@ -147,6 +156,14 @@ struct OutlinableRegion {
/// containing the called function.
void reattachCandidate();
+ /// Find a corresponding value for \p V in similar OutlinableRegion \p Other.
+ ///
+ /// \param Other [in] - The OutlinableRegion to find the corresponding Value
+ /// in.
+ /// \param V [in] - The Value to look for in the other region.
+ /// \return The corresponding Value to \p V if it exists, otherwise nullptr.
+ Value *findCorrespondingValueIn(const OutlinableRegion &Other, Value *V);
+
/// Get the size of the code removed from the region.
///
/// \param [in] TTI - The TargetTransformInfo for the parent function.
@@ -176,6 +193,16 @@ private:
/// \returns The number of Functions created.
unsigned doOutline(Module &M);
+ /// Check whether an OutlinableRegion is incompatible with code already
+  /// outlined. OutlinableRegions are incompatible when there are overlapping
+  /// instructions, or when code that has not been recorded has been added to
+  /// the instructions.
+ ///
+ /// \param [in] Region - The OutlinableRegion to check for conflicts with
+ /// already outlined code.
+ /// \returns whether the region can safely be outlined.
+ bool isCompatibleWithAlreadyOutlinedCode(const OutlinableRegion &Region);
+
/// Remove all the IRSimilarityCandidates from \p CandidateVec that have
/// instructions contained in a previously outlined region and put the
/// remaining regions in \p CurrentGroup.
@@ -301,8 +328,9 @@ private:
struct InstructionAllowed : public InstVisitor<InstructionAllowed, bool> {
InstructionAllowed() {}
- // TODO: Determine a scheme to resolve when the label is similar enough.
- bool visitBranchInst(BranchInst &BI) { return false; }
+ bool visitBranchInst(BranchInst &BI) {
+ return EnableBranches;
+ }
// TODO: Determine a scheme to resolve when the labels are similar enough.
bool visitPHINode(PHINode &PN) { return false; }
// TODO: Handle allocas.
@@ -341,6 +369,10 @@ private:
// TODO: Handle interblock similarity.
bool visitTerminator(Instruction &I) { return false; }
bool visitInstruction(Instruction &I) { return true; }
+
+ // The flag variable that marks whether we should allow branch instructions
+ // to be outlined.
+ bool EnableBranches = false;
};
/// An InstVisitor used to exclude certain instructions from being outlined.
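A small sketch of the new correspondence helper on OutlinableRegion; the wrapper and its name are assumptions, and both regions are presumed to come from the same similarity group.

#include "llvm/Transforms/IPO/IROutliner.h"

using namespace llvm;

// Map a value from region A into the similar region B; returns nullptr when
// no corresponding value exists.
static Value *mapAcrossRegions(OutlinableRegion &A, const OutlinableRegion &B,
                               Value *VInA) {
  return A.findCorrespondingValueIn(B, VInA);
}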
diff --git a/llvm/include/llvm/Transforms/IPO/Inliner.h b/llvm/include/llvm/Transforms/IPO/Inliner.h
index 23a39d7f2e2b..a7060943c4c0 100644
--- a/llvm/include/llvm/Transforms/IPO/Inliner.h
+++ b/llvm/include/llvm/Transforms/IPO/Inliner.h
@@ -14,7 +14,6 @@
#include "llvm/Analysis/InlineAdvisor.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LazyCallGraph.h"
-#include "llvm/Analysis/ReplayInlineAdvisor.h"
#include "llvm/Analysis/Utils/ImportedFunctionsInliningStatistics.h"
#include "llvm/IR/PassManager.h"
#include <utility>
@@ -103,6 +102,9 @@ public:
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM,
LazyCallGraph &CG, CGSCCUpdateResult &UR);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
+
private:
InlineAdvisor &getAdvisor(const ModuleAnalysisManagerCGSCCProxy::Result &MAM,
FunctionAnalysisManager &FAM, Module &M);
@@ -130,17 +132,27 @@ public:
/// before run is called, as part of pass pipeline building.
CGSCCPassManager &getPM() { return PM; }
- /// Allow adding module-level passes benefiting the contained CGSCC passes.
+ /// Add a module pass that runs before the CGSCC passes.
template <class T> void addModulePass(T Pass) {
MPM.addPass(std::move(Pass));
}
+ /// Add a module pass that runs after the CGSCC passes.
+ template <class T> void addLateModulePass(T Pass) {
+ AfterCGMPM.addPass(std::move(Pass));
+ }
+
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
+
private:
const InlineParams Params;
const InliningAdvisorMode Mode;
const unsigned MaxDevirtIterations;
+ // TODO: Clean this up so we only have one ModulePassManager.
CGSCCPassManager PM;
ModulePassManager MPM;
+ ModulePassManager AfterCGMPM;
};
} // end namespace llvm
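A sketch of how a pipeline might use the new late-module hook; the helper and the choice of GlobalDCE as the late pass are assumptions, not something this header mandates.

#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Inliner.h"

using namespace llvm;

static ModuleInlinerWrapperPass buildInlinerPipeline(InlineParams Params) {
  ModuleInlinerWrapperPass MIWP(Params);
  // CGSCC passes run interleaved with the inliner on each SCC.
  MIWP.getPM().addPass(InlinerPass());
  // Passes added here now run once, after the whole CGSCC walk finishes.
  MIWP.addLateModulePass(GlobalDCEPass());
  return MIWP;
}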
diff --git a/llvm/include/llvm/Transforms/IPO/LoopExtractor.h b/llvm/include/llvm/Transforms/IPO/LoopExtractor.h
index def3c5943919..aa697484d0e9 100644
--- a/llvm/include/llvm/Transforms/IPO/LoopExtractor.h
+++ b/llvm/include/llvm/Transforms/IPO/LoopExtractor.h
@@ -23,6 +23,8 @@ namespace llvm {
struct LoopExtractorPass : public PassInfoMixin<LoopExtractorPass> {
LoopExtractorPass(unsigned NumLoops = ~0) : NumLoops(NumLoops) {}
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
private:
unsigned NumLoops;
diff --git a/llvm/include/llvm/Transforms/IPO/ModuleInliner.h b/llvm/include/llvm/Transforms/IPO/ModuleInliner.h
new file mode 100644
index 000000000000..963d74d71003
--- /dev/null
+++ b/llvm/include/llvm/Transforms/IPO/ModuleInliner.h
@@ -0,0 +1,51 @@
+//===- ModuleInliner.h - Module level Inliner pass --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_IPO_MODULEINLINER_H
+#define LLVM_TRANSFORMS_IPO_MODULEINLINER_H
+
+#include "llvm/Analysis/InlineAdvisor.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/ReplayInlineAdvisor.h"
+#include "llvm/Analysis/Utils/ImportedFunctionsInliningStatistics.h"
+#include "llvm/IR/PassManager.h"
+#include <utility>
+
+namespace llvm {
+
+class AssumptionCacheTracker;
+class ProfileSummaryInfo;
+
+/// The module inliner pass for the new pass manager.
+///
+/// This pass wires together the inlining utilities and the inline cost
+/// analysis into a module pass. Unlike the SCC inliner, it considers every
+/// call in every function in the whole module and tries to inline if
+/// profitable. With this module-level inliner, it is possible to evaluate more
+/// heuristics at the module level, such as PriorityInlineOrder. It can be
+/// tuned with a number of parameters to control what cost model is used and
+/// what tradeoffs are made when making the decision.
+class ModuleInlinerPass : public PassInfoMixin<ModuleInlinerPass> {
+public:
+ ModuleInlinerPass(InlineParams Params = getInlineParams(),
+ InliningAdvisorMode Mode = InliningAdvisorMode::Default)
+ : Params(Params), Mode(Mode){};
+ ModuleInlinerPass(ModuleInlinerPass &&Arg) = default;
+
+ PreservedAnalyses run(Module &, ModuleAnalysisManager &);
+
+private:
+ InlineAdvisor &getAdvisor(const ModuleAnalysisManager &MAM,
+ FunctionAnalysisManager &FAM, Module &M);
+ std::unique_ptr<InlineAdvisor> OwnedAdvisor;
+ const InlineParams Params;
+ const InliningAdvisorMode Mode;
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_IPO_MODULEINLINER_H
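A minimal sketch of scheduling the new pass on a module pipeline; the helper and the parameter choices are assumptions for illustration.

#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/IPO/ModuleInliner.h"

using namespace llvm;

static void addModuleInliner(ModulePassManager &MPM) {
  // Unlike InlinerPass, this runs directly at module scope, so no CGSCC
  // adaptor is needed.
  MPM.addPass(ModuleInlinerPass(getInlineParams(/*OptLevel=*/2,
                                                /*SizeOptLevel=*/0)));
}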
diff --git a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h
index 4f941d26df4c..7f321a688aff 100644
--- a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h
+++ b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h
@@ -154,7 +154,6 @@ public:
/// tests.
const ModuleSummaryIndex *ImportSummary = nullptr;
- bool DisableTailCalls;
bool DisableUnrollLoops;
bool CallGraphProfile;
bool SLPVectorize;
diff --git a/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h b/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h
index 0adaa1b16d54..6e45f8f6fb05 100644
--- a/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h
+++ b/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TOOLS_LLVM_PROFGEN_PROFILEDCALLGRAPH_H
-#define LLVM_TOOLS_LLVM_PROFGEN_PROFILEDCALLGRAPH_H
+#ifndef LLVM_TRANSFORMS_IPO_PROFILEDCALLGRAPH_H
+#define LLVM_TRANSFORMS_IPO_PROFILEDCALLGRAPH_H
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/StringMap.h"
@@ -42,7 +42,7 @@ public:
using iterator = std::set<ProfiledCallGraphNode *>::iterator;
// Constructor for non-CS profile.
- ProfiledCallGraph(StringMap<FunctionSamples> &ProfileMap) {
+ ProfiledCallGraph(SampleProfileMap &ProfileMap) {
assert(!FunctionSamples::ProfileIsCS && "CS profile is not handled here");
for (const auto &Samples : ProfileMap) {
addProfiledCalls(Samples.second);
@@ -56,7 +56,7 @@ public:
std::queue<ContextTrieNode *> Queue;
for (auto &Child : ContextTracker.getRootContext().getAllChildContext()) {
ContextTrieNode *Callee = &Child.second;
- addProfiledFunction(Callee->getFuncName());
+ addProfiledFunction(ContextTracker.getFuncNameFor(Callee));
Queue.push(Callee);
}
@@ -72,9 +72,10 @@ public:
// context-based one, which may in turn block context-based inlining.
for (auto &Child : Caller->getAllChildContext()) {
ContextTrieNode *Callee = &Child.second;
- addProfiledFunction(Callee->getFuncName());
+ addProfiledFunction(ContextTracker.getFuncNameFor(Callee));
Queue.push(Callee);
- addProfiledCall(Caller->getFuncName(), Callee->getFuncName());
+ addProfiledCall(ContextTracker.getFuncNameFor(Caller),
+ ContextTracker.getFuncNameFor(Callee));
}
}
}
diff --git a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
index 94f7796298db..5d80da407d7e 100644
--- a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
+++ b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
@@ -42,31 +42,34 @@ public:
: ParentContext(Parent), FuncName(FName), FuncSamples(FSamples),
CallSiteLoc(CallLoc){};
ContextTrieNode *getChildContext(const LineLocation &CallSite,
- StringRef CalleeName);
+ StringRef ChildName);
ContextTrieNode *getHottestChildContext(const LineLocation &CallSite);
ContextTrieNode *getOrCreateChildContext(const LineLocation &CallSite,
- StringRef CalleeName,
+ StringRef ChildName,
bool AllowCreate = true);
ContextTrieNode &moveToChildContext(const LineLocation &CallSite,
ContextTrieNode &&NodeToMove,
- StringRef ContextStrToRemove,
+ uint32_t ContextFramesToRemove,
bool DeleteNode = true);
- void removeChildContext(const LineLocation &CallSite, StringRef CalleeName);
- std::map<uint32_t, ContextTrieNode> &getAllChildContext();
+ void removeChildContext(const LineLocation &CallSite, StringRef ChildName);
+ std::map<uint64_t, ContextTrieNode> &getAllChildContext();
StringRef getFuncName() const;
FunctionSamples *getFunctionSamples() const;
void setFunctionSamples(FunctionSamples *FSamples);
+ Optional<uint32_t> getFunctionSize() const;
+ void addFunctionSize(uint32_t FSize);
LineLocation getCallSiteLoc() const;
ContextTrieNode *getParentContext() const;
void setParentContext(ContextTrieNode *Parent);
- void dump();
+ void dumpNode();
+ void dumpTree();
private:
- static uint32_t nodeHash(StringRef ChildName, const LineLocation &Callsite);
+ static uint64_t nodeHash(StringRef ChildName, const LineLocation &Callsite);
// Map line+discriminator location to child context
- std::map<uint32_t, ContextTrieNode> AllChildContext;
+ std::map<uint64_t, ContextTrieNode> AllChildContext;
// Link to parent context node
ContextTrieNode *ParentContext;
@@ -77,6 +80,9 @@ private:
// Function Samples for current context
FunctionSamples *FuncSamples;
+ // Function size for current context
+ Optional<uint32_t> FuncSize;
+
// Callsite location in parent context
LineLocation CallSiteLoc;
};
@@ -90,9 +96,22 @@ private:
// calling context and the context is identified by path from root to the node.
class SampleContextTracker {
public:
- using ContextSamplesTy = SmallVector<FunctionSamples *, 16>;
-
- SampleContextTracker(StringMap<FunctionSamples> &Profiles);
+ struct ProfileComparer {
+ bool operator()(FunctionSamples *A, FunctionSamples *B) const {
+ // Sort function profiles by the number of total samples and their
+ // contexts.
+ if (A->getTotalSamples() == B->getTotalSamples())
+ return A->getContext() < B->getContext();
+ return A->getTotalSamples() > B->getTotalSamples();
+ }
+ };
+
+ // Keep profiles of a function sorted so that they will be processed/promoted
+ // deterministically.
+ using ContextSamplesTy = std::set<FunctionSamples *, ProfileComparer>;
+
+ SampleContextTracker(SampleProfileMap &Profiles,
+ const DenseMap<uint64_t, StringRef> *GUIDToFuncNameMap);
// Query context profile for a specific callee with given name at a given
// call-site. The full context is identified by location of call instruction.
FunctionSamples *getCalleeContextSamplesFor(const CallBase &Inst,
@@ -116,6 +135,8 @@ public:
FunctionSamples *getBaseSamplesFor(StringRef Name, bool MergeContext = true);
// Retrieve the context trie node for given profile context
ContextTrieNode *getContextFor(const SampleContext &Context);
+ // Get real function name for a given trie node.
+ StringRef getFuncNameFor(ContextTrieNode *Node) const;
// Mark a context profile as inlined when function is inlined.
// This makes sure that inlined context profile will be excluded in
// function's base profile.
@@ -136,14 +157,18 @@ private:
ContextTrieNode &addTopLevelContextNode(StringRef FName);
ContextTrieNode &promoteMergeContextSamplesTree(ContextTrieNode &NodeToPromo);
void mergeContextNode(ContextTrieNode &FromNode, ContextTrieNode &ToNode,
- StringRef ContextStrToRemove);
- ContextTrieNode &promoteMergeContextSamplesTree(ContextTrieNode &FromNode,
- ContextTrieNode &ToNodeParent,
- StringRef ContextStrToRemove);
+ uint32_t ContextFramesToRemove);
+ ContextTrieNode &
+ promoteMergeContextSamplesTree(ContextTrieNode &FromNode,
+ ContextTrieNode &ToNodeParent,
+ uint32_t ContextFramesToRemove);
// Map from function name to context profiles (excluding base profile)
StringMap<ContextSamplesTy> FuncToCtxtProfiles;
+ // Map from function guid to real function names. Only used in md5 mode.
+ const DenseMap<uint64_t, StringRef> *GUIDToFuncNameMap;
+
// Root node for context trie tree
ContextTrieNode RootContext;
};
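A brief sketch of the updated constructor; passing nullptr for the GUID-to-name map is the assumed non-md5 case, and the helper itself is illustrative.

#include "llvm/Transforms/IPO/SampleContextTracker.h"

using namespace llvm;
using namespace llvm::sampleprof;

static void trackContexts(SampleProfileMap &Profiles) {
  // The second argument is only consulted when profiles use md5 names.
  SampleContextTracker Tracker(Profiles, /*GUIDToFuncNameMap=*/nullptr);

  // Context profiles per function are now kept in a deterministic order
  // (most total samples first, ties broken by context) via ProfileComparer.
  (void)Tracker.getBaseSamplesFor("main");
}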
diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombine.h b/llvm/include/llvm/Transforms/InstCombine/InstCombine.h
index af6d2a18a25a..6dee38c83b36 100644
--- a/llvm/include/llvm/Transforms/InstCombine/InstCombine.h
+++ b/llvm/include/llvm/Transforms/InstCombine/InstCombine.h
@@ -18,12 +18,14 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+
+#define DEBUG_TYPE "instcombine"
+#include "llvm/Transforms/Utils/InstructionWorklist.h"
namespace llvm {
class InstCombinePass : public PassInfoMixin<InstCombinePass> {
- InstCombineWorklist Worklist;
+ InstructionWorklist Worklist;
const unsigned MaxIterations;
public:
@@ -38,7 +40,7 @@ public:
/// This is a basic whole-function wrapper around the instcombine utility. It
/// will try to combine all instructions in the function.
class InstructionCombiningPass : public FunctionPass {
- InstCombineWorklist Worklist;
+ InstructionWorklist Worklist;
const unsigned MaxIterations;
public:
@@ -67,4 +69,6 @@ FunctionPass *createInstructionCombiningPass();
FunctionPass *createInstructionCombiningPass(unsigned MaxIterations);
}
+#undef DEBUG_TYPE
+
#endif
diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
index ba0d41f9b748..c6aee439b5a0 100644
--- a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
+++ b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
@@ -25,10 +25,10 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include <cassert>
#define DEBUG_TYPE "instcombine"
+#include "llvm/Transforms/Utils/InstructionWorklist.h"
namespace llvm {
@@ -43,7 +43,9 @@ class TargetTransformInfo;
/// This class provides both the logic to recursively visit instructions and
/// combine them.
class LLVM_LIBRARY_VISIBILITY InstCombiner {
- /// Only used to call target specific inst combining.
+ /// Only used to call target specific intrinsic combining.
+ /// It must **NOT** be used for any other purpose, as InstCombine is a
+ /// target-independent canonicalization transform.
TargetTransformInfo &TTI;
public:
@@ -57,7 +59,7 @@ public:
protected:
/// A worklist of the instructions that need to be simplified.
- InstCombineWorklist &Worklist;
+ InstructionWorklist &Worklist;
// Mode in which we are running the combiner.
const bool MinimizeSize;
@@ -81,7 +83,7 @@ protected:
bool MadeIRChange = false;
public:
- InstCombiner(InstCombineWorklist &Worklist, BuilderTy &Builder,
+ InstCombiner(InstructionWorklist &Worklist, BuilderTy &Builder,
bool MinimizeSize, AAResults *AA, AssumptionCache &AC,
TargetLibraryInfo &TLI, TargetTransformInfo &TTI,
DominatorTree &DT, OptimizationRemarkEmitter &ORE,
@@ -165,16 +167,16 @@ public:
switch (Pred) {
case ICmpInst::ICMP_SLT: // True if LHS s< 0
TrueIfSigned = true;
- return RHS.isNullValue();
+ return RHS.isZero();
case ICmpInst::ICMP_SLE: // True if LHS s<= -1
TrueIfSigned = true;
- return RHS.isAllOnesValue();
+ return RHS.isAllOnes();
case ICmpInst::ICMP_SGT: // True if LHS s> -1
TrueIfSigned = false;
- return RHS.isAllOnesValue();
+ return RHS.isAllOnes();
case ICmpInst::ICMP_SGE: // True if LHS s>= 0
TrueIfSigned = false;
- return RHS.isNullValue();
+ return RHS.isZero();
case ICmpInst::ICMP_UGT:
// True if LHS u> RHS and RHS == sign-bit-mask - 1
TrueIfSigned = true;
@@ -246,12 +248,13 @@ public:
// If `V` is of the form `A + Constant` then `-1 - V` can be folded into
// `(-1 - Constant) - A` if we are willing to invert all of the uses.
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V))
- if (BO->getOpcode() == Instruction::Add ||
- BO->getOpcode() == Instruction::Sub)
- if (match(BO, PatternMatch::m_c_BinOp(PatternMatch::m_Value(),
- PatternMatch::m_ImmConstant())))
- return WillInvertAllUses;
+ if (match(V, m_Add(PatternMatch::m_Value(), PatternMatch::m_ImmConstant())))
+ return WillInvertAllUses;
+
+ // If `V` is of the form `Constant - A` then `-1 - V` can be folded into
+ // `A + (-1 - Constant)` if we are willing to invert all of the uses.
+ if (match(V, m_Sub(PatternMatch::m_ImmConstant(), PatternMatch::m_Value())))
+ return WillInvertAllUses;
// Selects with invertible operands are freely invertible
if (match(V,
@@ -259,6 +262,12 @@ public:
m_Not(PatternMatch::m_Value()))))
return WillInvertAllUses;
+ // Min/max may be in the form of intrinsics, so handle those identically
+ // to select patterns.
+ if (match(V, m_MaxOrMin(m_Not(PatternMatch::m_Value()),
+ m_Not(PatternMatch::m_Value()))))
+ return WillInvertAllUses;
+
return false;
}
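The two folds documented above can be spot-checked on ordinary integers; the snippet below is only a sanity check of the algebra, not InstCombine code (the 8-bit type stands in for wrapping IR arithmetic, and the concrete values are assumptions).

#include <cstdint>

// -1 - (A + C)  ==  (-1 - C) - A      with A = 5, C = 3
static_assert(static_cast<int8_t>(-1 - (5 + 3)) ==
                  static_cast<int8_t>((-1 - 3) - 5),
              "add form");
// -1 - (C - A)  ==  A + (-1 - C)      with A = 5, C = 3
static_assert(static_cast<int8_t>(-1 - (3 - 5)) ==
                  static_cast<int8_t>(5 + (-1 - 3)),
              "sub form");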
@@ -354,14 +363,6 @@ public:
return ConstantVector::get(Out);
}
- /// Create and insert the idiom we use to indicate a block is unreachable
- /// without having to rewrite the CFG from within InstCombine.
- static void CreateNonTerminatorUnreachable(Instruction *InsertAt) {
- auto &Ctx = InsertAt->getContext();
- new StoreInst(ConstantInt::getTrue(Ctx),
- UndefValue::get(Type::getInt1PtrTy(Ctx)), InsertAt);
- }
-
void addToWorklist(Instruction *I) { Worklist.push(I); }
AssumptionCache &getAssumptionCache() const { return AC; }
@@ -479,6 +480,11 @@ public:
return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT);
}
+ unsigned ComputeMinSignedBits(const Value *Op, unsigned Depth = 0,
+ const Instruction *CxtI = nullptr) const {
+ return llvm::ComputeMinSignedBits(Op, DL, Depth, &AC, CxtI, &DT);
+ }
+
OverflowResult computeOverflowForUnsignedMul(const Value *LHS,
const Value *RHS,
const Instruction *CxtI) const {
diff --git a/llvm/include/llvm/Transforms/Instrumentation.h b/llvm/include/llvm/Transforms/Instrumentation.h
index 03108bacb0da..a288a3972c3d 100644
--- a/llvm/include/llvm/Transforms/Instrumentation.h
+++ b/llvm/include/llvm/Transforms/Instrumentation.h
@@ -78,7 +78,7 @@ struct GCOVOptions {
ModulePass *createGCOVProfilerPass(const GCOVOptions &Options =
GCOVOptions::getDefault());
-// PGO Instrumention. Parameter IsCS indicates if this is the context senstive
+// PGO Instrumentation. Parameter IsCS indicates if this is the context sensitive
// instrumentation.
ModulePass *createPGOInstrumentationGenLegacyPass(bool IsCS = false);
ModulePass *
@@ -138,7 +138,7 @@ struct InstrProfOptions {
};
/// Insert frontend instrumentation based profiling. Parameter IsCS indicates if
-// this is the context senstive instrumentation.
+// this is the context sensitive instrumentation.
ModulePass *createInstrProfilingLegacyPass(
const InstrProfOptions &Options = InstrProfOptions(), bool IsCS = false);
@@ -169,6 +169,8 @@ struct SanitizerCoverageOptions {
bool PCTable = false;
bool NoPrune = false;
bool StackDepth = false;
+ bool TraceLoads = false;
+ bool TraceStores = false;
SanitizerCoverageOptions() = default;
};
diff --git a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h
index 3781253d2694..c13407a44091 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h
@@ -1,9 +1,8 @@
//===--------- Definition of the AddressSanitizer class ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -90,6 +89,14 @@ private:
static AnalysisKey Key;
};
+struct AddressSanitizerOptions {
+ bool CompileKernel = false;
+ bool Recover = false;
+ bool UseAfterScope = false;
+ AsanDetectStackUseAfterReturnMode UseAfterReturn =
+ AsanDetectStackUseAfterReturnMode::Runtime;
+};
+
/// Public interface to the address sanitizer pass for instrumenting code to
/// check for various memory errors at runtime.
///
@@ -99,19 +106,15 @@ private:
/// surrounding requested memory to be checked for invalid accesses.
class AddressSanitizerPass : public PassInfoMixin<AddressSanitizerPass> {
public:
- explicit AddressSanitizerPass(
- bool CompileKernel = false, bool Recover = false,
- bool UseAfterScope = false,
- AsanDetectStackUseAfterReturnMode UseAfterReturn =
- AsanDetectStackUseAfterReturnMode::Runtime);
+ AddressSanitizerPass(const AddressSanitizerOptions &Options)
+ : Options(Options){};
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
static bool isRequired() { return true; }
private:
- bool CompileKernel;
- bool Recover;
- bool UseAfterScope;
- AsanDetectStackUseAfterReturnMode UseAfterReturn;
+ AddressSanitizerOptions Options;
};
/// Public interface to the address sanitizer module pass for instrumenting code
@@ -122,16 +125,17 @@ private:
class ModuleAddressSanitizerPass
: public PassInfoMixin<ModuleAddressSanitizerPass> {
public:
- explicit ModuleAddressSanitizerPass(
- bool CompileKernel = false, bool Recover = false, bool UseGlobalGC = true,
+ ModuleAddressSanitizerPass(
+ const AddressSanitizerOptions &Options, bool UseGlobalGC = true,
bool UseOdrIndicator = false,
AsanDtorKind DestructorKind = AsanDtorKind::Global);
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
static bool isRequired() { return true; }
private:
- bool CompileKernel;
- bool Recover;
+ AddressSanitizerOptions Options;
bool UseGlobalGC;
bool UseOdrIndicator;
AsanDtorKind DestructorKind;
@@ -148,6 +152,16 @@ ModulePass *createModuleAddressSanitizerLegacyPassPass(
bool UseOdrIndicator = true,
AsanDtorKind DestructorKind = AsanDtorKind::Global);
+struct ASanAccessInfo {
+ const int32_t Packed;
+ const uint8_t AccessSizeIndex;
+ const bool IsWrite;
+ const bool CompileKernel;
+
+ explicit ASanAccessInfo(int32_t Packed);
+ ASanAccessInfo(bool IsWrite, bool CompileKernel, uint8_t AccessSizeIndex);
+};
+
} // namespace llvm
#endif
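A sketch of constructing the sanitizer passes with the new options struct; the helper and the specific option values are assumptions.

#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Instrumentation/AddressSanitizer.h"

using namespace llvm;

static void addAddressSanitizer(ModulePassManager &MPM,
                                FunctionPassManager &FPM) {
  AddressSanitizerOptions Opts;
  Opts.Recover = true;
  Opts.UseAfterScope = true;

  // Function-level instrumentation plus the module pass for globals and ctors.
  FPM.addPass(AddressSanitizerPass(Opts));
  MPM.addPass(ModuleAddressSanitizerPass(Opts, /*UseGlobalGC=*/true));
}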
diff --git a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h
index 0228992af874..6c351e3f8e1f 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h
@@ -1,9 +1,8 @@
//===--------- Definition of the AddressSanitizer class ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -18,6 +17,7 @@
#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
namespace llvm {
@@ -26,7 +26,6 @@ class InterestingMemoryOperand {
public:
Use *PtrUse;
bool IsWrite;
- Type *OpType;
uint64_t TypeSize;
MaybeAlign Alignment;
// The mask Value, if we're looking at a masked load/store.
@@ -35,8 +34,7 @@ public:
InterestingMemoryOperand(Instruction *I, unsigned OperandNo, bool IsWrite,
class Type *OpType, MaybeAlign Alignment,
Value *MaybeMask = nullptr)
- : IsWrite(IsWrite), OpType(OpType), Alignment(Alignment),
- MaybeMask(MaybeMask) {
+ : IsWrite(IsWrite), Alignment(Alignment), MaybeMask(MaybeMask) {
const DataLayout &DL = I->getModule()->getDataLayout();
TypeSize = DL.getTypeStoreSizeInBits(OpType);
PtrUse = &I->getOperandUse(OperandNo);
@@ -47,47 +45,56 @@ public:
Value *getPtr() { return PtrUse->get(); }
};
-// For an alloca valid between lifetime markers Start and End, call the
+// For an alloca valid between lifetime markers Start and Ends, call the
// Callback for all possible exits out of the lifetime in the containing
// function, which can return from the instructions in RetVec.
//
-// Returns whether End was the only possible exit. If it wasn't, the caller
-// should remove End to ensure that work done at the other exits does not
-// happen outside of the lifetime.
+// Returns whether Ends covered all possible exits. If they did not,
+// the caller should remove Ends to ensure that work done at the other
+// exits does not happen outside of the lifetime.
template <typename F>
bool forAllReachableExits(const DominatorTree &DT, const PostDominatorTree &PDT,
- const Instruction *Start, Instruction *End,
+ const Instruction *Start,
+ const SmallVectorImpl<IntrinsicInst *> &Ends,
const SmallVectorImpl<Instruction *> &RetVec,
F Callback) {
- // We need to ensure that if we tag some object, we certainly untag it
- // before the function exits.
- if (PDT.dominates(End, Start)) {
- Callback(End);
- } else {
- SmallVector<Instruction *, 8> ReachableRetVec;
- unsigned NumCoveredExits = 0;
- for (auto &RI : RetVec) {
- if (!isPotentiallyReachable(Start, RI, nullptr, &DT))
- continue;
- ReachableRetVec.push_back(RI);
- if (DT.dominates(End, RI))
- ++NumCoveredExits;
- }
- // If there's a mix of covered and non-covered exits, just put the untag
- // on exits, so we avoid the redundancy of untagging twice.
- if (NumCoveredExits == ReachableRetVec.size()) {
+ if (Ends.size() == 1 && PDT.dominates(Ends[0], Start)) {
+ Callback(Ends[0]);
+ return true;
+ }
+ SmallVector<Instruction *, 8> ReachableRetVec;
+ unsigned NumCoveredExits = 0;
+ for (auto &RI : RetVec) {
+ if (!isPotentiallyReachable(Start, RI, nullptr, &DT))
+ continue;
+ ReachableRetVec.push_back(RI);
+ // TODO(fmayer): We don't support diamond shapes, where multiple lifetime
+ // ends together dominate the RI, but none of them does by itself.
+ // Check how often this happens and decide whether to support this here.
+ if (std::any_of(Ends.begin(), Ends.end(),
+ [&](Instruction *End) { return DT.dominates(End, RI); }))
+ ++NumCoveredExits;
+ }
+ // If there's a mix of covered and non-covered exits, just put the untag
+ // on exits, so we avoid the redundancy of untagging twice.
+ if (NumCoveredExits == ReachableRetVec.size()) {
+ for (auto *End : Ends)
Callback(End);
- } else {
- for (auto &RI : ReachableRetVec)
- Callback(RI);
- // We may have inserted untag outside of the lifetime interval.
- // Signal the caller to remove the lifetime end call for this alloca.
- return false;
- }
+ } else {
+ for (auto &RI : ReachableRetVec)
+ Callback(RI);
+ // We may have inserted untag outside of the lifetime interval.
+ // Signal the caller to remove the lifetime end call for this alloca.
+ return false;
}
return true;
}
+// Get AddressSanitizer parameters.
+void getAddressSanitizerParams(const Triple &TargetTriple, int LongSize,
+ bool IsKasan, uint64_t *ShadowBase,
+ int *MappingScale, bool *OrShadowOffset);
+
} // namespace llvm
#endif
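A sketch of calling the generalized helper with several lifetime ends; the wrapper, its parameters, and the comments about what would be inserted at each exit are assumptions about how a sanitizer pass uses it.

#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"

using namespace llvm;

static void instrumentLifetimeExits(
    const DominatorTree &DT, const PostDominatorTree &PDT,
    const IntrinsicInst *LifetimeStart,
    const SmallVectorImpl<IntrinsicInst *> &LifetimeEnds,
    const SmallVectorImpl<Instruction *> &RetVec) {
  bool EndsCoverAllExits = forAllReachableExits(
      DT, PDT, LifetimeStart, LifetimeEnds, RetVec, [](Instruction *Exit) {
        // A sanitizer would emit its untag/poison code right before Exit.
      });
  if (!EndsCoverAllExits) {
    // Work was emitted at returns outside the lifetime; the caller is then
    // expected to drop the lifetime.end markers for this alloca.
  }
}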
diff --git a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerOptions.h b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerOptions.h
index 029b3fc4b788..f019d1c00a35 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerOptions.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerOptions.h
@@ -1,9 +1,8 @@
//===--------- Definition of the AddressSanitizer options -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file defines data types used to set Address Sanitizer options.
diff --git a/llvm/include/llvm/Transforms/Instrumentation/HWAddressSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/HWAddressSanitizer.h
index 2e4f3338030a..3118a3762935 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/HWAddressSanitizer.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/HWAddressSanitizer.h
@@ -1,9 +1,8 @@
//===--------- Definition of the HWAddressSanitizer class -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -19,21 +18,32 @@
namespace llvm {
+struct HWAddressSanitizerOptions {
+ HWAddressSanitizerOptions()
+ : HWAddressSanitizerOptions(false, false, false){};
+ HWAddressSanitizerOptions(bool CompileKernel, bool Recover,
+ bool DisableOptimization)
+ : CompileKernel(CompileKernel), Recover(Recover),
+ DisableOptimization(DisableOptimization){};
+ bool CompileKernel;
+ bool Recover;
+ bool DisableOptimization;
+};
+
/// This is a public interface to the hardware address sanitizer pass for
/// instrumenting code to check for various memory errors at runtime, similar to
/// AddressSanitizer but based on partial hardware assistance.
class HWAddressSanitizerPass : public PassInfoMixin<HWAddressSanitizerPass> {
public:
- explicit HWAddressSanitizerPass(bool CompileKernel = false,
- bool Recover = false,
- bool DisableOptimization = false);
+ explicit HWAddressSanitizerPass(HWAddressSanitizerOptions Options)
+ : Options(Options){};
PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
static bool isRequired() { return true; }
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
private:
- bool CompileKernel;
- bool Recover;
- bool DisableOptimization;
+ HWAddressSanitizerOptions Options;
};
FunctionPass *
diff --git a/llvm/include/llvm/Transforms/Instrumentation/InstrOrderFile.h b/llvm/include/llvm/Transforms/Instrumentation/InstrOrderFile.h
index f0f375e0acf6..e3d75f675c93 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/InstrOrderFile.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/InstrOrderFile.h
@@ -1,9 +1,8 @@
//===- InstrOrderFile.h ---- Late IR instrumentation for order file ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/llvm/include/llvm/Transforms/Instrumentation/MemProfiler.h b/llvm/include/llvm/Transforms/Instrumentation/MemProfiler.h
index ac6a07d299a6..f4d1b1d90e6f 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/MemProfiler.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/MemProfiler.h
@@ -1,9 +1,8 @@
//===--------- Definition of the MemProfiler class --------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h
index f5f9ec7829bd..d47beb93397e 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h
@@ -40,6 +40,23 @@ struct MemorySanitizerPass : public PassInfoMixin<MemorySanitizerPass> {
MemorySanitizerPass(MemorySanitizerOptions Options) : Options(Options) {}
PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
+ static bool isRequired() { return true; }
+
+private:
+ MemorySanitizerOptions Options;
+};
+
+/// A module pass for msan instrumentation.
+///
+/// Instruments functions to detect uninitialized reads. This pass inserts
+/// calls to runtime library functions. If the functions aren't declared yet,
+/// the pass inserts the declarations. Otherwise the existing globals are used.
+struct ModuleMemorySanitizerPass : public PassInfoMixin<ModuleMemorySanitizerPass> {
+ ModuleMemorySanitizerPass(MemorySanitizerOptions Options) : Options(Options) {}
+
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
static bool isRequired() { return true; }
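A hedged sketch of how the function/module split might be driven from a new-PM pipeline; relying on the default-constructed MemorySanitizerOptions is an assumption here:

    #include "llvm/IR/PassManager.h"
    #include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
    using namespace llvm;

    void addMSan(ModulePassManager &MPM) {
      MemorySanitizerOptions Opts;
      // Module-level setup (ctor, globals) ...
      MPM.addPass(ModuleMemorySanitizerPass(Opts));
      // ... plus the per-function shadow instrumentation.
      FunctionPassManager FPM;
      FPM.addPass(MemorySanitizerPass(Opts));
      MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
    }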
diff --git a/llvm/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h
index f9c507624e6d..e795043630d5 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h
@@ -27,6 +27,14 @@ FunctionPass *createThreadSanitizerLegacyPassPass();
/// yet, the pass inserts the declarations. Otherwise the existing globals are
struct ThreadSanitizerPass : public PassInfoMixin<ThreadSanitizerPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+ static bool isRequired() { return true; }
+};
+
+/// A module pass for tsan instrumentation.
+///
+/// Create ctor and init functions.
+struct ModuleThreadSanitizerPass
+ : public PassInfoMixin<ModuleThreadSanitizerPass> {
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
static bool isRequired() { return true; }
};
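The same pattern applies to tsan; a sketch assuming the usual module-to-function adaptor:

    #include "llvm/IR/PassManager.h"
    #include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
    using namespace llvm;

    void addTSan(ModulePassManager &MPM) {
      MPM.addPass(ModuleThreadSanitizerPass()); // ctor/init emitted once per module
      FunctionPassManager FPM;
      FPM.addPass(ThreadSanitizerPass());       // per-function load/store instrumentation
      MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
    }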
diff --git a/llvm/include/llvm/Transforms/Scalar/EarlyCSE.h b/llvm/include/llvm/Transforms/Scalar/EarlyCSE.h
index 1e7fd71dcbf4..877d8145e746 100644
--- a/llvm/include/llvm/Transforms/Scalar/EarlyCSE.h
+++ b/llvm/include/llvm/Transforms/Scalar/EarlyCSE.h
@@ -32,6 +32,8 @@ struct EarlyCSEPass : PassInfoMixin<EarlyCSEPass> {
/// Run the pass over the function.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
bool UseMemorySSA;
};
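As a rough sketch (not necessarily the committed implementation), a printPipeline override for an option-carrying pass typically delegates to the PassInfoMixin default and then appends its parameters:

    void EarlyCSEPass::printPipeline(
        raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
      static_cast<PassInfoMixin<EarlyCSEPass> *>(this)->printPipeline(
          OS, MapClassName2PassName);
      if (UseMemorySSA)
        OS << "<memssa>"; // e.g. "early-cse<memssa>" under -print-pipeline-passes
    }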
diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h
index 5c29b289d158..cbe5057b9cde 100644
--- a/llvm/include/llvm/Transforms/Scalar/GVN.h
+++ b/llvm/include/llvm/Transforms/Scalar/GVN.h
@@ -115,17 +115,20 @@ struct GVNOptions {
///
/// FIXME: We should have a good summary of the GVN algorithm implemented by
/// this particular pass here.
-class GVN : public PassInfoMixin<GVN> {
+class GVNPass : public PassInfoMixin<GVNPass> {
GVNOptions Options;
public:
struct Expression;
- GVN(GVNOptions Options = {}) : Options(Options) {}
+ GVNPass(GVNOptions Options = {}) : Options(Options) {}
/// Run the pass over the function.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
+
/// This removes the specified instruction from
/// our various maps and marks it for deletion.
void markInstructionForDeletion(Instruction *I) {
@@ -179,11 +182,11 @@ public:
Expression createExtractvalueExpr(ExtractValueInst *EI);
uint32_t lookupOrAddCall(CallInst *C);
uint32_t phiTranslateImpl(const BasicBlock *BB, const BasicBlock *PhiBlock,
- uint32_t Num, GVN &Gvn);
+ uint32_t Num, GVNPass &Gvn);
bool areCallValsEqual(uint32_t Num, uint32_t NewNum, const BasicBlock *Pred,
- const BasicBlock *PhiBlock, GVN &Gvn);
+ const BasicBlock *PhiBlock, GVNPass &Gvn);
std::pair<uint32_t, bool> assignExpNewValueNum(Expression &exp);
- bool areAllValsInBB(uint32_t num, const BasicBlock *BB, GVN &Gvn);
+ bool areAllValsInBB(uint32_t num, const BasicBlock *BB, GVNPass &Gvn);
public:
ValueTable();
@@ -197,7 +200,7 @@ public:
uint32_t lookupOrAddCmp(unsigned Opcode, CmpInst::Predicate Pred,
Value *LHS, Value *RHS);
uint32_t phiTranslate(const BasicBlock *BB, const BasicBlock *PhiBlock,
- uint32_t Num, GVN &Gvn);
+ uint32_t Num, GVNPass &Gvn);
void eraseTranslateCacheEntry(uint32_t Num, const BasicBlock &CurrBlock);
bool exists(Value *V) const;
void add(Value *V, uint32_t num);
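Callers now spell the pass GVNPass rather than GVN; a minimal usage sketch (the setPRE knob on GVNOptions is assumed):

    #include "llvm/IR/PassManager.h"
    #include "llvm/Transforms/Scalar/GVN.h"
    using namespace llvm;

    void addGVN(FunctionPassManager &FPM) {
      FPM.addPass(GVNPass());                           // default GVNOptions
      FPM.addPass(GVNPass(GVNOptions().setPRE(false))); // assumed option setter
    }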
diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
index 816ea1071e52..0ac7d7c62b7a 100644
--- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
+++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
@@ -44,6 +44,7 @@ class PHINode;
class SelectInst;
class SwitchInst;
class TargetLibraryInfo;
+class TargetTransformInfo;
class Value;
/// A private "module" namespace for types and utilities used by
@@ -78,6 +79,7 @@ enum ConstantPreference { WantInteger, WantBlockAddress };
/// revectored to the false side of the second if.
class JumpThreadingPass : public PassInfoMixin<JumpThreadingPass> {
TargetLibraryInfo *TLI;
+ TargetTransformInfo *TTI;
LazyValueInfo *LVI;
AAResults *AA;
DomTreeUpdater *DTU;
@@ -99,9 +101,9 @@ public:
JumpThreadingPass(bool InsertFreezeWhenUnfoldingSelect = false, int T = -1);
// Glue for old PM.
- bool runImpl(Function &F, TargetLibraryInfo *TLI, LazyValueInfo *LVI,
- AAResults *AA, DomTreeUpdater *DTU, bool HasProfileData,
- std::unique_ptr<BlockFrequencyInfo> BFI,
+ bool runImpl(Function &F, TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
+ LazyValueInfo *LVI, AAResults *AA, DomTreeUpdater *DTU,
+ bool HasProfileData, std::unique_ptr<BlockFrequencyInfo> BFI,
std::unique_ptr<BranchProbabilityInfo> BPI);
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
diff --git a/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h b/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h
index 020cfb9a6c85..419729271a23 100644
--- a/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h
+++ b/llvm/include/llvm/Transforms/Scalar/LoopPassManager.h
@@ -94,6 +94,8 @@ public:
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR, LPMUpdater &U);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
/// Add either a loop pass or a loop-nest pass to the pass manager. Append \p
/// Pass to the list of loop passes if it has a dedicated \fn run() method for
/// loops and to the list of loop-nest passes if the \fn run() method is for
@@ -101,51 +103,65 @@ public:
/// to the end of \var IsLoopNestPass so we can easily identify the types of
/// passes in the pass manager later.
template <typename PassT>
- std::enable_if_t<is_detected<HasRunOnLoopT, PassT>::value>
- addPass(PassT &&Pass) {
+ LLVM_ATTRIBUTE_MINSIZE
+ std::enable_if_t<is_detected<HasRunOnLoopT, PassT>::value>
+ addPass(PassT &&Pass) {
using LoopPassModelT =
detail::PassModel<Loop, PassT, PreservedAnalyses, LoopAnalysisManager,
LoopStandardAnalysisResults &, LPMUpdater &>;
IsLoopNestPass.push_back(false);
- LoopPasses.emplace_back(new LoopPassModelT(std::forward<PassT>(Pass)));
+ // Do not use make_unique or emplace_back, they cause too many template
+ // instantiations, causing terrible compile times.
+ LoopPasses.push_back(std::unique_ptr<LoopPassConceptT>(
+ new LoopPassModelT(std::forward<PassT>(Pass))));
}
template <typename PassT>
- std::enable_if_t<!is_detected<HasRunOnLoopT, PassT>::value>
- addPass(PassT &&Pass) {
+ LLVM_ATTRIBUTE_MINSIZE
+ std::enable_if_t<!is_detected<HasRunOnLoopT, PassT>::value>
+ addPass(PassT &&Pass) {
using LoopNestPassModelT =
detail::PassModel<LoopNest, PassT, PreservedAnalyses,
LoopAnalysisManager, LoopStandardAnalysisResults &,
LPMUpdater &>;
IsLoopNestPass.push_back(true);
- LoopNestPasses.emplace_back(
- new LoopNestPassModelT(std::forward<PassT>(Pass)));
+ // Do not use make_unique or emplace_back, they cause too many template
+ // instantiations, causing terrible compile times.
+ LoopNestPasses.push_back(std::unique_ptr<LoopNestPassConceptT>(
+ new LoopNestPassModelT(std::forward<PassT>(Pass))));
}
// Specializations of `addPass` for `RepeatedPass`. These are necessary since
// `RepeatedPass` has a templated `run` method that will result in incorrect
// detection of `HasRunOnLoopT`.
template <typename PassT>
- std::enable_if_t<is_detected<HasRunOnLoopT, PassT>::value>
- addPass(RepeatedPass<PassT> &&Pass) {
+ LLVM_ATTRIBUTE_MINSIZE
+ std::enable_if_t<is_detected<HasRunOnLoopT, PassT>::value>
+ addPass(RepeatedPass<PassT> &&Pass) {
using RepeatedLoopPassModelT =
detail::PassModel<Loop, RepeatedPass<PassT>, PreservedAnalyses,
LoopAnalysisManager, LoopStandardAnalysisResults &,
LPMUpdater &>;
IsLoopNestPass.push_back(false);
- LoopPasses.emplace_back(new RepeatedLoopPassModelT(std::move(Pass)));
+ // Do not use make_unique or emplace_back, they cause too many template
+ // instantiations, causing terrible compile times.
+ LoopPasses.push_back(std::unique_ptr<LoopPassConceptT>(
+ new RepeatedLoopPassModelT(std::move(Pass))));
}
template <typename PassT>
- std::enable_if_t<!is_detected<HasRunOnLoopT, PassT>::value>
- addPass(RepeatedPass<PassT> &&Pass) {
+ LLVM_ATTRIBUTE_MINSIZE
+ std::enable_if_t<!is_detected<HasRunOnLoopT, PassT>::value>
+ addPass(RepeatedPass<PassT> &&Pass) {
using RepeatedLoopNestPassModelT =
detail::PassModel<LoopNest, RepeatedPass<PassT>, PreservedAnalyses,
LoopAnalysisManager, LoopStandardAnalysisResults &,
LPMUpdater &>;
IsLoopNestPass.push_back(true);
- LoopNestPasses.emplace_back(
- new RepeatedLoopNestPassModelT(std::move(Pass)));
+ // Do not use make_unique or emplace_back, they cause too many template
+ // instantiations, causing terrible compile times.
+ LoopNestPasses.push_back(std::unique_ptr<LoopNestPassConceptT>(
+ new RepeatedLoopNestPassModelT(std::move(Pass))));
}
bool isEmpty() const { return LoopPasses.empty() && LoopNestPasses.empty(); }
@@ -215,6 +231,12 @@ struct RequireAnalysisPass<AnalysisT, Loop, LoopAnalysisManager,
(void)AM.template getResult<AnalysisT>(L, AR);
return PreservedAnalyses::all();
}
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ auto ClassName = AnalysisT::name();
+ auto PassName = MapClassName2PassName(ClassName);
+ OS << "require<" << PassName << ">";
+ }
};
/// An alias template to easily name a require analysis loop pass.
@@ -259,8 +281,6 @@ public:
/// state, this routine will mark that the current loop should be skipped by
/// the rest of the pass management infrastructure.
void markLoopAsDeleted(Loop &L, llvm::StringRef Name) {
- assert((!LoopNestMode || CurrentL == &L) &&
- "L should be a top-level loop in loop-nest mode.");
LAM.clear(L, Name);
assert((&L == CurrentL || CurrentL->contains(&L)) &&
"Cannot delete a loop outside of the "
@@ -413,10 +433,12 @@ public:
explicit FunctionToLoopPassAdaptor(std::unique_ptr<PassConceptT> Pass,
bool UseMemorySSA = false,
bool UseBlockFrequencyInfo = false,
+ bool UseBranchProbabilityInfo = false,
bool LoopNestMode = false)
: Pass(std::move(Pass)), LoopCanonicalizationFPM(),
UseMemorySSA(UseMemorySSA),
UseBlockFrequencyInfo(UseBlockFrequencyInfo),
+ UseBranchProbabilityInfo(UseBranchProbabilityInfo),
LoopNestMode(LoopNestMode) {
LoopCanonicalizationFPM.addPass(LoopSimplifyPass());
LoopCanonicalizationFPM.addPass(LCSSAPass());
@@ -424,6 +446,8 @@ public:
/// Runs the loop passes across every loop in the function.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
static bool isRequired() { return true; }
@@ -436,6 +460,7 @@ private:
bool UseMemorySSA = false;
bool UseBlockFrequencyInfo = false;
+ bool UseBranchProbabilityInfo = false;
const bool LoopNestMode;
};
@@ -447,13 +472,17 @@ template <typename LoopPassT>
inline std::enable_if_t<is_detected<HasRunOnLoopT, LoopPassT>::value,
FunctionToLoopPassAdaptor>
createFunctionToLoopPassAdaptor(LoopPassT &&Pass, bool UseMemorySSA = false,
- bool UseBlockFrequencyInfo = false) {
+ bool UseBlockFrequencyInfo = false,
+ bool UseBranchProbabilityInfo = false) {
using PassModelT =
detail::PassModel<Loop, LoopPassT, PreservedAnalyses, LoopAnalysisManager,
LoopStandardAnalysisResults &, LPMUpdater &>;
+ // Do not use make_unique, it causes too many template instantiations,
+ // causing terrible compile times.
return FunctionToLoopPassAdaptor(
- std::make_unique<PassModelT>(std::forward<LoopPassT>(Pass)), UseMemorySSA,
- UseBlockFrequencyInfo, false);
+ std::unique_ptr<FunctionToLoopPassAdaptor::PassConceptT>(
+ new PassModelT(std::forward<LoopPassT>(Pass))),
+ UseMemorySSA, UseBlockFrequencyInfo, UseBranchProbabilityInfo, false);
}
/// If \p Pass is a loop-nest pass, \p Pass will first be wrapped into a
@@ -462,24 +491,29 @@ template <typename LoopNestPassT>
inline std::enable_if_t<!is_detected<HasRunOnLoopT, LoopNestPassT>::value,
FunctionToLoopPassAdaptor>
createFunctionToLoopPassAdaptor(LoopNestPassT &&Pass, bool UseMemorySSA = false,
- bool UseBlockFrequencyInfo = false) {
+ bool UseBlockFrequencyInfo = false,
+ bool UseBranchProbabilityInfo = false) {
LoopPassManager LPM;
LPM.addPass(std::forward<LoopNestPassT>(Pass));
using PassModelT =
detail::PassModel<Loop, LoopPassManager, PreservedAnalyses,
LoopAnalysisManager, LoopStandardAnalysisResults &,
LPMUpdater &>;
- return FunctionToLoopPassAdaptor(std::make_unique<PassModelT>(std::move(LPM)),
- UseMemorySSA, UseBlockFrequencyInfo, true);
+ // Do not use make_unique, it causes too many template instantiations,
+ // causing terrible compile times.
+ return FunctionToLoopPassAdaptor(
+ std::unique_ptr<FunctionToLoopPassAdaptor::PassConceptT>(
+ new PassModelT(std::move(LPM))),
+ UseMemorySSA, UseBlockFrequencyInfo, UseBranchProbabilityInfo, true);
}
/// If \p Pass is an instance of \c LoopPassManager, the returned adaptor will
/// be in loop-nest mode if the pass manager contains only loop-nest passes.
template <>
inline FunctionToLoopPassAdaptor
-createFunctionToLoopPassAdaptor<LoopPassManager>(LoopPassManager &&LPM,
- bool UseMemorySSA,
- bool UseBlockFrequencyInfo) {
+createFunctionToLoopPassAdaptor<LoopPassManager>(
+ LoopPassManager &&LPM, bool UseMemorySSA, bool UseBlockFrequencyInfo,
+ bool UseBranchProbabilityInfo) {
// Check if LPM contains any loop pass and if it does not, returns an adaptor
// in loop-nest mode.
using PassModelT =
@@ -487,9 +521,13 @@ createFunctionToLoopPassAdaptor<LoopPassManager>(LoopPassManager &&LPM,
LoopAnalysisManager, LoopStandardAnalysisResults &,
LPMUpdater &>;
bool LoopNestMode = (LPM.getNumLoopPasses() == 0);
- return FunctionToLoopPassAdaptor(std::make_unique<PassModelT>(std::move(LPM)),
- UseMemorySSA, UseBlockFrequencyInfo,
- LoopNestMode);
+ // Do not use make_unique, it causes too many template instantiations,
+ // causing terrible compile times.
+ return FunctionToLoopPassAdaptor(
+ std::unique_ptr<FunctionToLoopPassAdaptor::PassConceptT>(
+ new PassModelT(std::move(LPM))),
+ UseMemorySSA, UseBlockFrequencyInfo, UseBranchProbabilityInfo,
+ LoopNestMode);
}
/// Pass for printing a loop's contents as textual IR.
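A sketch of building the adaptor with the new UseBranchProbabilityInfo parameter, using LoopRotatePass as a stand-in loop pass:

    #include "llvm/IR/PassManager.h"
    #include "llvm/Transforms/Scalar/LoopPassManager.h"
    #include "llvm/Transforms/Scalar/LoopRotation.h"
    using namespace llvm;

    void addLoopPipeline(FunctionPassManager &FPM) {
      LoopPassManager LPM;
      LPM.addPass(LoopRotatePass());
      FPM.addPass(createFunctionToLoopPassAdaptor(
          std::move(LPM), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true,
          /*UseBranchProbabilityInfo=*/false));
    }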
diff --git a/llvm/include/llvm/Transforms/Scalar/LoopUnrollPass.h b/llvm/include/llvm/Transforms/Scalar/LoopUnrollPass.h
index 30cc08cb42ae..6afe7ecd2a5d 100644
--- a/llvm/include/llvm/Transforms/Scalar/LoopUnrollPass.h
+++ b/llvm/include/llvm/Transforms/Scalar/LoopUnrollPass.h
@@ -140,6 +140,8 @@ public:
: UnrollOpts(UnrollOpts) {}
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
};
} // end namespace llvm
diff --git a/llvm/include/llvm/Transforms/Scalar/LowerMatrixIntrinsics.h b/llvm/include/llvm/Transforms/Scalar/LowerMatrixIntrinsics.h
index dd574e4f32c6..d44d297dd4ff 100644
--- a/llvm/include/llvm/Transforms/Scalar/LowerMatrixIntrinsics.h
+++ b/llvm/include/llvm/Transforms/Scalar/LowerMatrixIntrinsics.h
@@ -23,6 +23,8 @@ class LowerMatrixIntrinsicsPass
public:
LowerMatrixIntrinsicsPass(bool Minimal = false) : Minimal(Minimal) {}
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
static bool isRequired() { return true; }
};
} // namespace llvm
diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
index 635b706d0bef..3a4db13d670a 100644
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -31,7 +31,6 @@ class Instruction;
class LoadInst;
class MemCpyInst;
class MemMoveInst;
-class MemoryDependenceResults;
class MemorySSA;
class MemorySSAUpdater;
class MemSetInst;
@@ -40,7 +39,6 @@ class TargetLibraryInfo;
class Value;
class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
- MemoryDependenceResults *MD = nullptr;
TargetLibraryInfo *TLI = nullptr;
AAResults *AA = nullptr;
AssumptionCache *AC = nullptr;
@@ -54,9 +52,8 @@ public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
// Glue for the old PM.
- bool runImpl(Function &F, MemoryDependenceResults *MD, TargetLibraryInfo *TLI,
- AAResults *AA, AssumptionCache *AC, DominatorTree *DT,
- MemorySSA *MSSA);
+ bool runImpl(Function &F, TargetLibraryInfo *TLI, AAResults *AA,
+ AssumptionCache *AC, DominatorTree *DT, MemorySSA *MSSA);
private:
// Helper functions
@@ -65,7 +62,7 @@ private:
bool processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI);
bool processMemMove(MemMoveInst *M);
bool performCallSlotOptzn(Instruction *cpyLoad, Instruction *cpyStore,
- Value *cpyDst, Value *cpySrc, uint64_t cpyLen,
+ Value *cpyDst, Value *cpySrc, TypeSize cpyLen,
Align cpyAlign, CallInst *C);
bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
bool processMemSetMemCpyDependence(MemCpyInst *MemCpy, MemSetInst *MemSet);
diff --git a/llvm/include/llvm/Transforms/Scalar/MergedLoadStoreMotion.h b/llvm/include/llvm/Transforms/Scalar/MergedLoadStoreMotion.h
index c5f6d6e0e8bd..256d03675a07 100644
--- a/llvm/include/llvm/Transforms/Scalar/MergedLoadStoreMotion.h
+++ b/llvm/include/llvm/Transforms/Scalar/MergedLoadStoreMotion.h
@@ -48,6 +48,8 @@ public:
MergedLoadStoreMotionPass(const MergedLoadStoreMotionOptions &PassOptions)
: Options(PassOptions) {}
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
};
}
diff --git a/llvm/include/llvm/Transforms/Scalar/SROA.h b/llvm/include/llvm/Transforms/Scalar/SROA.h
index 6ef7c6b22c0b..f1a43435d89a 100644
--- a/llvm/include/llvm/Transforms/Scalar/SROA.h
+++ b/llvm/include/llvm/Transforms/Scalar/SROA.h
@@ -62,7 +62,7 @@ class SROALegacyPass;
/// onto insert and extract operations on a vector value, and convert them to
/// this form. By doing so, it will enable promotion of vector aggregates to
/// SSA vector values.
-class SROA : public PassInfoMixin<SROA> {
+class SROAPass : public PassInfoMixin<SROAPass> {
LLVMContext *C = nullptr;
DominatorTree *DT = nullptr;
AssumptionCache *AC = nullptr;
@@ -110,7 +110,7 @@ class SROA : public PassInfoMixin<SROA> {
SetVector<SelectInst *, SmallVector<SelectInst *, 2>> SpeculatableSelects;
public:
- SROA() = default;
+ SROAPass() = default;
/// Run the pass over the function.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
diff --git a/llvm/include/llvm/Transforms/Scalar/SimpleLoopUnswitch.h b/llvm/include/llvm/Transforms/Scalar/SimpleLoopUnswitch.h
index c1a9ab475ead..dfb1619c7f2a 100644
--- a/llvm/include/llvm/Transforms/Scalar/SimpleLoopUnswitch.h
+++ b/llvm/include/llvm/Transforms/Scalar/SimpleLoopUnswitch.h
@@ -69,6 +69,9 @@ public:
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR, LPMUpdater &U);
+
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
};
/// Create the legacy pass object for the simple loop unswitcher.
diff --git a/llvm/include/llvm/Transforms/Scalar/SimplifyCFG.h b/llvm/include/llvm/Transforms/Scalar/SimplifyCFG.h
index 7c5393851ae6..67db5031a443 100644
--- a/llvm/include/llvm/Transforms/Scalar/SimplifyCFG.h
+++ b/llvm/include/llvm/Transforms/Scalar/SimplifyCFG.h
@@ -41,6 +41,9 @@ public:
/// Run the pass over the function.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
};
}
diff --git a/llvm/include/llvm/Transforms/Utils/ASanStackFrameLayout.h b/llvm/include/llvm/Transforms/Utils/ASanStackFrameLayout.h
index 0b570c0d1342..f87588db4ee2 100644
--- a/llvm/include/llvm/Transforms/Utils/ASanStackFrameLayout.h
+++ b/llvm/include/llvm/Transforms/Utils/ASanStackFrameLayout.h
@@ -33,7 +33,7 @@ struct ASanStackVariableDescription {
uint64_t Size; // Size of the variable in bytes.
size_t LifetimeSize; // Size in bytes to use for lifetime analysis check.
// Will be rounded up to Granularity.
- size_t Alignment; // Alignment of the variable (power of 2).
+ uint64_t Alignment; // Alignment of the variable (power of 2).
AllocaInst *AI; // The actual AllocaInst.
size_t Offset; // Offset from the beginning of the frame;
// set by ComputeASanStackFrameLayout.
@@ -42,20 +42,20 @@ struct ASanStackVariableDescription {
// Output data struct for ComputeASanStackFrameLayout.
struct ASanStackFrameLayout {
- size_t Granularity; // Shadow granularity.
- size_t FrameAlignment; // Alignment for the entire frame.
- size_t FrameSize; // Size of the frame in bytes.
+ uint64_t Granularity; // Shadow granularity.
+ uint64_t FrameAlignment; // Alignment for the entire frame.
+ uint64_t FrameSize; // Size of the frame in bytes.
};
ASanStackFrameLayout ComputeASanStackFrameLayout(
// The array of stack variables. The elements may get reordered and changed.
SmallVectorImpl<ASanStackVariableDescription> &Vars,
// AddressSanitizer's shadow granularity. Usually 8, may also be 16, 32, 64.
- size_t Granularity,
+ uint64_t Granularity,
// The minimal size of the left-most redzone (header).
// At least 4 pointer sizes, power of 2, and >= Granularity.
// The resulting FrameSize should be multiple of MinHeaderSize.
- size_t MinHeaderSize);
+ uint64_t MinHeaderSize);
// Compute frame description, see DescribeAddressIfStack in ASan runtime.
SmallString<64> ComputeASanStackFrameDescription(
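A sketch of calling the API with the widened integer types; the concrete granularity and header size are illustrative only:

    #include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
    using namespace llvm;

    ASanStackFrameLayout
    layoutFrame(SmallVectorImpl<ASanStackVariableDescription> &Vars) {
      // Granularity and MinHeaderSize are now uint64_t instead of size_t.
      return ComputeASanStackFrameLayout(Vars, /*Granularity=*/8,
                                         /*MinHeaderSize=*/32);
    }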
diff --git a/llvm/include/llvm/Transforms/Utils/AddDiscriminators.h b/llvm/include/llvm/Transforms/Utils/AddDiscriminators.h
index f512c6c06331..0aee2fe95cad 100644
--- a/llvm/include/llvm/Transforms/Utils/AddDiscriminators.h
+++ b/llvm/include/llvm/Transforms/Utils/AddDiscriminators.h
@@ -24,6 +24,7 @@ class Function;
class AddDiscriminatorsPass : public PassInfoMixin<AddDiscriminatorsPass> {
public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ static bool isRequired() { return true; }
};
} // end namespace llvm
diff --git a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
index b45c1820bb20..8970afb3aeaa 100644
--- a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
@@ -129,6 +129,13 @@ void ReplaceInstWithInst(BasicBlock::InstListType &BIL,
/// To. Copies DebugLoc from BI to I, if I doesn't already have a DebugLoc.
void ReplaceInstWithInst(Instruction *From, Instruction *To);
+/// Check if we can prove that all paths starting from this block converge
+/// to a block that either has a @llvm.experimental.deoptimize call
+/// prior to its terminating return instruction or is terminated by unreachable.
+/// All blocks in the traversed sequence must have a unique successor, except
+/// possibly the last one.
+bool IsBlockFollowedByDeoptOrUnreachable(const BasicBlock *BB);
+
/// Option class for critical edge splitting.
///
/// This provides a builder interface for overriding the default options used
@@ -214,29 +221,6 @@ BasicBlock *SplitKnownCriticalEdge(Instruction *TI, unsigned SuccNum,
CriticalEdgeSplittingOptions(),
const Twine &BBName = "");
-inline BasicBlock *
-SplitCriticalEdge(BasicBlock *BB, succ_iterator SI,
- const CriticalEdgeSplittingOptions &Options =
- CriticalEdgeSplittingOptions()) {
- return SplitCriticalEdge(BB->getTerminator(), SI.getSuccessorIndex(),
- Options);
-}
-
-/// If the edge from *PI to BB is not critical, return false. Otherwise, split
-/// all edges between the two blocks and return true. This updates all of the
-/// same analyses as the other SplitCriticalEdge function. If P is specified, it
-/// updates the analyses described above.
-inline bool SplitCriticalEdge(BasicBlock *Succ, pred_iterator PI,
- const CriticalEdgeSplittingOptions &Options =
- CriticalEdgeSplittingOptions()) {
- bool MadeChange = false;
- Instruction *TI = (*PI)->getTerminator();
- for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
- if (TI->getSuccessor(i) == Succ)
- MadeChange |= !!SplitCriticalEdge(TI, i, Options);
- return MadeChange;
-}
-
/// If an edge from Src to Dst is critical, split the edge and return true,
/// otherwise return false. This method requires that there be an edge between
/// the two blocks. It updates the analyses passed in the options struct
diff --git a/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h b/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
index e7d41933a6c9..87d33b9b11b7 100644
--- a/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
+++ b/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
@@ -54,12 +54,6 @@ namespace llvm {
/// 'i8*' type.
Value *emitStrDup(Value *Ptr, IRBuilderBase &B, const TargetLibraryInfo *TLI);
- /// Emit a call to the strnlen function to the builder, for the specified
- /// pointer. Ptr is required to be some pointer type, MaxLen must be of size_t
- /// type, and the return value has 'intptr_t' type.
- Value *emitStrNLen(Value *Ptr, Value *MaxLen, IRBuilderBase &B,
- const DataLayout &DL, const TargetLibraryInfo *TLI);
-
/// Emit a call to the strchr function to the builder, for the specified
/// pointer and character. Ptr is required to be some pointer type, and the
/// return value has 'i8*' type.
@@ -205,8 +199,8 @@ namespace llvm {
const TargetLibraryInfo *TLI);
/// Emit a call to the calloc function.
- Value *emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
- IRBuilderBase &B, const TargetLibraryInfo &TLI);
+ Value *emitCalloc(Value *Num, Value *Size, IRBuilderBase &B,
+ const TargetLibraryInfo &TLI);
}
#endif
diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h
index f4fb265c25e0..5a1f322b2054 100644
--- a/llvm/include/llvm/Transforms/Utils/Cloning.h
+++ b/llvm/include/llvm/Transforms/Utils/Cloning.h
@@ -296,10 +296,10 @@ BasicBlock *DuplicateInstructionsInSplitBetween(BasicBlock *BB,
DomTreeUpdater &DTU);
/// Updates profile information by adjusting the entry count by adding
-/// entryDelta then scaling callsite information by the new count divided by the
+/// EntryDelta then scaling callsite information by the new count divided by the
/// old count. VMap is used during inlining to also update the new clone.
void updateProfileCallee(
- Function *Callee, int64_t entryDelta,
+ Function *Callee, int64_t EntryDelta,
const ValueMap<const Value *, WeakTrackingVH> *VMap = nullptr);
/// Find the 'llvm.experimental.noalias.scope.decl' intrinsics in the specified
diff --git a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
index 1d9f2d135488..f08173e45a5b 100644
--- a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
+++ b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
@@ -100,6 +100,10 @@ public:
unsigned NumExitBlocks = std::numeric_limits<unsigned>::max();
Type *RetTy;
+ // Mapping from the original exit blocks to the new blocks inside
+ // the function.
+ SmallVector<BasicBlock *, 4> OldTargets;
+
// Suffix to use when creating extracted function (appended to the original
// function name + "."). If empty, the default is to use the entry block
// label, if non-empty, otherwise "extracted".
@@ -139,6 +143,20 @@ public:
/// returns false.
Function *extractCodeRegion(const CodeExtractorAnalysisCache &CEAC);
+ /// Perform the extraction, returning the new function and providing an
+ /// interface to see what was categorized as inputs and outputs.
+ ///
+ /// \param CEAC - Cache to speed up operations for the CodeExtractor when
+ /// hoisting, and extracting lifetime values and assumes.
+ /// \param Inputs [out] - filled with values marked as inputs to the
+ /// newly outlined function.
+ /// \param Outputs [out] - filled with values marked as outputs to the
+ /// newly outlined function.
+ /// \returns zero when called on a CodeExtractor instance where isEligible
+ /// returns false.
+ Function *extractCodeRegion(const CodeExtractorAnalysisCache &CEAC,
+ ValueSet &Inputs, ValueSet &Outputs);
+
/// Verify that assumption cache isn't stale after a region is extracted.
/// Returns true when verifier finds errors. AssumptionCache is passed as
/// parameter to make this function stateless.
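A hedged sketch of the new overload; the caller is assumed to have collected the region blocks already:

    #include "llvm/Transforms/Utils/CodeExtractor.h"
    using namespace llvm;

    Function *outlineRegion(ArrayRef<BasicBlock *> Blocks, Function &F) {
      CodeExtractorAnalysisCache CEAC(F);
      CodeExtractor CE(Blocks);
      if (!CE.isEligible())
        return nullptr;
      CodeExtractor::ValueSet Inputs, Outputs;
      // Inputs/Outputs are filled with the values crossing the region boundary.
      return CE.extractCodeRegion(CEAC, Inputs, Outputs);
    }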
diff --git a/llvm/include/llvm/Transforms/Utils/CodeMoverUtils.h b/llvm/include/llvm/Transforms/Utils/CodeMoverUtils.h
index 630f936471f2..0f32a97f9fcc 100644
--- a/llvm/include/llvm/Transforms/Utils/CodeMoverUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/CodeMoverUtils.h
@@ -40,7 +40,8 @@ bool isControlFlowEquivalent(const BasicBlock &BB0, const BasicBlock &BB1,
bool isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint,
DominatorTree &DT,
const PostDominatorTree *PDT = nullptr,
- DependenceInfo *DI = nullptr);
+ DependenceInfo *DI = nullptr,
+ bool CheckForEntireBlock = false);
/// Return true if all instructions (except the terminator) in \p BB can be
/// safely moved before \p InsertPoint.
@@ -62,6 +63,19 @@ void moveInstructionsToTheEnd(BasicBlock &FromBB, BasicBlock &ToBB,
DominatorTree &DT, const PostDominatorTree &PDT,
DependenceInfo &DI);
+/// If the two blocks \p ThisBlock and \p OtherBlock are control flow
+/// equivalent but neither strictly dominates and post-dominates the
+/// other, determine whether \p ThisBlock is reached after \p OtherBlock
+/// in the control flow.
+bool nonStrictlyPostDominate(const BasicBlock *ThisBlock,
+ const BasicBlock *OtherBlock,
+ const DominatorTree *DT,
+ const PostDominatorTree *PDT);
+
+// Check if I0 is reached before I1 in the control flow.
+bool isReachedBefore(const Instruction *I0, const Instruction *I1,
+ const DominatorTree *DT, const PostDominatorTree *PDT);
+
} // end namespace llvm
#endif // LLVM_TRANSFORMS_UTILS_CODEMOVERUTILS_H
diff --git a/llvm/include/llvm/Transforms/Utils/EntryExitInstrumenter.h b/llvm/include/llvm/Transforms/Utils/EntryExitInstrumenter.h
index 31c023019272..f2b038494a5d 100644
--- a/llvm/include/llvm/Transforms/Utils/EntryExitInstrumenter.h
+++ b/llvm/include/llvm/Transforms/Utils/EntryExitInstrumenter.h
@@ -27,6 +27,9 @@ struct EntryExitInstrumenterPass
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
+
bool PostInlining;
static bool isRequired() { return true; }
diff --git a/llvm/include/llvm/Transforms/Utils/FunctionImportUtils.h b/llvm/include/llvm/Transforms/Utils/FunctionImportUtils.h
index 024d84a7abc8..749b7b2bb5d8 100644
--- a/llvm/include/llvm/Transforms/Utils/FunctionImportUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/FunctionImportUtils.h
@@ -130,9 +130,6 @@ bool renameModuleForThinLTO(
bool ClearDSOLocalOnDeclarations,
SetVector<GlobalValue *> *GlobalsToImport = nullptr);
-/// Compute synthetic function entry counts.
-void computeSyntheticCounts(ModuleSummaryIndex &Index);
-
} // End llvm namespace
#endif
diff --git a/llvm/include/llvm/Transforms/Utils/GlobalStatus.h b/llvm/include/llvm/Transforms/Utils/GlobalStatus.h
index 519593c96766..78d7845c4353 100644
--- a/llvm/include/llvm/Transforms/Utils/GlobalStatus.h
+++ b/llvm/include/llvm/Transforms/Utils/GlobalStatus.h
@@ -9,6 +9,7 @@
#ifndef LLVM_TRANSFORMS_UTILS_GLOBALSTATUS_H
#define LLVM_TRANSFORMS_UTILS_GLOBALSTATUS_H
+#include "llvm/IR/Instructions.h"
#include "llvm/Support/AtomicOrdering.h"
namespace llvm {
@@ -45,7 +46,7 @@ struct GlobalStatus {
/// This global is stored to, but only its initializer and one other value
/// is ever stored to it. If this global isStoredOnce, we track the value
- /// stored to it in StoredOnceValue below. This is only tracked for scalar
+ /// stored to it via StoredOnceStore below. This is only tracked for scalar
/// globals.
StoredOnce,
@@ -55,8 +56,16 @@ struct GlobalStatus {
} StoredType = NotStored;
/// If only one value (besides the initializer constant) is ever stored to
- /// this global, keep track of what value it is.
- Value *StoredOnceValue = nullptr;
+ /// this global, keep track of what value it is via the store instruction.
+ const StoreInst *StoredOnceStore = nullptr;
+
+ /// If only one value (besides the initializer constant) is ever stored to
+ /// this global, return the stored value.
+ Value *getStoredOnceValue() const {
+ return (StoredType == StoredOnce && StoredOnceStore)
+ ? StoredOnceStore->getOperand(0)
+ : nullptr;
+ }
/// These start out null/false. When the first accessing function is noticed,
/// it is recorded. When a second different accessing function is noticed,
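A sketch of how a client such as GlobalOpt might consume the new accessor; treating a true return from analyzeGlobal as "not analyzable" is an assumption:

    #include "llvm/IR/GlobalVariable.h"
    #include "llvm/Transforms/Utils/GlobalStatus.h"
    using namespace llvm;

    bool getSingleStoredValue(GlobalVariable &GV, Value *&Stored) {
      GlobalStatus GS;
      if (GlobalStatus::analyzeGlobal(&GV, GS))
        return false; // global could not be analyzed
      Stored = GS.getStoredOnceValue(); // non-null only in the StoredOnce state
      return Stored != nullptr;
    }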
diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombineWorklist.h b/llvm/include/llvm/Transforms/Utils/InstructionWorklist.h
index 25aabe199d0f..a318c2cd28bb 100644
--- a/llvm/include/llvm/Transforms/InstCombine/InstCombineWorklist.h
+++ b/llvm/include/llvm/Transforms/Utils/InstructionWorklist.h
@@ -1,4 +1,4 @@
-//===- InstCombineWorklist.h - Worklist for InstCombine pass ----*- C++ -*-===//
+//=== InstructionWorklist.h - Worklist for InstCombine & others -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TRANSFORMS_INSTCOMBINE_INSTCOMBINEWORKLIST_H
-#define LLVM_TRANSFORMS_INSTCOMBINE_INSTCOMBINEWORKLIST_H
+#ifndef LLVM_TRANSFORMS_UTILS_INSTRUCTIONWORKLIST_H
+#define LLVM_TRANSFORMS_UTILS_INSTRUCTIONWORKLIST_H
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
@@ -18,13 +18,11 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#define DEBUG_TYPE "instcombine"
-
namespace llvm {
-/// InstCombineWorklist - This is the worklist management logic for
-/// InstCombine.
-class InstCombineWorklist {
+/// InstructionWorklist - This is the worklist management logic for
+/// InstCombine and other simplification passes.
+class InstructionWorklist {
SmallVector<Instruction *, 256> Worklist;
DenseMap<Instruction *, unsigned> WorklistMap;
/// These instructions will be added in reverse order after the current
@@ -33,10 +31,10 @@ class InstCombineWorklist {
SmallSetVector<Instruction *, 16> Deferred;
public:
- InstCombineWorklist() = default;
+ InstructionWorklist() = default;
- InstCombineWorklist(InstCombineWorklist &&) = default;
- InstCombineWorklist &operator=(InstCombineWorklist &&) = default;
+ InstructionWorklist(InstructionWorklist &&) = default;
+ InstructionWorklist &operator=(InstructionWorklist &&) = default;
bool isEmpty() const { return Worklist.empty() && Deferred.empty(); }
@@ -45,7 +43,7 @@ public:
/// You likely want to use this method.
void add(Instruction *I) {
if (Deferred.insert(I))
- LLVM_DEBUG(dbgs() << "IC: ADD DEFERRED: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "ADD DEFERRED: " << *I << '\n');
}
/// Add value to the worklist if it is an instruction.
@@ -62,7 +60,7 @@ public:
assert(I->getParent() && "Instruction not inserted yet?");
if (WorklistMap.insert(std::make_pair(I, Worklist.size())).second) {
- LLVM_DEBUG(dbgs() << "IC: ADD: " << *I << '\n');
+ LLVM_DEBUG(dbgs() << "ADD: " << *I << '\n');
Worklist.push_back(I);
}
}
@@ -85,7 +83,7 @@ public:
/// Remove I from the worklist if it exists.
void remove(Instruction *I) {
- DenseMap<Instruction*, unsigned>::iterator It = WorklistMap.find(I);
+ DenseMap<Instruction *, unsigned>::iterator It = WorklistMap.find(I);
if (It != WorklistMap.end()) {
// Don't bother moving everything down, just null out the slot.
Worklist[It->second] = nullptr;
@@ -110,7 +108,6 @@ public:
push(cast<Instruction>(U));
}
-
/// Check that the worklist is empty and nuke the backing store for the map.
void zap() {
assert(WorklistMap.empty() && "Worklist empty, but map not?");
@@ -123,6 +120,4 @@ public:
} // end namespace llvm.
-#undef DEBUG_TYPE
-
#endif
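A minimal usage sketch of the renamed worklist outside of InstCombine; trySimplify is a hypothetical client-provided rewrite step, not part of this header:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/Transforms/Utils/InstructionWorklist.h"
    using namespace llvm;

    bool trySimplify(Instruction &I); // hypothetical, supplied by the client

    void runToFixpoint(ArrayRef<Instruction *> Seeds) {
      InstructionWorklist WL;
      for (Instruction *I : Seeds)
        WL.push(I); // immediate add; add() defers instead
      while (Instruction *I = WL.removeOne()) {
        if (trySimplify(*I))
          WL.pushUsersToWorkList(*I); // revisit users only when I changed
      }
      WL.zap(); // assert-empty and drop the backing map
    }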
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index 0102aa9ef3cc..72cb606eb51a 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -55,6 +55,7 @@ class MDNode;
class MemorySSAUpdater;
class PHINode;
class StoreInst;
+class SwitchInst;
class TargetLibraryInfo;
class TargetTransformInfo;
@@ -78,7 +79,8 @@ bool ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions = false,
//
/// Return true if the result produced by the instruction is not used, and the
-/// instruction has no side effects.
+/// instruction will return. Certain side-effecting instructions are also
+/// considered dead if there are no uses of the instruction.
bool isInstructionTriviallyDead(Instruction *I,
const TargetLibraryInfo *TLI = nullptr);
@@ -236,6 +238,10 @@ CallInst *createCallMatchingInvoke(InvokeInst *II);
/// This function converts the specified invoke into a normal call.
void changeToCall(InvokeInst *II, DomTreeUpdater *DTU = nullptr);
+/// This function removes the default destination from the specified switch.
+void createUnreachableSwitchDefault(SwitchInst *Switch,
+ DomTreeUpdater *DTU = nullptr);
+
///===---------------------------------------------------------------------===//
/// Dbg Intrinsic utilities
///
@@ -292,14 +298,30 @@ void salvageDebugInfo(Instruction &I);
void salvageDebugInfoForDbgValues(Instruction &I,
ArrayRef<DbgVariableIntrinsic *> Insns);
-/// Given an instruction \p I and DIExpression \p DIExpr operating on it, write
-/// the effects of \p I into the returned DIExpression, or return nullptr if
-/// it cannot be salvaged. \p StackVal: whether DW_OP_stack_value should be
-/// appended to the expression. \p LocNo: the index of the location operand to
-/// which \p I applies, should be 0 for debug info without a DIArgList.
-DIExpression *salvageDebugInfoImpl(Instruction &I, DIExpression *DIExpr,
- bool StackVal, unsigned LocNo,
- SmallVectorImpl<Value *> &AdditionalValues);
+/// Given an instruction \p I and DIExpression \p DIExpr operating on
+/// it, append the effects of \p I to the DIExpression operand list
+/// \p Ops, or return \p nullptr if it cannot be salvaged.
+/// \p CurrentLocOps is the number of SSA values referenced by the
+/// incoming \p Ops. \return the first non-constant operand
+/// implicitly referred to by Ops. If \p I references more than one
+/// non-constant operand, any additional operands are added to
+/// \p AdditionalValues.
+///
+/// \example
+///
+/// I = add %a, i32 1
+///
+/// Return = %a
+/// Ops = llvm::dwarf::DW_OP_lit1 llvm::dwarf::DW_OP_add
+///
+/// I = add %a, %b
+///
+/// Return = %a
+/// Ops = llvm::dwarf::DW_OP_LLVM_arg0 llvm::dwarf::DW_OP_add
+/// AdditionalValues = %b
+Value *salvageDebugInfoImpl(Instruction &I, uint64_t CurrentLocOps,
+ SmallVectorImpl<uint64_t> &Ops,
+ SmallVectorImpl<Value *> &AdditionalValues);
/// Point debug users of \p From to \p To or salvage them. Use this function
/// only when replacing all uses of \p From with \p To, with a guarantee that
diff --git a/llvm/include/llvm/Transforms/Utils/LoopPeel.h b/llvm/include/llvm/Transforms/Utils/LoopPeel.h
index 8f857e1e5c21..6f1b4a880457 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopPeel.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopPeel.h
@@ -32,8 +32,8 @@ gatherPeelingPreferences(Loop *L, ScalarEvolution &SE,
void computePeelCount(Loop *L, unsigned LoopSize,
TargetTransformInfo::PeelingPreferences &PP,
- unsigned &TripCount, ScalarEvolution &SE,
- unsigned Threshold = UINT_MAX);
+ unsigned &TripCount, DominatorTree &DT,
+ ScalarEvolution &SE, unsigned Threshold = UINT_MAX);
} // end namespace llvm
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 247b911b7c8f..30c3f71e0947 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -147,11 +147,22 @@ protected:
/// BlockFrequencyInfo, TargetLibraryInfo, Loop, AliasSet information for all
/// instructions of the loop and loop safety information as
/// arguments. Diagnostics is emitted via \p ORE. It returns changed status.
+/// \p CurLoop is a loop to do sinking on. \p OutermostLoop is used only when
+/// this function is called by \p sinkRegionForLoopNest.
bool sinkRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *,
BlockFrequencyInfo *, TargetLibraryInfo *,
- TargetTransformInfo *, Loop *, AliasSetTracker *,
- MemorySSAUpdater *, ICFLoopSafetyInfo *,
- SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *);
+ TargetTransformInfo *, Loop *CurLoop, MemorySSAUpdater *,
+ ICFLoopSafetyInfo *, SinkAndHoistLICMFlags &,
+ OptimizationRemarkEmitter *, Loop *OutermostLoop = nullptr);
+
+/// Call sinkRegion on loops contained within the specified loop
+/// in order from innermost to outermost.
+bool sinkRegionForLoopNest(DomTreeNode *, AAResults *, LoopInfo *,
+ DominatorTree *, BlockFrequencyInfo *,
+ TargetLibraryInfo *, TargetTransformInfo *, Loop *,
+ MemorySSAUpdater *, ICFLoopSafetyInfo *,
+ SinkAndHoistLICMFlags &,
+ OptimizationRemarkEmitter *);
/// Walk the specified region of the CFG (defined by all blocks
/// dominated by the specified block, and that are in the current loop) in depth
@@ -163,9 +174,8 @@ bool sinkRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *,
/// Diagnostics is emitted via \p ORE. It returns changed status.
bool hoistRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *,
BlockFrequencyInfo *, TargetLibraryInfo *, Loop *,
- AliasSetTracker *, MemorySSAUpdater *, ScalarEvolution *,
- ICFLoopSafetyInfo *, SinkAndHoistLICMFlags &,
- OptimizationRemarkEmitter *, bool);
+ MemorySSAUpdater *, ScalarEvolution *, ICFLoopSafetyInfo *,
+ SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *, bool);
/// This function deletes dead loops. The caller of this function needs to
/// guarantee that the loop is infact dead.
@@ -199,7 +209,7 @@ bool promoteLoopAccessesToScalars(
const SmallSetVector<Value *, 8> &, SmallVectorImpl<BasicBlock *> &,
SmallVectorImpl<Instruction *> &, SmallVectorImpl<MemoryAccess *> &,
PredIteratorCache &, LoopInfo *, DominatorTree *, const TargetLibraryInfo *,
- Loop *, AliasSetTracker *, MemorySSAUpdater *, ICFLoopSafetyInfo *,
+ Loop *, MemorySSAUpdater *, ICFLoopSafetyInfo *,
OptimizationRemarkEmitter *);
/// Does a BFS from a given node to all of its children inside a given loop.
@@ -338,6 +348,18 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
SinkAndHoistLICMFlags *LICMFlags = nullptr,
OptimizationRemarkEmitter *ORE = nullptr);
+/// Returns the comparison predicate used when expanding a min/max reduction.
+CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK);
+
+/// See RecurrenceDescriptor::isSelectCmpPattern for a description of the
+/// pattern we are trying to match. In this pattern we are only ever selecting
+/// between two values: 1) an initial PHI start value, and 2) a loop invariant
+/// value. This function uses \p LoopExitInst to determine 2), which we then use
+/// to select between \p Left and \p Right. Any lane value in \p Left that
+/// matches 2) will be merged into \p Right.
+Value *createSelectCmpOp(IRBuilderBase &Builder, Value *StartVal, RecurKind RK,
+ Value *Left, Value *Right);
+
/// Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
/// The Builder's fast-math-flags must be set to propagate the expected values.
Value *createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
@@ -365,12 +387,22 @@ Value *createSimpleTargetReduction(IRBuilderBase &B,
RecurKind RdxKind,
ArrayRef<Value *> RedOps = None);
+/// Create a target reduction of the given vector \p Src for a reduction of the
+/// kind RecurKind::SelectICmp or RecurKind::SelectFCmp. The reduction operation
+/// is described by \p Desc.
+Value *createSelectCmpTargetReduction(IRBuilderBase &B,
+ const TargetTransformInfo *TTI,
+ Value *Src,
+ const RecurrenceDescriptor &Desc,
+ PHINode *OrigPhi);
+
/// Create a generic target reduction using a recurrence descriptor \p Desc
/// The target is queried to determine if intrinsics or shuffle sequences are
/// required to implement the reduction.
/// Fast-math-flags are propagated using the RecurrenceDescriptor.
Value *createTargetReduction(IRBuilderBase &B, const TargetTransformInfo *TTI,
- const RecurrenceDescriptor &Desc, Value *Src);
+ const RecurrenceDescriptor &Desc, Value *Src,
+ PHINode *OrigPhi = nullptr);
/// Create an ordered reduction intrinsic using the given recurrence
/// descriptor \p Desc.
@@ -463,12 +495,8 @@ Loop *cloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
LoopInfo *LI, LPPassManager *LPM);
/// Add code that checks at runtime if the accessed arrays in \p PointerChecks
-/// overlap.
-///
-/// Returns a pair of instructions where the first element is the first
-/// instruction generated in possibly a sequence of instructions and the
-/// second value is the final comparator value or NULL if no check is needed.
-std::pair<Instruction *, Instruction *>
+/// overlap. Returns the final comparator value or NULL if no check is needed.
+Value *
addRuntimeChecks(Instruction *Loc, Loop *TheLoop,
const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
SCEVExpander &Expander);
diff --git a/llvm/include/llvm/Transforms/Utils/MemoryOpRemark.h b/llvm/include/llvm/Transforms/Utils/MemoryOpRemark.h
index 7b4a1cdbf4fd..e5f8a46eaf23 100644
--- a/llvm/include/llvm/Transforms/Utils/MemoryOpRemark.h
+++ b/llvm/include/llvm/Transforms/Utils/MemoryOpRemark.h
@@ -1,9 +1,8 @@
//===- MemoryOpRemark.h - Memory operation remark analysis -*- C++ ------*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/llvm/include/llvm/Transforms/Utils/PredicateInfo.h b/llvm/include/llvm/Transforms/Utils/PredicateInfo.h
index c4030735d965..c922476ac79d 100644
--- a/llvm/include/llvm/Transforms/Utils/PredicateInfo.h
+++ b/llvm/include/llvm/Transforms/Utils/PredicateInfo.h
@@ -51,11 +51,13 @@
#define LLVM_TRANSFORMS_UTILS_PREDICATEINFO_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/ilist.h"
#include "llvm/ADT/ilist_node.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
namespace llvm {
@@ -176,7 +178,7 @@ public:
class PredicateInfo {
public:
PredicateInfo(Function &, DominatorTree &, AssumptionCache &);
- ~PredicateInfo() = default;
+ ~PredicateInfo();
void verifyPredicateInfo() const;
@@ -203,6 +205,8 @@ private:
// the Predicate Info, they belong to the ValueInfo structs in the ValueInfos
// vector.
DenseMap<const Value *, const PredicateBase *> PredicateMap;
+ // The set of ssa_copy declarations we created with our custom mangling.
+ SmallSet<AssertingVH<Function>, 20> CreatedDeclarations;
};
// This pass does eager building and then printing of PredicateInfo. It is used
diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h b/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h
index 3a78e22b7e94..5de575aed059 100644
--- a/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h
+++ b/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h
@@ -70,10 +70,6 @@ public:
/// rewritten value when RewriteAllUses is called.
void AddUse(unsigned Var, Use *U);
- /// Return true if the SSAUpdater already has a value for the specified
- /// variable in the specified block.
- bool HasValueForBlock(unsigned Var, BasicBlock *BB);
-
/// Perform all the necessary updates, including new PHI-nodes insertion and
/// the requested uses update.
///
diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h
index e0759d359dbe..6a2f0acf46f3 100644
--- a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h
+++ b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h
@@ -56,27 +56,28 @@ template <> struct IRTraits<BasicBlock> {
using FunctionT = Function;
using BlockFrequencyInfoT = BlockFrequencyInfo;
using LoopT = Loop;
- using LoopInfoT = LoopInfo;
+ using LoopInfoPtrT = std::unique_ptr<LoopInfo>;
+ using DominatorTreePtrT = std::unique_ptr<DominatorTree>;
+ using PostDominatorTreeT = PostDominatorTree;
+ using PostDominatorTreePtrT = std::unique_ptr<PostDominatorTree>;
using OptRemarkEmitterT = OptimizationRemarkEmitter;
using OptRemarkAnalysisT = OptimizationRemarkAnalysis;
- using DominatorTreeT = DominatorTree;
- using PostDominatorTreeT = PostDominatorTree;
+ using PredRangeT = pred_range;
+ using SuccRangeT = succ_range;
static Function &getFunction(Function &F) { return F; }
static const BasicBlock *getEntryBB(const Function *F) {
return &F->getEntryBlock();
}
+ static pred_range getPredecessors(BasicBlock *BB) { return predecessors(BB); }
+ static succ_range getSuccessors(BasicBlock *BB) { return successors(BB); }
};
} // end namespace afdo_detail
-extern cl::opt<unsigned> SampleProfileMaxPropagateIterations;
-extern cl::opt<unsigned> SampleProfileRecordCoverage;
-extern cl::opt<unsigned> SampleProfileSampleCoverage;
-extern cl::opt<bool> NoWarnSampleUnused;
-
template <typename BT> class SampleProfileLoaderBaseImpl {
public:
- SampleProfileLoaderBaseImpl(std::string Name) : Filename(Name) {}
+ SampleProfileLoaderBaseImpl(std::string Name, std::string RemapName)
+ : Filename(Name), RemappingFilename(RemapName) {}
void dump() { Reader->dump(); }
using InstructionT = typename afdo_detail::IRTraits<BT>::InstructionT;
@@ -85,14 +86,19 @@ public:
typename afdo_detail::IRTraits<BT>::BlockFrequencyInfoT;
using FunctionT = typename afdo_detail::IRTraits<BT>::FunctionT;
using LoopT = typename afdo_detail::IRTraits<BT>::LoopT;
- using LoopInfoT = typename afdo_detail::IRTraits<BT>::LoopInfoT;
+ using LoopInfoPtrT = typename afdo_detail::IRTraits<BT>::LoopInfoPtrT;
+ using DominatorTreePtrT =
+ typename afdo_detail::IRTraits<BT>::DominatorTreePtrT;
+ using PostDominatorTreePtrT =
+ typename afdo_detail::IRTraits<BT>::PostDominatorTreePtrT;
+ using PostDominatorTreeT =
+ typename afdo_detail::IRTraits<BT>::PostDominatorTreeT;
using OptRemarkEmitterT =
typename afdo_detail::IRTraits<BT>::OptRemarkEmitterT;
using OptRemarkAnalysisT =
typename afdo_detail::IRTraits<BT>::OptRemarkAnalysisT;
- using DominatorTreeT = typename afdo_detail::IRTraits<BT>::DominatorTreeT;
- using PostDominatorTreeT =
- typename afdo_detail::IRTraits<BT>::PostDominatorTreeT;
+ using PredRangeT = typename afdo_detail::IRTraits<BT>::PredRangeT;
+ using SuccRangeT = typename afdo_detail::IRTraits<BT>::SuccRangeT;
using BlockWeightMap = DenseMap<const BasicBlockT *, uint64_t>;
using EquivalenceClassMap =
@@ -112,6 +118,12 @@ protected:
const BasicBlockT *getEntryBB(const FunctionT *F) {
return afdo_detail::IRTraits<BT>::getEntryBB(F);
}
+ PredRangeT getPredecessors(BasicBlockT *BB) {
+ return afdo_detail::IRTraits<BT>::getPredecessors(BB);
+ }
+ SuccRangeT getSuccessors(BasicBlockT *BB) {
+ return afdo_detail::IRTraits<BT>::getSuccessors(BB);
+ }
unsigned getFunctionLoc(FunctionT &Func);
virtual ErrorOr<uint64_t> getInstWeight(const InstructionT &Inst);
@@ -129,12 +141,11 @@ protected:
void findEquivalencesFor(BasicBlockT *BB1,
ArrayRef<BasicBlockT *> Descendants,
PostDominatorTreeT *DomTree);
-
void propagateWeights(FunctionT &F);
uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
void buildEdges(FunctionT &F);
bool propagateThroughEdges(FunctionT &F, bool UpdateBlockCount);
- void clearFunctionData();
+ void clearFunctionData(bool ResetDT = true);
void computeDominanceAndLoopInfo(FunctionT &F);
bool
computeAndPropagateWeights(FunctionT &F,
@@ -168,9 +179,9 @@ protected:
EquivalenceClassMap EquivalenceClass;
/// Dominance, post-dominance and loop information.
- std::unique_ptr<DominatorTreeT> DT;
- std::unique_ptr<PostDominatorTreeT> PDT;
- std::unique_ptr<LoopInfoT> LI;
+ DominatorTreePtrT DT;
+ PostDominatorTreePtrT PDT;
+ LoopInfoPtrT LI;
/// Predecessors for each basic block in the CFG.
BlockEdgeMap Predecessors;
@@ -190,6 +201,9 @@ protected:
/// Name of the profile file to load.
std::string Filename;
+ /// Name of the profile remapping file to load.
+ std::string RemappingFilename;
+
/// Profile Summary Info computed from sample profile.
ProfileSummaryInfo *PSI = nullptr;
@@ -199,15 +213,17 @@ protected:
/// Clear all the per-function data used to load samples and propagate weights.
template <typename BT>
-void SampleProfileLoaderBaseImpl<BT>::clearFunctionData() {
+void SampleProfileLoaderBaseImpl<BT>::clearFunctionData(bool ResetDT) {
BlockWeights.clear();
EdgeWeights.clear();
VisitedBlocks.clear();
VisitedEdges.clear();
EquivalenceClass.clear();
- DT = nullptr;
- PDT = nullptr;
- LI = nullptr;
+ if (ResetDT) {
+ DT = nullptr;
+ PDT = nullptr;
+ LI = nullptr;
+ }
Predecessors.clear();
Successors.clear();
CoverageTracker.clear();
@@ -475,7 +491,7 @@ void SampleProfileLoaderBaseImpl<BT>::findEquivalenceClasses(FunctionT &F) {
// class by making BB2's equivalence class be BB1.
DominatedBBs.clear();
DT->getDescendants(BB1, DominatedBBs);
- findEquivalencesFor(BB1, DominatedBBs, PDT.get());
+ findEquivalencesFor(BB1, DominatedBBs, &*PDT);
LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
}
@@ -692,7 +708,7 @@ void SampleProfileLoaderBaseImpl<BT>::buildEdges(FunctionT &F) {
SmallPtrSet<BasicBlockT *, 16> Visited;
if (!Predecessors[B1].empty())
llvm_unreachable("Found a stale predecessors list in a basic block.");
- for (BasicBlockT *B2 : predecessors(B1))
+ for (auto *B2 : getPredecessors(B1))
if (Visited.insert(B2).second)
Predecessors[B1].push_back(B2);
@@ -700,7 +716,7 @@ void SampleProfileLoaderBaseImpl<BT>::buildEdges(FunctionT &F) {
Visited.clear();
if (!Successors[B1].empty())
llvm_unreachable("Found a stale successors list in a basic block.");
- for (BasicBlockT *B2 : successors(B1))
+ for (auto *B2 : getSuccessors(B1))
if (Visited.insert(B2).second)
Successors[B1].push_back(B2);
}
@@ -911,12 +927,12 @@ unsigned SampleProfileLoaderBaseImpl<BT>::getFunctionLoc(FunctionT &F) {
template <typename BT>
void SampleProfileLoaderBaseImpl<BT>::computeDominanceAndLoopInfo(
FunctionT &F) {
- DT.reset(new DominatorTreeT);
+ DT.reset(new DominatorTree);
DT->recalculate(F);
PDT.reset(new PostDominatorTree(F));
- LI.reset(new LoopInfoT);
+ LI.reset(new LoopInfo);
LI->analyze(*DT);
}
diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
index 59bf3a342caa..efc3cc775e11 100644
--- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
+++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
@@ -32,8 +32,10 @@ extern cl::opt<unsigned> SCEVCheapExpansionBudget;
/// Return true if the given expression is safe to expand in the sense that
/// all materialized values are safe to speculate anywhere their operands are
-/// defined.
-bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE);
+/// defined, and the expander is capable of expanding the expression.
+/// CanonicalMode indicates whether the expander will be used in canonical mode.
+bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE,
+ bool CanonicalMode = true);
/// Return true if the given expression is safe to expand in the sense that
/// all materialized values are defined and safe to speculate at the specified
@@ -489,9 +491,6 @@ private:
Value *expandIVInc(PHINode *PN, Value *StepV, const Loop *L, Type *ExpandTy,
Type *IntTy, bool useSubtract);
- void hoistBeforePos(DominatorTree *DT, Instruction *InstToHoist,
- Instruction *Pos, PHINode *LoopPhi);
-
void fixupInsertPoints(Instruction *I);
/// If required, create LCSSA PHIs for \p Users' operand \p OpIdx. If new
diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
index 8703434e1696..a88e72fc9ba8 100644
--- a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
+++ b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -132,8 +132,6 @@ private:
eraseFromParent(I);
}
- Value *foldMallocMemset(CallInst *Memset, IRBuilderBase &B);
-
public:
LibCallSimplifier(
const DataLayout &DL, const TargetLibraryInfo *TLI,
diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
index d95ead2def3d..320c36b36924 100644
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -117,7 +117,8 @@ MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
+ llvm::OptimizationRemarkEmitter &ORE, int OptLevel,
Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
Optional<bool> UserUpperBound, Optional<unsigned> UserFullUnrollMaxCount);
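gatherUnrollingPreferences() now threads an OptimizationRemarkEmitter through to the unroll heuristics. A hedged call-site sketch under the new signature; every argument is assumed to be supplied by the enclosing pass and the names are illustrative:

```cpp
#include "llvm/Transforms/Utils/UnrollLoop.h"
using namespace llvm;

// Hedged sketch: forward the new ORE parameter and leave all user overrides
// unset (None), matching the declaration in the hunk above.
static TargetTransformInfo::UnrollingPreferences
getUnrollPrefs(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
               BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
               OptimizationRemarkEmitter &ORE) {
  return gatherUnrollingPreferences(L, SE, TTI, BFI, PSI, ORE, /*OptLevel=*/2,
                                    None, None, None, None, None, None);
}
```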
diff --git a/llvm/include/llvm/Transforms/Utils/ValueMapper.h b/llvm/include/llvm/Transforms/Utils/ValueMapper.h
index 4245f51cc1e2..95fd0b14dd51 100644
--- a/llvm/include/llvm/Transforms/Utils/ValueMapper.h
+++ b/llvm/include/llvm/Transforms/Utils/ValueMapper.h
@@ -22,7 +22,6 @@ namespace llvm {
class Constant;
class Function;
-class GlobalIndirectSymbol;
class GlobalVariable;
class Instruction;
class MDNode;
@@ -122,7 +121,8 @@ inline RemapFlags operator|(RemapFlags LHS, RemapFlags RHS) {
/// instance:
/// - \a scheduleMapGlobalInitializer()
/// - \a scheduleMapAppendingVariable()
-/// - \a scheduleMapGlobalIndirectSymbol()
+/// - \a scheduleMapGlobalAlias()
+/// - \a scheduleMapGlobalIFunc()
/// - \a scheduleRemapFunction()
///
/// Sometimes a callback needs a different mapping context. Such a context can
@@ -182,9 +182,10 @@ public:
bool IsOldCtorDtor,
ArrayRef<Constant *> NewMembers,
unsigned MappingContextID = 0);
- void scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS,
- Constant &Target,
- unsigned MappingContextID = 0);
+ void scheduleMapGlobalAlias(GlobalAlias &GA, Constant &Aliasee,
+ unsigned MappingContextID = 0);
+ void scheduleMapGlobalIFunc(GlobalIFunc &GI, Constant &Resolver,
+ unsigned MappingContextID = 0);
void scheduleRemapFunction(Function &F, unsigned MappingContextID = 0);
};
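scheduleMapGlobalIndirectSymbol() is split into alias- and ifunc-specific entry points. A hedged sketch of the two replacement calls, using only the signatures shown above; the mapper and globals are assumed to come from the surrounding IR-linking code:

```cpp
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalIFunc.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
using namespace llvm;

// Hedged sketch: what used to be one scheduleMapGlobalIndirectSymbol() call
// becomes one call per kind of indirect symbol.
static void scheduleIndirectSymbols(ValueMapper &VM, GlobalAlias &GA,
                                    Constant &Aliasee, GlobalIFunc &GI,
                                    Constant &Resolver) {
  VM.scheduleMapGlobalAlias(GA, Aliasee);
  VM.scheduleMapGlobalIFunc(GI, Resolver);
}
```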
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index e7dcdda8af89..ed9e0beb0339 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -340,7 +340,7 @@ public:
/// -1 - Address is consecutive, and decreasing.
/// NOTE: This method must only be used before modifying the original scalar
/// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
- int isConsecutivePtr(Value *Ptr) const;
+ int isConsecutivePtr(Type *AccessTy, Value *Ptr) const;
/// Returns true if the value V is uniform within the loop.
bool isUniform(Value *V);
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
index ad6a4b561a9b..d105496ad47f 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
@@ -153,6 +153,8 @@ public:
ProfileSummaryInfo *PSI;
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ void printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)> MapClassName2PassName);
// Shim for old PM.
LoopVectorizeResult
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index f416a592d683..cd605aacb52d 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -94,9 +94,11 @@ private:
bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);
/// Try to vectorize a list of operands.
+ /// \param LimitForRegisterSize Vectorize only using maximal allowed register
+ /// size.
/// \returns true if a value was vectorized.
bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
- bool AllowReorder = false);
+ bool LimitForRegisterSize = false);
/// Try to vectorize a chain that may start at the operands of \p I.
bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);
diff --git a/llvm/include/llvm/Transforms/Vectorize/VectorCombine.h b/llvm/include/llvm/Transforms/Vectorize/VectorCombine.h
index b7809aa24cae..a32f9fba967f 100644
--- a/llvm/include/llvm/Transforms/Vectorize/VectorCombine.h
+++ b/llvm/include/llvm/Transforms/Vectorize/VectorCombine.h
@@ -20,10 +20,16 @@
namespace llvm {
/// Optimize scalar/vector interactions in IR using target cost models.
-struct VectorCombinePass : public PassInfoMixin<VectorCombinePass> {
+class VectorCombinePass : public PassInfoMixin<VectorCombinePass> {
+ /// If true, only perform scalarization combines and do not introduce new
+ /// vector operations.
+ bool ScalarizationOnly;
+
public:
+ VectorCombinePass(bool ScalarizationOnly = false)
+ : ScalarizationOnly(ScalarizationOnly) {}
+
PreservedAnalyses run(Function &F, FunctionAnalysisManager &);
};
-
}
#endif // LLVM_TRANSFORMS_VECTORIZE_VECTORCOMBINE_H
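VectorCombinePass now carries a ScalarizationOnly flag, so a pipeline can run a restricted scalarization-only instance separately from the full combine. A hedged sketch of registering both, assuming the usual new-pass-manager FunctionPassManager; the pipeline order is illustrative only:

```cpp
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Vectorize/VectorCombine.h"
using namespace llvm;

// Hedged sketch: one scalarization-only instance and one default (full)
// instance of the pass, per the constructor added in the hunk above.
static void addVectorCombinePasses(FunctionPassManager &FPM) {
  FPM.addPass(VectorCombinePass(/*ScalarizationOnly=*/true));
  // ... other function passes would normally run here ...
  FPM.addPass(VectorCombinePass());
}
```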
diff --git a/llvm/include/llvm/WindowsManifest/WindowsManifestMerger.h b/llvm/include/llvm/WindowsManifest/WindowsManifestMerger.h
index 31f4daeb7019..2da74bb9dce8 100644
--- a/llvm/include/llvm/WindowsManifest/WindowsManifestMerger.h
+++ b/llvm/include/llvm/WindowsManifest/WindowsManifestMerger.h
@@ -30,6 +30,7 @@
namespace llvm {
class MemoryBuffer;
+class MemoryBufferRef;
namespace windows_manifest {
@@ -49,7 +50,7 @@ class WindowsManifestMerger {
public:
WindowsManifestMerger();
~WindowsManifestMerger();
- Error merge(const MemoryBuffer &Manifest);
+ Error merge(MemoryBufferRef Manifest);
// Returns vector containing merged xml manifest, or uninitialized vector for
// empty manifest.
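WindowsManifestMerger::merge() now accepts a non-owning MemoryBufferRef, so callers no longer need to hand over a whole MemoryBuffer. A hedged sketch of a caller that already holds the manifest bytes; the XML string and buffer identifier are dummies:

```cpp
#include "llvm/Support/MemoryBufferRef.h"
#include "llvm/WindowsManifest/WindowsManifestMerger.h"
using namespace llvm;
using namespace llvm::windows_manifest;

// Hedged sketch: wrap in-memory XML in a MemoryBufferRef and feed it to the
// merger, propagating any Error to the caller.
static Error mergeManifestText(WindowsManifestMerger &Merger, StringRef Xml) {
  return Merger.merge(MemoryBufferRef(Xml, "<in-memory manifest>"));
}
```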
diff --git a/llvm/include/llvm/module.modulemap b/llvm/include/llvm/module.modulemap
index 848fb266374e..6cbbb9a4028e 100644
--- a/llvm/include/llvm/module.modulemap
+++ b/llvm/include/llvm/module.modulemap
@@ -181,21 +181,9 @@ module LLVM_ExecutionEngine {
// translation unit (or none) and aren't part of this module.
exclude header "ExecutionEngine/MCJIT.h"
exclude header "ExecutionEngine/Interpreter.h"
- exclude header "ExecutionEngine/OrcMCJITReplacement.h"
-
- // FIXME: These exclude directives were added as a workaround for
- // <rdar://problem/29247092> and should be removed once it is fixed.
- exclude header "ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h"
- exclude header "ExecutionEngine/Orc/OrcRemoteTargetClient.h"
- exclude header "ExecutionEngine/Orc/OrcRemoteTargetServer.h"
- exclude header "ExecutionEngine/Orc/RemoteObjectLayer.h"
// Exclude headers from LLVM_OrcSupport.
exclude header "ExecutionEngine/Orc/Shared/OrcError.h"
- exclude header "ExecutionEngine/Orc/RPC/RPCUtils.h"
- exclude header "ExecutionEngine/Orc/RPC/RPCSerialization.h"
- exclude header "ExecutionEngine/Orc/RPC/RawByteChannel.h"
-
}
module LLVM_FileCheck {
@@ -221,9 +209,6 @@ module LLVM_OrcSupport {
requires cplusplus
header "ExecutionEngine/Orc/Shared/OrcError.h"
- header "ExecutionEngine/Orc/Shared/RPCUtils.h"
- header "ExecutionEngine/Orc/Shared/Serialization.h"
- header "ExecutionEngine/Orc/Shared/RawByteChannel.h"
export *
}
@@ -389,6 +374,9 @@ module LLVM_Transforms {
umbrella "Transforms"
module * { export * }
+
+ // Requires DEBUG_TYPE to be defined by including file.
+ exclude header "Transforms/Utils/InstructionWorklist.h"
}
extern module LLVM_Extern_Utils_DataTypes "module.extern.modulemap"
diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp
index e7445e225d52..d030f74481cf 100644
--- a/llvm/lib/Analysis/AliasAnalysis.cpp
+++ b/llvm/lib/Analysis/AliasAnalysis.cpp
@@ -119,7 +119,7 @@ bool AAResults::invalidate(Function &F, const PreservedAnalyses &PA,
AliasResult AAResults::alias(const MemoryLocation &LocA,
const MemoryLocation &LocB) {
- AAQueryInfo AAQIP;
+ SimpleAAQueryInfo AAQIP;
return alias(LocA, LocB, AAQIP);
}
@@ -162,7 +162,7 @@ AliasResult AAResults::alias(const MemoryLocation &LocA,
bool AAResults::pointsToConstantMemory(const MemoryLocation &Loc,
bool OrLocal) {
- AAQueryInfo AAQIP;
+ SimpleAAQueryInfo AAQIP;
return pointsToConstantMemory(Loc, AAQIP, OrLocal);
}
@@ -190,7 +190,7 @@ ModRefInfo AAResults::getArgModRefInfo(const CallBase *Call, unsigned ArgIdx) {
}
ModRefInfo AAResults::getModRefInfo(Instruction *I, const CallBase *Call2) {
- AAQueryInfo AAQIP;
+ SimpleAAQueryInfo AAQIP;
return getModRefInfo(I, Call2, AAQIP);
}
@@ -200,25 +200,24 @@ ModRefInfo AAResults::getModRefInfo(Instruction *I, const CallBase *Call2,
if (const auto *Call1 = dyn_cast<CallBase>(I)) {
// Check if the two calls modify the same memory.
return getModRefInfo(Call1, Call2, AAQI);
- } else if (I->isFenceLike()) {
- // If this is a fence, just return ModRef.
- return ModRefInfo::ModRef;
- } else {
- // Otherwise, check if the call modifies or references the
- // location this memory access defines. The best we can say
- // is that if the call references what this instruction
- // defines, it must be clobbered by this location.
- const MemoryLocation DefLoc = MemoryLocation::get(I);
- ModRefInfo MR = getModRefInfo(Call2, DefLoc, AAQI);
- if (isModOrRefSet(MR))
- return setModAndRef(MR);
}
+ // If this is a fence, just return ModRef.
+ if (I->isFenceLike())
+ return ModRefInfo::ModRef;
+ // Otherwise, check if the call modifies or references the
+ // location this memory access defines. The best we can say
+ // is that if the call references what this instruction
+ // defines, it must be clobbered by this location.
+ const MemoryLocation DefLoc = MemoryLocation::get(I);
+ ModRefInfo MR = getModRefInfo(Call2, DefLoc, AAQI);
+ if (isModOrRefSet(MR))
+ return setModAndRef(MR);
return ModRefInfo::NoModRef;
}
ModRefInfo AAResults::getModRefInfo(const CallBase *Call,
const MemoryLocation &Loc) {
- AAQueryInfo AAQIP;
+ SimpleAAQueryInfo AAQIP;
return getModRefInfo(Call, Loc, AAQIP);
}
@@ -285,7 +284,7 @@ ModRefInfo AAResults::getModRefInfo(const CallBase *Call,
ModRefInfo AAResults::getModRefInfo(const CallBase *Call1,
const CallBase *Call2) {
- AAQueryInfo AAQIP;
+ SimpleAAQueryInfo AAQIP;
return getModRefInfo(Call1, Call2, AAQIP);
}
@@ -475,7 +474,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, AliasResult AR) {
ModRefInfo AAResults::getModRefInfo(const LoadInst *L,
const MemoryLocation &Loc) {
- AAQueryInfo AAQIP;
+ SimpleAAQueryInfo AAQIP;
return getModRefInfo(L, Loc, AAQIP);
}
ModRefInfo AAResults::getModRefInfo(const LoadInst *L,
@@ -500,7 +499,7 @@ ModRefInfo AAResults::getModRefInfo(const LoadInst *L,
ModRefInfo AAResults::getModRefInfo(const StoreInst *S,
const MemoryLocation &Loc) {
- AAQueryInfo AAQIP;
+ SimpleAAQueryInfo AAQIP;
return getModRefInfo(S, Loc, AAQIP);
}
ModRefInfo AAResults::getModRefInfo(const StoreInst *S,
@@ -532,7 +531,7 @@ ModRefInfo AAResults::getModRefInfo(const StoreInst *S,
}
ModRefInfo AAResults::getModRefInfo(const FenceInst *S, const MemoryLocation &Loc) {
- AAQueryInfo AAQIP;
+ SimpleAAQueryInfo AAQIP;
return getModRefInfo(S, Loc, AAQIP);
}
@@ -548,7 +547,7 @@ ModRefInfo AAResults::getModRefInfo(const FenceInst *S,
ModRefInfo AAResults::getModRefInfo(const VAArgInst *V,
const MemoryLocation &Loc) {
- AAQueryInfo AAQIP;
+ SimpleAAQueryInfo AAQIP;
return getModRefInfo(V, Loc, AAQIP);
}
@@ -578,7 +577,7 @@ ModRefInfo AAResults::getModRefInfo(const VAArgInst *V,
ModRefInfo AAResults::getModRefInfo(const CatchPadInst *CatchPad,
const MemoryLocation &Loc) {
- AAQueryInfo AAQIP;
+ SimpleAAQueryInfo AAQIP;
return getModRefInfo(CatchPad, Loc, AAQIP);
}
@@ -598,7 +597,7 @@ ModRefInfo AAResults::getModRefInfo(const CatchPadInst *CatchPad,
ModRefInfo AAResults::getModRefInfo(const CatchReturnInst *CatchRet,
const MemoryLocation &Loc) {
- AAQueryInfo AAQIP;
+ SimpleAAQueryInfo AAQIP;
return getModRefInfo(CatchRet, Loc, AAQIP);
}
@@ -618,7 +617,7 @@ ModRefInfo AAResults::getModRefInfo(const CatchReturnInst *CatchRet,
ModRefInfo AAResults::getModRefInfo(const AtomicCmpXchgInst *CX,
const MemoryLocation &Loc) {
- AAQueryInfo AAQIP;
+ SimpleAAQueryInfo AAQIP;
return getModRefInfo(CX, Loc, AAQIP);
}
@@ -646,7 +645,7 @@ ModRefInfo AAResults::getModRefInfo(const AtomicCmpXchgInst *CX,
ModRefInfo AAResults::getModRefInfo(const AtomicRMWInst *RMW,
const MemoryLocation &Loc) {
- AAQueryInfo AAQIP;
+ SimpleAAQueryInfo AAQIP;
return getModRefInfo(RMW, Loc, AAQIP);
}
@@ -746,7 +745,7 @@ ModRefInfo AAResults::callCapturesBefore(const Instruction *I,
// pointer were passed to arguments that were neither of these, then it
// couldn't be no-capture.
if (!(*CI)->getType()->isPointerTy() ||
- (!Call->doesNotCapture(ArgNo) && ArgNo < Call->getNumArgOperands() &&
+ (!Call->doesNotCapture(ArgNo) && ArgNo < Call->arg_size() &&
!Call->isByValArgument(ArgNo)))
continue;
@@ -808,11 +807,6 @@ AAResults::Concept::~Concept() = default;
// Provide a definition for the static object used to identify passes.
AnalysisKey AAManager::Key;
-namespace {
-
-
-} // end anonymous namespace
-
ExternalAAWrapperPass::ExternalAAWrapperPass() : ImmutablePass(ID) {
initializeExternalAAWrapperPassPass(*PassRegistry::getPassRegistry());
}
diff --git a/llvm/lib/Analysis/AssumeBundleQueries.cpp b/llvm/lib/Analysis/AssumeBundleQueries.cpp
index dee044346f02..9d4fe1225b33 100644
--- a/llvm/lib/Analysis/AssumeBundleQueries.cpp
+++ b/llvm/lib/Analysis/AssumeBundleQueries.cpp
@@ -84,7 +84,7 @@ void llvm::fillMapFromAssume(AssumeInst &Assume, RetainedKnowledgeMap &Result) {
getValueFromBundleOpInfo(Assume, Bundles, ABA_Argument));
if (!CI)
continue;
- unsigned Val = CI->getZExtValue();
+ uint64_t Val = CI->getZExtValue();
auto Lookup = Result.find(Key);
if (Lookup == Result.end() || !Lookup->second.count(&Assume)) {
Result[Key][&Assume] = {Val, Val};
@@ -102,7 +102,7 @@ llvm::getKnowledgeFromBundle(AssumeInst &Assume,
Result.AttrKind = Attribute::getAttrKindFromName(BOI.Tag->getKey());
if (bundleHasArgument(BOI, ABA_WasOn))
Result.WasOn = getValueFromBundleOpInfo(Assume, BOI, ABA_WasOn);
- auto GetArgOr1 = [&](unsigned Idx) -> unsigned {
+ auto GetArgOr1 = [&](unsigned Idx) -> uint64_t {
if (auto *ConstInt = dyn_cast<ConstantInt>(
getValueFromBundleOpInfo(Assume, BOI, ABA_Argument + Idx)))
return ConstInt->getZExtValue();
diff --git a/llvm/lib/Analysis/AssumptionCache.cpp b/llvm/lib/Analysis/AssumptionCache.cpp
index 0d95b33601f9..3e0214e21ecd 100644
--- a/llvm/lib/Analysis/AssumptionCache.cpp
+++ b/llvm/lib/Analysis/AssumptionCache.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
@@ -56,7 +57,7 @@ AssumptionCache::getOrInsertAffectedValues(Value *V) {
}
static void
-findAffectedValues(CallBase *CI,
+findAffectedValues(CallBase *CI, TargetTransformInfo *TTI,
SmallVectorImpl<AssumptionCache::ResultElem> &Affected) {
// Note: This code must be kept in-sync with the code in
// computeKnownBitsFromAssume in ValueTracking.
@@ -124,24 +125,32 @@ findAffectedValues(CallBase *CI,
match(B, m_ConstantInt()))
AddAffected(X);
}
+
+ if (TTI) {
+ const Value *Ptr;
+ unsigned AS;
+ std::tie(Ptr, AS) = TTI->getPredicatedAddrSpace(Cond);
+ if (Ptr)
+ AddAffected(const_cast<Value *>(Ptr->stripInBoundsOffsets()));
+ }
}
void AssumptionCache::updateAffectedValues(AssumeInst *CI) {
SmallVector<AssumptionCache::ResultElem, 16> Affected;
- findAffectedValues(CI, Affected);
+ findAffectedValues(CI, TTI, Affected);
for (auto &AV : Affected) {
auto &AVV = getOrInsertAffectedValues(AV.Assume);
- if (std::find_if(AVV.begin(), AVV.end(), [&](ResultElem &Elem) {
+ if (llvm::none_of(AVV, [&](ResultElem &Elem) {
return Elem.Assume == CI && Elem.Index == AV.Index;
- }) == AVV.end())
+ }))
AVV.push_back({CI, AV.Index});
}
}
void AssumptionCache::unregisterAssumption(AssumeInst *CI) {
SmallVector<AssumptionCache::ResultElem, 16> Affected;
- findAffectedValues(CI, Affected);
+ findAffectedValues(CI, TTI, Affected);
for (auto &AV : Affected) {
auto AVI = AffectedValues.find_as(AV.Assume);
@@ -248,6 +257,12 @@ void AssumptionCache::registerAssumption(AssumeInst *CI) {
updateAffectedValues(CI);
}
+AssumptionCache AssumptionAnalysis::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ return AssumptionCache(F, &TTI);
+}
+
AnalysisKey AssumptionAnalysis::Key;
PreservedAnalyses AssumptionPrinterPass::run(Function &F,
@@ -278,10 +293,13 @@ AssumptionCache &AssumptionCacheTracker::getAssumptionCache(Function &F) {
if (I != AssumptionCaches.end())
return *I->second;
+ auto *TTIWP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
+ auto *TTI = TTIWP ? &TTIWP->getTTI(F) : nullptr;
+
// Ok, build a new cache by scanning the function, insert it and the value
// handle into our map, and return the newly populated cache.
auto IP = AssumptionCaches.insert(std::make_pair(
- FunctionCallbackVH(&F, this), std::make_unique<AssumptionCache>(F)));
+ FunctionCallbackVH(&F, this), std::make_unique<AssumptionCache>(F, TTI)));
assert(IP.second && "Scanning function already in the map?");
return *IP.first->second;
}
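Besides threading TargetTransformInfo into the cache, the hunk above replaces a std::find_if(...) == end() test with llvm::none_of. A small standalone illustration of that idiom (unrelated to AssumptionCache itself; the element type is chosen only for brevity):

```cpp
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

// Hedged sketch: push a value only if no existing element matches, written
// with llvm::none_of in the same style as the refactor above.
static void insertUnique(llvm::SmallVectorImpl<int> &Vec, int V) {
  if (llvm::none_of(Vec, [&](int Elem) { return Elem == V; }))
    Vec.push_back(V);
}
```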
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index 357772c9c4f2..88b0f37b1d48 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -31,6 +31,7 @@
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -68,15 +69,6 @@ using namespace llvm;
static cl::opt<bool> EnableRecPhiAnalysis("basic-aa-recphi", cl::Hidden,
cl::init(true));
-/// By default, even on 32-bit architectures we use 64-bit integers for
-/// calculations. This will allow us to more-aggressively decompose indexing
-/// expressions calculated using i64 values (e.g., long long in C) which is
-/// common enough to worry about.
-static cl::opt<bool> ForceAtLeast64Bits("basic-aa-force-at-least-64b",
- cl::Hidden, cl::init(true));
-static cl::opt<bool> DoubleCalcBits("basic-aa-double-calc-bits",
- cl::Hidden, cl::init(false));
-
/// SearchLimitReached / SearchTimes shows how often the limit of
/// to decompose GEPs is reached. It will affect the precision
/// of basic alias analysis.
@@ -91,8 +83,7 @@ STATISTIC(SearchTimes, "Number of times a GEP is decomposed");
const unsigned MaxNumPhiBBsValueReachabilityCheck = 20;
// The max limit of the search depth in DecomposeGEPExpression() and
-// getUnderlyingObject(), both functions need to use the same search
-// depth otherwise the algorithm in aliasGEP will assert.
+// getUnderlyingObject().
static const unsigned MaxLookupSearchDepth = 6;
bool BasicAAResult::invalidate(Function &Fn, const PreservedAnalyses &PA,
@@ -120,9 +111,6 @@ static bool isEscapeSource(const Value *V) {
if (isa<CallBase>(V))
return true;
- if (isa<Argument>(V))
- return true;
-
// The load case works because isNonEscapingLocalObject considers all
// stores to be escapes (it passes true for the StoreCaptures argument
// to PointerMayBeCaptured).
@@ -206,12 +194,12 @@ static uint64_t getMinimalExtentFrom(const Value &V,
bool NullIsValidLoc) {
// If we have dereferenceability information we know a lower bound for the
// extent as accesses for a lower offset would be valid. We need to exclude
- // the "or null" part if null is a valid pointer.
+ // the "or null" part if null is a valid pointer. We can ignore frees, as an
+ // access after free would be undefined behavior.
bool CanBeNull, CanBeFreed;
uint64_t DerefBytes =
V.getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed);
DerefBytes = (CanBeNull && NullIsValidLoc) ? 0 : DerefBytes;
- DerefBytes = CanBeFreed ? 0 : DerefBytes;
// If queried with a precise location size, we assume that location size to be
// accessed, thus valid.
if (LocSize.isPrecise())
@@ -227,82 +215,163 @@ static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL,
}
//===----------------------------------------------------------------------===//
+// CaptureInfo implementations
+//===----------------------------------------------------------------------===//
+
+CaptureInfo::~CaptureInfo() = default;
+
+bool SimpleCaptureInfo::isNotCapturedBeforeOrAt(const Value *Object,
+ const Instruction *I) {
+ return isNonEscapingLocalObject(Object, &IsCapturedCache);
+}
+
+bool EarliestEscapeInfo::isNotCapturedBeforeOrAt(const Value *Object,
+ const Instruction *I) {
+ if (!isIdentifiedFunctionLocal(Object))
+ return false;
+
+ auto Iter = EarliestEscapes.insert({Object, nullptr});
+ if (Iter.second) {
+ Instruction *EarliestCapture = FindEarliestCapture(
+ Object, *const_cast<Function *>(I->getFunction()),
+ /*ReturnCaptures=*/false, /*StoreCaptures=*/true, DT);
+ if (EarliestCapture) {
+ auto Ins = Inst2Obj.insert({EarliestCapture, {}});
+ Ins.first->second.push_back(Object);
+ }
+ Iter.first->second = EarliestCapture;
+ }
+
+ // No capturing instruction.
+ if (!Iter.first->second)
+ return true;
+
+ return I != Iter.first->second &&
+ !isPotentiallyReachable(Iter.first->second, I, nullptr, &DT, &LI);
+}
+
+void EarliestEscapeInfo::removeInstruction(Instruction *I) {
+ auto Iter = Inst2Obj.find(I);
+ if (Iter != Inst2Obj.end()) {
+ for (const Value *Obj : Iter->second)
+ EarliestEscapes.erase(Obj);
+ Inst2Obj.erase(I);
+ }
+}
+
+//===----------------------------------------------------------------------===//
// GetElementPtr Instruction Decomposition and Analysis
//===----------------------------------------------------------------------===//
namespace {
-/// Represents zext(sext(V)).
-struct ExtendedValue {
+/// Represents zext(sext(trunc(V))).
+struct CastedValue {
const Value *V;
- unsigned ZExtBits;
- unsigned SExtBits;
+ unsigned ZExtBits = 0;
+ unsigned SExtBits = 0;
+ unsigned TruncBits = 0;
- explicit ExtendedValue(const Value *V, unsigned ZExtBits = 0,
- unsigned SExtBits = 0)
- : V(V), ZExtBits(ZExtBits), SExtBits(SExtBits) {}
+ explicit CastedValue(const Value *V) : V(V) {}
+ explicit CastedValue(const Value *V, unsigned ZExtBits, unsigned SExtBits,
+ unsigned TruncBits)
+ : V(V), ZExtBits(ZExtBits), SExtBits(SExtBits), TruncBits(TruncBits) {}
unsigned getBitWidth() const {
- return V->getType()->getPrimitiveSizeInBits() + ZExtBits + SExtBits;
+ return V->getType()->getPrimitiveSizeInBits() - TruncBits + ZExtBits +
+ SExtBits;
}
- ExtendedValue withValue(const Value *NewV) const {
- return ExtendedValue(NewV, ZExtBits, SExtBits);
+ CastedValue withValue(const Value *NewV) const {
+ return CastedValue(NewV, ZExtBits, SExtBits, TruncBits);
}
- ExtendedValue withZExtOfValue(const Value *NewV) const {
+ /// Replace V with zext(NewV)
+ CastedValue withZExtOfValue(const Value *NewV) const {
unsigned ExtendBy = V->getType()->getPrimitiveSizeInBits() -
NewV->getType()->getPrimitiveSizeInBits();
+ if (ExtendBy <= TruncBits)
+ return CastedValue(NewV, ZExtBits, SExtBits, TruncBits - ExtendBy);
+
// zext(sext(zext(NewV))) == zext(zext(zext(NewV)))
- return ExtendedValue(NewV, ZExtBits + SExtBits + ExtendBy, 0);
+ ExtendBy -= TruncBits;
+ return CastedValue(NewV, ZExtBits + SExtBits + ExtendBy, 0, 0);
}
- ExtendedValue withSExtOfValue(const Value *NewV) const {
+ /// Replace V with sext(NewV)
+ CastedValue withSExtOfValue(const Value *NewV) const {
unsigned ExtendBy = V->getType()->getPrimitiveSizeInBits() -
NewV->getType()->getPrimitiveSizeInBits();
+ if (ExtendBy <= TruncBits)
+ return CastedValue(NewV, ZExtBits, SExtBits, TruncBits - ExtendBy);
+
// zext(sext(sext(NewV)))
- return ExtendedValue(NewV, ZExtBits, SExtBits + ExtendBy);
+ ExtendBy -= TruncBits;
+ return CastedValue(NewV, ZExtBits, SExtBits + ExtendBy, 0);
}
APInt evaluateWith(APInt N) const {
assert(N.getBitWidth() == V->getType()->getPrimitiveSizeInBits() &&
"Incompatible bit width");
+ if (TruncBits) N = N.trunc(N.getBitWidth() - TruncBits);
if (SExtBits) N = N.sext(N.getBitWidth() + SExtBits);
if (ZExtBits) N = N.zext(N.getBitWidth() + ZExtBits);
return N;
}
+ ConstantRange evaluateWith(ConstantRange N) const {
+ assert(N.getBitWidth() == V->getType()->getPrimitiveSizeInBits() &&
+ "Incompatible bit width");
+ if (TruncBits) N = N.truncate(N.getBitWidth() - TruncBits);
+ if (SExtBits) N = N.signExtend(N.getBitWidth() + SExtBits);
+ if (ZExtBits) N = N.zeroExtend(N.getBitWidth() + ZExtBits);
+ return N;
+ }
+
bool canDistributeOver(bool NUW, bool NSW) const {
// zext(x op<nuw> y) == zext(x) op<nuw> zext(y)
// sext(x op<nsw> y) == sext(x) op<nsw> sext(y)
+ // trunc(x op y) == trunc(x) op trunc(y)
return (!ZExtBits || NUW) && (!SExtBits || NSW);
}
+
+ bool hasSameCastsAs(const CastedValue &Other) const {
+ return ZExtBits == Other.ZExtBits && SExtBits == Other.SExtBits &&
+ TruncBits == Other.TruncBits;
+ }
};
-/// Represents zext(sext(V)) * Scale + Offset.
+/// Represents zext(sext(trunc(V))) * Scale + Offset.
struct LinearExpression {
- ExtendedValue Val;
+ CastedValue Val;
APInt Scale;
APInt Offset;
/// True if all operations in this expression are NSW.
bool IsNSW;
- LinearExpression(const ExtendedValue &Val, const APInt &Scale,
+ LinearExpression(const CastedValue &Val, const APInt &Scale,
const APInt &Offset, bool IsNSW)
: Val(Val), Scale(Scale), Offset(Offset), IsNSW(IsNSW) {}
- LinearExpression(const ExtendedValue &Val) : Val(Val), IsNSW(true) {
+ LinearExpression(const CastedValue &Val) : Val(Val), IsNSW(true) {
unsigned BitWidth = Val.getBitWidth();
Scale = APInt(BitWidth, 1);
Offset = APInt(BitWidth, 0);
}
+
+ LinearExpression mul(const APInt &Other, bool MulIsNSW) const {
+ // The check for zero offset is necessary, because generally
+ // (X +nsw Y) *nsw Z does not imply (X *nsw Z) +nsw (Y *nsw Z).
+ bool NSW = IsNSW && (Other.isOne() || (MulIsNSW && Offset.isZero()));
+ return LinearExpression(Val, Scale * Other, Offset * Other, NSW);
+ }
};
}
/// Analyzes the specified value as a linear expression: "A*V + B", where A and
/// B are constant integers.
static LinearExpression GetLinearExpression(
- const ExtendedValue &Val, const DataLayout &DL, unsigned Depth,
+ const CastedValue &Val, const DataLayout &DL, unsigned Depth,
AssumptionCache *AC, DominatorTree *DT) {
// Limit our recursion depth.
if (Depth == 6)
@@ -325,6 +394,11 @@ static LinearExpression GetLinearExpression(
if (!Val.canDistributeOver(NUW, NSW))
return Val;
+ // While we can distribute over trunc, we cannot preserve nowrap flags
+ // in that case.
+ if (Val.TruncBits)
+ NUW = NSW = false;
+
LinearExpression E(Val);
switch (BOp->getOpcode()) {
default:
@@ -353,14 +427,11 @@ static LinearExpression GetLinearExpression(
E.IsNSW &= NSW;
break;
}
- case Instruction::Mul: {
+ case Instruction::Mul:
E = GetLinearExpression(Val.withValue(BOp->getOperand(0)), DL,
- Depth + 1, AC, DT);
- E.Offset *= RHS;
- E.Scale *= RHS;
- E.IsNSW &= NSW;
+ Depth + 1, AC, DT)
+ .mul(RHS, NSW);
break;
- }
case Instruction::Shl:
// We're trying to linearize an expression of the kind:
// shl i8 -128, 36
@@ -394,25 +465,75 @@ static LinearExpression GetLinearExpression(
return Val;
}
-/// To ensure a pointer offset fits in an integer of size PointerSize
-/// (in bits) when that size is smaller than the maximum pointer size. This is
+/// To ensure a pointer offset fits in an integer of size IndexSize
+/// (in bits) when that size is smaller than the maximum index size. This is
/// an issue, for example, in particular for 32b pointers with negative indices
/// that rely on two's complement wrap-arounds for precise alias information
-/// where the maximum pointer size is 64b.
-static APInt adjustToPointerSize(const APInt &Offset, unsigned PointerSize) {
- assert(PointerSize <= Offset.getBitWidth() && "Invalid PointerSize!");
- unsigned ShiftBits = Offset.getBitWidth() - PointerSize;
+/// where the maximum index size is 64b.
+static APInt adjustToIndexSize(const APInt &Offset, unsigned IndexSize) {
+ assert(IndexSize <= Offset.getBitWidth() && "Invalid IndexSize!");
+ unsigned ShiftBits = Offset.getBitWidth() - IndexSize;
return (Offset << ShiftBits).ashr(ShiftBits);
}
-static unsigned getMaxPointerSize(const DataLayout &DL) {
- unsigned MaxPointerSize = DL.getMaxPointerSizeInBits();
- if (MaxPointerSize < 64 && ForceAtLeast64Bits) MaxPointerSize = 64;
- if (DoubleCalcBits) MaxPointerSize *= 2;
+namespace {
+// A linear transformation of a Value; this class represents
+// ZExt(SExt(Trunc(V, TruncBits), SExtBits), ZExtBits) * Scale.
+struct VariableGEPIndex {
+ CastedValue Val;
+ APInt Scale;
+
+ // Context instruction to use when querying information about this index.
+ const Instruction *CxtI;
+
+ /// True if all operations in this expression are NSW.
+ bool IsNSW;
- return MaxPointerSize;
+ void dump() const {
+ print(dbgs());
+ dbgs() << "\n";
+ }
+ void print(raw_ostream &OS) const {
+ OS << "(V=" << Val.V->getName()
+ << ", zextbits=" << Val.ZExtBits
+ << ", sextbits=" << Val.SExtBits
+ << ", truncbits=" << Val.TruncBits
+ << ", scale=" << Scale << ")";
+ }
+};
}
+// Represents the internal structure of a GEP, decomposed into a base pointer,
+// constant offsets, and variable scaled indices.
+struct BasicAAResult::DecomposedGEP {
+ // Base pointer of the GEP
+ const Value *Base;
+ // Total constant offset from base.
+ APInt Offset;
+ // Scaled variable (non-constant) indices.
+ SmallVector<VariableGEPIndex, 4> VarIndices;
+ // Are all operations inbounds GEPs or non-indexing operations?
+ // (None iff expression doesn't involve any geps)
+ Optional<bool> InBounds;
+
+ void dump() const {
+ print(dbgs());
+ dbgs() << "\n";
+ }
+ void print(raw_ostream &OS) const {
+ OS << "(DecomposedGEP Base=" << Base->getName()
+ << ", Offset=" << Offset
+ << ", VarIndices=[";
+ for (size_t i = 0; i < VarIndices.size(); i++) {
+ if (i != 0)
+ OS << ", ";
+ VarIndices[i].print(OS);
+ }
+ OS << "])";
+ }
+};
+
+
/// If V is a symbolic pointer expression, decompose it into a base pointer
/// with a constant offset and a number of scaled symbolic offsets.
///
@@ -420,11 +541,6 @@ static unsigned getMaxPointerSize(const DataLayout &DL) {
/// in the VarIndices vector) are Value*'s that are known to be scaled by the
/// specified amount, but which may have other unrepresented high bits. As
/// such, the gep cannot necessarily be reconstructed from its decomposed form.
-///
-/// This function is capable of analyzing everything that getUnderlyingObject
-/// can look through. To be able to do that getUnderlyingObject and
-/// DecomposeGEPExpression must use the same search depth
-/// (MaxLookupSearchDepth).
BasicAAResult::DecomposedGEP
BasicAAResult::DecomposeGEPExpression(const Value *V, const DataLayout &DL,
AssumptionCache *AC, DominatorTree *DT) {
@@ -433,10 +549,9 @@ BasicAAResult::DecomposeGEPExpression(const Value *V, const DataLayout &DL,
SearchTimes++;
const Instruction *CxtI = dyn_cast<Instruction>(V);
- unsigned MaxPointerSize = getMaxPointerSize(DL);
+ unsigned MaxIndexSize = DL.getMaxIndexSizeInBits();
DecomposedGEP Decomposed;
- Decomposed.Offset = APInt(MaxPointerSize, 0);
- Decomposed.HasCompileTimeConstantScale = true;
+ Decomposed.Offset = APInt(MaxIndexSize, 0);
do {
// See if this is a bitcast or GEP.
const Operator *Op = dyn_cast<Operator>(V);
@@ -493,24 +608,19 @@ BasicAAResult::DecomposeGEPExpression(const Value *V, const DataLayout &DL,
else if (!GEPOp->isInBounds())
Decomposed.InBounds = false;
- // Don't attempt to analyze GEPs over unsized objects.
- if (!GEPOp->getSourceElementType()->isSized()) {
- Decomposed.Base = V;
- return Decomposed;
- }
+ assert(GEPOp->getSourceElementType()->isSized() && "GEP must be sized");
// Don't attempt to analyze GEPs if index scale is not a compile-time
// constant.
if (isa<ScalableVectorType>(GEPOp->getSourceElementType())) {
Decomposed.Base = V;
- Decomposed.HasCompileTimeConstantScale = false;
return Decomposed;
}
unsigned AS = GEPOp->getPointerAddressSpace();
// Walk the indices of the GEP, accumulating them into BaseOff/VarIndices.
gep_type_iterator GTI = gep_type_begin(GEPOp);
- unsigned PointerSize = DL.getPointerSizeInBits(AS);
+ unsigned IndexSize = DL.getIndexSizeInBits(AS);
// Assume all GEP operands are constants until proven otherwise.
bool GepHasConstantOffset = true;
for (User::const_op_iterator I = GEPOp->op_begin() + 1, E = GEPOp->op_end();
@@ -533,49 +643,34 @@ BasicAAResult::DecomposeGEPExpression(const Value *V, const DataLayout &DL,
continue;
Decomposed.Offset +=
DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize() *
- CIdx->getValue().sextOrTrunc(MaxPointerSize);
+ CIdx->getValue().sextOrTrunc(MaxIndexSize);
continue;
}
GepHasConstantOffset = false;
- APInt Scale(MaxPointerSize,
- DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize());
- // If the integer type is smaller than the pointer size, it is implicitly
- // sign extended to pointer size.
+ // If the integer type is smaller than the index size, it is implicitly
+ // sign extended or truncated to index size.
unsigned Width = Index->getType()->getIntegerBitWidth();
- unsigned SExtBits = PointerSize > Width ? PointerSize - Width : 0;
+ unsigned SExtBits = IndexSize > Width ? IndexSize - Width : 0;
+ unsigned TruncBits = IndexSize < Width ? Width - IndexSize : 0;
LinearExpression LE = GetLinearExpression(
- ExtendedValue(Index, 0, SExtBits), DL, 0, AC, DT);
-
- // The GEP index scale ("Scale") scales C1*V+C2, yielding (C1*V+C2)*Scale.
- // This gives us an aggregate computation of (C1*Scale)*V + C2*Scale.
-
- // It can be the case that, even through C1*V+C2 does not overflow for
- // relevant values of V, (C2*Scale) can overflow. In that case, we cannot
- // decompose the expression in this way.
- //
- // FIXME: C1*Scale and the other operations in the decomposed
- // (C1*Scale)*V+C2*Scale can also overflow. We should check for this
- // possibility.
- bool Overflow;
- APInt ScaledOffset = LE.Offset.sextOrTrunc(MaxPointerSize)
- .smul_ov(Scale, Overflow);
- if (Overflow) {
- LE = LinearExpression(ExtendedValue(Index, 0, SExtBits));
- } else {
- Decomposed.Offset += ScaledOffset;
- Scale *= LE.Scale.sextOrTrunc(MaxPointerSize);
- }
+ CastedValue(Index, 0, SExtBits, TruncBits), DL, 0, AC, DT);
+
+ // Scale by the type size.
+ unsigned TypeSize =
+ DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize();
+ LE = LE.mul(APInt(IndexSize, TypeSize), GEPOp->isInBounds());
+ Decomposed.Offset += LE.Offset.sextOrSelf(MaxIndexSize);
+ APInt Scale = LE.Scale.sextOrSelf(MaxIndexSize);
// If we already had an occurrence of this index variable, merge this
// scale into it. For example, we want to handle:
// A[x][x] -> x*16 + x*4 -> x*20
// This also ensures that 'x' only appears in the index list once.
for (unsigned i = 0, e = Decomposed.VarIndices.size(); i != e; ++i) {
- if (Decomposed.VarIndices[i].V == LE.Val.V &&
- Decomposed.VarIndices[i].ZExtBits == LE.Val.ZExtBits &&
- Decomposed.VarIndices[i].SExtBits == LE.Val.SExtBits) {
+ if (Decomposed.VarIndices[i].Val.V == LE.Val.V &&
+ Decomposed.VarIndices[i].Val.hasSameCastsAs(LE.Val)) {
Scale += Decomposed.VarIndices[i].Scale;
Decomposed.VarIndices.erase(Decomposed.VarIndices.begin() + i);
break;
@@ -583,19 +678,18 @@ BasicAAResult::DecomposeGEPExpression(const Value *V, const DataLayout &DL,
}
// Make sure that we have a scale that makes sense for this target's
- // pointer size.
- Scale = adjustToPointerSize(Scale, PointerSize);
+ // index size.
+ Scale = adjustToIndexSize(Scale, IndexSize);
if (!!Scale) {
- VariableGEPIndex Entry = {
- LE.Val.V, LE.Val.ZExtBits, LE.Val.SExtBits, Scale, CxtI, LE.IsNSW};
+ VariableGEPIndex Entry = {LE.Val, Scale, CxtI, LE.IsNSW};
Decomposed.VarIndices.push_back(Entry);
}
}
// Take care of wrap-arounds
if (GepHasConstantOffset)
- Decomposed.Offset = adjustToPointerSize(Decomposed.Offset, PointerSize);
+ Decomposed.Offset = adjustToIndexSize(Decomposed.Offset, IndexSize);
// Analyze the base pointer next.
V = GEPOp->getOperand(0);
@@ -838,7 +932,7 @@ ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call,
// then the call can not mod/ref the pointer unless the call takes the pointer
// as an argument, and itself doesn't capture it.
if (!isa<Constant>(Object) && Call != Object &&
- isNonEscapingLocalObject(Object, &AAQI.IsCapturedCache)) {
+ AAQI.CI->isNotCapturedBeforeOrAt(Object, Call)) {
// Optimistically assume that call doesn't touch Object and check this
// assumption in the following loop.
@@ -852,8 +946,7 @@ ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call,
// pointer were passed to arguments that were neither of these, then it
// couldn't be no-capture.
if (!(*CI)->getType()->isPointerTy() ||
- (!Call->doesNotCapture(OperandNo) &&
- OperandNo < Call->getNumArgOperands() &&
+ (!Call->doesNotCapture(OperandNo) && OperandNo < Call->arg_size() &&
!Call->isByValArgument(OperandNo)))
continue;
@@ -1046,20 +1139,13 @@ AliasResult BasicAAResult::aliasGEP(
DecomposedGEP DecompGEP1 = DecomposeGEPExpression(GEP1, DL, &AC, DT);
DecomposedGEP DecompGEP2 = DecomposeGEPExpression(V2, DL, &AC, DT);
- // Don't attempt to analyze the decomposed GEP if index scale is not a
- // compile-time constant.
- if (!DecompGEP1.HasCompileTimeConstantScale ||
- !DecompGEP2.HasCompileTimeConstantScale)
+ // Bail if we were not able to decompose anything.
+ if (DecompGEP1.Base == GEP1 && DecompGEP2.Base == V2)
return AliasResult::MayAlias;
- assert(DecompGEP1.Base == UnderlyingV1 && DecompGEP2.Base == UnderlyingV2 &&
- "DecomposeGEPExpression returned a result different from "
- "getUnderlyingObject");
-
// Subtract the GEP2 pointer from the GEP1 pointer to find out their
// symbolic difference.
- DecompGEP1.Offset -= DecompGEP2.Offset;
- GetIndexDifference(DecompGEP1.VarIndices, DecompGEP2.VarIndices);
+ subtractDecomposedGEPs(DecompGEP1, DecompGEP2);
// If an inbounds GEP would have to start from an out of bounds address
// for the two to alias, then we can assume noalias.
@@ -1079,14 +1165,14 @@ AliasResult BasicAAResult::aliasGEP(
// For GEPs with identical offsets, we can preserve the size and AAInfo
// when performing the alias check on the underlying objects.
if (DecompGEP1.Offset == 0 && DecompGEP1.VarIndices.empty())
- return getBestAAResults().alias(
- MemoryLocation(UnderlyingV1, V1Size),
- MemoryLocation(UnderlyingV2, V2Size), AAQI);
+ return getBestAAResults().alias(MemoryLocation(DecompGEP1.Base, V1Size),
+ MemoryLocation(DecompGEP2.Base, V2Size),
+ AAQI);
// Do the base pointers alias?
AliasResult BaseAlias = getBestAAResults().alias(
- MemoryLocation::getBeforeOrAfter(UnderlyingV1),
- MemoryLocation::getBeforeOrAfter(UnderlyingV2), AAQI);
+ MemoryLocation::getBeforeOrAfter(DecompGEP1.Base),
+ MemoryLocation::getBeforeOrAfter(DecompGEP2.Base), AAQI);
// If we get a No or May, then return it immediately, no amount of analysis
// will improve this situation.
@@ -1100,7 +1186,7 @@ AliasResult BasicAAResult::aliasGEP(
// is less than the size of the associated memory object, then we know
// that the objects are partially overlapping. If the difference is
// greater, we know they do not overlap.
- if (DecompGEP1.Offset != 0 && DecompGEP1.VarIndices.empty()) {
+ if (DecompGEP1.VarIndices.empty()) {
APInt &Off = DecompGEP1.Offset;
// Initialize for Off >= 0 (V2 <= GEP1) case.
@@ -1122,133 +1208,124 @@ AliasResult BasicAAResult::aliasGEP(
Off = -Off;
}
- if (VLeftSize.hasValue()) {
- const uint64_t LSize = VLeftSize.getValue();
- if (Off.ult(LSize)) {
- // Conservatively drop processing if a phi was visited and/or offset is
- // too big.
- AliasResult AR = AliasResult::PartialAlias;
- if (VRightSize.hasValue() && Off.ule(INT32_MAX) &&
- (Off + VRightSize.getValue()).ule(LSize)) {
- // Memory referenced by right pointer is nested. Save the offset in
- // cache. Note that originally offset estimated as GEP1-V2, but
- // AliasResult contains the shift that represents GEP1+Offset=V2.
- AR.setOffset(-Off.getSExtValue());
- AR.swap(Swapped);
- }
- return AR;
+ if (!VLeftSize.hasValue())
+ return AliasResult::MayAlias;
+
+ const uint64_t LSize = VLeftSize.getValue();
+ if (Off.ult(LSize)) {
+ // Conservatively drop processing if a phi was visited and/or offset is
+ // too big.
+ AliasResult AR = AliasResult::PartialAlias;
+ if (VRightSize.hasValue() && Off.ule(INT32_MAX) &&
+ (Off + VRightSize.getValue()).ule(LSize)) {
+ // Memory referenced by right pointer is nested. Save the offset in
+ // cache. Note that originally offset estimated as GEP1-V2, but
+ // AliasResult contains the shift that represents GEP1+Offset=V2.
+ AR.setOffset(-Off.getSExtValue());
+ AR.swap(Swapped);
}
- return AliasResult::NoAlias;
+ return AR;
}
+ return AliasResult::NoAlias;
}
- if (!DecompGEP1.VarIndices.empty()) {
- APInt GCD;
- bool AllNonNegative = DecompGEP1.Offset.isNonNegative();
- bool AllNonPositive = DecompGEP1.Offset.isNonPositive();
- for (unsigned i = 0, e = DecompGEP1.VarIndices.size(); i != e; ++i) {
- APInt Scale = DecompGEP1.VarIndices[i].Scale;
- APInt ScaleForGCD = DecompGEP1.VarIndices[i].Scale;
- if (!DecompGEP1.VarIndices[i].IsNSW)
- ScaleForGCD = APInt::getOneBitSet(Scale.getBitWidth(),
- Scale.countTrailingZeros());
-
- if (i == 0)
- GCD = ScaleForGCD.abs();
- else
- GCD = APIntOps::GreatestCommonDivisor(GCD, ScaleForGCD.abs());
-
- if (AllNonNegative || AllNonPositive) {
- // If the Value could change between cycles, then any reasoning about
- // the Value this cycle may not hold in the next cycle. We'll just
- // give up if we can't determine conditions that hold for every cycle:
- const Value *V = DecompGEP1.VarIndices[i].V;
- const Instruction *CxtI = DecompGEP1.VarIndices[i].CxtI;
-
- KnownBits Known = computeKnownBits(V, DL, 0, &AC, CxtI, DT);
- bool SignKnownZero = Known.isNonNegative();
- bool SignKnownOne = Known.isNegative();
-
- // Zero-extension widens the variable, and so forces the sign
- // bit to zero.
- bool IsZExt = DecompGEP1.VarIndices[i].ZExtBits > 0 || isa<ZExtInst>(V);
- SignKnownZero |= IsZExt;
- SignKnownOne &= !IsZExt;
-
- AllNonNegative &= (SignKnownZero && Scale.isNonNegative()) ||
- (SignKnownOne && Scale.isNonPositive());
- AllNonPositive &= (SignKnownZero && Scale.isNonPositive()) ||
- (SignKnownOne && Scale.isNonNegative());
- }
- }
+ // We need to know both access sizes for all the following heuristics.
+ if (!V1Size.hasValue() || !V2Size.hasValue())
+ return AliasResult::MayAlias;
- // We now have accesses at two offsets from the same base:
- // 1. (...)*GCD + DecompGEP1.Offset with size V1Size
- // 2. 0 with size V2Size
- // Using arithmetic modulo GCD, the accesses are at
- // [ModOffset..ModOffset+V1Size) and [0..V2Size). If the first access fits
- // into the range [V2Size..GCD), then we know they cannot overlap.
- APInt ModOffset = DecompGEP1.Offset.srem(GCD);
- if (ModOffset.isNegative())
- ModOffset += GCD; // We want mod, not rem.
- if (V1Size.hasValue() && V2Size.hasValue() &&
- ModOffset.uge(V2Size.getValue()) &&
- (GCD - ModOffset).uge(V1Size.getValue()))
- return AliasResult::NoAlias;
+ APInt GCD;
+ ConstantRange OffsetRange = ConstantRange(DecompGEP1.Offset);
+ for (unsigned i = 0, e = DecompGEP1.VarIndices.size(); i != e; ++i) {
+ const VariableGEPIndex &Index = DecompGEP1.VarIndices[i];
+ const APInt &Scale = Index.Scale;
+ APInt ScaleForGCD = Scale;
+ if (!Index.IsNSW)
+ ScaleForGCD = APInt::getOneBitSet(Scale.getBitWidth(),
+ Scale.countTrailingZeros());
+
+ if (i == 0)
+ GCD = ScaleForGCD.abs();
+ else
+ GCD = APIntOps::GreatestCommonDivisor(GCD, ScaleForGCD.abs());
+
+ ConstantRange CR =
+ computeConstantRange(Index.Val.V, true, &AC, Index.CxtI);
+ KnownBits Known =
+ computeKnownBits(Index.Val.V, DL, 0, &AC, Index.CxtI, DT);
+ CR = CR.intersectWith(
+ ConstantRange::fromKnownBits(Known, /* Signed */ true),
+ ConstantRange::Signed);
+ CR = Index.Val.evaluateWith(CR).sextOrTrunc(OffsetRange.getBitWidth());
+
+ assert(OffsetRange.getBitWidth() == Scale.getBitWidth() &&
+ "Bit widths are normalized to MaxIndexSize");
+ if (Index.IsNSW)
+ OffsetRange = OffsetRange.add(CR.smul_sat(ConstantRange(Scale)));
+ else
+ OffsetRange = OffsetRange.add(CR.smul_fast(ConstantRange(Scale)));
+ }
- // If we know all the variables are non-negative, then the total offset is
- // also non-negative and >= DecompGEP1.Offset. We have the following layout:
- // [0, V2Size) ... [TotalOffset, TotalOffer+V1Size]
- // If DecompGEP1.Offset >= V2Size, the accesses don't alias.
- if (AllNonNegative && V2Size.hasValue() &&
- DecompGEP1.Offset.uge(V2Size.getValue()))
- return AliasResult::NoAlias;
- // Similarly, if the variables are non-positive, then the total offset is
- // also non-positive and <= DecompGEP1.Offset. We have the following layout:
- // [TotalOffset, TotalOffset+V1Size) ... [0, V2Size)
- // If -DecompGEP1.Offset >= V1Size, the accesses don't alias.
- if (AllNonPositive && V1Size.hasValue() &&
- (-DecompGEP1.Offset).uge(V1Size.getValue()))
- return AliasResult::NoAlias;
+ // We now have accesses at two offsets from the same base:
+ // 1. (...)*GCD + DecompGEP1.Offset with size V1Size
+ // 2. 0 with size V2Size
+ // Using arithmetic modulo GCD, the accesses are at
+ // [ModOffset..ModOffset+V1Size) and [0..V2Size). If the first access fits
+ // into the range [V2Size..GCD), then we know they cannot overlap.
+ APInt ModOffset = DecompGEP1.Offset.srem(GCD);
+ if (ModOffset.isNegative())
+ ModOffset += GCD; // We want mod, not rem.
+ if (ModOffset.uge(V2Size.getValue()) &&
+ (GCD - ModOffset).uge(V1Size.getValue()))
+ return AliasResult::NoAlias;
- if (V1Size.hasValue() && V2Size.hasValue()) {
- // Try to determine whether abs(VarIndex) > 0.
- Optional<APInt> MinAbsVarIndex;
- if (DecompGEP1.VarIndices.size() == 1) {
- // VarIndex = Scale*V. If V != 0 then abs(VarIndex) >= abs(Scale).
- const VariableGEPIndex &Var = DecompGEP1.VarIndices[0];
- if (isKnownNonZero(Var.V, DL, 0, &AC, Var.CxtI, DT))
- MinAbsVarIndex = Var.Scale.abs();
- } else if (DecompGEP1.VarIndices.size() == 2) {
- // VarIndex = Scale*V0 + (-Scale)*V1.
- // If V0 != V1 then abs(VarIndex) >= abs(Scale).
- // Check that VisitedPhiBBs is empty, to avoid reasoning about
- // inequality of values across loop iterations.
- const VariableGEPIndex &Var0 = DecompGEP1.VarIndices[0];
- const VariableGEPIndex &Var1 = DecompGEP1.VarIndices[1];
- if (Var0.Scale == -Var1.Scale && Var0.ZExtBits == Var1.ZExtBits &&
- Var0.SExtBits == Var1.SExtBits && VisitedPhiBBs.empty() &&
- isKnownNonEqual(Var0.V, Var1.V, DL, &AC, /* CxtI */ nullptr, DT))
- MinAbsVarIndex = Var0.Scale.abs();
- }
+ // Compute ranges of potentially accessed bytes for both accesses. If the
+ // intersection is empty, there can be no overlap.
+ unsigned BW = OffsetRange.getBitWidth();
+ ConstantRange Range1 = OffsetRange.add(
+ ConstantRange(APInt(BW, 0), APInt(BW, V1Size.getValue())));
+ ConstantRange Range2 =
+ ConstantRange(APInt(BW, 0), APInt(BW, V2Size.getValue()));
+ if (Range1.intersectWith(Range2).isEmptySet())
+ return AliasResult::NoAlias;
- if (MinAbsVarIndex) {
- // The constant offset will have added at least +/-MinAbsVarIndex to it.
- APInt OffsetLo = DecompGEP1.Offset - *MinAbsVarIndex;
- APInt OffsetHi = DecompGEP1.Offset + *MinAbsVarIndex;
- // Check that an access at OffsetLo or lower, and an access at OffsetHi
- // or higher both do not alias.
- if (OffsetLo.isNegative() && (-OffsetLo).uge(V1Size.getValue()) &&
- OffsetHi.isNonNegative() && OffsetHi.uge(V2Size.getValue()))
- return AliasResult::NoAlias;
- }
+ // Try to determine the range of values for VarIndex such that
+ // VarIndex <= -MinAbsVarIndex || MinAbsVarIndex <= VarIndex.
+ Optional<APInt> MinAbsVarIndex;
+ if (DecompGEP1.VarIndices.size() == 1) {
+ // VarIndex = Scale*V.
+ const VariableGEPIndex &Var = DecompGEP1.VarIndices[0];
+ if (Var.Val.TruncBits == 0 &&
+ isKnownNonZero(Var.Val.V, DL, 0, &AC, Var.CxtI, DT)) {
+ // If V != 0 then abs(VarIndex) >= abs(Scale).
+ MinAbsVarIndex = Var.Scale.abs();
}
+ } else if (DecompGEP1.VarIndices.size() == 2) {
+ // VarIndex = Scale*V0 + (-Scale)*V1.
+ // If V0 != V1 then abs(VarIndex) >= abs(Scale).
+ // Check that VisitedPhiBBs is empty, to avoid reasoning about
+ // inequality of values across loop iterations.
+ const VariableGEPIndex &Var0 = DecompGEP1.VarIndices[0];
+ const VariableGEPIndex &Var1 = DecompGEP1.VarIndices[1];
+ if (Var0.Scale == -Var1.Scale && Var0.Val.TruncBits == 0 &&
+ Var0.Val.hasSameCastsAs(Var1.Val) && VisitedPhiBBs.empty() &&
+ isKnownNonEqual(Var0.Val.V, Var1.Val.V, DL, &AC, /* CxtI */ nullptr,
+ DT))
+ MinAbsVarIndex = Var0.Scale.abs();
+ }
- if (constantOffsetHeuristic(DecompGEP1.VarIndices, V1Size, V2Size,
- DecompGEP1.Offset, &AC, DT))
+ if (MinAbsVarIndex) {
+ // The constant offset will have added at least +/-MinAbsVarIndex to it.
+ APInt OffsetLo = DecompGEP1.Offset - *MinAbsVarIndex;
+ APInt OffsetHi = DecompGEP1.Offset + *MinAbsVarIndex;
+ // We know that Offset <= OffsetLo || Offset >= OffsetHi
+ if (OffsetLo.isNegative() && (-OffsetLo).uge(V1Size.getValue()) &&
+ OffsetHi.isNonNegative() && OffsetHi.uge(V2Size.getValue()))
return AliasResult::NoAlias;
}
+ if (constantOffsetHeuristic(DecompGEP1, V1Size, V2Size, &AC, DT))
+ return AliasResult::NoAlias;
+
// Statically, we can see that the base objects are the same, but the
// pointers have dynamic offsets which we can't resolve. And none of our
// little tricks above worked.
@@ -1517,10 +1594,10 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size,
// location if that memory location doesn't escape. Or it may pass a
// nocapture value to other functions as long as they don't capture it.
if (isEscapeSource(O1) &&
- isNonEscapingLocalObject(O2, &AAQI.IsCapturedCache))
+ AAQI.CI->isNotCapturedBeforeOrAt(O2, cast<Instruction>(O1)))
return AliasResult::NoAlias;
if (isEscapeSource(O2) &&
- isNonEscapingLocalObject(O1, &AAQI.IsCapturedCache))
+ AAQI.CI->isNotCapturedBeforeOrAt(O1, cast<Instruction>(O2)))
return AliasResult::NoAlias;
}
@@ -1692,62 +1769,54 @@ bool BasicAAResult::isValueEqualInPotentialCycles(const Value *V,
}
/// Computes the symbolic difference between two de-composed GEPs.
-///
-/// Dest and Src are the variable indices from two decomposed GetElementPtr
-/// instructions GEP1 and GEP2 which have common base pointers.
-void BasicAAResult::GetIndexDifference(
- SmallVectorImpl<VariableGEPIndex> &Dest,
- const SmallVectorImpl<VariableGEPIndex> &Src) {
- if (Src.empty())
- return;
-
- for (unsigned i = 0, e = Src.size(); i != e; ++i) {
- const Value *V = Src[i].V;
- unsigned ZExtBits = Src[i].ZExtBits, SExtBits = Src[i].SExtBits;
- APInt Scale = Src[i].Scale;
-
+void BasicAAResult::subtractDecomposedGEPs(DecomposedGEP &DestGEP,
+ const DecomposedGEP &SrcGEP) {
+ DestGEP.Offset -= SrcGEP.Offset;
+ for (const VariableGEPIndex &Src : SrcGEP.VarIndices) {
// Find V in Dest. This is N^2, but pointer indices almost never have more
// than a few variable indexes.
- for (unsigned j = 0, e = Dest.size(); j != e; ++j) {
- if (!isValueEqualInPotentialCycles(Dest[j].V, V) ||
- Dest[j].ZExtBits != ZExtBits || Dest[j].SExtBits != SExtBits)
+ bool Found = false;
+ for (auto I : enumerate(DestGEP.VarIndices)) {
+ VariableGEPIndex &Dest = I.value();
+ if (!isValueEqualInPotentialCycles(Dest.Val.V, Src.Val.V) ||
+ !Dest.Val.hasSameCastsAs(Src.Val))
continue;
// If we found it, subtract off Scale V's from the entry in Dest. If it
// goes to zero, remove the entry.
- if (Dest[j].Scale != Scale) {
- Dest[j].Scale -= Scale;
- Dest[j].IsNSW = false;
- } else
- Dest.erase(Dest.begin() + j);
- Scale = 0;
+ if (Dest.Scale != Src.Scale) {
+ Dest.Scale -= Src.Scale;
+ Dest.IsNSW = false;
+ } else {
+ DestGEP.VarIndices.erase(DestGEP.VarIndices.begin() + I.index());
+ }
+ Found = true;
break;
}
// If we didn't consume this entry, add it to the end of the Dest list.
- if (!!Scale) {
- VariableGEPIndex Entry = {V, ZExtBits, SExtBits,
- -Scale, Src[i].CxtI, Src[i].IsNSW};
- Dest.push_back(Entry);
+ if (!Found) {
+ VariableGEPIndex Entry = {Src.Val, -Src.Scale, Src.CxtI, Src.IsNSW};
+ DestGEP.VarIndices.push_back(Entry);
}
}
}
bool BasicAAResult::constantOffsetHeuristic(
- const SmallVectorImpl<VariableGEPIndex> &VarIndices,
- LocationSize MaybeV1Size, LocationSize MaybeV2Size, const APInt &BaseOffset,
- AssumptionCache *AC, DominatorTree *DT) {
- if (VarIndices.size() != 2 || !MaybeV1Size.hasValue() ||
+ const DecomposedGEP &GEP, LocationSize MaybeV1Size,
+ LocationSize MaybeV2Size, AssumptionCache *AC, DominatorTree *DT) {
+ if (GEP.VarIndices.size() != 2 || !MaybeV1Size.hasValue() ||
!MaybeV2Size.hasValue())
return false;
const uint64_t V1Size = MaybeV1Size.getValue();
const uint64_t V2Size = MaybeV2Size.getValue();
- const VariableGEPIndex &Var0 = VarIndices[0], &Var1 = VarIndices[1];
+ const VariableGEPIndex &Var0 = GEP.VarIndices[0], &Var1 = GEP.VarIndices[1];
- if (Var0.ZExtBits != Var1.ZExtBits || Var0.SExtBits != Var1.SExtBits ||
- Var0.Scale != -Var1.Scale || Var0.V->getType() != Var1.V->getType())
+ if (Var0.Val.TruncBits != 0 || !Var0.Val.hasSameCastsAs(Var1.Val) ||
+ Var0.Scale != -Var1.Scale ||
+ Var0.Val.V->getType() != Var1.Val.V->getType())
return false;
// We'll strip off the Extensions of Var0 and Var1 and do another round
@@ -1755,11 +1824,10 @@ bool BasicAAResult::constantOffsetHeuristic(
// is zext(%x + 1) we should get V1 == %x and V1Offset == 1.
LinearExpression E0 =
- GetLinearExpression(ExtendedValue(Var0.V), DL, 0, AC, DT);
+ GetLinearExpression(CastedValue(Var0.Val.V), DL, 0, AC, DT);
LinearExpression E1 =
- GetLinearExpression(ExtendedValue(Var1.V), DL, 0, AC, DT);
- if (E0.Scale != E1.Scale || E0.Val.ZExtBits != E1.Val.ZExtBits ||
- E0.Val.SExtBits != E1.Val.SExtBits ||
+ GetLinearExpression(CastedValue(Var1.Val.V), DL, 0, AC, DT);
+ if (E0.Scale != E1.Scale || !E0.Val.hasSameCastsAs(E1.Val) ||
!isValueEqualInPotentialCycles(E0.Val.V, E1.Val.V))
return false;
@@ -1779,8 +1847,8 @@ bool BasicAAResult::constantOffsetHeuristic(
// arithmetic (i.e. for some values of GEP1 and V2 GEP1 < V2, and for other
// values GEP1 > V2). We'll therefore only declare NoAlias if both V1Size and
// V2Size can fit in the MinDiffBytes gap.
- return MinDiffBytes.uge(V1Size + BaseOffset.abs()) &&
- MinDiffBytes.uge(V2Size + BaseOffset.abs());
+ return MinDiffBytes.uge(V1Size + GEP.Offset.abs()) &&
+ MinDiffBytes.uge(V2Size + GEP.Offset.abs());
}
//===----------------------------------------------------------------------===//
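The rewritten aliasGEP path above reasons about the constant offset modulo the GCD of the index scales: the two accesses live at [ModOffset, ModOffset+V1Size) and [0, V2Size) modulo GCD, and cannot overlap when the first range fits entirely into [V2Size, GCD). A standalone, hedged restatement of just that check with plain integers (GCD is assumed non-zero; this is not the BasicAA implementation, which works on APInt at the maximum index width):

```cpp
#include <cstdint>

// Hedged sketch of the modulo-GCD disjointness test from the hunk above.
// Off is the constant byte offset between the two accesses, S1/S2 their sizes.
static bool noOverlapModuloGCD(int64_t Off, uint64_t GCD, uint64_t S1,
                               uint64_t S2) {
  int64_t Mod = Off % static_cast<int64_t>(GCD);
  if (Mod < 0)
    Mod += static_cast<int64_t>(GCD); // we want mod, not rem
  uint64_t ModOffset = static_cast<uint64_t>(Mod);
  // No overlap if [ModOffset, ModOffset+S1) fits inside [S2, GCD).
  return ModOffset >= S2 && GCD - ModOffset >= S1;
}

// Example: GCD = 8, Off = 4, S1 = S2 = 4 -> ranges [4,8) and [0,4) mod 8,
// so the accesses cannot overlap and the sketch returns true.
```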
diff --git a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
index e4e45b3076be..2a5e1f65d731 100644
--- a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -602,7 +602,7 @@ BlockFrequencyInfoImplBase::getProfileCountFromFreq(const Function &F,
if (!EntryCount)
return None;
// Use 128 bit APInt to do the arithmetic to avoid overflow.
- APInt BlockCount(128, EntryCount.getCount());
+ APInt BlockCount(128, EntryCount->getCount());
APInt BlockFreq(128, Freq);
APInt EntryFreq(128, getEntryFreq());
BlockCount *= BlockFreq;
diff --git a/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/llvm/lib/Analysis/BranchProbabilityInfo.cpp
index aa6b93fe3f07..33fdc8b628c5 100644
--- a/llvm/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/llvm/lib/Analysis/BranchProbabilityInfo.cpp
@@ -190,7 +190,7 @@ void BranchProbabilityInfo::SccInfo::getSccExitBlocks(
if (isSCCExitingBlock(BB, SccNum))
for (const auto *Succ : successors(BB))
if (getSCCNum(Succ) != SccNum)
- Exits.push_back(const_cast<BasicBlock *>(BB));
+ Exits.push_back(const_cast<BasicBlock *>(Succ));
}
}
diff --git a/llvm/lib/Analysis/CGSCCPassManager.cpp b/llvm/lib/Analysis/CGSCCPassManager.cpp
index 253cc0b0a579..c60b70ae5b69 100644
--- a/llvm/lib/Analysis/CGSCCPassManager.cpp
+++ b/llvm/lib/Analysis/CGSCCPassManager.cpp
@@ -38,12 +38,13 @@ using namespace llvm;
// Explicit template instantiations and specialization definitions for core
// template typedefs.
namespace llvm {
-
static cl::opt<bool> AbortOnMaxDevirtIterationsReached(
"abort-on-max-devirt-iterations-reached",
cl::desc("Abort when the max iterations for devirtualization CGSCC repeat "
"pass is reached"));
+AnalysisKey ShouldNotRunFunctionPassesAnalysis::Key;
+
// Explicit instantiations for the core proxy templates.
template class AllAnalysesOn<LazyCallGraph::SCC>;
template class AnalysisManager<LazyCallGraph::SCC, LazyCallGraph &>;
@@ -119,12 +120,6 @@ PassManager<LazyCallGraph::SCC, CGSCCAnalysisManager, LazyCallGraph &,
// Finally, we intersect the final preserved analyses to compute the
// aggregate preserved set for this pass manager.
PA.intersect(std::move(PassPA));
-
- // FIXME: Historically, the pass managers all called the LLVM context's
- // yield function here. We don't have a generic way to acquire the
- // context and it isn't yet clear what the right pattern is for yielding
- // in the new pass manager so it is currently omitted.
- // ...getContext().yield();
}
// Before we mark all of *this* SCC's analyses as preserved below, intersect
@@ -547,6 +542,9 @@ PreservedAnalyses CGSCCToFunctionPassAdaptor::run(LazyCallGraph::SCC &C,
Function &F = N->getFunction();
+ if (NoRerun && FAM.getCachedResult<ShouldNotRunFunctionPassesAnalysis>(F))
+ continue;
+
PassInstrumentation PI = FAM.getResult<PassInstrumentationAnalysis>(F);
if (!PI.runBeforePass<Function>(*Pass, F))
continue;
@@ -562,7 +560,9 @@ PreservedAnalyses CGSCCToFunctionPassAdaptor::run(LazyCallGraph::SCC &C,
// We know that the function pass couldn't have invalidated any other
// function's analyses (that's the contract of a function pass), so
// directly handle the function analysis manager's invalidation here.
- FAM.invalidate(F, PassPA);
+ FAM.invalidate(F, EagerlyInvalidate ? PreservedAnalyses::none() : PassPA);
+ if (NoRerun)
+ (void)FAM.getResult<ShouldNotRunFunctionPassesAnalysis>(F);
// Then intersect the preserved set so that invalidation of module
// analyses will eventually occur when the module pass completes.
@@ -863,7 +863,7 @@ incorporateNewSCCRange(const SCCRangeT &NewSCCRange, LazyCallGraph &G,
// split-off SCCs.
// We know however that this will preserve any FAM proxy so go ahead and mark
// that.
- PreservedAnalyses PA;
+ auto PA = PreservedAnalyses::allInSet<AllAnalysesOn<Function>>();
PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
AM.invalidate(*OldC, PA);
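The NoRerun handling added above boils down to a per-function "already processed" marker kept in the analysis cache: skip the pipeline when the marker is present, set it after running. A conceptual sketch of that pattern with plain containers follows; the Func type, runOnSCC, and Pipeline names are hypothetical stand-ins, not the LLVM pass-manager API:

// Conceptual sketch only (hypothetical types, not LLVM's API): NoRerun mode
// amounts to consulting a per-function marker before running the pipeline and
// setting it afterwards.
#include <functional>
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

struct Func { std::string Name; };

void runOnSCC(std::vector<Func *> &SCC, std::unordered_set<Func *> &Processed,
              bool NoRerun, const std::function<void(Func &)> &Pipeline) {
  for (Func *F : SCC) {
    if (NoRerun && Processed.count(F))
      continue;               // cached marker present: skip the pipeline
    Pipeline(*F);             // run the function pipeline
    if (NoRerun)
      Processed.insert(F);    // mark so later adaptor runs skip this function
  }
}

int main() {
  Func A{"a"}, B{"b"};
  std::vector<Func *> SCC{&A, &B};
  std::unordered_set<Func *> Processed;
  auto Pipeline = [](Func &F) { std::cout << "optimizing " << F.Name << "\n"; };
  runOnSCC(SCC, Processed, /*NoRerun=*/true, Pipeline);
  runOnSCC(SCC, Processed, /*NoRerun=*/true, Pipeline); // prints nothing
  return 0;
}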
diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp
index 5fe4f9befc86..8955658cb9e7 100644
--- a/llvm/lib/Analysis/CaptureTracking.cpp
+++ b/llvm/lib/Analysis/CaptureTracking.cpp
@@ -98,10 +98,10 @@ namespace {
/// as the given instruction and the use.
struct CapturesBefore : public CaptureTracker {
- CapturesBefore(bool ReturnCaptures, const Instruction *I, const DominatorTree *DT,
- bool IncludeI)
- : BeforeHere(I), DT(DT),
- ReturnCaptures(ReturnCaptures), IncludeI(IncludeI), Captured(false) {}
+ CapturesBefore(bool ReturnCaptures, const Instruction *I,
+ const DominatorTree *DT, bool IncludeI, const LoopInfo *LI)
+ : BeforeHere(I), DT(DT), ReturnCaptures(ReturnCaptures),
+ IncludeI(IncludeI), Captured(false), LI(LI) {}
void tooManyUses() override { Captured = true; }
@@ -115,7 +115,7 @@ namespace {
return true;
// Check whether there is a path from I to BeforeHere.
- return !isPotentiallyReachable(I, BeforeHere, nullptr, DT);
+ return !isPotentiallyReachable(I, BeforeHere, nullptr, DT, LI);
}
bool captured(const Use *U) override {
@@ -140,6 +140,68 @@ namespace {
bool IncludeI;
bool Captured;
+
+ const LoopInfo *LI;
+ };
+
+ /// Find the 'earliest' instruction before which the pointer is known not to
+ /// be captured. Here an instruction A is considered earlier than instruction
+ /// B, if A dominates B. If 2 escapes do not dominate each other, the
+ /// terminator of the common dominator is chosen. If not all uses cannot be
+ /// analyzed, the earliest escape is set to the first instruction in the
+ /// function entry block.
+ // NOTE: Users have to make sure instructions compared against the earliest
+ // escape are not in a cycle.
+ struct EarliestCaptures : public CaptureTracker {
+
+ EarliestCaptures(bool ReturnCaptures, Function &F, const DominatorTree &DT)
+ : DT(DT), ReturnCaptures(ReturnCaptures), Captured(false), F(F) {}
+
+ void tooManyUses() override {
+ Captured = true;
+ EarliestCapture = &*F.getEntryBlock().begin();
+ }
+
+ bool captured(const Use *U) override {
+ Instruction *I = cast<Instruction>(U->getUser());
+ if (isa<ReturnInst>(I) && !ReturnCaptures)
+ return false;
+
+ if (!EarliestCapture) {
+ EarliestCapture = I;
+ } else if (EarliestCapture->getParent() == I->getParent()) {
+ if (I->comesBefore(EarliestCapture))
+ EarliestCapture = I;
+ } else {
+ BasicBlock *CurrentBB = I->getParent();
+ BasicBlock *EarliestBB = EarliestCapture->getParent();
+ if (DT.dominates(EarliestBB, CurrentBB)) {
+ // EarliestCapture already comes before the current use.
+ } else if (DT.dominates(CurrentBB, EarliestBB)) {
+ EarliestCapture = I;
+ } else {
+ // Otherwise find the nearest common dominator and use its terminator.
+ auto *NearestCommonDom =
+ DT.findNearestCommonDominator(CurrentBB, EarliestBB);
+ EarliestCapture = NearestCommonDom->getTerminator();
+ }
+ }
+ Captured = true;
+
+ // Return false to continue analysis; we need to see all potential
+ // captures.
+ return false;
+ }
+
+ Instruction *EarliestCapture = nullptr;
+
+ const DominatorTree &DT;
+
+ bool ReturnCaptures;
+
+ bool Captured;
+
+ Function &F;
};
}
@@ -183,7 +245,8 @@ bool llvm::PointerMayBeCaptured(const Value *V,
bool llvm::PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures,
bool StoreCaptures, const Instruction *I,
const DominatorTree *DT, bool IncludeI,
- unsigned MaxUsesToExplore) {
+ unsigned MaxUsesToExplore,
+ const LoopInfo *LI) {
assert(!isa<GlobalValue>(V) &&
"It doesn't make sense to ask whether a global is captured.");
@@ -194,7 +257,7 @@ bool llvm::PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures,
// TODO: See comment in PointerMayBeCaptured regarding what could be done
// with StoreCaptures.
- CapturesBefore CB(ReturnCaptures, I, DT, IncludeI);
+ CapturesBefore CB(ReturnCaptures, I, DT, IncludeI, LI);
PointerMayBeCaptured(V, &CB, MaxUsesToExplore);
if (CB.Captured)
++NumCapturedBefore;
@@ -203,6 +266,22 @@ bool llvm::PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures,
return CB.Captured;
}
+Instruction *llvm::FindEarliestCapture(const Value *V, Function &F,
+ bool ReturnCaptures, bool StoreCaptures,
+ const DominatorTree &DT,
+ unsigned MaxUsesToExplore) {
+ assert(!isa<GlobalValue>(V) &&
+ "It doesn't make sense to ask whether a global is captured.");
+
+ EarliestCaptures CB(ReturnCaptures, F, DT);
+ PointerMayBeCaptured(V, &CB, MaxUsesToExplore);
+ if (CB.Captured)
+ ++NumCapturedBefore;
+ else
+ ++NumNotCapturedBefore;
+ return CB.EarliestCapture;
+}
+
void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker,
unsigned MaxUsesToExplore) {
assert(V->getType()->isPointerTy() && "Capture is for pointers only!");
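The captured() callback of EarliestCaptures merges capture points with a three-way rule: same block, keep the earlier instruction; one block dominates the other, keep the dominating capture; otherwise, fall back to the terminator of the nearest common dominator. A standalone sketch of that rule on a toy CFG follows; the CapturePoint type and the immediate-dominator table are illustrative stand-ins, not LLVM's DominatorTree API:

// Standalone sketch (toy types): how two capture points are merged into a
// single "earliest" point using dominance.
#include <cassert>
#include <climits>
#include <vector>

struct CapturePoint { int Block; int Pos; };   // Pos: index within the block

// Immediate-dominator table for a small CFG; Idom[entry] == entry.
static bool dominates(const std::vector<int> &Idom, int A, int B) {
  while (B != A && B != Idom[B])
    B = Idom[B];
  return B == A;
}

static int nearestCommonDominator(const std::vector<int> &Idom, int A, int B) {
  while (!dominates(Idom, A, B))
    A = Idom[A];
  return A;
}

static CapturePoint merge(const std::vector<int> &Idom, CapturePoint Earliest,
                          CapturePoint New) {
  if (Earliest.Block == New.Block)                 // same block: earlier wins
    return New.Pos < Earliest.Pos ? New : Earliest;
  if (dominates(Idom, Earliest.Block, New.Block))  // Earliest already first
    return Earliest;
  if (dominates(Idom, New.Block, Earliest.Block))  // New is strictly earlier
    return New;
  // Neither dominates: use the terminator of the nearest common dominator.
  return {nearestCommonDominator(Idom, Earliest.Block, New.Block), INT_MAX};
}

int main() {
  // CFG: 0 -> {1, 2}, 1 -> 3, 2 -> 3. Immediate dominators: all are block 0.
  std::vector<int> Idom = {0, 0, 0, 0};
  CapturePoint R = merge(Idom, {1, 2}, {2, 5});   // captures in blocks 1 and 2
  assert(R.Block == 0 && R.Pos == INT_MAX);       // terminator of block 0
  CapturePoint S = merge(Idom, {0, 7}, {3, 1});   // block 0 dominates block 3
  assert(S.Block == 0 && S.Pos == 7);
  return 0;
}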
diff --git a/llvm/lib/Analysis/CmpInstAnalysis.cpp b/llvm/lib/Analysis/CmpInstAnalysis.cpp
index a5757be2c4f4..5b951980a0aa 100644
--- a/llvm/lib/Analysis/CmpInstAnalysis.cpp
+++ b/llvm/lib/Analysis/CmpInstAnalysis.cpp
@@ -77,28 +77,28 @@ bool llvm::decomposeBitTestICmp(Value *LHS, Value *RHS,
return false;
case ICmpInst::ICMP_SLT:
// X < 0 is equivalent to (X & SignMask) != 0.
- if (!C->isNullValue())
+ if (!C->isZero())
return false;
Mask = APInt::getSignMask(C->getBitWidth());
Pred = ICmpInst::ICMP_NE;
break;
case ICmpInst::ICMP_SLE:
// X <= -1 is equivalent to (X & SignMask) != 0.
- if (!C->isAllOnesValue())
+ if (!C->isAllOnes())
return false;
Mask = APInt::getSignMask(C->getBitWidth());
Pred = ICmpInst::ICMP_NE;
break;
case ICmpInst::ICMP_SGT:
// X > -1 is equivalent to (X & SignMask) == 0.
- if (!C->isAllOnesValue())
+ if (!C->isAllOnes())
return false;
Mask = APInt::getSignMask(C->getBitWidth());
Pred = ICmpInst::ICMP_EQ;
break;
case ICmpInst::ICMP_SGE:
// X >= 0 is equivalent to (X & SignMask) == 0.
- if (!C->isNullValue())
+ if (!C->isZero())
return false;
Mask = APInt::getSignMask(C->getBitWidth());
Pred = ICmpInst::ICMP_EQ;
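The four rewrites above all rest on the same sign-bit identities; a short standalone check that verifies them exhaustively for 8-bit integers (the bit width is chosen only to keep the loop small):

// Standalone sketch (not part of the patch): exhaustively verifies the four
// sign-bit identities decomposeBitTestICmp relies on, for 8-bit integers.
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t SignMask = 0x80;
  for (int V = -128; V <= 127; ++V) {
    int8_t X = static_cast<int8_t>(V);
    uint8_t U = static_cast<uint8_t>(X);
    assert((X < 0)   == ((U & SignMask) != 0));  // slt 0  -> (X & SignMask) != 0
    assert((X <= -1) == ((U & SignMask) != 0));  // sle -1 -> (X & SignMask) != 0
    assert((X > -1)  == ((U & SignMask) == 0));  // sgt -1 -> (X & SignMask) == 0
    assert((X >= 0)  == ((U & SignMask) == 0));  // sge 0  -> (X & SignMask) == 0
  }
  return 0;
}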
diff --git a/llvm/lib/Analysis/CodeMetrics.cpp b/llvm/lib/Analysis/CodeMetrics.cpp
index 8c8e2ee6627f..27c52506352f 100644
--- a/llvm/lib/Analysis/CodeMetrics.cpp
+++ b/llvm/lib/Analysis/CodeMetrics.cpp
@@ -34,8 +34,9 @@ appendSpeculatableOperands(const Value *V,
for (const Value *Operand : U->operands())
if (Visited.insert(Operand).second)
- if (isSafeToSpeculativelyExecute(Operand))
- Worklist.push_back(Operand);
+ if (const auto *I = dyn_cast<Instruction>(Operand))
+ if (!I->mayHaveSideEffects() && !I->isTerminator())
+ Worklist.push_back(I);
}
static void completeEphemeralValues(SmallPtrSetImpl<const Value *> &Visited,
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index b28a0d6c78cd..3ed3b8902343 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -63,11 +63,6 @@
using namespace llvm;
namespace {
-Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
- ArrayRef<Constant *> Ops,
- const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- bool ForLoadOperand);
//===----------------------------------------------------------------------===//
// Constant Folding internal helper functions
@@ -357,9 +352,9 @@ Constant *llvm::ConstantFoldLoadThroughBitcast(Constant *C, Type *DestTy,
const DataLayout &DL) {
do {
Type *SrcTy = C->getType();
- uint64_t DestSize = DL.getTypeSizeInBits(DestTy);
- uint64_t SrcSize = DL.getTypeSizeInBits(SrcTy);
- if (SrcSize < DestSize)
+ TypeSize DestSize = DL.getTypeSizeInBits(DestTy);
+ TypeSize SrcSize = DL.getTypeSizeInBits(SrcTy);
+ if (!TypeSize::isKnownGE(SrcSize, DestSize))
return nullptr;
// Catch the obvious splat cases (since all-zeros can coerce non-integral
@@ -550,19 +545,16 @@ bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset, unsigned char *CurPtr,
return false;
}
-Constant *FoldReinterpretLoadFromConstPtr(Constant *C, Type *LoadTy,
- const DataLayout &DL) {
+Constant *FoldReinterpretLoadFromConst(Constant *C, Type *LoadTy,
+ int64_t Offset, const DataLayout &DL) {
// Bail out early. We do not expect to load from a scalable global variable.
if (isa<ScalableVectorType>(LoadTy))
return nullptr;
- auto *PTy = cast<PointerType>(C->getType());
auto *IntType = dyn_cast<IntegerType>(LoadTy);
// If this isn't an integer load we can't fold it directly.
if (!IntType) {
- unsigned AS = PTy->getAddressSpace();
-
// If this is a float/double load, we can try folding it as an int32/64 load
// and then bitcast the result. This can be useful for union cases. Note
// that address spaces don't matter here since we're not going to result in
@@ -580,8 +572,7 @@ Constant *FoldReinterpretLoadFromConstPtr(Constant *C, Type *LoadTy,
} else
return nullptr;
- C = FoldBitCast(C, MapTy->getPointerTo(AS), DL);
- if (Constant *Res = FoldReinterpretLoadFromConstPtr(C, MapTy, DL)) {
+ if (Constant *Res = FoldReinterpretLoadFromConst(C, MapTy, Offset, DL)) {
if (Res->isNullValue() && !LoadTy->isX86_MMXTy() &&
!LoadTy->isX86_AMXTy())
// Materializing a zero can be done trivially without a bitcast
@@ -607,19 +598,7 @@ Constant *FoldReinterpretLoadFromConstPtr(Constant *C, Type *LoadTy,
if (BytesLoaded > 32 || BytesLoaded == 0)
return nullptr;
- GlobalValue *GVal;
- APInt OffsetAI;
- if (!IsConstantOffsetFromGlobal(C, GVal, OffsetAI, DL))
- return nullptr;
-
- auto *GV = dyn_cast<GlobalVariable>(GVal);
- if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer() ||
- !GV->getInitializer()->getType()->isSized())
- return nullptr;
-
- int64_t Offset = OffsetAI.getSExtValue();
- int64_t InitializerSize =
- DL.getTypeAllocSize(GV->getInitializer()->getType()).getFixedSize();
+ int64_t InitializerSize = DL.getTypeAllocSize(C->getType()).getFixedSize();
// If we're not accessing anything in this constant, the result is undefined.
if (Offset <= -1 * static_cast<int64_t>(BytesLoaded))
@@ -640,7 +619,7 @@ Constant *FoldReinterpretLoadFromConstPtr(Constant *C, Type *LoadTy,
Offset = 0;
}
- if (!ReadDataFromGlobal(GV->getInitializer(), Offset, CurPtr, BytesLeft, DL))
+ if (!ReadDataFromGlobal(C, Offset, CurPtr, BytesLeft, DL))
return nullptr;
APInt ResultVal = APInt(IntType->getBitWidth(), 0);
@@ -661,111 +640,70 @@ Constant *FoldReinterpretLoadFromConstPtr(Constant *C, Type *LoadTy,
return ConstantInt::get(IntType->getContext(), ResultVal);
}
-Constant *ConstantFoldLoadThroughBitcastExpr(ConstantExpr *CE, Type *DestTy,
- const DataLayout &DL) {
- auto *SrcPtr = CE->getOperand(0);
- if (!SrcPtr->getType()->isPointerTy())
+/// If this Offset points exactly to the start of an aggregate element, return
+/// that element, otherwise return nullptr.
+Constant *getConstantAtOffset(Constant *Base, APInt Offset,
+ const DataLayout &DL) {
+ if (Offset.isZero())
+ return Base;
+
+ if (!isa<ConstantAggregate>(Base) && !isa<ConstantDataSequential>(Base))
+ return nullptr;
+
+ Type *ElemTy = Base->getType();
+ SmallVector<APInt> Indices = DL.getGEPIndicesForOffset(ElemTy, Offset);
+ if (!Offset.isZero() || !Indices[0].isZero())
return nullptr;
- return ConstantFoldLoadFromConstPtr(SrcPtr, DestTy, DL);
+ Constant *C = Base;
+ for (const APInt &Index : drop_begin(Indices)) {
+ if (Index.isNegative() || Index.getActiveBits() >= 32)
+ return nullptr;
+
+ C = C->getAggregateElement(Index.getZExtValue());
+ if (!C)
+ return nullptr;
+ }
+
+ return C;
}
} // end anonymous namespace
-Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, Type *Ty,
- const DataLayout &DL) {
- // First, try the easy cases:
- if (auto *GV = dyn_cast<GlobalVariable>(C))
- if (GV->isConstant() && GV->hasDefinitiveInitializer())
- return ConstantFoldLoadThroughBitcast(GV->getInitializer(), Ty, DL);
+Constant *llvm::ConstantFoldLoadFromConst(Constant *C, Type *Ty,
+ const APInt &Offset,
+ const DataLayout &DL) {
+ if (Constant *AtOffset = getConstantAtOffset(C, Offset, DL))
+ if (Constant *Result = ConstantFoldLoadThroughBitcast(AtOffset, Ty, DL))
+ return Result;
- if (auto *GA = dyn_cast<GlobalAlias>(C))
- if (GA->getAliasee() && !GA->isInterposable())
- return ConstantFoldLoadFromConstPtr(GA->getAliasee(), Ty, DL);
+ // Try hard to fold loads from bitcasted strange and non-type-safe things.
+ if (Offset.getMinSignedBits() <= 64)
+ return FoldReinterpretLoadFromConst(C, Ty, Offset.getSExtValue(), DL);
- // If the loaded value isn't a constant expr, we can't handle it.
- auto *CE = dyn_cast<ConstantExpr>(C);
- if (!CE)
- return nullptr;
+ return nullptr;
+}
- if (CE->getOpcode() == Instruction::GetElementPtr) {
- if (auto *GV = dyn_cast<GlobalVariable>(CE->getOperand(0))) {
- if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
- if (Constant *V = ConstantFoldLoadThroughGEPConstantExpr(
- GV->getInitializer(), CE, Ty, DL))
- return V;
- }
- } else {
- // Try to simplify GEP if the pointer operand wasn't a GlobalVariable.
- // SymbolicallyEvaluateGEP() with `ForLoadOperand = true` can potentially
- // simplify the GEP more than it normally would have been, but should only
- // be used for const folding loads.
- SmallVector<Constant *> Ops;
- for (unsigned I = 0, E = CE->getNumOperands(); I != E; ++I)
- Ops.push_back(cast<Constant>(CE->getOperand(I)));
- if (auto *Simplified = dyn_cast_or_null<ConstantExpr>(
- SymbolicallyEvaluateGEP(cast<GEPOperator>(CE), Ops, DL, nullptr,
- /*ForLoadOperand*/ true))) {
- // If the symbolically evaluated GEP is another GEP, we can only const
- // fold it if the resulting pointer operand is a GlobalValue. Otherwise
- // there is nothing else to simplify since the GEP is already in the
- // most simplified form.
- if (isa<GEPOperator>(Simplified)) {
- if (auto *GV = dyn_cast<GlobalVariable>(Simplified->getOperand(0))) {
- if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
- if (Constant *V = ConstantFoldLoadThroughGEPConstantExpr(
- GV->getInitializer(), Simplified, Ty, DL))
- return V;
- }
- }
- } else {
- return ConstantFoldLoadFromConstPtr(Simplified, Ty, DL);
- }
- }
- }
- }
+Constant *llvm::ConstantFoldLoadFromConst(Constant *C, Type *Ty,
+ const DataLayout &DL) {
+ return ConstantFoldLoadFromConst(C, Ty, APInt(64, 0), DL);
+}
- if (CE->getOpcode() == Instruction::BitCast)
- if (Constant *LoadedC = ConstantFoldLoadThroughBitcastExpr(CE, Ty, DL))
- return LoadedC;
-
- // Instead of loading constant c string, use corresponding integer value
- // directly if string length is small enough.
- StringRef Str;
- if (getConstantStringInfo(CE, Str) && !Str.empty()) {
- size_t StrLen = Str.size();
- unsigned NumBits = Ty->getPrimitiveSizeInBits();
- // Replace load with immediate integer if the result is an integer or fp
- // value.
- if ((NumBits >> 3) == StrLen + 1 && (NumBits & 7) == 0 &&
- (isa<IntegerType>(Ty) || Ty->isFloatingPointTy())) {
- APInt StrVal(NumBits, 0);
- APInt SingleChar(NumBits, 0);
- if (DL.isLittleEndian()) {
- for (unsigned char C : reverse(Str.bytes())) {
- SingleChar = static_cast<uint64_t>(C);
- StrVal = (StrVal << 8) | SingleChar;
- }
- } else {
- for (unsigned char C : Str.bytes()) {
- SingleChar = static_cast<uint64_t>(C);
- StrVal = (StrVal << 8) | SingleChar;
- }
- // Append NULL at the end.
- SingleChar = 0;
- StrVal = (StrVal << 8) | SingleChar;
- }
+Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, Type *Ty,
+ APInt Offset,
+ const DataLayout &DL) {
+ C = cast<Constant>(C->stripAndAccumulateConstantOffsets(
+ DL, Offset, /* AllowNonInbounds */ true));
- Constant *Res = ConstantInt::get(CE->getContext(), StrVal);
- if (Ty->isFloatingPointTy())
- Res = ConstantExpr::getBitCast(Res, Ty);
- return Res;
- }
- }
+ if (auto *GV = dyn_cast<GlobalVariable>(C))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer())
+ if (Constant *Result = ConstantFoldLoadFromConst(GV->getInitializer(), Ty,
+ Offset, DL))
+ return Result;
// If this load comes from anywhere in a constant global, and if the global
// is all undef or zero, we know what it loads.
- if (auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(CE))) {
+ if (auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(C))) {
if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
if (GV->getInitializer()->isNullValue())
return Constant::getNullValue(Ty);
@@ -774,8 +712,13 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, Type *Ty,
}
}
- // Try hard to fold loads from bitcasted strange and non-type-safe things.
- return FoldReinterpretLoadFromConstPtr(CE, Ty, DL);
+ return nullptr;
+}
+
+Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, Type *Ty,
+ const DataLayout &DL) {
+ APInt Offset(DL.getIndexTypeSizeInBits(C->getType()), 0);
+ return ConstantFoldLoadFromConstPtr(C, Ty, Offset, DL);
}
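The new entry point above first strips the pointer down to a base constant plus an accumulated byte offset, then tries to look the loaded element up inside the initializer. A standalone sketch of the offset-to-indices arithmetic for a simple nested array follows; the layout and values are made up, and the real code relies on DataLayout::getGEPIndicesForOffset rather than this hand-rolled division:

// Standalone sketch (illustrative layout): turning a byte offset into
// aggregate indices, which is the idea behind getConstantAtOffset.
// Assumed layout: int32_t A[2][3], 4-byte elements, no padding.
#include <cassert>
#include <cstdint>

int main() {
  const int32_t A[2][3] = {{10, 11, 12}, {20, 21, 22}};
  uint64_t Offset = 16;                         // byte offset into the constant
  const uint64_t RowSize = 3 * sizeof(int32_t);
  const uint64_t ElemSize = sizeof(int32_t);
  // The offset must land exactly on an element start, otherwise folding bails.
  assert(Offset % ElemSize == 0);
  uint64_t Row = Offset / RowSize;              // first index: 16 / 12 = 1
  uint64_t Col = (Offset % RowSize) / ElemSize; // second index: 4 / 4 = 1
  assert(Row == 1 && Col == 1);
  assert(A[Row][Col] == 21);                    // the load folds to 21
  return 0;
}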
namespace {
@@ -795,11 +738,11 @@ Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0, Constant *Op1,
if (Opc == Instruction::And) {
KnownBits Known0 = computeKnownBits(Op0, DL);
KnownBits Known1 = computeKnownBits(Op1, DL);
- if ((Known1.One | Known0.Zero).isAllOnesValue()) {
+ if ((Known1.One | Known0.Zero).isAllOnes()) {
// All the bits of Op0 that the 'and' could be masking are already zero.
return Op0;
}
- if ((Known0.One | Known1.Zero).isAllOnesValue()) {
+ if ((Known0.One | Known1.Zero).isAllOnes()) {
// All the bits of Op1 that the 'and' could be masking are already zero.
return Op1;
}
@@ -867,17 +810,10 @@ Constant *CastGEPIndices(Type *SrcElemTy, ArrayRef<Constant *> Ops,
}
/// Strip the pointer casts, but preserve the address space information.
-Constant *StripPtrCastKeepAS(Constant *Ptr, bool ForLoadOperand) {
+Constant *StripPtrCastKeepAS(Constant *Ptr) {
assert(Ptr->getType()->isPointerTy() && "Not a pointer type");
auto *OldPtrTy = cast<PointerType>(Ptr->getType());
Ptr = cast<Constant>(Ptr->stripPointerCasts());
- if (ForLoadOperand) {
- while (isa<GlobalAlias>(Ptr) && !cast<GlobalAlias>(Ptr)->isInterposable() &&
- !cast<GlobalAlias>(Ptr)->getBaseObject()->isInterposable()) {
- Ptr = cast<GlobalAlias>(Ptr)->getAliasee();
- }
- }
-
auto *NewPtrTy = cast<PointerType>(Ptr->getType());
// Preserve the address space number of the pointer.
@@ -893,8 +829,7 @@ Constant *StripPtrCastKeepAS(Constant *Ptr, bool ForLoadOperand) {
Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
ArrayRef<Constant *> Ops,
const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- bool ForLoadOperand) {
+ const TargetLibraryInfo *TLI) {
const GEPOperator *InnermostGEP = GEP;
bool InBounds = GEP->isInBounds();
@@ -939,7 +874,7 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
DL.getIndexedOffsetInType(
SrcElemTy,
makeArrayRef((Value * const *)Ops.data() + 1, Ops.size() - 1)));
- Ptr = StripPtrCastKeepAS(Ptr, ForLoadOperand);
+ Ptr = StripPtrCastKeepAS(Ptr);
// If this is a GEP of a GEP, fold it all into a single GEP.
while (auto *GEP = dyn_cast<GEPOperator>(Ptr)) {
@@ -961,7 +896,7 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
Ptr = cast<Constant>(GEP->getOperand(0));
SrcElemTy = GEP->getSourceElementType();
Offset += APInt(BitWidth, DL.getIndexedOffsetInType(SrcElemTy, NestedOps));
- Ptr = StripPtrCastKeepAS(Ptr, ForLoadOperand);
+ Ptr = StripPtrCastKeepAS(Ptr);
}
// If the base value for this address is a literal integer value, fold the
@@ -985,72 +920,41 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
// we eliminate over-indexing of the notional static type array bounds.
// This makes it easy to determine if the getelementptr is "inbounds".
// Also, this helps GlobalOpt do SROA on GlobalVariables.
- SmallVector<Constant *, 32> NewIdxs;
- Type *Ty = PTy;
- SrcElemTy = PTy->getElementType();
- do {
- if (!Ty->isStructTy()) {
- if (Ty->isPointerTy()) {
- // The only pointer indexing we'll do is on the first index of the GEP.
- if (!NewIdxs.empty())
- break;
+ // For GEPs of GlobalValues, use the value type even for opaque pointers.
+ // Otherwise use an i8 GEP.
+ if (auto *GV = dyn_cast<GlobalValue>(Ptr))
+ SrcElemTy = GV->getValueType();
+ else if (!PTy->isOpaque())
+ SrcElemTy = PTy->getElementType();
+ else
+ SrcElemTy = Type::getInt8Ty(Ptr->getContext());
- Ty = SrcElemTy;
+ if (!SrcElemTy->isSized())
+ return nullptr;
- // Only handle pointers to sized types, not pointers to functions.
- if (!Ty->isSized())
- return nullptr;
- } else {
- Type *NextTy = GetElementPtrInst::getTypeAtIndex(Ty, (uint64_t)0);
- if (!NextTy)
- break;
- Ty = NextTy;
- }
+ Type *ElemTy = SrcElemTy;
+ SmallVector<APInt> Indices = DL.getGEPIndicesForOffset(ElemTy, Offset);
+ if (Offset != 0)
+ return nullptr;
- // Determine which element of the array the offset points into.
- APInt ElemSize(BitWidth, DL.getTypeAllocSize(Ty));
- if (ElemSize == 0) {
- // The element size is 0. This may be [0 x Ty]*, so just use a zero
- // index for this level and proceed to the next level to see if it can
- // accommodate the offset.
- NewIdxs.push_back(ConstantInt::get(IntIdxTy, 0));
- } else {
- // The element size is non-zero divide the offset by the element
- // size (rounding down), to compute the index at this level.
- bool Overflow;
- APInt NewIdx = Offset.sdiv_ov(ElemSize, Overflow);
- if (Overflow)
- break;
- Offset -= NewIdx * ElemSize;
- NewIdxs.push_back(ConstantInt::get(IntIdxTy, NewIdx));
- }
- } else {
- auto *STy = cast<StructType>(Ty);
- // If we end up with an offset that isn't valid for this struct type, we
- // can't re-form this GEP in a regular form, so bail out. The pointer
- // operand likely went through casts that are necessary to make the GEP
- // sensible.
- const StructLayout &SL = *DL.getStructLayout(STy);
- if (Offset.isNegative() || Offset.uge(SL.getSizeInBytes()))
- break;
+ // Try to add additional zero indices to reach the desired result element
+ // type.
+ // TODO: Should we avoid extra zero indices if ResElemTy can't be reached and
+ // we'll have to insert a bitcast anyway?
+ while (ElemTy != ResElemTy) {
+ Type *NextTy = GetElementPtrInst::getTypeAtIndex(ElemTy, (uint64_t)0);
+ if (!NextTy)
+ break;
- // Determine which field of the struct the offset points into. The
- // getZExtValue is fine as we've already ensured that the offset is
- // within the range representable by the StructLayout API.
- unsigned ElIdx = SL.getElementContainingOffset(Offset.getZExtValue());
- NewIdxs.push_back(ConstantInt::get(Type::getInt32Ty(Ty->getContext()),
- ElIdx));
- Offset -= APInt(BitWidth, SL.getElementOffset(ElIdx));
- Ty = STy->getTypeAtIndex(ElIdx);
- }
- } while (Ty != ResElemTy);
+ Indices.push_back(APInt::getZero(isa<StructType>(ElemTy) ? 32 : BitWidth));
+ ElemTy = NextTy;
+ }
- // If we haven't used up the entire offset by descending the static
- // type, then the offset is pointing into the middle of an indivisible
- // member, so we can't simplify it.
- if (Offset != 0)
- return nullptr;
+ SmallVector<Constant *, 32> NewIdxs;
+ for (const APInt &Index : Indices)
+ NewIdxs.push_back(ConstantInt::get(
+ Type::getIntNTy(Ptr->getContext(), Index.getBitWidth()), Index));
// Preserve the inrange index from the innermost GEP if possible. We must
// have calculated the same indices up to and including the inrange index.
@@ -1067,8 +971,9 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
// Create a GEP.
Constant *C = ConstantExpr::getGetElementPtr(SrcElemTy, Ptr, NewIdxs,
InBounds, InRangeIndex);
- assert(C->getType()->getPointerElementType() == Ty &&
- "Computed GetElementPtr has unexpected type!");
+ assert(
+ cast<PointerType>(C->getType())->isOpaqueOrPointeeTypeMatches(ElemTy) &&
+ "Computed GetElementPtr has unexpected type!");
// If we ended up indexing a member with a type that doesn't match
// the type of what the original indices indexed, add a cast.
@@ -1099,8 +1004,7 @@ Constant *ConstantFoldInstOperandsImpl(const Value *InstOrCE, unsigned Opcode,
return ConstantFoldCastOperand(Opcode, Ops[0], DestTy, DL);
if (auto *GEP = dyn_cast<GEPOperator>(InstOrCE)) {
- if (Constant *C = SymbolicallyEvaluateGEP(GEP, Ops, DL, TLI,
- /*ForLoadOperand*/ false))
+ if (Constant *C = SymbolicallyEvaluateGEP(GEP, Ops, DL, TLI))
return C;
return ConstantExpr::getGetElementPtr(GEP->getSourceElementType(), Ops[0],
@@ -1375,21 +1279,31 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
default:
llvm_unreachable("Missing case");
case Instruction::PtrToInt:
- // If the input is a inttoptr, eliminate the pair. This requires knowing
- // the width of a pointer, so it can't be done in ConstantExpr::getCast.
if (auto *CE = dyn_cast<ConstantExpr>(C)) {
+ Constant *FoldedValue = nullptr;
+ // If the input is a inttoptr, eliminate the pair. This requires knowing
+ // the width of a pointer, so it can't be done in ConstantExpr::getCast.
if (CE->getOpcode() == Instruction::IntToPtr) {
- Constant *Input = CE->getOperand(0);
- unsigned InWidth = Input->getType()->getScalarSizeInBits();
- unsigned PtrWidth = DL.getPointerTypeSizeInBits(CE->getType());
- if (PtrWidth < InWidth) {
- Constant *Mask =
- ConstantInt::get(CE->getContext(),
- APInt::getLowBitsSet(InWidth, PtrWidth));
- Input = ConstantExpr::getAnd(Input, Mask);
+ // zext/trunc the inttoptr to pointer size.
+ FoldedValue = ConstantExpr::getIntegerCast(
+ CE->getOperand(0), DL.getIntPtrType(CE->getType()),
+ /*IsSigned=*/false);
+ } else if (auto *GEP = dyn_cast<GEPOperator>(CE)) {
+ // If we have a GEP, we can perform the following folds:
+ // (ptrtoint (gep null, x)) -> x
+ // (ptrtoint (gep (gep null, x), y)) -> x + y, etc.
+ unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
+ APInt BaseOffset(BitWidth, 0);
+ auto *Base = cast<Constant>(GEP->stripAndAccumulateConstantOffsets(
+ DL, BaseOffset, /*AllowNonInbounds=*/true));
+ if (Base->isNullValue()) {
+ FoldedValue = ConstantInt::get(CE->getContext(), BaseOffset);
}
- // Do a zext or trunc to get to the dest size.
- return ConstantExpr::getIntegerCast(Input, DestTy, false);
+ }
+ if (FoldedValue) {
+ // Do a zext or trunc to get to the ptrtoint dest size.
+ return ConstantExpr::getIntegerCast(FoldedValue, DestTy,
+ /*IsSigned=*/false);
}
}
return ConstantExpr::getCast(Opcode, C, DestTy);
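The new ptrtoint fold treats a GEP chain rooted at null as plain offset arithmetic. A standalone sketch with a toy GEP representation follows; GEPStep and foldPtrToInt are illustrative names, and only chains with constant indices are modeled:

// Standalone sketch (toy representation, not LLVM's API): ptrtoint of a GEP
// chain rooted at null folds to the accumulated constant byte offset.
#include <cassert>
#include <cstdint>
#include <vector>

struct GEPStep { uint64_t ElemSize; int64_t Index; }; // constant indices only

// Mirrors the role of stripAndAccumulateConstantOffsets for this toy form:
// walk the chain and sum Index * ElemSize for every step.
static bool foldPtrToInt(bool BaseIsNull, const std::vector<GEPStep> &Steps,
                         int64_t &FoldedValue) {
  if (!BaseIsNull)
    return false;                 // only (gep null, ...) chains fold this way
  int64_t Offset = 0;
  for (const GEPStep &S : Steps)
    Offset += S.Index * static_cast<int64_t>(S.ElemSize);
  FoldedValue = Offset;
  return true;
}

int main() {
  // (ptrtoint (gep i32, (gep i32, null, 5), 3)) -> 5*4 + 3*4 = 32
  int64_t Folded = 0;
  bool DidFold = foldPtrToInt(true, {{4, 5}, {4, 3}}, Folded);
  assert(DidFold && Folded == 32);
  return 0;
}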
@@ -1446,19 +1360,6 @@ Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C,
return ConstantFoldLoadThroughBitcast(C, Ty, DL);
}
-Constant *
-llvm::ConstantFoldLoadThroughGEPIndices(Constant *C,
- ArrayRef<Constant *> Indices) {
- // Loop over all of the operands, tracking down which value we are
- // addressing.
- for (Constant *Index : Indices) {
- C = C->getAggregateElement(Index);
- if (!C)
- return nullptr;
- }
- return C;
-}
-
//===----------------------------------------------------------------------===//
// Constant Folding for Calls
//
@@ -1879,7 +1780,7 @@ static bool mayFoldConstrained(ConstrainedFPIntrinsic *CI,
// know that its evaluation does not raise exceptions, so side effect
// is absent. To allow removing the call, mark it as not accessing memory.
if (EB && *EB != fp::ExceptionBehavior::ebIgnore)
- CI->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
+ CI->addFnAttr(Attribute::ReadNone);
return true;
}
@@ -2112,7 +2013,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
/// the host native double versions. Float versions are not called
/// directly but for all these it is true (float)(f((double)arg)) ==
/// f(arg). Long double not supported yet.
- APFloat APF = Op->getValueAPF();
+ const APFloat &APF = Op->getValueAPF();
switch (IntrinsicID) {
default: break;
@@ -2163,7 +2064,9 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
return nullptr;
LibFunc Func = NotLibFunc;
- TLI->getLibFunc(Name, Func);
+ if (!TLI->getLibFunc(Name, Func))
+ return nullptr;
+
switch (Func) {
default:
break;
@@ -2416,12 +2319,12 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
if (const auto *Op1 = dyn_cast<ConstantFP>(Operands[0])) {
if (!Ty->isFloatingPointTy())
return nullptr;
- APFloat Op1V = Op1->getValueAPF();
+ const APFloat &Op1V = Op1->getValueAPF();
if (const auto *Op2 = dyn_cast<ConstantFP>(Operands[1])) {
if (Op2->getType() != Op1->getType())
return nullptr;
- APFloat Op2V = Op2->getValueAPF();
+ const APFloat &Op2V = Op2->getValueAPF();
if (const auto *ConstrIntr = dyn_cast<ConstrainedFPIntrinsic>(Call)) {
RoundingMode RM = getEvaluationRoundingMode(ConstrIntr);
@@ -2487,7 +2390,9 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
return nullptr;
LibFunc Func = NotLibFunc;
- TLI->getLibFunc(Name, Func);
+ if (!TLI->getLibFunc(Name, Func))
+ return nullptr;
+
switch (Func) {
default:
break;
@@ -2671,7 +2576,7 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
assert(C1 && "Must be constant int");
// cttz(0, 1) and ctlz(0, 1) are undef.
- if (C1->isOneValue() && (!C0 || C0->isNullValue()))
+ if (C1->isOne() && (!C0 || C0->isZero()))
return UndefValue::get(Ty);
if (!C0)
return Constant::getNullValue(Ty);
@@ -2683,11 +2588,11 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
case Intrinsic::abs:
// Undef or minimum val operand with poison min --> undef
assert(C1 && "Must be constant int");
- if (C1->isOneValue() && (!C0 || C0->isMinSignedValue()))
+ if (C1->isOne() && (!C0 || C0->isMinSignedValue()))
return UndefValue::get(Ty);
// Undef operand with no poison min --> 0 (sign bit must be clear)
- if (C1->isNullValue() && !C0)
+ if (C1->isZero() && !C0)
return Constant::getNullValue(Ty);
return ConstantInt::get(Ty, C0->abs());
@@ -3191,7 +3096,7 @@ bool llvm::isMathLibCallNoop(const CallBase *Call,
if (!TLI || !TLI->getLibFunc(*F, Func))
return false;
- if (Call->getNumArgOperands() == 1) {
+ if (Call->arg_size() == 1) {
if (ConstantFP *OpC = dyn_cast<ConstantFP>(Call->getArgOperand(0))) {
const APFloat &Op = OpC->getValueAPF();
switch (Func) {
@@ -3280,7 +3185,7 @@ bool llvm::isMathLibCallNoop(const CallBase *Call,
}
}
- if (Call->getNumArgOperands() == 2) {
+ if (Call->arg_size() == 2) {
ConstantFP *Op0C = dyn_cast<ConstantFP>(Call->getArgOperand(0));
ConstantFP *Op1C = dyn_cast<ConstantFP>(Call->getArgOperand(1));
if (Op0C && Op1C) {
diff --git a/llvm/lib/Analysis/CostModel.cpp b/llvm/lib/Analysis/CostModel.cpp
index 83b7d5cbfc3e..f407ec0d017a 100644
--- a/llvm/lib/Analysis/CostModel.cpp
+++ b/llvm/lib/Analysis/CostModel.cpp
@@ -16,10 +16,12 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Analysis/CostModel.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -113,3 +115,23 @@ void CostModelAnalysis::print(raw_ostream &OS, const Module*) const {
}
}
}
+
+PreservedAnalyses CostModelPrinterPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ OS << "Cost Model for function '" << F.getName() << "'\n";
+ for (BasicBlock &B : F) {
+ for (Instruction &Inst : B) {
+ // TODO: Use a pass parameter instead of cl::opt CostKind to determine
+ // which cost kind to print.
+ InstructionCost Cost = TTI.getInstructionCost(&Inst, CostKind);
+ if (auto CostVal = Cost.getValue())
+ OS << "Cost Model: Found an estimated cost of " << *CostVal;
+ else
+ OS << "Cost Model: Invalid cost";
+
+ OS << " for instruction: " << Inst << "\n";
+ }
+ }
+ return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Analysis/Delinearization.cpp b/llvm/lib/Analysis/Delinearization.cpp
index 448e970e9bcc..670532c6d9a8 100644
--- a/llvm/lib/Analysis/Delinearization.cpp
+++ b/llvm/lib/Analysis/Delinearization.cpp
@@ -17,6 +17,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionDivision.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -36,6 +37,492 @@ using namespace llvm;
#define DL_NAME "delinearize"
#define DEBUG_TYPE DL_NAME
+// Return true when S contains at least one undef value.
+static inline bool containsUndefs(const SCEV *S) {
+ return SCEVExprContains(S, [](const SCEV *S) {
+ if (const auto *SU = dyn_cast<SCEVUnknown>(S))
+ return isa<UndefValue>(SU->getValue());
+ return false;
+ });
+}
+
+namespace {
+
+// Collect all steps of SCEV expressions.
+struct SCEVCollectStrides {
+ ScalarEvolution &SE;
+ SmallVectorImpl<const SCEV *> &Strides;
+
+ SCEVCollectStrides(ScalarEvolution &SE, SmallVectorImpl<const SCEV *> &S)
+ : SE(SE), Strides(S) {}
+
+ bool follow(const SCEV *S) {
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
+ Strides.push_back(AR->getStepRecurrence(SE));
+ return true;
+ }
+
+ bool isDone() const { return false; }
+};
+
+// Collect all SCEVUnknown and SCEVMulExpr expressions.
+struct SCEVCollectTerms {
+ SmallVectorImpl<const SCEV *> &Terms;
+
+ SCEVCollectTerms(SmallVectorImpl<const SCEV *> &T) : Terms(T) {}
+
+ bool follow(const SCEV *S) {
+ if (isa<SCEVUnknown>(S) || isa<SCEVMulExpr>(S) ||
+ isa<SCEVSignExtendExpr>(S)) {
+ if (!containsUndefs(S))
+ Terms.push_back(S);
+
+ // Stop recursion: once we collected a term, do not walk its operands.
+ return false;
+ }
+
+ // Keep looking.
+ return true;
+ }
+
+ bool isDone() const { return false; }
+};
+
+// Check if a SCEV contains an AddRecExpr.
+struct SCEVHasAddRec {
+ bool &ContainsAddRec;
+
+ SCEVHasAddRec(bool &ContainsAddRec) : ContainsAddRec(ContainsAddRec) {
+ ContainsAddRec = false;
+ }
+
+ bool follow(const SCEV *S) {
+ if (isa<SCEVAddRecExpr>(S)) {
+ ContainsAddRec = true;
+
+ // Stop recursion: we found an AddRec, there is no need to walk its operands.
+ return false;
+ }
+
+ // Keep looking.
+ return true;
+ }
+
+ bool isDone() const { return false; }
+};
+
+// Find factors that are multiplied with an expression that (possibly as a
+// subexpression) contains an AddRecExpr. In the expression:
+//
+// 8 * (100 + %p * %q * (%a + {0, +, 1}_loop))
+//
+// "%p * %q" are factors multiplied by the expression "(%a + {0, +, 1}_loop)"
+// that contains the AddRec {0, +, 1}_loop. %p * %q are likely to be array size
+// parameters as they form a product with an induction variable.
+//
+// This collector expects all array size parameters to be in the same MulExpr.
+// It might be necessary to later add support for collecting parameters that are
+// spread over different nested MulExpr.
+struct SCEVCollectAddRecMultiplies {
+ SmallVectorImpl<const SCEV *> &Terms;
+ ScalarEvolution &SE;
+
+ SCEVCollectAddRecMultiplies(SmallVectorImpl<const SCEV *> &T,
+ ScalarEvolution &SE)
+ : Terms(T), SE(SE) {}
+
+ bool follow(const SCEV *S) {
+ if (auto *Mul = dyn_cast<SCEVMulExpr>(S)) {
+ bool HasAddRec = false;
+ SmallVector<const SCEV *, 0> Operands;
+ for (auto Op : Mul->operands()) {
+ const SCEVUnknown *Unknown = dyn_cast<SCEVUnknown>(Op);
+ if (Unknown && !isa<CallInst>(Unknown->getValue())) {
+ Operands.push_back(Op);
+ } else if (Unknown) {
+ HasAddRec = true;
+ } else {
+ bool ContainsAddRec = false;
+ SCEVHasAddRec ContainsAddRecVisitor(ContainsAddRec);
+ visitAll(Op, ContainsAddRecVisitor);
+ HasAddRec |= ContainsAddRec;
+ }
+ }
+ if (Operands.size() == 0)
+ return true;
+
+ if (!HasAddRec)
+ return false;
+
+ Terms.push_back(SE.getMulExpr(Operands));
+ // Stop recursion: once we collected a term, do not walk its operands.
+ return false;
+ }
+
+ // Keep looking.
+ return true;
+ }
+
+ bool isDone() const { return false; }
+};
+
+} // end anonymous namespace
+
+/// Find parametric terms in this SCEVAddRecExpr. We first look for parameters in
+/// two places:
+/// 1) The strides of AddRec expressions.
+/// 2) Unknowns that are multiplied with AddRec expressions.
+void llvm::collectParametricTerms(ScalarEvolution &SE, const SCEV *Expr,
+ SmallVectorImpl<const SCEV *> &Terms) {
+ SmallVector<const SCEV *, 4> Strides;
+ SCEVCollectStrides StrideCollector(SE, Strides);
+ visitAll(Expr, StrideCollector);
+
+ LLVM_DEBUG({
+ dbgs() << "Strides:\n";
+ for (const SCEV *S : Strides)
+ dbgs() << *S << "\n";
+ });
+
+ for (const SCEV *S : Strides) {
+ SCEVCollectTerms TermCollector(Terms);
+ visitAll(S, TermCollector);
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "Terms:\n";
+ for (const SCEV *T : Terms)
+ dbgs() << *T << "\n";
+ });
+
+ SCEVCollectAddRecMultiplies MulCollector(Terms, SE);
+ visitAll(Expr, MulCollector);
+}
+
+static bool findArrayDimensionsRec(ScalarEvolution &SE,
+ SmallVectorImpl<const SCEV *> &Terms,
+ SmallVectorImpl<const SCEV *> &Sizes) {
+ int Last = Terms.size() - 1;
+ const SCEV *Step = Terms[Last];
+
+ // End of recursion.
+ if (Last == 0) {
+ if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(Step)) {
+ SmallVector<const SCEV *, 2> Qs;
+ for (const SCEV *Op : M->operands())
+ if (!isa<SCEVConstant>(Op))
+ Qs.push_back(Op);
+
+ Step = SE.getMulExpr(Qs);
+ }
+
+ Sizes.push_back(Step);
+ return true;
+ }
+
+ for (const SCEV *&Term : Terms) {
+ // Normalize the terms before the next call to findArrayDimensionsRec.
+ const SCEV *Q, *R;
+ SCEVDivision::divide(SE, Term, Step, &Q, &R);
+
+ // Bail out when GCD does not evenly divide one of the terms.
+ if (!R->isZero())
+ return false;
+
+ Term = Q;
+ }
+
+ // Remove all SCEVConstants.
+ erase_if(Terms, [](const SCEV *E) { return isa<SCEVConstant>(E); });
+
+ if (Terms.size() > 0)
+ if (!findArrayDimensionsRec(SE, Terms, Sizes))
+ return false;
+
+ Sizes.push_back(Step);
+ return true;
+}
+
+// Returns true when one of the SCEVs of Terms contains a SCEVUnknown parameter.
+static inline bool containsParameters(SmallVectorImpl<const SCEV *> &Terms) {
+ for (const SCEV *T : Terms)
+ if (SCEVExprContains(T, [](const SCEV *S) { return isa<SCEVUnknown>(S); }))
+ return true;
+
+ return false;
+}
+
+// Return the number of product terms in S.
+static inline int numberOfTerms(const SCEV *S) {
+ if (const SCEVMulExpr *Expr = dyn_cast<SCEVMulExpr>(S))
+ return Expr->getNumOperands();
+ return 1;
+}
+
+static const SCEV *removeConstantFactors(ScalarEvolution &SE, const SCEV *T) {
+ if (isa<SCEVConstant>(T))
+ return nullptr;
+
+ if (isa<SCEVUnknown>(T))
+ return T;
+
+ if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(T)) {
+ SmallVector<const SCEV *, 2> Factors;
+ for (const SCEV *Op : M->operands())
+ if (!isa<SCEVConstant>(Op))
+ Factors.push_back(Op);
+
+ return SE.getMulExpr(Factors);
+ }
+
+ return T;
+}
+
+void llvm::findArrayDimensions(ScalarEvolution &SE,
+ SmallVectorImpl<const SCEV *> &Terms,
+ SmallVectorImpl<const SCEV *> &Sizes,
+ const SCEV *ElementSize) {
+ if (Terms.size() < 1 || !ElementSize)
+ return;
+
+ // Early return when Terms do not contain parameters: we do not delinearize
+ // non parametric SCEVs.
+ if (!containsParameters(Terms))
+ return;
+
+ LLVM_DEBUG({
+ dbgs() << "Terms:\n";
+ for (const SCEV *T : Terms)
+ dbgs() << *T << "\n";
+ });
+
+ // Remove duplicates.
+ array_pod_sort(Terms.begin(), Terms.end());
+ Terms.erase(std::unique(Terms.begin(), Terms.end()), Terms.end());
+
+ // Put larger terms first.
+ llvm::sort(Terms, [](const SCEV *LHS, const SCEV *RHS) {
+ return numberOfTerms(LHS) > numberOfTerms(RHS);
+ });
+
+ // Try to divide all terms by the element size. If a term is not divisible
+ // by the element size, proceed with the original term.
+ for (const SCEV *&Term : Terms) {
+ const SCEV *Q, *R;
+ SCEVDivision::divide(SE, Term, ElementSize, &Q, &R);
+ if (!Q->isZero())
+ Term = Q;
+ }
+
+ SmallVector<const SCEV *, 4> NewTerms;
+
+ // Remove constant factors.
+ for (const SCEV *T : Terms)
+ if (const SCEV *NewT = removeConstantFactors(SE, T))
+ NewTerms.push_back(NewT);
+
+ LLVM_DEBUG({
+ dbgs() << "Terms after sorting:\n";
+ for (const SCEV *T : NewTerms)
+ dbgs() << *T << "\n";
+ });
+
+ if (NewTerms.empty() || !findArrayDimensionsRec(SE, NewTerms, Sizes)) {
+ Sizes.clear();
+ return;
+ }
+
+ // The last element to be pushed into Sizes is the size of an element.
+ Sizes.push_back(ElementSize);
+
+ LLVM_DEBUG({
+ dbgs() << "Sizes:\n";
+ for (const SCEV *S : Sizes)
+ dbgs() << *S << "\n";
+ });
+}
+
+void llvm::computeAccessFunctions(ScalarEvolution &SE, const SCEV *Expr,
+ SmallVectorImpl<const SCEV *> &Subscripts,
+ SmallVectorImpl<const SCEV *> &Sizes) {
+ // Early exit in case this SCEV is not an affine multivariate function.
+ if (Sizes.empty())
+ return;
+
+ if (auto *AR = dyn_cast<SCEVAddRecExpr>(Expr))
+ if (!AR->isAffine())
+ return;
+
+ const SCEV *Res = Expr;
+ int Last = Sizes.size() - 1;
+ for (int i = Last; i >= 0; i--) {
+ const SCEV *Q, *R;
+ SCEVDivision::divide(SE, Res, Sizes[i], &Q, &R);
+
+ LLVM_DEBUG({
+ dbgs() << "Res: " << *Res << "\n";
+ dbgs() << "Sizes[i]: " << *Sizes[i] << "\n";
+ dbgs() << "Res divided by Sizes[i]:\n";
+ dbgs() << "Quotient: " << *Q << "\n";
+ dbgs() << "Remainder: " << *R << "\n";
+ });
+
+ Res = Q;
+
+ // Do not record the last subscript corresponding to the size of elements in
+ // the array.
+ if (i == Last) {
+
+ // Bail out if the byte offset is non-zero.
+ if (!R->isZero()) {
+ Subscripts.clear();
+ Sizes.clear();
+ return;
+ }
+
+ continue;
+ }
+
+ // Record the access function for the current subscript.
+ Subscripts.push_back(R);
+ }
+
+ // Also push, in last position, the quotient of the last division: after the
+ // reversal below it becomes the access function of the outermost dimension.
+ Subscripts.push_back(Res);
+
+ std::reverse(Subscripts.begin(), Subscripts.end());
+
+ LLVM_DEBUG({
+ dbgs() << "Subscripts:\n";
+ for (const SCEV *S : Subscripts)
+ dbgs() << *S << "\n";
+ });
+}
+
+/// Splits the SCEV into two vectors of SCEVs representing the subscripts and
+/// sizes of an array access. The SCEV->delinearize algorithm computes the
+/// multiples of SCEV coefficients: that is a pattern matching of sub
+/// expressions in the stride and base of a SCEV corresponding to the
+/// computation of a GCD (greatest common divisor) of base and stride. When
+/// SCEV->delinearize fails, Subscripts and Sizes are left empty.
+///
+/// For example: when analyzing the memory access A[i][j][k] in this loop nest
+///
+/// void foo(long n, long m, long o, double A[n][m][o]) {
+///
+/// for (long i = 0; i < n; i++)
+/// for (long j = 0; j < m; j++)
+/// for (long k = 0; k < o; k++)
+/// A[i][j][k] = 1.0;
+/// }
+///
+/// the delinearization input is the following AddRec SCEV:
+///
+/// AddRec: {{{%A,+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>,+,8}<%for.k>
+///
+/// From this SCEV, we are able to say that the base offset of the access is %A
+/// because it appears as an offset that does not divide any of the strides in
+/// the loops:
+///
+/// CHECK: Base offset: %A
+///
+/// and then SCEV->delinearize determines the size of some of the dimensions of
+/// the array as these are the multiples by which the strides are happening:
+///
+/// CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of sizeof(double)
+/// bytes.
+///
+/// Note that the outermost dimension remains of UnknownSize because there are
+/// no strides that would help identifying the size of the last dimension: when
+/// the array has been statically allocated, one could compute the size of that
+/// dimension by dividing the overall size of the array by the size of the known
+/// dimensions: %m * %o * 8.
+///
+/// Finally delinearize provides the access functions for the array reference
+/// that does correspond to A[i][j][k] of the above C testcase:
+///
+/// CHECK: ArrayRef[{0,+,1}<%for.i>][{0,+,1}<%for.j>][{0,+,1}<%for.k>]
+///
+/// The testcases are checking the output of a function pass:
+/// DelinearizationPass that walks through all loads and stores of a function
+/// asking for the SCEV of the memory access with respect to all enclosing
+/// loops, calling SCEV->delinearize on that and printing the results.
+void llvm::delinearize(ScalarEvolution &SE, const SCEV *Expr,
+ SmallVectorImpl<const SCEV *> &Subscripts,
+ SmallVectorImpl<const SCEV *> &Sizes,
+ const SCEV *ElementSize) {
+ // First step: collect parametric terms.
+ SmallVector<const SCEV *, 4> Terms;
+ collectParametricTerms(SE, Expr, Terms);
+
+ if (Terms.empty())
+ return;
+
+ // Second step: find subscript sizes.
+ findArrayDimensions(SE, Terms, Sizes, ElementSize);
+
+ if (Sizes.empty())
+ return;
+
+ // Third step: compute the access functions for each subscript.
+ computeAccessFunctions(SE, Expr, Subscripts, Sizes);
+
+ if (Subscripts.empty())
+ return;
+
+ LLVM_DEBUG({
+ dbgs() << "succeeded to delinearize " << *Expr << "\n";
+ dbgs() << "ArrayDecl[UnknownSize]";
+ for (const SCEV *S : Sizes)
+ dbgs() << "[" << *S << "]";
+
+ dbgs() << "\nArrayRef";
+ for (const SCEV *S : Subscripts)
+ dbgs() << "[" << *S << "]";
+ dbgs() << "\n";
+ });
+}
+
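The division loop in computeAccessFunctions above peels off one subscript per division, innermost size first. A standalone sketch that reproduces the arithmetic for the A[i][j][k] example with made-up sizes (m = 100, o = 50, 8-byte elements):

// Standalone sketch (illustrative sizes): recovering the subscripts of
// A[i][j][k] from the flat byte offset by repeated division, as
// computeAccessFunctions does.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t M = 100, O = 50, ElemSize = 8;  // array sizes, element size
  const uint64_t I = 2, J = 3, K = 4;            // the subscripts to recover
  uint64_t Offset = ElemSize * (I * M * O + J * O + K); // flat offset = 81232

  uint64_t Res = Offset;
  assert(Res % ElemSize == 0);      // divide by the element size first
  Res /= ElemSize;                  // 10154
  uint64_t SubK = Res % O;          // remainder of the division by o -> k
  Res /= O;                         // 203
  uint64_t SubJ = Res % M;          // remainder of the division by m -> j
  Res /= M;                         // final quotient -> i (outermost)
  uint64_t SubI = Res;

  assert(SubI == I && SubJ == J && SubK == K);
  return 0;
}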
+bool llvm::getIndexExpressionsFromGEP(ScalarEvolution &SE,
+ const GetElementPtrInst *GEP,
+ SmallVectorImpl<const SCEV *> &Subscripts,
+ SmallVectorImpl<int> &Sizes) {
+ assert(Subscripts.empty() && Sizes.empty() &&
+ "Expected output lists to be empty on entry to this function.");
+ assert(GEP && "getIndexExpressionsFromGEP called with a null GEP");
+ Type *Ty = nullptr;
+ bool DroppedFirstDim = false;
+ for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
+ const SCEV *Expr = SE.getSCEV(GEP->getOperand(i));
+ if (i == 1) {
+ Ty = GEP->getSourceElementType();
+ if (auto *Const = dyn_cast<SCEVConstant>(Expr))
+ if (Const->getValue()->isZero()) {
+ DroppedFirstDim = true;
+ continue;
+ }
+ Subscripts.push_back(Expr);
+ continue;
+ }
+
+ auto *ArrayTy = dyn_cast<ArrayType>(Ty);
+ if (!ArrayTy) {
+ Subscripts.clear();
+ Sizes.clear();
+ return false;
+ }
+
+ Subscripts.push_back(Expr);
+ if (!(DroppedFirstDim && i == 2))
+ Sizes.push_back(ArrayTy->getNumElements());
+
+ Ty = ArrayTy->getElementType();
+ }
+ return !Subscripts.empty();
+}
+
namespace {
class Delinearization : public FunctionPass {
@@ -84,7 +571,7 @@ void printDelinearization(raw_ostream &O, Function *F, LoopInfo *LI,
O << "AccessFunction: " << *AccessFn << "\n";
SmallVector<const SCEV *, 3> Subscripts, Sizes;
- SE->delinearize(AccessFn, Subscripts, Sizes, SE->getElementSize(&Inst));
+ delinearize(*SE, AccessFn, Subscripts, Sizes, SE->getElementSize(&Inst));
if (Subscripts.size() == 0 || Sizes.size() == 0 ||
Subscripts.size() != Sizes.size()) {
O << "failed to delinearize\n";
diff --git a/llvm/lib/Analysis/DemandedBits.cpp b/llvm/lib/Analysis/DemandedBits.cpp
index ca6d58fac825..117b12fc0701 100644
--- a/llvm/lib/Analysis/DemandedBits.cpp
+++ b/llvm/lib/Analysis/DemandedBits.cpp
@@ -362,7 +362,7 @@ void DemandedBits::performAnalysis() {
if (Instruction *J = dyn_cast<Instruction>(OI)) {
Type *T = J->getType();
if (T->isIntOrIntVectorTy())
- AliveBits[J] = APInt::getAllOnesValue(T->getScalarSizeInBits());
+ AliveBits[J] = APInt::getAllOnes(T->getScalarSizeInBits());
else
Visited.insert(J);
Worklist.insert(J);
@@ -407,7 +407,7 @@ void DemandedBits::performAnalysis() {
Type *T = OI->getType();
if (T->isIntOrIntVectorTy()) {
unsigned BitWidth = T->getScalarSizeInBits();
- APInt AB = APInt::getAllOnesValue(BitWidth);
+ APInt AB = APInt::getAllOnes(BitWidth);
if (InputIsKnownDead) {
AB = APInt(BitWidth, 0);
} else {
@@ -417,7 +417,7 @@ void DemandedBits::performAnalysis() {
Known, Known2, KnownBitsComputed);
// Keep track of uses which have no demanded bits.
- if (AB.isNullValue())
+ if (AB.isZero())
DeadUses.insert(&OI);
else
DeadUses.erase(&OI);
@@ -448,8 +448,7 @@ APInt DemandedBits::getDemandedBits(Instruction *I) {
return Found->second;
const DataLayout &DL = I->getModule()->getDataLayout();
- return APInt::getAllOnesValue(
- DL.getTypeSizeInBits(I->getType()->getScalarType()));
+ return APInt::getAllOnes(DL.getTypeSizeInBits(I->getType()->getScalarType()));
}
APInt DemandedBits::getDemandedBits(Use *U) {
@@ -461,7 +460,7 @@ APInt DemandedBits::getDemandedBits(Use *U) {
// We only track integer uses, everything else produces a mask with all bits
// set
if (!T->isIntOrIntVectorTy())
- return APInt::getAllOnesValue(BitWidth);
+ return APInt::getAllOnes(BitWidth);
if (isUseDead(U))
return APInt(BitWidth, 0);
@@ -469,7 +468,7 @@ APInt DemandedBits::getDemandedBits(Use *U) {
performAnalysis();
APInt AOut = getDemandedBits(UserI);
- APInt AB = APInt::getAllOnesValue(BitWidth);
+ APInt AB = APInt::getAllOnes(BitWidth);
KnownBits Known, Known2;
bool KnownBitsComputed = false;
@@ -504,7 +503,7 @@ bool DemandedBits::isUseDead(Use *U) {
// is dead. These uses might not be explicitly present in the DeadUses map.
if (UserI->getType()->isIntOrIntVectorTy()) {
auto Found = AliveBits.find(UserI);
- if (Found != AliveBits.end() && Found->second.isNullValue())
+ if (Found != AliveBits.end() && Found->second.isZero())
return true;
}
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index 9564cfb2aa45..f827f74d5367 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -53,6 +53,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Delinearization.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -119,6 +120,11 @@ static cl::opt<bool> DisableDelinearizationChecks(
"dependence vectors for languages that allow the subscript of one "
"dimension to underflow or overflow into another dimension."));
+static cl::opt<unsigned> MIVMaxLevelThreshold(
+ "da-miv-max-level-threshold", cl::init(7), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Maximum depth allowed for the recursive algorithm used to "
+ "explore MIV direction vectors."));
+
//===----------------------------------------------------------------------===//
// basics
@@ -2319,7 +2325,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
LLVM_DEBUG(dbgs() << "starting gcd\n");
++GCDapplications;
unsigned BitWidth = SE->getTypeSizeInBits(Src->getType());
- APInt RunningGCD = APInt::getNullValue(BitWidth);
+ APInt RunningGCD = APInt::getZero(BitWidth);
// Examine Src coefficients.
// Compute running GCD and record source constant.
@@ -2359,7 +2365,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,
}
const SCEV *DstConst = Coefficients;
- APInt ExtraGCD = APInt::getNullValue(BitWidth);
+ APInt ExtraGCD = APInt::getZero(BitWidth);
const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
LLVM_DEBUG(dbgs() << " Delta = " << *Delta << "\n");
const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Delta);
@@ -2602,6 +2608,19 @@ unsigned DependenceInfo::exploreDirections(unsigned Level, CoefficientInfo *A,
const SmallBitVector &Loops,
unsigned &DepthExpanded,
const SCEV *Delta) const {
+ // This algorithm has worst case complexity of O(3^n), where 'n' is the number
+ // of common loop levels. To avoid excessive compile-time, pessimize all the
+ // results and immediately return when the number of common levels is beyond
+ // the given threshold.
+ if (CommonLevels > MIVMaxLevelThreshold) {
+ LLVM_DEBUG(dbgs() << "Number of common levels exceeded the threshold. MIV "
+ "direction exploration is terminated.\n");
+ for (unsigned K = 1; K <= CommonLevels; ++K)
+ if (Loops[K])
+ Bound[K].DirSet = Dependence::DVEntry::ALL;
+ return 1;
+ }
+
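As the comment above notes, each common loop level can take one of three directions, so the exploration can visit up to 3^Levels direction vectors; the cl::init(7) default caps that at 2187. A trivial standalone sketch of the growth:

// Standalone sketch (not part of the patch): each level contributes a factor
// of three ('<', '=', '>'), so the search space is 3^Levels.
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Combinations = 1;
  for (unsigned Levels = 1; Levels <= 10; ++Levels) {
    Combinations *= 3;
    std::printf("levels=%2u -> up to %llu direction vectors\n", Levels,
                (unsigned long long)Combinations);
  }
  return 0; // levels=7 -> 2187, levels=10 -> 59049
}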
if (Level > CommonLevels) {
// record result
LLVM_DEBUG(dbgs() << "\t[");
@@ -3320,8 +3339,8 @@ bool DependenceInfo::tryDelinearizeFixedSize(
return false;
SmallVector<int, 4> SrcSizes, DstSizes;
- SE->getIndexExpressionsFromGEP(SrcGEP, SrcSubscripts, SrcSizes);
- SE->getIndexExpressionsFromGEP(DstGEP, DstSubscripts, DstSizes);
+ getIndexExpressionsFromGEP(*SE, SrcGEP, SrcSubscripts, SrcSizes);
+ getIndexExpressionsFromGEP(*SE, DstGEP, DstSubscripts, DstSizes);
// Check that the two size arrays are non-empty and equal in length and
// value.
@@ -3421,16 +3440,16 @@ bool DependenceInfo::tryDelinearizeParametricSize(
// First step: collect parametric terms in both array references.
SmallVector<const SCEV *, 4> Terms;
- SE->collectParametricTerms(SrcAR, Terms);
- SE->collectParametricTerms(DstAR, Terms);
+ collectParametricTerms(*SE, SrcAR, Terms);
+ collectParametricTerms(*SE, DstAR, Terms);
// Second step: find subscript sizes.
SmallVector<const SCEV *, 4> Sizes;
- SE->findArrayDimensions(Terms, Sizes, ElementSize);
+ findArrayDimensions(*SE, Terms, Sizes, ElementSize);
// Third step: compute the access functions for each subscript.
- SE->computeAccessFunctions(SrcAR, SrcSubscripts, Sizes);
- SE->computeAccessFunctions(DstAR, DstSubscripts, Sizes);
+ computeAccessFunctions(*SE, SrcAR, SrcSubscripts, Sizes);
+ computeAccessFunctions(*SE, DstAR, DstSubscripts, Sizes);
// Fail when there is only a subscript: that's a linearized access function.
if (SrcSubscripts.size() < 2 || DstSubscripts.size() < 2 ||
diff --git a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp
index ecfefa36918c..d87fa849d839 100644
--- a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp
+++ b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp
@@ -1,9 +1,8 @@
//===- DevelopmentModeInlineAdvisor.cpp - runtime-loadable model runner --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -228,6 +227,8 @@ private:
(*CallerSizeEstimateBefore + *CalleeSizeEstimateBefore);
getAdvisor()->updateNativeSizeEstimate(Reward);
log(Reward, /*Success=*/true);
+ } else {
+ log(NoReward, /*Success=*/true);
}
}
@@ -377,7 +378,7 @@ void TrainingLogger::logInlineEvent(const InlineEvent &Event,
void TrainingLogger::print() {
std::error_code EC;
raw_fd_ostream OutFile(LogFileName, EC);
- L->print(OutFile);
+ L->flush(OutFile);
}
DevelopmentModeMLInlineAdvisor::DevelopmentModeMLInlineAdvisor(
diff --git a/llvm/lib/Analysis/HeatUtils.cpp b/llvm/lib/Analysis/HeatUtils.cpp
index a1a11be5fee3..0057de322cac 100644
--- a/llvm/lib/Analysis/HeatUtils.cpp
+++ b/llvm/lib/Analysis/HeatUtils.cpp
@@ -1,9 +1,8 @@
//===-- HeatUtils.cpp - Utility for printing heat colors --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp
index a6298afb66f5..f22c6aa04f5e 100644
--- a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp
+++ b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp
@@ -23,13 +23,23 @@
using namespace llvm;
using namespace IRSimilarity;
+cl::opt<bool>
+ DisableBranches("no-ir-sim-branch-matching", cl::init(false),
+ cl::ReallyHidden,
+ cl::desc("disable similarity matching, and outlining, "
+ "across branches for debugging purposes."));
+
IRInstructionData::IRInstructionData(Instruction &I, bool Legality,
IRInstructionDataList &IDList)
: Inst(&I), Legal(Legality), IDL(&IDList) {
+ initializeInstruction();
+}
+
+void IRInstructionData::initializeInstruction() {
// We check whether we have a comparison instruction. If so, we
// find the "less than" version of the predicate for consistency for
// comparison instructions throughout the program.
- if (CmpInst *C = dyn_cast<CmpInst>(&I)) {
+ if (CmpInst *C = dyn_cast<CmpInst>(Inst)) {
CmpInst::Predicate Predicate = predicateForConsistency(C);
if (Predicate != C->getPredicate())
RevisedPredicate = Predicate;
@@ -37,8 +47,8 @@ IRInstructionData::IRInstructionData(Instruction &I, bool Legality,
// Here we collect the operands and their types for determining whether
// the structure of the operand use matches between two different candidates.
- for (Use &OI : I.operands()) {
- if (isa<CmpInst>(I) && RevisedPredicate.hasValue()) {
+ for (Use &OI : Inst->operands()) {
+ if (isa<CmpInst>(Inst) && RevisedPredicate.hasValue()) {
// If we have a CmpInst where the predicate is reversed, it means the
// operands must be reversed as well.
OperVals.insert(OperVals.begin(), OI.get());
@@ -49,6 +59,33 @@ IRInstructionData::IRInstructionData(Instruction &I, bool Legality,
}
}
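To make the operand handling above concrete (hypothetical value names): an icmp sgt %a, %b is canonicalized to its "less than" form, and because the predicate was reversed the operands are recorded in reverse as well, so OperVals ends up holding {%b, %a} rather than {%a, %b}.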
+IRInstructionData::IRInstructionData(IRInstructionDataList &IDList)
+ : Inst(nullptr), Legal(false), IDL(&IDList) {}
+
+void IRInstructionData::setBranchSuccessors(
+ DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger) {
+ assert(isa<BranchInst>(Inst) && "Instruction must be branch");
+
+ BranchInst *BI = cast<BranchInst>(Inst);
+ DenseMap<BasicBlock *, unsigned>::iterator BBNumIt;
+
+ BBNumIt = BasicBlockToInteger.find(BI->getParent());
+ assert(BBNumIt != BasicBlockToInteger.end() &&
+ "Could not find location for BasicBlock!");
+
+ int CurrentBlockNumber = static_cast<int>(BBNumIt->second);
+
+ for (BasicBlock *Successor : BI->successors()) {
+ BBNumIt = BasicBlockToInteger.find(Successor);
+ assert(BBNumIt != BasicBlockToInteger.end() &&
+ "Could not find number for BasicBlock!");
+ int OtherBlockNumber = static_cast<int>(BBNumIt->second);
+
+ int Relative = OtherBlockNumber - CurrentBlockNumber;
+ RelativeBlockLocations.push_back(Relative);
+ }
+}
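To make the relative numbering above concrete, a minimal standalone sketch (plain std:: containers and hypothetical block names stand in for the LLVM types used in this patch):

  #include <iostream>
  #include <map>
  #include <string>
  #include <vector>

  int main() {
    // The mapper is assumed to have already assigned every block an integer.
    std::map<std::string, unsigned> BlockNumber = {
        {"entry", 0}, {"then", 1}, {"else", 2}, {"exit", 3}};

    // A conditional branch that lives in "entry" and targets "then"/"else".
    std::string Parent = "entry";
    std::vector<std::string> Successors = {"then", "else"};

    std::vector<int> RelativeBlockLocations;
    int Current = static_cast<int>(BlockNumber.at(Parent));
    for (const std::string &S : Successors)
      RelativeBlockLocations.push_back(static_cast<int>(BlockNumber.at(S)) -
                                       Current);

    // Prints "1 2": the successors sit one and two blocks after the branch.
    for (int R : RelativeBlockLocations)
      std::cout << R << ' ';
    std::cout << '\n';
  }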
+
CmpInst::Predicate IRInstructionData::predicateForConsistency(CmpInst *CI) {
switch (CI->getPredicate()) {
case CmpInst::FCMP_OGT:
@@ -143,6 +180,10 @@ bool IRSimilarity::isClose(const IRInstructionData &A,
return false;
}
+ if (isa<BranchInst>(A.Inst) && isa<BranchInst>(B.Inst) &&
+ A.RelativeBlockLocations.size() != B.RelativeBlockLocations.size())
+ return false;
+
return true;
}
@@ -156,10 +197,6 @@ void IRInstructionMapper::convertToUnsignedVec(
std::vector<unsigned> IntegerMappingForBB;
std::vector<IRInstructionData *> InstrListForBB;
- HaveLegalRange = false;
- CanCombineWithPrevInstr = false;
- AddedIllegalLastTime = true;
-
for (BasicBlock::iterator Et = BB.end(); It != Et; ++It) {
switch (InstClassifier.visit(*It)) {
case InstrType::Legal:
@@ -175,7 +212,8 @@ void IRInstructionMapper::convertToUnsignedVec(
}
if (HaveLegalRange) {
- mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, true);
+ if (AddedIllegalLastTime)
+ mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, true);
for (IRInstructionData *ID : InstrListForBB)
this->IDL->push_back(*ID);
llvm::append_range(InstrList, InstrListForBB);
@@ -203,6 +241,9 @@ unsigned IRInstructionMapper::mapToLegalUnsigned(
IRInstructionData *ID = allocateIRInstructionData(*It, true, *IDL);
InstrListForBB.push_back(ID);
+ if (isa<BranchInst>(*It))
+ ID->setBranchSuccessors(BasicBlockToInteger);
+
// Add to the instruction list
bool WasInserted;
DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>::iterator
@@ -235,6 +276,11 @@ IRInstructionMapper::allocateIRInstructionData(Instruction &I, bool Legality,
return new (InstDataAllocator->Allocate()) IRInstructionData(I, Legality, IDL);
}
+IRInstructionData *
+IRInstructionMapper::allocateIRInstructionData(IRInstructionDataList &IDL) {
+ return new (InstDataAllocator->Allocate()) IRInstructionData(IDL);
+}
+
IRInstructionDataList *
IRInstructionMapper::allocateIRInstructionDataList() {
return new (IDLAllocator->Allocate()) IRInstructionDataList();
@@ -255,6 +301,8 @@ unsigned IRInstructionMapper::mapToIllegalUnsigned(
IRInstructionData *ID = nullptr;
if (!End)
ID = allocateIRInstructionData(*It, false, *IDL);
+ else
+ ID = allocateIRInstructionData(*IDL);
InstrListForBB.push_back(ID);
// Remember that we added an illegal number last time.
@@ -563,8 +611,50 @@ bool IRSimilarityCandidate::compareCommutativeOperandMapping(
return true;
}
+bool IRSimilarityCandidate::checkRelativeLocations(RelativeLocMapping A,
+ RelativeLocMapping B) {
+ // Get the basic blocks the label refers to.
+ BasicBlock *ABB = static_cast<BasicBlock *>(A.OperVal);
+ BasicBlock *BBB = static_cast<BasicBlock *>(B.OperVal);
+
+ // Get the basic blocks contained in each region.
+ DenseSet<BasicBlock *> BasicBlockA;
+ DenseSet<BasicBlock *> BasicBlockB;
+ A.IRSC.getBasicBlocks(BasicBlockA);
+ B.IRSC.getBasicBlocks(BasicBlockB);
+
+ // Determine if the block is contained in the region.
+ bool AContained = BasicBlockA.contains(ABB);
+ bool BContained = BasicBlockB.contains(BBB);
+
+ // Both blocks need to be contained in the region, or both need to be outside
+ // the region.
+ if (AContained != BContained)
+ return false;
+
+ // If both are contained, then we need to make sure that the relative
+ // distances to the target blocks are the same.
+ if (AContained)
+ return A.RelativeLocation == B.RelativeLocation;
+ return true;
+}
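A hedged, standalone illustration of the containment rule above (plain bools and ints replace the candidate and region types):

  // Both targets inside their regions: the relative offsets must agree.
  // Both targets outside: accepted here, matched later via value numbering.
  // One inside, one outside: never a match.
  static bool relativeLocationsMatch(bool AContained, bool BContained,
                                     int RelA, int RelB) {
    if (AContained != BContained)
      return false;
    if (AContained)
      return RelA == RelB;
    return true;
  }

  int main() {
    bool SameOffset = relativeLocationsMatch(true, true, +1, +1);    // true
    bool DiffOffset = relativeLocationsMatch(true, true, +1, +2);    // false
    bool Mixed = relativeLocationsMatch(true, false, +1, +3);        // false
    bool BothOutside = relativeLocationsMatch(false, false, -2, +5); // true
    return (SameOffset && !DiffOffset && !Mixed && BothOutside) ? 0 : 1;
  }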
+
bool IRSimilarityCandidate::compareStructure(const IRSimilarityCandidate &A,
const IRSimilarityCandidate &B) {
+ DenseMap<unsigned, DenseSet<unsigned>> MappingA;
+ DenseMap<unsigned, DenseSet<unsigned>> MappingB;
+ return IRSimilarityCandidate::compareStructure(A, B, MappingA, MappingB);
+}
+
+typedef detail::zippy<detail::zip_shortest, SmallVector<int, 4> &,
+ SmallVector<int, 4> &, ArrayRef<Value *> &,
+ ArrayRef<Value *> &>
+ ZippedRelativeLocationsT;
+
+bool IRSimilarityCandidate::compareStructure(
+ const IRSimilarityCandidate &A, const IRSimilarityCandidate &B,
+ DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA,
+ DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB) {
if (A.getLength() != B.getLength())
return false;
@@ -574,15 +664,12 @@ bool IRSimilarityCandidate::compareStructure(const IRSimilarityCandidate &A,
iterator ItA = A.begin();
iterator ItB = B.begin();
- // These sets create a create a mapping between the values in one candidate
- // to values in the other candidate. If we create a set with one element,
- // and that same element maps to the original element in the candidate
- // we have a good mapping.
- DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingA;
- DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingB;
+ // These ValueNumber mapping sets create a mapping from the values in one
+ // candidate to the values in the other candidate. If we create a set with
+ // one element, and that same element maps to the original element in the
+ // candidate, we have a good mapping.
DenseMap<unsigned, DenseSet<unsigned>>::iterator ValueMappingIt;
- bool WasInserted;
// Iterate over the instructions contained in each candidate
unsigned SectionLength = A.getStartIdx() + A.getLength();
@@ -605,6 +692,7 @@ bool IRSimilarityCandidate::compareStructure(const IRSimilarityCandidate &A,
unsigned InstValA = A.ValueToNumber.find(IA)->second;
unsigned InstValB = B.ValueToNumber.find(IB)->second;
+ bool WasInserted;
// Ensure that the mappings for the instructions exists.
std::tie(ValueMappingIt, WasInserted) = ValueNumberMappingA.insert(
std::make_pair(InstValA, DenseSet<unsigned>({InstValB})));
@@ -632,6 +720,37 @@ bool IRSimilarityCandidate::compareStructure(const IRSimilarityCandidate &A,
{A, OperValsA, ValueNumberMappingA},
{B, OperValsB, ValueNumberMappingB}))
return false;
+
+ // Here we check that, between two corresponding instructions, references
+ // to a basic block inside the region have the same relative locations,
+ // and that references to basic blocks outside the region land in the
+ // same corresponding locations.
+
+ // We are able to make this assumption about blocks outside of the region
+ // because the target block labels are considered values and follow the
+ // same value-number matching that we defined for the other instructions
+ // in the region. So wherever we target a specific block outside the
+ // region, we are targeting a corresponding block in each analogous
+ // location of the region we are comparing to.
+ if (!(isa<BranchInst>(IA) && isa<BranchInst>(IB)) &&
+ !(isa<PHINode>(IA) && isa<PHINode>(IB)))
+ continue;
+
+ SmallVector<int, 4> &RelBlockLocsA = ItA->RelativeBlockLocations;
+ SmallVector<int, 4> &RelBlockLocsB = ItB->RelativeBlockLocations;
+ if (RelBlockLocsA.size() != RelBlockLocsB.size() &&
+ OperValsA.size() != OperValsB.size())
+ return false;
+
+ ZippedRelativeLocationsT ZippedRelativeLocations =
+ zip(RelBlockLocsA, RelBlockLocsB, OperValsA, OperValsB);
+ if (any_of(ZippedRelativeLocations,
+ [&A, &B](std::tuple<int, int, Value *, Value *> R) {
+ return !checkRelativeLocations(
+ {A, std::get<0>(R), std::get<2>(R)},
+ {B, std::get<1>(R), std::get<3>(R)});
+ }))
+ return false;
}
return true;
}
@@ -657,6 +776,8 @@ void IRSimilarityIdentifier::populateMapper(
std::vector<unsigned> IntegerMappingForModule;
// Iterate over the functions in the module to map each Instruction in each
// BasicBlock to an unsigned integer.
+ Mapper.initializeForBBs(M);
+
for (Function &F : M) {
if (F.empty())
@@ -664,15 +785,18 @@ void IRSimilarityIdentifier::populateMapper(
for (BasicBlock &BB : F) {
- if (BB.sizeWithoutDebug() < 2)
- continue;
-
// Map each Instruction in the BasicBlock to an unsigned integer so that the
// block can be checked for similarity against regions in other blocks.
Mapper.convertToUnsignedVec(BB, InstrListForModule,
IntegerMappingForModule);
}
+
+ BasicBlock::iterator It = F.begin()->end();
+ Mapper.mapToIllegalUnsigned(It, IntegerMappingForModule, InstrListForModule,
+ true);
+ if (InstrListForModule.size() > 0)
+ Mapper.IDL->push_back(*InstrListForModule.back());
}
// Insert the InstrListForModule at the end of the overall InstrList so that
@@ -707,6 +831,8 @@ static void createCandidatesFromSuffixTree(
std::vector<IRSimilarityCandidate> &CandsForRepSubstring) {
unsigned StringLen = RS.Length;
+ if (StringLen < 2)
+ return;
// Create an IRSimilarityCandidate for each instance of this subsequence \p RS.
for (const unsigned &StartIdx : RS.StartIndices) {
@@ -739,6 +865,84 @@ static void createCandidatesFromSuffixTree(
}
}
+void IRSimilarityCandidate::createCanonicalRelationFrom(
+ IRSimilarityCandidate &SourceCand,
+ DenseMap<unsigned, DenseSet<unsigned>> &ToSourceMapping,
+ DenseMap<unsigned, DenseSet<unsigned>> &FromSourceMapping) {
+ assert(SourceCand.CanonNumToNumber.size() != 0 &&
+ "Base canonical relationship is empty!");
+ assert(SourceCand.NumberToCanonNum.size() != 0 &&
+ "Base canonical relationship is empty!");
+
+ assert(CanonNumToNumber.size() == 0 && "Canonical Relationship is non-empty");
+ assert(NumberToCanonNum.size() == 0 && "Canonical Relationship is non-empty");
+
+ DenseSet<unsigned> UsedGVNs;
+ // Iterate over the mappings provided from this candidate to SourceCand. We
+ // are then able to map the GVN in this candidate to the same canonical number
+ // given to the corresponding GVN in SourceCand.
+ for (std::pair<unsigned, DenseSet<unsigned>> &GVNMapping : ToSourceMapping) {
+ unsigned SourceGVN = GVNMapping.first;
+
+ assert(GVNMapping.second.size() != 0 && "Possible GVNs is 0!");
+
+ unsigned ResultGVN;
+ // We need special handling if we have more than one potential value. This
+ // means that there are at least two GVNs that could correspond to this GVN.
+ // This could lead to potential swapping later on, so we make a decision
+ // here to ensure a one-to-one mapping.
+ if (GVNMapping.second.size() > 1) {
+ bool Found = false;
+ for (unsigned Val : GVNMapping.second) {
+ // We make sure the target value number hasn't already been reserved.
+ if (UsedGVNs.contains(Val))
+ continue;
+
+ // We make sure that the opposite mapping is still consistent.
+ DenseMap<unsigned, DenseSet<unsigned>>::iterator It =
+ FromSourceMapping.find(Val);
+
+ if (!It->second.contains(SourceGVN))
+ continue;
+
+ // We pick the first item that satisfies these conditions.
+ Found = true;
+ ResultGVN = Val;
+ break;
+ }
+
+ assert(Found && "Could not find matching value for source GVN");
+ (void)Found;
+
+ } else
+ ResultGVN = *GVNMapping.second.begin();
+
+ // Whatever GVN is found, we mark it as used.
+ UsedGVNs.insert(ResultGVN);
+
+ unsigned CanonNum = *SourceCand.getCanonicalNum(ResultGVN);
+ CanonNumToNumber.insert(std::make_pair(CanonNum, SourceGVN));
+ NumberToCanonNum.insert(std::make_pair(SourceGVN, CanonNum));
+ }
+}
+
+void IRSimilarityCandidate::createCanonicalMappingFor(
+ IRSimilarityCandidate &CurrCand) {
+ assert(CurrCand.CanonNumToNumber.size() == 0 &&
+ "Canonical Relationship is non-empty");
+ assert(CurrCand.NumberToCanonNum.size() == 0 &&
+ "Canonical Relationship is non-empty");
+
+ unsigned CanonNum = 0;
+ // Iterate over the value numbers found; the order does not matter in this
+ // case.
+ for (std::pair<unsigned, Value *> &NumToVal : CurrCand.NumberToValue) {
+ CurrCand.NumberToCanonNum.insert(std::make_pair(NumToVal.first, CanonNum));
+ CurrCand.CanonNumToNumber.insert(std::make_pair(CanonNum, NumToVal.first));
+ CanonNum++;
+ }
+}
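For intuition, a minimal sketch of the canonical numbering produced above (hypothetical global value numbers; std::map stands in for DenseMap, and as the comment notes the iteration order is irrelevant):

  #include <cassert>
  #include <map>

  int main() {
    // Suppose the candidate's global value numbers are 7, 9 and 12.
    std::map<unsigned, const char *> NumberToValue = {
        {7, "%a"}, {9, "%b"}, {12, "%cmp"}};

    std::map<unsigned, unsigned> NumberToCanonNum, CanonNumToNumber;
    unsigned CanonNum = 0;
    for (const auto &NumToVal : NumberToValue) {
      NumberToCanonNum[NumToVal.first] = CanonNum;
      CanonNumToNumber[CanonNum] = NumToVal.first;
      ++CanonNum;
    }

    // 7 -> 0, 9 -> 1, 12 -> 2, and the inverse map round-trips.
    assert(NumberToCanonNum[12] == 2 && CanonNumToNumber[2] == 12);
    return 0;
  }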
+
/// From the list of IRSimilarityCandidates, perform a comparison between each
/// IRSimilarityCandidate to determine if there are overlapping
/// IRInstructionData, or if they do not have the same structure.
@@ -774,6 +978,8 @@ static void findCandidateStructures(
// Iterate over the candidates to determine its structural and overlapping
// compatibility with other instructions
+ DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingA;
+ DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingB;
for (CandIt = CandsForRepSubstring.begin(),
CandEndIt = CandsForRepSubstring.end();
CandIt != CandEndIt; CandIt++) {
@@ -792,9 +998,11 @@ static void findCandidateStructures(
// Check if we already have a list of IRSimilarityCandidates for the current
// structural group. Create one if one does not exist.
CurrentGroupPair = StructuralGroups.find(OuterGroupNum);
- if (CurrentGroupPair == StructuralGroups.end())
+ if (CurrentGroupPair == StructuralGroups.end()) {
+ IRSimilarityCandidate::createCanonicalMappingFor(*CandIt);
std::tie(CurrentGroupPair, Inserted) = StructuralGroups.insert(
std::make_pair(OuterGroupNum, SimilarityGroup({*CandIt})));
+ }
// Iterate over the IRSimilarityCandidates following the current
// IRSimilarityCandidate in the list to determine whether the two
@@ -811,11 +1019,15 @@ static void findCandidateStructures(
// Otherwise we determine if they have the same structure and add it to
// vector if they match.
- SameStructure =
- IRSimilarityCandidate::compareStructure(*CandIt, *InnerCandIt);
+ ValueNumberMappingA.clear();
+ ValueNumberMappingB.clear();
+ SameStructure = IRSimilarityCandidate::compareStructure(
+ *CandIt, *InnerCandIt, ValueNumberMappingA, ValueNumberMappingB);
if (!SameStructure)
continue;
+ InnerCandIt->createCanonicalRelationFrom(*CandIt, ValueNumberMappingA,
+ ValueNumberMappingB);
CandToGroup.insert(std::make_pair(&*InnerCandIt, OuterGroupNum));
CurrentGroupPair->second.push_back(*InnerCandIt);
}
@@ -862,6 +1074,7 @@ SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(
std::vector<IRInstructionData *> InstrList;
std::vector<unsigned> IntegerMapping;
+ Mapper.InstClassifier.EnableBranches = this->EnableBranches;
populateMapper(Modules, InstrList, IntegerMapping);
findCandidates(InstrList, IntegerMapping);
@@ -871,6 +1084,7 @@ SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(
SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) {
resetSimilarityCandidates();
+ Mapper.InstClassifier.EnableBranches = this->EnableBranches;
std::vector<IRInstructionData *> InstrList;
std::vector<unsigned> IntegerMapping;
@@ -891,7 +1105,7 @@ IRSimilarityIdentifierWrapperPass::IRSimilarityIdentifierWrapperPass()
}
bool IRSimilarityIdentifierWrapperPass::doInitialization(Module &M) {
- IRSI.reset(new IRSimilarityIdentifier());
+ IRSI.reset(new IRSimilarityIdentifier(!DisableBranches));
return false;
}
@@ -907,9 +1121,9 @@ bool IRSimilarityIdentifierWrapperPass::runOnModule(Module &M) {
AnalysisKey IRSimilarityAnalysis::Key;
IRSimilarityIdentifier IRSimilarityAnalysis::run(Module &M,
- ModuleAnalysisManager &) {
+ ModuleAnalysisManager &) {
- auto IRSI = IRSimilarityIdentifier();
+ auto IRSI = IRSimilarityIdentifier(!DisableBranches);
IRSI.findSimilarity(M);
return IRSI;
}
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index fc6051b35efc..c4b7239b43ab 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -43,8 +43,8 @@ using namespace llvm::PatternMatch;
bool RecurrenceDescriptor::areAllUsesIn(Instruction *I,
SmallPtrSetImpl<Instruction *> &Set) {
- for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
- if (!Set.count(dyn_cast<Instruction>(*Use)))
+ for (const Use &Use : I->operands())
+ if (!Set.count(dyn_cast<Instruction>(Use)))
return false;
return true;
}
@@ -62,6 +62,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) {
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin:
+ case RecurKind::SelectICmp:
+ case RecurKind::SelectFCmp:
return true;
}
return false;
@@ -144,12 +146,9 @@ static std::pair<Type *, bool> computeRecurrenceType(Instruction *Exit,
// meaning that we will use sext instructions instead of zext
// instructions to restore the original type.
IsSigned = true;
- if (!Bits.isNegative())
- // If the value is not known to be negative, we don't known what the
- // upper bit is, and therefore, we don't know what kind of extend we
- // will need. In this case, just increase the bit width by one bit and
- // use sext.
- ++MaxBitWidth;
+ // Make sure at least one sign bit is included in the result, so it
+ // will get properly sign-extended.
+ ++MaxBitWidth;
}
}
if (!isPowerOf2_64(MaxBitWidth))
@@ -199,7 +198,10 @@ static bool checkOrderedReduction(RecurKind Kind, Instruction *ExactFPMathInst,
if (Kind != RecurKind::FAdd)
return false;
- if (Exit->getOpcode() != Instruction::FAdd || Exit != ExactFPMathInst)
+ // Ensure the exit instruction is an FAdd, and that it only has one user
+ // other than the reduction PHI
+ if (Exit->getOpcode() != Instruction::FAdd || Exit->hasNUsesOrMore(3) ||
+ Exit != ExactFPMathInst)
return false;
// The only pattern accepted is the one in which the reduction PHI
@@ -272,7 +274,7 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
} else if (RecurrenceType->isIntegerTy()) {
if (!isIntegerRecurrenceKind(Kind))
return false;
- if (isArithmeticRecurrenceKind(Kind))
+ if (!isMinMaxRecurrenceKind(Kind))
Start = lookThroughAnd(Phi, RecurrenceType, VisitedInsts, CastInsts);
} else {
// Pointer min/max may exist, but it is not supported as a reduction op.
@@ -327,7 +329,8 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
// the starting value (the Phi or an AND instruction if the Phi has been
// type-promoted).
if (Cur != Start) {
- ReduxDesc = isRecurrenceInstr(Cur, Kind, ReduxDesc, FuncFMF);
+ ReduxDesc =
+ isRecurrenceInstr(TheLoop, Phi, Cur, Kind, ReduxDesc, FuncFMF);
if (!ReduxDesc.isRecurrence())
return false;
// FIXME: FMF is allowed on phi, but propagation is not handled correctly.
@@ -360,6 +363,7 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
// A reduction operation must only have one use of the reduction value.
if (!IsAPhi && !IsASelect && !isMinMaxRecurrenceKind(Kind) &&
+ !isSelectCmpRecurrenceKind(Kind) &&
hasMultipleUsesOf(Cur, VisitedInsts, 1))
return false;
@@ -367,10 +371,10 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
return false;
- if (isIntMinMaxRecurrenceKind(Kind) &&
+ if ((isIntMinMaxRecurrenceKind(Kind) || Kind == RecurKind::SelectICmp) &&
(isa<ICmpInst>(Cur) || isa<SelectInst>(Cur)))
++NumCmpSelectPatternInst;
- if (isFPMinMaxRecurrenceKind(Kind) &&
+ if ((isFPMinMaxRecurrenceKind(Kind) || Kind == RecurKind::SelectFCmp) &&
(isa<FCmpInst>(Cur) || isa<SelectInst>(Cur)))
++NumCmpSelectPatternInst;
@@ -423,7 +427,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
((!isa<FCmpInst>(UI) && !isa<ICmpInst>(UI) &&
!isa<SelectInst>(UI)) ||
(!isConditionalRdxPattern(Kind, UI).isRecurrence() &&
- !isMinMaxSelectCmpPattern(UI, IgnoredVal).isRecurrence())))
+ !isSelectCmpPattern(TheLoop, Phi, UI, IgnoredVal)
+ .isRecurrence() &&
+ !isMinMaxPattern(UI, Kind, IgnoredVal).isRecurrence())))
return false;
// Remember that we completed the cycle.
@@ -435,8 +441,13 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
}
// This means we have seen one but not the other instruction of the
- // pattern or more than just a select and cmp.
- if (isMinMaxRecurrenceKind(Kind) && NumCmpSelectPatternInst != 2)
+ // pattern or more than just a select and cmp. Zero implies that we saw a
+ // llvm.min/max intrinsic, which is always OK.
+ if (isMinMaxRecurrenceKind(Kind) && NumCmpSelectPatternInst != 2 &&
+ NumCmpSelectPatternInst != 0)
+ return false;
+
+ if (isSelectCmpRecurrenceKind(Kind) && NumCmpSelectPatternInst != 1)
return false;
if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
@@ -505,11 +516,70 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
return true;
}
+// We are looking for loops that do something like this:
+// int r = 0;
+// for (int i = 0; i < n; i++) {
+// if (src[i] > 3)
+// r = 3;
+// }
+// where the reduction value (r) only has two states, in this example 0 or 3.
+// The generated LLVM IR for this type of loop will be like this:
+// for.body:
+// %r = phi i32 [ %spec.select, %for.body ], [ 0, %entry ]
+// ...
+// %cmp = icmp sgt i32 %5, 3
+// %spec.select = select i1 %cmp, i32 3, i32 %r
+// ...
+// In general we can support vectorization of loops where 'r' flips between
+// any two non-constants, provided they are loop invariant. The only thing
+// we actually care about at the end of the loop is whether or not any lane
+// in the selected vector is different from the start value. The final
+// across-vector reduction after the loop simply involves choosing the start
+// value if nothing changed (0 in the example above) or the other selected
+// value (3 in the example above).
RecurrenceDescriptor::InstDesc
-RecurrenceDescriptor::isMinMaxSelectCmpPattern(Instruction *I,
- const InstDesc &Prev) {
- assert((isa<CmpInst>(I) || isa<SelectInst>(I)) &&
- "Expected a cmp or select instruction");
+RecurrenceDescriptor::isSelectCmpPattern(Loop *Loop, PHINode *OrigPhi,
+ Instruction *I, InstDesc &Prev) {
+ // We must handle the select(cmp(),x,y) as a single instruction. Advance to
+ // the select.
+ CmpInst::Predicate Pred;
+ if (match(I, m_OneUse(m_Cmp(Pred, m_Value(), m_Value())))) {
+ if (auto *Select = dyn_cast<SelectInst>(*I->user_begin()))
+ return InstDesc(Select, Prev.getRecKind());
+ }
+
+ // Only match select with single use cmp condition.
+ if (!match(I, m_Select(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())), m_Value(),
+ m_Value())))
+ return InstDesc(false, I);
+
+ SelectInst *SI = cast<SelectInst>(I);
+ Value *NonPhi = nullptr;
+
+ if (OrigPhi == dyn_cast<PHINode>(SI->getTrueValue()))
+ NonPhi = SI->getFalseValue();
+ else if (OrigPhi == dyn_cast<PHINode>(SI->getFalseValue()))
+ NonPhi = SI->getTrueValue();
+ else
+ return InstDesc(false, I);
+
+ // We are looking for selects of the form:
+ // select(cmp(), phi, loop_invariant) or
+ // select(cmp(), loop_invariant, phi)
+ if (!Loop->isLoopInvariant(NonPhi))
+ return InstDesc(false, I);
+
+ return InstDesc(I, isa<ICmpInst>(I->getOperand(0)) ? RecurKind::SelectICmp
+ : RecurKind::SelectFCmp);
+}
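For contrast with the example in the comment above, a hedged sketch of a loop this pattern rejects (hypothetical helper; the point is the loop-invariance check on the non-phi select operand):

  // Not recognized as a SelectICmp reduction: the selected value src[i]
  // changes every iteration, so the recurrence no longer flips between the
  // start value and a single loop-invariant value.
  int lastAboveThree(const int *src, int n) {
    int r = 0;
    for (int i = 0; i < n; i++)
      if (src[i] > 3)
        r = src[i];
    return r;
  }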
+
+RecurrenceDescriptor::InstDesc
+RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind,
+ const InstDesc &Prev) {
+ assert((isa<CmpInst>(I) || isa<SelectInst>(I) || isa<CallInst>(I)) &&
+ "Expected a cmp or select or call instruction");
+ if (!isMinMaxRecurrenceKind(Kind))
+ return InstDesc(false, I);
// We must handle the select(cmp()) as a single instruction. Advance to the
// select.
@@ -519,28 +589,33 @@ RecurrenceDescriptor::isMinMaxSelectCmpPattern(Instruction *I,
return InstDesc(Select, Prev.getRecKind());
}
- // Only match select with single use cmp condition.
- if (!match(I, m_Select(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())), m_Value(),
+ // Only match select with single use cmp condition, or a min/max intrinsic.
+ if (!isa<IntrinsicInst>(I) &&
+ !match(I, m_Select(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())), m_Value(),
m_Value())))
return InstDesc(false, I);
// Look for a min/max pattern.
if (match(I, m_UMin(m_Value(), m_Value())))
- return InstDesc(I, RecurKind::UMin);
+ return InstDesc(Kind == RecurKind::UMin, I);
if (match(I, m_UMax(m_Value(), m_Value())))
- return InstDesc(I, RecurKind::UMax);
+ return InstDesc(Kind == RecurKind::UMax, I);
if (match(I, m_SMax(m_Value(), m_Value())))
- return InstDesc(I, RecurKind::SMax);
+ return InstDesc(Kind == RecurKind::SMax, I);
if (match(I, m_SMin(m_Value(), m_Value())))
- return InstDesc(I, RecurKind::SMin);
+ return InstDesc(Kind == RecurKind::SMin, I);
if (match(I, m_OrdFMin(m_Value(), m_Value())))
- return InstDesc(I, RecurKind::FMin);
+ return InstDesc(Kind == RecurKind::FMin, I);
if (match(I, m_OrdFMax(m_Value(), m_Value())))
- return InstDesc(I, RecurKind::FMax);
+ return InstDesc(Kind == RecurKind::FMax, I);
if (match(I, m_UnordFMin(m_Value(), m_Value())))
- return InstDesc(I, RecurKind::FMin);
+ return InstDesc(Kind == RecurKind::FMin, I);
if (match(I, m_UnordFMax(m_Value(), m_Value())))
- return InstDesc(I, RecurKind::FMax);
+ return InstDesc(Kind == RecurKind::FMax, I);
+ if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
+ return InstDesc(Kind == RecurKind::FMin, I);
+ if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
+ return InstDesc(Kind == RecurKind::FMax, I);
return InstDesc(false, I);
}
@@ -592,8 +667,10 @@ RecurrenceDescriptor::isConditionalRdxPattern(RecurKind Kind, Instruction *I) {
}
RecurrenceDescriptor::InstDesc
-RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurKind Kind,
- InstDesc &Prev, FastMathFlags FMF) {
+RecurrenceDescriptor::isRecurrenceInstr(Loop *L, PHINode *OrigPhi,
+ Instruction *I, RecurKind Kind,
+ InstDesc &Prev, FastMathFlags FuncFMF) {
+ assert(Prev.getRecKind() == RecurKind::None || Prev.getRecKind() == Kind);
switch (I->getOpcode()) {
default:
return InstDesc(false, I);
@@ -624,9 +701,15 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurKind Kind,
LLVM_FALLTHROUGH;
case Instruction::FCmp:
case Instruction::ICmp:
+ case Instruction::Call:
+ if (isSelectCmpRecurrenceKind(Kind))
+ return isSelectCmpPattern(L, OrigPhi, I, Prev);
if (isIntMinMaxRecurrenceKind(Kind) ||
- (FMF.noNaNs() && FMF.noSignedZeros() && isFPMinMaxRecurrenceKind(Kind)))
- return isMinMaxSelectCmpPattern(I, Prev);
+ (((FuncFMF.noNaNs() && FuncFMF.noSignedZeros()) ||
+ (isa<FPMathOperator>(I) && I->hasNoNaNs() &&
+ I->hasNoSignedZeros())) &&
+ isFPMinMaxRecurrenceKind(Kind)))
+ return isMinMaxPattern(I, Kind, Prev);
return InstDesc(false, I);
}
}
@@ -649,7 +732,6 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
RecurrenceDescriptor &RedDes,
DemandedBits *DB, AssumptionCache *AC,
DominatorTree *DT) {
-
BasicBlock *Header = TheLoop->getHeader();
Function &F = *Header->getParent();
FastMathFlags FMF;
@@ -694,6 +776,12 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
LLVM_DEBUG(dbgs() << "Found a UMIN reduction PHI." << *Phi << "\n");
return true;
}
+ if (AddReductionVar(Phi, RecurKind::SelectICmp, TheLoop, FMF, RedDes, DB, AC,
+ DT)) {
+ LLVM_DEBUG(dbgs() << "Found an integer conditional select reduction PHI."
+ << *Phi << "\n");
+ return true;
+ }
if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT)) {
LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n");
return true;
@@ -710,6 +798,12 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
LLVM_DEBUG(dbgs() << "Found a float MIN reduction PHI." << *Phi << "\n");
return true;
}
+ if (AddReductionVar(Phi, RecurKind::SelectFCmp, TheLoop, FMF, RedDes, DB, AC,
+ DT)) {
+ LLVM_DEBUG(dbgs() << "Found a float conditional select reduction PHI."
+ << *Phi << "\n");
+ return true;
+ }
// Not a reduction of known type.
return false;
}
@@ -816,8 +910,8 @@ bool RecurrenceDescriptor::isFirstOrderRecurrence(
/// This function returns the identity element (or neutral element) for
/// the operation K.
-Constant *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp,
- FastMathFlags FMF) {
+Value *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp,
+ FastMathFlags FMF) {
switch (K) {
case RecurKind::Xor:
case RecurKind::Add:
@@ -857,6 +951,10 @@ Constant *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp,
return ConstantFP::getInfinity(Tp, true);
case RecurKind::FMax:
return ConstantFP::getInfinity(Tp, false);
+ case RecurKind::SelectICmp:
+ case RecurKind::SelectFCmp:
+ return getRecurrenceStartValue();
+ break;
default:
llvm_unreachable("Unknown recurrence kind");
}
@@ -882,9 +980,11 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin:
+ case RecurKind::SelectICmp:
return Instruction::ICmp;
case RecurKind::FMax:
case RecurKind::FMin:
+ case RecurKind::SelectFCmp:
return Instruction::FCmp;
default:
llvm_unreachable("Unknown recurrence operation");
@@ -963,8 +1063,10 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const {
InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K,
const SCEV *Step, BinaryOperator *BOp,
+ Type *ElementType,
SmallVectorImpl<Instruction *> *Casts)
- : StartValue(Start), IK(K), Step(Step), InductionBinOp(BOp) {
+ : StartValue(Start), IK(K), Step(Step), InductionBinOp(BOp),
+ ElementType(ElementType) {
assert(IK != IK_NoInduction && "Not an induction");
// Start value type should match the induction kind and the value
@@ -992,6 +1094,11 @@ InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K,
InductionBinOp->getOpcode() == Instruction::FSub))) &&
"Binary opcode should be specified for FP induction");
+ if (IK == IK_PtrInduction)
+ assert(ElementType && "Pointer induction must have element type");
+ else
+ assert(!ElementType && "Non-pointer induction cannot have element type");
+
if (Casts) {
for (auto &Inst : *Casts) {
RedundantCasts.push_back(Inst);
@@ -1239,8 +1346,6 @@ bool InductionDescriptor::isInductionPHI(
BasicBlock *Latch = AR->getLoop()->getLoopLatch();
if (!Latch)
return false;
- BinaryOperator *BOp =
- dyn_cast<BinaryOperator>(Phi->getIncomingValueForBlock(Latch));
const SCEV *Step = AR->getStepRecurrence(*SE);
// Calculate the pointer stride and check if it is consecutive.
@@ -1250,8 +1355,10 @@ bool InductionDescriptor::isInductionPHI(
return false;
if (PhiTy->isIntegerTy()) {
+ BinaryOperator *BOp =
+ dyn_cast<BinaryOperator>(Phi->getIncomingValueForBlock(Latch));
D = InductionDescriptor(StartValue, IK_IntInduction, Step, BOp,
- CastsToIgnore);
+ /* ElementType */ nullptr, CastsToIgnore);
return true;
}
@@ -1260,15 +1367,16 @@ bool InductionDescriptor::isInductionPHI(
if (!ConstStep)
return false;
- ConstantInt *CV = ConstStep->getValue();
- Type *PointerElementType = PhiTy->getPointerElementType();
- // The pointer stride cannot be determined if the pointer element type is not
- // sized.
- if (!PointerElementType->isSized())
+ // Always use i8 element type for opaque pointer inductions.
+ PointerType *PtrTy = cast<PointerType>(PhiTy);
+ Type *ElementType = PtrTy->isOpaque() ? Type::getInt8Ty(PtrTy->getContext())
+ : PtrTy->getElementType();
+ if (!ElementType->isSized())
return false;
+ ConstantInt *CV = ConstStep->getValue();
const DataLayout &DL = Phi->getModule()->getDataLayout();
- int64_t Size = static_cast<int64_t>(DL.getTypeAllocSize(PointerElementType));
+ int64_t Size = static_cast<int64_t>(DL.getTypeAllocSize(ElementType));
if (!Size)
return false;
@@ -1277,6 +1385,7 @@ bool InductionDescriptor::isInductionPHI(
return false;
auto *StepValue =
SE->getConstant(CV->getType(), CVSize / Size, true /* signed */);
- D = InductionDescriptor(StartValue, IK_PtrInduction, StepValue, BOp);
+ D = InductionDescriptor(StartValue, IK_PtrInduction, StepValue,
+ /* BinOp */ nullptr, ElementType);
return true;
}
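A worked example of the stride computation above, assuming a pointer induction that advances by 16 bytes per iteration (CVSize = 16): with a typed i32* phi, Size = DL.getTypeAllocSize(i32) = 4, so StepValue = CVSize / Size = 4 elements; with an opaque pointer the element type defaults to i8, Size = 1, and StepValue = 16, i.e. the same stride expressed in i8 units.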
diff --git a/llvm/lib/Analysis/IVUsers.cpp b/llvm/lib/Analysis/IVUsers.cpp
index db6cff720642..d7b202f83189 100644
--- a/llvm/lib/Analysis/IVUsers.cpp
+++ b/llvm/lib/Analysis/IVUsers.cpp
@@ -90,34 +90,6 @@ static bool isInteresting(const SCEV *S, const Instruction *I, const Loop *L,
return false;
}
-/// Return true if all loop headers that dominate this block are in simplified
-/// form.
-static bool isSimplifiedLoopNest(BasicBlock *BB, const DominatorTree *DT,
- const LoopInfo *LI,
- SmallPtrSetImpl<Loop*> &SimpleLoopNests) {
- Loop *NearestLoop = nullptr;
- for (DomTreeNode *Rung = DT->getNode(BB);
- Rung; Rung = Rung->getIDom()) {
- BasicBlock *DomBB = Rung->getBlock();
- Loop *DomLoop = LI->getLoopFor(DomBB);
- if (DomLoop && DomLoop->getHeader() == DomBB) {
- // If we have already checked this loop nest, stop checking.
- if (SimpleLoopNests.count(DomLoop))
- break;
- // If the domtree walk reaches a loop with no preheader, return false.
- if (!DomLoop->isLoopSimplifyForm())
- return false;
- // If we have not already checked this loop nest, remember the loop
- // header nearest to BB. The nearest loop may not contain BB.
- if (!NearestLoop)
- NearestLoop = DomLoop;
- }
- }
- if (NearestLoop)
- SimpleLoopNests.insert(NearestLoop);
- return true;
-}
-
/// IVUseShouldUsePostIncValue - We have discovered a "User" of an IV expression
/// and now we need to decide whether the user should use the preinc or post-inc
/// value. If this user should use the post-inc version of the IV, return true.
@@ -162,11 +134,10 @@ static bool IVUseShouldUsePostIncValue(Instruction *User, Value *Operand,
return true;
}
-/// AddUsersImpl - Inspect the specified instruction. If it is a
-/// reducible SCEV, recursively add its users to the IVUsesByStride set and
-/// return true. Otherwise, return false.
-bool IVUsers::AddUsersImpl(Instruction *I,
- SmallPtrSetImpl<Loop*> &SimpleLoopNests) {
+/// Inspect the specified instruction. If it is a reducible SCEV, recursively
+/// add its users to the IVUsesByStride set and return true. Otherwise, return
+/// false.
+bool IVUsers::AddUsersIfInteresting(Instruction *I) {
const DataLayout &DL = I->getModule()->getDataLayout();
// Add this IV user to the Processed set before returning false to ensure that
@@ -213,18 +184,6 @@ bool IVUsers::AddUsersImpl(Instruction *I,
if (isa<PHINode>(User) && Processed.count(User))
continue;
- // Only consider IVUsers that are dominated by simplified loop
- // headers. Otherwise, SCEVExpander will crash.
- BasicBlock *UseBB = User->getParent();
- // A phi's use is live out of its predecessor block.
- if (PHINode *PHI = dyn_cast<PHINode>(User)) {
- unsigned OperandNo = U.getOperandNo();
- unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo);
- UseBB = PHI->getIncomingBlock(ValNo);
- }
- if (!isSimplifiedLoopNest(UseBB, DT, LI, SimpleLoopNests))
- return false;
-
// Descend recursively, but not into PHI nodes outside the current loop.
// It's important to see the entire expression outside the loop to get
// choices that depend on addressing mode use right, although we won't
@@ -234,12 +193,12 @@ bool IVUsers::AddUsersImpl(Instruction *I,
bool AddUserToIVUsers = false;
if (LI->getLoopFor(User->getParent()) != L) {
if (isa<PHINode>(User) || Processed.count(User) ||
- !AddUsersImpl(User, SimpleLoopNests)) {
+ !AddUsersIfInteresting(User)) {
LLVM_DEBUG(dbgs() << "FOUND USER in other loop: " << *User << '\n'
<< " OF SCEV: " << *ISE << '\n');
AddUserToIVUsers = true;
}
- } else if (Processed.count(User) || !AddUsersImpl(User, SimpleLoopNests)) {
+ } else if (Processed.count(User) || !AddUsersIfInteresting(User)) {
LLVM_DEBUG(dbgs() << "FOUND USER: " << *User << '\n'
<< " OF SCEV: " << *ISE << '\n');
AddUserToIVUsers = true;
@@ -288,15 +247,6 @@ bool IVUsers::AddUsersImpl(Instruction *I,
return true;
}
-bool IVUsers::AddUsersIfInteresting(Instruction *I) {
- // SCEVExpander can only handle users that are dominated by simplified loop
- // entries. Keep track of all loops that are only dominated by other simple
- // loops so we don't traverse the domtree for each user.
- SmallPtrSet<Loop*,16> SimpleLoopNests;
-
- return AddUsersImpl(I, SimpleLoopNests);
-}
-
IVStrideUse &IVUsers::AddUser(Instruction *User, Value *Operand) {
IVUses.push_back(new IVStrideUse(this, User, Operand));
return IVUses.back();
diff --git a/llvm/lib/Analysis/InlineAdvisor.cpp b/llvm/lib/Analysis/InlineAdvisor.cpp
index a8ad2d6696bf..73d1eff1b968 100644
--- a/llvm/lib/Analysis/InlineAdvisor.cpp
+++ b/llvm/lib/Analysis/InlineAdvisor.cpp
@@ -49,6 +49,42 @@ static cl::opt<int>
extern cl::opt<InlinerFunctionImportStatsOpts> InlinerFunctionImportStats;
+namespace {
+using namespace llvm::ore;
+class MandatoryInlineAdvice : public InlineAdvice {
+public:
+ MandatoryInlineAdvice(InlineAdvisor *Advisor, CallBase &CB,
+ OptimizationRemarkEmitter &ORE,
+ bool IsInliningMandatory)
+ : InlineAdvice(Advisor, CB, ORE, IsInliningMandatory) {}
+
+private:
+ void recordInliningWithCalleeDeletedImpl() override { recordInliningImpl(); }
+
+ void recordInliningImpl() override {
+ if (IsInliningRecommended)
+ emitInlinedInto(ORE, DLoc, Block, *Callee, *Caller, IsInliningRecommended,
+ [&](OptimizationRemark &Remark) {
+ Remark << ": always inline attribute";
+ });
+ }
+
+ void recordUnsuccessfulInliningImpl(const InlineResult &Result) override {
+ if (IsInliningRecommended)
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block)
+ << "'" << NV("Callee", Callee) << "' is not AlwaysInline into '"
+ << NV("Caller", Caller)
+ << "': " << NV("Reason", Result.getFailureReason());
+ });
+ }
+
+ void recordUnattemptedInliningImpl() override {
+ assert(!IsInliningRecommended && "Expected to attempt inlining");
+ }
+};
+} // namespace
+
void DefaultInlineAdvice::recordUnsuccessfulInliningImpl(
const InlineResult &Result) {
using namespace ore;
@@ -56,20 +92,20 @@ void DefaultInlineAdvice::recordUnsuccessfulInliningImpl(
"; " + inlineCostStr(*OIC));
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block)
- << NV("Callee", Callee) << " will not be inlined into "
- << NV("Caller", Caller) << ": "
- << NV("Reason", Result.getFailureReason());
+ << "'" << NV("Callee", Callee) << "' is not inlined into '"
+ << NV("Caller", Caller)
+ << "': " << NV("Reason", Result.getFailureReason());
});
}
void DefaultInlineAdvice::recordInliningWithCalleeDeletedImpl() {
if (EmitRemarks)
- emitInlinedInto(ORE, DLoc, Block, *Callee, *Caller, *OIC);
+ emitInlinedIntoBasedOnCost(ORE, DLoc, Block, *Callee, *Caller, *OIC);
}
void DefaultInlineAdvice::recordInliningImpl() {
if (EmitRemarks)
- emitInlinedInto(ORE, DLoc, Block, *Callee, *Caller, *OIC);
+ emitInlinedIntoBasedOnCost(ORE, DLoc, Block, *Callee, *Caller, *OIC);
}
llvm::Optional<llvm::InlineCost> static getDefaultInlineAdvice(
@@ -151,9 +187,9 @@ void InlineAdvice::recordInliningWithCalleeDeleted() {
AnalysisKey InlineAdvisorAnalysis::Key;
-bool InlineAdvisorAnalysis::Result::tryCreate(InlineParams Params,
- InliningAdvisorMode Mode,
- StringRef ReplayFile) {
+bool InlineAdvisorAnalysis::Result::tryCreate(
+ InlineParams Params, InliningAdvisorMode Mode,
+ const ReplayInlinerSettings &ReplaySettings) {
auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
switch (Mode) {
case InliningAdvisorMode::Default:
@@ -161,10 +197,10 @@ bool InlineAdvisorAnalysis::Result::tryCreate(InlineParams Params,
Advisor.reset(new DefaultInlineAdvisor(M, FAM, Params));
// Restrict replay to default advisor, ML advisors are stateful so
// replay will need augmentations to interleave with them correctly.
- if (!ReplayFile.empty()) {
- Advisor = std::make_unique<ReplayInlineAdvisor>(
- M, FAM, M.getContext(), std::move(Advisor), ReplayFile,
- /* EmitRemarks =*/true);
+ if (!ReplaySettings.ReplayFile.empty()) {
+ Advisor = llvm::getReplayInlineAdvisor(M, FAM, M.getContext(),
+ std::move(Advisor), ReplaySettings,
+ /* EmitRemarks =*/true);
}
break;
case InliningAdvisorMode::Development:
@@ -313,7 +349,7 @@ void llvm::setInlineRemark(CallBase &CB, StringRef Message) {
return;
Attribute Attr = Attribute::get(CB.getContext(), "inline-remark", Message);
- CB.addAttribute(AttributeList::FunctionIndex, Attr);
+ CB.addFnAttr(Attr);
}
/// Return the cost only if the inliner should attempt to inline at the given
@@ -343,15 +379,15 @@ llvm::shouldInline(CallBase &CB,
if (IC.isNever()) {
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
- << NV("Callee", Callee) << " not inlined into "
- << NV("Caller", Caller) << " because it should never be inlined "
- << IC;
+ << "'" << NV("Callee", Callee) << "' not inlined into '"
+ << NV("Caller", Caller)
+ << "' because it should never be inlined " << IC;
});
} else {
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
- << NV("Callee", Callee) << " not inlined into "
- << NV("Caller", Caller) << " because too costly to inline "
+ << "'" << NV("Callee", Callee) << "' not inlined into '"
+ << NV("Caller", Caller) << "' because too costly to inline "
<< IC;
});
}
@@ -368,9 +404,9 @@ llvm::shouldInline(CallBase &CB,
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "IncreaseCostInOtherContexts",
Call)
- << "Not inlining. Cost of inlining " << NV("Callee", Callee)
- << " increases the cost of inlining " << NV("Caller", Caller)
- << " in other contexts";
+ << "Not inlining. Cost of inlining '" << NV("Callee", Callee)
+ << "' increases the cost of inlining '" << NV("Caller", Caller)
+ << "' in other contexts";
});
setInlineRemark(CB, "deferred");
// IC does not bool() to false, so get an InlineCost that will.
@@ -383,7 +419,8 @@ llvm::shouldInline(CallBase &CB,
return IC;
}
-std::string llvm::getCallSiteLocation(DebugLoc DLoc) {
+std::string llvm::formatCallSiteLocation(DebugLoc DLoc,
+ const CallSiteFormat &Format) {
std::string Buffer;
raw_string_ostream CallSiteLoc(Buffer);
bool First = true;
@@ -399,9 +436,10 @@ std::string llvm::getCallSiteLocation(DebugLoc DLoc) {
StringRef Name = DIL->getScope()->getSubprogram()->getLinkageName();
if (Name.empty())
Name = DIL->getScope()->getSubprogram()->getName();
- CallSiteLoc << Name.str() << ":" << llvm::utostr(Offset) << ":"
- << llvm::utostr(DIL->getColumn());
- if (Discriminator)
+ CallSiteLoc << Name.str() << ":" << llvm::utostr(Offset);
+ if (Format.outputColumn())
+ CallSiteLoc << ":" << llvm::utostr(DIL->getColumn());
+ if (Format.outputDiscriminator() && Discriminator)
CallSiteLoc << "." << llvm::utostr(Discriminator);
First = false;
}
@@ -435,25 +473,38 @@ void llvm::addLocationToRemarks(OptimizationRemark &Remark, DebugLoc DLoc) {
Remark << ";";
}
-void llvm::emitInlinedInto(OptimizationRemarkEmitter &ORE, DebugLoc DLoc,
- const BasicBlock *Block, const Function &Callee,
- const Function &Caller, const InlineCost &IC,
- bool ForProfileContext, const char *PassName) {
+void llvm::emitInlinedInto(
+ OptimizationRemarkEmitter &ORE, DebugLoc DLoc, const BasicBlock *Block,
+ const Function &Callee, const Function &Caller, bool AlwaysInline,
+ function_ref<void(OptimizationRemark &)> ExtraContext,
+ const char *PassName) {
ORE.emit([&]() {
- bool AlwaysInline = IC.isAlways();
StringRef RemarkName = AlwaysInline ? "AlwaysInline" : "Inlined";
OptimizationRemark Remark(PassName ? PassName : DEBUG_TYPE, RemarkName,
DLoc, Block);
- Remark << ore::NV("Callee", &Callee) << " inlined into ";
- Remark << ore::NV("Caller", &Caller);
- if (ForProfileContext)
- Remark << " to match profiling context";
- Remark << " with " << IC;
+ Remark << "'" << ore::NV("Callee", &Callee) << "' inlined into '"
+ << ore::NV("Caller", &Caller) << "'";
+ if (ExtraContext)
+ ExtraContext(Remark);
addLocationToRemarks(Remark, DLoc);
return Remark;
});
}
+void llvm::emitInlinedIntoBasedOnCost(
+ OptimizationRemarkEmitter &ORE, DebugLoc DLoc, const BasicBlock *Block,
+ const Function &Callee, const Function &Caller, const InlineCost &IC,
+ bool ForProfileContext, const char *PassName) {
+ llvm::emitInlinedInto(
+ ORE, DLoc, Block, Callee, Caller, IC.isAlways(),
+ [&](OptimizationRemark &Remark) {
+ if (ForProfileContext)
+ Remark << " to match profiling context";
+ Remark << " with " << IC;
+ },
+ PassName);
+}
+
InlineAdvisor::InlineAdvisor(Module &M, FunctionAnalysisManager &FAM)
: M(M), FAM(FAM) {
if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No) {
@@ -475,7 +526,8 @@ InlineAdvisor::~InlineAdvisor() {
std::unique_ptr<InlineAdvice> InlineAdvisor::getMandatoryAdvice(CallBase &CB,
bool Advice) {
- return std::make_unique<InlineAdvice>(this, CB, getCallerORE(CB), Advice);
+ return std::make_unique<MandatoryInlineAdvice>(this, CB, getCallerORE(CB),
+ Advice);
}
InlineAdvisor::MandatoryInliningKind
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index 4c2413e14435..ff31e81aad08 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -135,6 +135,31 @@ static cl::opt<bool> DisableGEPConstOperand(
namespace {
class InlineCostCallAnalyzer;
+/// This function behaves more like CallBase::hasFnAttr: when it looks for the
+/// requested attribute, it checks both the call instruction and the called
+/// function (if it's available and operand bundles don't prohibit that).
+Attribute getFnAttr(CallBase &CB, StringRef AttrKind) {
+ Attribute CallAttr = CB.getFnAttr(AttrKind);
+ if (CallAttr.isValid())
+ return CallAttr;
+
+ // Operand bundles override attributes on the called function, but don't
+ // override attributes directly present on the call instruction.
+ if (!CB.isFnAttrDisallowedByOpBundle(AttrKind))
+ if (const Function *F = CB.getCalledFunction())
+ return F->getFnAttribute(AttrKind);
+
+ return {};
+}
+
+Optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
+ Attribute Attr = getFnAttr(CB, AttrKind);
+ int AttrValue;
+ if (Attr.getValueAsString().getAsInteger(10, AttrValue))
+ return None;
+ return AttrValue;
+}
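A minimal standalone sketch of the parse-or-None behaviour of getStringFnAttrAsInt (std::optional and std::from_chars stand in for Optional and StringRef::getAsInteger): an unset or non-numeric attribute string yields no value rather than a bogus cost. In-tree, such a string attribute would presumably be attached to the call site with Attribute::get plus CallBase::addFnAttr, the same pair this patch uses for "inline-remark".

  #include <charconv>
  #include <optional>
  #include <string>

  static std::optional<int> parseIntAttr(const std::string &Val) {
    int Out = 0;
    auto [Ptr, Ec] = std::from_chars(Val.data(), Val.data() + Val.size(), Out);
    if (Ec != std::errc() || Ptr != Val.data() + Val.size())
      return std::nullopt; // missing or malformed attribute: no override
    return Out;
  }

  int main() {
    return (parseIntAttr("42").value_or(-1) == 42 &&
            !parseIntAttr("not-a-number").has_value())
               ? 0
               : 1;
  }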
+
// This struct is used to store information about inline cost of a
// particular instruction
struct InstructionCostDetail {
@@ -235,6 +260,10 @@ protected:
/// Called when the analysis engine determines load elimination won't happen.
virtual void onDisableLoadElimination() {}
+ /// Called when we visit a CallBase, before the analysis starts. Return false
+ /// to stop further processing of the instruction.
+ virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }
+
/// Called to account for a call.
virtual void onCallPenalty() {}
@@ -333,6 +362,10 @@ protected:
/// whenever we simplify away the stores that would otherwise cause them to be
/// loads.
bool EnableLoadElimination;
+
+ /// Whether we allow inlining of recursive calls.
+ bool AllowRecursiveCall;
+
SmallPtrSet<Value *, 16> LoadAddrSet;
AllocaInst *getSROAArgForValueOrNull(Value *V) const {
@@ -354,6 +387,7 @@ protected:
bool simplifyCallSite(Function *F, CallBase &Call);
template <typename Callable>
bool simplifyInstruction(Instruction &I, Callable Evaluate);
+ bool simplifyIntrinsicCallIsConstant(CallBase &CB);
ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
/// Return true if the given argument to the function being considered for
@@ -421,7 +455,8 @@ public:
OptimizationRemarkEmitter *ORE = nullptr)
: TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
- CandidateCall(Call), EnableLoadElimination(true) {}
+ CandidateCall(Call), EnableLoadElimination(true),
+ AllowRecursiveCall(false) {}
InlineResult analyze();
@@ -510,6 +545,9 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
// sense that it's not weighted by profile counts at all.
int ColdSize = 0;
+ // Whether inlining is decided by cost-threshold analysis.
+ bool DecidedByCostThreshold = false;
+
// Whether inlining is decided by cost-benefit analysis.
bool DecidedByCostBenefit = false;
@@ -558,6 +596,22 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
addCost(LoadEliminationCost);
LoadEliminationCost = 0;
}
+
+ bool onCallBaseVisitStart(CallBase &Call) override {
+ if (Optional<int> AttrCallThresholdBonus =
+ getStringFnAttrAsInt(Call, "call-threshold-bonus"))
+ Threshold += *AttrCallThresholdBonus;
+
+ if (Optional<int> AttrCallCost =
+ getStringFnAttrAsInt(Call, "call-inline-cost")) {
+ addCost(*AttrCallCost);
+ // Prevent further processing of the call since we want to override its
+ // inline cost, not just add to it.
+ return false;
+ }
+ return true;
+ }
+
void onCallPenalty() override { addCost(CallPenalty); }
void onCallArgumentSetup(const CallBase &Call) override {
// Pay the price of the argument setup. We account for the average 1
@@ -717,7 +771,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
// Make sure we have a nonzero entry count.
auto EntryCount = F.getEntryCount();
- if (!EntryCount || !EntryCount.getCount())
+ if (!EntryCount || !EntryCount->getCount())
return false;
BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
@@ -763,7 +817,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
// Count a conditional branch as savings if it becomes unconditional.
if (BI->isConditional() &&
- dyn_cast_or_null<ConstantInt>(
+ isa_and_nonnull<ConstantInt>(
SimplifiedValues.lookup(BI->getCondition()))) {
CurrentSavings += InlineConstants::InstrCost;
}
@@ -783,8 +837,8 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
// Compute the cycle savings per call.
auto EntryProfileCount = F.getEntryCount();
- assert(EntryProfileCount.hasValue() && EntryProfileCount.getCount());
- auto EntryCount = EntryProfileCount.getCount();
+ assert(EntryProfileCount.hasValue() && EntryProfileCount->getCount());
+ auto EntryCount = EntryProfileCount->getCount();
CycleSavings += EntryCount / 2;
CycleSavings = CycleSavings.udiv(EntryCount);
@@ -847,6 +901,14 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
else if (NumVectorInstructions <= NumInstructions / 2)
Threshold -= VectorBonus / 2;
+ if (Optional<int> AttrCost =
+ getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
+ Cost = *AttrCost;
+
+ if (Optional<int> AttrThreshold =
+ getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
+ Threshold = *AttrThreshold;
+
if (auto Result = costBenefitAnalysis()) {
DecidedByCostBenefit = true;
if (Result.getValue())
@@ -855,14 +917,24 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
return InlineResult::failure("Cost over threshold.");
}
- if (IgnoreThreshold || Cost < std::max(1, Threshold))
+ if (IgnoreThreshold)
return InlineResult::success();
- return InlineResult::failure("Cost over threshold.");
+
+ DecidedByCostThreshold = true;
+ return Cost < std::max(1, Threshold)
+ ? InlineResult::success()
+ : InlineResult::failure("Cost over threshold.");
}
+
bool shouldStop() override {
+ if (IgnoreThreshold || ComputeFullInlineCost)
+ return false;
// Bail out the moment we cross the threshold. This means we'll under-count
// the cost, but only when undercounting doesn't matter.
- return !IgnoreThreshold && Cost >= Threshold && !ComputeFullInlineCost;
+ if (Cost < Threshold)
+ return false;
+ DecidedByCostThreshold = true;
+ return true;
}
void onLoadEliminationOpportunity() override {
@@ -930,7 +1002,9 @@ public:
Params(Params), Threshold(Params.DefaultThreshold),
BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
- Writer(this) {}
+ Writer(this) {
+ AllowRecursiveCall = Params.AllowRecursiveCall.getValue();
+ }
/// Annotation Writer for instruction details
InlineCostAnnotationWriter Writer;
@@ -939,7 +1013,7 @@ public:
// Prints the same analysis as dump(), but its definition is not dependent
// on the build.
- void print();
+ void print(raw_ostream &OS);
Optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
if (InstructionCostDetailMap.find(I) != InstructionCostDetailMap.end())
@@ -952,6 +1026,7 @@ public:
int getCost() const { return Cost; }
Optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
+ bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
};
class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
@@ -1310,7 +1385,7 @@ bool CallAnalyzer::visitPHI(PHINode &I) {
// Or could we skip the getPointerSizeInBits call completely? As far as I can
// see the ZeroOffset is used as a dummy value, so we can probably use any
// bit width for the ZeroOffset?
- APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
+ APInt ZeroOffset = APInt::getZero(DL.getPointerSizeInBits(0));
bool CheckSROA = I.getType()->isPointerTy();
// Track the constant or pointer with constant offset we've seen so far.
@@ -1471,6 +1546,27 @@ bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
return true;
}
+/// Try to simplify a call to llvm.is.constant.
+///
+/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
+/// we expect calls of this specific intrinsic to be infrequent.
+///
+/// FIXME: Given that we know the caller of CB's parent (F), namely
+/// CandidateCall->getParent()->getParent(), we might be able to determine
+/// whether inlining F into F's caller would change how the call to
+/// llvm.is.constant would evaluate.
+bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
+ Value *Arg = CB.getArgOperand(0);
+ auto *C = dyn_cast<Constant>(Arg);
+
+ if (!C)
+ C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(Arg));
+
+ Type *RT = CB.getFunctionType()->getReturnType();
+ SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
+ return true;
+}
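A hedged illustration of why this fold matters, assuming (as is commonly the case) that Clang lowers __builtin_constant_p to llvm.is.constant:

  // Hypothetical out-of-line fallback; only the declaration is needed here.
  int expensive_runtime_scale(int x);

  static inline int scale(int x) {
    // __builtin_constant_p(x) folds to 1 once x is a compile-time constant,
    // leaving only the cheap multiply; otherwise the runtime fallback stays.
    if (__builtin_constant_p(x))
      return x * 4;
    return expensive_runtime_scale(x);
  }

  // When the analyzer simulates inlining scale() here, x simplifies to the
  // literal 8, the intrinsic folds to 1, and the expensive branch is not
  // charged against the inline cost.
  int caller(void) { return scale(8); }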
+
bool CallAnalyzer::visitBitCast(BitCastInst &I) {
// Propagate constants through bitcasts.
if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
@@ -1799,8 +1895,8 @@ void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
VectorBonus = Threshold * VectorBonusPercent / 100;
- bool OnlyOneCallAndLocalLinkage =
- F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction();
+ bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneLiveUse() &&
+ &F == Call.getCalledFunction();
// If there is only one call of the function, and it has internal linkage,
// the cost of inlining it drops dramatically. It may seem odd to update
// Cost in updateThreshold, but the bonus depends on the logic in this method.
@@ -2029,6 +2125,9 @@ bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
}
bool CallAnalyzer::visitCallBase(CallBase &Call) {
+ if (!onCallBaseVisitStart(Call))
+ return true;
+
if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
!F.hasFnAttribute(Attribute::ReturnsTwice)) {
// This aborts the entire analysis.
@@ -2091,6 +2190,8 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) {
if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
SROAArgValues[II] = SROAArg;
return true;
+ case Intrinsic::is_constant:
+ return simplifyIntrinsicCallIsConstant(Call);
}
}
@@ -2098,7 +2199,8 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) {
// This flag will fully abort the analysis, so don't bother with anything
// else.
IsRecursiveCall = true;
- return false;
+ if (!AllowRecursiveCall)
+ return false;
}
if (TTI.isLoweredToCall(F)) {
@@ -2123,7 +2225,7 @@ bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
// inliner more regular and predictable. Interestingly, conditional branches
// which will fold away are also free.
return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
- dyn_cast_or_null<ConstantInt>(
+ isa_and_nonnull<ConstantInt>(
SimplifiedValues.lookup(BI.getCondition()));
}
@@ -2305,11 +2407,8 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB,
// inlining due to debug symbols. Eventually, the number of unsimplified
// instructions shouldn't factor into the cost computation, but until then,
// hack around it here.
- if (isa<DbgInfoIntrinsic>(I))
- continue;
-
- // Skip pseudo-probes.
- if (isa<PseudoProbeInst>(I))
+ // Similarly, skip pseudo-probes.
+ if (I.isDebugOrPseudoInst())
continue;
// Skip ephemeral values.
@@ -2336,7 +2435,7 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB,
using namespace ore;
// If the visit this instruction detected an uninlinable pattern, abort.
InlineResult IR = InlineResult::success();
- if (IsRecursiveCall)
+ if (IsRecursiveCall && !AllowRecursiveCall)
IR = InlineResult::failure("recursive");
else if (ExposesReturnsTwice)
IR = InlineResult::failure("exposes returns twice");
@@ -2398,7 +2497,7 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
unsigned AS = V->getType()->getPointerAddressSpace();
unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
- APInt Offset = APInt::getNullValue(IntPtrWidth);
+ APInt Offset = APInt::getZero(IntPtrWidth);
// Even though we don't look through PHI nodes, we could be called on an
// instruction in an unreachable block, which may be on a cycle.
@@ -2601,7 +2700,7 @@ InlineResult CallAnalyzer::analyze() {
onBlockAnalyzed(BB);
}
- bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
+ bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneLiveUse() &&
&F == CandidateCall.getCalledFunction();
// If this is a noduplicate call, we can still inline as long as
// inlining this would cause the removal of the caller (so the instruction
@@ -2612,10 +2711,10 @@ InlineResult CallAnalyzer::analyze() {
return finalizeAnalysis();
}
-void InlineCostCallAnalyzer::print() {
-#define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
+void InlineCostCallAnalyzer::print(raw_ostream &OS) {
+#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
if (PrintInstructionComments)
- F.print(dbgs(), &Writer);
+ F.print(OS, &Writer);
DEBUG_PRINT_STAT(NumConstantArgs);
DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
DEBUG_PRINT_STAT(NumAllocaArgs);
@@ -2634,7 +2733,7 @@ void InlineCostCallAnalyzer::print() {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Dump stats about this call's analysis.
-LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(); }
+LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
#endif
/// Test that there are no attribute conflicts between Caller and Callee
@@ -2849,13 +2948,13 @@ InlineCost llvm::getInlineCost(
return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());
}
- // Check if there was a reason to force inlining or no inlining.
- if (!ShouldInline.isSuccess() && CA.getCost() < CA.getThreshold())
- return InlineCost::getNever(ShouldInline.getFailureReason());
- if (ShouldInline.isSuccess() && CA.getCost() >= CA.getThreshold())
- return InlineCost::getAlways("empty function");
+ if (CA.wasDecidedByCostThreshold())
+ return InlineCost::get(CA.getCost(), CA.getThreshold());
- return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
+ // No details on how the decision was made, simply return always or never.
+ return ShouldInline.isSuccess()
+ ? InlineCost::getAlways("empty function")
+ : InlineCost::getNever(ShouldInline.getFailureReason());
}
InlineResult llvm::isInlineViable(Function &F) {
@@ -3028,7 +3127,8 @@ InlineCostAnnotationPrinterPass::run(Function &F,
ICCA.analyze();
OS << " Analyzing call of " << CalledFunction->getName()
<< "... (caller:" << CI->getCaller()->getName() << ")\n";
- ICCA.print();
+ ICCA.print(OS);
+ OS << "\n";
}
}
}
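The llvm.is.constant handling added in this file is easiest to picture from the source construct that produces the intrinsic. A minimal standalone sketch, not taken from the patch (safe_div is a made-up helper): Clang lowers __builtin_constant_p to llvm.is.constant, so when the call site being costed passes a literal, the new simplifyIntrinsicCallIsConstant path can record the intrinsic as folding to 1 and the guarded branch can be priced as folded away.
// Illustrative sketch only; compiles with Clang/GCC (uses the
// __builtin_constant_p extension, which is lowered to llvm.is.constant).
#include <cstdio>
static inline long safe_div(long a, long b) {
  // When b is a literal at the call site being analyzed for inlining, the
  // intrinsic is treated as true, so only the plain division is charged.
  if (__builtin_constant_p(b) && b != 0)
    return a / b;
  return b != 0 ? a / b : 0;
}
int main() {
  std::printf("%ld\n", safe_div(42, 7)); // b is a compile-time constant here
  return 0;
}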
diff --git a/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp b/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp
index 3c90e82fb952..a2e231e2d0f4 100644
--- a/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp
+++ b/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp
@@ -1,9 +1,8 @@
//===- InlineSizeEstimatorAnalysis.cpp - IR to native size from ML model --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/llvm/lib/Analysis/InstructionPrecedenceTracking.cpp b/llvm/lib/Analysis/InstructionPrecedenceTracking.cpp
index 7d1e630e6e80..9fee57c54b85 100644
--- a/llvm/lib/Analysis/InstructionPrecedenceTracking.cpp
+++ b/llvm/lib/Analysis/InstructionPrecedenceTracking.cpp
@@ -19,11 +19,15 @@
#include "llvm/Analysis/InstructionPrecedenceTracking.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
+#define DEBUG_TYPE "ipt"
+STATISTIC(NumInstScanned, "Number of insts scanned while updating ibt");
+
#ifndef NDEBUG
static cl::opt<bool> ExpensiveAsserts(
"ipt-expensive-asserts",
@@ -64,11 +68,13 @@ bool InstructionPrecedenceTracking::isPreceededBySpecialInstruction(
void InstructionPrecedenceTracking::fill(const BasicBlock *BB) {
FirstSpecialInsts.erase(BB);
- for (auto &I : *BB)
+ for (auto &I : *BB) {
+ NumInstScanned++;
if (isSpecialInstruction(&I)) {
FirstSpecialInsts[BB] = &I;
return;
}
+ }
// Mark this block as having no special instructions.
FirstSpecialInsts[BB] = nullptr;
@@ -107,8 +113,10 @@ void InstructionPrecedenceTracking::insertInstructionTo(const Instruction *Inst,
}
void InstructionPrecedenceTracking::removeInstruction(const Instruction *Inst) {
- if (isSpecialInstruction(Inst))
- FirstSpecialInsts.erase(Inst->getParent());
+ auto *BB = Inst->getParent();
+ assert(BB && "must be called before instruction is actually removed");
+ if (FirstSpecialInsts.count(BB) && FirstSpecialInsts[BB] == Inst)
+ FirstSpecialInsts.erase(BB);
}
void InstructionPrecedenceTracking::removeUsersOf(const Instruction *Inst) {
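The tightened invalidation in removeInstruction is a small cache-consistency point: a block's cached first special instruction only needs to be dropped when that exact instruction is removed. A minimal sketch of the policy, with made-up names and std::map standing in for the LLVM containers:
#include <cassert>
#include <map>
#include <string>
struct FirstSpecialCache {
  std::map<std::string, const int *> FirstSpecial; // block -> cached instruction
  void removeInstruction(const std::string &BB, const int *Inst) {
    auto It = FirstSpecial.find(BB);
    if (It != FirstSpecial.end() && It->second == Inst)
      FirstSpecial.erase(It); // invalidate only when the cached entry itself dies
  }
};
int main() {
  int I1 = 0, I2 = 0;
  FirstSpecialCache C;
  C.FirstSpecial["entry"] = &I1;
  C.removeInstruction("entry", &I2); // unrelated removal keeps the cache
  assert(C.FirstSpecial.count("entry") == 1);
  C.removeInstruction("entry", &I1); // removing the cached instruction invalidates
  assert(C.FirstSpecial.count("entry") == 0);
  return 0;
}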
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 23083bc8178e..864eeea4f8bf 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -70,8 +70,8 @@ static Value *SimplifyOrInst(Value *, Value *, const SimplifyQuery &, unsigned);
static Value *SimplifyXorInst(Value *, Value *, const SimplifyQuery &, unsigned);
static Value *SimplifyCastInst(unsigned, Value *, Type *,
const SimplifyQuery &, unsigned);
-static Value *SimplifyGEPInst(Type *, ArrayRef<Value *>, const SimplifyQuery &,
- unsigned);
+static Value *SimplifyGEPInst(Type *, ArrayRef<Value *>, bool,
+ const SimplifyQuery &, unsigned);
static Value *SimplifySelectInst(Value *, Value *, Value *,
const SimplifyQuery &, unsigned);
@@ -698,13 +698,12 @@ static Constant *stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V,
bool AllowNonInbounds = false) {
assert(V->getType()->isPtrOrPtrVectorTy());
- Type *IntIdxTy = DL.getIndexType(V->getType())->getScalarType();
- APInt Offset = APInt::getNullValue(IntIdxTy->getIntegerBitWidth());
+ APInt Offset = APInt::getZero(DL.getIndexTypeSizeInBits(V->getType()));
V = V->stripAndAccumulateConstantOffsets(DL, Offset, AllowNonInbounds);
// As that strip may trace through `addrspacecast`, need to sext or trunc
// the offset calculated.
- IntIdxTy = DL.getIndexType(V->getType())->getScalarType();
+ Type *IntIdxTy = DL.getIndexType(V->getType())->getScalarType();
Offset = Offset.sextOrTrunc(IntIdxTy->getIntegerBitWidth());
Constant *OffsetIntPtr = ConstantInt::get(IntIdxTy, Offset);
@@ -1407,8 +1406,7 @@ static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
match(Op0, m_c_Or(m_NUWShl(m_Value(X), m_APInt(ShLAmt)), m_Value(Y))) &&
*ShRAmt == *ShLAmt) {
const KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
- const unsigned Width = Op0->getType()->getScalarSizeInBits();
- const unsigned EffWidthY = Width - YKnown.countMinLeadingZeros();
+ const unsigned EffWidthY = YKnown.countMaxActiveBits();
if (ShRAmt->uge(EffWidthY))
return X;
}
@@ -1429,9 +1427,11 @@ static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
MaxRecurse))
return V;
- // all ones >>a X -> -1
+ // -1 >>a X --> -1
+ // (-1 << X) a>> X --> -1
// Do not return Op0 because it may contain undef elements if it's a vector.
- if (match(Op0, m_AllOnes()))
+ if (match(Op0, m_AllOnes()) ||
+ match(Op0, m_Shl(m_AllOnes(), m_Specific(Op1))))
return Constant::getAllOnesValue(Op0->getType());
// (X << A) >> A -> X
@@ -1765,7 +1765,7 @@ static Value *simplifyAndOrOfICmpsWithLimitConst(ICmpInst *Cmp0, ICmpInst *Cmp1,
if (match(Cmp0->getOperand(1), m_APInt(C)))
MinMaxC = HasNotOp ? ~*C : *C;
else if (isa<ConstantPointerNull>(Cmp0->getOperand(1)))
- MinMaxC = APInt::getNullValue(8);
+ MinMaxC = APInt::getZero(8);
else
return nullptr;
@@ -2040,24 +2040,32 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
if (match(Op1, m_c_Or(m_Specific(Op0), m_Value())))
return Op0;
+ // (X | Y) & (X | ~Y) --> X (commuted 8 ways)
+ Value *X, *Y;
+ if (match(Op0, m_c_Or(m_Value(X), m_Not(m_Value(Y)))) &&
+ match(Op1, m_c_Or(m_Deferred(X), m_Deferred(Y))))
+ return X;
+ if (match(Op1, m_c_Or(m_Value(X), m_Not(m_Value(Y)))) &&
+ match(Op0, m_c_Or(m_Deferred(X), m_Deferred(Y))))
+ return X;
+
if (Value *V = simplifyLogicOfAddSub(Op0, Op1, Instruction::And))
return V;
// A mask that only clears known zeros of a shifted value is a no-op.
- Value *X;
const APInt *Mask;
const APInt *ShAmt;
if (match(Op1, m_APInt(Mask))) {
// If all bits in the inverted and shifted mask are clear:
// and (shl X, ShAmt), Mask --> shl X, ShAmt
if (match(Op0, m_Shl(m_Value(X), m_APInt(ShAmt))) &&
- (~(*Mask)).lshr(*ShAmt).isNullValue())
+ (~(*Mask)).lshr(*ShAmt).isZero())
return Op0;
// If all bits in the inverted and shifted mask are clear:
// and (lshr X, ShAmt), Mask --> lshr X, ShAmt
if (match(Op0, m_LShr(m_Value(X), m_APInt(ShAmt))) &&
- (~(*Mask)).shl(*ShAmt).isNullValue())
+ (~(*Mask)).shl(*ShAmt).isZero())
return Op0;
}
@@ -2141,7 +2149,7 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
// if Mask = ((1 << effective_width_of(X)) - 1) << A
// SimplifyDemandedBits in InstCombine can optimize the general case.
// This pattern aims to help other passes for a common case.
- Value *Y, *XShifted;
+ Value *XShifted;
if (match(Op1, m_APInt(Mask)) &&
match(Op0, m_c_Or(m_CombineAnd(m_NUWShl(m_Value(X), m_APInt(ShAmt)),
m_Value(XShifted)),
@@ -2149,11 +2157,11 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
const unsigned Width = Op0->getType()->getScalarSizeInBits();
const unsigned ShftCnt = ShAmt->getLimitedValue(Width);
const KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
- const unsigned EffWidthY = Width - YKnown.countMinLeadingZeros();
+ const unsigned EffWidthY = YKnown.countMaxActiveBits();
if (EffWidthY <= ShftCnt) {
const KnownBits XKnown = computeKnownBits(X, Q.DL, 0, Q.AC, Q.CxtI,
Q.DT);
- const unsigned EffWidthX = Width - XKnown.countMinLeadingZeros();
+ const unsigned EffWidthX = XKnown.countMaxActiveBits();
const APInt EffBitsY = APInt::getLowBitsSet(Width, EffWidthY);
const APInt EffBitsX = APInt::getLowBitsSet(Width, EffWidthX) << ShftCnt;
// If the mask is extracting all bits from X or Y as is, we can skip
@@ -2257,6 +2265,19 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
match(Op0, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B)))))
return Op0;
+ // (A | B) | (A ^ B) --> A | B
+ // (B | A) | (A ^ B) --> B | A
+ if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
+ match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
+ return Op0;
+
+ // Commute the outer 'or' operands.
+ // (A ^ B) | (A | B) --> A | B
+ // (A ^ B) | (B | A) --> B | A
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_Or(m_Specific(A), m_Specific(B))))
+ return Op1;
+
// (~A & B) | ~(A | B) --> ~A
// (~A & B) | ~(B | A) --> ~A
// (B & ~A) | ~(A | B) --> ~A
@@ -2276,6 +2297,23 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
match(Op0, m_Not(m_c_Or(m_Specific(A), m_Specific(B)))))
return NotA;
+ // Rotated -1 is still -1:
+ // (-1 << X) | (-1 >> (C - X)) --> -1
+ // (-1 >> X) | (-1 << (C - X)) --> -1
+ // ...with C <= bitwidth (and commuted variants).
+ Value *X, *Y;
+ if ((match(Op0, m_Shl(m_AllOnes(), m_Value(X))) &&
+ match(Op1, m_LShr(m_AllOnes(), m_Value(Y)))) ||
+ (match(Op1, m_Shl(m_AllOnes(), m_Value(X))) &&
+ match(Op0, m_LShr(m_AllOnes(), m_Value(Y))))) {
+ const APInt *C;
+ if ((match(X, m_Sub(m_APInt(C), m_Specific(Y))) ||
+ match(Y, m_Sub(m_APInt(C), m_Specific(X)))) &&
+ C->ule(X->getType()->getScalarSizeInBits())) {
+ return ConstantInt::getAllOnesValue(X->getType());
+ }
+ }
+
if (Value *V = simplifyAndOrOfCmps(Q, Op0, Op1, false))
return V;
@@ -3090,7 +3128,7 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
// - C isn't zero.
if (Q.IIQ.hasNoSignedWrap(cast<OverflowingBinaryOperator>(LBO)) ||
Q.IIQ.hasNoUnsignedWrap(cast<OverflowingBinaryOperator>(LBO)) ||
- match(LHS, m_Shl(m_One(), m_Value())) || !C->isNullValue()) {
+ match(LHS, m_Shl(m_One(), m_Value())) || !C->isZero()) {
if (Pred == ICmpInst::ICMP_EQ)
return ConstantInt::getFalse(GetCompareTy(RHS));
if (Pred == ICmpInst::ICMP_NE)
@@ -3640,30 +3678,6 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
CRHS->getPointerOperand(), Q))
return C;
- if (GetElementPtrInst *GLHS = dyn_cast<GetElementPtrInst>(LHS)) {
- if (GEPOperator *GRHS = dyn_cast<GEPOperator>(RHS)) {
- if (GLHS->getPointerOperand() == GRHS->getPointerOperand() &&
- GLHS->hasAllConstantIndices() && GRHS->hasAllConstantIndices() &&
- (ICmpInst::isEquality(Pred) ||
- (GLHS->isInBounds() && GRHS->isInBounds() &&
- Pred == ICmpInst::getSignedPredicate(Pred)))) {
- // The bases are equal and the indices are constant. Build a constant
- // expression GEP with the same indices and a null base pointer to see
- // what constant folding can make out of it.
- Constant *Null = Constant::getNullValue(GLHS->getPointerOperandType());
- SmallVector<Value *, 4> IndicesLHS(GLHS->indices());
- Constant *NewLHS = ConstantExpr::getGetElementPtr(
- GLHS->getSourceElementType(), Null, IndicesLHS);
-
- SmallVector<Value *, 4> IndicesRHS(GRHS->idx_begin(), GRHS->idx_end());
- Constant *NewRHS = ConstantExpr::getGetElementPtr(
- GLHS->getSourceElementType(), Null, IndicesRHS);
- Constant *NewICmp = ConstantExpr::getICmp(Pred, NewLHS, NewRHS);
- return ConstantFoldConstant(NewICmp, Q.DL);
- }
- }
- }
-
// If the comparison is with the result of a select instruction, check whether
// comparing with either branch of the select always yields the same value.
if (isa<SelectInst>(LHS) || isa<SelectInst>(RHS))
@@ -3966,7 +3980,8 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
return PreventSelfSimplify(SimplifyGEPInst(GEP->getSourceElementType(),
- NewOps, Q, MaxRecurse - 1));
+ NewOps, GEP->isInBounds(), Q,
+ MaxRecurse - 1));
if (isa<SelectInst>(I))
return PreventSelfSimplify(
@@ -4080,6 +4095,22 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal,
std::swap(TrueVal, FalseVal);
}
+ // Check for integer min/max with a limit constant:
+ // X > MIN_INT ? X : MIN_INT --> X
+ // X < MAX_INT ? X : MAX_INT --> X
+ if (TrueVal->getType()->isIntOrIntVectorTy()) {
+ Value *X, *Y;
+ SelectPatternFlavor SPF =
+ matchDecomposedSelectPattern(cast<ICmpInst>(CondVal), TrueVal, FalseVal,
+ X, Y).Flavor;
+ if (SelectPatternResult::isMinOrMax(SPF) && Pred == getMinMaxPred(SPF)) {
+ APInt LimitC = getMinMaxLimit(getInverseMinMaxFlavor(SPF),
+ X->getType()->getScalarSizeInBits());
+ if (match(Y, m_SpecificInt(LimitC)))
+ return X;
+ }
+ }
+
if (Pred == ICmpInst::ICMP_EQ && match(CmpRHS, m_Zero())) {
Value *X;
const APInt *Y;
@@ -4210,14 +4241,27 @@ static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
return FalseVal;
}
- // select i1 Cond, i1 true, i1 false --> i1 Cond
assert(Cond->getType()->isIntOrIntVectorTy(1) &&
"Select must have bool or bool vector condition");
assert(TrueVal->getType() == FalseVal->getType() &&
"Select must have same types for true/false ops");
- if (Cond->getType() == TrueVal->getType() &&
- match(TrueVal, m_One()) && match(FalseVal, m_ZeroInt()))
- return Cond;
+
+ if (Cond->getType() == TrueVal->getType()) {
+ // select i1 Cond, i1 true, i1 false --> i1 Cond
+ if (match(TrueVal, m_One()) && match(FalseVal, m_ZeroInt()))
+ return Cond;
+
+ // (X || Y) && (X || !Y) --> X (commuted 8 ways)
+ Value *X, *Y;
+ if (match(FalseVal, m_ZeroInt())) {
+ if (match(Cond, m_c_LogicalOr(m_Value(X), m_Not(m_Value(Y)))) &&
+ match(TrueVal, m_c_LogicalOr(m_Specific(X), m_Specific(Y))))
+ return X;
+ if (match(TrueVal, m_c_LogicalOr(m_Value(X), m_Not(m_Value(Y)))) &&
+ match(Cond, m_c_LogicalOr(m_Specific(X), m_Specific(Y))))
+ return X;
+ }
+ }
// select ?, X, X -> X
if (TrueVal == FalseVal)
@@ -4295,7 +4339,7 @@ Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
/// Given operands for an GetElementPtrInst, see if we can fold the result.
/// If not, this returns null.
-static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
+static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops, bool InBounds,
const SimplifyQuery &Q, unsigned) {
// The type of the GEP pointer operand.
unsigned AS =
@@ -4396,14 +4440,14 @@ static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
// gep (gep V, C), (sub 0, V) -> C
if (match(Ops.back(),
m_Sub(m_Zero(), m_PtrToInt(m_Specific(StrippedBasePtr)))) &&
- !BasePtrOffset.isNullValue()) {
+ !BasePtrOffset.isZero()) {
auto *CI = ConstantInt::get(GEPTy->getContext(), BasePtrOffset);
return ConstantExpr::getIntToPtr(CI, GEPTy);
}
// gep (gep V, C), (xor V, -1) -> C-1
if (match(Ops.back(),
m_Xor(m_PtrToInt(m_Specific(StrippedBasePtr)), m_AllOnes())) &&
- !BasePtrOffset.isOneValue()) {
+ !BasePtrOffset.isOne()) {
auto *CI = ConstantInt::get(GEPTy->getContext(), BasePtrOffset - 1);
return ConstantExpr::getIntToPtr(CI, GEPTy);
}
@@ -4415,13 +4459,13 @@ static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
return nullptr;
auto *CE = ConstantExpr::getGetElementPtr(SrcTy, cast<Constant>(Ops[0]),
- Ops.slice(1));
+ Ops.slice(1), InBounds);
return ConstantFoldConstant(CE, Q.DL);
}
-Value *llvm::SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
+Value *llvm::SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops, bool InBounds,
const SimplifyQuery &Q) {
- return ::SimplifyGEPInst(SrcTy, Ops, Q, RecursionLimit);
+ return ::SimplifyGEPInst(SrcTy, Ops, InBounds, Q, RecursionLimit);
}
/// Given operands for an InsertValueInst, see if we can fold the result.
@@ -4891,6 +4935,11 @@ static Constant *simplifyFPOp(ArrayRef<Value *> Ops, FastMathFlags FMF,
return nullptr;
}
+// TODO: Move this out to a header file:
+static inline bool canIgnoreSNaN(fp::ExceptionBehavior EB, FastMathFlags FMF) {
+ return (EB == fp::ebIgnore || FMF.noNaNs());
+}
+
/// Given operands for an FAdd, see if we can fold the result. If not, this
/// returns null.
static Value *
@@ -4905,17 +4954,25 @@ SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
if (Constant *C = simplifyFPOp({Op0, Op1}, FMF, Q, ExBehavior, Rounding))
return C;
- if (!isDefaultFPEnvironment(ExBehavior, Rounding))
- return nullptr;
-
// fadd X, -0 ==> X
- if (match(Op1, m_NegZeroFP()))
- return Op0;
+ // With strict/constrained FP, we have these possible edge cases that do
+ // not simplify to Op0:
+ // fadd SNaN, -0.0 --> QNaN
+ // fadd +0.0, -0.0 --> -0.0 (but only with round toward negative)
+ if (canIgnoreSNaN(ExBehavior, FMF) &&
+ (!canRoundingModeBe(Rounding, RoundingMode::TowardNegative) ||
+ FMF.noSignedZeros()))
+ if (match(Op1, m_NegZeroFP()))
+ return Op0;
// fadd X, 0 ==> X, when we know X is not -0
- if (match(Op1, m_PosZeroFP()) &&
- (FMF.noSignedZeros() || CannotBeNegativeZero(Op0, Q.TLI)))
- return Op0;
+ if (canIgnoreSNaN(ExBehavior, FMF))
+ if (match(Op1, m_PosZeroFP()) &&
+ (FMF.noSignedZeros() || CannotBeNegativeZero(Op0, Q.TLI)))
+ return Op0;
+
+ if (!isDefaultFPEnvironment(ExBehavior, Rounding))
+ return nullptr;
// With nnan: -X + X --> 0.0 (and commuted variant)
// We don't have to explicitly exclude infinities (ninf): INF + -INF == NaN.
@@ -5457,6 +5514,9 @@ static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0,
if (match(Op0,
m_Intrinsic<Intrinsic::experimental_vector_reverse>(m_Value(X))))
return X;
+ // experimental.vector.reverse(splat(X)) -> splat(X)
+ if (isSplatValue(Op0))
+ return Op0;
break;
default:
break;
@@ -5772,13 +5832,32 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1,
static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) {
- // Intrinsics with no operands have some kind of side effect. Don't simplify.
- unsigned NumOperands = Call->getNumArgOperands();
- if (!NumOperands)
- return nullptr;
-
+ unsigned NumOperands = Call->arg_size();
Function *F = cast<Function>(Call->getCalledFunction());
Intrinsic::ID IID = F->getIntrinsicID();
+
+ // Most of the intrinsics with no operands have some kind of side effect.
+ // Don't simplify.
+ if (!NumOperands) {
+ switch (IID) {
+ case Intrinsic::vscale: {
+ // Call may not be inserted into the IR yet at point of calling simplify.
+ if (!Call->getParent() || !Call->getParent()->getParent())
+ return nullptr;
+ auto Attr = Call->getFunction()->getFnAttribute(Attribute::VScaleRange);
+ if (!Attr.isValid())
+ return nullptr;
+ unsigned VScaleMin, VScaleMax;
+ std::tie(VScaleMin, VScaleMax) = Attr.getVScaleRangeArgs();
+ if (VScaleMin == VScaleMax && VScaleMax != 0)
+ return ConstantInt::get(F->getReturnType(), VScaleMin);
+ return nullptr;
+ }
+ default:
+ return nullptr;
+ }
+ }
+
if (NumOperands == 1)
return simplifyUnaryIntrinsic(F, Call->getArgOperand(0), Q);
@@ -5814,9 +5893,18 @@ static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) {
if (match(ShAmtArg, m_APInt(ShAmtC))) {
// If there's effectively no shift, return the 1st arg or 2nd arg.
APInt BitWidth = APInt(ShAmtC->getBitWidth(), ShAmtC->getBitWidth());
- if (ShAmtC->urem(BitWidth).isNullValue())
+ if (ShAmtC->urem(BitWidth).isZero())
return Call->getArgOperand(IID == Intrinsic::fshl ? 0 : 1);
}
+
+ // Rotating zero by anything is zero.
+ if (match(Op0, m_Zero()) && match(Op1, m_Zero()))
+ return ConstantInt::getNullValue(F->getReturnType());
+
+ // Rotating -1 by anything is -1.
+ if (match(Op0, m_AllOnes()) && match(Op1, m_AllOnes()))
+ return ConstantInt::getAllOnesValue(F->getReturnType());
+
return nullptr;
}
case Intrinsic::experimental_constrained_fma: {
@@ -5939,7 +6027,7 @@ static Value *tryConstantFoldCall(CallBase *Call, const SimplifyQuery &Q) {
return nullptr;
SmallVector<Constant *, 4> ConstantArgs;
- unsigned NumArgs = Call->getNumArgOperands();
+ unsigned NumArgs = Call->arg_size();
ConstantArgs.reserve(NumArgs);
for (auto &Arg : Call->args()) {
Constant *C = dyn_cast<Constant>(&Arg);
@@ -5990,73 +6078,27 @@ Value *llvm::SimplifyFreezeInst(Value *Op0, const SimplifyQuery &Q) {
return ::SimplifyFreezeInst(Op0, Q);
}
-static Constant *ConstructLoadOperandConstant(Value *Op) {
- SmallVector<Value *, 4> Worklist;
- // Invalid IR in unreachable code may contain self-referential values. Don't infinitely loop.
- SmallPtrSet<Value *, 4> Visited;
- Worklist.push_back(Op);
- while (true) {
- Value *CurOp = Worklist.back();
- if (!Visited.insert(CurOp).second)
- return nullptr;
- if (isa<Constant>(CurOp))
- break;
- if (auto *BC = dyn_cast<BitCastOperator>(CurOp)) {
- Worklist.push_back(BC->getOperand(0));
- } else if (auto *GEP = dyn_cast<GEPOperator>(CurOp)) {
- for (unsigned I = 1; I != GEP->getNumOperands(); ++I) {
- if (!isa<Constant>(GEP->getOperand(I)))
- return nullptr;
- }
- Worklist.push_back(GEP->getOperand(0));
- } else if (auto *II = dyn_cast<IntrinsicInst>(CurOp)) {
- if (II->isLaunderOrStripInvariantGroup())
- Worklist.push_back(II->getOperand(0));
- else
- return nullptr;
- } else {
- return nullptr;
- }
- }
-
- Constant *NewOp = cast<Constant>(Worklist.pop_back_val());
- while (!Worklist.empty()) {
- Value *CurOp = Worklist.pop_back_val();
- if (isa<BitCastOperator>(CurOp)) {
- NewOp = ConstantExpr::getBitCast(NewOp, CurOp->getType());
- } else if (auto *GEP = dyn_cast<GEPOperator>(CurOp)) {
- SmallVector<Constant *> Idxs;
- Idxs.reserve(GEP->getNumOperands() - 1);
- for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
- Idxs.push_back(cast<Constant>(GEP->getOperand(I)));
- }
- NewOp = ConstantExpr::getGetElementPtr(GEP->getSourceElementType(), NewOp,
- Idxs, GEP->isInBounds(),
- GEP->getInRangeIndex());
- } else {
- assert(isa<IntrinsicInst>(CurOp) &&
- cast<IntrinsicInst>(CurOp)->isLaunderOrStripInvariantGroup() &&
- "expected invariant group intrinsic");
- NewOp = ConstantExpr::getBitCast(NewOp, CurOp->getType());
- }
- }
- return NewOp;
-}
-
static Value *SimplifyLoadInst(LoadInst *LI, Value *PtrOp,
const SimplifyQuery &Q) {
if (LI->isVolatile())
return nullptr;
- // Try to make the load operand a constant, specifically handle
- // invariant.group intrinsics.
+ APInt Offset(Q.DL.getIndexTypeSizeInBits(PtrOp->getType()), 0);
auto *PtrOpC = dyn_cast<Constant>(PtrOp);
- if (!PtrOpC)
- PtrOpC = ConstructLoadOperandConstant(PtrOp);
+ // Try to convert operand into a constant by stripping offsets while looking
+ // through invariant.group intrinsics. Don't bother if the underlying object
+ // is not constant, as calculating GEP offsets is expensive.
+ if (!PtrOpC && isa<Constant>(getUnderlyingObject(PtrOp))) {
+ PtrOp = PtrOp->stripAndAccumulateConstantOffsets(
+ Q.DL, Offset, /* AllowNonInbounds */ true,
+ /* AllowInvariantGroup */ true);
+ // Index size may have changed due to address space casts.
+ Offset = Offset.sextOrTrunc(Q.DL.getIndexTypeSizeInBits(PtrOp->getType()));
+ PtrOpC = dyn_cast<Constant>(PtrOp);
+ }
if (PtrOpC)
- return ConstantFoldLoadFromConstPtr(PtrOpC, LI->getType(), Q.DL);
-
+ return ConstantFoldLoadFromConstPtr(PtrOpC, LI->getType(), Offset, Q.DL);
return nullptr;
}
@@ -6156,8 +6198,9 @@ static Value *simplifyInstructionWithOperands(Instruction *I,
Result = SimplifySelectInst(NewOps[0], NewOps[1], NewOps[2], Q);
break;
case Instruction::GetElementPtr: {
- Result = SimplifyGEPInst(cast<GetElementPtrInst>(I)->getSourceElementType(),
- NewOps, Q);
+ auto *GEPI = cast<GetElementPtrInst>(I);
+ Result = SimplifyGEPInst(GEPI->getSourceElementType(), NewOps,
+ GEPI->isInBounds(), Q);
break;
}
case Instruction::InsertValue: {
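Several of the folds added in this file are plain bit identities. A short standalone check over 32-bit values (plain C++ rather than LLVM IR; C++20 for std::rotl) makes them easy to verify:
#include <bit>
#include <cassert>
#include <cstdint>
int main() {
  const uint32_t Samples[] = {0u, 1u, 0x80000000u, 0xdeadbeefu, 0xffffffffu};
  for (uint32_t X : Samples)
    for (uint32_t Y : Samples) {
      // (X | Y) & (X | ~Y) --> X
      assert(((X | Y) & (X | ~Y)) == X);
      // (A | B) | (A ^ B) --> A | B
      assert(((X | Y) | (X ^ Y)) == (X | Y));
    }
  // fshl/fshr special case: rotating -1 by anything is still -1.
  for (int K = 0; K < 32; ++K)
    assert(std::rotl(0xffffffffu, K) == 0xffffffffu);
  return 0;
}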
diff --git a/llvm/lib/Analysis/LazyCallGraph.cpp b/llvm/lib/Analysis/LazyCallGraph.cpp
index 8f87552fca1f..0007c54b16d0 100644
--- a/llvm/lib/Analysis/LazyCallGraph.cpp
+++ b/llvm/lib/Analysis/LazyCallGraph.cpp
@@ -220,8 +220,7 @@ bool LazyCallGraph::invalidate(Module &, const PreservedAnalyses &PA,
// Check whether the analysis, all analyses on functions, or the function's
// CFG have been preserved.
auto PAC = PA.getChecker<llvm::LazyCallGraphAnalysis>();
- return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Module>>() ||
- PAC.preservedSet<CFGAnalyses>());
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Module>>());
}
LazyCallGraph &LazyCallGraph::operator=(LazyCallGraph &&G) {
@@ -1962,6 +1961,29 @@ void LazyCallGraph::buildRefSCCs() {
});
}
+void LazyCallGraph::visitReferences(SmallVectorImpl<Constant *> &Worklist,
+ SmallPtrSetImpl<Constant *> &Visited,
+ function_ref<void(Function &)> Callback) {
+ while (!Worklist.empty()) {
+ Constant *C = Worklist.pop_back_val();
+
+ if (Function *F = dyn_cast<Function>(C)) {
+ if (!F->isDeclaration())
+ Callback(*F);
+ continue;
+ }
+
+ // blockaddresses are weird and don't participate in the call graph anyway,
+ // skip them.
+ if (isa<BlockAddress>(C))
+ continue;
+
+ for (Value *Op : C->operand_values())
+ if (Visited.insert(cast<Constant>(Op)).second)
+ Worklist.push_back(cast<Constant>(Op));
+ }
+}
+
AnalysisKey LazyCallGraphAnalysis::Key;
LazyCallGraphPrinterPass::LazyCallGraphPrinterPass(raw_ostream &OS) : OS(OS) {}
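The new visitReferences helper is a standard worklist-plus-visited walk over constant operands, calling back on defined functions and skipping blockaddresses. A non-LLVM sketch of the pattern, with integer node ids standing in for Constant pointers:
#include <cassert>
#include <set>
#include <vector>
int main() {
  // Tiny operand graph with a shared node: 0 -> {1, 2}, 1 -> {3}, 2 -> {3}.
  std::vector<std::vector<int>> Operands = {{1, 2}, {3}, {3}, {}};
  std::vector<int> Worklist = {0};
  std::set<int> Visited = {0};
  int Processed = 0;
  while (!Worklist.empty()) {
    int C = Worklist.back();
    Worklist.pop_back();
    ++Processed;
    for (int Op : Operands[C])
      if (Visited.insert(Op).second) // enqueue each operand at most once
        Worklist.push_back(Op);
  }
  assert(Processed == 4); // the shared node 3 is processed exactly once
  return 0;
}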
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index 1dababafb8a6..50fa169c2081 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -126,7 +126,7 @@ static ValueLatticeElement intersect(const ValueLatticeElement &A,
// Note: An empty range is implicitly converted to unknown or undef depending
// on MayIncludeUndef internally.
return ValueLatticeElement::getRange(
- std::move(Range), /*MayIncludeUndef=*/A.isConstantRangeIncludingUndef() |
+ std::move(Range), /*MayIncludeUndef=*/A.isConstantRangeIncludingUndef() ||
B.isConstantRangeIncludingUndef());
}
@@ -832,7 +832,7 @@ Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueSelect(
};
}();
return ValueLatticeElement::getRange(
- ResultCR, TrueVal.isConstantRangeIncludingUndef() |
+ ResultCR, TrueVal.isConstantRangeIncludingUndef() ||
FalseVal.isConstantRangeIncludingUndef());
}
@@ -846,7 +846,7 @@ Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueSelect(
}
if (SPR.Flavor == SPF_NABS) {
- ConstantRange Zero(APInt::getNullValue(TrueCR.getBitWidth()));
+ ConstantRange Zero(APInt::getZero(TrueCR.getBitWidth()));
if (LHS == SI->getTrueValue())
return ValueLatticeElement::getRange(
Zero.sub(TrueCR.abs()), FalseVal.isConstantRangeIncludingUndef());
@@ -1117,12 +1117,11 @@ static ValueLatticeElement getValueFromICmpCondition(Value *Val, ICmpInst *ICI,
}
// If (Val & Mask) != 0 then the value must be larger than the lowest set
// bit of Mask.
- if (EdgePred == ICmpInst::ICMP_NE && !Mask->isNullValue() &&
- C->isNullValue()) {
+ if (EdgePred == ICmpInst::ICMP_NE && !Mask->isZero() && C->isZero()) {
unsigned BitWidth = Ty->getIntegerBitWidth();
return ValueLatticeElement::getRange(ConstantRange::getNonEmpty(
APInt::getOneBitSet(BitWidth, Mask->countTrailingZeros()),
- APInt::getNullValue(BitWidth)));
+ APInt::getZero(BitWidth)));
}
}
@@ -1780,62 +1779,62 @@ LazyValueInfo::getPredicateAt(unsigned Pred, Value *V, Constant *C,
// We could consider extending this to search further backwards through the
// CFG and/or value graph, but there are non-obvious compile time vs quality
// tradeoffs.
- if (CxtI) {
- BasicBlock *BB = CxtI->getParent();
-
- // Function entry or an unreachable block. Bail to avoid confusing
- // analysis below.
- pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
- if (PI == PE)
- return Unknown;
-
- // If V is a PHI node in the same block as the context, we need to ask
- // questions about the predicate as applied to the incoming value along
- // each edge. This is useful for eliminating cases where the predicate is
- // known along all incoming edges.
- if (auto *PHI = dyn_cast<PHINode>(V))
- if (PHI->getParent() == BB) {
- Tristate Baseline = Unknown;
- for (unsigned i = 0, e = PHI->getNumIncomingValues(); i < e; i++) {
- Value *Incoming = PHI->getIncomingValue(i);
- BasicBlock *PredBB = PHI->getIncomingBlock(i);
- // Note that PredBB may be BB itself.
- Tristate Result = getPredicateOnEdge(Pred, Incoming, C, PredBB, BB,
- CxtI);
-
- // Keep going as long as we've seen a consistent known result for
- // all inputs.
- Baseline = (i == 0) ? Result /* First iteration */
- : (Baseline == Result ? Baseline : Unknown); /* All others */
- if (Baseline == Unknown)
- break;
- }
- if (Baseline != Unknown)
- return Baseline;
+ BasicBlock *BB = CxtI->getParent();
+
+ // Function entry or an unreachable block. Bail to avoid confusing
+ // analysis below.
+ pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+ if (PI == PE)
+ return Unknown;
+
+ // If V is a PHI node in the same block as the context, we need to ask
+ // questions about the predicate as applied to the incoming value along
+ // each edge. This is useful for eliminating cases where the predicate is
+ // known along all incoming edges.
+ if (auto *PHI = dyn_cast<PHINode>(V))
+ if (PHI->getParent() == BB) {
+ Tristate Baseline = Unknown;
+ for (unsigned i = 0, e = PHI->getNumIncomingValues(); i < e; i++) {
+ Value *Incoming = PHI->getIncomingValue(i);
+ BasicBlock *PredBB = PHI->getIncomingBlock(i);
+ // Note that PredBB may be BB itself.
+ Tristate Result =
+ getPredicateOnEdge(Pred, Incoming, C, PredBB, BB, CxtI);
+
+ // Keep going as long as we've seen a consistent known result for
+ // all inputs.
+ Baseline = (i == 0) ? Result /* First iteration */
+ : (Baseline == Result ? Baseline
+ : Unknown); /* All others */
+ if (Baseline == Unknown)
+ break;
}
+ if (Baseline != Unknown)
+ return Baseline;
+ }
- // For a comparison where the V is outside this block, it's possible
- // that we've branched on it before. Look to see if the value is known
- // on all incoming edges.
- if (!isa<Instruction>(V) ||
- cast<Instruction>(V)->getParent() != BB) {
- // For predecessor edge, determine if the comparison is true or false
- // on that edge. If they're all true or all false, we can conclude
- // the value of the comparison in this block.
- Tristate Baseline = getPredicateOnEdge(Pred, V, C, *PI, BB, CxtI);
- if (Baseline != Unknown) {
- // Check that all remaining incoming values match the first one.
- while (++PI != PE) {
- Tristate Ret = getPredicateOnEdge(Pred, V, C, *PI, BB, CxtI);
- if (Ret != Baseline) break;
- }
- // If we terminated early, then one of the values didn't match.
- if (PI == PE) {
- return Baseline;
- }
+ // For a comparison where the V is outside this block, it's possible
+ // that we've branched on it before. Look to see if the value is known
+ // on all incoming edges.
+ if (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != BB) {
+ // For predecessor edge, determine if the comparison is true or false
+ // on that edge. If they're all true or all false, we can conclude
+ // the value of the comparison in this block.
+ Tristate Baseline = getPredicateOnEdge(Pred, V, C, *PI, BB, CxtI);
+ if (Baseline != Unknown) {
+ // Check that all remaining incoming values match the first one.
+ while (++PI != PE) {
+ Tristate Ret = getPredicateOnEdge(Pred, V, C, *PI, BB, CxtI);
+ if (Ret != Baseline)
+ break;
+ }
+ // If we terminated early, then one of the values didn't match.
+ if (PI == PE) {
+ return Baseline;
}
}
}
+
return Unknown;
}
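The range built above for the (Val & Mask) != 0, C == 0 case encodes a simple fact: any value that shares a set bit with Mask is at least Mask's lowest set bit. A quick exhaustive check in plain C++ over 8-bit values:
#include <cassert>
int main() {
  const unsigned Mask = 0b00101000u; // lowest set bit is 1 << 3 == 8
  for (unsigned Val = 0; Val < 256; ++Val)
    if ((Val & Mask) != 0)
      assert(Val >= 8u); // matches the range starting at the lowest set bit
  return 0;
}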
diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp
index 4de5e1e06c7e..f9a7a5bdf434 100644
--- a/llvm/lib/Analysis/Lint.cpp
+++ b/llvm/lib/Analysis/Lint.cpp
@@ -235,7 +235,7 @@ void Lint::visitCallBase(CallBase &I) {
for (auto BI = I.arg_begin(); BI != AE; ++BI, ++ArgNo) {
// Skip ByVal arguments since they will be memcpy'd to the callee's
// stack so we're not really passing the pointer anyway.
- if (PAL.hasParamAttribute(ArgNo, Attribute::ByVal))
+ if (PAL.hasParamAttr(ArgNo, Attribute::ByVal))
continue;
// If both arguments are readonly, they have no dependence.
if (Formal->onlyReadsMemory() && I.onlyReadsMemory(ArgNo))
@@ -268,7 +268,7 @@ void Lint::visitCallBase(CallBase &I) {
for (Value *Arg : I.args()) {
// Skip ByVal arguments since they will be memcpy'd to the callee's
// stack anyway.
- if (PAL.hasParamAttribute(ArgNo++, Attribute::ByVal))
+ if (PAL.hasParamAttr(ArgNo++, Attribute::ByVal))
continue;
Value *Obj = findValue(Arg, /*OffsetOk=*/true);
Assert(!isa<AllocaInst>(Obj),
@@ -715,6 +715,7 @@ PreservedAnalyses LintPass::run(Function &F, FunctionAnalysisManager &AM) {
return PreservedAnalyses::all();
}
+namespace {
class LintLegacyPass : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid
@@ -733,6 +734,7 @@ public:
}
void print(raw_ostream &O, const Module *M) const override {}
};
+} // namespace
char LintLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(LintLegacyPass, "lint", "Statically lint-checks LLVM IR",
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 1c55f485aa76..0fbf1db0685d 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -147,7 +147,7 @@ static bool isDereferenceableAndAlignedPointer(
Alignment, Size, DL, CtxI, DT,
TLI, Visited, MaxDepth);
- if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(V))
+ if (const AddrSpaceCastOperator *ASC = dyn_cast<AddrSpaceCastOperator>(V))
return isDereferenceableAndAlignedPointer(ASC->getOperand(0), Alignment,
Size, DL, CtxI, DT, TLI,
Visited, MaxDepth);
@@ -451,8 +451,8 @@ static bool areNonOverlapSameBaseLoadAndStore(const Value *LoadPtr,
const Value *StorePtr,
Type *StoreTy,
const DataLayout &DL) {
- APInt LoadOffset(DL.getTypeSizeInBits(LoadPtr->getType()), 0);
- APInt StoreOffset(DL.getTypeSizeInBits(StorePtr->getType()), 0);
+ APInt LoadOffset(DL.getIndexTypeSizeInBits(LoadPtr->getType()), 0);
+ APInt StoreOffset(DL.getIndexTypeSizeInBits(StorePtr->getType()), 0);
const Value *LoadBase = LoadPtr->stripAndAccumulateConstantOffsets(
DL, LoadOffset, /* AllowNonInbounds */ false);
const Value *StoreBase = StorePtr->stripAndAccumulateConstantOffsets(
@@ -511,8 +511,11 @@ static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
if (CastInst::isBitOrNoopPointerCastable(Val->getType(), AccessTy, DL))
return Val;
- if (auto *C = dyn_cast<Constant>(Val))
- return ConstantFoldLoadThroughBitcast(C, AccessTy, DL);
+ TypeSize StoreSize = DL.getTypeStoreSize(Val->getType());
+ TypeSize LoadSize = DL.getTypeStoreSize(AccessTy);
+ if (TypeSize::isKnownLE(LoadSize, StoreSize))
+ if (auto *C = dyn_cast<Constant>(Val))
+ return ConstantFoldLoadFromConst(C, AccessTy, DL);
}
return nullptr;
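The TypeSize guard added to getAvailableLoadStore captures a byte-level rule: a stored constant can only be forwarded to a load that reads no more bytes than the store wrote. A small sketch of why, using memcpy in place of IR load/store:
#include <cassert>
#include <cstdint>
#include <cstring>
int main() {
  unsigned char Buf[8] = {};
  const uint32_t Stored = 0x01020304u;
  std::memcpy(Buf, &Stored, sizeof(Stored)); // 4-byte "store"
  uint32_t Load32;
  std::memcpy(&Load32, Buf, sizeof(Load32)); // 4-byte load: fully covered
  assert(Load32 == Stored);
  uint64_t Load64;
  std::memcpy(&Load64, Buf, sizeof(Load64)); // 8-byte load: the extra bytes come
  (void)Load64;                              // from elsewhere in the buffer, so it
  return 0;                                  // cannot simply be replaced by Stored
}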
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index a239928ecf38..f9bd7167317f 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -142,13 +142,12 @@ Value *llvm::stripIntegerCast(Value *V) {
const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
const ValueToValueMap &PtrToStride,
- Value *Ptr, Value *OrigPtr) {
+ Value *Ptr) {
const SCEV *OrigSCEV = PSE.getSCEV(Ptr);
// If there is an entry in the map return the SCEV of the pointer with the
// symbolic stride replaced by one.
- ValueToValueMap::const_iterator SI =
- PtrToStride.find(OrigPtr ? OrigPtr : Ptr);
+ ValueToValueMap::const_iterator SI = PtrToStride.find(Ptr);
if (SI == PtrToStride.end())
// For a non-symbolic stride, just return the original expression.
return OrigSCEV;
@@ -659,7 +658,8 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE,
if (PSE.getSE()->isLoopInvariant(PtrScev, L))
return true;
- int64_t Stride = getPtrStride(PSE, Ptr, L, Strides);
+ Type *AccessTy = Ptr->getType()->getPointerElementType();
+ int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides);
if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
return true;
@@ -1026,15 +1026,17 @@ static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
}
/// Check whether the access through \p Ptr has a constant stride.
-int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr,
- const Loop *Lp, const ValueToValueMap &StridesMap,
- bool Assume, bool ShouldCheckWrap) {
+int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy,
+ Value *Ptr, const Loop *Lp,
+ const ValueToValueMap &StridesMap, bool Assume,
+ bool ShouldCheckWrap) {
Type *Ty = Ptr->getType();
assert(Ty->isPointerTy() && "Unexpected non-ptr");
+ unsigned AddrSpace = Ty->getPointerAddressSpace();
- // Make sure that the pointer does not point to aggregate types.
- auto *PtrTy = cast<PointerType>(Ty);
- if (PtrTy->getElementType()->isAggregateType()) {
+ // Make sure we're not accessing an aggregate type.
+ // TODO: Why? This doesn't make any sense.
+ if (AccessTy->isAggregateType()) {
LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a pointer to a scalar type"
<< *Ptr << "\n");
return 0;
@@ -1071,8 +1073,7 @@ int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr,
PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW) ||
isNoWrapAddRec(Ptr, AR, PSE, Lp);
if (!IsNoWrapAddRec && !IsInBoundsGEP &&
- NullPointerIsDefined(Lp->getHeader()->getParent(),
- PtrTy->getAddressSpace())) {
+ NullPointerIsDefined(Lp->getHeader()->getParent(), AddrSpace)) {
if (Assume) {
PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
IsNoWrapAddRec = true;
@@ -1100,7 +1101,7 @@ int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr,
}
auto &DL = Lp->getHeader()->getModule()->getDataLayout();
- int64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
+ int64_t Size = DL.getTypeAllocSize(AccessTy);
const APInt &APStepVal = C->getAPInt();
// Huge step value - give up.
@@ -1120,7 +1121,7 @@ int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr,
// zero we know that this won't happen without triggering undefined behavior.
if (!IsNoWrapAddRec && Stride != 1 && Stride != -1 &&
(IsInBoundsGEP || !NullPointerIsDefined(Lp->getHeader()->getParent(),
- PtrTy->getAddressSpace()))) {
+ AddrSpace))) {
if (Assume) {
// We can avoid this case by adding a run-time check.
LLVM_DEBUG(dbgs() << "LAA: Non unit strided pointer which is not either "
@@ -1262,6 +1263,47 @@ bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
return Diff && *Diff == 1;
}
+static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
+ function_ref<void(Value *)> AddPointer) {
+ SmallPtrSet<Value *, 8> Visited;
+ SmallVector<Value *> WorkList;
+ WorkList.push_back(StartPtr);
+
+ while (!WorkList.empty()) {
+ Value *Ptr = WorkList.pop_back_val();
+ if (!Visited.insert(Ptr).second)
+ continue;
+ auto *PN = dyn_cast<PHINode>(Ptr);
+ // SCEV does not look through non-header PHIs inside the loop. Such phis
+ // can be analyzed by adding separate accesses for each incoming pointer
+ // value.
+ if (PN && InnermostLoop.contains(PN->getParent()) &&
+ PN->getParent() != InnermostLoop.getHeader()) {
+ for (const Use &Inc : PN->incoming_values())
+ WorkList.push_back(Inc);
+ } else
+ AddPointer(Ptr);
+ }
+}
+
+void MemoryDepChecker::addAccess(StoreInst *SI) {
+ visitPointers(SI->getPointerOperand(), *InnermostLoop,
+ [this, SI](Value *Ptr) {
+ Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
+ InstMap.push_back(SI);
+ ++AccessIdx;
+ });
+}
+
+void MemoryDepChecker::addAccess(LoadInst *LI) {
+ visitPointers(LI->getPointerOperand(), *InnermostLoop,
+ [this, LI](Value *Ptr) {
+ Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
+ InstMap.push_back(LI);
+ ++AccessIdx;
+ });
+}
+
MemoryDepChecker::VectorizationSafetyStatus
MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
switch (Type) {
@@ -1478,6 +1520,8 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
Value *BPtr = B.getPointer();
bool AIsWrite = A.getInt();
bool BIsWrite = B.getInt();
+ Type *ATy = APtr->getType()->getPointerElementType();
+ Type *BTy = BPtr->getType()->getPointerElementType();
// Two reads are independent.
if (!AIsWrite && !BIsWrite)
@@ -1488,8 +1532,10 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
BPtr->getType()->getPointerAddressSpace())
return Dependence::Unknown;
- int64_t StrideAPtr = getPtrStride(PSE, APtr, InnermostLoop, Strides, true);
- int64_t StrideBPtr = getPtrStride(PSE, BPtr, InnermostLoop, Strides, true);
+ int64_t StrideAPtr =
+ getPtrStride(PSE, ATy, APtr, InnermostLoop, Strides, true);
+ int64_t StrideBPtr =
+ getPtrStride(PSE, BTy, BPtr, InnermostLoop, Strides, true);
const SCEV *Src = PSE.getSCEV(APtr);
const SCEV *Sink = PSE.getSCEV(BPtr);
@@ -1498,6 +1544,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// dependence.
if (StrideAPtr < 0) {
std::swap(APtr, BPtr);
+ std::swap(ATy, BTy);
std::swap(Src, Sink);
std::swap(AIsWrite, BIsWrite);
std::swap(AIdx, BIdx);
@@ -1519,8 +1566,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
return Dependence::Unknown;
}
- Type *ATy = APtr->getType()->getPointerElementType();
- Type *BTy = BPtr->getType()->getPointerElementType();
auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
uint64_t Stride = std::abs(StrideAPtr);
@@ -1958,7 +2003,11 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
if (blockNeedsPredication(ST->getParent(), TheLoop, DT))
Loc.AATags.TBAA = nullptr;
- Accesses.addStore(Loc);
+ visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
+ [&Accesses, Loc](Value *Ptr) {
+ MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
+ Accesses.addStore(NewLoc);
+ });
}
}
@@ -1982,7 +2031,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
// words may be written to the same address.
bool IsReadOnlyPtr = false;
if (Seen.insert(Ptr).second ||
- !getPtrStride(*PSE, Ptr, TheLoop, SymbolicStrides)) {
+ !getPtrStride(*PSE, LD->getType(), Ptr, TheLoop, SymbolicStrides)) {
++NumReads;
IsReadOnlyPtr = true;
}
@@ -2002,7 +2051,11 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
if (blockNeedsPredication(LD->getParent(), TheLoop, DT))
Loc.AATags.TBAA = nullptr;
- Accesses.addLoad(Loc, IsReadOnlyPtr);
+ visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
+ [&Accesses, Loc, IsReadOnlyPtr](Value *Ptr) {
+ MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
+ Accesses.addLoad(NewLoc, IsReadOnlyPtr);
+ });
}
// If we write (or read-write) to a single destination and there are no
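The new visitPointers walk targets loops where the address used by an access is merged from several pointers inside the loop body. An illustrative C++ shape (whether it lowers to a pointer PHI in a non-header block or to a select depends on later simplification):
#include <cstddef>
void split_store(float *A, float *B, const float *C, std::size_t N) {
  for (std::size_t I = 0; I < N; ++I) {
    float *Dst;
    if (C[I] > 0.0f)   // the two incoming pointers meet in a block that is
      Dst = A + I;     // not the loop header, so SCEV cannot look through
    else               // the merge; the analysis now registers an access
      Dst = B + I;     // for each incoming pointer instead
    *Dst = C[I] * 2.0f;
  }
}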
diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
index 8a613647bbea..7b895d8a5dc2 100644
--- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
@@ -30,6 +30,7 @@
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Delinearization.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -290,8 +291,8 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L,
const SCEV *Coeff = getLastCoefficient();
const SCEV *ElemSize = Sizes.back();
const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize);
- const SCEV *CacheLineSize = SE.getConstant(Stride->getType(), CLS);
Type *WiderType = SE.getWiderType(Stride->getType(), TripCount->getType());
+ const SCEV *CacheLineSize = SE.getConstant(WiderType, CLS);
if (SE.isKnownNegative(Stride))
Stride = SE.getNegativeSCEV(Stride);
Stride = SE.getNoopOrAnyExtend(Stride, WiderType);
@@ -344,8 +345,8 @@ bool IndexedReference::delinearize(const LoopInfo &LI) {
LLVM_DEBUG(dbgs().indent(2) << "In Loop '" << L->getName()
<< "', AccessFn: " << *AccessFn << "\n");
- SE.delinearize(AccessFn, Subscripts, Sizes,
- SE.getElementSize(&StoreOrLoadInst));
+ llvm::delinearize(SE, AccessFn, Subscripts, Sizes,
+ SE.getElementSize(&StoreOrLoadInst));
if (Subscripts.empty() || Sizes.empty() ||
Subscripts.size() != Sizes.size()) {
@@ -425,9 +426,7 @@ bool IndexedReference::isConsecutive(const Loop &L, unsigned CLS) const {
const SCEV *IndexedReference::getLastCoefficient() const {
const SCEV *LastSubscript = getLastSubscript();
- assert(isa<SCEVAddRecExpr>(LastSubscript) &&
- "Expecting a SCEV add recurrence expression");
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LastSubscript);
+ auto *AR = cast<SCEVAddRecExpr>(LastSubscript);
return AR->getStepRecurrence(SE);
}
@@ -522,10 +521,9 @@ void CacheCost::calculateCacheFootprint() {
LLVM_DEBUG(dbgs() << "COMPUTING LOOP CACHE COSTS\n");
for (const Loop *L : Loops) {
- assert((std::find_if(LoopCosts.begin(), LoopCosts.end(),
- [L](const LoopCacheCostTy &LCC) {
- return LCC.first == L;
- }) == LoopCosts.end()) &&
+ assert(llvm::none_of(
+ LoopCosts,
+ [L](const LoopCacheCostTy &LCC) { return LCC.first == L; }) &&
"Should not add duplicate element");
CacheCostTy LoopCost = computeLoopCacheCost(*L, RefGroups);
LoopCosts.push_back(std::make_pair(L, LoopCost));
diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp
index 66aab4c195c8..b35fb2a190f6 100644
--- a/llvm/lib/Analysis/LoopInfo.cpp
+++ b/llvm/lib/Analysis/LoopInfo.cpp
@@ -301,15 +301,16 @@ PHINode *Loop::getInductionVariable(ScalarEvolution &SE) const {
if (!CmpInst)
return nullptr;
- Instruction *LatchCmpOp0 = dyn_cast<Instruction>(CmpInst->getOperand(0));
- Instruction *LatchCmpOp1 = dyn_cast<Instruction>(CmpInst->getOperand(1));
+ Value *LatchCmpOp0 = CmpInst->getOperand(0);
+ Value *LatchCmpOp1 = CmpInst->getOperand(1);
for (PHINode &IndVar : Header->phis()) {
InductionDescriptor IndDesc;
if (!InductionDescriptor::isInductionPHI(&IndVar, this, &SE, IndDesc))
continue;
- Instruction *StepInst = IndDesc.getInductionBinOp();
+ BasicBlock *Latch = getLoopLatch();
+ Value *StepInst = IndVar.getIncomingValueForBlock(Latch);
// case 1:
// IndVar = phi[{InitialValue, preheader}, {StepInst, latch}]
@@ -1102,6 +1103,11 @@ llvm::Optional<int> llvm::getOptionalIntLoopAttribute(const Loop *TheLoop,
return IntMD->getSExtValue();
}
+int llvm::getIntLoopAttribute(const Loop *TheLoop, StringRef Name,
+ int Default) {
+ return getOptionalIntLoopAttribute(TheLoop, Name).getValueOr(Default);
+}
+
static const char *LLVMLoopMustProgress = "llvm.loop.mustprogress";
bool llvm::hasMustProgress(const Loop *L) {
diff --git a/llvm/lib/Analysis/LoopNestAnalysis.cpp b/llvm/lib/Analysis/LoopNestAnalysis.cpp
index 2649ed60f762..675bb7a7749c 100644
--- a/llvm/lib/Analysis/LoopNestAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopNestAnalysis.cpp
@@ -50,8 +50,66 @@ std::unique_ptr<LoopNest> LoopNest::getLoopNest(Loop &Root,
return std::make_unique<LoopNest>(Root, SE);
}
+static CmpInst *getOuterLoopLatchCmp(const Loop &OuterLoop) {
+
+ const BasicBlock *Latch = OuterLoop.getLoopLatch();
+ assert(Latch && "Expecting a valid loop latch");
+
+ const BranchInst *BI = dyn_cast<BranchInst>(Latch->getTerminator());
+ assert(BI && BI->isConditional() &&
+ "Expecting loop latch terminator to be a branch instruction");
+
+ CmpInst *OuterLoopLatchCmp = dyn_cast<CmpInst>(BI->getCondition());
+ DEBUG_WITH_TYPE(
+ VerboseDebug, if (OuterLoopLatchCmp) {
+ dbgs() << "Outer loop latch compare instruction: " << *OuterLoopLatchCmp
+ << "\n";
+ });
+ return OuterLoopLatchCmp;
+}
+
+static CmpInst *getInnerLoopGuardCmp(const Loop &InnerLoop) {
+
+ BranchInst *InnerGuard = InnerLoop.getLoopGuardBranch();
+ CmpInst *InnerLoopGuardCmp =
+ (InnerGuard) ? dyn_cast<CmpInst>(InnerGuard->getCondition()) : nullptr;
+
+ DEBUG_WITH_TYPE(
+ VerboseDebug, if (InnerLoopGuardCmp) {
+ dbgs() << "Inner loop guard compare instruction: " << *InnerLoopGuardCmp
+ << "\n";
+ });
+ return InnerLoopGuardCmp;
+}
+
+static bool checkSafeInstruction(const Instruction &I,
+ const CmpInst *InnerLoopGuardCmp,
+ const CmpInst *OuterLoopLatchCmp,
+ Optional<Loop::LoopBounds> OuterLoopLB) {
+
+ bool IsAllowed =
+ isSafeToSpeculativelyExecute(&I) || isa<PHINode>(I) || isa<BranchInst>(I);
+ if (!IsAllowed)
+ return false;
+ // The only binary instruction allowed is the outer loop step instruction,
+ // the only comparison instructions allowed are the inner loop guard
+ // compare instruction and the outer loop latch compare instruction.
+ if ((isa<BinaryOperator>(I) && &I != &OuterLoopLB->getStepInst()) ||
+ (isa<CmpInst>(I) && &I != OuterLoopLatchCmp && &I != InnerLoopGuardCmp)) {
+ return false;
+ }
+ return true;
+}
+
bool LoopNest::arePerfectlyNested(const Loop &OuterLoop, const Loop &InnerLoop,
ScalarEvolution &SE) {
+ return (analyzeLoopNestForPerfectNest(OuterLoop, InnerLoop, SE) ==
+ PerfectLoopNest);
+}
+
+LoopNest::LoopNestEnum LoopNest::analyzeLoopNestForPerfectNest(
+ const Loop &OuterLoop, const Loop &InnerLoop, ScalarEvolution &SE) {
+
assert(!OuterLoop.isInnermost() && "Outer loop should have subloops");
assert(!InnerLoop.isOutermost() && "Inner loop should have a parent");
LLVM_DEBUG(dbgs() << "Checking whether loop '" << OuterLoop.getName()
@@ -66,7 +124,7 @@ bool LoopNest::arePerfectlyNested(const Loop &OuterLoop, const Loop &InnerLoop,
// the outer loop latch.
if (!checkLoopsStructure(OuterLoop, InnerLoop, SE)) {
LLVM_DEBUG(dbgs() << "Not perfectly nested: invalid loop structure.\n");
- return false;
+ return InvalidLoopStructure;
}
// Bail out if we cannot retrieve the outer loop bounds.
@@ -74,33 +132,11 @@ bool LoopNest::arePerfectlyNested(const Loop &OuterLoop, const Loop &InnerLoop,
if (OuterLoopLB == None) {
LLVM_DEBUG(dbgs() << "Cannot compute loop bounds of OuterLoop: "
<< OuterLoop << "\n";);
- return false;
+ return OuterLoopLowerBoundUnknown;
}
- // Identify the outer loop latch comparison instruction.
- const BasicBlock *Latch = OuterLoop.getLoopLatch();
- assert(Latch && "Expecting a valid loop latch");
- const BranchInst *BI = dyn_cast<BranchInst>(Latch->getTerminator());
- assert(BI && BI->isConditional() &&
- "Expecting loop latch terminator to be a branch instruction");
-
- const CmpInst *OuterLoopLatchCmp = dyn_cast<CmpInst>(BI->getCondition());
- DEBUG_WITH_TYPE(
- VerboseDebug, if (OuterLoopLatchCmp) {
- dbgs() << "Outer loop latch compare instruction: " << *OuterLoopLatchCmp
- << "\n";
- });
-
- // Identify the inner loop guard instruction.
- BranchInst *InnerGuard = InnerLoop.getLoopGuardBranch();
- const CmpInst *InnerLoopGuardCmp =
- (InnerGuard) ? dyn_cast<CmpInst>(InnerGuard->getCondition()) : nullptr;
-
- DEBUG_WITH_TYPE(
- VerboseDebug, if (InnerLoopGuardCmp) {
- dbgs() << "Inner loop guard compare instruction: " << *InnerLoopGuardCmp
- << "\n";
- });
+ CmpInst *OuterLoopLatchCmp = getOuterLoopLatchCmp(OuterLoop);
+ CmpInst *InnerLoopGuardCmp = getInnerLoopGuardCmp(InnerLoop);
// Determine whether instructions in a basic block are one of:
// - the inner loop guard comparison
@@ -109,29 +145,15 @@ bool LoopNest::arePerfectlyNested(const Loop &OuterLoop, const Loop &InnerLoop,
// - a phi node, a cast or a branch
auto containsOnlySafeInstructions = [&](const BasicBlock &BB) {
return llvm::all_of(BB, [&](const Instruction &I) {
- bool isAllowed = isSafeToSpeculativelyExecute(&I) || isa<PHINode>(I) ||
- isa<BranchInst>(I);
- if (!isAllowed) {
- DEBUG_WITH_TYPE(VerboseDebug, {
- dbgs() << "Instruction: " << I << "\nin basic block: " << BB
- << " is considered unsafe.\n";
- });
- return false;
- }
-
- // The only binary instruction allowed is the outer loop step instruction,
- // the only comparison instructions allowed are the inner loop guard
- // compare instruction and the outer loop latch compare instruction.
- if ((isa<BinaryOperator>(I) && &I != &OuterLoopLB->getStepInst()) ||
- (isa<CmpInst>(I) && &I != OuterLoopLatchCmp &&
- &I != InnerLoopGuardCmp)) {
+ bool IsSafeInstr = checkSafeInstruction(I, InnerLoopGuardCmp,
+ OuterLoopLatchCmp, OuterLoopLB);
+ if (IsSafeInstr) {
DEBUG_WITH_TYPE(VerboseDebug, {
dbgs() << "Instruction: " << I << "\nin basic block:" << BB
<< "is unsafe.\n";
});
- return false;
}
- return true;
+ return IsSafeInstr;
});
};
@@ -148,13 +170,72 @@ bool LoopNest::arePerfectlyNested(const Loop &OuterLoop, const Loop &InnerLoop,
!containsOnlySafeInstructions(*InnerLoop.getExitBlock())) {
LLVM_DEBUG(dbgs() << "Not perfectly nested: code surrounding inner loop is "
"unsafe\n";);
- return false;
+ return ImperfectLoopNest;
}
LLVM_DEBUG(dbgs() << "Loop '" << OuterLoop.getName() << "' and '"
<< InnerLoop.getName() << "' are perfectly nested.\n");
- return true;
+ return PerfectLoopNest;
+}
+
+LoopNest::InstrVectorTy LoopNest::getInterveningInstructions(
+ const Loop &OuterLoop, const Loop &InnerLoop, ScalarEvolution &SE) {
+ InstrVectorTy Instr;
+ switch (analyzeLoopNestForPerfectNest(OuterLoop, InnerLoop, SE)) {
+ case PerfectLoopNest:
+ LLVM_DEBUG(dbgs() << "The loop Nest is Perfect, returning empty "
+ "instruction vector. \n";);
+ return Instr;
+
+ case InvalidLoopStructure:
+ LLVM_DEBUG(dbgs() << "Not perfectly nested: invalid loop structure. "
+ "Instruction vector is empty.\n";);
+ return Instr;
+
+ case OuterLoopLowerBoundUnknown:
+ LLVM_DEBUG(dbgs() << "Cannot compute loop bounds of OuterLoop: "
+ << OuterLoop << "\nInstruction vector is empty.\n";);
+ return Instr;
+
+ case ImperfectLoopNest:
+ break;
+ }
+
+ // Identify the outer loop latch comparison instruction.
+ auto OuterLoopLB = OuterLoop.getBounds(SE);
+
+ CmpInst *OuterLoopLatchCmp = getOuterLoopLatchCmp(OuterLoop);
+ CmpInst *InnerLoopGuardCmp = getInnerLoopGuardCmp(InnerLoop);
+
+ auto GetUnsafeInstructions = [&](const BasicBlock &BB) {
+ for (const Instruction &I : BB) {
+ if (!checkSafeInstruction(I, InnerLoopGuardCmp, OuterLoopLatchCmp,
+ OuterLoopLB)) {
+ Instr.push_back(&I);
+ DEBUG_WITH_TYPE(VerboseDebug, {
+ dbgs() << "Instruction: " << I << "\nin basic block:" << BB
+ << "is unsafe.\n";
+ });
+ }
+ }
+ };
+
+ // Check the code surrounding the inner loop for instructions that are deemed
+ // unsafe.
+ const BasicBlock *OuterLoopHeader = OuterLoop.getHeader();
+ const BasicBlock *OuterLoopLatch = OuterLoop.getLoopLatch();
+ const BasicBlock *InnerLoopPreHeader = InnerLoop.getLoopPreheader();
+ const BasicBlock *InnerLoopExitBlock = InnerLoop.getExitBlock();
+
+ GetUnsafeInstructions(*OuterLoopHeader);
+ GetUnsafeInstructions(*OuterLoopLatch);
+ GetUnsafeInstructions(*InnerLoopExitBlock);
+
+ if (InnerLoopPreHeader != OuterLoopHeader) {
+ GetUnsafeInstructions(*InnerLoopPreHeader);
+ }
+ return Instr;
}
SmallVector<LoopVectorTy, 4>
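
The new LoopNest::getInterveningInstructions entry point reuses the same checkSafeInstruction predicate as arePerfectlyNested, but instead of a yes/no answer it collects the offending instructions. A minimal sketch of how a client might consume it, assuming InstrVectorTy and the static entry point are exposed by the matching header change (the helper below is illustrative, not part of the patch):

    #include "llvm/Analysis/LoopNestAnalysis.h"
    #include "llvm/Support/Debug.h"

    using namespace llvm;

    // Dump whatever prevents {OuterLoop, InnerLoop} from being a perfect nest.
    // An empty result means the nest is perfect or could not be analyzed
    // (invalid structure / unknown outer-loop bounds).
    static void reportNestBlockers(const Loop &OuterLoop, const Loop &InnerLoop,
                                   ScalarEvolution &SE) {
      LoopNest::InstrVectorTy Blockers =
          LoopNest::getInterveningInstructions(OuterLoop, InnerLoop, SE);
      for (const Instruction *I : Blockers)
        dbgs() << "intervening instruction: " << *I << "\n";
    }
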
diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp
index 5b95ed223fd9..6fc4c42bdd71 100644
--- a/llvm/lib/Analysis/MLInlineAdvisor.cpp
+++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp
@@ -116,6 +116,8 @@ MLInlineAdvisor::MLInlineAdvisor(Module &M, ModuleAnalysisManager &MAM,
void MLInlineAdvisor::onPassEntry() {
// Function passes executed between InlinerPass runs may have changed the
// module-wide features.
+ if (!Invalid)
+ return;
NodeCount = 0;
EdgeCount = 0;
for (auto &F : M)
@@ -123,6 +125,7 @@ void MLInlineAdvisor::onPassEntry() {
++NodeCount;
EdgeCount += getLocalCalls(F);
}
+ Invalid = false;
}
int64_t MLInlineAdvisor::getLocalCalls(Function &F) {
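
onPassEntry now guards the module-wide recount behind the Invalid flag, so repeated inliner entries reuse the cached node and edge counts until something marks them stale. Reduced to a self-contained sketch (everything except the Invalid/NodeCount/EdgeCount names is a placeholder):

    struct CachedModuleCounts {
      bool Invalid = true; // Start stale so the first refresh always recomputes.
      unsigned NodeCount = 0;
      unsigned EdgeCount = 0;

      void refresh() {
        if (!Invalid)
          return; // Cached values are still good; skip the expensive walk.
        NodeCount = 0;
        EdgeCount = 0;
        // ... walk the module and recount nodes/edges here ...
        Invalid = false;
      }

      void markStale() { Invalid = true; } // Call when the module changes.
    };
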
diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp
index 68e997656d84..4f2b5b34304d 100644
--- a/llvm/lib/Analysis/MemoryBuiltins.cpp
+++ b/llvm/lib/Analysis/MemoryBuiltins.cpp
@@ -111,7 +111,7 @@ static const std::pair<LibFunc, AllocFnsTy> AllocationFnData[] = {
{LibFunc_reallocf, {ReallocLike, 2, 1, -1}},
{LibFunc_strdup, {StrDupLike, 1, -1, -1}},
{LibFunc_strndup, {StrDupLike, 2, 1, -1}},
- {LibFunc___kmpc_alloc_shared, {MallocLike, 1, 0, -1}}
+ {LibFunc___kmpc_alloc_shared, {MallocLike, 1, 0, -1}},
// TODO: Handle "int posix_memalign(void **, size_t, size_t)"
};
@@ -135,9 +135,8 @@ static const Function *getCalledFunction(const Value *V, bool LookThroughBitCast
return nullptr;
}
-/// Returns the allocation data for the given value if it's either a call to a
-/// known allocation function, or a call to a function with the allocsize
-/// attribute.
+/// Returns the allocation data for the given value if it's a call to a known
+/// allocation function.
static Optional<AllocFnsTy>
getAllocationDataForFunction(const Function *Callee, AllocType AllocTy,
const TargetLibraryInfo *TLI) {
@@ -610,7 +609,7 @@ ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const DataLayout &DL,
SizeOffsetType ObjectSizeOffsetVisitor::compute(Value *V) {
IntTyBits = DL.getIndexTypeSizeInBits(V->getType());
- Zero = APInt::getNullValue(IntTyBits);
+ Zero = APInt::getZero(IntTyBits);
V = V->stripPointerCasts();
if (Instruction *I = dyn_cast<Instruction>(V)) {
diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp
index ef9cda37ce35..7f2d04c49565 100644
--- a/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/llvm/lib/Analysis/MemoryLocation.cpp
@@ -35,54 +35,44 @@ void LocationSize::print(raw_ostream &OS) const {
}
MemoryLocation MemoryLocation::get(const LoadInst *LI) {
- AAMDNodes AATags;
- LI->getAAMetadata(AATags);
const auto &DL = LI->getModule()->getDataLayout();
return MemoryLocation(
LI->getPointerOperand(),
- LocationSize::precise(DL.getTypeStoreSize(LI->getType())), AATags);
+ LocationSize::precise(DL.getTypeStoreSize(LI->getType())),
+ LI->getAAMetadata());
}
MemoryLocation MemoryLocation::get(const StoreInst *SI) {
- AAMDNodes AATags;
- SI->getAAMetadata(AATags);
const auto &DL = SI->getModule()->getDataLayout();
return MemoryLocation(SI->getPointerOperand(),
LocationSize::precise(DL.getTypeStoreSize(
SI->getValueOperand()->getType())),
- AATags);
+ SI->getAAMetadata());
}
MemoryLocation MemoryLocation::get(const VAArgInst *VI) {
- AAMDNodes AATags;
- VI->getAAMetadata(AATags);
-
return MemoryLocation(VI->getPointerOperand(),
- LocationSize::afterPointer(), AATags);
+ LocationSize::afterPointer(), VI->getAAMetadata());
}
MemoryLocation MemoryLocation::get(const AtomicCmpXchgInst *CXI) {
- AAMDNodes AATags;
- CXI->getAAMetadata(AATags);
const auto &DL = CXI->getModule()->getDataLayout();
return MemoryLocation(CXI->getPointerOperand(),
LocationSize::precise(DL.getTypeStoreSize(
CXI->getCompareOperand()->getType())),
- AATags);
+ CXI->getAAMetadata());
}
MemoryLocation MemoryLocation::get(const AtomicRMWInst *RMWI) {
- AAMDNodes AATags;
- RMWI->getAAMetadata(AATags);
const auto &DL = RMWI->getModule()->getDataLayout();
return MemoryLocation(RMWI->getPointerOperand(),
LocationSize::precise(DL.getTypeStoreSize(
RMWI->getValOperand()->getType())),
- AATags);
+ RMWI->getAAMetadata());
}
Optional<MemoryLocation> MemoryLocation::getOrNone(const Instruction *Inst) {
@@ -117,10 +107,7 @@ MemoryLocation MemoryLocation::getForSource(const AnyMemTransferInst *MTI) {
// memcpy/memmove can have AA tags. For memcpy, they apply
// to both the source and the destination.
- AAMDNodes AATags;
- MTI->getAAMetadata(AATags);
-
- return MemoryLocation(MTI->getRawSource(), Size, AATags);
+ return MemoryLocation(MTI->getRawSource(), Size, MTI->getAAMetadata());
}
MemoryLocation MemoryLocation::getForDest(const MemIntrinsic *MI) {
@@ -138,17 +125,13 @@ MemoryLocation MemoryLocation::getForDest(const AnyMemIntrinsic *MI) {
// memcpy/memmove can have AA tags. For memcpy, they apply
// to both the source and the destination.
- AAMDNodes AATags;
- MI->getAAMetadata(AATags);
-
- return MemoryLocation(MI->getRawDest(), Size, AATags);
+ return MemoryLocation(MI->getRawDest(), Size, MI->getAAMetadata());
}
MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
unsigned ArgIdx,
const TargetLibraryInfo *TLI) {
- AAMDNodes AATags;
- Call->getAAMetadata(AATags);
+ AAMDNodes AATags = Call->getAAMetadata();
const Value *Arg = Call->getArgOperand(ArgIdx);
// We may be able to produce an exact size for known intrinsics.
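
Every hunk in this file rides on one interface change: Instruction::getAAMetadata() now returns the AAMDNodes by value instead of filling an out-parameter, so the local temporary disappears. A condensed sketch of the before/after call shape (the free function is hypothetical):

    #include "llvm/Analysis/MemoryLocation.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    MemoryLocation locationForLoad(const LoadInst *LI) {
      const DataLayout &DL = LI->getModule()->getDataLayout();
      // Before: AAMDNodes AATags; LI->getAAMetadata(AATags); then pass AATags.
      // After: the tags are fetched inline as a value.
      return MemoryLocation(
          LI->getPointerOperand(),
          LocationSize::precise(DL.getTypeStoreSize(LI->getType())),
          LI->getAAMetadata());
    }
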
diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp
index b402b0467f5d..ac20e20f0c0d 100644
--- a/llvm/lib/Analysis/MemorySSA.cpp
+++ b/llvm/lib/Analysis/MemorySSA.cpp
@@ -90,22 +90,18 @@ bool llvm::VerifyMemorySSA = true;
#else
bool llvm::VerifyMemorySSA = false;
#endif
-/// Enables memory ssa as a dependency for loop passes in legacy pass manager.
-cl::opt<bool> llvm::EnableMSSALoopDependency(
- "enable-mssa-loop-dependency", cl::Hidden, cl::init(true),
- cl::desc("Enable MemorySSA dependency for loop pass manager"));
static cl::opt<bool, true>
VerifyMemorySSAX("verify-memoryssa", cl::location(VerifyMemorySSA),
cl::Hidden, cl::desc("Enable verification of MemorySSA."));
-namespace llvm {
+const static char LiveOnEntryStr[] = "liveOnEntry";
+
+namespace {
/// An assembly annotator class to print Memory SSA information in
/// comments.
class MemorySSAAnnotatedWriter : public AssemblyAnnotationWriter {
- friend class MemorySSA;
-
const MemorySSA *MSSA;
public:
@@ -124,7 +120,34 @@ public:
}
};
-} // end namespace llvm
+/// An assembly annotator class to print Memory SSA information in
+/// comments.
+class MemorySSAWalkerAnnotatedWriter : public AssemblyAnnotationWriter {
+ MemorySSA *MSSA;
+ MemorySSAWalker *Walker;
+
+public:
+ MemorySSAWalkerAnnotatedWriter(MemorySSA *M)
+ : MSSA(M), Walker(M->getWalker()) {}
+
+ void emitInstructionAnnot(const Instruction *I,
+ formatted_raw_ostream &OS) override {
+ if (MemoryAccess *MA = MSSA->getMemoryAccess(I)) {
+ MemoryAccess *Clobber = Walker->getClobberingMemoryAccess(MA);
+ OS << "; " << *MA;
+ if (Clobber) {
+ OS << " - clobbered by ";
+ if (MSSA->isLiveOnEntryDef(Clobber))
+ OS << LiveOnEntryStr;
+ else
+ OS << *Clobber;
+ }
+ OS << "\n";
+ }
+ }
+};
+
+} // namespace
namespace {
@@ -286,6 +309,7 @@ instructionClobbersQuery(const MemoryDef *MD, const MemoryLocation &UseLoc,
case Intrinsic::invariant_end:
case Intrinsic::assume:
case Intrinsic::experimental_noalias_scope_decl:
+ case Intrinsic::pseudoprobe:
return {false, AliasResult(AliasResult::NoAlias)};
case Intrinsic::dbg_addr:
case Intrinsic::dbg_declare:
@@ -1016,7 +1040,8 @@ public:
// updated if a new clobber is found by this SkipSelf search. If this
// additional query becomes heavily used we may decide to cache the result.
// Walker instantiations will decide how to set the SkipSelf bool.
- MemoryAccess *getClobberingMemoryAccessBase(MemoryAccess *, unsigned &, bool);
+ MemoryAccess *getClobberingMemoryAccessBase(MemoryAccess *, unsigned &, bool,
+ bool UseInvariantGroup = true);
};
/// A MemorySSAWalker that does AA walks to disambiguate accesses. It no
@@ -1041,6 +1066,11 @@ public:
unsigned &UWL) {
return Walker->getClobberingMemoryAccessBase(MA, Loc, UWL);
}
+ // This method is not accessible outside of this file.
+ MemoryAccess *getClobberingMemoryAccessWithoutInvariantGroup(MemoryAccess *MA,
+ unsigned &UWL) {
+ return Walker->getClobberingMemoryAccessBase(MA, UWL, false, false);
+ }
MemoryAccess *getClobberingMemoryAccess(MemoryAccess *MA) override {
unsigned UpwardWalkLimit = MaxCheckLimit;
@@ -1437,10 +1467,13 @@ void MemorySSA::OptimizeUses::optimizeUsesInBlock(
unsigned UpwardWalkLimit = MaxCheckLimit;
while (UpperBound > LocInfo.LowerBound) {
if (isa<MemoryPhi>(VersionStack[UpperBound])) {
- // For phis, use the walker, see where we ended up, go there
+ // For phis, use the walker, see where we ended up, go there.
+ // The invariant.group handling in MemorySSA is ad-hoc and doesn't
+ // support updates, so don't use it to optimize uses.
MemoryAccess *Result =
- Walker->getClobberingMemoryAccess(MU, UpwardWalkLimit);
- // We are guaranteed to find it or something is wrong
+ Walker->getClobberingMemoryAccessWithoutInvariantGroup(
+ MU, UpwardWalkLimit);
+ // We are guaranteed to find it or something is wrong.
while (VersionStack[UpperBound] != Result) {
assert(UpperBound != 0);
--UpperBound;
@@ -1750,6 +1783,7 @@ MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I,
break;
case Intrinsic::assume:
case Intrinsic::experimental_noalias_scope_decl:
+ case Intrinsic::pseudoprobe:
return nullptr;
}
}
@@ -1864,10 +1898,17 @@ void MemorySSA::print(raw_ostream &OS) const {
LLVM_DUMP_METHOD void MemorySSA::dump() const { print(dbgs()); }
#endif
-void MemorySSA::verifyMemorySSA() const {
- verifyOrderingDominationAndDefUses(F);
+void MemorySSA::verifyMemorySSA(VerificationLevel VL) const {
+#if !defined(NDEBUG) && defined(EXPENSIVE_CHECKS)
+ VL = VerificationLevel::Full;
+#endif
+
+#ifndef NDEBUG
+ verifyOrderingDominationAndDefUses(F, VL);
verifyDominationNumbers(F);
- verifyPrevDefInPhis(F);
+ if (VL == VerificationLevel::Full)
+ verifyPrevDefInPhis(F);
+#endif
// Previously, the verification used to also verify that the clobberingAccess
// cached by MemorySSA is the same as the clobberingAccess found at a later
// query to AA. This does not hold true in general due to the current fragility
@@ -1881,7 +1922,6 @@ void MemorySSA::verifyMemorySSA() const {
}
void MemorySSA::verifyPrevDefInPhis(Function &F) const {
-#if !defined(NDEBUG) && defined(EXPENSIVE_CHECKS)
for (const BasicBlock &BB : F) {
if (MemoryPhi *Phi = getMemoryAccess(&BB)) {
for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
@@ -1896,6 +1936,8 @@ void MemorySSA::verifyPrevDefInPhis(Function &F) const {
auto *LastAcc = &*(--DefList->end());
assert(LastAcc == IncAcc &&
"Incorrect incoming access into phi.");
+ (void)IncAcc;
+ (void)LastAcc;
break;
}
DTNode = DTNode->getIDom();
@@ -1911,13 +1953,11 @@ void MemorySSA::verifyPrevDefInPhis(Function &F) const {
}
}
}
-#endif
}
/// Verify that all of the blocks we believe to have valid domination numbers
/// actually have valid domination numbers.
void MemorySSA::verifyDominationNumbers(const Function &F) const {
-#ifndef NDEBUG
if (BlockNumberingValid.empty())
return;
@@ -1943,13 +1983,13 @@ void MemorySSA::verifyDominationNumbers(const Function &F) const {
unsigned long ThisNumber = ThisNumberIter->second;
assert(ThisNumber > LastNumber &&
"Domination numbers should be strictly increasing!");
+ (void)LastNumber;
LastNumber = ThisNumber;
}
}
assert(ValidBlocks.empty() &&
"All valid BasicBlocks should exist in F -- dangling pointers?");
-#endif
}
/// Verify ordering: the order and existence of MemoryAccesses matches the
@@ -1958,8 +1998,8 @@ void MemorySSA::verifyDominationNumbers(const Function &F) const {
/// Verify def-uses: the immediate use information - walk all the memory
/// accesses and verifying that, for each use, it appears in the appropriate
/// def's use list
-void MemorySSA::verifyOrderingDominationAndDefUses(Function &F) const {
-#if !defined(NDEBUG)
+void MemorySSA::verifyOrderingDominationAndDefUses(Function &F,
+ VerificationLevel VL) const {
// Walk all the blocks, comparing what the lookups think and what the access
// lists think, as well as the order in the blocks vs the order in the access
// lists.
@@ -1974,19 +2014,21 @@ void MemorySSA::verifyOrderingDominationAndDefUses(Function &F) const {
ActualAccesses.push_back(Phi);
ActualDefs.push_back(Phi);
// Verify domination
- for (const Use &U : Phi->uses())
+ for (const Use &U : Phi->uses()) {
assert(dominates(Phi, U) && "Memory PHI does not dominate its uses");
-#if defined(EXPENSIVE_CHECKS)
- // Verify def-uses.
- assert(Phi->getNumOperands() == static_cast<unsigned>(std::distance(
- pred_begin(&B), pred_end(&B))) &&
- "Incomplete MemoryPhi Node");
- for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
- verifyUseInDefs(Phi->getIncomingValue(I), Phi);
- assert(is_contained(predecessors(&B), Phi->getIncomingBlock(I)) &&
- "Incoming phi block not a block predecessor");
+ (void)U;
+ }
+ // Verify def-uses for full verify.
+ if (VL == VerificationLevel::Full) {
+ assert(Phi->getNumOperands() == static_cast<unsigned>(std::distance(
+ pred_begin(&B), pred_end(&B))) &&
+ "Incomplete MemoryPhi Node");
+ for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
+ verifyUseInDefs(Phi->getIncomingValue(I), Phi);
+ assert(is_contained(predecessors(&B), Phi->getIncomingBlock(I)) &&
+ "Incoming phi block not a block predecessor");
+ }
}
-#endif
}
for (Instruction &I : B) {
@@ -2002,14 +2044,15 @@ void MemorySSA::verifyOrderingDominationAndDefUses(Function &F) const {
// Verify ordering.
ActualDefs.push_back(MA);
// Verify domination.
- for (const Use &U : MD->uses())
+ for (const Use &U : MD->uses()) {
assert(dominates(MD, U) &&
"Memory Def does not dominate its uses");
+ (void)U;
+ }
}
-#if defined(EXPENSIVE_CHECKS)
- // Verify def-uses.
- verifyUseInDefs(MA->getDefiningAccess(), MA);
-#endif
+ // Verify def-uses for full verify.
+ if (VL == VerificationLevel::Full)
+ verifyUseInDefs(MA->getDefiningAccess(), MA);
}
}
// Either we hit the assert, really have no accesses, or we have both
@@ -2044,13 +2087,11 @@ void MemorySSA::verifyOrderingDominationAndDefUses(Function &F) const {
}
ActualDefs.clear();
}
-#endif
}
/// Verify the def-use lists in MemorySSA, by verifying that \p Use
/// appears in the use list of \p Def.
void MemorySSA::verifyUseInDefs(MemoryAccess *Def, MemoryAccess *Use) const {
-#ifndef NDEBUG
// The live on entry use may cause us to get a NULL def here
if (!Def)
assert(isLiveOnEntryDef(Use) &&
@@ -2058,7 +2099,6 @@ void MemorySSA::verifyUseInDefs(MemoryAccess *Def, MemoryAccess *Use) const {
else
assert(is_contained(Def->users(), Use) &&
"Did not find use in def's use list");
-#endif
}
/// Perform a local numbering on blocks so that instruction ordering can be
@@ -2138,8 +2178,6 @@ bool MemorySSA::dominates(const MemoryAccess *Dominator,
return dominates(Dominator, cast<MemoryAccess>(Dominatee.getUser()));
}
-const static char LiveOnEntryStr[] = "liveOnEntry";
-
void MemoryAccess::print(raw_ostream &OS) const {
switch (getValueID()) {
case MemoryPhiVal: return static_cast<const MemoryPhi *>(this)->print(OS);
@@ -2355,6 +2393,16 @@ PreservedAnalyses MemorySSAPrinterPass::run(Function &F,
return PreservedAnalyses::all();
}
+PreservedAnalyses MemorySSAWalkerPrinterPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ OS << "MemorySSA (walker) for function: " << F.getName() << "\n";
+ MemorySSAWalkerAnnotatedWriter Writer(&MSSA);
+ F.print(OS, &Writer);
+
+ return PreservedAnalyses::all();
+}
+
PreservedAnalyses MemorySSAVerifierPass::run(Function &F,
FunctionAnalysisManager &AM) {
AM.getResult<MemorySSAAnalysis>(F).getMSSA().verifyMemorySSA();
@@ -2438,15 +2486,88 @@ MemorySSA::ClobberWalkerBase<AliasAnalysisType>::getClobberingMemoryAccessBase(
return Clobber;
}
+static const Instruction *
+getInvariantGroupClobberingInstruction(Instruction &I, DominatorTree &DT) {
+ if (!I.hasMetadata(LLVMContext::MD_invariant_group) || I.isVolatile())
+ return nullptr;
+
+ // We consider bitcasts and zero GEPs to be the same pointer value. Start by
+ // stripping bitcasts and zero GEPs, then we will recursively look at loads
+ // and stores through bitcasts and zero GEPs.
+ Value *PointerOperand = getLoadStorePointerOperand(&I)->stripPointerCasts();
+
+ // It's not safe to walk the use list of a global value because function
+ // passes aren't allowed to look outside their functions.
+ // FIXME: this could be fixed by filtering instructions from outside of
+ // the current function.
+ if (isa<Constant>(PointerOperand))
+ return nullptr;
+
+ // Queue to process all pointers that are equivalent to load operand.
+ SmallVector<const Value *, 8> PointerUsesQueue;
+ PointerUsesQueue.push_back(PointerOperand);
+
+ const Instruction *MostDominatingInstruction = &I;
+
+ // FIXME: This loop is O(n^2) because dominates can be O(n) and in worst case
+ // we will see all the instructions. It may not matter in practice. If it
+ // does, we will have to support MemorySSA construction and updates.
+ while (!PointerUsesQueue.empty()) {
+ const Value *Ptr = PointerUsesQueue.pop_back_val();
+ assert(Ptr && !isa<GlobalValue>(Ptr) &&
+ "Null or GlobalValue should not be inserted");
+
+ for (const User *Us : Ptr->users()) {
+ auto *U = dyn_cast<Instruction>(Us);
+ if (!U || U == &I || !DT.dominates(U, MostDominatingInstruction))
+ continue;
+
+ // Add bitcasts and zero GEPs to queue.
+ if (isa<BitCastInst>(U)) {
+ PointerUsesQueue.push_back(U);
+ continue;
+ }
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
+ if (GEP->hasAllZeroIndices())
+ PointerUsesQueue.push_back(U);
+ continue;
+ }
+
+ // If we hit a load/store with an invariant.group metadata and the same
+ // pointer operand, we can assume that value pointed to by the pointer
+ // operand didn't change.
+ if (U->hasMetadata(LLVMContext::MD_invariant_group) &&
+ getLoadStorePointerOperand(U) == Ptr && !U->isVolatile()) {
+ MostDominatingInstruction = U;
+ }
+ }
+ }
+ return MostDominatingInstruction == &I ? nullptr : MostDominatingInstruction;
+}
+
template <typename AliasAnalysisType>
MemoryAccess *
MemorySSA::ClobberWalkerBase<AliasAnalysisType>::getClobberingMemoryAccessBase(
- MemoryAccess *MA, unsigned &UpwardWalkLimit, bool SkipSelf) {
+ MemoryAccess *MA, unsigned &UpwardWalkLimit, bool SkipSelf,
+ bool UseInvariantGroup) {
auto *StartingAccess = dyn_cast<MemoryUseOrDef>(MA);
// If this is a MemoryPhi, we can't do anything.
if (!StartingAccess)
return MA;
+ if (UseInvariantGroup) {
+ if (auto *I = getInvariantGroupClobberingInstruction(
+ *StartingAccess->getMemoryInst(), MSSA->getDomTree())) {
+ assert(isa<LoadInst>(I) || isa<StoreInst>(I));
+
+ auto *ClobberMA = MSSA->getMemoryAccess(I);
+ assert(ClobberMA);
+ if (isa<MemoryUse>(ClobberMA))
+ return ClobberMA->getDefiningAccess();
+ return ClobberMA;
+ }
+ }
+
bool IsOptimized = false;
// If this is an already optimized use or def, return the optimized result.
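
The verification changes replace the compile-time #ifdef gating with a run-time VerificationLevel: the ordering/domination and domination-number checks run at the default level, while verifyPrevDefInPhis and the MemoryPhi operand checks run only at VerificationLevel::Full (EXPENSIVE_CHECKS builds still force Full, and NDEBUG builds compile the body out). A sketch of how a caller might choose a level; the existence of a defaulted, cheaper level on the parameter is assumed from the matching header change:

    #include "llvm/Analysis/MemorySSA.h"

    using namespace llvm;

    // Hypothetical helper: run the cheap checks routinely, the full def-use
    // checks only right after an update that could have broken use lists.
    void verifyAfterUpdate(MemorySSA &MSSA, bool DidStructuralUpdate) {
      if (DidStructuralUpdate)
        MSSA.verifyMemorySSA(VerificationLevel::Full);
      else
        MSSA.verifyMemorySSA(); // Assumed defaulted, lighter verification level.
    }
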
diff --git a/llvm/lib/Analysis/MemorySSAUpdater.cpp b/llvm/lib/Analysis/MemorySSAUpdater.cpp
index 616864f360bf..9c841883de6d 100644
--- a/llvm/lib/Analysis/MemorySSAUpdater.cpp
+++ b/llvm/lib/Analysis/MemorySSAUpdater.cpp
@@ -296,9 +296,8 @@ static void setMemoryPhiValueForBlock(MemoryPhi *MP, const BasicBlock *BB,
assert(i != -1 && "Should have found the basic block in the phi");
// We can't just compare i against getNumOperands since one is signed and the
// other not. So use it to index into the block iterator.
- for (auto BBIter = MP->block_begin() + i; BBIter != MP->block_end();
- ++BBIter) {
- if (*BBIter != BB)
+ for (const BasicBlock *BlockBB : llvm::drop_begin(MP->blocks(), i)) {
+ if (BlockBB != BB)
break;
MP->setIncomingValue(i, NewDef);
++i;
@@ -491,8 +490,7 @@ void MemorySSAUpdater::fixupDefs(const SmallVectorImpl<WeakVH> &Vars) {
}
while (!Worklist.empty()) {
- const BasicBlock *FixupBlock = Worklist.back();
- Worklist.pop_back();
+ const BasicBlock *FixupBlock = Worklist.pop_back_val();
// Get the first def in the block that isn't a phi node.
if (auto *Defs = MSSA->getWritableBlockDefs(FixupBlock)) {
@@ -822,25 +820,30 @@ void MemorySSAUpdater::applyUpdates(ArrayRef<CFGUpdate> Updates,
}
if (!DeleteUpdates.empty()) {
- if (!UpdateDT) {
- SmallVector<CFGUpdate, 0> Empty;
- // Deletes are reversed applied, because this CFGView is pretending the
- // deletes did not happen yet, hence the edges still exist.
- DT.applyUpdates(Empty, RevDeleteUpdates);
+ if (!InsertUpdates.empty()) {
+ if (!UpdateDT) {
+ SmallVector<CFGUpdate, 0> Empty;
+ // Deletes are applied in reverse, because this CFGView is pretending the
+ // deletes did not happen yet, hence the edges still exist.
+ DT.applyUpdates(Empty, RevDeleteUpdates);
+ } else {
+ // Apply all updates, with the RevDeleteUpdates as PostCFGView.
+ DT.applyUpdates(Updates, RevDeleteUpdates);
+ }
+
+ // Note: the MSSA update below doesn't distinguish between a GD with
+ // (RevDelete,false) and (Delete, true), but this matters for the DT
+ // updates above; for "children" purposes they are equivalent; but the
+ // updates themselves convey the desired update, used inside DT only.
+ GraphDiff<BasicBlock *> GD(RevDeleteUpdates);
+ applyInsertUpdates(InsertUpdates, DT, &GD);
+ // Update DT to redelete edges; this matches the real CFG so we can
+ // perform the standard update without a postview of the CFG.
+ DT.applyUpdates(DeleteUpdates);
} else {
- // Apply all updates, with the RevDeleteUpdates as PostCFGView.
- DT.applyUpdates(Updates, RevDeleteUpdates);
+ if (UpdateDT)
+ DT.applyUpdates(DeleteUpdates);
}
-
- // Note: the MSSA update below doesn't distinguish between a GD with
- // (RevDelete,false) and (Delete, true), but this matters for the DT
- // updates above; for "children" purposes they are equivalent; but the
- // updates themselves convey the desired update, used inside DT only.
- GraphDiff<BasicBlock *> GD(RevDeleteUpdates);
- applyInsertUpdates(InsertUpdates, DT, &GD);
- // Update DT to redelete edges; this matches the real CFG so we can perform
- // the standard update without a postview of the CFG.
- DT.applyUpdates(DeleteUpdates);
} else {
if (UpdateDT)
DT.applyUpdates(Updates);
@@ -1131,11 +1134,7 @@ void MemorySSAUpdater::applyInsertUpdates(ArrayRef<CFGUpdate> Updates,
if (auto DefsList = MSSA->getWritableBlockDefs(BlockWithDefsToReplace)) {
for (auto &DefToReplaceUses : *DefsList) {
BasicBlock *DominatingBlock = DefToReplaceUses.getBlock();
- Value::use_iterator UI = DefToReplaceUses.use_begin(),
- E = DefToReplaceUses.use_end();
- for (; UI != E;) {
- Use &U = *UI;
- ++UI;
+ for (Use &U : llvm::make_early_inc_range(DefToReplaceUses.uses())) {
MemoryAccess *Usr = cast<MemoryAccess>(U.getUser());
if (MemoryPhi *UsrPhi = dyn_cast<MemoryPhi>(Usr)) {
BasicBlock *DominatedBlock = UsrPhi->getIncomingBlock(U);
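
Two of the cleanups above are pure range-helper conversions: llvm::drop_begin replaces the manual "block_begin() + i" loop, and llvm::make_early_inc_range replaces the hand-rolled "save the use, bump the iterator, then mutate" idiom. A self-contained illustration of both helpers on standard containers (example functions only):

    #include "llvm/ADT/STLExtras.h"
    #include <map>
    #include <string>
    #include <vector>

    // make_early_inc_range pre-increments its internal iterator before yielding
    // each element, so erasing the element just yielded cannot invalidate the
    // traversal (map erase only invalidates iterators to the erased node).
    void pruneZeroEntries(std::map<std::string, int> &M) {
      for (auto &Entry : llvm::make_early_inc_range(M))
        if (Entry.second == 0) {
          std::string Key = Entry.first; // Copy so the lookup key outlives the node.
          M.erase(Key);
        }
    }

    // drop_begin(Range, N) iterates Range starting at its N-th element
    // (assumes the range has at least N elements).
    int sumTail(const std::vector<int> &V) {
      int Sum = 0;
      for (int X : llvm::drop_begin(V, 2))
        Sum += X;
      return Sum;
    }
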
diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index e43553222128..d80814852e19 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -264,11 +264,27 @@ static void computeFunctionSummary(
std::vector<const Instruction *> NonVolatileStores;
bool HasInlineAsmMaybeReferencingInternal = false;
- for (const BasicBlock &BB : F)
+ bool HasIndirBranchToBlockAddress = false;
+ bool HasUnknownCall = false;
+ bool MayThrow = false;
+ for (const BasicBlock &BB : F) {
+ // We don't allow inlining of a function with an indirect branch to a
+ // blockaddress. If the blockaddress escapes the function, e.g., via a global
+ // variable, inlining may lead to an invalid cross-function reference, so we
+ // shouldn't import such a function either.
+ if (BB.hasAddressTaken()) {
+ for (User *U : BlockAddress::get(const_cast<BasicBlock *>(&BB))->users())
+ if (!isa<CallBrInst>(*U)) {
+ HasIndirBranchToBlockAddress = true;
+ break;
+ }
+ }
+
for (const Instruction &I : BB) {
- if (isa<DbgInfoIntrinsic>(I))
+ if (I.isDebugOrPseudoInst())
continue;
++NumInsts;
+
// Regular LTO module doesn't participate in ThinLTO import,
// so no reference from it can be read/writeonly, since this
// would require importing variable as local copy
@@ -300,8 +316,11 @@ static void computeFunctionSummary(
}
findRefEdges(Index, &I, RefEdges, Visited);
const auto *CB = dyn_cast<CallBase>(&I);
- if (!CB)
+ if (!CB) {
+ if (I.mayThrow())
+ MayThrow = true;
continue;
+ }
const auto *CI = dyn_cast<CallInst>(&I);
// Since we don't know exactly which local values are referenced in inline
@@ -323,7 +342,7 @@ static void computeFunctionSummary(
// called aliasee for the checks below.
if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
assert(!CalledFunction && "Expected null called function in callsite for alias");
- CalledFunction = dyn_cast<Function>(GA->getBaseObject());
+ CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
}
// Check if this is a direct call to a known function or a known
// intrinsic, or an indirect call with profile data.
@@ -357,6 +376,7 @@ static void computeFunctionSummary(
ValueInfo.updateRelBlockFreq(BBFreq, EntryFreq);
}
} else {
+ HasUnknownCall = true;
// Skip inline assembly calls.
if (CI && CI->isInlineAsm())
continue;
@@ -386,6 +406,7 @@ static void computeFunctionSummary(
.updateHotness(getHotness(Candidate.Count, PSI));
}
}
+ }
Index.addBlockCount(F.size());
std::vector<ValueInfo> Refs;
@@ -452,8 +473,9 @@ static void computeFunctionSummary(
: CalleeInfo::HotnessType::Critical);
bool NonRenamableLocal = isNonRenamableLocal(F);
- bool NotEligibleForImport =
- NonRenamableLocal || HasInlineAsmMaybeReferencingInternal;
+ bool NotEligibleForImport = NonRenamableLocal ||
+ HasInlineAsmMaybeReferencingInternal ||
+ HasIndirBranchToBlockAddress;
GlobalValueSummary::GVFlags Flags(
F.getLinkage(), F.getVisibility(), NotEligibleForImport,
/* Live = */ false, F.isDSOLocal(),
@@ -464,8 +486,9 @@ static void computeFunctionSummary(
F.hasFnAttribute(Attribute::NoRecurse), F.returnDoesNotAlias(),
// FIXME: refactor this to use the same code that inliner is using.
// Don't try to import functions with noinline attribute.
- F.getAttributes().hasFnAttribute(Attribute::NoInline),
- F.hasFnAttribute(Attribute::AlwaysInline)};
+ F.getAttributes().hasFnAttr(Attribute::NoInline),
+ F.hasFnAttribute(Attribute::AlwaysInline),
+ F.hasFnAttribute(Attribute::NoUnwind), MayThrow, HasUnknownCall};
std::vector<FunctionSummary::ParamAccess> ParamAccesses;
if (auto *SSI = GetSSICallback(F))
ParamAccesses = SSI->getParamAccesses(Index);
@@ -622,7 +645,7 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A,
/* Live = */ false, A.isDSOLocal(),
A.hasLinkOnceODRLinkage() && A.hasGlobalUnnamedAddr());
auto AS = std::make_unique<AliasSummary>(Flags);
- auto *Aliasee = A.getBaseObject();
+ auto *Aliasee = A.getAliaseeObject();
auto AliaseeVI = Index.getValueInfo(Aliasee->getGUID());
assert(AliaseeVI && "Alias expects aliasee summary to be available");
assert(AliaseeVI.getSummaryList().size() == 1 &&
@@ -711,7 +734,10 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
F->hasFnAttribute(Attribute::NoRecurse),
F->returnDoesNotAlias(),
/* NoInline = */ false,
- F->hasFnAttribute(Attribute::AlwaysInline)},
+ F->hasFnAttribute(Attribute::AlwaysInline),
+ F->hasFnAttribute(Attribute::NoUnwind),
+ /* MayThrow */ true,
+ /* HasUnknownCall */ true},
/*EntryCount=*/0, ArrayRef<ValueInfo>{},
ArrayRef<FunctionSummary::EdgeTy>{},
ArrayRef<GlobalValue::GUID>{},
diff --git a/llvm/lib/Analysis/ObjCARCInstKind.cpp b/llvm/lib/Analysis/ObjCARCInstKind.cpp
index 704d15f3280d..f74a9f7f104f 100644
--- a/llvm/lib/Analysis/ObjCARCInstKind.cpp
+++ b/llvm/lib/Analysis/ObjCARCInstKind.cpp
@@ -296,9 +296,8 @@ ARCInstKind llvm::objcarc::GetARCInstKind(const Value *V) {
// operand isn't actually being dereferenced, it is being stored to
// memory where we can no longer track who might read it and dereference
// it, so we have to consider it potentially used.
- for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end();
- OI != OE; ++OI)
- if (IsPotentialRetainableObjPtr(*OI))
+ for (const Use &U : I->operands())
+ if (IsPotentialRetainableObjPtr(U))
return ARCInstKind::User;
}
}
diff --git a/llvm/lib/Analysis/OverflowInstAnalysis.cpp b/llvm/lib/Analysis/OverflowInstAnalysis.cpp
index 9f17d5b2064d..87a85e6a7364 100644
--- a/llvm/lib/Analysis/OverflowInstAnalysis.cpp
+++ b/llvm/lib/Analysis/OverflowInstAnalysis.cpp
@@ -69,4 +69,4 @@ bool llvm::isCheckForZeroAndMulWithOverflow(Value *Op0, Value *Op1,
bool IsAnd) {
Use *Y;
return isCheckForZeroAndMulWithOverflow(Op0, Op1, IsAnd, Y);
-} \ No newline at end of file
+}
diff --git a/llvm/lib/Analysis/PHITransAddr.cpp b/llvm/lib/Analysis/PHITransAddr.cpp
index 7f77ab146c4c..c73e1fd82915 100644
--- a/llvm/lib/Analysis/PHITransAddr.cpp
+++ b/llvm/lib/Analysis/PHITransAddr.cpp
@@ -226,8 +226,8 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB,
return GEP;
// Simplify the GEP to handle 'gep x, 0' -> x etc.
- if (Value *V = SimplifyGEPInst(GEP->getSourceElementType(),
- GEPOps, {DL, TLI, DT, AC})) {
+ if (Value *V = SimplifyGEPInst(GEP->getSourceElementType(), GEPOps,
+ GEP->isInBounds(), {DL, TLI, DT, AC})) {
for (unsigned i = 0, e = GEPOps.size(); i != e; ++i)
RemoveInstInputs(GEPOps[i], InstInputs);
diff --git a/llvm/lib/Analysis/ProfileSummaryInfo.cpp b/llvm/lib/Analysis/ProfileSummaryInfo.cpp
index 6dda0bf0a1b4..268ed9d04741 100644
--- a/llvm/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/llvm/lib/Analysis/ProfileSummaryInfo.cpp
@@ -103,7 +103,7 @@ bool ProfileSummaryInfo::isFunctionEntryHot(const Function *F) const {
// FIXME: The heuristic used below for determining hotness is based on
// preliminary SPEC tuning for inliner. This will eventually be a
// convenience method that calls isHotCount.
- return FunctionCount && isHotCount(FunctionCount.getCount());
+ return FunctionCount && isHotCount(FunctionCount->getCount());
}
/// Returns true if the function contains hot code. This can include a hot
@@ -116,7 +116,7 @@ bool ProfileSummaryInfo::isFunctionHotInCallGraph(
if (!F || !hasProfileSummary())
return false;
if (auto FunctionCount = F->getEntryCount())
- if (isHotCount(FunctionCount.getCount()))
+ if (isHotCount(FunctionCount->getCount()))
return true;
if (hasSampleProfile()) {
@@ -145,7 +145,7 @@ bool ProfileSummaryInfo::isFunctionColdInCallGraph(
if (!F || !hasProfileSummary())
return false;
if (auto FunctionCount = F->getEntryCount())
- if (!isColdCount(FunctionCount.getCount()))
+ if (!isColdCount(FunctionCount->getCount()))
return false;
if (hasSampleProfile()) {
@@ -176,10 +176,10 @@ bool ProfileSummaryInfo::isFunctionHotOrColdInCallGraphNthPercentile(
return false;
if (auto FunctionCount = F->getEntryCount()) {
if (isHot &&
- isHotCountNthPercentile(PercentileCutoff, FunctionCount.getCount()))
+ isHotCountNthPercentile(PercentileCutoff, FunctionCount->getCount()))
return true;
if (!isHot &&
- !isColdCountNthPercentile(PercentileCutoff, FunctionCount.getCount()))
+ !isColdCountNthPercentile(PercentileCutoff, FunctionCount->getCount()))
return false;
}
if (hasSampleProfile()) {
@@ -230,7 +230,7 @@ bool ProfileSummaryInfo::isFunctionEntryCold(const Function *F) const {
// FIXME: The heuristic used below for determining coldness is based on
// preliminary SPEC tuning for inliner. This will eventually be a
// convenience method that calls isColdCount.
- return FunctionCount && isColdCount(FunctionCount.getCount());
+ return FunctionCount && isColdCount(FunctionCount->getCount());
}
/// Compute the hot and cold thresholds.
@@ -316,11 +316,11 @@ bool ProfileSummaryInfo::isColdCountNthPercentile(int PercentileCutoff,
}
uint64_t ProfileSummaryInfo::getOrCompHotCountThreshold() const {
- return HotCountThreshold ? HotCountThreshold.getValue() : UINT64_MAX;
+ return HotCountThreshold.getValueOr(UINT64_MAX);
}
uint64_t ProfileSummaryInfo::getOrCompColdCountThreshold() const {
- return ColdCountThreshold ? ColdCountThreshold.getValue() : 0;
+ return ColdCountThreshold.getValueOr(0);
}
bool ProfileSummaryInfo::isHotBlock(const BasicBlock *BB,
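
The mechanical edits in this file follow from Function::getEntryCount() apparently now returning its result wrapped in an Optional (hence the -> dereference), plus the getValueOr shorthand on the cached thresholds. A self-contained sketch of both patterns (the Count struct is a stand-in for ProfileCount):

    #include "llvm/ADT/Optional.h"
    #include <cstdint>

    struct Count {
      uint64_t Value;
      uint64_t getCount() const { return Value; }
    };

    // Optional::operator-> forwards to the wrapped value, hence the switch from
    // FunctionCount.getCount() to FunctionCount->getCount() in the hunks above.
    bool isHotEntry(llvm::Optional<Count> FunctionCount, uint64_t HotThreshold) {
      return FunctionCount && FunctionCount->getCount() >= HotThreshold;
    }

    // getValueOr collapses the "X ? X.getValue() : Default" spelling.
    uint64_t thresholdOrDefault(llvm::Optional<uint64_t> Threshold) {
      return Threshold.getValueOr(UINT64_MAX);
    }
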
diff --git a/llvm/lib/Analysis/ReplayInlineAdvisor.cpp b/llvm/lib/Analysis/ReplayInlineAdvisor.cpp
index b9dac2f3ff11..f83d8b0fd230 100644
--- a/llvm/lib/Analysis/ReplayInlineAdvisor.cpp
+++ b/llvm/lib/Analysis/ReplayInlineAdvisor.cpp
@@ -17,18 +17,21 @@
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/LineIterator.h"
+#include <memory>
using namespace llvm;
-#define DEBUG_TYPE "inline-replay"
+#define DEBUG_TYPE "replay-inline"
ReplayInlineAdvisor::ReplayInlineAdvisor(
Module &M, FunctionAnalysisManager &FAM, LLVMContext &Context,
- std::unique_ptr<InlineAdvisor> OriginalAdvisor, StringRef RemarksFile,
- bool EmitRemarks)
+ std::unique_ptr<InlineAdvisor> OriginalAdvisor,
+ const ReplayInlinerSettings &ReplaySettings, bool EmitRemarks)
: InlineAdvisor(M, FAM), OriginalAdvisor(std::move(OriginalAdvisor)),
- HasReplayRemarks(false), EmitRemarks(EmitRemarks) {
- auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(RemarksFile);
+ HasReplayRemarks(false), ReplaySettings(ReplaySettings),
+ EmitRemarks(EmitRemarks) {
+
+ auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(ReplaySettings.ReplayFile);
std::error_code EC = BufferOrErr.getError();
if (EC) {
Context.emitError("Could not open remarks file: " + EC.message());
@@ -36,47 +39,112 @@ ReplayInlineAdvisor::ReplayInlineAdvisor(
}
// Example for inline remarks to parse:
- // main:3:1.1: _Z3subii inlined into main at callsite sum:1 @ main:3:1.1
+ // main:3:1.1: '_Z3subii' inlined into 'main' at callsite sum:1 @
+ // main:3:1.1;
// We use the callsite string after `at callsite` to replay inlining.
line_iterator LineIt(*BufferOrErr.get(), /*SkipBlanks=*/true);
+ const std::string PositiveRemark = "' inlined into '";
+ const std::string NegativeRemark = "' will not be inlined into '";
+
for (; !LineIt.is_at_eof(); ++LineIt) {
StringRef Line = *LineIt;
auto Pair = Line.split(" at callsite ");
- auto Callee = Pair.first.split(" inlined into").first.rsplit(": ").second;
+ bool IsPositiveRemark = true;
+ if (Pair.first.contains(NegativeRemark))
+ IsPositiveRemark = false;
+
+ auto CalleeCaller =
+ Pair.first.split(IsPositiveRemark ? PositiveRemark : NegativeRemark);
+
+ StringRef Callee = CalleeCaller.first.rsplit(": '").second;
+ StringRef Caller = CalleeCaller.second.rsplit("'").first;
auto CallSite = Pair.second.split(";").first;
- if (Callee.empty() || CallSite.empty())
- continue;
+ if (Callee.empty() || Caller.empty() || CallSite.empty()) {
+ Context.emitError("Invalid remark format: " + Line);
+ return;
+ }
std::string Combined = (Callee + CallSite).str();
- InlineSitesFromRemarks.insert(Combined);
+ InlineSitesFromRemarks[Combined] = IsPositiveRemark;
+ if (ReplaySettings.ReplayScope == ReplayInlinerSettings::Scope::Function)
+ CallersToReplay.insert(Caller);
}
HasReplayRemarks = true;
}
+std::unique_ptr<InlineAdvisor> llvm::getReplayInlineAdvisor(
+ Module &M, FunctionAnalysisManager &FAM, LLVMContext &Context,
+ std::unique_ptr<InlineAdvisor> OriginalAdvisor,
+ const ReplayInlinerSettings &ReplaySettings, bool EmitRemarks) {
+ auto Advisor = std::make_unique<ReplayInlineAdvisor>(
+ M, FAM, Context, std::move(OriginalAdvisor), ReplaySettings, EmitRemarks);
+ if (!Advisor->areReplayRemarksLoaded())
+ Advisor.reset();
+ return Advisor;
+}
+
std::unique_ptr<InlineAdvice> ReplayInlineAdvisor::getAdviceImpl(CallBase &CB) {
assert(HasReplayRemarks);
Function &Caller = *CB.getCaller();
auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(Caller);
- if (InlineSitesFromRemarks.empty())
- return std::make_unique<DefaultInlineAdvice>(this, CB, None, ORE,
- EmitRemarks);
+ // Decision not made by replay system
+ if (!hasInlineAdvice(*CB.getFunction())) {
+ // If there's a registered original advisor, return its decision
+ if (OriginalAdvisor)
+ return OriginalAdvisor->getAdvice(CB);
- std::string CallSiteLoc = getCallSiteLocation(CB.getDebugLoc());
+ // If no decision is made above, return non-decision
+ return {};
+ }
+
+ std::string CallSiteLoc =
+ formatCallSiteLocation(CB.getDebugLoc(), ReplaySettings.ReplayFormat);
StringRef Callee = CB.getCalledFunction()->getName();
std::string Combined = (Callee + CallSiteLoc).str();
- auto Iter = InlineSitesFromRemarks.find(Combined);
- Optional<InlineCost> InlineRecommended = None;
+ // Replay decision, if it has one
+ auto Iter = InlineSitesFromRemarks.find(Combined);
if (Iter != InlineSitesFromRemarks.end()) {
- InlineRecommended = llvm::InlineCost::getAlways("found in replay");
+ if (InlineSitesFromRemarks[Combined]) {
+ LLVM_DEBUG(dbgs() << "Replay Inliner: Inlined " << Callee << " @ "
+ << CallSiteLoc << "\n");
+ return std::make_unique<DefaultInlineAdvice>(
+ this, CB, llvm::InlineCost::getAlways("previously inlined"), ORE,
+ EmitRemarks);
+ } else {
+ LLVM_DEBUG(dbgs() << "Replay Inliner: Not Inlined " << Callee << " @ "
+ << CallSiteLoc << "\n");
+ // A negative inline is conveyed by "None" Optional<InlineCost>
+ return std::make_unique<DefaultInlineAdvice>(this, CB, None, ORE,
+ EmitRemarks);
+ }
+ }
+
+ // Fallback decisions
+ if (ReplaySettings.ReplayFallback ==
+ ReplayInlinerSettings::Fallback::AlwaysInline)
+ return std::make_unique<DefaultInlineAdvice>(
+ this, CB, llvm::InlineCost::getAlways("AlwaysInline Fallback"), ORE,
+ EmitRemarks);
+ else if (ReplaySettings.ReplayFallback ==
+ ReplayInlinerSettings::Fallback::NeverInline)
+ // A negative inline is conveyed by "None" Optional<InlineCost>
+ return std::make_unique<DefaultInlineAdvice>(this, CB, None, ORE,
+ EmitRemarks);
+ else {
+ assert(ReplaySettings.ReplayFallback ==
+ ReplayInlinerSettings::Fallback::Original);
+ // If there's a registered original advisor, return its decision
+ if (OriginalAdvisor)
+ return OriginalAdvisor->getAdvice(CB);
}
- return std::make_unique<DefaultInlineAdvice>(this, CB, InlineRecommended, ORE,
- EmitRemarks);
+ // If no decision is made above, return non-decision
+ return {};
}
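
The constructor now parses both positive and negative remarks and remembers the caller so replay can be scoped per function; the parsing itself is plain StringRef surgery keyed to the quoted remark format shown in the comment above. The same logic as a standalone sketch (the struct and function names are illustrative):

    #include "llvm/ADT/StringRef.h"

    struct ParsedRemark {
      llvm::StringRef Callee, Caller, CallSite;
      bool IsInlined;
    };

    // Parses "<loc>: '<callee>' inlined into '<caller>' at callsite <site>;"
    // and the corresponding "will not be inlined into" negative form.
    bool parseReplayRemark(llvm::StringRef Line, ParsedRemark &Out) {
      const llvm::StringRef Positive = "' inlined into '";
      const llvm::StringRef Negative = "' will not be inlined into '";
      auto Pair = Line.split(" at callsite ");
      Out.IsInlined = !Pair.first.contains(Negative);
      auto CalleeCaller = Pair.first.split(Out.IsInlined ? Positive : Negative);
      Out.Callee = CalleeCaller.first.rsplit(": '").second;
      Out.Caller = CalleeCaller.second.rsplit("'").first;
      Out.CallSite = Pair.second.split(";").first;
      return !Out.Callee.empty() && !Out.Caller.empty() && !Out.CallSite.empty();
    }
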
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index f22d834b5e57..f7c22cfb0310 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -139,8 +139,6 @@ using namespace PatternMatch;
#define DEBUG_TYPE "scalar-evolution"
-STATISTIC(NumArrayLenItCounts,
- "Number of trip counts computed with array length");
STATISTIC(NumTripCountsComputed,
"Number of loops with predictable loop counts");
STATISTIC(NumTripCountsNotComputed,
@@ -1100,7 +1098,7 @@ const SCEV *ScalarEvolution::getLosslessPtrToIntExpr(const SCEV *Op,
SCEV *S = new (SCEVAllocator)
SCEVPtrToIntExpr(ID.Intern(SCEVAllocator), Op, IntPtrTy);
UniqueSCEVs.InsertNode(S, IP);
- addToLoopUseLists(S);
+ registerUser(S, Op);
return S;
}
@@ -1220,7 +1218,7 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op, Type *Ty,
SCEV *S =
new (SCEVAllocator) SCEVTruncateExpr(ID.Intern(SCEVAllocator), Op, Ty);
UniqueSCEVs.InsertNode(S, IP);
- addToLoopUseLists(S);
+ registerUser(S, Op);
return S;
}
@@ -1274,7 +1272,7 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op, Type *Ty,
SCEV *S = new (SCEVAllocator) SCEVTruncateExpr(ID.Intern(SCEVAllocator),
Op, Ty);
UniqueSCEVs.InsertNode(S, IP);
- addToLoopUseLists(S);
+ registerUser(S, Op);
return S;
}
@@ -1603,7 +1601,7 @@ ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
SCEV *S = new (SCEVAllocator) SCEVZeroExtendExpr(ID.Intern(SCEVAllocator),
Op, Ty);
UniqueSCEVs.InsertNode(S, IP);
- addToLoopUseLists(S);
+ registerUser(S, Op);
return S;
}
@@ -1872,7 +1870,7 @@ ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
SCEV *S = new (SCEVAllocator) SCEVZeroExtendExpr(ID.Intern(SCEVAllocator),
Op, Ty);
UniqueSCEVs.InsertNode(S, IP);
- addToLoopUseLists(S);
+ registerUser(S, Op);
return S;
}
@@ -1911,7 +1909,7 @@ ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
SCEV *S = new (SCEVAllocator) SCEVSignExtendExpr(ID.Intern(SCEVAllocator),
Op, Ty);
UniqueSCEVs.InsertNode(S, IP);
- addToLoopUseLists(S);
+ registerUser(S, Op);
return S;
}
@@ -2108,7 +2106,7 @@ ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
SCEV *S = new (SCEVAllocator) SCEVSignExtendExpr(ID.Intern(SCEVAllocator),
Op, Ty);
UniqueSCEVs.InsertNode(S, IP);
- addToLoopUseLists(S);
+ registerUser(S, Op);
return S;
}
@@ -2390,6 +2388,24 @@ StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type,
}
}
+ // <0,+,nonnegative><nw> is also nuw
+ // TODO: Add corresponding nsw case
+ if (Type == scAddRecExpr && ScalarEvolution::hasFlags(Flags, SCEV::FlagNW) &&
+ !ScalarEvolution::hasFlags(Flags, SCEV::FlagNUW) && Ops.size() == 2 &&
+ Ops[0]->isZero() && IsKnownNonNegative(Ops[1]))
+ Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW);
+
+ // both (udiv X, Y) * Y and Y * (udiv X, Y) are always NUW
+ if (Type == scMulExpr && !ScalarEvolution::hasFlags(Flags, SCEV::FlagNUW) &&
+ Ops.size() == 2) {
+ if (auto *UDiv = dyn_cast<SCEVUDivExpr>(Ops[0]))
+ if (UDiv->getOperand(1) == Ops[1])
+ Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW);
+ if (auto *UDiv = dyn_cast<SCEVUDivExpr>(Ops[1]))
+ if (UDiv->getOperand(1) == Ops[0])
+ Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW);
+ }
+
return Flags;
}
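
The new multiply rule ("both (udiv X, Y) * Y and Y * (udiv X, Y) are always NUW") rests on the same identity the addition canonicalization below exploits: for nonzero Y,

    (X udiv Y) * Y == X - (X urem Y) <= X,

so the product never exceeds X in the same bit width and cannot wrap in the unsigned sense (a zero Y makes the udiv poison, so the flag is irrelevant in that case).
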
@@ -2449,7 +2465,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
if (Depth > MaxArithDepth || hasHugeExpression(Ops))
return getOrCreateAddExpr(Ops, ComputeFlags(Ops));
- if (SCEV *S = std::get<0>(findExistingSCEVInCache(scAddExpr, Ops))) {
+ if (SCEV *S = findExistingSCEVInCache(scAddExpr, Ops)) {
// Don't strengthen flags if we have no new information.
SCEVAddExpr *Add = static_cast<SCEVAddExpr *>(S);
if (Add->getNoWrapFlags(OrigFlags) != OrigFlags)
@@ -2562,8 +2578,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
APInt ConstAdd = C1 + C2;
auto AddFlags = AddExpr->getNoWrapFlags();
// Adding a smaller constant is NUW if the original AddExpr was NUW.
- if (ScalarEvolution::maskFlags(AddFlags, SCEV::FlagNUW) ==
- SCEV::FlagNUW &&
+ if (ScalarEvolution::hasFlags(AddFlags, SCEV::FlagNUW) &&
ConstAdd.ule(C1)) {
PreservedFlags =
ScalarEvolution::setFlags(PreservedFlags, SCEV::FlagNUW);
@@ -2571,8 +2586,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
// Adding a constant with the same sign and small magnitude is NSW, if the
// original AddExpr was NSW.
- if (ScalarEvolution::maskFlags(AddFlags, SCEV::FlagNSW) ==
- SCEV::FlagNSW &&
+ if (ScalarEvolution::hasFlags(AddFlags, SCEV::FlagNSW) &&
C1.isSignBitSet() == ConstAdd.isSignBitSet() &&
ConstAdd.abs().ule(C1.abs())) {
PreservedFlags =
@@ -2580,14 +2594,26 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
}
if (PreservedFlags != SCEV::FlagAnyWrap) {
- SmallVector<const SCEV *, 4> NewOps(AddExpr->op_begin(),
- AddExpr->op_end());
+ SmallVector<const SCEV *, 4> NewOps(AddExpr->operands());
NewOps[0] = getConstant(ConstAdd);
return getAddExpr(NewOps, PreservedFlags);
}
}
}
+ // Canonicalize (-1 * urem X, Y) + X --> (Y * X/Y)
+ if (Ops.size() == 2) {
+ const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Ops[0]);
+ if (Mul && Mul->getNumOperands() == 2 &&
+ Mul->getOperand(0)->isAllOnesValue()) {
+ const SCEV *X;
+ const SCEV *Y;
+ if (matchURem(Mul->getOperand(1), X, Y) && X == Ops[1]) {
+ return getMulExpr(Y, getUDivExpr(X, Y));
+ }
+ }
+ }
+
// Skip past any other cast SCEVs.
while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddExpr)
++Idx;
@@ -2766,7 +2792,8 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
// If we found some loop invariants, fold them into the recurrence.
if (!LIOps.empty()) {
// Compute nowrap flags for the addition of the loop-invariant ops and
- // the addrec. Temporarily push it as an operand for that purpose.
+ // the addrec. Temporarily push it as an operand for that purpose. These
+ // flags are valid in the scope of the addrec only.
LIOps.push_back(AddRec);
SCEV::NoWrapFlags Flags = ComputeFlags(LIOps);
LIOps.pop_back();
@@ -2775,10 +2802,25 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
LIOps.push_back(AddRec->getStart());
SmallVector<const SCEV *, 4> AddRecOps(AddRec->operands());
- // This follows from the fact that the no-wrap flags on the outer add
- // expression are applicable on the 0th iteration, when the add recurrence
- // will be equal to its start value.
- AddRecOps[0] = getAddExpr(LIOps, Flags, Depth + 1);
+
+ // It is not in general safe to propagate flags valid on an add within
+ // the addrec scope to one outside it. We must prove that the inner
+ // scope is guaranteed to execute if the outer one does to be able to
+ // safely propagate. We know the program is undefined if poison is
+ // produced on the inner scoped addrec. We also know that *for this use*
+ // the outer scoped add can't overflow (because of the flags we just
+ // computed for the inner scoped add) without the program being undefined.
+ // Proving that entry to the outer scope necessitates entry to the inner
+ // scope, thus proves the program undefined if the flags would be violated
+ // in the outer scope.
+ SCEV::NoWrapFlags AddFlags = Flags;
+ if (AddFlags != SCEV::FlagAnyWrap) {
+ auto *DefI = getDefiningScopeBound(LIOps);
+ auto *ReachI = &*AddRecLoop->getHeader()->begin();
+ if (!isGuaranteedToTransferExecutionTo(DefI, ReachI))
+ AddFlags = SCEV::FlagAnyWrap;
+ }
+ AddRecOps[0] = getAddExpr(LIOps, AddFlags, Depth + 1);
// Build the new addrec. Propagate the NUW and NSW flags if both the
// outer add and the inner addrec are guaranteed to have no overflow.
@@ -2862,7 +2904,7 @@ ScalarEvolution::getOrCreateAddExpr(ArrayRef<const SCEV *> Ops,
S = new (SCEVAllocator)
SCEVAddExpr(ID.Intern(SCEVAllocator), O, Ops.size());
UniqueSCEVs.InsertNode(S, IP);
- addToLoopUseLists(S);
+ registerUser(S, Ops);
}
S->setNoWrapFlags(Flags);
return S;
@@ -2885,7 +2927,8 @@ ScalarEvolution::getOrCreateAddRecExpr(ArrayRef<const SCEV *> Ops,
S = new (SCEVAllocator)
SCEVAddRecExpr(ID.Intern(SCEVAllocator), O, Ops.size(), L);
UniqueSCEVs.InsertNode(S, IP);
- addToLoopUseLists(S);
+ LoopUsers[L].push_back(S);
+ registerUser(S, Ops);
}
setNoWrapFlags(S, Flags);
return S;
@@ -2907,7 +2950,7 @@ ScalarEvolution::getOrCreateMulExpr(ArrayRef<const SCEV *> Ops,
S = new (SCEVAllocator) SCEVMulExpr(ID.Intern(SCEVAllocator),
O, Ops.size());
UniqueSCEVs.InsertNode(S, IP);
- addToLoopUseLists(S);
+ registerUser(S, Ops);
}
S->setNoWrapFlags(Flags);
return S;
@@ -3022,7 +3065,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
if (Depth > MaxArithDepth || hasHugeExpression(Ops))
return getOrCreateMulExpr(Ops, ComputeFlags(Ops));
- if (SCEV *S = std::get<0>(findExistingSCEVInCache(scMulExpr, Ops))) {
+ if (SCEV *S = findExistingSCEVInCache(scMulExpr, Ops)) {
// Don't strengthen flags if we have no new information.
SCEVMulExpr *Mul = static_cast<SCEVMulExpr *>(S);
if (Mul->getNoWrapFlags(OrigFlags) != OrigFlags)
@@ -3416,7 +3459,7 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
SCEV *S = new (SCEVAllocator) SCEVUDivExpr(ID.Intern(SCEVAllocator),
LHS, RHS);
UniqueSCEVs.InsertNode(S, IP);
- addToLoopUseLists(S);
+ registerUser(S, {LHS, RHS});
return S;
}
@@ -3593,13 +3636,21 @@ ScalarEvolution::getGEPExpr(GEPOperator *GEP,
// getSCEV(Base)->getType() has the same address space as Base->getType()
// because SCEV::getType() preserves the address space.
Type *IntIdxTy = getEffectiveSCEVType(BaseExpr->getType());
- // FIXME(PR23527): Don't blindly transfer the inbounds flag from the GEP
- // instruction to its SCEV, because the Instruction may be guarded by control
- // flow and the no-overflow bits may not be valid for the expression in any
- // context. This can be fixed similarly to how these flags are handled for
- // adds.
+ const bool AssumeInBoundsFlags = [&]() {
+ if (!GEP->isInBounds())
+ return false;
+
+ // We'd like to propagate flags from the IR to the corresponding SCEV nodes,
+ // but to do that, we have to ensure that said flag is valid in the entire
+ // defined scope of the SCEV.
+ auto *GEPI = dyn_cast<Instruction>(GEP);
+ // TODO: non-instructions have global scope. We might be able to prove
+ // some global scope cases
+ return GEPI && isSCEVExprNeverPoison(GEPI);
+ }();
+
SCEV::NoWrapFlags OffsetWrap =
- GEP->isInBounds() ? SCEV::FlagNSW : SCEV::FlagAnyWrap;
+ AssumeInBoundsFlags ? SCEV::FlagNSW : SCEV::FlagAnyWrap;
Type *CurTy = GEP->getType();
bool FirstIter = true;
@@ -3645,21 +3696,22 @@ ScalarEvolution::getGEPExpr(GEPOperator *GEP,
// Add the base address and the offset. We cannot use the nsw flag, as the
// base address is unsigned. However, if we know that the offset is
// non-negative, we can use nuw.
- SCEV::NoWrapFlags BaseWrap = GEP->isInBounds() && isKnownNonNegative(Offset)
+ SCEV::NoWrapFlags BaseWrap = AssumeInBoundsFlags && isKnownNonNegative(Offset)
? SCEV::FlagNUW : SCEV::FlagAnyWrap;
- return getAddExpr(BaseExpr, Offset, BaseWrap);
+ auto *GEPExpr = getAddExpr(BaseExpr, Offset, BaseWrap);
+ assert(BaseExpr->getType() == GEPExpr->getType() &&
+ "GEP should not change type mid-flight.");
+ return GEPExpr;
}
-std::tuple<SCEV *, FoldingSetNodeID, void *>
-ScalarEvolution::findExistingSCEVInCache(SCEVTypes SCEVType,
- ArrayRef<const SCEV *> Ops) {
+SCEV *ScalarEvolution::findExistingSCEVInCache(SCEVTypes SCEVType,
+ ArrayRef<const SCEV *> Ops) {
FoldingSetNodeID ID;
- void *IP = nullptr;
ID.AddInteger(SCEVType);
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
ID.AddPointer(Ops[i]);
- return std::tuple<SCEV *, FoldingSetNodeID, void *>(
- UniqueSCEVs.FindNodeOrInsertPos(ID, IP), std::move(ID), IP);
+ void *IP = nullptr;
+ return UniqueSCEVs.FindNodeOrInsertPos(ID, IP);
}
const SCEV *ScalarEvolution::getAbsExpr(const SCEV *Op, bool IsNSW) {
@@ -3689,7 +3741,7 @@ const SCEV *ScalarEvolution::getMinMaxExpr(SCEVTypes Kind,
GroupByComplexity(Ops, &LI, DT);
// Check if we have created the same expression before.
- if (const SCEV *S = std::get<0>(findExistingSCEVInCache(Kind, Ops))) {
+ if (const SCEV *S = findExistingSCEVInCache(Kind, Ops)) {
return S;
}
@@ -3787,10 +3839,12 @@ const SCEV *ScalarEvolution::getMinMaxExpr(SCEVTypes Kind,
// Okay, it looks like we really DO need an expr. Check to see if we
// already have one, otherwise create a new one.
- const SCEV *ExistingSCEV;
FoldingSetNodeID ID;
- void *IP;
- std::tie(ExistingSCEV, ID, IP) = findExistingSCEVInCache(Kind, Ops);
+ ID.AddInteger(Kind);
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ ID.AddPointer(Ops[i]);
+ void *IP = nullptr;
+ const SCEV *ExistingSCEV = UniqueSCEVs.FindNodeOrInsertPos(ID, IP);
if (ExistingSCEV)
return ExistingSCEV;
const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
@@ -3799,7 +3853,7 @@ const SCEV *ScalarEvolution::getMinMaxExpr(SCEVTypes Kind,
SCEVMinMaxExpr(ID.Intern(SCEVAllocator), Kind, O, Ops.size());
UniqueSCEVs.InsertNode(S, IP);
- addToLoopUseLists(S);
+ registerUser(S, Ops);
return S;
}
@@ -3943,6 +3997,21 @@ Type *ScalarEvolution::getWiderType(Type *T1, Type *T2) const {
return getTypeSizeInBits(T1) >= getTypeSizeInBits(T2) ? T1 : T2;
}
+bool ScalarEvolution::instructionCouldExistWitthOperands(const SCEV *A,
+ const SCEV *B) {
+ /// For a valid use point to exist, the defining scope of one operand
+ /// must dominate the other.
+ bool PreciseA, PreciseB;
+ auto *ScopeA = getDefiningScopeBound({A}, PreciseA);
+ auto *ScopeB = getDefiningScopeBound({B}, PreciseB);
+ if (!PreciseA || !PreciseB)
+ // Can't tell.
+ return false;
+ return (ScopeA == ScopeB) || DT.dominates(ScopeA, ScopeB) ||
+ DT.dominates(ScopeB, ScopeA);
+}
+
+
const SCEV *ScalarEvolution::getCouldNotCompute() {
return CouldNotCompute.get();
}
@@ -4025,24 +4094,6 @@ void ScalarEvolution::eraseValueFromMap(Value *V) {
}
}
-/// Check whether value has nuw/nsw/exact set but SCEV does not.
-/// TODO: In reality it is better to check the poison recursively
-/// but this is better than nothing.
-static bool SCEVLostPoisonFlags(const SCEV *S, const Value *V) {
- if (auto *I = dyn_cast<Instruction>(V)) {
- if (isa<OverflowingBinaryOperator>(I)) {
- if (auto *NS = dyn_cast<SCEVNAryExpr>(S)) {
- if (I->hasNoSignedWrap() && !NS->hasNoSignedWrap())
- return true;
- if (I->hasNoUnsignedWrap() && !NS->hasNoUnsignedWrap())
- return true;
- }
- } else if (isa<PossiblyExactOperator>(I) && I->isExact())
- return true;
- }
- return false;
-}
-
/// Return an existing SCEV if it exists, otherwise analyze the expression and
/// create a new one.
const SCEV *ScalarEvolution::getSCEV(Value *V) {
@@ -4056,7 +4107,7 @@ const SCEV *ScalarEvolution::getSCEV(Value *V) {
// ValueExprMap before insert S->{V, 0} into ExprValueMap.
std::pair<ValueExprMapType::iterator, bool> Pair =
ValueExprMap.insert({SCEVCallbackVH(V, this), S});
- if (Pair.second && !SCEVLostPoisonFlags(S, V)) {
+ if (Pair.second) {
ExprValueMap[S].insert({V, nullptr});
// If S == Stripped + Offset, add Stripped -> {V, Offset} into
@@ -4120,6 +4171,8 @@ static const SCEV *MatchNotExpr(const SCEV *Expr) {
/// Return a SCEV corresponding to ~V = -1-V
const SCEV *ScalarEvolution::getNotSCEV(const SCEV *V) {
+ assert(!V->getType()->isPointerTy() && "Can't negate pointer");
+
if (const SCEVConstant *VC = dyn_cast<SCEVConstant>(V))
return getConstant(
cast<ConstantInt>(ConstantExpr::getNot(VC->getValue())));
@@ -4146,17 +4199,16 @@ const SCEV *ScalarEvolution::getNotSCEV(const SCEV *V) {
return getMinusSCEV(getMinusOne(Ty), V);
}
-/// Compute an expression equivalent to S - getPointerBase(S).
-static const SCEV *removePointerBase(ScalarEvolution *SE, const SCEV *P) {
+const SCEV *ScalarEvolution::removePointerBase(const SCEV *P) {
assert(P->getType()->isPointerTy());
if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(P)) {
// The base of an AddRec is the first operand.
SmallVector<const SCEV *> Ops{AddRec->operands()};
- Ops[0] = removePointerBase(SE, Ops[0]);
+ Ops[0] = removePointerBase(Ops[0]);
// Don't try to transfer nowrap flags for now. We could in some cases
// (for example, if pointer operand of the AddRec is a SCEVUnknown).
- return SE->getAddRecExpr(Ops, AddRec->getLoop(), SCEV::FlagAnyWrap);
+ return getAddRecExpr(Ops, AddRec->getLoop(), SCEV::FlagAnyWrap);
}
if (auto *Add = dyn_cast<SCEVAddExpr>(P)) {
// The base of an Add is the pointer operand.
@@ -4164,21 +4216,17 @@ static const SCEV *removePointerBase(ScalarEvolution *SE, const SCEV *P) {
const SCEV **PtrOp = nullptr;
for (const SCEV *&AddOp : Ops) {
if (AddOp->getType()->isPointerTy()) {
- // If we find an Add with multiple pointer operands, treat it as a
- // pointer base to be consistent with getPointerBase. Eventually
- // we should be able to assert this is impossible.
- if (PtrOp)
- return SE->getZero(P->getType());
+ assert(!PtrOp && "Cannot have multiple pointer ops");
PtrOp = &AddOp;
}
}
- *PtrOp = removePointerBase(SE, *PtrOp);
+ *PtrOp = removePointerBase(*PtrOp);
// Don't try to transfer nowrap flags for now. We could in some cases
// (for example, if the pointer operand of the Add is a SCEVUnknown).
- return SE->getAddExpr(Ops);
+ return getAddExpr(Ops);
}
// Any other expression must be a pointer base.
- return SE->getZero(P->getType());
+ return getZero(P->getType());
}
const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS,
@@ -4195,8 +4243,8 @@ const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS,
if (!LHS->getType()->isPointerTy() ||
getPointerBase(LHS) != getPointerBase(RHS))
return getCouldNotCompute();
- LHS = removePointerBase(this, LHS);
- RHS = removePointerBase(this, RHS);
+ LHS = removePointerBase(LHS);
+ RHS = removePointerBase(RHS);
}
// We represent LHS - RHS as LHS + (-1)*RHS. This transformation
@@ -4204,7 +4252,7 @@ const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS,
auto AddFlags = SCEV::FlagAnyWrap;
const bool RHSIsNotMinSigned =
!getSignedRangeMin(RHS).isMinSignedValue();
- if (maskFlags(Flags, SCEV::FlagNSW) == SCEV::FlagNSW) {
+ if (hasFlags(Flags, SCEV::FlagNSW)) {
// Let M be the minimum representable signed value. Then (-1)*RHS
// signed-wraps if and only if RHS is M. That can happen even for
// a NSW subtraction because e.g. (-1)*M signed-wraps even though
@@ -4359,14 +4407,11 @@ const SCEV *ScalarEvolution::getPointerBase(const SCEV *V) {
const SCEV *PtrOp = nullptr;
for (const SCEV *AddOp : Add->operands()) {
if (AddOp->getType()->isPointerTy()) {
- // Cannot find the base of an expression with multiple pointer ops.
- if (PtrOp)
- return V;
+ assert(!PtrOp && "Cannot have multiple pointer ops");
PtrOp = AddOp;
}
}
- if (!PtrOp) // All operands were non-pointer.
- return V;
+ assert(PtrOp && "Must have pointer op");
V = PtrOp;
} else // Not something we can look further into.
return V;
@@ -4374,24 +4419,25 @@ const SCEV *ScalarEvolution::getPointerBase(const SCEV *V) {
}
/// Push users of the given Instruction onto the given Worklist.
-static void
-PushDefUseChildren(Instruction *I,
- SmallVectorImpl<Instruction *> &Worklist) {
+static void PushDefUseChildren(Instruction *I,
+ SmallVectorImpl<Instruction *> &Worklist,
+ SmallPtrSetImpl<Instruction *> &Visited) {
// Push the def-use children onto the Worklist stack.
- for (User *U : I->users())
- Worklist.push_back(cast<Instruction>(U));
+ for (User *U : I->users()) {
+ auto *UserInsn = cast<Instruction>(U);
+ if (Visited.insert(UserInsn).second)
+ Worklist.push_back(UserInsn);
+ }
}
void ScalarEvolution::forgetSymbolicName(Instruction *PN, const SCEV *SymName) {
SmallVector<Instruction *, 16> Worklist;
- PushDefUseChildren(PN, Worklist);
-
SmallPtrSet<Instruction *, 8> Visited;
+ SmallVector<const SCEV *, 8> ToForget;
Visited.insert(PN);
+ Worklist.push_back(PN);
while (!Worklist.empty()) {
Instruction *I = Worklist.pop_back_val();
- if (!Visited.insert(I).second)
- continue;
auto It = ValueExprMap.find_as(static_cast<Value *>(I));
if (It != ValueExprMap.end()) {
@@ -4413,12 +4459,13 @@ void ScalarEvolution::forgetSymbolicName(Instruction *PN, const SCEV *SymName) {
!isa<SCEVUnknown>(Old) ||
(I != PN && Old == SymName)) {
eraseValueFromMap(It->first);
- forgetMemoizedResults(Old);
+ ToForget.push_back(Old);
}
}
- PushDefUseChildren(I, Worklist);
+ PushDefUseChildren(I, Worklist, Visited);
}
+ forgetMemoizedResults(ToForget);
}
namespace {
@@ -6109,7 +6156,7 @@ ScalarEvolution::getRangeRef(const SCEV *S,
// initial value.
if (AddRec->hasNoUnsignedWrap()) {
APInt UnsignedMinValue = getUnsignedRangeMin(AddRec->getStart());
- if (!UnsignedMinValue.isNullValue())
+ if (!UnsignedMinValue.isZero())
ConservativeResult = ConservativeResult.intersectWith(
ConstantRange(UnsignedMinValue, APInt(BitWidth, 0)), RangeType);
}
@@ -6211,9 +6258,9 @@ ScalarEvolution::getRangeRef(const SCEV *S,
if (NS > 1) {
// If we know any of the sign bits, we know all of the sign bits.
- if (!Known.Zero.getHiBits(NS).isNullValue())
+ if (!Known.Zero.getHiBits(NS).isZero())
Known.Zero.setHighBits(NS);
- if (!Known.One.getHiBits(NS).isNullValue())
+ if (!Known.One.getHiBits(NS).isZero())
Known.One.setHighBits(NS);
}
@@ -6549,17 +6596,99 @@ SCEV::NoWrapFlags ScalarEvolution::getNoWrapFlagsFromUB(const Value *V) {
return isSCEVExprNeverPoison(BinOp) ? Flags : SCEV::FlagAnyWrap;
}
-bool ScalarEvolution::isSCEVExprNeverPoison(const Instruction *I) {
- // Here we check that I is in the header of the innermost loop containing I,
- // since we only deal with instructions in the loop header. The actual loop we
- // need to check later will come from an add recurrence, but getting that
- // requires computing the SCEV of the operands, which can be expensive. This
- // check we can do cheaply to rule out some cases early.
- Loop *InnermostContainingLoop = LI.getLoopFor(I->getParent());
- if (InnermostContainingLoop == nullptr ||
- InnermostContainingLoop->getHeader() != I->getParent())
- return false;
+const Instruction *
+ScalarEvolution::getNonTrivialDefiningScopeBound(const SCEV *S) {
+ if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(S))
+ return &*AddRec->getLoop()->getHeader()->begin();
+ if (auto *U = dyn_cast<SCEVUnknown>(S))
+ if (auto *I = dyn_cast<Instruction>(U->getValue()))
+ return I;
+ return nullptr;
+}
+/// Fills \p Ops with unique operands of \p S, if it has operands. If not,
+/// \p Ops remains unmodified.
+static void collectUniqueOps(const SCEV *S,
+ SmallVectorImpl<const SCEV *> &Ops) {
+ SmallPtrSet<const SCEV *, 4> Unique;
+ auto InsertUnique = [&](const SCEV *S) {
+ if (Unique.insert(S).second)
+ Ops.push_back(S);
+ };
+ if (auto *S2 = dyn_cast<SCEVCastExpr>(S))
+ for (auto *Op : S2->operands())
+ InsertUnique(Op);
+ else if (auto *S2 = dyn_cast<SCEVNAryExpr>(S))
+ for (auto *Op : S2->operands())
+ InsertUnique(Op);
+ else if (auto *S2 = dyn_cast<SCEVUDivExpr>(S))
+ for (auto *Op : S2->operands())
+ InsertUnique(Op);
+}
+
+const Instruction *
+ScalarEvolution::getDefiningScopeBound(ArrayRef<const SCEV *> Ops,
+ bool &Precise) {
+ Precise = true;
+ // Do a bounded search of the def relation of the requested SCEVs.
+ SmallSet<const SCEV *, 16> Visited;
+ SmallVector<const SCEV *> Worklist;
+ auto pushOp = [&](const SCEV *S) {
+ if (!Visited.insert(S).second)
+ return;
+ // Threshold of 30 here is arbitrary.
+ if (Visited.size() > 30) {
+ Precise = false;
+ return;
+ }
+ Worklist.push_back(S);
+ };
+
+ for (auto *S : Ops)
+ pushOp(S);
+
+ const Instruction *Bound = nullptr;
+ while (!Worklist.empty()) {
+ auto *S = Worklist.pop_back_val();
+ if (auto *DefI = getNonTrivialDefiningScopeBound(S)) {
+ if (!Bound || DT.dominates(Bound, DefI))
+ Bound = DefI;
+ } else {
+ SmallVector<const SCEV *, 4> Ops;
+ collectUniqueOps(S, Ops);
+ for (auto *Op : Ops)
+ pushOp(Op);
+ }
+ }
+ return Bound ? Bound : &*F.getEntryBlock().begin();
+}
+
+const Instruction *
+ScalarEvolution::getDefiningScopeBound(ArrayRef<const SCEV *> Ops) {
+ bool Discard;
+ return getDefiningScopeBound(Ops, Discard);
+}
+
+bool ScalarEvolution::isGuaranteedToTransferExecutionTo(const Instruction *A,
+ const Instruction *B) {
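+  // Case 1: A and B are in the same block, and every instruction from A up
+  // to B transfers execution to its successor.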
+ if (A->getParent() == B->getParent() &&
+ isGuaranteedToTransferExecutionToSuccessor(A->getIterator(),
+ B->getIterator()))
+ return true;
+
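+  // Case 2: A is in the preheader of the loop whose header contains B. If
+  // execution is guaranteed to reach the end of the preheader from A, and to
+  // reach B from the start of the header, then reaching A guarantees
+  // reaching B.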
+ auto *BLoop = LI.getLoopFor(B->getParent());
+ if (BLoop && BLoop->getHeader() == B->getParent() &&
+ BLoop->getLoopPreheader() == A->getParent() &&
+ isGuaranteedToTransferExecutionToSuccessor(A->getIterator(),
+ A->getParent()->end()) &&
+ isGuaranteedToTransferExecutionToSuccessor(B->getParent()->begin(),
+ B->getIterator()))
+ return true;
+ return false;
+}
+
+bool ScalarEvolution::isSCEVExprNeverPoison(const Instruction *I) {
// Only proceed if we can prove that I does not yield poison.
if (!programUndefinedIfPoison(I))
return false;
@@ -6570,39 +6699,20 @@ bool ScalarEvolution::isSCEVExprNeverPoison(const Instruction *I) {
// instructions can map to the same SCEV. If we apply NSW or NUW from I to
// the SCEV, we must guarantee no wrapping for that SCEV also when it is
// derived from other instructions that map to the same SCEV. We cannot make
- // that guarantee for cases where I is not executed. So we need to find the
- // loop that I is considered in relation to and prove that I is executed for
- // every iteration of that loop. That implies that the value that I
- // calculates does not wrap anywhere in the loop, so then we can apply the
- // flags to the SCEV.
- //
- // We check isLoopInvariant to disambiguate in case we are adding recurrences
- // from different loops, so that we know which loop to prove that I is
- // executed in.
- for (unsigned OpIndex = 0; OpIndex < I->getNumOperands(); ++OpIndex) {
+  // that guarantee for cases where I is not executed. So we need to find an
+  // upper bound on the defining scope for the SCEV, and prove that I is
+ // executed every time we enter that scope. When the bounding scope is a
+ // loop (the common case), this is equivalent to proving I executes on every
+ // iteration of that loop.
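+  //
+  // Illustrative sketch (names are hypothetical): for %add = add nsw %iv,
+  // %inv, where %iv is an AddRec of this loop and %inv is defined outside it,
+  // the defining scope bound is the start of the loop header, so nsw is only
+  // transferred if %add is guaranteed to execute whenever the header is
+  // entered.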
+ SmallVector<const SCEV *> SCEVOps;
+ for (const Use &Op : I->operands()) {
// I could be an extractvalue from a call to an overflow intrinsic.
// TODO: We can do better here in some cases.
- if (!isSCEVable(I->getOperand(OpIndex)->getType()))
- return false;
- const SCEV *Op = getSCEV(I->getOperand(OpIndex));
- if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(Op)) {
- bool AllOtherOpsLoopInvariant = true;
- for (unsigned OtherOpIndex = 0; OtherOpIndex < I->getNumOperands();
- ++OtherOpIndex) {
- if (OtherOpIndex != OpIndex) {
- const SCEV *OtherOp = getSCEV(I->getOperand(OtherOpIndex));
- if (!isLoopInvariant(OtherOp, AddRec->getLoop())) {
- AllOtherOpsLoopInvariant = false;
- break;
- }
- }
- }
- if (AllOtherOpsLoopInvariant &&
- isGuaranteedToExecuteForEveryIteration(I, AddRec->getLoop()))
- return true;
- }
+ if (isSCEVable(Op->getType()))
+ SCEVOps.push_back(getSCEV(Op));
}
- return false;
+ auto *DefI = getDefiningScopeBound(SCEVOps);
+ return isGuaranteedToTransferExecutionTo(DefI, I);
}
bool ScalarEvolution::isAddRecNeverPoison(const Instruction *I, const Loop *L) {
@@ -7144,10 +7254,21 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
// Iteration Count Computation Code
//
-const SCEV *ScalarEvolution::getTripCountFromExitCount(const SCEV *ExitCount) {
- // Get the trip count from the BE count by adding 1. Overflow, results
- // in zero which means "unknown".
- return getAddExpr(ExitCount, getOne(ExitCount->getType()));
+const SCEV *ScalarEvolution::getTripCountFromExitCount(const SCEV *ExitCount,
+ bool Extend) {
+ if (isa<SCEVCouldNotCompute>(ExitCount))
+ return getCouldNotCompute();
+
+ auto *ExitCountType = ExitCount->getType();
+ assert(ExitCountType->isIntegerTy());
+
+ if (!Extend)
+ return getAddExpr(ExitCount, getOne(ExitCountType));
+
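+  // Worked example (illustrative): an i8 exit count of 255 would wrap to 0
+  // when incremented in-type; zero-extending to i9 first yields the correct
+  // trip count of 256.
+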
+ auto *WiderType = Type::getIntNTy(ExitCountType->getContext(),
+ 1 + ExitCountType->getScalarSizeInBits());
+ return getAddExpr(getNoopOrZeroExtend(ExitCount, WiderType),
+ getOne(WiderType));
}
static unsigned getConstantTripCount(const SCEVConstant *ExitCount) {
@@ -7186,6 +7307,131 @@ unsigned ScalarEvolution::getSmallConstantMaxTripCount(const Loop *L) {
return getConstantTripCount(MaxExitCount);
}
+const SCEV *ScalarEvolution::getConstantMaxTripCountFromArray(const Loop *L) {
+  // We can't infer from arrays accessed in irregular loops.
+  // FIXME: It's hard to infer a loop bound from arrays accessed in nested
+  // loops.
+ if (!L->isLoopSimplifyForm() || !L->isInnermost())
+ return getCouldNotCompute();
+
+  // FIXME: To keep the common case simple, we only analyze loops that have
+  // one exiting block and that block must be the latch. This makes it easier
+  // to capture loops whose memory accesses are executed in each iteration.
+ const BasicBlock *LoopLatch = L->getLoopLatch();
+ assert(LoopLatch && "See defination of simplify form loop.");
+ if (L->getExitingBlock() != LoopLatch)
+ return getCouldNotCompute();
+
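+  // Illustrative example (names are hypothetical):
+  //
+  //   char Buf[16];
+  //   for (int I = 0; Cond; ++I)
+  //     Buf[I] = 0;
+  //
+  // The store cannot execute more than 16 times without immediate UB, and one
+  // extra entry into the header is allowed, so a constant max trip count of
+  // 17 can be inferred.
+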
+ const DataLayout &DL = getDataLayout();
+ SmallVector<const SCEV *> InferCountColl;
+ for (auto *BB : L->getBlocks()) {
+    // At this point we know the loop has a single exiting block and is in
+    // simplified form. Only infer from memory operations in BBs that must be
+    // executed on every iteration. As a first step, require that MemAccessBB
+    // dominates the latch, so the maximum execution count of MemAccessBB
+    // matches the latch's maximum execution count.
+    // If MemAccessBB does not dominate the latch, skip.
+ // Entry
+ // │
+ // ┌─────▼─────┐
+ // │Loop Header◄─────┐
+ // └──┬──────┬─┘ │
+ // │ │ │
+ // ┌────────▼──┐ ┌─▼─────┐ │
+ // │MemAccessBB│ │OtherBB│ │
+ // └────────┬──┘ └─┬─────┘ │
+ // │ │ │
+ // ┌─▼──────▼─┐ │
+ // │Loop Latch├─────┘
+ // └────┬─────┘
+ // ▼
+ // Exit
+ if (!DT.dominates(BB, LoopLatch))
+ continue;
+
+ for (Instruction &Inst : *BB) {
+ // Find Memory Operation Instruction.
+ auto *GEP = getLoadStorePointerOperand(&Inst);
+ if (!GEP)
+ continue;
+
+ auto *ElemSize = dyn_cast<SCEVConstant>(getElementSize(&Inst));
+      // Do not infer from a scalar type, e.g. "ElemSize = sizeof()".
+ if (!ElemSize)
+ continue;
+
+      // Use an existing polynomial recurrence on the trip count.
+ auto *AddRec = dyn_cast<SCEVAddRecExpr>(getSCEV(GEP));
+ if (!AddRec)
+ continue;
+ auto *ArrBase = dyn_cast<SCEVUnknown>(getPointerBase(AddRec));
+ auto *Step = dyn_cast<SCEVConstant>(AddRec->getStepRecurrence(*this));
+ if (!ArrBase || !Step)
+ continue;
+ assert(isLoopInvariant(ArrBase, L) && "See addrec definition");
+
+      // Only handle { %array + step }.
+      // FIXME: {(SCEVAddRecExpr) + step } cannot be analyzed here.
+ if (AddRec->getStart() != ArrBase)
+ continue;
+
+      // Skip memory operation patterns that have gaps, repeat a memory
+      // operation, or whose GEP index wraps around.
+ if (Step->getAPInt().getActiveBits() > 32 ||
+ Step->getAPInt().getZExtValue() !=
+ ElemSize->getAPInt().getZExtValue() ||
+ Step->isZero() || Step->getAPInt().isNegative())
+ continue;
+
+      // Only infer from a stack array whose size is known.
+      // Make sure the alloca instruction is not executed inside the loop.
+ AllocaInst *AllocateInst = dyn_cast<AllocaInst>(ArrBase->getValue());
+ if (!AllocateInst || L->contains(AllocateInst->getParent()))
+ continue;
+
+      // Make sure we only handle a normal array.
+ auto *Ty = dyn_cast<ArrayType>(AllocateInst->getAllocatedType());
+ auto *ArrSize = dyn_cast<ConstantInt>(AllocateInst->getArraySize());
+ if (!Ty || !ArrSize || !ArrSize->isOne())
+ continue;
+      // Also make sure the step matches the size of the allocated element
+      // type.
+ const PointerType *GEPT = dyn_cast<PointerType>(GEP->getType());
+ if (Ty->getElementType() != GEPT->getElementType())
+ continue;
+
+      // FIXME: Since GEP indices are silently zext to the indexing type,
+      // we may have a narrow GEP index which wraps around rather than
+      // increasing strictly; we should ensure that the step increases
+      // strictly with each loop iteration.
+ // Now we can infer a max execution time by MemLength/StepLength.
+ const SCEV *MemSize =
+ getConstant(Step->getType(), DL.getTypeAllocSize(Ty));
+ auto *MaxExeCount =
+ dyn_cast<SCEVConstant>(getUDivCeilSCEV(MemSize, Step));
+ if (!MaxExeCount || MaxExeCount->getAPInt().getActiveBits() > 32)
+ continue;
+
+      // If the loop reaches the maximum number of executions, we cannot
+      // access bytes starting outside the statically allocated size without
+      // immediate UB. It is still allowed to enter the loop header one more
+      // time.
+ auto *InferCount = dyn_cast<SCEVConstant>(
+ getAddExpr(MaxExeCount, getOne(MaxExeCount->getType())));
+      // Discard the inferred count if it does not fit in 32 bits.
+ if (!InferCount || InferCount->getAPInt().getActiveBits() > 32)
+ continue;
+
+ InferCountColl.push_back(InferCount);
+ }
+ }
+
+ if (InferCountColl.size() == 0)
+ return getCouldNotCompute();
+
+ return getUMinFromMismatchedTypes(InferCountColl);
+}
+
unsigned ScalarEvolution::getSmallConstantTripMultiple(const Loop *L) {
SmallVector<BasicBlock *, 8> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
@@ -7287,13 +7533,15 @@ bool ScalarEvolution::isBackedgeTakenCountMaxOrZero(const Loop *L) {
}
/// Push PHI nodes in the header of the given loop onto the given Worklist.
-static void
-PushLoopPHIs(const Loop *L, SmallVectorImpl<Instruction *> &Worklist) {
+static void PushLoopPHIs(const Loop *L,
+ SmallVectorImpl<Instruction *> &Worklist,
+ SmallPtrSetImpl<Instruction *> &Visited) {
BasicBlock *Header = L->getHeader();
// Push all Loop-header PHIs onto the Worklist stack.
for (PHINode &PN : Header->phis())
- Worklist.push_back(&PN);
+ if (Visited.insert(&PN).second)
+ Worklist.push_back(&PN);
}
const ScalarEvolution::BackedgeTakenInfo &
@@ -7354,9 +7602,9 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
// it handles SCEVUnknown PHI nodes specially.
if (Result.hasAnyInfo()) {
SmallVector<Instruction *, 16> Worklist;
- PushLoopPHIs(L, Worklist);
-
SmallPtrSet<Instruction *, 8> Discovered;
+ SmallVector<const SCEV *, 8> ToForget;
+ PushLoopPHIs(L, Worklist, Discovered);
while (!Worklist.empty()) {
Instruction *I = Worklist.pop_back_val();
@@ -7373,7 +7621,7 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
// own when it gets to that point.
if (!isa<PHINode>(I) || !isa<SCEVUnknown>(Old)) {
eraseValueFromMap(It->first);
- forgetMemoizedResults(Old);
+ ToForget.push_back(Old);
}
if (PHINode *PN = dyn_cast<PHINode>(I))
ConstantEvolutionLoopExitValue.erase(PN);
@@ -7405,6 +7653,7 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
Worklist.push_back(I);
}
}
+ forgetMemoizedResults(ToForget);
}
// Re-lookup the insert position, since the call to
@@ -7441,6 +7690,7 @@ void ScalarEvolution::forgetLoop(const Loop *L) {
SmallVector<const Loop *, 16> LoopWorklist(1, L);
SmallVector<Instruction *, 32> Worklist;
SmallPtrSet<Instruction *, 16> Visited;
+ SmallVector<const SCEV *, 16> ToForget;
// Iterate over all the loops and sub-loops to drop SCEV information.
while (!LoopWorklist.empty()) {
@@ -7462,29 +7712,27 @@ void ScalarEvolution::forgetLoop(const Loop *L) {
auto LoopUsersItr = LoopUsers.find(CurrL);
if (LoopUsersItr != LoopUsers.end()) {
- for (auto *S : LoopUsersItr->second)
- forgetMemoizedResults(S);
+ ToForget.insert(ToForget.end(), LoopUsersItr->second.begin(),
+ LoopUsersItr->second.end());
LoopUsers.erase(LoopUsersItr);
}
// Drop information about expressions based on loop-header PHIs.
- PushLoopPHIs(CurrL, Worklist);
+ PushLoopPHIs(CurrL, Worklist, Visited);
while (!Worklist.empty()) {
Instruction *I = Worklist.pop_back_val();
- if (!Visited.insert(I).second)
- continue;
ValueExprMapType::iterator It =
ValueExprMap.find_as(static_cast<Value *>(I));
if (It != ValueExprMap.end()) {
eraseValueFromMap(It->first);
- forgetMemoizedResults(It->second);
+ ToForget.push_back(It->second);
if (PHINode *PN = dyn_cast<PHINode>(I))
ConstantEvolutionLoopExitValue.erase(PN);
}
- PushDefUseChildren(I, Worklist);
+ PushDefUseChildren(I, Worklist, Visited);
}
LoopPropertiesCache.erase(CurrL);
@@ -7492,6 +7740,7 @@ void ScalarEvolution::forgetLoop(const Loop *L) {
// ValuesAtScopes map.
LoopWorklist.append(CurrL->begin(), CurrL->end());
}
+ forgetMemoizedResults(ToForget);
}
void ScalarEvolution::forgetTopmostLoop(const Loop *L) {
@@ -7506,25 +7755,25 @@ void ScalarEvolution::forgetValue(Value *V) {
// Drop information about expressions based on loop-header PHIs.
SmallVector<Instruction *, 16> Worklist;
+ SmallPtrSet<Instruction *, 8> Visited;
+ SmallVector<const SCEV *, 8> ToForget;
Worklist.push_back(I);
+ Visited.insert(I);
- SmallPtrSet<Instruction *, 8> Visited;
while (!Worklist.empty()) {
I = Worklist.pop_back_val();
- if (!Visited.insert(I).second)
- continue;
-
ValueExprMapType::iterator It =
ValueExprMap.find_as(static_cast<Value *>(I));
if (It != ValueExprMap.end()) {
eraseValueFromMap(It->first);
- forgetMemoizedResults(It->second);
+ ToForget.push_back(It->second);
if (PHINode *PN = dyn_cast<PHINode>(I))
ConstantEvolutionLoopExitValue.erase(PN);
}
- PushDefUseChildren(I, Worklist);
+ PushDefUseChildren(I, Worklist, Visited);
}
+ forgetMemoizedResults(ToForget);
}
void ScalarEvolution::forgetLoopDispositions(const Loop *L) {
@@ -7598,7 +7847,7 @@ ScalarEvolution::BackedgeTakenInfo::getConstantMax(ScalarEvolution *SE) const {
return !ENT.hasAlwaysTruePredicate();
};
- if (any_of(ExitNotTaken, PredicateNotAlwaysTrue) || !getConstantMax())
+ if (!getConstantMax() || any_of(ExitNotTaken, PredicateNotAlwaysTrue))
return SE->getCouldNotCompute();
assert((isa<SCEVCouldNotCompute>(getConstantMax()) ||
@@ -7635,6 +7884,12 @@ ScalarEvolution::ExitLimit::ExitLimit(
const SCEV *E, const SCEV *M, bool MaxOrZero,
ArrayRef<const SmallPtrSetImpl<const SCEVPredicate *> *> PredSetList)
: ExactNotTaken(E), MaxNotTaken(M), MaxOrZero(MaxOrZero) {
+ // If we prove the max count is zero, so is the symbolic bound. This happens
+ // in practice due to differences in a) how context sensitive we've chosen
+  // to be and b) how we reason about bounds implied by UB.
+ if (MaxNotTaken->isZero())
+ ExactNotTaken = MaxNotTaken;
+
assert((isa<SCEVCouldNotCompute>(ExactNotTaken) ||
!isa<SCEVCouldNotCompute>(MaxNotTaken)) &&
"Exact is not allowed to be less precise than Max");
@@ -7740,7 +7995,7 @@ ScalarEvolution::computeBackedgeTakenCount(const Loop *L,
if (auto *BI = dyn_cast<BranchInst>(ExitBB->getTerminator()))
if (auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) {
bool ExitIfTrue = !L->contains(BI->getSuccessor(0));
- if ((ExitIfTrue && CI->isZero()) || (!ExitIfTrue && CI->isOne()))
+ if (ExitIfTrue == CI->isZero())
continue;
}
@@ -8030,15 +8285,6 @@ ScalarEvolution::computeExitLimitFromICmp(const Loop *L,
Pred = ExitCond->getInversePredicate();
const ICmpInst::Predicate OriginalPred = Pred;
- // Handle common loops like: for (X = "string"; *X; ++X)
- if (LoadInst *LI = dyn_cast<LoadInst>(ExitCond->getOperand(0)))
- if (Constant *RHS = dyn_cast<Constant>(ExitCond->getOperand(1))) {
- ExitLimit ItCnt =
- computeLoadConstantCompareExitLimit(LI, RHS, L, Pred);
- if (ItCnt.hasAnyInfo())
- return ItCnt;
- }
-
const SCEV *LHS = getSCEV(ExitCond->getOperand(0));
const SCEV *RHS = getSCEV(ExitCond->getOperand(1));
@@ -8070,6 +8316,32 @@ ScalarEvolution::computeExitLimitFromICmp(const Loop *L,
if (!isa<SCEVCouldNotCompute>(Ret)) return Ret;
}
+ // If this loop must exit based on this condition (or execute undefined
+ // behaviour), and we can prove the test sequence produced must repeat
+ // the same values on self-wrap of the IV, then we can infer that IV
+ // doesn't self wrap because if it did, we'd have an infinite (undefined)
+ // loop.
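+  //
+  // For instance (illustrative): a loop
+  //   for (i8 iv = start; iv != n; iv += 8)
+  // with loop-invariant n visits at most 32 distinct values; a self-wrapped
+  // iv would revisit exactly those values, none of which took the exit, so
+  // the loop would be infinite, contradicting the finiteness assumption.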
+ if (ControlsExit && isLoopInvariant(RHS, L) && loopHasNoAbnormalExits(L) &&
+ loopIsFiniteByAssumption(L)) {
+
+ // TODO: We can peel off any functions which are invertible *in L*. Loop
+ // invariant terms are effectively constants for our purposes here.
+ auto *InnerLHS = LHS;
+ if (auto *ZExt = dyn_cast<SCEVZeroExtendExpr>(LHS))
+ InnerLHS = ZExt->getOperand();
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(InnerLHS)) {
+ auto *StrideC = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*this));
+ if (!AR->hasNoSelfWrap() && AR->getLoop() == L && AR->isAffine() &&
+ StrideC && StrideC->getAPInt().isPowerOf2()) {
+ auto Flags = AR->getNoWrapFlags();
+ Flags = setFlags(Flags, SCEV::FlagNW);
+ SmallVector<const SCEV*> Operands{AR->operands()};
+ Flags = StrengthenNoWrapFlags(this, scAddRecExpr, Operands, Flags);
+ setNoWrapFlags(const_cast<SCEVAddRecExpr *>(AR), Flags);
+ }
+ }
+ }
+
switch (Pred) {
case ICmpInst::ICMP_NE: { // while (X != Y)
// Convert to: while (X-Y != 0)
@@ -8169,85 +8441,6 @@ EvaluateConstantChrecAtConstant(const SCEVAddRecExpr *AddRec, ConstantInt *C,
return cast<SCEVConstant>(Val)->getValue();
}
-/// Given an exit condition of 'icmp op load X, cst', try to see if we can
-/// compute the backedge execution count.
-ScalarEvolution::ExitLimit
-ScalarEvolution::computeLoadConstantCompareExitLimit(
- LoadInst *LI,
- Constant *RHS,
- const Loop *L,
- ICmpInst::Predicate predicate) {
- if (LI->isVolatile()) return getCouldNotCompute();
-
- // Check to see if the loaded pointer is a getelementptr of a global.
- // TODO: Use SCEV instead of manually grubbing with GEPs.
- GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0));
- if (!GEP) return getCouldNotCompute();
-
- // Make sure that it is really a constant global we are gepping, with an
- // initializer, and make sure the first IDX is really 0.
- GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0));
- if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer() ||
- GEP->getNumOperands() < 3 || !isa<Constant>(GEP->getOperand(1)) ||
- !cast<Constant>(GEP->getOperand(1))->isNullValue())
- return getCouldNotCompute();
-
- // Okay, we allow one non-constant index into the GEP instruction.
- Value *VarIdx = nullptr;
- std::vector<Constant*> Indexes;
- unsigned VarIdxNum = 0;
- for (unsigned i = 2, e = GEP->getNumOperands(); i != e; ++i)
- if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i))) {
- Indexes.push_back(CI);
- } else if (!isa<ConstantInt>(GEP->getOperand(i))) {
- if (VarIdx) return getCouldNotCompute(); // Multiple non-constant idx's.
- VarIdx = GEP->getOperand(i);
- VarIdxNum = i-2;
- Indexes.push_back(nullptr);
- }
-
- // Loop-invariant loads may be a byproduct of loop optimization. Skip them.
- if (!VarIdx)
- return getCouldNotCompute();
-
- // Okay, we know we have a (load (gep GV, 0, X)) comparison with a constant.
- // Check to see if X is a loop variant variable value now.
- const SCEV *Idx = getSCEV(VarIdx);
- Idx = getSCEVAtScope(Idx, L);
-
- // We can only recognize very limited forms of loop index expressions, in
- // particular, only affine AddRec's like {C1,+,C2}<L>.
- const SCEVAddRecExpr *IdxExpr = dyn_cast<SCEVAddRecExpr>(Idx);
- if (!IdxExpr || IdxExpr->getLoop() != L || !IdxExpr->isAffine() ||
- isLoopInvariant(IdxExpr, L) ||
- !isa<SCEVConstant>(IdxExpr->getOperand(0)) ||
- !isa<SCEVConstant>(IdxExpr->getOperand(1)))
- return getCouldNotCompute();
-
- unsigned MaxSteps = MaxBruteForceIterations;
- for (unsigned IterationNum = 0; IterationNum != MaxSteps; ++IterationNum) {
- ConstantInt *ItCst = ConstantInt::get(
- cast<IntegerType>(IdxExpr->getType()), IterationNum);
- ConstantInt *Val = EvaluateConstantChrecAtConstant(IdxExpr, ItCst, *this);
-
- // Form the GEP offset.
- Indexes[VarIdxNum] = Val;
-
- Constant *Result = ConstantFoldLoadThroughGEPIndices(GV->getInitializer(),
- Indexes);
- if (!Result) break; // Cannot compute!
-
- // Evaluate the condition for this iteration.
- Result = ConstantExpr::getICmp(predicate, Result, RHS);
- if (!isa<ConstantInt>(Result)) break; // Couldn't decide for sure
- if (cast<ConstantInt>(Result)->getValue().isMinValue()) {
- ++NumArrayLenItCounts;
- return getConstant(ItCst); // Found terminating iteration!
- }
- }
- return getCouldNotCompute();
-}
-
ScalarEvolution::ExitLimit ScalarEvolution::computeShiftCompareExitLimit(
Value *LHS, Value *RHSV, const Loop *L, ICmpInst::Predicate Pred) {
ConstantInt *RHS = dyn_cast<ConstantInt>(RHSV);
@@ -9160,7 +9353,7 @@ GetQuadraticEquation(const SCEVAddRecExpr *AddRec) {
APInt L = LC->getAPInt();
APInt M = MC->getAPInt();
APInt N = NC->getAPInt();
- assert(!N.isNullValue() && "This is not a quadratic addrec");
+ assert(!N.isZero() && "This is not a quadratic addrec");
unsigned BitWidth = LC->getAPInt().getBitWidth();
unsigned NewWidth = BitWidth + 1;
@@ -9486,9 +9679,7 @@ ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit,
// N = Distance (as unsigned)
if (StepC->getValue()->isOne() || StepC->getValue()->isMinusOne()) {
APInt MaxBECount = getUnsignedRangeMax(applyLoopGuards(Distance, L));
- APInt MaxBECountBase = getUnsignedRangeMax(Distance);
- if (MaxBECountBase.ult(MaxBECount))
- MaxBECount = MaxBECountBase;
+ MaxBECount = APIntOps::umin(MaxBECount, getUnsignedRangeMax(Distance));
// When a loop like "for (int i = 0; i != n; ++i) { /* body */ }" is rotated,
// we end up with a loop whose backedge-taken count is n - 1. Detect this
@@ -9521,11 +9712,7 @@ ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit,
const SCEV *Max = getCouldNotCompute();
if (Exact != getCouldNotCompute()) {
APInt MaxInt = getUnsignedRangeMax(applyLoopGuards(Exact, L));
- APInt BaseMaxInt = getUnsignedRangeMax(Exact);
- if (BaseMaxInt.ult(MaxInt))
- Max = getConstant(BaseMaxInt);
- else
- Max = getConstant(MaxInt);
+ Max = getConstant(APIntOps::umin(MaxInt, getUnsignedRangeMax(Exact)));
}
return ExitLimit(Exact, Max, false, Predicates);
}
@@ -9533,9 +9720,12 @@ ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit,
// Solve the general equation.
const SCEV *E = SolveLinEquationWithOverflow(StepC->getAPInt(),
getNegativeSCEV(Start), *this);
- const SCEV *M = E == getCouldNotCompute()
- ? E
- : getConstant(getUnsignedRangeMax(E));
+
+ const SCEV *M = E;
+ if (E != getCouldNotCompute()) {
+ APInt MaxWithGuards = getUnsignedRangeMax(applyLoopGuards(E, L));
+ M = getConstant(APIntOps::umin(MaxWithGuards, getUnsignedRangeMax(E)));
+ }
return ExitLimit(E, M, false, Predicates);
}
@@ -9911,23 +10101,23 @@ Optional<bool> ScalarEvolution::evaluatePredicate(ICmpInst::Predicate Pred,
bool ScalarEvolution::isKnownPredicateAt(ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS,
- const Instruction *Context) {
+ const Instruction *CtxI) {
// TODO: Analyze guards and assumes from Context's block.
return isKnownPredicate(Pred, LHS, RHS) ||
- isBasicBlockEntryGuardedByCond(Context->getParent(), Pred, LHS, RHS);
+ isBasicBlockEntryGuardedByCond(CtxI->getParent(), Pred, LHS, RHS);
}
-Optional<bool>
-ScalarEvolution::evaluatePredicateAt(ICmpInst::Predicate Pred, const SCEV *LHS,
- const SCEV *RHS,
- const Instruction *Context) {
+Optional<bool> ScalarEvolution::evaluatePredicateAt(ICmpInst::Predicate Pred,
+ const SCEV *LHS,
+ const SCEV *RHS,
+ const Instruction *CtxI) {
Optional<bool> KnownWithoutContext = evaluatePredicate(Pred, LHS, RHS);
if (KnownWithoutContext)
return KnownWithoutContext;
- if (isBasicBlockEntryGuardedByCond(Context->getParent(), Pred, LHS, RHS))
+ if (isBasicBlockEntryGuardedByCond(CtxI->getParent(), Pred, LHS, RHS))
return true;
- else if (isBasicBlockEntryGuardedByCond(Context->getParent(),
+ else if (isBasicBlockEntryGuardedByCond(CtxI->getParent(),
ICmpInst::getInversePredicate(Pred),
LHS, RHS))
return false;
@@ -10057,7 +10247,7 @@ ScalarEvolution::getLoopInvariantPredicate(ICmpInst::Predicate Pred,
Optional<ScalarEvolution::LoopInvariantPredicate>
ScalarEvolution::getLoopInvariantExitCondDuringFirstIterations(
ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const Loop *L,
- const Instruction *Context, const SCEV *MaxIter) {
+ const Instruction *CtxI, const SCEV *MaxIter) {
// Try to prove the following set of facts:
// - The predicate is monotonic in the iteration space.
// - If the check does not fail on the 1st iteration:
@@ -10111,7 +10301,7 @@ ScalarEvolution::getLoopInvariantExitCondDuringFirstIterations(
if (Step == MinusOne)
NoOverflowPred = CmpInst::getSwappedPredicate(NoOverflowPred);
const SCEV *Start = AR->getStart();
- if (!isKnownPredicateAt(NoOverflowPred, Start, Last, Context))
+ if (!isKnownPredicateAt(NoOverflowPred, Start, Last, CtxI))
return None;
// Everything is fine.
@@ -10448,12 +10638,12 @@ bool ScalarEvolution::isBasicBlockEntryGuardedByCond(const BasicBlock *BB,
// Try to prove (Pred, LHS, RHS) using isImpliedCond.
auto ProveViaCond = [&](const Value *Condition, bool Inverse) {
- const Instruction *Context = &BB->front();
- if (isImpliedCond(Pred, LHS, RHS, Condition, Inverse, Context))
+ const Instruction *CtxI = &BB->front();
+ if (isImpliedCond(Pred, LHS, RHS, Condition, Inverse, CtxI))
return true;
if (ProvingStrictComparison) {
auto ProofFn = [&](ICmpInst::Predicate P) {
- return isImpliedCond(P, LHS, RHS, Condition, Inverse, Context);
+ return isImpliedCond(P, LHS, RHS, Condition, Inverse, CtxI);
};
if (SplitAndProve(ProofFn))
return true;
@@ -10525,7 +10715,7 @@ bool ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L,
bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS,
const SCEV *RHS,
const Value *FoundCondValue, bool Inverse,
- const Instruction *Context) {
+ const Instruction *CtxI) {
// False conditions implies anything. Do not bother analyzing it further.
if (FoundCondValue ==
ConstantInt::getBool(FoundCondValue->getContext(), Inverse))
@@ -10541,12 +10731,12 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS,
const Value *Op0, *Op1;
if (match(FoundCondValue, m_LogicalAnd(m_Value(Op0), m_Value(Op1)))) {
if (!Inverse)
- return isImpliedCond(Pred, LHS, RHS, Op0, Inverse, Context) ||
- isImpliedCond(Pred, LHS, RHS, Op1, Inverse, Context);
+ return isImpliedCond(Pred, LHS, RHS, Op0, Inverse, CtxI) ||
+ isImpliedCond(Pred, LHS, RHS, Op1, Inverse, CtxI);
} else if (match(FoundCondValue, m_LogicalOr(m_Value(Op0), m_Value(Op1)))) {
if (Inverse)
- return isImpliedCond(Pred, LHS, RHS, Op0, Inverse, Context) ||
- isImpliedCond(Pred, LHS, RHS, Op1, Inverse, Context);
+ return isImpliedCond(Pred, LHS, RHS, Op0, Inverse, CtxI) ||
+ isImpliedCond(Pred, LHS, RHS, Op1, Inverse, CtxI);
}
const ICmpInst *ICI = dyn_cast<ICmpInst>(FoundCondValue);
@@ -10563,14 +10753,14 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS,
const SCEV *FoundLHS = getSCEV(ICI->getOperand(0));
const SCEV *FoundRHS = getSCEV(ICI->getOperand(1));
- return isImpliedCond(Pred, LHS, RHS, FoundPred, FoundLHS, FoundRHS, Context);
+ return isImpliedCond(Pred, LHS, RHS, FoundPred, FoundLHS, FoundRHS, CtxI);
}
bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS,
const SCEV *RHS,
ICmpInst::Predicate FoundPred,
const SCEV *FoundLHS, const SCEV *FoundRHS,
- const Instruction *Context) {
+ const Instruction *CtxI) {
// Balance the types.
if (getTypeSizeInBits(LHS->getType()) <
getTypeSizeInBits(FoundLHS->getType())) {
@@ -10583,12 +10773,14 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS,
auto BitWidth = getTypeSizeInBits(NarrowType);
const SCEV *MaxValue = getZeroExtendExpr(
getConstant(APInt::getMaxValue(BitWidth)), WideType);
- if (isKnownPredicate(ICmpInst::ICMP_ULE, FoundLHS, MaxValue) &&
- isKnownPredicate(ICmpInst::ICMP_ULE, FoundRHS, MaxValue)) {
+ if (isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_ULE, FoundLHS,
+ MaxValue) &&
+ isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_ULE, FoundRHS,
+ MaxValue)) {
const SCEV *TruncFoundLHS = getTruncateExpr(FoundLHS, NarrowType);
const SCEV *TruncFoundRHS = getTruncateExpr(FoundRHS, NarrowType);
if (isImpliedCondBalancedTypes(Pred, LHS, RHS, FoundPred, TruncFoundLHS,
- TruncFoundRHS, Context))
+ TruncFoundRHS, CtxI))
return true;
}
}
@@ -10615,13 +10807,13 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS,
}
}
return isImpliedCondBalancedTypes(Pred, LHS, RHS, FoundPred, FoundLHS,
- FoundRHS, Context);
+ FoundRHS, CtxI);
}
bool ScalarEvolution::isImpliedCondBalancedTypes(
ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS,
ICmpInst::Predicate FoundPred, const SCEV *FoundLHS, const SCEV *FoundRHS,
- const Instruction *Context) {
+ const Instruction *CtxI) {
assert(getTypeSizeInBits(LHS->getType()) ==
getTypeSizeInBits(FoundLHS->getType()) &&
"Types should be balanced!");
@@ -10647,7 +10839,7 @@ bool ScalarEvolution::isImpliedCondBalancedTypes(
// Check whether the found predicate is the same as the desired predicate.
if (FoundPred == Pred)
- return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS, Context);
+ return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS, CtxI);
// Check whether swapping the found predicate makes it the same as the
// desired predicate.
@@ -10663,27 +10855,70 @@ bool ScalarEvolution::isImpliedCondBalancedTypes(
// do this if it would break canonical constant/addrec ordering.
if (!isa<SCEVConstant>(RHS) && !isa<SCEVAddRecExpr>(LHS))
return isImpliedCondOperands(FoundPred, RHS, LHS, FoundLHS, FoundRHS,
- Context);
+ CtxI);
if (!isa<SCEVConstant>(FoundRHS) && !isa<SCEVAddRecExpr>(FoundLHS))
- return isImpliedCondOperands(Pred, LHS, RHS, FoundRHS, FoundLHS, Context);
+ return isImpliedCondOperands(Pred, LHS, RHS, FoundRHS, FoundLHS, CtxI);
+
+  // There's no clear preference between forms 3. and 4.; try both. Avoid
+ // forming getNotSCEV of pointer values as the resulting subtract is
+ // not legal.
+ if (!LHS->getType()->isPointerTy() && !RHS->getType()->isPointerTy() &&
+ isImpliedCondOperands(FoundPred, getNotSCEV(LHS), getNotSCEV(RHS),
+ FoundLHS, FoundRHS, CtxI))
+ return true;
- // Don't try to getNotSCEV pointers.
- if (LHS->getType()->isPointerTy() || FoundLHS->getType()->isPointerTy())
- return false;
+ if (!FoundLHS->getType()->isPointerTy() &&
+ !FoundRHS->getType()->isPointerTy() &&
+ isImpliedCondOperands(Pred, LHS, RHS, getNotSCEV(FoundLHS),
+ getNotSCEV(FoundRHS), CtxI))
+ return true;
- // There's no clear preference between forms 3. and 4., try both.
- return isImpliedCondOperands(FoundPred, getNotSCEV(LHS), getNotSCEV(RHS),
- FoundLHS, FoundRHS, Context) ||
- isImpliedCondOperands(Pred, LHS, RHS, getNotSCEV(FoundLHS),
- getNotSCEV(FoundRHS), Context);
+ return false;
}
- // Unsigned comparison is the same as signed comparison when both the operands
- // are non-negative.
- if (CmpInst::isUnsigned(FoundPred) &&
- CmpInst::getSignedPredicate(FoundPred) == Pred &&
- isKnownNonNegative(FoundLHS) && isKnownNonNegative(FoundRHS))
- return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS, Context);
+ auto IsSignFlippedPredicate = [](CmpInst::Predicate P1,
+ CmpInst::Predicate P2) {
+ assert(P1 != P2 && "Handled earlier!");
+ return CmpInst::isRelational(P2) &&
+ P1 == CmpInst::getFlippedSignednessPredicate(P2);
+ };
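+  // Illustrative: for i8 values, 3 <u 200 holds but 3 <s 200 does not (200 is
+  // -56 as a signed i8), so flipping signedness is only sound under the sign
+  // constraints checked below.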
+ if (IsSignFlippedPredicate(Pred, FoundPred)) {
+ // Unsigned comparison is the same as signed comparison when both the
+ // operands are non-negative or negative.
+ if ((isKnownNonNegative(FoundLHS) && isKnownNonNegative(FoundRHS)) ||
+ (isKnownNegative(FoundLHS) && isKnownNegative(FoundRHS)))
+ return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS, CtxI);
+ // Create local copies that we can freely swap and canonicalize our
+ // conditions to "le/lt".
+ ICmpInst::Predicate CanonicalPred = Pred, CanonicalFoundPred = FoundPred;
+ const SCEV *CanonicalLHS = LHS, *CanonicalRHS = RHS,
+ *CanonicalFoundLHS = FoundLHS, *CanonicalFoundRHS = FoundRHS;
+ if (ICmpInst::isGT(CanonicalPred) || ICmpInst::isGE(CanonicalPred)) {
+ CanonicalPred = ICmpInst::getSwappedPredicate(CanonicalPred);
+ CanonicalFoundPred = ICmpInst::getSwappedPredicate(CanonicalFoundPred);
+ std::swap(CanonicalLHS, CanonicalRHS);
+ std::swap(CanonicalFoundLHS, CanonicalFoundRHS);
+ }
+ assert((ICmpInst::isLT(CanonicalPred) || ICmpInst::isLE(CanonicalPred)) &&
+ "Must be!");
+ assert((ICmpInst::isLT(CanonicalFoundPred) ||
+ ICmpInst::isLE(CanonicalFoundPred)) &&
+ "Must be!");
+ if (ICmpInst::isSigned(CanonicalPred) && isKnownNonNegative(CanonicalRHS))
+ // Use implication:
+ // x <u y && y >=s 0 --> x <s y.
+ // If we can prove the left part, the right part is also proven.
+ return isImpliedCondOperands(CanonicalFoundPred, CanonicalLHS,
+ CanonicalRHS, CanonicalFoundLHS,
+ CanonicalFoundRHS);
+ if (ICmpInst::isUnsigned(CanonicalPred) && isKnownNegative(CanonicalRHS))
+ // Use implication:
+ // x <s y && y <s 0 --> x <u y.
+ // If we can prove the left part, the right part is also proven.
+ return isImpliedCondOperands(CanonicalFoundPred, CanonicalLHS,
+ CanonicalRHS, CanonicalFoundLHS,
+ CanonicalFoundRHS);
+ }
// Check if we can make progress by sharpening ranges.
if (FoundPred == ICmpInst::ICMP_NE &&
@@ -10721,7 +10956,7 @@ bool ScalarEvolution::isImpliedCondBalancedTypes(
// We know V `Pred` SharperMin. If this implies LHS `Pred`
// RHS, we're done.
if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(SharperMin),
- Context))
+ CtxI))
return true;
LLVM_FALLTHROUGH;
@@ -10736,8 +10971,7 @@ bool ScalarEvolution::isImpliedCondBalancedTypes(
//
// If V `Pred` Min implies LHS `Pred` RHS, we're done.
- if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(Min),
- Context))
+ if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(Min), CtxI))
return true;
break;
@@ -10745,14 +10979,14 @@ bool ScalarEvolution::isImpliedCondBalancedTypes(
case ICmpInst::ICMP_SLE:
case ICmpInst::ICMP_ULE:
if (isImpliedCondOperands(CmpInst::getSwappedPredicate(Pred), RHS,
- LHS, V, getConstant(SharperMin), Context))
+ LHS, V, getConstant(SharperMin), CtxI))
return true;
LLVM_FALLTHROUGH;
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_ULT:
if (isImpliedCondOperands(CmpInst::getSwappedPredicate(Pred), RHS,
- LHS, V, getConstant(Min), Context))
+ LHS, V, getConstant(Min), CtxI))
return true;
break;
@@ -10766,12 +11000,11 @@ bool ScalarEvolution::isImpliedCondBalancedTypes(
// Check whether the actual condition is beyond sufficient.
if (FoundPred == ICmpInst::ICMP_EQ)
if (ICmpInst::isTrueWhenEqual(Pred))
- if (isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS, Context))
+ if (isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS, CtxI))
return true;
if (Pred == ICmpInst::ICMP_NE)
if (!ICmpInst::isTrueWhenEqual(FoundPred))
- if (isImpliedCondOperands(FoundPred, LHS, RHS, FoundLHS, FoundRHS,
- Context))
+ if (isImpliedCondOperands(FoundPred, LHS, RHS, FoundLHS, FoundRHS, CtxI))
return true;
// Otherwise assume the worst.
@@ -10852,7 +11085,7 @@ Optional<APInt> ScalarEvolution::computeConstantDifference(const SCEV *More,
bool ScalarEvolution::isImpliedCondOperandsViaAddRecStart(
ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS,
- const SCEV *FoundLHS, const SCEV *FoundRHS, const Instruction *Context) {
+ const SCEV *FoundLHS, const SCEV *FoundRHS, const Instruction *CtxI) {
// Try to recognize the following pattern:
//
// FoundRHS = ...
@@ -10866,9 +11099,9 @@ bool ScalarEvolution::isImpliedCondOperandsViaAddRecStart(
// each iteration of this loop, including the first iteration. Therefore, in
// this case, `FoundLHS Pred FoundRHS` implies `Start Pred FoundRHS`. Try to
// prove the original pred using this fact.
- if (!Context)
+ if (!CtxI)
return false;
- const BasicBlock *ContextBB = Context->getParent();
+ const BasicBlock *ContextBB = CtxI->getParent();
// Make sure AR varies in the context block.
if (auto *AR = dyn_cast<SCEVAddRecExpr>(FoundLHS)) {
const Loop *L = AR->getLoop();
@@ -11090,7 +11323,7 @@ bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS,
const SCEV *FoundLHS,
const SCEV *FoundRHS,
- const Instruction *Context) {
+ const Instruction *CtxI) {
if (isImpliedCondOperandsViaRanges(Pred, LHS, RHS, FoundLHS, FoundRHS))
return true;
@@ -11098,7 +11331,7 @@ bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred,
return true;
if (isImpliedCondOperandsViaAddRecStart(Pred, LHS, RHS, FoundLHS, FoundRHS,
- Context))
+ CtxI))
return true;
return isImpliedCondOperandsHelper(Pred, LHS, RHS,
@@ -11534,6 +11767,12 @@ const SCEV *ScalarEvolution::computeMaxBECountForLT(const SCEV *Start,
if (IsSigned && BitWidth == 1)
return getZero(Stride->getType());
+ // This code has only been closely audited for negative strides in the
+  // unsigned comparison case; it may be correct for signed comparison, but
+ // that needs to be established.
+ assert((!IsSigned || !isKnownNonPositive(Stride)) &&
+ "Stride is expected strictly positive for signed case!");
+
// Calculate the maximum backedge count based on the range of values
// permitted by Start, End, and Stride.
APInt MinStart =
@@ -11576,6 +11815,80 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
const SCEVAddRecExpr *IV = dyn_cast<SCEVAddRecExpr>(LHS);
bool PredicatedIV = false;
+ auto canAssumeNoSelfWrap = [&](const SCEVAddRecExpr *AR) {
+ // Can we prove this loop *must* be UB if overflow of IV occurs?
+ // Reasoning goes as follows:
+ // * Suppose the IV did self wrap.
+ // * If Stride evenly divides the iteration space, then once wrap
+ // occurs, the loop must revisit the same values.
+ // * We know that RHS is invariant, and that none of those values
+ // caused this exit to be taken previously. Thus, this exit is
+ // dynamically dead.
+ // * If this is the sole exit, then a dead exit implies the loop
+ // must be infinite if there are no abnormal exits.
+ // * If the loop were infinite, then it must either not be mustprogress
+ // or have side effects. Otherwise, it must be UB.
+ // * It can't (by assumption), be UB so we have contradicted our
+ // premise and can conclude the IV did not in fact self-wrap.
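+    //
+    // Illustrative shape: an i8 IV with constant stride 4 cycles through at
+    // most 64 values; once it self-wraps it can only revisit values that
+    // already failed to take this exit, so the exit would never be taken.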
+ if (!isLoopInvariant(RHS, L))
+ return false;
+
+ auto *StrideC = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*this));
+ if (!StrideC || !StrideC->getAPInt().isPowerOf2())
+ return false;
+
+ if (!ControlsExit || !loopHasNoAbnormalExits(L))
+ return false;
+
+ return loopIsFiniteByAssumption(L);
+ };
+
+ if (!IV) {
+ if (auto *ZExt = dyn_cast<SCEVZeroExtendExpr>(LHS)) {
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(ZExt->getOperand());
+ if (AR && AR->getLoop() == L && AR->isAffine()) {
+ auto canProveNUW = [&]() {
+ if (!isLoopInvariant(RHS, L))
+ return false;
+
+ if (!isKnownNonZero(AR->getStepRecurrence(*this)))
+ // We need the sequence defined by AR to strictly increase in the
+ // unsigned integer domain for the logic below to hold.
+ return false;
+
+ const unsigned InnerBitWidth = getTypeSizeInBits(AR->getType());
+ const unsigned OuterBitWidth = getTypeSizeInBits(RHS->getType());
+ // If RHS <=u Limit, then there must exist a value V in the sequence
+ // defined by AR (e.g. {Start,+,Step}) such that V >u RHS, and
+ // V <=u UINT_MAX. Thus, we must exit the loop before unsigned
+ // overflow occurs. This limit also implies that a signed comparison
+ // (in the wide bitwidth) is equivalent to an unsigned comparison as
+ // the high bits on both sides must be zero.
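+            // Worked numbers (illustrative): for an i8 AddRec with a maximum
+            // stride of 2, Limit is 255 - (2 - 1) = 254; if the wide RHS is
+            // known to be <=u 254, the loop must exit before the narrow IV
+            // can wrap past 255.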
+ APInt StrideMax = getUnsignedRangeMax(AR->getStepRecurrence(*this));
+ APInt Limit = APInt::getMaxValue(InnerBitWidth) - (StrideMax - 1);
+ Limit = Limit.zext(OuterBitWidth);
+ return getUnsignedRangeMax(applyLoopGuards(RHS, L)).ule(Limit);
+ };
+ auto Flags = AR->getNoWrapFlags();
+ if (!hasFlags(Flags, SCEV::FlagNUW) && canProveNUW())
+ Flags = setFlags(Flags, SCEV::FlagNUW);
+
+ setNoWrapFlags(const_cast<SCEVAddRecExpr *>(AR), Flags);
+ if (AR->hasNoUnsignedWrap()) {
+ // Emulate what getZeroExtendExpr would have done during construction
+ // if we'd been able to infer the fact just above at that time.
+ const SCEV *Step = AR->getStepRecurrence(*this);
+ Type *Ty = ZExt->getType();
+ auto *S = getAddRecExpr(
+ getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, 0),
+ getZeroExtendExpr(Step, Ty, 0), L, AR->getNoWrapFlags());
+ IV = dyn_cast<SCEVAddRecExpr>(S);
+ }
+ }
+ }
+ }
+
if (!IV && AllowPredicates) {
// Try to make this an AddRec using runtime tests, in the first X
// iterations of this loop, where X is the SCEV expression found by the
@@ -11626,32 +11939,29 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
//
// a) IV is either nuw or nsw depending upon signedness (indicated by the
// NoWrap flag).
- // b) loop is single exit with no side effects.
- //
+ // b) the loop is guaranteed to be finite (e.g. is mustprogress and has
+ // no side effects within the loop)
+  //  c) the loop has a single static exit (with no abnormal exits)
//
// Precondition a) implies that if the stride is negative, this is a single
// trip loop. The backedge taken count formula reduces to zero in this case.
//
- // Precondition b) implies that if rhs is invariant in L, then unknown
- // stride being zero means the backedge can't be taken without UB.
+  // Preconditions b) and c) combine to imply that if rhs is invariant in L,
+ // then a zero stride means the backedge can't be taken without executing
+ // undefined behavior.
//
// The positive stride case is the same as isKnownPositive(Stride) returning
// true (original behavior of the function).
//
- // We want to make sure that the stride is truly unknown as there are edge
- // cases where ScalarEvolution propagates no wrap flags to the
- // post-increment/decrement IV even though the increment/decrement operation
- // itself is wrapping. The computed backedge taken count may be wrong in
- // such cases. This is prevented by checking that the stride is not known to
- // be either positive or non-positive. For example, no wrap flags are
- // propagated to the post-increment IV of this loop with a trip count of 2 -
- //
- // unsigned char i;
- // for(i=127; i<128; i+=129)
- // A[i] = i;
- //
- if (PredicatedIV || !NoWrap || isKnownNonPositive(Stride) ||
- !loopIsFiniteByAssumption(L))
+ if (PredicatedIV || !NoWrap || !loopIsFiniteByAssumption(L) ||
+ !loopHasNoAbnormalExits(L))
+ return getCouldNotCompute();
+
+ // This bailout is protecting the logic in computeMaxBECountForLT which
+  // has not yet been sufficiently audited or tested with negative strides.
+  // We used to filter out all known-non-positive cases here; we're in the
+  // process of becoming less restrictive bit by bit.
+ if (IsSigned && isKnownNonPositive(Stride))
return getCouldNotCompute();
if (!isKnownNonZero(Stride)) {
@@ -11687,37 +11997,12 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
}
} else if (!Stride->isOne() && !NoWrap) {
auto isUBOnWrap = [&]() {
- // Can we prove this loop *must* be UB if overflow of IV occurs?
- // Reasoning goes as follows:
- // * Suppose the IV did self wrap.
- // * If Stride evenly divides the iteration space, then once wrap
- // occurs, the loop must revisit the same values.
- // * We know that RHS is invariant, and that none of those values
- // caused this exit to be taken previously. Thus, this exit is
- // dynamically dead.
- // * If this is the sole exit, then a dead exit implies the loop
- // must be infinite if there are no abnormal exits.
- // * If the loop were infinite, then it must either not be mustprogress
- // or have side effects. Otherwise, it must be UB.
- // * It can't (by assumption), be UB so we have contradicted our
- // premise and can conclude the IV did not in fact self-wrap.
// From no-self-wrap, we need to then prove no-(un)signed-wrap. This
// follows trivially from the fact that every (un)signed-wrapped, but
// not self-wrapped value must be LT than the last value before
// (un)signed wrap. Since we know that last value didn't exit, nor
// will any smaller one.
-
- if (!isLoopInvariant(RHS, L))
- return false;
-
- auto *StrideC = dyn_cast<SCEVConstant>(Stride);
- if (!StrideC || !StrideC->getAPInt().isPowerOf2())
- return false;
-
- if (!ControlsExit || !loopHasNoAbnormalExits(L))
- return false;
-
- return loopIsFiniteByAssumption(L);
+ return canAssumeNoSelfWrap(IV);
};
// Avoid proven overflow cases: this will ensure that the backedge taken
@@ -11740,7 +12025,9 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
const SCEV *Start = IV->getStart();
// Preserve pointer-typed Start/RHS to pass to isLoopEntryGuardedByCond.
- // Use integer-typed versions for actual computation.
+ // If we convert to integers, isLoopEntryGuardedByCond will miss some cases.
+ // Use integer-typed versions for actual computation; we can't subtract
+ // pointers in general.
const SCEV *OrigStart = Start;
const SCEV *OrigRHS = RHS;
if (Start->getType()->isPointerTy()) {
@@ -11771,10 +12058,13 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
// is End and so the result is as above, and if not max(End,Start) is Start
// so we get a backedge count of zero.
const SCEV *BECount = nullptr;
- auto *StartMinusStride = getMinusSCEV(OrigStart, Stride);
+ auto *OrigStartMinusStride = getMinusSCEV(OrigStart, Stride);
+ assert(isAvailableAtLoopEntry(OrigStartMinusStride, L) && "Must be!");
+ assert(isAvailableAtLoopEntry(OrigStart, L) && "Must be!");
+ assert(isAvailableAtLoopEntry(OrigRHS, L) && "Must be!");
// Can we prove (max(RHS,Start) > Start - Stride?
- if (isLoopEntryGuardedByCond(L, Cond, StartMinusStride, Start) &&
- isLoopEntryGuardedByCond(L, Cond, StartMinusStride, RHS)) {
+ if (isLoopEntryGuardedByCond(L, Cond, OrigStartMinusStride, OrigStart) &&
+ isLoopEntryGuardedByCond(L, Cond, OrigStartMinusStride, OrigRHS)) {
// In this case, we can use a refined formula for computing backedge taken
// count. The general formula remains:
// "End-Start /uceiling Stride" where "End = max(RHS,Start)"
@@ -11795,10 +12085,8 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
// Our preconditions trivially imply no overflow in that form.
const SCEV *MinusOne = getMinusOne(Stride->getType());
const SCEV *Numerator =
- getMinusSCEV(getAddExpr(RHS, MinusOne), StartMinusStride);
- if (!isa<SCEVCouldNotCompute>(Numerator)) {
- BECount = getUDivExpr(Numerator, Stride);
- }
+ getMinusSCEV(getAddExpr(RHS, MinusOne), getMinusSCEV(Start, Stride));
+ BECount = getUDivExpr(Numerator, Stride);
}
const SCEV *BECountIfBackedgeTaken = nullptr;
@@ -12141,7 +12429,7 @@ SCEVAddRecExpr::getPostIncExpr(ScalarEvolution &SE) const {
}
// Return true when S contains at least an undef value.
-static inline bool containsUndefs(const SCEV *S) {
+bool ScalarEvolution::containsUndefs(const SCEV *S) const {
return SCEVExprContains(S, [](const SCEV *S) {
if (const auto *SU = dyn_cast<SCEVUnknown>(S))
return isa<UndefValue>(SU->getValue());
@@ -12149,237 +12437,6 @@ static inline bool containsUndefs(const SCEV *S) {
});
}
-namespace {
-
-// Collect all steps of SCEV expressions.
-struct SCEVCollectStrides {
- ScalarEvolution &SE;
- SmallVectorImpl<const SCEV *> &Strides;
-
- SCEVCollectStrides(ScalarEvolution &SE, SmallVectorImpl<const SCEV *> &S)
- : SE(SE), Strides(S) {}
-
- bool follow(const SCEV *S) {
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
- Strides.push_back(AR->getStepRecurrence(SE));
- return true;
- }
-
- bool isDone() const { return false; }
-};
-
-// Collect all SCEVUnknown and SCEVMulExpr expressions.
-struct SCEVCollectTerms {
- SmallVectorImpl<const SCEV *> &Terms;
-
- SCEVCollectTerms(SmallVectorImpl<const SCEV *> &T) : Terms(T) {}
-
- bool follow(const SCEV *S) {
- if (isa<SCEVUnknown>(S) || isa<SCEVMulExpr>(S) ||
- isa<SCEVSignExtendExpr>(S)) {
- if (!containsUndefs(S))
- Terms.push_back(S);
-
- // Stop recursion: once we collected a term, do not walk its operands.
- return false;
- }
-
- // Keep looking.
- return true;
- }
-
- bool isDone() const { return false; }
-};
-
-// Check if a SCEV contains an AddRecExpr.
-struct SCEVHasAddRec {
- bool &ContainsAddRec;
-
- SCEVHasAddRec(bool &ContainsAddRec) : ContainsAddRec(ContainsAddRec) {
- ContainsAddRec = false;
- }
-
- bool follow(const SCEV *S) {
- if (isa<SCEVAddRecExpr>(S)) {
- ContainsAddRec = true;
-
- // Stop recursion: once we collected a term, do not walk its operands.
- return false;
- }
-
- // Keep looking.
- return true;
- }
-
- bool isDone() const { return false; }
-};
-
-// Find factors that are multiplied with an expression that (possibly as a
-// subexpression) contains an AddRecExpr. In the expression:
-//
-// 8 * (100 + %p * %q * (%a + {0, +, 1}_loop))
-//
-// "%p * %q" are factors multiplied by the expression "(%a + {0, +, 1}_loop)"
-// that contains the AddRec {0, +, 1}_loop. %p * %q are likely to be array size
-// parameters as they form a product with an induction variable.
-//
-// This collector expects all array size parameters to be in the same MulExpr.
-// It might be necessary to later add support for collecting parameters that are
-// spread over different nested MulExpr.
-struct SCEVCollectAddRecMultiplies {
- SmallVectorImpl<const SCEV *> &Terms;
- ScalarEvolution &SE;
-
- SCEVCollectAddRecMultiplies(SmallVectorImpl<const SCEV *> &T, ScalarEvolution &SE)
- : Terms(T), SE(SE) {}
-
- bool follow(const SCEV *S) {
- if (auto *Mul = dyn_cast<SCEVMulExpr>(S)) {
- bool HasAddRec = false;
- SmallVector<const SCEV *, 0> Operands;
- for (auto Op : Mul->operands()) {
- const SCEVUnknown *Unknown = dyn_cast<SCEVUnknown>(Op);
- if (Unknown && !isa<CallInst>(Unknown->getValue())) {
- Operands.push_back(Op);
- } else if (Unknown) {
- HasAddRec = true;
- } else {
- bool ContainsAddRec = false;
- SCEVHasAddRec ContiansAddRec(ContainsAddRec);
- visitAll(Op, ContiansAddRec);
- HasAddRec |= ContainsAddRec;
- }
- }
- if (Operands.size() == 0)
- return true;
-
- if (!HasAddRec)
- return false;
-
- Terms.push_back(SE.getMulExpr(Operands));
- // Stop recursion: once we collected a term, do not walk its operands.
- return false;
- }
-
- // Keep looking.
- return true;
- }
-
- bool isDone() const { return false; }
-};
-
-} // end anonymous namespace
-
-/// Find parametric terms in this SCEVAddRecExpr. We first for parameters in
-/// two places:
-/// 1) The strides of AddRec expressions.
-/// 2) Unknowns that are multiplied with AddRec expressions.
-void ScalarEvolution::collectParametricTerms(const SCEV *Expr,
- SmallVectorImpl<const SCEV *> &Terms) {
- SmallVector<const SCEV *, 4> Strides;
- SCEVCollectStrides StrideCollector(*this, Strides);
- visitAll(Expr, StrideCollector);
-
- LLVM_DEBUG({
- dbgs() << "Strides:\n";
- for (const SCEV *S : Strides)
- dbgs() << *S << "\n";
- });
-
- for (const SCEV *S : Strides) {
- SCEVCollectTerms TermCollector(Terms);
- visitAll(S, TermCollector);
- }
-
- LLVM_DEBUG({
- dbgs() << "Terms:\n";
- for (const SCEV *T : Terms)
- dbgs() << *T << "\n";
- });
-
- SCEVCollectAddRecMultiplies MulCollector(Terms, *this);
- visitAll(Expr, MulCollector);
-}
-
-static bool findArrayDimensionsRec(ScalarEvolution &SE,
- SmallVectorImpl<const SCEV *> &Terms,
- SmallVectorImpl<const SCEV *> &Sizes) {
- int Last = Terms.size() - 1;
- const SCEV *Step = Terms[Last];
-
- // End of recursion.
- if (Last == 0) {
- if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(Step)) {
- SmallVector<const SCEV *, 2> Qs;
- for (const SCEV *Op : M->operands())
- if (!isa<SCEVConstant>(Op))
- Qs.push_back(Op);
-
- Step = SE.getMulExpr(Qs);
- }
-
- Sizes.push_back(Step);
- return true;
- }
-
- for (const SCEV *&Term : Terms) {
- // Normalize the terms before the next call to findArrayDimensionsRec.
- const SCEV *Q, *R;
- SCEVDivision::divide(SE, Term, Step, &Q, &R);
-
- // Bail out when GCD does not evenly divide one of the terms.
- if (!R->isZero())
- return false;
-
- Term = Q;
- }
-
- // Remove all SCEVConstants.
- erase_if(Terms, [](const SCEV *E) { return isa<SCEVConstant>(E); });
-
- if (Terms.size() > 0)
- if (!findArrayDimensionsRec(SE, Terms, Sizes))
- return false;
-
- Sizes.push_back(Step);
- return true;
-}
-
-// Returns true when one of the SCEVs of Terms contains a SCEVUnknown parameter.
-static inline bool containsParameters(SmallVectorImpl<const SCEV *> &Terms) {
- for (const SCEV *T : Terms)
- if (SCEVExprContains(T, [](const SCEV *S) { return isa<SCEVUnknown>(S); }))
- return true;
-
- return false;
-}
-
-// Return the number of product terms in S.
-static inline int numberOfTerms(const SCEV *S) {
- if (const SCEVMulExpr *Expr = dyn_cast<SCEVMulExpr>(S))
- return Expr->getNumOperands();
- return 1;
-}
-
-static const SCEV *removeConstantFactors(ScalarEvolution &SE, const SCEV *T) {
- if (isa<SCEVConstant>(T))
- return nullptr;
-
- if (isa<SCEVUnknown>(T))
- return T;
-
- if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(T)) {
- SmallVector<const SCEV *, 2> Factors;
- for (const SCEV *Op : M->operands())
- if (!isa<SCEVConstant>(Op))
- Factors.push_back(Op);
-
- return SE.getMulExpr(Factors);
- }
-
- return T;
-}
-
/// Return the size of an element read or written by Inst.
const SCEV *ScalarEvolution::getElementSize(Instruction *Inst) {
Type *Ty;
@@ -12394,248 +12451,6 @@ const SCEV *ScalarEvolution::getElementSize(Instruction *Inst) {
return getSizeOfExpr(ETy, Ty);
}
-void ScalarEvolution::findArrayDimensions(SmallVectorImpl<const SCEV *> &Terms,
- SmallVectorImpl<const SCEV *> &Sizes,
- const SCEV *ElementSize) {
- if (Terms.size() < 1 || !ElementSize)
- return;
-
- // Early return when Terms do not contain parameters: we do not delinearize
- // non parametric SCEVs.
- if (!containsParameters(Terms))
- return;
-
- LLVM_DEBUG({
- dbgs() << "Terms:\n";
- for (const SCEV *T : Terms)
- dbgs() << *T << "\n";
- });
-
- // Remove duplicates.
- array_pod_sort(Terms.begin(), Terms.end());
- Terms.erase(std::unique(Terms.begin(), Terms.end()), Terms.end());
-
- // Put larger terms first.
- llvm::sort(Terms, [](const SCEV *LHS, const SCEV *RHS) {
- return numberOfTerms(LHS) > numberOfTerms(RHS);
- });
-
- // Try to divide all terms by the element size. If term is not divisible by
- // element size, proceed with the original term.
- for (const SCEV *&Term : Terms) {
- const SCEV *Q, *R;
- SCEVDivision::divide(*this, Term, ElementSize, &Q, &R);
- if (!Q->isZero())
- Term = Q;
- }
-
- SmallVector<const SCEV *, 4> NewTerms;
-
- // Remove constant factors.
- for (const SCEV *T : Terms)
- if (const SCEV *NewT = removeConstantFactors(*this, T))
- NewTerms.push_back(NewT);
-
- LLVM_DEBUG({
- dbgs() << "Terms after sorting:\n";
- for (const SCEV *T : NewTerms)
- dbgs() << *T << "\n";
- });
-
- if (NewTerms.empty() || !findArrayDimensionsRec(*this, NewTerms, Sizes)) {
- Sizes.clear();
- return;
- }
-
- // The last element to be pushed into Sizes is the size of an element.
- Sizes.push_back(ElementSize);
-
- LLVM_DEBUG({
- dbgs() << "Sizes:\n";
- for (const SCEV *S : Sizes)
- dbgs() << *S << "\n";
- });
-}
-
-void ScalarEvolution::computeAccessFunctions(
- const SCEV *Expr, SmallVectorImpl<const SCEV *> &Subscripts,
- SmallVectorImpl<const SCEV *> &Sizes) {
- // Early exit in case this SCEV is not an affine multivariate function.
- if (Sizes.empty())
- return;
-
- if (auto *AR = dyn_cast<SCEVAddRecExpr>(Expr))
- if (!AR->isAffine())
- return;
-
- const SCEV *Res = Expr;
- int Last = Sizes.size() - 1;
- for (int i = Last; i >= 0; i--) {
- const SCEV *Q, *R;
- SCEVDivision::divide(*this, Res, Sizes[i], &Q, &R);
-
- LLVM_DEBUG({
- dbgs() << "Res: " << *Res << "\n";
- dbgs() << "Sizes[i]: " << *Sizes[i] << "\n";
- dbgs() << "Res divided by Sizes[i]:\n";
- dbgs() << "Quotient: " << *Q << "\n";
- dbgs() << "Remainder: " << *R << "\n";
- });
-
- Res = Q;
-
- // Do not record the last subscript corresponding to the size of elements in
- // the array.
- if (i == Last) {
-
- // Bail out if the remainder is too complex.
- if (isa<SCEVAddRecExpr>(R)) {
- Subscripts.clear();
- Sizes.clear();
- return;
- }
-
- continue;
- }
-
- // Record the access function for the current subscript.
- Subscripts.push_back(R);
- }
-
- // Also push in last position the remainder of the last division: it will be
- // the access function of the innermost dimension.
- Subscripts.push_back(Res);
-
- std::reverse(Subscripts.begin(), Subscripts.end());
-
- LLVM_DEBUG({
- dbgs() << "Subscripts:\n";
- for (const SCEV *S : Subscripts)
- dbgs() << *S << "\n";
- });
-}
-
-/// Splits the SCEV into two vectors of SCEVs representing the subscripts and
-/// sizes of an array access. Returns the remainder of the delinearization that
-/// is the offset start of the array. The SCEV->delinearize algorithm computes
-/// the multiples of SCEV coefficients: that is a pattern matching of sub
-/// expressions in the stride and base of a SCEV corresponding to the
-/// computation of a GCD (greatest common divisor) of base and stride. When
-/// SCEV->delinearize fails, it returns the SCEV unchanged.
-///
-/// For example: when analyzing the memory access A[i][j][k] in this loop nest
-///
-/// void foo(long n, long m, long o, double A[n][m][o]) {
-///
-/// for (long i = 0; i < n; i++)
-/// for (long j = 0; j < m; j++)
-/// for (long k = 0; k < o; k++)
-/// A[i][j][k] = 1.0;
-/// }
-///
-/// the delinearization input is the following AddRec SCEV:
-///
-/// AddRec: {{{%A,+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>,+,8}<%for.k>
-///
-/// From this SCEV, we are able to say that the base offset of the access is %A
-/// because it appears as an offset that does not divide any of the strides in
-/// the loops:
-///
-/// CHECK: Base offset: %A
-///
-/// and then SCEV->delinearize determines the size of some of the dimensions of
-/// the array as these are the multiples by which the strides are happening:
-///
-/// CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of sizeof(double) bytes.
-///
-/// Note that the outermost dimension remains of UnknownSize because there are
-/// no strides that would help identifying the size of the last dimension: when
-/// the array has been statically allocated, one could compute the size of that
-/// dimension by dividing the overall size of the array by the size of the known
-/// dimensions: %m * %o * 8.
-///
-/// Finally delinearize provides the access functions for the array reference
-/// that does correspond to A[i][j][k] of the above C testcase:
-///
-/// CHECK: ArrayRef[{0,+,1}<%for.i>][{0,+,1}<%for.j>][{0,+,1}<%for.k>]
-///
-/// The testcases are checking the output of a function pass:
-/// DelinearizationPass that walks through all loads and stores of a function
-/// asking for the SCEV of the memory access with respect to all enclosing
-/// loops, calling SCEV->delinearize on that and printing the results.
-void ScalarEvolution::delinearize(const SCEV *Expr,
- SmallVectorImpl<const SCEV *> &Subscripts,
- SmallVectorImpl<const SCEV *> &Sizes,
- const SCEV *ElementSize) {
- // First step: collect parametric terms.
- SmallVector<const SCEV *, 4> Terms;
- collectParametricTerms(Expr, Terms);
-
- if (Terms.empty())
- return;
-
- // Second step: find subscript sizes.
- findArrayDimensions(Terms, Sizes, ElementSize);
-
- if (Sizes.empty())
- return;
-
- // Third step: compute the access functions for each subscript.
- computeAccessFunctions(Expr, Subscripts, Sizes);
-
- if (Subscripts.empty())
- return;
-
- LLVM_DEBUG({
- dbgs() << "succeeded to delinearize " << *Expr << "\n";
- dbgs() << "ArrayDecl[UnknownSize]";
- for (const SCEV *S : Sizes)
- dbgs() << "[" << *S << "]";
-
- dbgs() << "\nArrayRef";
- for (const SCEV *S : Subscripts)
- dbgs() << "[" << *S << "]";
- dbgs() << "\n";
- });
-}
-
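For orientation, here is a minimal sketch of how the delinearization entry points removed in this hunk were typically driven from client code. The wrapper name and the printing body are invented for illustration; only the ScalarEvolution calls reflect the pre-patch API.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

// Sketch of a client of the (removed) ScalarEvolution::delinearize interface.
static void sketchDelinearize(ScalarEvolution &SE, Instruction *Access,
                              const SCEV *AccessFn) {
  SmallVector<const SCEV *, 4> Subscripts, Sizes;
  const SCEV *ElementSize = SE.getElementSize(Access);
  // The three steps documented above: collect parametric terms, derive the
  // array dimensions, then compute one access function per dimension.
  SE.delinearize(AccessFn, Subscripts, Sizes, ElementSize);
  if (Subscripts.empty() || Sizes.empty())
    return; // Delinearization failed; keep the linearized access function.
  for (const SCEV *S : Subscripts)
    dbgs() << "[" << *S << "]";
  dbgs() << "\n";
}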
-bool ScalarEvolution::getIndexExpressionsFromGEP(
- const GetElementPtrInst *GEP, SmallVectorImpl<const SCEV *> &Subscripts,
- SmallVectorImpl<int> &Sizes) {
- assert(Subscripts.empty() && Sizes.empty() &&
- "Expected output lists to be empty on entry to this function.");
- assert(GEP && "getIndexExpressionsFromGEP called with a null GEP");
- Type *Ty = nullptr;
- bool DroppedFirstDim = false;
- for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
- const SCEV *Expr = getSCEV(GEP->getOperand(i));
- if (i == 1) {
- Ty = GEP->getSourceElementType();
- if (auto *Const = dyn_cast<SCEVConstant>(Expr))
- if (Const->getValue()->isZero()) {
- DroppedFirstDim = true;
- continue;
- }
- Subscripts.push_back(Expr);
- continue;
- }
-
- auto *ArrayTy = dyn_cast<ArrayType>(Ty);
- if (!ArrayTy) {
- Subscripts.clear();
- Sizes.clear();
- return false;
- }
-
- Subscripts.push_back(Expr);
- if (!(DroppedFirstDim && i == 2))
- Sizes.push_back(ArrayTy->getNumElements());
-
- Ty = ArrayTy->getElementType();
- }
- return !Subscripts.empty();
-}
-
//===----------------------------------------------------------------------===//
// SCEVCallbackVH Class Implementation
//===----------------------------------------------------------------------===//
@@ -12722,6 +12537,7 @@ ScalarEvolution::ScalarEvolution(ScalarEvolution &&Arg)
LoopDispositions(std::move(Arg.LoopDispositions)),
LoopPropertiesCache(std::move(Arg.LoopPropertiesCache)),
BlockDispositions(std::move(Arg.BlockDispositions)),
+ SCEVUsers(std::move(Arg.SCEVUsers)),
UnsignedRanges(std::move(Arg.UnsignedRanges)),
SignedRanges(std::move(Arg.SignedRanges)),
UniqueSCEVs(std::move(Arg.UniqueSCEVs)),
@@ -12934,7 +12750,7 @@ ScalarEvolution::getLoopDisposition(const SCEV *S, const Loop *L) {
Values.emplace_back(L, LoopVariant);
LoopDisposition D = computeLoopDisposition(S, L);
auto &Values2 = LoopDispositions[S];
- for (auto &V : make_range(Values2.rbegin(), Values2.rend())) {
+ for (auto &V : llvm::reverse(Values2)) {
if (V.getPointer() == L) {
V.setInt(D);
break;
@@ -13042,7 +12858,7 @@ ScalarEvolution::getBlockDisposition(const SCEV *S, const BasicBlock *BB) {
Values.emplace_back(BB, DoesNotDominateBlock);
BlockDisposition D = computeBlockDisposition(S, BB);
auto &Values2 = BlockDispositions[S];
- for (auto &V : make_range(Values2.rbegin(), Values2.rend())) {
+ for (auto &V : llvm::reverse(Values2)) {
if (V.getPointer() == BB) {
V.setInt(D);
break;
@@ -13130,41 +12946,58 @@ bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const {
return SCEVExprContains(S, [&](const SCEV *Expr) { return Expr == Op; });
}
-void
-ScalarEvolution::forgetMemoizedResults(const SCEV *S) {
- ValuesAtScopes.erase(S);
- LoopDispositions.erase(S);
- BlockDispositions.erase(S);
- UnsignedRanges.erase(S);
- SignedRanges.erase(S);
- ExprValueMap.erase(S);
- HasRecMap.erase(S);
- MinTrailingZerosCache.erase(S);
+void ScalarEvolution::forgetMemoizedResults(ArrayRef<const SCEV *> SCEVs) {
+ SmallPtrSet<const SCEV *, 8> ToForget(SCEVs.begin(), SCEVs.end());
+ SmallVector<const SCEV *, 8> Worklist(ToForget.begin(), ToForget.end());
+
+ while (!Worklist.empty()) {
+ const SCEV *Curr = Worklist.pop_back_val();
+ auto Users = SCEVUsers.find(Curr);
+ if (Users != SCEVUsers.end())
+ for (auto *User : Users->second)
+ if (ToForget.insert(User).second)
+ Worklist.push_back(User);
+ }
+
+ for (auto *S : ToForget)
+ forgetMemoizedResultsImpl(S);
for (auto I = PredicatedSCEVRewrites.begin();
I != PredicatedSCEVRewrites.end();) {
std::pair<const SCEV *, const Loop *> Entry = I->first;
- if (Entry.first == S)
+ if (ToForget.count(Entry.first))
PredicatedSCEVRewrites.erase(I++);
else
++I;
}
- auto RemoveSCEVFromBackedgeMap =
- [S](DenseMap<const Loop *, BackedgeTakenInfo> &Map) {
+ auto RemoveSCEVFromBackedgeMap = [&ToForget](
+ DenseMap<const Loop *, BackedgeTakenInfo> &Map) {
for (auto I = Map.begin(), E = Map.end(); I != E;) {
BackedgeTakenInfo &BEInfo = I->second;
- if (BEInfo.hasOperand(S))
+ if (any_of(ToForget,
+ [&BEInfo](const SCEV *S) { return BEInfo.hasOperand(S); }))
Map.erase(I++);
else
++I;
}
- };
+ };
RemoveSCEVFromBackedgeMap(BackedgeTakenCounts);
RemoveSCEVFromBackedgeMap(PredicatedBackedgeTakenCounts);
}
+void ScalarEvolution::forgetMemoizedResultsImpl(const SCEV *S) {
+ ValuesAtScopes.erase(S);
+ LoopDispositions.erase(S);
+ BlockDispositions.erase(S);
+ UnsignedRanges.erase(S);
+ SignedRanges.erase(S);
+ ExprValueMap.erase(S);
+ HasRecMap.erase(S);
+ MinTrailingZerosCache.erase(S);
+}
+
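The rewritten forgetMemoizedResults above first takes the transitive closure over SCEVUsers before dropping any cached data, so expressions built on top of a forgotten SCEV are forgotten as well. A standalone sketch of that worklist pattern, using plain standard-library containers rather than the LLVM types (all names hypothetical):

#include <unordered_map>
#include <unordered_set>
#include <vector>

// Collect Seeds plus everything that transitively uses them, mirroring the
// ToForget/Worklist loop in forgetMemoizedResults.
template <typename Node>
std::unordered_set<const Node *> collectTransitiveUsers(
    const std::unordered_map<const Node *, std::vector<const Node *>> &Users,
    const std::vector<const Node *> &Seeds) {
  std::unordered_set<const Node *> ToForget(Seeds.begin(), Seeds.end());
  std::vector<const Node *> Worklist(ToForget.begin(), ToForget.end());
  while (!Worklist.empty()) {
    const Node *Curr = Worklist.back();
    Worklist.pop_back();
    auto It = Users.find(Curr);
    if (It == Users.end())
      continue;
    for (const Node *User : It->second)
      if (ToForget.insert(User).second) // First time we see this dependent.
        Worklist.push_back(User);
  }
  return ToForget;
}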
void
ScalarEvolution::getUsedLoops(const SCEV *S,
SmallPtrSetImpl<const Loop *> &LoopsUsed) {
@@ -13185,13 +13018,6 @@ ScalarEvolution::getUsedLoops(const SCEV *S,
SCEVTraversal<FindUsedLoops>(F).visitAll(S);
}
-void ScalarEvolution::addToLoopUseLists(const SCEV *S) {
- SmallPtrSet<const Loop *, 8> LoopsUsed;
- getUsedLoops(S, LoopsUsed);
- for (auto *L : LoopsUsed)
- LoopUsers[L].push_back(S);
-}
-
void ScalarEvolution::verify() const {
ScalarEvolution &SE = *const_cast<ScalarEvolution *>(this);
ScalarEvolution SE2(F, TLI, AC, DT, LI);
@@ -13282,6 +13108,23 @@ void ScalarEvolution::verify() const {
assert(ValidLoops.contains(AR->getLoop()) &&
"AddRec references invalid loop");
}
+
+ // Verify integrity of SCEV users.
+ for (const auto &S : UniqueSCEVs) {
+ SmallVector<const SCEV *, 4> Ops;
+ collectUniqueOps(&S, Ops);
+ for (const auto *Op : Ops) {
+ // We do not store dependencies of constants.
+ if (isa<SCEVConstant>(Op))
+ continue;
+ auto It = SCEVUsers.find(Op);
+ if (It != SCEVUsers.end() && It->second.count(&S))
+ continue;
+ dbgs() << "Use of operand " << *Op << " by user " << S
+ << " is not being tracked!\n";
+ std::abort();
+ }
+ }
}
bool ScalarEvolution::invalidate(
@@ -13685,6 +13528,16 @@ PredicatedScalarEvolution::PredicatedScalarEvolution(ScalarEvolution &SE,
Loop &L)
: SE(SE), L(L) {}
+void ScalarEvolution::registerUser(const SCEV *User,
+ ArrayRef<const SCEV *> Ops) {
+ for (auto *Op : Ops)
+ // We do not expect that forgetting cached data for SCEVConstants will ever
+ // open any prospects for sharpening or introduce any correctness issues,
+ // so we don't bother storing their dependencies.
+ if (!isa<SCEVConstant>(Op))
+ SCEVUsers[Op].insert(User);
+}
+
const SCEV *PredicatedScalarEvolution::getSCEV(Value *V) {
const SCEV *Expr = SE.getSCEV(V);
RewriteEntry &Entry = RewriteMap[Expr];
@@ -13897,52 +13750,51 @@ ScalarEvolution::computeSymbolicMaxBackedgeTakenCount(const Loop *L) {
return getUMinFromMismatchedTypes(ExitCounts);
}
-/// This rewriter is similar to SCEVParameterRewriter (it replaces SCEVUnknown
-/// components following the Map (Value -> SCEV)), but skips AddRecExpr because
-/// we cannot guarantee that the replacement is loop invariant in the loop of
-/// the AddRec.
+/// A rewriter that replaces SCEV expressions matching the keys of Map with the
+/// corresponding mapped entries. It skips AddRecExpr because we cannot guarantee that the
+/// replacement is loop invariant in the loop of the AddRec.
+///
+/// At the moment only rewriting SCEVUnknown and SCEVZeroExtendExpr is
+/// supported.
class SCEVLoopGuardRewriter : public SCEVRewriteVisitor<SCEVLoopGuardRewriter> {
- ValueToSCEVMapTy &Map;
+ const DenseMap<const SCEV *, const SCEV *> &Map;
public:
- SCEVLoopGuardRewriter(ScalarEvolution &SE, ValueToSCEVMapTy &M)
+ SCEVLoopGuardRewriter(ScalarEvolution &SE,
+ DenseMap<const SCEV *, const SCEV *> &M)
: SCEVRewriteVisitor(SE), Map(M) {}
const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { return Expr; }
const SCEV *visitUnknown(const SCEVUnknown *Expr) {
- auto I = Map.find(Expr->getValue());
+ auto I = Map.find(Expr);
if (I == Map.end())
return Expr;
return I->second;
}
+
+ const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) {
+ auto I = Map.find(Expr);
+ if (I == Map.end())
+ return SCEVRewriteVisitor<SCEVLoopGuardRewriter>::visitZeroExtendExpr(
+ Expr);
+ return I->second;
+ }
};
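As a concrete illustration of what this rewriter is used for (the guard value and names below are invented): if a loop's entry is dominated by a check such as %n u< 16, the condition collection that follows records a umin-based replacement for %n, and applyLoopGuards returns the queried expression with that replacement applied.

// Hypothetical caller of the public entry point; with a dominating guard
// "%n u< 16" the rewrite map roughly contains { %n -> (15 umin %n) }, so the
// expression returned here is the guard-tightened form of %n.
static const SCEV *tightenWithGuards(ScalarEvolution &SE, Value *N,
                                     const Loop *L) {
  const SCEV *Expr = SE.getSCEV(N);
  return SE.applyLoopGuards(Expr, L);
}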
const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) {
+ SmallVector<const SCEV *> ExprsToRewrite;
auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS,
- const SCEV *RHS, ValueToSCEVMapTy &RewriteMap) {
- // If we have LHS == 0, check if LHS is computing a property of some unknown
- // SCEV %v which we can rewrite %v to express explicitly.
- const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
- if (Predicate == CmpInst::ICMP_EQ && RHSC &&
- RHSC->getValue()->isNullValue()) {
- // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to
- // explicitly express that.
- const SCEV *URemLHS = nullptr;
- const SCEV *URemRHS = nullptr;
- if (matchURem(LHS, URemLHS, URemRHS)) {
- if (const SCEVUnknown *LHSUnknown = dyn_cast<SCEVUnknown>(URemLHS)) {
- Value *V = LHSUnknown->getValue();
- auto Multiple =
- getMulExpr(getUDivExpr(URemLHS, URemRHS), URemRHS,
- (SCEV::NoWrapFlags)(SCEV::FlagNUW | SCEV::FlagNSW));
- RewriteMap[V] = Multiple;
- return;
- }
- }
- }
-
- if (!isa<SCEVUnknown>(LHS) && isa<SCEVUnknown>(RHS)) {
+ const SCEV *RHS,
+ DenseMap<const SCEV *, const SCEV *>
+ &RewriteMap) {
+ // WARNING: It is generally unsound to apply any wrap flags to the proposed
+ // replacement SCEV which isn't directly implied by the structure of that
+ // SCEV. In particular, using contextual facts to imply flags is *NOT*
+ // legal. See the scoping rules for flags in the header to understand why.
+
+ // If LHS is a constant, apply information to the other expression.
+ if (isa<SCEVConstant>(LHS)) {
std::swap(LHS, RHS);
Predicate = CmpInst::getSwappedPredicate(Predicate);
}
@@ -13950,7 +13802,8 @@ const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) {
// Check for a condition of the form (-C1 + X < C2). InstCombine will
// create this form when combining two checks of the form (X u< C2 + C1) and
// (X >=u C1).
- auto MatchRangeCheckIdiom = [this, Predicate, LHS, RHS, &RewriteMap]() {
+ auto MatchRangeCheckIdiom = [this, Predicate, LHS, RHS, &RewriteMap,
+ &ExprsToRewrite]() {
auto *AddExpr = dyn_cast<SCEVAddExpr>(LHS);
if (!AddExpr || AddExpr->getNumOperands() != 2)
return false;
@@ -13968,26 +13821,55 @@ const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) {
// Bail out, unless we have a non-wrapping, monotonic range.
if (ExactRegion.isWrappedSet() || ExactRegion.isFullSet())
return false;
- auto I = RewriteMap.find(LHSUnknown->getValue());
- const SCEV *RewrittenLHS = I != RewriteMap.end() ? I->second : LHS;
- RewriteMap[LHSUnknown->getValue()] = getUMaxExpr(
+ auto I = RewriteMap.find(LHSUnknown);
+ const SCEV *RewrittenLHS = I != RewriteMap.end() ? I->second : LHSUnknown;
+ RewriteMap[LHSUnknown] = getUMaxExpr(
getConstant(ExactRegion.getUnsignedMin()),
getUMinExpr(RewrittenLHS, getConstant(ExactRegion.getUnsignedMax())));
+ ExprsToRewrite.push_back(LHSUnknown);
return true;
};
if (MatchRangeCheckIdiom())
return;
- // For now, limit to conditions that provide information about unknown
- // expressions. RHS also cannot contain add recurrences.
- auto *LHSUnknown = dyn_cast<SCEVUnknown>(LHS);
- if (!LHSUnknown || containsAddRecurrence(RHS))
+ // If we have LHS == 0, check if LHS is computing a property of some unknown
+ // SCEV %v which we can rewrite %v to express explicitly.
+ const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
+ if (Predicate == CmpInst::ICMP_EQ && RHSC &&
+ RHSC->getValue()->isNullValue()) {
+ // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to
+ // explicitly express that.
+ const SCEV *URemLHS = nullptr;
+ const SCEV *URemRHS = nullptr;
+ if (matchURem(LHS, URemLHS, URemRHS)) {
+ if (const SCEVUnknown *LHSUnknown = dyn_cast<SCEVUnknown>(URemLHS)) {
+ auto Multiple = getMulExpr(getUDivExpr(URemLHS, URemRHS), URemRHS);
+ RewriteMap[LHSUnknown] = Multiple;
+ ExprsToRewrite.push_back(LHSUnknown);
+ return;
+ }
+ }
+ }
+
+ // Do not apply information for constants or if RHS contains an AddRec.
+ if (isa<SCEVConstant>(LHS) || containsAddRecurrence(RHS))
+ return;
+
+ // If RHS is SCEVUnknown, make sure the information is applied to it.
+ if (!isa<SCEVUnknown>(LHS) && isa<SCEVUnknown>(RHS)) {
+ std::swap(LHS, RHS);
+ Predicate = CmpInst::getSwappedPredicate(Predicate);
+ }
+
+ // Limit to expressions that can be rewritten.
+ if (!isa<SCEVUnknown>(LHS) && !isa<SCEVZeroExtendExpr>(LHS))
return;
// Check whether LHS has already been rewritten. In that case we want to
// chain further rewrites onto the already rewritten value.
- auto I = RewriteMap.find(LHSUnknown->getValue());
+ auto I = RewriteMap.find(LHS);
const SCEV *RewrittenLHS = I != RewriteMap.end() ? I->second : LHS;
+
const SCEV *RewrittenRHS = nullptr;
switch (Predicate) {
case CmpInst::ICMP_ULT:
@@ -14031,14 +13913,17 @@ const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) {
break;
}
- if (RewrittenRHS)
- RewriteMap[LHSUnknown->getValue()] = RewrittenRHS;
+ if (RewrittenRHS) {
+ RewriteMap[LHS] = RewrittenRHS;
+ if (LHS == RewrittenLHS)
+ ExprsToRewrite.push_back(LHS);
+ }
};
// Starting at the loop predecessor, climb up the predecessor chain, as long
// as there are predecessors that can be found that have unique successors
// leading to the original header.
// TODO: share this logic with isLoopEntryGuardedByCond.
- ValueToSCEVMapTy RewriteMap;
+ DenseMap<const SCEV *, const SCEV *> RewriteMap;
for (std::pair<const BasicBlock *, const BasicBlock *> Pair(
L->getLoopPredecessor(), L->getHeader());
Pair.first; Pair = getPredecessorWithUniqueSuccessorForBB(Pair.first)) {
@@ -14088,6 +13973,19 @@ const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) {
if (RewriteMap.empty())
return Expr;
+
+ // Now that all rewrite information is collected, rewrite the collected
+ // expressions with the information in the map. This applies information to
+ // sub-expressions.
+ if (ExprsToRewrite.size() > 1) {
+ for (const SCEV *Expr : ExprsToRewrite) {
+ const SCEV *RewriteTo = RewriteMap[Expr];
+ RewriteMap.erase(Expr);
+ SCEVLoopGuardRewriter Rewriter(*this, RewriteMap);
+ RewriteMap.insert({Expr, Rewriter.visit(RewriteTo)});
+ }
+ }
+
SCEVLoopGuardRewriter Rewriter(*this, RewriteMap);
return Rewriter.visit(Expr);
}
diff --git a/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp b/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp
index 2262fc9d7913..f4fa159d1ec7 100644
--- a/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp
@@ -23,6 +23,15 @@
#include "llvm/InitializePasses.h"
using namespace llvm;
+static bool canComputePointerDiff(ScalarEvolution &SE,
+ const SCEV *A, const SCEV *B) {
+ if (SE.getEffectiveSCEVType(A->getType()) !=
+ SE.getEffectiveSCEVType(B->getType()))
+ return false;
+
+ return SE.instructionCouldExistWitthOperands(A, B);
+}
+
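The new helper gates the pointer-difference reasoning in the hunk below. A compressed, purely illustrative restatement of that reasoning for the simple case where the difference folds to a constant (names invented, pointer wrapping ignored for brevity):

#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
using namespace llvm;

// If B - A is a known constant at least as large as ASize (or A - B is at
// least BSize), the two accesses cannot overlap and NoAlias is justified.
static bool provablyDisjoint(ScalarEvolution &SE, const SCEV *A, const SCEV *B,
                             int64_t ASize, int64_t BSize) {
  if (!canComputePointerDiff(SE, A, B)) // the file-local helper above
    return false;
  const auto *Diff = dyn_cast<SCEVConstant>(SE.getMinusSCEV(B, A));
  if (!Diff)
    return false;
  const APInt &D = Diff->getAPInt();
  return D.sge(ASize) || (-D).sge(BSize);
}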
AliasResult SCEVAAResult::alias(const MemoryLocation &LocA,
const MemoryLocation &LocB, AAQueryInfo &AAQI) {
// If either of the memory references is empty, it doesn't matter what the
@@ -41,8 +50,7 @@ AliasResult SCEVAAResult::alias(const MemoryLocation &LocA,
// If something is known about the difference between the two addresses,
// see if it's enough to prove a NoAlias.
- if (SE.getEffectiveSCEVType(AS->getType()) ==
- SE.getEffectiveSCEVType(BS->getType())) {
+ if (canComputePointerDiff(SE, AS, BS)) {
unsigned BitWidth = SE.getTypeSizeInBits(AS->getType());
APInt ASizeInt(BitWidth, LocA.Size.hasValue()
? LocA.Size.getValue()
diff --git a/llvm/lib/Analysis/StackLifetime.cpp b/llvm/lib/Analysis/StackLifetime.cpp
index ab5f2db7d1cd..9056cc01484d 100644
--- a/llvm/lib/Analysis/StackLifetime.cpp
+++ b/llvm/lib/Analysis/StackLifetime.cpp
@@ -257,14 +257,12 @@ void StackLifetime::calculateLiveIntervals() {
unsigned AllocaNo = It.second.AllocaNo;
if (IsStart) {
- assert(!Started.test(AllocaNo) || Start[AllocaNo] == BBStart);
if (!Started.test(AllocaNo)) {
Started.set(AllocaNo);
Ended.reset(AllocaNo);
Start[AllocaNo] = InstNo;
}
} else {
- assert(!Ended.test(AllocaNo));
if (Started.test(AllocaNo)) {
LiveRanges[AllocaNo].addRange(Start[AllocaNo], InstNo);
Started.reset(AllocaNo);
@@ -400,3 +398,19 @@ PreservedAnalyses StackLifetimePrinterPass::run(Function &F,
SL.print(OS);
return PreservedAnalyses::all();
}
+
+void StackLifetimePrinterPass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<StackLifetimePrinterPass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+ OS << "<";
+ switch (Type) {
+ case StackLifetime::LivenessType::May:
+ OS << "may";
+ break;
+ case StackLifetime::LivenessType::Must:
+ OS << "must";
+ break;
+ }
+ OS << ">";
+}
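This override lets the printer report its liveness-type parameter when pass pipelines are printed back out. Assuming the pass is registered in PassRegistry.def under the name print<stack-lifetime> (not shown in this diff) and that the constructor takes the stream and liveness type in that order, scheduling it from C++ looks roughly like:

#include "llvm/Analysis/StackLifetime.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

static void schedulePrinter(FunctionPassManager &FPM) {
  // Textual pipeline form produced by printPipeline: "print<stack-lifetime><may>".
  FPM.addPass(
      StackLifetimePrinterPass(dbgs(), StackLifetime::LivenessType::May));
}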
diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index 76f195fedf31..74cc39b7f2c0 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -30,6 +30,7 @@
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <memory>
+#include <tuple>
using namespace llvm;
@@ -116,6 +117,7 @@ template <typename CalleeTy> struct UseInfo {
// Access range of the address (alloca or parameter).
// It is allowed to be empty-set when there are no known accesses.
ConstantRange Range;
+ std::map<const Instruction *, ConstantRange> Accesses;
// List of calls which pass address as an argument.
// Value is offset range of address from base address (alloca or calling
@@ -129,6 +131,12 @@ template <typename CalleeTy> struct UseInfo {
UseInfo(unsigned PointerSize) : Range{PointerSize, false} {}
void updateRange(const ConstantRange &R) { Range = unionNoWrap(Range, R); }
+ void addRange(const Instruction *I, const ConstantRange &R) {
+ auto Ins = Accesses.emplace(I, R);
+ if (!Ins.second)
+ Ins.first->second = unionNoWrap(Ins.first->second, R);
+ updateRange(R);
+ }
};
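To make the new bookkeeping concrete: repeated calls for the same instruction are unioned into one conservative per-instruction range, and each call also widens the summary Range. UseInfo is local to this file, so the fragment below is illustrative only (I stands for an assumed const Instruction *, with 64-bit pointers assumed).

// Two 4-byte accesses through the same instruction, at offsets 0 and 8:
UseInfo<GlobalValue> UI(/*PointerSize=*/64);
UI.addRange(I, ConstantRange(APInt(64, 0), APInt(64, 4)));
UI.addRange(I, ConstantRange(APInt(64, 8), APInt(64, 12)));
// UI.Accesses[I] is now the no-wrap union [0, 12), and UI.Range has been
// widened to cover both accesses as well.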
template <typename CalleeTy>
@@ -146,7 +154,7 @@ raw_ostream &operator<<(raw_ostream &OS, const UseInfo<CalleeTy> &U) {
ConstantRange getStaticAllocaSizeRange(const AllocaInst &AI) {
const DataLayout &DL = AI.getModule()->getDataLayout();
TypeSize TS = DL.getTypeAllocSize(AI.getAllocatedType());
- unsigned PointerSize = DL.getMaxPointerSizeInBits();
+ unsigned PointerSize = DL.getPointerTypeSizeInBits(AI.getType());
// Fallback to empty range for alloca size.
ConstantRange R = ConstantRange::getEmpty(PointerSize);
if (TS.isScalable())
@@ -167,7 +175,7 @@ ConstantRange getStaticAllocaSizeRange(const AllocaInst &AI) {
if (Overflow)
return R;
}
- R = ConstantRange(APInt::getNullValue(PointerSize), APSize);
+ R = ConstantRange(APInt::getZero(PointerSize), APSize);
assert(!isUnsafe(R));
return R;
}
@@ -208,7 +216,6 @@ template <typename CalleeTy> struct FunctionInfo {
} else {
assert(Allocas.empty());
}
- O << "\n";
}
};
@@ -223,6 +230,7 @@ struct StackSafetyInfo::InfoTy {
struct StackSafetyGlobalInfo::InfoTy {
GVToSSI Info;
SmallPtrSet<const AllocaInst *, 8> SafeAllocas;
+ std::map<const Instruction *, bool> AccessIsUnsafe;
};
namespace {
@@ -242,7 +250,7 @@ class StackSafetyLocalAnalysis {
ConstantRange getMemIntrinsicAccessRange(const MemIntrinsic *MI, const Use &U,
Value *Base);
- bool analyzeAllUses(Value *Ptr, UseInfo<GlobalValue> &AS,
+ void analyzeAllUses(Value *Ptr, UseInfo<GlobalValue> &AS,
const StackLifetime &SL);
public:
@@ -297,8 +305,8 @@ ConstantRange StackSafetyLocalAnalysis::getAccessRange(Value *Addr, Value *Base,
APInt APSize(PointerSize, Size.getFixedSize(), true);
if (APSize.isNegative())
return UnknownRange;
- return getAccessRange(
- Addr, Base, ConstantRange(APInt::getNullValue(PointerSize), APSize));
+ return getAccessRange(Addr, Base,
+ ConstantRange(APInt::getZero(PointerSize), APSize));
}
ConstantRange StackSafetyLocalAnalysis::getMemIntrinsicAccessRange(
@@ -321,14 +329,13 @@ ConstantRange StackSafetyLocalAnalysis::getMemIntrinsicAccessRange(
if (Sizes.getUpper().isNegative() || isUnsafe(Sizes))
return UnknownRange;
Sizes = Sizes.sextOrTrunc(PointerSize);
- ConstantRange SizeRange(APInt::getNullValue(PointerSize),
- Sizes.getUpper() - 1);
+ ConstantRange SizeRange(APInt::getZero(PointerSize), Sizes.getUpper() - 1);
return getAccessRange(U, Base, SizeRange);
}
/// The function analyzes all local uses of Ptr (alloca or argument) and
/// calculates local access range and all function calls where it was used.
-bool StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr,
+void StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr,
UseInfo<GlobalValue> &US,
const StackLifetime &SL) {
SmallPtrSet<const Value *, 16> Visited;
@@ -349,11 +356,11 @@ bool StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr,
switch (I->getOpcode()) {
case Instruction::Load: {
if (AI && !SL.isAliveAfter(AI, I)) {
- US.updateRange(UnknownRange);
- return false;
+ US.addRange(I, UnknownRange);
+ break;
}
- US.updateRange(
- getAccessRange(UI, Ptr, DL.getTypeStoreSize(I->getType())));
+ US.addRange(I,
+ getAccessRange(UI, Ptr, DL.getTypeStoreSize(I->getType())));
break;
}
@@ -363,15 +370,16 @@ bool StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr,
case Instruction::Store: {
if (V == I->getOperand(0)) {
// Stored the pointer - conservatively assume it may be unsafe.
- US.updateRange(UnknownRange);
- return false;
+ US.addRange(I, UnknownRange);
+ break;
}
if (AI && !SL.isAliveAfter(AI, I)) {
- US.updateRange(UnknownRange);
- return false;
+ US.addRange(I, UnknownRange);
+ break;
}
- US.updateRange(getAccessRange(
- UI, Ptr, DL.getTypeStoreSize(I->getOperand(0)->getType())));
+ US.addRange(
+ I, getAccessRange(
+ UI, Ptr, DL.getTypeStoreSize(I->getOperand(0)->getType())));
break;
}
@@ -379,8 +387,8 @@ bool StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr,
// Information leak.
// FIXME: Process parameters correctly. This is a leak only if we return
// alloca.
- US.updateRange(UnknownRange);
- return false;
+ US.addRange(I, UnknownRange);
+ break;
case Instruction::Call:
case Instruction::Invoke: {
@@ -388,25 +396,31 @@ bool StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr,
break;
if (AI && !SL.isAliveAfter(AI, I)) {
- US.updateRange(UnknownRange);
- return false;
+ US.addRange(I, UnknownRange);
+ break;
}
if (const MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) {
- US.updateRange(getMemIntrinsicAccessRange(MI, UI, Ptr));
+ US.addRange(I, getMemIntrinsicAccessRange(MI, UI, Ptr));
break;
}
const auto &CB = cast<CallBase>(*I);
+ if (CB.getReturnedArgOperand() == V) {
+ if (Visited.insert(I).second)
+ WorkList.push_back(cast<const Instruction>(I));
+ }
+
if (!CB.isArgOperand(&UI)) {
- US.updateRange(UnknownRange);
- return false;
+ US.addRange(I, UnknownRange);
+ break;
}
unsigned ArgNo = CB.getArgOperandNo(&UI);
if (CB.isByValArgument(ArgNo)) {
- US.updateRange(getAccessRange(
- UI, Ptr, DL.getTypeStoreSize(CB.getParamByValType(ArgNo))));
+ US.addRange(I, getAccessRange(
+ UI, Ptr,
+ DL.getTypeStoreSize(CB.getParamByValType(ArgNo))));
break;
}
@@ -416,8 +430,8 @@ bool StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr,
const GlobalValue *Callee =
dyn_cast<GlobalValue>(CB.getCalledOperand()->stripPointerCasts());
if (!Callee) {
- US.updateRange(UnknownRange);
- return false;
+ US.addRange(I, UnknownRange);
+ break;
}
assert(isa<Function>(Callee) || isa<GlobalAlias>(Callee));
@@ -435,8 +449,6 @@ bool StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr,
}
}
}
-
- return true;
}
FunctionInfo<GlobalValue> StackSafetyLocalAnalysis::run() {
@@ -468,7 +480,7 @@ FunctionInfo<GlobalValue> StackSafetyLocalAnalysis::run() {
}
LLVM_DEBUG(Info.print(dbgs(), F.getName(), &F));
- LLVM_DEBUG(dbgs() << "[StackSafety] done\n");
+ LLVM_DEBUG(dbgs() << "\n[StackSafety] done\n");
return Info;
}
@@ -588,8 +600,7 @@ void StackSafetyDataFlowAnalysis<CalleeTy>::runDataFlow() {
updateAllNodes();
while (!WorkList.empty()) {
- const CalleeTy *Callee = WorkList.back();
- WorkList.pop_back();
+ const CalleeTy *Callee = WorkList.pop_back_val();
updateOneNode(Callee);
}
}
@@ -674,7 +685,7 @@ const Function *findCalleeInModule(const GlobalValue *GV) {
const GlobalAlias *A = dyn_cast<GlobalAlias>(GV);
if (!A)
return nullptr;
- GV = A->getBaseObject();
+ GV = A->getAliaseeObject();
if (GV == A)
return nullptr;
}
@@ -741,10 +752,8 @@ GVToSSI createGlobalStackSafetyInfo(
KV.second.Calls.clear();
}
- uint32_t PointerSize = Copy.begin()
- ->first->getParent()
- ->getDataLayout()
- .getMaxPointerSizeInBits();
+ uint32_t PointerSize =
+ Copy.begin()->first->getParent()->getDataLayout().getPointerSizeInBits();
StackSafetyDataFlowAnalysis<GlobalValue> SSDFA(PointerSize, std::move(Copy));
for (auto &F : SSDFA.run()) {
@@ -794,6 +803,7 @@ const StackSafetyInfo::InfoTy &StackSafetyInfo::getInfo() const {
void StackSafetyInfo::print(raw_ostream &O) const {
getInfo().Info.print(O, F->getName(), dyn_cast<Function>(F));
+ O << "\n";
}
const StackSafetyGlobalInfo::InfoTy &StackSafetyGlobalInfo::getInfo() const {
@@ -806,17 +816,22 @@ const StackSafetyGlobalInfo::InfoTy &StackSafetyGlobalInfo::getInfo() const {
}
}
Info.reset(new InfoTy{
- createGlobalStackSafetyInfo(std::move(Functions), Index), {}});
+ createGlobalStackSafetyInfo(std::move(Functions), Index), {}, {}});
+
for (auto &FnKV : Info->Info) {
for (auto &KV : FnKV.second.Allocas) {
++NumAllocaTotal;
const AllocaInst *AI = KV.first;
- if (getStaticAllocaSizeRange(*AI).contains(KV.second.Range)) {
+ auto AIRange = getStaticAllocaSizeRange(*AI);
+ if (AIRange.contains(KV.second.Range)) {
Info->SafeAllocas.insert(AI);
++NumAllocaStackSafe;
}
+ for (const auto &A : KV.second.Accesses)
+ Info->AccessIsUnsafe[A.first] |= !AIRange.contains(A.second);
}
}
+
if (StackSafetyPrint)
print(errs());
}
@@ -886,6 +901,15 @@ bool StackSafetyGlobalInfo::isSafe(const AllocaInst &AI) const {
return Info.SafeAllocas.count(&AI);
}
+bool StackSafetyGlobalInfo::stackAccessIsSafe(const Instruction &I) const {
+ const auto &Info = getInfo();
+ auto It = Info.AccessIsUnsafe.find(&I);
+ if (It == Info.AccessIsUnsafe.end()) {
+ return true;
+ }
+ return !It->second;
+}
+
void StackSafetyGlobalInfo::print(raw_ostream &O) const {
auto &SSI = getInfo().Info;
if (SSI.empty())
@@ -894,6 +918,16 @@ void StackSafetyGlobalInfo::print(raw_ostream &O) const {
for (auto &F : M.functions()) {
if (!F.isDeclaration()) {
SSI.find(&F)->second.print(O, F.getName(), &F);
+ O << " safe accesses:"
+ << "\n";
+ for (const auto &I : instructions(F)) {
+ const CallInst *Call = dyn_cast<CallInst>(&I);
+ if ((isa<StoreInst>(I) || isa<LoadInst>(I) || isa<MemIntrinsic>(I) ||
+ (Call && Call->hasByValArgument())) &&
+ stackAccessIsSafe(I)) {
+ O << " " << I << "\n";
+ }
+ }
O << "\n";
}
}
diff --git a/llvm/lib/Analysis/TFUtils.cpp b/llvm/lib/Analysis/TFUtils.cpp
index e93dc303ae63..3d10479c4544 100644
--- a/llvm/lib/Analysis/TFUtils.cpp
+++ b/llvm/lib/Analysis/TFUtils.cpp
@@ -1,9 +1,8 @@
//===- TFUtils.cpp - tensorflow evaluation utilities ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -262,29 +261,58 @@ private:
class LoggerDataImpl {
const std::vector<LoggedFeatureSpec> LoggedFeatureSpecs;
const TensorSpec RewardSpec;
+ const bool IncludeReward;
+
+ std::vector<tensorflow::FeatureList> FeatureLists;
+ tensorflow::FeatureList Reward;
+
+ bool isSelfConsistent(const tensorflow::SequenceExample &SE,
+ size_t NrRecords) const {
+ bool Ret = true;
+ for (const auto &TSpecs : LoggedFeatureSpecs) {
+ const auto &Name = TSpecs.getLoggingName();
+ const auto &FL = SE.feature_lists().feature_list().at(Name).feature();
+ if (NrRecords != static_cast<size_t>(FL.size())) {
+ dbgs() << "[TF-UTILS]: " << Name << " has missing records. Expected "
+ << NrRecords << " got " << FL.size() << "\n";
+ Ret = false;
+ }
+ }
+ if (IncludeReward && static_cast<size_t>(SE.feature_lists()
+ .feature_list()
+ .at(RewardSpec.name())
+ .feature()
+ .size()) != NrRecords) {
+ dbgs() << "[TF-UTILS]: reward is missing records.\n";
+ Ret = false;
+ }
+ return Ret;
+ }
- tensorflow::SequenceExample SE;
- std::vector<tensorflow::FeatureList *> FeatureLists;
- tensorflow::FeatureList *Reward = nullptr;
-
-public:
- LoggerDataImpl(const std::vector<LoggedFeatureSpec> &LoggedSpecs,
- const TensorSpec &RewardSpec, bool IncludeReward)
- : LoggedFeatureSpecs(LoggedSpecs), RewardSpec(RewardSpec) {
+ void transferLog(tensorflow::SequenceExample &SE) {
auto *FL = SE.mutable_feature_lists()->mutable_feature_list();
if (IncludeReward)
- Reward = &(*FL)[RewardSpec.name()];
- // Allocate first the map entries, then capture their address. We will not
- // mutate the set of features after this (i.e. the pointers won't dangle).
- for (const auto &LFS : LoggedSpecs) {
- (*FL)[LFS.LoggingName ? *LFS.LoggingName : LFS.Spec.name()] = {};
+ (*FL)[RewardSpec.name()] = std::move(Reward);
+ assert(FeatureLists.size() == LoggedFeatureSpecs.size());
+ for (size_t I = 0; I < FeatureLists.size(); ++I) {
+ const auto &LFS = LoggedFeatureSpecs[I];
+ (*FL)[LFS.getLoggingName()] = std::move(FeatureLists[I]);
}
- for (const auto &LFS : LoggedSpecs)
- FeatureLists.push_back(
- &(*FL)[LFS.LoggingName ? *LFS.LoggingName : LFS.Spec.name()]);
}
- void print(raw_ostream &OS) {
+public:
+ LoggerDataImpl(const std::vector<LoggedFeatureSpec> &LoggedSpecs,
+ const TensorSpec &RewardSpec, bool IncludeReward)
+ : LoggedFeatureSpecs(LoggedSpecs), RewardSpec(RewardSpec),
+ IncludeReward(IncludeReward), FeatureLists(LoggedFeatureSpecs.size()) {}
+
+ // flush the logged info to a stream and clear the log contents.
+ void flush(raw_ostream &OS) {
+ size_t NrRecords = getNrRecords();
+ (void)NrRecords;
+ tensorflow::SequenceExample SE;
+ transferLog(SE);
+ assert(isSelfConsistent(SE, NrRecords));
std::string OutStr;
if (ProtobufTextMode)
google::protobuf::TextFormat::PrintToString(SE, &OutStr);
@@ -298,14 +326,14 @@ public:
const auto &Spec = LoggedFeatureSpecs[FeatureID].Spec;
if (Spec.isElementType<float>()) {
auto *RF = FeatureLists[FeatureID]
- ->add_feature()
+ .add_feature()
->mutable_float_list()
->mutable_value();
RF->Resize(Spec.getElementCount(), 0.0);
return reinterpret_cast<char *>(RF->mutable_data());
} else if (Spec.isElementType<int32_t>() || Spec.isElementType<int64_t>()) {
auto *RF = FeatureLists[FeatureID]
- ->add_feature()
+ .add_feature()
->mutable_int64_list()
->mutable_value();
RF->Resize(Spec.getElementCount(), 0);
@@ -315,17 +343,18 @@ public:
}
template <typename T> void logReward(T Value) {
+ assert(IncludeReward);
if (RewardSpec.isElementType<float>())
- Reward->add_feature()->mutable_float_list()->add_value(Value);
+ Reward.add_feature()->mutable_float_list()->add_value(Value);
else if (RewardSpec.isElementType<int32_t>() ||
RewardSpec.isElementType<int64_t>())
- Reward->add_feature()->mutable_int64_list()->add_value(Value);
+ Reward.add_feature()->mutable_int64_list()->add_value(Value);
else
llvm_unreachable("Unsupported tensor type.");
}
size_t getNrRecords() const {
- return FeatureLists.empty() ? 0 : FeatureLists[0]->feature().size();
+ return FeatureLists.empty() ? 0 : FeatureLists[0].feature().size();
}
};
} // namespace llvm
@@ -538,5 +567,5 @@ char *Logger::addEntryAndGetFloatOrInt64Buffer(size_t FeatureID) {
return reinterpret_cast<char *>(LoggerData->addNewTensor(FeatureID));
}
-void Logger::print(raw_ostream &OS) { LoggerData->print(OS); }
+void Logger::flush(raw_ostream &OS) { LoggerData->flush(OS); }
#endif // defined(LLVM_HAVE_TF_API)
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index 4a8818f2e2a8..7326ba74c071 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -123,6 +123,7 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
// Set IO unlocked variants as unavailable
// Set them as available per system below
+ TLI.setUnavailable(LibFunc_getc_unlocked);
TLI.setUnavailable(LibFunc_getchar_unlocked);
TLI.setUnavailable(LibFunc_putc_unlocked);
TLI.setUnavailable(LibFunc_putchar_unlocked);
@@ -156,15 +157,10 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
// isn't true for a target those defaults should be overridden below.
TLI.setIntSize(T.isArch16Bit() ? 16 : 32);
- if (T.isAMDGPU())
- TLI.disableAllFunctions();
-
- // There are no library implementations of memcpy and memset for AMD gpus and
- // these can be difficult to lower in the backend.
+ // There is really no runtime library on AMDGPU, apart from
+ // __kmpc_alloc/free_shared.
if (T.isAMDGPU()) {
- TLI.setUnavailable(LibFunc_memcpy);
- TLI.setUnavailable(LibFunc_memset);
- TLI.setUnavailable(LibFunc_memset_pattern16);
+ TLI.disableAllFunctions();
TLI.setAvailable(llvm::LibFunc___kmpc_alloc_shared);
TLI.setAvailable(llvm::LibFunc___kmpc_free_shared);
return;
@@ -418,6 +414,65 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setUnavailable(LibFunc_utimes);
}
+ // Pick just one set of new/delete variants.
+ if (T.isOSMSVCRT()) {
+ // MSVC doesn't have the Itanium new/delete.
+ TLI.setUnavailable(LibFunc_ZdaPv);
+ TLI.setUnavailable(LibFunc_ZdaPvRKSt9nothrow_t);
+ TLI.setUnavailable(LibFunc_ZdaPvSt11align_val_t);
+ TLI.setUnavailable(LibFunc_ZdaPvSt11align_val_tRKSt9nothrow_t);
+ TLI.setUnavailable(LibFunc_ZdaPvj);
+ TLI.setUnavailable(LibFunc_ZdaPvjSt11align_val_t);
+ TLI.setUnavailable(LibFunc_ZdaPvm);
+ TLI.setUnavailable(LibFunc_ZdaPvmSt11align_val_t);
+ TLI.setUnavailable(LibFunc_ZdlPv);
+ TLI.setUnavailable(LibFunc_ZdlPvRKSt9nothrow_t);
+ TLI.setUnavailable(LibFunc_ZdlPvSt11align_val_t);
+ TLI.setUnavailable(LibFunc_ZdlPvSt11align_val_tRKSt9nothrow_t);
+ TLI.setUnavailable(LibFunc_ZdlPvj);
+ TLI.setUnavailable(LibFunc_ZdlPvjSt11align_val_t);
+ TLI.setUnavailable(LibFunc_ZdlPvm);
+ TLI.setUnavailable(LibFunc_ZdlPvmSt11align_val_t);
+ TLI.setUnavailable(LibFunc_Znaj);
+ TLI.setUnavailable(LibFunc_ZnajRKSt9nothrow_t);
+ TLI.setUnavailable(LibFunc_ZnajSt11align_val_t);
+ TLI.setUnavailable(LibFunc_ZnajSt11align_val_tRKSt9nothrow_t);
+ TLI.setUnavailable(LibFunc_Znam);
+ TLI.setUnavailable(LibFunc_ZnamRKSt9nothrow_t);
+ TLI.setUnavailable(LibFunc_ZnamSt11align_val_t);
+ TLI.setUnavailable(LibFunc_ZnamSt11align_val_tRKSt9nothrow_t);
+ TLI.setUnavailable(LibFunc_Znwj);
+ TLI.setUnavailable(LibFunc_ZnwjRKSt9nothrow_t);
+ TLI.setUnavailable(LibFunc_ZnwjSt11align_val_t);
+ TLI.setUnavailable(LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t);
+ TLI.setUnavailable(LibFunc_Znwm);
+ TLI.setUnavailable(LibFunc_ZnwmRKSt9nothrow_t);
+ TLI.setUnavailable(LibFunc_ZnwmSt11align_val_t);
+ TLI.setUnavailable(LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t);
+ } else {
+ // Not MSVC, assume it's Itanium.
+ TLI.setUnavailable(LibFunc_msvc_new_int);
+ TLI.setUnavailable(LibFunc_msvc_new_int_nothrow);
+ TLI.setUnavailable(LibFunc_msvc_new_longlong);
+ TLI.setUnavailable(LibFunc_msvc_new_longlong_nothrow);
+ TLI.setUnavailable(LibFunc_msvc_delete_ptr32);
+ TLI.setUnavailable(LibFunc_msvc_delete_ptr32_nothrow);
+ TLI.setUnavailable(LibFunc_msvc_delete_ptr32_int);
+ TLI.setUnavailable(LibFunc_msvc_delete_ptr64);
+ TLI.setUnavailable(LibFunc_msvc_delete_ptr64_nothrow);
+ TLI.setUnavailable(LibFunc_msvc_delete_ptr64_longlong);
+ TLI.setUnavailable(LibFunc_msvc_new_array_int);
+ TLI.setUnavailable(LibFunc_msvc_new_array_int_nothrow);
+ TLI.setUnavailable(LibFunc_msvc_new_array_longlong);
+ TLI.setUnavailable(LibFunc_msvc_new_array_longlong_nothrow);
+ TLI.setUnavailable(LibFunc_msvc_delete_array_ptr32);
+ TLI.setUnavailable(LibFunc_msvc_delete_array_ptr32_nothrow);
+ TLI.setUnavailable(LibFunc_msvc_delete_array_ptr32_int);
+ TLI.setUnavailable(LibFunc_msvc_delete_array_ptr64);
+ TLI.setUnavailable(LibFunc_msvc_delete_array_ptr64_nothrow);
+ TLI.setUnavailable(LibFunc_msvc_delete_array_ptr64_longlong);
+ }
+
switch (T.getOS()) {
case Triple::MacOSX:
// exp10 and exp10f are not available on OS X until 10.9 and iOS until 7.0
@@ -572,6 +627,9 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setUnavailable(LibFunc_sinh_finite);
TLI.setUnavailable(LibFunc_sinhf_finite);
TLI.setUnavailable(LibFunc_sinhl_finite);
+ TLI.setUnavailable(LibFunc_sqrt_finite);
+ TLI.setUnavailable(LibFunc_sqrtf_finite);
+ TLI.setUnavailable(LibFunc_sqrtl_finite);
}
if ((T.isOSLinux() && T.isGNUEnvironment()) ||
@@ -589,6 +647,140 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setAvailable(LibFunc_fgets_unlocked);
}
+ if (T.isAndroid() && T.isAndroidVersionLT(21)) {
+ TLI.setUnavailable(LibFunc_stpcpy);
+ TLI.setUnavailable(LibFunc_stpncpy);
+ }
+
+ if (T.isPS4()) {
+ // PS4 does have memalign.
+ TLI.setAvailable(LibFunc_memalign);
+
+ // PS4 does not have new/delete with "unsigned int" size parameter;
+ // it only has the "unsigned long" versions.
+ TLI.setUnavailable(LibFunc_ZdaPvj);
+ TLI.setUnavailable(LibFunc_ZdaPvjSt11align_val_t);
+ TLI.setUnavailable(LibFunc_ZdlPvj);
+ TLI.setUnavailable(LibFunc_ZdlPvjSt11align_val_t);
+ TLI.setUnavailable(LibFunc_Znaj);
+ TLI.setUnavailable(LibFunc_ZnajRKSt9nothrow_t);
+ TLI.setUnavailable(LibFunc_ZnajSt11align_val_t);
+ TLI.setUnavailable(LibFunc_ZnajSt11align_val_tRKSt9nothrow_t);
+ TLI.setUnavailable(LibFunc_Znwj);
+ TLI.setUnavailable(LibFunc_ZnwjRKSt9nothrow_t);
+ TLI.setUnavailable(LibFunc_ZnwjSt11align_val_t);
+ TLI.setUnavailable(LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t);
+
+ // None of the *_chk functions.
+ TLI.setUnavailable(LibFunc_memccpy_chk);
+ TLI.setUnavailable(LibFunc_memcpy_chk);
+ TLI.setUnavailable(LibFunc_memmove_chk);
+ TLI.setUnavailable(LibFunc_mempcpy_chk);
+ TLI.setUnavailable(LibFunc_memset_chk);
+ TLI.setUnavailable(LibFunc_snprintf_chk);
+ TLI.setUnavailable(LibFunc_sprintf_chk);
+ TLI.setUnavailable(LibFunc_stpcpy_chk);
+ TLI.setUnavailable(LibFunc_stpncpy_chk);
+ TLI.setUnavailable(LibFunc_strcat_chk);
+ TLI.setUnavailable(LibFunc_strcpy_chk);
+ TLI.setUnavailable(LibFunc_strlcat_chk);
+ TLI.setUnavailable(LibFunc_strlcpy_chk);
+ TLI.setUnavailable(LibFunc_strlen_chk);
+ TLI.setUnavailable(LibFunc_strncat_chk);
+ TLI.setUnavailable(LibFunc_strncpy_chk);
+ TLI.setUnavailable(LibFunc_vsnprintf_chk);
+ TLI.setUnavailable(LibFunc_vsprintf_chk);
+
+ // Various Posix system functions.
+ TLI.setUnavailable(LibFunc_access);
+ TLI.setUnavailable(LibFunc_chmod);
+ TLI.setUnavailable(LibFunc_chown);
+ TLI.setUnavailable(LibFunc_closedir);
+ TLI.setUnavailable(LibFunc_ctermid);
+ TLI.setUnavailable(LibFunc_execl);
+ TLI.setUnavailable(LibFunc_execle);
+ TLI.setUnavailable(LibFunc_execlp);
+ TLI.setUnavailable(LibFunc_execv);
+ TLI.setUnavailable(LibFunc_execvP);
+ TLI.setUnavailable(LibFunc_execve);
+ TLI.setUnavailable(LibFunc_execvp);
+ TLI.setUnavailable(LibFunc_execvpe);
+ TLI.setUnavailable(LibFunc_fork);
+ TLI.setUnavailable(LibFunc_fstat);
+ TLI.setUnavailable(LibFunc_fstatvfs);
+ TLI.setUnavailable(LibFunc_getenv);
+ TLI.setUnavailable(LibFunc_getitimer);
+ TLI.setUnavailable(LibFunc_getlogin_r);
+ TLI.setUnavailable(LibFunc_getpwnam);
+ TLI.setUnavailable(LibFunc_gettimeofday);
+ TLI.setUnavailable(LibFunc_lchown);
+ TLI.setUnavailable(LibFunc_lstat);
+ TLI.setUnavailable(LibFunc_mkdir);
+ TLI.setUnavailable(LibFunc_open);
+ TLI.setUnavailable(LibFunc_opendir);
+ TLI.setUnavailable(LibFunc_pclose);
+ TLI.setUnavailable(LibFunc_popen);
+ TLI.setUnavailable(LibFunc_pread);
+ TLI.setUnavailable(LibFunc_pwrite);
+ TLI.setUnavailable(LibFunc_read);
+ TLI.setUnavailable(LibFunc_readlink);
+ TLI.setUnavailable(LibFunc_realpath);
+ TLI.setUnavailable(LibFunc_rename);
+ TLI.setUnavailable(LibFunc_rmdir);
+ TLI.setUnavailable(LibFunc_setitimer);
+ TLI.setUnavailable(LibFunc_stat);
+ TLI.setUnavailable(LibFunc_statvfs);
+ TLI.setUnavailable(LibFunc_system);
+ TLI.setUnavailable(LibFunc_times);
+ TLI.setUnavailable(LibFunc_tmpfile);
+ TLI.setUnavailable(LibFunc_unlink);
+ TLI.setUnavailable(LibFunc_uname);
+ TLI.setUnavailable(LibFunc_unsetenv);
+ TLI.setUnavailable(LibFunc_utime);
+ TLI.setUnavailable(LibFunc_utimes);
+ TLI.setUnavailable(LibFunc_valloc);
+ TLI.setUnavailable(LibFunc_write);
+
+ // Miscellaneous other functions not provided.
+ TLI.setUnavailable(LibFunc_atomic_load);
+ TLI.setUnavailable(LibFunc_atomic_store);
+ TLI.setUnavailable(LibFunc___kmpc_alloc_shared);
+ TLI.setUnavailable(LibFunc___kmpc_free_shared);
+ TLI.setUnavailable(LibFunc_dunder_strndup);
+ TLI.setUnavailable(LibFunc_bcmp);
+ TLI.setUnavailable(LibFunc_bcopy);
+ TLI.setUnavailable(LibFunc_bzero);
+ TLI.setUnavailable(LibFunc_cabs);
+ TLI.setUnavailable(LibFunc_cabsf);
+ TLI.setUnavailable(LibFunc_cabsl);
+ TLI.setUnavailable(LibFunc_ffs);
+ TLI.setUnavailable(LibFunc_flockfile);
+ TLI.setUnavailable(LibFunc_fseeko);
+ TLI.setUnavailable(LibFunc_ftello);
+ TLI.setUnavailable(LibFunc_ftrylockfile);
+ TLI.setUnavailable(LibFunc_funlockfile);
+ TLI.setUnavailable(LibFunc_htonl);
+ TLI.setUnavailable(LibFunc_htons);
+ TLI.setUnavailable(LibFunc_isascii);
+ TLI.setUnavailable(LibFunc_memccpy);
+ TLI.setUnavailable(LibFunc_mempcpy);
+ TLI.setUnavailable(LibFunc_memrchr);
+ TLI.setUnavailable(LibFunc_ntohl);
+ TLI.setUnavailable(LibFunc_ntohs);
+ TLI.setUnavailable(LibFunc_reallocf);
+ TLI.setUnavailable(LibFunc_roundeven);
+ TLI.setUnavailable(LibFunc_roundevenf);
+ TLI.setUnavailable(LibFunc_roundevenl);
+ TLI.setUnavailable(LibFunc_stpcpy);
+ TLI.setUnavailable(LibFunc_stpncpy);
+ TLI.setUnavailable(LibFunc_strlcat);
+ TLI.setUnavailable(LibFunc_strlcpy);
+ TLI.setUnavailable(LibFunc_strndup);
+ TLI.setUnavailable(LibFunc_strnlen);
+ TLI.setUnavailable(LibFunc_toascii);
+ }
+
// As currently implemented in clang, NVPTX code has no standard library to
// speak of. Headers provide a standard-ish library implementation, but many
// of the signatures are wrong -- for example, many libm functions are not
@@ -691,7 +883,7 @@ TargetLibraryInfoImpl &TargetLibraryInfoImpl::operator=(TargetLibraryInfoImpl &&
static StringRef sanitizeFunctionName(StringRef funcName) {
// Filter out empty names and names containing null bytes, those can't be in
// our table.
- if (funcName.empty() || funcName.find('\0') != StringRef::npos)
+ if (funcName.empty() || funcName.contains('\0'))
return StringRef();
// Check for \01 prefix that is used to mangle __asm declarations and
@@ -716,12 +908,12 @@ bool TargetLibraryInfoImpl::getLibFunc(StringRef funcName, LibFunc &F) const {
bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
LibFunc F,
- const DataLayout *DL) const {
- LLVMContext &Ctx = FTy.getContext();
- Type *SizeTTy = DL ? DL->getIntPtrType(Ctx, /*AddressSpace=*/0) : nullptr;
- auto IsSizeTTy = [SizeTTy](Type *Ty) {
- return SizeTTy ? Ty == SizeTTy : Ty->isIntegerTy();
- };
+ const Module &M) const {
+ // FIXME: There is really no guarantee that sizeof(size_t) is equal to
+ // sizeof(int*) for every target. So the assumption used here to derive the
+ // SizeTBits based on the size of an integer pointer in address space zero
+ // isn't always valid.
+ unsigned SizeTBits = M.getDataLayout().getPointerSizeInBits(/*AddrSpace=*/0);
unsigned NumParams = FTy.getNumParams();
switch (F) {
@@ -745,12 +937,12 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
FTy.getReturnType()->isIntegerTy(32));
case LibFunc_strlen_chk:
--NumParams;
- if (!IsSizeTTy(FTy.getParamType(NumParams)))
+ if (!FTy.getParamType(NumParams)->isIntegerTy(SizeTBits))
return false;
LLVM_FALLTHROUGH;
case LibFunc_strlen:
- return (NumParams == 1 && FTy.getParamType(0)->isPointerTy() &&
- FTy.getReturnType()->isIntegerTy());
+ return NumParams == 1 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getReturnType()->isIntegerTy(SizeTBits);
case LibFunc_strchr:
case LibFunc_strrchr:
@@ -770,7 +962,7 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
FTy.getParamType(1)->isPointerTy());
case LibFunc_strcat_chk:
--NumParams;
- if (!IsSizeTTy(FTy.getParamType(NumParams)))
+ if (!FTy.getParamType(NumParams)->isIntegerTy(SizeTBits))
return false;
LLVM_FALLTHROUGH;
case LibFunc_strcat:
@@ -780,19 +972,19 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_strncat_chk:
--NumParams;
- if (!IsSizeTTy(FTy.getParamType(NumParams)))
+ if (!FTy.getParamType(NumParams)->isIntegerTy(SizeTBits))
return false;
LLVM_FALLTHROUGH;
case LibFunc_strncat:
return (NumParams == 3 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0) == FTy.getReturnType() &&
FTy.getParamType(1) == FTy.getReturnType() &&
- IsSizeTTy(FTy.getParamType(2)));
+ FTy.getParamType(2)->isIntegerTy(SizeTBits));
case LibFunc_strcpy_chk:
case LibFunc_stpcpy_chk:
--NumParams;
- if (!IsSizeTTy(FTy.getParamType(NumParams)))
+ if (!FTy.getParamType(NumParams)->isIntegerTy(SizeTBits))
return false;
LLVM_FALLTHROUGH;
case LibFunc_strcpy:
@@ -804,20 +996,20 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_strlcat_chk:
case LibFunc_strlcpy_chk:
--NumParams;
- if (!IsSizeTTy(FTy.getParamType(NumParams)))
+ if (!FTy.getParamType(NumParams)->isIntegerTy(SizeTBits))
return false;
LLVM_FALLTHROUGH;
case LibFunc_strlcat:
case LibFunc_strlcpy:
- return NumParams == 3 && IsSizeTTy(FTy.getReturnType()) &&
+ return NumParams == 3 && FTy.getReturnType()->isIntegerTy(SizeTBits) &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy() &&
- IsSizeTTy(FTy.getParamType(2));
+ FTy.getParamType(2)->isIntegerTy(SizeTBits);
case LibFunc_strncpy_chk:
case LibFunc_stpncpy_chk:
--NumParams;
- if (!IsSizeTTy(FTy.getParamType(NumParams)))
+ if (!FTy.getParamType(NumParams)->isIntegerTy(SizeTBits))
return false;
LLVM_FALLTHROUGH;
case LibFunc_strncpy:
@@ -825,7 +1017,7 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
return (NumParams == 3 && FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getParamType(0) == FTy.getParamType(1) &&
FTy.getParamType(0)->isPointerTy() &&
- IsSizeTTy(FTy.getParamType(2)));
+ FTy.getParamType(2)->isIntegerTy(SizeTBits));
case LibFunc_strxfrm:
return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
@@ -840,7 +1032,7 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
return (NumParams == 3 && FTy.getReturnType()->isIntegerTy(32) &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(0) == FTy.getParamType(1) &&
- IsSizeTTy(FTy.getParamType(2)));
+ FTy.getParamType(2)->isIntegerTy(SizeTBits));
case LibFunc_strspn:
case LibFunc_strcspn:
@@ -888,20 +1080,21 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_sprintf_chk:
return NumParams == 4 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isIntegerTy(32) &&
- IsSizeTTy(FTy.getParamType(2)) &&
+ FTy.getParamType(2)->isIntegerTy(SizeTBits) &&
FTy.getParamType(3)->isPointerTy() &&
FTy.getReturnType()->isIntegerTy(32);
case LibFunc_snprintf:
- return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
- FTy.getParamType(2)->isPointerTy() &&
- FTy.getReturnType()->isIntegerTy(32));
+ return NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isIntegerTy(SizeTBits) &&
+ FTy.getParamType(2)->isPointerTy() &&
+ FTy.getReturnType()->isIntegerTy(32);
case LibFunc_snprintf_chk:
return NumParams == 5 && FTy.getParamType(0)->isPointerTy() &&
- IsSizeTTy(FTy.getParamType(1)) &&
+ FTy.getParamType(1)->isIntegerTy(SizeTBits) &&
FTy.getParamType(2)->isIntegerTy(32) &&
- IsSizeTTy(FTy.getParamType(3)) &&
+ FTy.getParamType(3)->isIntegerTy(SizeTBits) &&
FTy.getParamType(4)->isPointerTy() &&
FTy.getReturnType()->isIntegerTy(32);
@@ -915,16 +1108,17 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_vec_malloc:
return (NumParams == 1 && FTy.getReturnType()->isPointerTy());
case LibFunc_memcmp:
- return (NumParams == 3 && FTy.getReturnType()->isIntegerTy(32) &&
- FTy.getParamType(0)->isPointerTy() &&
- FTy.getParamType(1)->isPointerTy());
+ return NumParams == 3 && FTy.getReturnType()->isIntegerTy(32) &&
+ FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy() &&
+ FTy.getParamType(2)->isIntegerTy(SizeTBits);
case LibFunc_memchr:
case LibFunc_memrchr:
return (NumParams == 3 && FTy.getReturnType()->isPointerTy() &&
FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getParamType(1)->isIntegerTy(32) &&
- IsSizeTTy(FTy.getParamType(2)));
+ FTy.getParamType(2)->isIntegerTy(SizeTBits));
case LibFunc_modf:
case LibFunc_modff:
case LibFunc_modfl:
@@ -934,7 +1128,7 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_mempcpy_chk:
case LibFunc_memmove_chk:
--NumParams;
- if (!IsSizeTTy(FTy.getParamType(NumParams)))
+ if (!FTy.getParamType(NumParams)->isIntegerTy(SizeTBits))
return false;
LLVM_FALLTHROUGH;
case LibFunc_memcpy:
@@ -943,22 +1137,22 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
return (NumParams == 3 && FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy() &&
- IsSizeTTy(FTy.getParamType(2)));
+ FTy.getParamType(2)->isIntegerTy(SizeTBits));
case LibFunc_memset_chk:
--NumParams;
- if (!IsSizeTTy(FTy.getParamType(NumParams)))
+ if (!FTy.getParamType(NumParams)->isIntegerTy(SizeTBits))
return false;
LLVM_FALLTHROUGH;
case LibFunc_memset:
return (NumParams == 3 && FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isIntegerTy() &&
- IsSizeTTy(FTy.getParamType(2)));
+ FTy.getParamType(2)->isIntegerTy(SizeTBits));
case LibFunc_memccpy_chk:
--NumParams;
- if (!IsSizeTTy(FTy.getParamType(NumParams)))
+ if (!FTy.getParamType(NumParams)->isIntegerTy(SizeTBits))
return false;
LLVM_FALLTHROUGH;
case LibFunc_memccpy:
@@ -970,7 +1164,7 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_vec_realloc:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0) == FTy.getReturnType() &&
- IsSizeTTy(FTy.getParamType(1)));
+ FTy.getParamType(1)->isIntegerTy(SizeTBits));
case LibFunc_read:
return (NumParams == 3 && FTy.getParamType(1)->isPointerTy());
case LibFunc_rewind:
@@ -1051,7 +1245,7 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
return (NumParams != 0 && FTy.getParamType(0)->isPointerTy());
case LibFunc___kmpc_free_shared:
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
- IsSizeTTy(FTy.getParamType(1)));
+ FTy.getParamType(1)->isIntegerTy(SizeTBits));
case LibFunc_fopen:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
@@ -1141,14 +1335,14 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_vsprintf_chk:
return NumParams == 5 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isIntegerTy(32) &&
- IsSizeTTy(FTy.getParamType(2)) && FTy.getParamType(3)->isPointerTy();
+ FTy.getParamType(2)->isIntegerTy(SizeTBits) && FTy.getParamType(3)->isPointerTy();
case LibFunc_vsnprintf:
return (NumParams == 4 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(2)->isPointerTy());
case LibFunc_vsnprintf_chk:
return NumParams == 6 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(2)->isIntegerTy(32) &&
- IsSizeTTy(FTy.getParamType(3)) && FTy.getParamType(4)->isPointerTy();
+ FTy.getParamType(3)->isIntegerTy(SizeTBits) && FTy.getParamType(4)->isPointerTy();
case LibFunc_open:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy());
case LibFunc_opendir:
@@ -1560,12 +1754,13 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_strnlen:
return (NumParams == 2 && FTy.getReturnType() == FTy.getParamType(1) &&
FTy.getParamType(0)->isPointerTy() &&
- IsSizeTTy(FTy.getParamType(1)));
+ FTy.getParamType(1)->isIntegerTy(SizeTBits));
case LibFunc_posix_memalign:
return (NumParams == 3 && FTy.getReturnType()->isIntegerTy(32) &&
FTy.getParamType(0)->isPointerTy() &&
- IsSizeTTy(FTy.getParamType(1)) && IsSizeTTy(FTy.getParamType(2)));
+ FTy.getParamType(1)->isIntegerTy(SizeTBits) &&
+ FTy.getParamType(2)->isIntegerTy(SizeTBits));
case LibFunc_wcslen:
return (NumParams == 1 && FTy.getParamType(0)->isPointerTy() &&
@@ -1605,10 +1800,11 @@ bool TargetLibraryInfoImpl::getLibFunc(const Function &FDecl,
// avoid string normalization and comparison.
if (FDecl.isIntrinsic()) return false;
- const DataLayout *DL =
- FDecl.getParent() ? &FDecl.getParent()->getDataLayout() : nullptr;
+ const Module *M = FDecl.getParent();
+ assert(M && "Expecting FDecl to be connected to a Module.");
+
return getLibFunc(FDecl.getName(), F) &&
- isValidProtoForLibFunc(*FDecl.getFunctionType(), F, DL);
+ isValidProtoForLibFunc(*FDecl.getFunctionType(), F, *M);
}
void TargetLibraryInfoImpl::disableAllFunctions() {
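A minimal sketch of where the SizeTBits value used in the checks above can come from, now that getLibFunc passes the Module down: assuming size_t matches the pointer width in address space 0 (true for the common targets, but stated here as an assumption), the width falls out of the DataLayout. The helper name below is illustrative, not part of this patch.

#include "llvm/IR/Module.h"

static unsigned getSizeTBits(const llvm::Module &M) {
  // Assumption: size_t is as wide as a default-address-space pointer.
  return M.getDataLayout().getPointerSizeInBits(/*AddrSpace=*/0);
}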
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 304d24fe8e4a..5067f493f02d 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -167,11 +167,7 @@ bool HardwareLoopInfo::isHardwareLoopCandidate(ScalarEvolution &SE,
// Note that this block may not be the loop latch block, even if the loop
// has a latch block.
ExitBlock = BB;
- TripCount = SE.getAddExpr(EC, SE.getOne(EC->getType()));
-
- if (!EC->getType()->isPointerTy() && EC->getType() != CountType)
- TripCount = SE.getZeroExtendExpr(TripCount, CountType);
-
+ ExitCount = EC;
break;
}
@@ -263,10 +259,20 @@ bool TargetTransformInfo::isNoopAddrSpaceCast(unsigned FromAS,
return TTIImpl->isNoopAddrSpaceCast(FromAS, ToAS);
}
+bool TargetTransformInfo::canHaveNonUndefGlobalInitializerInAddressSpace(
+ unsigned AS) const {
+ return TTIImpl->canHaveNonUndefGlobalInitializerInAddressSpace(AS);
+}
+
unsigned TargetTransformInfo::getAssumedAddrSpace(const Value *V) const {
return TTIImpl->getAssumedAddrSpace(V);
}
+std::pair<const Value *, unsigned>
+TargetTransformInfo::getPredicatedAddrSpace(const Value *V) const {
+ return TTIImpl->getPredicatedAddrSpace(V);
+}
+
Value *TargetTransformInfo::rewriteIntrinsicWithAddressSpace(
IntrinsicInst *II, Value *OldV, Value *NewV) const {
return TTIImpl->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
@@ -317,8 +323,9 @@ Optional<Value *> TargetTransformInfo::simplifyDemandedVectorEltsIntrinsic(
}
void TargetTransformInfo::getUnrollingPreferences(
- Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {
- return TTIImpl->getUnrollingPreferences(L, SE, UP);
+ Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE) const {
+ return TTIImpl->getUnrollingPreferences(L, SE, UP, ORE);
}
void TargetTransformInfo::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
@@ -409,6 +416,10 @@ bool TargetTransformInfo::isLegalMaskedExpandLoad(Type *DataType) const {
return TTIImpl->isLegalMaskedExpandLoad(DataType);
}
+bool TargetTransformInfo::enableOrderedReductions() const {
+ return TTIImpl->enableOrderedReductions();
+}
+
bool TargetTransformInfo::hasDivRemOp(Type *DataType, bool IsSigned) const {
return TTIImpl->hasDivRemOp(DataType, IsSigned);
}
@@ -598,6 +609,10 @@ Optional<unsigned> TargetTransformInfo::getMaxVScale() const {
return TTIImpl->getMaxVScale();
}
+Optional<unsigned> TargetTransformInfo::getVScaleForTuning() const {
+ return TTIImpl->getVScaleForTuning();
+}
+
bool TargetTransformInfo::shouldMaximizeVectorBandwidth() const {
return TTIImpl->shouldMaximizeVectorBandwidth();
}
@@ -818,6 +833,15 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(unsigned Opcode,
return Cost;
}
+InstructionCost TargetTransformInfo::getReplicationShuffleCost(
+ Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts,
+ TTI::TargetCostKind CostKind) {
+ InstructionCost Cost = TTIImpl->getReplicationShuffleCost(
+ EltTy, ReplicationFactor, VF, DemandedDstElts, CostKind);
+ assert(Cost >= 0 && "TTI should not produce negative costs!");
+ return Cost;
+}
+
InstructionCost TargetTransformInfo::getMemoryOpCost(
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind, const Instruction *I) const {
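For context on the new getReplicationShuffleCost hook: a replication shuffle repeats each of the VF source elements ReplicationFactor times, so VF = 2 with ReplicationFactor = 3 corresponds to the mask <0,0,0,1,1,1>. A hedged sketch of building such a mask (the helper name is illustrative, not from this patch):

#include "llvm/ADT/SmallVector.h"

static llvm::SmallVector<int, 16> buildReplicationMask(int ReplicationFactor,
                                                       int VF) {
  llvm::SmallVector<int, 16> Mask;
  for (int Elt = 0; Elt < VF; ++Elt)
    Mask.append(ReplicationFactor, Elt); // repeat source index Elt
  return Mask;
}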
diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
index 20d718f4fad3..23dbb32f38de 100644
--- a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -521,21 +521,21 @@ static const MDNode *getLeastCommonType(const MDNode *A, const MDNode *B) {
return Ret;
}
-void Instruction::getAAMetadata(AAMDNodes &N, bool Merge) const {
- if (Merge) {
- N.TBAA =
- MDNode::getMostGenericTBAA(N.TBAA, getMetadata(LLVMContext::MD_tbaa));
- N.TBAAStruct = nullptr;
- N.Scope = MDNode::getMostGenericAliasScope(
- N.Scope, getMetadata(LLVMContext::MD_alias_scope));
- N.NoAlias =
- MDNode::intersect(N.NoAlias, getMetadata(LLVMContext::MD_noalias));
- } else {
- N.TBAA = getMetadata(LLVMContext::MD_tbaa);
- N.TBAAStruct = getMetadata(LLVMContext::MD_tbaa_struct);
- N.Scope = getMetadata(LLVMContext::MD_alias_scope);
- N.NoAlias = getMetadata(LLVMContext::MD_noalias);
- }
+AAMDNodes AAMDNodes::merge(const AAMDNodes &Other) const {
+ AAMDNodes Result;
+ Result.TBAA = MDNode::getMostGenericTBAA(TBAA, Other.TBAA);
+ Result.TBAAStruct = nullptr;
+ Result.Scope = MDNode::getMostGenericAliasScope(Scope, Other.Scope);
+ Result.NoAlias = MDNode::intersect(NoAlias, Other.NoAlias);
+ return Result;
+}
+
+AAMDNodes AAMDNodes::concat(const AAMDNodes &Other) const {
+ AAMDNodes Result;
+ Result.TBAA = Result.TBAAStruct = nullptr;
+ Result.Scope = MDNode::getMostGenericAliasScope(Scope, Other.Scope);
+ Result.NoAlias = MDNode::intersect(NoAlias, Other.NoAlias);
+ return Result;
}
static const MDNode *createAccessTag(const MDNode *AccessType) {
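A brief usage sketch for the new value-style AAMDNodes API: merge() replaces the old Merge == true path (most-generic TBAA, intersected scope and noalias, TBAAStruct dropped), while concat(), per its definition above, also drops the TBAA tags for a combined access that cannot carry either one.

#include "llvm/IR/Metadata.h"

// Combine the tags of two accesses that are being folded into one.
static llvm::AAMDNodes combineTags(const llvm::AAMDNodes &A,
                                   const llvm::AAMDNodes &B) {
  return A.merge(B); // A.concat(B) when the TBAA tag must not survive
}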
diff --git a/llvm/lib/Analysis/TypeMetadataUtils.cpp b/llvm/lib/Analysis/TypeMetadataUtils.cpp
index f015ba9a09ca..80051fd5f7c1 100644
--- a/llvm/lib/Analysis/TypeMetadataUtils.cpp
+++ b/llvm/lib/Analysis/TypeMetadataUtils.cpp
@@ -126,7 +126,8 @@ void llvm::findDevirtualizableCallsForTypeCheckedLoad(
Offset->getZExtValue(), CI, DT);
}
-Constant *llvm::getPointerAtOffset(Constant *I, uint64_t Offset, Module &M) {
+Constant *llvm::getPointerAtOffset(Constant *I, uint64_t Offset, Module &M,
+ Constant *TopLevelGlobal) {
if (I->getType()->isPointerTy()) {
if (Offset == 0)
return I;
@@ -142,7 +143,8 @@ Constant *llvm::getPointerAtOffset(Constant *I, uint64_t Offset, Module &M) {
unsigned Op = SL->getElementContainingOffset(Offset);
return getPointerAtOffset(cast<Constant>(I->getOperand(Op)),
- Offset - SL->getElementOffset(Op), M);
+ Offset - SL->getElementOffset(Op), M,
+ TopLevelGlobal);
}
if (auto *C = dyn_cast<ConstantArray>(I)) {
ArrayType *VTableTy = C->getType();
@@ -153,7 +155,62 @@ Constant *llvm::getPointerAtOffset(Constant *I, uint64_t Offset, Module &M) {
return nullptr;
return getPointerAtOffset(cast<Constant>(I->getOperand(Op)),
- Offset % ElemSize, M);
+ Offset % ElemSize, M, TopLevelGlobal);
+ }
+
+ // (Swift-specific) relative-pointer support starts here.
+ if (auto *CI = dyn_cast<ConstantInt>(I)) {
+ if (Offset == 0 && CI->getZExtValue() == 0) {
+ return I;
+ }
+ }
+ if (auto *C = dyn_cast<ConstantExpr>(I)) {
+ switch (C->getOpcode()) {
+ case Instruction::Trunc:
+ case Instruction::PtrToInt:
+ return getPointerAtOffset(cast<Constant>(C->getOperand(0)), Offset, M,
+ TopLevelGlobal);
+ case Instruction::Sub: {
+ auto *Operand0 = cast<Constant>(C->getOperand(0));
+ auto *Operand1 = cast<Constant>(C->getOperand(1));
+
+ auto StripGEP = [](Constant *C) {
+ auto *CE = dyn_cast<ConstantExpr>(C);
+ if (!CE)
+ return C;
+ if (CE->getOpcode() != Instruction::GetElementPtr)
+ return C;
+ return CE->getOperand(0);
+ };
+ auto *Operand1TargetGlobal = StripGEP(getPointerAtOffset(Operand1, 0, M));
+
+ // Check that in the "sub (@a, @b)" expression, @b points back to the top
+ // level global (or a GEP thereof) that we're processing. Otherwise bail.
+ if (Operand1TargetGlobal != TopLevelGlobal)
+ return nullptr;
+
+ return getPointerAtOffset(Operand0, Offset, M, TopLevelGlobal);
+ }
+ default:
+ return nullptr;
+ }
}
return nullptr;
}
+
+void llvm::replaceRelativePointerUsersWithZero(Function *F) {
+ for (auto *U : F->users()) {
+ auto *PtrExpr = dyn_cast<ConstantExpr>(U);
+ if (!PtrExpr || PtrExpr->getOpcode() != Instruction::PtrToInt)
+ continue;
+
+ for (auto *PtrToIntUser : PtrExpr->users()) {
+ auto *SubExpr = dyn_cast<ConstantExpr>(PtrToIntUser);
+ if (!SubExpr || SubExpr->getOpcode() != Instruction::Sub)
+ continue;
+
+ SubExpr->replaceNonMetadataUsesWith(
+ ConstantInt::get(SubExpr->getType(), 0));
+ }
+ }
+}
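A hedged usage sketch for the extended getPointerAtOffset: relative vtable entries have the shape trunc(sub(ptrtoint @target, ptrtoint @vtable)), and passing the vtable itself as TopLevelGlobal lets the Sub case above verify that the offset is taken against that vtable. The function and variable names below are illustrative, not from this patch.

#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"

static llvm::Constant *resolveVTableSlot(llvm::GlobalVariable *VTable,
                                         uint64_t SlotOffset, llvm::Module &M) {
  llvm::Constant *Init = VTable->getInitializer();
  // Returns nullptr if the slot is not a (possibly relative) pointer constant,
  // or if the relative offset is computed against some other global.
  return llvm::getPointerAtOffset(Init, SlotOffset, M,
                                  /*TopLevelGlobal=*/VTable);
}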
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 522d21812c6a..1c41c77a8cfb 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -84,6 +84,17 @@ using namespace llvm::PatternMatch;
static cl::opt<unsigned> DomConditionsMaxUses("dom-conditions-max-uses",
cl::Hidden, cl::init(20));
+// According to the LangRef, branching on a poison condition is absolutely
+// immediate full UB. However, historically we haven't implemented that
+// consistently as we have an important transformation (non-trivial unswitch)
+// which introduces instances of branch on poison/undef to otherwise well
+// defined programs. This flag exists to let us test optimization benefit
+// of exploiting the specified behavior (in combination with enabling the
+// unswitch fix.)
+static cl::opt<bool> BranchOnPoisonAsUB("branch-on-poison-as-ub",
+ cl::Hidden, cl::init(false));
+
+
/// Returns the bitwidth of the given scalar or pointer type. For vector types,
/// returns the element type's bitwidth.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL) {
@@ -165,8 +176,8 @@ static bool getShuffleDemandedElts(const ShuffleVectorInst *Shuf,
int NumElts =
cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
int NumMaskElts = cast<FixedVectorType>(Shuf->getType())->getNumElements();
- DemandedLHS = DemandedRHS = APInt::getNullValue(NumElts);
- if (DemandedElts.isNullValue())
+ DemandedLHS = DemandedRHS = APInt::getZero(NumElts);
+ if (DemandedElts.isZero())
return true;
// Simple case of a shuffle with zeroinitializer.
if (all_of(Shuf->getShuffleMask(), [](int Elt) { return Elt == 0; })) {
@@ -206,7 +217,7 @@ static void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
auto *FVTy = dyn_cast<FixedVectorType>(V->getType());
APInt DemandedElts =
- FVTy ? APInt::getAllOnesValue(FVTy->getNumElements()) : APInt(1, 1);
+ FVTy ? APInt::getAllOnes(FVTy->getNumElements()) : APInt(1, 1);
computeKnownBits(V, DemandedElts, Known, Depth, Q);
}
@@ -279,16 +290,11 @@ bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS,
return KnownBits::haveNoCommonBitsSet(LHSKnown, RHSKnown);
}
-bool llvm::isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI) {
- for (const User *U : CxtI->users()) {
- if (const ICmpInst *IC = dyn_cast<ICmpInst>(U))
- if (IC->isEquality())
- if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
- if (C->isNullValue())
- continue;
- return false;
- }
- return true;
+bool llvm::isOnlyUsedInZeroEqualityComparison(const Instruction *I) {
+ return !I->user_empty() && all_of(I->users(), [](const User *U) {
+ ICmpInst::Predicate P;
+ return match(U, m_ICmp(P, m_Value(), m_Zero())) && ICmpInst::isEquality(P);
+ });
}
static bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
@@ -378,7 +384,7 @@ static unsigned ComputeNumSignBits(const Value *V, unsigned Depth,
auto *FVTy = dyn_cast<FixedVectorType>(V->getType());
APInt DemandedElts =
- FVTy ? APInt::getAllOnesValue(FVTy->getNumElements()) : APInt(1, 1);
+ FVTy ? APInt::getAllOnes(FVTy->getNumElements()) : APInt(1, 1);
return ComputeNumSignBits(V, DemandedElts, Depth, Q);
}
@@ -390,6 +396,14 @@ unsigned llvm::ComputeNumSignBits(const Value *V, const DataLayout &DL,
V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT, UseInstrInfo));
}
+unsigned llvm::ComputeMinSignedBits(const Value *V, const DataLayout &DL,
+ unsigned Depth, AssumptionCache *AC,
+ const Instruction *CxtI,
+ const DominatorTree *DT) {
+ unsigned SignBits = ComputeNumSignBits(V, DL, Depth, AC, CxtI, DT);
+ return V->getType()->getScalarSizeInBits() - SignBits + 1;
+}
+
static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1,
bool NSW, const APInt &DemandedElts,
KnownBits &KnownOut, KnownBits &Known2,
@@ -499,7 +513,9 @@ static bool isEphemeralValueOf(const Instruction *I, const Value *E) {
if (V == E)
return true;
- if (V == I || isSafeToSpeculativelyExecute(V)) {
+ if (V == I || (isa<Instruction>(V) &&
+ !cast<Instruction>(V)->mayHaveSideEffects() &&
+ !cast<Instruction>(V)->isTerminator())) {
EphValues.insert(V);
if (const User *U = dyn_cast<User>(V))
append_range(WorkSet, U->operands());
@@ -547,10 +563,9 @@ bool llvm::isValidAssumeForContext(const Instruction *Inv,
// We limit the scan distance between the assume and its context instruction
// to avoid a compile-time explosion. This limit is chosen arbitrarily, so
// it can be adjusted if needed (could be turned into a cl::opt).
- unsigned ScanLimit = 15;
- for (BasicBlock::const_iterator I(CxtI), IE(Inv); I != IE; ++I)
- if (!isGuaranteedToTransferExecutionToSuccessor(&*I) || --ScanLimit == 0)
- return false;
+ auto Range = make_range(CxtI->getIterator(), Inv->getIterator());
+ if (!isGuaranteedToTransferExecutionToSuccessor(Range, 15))
+ return false;
return !isEphemeralValueOf(Inv, CxtI);
}
@@ -582,7 +597,7 @@ static bool cmpExcludesZero(CmpInst::Predicate Pred, const Value *RHS) {
return false;
ConstantRange TrueValues = ConstantRange::makeExactICmpRegion(Pred, *C);
- return !TrueValues.contains(APInt::getNullValue(C->getBitWidth()));
+ return !TrueValues.contains(APInt::getZero(C->getBitWidth()));
}
static bool isKnownNonZeroFromAssume(const Value *V, const Query &Q) {
@@ -641,7 +656,7 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
if (V->getType()->isPointerTy()) {
if (RetainedKnowledge RK = getKnowledgeValidInContext(
V, {Attribute::Alignment}, Q.CxtI, Q.DT, Q.AC)) {
- Known.Zero.setLowBits(Log2_32(RK.ArgValue));
+ Known.Zero.setLowBits(Log2_64(RK.ArgValue));
}
}
@@ -1210,7 +1225,7 @@ static void computeKnownBitsFromOperator(const Operator *I,
// (dependent on endian) to form the full result of known bits.
unsigned NumElts = DemandedElts.getBitWidth();
unsigned SubScale = BitWidth / SubBitWidth;
- APInt SubDemandedElts = APInt::getNullValue(NumElts * SubScale);
+ APInt SubDemandedElts = APInt::getZero(NumElts * SubScale);
for (unsigned i = 0; i != NumElts; ++i) {
if (DemandedElts[i])
SubDemandedElts.setBit(i * SubScale);
@@ -1383,7 +1398,7 @@ static void computeKnownBitsFromOperator(const Operator *I,
Known = KnownBits::computeForAddSub(
/*Add=*/true, /*NSW=*/false, Known, IndexBits);
}
- if (!Known.isUnknown() && !AccConstIndices.isNullValue()) {
+ if (!Known.isUnknown() && !AccConstIndices.isZero()) {
KnownBits Index = KnownBits::makeConstant(AccConstIndices);
Known = KnownBits::computeForAddSub(
/*Add=*/true, /*NSW=*/false, Known, Index);
@@ -1512,7 +1527,7 @@ static void computeKnownBitsFromOperator(const Operator *I,
// taking conservative care to avoid excessive recursion.
if (Depth < MaxAnalysisRecursionDepth - 1 && !Known.Zero && !Known.One) {
// Skip if every incoming value references to ourself.
- if (dyn_cast_or_null<UndefValue>(P->hasConstantValue()))
+ if (isa_and_nonnull<UndefValue>(P->hasConstantValue()))
break;
Known.Zero.setAllBits();
@@ -1689,6 +1704,33 @@ static void computeKnownBitsFromOperator(const Operator *I,
if (BitWidth >= 32)
Known.Zero.setBitsFrom(31);
break;
+ case Intrinsic::vscale: {
+ if (!II->getParent() || !II->getFunction() ||
+ !II->getFunction()->hasFnAttribute(Attribute::VScaleRange))
+ break;
+
+ auto VScaleRange = II->getFunction()
+ ->getFnAttribute(Attribute::VScaleRange)
+ .getVScaleRangeArgs();
+
+ if (VScaleRange.second == 0)
+ break;
+
+ // If vscale min = max then we know the exact value at compile time
+ // and hence we know the exact bits.
+ if (VScaleRange.first == VScaleRange.second) {
+ Known.One = VScaleRange.first;
+ Known.Zero = VScaleRange.first;
+ Known.Zero.flipAllBits();
+ break;
+ }
+
+ unsigned FirstZeroHighBit = 32 - countLeadingZeros(VScaleRange.second);
+ if (FirstZeroHighBit < BitWidth)
+ Known.Zero.setBitsFrom(FirstZeroHighBit);
+
+ break;
+ }
}
}
break;
@@ -1763,7 +1805,7 @@ static void computeKnownBitsFromOperator(const Operator *I,
break;
}
unsigned NumElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
- APInt DemandedVecElts = APInt::getAllOnesValue(NumElts);
+ APInt DemandedVecElts = APInt::getAllOnes(NumElts);
if (CIdx && CIdx->getValue().ult(NumElts))
DemandedVecElts = APInt::getOneBitSet(NumElts, CIdx->getZExtValue());
computeKnownBits(Vec, DemandedVecElts, Known, Depth + 1, Q);
@@ -2248,7 +2290,7 @@ static bool isNonZeroRecurrence(const PHINode *PN) {
Value *Start = nullptr, *Step = nullptr;
const APInt *StartC, *StepC;
if (!matchSimpleRecurrence(PN, BO, Start, Step) ||
- !match(Start, m_APInt(StartC)) || StartC->isNullValue())
+ !match(Start, m_APInt(StartC)) || StartC->isZero())
return false;
switch (BO->getOpcode()) {
@@ -2260,7 +2302,7 @@ static bool isNonZeroRecurrence(const PHINode *PN) {
StartC->isNegative() == StepC->isNegative());
case Instruction::Mul:
return (BO->hasNoUnsignedWrap() || BO->hasNoSignedWrap()) &&
- match(Step, m_APInt(StepC)) && !StepC->isNullValue();
+ match(Step, m_APInt(StepC)) && !StepC->isZero();
case Instruction::Shl:
return BO->hasNoUnsignedWrap() || BO->hasNoSignedWrap();
case Instruction::AShr:
@@ -2532,7 +2574,7 @@ bool isKnownNonZero(const Value *V, const APInt &DemandedElts, unsigned Depth,
auto *CIdx = dyn_cast<ConstantInt>(Idx);
if (auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType())) {
unsigned NumElts = VecTy->getNumElements();
- APInt DemandedVecElts = APInt::getAllOnesValue(NumElts);
+ APInt DemandedVecElts = APInt::getAllOnes(NumElts);
if (CIdx && CIdx->getValue().ult(NumElts))
DemandedVecElts = APInt::getOneBitSet(NumElts, CIdx->getZExtValue());
return isKnownNonZero(Vec, DemandedVecElts, Depth, Q);
@@ -2559,7 +2601,7 @@ bool isKnownNonZero(const Value* V, unsigned Depth, const Query& Q) {
auto *FVTy = dyn_cast<FixedVectorType>(V->getType());
APInt DemandedElts =
- FVTy ? APInt::getAllOnesValue(FVTy->getNumElements()) : APInt(1, 1);
+ FVTy ? APInt::getAllOnes(FVTy->getNumElements()) : APInt(1, 1);
return isKnownNonZero(V, DemandedElts, Depth, Q);
}
@@ -2694,8 +2736,7 @@ static bool isNonEqualMul(const Value *V1, const Value *V2, unsigned Depth,
const APInt *C;
return match(OBO, m_Mul(m_Specific(V1), m_APInt(C))) &&
(OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap()) &&
- !C->isNullValue() && !C->isOneValue() &&
- isKnownNonZero(V1, Depth + 1, Q);
+ !C->isZero() && !C->isOne() && isKnownNonZero(V1, Depth + 1, Q);
}
return false;
}
@@ -2708,7 +2749,7 @@ static bool isNonEqualShl(const Value *V1, const Value *V2, unsigned Depth,
const APInt *C;
return match(OBO, m_Shl(m_Specific(V1), m_APInt(C))) &&
(OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap()) &&
- !C->isNullValue() && isKnownNonZero(V1, Depth + 1, Q);
+ !C->isZero() && isKnownNonZero(V1, Depth + 1, Q);
}
return false;
}
@@ -3051,7 +3092,7 @@ static unsigned ComputeNumSignBitsImpl(const Value *V,
// If the input is known to be 0 or 1, the output is 0/-1, which is
// all sign bits set.
- if ((Known.Zero | 1).isAllOnesValue())
+ if ((Known.Zero | 1).isAllOnes())
return TyBits;
// If we are subtracting one from a positive number, there is no carry
@@ -3075,7 +3116,7 @@ static unsigned ComputeNumSignBitsImpl(const Value *V,
computeKnownBits(U->getOperand(1), Known, Depth + 1, Q);
// If the input is known to be 0 or 1, the output is 0/-1, which is
// all sign bits set.
- if ((Known.Zero | 1).isAllOnesValue())
+ if ((Known.Zero | 1).isAllOnes())
return TyBits;
// If the input is known to be positive (the sign bit is known clear),
@@ -4533,6 +4574,12 @@ AllocaInst *llvm::findAllocaForValue(Value *V, bool OffsetZero) {
if (OffsetZero && !GEP->hasAllZeroIndices())
return nullptr;
AddWork(GEP->getPointerOperand());
+ } else if (CallBase *CB = dyn_cast<CallBase>(V)) {
+ Value *Returned = CB->getReturnedArgOperand();
+ if (Returned)
+ AddWork(Returned);
+ else
+ return nullptr;
} else {
return nullptr;
}
@@ -4614,7 +4661,7 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V,
if (*Denominator == 0)
return false;
// It's safe to hoist if the denominator is not 0 or -1.
- if (!Denominator->isAllOnesValue())
+ if (!Denominator->isAllOnes())
return true;
// At this point we know that the denominator is -1. It is safe to hoist as
// long we know that the numerator is not INT_MIN.
@@ -4922,15 +4969,14 @@ bool llvm::isOverflowIntrinsicNoWrap(const WithOverflowInst *WO,
return llvm::any_of(GuardingBranches, AllUsesGuardedByBranch);
}
-static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly) {
- // See whether I has flags that may create poison
- if (const auto *OvOp = dyn_cast<OverflowingBinaryOperator>(Op)) {
- if (OvOp->hasNoSignedWrap() || OvOp->hasNoUnsignedWrap())
- return true;
- }
- if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(Op))
- if (ExactOp->isExact())
- return true;
+static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly,
+ bool ConsiderFlags) {
+
+ if (ConsiderFlags && Op->hasPoisonGeneratingFlags())
+ return true;
+
+ // TODO: this should really be under the ConsiderFlags block, but currently
+ // these are not dropped by dropPoisonGeneratingFlags
if (const auto *FP = dyn_cast<FPMathOperator>(Op)) {
auto FMF = FP->getFastMathFlags();
if (FMF.noNaNs() || FMF.noInfs())
@@ -5019,10 +5065,10 @@ static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly) {
case Instruction::ICmp:
case Instruction::FCmp:
return false;
- case Instruction::GetElementPtr: {
- const auto *GEP = cast<GEPOperator>(Op);
- return GEP->isInBounds();
- }
+ case Instruction::GetElementPtr:
+ // inbounds is handled above
+ // TODO: what about inrange on constexpr?
+ return false;
default: {
const auto *CE = dyn_cast<ConstantExpr>(Op);
if (isa<CastInst>(Op) || (CE && CE->isCast()))
@@ -5035,12 +5081,12 @@ static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly) {
}
}
-bool llvm::canCreateUndefOrPoison(const Operator *Op) {
- return ::canCreateUndefOrPoison(Op, /*PoisonOnly=*/false);
+bool llvm::canCreateUndefOrPoison(const Operator *Op, bool ConsiderFlags) {
+ return ::canCreateUndefOrPoison(Op, /*PoisonOnly=*/false, ConsiderFlags);
}
-bool llvm::canCreatePoison(const Operator *Op) {
- return ::canCreateUndefOrPoison(Op, /*PoisonOnly=*/true);
+bool llvm::canCreatePoison(const Operator *Op, bool ConsiderFlags) {
+ return ::canCreateUndefOrPoison(Op, /*PoisonOnly=*/true, ConsiderFlags);
}
static bool directlyImpliesPoison(const Value *ValAssumedPoison,
@@ -5068,7 +5114,7 @@ static bool directlyImpliesPoison(const Value *ValAssumedPoison,
const WithOverflowInst *II;
if (match(I, m_ExtractValue(m_WithOverflowInst(II))) &&
(match(ValAssumedPoison, m_ExtractValue(m_Specific(II))) ||
- llvm::is_contained(II->arg_operands(), ValAssumedPoison)))
+ llvm::is_contained(II->args(), ValAssumedPoison)))
return true;
}
return false;
@@ -5225,8 +5271,7 @@ static bool isGuaranteedNotToBeUndefOrPoison(const Value *V,
Dominator = Dominator->getIDom();
}
- SmallVector<Attribute::AttrKind, 2> AttrKinds{Attribute::NoUndef};
- if (getKnowledgeValidInContext(V, AttrKinds, CtxI, DT, AC))
+ if (getKnowledgeValidInContext(V, {Attribute::NoUndef}, CtxI, DT, AC))
return true;
return false;
@@ -5304,6 +5349,27 @@ bool llvm::isGuaranteedToTransferExecutionToSuccessor(const BasicBlock *BB) {
return true;
}
+bool llvm::isGuaranteedToTransferExecutionToSuccessor(
+ BasicBlock::const_iterator Begin, BasicBlock::const_iterator End,
+ unsigned ScanLimit) {
+ return isGuaranteedToTransferExecutionToSuccessor(make_range(Begin, End),
+ ScanLimit);
+}
+
+bool llvm::isGuaranteedToTransferExecutionToSuccessor(
+ iterator_range<BasicBlock::const_iterator> Range, unsigned ScanLimit) {
+ assert(ScanLimit && "scan limit must be non-zero");
+ for (const Instruction &I : Range) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+ if (--ScanLimit == 0)
+ return false;
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+ return false;
+ }
+ return true;
+}
+
bool llvm::isGuaranteedToExecuteForEveryIteration(const Instruction *I,
const Loop *L) {
// The loop header is guaranteed to be executed for every iteration.
@@ -5391,7 +5457,10 @@ void llvm::getGuaranteedWellDefinedOps(
}
break;
}
-
+ case Instruction::Ret:
+ if (I->getFunction()->hasRetAttribute(Attribute::NoUndef))
+ Operands.insert(I->getOperand(0));
+ break;
default:
break;
}
@@ -5408,7 +5477,16 @@ void llvm::getGuaranteedNonPoisonOps(const Instruction *I,
case Instruction::SRem:
Operands.insert(I->getOperand(1));
break;
-
+ case Instruction::Switch:
+ if (BranchOnPoisonAsUB)
+ Operands.insert(cast<SwitchInst>(I)->getCondition());
+ break;
+ case Instruction::Br: {
+ auto *BR = cast<BranchInst>(I);
+ if (BranchOnPoisonAsUB && BR->isConditional())
+ Operands.insert(BR->getCondition());
+ break;
+ }
default:
break;
}
@@ -5835,15 +5913,13 @@ static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
// Is the sign bit set?
// (X <s 0) ? X : MAXVAL ==> (X >u MAXVAL) ? X : MAXVAL ==> UMAX
// (X <s 0) ? MAXVAL : X ==> (X >u MAXVAL) ? MAXVAL : X ==> UMIN
- if (Pred == CmpInst::ICMP_SLT && C1->isNullValue() &&
- C2->isMaxSignedValue())
+ if (Pred == CmpInst::ICMP_SLT && C1->isZero() && C2->isMaxSignedValue())
return {CmpLHS == TrueVal ? SPF_UMAX : SPF_UMIN, SPNB_NA, false};
// Is the sign bit clear?
// (X >s -1) ? MINVAL : X ==> (X <u MINVAL) ? MINVAL : X ==> UMAX
// (X >s -1) ? X : MINVAL ==> (X <u MINVAL) ? X : MINVAL ==> UMIN
- if (Pred == CmpInst::ICMP_SGT && C1->isAllOnesValue() &&
- C2->isMinSignedValue())
+ if (Pred == CmpInst::ICMP_SGT && C1->isAllOnes() && C2->isMinSignedValue())
return {CmpLHS == FalseVal ? SPF_UMAX : SPF_UMIN, SPNB_NA, false};
}
@@ -6253,6 +6329,16 @@ CmpInst::Predicate llvm::getInverseMinMaxPred(SelectPatternFlavor SPF) {
return getMinMaxPred(getInverseMinMaxFlavor(SPF));
}
+APInt llvm::getMinMaxLimit(SelectPatternFlavor SPF, unsigned BitWidth) {
+ switch (SPF) {
+ case SPF_SMAX: return APInt::getSignedMaxValue(BitWidth);
+ case SPF_SMIN: return APInt::getSignedMinValue(BitWidth);
+ case SPF_UMAX: return APInt::getMaxValue(BitWidth);
+ case SPF_UMIN: return APInt::getMinValue(BitWidth);
+ default: llvm_unreachable("Unexpected flavor");
+ }
+}
+
std::pair<Intrinsic::ID, bool>
llvm::canConvertToMinOrMaxIntrinsic(ArrayRef<Value *> VL) {
// Check if VL contains select instructions that can be folded into a min/max
@@ -6681,7 +6767,7 @@ static void setLimitsForBinOp(const BinaryOperator &BO, APInt &Lower,
const APInt *C;
switch (BO.getOpcode()) {
case Instruction::Add:
- if (match(BO.getOperand(1), m_APInt(C)) && !C->isNullValue()) {
+ if (match(BO.getOperand(1), m_APInt(C)) && !C->isZero()) {
// FIXME: If we have both nuw and nsw, we should reduce the range further.
if (IIQ.hasNoUnsignedWrap(cast<OverflowingBinaryOperator>(&BO))) {
// 'add nuw x, C' produces [C, UINT_MAX].
@@ -6719,7 +6805,7 @@ static void setLimitsForBinOp(const BinaryOperator &BO, APInt &Lower,
Upper = APInt::getSignedMaxValue(Width).ashr(*C) + 1;
} else if (match(BO.getOperand(0), m_APInt(C))) {
unsigned ShiftAmount = Width - 1;
- if (!C->isNullValue() && IIQ.isExact(&BO))
+ if (!C->isZero() && IIQ.isExact(&BO))
ShiftAmount = C->countTrailingZeros();
if (C->isNegative()) {
// 'ashr C, x' produces [C, C >> (Width-1)]
@@ -6736,11 +6822,11 @@ static void setLimitsForBinOp(const BinaryOperator &BO, APInt &Lower,
case Instruction::LShr:
if (match(BO.getOperand(1), m_APInt(C)) && C->ult(Width)) {
// 'lshr x, C' produces [0, UINT_MAX >> C].
- Upper = APInt::getAllOnesValue(Width).lshr(*C) + 1;
+ Upper = APInt::getAllOnes(Width).lshr(*C) + 1;
} else if (match(BO.getOperand(0), m_APInt(C))) {
// 'lshr C, x' produces [C >> (Width-1), C].
unsigned ShiftAmount = Width - 1;
- if (!C->isNullValue() && IIQ.isExact(&BO))
+ if (!C->isZero() && IIQ.isExact(&BO))
ShiftAmount = C->countTrailingZeros();
Lower = C->lshr(ShiftAmount);
Upper = *C + 1;
@@ -6773,7 +6859,7 @@ static void setLimitsForBinOp(const BinaryOperator &BO, APInt &Lower,
if (match(BO.getOperand(1), m_APInt(C))) {
APInt IntMin = APInt::getSignedMinValue(Width);
APInt IntMax = APInt::getSignedMaxValue(Width);
- if (C->isAllOnesValue()) {
+ if (C->isAllOnes()) {
// 'sdiv x, -1' produces [INT_MIN + 1, INT_MAX]
// where C != -1 and C != 0 and C != 1
Lower = IntMin + 1;
@@ -6802,7 +6888,7 @@ static void setLimitsForBinOp(const BinaryOperator &BO, APInt &Lower,
break;
case Instruction::UDiv:
- if (match(BO.getOperand(1), m_APInt(C)) && !C->isNullValue()) {
+ if (match(BO.getOperand(1), m_APInt(C)) && !C->isZero()) {
// 'udiv x, C' produces [0, UINT_MAX / C].
Upper = APInt::getMaxValue(Width).udiv(*C) + 1;
} else if (match(BO.getOperand(0), m_APInt(C))) {
@@ -6946,7 +7032,7 @@ static void setLimitsForSelectPattern(const SelectInst &SI, APInt &Lower,
// If the negation part of the abs (in RHS) has the NSW flag,
// then the result of abs(X) is [0..SIGNED_MAX],
// otherwise it is [0..SIGNED_MIN], as -SIGNED_MIN == SIGNED_MIN.
- Lower = APInt::getNullValue(BitWidth);
+ Lower = APInt::getZero(BitWidth);
if (match(RHS, m_Neg(m_Specific(LHS))) &&
IIQ.hasNoSignedWrap(cast<Instruction>(RHS)))
Upper = APInt::getSignedMaxValue(BitWidth) + 1;
@@ -6986,9 +7072,27 @@ static void setLimitsForSelectPattern(const SelectInst &SI, APInt &Lower,
}
}
+static void setLimitForFPToI(const Instruction *I, APInt &Lower, APInt &Upper) {
+ // The maximum representable value of a half is 65504. For floats the maximum
+ // value is 3.4e38 which requires roughly 129 bits.
+ unsigned BitWidth = I->getType()->getScalarSizeInBits();
+ if (!I->getOperand(0)->getType()->getScalarType()->isHalfTy())
+ return;
+ if (isa<FPToSIInst>(I) && BitWidth >= 17) {
+ Lower = APInt(BitWidth, -65504);
+ Upper = APInt(BitWidth, 65505);
+ }
+
+ if (isa<FPToUIInst>(I) && BitWidth >= 16) {
+ // For a fptoui the lower limit is left as 0.
+ Upper = APInt(BitWidth, 65505);
+ }
+}
+
ConstantRange llvm::computeConstantRange(const Value *V, bool UseInstrInfo,
AssumptionCache *AC,
const Instruction *CtxI,
+ const DominatorTree *DT,
unsigned Depth) {
assert(V->getType()->isIntOrIntVectorTy() && "Expected integer instruction");
@@ -7009,6 +7113,8 @@ ConstantRange llvm::computeConstantRange(const Value *V, bool UseInstrInfo,
setLimitsForIntrinsic(*II, Lower, Upper);
else if (auto *SI = dyn_cast<SelectInst>(V))
setLimitsForSelectPattern(*SI, Lower, Upper, IIQ);
+ else if (isa<FPToUIInst>(V) || isa<FPToSIInst>(V))
+ setLimitForFPToI(cast<Instruction>(V), Lower, Upper);
ConstantRange CR = ConstantRange::getNonEmpty(Lower, Upper);
@@ -7027,7 +7133,7 @@ ConstantRange llvm::computeConstantRange(const Value *V, bool UseInstrInfo,
assert(I->getCalledFunction()->getIntrinsicID() == Intrinsic::assume &&
"must be an assume intrinsic");
- if (!isValidAssumeForContext(I, CtxI, nullptr))
+ if (!isValidAssumeForContext(I, CtxI, DT))
continue;
Value *Arg = I->getArgOperand(0);
ICmpInst *Cmp = dyn_cast<ICmpInst>(Arg);
@@ -7035,9 +7141,9 @@ ConstantRange llvm::computeConstantRange(const Value *V, bool UseInstrInfo,
if (!Cmp || Cmp->getOperand(0) != V)
continue;
ConstantRange RHS = computeConstantRange(Cmp->getOperand(1), UseInstrInfo,
- AC, I, Depth + 1);
+ AC, I, DT, Depth + 1);
CR = CR.intersectWith(
- ConstantRange::makeSatisfyingICmpRegion(Cmp->getPredicate(), RHS));
+ ConstantRange::makeAllowedICmpRegion(Cmp->getPredicate(), RHS));
}
}
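One clarifying note on the hunk above: in assume(icmp pred %V, %X), the recursively computed range for %X over-approximates its possible values, and %V only has to satisfy the predicate against the single value %X actually takes. makeAllowedICmpRegion returns every value that can satisfy the predicate with some value in that range, so intersecting with it remains conservative, whereas makeSatisfyingICmpRegion (values satisfying the predicate against all of the range) can, for a non-singleton range, exclude feasible values of %V.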
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 0a14a1432934..655c248907f6 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -331,6 +331,12 @@ Value *llvm::findScalarElement(Value *V, unsigned EltNo) {
if (Elt->isNullValue())
return findScalarElement(Val, EltNo);
+ // If the vector is a splat then we can trivially find the scalar element.
+ if (isa<ScalableVectorType>(VTy))
+ if (Value *Splat = getSplatValue(V))
+ if (EltNo < VTy->getElementCount().getKnownMinValue())
+ return Splat;
+
// Otherwise, we don't know.
return nullptr;
}
@@ -824,6 +830,23 @@ llvm::SmallVector<int, 16> llvm::createSequentialMask(unsigned Start,
return Mask;
}
+llvm::SmallVector<int, 16> llvm::createUnaryMask(ArrayRef<int> Mask,
+ unsigned NumElts) {
+ // Avoid casts in the loop and make sure we have a reasonable number.
+ int NumEltsSigned = NumElts;
+ assert(NumEltsSigned > 0 && "Expected smaller or non-zero element count");
+
+ // If the mask chooses an element from operand 1, reduce it to choose from the
+ // corresponding element of operand 0. Undef mask elements are unchanged.
+ SmallVector<int, 16> UnaryMask;
+ for (int MaskElt : Mask) {
+ assert((MaskElt < NumEltsSigned * 2) && "Expected valid shuffle mask");
+ int UnaryElt = MaskElt >= NumEltsSigned ? MaskElt - NumEltsSigned : MaskElt;
+ UnaryMask.push_back(UnaryElt);
+ }
+ return UnaryMask;
+}
+
/// A helper function for concatenating vectors. This function concatenates two
/// vectors having the same element type. If the second vector has fewer
/// elements than the first, it is padded with undefs.
@@ -940,7 +963,7 @@ APInt llvm::possiblyDemandedEltsInMask(Value *Mask) {
const unsigned VWidth =
cast<FixedVectorType>(Mask->getType())->getNumElements();
- APInt DemandedElts = APInt::getAllOnesValue(VWidth);
+ APInt DemandedElts = APInt::getAllOnes(VWidth);
if (auto *CV = dyn_cast<ConstantVector>(Mask))
for (unsigned i = 0; i < VWidth; i++)
if (CV->getAggregateElement(i)->isNullValue())
@@ -980,7 +1003,7 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
// wrap around the address space we would do a memory access at nullptr
// even without the transformation. The wrapping checks are therefore
// deferred until after we've formed the interleaved groups.
- int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides,
+ int64_t Stride = getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides,
/*Assume=*/true, /*ShouldCheckWrap=*/false);
const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
@@ -1193,15 +1216,24 @@ void InterleavedAccessInfo::analyzeInterleaving(
} // Iteration over A accesses.
} // Iteration over B accesses.
- // Remove interleaved store groups with gaps.
- for (auto *Group : StoreGroups)
- if (Group->getNumMembers() != Group->getFactor()) {
- LLVM_DEBUG(
- dbgs() << "LV: Invalidate candidate interleaved store group due "
- "to gaps.\n");
- releaseGroup(Group);
- }
- // Remove interleaved groups with gaps (currently only loads) whose memory
+ auto InvalidateGroupIfMemberMayWrap = [&](InterleaveGroup<Instruction> *Group,
+ int Index,
+ std::string FirstOrLast) -> bool {
+ Instruction *Member = Group->getMember(Index);
+ assert(Member && "Group member does not exist");
+ Value *MemberPtr = getLoadStorePointerOperand(Member);
+ Type *AccessTy = getLoadStoreType(Member);
+ if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides,
+ /*Assume=*/false, /*ShouldCheckWrap=*/true))
+ return false;
+ LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
+ << FirstOrLast
+ << " group member potentially pointer-wrapping.\n");
+ releaseGroup(Group);
+ return true;
+ };
+
+ // Remove interleaved groups with gaps whose memory
// accesses may wrap around. We have to revisit the getPtrStride analysis,
// this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
// not check wrapping (see documentation there).
@@ -1227,26 +1259,12 @@ void InterleavedAccessInfo::analyzeInterleaving(
// So we check only group member 0 (which is always guaranteed to exist),
// and group member Factor - 1; If the latter doesn't exist we rely on
// peeling (if it is a non-reversed access -- see Case 3).
- Value *FirstMemberPtr = getLoadStorePointerOperand(Group->getMember(0));
- if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
- /*ShouldCheckWrap=*/true)) {
- LLVM_DEBUG(
- dbgs() << "LV: Invalidate candidate interleaved group due to "
- "first group member potentially pointer-wrapping.\n");
- releaseGroup(Group);
+ if (InvalidateGroupIfMemberMayWrap(Group, 0, std::string("first")))
continue;
- }
- Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
- if (LastMember) {
- Value *LastMemberPtr = getLoadStorePointerOperand(LastMember);
- if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
- /*ShouldCheckWrap=*/true)) {
- LLVM_DEBUG(
- dbgs() << "LV: Invalidate candidate interleaved group due to "
- "last group member potentially pointer-wrapping.\n");
- releaseGroup(Group);
- }
- } else {
+ if (Group->getMember(Group->getFactor() - 1))
+ InvalidateGroupIfMemberMayWrap(Group, Group->getFactor() - 1,
+ std::string("last"));
+ else {
// Case 3: A non-reversed interleaved load group with gaps: We need
// to execute at least one scalar epilogue iteration. This will ensure
// we don't speculatively access memory out-of-bounds. We only need
@@ -1264,6 +1282,39 @@ void InterleavedAccessInfo::analyzeInterleaving(
RequiresScalarEpilogue = true;
}
}
+
+ for (auto *Group : StoreGroups) {
+ // Case 1: A full group. Can Skip the checks; For full groups, if the wide
+ // store would wrap around the address space we would do a memory access at
+ // nullptr even without the transformation.
+ if (Group->getNumMembers() == Group->getFactor())
+ continue;
+
+ // Interleave-store-group with gaps is implemented using masked wide store.
+ // Remove interleaved store groups with gaps if
+ // masked-interleaved-accesses are not enabled by the target.
+ if (!EnablePredicatedInterleavedMemAccesses) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Invalidate candidate interleaved store group due "
+ "to gaps.\n");
+ releaseGroup(Group);
+ continue;
+ }
+
+ // Case 2: If first and last members of the group don't wrap this implies
+ // that all the pointers in the group don't wrap.
+ // So we check only group member 0 (which is always guaranteed to exist),
+ // and the last group member. Case 3 (scalar epilog) is not relevant for
+ // stores with gaps, which are implemented with masked-store (rather than
+ // speculative access, as in loads).
+ if (InvalidateGroupIfMemberMayWrap(Group, 0, std::string("first")))
+ continue;
+ for (int Index = Group->getFactor() - 1; Index > 0; Index--)
+ if (Group->getMember(Index)) {
+ InvalidateGroupIfMemberMayWrap(Group, Index, std::string("last"));
+ break;
+ }
+ }
}
void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
@@ -1325,9 +1376,7 @@ std::string VFABI::mangleTLIVectorName(StringRef VectorName,
void VFABI::getVectorVariantNames(
const CallInst &CI, SmallVectorImpl<std::string> &VariantMappings) {
- const StringRef S =
- CI.getAttribute(AttributeList::FunctionIndex, VFABI::MappingsAttrName)
- .getValueAsString();
+ const StringRef S = CI.getFnAttr(VFABI::MappingsAttrName).getValueAsString();
if (S.empty())
return;
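A small worked example for the createUnaryMask helper added above: indices that select from the second shuffle operand (those >= NumElts) are folded back onto the corresponding element of the first operand, and undef (-1) elements pass through unchanged.

#include "llvm/Analysis/VectorUtils.h"

static llvm::SmallVector<int, 16> exampleUnaryMask() {
  // {0, 5, 2, 7} with NumElts = 4 becomes {0, 1, 2, 3}.
  return llvm::createUnaryMask({0, 5, 2, 7}, /*NumElts=*/4);
}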
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 4f72c6f9921a..41fb0b9008be 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -643,6 +643,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(convergent);
KEYWORD(dereferenceable);
KEYWORD(dereferenceable_or_null);
+ KEYWORD(disable_sanitizer_instrumentation);
KEYWORD(elementtype);
KEYWORD(inaccessiblememonly);
KEYWORD(inaccessiblemem_or_argmemonly);
@@ -769,6 +770,9 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(returnDoesNotAlias);
KEYWORD(noInline);
KEYWORD(alwaysInline);
+ KEYWORD(noUnwind);
+ KEYWORD(mayThrow);
+ KEYWORD(hasUnknownCall);
KEYWORD(calls);
KEYWORD(callee);
KEYWORD(params);
@@ -848,7 +852,15 @@ lltok::Kind LLLexer::LexIdentifier() {
TYPEKEYWORD("x86_mmx", Type::getX86_MMXTy(Context));
TYPEKEYWORD("x86_amx", Type::getX86_AMXTy(Context));
TYPEKEYWORD("token", Type::getTokenTy(Context));
- TYPEKEYWORD("ptr", PointerType::getUnqual(Context));
+
+ if (Keyword == "ptr") {
+ if (Context.supportsTypedPointers()) {
+ Warning("ptr type is only supported in -opaque-pointers mode");
+ return lltok::Error;
+ }
+ TyVal = PointerType::getUnqual(Context);
+ return lltok::Type;
+ }
#undef TYPEKEYWORD
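In effect, ptr is now accepted as a type keyword only when the LLVMContext is in opaque-pointers mode (the mode named in the warning text); while typed pointers are still supported, the lexer emits the warning and an error token instead of unconditionally lexing ptr as a type the way the removed TYPEKEYWORD entry did.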
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 799cb03c8c8c..5bce1eaa59a0 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -140,8 +140,8 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) {
if (Function *Fn = dyn_cast<Function>(V)) {
AttributeList AS = Fn->getAttributes();
- AttrBuilder FnAttrs(AS.getFnAttributes());
- AS = AS.removeAttributes(Context, AttributeList::FunctionIndex);
+ AttrBuilder FnAttrs(AS.getFnAttrs());
+ AS = AS.removeFnAttributes(Context);
FnAttrs.merge(B);
@@ -152,32 +152,28 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) {
FnAttrs.removeAttribute(Attribute::Alignment);
}
- AS = AS.addAttributes(Context, AttributeList::FunctionIndex,
- AttributeSet::get(Context, FnAttrs));
+ AS = AS.addFnAttributes(Context, AttributeSet::get(Context, FnAttrs));
Fn->setAttributes(AS);
} else if (CallInst *CI = dyn_cast<CallInst>(V)) {
AttributeList AS = CI->getAttributes();
- AttrBuilder FnAttrs(AS.getFnAttributes());
- AS = AS.removeAttributes(Context, AttributeList::FunctionIndex);
+ AttrBuilder FnAttrs(AS.getFnAttrs());
+ AS = AS.removeFnAttributes(Context);
FnAttrs.merge(B);
- AS = AS.addAttributes(Context, AttributeList::FunctionIndex,
- AttributeSet::get(Context, FnAttrs));
+ AS = AS.addFnAttributes(Context, AttributeSet::get(Context, FnAttrs));
CI->setAttributes(AS);
} else if (InvokeInst *II = dyn_cast<InvokeInst>(V)) {
AttributeList AS = II->getAttributes();
- AttrBuilder FnAttrs(AS.getFnAttributes());
- AS = AS.removeAttributes(Context, AttributeList::FunctionIndex);
+ AttrBuilder FnAttrs(AS.getFnAttrs());
+ AS = AS.removeFnAttributes(Context);
FnAttrs.merge(B);
- AS = AS.addAttributes(Context, AttributeList::FunctionIndex,
- AttributeSet::get(Context, FnAttrs));
+ AS = AS.addFnAttributes(Context, AttributeSet::get(Context, FnAttrs));
II->setAttributes(AS);
} else if (CallBrInst *CBI = dyn_cast<CallBrInst>(V)) {
AttributeList AS = CBI->getAttributes();
- AttrBuilder FnAttrs(AS.getFnAttributes());
- AS = AS.removeAttributes(Context, AttributeList::FunctionIndex);
+ AttrBuilder FnAttrs(AS.getFnAttrs());
+ AS = AS.removeFnAttributes(Context);
FnAttrs.merge(B);
- AS = AS.addAttributes(Context, AttributeList::FunctionIndex,
- AttributeSet::get(Context, FnAttrs));
+ AS = AS.addFnAttributes(Context, AttributeSet::get(Context, FnAttrs));
CBI->setAttributes(AS);
} else if (auto *GV = dyn_cast<GlobalVariable>(V)) {
AttrBuilder Attrs(GV->getAttributes());
@@ -239,18 +235,18 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) {
Inst->setMetadata(LLVMContext::MD_tbaa, UpgradedMD);
}
- // Look for intrinsic functions and CallInst that need to be upgraded
- for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; )
- UpgradeCallsToIntrinsic(&*FI++); // must be post-increment, as we remove
+ // Look for intrinsic functions and CallInst that need to be upgraded. We use
+ // make_early_inc_range here because we may remove some functions.
+ for (Function &F : llvm::make_early_inc_range(*M))
+ UpgradeCallsToIntrinsic(&F);
// Some types could be renamed during loading if several modules are
// loaded in the same LLVMContext (LTO scenario). In this case we should
// remangle intrinsics names as well.
- for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; ) {
- Function *F = &*FI++;
- if (auto Remangled = Intrinsic::remangleIntrinsicFunction(F)) {
- F->replaceAllUsesWith(Remangled.getValue());
- F->eraseFromParent();
+ for (Function &F : llvm::make_early_inc_range(*M)) {
+ if (auto Remangled = Intrinsic::remangleIntrinsicFunction(&F)) {
+ F.replaceAllUsesWith(Remangled.getValue());
+ F.eraseFromParent();
}
}
@@ -605,12 +601,15 @@ bool LLParser::parseUnnamedGlobal() {
parseOptionalThreadLocal(TLM) || parseOptionalUnnamedAddr(UnnamedAddr))
return true;
- if (Lex.getKind() != lltok::kw_alias && Lex.getKind() != lltok::kw_ifunc)
+ switch (Lex.getKind()) {
+ default:
return parseGlobal(Name, NameLoc, Linkage, HasLinkage, Visibility,
DLLStorageClass, DSOLocal, TLM, UnnamedAddr);
-
- return parseIndirectSymbol(Name, NameLoc, Linkage, Visibility,
+ case lltok::kw_alias:
+ case lltok::kw_ifunc:
+ return parseAliasOrIFunc(Name, NameLoc, Linkage, Visibility,
DLLStorageClass, DSOLocal, TLM, UnnamedAddr);
+ }
}
/// parseNamedGlobal:
@@ -635,12 +634,15 @@ bool LLParser::parseNamedGlobal() {
parseOptionalThreadLocal(TLM) || parseOptionalUnnamedAddr(UnnamedAddr))
return true;
- if (Lex.getKind() != lltok::kw_alias && Lex.getKind() != lltok::kw_ifunc)
+ switch (Lex.getKind()) {
+ default:
return parseGlobal(Name, NameLoc, Linkage, HasLinkage, Visibility,
DLLStorageClass, DSOLocal, TLM, UnnamedAddr);
-
- return parseIndirectSymbol(Name, NameLoc, Linkage, Visibility,
+ case lltok::kw_alias:
+ case lltok::kw_ifunc:
+ return parseAliasOrIFunc(Name, NameLoc, Linkage, Visibility,
DLLStorageClass, DSOLocal, TLM, UnnamedAddr);
+ }
}
bool LLParser::parseComdat() {
@@ -913,25 +915,25 @@ static std::string typeComparisonErrorMessage(StringRef Message, Type *Ty1,
return ErrOS.str();
}
-/// parseIndirectSymbol:
+/// parseAliasOrIFunc:
/// ::= GlobalVar '=' OptionalLinkage OptionalPreemptionSpecifier
/// OptionalVisibility OptionalDLLStorageClass
/// OptionalThreadLocal OptionalUnnamedAddr
-/// 'alias|ifunc' IndirectSymbol IndirectSymbolAttr*
+/// 'alias|ifunc' AliaseeOrResolver SymbolAttrs*
///
-/// IndirectSymbol
+/// AliaseeOrResolver
/// ::= TypeAndValue
///
-/// IndirectSymbolAttr
+/// SymbolAttrs
/// ::= ',' 'partition' StringConstant
///
/// Everything through OptionalUnnamedAddr has already been parsed.
///
-bool LLParser::parseIndirectSymbol(const std::string &Name, LocTy NameLoc,
- unsigned L, unsigned Visibility,
- unsigned DLLStorageClass, bool DSOLocal,
- GlobalVariable::ThreadLocalMode TLM,
- GlobalVariable::UnnamedAddr UnnamedAddr) {
+bool LLParser::parseAliasOrIFunc(const std::string &Name, LocTy NameLoc,
+ unsigned L, unsigned Visibility,
+ unsigned DLLStorageClass, bool DSOLocal,
+ GlobalVariable::ThreadLocalMode TLM,
+ GlobalVariable::UnnamedAddr UnnamedAddr) {
bool IsAlias;
if (Lex.getKind() == lltok::kw_alias)
IsAlias = true;
@@ -1013,21 +1015,26 @@ bool LLParser::parseIndirectSymbol(const std::string &Name, LocTy NameLoc,
}
}
- // Okay, create the alias but do not insert it into the module yet.
- std::unique_ptr<GlobalIndirectSymbol> GA;
- if (IsAlias)
+ // Okay, create the alias/ifunc but do not insert it into the module yet.
+ std::unique_ptr<GlobalAlias> GA;
+ std::unique_ptr<GlobalIFunc> GI;
+ GlobalValue *GV;
+ if (IsAlias) {
GA.reset(GlobalAlias::create(Ty, AddrSpace,
(GlobalValue::LinkageTypes)Linkage, Name,
Aliasee, /*Parent*/ nullptr));
- else
- GA.reset(GlobalIFunc::create(Ty, AddrSpace,
+ GV = GA.get();
+ } else {
+ GI.reset(GlobalIFunc::create(Ty, AddrSpace,
(GlobalValue::LinkageTypes)Linkage, Name,
Aliasee, /*Parent*/ nullptr));
- GA->setThreadLocalMode(TLM);
- GA->setVisibility((GlobalValue::VisibilityTypes)Visibility);
- GA->setDLLStorageClass((GlobalValue::DLLStorageClassTypes)DLLStorageClass);
- GA->setUnnamedAddr(UnnamedAddr);
- maybeSetDSOLocal(DSOLocal, *GA);
+ GV = GI.get();
+ }
+ GV->setThreadLocalMode(TLM);
+ GV->setVisibility((GlobalValue::VisibilityTypes)Visibility);
+ GV->setDLLStorageClass((GlobalValue::DLLStorageClassTypes)DLLStorageClass);
+ GV->setUnnamedAddr(UnnamedAddr);
+ maybeSetDSOLocal(DSOLocal, *GV);
// At this point we've parsed everything except for the IndirectSymbolAttrs.
// Now parse them if there are any.
@@ -1036,7 +1043,7 @@ bool LLParser::parseIndirectSymbol(const std::string &Name, LocTy NameLoc,
if (Lex.getKind() == lltok::kw_partition) {
Lex.Lex();
- GA->setPartition(Lex.getStrVal());
+ GV->setPartition(Lex.getStrVal());
if (parseToken(lltok::StringConstant, "expected partition string"))
return true;
} else {
@@ -1045,30 +1052,27 @@ bool LLParser::parseIndirectSymbol(const std::string &Name, LocTy NameLoc,
}
if (Name.empty())
- NumberedVals.push_back(GA.get());
+ NumberedVals.push_back(GV);
if (GVal) {
// Verify that types agree.
- if (GVal->getType() != GA->getType())
+ if (GVal->getType() != GV->getType())
return error(
ExplicitTypeLoc,
"forward reference and definition of alias have different types");
// If they agree, just RAUW the old value with the alias and remove the
// forward ref info.
- GVal->replaceAllUsesWith(GA.get());
+ GVal->replaceAllUsesWith(GV);
GVal->eraseFromParent();
}
// Insert into the module, we know its name won't collide now.
if (IsAlias)
- M->getAliasList().push_back(cast<GlobalAlias>(GA.get()));
+ M->getAliasList().push_back(GA.release());
else
- M->getIFuncList().push_back(cast<GlobalIFunc>(GA.get()));
- assert(GA->getName() == Name && "Should not be a name conflict!");
-
- // The module owns this now
- GA.release();
+ M->getIFuncList().push_back(GI.release());
+ assert(GV->getName() == Name && "Should not be a name conflict!");
return false;
}
@@ -1408,14 +1412,10 @@ static inline GlobalValue *createGlobalFwdRef(Module *M, PointerType *PTy) {
}
Value *LLParser::checkValidVariableType(LocTy Loc, const Twine &Name, Type *Ty,
- Value *Val, bool IsCall) {
+ Value *Val) {
Type *ValTy = Val->getType();
if (ValTy == Ty)
return Val;
- // For calls, we also allow opaque pointers.
- if (IsCall && ValTy == PointerType::get(Ty->getContext(),
- Ty->getPointerAddressSpace()))
- return Val;
if (Ty->isLabelTy())
error(Loc, "'" + Name + "' is not a basic block");
else
@@ -1429,7 +1429,7 @@ Value *LLParser::checkValidVariableType(LocTy Loc, const Twine &Name, Type *Ty,
/// forward reference record if needed. This can return null if the value
/// exists but does not have the right type.
GlobalValue *LLParser::getGlobalVal(const std::string &Name, Type *Ty,
- LocTy Loc, bool IsCall) {
+ LocTy Loc) {
PointerType *PTy = dyn_cast<PointerType>(Ty);
if (!PTy) {
error(Loc, "global variable reference must have pointer type");
@@ -1451,7 +1451,7 @@ GlobalValue *LLParser::getGlobalVal(const std::string &Name, Type *Ty,
// If we have the value in the symbol table or fwd-ref table, return it.
if (Val)
return cast_or_null<GlobalValue>(
- checkValidVariableType(Loc, "@" + Name, Ty, Val, IsCall));
+ checkValidVariableType(Loc, "@" + Name, Ty, Val));
// Otherwise, create a new forward reference for this value and remember it.
GlobalValue *FwdVal = createGlobalFwdRef(M, PTy);
@@ -1459,8 +1459,7 @@ GlobalValue *LLParser::getGlobalVal(const std::string &Name, Type *Ty,
return FwdVal;
}
-GlobalValue *LLParser::getGlobalVal(unsigned ID, Type *Ty, LocTy Loc,
- bool IsCall) {
+GlobalValue *LLParser::getGlobalVal(unsigned ID, Type *Ty, LocTy Loc) {
PointerType *PTy = dyn_cast<PointerType>(Ty);
if (!PTy) {
error(Loc, "global variable reference must have pointer type");
@@ -1480,7 +1479,7 @@ GlobalValue *LLParser::getGlobalVal(unsigned ID, Type *Ty, LocTy Loc,
// If we have the value in the symbol table or fwd-ref table, return it.
if (Val)
return cast_or_null<GlobalValue>(
- checkValidVariableType(Loc, "@" + Twine(ID), Ty, Val, IsCall));
+ checkValidVariableType(Loc, "@" + Twine(ID), Ty, Val));
// Otherwise, create a new forward reference for this value and remember it.
GlobalValue *FwdVal = createGlobalFwdRef(M, PTy);
@@ -1936,7 +1935,7 @@ bool LLParser::parseOptionalAlignment(MaybeAlign &Alignment, bool AllowParens) {
if (!EatIfPresent(lltok::kw_align))
return false;
LocTy AlignLoc = Lex.getLoc();
- uint32_t Value = 0;
+ uint64_t Value = 0;
LocTy ParenLoc = Lex.getLoc();
bool HaveParens = false;
@@ -1945,13 +1944,13 @@ bool LLParser::parseOptionalAlignment(MaybeAlign &Alignment, bool AllowParens) {
HaveParens = true;
}
- if (parseUInt32(Value))
+ if (parseUInt64(Value))
return true;
if (HaveParens && !EatIfPresent(lltok::rparen))
return error(ParenLoc, "expected ')'");
- if (!isPowerOf2_32(Value))
+ if (!isPowerOf2_64(Value))
return error(AlignLoc, "alignment is not a power of two");
if (Value > Value::MaximumAlignment)
return error(AlignLoc, "huge alignments are not supported yet");
@@ -2221,6 +2220,26 @@ bool LLParser::parseType(Type *&Result, const Twine &Msg, bool AllowVoid) {
// Type ::= 'float' | 'void' (etc)
Result = Lex.getTyVal();
Lex.Lex();
+
+ // Handle "ptr" opaque pointer type.
+ //
+ // Type ::= ptr ('addrspace' '(' uint32 ')')?
+ if (Result->isOpaquePointerTy()) {
+ unsigned AddrSpace;
+ if (parseOptionalAddrSpace(AddrSpace))
+ return true;
+ Result = PointerType::get(getContext(), AddrSpace);
+
+ // Give a nice error for 'ptr*'.
+ if (Lex.getKind() == lltok::star)
+ return tokError("ptr* is invalid - use ptr instead");
+
+ // Fall through to parsing the type suffixes only if this 'ptr' is a
+ // function return. Otherwise, return success, implicitly rejecting other
+ // suffixes.
+ if (Lex.getKind() != lltok::lparen)
+ return false;
+ }
break;
case lltok::lbrace:
// Type ::= StructType
@@ -2274,26 +2293,6 @@ bool LLParser::parseType(Type *&Result, const Twine &Msg, bool AllowVoid) {
}
}
- // Handle (explicit) opaque pointer types (not --force-opaque-pointers).
- //
- // Type ::= ptr ('addrspace' '(' uint32 ')')?
- if (Result->isOpaquePointerTy()) {
- unsigned AddrSpace;
- if (parseOptionalAddrSpace(AddrSpace))
- return true;
- Result = PointerType::get(getContext(), AddrSpace);
-
- // Give a nice error for 'ptr*'.
- if (Lex.getKind() == lltok::star)
- return tokError("ptr* is invalid - use ptr instead");
-
- // Fall through to parsing the type suffixes only if this 'ptr' is a
- // function return. Otherwise, return success, implicitly rejecting other
- // suffixes.
- if (Lex.getKind() != lltok::lparen)
- return false;
- }
-
// parse the type suffixes.
while (true) {
switch (Lex.getKind()) {
@@ -2798,7 +2797,7 @@ bool LLParser::PerFunctionState::finishFunction() {
/// forward reference record if needed. This can return null if the value
/// exists but does not have the right type.
Value *LLParser::PerFunctionState::getVal(const std::string &Name, Type *Ty,
- LocTy Loc, bool IsCall) {
+ LocTy Loc) {
// Look this name up in the normal function symbol table.
Value *Val = F.getValueSymbolTable()->lookup(Name);
@@ -2812,7 +2811,7 @@ Value *LLParser::PerFunctionState::getVal(const std::string &Name, Type *Ty,
// If we have the value in the symbol table or fwd-ref table, return it.
if (Val)
- return P.checkValidVariableType(Loc, "%" + Name, Ty, Val, IsCall);
+ return P.checkValidVariableType(Loc, "%" + Name, Ty, Val);
// Don't make placeholders with invalid type.
if (!Ty->isFirstClassType()) {
@@ -2832,8 +2831,7 @@ Value *LLParser::PerFunctionState::getVal(const std::string &Name, Type *Ty,
return FwdVal;
}
-Value *LLParser::PerFunctionState::getVal(unsigned ID, Type *Ty, LocTy Loc,
- bool IsCall) {
+Value *LLParser::PerFunctionState::getVal(unsigned ID, Type *Ty, LocTy Loc) {
// Look this name up in the normal function symbol table.
Value *Val = ID < NumberedVals.size() ? NumberedVals[ID] : nullptr;
@@ -2847,7 +2845,7 @@ Value *LLParser::PerFunctionState::getVal(unsigned ID, Type *Ty, LocTy Loc,
// If we have the value in the symbol table or fwd-ref table, return it.
if (Val)
- return P.checkValidVariableType(Loc, "%" + Twine(ID), Ty, Val, IsCall);
+ return P.checkValidVariableType(Loc, "%" + Twine(ID), Ty, Val);
if (!Ty->isFirstClassType()) {
P.error(Loc, "invalid use of a non-first-class type");
@@ -2934,12 +2932,12 @@ bool LLParser::PerFunctionState::setInstName(int NameID,
BasicBlock *LLParser::PerFunctionState::getBB(const std::string &Name,
LocTy Loc) {
return dyn_cast_or_null<BasicBlock>(
- getVal(Name, Type::getLabelTy(F.getContext()), Loc, /*IsCall=*/false));
+ getVal(Name, Type::getLabelTy(F.getContext()), Loc));
}
BasicBlock *LLParser::PerFunctionState::getBB(unsigned ID, LocTy Loc) {
return dyn_cast_or_null<BasicBlock>(
- getVal(ID, Type::getLabelTy(F.getContext()), Loc, /*IsCall=*/false));
+ getVal(ID, Type::getLabelTy(F.getContext()), Loc));
}
/// defineBB - Define the specified basic block, which is either named or
@@ -3652,7 +3650,7 @@ bool LLParser::parseGlobalValue(Type *Ty, Constant *&C) {
ValID ID;
Value *V = nullptr;
bool Parsed = parseValID(ID, /*PFS=*/nullptr, Ty) ||
- convertValIDToValue(Ty, ID, V, nullptr, /*IsCall=*/false);
+ convertValIDToValue(Ty, ID, V, nullptr);
if (V && !(C = dyn_cast<Constant>(V)))
return error(ID.Loc, "global values must be constants");
return Parsed;
@@ -3876,10 +3874,6 @@ struct MDField : public MDFieldImpl<Metadata *> {
MDField(bool AllowNull = true) : ImplTy(nullptr), AllowNull(AllowNull) {}
};
-struct MDConstant : public MDFieldImpl<ConstantAsMetadata *> {
- MDConstant() : ImplTy(nullptr) {}
-};
-
struct MDStringField : public MDFieldImpl<MDString *> {
bool AllowEmpty;
MDStringField(bool AllowEmpty = true)
@@ -3914,22 +3908,6 @@ struct MDSignedOrMDField : MDEitherFieldImpl<MDSignedField, MDField> {
}
};
-struct MDSignedOrUnsignedField
- : MDEitherFieldImpl<MDSignedField, MDUnsignedField> {
- MDSignedOrUnsignedField() : ImplTy(MDSignedField(0), MDUnsignedField(0)) {}
-
- bool isMDSignedField() const { return WhatIs == IsTypeA; }
- bool isMDUnsignedField() const { return WhatIs == IsTypeB; }
- int64_t getMDSignedValue() const {
- assert(isMDSignedField() && "Wrong field type");
- return A.Val;
- }
- uint64_t getMDUnsignedValue() const {
- assert(isMDUnsignedField() && "Wrong field type");
- return B.Val;
- }
-};
-
} // end anonymous namespace
namespace llvm {
@@ -4578,7 +4556,8 @@ bool LLParser::parseDIDerivedType(MDNode *&Result, bool IsDistinct) {
OPTIONAL(offset, MDUnsignedField, (0, UINT64_MAX)); \
OPTIONAL(flags, DIFlagField, ); \
OPTIONAL(extraData, MDField, ); \
- OPTIONAL(dwarfAddressSpace, MDUnsignedField, (UINT32_MAX, UINT32_MAX));
+ OPTIONAL(dwarfAddressSpace, MDUnsignedField, (UINT32_MAX, UINT32_MAX)); \
+ OPTIONAL(annotations, MDField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
@@ -4590,7 +4569,7 @@ bool LLParser::parseDIDerivedType(MDNode *&Result, bool IsDistinct) {
(Context, tag.Val, name.Val, file.Val, line.Val,
scope.Val, baseType.Val, size.Val, align.Val,
offset.Val, DWARFAddressSpace, flags.Val,
- extraData.Val));
+ extraData.Val, annotations.Val));
return false;
}
@@ -4615,7 +4594,8 @@ bool LLParser::parseDICompositeType(MDNode *&Result, bool IsDistinct) {
OPTIONAL(dataLocation, MDField, ); \
OPTIONAL(associated, MDField, ); \
OPTIONAL(allocated, MDField, ); \
- OPTIONAL(rank, MDSignedOrMDField, );
+ OPTIONAL(rank, MDSignedOrMDField, ); \
+ OPTIONAL(annotations, MDField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
@@ -4633,7 +4613,7 @@ bool LLParser::parseDICompositeType(MDNode *&Result, bool IsDistinct) {
scope.Val, baseType.Val, size.Val, align.Val, offset.Val, flags.Val,
elements.Val, runtimeLang.Val, vtableHolder.Val, templateParams.Val,
discriminator.Val, dataLocation.Val, associated.Val, allocated.Val,
- Rank)) {
+ Rank, annotations.Val)) {
Result = CT;
return false;
}
@@ -4645,8 +4625,8 @@ bool LLParser::parseDICompositeType(MDNode *&Result, bool IsDistinct) {
(Context, tag.Val, name.Val, file.Val, line.Val, scope.Val, baseType.Val,
size.Val, align.Val, offset.Val, flags.Val, elements.Val,
runtimeLang.Val, vtableHolder.Val, templateParams.Val, identifier.Val,
- discriminator.Val, dataLocation.Val, associated.Val, allocated.Val,
- Rank));
+ discriminator.Val, dataLocation.Val, associated.Val, allocated.Val, Rank,
+ annotations.Val));
return false;
}
@@ -4746,7 +4726,8 @@ bool LLParser::parseDICompileUnit(MDNode *&Result, bool IsDistinct) {
/// virtuality: DW_VIRTUALITY_pure_virtual,
/// virtualIndex: 10, thisAdjustment: 4, flags: 11,
/// spFlags: 10, isOptimized: false, templateParams: !4,
-/// declaration: !5, retainedNodes: !6, thrownTypes: !7)
+/// declaration: !5, retainedNodes: !6, thrownTypes: !7,
+/// annotations: !8)
bool LLParser::parseDISubprogram(MDNode *&Result, bool IsDistinct) {
auto Loc = Lex.getLoc();
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
@@ -4770,7 +4751,8 @@ bool LLParser::parseDISubprogram(MDNode *&Result, bool IsDistinct) {
OPTIONAL(templateParams, MDField, ); \
OPTIONAL(declaration, MDField, ); \
OPTIONAL(retainedNodes, MDField, ); \
- OPTIONAL(thrownTypes, MDField, );
+ OPTIONAL(thrownTypes, MDField, ); \
+ OPTIONAL(annotations, MDField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
@@ -4789,7 +4771,7 @@ bool LLParser::parseDISubprogram(MDNode *&Result, bool IsDistinct) {
(Context, scope.Val, name.Val, linkageName.Val, file.Val, line.Val,
type.Val, scopeLine.Val, containingType.Val, virtualIndex.Val,
thisAdjustment.Val, flags.Val, SPFlags, unit.Val, templateParams.Val,
- declaration.Val, retainedNodes.Val, thrownTypes.Val));
+ declaration.Val, retainedNodes.Val, thrownTypes.Val, annotations.Val));
return false;
}
@@ -4966,7 +4948,8 @@ bool LLParser::parseDIGlobalVariable(MDNode *&Result, bool IsDistinct) {
OPTIONAL(isDefinition, MDBoolField, (true)); \
OPTIONAL(templateParams, MDField, ); \
OPTIONAL(declaration, MDField, ); \
- OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX));
+ OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX)); \
+ OPTIONAL(annotations, MDField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
@@ -4974,7 +4957,8 @@ bool LLParser::parseDIGlobalVariable(MDNode *&Result, bool IsDistinct) {
GET_OR_DISTINCT(DIGlobalVariable,
(Context, scope.Val, name.Val, linkageName.Val, file.Val,
line.Val, type.Val, isLocal.Val, isDefinition.Val,
- declaration.Val, templateParams.Val, align.Val));
+ declaration.Val, templateParams.Val, align.Val,
+ annotations.Val));
return false;
}
@@ -4994,13 +4978,15 @@ bool LLParser::parseDILocalVariable(MDNode *&Result, bool IsDistinct) {
OPTIONAL(line, LineField, ); \
OPTIONAL(type, MDField, ); \
OPTIONAL(flags, DIFlagField, ); \
- OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX));
+ OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX)); \
+ OPTIONAL(annotations, MDField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
Result = GET_OR_DISTINCT(DILocalVariable,
(Context, scope.Val, name.Val, file.Val, line.Val,
- type.Val, arg.Val, flags.Val, align.Val));
+ type.Val, arg.Val, flags.Val, align.Val,
+ annotations.Val));
return false;
}
@@ -5136,7 +5122,7 @@ bool LLParser::parseDIObjCProperty(MDNode *&Result, bool IsDistinct) {
/// parseDIImportedEntity:
/// ::= !DIImportedEntity(tag: DW_TAG_imported_module, scope: !0, entity: !1,
-/// line: 7, name: "foo")
+/// line: 7, name: "foo", elements: !2)
bool LLParser::parseDIImportedEntity(MDNode *&Result, bool IsDistinct) {
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
REQUIRED(tag, DwarfTagField, ); \
@@ -5144,13 +5130,14 @@ bool LLParser::parseDIImportedEntity(MDNode *&Result, bool IsDistinct) {
OPTIONAL(entity, MDField, ); \
OPTIONAL(file, MDField, ); \
OPTIONAL(line, LineField, ); \
- OPTIONAL(name, MDStringField, );
+ OPTIONAL(name, MDStringField, ); \
+ OPTIONAL(elements, MDField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
- Result = GET_OR_DISTINCT(
- DIImportedEntity,
- (Context, tag.Val, scope.Val, entity.Val, file.Val, line.Val, name.Val));
+ Result = GET_OR_DISTINCT(DIImportedEntity,
+ (Context, tag.Val, scope.Val, entity.Val, file.Val,
+ line.Val, name.Val, elements.Val));
return false;
}
@@ -5254,7 +5241,7 @@ bool LLParser::parseMetadata(Metadata *&MD, PerFunctionState *PFS) {
//===----------------------------------------------------------------------===//
bool LLParser::convertValIDToValue(Type *Ty, ValID &ID, Value *&V,
- PerFunctionState *PFS, bool IsCall) {
+ PerFunctionState *PFS) {
if (Ty->isFunctionTy())
return error(ID.Loc, "functions are not values, refer to them as pointers");
@@ -5262,12 +5249,12 @@ bool LLParser::convertValIDToValue(Type *Ty, ValID &ID, Value *&V,
case ValID::t_LocalID:
if (!PFS)
return error(ID.Loc, "invalid use of function-local name");
- V = PFS->getVal(ID.UIntVal, Ty, ID.Loc, IsCall);
+ V = PFS->getVal(ID.UIntVal, Ty, ID.Loc);
return V == nullptr;
case ValID::t_LocalName:
if (!PFS)
return error(ID.Loc, "invalid use of function-local name");
- V = PFS->getVal(ID.StrVal, Ty, ID.Loc, IsCall);
+ V = PFS->getVal(ID.StrVal, Ty, ID.Loc);
return V == nullptr;
case ValID::t_InlineAsm: {
if (!ID.FTy || !InlineAsm::Verify(ID.FTy, ID.StrVal2))
@@ -5278,10 +5265,10 @@ bool LLParser::convertValIDToValue(Type *Ty, ValID &ID, Value *&V,
return false;
}
case ValID::t_GlobalName:
- V = getGlobalVal(ID.StrVal, Ty, ID.Loc, IsCall);
+ V = getGlobalVal(ID.StrVal, Ty, ID.Loc);
return V == nullptr;
case ValID::t_GlobalID:
- V = getGlobalVal(ID.UIntVal, Ty, ID.Loc, IsCall);
+ V = getGlobalVal(ID.UIntVal, Ty, ID.Loc);
return V == nullptr;
case ValID::t_APSInt:
if (!Ty->isIntegerTy())
@@ -5405,7 +5392,7 @@ bool LLParser::parseConstantValue(Type *Ty, Constant *&C) {
case ValID::t_ConstantStruct:
case ValID::t_PackedConstantStruct: {
Value *V;
- if (convertValIDToValue(Ty, ID, V, /*PFS=*/nullptr, /*IsCall=*/false))
+ if (convertValIDToValue(Ty, ID, V, /*PFS=*/nullptr))
return true;
assert(isa<Constant>(V) && "Expected a constant value");
C = cast<Constant>(V);
@@ -5423,7 +5410,7 @@ bool LLParser::parseValue(Type *Ty, Value *&V, PerFunctionState *PFS) {
V = nullptr;
ValID ID;
return parseValID(ID, PFS, Ty) ||
- convertValIDToValue(Ty, ID, V, PFS, /*IsCall=*/false);
+ convertValIDToValue(Ty, ID, V, PFS);
}
bool LLParser::parseTypeAndValue(Value *&V, PerFunctionState *PFS) {
@@ -5571,7 +5558,7 @@ bool LLParser::parseFunctionHeader(Function *&Fn, bool IsDefine) {
AttributeList::get(Context, AttributeSet::get(Context, FuncAttrs),
AttributeSet::get(Context, RetAttrs), Attrs);
- if (PAL.hasAttribute(1, Attribute::StructRet) && !RetType->isVoidTy())
+ if (PAL.hasParamAttr(0, Attribute::StructRet) && !RetType->isVoidTy())
return error(RetTypeLoc, "functions with 'sret' argument must return void");
FunctionType *FT = FunctionType::get(RetType, ParamTypeList, IsVarArg);
@@ -5718,7 +5705,7 @@ bool LLParser::PerFunctionState::resolveForwardRefBlockAddresses() {
Value *ResolvedVal = BlockAddress::get(&F, BB);
ResolvedVal = P.checkValidVariableType(BBID.Loc, BBID.StrVal, GV->getType(),
- ResolvedVal, false);
+ ResolvedVal);
if (!ResolvedVal)
return true;
GV->replaceAllUsesWith(ResolvedVal);
@@ -6287,7 +6274,7 @@ bool LLParser::parseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
// Look up the callee.
Value *Callee;
if (convertValIDToValue(PointerType::get(Ty, InvokeAddrSpace), CalleeID,
- Callee, &PFS, /*IsCall=*/true))
+ Callee, &PFS))
return true;
// Set up the Attribute for the function.
@@ -6612,8 +6599,7 @@ bool LLParser::parseCallBr(Instruction *&Inst, PerFunctionState &PFS) {
// Look up the callee.
Value *Callee;
- if (convertValIDToValue(PointerType::getUnqual(Ty), CalleeID, Callee, &PFS,
- /*IsCall=*/true))
+ if (convertValIDToValue(PointerType::getUnqual(Ty), CalleeID, Callee, &PFS))
return true;
// Set up the Attribute for the function.
@@ -7019,7 +7005,7 @@ bool LLParser::parseCall(Instruction *&Inst, PerFunctionState &PFS,
// Look up the callee.
Value *Callee;
if (convertValIDToValue(PointerType::get(Ty, CallAddrSpace), CalleeID, Callee,
- &PFS, /*IsCall=*/true))
+ &PFS))
return true;
// Set up the Attribute for the function.
@@ -8543,12 +8529,15 @@ bool LLParser::parseFlag(unsigned &Val) {
/// [',' 'returnDoesNotAlias' ':' Flag]? ')'
/// [',' 'noInline' ':' Flag]? ')'
/// [',' 'alwaysInline' ':' Flag]? ')'
+/// [',' 'noUnwind' ':' Flag]? ')'
+/// [',' 'mayThrow' ':' Flag]? ')'
+/// [',' 'hasUnknownCall' ':' Flag]? ')'
bool LLParser::parseOptionalFFlags(FunctionSummary::FFlags &FFlags) {
assert(Lex.getKind() == lltok::kw_funcFlags);
Lex.Lex();
- if (parseToken(lltok::colon, "expected ':' in funcFlags") |
+ if (parseToken(lltok::colon, "expected ':' in funcFlags") ||
parseToken(lltok::lparen, "expected '(' in funcFlags"))
return true;
@@ -8591,6 +8580,24 @@ bool LLParser::parseOptionalFFlags(FunctionSummary::FFlags &FFlags) {
return true;
FFlags.AlwaysInline = Val;
break;
+ case lltok::kw_noUnwind:
+ Lex.Lex();
+ if (parseToken(lltok::colon, "expected ':'") || parseFlag(Val))
+ return true;
+ FFlags.NoUnwind = Val;
+ break;
+ case lltok::kw_mayThrow:
+ Lex.Lex();
+ if (parseToken(lltok::colon, "expected ':'") || parseFlag(Val))
+ return true;
+ FFlags.MayThrow = Val;
+ break;
+ case lltok::kw_hasUnknownCall:
+ Lex.Lex();
+ if (parseToken(lltok::colon, "expected ':'") || parseFlag(Val))
+ return true;
+ FFlags.HasUnknownCall = Val;
+ break;
default:
return error(Lex.getLoc(), "expected function flag type");
}
@@ -8610,7 +8617,7 @@ bool LLParser::parseOptionalCalls(std::vector<FunctionSummary::EdgeTy> &Calls) {
assert(Lex.getKind() == lltok::kw_calls);
Lex.Lex();
- if (parseToken(lltok::colon, "expected ':' in calls") |
+ if (parseToken(lltok::colon, "expected ':' in calls") ||
parseToken(lltok::lparen, "expected '(' in calls"))
return true;
@@ -8702,7 +8709,7 @@ bool LLParser::parseOptionalVTableFuncs(VTableFuncList &VTableFuncs) {
assert(Lex.getKind() == lltok::kw_vTableFuncs);
Lex.Lex();
- if (parseToken(lltok::colon, "expected ':' in vTableFuncs") |
+ if (parseToken(lltok::colon, "expected ':' in vTableFuncs") ||
parseToken(lltok::lparen, "expected '(' in vTableFuncs"))
return true;
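
The funcFlags, calls, and vTableFuncs hunks above switch the chained parseToken calls from '|' to '||'. A minimal standalone sketch of why that matters, using a hypothetical parseToken stand-in rather than the real LLParser helper: with bitwise '|' both calls always run, so the second check keeps consuming input and can emit a misleading second diagnostic after the first check has already failed, while '||' short-circuits.

// sketch.cpp -- hypothetical stand-in, not LLParser code
#include <cstdio>

// Returns true on error, matching the parser convention in the hunks above.
static bool parseToken(bool Present, const char *Msg) {
  if (!Present) {
    std::fprintf(stderr, "error: %s\n", Msg);
    return true;
  }
  return false;
}

static bool parseHeader(bool HasColon, bool HasParen) {
  // '||' short-circuits: once the ':' check fails, the '(' check never runs.
  return parseToken(HasColon, "expected ':'") ||
         parseToken(HasParen, "expected '('");
}

int main() {
  // Missing ':' -> exactly one diagnostic, then bail out.
  return parseHeader(/*HasColon=*/false, /*HasParen=*/true) ? 1 : 0;
}
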
diff --git a/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp b/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp
index 1d9c81ef8ebc..3de3dccce0c6 100644
--- a/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp
+++ b/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp
@@ -1,9 +1,8 @@
//===-- MsgPackDocumentYAML.cpp - MsgPack Document YAML interface -------*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index f577d3886e01..2723105b092f 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -529,10 +529,9 @@ Error BitcodeAnalyzer::decodeMetadataStringsBlob(StringRef Indent,
if (R.AtEndOfStream())
return reportError("bad length");
- Expected<uint32_t> MaybeSize = R.ReadVBR(6);
- if (!MaybeSize)
- return MaybeSize.takeError();
- uint32_t Size = MaybeSize.get();
+ uint32_t Size;
+ if (Error E = R.ReadVBR(6).moveInto(Size))
+ return E;
if (Strings.size() < Size)
return reportError("truncated chars");
@@ -555,11 +554,8 @@ BitcodeAnalyzer::BitcodeAnalyzer(StringRef Buffer,
Error BitcodeAnalyzer::analyze(Optional<BCDumpOptions> O,
Optional<StringRef> CheckHash) {
- Expected<CurStreamTypeType> MaybeType = analyzeHeader(O, Stream);
- if (!MaybeType)
- return MaybeType.takeError();
- else
- CurStreamType = *MaybeType;
+ if (Error E = analyzeHeader(O, Stream).moveInto(CurStreamType))
+ return E;
Stream.setBlockInfo(&BlockInfo);
@@ -567,9 +563,8 @@ Error BitcodeAnalyzer::analyze(Optional<BCDumpOptions> O,
// The block info must be a top-level block.
if (BlockInfoStream) {
BitstreamCursor BlockInfoCursor(*BlockInfoStream);
- Expected<CurStreamTypeType> H = analyzeHeader(O, BlockInfoCursor);
- if (!H)
- return H.takeError();
+ if (Error E = analyzeHeader(O, BlockInfoCursor).takeError())
+ return E;
while (!BlockInfoCursor.AtEndOfStream()) {
Expected<unsigned> MaybeCode = BlockInfoCursor.ReadCode();
@@ -582,12 +577,11 @@ Error BitcodeAnalyzer::analyze(Optional<BCDumpOptions> O,
if (!MaybeBlockID)
return MaybeBlockID.takeError();
if (MaybeBlockID.get() == bitc::BLOCKINFO_BLOCK_ID) {
- Expected<Optional<BitstreamBlockInfo>> MaybeNewBlockInfo =
- BlockInfoCursor.ReadBlockInfoBlock(/*ReadBlockInfoNames=*/true);
- if (!MaybeNewBlockInfo)
- return MaybeNewBlockInfo.takeError();
- Optional<BitstreamBlockInfo> NewBlockInfo =
- std::move(MaybeNewBlockInfo.get());
+ Optional<BitstreamBlockInfo> NewBlockInfo;
+ if (Error E =
+ BlockInfoCursor.ReadBlockInfoBlock(/*ReadBlockInfoNames=*/true)
+ .moveInto(NewBlockInfo))
+ return E;
if (!NewBlockInfo)
return reportError("Malformed BlockInfoBlock in block info file");
BlockInfo = std::move(*NewBlockInfo);
@@ -744,22 +738,20 @@ Error BitcodeAnalyzer::parseBlock(unsigned BlockID, unsigned IndentLevel,
// BLOCKINFO is a special part of the stream.
bool DumpRecords = O.hasValue();
if (BlockID == bitc::BLOCKINFO_BLOCK_ID) {
- if (O)
+ if (O && !O->DumpBlockinfo)
O->OS << Indent << "<BLOCKINFO_BLOCK/>\n";
- Expected<Optional<BitstreamBlockInfo>> MaybeNewBlockInfo =
- Stream.ReadBlockInfoBlock(/*ReadBlockInfoNames=*/true);
- if (!MaybeNewBlockInfo)
- return MaybeNewBlockInfo.takeError();
- Optional<BitstreamBlockInfo> NewBlockInfo =
- std::move(MaybeNewBlockInfo.get());
+ Optional<BitstreamBlockInfo> NewBlockInfo;
+ if (Error E = Stream.ReadBlockInfoBlock(/*ReadBlockInfoNames=*/true)
+ .moveInto(NewBlockInfo))
+ return E;
if (!NewBlockInfo)
return reportError("Malformed BlockInfoBlock");
BlockInfo = std::move(*NewBlockInfo);
if (Error Err = Stream.JumpToBit(BlockBitStart))
return Err;
// It's not really interesting to dump the contents of the blockinfo
- // block.
- DumpRecords = false;
+ // block, so only do it if the user explicitly requests it.
+ DumpRecords = O && O->DumpBlockinfo;
}
unsigned NumWords = 0;
@@ -796,11 +788,10 @@ Error BitcodeAnalyzer::parseBlock(unsigned BlockID, unsigned IndentLevel,
uint64_t RecordStartBit = Stream.GetCurrentBitNo();
- Expected<BitstreamEntry> MaybeEntry =
- Stream.advance(BitstreamCursor::AF_DontAutoprocessAbbrevs);
- if (!MaybeEntry)
- return MaybeEntry.takeError();
- BitstreamEntry Entry = MaybeEntry.get();
+ BitstreamEntry Entry;
+ if (Error E = Stream.advance(BitstreamCursor::AF_DontAutoprocessAbbrevs)
+ .moveInto(Entry))
+ return E;
switch (Entry.Kind) {
case BitstreamEntry::Error:
@@ -847,10 +838,9 @@ Error BitcodeAnalyzer::parseBlock(unsigned BlockID, unsigned IndentLevel,
StringRef Blob;
uint64_t CurrentRecordPos = Stream.GetCurrentBitNo();
- Expected<unsigned> MaybeCode = Stream.readRecord(Entry.ID, Record, &Blob);
- if (!MaybeCode)
- return MaybeCode.takeError();
- unsigned Code = MaybeCode.get();
+ unsigned Code;
+ if (Error E = Stream.readRecord(Entry.ID, Record, &Blob).moveInto(Code))
+ return E;
// Increment the # occurrences of this code.
if (BlockStats.CodeFreq.size() <= Code)
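
Most of the BitcodeAnalyzer hunks above collapse the usual Expected<T> unwrapping (check, takeError, get) into a single Expected<T>::moveInto call. A minimal sketch of that pattern, assuming only LLVM's Support headers; readLength is a made-up producer, not a Bitstream API.

// moveinto_sketch.cpp -- illustrative only, links against LLVMSupport
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
using namespace llvm;

// Made-up producer that either yields a length or fails.
static Expected<uint32_t> readLength(bool Ok) {
  if (!Ok)
    return createStringError(inconvertibleErrorCode(), "bad length");
  return 42;
}

static Error consumeLength(bool Ok) {
  uint32_t Size;
  // moveInto() assigns the value on success and hands back the Error on
  // failure, replacing the Maybe / !Maybe / takeError() / get() sequence.
  if (Error E = readLength(Ok).moveInto(Size))
    return E;
  // ... use Size ...
  (void)Size;
  return Error::success();
}

int main() {
  if (Error E = consumeLength(/*Ok=*/false)) {
    logAllUnhandledErrors(std::move(E), errs(), "demo: ");
    return 1;
  }
  return 0;
}
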
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index d5e366c21f7d..c568461e62b0 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -41,7 +41,6 @@
#include "llvm/IR/GVMaterializer.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalIFunc.h"
-#include "llvm/IR/GlobalIndirectSymbol.h"
#include "llvm/IR/GlobalObject.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
@@ -180,10 +179,8 @@ static Expected<std::string> readIdentificationBlock(BitstreamCursor &Stream) {
while (true) {
BitstreamEntry Entry;
- if (Expected<BitstreamEntry> Res = Stream.advance())
- Entry = Res.get();
- else
- return Res.takeError();
+ if (Error E = Stream.advance().moveInto(Entry))
+ return std::move(E);
switch (Entry.Kind) {
default:
@@ -227,10 +224,8 @@ static Expected<std::string> readIdentificationCode(BitstreamCursor &Stream) {
return "";
BitstreamEntry Entry;
- if (Expected<BitstreamEntry> Res = Stream.advance())
- Entry = std::move(Res.get());
- else
- return Res.takeError();
+ if (Error E = Stream.advance().moveInto(Entry))
+ return std::move(E);
switch (Entry.Kind) {
case BitstreamEntry::EndBlock:
@@ -246,10 +241,9 @@ static Expected<std::string> readIdentificationCode(BitstreamCursor &Stream) {
return std::move(Err);
continue;
case BitstreamEntry::Record:
- if (Expected<unsigned> Skipped = Stream.skipRecord(Entry.ID))
- continue;
- else
- return Skipped.takeError();
+ if (Error E = Stream.skipRecord(Entry.ID).takeError())
+ return std::move(E);
+ continue;
}
}
}
@@ -306,10 +300,8 @@ static Expected<bool> hasObjCCategory(BitstreamCursor &Stream) {
// need to understand them all.
while (true) {
BitstreamEntry Entry;
- if (Expected<BitstreamEntry> Res = Stream.advance())
- Entry = std::move(Res.get());
- else
- return Res.takeError();
+ if (Error E = Stream.advance().moveInto(Entry))
+ return std::move(E);
switch (Entry.Kind) {
case BitstreamEntry::Error:
@@ -327,10 +319,9 @@ static Expected<bool> hasObjCCategory(BitstreamCursor &Stream) {
continue;
case BitstreamEntry::Record:
- if (Expected<unsigned> Skipped = Stream.skipRecord(Entry.ID))
- continue;
- else
- return Skipped.takeError();
+ if (Error E = Stream.skipRecord(Entry.ID).takeError())
+ return std::move(E);
+ continue;
}
}
}
@@ -500,10 +491,15 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer {
SmallVector<Instruction *, 64> InstructionList;
std::vector<std::pair<GlobalVariable *, unsigned>> GlobalInits;
- std::vector<std::pair<GlobalIndirectSymbol *, unsigned>> IndirectSymbolInits;
- std::vector<std::pair<Function *, unsigned>> FunctionPrefixes;
- std::vector<std::pair<Function *, unsigned>> FunctionPrologues;
- std::vector<std::pair<Function *, unsigned>> FunctionPersonalityFns;
+ std::vector<std::pair<GlobalValue *, unsigned>> IndirectSymbolInits;
+
+ struct FunctionOperandInfo {
+ Function *F;
+ unsigned PersonalityFn;
+ unsigned Prefix;
+ unsigned Prologue;
+ };
+ std::vector<FunctionOperandInfo> FunctionOperands;
/// The set of attributes by index. Index zero in the file is for null, and
/// is thus not represented here. As such all indices are off by one.
@@ -933,6 +929,9 @@ static FunctionSummary::FFlags getDecodedFFlags(uint64_t RawFlags) {
Flags.ReturnDoesNotAlias = (RawFlags >> 3) & 0x1;
Flags.NoInline = (RawFlags >> 4) & 0x1;
Flags.AlwaysInline = (RawFlags >> 5) & 0x1;
+ Flags.NoUnwind = (RawFlags >> 6) & 0x1;
+ Flags.MayThrow = (RawFlags >> 7) & 0x1;
+ Flags.HasUnknownCall = (RawFlags >> 8) & 0x1;
return Flags;
}
@@ -1388,6 +1387,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) {
return Attribute::Cold;
case bitc::ATTR_KIND_CONVERGENT:
return Attribute::Convergent;
+ case bitc::ATTR_KIND_DISABLE_SANITIZER_INSTRUMENTATION:
+ return Attribute::DisableSanitizerInstrumentation;
case bitc::ATTR_KIND_ELEMENTTYPE:
return Attribute::ElementType;
case bitc::ATTR_KIND_INACCESSIBLEMEM_ONLY:
@@ -1785,6 +1786,9 @@ Error BitcodeReader::parseTypeTableBody() {
case bitc::TYPE_CODE_OPAQUE_POINTER: { // OPAQUE_POINTER: [addrspace]
if (Record.size() != 1)
return error("Invalid record");
+ if (Context.supportsTypedPointers())
+ return error(
+ "Opaque pointers are only supported in -opaque-pointers mode");
unsigned AddressSpace = Record[0];
ResultTy = PointerType::get(Context, AddressSpace);
break;
@@ -1913,7 +1917,7 @@ Error BitcodeReader::parseTypeTableBody() {
if (Record[0] == 0)
return error("Invalid vector length");
ResultTy = getTypeByID(Record[1]);
- if (!ResultTy || !StructType::isValidElementType(ResultTy))
+ if (!ResultTy || !VectorType::isValidElementType(ResultTy))
return error("Invalid type");
bool Scalable = Record.size() > 2 ? Record[2] : false;
ResultTy = VectorType::get(ResultTy, Record[0], Scalable);
@@ -2240,17 +2244,12 @@ uint64_t BitcodeReader::decodeSignRotatedValue(uint64_t V) {
/// Resolve all of the initializers for global values and aliases that we can.
Error BitcodeReader::resolveGlobalAndIndirectSymbolInits() {
std::vector<std::pair<GlobalVariable *, unsigned>> GlobalInitWorklist;
- std::vector<std::pair<GlobalIndirectSymbol *, unsigned>>
- IndirectSymbolInitWorklist;
- std::vector<std::pair<Function *, unsigned>> FunctionPrefixWorklist;
- std::vector<std::pair<Function *, unsigned>> FunctionPrologueWorklist;
- std::vector<std::pair<Function *, unsigned>> FunctionPersonalityFnWorklist;
+ std::vector<std::pair<GlobalValue *, unsigned>> IndirectSymbolInitWorklist;
+ std::vector<FunctionOperandInfo> FunctionOperandWorklist;
GlobalInitWorklist.swap(GlobalInits);
IndirectSymbolInitWorklist.swap(IndirectSymbolInits);
- FunctionPrefixWorklist.swap(FunctionPrefixes);
- FunctionPrologueWorklist.swap(FunctionPrologues);
- FunctionPersonalityFnWorklist.swap(FunctionPersonalityFns);
+ FunctionOperandWorklist.swap(FunctionOperands);
while (!GlobalInitWorklist.empty()) {
unsigned ValID = GlobalInitWorklist.back().second;
@@ -2274,51 +2273,59 @@ Error BitcodeReader::resolveGlobalAndIndirectSymbolInits() {
Constant *C = dyn_cast_or_null<Constant>(ValueList[ValID]);
if (!C)
return error("Expected a constant");
- GlobalIndirectSymbol *GIS = IndirectSymbolInitWorklist.back().first;
- if (isa<GlobalAlias>(GIS) && C->getType() != GIS->getType())
- return error("Alias and aliasee types don't match");
- GIS->setIndirectSymbol(C);
+ GlobalValue *GV = IndirectSymbolInitWorklist.back().first;
+ if (auto *GA = dyn_cast<GlobalAlias>(GV)) {
+ if (C->getType() != GV->getType())
+ return error("Alias and aliasee types don't match");
+ GA->setAliasee(C);
+ } else if (auto *GI = dyn_cast<GlobalIFunc>(GV)) {
+ Type *ResolverFTy =
+ GlobalIFunc::getResolverFunctionType(GI->getValueType());
+ // Transparently fix up the type for compatibility with older bitcode
+ GI->setResolver(
+ ConstantExpr::getBitCast(C, ResolverFTy->getPointerTo()));
+ } else {
+ return error("Expected an alias or an ifunc");
+ }
}
IndirectSymbolInitWorklist.pop_back();
}
- while (!FunctionPrefixWorklist.empty()) {
- unsigned ValID = FunctionPrefixWorklist.back().second;
- if (ValID >= ValueList.size()) {
- FunctionPrefixes.push_back(FunctionPrefixWorklist.back());
- } else {
- if (Constant *C = dyn_cast_or_null<Constant>(ValueList[ValID]))
- FunctionPrefixWorklist.back().first->setPrefixData(C);
- else
- return error("Expected a constant");
+ while (!FunctionOperandWorklist.empty()) {
+ FunctionOperandInfo &Info = FunctionOperandWorklist.back();
+ if (Info.PersonalityFn) {
+ unsigned ValID = Info.PersonalityFn - 1;
+ if (ValID < ValueList.size()) {
+ if (Constant *C = dyn_cast_or_null<Constant>(ValueList[ValID]))
+ Info.F->setPersonalityFn(C);
+ else
+ return error("Expected a constant");
+ Info.PersonalityFn = 0;
+ }
}
- FunctionPrefixWorklist.pop_back();
- }
-
- while (!FunctionPrologueWorklist.empty()) {
- unsigned ValID = FunctionPrologueWorklist.back().second;
- if (ValID >= ValueList.size()) {
- FunctionPrologues.push_back(FunctionPrologueWorklist.back());
- } else {
- if (Constant *C = dyn_cast_or_null<Constant>(ValueList[ValID]))
- FunctionPrologueWorklist.back().first->setPrologueData(C);
- else
- return error("Expected a constant");
+ if (Info.Prefix) {
+ unsigned ValID = Info.Prefix - 1;
+ if (ValID < ValueList.size()) {
+ if (Constant *C = dyn_cast_or_null<Constant>(ValueList[ValID]))
+ Info.F->setPrefixData(C);
+ else
+ return error("Expected a constant");
+ Info.Prefix = 0;
+ }
}
- FunctionPrologueWorklist.pop_back();
- }
-
- while (!FunctionPersonalityFnWorklist.empty()) {
- unsigned ValID = FunctionPersonalityFnWorklist.back().second;
- if (ValID >= ValueList.size()) {
- FunctionPersonalityFns.push_back(FunctionPersonalityFnWorklist.back());
- } else {
- if (Constant *C = dyn_cast_or_null<Constant>(ValueList[ValID]))
- FunctionPersonalityFnWorklist.back().first->setPersonalityFn(C);
- else
- return error("Expected a constant");
+ if (Info.Prologue) {
+ unsigned ValID = Info.Prologue - 1;
+ if (ValID < ValueList.size()) {
+ if (Constant *C = dyn_cast_or_null<Constant>(ValueList[ValID]))
+ Info.F->setPrologueData(C);
+ else
+ return error("Expected a constant");
+ Info.Prologue = 0;
+ }
}
- FunctionPersonalityFnWorklist.pop_back();
+ if (Info.PersonalityFn || Info.Prefix || Info.Prologue)
+ FunctionOperands.push_back(Info);
+ FunctionOperandWorklist.pop_back();
}
return Error::success();
@@ -2351,6 +2358,15 @@ Error BitcodeReader::parseConstants() {
unsigned CstNo;
};
std::vector<DelayedShufTy> DelayedShuffles;
+ struct DelayedSelTy {
+ Type *OpTy;
+ uint64_t Op0Idx;
+ uint64_t Op1Idx;
+ uint64_t Op2Idx;
+ unsigned CstNo;
+ };
+ std::vector<DelayedSelTy> DelayedSelectors;
+
while (true) {
Expected<BitstreamEntry> MaybeEntry = Stream.advanceSkippingSubblocks();
if (!MaybeEntry)
@@ -2387,6 +2403,27 @@ Error BitcodeReader::parseConstants() {
Value *V = ConstantExpr::getShuffleVector(Op0, Op1, Mask);
ValueList.assignValue(V, CstNo);
}
+ for (auto &DelayedSelector : DelayedSelectors) {
+ Type *OpTy = DelayedSelector.OpTy;
+ Type *SelectorTy = Type::getInt1Ty(Context);
+ uint64_t Op0Idx = DelayedSelector.Op0Idx;
+ uint64_t Op1Idx = DelayedSelector.Op1Idx;
+ uint64_t Op2Idx = DelayedSelector.Op2Idx;
+ uint64_t CstNo = DelayedSelector.CstNo;
+ Constant *Op1 = ValueList.getConstantFwdRef(Op1Idx, OpTy);
+ Constant *Op2 = ValueList.getConstantFwdRef(Op2Idx, OpTy);
+ // The selector might be an i1 or an <n x i1>
+ // Get the type from the ValueList before getting a forward ref.
+ if (VectorType *VTy = dyn_cast<VectorType>(OpTy)) {
+ Value *V = ValueList[Op0Idx];
+ assert(V);
+ if (SelectorTy != V->getType())
+ SelectorTy = VectorType::get(SelectorTy, VTy->getElementCount());
+ }
+ Constant *Op0 = ValueList.getConstantFwdRef(Op0Idx, SelectorTy);
+ Value *V = ConstantExpr::getSelect(Op0, Op1, Op2);
+ ValueList.assignValue(V, CstNo);
+ }
if (NextCstNo != ValueList.size())
return error("Invalid constant reference");
@@ -2683,21 +2720,11 @@ Error BitcodeReader::parseConstants() {
if (Record.size() < 3)
return error("Invalid record");
- Type *SelectorTy = Type::getInt1Ty(Context);
-
- // The selector might be an i1, an <n x i1>, or a <vscale x n x i1>
- // Get the type from the ValueList before getting a forward ref.
- if (VectorType *VTy = dyn_cast<VectorType>(CurTy))
- if (Value *V = ValueList[Record[0]])
- if (SelectorTy != V->getType())
- SelectorTy = VectorType::get(SelectorTy,
- VTy->getElementCount());
-
- V = ConstantExpr::getSelect(ValueList.getConstantFwdRef(Record[0],
- SelectorTy),
- ValueList.getConstantFwdRef(Record[1],CurTy),
- ValueList.getConstantFwdRef(Record[2],CurTy));
- break;
+ DelayedSelectors.push_back(
+ {CurTy, Record[0], Record[1], Record[2], NextCstNo});
+ (void)ValueList.getConstantFwdRef(NextCstNo, CurTy);
+ ++NextCstNo;
+ continue;
}
case bitc::CST_CODE_CE_EXTRACTELT
: { // CE_EXTRACTELT: [opty, opval, opty, opval]
@@ -3091,8 +3118,7 @@ Error BitcodeReader::globalCleanup() {
// Force deallocation of memory for these vectors to favor the client that
// wants lazy deserialization.
std::vector<std::pair<GlobalVariable *, unsigned>>().swap(GlobalInits);
- std::vector<std::pair<GlobalIndirectSymbol *, unsigned>>().swap(
- IndirectSymbolInits);
+ std::vector<std::pair<GlobalValue *, unsigned>>().swap(IndirectSymbolInits);
return Error::success();
}
@@ -3270,7 +3296,7 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) {
}
if (Record.size() > 12) {
- auto AS = getAttributes(Record[12]).getFnAttributes();
+ auto AS = getAttributes(Record[12]).getFnAttrs();
NewGV->setAttributes(AS);
}
@@ -3383,8 +3409,10 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef<uint64_t> Record) {
if (Record.size() > 9)
UnnamedAddr = getDecodedUnnamedAddrType(Record[9]);
Func->setUnnamedAddr(UnnamedAddr);
- if (Record.size() > 10 && Record[10] != 0)
- FunctionPrologues.push_back(std::make_pair(Func, Record[10] - 1));
+
+ FunctionOperandInfo OperandInfo = {Func, 0, 0, 0};
+ if (Record.size() > 10)
+ OperandInfo.Prologue = Record[10];
if (Record.size() > 11)
Func->setDLLStorageClass(getDecodedDLLStorageClass(Record[11]));
@@ -3401,11 +3429,11 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef<uint64_t> Record) {
Func->setComdat(reinterpret_cast<Comdat *>(1));
}
- if (Record.size() > 13 && Record[13] != 0)
- FunctionPrefixes.push_back(std::make_pair(Func, Record[13] - 1));
+ if (Record.size() > 13)
+ OperandInfo.Prefix = Record[13];
- if (Record.size() > 14 && Record[14] != 0)
- FunctionPersonalityFns.push_back(std::make_pair(Func, Record[14] - 1));
+ if (Record.size() > 14)
+ OperandInfo.PersonalityFn = Record[14];
if (Record.size() > 15) {
Func->setDSOLocal(getDecodedDSOLocal(Record[15]));
@@ -3423,6 +3451,9 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef<uint64_t> Record) {
ValueList.push_back(Func);
+ if (OperandInfo.PersonalityFn || OperandInfo.Prefix || OperandInfo.Prologue)
+ FunctionOperands.push_back(OperandInfo);
+
// If this is a function with a body, remember the prototype we are
// creating now, so that we can match up the body with them later.
if (!isProto) {
@@ -3467,7 +3498,7 @@ Error BitcodeReader::parseGlobalIndirectSymbolRecord(
auto Val = Record[OpNum++];
auto Linkage = Record[OpNum++];
- GlobalIndirectSymbol *NewGA;
+ GlobalValue *NewGA;
if (BitCode == bitc::MODULE_CODE_ALIAS ||
BitCode == bitc::MODULE_CODE_ALIAS_OLD)
NewGA = GlobalAlias::create(Ty, AddrSpace, getDecodedLinkage(Linkage), Name,
@@ -4898,8 +4929,10 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
Type *OpTy = getTypeByID(Record[1]);
Value *Size = getFnValueByID(Record[2], OpTy);
MaybeAlign Align;
- if (Error Err =
- parseAlignmentValue(Bitfield::get<APV::Align>(Rec), Align)) {
+ uint64_t AlignExp =
+ Bitfield::get<APV::AlignLower>(Rec) |
+ (Bitfield::get<APV::AlignUpper>(Rec) << APV::AlignLower::Bits);
+ if (Error Err = parseAlignmentValue(AlignExp, Align)) {
return Err;
}
if (!Ty || !Size)
@@ -5505,21 +5538,16 @@ Error BitcodeReader::materialize(GlobalValue *GV) {
// Upgrade any old intrinsic calls in the function.
for (auto &I : UpgradedIntrinsics) {
- for (auto UI = I.first->materialized_user_begin(), UE = I.first->user_end();
- UI != UE;) {
- User *U = *UI;
- ++UI;
+ for (User *U : llvm::make_early_inc_range(I.first->materialized_users()))
if (CallInst *CI = dyn_cast<CallInst>(U))
UpgradeIntrinsicCall(CI, I.second);
- }
}
// Update calls to the remangled intrinsics
for (auto &I : RemangledIntrinsics)
- for (auto UI = I.first->materialized_user_begin(), UE = I.first->user_end();
- UI != UE;)
+ for (User *U : llvm::make_early_inc_range(I.first->materialized_users()))
// Don't expect any other users than call sites
- cast<CallBase>(*UI++)->setCalledFunction(I.second);
+ cast<CallBase>(U)->setCalledFunction(I.second);
// Finish fn->subprogram upgrade for materialized functions.
if (DISubprogram *SP = MDLoader->lookupSubprogramForFunction(F))
@@ -5567,9 +5595,8 @@ Error BitcodeReader::materialize(GlobalValue *GV) {
// Remove incompatible attributes on function calls.
if (auto *CI = dyn_cast<CallBase>(&I)) {
- CI->removeAttributes(AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(
- CI->getFunctionType()->getReturnType()));
+ CI->removeRetAttrs(AttributeFuncs::typeIncompatible(
+ CI->getFunctionType()->getReturnType()));
for (unsigned ArgNo = 0; ArgNo < CI->arg_size(); ++ArgNo)
CI->removeParamAttrs(ArgNo, AttributeFuncs::typeIncompatible(
@@ -6742,10 +6769,9 @@ llvm::getBitcodeFileContents(MemoryBufferRef Buffer) {
continue;
}
case BitstreamEntry::Record:
- if (Expected<unsigned> StreamFailed = Stream.skipRecord(Entry.ID))
- continue;
- else
- return StreamFailed.takeError();
+ if (Error E = Stream.skipRecord(Entry.ID).takeError())
+ return std::move(E);
+ continue;
}
}
}
@@ -6768,12 +6794,9 @@ BitcodeModule::getModuleImpl(LLVMContext &Context, bool MaterializeAll,
if (IdentificationBit != -1ull) {
if (Error JumpFailed = Stream.JumpToBit(IdentificationBit))
return std::move(JumpFailed);
- Expected<std::string> ProducerIdentificationOrErr =
- readIdentificationBlock(Stream);
- if (!ProducerIdentificationOrErr)
- return ProducerIdentificationOrErr.takeError();
-
- ProducerIdentification = *ProducerIdentificationOrErr;
+ if (Error E =
+ readIdentificationBlock(Stream).moveInto(ProducerIdentification))
+ return std::move(E);
}
if (Error JumpFailed = Stream.JumpToBit(ModuleBit))
@@ -6847,10 +6870,9 @@ static Expected<bool> getEnableSplitLTOUnitFlag(BitstreamCursor &Stream,
SmallVector<uint64_t, 64> Record;
while (true) {
- Expected<BitstreamEntry> MaybeEntry = Stream.advanceSkippingSubblocks();
- if (!MaybeEntry)
- return MaybeEntry.takeError();
- BitstreamEntry Entry = MaybeEntry.get();
+ BitstreamEntry Entry;
+ if (Error E = Stream.advanceSkippingSubblocks().moveInto(Entry))
+ return std::move(E);
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
@@ -6895,10 +6917,9 @@ Expected<BitcodeLTOInfo> BitcodeModule::getLTOInfo() {
return std::move(Err);
while (true) {
- Expected<llvm::BitstreamEntry> MaybeEntry = Stream.advance();
- if (!MaybeEntry)
- return MaybeEntry.takeError();
- llvm::BitstreamEntry Entry = MaybeEntry.get();
+ llvm::BitstreamEntry Entry;
+ if (Error E = Stream.advance().moveInto(Entry))
+ return std::move(E);
switch (Entry.Kind) {
case BitstreamEntry::Error:
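
getDecodedFFlags above starts reading three new bits for NoUnwind, MayThrow, and HasUnknownCall, and the matching shifts appear in getEncodedFFlags further down in BitcodeWriter.cpp. A small round-trip sketch of that packing, using a plain struct rather than the real FunctionSummary::FFlags; the bit positions (6-8) are the only detail taken from the patch.

// fflags_sketch.cpp -- illustrative only
#include <cassert>
#include <cstdint>

// Plain stand-in for FunctionSummary::FFlags, reduced to the three new flags.
struct Flags {
  unsigned NoUnwind : 1;
  unsigned MayThrow : 1;
  unsigned HasUnknownCall : 1;
};

static uint64_t encode(Flags F) {
  uint64_t Raw = 0;
  Raw |= uint64_t(F.NoUnwind) << 6;       // same shifts as getEncodedFFlags
  Raw |= uint64_t(F.MayThrow) << 7;
  Raw |= uint64_t(F.HasUnknownCall) << 8;
  return Raw;
}

static Flags decode(uint64_t Raw) {
  Flags F{};                              // same shifts as getDecodedFFlags
  F.NoUnwind = (Raw >> 6) & 0x1;
  F.MayThrow = (Raw >> 7) & 0x1;
  F.HasUnknownCall = (Raw >> 8) & 0x1;
  return F;
}

int main() {
  Flags In{1, 0, 1};
  Flags Out = decode(encode(In));
  assert(Out.NoUnwind == 1 && Out.MayThrow == 0 && Out.HasUnknownCall == 1);
  return 0;
}
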
diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
index 8493eb7a28b2..6df5a4a64d51 100644
--- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -21,8 +21,8 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/Bitstream/BitstreamReader.h"
#include "llvm/Bitcode/LLVMBitCodes.h"
+#include "llvm/Bitstream/BitstreamReader.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/AutoUpgrade.h"
@@ -40,7 +40,6 @@
#include "llvm/IR/GVMaterializer.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalIFunc.h"
-#include "llvm/IR/GlobalIndirectSymbol.h"
#include "llvm/IR/GlobalObject.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
@@ -363,7 +362,8 @@ class PlaceholderQueue {
public:
~PlaceholderQueue() {
- assert(empty() && "PlaceholderQueue hasn't been flushed before being destroyed");
+ assert(empty() &&
+ "PlaceholderQueue hasn't been flushed before being destroyed");
}
bool empty() const { return PHs.empty(); }
DistinctMDOperandPlaceholder &getPlaceholderOp(unsigned ID);
@@ -546,7 +546,7 @@ class MetadataLoader::MetadataLoaderImpl {
if (auto *DDI = dyn_cast<DbgDeclareInst>(&I))
if (auto *DIExpr = DDI->getExpression())
if (DIExpr->startsWithDeref() &&
- dyn_cast_or_null<Argument>(DDI->getAddress())) {
+ isa_and_nonnull<Argument>(DDI->getAddress())) {
SmallVector<uint64_t, 8> Ops;
Ops.append(std::next(DIExpr->elements_begin()),
DIExpr->elements_end());
@@ -604,7 +604,7 @@ class MetadataLoader::MetadataLoaderImpl {
// If the expression is malformed, make sure we don't
// copy more elements than we should.
HistoricSize = std::min(SubExpr.size(), HistoricSize);
- ArrayRef<uint64_t> Args = SubExpr.slice(1, HistoricSize-1);
+ ArrayRef<uint64_t> Args = SubExpr.slice(1, HistoricSize - 1);
switch (SubExpr.front()) {
case dwarf::DW_OP_plus:
@@ -698,11 +698,12 @@ MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock() {
// Get the abbrevs, and preload record positions to make them lazy-loadable.
while (true) {
uint64_t SavedPos = IndexCursor.GetCurrentBitNo();
- Expected<BitstreamEntry> MaybeEntry = IndexCursor.advanceSkippingSubblocks(
- BitstreamCursor::AF_DontPopBlockAtEnd);
- if (!MaybeEntry)
- return MaybeEntry.takeError();
- BitstreamEntry Entry = MaybeEntry.get();
+ BitstreamEntry Entry;
+ if (Error E =
+ IndexCursor
+ .advanceSkippingSubblocks(BitstreamCursor::AF_DontPopBlockAtEnd)
+ .moveInto(Entry))
+ return std::move(E);
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
@@ -715,10 +716,9 @@ MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock() {
// The interesting case.
++NumMDRecordLoaded;
uint64_t CurrentPos = IndexCursor.GetCurrentBitNo();
- Expected<unsigned> MaybeCode = IndexCursor.skipRecord(Entry.ID);
- if (!MaybeCode)
- return MaybeCode.takeError();
- unsigned Code = MaybeCode.get();
+ unsigned Code;
+ if (Error E = IndexCursor.skipRecord(Entry.ID).moveInto(Code))
+ return std::move(E);
switch (Code) {
case bitc::METADATA_STRINGS: {
// Rewind and parse the strings.
@@ -905,11 +905,12 @@ Expected<bool> MetadataLoader::MetadataLoaderImpl::loadGlobalDeclAttachments() {
if (Error Err = TempCursor.JumpToBit(GlobalDeclAttachmentPos))
return std::move(Err);
while (true) {
- Expected<BitstreamEntry> MaybeEntry = TempCursor.advanceSkippingSubblocks(
- BitstreamCursor::AF_DontPopBlockAtEnd);
- if (!MaybeEntry)
- return MaybeEntry.takeError();
- BitstreamEntry Entry = MaybeEntry.get();
+ BitstreamEntry Entry;
+ if (Error E =
+ TempCursor
+ .advanceSkippingSubblocks(BitstreamCursor::AF_DontPopBlockAtEnd)
+ .moveInto(Entry))
+ return std::move(E);
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
@@ -1025,10 +1026,9 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) {
// Read all the records.
while (true) {
- Expected<BitstreamEntry> MaybeEntry = Stream.advanceSkippingSubblocks();
- if (!MaybeEntry)
- return MaybeEntry.takeError();
- BitstreamEntry Entry = MaybeEntry.get();
+ BitstreamEntry Entry;
+ if (Error E = Stream.advanceSkippingSubblocks().moveInto(Entry))
+ return E;
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
@@ -1081,22 +1081,22 @@ void MetadataLoader::MetadataLoaderImpl::lazyLoadOneMetadata(
if (Error Err = IndexCursor.JumpToBit(
GlobalMetadataBitPosIndex[ID - MDStringRef.size()]))
report_fatal_error("lazyLoadOneMetadata failed jumping: " +
- toString(std::move(Err)));
- Expected<BitstreamEntry> MaybeEntry = IndexCursor.advanceSkippingSubblocks();
- if (!MaybeEntry)
+ Twine(toString(std::move(Err))));
+ BitstreamEntry Entry;
+ if (Error E = IndexCursor.advanceSkippingSubblocks().moveInto(Entry))
// FIXME this drops the error on the floor.
report_fatal_error("lazyLoadOneMetadata failed advanceSkippingSubblocks: " +
- toString(MaybeEntry.takeError()));
- BitstreamEntry Entry = MaybeEntry.get();
+ Twine(toString(std::move(E))));
++NumMDRecordLoaded;
if (Expected<unsigned> MaybeCode =
IndexCursor.readRecord(Entry.ID, Record, &Blob)) {
if (Error Err =
parseOneMetadata(Record, MaybeCode.get(), Placeholders, Blob, ID))
report_fatal_error("Can't lazyload MD, parseOneMetadata: " +
- toString(std::move(Err)));
+ Twine(toString(std::move(Err))));
} else
- report_fatal_error("Can't lazyload MD: " + toString(MaybeCode.takeError()));
+ report_fatal_error("Can't lazyload MD: " +
+ Twine(toString(MaybeCode.takeError())));
}
/// Ensure that all forward-references and placeholders are resolved.
@@ -1193,10 +1193,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
// Read name of the named metadata.
SmallString<8> Name(Record.begin(), Record.end());
Record.clear();
- Expected<unsigned> MaybeCode = Stream.ReadCode();
- if (!MaybeCode)
- return MaybeCode.takeError();
- Code = MaybeCode.get();
+ if (Error E = Stream.ReadCode().moveInto(Code))
+ return E;
++NumMDRecordLoaded;
if (Expected<unsigned> MaybeNextBitCode = Stream.readRecord(Code, Record)) {
@@ -1411,8 +1409,9 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
return error("Invalid record");
IsDistinct = Record[0];
- DINode::DIFlags Flags = (Record.size() > 6) ?
- static_cast<DINode::DIFlags>(Record[6]) : DINode::FlagZero;
+ DINode::DIFlags Flags = (Record.size() > 6)
+ ? static_cast<DINode::DIFlags>(Record[6])
+ : DINode::FlagZero;
MetadataList.assignValue(
GET_OR_DISTINCT(DIBasicType,
@@ -1437,7 +1436,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_DERIVED_TYPE: {
- if (Record.size() < 12 || Record.size() > 13)
+ if (Record.size() < 12 || Record.size() > 14)
return error("Invalid record");
// DWARF address space is encoded as N->getDWARFAddressSpace() + 1. 0 means
@@ -1446,6 +1445,10 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
if (Record.size() > 12 && Record[12])
DWARFAddressSpace = Record[12] - 1;
+ Metadata *Annotations = nullptr;
+ if (Record.size() > 13 && Record[13])
+ Annotations = getMDOrNull(Record[13]);
+
IsDistinct = Record[0];
DINode::DIFlags Flags = static_cast<DINode::DIFlags>(Record[10]);
MetadataList.assignValue(
@@ -1455,13 +1458,13 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
getDITypeRefOrNull(Record[5]),
getDITypeRefOrNull(Record[6]), Record[7], Record[8],
Record[9], DWARFAddressSpace, Flags,
- getDITypeRefOrNull(Record[11]))),
+ getDITypeRefOrNull(Record[11]), Annotations)),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_COMPOSITE_TYPE: {
- if (Record.size() < 16 || Record.size() > 21)
+ if (Record.size() < 16 || Record.size() > 22)
return error("Invalid record");
// If we have a UUID and this is not a forward declaration, lookup the
@@ -1489,6 +1492,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
Metadata *Associated = nullptr;
Metadata *Allocated = nullptr;
Metadata *Rank = nullptr;
+ Metadata *Annotations = nullptr;
auto *Identifier = getMDString(Record[15]);
// If this module is being parsed so that it can be ThinLTO imported
// into another module, composite types only need to be imported
@@ -1520,6 +1524,9 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
if (Record.size() > 20) {
Rank = getMDOrNull(Record[20]);
}
+ if (Record.size() > 21) {
+ Annotations = getMDOrNull(Record[21]);
+ }
}
DICompositeType *CT = nullptr;
if (Identifier)
@@ -1527,7 +1534,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
Context, *Identifier, Tag, Name, File, Line, Scope, BaseType,
SizeInBits, AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
VTableHolder, TemplateParams, Discriminator, DataLocation, Associated,
- Allocated, Rank);
+ Allocated, Rank, Annotations);
// Create a node if we didn't get a lazy ODR type.
if (!CT)
@@ -1536,7 +1543,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
SizeInBits, AlignInBits, OffsetInBits, Flags,
Elements, RuntimeLang, VTableHolder, TemplateParams,
Identifier, Discriminator, DataLocation, Associated,
- Allocated, Rank));
+ Allocated, Rank, Annotations));
if (!IsNotUsedInTypeRef && Identifier)
MetadataList.addTypeRef(*Identifier, *cast<DICompositeType>(CT));
@@ -1665,9 +1672,9 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
SPFlags |= DISubprogram::SPFlagMainSubprogram;
else if (!HasSPFlags)
SPFlags = DISubprogram::toSPFlags(
- /*IsLocalToUnit=*/Record[7], /*IsDefinition=*/Record[8],
- /*IsOptimized=*/Record[14], /*Virtuality=*/Record[11],
- /*DIFlagMainSubprogram*/HasOldMainSubprogramFlag);
+ /*IsLocalToUnit=*/Record[7], /*IsDefinition=*/Record[8],
+ /*IsOptimized=*/Record[14], /*Virtuality=*/Record[11],
+ /*DIFlagMainSubprogram=*/HasOldMainSubprogramFlag);
// All definitions should be distinct.
IsDistinct = (Record[0] & 1) || (SPFlags & DISubprogram::SPFlagDefinition);
@@ -1685,6 +1692,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
bool HasFn = false;
bool HasThisAdj = true;
bool HasThrownTypes = true;
+ bool HasAnnotations = false;
unsigned OffsetA = 0;
unsigned OffsetB = 0;
if (!HasSPFlags) {
@@ -1696,29 +1704,33 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
}
HasThisAdj = Record.size() >= 20;
HasThrownTypes = Record.size() >= 21;
+ } else {
+ HasAnnotations = Record.size() >= 19;
}
Metadata *CUorFn = getMDOrNull(Record[12 + OffsetB]);
DISubprogram *SP = GET_OR_DISTINCT(
DISubprogram,
(Context,
- getDITypeRefOrNull(Record[1]), // scope
- getMDString(Record[2]), // name
- getMDString(Record[3]), // linkageName
- getMDOrNull(Record[4]), // file
- Record[5], // line
- getMDOrNull(Record[6]), // type
- Record[7 + OffsetA], // scopeLine
- getDITypeRefOrNull(Record[8 + OffsetA]), // containingType
- Record[10 + OffsetA], // virtualIndex
- HasThisAdj ? Record[16 + OffsetB] : 0, // thisAdjustment
- Flags, // flags
- SPFlags, // SPFlags
- HasUnit ? CUorFn : nullptr, // unit
- getMDOrNull(Record[13 + OffsetB]), // templateParams
- getMDOrNull(Record[14 + OffsetB]), // declaration
- getMDOrNull(Record[15 + OffsetB]), // retainedNodes
+ getDITypeRefOrNull(Record[1]), // scope
+ getMDString(Record[2]), // name
+ getMDString(Record[3]), // linkageName
+ getMDOrNull(Record[4]), // file
+ Record[5], // line
+ getMDOrNull(Record[6]), // type
+ Record[7 + OffsetA], // scopeLine
+ getDITypeRefOrNull(Record[8 + OffsetA]), // containingType
+ Record[10 + OffsetA], // virtualIndex
+ HasThisAdj ? Record[16 + OffsetB] : 0, // thisAdjustment
+ Flags, // flags
+ SPFlags, // SPFlags
+ HasUnit ? CUorFn : nullptr, // unit
+ getMDOrNull(Record[13 + OffsetB]), // templateParams
+ getMDOrNull(Record[14 + OffsetB]), // declaration
+ getMDOrNull(Record[15 + OffsetB]), // retainedNodes
HasThrownTypes ? getMDOrNull(Record[17 + OffsetB])
- : nullptr // thrownTypes
+ : nullptr, // thrownTypes
+ HasAnnotations ? getMDOrNull(Record[18 + OffsetB])
+ : nullptr // annotations
));
MetadataList.assignValue(SP, NextMetadataNo);
NextMetadataNo++;
@@ -1860,13 +1872,18 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
unsigned Version = Record[0] >> 1;
if (Version == 2) {
+ Metadata *Annotations = nullptr;
+ if (Record.size() > 12)
+ Annotations = getMDOrNull(Record[12]);
+
MetadataList.assignValue(
- GET_OR_DISTINCT(
- DIGlobalVariable,
- (Context, getMDOrNull(Record[1]), getMDString(Record[2]),
- getMDString(Record[3]), getMDOrNull(Record[4]), Record[5],
- getDITypeRefOrNull(Record[6]), Record[7], Record[8],
- getMDOrNull(Record[9]), getMDOrNull(Record[10]), Record[11])),
+ GET_OR_DISTINCT(DIGlobalVariable,
+ (Context, getMDOrNull(Record[1]),
+ getMDString(Record[2]), getMDString(Record[3]),
+ getMDOrNull(Record[4]), Record[5],
+ getDITypeRefOrNull(Record[6]), Record[7], Record[8],
+ getMDOrNull(Record[9]), getMDOrNull(Record[10]),
+ Record[11], Annotations)),
NextMetadataNo);
NextMetadataNo++;
@@ -1874,12 +1891,12 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
// No upgrade necessary. A null field will be introduced to indicate
// that no parameter information is available.
MetadataList.assignValue(
- GET_OR_DISTINCT(DIGlobalVariable,
- (Context, getMDOrNull(Record[1]),
- getMDString(Record[2]), getMDString(Record[3]),
- getMDOrNull(Record[4]), Record[5],
- getDITypeRefOrNull(Record[6]), Record[7], Record[8],
- getMDOrNull(Record[10]), nullptr, Record[11])),
+ GET_OR_DISTINCT(
+ DIGlobalVariable,
+ (Context, getMDOrNull(Record[1]), getMDString(Record[2]),
+ getMDString(Record[3]), getMDOrNull(Record[4]), Record[5],
+ getDITypeRefOrNull(Record[6]), Record[7], Record[8],
+ getMDOrNull(Record[10]), nullptr, Record[11], nullptr)),
NextMetadataNo);
NextMetadataNo++;
@@ -1912,7 +1929,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
(Context, getMDOrNull(Record[1]), getMDString(Record[2]),
getMDString(Record[3]), getMDOrNull(Record[4]), Record[5],
getDITypeRefOrNull(Record[6]), Record[7], Record[8],
- getMDOrNull(Record[10]), nullptr, AlignInBits));
+ getMDOrNull(Record[10]), nullptr, AlignInBits, nullptr));
DIGlobalVariableExpression *DGVE = nullptr;
if (Attach || Expr)
@@ -1942,18 +1959,22 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
bool HasTag = !HasAlignment && Record.size() > 8;
DINode::DIFlags Flags = static_cast<DINode::DIFlags>(Record[7 + HasTag]);
uint32_t AlignInBits = 0;
+ Metadata *Annotations = nullptr;
if (HasAlignment) {
- if (Record[8 + HasTag] > (uint64_t)std::numeric_limits<uint32_t>::max())
+ if (Record[8] > (uint64_t)std::numeric_limits<uint32_t>::max())
return error("Alignment value is too large");
- AlignInBits = Record[8 + HasTag];
+ AlignInBits = Record[8];
+ if (Record.size() > 9)
+ Annotations = getMDOrNull(Record[9]);
}
+
MetadataList.assignValue(
GET_OR_DISTINCT(DILocalVariable,
(Context, getMDOrNull(Record[1 + HasTag]),
getMDString(Record[2 + HasTag]),
getMDOrNull(Record[3 + HasTag]), Record[4 + HasTag],
getDITypeRefOrNull(Record[5 + HasTag]),
- Record[6 + HasTag], Flags, AlignInBits)),
+ Record[6 + HasTag], Flags, AlignInBits, Annotations)),
NextMetadataNo);
NextMetadataNo++;
break;
@@ -1964,10 +1985,9 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
IsDistinct = Record[0] & 1;
MetadataList.assignValue(
- GET_OR_DISTINCT(DILabel,
- (Context, getMDOrNull(Record[1]),
- getMDString(Record[2]),
- getMDOrNull(Record[3]), Record[4])),
+ GET_OR_DISTINCT(DILabel, (Context, getMDOrNull(Record[1]),
+ getMDString(Record[2]),
+ getMDOrNull(Record[3]), Record[4])),
NextMetadataNo);
NextMetadataNo++;
break;
@@ -1984,8 +2004,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
if (Error Err = upgradeDIExpression(Version, Elts, Buffer))
return Err;
- MetadataList.assignValue(
- GET_OR_DISTINCT(DIExpression, (Context, Elts)), NextMetadataNo);
+ MetadataList.assignValue(GET_OR_DISTINCT(DIExpression, (Context, Elts)),
+ NextMetadataNo);
NextMetadataNo++;
break;
}
@@ -2020,17 +2040,19 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_IMPORTED_ENTITY: {
- if (Record.size() != 6 && Record.size() != 7)
+ if (Record.size() < 6 || Record.size() > 8)
return error("Invalid record");
IsDistinct = Record[0];
- bool HasFile = (Record.size() == 7);
+ bool HasFile = (Record.size() >= 7);
+ bool HasElements = (Record.size() >= 8);
MetadataList.assignValue(
GET_OR_DISTINCT(DIImportedEntity,
(Context, Record[1], getMDOrNull(Record[2]),
getDITypeRefOrNull(Record[3]),
HasFile ? getMDOrNull(Record[6]) : nullptr,
- HasFile ? Record[4] : 0, getMDString(Record[5]))),
+ HasFile ? Record[4] : 0, getMDString(Record[5]),
+ HasElements ? getMDOrNull(Record[7]) : nullptr)),
NextMetadataNo);
NextMetadataNo++;
break;
@@ -2121,10 +2143,9 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataStrings(
if (R.AtEndOfStream())
return error("Invalid record: metadata strings bad length");
- Expected<uint32_t> MaybeSize = R.ReadVBR(6);
- if (!MaybeSize)
- return MaybeSize.takeError();
- uint32_t Size = MaybeSize.get();
+ uint32_t Size;
+ if (Error E = R.ReadVBR(6).moveInto(Size))
+ return E;
if (Strings.size() < Size)
return error("Invalid record: metadata strings truncated chars");
@@ -2161,10 +2182,9 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment(
PlaceholderQueue Placeholders;
while (true) {
- Expected<BitstreamEntry> MaybeEntry = Stream.advanceSkippingSubblocks();
- if (!MaybeEntry)
- return MaybeEntry.takeError();
- BitstreamEntry Entry = MaybeEntry.get();
+ BitstreamEntry Entry;
+ if (Error E = Stream.advanceSkippingSubblocks().moveInto(Entry))
+ return E;
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
@@ -2265,10 +2285,9 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataKinds() {
// Read all the records.
while (true) {
- Expected<BitstreamEntry> MaybeEntry = Stream.advanceSkippingSubblocks();
- if (!MaybeEntry)
- return MaybeEntry.takeError();
- BitstreamEntry Entry = MaybeEntry.get();
+ BitstreamEntry Entry;
+ if (Error E = Stream.advanceSkippingSubblocks().moveInto(Entry))
+ return E;
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
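
The MetadataLoader hunks above all use the same backwards-compatibility idiom for the new annotations and elements operands: the field is appended at the end of the record and only read when Record.size() shows the producer emitted it, so older bitcode keeps parsing. A reduced sketch of the idiom with a hypothetical two-field record, not the real METADATA_* layouts.

// record_sketch.cpp -- illustrative only
#include <cstdint>
#include <cstdio>
#include <vector>

struct Node {
  uint64_t Line = 0;
  uint64_t Annotations = 0; // 0 means "absent", mirroring getMDOrNull(0)
};

static Node parseRecord(const std::vector<uint64_t> &Record) {
  Node N;
  N.Line = Record[0];
  // Trailing fields are optional: gate on the record size so records written
  // before the field existed still decode unchanged.
  if (Record.size() > 1 && Record[1])
    N.Annotations = Record[1];
  return N;
}

int main() {
  Node Old = parseRecord({7});     // old-style record, no annotations operand
  Node New = parseRecord({7, 42}); // new-style record with annotations
  std::printf("old=%llu new=%llu\n", (unsigned long long)Old.Annotations,
              (unsigned long long)New.Annotations);
  return 0;
}
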
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 0a202c376981..1e9a9197aed7 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -58,6 +58,7 @@
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/MC/StringTableBuilder.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Object/IRSymtab.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
@@ -67,7 +68,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SHA1.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
@@ -142,7 +142,6 @@ public:
: Stream(Stream), StrtabBuilder(StrtabBuilder) {}
protected:
- void writeBitcodeHeader();
void writeModuleVersion();
};
@@ -374,7 +373,6 @@ private:
void writeModuleMetadata();
void writeFunctionMetadata(const Function &F);
void writeFunctionMetadataAttachment(const Function &F);
- void writeGlobalVariableMetadataAttachment(const GlobalVariable &GV);
void pushGlobalMetadataAttachment(SmallVectorImpl<uint64_t> &Record,
const GlobalObject &GO);
void writeModuleMetadataKinds();
@@ -628,6 +626,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
return bitc::ATTR_KIND_IN_ALLOCA;
case Attribute::Cold:
return bitc::ATTR_KIND_COLD;
+ case Attribute::DisableSanitizerInstrumentation:
+ return bitc::ATTR_KIND_DISABLE_SANITIZER_INSTRUMENTATION;
case Attribute::Hot:
return bitc::ATTR_KIND_HOT;
case Attribute::ElementType:
@@ -835,7 +835,7 @@ void ModuleBitcodeWriter::writeAttributeTable() {
SmallVector<uint64_t, 64> Record;
for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
AttributeList AL = Attrs[i];
- for (unsigned i = AL.index_begin(), e = AL.index_end(); i != e; ++i) {
+ for (unsigned i : AL.indexes()) {
AttributeSet AS = AL.getAttributes(i);
if (AS.hasAttributes())
Record.push_back(VE.getAttributeGroupID({i, AS}));
@@ -973,9 +973,8 @@ void ModuleBitcodeWriter::writeTypeTable() {
// STRUCT: [ispacked, eltty x N]
TypeVals.push_back(ST->isPacked());
// Output all of the element types.
- for (StructType::element_iterator I = ST->element_begin(),
- E = ST->element_end(); I != E; ++I)
- TypeVals.push_back(VE.getTypeID(*I));
+ for (Type *ET : ST->elements())
+ TypeVals.push_back(VE.getTypeID(ET));
if (ST->isLiteral()) {
Code = bitc::TYPE_CODE_STRUCT_ANON;
@@ -1066,6 +1065,9 @@ static uint64_t getEncodedFFlags(FunctionSummary::FFlags Flags) {
RawFlags |= (Flags.ReturnDoesNotAlias << 3);
RawFlags |= (Flags.NoInline << 4);
RawFlags |= (Flags.AlwaysInline << 5);
+ RawFlags |= (Flags.NoUnwind << 6);
+ RawFlags |= (Flags.MayThrow << 7);
+ RawFlags |= (Flags.HasUnknownCall << 8);
return RawFlags;
}
@@ -1687,6 +1689,8 @@ void ModuleBitcodeWriter::writeDIDerivedType(const DIDerivedType *N,
else
Record.push_back(0);
+ Record.push_back(VE.getMetadataOrNullID(N->getAnnotations().get()));
+
Stream.EmitRecord(bitc::METADATA_DERIVED_TYPE, Record, Abbrev);
Record.clear();
}
@@ -1716,6 +1720,7 @@ void ModuleBitcodeWriter::writeDICompositeType(
Record.push_back(VE.getMetadataOrNullID(N->getRawAssociated()));
Record.push_back(VE.getMetadataOrNullID(N->getRawAllocated()));
Record.push_back(VE.getMetadataOrNullID(N->getRawRank()));
+ Record.push_back(VE.getMetadataOrNullID(N->getAnnotations().get()));
Stream.EmitRecord(bitc::METADATA_COMPOSITE_TYPE, Record, Abbrev);
Record.clear();
@@ -1811,6 +1816,7 @@ void ModuleBitcodeWriter::writeDISubprogram(const DISubprogram *N,
Record.push_back(VE.getMetadataOrNullID(N->getRetainedNodes().get()));
Record.push_back(N->getThisAdjustment());
Record.push_back(VE.getMetadataOrNullID(N->getThrownTypes().get()));
+ Record.push_back(VE.getMetadataOrNullID(N->getAnnotations().get()));
Stream.EmitRecord(bitc::METADATA_SUBPROGRAM, Record, Abbrev);
Record.clear();
@@ -1958,6 +1964,7 @@ void ModuleBitcodeWriter::writeDIGlobalVariable(
Record.push_back(VE.getMetadataOrNullID(N->getStaticDataMemberDeclaration()));
Record.push_back(VE.getMetadataOrNullID(N->getTemplateParams()));
Record.push_back(N->getAlignInBits());
+ Record.push_back(VE.getMetadataOrNullID(N->getAnnotations().get()));
Stream.EmitRecord(bitc::METADATA_GLOBAL_VAR, Record, Abbrev);
Record.clear();
@@ -1989,6 +1996,7 @@ void ModuleBitcodeWriter::writeDILocalVariable(
Record.push_back(N->getArg());
Record.push_back(N->getFlags());
Record.push_back(N->getAlignInBits());
+ Record.push_back(VE.getMetadataOrNullID(N->getAnnotations().get()));
Stream.EmitRecord(bitc::METADATA_LOCAL_VAR, Record, Abbrev);
Record.clear();
@@ -2056,6 +2064,7 @@ void ModuleBitcodeWriter::writeDIImportedEntity(
Record.push_back(N->getLine());
Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
Record.push_back(VE.getMetadataOrNullID(N->getRawFile()));
+ Record.push_back(VE.getMetadataOrNullID(N->getElements().get()));
Stream.EmitRecord(bitc::METADATA_IMPORTED_ENTITY, Record, Abbrev);
Record.clear();
@@ -2907,8 +2916,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
// Emit type/value pairs for varargs params.
if (FTy->isVarArg()) {
- for (unsigned i = FTy->getNumParams(), e = II->getNumArgOperands();
- i != e; ++i)
+ for (unsigned i = FTy->getNumParams(), e = II->arg_size(); i != e; ++i)
pushValueAndType(I.getOperand(i), InstID, Vals); // vararg
}
break;
@@ -2989,8 +2997,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
// Emit type/value pairs for varargs params.
if (FTy->isVarArg()) {
- for (unsigned i = FTy->getNumParams(), e = CBI->getNumArgOperands();
- i != e; ++i)
+ for (unsigned i = FTy->getNumParams(), e = CBI->arg_size(); i != e; ++i)
pushValueAndType(I.getOperand(i), InstID, Vals); // vararg
}
break;
@@ -3047,7 +3054,11 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
Vals.push_back(VE.getValueID(I.getOperand(0))); // size.
using APV = AllocaPackedValues;
unsigned Record = 0;
- Bitfield::set<APV::Align>(Record, getEncodedAlign(AI.getAlign()));
+ unsigned EncodedAlign = getEncodedAlign(AI.getAlign());
+ Bitfield::set<APV::AlignLower>(
+ Record, EncodedAlign & ((1 << APV::AlignLower::Bits) - 1));
+ Bitfield::set<APV::AlignUpper>(Record,
+ EncodedAlign >> APV::AlignLower::Bits);
Bitfield::set<APV::UsedWithInAlloca>(Record, AI.isUsedWithInAlloca());
Bitfield::set<APV::ExplicitType>(Record, true);
Bitfield::set<APV::SwiftError>(Record, AI.isSwiftError());
@@ -3154,8 +3165,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
// Emit type/value pairs for varargs params.
if (FTy->isVarArg()) {
- for (unsigned i = FTy->getNumParams(), e = CI.getNumArgOperands();
- i != e; ++i)
+ for (unsigned i = FTy->getNumParams(), e = CI.arg_size(); i != e; ++i)
pushValueAndType(CI.getArgOperand(i), InstID, Vals); // varargs
}
break;
@@ -4028,7 +4038,7 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
FSModVTableRefsAbbrev);
for (const GlobalAlias &A : M.aliases()) {
- auto *Aliasee = A.getBaseObject();
+ auto *Aliasee = A.getAliaseeObject();
if (!Aliasee->hasName())
// Nameless functions don't have an entry in the summary, skip it.
continue;
@@ -4141,7 +4151,14 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
// For local linkage, we also emit the original name separately
// immediately after the record.
auto MaybeEmitOriginalName = [&](GlobalValueSummary &S) {
- if (!GlobalValue::isLocalLinkage(S.linkage()))
+ // We don't need to emit the original name if we are writing the index for
+ // distributed backends (in which case ModuleToSummariesForIndex is
+ // non-null). The original name is only needed during the thin link, since
+ // for SamplePGO the indirect call targets for local functions have
+ // the original name annotated in profile.
+ // Continue to emit it when writing out the entire combined index, which is
+ // used in testing the thin link via llvm-lto.
+ if (ModuleToSummariesForIndex || !GlobalValue::isLocalLinkage(S.linkage()))
return;
NameVals.push_back(S.getOriginalName());
Stream.EmitRecord(bitc::FS_COMBINED_ORIGINAL_NAME, NameVals);
@@ -4194,33 +4211,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
}
auto GetValueId = [&](const ValueInfo &VI) -> Optional<unsigned> {
- GlobalValue::GUID GUID = VI.getGUID();
- Optional<unsigned> CallValueId = getValueId(GUID);
- if (CallValueId)
- return CallValueId;
- // For SamplePGO, the indirect call targets for local functions will
- // have its original name annotated in profile. We try to find the
- // corresponding PGOFuncName as the GUID.
- GUID = Index.getGUIDFromOriginalID(GUID);
- if (!GUID)
- return None;
- CallValueId = getValueId(GUID);
- if (!CallValueId)
- return None;
- // The mapping from OriginalId to GUID may return a GUID
- // that corresponds to a static variable. Filter it out here.
- // This can happen when
- // 1) There is a call to a library function which does not have
- // a CallValidId;
- // 2) There is a static variable with the OriginalGUID identical
- // to the GUID of the library function in 1);
- // When this happens, the logic for SamplePGO kicks in and
- // the static variable in 2) will be found, which needs to be
- // filtered out.
- auto *GVSum = Index.getGlobalValueSummary(GUID, false);
- if (GVSum && GVSum->getSummaryKind() == GlobalValueSummary::GlobalVarKind)
- return None;
- return CallValueId;
+ return getValueId(VI.getGUID());
};
auto *FS = cast<FunctionSummary>(S);
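Aside: the alloca change above splits the encoded alignment across an AlignLower and an AlignUpper bitfield because the value no longer fits the original field. A small standalone sketch of that split; the field layout here is made up for illustration and is not the real AllocaPackedValues layout.

#include <cstdint>

// Illustrative layout: five low alignment bits at [0,5), three unrelated flag
// bits at [5,8), and one alignment overflow bit at bit 8.
constexpr unsigned AlignLowerBits = 5;
constexpr unsigned AlignUpperShift = 8;

uint64_t packAlign(uint64_t Record, unsigned EncodedAlign) {
  Record |= EncodedAlign & ((1u << AlignLowerBits) - 1);                // low bits
  Record |= uint64_t(EncodedAlign >> AlignLowerBits) << AlignUpperShift; // overflow bit
  return Record;
}

unsigned unpackAlign(uint64_t Record) {
  unsigned Lower = Record & ((1u << AlignLowerBits) - 1);
  unsigned Upper = unsigned(Record >> AlignUpperShift) & 0x1;
  return (Upper << AlignLowerBits) | Lower; // reassemble the encoded alignment
}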
diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
index d86db61ee1f4..9465a3b11c8f 100644
--- a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -229,8 +229,11 @@ static void predictValueUseListOrderImpl(const Value *V, const Function *F,
// have been read (despite having earlier IDs). Rather than awkwardly
// modeling this behaviour here, orderModule() has assigned IDs to
// initializers of GlobalValues before GlobalValues themselves.
- if (OM.isGlobalValue(LID) && OM.isGlobalValue(RID))
+ if (OM.isGlobalValue(LID) && OM.isGlobalValue(RID)) {
+ if (LID == RID)
+ return LU->getOperandNo() > RU->getOperandNo();
return LID < RID;
+ }
// If ID is 4, then expect: 7 6 5 1 2 3.
if (LID < RID) {
@@ -1036,7 +1039,7 @@ void ValueEnumerator::EnumerateAttributes(AttributeList PAL) {
}
// Do lookups for all attribute groups.
- for (unsigned i = PAL.index_begin(), e = PAL.index_end(); i != e; ++i) {
+ for (unsigned i : PAL.indexes()) {
AttributeSet AS = PAL.getAttributes(i);
if (!AS.hasAttributes())
continue;
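For clarity, a simplified sketch of the tie-break added to predictValueUseListOrderImpl above: when both uses belong to global values with the same ID, the comparison falls back to the operand number so the predicted order is fully determined. The struct below is invented for the example and is not LLVM's representation.

#include <cstdint>

struct UseInfo {
  unsigned UserID;     // position of the user in the predicted order
  unsigned OperandNo;  // which operand of that user holds this use
  bool UserIsGlobal;
};

bool comesBefore(const UseInfo &L, const UseInfo &R) {
  if (L.UserIsGlobal && R.UserIsGlobal) {
    if (L.UserID == R.UserID)
      return L.OperandNo > R.OperandNo; // same global: later operand first
    return L.UserID < R.UserID;
  }
  return L.UserID < R.UserID; // remaining heuristics of the real code elided
}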
diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp
index e5d576d879b5..7d8a73e12d3a 100644
--- a/llvm/lib/CodeGen/Analysis.cpp
+++ b/llvm/lib/CodeGen/Analysis.cpp
@@ -221,9 +221,6 @@ ISD::CondCode llvm::getFCmpCodeWithoutNaN(ISD::CondCode CC) {
}
}
-/// getICmpCondCode - Return the ISD condition code corresponding to
-/// the given LLVM IR integer condition code.
-///
ISD::CondCode llvm::getICmpCondCode(ICmpInst::Predicate Pred) {
switch (Pred) {
case ICmpInst::ICMP_EQ: return ISD::SETEQ;
@@ -241,6 +238,33 @@ ISD::CondCode llvm::getICmpCondCode(ICmpInst::Predicate Pred) {
}
}
+ICmpInst::Predicate llvm::getICmpCondCode(ISD::CondCode Pred) {
+ switch (Pred) {
+ case ISD::SETEQ:
+ return ICmpInst::ICMP_EQ;
+ case ISD::SETNE:
+ return ICmpInst::ICMP_NE;
+ case ISD::SETLE:
+ return ICmpInst::ICMP_SLE;
+ case ISD::SETULE:
+ return ICmpInst::ICMP_ULE;
+ case ISD::SETGE:
+ return ICmpInst::ICMP_SGE;
+ case ISD::SETUGE:
+ return ICmpInst::ICMP_UGE;
+ case ISD::SETLT:
+ return ICmpInst::ICMP_SLT;
+ case ISD::SETULT:
+ return ICmpInst::ICMP_ULT;
+ case ISD::SETGT:
+ return ICmpInst::ICMP_SGT;
+ case ISD::SETUGT:
+ return ICmpInst::ICMP_UGT;
+ default:
+ llvm_unreachable("Invalid ISD integer condition code!");
+ }
+}
+
static bool isNoopBitcast(Type *T1, Type *T2,
const TargetLoweringBase& TLI) {
return T1 == T2 || (T1->isPointerTy() && T2->isPointerTy()) ||
@@ -524,10 +548,8 @@ bool llvm::isInTailCallPosition(const CallBase &Call, const TargetMachine &TM) {
if (&*BBI == &Call)
break;
// Debug info intrinsics do not get in the way of tail call optimization.
- if (isa<DbgInfoIntrinsic>(BBI))
- continue;
// Pseudo probe intrinsics do not block tail call optimization either.
- if (isa<PseudoProbeInst>(BBI))
+ if (BBI->isDebugOrPseudoInst())
continue;
// A lifetime end, assume or noalias.decl intrinsic should not stop tail
// call optimization.
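Usage note, assuming the new overload is declared alongside the existing getICmpCondCode in llvm/CodeGen/Analysis.h: integer predicates map one-to-one onto ISD condition codes, so converting there and back is the identity. A hedged sketch:

#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/IR/Instructions.h"
#include <cassert>

static void roundTrip() {
  llvm::ICmpInst::Predicate P = llvm::ICmpInst::ICMP_ULT;
  llvm::ISD::CondCode CC = llvm::getICmpCondCode(P); // existing helper
  assert(llvm::getICmpCondCode(CC) == P);            // new overload above
  (void)CC;
}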
diff --git a/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp b/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
index db4215e92d44..223840c21d8b 100644
--- a/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -75,7 +75,6 @@ void ARMException::endFunction(const MachineFunction *MF) {
// Emit references to personality.
if (Per) {
MCSymbol *PerSym = Asm->getSymbol(Per);
- Asm->OutStreamer->emitSymbolAttribute(PerSym, MCSA_Global);
ATS.emitPersonality(PerSym);
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index e528d33b5f8c..cc848d28a9a7 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -71,7 +71,6 @@
#include "llvm/IR/GCStrategy.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalIFunc.h"
-#include "llvm/IR/GlobalIndirectSymbol.h"
#include "llvm/IR/GlobalObject.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
@@ -102,6 +101,7 @@
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/SectionKind.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
#include "llvm/Remarks/Remark.h"
#include "llvm/Remarks/RemarkFormat.h"
@@ -115,7 +115,6 @@
#include "llvm/Support/Format.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -275,7 +274,7 @@ bool AsmPrinter::doInitialization(Module &M) {
const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
.getModuleMetadata(M);
- OutStreamer->InitSections(false);
+ OutStreamer->initSections(false, *TM.getMCSubtargetInfo());
if (DisableDebugInfoPrinting)
MMI->setDebugInfoAvailability(false);
@@ -326,16 +325,10 @@ bool AsmPrinter::doInitialization(Module &M) {
// Emit module-level inline asm if it exists.
if (!M.getModuleInlineAsm().empty()) {
- // We're at the module level. Construct MCSubtarget from the default CPU
- // and target triple.
- std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
- TM.getTargetTriple().str(), TM.getTargetCPU(),
- TM.getTargetFeatureString()));
- assert(STI && "Unable to create subtarget info");
OutStreamer->AddComment("Start of file scope inline assembly");
OutStreamer->AddBlankLine();
- emitInlineAsm(M.getModuleInlineAsm() + "\n",
- OutContext.getSubtargetCopy(*STI), TM.Options.MCOptions);
+ emitInlineAsm(M.getModuleInlineAsm() + "\n", *TM.getMCSubtargetInfo(),
+ TM.Options.MCOptions);
OutStreamer->AddComment("End of file scope inline assembly");
OutStreamer->AddBlankLine();
}
@@ -1422,7 +1415,7 @@ void AsmPrinter::emitFunctionBody() {
});
R << "BasicBlock: " << ore::NV("BasicBlock", MBB.getName()) << "\n";
for (auto &KV : MnemonicVec) {
- auto Name = (Twine("INST_") + KV.first.trim()).str();
+ auto Name = (Twine("INST_") + getToken(KV.first.trim()).first).str();
R << KV.first << ": " << ore::NV(Name, KV.second) << "\n";
}
ORE->emit(R);
@@ -1610,14 +1603,13 @@ void AsmPrinter::emitGlobalGOTEquivs() {
emitGlobalVariable(GV);
}
-void AsmPrinter::emitGlobalIndirectSymbol(Module &M,
- const GlobalIndirectSymbol& GIS) {
- MCSymbol *Name = getSymbol(&GIS);
- bool IsFunction = GIS.getValueType()->isFunctionTy();
+void AsmPrinter::emitGlobalAlias(Module &M, const GlobalAlias &GA) {
+ MCSymbol *Name = getSymbol(&GA);
+ bool IsFunction = GA.getValueType()->isFunctionTy();
// Treat bitcasts of functions as functions also. This is important at least
// on WebAssembly where object and function addresses can't alias each other.
if (!IsFunction)
- if (auto *CE = dyn_cast<ConstantExpr>(GIS.getIndirectSymbol()))
+ if (auto *CE = dyn_cast<ConstantExpr>(GA.getAliasee()))
if (CE->getOpcode() == Instruction::BitCast)
IsFunction =
CE->getOperand(0)->getType()->getPointerElementType()->isFunctionTy();
@@ -1627,61 +1619,80 @@ void AsmPrinter::emitGlobalIndirectSymbol(Module &M,
// point, all the extra label is emitted, we just have to emit linkage for
// those labels.
if (TM.getTargetTriple().isOSBinFormatXCOFF()) {
- assert(!isa<GlobalIFunc>(GIS) && "IFunc is not supported on AIX.");
assert(MAI->hasVisibilityOnlyWithLinkage() &&
"Visibility should be handled with emitLinkage() on AIX.");
- emitLinkage(&GIS, Name);
+ emitLinkage(&GA, Name);
// If it's a function, also emit linkage for aliases of function entry
// point.
if (IsFunction)
- emitLinkage(&GIS,
- getObjFileLowering().getFunctionEntryPointSymbol(&GIS, TM));
+ emitLinkage(&GA,
+ getObjFileLowering().getFunctionEntryPointSymbol(&GA, TM));
return;
}
- if (GIS.hasExternalLinkage() || !MAI->getWeakRefDirective())
+ if (GA.hasExternalLinkage() || !MAI->getWeakRefDirective())
OutStreamer->emitSymbolAttribute(Name, MCSA_Global);
- else if (GIS.hasWeakLinkage() || GIS.hasLinkOnceLinkage())
+ else if (GA.hasWeakLinkage() || GA.hasLinkOnceLinkage())
OutStreamer->emitSymbolAttribute(Name, MCSA_WeakReference);
else
- assert(GIS.hasLocalLinkage() && "Invalid alias or ifunc linkage");
+ assert(GA.hasLocalLinkage() && "Invalid alias linkage");
// Set the symbol type to function if the alias has a function type.
// This affects codegen when the aliasee is not a function.
if (IsFunction)
- OutStreamer->emitSymbolAttribute(Name, isa<GlobalIFunc>(GIS)
- ? MCSA_ELF_TypeIndFunction
- : MCSA_ELF_TypeFunction);
+ OutStreamer->emitSymbolAttribute(Name, MCSA_ELF_TypeFunction);
- emitVisibility(Name, GIS.getVisibility());
+ emitVisibility(Name, GA.getVisibility());
- const MCExpr *Expr = lowerConstant(GIS.getIndirectSymbol());
+ const MCExpr *Expr = lowerConstant(GA.getAliasee());
- if (isa<GlobalAlias>(&GIS) && MAI->hasAltEntry() && isa<MCBinaryExpr>(Expr))
+ if (MAI->hasAltEntry() && isa<MCBinaryExpr>(Expr))
OutStreamer->emitSymbolAttribute(Name, MCSA_AltEntry);
// Emit the directives as assignments aka .set:
OutStreamer->emitAssignment(Name, Expr);
- MCSymbol *LocalAlias = getSymbolPreferLocal(GIS);
+ MCSymbol *LocalAlias = getSymbolPreferLocal(GA);
if (LocalAlias != Name)
OutStreamer->emitAssignment(LocalAlias, Expr);
- if (auto *GA = dyn_cast<GlobalAlias>(&GIS)) {
- // If the aliasee does not correspond to a symbol in the output, i.e. the
- // alias is not of an object or the aliased object is private, then set the
- // size of the alias symbol from the type of the alias. We don't do this in
- // other situations as the alias and aliasee having differing types but same
- // size may be intentional.
- const GlobalObject *BaseObject = GA->getBaseObject();
- if (MAI->hasDotTypeDotSizeDirective() && GA->getValueType()->isSized() &&
- (!BaseObject || BaseObject->hasPrivateLinkage())) {
- const DataLayout &DL = M.getDataLayout();
- uint64_t Size = DL.getTypeAllocSize(GA->getValueType());
- OutStreamer->emitELFSize(Name, MCConstantExpr::create(Size, OutContext));
- }
+ // If the aliasee does not correspond to a symbol in the output, i.e. the
+ // alias is not of an object or the aliased object is private, then set the
+ // size of the alias symbol from the type of the alias. We don't do this in
+ // other situations as the alias and aliasee having differing types but same
+ // size may be intentional.
+ const GlobalObject *BaseObject = GA.getAliaseeObject();
+ if (MAI->hasDotTypeDotSizeDirective() && GA.getValueType()->isSized() &&
+ (!BaseObject || BaseObject->hasPrivateLinkage())) {
+ const DataLayout &DL = M.getDataLayout();
+ uint64_t Size = DL.getTypeAllocSize(GA.getValueType());
+ OutStreamer->emitELFSize(Name, MCConstantExpr::create(Size, OutContext));
}
}
+void AsmPrinter::emitGlobalIFunc(Module &M, const GlobalIFunc &GI) {
+ assert(!TM.getTargetTriple().isOSBinFormatXCOFF() &&
+ "IFunc is not supported on AIX.");
+
+ MCSymbol *Name = getSymbol(&GI);
+
+ if (GI.hasExternalLinkage() || !MAI->getWeakRefDirective())
+ OutStreamer->emitSymbolAttribute(Name, MCSA_Global);
+ else if (GI.hasWeakLinkage() || GI.hasLinkOnceLinkage())
+ OutStreamer->emitSymbolAttribute(Name, MCSA_WeakReference);
+ else
+ assert(GI.hasLocalLinkage() && "Invalid ifunc linkage");
+
+ OutStreamer->emitSymbolAttribute(Name, MCSA_ELF_TypeIndFunction);
+ emitVisibility(Name, GI.getVisibility());
+
+ // Emit the directives as assignments aka .set:
+ const MCExpr *Expr = lowerConstant(GI.getResolver());
+ OutStreamer->emitAssignment(Name, Expr);
+ MCSymbol *LocalAlias = getSymbolPreferLocal(GI);
+ if (LocalAlias != Name)
+ OutStreamer->emitAssignment(LocalAlias, Expr);
+}
+
void AsmPrinter::emitRemarksSection(remarks::RemarkStreamer &RS) {
if (!RS.needsSection())
return;
@@ -1815,6 +1826,11 @@ bool AsmPrinter::doFinalization(Module &M) {
}
}
+ // This needs to happen before emitting debug information since that can end
+ // arbitrary sections.
+ if (auto *TS = OutStreamer->getTargetStreamer())
+ TS->emitConstantPools();
+
// Finalize debug and EH information.
for (const HandlerInfo &HI : Handlers) {
NamedRegionTimer T(HI.TimerName, HI.TimerDescription, HI.TimerGroupName,
@@ -1857,11 +1873,11 @@ bool AsmPrinter::doFinalization(Module &M) {
AliasStack.push_back(Cur);
}
for (const GlobalAlias *AncestorAlias : llvm::reverse(AliasStack))
- emitGlobalIndirectSymbol(M, *AncestorAlias);
+ emitGlobalAlias(M, *AncestorAlias);
AliasStack.clear();
}
for (const auto &IFunc : M.ifuncs())
- emitGlobalIndirectSymbol(M, IFunc);
+ emitGlobalIFunc(M, IFunc);
GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
assert(MI && "AsmPrinter didn't require GCModuleInfo?");
@@ -2455,9 +2471,14 @@ void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV) const {
if (Alignment == Align(1))
return; // 1-byte aligned: no need to emit alignment.
- if (getCurrentSection()->getKind().isText())
- OutStreamer->emitCodeAlignment(Alignment.value());
- else
+ if (getCurrentSection()->getKind().isText()) {
+ const MCSubtargetInfo *STI = nullptr;
+ if (this->MF)
+ STI = &getSubtargetInfo();
+ else
+ STI = TM.getMCSubtargetInfo();
+ OutStreamer->emitCodeAlignment(Alignment.value(), STI);
+ } else
OutStreamer->emitValueToAlignment(Alignment.value());
}
@@ -2513,7 +2534,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) {
OS << "Unsupported expression in static initializer: ";
CE->printAsOperand(OS, /*PrintType=*/false,
!MF ? nullptr : MF->getFunction().getParent());
- report_fatal_error(OS.str());
+ report_fatal_error(Twine(OS.str()));
}
case Instruction::GetElementPtr: {
// Generate a symbolic expression for the byte address
@@ -3265,21 +3286,21 @@ void AsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
// reference the block. It is possible that there is more than one label
// here, because multiple LLVM BB's may have been RAUW'd to this block after
// the references were generated.
+ const BasicBlock *BB = MBB.getBasicBlock();
if (MBB.hasAddressTaken()) {
- const BasicBlock *BB = MBB.getBasicBlock();
if (isVerbose())
OutStreamer->AddComment("Block address taken");
// MBBs can have their address taken as part of CodeGen without having
// their corresponding BB's address taken in IR
- if (BB->hasAddressTaken())
+ if (BB && BB->hasAddressTaken())
for (MCSymbol *Sym : MMI->getAddrLabelSymbolToEmit(BB))
OutStreamer->emitLabel(Sym);
}
// Print some verbose block comments.
if (isVerbose()) {
- if (const BasicBlock *BB = MBB.getBasicBlock()) {
+ if (BB) {
if (BB->hasName()) {
BB->printAsOperand(OutStreamer->GetCommentOS(),
/*PrintType=*/false, BB->getModule());
@@ -3538,7 +3559,7 @@ void AsmPrinter::emitXRayTable() {
// pointers. This should work for both 32-bit and 64-bit platforms.
if (FnSledIndex) {
OutStreamer->SwitchSection(FnSledIndex);
- OutStreamer->emitCodeAlignment(2 * WordSizeBytes);
+ OutStreamer->emitCodeAlignment(2 * WordSizeBytes, &getSubtargetInfo());
OutStreamer->emitSymbolValue(SledsStart, WordSizeBytes, false);
OutStreamer->emitSymbolValue(SledsEnd, WordSizeBytes, false);
OutStreamer->SwitchSection(PrevSection);
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index 4a93181f5439..ef1abc47701a 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -30,10 +30,10 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -129,13 +129,16 @@ void AsmPrinter::emitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
}
static void EmitMSInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
- MachineModuleInfo *MMI, AsmPrinter *AP,
- uint64_t LocCookie, raw_ostream &OS) {
+ MachineModuleInfo *MMI, const MCAsmInfo *MAI,
+ AsmPrinter *AP, uint64_t LocCookie,
+ raw_ostream &OS) {
// Switch to the inline assembly variant.
OS << "\t.intel_syntax\n\t";
+ int CurVariant = -1; // The number of the {.|.|.} region we are in.
const char *LastEmitted = AsmStr; // One past the last character emitted.
unsigned NumOperands = MI->getNumOperands();
+ int AsmPrinterVariant = 1; // X86MCAsmInfo.cpp's AsmWriterFlavorTy::Intel.
while (*LastEmitted) {
switch (*LastEmitted) {
@@ -145,8 +148,8 @@ static void EmitMSInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
while (*LiteralEnd && *LiteralEnd != '{' && *LiteralEnd != '|' &&
*LiteralEnd != '}' && *LiteralEnd != '$' && *LiteralEnd != '\n')
++LiteralEnd;
-
- OS.write(LastEmitted, LiteralEnd-LastEmitted);
+ if (CurVariant == -1 || CurVariant == AsmPrinterVariant)
+ OS.write(LastEmitted, LiteralEnd - LastEmitted);
LastEmitted = LiteralEnd;
break;
}
@@ -164,6 +167,27 @@ static void EmitMSInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
case '$':
++LastEmitted; // Consume second '$' character.
break;
+ case '(': // $( -> same as GCC's { character.
+ ++LastEmitted; // Consume '(' character.
+ if (CurVariant != -1)
+ report_fatal_error("Nested variants found in inline asm string: '" +
+ Twine(AsmStr) + "'");
+ CurVariant = 0; // We're in the first variant now.
+ break;
+ case '|':
+ ++LastEmitted; // Consume '|' character.
+ if (CurVariant == -1)
+ OS << '|'; // This is gcc's behavior for | outside a variant.
+ else
+ ++CurVariant; // We're in the next variant.
+ break;
+ case ')': // $) -> same as GCC's } char.
+ ++LastEmitted; // Consume ')' character.
+ if (CurVariant == -1)
+ OS << '}'; // This is gcc's behavior for } outside a variant.
+ else
+ CurVariant = -1;
+ break;
}
if (Done) break;
@@ -176,16 +200,15 @@ static void EmitMSInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
// If we have ${:foo}, then this is not a real operand reference, it is a
// "magic" string reference, just like in .td files. Arrange to call
// PrintSpecial.
- if (HasCurlyBraces && LastEmitted[0] == ':') {
+ if (HasCurlyBraces && *LastEmitted == ':') {
++LastEmitted;
const char *StrStart = LastEmitted;
const char *StrEnd = strchr(StrStart, '}');
if (!StrEnd)
report_fatal_error("Unterminated ${:foo} operand in inline asm"
" string: '" + Twine(AsmStr) + "'");
-
- std::string Val(StrStart, StrEnd);
- AP->PrintSpecial(MI, OS, Val.c_str());
+ if (CurVariant == -1 || CurVariant == AsmPrinterVariant)
+ AP->PrintSpecial(MI, OS, StringRef(StrStart, StrEnd - StrStart));
LastEmitted = StrEnd+1;
break;
}
@@ -201,7 +224,7 @@ static void EmitMSInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
Twine(AsmStr) + "'");
LastEmitted = IDEnd;
- if (Val >= NumOperands-1)
+ if (Val >= NumOperands - 1)
report_fatal_error("Invalid $ operand number in inline asm string: '" +
Twine(AsmStr) + "'");
@@ -228,40 +251,50 @@ static void EmitMSInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
// Okay, we finally have a value number. Ask the target to print this
// operand!
- unsigned OpNo = InlineAsm::MIOp_FirstOperand;
+ if (CurVariant == -1 || CurVariant == AsmPrinterVariant) {
+ unsigned OpNo = InlineAsm::MIOp_FirstOperand;
- bool Error = false;
+ bool Error = false;
- // Scan to find the machine operand number for the operand.
- for (; Val; --Val) {
- if (OpNo >= MI->getNumOperands()) break;
- unsigned OpFlags = MI->getOperand(OpNo).getImm();
- OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
- }
+ // Scan to find the machine operand number for the operand.
+ for (; Val; --Val) {
+ if (OpNo >= MI->getNumOperands())
+ break;
+ unsigned OpFlags = MI->getOperand(OpNo).getImm();
+ OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
+ }
- // We may have a location metadata attached to the end of the
- // instruction, and at no point should see metadata at any
- // other point while processing. It's an error if so.
- if (OpNo >= MI->getNumOperands() ||
- MI->getOperand(OpNo).isMetadata()) {
- Error = true;
- } else {
- unsigned OpFlags = MI->getOperand(OpNo).getImm();
- ++OpNo; // Skip over the ID number.
-
- if (InlineAsm::isMemKind(OpFlags)) {
- Error = AP->PrintAsmMemoryOperand(
- MI, OpNo, Modifier[0] ? Modifier : nullptr, OS);
+ // We may have a location metadata attached to the end of the
+ // instruction, and at no point should see metadata at any
+ // other point while processing. It's an error if so.
+ if (OpNo >= MI->getNumOperands() || MI->getOperand(OpNo).isMetadata()) {
+ Error = true;
} else {
- Error = AP->PrintAsmOperand(MI, OpNo,
- Modifier[0] ? Modifier : nullptr, OS);
+ unsigned OpFlags = MI->getOperand(OpNo).getImm();
+ ++OpNo; // Skip over the ID number.
+
+ // FIXME: Shouldn't arch-independent output template handling go into
+ // PrintAsmOperand?
+ // Labels are target independent.
+ if (MI->getOperand(OpNo).isBlockAddress()) {
+ const BlockAddress *BA = MI->getOperand(OpNo).getBlockAddress();
+ MCSymbol *Sym = AP->GetBlockAddressSymbol(BA);
+ Sym->print(OS, AP->MAI);
+ MMI->getContext().registerInlineAsmLabel(Sym);
+ } else if (InlineAsm::isMemKind(OpFlags)) {
+ Error = AP->PrintAsmMemoryOperand(
+ MI, OpNo, Modifier[0] ? Modifier : nullptr, OS);
+ } else {
+ Error = AP->PrintAsmOperand(MI, OpNo,
+ Modifier[0] ? Modifier : nullptr, OS);
+ }
+ }
+ if (Error) {
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "invalid operand in inline asm: '" << AsmStr << "'";
+ MMI->getModule()->getContext().emitError(LocCookie, Msg.str());
}
- }
- if (Error) {
- std::string msg;
- raw_string_ostream Msg(msg);
- Msg << "invalid operand in inline asm: '" << AsmStr << "'";
- MMI->getModule()->getContext().emitError(LocCookie, Msg.str());
}
break;
}
@@ -274,10 +307,10 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
MachineModuleInfo *MMI, const MCAsmInfo *MAI,
AsmPrinter *AP, uint64_t LocCookie,
raw_ostream &OS) {
- int CurVariant = -1; // The number of the {.|.|.} region we are in.
+ int CurVariant = -1; // The number of the {.|.|.} region we are in.
const char *LastEmitted = AsmStr; // One past the last character emitted.
unsigned NumOperands = MI->getNumOperands();
- int AsmPrinterVariant = MAI->getAssemblerDialect();
+ int AsmPrinterVariant = MMI->getTarget().unqualifiedInlineAsmVariant();
if (MAI->getEmitGNUAsmStartIndentationMarker())
OS << '\t';
@@ -291,7 +324,7 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
*LiteralEnd != '}' && *LiteralEnd != '$' && *LiteralEnd != '\n')
++LiteralEnd;
if (CurVariant == -1 || CurVariant == AsmPrinterVariant)
- OS.write(LastEmitted, LiteralEnd-LastEmitted);
+ OS.write(LastEmitted, LiteralEnd - LastEmitted);
LastEmitted = LiteralEnd;
break;
}
@@ -311,24 +344,24 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
OS << '$';
++LastEmitted; // Consume second '$' character.
break;
- case '(': // $( -> same as GCC's { character.
- ++LastEmitted; // Consume '(' character.
+ case '(': // $( -> same as GCC's { character.
+ ++LastEmitted; // Consume '(' character.
if (CurVariant != -1)
report_fatal_error("Nested variants found in inline asm string: '" +
Twine(AsmStr) + "'");
- CurVariant = 0; // We're in the first variant now.
+ CurVariant = 0; // We're in the first variant now.
break;
case '|':
- ++LastEmitted; // consume '|' character.
+ ++LastEmitted; // Consume '|' character.
if (CurVariant == -1)
- OS << '|'; // this is gcc's behavior for | outside a variant
+ OS << '|'; // This is gcc's behavior for | outside a variant.
else
- ++CurVariant; // We're in the next variant.
+ ++CurVariant; // We're in the next variant.
break;
- case ')': // $) -> same as GCC's } char.
- ++LastEmitted; // consume ')' character.
+ case ')': // $) -> same as GCC's } char.
+ ++LastEmitted; // Consume ')' character.
if (CurVariant == -1)
- OS << '}'; // this is gcc's behavior for } outside a variant
+ OS << '}'; // This is gcc's behavior for } outside a variant.
else
CurVariant = -1;
break;
@@ -351,9 +384,8 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
if (!StrEnd)
report_fatal_error("Unterminated ${:foo} operand in inline asm"
" string: '" + Twine(AsmStr) + "'");
-
- std::string Val(StrStart, StrEnd);
- AP->PrintSpecial(MI, OS, Val.c_str());
+ if (CurVariant == -1 || CurVariant == AsmPrinterVariant)
+ AP->PrintSpecial(MI, OS, StringRef(StrStart, StrEnd - StrStart));
LastEmitted = StrEnd+1;
break;
}
@@ -369,6 +401,10 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
Twine(AsmStr) + "'");
LastEmitted = IDEnd;
+ if (Val >= NumOperands - 1)
+ report_fatal_error("Invalid $ operand number in inline asm string: '" +
+ Twine(AsmStr) + "'");
+
char Modifier[2] = { 0, 0 };
if (HasCurlyBraces) {
@@ -390,10 +426,6 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
++LastEmitted; // Consume '}' character.
}
- if (Val >= NumOperands-1)
- report_fatal_error("Invalid $ operand number in inline asm string: '" +
- Twine(AsmStr) + "'");
-
// Okay, we finally have a value number. Ask the target to print this
// operand!
if (CurVariant == -1 || CurVariant == AsmPrinterVariant) {
@@ -403,7 +435,8 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
// Scan to find the machine operand number for the operand.
for (; Val; --Val) {
- if (OpNo >= MI->getNumOperands()) break;
+ if (OpNo >= MI->getNumOperands())
+ break;
unsigned OpFlags = MI->getOperand(OpNo).getImm();
OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
}
@@ -411,12 +444,11 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
// We may have a location metadata attached to the end of the
// instruction, and at no point should see metadata at any
// other point while processing. It's an error if so.
- if (OpNo >= MI->getNumOperands() ||
- MI->getOperand(OpNo).isMetadata()) {
+ if (OpNo >= MI->getNumOperands() || MI->getOperand(OpNo).isMetadata()) {
Error = true;
} else {
unsigned OpFlags = MI->getOperand(OpNo).getImm();
- ++OpNo; // Skip over the ID number.
+ ++OpNo; // Skip over the ID number.
// FIXME: Shouldn't arch-independent output template handling go into
// PrintAsmOperand?
@@ -429,8 +461,6 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
} else if (MI->getOperand(OpNo).isMBB()) {
const MCSymbol *Sym = MI->getOperand(OpNo).getMBB()->getSymbol();
Sym->print(OS, AP->MAI);
- } else if (Modifier[0] == 'l') {
- Error = true;
} else if (InlineAsm::isMemKind(OpFlags)) {
Error = AP->PrintAsmMemoryOperand(
MI, OpNo, Modifier[0] ? Modifier : nullptr, OS);
@@ -506,7 +536,7 @@ void AsmPrinter::emitInlineAsm(const MachineInstr *MI) const {
if (MI->getInlineAsmDialect() == InlineAsm::AD_ATT)
EmitGCCInlineAsmStr(AsmStr, MI, MMI, MAI, AP, LocCookie, OS);
else
- EmitMSInlineAsmStr(AsmStr, MI, MMI, AP, LocCookie, OS);
+ EmitMSInlineAsmStr(AsmStr, MI, MMI, MAI, AP, LocCookie, OS);
// Emit warnings if we use reserved registers on the clobber list, as
// that might lead to undefined behaviour.
@@ -540,7 +570,7 @@ void AsmPrinter::emitInlineAsm(const MachineInstr *MI) const {
"preserved across the asm statement, and clobbering them may "
"lead to undefined behaviour.";
MMI->getModule()->getContext().diagnose(DiagnosticInfoInlineAsm(
- LocCookie, Msg.c_str(), DiagnosticSeverity::DS_Warning));
+ LocCookie, Msg, DiagnosticSeverity::DS_Warning));
MMI->getModule()->getContext().diagnose(
DiagnosticInfoInlineAsm(LocCookie, Note, DiagnosticSeverity::DS_Note));
}
@@ -560,13 +590,13 @@ void AsmPrinter::emitInlineAsm(const MachineInstr *MI) const {
/// syntax used is ${:comment}. Targets can override this to add support
/// for their own strange codes.
void AsmPrinter::PrintSpecial(const MachineInstr *MI, raw_ostream &OS,
- const char *Code) const {
- if (!strcmp(Code, "private")) {
+ StringRef Code) const {
+ if (Code == "private") {
const DataLayout &DL = MF->getDataLayout();
OS << DL.getPrivateGlobalPrefix();
- } else if (!strcmp(Code, "comment")) {
+ } else if (Code == "comment") {
OS << MAI->getCommentString();
- } else if (!strcmp(Code, "uid")) {
+ } else if (Code == "uid") {
// Comparing the address of MI isn't sufficient, because machineinstrs may
// be allocated to the same address across functions.
@@ -582,7 +612,7 @@ void AsmPrinter::PrintSpecial(const MachineInstr *MI, raw_ostream &OS,
raw_string_ostream Msg(msg);
Msg << "Unknown special formatter '" << Code
<< "' for machine instr: " << *MI;
- report_fatal_error(Msg.str());
+ report_fatal_error(Twine(Msg.str()));
}
}
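As an illustration of what EmitMSInlineAsmStr now shares with the GCC path above: text outside any $( ... $| ... $) region is always emitted, while inside a region only the alternative matching the printer's dialect survives. The function below is a stripped-down standalone sketch, not LLVM code, and ignores operand references.

#include <string>

std::string filterVariants(const std::string &Asm, int AsmPrinterVariant) {
  std::string Out;
  int CurVariant = -1; // -1: not inside a variant region
  for (size_t I = 0; I < Asm.size(); ++I) {
    char C = Asm[I];
    if (C == '$' && I + 1 < Asm.size() &&
        (Asm[I + 1] == '(' || Asm[I + 1] == '|' || Asm[I + 1] == ')')) {
      char K = Asm[++I];
      if (K == '(') {
        CurVariant = 0;    // first alternative
      } else if (K == '|') {
        if (CurVariant == -1)
          Out += '|';      // gcc's behavior for $| outside a region
        else
          ++CurVariant;    // next alternative
      } else {             // K == ')'
        if (CurVariant == -1)
          Out += '}';      // gcc's behavior for $) outside a region
        else
          CurVariant = -1; // leave the region
      }
      continue;
    }
    if (CurVariant == -1 || CurVariant == AsmPrinterVariant)
      Out += C;
  }
  return Out;
}
// e.g. filterVariants("mov $(%eax$|eax$)", 1) yields "mov eax".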
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index bbb0504550c3..85ff84484ced 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -341,7 +341,16 @@ std::string CodeViewDebug::getFullyQualifiedName(const DIScope *Ty) {
TypeIndex CodeViewDebug::getScopeIndex(const DIScope *Scope) {
// No scope means global scope and that uses the zero index.
- if (!Scope || isa<DIFile>(Scope))
+ //
+ // We also use zero index when the scope is a DISubprogram
+ // to suppress the emission of LF_STRING_ID for the function,
+ // which can trigger a link-time error with the linker in
+ // VS2019 version 16.11.2 or newer.
+ // Note, however, skipping the debug info emission for the DISubprogram
+ // is a temporary fix. The root issue here is that we need to figure out
+ // the proper way to encode a function nested in another function
+ // (as introduced by the Fortran 'contains' keyword) in CodeView.
+ if (!Scope || isa<DIFile>(Scope) || isa<DISubprogram>(Scope))
return TypeIndex();
assert(!isa<DIType>(Scope) && "shouldn't make a namespace scope for a type");
@@ -561,6 +570,44 @@ void CodeViewDebug::emitCodeViewMagicVersion() {
OS.emitInt32(COFF::DEBUG_SECTION_MAGIC);
}
+static SourceLanguage MapDWLangToCVLang(unsigned DWLang) {
+ switch (DWLang) {
+ case dwarf::DW_LANG_C:
+ case dwarf::DW_LANG_C89:
+ case dwarf::DW_LANG_C99:
+ case dwarf::DW_LANG_C11:
+ case dwarf::DW_LANG_ObjC:
+ return SourceLanguage::C;
+ case dwarf::DW_LANG_C_plus_plus:
+ case dwarf::DW_LANG_C_plus_plus_03:
+ case dwarf::DW_LANG_C_plus_plus_11:
+ case dwarf::DW_LANG_C_plus_plus_14:
+ return SourceLanguage::Cpp;
+ case dwarf::DW_LANG_Fortran77:
+ case dwarf::DW_LANG_Fortran90:
+ case dwarf::DW_LANG_Fortran95:
+ case dwarf::DW_LANG_Fortran03:
+ case dwarf::DW_LANG_Fortran08:
+ return SourceLanguage::Fortran;
+ case dwarf::DW_LANG_Pascal83:
+ return SourceLanguage::Pascal;
+ case dwarf::DW_LANG_Cobol74:
+ case dwarf::DW_LANG_Cobol85:
+ return SourceLanguage::Cobol;
+ case dwarf::DW_LANG_Java:
+ return SourceLanguage::Java;
+ case dwarf::DW_LANG_D:
+ return SourceLanguage::D;
+ case dwarf::DW_LANG_Swift:
+ return SourceLanguage::Swift;
+ default:
+ // There's no CodeView representation for this language, and CV doesn't
+ // have an "unknown" option for the language field, so we'll use MASM,
+ // as it's very low level.
+ return SourceLanguage::Masm;
+ }
+}
+
void CodeViewDebug::beginModule(Module *M) {
// If module doesn't have named metadata anchors or COFF debug section
// is not available, skip any debug info related stuff.
@@ -574,6 +621,13 @@ void CodeViewDebug::beginModule(Module *M) {
TheCPU = mapArchToCVCPUType(Triple(M->getTargetTriple()).getArch());
+ // Get the current source language.
+ NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
+ const MDNode *Node = *CUs->operands().begin();
+ const auto *CU = cast<DICompileUnit>(Node);
+
+ CurrentSourceLanguage = MapDWLangToCVLang(CU->getSourceLanguage());
+
collectGlobalVariableInfo();
// Check if we should emit type record hashes.
@@ -731,43 +785,6 @@ void CodeViewDebug::emitTypeGlobalHashes() {
}
}
-static SourceLanguage MapDWLangToCVLang(unsigned DWLang) {
- switch (DWLang) {
- case dwarf::DW_LANG_C:
- case dwarf::DW_LANG_C89:
- case dwarf::DW_LANG_C99:
- case dwarf::DW_LANG_C11:
- case dwarf::DW_LANG_ObjC:
- return SourceLanguage::C;
- case dwarf::DW_LANG_C_plus_plus:
- case dwarf::DW_LANG_C_plus_plus_03:
- case dwarf::DW_LANG_C_plus_plus_11:
- case dwarf::DW_LANG_C_plus_plus_14:
- return SourceLanguage::Cpp;
- case dwarf::DW_LANG_Fortran77:
- case dwarf::DW_LANG_Fortran90:
- case dwarf::DW_LANG_Fortran03:
- case dwarf::DW_LANG_Fortran08:
- return SourceLanguage::Fortran;
- case dwarf::DW_LANG_Pascal83:
- return SourceLanguage::Pascal;
- case dwarf::DW_LANG_Cobol74:
- case dwarf::DW_LANG_Cobol85:
- return SourceLanguage::Cobol;
- case dwarf::DW_LANG_Java:
- return SourceLanguage::Java;
- case dwarf::DW_LANG_D:
- return SourceLanguage::D;
- case dwarf::DW_LANG_Swift:
- return SourceLanguage::Swift;
- default:
- // There's no CodeView representation for this language, and CV doesn't
- // have an "unknown" option for the language field, so we'll use MASM,
- // as it's very low level.
- return SourceLanguage::Masm;
- }
-}
-
namespace {
struct Version {
int Part[4];
@@ -797,12 +814,8 @@ void CodeViewDebug::emitCompilerInformation() {
MCSymbol *CompilerEnd = beginSymbolRecord(SymbolKind::S_COMPILE3);
uint32_t Flags = 0;
- NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
- const MDNode *Node = *CUs->operands().begin();
- const auto *CU = cast<DICompileUnit>(Node);
-
// The low byte of the flags indicates the source language.
- Flags = MapDWLangToCVLang(CU->getSourceLanguage());
+ Flags = CurrentSourceLanguage;
// TODO: Figure out which other flags need to be set.
if (MMI->getModule()->getProfileSummary(/*IsCS*/ false) != nullptr) {
Flags |= static_cast<uint32_t>(CompileSym3Flags::PGO);
@@ -814,6 +827,10 @@ void CodeViewDebug::emitCompilerInformation() {
OS.AddComment("CPUType");
OS.emitInt16(static_cast<uint64_t>(TheCPU));
+ NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
+ const MDNode *Node = *CUs->operands().begin();
+ const auto *CU = cast<DICompileUnit>(Node);
+
StringRef CompilerVersion = CU->getProducer();
Version FrontVer = parseVersion(CompilerVersion);
OS.AddComment("Frontend version");
@@ -1573,6 +1590,8 @@ TypeIndex CodeViewDebug::lowerType(const DIType *Ty, const DIType *ClassTy) {
return lowerTypeClass(cast<DICompositeType>(Ty));
case dwarf::DW_TAG_union_type:
return lowerTypeUnion(cast<DICompositeType>(Ty));
+ case dwarf::DW_TAG_string_type:
+ return lowerTypeString(cast<DIStringType>(Ty));
case dwarf::DW_TAG_unspecified_type:
if (Ty->getName() == "decltype(nullptr)")
return TypeIndex::NullptrT();
@@ -1617,14 +1636,19 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
const DISubrange *Subrange = cast<DISubrange>(Element);
int64_t Count = -1;
- // Calculate the count if either LowerBound is absent or is zero and
- // either of Count or UpperBound are constant.
- auto *LI = Subrange->getLowerBound().dyn_cast<ConstantInt *>();
- if (!Subrange->getRawLowerBound() || (LI && (LI->getSExtValue() == 0))) {
- if (auto *CI = Subrange->getCount().dyn_cast<ConstantInt*>())
- Count = CI->getSExtValue();
- else if (auto *UI = Subrange->getUpperBound().dyn_cast<ConstantInt*>())
- Count = UI->getSExtValue() + 1; // LowerBound is zero
+
+ // If Subrange has a Count field, use it.
+ // Otherwise, if it has an upperbound, use (upperbound - lowerbound + 1),
+ // where lowerbound is from the LowerBound field of the Subrange,
+ // or the language default lowerbound if that field is unspecified.
+ if (auto *CI = Subrange->getCount().dyn_cast<ConstantInt *>())
+ Count = CI->getSExtValue();
+ else if (auto *UI = Subrange->getUpperBound().dyn_cast<ConstantInt *>()) {
+ // Fortran uses 1 as the default lowerbound; other languages use 0.
+ int64_t Lowerbound = (moduleIsInFortran()) ? 1 : 0;
+ auto *LI = Subrange->getLowerBound().dyn_cast<ConstantInt *>();
+ Lowerbound = (LI) ? LI->getSExtValue() : Lowerbound;
+ Count = UI->getSExtValue() - Lowerbound + 1;
}
// Forward declarations of arrays without a size and VLAs use a count of -1.
@@ -1650,6 +1674,26 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
return ElementTypeIndex;
}
+// This function lowers a Fortran character type (DIStringType).
+// Note that it handles only the character*n variant (using SizeInBits
+// field in DIString to describe the type size) at the moment.
+// Other variants (leveraging the StringLength and StringLengthExp
+// fields in DIStringType) remain TBD.
+TypeIndex CodeViewDebug::lowerTypeString(const DIStringType *Ty) {
+ TypeIndex CharType = TypeIndex(SimpleTypeKind::NarrowCharacter);
+ uint64_t ArraySize = Ty->getSizeInBits() >> 3;
+ StringRef Name = Ty->getName();
+ // IndexType is size_t, which depends on the bitness of the target.
+ TypeIndex IndexType = getPointerSizeInBytes() == 8
+ ? TypeIndex(SimpleTypeKind::UInt64Quad)
+ : TypeIndex(SimpleTypeKind::UInt32Long);
+
+ // Create a type of character array of ArraySize.
+ ArrayRecord AR(CharType, IndexType, ArraySize, Name);
+
+ return TypeTable.writeLeafType(AR);
+}
+
TypeIndex CodeViewDebug::lowerTypeBasic(const DIBasicType *Ty) {
TypeIndex Index;
dwarf::TypeKind Kind;
@@ -1728,9 +1772,14 @@ TypeIndex CodeViewDebug::lowerTypeBasic(const DIBasicType *Ty) {
}
// Apply some fixups based on the source-level type name.
- if (STK == SimpleTypeKind::Int32 && Ty->getName() == "long int")
+ // Include some amount of canonicalization from an old naming scheme Clang
+ // used to use for integer types (in an outdated effort to be compatible with
+ // GCC's debug info/GDB's behavior, which has since been addressed).
+ if (STK == SimpleTypeKind::Int32 &&
+ (Ty->getName() == "long int" || Ty->getName() == "long"))
STK = SimpleTypeKind::Int32Long;
- if (STK == SimpleTypeKind::UInt32 && Ty->getName() == "long unsigned int")
+ if (STK == SimpleTypeKind::UInt32 && (Ty->getName() == "long unsigned int" ||
+ Ty->getName() == "unsigned long"))
STK = SimpleTypeKind::UInt32Long;
if (STK == SimpleTypeKind::UInt16Short &&
(Ty->getName() == "wchar_t" || Ty->getName() == "__wchar_t"))
@@ -2177,6 +2226,7 @@ void CodeViewDebug::clear() {
TypeIndices.clear();
CompleteTypeIndices.clear();
ScopeGlobals.clear();
+ CVGlobalVariableOffsets.clear();
}
void CodeViewDebug::collectMemberInfo(ClassInfo &Info,
@@ -3062,6 +3112,15 @@ void CodeViewDebug::collectGlobalVariableInfo() {
const DIGlobalVariable *DIGV = GVE->getVariable();
const DIExpression *DIE = GVE->getExpression();
+ if ((DIE->getNumElements() == 2) &&
+ (DIE->getElement(0) == dwarf::DW_OP_plus_uconst))
+ // Record the constant offset for the variable.
+ //
+ // A Fortran common block uses this idiom to encode the offset
+ // of a variable from the common block's starting address.
+ CVGlobalVariableOffsets.insert(
+ std::make_pair(DIGV, DIE->getElement(1)));
+
// Emit constant global variables in a global symbol section.
if (GlobalMap.count(GVE) == 0 && DIE->isConstant()) {
CVGlobalVariable CVGV = {DIGV, DIE};
@@ -3226,7 +3285,11 @@ void CodeViewDebug::emitDebugInfoForGlobal(const CVGlobalVariable &CVGV) {
if (const auto *MemberDecl = dyn_cast_or_null<DIDerivedType>(
DIGV->getRawStaticDataMemberDeclaration()))
Scope = MemberDecl->getScope();
- std::string QualifiedName = getFullyQualifiedName(Scope, DIGV->getName());
+ // For Fortran, the scoping portion is elided in its name so that we can
+ // reference the variable in the command line of the VS debugger.
+ std::string QualifiedName =
+ (moduleIsInFortran()) ? std::string(DIGV->getName())
+ : getFullyQualifiedName(Scope, DIGV->getName());
if (const GlobalVariable *GV =
CVGV.GVInfo.dyn_cast<const GlobalVariable *>()) {
@@ -3242,7 +3305,13 @@ void CodeViewDebug::emitDebugInfoForGlobal(const CVGlobalVariable &CVGV) {
OS.AddComment("Type");
OS.emitInt32(getCompleteTypeIndex(DIGV->getType()).getIndex());
OS.AddComment("DataOffset");
- OS.EmitCOFFSecRel32(GVSym, /*Offset=*/0);
+
+ uint64_t Offset = 0;
+ if (CVGlobalVariableOffsets.find(DIGV) != CVGlobalVariableOffsets.end())
+ // Use the offset seen while collecting info on globals.
+ Offset = CVGlobalVariableOffsets[DIGV];
+ OS.EmitCOFFSecRel32(GVSym, Offset);
+
OS.AddComment("Segment");
OS.EmitCOFFSectionIndex(GVSym);
OS.AddComment("Name");
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
index d133474ee5aa..6f88e15ee8fe 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -186,6 +186,13 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
};
FunctionInfo *CurFn = nullptr;
+ codeview::SourceLanguage CurrentSourceLanguage =
+ codeview::SourceLanguage::Masm;
+
+ // This map records the constant offset in DIExpression of the
+ // DIGlobalVariableExpression referencing the DIGlobalVariable.
+ DenseMap<const DIGlobalVariable *, uint64_t> CVGlobalVariableOffsets;
+
// Map used to separate variables according to the lexical scope they belong
// in. This is populated by recordLocalVariable() before
// collectLexicalBlocks() separates the variables between the FunctionInfo
@@ -400,6 +407,7 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
codeview::TypeIndex lowerType(const DIType *Ty, const DIType *ClassTy);
codeview::TypeIndex lowerTypeAlias(const DIDerivedType *Ty);
codeview::TypeIndex lowerTypeArray(const DICompositeType *Ty);
+ codeview::TypeIndex lowerTypeString(const DIStringType *Ty);
codeview::TypeIndex lowerTypeBasic(const DIBasicType *Ty);
codeview::TypeIndex lowerTypePointer(
const DIDerivedType *Ty,
@@ -464,6 +472,11 @@ protected:
/// Gather post-function debug information.
void endFunctionImpl(const MachineFunction *) override;
+ /// Check if the current module is in Fortran.
+ bool moduleIsInFortran() {
+ return CurrentSourceLanguage == codeview::SourceLanguage::Fortran;
+ }
+
public:
CodeViewDebug(AsmPrinter *AP);
diff --git a/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp b/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
index 802f0e880514..5f4ee747fcca 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
@@ -93,19 +93,15 @@ void DIEHash::addParentContext(const DIE &Parent) {
// Reverse iterate over our list to go from the outermost construct to the
// innermost.
- for (SmallVectorImpl<const DIE *>::reverse_iterator I = Parents.rbegin(),
- E = Parents.rend();
- I != E; ++I) {
- const DIE &Die = **I;
-
+ for (const DIE *Die : llvm::reverse(Parents)) {
// ... Append the letter "C" to the sequence...
addULEB128('C');
// ... Followed by the DWARF tag of the construct...
- addULEB128(Die.getTag());
+ addULEB128(Die->getTag());
// ... Then the name, taken from the DW_AT_name attribute.
- StringRef Name = getDIEStringAttr(Die, dwarf::DW_AT_name);
+ StringRef Name = getDIEStringAttr(*Die, dwarf::DW_AT_name);
LLVM_DEBUG(dbgs() << "... adding context: " << Name << "\n");
if (!Name.empty())
addString(Name);
diff --git a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
index bb24f1414ef1..dd795079ac1a 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
@@ -252,8 +252,8 @@ void DbgValueHistoryMap::trimLocationRanges(
// Now actually remove the entries. Iterate backwards so that our remaining
// ToRemove indices are valid after each erase.
- for (auto Itr = ToRemove.rbegin(), End = ToRemove.rend(); Itr != End; ++Itr)
- HistoryMapEntries.erase(HistoryMapEntries.begin() + *Itr);
+ for (EntryIndex Idx : llvm::reverse(ToRemove))
+ HistoryMapEntries.erase(HistoryMapEntries.begin() + Idx);
}
}
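Both loops above are mechanical conversions to llvm::reverse (from llvm/ADT/STLExtras.h). A minimal standalone equivalent of the trimLocationRanges case, using a plain vector for illustration:

#include "llvm/ADT/STLExtras.h"
#include <vector>

void eraseMarked(std::vector<int> &Entries,
                 const std::vector<unsigned> &ToRemove) {
  // Walk the indices backwards so earlier erasures don't invalidate the
  // indices that remain to be processed.
  for (unsigned Idx : llvm::reverse(ToRemove))
    Entries.erase(Entries.begin() + Idx);
}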
diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index c81288c0e460..4df34d2c9402 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -174,21 +174,26 @@ uint64_t DebugHandlerBase::getBaseTypeSize(const DIType *Ty) {
}
bool DebugHandlerBase::isUnsignedDIType(const DIType *Ty) {
- // SROA may generate dbg value intrinsics to assign an unsigned value to a
- // Fortran CHARACTER(1) type variables. Make them as unsigned.
if (isa<DIStringType>(Ty)) {
- assert((Ty->getSizeInBits()) == 8 && "Not a valid unsigned type!");
+ // Some transformations (e.g. instcombine) may decide to turn a Fortran
+ // character object into an integer, and later ones (e.g. SROA) may
+ // further inject a constant integer in a llvm.dbg.value call to track
+ // the object's value. Here we trust the transformations are doing the
+ // right thing, and treat the constant as unsigned to preserve that value
+ // (i.e. avoid sign extension).
return true;
}
- if (auto *CTy = dyn_cast<DICompositeType>(Ty)) {
- // FIXME: Enums without a fixed underlying type have unknown signedness
- // here, leading to incorrectly emitted constants.
- if (CTy->getTag() == dwarf::DW_TAG_enumeration_type)
- return false;
- // (Pieces of) aggregate types that get hacked apart by SROA may be
- // represented by a constant. Encode them as unsigned bytes.
- return true;
+ if (auto *CTy = dyn_cast<DICompositeType>(Ty)) {
+ if (CTy->getTag() == dwarf::DW_TAG_enumeration_type) {
+ if (!(Ty = CTy->getBaseType()))
+ // FIXME: Enums without a fixed underlying type have unknown signedness
+ // here, leading to incorrectly emitted constants.
+ return false;
+ } else
+ // (Pieces of) aggregate types that get hacked apart by SROA may be
+ // represented by a constant. Encode them as unsigned bytes.
+ return true;
}
if (auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h
index 62ebadaf3cbe..d7ab2091967f 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h
@@ -158,7 +158,7 @@ public:
friend bool operator<(const DbgValueLoc &, const DbgValueLoc &);
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void dump() const {
- for (DbgValueLocEntry DV : ValueLocEntries)
+ for (const DbgValueLocEntry &DV : ValueLocEntries)
DV.dump();
if (Expression)
Expression->dump();
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index faa14dca1c3f..922c91840520 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -143,8 +143,6 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(
auto *GVContext = GV->getScope();
const DIType *GTy = GV->getType();
- // Construct the context before querying for the existence of the DIE in
- // case such construction creates the DIE.
auto *CB = GVContext ? dyn_cast<DICommonBlock>(GVContext) : nullptr;
DIE *ContextDIE = CB ? getOrCreateCommonBlock(CB, GlobalExprs)
: getOrCreateContextDIE(GVContext);
@@ -183,6 +181,8 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(
else
addGlobalName(GV->getName(), *VariableDIE, DeclContext);
+ addAnnotation(*VariableDIE, GV->getAnnotations());
+
if (uint32_t AlignInBytes = GV->getAlignInBytes())
addUInt(*VariableDIE, dwarf::DW_AT_alignment, dwarf::DW_FORM_udata,
AlignInBytes);
@@ -260,14 +260,14 @@ void DwarfCompileUnit::addLocationAttribute(
if (Global) {
const MCSymbol *Sym = Asm->getSymbol(Global);
+ unsigned PointerSize = Asm->getDataLayout().getPointerSize();
+ assert((PointerSize == 4 || PointerSize == 8) &&
+ "Add support for other sizes if necessary");
if (Global->isThreadLocal()) {
if (Asm->TM.useEmulatedTLS()) {
// TODO: add debug info for emulated thread local mode.
} else {
// FIXME: Make this work with -gsplit-dwarf.
- unsigned PointerSize = Asm->getDataLayout().getPointerSize();
- assert((PointerSize == 4 || PointerSize == 8) &&
- "Add support for other sizes if necessary");
// Based on GCC's support for TLS:
if (!DD->useSplitDwarf()) {
// 1) Start with a constNu of the appropriate pointer size
@@ -290,6 +290,24 @@ void DwarfCompileUnit::addLocationAttribute(
DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address
: dwarf::DW_OP_form_tls_address);
}
+ } else if (Asm->TM.getRelocationModel() == Reloc::RWPI ||
+ Asm->TM.getRelocationModel() == Reloc::ROPI_RWPI) {
+ // Constant
+ addUInt(*Loc, dwarf::DW_FORM_data1,
+ PointerSize == 4 ? dwarf::DW_OP_const4u
+ : dwarf::DW_OP_const8u);
+ // Relocation offset
+ addExpr(*Loc, PointerSize == 4 ? dwarf::DW_FORM_data4
+ : dwarf::DW_FORM_data8,
+ Asm->getObjFileLowering().getIndirectSymViaRWPI(Sym));
+ // Base register
+ Register BaseReg = Asm->getObjFileLowering().getStaticBase();
+ BaseReg = Asm->TM.getMCRegisterInfo()->getDwarfRegNum(BaseReg, false);
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + BaseReg);
+ // Offset from base register
+ addSInt(*Loc, dwarf::DW_FORM_sdata, 0);
+ // Operation
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
} else {
DD->addArangeLabel(SymbolCU(this, Sym));
addOpAddress(*Loc, Sym);
@@ -331,12 +349,10 @@ void DwarfCompileUnit::addLocationAttribute(
DIE *DwarfCompileUnit::getOrCreateCommonBlock(
const DICommonBlock *CB, ArrayRef<GlobalExpr> GlobalExprs) {
- // Construct the context before querying for the existence of the DIE in case
- // such construction creates the DIE.
- DIE *ContextDIE = getOrCreateContextDIE(CB->getScope());
-
+ // Check for pre-existence.
if (DIE *NDie = getDIE(CB))
return NDie;
+ DIE *ContextDIE = getOrCreateContextDIE(CB->getScope());
DIE &NDie = createAndAddDIE(dwarf::DW_TAG_common_block, *ContextDIE, CB);
StringRef Name = CB->getName().empty() ? "_BLNK_" : CB->getName();
addString(NDie, dwarf::DW_AT_name, Name);
@@ -351,7 +367,8 @@ DIE *DwarfCompileUnit::getOrCreateCommonBlock(
void DwarfCompileUnit::addRange(RangeSpan Range) {
DD->insertSectionLabel(Range.Begin);
- bool SameAsPrevCU = this == DD->getPrevCU();
+ auto *PrevCU = DD->getPrevCU();
+ bool SameAsPrevCU = this == PrevCU;
DD->setPrevCU(this);
// If we have no current ranges just add the range and return, otherwise,
// check the current section and CU against the previous section and CU we
@@ -360,6 +377,9 @@ void DwarfCompileUnit::addRange(RangeSpan Range) {
if (CURanges.empty() || !SameAsPrevCU ||
(&CURanges.back().End->getSection() !=
&Range.End->getSection())) {
+ // Before a new range is added, always terminate the prior line table.
+ if (PrevCU)
+ DD->terminateLineTable(PrevCU);
CURanges.push_back(Range);
return;
}
@@ -470,7 +490,6 @@ DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP) {
addSInt(*Loc, dwarf::DW_FORM_sdata, TI_GLOBAL_RELOC);
if (!isDwoUnit()) {
addLabel(*Loc, dwarf::DW_FORM_data4, SPSym);
- DD->addArangeLabel(SymbolCU(this, SPSym));
} else {
// FIXME: when writing dwo, we need to avoid relocations. Probably
// the "right" solution is to treat globals the way func and data
@@ -961,9 +980,7 @@ sortLocalVars(SmallVectorImpl<DbgVariable *> &Input) {
bool visitedAllDependencies = Item.getInt();
WorkList.pop_back();
- // Dependency is in a different lexical scope or a global.
- if (!Var)
- continue;
+ assert(Var);
// Already handled.
if (Visited.count(Var))
@@ -987,8 +1004,10 @@ sortLocalVars(SmallVectorImpl<DbgVariable *> &Input) {
// visited again after all of its dependencies are handled.
WorkList.push_back({Var, 1});
for (auto *Dependency : dependencies(Var)) {
- auto Dep = dyn_cast_or_null<const DILocalVariable>(Dependency);
- WorkList.push_back({DbgVar[Dep], 0});
+ // Don't add dependency if it is in a different lexical scope or a global.
+ if (const auto *Dep = dyn_cast<const DILocalVariable>(Dependency))
+ if (DbgVariable *Var = DbgVar.lookup(Dep))
+ WorkList.push_back({Var, 0});
}
}
return Result;
@@ -1103,9 +1122,10 @@ void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
// shouldn't be found by lookup.
AbsDef = &ContextCU->createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, nullptr);
ContextCU->applySubprogramAttributesToDefinition(SP, *AbsDef);
-
- if (!ContextCU->includeMinimalInlineScopes())
- ContextCU->addUInt(*AbsDef, dwarf::DW_AT_inline, None, dwarf::DW_INL_inlined);
+ ContextCU->addSInt(*AbsDef, dwarf::DW_AT_inline,
+ DD->getDwarfVersion() <= 4 ? Optional<dwarf::Form>()
+ : dwarf::DW_FORM_implicit_const,
+ dwarf::DW_INL_inlined);
if (DIE *ObjectPointer = ContextCU->createAndAddScopeChildren(Scope, *AbsDef))
ContextCU->addDIEEntry(*AbsDef, dwarf::DW_AT_object_pointer, *ObjectPointer);
}
@@ -1162,7 +1182,7 @@ DwarfCompileUnit::getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const {
}
DIE &DwarfCompileUnit::constructCallSiteEntryDIE(DIE &ScopeDIE,
- DIE *CalleeDIE,
+ const DISubprogram *CalleeSP,
bool IsTail,
const MCSymbol *PCAddr,
const MCSymbol *CallAddr,
@@ -1176,7 +1196,8 @@ DIE &DwarfCompileUnit::constructCallSiteEntryDIE(DIE &ScopeDIE,
addAddress(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_target),
MachineLocation(CallReg));
} else {
- assert(CalleeDIE && "No DIE for call site entry origin");
+ DIE *CalleeDIE = getOrCreateSubprogramDIE(CalleeSP);
+ assert(CalleeDIE && "Could not create DIE for call site entry origin");
addDIEEntry(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_origin),
*CalleeDIE);
}
@@ -1265,6 +1286,16 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE(
if (!Name.empty())
addString(*IMDie, dwarf::DW_AT_name, Name);
+ // This is for imported module with renamed entities (such as variables and
+ // subprograms).
+ DINodeArray Elements = Module->getElements();
+ for (const auto *Element : Elements) {
+ if (!Element)
+ continue;
+ IMDie->addChild(
+ constructImportedEntityDIE(cast<DIImportedEntity>(Element)));
+ }
+
return IMDie;
}
@@ -1479,10 +1510,12 @@ void DwarfCompileUnit::applyVariableAttributes(const DbgVariable &Var,
if (!Name.empty())
addString(VariableDie, dwarf::DW_AT_name, Name);
const auto *DIVar = Var.getVariable();
- if (DIVar)
+ if (DIVar) {
if (uint32_t AlignInBytes = DIVar->getAlignInBytes())
addUInt(VariableDie, dwarf::DW_AT_alignment, dwarf::DW_FORM_udata,
AlignInBytes);
+ addAnnotation(VariableDie, DIVar->getAnnotations());
+ }
addSourceLine(VariableDie, DIVar);
addType(VariableDie, Var.getType());
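
The reordered getOrCreateCommonBlock above now checks the CU's DIE map before building the context DIE. A rough standalone sketch of that create-or-reuse shape (Node, Die and the map here are placeholders, not the real DICommonBlock/DIE classes):

    #include <map>
    #include <memory>
    #include <vector>

    struct Node { const Node *Scope = nullptr; };            // stand-in for a DI node
    struct Die  { std::vector<std::unique_ptr<Die>> Children; };

    struct Unit {
      Die Root;
      std::map<const Node *, Die *> MDNodeToDie;             // stand-in for the CU's DIE map

      Die *getOrCreateDie(const Node *N) {
        if (!N)
          return &Root;
        // Check for pre-existence before constructing anything, mirroring the
        // reordered getOrCreateCommonBlock.
        if (Die *Existing = MDNodeToDie[N])
          return Existing;
        Die *Context = getOrCreateDie(N->Scope);             // parent first
        Context->Children.push_back(std::make_unique<Die>());
        return MDNodeToDie[N] = Context->Children.back().get();
      }
    };
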
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index 6d8186a5ee2b..6e9261087686 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -249,16 +249,14 @@ public:
dwarf::LocationAtom getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const;
/// Construct a call site entry DIE describing a call within \p Scope to a
- /// callee described by \p CalleeDIE.
- /// \p CalleeDIE is a declaration or definition subprogram DIE for the callee.
- /// For indirect calls \p CalleeDIE is set to nullptr.
+ /// callee described by \p CalleeSP.
/// \p IsTail specifies whether the call is a tail call.
/// \p PCAddr points to the PC value after the call instruction.
/// \p CallAddr points to the PC value at the call instruction (or is null).
/// \p CallReg is a register location for an indirect call. For direct calls
/// the \p CallReg is set to 0.
- DIE &constructCallSiteEntryDIE(DIE &ScopeDIE, DIE *CalleeDIE, bool IsTail,
- const MCSymbol *PCAddr,
+ DIE &constructCallSiteEntryDIE(DIE &ScopeDIE, const DISubprogram *CalleeSP,
+ bool IsTail, const MCSymbol *PCAddr,
const MCSymbol *CallAddr, unsigned CallReg);
/// Construct call site parameter DIEs for the \p CallSiteDIE. The \p Params
/// were collected by the \ref collectCallSiteParameters.
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index ee14423ca3d0..047676d4c11e 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -480,7 +480,7 @@ static bool hasObjCCategory(StringRef Name) {
if (!isObjCClass(Name))
return false;
- return Name.find(") ") != StringRef::npos;
+ return Name.contains(") ");
}
static void getObjCClassCategory(StringRef In, StringRef &Class,
@@ -587,14 +587,6 @@ void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU,
}
}
-DIE &DwarfDebug::constructSubprogramDefinitionDIE(const DISubprogram *SP) {
- DICompileUnit *Unit = SP->getUnit();
- assert(SP->isDefinition() && "Subprogram not a definition");
- assert(Unit && "Subprogram definition without parent unit");
- auto &CU = getOrCreateDwarfCompileUnit(Unit);
- return *CU.getOrCreateSubprogramDIE(SP);
-}
-
/// Represents a parameter whose call site value can be described by applying a
/// debug expression to a register in the forwarded register worklist.
struct FwdRegParamInfo {
@@ -945,7 +937,7 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
continue;
unsigned CallReg = 0;
- DIE *CalleeDIE = nullptr;
+ const DISubprogram *CalleeSP = nullptr;
const Function *CalleeDecl = nullptr;
if (CalleeOp.isReg()) {
CallReg = CalleeOp.getReg();
@@ -955,19 +947,7 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
CalleeDecl = dyn_cast<Function>(CalleeOp.getGlobal());
if (!CalleeDecl || !CalleeDecl->getSubprogram())
continue;
- const DISubprogram *CalleeSP = CalleeDecl->getSubprogram();
-
- if (CalleeSP->isDefinition()) {
- // Ensure that a subprogram DIE for the callee is available in the
- // appropriate CU.
- CalleeDIE = &constructSubprogramDefinitionDIE(CalleeSP);
- } else {
- // Create the declaration DIE if it is missing. This is required to
- // support compilation of old bitcode with an incomplete list of
- // retained metadata.
- CalleeDIE = CU.getOrCreateSubprogramDIE(CalleeSP);
- }
- assert(CalleeDIE && "Must have a DIE for the callee");
+ CalleeSP = CalleeDecl->getSubprogram();
}
// TODO: Omit call site entries for runtime calls (objc_msgSend, etc).
@@ -1004,7 +984,7 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
<< (IsTail ? " [IsTail]" : "") << "\n");
DIE &CallSiteDIE = CU.constructCallSiteEntryDIE(
- ScopeDIE, CalleeDIE, IsTail, PCAddr, CallAddr, CallReg);
+ ScopeDIE, CalleeSP, IsTail, PCAddr, CallAddr, CallReg);
// Optionally emit call-site-param debug info.
if (emitDebugEntryValues()) {
@@ -1427,6 +1407,10 @@ void DwarfDebug::finalizeModuleInfo() {
// Emit all Dwarf sections that should come after the content.
void DwarfDebug::endModule() {
+ // Terminate the pending line table.
+ if (PrevCU)
+ terminateLineTable(PrevCU);
+ PrevCU = nullptr;
assert(CurFn == nullptr);
assert(CurMI == nullptr);
@@ -2102,12 +2086,22 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
static DebugLoc findPrologueEndLoc(const MachineFunction *MF) {
// First known non-DBG_VALUE and non-frame setup location marks
// the beginning of the function body.
- for (const auto &MBB : *MF)
- for (const auto &MI : MBB)
+ DebugLoc LineZeroLoc;
+ for (const auto &MBB : *MF) {
+ for (const auto &MI : MBB) {
if (!MI.isMetaInstruction() && !MI.getFlag(MachineInstr::FrameSetup) &&
- MI.getDebugLoc())
- return MI.getDebugLoc();
- return DebugLoc();
+ MI.getDebugLoc()) {
+ // Scan forward to try to find a non-zero line number. The prologue_end
+ // marks the first breakpoint in the function after the frame setup, and
+ // a compiler-generated line 0 location is not a meaningful breakpoint.
+ // If none is found, return the first location after the frame setup.
+ if (MI.getDebugLoc().getLine())
+ return MI.getDebugLoc();
+ LineZeroLoc = MI.getDebugLoc();
+ }
+ }
+ }
+ return LineZeroLoc;
}
/// Register a source line with debug info. Returns the unique label that was
@@ -2162,24 +2156,42 @@ void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) {
DwarfCompileUnit &CU = getOrCreateDwarfCompileUnit(SP->getUnit());
+ Asm->OutStreamer->getContext().setDwarfCompileUnitID(
+ getDwarfCompileUnitIDForLineTable(CU));
+
+ // Record beginning of function.
+ PrologEndLoc = emitInitialLocDirective(
+ *MF, Asm->OutStreamer->getContext().getDwarfCompileUnitID());
+}
+
+unsigned
+DwarfDebug::getDwarfCompileUnitIDForLineTable(const DwarfCompileUnit &CU) {
// Set DwarfDwarfCompileUnitID in MCContext to the Compile Unit this function
// belongs to so that we add to the correct per-cu line table in the
// non-asm case.
if (Asm->OutStreamer->hasRawTextSupport())
// Use a single line table if we are generating assembly.
- Asm->OutStreamer->getContext().setDwarfCompileUnitID(0);
+ return 0;
else
- Asm->OutStreamer->getContext().setDwarfCompileUnitID(CU.getUniqueID());
+ return CU.getUniqueID();
+}
- // Record beginning of function.
- PrologEndLoc = emitInitialLocDirective(
- *MF, Asm->OutStreamer->getContext().getDwarfCompileUnitID());
+void DwarfDebug::terminateLineTable(const DwarfCompileUnit *CU) {
+ const auto &CURanges = CU->getRanges();
+ auto &LineTable = Asm->OutStreamer->getContext().getMCDwarfLineTable(
+ getDwarfCompileUnitIDForLineTable(*CU));
+ // Add the last range label for the given CU.
+ LineTable.getMCLineSections().addEndEntry(
+ const_cast<MCSymbol *>(CURanges.back().End));
}
void DwarfDebug::skippedNonDebugFunction() {
// If we don't have a subprogram for this function then there will be a hole
// in the range information. Keep note of this by setting the previously used
// section to nullptr.
+ // Terminate the pending line table.
+ if (PrevCU)
+ terminateLineTable(PrevCU);
PrevCU = nullptr;
CurFn = nullptr;
}
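
The findPrologueEndLoc change above prefers the first post-frame-setup location with a non-zero line and only falls back to a line-0 location when nothing better exists. A simplified standalone model of that scan (the Inst struct and its fields are illustrative, not the MachineInstr API):

    #include <optional>
    #include <vector>

    struct Inst {
      bool IsMeta = false;          // DBG_VALUE and similar meta instructions
      bool IsFrameSetup = false;
      std::optional<unsigned> Line; // empty == no debug location attached
    };

    // Pick the line to attach prologue_end to: the first located, non-meta,
    // non-frame-setup instruction with a non-zero line, else a line-0 fallback.
    std::optional<unsigned> findPrologueEndLine(const std::vector<Inst> &Body) {
      std::optional<unsigned> LineZero;
      for (const Inst &I : Body) {
        if (I.IsMeta || I.IsFrameSetup || !I.Line)
          continue;
        if (*I.Line != 0)
          return I.Line;     // a meaningful breakpoint location
        LineZero = I.Line;   // remember a line-0 location as a fallback
      }
      return LineZero;
    }
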
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 6356a65b50d3..4e1a1b1e068d 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -65,19 +65,21 @@ class Module;
/// such that it could leverage polymorphism to extract common code for
/// DbgVariable and DbgLabel.
class DbgEntity {
- const DINode *Entity;
- const DILocation *InlinedAt;
- DIE *TheDIE = nullptr;
- unsigned SubclassID;
-
public:
enum DbgEntityKind {
DbgVariableKind,
DbgLabelKind
};
- DbgEntity(const DINode *N, const DILocation *IA, unsigned ID)
- : Entity(N), InlinedAt(IA), SubclassID(ID) {}
+private:
+ const DINode *Entity;
+ const DILocation *InlinedAt;
+ DIE *TheDIE = nullptr;
+ const DbgEntityKind SubclassID;
+
+public:
+ DbgEntity(const DINode *N, const DILocation *IA, DbgEntityKind ID)
+ : Entity(N), InlinedAt(IA), SubclassID(ID) {}
virtual ~DbgEntity() {}
/// Accessors.
@@ -85,19 +87,18 @@ public:
const DINode *getEntity() const { return Entity; }
const DILocation *getInlinedAt() const { return InlinedAt; }
DIE *getDIE() const { return TheDIE; }
- unsigned getDbgEntityID() const { return SubclassID; }
+ DbgEntityKind getDbgEntityID() const { return SubclassID; }
/// @}
void setDIE(DIE &D) { TheDIE = &D; }
static bool classof(const DbgEntity *N) {
switch (N->getDbgEntityID()) {
- default:
- return false;
case DbgVariableKind:
case DbgLabelKind:
return true;
}
+ llvm_unreachable("Invalid DbgEntityKind");
}
};
@@ -471,9 +472,6 @@ private:
/// Construct a DIE for this abstract scope.
void constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, LexicalScope *Scope);
- /// Construct a DIE for the subprogram definition \p SP and return it.
- DIE &constructSubprogramDefinitionDIE(const DISubprogram *SP);
-
/// Construct DIEs for call site entries describing the calls in \p MF.
void constructCallSiteEntryDIEs(const DISubprogram &SP, DwarfCompileUnit &CU,
DIE &ScopeDIE, const MachineFunction &MF);
@@ -615,7 +613,7 @@ private:
DenseSet<InlinedEntity> &ProcessedVars);
/// Build the location list for all DBG_VALUEs in the
- /// function that describe the same variable. If the resulting
+ /// function that describe the same variable. If the resulting
/// list has only one entry that is valid for entire variable's
/// scope return true.
bool buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
@@ -635,6 +633,9 @@ protected:
/// Gather and emit post-function debug information.
void endFunctionImpl(const MachineFunction *MF) override;
+ /// Get Dwarf compile unit ID for line table.
+ unsigned getDwarfCompileUnitIDForLineTable(const DwarfCompileUnit &CU);
+
void skippedNonDebugFunction() override;
public:
@@ -781,6 +782,9 @@ public:
const DwarfCompileUnit *getPrevCU() const { return PrevCU; }
void setPrevCU(const DwarfCompileUnit *PrevCU) { this->PrevCU = PrevCU; }
+ /// Terminate the line table by adding the last range label.
+ void terminateLineTable(const DwarfCompileUnit *CU);
+
/// Returns the entries for the .debug_loc section.
const DebugLocStream &getDebugLocs() const { return DebugLocs; }
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 344d30fad347..976e35905144 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -186,9 +186,8 @@ int64_t DwarfUnit::getDefaultLowerBound() const {
/// Check whether the DIE for this MDNode can be shared across CUs.
bool DwarfUnit::isShareableAcrossCUs(const DINode *D) const {
- // When the MDNode can be part of the type system (this includes subprogram
- // declarations *and* subprogram definitions, even local definitions), the
- // DIE must be shared across CUs.
+ // When the MDNode can be part of the type system, the DIE can be shared
+ // across CUs.
// Combining type units and cross-CU DIE sharing is lower value (since
// cross-CU DIE sharing is used in LTO and removes type redundancy at that
// level already) but may be implementable for some value in projects
@@ -196,7 +195,9 @@ bool DwarfUnit::isShareableAcrossCUs(const DINode *D) const {
// together.
if (isDwoUnit() && !DD->shareAcrossDWOCUs())
return false;
- return (isa<DIType>(D) || isa<DISubprogram>(D)) && !DD->generateTypeUnits();
+ return (isa<DIType>(D) ||
+ (isa<DISubprogram>(D) && !cast<DISubprogram>(D)->isDefinition())) &&
+ !DD->generateTypeUnits();
}
DIE *DwarfUnit::getDIE(const DINode *D) const {
@@ -671,7 +672,7 @@ std::string DwarfUnit::getParentContextString(const DIScope *Context) const {
// Reverse iterate over our list to go from the outermost construct to the
// innermost.
- for (const DIScope *Ctx : make_range(Parents.rbegin(), Parents.rend())) {
+ for (const DIScope *Ctx : llvm::reverse(Parents)) {
StringRef Name = Ctx->getName();
if (Name.empty() && isa<DINamespace>(Ctx))
Name = "(anonymous namespace)";
@@ -753,6 +754,8 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) {
if (!Name.empty())
addString(Buffer, dwarf::DW_AT_name, Name);
+ addAnnotation(Buffer, DTy->getAnnotations());
+
// If alignment is specified for a typedef , create and insert DW_AT_alignment
// attribute in DW_TAG_typedef DIE.
if (Tag == dwarf::DW_TAG_typedef && DD->getDwarfVersion() >= 5) {
@@ -832,6 +835,23 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) {
addFlag(Buffer, dwarf::DW_AT_rvalue_reference);
}
+void DwarfUnit::addAnnotation(DIE &Buffer, DINodeArray Annotations) {
+ if (!Annotations)
+ return;
+
+ for (const Metadata *Annotation : Annotations->operands()) {
+ const MDNode *MD = cast<MDNode>(Annotation);
+ const MDString *Name = cast<MDString>(MD->getOperand(0));
+
+ // Currently, only MDString is supported with btf_decl_tag attribute.
+ const MDString *Value = cast<MDString>(MD->getOperand(1));
+
+ DIE &AnnotationDie = createAndAddDIE(dwarf::DW_TAG_LLVM_annotation, Buffer);
+ addString(AnnotationDie, dwarf::DW_AT_name, Name->getString());
+ addString(AnnotationDie, dwarf::DW_AT_const_value, Value->getString());
+ }
+}
+
void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
// Add name if not anonymous or intermediate type.
StringRef Name = CTy->getName();
@@ -849,7 +869,8 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
case dwarf::DW_TAG_variant_part:
case dwarf::DW_TAG_structure_type:
case dwarf::DW_TAG_union_type:
- case dwarf::DW_TAG_class_type: {
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_namelist: {
// Emit the discriminator for a variant part.
DIDerivedType *Discriminator = nullptr;
if (Tag == dwarf::DW_TAG_variant_part) {
@@ -918,6 +939,13 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
DIE &VariantPart = createAndAddDIE(Composite->getTag(), Buffer);
constructTypeDIE(VariantPart, Composite);
}
+ } else if (Tag == dwarf::DW_TAG_namelist) {
+ auto *Var = dyn_cast<DINode>(Element);
+ auto *VarDIE = getDIE(Var);
+ if (VarDIE) {
+ DIE &ItemDie = createAndAddDIE(dwarf::DW_TAG_namelist_item, Buffer);
+ addDIEEntry(ItemDie, dwarf::DW_AT_namelist_item, *VarDIE);
+ }
}
}
@@ -960,6 +988,8 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
if (!Name.empty())
addString(Buffer, dwarf::DW_AT_name, Name);
+ addAnnotation(Buffer, CTy->getAnnotations());
+
if (Tag == dwarf::DW_TAG_enumeration_type ||
Tag == dwarf::DW_TAG_class_type || Tag == dwarf::DW_TAG_structure_type ||
Tag == dwarf::DW_TAG_union_type) {
@@ -1196,6 +1226,8 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
if (!SP->getName().empty())
addString(SPDie, dwarf::DW_AT_name, SP->getName());
+ addAnnotation(SPDie, SP->getAnnotations());
+
if (!SkipSPSourceLocation)
addSourceLine(SPDie, SP);
@@ -1546,6 +1578,8 @@ DIE &DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) {
if (!Name.empty())
addString(MemberDie, dwarf::DW_AT_name, Name);
+ addAnnotation(MemberDie, DT->getAnnotations());
+
if (DIType *Resolved = DT->getBaseType())
addType(MemberDie, Resolved);
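
addAnnotation above emits one DW_TAG_LLVM_annotation child per (name, value) metadata pair, typically produced by the btf_decl_tag attribute, carrying DW_AT_name and DW_AT_const_value. The same shape in a standalone sketch (plain structs and strings stand in for DIE and the MDString operands):

    #include <string>
    #include <utility>
    #include <vector>

    struct AnnotationDie {
      std::string Name;   // becomes DW_AT_name, e.g. "btf_decl_tag"
      std::string Value;  // becomes DW_AT_const_value
    };

    struct Die {
      std::vector<AnnotationDie> Annotations; // DW_TAG_LLVM_annotation children
    };

    // Mirrors DwarfUnit::addAnnotation: one child DIE per (name, value) pair.
    void addAnnotations(Die &Buffer,
                        const std::vector<std::pair<std::string, std::string>> &Anns) {
      for (const auto &A : Anns)
        Buffer.Annotations.push_back({A.first, A.second});
    }
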
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 4d31dd0daf59..8140279adaef 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -294,6 +294,9 @@ public:
void addSectionLabel(DIE &Die, dwarf::Attribute Attribute,
const MCSymbol *Label, const MCSymbol *Sec);
+ /// Add DW_TAG_LLVM_annotation.
+ void addAnnotation(DIE &Buffer, DINodeArray Annotations);
+
/// Get context owner's DIE.
DIE *createTypeDIE(const DICompositeType *Ty);
diff --git a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp
index e589c2e64abd..150f19324834 100644
--- a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp
@@ -812,8 +812,7 @@ void EHStreamer::emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel) {
Entry = TypeInfos.size();
}
- for (const GlobalValue *GV : make_range(TypeInfos.rbegin(),
- TypeInfos.rend())) {
+ for (const GlobalValue *GV : llvm::reverse(TypeInfos)) {
if (VerboseAsm)
Asm->OutStreamer->AddComment("TypeInfo " + Twine(Entry--));
Asm->emitTTypeReference(GV, TTypeEncoding);
diff --git a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp
index 35a830f416f6..9e6f1a537de3 100644
--- a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp
@@ -20,6 +20,8 @@
using namespace llvm;
+PseudoProbeHandler::~PseudoProbeHandler() = default;
+
void PseudoProbeHandler::emitPseudoProbe(uint64_t Guid, uint64_t Index,
uint64_t Type, uint64_t Attr,
const DILocation *DebugLoc) {
@@ -35,7 +37,10 @@ void PseudoProbeHandler::emitPseudoProbe(uint64_t Guid, uint64_t Index,
auto Name = SP->getLinkageName();
if (Name.empty())
Name = SP->getName();
- uint64_t CallerGuid = Function::getGUID(Name);
+ // Use caching to avoid redundant md5 computation for build speed.
+ uint64_t &CallerGuid = NameGuidMap[Name];
+ if (!CallerGuid)
+ CallerGuid = Function::getGUID(Name);
uint64_t CallerProbeId = PseudoProbeDwarfDiscriminator::extractProbeIndex(
InlinedAt->getDiscriminator());
ReversedInlineStack.emplace_back(CallerGuid, CallerProbeId);
diff --git a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h
index f2026a118bf5..7d5e51218693 100644
--- a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h
+++ b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h
@@ -26,9 +26,12 @@ class DILocation;
class PseudoProbeHandler : public AsmPrinterHandler {
// Target of pseudo probe emission.
AsmPrinter *Asm;
+ // Name to GUID map, used as caching/memoization for speed.
+ DenseMap<StringRef, uint64_t> NameGuidMap;
public:
PseudoProbeHandler(AsmPrinter *A) : Asm(A){};
+ ~PseudoProbeHandler() override;
void emitPseudoProbe(uint64_t Guid, uint64_t Index, uint64_t Type,
uint64_t Attr, const DILocation *DebugLoc);
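
The NameGuidMap cache above leans on DenseMap::operator[] value-initialising new slots to 0 and on a real GUID never being 0. The same memoization pattern in plain C++ (std::hash stands in for the MD5-based Function::getGUID; that substitution is an assumption of this sketch only):

    #include <cstdint>
    #include <functional>
    #include <string>
    #include <unordered_map>

    class GuidCache {
      std::unordered_map<std::string, uint64_t> Map;

      static uint64_t computeGuid(const std::string &Name) {
        // Stand-in for the MD5-based hash; only the caching pattern matters.
        return std::hash<std::string>{}(Name) | 1; // keep 0 reserved for "unset"
      }

    public:
      uint64_t get(const std::string &Name) {
        uint64_t &Slot = Map[Name]; // value-initialised to 0 on first access
        if (!Slot)
          Slot = computeGuid(Name); // computed once, reused on later calls
        return Slot;
      }
    };
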
diff --git a/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp b/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp
index 352a33e8639d..a17a2ca2790e 100644
--- a/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp
@@ -18,16 +18,25 @@
using namespace llvm;
void WasmException::endModule() {
- // This is the symbol used in 'throw' and 'catch' instruction to denote this
- // is a C++ exception. This symbol has to be emitted somewhere once in the
- // module. Check if the symbol has already been created, i.e., we have at
- // least one 'throw' or 'catch' instruction in the module, and emit the symbol
- // only if so.
- SmallString<60> NameStr;
- Mangler::getNameWithPrefix(NameStr, "__cpp_exception", Asm->getDataLayout());
- if (Asm->OutContext.lookupSymbol(NameStr)) {
- MCSymbol *ExceptionSym = Asm->GetExternalSymbolSymbol("__cpp_exception");
- Asm->OutStreamer->emitLabel(ExceptionSym);
+ // These are symbols used to throw/catch C++ exceptions and C longjmps. These
+ // symbols have to be emitted somewhere once in the module. Check if each of
+ // the symbols has already been created, i.e., we have at least one 'throw' or
+ // 'catch' instruction with the symbol in the module, and emit the symbol only
+ // if so.
+ //
+ // But in dynamic linking, it is in general not possible to come up with a
+ // module instantiating order in which tag-defining modules are loaded before
+ // the importing modules. So we make them undefined symbols here, define tags
+ // in the JS side, and feed them to each importing module.
+ if (!Asm->isPositionIndependent()) {
+ for (const char *SymName : {"__cpp_exception", "__c_longjmp"}) {
+ SmallString<60> NameStr;
+ Mangler::getNameWithPrefix(NameStr, SymName, Asm->getDataLayout());
+ if (Asm->OutContext.lookupSymbol(NameStr)) {
+ MCSymbol *ExceptionSym = Asm->GetExternalSymbolSymbol(SymName);
+ Asm->OutStreamer->emitLabel(ExceptionSym);
+ }
+ }
}
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
index b30d9cc12abc..ef57031c7294 100644
--- a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -43,6 +43,7 @@ WinException::WinException(AsmPrinter *A) : EHStreamer(A) {
// platforms use an imagerel32 relocation to refer to symbols.
useImageRel32 = (A->getDataLayout().getPointerSizeInBits() == 64);
isAArch64 = Asm->TM.getTargetTriple().isAArch64();
+ isThumb = Asm->TM.getTargetTriple().isThumb();
}
WinException::~WinException() {}
@@ -330,10 +331,12 @@ const MCExpr *WinException::create32bitRef(const GlobalValue *GV) {
}
const MCExpr *WinException::getLabel(const MCSymbol *Label) {
- if (isAArch64)
- return MCSymbolRefExpr::create(Label, MCSymbolRefExpr::VK_COFF_IMGREL32,
- Asm->OutContext);
- return MCBinaryExpr::createAdd(create32bitRef(Label),
+ return MCSymbolRefExpr::create(Label, MCSymbolRefExpr::VK_COFF_IMGREL32,
+ Asm->OutContext);
+}
+
+const MCExpr *WinException::getLabelPlusOne(const MCSymbol *Label) {
+ return MCBinaryExpr::createAdd(getLabel(Label),
MCConstantExpr::create(1, Asm->OutContext),
Asm->OutContext);
}
@@ -561,8 +564,8 @@ InvokeStateChangeIterator &InvokeStateChangeIterator::scan() {
/// struct Table {
/// int NumEntries;
/// struct Entry {
-/// imagerel32 LabelStart;
-/// imagerel32 LabelEnd;
+/// imagerel32 LabelStart; // Inclusive
+/// imagerel32 LabelEnd; // Exclusive
/// imagerel32 FilterOrFinally; // One means catch-all.
/// imagerel32 LabelLPad; // Zero means __finally.
/// } Entries[NumEntries];
@@ -664,7 +667,7 @@ void WinException::emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo,
AddComment("LabelStart");
OS.emitValue(getLabel(BeginLabel), 4);
AddComment("LabelEnd");
- OS.emitValue(getLabel(EndLabel), 4);
+ OS.emitValue(getLabelPlusOne(EndLabel), 4);
AddComment(UME.IsFinally ? "FinallyFunclet" : UME.Filter ? "FilterFunction"
: "CatchAll");
OS.emitValue(FilterOrFinally, 4);
@@ -949,8 +952,15 @@ void WinException::computeIP2StateTable(
if (!ChangeLabel)
ChangeLabel = StateChange.PreviousEndLabel;
// Emit an entry indicating that PCs after 'Label' have this EH state.
+ // NOTE: On ARM architectures, the StateFromIp automatically takes into
+ // account that the return address is after the call instruction (whose EH
+ // state we should be using), but on other platforms we need to +1 to the
+ // label so that we are using the correct EH state.
+ const MCExpr *LabelExpression = (isAArch64 || isThumb)
+ ? getLabel(ChangeLabel)
+ : getLabelPlusOne(ChangeLabel);
IPToStateTable.push_back(
- std::make_pair(getLabel(ChangeLabel), StateChange.NewState));
+ std::make_pair(LabelExpression, StateChange.NewState));
// FIXME: assert that NewState is between CatchLow and CatchHigh.
}
}
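
The IP-to-state change above biases each change label by one byte on non-ARM targets: the unwinder queries the table with a return address, which points just past the call, so without the bias a state change placed immediately after a call would shadow the call's own state. A simplified model of the consumer-side lookup (the table layout and the -1 "no handler" state are assumptions of the sketch):

    #include <cstdint>
    #include <utility>
    #include <vector>

    // Each entry means: "instruction pointers at or above First have state Second".
    using IPToStateTable = std::vector<std::pair<uint64_t, int>>;

    // Walk the (address-sorted) table and keep the state of the last entry whose
    // start address does not exceed the queried IP.
    int stateForIP(const IPToStateTable &Table, uint64_t IP) {
      int State = -1; // assumed "no active handler" state
      for (const auto &Entry : Table) {
        if (Entry.first > IP)
          break;
        State = Entry.second;
      }
      return State;
    }

    // With the +1 bias applied by the producer (getLabelPlusOne), a return
    // address equal to the raw change label still resolves to the call's state;
    // on ARM/Thumb the runtime adjusts the IP instead, so the raw label is used.
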
diff --git a/llvm/lib/CodeGen/AsmPrinter/WinException.h b/llvm/lib/CodeGen/AsmPrinter/WinException.h
index feea05ba63ad..638589adf0dd 100644
--- a/llvm/lib/CodeGen/AsmPrinter/WinException.h
+++ b/llvm/lib/CodeGen/AsmPrinter/WinException.h
@@ -39,6 +39,9 @@ class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer {
/// True if we are generating exception handling on Windows for ARM64.
bool isAArch64 = false;
+ /// True if we are generating exception handling on Windows for ARM (Thumb).
+ bool isThumb = false;
+
/// Pointer to the current funclet entry BB.
const MachineBasicBlock *CurrentFuncletEntry = nullptr;
@@ -77,6 +80,7 @@ class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer {
const MCExpr *create32bitRef(const MCSymbol *Value);
const MCExpr *create32bitRef(const GlobalValue *GV);
const MCExpr *getLabel(const MCSymbol *Label);
+ const MCExpr *getLabelPlusOne(const MCSymbol *Label);
const MCExpr *getOffset(const MCSymbol *OffsetOf, const MCSymbol *OffsetFrom);
const MCExpr *getOffsetPlusOne(const MCSymbol *OffsetOf,
const MCSymbol *OffsetFrom);
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 125a3be585cb..4838f6da750d 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/CodeGen/AtomicExpandUtils.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -179,11 +180,9 @@ bool AtomicExpand::runOnFunction(Function &F) {
// Changing control-flow while iterating through it is a bad idea, so gather a
// list of all atomic instructions before we start.
- for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
- Instruction *I = &*II;
- if (I->isAtomic() && !isa<FenceInst>(I))
- AtomicInsts.push_back(I);
- }
+ for (Instruction &I : instructions(F))
+ if (I.isAtomic() && !isa<FenceInst>(&I))
+ AtomicInsts.push_back(&I);
bool MadeChange = false;
for (auto I : AtomicInsts) {
@@ -570,7 +569,9 @@ static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
}
bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
- switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
+ LLVMContext &Ctx = AI->getModule()->getContext();
+ TargetLowering::AtomicExpansionKind Kind = TLI->shouldExpandAtomicRMWInIR(AI);
+ switch (Kind) {
case TargetLoweringBase::AtomicExpansionKind::None:
return false;
case TargetLoweringBase::AtomicExpansionKind::LLSC: {
@@ -600,6 +601,18 @@ bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
expandPartwordAtomicRMW(AI,
TargetLoweringBase::AtomicExpansionKind::CmpXChg);
} else {
+ SmallVector<StringRef> SSNs;
+ Ctx.getSyncScopeNames(SSNs);
+ auto MemScope = SSNs[AI->getSyncScopeID()].empty()
+ ? "system"
+ : SSNs[AI->getSyncScopeID()];
+ OptimizationRemarkEmitter ORE(AI->getFunction());
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Passed", AI)
+ << "A compare and swap loop was generated for an atomic "
+ << AI->getOperationName(AI->getOperation()) << " operation at "
+ << MemScope << " memory scope";
+ });
expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
}
return true;
@@ -1850,7 +1863,7 @@ bool AtomicExpand::expandAtomicOpToLibcall(
// Now, the return type.
if (CASExpected) {
ResultTy = Type::getInt1Ty(Ctx);
- Attr = Attr.addAttribute(Ctx, AttributeList::ReturnIndex, Attribute::ZExt);
+ Attr = Attr.addRetAttribute(Ctx, Attribute::ZExt);
} else if (HasResult && UseSizedLibcall)
ResultTy = SizedIntTy;
else
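
The new remark above fires when tryExpandAtomicRMW lowers an atomicrmw to a compare-and-swap loop. The control flow of that expansion, sketched with std::atomic rather than the IR the pass actually emits:

    #include <atomic>
    #include <cstdint>

    // Expand an atomic read-modify-write (here: add) into a CAS loop, the same
    // structure expandAtomicRMWToCmpXchg builds in IR.
    uint32_t atomicAddViaCas(std::atomic<uint32_t> &Addr, uint32_t Val) {
      uint32_t Loaded = Addr.load(std::memory_order_relaxed);
      uint32_t Desired;
      do {
        Desired = Loaded + Val; // the RMW operation being expanded
        // On failure, compare_exchange_weak refreshes Loaded with the current value.
      } while (!Addr.compare_exchange_weak(Loaded, Desired,
                                           std::memory_order_seq_cst,
                                           std::memory_order_relaxed));
      return Loaded; // atomicrmw yields the old value
    }
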
diff --git a/llvm/lib/CodeGen/BasicBlockSections.cpp b/llvm/lib/CodeGen/BasicBlockSections.cpp
index 1a6eed272ca2..c1901bc46d72 100644
--- a/llvm/lib/CodeGen/BasicBlockSections.cpp
+++ b/llvm/lib/CodeGen/BasicBlockSections.cpp
@@ -21,9 +21,21 @@
// clusters of basic blocks. Every cluster will be emitted into a separate
// section with its basic blocks sequenced in the given order. To get the
// optimized performance, the clusters must form an optimal BB layout for the
-// function. Every cluster's section is labeled with a symbol to allow the
-// linker to reorder the sections in any arbitrary sequence. A global order of
-// these sections would encapsulate the function layout.
+// function. We insert a symbol at the beginning of every cluster's section to
+// allow the linker to reorder the sections in any arbitrary sequence. A global
+// order of these sections would encapsulate the function layout.
+// For example, consider the following clusters for a function foo (consisting
+// of 6 basic blocks 0, 1, ..., 5).
+//
+// 0 2
+// 1 3 5
+//
+// * Basic blocks 0 and 2 are placed in one section with symbol `foo`
+// referencing the beginning of this section.
+// * Basic blocks 1, 3, 5 are placed in a separate section. A new symbol
+// `foo.__part.1` will reference the beginning of this section.
+// * Basic block 4 (note that it is not referenced in the list) is placed in
+// one section, and a new symbol `foo.cold` will point to it.
//
// There are a couple of challenges to be addressed:
//
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index 65e7e92fe152..5ac8f49a9522 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -611,7 +611,7 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,
// there are fallthroughs, and we don't know until after layout.
if (AfterPlacement && FullBlockTail1 && FullBlockTail2) {
auto BothFallThrough = [](MachineBasicBlock *MBB) {
- if (MBB->succ_size() != 0 && !MBB->canFallThrough())
+ if (!MBB->succ_empty() && !MBB->canFallThrough())
return false;
MachineFunction::iterator I(MBB);
MachineFunction *MF = MBB->getParent();
@@ -1198,14 +1198,13 @@ bool BranchFolder::OptimizeBranches(MachineFunction &MF) {
// Renumbering blocks alters EH scope membership, recalculate it.
EHScopeMembership = getEHScopeMembership(MF);
- for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
- I != E; ) {
- MachineBasicBlock *MBB = &*I++;
- MadeChange |= OptimizeBlock(MBB);
+ for (MachineBasicBlock &MBB :
+ llvm::make_early_inc_range(llvm::drop_begin(MF))) {
+ MadeChange |= OptimizeBlock(&MBB);
// If it is dead, remove it.
- if (MBB->pred_empty()) {
- RemoveDeadBlock(MBB);
+ if (MBB.pred_empty()) {
+ RemoveDeadBlock(&MBB);
MadeChange = true;
++NumDeadBlocks;
}
@@ -1753,10 +1752,8 @@ ReoptimizeBlock:
bool BranchFolder::HoistCommonCode(MachineFunction &MF) {
bool MadeChange = false;
- for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ) {
- MachineBasicBlock *MBB = &*I++;
- MadeChange |= HoistCommonCodeInSuccs(MBB);
- }
+ for (MachineBasicBlock &MBB : llvm::make_early_inc_range(MF))
+ MadeChange |= HoistCommonCodeInSuccs(&MBB);
return MadeChange;
}
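
Both loops above switch to llvm::make_early_inc_range, which advances the iterator before the body runs so the current element may be erased. The hand-written equivalent it replaces, shown on a std::list for illustration:

    #include <list>

    // Drop "dead" elements (modelled here as non-positive values) while walking
    // the list; Cur is saved and the iterator advanced before any erase happens.
    void pruneDead(std::list<int> &Blocks) {
      for (auto It = Blocks.begin(), End = Blocks.end(); It != End;) {
        auto Cur = It++;      // early increment, as make_early_inc_range does
        if (*Cur <= 0)
          Blocks.erase(Cur);  // erasing Cur leaves It valid
      }
    }
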
diff --git a/llvm/lib/CodeGen/BranchRelaxation.cpp b/llvm/lib/CodeGen/BranchRelaxation.cpp
index 366c303614d6..50825ccf9bac 100644
--- a/llvm/lib/CodeGen/BranchRelaxation.cpp
+++ b/llvm/lib/CodeGen/BranchRelaxation.cpp
@@ -463,10 +463,48 @@ bool BranchRelaxation::fixupUnconditionalBranch(MachineInstr &MI) {
DebugLoc DL = MI.getDebugLoc();
MI.eraseFromParent();
- BlockInfo[BranchBB->getNumber()].Size += TII->insertIndirectBranch(
- *BranchBB, *DestBB, DL, DestOffset - SrcOffset, RS.get());
+ // Create the optional restore block and, initially, place it at the end of
+ // function. That block will be placed later if it's used; otherwise, it will
+ // be erased.
+ MachineBasicBlock *RestoreBB = createNewBlockAfter(MF->back());
+
+ TII->insertIndirectBranch(*BranchBB, *DestBB, *RestoreBB, DL,
+ DestOffset - SrcOffset, RS.get());
+
+ BlockInfo[BranchBB->getNumber()].Size = computeBlockSize(*BranchBB);
adjustBlockOffsets(*MBB);
+
+ // If RestoreBB is required, try to place just before DestBB.
+ if (!RestoreBB->empty()) {
+ // TODO: For multiple far branches to the same destination, there are
+ // chances that some restore blocks could be shared if they clobber the
+ // same registers and share the same restore sequence. So far, those
+ // restore blocks are just duplicated for each far branch.
+ assert(!DestBB->isEntryBlock());
+ MachineBasicBlock *PrevBB = &*std::prev(DestBB->getIterator());
+ if (auto *FT = PrevBB->getFallThrough()) {
+ assert(FT == DestBB);
+ TII->insertUnconditionalBranch(*PrevBB, FT, DebugLoc());
+ // Recalculate the block size.
+ BlockInfo[PrevBB->getNumber()].Size = computeBlockSize(*PrevBB);
+ }
+ // Now, RestoreBB could be placed directly before DestBB.
+ MF->splice(DestBB->getIterator(), RestoreBB->getIterator());
+ // Update successors and predecessors.
+ RestoreBB->addSuccessor(DestBB);
+ BranchBB->replaceSuccessor(DestBB, RestoreBB);
+ if (TRI->trackLivenessAfterRegAlloc(*MF))
+ computeAndAddLiveIns(LiveRegs, *RestoreBB);
+ // Compute the restore block size.
+ BlockInfo[RestoreBB->getNumber()].Size = computeBlockSize(*RestoreBB);
+ // Update the offset starting from the previous block.
+ adjustBlockOffsets(*PrevBB);
+ } else {
+ // Remove restore block if it's not required.
+ MF->erase(RestoreBB);
+ }
+
return true;
}
diff --git a/llvm/lib/CodeGen/BreakFalseDeps.cpp b/llvm/lib/CodeGen/BreakFalseDeps.cpp
index b11db3e65770..558700bd9b3b 100644
--- a/llvm/lib/CodeGen/BreakFalseDeps.cpp
+++ b/llvm/lib/CodeGen/BreakFalseDeps.cpp
@@ -244,7 +244,7 @@ void BreakFalseDeps::processUndefReads(MachineBasicBlock *MBB) {
MachineInstr *UndefMI = UndefReads.back().first;
unsigned OpIdx = UndefReads.back().second;
- for (MachineInstr &I : make_range(MBB->rbegin(), MBB->rend())) {
+ for (MachineInstr &I : llvm::reverse(*MBB)) {
// Update liveness, including the current instruction's defs.
LiveRegSet.stepBackward(I);
diff --git a/llvm/lib/CodeGen/CodeGenCommonISel.cpp b/llvm/lib/CodeGen/CodeGenCommonISel.cpp
new file mode 100644
index 000000000000..877aa69c3e58
--- /dev/null
+++ b/llvm/lib/CodeGen/CodeGenCommonISel.cpp
@@ -0,0 +1,169 @@
+//===-- CodeGenCommonISel.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines common utilities that are shared between SelectionDAG and
+// GlobalISel frameworks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/CodeGenCommonISel.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+
+using namespace llvm;
+
+/// Add a successor MBB to ParentMBB, creating a new MachineBB for BB if SuccMBB
+/// is 0.
+MachineBasicBlock *
+StackProtectorDescriptor::addSuccessorMBB(
+ const BasicBlock *BB, MachineBasicBlock *ParentMBB, bool IsLikely,
+ MachineBasicBlock *SuccMBB) {
+ // If SuccBB has not been created yet, create it.
+ if (!SuccMBB) {
+ MachineFunction *MF = ParentMBB->getParent();
+ MachineFunction::iterator BBI(ParentMBB);
+ SuccMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(++BBI, SuccMBB);
+ }
+ // Add it as a successor of ParentMBB.
+ ParentMBB->addSuccessor(
+ SuccMBB, BranchProbabilityInfo::getBranchProbStackProtector(IsLikely));
+ return SuccMBB;
+}
+
+/// Given that the input MI is before a partial terminator sequence TSeq, return
+/// true if MI + TSeq is also a partial terminator sequence.
+///
+/// A Terminator sequence is a sequence of MachineInstrs which at this point in
+/// lowering copy vregs into physical registers, which are then passed into
+/// terminator instructions so we can satisfy ABI constraints. A partial
+/// terminator sequence is an improper subset of a terminator sequence (i.e. it
+/// may be the whole terminator sequence).
+static bool MIIsInTerminatorSequence(const MachineInstr &MI) {
+ // If we do not have a copy or an implicit def, we return true if and only if
+ // MI is a debug value.
+ if (!MI.isCopy() && !MI.isImplicitDef()) {
+ // Sometimes DBG_VALUE MIs sneak in between the copies from the vregs to the
+ // physical registers if there is debug info associated with the terminator
+ // of our mbb. We want to include said debug info in our terminator
+ // sequence, so we return true in that case.
+ if (MI.isDebugInstr())
+ return true;
+
+ // For GlobalISel, we may have extension instructions for arguments within
+ // copy sequences. Allow these.
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_ANYEXT:
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_MERGE_VALUES:
+ case TargetOpcode::G_UNMERGE_VALUES:
+ case TargetOpcode::G_CONCAT_VECTORS:
+ case TargetOpcode::G_BUILD_VECTOR:
+ case TargetOpcode::G_EXTRACT:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ // We have left the terminator sequence if we are not doing one of the
+ // following:
+ //
+ // 1. Copying a vreg into a physical register.
+ // 2. Copying a vreg into a vreg.
+ // 3. Defining a register via an implicit def.
+
+ // OPI should always be a register definition...
+ MachineInstr::const_mop_iterator OPI = MI.operands_begin();
+ if (!OPI->isReg() || !OPI->isDef())
+ return false;
+
+ // Defining any register via an implicit def is always ok.
+ if (MI.isImplicitDef())
+ return true;
+
+ // Grab the copy source...
+ MachineInstr::const_mop_iterator OPI2 = OPI;
+ ++OPI2;
+ assert(OPI2 != MI.operands_end()
+ && "Should have a copy implying we should have 2 arguments.");
+
+ // Make sure that the copy dest is not a vreg when the copy source is a
+ // physical register.
+ if (!OPI2->isReg() || (!Register::isPhysicalRegister(OPI->getReg()) &&
+ Register::isPhysicalRegister(OPI2->getReg())))
+ return false;
+
+ return true;
+}
+
+/// Find the split point at which to splice the end of BB into its successor stack
+/// protector check machine basic block.
+///
+/// On many platforms, due to ABI constraints, terminators, even before register
+/// allocation, use physical registers. This creates an issue for us since
+/// physical registers at this point can not travel across basic
+/// blocks. Luckily, selectiondag always moves physical registers into vregs
+/// when they enter functions and moves them through a sequence of copies back
+/// into the physical registers right before the terminator creating a
+/// ``Terminator Sequence''. This function is searching for the beginning of the
+/// terminator sequence so that we can ensure that we splice off not just the
+/// terminator, but additionally the copies that move the vregs into the
+/// physical registers.
+MachineBasicBlock::iterator
+llvm::findSplitPointForStackProtector(MachineBasicBlock *BB,
+ const TargetInstrInfo &TII) {
+ MachineBasicBlock::iterator SplitPoint = BB->getFirstTerminator();
+ if (SplitPoint == BB->begin())
+ return SplitPoint;
+
+ MachineBasicBlock::iterator Start = BB->begin();
+ MachineBasicBlock::iterator Previous = SplitPoint;
+ --Previous;
+
+ if (TII.isTailCall(*SplitPoint) &&
+ Previous->getOpcode() == TII.getCallFrameDestroyOpcode()) {
+ // Call frames cannot be nested, so if this frame is describing the tail
+ // call itself, then we must insert before the sequence even starts. For
+ // example:
+ // <split point>
+ // ADJCALLSTACKDOWN ...
+ // <Moves>
+ // ADJCALLSTACKUP ...
+ // TAILJMP somewhere
+ // On the other hand, it could be an unrelated call in which case this tail
+ // call has to register moves of its own and should be the split point. For
+ // example:
+ // ADJCALLSTACKDOWN
+ // CALL something_else
+ // ADJCALLSTACKUP
+ // <split point>
+ // TAILJMP somewhere
+ do {
+ --Previous;
+ if (Previous->isCall())
+ return SplitPoint;
+ } while(Previous->getOpcode() != TII.getCallFrameSetupOpcode());
+
+ return Previous;
+ }
+
+ while (MIIsInTerminatorSequence(*Previous)) {
+ SplitPoint = Previous;
+ if (Previous == Start)
+ break;
+ --Previous;
+ }
+
+ return SplitPoint;
+}
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 77ce3d2fb563..ac4180c4c3ab 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -530,10 +530,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
while (MadeChange) {
MadeChange = false;
DT.reset();
- for (Function::iterator I = F.begin(); I != F.end(); ) {
- BasicBlock *BB = &*I++;
+ for (BasicBlock &BB : llvm::make_early_inc_range(F)) {
bool ModifiedDTOnIteration = false;
- MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);
+ MadeChange |= optimizeBlock(BB, ModifiedDTOnIteration);
// Restart BB iteration if the dominator tree of the Function was changed
if (ModifiedDTOnIteration)
@@ -660,12 +659,8 @@ void CodeGenPrepare::removeAllAssertingVHReferences(Value *V) {
return;
auto &GEPVector = VecI->second;
- const auto &I =
- llvm::find_if(GEPVector, [=](auto &Elt) { return Elt.first == GEP; });
- if (I == GEPVector.end())
- return;
+ llvm::erase_if(GEPVector, [=](auto &Elt) { return Elt.first == GEP; });
- GEPVector.erase(I);
if (GEPVector.empty())
LargeOffsetGEPMap.erase(VecI);
}
@@ -2037,7 +2032,7 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
// Only handle legal scalar cases. Anything else requires too much work.
Type *Ty = CountZeros->getType();
- unsigned SizeInBits = Ty->getPrimitiveSizeInBits();
+ unsigned SizeInBits = Ty->getScalarSizeInBits();
if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())
return false;
@@ -2108,7 +2103,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
// idea
unsigned MinSize, PrefAlign;
if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
- for (auto &Arg : CI->arg_operands()) {
+ for (auto &Arg : CI->args()) {
// We want to align both objects whose address is used directly and
// objects whose address is used in casts and GEPs, though it only makes
// sense for GEPs if the offset is a multiple of the desired alignment and
@@ -2159,7 +2154,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
// into their uses. TODO: generalize this to work over profiling data
if (CI->hasFnAttr(Attribute::Cold) &&
!OptSize && !llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
- for (auto &Arg : CI->arg_operands()) {
+ for (auto &Arg : CI->args()) {
if (!Arg->getType()->isPointerTy())
continue;
unsigned AS = Arg->getType()->getPointerAddressSpace();
@@ -3718,7 +3713,8 @@ private:
// Traverse all Phis until we found equivalent or fail to do that.
bool IsMatched = false;
for (auto &P : PHI->getParent()->phis()) {
- if (&P == PHI)
+ // Skip new Phi nodes.
+ if (PhiNodesToMatch.count(&P))
continue;
if ((IsMatched = MatchPhiNode(PHI, &P, Matched, PhiNodesToMatch)))
break;
@@ -4187,7 +4183,7 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
if (Inst->getOpcode() == Instruction::Xor) {
const ConstantInt *Cst = dyn_cast<ConstantInt>(Inst->getOperand(1));
// Make sure it is not a NOT.
- if (Cst && !Cst->getValue().isAllOnesValue())
+ if (Cst && !Cst->getValue().isAllOnes())
return true;
}
@@ -4858,10 +4854,9 @@ static constexpr int MaxMemoryUsesToScan = 20;
/// Recursively walk all the uses of I until we find a memory use.
/// If we find an obviously non-foldable instruction, return true.
-/// Add the ultimately found memory instructions to MemoryUses.
+/// Add accessed addresses and types to MemoryUses.
static bool FindAllMemoryUses(
- Instruction *I,
- SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
+ Instruction *I, SmallVectorImpl<std::pair<Value *, Type *>> &MemoryUses,
SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI,
const TargetRegisterInfo &TRI, bool OptSize, ProfileSummaryInfo *PSI,
BlockFrequencyInfo *BFI, int SeenInsts = 0) {
@@ -4882,31 +4877,28 @@ static bool FindAllMemoryUses(
Instruction *UserI = cast<Instruction>(U.getUser());
if (LoadInst *LI = dyn_cast<LoadInst>(UserI)) {
- MemoryUses.push_back(std::make_pair(LI, U.getOperandNo()));
+ MemoryUses.push_back({U.get(), LI->getType()});
continue;
}
if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) {
- unsigned opNo = U.getOperandNo();
- if (opNo != StoreInst::getPointerOperandIndex())
+ if (U.getOperandNo() != StoreInst::getPointerOperandIndex())
return true; // Storing addr, not into addr.
- MemoryUses.push_back(std::make_pair(SI, opNo));
+ MemoryUses.push_back({U.get(), SI->getValueOperand()->getType()});
continue;
}
if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) {
- unsigned opNo = U.getOperandNo();
- if (opNo != AtomicRMWInst::getPointerOperandIndex())
+ if (U.getOperandNo() != AtomicRMWInst::getPointerOperandIndex())
return true; // Storing addr, not into addr.
- MemoryUses.push_back(std::make_pair(RMW, opNo));
+ MemoryUses.push_back({U.get(), RMW->getValOperand()->getType()});
continue;
}
if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) {
- unsigned opNo = U.getOperandNo();
- if (opNo != AtomicCmpXchgInst::getPointerOperandIndex())
+ if (U.getOperandNo() != AtomicCmpXchgInst::getPointerOperandIndex())
return true; // Storing addr, not into addr.
- MemoryUses.push_back(std::make_pair(CmpX, opNo));
+ MemoryUses.push_back({U.get(), CmpX->getCompareOperand()->getType()});
continue;
}
@@ -5016,7 +5008,7 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
// we can remove the addressing mode and effectively trade one live register
// for another (at worst.) In this context, folding an addressing mode into
// the use is just a particularly nice way of sinking it.
- SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
+ SmallVector<std::pair<Value *, Type *>, 16> MemoryUses;
SmallPtrSet<Instruction*, 16> ConsideredInsts;
if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI, OptSize,
PSI, BFI))
@@ -5032,18 +5024,10 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
// growth since most architectures have some reasonable small and fast way to
// compute an effective address. (i.e LEA on x86)
SmallVector<Instruction*, 32> MatchedAddrModeInsts;
- for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
- Instruction *User = MemoryUses[i].first;
- unsigned OpNo = MemoryUses[i].second;
-
- // Get the access type of this use. If the use isn't a pointer, we don't
- // know what it accesses.
- Value *Address = User->getOperand(OpNo);
- PointerType *AddrTy = dyn_cast<PointerType>(Address->getType());
- if (!AddrTy)
- return false;
- Type *AddressAccessTy = AddrTy->getElementType();
- unsigned AS = AddrTy->getAddressSpace();
+ for (const std::pair<Value *, Type *> &Pair : MemoryUses) {
+ Value *Address = Pair.first;
+ Type *AddressAccessTy = Pair.second;
+ unsigned AS = Address->getType()->getPointerAddressSpace();
// Do a match against the root of this address, ignoring profitability. This
// will tell us if the addressing mode for the memory operation will
@@ -5124,8 +5108,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
TypePromotionTransaction::ConstRestorationPt LastKnownGood =
TPT.getRestorationPoint();
while (!worklist.empty()) {
- Value *V = worklist.back();
- worklist.pop_back();
+ Value *V = worklist.pop_back_val();
// We allow traversing cyclic Phi nodes.
// In case of success after this loop we ensure that traversing through
@@ -6477,8 +6460,7 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
APInt WidestAndBits(BitWidth, 0);
while (!WorkList.empty()) {
- Instruction *I = WorkList.back();
- WorkList.pop_back();
+ Instruction *I = WorkList.pop_back_val();
// Break use-def graph loops.
if (!Visited.insert(I).second)
@@ -6950,16 +6932,26 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
BasicBlock *TargetBB = I->getParent();
bool Changed = false;
SmallVector<Use *, 4> ToReplace;
+ Instruction *InsertPoint = I;
+ DenseMap<const Instruction *, unsigned long> InstOrdering;
+ unsigned long InstNumber = 0;
+ for (const auto &I : *TargetBB)
+ InstOrdering[&I] = InstNumber++;
+
for (Use *U : reverse(OpsToSink)) {
auto *UI = cast<Instruction>(U->get());
- if (UI->getParent() == TargetBB || isa<PHINode>(UI))
+ if (isa<PHINode>(UI))
continue;
+ if (UI->getParent() == TargetBB) {
+ if (InstOrdering[UI] < InstOrdering[InsertPoint])
+ InsertPoint = UI;
+ continue;
+ }
ToReplace.push_back(U);
}
SetVector<Instruction *> MaybeDead;
DenseMap<Instruction *, Instruction *> NewInstructions;
- Instruction *InsertPoint = I;
for (Use *U : ToReplace) {
auto *UI = cast<Instruction>(U->get());
Instruction *NI = UI->clone();
@@ -7863,8 +7855,9 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
- if (BinOp && (BinOp->getOpcode() == Instruction::And) && EnableAndCmpSinking)
- return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts);
+ if (BinOp && BinOp->getOpcode() == Instruction::And && EnableAndCmpSinking &&
+ sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts))
+ return true;
// TODO: Move this into the switch on opcode - it handles shifts already.
if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
@@ -8030,9 +8023,8 @@ bool CodeGenPrepare::placeDbgValues(Function &F) {
DominatorTree DT(F);
for (BasicBlock &BB : F) {
- for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
- Instruction *Insn = &*BI++;
- DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
+ for (Instruction &Insn : llvm::make_early_inc_range(BB)) {
+ DbgValueInst *DVI = dyn_cast<DbgValueInst>(&Insn);
if (!DVI)
continue;
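
tryToSinkFreeOperands above numbers the instructions of TargetBB once, then uses that numbering to move the insert point to the earliest operand definition already living in the block. The ordering trick in isolation (plain pointers and a vector stand in for the basic block):

    #include <algorithm>
    #include <cstddef>
    #include <unordered_map>
    #include <vector>

    struct Instr { int Id; };

    // Number the block once, then pick the earliest position among the current
    // insert point and a set of instructions that are already in the block.
    std::size_t earliestInsertPoint(const std::vector<const Instr *> &Block,
                                    std::size_t CurrentPos,
                                    const std::vector<const Instr *> &InBlock) {
      std::unordered_map<const Instr *, std::size_t> Order;
      for (std::size_t I = 0, E = Block.size(); I != E; ++I)
        Order[Block[I]] = I;             // one linear pass, O(1) queries after

      std::size_t Pos = CurrentPos;
      for (const Instr *In : InBlock)
        Pos = std::min(Pos, Order[In]);  // mirrors the InstOrdering comparison
      return Pos;
    }
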
diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp
index f3cba6225107..a1ff02178ffa 100644
--- a/llvm/lib/CodeGen/CommandFlags.cpp
+++ b/llvm/lib/CodeGen/CommandFlags.cpp
@@ -65,6 +65,7 @@ CGOPT(DenormalMode::DenormalModeKind, DenormalFP32Math)
CGOPT(bool, EnableHonorSignDependentRoundingFPMath)
CGOPT(FloatABI::ABIType, FloatABIForCalls)
CGOPT(FPOpFusion::FPOpFusionMode, FuseFPOps)
+CGOPT(SwiftAsyncFramePointerMode, SwiftAsyncFramePointer)
CGOPT(bool, DontPlaceZerosInBSS)
CGOPT(bool, EnableGuaranteedTailCallOpt)
CGOPT(bool, DisableTailCalls)
@@ -89,11 +90,11 @@ CGOPT(bool, EnableAddrsig)
CGOPT(bool, EmitCallSiteInfo)
CGOPT(bool, EnableMachineFunctionSplitter)
CGOPT(bool, EnableDebugEntryValues)
-CGOPT(bool, PseudoProbeForProfiling)
CGOPT(bool, ValueTrackingVariableLocations)
CGOPT(bool, ForceDwarfFrameSection)
CGOPT(bool, XRayOmitFunctionIndex)
CGOPT(bool, DebugStrictDwarf)
+CGOPT(unsigned, AlignLoops)
codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
#define CGBINDOPT(NAME) \
@@ -277,6 +278,18 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
"Only fuse FP ops when the result won't be affected.")));
CGBINDOPT(FuseFPOps);
+ static cl::opt<SwiftAsyncFramePointerMode> SwiftAsyncFramePointer(
+ "swift-async-fp",
+ cl::desc("Determine when the Swift async frame pointer should be set"),
+ cl::init(SwiftAsyncFramePointerMode::Always),
+ cl::values(clEnumValN(SwiftAsyncFramePointerMode::DeploymentBased, "auto",
+ "Determine based on deployment target"),
+ clEnumValN(SwiftAsyncFramePointerMode::Always, "always",
+ "Always set the bit"),
+ clEnumValN(SwiftAsyncFramePointerMode::Never, "never",
+ "Never set the bit")));
+ CGBINDOPT(SwiftAsyncFramePointer);
+
static cl::opt<bool> DontPlaceZerosInBSS(
"nozero-initialized-in-bss",
cl::desc("Don't place zero-initialized symbols into bss section"),
@@ -420,11 +433,6 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
cl::init(false));
CGBINDOPT(EnableDebugEntryValues);
- static cl::opt<bool> PseudoProbeForProfiling(
- "pseudo-probe-for-profiling", cl::desc("Emit pseudo probes for AutoFDO"),
- cl::init(false));
- CGBINDOPT(PseudoProbeForProfiling);
-
static cl::opt<bool> ValueTrackingVariableLocations(
"experimental-debug-variable-locations",
cl::desc("Use experimental new value-tracking variable locations"),
@@ -452,6 +460,10 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
"strict-dwarf", cl::desc("use strict dwarf"), cl::init(false));
CGBINDOPT(DebugStrictDwarf);
+ static cl::opt<unsigned> AlignLoops("align-loops",
+ cl::desc("Default alignment for loops"));
+ CGBINDOPT(AlignLoops);
+
#undef CGBINDOPT
mc::RegisterMCTargetOptionsFlags();
@@ -522,18 +534,18 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) {
Options.EmitAddrsig = getEnableAddrsig();
Options.EmitCallSiteInfo = getEmitCallSiteInfo();
Options.EnableDebugEntryValues = getEnableDebugEntryValues();
- Options.PseudoProbeForProfiling = getPseudoProbeForProfiling();
Options.ValueTrackingVariableLocations = getValueTrackingVariableLocations();
Options.ForceDwarfFrameSection = getForceDwarfFrameSection();
Options.XRayOmitFunctionIndex = getXRayOmitFunctionIndex();
Options.DebugStrictDwarf = getDebugStrictDwarf();
+ Options.LoopAlignment = getAlignLoops();
Options.MCOptions = mc::InitMCTargetOptionsFromFlags();
Options.ThreadModel = getThreadModel();
Options.EABIVersion = getEABIVersion();
Options.DebuggerTuning = getDebuggerTuningOpt();
-
+ Options.SwiftAsyncFramePointer = getSwiftAsyncFramePointer();
return Options;
}
@@ -666,13 +678,11 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features,
if (const auto *F = Call->getCalledFunction())
if (F->getIntrinsicID() == Intrinsic::debugtrap ||
F->getIntrinsicID() == Intrinsic::trap)
- Call->addAttribute(
- AttributeList::FunctionIndex,
+ Call->addFnAttr(
Attribute::get(Ctx, "trap-func-name", getTrapFuncName()));
// Let NewAttrs override Attrs.
- F.setAttributes(
- Attrs.addAttributes(Ctx, AttributeList::FunctionIndex, NewAttrs));
+ F.setAttributes(Attrs.addFnAttributes(Ctx, NewAttrs));
}
/// Set function attributes of functions in Module M based on CPU,
diff --git a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
index c56c8c87734f..981f5973fee8 100644
--- a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -212,6 +212,21 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr &MI) {
if (Classes[Reg] != reinterpret_cast<TargetRegisterClass *>(-1))
RegRefs.insert(std::make_pair(Reg, &MO));
+ if (MO.isUse() && Special) {
+ if (!KeepRegs.test(Reg)) {
+ for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ KeepRegs.set(*SubRegs);
+ }
+ }
+ }
+
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ const MachineOperand &MO = MI.getOperand(I);
+ if (!MO.isReg()) continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isValid())
+ continue;
// If this reg is tied and live (Classes[Reg] is set to -1), we can't change
// it or any of its sub or super regs. We need to use KeepRegs to mark the
// reg because not all uses of the same reg within an instruction are
@@ -222,7 +237,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr &MI) {
// of a register? In the above 'xor' example, the uses of %eax are undef, so
// earlier instructions could still replace %eax even though the 'xor'
// itself can't be changed.
- if (MI.isRegTiedToUseOperand(i) &&
+ if (MI.isRegTiedToUseOperand(I) &&
Classes[Reg] == reinterpret_cast<TargetRegisterClass *>(-1)) {
for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs) {
@@ -233,14 +248,6 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr &MI) {
KeepRegs.set(*SuperRegs);
}
}
-
- if (MO.isUse() && Special) {
- if (!KeepRegs.test(Reg)) {
- for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
- SubRegs.isValid(); ++SubRegs)
- KeepRegs.set(*SubRegs);
- }
- }
}
}
diff --git a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
index 6e7db95b5c2a..c6c0b79cd7e7 100644
--- a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -138,26 +138,22 @@ bool DeadMachineInstructionElim::eliminateDeadMI(MachineFunction &MF) {
// Now scan the instructions and delete dead ones, tracking physreg
// liveness as we go.
- for (MachineBasicBlock::reverse_iterator MII = MBB->rbegin(),
- MIE = MBB->rend();
- MII != MIE;) {
- MachineInstr *MI = &*MII++;
-
+ for (MachineInstr &MI : llvm::make_early_inc_range(llvm::reverse(*MBB))) {
// If the instruction is dead, delete it!
- if (isDead(MI)) {
- LLVM_DEBUG(dbgs() << "DeadMachineInstructionElim: DELETING: " << *MI);
+ if (isDead(&MI)) {
+ LLVM_DEBUG(dbgs() << "DeadMachineInstructionElim: DELETING: " << MI);
// It is possible that some DBG_VALUE instructions refer to this
// instruction. They get marked as undef and will be deleted
// in the live debug variable analysis.
- MI->eraseFromParentAndMarkDBGValuesForRemoval();
+ MI.eraseFromParentAndMarkDBGValuesForRemoval();
AnyChanges = true;
++NumDeletes;
continue;
}
// Record the physreg defs.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && MO.isDef()) {
Register Reg = MO.getReg();
if (Register::isPhysicalRegister(Reg)) {
@@ -175,8 +171,8 @@ bool DeadMachineInstructionElim::eliminateDeadMI(MachineFunction &MF) {
}
// Record the physreg uses, after the defs, in case a physreg is
// both defined and used in the same instruction.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && MO.isUse()) {
Register Reg = MO.getReg();
if (Register::isPhysicalRegister(Reg)) {
diff --git a/llvm/lib/CodeGen/DwarfEHPrepare.cpp b/llvm/lib/CodeGen/DwarfEHPrepare.cpp
index 5ca1e91cc5f4..fb8a3e383950 100644
--- a/llvm/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/llvm/lib/CodeGen/DwarfEHPrepare.cpp
@@ -14,6 +14,7 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/EHPersonalities.h"
@@ -54,13 +55,11 @@ namespace {
class DwarfEHPrepare {
CodeGenOpt::Level OptLevel;
- // RewindFunction - _Unwind_Resume or the target equivalent.
- FunctionCallee &RewindFunction;
-
Function &F;
const TargetLowering &TLI;
DomTreeUpdater *DTU;
const TargetTransformInfo *TTI;
+ const Triple &TargetTriple;
/// Return the exception object from the value passed into
/// the 'resume' instruction (typically an aggregate). Clean up any dead
@@ -78,11 +77,11 @@ class DwarfEHPrepare {
bool InsertUnwindResumeCalls();
public:
- DwarfEHPrepare(CodeGenOpt::Level OptLevel_, FunctionCallee &RewindFunction_,
- Function &F_, const TargetLowering &TLI_, DomTreeUpdater *DTU_,
- const TargetTransformInfo *TTI_)
- : OptLevel(OptLevel_), RewindFunction(RewindFunction_), F(F_), TLI(TLI_),
- DTU(DTU_), TTI(TTI_) {}
+ DwarfEHPrepare(CodeGenOpt::Level OptLevel_, Function &F_,
+ const TargetLowering &TLI_, DomTreeUpdater *DTU_,
+ const TargetTransformInfo *TTI_, const Triple &TargetTriple_)
+ : OptLevel(OptLevel_), F(F_), TLI(TLI_), DTU(DTU_), TTI(TTI_),
+ TargetTriple(TargetTriple_) {}
bool run();
};
@@ -211,13 +210,28 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls() {
if (ResumesLeft == 0)
return true; // We pruned them all.
- // Find the rewind function if we didn't already.
- if (!RewindFunction) {
- FunctionType *FTy =
+ // RewindFunction - _Unwind_Resume or the target equivalent.
+ FunctionCallee RewindFunction;
+ CallingConv::ID RewindFunctionCallingConv;
+ FunctionType *FTy;
+ const char *RewindName;
+ bool DoesRewindFunctionNeedExceptionObject;
+
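+ // Note: EH-ABI-compatible targets using a GNU C++ personality resume
+ // unwinding through __cxa_end_cleanup(), which takes no arguments; all other
+ // configurations call the target's _Unwind_Resume equivalent and pass it the
+ // exception object.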
+ if ((Pers == EHPersonality::GNU_CXX || Pers == EHPersonality::GNU_CXX_SjLj) &&
+ TargetTriple.isTargetEHABICompatible()) {
+ RewindName = TLI.getLibcallName(RTLIB::CXA_END_CLEANUP);
+ FTy = FunctionType::get(Type::getVoidTy(Ctx), false);
+ RewindFunctionCallingConv =
+ TLI.getLibcallCallingConv(RTLIB::CXA_END_CLEANUP);
+ DoesRewindFunctionNeedExceptionObject = false;
+ } else {
+ RewindName = TLI.getLibcallName(RTLIB::UNWIND_RESUME);
+ FTy =
FunctionType::get(Type::getVoidTy(Ctx), Type::getInt8PtrTy(Ctx), false);
- const char *RewindName = TLI.getLibcallName(RTLIB::UNWIND_RESUME);
- RewindFunction = F.getParent()->getOrInsertFunction(RewindName, FTy);
+ RewindFunctionCallingConv = TLI.getLibcallCallingConv(RTLIB::UNWIND_RESUME);
+ DoesRewindFunctionNeedExceptionObject = true;
}
+ RewindFunction = F.getParent()->getOrInsertFunction(RewindName, FTy);
// Create the basic block where the _Unwind_Resume call will live.
if (ResumesLeft == 1) {
@@ -226,10 +240,14 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls() {
ResumeInst *RI = Resumes.front();
BasicBlock *UnwindBB = RI->getParent();
Value *ExnObj = GetExceptionObject(RI);
+ llvm::SmallVector<Value *, 1> RewindFunctionArgs;
+ if (DoesRewindFunctionNeedExceptionObject)
+ RewindFunctionArgs.push_back(ExnObj);
- // Call the _Unwind_Resume function.
- CallInst *CI = CallInst::Create(RewindFunction, ExnObj, "", UnwindBB);
- CI->setCallingConv(TLI.getLibcallCallingConv(RTLIB::UNWIND_RESUME));
+ // Call the rewind function.
+ CallInst *CI =
+ CallInst::Create(RewindFunction, RewindFunctionArgs, "", UnwindBB);
+ CI->setCallingConv(RewindFunctionCallingConv);
// We never expect _Unwind_Resume to return.
CI->setDoesNotReturn();
@@ -240,6 +258,8 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls() {
std::vector<DominatorTree::UpdateType> Updates;
Updates.reserve(Resumes.size());
+ llvm::SmallVector<Value *, 1> RewindFunctionArgs;
+
BasicBlock *UnwindBB = BasicBlock::Create(Ctx, "unwind_resume", &F);
PHINode *PN = PHINode::Create(Type::getInt8PtrTy(Ctx), ResumesLeft, "exn.obj",
UnwindBB);
@@ -257,9 +277,13 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls() {
++NumResumesLowered;
}
+ if (DoesRewindFunctionNeedExceptionObject)
+ RewindFunctionArgs.push_back(PN);
+
// Call the function.
- CallInst *CI = CallInst::Create(RewindFunction, PN, "", UnwindBB);
- CI->setCallingConv(TLI.getLibcallCallingConv(RTLIB::UNWIND_RESUME));
+ CallInst *CI =
+ CallInst::Create(RewindFunction, RewindFunctionArgs, "", UnwindBB);
+ CI->setCallingConv(RewindFunctionCallingConv);
// We never expect _Unwind_Resume to return.
CI->setDoesNotReturn();
@@ -277,22 +301,20 @@ bool DwarfEHPrepare::run() {
return Changed;
}
-static bool prepareDwarfEH(CodeGenOpt::Level OptLevel,
- FunctionCallee &RewindFunction, Function &F,
+static bool prepareDwarfEH(CodeGenOpt::Level OptLevel, Function &F,
const TargetLowering &TLI, DominatorTree *DT,
- const TargetTransformInfo *TTI) {
+ const TargetTransformInfo *TTI,
+ const Triple &TargetTriple) {
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
- return DwarfEHPrepare(OptLevel, RewindFunction, F, TLI, DT ? &DTU : nullptr,
- TTI)
+ return DwarfEHPrepare(OptLevel, F, TLI, DT ? &DTU : nullptr, TTI,
+ TargetTriple)
.run();
}
namespace {
class DwarfEHPrepareLegacyPass : public FunctionPass {
- // RewindFunction - _Unwind_Resume or the target equivalent.
- FunctionCallee RewindFunction = nullptr;
CodeGenOpt::Level OptLevel;
@@ -315,7 +337,7 @@ public:
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
}
- return prepareDwarfEH(OptLevel, RewindFunction, F, TLI, DT, TTI);
+ return prepareDwarfEH(OptLevel, F, TLI, DT, TTI, TM.getTargetTriple());
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
index 50fdc2114780..d0c2b8c267ff 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -348,17 +348,17 @@ void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
ConstantInt::get(Diff->getType(), 0));
BranchInst *CmpBr =
BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp);
+ Builder.Insert(CmpBr);
if (DTU)
DTU->applyUpdates(
{{DominatorTree::Insert, BB, EndBlock},
{DominatorTree::Insert, BB, LoadCmpBlocks[BlockIndex + 1]}});
- Builder.Insert(CmpBr);
} else {
// The last block has an unconditional branch to EndBlock.
BranchInst *CmpBr = BranchInst::Create(EndBlock);
+ Builder.Insert(CmpBr);
if (DTU)
DTU->applyUpdates({{DominatorTree::Insert, BB, EndBlock}});
- Builder.Insert(CmpBr);
}
}
diff --git a/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp b/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
index d909d6aa5b0a..7300ea6b50ee 100644
--- a/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -189,12 +189,7 @@ bool ExpandPostRA::runOnMachineFunction(MachineFunction &MF) {
bool MadeChange = false;
for (MachineBasicBlock &MBB : MF) {
- for (MachineBasicBlock::iterator mi = MBB.begin(), me = MBB.end();
- mi != me;) {
- MachineInstr &MI = *mi;
- // Advance iterator here because MI may be erased.
- ++mi;
-
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
// Only expand pseudos.
if (!MI.isPseudo())
continue;
diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
index a8d4d4ebe8bd..bb8d2b3e9a78 100644
--- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -158,6 +158,11 @@ struct CachingVPExpander {
Value *expandPredicationInBinaryOperator(IRBuilder<> &Builder,
VPIntrinsic &PI);
+ /// \brief Lower this VP reduction to a call to an unpredicated reduction
+ /// intrinsic.
+ Value *expandPredicationInReduction(IRBuilder<> &Builder,
+ VPReductionIntrinsic &PI);
+
/// \brief Query TTI and expand the vector predication in \p P accordingly.
Value *expandPredication(VPIntrinsic &PI);
@@ -248,6 +253,136 @@ CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder,
return NewBinOp;
}
+static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI,
+ Type *EltTy) {
+ bool Negative = false;
+ unsigned EltBits = EltTy->getScalarSizeInBits();
+ switch (VPI.getIntrinsicID()) {
+ default:
+ llvm_unreachable("Expecting a VP reduction intrinsic");
+ case Intrinsic::vp_reduce_add:
+ case Intrinsic::vp_reduce_or:
+ case Intrinsic::vp_reduce_xor:
+ case Intrinsic::vp_reduce_umax:
+ return Constant::getNullValue(EltTy);
+ case Intrinsic::vp_reduce_mul:
+ return ConstantInt::get(EltTy, 1, /*IsSigned*/ false);
+ case Intrinsic::vp_reduce_and:
+ case Intrinsic::vp_reduce_umin:
+ return ConstantInt::getAllOnesValue(EltTy);
+ case Intrinsic::vp_reduce_smin:
+ return ConstantInt::get(EltTy->getContext(),
+ APInt::getSignedMaxValue(EltBits));
+ case Intrinsic::vp_reduce_smax:
+ return ConstantInt::get(EltTy->getContext(),
+ APInt::getSignedMinValue(EltBits));
+ case Intrinsic::vp_reduce_fmax:
+ Negative = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::vp_reduce_fmin: {
+ FastMathFlags Flags = VPI.getFastMathFlags();
+ const fltSemantics &Semantics = EltTy->getFltSemantics();
+ return !Flags.noNaNs() ? ConstantFP::getQNaN(EltTy, Negative)
+ : !Flags.noInfs()
+ ? ConstantFP::getInfinity(EltTy, Negative)
+ : ConstantFP::get(EltTy,
+ APFloat::getLargest(Semantics, Negative));
+ }
+ case Intrinsic::vp_reduce_fadd:
+ return ConstantFP::getNegativeZero(EltTy);
+ case Intrinsic::vp_reduce_fmul:
+ return ConstantFP::get(EltTy, 1.0);
+ }
+}
+
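+// Illustrative sketch of the expansion below, in textual IR: a masked
+//   %r = call i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %v,
+//                                           <4 x i1> %m, i32 %evl)
+// becomes, roughly,
+//   %sel = select <4 x i1> %m, <4 x i32> %v, <4 x i32> zeroinitializer
+//   %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %sel)
+//   %r   = add i32 %red, %start
+// (the %evl operand is handled separately by discardEVLParameter).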
+Value *
+CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder,
+ VPReductionIntrinsic &VPI) {
+ assert((isSafeToSpeculativelyExecute(&VPI) ||
+ VPI.canIgnoreVectorLengthParam()) &&
+ "Implicitly dropping %evl in non-speculatable operator!");
+
+ Value *Mask = VPI.getMaskParam();
+ Value *RedOp = VPI.getOperand(VPI.getVectorParamPos());
+
+ // Insert neutral element in masked-out positions
+ if (Mask && !isAllTrueMask(Mask)) {
+ auto *NeutralElt = getNeutralReductionElement(VPI, VPI.getType());
+ auto *NeutralVector = Builder.CreateVectorSplat(
+ cast<VectorType>(RedOp->getType())->getElementCount(), NeutralElt);
+ RedOp = Builder.CreateSelect(Mask, RedOp, NeutralVector);
+ }
+
+ Value *Reduction;
+ Value *Start = VPI.getOperand(VPI.getStartParamPos());
+
+ switch (VPI.getIntrinsicID()) {
+ default:
+ llvm_unreachable("Impossible reduction kind");
+ case Intrinsic::vp_reduce_add:
+ Reduction = Builder.CreateAddReduce(RedOp);
+ Reduction = Builder.CreateAdd(Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_mul:
+ Reduction = Builder.CreateMulReduce(RedOp);
+ Reduction = Builder.CreateMul(Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_and:
+ Reduction = Builder.CreateAndReduce(RedOp);
+ Reduction = Builder.CreateAnd(Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_or:
+ Reduction = Builder.CreateOrReduce(RedOp);
+ Reduction = Builder.CreateOr(Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_xor:
+ Reduction = Builder.CreateXorReduce(RedOp);
+ Reduction = Builder.CreateXor(Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_smax:
+ Reduction = Builder.CreateIntMaxReduce(RedOp, /*IsSigned*/ true);
+ Reduction =
+ Builder.CreateBinaryIntrinsic(Intrinsic::smax, Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_smin:
+ Reduction = Builder.CreateIntMinReduce(RedOp, /*IsSigned*/ true);
+ Reduction =
+ Builder.CreateBinaryIntrinsic(Intrinsic::smin, Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_umax:
+ Reduction = Builder.CreateIntMaxReduce(RedOp, /*IsSigned*/ false);
+ Reduction =
+ Builder.CreateBinaryIntrinsic(Intrinsic::umax, Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_umin:
+ Reduction = Builder.CreateIntMinReduce(RedOp, /*IsSigned*/ false);
+ Reduction =
+ Builder.CreateBinaryIntrinsic(Intrinsic::umin, Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_fmax:
+ Reduction = Builder.CreateFPMaxReduce(RedOp);
+ transferDecorations(*Reduction, VPI);
+ Reduction =
+ Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_fmin:
+ Reduction = Builder.CreateFPMinReduce(RedOp);
+ transferDecorations(*Reduction, VPI);
+ Reduction =
+ Builder.CreateBinaryIntrinsic(Intrinsic::minnum, Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_fadd:
+ Reduction = Builder.CreateFAddReduce(Start, RedOp);
+ break;
+ case Intrinsic::vp_reduce_fmul:
+ Reduction = Builder.CreateFMulReduce(Start, RedOp);
+ break;
+ }
+
+ replaceOperation(*Reduction, VPI);
+ return Reduction;
+}
+
void CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) {
LLVM_DEBUG(dbgs() << "Discard EVL parameter in " << VPI << "\n");
@@ -321,6 +456,9 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
if (OC && Instruction::isBinaryOp(*OC))
return expandPredicationInBinaryOperator(Builder, VPI);
+ if (auto *VPRI = dyn_cast<VPReductionIntrinsic>(&VPI))
+ return expandPredicationInReduction(Builder, *VPRI);
+
return &VPI;
}
diff --git a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
index e3c4e86d203b..ec6bf18b2769 100644
--- a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
+++ b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
@@ -1,9 +1,8 @@
//===-- FixupStatepointCallerSaved.cpp - Fixup caller saved registers ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/llvm/lib/CodeGen/GCMetadata.cpp b/llvm/lib/CodeGen/GCMetadata.cpp
index 8fae798b31d9..af5515cc6bfd 100644
--- a/llvm/lib/CodeGen/GCMetadata.cpp
+++ b/llvm/lib/CodeGen/GCMetadata.cpp
@@ -145,24 +145,9 @@ GCStrategy *GCModuleInfo::getGCStrategy(const StringRef Name) {
if (NMI != GCStrategyMap.end())
return NMI->getValue();
- for (auto& Entry : GCRegistry::entries()) {
- if (Name == Entry.getName()) {
- std::unique_ptr<GCStrategy> S = Entry.instantiate();
- S->Name = std::string(Name);
- GCStrategyMap[Name] = S.get();
- GCStrategyList.push_back(std::move(S));
- return GCStrategyList.back().get();
- }
- }
-
- if (GCRegistry::begin() == GCRegistry::end()) {
- // In normal operation, the registry should not be empty. There should
- // be the builtin GCs if nothing else. The most likely scenario here is
- // that we got here without running the initializers used by the Registry
- // itself and it's registration mechanism.
- const std::string error = ("unsupported GC: " + Name).str() +
- " (did you remember to link and initialize the CodeGen library?)";
- report_fatal_error(error);
- } else
- report_fatal_error(std::string("unsupported GC: ") + Name);
+ std::unique_ptr<GCStrategy> S = llvm::getGCStrategy(Name);
+ S->Name = std::string(Name);
+ GCStrategyMap[Name] = S.get();
+ GCStrategyList.push_back(std::move(S));
+ return GCStrategyList.back().get();
}
diff --git a/llvm/lib/CodeGen/GCRootLowering.cpp b/llvm/lib/CodeGen/GCRootLowering.cpp
index 58269e172c57..637a877810a1 100644
--- a/llvm/lib/CodeGen/GCRootLowering.cpp
+++ b/llvm/lib/CodeGen/GCRootLowering.cpp
@@ -193,8 +193,8 @@ bool LowerIntrinsics::DoLowering(Function &F, GCStrategy &S) {
bool MadeChange = false;
for (BasicBlock &BB : F)
- for (BasicBlock::iterator II = BB.begin(), E = BB.end(); II != E;) {
- IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++);
+ for (Instruction &I : llvm::make_early_inc_range(BB)) {
+ IntrinsicInst *CI = dyn_cast<IntrinsicInst>(&I);
if (!CI)
continue;
@@ -271,16 +271,15 @@ void GCMachineCodeAnalysis::VisitCallPoint(MachineBasicBlock::iterator CI) {
void GCMachineCodeAnalysis::FindSafePoints(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF)
- for (MachineBasicBlock::iterator MI = MBB.begin(), ME = MBB.end();
- MI != ME; ++MI)
- if (MI->isCall()) {
+ for (MachineInstr &MI : MBB)
+ if (MI.isCall()) {
// Do not treat tail or sibling call sites as safe points. This is
// legal since any arguments passed to the callee which live in the
// remnants of the caller's frame will be owned and updated by the
// callee if required.
- if (MI->isTerminator())
+ if (MI.isTerminator())
continue;
- VisitCallPoint(MI);
+ VisitCallPoint(&MI);
}
}
diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
index dd560e8ff145..2676becdd807 100644
--- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
@@ -13,6 +13,8 @@
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/IR/DebugInfoMetadata.h"
using namespace llvm;
@@ -187,6 +189,14 @@ MachineInstrBuilder CSEMIRBuilder::buildInstr(unsigned Opc,
// Try to constant fold these.
assert(SrcOps.size() == 2 && "Invalid sources");
assert(DstOps.size() == 1 && "Invalid dsts");
+ if (SrcOps[0].getLLTTy(*getMRI()).isVector()) {
+ // Try to constant fold vector constants.
+ auto VecCst = ConstantFoldVectorBinop(
+ Opc, SrcOps[0].getReg(), SrcOps[1].getReg(), *getMRI(), *this);
+ if (VecCst)
+ return MachineInstrBuilder(getMF(), *VecCst);
+ break;
+ }
if (Optional<APInt> Cst = ConstantFoldBinOp(Opc, SrcOps[0].getReg(),
SrcOps[1].getReg(), *getMRI()))
return buildConstant(DstOps[0], *Cst);
@@ -213,6 +223,22 @@ MachineInstrBuilder CSEMIRBuilder::buildInstr(unsigned Opc,
return buildFConstant(DstOps[0], *Cst);
break;
}
+ case TargetOpcode::G_CTLZ: {
+ assert(SrcOps.size() == 1 && "Expected one source");
+ assert(DstOps.size() == 1 && "Expected one dest");
+ auto MaybeCsts = ConstantFoldCTLZ(SrcOps[0].getReg(), *getMRI());
+ if (!MaybeCsts)
+ break;
+ if (MaybeCsts->size() == 1)
+ return buildConstant(DstOps[0], (*MaybeCsts)[0]);
+ // This was a vector constant. Build a G_BUILD_VECTOR for them.
+ SmallVector<Register> ConstantRegs;
+ LLT VecTy = DstOps[0].getLLTTy(*getMRI());
+ for (unsigned Cst : *MaybeCsts)
+ ConstantRegs.emplace_back(
+ buildConstant(VecTy.getScalarType(), Cst).getReg(0));
+ return buildBuildVector(DstOps[0], ConstantRegs);
+ }
}
bool CanCopy = checkCopyToDefsPossible(DstOps);
if (!canPerformCSEForOpc(Opc))
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index d2cda9ece31a..17094a8e44f8 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
@@ -73,7 +74,7 @@ void CallLowering::addArgFlagsFromAttributes(ISD::ArgFlagsTy &Flags,
const AttributeList &Attrs,
unsigned OpIdx) const {
addFlagsUsingAttrFn(Flags, [&Attrs, &OpIdx](Attribute::AttrKind Attr) {
- return Attrs.hasAttribute(OpIdx, Attr);
+ return Attrs.hasAttributeAtIndex(OpIdx, Attr);
});
}
@@ -139,6 +140,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
if (!Info.OrigRet.Ty->isVoidTy())
setArgFlags(Info.OrigRet, AttributeList::ReturnIndex, DL, CB);
+ Info.CB = &CB;
Info.KnownCallees = CB.getMetadata(LLVMContext::MD_callees);
Info.CallConv = CallConv;
Info.SwiftErrorVReg = SwiftErrorVReg;
@@ -165,18 +167,21 @@ void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx,
Align MemAlign = DL.getABITypeAlign(Arg.Ty);
if (Flags.isByVal() || Flags.isInAlloca() || Flags.isPreallocated()) {
assert(OpIdx >= AttributeList::FirstArgIndex);
- Type *ElementTy = PtrTy->getElementType();
+ unsigned ParamIdx = OpIdx - AttributeList::FirstArgIndex;
- auto Ty = Attrs.getAttribute(OpIdx, Attribute::ByVal).getValueAsType();
- Flags.setByValSize(DL.getTypeAllocSize(Ty ? Ty : ElementTy));
+ Type *ElementTy = FuncInfo.getParamByValType(ParamIdx);
+ if (!ElementTy)
+ ElementTy = FuncInfo.getParamInAllocaType(ParamIdx);
+ if (!ElementTy)
+ ElementTy = FuncInfo.getParamPreallocatedType(ParamIdx);
+ assert(ElementTy && "Must have byval, inalloca or preallocated type");
+ Flags.setByValSize(DL.getTypeAllocSize(ElementTy));
// For ByVal, alignment should be passed from FE. BE will guess if
// this info is not there but there are cases it cannot get right.
- if (auto ParamAlign =
- FuncInfo.getParamStackAlign(OpIdx - AttributeList::FirstArgIndex))
+ if (auto ParamAlign = FuncInfo.getParamStackAlign(ParamIdx))
MemAlign = *ParamAlign;
- else if ((ParamAlign =
- FuncInfo.getParamAlign(OpIdx - AttributeList::FirstArgIndex)))
+ else if ((ParamAlign = FuncInfo.getParamAlign(ParamIdx)))
MemAlign = *ParamAlign;
else
MemAlign = Align(getTLI()->getByValTypeAlignment(ElementTy, DL));
@@ -613,14 +618,31 @@ bool CallLowering::handleAssignments(ValueHandler &Handler,
const unsigned NumArgs = Args.size();
+ // Stores thunks for outgoing register assignments. This is used so we delay
+ // generating register copies until mem loc assignments are done. We do this
+ // so that if the target is using the delayed stack protector feature, we can
+ // find the split point of the block accurately. E.g. if we have:
+ // G_STORE %val, %memloc
+ // $x0 = COPY %foo
+ // $x1 = COPY %bar
+ // CALL func
+ // ... then the split point for the block will correctly be at, and including,
+ // the copy to $x0. If instead the G_STORE instruction immediately precedes
+ // the CALL, then we'd prematurely choose the CALL as the split point, thus
+ // generating a split block with a CALL that uses undefined physregs.
+ SmallVector<std::function<void()>> DelayedOutgoingRegAssignments;
+
for (unsigned i = 0, j = 0; i != NumArgs; ++i, ++j) {
assert(j < ArgLocs.size() && "Skipped too many arg locs");
CCValAssign &VA = ArgLocs[j];
assert(VA.getValNo() == i && "Location doesn't correspond to current arg");
if (VA.needsCustom()) {
- unsigned NumArgRegs =
- Handler.assignCustomValue(Args[i], makeArrayRef(ArgLocs).slice(j));
+ std::function<void()> Thunk;
+ unsigned NumArgRegs = Handler.assignCustomValue(
+ Args[i], makeArrayRef(ArgLocs).slice(j), &Thunk);
+ if (Thunk)
+ DelayedOutgoingRegAssignments.emplace_back(Thunk);
if (!NumArgRegs)
return false;
j += NumArgRegs;
@@ -739,7 +761,13 @@ bool CallLowering::handleAssignments(ValueHandler &Handler,
continue;
}
- Handler.assignValueToReg(ArgReg, VA.getLocReg(), VA);
+ if (Handler.isIncomingArgumentHandler())
+ Handler.assignValueToReg(ArgReg, VA.getLocReg(), VA);
+ else {
+ DelayedOutgoingRegAssignments.emplace_back([=, &Handler]() {
+ Handler.assignValueToReg(ArgReg, VA.getLocReg(), VA);
+ });
+ }
}
// Now that all pieces have been assigned, re-pack the register typed values
@@ -753,6 +781,8 @@ bool CallLowering::handleAssignments(ValueHandler &Handler,
j += NumParts - 1;
}
+ for (auto &Fn : DelayedOutgoingRegAssignments)
+ Fn();
return true;
}
@@ -1153,7 +1183,7 @@ static bool isCopyCompatibleType(LLT SrcTy, LLT DstTy) {
void CallLowering::IncomingValueHandler::assignValueToReg(Register ValVReg,
Register PhysReg,
- CCValAssign &VA) {
+ CCValAssign VA) {
const MVT LocVT = VA.getLocVT();
const LLT LocTy(LocVT);
const LLT RegTy = MRI.getType(ValVReg);
diff --git a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
index 6f103bca6892..381c6df5c97a 100644
--- a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
@@ -130,16 +130,15 @@ bool Combiner::combineMachineInstrs(MachineFunction &MF,
WrapperObserver.addObserver(CSEInfo);
RAIIDelegateInstaller DelInstall(MF, &WrapperObserver);
for (MachineBasicBlock *MBB : post_order(&MF)) {
- for (auto MII = MBB->rbegin(), MIE = MBB->rend(); MII != MIE;) {
- MachineInstr *CurMI = &*MII;
- ++MII;
+ for (MachineInstr &CurMI :
+ llvm::make_early_inc_range(llvm::reverse(*MBB))) {
// Erase dead insts before even adding to the list.
- if (isTriviallyDead(*CurMI, *MRI)) {
- LLVM_DEBUG(dbgs() << *CurMI << "Is dead; erasing.\n");
- CurMI->eraseFromParentAndMarkDBGValuesForRemoval();
+ if (isTriviallyDead(CurMI, *MRI)) {
+ LLVM_DEBUG(dbgs() << CurMI << "Is dead; erasing.\n");
+ CurMI.eraseFromParentAndMarkDBGValuesForRemoval();
continue;
}
- WorkList.deferred_insert(CurMI);
+ WorkList.deferred_insert(&CurMI);
}
}
WorkList.finalize();
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 06d827de2e96..3a52959d54bf 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -12,9 +12,11 @@
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/LowLevelType.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -26,8 +28,10 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/DivisionByConstantInfo.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetMachine.h"
#include <tuple>
#define DEBUG_TYPE "gi-combiner"
@@ -46,8 +50,9 @@ CombinerHelper::CombinerHelper(GISelChangeObserver &Observer,
MachineIRBuilder &B, GISelKnownBits *KB,
MachineDominatorTree *MDT,
const LegalizerInfo *LI)
- : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer),
- KB(KB), MDT(MDT), LI(LI) {
+ : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer), KB(KB),
+ MDT(MDT), LI(LI), RBI(Builder.getMF().getSubtarget().getRegBankInfo()),
+ TRI(Builder.getMF().getSubtarget().getRegisterInfo()) {
(void)this->KB;
}
@@ -64,6 +69,16 @@ static unsigned littleEndianByteAt(const unsigned ByteWidth, const unsigned I) {
return I;
}
+/// Determines the LogBase2 value for a non-null input value using the
+/// transform: LogBase2(V) = (EltBits - 1) - ctlz(V).
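+/// For example, with 32-bit elements and V = 8, ctlz(8) = 28, so
+/// LogBase2 = 31 - 28 = 3.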
+static Register buildLogBase2(Register V, MachineIRBuilder &MIB) {
+ auto &MRI = *MIB.getMRI();
+ LLT Ty = MRI.getType(V);
+ auto Ctlz = MIB.buildCTLZ(Ty, V);
+ auto Base = MIB.buildConstant(Ty, Ty.getScalarSizeInBits() - 1);
+ return MIB.buildSub(Ty, Base, Ctlz).getReg(0);
+}
+
/// \returns The big endian in-memory byte position of byte \p I in a
/// \p ByteWidth bytes wide type.
///
@@ -143,6 +158,24 @@ void CombinerHelper::replaceRegOpWith(MachineRegisterInfo &MRI,
Observer.changedInstr(*FromRegOp.getParent());
}
+void CombinerHelper::replaceOpcodeWith(MachineInstr &FromMI,
+ unsigned ToOpcode) const {
+ Observer.changingInstr(FromMI);
+
+ FromMI.setDesc(Builder.getTII().get(ToOpcode));
+
+ Observer.changedInstr(FromMI);
+}
+
+const RegisterBank *CombinerHelper::getRegBank(Register Reg) const {
+ return RBI->getRegBank(Reg, MRI, *TRI);
+}
+
+void CombinerHelper::setRegBank(Register Reg, const RegisterBank *RegBank) {
+ if (RegBank)
+ MRI.setRegBank(Reg, *RegBank);
+}
+
bool CombinerHelper::tryCombineCopy(MachineInstr &MI) {
if (matchCombineCopy(MI)) {
applyCombineCopy(MI);
@@ -486,10 +519,7 @@ bool CombinerHelper::matchCombineExtendingLoads(MachineInstr &MI,
continue;
// Check for legality.
if (LI) {
- LegalityQuery::MemDesc MMDesc;
- MMDesc.MemoryTy = MMO.getMemoryType();
- MMDesc.AlignInBits = MMO.getAlign().value() * 8;
- MMDesc.Ordering = MMO.getSuccessOrdering();
+ LegalityQuery::MemDesc MMDesc(MMO);
LLT UseTy = MRI.getType(UseMI.getOperand(0).getReg());
LLT SrcTy = MRI.getType(LoadMI->getPointerReg());
if (LI->getAction({LoadMI->getOpcode(), {UseTy, SrcTy}, {MMDesc}})
@@ -623,13 +653,83 @@ void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI,
Observer.changedInstr(MI);
}
+bool CombinerHelper::matchCombineLoadWithAndMask(MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_AND);
+
+ // If we have the following code:
+ // %mask = G_CONSTANT 255
+ // %ld = G_LOAD %ptr, (load s16)
+ // %and = G_AND %ld, %mask
+ //
+ // Try to fold it into
+ // %ld = G_ZEXTLOAD %ptr, (load s8)
+
+ Register Dst = MI.getOperand(0).getReg();
+ if (MRI.getType(Dst).isVector())
+ return false;
+
+ auto MaybeMask =
+ getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+ if (!MaybeMask)
+ return false;
+
+ APInt MaskVal = MaybeMask->Value;
+
+ if (!MaskVal.isMask())
+ return false;
+
+ Register SrcReg = MI.getOperand(1).getReg();
+ GAnyLoad *LoadMI = getOpcodeDef<GAnyLoad>(SrcReg, MRI);
+ if (!LoadMI || !MRI.hasOneNonDBGUse(LoadMI->getDstReg()) ||
+ !LoadMI->isSimple())
+ return false;
+
+ Register LoadReg = LoadMI->getDstReg();
+ LLT LoadTy = MRI.getType(LoadReg);
+ Register PtrReg = LoadMI->getPointerReg();
+ uint64_t LoadSizeBits = LoadMI->getMemSizeInBits();
+ unsigned MaskSizeBits = MaskVal.countTrailingOnes();
+
+ // The mask may not be larger than the in-memory type, as it might cover
+ // sign-extended bits.
+ if (MaskSizeBits > LoadSizeBits)
+ return false;
+
+ // If the mask covers the whole destination register, there's nothing to
+ // extend
+ if (MaskSizeBits >= LoadTy.getSizeInBits())
+ return false;
+
+ // Most targets cannot deal with loads of size < 8 and need to re-legalize to
+ // at least byte loads. Avoid creating such loads here
+ if (MaskSizeBits < 8 || !isPowerOf2_32(MaskSizeBits))
+ return false;
+
+ const MachineMemOperand &MMO = LoadMI->getMMO();
+ LegalityQuery::MemDesc MemDesc(MMO);
+ MemDesc.MemoryTy = LLT::scalar(MaskSizeBits);
+ if (!isLegalOrBeforeLegalizer(
+ {TargetOpcode::G_ZEXTLOAD, {LoadTy, MRI.getType(PtrReg)}, {MemDesc}}))
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.setInstrAndDebugLoc(*LoadMI);
+ auto &MF = B.getMF();
+ auto PtrInfo = MMO.getPointerInfo();
+ auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, MaskSizeBits / 8);
+ B.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, Dst, PtrReg, *NewMMO);
+ };
+ return true;
+}
+
bool CombinerHelper::isPredecessor(const MachineInstr &DefMI,
const MachineInstr &UseMI) {
assert(!DefMI.isDebugInstr() && !UseMI.isDebugInstr() &&
"shouldn't consider debug uses");
assert(DefMI.getParent() == UseMI.getParent());
if (&DefMI == &UseMI)
- return false;
+ return true;
const MachineBasicBlock &MBB = *DefMI.getParent();
auto DefOrUse = find_if(MBB, [&DefMI, &UseMI](const MachineInstr &MI) {
return &MI == &DefMI || &MI == &UseMI;
@@ -711,6 +811,16 @@ bool CombinerHelper::matchSextInRegOfLoad(
// anyway for most targets.
if (!isPowerOf2_32(NewSizeBits))
return false;
+
+ const MachineMemOperand &MMO = LoadDef->getMMO();
+ LegalityQuery::MemDesc MMDesc(MMO);
+ MMDesc.MemoryTy = LLT::scalar(NewSizeBits);
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SEXTLOAD,
+ {MRI.getType(LoadDef->getDstReg()),
+ MRI.getType(LoadDef->getPointerReg())},
+ {MMDesc}}))
+ return false;
+
MatchInfo = std::make_tuple(LoadDef->getDstReg(), NewSizeBits);
return true;
}
@@ -1093,81 +1203,6 @@ void CombinerHelper::applyOptBrCondByInvertingCond(MachineInstr &MI,
Observer.changedInstr(*BrCond);
}
-static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
- // On Darwin, -Os means optimize for size without hurting performance, so
- // only really optimize for size when -Oz (MinSize) is used.
- if (MF.getTarget().getTargetTriple().isOSDarwin())
- return MF.getFunction().hasMinSize();
- return MF.getFunction().hasOptSize();
-}
-
-// Returns a list of types to use for memory op lowering in MemOps. A partial
-// port of findOptimalMemOpLowering in TargetLowering.
-static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
- unsigned Limit, const MemOp &Op,
- unsigned DstAS, unsigned SrcAS,
- const AttributeList &FuncAttributes,
- const TargetLowering &TLI) {
- if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
- return false;
-
- LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
-
- if (Ty == LLT()) {
- // Use the largest scalar type whose alignment constraints are satisfied.
- // We only need to check DstAlign here as SrcAlign is always greater or
- // equal to DstAlign (or zero).
- Ty = LLT::scalar(64);
- if (Op.isFixedDstAlign())
- while (Op.getDstAlign() < Ty.getSizeInBytes() &&
- !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
- Ty = LLT::scalar(Ty.getSizeInBytes());
- assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
- // FIXME: check for the largest legal type we can load/store to.
- }
-
- unsigned NumMemOps = 0;
- uint64_t Size = Op.size();
- while (Size) {
- unsigned TySize = Ty.getSizeInBytes();
- while (TySize > Size) {
- // For now, only use non-vector load / store's for the left-over pieces.
- LLT NewTy = Ty;
- // FIXME: check for mem op safety and legality of the types. Not all of
- // SDAGisms map cleanly to GISel concepts.
- if (NewTy.isVector())
- NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
- NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
- unsigned NewTySize = NewTy.getSizeInBytes();
- assert(NewTySize > 0 && "Could not find appropriate type");
-
- // If the new LLT cannot cover all of the remaining bits, then consider
- // issuing a (or a pair of) unaligned and overlapping load / store.
- bool Fast;
- // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
- MVT VT = getMVTForLLT(Ty);
- if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
- TLI.allowsMisalignedMemoryAccesses(
- VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
- MachineMemOperand::MONone, &Fast) &&
- Fast)
- TySize = Size;
- else {
- Ty = NewTy;
- TySize = NewTySize;
- }
- }
-
- if (++NumMemOps > Limit)
- return false;
-
- MemOps.push_back(Ty);
- Size -= TySize;
- }
-
- return true;
-}
-
static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
if (Ty.isVector())
return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
@@ -1175,460 +1210,20 @@ static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
return IntegerType::get(C, Ty.getSizeInBits());
}
-// Get a vectorized representation of the memset value operand, GISel edition.
-static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
- MachineRegisterInfo &MRI = *MIB.getMRI();
- unsigned NumBits = Ty.getScalarSizeInBits();
- auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
- if (!Ty.isVector() && ValVRegAndVal) {
- APInt Scalar = ValVRegAndVal->Value.truncOrSelf(8);
- APInt SplatVal = APInt::getSplat(NumBits, Scalar);
- return MIB.buildConstant(Ty, SplatVal).getReg(0);
- }
-
- // Extend the byte value to the larger type, and then multiply by a magic
- // value 0x010101... in order to replicate it across every byte.
- // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
- if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
- return MIB.buildConstant(Ty, 0).getReg(0);
- }
-
- LLT ExtType = Ty.getScalarType();
- auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
- if (NumBits > 8) {
- APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
- auto MagicMI = MIB.buildConstant(ExtType, Magic);
- Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
- }
-
- // For vector types create a G_BUILD_VECTOR.
- if (Ty.isVector())
- Val = MIB.buildSplatVector(Ty, Val).getReg(0);
-
- return Val;
-}
-
-bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst,
- Register Val, uint64_t KnownLen,
- Align Alignment, bool IsVolatile) {
- auto &MF = *MI.getParent()->getParent();
- const auto &TLI = *MF.getSubtarget().getTargetLowering();
- auto &DL = MF.getDataLayout();
- LLVMContext &C = MF.getFunction().getContext();
-
- assert(KnownLen != 0 && "Have a zero length memset length!");
-
- bool DstAlignCanChange = false;
- MachineFrameInfo &MFI = MF.getFrameInfo();
- bool OptSize = shouldLowerMemFuncForSize(MF);
-
- MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
- if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
- DstAlignCanChange = true;
-
- unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
- std::vector<LLT> MemOps;
-
- const auto &DstMMO = **MI.memoperands_begin();
- MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
-
- auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
- bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
-
- if (!findGISelOptimalMemOpLowering(MemOps, Limit,
- MemOp::Set(KnownLen, DstAlignCanChange,
- Alignment,
- /*IsZeroMemset=*/IsZeroVal,
- /*IsVolatile=*/IsVolatile),
- DstPtrInfo.getAddrSpace(), ~0u,
- MF.getFunction().getAttributes(), TLI))
- return false;
-
- if (DstAlignCanChange) {
- // Get an estimate of the type from the LLT.
- Type *IRTy = getTypeForLLT(MemOps[0], C);
- Align NewAlign = DL.getABITypeAlign(IRTy);
- if (NewAlign > Alignment) {
- Alignment = NewAlign;
- unsigned FI = FIDef->getOperand(1).getIndex();
- // Give the stack frame object a larger alignment if needed.
- if (MFI.getObjectAlign(FI) < Alignment)
- MFI.setObjectAlignment(FI, Alignment);
- }
- }
-
- MachineIRBuilder MIB(MI);
- // Find the largest store and generate the bit pattern for it.
- LLT LargestTy = MemOps[0];
- for (unsigned i = 1; i < MemOps.size(); i++)
- if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
- LargestTy = MemOps[i];
-
- // The memset stored value is always defined as an s8, so in order to make it
- // work with larger store types we need to repeat the bit pattern across the
- // wider type.
- Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
-
- if (!MemSetValue)
- return false;
-
- // Generate the stores. For each store type in the list, we generate the
- // matching store of that type to the destination address.
- LLT PtrTy = MRI.getType(Dst);
- unsigned DstOff = 0;
- unsigned Size = KnownLen;
- for (unsigned I = 0; I < MemOps.size(); I++) {
- LLT Ty = MemOps[I];
- unsigned TySize = Ty.getSizeInBytes();
- if (TySize > Size) {
- // Issuing an unaligned load / store pair that overlaps with the previous
- // pair. Adjust the offset accordingly.
- assert(I == MemOps.size() - 1 && I != 0);
- DstOff -= TySize - Size;
- }
-
- // If this store is smaller than the largest store see whether we can get
- // the smaller value for free with a truncate.
- Register Value = MemSetValue;
- if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
- MVT VT = getMVTForLLT(Ty);
- MVT LargestVT = getMVTForLLT(LargestTy);
- if (!LargestTy.isVector() && !Ty.isVector() &&
- TLI.isTruncateFree(LargestVT, VT))
- Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
- else
- Value = getMemsetValue(Val, Ty, MIB);
- if (!Value)
- return false;
- }
-
- auto *StoreMMO =
- MF.getMachineMemOperand(&DstMMO, DstOff, Ty);
-
- Register Ptr = Dst;
- if (DstOff != 0) {
- auto Offset =
- MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
- Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
- }
-
- MIB.buildStore(Value, Ptr, *StoreMMO);
- DstOff += Ty.getSizeInBytes();
- Size -= TySize;
- }
-
- MI.eraseFromParent();
- return true;
-}
-
bool CombinerHelper::tryEmitMemcpyInline(MachineInstr &MI) {
- assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
-
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- Register Len = MI.getOperand(2).getReg();
-
- const auto *MMOIt = MI.memoperands_begin();
- const MachineMemOperand *MemOp = *MMOIt;
- bool IsVolatile = MemOp->isVolatile();
-
- // See if this is a constant length copy
- auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI);
- // FIXME: support dynamically sized G_MEMCPY_INLINE
- assert(LenVRegAndVal.hasValue() &&
- "inline memcpy with dynamic size is not yet supported");
- uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
- if (KnownLen == 0) {
- MI.eraseFromParent();
- return true;
- }
-
- const auto &DstMMO = **MI.memoperands_begin();
- const auto &SrcMMO = **std::next(MI.memoperands_begin());
- Align DstAlign = DstMMO.getBaseAlign();
- Align SrcAlign = SrcMMO.getBaseAlign();
-
- return tryEmitMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
- IsVolatile);
-}
-
-bool CombinerHelper::tryEmitMemcpyInline(MachineInstr &MI, Register Dst,
- Register Src, uint64_t KnownLen,
- Align DstAlign, Align SrcAlign,
- bool IsVolatile) {
- assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
- return optimizeMemcpy(MI, Dst, Src, KnownLen,
- std::numeric_limits<uint64_t>::max(), DstAlign,
- SrcAlign, IsVolatile);
-}
-
-bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst,
- Register Src, uint64_t KnownLen,
- uint64_t Limit, Align DstAlign,
- Align SrcAlign, bool IsVolatile) {
- auto &MF = *MI.getParent()->getParent();
- const auto &TLI = *MF.getSubtarget().getTargetLowering();
- auto &DL = MF.getDataLayout();
- LLVMContext &C = MF.getFunction().getContext();
-
- assert(KnownLen != 0 && "Have a zero length memcpy length!");
-
- bool DstAlignCanChange = false;
- MachineFrameInfo &MFI = MF.getFrameInfo();
- Align Alignment = commonAlignment(DstAlign, SrcAlign);
-
- MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
- if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
- DstAlignCanChange = true;
-
- // FIXME: infer better src pointer alignment like SelectionDAG does here.
- // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
- // if the memcpy is in a tail call position.
-
- std::vector<LLT> MemOps;
-
- const auto &DstMMO = **MI.memoperands_begin();
- const auto &SrcMMO = **std::next(MI.memoperands_begin());
- MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
- MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
-
- if (!findGISelOptimalMemOpLowering(
- MemOps, Limit,
- MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
- IsVolatile),
- DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
- MF.getFunction().getAttributes(), TLI))
- return false;
-
- if (DstAlignCanChange) {
- // Get an estimate of the type from the LLT.
- Type *IRTy = getTypeForLLT(MemOps[0], C);
- Align NewAlign = DL.getABITypeAlign(IRTy);
-
- // Don't promote to an alignment that would require dynamic stack
- // realignment.
- const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
- if (!TRI->hasStackRealignment(MF))
- while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
- NewAlign = NewAlign / 2;
-
- if (NewAlign > Alignment) {
- Alignment = NewAlign;
- unsigned FI = FIDef->getOperand(1).getIndex();
- // Give the stack frame object a larger alignment if needed.
- if (MFI.getObjectAlign(FI) < Alignment)
- MFI.setObjectAlignment(FI, Alignment);
- }
- }
-
- LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
-
- MachineIRBuilder MIB(MI);
- // Now we need to emit a pair of load and stores for each of the types we've
- // collected. I.e. for each type, generate a load from the source pointer of
- // that type width, and then generate a corresponding store to the dest buffer
- // of that value loaded. This can result in a sequence of loads and stores
- // mixed types, depending on what the target specifies as good types to use.
- unsigned CurrOffset = 0;
- LLT PtrTy = MRI.getType(Src);
- unsigned Size = KnownLen;
- for (auto CopyTy : MemOps) {
- // Issuing an unaligned load / store pair that overlaps with the previous
- // pair. Adjust the offset accordingly.
- if (CopyTy.getSizeInBytes() > Size)
- CurrOffset -= CopyTy.getSizeInBytes() - Size;
-
- // Construct MMOs for the accesses.
- auto *LoadMMO =
- MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
- auto *StoreMMO =
- MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
-
- // Create the load.
- Register LoadPtr = Src;
- Register Offset;
- if (CurrOffset != 0) {
- Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset)
- .getReg(0);
- LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
- }
- auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
-
- // Create the store.
- Register StorePtr =
- CurrOffset == 0 ? Dst : MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
- MIB.buildStore(LdVal, StorePtr, *StoreMMO);
- CurrOffset += CopyTy.getSizeInBytes();
- Size -= CopyTy.getSizeInBytes();
- }
-
- MI.eraseFromParent();
- return true;
-}
-
-bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst,
- Register Src, uint64_t KnownLen,
- Align DstAlign, Align SrcAlign,
- bool IsVolatile) {
- auto &MF = *MI.getParent()->getParent();
- const auto &TLI = *MF.getSubtarget().getTargetLowering();
- auto &DL = MF.getDataLayout();
- LLVMContext &C = MF.getFunction().getContext();
-
- assert(KnownLen != 0 && "Have a zero length memmove length!");
-
- bool DstAlignCanChange = false;
- MachineFrameInfo &MFI = MF.getFrameInfo();
- bool OptSize = shouldLowerMemFuncForSize(MF);
- Align Alignment = commonAlignment(DstAlign, SrcAlign);
-
- MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
- if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
- DstAlignCanChange = true;
-
- unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
- std::vector<LLT> MemOps;
-
- const auto &DstMMO = **MI.memoperands_begin();
- const auto &SrcMMO = **std::next(MI.memoperands_begin());
- MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
- MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
-
- // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
- // to a bug in it's findOptimalMemOpLowering implementation. For now do the
- // same thing here.
- if (!findGISelOptimalMemOpLowering(
- MemOps, Limit,
- MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
- /*IsVolatile*/ true),
- DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
- MF.getFunction().getAttributes(), TLI))
- return false;
-
- if (DstAlignCanChange) {
- // Get an estimate of the type from the LLT.
- Type *IRTy = getTypeForLLT(MemOps[0], C);
- Align NewAlign = DL.getABITypeAlign(IRTy);
-
- // Don't promote to an alignment that would require dynamic stack
- // realignment.
- const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
- if (!TRI->hasStackRealignment(MF))
- while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
- NewAlign = NewAlign / 2;
-
- if (NewAlign > Alignment) {
- Alignment = NewAlign;
- unsigned FI = FIDef->getOperand(1).getIndex();
- // Give the stack frame object a larger alignment if needed.
- if (MFI.getObjectAlign(FI) < Alignment)
- MFI.setObjectAlignment(FI, Alignment);
- }
- }
-
- LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
-
- MachineIRBuilder MIB(MI);
- // Memmove requires that we perform the loads first before issuing the stores.
- // Apart from that, this loop is pretty much doing the same thing as the
- // memcpy codegen function.
- unsigned CurrOffset = 0;
- LLT PtrTy = MRI.getType(Src);
- SmallVector<Register, 16> LoadVals;
- for (auto CopyTy : MemOps) {
- // Construct MMO for the load.
- auto *LoadMMO =
- MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
-
- // Create the load.
- Register LoadPtr = Src;
- if (CurrOffset != 0) {
- auto Offset =
- MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
- LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
- }
- LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
- CurrOffset += CopyTy.getSizeInBytes();
- }
-
- CurrOffset = 0;
- for (unsigned I = 0; I < MemOps.size(); ++I) {
- LLT CopyTy = MemOps[I];
- // Now store the values loaded.
- auto *StoreMMO =
- MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
-
- Register StorePtr = Dst;
- if (CurrOffset != 0) {
- auto Offset =
- MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
- StorePtr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
- }
- MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
- CurrOffset += CopyTy.getSizeInBytes();
- }
- MI.eraseFromParent();
- return true;
+ MachineIRBuilder HelperBuilder(MI);
+ GISelObserverWrapper DummyObserver;
+ LegalizerHelper Helper(HelperBuilder.getMF(), DummyObserver, HelperBuilder);
+ return Helper.lowerMemcpyInline(MI) ==
+ LegalizerHelper::LegalizeResult::Legalized;
}
bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
- const unsigned Opc = MI.getOpcode();
- // This combine is fairly complex so it's not written with a separate
- // matcher function.
- assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
- Opc == TargetOpcode::G_MEMSET) && "Expected memcpy like instruction");
-
- auto MMOIt = MI.memoperands_begin();
- const MachineMemOperand *MemOp = *MMOIt;
-
- Align DstAlign = MemOp->getBaseAlign();
- Align SrcAlign;
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- Register Len = MI.getOperand(2).getReg();
-
- if (Opc != TargetOpcode::G_MEMSET) {
- assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
- MemOp = *(++MMOIt);
- SrcAlign = MemOp->getBaseAlign();
- }
-
- // See if this is a constant length copy
- auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI);
- if (!LenVRegAndVal)
- return false; // Leave it to the legalizer to lower it to a libcall.
- uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
-
- if (KnownLen == 0) {
- MI.eraseFromParent();
- return true;
- }
-
- bool IsVolatile = MemOp->isVolatile();
- if (Opc == TargetOpcode::G_MEMCPY_INLINE)
- return tryEmitMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
- IsVolatile);
-
- // Don't try to optimize volatile.
- if (IsVolatile)
- return false;
-
- if (MaxLen && KnownLen > MaxLen)
- return false;
-
- if (Opc == TargetOpcode::G_MEMCPY) {
- auto &MF = *MI.getParent()->getParent();
- const auto &TLI = *MF.getSubtarget().getTargetLowering();
- bool OptSize = shouldLowerMemFuncForSize(MF);
- uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
- return optimizeMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
- IsVolatile);
- }
- if (Opc == TargetOpcode::G_MEMMOVE)
- return optimizeMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
- if (Opc == TargetOpcode::G_MEMSET)
- return optimizeMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
- return false;
+ MachineIRBuilder HelperBuilder(MI);
+ GISelObserverWrapper DummyObserver;
+ LegalizerHelper Helper(HelperBuilder.getMF(), DummyObserver, HelperBuilder);
+ return Helper.lowerMemCpyFamily(MI, MaxLen) ==
+ LegalizerHelper::LegalizeResult::Legalized;
}
static Optional<APFloat> constantFoldFpUnary(unsigned Opcode, LLT DstTy,
@@ -1706,30 +1301,52 @@ bool CombinerHelper::matchPtrAddImmedChain(MachineInstr &MI,
Register Add2 = MI.getOperand(1).getReg();
Register Imm1 = MI.getOperand(2).getReg();
- auto MaybeImmVal = getConstantVRegValWithLookThrough(Imm1, MRI);
+ auto MaybeImmVal = getIConstantVRegValWithLookThrough(Imm1, MRI);
if (!MaybeImmVal)
return false;
- // Don't do this combine if there multiple uses of the first PTR_ADD,
- // since we may be able to compute the second PTR_ADD as an immediate
- // offset anyway. Folding the first offset into the second may cause us
- // to go beyond the bounds of our legal addressing modes.
- if (!MRI.hasOneNonDBGUse(Add2))
- return false;
-
- MachineInstr *Add2Def = MRI.getUniqueVRegDef(Add2);
+ MachineInstr *Add2Def = MRI.getVRegDef(Add2);
if (!Add2Def || Add2Def->getOpcode() != TargetOpcode::G_PTR_ADD)
return false;
Register Base = Add2Def->getOperand(1).getReg();
Register Imm2 = Add2Def->getOperand(2).getReg();
- auto MaybeImm2Val = getConstantVRegValWithLookThrough(Imm2, MRI);
+ auto MaybeImm2Val = getIConstantVRegValWithLookThrough(Imm2, MRI);
if (!MaybeImm2Val)
return false;
+ // Check if the new combined immediate forms an illegal addressing mode.
+ // Do not combine if it was legal before but would get illegal.
+ // To do so, we need to find a load/store user of the pointer to get
+ // the access type.
+ Type *AccessTy = nullptr;
+ auto &MF = *MI.getMF();
+ for (auto &UseMI : MRI.use_nodbg_instructions(MI.getOperand(0).getReg())) {
+ if (auto *LdSt = dyn_cast<GLoadStore>(&UseMI)) {
+ AccessTy = getTypeForLLT(MRI.getType(LdSt->getReg(0)),
+ MF.getFunction().getContext());
+ break;
+ }
+ }
+ TargetLoweringBase::AddrMode AMNew;
+ APInt CombinedImm = MaybeImmVal->Value + MaybeImm2Val->Value;
+ AMNew.BaseOffs = CombinedImm.getSExtValue();
+ if (AccessTy) {
+ AMNew.HasBaseReg = true;
+ TargetLoweringBase::AddrMode AMOld;
+ AMOld.BaseOffs = MaybeImm2Val->Value.getSExtValue();
+ AMOld.HasBaseReg = true;
+ unsigned AS = MRI.getType(Add2).getAddressSpace();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+ if (TLI.isLegalAddressingMode(MF.getDataLayout(), AMOld, AccessTy, AS) &&
+ !TLI.isLegalAddressingMode(MF.getDataLayout(), AMNew, AccessTy, AS))
+ return false;
+ }
+
// Pass the combined immediate to the apply function.
- MatchInfo.Imm = (MaybeImmVal->Value + MaybeImm2Val->Value).getSExtValue();
+ MatchInfo.Imm = AMNew.BaseOffs;
MatchInfo.Base = Base;
+ MatchInfo.Bank = getRegBank(Imm2);
return true;
}
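// [Editorial sketch - not part of this patch] The new check above folds
// (p + C2) + C1 into p + (C1 + C2) only when the combined immediate is still
// a legal addressing-mode offset for a load/store user. A minimal standalone
// illustration, assuming a hypothetical target whose offsets must fit in a
// signed 12-bit immediate (the real test is TLI.isLegalAddressingMode):
#include <cassert>
#include <cstdint>

// Hypothetical legality predicate; stands in for isLegalAddressingMode.
static bool fitsSigned12(int64_t Off) { return Off >= -2048 && Off <= 2047; }

// Returns true if the two G_PTR_ADD immediates may be folded into one.
static bool canFoldPtrAddImms(int64_t OuterImm, int64_t InnerImm) {
  int64_t Combined = OuterImm + InnerImm;
  // Don't combine if the inner offset was legal but the combined one is not.
  if (fitsSigned12(InnerImm) && !fitsSigned12(Combined))
    return false;
  return true;
}

int main() {
  assert(canFoldPtrAddImms(8, 16));       // 24 still fits: fold.
  assert(!canFoldPtrAddImms(4000, 1024)); // 1024 fits, 5024 does not: keep.
  return 0;
}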
@@ -1739,6 +1356,7 @@ void CombinerHelper::applyPtrAddImmedChain(MachineInstr &MI,
MachineIRBuilder MIB(MI);
LLT OffsetTy = MRI.getType(MI.getOperand(2).getReg());
auto NewOffset = MIB.buildConstant(OffsetTy, MatchInfo.Imm);
+ setRegBank(NewOffset.getReg(0), MatchInfo.Bank);
Observer.changingInstr(MI);
MI.getOperand(1).setReg(MatchInfo.Base);
MI.getOperand(2).setReg(NewOffset.getReg(0));
@@ -1762,7 +1380,7 @@ bool CombinerHelper::matchShiftImmedChain(MachineInstr &MI,
Register Shl2 = MI.getOperand(1).getReg();
Register Imm1 = MI.getOperand(2).getReg();
- auto MaybeImmVal = getConstantVRegValWithLookThrough(Imm1, MRI);
+ auto MaybeImmVal = getIConstantVRegValWithLookThrough(Imm1, MRI);
if (!MaybeImmVal)
return false;
@@ -1772,7 +1390,7 @@ bool CombinerHelper::matchShiftImmedChain(MachineInstr &MI,
Register Base = Shl2Def->getOperand(1).getReg();
Register Imm2 = Shl2Def->getOperand(2).getReg();
- auto MaybeImm2Val = getConstantVRegValWithLookThrough(Imm2, MRI);
+ auto MaybeImm2Val = getIConstantVRegValWithLookThrough(Imm2, MRI);
if (!MaybeImm2Val)
return false;
@@ -1856,7 +1474,7 @@ bool CombinerHelper::matchShiftOfShiftedLogic(MachineInstr &MI,
// Find a matching one-use shift by constant.
const Register C1 = MI.getOperand(2).getReg();
- auto MaybeImmVal = getConstantVRegValWithLookThrough(C1, MRI);
+ auto MaybeImmVal = getIConstantVRegValWithLookThrough(C1, MRI);
if (!MaybeImmVal)
return false;
@@ -1870,7 +1488,7 @@ bool CombinerHelper::matchShiftOfShiftedLogic(MachineInstr &MI,
// Must be a constant.
auto MaybeImmVal =
- getConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
+ getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
if (!MaybeImmVal)
return false;
@@ -1932,8 +1550,8 @@ void CombinerHelper::applyShiftOfShiftedLogic(MachineInstr &MI,
Builder.buildInstr(MatchInfo.Logic->getOpcode(), {Dest}, {Shift1, Shift2});
// These were one use so it's safe to remove them.
- MatchInfo.Shift2->eraseFromParent();
- MatchInfo.Logic->eraseFromParent();
+ MatchInfo.Shift2->eraseFromParentAndMarkDBGValuesForRemoval();
+ MatchInfo.Logic->eraseFromParentAndMarkDBGValuesForRemoval();
MI.eraseFromParent();
}
@@ -1942,7 +1560,7 @@ bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI,
unsigned &ShiftVal) {
assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
auto MaybeImmVal =
- getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+ getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
if (!MaybeImmVal)
return false;
@@ -1977,7 +1595,7 @@ bool CombinerHelper::matchCombineShlOfExtend(MachineInstr &MI,
// TODO: Should handle vector splat.
Register RHS = MI.getOperand(2).getReg();
- auto MaybeShiftAmtVal = getConstantVRegValWithLookThrough(RHS, MRI);
+ auto MaybeShiftAmtVal = getIConstantVRegValWithLookThrough(RHS, MRI);
if (!MaybeShiftAmtVal)
return false;
@@ -2045,26 +1663,23 @@ bool CombinerHelper::matchCombineUnmergeMergeToPlainValues(
MachineInstr &MI, SmallVectorImpl<Register> &Operands) {
assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
"Expected an unmerge");
- Register SrcReg =
- peekThroughBitcast(MI.getOperand(MI.getNumOperands() - 1).getReg(), MRI);
+ auto &Unmerge = cast<GUnmerge>(MI);
+ Register SrcReg = peekThroughBitcast(Unmerge.getSourceReg(), MRI);
- MachineInstr *SrcInstr = MRI.getVRegDef(SrcReg);
- if (SrcInstr->getOpcode() != TargetOpcode::G_MERGE_VALUES &&
- SrcInstr->getOpcode() != TargetOpcode::G_BUILD_VECTOR &&
- SrcInstr->getOpcode() != TargetOpcode::G_CONCAT_VECTORS)
+ auto *SrcInstr = getOpcodeDef<GMergeLikeOp>(SrcReg, MRI);
+ if (!SrcInstr)
return false;
// Check the source type of the merge.
- LLT SrcMergeTy = MRI.getType(SrcInstr->getOperand(1).getReg());
- LLT Dst0Ty = MRI.getType(MI.getOperand(0).getReg());
+ LLT SrcMergeTy = MRI.getType(SrcInstr->getSourceReg(0));
+ LLT Dst0Ty = MRI.getType(Unmerge.getReg(0));
bool SameSize = Dst0Ty.getSizeInBits() == SrcMergeTy.getSizeInBits();
if (SrcMergeTy != Dst0Ty && !SameSize)
return false;
// They are the same now (modulo a bitcast).
// We can collect all the src registers.
- for (unsigned Idx = 1, EndIdx = SrcInstr->getNumOperands(); Idx != EndIdx;
- ++Idx)
- Operands.push_back(SrcInstr->getOperand(Idx).getReg());
+ for (unsigned Idx = 0; Idx < SrcInstr->getNumSources(); ++Idx)
+ Operands.push_back(SrcInstr->getSourceReg(Idx));
return true;
}
@@ -2241,7 +1856,7 @@ bool CombinerHelper::matchCombineShiftToUnmerge(MachineInstr &MI,
return false;
auto MaybeImmVal =
- getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+ getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
if (!MaybeImmVal)
return false;
@@ -2410,12 +2025,12 @@ void CombinerHelper::applyCombineAddP2IToPtrAdd(
bool CombinerHelper::matchCombineConstPtrAddToI2P(MachineInstr &MI,
int64_t &NewCst) {
- assert(MI.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected a G_PTR_ADD");
- Register LHS = MI.getOperand(1).getReg();
- Register RHS = MI.getOperand(2).getReg();
+ auto &PtrAdd = cast<GPtrAdd>(MI);
+ Register LHS = PtrAdd.getBaseReg();
+ Register RHS = PtrAdd.getOffsetReg();
MachineRegisterInfo &MRI = Builder.getMF().getRegInfo();
- if (auto RHSCst = getConstantVRegSExtVal(RHS, MRI)) {
+ if (auto RHSCst = getIConstantVRegSExtVal(RHS, MRI)) {
int64_t Cst;
if (mi_match(LHS, MRI, m_GIntToPtr(m_ICst(Cst)))) {
NewCst = Cst + *RHSCst;
@@ -2428,12 +2043,12 @@ bool CombinerHelper::matchCombineConstPtrAddToI2P(MachineInstr &MI,
void CombinerHelper::applyCombineConstPtrAddToI2P(MachineInstr &MI,
int64_t &NewCst) {
- assert(MI.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected a G_PTR_ADD");
- Register Dst = MI.getOperand(0).getReg();
+ auto &PtrAdd = cast<GPtrAdd>(MI);
+ Register Dst = PtrAdd.getReg(0);
Builder.setInstrAndDebugLoc(MI);
Builder.buildConstant(Dst, NewCst);
- MI.eraseFromParent();
+ PtrAdd.eraseFromParent();
}
bool CombinerHelper::matchCombineAnyExtTrunc(MachineInstr &MI, Register &Reg) {
@@ -2536,6 +2151,23 @@ bool CombinerHelper::matchCombineFAbsOfFAbs(MachineInstr &MI, Register &Src) {
return mi_match(Src, MRI, m_GFabs(m_Reg(AbsSrc)));
}
+bool CombinerHelper::matchCombineFAbsOfFNeg(MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_FABS && "Expected a G_FABS");
+ Register Src = MI.getOperand(1).getReg();
+ Register NegSrc;
+
+ if (!mi_match(Src, MRI, m_GFNeg(m_Reg(NegSrc))))
+ return false;
+
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(NegSrc);
+ Observer.changedInstr(MI);
+ };
+ return true;
+}
+
bool CombinerHelper::matchCombineTruncOfExt(
MachineInstr &MI, std::pair<Register, unsigned> &MatchInfo) {
assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Expected a G_TRUNC");
@@ -2587,7 +2219,7 @@ bool CombinerHelper::matchCombineTruncOfShl(
{DstTy, getTargetLowering().getPreferredShiftAmountTy(DstTy)}})) {
KnownBits Known = KB->getKnownBits(ShiftAmt);
unsigned Size = DstTy.getSizeInBits();
- if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) {
+ if (Known.countMaxActiveBits() <= Log2_32(Size)) {
MatchInfo = std::make_pair(ShiftSrc, ShiftAmt);
return true;
}
@@ -2644,13 +2276,13 @@ bool CombinerHelper::matchUndefSelectCmp(MachineInstr &MI) {
}
bool CombinerHelper::matchConstantSelectCmp(MachineInstr &MI, unsigned &OpIdx) {
- assert(MI.getOpcode() == TargetOpcode::G_SELECT);
- if (auto MaybeCstCmp =
- getConstantVRegValWithLookThrough(MI.getOperand(1).getReg(), MRI)) {
- OpIdx = MaybeCstCmp->Value.isNullValue() ? 3 : 2;
- return true;
- }
- return false;
+ GSelect &SelMI = cast<GSelect>(MI);
+ auto Cst =
+ isConstantOrConstantSplatVector(*MRI.getVRegDef(SelMI.getCondReg()), MRI);
+ if (!Cst)
+ return false;
+ OpIdx = Cst->isZero() ? 3 : 2;
+ return true;
}
bool CombinerHelper::eraseInst(MachineInstr &MI) {
@@ -2662,12 +2294,14 @@ bool CombinerHelper::matchEqualDefs(const MachineOperand &MOP1,
const MachineOperand &MOP2) {
if (!MOP1.isReg() || !MOP2.isReg())
return false;
- MachineInstr *I1 = getDefIgnoringCopies(MOP1.getReg(), MRI);
- if (!I1)
+ auto InstAndDef1 = getDefSrcRegIgnoringCopies(MOP1.getReg(), MRI);
+ if (!InstAndDef1)
return false;
- MachineInstr *I2 = getDefIgnoringCopies(MOP2.getReg(), MRI);
- if (!I2)
+ auto InstAndDef2 = getDefSrcRegIgnoringCopies(MOP2.getReg(), MRI);
+ if (!InstAndDef2)
return false;
+ MachineInstr *I1 = InstAndDef1->MI;
+ MachineInstr *I2 = InstAndDef2->MI;
// Handle a case like this:
//
@@ -2727,15 +2361,26 @@ bool CombinerHelper::matchEqualDefs(const MachineOperand &MOP1,
//
// On the off-chance that there's some target instruction feeding into the
// instruction, let's use produceSameValue instead of isIdenticalTo.
- return Builder.getTII().produceSameValue(*I1, *I2, &MRI);
+ if (Builder.getTII().produceSameValue(*I1, *I2, &MRI)) {
+ // Handle instructions with multiple defs that produce same values. Values
+ // are same for operands with same index.
+ // %0:_(s8), %1:_(s8), %2:_(s8), %3:_(s8) = G_UNMERGE_VALUES %4:_(<4 x s8>)
+ // %5:_(s8), %6:_(s8), %7:_(s8), %8:_(s8) = G_UNMERGE_VALUES %4:_(<4 x s8>)
+ // I1 and I2 are different instructions but produce same values,
+ // %1 and %6 are same, %1 and %7 are not the same value.
+ return I1->findRegisterDefOperandIdx(InstAndDef1->Reg) ==
+ I2->findRegisterDefOperandIdx(InstAndDef2->Reg);
+ }
+ return false;
}
bool CombinerHelper::matchConstantOp(const MachineOperand &MOP, int64_t C) {
if (!MOP.isReg())
return false;
- // MIPatternMatch doesn't let us look through G_ZEXT etc.
- auto ValAndVReg = getConstantVRegValWithLookThrough(MOP.getReg(), MRI);
- return ValAndVReg && ValAndVReg->Value == C;
+ auto *MI = MRI.getVRegDef(MOP.getReg());
+ auto MaybeCst = isConstantOrConstantSplatVector(*MI, MRI);
+ return MaybeCst.hasValue() && MaybeCst->getBitWidth() <= 64 &&
+ MaybeCst->getSExtValue() == C;
}
bool CombinerHelper::replaceSingleDefInstWithOperand(MachineInstr &MI,
@@ -3115,14 +2760,14 @@ bool CombinerHelper::matchRedundantAnd(MachineInstr &MI,
//
// Check if we can replace AndDst with the LHS of the G_AND
if (canReplaceReg(AndDst, LHS, MRI) &&
- (LHSBits.Zero | RHSBits.One).isAllOnesValue()) {
+ (LHSBits.Zero | RHSBits.One).isAllOnes()) {
Replacement = LHS;
return true;
}
// Check if we can replace AndDst with the RHS of the G_AND
if (canReplaceReg(AndDst, RHS, MRI) &&
- (LHSBits.One | RHSBits.Zero).isAllOnesValue()) {
+ (LHSBits.One | RHSBits.Zero).isAllOnes()) {
Replacement = RHS;
return true;
}
@@ -3161,14 +2806,14 @@ bool CombinerHelper::matchRedundantOr(MachineInstr &MI, Register &Replacement) {
//
// Check if we can replace OrDst with the LHS of the G_OR
if (canReplaceReg(OrDst, LHS, MRI) &&
- (LHSBits.One | RHSBits.Zero).isAllOnesValue()) {
+ (LHSBits.One | RHSBits.Zero).isAllOnes()) {
Replacement = LHS;
return true;
}
// Check if we can replace OrDst with the RHS of the G_OR
if (canReplaceReg(OrDst, RHS, MRI) &&
- (LHSBits.Zero | RHSBits.One).isAllOnesValue()) {
+ (LHSBits.Zero | RHSBits.One).isAllOnes()) {
Replacement = RHS;
return true;
}
@@ -3346,7 +2991,8 @@ void CombinerHelper::applyXorOfAndWithSameReg(
}
bool CombinerHelper::matchPtrAddZero(MachineInstr &MI) {
- Register DstReg = MI.getOperand(0).getReg();
+ auto &PtrAdd = cast<GPtrAdd>(MI);
+ Register DstReg = PtrAdd.getReg(0);
LLT Ty = MRI.getType(DstReg);
const DataLayout &DL = Builder.getMF().getDataLayout();
@@ -3354,20 +3000,20 @@ bool CombinerHelper::matchPtrAddZero(MachineInstr &MI) {
return false;
if (Ty.isPointer()) {
- auto ConstVal = getConstantVRegVal(MI.getOperand(1).getReg(), MRI);
+ auto ConstVal = getIConstantVRegVal(PtrAdd.getBaseReg(), MRI);
return ConstVal && *ConstVal == 0;
}
assert(Ty.isVector() && "Expecting a vector type");
- const MachineInstr *VecMI = MRI.getVRegDef(MI.getOperand(1).getReg());
+ const MachineInstr *VecMI = MRI.getVRegDef(PtrAdd.getBaseReg());
return isBuildVectorAllZeros(*VecMI, MRI);
}
void CombinerHelper::applyPtrAddZero(MachineInstr &MI) {
- assert(MI.getOpcode() == TargetOpcode::G_PTR_ADD);
- Builder.setInstrAndDebugLoc(MI);
- Builder.buildIntToPtr(MI.getOperand(0), MI.getOperand(2));
- MI.eraseFromParent();
+ auto &PtrAdd = cast<GPtrAdd>(MI);
+ Builder.setInstrAndDebugLoc(PtrAdd);
+ Builder.buildIntToPtr(PtrAdd.getReg(0), PtrAdd.getOffsetReg());
+ PtrAdd.eraseFromParent();
}
/// The second source operand is known to be a power of 2.
@@ -3704,10 +3350,8 @@ bool CombinerHelper::matchLoadOrCombine(
// may not use index 0.
Register Ptr = LowestIdxLoad->getPointerReg();
const MachineMemOperand &MMO = LowestIdxLoad->getMMO();
- LegalityQuery::MemDesc MMDesc;
+ LegalityQuery::MemDesc MMDesc(MMO);
MMDesc.MemoryTy = Ty;
- MMDesc.AlignInBits = MMO.getAlign().value() * 8;
- MMDesc.Ordering = MMO.getSuccessOrdering();
if (!isLegalOrBeforeLegalizer(
{TargetOpcode::G_LOAD, {Ty, MRI.getType(Ptr)}, {MMDesc}}))
return false;
@@ -3732,6 +3376,274 @@ bool CombinerHelper::matchLoadOrCombine(
return true;
}
+/// Check if the store \p Store is a truncstore that can be merged. That is,
+/// it's a store of a shifted value of \p SrcVal. If \p SrcVal is an empty
+/// Register then it does not need to match and SrcVal is set to the source
+/// value found.
+/// On match, returns the start byte offset of the \p SrcVal that is being
+/// stored.
+static Optional<int64_t> getTruncStoreByteOffset(GStore &Store, Register &SrcVal,
+ MachineRegisterInfo &MRI) {
+ Register TruncVal;
+ if (!mi_match(Store.getValueReg(), MRI, m_GTrunc(m_Reg(TruncVal))))
+ return None;
+
+ // The shift amount must be a constant multiple of the narrow type's width;
+ // it translates to the index of the stored piece within the wide source value "y".
+ //
+ // x = G_LSHR y, ShiftAmtC
+ // s8 z = G_TRUNC x
+ // store z, ...
+ Register FoundSrcVal;
+ int64_t ShiftAmt;
+ if (!mi_match(TruncVal, MRI,
+ m_any_of(m_GLShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt)),
+ m_GAShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt))))) {
+ if (!SrcVal.isValid() || TruncVal == SrcVal) {
+ if (!SrcVal.isValid())
+ SrcVal = TruncVal;
+ return 0; // If it's the lowest index store.
+ }
+ return None;
+ }
+
+ unsigned NarrowBits = Store.getMMO().getMemoryType().getScalarSizeInBits();
+ if (ShiftAmt % NarrowBits != 0)
+ return None;
+ const unsigned Offset = ShiftAmt / NarrowBits;
+
+ if (SrcVal.isValid() && FoundSrcVal != SrcVal)
+ return None;
+
+ if (!SrcVal.isValid())
+ SrcVal = FoundSrcVal;
+ else if (MRI.getType(SrcVal) != MRI.getType(FoundSrcVal))
+ return None;
+ return Offset;
+}
+
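// [Editorial sketch - not part of this patch] getTruncStoreByteOffset maps a
// "store (trunc (lshr %wide, ShiftAmt))" to the index of the narrow piece
// inside %wide: Offset = ShiftAmt / NarrowBits, and the shift must be a whole
// multiple of the narrow width. A standalone check of that arithmetic for a
// 32-bit value stored as four 8-bit pieces:
#include <cassert>
#include <cstdint>
#include <optional>

static std::optional<unsigned> byteIndexForShift(unsigned ShiftAmt,
                                                 unsigned NarrowBits) {
  if (ShiftAmt % NarrowBits != 0)
    return std::nullopt; // Not a clean piece boundary; no merge.
  return ShiftAmt / NarrowBits;
}

int main() {
  uint32_t Wide = 0x11223344;
  for (unsigned Idx = 0; Idx < 4; ++Idx) {
    unsigned ShiftAmt = Idx * 8;
    assert(byteIndexForShift(ShiftAmt, 8) == Idx);
    // The piece stored at index Idx is exactly byte Idx of the wide value.
    assert(uint8_t(Wide >> ShiftAmt) == uint8_t(Wide >> (8 * Idx)));
  }
  assert(!byteIndexForShift(12, 8)); // Mid-byte shifts never match.
  return 0;
}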
+/// Match a pattern where a wide type scalar value is stored by several narrow
+/// stores. Fold it into a single store or a BSWAP and a store if the target
+/// supports it.
+///
+/// Assuming little endian target:
+/// i8 *p = ...
+/// i32 val = ...
+/// p[0] = (val >> 0) & 0xFF;
+/// p[1] = (val >> 8) & 0xFF;
+/// p[2] = (val >> 16) & 0xFF;
+/// p[3] = (val >> 24) & 0xFF;
+/// =>
+/// *((i32)p) = val;
+///
+/// i8 *p = ...
+/// i32 val = ...
+/// p[0] = (val >> 24) & 0xFF;
+/// p[1] = (val >> 16) & 0xFF;
+/// p[2] = (val >> 8) & 0xFF;
+/// p[3] = (val >> 0) & 0xFF;
+/// =>
+/// *((i32)p) = BSWAP(val);
+bool CombinerHelper::matchTruncStoreMerge(MachineInstr &MI,
+ MergeTruncStoresInfo &MatchInfo) {
+ auto &StoreMI = cast<GStore>(MI);
+ LLT MemTy = StoreMI.getMMO().getMemoryType();
+
+ // We only handle merging simple stores of 1-4 bytes.
+ if (!MemTy.isScalar())
+ return false;
+ switch (MemTy.getSizeInBits()) {
+ case 8:
+ case 16:
+ case 32:
+ break;
+ default:
+ return false;
+ }
+ if (!StoreMI.isSimple())
+ return false;
+
+ // We do a simple search for mergeable stores prior to this one.
+ // Any potential alias hazard along the way terminates the search.
+ SmallVector<GStore *> FoundStores;
+
+ // We're looking for:
+ // 1) a (store(trunc(...)))
+ // 2) of an LSHR/ASHR of a single wide value, by the appropriate shift to get
+ // the partial value stored.
+ // 3) where the offsets form either a little or big-endian sequence.
+
+ auto &LastStore = StoreMI;
+
+ // The single base pointer that all stores must use.
+ Register BaseReg;
+ int64_t LastOffset;
+ if (!mi_match(LastStore.getPointerReg(), MRI,
+ m_GPtrAdd(m_Reg(BaseReg), m_ICst(LastOffset)))) {
+ BaseReg = LastStore.getPointerReg();
+ LastOffset = 0;
+ }
+
+ GStore *LowestIdxStore = &LastStore;
+ int64_t LowestIdxOffset = LastOffset;
+
+ Register WideSrcVal;
+ auto LowestShiftAmt = getTruncStoreByteOffset(LastStore, WideSrcVal, MRI);
+ if (!LowestShiftAmt)
+ return false; // Didn't match a trunc.
+ assert(WideSrcVal.isValid());
+
+ LLT WideStoreTy = MRI.getType(WideSrcVal);
+ // The wide type might not be a multiple of the memory type, e.g. s48 and s32.
+ if (WideStoreTy.getSizeInBits() % MemTy.getSizeInBits() != 0)
+ return false;
+ const unsigned NumStoresRequired =
+ WideStoreTy.getSizeInBits() / MemTy.getSizeInBits();
+
+ SmallVector<int64_t, 8> OffsetMap(NumStoresRequired, INT64_MAX);
+ OffsetMap[*LowestShiftAmt] = LastOffset;
+ FoundStores.emplace_back(&LastStore);
+
+ // Search the block up for more stores.
+ // We use a search threshold of 10 instructions here because the combiner
+ // works top-down within a block, and we don't want to search an unbounded
+ // number of predecessor instructions trying to find matching stores.
+ // If we moved this optimization into a separate pass then we could probably
+ // use a more efficient search without having a hard-coded threshold.
+ const int MaxInstsToCheck = 10;
+ int NumInstsChecked = 0;
+ for (auto II = ++LastStore.getReverseIterator();
+ II != LastStore.getParent()->rend() && NumInstsChecked < MaxInstsToCheck;
+ ++II) {
+ NumInstsChecked++;
+ GStore *NewStore;
+ if ((NewStore = dyn_cast<GStore>(&*II))) {
+ if (NewStore->getMMO().getMemoryType() != MemTy || !NewStore->isSimple())
+ break;
+ } else if (II->isLoadFoldBarrier() || II->mayLoad()) {
+ break;
+ } else {
+ continue; // This is a safe instruction we can look past.
+ }
+
+ Register NewBaseReg;
+ int64_t MemOffset;
+ // Check we're storing to the same base + some offset.
+ if (!mi_match(NewStore->getPointerReg(), MRI,
+ m_GPtrAdd(m_Reg(NewBaseReg), m_ICst(MemOffset)))) {
+ NewBaseReg = NewStore->getPointerReg();
+ MemOffset = 0;
+ }
+ if (BaseReg != NewBaseReg)
+ break;
+
+ auto ShiftByteOffset = getTruncStoreByteOffset(*NewStore, WideSrcVal, MRI);
+ if (!ShiftByteOffset)
+ break;
+ if (MemOffset < LowestIdxOffset) {
+ LowestIdxOffset = MemOffset;
+ LowestIdxStore = NewStore;
+ }
+
+ // Map the offset within the combined value to the store's memory offset,
+ // and bail out if this position has already been filled.
+ if (*ShiftByteOffset < 0 || *ShiftByteOffset >= NumStoresRequired ||
+ OffsetMap[*ShiftByteOffset] != INT64_MAX)
+ break;
+ OffsetMap[*ShiftByteOffset] = MemOffset;
+
+ FoundStores.emplace_back(NewStore);
+ // Reset counter since we've found a matching inst.
+ NumInstsChecked = 0;
+ if (FoundStores.size() == NumStoresRequired)
+ break;
+ }
+
+ if (FoundStores.size() != NumStoresRequired) {
+ return false;
+ }
+
+ const auto &DL = LastStore.getMF()->getDataLayout();
+ auto &C = LastStore.getMF()->getFunction().getContext();
+ // Check that a store of the wide type is both allowed and fast on the target
+ bool Fast = false;
+ bool Allowed = getTargetLowering().allowsMemoryAccess(
+ C, DL, WideStoreTy, LowestIdxStore->getMMO(), &Fast);
+ if (!Allowed || !Fast)
+ return false;
+
+ // Check if the pieces of the value are going to the expected places in memory
+ // to merge the stores.
+ unsigned NarrowBits = MemTy.getScalarSizeInBits();
+ auto checkOffsets = [&](bool MatchLittleEndian) {
+ if (MatchLittleEndian) {
+ for (unsigned i = 0; i != NumStoresRequired; ++i)
+ if (OffsetMap[i] != i * (NarrowBits / 8) + LowestIdxOffset)
+ return false;
+ } else { // MatchBigEndian by reversing loop counter.
+ for (unsigned i = 0, j = NumStoresRequired - 1; i != NumStoresRequired;
+ ++i, --j)
+ if (OffsetMap[j] != i * (NarrowBits / 8) + LowestIdxOffset)
+ return false;
+ }
+ return true;
+ };
+
+ // Check if the offsets line up for the native data layout of this target.
+ bool NeedBswap = false;
+ bool NeedRotate = false;
+ if (!checkOffsets(DL.isLittleEndian())) {
+ // Special-case: check if byte offsets line up for the opposite endian.
+ if (NarrowBits == 8 && checkOffsets(DL.isBigEndian()))
+ NeedBswap = true;
+ else if (NumStoresRequired == 2 && checkOffsets(DL.isBigEndian()))
+ NeedRotate = true;
+ else
+ return false;
+ }
+
+ if (NeedBswap &&
+ !isLegalOrBeforeLegalizer({TargetOpcode::G_BSWAP, {WideStoreTy}}))
+ return false;
+ if (NeedRotate &&
+ !isLegalOrBeforeLegalizer({TargetOpcode::G_ROTR, {WideStoreTy}}))
+ return false;
+
+ MatchInfo.NeedBSwap = NeedBswap;
+ MatchInfo.NeedRotate = NeedRotate;
+ MatchInfo.LowestIdxStore = LowestIdxStore;
+ MatchInfo.WideSrcVal = WideSrcVal;
+ MatchInfo.FoundStores = std::move(FoundStores);
+ return true;
+}
+
+void CombinerHelper::applyTruncStoreMerge(MachineInstr &MI,
+ MergeTruncStoresInfo &MatchInfo) {
+
+ Builder.setInstrAndDebugLoc(MI);
+ Register WideSrcVal = MatchInfo.WideSrcVal;
+ LLT WideStoreTy = MRI.getType(WideSrcVal);
+
+ if (MatchInfo.NeedBSwap) {
+ WideSrcVal = Builder.buildBSwap(WideStoreTy, WideSrcVal).getReg(0);
+ } else if (MatchInfo.NeedRotate) {
+ assert(WideStoreTy.getSizeInBits() % 2 == 0 &&
+ "Unexpected type for rotate");
+ auto RotAmt =
+ Builder.buildConstant(WideStoreTy, WideStoreTy.getSizeInBits() / 2);
+ WideSrcVal =
+ Builder.buildRotateRight(WideStoreTy, WideSrcVal, RotAmt).getReg(0);
+ }
+
+ Builder.buildStore(WideSrcVal, MatchInfo.LowestIdxStore->getPointerReg(),
+ MatchInfo.LowestIdxStore->getMMO().getPointerInfo(),
+ MatchInfo.LowestIdxStore->getMMO().getAlign());
+
+ // Erase the old stores.
+ for (auto *ST : MatchInfo.FoundStores)
+ ST->eraseFromParent();
+}
+
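// [Editorial sketch - not part of this patch] The merge above replaces N
// narrow stores with one wide store, optionally preceded by G_BSWAP (pieces
// in reversed byte order) or G_ROTR by half the width (two halves swapped).
// A standalone demonstration of those two identities, assuming a
// little-endian host (mirroring the DL.isLittleEndian() check above):
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t Val = 0x11223344;

  // Bytes stored in reversed order == one store of __builtin_bswap32(Val).
  uint8_t RevBytes[4], Swapped[4];
  for (int I = 0; I < 4; ++I)
    RevBytes[I] = uint8_t(Val >> (24 - 8 * I)); // p[0]=MSB ... p[3]=LSB
  uint32_t BSwapped = __builtin_bswap32(Val);
  std::memcpy(Swapped, &BSwapped, 4);
  assert(std::memcmp(RevBytes, Swapped, 4) == 0);

  // Two 16-bit halves stored swapped == one store of rotr(Val, 16).
  uint16_t Halves[2] = {uint16_t(Val >> 16), uint16_t(Val)};
  uint32_t Rotated = (Val >> 16) | (Val << 16);
  assert(std::memcmp(Halves, &Rotated, 4) == 0);
  return 0;
}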
bool CombinerHelper::matchExtendThroughPhis(MachineInstr &MI,
MachineInstr *&ExtMI) {
assert(MI.getOpcode() == TargetOpcode::G_PHI);
@@ -3844,7 +3756,7 @@ bool CombinerHelper::matchExtractVecEltBuildVec(MachineInstr &MI,
{TargetOpcode::G_BUILD_VECTOR, {SrcTy, SrcTy.getElementType()}}))
return false;
- auto Cst = getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+ auto Cst = getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
if (!Cst || Cst->Value.getZExtValue() >= SrcTy.getNumElements())
return false;
@@ -3917,7 +3829,7 @@ bool CombinerHelper::matchExtractAllEltsFromBuildVector(
MRI.use_instr_nodbg_end())) {
if (II.getOpcode() != TargetOpcode::G_EXTRACT_VECTOR_ELT)
return false;
- auto Cst = getConstantVRegVal(II.getOperand(2).getReg(), MRI);
+ auto Cst = getIConstantVRegVal(II.getOperand(2).getReg(), MRI);
if (!Cst)
return false;
unsigned Idx = Cst.getValue().getZExtValue();
@@ -4064,6 +3976,78 @@ bool CombinerHelper::matchICmpToTrueFalseKnownBits(MachineInstr &MI,
return true;
}
+bool CombinerHelper::matchICmpToLHSKnownBits(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_ICMP);
+ // Given:
+ //
+ // %x = G_WHATEVER (... x is known to be 0 or 1 ...)
+ // %cmp = G_ICMP ne %x, 0
+ //
+ // Or:
+ //
+ // %x = G_WHATEVER (... x is known to be 0 or 1 ...)
+ // %cmp = G_ICMP eq %x, 1
+ //
+ // We can replace %cmp with %x assuming true is 1 on the target.
+ auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
+ if (!CmpInst::isEquality(Pred))
+ return false;
+ Register Dst = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ if (getICmpTrueVal(getTargetLowering(), DstTy.isVector(),
+ /* IsFP = */ false) != 1)
+ return false;
+ int64_t OneOrZero = Pred == CmpInst::ICMP_EQ;
+ if (!mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICst(OneOrZero)))
+ return false;
+ Register LHS = MI.getOperand(2).getReg();
+ auto KnownLHS = KB->getKnownBits(LHS);
+ if (KnownLHS.getMinValue() != 0 || KnownLHS.getMaxValue() != 1)
+ return false;
+ // Make sure replacing Dst with the LHS is a legal operation.
+ LLT LHSTy = MRI.getType(LHS);
+ unsigned LHSSize = LHSTy.getSizeInBits();
+ unsigned DstSize = DstTy.getSizeInBits();
+ unsigned Op = TargetOpcode::COPY;
+ if (DstSize != LHSSize)
+ Op = DstSize < LHSSize ? TargetOpcode::G_TRUNC : TargetOpcode::G_ZEXT;
+ if (!isLegalOrBeforeLegalizer({Op, {DstTy, LHSTy}}))
+ return false;
+ MatchInfo = [=](MachineIRBuilder &B) { B.buildInstr(Op, {Dst}, {LHS}); };
+ return true;
+}
+
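// [Editorial sketch - not part of this patch] The combine above rewrites
// "icmp ne %x, 0" / "icmp eq %x, 1" to %x itself when %x is known to be 0 or
// 1 and the target's "true" value is 1. A standalone check of that identity:
#include <cassert>

int main() {
  for (int X = 0; X <= 1; ++X) { // X is known to be 0 or 1.
    assert((X != 0) == X);
    assert((X == 1) == X);
  }
  return 0;
}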
+// Replace (and (or x, c1), c2) with (and x, c2) iff c1 & c2 == 0
+bool CombinerHelper::matchAndOrDisjointMask(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_AND);
+
+ // Ignore vector types to simplify matching the two constants.
+ // TODO: do this for vectors and scalars via a demanded bits analysis.
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ if (Ty.isVector())
+ return false;
+
+ Register Src;
+ int64_t MaskAnd;
+ int64_t MaskOr;
+ if (!mi_match(MI, MRI,
+ m_GAnd(m_GOr(m_Reg(Src), m_ICst(MaskOr)), m_ICst(MaskAnd))))
+ return false;
+
+ // Check if MaskOr could turn on any bits in Src.
+ if (MaskAnd & MaskOr)
+ return false;
+
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(Src);
+ Observer.changedInstr(MI);
+ };
+ return true;
+}
+
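// [Editorial sketch - not part of this patch] The disjoint-mask combine drops
// the inner OR: (x | c1) & c2 == x & c2 whenever c1 & c2 == 0, because no bit
// turned on by c1 survives the AND. A small exhaustive check over 8-bit
// values:
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t MaskOr = 0xF0, MaskAnd = 0x0F; // Disjoint masks.
  static_assert((MaskOr & MaskAnd) == 0, "masks must be disjoint");
  for (unsigned X = 0; X < 256; ++X)
    assert(((X | MaskOr) & MaskAnd) == (X & MaskAnd));
  return 0;
}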
/// Form a G_SBFX from a G_SEXT_INREG fed by a right shift.
bool CombinerHelper::matchBitfieldExtractFromSExtInReg(
MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
@@ -4130,6 +4114,104 @@ bool CombinerHelper::matchBitfieldExtractFromAnd(
return true;
}
+bool CombinerHelper::matchBitfieldExtractFromShr(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ const unsigned Opcode = MI.getOpcode();
+ assert(Opcode == TargetOpcode::G_ASHR || Opcode == TargetOpcode::G_LSHR);
+
+ const Register Dst = MI.getOperand(0).getReg();
+
+ const unsigned ExtrOpcode = Opcode == TargetOpcode::G_ASHR
+ ? TargetOpcode::G_SBFX
+ : TargetOpcode::G_UBFX;
+
+ // Check if the type we would use for the extract is legal
+ LLT Ty = MRI.getType(Dst);
+ LLT ExtractTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+ if (!LI || !LI->isLegalOrCustom({ExtrOpcode, {Ty, ExtractTy}}))
+ return false;
+
+ Register ShlSrc;
+ int64_t ShrAmt;
+ int64_t ShlAmt;
+ const unsigned Size = Ty.getScalarSizeInBits();
+
+ // Try to match shr (shl x, c1), c2
+ if (!mi_match(Dst, MRI,
+ m_BinOp(Opcode,
+ m_OneNonDBGUse(m_GShl(m_Reg(ShlSrc), m_ICst(ShlAmt))),
+ m_ICst(ShrAmt))))
+ return false;
+
+ // Make sure that the shift sizes can fit a bitfield extract
+ if (ShlAmt < 0 || ShlAmt > ShrAmt || ShrAmt >= Size)
+ return false;
+
+ // Skip this combine if the G_SEXT_INREG combine could handle it
+ if (Opcode == TargetOpcode::G_ASHR && ShlAmt == ShrAmt)
+ return false;
+
+ // Calculate start position and width of the extract
+ const int64_t Pos = ShrAmt - ShlAmt;
+ const int64_t Width = Size - ShrAmt;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto WidthCst = B.buildConstant(ExtractTy, Width);
+ auto PosCst = B.buildConstant(ExtractTy, Pos);
+ B.buildInstr(ExtrOpcode, {Dst}, {ShlSrc, PosCst, WidthCst});
+ };
+ return true;
+}
+
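// [Editorial sketch - not part of this patch] For the shift pair matched
// above, lshr(shl(x, ShlAmt), ShrAmt) extracts Width = Size - ShrAmt bits
// starting at Pos = ShrAmt - ShlAmt, which is exactly what G_UBFX computes.
// A standalone check of that equivalence on 32-bit values:
#include <cassert>
#include <cstdint>

// Reference semantics of an unsigned bitfield extract (G_UBFX).
static uint32_t ubfx(uint32_t X, unsigned Pos, unsigned Width) {
  return (X >> Pos) & ((Width == 32) ? ~0u : ((1u << Width) - 1));
}

int main() {
  const unsigned Size = 32, ShlAmt = 4, ShrAmt = 12; // ShlAmt <= ShrAmt < Size
  const unsigned Pos = ShrAmt - ShlAmt, Width = Size - ShrAmt;
  for (uint32_t X : {0x00000000u, 0x12345678u, 0xDEADBEEFu, 0xFFFFFFFFu})
    assert(((X << ShlAmt) >> ShrAmt) == ubfx(X, Pos, Width));
  return 0;
}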
+bool CombinerHelper::matchBitfieldExtractFromShrAnd(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ const unsigned Opcode = MI.getOpcode();
+ assert(Opcode == TargetOpcode::G_LSHR || Opcode == TargetOpcode::G_ASHR);
+
+ const Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+ if (!getTargetLowering().isConstantUnsignedBitfieldExtactLegal(
+ TargetOpcode::G_UBFX, Ty, Ty))
+ return false;
+
+ // Try to match shr (and x, c1), c2
+ Register AndSrc;
+ int64_t ShrAmt;
+ int64_t SMask;
+ if (!mi_match(Dst, MRI,
+ m_BinOp(Opcode,
+ m_OneNonDBGUse(m_GAnd(m_Reg(AndSrc), m_ICst(SMask))),
+ m_ICst(ShrAmt))))
+ return false;
+
+ const unsigned Size = Ty.getScalarSizeInBits();
+ if (ShrAmt < 0 || ShrAmt >= Size)
+ return false;
+
+ // Check that ubfx can do the extraction, with no holes in the mask.
+ uint64_t UMask = SMask;
+ UMask |= maskTrailingOnes<uint64_t>(ShrAmt);
+ UMask &= maskTrailingOnes<uint64_t>(Size);
+ if (!isMask_64(UMask))
+ return false;
+
+ // Calculate start position and width of the extract.
+ const int64_t Pos = ShrAmt;
+ const int64_t Width = countTrailingOnes(UMask) - ShrAmt;
+
+ // It's preferable to keep the shift, rather than form G_SBFX.
+ // TODO: remove the G_AND via demanded bits analysis.
+ if (Opcode == TargetOpcode::G_ASHR && Width + ShrAmt == Size)
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto WidthCst = B.buildConstant(Ty, Width);
+ auto PosCst = B.buildConstant(Ty, Pos);
+ B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {AndSrc, PosCst, WidthCst});
+ };
+ return true;
+}
+
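// [Editorial sketch - not part of this patch] The and+shift pattern above is
// a bitfield extract when the mask, after filling in the low ShrAmt bits, is
// contiguous: lshr(and(x, Mask), ShrAmt) == ubfx(x, ShrAmt, Width) with
// Width = trailing_ones(Mask | low_mask(ShrAmt)) - ShrAmt. A standalone check:
#include <cassert>
#include <cstdint>

static uint32_t ubfx(uint32_t X, unsigned Pos, unsigned Width) {
  return (X >> Pos) & ((1u << Width) - 1);
}

int main() {
  const uint32_t Mask = 0x0000FF00; // Bits 8..15.
  const unsigned ShrAmt = 8;
  // UMask = Mask | ones(ShrAmt) = 0x0000FFFF, a contiguous 16-bit mask.
  const unsigned Width = 16 - ShrAmt; // trailing_ones(UMask) - ShrAmt == 8.
  for (uint32_t X : {0x00000000u, 0x12345678u, 0xA5A5A5A5u, 0xFFFFFFFFu})
    assert(((X & Mask) >> ShrAmt) == ubfx(X, ShrAmt, Width));
  return 0;
}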
bool CombinerHelper::reassociationCanBreakAddressingModePattern(
MachineInstr &PtrAdd) {
assert(PtrAdd.getOpcode() == TargetOpcode::G_PTR_ADD);
@@ -4144,10 +4226,10 @@ bool CombinerHelper::reassociationCanBreakAddressingModePattern(
if (MRI.hasOneNonDBGUse(Src1Reg))
return false;
- auto C1 = getConstantVRegVal(Src1Def->getOperand(2).getReg(), MRI);
+ auto C1 = getIConstantVRegVal(Src1Def->getOperand(2).getReg(), MRI);
if (!C1)
return false;
- auto C2 = getConstantVRegVal(Src2Reg, MRI);
+ auto C2 = getIConstantVRegVal(Src2Reg, MRI);
if (!C2)
return false;
@@ -4198,9 +4280,91 @@ bool CombinerHelper::reassociationCanBreakAddressingModePattern(
return false;
}
-bool CombinerHelper::matchReassocPtrAdd(
- MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
- assert(MI.getOpcode() == TargetOpcode::G_PTR_ADD);
+bool CombinerHelper::matchReassocConstantInnerRHS(GPtrAdd &MI,
+ MachineInstr *RHS,
+ BuildFnTy &MatchInfo) {
+ // G_PTR_ADD(BASE, G_ADD(X, C)) -> G_PTR_ADD(G_PTR_ADD(BASE, X), C)
+ Register Src1Reg = MI.getOperand(1).getReg();
+ if (RHS->getOpcode() != TargetOpcode::G_ADD)
+ return false;
+ auto C2 = getIConstantVRegVal(RHS->getOperand(2).getReg(), MRI);
+ if (!C2)
+ return false;
+
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ LLT PtrTy = MRI.getType(MI.getOperand(0).getReg());
+
+ auto NewBase =
+ Builder.buildPtrAdd(PtrTy, Src1Reg, RHS->getOperand(1).getReg());
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(NewBase.getReg(0));
+ MI.getOperand(2).setReg(RHS->getOperand(2).getReg());
+ Observer.changedInstr(MI);
+ };
+ return !reassociationCanBreakAddressingModePattern(MI);
+}
+
+bool CombinerHelper::matchReassocConstantInnerLHS(GPtrAdd &MI,
+ MachineInstr *LHS,
+ MachineInstr *RHS,
+ BuildFnTy &MatchInfo) {
+ // G_PTR_ADD (G_PTR_ADD X, C), Y) -> (G_PTR_ADD (G_PTR_ADD(X, Y), C)
+ // if and only if (G_PTR_ADD X, C) has one use.
+ Register LHSBase;
+ Optional<ValueAndVReg> LHSCstOff;
+ if (!mi_match(MI.getBaseReg(), MRI,
+ m_OneNonDBGUse(m_GPtrAdd(m_Reg(LHSBase), m_GCst(LHSCstOff)))))
+ return false;
+
+ auto *LHSPtrAdd = cast<GPtrAdd>(LHS);
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ // When we change LHSPtrAdd's offset register we might cause it to use a reg
+ // before its def. Sink the instruction right before the outer PTR_ADD to
+ // ensure this doesn't happen.
+ LHSPtrAdd->moveBefore(&MI);
+ Register RHSReg = MI.getOffsetReg();
+ Observer.changingInstr(MI);
+ MI.getOperand(2).setReg(LHSCstOff->VReg);
+ Observer.changedInstr(MI);
+ Observer.changingInstr(*LHSPtrAdd);
+ LHSPtrAdd->getOperand(2).setReg(RHSReg);
+ Observer.changedInstr(*LHSPtrAdd);
+ };
+ return !reassociationCanBreakAddressingModePattern(MI);
+}
+
+bool CombinerHelper::matchReassocFoldConstantsInSubTree(GPtrAdd &MI,
+ MachineInstr *LHS,
+ MachineInstr *RHS,
+ BuildFnTy &MatchInfo) {
+ // G_PTR_ADD(G_PTR_ADD(BASE, C1), C2) -> G_PTR_ADD(BASE, C1+C2)
+ auto *LHSPtrAdd = dyn_cast<GPtrAdd>(LHS);
+ if (!LHSPtrAdd)
+ return false;
+
+ Register Src2Reg = MI.getOperand(2).getReg();
+ Register LHSSrc1 = LHSPtrAdd->getBaseReg();
+ Register LHSSrc2 = LHSPtrAdd->getOffsetReg();
+ auto C1 = getIConstantVRegVal(LHSSrc2, MRI);
+ if (!C1)
+ return false;
+ auto C2 = getIConstantVRegVal(Src2Reg, MRI);
+ if (!C2)
+ return false;
+
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ auto NewCst = B.buildConstant(MRI.getType(Src2Reg), *C1 + *C2);
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(LHSSrc1);
+ MI.getOperand(2).setReg(NewCst.getReg(0));
+ Observer.changedInstr(MI);
+ };
+ return !reassociationCanBreakAddressingModePattern(MI);
+}
+
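// [Editorial sketch - not part of this patch] Folding the two constant
// offsets, G_PTR_ADD(G_PTR_ADD(BASE, C1), C2) -> G_PTR_ADD(BASE, C1 + C2), is
// plain pointer-arithmetic reassociation. A standalone check using char*
// arithmetic in place of G_PTR_ADD:
#include <cassert>
#include <cstdint>

int main() {
  alignas(8) char Buf[64] = {};
  char *Base = Buf;
  const int64_t C1 = 8, C2 = 24;
  assert((Base + C1) + C2 == Base + (C1 + C2));
  return 0;
}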
+bool CombinerHelper::matchReassocPtrAdd(MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ auto &PtrAdd = cast<GPtrAdd>(MI);
// We're trying to match a few pointer computation patterns here for
// re-association opportunities.
// 1) Isolating a constant operand to be on the RHS, e.g.:
@@ -4209,49 +4373,26 @@ bool CombinerHelper::matchReassocPtrAdd(
// 2) Folding two constants in each sub-tree as long as such folding
// doesn't break a legal addressing mode.
// G_PTR_ADD(G_PTR_ADD(BASE, C1), C2) -> G_PTR_ADD(BASE, C1+C2)
- Register Src1Reg = MI.getOperand(1).getReg();
- Register Src2Reg = MI.getOperand(2).getReg();
- MachineInstr *LHS = MRI.getVRegDef(Src1Reg);
- MachineInstr *RHS = MRI.getVRegDef(Src2Reg);
-
- if (LHS->getOpcode() != TargetOpcode::G_PTR_ADD) {
- // Try to match example 1).
- if (RHS->getOpcode() != TargetOpcode::G_ADD)
- return false;
- auto C2 = getConstantVRegVal(RHS->getOperand(2).getReg(), MRI);
- if (!C2)
- return false;
+ //
+ // 3) Move a constant from the LHS of an inner op to the RHS of the outer.
+ // G_PTR_ADD (G_PTR_ADD X, C), Y) -> G_PTR_ADD (G_PTR_ADD(X, Y), C)
+ // iff (G_PTR_ADD X, C) has one use.
+ MachineInstr *LHS = MRI.getVRegDef(PtrAdd.getBaseReg());
+ MachineInstr *RHS = MRI.getVRegDef(PtrAdd.getOffsetReg());
+
+ // Try to match example 2.
+ if (matchReassocFoldConstantsInSubTree(PtrAdd, LHS, RHS, MatchInfo))
+ return true;
- MatchInfo = [=,&MI](MachineIRBuilder &B) {
- LLT PtrTy = MRI.getType(MI.getOperand(0).getReg());
+ // Try to match example 3.
+ if (matchReassocConstantInnerLHS(PtrAdd, LHS, RHS, MatchInfo))
+ return true;
- auto NewBase =
- Builder.buildPtrAdd(PtrTy, Src1Reg, RHS->getOperand(1).getReg());
- Observer.changingInstr(MI);
- MI.getOperand(1).setReg(NewBase.getReg(0));
- MI.getOperand(2).setReg(RHS->getOperand(2).getReg());
- Observer.changedInstr(MI);
- };
- } else {
- // Try to match example 2.
- Register LHSSrc1 = LHS->getOperand(1).getReg();
- Register LHSSrc2 = LHS->getOperand(2).getReg();
- auto C1 = getConstantVRegVal(LHSSrc2, MRI);
- if (!C1)
- return false;
- auto C2 = getConstantVRegVal(Src2Reg, MRI);
- if (!C2)
- return false;
+ // Try to match example 1.
+ if (matchReassocConstantInnerRHS(PtrAdd, RHS, MatchInfo))
+ return true;
- MatchInfo = [=, &MI](MachineIRBuilder &B) {
- auto NewCst = B.buildConstant(MRI.getType(Src2Reg), *C1 + *C2);
- Observer.changingInstr(MI);
- MI.getOperand(1).setReg(LHSSrc1);
- MI.getOperand(2).setReg(NewCst.getReg(0));
- Observer.changedInstr(MI);
- };
- }
- return !reassociationCanBreakAddressingModePattern(MI);
+ return false;
}
bool CombinerHelper::matchConstantFold(MachineInstr &MI, APInt &MatchInfo) {
@@ -4264,6 +4405,361 @@ bool CombinerHelper::matchConstantFold(MachineInstr &MI, APInt &MatchInfo) {
return true;
}
+bool CombinerHelper::matchNarrowBinopFeedingAnd(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ // Look for a binop feeding into an AND with a mask:
+ //
+ // %add = G_ADD %lhs, %rhs
+ // %and = G_AND %add, 000...11111111
+ //
+ // Check if it's possible to perform the binop at a narrower width and zext
+ // back to the original width like so:
+ //
+ // %narrow_lhs = G_TRUNC %lhs
+ // %narrow_rhs = G_TRUNC %rhs
+ // %narrow_add = G_ADD %narrow_lhs, %narrow_rhs
+ // %new_add = G_ZEXT %narrow_add
+ // %and = G_AND %new_add, 000...11111111
+ //
+ // This can allow later combines to eliminate the G_AND if it turns out
+ // that the mask is irrelevant.
+ assert(MI.getOpcode() == TargetOpcode::G_AND);
+ Register Dst = MI.getOperand(0).getReg();
+ Register AndLHS = MI.getOperand(1).getReg();
+ Register AndRHS = MI.getOperand(2).getReg();
+ LLT WideTy = MRI.getType(Dst);
+
+ // If the potential binop has more than one use, then it's possible that one
+ // of those uses will need its full width.
+ if (!WideTy.isScalar() || !MRI.hasOneNonDBGUse(AndLHS))
+ return false;
+
+ // Check if the LHS feeding the AND is impacted by the high bits that we're
+ // masking out.
+ //
+ // e.g. for 64-bit x, y:
+ //
+ // add_64(x, y) & 65535 == zext(add_16(trunc(x), trunc(y))) & 65535
+ MachineInstr *LHSInst = getDefIgnoringCopies(AndLHS, MRI);
+ if (!LHSInst)
+ return false;
+ unsigned LHSOpc = LHSInst->getOpcode();
+ switch (LHSOpc) {
+ default:
+ return false;
+ case TargetOpcode::G_ADD:
+ case TargetOpcode::G_SUB:
+ case TargetOpcode::G_MUL:
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_XOR:
+ break;
+ }
+
+ // Find the mask on the RHS.
+ auto Cst = getIConstantVRegValWithLookThrough(AndRHS, MRI);
+ if (!Cst)
+ return false;
+ auto Mask = Cst->Value;
+ if (!Mask.isMask())
+ return false;
+
+ // No point in combining if there's nothing to truncate.
+ unsigned NarrowWidth = Mask.countTrailingOnes();
+ if (NarrowWidth == WideTy.getSizeInBits())
+ return false;
+ LLT NarrowTy = LLT::scalar(NarrowWidth);
+
+ // Check if adding the zext + truncates could be harmful.
+ auto &MF = *MI.getMF();
+ const auto &TLI = getTargetLowering();
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ auto &DL = MF.getDataLayout();
+ if (!TLI.isTruncateFree(WideTy, NarrowTy, DL, Ctx) ||
+ !TLI.isZExtFree(NarrowTy, WideTy, DL, Ctx))
+ return false;
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {NarrowTy, WideTy}}) ||
+ !isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {WideTy, NarrowTy}}))
+ return false;
+ Register BinOpLHS = LHSInst->getOperand(1).getReg();
+ Register BinOpRHS = LHSInst->getOperand(2).getReg();
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ auto NarrowLHS = Builder.buildTrunc(NarrowTy, BinOpLHS);
+ auto NarrowRHS = Builder.buildTrunc(NarrowTy, BinOpRHS);
+ auto NarrowBinOp =
+ Builder.buildInstr(LHSOpc, {NarrowTy}, {NarrowLHS, NarrowRHS});
+ auto Ext = Builder.buildZExt(WideTy, NarrowBinOp);
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(Ext.getReg(0));
+ Observer.changedInstr(MI);
+ };
+ return true;
+}
+
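// [Editorial sketch - not part of this patch] The narrowing above relies on
// the masked-out high bits never influencing the low bits of these binops:
// (x + y) & 0xFFFF == zext(trunc16(x) + trunc16(y)). A standalone spot-check
// for G_ADD and G_XOR on a few 64-bit inputs:
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Samples[] = {0x0, 0x1234567890ABCDEFull,
                              0xFFFFFFFFFFFFFFFFull, 0x00000000FFFF0001ull};
  for (uint64_t X : Samples)
    for (uint64_t Y : Samples) {
      // Wide op then mask vs. narrow op then zero-extend, for a 16-bit mask.
      assert(((X + Y) & 0xFFFF) ==
             uint64_t(uint16_t(uint16_t(X) + uint16_t(Y))));
      assert(((X ^ Y) & 0xFFFF) == uint64_t(uint16_t(X) ^ uint16_t(Y)));
    }
  return 0;
}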
+bool CombinerHelper::matchMulOBy2(MachineInstr &MI, BuildFnTy &MatchInfo) {
+ unsigned Opc = MI.getOpcode();
+ assert(Opc == TargetOpcode::G_UMULO || Opc == TargetOpcode::G_SMULO);
+ // Check for a constant 2 or a splat of 2 on the RHS.
+ auto RHS = MI.getOperand(3).getReg();
+ bool IsVector = MRI.getType(RHS).isVector();
+ if (!IsVector && !mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICst(2)))
+ return false;
+ if (IsVector) {
+ // FIXME: There's no mi_match pattern for this yet.
+ auto *RHSDef = getDefIgnoringCopies(RHS, MRI);
+ if (!RHSDef)
+ return false;
+ auto Splat = getBuildVectorConstantSplat(*RHSDef, MRI);
+ if (!Splat || *Splat != 2)
+ return false;
+ }
+
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ Observer.changingInstr(MI);
+ unsigned NewOpc = Opc == TargetOpcode::G_UMULO ? TargetOpcode::G_UADDO
+ : TargetOpcode::G_SADDO;
+ MI.setDesc(Builder.getTII().get(NewOpc));
+ MI.getOperand(3).setReg(MI.getOperand(2).getReg());
+ Observer.changedInstr(MI);
+ };
+ return true;
+}
+
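// [Editorial sketch - not part of this patch] Multiplying by 2 with overflow
// detection is the same operation as adding the value to itself: both the
// result and the overflow flag of G_UMULO/G_SMULO by 2 match G_UADDO/G_SADDO.
// A standalone check using the GCC/Clang overflow builtins:
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 1u, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFFu}) {
    uint32_t MulRes, AddRes;
    bool MulOvf = __builtin_mul_overflow(X, 2u, &MulRes);
    bool AddOvf = __builtin_add_overflow(X, X, &AddRes);
    assert(MulRes == AddRes && MulOvf == AddOvf);

    int32_t SX = int32_t(X), SMulRes, SAddRes;
    bool SMulOvf = __builtin_mul_overflow(SX, 2, &SMulRes);
    bool SAddOvf = __builtin_add_overflow(SX, SX, &SAddRes);
    assert(SMulRes == SAddRes && SMulOvf == SAddOvf);
  }
  return 0;
}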
+MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_UDIV);
+ auto &UDiv = cast<GenericMachineInstr>(MI);
+ Register Dst = UDiv.getReg(0);
+ Register LHS = UDiv.getReg(1);
+ Register RHS = UDiv.getReg(2);
+ LLT Ty = MRI.getType(Dst);
+ LLT ScalarTy = Ty.getScalarType();
+ const unsigned EltBits = ScalarTy.getScalarSizeInBits();
+ LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+ LLT ScalarShiftAmtTy = ShiftAmtTy.getScalarType();
+ auto &MIB = Builder;
+ MIB.setInstrAndDebugLoc(MI);
+
+ bool UseNPQ = false;
+ SmallVector<Register, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
+
+ auto BuildUDIVPattern = [&](const Constant *C) {
+ auto *CI = cast<ConstantInt>(C);
+ const APInt &Divisor = CI->getValue();
+ UnsignedDivisonByConstantInfo magics =
+ UnsignedDivisonByConstantInfo::get(Divisor);
+ unsigned PreShift = 0, PostShift = 0;
+
+ // If the divisor is even, we can avoid using the expensive fixup by
+ // shifting the divided value upfront.
+ if (magics.IsAdd != 0 && !Divisor[0]) {
+ PreShift = Divisor.countTrailingZeros();
+ // Get magic number for the shifted divisor.
+ magics =
+ UnsignedDivisonByConstantInfo::get(Divisor.lshr(PreShift), PreShift);
+ assert(magics.IsAdd == 0 && "Should use cheap fixup now");
+ }
+
+ APInt Magic = magics.Magic;
+
+ bool SelNPQ;
+ if (magics.IsAdd == 0 || Divisor.isOneValue()) {
+ assert(magics.ShiftAmount < Divisor.getBitWidth() &&
+ "We shouldn't generate an undefined shift!");
+ PostShift = magics.ShiftAmount;
+ SelNPQ = false;
+ } else {
+ PostShift = magics.ShiftAmount - 1;
+ SelNPQ = true;
+ }
+
+ PreShifts.push_back(
+ MIB.buildConstant(ScalarShiftAmtTy, PreShift).getReg(0));
+ MagicFactors.push_back(MIB.buildConstant(ScalarTy, Magic).getReg(0));
+ NPQFactors.push_back(
+ MIB.buildConstant(ScalarTy,
+ SelNPQ ? APInt::getOneBitSet(EltBits, EltBits - 1)
+ : APInt::getZero(EltBits))
+ .getReg(0));
+ PostShifts.push_back(
+ MIB.buildConstant(ScalarShiftAmtTy, PostShift).getReg(0));
+ UseNPQ |= SelNPQ;
+ return true;
+ };
+
+ // Collect the shifts/magic values from each element.
+ bool Matched = matchUnaryPredicate(MRI, RHS, BuildUDIVPattern);
+ (void)Matched;
+ assert(Matched && "Expected unary predicate match to succeed");
+
+ Register PreShift, PostShift, MagicFactor, NPQFactor;
+ auto *RHSDef = getOpcodeDef<GBuildVector>(RHS, MRI);
+ if (RHSDef) {
+ PreShift = MIB.buildBuildVector(ShiftAmtTy, PreShifts).getReg(0);
+ MagicFactor = MIB.buildBuildVector(Ty, MagicFactors).getReg(0);
+ NPQFactor = MIB.buildBuildVector(Ty, NPQFactors).getReg(0);
+ PostShift = MIB.buildBuildVector(ShiftAmtTy, PostShifts).getReg(0);
+ } else {
+ assert(MRI.getType(RHS).isScalar() &&
+ "Non-build_vector operation should have been a scalar");
+ PreShift = PreShifts[0];
+ MagicFactor = MagicFactors[0];
+ PostShift = PostShifts[0];
+ }
+
+ Register Q = LHS;
+ Q = MIB.buildLShr(Ty, Q, PreShift).getReg(0);
+
+ // Multiply the numerator (operand 0) by the magic value.
+ Q = MIB.buildUMulH(Ty, Q, MagicFactor).getReg(0);
+
+ if (UseNPQ) {
+ Register NPQ = MIB.buildSub(Ty, LHS, Q).getReg(0);
+
+ // For vectors we might have a mix of non-NPQ/NPQ paths, so use
+ // G_UMULH to act as a SRL-by-1 for NPQ, else multiply by zero.
+ if (Ty.isVector())
+ NPQ = MIB.buildUMulH(Ty, NPQ, NPQFactor).getReg(0);
+ else
+ NPQ = MIB.buildLShr(Ty, NPQ, MIB.buildConstant(ShiftAmtTy, 1)).getReg(0);
+
+ Q = MIB.buildAdd(Ty, NPQ, Q).getReg(0);
+ }
+
+ Q = MIB.buildLShr(Ty, Q, PostShift).getReg(0);
+ auto One = MIB.buildConstant(Ty, 1);
+ auto IsOne = MIB.buildICmp(
+ CmpInst::Predicate::ICMP_EQ,
+ Ty.isScalar() ? LLT::scalar(1) : Ty.changeElementSize(1), RHS, One);
+ return MIB.buildSelect(Ty, IsOne, LHS, Q);
+}
+
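// [Editorial sketch - not part of this patch] The expansion above replaces
// G_UDIV by a constant with a multiply-high plus shifts. A standalone
// demonstration of the underlying identity for division by 3 of a 32-bit
// value, using the well-known magic constant 0xAAAAAAAB = ceil(2^33 / 3);
// the "umulh + shift" is modelled here with a 64-bit product:
#include <cassert>
#include <cstdint>

static uint32_t udiv3(uint32_t X) {
  // q = floor((X * ceil(2^33/3)) / 2^33) == X / 3 for all 32-bit X.
  return uint32_t((uint64_t(X) * 0xAAAAAAABull) >> 33);
}

int main() {
  for (uint32_t X :
       {0u, 1u, 2u, 3u, 100u, 0x7FFFFFFFu, 0xFFFFFFFEu, 0xFFFFFFFFu})
    assert(udiv3(X) == X / 3);
  return 0;
}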
+bool CombinerHelper::matchUDivByConst(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_UDIV);
+ Register Dst = MI.getOperand(0).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ auto *RHSDef = MRI.getVRegDef(RHS);
+ if (!isConstantOrConstantVector(*RHSDef, MRI))
+ return false;
+
+ auto &MF = *MI.getMF();
+ AttributeList Attr = MF.getFunction().getAttributes();
+ const auto &TLI = getTargetLowering();
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ auto &DL = MF.getDataLayout();
+ if (TLI.isIntDivCheap(getApproximateEVTForLLT(DstTy, DL, Ctx), Attr))
+ return false;
+
+ // Don't do this for minsize because the instruction sequence is usually
+ // larger.
+ if (MF.getFunction().hasMinSize())
+ return false;
+
+ // Don't do this if the types are not going to be legal.
+ if (LI) {
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_MUL, {DstTy, DstTy}}))
+ return false;
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_UMULH, {DstTy}}))
+ return false;
+ if (!isLegalOrBeforeLegalizer(
+ {TargetOpcode::G_ICMP,
+ {DstTy.isVector() ? DstTy.changeElementSize(1) : LLT::scalar(1),
+ DstTy}}))
+ return false;
+ }
+
+ auto CheckEltValue = [&](const Constant *C) {
+ if (auto *CI = dyn_cast_or_null<ConstantInt>(C))
+ return !CI->isZero();
+ return false;
+ };
+ return matchUnaryPredicate(MRI, RHS, CheckEltValue);
+}
+
+void CombinerHelper::applyUDivByConst(MachineInstr &MI) {
+ auto *NewMI = buildUDivUsingMul(MI);
+ replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
+}
+
+bool CombinerHelper::matchUMulHToLShr(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_UMULH);
+ Register RHS = MI.getOperand(2).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+ LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+ auto MatchPow2ExceptOne = [&](const Constant *C) {
+ if (auto *CI = dyn_cast<ConstantInt>(C))
+ return CI->getValue().isPowerOf2() && !CI->getValue().isOne();
+ return false;
+ };
+ if (!matchUnaryPredicate(MRI, RHS, MatchPow2ExceptOne, false))
+ return false;
+ return isLegalOrBeforeLegalizer({TargetOpcode::G_LSHR, {Ty, ShiftAmtTy}});
+}
+
+void CombinerHelper::applyUMulHToLShr(MachineInstr &MI) {
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+ LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+ unsigned NumEltBits = Ty.getScalarSizeInBits();
+
+ Builder.setInstrAndDebugLoc(MI);
+ auto LogBase2 = buildLogBase2(RHS, Builder);
+ auto ShiftAmt =
+ Builder.buildSub(Ty, Builder.buildConstant(Ty, NumEltBits), LogBase2);
+ auto Trunc = Builder.buildZExtOrTrunc(ShiftAmtTy, ShiftAmt);
+ Builder.buildLShr(Dst, LHS, Trunc);
+ MI.eraseFromParent();
+}
+
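// [Editorial sketch - not part of this patch] G_UMULH by a power of two 2^K
// returns the top half of X * 2^K, which is just X shifted right by
// (BitWidth - K). A standalone check for 32-bit values:
#include <cassert>
#include <cstdint>

// Reference G_UMULH: high 32 bits of the 64-bit product.
static uint32_t umulh32(uint32_t A, uint32_t B) {
  return uint32_t((uint64_t(A) * uint64_t(B)) >> 32);
}

int main() {
  for (unsigned K = 1; K < 32; ++K) // Powers of two other than 1 (K != 0).
    for (uint32_t X : {0u, 1u, 0x12345678u, 0xFFFFFFFFu})
      assert(umulh32(X, 1u << K) == (X >> (32 - K)));
  return 0;
}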
+bool CombinerHelper::matchRedundantNegOperands(MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ unsigned Opc = MI.getOpcode();
+ assert(Opc == TargetOpcode::G_FADD || Opc == TargetOpcode::G_FSUB ||
+ Opc == TargetOpcode::G_FMUL || Opc == TargetOpcode::G_FDIV ||
+ Opc == TargetOpcode::G_FMAD || Opc == TargetOpcode::G_FMA);
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register X = MI.getOperand(1).getReg();
+ Register Y = MI.getOperand(2).getReg();
+ LLT Type = MRI.getType(Dst);
+
+ // fold (fadd x, fneg(y)) -> (fsub x, y)
+ // fold (fadd fneg(y), x) -> (fsub x, y)
+ // G_FADD is commutative so both cases are checked by m_GFAdd
+ if (mi_match(Dst, MRI, m_GFAdd(m_Reg(X), m_GFNeg(m_Reg(Y)))) &&
+ isLegalOrBeforeLegalizer({TargetOpcode::G_FSUB, {Type}})) {
+ Opc = TargetOpcode::G_FSUB;
+ }
+ /// fold (fsub x, fneg(y)) -> (fadd x, y)
+ else if (mi_match(Dst, MRI, m_GFSub(m_Reg(X), m_GFNeg(m_Reg(Y)))) &&
+ isLegalOrBeforeLegalizer({TargetOpcode::G_FADD, {Type}})) {
+ Opc = TargetOpcode::G_FADD;
+ }
+ // fold (fmul fneg(x), fneg(y)) -> (fmul x, y)
+ // fold (fdiv fneg(x), fneg(y)) -> (fdiv x, y)
+ // fold (fmad fneg(x), fneg(y), z) -> (fmad x, y, z)
+ // fold (fma fneg(x), fneg(y), z) -> (fma x, y, z)
+ else if ((Opc == TargetOpcode::G_FMUL || Opc == TargetOpcode::G_FDIV ||
+ Opc == TargetOpcode::G_FMAD || Opc == TargetOpcode::G_FMA) &&
+ mi_match(X, MRI, m_GFNeg(m_Reg(X))) &&
+ mi_match(Y, MRI, m_GFNeg(m_Reg(Y)))) {
+ // no opcode change
+ } else
+ return false;
+
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ Observer.changingInstr(MI);
+ MI.setDesc(B.getTII().get(Opc));
+ MI.getOperand(1).setReg(X);
+ MI.getOperand(2).setReg(Y);
+ Observer.changedInstr(MI);
+ };
+ return true;
+}
+
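// [Editorial sketch - not part of this patch] The rewrites above rely on IEEE
// negation identities: fadd(x, fneg(y)) == fsub(x, y), fsub(x, fneg(y)) ==
// fadd(x, y), and the two negations cancel for fmul/fdiv/fma. A standalone
// spot-check with ordinary finite doubles (NaN payload and sign-of-zero
// subtleties are not exercised here):
#include <cassert>

int main() {
  const double Vals[] = {-2.5, -0.5, 1.0, 3.25, 1024.0};
  for (double X : Vals)
    for (double Y : Vals) {
      assert(X + (-Y) == X - Y);
      assert(X - (-Y) == X + Y);
      assert((-X) * (-Y) == X * Y);
      assert((-X) / (-Y) == X / Y);
    }
  return 0;
}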
bool CombinerHelper::tryCombine(MachineInstr &MI) {
if (tryCombineCopy(MI))
return true;
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
index 8146a67d4dfb..306af808659a 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
@@ -9,7 +9,7 @@
/// Provides analysis for querying information about KnownBits during GISel
/// passes.
//
-//===------------------
+//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
@@ -57,7 +57,7 @@ KnownBits GISelKnownBits::getKnownBits(MachineInstr &MI) {
KnownBits GISelKnownBits::getKnownBits(Register R) {
const LLT Ty = MRI.getType(R);
APInt DemandedElts =
- Ty.isVector() ? APInt::getAllOnesValue(Ty.getNumElements()) : APInt(1, 1);
+ Ty.isVector() ? APInt::getAllOnes(Ty.getNumElements()) : APInt(1, 1);
return getKnownBits(R, DemandedElts);
}
@@ -198,8 +198,8 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
case TargetOpcode::COPY:
case TargetOpcode::G_PHI:
case TargetOpcode::PHI: {
- Known.One = APInt::getAllOnesValue(BitWidth);
- Known.Zero = APInt::getAllOnesValue(BitWidth);
+ Known.One = APInt::getAllOnes(BitWidth);
+ Known.Zero = APInt::getAllOnes(BitWidth);
// Destination registers should not have subregisters at this
// point of the pipeline, otherwise the main live-range will be
// defined more than once, which is against SSA.
@@ -245,7 +245,7 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
break;
}
case TargetOpcode::G_CONSTANT: {
- auto CstVal = getConstantVRegVal(R, MRI);
+ auto CstVal = getIConstantVRegVal(R, MRI);
if (!CstVal)
break;
Known = KnownBits::makeConstant(*CstVal);
@@ -510,6 +510,18 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
Known = Known.reverseBits();
break;
}
+ case TargetOpcode::G_CTPOP: {
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
+ Depth + 1);
+ // We can bound the space the count needs. Also, bits known to be zero can't
+ // contribute to the population.
+ unsigned BitsPossiblySet = Known2.countMaxPopulation();
+ unsigned LowBits = Log2_32(BitsPossiblySet) + 1;
+ Known.Zero.setBitsFrom(LowBits);
+ // TODO: we could bound Known.One using the lower bound on the number of
+ // bits which might be set provided by popcnt KnownOne2.
+ break;
+ }
case TargetOpcode::G_UBFX: {
KnownBits SrcOpKnown, OffsetKnown, WidthKnown;
computeKnownBitsImpl(MI.getOperand(1).getReg(), SrcOpKnown, DemandedElts,
@@ -676,9 +688,8 @@ unsigned GISelKnownBits::computeNumSignBits(Register R,
unsigned GISelKnownBits::computeNumSignBits(Register R, unsigned Depth) {
LLT Ty = MRI.getType(R);
- APInt DemandedElts = Ty.isVector()
- ? APInt::getAllOnesValue(Ty.getNumElements())
- : APInt(1, 1);
+ APInt DemandedElts =
+ Ty.isVector() ? APInt::getAllOnes(Ty.getNumElements()) : APInt(1, 1);
return computeNumSignBits(R, DemandedElts, Depth);
}
diff --git a/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp b/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
index e0391e6f6467..252b931602c6 100644
--- a/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
@@ -18,6 +18,7 @@ using namespace llvm;
void llvm::initializeGlobalISel(PassRegistry &Registry) {
initializeIRTranslatorPass(Registry);
initializeLegalizerPass(Registry);
+ initializeLoadStoreOptPass(Registry);
initializeLocalizerPass(Registry);
initializeRegBankSelectPass(Registry);
initializeInstructionSelectPass(Registry);
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 73b763710fdf..87cc60d51bc2 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/LowLevelType.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -32,6 +33,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/StackProtector.h"
#include "llvm/CodeGen/SwitchLoweringUtils.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
@@ -47,6 +49,7 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/InlineAsm.h"
@@ -114,7 +117,7 @@ static void reportTranslationError(MachineFunction &MF,
R << (" (in function: " + MF.getName() + ")").str();
if (TPC.isGlobalISelAbortEnabled())
- report_fatal_error(R.getMsg());
+ report_fatal_error(Twine(R.getMsg()));
else
ORE.emit(R);
}
@@ -566,7 +569,7 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) {
if (BrInst.isUnconditional()) {
// If the unconditional target is the layout successor, fallthrough.
- if (!CurMBB.isLayoutSuccessor(Succ0MBB))
+ if (OptLevel == CodeGenOpt::None || !CurMBB.isLayoutSuccessor(Succ0MBB))
MIRBuilder.buildBr(*Succ0MBB);
// Link successors.
@@ -739,8 +742,7 @@ bool IRTranslator::translateSwitch(const User &U, MachineIRBuilder &MIB) {
// FIXME: At the moment we don't do any splitting optimizations here like
// SelectionDAG does, so this worklist only has one entry.
while (!WorkList.empty()) {
- SwitchWorkListItem W = WorkList.back();
- WorkList.pop_back();
+ SwitchWorkListItem W = WorkList.pop_back_val();
if (!lowerSwitchWorkItem(W, SI.getCondition(), SwitchMBB, DefaultMBB, MIB))
return false;
}
@@ -784,7 +786,7 @@ bool IRTranslator::emitJumpTableHeader(SwitchCG::JumpTable &JT,
JT.Reg = Sub.getReg(0);
- if (JTH.OmitRangeCheck) {
+ if (JTH.FallthroughUnreachable) {
if (JT.MBB != HeaderBB->getNextNode())
MIB.buildBr(*JT.MBB);
return true;
@@ -936,11 +938,10 @@ bool IRTranslator::lowerJumpTableWorkItem(SwitchCG::SwitchWorkListItem W,
}
}
- // Skip the range check if the fallthrough block is unreachable.
if (FallthroughUnreachable)
- JTH->OmitRangeCheck = true;
+ JTH->FallthroughUnreachable = true;
- if (!JTH->OmitRangeCheck)
+ if (!JTH->FallthroughUnreachable)
addSuccessorWithProb(CurMBB, Fallthrough, FallthroughProb);
addSuccessorWithProb(CurMBB, JumpMBB, JumpProb);
CurMBB->normalizeSuccProbs();
@@ -1004,14 +1005,22 @@ void IRTranslator::emitBitTestHeader(SwitchCG::BitTestBlock &B,
Register MinValReg = MIB.buildConstant(SwitchOpTy, B.First).getReg(0);
auto RangeSub = MIB.buildSub(SwitchOpTy, SwitchOpReg, MinValReg);
- // Ensure that the type will fit the mask value.
+ Type *PtrIRTy = Type::getInt8PtrTy(MF->getFunction().getContext());
+ const LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
+
LLT MaskTy = SwitchOpTy;
- for (unsigned I = 0, E = B.Cases.size(); I != E; ++I) {
- if (!isUIntN(SwitchOpTy.getSizeInBits(), B.Cases[I].Mask)) {
- // Switch table case range are encoded into series of masks.
- // Just use pointer type, it's guaranteed to fit.
- MaskTy = LLT::scalar(64);
- break;
+ if (MaskTy.getSizeInBits() > PtrTy.getSizeInBits() ||
+ !isPowerOf2_32(MaskTy.getSizeInBits()))
+ MaskTy = LLT::scalar(PtrTy.getSizeInBits());
+ else {
+ // Ensure that the type will fit the mask value.
+ for (unsigned I = 0, E = B.Cases.size(); I != E; ++I) {
+ if (!isUIntN(SwitchOpTy.getSizeInBits(), B.Cases[I].Mask)) {
+ // Switch table case ranges are encoded into a series of masks.
+ // Just use pointer type, it's guaranteed to fit.
+ MaskTy = LLT::scalar(PtrTy.getSizeInBits());
+ break;
+ }
}
}
Register SubReg = RangeSub.getReg(0);
@@ -1023,13 +1032,13 @@ void IRTranslator::emitBitTestHeader(SwitchCG::BitTestBlock &B,
MachineBasicBlock *MBB = B.Cases[0].ThisBB;
- if (!B.OmitRangeCheck)
+ if (!B.FallthroughUnreachable)
addSuccessorWithProb(SwitchBB, B.Default, B.DefaultProb);
addSuccessorWithProb(SwitchBB, MBB, B.Prob);
SwitchBB->normalizeSuccProbs();
- if (!B.OmitRangeCheck) {
+ if (!B.FallthroughUnreachable) {
// Conditional branch to the default block.
auto RangeCst = MIB.buildConstant(SwitchOpTy, B.Range);
auto RangeCmp = MIB.buildICmp(CmpInst::Predicate::ICMP_UGT, LLT::scalar(1),
@@ -1129,10 +1138,8 @@ bool IRTranslator::lowerBitTestWorkItem(
BTB->DefaultProb -= DefaultProb / 2;
}
- if (FallthroughUnreachable) {
- // Skip the range check if the fallthrough block is unreachable.
- BTB->OmitRangeCheck = true;
- }
+ if (FallthroughUnreachable)
+ BTB->FallthroughUnreachable = true;
// If we're in the right place, emit the bit test header right now.
if (CurMBB == SwitchMBB) {
@@ -1297,11 +1304,9 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
MachinePointerInfo Ptr(LI.getPointerOperand(), Offsets[i] / 8);
Align BaseAlign = getMemOpAlign(LI);
- AAMDNodes AAMetadata;
- LI.getAAMetadata(AAMetadata);
auto MMO = MF->getMachineMemOperand(
Ptr, Flags, MRI->getType(Regs[i]),
- commonAlignment(BaseAlign, Offsets[i] / 8), AAMetadata, Ranges,
+ commonAlignment(BaseAlign, Offsets[i] / 8), LI.getAAMetadata(), Ranges,
LI.getSyncScopeID(), LI.getOrdering());
MIRBuilder.buildLoad(Regs[i], Addr, *MMO);
}
@@ -1339,11 +1344,9 @@ bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {
MachinePointerInfo Ptr(SI.getPointerOperand(), Offsets[i] / 8);
Align BaseAlign = getMemOpAlign(SI);
- AAMDNodes AAMetadata;
- SI.getAAMetadata(AAMetadata);
auto MMO = MF->getMachineMemOperand(
Ptr, Flags, MRI->getType(Vals[i]),
- commonAlignment(BaseAlign, Offsets[i] / 8), AAMetadata, nullptr,
+ commonAlignment(BaseAlign, Offsets[i] / 8), SI.getAAMetadata(), nullptr,
SI.getSyncScopeID(), SI.getOrdering());
MIRBuilder.buildStore(Vals[i], Addr, *MMO);
}
@@ -1590,8 +1593,7 @@ bool IRTranslator::translateMemFunc(const CallInst &CI,
Align DstAlign;
Align SrcAlign;
unsigned IsVol =
- cast<ConstantInt>(CI.getArgOperand(CI.getNumArgOperands() - 1))
- ->getZExtValue();
+ cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1))->getZExtValue();
if (auto *MCI = dyn_cast<MemCpyInst>(&CI)) {
DstAlign = MCI->getDestAlign().valueOrOne();
@@ -1763,6 +1765,10 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) {
return TargetOpcode::G_VECREDUCE_UMAX;
case Intrinsic::vector_reduce_umin:
return TargetOpcode::G_VECREDUCE_UMIN;
+ case Intrinsic::lround:
+ return TargetOpcode::G_LROUND;
+ case Intrinsic::llround:
+ return TargetOpcode::G_LLROUND;
}
return Intrinsic::not_intrinsic;
}
@@ -1779,7 +1785,7 @@ bool IRTranslator::translateSimpleIntrinsic(const CallInst &CI,
// Yes. Let's translate it.
SmallVector<llvm::SrcOp, 4> VRegs;
- for (auto &Arg : CI.arg_operands())
+ for (auto &Arg : CI.args())
VRegs.push_back(getOrCreateVReg(*Arg));
MIRBuilder.buildInstr(Op, {getOrCreateVReg(CI)}, VRegs,
@@ -2172,7 +2178,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
// Directly emit some LOCAL_ESCAPE machine instrs. Label assignment emission
// is the same on all targets.
- for (unsigned Idx = 0, E = CI.getNumArgOperands(); Idx < E; ++Idx) {
+ for (unsigned Idx = 0, E = CI.arg_size(); Idx < E; ++Idx) {
Value *Arg = CI.getArgOperand(Idx)->stripPointerCasts();
if (isa<ConstantPointerNull>(Arg))
continue; // Skip null pointers. They represent a hole in index space.
@@ -2228,6 +2234,23 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
return true;
}
+ case Intrinsic::trap:
+ case Intrinsic::debugtrap:
+ case Intrinsic::ubsantrap: {
+ StringRef TrapFuncName =
+ CI.getAttributes().getFnAttr("trap-func-name").getValueAsString();
+ if (TrapFuncName.empty())
+ break; // Use the default handling.
+ CallLowering::CallLoweringInfo Info;
+ if (ID == Intrinsic::ubsantrap) {
+ Info.OrigArgs.push_back({getOrCreateVRegs(*CI.getArgOperand(0)),
+ CI.getArgOperand(0)->getType(), 0});
+ }
+ Info.Callee = MachineOperand::CreateES(TrapFuncName.data());
+ Info.CB = &CI;
+ Info.OrigRet = {Register(), Type::getVoidTy(CI.getContext()), 0};
+ return CLI->lowerCall(MIRBuilder, Info);
+ }
#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
case Intrinsic::INTRINSIC:
#include "llvm/IR/ConstrainedOps.def"
@@ -2321,6 +2344,8 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
if (CI.isInlineAsm())
return translateInlineAsm(CI, MIRBuilder);
+ diagnoseDontCall(CI);
+
Intrinsic::ID ID = Intrinsic::not_intrinsic;
if (F && F->isIntrinsic()) {
ID = F->getIntrinsicID();
@@ -2347,7 +2372,7 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
if (isa<FPMathOperator>(CI))
MIB->copyIRFlags(CI);
- for (auto &Arg : enumerate(CI.arg_operands())) {
+ for (auto &Arg : enumerate(CI.args())) {
// If this is required to be an immediate, don't materialize it in a
// register.
if (CI.paramHasAttr(Arg.index(), Attribute::ImmArg)) {
@@ -2360,10 +2385,15 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
} else {
MIB.addFPImm(cast<ConstantFP>(Arg.value()));
}
- } else if (auto MD = dyn_cast<MetadataAsValue>(Arg.value())) {
- auto *MDN = dyn_cast<MDNode>(MD->getMetadata());
- if (!MDN) // This was probably an MDString.
- return false;
+ } else if (auto *MDVal = dyn_cast<MetadataAsValue>(Arg.value())) {
+ auto *MD = MDVal->getMetadata();
+ auto *MDN = dyn_cast<MDNode>(MD);
+ if (!MDN) {
+ if (auto *ConstMD = dyn_cast<ConstantAsMetadata>(MD))
+ MDN = MDNode::get(MF->getFunction().getContext(), ConstMD);
+ else // This was probably an MDString.
+ return false;
+ }
MIB.addMetadata(MDN);
} else {
ArrayRef<Register> VRegs = getOrCreateVRegs(*Arg.value());
@@ -2472,32 +2502,19 @@ bool IRTranslator::translateInvoke(const User &U,
if (!isa<LandingPadInst>(EHPadBB->getFirstNonPHI()))
return false;
- bool LowerInlineAsm = false;
- if (I.isInlineAsm()) {
- const InlineAsm *IA = cast<InlineAsm>(I.getCalledOperand());
- if (!IA->canThrow()) {
- // Fast path without emitting EH_LABELs.
-
- if (!translateInlineAsm(I, MIRBuilder))
- return false;
-
- MachineBasicBlock *InvokeMBB = &MIRBuilder.getMBB(),
- *ReturnMBB = &getMBB(*ReturnBB);
-
- // Update successor info.
- addSuccessorWithProb(InvokeMBB, ReturnMBB, BranchProbability::getOne());
-
- MIRBuilder.buildBr(*ReturnMBB);
- return true;
- } else {
- LowerInlineAsm = true;
- }
- }
+ bool LowerInlineAsm = I.isInlineAsm();
+ bool NeedEHLabel = true;
+ // If it can't throw then use a fast-path without emitting EH labels.
+ if (LowerInlineAsm)
+ NeedEHLabel = (cast<InlineAsm>(I.getCalledOperand()))->canThrow();
// Emit the actual call, bracketed by EH_LABELs so that the MF knows about
// the region covered by the try.
- MCSymbol *BeginSymbol = Context.createTempSymbol();
- MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(BeginSymbol);
+ MCSymbol *BeginSymbol = nullptr;
+ if (NeedEHLabel) {
+ BeginSymbol = Context.createTempSymbol();
+ MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(BeginSymbol);
+ }
if (LowerInlineAsm) {
if (!translateInlineAsm(I, MIRBuilder))
@@ -2505,8 +2522,11 @@ bool IRTranslator::translateInvoke(const User &U,
} else if (!translateCallBase(I, MIRBuilder))
return false;
- MCSymbol *EndSymbol = Context.createTempSymbol();
- MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(EndSymbol);
+ MCSymbol *EndSymbol = nullptr;
+ if (NeedEHLabel) {
+ EndSymbol = Context.createTempSymbol();
+ MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(EndSymbol);
+ }
SmallVector<std::pair<MachineBasicBlock *, BranchProbability>, 1> UnwindDests;
BranchProbabilityInfo *BPI = FuncInfo.BPI;
@@ -2528,7 +2548,12 @@ bool IRTranslator::translateInvoke(const User &U,
}
InvokeMBB->normalizeSuccProbs();
- MF->addInvoke(&EHPadMBB, BeginSymbol, EndSymbol);
+ if (NeedEHLabel) {
+ assert(BeginSymbol && "Expected a begin symbol!");
+ assert(EndSymbol && "Expected an end symbol!");
+ MF->addInvoke(&EHPadMBB, BeginSymbol, EndSymbol);
+ }
+
MIRBuilder.buildBr(ReturnMBB);
return true;
}
@@ -2670,6 +2695,28 @@ bool IRTranslator::translateVAArg(const User &U, MachineIRBuilder &MIRBuilder) {
return true;
}
+bool IRTranslator::translateUnreachable(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ if (!MF->getTarget().Options.TrapUnreachable)
+ return true;
+
+ auto &UI = cast<UnreachableInst>(U);
+ // We may be able to ignore unreachable behind a noreturn call.
+ if (MF->getTarget().Options.NoTrapAfterNoreturn) {
+ const BasicBlock &BB = *UI.getParent();
+ if (&UI != &BB.front()) {
+ BasicBlock::const_iterator PredI =
+ std::prev(BasicBlock::const_iterator(UI));
+ if (const CallInst *Call = dyn_cast<CallInst>(&*PredI)) {
+ if (Call->doesNotReturn())
+ return true;
+ }
+ }
+ }
+
+ MIRBuilder.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
+ return true;
+}
+
bool IRTranslator::translateInsertElement(const User &U,
MachineIRBuilder &MIRBuilder) {
// If it is a <1 x Ty> vector, use the scalar as it is
@@ -2757,14 +2804,11 @@ bool IRTranslator::translateAtomicCmpXchg(const User &U,
Register Cmp = getOrCreateVReg(*I.getCompareOperand());
Register NewVal = getOrCreateVReg(*I.getNewValOperand());
- AAMDNodes AAMetadata;
- I.getAAMetadata(AAMetadata);
-
MIRBuilder.buildAtomicCmpXchgWithSuccess(
OldValRes, SuccessRes, Addr, Cmp, NewVal,
*MF->getMachineMemOperand(
MachinePointerInfo(I.getPointerOperand()), Flags, MRI->getType(Cmp),
- getMemOpAlign(I), AAMetadata, nullptr, I.getSyncScopeID(),
+ getMemOpAlign(I), I.getAAMetadata(), nullptr, I.getSyncScopeID(),
I.getSuccessOrdering(), I.getFailureOrdering()));
return true;
}
@@ -2824,14 +2868,11 @@ bool IRTranslator::translateAtomicRMW(const User &U,
break;
}
- AAMDNodes AAMetadata;
- I.getAAMetadata(AAMetadata);
-
MIRBuilder.buildAtomicRMW(
Opcode, Res, Addr, Val,
*MF->getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()),
Flags, MRI->getType(Val), getMemOpAlign(I),
- AAMetadata, nullptr, I.getSyncScopeID(),
+ I.getAAMetadata(), nullptr, I.getSyncScopeID(),
I.getOrdering()));
return true;
}
@@ -2985,7 +3026,8 @@ bool IRTranslator::translate(const Constant &C, Register Reg) {
return true;
}
-void IRTranslator::finalizeBasicBlock() {
+bool IRTranslator::finalizeBasicBlock(const BasicBlock &BB,
+ MachineBasicBlock &MBB) {
for (auto &BTB : SL->BitTestCases) {
// Emit header first, if it wasn't already emitted.
if (!BTB.Emitted)
@@ -3005,7 +3047,7 @@ void IRTranslator::finalizeBasicBlock() {
// test, and delete the last bit test.
MachineBasicBlock *NextMBB;
- if (BTB.ContiguousRange && j + 2 == ej) {
+ if ((BTB.ContiguousRange || BTB.FallthroughUnreachable) && j + 2 == ej) {
// Second-to-last bit-test with contiguous range: fall through to the
// target of the final bit test.
NextMBB = BTB.Cases[j + 1].TargetBB;
@@ -3019,7 +3061,7 @@ void IRTranslator::finalizeBasicBlock() {
emitBitTestCase(BTB, NextMBB, UnhandledProb, BTB.Reg, BTB.Cases[j], MBB);
- if (BTB.ContiguousRange && j + 2 == ej) {
+ if ((BTB.ContiguousRange || BTB.FallthroughUnreachable) && j + 2 == ej) {
// We need to record the replacement phi edge here that normally
// happens in emitBitTestCase before we delete the case, otherwise the
// phi edge will be lost.
@@ -3054,6 +3096,176 @@ void IRTranslator::finalizeBasicBlock() {
for (auto &SwCase : SL->SwitchCases)
emitSwitchCase(SwCase, &CurBuilder->getMBB(), *CurBuilder);
SL->SwitchCases.clear();
+
+ // Check if we need to generate stack-protector guard checks.
+ StackProtector &SP = getAnalysis<StackProtector>();
+ if (SP.shouldEmitSDCheck(BB)) {
+ const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
+ bool FunctionBasedInstrumentation =
+ TLI.getSSPStackGuardCheck(*MF->getFunction().getParent());
+ SPDescriptor.initialize(&BB, &MBB, FunctionBasedInstrumentation);
+ }
+ // Handle stack protector.
+ if (SPDescriptor.shouldEmitFunctionBasedCheckStackProtector()) {
+ LLVM_DEBUG(dbgs() << "Unimplemented stack protector case\n");
+ return false;
+ } else if (SPDescriptor.shouldEmitStackProtector()) {
+ MachineBasicBlock *ParentMBB = SPDescriptor.getParentMBB();
+ MachineBasicBlock *SuccessMBB = SPDescriptor.getSuccessMBB();
+
+ // Find the split point to split the parent mbb. At the same time copy all
+ // physical registers used in the tail of parent mbb into virtual registers
+ // before the split point and back into physical registers after the split
+ // point. This prevents us from having to deal with Live-ins and many other
+ // register allocation issues caused by us splitting the parent mbb. The
+ // register allocator will clean up said virtual copies later on.
+ MachineBasicBlock::iterator SplitPoint = findSplitPointForStackProtector(
+ ParentMBB, *MF->getSubtarget().getInstrInfo());
+
+ // Splice the terminator of ParentMBB into SuccessMBB.
+ SuccessMBB->splice(SuccessMBB->end(), ParentMBB, SplitPoint,
+ ParentMBB->end());
+
+ // Add compare/jump on neq/jump to the parent BB.
+ if (!emitSPDescriptorParent(SPDescriptor, ParentMBB))
+ return false;
+
+ // CodeGen Failure MBB if we have not codegened it yet.
+ MachineBasicBlock *FailureMBB = SPDescriptor.getFailureMBB();
+ if (FailureMBB->empty()) {
+ if (!emitSPDescriptorFailure(SPDescriptor, FailureMBB))
+ return false;
+ }
+
+ // Clear the Per-BB State.
+ SPDescriptor.resetPerBBState();
+ }
+ return true;
+}
+
+bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD,
+ MachineBasicBlock *ParentBB) {
+ CurBuilder->setInsertPt(*ParentBB, ParentBB->end());
+ // First create the loads to the guard/stack slot for the comparison.
+ const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
+ Type *PtrIRTy = Type::getInt8PtrTy(MF->getFunction().getContext());
+ const LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
+ LLT PtrMemTy = getLLTForMVT(TLI.getPointerMemTy(*DL));
+
+ MachineFrameInfo &MFI = ParentBB->getParent()->getFrameInfo();
+ int FI = MFI.getStackProtectorIndex();
+
+ Register Guard;
+ Register StackSlotPtr = CurBuilder->buildFrameIndex(PtrTy, FI).getReg(0);
+ const Module &M = *ParentBB->getParent()->getFunction().getParent();
+ Align Align = DL->getPrefTypeAlign(Type::getInt8PtrTy(M.getContext()));
+
+ // Generate code to load the content of the guard slot.
+ Register GuardVal =
+ CurBuilder
+ ->buildLoad(PtrMemTy, StackSlotPtr,
+ MachinePointerInfo::getFixedStack(*MF, FI), Align,
+ MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile)
+ .getReg(0);
+
+ if (TLI.useStackGuardXorFP()) {
+ LLVM_DEBUG(dbgs() << "Stack protector xor'ing with FP not yet implemented");
+ return false;
+ }
+
+ // Retrieve guard check function, nullptr if instrumentation is inlined.
+ if (const Function *GuardCheckFn = TLI.getSSPStackGuardCheck(M)) {
+ // This path is currently untestable on GlobalISel, since the only platform
+ // that needs this seems to be Windows, and we fall back on that currently.
+ // The code still lives here in case that changes.
+ // Silence warning about unused variable until the code below that uses
+ // 'GuardCheckFn' is enabled.
+ (void)GuardCheckFn;
+ return false;
+#if 0
+ // The target provides a guard check function to validate the guard value.
+ // Generate a call to that function with the content of the guard slot as
+ // argument.
+ FunctionType *FnTy = GuardCheckFn->getFunctionType();
+ assert(FnTy->getNumParams() == 1 && "Invalid function signature");
+ ISD::ArgFlagsTy Flags;
+ if (GuardCheckFn->hasAttribute(1, Attribute::AttrKind::InReg))
+ Flags.setInReg();
+ CallLowering::ArgInfo GuardArgInfo(
+ {GuardVal, FnTy->getParamType(0), {Flags}});
+
+ CallLowering::CallLoweringInfo Info;
+ Info.OrigArgs.push_back(GuardArgInfo);
+ Info.CallConv = GuardCheckFn->getCallingConv();
+ Info.Callee = MachineOperand::CreateGA(GuardCheckFn, 0);
+ Info.OrigRet = {Register(), FnTy->getReturnType()};
+ if (!CLI->lowerCall(MIRBuilder, Info)) {
+ LLVM_DEBUG(dbgs() << "Failed to lower call to stack protector check\n");
+ return false;
+ }
+ return true;
+#endif
+ }
+
+ // If useLoadStackGuardNode returns true, generate LOAD_STACK_GUARD.
+ // Otherwise, emit a volatile load to retrieve the stack guard value.
+ if (TLI.useLoadStackGuardNode()) {
+ Guard =
+ MRI->createGenericVirtualRegister(LLT::scalar(PtrTy.getSizeInBits()));
+ getStackGuard(Guard, *CurBuilder);
+ } else {
+ // TODO: test using android subtarget when we support @llvm.thread.pointer.
+ const Value *IRGuard = TLI.getSDagStackGuard(M);
+ Register GuardPtr = getOrCreateVReg(*IRGuard);
+
+ Guard = CurBuilder
+ ->buildLoad(PtrMemTy, GuardPtr,
+ MachinePointerInfo::getFixedStack(*MF, FI), Align,
+ MachineMemOperand::MOLoad |
+ MachineMemOperand::MOVolatile)
+ .getReg(0);
+ }
+
+ // Perform the comparison.
+ auto Cmp =
+ CurBuilder->buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Guard, GuardVal);
+ // If the guard/stackslot do not equal, branch to failure MBB.
+ CurBuilder->buildBrCond(Cmp, *SPD.getFailureMBB());
+ // Otherwise branch to success MBB.
+ CurBuilder->buildBr(*SPD.getSuccessMBB());
+ return true;
+}
+
+bool IRTranslator::emitSPDescriptorFailure(StackProtectorDescriptor &SPD,
+ MachineBasicBlock *FailureBB) {
+ CurBuilder->setInsertPt(*FailureBB, FailureBB->end());
+ const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
+
+ const RTLIB::Libcall Libcall = RTLIB::STACKPROTECTOR_CHECK_FAIL;
+ const char *Name = TLI.getLibcallName(Libcall);
+
+ CallLowering::CallLoweringInfo Info;
+ Info.CallConv = TLI.getLibcallCallingConv(Libcall);
+ Info.Callee = MachineOperand::CreateES(Name);
+ Info.OrigRet = {Register(), Type::getVoidTy(MF->getFunction().getContext()),
+ 0};
+ if (!CLI->lowerCall(*CurBuilder, Info)) {
+ LLVM_DEBUG(dbgs() << "Failed to lower call to stack protector fail\n");
+ return false;
+ }
+
+ // On PS4, the "return address" must still be within the calling function,
+ // even if it's at the very end, so emit an explicit TRAP here.
+ // Passing 'true' for doesNotReturn above won't generate the trap for us.
+ // WebAssembly needs an unreachable instruction after a non-returning call,
+ // because the function return type can be different from __stack_chk_fail's
+ // return type (void).
+ const TargetMachine &TM = MF->getTarget();
+ if (TM.getTargetTriple().isPS4CPU() || TM.getTargetTriple().isWasm()) {
+ LLVM_DEBUG(dbgs() << "Unhandled trap emission for stack protector fail\n");
+ return false;
+ }
+ return true;
}
void IRTranslator::finalizeFunction() {
@@ -3069,6 +3281,7 @@ void IRTranslator::finalizeFunction() {
EntryBuilder.reset();
CurBuilder.reset();
FuncInfo.clear();
+ SPDescriptor.resetPerFunctionState();
}
/// Returns true if a BasicBlock \p BB within a variadic function contains a
@@ -3079,7 +3292,7 @@ static bool checkForMustTailInVarArgFn(bool IsVarArg, const BasicBlock &BB) {
// Walk the block backwards, because tail calls usually only appear at the end
// of a block.
- return std::any_of(BB.rbegin(), BB.rend(), [](const Instruction &I) {
+ return llvm::any_of(llvm::reverse(BB), [](const Instruction &I) {
const auto *CI = dyn_cast<CallInst>(&I);
return CI && CI->isMustTailCall();
});
@@ -3088,8 +3301,6 @@ static bool checkForMustTailInVarArgFn(bool IsVarArg, const BasicBlock &BB) {
bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
MF = &CurMF;
const Function &F = MF->getFunction();
- if (F.empty())
- return false;
GISelCSEAnalysisWrapper &Wrapper =
getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
// Set the CSEConfig and run the analysis.
@@ -3257,7 +3468,8 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
return false;
}
- finalizeBasicBlock();
+ if (!finalizeBasicBlock(*BB, MBB))
+ return false;
}
#ifndef NDEBUG
WrapperObserver.removeObserver(&Verifier);
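The diagnoseDontCall(CI) call added to translateCall reports calls to functions carrying the "dontcall-error"/"dontcall-warn" IR attributes, matching what the SelectionDAG path already does. A minimal sketch of source that would exercise it, assuming Clang's error function attribute (which lowers to dontcall-error); the function names are illustrative only, not part of this patch:

    // Sketch only, under the assumptions above.
    __attribute__((error("do not call"))) void forbidden();
    void caller() {
      forbidden(); // now also diagnosed when the function is selected via GlobalISel
    }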
diff --git a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
index bb4d41cfd69f..4ae427484945 100644
--- a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
@@ -325,7 +325,8 @@ bool InlineAsmLowering::lowerInlineAsm(
return false;
}
- OpInfo.ConstraintVT = TLI->getValueType(DL, OpTy, true).getSimpleVT();
+ OpInfo.ConstraintVT =
+ TLI->getAsmOperandValueType(DL, OpTy, true).getSimpleVT();
} else if (OpInfo.Type == InlineAsm::isOutput && !OpInfo.isIndirect) {
assert(!Call.getType()->isVoidTy() && "Bad inline asm!");
@@ -334,13 +335,17 @@ bool InlineAsmLowering::lowerInlineAsm(
TLI->getSimpleValueType(DL, STy->getElementType(ResNo));
} else {
assert(ResNo == 0 && "Asm only has one result!");
- OpInfo.ConstraintVT = TLI->getSimpleValueType(DL, Call.getType());
+ OpInfo.ConstraintVT =
+ TLI->getAsmOperandValueType(DL, Call.getType()).getSimpleVT();
}
++ResNo;
} else {
OpInfo.ConstraintVT = MVT::Other;
}
+ if (OpInfo.ConstraintVT == MVT::i64x8)
+ return false;
+
// Compute the constraint code and ConstraintType to use.
computeConstraintToUse(TLI, OpInfo);
diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
index 75a8f03fcb3f..9b2692486384 100644
--- a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
@@ -20,8 +20,8 @@
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
-#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -30,9 +30,9 @@
#include "llvm/Config/config.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "instruction-select"
@@ -130,9 +130,12 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
// Until then, keep track of the number of blocks to assert that we don't.
const size_t NumBlocks = MF.size();
#endif
+ // Keep track of selected blocks, so we can delete unreachable ones later.
+ DenseSet<MachineBasicBlock *> SelectedBlocks;
for (MachineBasicBlock *MBB : post_order(&MF)) {
ISel->CurMBB = MBB;
+ SelectedBlocks.insert(MBB);
if (MBB->empty())
continue;
@@ -205,6 +208,15 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
if (MBB.empty())
continue;
+ if (!SelectedBlocks.contains(&MBB)) {
+ // This is an unreachable block and therefore hasn't been selected, since
+ // the main selection loop above uses a postorder block traversal.
+ // We delete all the instructions in this block since it's unreachable.
+ MBB.clear();
+ // Don't delete the block in case the block has its address taken or is
+ // still being referenced by a phi somewhere.
+ continue;
+ }
// Try to find redundant copies b/w vregs of the same register class.
bool ReachedBegin = false;
for (auto MII = std::prev(MBB.end()), Begin = MBB.begin(); !ReachedBegin;) {
diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
index 4fec9e628ddb..dc5a4d8f85aa 100644
--- a/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
@@ -37,7 +37,7 @@ bool InstructionSelector::isOperandImmEqual(
const MachineOperand &MO, int64_t Value,
const MachineRegisterInfo &MRI) const {
if (MO.isReg() && MO.getReg())
- if (auto VRegVal = getConstantVRegValWithLookThrough(MO.getReg(), MRI))
+ if (auto VRegVal = getIConstantVRegValWithLookThrough(MO.getReg(), MRI))
return VRegVal->Value.getSExtValue() == Value;
return false;
}
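getIConstantVRegValWithLookThrough, used in isOperandImmEqual above and throughout the LegalizerHelper changes below, is the integer-only replacement for getConstantVRegValWithLookThrough: it looks through copies and extensions and succeeds only for values defined by G_CONSTANT. A minimal usage sketch, assuming a Register Reg and a MachineRegisterInfo MRI are in scope:

    // Sketch only: fold against a known integer constant if one feeds Reg.
    if (auto VRegVal = getIConstantVRegValWithLookThrough(Reg, MRI)) {
      int64_t Imm = VRegVal->Value.getSExtValue(); // Value is an APInt
      (void)Imm; // ... use the constant to fold or select ...
    }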
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
index 7c5e4e52ca3e..1f0738a8d9d2 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
@@ -153,6 +153,14 @@ LegalityPredicate LegalityPredicates::scalarOrEltSizeNotPow2(unsigned TypeIdx) {
};
}
+LegalityPredicate LegalityPredicates::sizeNotMultipleOf(unsigned TypeIdx,
+ unsigned Size) {
+ return [=](const LegalityQuery &Query) {
+ const LLT QueryTy = Query.Types[TypeIdx];
+ return QueryTy.isScalar() && QueryTy.getSizeInBits() % Size != 0;
+ };
+}
+
LegalityPredicate LegalityPredicates::sizeNotPow2(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT QueryTy = Query.Types[TypeIdx];
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
index fc2570ae4b8e..75b7fcb5663a 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
@@ -63,6 +63,16 @@ LegalizeMutation LegalizeMutations::widenScalarOrEltToNextPow2(unsigned TypeIdx,
};
}
+LegalizeMutation
+LegalizeMutations::widenScalarOrEltToNextMultipleOf(unsigned TypeIdx,
+ unsigned Size) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ unsigned NewEltSizeInBits = alignTo(Ty.getScalarSizeInBits(), Size);
+ return std::make_pair(TypeIdx, Ty.changeElementSize(NewEltSizeInBits));
+ };
+}
+
LegalizeMutation LegalizeMutations::moreElementsToNextPow2(unsigned TypeIdx,
unsigned Min) {
return [=](const LegalityQuery &Query) {
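Together with the sizeNotMultipleOf predicate added in LegalityPredicates.cpp above, this mutation lets a target round scalar (or vector element) types up to the next multiple of a given size. A hypothetical rule sketch, assumed to live in a target's LegalizerInfo constructor; the opcode and the multiple of 32 are illustrative, not taken from this patch:

    // Sketch only: widen scalars whose size is not a multiple of 32 bits.
    using namespace LegalityPredicates;
    using namespace LegalizeMutations;
    getActionDefinitionsBuilder(TargetOpcode::G_ADD)
        .legalFor({LLT::scalar(32), LLT::scalar(64)})
        .widenScalarIf(sizeNotMultipleOf(0, 32),
                       widenScalarOrEltToNextMultipleOf(0, 32));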
diff --git a/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp b/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp
index 635b1445ee07..0ab4a7f64840 100644
--- a/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp
@@ -218,9 +218,6 @@ Legalizer::legalizeMachineFunction(MachineFunction &MF, const LegalizerInfo &LI,
RAIIMFObsDelInstaller Installer(MF, WrapperObserver);
LegalizerHelper Helper(MF, LI, WrapperObserver, MIRBuilder);
LegalizationArtifactCombiner ArtCombiner(MIRBuilder, MRI, LI);
- auto RemoveDeadInstFromLists = [&WrapperObserver](MachineInstr *DeadMI) {
- WrapperObserver.erasingInstr(*DeadMI);
- };
bool Changed = false;
SmallVector<MachineInstr *, 128> RetryList;
do {
@@ -232,9 +229,7 @@ Legalizer::legalizeMachineFunction(MachineFunction &MF, const LegalizerInfo &LI,
assert(isPreISelGenericOpcode(MI.getOpcode()) &&
"Expecting generic opcode");
if (isTriviallyDead(MI, MRI)) {
- LLVM_DEBUG(dbgs() << MI << "Is dead; erasing.\n");
- MI.eraseFromParentAndMarkDBGValuesForRemoval();
- LocObserver.checkpoint(false);
+ eraseInstr(MI, MRI, &LocObserver);
continue;
}
@@ -281,10 +276,7 @@ Legalizer::legalizeMachineFunction(MachineFunction &MF, const LegalizerInfo &LI,
assert(isPreISelGenericOpcode(MI.getOpcode()) &&
"Expecting generic opcode");
if (isTriviallyDead(MI, MRI)) {
- LLVM_DEBUG(dbgs() << MI << "Is dead\n");
- RemoveDeadInstFromLists(&MI);
- MI.eraseFromParentAndMarkDBGValuesForRemoval();
- LocObserver.checkpoint(false);
+ eraseInstr(MI, MRI, &LocObserver);
continue;
}
SmallVector<MachineInstr *, 4> DeadInstructions;
@@ -292,11 +284,7 @@ Legalizer::legalizeMachineFunction(MachineFunction &MF, const LegalizerInfo &LI,
if (ArtCombiner.tryCombineInstruction(MI, DeadInstructions,
WrapperObserver)) {
WorkListObserver.printNewInstrs();
- for (auto *DeadMI : DeadInstructions) {
- LLVM_DEBUG(dbgs() << "Is dead: " << *DeadMI);
- RemoveDeadInstFromLists(DeadMI);
- DeadMI->eraseFromParentAndMarkDBGValuesForRemoval();
- }
+ eraseInstrs(DeadInstructions, MRI, &LocObserver);
LocObserver.checkpoint(
VerifyDebugLocs ==
DebugLocVerifyLevel::LegalizationsAndArtifactCombiners);
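Both dead-instruction paths above now go through the eraseInstr/eraseInstrs helpers (from GlobalISel/Utils.h), which erase the instructions and take care of the LostDebugLocObserver checkpointing that was previously open-coded at each call site. A sketch of the resulting call-site shape, assuming MRI and LocObserver are in scope:

    // Sketch only: mirrors the two call sites in the hunks above.
    if (isTriviallyDead(MI, MRI))
      eraseInstr(MI, MRI, &LocObserver);                // single dead instruction
    // ...
    eraseInstrs(DeadInstructions, MRI, &LocObserver);   // batch from the combiner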
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index c1e0d2549c42..c74bec7dfc0d 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -29,6 +29,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "legalizer"
@@ -497,8 +498,8 @@ static bool isLibCallInTailPosition(MachineInstr &MI,
return false;
// It's not safe to eliminate the sign / zero extension of the return value.
- if (CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt) ||
- CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
+ if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
+ CallerAttrs.hasRetAttr(Attribute::SExt))
return false;
// Only tail call if the following instruction is a standard return or if we
@@ -2051,10 +2052,15 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
Register SrcReg = MI.getOperand(1).getReg();
- // First ZEXT the input.
- auto MIBSrc = MIRBuilder.buildZExt(WideTy, SrcReg);
+ // First extend the input.
+ unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
+ MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
+ ? TargetOpcode::G_ANYEXT
+ : TargetOpcode::G_ZEXT;
+ auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
LLT CurTy = MRI.getType(SrcReg);
- if (MI.getOpcode() == TargetOpcode::G_CTTZ) {
+ unsigned NewOpc = MI.getOpcode();
+ if (NewOpc == TargetOpcode::G_CTTZ) {
// The count is the same in the larger type except if the original
// value was zero. This can be handled by setting the bit just off
// the top of the original type.
@@ -2062,10 +2068,12 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
MIBSrc = MIRBuilder.buildOr(
WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
+ // Now we know the operand is non-zero, so use the more relaxed opcode.
+ NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
}
// Perform the operation at the larger size.
- auto MIBNewOp = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy}, {MIBSrc});
+ auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
// This is already the correct result for CTPOP and CTTZs
if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
@@ -2427,7 +2435,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
widenScalarSrc(
MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
- TargetOpcode::G_SEXT);
+ TargetOpcode::G_ANYEXT);
widenScalarDst(MI, WideTy, 0);
Observer.changedInstr(MI);
@@ -2662,7 +2670,7 @@ static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
// Now figure out the amount we need to shift to get the target bits.
auto OffsetMask = B.buildConstant(
- IdxTy, ~(APInt::getAllOnesValue(IdxTy.getSizeInBits()) << Log2EltRatio));
+ IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
return B.buildShl(IdxTy, OffsetIdx,
B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
@@ -2886,13 +2894,14 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
MachineMemOperand &MMO = LoadMI.getMMO();
LLT MemTy = MMO.getMemoryType();
MachineFunction &MF = MIRBuilder.getMF();
- if (MemTy.isVector())
- return UnableToLegalize;
unsigned MemSizeInBits = MemTy.getSizeInBits();
unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();
if (MemSizeInBits != MemStoreSizeInBits) {
+ if (MemTy.isVector())
+ return UnableToLegalize;
+
// Promote to a byte-sized load if not loading an integral number of
// bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
@@ -2928,16 +2937,12 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
return Legalized;
}
- // This load needs splitting into power of 2 sized loads.
- if (DstTy.isVector())
- return UnableToLegalize;
- if (isPowerOf2_32(MemSizeInBits))
- return UnableToLegalize; // Don't know what we're being asked to do.
-
// Big endian lowering not implemented.
if (MIRBuilder.getDataLayout().isBigEndian())
return UnableToLegalize;
+ // This load needs splitting into power of 2 sized loads.
+ //
// Our strategy here is to generate anyextending loads for the smaller
// types up to next power-2 result type, and then combine the two larger
// result values together, before truncating back down to the non-pow-2
@@ -2950,8 +2955,34 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
// v1 = i24 trunc v5
// By doing this we generate the correct truncate which should get
// combined away as an artifact with a matching extend.
- uint64_t LargeSplitSize = PowerOf2Floor(MemSizeInBits);
- uint64_t SmallSplitSize = MemSizeInBits - LargeSplitSize;
+
+ uint64_t LargeSplitSize, SmallSplitSize;
+
+ if (!isPowerOf2_32(MemSizeInBits)) {
+ // This load needs splitting into power of 2 sized loads.
+ LargeSplitSize = PowerOf2Floor(MemSizeInBits);
+ SmallSplitSize = MemSizeInBits - LargeSplitSize;
+ } else {
+ // This is already a power of 2, but we still need to split this in half.
+ //
+ // Assume we're being asked to decompose an unaligned load.
+ // TODO: If this requires multiple splits, handle them all at once.
+ auto &Ctx = MF.getFunction().getContext();
+ if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
+ return UnableToLegalize;
+
+ SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
+ }
+
+ if (MemTy.isVector()) {
+ // TODO: Handle vector extloads
+ if (MemTy != DstTy)
+ return UnableToLegalize;
+
+ // TODO: We can do better than scalarizing the vector and at least split it
+ // in half.
+ return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
+ }
MachineMemOperand *LargeMMO =
MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
@@ -2976,9 +3007,16 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
if (AnyExtTy == DstTy)
MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
- else {
+ else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
MIRBuilder.buildTrunc(DstReg, {Or});
+ } else {
+ assert(DstTy.isPointer() && "expected pointer");
+ auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
+
+ // FIXME: We currently consider this to be illegal for non-integral address
+ // spaces, but we still need a way to reinterpret the bits.
+ MIRBuilder.buildIntToPtr(DstReg, Or);
}
LoadMI.eraseFromParent();
@@ -2999,13 +3037,13 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
MachineMemOperand &MMO = **StoreMI.memoperands_begin();
LLT MemTy = MMO.getMemoryType();
- if (SrcTy.isVector())
- return UnableToLegalize;
-
unsigned StoreWidth = MemTy.getSizeInBits();
unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
if (StoreWidth != StoreSizeInBits) {
+ if (SrcTy.isVector())
+ return UnableToLegalize;
+
// Promote to a byte-sized store with upper bits zero if not
// storing an integral number of bytes. For example, promote
// TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
@@ -3026,18 +3064,44 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
return Legalized;
}
- if (isPowerOf2_32(MemTy.getSizeInBits()))
- return UnableToLegalize; // Don't know what we're being asked to do.
+ if (MemTy.isVector()) {
+ // TODO: Handle vector trunc stores
+ if (MemTy != SrcTy)
+ return UnableToLegalize;
+
+ // TODO: We can do better than scalarizing the vector and at least split it
+ // in half.
+ return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
+ }
+
+ unsigned MemSizeInBits = MemTy.getSizeInBits();
+ uint64_t LargeSplitSize, SmallSplitSize;
+
+ if (!isPowerOf2_32(MemSizeInBits)) {
+ LargeSplitSize = PowerOf2Floor(MemTy.getSizeInBits());
+ SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
+ } else {
+ auto &Ctx = MF.getFunction().getContext();
+ if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
+ return UnableToLegalize; // Don't know what we're being asked to do.
+
+ SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
+ }
// Extend to the next pow-2. If this store was itself the result of lowering,
// e.g. an s56 store being broken into s32 + s24, we might have a stored type
- // that's wider the stored size.
- const LLT NewSrcTy = LLT::scalar(NextPowerOf2(MemTy.getSizeInBits()));
+ // that's wider than the stored size.
+ unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
+ const LLT NewSrcTy = LLT::scalar(AnyExtSize);
+
+ if (SrcTy.isPointer()) {
+ const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
+ SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
+ }
+
auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);
// Obtain the smaller value by shifting away the larger value.
- uint64_t LargeSplitSize = PowerOf2Floor(MemTy.getSizeInBits());
- uint64_t SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);
@@ -3045,9 +3109,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
LLT PtrTy = MRI.getType(PtrReg);
auto OffsetCst = MIRBuilder.buildConstant(
LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
- Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
auto SmallPtr =
- MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
+ MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
MachineMemOperand *LargeMMO =
MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
@@ -3424,6 +3487,14 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
case G_ROTL:
case G_ROTR:
return lowerRotate(MI);
+ case G_MEMSET:
+ case G_MEMCPY:
+ case G_MEMMOVE:
+ return lowerMemCpyFamily(MI);
+ case G_MEMCPY_INLINE:
+ return lowerMemcpyInline(MI);
+ GISEL_VECREDUCE_CASES_NONSEQ
+ return lowerVectorReduction(MI);
}
}
@@ -4004,9 +4075,7 @@ LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
// If the index is a constant, we can really break this down as you would
// expect, and index into the target size pieces.
int64_t IdxVal;
- auto MaybeCst =
- getConstantVRegValWithLookThrough(Idx, MRI, /*LookThroughInstrs*/ true,
- /*HandleFConstants*/ false);
+ auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
if (MaybeCst) {
IdxVal = MaybeCst->Value.getSExtValue();
// Avoid out of bounds indexing the pieces.
@@ -4363,6 +4432,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
case G_FMAXIMUM:
case G_FSHL:
case G_FSHR:
+ case G_ROTL:
+ case G_ROTR:
case G_FREEZE:
case G_SADDSAT:
case G_SSUBSAT:
@@ -4572,35 +4643,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
return Legalized;
}
-LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
- MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
- unsigned Opc = MI.getOpcode();
- assert(Opc != TargetOpcode::G_VECREDUCE_SEQ_FADD &&
- Opc != TargetOpcode::G_VECREDUCE_SEQ_FMUL &&
- "Sequential reductions not expected");
-
- if (TypeIdx != 1)
- return UnableToLegalize;
-
- // The semantics of the normal non-sequential reductions allow us to freely
- // re-associate the operation.
- Register SrcReg = MI.getOperand(1).getReg();
- LLT SrcTy = MRI.getType(SrcReg);
- Register DstReg = MI.getOperand(0).getReg();
- LLT DstTy = MRI.getType(DstReg);
-
- if (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0)
- return UnableToLegalize;
-
- SmallVector<Register> SplitSrcs;
- const unsigned NumParts = SrcTy.getNumElements() / NarrowTy.getNumElements();
- extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
- SmallVector<Register> PartialReductions;
- for (unsigned Part = 0; Part < NumParts; ++Part) {
- PartialReductions.push_back(
- MIRBuilder.buildInstr(Opc, {DstTy}, {SplitSrcs[Part]}).getReg(0));
- }
-
+static unsigned getScalarOpcForReduction(unsigned Opc) {
unsigned ScalarOpc;
switch (Opc) {
case TargetOpcode::G_VECREDUCE_FADD:
@@ -4643,10 +4686,81 @@ LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
ScalarOpc = TargetOpcode::G_UMIN;
break;
default:
- LLVM_DEBUG(dbgs() << "Can't legalize: unknown reduction kind.\n");
+ llvm_unreachable("Unhandled reduction");
+ }
+ return ScalarOpc;
+}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
+ MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
+ unsigned Opc = MI.getOpcode();
+ assert(Opc != TargetOpcode::G_VECREDUCE_SEQ_FADD &&
+ Opc != TargetOpcode::G_VECREDUCE_SEQ_FMUL &&
+ "Sequential reductions not expected");
+
+ if (TypeIdx != 1)
return UnableToLegalize;
+
+ // The semantics of the normal non-sequential reductions allow us to freely
+ // re-associate the operation.
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+
+ if (NarrowTy.isVector() &&
+ (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
+ return UnableToLegalize;
+
+ unsigned ScalarOpc = getScalarOpcForReduction(Opc);
+ SmallVector<Register> SplitSrcs;
+ // If NarrowTy is a scalar then we're being asked to scalarize.
+ const unsigned NumParts =
+ NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
+ : SrcTy.getNumElements();
+
+ extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
+ if (NarrowTy.isScalar()) {
+ if (DstTy != NarrowTy)
+ return UnableToLegalize; // FIXME: handle implicit extensions.
+
+ if (isPowerOf2_32(NumParts)) {
+ // Generate a tree of scalar operations to reduce the critical path.
+ SmallVector<Register> PartialResults;
+ unsigned NumPartsLeft = NumParts;
+ while (NumPartsLeft > 1) {
+ for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
+ PartialResults.emplace_back(
+ MIRBuilder
+ .buildInstr(ScalarOpc, {NarrowTy},
+ {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
+ .getReg(0));
+ }
+ SplitSrcs = PartialResults;
+ PartialResults.clear();
+ NumPartsLeft = SplitSrcs.size();
+ }
+ assert(SplitSrcs.size() == 1);
+ MIRBuilder.buildCopy(DstReg, SplitSrcs[0]);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ // If we can't generate a tree, then just do sequential operations.
+ Register Acc = SplitSrcs[0];
+ for (unsigned Idx = 1; Idx < NumParts; ++Idx)
+ Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]})
+ .getReg(0);
+ MIRBuilder.buildCopy(DstReg, Acc);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ SmallVector<Register> PartialReductions;
+ for (unsigned Part = 0; Part < NumParts; ++Part) {
+ PartialReductions.push_back(
+ MIRBuilder.buildInstr(Opc, {DstTy}, {SplitSrcs[Part]}).getReg(0));
}
+
// If the types involved are powers of 2, we can generate intermediate vector
// ops, before generating a final reduction operation.
if (isPowerOf2_32(SrcTy.getNumElements()) &&
@@ -4706,7 +4820,7 @@ LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
Register InH = MRI.createGenericVirtualRegister(HalfTy);
MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
- if (Amt.isNullValue()) {
+ if (Amt.isZero()) {
MIRBuilder.buildMerge(MI.getOperand(0), {InL, InH});
MI.eraseFromParent();
return Legalized;
@@ -4815,10 +4929,9 @@ LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
const LLT HalfTy = LLT::scalar(NewBitSize);
const LLT CondTy = LLT::scalar(1);
- if (const MachineInstr *KShiftAmt =
- getOpcodeDef(TargetOpcode::G_CONSTANT, Amt, MRI)) {
- return narrowScalarShiftByConstant(
- MI, KShiftAmt->getOperand(1).getCImm()->getValue(), HalfTy, ShiftAmtTy);
+ if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
+ return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
+ ShiftAmtTy);
}
// TODO: Expand with known bits.
@@ -5224,26 +5337,23 @@ LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
if (Ty.isVector())
return UnableToLegalize;
- unsigned SrcSize = MRI.getType(Src1).getSizeInBits();
- unsigned DstSize = Ty.getSizeInBits();
+ unsigned Size = Ty.getSizeInBits();
unsigned NarrowSize = NarrowTy.getSizeInBits();
- if (DstSize % NarrowSize != 0 || SrcSize % NarrowSize != 0)
+ if (Size % NarrowSize != 0)
return UnableToLegalize;
- unsigned NumDstParts = DstSize / NarrowSize;
- unsigned NumSrcParts = SrcSize / NarrowSize;
+ unsigned NumParts = Size / NarrowSize;
bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
- unsigned DstTmpParts = NumDstParts * (IsMulHigh ? 2 : 1);
+ unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
SmallVector<Register, 2> Src1Parts, Src2Parts;
SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
- extractParts(Src1, NarrowTy, NumSrcParts, Src1Parts);
- extractParts(Src2, NarrowTy, NumSrcParts, Src2Parts);
+ extractParts(Src1, NarrowTy, NumParts, Src1Parts);
+ extractParts(Src2, NarrowTy, NumParts, Src2Parts);
multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
// Take only high half of registers if this is high mul.
- ArrayRef<Register> DstRegs(
- IsMulHigh ? &DstTmpRegs[DstTmpParts / 2] : &DstTmpRegs[0], NumDstParts);
+ ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
MIRBuilder.buildMerge(DstReg, DstRegs);
MI.eraseFromParent();
return Legalized;
@@ -5951,7 +6061,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
Register Src = MI.getOperand(1).getReg();
Register Amt = MI.getOperand(2).getReg();
LLT DstTy = MRI.getType(Dst);
- LLT SrcTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Src);
LLT AmtTy = MRI.getType(Amt);
unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
@@ -5965,6 +6075,27 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
isPowerOf2_32(EltSizeInBits))
return lowerRotateWithReverseRotate(MI);
+ // If a funnel shift is supported, use it.
+ unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
+ unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
+ bool IsFShLegal = false;
+ if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
+ LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
+ auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
+ Register R3) {
+ MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
+ MI.eraseFromParent();
+ return Legalized;
+ };
+ // If a funnel shift in the other direction is supported, use it.
+ if (IsFShLegal) {
+ return buildFunnelShift(FShOpc, Dst, Src, Amt);
+ } else if (isPowerOf2_32(EltSizeInBits)) {
+ Amt = MIRBuilder.buildNeg(DstTy, Amt).getReg(0);
+ return buildFunnelShift(RevFsh, Dst, Src, Amt);
+ }
+ }
+
auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
@@ -6150,7 +6281,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
: APFloat::IEEEdouble(),
- APInt::getNullValue(SrcTy.getSizeInBits()));
+ APInt::getZero(SrcTy.getSizeInBits()));
TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);
MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);
@@ -7293,3 +7424,563 @@ LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
MI.eraseFromParent();
return Legalized;
}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+
+ // The source could be a scalar if the IR type was <1 x sN>.
+ if (SrcTy.isScalar()) {
+ if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
+ return UnableToLegalize; // FIXME: handle extension.
+ // This can be just a plain copy.
+ Observer.changingInstr(MI);
+ MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ return UnableToLegalize;
+}
+
+static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
+ // On Darwin, -Os means optimize for size without hurting performance, so
+ // only really optimize for size when -Oz (MinSize) is used.
+ if (MF.getTarget().getTargetTriple().isOSDarwin())
+ return MF.getFunction().hasMinSize();
+ return MF.getFunction().hasOptSize();
+}
+
+// Returns a list of types to use for memory op lowering in MemOps. A partial
+// port of findOptimalMemOpLowering in TargetLowering.
+static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
+ unsigned Limit, const MemOp &Op,
+ unsigned DstAS, unsigned SrcAS,
+ const AttributeList &FuncAttributes,
+ const TargetLowering &TLI) {
+ if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
+ return false;
+
+ LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
+
+ if (Ty == LLT()) {
+ // Use the largest scalar type whose alignment constraints are satisfied.
+ // We only need to check DstAlign here as SrcAlign is always greater than or
+ // equal to DstAlign (or zero).
+ Ty = LLT::scalar(64);
+ if (Op.isFixedDstAlign())
+ while (Op.getDstAlign() < Ty.getSizeInBytes() &&
+ !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
+ Ty = LLT::scalar(Ty.getSizeInBytes());
+ assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
+ // FIXME: check for the largest legal type we can load/store to.
+ }
+
+ unsigned NumMemOps = 0;
+ uint64_t Size = Op.size();
+ while (Size) {
+ unsigned TySize = Ty.getSizeInBytes();
+ while (TySize > Size) {
+ // For now, only use non-vector loads / stores for the left-over pieces.
+ LLT NewTy = Ty;
+ // FIXME: check for mem op safety and legality of the types. Not all of
+ // SDAGisms map cleanly to GISel concepts.
+ if (NewTy.isVector())
+ NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
+ NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
+ unsigned NewTySize = NewTy.getSizeInBytes();
+ assert(NewTySize > 0 && "Could not find appropriate type");
+
+ // If the new LLT cannot cover all of the remaining bits, then consider
+ // issuing a (or a pair of) unaligned and overlapping load / store.
+ bool Fast;
+ // Need to get a VT equivalent for allowsMisalignedMemoryAccesses().
+ MVT VT = getMVTForLLT(Ty);
+ if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
+ TLI.allowsMisalignedMemoryAccesses(
+ VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
+ MachineMemOperand::MONone, &Fast) &&
+ Fast)
+ TySize = Size;
+ else {
+ Ty = NewTy;
+ TySize = NewTySize;
+ }
+ }
+
+ if (++NumMemOps > Limit)
+ return false;
+
+ MemOps.push_back(Ty);
+ Size -= TySize;
+ }
+
+ return true;
+}
+
+static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
+ if (Ty.isVector())
+ return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
+ Ty.getNumElements());
+ return IntegerType::get(C, Ty.getSizeInBits());
+}
+
+// Get a vectorized representation of the memset value operand, GISel edition.
+static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ unsigned NumBits = Ty.getScalarSizeInBits();
+ auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
+ if (!Ty.isVector() && ValVRegAndVal) {
+ APInt Scalar = ValVRegAndVal->Value.truncOrSelf(8);
+ APInt SplatVal = APInt::getSplat(NumBits, Scalar);
+ return MIB.buildConstant(Ty, SplatVal).getReg(0);
+ }
+
+ // Extend the byte value to the larger type, and then multiply by a magic
+ // value 0x010101... in order to replicate it across every byte.
+ // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
+ if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
+ return MIB.buildConstant(Ty, 0).getReg(0);
+ }
+
+ LLT ExtType = Ty.getScalarType();
+ auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
+ if (NumBits > 8) {
+ APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
+ auto MagicMI = MIB.buildConstant(ExtType, Magic);
+ Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
+ }
+
+ // For vector types create a G_BUILD_VECTOR.
+ if (Ty.isVector())
+ Val = MIB.buildSplatVector(Ty, Val).getReg(0);
+
+ return Val;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
+ uint64_t KnownLen, Align Alignment,
+ bool IsVolatile) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+ auto &DL = MF.getDataLayout();
+ LLVMContext &C = MF.getFunction().getContext();
+
+ assert(KnownLen != 0 && "Have a zero length memset length!");
+
+ bool DstAlignCanChange = false;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool OptSize = shouldLowerMemFuncForSize(MF);
+
+ MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
+ if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
+ DstAlignCanChange = true;
+
+ unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
+ std::vector<LLT> MemOps;
+
+ const auto &DstMMO = **MI.memoperands_begin();
+ MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
+
+ auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
+ bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
+
+ if (!findGISelOptimalMemOpLowering(MemOps, Limit,
+ MemOp::Set(KnownLen, DstAlignCanChange,
+ Alignment,
+ /*IsZeroMemset=*/IsZeroVal,
+ /*IsVolatile=*/IsVolatile),
+ DstPtrInfo.getAddrSpace(), ~0u,
+ MF.getFunction().getAttributes(), TLI))
+ return UnableToLegalize;
+
+ if (DstAlignCanChange) {
+ // Get an estimate of the type from the LLT.
+ Type *IRTy = getTypeForLLT(MemOps[0], C);
+ Align NewAlign = DL.getABITypeAlign(IRTy);
+ if (NewAlign > Alignment) {
+ Alignment = NewAlign;
+ unsigned FI = FIDef->getOperand(1).getIndex();
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI.getObjectAlign(FI) < Alignment)
+ MFI.setObjectAlignment(FI, Alignment);
+ }
+ }
+
+ MachineIRBuilder MIB(MI);
+ // Find the largest store and generate the bit pattern for it.
+ LLT LargestTy = MemOps[0];
+ for (unsigned i = 1; i < MemOps.size(); i++)
+ if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
+ LargestTy = MemOps[i];
+
+ // The memset stored value is always defined as an s8, so in order to make it
+ // work with larger store types we need to repeat the bit pattern across the
+ // wider type.
+ Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
+
+ if (!MemSetValue)
+ return UnableToLegalize;
+
+ // Generate the stores. For each store type in the list, we generate the
+ // matching store of that type to the destination address.
+ LLT PtrTy = MRI.getType(Dst);
+ unsigned DstOff = 0;
+ unsigned Size = KnownLen;
+ for (unsigned I = 0; I < MemOps.size(); I++) {
+ LLT Ty = MemOps[I];
+ unsigned TySize = Ty.getSizeInBytes();
+ if (TySize > Size) {
+ // Issuing an unaligned load / store pair that overlaps with the previous
+ // pair. Adjust the offset accordingly.
+ assert(I == MemOps.size() - 1 && I != 0);
+ DstOff -= TySize - Size;
+ }
+
+ // If this store is smaller than the largest store, see whether we can get
+ // the smaller value for free with a truncate.
+ Register Value = MemSetValue;
+ if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
+ MVT VT = getMVTForLLT(Ty);
+ MVT LargestVT = getMVTForLLT(LargestTy);
+ if (!LargestTy.isVector() && !Ty.isVector() &&
+ TLI.isTruncateFree(LargestVT, VT))
+ Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
+ else
+ Value = getMemsetValue(Val, Ty, MIB);
+ if (!Value)
+ return UnableToLegalize;
+ }
+
+ auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);
+
+ Register Ptr = Dst;
+ if (DstOff != 0) {
+ auto Offset =
+ MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
+ Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
+ }
+
+ MIB.buildStore(Value, Ptr, *StoreMMO);
+ DstOff += Ty.getSizeInBytes();
+ Size -= TySize;
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ Register Len = MI.getOperand(2).getReg();
+
+ const auto *MMOIt = MI.memoperands_begin();
+ const MachineMemOperand *MemOp = *MMOIt;
+ bool IsVolatile = MemOp->isVolatile();
+
+ // See if this is a constant length copy
+ auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
+ // FIXME: support dynamically sized G_MEMCPY_INLINE
+ assert(LenVRegAndVal.hasValue() &&
+ "inline memcpy with dynamic size is not yet supported");
+ uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
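+ // A zero-length copy is a no-op; just delete the instruction.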
+ if (KnownLen == 0) {
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ const auto &DstMMO = **MI.memoperands_begin();
+ const auto &SrcMMO = **std::next(MI.memoperands_begin());
+ Align DstAlign = DstMMO.getBaseAlign();
+ Align SrcAlign = SrcMMO.getBaseAlign();
+
+ return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
+ IsVolatile);
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
+ uint64_t KnownLen, Align DstAlign,
+ Align SrcAlign, bool IsVolatile) {
+ assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
+ return lowerMemcpy(MI, Dst, Src, KnownLen,
+ std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
+ IsVolatile);
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
+ uint64_t KnownLen, uint64_t Limit, Align DstAlign,
+ Align SrcAlign, bool IsVolatile) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+ auto &DL = MF.getDataLayout();
+ LLVMContext &C = MF.getFunction().getContext();
+
+ assert(KnownLen != 0 && "Have a zero length memcpy length!");
+
+ bool DstAlignCanChange = false;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ Align Alignment = commonAlignment(DstAlign, SrcAlign);
+
+ MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
+ if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
+ DstAlignCanChange = true;
+
+ // FIXME: infer better src pointer alignment like SelectionDAG does here.
+ // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
+ // if the memcpy is in a tail call position.
+
+ std::vector<LLT> MemOps;
+
+ const auto &DstMMO = **MI.memoperands_begin();
+ const auto &SrcMMO = **std::next(MI.memoperands_begin());
+ MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
+ MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
+
+ if (!findGISelOptimalMemOpLowering(
+ MemOps, Limit,
+ MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
+ IsVolatile),
+ DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
+ MF.getFunction().getAttributes(), TLI))
+ return UnableToLegalize;
+
+ if (DstAlignCanChange) {
+ // Get an estimate of the type from the LLT.
+ Type *IRTy = getTypeForLLT(MemOps[0], C);
+ Align NewAlign = DL.getABITypeAlign(IRTy);
+
+ // Don't promote to an alignment that would require dynamic stack
+ // realignment.
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (!TRI->hasStackRealignment(MF))
+ while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
+ NewAlign = NewAlign / 2;
+
+ if (NewAlign > Alignment) {
+ Alignment = NewAlign;
+ unsigned FI = FIDef->getOperand(1).getIndex();
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI.getObjectAlign(FI) < Alignment)
+ MFI.setObjectAlignment(FI, Alignment);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
+
+ MachineIRBuilder MIB(MI);
+ // Now we need to emit a load/store pair for each of the types we've
+ // collected. I.e. for each type, generate a load from the source pointer of
+ // that type width, and then generate a corresponding store of the loaded
+ // value to the dest buffer. This can result in a sequence of loads and
+ // stores of mixed types, depending on what the target specifies as good
+ // types to use.
+ unsigned CurrOffset = 0;
+ LLT PtrTy = MRI.getType(Src);
+ unsigned Size = KnownLen;
+ for (auto CopyTy : MemOps) {
+ // Issuing an unaligned load / store pair that overlaps with the previous
+ // pair. Adjust the offset accordingly.
+ if (CopyTy.getSizeInBytes() > Size)
+ CurrOffset -= CopyTy.getSizeInBytes() - Size;
+
+ // Construct MMOs for the accesses.
+ auto *LoadMMO =
+ MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
+ auto *StoreMMO =
+ MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
+
+ // Create the load.
+ Register LoadPtr = Src;
+ Register Offset;
+ if (CurrOffset != 0) {
+ Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset)
+ .getReg(0);
+ LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
+ }
+ auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
+
+ // Create the store.
+ Register StorePtr =
+ CurrOffset == 0 ? Dst : MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
+ MIB.buildStore(LdVal, StorePtr, *StoreMMO);
+ CurrOffset += CopyTy.getSizeInBytes();
+ Size -= CopyTy.getSizeInBytes();
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
+ uint64_t KnownLen, Align DstAlign, Align SrcAlign,
+ bool IsVolatile) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+ auto &DL = MF.getDataLayout();
+ LLVMContext &C = MF.getFunction().getContext();
+
+ assert(KnownLen != 0 && "Have a zero length memmove length!");
+
+ bool DstAlignCanChange = false;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool OptSize = shouldLowerMemFuncForSize(MF);
+ Align Alignment = commonAlignment(DstAlign, SrcAlign);
+
+ MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
+ if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
+ DstAlignCanChange = true;
+
+ unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
+ std::vector<LLT> MemOps;
+
+ const auto &DstMMO = **MI.memoperands_begin();
+ const auto &SrcMMO = **std::next(MI.memoperands_begin());
+ MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
+ MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
+
+ // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
+ // to a bug in its findOptimalMemOpLowering implementation. For now, do the
+ // same thing here.
+ if (!findGISelOptimalMemOpLowering(
+ MemOps, Limit,
+ MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
+ /*IsVolatile*/ true),
+ DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
+ MF.getFunction().getAttributes(), TLI))
+ return UnableToLegalize;
+
+ if (DstAlignCanChange) {
+ // Get an estimate of the type from the LLT.
+ Type *IRTy = getTypeForLLT(MemOps[0], C);
+ Align NewAlign = DL.getABITypeAlign(IRTy);
+
+ // Don't promote to an alignment that would require dynamic stack
+ // realignment.
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (!TRI->hasStackRealignment(MF))
+ while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
+ NewAlign = NewAlign / 2;
+
+ if (NewAlign > Alignment) {
+ Alignment = NewAlign;
+ unsigned FI = FIDef->getOperand(1).getIndex();
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI.getObjectAlign(FI) < Alignment)
+ MFI.setObjectAlignment(FI, Alignment);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
+
+ MachineIRBuilder MIB(MI);
+ // Memmove requires that we perform all the loads before issuing the stores.
+ // Apart from that, this loop is pretty much doing the same thing as the
+ // memcpy codegen function.
+ unsigned CurrOffset = 0;
+ LLT PtrTy = MRI.getType(Src);
+ SmallVector<Register, 16> LoadVals;
+ for (auto CopyTy : MemOps) {
+ // Construct MMO for the load.
+ auto *LoadMMO =
+ MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
+
+ // Create the load.
+ Register LoadPtr = Src;
+ if (CurrOffset != 0) {
+ auto Offset =
+ MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
+ LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
+ }
+ LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
+ CurrOffset += CopyTy.getSizeInBytes();
+ }
+
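+ // Second pass: emit the stores only after every load has been issued, so
+ // that overlapping source/destination buffers are handled correctly.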
+ CurrOffset = 0;
+ for (unsigned I = 0; I < MemOps.size(); ++I) {
+ LLT CopyTy = MemOps[I];
+ // Now store the values loaded.
+ auto *StoreMMO =
+ MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
+
+ Register StorePtr = Dst;
+ if (CurrOffset != 0) {
+ auto Offset =
+ MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
+ StorePtr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
+ }
+ MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
+ CurrOffset += CopyTy.getSizeInBytes();
+ }
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
+ const unsigned Opc = MI.getOpcode();
+ // This combine is fairly complex so it's not written with a separate
+ // matcher function.
+ assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
+ Opc == TargetOpcode::G_MEMSET) &&
+ "Expected memcpy like instruction");
+
+ auto MMOIt = MI.memoperands_begin();
+ const MachineMemOperand *MemOp = *MMOIt;
+
+ Align DstAlign = MemOp->getBaseAlign();
+ Align SrcAlign;
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ Register Len = MI.getOperand(2).getReg();
+
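+ // G_MEMSET carries a single memory operand (the destination). Copies and
+ // moves also carry a source memory operand, from which we take the source
+ // alignment.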
+ if (Opc != TargetOpcode::G_MEMSET) {
+ assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
+ MemOp = *(++MMOIt);
+ SrcAlign = MemOp->getBaseAlign();
+ }
+
+ // See if this is a constant length copy
+ auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
+ if (!LenVRegAndVal)
+ return UnableToLegalize;
+ uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
+
+ if (KnownLen == 0) {
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ bool IsVolatile = MemOp->isVolatile();
+ if (Opc == TargetOpcode::G_MEMCPY_INLINE)
+ return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
+ IsVolatile);
+
+ // Don't try to optimize volatile.
+ if (IsVolatile)
+ return UnableToLegalize;
+
+ if (MaxLen && KnownLen > MaxLen)
+ return UnableToLegalize;
+
+ if (Opc == TargetOpcode::G_MEMCPY) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+ bool OptSize = shouldLowerMemFuncForSize(MF);
+ uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
+ return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
+ IsVolatile);
+ }
+ if (Opc == TargetOpcode::G_MEMMOVE)
+ return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
+ if (Opc == TargetOpcode::G_MEMSET)
+ return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
+ return UnableToLegalize;
+}
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index 3e3141657e87..30697913a6a4 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -352,8 +352,7 @@ LegalizerInfo::getAction(const MachineInstr &MI,
SmallVector<LegalityQuery::MemDesc, 2> MemDescrs;
for (const auto &MMO : MI.memoperands())
- MemDescrs.push_back({MMO->getMemoryType(), 8 * MMO->getAlign().value(),
- MMO->getSuccessOrdering()});
+ MemDescrs.push_back({*MMO});
return getAction({MI.getOpcode(), Types, MemDescrs});
}
diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
new file mode 100644
index 000000000000..03dda806cb1e
--- /dev/null
+++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
@@ -0,0 +1,669 @@
+//===- LoadStoreOpt.cpp ----------- Generic memory optimizations -*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the LoadStoreOpt optimization pass.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/LoadStoreOpt.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+
+#define DEBUG_TYPE "loadstore-opt"
+
+using namespace llvm;
+using namespace ore;
+using namespace MIPatternMatch;
+
+STATISTIC(NumStoresMerged, "Number of stores merged");
+
+const unsigned MaxStoreSizeToForm = 128;
+
+char LoadStoreOpt::ID = 0;
+INITIALIZE_PASS_BEGIN(LoadStoreOpt, DEBUG_TYPE, "Generic memory optimizations",
+ false, false)
+INITIALIZE_PASS_END(LoadStoreOpt, DEBUG_TYPE, "Generic memory optimizations",
+ false, false)
+
+LoadStoreOpt::LoadStoreOpt(std::function<bool(const MachineFunction &)> F)
+ : MachineFunctionPass(ID), DoNotRunPass(F) {}
+
+LoadStoreOpt::LoadStoreOpt()
+ : LoadStoreOpt([](const MachineFunction &) { return false; }) {}
+
+void LoadStoreOpt::init(MachineFunction &MF) {
+ this->MF = &MF;
+ MRI = &MF.getRegInfo();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ TLI = MF.getSubtarget().getTargetLowering();
+ LI = MF.getSubtarget().getLegalizerInfo();
+ Builder.setMF(MF);
+ IsPreLegalizer = !MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Legalized);
+ InstsToErase.clear();
+}
+
+void LoadStoreOpt::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AAResultsWrapperPass>();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+BaseIndexOffset GISelAddressing::getPointerInfo(Register Ptr,
+ MachineRegisterInfo &MRI) {
+ BaseIndexOffset Info;
+ Register PtrAddRHS;
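+ // Try to decompose the pointer into a G_PTR_ADD of a base register and an
+ // offset; otherwise treat the whole register as the base with no index.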
+ if (!mi_match(Ptr, MRI, m_GPtrAdd(m_Reg(Info.BaseReg), m_Reg(PtrAddRHS)))) {
+ Info.BaseReg = Ptr;
+ Info.IndexReg = Register();
+ Info.IsIndexSignExt = false;
+ return Info;
+ }
+
+ auto RHSCst = getIConstantVRegValWithLookThrough(PtrAddRHS, MRI);
+ if (RHSCst)
+ Info.Offset = RHSCst->Value.getSExtValue();
+
+ // Just recognize a simple case for now. In the future we'll need to match
+ // indexing patterns for base + index + constant.
+ Info.IndexReg = PtrAddRHS;
+ Info.IsIndexSignExt = false;
+ return Info;
+}
+
+bool GISelAddressing::aliasIsKnownForLoadStore(const MachineInstr &MI1,
+ const MachineInstr &MI2,
+ bool &IsAlias,
+ MachineRegisterInfo &MRI) {
+ auto *LdSt1 = dyn_cast<GLoadStore>(&MI1);
+ auto *LdSt2 = dyn_cast<GLoadStore>(&MI2);
+ if (!LdSt1 || !LdSt2)
+ return false;
+
+ BaseIndexOffset BasePtr0 = getPointerInfo(LdSt1->getPointerReg(), MRI);
+ BaseIndexOffset BasePtr1 = getPointerInfo(LdSt2->getPointerReg(), MRI);
+
+ if (!BasePtr0.BaseReg.isValid() || !BasePtr1.BaseReg.isValid())
+ return false;
+
+ int64_t Size1 = LdSt1->getMemSize();
+ int64_t Size2 = LdSt2->getMemSize();
+
+ int64_t PtrDiff;
+ if (BasePtr0.BaseReg == BasePtr1.BaseReg) {
+ PtrDiff = BasePtr1.Offset - BasePtr0.Offset;
+ // If the size of the memory access is unknown, do not use it in the
+ // analysis. One example of an unknown-size memory access is a load/store
+ // of a scalable vector object on the stack.
+ // BasePtr1 is PtrDiff away from BasePtr0. They alias if none of the
+ // following situations arise:
+ if (PtrDiff >= 0 &&
+ Size1 != static_cast<int64_t>(MemoryLocation::UnknownSize)) {
+ // [----BasePtr0----]
+ //                         [---BasePtr1--]
+ // ========PtrDiff========>
+ IsAlias = !(Size1 <= PtrDiff);
+ return true;
+ }
+ if (PtrDiff < 0 &&
+ Size2 != static_cast<int64_t>(MemoryLocation::UnknownSize)) {
+ //                     [----BasePtr0----]
+ // [---BasePtr1--]
+ // =====(-PtrDiff)====>
+ IsAlias = !((PtrDiff + Size2) <= 0);
+ return true;
+ }
+ return false;
+ }
+
+ // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be
+ // able to calculate their relative offset if at least one arises
+ // from an alloca. However, these allocas cannot overlap and we
+ // can infer there is no alias.
+ auto *Base0Def = getDefIgnoringCopies(BasePtr0.BaseReg, MRI);
+ auto *Base1Def = getDefIgnoringCopies(BasePtr1.BaseReg, MRI);
+ if (!Base0Def || !Base1Def)
+ return false; // Couldn't tell anything.
+
+ if (Base0Def->getOpcode() != Base1Def->getOpcode())
+ return false;
+
+ if (Base0Def->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
+ MachineFrameInfo &MFI = Base0Def->getMF()->getFrameInfo();
+ // If the bases are different frame index defs and at least one is not a
+ // fixed object (i.e. it comes from an alloca), the stack objects cannot
+ // overlap, so report no alias.
+ if (Base0Def != Base1Def &&
+ (!MFI.isFixedObjectIndex(Base0Def->getOperand(1).getIndex()) ||
+ !MFI.isFixedObjectIndex(Base1Def->getOperand(1).getIndex()))) {
+ IsAlias = false;
+ return true;
+ }
+ }
+
+ // This implementation is a lot more primitive than the SDAG one for now.
+ // FIXME: what about constant pools?
+ if (Base0Def->getOpcode() == TargetOpcode::G_GLOBAL_VALUE) {
+ auto GV0 = Base0Def->getOperand(1).getGlobal();
+ auto GV1 = Base1Def->getOperand(1).getGlobal();
+ if (GV0 != GV1) {
+ IsAlias = false;
+ return true;
+ }
+ }
+
+ // Can't tell anything about aliasing.
+ return false;
+}
+
+bool GISelAddressing::instMayAlias(const MachineInstr &MI,
+ const MachineInstr &Other,
+ MachineRegisterInfo &MRI,
+ AliasAnalysis *AA) {
+ struct MemUseCharacteristics {
+ bool IsVolatile;
+ bool IsAtomic;
+ Register BasePtr;
+ int64_t Offset;
+ uint64_t NumBytes;
+ MachineMemOperand *MMO;
+ };
+
+ auto getCharacteristics =
+ [&](const MachineInstr *MI) -> MemUseCharacteristics {
+ if (const auto *LS = dyn_cast<GLoadStore>(MI)) {
+ Register BaseReg;
+ int64_t Offset = 0;
+ // No pre/post-inc addressing modes are considered here, unlike in SDAG.
+ if (!mi_match(LS->getPointerReg(), MRI,
+ m_GPtrAdd(m_Reg(BaseReg), m_ICst(Offset)))) {
+ BaseReg = LS->getPointerReg();
+ Offset = 0;
+ }
+
+ uint64_t Size = MemoryLocation::getSizeOrUnknown(
+ LS->getMMO().getMemoryType().getSizeInBytes());
+ return {LS->isVolatile(), LS->isAtomic(), BaseReg,
+ Offset /*base offset*/, Size, &LS->getMMO()};
+ }
+ // FIXME: support recognizing lifetime instructions.
+ // Default.
+ return {false /*isvolatile*/,
+ /*isAtomic*/ false, Register(),
+ (int64_t)0 /*offset*/, 0 /*size*/,
+ (MachineMemOperand *)nullptr};
+ };
+ MemUseCharacteristics MUC0 = getCharacteristics(&MI),
+ MUC1 = getCharacteristics(&Other);
+
+ // If they are to the same address, then they must be aliases.
+ if (MUC0.BasePtr.isValid() && MUC0.BasePtr == MUC1.BasePtr &&
+ MUC0.Offset == MUC1.Offset)
+ return true;
+
+ // If they are both volatile then they cannot be reordered.
+ if (MUC0.IsVolatile && MUC1.IsVolatile)
+ return true;
+
+ // Be conservative about atomics for the moment
+ // TODO: This is way overconservative for unordered atomics (see D66309)
+ if (MUC0.IsAtomic && MUC1.IsAtomic)
+ return true;
+
+ // If one operation reads from invariant memory, and the other may store, they
+ // cannot alias.
+ if (MUC0.MMO && MUC1.MMO) {
+ if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
+ (MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
+ return false;
+ }
+
+ // Try to prove that there is aliasing, or that there is no aliasing. Either
+ // way, we can return now. If nothing can be proved, proceed with more tests.
+ bool IsAlias;
+ if (GISelAddressing::aliasIsKnownForLoadStore(MI, Other, IsAlias, MRI))
+ return IsAlias;
+
+ // The following all rely on MMO0 and MMO1 being valid.
+ if (!MUC0.MMO || !MUC1.MMO)
+ return true;
+
+ // FIXME: port the alignment based alias analysis from SDAG's isAlias().
+ int64_t SrcValOffset0 = MUC0.MMO->getOffset();
+ int64_t SrcValOffset1 = MUC1.MMO->getOffset();
+ uint64_t Size0 = MUC0.NumBytes;
+ uint64_t Size1 = MUC1.NumBytes;
+ if (AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() &&
+ Size0 != MemoryLocation::UnknownSize &&
+ Size1 != MemoryLocation::UnknownSize) {
+ // Use alias analysis information.
+ int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
+ int64_t Overlap0 = Size0 + SrcValOffset0 - MinOffset;
+ int64_t Overlap1 = Size1 + SrcValOffset1 - MinOffset;
+ if (AA->isNoAlias(MemoryLocation(MUC0.MMO->getValue(), Overlap0,
+ MUC0.MMO->getAAInfo()),
+ MemoryLocation(MUC1.MMO->getValue(), Overlap1,
+ MUC1.MMO->getAAInfo())))
+ return false;
+ }
+
+ // Otherwise we have to assume they alias.
+ return true;
+}
+
+/// Returns true if the instruction creates an unavoidable hazard that
+/// forces a boundary between store merge candidates.
+static bool isInstHardMergeHazard(MachineInstr &MI) {
+ return MI.hasUnmodeledSideEffects() || MI.hasOrderedMemoryRef();
+}
+
+bool LoadStoreOpt::mergeStores(SmallVectorImpl<GStore *> &StoresToMerge) {
+ // Try to merge all the stores in the vector, splitting into separate segments
+ // as necessary.
+ assert(StoresToMerge.size() > 1 && "Expected multiple stores to merge");
+ LLT OrigTy = MRI->getType(StoresToMerge[0]->getValueReg());
+ LLT PtrTy = MRI->getType(StoresToMerge[0]->getPointerReg());
+ unsigned AS = PtrTy.getAddressSpace();
+ // Ensure the legal store info is computed for this address space.
+ initializeStoreMergeTargetInfo(AS);
+ const auto &LegalSizes = LegalStoreSizes[AS];
+
+#ifndef NDEBUG
+ for (auto StoreMI : StoresToMerge)
+ assert(MRI->getType(StoreMI->getValueReg()) == OrigTy);
+#endif
+
+ const auto &DL = MF->getFunction().getParent()->getDataLayout();
+ bool AnyMerged = false;
+ do {
+ unsigned NumPow2 = PowerOf2Floor(StoresToMerge.size());
+ unsigned MaxSizeBits = NumPow2 * OrigTy.getSizeInBits().getFixedSize();
+ // Compute the biggest store we can generate to handle the number of stores.
+ unsigned MergeSizeBits;
+ for (MergeSizeBits = MaxSizeBits; MergeSizeBits > 1; MergeSizeBits /= 2) {
+ LLT StoreTy = LLT::scalar(MergeSizeBits);
+ EVT StoreEVT =
+ getApproximateEVTForLLT(StoreTy, DL, MF->getFunction().getContext());
+ if (LegalSizes.size() > MergeSizeBits && LegalSizes[MergeSizeBits] &&
+ TLI->canMergeStoresTo(AS, StoreEVT, *MF) &&
+ (TLI->isTypeLegal(StoreEVT)))
+ break; // We can generate a MergeSize bits store.
+ }
+ if (MergeSizeBits <= OrigTy.getSizeInBits())
+ return AnyMerged; // No greater merge.
+
+ unsigned NumStoresToMerge = MergeSizeBits / OrigTy.getSizeInBits();
+ // Perform the actual merging.
+ SmallVector<GStore *, 8> SingleMergeStores(
+ StoresToMerge.begin(), StoresToMerge.begin() + NumStoresToMerge);
+ AnyMerged |= doSingleStoreMerge(SingleMergeStores);
+ StoresToMerge.erase(StoresToMerge.begin(),
+ StoresToMerge.begin() + NumStoresToMerge);
+ } while (StoresToMerge.size() > 1);
+ return AnyMerged;
+}
+
+bool LoadStoreOpt::isLegalOrBeforeLegalizer(const LegalityQuery &Query,
+ MachineFunction &MF) const {
+ auto Action = LI->getAction(Query).Action;
+ // If the instruction is unsupported, it can't be legalized at all.
+ if (Action == LegalizeActions::Unsupported)
+ return false;
+ return IsPreLegalizer || Action == LegalizeAction::Legal;
+}
+
+bool LoadStoreOpt::doSingleStoreMerge(SmallVectorImpl<GStore *> &Stores) {
+ assert(Stores.size() > 1);
+ // We know that all the stores are consecutive and there are no aliasing
+ // operations in the range. However, the values that are being stored may be
+ // generated anywhere before each store. To ensure we have the values
+ // available, we materialize the wide value and new store at the place of the
+ // final store in the merge sequence.
+ GStore *FirstStore = Stores[0];
+ const unsigned NumStores = Stores.size();
+ LLT SmallTy = MRI->getType(FirstStore->getValueReg());
+ LLT WideValueTy =
+ LLT::scalar(NumStores * SmallTy.getSizeInBits().getFixedSize());
+
+ // For each store, compute pairwise merged debug locs.
+ DebugLoc MergedLoc;
+ for (unsigned AIdx = 0, BIdx = 1; BIdx < NumStores; ++AIdx, ++BIdx)
+ MergedLoc = DILocation::getMergedLocation(Stores[AIdx]->getDebugLoc(),
+ Stores[BIdx]->getDebugLoc());
+ Builder.setInstr(*Stores.back());
+ Builder.setDebugLoc(MergedLoc);
+
+ // If all of the store values are constants, then create a wide constant
+ // directly. Otherwise, we need to generate some instructions to merge the
+ // existing values together into a wider type.
+ SmallVector<APInt, 8> ConstantVals;
+ for (auto Store : Stores) {
+ auto MaybeCst =
+ getIConstantVRegValWithLookThrough(Store->getValueReg(), *MRI);
+ if (!MaybeCst) {
+ ConstantVals.clear();
+ break;
+ }
+ ConstantVals.emplace_back(MaybeCst->Value);
+ }
+
+ Register WideReg;
+ auto *WideMMO =
+ MF->getMachineMemOperand(&FirstStore->getMMO(), 0, WideValueTy);
+ if (ConstantVals.empty()) {
+ // Mimic the SDAG behaviour here and don't try to do anything for unknown
+ // values. In the future, we should also support the cases of loads and
+ // extracted vector elements.
+ return false;
+ }
+
+ assert(ConstantVals.size() == NumStores);
+ // Check if our wide constant is legal.
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {WideValueTy}}, *MF))
+ return false;
+ APInt WideConst(WideValueTy.getSizeInBits(), 0);
+ for (unsigned Idx = 0; Idx < ConstantVals.size(); ++Idx) {
+ // Insert the smaller constant into the corresponding position in the
+ // wider one.
+ WideConst.insertBits(ConstantVals[Idx], Idx * SmallTy.getSizeInBits());
+ }
+ WideReg = Builder.buildConstant(WideValueTy, WideConst).getReg(0);
+ auto NewStore =
+ Builder.buildStore(WideReg, FirstStore->getPointerReg(), *WideMMO);
+ (void) NewStore;
+ LLVM_DEBUG(dbgs() << "Created merged store: " << *NewStore);
+ NumStoresMerged += Stores.size();
+
+ MachineOptimizationRemarkEmitter MORE(*MF, nullptr);
+ MORE.emit([&]() {
+ MachineOptimizationRemark R(DEBUG_TYPE, "MergedStore",
+ FirstStore->getDebugLoc(),
+ FirstStore->getParent());
+ R << "Merged " << NV("NumMerged", Stores.size()) << " stores of "
+ << NV("OrigWidth", SmallTy.getSizeInBytes())
+ << " bytes into a single store of "
+ << NV("NewWidth", WideValueTy.getSizeInBytes()) << " bytes";
+ return R;
+ });
+
+ for (auto MI : Stores)
+ InstsToErase.insert(MI);
+ return true;
+}
+
+bool LoadStoreOpt::processMergeCandidate(StoreMergeCandidate &C) {
+ if (C.Stores.size() < 2) {
+ C.reset();
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "Checking store merge candidate with " << C.Stores.size()
+ << " stores, starting with " << *C.Stores[0]);
+ // We know that the stores in the candidate are adjacent.
+ // Now we need to check if any potential aliasing instructions recorded
+ // during the search alias with load/stores added to the candidate after.
+ // For example, if we have the candidate:
+ // C.Stores = [ST1, ST2, ST3, ST4]
+ // and after seeing ST2 we saw a load LD1, which did not alias with ST1 or
+ // ST2, then we would have recorded it into the PotentialAliases structure
+ // with the associated index value of "1". Then we see ST3 and ST4 and add
+ // them to the candidate group. We know that LD1 does not alias with ST1 or
+ // ST2, since we already did that check. However we don't yet know if it
+ // may alias ST3 and ST4, so we perform those checks now.
+ SmallVector<GStore *> StoresToMerge;
+
+ auto DoesStoreAliasWithPotential = [&](unsigned Idx, GStore &CheckStore) {
+ for (auto AliasInfo : reverse(C.PotentialAliases)) {
+ MachineInstr *PotentialAliasOp = AliasInfo.first;
+ unsigned PreCheckedIdx = AliasInfo.second;
+ if (static_cast<unsigned>(Idx) > PreCheckedIdx) {
+ // Need to check this alias.
+ if (GISelAddressing::instMayAlias(CheckStore, *PotentialAliasOp, *MRI,
+ AA)) {
+ LLVM_DEBUG(dbgs() << "Potential alias " << *PotentialAliasOp
+ << " detected\n");
+ return true;
+ }
+ } else {
+ // Once our store index is lower than the index associated with the
+ // potential alias, we know that we've already checked for this alias
+ // and all of the earlier potential aliases too.
+ return false;
+ }
+ }
+ return false;
+ };
+ // Start from the last store in the group, and check if it aliases with any
+ // of the potential aliasing operations in the list.
+ for (int StoreIdx = C.Stores.size() - 1; StoreIdx >= 0; --StoreIdx) {
+ auto *CheckStore = C.Stores[StoreIdx];
+ if (DoesStoreAliasWithPotential(StoreIdx, *CheckStore))
+ continue;
+ StoresToMerge.emplace_back(CheckStore);
+ }
+
+ LLVM_DEBUG(dbgs() << StoresToMerge.size()
+ << " stores remaining after alias checks. Merging...\n");
+
+ // Now we've checked for aliasing hazards, merge any stores left.
+ C.reset();
+ if (StoresToMerge.size() < 2)
+ return false;
+ return mergeStores(StoresToMerge);
+}
+
+bool LoadStoreOpt::operationAliasesWithCandidate(MachineInstr &MI,
+ StoreMergeCandidate &C) {
+ if (C.Stores.empty())
+ return false;
+ return llvm::any_of(C.Stores, [&](MachineInstr *OtherMI) {
+ return instMayAlias(MI, *OtherMI, *MRI, AA);
+ });
+}
+
+void LoadStoreOpt::StoreMergeCandidate::addPotentialAlias(MachineInstr &MI) {
+ PotentialAliases.emplace_back(std::make_pair(&MI, Stores.size() - 1));
+}
+
+bool LoadStoreOpt::addStoreToCandidate(GStore &StoreMI,
+ StoreMergeCandidate &C) {
+ // Check if the given store writes to an adjacent address, and other
+ // requirements.
+ LLT ValueTy = MRI->getType(StoreMI.getValueReg());
+ LLT PtrTy = MRI->getType(StoreMI.getPointerReg());
+
+ // Only handle scalars.
+ if (!ValueTy.isScalar())
+ return false;
+
+ // Don't allow truncating stores for now.
+ if (StoreMI.getMemSizeInBits() != ValueTy.getSizeInBits())
+ return false;
+
+ Register StoreAddr = StoreMI.getPointerReg();
+ auto BIO = getPointerInfo(StoreAddr, *MRI);
+ Register StoreBase = BIO.BaseReg;
+ uint64_t StoreOffCst = BIO.Offset;
+ if (C.Stores.empty()) {
+ // This is the first store of the candidate.
+ // If the offset can't possibly allow for a lower addressed store with the
+ // same base, don't bother adding it.
+ if (StoreOffCst < ValueTy.getSizeInBytes())
+ return false;
+ C.BasePtr = StoreBase;
+ C.CurrentLowestOffset = StoreOffCst;
+ C.Stores.emplace_back(&StoreMI);
+ LLVM_DEBUG(dbgs() << "Starting a new merge candidate group with: "
+ << StoreMI);
+ return true;
+ }
+
+ // Check the store is the same size as the existing ones in the candidate.
+ if (MRI->getType(C.Stores[0]->getValueReg()).getSizeInBits() !=
+ ValueTy.getSizeInBits())
+ return false;
+
+ if (MRI->getType(C.Stores[0]->getPointerReg()).getAddressSpace() !=
+ PtrTy.getAddressSpace())
+ return false;
+
+ // There are other stores in the candidate. Check that this store writes to
+ // the next lowest adjacent address.
+ if (C.BasePtr != StoreBase)
+ return false;
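+ // The block is walked bottom-up, so each newly added store must sit exactly
+ // one value-width below the candidate's current lowest offset.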
+ if ((C.CurrentLowestOffset - ValueTy.getSizeInBytes()) !=
+ static_cast<uint64_t>(StoreOffCst))
+ return false;
+
+ // This writes to an adjacent address. Allow it.
+ C.Stores.emplace_back(&StoreMI);
+ C.CurrentLowestOffset = C.CurrentLowestOffset - ValueTy.getSizeInBytes();
+ LLVM_DEBUG(dbgs() << "Candidate added store: " << StoreMI);
+ return true;
+}
+
+bool LoadStoreOpt::mergeBlockStores(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ // Walk through the block bottom-up, looking for merging candidates.
+ StoreMergeCandidate Candidate;
+ for (auto II = MBB.rbegin(), IE = MBB.rend(); II != IE; ++II) {
+ MachineInstr &MI = *II;
+ if (InstsToErase.contains(&MI))
+ continue;
+
+ if (auto StoreMI = dyn_cast<GStore>(&*II)) {
+ // We have a G_STORE. Add it to the candidate if it writes to an adjacent
+ // address.
+ if (!addStoreToCandidate(*StoreMI, Candidate)) {
+ // Store wasn't eligible to be added. May need to record it as a
+ // potential alias.
+ if (operationAliasesWithCandidate(*StoreMI, Candidate)) {
+ Changed |= processMergeCandidate(Candidate);
+ continue;
+ }
+ Candidate.addPotentialAlias(*StoreMI);
+ }
+ continue;
+ }
+
+ // If we don't have any stores yet, this instruction can't pose a problem.
+ if (Candidate.Stores.empty())
+ continue;
+
+ // We're dealing with some other kind of instruction.
+ if (isInstHardMergeHazard(MI)) {
+ Changed |= processMergeCandidate(Candidate);
+ Candidate.Stores.clear();
+ continue;
+ }
+
+ if (!MI.mayLoadOrStore())
+ continue;
+
+ if (operationAliasesWithCandidate(MI, Candidate)) {
+ // We have a potential alias, so process the current candidate if we can
+ // and then continue looking for a new candidate.
+ Changed |= processMergeCandidate(Candidate);
+ continue;
+ }
+
+ // Record this instruction as a potential alias for future stores that are
+ // added to the candidate.
+ Candidate.addPotentialAlias(MI);
+ }
+
+ // Process any candidate left after finishing searching the entire block.
+ Changed |= processMergeCandidate(Candidate);
+
+ // Erase instructions now that we're no longer iterating over the block.
+ for (auto *MI : InstsToErase)
+ MI->eraseFromParent();
+ InstsToErase.clear();
+ return Changed;
+}
+
+bool LoadStoreOpt::mergeFunctionStores(MachineFunction &MF) {
+ bool Changed = false;
+ for (auto &BB : MF) {
+ Changed |= mergeBlockStores(BB);
+ }
+ return Changed;
+}
+
+void LoadStoreOpt::initializeStoreMergeTargetInfo(unsigned AddrSpace) {
+ // Query the legalizer info to record what store types are legal.
+ // We record this because we don't want to bother trying to merge stores
+ // into illegal sizes, which would just get split again.
+
+ if (LegalStoreSizes.count(AddrSpace)) {
+ assert(LegalStoreSizes[AddrSpace].any());
+ return; // Already cached sizes for this address space.
+ }
+
+ // Need to reserve at least MaxStoreSizeToForm + 1 bits.
+ BitVector LegalSizes(MaxStoreSizeToForm * 2);
+ const auto &LI = *MF->getSubtarget().getLegalizerInfo();
+ const auto &DL = MF->getFunction().getParent()->getDataLayout();
+ Type *IntPtrIRTy =
+ DL.getIntPtrType(MF->getFunction().getContext(), AddrSpace);
+ LLT PtrTy = getLLTForType(*IntPtrIRTy->getPointerTo(AddrSpace), DL);
+ // We assume that we're not going to be generating any stores wider than
+ // MaxStoreSizeToForm bits for now.
+ for (unsigned Size = 2; Size <= MaxStoreSizeToForm; Size *= 2) {
+ LLT Ty = LLT::scalar(Size);
+ SmallVector<LegalityQuery::MemDesc, 2> MemDescrs(
+ {{Ty, Ty.getSizeInBits(), AtomicOrdering::NotAtomic}});
+ SmallVector<LLT> StoreTys({Ty, PtrTy});
+ LegalityQuery Q(TargetOpcode::G_STORE, StoreTys, MemDescrs);
+ LegalizeActionStep ActionStep = LI.getAction(Q);
+ if (ActionStep.Action == LegalizeActions::Legal)
+ LegalSizes.set(Size);
+ }
+ assert(LegalSizes.any() && "Expected some store sizes to be legal!");
+ LegalStoreSizes[AddrSpace] = LegalSizes;
+}
+
+bool LoadStoreOpt::runOnMachineFunction(MachineFunction &MF) {
+ // If the ISel pipeline failed, do not bother running this pass.
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Begin memory optimizations for: " << MF.getName()
+ << '\n');
+
+ init(MF);
+ bool Changed = false;
+ Changed |= mergeFunctionStores(MF);
+
+ LegalStoreSizes.clear();
+ return Changed;
+}
diff --git a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
index d45fdae43f01..a1acc4195840 100644
--- a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
@@ -92,9 +92,8 @@ bool Localizer::localizeInterBlock(MachineFunction &MF,
// Check if all the users of MI are local.
// We are going to invalidate the list of use operands, so we
// can't use range iterator.
- for (auto MOIt = MRI->use_begin(Reg), MOItEnd = MRI->use_end();
- MOIt != MOItEnd;) {
- MachineOperand &MOUse = *MOIt++;
+ for (MachineOperand &MOUse :
+ llvm::make_early_inc_range(MRI->use_operands(Reg))) {
// Check if the use is already local.
MachineBasicBlock *InsertMBB;
LLVM_DEBUG(MachineInstr &MIUse = *MOUse.getParent();
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 54ac62793b08..fb5ed35c1f72 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -673,7 +673,8 @@ MachineInstrBuilder MachineIRBuilder::buildShuffleVector(const DstOp &Res,
LLT DstTy = Res.getLLTTy(*getMRI());
LLT Src1Ty = Src1.getLLTTy(*getMRI());
LLT Src2Ty = Src2.getLLTTy(*getMRI());
- assert(Src1Ty.getNumElements() + Src2Ty.getNumElements() >= Mask.size());
+ assert((size_t)(Src1Ty.getNumElements() + Src2Ty.getNumElements()) >=
+ Mask.size());
assert(DstTy.getElementType() == Src1Ty.getElementType() &&
DstTy.getElementType() == Src2Ty.getElementType());
(void)DstTy;
diff --git a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
index 644a81d8021e..937d94764be1 100644
--- a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -699,11 +699,11 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) {
// Set a sensible insertion point so that subsequent calls to
// MIRBuilder insert into this basic block.
MIRBuilder.setMBB(*MBB);
- for (MachineBasicBlock::iterator MII = MBB->begin(), End = MBB->end();
- MII != End;) {
- // MI might be invalidated by the assignment, so move the
- // iterator before hand.
- MachineInstr &MI = *MII++;
+ SmallVector<MachineInstr *> WorkList(
+ make_pointer_range(reverse(MBB->instrs())));
+
+ while (!WorkList.empty()) {
+ MachineInstr &MI = *WorkList.pop_back_val();
// Ignore target-specific post-isel instructions: they should use proper
// regclasses.
@@ -728,18 +728,6 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) {
"unable to map instruction", MI);
return false;
}
-
- // It's possible the mapping changed control flow, and moved the following
- // instruction to a new block, so figure out the new parent.
- if (MII != End) {
- MachineBasicBlock *NextInstBB = MII->getParent();
- if (NextInstBB != MBB) {
- LLVM_DEBUG(dbgs() << "Instruction mapping changed control flow\n");
- MBB = NextInstBB;
- MIRBuilder.setMBB(*MBB);
- End = MBB->end();
- }
- }
}
}
diff --git a/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
index e2a963747101..1a2102e3ef21 100644
--- a/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
@@ -570,7 +570,7 @@ bool RegisterBankInfo::ValueMapping::verify(unsigned MeaningfulBitWidth) const {
assert((ValueMask & PartMapMask) == PartMapMask &&
"Some partial mappings overlap");
}
- assert(ValueMask.isAllOnesValue() && "Value is not fully mapped");
+ assert(ValueMask.isAllOnes() && "Value is not fully mapped");
return true;
}
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index f64e41b9dccc..1a440c064a59 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -15,7 +15,9 @@
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -60,6 +62,8 @@ Register llvm::constrainOperandRegClass(
if (ConstrainedReg != Reg) {
MachineBasicBlock::iterator InsertIt(&InsertPt);
MachineBasicBlock &MBB = *InsertPt.getParent();
+ // FIXME: The copy needs to have the classes constrained for its operands.
+ // Use operand's regbank to get the class for old register (Reg).
if (RegMO.isUse()) {
BuildMI(MBB, InsertIt, InsertPt.getDebugLoc(),
TII.get(TargetOpcode::COPY), ConstrainedReg)
@@ -99,19 +103,25 @@ Register llvm::constrainOperandRegClass(
// Assume physical registers are properly constrained.
assert(Register::isVirtualRegister(Reg) && "PhysReg not implemented");
- const TargetRegisterClass *RegClass = TII.getRegClass(II, OpIdx, &TRI, MF);
+ const TargetRegisterClass *OpRC = TII.getRegClass(II, OpIdx, &TRI, MF);
// Some of the target independent instructions, like COPY, may not impose any
// register class constraints on some of their operands: If it's a use, we can
// skip constraining as the instruction defining the register would constrain
// it.
- // We can't constrain unallocatable register classes, because we can't create
- // virtual registers for these classes, so we need to let targets handled this
- // case.
- if (RegClass && !RegClass->isAllocatable())
- RegClass = TRI.getConstrainedRegClassForOperand(RegMO, MRI);
+ if (OpRC) {
+ // Obtain the RC from the incoming regbank if it is a proper sub-class.
+ // Operands of a superclass that combines different register types (e.g.,
+ // AMDGPU's VGPR and AGPR) can map to multiple regbanks. The regbank
+ // ambiguity resolved by targets during regbankselect should not be
+ // overridden.
+ if (const auto *SubRC = TRI.getCommonSubClass(
+ OpRC, TRI.getConstrainedRegClassForOperand(RegMO, MRI)))
+ OpRC = SubRC;
- if (!RegClass) {
+ OpRC = TRI.getAllocatableClass(OpRC);
+ }
+
+ if (!OpRC) {
assert((!isTargetSpecificOpcode(II.getOpcode()) || RegMO.isUse()) &&
"Register class constraint is required unless either the "
"instruction is target independent or the operand is a use");
@@ -127,7 +137,7 @@ Register llvm::constrainOperandRegClass(
// and they never reach this function.
return Reg;
}
- return constrainOperandRegClass(MF, TRI, MRI, TII, RBI, InsertPt, *RegClass,
+ return constrainOperandRegClass(MF, TRI, MRI, TII, RBI, InsertPt, *OpRC,
RegMO);
}
@@ -236,7 +246,7 @@ static void reportGISelDiagnostic(DiagnosticSeverity Severity,
R << (" (in function: " + MF.getName() + ")").str();
if (IsFatal)
- report_fatal_error(R.getMsg());
+ report_fatal_error(Twine(R.getMsg()));
else
MORE.emit(R);
}
@@ -267,10 +277,10 @@ void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC,
reportGISelFailure(MF, TPC, MORE, R);
}
-Optional<APInt> llvm::getConstantVRegVal(Register VReg,
- const MachineRegisterInfo &MRI) {
- Optional<ValueAndVReg> ValAndVReg =
- getConstantVRegValWithLookThrough(VReg, MRI, /*LookThroughInstrs*/ false);
+Optional<APInt> llvm::getIConstantVRegVal(Register VReg,
+ const MachineRegisterInfo &MRI) {
+ Optional<ValueAndVReg> ValAndVReg = getIConstantVRegValWithLookThrough(
+ VReg, MRI, /*LookThroughInstrs*/ false);
assert((!ValAndVReg || ValAndVReg->VReg == VReg) &&
"Value found while looking through instrs");
if (!ValAndVReg)
@@ -278,41 +288,27 @@ Optional<APInt> llvm::getConstantVRegVal(Register VReg,
return ValAndVReg->Value;
}
-Optional<int64_t> llvm::getConstantVRegSExtVal(Register VReg,
- const MachineRegisterInfo &MRI) {
- Optional<APInt> Val = getConstantVRegVal(VReg, MRI);
+Optional<int64_t>
+llvm::getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI) {
+ Optional<APInt> Val = getIConstantVRegVal(VReg, MRI);
if (Val && Val->getBitWidth() <= 64)
return Val->getSExtValue();
return None;
}
-Optional<ValueAndVReg> llvm::getConstantVRegValWithLookThrough(
- Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs,
- bool HandleFConstant, bool LookThroughAnyExt) {
+namespace {
+
+typedef std::function<bool(const MachineInstr *)> IsOpcodeFn;
+typedef std::function<Optional<APInt>(const MachineInstr *MI)> GetAPCstFn;
+
+Optional<ValueAndVReg> getConstantVRegValWithLookThrough(
+ Register VReg, const MachineRegisterInfo &MRI, IsOpcodeFn IsConstantOpcode,
+ GetAPCstFn getAPCstValue, bool LookThroughInstrs = true,
+ bool LookThroughAnyExt = false) {
SmallVector<std::pair<unsigned, unsigned>, 4> SeenOpcodes;
MachineInstr *MI;
- auto IsConstantOpcode = [HandleFConstant](unsigned Opcode) {
- return Opcode == TargetOpcode::G_CONSTANT ||
- (HandleFConstant && Opcode == TargetOpcode::G_FCONSTANT);
- };
- auto GetImmediateValue = [HandleFConstant,
- &MRI](const MachineInstr &MI) -> Optional<APInt> {
- const MachineOperand &CstVal = MI.getOperand(1);
- if (!CstVal.isImm() && !CstVal.isCImm() &&
- (!HandleFConstant || !CstVal.isFPImm()))
- return None;
- if (!CstVal.isFPImm()) {
- unsigned BitWidth =
- MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- APInt Val = CstVal.isImm() ? APInt(BitWidth, CstVal.getImm())
- : CstVal.getCImm()->getValue();
- assert(Val.getBitWidth() == BitWidth &&
- "Value bitwidth doesn't match definition type");
- return Val;
- }
- return CstVal.getFPImm()->getValueAPF().bitcastToAPInt();
- };
- while ((MI = MRI.getVRegDef(VReg)) && !IsConstantOpcode(MI->getOpcode()) &&
+
+ while ((MI = MRI.getVRegDef(VReg)) && !IsConstantOpcode(MI) &&
LookThroughInstrs) {
switch (MI->getOpcode()) {
case TargetOpcode::G_ANYEXT:
@@ -339,10 +335,10 @@ Optional<ValueAndVReg> llvm::getConstantVRegValWithLookThrough(
return None;
}
}
- if (!MI || !IsConstantOpcode(MI->getOpcode()))
+ if (!MI || !IsConstantOpcode(MI))
return None;
- Optional<APInt> MaybeVal = GetImmediateValue(*MI);
+ Optional<APInt> MaybeVal = getAPCstValue(MI);
if (!MaybeVal)
return None;
APInt &Val = *MaybeVal;
@@ -365,12 +361,65 @@ Optional<ValueAndVReg> llvm::getConstantVRegValWithLookThrough(
return ValueAndVReg{Val, VReg};
}
-const ConstantInt *llvm::getConstantIntVRegVal(Register VReg,
- const MachineRegisterInfo &MRI) {
- MachineInstr *MI = MRI.getVRegDef(VReg);
- if (MI->getOpcode() != TargetOpcode::G_CONSTANT)
- return nullptr;
- return MI->getOperand(1).getCImm();
+bool isIConstant(const MachineInstr *MI) {
+ if (!MI)
+ return false;
+ return MI->getOpcode() == TargetOpcode::G_CONSTANT;
+}
+
+bool isFConstant(const MachineInstr *MI) {
+ if (!MI)
+ return false;
+ return MI->getOpcode() == TargetOpcode::G_FCONSTANT;
+}
+
+bool isAnyConstant(const MachineInstr *MI) {
+ if (!MI)
+ return false;
+ unsigned Opc = MI->getOpcode();
+ return Opc == TargetOpcode::G_CONSTANT || Opc == TargetOpcode::G_FCONSTANT;
+}
+
+Optional<APInt> getCImmAsAPInt(const MachineInstr *MI) {
+ const MachineOperand &CstVal = MI->getOperand(1);
+ if (CstVal.isCImm())
+ return CstVal.getCImm()->getValue();
+ return None;
+}
+
+Optional<APInt> getCImmOrFPImmAsAPInt(const MachineInstr *MI) {
+ const MachineOperand &CstVal = MI->getOperand(1);
+ if (CstVal.isCImm())
+ return CstVal.getCImm()->getValue();
+ if (CstVal.isFPImm())
+ return CstVal.getFPImm()->getValueAPF().bitcastToAPInt();
+ return None;
+}
+
+} // end anonymous namespace
+
+Optional<ValueAndVReg> llvm::getIConstantVRegValWithLookThrough(
+ Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs) {
+ return getConstantVRegValWithLookThrough(VReg, MRI, isIConstant,
+ getCImmAsAPInt, LookThroughInstrs);
+}
+
+Optional<ValueAndVReg> llvm::getAnyConstantVRegValWithLookThrough(
+ Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs,
+ bool LookThroughAnyExt) {
+ return getConstantVRegValWithLookThrough(
+ VReg, MRI, isAnyConstant, getCImmOrFPImmAsAPInt, LookThroughInstrs,
+ LookThroughAnyExt);
+}
+
+Optional<FPValueAndVReg> llvm::getFConstantVRegValWithLookThrough(
+ Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs) {
+ auto Reg = getConstantVRegValWithLookThrough(
+ VReg, MRI, isFConstant, getCImmOrFPImmAsAPInt, LookThroughInstrs);
+ if (!Reg)
+ return None;
+ return FPValueAndVReg{getConstantFPVRegVal(Reg->VReg, MRI)->getValueAPF(),
+ Reg->VReg};
}
const ConstantFP *
@@ -437,16 +486,16 @@ APFloat llvm::getAPFloatFromSize(double Val, unsigned Size) {
Optional<APInt> llvm::ConstantFoldBinOp(unsigned Opcode, const Register Op1,
const Register Op2,
const MachineRegisterInfo &MRI) {
- auto MaybeOp2Cst = getConstantVRegVal(Op2, MRI);
+ auto MaybeOp2Cst = getAnyConstantVRegValWithLookThrough(Op2, MRI, false);
if (!MaybeOp2Cst)
return None;
- auto MaybeOp1Cst = getConstantVRegVal(Op1, MRI);
+ auto MaybeOp1Cst = getAnyConstantVRegValWithLookThrough(Op1, MRI, false);
if (!MaybeOp1Cst)
return None;
- const APInt &C1 = *MaybeOp1Cst;
- const APInt &C2 = *MaybeOp2Cst;
+ const APInt &C1 = MaybeOp1Cst->Value;
+ const APInt &C2 = MaybeOp2Cst->Value;
switch (Opcode) {
default:
break;
@@ -543,6 +592,35 @@ Optional<APFloat> llvm::ConstantFoldFPBinOp(unsigned Opcode, const Register Op1,
return None;
}
+Optional<MachineInstr *>
+llvm::ConstantFoldVectorBinop(unsigned Opcode, const Register Op1,
+ const Register Op2,
+ const MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIB) {
+ auto *SrcVec1 = getOpcodeDef<GBuildVector>(Op1, MRI);
+ if (!SrcVec1)
+ return None;
+ auto *SrcVec2 = getOpcodeDef<GBuildVector>(Op2, MRI);
+ if (!SrcVec2)
+ return None;
+
+ const LLT EltTy = MRI.getType(SrcVec1->getSourceReg(0));
+
+ SmallVector<Register, 16> FoldedElements;
+ for (unsigned Idx = 0, E = SrcVec1->getNumSources(); Idx < E; ++Idx) {
+ auto MaybeCst = ConstantFoldBinOp(Opcode, SrcVec1->getSourceReg(Idx),
+ SrcVec2->getSourceReg(Idx), MRI);
+ if (!MaybeCst)
+ return None;
+ auto FoldedCstReg = MIB.buildConstant(EltTy, *MaybeCst).getReg(0);
+ FoldedElements.emplace_back(FoldedCstReg);
+ }
+ // Create the new vector constant.
+ auto CstVec =
+ MIB.buildBuildVector(MRI.getType(SrcVec1->getReg(0)), FoldedElements);
+ return &*CstVec;
+}
+
bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
bool SNaN) {
const MachineInstr *DefMI = MRI.getVRegDef(Val);
@@ -659,7 +737,7 @@ Register llvm::getFunctionLiveInPhysReg(MachineFunction &MF,
Optional<APInt> llvm::ConstantFoldExtOp(unsigned Opcode, const Register Op1,
uint64_t Imm,
const MachineRegisterInfo &MRI) {
- auto MaybeOp1Cst = getConstantVRegVal(Op1, MRI);
+ auto MaybeOp1Cst = getIConstantVRegVal(Op1, MRI);
if (MaybeOp1Cst) {
switch (Opcode) {
default:
@@ -677,7 +755,7 @@ Optional<APFloat> llvm::ConstantFoldIntToFloat(unsigned Opcode, LLT DstTy,
Register Src,
const MachineRegisterInfo &MRI) {
assert(Opcode == TargetOpcode::G_SITOFP || Opcode == TargetOpcode::G_UITOFP);
- if (auto MaybeSrcVal = getConstantVRegVal(Src, MRI)) {
+ if (auto MaybeSrcVal = getIConstantVRegVal(Src, MRI)) {
APFloat DstVal(getFltSemanticForLLT(DstTy));
DstVal.convertFromAPInt(*MaybeSrcVal, Opcode == TargetOpcode::G_SITOFP,
APFloat::rmNearestTiesToEven);
@@ -686,6 +764,37 @@ Optional<APFloat> llvm::ConstantFoldIntToFloat(unsigned Opcode, LLT DstTy,
return None;
}
+Optional<SmallVector<unsigned>>
+llvm::ConstantFoldCTLZ(Register Src, const MachineRegisterInfo &MRI) {
+ LLT Ty = MRI.getType(Src);
+ SmallVector<unsigned> FoldedCTLZs;
+ auto tryFoldScalar = [&](Register R) -> Optional<unsigned> {
+ auto MaybeCst = getIConstantVRegVal(R, MRI);
+ if (!MaybeCst)
+ return None;
+ return MaybeCst->countLeadingZeros();
+ };
+ if (Ty.isVector()) {
+ // Try to constant fold each element.
+ auto *BV = getOpcodeDef<GBuildVector>(Src, MRI);
+ if (!BV)
+ return None;
+ for (unsigned SrcIdx = 0; SrcIdx < BV->getNumSources(); ++SrcIdx) {
+ if (auto MaybeFold = tryFoldScalar(BV->getSourceReg(SrcIdx))) {
+ FoldedCTLZs.emplace_back(*MaybeFold);
+ continue;
+ }
+ return None;
+ }
+ return FoldedCTLZs;
+ }
+ if (auto MaybeCst = tryFoldScalar(Src)) {
+ FoldedCTLZs.emplace_back(*MaybeCst);
+ return FoldedCTLZs;
+ }
+ return None;
+}
+
bool llvm::isKnownToBeAPowerOfTwo(Register Reg, const MachineRegisterInfo &MRI,
GISelKnownBits *KB) {
Optional<DefinitionAndSourceRegister> DefSrcReg =
@@ -707,7 +816,7 @@ bool llvm::isKnownToBeAPowerOfTwo(Register Reg, const MachineRegisterInfo &MRI,
// shifting the bit off the end is undefined.
// TODO: Constant splat
- if (auto ConstLHS = getConstantVRegVal(MI.getOperand(1).getReg(), MRI)) {
+ if (auto ConstLHS = getIConstantVRegVal(MI.getOperand(1).getReg(), MRI)) {
if (*ConstLHS == 1)
return true;
}
@@ -715,7 +824,7 @@ bool llvm::isKnownToBeAPowerOfTwo(Register Reg, const MachineRegisterInfo &MRI,
break;
}
case TargetOpcode::G_LSHR: {
- if (auto ConstLHS = getConstantVRegVal(MI.getOperand(1).getReg(), MRI)) {
+ if (auto ConstLHS = getIConstantVRegVal(MI.getOperand(1).getReg(), MRI)) {
if (ConstLHS->isSignMask())
return true;
}
@@ -737,7 +846,7 @@ bool llvm::isKnownToBeAPowerOfTwo(Register Reg, const MachineRegisterInfo &MRI,
// zeros is greater than the truncation amount.
const unsigned BitWidth = Ty.getScalarSizeInBits();
for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) {
- auto Const = getConstantVRegVal(MI.getOperand(I).getReg(), MRI);
+ auto Const = getIConstantVRegVal(MI.getOperand(I).getReg(), MRI);
if (!Const || !Const->zextOrTrunc(BitWidth).isPowerOf2())
return false;
}
@@ -885,53 +994,81 @@ static bool isBuildVectorOp(unsigned Opcode) {
Opcode == TargetOpcode::G_BUILD_VECTOR_TRUNC;
}
-// TODO: Handle mixed undef elements.
-static bool isBuildVectorConstantSplat(const MachineInstr &MI,
- const MachineRegisterInfo &MRI,
- int64_t SplatValue) {
- if (!isBuildVectorOp(MI.getOpcode()))
- return false;
+namespace {
- const unsigned NumOps = MI.getNumOperands();
- for (unsigned I = 1; I != NumOps; ++I) {
- Register Element = MI.getOperand(I).getReg();
- if (!mi_match(Element, MRI, m_SpecificICst(SplatValue)))
- return false;
+Optional<ValueAndVReg> getAnyConstantSplat(Register VReg,
+ const MachineRegisterInfo &MRI,
+ bool AllowUndef) {
+ MachineInstr *MI = getDefIgnoringCopies(VReg, MRI);
+ if (!MI)
+ return None;
+
+ if (!isBuildVectorOp(MI->getOpcode()))
+ return None;
+
+ Optional<ValueAndVReg> SplatValAndReg = None;
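+ // Walk the build_vector's source operands (MI->uses() skips the def).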
+ for (MachineOperand &Op : MI->uses()) {
+ Register Element = Op.getReg();
+ auto ElementValAndReg =
+ getAnyConstantVRegValWithLookThrough(Element, MRI, true, true);
+
+ // If AllowUndef, treat undef as a value that will result in a constant splat.
+ if (!ElementValAndReg) {
+ if (AllowUndef && isa<GImplicitDef>(MRI.getVRegDef(Element)))
+ continue;
+ return None;
+ }
+
+ // Record splat value
+ if (!SplatValAndReg)
+ SplatValAndReg = ElementValAndReg;
+
+ // Different constant than the one already recorded; not a constant splat.
+ if (SplatValAndReg->Value != ElementValAndReg->Value)
+ return None;
}
- return true;
+ return SplatValAndReg;
}
+bool isBuildVectorConstantSplat(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI,
+ int64_t SplatValue, bool AllowUndef) {
+ if (auto SplatValAndReg =
+ getAnyConstantSplat(MI.getOperand(0).getReg(), MRI, AllowUndef))
+ return mi_match(SplatValAndReg->VReg, MRI, m_SpecificICst(SplatValue));
+ return false;
+}
+
+} // end anonymous namespace
+
Optional<int64_t>
llvm::getBuildVectorConstantSplat(const MachineInstr &MI,
const MachineRegisterInfo &MRI) {
- if (!isBuildVectorOp(MI.getOpcode()))
- return None;
-
- const unsigned NumOps = MI.getNumOperands();
- Optional<int64_t> Scalar;
- for (unsigned I = 1; I != NumOps; ++I) {
- Register Element = MI.getOperand(I).getReg();
- int64_t ElementValue;
- if (!mi_match(Element, MRI, m_ICst(ElementValue)))
- return None;
- if (!Scalar)
- Scalar = ElementValue;
- else if (*Scalar != ElementValue)
- return None;
- }
+ if (auto SplatValAndReg =
+ getAnyConstantSplat(MI.getOperand(0).getReg(), MRI, false))
+ return getIConstantVRegSExtVal(SplatValAndReg->VReg, MRI);
+ return None;
+}
- return Scalar;
+Optional<FPValueAndVReg> llvm::getFConstantSplat(Register VReg,
+ const MachineRegisterInfo &MRI,
+ bool AllowUndef) {
+ if (auto SplatValAndReg = getAnyConstantSplat(VReg, MRI, AllowUndef))
+ return getFConstantVRegValWithLookThrough(SplatValAndReg->VReg, MRI);
+ return None;
}
bool llvm::isBuildVectorAllZeros(const MachineInstr &MI,
- const MachineRegisterInfo &MRI) {
- return isBuildVectorConstantSplat(MI, MRI, 0);
+ const MachineRegisterInfo &MRI,
+ bool AllowUndef) {
+ return isBuildVectorConstantSplat(MI, MRI, 0, AllowUndef);
}
bool llvm::isBuildVectorAllOnes(const MachineInstr &MI,
- const MachineRegisterInfo &MRI) {
- return isBuildVectorConstantSplat(MI, MRI, -1);
+ const MachineRegisterInfo &MRI,
+ bool AllowUndef) {
+ return isBuildVectorConstantSplat(MI, MRI, -1, AllowUndef);
}
Optional<RegOrConstant> llvm::getVectorSplat(const MachineInstr &MI,
@@ -948,6 +1085,36 @@ Optional<RegOrConstant> llvm::getVectorSplat(const MachineInstr &MI,
return RegOrConstant(Reg);
}
+bool llvm::isConstantOrConstantVector(MachineInstr &MI,
+ const MachineRegisterInfo &MRI) {
+ Register Def = MI.getOperand(0).getReg();
+ if (auto C = getIConstantVRegValWithLookThrough(Def, MRI))
+ return true;
+ GBuildVector *BV = dyn_cast<GBuildVector>(&MI);
+ if (!BV)
+ return false;
+ for (unsigned SrcIdx = 0; SrcIdx < BV->getNumSources(); ++SrcIdx) {
+ if (getIConstantVRegValWithLookThrough(BV->getSourceReg(SrcIdx), MRI) ||
+ getOpcodeDef<GImplicitDef>(BV->getSourceReg(SrcIdx), MRI))
+ continue;
+ return false;
+ }
+ return true;
+}
+
+Optional<APInt>
+llvm::isConstantOrConstantSplatVector(MachineInstr &MI,
+ const MachineRegisterInfo &MRI) {
+ Register Def = MI.getOperand(0).getReg();
+ if (auto C = getIConstantVRegValWithLookThrough(Def, MRI))
+ return C->Value;
+ auto MaybeCst = getBuildVectorConstantSplat(MI, MRI);
+ if (!MaybeCst)
+ return None;
+ const unsigned ScalarSize = MRI.getType(Def).getScalarSizeInBits();
+ return APInt(ScalarSize, *MaybeCst, true);
+}
+
bool llvm::matchUnaryPredicate(
const MachineRegisterInfo &MRI, Register Reg,
std::function<bool(const Constant *ConstVal)> Match, bool AllowUndefs) {
@@ -1011,3 +1178,59 @@ bool llvm::shouldOptForSize(const MachineBasicBlock &MBB,
return F.hasOptSize() || F.hasMinSize() ||
llvm::shouldOptimizeForSize(MBB.getBasicBlock(), PSI, BFI);
}
+
+/// These artifacts generally don't have any debug users because they don't
+/// directly originate from IR instructions, but instead usually come from
+/// legalization. Skipping the check for debug users improves compile time.
+/// Note that truncates or extends aren't included because they have IR
+/// counterparts which can have debug users after translation.
+static bool shouldSkipDbgValueFor(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_UNMERGE_VALUES:
+ case TargetOpcode::G_MERGE_VALUES:
+ case TargetOpcode::G_CONCAT_VECTORS:
+ case TargetOpcode::G_BUILD_VECTOR:
+ case TargetOpcode::G_EXTRACT:
+ case TargetOpcode::G_INSERT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+void llvm::saveUsesAndErase(MachineInstr &MI, MachineRegisterInfo &MRI,
+ LostDebugLocObserver *LocObserver,
+ SmallInstListTy &DeadInstChain) {
+ for (MachineOperand &Op : MI.uses()) {
+ if (Op.isReg() && Op.getReg().isVirtual())
+ DeadInstChain.insert(MRI.getVRegDef(Op.getReg()));
+ }
+ LLVM_DEBUG(dbgs() << MI << "Is dead; erasing.\n");
+ DeadInstChain.remove(&MI);
+ if (shouldSkipDbgValueFor(MI))
+ MI.eraseFromParent();
+ else
+ MI.eraseFromParentAndMarkDBGValuesForRemoval();
+ if (LocObserver)
+ LocObserver->checkpoint(false);
+}
+
+void llvm::eraseInstrs(ArrayRef<MachineInstr *> DeadInstrs,
+ MachineRegisterInfo &MRI,
+ LostDebugLocObserver *LocObserver) {
+ SmallInstListTy DeadInstChain;
+ for (MachineInstr *MI : DeadInstrs)
+ saveUsesAndErase(*MI, MRI, LocObserver, DeadInstChain);
+
+ while (!DeadInstChain.empty()) {
+ MachineInstr *Inst = DeadInstChain.pop_back_val();
+ if (!isTriviallyDead(*Inst, MRI))
+ continue;
+ saveUsesAndErase(*Inst, MRI, LocObserver, DeadInstChain);
+ }
+}
+
+void llvm::eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI,
+ LostDebugLocObserver *LocObserver) {
+ return eraseInstrs({&MI}, MRI, LocObserver);
+}
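
For context, an illustrative-only sketch (not part of this commit) of how the new splat helpers compose in a caller; the matcher name is hypothetical, and the helpers are assumed to be declared in llvm/CodeGen/GlobalISel/Utils.h as added above:

#include "llvm/CodeGen/GlobalISel/Utils.h"

// Hypothetical matcher: accept an instruction that is either an all-ones
// splat (undef lanes tolerated) or any scalar constant / constant splat.
static bool matchAllOnesOrConstantSplat(llvm::MachineInstr &MI,
                                        const llvm::MachineRegisterInfo &MRI) {
  if (llvm::isBuildVectorAllOnes(MI, MRI, /*AllowUndef=*/true))
    return true;
  // The returned APInt, if any, is the splat constant sign-extended to the
  // scalar element width (see isConstantOrConstantSplatVector above).
  return llvm::isConstantOrConstantSplatVector(MI, MRI).hasValue();
}
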
diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp
index 4316034371a5..83b8c2d0eacb 100644
--- a/llvm/lib/CodeGen/HardwareLoops.cpp
+++ b/llvm/lib/CodeGen/HardwareLoops.cpp
@@ -187,7 +187,7 @@ namespace {
const DataLayout &DL,
OptimizationRemarkEmitter *ORE) :
SE(SE), DL(DL), ORE(ORE), L(Info.L), M(L->getHeader()->getModule()),
- TripCount(Info.TripCount),
+ ExitCount(Info.ExitCount),
CountType(Info.CountType),
ExitBranch(Info.ExitBranch),
LoopDecrement(Info.LoopDecrement),
@@ -202,7 +202,7 @@ namespace {
OptimizationRemarkEmitter *ORE = nullptr;
Loop *L = nullptr;
Module *M = nullptr;
- const SCEV *TripCount = nullptr;
+ const SCEV *ExitCount = nullptr;
Type *CountType = nullptr;
BranchInst *ExitBranch = nullptr;
Value *LoopDecrement = nullptr;
@@ -296,7 +296,7 @@ bool HardwareLoops::TryConvertLoop(HardwareLoopInfo &HWLoopInfo) {
}
assert(
- (HWLoopInfo.ExitBlock && HWLoopInfo.ExitBranch && HWLoopInfo.TripCount) &&
+ (HWLoopInfo.ExitBlock && HWLoopInfo.ExitBranch && HWLoopInfo.ExitCount) &&
"Hardware Loop must have set exit info.");
BasicBlock *Preheader = L->getLoopPreheader();
@@ -365,7 +365,13 @@ static bool CanGenerateTest(Loop *L, Value *Count) {
return false;
};
- if (!IsCompareZero(ICmp, Count, 0) && !IsCompareZero(ICmp, Count, 1))
+ // Check if Count is a zext.
+ Value *CountBefZext =
+ isa<ZExtInst>(Count) ? cast<ZExtInst>(Count)->getOperand(0) : nullptr;
+
+ if (!IsCompareZero(ICmp, Count, 0) && !IsCompareZero(ICmp, Count, 1) &&
+ !IsCompareZero(ICmp, CountBefZext, 0) &&
+ !IsCompareZero(ICmp, CountBefZext, 1))
return false;
unsigned SuccIdx = ICmp->getPredicate() == ICmpInst::ICMP_NE ? 0 : 1;
@@ -381,13 +387,18 @@ Value *HardwareLoop::InitLoopCount() {
// loop counter and tests that is not zero?
SCEVExpander SCEVE(SE, DL, "loopcnt");
+ if (!ExitCount->getType()->isPointerTy() &&
+ ExitCount->getType() != CountType)
+ ExitCount = SE.getZeroExtendExpr(ExitCount, CountType);
+
+ ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType));
// If we're trying to use the 'test and set' form of the intrinsic, we need
// to replace a conditional branch that is controlling entry to the loop. It
// is likely (guaranteed?) that the preheader has an unconditional branch to
// the loop header, so also check if it has a single predecessor.
- if (SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, TripCount,
- SE.getZero(TripCount->getType()))) {
+ if (SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, ExitCount,
+ SE.getZero(ExitCount->getType()))) {
LLVM_DEBUG(dbgs() << " - Attempting to use test.set counter.\n");
UseLoopGuard |= ForceGuardLoopEntry;
} else
@@ -399,19 +410,19 @@ Value *HardwareLoop::InitLoopCount() {
BasicBlock *Predecessor = BB->getSinglePredecessor();
// If it's not safe to create a while loop then don't force it and create a
// do-while loop instead
- if (!isSafeToExpandAt(TripCount, Predecessor->getTerminator(), SE))
+ if (!isSafeToExpandAt(ExitCount, Predecessor->getTerminator(), SE))
UseLoopGuard = false;
else
BB = Predecessor;
}
- if (!isSafeToExpandAt(TripCount, BB->getTerminator(), SE)) {
- LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand TripCount " << *TripCount
- << "\n");
+ if (!isSafeToExpandAt(ExitCount, BB->getTerminator(), SE)) {
+ LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand ExitCount "
+ << *ExitCount << "\n");
return nullptr;
}
- Value *Count = SCEVE.expandCodeFor(TripCount, CountType,
+ Value *Count = SCEVE.expandCodeFor(ExitCount, CountType,
BB->getTerminator());
// FIXME: We've expanded Count where we hope to insert the counter setting
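
As a rough, stand-alone illustration (not taken from this patch) of the guard shape the extended CanGenerateTest check can now recognise, where the compared count is a zero-extension of a narrower exit count; the function name and the i32 width are purely illustrative:

#include "llvm/IR/IRBuilder.h"

// Build `icmp ne (zext iN %narrowCount to i32), 0`; the updated check looks
// through the zext when matching this compare against the expanded count.
// NarrowCount is assumed to be narrower than i32.
static llvm::Value *buildZExtGuard(llvm::IRBuilder<> &Builder,
                                   llvm::Value *NarrowCount) {
  llvm::Value *Count = Builder.CreateZExt(NarrowCount, Builder.getInt32Ty());
  return Builder.CreateICmpNE(Count, Builder.getInt32(0));
}
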
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index 71e91b445d9a..64e1f4351456 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -341,9 +341,8 @@ void InlineSpiller::collectRegsToSpill() {
if (Original == Reg)
return;
- for (MachineRegisterInfo::reg_instr_iterator
- RI = MRI.reg_instr_begin(Reg), E = MRI.reg_instr_end(); RI != E; ) {
- MachineInstr &MI = *RI++;
+ for (MachineInstr &MI :
+ llvm::make_early_inc_range(MRI.reg_instructions(Reg))) {
Register SnipReg = isFullCopyOf(MI, Reg);
if (!isSibling(SnipReg))
continue;
@@ -465,10 +464,8 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
LLVM_DEBUG(dbgs() << "Merged to stack int: " << *StackInt << '\n');
// Find all spills and copies of VNI.
- for (MachineRegisterInfo::use_instr_nodbg_iterator
- UI = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
- UI != E; ) {
- MachineInstr &MI = *UI++;
+ for (MachineInstr &MI :
+ llvm::make_early_inc_range(MRI.use_nodbg_instructions(Reg))) {
if (!MI.isCopy() && !MI.mayStore())
continue;
SlotIndex Idx = LIS.getInstructionIndex(MI);
@@ -676,11 +673,7 @@ void InlineSpiller::reMaterializeAll() {
bool anyRemat = false;
for (Register Reg : RegsToSpill) {
LiveInterval &LI = LIS.getInterval(Reg);
- for (MachineRegisterInfo::reg_bundle_iterator
- RegI = MRI.reg_bundle_begin(Reg), E = MRI.reg_bundle_end();
- RegI != E; ) {
- MachineInstr &MI = *RegI++;
-
+ for (MachineInstr &MI : llvm::make_early_inc_range(MRI.reg_bundles(Reg))) {
// Debug values are not allowed to affect codegen.
if (MI.isDebugValue())
continue;
@@ -928,6 +921,39 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>> Ops,
// Update the call site info.
if (MI->isCandidateForCallSiteEntry())
MI->getMF()->moveCallSiteInfo(MI, FoldMI);
+
+ // If we've folded a store into an instruction labelled with debug-info,
+ // record a substitution from the old operand to the memory operand. Handle
+ // the simple common case where operand 0 is the one being folded, plus when
+ // the destination operand is also a tied def. More values could be
+ // substituted / preserved with more analysis.
+ if (MI->peekDebugInstrNum() && Ops[0].second == 0) {
+ // Helper lambda.
+ auto MakeSubstitution = [this,FoldMI,MI,&Ops]() {
+ // Substitute old operand zero to the new instruction's memory operand.
+ unsigned OldOperandNum = Ops[0].second;
+ unsigned NewNum = FoldMI->getDebugInstrNum();
+ unsigned OldNum = MI->getDebugInstrNum();
+ MF.makeDebugValueSubstitution({OldNum, OldOperandNum},
+ {NewNum, MachineFunction::DebugOperandMemNumber});
+ };
+
+ const MachineOperand &Op0 = MI->getOperand(Ops[0].second);
+ if (Ops.size() == 1 && Op0.isDef()) {
+ MakeSubstitution();
+ } else if (Ops.size() == 2 && Op0.isDef() && MI->getOperand(1).isTied() &&
+ Op0.getReg() == MI->getOperand(1).getReg()) {
+ MakeSubstitution();
+ }
+ } else if (MI->peekDebugInstrNum()) {
+ // This is a debug-labelled instruction, but the operand being folded isn't
+ // at operand zero. Most likely this means it's a load being folded in.
+ // Substitute any register defs from operand zero up to the one being
+ // folded -- past that point, we don't know what the new operand indexes
+ // will be.
+ MF.substituteDebugValuesForInst(*MI, *FoldMI, Ops[0].second);
+ }
+
MI->eraseFromParent();
// Insert any new instructions other than FoldMI into the LIS maps.
@@ -1038,57 +1064,53 @@ void InlineSpiller::spillAroundUses(Register Reg) {
LiveInterval &OldLI = LIS.getInterval(Reg);
// Iterate over instructions using Reg.
- for (MachineRegisterInfo::reg_bundle_iterator
- RegI = MRI.reg_bundle_begin(Reg), E = MRI.reg_bundle_end();
- RegI != E; ) {
- MachineInstr *MI = &*(RegI++);
-
+ for (MachineInstr &MI : llvm::make_early_inc_range(MRI.reg_bundles(Reg))) {
// Debug values are not allowed to affect codegen.
- if (MI->isDebugValue()) {
+ if (MI.isDebugValue()) {
// Modify DBG_VALUE now that the value is in a spill slot.
- MachineBasicBlock *MBB = MI->getParent();
- LLVM_DEBUG(dbgs() << "Modifying debug info due to spill:\t" << *MI);
- buildDbgValueForSpill(*MBB, MI, *MI, StackSlot, Reg);
+ MachineBasicBlock *MBB = MI.getParent();
+ LLVM_DEBUG(dbgs() << "Modifying debug info due to spill:\t" << MI);
+ buildDbgValueForSpill(*MBB, &MI, MI, StackSlot, Reg);
MBB->erase(MI);
continue;
}
- assert(!MI->isDebugInstr() && "Did not expect to find a use in debug "
+ assert(!MI.isDebugInstr() && "Did not expect to find a use in debug "
"instruction that isn't a DBG_VALUE");
// Ignore copies to/from snippets. We'll delete them.
- if (SnippetCopies.count(MI))
+ if (SnippetCopies.count(&MI))
continue;
// Stack slot accesses may coalesce away.
- if (coalesceStackAccess(MI, Reg))
+ if (coalesceStackAccess(&MI, Reg))
continue;
// Analyze instruction.
SmallVector<std::pair<MachineInstr*, unsigned>, 8> Ops;
- VirtRegInfo RI = AnalyzeVirtRegInBundle(*MI, Reg, &Ops);
+ VirtRegInfo RI = AnalyzeVirtRegInBundle(MI, Reg, &Ops);
// Find the slot index where this instruction reads and writes OldLI.
// This is usually the def slot, except for tied early clobbers.
- SlotIndex Idx = LIS.getInstructionIndex(*MI).getRegSlot();
+ SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
if (VNInfo *VNI = OldLI.getVNInfoAt(Idx.getRegSlot(true)))
if (SlotIndex::isSameInstr(Idx, VNI->def))
Idx = VNI->def;
// Check for a sibling copy.
- Register SibReg = isFullCopyOf(*MI, Reg);
+ Register SibReg = isFullCopyOf(MI, Reg);
if (SibReg && isSibling(SibReg)) {
// This may actually be a copy between snippets.
if (isRegToSpill(SibReg)) {
- LLVM_DEBUG(dbgs() << "Found new snippet copy: " << *MI);
- SnippetCopies.insert(MI);
+ LLVM_DEBUG(dbgs() << "Found new snippet copy: " << MI);
+ SnippetCopies.insert(&MI);
continue;
}
if (RI.Writes) {
- if (hoistSpillInsideBB(OldLI, *MI)) {
+ if (hoistSpillInsideBB(OldLI, MI)) {
// This COPY is now dead, the value is already in the stack slot.
- MI->getOperand(0).setIsDead();
- DeadDefs.push_back(MI);
+ MI.getOperand(0).setIsDead();
+ DeadDefs.push_back(&MI);
continue;
}
} else {
@@ -1108,7 +1130,7 @@ void InlineSpiller::spillAroundUses(Register Reg) {
Register NewVReg = Edit->createFrom(Reg);
if (RI.Reads)
- insertReload(NewVReg, Idx, MI);
+ insertReload(NewVReg, Idx, &MI);
// Rewrite instruction operands.
bool hasLiveDef = false;
@@ -1123,12 +1145,12 @@ void InlineSpiller::spillAroundUses(Register Reg) {
hasLiveDef = true;
}
}
- LLVM_DEBUG(dbgs() << "\trewrite: " << Idx << '\t' << *MI << '\n');
+ LLVM_DEBUG(dbgs() << "\trewrite: " << Idx << '\t' << MI << '\n');
// FIXME: Use a second vreg if instruction has no tied ops.
if (RI.Writes)
if (hasLiveDef)
- insertSpill(NewVReg, true, MI);
+ insertSpill(NewVReg, true, &MI);
}
}
@@ -1163,10 +1185,8 @@ void InlineSpiller::spillAll() {
// Finally delete the SnippetCopies.
for (Register Reg : RegsToSpill) {
- for (MachineRegisterInfo::reg_instr_iterator
- RI = MRI.reg_instr_begin(Reg), E = MRI.reg_instr_end();
- RI != E; ) {
- MachineInstr &MI = *(RI++);
+ for (MachineInstr &MI :
+ llvm::make_early_inc_range(MRI.reg_instructions(Reg))) {
assert(SnippetCopies.count(&MI) && "Remaining use wasn't a snippet copy");
// FIXME: Do this with a LiveRangeEdit callback.
LIS.RemoveMachineInstrFromMaps(MI);
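
A minimal, self-contained sketch (not from this commit) of the iteration idiom the loops above switch to: make_early_inc_range advances the underlying iterator before the loop body runs, so the current instruction can be erased without invalidating the traversal. The function name is hypothetical:

#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

// Erase every non-debug instruction that uses Reg while iterating its uses.
static void eraseAllUsers(llvm::MachineRegisterInfo &MRI, llvm::Register Reg) {
  for (llvm::MachineInstr &MI :
       llvm::make_early_inc_range(MRI.use_nodbg_instructions(Reg)))
    MI.eraseFromParent();
}
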
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 24a57cc21c57..5a20580e5479 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -95,7 +95,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.setPreservesCFG();
}
private:
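
The switch from addPreserved<DominatorTreeWrapperPass>() to setPreservesCFG() declares that the pass leaves the CFG untouched, which lets the legacy pass manager preserve all CFG-only analyses (the dominator tree among them) instead of one named pass. For readability, the consolidated hook after this hunk applies:

void getAnalysisUsage(AnalysisUsage &AU) const override {
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.setPreservesCFG();
}
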
diff --git a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
index 71bfb1d87d66..9fabcfb1f326 100644
--- a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
+++ b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
@@ -308,12 +308,12 @@ public:
}
// Multiplying by one is a no-op.
- if (C.isOneValue()) {
+ if (C.isOne()) {
return *this;
}
// Multiplying by zero removes the coefficient B and defines all bits.
- if (C.isNullValue()) {
+ if (C.isZero()) {
ErrorMSBs = 0;
deleteB();
}
@@ -464,7 +464,7 @@ public:
return *this;
}
- if (C.isNullValue())
+ if (C.isZero())
return *this;
// Test if the result will be zero
@@ -571,7 +571,7 @@ public:
bool isProvenEqualTo(const Polynomial &o) {
// Subtract both polynomials and test if it is fully defined and zero.
Polynomial r = *this - o;
- return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isNullValue());
+ return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isZero());
}
/// Print the polynomial into a stream.
@@ -1131,6 +1131,7 @@ bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
InstructionCost InterleavedCost;
InstructionCost InstructionCost = 0;
+ const TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency;
// Get the interleave factor
unsigned Factor = InterleavedLoad.size();
@@ -1158,8 +1159,7 @@ bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
// be expected. Also sum the cost of the Instructions being left dead.
for (auto &I : Is) {
// Compute the old cost
- InstructionCost +=
- TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency);
+ InstructionCost += TTI.getInstructionCost(I, CostKind);
// The final SVIs are allowed not to be dead, all uses will be replaced
if (SVIs.find(I) != SVIs.end())
@@ -1212,7 +1212,7 @@ bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
Indices.push_back(i);
InterleavedCost = TTI.getInterleavedMemoryOpCost(
Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlign(),
- InsertionPoint->getPointerAddressSpace());
+ InsertionPoint->getPointerAddressSpace(), CostKind);
if (InterleavedCost >= InstructionCost) {
return false;
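
A tiny stand-alone sketch (independent of this file) of the APInt predicate renames adopted in this hunk; the new names are direct replacements for the older *Value() forms:

#include "llvm/ADT/APInt.h"

// Returns true for the constants the hunks above special-case.
static bool isZeroOrOne(const llvm::APInt &C) {
  return C.isZero() || C.isOne();
}
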
diff --git a/llvm/lib/CodeGen/IntrinsicLowering.cpp b/llvm/lib/CodeGen/IntrinsicLowering.cpp
index 55089d3b90d0..808a79d9792a 100644
--- a/llvm/lib/CodeGen/IntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -453,8 +453,7 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
bool IntrinsicLowering::LowerToByteSwap(CallInst *CI) {
// Verify this is a simple bswap.
- if (CI->getNumArgOperands() != 1 ||
- CI->getType() != CI->getArgOperand(0)->getType() ||
+ if (CI->arg_size() != 1 || CI->getType() != CI->getArgOperand(0)->getType() ||
!CI->getType()->isIntegerTy())
return false;
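
A brief, self-contained sketch (not part of this commit) of the accessor rename used above: CallBase::arg_size() counts only the call's argument operands, excluding the callee and any operand bundles, which is the same quantity getNumArgOperands() reported:

#include "llvm/IR/Instructions.h"

// True when the call site passes exactly one argument.
static bool hasSingleArgument(const llvm::CallInst *CI) {
  return CI->arg_size() == 1;
}
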
diff --git a/llvm/lib/CodeGen/LLVMTargetMachine.cpp b/llvm/lib/CodeGen/LLVMTargetMachine.cpp
index 37c0b44ea2b2..0d3685d4141c 100644
--- a/llvm/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/llvm/lib/CodeGen/LLVMTargetMachine.cpp
@@ -25,10 +25,10 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
index dc9907058340..a4eb3094612b 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
@@ -11,114 +11,48 @@
/// LiveDebugValues.cpp and VarLocBasedImpl.cpp for more information.
///
/// This pass propagates variable locations between basic blocks, resolving
-/// control flow conflicts between them. The problem is much like SSA
-/// construction, where each DBG_VALUE instruction assigns the *value* that
-/// a variable has, and every instruction where the variable is in scope uses
-/// that variable. The resulting map of instruction-to-value is then translated
-/// into a register (or spill) location for each variable over each instruction.
+/// control flow conflicts between them. The problem is SSA construction, where
+/// each debug instruction assigns the *value* that a variable has, and every
+/// instruction where the variable is in scope uses that variable. The resulting
+/// map of instruction-to-value is then translated into a register (or spill)
+/// location for each variable over each instruction.
///
-/// This pass determines which DBG_VALUE dominates which instructions, or if
-/// none do, where values must be merged (like PHI nodes). The added
-/// complication is that because codegen has already finished, a PHI node may
-/// be needed for a variable location to be correct, but no register or spill
-/// slot merges the necessary values. In these circumstances, the variable
-/// location is dropped.
+/// The primary difference from normal SSA construction is that we cannot
+/// _create_ PHI values that contain variable values. CodeGen has already
+/// completed, and we can't alter it just to make debug-info complete. Thus:
+/// we can identify function positions where we would like a PHI value for a
+/// variable, but must search the MachineFunction to see whether such a PHI is
+/// available. If no such PHI exists, the variable location must be dropped.
///
-/// What makes this analysis non-trivial is loops: we cannot tell in advance
-/// whether a variable location is live throughout a loop, or whether its
-/// location is clobbered (or redefined by another DBG_VALUE), without
-/// exploring all the way through.
-///
-/// To make this simpler we perform two kinds of analysis. First, we identify
+/// To achieve this, we perform two kinds of analysis. First, we identify
/// every value defined by every instruction (ignoring those that only move
-/// another value), then compute a map of which values are available for each
-/// instruction. This is stronger than a reaching-def analysis, as we create
-/// PHI values where other values merge.
-///
-/// Secondly, for each variable, we effectively re-construct SSA using each
-/// DBG_VALUE as a def. The DBG_VALUEs read a value-number computed by the
-/// first analysis from the location they refer to. We can then compute the
-/// dominance frontiers of where a variable has a value, and create PHI nodes
-/// where they merge.
-/// This isn't precisely SSA-construction though, because the function shape
-/// is pre-defined. If a variable location requires a PHI node, but no
-/// PHI for the relevant values is present in the function (as computed by the
-/// first analysis), the location must be dropped.
-///
-/// Once both are complete, we can pass back over all instructions knowing:
-/// * What _value_ each variable should contain, either defined by an
-/// instruction or where control flow merges
-/// * What the location of that value is (if any).
-/// Allowing us to create appropriate live-in DBG_VALUEs, and DBG_VALUEs when
-/// a value moves location. After this pass runs, all variable locations within
-/// a block should be specified by DBG_VALUEs within that block, allowing
-/// DbgEntityHistoryCalculator to focus on individual blocks.
-///
-/// This pass is able to go fast because the size of the first
-/// reaching-definition analysis is proportional to the working-set size of
-/// the function, which the compiler tries to keep small. (It's also
-/// proportional to the number of blocks). Additionally, we repeatedly perform
-/// the second reaching-definition analysis with only the variables and blocks
-/// in a single lexical scope, exploiting their locality.
-///
-/// Determining where PHIs happen is trickier with this approach, and it comes
-/// to a head in the major problem for LiveDebugValues: is a value live-through
-/// a loop, or not? Your garden-variety dataflow analysis aims to build a set of
-/// facts about a function, however this analysis needs to generate new value
-/// numbers at joins.
-///
-/// To do this, consider a lattice of all definition values, from instructions
-/// and from PHIs. Each PHI is characterised by the RPO number of the block it
-/// occurs in. Each value pair A, B can be ordered by RPO(A) < RPO(B):
-/// with non-PHI values at the top, and any PHI value in the last block (by RPO
-/// order) at the bottom.
-///
-/// (Awkwardly: lower-down-the _lattice_ means a greater RPO _number_. Below,
-/// "rank" always refers to the former).
-///
-/// At any join, for each register, we consider:
-/// * All incoming values, and
-/// * The PREVIOUS live-in value at this join.
-/// If all incoming values agree: that's the live-in value. If they do not, the
-/// incoming values are ranked according to the partial order, and the NEXT
-/// LOWEST rank after the PREVIOUS live-in value is picked (multiple values of
-/// the same rank are ignored as conflicting). If there are no candidate values,
-/// or if the rank of the live-in would be lower than the rank of the current
-/// blocks PHIs, create a new PHI value.
-///
-/// Intuitively: if it's not immediately obvious what value a join should result
-/// in, we iteratively descend from instruction-definitions down through PHI
-/// values, getting closer to the current block each time. If the current block
-/// is a loop head, this ordering is effectively searching outer levels of
-/// loops, to find a value that's live-through the current loop.
+/// another value), then re-compute an SSA-form representation of the
+/// MachineFunction, using value propagation to eliminate any un-necessary
+/// PHI values. This gives us a map of every value computed in the function,
+/// and its location within the register file / stack.
///
-/// If there is no value that's live-through this loop, a PHI is created for
-/// this location instead. We can't use a lower-ranked PHI because by definition
-/// it doesn't dominate the current block. We can't create a PHI value any
-/// earlier, because we risk creating a PHI value at a location where values do
-/// not in fact merge, thus misrepresenting the truth, and not making the true
-/// live-through value for variable locations.
+/// Secondly, for each variable we perform the same analysis, where each debug
+/// instruction is considered a def, and every instruction where the variable
+/// is in lexical scope counts as a use. Value propagation is used again to
+/// eliminate any unnecessary PHIs. This gives us a map of each variable to
+/// the value it should have in a block.
///
-/// This algorithm applies to both calculating the availability of values in
-/// the first analysis, and the location of variables in the second. However
-/// for the second we add an extra dimension of pain: creating a variable
-/// location PHI is only valid if, for each incoming edge,
-/// * There is a value for the variable on the incoming edge, and
-/// * All the edges have that value in the same register.
-/// Or put another way: we can only create a variable-location PHI if there is
-/// a matching machine-location PHI, each input to which is the variables value
-/// in the predecessor block.
+/// Once both are complete, we have two maps for each block:
+/// * Variables to the values they should have,
+/// * Values to the register / spill slot they are located in.
+/// After which we can marry up variable values with a location, and emit
+/// DBG_VALUE instructions specifying those locations. Variable locations may
+/// be dropped in this process due to the desired variable value not being
+/// resident in any machine location, or because there is no PHI value in any
+/// location that accurately represents the desired value. The building of
+/// location lists for each block is left to DbgEntityHistoryCalculator.
///
-/// To accommodate this difference, each point on the lattice is split in
-/// two: a "proposed" PHI and "definite" PHI. Any PHI that can immediately
-/// have a location determined are "definite" PHIs, and no further work is
-/// needed. Otherwise, a location that all non-backedge predecessors agree
-/// on is picked and propagated as a "proposed" PHI value. If that PHI value
-/// is truly live-through, it'll appear on the loop backedges on the next
-/// dataflow iteration, after which the block live-in moves to be a "definite"
-/// PHI. If it's not truly live-through, the variable value will be downgraded
-/// further as we explore the lattice, or remains "proposed" and is considered
-/// invalid once dataflow completes.
+/// This pass is kept efficient because the size of the first SSA problem
+/// is proportional to the working-set size of the function, which the compiler
+/// tries to keep small. (It's also proportional to the number of blocks).
+/// Additionally, we repeatedly perform the second SSA problem analysis with
+/// only the variables and blocks in a single lexical scope, exploiting their
+/// locality.
///
/// ### Terminology
///
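
To make the per-block join described in this rewritten header comment concrete, here is a purely illustrative sketch under assumed map shapes; the names joinVariableLocations, VariableValues and ValueToLoc are hypothetical, and the real logic lives in TransferTracker further down this file:

#include <map>

// Marry each variable's desired value to a machine location for one block,
// dropping any variable whose value is not resident in any location.
template <typename VarT, typename ValueT, typename LocT, typename EmitFn>
static void joinVariableLocations(const std::map<VarT, ValueT> &VariableValues,
                                  const std::map<ValueT, LocT> &ValueToLoc,
                                  EmitFn EmitDbgValue) {
  for (const auto &P : VariableValues) {
    auto It = ValueToLoc.find(P.second);
    if (It == ValueToLoc.end())
      continue; // No register or spill slot holds the value: drop the location.
    EmitDbgValue(P.first, It->second); // Emit a live-in DBG_VALUE here.
  }
}
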
@@ -128,15 +62,13 @@
/// contain the appropriate variable value. A value that is a PHI node is
/// occasionally called an mphi.
///
-/// The first dataflow problem is the "machine value location" problem,
+/// The first SSA problem is the "machine value location" problem,
/// because we're determining which machine locations contain which values.
/// The "locations" are constant: what's unknown is what value they contain.
///
-/// The second dataflow problem (the one for variables) is the "variable value
+/// The second SSA problem (the one for variables) is the "variable value
/// problem", because it's determining what values a variable has, rather than
-/// what location those values are placed in. Unfortunately, it's not that
-/// simple, because producing a PHI value always involves picking a location.
-/// This is an imperfection that we just have to accept, at least for now.
+/// what location those values are placed in.
///
/// TODO:
/// Overlapping fragments
@@ -153,9 +85,10 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/UniqueVector.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
#include "llvm/CodeGen/LexicalScopes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -192,16 +125,18 @@
#include <cassert>
#include <cstdint>
#include <functional>
+#include <limits.h>
+#include <limits>
#include <queue>
#include <tuple>
#include <utility>
#include <vector>
-#include <limits.h>
-#include <limits>
+#include "InstrRefBasedImpl.h"
#include "LiveDebugValues.h"
using namespace llvm;
+using namespace LiveDebugValues;
// SSAUpdaterImpl sets DEBUG_TYPE, change it.
#undef DEBUG_TYPE
@@ -213,730 +148,6 @@ static cl::opt<bool> EmulateOldLDV("emulate-old-livedebugvalues", cl::Hidden,
cl::desc("Act like old LiveDebugValues did"),
cl::init(false));
-namespace {
-
-// The location at which a spilled value resides. It consists of a register and
-// an offset.
-struct SpillLoc {
- unsigned SpillBase;
- StackOffset SpillOffset;
- bool operator==(const SpillLoc &Other) const {
- return std::make_pair(SpillBase, SpillOffset) ==
- std::make_pair(Other.SpillBase, Other.SpillOffset);
- }
- bool operator<(const SpillLoc &Other) const {
- return std::make_tuple(SpillBase, SpillOffset.getFixed(),
- SpillOffset.getScalable()) <
- std::make_tuple(Other.SpillBase, Other.SpillOffset.getFixed(),
- Other.SpillOffset.getScalable());
- }
-};
-
-class LocIdx {
- unsigned Location;
-
- // Default constructor is private, initializing to an illegal location number.
- // Use only for "not an entry" elements in IndexedMaps.
- LocIdx() : Location(UINT_MAX) { }
-
-public:
- #define NUM_LOC_BITS 24
- LocIdx(unsigned L) : Location(L) {
- assert(L < (1 << NUM_LOC_BITS) && "Machine locations must fit in 24 bits");
- }
-
- static LocIdx MakeIllegalLoc() {
- return LocIdx();
- }
-
- bool isIllegal() const {
- return Location == UINT_MAX;
- }
-
- uint64_t asU64() const {
- return Location;
- }
-
- bool operator==(unsigned L) const {
- return Location == L;
- }
-
- bool operator==(const LocIdx &L) const {
- return Location == L.Location;
- }
-
- bool operator!=(unsigned L) const {
- return !(*this == L);
- }
-
- bool operator!=(const LocIdx &L) const {
- return !(*this == L);
- }
-
- bool operator<(const LocIdx &Other) const {
- return Location < Other.Location;
- }
-};
-
-class LocIdxToIndexFunctor {
-public:
- using argument_type = LocIdx;
- unsigned operator()(const LocIdx &L) const {
- return L.asU64();
- }
-};
-
-/// Unique identifier for a value defined by an instruction, as a value type.
-/// Casts back and forth to a uint64_t. Probably replacable with something less
-/// bit-constrained. Each value identifies the instruction and machine location
-/// where the value is defined, although there may be no corresponding machine
-/// operand for it (ex: regmasks clobbering values). The instructions are
-/// one-based, and definitions that are PHIs have instruction number zero.
-///
-/// The obvious limits of a 1M block function or 1M instruction blocks are
-/// problematic; but by that point we should probably have bailed out of
-/// trying to analyse the function.
-class ValueIDNum {
- uint64_t BlockNo : 20; /// The block where the def happens.
- uint64_t InstNo : 20; /// The Instruction where the def happens.
- /// One based, is distance from start of block.
- uint64_t LocNo : NUM_LOC_BITS; /// The machine location where the def happens.
-
-public:
- // XXX -- temporarily enabled while the live-in / live-out tables are moved
- // to something more type-y
- ValueIDNum() : BlockNo(0xFFFFF),
- InstNo(0xFFFFF),
- LocNo(0xFFFFFF) { }
-
- ValueIDNum(uint64_t Block, uint64_t Inst, uint64_t Loc)
- : BlockNo(Block), InstNo(Inst), LocNo(Loc) { }
-
- ValueIDNum(uint64_t Block, uint64_t Inst, LocIdx Loc)
- : BlockNo(Block), InstNo(Inst), LocNo(Loc.asU64()) { }
-
- uint64_t getBlock() const { return BlockNo; }
- uint64_t getInst() const { return InstNo; }
- uint64_t getLoc() const { return LocNo; }
- bool isPHI() const { return InstNo == 0; }
-
- uint64_t asU64() const {
- uint64_t TmpBlock = BlockNo;
- uint64_t TmpInst = InstNo;
- return TmpBlock << 44ull | TmpInst << NUM_LOC_BITS | LocNo;
- }
-
- static ValueIDNum fromU64(uint64_t v) {
- uint64_t L = (v & 0x3FFF);
- return {v >> 44ull, ((v >> NUM_LOC_BITS) & 0xFFFFF), L};
- }
-
- bool operator<(const ValueIDNum &Other) const {
- return asU64() < Other.asU64();
- }
-
- bool operator==(const ValueIDNum &Other) const {
- return std::tie(BlockNo, InstNo, LocNo) ==
- std::tie(Other.BlockNo, Other.InstNo, Other.LocNo);
- }
-
- bool operator!=(const ValueIDNum &Other) const { return !(*this == Other); }
-
- std::string asString(const std::string &mlocname) const {
- return Twine("Value{bb: ")
- .concat(Twine(BlockNo).concat(
- Twine(", inst: ")
- .concat((InstNo ? Twine(InstNo) : Twine("live-in"))
- .concat(Twine(", loc: ").concat(Twine(mlocname)))
- .concat(Twine("}")))))
- .str();
- }
-
- static ValueIDNum EmptyValue;
-};
-
-} // end anonymous namespace
-
-namespace {
-
-/// Meta qualifiers for a value. Pair of whatever expression is used to qualify
-/// the the value, and Boolean of whether or not it's indirect.
-class DbgValueProperties {
-public:
- DbgValueProperties(const DIExpression *DIExpr, bool Indirect)
- : DIExpr(DIExpr), Indirect(Indirect) {}
-
- /// Extract properties from an existing DBG_VALUE instruction.
- DbgValueProperties(const MachineInstr &MI) {
- assert(MI.isDebugValue());
- DIExpr = MI.getDebugExpression();
- Indirect = MI.getOperand(1).isImm();
- }
-
- bool operator==(const DbgValueProperties &Other) const {
- return std::tie(DIExpr, Indirect) == std::tie(Other.DIExpr, Other.Indirect);
- }
-
- bool operator!=(const DbgValueProperties &Other) const {
- return !(*this == Other);
- }
-
- const DIExpression *DIExpr;
- bool Indirect;
-};
-
-/// Tracker for what values are in machine locations. Listens to the Things
-/// being Done by various instructions, and maintains a table of what machine
-/// locations have what values (as defined by a ValueIDNum).
-///
-/// There are potentially a much larger number of machine locations on the
-/// target machine than the actual working-set size of the function. On x86 for
-/// example, we're extremely unlikely to want to track values through control
-/// or debug registers. To avoid doing so, MLocTracker has several layers of
-/// indirection going on, with two kinds of ``location'':
-/// * A LocID uniquely identifies a register or spill location, with a
-/// predictable value.
-/// * A LocIdx is a key (in the database sense) for a LocID and a ValueIDNum.
-/// Whenever a location is def'd or used by a MachineInstr, we automagically
-/// create a new LocIdx for a location, but not otherwise. This ensures we only
-/// account for locations that are actually used or defined. The cost is another
-/// vector lookup (of LocID -> LocIdx) over any other implementation. This is
-/// fairly cheap, and the compiler tries to reduce the working-set at any one
-/// time in the function anyway.
-///
-/// Register mask operands completely blow this out of the water; I've just
-/// piled hacks on top of hacks to get around that.
-class MLocTracker {
-public:
- MachineFunction &MF;
- const TargetInstrInfo &TII;
- const TargetRegisterInfo &TRI;
- const TargetLowering &TLI;
-
- /// IndexedMap type, mapping from LocIdx to ValueIDNum.
- using LocToValueType = IndexedMap<ValueIDNum, LocIdxToIndexFunctor>;
-
- /// Map of LocIdxes to the ValueIDNums that they store. This is tightly
- /// packed, entries only exist for locations that are being tracked.
- LocToValueType LocIdxToIDNum;
-
- /// "Map" of machine location IDs (i.e., raw register or spill number) to the
- /// LocIdx key / number for that location. There are always at least as many
- /// as the number of registers on the target -- if the value in the register
- /// is not being tracked, then the LocIdx value will be zero. New entries are
- /// appended if a new spill slot begins being tracked.
- /// This, and the corresponding reverse map persist for the analysis of the
- /// whole function, and is necessarying for decoding various vectors of
- /// values.
- std::vector<LocIdx> LocIDToLocIdx;
-
- /// Inverse map of LocIDToLocIdx.
- IndexedMap<unsigned, LocIdxToIndexFunctor> LocIdxToLocID;
-
- /// Unique-ification of spill slots. Used to number them -- their LocID
- /// number is the index in SpillLocs minus one plus NumRegs.
- UniqueVector<SpillLoc> SpillLocs;
-
- // If we discover a new machine location, assign it an mphi with this
- // block number.
- unsigned CurBB;
-
- /// Cached local copy of the number of registers the target has.
- unsigned NumRegs;
-
- /// Collection of register mask operands that have been observed. Second part
- /// of pair indicates the instruction that they happened in. Used to
- /// reconstruct where defs happened if we start tracking a location later
- /// on.
- SmallVector<std::pair<const MachineOperand *, unsigned>, 32> Masks;
-
- /// Iterator for locations and the values they contain. Dereferencing
- /// produces a struct/pair containing the LocIdx key for this location,
- /// and a reference to the value currently stored. Simplifies the process
- /// of seeking a particular location.
- class MLocIterator {
- LocToValueType &ValueMap;
- LocIdx Idx;
-
- public:
- class value_type {
- public:
- value_type(LocIdx Idx, ValueIDNum &Value) : Idx(Idx), Value(Value) { }
- const LocIdx Idx; /// Read-only index of this location.
- ValueIDNum &Value; /// Reference to the stored value at this location.
- };
-
- MLocIterator(LocToValueType &ValueMap, LocIdx Idx)
- : ValueMap(ValueMap), Idx(Idx) { }
-
- bool operator==(const MLocIterator &Other) const {
- assert(&ValueMap == &Other.ValueMap);
- return Idx == Other.Idx;
- }
-
- bool operator!=(const MLocIterator &Other) const {
- return !(*this == Other);
- }
-
- void operator++() {
- Idx = LocIdx(Idx.asU64() + 1);
- }
-
- value_type operator*() {
- return value_type(Idx, ValueMap[LocIdx(Idx)]);
- }
- };
-
- MLocTracker(MachineFunction &MF, const TargetInstrInfo &TII,
- const TargetRegisterInfo &TRI, const TargetLowering &TLI)
- : MF(MF), TII(TII), TRI(TRI), TLI(TLI),
- LocIdxToIDNum(ValueIDNum::EmptyValue),
- LocIdxToLocID(0) {
- NumRegs = TRI.getNumRegs();
- reset();
- LocIDToLocIdx.resize(NumRegs, LocIdx::MakeIllegalLoc());
- assert(NumRegs < (1u << NUM_LOC_BITS)); // Detect bit packing failure
-
- // Always track SP. This avoids the implicit clobbering caused by regmasks
- // from affectings its values. (LiveDebugValues disbelieves calls and
- // regmasks that claim to clobber SP).
- Register SP = TLI.getStackPointerRegisterToSaveRestore();
- if (SP) {
- unsigned ID = getLocID(SP, false);
- (void)lookupOrTrackRegister(ID);
- }
- }
-
- /// Produce location ID number for indexing LocIDToLocIdx. Takes the register
- /// or spill number, and flag for whether it's a spill or not.
- unsigned getLocID(Register RegOrSpill, bool isSpill) {
- return (isSpill) ? RegOrSpill.id() + NumRegs - 1 : RegOrSpill.id();
- }
-
- /// Accessor for reading the value at Idx.
- ValueIDNum getNumAtPos(LocIdx Idx) const {
- assert(Idx.asU64() < LocIdxToIDNum.size());
- return LocIdxToIDNum[Idx];
- }
-
- unsigned getNumLocs(void) const { return LocIdxToIDNum.size(); }
-
- /// Reset all locations to contain a PHI value at the designated block. Used
- /// sometimes for actual PHI values, othertimes to indicate the block entry
- /// value (before any more information is known).
- void setMPhis(unsigned NewCurBB) {
- CurBB = NewCurBB;
- for (auto Location : locations())
- Location.Value = {CurBB, 0, Location.Idx};
- }
-
- /// Load values for each location from array of ValueIDNums. Take current
- /// bbnum just in case we read a value from a hitherto untouched register.
- void loadFromArray(ValueIDNum *Locs, unsigned NewCurBB) {
- CurBB = NewCurBB;
- // Iterate over all tracked locations, and load each locations live-in
- // value into our local index.
- for (auto Location : locations())
- Location.Value = Locs[Location.Idx.asU64()];
- }
-
- /// Wipe any un-necessary location records after traversing a block.
- void reset(void) {
- // We could reset all the location values too; however either loadFromArray
- // or setMPhis should be called before this object is re-used. Just
- // clear Masks, they're definitely not needed.
- Masks.clear();
- }
-
- /// Clear all data. Destroys the LocID <=> LocIdx map, which makes most of
- /// the information in this pass uninterpretable.
- void clear(void) {
- reset();
- LocIDToLocIdx.clear();
- LocIdxToLocID.clear();
- LocIdxToIDNum.clear();
- //SpillLocs.reset(); XXX UniqueVector::reset assumes a SpillLoc casts from 0
- SpillLocs = decltype(SpillLocs)();
-
- LocIDToLocIdx.resize(NumRegs, LocIdx::MakeIllegalLoc());
- }
-
- /// Set a locaiton to a certain value.
- void setMLoc(LocIdx L, ValueIDNum Num) {
- assert(L.asU64() < LocIdxToIDNum.size());
- LocIdxToIDNum[L] = Num;
- }
-
- /// Create a LocIdx for an untracked register ID. Initialize it to either an
- /// mphi value representing a live-in, or a recent register mask clobber.
- LocIdx trackRegister(unsigned ID) {
- assert(ID != 0);
- LocIdx NewIdx = LocIdx(LocIdxToIDNum.size());
- LocIdxToIDNum.grow(NewIdx);
- LocIdxToLocID.grow(NewIdx);
-
- // Default: it's an mphi.
- ValueIDNum ValNum = {CurBB, 0, NewIdx};
- // Was this reg ever touched by a regmask?
- for (const auto &MaskPair : reverse(Masks)) {
- if (MaskPair.first->clobbersPhysReg(ID)) {
- // There was an earlier def we skipped.
- ValNum = {CurBB, MaskPair.second, NewIdx};
- break;
- }
- }
-
- LocIdxToIDNum[NewIdx] = ValNum;
- LocIdxToLocID[NewIdx] = ID;
- return NewIdx;
- }
-
- LocIdx lookupOrTrackRegister(unsigned ID) {
- LocIdx &Index = LocIDToLocIdx[ID];
- if (Index.isIllegal())
- Index = trackRegister(ID);
- return Index;
- }
-
- /// Record a definition of the specified register at the given block / inst.
- /// This doesn't take a ValueIDNum, because the definition and its location
- /// are synonymous.
- void defReg(Register R, unsigned BB, unsigned Inst) {
- unsigned ID = getLocID(R, false);
- LocIdx Idx = lookupOrTrackRegister(ID);
- ValueIDNum ValueID = {BB, Inst, Idx};
- LocIdxToIDNum[Idx] = ValueID;
- }
-
- /// Set a register to a value number. To be used if the value number is
- /// known in advance.
- void setReg(Register R, ValueIDNum ValueID) {
- unsigned ID = getLocID(R, false);
- LocIdx Idx = lookupOrTrackRegister(ID);
- LocIdxToIDNum[Idx] = ValueID;
- }
-
- ValueIDNum readReg(Register R) {
- unsigned ID = getLocID(R, false);
- LocIdx Idx = lookupOrTrackRegister(ID);
- return LocIdxToIDNum[Idx];
- }
-
- /// Reset a register value to zero / empty. Needed to replicate the
- /// VarLoc implementation where a copy to/from a register effectively
- /// clears the contents of the source register. (Values can only have one
- /// machine location in VarLocBasedImpl).
- void wipeRegister(Register R) {
- unsigned ID = getLocID(R, false);
- LocIdx Idx = LocIDToLocIdx[ID];
- LocIdxToIDNum[Idx] = ValueIDNum::EmptyValue;
- }
-
- /// Determine the LocIdx of an existing register.
- LocIdx getRegMLoc(Register R) {
- unsigned ID = getLocID(R, false);
- return LocIDToLocIdx[ID];
- }
-
- /// Record a RegMask operand being executed. Defs any register we currently
- /// track, stores a pointer to the mask in case we have to account for it
- /// later.
- void writeRegMask(const MachineOperand *MO, unsigned CurBB, unsigned InstID) {
- // Ensure SP exists, so that we don't override it later.
- Register SP = TLI.getStackPointerRegisterToSaveRestore();
-
- // Def any register we track have that isn't preserved. The regmask
- // terminates the liveness of a register, meaning its value can't be
- // relied upon -- we represent this by giving it a new value.
- for (auto Location : locations()) {
- unsigned ID = LocIdxToLocID[Location.Idx];
- // Don't clobber SP, even if the mask says it's clobbered.
- if (ID < NumRegs && ID != SP && MO->clobbersPhysReg(ID))
- defReg(ID, CurBB, InstID);
- }
- Masks.push_back(std::make_pair(MO, InstID));
- }
-
- /// Find LocIdx for SpillLoc \p L, creating a new one if it's not tracked.
- LocIdx getOrTrackSpillLoc(SpillLoc L) {
- unsigned SpillID = SpillLocs.idFor(L);
- if (SpillID == 0) {
- SpillID = SpillLocs.insert(L);
- unsigned L = getLocID(SpillID, true);
- LocIdx Idx = LocIdx(LocIdxToIDNum.size()); // New idx
- LocIdxToIDNum.grow(Idx);
- LocIdxToLocID.grow(Idx);
- LocIDToLocIdx.push_back(Idx);
- LocIdxToLocID[Idx] = L;
- return Idx;
- } else {
- unsigned L = getLocID(SpillID, true);
- LocIdx Idx = LocIDToLocIdx[L];
- return Idx;
- }
- }
-
- /// Set the value stored in a spill slot.
- void setSpill(SpillLoc L, ValueIDNum ValueID) {
- LocIdx Idx = getOrTrackSpillLoc(L);
- LocIdxToIDNum[Idx] = ValueID;
- }
-
- /// Read whatever value is in a spill slot, or None if it isn't tracked.
- Optional<ValueIDNum> readSpill(SpillLoc L) {
- unsigned SpillID = SpillLocs.idFor(L);
- if (SpillID == 0)
- return None;
-
- unsigned LocID = getLocID(SpillID, true);
- LocIdx Idx = LocIDToLocIdx[LocID];
- return LocIdxToIDNum[Idx];
- }
-
- /// Determine the LocIdx of a spill slot. Return None if it previously
- /// hasn't had a value assigned.
- Optional<LocIdx> getSpillMLoc(SpillLoc L) {
- unsigned SpillID = SpillLocs.idFor(L);
- if (SpillID == 0)
- return None;
- unsigned LocNo = getLocID(SpillID, true);
- return LocIDToLocIdx[LocNo];
- }
-
- /// Return true if Idx is a spill machine location.
- bool isSpill(LocIdx Idx) const {
- return LocIdxToLocID[Idx] >= NumRegs;
- }
-
- MLocIterator begin() {
- return MLocIterator(LocIdxToIDNum, 0);
- }
-
- MLocIterator end() {
- return MLocIterator(LocIdxToIDNum, LocIdxToIDNum.size());
- }
-
- /// Return a range over all locations currently tracked.
- iterator_range<MLocIterator> locations() {
- return llvm::make_range(begin(), end());
- }
-
- std::string LocIdxToName(LocIdx Idx) const {
- unsigned ID = LocIdxToLocID[Idx];
- if (ID >= NumRegs)
- return Twine("slot ").concat(Twine(ID - NumRegs)).str();
- else
- return TRI.getRegAsmName(ID).str();
- }
-
- std::string IDAsString(const ValueIDNum &Num) const {
- std::string DefName = LocIdxToName(Num.getLoc());
- return Num.asString(DefName);
- }
-
- LLVM_DUMP_METHOD
- void dump() {
- for (auto Location : locations()) {
- std::string MLocName = LocIdxToName(Location.Value.getLoc());
- std::string DefName = Location.Value.asString(MLocName);
- dbgs() << LocIdxToName(Location.Idx) << " --> " << DefName << "\n";
- }
- }
-
- LLVM_DUMP_METHOD
- void dump_mloc_map() {
- for (auto Location : locations()) {
- std::string foo = LocIdxToName(Location.Idx);
- dbgs() << "Idx " << Location.Idx.asU64() << " " << foo << "\n";
- }
- }
-
- /// Create a DBG_VALUE based on machine location \p MLoc. Qualify it with the
- /// information in \pProperties, for variable Var. Don't insert it anywhere,
- /// just return the builder for it.
- MachineInstrBuilder emitLoc(Optional<LocIdx> MLoc, const DebugVariable &Var,
- const DbgValueProperties &Properties) {
- DebugLoc DL = DILocation::get(Var.getVariable()->getContext(), 0, 0,
- Var.getVariable()->getScope(),
- const_cast<DILocation *>(Var.getInlinedAt()));
- auto MIB = BuildMI(MF, DL, TII.get(TargetOpcode::DBG_VALUE));
-
- const DIExpression *Expr = Properties.DIExpr;
- if (!MLoc) {
- // No location -> DBG_VALUE $noreg
- MIB.addReg(0, RegState::Debug);
- MIB.addReg(0, RegState::Debug);
- } else if (LocIdxToLocID[*MLoc] >= NumRegs) {
- unsigned LocID = LocIdxToLocID[*MLoc];
- const SpillLoc &Spill = SpillLocs[LocID - NumRegs + 1];
-
- auto *TRI = MF.getSubtarget().getRegisterInfo();
- Expr = TRI->prependOffsetExpression(Expr, DIExpression::ApplyOffset,
- Spill.SpillOffset);
- unsigned Base = Spill.SpillBase;
- MIB.addReg(Base, RegState::Debug);
- MIB.addImm(0);
- } else {
- unsigned LocID = LocIdxToLocID[*MLoc];
- MIB.addReg(LocID, RegState::Debug);
- if (Properties.Indirect)
- MIB.addImm(0);
- else
- MIB.addReg(0, RegState::Debug);
- }
-
- MIB.addMetadata(Var.getVariable());
- MIB.addMetadata(Expr);
- return MIB;
- }
-};
-
-/// Class recording the (high level) _value_ of a variable. Identifies either
-/// the value of the variable as a ValueIDNum, or a constant MachineOperand.
-/// This class also stores meta-information about how the value is qualified.
-/// Used to reason about variable values when performing the second
-/// (DebugVariable specific) dataflow analysis.
-class DbgValue {
-public:
- union {
- /// If Kind is Def, the value number that this value is based on.
- ValueIDNum ID;
- /// If Kind is Const, the MachineOperand defining this value.
- MachineOperand MO;
- /// For a NoVal DbgValue, which block it was generated in.
- unsigned BlockNo;
- };
- /// Qualifiers for the ValueIDNum above.
- DbgValueProperties Properties;
-
- typedef enum {
- Undef, // Represents a DBG_VALUE $noreg in the transfer function only.
- Def, // This value is defined by an inst, or is a PHI value.
- Const, // A constant value contained in the MachineOperand field.
- Proposed, // This is a tentative PHI value, which may be confirmed or
- // invalidated later.
- NoVal // Empty DbgValue, generated during dataflow. BlockNo stores
- // which block this was generated in.
- } KindT;
- /// Discriminator for whether this is a constant or an in-program value.
- KindT Kind;
-
- DbgValue(const ValueIDNum &Val, const DbgValueProperties &Prop, KindT Kind)
- : ID(Val), Properties(Prop), Kind(Kind) {
- assert(Kind == Def || Kind == Proposed);
- }
-
- DbgValue(unsigned BlockNo, const DbgValueProperties &Prop, KindT Kind)
- : BlockNo(BlockNo), Properties(Prop), Kind(Kind) {
- assert(Kind == NoVal);
- }
-
- DbgValue(const MachineOperand &MO, const DbgValueProperties &Prop, KindT Kind)
- : MO(MO), Properties(Prop), Kind(Kind) {
- assert(Kind == Const);
- }
-
- DbgValue(const DbgValueProperties &Prop, KindT Kind)
- : Properties(Prop), Kind(Kind) {
- assert(Kind == Undef &&
- "Empty DbgValue constructor must pass in Undef kind");
- }
-
- void dump(const MLocTracker *MTrack) const {
- if (Kind == Const) {
- MO.dump();
- } else if (Kind == NoVal) {
- dbgs() << "NoVal(" << BlockNo << ")";
- } else if (Kind == Proposed) {
- dbgs() << "VPHI(" << MTrack->IDAsString(ID) << ")";
- } else {
- assert(Kind == Def);
- dbgs() << MTrack->IDAsString(ID);
- }
- if (Properties.Indirect)
- dbgs() << " indir";
- if (Properties.DIExpr)
- dbgs() << " " << *Properties.DIExpr;
- }
-
- bool operator==(const DbgValue &Other) const {
- if (std::tie(Kind, Properties) != std::tie(Other.Kind, Other.Properties))
- return false;
- else if (Kind == Proposed && ID != Other.ID)
- return false;
- else if (Kind == Def && ID != Other.ID)
- return false;
- else if (Kind == NoVal && BlockNo != Other.BlockNo)
- return false;
- else if (Kind == Const)
- return MO.isIdenticalTo(Other.MO);
-
- return true;
- }
-
- bool operator!=(const DbgValue &Other) const { return !(*this == Other); }
-};
-
-/// Types for recording sets of variable fragments that overlap. For a given
-/// local variable, we record all other fragments of that variable that could
-/// overlap it, to reduce search time.
-using FragmentOfVar =
- std::pair<const DILocalVariable *, DIExpression::FragmentInfo>;
-using OverlapMap =
- DenseMap<FragmentOfVar, SmallVector<DIExpression::FragmentInfo, 1>>;
-
-/// Collection of DBG_VALUEs observed when traversing a block. Records each
-/// variable and the value the DBG_VALUE refers to. Requires the machine value
-/// location dataflow algorithm to have run already, so that values can be
-/// identified.
-class VLocTracker {
-public:
- /// Map DebugVariable to the latest Value it's defined to have.
- /// Needs to be a MapVector because we determine order-in-the-input-MIR from
- /// the order in this container.
- /// We only retain the last DbgValue in each block for each variable, to
- /// determine the blocks live-out variable value. The Vars container forms the
- /// transfer function for this block, as part of the dataflow analysis. The
- /// movement of values between locations inside of a block is handled at a
- /// much later stage, in the TransferTracker class.
- MapVector<DebugVariable, DbgValue> Vars;
- DenseMap<DebugVariable, const DILocation *> Scopes;
- MachineBasicBlock *MBB;
-
-public:
- VLocTracker() {}
-
- void defVar(const MachineInstr &MI, const DbgValueProperties &Properties,
- Optional<ValueIDNum> ID) {
- assert(MI.isDebugValue() || MI.isDebugRef());
- DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(),
- MI.getDebugLoc()->getInlinedAt());
- DbgValue Rec = (ID) ? DbgValue(*ID, Properties, DbgValue::Def)
- : DbgValue(Properties, DbgValue::Undef);
-
- // Attempt insertion; overwrite if it's already mapped.
- auto Result = Vars.insert(std::make_pair(Var, Rec));
- if (!Result.second)
- Result.first->second = Rec;
- Scopes[Var] = MI.getDebugLoc().get();
- }
-
- void defVar(const MachineInstr &MI, const MachineOperand &MO) {
- // Only DBG_VALUEs can define constant-valued variables.
- assert(MI.isDebugValue());
- DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(),
- MI.getDebugLoc()->getInlinedAt());
- DbgValueProperties Properties(MI);
- DbgValue Rec = DbgValue(MO, Properties, DbgValue::Const);
-
- // Attempt insertion; overwrite if it's already mapped.
- auto Result = Vars.insert(std::make_pair(Var, Rec));
- if (!Result.second)
- Result.first->second = Rec;
- Scopes[Var] = MI.getDebugLoc().get();
- }
-};
-
/// Tracker for converting machine value locations and variable values into
/// variable locations (the output of LiveDebugValues), recorded as DBG_VALUEs
/// specifying block live-in locations and transfers within blocks.
@@ -985,12 +196,12 @@ public:
/// between TransferTrackers view of variable locations and MLocTrackers. For
/// example, MLocTracker observes all clobbers, but TransferTracker lazily
/// does not.
- std::vector<ValueIDNum> VarLocs;
+ SmallVector<ValueIDNum, 32> VarLocs;
/// Map from LocIdxes to which DebugVariables are based on that location.
/// Maintained while stepping through the block. Not accurate if
/// VarLocs[Idx] != MTracker->LocIdxToIDNum[Idx].
- std::map<LocIdx, SmallSet<DebugVariable, 4>> ActiveMLocs;
+ DenseMap<LocIdx, SmallSet<DebugVariable, 4>> ActiveMLocs;
/// Map from DebugVariable to its current location and qualifying meta
/// information. To be used in conjunction with ActiveMLocs to construct
@@ -1062,6 +273,8 @@ public:
// Map of the preferred location for each value.
std::map<ValueIDNum, LocIdx> ValueToLoc;
+ ActiveMLocs.reserve(VLocs.size());
+ ActiveVLocs.reserve(VLocs.size());
// Produce a map of value numbers to the current machine locs they live
// in. When emulating VarLocBasedImpl, there should only be one
@@ -1088,7 +301,7 @@ public:
for (auto Var : VLocs) {
if (Var.second.Kind == DbgValue::Const) {
PendingDbgValues.push_back(
- emitMOLoc(Var.second.MO, Var.first, Var.second.Properties));
+ emitMOLoc(*Var.second.MO, Var.first, Var.second.Properties));
continue;
}
@@ -1142,7 +355,7 @@ public:
// instruction or similar with an instruction number, where it doesn't
// actually define a new value, instead it moves a value. In case this
// happens, discard.
- if (MTracker->LocIdxToIDNum[L] != Use.ID)
+ if (MTracker->readMLoc(L) != Use.ID)
continue;
// If a different debug instruction defined the variable value / location
@@ -1220,7 +433,6 @@ public:
DIExpression::prepend(Prop.DIExpr, DIExpression::EntryValue);
Register Reg = MTracker->LocIdxToLocID[Num.getLoc()];
MachineOperand MO = MachineOperand::CreateReg(Reg, false);
- MO.setIsDebug(true);
PendingDbgValues.push_back(emitMOLoc(MO, Var, {NewExpr, Prop.Indirect}));
return true;
@@ -1274,12 +486,12 @@ public:
// Check whether our local copy of values-by-location in #VarLocs is out of
// date. Wipe old tracking data for the location if it's been clobbered in
// the meantime.
- if (MTracker->getNumAtPos(NewLoc) != VarLocs[NewLoc.asU64()]) {
+ if (MTracker->readMLoc(NewLoc) != VarLocs[NewLoc.asU64()]) {
for (auto &P : ActiveMLocs[NewLoc]) {
ActiveVLocs.erase(P);
}
ActiveMLocs[NewLoc.asU64()].clear();
- VarLocs[NewLoc.asU64()] = MTracker->getNumAtPos(NewLoc);
+ VarLocs[NewLoc.asU64()] = MTracker->readMLoc(NewLoc);
}
ActiveMLocs[NewLoc].insert(Var);
@@ -1358,6 +570,8 @@ public:
flushDbgValues(Pos, nullptr);
+    // Re-find ActiveMLocIt, as the iterator could have been invalidated.
+ ActiveMLocIt = ActiveMLocs.find(MLoc);
ActiveMLocIt->second.clear();
}
@@ -1367,21 +581,23 @@ public:
void transferMlocs(LocIdx Src, LocIdx Dst, MachineBasicBlock::iterator Pos) {
// Does Src still contain the value num we expect? If not, it's been
// clobbered in the meantime, and our variable locations are stale.
- if (VarLocs[Src.asU64()] != MTracker->getNumAtPos(Src))
+ if (VarLocs[Src.asU64()] != MTracker->readMLoc(Src))
return;
// assert(ActiveMLocs[Dst].size() == 0);
//^^^ Legitimate scenario on account of un-clobbered slot being assigned to?
- ActiveMLocs[Dst] = ActiveMLocs[Src];
+
+ // Move set of active variables from one location to another.
+ auto MovingVars = ActiveMLocs[Src];
+ ActiveMLocs[Dst] = MovingVars;
VarLocs[Dst.asU64()] = VarLocs[Src.asU64()];
// For each variable based on Src; create a location at Dst.
- for (auto &Var : ActiveMLocs[Src]) {
+ for (auto &Var : MovingVars) {
auto ActiveVLocIt = ActiveVLocs.find(Var);
assert(ActiveVLocIt != ActiveVLocs.end());
ActiveVLocIt->second.Loc = Dst;
- assert(Dst != 0);
MachineInstr *MI =
MTracker->emitLoc(Dst, Var, ActiveVLocIt->second.Properties);
PendingDbgValues.push_back(MI);
@@ -1413,306 +629,245 @@ public:
}
};
-class InstrRefBasedLDV : public LDVImpl {
-private:
- using FragmentInfo = DIExpression::FragmentInfo;
- using OptFragmentInfo = Optional<DIExpression::FragmentInfo>;
-
- // Helper while building OverlapMap, a map of all fragments seen for a given
- // DILocalVariable.
- using VarToFragments =
- DenseMap<const DILocalVariable *, SmallSet<FragmentInfo, 4>>;
-
- /// Machine location/value transfer function, a mapping of which locations
- /// are assigned which new values.
- using MLocTransferMap = std::map<LocIdx, ValueIDNum>;
-
- /// Live in/out structure for the variable values: a per-block map of
- /// variables to their values. XXX, better name?
- using LiveIdxT =
- DenseMap<const MachineBasicBlock *, DenseMap<DebugVariable, DbgValue> *>;
-
- using VarAndLoc = std::pair<DebugVariable, DbgValue>;
-
- /// Type for a live-in value: the predecessor block, and its value.
- using InValueT = std::pair<MachineBasicBlock *, DbgValue *>;
-
- /// Vector (per block) of a collection (inner smallvector) of live-ins.
- /// Used as the result type for the variable value dataflow problem.
- using LiveInsT = SmallVector<SmallVector<VarAndLoc, 8>, 8>;
-
- const TargetRegisterInfo *TRI;
- const TargetInstrInfo *TII;
- const TargetFrameLowering *TFI;
- const MachineFrameInfo *MFI;
- BitVector CalleeSavedRegs;
- LexicalScopes LS;
- TargetPassConfig *TPC;
-
- /// Object to track machine locations as we step through a block. Could
- /// probably be a field rather than a pointer, as it's always used.
- MLocTracker *MTracker;
+//===----------------------------------------------------------------------===//
+// Implementation
+//===----------------------------------------------------------------------===//
- /// Number of the current block LiveDebugValues is stepping through.
- unsigned CurBB;
+ValueIDNum ValueIDNum::EmptyValue = {UINT_MAX, UINT_MAX, UINT_MAX};
+ValueIDNum ValueIDNum::TombstoneValue = {UINT_MAX, UINT_MAX, UINT_MAX - 1};
- /// Number of the current instruction LiveDebugValues is evaluating.
- unsigned CurInst;
+#ifndef NDEBUG
+void DbgValue::dump(const MLocTracker *MTrack) const {
+ if (Kind == Const) {
+ MO->dump();
+ } else if (Kind == NoVal) {
+ dbgs() << "NoVal(" << BlockNo << ")";
+ } else if (Kind == VPHI) {
+ dbgs() << "VPHI(" << BlockNo << "," << MTrack->IDAsString(ID) << ")";
+ } else {
+ assert(Kind == Def);
+ dbgs() << MTrack->IDAsString(ID);
+ }
+ if (Properties.Indirect)
+ dbgs() << " indir";
+ if (Properties.DIExpr)
+ dbgs() << " " << *Properties.DIExpr;
+}
+#endif
- /// Variable tracker -- listens to DBG_VALUEs occurring as InstrRefBasedImpl
- /// steps through a block. Reads the values at each location from the
- /// MLocTracker object.
- VLocTracker *VTracker;
+MLocTracker::MLocTracker(MachineFunction &MF, const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI,
+ const TargetLowering &TLI)
+ : MF(MF), TII(TII), TRI(TRI), TLI(TLI),
+ LocIdxToIDNum(ValueIDNum::EmptyValue), LocIdxToLocID(0) {
+ NumRegs = TRI.getNumRegs();
+ reset();
+ LocIDToLocIdx.resize(NumRegs, LocIdx::MakeIllegalLoc());
+ assert(NumRegs < (1u << NUM_LOC_BITS)); // Detect bit packing failure
+
+ // Always track SP. This avoids the implicit clobbering caused by regmasks
+  // from affecting its values. (LiveDebugValues disbelieves calls and
+ // regmasks that claim to clobber SP).
+ Register SP = TLI.getStackPointerRegisterToSaveRestore();
+ if (SP) {
+ unsigned ID = getLocID(SP);
+ (void)lookupOrTrackRegister(ID);
+
+ for (MCRegAliasIterator RAI(SP, &TRI, true); RAI.isValid(); ++RAI)
+ SPAliases.insert(*RAI);
+ }
+
+ // Build some common stack positions -- full registers being spilt to the
+ // stack.
+ StackSlotIdxes.insert({{8, 0}, 0});
+ StackSlotIdxes.insert({{16, 0}, 1});
+ StackSlotIdxes.insert({{32, 0}, 2});
+ StackSlotIdxes.insert({{64, 0}, 3});
+ StackSlotIdxes.insert({{128, 0}, 4});
+ StackSlotIdxes.insert({{256, 0}, 5});
+ StackSlotIdxes.insert({{512, 0}, 6});
+
+ // Traverse all the subregister idxes, and ensure there's an index for them.
+ // Duplicates are no problem: we're interested in their position in the
+  // stack slot; we don't want to type the slot.
+ for (unsigned int I = 1; I < TRI.getNumSubRegIndices(); ++I) {
+ unsigned Size = TRI.getSubRegIdxSize(I);
+ unsigned Offs = TRI.getSubRegIdxOffset(I);
+ unsigned Idx = StackSlotIdxes.size();
+
+ // Some subregs have -1, -2 and so forth fed into their fields, to mean
+ // special backend things. Ignore those.
+ if (Size > 60000 || Offs > 60000)
+ continue;
- /// Tracker for transfers, listens to DBG_VALUEs and transfers of values
- /// between locations during stepping, creates new DBG_VALUEs when values move
- /// location.
- TransferTracker *TTracker;
+ StackSlotIdxes.insert({{Size, Offs}, Idx});
+ }
- /// Blocks which are artificial, i.e. blocks which exclusively contain
- /// instructions without DebugLocs, or with line 0 locations.
- SmallPtrSet<const MachineBasicBlock *, 16> ArtificialBlocks;
+ for (auto &Idx : StackSlotIdxes)
+ StackIdxesToPos[Idx.second] = Idx.first;
- // Mapping of blocks to and from their RPOT order.
- DenseMap<unsigned int, MachineBasicBlock *> OrderToBB;
- DenseMap<MachineBasicBlock *, unsigned int> BBToOrder;
- DenseMap<unsigned, unsigned> BBNumToRPO;
+ NumSlotIdxes = StackSlotIdxes.size();
+}
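
For illustration, a minimal standalone sketch (not part of the patch itself) of the (size-in-bits, offset) to slot-index interning performed by the constructor above. StackSlotIdxes and StackIdxesToPos here are plain std::map stand-ins for the tracker's members, seeded with the same power-of-two sizes; re-inserting a known position is a no-op, which is why duplicates from the subregister walk are harmless.

#include <cassert>
#include <initializer_list>
#include <map>
#include <utility>

using StackSlotPos = std::pair<unsigned, unsigned>; // {size in bits, offset}

int main() {
  std::map<StackSlotPos, unsigned> StackSlotIdxes;  // position -> dense index
  std::map<unsigned, StackSlotPos> StackIdxesToPos; // inverse map

  // Seed the common full-register spill sizes, as the constructor does.
  for (unsigned Size : {8u, 16u, 32u, 64u, 128u, 256u, 512u})
    StackSlotIdxes.insert({{Size, 0}, (unsigned)StackSlotIdxes.size()});

  // Subregister positions get fresh indexes; re-inserting a known position
  // is a no-op, so duplicates keep their original index.
  StackSlotIdxes.insert({{32, 32}, (unsigned)StackSlotIdxes.size()});
  StackSlotIdxes.insert({{32, 0}, (unsigned)StackSlotIdxes.size()}); // no-op

  for (auto &P : StackSlotIdxes)
    StackIdxesToPos[P.second] = P.first;

  assert(StackSlotIdxes.size() == 8);
  assert(StackIdxesToPos[2] == StackSlotPos(32, 0)); // 32-bit, offset-0 piece
  return 0;
}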
- /// Pair of MachineInstr, and its 1-based offset into the containing block.
- using InstAndNum = std::pair<const MachineInstr *, unsigned>;
- /// Map from debug instruction number to the MachineInstr labelled with that
- /// number, and its location within the function. Used to transform
- /// instruction numbers in DBG_INSTR_REFs into machine value numbers.
- std::map<uint64_t, InstAndNum> DebugInstrNumToInstr;
+LocIdx MLocTracker::trackRegister(unsigned ID) {
+ assert(ID != 0);
+ LocIdx NewIdx = LocIdx(LocIdxToIDNum.size());
+ LocIdxToIDNum.grow(NewIdx);
+ LocIdxToLocID.grow(NewIdx);
+
+ // Default: it's an mphi.
+ ValueIDNum ValNum = {CurBB, 0, NewIdx};
+ // Was this reg ever touched by a regmask?
+ for (const auto &MaskPair : reverse(Masks)) {
+ if (MaskPair.first->clobbersPhysReg(ID)) {
+ // There was an earlier def we skipped.
+ ValNum = {CurBB, MaskPair.second, NewIdx};
+ break;
+ }
+ }
- /// Record of where we observed a DBG_PHI instruction.
- class DebugPHIRecord {
- public:
- uint64_t InstrNum; ///< Instruction number of this DBG_PHI.
- MachineBasicBlock *MBB; ///< Block where DBG_PHI occurred.
- ValueIDNum ValueRead; ///< The value number read by the DBG_PHI.
- LocIdx ReadLoc; ///< Register/Stack location the DBG_PHI reads.
+ LocIdxToIDNum[NewIdx] = ValNum;
+ LocIdxToLocID[NewIdx] = ID;
+ return NewIdx;
+}
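
For illustration, a toy model (not part of the patch itself) of the lazy-tracking detail in trackRegister above: a register first touched mid-block defaults to the block's live-in (PHI) value, unless an earlier regmask in the block already clobbered it, in which case its value dates from that regmask's instruction. Value and initialValueFor are simplified stand-ins, not LLVM APIs.

#include <cassert>
#include <functional>
#include <utility>
#include <vector>

struct Value { unsigned Block, Inst; }; // stand-in for ValueIDNum

// Masks is the list of (clobber-test, instruction number) pairs seen so far
// in the block, most recent last.
Value initialValueFor(
    unsigned Reg, unsigned CurBB,
    const std::vector<std::pair<std::function<bool(unsigned)>, unsigned>>
        &Masks) {
  Value V{CurBB, 0}; // default: the block live-in "PHI" value
  for (auto It = Masks.rbegin(); It != Masks.rend(); ++It) {
    if (It->first(Reg)) {      // did this regmask clobber Reg?
      V = {CurBB, It->second}; // yes: the value dates from that instruction
      break;
    }
  }
  return V;
}

int main() {
  auto ClobbersAll = [](unsigned) { return true; };
  auto ClobbersNone = [](unsigned) { return false; };
  // No masks seen yet: the register holds the block live-in value.
  assert(initialValueFor(5, 1, {}).Inst == 0);
  // A call at instruction 4 clobbered everything before tracking started.
  assert(initialValueFor(5, 1, {{ClobbersNone, 2}, {ClobbersAll, 4}}).Inst == 4);
  return 0;
}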
- operator unsigned() const { return InstrNum; }
- };
+void MLocTracker::writeRegMask(const MachineOperand *MO, unsigned CurBB,
+ unsigned InstID) {
+  // Def any register we track that isn't preserved. The regmask
+ // terminates the liveness of a register, meaning its value can't be
+ // relied upon -- we represent this by giving it a new value.
+ for (auto Location : locations()) {
+ unsigned ID = LocIdxToLocID[Location.Idx];
+ // Don't clobber SP, even if the mask says it's clobbered.
+ if (ID < NumRegs && !SPAliases.count(ID) && MO->clobbersPhysReg(ID))
+ defReg(ID, CurBB, InstID);
+ }
+ Masks.push_back(std::make_pair(MO, InstID));
+}
- /// Map from instruction numbers defined by DBG_PHIs to a record of what that
- /// DBG_PHI read and where. Populated and edited during the machine value
- /// location problem -- we use LLVMs SSA Updater to fix changes by
- /// optimizations that destroy PHI instructions.
- SmallVector<DebugPHIRecord, 32> DebugPHINumToValue;
-
- // Map of overlapping variable fragments.
- OverlapMap OverlapFragments;
- VarToFragments SeenFragments;
-
- /// Tests whether this instruction is a spill to a stack slot.
- bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF);
-
- /// Decide if @MI is a spill instruction and return true if it is. We use 2
- /// criteria to make this decision:
- /// - Is this instruction a store to a spill slot?
- /// - Is there a register operand that is both used and killed?
- /// TODO: Store optimization can fold spills into other stores (including
- /// other spills). We do not handle this yet (more than one memory operand).
- bool isLocationSpill(const MachineInstr &MI, MachineFunction *MF,
- unsigned &Reg);
-
- /// If a given instruction is identified as a spill, return the spill slot
- /// and set \p Reg to the spilled register.
- Optional<SpillLoc> isRestoreInstruction(const MachineInstr &MI,
- MachineFunction *MF, unsigned &Reg);
-
- /// Given a spill instruction, extract the register and offset used to
- /// address the spill slot in a target independent way.
- SpillLoc extractSpillBaseRegAndOffset(const MachineInstr &MI);
-
- /// Observe a single instruction while stepping through a block.
- void process(MachineInstr &MI, ValueIDNum **MLiveOuts = nullptr,
- ValueIDNum **MLiveIns = nullptr);
-
- /// Examines whether \p MI is a DBG_VALUE and notifies trackers.
- /// \returns true if MI was recognized and processed.
- bool transferDebugValue(const MachineInstr &MI);
-
- /// Examines whether \p MI is a DBG_INSTR_REF and notifies trackers.
- /// \returns true if MI was recognized and processed.
- bool transferDebugInstrRef(MachineInstr &MI, ValueIDNum **MLiveOuts,
- ValueIDNum **MLiveIns);
-
- /// Stores value-information about where this PHI occurred, and what
- /// instruction number is associated with it.
- /// \returns true if MI was recognized and processed.
- bool transferDebugPHI(MachineInstr &MI);
-
- /// Examines whether \p MI is copy instruction, and notifies trackers.
- /// \returns true if MI was recognized and processed.
- bool transferRegisterCopy(MachineInstr &MI);
-
- /// Examines whether \p MI is stack spill or restore instruction, and
- /// notifies trackers. \returns true if MI was recognized and processed.
- bool transferSpillOrRestoreInst(MachineInstr &MI);
-
- /// Examines \p MI for any registers that it defines, and notifies trackers.
- void transferRegisterDef(MachineInstr &MI);
-
- /// Copy one location to the other, accounting for movement of subregisters
- /// too.
- void performCopy(Register Src, Register Dst);
-
- void accumulateFragmentMap(MachineInstr &MI);
-
- /// Determine the machine value number referred to by (potentially several)
- /// DBG_PHI instructions. Block duplication and tail folding can duplicate
- /// DBG_PHIs, shifting the position where values in registers merge, and
- /// forming another mini-ssa problem to solve.
- /// \p Here the position of a DBG_INSTR_REF seeking a machine value number
- /// \p InstrNum Debug instruction number defined by DBG_PHI instructions.
- /// \returns The machine value number at position Here, or None.
- Optional<ValueIDNum> resolveDbgPHIs(MachineFunction &MF,
- ValueIDNum **MLiveOuts,
- ValueIDNum **MLiveIns, MachineInstr &Here,
- uint64_t InstrNum);
-
- /// Step through the function, recording register definitions and movements
- /// in an MLocTracker. Convert the observations into a per-block transfer
- /// function in \p MLocTransfer, suitable for using with the machine value
- /// location dataflow problem.
- void
- produceMLocTransferFunction(MachineFunction &MF,
- SmallVectorImpl<MLocTransferMap> &MLocTransfer,
- unsigned MaxNumBlocks);
-
- /// Solve the machine value location dataflow problem. Takes as input the
- /// transfer functions in \p MLocTransfer. Writes the output live-in and
- /// live-out arrays to the (initialized to zero) multidimensional arrays in
- /// \p MInLocs and \p MOutLocs. The outer dimension is indexed by block
- /// number, the inner by LocIdx.
- void mlocDataflow(ValueIDNum **MInLocs, ValueIDNum **MOutLocs,
- SmallVectorImpl<MLocTransferMap> &MLocTransfer);
-
- /// Perform a control flow join (lattice value meet) of the values in machine
- /// locations at \p MBB. Follows the algorithm described in the file-comment,
- /// reading live-outs of predecessors from \p OutLocs, the current live ins
- /// from \p InLocs, and assigning the newly computed live ins back into
- /// \p InLocs. \returns two bools -- the first indicates whether a change
- /// was made, the second whether a lattice downgrade occurred. If the latter
- /// is true, revisiting this block is necessary.
- std::tuple<bool, bool>
- mlocJoin(MachineBasicBlock &MBB,
- SmallPtrSet<const MachineBasicBlock *, 16> &Visited,
- ValueIDNum **OutLocs, ValueIDNum *InLocs);
-
- /// Solve the variable value dataflow problem, for a single lexical scope.
- /// Uses the algorithm from the file comment to resolve control flow joins,
- /// although there are extra hacks, see vlocJoin. Reads the
- /// locations of values from the \p MInLocs and \p MOutLocs arrays (see
- /// mlocDataflow) and reads the variable values transfer function from
- /// \p AllTheVlocs. Live-in and Live-out variable values are stored locally,
- /// with the live-ins permanently stored to \p Output once the fixedpoint is
- /// reached.
- /// \p VarsWeCareAbout contains a collection of the variables in \p Scope
- /// that we should be tracking.
- /// \p AssignBlocks contains the set of blocks that aren't in \p Scope, but
- /// which do contain DBG_VALUEs, which VarLocBasedImpl tracks locations
- /// through.
- void vlocDataflow(const LexicalScope *Scope, const DILocation *DILoc,
- const SmallSet<DebugVariable, 4> &VarsWeCareAbout,
- SmallPtrSetImpl<MachineBasicBlock *> &AssignBlocks,
- LiveInsT &Output, ValueIDNum **MOutLocs,
- ValueIDNum **MInLocs,
- SmallVectorImpl<VLocTracker> &AllTheVLocs);
-
- /// Compute the live-ins to a block, considering control flow merges according
- /// to the method in the file comment. Live out and live in variable values
- /// are stored in \p VLOCOutLocs and \p VLOCInLocs. The live-ins for \p MBB
- /// are computed and stored into \p VLOCInLocs. \returns true if the live-ins
- /// are modified.
- /// \p InLocsT Output argument, storage for calculated live-ins.
- /// \returns two bools -- the first indicates whether a change
- /// was made, the second whether a lattice downgrade occurred. If the latter
- /// is true, revisiting this block is necessary.
- std::tuple<bool, bool>
- vlocJoin(MachineBasicBlock &MBB, LiveIdxT &VLOCOutLocs, LiveIdxT &VLOCInLocs,
- SmallPtrSet<const MachineBasicBlock *, 16> *VLOCVisited,
- unsigned BBNum, const SmallSet<DebugVariable, 4> &AllVars,
- ValueIDNum **MOutLocs, ValueIDNum **MInLocs,
- SmallPtrSet<const MachineBasicBlock *, 8> &InScopeBlocks,
- SmallPtrSet<const MachineBasicBlock *, 8> &BlocksToExplore,
- DenseMap<DebugVariable, DbgValue> &InLocsT);
-
- /// Continue exploration of the variable-value lattice, as explained in the
- /// file-level comment. \p OldLiveInLocation contains the current
- /// exploration position, from which we need to descend further. \p Values
- /// contains the set of live-in values, \p CurBlockRPONum the RPO number of
- /// the current block, and \p CandidateLocations a set of locations that
- /// should be considered as PHI locations, if we reach the bottom of the
- /// lattice. \returns true if we should downgrade; the value is the agreeing
- /// value number in a non-backedge predecessor.
- bool vlocDowngradeLattice(const MachineBasicBlock &MBB,
- const DbgValue &OldLiveInLocation,
- const SmallVectorImpl<InValueT> &Values,
- unsigned CurBlockRPONum);
-
- /// For the given block and live-outs feeding into it, try to find a
- /// machine location where they all join. If a solution for all predecessors
- /// can't be found, a location where all non-backedge-predecessors join
- /// will be returned instead. While this method finds a join location, this
- /// says nothing as to whether it should be used.
- /// \returns Pair of value ID if found, and true when the correct value
- /// is available on all predecessor edges, or false if it's only available
- /// for non-backedge predecessors.
- std::tuple<Optional<ValueIDNum>, bool>
- pickVPHILoc(MachineBasicBlock &MBB, const DebugVariable &Var,
- const LiveIdxT &LiveOuts, ValueIDNum **MOutLocs,
- ValueIDNum **MInLocs,
- const SmallVectorImpl<MachineBasicBlock *> &BlockOrders);
-
- /// Given the solutions to the two dataflow problems, machine value locations
- /// in \p MInLocs and live-in variable values in \p SavedLiveIns, runs the
- /// TransferTracker class over the function to produce live-in and transfer
- /// DBG_VALUEs, then inserts them. Groups of DBG_VALUEs are inserted in the
- /// order given by AllVarsNumbering -- this could be any stable order, but
- /// right now "order of appearence in function, when explored in RPO", so
- /// that we can compare explictly against VarLocBasedImpl.
- void emitLocations(MachineFunction &MF, LiveInsT SavedLiveIns,
- ValueIDNum **MOutLocs, ValueIDNum **MInLocs,
- DenseMap<DebugVariable, unsigned> &AllVarsNumbering,
- const TargetPassConfig &TPC);
-
- /// Boilerplate computation of some initial sets, artifical blocks and
- /// RPOT block ordering.
- void initialSetup(MachineFunction &MF);
-
- bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) override;
+SpillLocationNo MLocTracker::getOrTrackSpillLoc(SpillLoc L) {
+ SpillLocationNo SpillID(SpillLocs.idFor(L));
+ if (SpillID.id() == 0) {
+ // Spill location is untracked: create record for this one, and all
+ // subregister slots too.
+ SpillID = SpillLocationNo(SpillLocs.insert(L));
+ for (unsigned StackIdx = 0; StackIdx < NumSlotIdxes; ++StackIdx) {
+ unsigned L = getSpillIDWithIdx(SpillID, StackIdx);
+ LocIdx Idx = LocIdx(LocIdxToIDNum.size()); // New idx
+ LocIdxToIDNum.grow(Idx);
+ LocIdxToLocID.grow(Idx);
+ LocIDToLocIdx.push_back(Idx);
+ LocIdxToLocID[Idx] = L;
+ // Initialize to PHI value; corresponds to the location's live-in value
+ // during transfer function construction.
+ LocIdxToIDNum[Idx] = ValueIDNum(CurBB, 0, Idx);
+ }
+ }
+ return SpillID;
+}
-public:
- /// Default construct and initialize the pass.
- InstrRefBasedLDV();
+std::string MLocTracker::LocIdxToName(LocIdx Idx) const {
+ unsigned ID = LocIdxToLocID[Idx];
+ if (ID >= NumRegs) {
+ StackSlotPos Pos = locIDToSpillIdx(ID);
+ ID -= NumRegs;
+ unsigned Slot = ID / NumSlotIdxes;
+ return Twine("slot ")
+ .concat(Twine(Slot).concat(Twine(" sz ").concat(Twine(Pos.first)
+ .concat(Twine(" offs ").concat(Twine(Pos.second))))))
+ .str();
+ } else {
+ return TRI.getRegAsmName(ID).str();
+ }
+}
- LLVM_DUMP_METHOD
- void dump_mloc_transfer(const MLocTransferMap &mloc_transfer) const;
+std::string MLocTracker::IDAsString(const ValueIDNum &Num) const {
+ std::string DefName = LocIdxToName(Num.getLoc());
+ return Num.asString(DefName);
+}
- bool isCalleeSaved(LocIdx L) {
- unsigned Reg = MTracker->LocIdxToLocID[L];
- for (MCRegAliasIterator RAI(Reg, TRI, true); RAI.isValid(); ++RAI)
- if (CalleeSavedRegs.test(*RAI))
- return true;
- return false;
+#ifndef NDEBUG
+LLVM_DUMP_METHOD void MLocTracker::dump() {
+ for (auto Location : locations()) {
+ std::string MLocName = LocIdxToName(Location.Value.getLoc());
+ std::string DefName = Location.Value.asString(MLocName);
+ dbgs() << LocIdxToName(Location.Idx) << " --> " << DefName << "\n";
}
-};
+}
-} // end anonymous namespace
+LLVM_DUMP_METHOD void MLocTracker::dump_mloc_map() {
+ for (auto Location : locations()) {
+ std::string foo = LocIdxToName(Location.Idx);
+ dbgs() << "Idx " << Location.Idx.asU64() << " " << foo << "\n";
+ }
+}
+#endif
-//===----------------------------------------------------------------------===//
-// Implementation
-//===----------------------------------------------------------------------===//
+MachineInstrBuilder MLocTracker::emitLoc(Optional<LocIdx> MLoc,
+ const DebugVariable &Var,
+ const DbgValueProperties &Properties) {
+ DebugLoc DL = DILocation::get(Var.getVariable()->getContext(), 0, 0,
+ Var.getVariable()->getScope(),
+ const_cast<DILocation *>(Var.getInlinedAt()));
+ auto MIB = BuildMI(MF, DL, TII.get(TargetOpcode::DBG_VALUE));
+
+ const DIExpression *Expr = Properties.DIExpr;
+ if (!MLoc) {
+ // No location -> DBG_VALUE $noreg
+ MIB.addReg(0);
+ MIB.addReg(0);
+ } else if (LocIdxToLocID[*MLoc] >= NumRegs) {
+ unsigned LocID = LocIdxToLocID[*MLoc];
+ SpillLocationNo SpillID = locIDToSpill(LocID);
+ StackSlotPos StackIdx = locIDToSpillIdx(LocID);
+ unsigned short Offset = StackIdx.second;
+
+ // TODO: support variables that are located in spill slots, with non-zero
+ // offsets from the start of the spill slot. It would require some more
+ // complex DIExpression calculations. This doesn't seem to be produced by
+ // LLVM right now, so don't try and support it.
+ // Accept no-subregister slots and subregisters where the offset is zero.
+ // The consumer should already have type information to work out how large
+ // the variable is.
+ if (Offset == 0) {
+ const SpillLoc &Spill = SpillLocs[SpillID.id()];
+ Expr = TRI.prependOffsetExpression(Expr, DIExpression::ApplyOffset,
+ Spill.SpillOffset);
+ unsigned Base = Spill.SpillBase;
+ MIB.addReg(Base);
+ MIB.addImm(0);
+ } else {
+ // This is a stack location with a weird subregister offset: emit an undef
+ // DBG_VALUE instead.
+ MIB.addReg(0);
+ MIB.addReg(0);
+ }
+ } else {
+ // Non-empty, non-stack slot, must be a plain register.
+ unsigned LocID = LocIdxToLocID[*MLoc];
+ MIB.addReg(LocID);
+ if (Properties.Indirect)
+ MIB.addImm(0);
+ else
+ MIB.addReg(0);
+ }
-ValueIDNum ValueIDNum::EmptyValue = {UINT_MAX, UINT_MAX, UINT_MAX};
+ MIB.addMetadata(Var.getVariable());
+ MIB.addMetadata(Expr);
+ return MIB;
+}
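
For illustration, a simplified sketch (not part of the patch itself) of the three-way decision emitLoc makes above: no location, or a spill piece at an unsupported non-zero in-slot offset, becomes an undef DBG_VALUE; a zero-offset spill piece becomes an indirect location through the spill base register; anything else is a plain register operand. Loc and describeDbgValue are hypothetical stand-ins -- the real method builds a MachineInstr, not a string.

#include <cassert>
#include <optional>
#include <string>

struct Loc {
  bool IsSpill;
  unsigned Reg;          // register number, or spill base register
  unsigned OffsetInSlot; // only meaningful for spills
};

std::string describeDbgValue(std::optional<Loc> L, bool Indirect) {
  if (!L)
    return "undef";   // DBG_VALUE $noreg
  if (L->IsSpill) {
    if (L->OffsetInSlot != 0)
      return "undef"; // unsupported subregister offset within the slot
    return "indirect through base reg " + std::to_string(L->Reg);
  }
  return "reg " + std::to_string(L->Reg) + (Indirect ? " (indirect)" : "");
}

int main() {
  assert(describeDbgValue(std::nullopt, false) == "undef");
  assert(describeDbgValue(Loc{true, 7, 32}, false) == "undef");
  assert(describeDbgValue(Loc{false, 5, 0}, true) == "reg 5 (indirect)");
  return 0;
}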
/// Default construct and initialize the pass.
InstrRefBasedLDV::InstrRefBasedLDV() {}
+bool InstrRefBasedLDV::isCalleeSaved(LocIdx L) const {
+ unsigned Reg = MTracker->LocIdxToLocID[L];
+ for (MCRegAliasIterator RAI(Reg, TRI, true); RAI.isValid(); ++RAI)
+ if (CalleeSavedRegs.test(*RAI))
+ return true;
+ return false;
+}
+
//===----------------------------------------------------------------------===//
// Debug Range Extension Implementation
//===----------------------------------------------------------------------===//
@@ -1722,7 +877,7 @@ InstrRefBasedLDV::InstrRefBasedLDV() {}
// void InstrRefBasedLDV::printVarLocInMBB(..)
#endif
-SpillLoc
+SpillLocationNo
InstrRefBasedLDV::extractSpillBaseRegAndOffset(const MachineInstr &MI) {
assert(MI.hasOneMemOperand() &&
"Spill instruction does not have exactly one memory operand?");
@@ -1734,7 +889,28 @@ InstrRefBasedLDV::extractSpillBaseRegAndOffset(const MachineInstr &MI) {
const MachineBasicBlock *MBB = MI.getParent();
Register Reg;
StackOffset Offset = TFI->getFrameIndexReference(*MBB->getParent(), FI, Reg);
- return {Reg, Offset};
+ return MTracker->getOrTrackSpillLoc({Reg, Offset});
+}
+
+Optional<LocIdx> InstrRefBasedLDV::findLocationForMemOperand(const MachineInstr &MI) {
+ SpillLocationNo SpillLoc = extractSpillBaseRegAndOffset(MI);
+
+ // Where in the stack slot is this value defined -- i.e., what size of value
+ // is this? An important question, because it could be loaded into a register
+ // from the stack at some point. Happily the memory operand will tell us
+ // the size written to the stack.
+ auto *MemOperand = *MI.memoperands_begin();
+ unsigned SizeInBits = MemOperand->getSizeInBits();
+
+ // Find that position in the stack indexes we're tracking.
+ auto IdxIt = MTracker->StackSlotIdxes.find({SizeInBits, 0});
+ if (IdxIt == MTracker->StackSlotIdxes.end())
+    // That index is not tracked. This is surprising, and unlikely to ever
+ // occur, but the safe action is to indicate the variable is optimised out.
+ return None;
+
+ unsigned SpillID = MTracker->getSpillIDWithIdx(SpillLoc, IdxIt->second);
+ return MTracker->getSpillMLoc(SpillID);
}
/// End all previous ranges related to @MI and start a new range from @MI
@@ -1759,6 +935,17 @@ bool InstrRefBasedLDV::transferDebugValue(const MachineInstr &MI) {
if (Scope == nullptr)
return true; // handled it; by doing nothing
+ // For now, ignore DBG_VALUE_LISTs when extending ranges. Allow it to
+ // contribute to locations in this block, but don't propagate further.
+ // Interpret it like a DBG_VALUE $noreg.
+ if (MI.isDebugValueList()) {
+ if (VTracker)
+ VTracker->defVar(MI, Properties, None);
+ if (TTracker)
+ TTracker->redefVar(MI, Properties, None);
+ return true;
+ }
+
const MachineOperand &MO = MI.getOperand(0);
// MLocTracker needs to know that this register is read, even if it's only
@@ -1852,16 +1039,25 @@ bool InstrRefBasedLDV::transferDebugInstrRef(MachineInstr &MI,
const MachineInstr &TargetInstr = *InstrIt->second.first;
uint64_t BlockNo = TargetInstr.getParent()->getNumber();
- // Pick out the designated operand.
- assert(OpNo < TargetInstr.getNumOperands());
- const MachineOperand &MO = TargetInstr.getOperand(OpNo);
-
- // Today, this can only be a register.
- assert(MO.isReg() && MO.isDef());
-
- unsigned LocID = MTracker->getLocID(MO.getReg(), false);
- LocIdx L = MTracker->LocIDToLocIdx[LocID];
- NewID = ValueIDNum(BlockNo, InstrIt->second.second, L);
+ // Pick out the designated operand. It might be a memory reference, if
+ // a register def was folded into a stack store.
+ if (OpNo == MachineFunction::DebugOperandMemNumber &&
+ TargetInstr.hasOneMemOperand()) {
+ Optional<LocIdx> L = findLocationForMemOperand(TargetInstr);
+ if (L)
+ NewID = ValueIDNum(BlockNo, InstrIt->second.second, *L);
+ } else if (OpNo != MachineFunction::DebugOperandMemNumber) {
+ assert(OpNo < TargetInstr.getNumOperands());
+ const MachineOperand &MO = TargetInstr.getOperand(OpNo);
+
+ // Today, this can only be a register.
+ assert(MO.isReg() && MO.isDef());
+
+ unsigned LocID = MTracker->getLocID(MO.getReg());
+ LocIdx L = MTracker->LocIDToLocIdx[LocID];
+ NewID = ValueIDNum(BlockNo, InstrIt->second.second, L);
+ }
+ // else: NewID is left as None.
} else if (PHIIt != DebugPHINumToValue.end() && PHIIt->InstrNum == InstNo) {
// It's actually a PHI value. Which value it is might not be obvious, use
// the resolver helper to find out.
@@ -1957,7 +1153,7 @@ bool InstrRefBasedLDV::transferDebugInstrRef(MachineInstr &MI,
Optional<LocIdx> FoundLoc = None;
for (auto Location : MTracker->locations()) {
LocIdx CurL = Location.Idx;
- ValueIDNum ID = MTracker->LocIdxToIDNum[CurL];
+ ValueIDNum ID = MTracker->readMLoc(CurL);
if (NewID && ID == NewID) {
// If this is the first location with that value, pick it. Otherwise,
// consider whether it's a "longer term" location.
@@ -2016,6 +1212,10 @@ bool InstrRefBasedLDV::transferDebugPHI(MachineInstr &MI) {
auto PHIRec = DebugPHIRecord(
{InstrNum, MI.getParent(), Num, MTracker->lookupOrTrackRegister(Reg)});
DebugPHINumToValue.push_back(PHIRec);
+
+ // Ensure this register is tracked.
+ for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI)
+ MTracker->lookupOrTrackRegister(*RAI);
} else {
// The value is whatever's in this stack slot.
assert(MO.isFI());
@@ -2026,19 +1226,46 @@ bool InstrRefBasedLDV::transferDebugPHI(MachineInstr &MI) {
if (MFI->isDeadObjectIndex(FI))
return true;
- // Identify this spill slot.
+ // Identify this spill slot, ensure it's tracked.
Register Base;
StackOffset Offs = TFI->getFrameIndexReference(*MI.getMF(), FI, Base);
SpillLoc SL = {Base, Offs};
- Optional<ValueIDNum> Num = MTracker->readSpill(SL);
+ SpillLocationNo SpillNo = MTracker->getOrTrackSpillLoc(SL);
+
+ // Problem: what value should we extract from the stack? LLVM does not
+ // record what size the last store to the slot was, and it would become
+ // sketchy after stack slot colouring anyway. Take a look at what values
+ // are stored on the stack, and pick the largest one that wasn't def'd
+ // by a spill (i.e., the value most likely to have been def'd in a register
+    // and then spilt).
+ std::array<unsigned, 4> CandidateSizes = {64, 32, 16, 8};
+ Optional<ValueIDNum> Result = None;
+ Optional<LocIdx> SpillLoc = None;
+ for (unsigned int I = 0; I < CandidateSizes.size(); ++I) {
+ unsigned SpillID = MTracker->getLocID(SpillNo, {CandidateSizes[I], 0});
+ SpillLoc = MTracker->getSpillMLoc(SpillID);
+ ValueIDNum Val = MTracker->readMLoc(*SpillLoc);
+      // If this value was defined in its own position, then it was probably
+ // an aliasing index of a small value that was spilt.
+ if (Val.getLoc() != SpillLoc->asU64()) {
+ Result = Val;
+ break;
+ }
+ }
- if (!Num)
- // Nothing ever writes to this slot. Curious, but nothing we can do.
- return true;
+ // If we didn't find anything, we're probably looking at a PHI, or a memory
+    // store folded into an instruction. FIXME: Take a guess that it's 64
+ // bits. This isn't ideal, but tracking the size that the spill is
+ // "supposed" to be is more complex, and benefits a small number of
+ // locations.
+ if (!Result) {
+ unsigned SpillID = MTracker->getLocID(SpillNo, {64, 0});
+ SpillLoc = MTracker->getSpillMLoc(SpillID);
+ Result = MTracker->readMLoc(*SpillLoc);
+ }
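
For illustration, the size-guessing heuristic described above reduced to a standalone sketch (not part of the patch itself): walk the candidate sizes from largest to smallest and accept the first slot index whose value was not def'd in its own position, since a self-def merely means that index carries its own live-in. Value, SizeToLoc and LocToValue are simplified stand-ins for ValueIDNum and the tracker's maps.

#include <array>
#include <cassert>
#include <map>
#include <optional>

struct Value { unsigned DefLoc; }; // only the defining location matters here

std::optional<unsigned>
pickSpilledSize(const std::map<unsigned, unsigned> &SizeToLoc,
                const std::map<unsigned, Value> &LocToValue) {
  const std::array<unsigned, 4> CandidateSizes = {64, 32, 16, 8};
  for (unsigned Size : CandidateSizes) {
    unsigned Loc = SizeToLoc.at(Size);
    if (LocToValue.at(Loc).DefLoc != Loc)
      return Size; // something real was stored at this width
  }
  return std::nullopt; // caller falls back to assuming 64 bits
}

int main() {
  // Slot indexes 10..13 back the 64/32/16/8-bit views of one stack slot.
  std::map<unsigned, unsigned> SizeToLoc = {{64, 10}, {32, 11}, {16, 12}, {8, 13}};
  // Only the 32-bit view was written by a real store (def'd at location 3).
  std::map<unsigned, Value> LocToValue = {
      {10, {10}}, {11, {3}}, {12, {12}}, {13, {13}}};
  assert(pickSpilledSize(SizeToLoc, LocToValue) == std::optional<unsigned>(32));
  return 0;
}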
// Record this DBG_PHI for later analysis.
- auto DbgPHI = DebugPHIRecord(
- {InstrNum, MI.getParent(), *Num, *MTracker->getSpillMLoc(SL)});
+ auto DbgPHI = DebugPHIRecord({InstrNum, MI.getParent(), *Result, *SpillLoc});
DebugPHINumToValue.push_back(DbgPHI);
}
@@ -2061,10 +1288,6 @@ void InstrRefBasedLDV::transferRegisterDef(MachineInstr &MI) {
} else if (MI.isMetaInstruction())
return;
- MachineFunction *MF = MI.getMF();
- const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
- Register SP = TLI->getStackPointerRegisterToSaveRestore();
-
// Find the regs killed by MI, and find regmasks of preserved regs.
// Max out the number of statically allocated elements in `DeadRegs`, as this
// prevents fallback to std::set::count() operations.
@@ -2075,7 +1298,7 @@ void InstrRefBasedLDV::transferRegisterDef(MachineInstr &MI) {
// Determine whether the operand is a register def.
if (MO.isReg() && MO.isDef() && MO.getReg() &&
Register::isPhysicalRegister(MO.getReg()) &&
- !(MI.isCall() && MO.getReg() == SP)) {
+ !(MI.isCall() && MTracker->SPAliases.count(MO.getReg()))) {
// Remove ranges of all aliased registers.
for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI)
// FIXME: Can we break out of this loop early if no insertion occurs?
@@ -2093,6 +1316,16 @@ void InstrRefBasedLDV::transferRegisterDef(MachineInstr &MI) {
for (auto *MO : RegMaskPtrs)
MTracker->writeRegMask(MO, CurBB, CurInst);
+ // If this instruction writes to a spill slot, def that slot.
+ if (hasFoldedStackStore(MI)) {
+ SpillLocationNo SpillNo = extractSpillBaseRegAndOffset(MI);
+ for (unsigned int I = 0; I < MTracker->NumSlotIdxes; ++I) {
+ unsigned SpillID = MTracker->getSpillIDWithIdx(SpillNo, I);
+ LocIdx L = MTracker->getSpillMLoc(SpillID);
+ MTracker->setMLoc(L, ValueIDNum(CurBB, CurInst, L));
+ }
+ }
+
if (!TTracker)
return;
@@ -2118,32 +1351,27 @@ void InstrRefBasedLDV::transferRegisterDef(MachineInstr &MI) {
if (MO->clobbersPhysReg(Reg))
TTracker->clobberMloc(L.Idx, MI.getIterator(), false);
}
+
+ // Tell TTracker about any folded stack store.
+ if (hasFoldedStackStore(MI)) {
+ SpillLocationNo SpillNo = extractSpillBaseRegAndOffset(MI);
+ for (unsigned int I = 0; I < MTracker->NumSlotIdxes; ++I) {
+ unsigned SpillID = MTracker->getSpillIDWithIdx(SpillNo, I);
+ LocIdx L = MTracker->getSpillMLoc(SpillID);
+ TTracker->clobberMloc(L, MI.getIterator(), true);
+ }
+ }
}
void InstrRefBasedLDV::performCopy(Register SrcRegNum, Register DstRegNum) {
- ValueIDNum SrcValue = MTracker->readReg(SrcRegNum);
+ // In all circumstances, re-def all aliases. It's definitely a new value now.
+ for (MCRegAliasIterator RAI(DstRegNum, TRI, true); RAI.isValid(); ++RAI)
+ MTracker->defReg(*RAI, CurBB, CurInst);
+ ValueIDNum SrcValue = MTracker->readReg(SrcRegNum);
MTracker->setReg(DstRegNum, SrcValue);
- // In all circumstances, re-def the super registers. It's definitely a new
- // value now. This doesn't uniquely identify the composition of subregs, for
- // example, two identical values in subregisters composed in different
- // places would not get equal value numbers.
- for (MCSuperRegIterator SRI(DstRegNum, TRI); SRI.isValid(); ++SRI)
- MTracker->defReg(*SRI, CurBB, CurInst);
-
- // If we're emulating VarLocBasedImpl, just define all the subregisters.
- // DBG_VALUEs of them will expect to be tracked from the DBG_VALUE, not
- // through prior copies.
- if (EmulateOldLDV) {
- for (MCSubRegIndexIterator DRI(DstRegNum, TRI); DRI.isValid(); ++DRI)
- MTracker->defReg(DRI.getSubReg(), CurBB, CurInst);
- return;
- }
-
- // Otherwise, actually copy subregisters from one location to another.
- // XXX: in addition, any subregisters of DstRegNum that don't line up with
- // the source register should be def'd.
+ // Copy subregisters from one location to another.
for (MCSubRegIndexIterator SRI(SrcRegNum, TRI); SRI.isValid(); ++SRI) {
unsigned SrcSubReg = SRI.getSubReg();
unsigned SubRegIdx = SRI.getSubRegIndex();
@@ -2154,15 +1382,13 @@ void InstrRefBasedLDV::performCopy(Register SrcRegNum, Register DstRegNum) {
// Do copy. There are two matching subregisters, the source value should
// have been def'd when the super-reg was, the latter might not be tracked
// yet.
- // This will force SrcSubReg to be tracked, if it isn't yet.
- (void)MTracker->readReg(SrcSubReg);
- LocIdx SrcL = MTracker->getRegMLoc(SrcSubReg);
- assert(SrcL.asU64());
- (void)MTracker->readReg(DstSubReg);
- LocIdx DstL = MTracker->getRegMLoc(DstSubReg);
- assert(DstL.asU64());
+ // This will force SrcSubReg to be tracked, if it isn't yet. Will read
+ // mphi values if it wasn't tracked.
+ LocIdx SrcL = MTracker->lookupOrTrackRegister(SrcSubReg);
+ LocIdx DstL = MTracker->lookupOrTrackRegister(DstSubReg);
+ (void)SrcL;
(void)DstL;
- ValueIDNum CpyValue = {SrcValue.getBlock(), SrcValue.getInst(), SrcL};
+ ValueIDNum CpyValue = MTracker->readReg(SrcSubReg);
MTracker->setReg(DstSubReg, CpyValue);
}
@@ -2174,6 +1400,12 @@ bool InstrRefBasedLDV::isSpillInstruction(const MachineInstr &MI,
if (!MI.hasOneMemOperand())
return false;
+ // Reject any memory operand that's aliased -- we can't guarantee its value.
+ auto MMOI = MI.memoperands_begin();
+ const PseudoSourceValue *PVal = (*MMOI)->getPseudoValue();
+ if (PVal->isAliased(MFI))
+ return false;
+
if (!MI.getSpillSize(TII) && !MI.getFoldedSpillSize(TII))
return false; // This is not a spill instruction, since no valid size was
// returned from either function.
@@ -2191,7 +1423,7 @@ bool InstrRefBasedLDV::isLocationSpill(const MachineInstr &MI,
return Reg != 0;
}
-Optional<SpillLoc>
+Optional<SpillLocationNo>
InstrRefBasedLDV::isRestoreInstruction(const MachineInstr &MI,
MachineFunction *MF, unsigned &Reg) {
if (!MI.hasOneMemOperand())
@@ -2213,84 +1445,117 @@ bool InstrRefBasedLDV::transferSpillOrRestoreInst(MachineInstr &MI) {
if (EmulateOldLDV)
return false;
+ // Strictly limit ourselves to plain loads and stores, not all instructions
+ // that can access the stack.
+ int DummyFI = -1;
+ if (!TII->isStoreToStackSlotPostFE(MI, DummyFI) &&
+ !TII->isLoadFromStackSlotPostFE(MI, DummyFI))
+ return false;
+
MachineFunction *MF = MI.getMF();
unsigned Reg;
- Optional<SpillLoc> Loc;
LLVM_DEBUG(dbgs() << "Examining instruction: "; MI.dump(););
// First, if there are any DBG_VALUEs pointing at a spill slot that is
// written to, terminate that variable location. The value in memory
// will have changed. DbgEntityHistoryCalculator doesn't try to detect this.
if (isSpillInstruction(MI, MF)) {
- Loc = extractSpillBaseRegAndOffset(MI);
-
- if (TTracker) {
- Optional<LocIdx> MLoc = MTracker->getSpillMLoc(*Loc);
- if (MLoc) {
- // Un-set this location before clobbering, so that we don't salvage
- // the variable location back to the same place.
- MTracker->setMLoc(*MLoc, ValueIDNum::EmptyValue);
+ SpillLocationNo Loc = extractSpillBaseRegAndOffset(MI);
+
+ // Un-set this location and clobber, so that earlier locations don't
+ // continue past this store.
+ for (unsigned SlotIdx = 0; SlotIdx < MTracker->NumSlotIdxes; ++SlotIdx) {
+ unsigned SpillID = MTracker->getSpillIDWithIdx(Loc, SlotIdx);
+ Optional<LocIdx> MLoc = MTracker->getSpillMLoc(SpillID);
+ if (!MLoc)
+ continue;
+
+ // We need to over-write the stack slot with something (here, a def at
+ // this instruction) to ensure no values are preserved in this stack slot
+ // after the spill. It also prevents TTracker from trying to recover the
+ // location and re-installing it in the same place.
+ ValueIDNum Def(CurBB, CurInst, *MLoc);
+ MTracker->setMLoc(*MLoc, Def);
+ if (TTracker)
TTracker->clobberMloc(*MLoc, MI.getIterator());
- }
}
}
// Try to recognise spill and restore instructions that may transfer a value.
if (isLocationSpill(MI, MF, Reg)) {
- Loc = extractSpillBaseRegAndOffset(MI);
- auto ValueID = MTracker->readReg(Reg);
+ SpillLocationNo Loc = extractSpillBaseRegAndOffset(MI);
- // If the location is empty, produce a phi, signify it's the live-in value.
- if (ValueID.getLoc() == 0)
- ValueID = {CurBB, 0, MTracker->getRegMLoc(Reg)};
+ auto DoTransfer = [&](Register SrcReg, unsigned SpillID) {
+ auto ReadValue = MTracker->readReg(SrcReg);
+ LocIdx DstLoc = MTracker->getSpillMLoc(SpillID);
+ MTracker->setMLoc(DstLoc, ReadValue);
+
+ if (TTracker) {
+ LocIdx SrcLoc = MTracker->getRegMLoc(SrcReg);
+ TTracker->transferMlocs(SrcLoc, DstLoc, MI.getIterator());
+ }
+ };
- MTracker->setSpill(*Loc, ValueID);
- auto OptSpillLocIdx = MTracker->getSpillMLoc(*Loc);
- assert(OptSpillLocIdx && "Spill slot set but has no LocIdx?");
- LocIdx SpillLocIdx = *OptSpillLocIdx;
+ // Then, transfer subreg bits.
+ for (MCSubRegIterator SRI(Reg, TRI, false); SRI.isValid(); ++SRI) {
+ // Ensure this reg is tracked,
+      // Ensure this reg is tracked.
+ unsigned SubregIdx = TRI->getSubRegIndex(Reg, *SRI);
+ unsigned SpillID = MTracker->getLocID(Loc, SubregIdx);
+ DoTransfer(*SRI, SpillID);
+ }
- // Tell TransferTracker about this spill, produce DBG_VALUEs for it.
- if (TTracker)
- TTracker->transferMlocs(MTracker->getRegMLoc(Reg), SpillLocIdx,
- MI.getIterator());
+    // Directly look up the size of the main source reg, and transfer.
+ unsigned Size = TRI->getRegSizeInBits(Reg, *MRI);
+ unsigned SpillID = MTracker->getLocID(Loc, {Size, 0});
+ DoTransfer(Reg, SpillID);
} else {
- if (!(Loc = isRestoreInstruction(MI, MF, Reg)))
+ Optional<SpillLocationNo> OptLoc = isRestoreInstruction(MI, MF, Reg);
+ if (!OptLoc)
return false;
+ SpillLocationNo Loc = *OptLoc;
- // Is there a value to be restored?
- auto OptValueID = MTracker->readSpill(*Loc);
- if (OptValueID) {
- ValueIDNum ValueID = *OptValueID;
- LocIdx SpillLocIdx = *MTracker->getSpillMLoc(*Loc);
- // XXX -- can we recover sub-registers of this value? Until we can, first
- // overwrite all defs of the register being restored to.
- for (MCRegAliasIterator RAI(Reg, TRI, true); RAI.isValid(); ++RAI)
- MTracker->defReg(*RAI, CurBB, CurInst);
+ // Assumption: we're reading from the base of the stack slot, not some
+ // offset into it. It seems very unlikely LLVM would ever generate
+ // restores where this wasn't true. This then becomes a question of what
+ // subregisters in the destination register line up with positions in the
+ // stack slot.
- // Now override the reg we're restoring to.
- MTracker->setReg(Reg, ValueID);
+ // Def all registers that alias the destination.
+ for (MCRegAliasIterator RAI(Reg, TRI, true); RAI.isValid(); ++RAI)
+ MTracker->defReg(*RAI, CurBB, CurInst);
+
+ // Now find subregisters within the destination register, and load values
+ // from stack slot positions.
+ auto DoTransfer = [&](Register DestReg, unsigned SpillID) {
+ LocIdx SrcIdx = MTracker->getSpillMLoc(SpillID);
+ auto ReadValue = MTracker->readMLoc(SrcIdx);
+ MTracker->setReg(DestReg, ReadValue);
+
+ if (TTracker) {
+ LocIdx DstLoc = MTracker->getRegMLoc(DestReg);
+ TTracker->transferMlocs(SrcIdx, DstLoc, MI.getIterator());
+ }
+ };
- // Report this restore to the transfer tracker too.
- if (TTracker)
- TTracker->transferMlocs(SpillLocIdx, MTracker->getRegMLoc(Reg),
- MI.getIterator());
- } else {
- // There isn't anything in the location; not clear if this is a code path
- // that still runs. Def this register anyway just in case.
- for (MCRegAliasIterator RAI(Reg, TRI, true); RAI.isValid(); ++RAI)
- MTracker->defReg(*RAI, CurBB, CurInst);
-
- // Force the spill slot to be tracked.
- LocIdx L = MTracker->getOrTrackSpillLoc(*Loc);
-
- // Set the restored value to be a machine phi number, signifying that it's
- // whatever the spills live-in value is in this block. Definitely has
- // a LocIdx due to the setSpill above.
- ValueIDNum ValueID = {CurBB, 0, L};
- MTracker->setReg(Reg, ValueID);
- MTracker->setSpill(*Loc, ValueID);
+ for (MCSubRegIterator SRI(Reg, TRI, false); SRI.isValid(); ++SRI) {
+ unsigned Subreg = TRI->getSubRegIndex(Reg, *SRI);
+ unsigned SpillID = MTracker->getLocID(Loc, Subreg);
+ DoTransfer(*SRI, SpillID);
}
+
+    // Directly look up this register's slot idx by size, and transfer.
+ unsigned Size = TRI->getRegSizeInBits(Reg, *MRI);
+ unsigned SpillID = MTracker->getLocID(Loc, {Size, 0});
+ DoTransfer(Reg, SpillID);
}
return true;
}
@@ -2510,12 +1775,11 @@ void InstrRefBasedLDV::produceMLocTransferFunction(
}
// Compute a bitvector of all the registers that are tracked in this block.
- const TargetLowering *TLI = MF.getSubtarget().getTargetLowering();
- Register SP = TLI->getStackPointerRegisterToSaveRestore();
BitVector UsedRegs(TRI->getNumRegs());
for (auto Location : MTracker->locations()) {
unsigned ID = MTracker->LocIdxToLocID[Location.Idx];
- if (ID >= TRI->getNumRegs() || ID == SP)
+ // Ignore stack slots, and aliases of the stack pointer.
+ if (ID >= TRI->getNumRegs() || MTracker->SPAliases.count(ID))
continue;
UsedRegs.set(ID);
}
@@ -2531,7 +1795,7 @@ void InstrRefBasedLDV::produceMLocTransferFunction(
// they're all clobbered or at least set in the designated transfer
// elem.
for (unsigned Bit : BV.set_bits()) {
- unsigned ID = MTracker->getLocID(Bit, false);
+ unsigned ID = MTracker->getLocID(Bit);
LocIdx Idx = MTracker->LocIDToLocIdx[ID];
auto &TransferMap = MLocTransfer[I];
@@ -2553,23 +1817,20 @@ void InstrRefBasedLDV::produceMLocTransferFunction(
}
}
-std::tuple<bool, bool>
-InstrRefBasedLDV::mlocJoin(MachineBasicBlock &MBB,
- SmallPtrSet<const MachineBasicBlock *, 16> &Visited,
- ValueIDNum **OutLocs, ValueIDNum *InLocs) {
+bool InstrRefBasedLDV::mlocJoin(
+ MachineBasicBlock &MBB, SmallPtrSet<const MachineBasicBlock *, 16> &Visited,
+ ValueIDNum **OutLocs, ValueIDNum *InLocs) {
LLVM_DEBUG(dbgs() << "join MBB: " << MBB.getNumber() << "\n");
bool Changed = false;
- bool DowngradeOccurred = false;
- // Collect predecessors that have been visited. Anything that hasn't been
- // visited yet is a backedge on the first iteration, and the meet of it's
- // lattice value for all locations will be unaffected.
+ // Handle value-propagation when control flow merges on entry to a block. For
+ // any location without a PHI already placed, the location has the same value
+ // as its predecessors. If a PHI is placed, test to see whether it's now a
+ // redundant PHI that we can eliminate.
+
SmallVector<const MachineBasicBlock *, 8> BlockOrders;
- for (auto Pred : MBB.predecessors()) {
- if (Visited.count(Pred)) {
- BlockOrders.push_back(Pred);
- }
- }
+ for (auto Pred : MBB.predecessors())
+ BlockOrders.push_back(Pred);
// Visit predecessors in RPOT order.
auto Cmp = [&](const MachineBasicBlock *A, const MachineBasicBlock *B) {
@@ -2579,83 +1840,216 @@ InstrRefBasedLDV::mlocJoin(MachineBasicBlock &MBB,
// Skip entry block.
if (BlockOrders.size() == 0)
- return std::tuple<bool, bool>(false, false);
+ return false;
- // Step through all machine locations, then look at each predecessor and
- // detect disagreements.
- unsigned ThisBlockRPO = BBToOrder.find(&MBB)->second;
+ // Step through all machine locations, look at each predecessor and test
+ // whether we can eliminate redundant PHIs.
for (auto Location : MTracker->locations()) {
LocIdx Idx = Location.Idx;
+
// Pick out the first predecessors live-out value for this location. It's
- // guaranteed to be not a backedge, as we order by RPO.
- ValueIDNum BaseVal = OutLocs[BlockOrders[0]->getNumber()][Idx.asU64()];
+ // guaranteed to not be a backedge, as we order by RPO.
+ ValueIDNum FirstVal = OutLocs[BlockOrders[0]->getNumber()][Idx.asU64()];
+
+ // If we've already eliminated a PHI here, do no further checking, just
+ // propagate the first live-in value into this block.
+ if (InLocs[Idx.asU64()] != ValueIDNum(MBB.getNumber(), 0, Idx)) {
+ if (InLocs[Idx.asU64()] != FirstVal) {
+ InLocs[Idx.asU64()] = FirstVal;
+ Changed |= true;
+ }
+ continue;
+ }
- // Some flags for whether there's a disagreement, and whether it's a
- // disagreement with a backedge or not.
+    // We're now examining a PHI to see whether it's unnecessary. Loop around
+ // the other live-in values and test whether they're all the same.
bool Disagree = false;
- bool NonBackEdgeDisagree = false;
-
- // Loop around everything that wasn't 'base'.
for (unsigned int I = 1; I < BlockOrders.size(); ++I) {
- auto *MBB = BlockOrders[I];
- if (BaseVal != OutLocs[MBB->getNumber()][Idx.asU64()]) {
- // Live-out of a predecessor disagrees with the first predecessor.
- Disagree = true;
-
- // Test whether it's a disagreemnt in the backedges or not.
- if (BBToOrder.find(MBB)->second < ThisBlockRPO) // might be self b/e
- NonBackEdgeDisagree = true;
- }
- }
+ const MachineBasicBlock *PredMBB = BlockOrders[I];
+ const ValueIDNum &PredLiveOut =
+ OutLocs[PredMBB->getNumber()][Idx.asU64()];
- bool OverRide = false;
- if (Disagree && !NonBackEdgeDisagree) {
- // Only the backedges disagree. Consider demoting the livein
- // lattice value, as per the file level comment. The value we consider
- // demoting to is the value that the non-backedge predecessors agree on.
- // The order of values is that non-PHIs are \top, a PHI at this block
- // \bot, and phis between the two are ordered by their RPO number.
- // If there's no agreement, or we've already demoted to this PHI value
- // before, replace with a PHI value at this block.
-
- // Calculate order numbers: zero means normal def, nonzero means RPO
- // number.
- unsigned BaseBlockRPONum = BBNumToRPO[BaseVal.getBlock()] + 1;
- if (!BaseVal.isPHI())
- BaseBlockRPONum = 0;
-
- ValueIDNum &InLocID = InLocs[Idx.asU64()];
- unsigned InLocRPONum = BBNumToRPO[InLocID.getBlock()] + 1;
- if (!InLocID.isPHI())
- InLocRPONum = 0;
-
- // Should we ignore the disagreeing backedges, and override with the
- // value the other predecessors agree on (in "base")?
- unsigned ThisBlockRPONum = BBNumToRPO[MBB.getNumber()] + 1;
- if (BaseBlockRPONum > InLocRPONum && BaseBlockRPONum < ThisBlockRPONum) {
- // Override.
- OverRide = true;
- DowngradeOccurred = true;
- }
+ // Incoming values agree, continue trying to eliminate this PHI.
+ if (FirstVal == PredLiveOut)
+ continue;
+
+ // We can also accept a PHI value that feeds back into itself.
+ if (PredLiveOut == ValueIDNum(MBB.getNumber(), 0, Idx))
+ continue;
+
+ // Live-out of a predecessor disagrees with the first predecessor.
+ Disagree = true;
}
- // else: if we disagree in the non-backedges, then this is definitely
- // a control flow merge where different values merge. Make it a PHI.
- // Generate a phi...
- ValueIDNum PHI = {(uint64_t)MBB.getNumber(), 0, Idx};
- ValueIDNum NewVal = (Disagree && !OverRide) ? PHI : BaseVal;
- if (InLocs[Idx.asU64()] != NewVal) {
+ // No disagreement? No PHI. Otherwise, leave the PHI in live-ins.
+ if (!Disagree) {
+ InLocs[Idx.asU64()] = FirstVal;
Changed |= true;
- InLocs[Idx.asU64()] = NewVal;
}
}
// TODO: Reimplement NumInserted and NumRemoved.
- return std::tuple<bool, bool>(Changed, DowngradeOccurred);
+ return Changed;
+}
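
For illustration, the redundancy test at the heart of the new mlocJoin as a standalone sketch (not part of the patch itself): a block-entry PHI for a location is dropped when every predecessor live-out is either one common value or the PHI feeding back into itself. It is deliberately simplified -- the real code compares against the live-out of the first predecessor in RPO order, whereas this sketch accepts any single common value.

#include <cassert>
#include <optional>
#include <vector>

using Value = int; // stand-in for a (block, inst, loc) machine value number

std::optional<Value> simplifyPHI(Value PhiVal,
                                 const std::vector<Value> &PredLiveOuts) {
  std::optional<Value> Common;
  for (Value V : PredLiveOuts) {
    if (V == PhiVal)
      continue;            // the PHI feeding back along a backedge is fine
    if (Common && *Common != V)
      return std::nullopt; // genuine disagreement: the PHI is necessary
    Common = V;
  }
  return Common; // the single agreed-upon live-in value, if any
}

int main() {
  // Loop header: one predecessor brings value 7, the backedge brings the PHI.
  assert(simplifyPHI(/*PhiVal=*/100, {7, 100}) == std::optional<Value>(7));
  // Two different incoming values: the PHI must stay.
  assert(!simplifyPHI(100, {7, 8}).has_value());
  return 0;
}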
+
+void InstrRefBasedLDV::findStackIndexInterference(
+ SmallVectorImpl<unsigned> &Slots) {
+  // We could spend a bit of time finding the exact, minimal set of stack
+ // indexes that interfere with each other, much like reg units. Or, we can
+ // rely on the fact that:
+ // * The smallest / lowest index will interfere with everything at zero
+ // offset, which will be the largest set of registers,
+ // * Most indexes with non-zero offset will end up being interference units
+ // anyway.
+ // So just pick those out and return them.
+
+ // We can rely on a single-byte stack index existing already, because we
+ // initialize them in MLocTracker.
+ auto It = MTracker->StackSlotIdxes.find({8, 0});
+ assert(It != MTracker->StackSlotIdxes.end());
+ Slots.push_back(It->second);
+
+ // Find anything that has a non-zero offset and add that too.
+ for (auto &Pair : MTracker->StackSlotIdxes) {
+ // Is offset zero? If so, ignore.
+ if (!Pair.first.second)
+ continue;
+ Slots.push_back(Pair.second);
+ }
}
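
For illustration, the interference-unit selection above restated as a standalone sketch (not part of the patch itself) over a plain std::map stand-in for StackSlotIdxes: keep the single-byte zero-offset index, which overlaps every zero-offset piece, plus every index with a non-zero offset.

#include <cassert>
#include <map>
#include <utility>
#include <vector>

using StackSlotPos = std::pair<unsigned, unsigned>; // {size in bits, offset}

std::vector<unsigned>
stackInterferenceUnits(const std::map<StackSlotPos, unsigned> &Idxes) {
  std::vector<unsigned> Units;
  Units.push_back(Idxes.at({8, 0})); // always present: seeded at construction
  for (const auto &P : Idxes)
    if (P.first.second != 0)         // non-zero offset -> its own unit
      Units.push_back(P.second);
  return Units;
}

int main() {
  std::map<StackSlotPos, unsigned> Idxes = {
      {{8, 0}, 0}, {{32, 0}, 1}, {{64, 0}, 2}, {{32, 32}, 3}};
  std::vector<unsigned> Units = stackInterferenceUnits(Idxes);
  assert(Units.size() == 2 && Units[0] == 0 && Units[1] == 3);
  return 0;
}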
-void InstrRefBasedLDV::mlocDataflow(
- ValueIDNum **MInLocs, ValueIDNum **MOutLocs,
+void InstrRefBasedLDV::placeMLocPHIs(
+ MachineFunction &MF, SmallPtrSetImpl<MachineBasicBlock *> &AllBlocks,
+ ValueIDNum **MInLocs, SmallVectorImpl<MLocTransferMap> &MLocTransfer) {
+ SmallVector<unsigned, 4> StackUnits;
+ findStackIndexInterference(StackUnits);
+
+ // To avoid repeatedly running the PHI placement algorithm, leverage the
+  // fact that a def of a register MUST also def its register units. Find the
+  // units for registers, place PHIs for them, and then replicate them for
+  // aliasing registers. Some inputs that are never def'd (DBG_PHIs of
+  // arguments) don't lead to register units being tracked; just place PHIs for
+  // those registers directly. Stack slots have their own form of "unit";
+ // store them to one side.
+ SmallSet<Register, 32> RegUnitsToPHIUp;
+ SmallSet<LocIdx, 32> NormalLocsToPHI;
+ SmallSet<SpillLocationNo, 32> StackSlots;
+ for (auto Location : MTracker->locations()) {
+ LocIdx L = Location.Idx;
+ if (MTracker->isSpill(L)) {
+ StackSlots.insert(MTracker->locIDToSpill(MTracker->LocIdxToLocID[L]));
+ continue;
+ }
+
+ Register R = MTracker->LocIdxToLocID[L];
+ SmallSet<Register, 8> FoundRegUnits;
+ bool AnyIllegal = false;
+ for (MCRegUnitIterator RUI(R.asMCReg(), TRI); RUI.isValid(); ++RUI) {
+      for (MCRegUnitRootIterator URoot(*RUI, TRI); URoot.isValid(); ++URoot) {
+ if (!MTracker->isRegisterTracked(*URoot)) {
+ // Not all roots were loaded into the tracking map: this register
+ // isn't actually def'd anywhere, we only read from it. Generate PHIs
+ // for this reg, but don't iterate units.
+ AnyIllegal = true;
+ } else {
+ FoundRegUnits.insert(*URoot);
+ }
+ }
+ }
+
+ if (AnyIllegal) {
+ NormalLocsToPHI.insert(L);
+ continue;
+ }
+
+ RegUnitsToPHIUp.insert(FoundRegUnits.begin(), FoundRegUnits.end());
+ }
+
+ // Lambda to fetch PHIs for a given location, and write into the PHIBlocks
+ // collection.
+ SmallVector<MachineBasicBlock *, 32> PHIBlocks;
+ auto CollectPHIsForLoc = [&](LocIdx L) {
+ // Collect the set of defs.
+ SmallPtrSet<MachineBasicBlock *, 32> DefBlocks;
+ for (unsigned int I = 0; I < OrderToBB.size(); ++I) {
+ MachineBasicBlock *MBB = OrderToBB[I];
+ const auto &TransferFunc = MLocTransfer[MBB->getNumber()];
+ if (TransferFunc.find(L) != TransferFunc.end())
+ DefBlocks.insert(MBB);
+ }
+
+ // The entry block defs the location too: it's the live-in / argument value.
+ // Only insert if there are other defs though; everything is trivially live
+ // through otherwise.
+ if (!DefBlocks.empty())
+ DefBlocks.insert(&*MF.begin());
+
+ // Ask the SSA construction algorithm where we should put PHIs. Clear
+ // anything that might have been hanging around from earlier.
+ PHIBlocks.clear();
+ BlockPHIPlacement(AllBlocks, DefBlocks, PHIBlocks);
+ };
+
+ auto InstallPHIsAtLoc = [&PHIBlocks, &MInLocs](LocIdx L) {
+ for (const MachineBasicBlock *MBB : PHIBlocks)
+ MInLocs[MBB->getNumber()][L.asU64()] = ValueIDNum(MBB->getNumber(), 0, L);
+ };
+
+ // For locations with no reg units, just place PHIs.
+ for (LocIdx L : NormalLocsToPHI) {
+ CollectPHIsForLoc(L);
+ // Install those PHI values into the live-in value array.
+ InstallPHIsAtLoc(L);
+ }
+
+ // For stack slots, calculate PHIs for the equivalent of the units, then
+ // install for each index.
+ for (SpillLocationNo Slot : StackSlots) {
+ for (unsigned Idx : StackUnits) {
+ unsigned SpillID = MTracker->getSpillIDWithIdx(Slot, Idx);
+ LocIdx L = MTracker->getSpillMLoc(SpillID);
+ CollectPHIsForLoc(L);
+ InstallPHIsAtLoc(L);
+
+ // Find anything that aliases this stack index, install PHIs for it too.
+ unsigned Size, Offset;
+ std::tie(Size, Offset) = MTracker->StackIdxesToPos[Idx];
+ for (auto &Pair : MTracker->StackSlotIdxes) {
+ unsigned ThisSize, ThisOffset;
+ std::tie(ThisSize, ThisOffset) = Pair.first;
+ if (ThisSize + ThisOffset <= Offset || Size + Offset <= ThisOffset)
+ continue;
+
+ unsigned ThisID = MTracker->getSpillIDWithIdx(Slot, Pair.second);
+ LocIdx ThisL = MTracker->getSpillMLoc(ThisID);
+ InstallPHIsAtLoc(ThisL);
+ }
+ }
+ }
+
+ // For reg units, place PHIs, and then place them for any aliasing registers.
+ for (Register R : RegUnitsToPHIUp) {
+ LocIdx L = MTracker->lookupOrTrackRegister(R);
+ CollectPHIsForLoc(L);
+
+ // Install those PHI values into the live-in value array.
+ InstallPHIsAtLoc(L);
+
+ // Now find aliases and install PHIs for those.
+ for (MCRegAliasIterator RAI(R, TRI, true); RAI.isValid(); ++RAI) {
+ // Super-registers that are "above" the largest register read/written by
+ // the function will alias, but will not be tracked.
+ if (!MTracker->isRegisterTracked(*RAI))
+ continue;
+
+ LocIdx AliasLoc = MTracker->lookupOrTrackRegister(*RAI);
+ InstallPHIsAtLoc(AliasLoc);
+ }
+ }
+}
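
For illustration, a toy sketch (not part of the patch itself) of the deduplication idea behind placeMLocPHIs: run the expensive PHI placement once per register unit, then copy the result to every register that aliases that unit instead of re-running placement per register. The register/unit numbering and the per-unit PHI block sets below are invented example data, not LLVM's.

#include <cassert>
#include <map>
#include <set>
#include <vector>

int main() {
  // Hypothetical aliasing scheme: registers -> the units they cover.
  std::map<int, std::vector<int>> RegToUnits = {
      {/*AX*/ 1, {0}}, {/*EAX*/ 2, {0}}, {/*RAX*/ 3, {0}}, {/*RBX*/ 4, {1}}};
  // Pretend output of the SSA placement algorithm, computed per unit only.
  std::map<int, std::set<int>> UnitPHIBlocks = {{0, {2, 5}}, {1, {3}}};

  // Replicate each unit's PHI blocks to every aliasing register.
  std::map<int, std::set<int>> RegPHIBlocks;
  for (const auto &P : RegToUnits) {
    std::set<int> Blocks;
    for (int Unit : P.second)
      Blocks.insert(UnitPHIBlocks[Unit].begin(), UnitPHIBlocks[Unit].end());
    RegPHIBlocks[P.first] = Blocks;
  }

  // All aliases of unit 0 get identical PHI positions without running the
  // placement algorithm three times.
  assert(RegPHIBlocks[1] == RegPHIBlocks[2] && RegPHIBlocks[2] == RegPHIBlocks[3]);
  assert(RegPHIBlocks[4] == std::set<int>({3}));
  return 0;
}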
+
+void InstrRefBasedLDV::buildMLocValueMap(
+ MachineFunction &MF, ValueIDNum **MInLocs, ValueIDNum **MOutLocs,
SmallVectorImpl<MLocTransferMap> &MLocTransfer) {
std::priority_queue<unsigned int, std::vector<unsigned int>,
std::greater<unsigned int>>
@@ -2666,20 +2060,34 @@ void InstrRefBasedLDV::mlocDataflow(
// but this is probably not worth it.
SmallPtrSet<MachineBasicBlock *, 16> OnPending, OnWorklist;
- // Initialize worklist with every block to be visited.
+ // Initialize worklist with every block to be visited. Also produce list of
+ // all blocks.
+ SmallPtrSet<MachineBasicBlock *, 32> AllBlocks;
for (unsigned int I = 0; I < BBToOrder.size(); ++I) {
Worklist.push(I);
OnWorklist.insert(OrderToBB[I]);
+ AllBlocks.insert(OrderToBB[I]);
}
- MTracker->reset();
-
- // Set inlocs for entry block -- each as a PHI at the entry block. Represents
- // the incoming value to the function.
- MTracker->setMPhis(0);
+ // Initialize entry block to PHIs. These represent arguments.
for (auto Location : MTracker->locations())
- MInLocs[0][Location.Idx.asU64()] = Location.Value;
+ MInLocs[0][Location.Idx.asU64()] = ValueIDNum(0, 0, Location.Idx);
+ MTracker->reset();
+
+  // Start by placing PHIs, using the usual SSA construction algorithm. Consider
+ // any machine-location that isn't live-through a block to be def'd in that
+ // block.
+ placeMLocPHIs(MF, AllBlocks, MInLocs, MLocTransfer);
+
+ // Propagate values to eliminate redundant PHIs. At the same time, this
+ // produces the table of Block x Location => Value for the entry to each
+ // block.
+ // The kind of PHIs we can eliminate are, for example, where one path in a
+ // conditional spills and restores a register, and the register still has
+  // the same value once control flow joins, unbeknownst to the PHI placement
+  // code. Propagating values allows us to identify such unnecessary PHIs and
+ // remove them.
SmallPtrSet<const MachineBasicBlock *, 16> Visited;
while (!Worklist.empty() || !Pending.empty()) {
// Vector for storing the evaluated block transfer function.
@@ -2691,16 +2099,10 @@ void InstrRefBasedLDV::mlocDataflow(
Worklist.pop();
// Join the values in all predecessor blocks.
- bool InLocsChanged, DowngradeOccurred;
- std::tie(InLocsChanged, DowngradeOccurred) =
- mlocJoin(*MBB, Visited, MOutLocs, MInLocs[CurBB]);
+ bool InLocsChanged;
+ InLocsChanged = mlocJoin(*MBB, Visited, MOutLocs, MInLocs[CurBB]);
InLocsChanged |= Visited.insert(MBB).second;
- // If a downgrade occurred, book us in for re-examination on the next
- // iteration.
- if (DowngradeOccurred && OnPending.insert(MBB).second)
- Pending.push(BBToOrder[MBB]);
-
// Don't examine transfer function if we've visited this loc at least
// once, and inlocs haven't changed.
if (!InLocsChanged)
@@ -2715,7 +2117,7 @@ void InstrRefBasedLDV::mlocDataflow(
for (auto &P : MLocTransfer[CurBB]) {
if (P.second.getBlock() == CurBB && P.second.isPHI()) {
// This is a movement of whatever was live in. Read it.
- ValueIDNum NewID = MTracker->getNumAtPos(P.second.getLoc());
+ ValueIDNum NewID = MTracker->readMLoc(P.second.getLoc());
ToRemap.push_back(std::make_pair(P.first, NewID));
} else {
// It's a def. Just set it.
@@ -2745,8 +2147,8 @@ void InstrRefBasedLDV::mlocDataflow(
continue;
// All successors should be visited: put any back-edges on the pending
- // list for the next dataflow iteration, and any other successors to be
- // visited this iteration, if they're not going to be already.
+ // list for the next pass-through, and any other successors to be
+ // visited this pass, if they're not going to be already.
for (auto s : MBB->successors()) {
// Does branching to this successor represent a back-edge?
if (BBToOrder[s] > BBToOrder[MBB]) {
@@ -2769,170 +2171,169 @@ void InstrRefBasedLDV::mlocDataflow(
assert(Pending.empty() && "Pending should be empty");
}
- // Once all the live-ins don't change on mlocJoin(), we've reached a
- // fixedpoint.
+ // Once all the live-ins don't change on mlocJoin(), we've eliminated all
+ // redundant PHIs.
}
-bool InstrRefBasedLDV::vlocDowngradeLattice(
- const MachineBasicBlock &MBB, const DbgValue &OldLiveInLocation,
- const SmallVectorImpl<InValueT> &Values, unsigned CurBlockRPONum) {
- // Ranking value preference: see file level comment, the highest rank is
- // a plain def, followed by PHI values in reverse post-order. Numerically,
- // we assign all defs the rank '0', all PHIs their blocks RPO number plus
- // one, and consider the lowest value the highest ranked.
- int OldLiveInRank = BBNumToRPO[OldLiveInLocation.ID.getBlock()] + 1;
- if (!OldLiveInLocation.ID.isPHI())
- OldLiveInRank = 0;
-
- // Allow any unresolvable conflict to be over-ridden.
- if (OldLiveInLocation.Kind == DbgValue::NoVal) {
- // Although if it was an unresolvable conflict from _this_ block, then
- // all other seeking of downgrades and PHIs must have failed before hand.
- if (OldLiveInLocation.BlockNo == (unsigned)MBB.getNumber())
- return false;
- OldLiveInRank = INT_MIN;
- }
-
- auto &InValue = *Values[0].second;
+// Boilerplate for feeding MachineBasicBlocks into IDF calculator. Provide
+// template specialisations for graph traits and a successor enumerator.
+namespace llvm {
+template <> struct GraphTraits<MachineBasicBlock> {
+ using NodeRef = MachineBasicBlock *;
+ using ChildIteratorType = MachineBasicBlock::succ_iterator;
- if (InValue.Kind == DbgValue::Const || InValue.Kind == DbgValue::NoVal)
- return false;
+ static NodeRef getEntryNode(MachineBasicBlock *BB) { return BB; }
+ static ChildIteratorType child_begin(NodeRef N) { return N->succ_begin(); }
+ static ChildIteratorType child_end(NodeRef N) { return N->succ_end(); }
+};
- unsigned ThisRPO = BBNumToRPO[InValue.ID.getBlock()];
- int ThisRank = ThisRPO + 1;
- if (!InValue.ID.isPHI())
- ThisRank = 0;
+template <> struct GraphTraits<const MachineBasicBlock> {
+ using NodeRef = const MachineBasicBlock *;
+ using ChildIteratorType = MachineBasicBlock::const_succ_iterator;
- // Too far down the lattice?
- if (ThisRPO >= CurBlockRPONum)
- return false;
+ static NodeRef getEntryNode(const MachineBasicBlock *BB) { return BB; }
+ static ChildIteratorType child_begin(NodeRef N) { return N->succ_begin(); }
+ static ChildIteratorType child_end(NodeRef N) { return N->succ_end(); }
+};
- // Higher in the lattice than what we've already explored?
- if (ThisRank <= OldLiveInRank)
- return false;
+using MachineDomTreeBase = DomTreeBase<MachineBasicBlock>::NodeType;
+using MachineDomTreeChildGetter =
+ typename IDFCalculatorDetail::ChildrenGetterTy<MachineDomTreeBase, false>;
- return true;
+namespace IDFCalculatorDetail {
+template <>
+typename MachineDomTreeChildGetter::ChildrenTy
+MachineDomTreeChildGetter::get(const NodeRef &N) {
+ return {N->succ_begin(), N->succ_end()};
+}
+} // namespace IDFCalculatorDetail
+} // namespace llvm
+
+void InstrRefBasedLDV::BlockPHIPlacement(
+ const SmallPtrSetImpl<MachineBasicBlock *> &AllBlocks,
+ const SmallPtrSetImpl<MachineBasicBlock *> &DefBlocks,
+ SmallVectorImpl<MachineBasicBlock *> &PHIBlocks) {
+ // Apply IDF calculator to the designated set of location defs, storing
+ // required PHIs into PHIBlocks. Uses the dominator tree stored in the
+ // InstrRefBasedLDV object.
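+ // For example, if a location is def'd in two blocks that both branch to a
+ // common successor, the IDF of those def blocks contains that successor,
+ // so a PHI is requested there -- the standard SSA construction rule.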
+ IDFCalculatorDetail::ChildrenGetterTy<MachineDomTreeBase, false> ChildGetter;
+ IDFCalculatorBase<MachineDomTreeBase, false> IDF(DomTree->getBase(),
+ ChildGetter);
+
+ IDF.setLiveInBlocks(AllBlocks);
+ IDF.setDefiningBlocks(DefBlocks);
+ IDF.calculate(PHIBlocks);
}
-std::tuple<Optional<ValueIDNum>, bool> InstrRefBasedLDV::pickVPHILoc(
- MachineBasicBlock &MBB, const DebugVariable &Var, const LiveIdxT &LiveOuts,
- ValueIDNum **MOutLocs, ValueIDNum **MInLocs,
- const SmallVectorImpl<MachineBasicBlock *> &BlockOrders) {
+Optional<ValueIDNum> InstrRefBasedLDV::pickVPHILoc(
+ const MachineBasicBlock &MBB, const DebugVariable &Var,
+ const LiveIdxT &LiveOuts, ValueIDNum **MOutLocs,
+ const SmallVectorImpl<const MachineBasicBlock *> &BlockOrders) {
// Collect a set of locations from predecessor where its live-out value can
// be found.
SmallVector<SmallVector<LocIdx, 4>, 8> Locs;
+ SmallVector<const DbgValueProperties *, 4> Properties;
unsigned NumLocs = MTracker->getNumLocs();
- unsigned BackEdgesStart = 0;
- for (auto p : BlockOrders) {
- // Pick out where backedges start in the list of predecessors. Relies on
- // BlockOrders being sorted by RPO.
- if (BBToOrder[p] < BBToOrder[&MBB])
- ++BackEdgesStart;
+ // No predecessors means no PHIs.
+ if (BlockOrders.empty())
+ return None;
- // For each predecessor, create a new set of locations.
- Locs.resize(Locs.size() + 1);
+ for (auto p : BlockOrders) {
unsigned ThisBBNum = p->getNumber();
- auto LiveOutMap = LiveOuts.find(p);
- if (LiveOutMap == LiveOuts.end())
- // This predecessor isn't in scope, it must have no live-in/live-out
- // locations.
- continue;
-
- auto It = LiveOutMap->second->find(Var);
- if (It == LiveOutMap->second->end())
- // There's no value recorded for this variable in this predecessor,
- // leave an empty set of locations.
- continue;
-
- const DbgValue &OutVal = It->second;
+ auto OutValIt = LiveOuts.find(p);
+ if (OutValIt == LiveOuts.end())
+ // If we have a predecessor not in scope, we'll never find a PHI position.
+ return None;
+ const DbgValue &OutVal = *OutValIt->second;
if (OutVal.Kind == DbgValue::Const || OutVal.Kind == DbgValue::NoVal)
// Consts and no-values cannot have locations we can join on.
- continue;
+ return None;
- assert(OutVal.Kind == DbgValue::Proposed || OutVal.Kind == DbgValue::Def);
- ValueIDNum ValToLookFor = OutVal.ID;
+ Properties.push_back(&OutVal.Properties);
+
+ // Create new empty vector of locations.
+ Locs.resize(Locs.size() + 1);
- // Search the live-outs of the predecessor for the specified value.
- for (unsigned int I = 0; I < NumLocs; ++I) {
- if (MOutLocs[ThisBBNum][I] == ValToLookFor)
- Locs.back().push_back(LocIdx(I));
+ // If the predecessor's live-out value is a def, find the locations where
+ // that value is present. Do the same for VPHIs where we know the VPHI value.
+ if (OutVal.Kind == DbgValue::Def ||
+ (OutVal.Kind == DbgValue::VPHI && OutVal.BlockNo != MBB.getNumber() &&
+ OutVal.ID != ValueIDNum::EmptyValue)) {
+ ValueIDNum ValToLookFor = OutVal.ID;
+ // Search the live-outs of the predecessor for the specified value.
+ for (unsigned int I = 0; I < NumLocs; ++I) {
+ if (MOutLocs[ThisBBNum][I] == ValToLookFor)
+ Locs.back().push_back(LocIdx(I));
+ }
+ } else {
+ assert(OutVal.Kind == DbgValue::VPHI);
+ // For VPHIs where we don't know the location, we definitely can't find
+ // a join loc.
+ if (OutVal.BlockNo != MBB.getNumber())
+ return None;
+
+ // Otherwise: this is a VPHI on a backedge feeding back into itself, i.e.
+ // a value that's live-through the whole loop. (It has to be a backedge:
+ // the only way this block's VPHI can reach one of its own predecessors is
+ // by going around a loop.) We can accept as a PHI location
+ // any location where the other predecessors agree, _and_ the machine
+ // locations feed back into themselves. Therefore, add all self-looping
+ // machine-value PHI locations.
+ for (unsigned int I = 0; I < NumLocs; ++I) {
+ ValueIDNum MPHI(MBB.getNumber(), 0, LocIdx(I));
+ if (MOutLocs[ThisBBNum][I] == MPHI)
+ Locs.back().push_back(LocIdx(I));
+ }
}
}
- // If there were no locations at all, return an empty result.
- if (Locs.empty())
- return std::tuple<Optional<ValueIDNum>, bool>(None, false);
-
- // Lambda for seeking a common location within a range of location-sets.
- using LocsIt = SmallVector<SmallVector<LocIdx, 4>, 8>::iterator;
- auto SeekLocation =
- [&Locs](llvm::iterator_range<LocsIt> SearchRange) -> Optional<LocIdx> {
- // Starting with the first set of locations, take the intersection with
- // subsequent sets.
- SmallVector<LocIdx, 4> base = Locs[0];
- for (auto &S : SearchRange) {
- SmallVector<LocIdx, 4> new_base;
- std::set_intersection(base.begin(), base.end(), S.begin(), S.end(),
- std::inserter(new_base, new_base.begin()));
- base = new_base;
- }
- if (base.empty())
- return None;
+ // We should have found locations for all predecessors, or returned.
+ assert(Locs.size() == BlockOrders.size());
- // We now have a set of LocIdxes that contain the right output value in
- // each of the predecessors. Pick the lowest; if there's a register loc,
- // that'll be it.
- return *base.begin();
- };
+ // Check that all properties are the same. We can't pick a location if they're
+ // not.
+ const DbgValueProperties *Properties0 = Properties[0];
+ for (auto *Prop : Properties)
+ if (*Prop != *Properties0)
+ return None;
- // Search for a common location for all predecessors. If we can't, then fall
- // back to only finding a common location between non-backedge predecessors.
- bool ValidForAllLocs = true;
- auto TheLoc = SeekLocation(Locs);
- if (!TheLoc) {
- ValidForAllLocs = false;
- TheLoc =
- SeekLocation(make_range(Locs.begin(), Locs.begin() + BackEdgesStart));
- }
+ // Starting with the first set of locations, take the intersection with
+ // subsequent sets.
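+ // For example, if one predecessor has the required value in both $rax and
+ // a stack slot, but another has it only in $rax, the intersection leaves
+ // $rax as the sole candidate PHI location.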
+ SmallVector<LocIdx, 4> CandidateLocs = Locs[0];
+ for (unsigned int I = 1; I < Locs.size(); ++I) {
+ auto &LocVec = Locs[I];
+ SmallVector<LocIdx, 4> NewCandidates;
+ std::set_intersection(CandidateLocs.begin(), CandidateLocs.end(),
+ LocVec.begin(), LocVec.end(),
+ std::inserter(NewCandidates, NewCandidates.begin()));
+ CandidateLocs = NewCandidates;
+ }
+ if (CandidateLocs.empty())
+ return None;
- if (!TheLoc)
- return std::tuple<Optional<ValueIDNum>, bool>(None, false);
+ // We now have a set of LocIdxes that contain the right output value in
+ // each of the predecessors. Pick the lowest; if there's a register loc,
+ // that'll be it.
+ LocIdx L = *CandidateLocs.begin();
// Return a PHI-value-number for the found location.
- LocIdx L = *TheLoc;
ValueIDNum PHIVal = {(unsigned)MBB.getNumber(), 0, L};
- return std::tuple<Optional<ValueIDNum>, bool>(PHIVal, ValidForAllLocs);
+ return PHIVal;
}
-std::tuple<bool, bool> InstrRefBasedLDV::vlocJoin(
- MachineBasicBlock &MBB, LiveIdxT &VLOCOutLocs, LiveIdxT &VLOCInLocs,
- SmallPtrSet<const MachineBasicBlock *, 16> *VLOCVisited, unsigned BBNum,
- const SmallSet<DebugVariable, 4> &AllVars, ValueIDNum **MOutLocs,
- ValueIDNum **MInLocs,
+bool InstrRefBasedLDV::vlocJoin(
+ MachineBasicBlock &MBB, LiveIdxT &VLOCOutLocs,
SmallPtrSet<const MachineBasicBlock *, 8> &InScopeBlocks,
SmallPtrSet<const MachineBasicBlock *, 8> &BlocksToExplore,
- DenseMap<DebugVariable, DbgValue> &InLocsT) {
- bool DowngradeOccurred = false;
-
+ DbgValue &LiveIn) {
// To emulate VarLocBasedImpl, process this block if it's not in scope but
// _does_ assign a variable value. No live-ins for this scope are transferred
// in though, so we can return immediately.
- if (InScopeBlocks.count(&MBB) == 0 && !ArtificialBlocks.count(&MBB)) {
- if (VLOCVisited)
- return std::tuple<bool, bool>(true, false);
- return std::tuple<bool, bool>(false, false);
- }
+ if (InScopeBlocks.count(&MBB) == 0 && !ArtificialBlocks.count(&MBB))
+ return false;
LLVM_DEBUG(dbgs() << "join MBB: " << MBB.getNumber() << "\n");
bool Changed = false;
- // Find any live-ins computed in a prior iteration.
- auto ILSIt = VLOCInLocs.find(&MBB);
- assert(ILSIt != VLOCInLocs.end());
- auto &ILS = *ILSIt->second;
-
// Order predecessors by RPOT order, for exploring them in that order.
SmallVector<MachineBasicBlock *, 8> BlockOrders(MBB.predecessors());
@@ -2944,244 +2345,102 @@ std::tuple<bool, bool> InstrRefBasedLDV::vlocJoin(
unsigned CurBlockRPONum = BBToOrder[&MBB];
- // Force a re-visit to loop heads in the first dataflow iteration.
- // FIXME: if we could "propose" Const values this wouldn't be needed,
- // because they'd need to be confirmed before being emitted.
- if (!BlockOrders.empty() &&
- BBToOrder[BlockOrders[BlockOrders.size() - 1]] >= CurBlockRPONum &&
- VLOCVisited)
- DowngradeOccurred = true;
-
- auto ConfirmValue = [&InLocsT](const DebugVariable &DV, DbgValue VR) {
- auto Result = InLocsT.insert(std::make_pair(DV, VR));
- (void)Result;
- assert(Result.second);
- };
-
- auto ConfirmNoVal = [&ConfirmValue, &MBB](const DebugVariable &Var, const DbgValueProperties &Properties) {
- DbgValue NoLocPHIVal(MBB.getNumber(), Properties, DbgValue::NoVal);
-
- ConfirmValue(Var, NoLocPHIVal);
- };
+ // Collect all the incoming DbgValues for this variable, from predecessor
+ // live-out values.
+ SmallVector<InValueT, 8> Values;
+ bool Bail = false;
+ int BackEdgesStart = 0;
+ for (auto p : BlockOrders) {
+ // If the predecessor isn't in scope / to be explored, we'll never be
+ // able to join any locations.
+ if (!BlocksToExplore.contains(p)) {
+ Bail = true;
+ break;
+ }
- // Attempt to join the values for each variable.
- for (auto &Var : AllVars) {
- // Collect all the DbgValues for this variable.
- SmallVector<InValueT, 8> Values;
- bool Bail = false;
- unsigned BackEdgesStart = 0;
- for (auto p : BlockOrders) {
- // If the predecessor isn't in scope / to be explored, we'll never be
- // able to join any locations.
- if (!BlocksToExplore.contains(p)) {
- Bail = true;
- break;
- }
+ // All Live-outs will have been initialized.
+ DbgValue &OutLoc = *VLOCOutLocs.find(p)->second;
- // Don't attempt to handle unvisited predecessors: they're implicitly
- // "unknown"s in the lattice.
- if (VLOCVisited && !VLOCVisited->count(p))
- continue;
+ // Keep track of where back-edges begin in the Values vector. Relies on
+ // BlockOrders being sorted by RPO.
+ unsigned ThisBBRPONum = BBToOrder[p];
+ if (ThisBBRPONum < CurBlockRPONum)
+ ++BackEdgesStart;
- // If the predecessors OutLocs is absent, there's not much we can do.
- auto OL = VLOCOutLocs.find(p);
- if (OL == VLOCOutLocs.end()) {
- Bail = true;
- break;
- }
+ Values.push_back(std::make_pair(p, &OutLoc));
+ }
- // No live-out value for this predecessor also means we can't produce
- // a joined value.
- auto VIt = OL->second->find(Var);
- if (VIt == OL->second->end()) {
- Bail = true;
- break;
- }
+ // If there were no values, or one of the predecessors couldn't have a
+ // value, then give up immediately. It's not safe to produce a live-in
+ // value. Leave as whatever it was before.
+ if (Bail || Values.size() == 0)
+ return false;
- // Keep track of where back-edges begin in the Values vector. Relies on
- // BlockOrders being sorted by RPO.
- unsigned ThisBBRPONum = BBToOrder[p];
- if (ThisBBRPONum < CurBlockRPONum)
- ++BackEdgesStart;
+ // All (non-entry) blocks have at least one non-backedge predecessor.
+ // Pick the variable value from the first of these, to compare against
+ // all others.
+ const DbgValue &FirstVal = *Values[0].second;
+
+ // If the old live-in value is not a PHI then either a) no PHI is needed
+ // here, or b) we eliminated the PHI that was here. If so, we can just
+ // propagate in the first predecessor's incoming value.
+ if (LiveIn.Kind != DbgValue::VPHI || LiveIn.BlockNo != MBB.getNumber()) {
+ Changed = LiveIn != FirstVal;
+ if (Changed)
+ LiveIn = FirstVal;
+ return Changed;
+ }
+
+ // Scan for variable values that can never be resolved: if they have
+ // different DIExpressions, different indirectness, or are mixed constants /
+ // non-constants.
+ for (auto &V : Values) {
+ if (V.second->Properties != FirstVal.Properties)
+ return false;
+ if (V.second->Kind == DbgValue::NoVal)
+ return false;
+ if (V.second->Kind == DbgValue::Const && FirstVal.Kind != DbgValue::Const)
+ return false;
+ }
- Values.push_back(std::make_pair(p, &VIt->second));
- }
+ // Try to eliminate this PHI. Do the incoming values all agree?
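+ // For example, a variable assigned the same value on both sides of a
+ // diamond: a VPHI was placed at the join block, but every predecessor's
+ // live-out carries that one value, so the VPHI is dropped and the value is
+ // treated as live-through.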
+ bool Disagree = false;
+ for (auto &V : Values) {
+ if (*V.second == FirstVal)
+ continue; // No disagreement.
- // If there were no values, or one of the predecessors couldn't have a
- // value, then give up immediately. It's not safe to produce a live-in
- // value.
- if (Bail || Values.size() == 0)
+ // Eliminate if a backedge feeds a VPHI back into itself.
+ if (V.second->Kind == DbgValue::VPHI &&
+ V.second->BlockNo == MBB.getNumber() &&
+ // Is this a backedge?
+ std::distance(Values.begin(), &V) >= BackEdgesStart)
continue;
- // Enumeration identifying the current state of the predecessors values.
- enum {
- Unset = 0,
- Agreed, // All preds agree on the variable value.
- PropDisagree, // All preds agree, but the value kind is Proposed in some.
- BEDisagree, // Only back-edges disagree on variable value.
- PHINeeded, // Non-back-edge predecessors have conflicing values.
- NoSolution // Conflicting Value metadata makes solution impossible.
- } OurState = Unset;
-
- // All (non-entry) blocks have at least one non-backedge predecessor.
- // Pick the variable value from the first of these, to compare against
- // all others.
- const DbgValue &FirstVal = *Values[0].second;
- const ValueIDNum &FirstID = FirstVal.ID;
-
- // Scan for variable values that can't be resolved: if they have different
- // DIExpressions, different indirectness, or are mixed constants /
- // non-constants.
- for (auto &V : Values) {
- if (V.second->Properties != FirstVal.Properties)
- OurState = NoSolution;
- if (V.second->Kind == DbgValue::Const && FirstVal.Kind != DbgValue::Const)
- OurState = NoSolution;
- }
-
- // Flags diagnosing _how_ the values disagree.
- bool NonBackEdgeDisagree = false;
- bool DisagreeOnPHINess = false;
- bool IDDisagree = false;
- bool Disagree = false;
- if (OurState == Unset) {
- for (auto &V : Values) {
- if (*V.second == FirstVal)
- continue; // No disagreement.
-
- Disagree = true;
-
- // Flag whether the value number actually diagrees.
- if (V.second->ID != FirstID)
- IDDisagree = true;
-
- // Distinguish whether disagreement happens in backedges or not.
- // Relies on Values (and BlockOrders) being sorted by RPO.
- unsigned ThisBBRPONum = BBToOrder[V.first];
- if (ThisBBRPONum < CurBlockRPONum)
- NonBackEdgeDisagree = true;
-
- // Is there a difference in whether the value is definite or only
- // proposed?
- if (V.second->Kind != FirstVal.Kind &&
- (V.second->Kind == DbgValue::Proposed ||
- V.second->Kind == DbgValue::Def) &&
- (FirstVal.Kind == DbgValue::Proposed ||
- FirstVal.Kind == DbgValue::Def))
- DisagreeOnPHINess = true;
- }
-
- // Collect those flags together and determine an overall state for
- // what extend the predecessors agree on a live-in value.
- if (!Disagree)
- OurState = Agreed;
- else if (!IDDisagree && DisagreeOnPHINess)
- OurState = PropDisagree;
- else if (!NonBackEdgeDisagree)
- OurState = BEDisagree;
- else
- OurState = PHINeeded;
- }
-
- // An extra indicator: if we only disagree on whether the value is a
- // Def, or proposed, then also flag whether that disagreement happens
- // in backedges only.
- bool PropOnlyInBEs = Disagree && !IDDisagree && DisagreeOnPHINess &&
- !NonBackEdgeDisagree && FirstVal.Kind == DbgValue::Def;
-
- const auto &Properties = FirstVal.Properties;
-
- auto OldLiveInIt = ILS.find(Var);
- const DbgValue *OldLiveInLocation =
- (OldLiveInIt != ILS.end()) ? &OldLiveInIt->second : nullptr;
-
- bool OverRide = false;
- if (OurState == BEDisagree && OldLiveInLocation) {
- // Only backedges disagree: we can consider downgrading. If there was a
- // previous live-in value, use it to work out whether the current
- // incoming value represents a lattice downgrade or not.
- OverRide =
- vlocDowngradeLattice(MBB, *OldLiveInLocation, Values, CurBlockRPONum);
- }
-
- // Use the current state of predecessor agreement and other flags to work
- // out what to do next. Possibilities include:
- // * Accept a value all predecessors agree on, or accept one that
- // represents a step down the exploration lattice,
- // * Use a PHI value number, if one can be found,
- // * Propose a PHI value number, and see if it gets confirmed later,
- // * Emit a 'NoVal' value, indicating we couldn't resolve anything.
- if (OurState == Agreed) {
- // Easiest solution: all predecessors agree on the variable value.
- ConfirmValue(Var, FirstVal);
- } else if (OurState == BEDisagree && OverRide) {
- // Only backedges disagree, and the other predecessors have produced
- // a new live-in value further down the exploration lattice.
- DowngradeOccurred = true;
- ConfirmValue(Var, FirstVal);
- } else if (OurState == PropDisagree) {
- // Predecessors agree on value, but some say it's only a proposed value.
- // Propagate it as proposed: unless it was proposed in this block, in
- // which case we're able to confirm the value.
- if (FirstID.getBlock() == (uint64_t)MBB.getNumber() && FirstID.isPHI()) {
- ConfirmValue(Var, DbgValue(FirstID, Properties, DbgValue::Def));
- } else if (PropOnlyInBEs) {
- // If only backedges disagree, a higher (in RPO) block confirmed this
- // location, and we need to propagate it into this loop.
- ConfirmValue(Var, DbgValue(FirstID, Properties, DbgValue::Def));
- } else {
- // Otherwise; a Def meeting a Proposed is still a Proposed.
- ConfirmValue(Var, DbgValue(FirstID, Properties, DbgValue::Proposed));
- }
- } else if ((OurState == PHINeeded || OurState == BEDisagree)) {
- // Predecessors disagree and can't be downgraded: this can only be
- // solved with a PHI. Use pickVPHILoc to go look for one.
- Optional<ValueIDNum> VPHI;
- bool AllEdgesVPHI = false;
- std::tie(VPHI, AllEdgesVPHI) =
- pickVPHILoc(MBB, Var, VLOCOutLocs, MOutLocs, MInLocs, BlockOrders);
-
- if (VPHI && AllEdgesVPHI) {
- // There's a PHI value that's valid for all predecessors -- we can use
- // it. If any of the non-backedge predecessors have proposed values
- // though, this PHI is also only proposed, until the predecessors are
- // confirmed.
- DbgValue::KindT K = DbgValue::Def;
- for (unsigned int I = 0; I < BackEdgesStart; ++I)
- if (Values[I].second->Kind == DbgValue::Proposed)
- K = DbgValue::Proposed;
-
- ConfirmValue(Var, DbgValue(*VPHI, Properties, K));
- } else if (VPHI) {
- // There's a PHI value, but it's only legal for backedges. Leave this
- // as a proposed PHI value: it might come back on the backedges,
- // and allow us to confirm it in the future.
- DbgValue NoBEValue = DbgValue(*VPHI, Properties, DbgValue::Proposed);
- ConfirmValue(Var, NoBEValue);
- } else {
- ConfirmNoVal(Var, Properties);
- }
- } else {
- // Otherwise: we don't know. Emit a "phi but no real loc" phi.
- ConfirmNoVal(Var, Properties);
- }
+ Disagree = true;
}
- // Store newly calculated in-locs into VLOCInLocs, if they've changed.
- Changed = ILS != InLocsT;
- if (Changed)
- ILS = InLocsT;
-
- return std::tuple<bool, bool>(Changed, DowngradeOccurred);
+ // No disagreement -> live-through value.
+ if (!Disagree) {
+ Changed = LiveIn != FirstVal;
+ if (Changed)
+ LiveIn = FirstVal;
+ return Changed;
+ } else {
+ // Otherwise use a VPHI.
+ DbgValue VPHI(MBB.getNumber(), FirstVal.Properties, DbgValue::VPHI);
+ Changed = LiveIn != VPHI;
+ if (Changed)
+ LiveIn = VPHI;
+ return Changed;
+ }
}
-void InstrRefBasedLDV::vlocDataflow(
- const LexicalScope *Scope, const DILocation *DILoc,
+void InstrRefBasedLDV::buildVLocValueMap(const DILocation *DILoc,
const SmallSet<DebugVariable, 4> &VarsWeCareAbout,
SmallPtrSetImpl<MachineBasicBlock *> &AssignBlocks, LiveInsT &Output,
ValueIDNum **MOutLocs, ValueIDNum **MInLocs,
SmallVectorImpl<VLocTracker> &AllTheVLocs) {
- // This method is much like mlocDataflow: but focuses on a single
+ // This method is much like buildMLocValueMap, but focuses on a single
// LexicalScope at a time. Pick out a set of blocks and variables that are
// to have their value assignments solved, then run our dataflow algorithm
// until a fixedpoint is reached.
@@ -3235,8 +2494,8 @@ void InstrRefBasedLDV::vlocDataflow(
continue;
if (!ArtificialBlocks.count(succ))
continue;
- DFS.push_back(std::make_pair(succ, succ->succ_begin()));
ToAdd.insert(succ);
+ DFS.push_back(std::make_pair(succ, succ->succ_begin()));
}
// Search all those blocks, depth first.
@@ -3252,8 +2511,8 @@ void InstrRefBasedLDV::vlocDataflow(
// If the current successor is artificial and unexplored, descend into
// it.
if (!ToAdd.count(*CurSucc) && ArtificialBlocks.count(*CurSucc)) {
- DFS.push_back(std::make_pair(*CurSucc, (*CurSucc)->succ_begin()));
ToAdd.insert(*CurSucc);
+ DFS.push_back(std::make_pair(*CurSucc, (*CurSucc)->succ_begin()));
continue;
}
@@ -3278,6 +2537,13 @@ void InstrRefBasedLDV::vlocDataflow(
if (BlocksToExplore.size() == 1)
return;
+ // Convert a const set to a non-const set. LexicalScopes
+ // getMachineBasicBlocks returns const MBB pointers, IDF wants mutable ones.
+ // (Neither of them mutates anything.)
+ SmallPtrSet<MachineBasicBlock *, 8> MutBlocksToExplore;
+ for (const auto *MBB : BlocksToExplore)
+ MutBlocksToExplore.insert(const_cast<MachineBasicBlock *>(MBB));
+
// Pick out the relevant blocks in RPO order and sort them.
for (auto *MBB : BlocksToExplore)
BlockOrders.push_back(const_cast<MachineBasicBlock *>(MBB));
@@ -3286,9 +2552,18 @@ void InstrRefBasedLDV::vlocDataflow(
unsigned NumBlocks = BlockOrders.size();
// Allocate some vectors for storing the live ins and live outs. Large.
- SmallVector<DenseMap<DebugVariable, DbgValue>, 32> LiveIns, LiveOuts;
- LiveIns.resize(NumBlocks);
- LiveOuts.resize(NumBlocks);
+ SmallVector<DbgValue, 32> LiveIns, LiveOuts;
+ LiveIns.reserve(NumBlocks);
+ LiveOuts.reserve(NumBlocks);
+
+ // Initialize all values to start as NoVals. This signifies "it's live
+ // through, but we don't know what it is".
+ DbgValueProperties EmptyProperties(EmptyExpr, false);
+ for (unsigned int I = 0; I < NumBlocks; ++I) {
+ DbgValue EmptyDbgValue(I, EmptyProperties, DbgValue::NoVal);
+ LiveIns.push_back(EmptyDbgValue);
+ LiveOuts.push_back(EmptyDbgValue);
+ }
// Produce by-MBB indexes of live-in/live-outs, to ease lookup within
// vlocJoin.
@@ -3300,108 +2575,164 @@ void InstrRefBasedLDV::vlocDataflow(
LiveInIdx[BlockOrders[I]] = &LiveIns[I];
}
- for (auto *MBB : BlockOrders) {
- Worklist.push(BBToOrder[MBB]);
- OnWorklist.insert(MBB);
- }
+ // Loop over each variable and place PHIs for it, then propagate values
+ // between blocks. This keeps the locality of working on one lexical scope
+ // at a time, but avoids re-processing variable values because some other
+ // variable has been assigned.
+ for (auto &Var : VarsWeCareAbout) {
+ // Re-initialize live-ins and live-outs, to clear the remains of previous
+ // variables' live-ins / live-outs.
+ for (unsigned int I = 0; I < NumBlocks; ++I) {
+ DbgValue EmptyDbgValue(I, EmptyProperties, DbgValue::NoVal);
+ LiveIns[I] = EmptyDbgValue;
+ LiveOuts[I] = EmptyDbgValue;
+ }
- // Iterate over all the blocks we selected, propagating variable values.
- bool FirstTrip = true;
- SmallPtrSet<const MachineBasicBlock *, 16> VLOCVisited;
- while (!Worklist.empty() || !Pending.empty()) {
- while (!Worklist.empty()) {
- auto *MBB = OrderToBB[Worklist.top()];
- CurBB = MBB->getNumber();
- Worklist.pop();
+ // Place PHIs for variable values, using the LLVM IDF calculator.
+ // Collect the set of blocks where variables are def'd.
+ SmallPtrSet<MachineBasicBlock *, 32> DefBlocks;
+ for (const MachineBasicBlock *ExpMBB : BlocksToExplore) {
+ auto &TransferFunc = AllTheVLocs[ExpMBB->getNumber()].Vars;
+ if (TransferFunc.find(Var) != TransferFunc.end())
+ DefBlocks.insert(const_cast<MachineBasicBlock *>(ExpMBB));
+ }
- DenseMap<DebugVariable, DbgValue> JoinedInLocs;
+ SmallVector<MachineBasicBlock *, 32> PHIBlocks;
- // Join values from predecessors. Updates LiveInIdx, and writes output
- // into JoinedInLocs.
- bool InLocsChanged, DowngradeOccurred;
- std::tie(InLocsChanged, DowngradeOccurred) = vlocJoin(
- *MBB, LiveOutIdx, LiveInIdx, (FirstTrip) ? &VLOCVisited : nullptr,
- CurBB, VarsWeCareAbout, MOutLocs, MInLocs, InScopeBlocks,
- BlocksToExplore, JoinedInLocs);
+ // Request the set of PHIs we should insert for this variable.
+ BlockPHIPlacement(MutBlocksToExplore, DefBlocks, PHIBlocks);
- bool FirstVisit = VLOCVisited.insert(MBB).second;
+ // Insert PHIs into the per-block live-in tables for this variable.
+ for (MachineBasicBlock *PHIMBB : PHIBlocks) {
+ unsigned BlockNo = PHIMBB->getNumber();
+ DbgValue *LiveIn = LiveInIdx[PHIMBB];
+ *LiveIn = DbgValue(BlockNo, EmptyProperties, DbgValue::VPHI);
+ }
- // Always explore transfer function if inlocs changed, or if we've not
- // visited this block before.
- InLocsChanged |= FirstVisit;
+ for (auto *MBB : BlockOrders) {
+ Worklist.push(BBToOrder[MBB]);
+ OnWorklist.insert(MBB);
+ }
- // If a downgrade occurred, book us in for re-examination on the next
- // iteration.
- if (DowngradeOccurred && OnPending.insert(MBB).second)
- Pending.push(BBToOrder[MBB]);
+ // Iterate over all the blocks we selected, propagating the variable's value.
+ // This loop does two things:
+ // * Eliminates unnecessary VPHIs in vlocJoin,
+ // * Evaluates the block's transfer function (i.e. variable assignments) and
+ // stores the result to the block's live-outs.
+ // Always evaluate the transfer function on the first iteration, and when
+ // the live-ins change thereafter.
+ bool FirstTrip = true;
+ while (!Worklist.empty() || !Pending.empty()) {
+ while (!Worklist.empty()) {
+ auto *MBB = OrderToBB[Worklist.top()];
+ CurBB = MBB->getNumber();
+ Worklist.pop();
+
+ auto LiveInsIt = LiveInIdx.find(MBB);
+ assert(LiveInsIt != LiveInIdx.end());
+ DbgValue *LiveIn = LiveInsIt->second;
+
+ // Join values from predecessors: this updates the live-in value for this
+ // block, in place.
+ bool InLocsChanged =
+ vlocJoin(*MBB, LiveOutIdx, InScopeBlocks, BlocksToExplore, *LiveIn);
+
+ SmallVector<const MachineBasicBlock *, 8> Preds;
+ for (const auto *Pred : MBB->predecessors())
+ Preds.push_back(Pred);
+
+ // If this block's live-in value is a VPHI, try to pick a machine-value
+ // for it. This makes the machine-value available and propagated
+ // through all blocks by the time value propagation finishes. We can't
+ // do this any earlier as it needs to read the block live-outs.
+ if (LiveIn->Kind == DbgValue::VPHI && LiveIn->BlockNo == (int)CurBB) {
+ // There's a small possibility that on a preceding path, a VPHI is
+ // eliminated and transitions from VPHI-with-location to
+ // live-through-value. As a result, the selected location of any VPHI
+ // might change, so we need to re-compute it on each iteration.
+ Optional<ValueIDNum> ValueNum =
+ pickVPHILoc(*MBB, Var, LiveOutIdx, MOutLocs, Preds);
+
+ if (ValueNum) {
+ InLocsChanged |= LiveIn->ID != *ValueNum;
+ LiveIn->ID = *ValueNum;
+ }
+ }
- if (!InLocsChanged)
- continue;
+ if (!InLocsChanged && !FirstTrip)
+ continue;
+
+ DbgValue *LiveOut = LiveOutIdx[MBB];
+ bool OLChanged = false;
- // Do transfer function.
- auto &VTracker = AllTheVLocs[MBB->getNumber()];
- for (auto &Transfer : VTracker.Vars) {
- // Is this var we're mangling in this scope?
- if (VarsWeCareAbout.count(Transfer.first)) {
+ // Do transfer function.
+ auto &VTracker = AllTheVLocs[MBB->getNumber()];
+ auto TransferIt = VTracker.Vars.find(Var);
+ if (TransferIt != VTracker.Vars.end()) {
// Erase on empty transfer (DBG_VALUE $noreg).
- if (Transfer.second.Kind == DbgValue::Undef) {
- JoinedInLocs.erase(Transfer.first);
+ if (TransferIt->second.Kind == DbgValue::Undef) {
+ DbgValue NewVal(MBB->getNumber(), EmptyProperties, DbgValue::NoVal);
+ if (*LiveOut != NewVal) {
+ *LiveOut = NewVal;
+ OLChanged = true;
+ }
} else {
// Insert new variable value; or overwrite.
- auto NewValuePair = std::make_pair(Transfer.first, Transfer.second);
- auto Result = JoinedInLocs.insert(NewValuePair);
- if (!Result.second)
- Result.first->second = Transfer.second;
+ if (*LiveOut != TransferIt->second) {
+ *LiveOut = TransferIt->second;
+ OLChanged = true;
+ }
+ }
+ } else {
+ // Just copy live-ins to live-outs, for anything not transferred.
+ if (*LiveOut != *LiveIn) {
+ *LiveOut = *LiveIn;
+ OLChanged = true;
}
}
- }
-
- // Did the live-out locations change?
- bool OLChanged = JoinedInLocs != *LiveOutIdx[MBB];
-
- // If they haven't changed, there's no need to explore further.
- if (!OLChanged)
- continue;
- // Commit to the live-out record.
- *LiveOutIdx[MBB] = JoinedInLocs;
-
- // We should visit all successors. Ensure we'll visit any non-backedge
- // successors during this dataflow iteration; book backedge successors
- // to be visited next time around.
- for (auto s : MBB->successors()) {
- // Ignore out of scope / not-to-be-explored successors.
- if (LiveInIdx.find(s) == LiveInIdx.end())
+ // If no live-out value changed, there's no need to explore further.
+ if (!OLChanged)
continue;
- if (BBToOrder[s] > BBToOrder[MBB]) {
- if (OnWorklist.insert(s).second)
- Worklist.push(BBToOrder[s]);
- } else if (OnPending.insert(s).second && (FirstTrip || OLChanged)) {
- Pending.push(BBToOrder[s]);
+ // We should visit all successors. Ensure we'll visit any non-backedge
+ // successors during this dataflow iteration; book backedge successors
+ // to be visited next time around.
+ for (auto s : MBB->successors()) {
+ // Ignore out of scope / not-to-be-explored successors.
+ if (LiveInIdx.find(s) == LiveInIdx.end())
+ continue;
+
+ if (BBToOrder[s] > BBToOrder[MBB]) {
+ if (OnWorklist.insert(s).second)
+ Worklist.push(BBToOrder[s]);
+ } else if (OnPending.insert(s).second && (FirstTrip || OLChanged)) {
+ Pending.push(BBToOrder[s]);
+ }
}
}
+ Worklist.swap(Pending);
+ std::swap(OnWorklist, OnPending);
+ OnPending.clear();
+ assert(Pending.empty());
+ FirstTrip = false;
}
- Worklist.swap(Pending);
- std::swap(OnWorklist, OnPending);
- OnPending.clear();
- assert(Pending.empty());
- FirstTrip = false;
- }
-
- // Dataflow done. Now what? Save live-ins. Ignore any that are still marked
- // as being variable-PHIs, because those did not have their machine-PHI
- // value confirmed. Such variable values are places that could have been
- // PHIs, but are not.
- for (auto *MBB : BlockOrders) {
- auto &VarMap = *LiveInIdx[MBB];
- for (auto &P : VarMap) {
- if (P.second.Kind == DbgValue::Proposed ||
- P.second.Kind == DbgValue::NoVal)
+
+ // Save live-ins to output vector. Ignore any that are still marked as being
+ // VPHIs with no location -- those are variables that we know the value of,
+ // but are not actually available in the register file.
+ for (auto *MBB : BlockOrders) {
+ DbgValue *BlockLiveIn = LiveInIdx[MBB];
+ if (BlockLiveIn->Kind == DbgValue::NoVal)
continue;
- Output[MBB->getNumber()].push_back(P);
+ if (BlockLiveIn->Kind == DbgValue::VPHI &&
+ BlockLiveIn->ID == ValueIDNum::EmptyValue)
+ continue;
+ if (BlockLiveIn->Kind == DbgValue::VPHI)
+ BlockLiveIn->Kind = DbgValue::Def;
+ Output[MBB->getNumber()].push_back(std::make_pair(Var, *BlockLiveIn));
}
- }
+ } // Per-variable loop.
BlockOrders.clear();
BlocksToExplore.clear();
@@ -3485,6 +2816,10 @@ void InstrRefBasedLDV::emitLocations(
void InstrRefBasedLDV::initialSetup(MachineFunction &MF) {
// Build some useful data structures.
+
+ LLVMContext &Context = MF.getFunction().getContext();
+ EmptyExpr = DIExpression::get(Context, {});
+
auto hasNonArtificialLocation = [](const MachineInstr &MI) -> bool {
if (const DebugLoc &DL = MI.getDebugLoc())
return DL.getLine() != 0;
@@ -3524,7 +2859,10 @@ void InstrRefBasedLDV::initialSetup(MachineFunction &MF) {
/// Calculate the liveness information for the given machine function and
/// extend ranges across basic blocks.
bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF,
- TargetPassConfig *TPC) {
+ MachineDominatorTree *DomTree,
+ TargetPassConfig *TPC,
+ unsigned InputBBLimit,
+ unsigned InputDbgValLimit) {
// No subprogram means this function contains no debuginfo.
if (!MF.getFunction().getSubprogram())
return false;
@@ -3532,7 +2870,9 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF,
LLVM_DEBUG(dbgs() << "\nDebug Range Extension\n");
this->TPC = TPC;
+ this->DomTree = DomTree;
TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
TII = MF.getSubtarget().getInstrInfo();
TFI = MF.getSubtarget().getFrameLowering();
TFI->getCalleeSaves(MF, CalleeSavedRegs);
@@ -3569,6 +2909,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF,
ValueIDNum **MInLocs = new ValueIDNum *[MaxNumBlocks];
unsigned NumLocs = MTracker->getNumLocs();
for (int i = 0; i < MaxNumBlocks; ++i) {
+ // These all auto-initialize to ValueIDNum::EmptyValue
MOutLocs[i] = new ValueIDNum[NumLocs];
MInLocs[i] = new ValueIDNum[NumLocs];
}
@@ -3577,7 +2918,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF,
// storing the computed live-ins / live-outs into the array-of-arrays. We use
// both live-ins and live-outs for decision making in the variable value
// dataflow problem.
- mlocDataflow(MInLocs, MOutLocs, MLocTransfer);
+ buildMLocValueMap(MF, MInLocs, MOutLocs, MLocTransfer);
// Patch up debug phi numbers, turning unknown block-live-in values into
// either live-through machine values, or PHIs.
@@ -3626,6 +2967,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF,
// To mirror old LiveDebugValues, enumerate variables in RPOT order. Otherwise
// the order is unimportant, it just has to be stable.
+ unsigned VarAssignCount = 0;
for (unsigned int I = 0; I < OrderToBB.size(); ++I) {
auto *MBB = OrderToBB[I];
auto *VTracker = &vlocs[MBB->getNumber()];
@@ -3643,24 +2985,42 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF,
ScopeToVars[Scope].insert(Var);
ScopeToBlocks[Scope].insert(VTracker->MBB);
ScopeToDILocation[Scope] = ScopeLoc;
+ ++VarAssignCount;
}
}
- // OK. Iterate over scopes: there might be something to be said for
- // ordering them by size/locality, but that's for the future. For each scope,
- // solve the variable value problem, producing a map of variables to values
- // in SavedLiveIns.
- for (auto &P : ScopeToVars) {
- vlocDataflow(P.first, ScopeToDILocation[P.first], P.second,
- ScopeToBlocks[P.first], SavedLiveIns, MOutLocs, MInLocs,
- vlocs);
- }
+ bool Changed = false;
+
+ // If we have an extremely large number of variable assignments and blocks,
+ // bail out at this point. We've burnt some time doing analysis already,
+ // but we should cut our losses.
+ if ((unsigned)MaxNumBlocks > InputBBLimit &&
+ VarAssignCount > InputDbgValLimit) {
+ LLVM_DEBUG(dbgs() << "Disabling InstrRefBasedLDV: " << MF.getName()
+ << " has " << MaxNumBlocks << " basic blocks and "
+ << VarAssignCount
+ << " variable assignments, exceeding limits.\n");
+ } else {
+ // Compute the extended ranges, iterating over scopes. There might be
+ // something to be said for ordering them by size/locality, but that's for
+ // the future. For each scope, solve the variable value problem, producing
+ // a map of variables to values in SavedLiveIns.
+ for (auto &P : ScopeToVars) {
+ buildVLocValueMap(ScopeToDILocation[P.first], P.second,
+ ScopeToBlocks[P.first], SavedLiveIns, MOutLocs, MInLocs,
+ vlocs);
+ }
+
+ // Using the computed value locations and variable values for each block,
+ // create the DBG_VALUE instructions representing the extended variable
+ // locations.
+ emitLocations(MF, SavedLiveIns, MOutLocs, MInLocs, AllVarsNumbering, *TPC);
- // Using the computed value locations and variable values for each block,
- // create the DBG_VALUE instructions representing the extended variable
- // locations.
- emitLocations(MF, SavedLiveIns, MOutLocs, MInLocs, AllVarsNumbering, *TPC);
+ // Did we actually make any changes? If we created any DBG_VALUEs, then yes.
+ Changed = TTracker->Transfers.size() != 0;
+ }
+ // Common clean-up of memory.
for (int Idx = 0; Idx < MaxNumBlocks; ++Idx) {
delete[] MOutLocs[Idx];
delete[] MInLocs[Idx];
@@ -3668,9 +3028,6 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF,
delete[] MOutLocs;
delete[] MInLocs;
- // Did we actually make any changes? If we created any DBG_VALUEs, then yes.
- bool Changed = TTracker->Transfers.size() != 0;
-
delete MTracker;
delete TTracker;
MTracker = nullptr;
@@ -3883,10 +3240,8 @@ public:
/// vector.
static void FindPredecessorBlocks(LDVSSABlock *BB,
SmallVectorImpl<LDVSSABlock *> *Preds) {
- for (MachineBasicBlock::pred_iterator PI = BB->BB.pred_begin(),
- E = BB->BB.pred_end();
- PI != E; ++PI)
- Preds->push_back(BB->Updater.getSSALDVBlock(*PI));
+ for (MachineBasicBlock *Pred : BB->BB.predecessors())
+ Preds->push_back(BB->Updater.getSSALDVBlock(Pred));
}
/// GetUndefVal - Normally creates an IMPLICIT_DEF instruction with a new
diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h
new file mode 100644
index 000000000000..d96ef6d4f6e5
--- /dev/null
+++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h
@@ -0,0 +1,1051 @@
+//===- InstrRefBasedImpl.h - Tracking Debug Value MIs ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_LIVEDEBUGVALUES_INSTRREFBASEDLDV_H
+#define LLVM_LIB_CODEGEN_LIVEDEBUGVALUES_INSTRREFBASEDLDV_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/UniqueVector.h"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+
+#include "LiveDebugValues.h"
+
+class TransferTracker;
+
+// Forward declaration of the unit test class, so that we can peer into the
+// LDV object.
+class InstrRefLDVTest;
+
+namespace LiveDebugValues {
+
+class MLocTracker;
+
+using namespace llvm;
+
+/// Handle-class for a particular "location". This value-type uniquely
+/// symbolises a register or stack location, allowing manipulation of locations
+/// without concern for where that location is. Practically, this allows us to
+/// treat the state of the machine at a particular point as an array of values,
+/// rather than a map of values.
+class LocIdx {
+ unsigned Location;
+
+ // Default constructor is private, initializing to an illegal location number.
+ // Use only for "not an entry" elements in IndexedMaps.
+ LocIdx() : Location(UINT_MAX) {}
+
+public:
+#define NUM_LOC_BITS 24
+ LocIdx(unsigned L) : Location(L) {
+ assert(L < (1 << NUM_LOC_BITS) && "Machine locations must fit in 24 bits");
+ }
+
+ static LocIdx MakeIllegalLoc() { return LocIdx(); }
+ static LocIdx MakeTombstoneLoc() {
+ LocIdx L = LocIdx();
+ --L.Location;
+ return L;
+ }
+
+ bool isIllegal() const { return Location == UINT_MAX; }
+
+ uint64_t asU64() const { return Location; }
+
+ bool operator==(unsigned L) const { return Location == L; }
+
+ bool operator==(const LocIdx &L) const { return Location == L.Location; }
+
+ bool operator!=(unsigned L) const { return !(*this == L); }
+
+ bool operator!=(const LocIdx &L) const { return !(*this == L); }
+
+ bool operator<(const LocIdx &Other) const {
+ return Location < Other.Location;
+ }
+};
+
+// The location at which a spilled value resides. It consists of a register and
+// an offset.
+struct SpillLoc {
+ unsigned SpillBase;
+ StackOffset SpillOffset;
+ bool operator==(const SpillLoc &Other) const {
+ return std::make_pair(SpillBase, SpillOffset) ==
+ std::make_pair(Other.SpillBase, Other.SpillOffset);
+ }
+ bool operator<(const SpillLoc &Other) const {
+ return std::make_tuple(SpillBase, SpillOffset.getFixed(),
+ SpillOffset.getScalable()) <
+ std::make_tuple(Other.SpillBase, Other.SpillOffset.getFixed(),
+ Other.SpillOffset.getScalable());
+ }
+};
+
+/// Unique identifier for a value defined by an instruction, as a value type.
+/// Casts back and forth to a uint64_t. Probably replaceable with something less
+/// bit-constrained. Each value identifies the instruction and machine location
+/// where the value is defined, although there may be no corresponding machine
+/// operand for it (ex: regmasks clobbering values). The instructions are
+/// one-based, and definitions that are PHIs have instruction number zero.
+///
+/// The obvious limits of a 1M block function or 1M instruction blocks are
+/// problematic; but by that point we should probably have bailed out of
+/// trying to analyse the function.
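+///
+/// For example, {bb: 1, inst: 7, loc: 2} names the value defined by the
+/// seventh instruction of block one into machine location two, while
+/// {bb: 1, inst: 0, loc: 2} names the live-in (PHI) value of that location
+/// in block one.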
+class ValueIDNum {
+ union {
+ struct {
+ uint64_t BlockNo : 20; /// The block where the def happens.
+ uint64_t InstNo : 20; /// The Instruction where the def happens.
+ /// One based, is distance from start of block.
+ uint64_t LocNo
+ : NUM_LOC_BITS; /// The machine location where the def happens.
+ } s;
+ uint64_t Value;
+ } u;
+
+ static_assert(sizeof(u) == 8, "Badly packed ValueIDNum?");
+
+public:
+ // Default-initialize to EmptyValue. This is necessary to make IndexedMaps
+ // of values work.
+ ValueIDNum() { u.Value = EmptyValue.asU64(); }
+
+ ValueIDNum(uint64_t Block, uint64_t Inst, uint64_t Loc) {
+ u.s = {Block, Inst, Loc};
+ }
+
+ ValueIDNum(uint64_t Block, uint64_t Inst, LocIdx Loc) {
+ u.s = {Block, Inst, Loc.asU64()};
+ }
+
+ uint64_t getBlock() const { return u.s.BlockNo; }
+ uint64_t getInst() const { return u.s.InstNo; }
+ uint64_t getLoc() const { return u.s.LocNo; }
+ bool isPHI() const { return u.s.InstNo == 0; }
+
+ uint64_t asU64() const { return u.Value; }
+
+ static ValueIDNum fromU64(uint64_t v) {
+ ValueIDNum Val;
+ Val.u.Value = v;
+ return Val;
+ }
+
+ bool operator<(const ValueIDNum &Other) const {
+ return asU64() < Other.asU64();
+ }
+
+ bool operator==(const ValueIDNum &Other) const {
+ return u.Value == Other.u.Value;
+ }
+
+ bool operator!=(const ValueIDNum &Other) const { return !(*this == Other); }
+
+ std::string asString(const std::string &mlocname) const {
+ return Twine("Value{bb: ")
+ .concat(Twine(u.s.BlockNo)
+ .concat(Twine(", inst: ")
+ .concat((u.s.InstNo ? Twine(u.s.InstNo)
+ : Twine("live-in"))
+ .concat(Twine(", loc: ").concat(
+ Twine(mlocname)))
+ .concat(Twine("}")))))
+ .str();
+ }
+
+ static ValueIDNum EmptyValue;
+ static ValueIDNum TombstoneValue;
+};
+
+/// Thin wrapper around an integer -- designed to give more type safety to
+/// spill location numbers.
+class SpillLocationNo {
+public:
+ explicit SpillLocationNo(unsigned SpillNo) : SpillNo(SpillNo) {}
+ unsigned SpillNo;
+ unsigned id() const { return SpillNo; }
+
+ bool operator<(const SpillLocationNo &Other) const {
+ return SpillNo < Other.SpillNo;
+ }
+
+ bool operator==(const SpillLocationNo &Other) const {
+ return SpillNo == Other.SpillNo;
+ }
+ bool operator!=(const SpillLocationNo &Other) const {
+ return !(*this == Other);
+ }
+};
+
+/// Meta qualifiers for a value. Pair of whatever expression is used to qualify
+/// the value, and a Boolean for whether or not it's indirect.
+class DbgValueProperties {
+public:
+ DbgValueProperties(const DIExpression *DIExpr, bool Indirect)
+ : DIExpr(DIExpr), Indirect(Indirect) {}
+
+ /// Extract properties from an existing DBG_VALUE instruction.
+ DbgValueProperties(const MachineInstr &MI) {
+ assert(MI.isDebugValue());
+ DIExpr = MI.getDebugExpression();
+ Indirect = MI.getOperand(1).isImm();
+ }
+
+ bool operator==(const DbgValueProperties &Other) const {
+ return std::tie(DIExpr, Indirect) == std::tie(Other.DIExpr, Other.Indirect);
+ }
+
+ bool operator!=(const DbgValueProperties &Other) const {
+ return !(*this == Other);
+ }
+
+ const DIExpression *DIExpr;
+ bool Indirect;
+};
+
+/// Class recording the (high level) _value_ of a variable. Identifies either
+/// the value of the variable as a ValueIDNum, or a constant MachineOperand.
+/// This class also stores meta-information about how the value is qualified.
+/// Used to reason about variable values when performing the second
+/// (DebugVariable specific) dataflow analysis.
+class DbgValue {
+public:
+ /// If Kind is Def, the value number that this value is based on. VPHIs set
+ /// this field to EmptyValue if there is no machine-value for this VPHI, or
+ /// the corresponding machine-value if there is one.
+ ValueIDNum ID;
+ /// If Kind is Const, the MachineOperand defining this value.
+ Optional<MachineOperand> MO;
+ /// For a NoVal or VPHI DbgValue, which block it was generated in.
+ int BlockNo;
+
+ /// Qualifiers for the ValueIDNum above.
+ DbgValueProperties Properties;
+
+ typedef enum {
+ Undef, // Represents a DBG_VALUE $noreg in the transfer function only.
+ Def, // This value is defined by an inst, or is a PHI value.
+ Const, // A constant value contained in the MachineOperand field.
+ VPHI, // Incoming values to BlockNo differ, those values must be joined by
+ // a PHI in this block.
+ NoVal, // Empty DbgValue indicating an unknown value. Used as initializer,
+ // before dominating blocks values are propagated in.
+ } KindT;
+ /// Discriminator for whether this is a constant or an in-program value.
+ KindT Kind;
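+
+ // For example: a DBG_VALUE of a register becomes a Def whose ID names the
+ // value currently in that register; a DBG_VALUE of an immediate becomes a
+ // Const carrying that MachineOperand; and a DBG_VALUE $noreg becomes an
+ // Undef entry in the transfer function.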
+
+ DbgValue(const ValueIDNum &Val, const DbgValueProperties &Prop, KindT Kind)
+ : ID(Val), MO(None), BlockNo(0), Properties(Prop), Kind(Kind) {
+ assert(Kind == Def);
+ }
+
+ DbgValue(unsigned BlockNo, const DbgValueProperties &Prop, KindT Kind)
+ : ID(ValueIDNum::EmptyValue), MO(None), BlockNo(BlockNo),
+ Properties(Prop), Kind(Kind) {
+ assert(Kind == NoVal || Kind == VPHI);
+ }
+
+ DbgValue(const MachineOperand &MO, const DbgValueProperties &Prop, KindT Kind)
+ : ID(ValueIDNum::EmptyValue), MO(MO), BlockNo(0), Properties(Prop),
+ Kind(Kind) {
+ assert(Kind == Const);
+ }
+
+ DbgValue(const DbgValueProperties &Prop, KindT Kind)
+ : ID(ValueIDNum::EmptyValue), MO(None), BlockNo(0), Properties(Prop),
+ Kind(Kind) {
+ assert(Kind == Undef &&
+ "Empty DbgValue constructor must pass in Undef kind");
+ }
+
+#ifndef NDEBUG
+ void dump(const MLocTracker *MTrack) const;
+#endif
+
+ bool operator==(const DbgValue &Other) const {
+ if (std::tie(Kind, Properties) != std::tie(Other.Kind, Other.Properties))
+ return false;
+ else if (Kind == Def && ID != Other.ID)
+ return false;
+ else if (Kind == NoVal && BlockNo != Other.BlockNo)
+ return false;
+ else if (Kind == Const)
+ return MO->isIdenticalTo(*Other.MO);
+ else if (Kind == VPHI && BlockNo != Other.BlockNo)
+ return false;
+ else if (Kind == VPHI && ID != Other.ID)
+ return false;
+
+ return true;
+ }
+
+ bool operator!=(const DbgValue &Other) const { return !(*this == Other); }
+};
+
+class LocIdxToIndexFunctor {
+public:
+ using argument_type = LocIdx;
+ unsigned operator()(const LocIdx &L) const { return L.asU64(); }
+};
+
+/// Tracker for what values are in machine locations. Listens to the Things
+/// being Done by various instructions, and maintains a table of what machine
+/// locations have what values (as defined by a ValueIDNum).
+///
+/// There are potentially a much larger number of machine locations on the
+/// target machine than the actual working-set size of the function. On x86 for
+/// example, we're extremely unlikely to want to track values through control
+/// or debug registers. To avoid doing so, MLocTracker has several layers of
+/// indirection going on, described below, to avoid unnecessarily tracking
+/// any location.
+///
+/// Here's a sort of diagram of the indexes, read from the bottom up:
+///
+/// Size on stack Offset on stack
+/// \ /
+/// Stack Idx (Where in slot is this?)
+/// /
+/// /
+/// Slot Num (%stack.0) /
+/// FrameIdx => SpillNum /
+/// \ /
+/// SpillID (int) Register number (int)
+/// \ /
+/// LocationID => LocIdx
+/// |
+/// LocIdx => ValueIDNum
+///
+/// The aim here is that the LocIdx => ValueIDNum vector is just an array of
+/// values in numbered locations, so that later analyses can ignore whether the
+/// location is a register or otherwise. To map a register / spill location to
+/// a LocIdx, you have to use the (sparse) LocationID => LocIdx map. And to
+/// build a LocationID for a stack slot, you need to combine identifiers for
+/// which stack slot it is and where within that slot is being described.
+///
+/// Register mask operands cause trouble by technically defining every register;
+/// various hacks are used to avoid tracking registers that are never read and
+/// only written by regmasks.
+class MLocTracker {
+public:
+ MachineFunction &MF;
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+ const TargetLowering &TLI;
+
+ /// IndexedMap type, mapping from LocIdx to ValueIDNum.
+ using LocToValueType = IndexedMap<ValueIDNum, LocIdxToIndexFunctor>;
+
+ /// Map of LocIdxes to the ValueIDNums that they store. This is tightly
+ /// packed, entries only exist for locations that are being tracked.
+ LocToValueType LocIdxToIDNum;
+
+ /// "Map" of machine location IDs (i.e., raw register or spill number) to the
+ /// LocIdx key / number for that location. There are always at least as many
+ /// as the number of registers on the target -- if the value in the register
+ /// is not being tracked, then the LocIdx value will be zero. New entries are
+ /// appended if a new spill slot begins being tracked.
+ /// This, and the corresponding reverse map, persist for the analysis of the
+ /// whole function, and are necessary for decoding various vectors of
+ /// values.
+ std::vector<LocIdx> LocIDToLocIdx;
+
+ /// Inverse map of LocIDToLocIdx.
+ IndexedMap<unsigned, LocIdxToIndexFunctor> LocIdxToLocID;
+
+ /// When clobbering register masks, we choose not to believe the machine model
+ /// and don't clobber SP. Do the same for SP aliases, and for efficiency,
+ /// keep a set of them here.
+ SmallSet<Register, 8> SPAliases;
+
+ /// Unique-ification of spill locations. Used to number them -- their LocID
+ /// number is the index in SpillLocs minus one plus NumRegs.
+ UniqueVector<SpillLoc> SpillLocs;
+
+ // If we discover a new machine location, assign it an mphi with this
+ // block number.
+ unsigned CurBB;
+
+ /// Cached local copy of the number of registers the target has.
+ unsigned NumRegs;
+
+ /// Number of slot indexes the target has -- distinct segments of a stack
+ /// slot that can take on the value of a subregister, when a super-register
+ /// is written to the stack.
+ unsigned NumSlotIdxes;
+
+ /// Collection of register mask operands that have been observed. Second part
+ /// of pair indicates the instruction that they happened in. Used to
+ /// reconstruct where defs happened if we start tracking a location later
+ /// on.
+ SmallVector<std::pair<const MachineOperand *, unsigned>, 32> Masks;
+
+ /// Pair for describing a position within a stack slot -- first the size in
+ /// bits, then the offset.
+ typedef std::pair<unsigned short, unsigned short> StackSlotPos;
+
+ /// Map from a size/offset pair describing a position in a stack slot, to a
+ /// numeric identifier for that position. Allows easier identification of
+ /// individual positions.
+ DenseMap<StackSlotPos, unsigned> StackSlotIdxes;
+
+ /// Inverse of StackSlotIdxes.
+ DenseMap<unsigned, StackSlotPos> StackIdxesToPos;
+
+ /// Iterator for locations and the values they contain. Dereferencing
+ /// produces a struct/pair containing the LocIdx key for this location,
+ /// and a reference to the value currently stored. Simplifies the process
+ /// of seeking a particular location.
+ class MLocIterator {
+ LocToValueType &ValueMap;
+ LocIdx Idx;
+
+ public:
+ class value_type {
+ public:
+ value_type(LocIdx Idx, ValueIDNum &Value) : Idx(Idx), Value(Value) {}
+ const LocIdx Idx; /// Read-only index of this location.
+ ValueIDNum &Value; /// Reference to the stored value at this location.
+ };
+
+ MLocIterator(LocToValueType &ValueMap, LocIdx Idx)
+ : ValueMap(ValueMap), Idx(Idx) {}
+
+ bool operator==(const MLocIterator &Other) const {
+ assert(&ValueMap == &Other.ValueMap);
+ return Idx == Other.Idx;
+ }
+
+ bool operator!=(const MLocIterator &Other) const {
+ return !(*this == Other);
+ }
+
+ void operator++() { Idx = LocIdx(Idx.asU64() + 1); }
+
+ value_type operator*() { return value_type(Idx, ValueMap[LocIdx(Idx)]); }
+ };
+
+ MLocTracker(MachineFunction &MF, const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI, const TargetLowering &TLI);
+
+ /// Produce location ID number for a Register. Provides some small amount of
+ /// type safety.
+ /// \param Reg The register we're looking up.
+ unsigned getLocID(Register Reg) { return Reg.id(); }
+
+ /// Produce location ID number for a spill position.
+ /// \param Spill The number of the spill we're fetching the location for.
+ /// \param SpillSubReg Subregister within the spill we're addressing.
+ unsigned getLocID(SpillLocationNo Spill, unsigned SpillSubReg) {
+ unsigned short Size = TRI.getSubRegIdxSize(SpillSubReg);
+ unsigned short Offs = TRI.getSubRegIdxOffset(SpillSubReg);
+ return getLocID(Spill, {Size, Offs});
+ }
+
+ /// Produce location ID number for a spill position.
+ /// \param Spill The number of the spill we're fetching the location for.
+ /// \param Idx Size/offset within the spill slot to be addressed.
+ unsigned getLocID(SpillLocationNo Spill, StackSlotPos Idx) {
+ unsigned SlotNo = Spill.id() - 1;
+ SlotNo *= NumSlotIdxes;
+ assert(StackSlotIdxes.find(Idx) != StackSlotIdxes.end());
+ SlotNo += StackSlotIdxes[Idx];
+ SlotNo += NumRegs;
+ return SlotNo;
+ }
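+ // Illustrative worked example (the numbers are hypothetical, not fixed by
+ // the interface): with NumRegs == 100 and NumSlotIdxes == 4, the position
+ // with StackSlotIdxes[Idx] == 1 inside the second spill slot
+ // (Spill.id() == 2) is assigned LocID (2 - 1) * 4 + 1 + 100 == 105.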
+
+ /// Given a spill number, and a slot within the spill, calculate the ID number
+ /// for that location.
+ unsigned getSpillIDWithIdx(SpillLocationNo Spill, unsigned Idx) {
+ unsigned SlotNo = Spill.id() - 1;
+ SlotNo *= NumSlotIdxes;
+ SlotNo += Idx;
+ SlotNo += NumRegs;
+ return SlotNo;
+ }
+
+ /// Return the spill number that a location ID corresponds to.
+ SpillLocationNo locIDToSpill(unsigned ID) const {
+ assert(ID >= NumRegs);
+ ID -= NumRegs;
+ // Truncate away the index part, leaving only the spill number.
+ ID /= NumSlotIdxes;
+ return SpillLocationNo(ID + 1); // The UniqueVector is one-based.
+ }
+
+ /// Returns the spill-slot size/offset that a location ID corresponds to.
+ StackSlotPos locIDToSpillIdx(unsigned ID) const {
+ assert(ID >= NumRegs);
+ ID -= NumRegs;
+ unsigned Idx = ID % NumSlotIdxes;
+ return StackIdxesToPos.find(Idx)->second;
+ }
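+ // Continuing the hypothetical example above: locIDToSpill(105) computes
+ // (105 - 100) / 4 + 1 == 2, recovering the spill number, and
+ // locIDToSpillIdx(105) computes (105 - 100) % 4 == 1, which is looked up in
+ // StackIdxesToPos to recover the size/offset pair.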
+
+ unsigned getNumLocs(void) const { return LocIdxToIDNum.size(); }
+
+ /// Reset all locations to contain a PHI value at the designated block. Used
+ /// sometimes for actual PHI values, at other times to indicate the block
+ /// entry value (before any more information is known).
+ void setMPhis(unsigned NewCurBB) {
+ CurBB = NewCurBB;
+ for (auto Location : locations())
+ Location.Value = {CurBB, 0, Location.Idx};
+ }
+
+ /// Load values for each location from an array of ValueIDNums. Takes the
+ /// current block number in case we read a value from a hitherto untouched
+ /// register.
+ void loadFromArray(ValueIDNum *Locs, unsigned NewCurBB) {
+ CurBB = NewCurBB;
+ // Iterate over all tracked locations, and load each locations live-in
+ // value into our local index.
+ for (auto Location : locations())
+ Location.Value = Locs[Location.Idx.asU64()];
+ }
+
+ /// Wipe any unnecessary location records after traversing a block.
+ void reset(void) {
+ // We could reset all the location values too; however either loadFromArray
+ // or setMPhis should be called before this object is re-used. Just
+ // clear Masks, they're definitely not needed.
+ Masks.clear();
+ }
+
+ /// Clear all data. Destroys the LocID <=> LocIdx map, which makes most of
+ /// the information in this pass uninterpretable.
+ void clear(void) {
+ reset();
+ LocIDToLocIdx.clear();
+ LocIdxToLocID.clear();
+ LocIdxToIDNum.clear();
+ // SpillLocs.reset(); XXX UniqueVector::reset assumes a SpillLoc casts from
+ // 0
+ SpillLocs = decltype(SpillLocs)();
+ StackSlotIdxes.clear();
+ StackIdxesToPos.clear();
+
+ LocIDToLocIdx.resize(NumRegs, LocIdx::MakeIllegalLoc());
+ }
+
+ /// Set a location to a certain value.
+ void setMLoc(LocIdx L, ValueIDNum Num) {
+ assert(L.asU64() < LocIdxToIDNum.size());
+ LocIdxToIDNum[L] = Num;
+ }
+
+ /// Read the value of a particular location.
+ ValueIDNum readMLoc(LocIdx L) {
+ assert(L.asU64() < LocIdxToIDNum.size());
+ return LocIdxToIDNum[L];
+ }
+
+ /// Create a LocIdx for an untracked register ID. Initialize it to either an
+ /// mphi value representing a live-in, or a recent register mask clobber.
+ LocIdx trackRegister(unsigned ID);
+
+ LocIdx lookupOrTrackRegister(unsigned ID) {
+ LocIdx &Index = LocIDToLocIdx[ID];
+ if (Index.isIllegal())
+ Index = trackRegister(ID);
+ return Index;
+ }
+
+ /// Is register R currently tracked by MLocTracker?
+ bool isRegisterTracked(Register R) {
+ LocIdx &Index = LocIDToLocIdx[R];
+ return !Index.isIllegal();
+ }
+
+ /// Record a definition of the specified register at the given block / inst.
+ /// This doesn't take a ValueIDNum, because the definition and its location
+ /// are synonymous.
+ void defReg(Register R, unsigned BB, unsigned Inst) {
+ unsigned ID = getLocID(R);
+ LocIdx Idx = lookupOrTrackRegister(ID);
+ ValueIDNum ValueID = {BB, Inst, Idx};
+ LocIdxToIDNum[Idx] = ValueID;
+ }
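+ // For example (hypothetical numbers): defReg(R, /*BB=*/4, /*Inst=*/7) makes
+ // R's location hold the value {4, 7, Idx}, i.e. "whatever instruction 7 of
+ // block 4 wrote here" -- the def's position and location identify the value.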
+
+ /// Set a register to a value number. To be used if the value number is
+ /// known in advance.
+ void setReg(Register R, ValueIDNum ValueID) {
+ unsigned ID = getLocID(R);
+ LocIdx Idx = lookupOrTrackRegister(ID);
+ LocIdxToIDNum[Idx] = ValueID;
+ }
+
+ ValueIDNum readReg(Register R) {
+ unsigned ID = getLocID(R);
+ LocIdx Idx = lookupOrTrackRegister(ID);
+ return LocIdxToIDNum[Idx];
+ }
+
+ /// Reset a register value to zero / empty. Needed to replicate the
+ /// VarLoc implementation where a copy to/from a register effectively
+ /// clears the contents of the source register. (Values can only have one
+ /// machine location in VarLocBasedImpl).
+ void wipeRegister(Register R) {
+ unsigned ID = getLocID(R);
+ LocIdx Idx = LocIDToLocIdx[ID];
+ LocIdxToIDNum[Idx] = ValueIDNum::EmptyValue;
+ }
+
+ /// Determine the LocIdx of an existing register.
+ LocIdx getRegMLoc(Register R) {
+ unsigned ID = getLocID(R);
+ assert(ID < LocIDToLocIdx.size());
+ assert(LocIDToLocIdx[ID] != UINT_MAX); // Sentinel for IndexedMap.
+ return LocIDToLocIdx[ID];
+ }
+
+ /// Record a RegMask operand being executed. Defs any register we currently
+ /// track, and stores a pointer to the mask in case we have to account for it
+ /// later.
+ void writeRegMask(const MachineOperand *MO, unsigned CurBB, unsigned InstID);
+
+ /// Find the spill number for SpillLoc \p L, creating a new one if it's not
+ /// tracked.
+ SpillLocationNo getOrTrackSpillLoc(SpillLoc L);
+
+ /// Get LocIdx of a spill ID.
+ LocIdx getSpillMLoc(unsigned SpillID) {
+ assert(LocIDToLocIdx[SpillID] != UINT_MAX); // Sentinel for IndexedMap.
+ return LocIDToLocIdx[SpillID];
+ }
+
+ /// Return true if Idx is a spill machine location.
+ bool isSpill(LocIdx Idx) const { return LocIdxToLocID[Idx] >= NumRegs; }
+
+ MLocIterator begin() { return MLocIterator(LocIdxToIDNum, 0); }
+
+ MLocIterator end() {
+ return MLocIterator(LocIdxToIDNum, LocIdxToIDNum.size());
+ }
+
+ /// Return a range over all locations currently tracked.
+ iterator_range<MLocIterator> locations() {
+ return llvm::make_range(begin(), end());
+ }
+
+ std::string LocIdxToName(LocIdx Idx) const;
+
+ std::string IDAsString(const ValueIDNum &Num) const;
+
+#ifndef NDEBUG
+ LLVM_DUMP_METHOD void dump();
+
+ LLVM_DUMP_METHOD void dump_mloc_map();
+#endif
+
+ /// Create a DBG_VALUE based on machine location \p MLoc. Qualify it with the
+ /// information in \p Properties, for variable \p Var. Don't insert it anywhere,
+ /// just return the builder for it.
+ MachineInstrBuilder emitLoc(Optional<LocIdx> MLoc, const DebugVariable &Var,
+ const DbgValueProperties &Properties);
+};
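+// A rough sketch of how the tracker above is driven while stepping through a
+// block (illustrative only; the real driver is InstrRefBasedLDV below):
+//   MTracker->setMPhis(BBNum);              // live-ins become block PHIs
+//   for each instruction, for each def of register R:
+//     MTracker->defReg(R, BBNum, InstNum);  // record the newly defined value
+//   ValueIDNum V = MTracker->readReg(R);    // later: what does R hold now?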
+
+/// Collection of DBG_VALUEs observed when traversing a block. Records each
+/// variable and the value the DBG_VALUE refers to. Requires the machine value
+/// location dataflow algorithm to have run already, so that values can be
+/// identified.
+class VLocTracker {
+public:
+ /// Map DebugVariable to the latest Value it's defined to have.
+ /// Needs to be a MapVector because we determine order-in-the-input-MIR from
+ /// the order in this container.
+ /// We only retain the last DbgValue in each block for each variable, to
+ /// determine the block's live-out variable value. The Vars container forms the
+ /// transfer function for this block, as part of the dataflow analysis. The
+ /// movement of values between locations inside of a block is handled at a
+ /// much later stage, in the TransferTracker class.
+ MapVector<DebugVariable, DbgValue> Vars;
+ DenseMap<DebugVariable, const DILocation *> Scopes;
+ MachineBasicBlock *MBB = nullptr;
+
+public:
+ VLocTracker() {}
+
+ void defVar(const MachineInstr &MI, const DbgValueProperties &Properties,
+ Optional<ValueIDNum> ID) {
+ assert(MI.isDebugValue() || MI.isDebugRef());
+ DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(),
+ MI.getDebugLoc()->getInlinedAt());
+ DbgValue Rec = (ID) ? DbgValue(*ID, Properties, DbgValue::Def)
+ : DbgValue(Properties, DbgValue::Undef);
+
+ // Attempt insertion; overwrite if it's already mapped.
+ auto Result = Vars.insert(std::make_pair(Var, Rec));
+ if (!Result.second)
+ Result.first->second = Rec;
+ Scopes[Var] = MI.getDebugLoc().get();
+ }
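+ // Illustrative example: if a block contains two DBG_VALUEs for the same
+ // variable "x", first reading value V1 and then V2, the second defVar call
+ // overwrites the first, so Vars maps "x" only to V2 -- exactly the
+ // last-assignment-per-block transfer function described above.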
+
+ void defVar(const MachineInstr &MI, const MachineOperand &MO) {
+ // Only DBG_VALUEs can define constant-valued variables.
+ assert(MI.isDebugValue());
+ DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(),
+ MI.getDebugLoc()->getInlinedAt());
+ DbgValueProperties Properties(MI);
+ DbgValue Rec = DbgValue(MO, Properties, DbgValue::Const);
+
+ // Attempt insertion; overwrite if it's already mapped.
+ auto Result = Vars.insert(std::make_pair(Var, Rec));
+ if (!Result.second)
+ Result.first->second = Rec;
+ Scopes[Var] = MI.getDebugLoc().get();
+ }
+};
+
+/// Types for recording sets of variable fragments that overlap. For a given
+/// local variable, we record all other fragments of that variable that could
+/// overlap it, to reduce search time.
+using FragmentOfVar =
+ std::pair<const DILocalVariable *, DIExpression::FragmentInfo>;
+using OverlapMap =
+ DenseMap<FragmentOfVar, SmallVector<DIExpression::FragmentInfo, 1>>;
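+// Illustrative example (hypothetical fragments): if variable "x" has been seen
+// with fragment A covering bits [0, 32) and fragment B covering bits [16, 48),
+// the two overlap, so OverlapMap[{x, A}] contains B and OverlapMap[{x, B}]
+// contains A; an assignment to one fragment can then quickly invalidate the
+// other.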
+
+/// The instruction-referencing implementation of LiveDebugValues. Solves the
+/// machine value location and variable value dataflow problems (see
+/// buildMLocValueMap and buildVLocValueMap below), then emits the computed
+/// variable locations as DBG_VALUEs.
+class InstrRefBasedLDV : public LDVImpl {
+public:
+ friend class ::InstrRefLDVTest;
+
+ using FragmentInfo = DIExpression::FragmentInfo;
+ using OptFragmentInfo = Optional<DIExpression::FragmentInfo>;
+
+ // Helper while building OverlapMap, a map of all fragments seen for a given
+ // DILocalVariable.
+ using VarToFragments =
+ DenseMap<const DILocalVariable *, SmallSet<FragmentInfo, 4>>;
+
+ /// Machine location/value transfer function, a mapping of which locations
+ /// are assigned which new values.
+ using MLocTransferMap = SmallDenseMap<LocIdx, ValueIDNum>;
+
+ /// Live in/out structure for the variable values: a per-block map of
+ /// variables to their values.
+ using LiveIdxT = DenseMap<const MachineBasicBlock *, DbgValue *>;
+
+ using VarAndLoc = std::pair<DebugVariable, DbgValue>;
+
+ /// Type for a live-in value: the predecessor block, and its value.
+ using InValueT = std::pair<MachineBasicBlock *, DbgValue *>;
+
+ /// Vector (per block) of a collection (inner smallvector) of live-ins.
+ /// Used as the result type for the variable value dataflow problem.
+ using LiveInsT = SmallVector<SmallVector<VarAndLoc, 8>, 8>;
+
+private:
+ MachineDominatorTree *DomTree;
+ const TargetRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+ const TargetInstrInfo *TII;
+ const TargetFrameLowering *TFI;
+ const MachineFrameInfo *MFI;
+ BitVector CalleeSavedRegs;
+ LexicalScopes LS;
+ TargetPassConfig *TPC;
+
+ /// An empty DIExpression. Used for default / placeholder DbgValueProperties
+ /// objects, as we can't have null expressions.
+ const DIExpression *EmptyExpr;
+
+ /// Object to track machine locations as we step through a block. Could
+ /// probably be a field rather than a pointer, as it's always used.
+ MLocTracker *MTracker = nullptr;
+
+ /// Number of the current block LiveDebugValues is stepping through.
+ unsigned CurBB;
+
+ /// Number of the current instruction LiveDebugValues is evaluating.
+ unsigned CurInst;
+
+ /// Variable tracker -- listens to DBG_VALUEs occurring as InstrRefBasedImpl
+ /// steps through a block. Reads the values at each location from the
+ /// MLocTracker object.
+ VLocTracker *VTracker = nullptr;
+
+ /// Tracker for transfers, listens to DBG_VALUEs and transfers of values
+ /// between locations during stepping, creates new DBG_VALUEs when values move
+ /// location.
+ TransferTracker *TTracker = nullptr;
+
+ /// Blocks which are artificial, i.e. blocks which exclusively contain
+ /// instructions without DebugLocs, or with line 0 locations.
+ SmallPtrSet<const MachineBasicBlock *, 16> ArtificialBlocks;
+
+ // Mapping of blocks to and from their RPOT order.
+ DenseMap<unsigned int, MachineBasicBlock *> OrderToBB;
+ DenseMap<const MachineBasicBlock *, unsigned int> BBToOrder;
+ DenseMap<unsigned, unsigned> BBNumToRPO;
+
+ /// Pair of MachineInstr, and its 1-based offset into the containing block.
+ using InstAndNum = std::pair<const MachineInstr *, unsigned>;
+ /// Map from debug instruction number to the MachineInstr labelled with that
+ /// number, and its location within the function. Used to transform
+ /// instruction numbers in DBG_INSTR_REFs into machine value numbers.
+ std::map<uint64_t, InstAndNum> DebugInstrNumToInstr;
+
+ /// Record of where we observed a DBG_PHI instruction.
+ class DebugPHIRecord {
+ public:
+ uint64_t InstrNum; ///< Instruction number of this DBG_PHI.
+ MachineBasicBlock *MBB; ///< Block where DBG_PHI occurred.
+ ValueIDNum ValueRead; ///< The value number read by the DBG_PHI.
+ LocIdx ReadLoc; ///< Register/Stack location the DBG_PHI reads.
+
+ operator unsigned() const { return InstrNum; }
+ };
+
+ /// Map from instruction numbers defined by DBG_PHIs to a record of what that
+ /// DBG_PHI read and where. Populated and edited during the machine value
+ /// location problem -- we use LLVM's SSA Updater to fix up changes made by
+ /// optimizations that destroy PHI instructions.
+ SmallVector<DebugPHIRecord, 32> DebugPHINumToValue;
+
+ // Map of overlapping variable fragments.
+ OverlapMap OverlapFragments;
+ VarToFragments SeenFragments;
+
+ /// Tests whether this instruction is a spill to a stack slot.
+ bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF);
+
+ /// Decide if \p MI is a spill instruction and return true if it is. We use 2
+ /// criteria to make this decision:
+ /// - Is this instruction a store to a spill slot?
+ /// - Is there a register operand that is both used and killed?
+ /// TODO: Store optimization can fold spills into other stores (including
+ /// other spills). We do not handle this yet (more than one memory operand).
+ bool isLocationSpill(const MachineInstr &MI, MachineFunction *MF,
+ unsigned &Reg);
+
+ /// If a given instruction is identified as a restore from a stack slot,
+ /// return the spill location and set \p Reg to the restored register.
+ Optional<SpillLocationNo> isRestoreInstruction(const MachineInstr &MI,
+ MachineFunction *MF, unsigned &Reg);
+
+ /// Given a spill instruction, extract the spill slot information, ensure it's
+ /// tracked, and return the spill number.
+ SpillLocationNo extractSpillBaseRegAndOffset(const MachineInstr &MI);
+
+ /// Observe a single instruction while stepping through a block.
+ void process(MachineInstr &MI, ValueIDNum **MLiveOuts = nullptr,
+ ValueIDNum **MLiveIns = nullptr);
+
+ /// Examines whether \p MI is a DBG_VALUE and notifies trackers.
+ /// \returns true if MI was recognized and processed.
+ bool transferDebugValue(const MachineInstr &MI);
+
+ /// Examines whether \p MI is a DBG_INSTR_REF and notifies trackers.
+ /// \returns true if MI was recognized and processed.
+ bool transferDebugInstrRef(MachineInstr &MI, ValueIDNum **MLiveOuts,
+ ValueIDNum **MLiveIns);
+
+ /// Stores value-information about where this PHI occurred, and what
+ /// instruction number is associated with it.
+ /// \returns true if MI was recognized and processed.
+ bool transferDebugPHI(MachineInstr &MI);
+
+ /// Examines whether \p MI is a copy instruction, and notifies trackers.
+ /// \returns true if MI was recognized and processed.
+ bool transferRegisterCopy(MachineInstr &MI);
+
+ /// Examines whether \p MI is a stack spill or restore instruction, and
+ /// notifies trackers. \returns true if MI was recognized and processed.
+ bool transferSpillOrRestoreInst(MachineInstr &MI);
+
+ /// Examines \p MI for any registers that it defines, and notifies trackers.
+ void transferRegisterDef(MachineInstr &MI);
+
+ /// Copy one location to the other, accounting for movement of subregisters
+ /// too.
+ void performCopy(Register Src, Register Dst);
+
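+ /// Accumulate the variable fragment described by \p MI into the
+ /// OverlapFragments and SeenFragments maps, recording which other fragments
+ /// of the same variable it overlaps.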
+ void accumulateFragmentMap(MachineInstr &MI);
+
+ /// Determine the machine value number referred to by (potentially several)
+ /// DBG_PHI instructions. Block duplication and tail folding can duplicate
+ /// DBG_PHIs, shifting the position where values in registers merge, and
+ /// forming another mini-ssa problem to solve.
+ /// \p Here the position of a DBG_INSTR_REF seeking a machine value number
+ /// \p InstrNum Debug instruction number defined by DBG_PHI instructions.
+ /// \returns The machine value number at position Here, or None.
+ Optional<ValueIDNum> resolveDbgPHIs(MachineFunction &MF,
+ ValueIDNum **MLiveOuts,
+ ValueIDNum **MLiveIns, MachineInstr &Here,
+ uint64_t InstrNum);
+
+ /// Step through the function, recording register definitions and movements
+ /// in an MLocTracker. Convert the observations into a per-block transfer
+ /// function in \p MLocTransfer, suitable for using with the machine value
+ /// location dataflow problem.
+ void
+ produceMLocTransferFunction(MachineFunction &MF,
+ SmallVectorImpl<MLocTransferMap> &MLocTransfer,
+ unsigned MaxNumBlocks);
+
+ /// Solve the machine value location dataflow problem. Takes as input the
+ /// transfer functions in \p MLocTransfer. Writes the output live-in and
+ /// live-out arrays to the (initialized to zero) multidimensional arrays in
+ /// \p MInLocs and \p MOutLocs. The outer dimension is indexed by block
+ /// number, the inner by LocIdx.
+ void buildMLocValueMap(MachineFunction &MF, ValueIDNum **MInLocs,
+ ValueIDNum **MOutLocs,
+ SmallVectorImpl<MLocTransferMap> &MLocTransfer);
+
+ /// Examine the stack indexes (i.e. offsets within the stack) to find the
+ /// basic units of interference -- like reg units, but for the stack.
+ void findStackIndexInterference(SmallVectorImpl<unsigned> &Slots);
+
+ /// Install PHI values into the live-in array for each block, according to
+ /// the IDF of each register.
+ void placeMLocPHIs(MachineFunction &MF,
+ SmallPtrSetImpl<MachineBasicBlock *> &AllBlocks,
+ ValueIDNum **MInLocs,
+ SmallVectorImpl<MLocTransferMap> &MLocTransfer);
+
+ /// Calculate the iterated-dominance-frontier for a set of defs, using the
+ /// existing LLVM facilities for this. Works for a single "value" or
+ /// machine/variable location.
+ /// \p AllBlocks Set of blocks where we might consume the value.
+ /// \p DefBlocks Set of blocks where the value/location is defined.
+ /// \p PHIBlocks Output set of blocks where PHIs must be placed.
+ void BlockPHIPlacement(const SmallPtrSetImpl<MachineBasicBlock *> &AllBlocks,
+ const SmallPtrSetImpl<MachineBasicBlock *> &DefBlocks,
+ SmallVectorImpl<MachineBasicBlock *> &PHIBlocks);
+
+ /// Perform a control flow join (lattice value meet) of the values in machine
+ /// locations at \p MBB. Follows the algorithm described in the file-comment,
+ /// reading live-outs of predecessors from \p OutLocs, the current live ins
+ /// from \p InLocs, and assigning the newly computed live ins back into
+ /// \p InLocs. \returns true if the newly computed live-in values differ from
+ /// the old ones, meaning the dataflow iteration has not yet converged for
+ /// this block.
+ bool mlocJoin(MachineBasicBlock &MBB,
+ SmallPtrSet<const MachineBasicBlock *, 16> &Visited,
+ ValueIDNum **OutLocs, ValueIDNum *InLocs);
+
+ /// Solve the variable value dataflow problem, for a single lexical scope.
+ /// Uses the algorithm from the file comment to resolve control flow joins
+ /// using PHI placement and value propagation. Reads the locations of machine
+ /// values from the \p MInLocs and \p MOutLocs arrays (see buildMLocValueMap)
+ /// and reads the variable values transfer function from \p AllTheVlocs.
+ /// Live-in and Live-out variable values are stored locally, with the live-ins
+ /// permanently stored to \p Output once a fixed point is reached.
+ /// \p VarsWeCareAbout contains a collection of the variables in \p Scope
+ /// that we should be tracking.
+ /// \p AssignBlocks contains the set of blocks that aren't in \p DILoc's
+ /// scope, but which do contain DBG_VALUEs, which VarLocBasedImpl tracks
+ /// locations through.
+ void buildVLocValueMap(const DILocation *DILoc,
+ const SmallSet<DebugVariable, 4> &VarsWeCareAbout,
+ SmallPtrSetImpl<MachineBasicBlock *> &AssignBlocks,
+ LiveInsT &Output, ValueIDNum **MOutLocs,
+ ValueIDNum **MInLocs,
+ SmallVectorImpl<VLocTracker> &AllTheVLocs);
+
+ /// Attempt to eliminate unnecessary PHIs on entry to a block. Examines the
+ /// live-in values coming from the predecessors' live-outs, and replaces any
+ /// PHIs already present in this block's live-ins with a live-through value
+ /// if the PHI isn't needed.
+ /// \p LiveIn Old live-in value, overwritten with new one if live-in changes.
+ /// \returns true if any live-ins change value, either from value propagation
+ /// or PHI elimination.
+ bool vlocJoin(MachineBasicBlock &MBB, LiveIdxT &VLOCOutLocs,
+ SmallPtrSet<const MachineBasicBlock *, 8> &InScopeBlocks,
+ SmallPtrSet<const MachineBasicBlock *, 8> &BlocksToExplore,
+ DbgValue &LiveIn);
+
+ /// For the given block and live-outs feeding into it, try to find a
+ /// machine location where all the variable values join together.
+ /// \returns Value ID of a machine PHI if an appropriate one is available.
+ Optional<ValueIDNum>
+ pickVPHILoc(const MachineBasicBlock &MBB, const DebugVariable &Var,
+ const LiveIdxT &LiveOuts, ValueIDNum **MOutLocs,
+ const SmallVectorImpl<const MachineBasicBlock *> &BlockOrders);
+
+ /// Given the solutions to the two dataflow problems, machine value locations
+ /// in \p MInLocs and live-in variable values in \p SavedLiveIns, runs the
+ /// TransferTracker class over the function to produce live-in and transfer
+ /// DBG_VALUEs, then inserts them. Groups of DBG_VALUEs are inserted in the
+ /// order given by \p AllVarsNumbering -- this could be any stable order, but
+ /// right now it is the order of appearance in the function, when explored in
+ /// RPO, so that we can compare explicitly against VarLocBasedImpl.
+ void emitLocations(MachineFunction &MF, LiveInsT SavedLiveIns,
+ ValueIDNum **MOutLocs, ValueIDNum **MInLocs,
+ DenseMap<DebugVariable, unsigned> &AllVarsNumbering,
+ const TargetPassConfig &TPC);
+
+ /// Boilerplate computation of some initial sets, artificial blocks and
+ /// RPOT block ordering.
+ void initialSetup(MachineFunction &MF);
+
+ bool ExtendRanges(MachineFunction &MF, MachineDominatorTree *DomTree,
+ TargetPassConfig *TPC, unsigned InputBBLimit,
+ unsigned InputDbgValLimit) override;
+
+public:
+ /// Default construct and initialize the pass.
+ InstrRefBasedLDV();
+
+ LLVM_DUMP_METHOD
+ void dump_mloc_transfer(const MLocTransferMap &mloc_transfer) const;
+
+ bool isCalleeSaved(LocIdx L) const;
+
+ bool hasFoldedStackStore(const MachineInstr &MI) {
+ // Instruction must have a memory operand that's a stack slot, and isn't
+ // aliased, meaning it's a spill from regalloc instead of a variable.
+ // If it's aliased, we can't guarantee its value.
+ if (!MI.hasOneMemOperand())
+ return false;
+ auto *MemOperand = *MI.memoperands_begin();
+ return MemOperand->isStore() &&
+ MemOperand->getPseudoValue() &&
+ MemOperand->getPseudoValue()->kind() == PseudoSourceValue::FixedStack
+ && !MemOperand->getPseudoValue()->isAliased(MFI);
+ }
+
+ Optional<LocIdx> findLocationForMemOperand(const MachineInstr &MI);
+};
+
+} // namespace LiveDebugValues
+
+namespace llvm {
+using namespace LiveDebugValues;
+
+template <> struct DenseMapInfo<LocIdx> {
+ static inline LocIdx getEmptyKey() { return LocIdx::MakeIllegalLoc(); }
+ static inline LocIdx getTombstoneKey() { return LocIdx::MakeTombstoneLoc(); }
+
+ static unsigned getHashValue(const LocIdx &Loc) { return Loc.asU64(); }
+
+ static bool isEqual(const LocIdx &A, const LocIdx &B) { return A == B; }
+};
+
+template <> struct DenseMapInfo<ValueIDNum> {
+ static inline ValueIDNum getEmptyKey() { return ValueIDNum::EmptyValue; }
+ static inline ValueIDNum getTombstoneKey() {
+ return ValueIDNum::TombstoneValue;
+ }
+
+ static unsigned getHashValue(const ValueIDNum &Val) { return Val.asU64(); }
+
+ static bool isEqual(const ValueIDNum &A, const ValueIDNum &B) {
+ return A == B;
+ }
+};
+
+} // end namespace llvm
+
+#endif /* LLVM_LIB_CODEGEN_LIVEDEBUGVALUES_INSTRREFBASEDLDV_H */
diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp
index 38e803d1abb5..691977dc34e6 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp
@@ -40,6 +40,19 @@ static cl::opt<bool>
"normal DBG_VALUE inputs"),
cl::init(false));
+// Options to prevent pathological compile-time behavior. If InputBBLimit and
+// InputDbgValueLimit are both exceeded, range extension is disabled.
+static cl::opt<unsigned> InputBBLimit(
+ "livedebugvalues-input-bb-limit",
+ cl::desc("Maximum input basic blocks before DBG_VALUE limit applies"),
+ cl::init(10000), cl::Hidden);
+static cl::opt<unsigned> InputDbgValueLimit(
+ "livedebugvalues-input-dbg-value-limit",
+ cl::desc(
+ "Maximum input DBG_VALUE insts supported by debug range extension"),
+ cl::init(50000), cl::Hidden);
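+// Both limits are ordinary (hidden) flags, so they can be adjusted when
+// investigating compile-time problems, e.g. (illustrative invocation only):
+//   llc -livedebugvalues-input-bb-limit=20000 \
+//       -livedebugvalues-input-dbg-value-limit=100000 foo.ll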
+
+namespace {
/// Generic LiveDebugValues pass. Calls through to VarLocBasedLDV or
/// InstrRefBasedLDV to perform location propagation, via the LDVImpl
/// base class.
@@ -48,10 +61,7 @@ public:
static char ID;
LiveDebugValues();
- ~LiveDebugValues() {
- if (TheImpl)
- delete TheImpl;
- }
+ ~LiveDebugValues() {}
/// Calculate the liveness information for the given machine function.
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -67,9 +77,12 @@ public:
}
private:
- LDVImpl *TheImpl;
+ std::unique_ptr<LDVImpl> InstrRefImpl;
+ std::unique_ptr<LDVImpl> VarLocImpl;
TargetPassConfig *TPC;
+ MachineDominatorTree MDT;
};
+} // namespace
char LiveDebugValues::ID = 0;
@@ -81,27 +94,26 @@ INITIALIZE_PASS(LiveDebugValues, DEBUG_TYPE, "Live DEBUG_VALUE analysis", false,
/// Default construct and initialize the pass.
LiveDebugValues::LiveDebugValues() : MachineFunctionPass(ID) {
initializeLiveDebugValuesPass(*PassRegistry::getPassRegistry());
- TheImpl = nullptr;
+ InstrRefImpl =
+ std::unique_ptr<LDVImpl>(llvm::makeInstrRefBasedLiveDebugValues());
+ VarLocImpl = std::unique_ptr<LDVImpl>(llvm::makeVarLocBasedLiveDebugValues());
}
bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) {
- if (!TheImpl) {
- TPC = getAnalysisIfAvailable<TargetPassConfig>();
-
- bool InstrRefBased = false;
- if (TPC) {
- auto &TM = TPC->getTM<TargetMachine>();
- InstrRefBased = TM.Options.ValueTrackingVariableLocations;
- }
-
- // Allow the user to force selection of InstrRef LDV.
- InstrRefBased |= ForceInstrRefLDV;
-
- if (InstrRefBased)
- TheImpl = llvm::makeInstrRefBasedLiveDebugValues();
- else
- TheImpl = llvm::makeVarLocBasedLiveDebugValues();
+ bool InstrRefBased = MF.useDebugInstrRef();
+ // Allow the user to force selection of InstrRef LDV.
+ InstrRefBased |= ForceInstrRefLDV;
+
+ TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ LDVImpl *TheImpl = &*VarLocImpl;
+
+ MachineDominatorTree *DomTree = nullptr;
+ if (InstrRefBased) {
+ DomTree = &MDT;
+ MDT.calculate(MF);
+ TheImpl = &*InstrRefImpl;
}
- return TheImpl->ExtendRanges(MF, TPC);
+ return TheImpl->ExtendRanges(MF, DomTree, TPC, InputBBLimit,
+ InputDbgValueLimit);
}
diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h
index 9c910f180b9f..a5936c8a96f0 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h
+++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h
@@ -9,6 +9,7 @@
#ifndef LLVM_LIB_CODEGEN_LIVEDEBUGVALUES_LIVEDEBUGVALUES_H
#define LLVM_LIB_CODEGEN_LIVEDEBUGVALUES_LIVEDEBUGVALUES_H
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -23,7 +24,9 @@ inline namespace SharedLiveDebugValues {
// implementation.
class LDVImpl {
public:
- virtual bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) = 0;
+ virtual bool ExtendRanges(MachineFunction &MF, MachineDominatorTree *DomTree,
+ TargetPassConfig *TPC, unsigned InputBBLimit,
+ unsigned InputDbgValLimit) = 0;
virtual ~LDVImpl() {}
};
diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
index 1e6d65c18953..a632d3d9ce76 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
@@ -155,6 +155,7 @@
#include <cassert>
#include <cstdint>
#include <functional>
+#include <map>
#include <queue>
#include <tuple>
#include <utility>
@@ -166,18 +167,6 @@ using namespace llvm;
STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted");
-// Options to prevent pathological compile-time behavior. If InputBBLimit and
-// InputDbgValueLimit are both exceeded, range extension is disabled.
-static cl::opt<unsigned> InputBBLimit(
- "livedebugvalues-input-bb-limit",
- cl::desc("Maximum input basic blocks before DBG_VALUE limit applies"),
- cl::init(10000), cl::Hidden);
-static cl::opt<unsigned> InputDbgValueLimit(
- "livedebugvalues-input-dbg-value-limit",
- cl::desc(
- "Maximum input DBG_VALUE insts supported by debug range extension"),
- cl::init(50000), cl::Hidden);
-
/// If \p Op is a stack or frame register return true, otherwise return false.
/// This is used to avoid basing the debug entry values on the registers, since
/// we do not support it at the moment.
@@ -296,6 +285,8 @@ private:
LexicalScopes LS;
VarLocSet::Allocator Alloc;
+ const MachineInstr *LastNonDbgMI;
+
enum struct TransferKind { TransferCopy, TransferSpill, TransferRestore };
using FragmentInfo = DIExpression::FragmentInfo;
@@ -555,7 +546,6 @@ private:
EVKind == EntryValueLocKind::EntryValueKind ? Orig.getReg()
: Register(Loc.RegNo),
false));
- MOs.back().setIsDebug();
break;
case MachineLocKind::SpillLocKind: {
// Spills are indirect DBG_VALUEs, with a base register and offset.
@@ -565,9 +555,10 @@ private:
unsigned Base = Loc.SpillLocation.SpillBase;
auto *TRI = MF.getSubtarget().getRegisterInfo();
if (MI.isNonListDebugValue()) {
- DIExpr =
- TRI->prependOffsetExpression(DIExpr, DIExpression::ApplyOffset,
- Loc.SpillLocation.SpillOffset);
+ auto Deref = Indirect ? DIExpression::DerefAfter : 0;
+ DIExpr = TRI->prependOffsetExpression(
+ DIExpr, DIExpression::ApplyOffset | Deref,
+ Loc.SpillLocation.SpillOffset);
Indirect = true;
} else {
SmallVector<uint64_t, 4> Ops;
@@ -576,7 +567,6 @@ private:
DIExpr = DIExpression::appendOpsToArg(DIExpr, Ops, I);
}
MOs.push_back(MachineOperand::CreateReg(Base, false));
- MOs.back().setIsDebug();
break;
}
case MachineLocKind::ImmediateKind: {
@@ -626,7 +616,7 @@ private:
unsigned getRegIdx(Register Reg) const {
for (unsigned Idx = 0; Idx < Locs.size(); ++Idx)
if (Locs[Idx].Kind == MachineLocKind::RegisterKind &&
- Locs[Idx].Value.RegNo == Reg)
+ Register{static_cast<unsigned>(Locs[Idx].Value.RegNo)} == Reg)
return Idx;
llvm_unreachable("Could not find given Reg in Locs");
}
@@ -635,7 +625,7 @@ private:
/// add each of them to \p Regs and return true.
bool getDescribingRegs(SmallVectorImpl<uint32_t> &Regs) const {
bool AnyRegs = false;
- for (auto Loc : Locs)
+ for (const auto &Loc : Locs)
if (Loc.Kind == MachineLocKind::RegisterKind) {
Regs.push_back(Loc.Value.RegNo);
AnyRegs = true;
@@ -801,6 +791,10 @@ private:
LocIndex LocationID; ///< Location number for the transfer dest.
};
using TransferMap = SmallVector<TransferDebugPair, 4>;
+ // Types for recording Entry Var Locations emitted by a single MachineInstr,
+ // as well as recording the MachineInstr which last defined a register.
+ using InstToEntryLocMap = std::multimap<const MachineInstr *, LocIndex>;
+ using RegDefToInstMap = DenseMap<Register, MachineInstr *>;
// Types for recording sets of variable fragments that overlap. For a given
// local variable, we record all other fragments of that variable that could
@@ -974,13 +968,22 @@ private:
Register NewReg = Register());
void transferDebugValue(const MachineInstr &MI, OpenRangesSet &OpenRanges,
- VarLocMap &VarLocIDs);
+ VarLocMap &VarLocIDs,
+ InstToEntryLocMap &EntryValTransfers,
+ RegDefToInstMap &RegSetInstrs);
void transferSpillOrRestoreInst(MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocMap &VarLocIDs, TransferMap &Transfers);
- bool removeEntryValue(const MachineInstr &MI, OpenRangesSet &OpenRanges,
- VarLocMap &VarLocIDs, const VarLoc &EntryVL);
+ void cleanupEntryValueTransfers(const MachineInstr *MI,
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs, const VarLoc &EntryVL,
+ InstToEntryLocMap &EntryValTransfers);
+ void removeEntryValue(const MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs, const VarLoc &EntryVL,
+ InstToEntryLocMap &EntryValTransfers,
+ RegDefToInstMap &RegSetInstrs);
void emitEntryValues(MachineInstr &MI, OpenRangesSet &OpenRanges,
- VarLocMap &VarLocIDs, TransferMap &Transfers,
+ VarLocMap &VarLocIDs,
+ InstToEntryLocMap &EntryValTransfers,
VarLocsInRange &KillSet);
void recordEntryValue(const MachineInstr &MI,
const DefinedRegsSet &DefinedRegs,
@@ -988,12 +991,16 @@ private:
void transferRegisterCopy(MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocMap &VarLocIDs, TransferMap &Transfers);
void transferRegisterDef(MachineInstr &MI, OpenRangesSet &OpenRanges,
- VarLocMap &VarLocIDs, TransferMap &Transfers);
+ VarLocMap &VarLocIDs,
+ InstToEntryLocMap &EntryValTransfers,
+ RegDefToInstMap &RegSetInstrs);
bool transferTerminator(MachineBasicBlock *MBB, OpenRangesSet &OpenRanges,
VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs);
void process(MachineInstr &MI, OpenRangesSet &OpenRanges,
- VarLocMap &VarLocIDs, TransferMap &Transfers);
+ VarLocMap &VarLocIDs, TransferMap &Transfers,
+ InstToEntryLocMap &EntryValTransfers,
+ RegDefToInstMap &RegSetInstrs);
void accumulateFragmentMap(MachineInstr &MI, VarToFragments &SeenFragments,
OverlapMap &OLapMap);
@@ -1007,7 +1014,9 @@ private:
/// had their instruction creation deferred.
void flushPendingLocs(VarLocInMBB &PendingInLocs, VarLocMap &VarLocIDs);
- bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) override;
+ bool ExtendRanges(MachineFunction &MF, MachineDominatorTree *DomTree,
+ TargetPassConfig *TPC, unsigned InputBBLimit,
+ unsigned InputDbgValLimit) override;
public:
/// Default construct and initialize the pass.
@@ -1225,62 +1234,100 @@ VarLocBasedLDV::extractSpillBaseRegAndOffset(const MachineInstr &MI) {
return {Reg, Offset};
}
+/// Clean up \p EntryValTransfers by removing the transfer that was created by
+/// \p TRInst and that uses the to-be-deleted \p EntryVL.
+void VarLocBasedLDV::cleanupEntryValueTransfers(
+ const MachineInstr *TRInst, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs,
+ const VarLoc &EntryVL, InstToEntryLocMap &EntryValTransfers) {
+ if (EntryValTransfers.empty() || TRInst == nullptr)
+ return;
+
+ auto TransRange = EntryValTransfers.equal_range(TRInst);
+ for (auto TDPair : llvm::make_range(TransRange.first, TransRange.second)) {
+ const VarLoc &EmittedEV = VarLocIDs[TDPair.second];
+ if (std::tie(EntryVL.Var, EntryVL.Locs[0].Value.RegNo, EntryVL.Expr) ==
+ std::tie(EmittedEV.Var, EmittedEV.Locs[0].Value.RegNo,
+ EmittedEV.Expr)) {
+ OpenRanges.erase(EmittedEV);
+ EntryValTransfers.erase(TRInst);
+ break;
+ }
+ }
+}
+
/// Try to salvage the debug entry value if we encounter a new debug value
/// describing the same parameter, otherwise stop tracking the value. Return
-/// true if we should stop tracking the entry value, otherwise return false.
-bool VarLocBasedLDV::removeEntryValue(const MachineInstr &MI,
- OpenRangesSet &OpenRanges,
- VarLocMap &VarLocIDs,
- const VarLoc &EntryVL) {
+/// true if we should stop tracking the entry value and do the cleanup of
+/// emitted Entry Value Transfers, otherwise return false.
+void VarLocBasedLDV::removeEntryValue(const MachineInstr &MI,
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs,
+ const VarLoc &EntryVL,
+ InstToEntryLocMap &EntryValTransfers,
+ RegDefToInstMap &RegSetInstrs) {
// Skip the DBG_VALUE which is the debug entry value itself.
- if (MI.isIdenticalTo(EntryVL.MI))
- return false;
+ if (&MI == &EntryVL.MI)
+ return;
// If the parameter's location is not register location, we can not track
- // the entry value any more. In addition, if the debug expression from the
- // DBG_VALUE is not empty, we can assume the parameter's value has changed
- // indicating that we should stop tracking its entry value as well.
- if (!MI.getDebugOperand(0).isReg() ||
- MI.getDebugExpression()->getNumElements() != 0)
- return true;
-
- // If the DBG_VALUE comes from a copy instruction that copies the entry value,
- // it means the parameter's value has not changed and we should be able to use
- // its entry value.
+ // the entry value any more. There is no TransferInst that defines the
+ // register, so no Entry Value Transfers have been emitted already.
+ if (!MI.getDebugOperand(0).isReg())
+ return;
+
+ // Try to get non-debug instruction responsible for the DBG_VALUE.
+ const MachineInstr *TransferInst = nullptr;
Register Reg = MI.getDebugOperand(0).getReg();
- auto I = std::next(MI.getReverseIterator());
- const MachineOperand *SrcRegOp, *DestRegOp;
- if (I != MI.getParent()->rend()) {
+ if (Reg.isValid() && RegSetInstrs.find(Reg) != RegSetInstrs.end())
+ TransferInst = RegSetInstrs.find(Reg)->second;
+
+ // Case of the parameter's DBG_VALUE at the start of entry MBB.
+ if (!TransferInst && !LastNonDbgMI && MI.getParent()->isEntryBlock())
+ return;
+ // If the debug expression from the DBG_VALUE is not empty, we can assume the
+ // parameter's value has changed indicating that we should stop tracking its
+ // entry value as well.
+ if (MI.getDebugExpression()->getNumElements() == 0 && TransferInst) {
+ // If the DBG_VALUE comes from a copy instruction that copies the entry
+ // value, it means the parameter's value has not changed and we should be
+ // able to use its entry value.
// TODO: Try to keep tracking of an entry value if we encounter a propagated
// DBG_VALUE describing the copy of the entry value. (Propagated entry value
// does not indicate the parameter modification.)
- auto DestSrc = TII->isCopyInstr(*I);
- if (!DestSrc)
- return true;
-
- SrcRegOp = DestSrc->Source;
- DestRegOp = DestSrc->Destination;
- if (Reg != DestRegOp->getReg())
- return true;
-
- for (uint64_t ID : OpenRanges.getEntryValueBackupVarLocs()) {
- const VarLoc &VL = VarLocIDs[LocIndex::fromRawInteger(ID)];
- if (VL.isEntryValueCopyBackupReg(Reg) &&
- // Entry Values should not be variadic.
- VL.MI.getDebugOperand(0).getReg() == SrcRegOp->getReg())
- return false;
+ auto DestSrc = TII->isCopyInstr(*TransferInst);
+ if (DestSrc) {
+ const MachineOperand *SrcRegOp, *DestRegOp;
+ SrcRegOp = DestSrc->Source;
+ DestRegOp = DestSrc->Destination;
+ if (Reg == DestRegOp->getReg()) {
+ for (uint64_t ID : OpenRanges.getEntryValueBackupVarLocs()) {
+ const VarLoc &VL = VarLocIDs[LocIndex::fromRawInteger(ID)];
+ if (VL.isEntryValueCopyBackupReg(Reg) &&
+ // Entry Values should not be variadic.
+ VL.MI.getDebugOperand(0).getReg() == SrcRegOp->getReg())
+ return;
+ }
+ }
}
}
- return true;
+ LLVM_DEBUG(dbgs() << "Deleting a DBG entry value because of: ";
+ MI.print(dbgs(), /*IsStandalone*/ false,
+ /*SkipOpers*/ false, /*SkipDebugLoc*/ false,
+ /*AddNewLine*/ true, TII));
+ cleanupEntryValueTransfers(TransferInst, OpenRanges, VarLocIDs, EntryVL,
+ EntryValTransfers);
+ OpenRanges.erase(EntryVL);
}
/// End all previous ranges related to @MI and start a new range from @MI
/// if it is a DBG_VALUE instr.
void VarLocBasedLDV::transferDebugValue(const MachineInstr &MI,
- OpenRangesSet &OpenRanges,
- VarLocMap &VarLocIDs) {
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs,
+ InstToEntryLocMap &EntryValTransfers,
+ RegDefToInstMap &RegSetInstrs) {
if (!MI.isDebugValue())
return;
const DILocalVariable *Var = MI.getDebugVariable();
@@ -1297,13 +1344,8 @@ void VarLocBasedLDV::transferDebugValue(const MachineInstr &MI,
auto EntryValBackupID = OpenRanges.getEntryValueBackup(V);
if (Var->isParameter() && EntryValBackupID) {
const VarLoc &EntryVL = VarLocIDs[EntryValBackupID->back()];
- if (removeEntryValue(MI, OpenRanges, VarLocIDs, EntryVL)) {
- LLVM_DEBUG(dbgs() << "Deleting a DBG entry value because of: ";
- MI.print(dbgs(), /*IsStandalone*/ false,
- /*SkipOpers*/ false, /*SkipDebugLoc*/ false,
- /*AddNewLine*/ true, TII));
- OpenRanges.erase(EntryVL);
- }
+ removeEntryValue(MI, OpenRanges, VarLocIDs, EntryVL, EntryValTransfers,
+ RegSetInstrs);
}
if (all_of(MI.debug_operands(), [](const MachineOperand &MO) {
@@ -1351,7 +1393,7 @@ void VarLocBasedLDV::collectAllVarLocs(SmallVectorImpl<VarLoc> &Collected,
void VarLocBasedLDV::emitEntryValues(MachineInstr &MI,
OpenRangesSet &OpenRanges,
VarLocMap &VarLocIDs,
- TransferMap &Transfers,
+ InstToEntryLocMap &EntryValTransfers,
VarLocsInRange &KillSet) {
// Do not insert entry value locations after a terminator.
if (MI.isTerminator())
@@ -1377,7 +1419,9 @@ void VarLocBasedLDV::emitEntryValues(MachineInstr &MI,
VarLoc EntryLoc = VarLoc::CreateEntryLoc(EntryVL.MI, LS, EntryVL.Expr,
EntryVL.Locs[0].Value.RegNo);
LocIndices EntryValueIDs = VarLocIDs.insert(EntryLoc);
- Transfers.push_back({&MI, EntryValueIDs.back()});
+ assert(EntryValueIDs.size() == 1 &&
+ "EntryValue loc should not be variadic");
+ EntryValTransfers.insert({&MI, EntryValueIDs.back()});
OpenRanges.insert(EntryValueIDs, EntryLoc);
}
}
@@ -1454,9 +1498,11 @@ void VarLocBasedLDV::insertTransferDebugPair(
}
/// A definition of a register may mark the end of a range.
-void VarLocBasedLDV::transferRegisterDef(
- MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs,
- TransferMap &Transfers) {
+void VarLocBasedLDV::transferRegisterDef(MachineInstr &MI,
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs,
+ InstToEntryLocMap &EntryValTransfers,
+ RegDefToInstMap &RegSetInstrs) {
// Meta Instructions do not affect the debug liveness of any register they
// define.
@@ -1479,6 +1525,8 @@ void VarLocBasedLDV::transferRegisterDef(
for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI)
// FIXME: Can we break out of this loop early if no insertion occurs?
DeadRegs.insert(*RAI);
+ RegSetInstrs.erase(MO.getReg());
+ RegSetInstrs.insert({MO.getReg(), &MI});
} else if (MO.isRegMask()) {
RegMasks.push_back(MO.getRegMask());
}
@@ -1505,6 +1553,10 @@ void VarLocBasedLDV::transferRegisterDef(
});
if (AnyRegMaskKillsReg)
DeadRegs.insert(Reg);
+ if (AnyRegMaskKillsReg) {
+ RegSetInstrs.erase(Reg);
+ RegSetInstrs.insert({Reg, &MI});
+ }
}
}
@@ -1518,7 +1570,7 @@ void VarLocBasedLDV::transferRegisterDef(
if (TPC) {
auto &TM = TPC->getTM<TargetMachine>();
if (TM.Options.ShouldEmitDebugEntryValues())
- emitEntryValues(MI, OpenRanges, VarLocIDs, Transfers, KillSet);
+ emitEntryValues(MI, OpenRanges, VarLocIDs, EntryValTransfers, KillSet);
}
}
@@ -1851,9 +1903,15 @@ void VarLocBasedLDV::accumulateFragmentMap(MachineInstr &MI,
/// This routine creates OpenRanges.
void VarLocBasedLDV::process(MachineInstr &MI, OpenRangesSet &OpenRanges,
- VarLocMap &VarLocIDs, TransferMap &Transfers) {
- transferDebugValue(MI, OpenRanges, VarLocIDs);
- transferRegisterDef(MI, OpenRanges, VarLocIDs, Transfers);
+ VarLocMap &VarLocIDs, TransferMap &Transfers,
+ InstToEntryLocMap &EntryValTransfers,
+ RegDefToInstMap &RegSetInstrs) {
+ if (!MI.isDebugInstr())
+ LastNonDbgMI = &MI;
+ transferDebugValue(MI, OpenRanges, VarLocIDs, EntryValTransfers,
+ RegSetInstrs);
+ transferRegisterDef(MI, OpenRanges, VarLocIDs, EntryValTransfers,
+ RegSetInstrs);
transferRegisterCopy(MI, OpenRanges, VarLocIDs, Transfers);
transferSpillOrRestoreInst(MI, OpenRanges, VarLocIDs, Transfers);
}
@@ -2048,7 +2106,11 @@ void VarLocBasedLDV::recordEntryValue(const MachineInstr &MI,
/// Calculate the liveness information for the given machine function and
/// extend ranges across basic blocks.
-bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) {
+bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF,
+ MachineDominatorTree *DomTree,
+ TargetPassConfig *TPC, unsigned InputBBLimit,
+ unsigned InputDbgValLimit) {
+ (void)DomTree;
LLVM_DEBUG(dbgs() << "\nDebug Range Extension\n");
if (!MF.getFunction().getSubprogram())
@@ -2079,6 +2141,10 @@ bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) {
VarLocInMBB InLocs; // Ranges that are incoming after joining.
TransferMap Transfers; // DBG_VALUEs associated with transfers (such as
// spills, copies and restores).
+ // Map responsible MI to attached Transfer emitted from Backup Entry Value.
+ InstToEntryLocMap EntryValTransfers;
+ // Map a Register to the last MI which clobbered it.
+ RegDefToInstMap RegSetInstrs;
VarToFragments SeenFragments;
@@ -2141,7 +2207,7 @@ bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) {
for (auto &MI : MBB)
if (MI.isDebugValue())
++NumInputDbgValues;
- if (NumInputDbgValues > InputDbgValueLimit) {
+ if (NumInputDbgValues > InputDbgValLimit) {
LLVM_DEBUG(dbgs() << "Disabling VarLocBasedLDV: " << MF.getName()
<< " has " << RPONumber << " basic blocks and "
<< NumInputDbgValues
@@ -2175,8 +2241,11 @@ bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) {
// operate with registers that correspond to user variables.
// First load any pending inlocs.
OpenRanges.insertFromLocSet(getVarLocsInMBB(MBB, InLocs), VarLocIDs);
+ LastNonDbgMI = nullptr;
+ RegSetInstrs.clear();
for (auto &MI : *MBB)
- process(MI, OpenRanges, VarLocIDs, Transfers);
+ process(MI, OpenRanges, VarLocIDs, Transfers, EntryValTransfers,
+ RegSetInstrs);
OLChanged |= transferTerminator(MBB, OpenRanges, OutLocs, VarLocIDs);
LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs,
@@ -2210,6 +2279,18 @@ bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) {
}
Transfers.clear();
+ // Add DBG_VALUEs created using Backup Entry Value location.
+ for (auto &TR : EntryValTransfers) {
+ MachineInstr *TRInst = const_cast<MachineInstr *>(TR.first);
+ assert(!TRInst->isTerminator() &&
+ "Cannot insert DBG_VALUE after terminator");
+ MachineBasicBlock *MBB = TRInst->getParent();
+ const VarLoc &VL = VarLocIDs[TR.second];
+ MachineInstr *MI = VL.BuildDbgValue(MF);
+ MBB->insertAfterBundle(TRInst->getIterator(), MI);
+ }
+ EntryValTransfers.clear();
+
// Deferred inlocs will not have had any DBG_VALUE insts created; do
// that now.
flushPendingLocs(InLocs, VarLocIDs);
diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp
index 54058a547928..dcd546f9c6db 100644
--- a/llvm/lib/CodeGen/LiveDebugVariables.cpp
+++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp
@@ -417,7 +417,7 @@ public:
void addDef(SlotIndex Idx, ArrayRef<MachineOperand> LocMOs, bool IsIndirect,
bool IsList, const DIExpression &Expr) {
SmallVector<unsigned> Locs;
- for (MachineOperand Op : LocMOs)
+ for (const MachineOperand &Op : LocMOs)
Locs.push_back(getLocationNo(Op));
DbgVariableValue DbgValue(Locs, IsIndirect, IsList, Expr);
// Add a singular (Idx,Idx) -> value mapping.
@@ -1294,13 +1294,9 @@ bool LDVImpl::runOnMachineFunction(MachineFunction &mf, bool InstrRef) {
static void removeDebugInstrs(MachineFunction &mf) {
for (MachineBasicBlock &MBB : mf) {
- for (auto MBBI = MBB.begin(), MBBE = MBB.end(); MBBI != MBBE; ) {
- if (!MBBI->isDebugInstr()) {
- ++MBBI;
- continue;
- }
- MBBI = MBB.erase(MBBI);
- }
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB))
+ if (MI.isDebugInstr())
+ MBB.erase(&MI);
}
}
@@ -1314,12 +1310,7 @@ bool LiveDebugVariables::runOnMachineFunction(MachineFunction &mf) {
// Have we been asked to track variable locations using instruction
// referencing?
- bool InstrRef = false;
- auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
- if (TPC) {
- auto &TM = TPC->getTM<TargetMachine>();
- InstrRef = TM.Options.ValueTrackingVariableLocations;
- }
+ bool InstrRef = mf.useDebugInstrRef();
if (!pImpl)
pImpl = new LDVImpl(this);
diff --git a/llvm/lib/CodeGen/LiveInterval.cpp b/llvm/lib/CodeGen/LiveInterval.cpp
index 1eed0ec5bbbe..9ded0fb6ae0a 100644
--- a/llvm/lib/CodeGen/LiveInterval.cpp
+++ b/llvm/lib/CodeGen/LiveInterval.cpp
@@ -592,21 +592,10 @@ void LiveRange::removeSegment(SlotIndex Start, SlotIndex End,
VNInfo *ValNo = I->valno;
if (I->start == Start) {
if (I->end == End) {
- if (RemoveDeadValNo) {
- // Check if val# is dead.
- bool isDead = true;
- for (const_iterator II = begin(), EE = end(); II != EE; ++II)
- if (II != I && II->valno == ValNo) {
- isDead = false;
- break;
- }
- if (isDead) {
- // Now that ValNo is dead, remove it.
- markValNoForDeletion(ValNo);
- }
- }
-
segments.erase(I); // Removed the whole Segment.
+
+ if (RemoveDeadValNo)
+ removeValNoIfDead(ValNo);
} else
I->start = End;
return;
@@ -627,13 +616,25 @@ void LiveRange::removeSegment(SlotIndex Start, SlotIndex End,
segments.insert(std::next(I), Segment(End, OldEnd, ValNo));
}
+LiveRange::iterator LiveRange::removeSegment(iterator I, bool RemoveDeadValNo) {
+ VNInfo *ValNo = I->valno;
+ I = segments.erase(I);
+ if (RemoveDeadValNo)
+ removeValNoIfDead(ValNo);
+ return I;
+}
+
+void LiveRange::removeValNoIfDead(VNInfo *ValNo) {
+ if (none_of(*this, [=](const Segment &S) { return S.valno == ValNo; }))
+ markValNoForDeletion(ValNo);
+}
+
/// removeValNo - Remove all the segments defined by the specified value#.
/// Also remove the value# from value# list.
void LiveRange::removeValNo(VNInfo *ValNo) {
if (empty()) return;
- segments.erase(remove_if(*this, [ValNo](const Segment &S) {
- return S.valno == ValNo;
- }), end());
+ llvm::erase_if(segments,
+ [ValNo](const Segment &S) { return S.valno == ValNo; });
// Now that ValNo is dead, remove it.
markValNoForDeletion(ValNo);
}
@@ -1019,7 +1020,7 @@ void LiveRange::print(raw_ostream &OS) const {
// Print value number info.
if (getNumValNums()) {
- OS << " ";
+ OS << ' ';
unsigned vnum = 0;
for (const_vni_iterator i = vni_begin(), e = vni_end(); i != e;
++i, ++vnum) {
@@ -1038,8 +1039,8 @@ void LiveRange::print(raw_ostream &OS) const {
}
void LiveInterval::SubRange::print(raw_ostream &OS) const {
- OS << " L" << PrintLaneMask(LaneMask) << ' '
- << static_cast<const LiveRange&>(*this);
+ OS << " L" << PrintLaneMask(LaneMask) << ' '
+ << static_cast<const LiveRange &>(*this);
}
void LiveInterval::print(raw_ostream &OS) const {
@@ -1048,7 +1049,7 @@ void LiveInterval::print(raw_ostream &OS) const {
// Print subranges
for (const SubRange &SR : subranges())
OS << SR;
- OS << " weight:" << Weight;
+ OS << " weight:" << Weight;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/CodeGen/LiveIntervalUnion.cpp b/llvm/lib/CodeGen/LiveIntervalUnion.cpp
index dfa523d4bf41..50b31e1eb247 100644
--- a/llvm/lib/CodeGen/LiveIntervalUnion.cpp
+++ b/llvm/lib/CodeGen/LiveIntervalUnion.cpp
@@ -112,7 +112,7 @@ LiveInterval *LiveIntervalUnion::getOneVReg() const {
// Scan the vector of interfering virtual registers in this union. Assume it's
// quite small.
bool LiveIntervalUnion::Query::isSeenInterference(LiveInterval *VirtReg) const {
- return is_contained(*InterferingVRegs, VirtReg);
+ return is_contained(InterferingVRegs, VirtReg);
}
// Collect virtual registers in this union that interfere with this
@@ -124,14 +124,11 @@ bool LiveIntervalUnion::Query::isSeenInterference(LiveInterval *VirtReg) const {
// 2. SeenAllInterferences == true: InterferingVRegs complete, iterators unused.
// 3. Iterators left at the last seen intersection.
//
-unsigned LiveIntervalUnion::Query::
-collectInterferingVRegs(unsigned MaxInterferingRegs) {
- if (!InterferingVRegs)
- InterferingVRegs.emplace();
-
+unsigned
+LiveIntervalUnion::Query::collectInterferingVRegs(unsigned MaxInterferingRegs) {
// Fast path return if we already have the desired information.
- if (SeenAllInterferences || InterferingVRegs->size() >= MaxInterferingRegs)
- return InterferingVRegs->size();
+ if (SeenAllInterferences || InterferingVRegs.size() >= MaxInterferingRegs)
+ return InterferingVRegs.size();
// Set up iterators on the first call.
if (!CheckedFirstInterference) {
@@ -160,14 +157,14 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) {
LiveInterval *VReg = LiveUnionI.value();
if (VReg != RecentReg && !isSeenInterference(VReg)) {
RecentReg = VReg;
- InterferingVRegs->push_back(VReg);
- if (InterferingVRegs->size() >= MaxInterferingRegs)
- return InterferingVRegs->size();
+ InterferingVRegs.push_back(VReg);
+ if (InterferingVRegs.size() >= MaxInterferingRegs)
+ return InterferingVRegs.size();
}
// This LiveUnion segment is no longer interesting.
if (!(++LiveUnionI).valid()) {
SeenAllInterferences = true;
- return InterferingVRegs->size();
+ return InterferingVRegs.size();
}
}
@@ -188,7 +185,7 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) {
LiveUnionI.advanceTo(LRI->start);
}
SeenAllInterferences = true;
- return InterferingVRegs->size();
+ return InterferingVRegs.size();
}
void LiveIntervalUnion::Array::init(LiveIntervalUnion::Allocator &Alloc,
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index 23036c2b115f..2f97386b6d18 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -1571,15 +1571,14 @@ void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin,
LaneBitmask LaneMask) {
LiveInterval::iterator LII = LR.find(EndIdx);
SlotIndex lastUseIdx;
- if (LII == LR.begin()) {
- // This happens when the function is called for a subregister that only
- // occurs _after_ the range that is to be repaired.
- return;
- }
- if (LII != LR.end() && LII->start < EndIdx)
+ if (LII != LR.end() && LII->start < EndIdx) {
lastUseIdx = LII->end;
- else
+ } else if (LII == LR.begin()) {
+ // We may not have a liverange at all if this is a subregister untouched
+ // between \p Begin and \p End.
+ } else {
--LII;
+ }
for (MachineBasicBlock::iterator I = End; I != Begin;) {
--I;
@@ -1593,10 +1592,7 @@ void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin,
// FIXME: This doesn't currently handle early-clobber or multiple removed
// defs inside of the region to repair.
- for (MachineInstr::mop_iterator OI = MI.operands_begin(),
- OE = MI.operands_end();
- OI != OE; ++OI) {
- const MachineOperand &MO = *OI;
+ for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || MO.getReg() != Reg)
continue;
@@ -1608,17 +1604,9 @@ void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin,
if (MO.isDef()) {
if (!isStartValid) {
if (LII->end.isDead()) {
- SlotIndex prevStart;
+ LII = LR.removeSegment(LII, true);
if (LII != LR.begin())
- prevStart = std::prev(LII)->start;
-
- // FIXME: This could be more efficient if there was a
- // removeSegment method that returned an iterator.
- LR.removeSegment(*LII, true);
- if (prevStart.isValid())
- LII = LR.find(prevStart);
- else
- LII = LR.begin();
+ --LII;
} else {
LII->start = instrIdx.getRegSlot();
LII->valno->def = instrIdx.getRegSlot();
@@ -1656,6 +1644,10 @@ void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin,
}
}
}
+
+ bool isStartValid = getInstructionFromIndex(LII->start);
+ if (!isStartValid && LII->end.isDead())
+ LR.removeSegment(*LII, true);
}
void
@@ -1678,22 +1670,33 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
Indexes->repairIndexesInRange(MBB, Begin, End);
+ // Make sure a live interval exists for all register operands in the range.
+ SmallVector<Register> RegsToRepair(OrigRegs.begin(), OrigRegs.end());
for (MachineBasicBlock::iterator I = End; I != Begin;) {
--I;
MachineInstr &MI = *I;
if (MI.isDebugOrPseudoInstr())
continue;
- for (MachineInstr::const_mop_iterator MOI = MI.operands_begin(),
- MOE = MI.operands_end();
- MOI != MOE; ++MOI) {
- if (MOI->isReg() && Register::isVirtualRegister(MOI->getReg()) &&
- !hasInterval(MOI->getReg())) {
- createAndComputeVirtRegInterval(MOI->getReg());
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.getReg().isVirtual()) {
+ Register Reg = MO.getReg();
+ // If the new instructions refer to subregs but the old instructions did
+ // not, throw away any old live interval so it will be recomputed with
+ // subranges.
+ if (MO.getSubReg() && hasInterval(Reg) &&
+ !getInterval(Reg).hasSubRanges() &&
+ MRI->shouldTrackSubRegLiveness(Reg))
+ removeInterval(Reg);
+ if (!hasInterval(Reg)) {
+ createAndComputeVirtRegInterval(Reg);
+ // Don't bother to repair a freshly calculated live interval.
+ erase_value(RegsToRepair, Reg);
+ }
}
}
}
- for (Register Reg : OrigRegs) {
+ for (Register Reg : RegsToRepair) {
if (!Reg.isVirtual())
continue;
@@ -1704,6 +1707,7 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
for (LiveInterval::SubRange &S : LI.subranges())
repairOldRegInRange(Begin, End, EndIdx, S, Reg, S.LaneMask);
+ LI.removeEmptySubRanges();
repairOldRegInRange(Begin, End, EndIdx, LI, Reg);
}
diff --git a/llvm/lib/CodeGen/LivePhysRegs.cpp b/llvm/lib/CodeGen/LivePhysRegs.cpp
index c0c7848139e4..d4848f16dcf2 100644
--- a/llvm/lib/CodeGen/LivePhysRegs.cpp
+++ b/llvm/lib/CodeGen/LivePhysRegs.cpp
@@ -81,22 +81,24 @@ void LivePhysRegs::stepForward(const MachineInstr &MI,
SmallVectorImpl<std::pair<MCPhysReg, const MachineOperand*>> &Clobbers) {
// Remove killed registers from the set.
for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
- if (O->isReg() && !O->isDebug()) {
+ if (O->isReg()) {
+ if (O->isDebug())
+ continue;
Register Reg = O->getReg();
- if (!Register::isPhysicalRegister(Reg))
+ if (!Reg.isPhysical())
continue;
if (O->isDef()) {
// Note, dead defs are still recorded. The caller should decide how to
// handle them.
Clobbers.push_back(std::make_pair(Reg, &*O));
} else {
- if (!O->isKill())
- continue;
assert(O->isUse());
- removeReg(Reg);
+ if (O->isKill())
+ removeReg(Reg);
}
- } else if (O->isRegMask())
+ } else if (O->isRegMask()) {
removeRegsInMask(*O, &Clobbers);
+ }
}
// Add defs to the set.
@@ -250,7 +252,7 @@ void llvm::computeLiveIns(LivePhysRegs &LiveRegs,
const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
LiveRegs.init(TRI);
LiveRegs.addLiveOutsNoPristines(MBB);
- for (const MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend()))
+ for (const MachineInstr &MI : llvm::reverse(MBB))
LiveRegs.stepBackward(MI);
}
@@ -287,7 +289,7 @@ void llvm::recomputeLivenessFlags(MachineBasicBlock &MBB) {
LiveRegs.init(TRI);
LiveRegs.addLiveOutsNoPristines(MBB);
- for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
+ for (MachineInstr &MI : llvm::reverse(MBB)) {
// Recompute dead flags.
for (MIBundleOperands MO(MI); MO.isValid(); ++MO) {
if (!MO->isReg() || !MO->isDef() || MO->isDebug())
@@ -296,7 +298,7 @@ void llvm::recomputeLivenessFlags(MachineBasicBlock &MBB) {
Register Reg = MO->getReg();
if (Reg == 0)
continue;
- assert(Register::isPhysicalRegister(Reg));
+ assert(Reg.isPhysical());
bool IsNotLive = LiveRegs.available(MRI, Reg);
@@ -325,7 +327,7 @@ void llvm::recomputeLivenessFlags(MachineBasicBlock &MBB) {
Register Reg = MO->getReg();
if (Reg == 0)
continue;
- assert(Register::isPhysicalRegister(Reg));
+ assert(Reg.isPhysical());
bool IsNotLive = LiveRegs.available(MRI, Reg);
MO->setIsKill(IsNotLive);
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 64a2dd275643..d91ff734ad8f 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -107,7 +107,7 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI,
SlotIndex OrigIdx,
SlotIndex UseIdx) const {
OrigIdx = OrigIdx.getRegSlot(true);
- UseIdx = UseIdx.getRegSlot(true);
+ UseIdx = std::max(UseIdx, UseIdx.getRegSlot(true));
for (unsigned i = 0, e = OrigMI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = OrigMI->getOperand(i);
if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
@@ -305,17 +305,18 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
isOrigDef = SlotIndex::isSameInstr(OrigVNI->def, Idx);
}
+ bool HasLiveVRegUses = false;
+
// Check for live intervals that may shrink
- for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
- MOE = MI->operands_end(); MOI != MOE; ++MOI) {
- if (!MOI->isReg())
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
continue;
- Register Reg = MOI->getReg();
+ Register Reg = MO.getReg();
if (!Register::isVirtualRegister(Reg)) {
// Check if MI reads any unreserved physregs.
- if (Reg && MOI->readsReg() && !MRI.isReserved(Reg))
+ if (Reg && MO.readsReg() && !MRI.isReserved(Reg))
ReadsPhysRegs = true;
- else if (MOI->isDef())
+ else if (MO.isDef())
LIS.removePhysRegDefAt(Reg.asMCReg(), Idx);
continue;
}
@@ -325,12 +326,14 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
// unlikely to change anything. We typically don't want to shrink the
// PIC base register that has lots of uses everywhere.
// Always shrink COPY uses that probably come from live range splitting.
- if ((MI->readsVirtualRegister(Reg) && (MI->isCopy() || MOI->isDef())) ||
- (MOI->readsReg() && (MRI.hasOneNonDBGUse(Reg) || useIsKill(LI, *MOI))))
+ if ((MI->readsVirtualRegister(Reg) && (MI->isCopy() || MO.isDef())) ||
+ (MO.readsReg() && (MRI.hasOneNonDBGUse(Reg) || useIsKill(LI, MO))))
ToShrink.insert(&LI);
+ else if (MO.readsReg())
+ HasLiveVRegUses = true;
// Remove defined value.
- if (MOI->isDef()) {
+ if (MO.isDef()) {
if (TheDelegate && LI.getVNInfoAt(Idx) != nullptr)
TheDelegate->LRE_WillShrinkVirtReg(LI.reg());
LIS.removeVRegDefAt(LI, Idx);
@@ -362,7 +365,11 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
// the inst for remat of other siblings. The inst is saved in
// LiveRangeEdit::DeadRemats and will be deleted after all the
// allocations of the func are done.
- if (isOrigDef && DeadRemats && TII.isTriviallyReMaterializable(*MI, AA)) {
+ // However, immediately delete instructions which have unshrunk virtual
+ // register uses. That may provoke RA to split an interval at the KILL
+ // and later result in an invalid live segment end.
+ if (isOrigDef && DeadRemats && !HasLiveVRegUses &&
+ TII.isTriviallyReMaterializable(*MI, AA)) {
LiveInterval &NewLI = createEmptyIntervalFrom(Dest, false);
VNInfo *VNI = NewLI.getNextValue(Idx, LIS.getVNInfoAllocator());
NewLI.addSegment(LiveInterval::Segment(Idx, Idx.getDeadSlot(), VNI));
@@ -405,8 +412,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
break;
// Shrink just one live interval. Then delete new dead defs.
- LiveInterval *LI = ToShrink.back();
- ToShrink.pop_back();
+ LiveInterval *LI = ToShrink.pop_back_val();
if (foldAsLoad(LI, Dead))
continue;
unsigned VReg = LI->reg();
diff --git a/llvm/lib/CodeGen/LiveVariables.cpp b/llvm/lib/CodeGen/LiveVariables.cpp
index 7181dbc9c870..51ba4b7e53eb 100644
--- a/llvm/lib/CodeGen/LiveVariables.cpp
+++ b/llvm/lib/CodeGen/LiveVariables.cpp
@@ -119,8 +119,7 @@ void LiveVariables::MarkVirtRegAliveInBlock(VarInfo &VRInfo,
MarkVirtRegAliveInBlock(VRInfo, DefBlock, MBB, WorkList);
while (!WorkList.empty()) {
- MachineBasicBlock *Pred = WorkList.back();
- WorkList.pop_back();
+ MachineBasicBlock *Pred = WorkList.pop_back_val();
MarkVirtRegAliveInBlock(VRInfo, DefBlock, Pred, WorkList);
}
}
@@ -484,8 +483,7 @@ void LiveVariables::HandlePhysRegDef(Register Reg, MachineInstr *MI,
void LiveVariables::UpdatePhysRegDefs(MachineInstr &MI,
SmallVectorImpl<unsigned> &Defs) {
while (!Defs.empty()) {
- Register Reg = Defs.back();
- Defs.pop_back();
+ Register Reg = Defs.pop_back_val();
for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs) {
unsigned SubReg = *SubRegs;
@@ -671,6 +669,86 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) {
return false;
}
+void LiveVariables::recomputeForSingleDefVirtReg(Register Reg) {
+ assert(Reg.isVirtual());
+
+ VarInfo &VI = getVarInfo(Reg);
+ VI.AliveBlocks.clear();
+ VI.Kills.clear();
+
+ MachineInstr &DefMI = *MRI->getUniqueVRegDef(Reg);
+ MachineBasicBlock &DefBB = *DefMI.getParent();
+
+ // Handle the case where all uses have been removed.
+ if (MRI->use_nodbg_empty(Reg)) {
+ VI.Kills.push_back(&DefMI);
+ DefMI.addRegisterDead(Reg, nullptr);
+ return;
+ }
+ DefMI.clearRegisterDeads(Reg);
+
+ // Initialize a worklist of BBs that Reg is live-to-end of. (Here
+ // "live-to-end" means Reg is live at the end of a block even if it is only
+ // live because of phi uses in a successor. This is different from isLiveOut()
+ // which does not consider phi uses.)
+ SmallVector<MachineBasicBlock *> LiveToEndBlocks;
+ SparseBitVector<> UseBlocks;
+ for (auto &UseMO : MRI->use_nodbg_operands(Reg)) {
+ UseMO.setIsKill(false);
+ MachineInstr &UseMI = *UseMO.getParent();
+ MachineBasicBlock &UseBB = *UseMI.getParent();
+ UseBlocks.set(UseBB.getNumber());
+ if (UseMI.isPHI()) {
+ // If Reg is used in a phi then it is live-to-end of the corresponding
+ // predecessor.
+ unsigned Idx = UseMI.getOperandNo(&UseMO);
+ LiveToEndBlocks.push_back(UseMI.getOperand(Idx + 1).getMBB());
+ } else if (&UseBB == &DefBB) {
+ // A non-phi use in the same BB as the single def must come after the def.
+ } else {
+ // Otherwise Reg must be live-to-end of all predecessors.
+ LiveToEndBlocks.append(UseBB.pred_begin(), UseBB.pred_end());
+ }
+ }
+
+ // Iterate over the worklist adding blocks to AliveBlocks.
+ bool LiveToEndOfDefBB = false;
+ while (!LiveToEndBlocks.empty()) {
+ MachineBasicBlock &BB = *LiveToEndBlocks.pop_back_val();
+ if (&BB == &DefBB) {
+ LiveToEndOfDefBB = true;
+ continue;
+ }
+ if (VI.AliveBlocks.test(BB.getNumber()))
+ continue;
+ VI.AliveBlocks.set(BB.getNumber());
+ LiveToEndBlocks.append(BB.pred_begin(), BB.pred_end());
+ }
+
+ // Recompute kill flags. For each block in which Reg is used but is not
+ // live-through, find the last instruction that uses Reg. Ignore phi nodes
+ // because they should not be included in Kills.
+ for (unsigned UseBBNum : UseBlocks) {
+ if (VI.AliveBlocks.test(UseBBNum))
+ continue;
+ MachineBasicBlock &UseBB = *MF->getBlockNumbered(UseBBNum);
+ if (&UseBB == &DefBB && LiveToEndOfDefBB)
+ continue;
+ for (auto &MI : reverse(UseBB)) {
+ if (MI.isDebugOrPseudoInstr())
+ continue;
+ if (MI.isPHI())
+ break;
+ if (MI.readsRegister(Reg)) {
+ assert(!MI.killsRegister(Reg));
+ MI.addRegisterKilled(Reg, nullptr);
+ VI.Kills.push_back(&MI);
+ break;
+ }
+ }
+ }
+}
+
/// replaceKillInstruction - Update register kill info by replacing a kill
/// instruction with a new one.
void LiveVariables::replaceKillInstruction(Register Reg, MachineInstr &OldMI,
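
The recomputeForSingleDefVirtReg addition above walks liveness backwards with a worklist: every block that feeds a use (or a phi's incoming edge) becomes live-to-end, and its predecessors are queued until the defining block is reached. Below is a minimal standalone sketch of that worklist propagation over a toy CFG, using only the C++ standard library; the block numbering and helper names are illustrative, not the LLVM API.

// Sketch only: computes the set of blocks where a value is live-to-end,
// given its single defining block and the blocks that feed its uses.
// The CFG is a toy adjacency list of predecessors, indexed by block number.
#include <set>
#include <vector>

using Block = int;

std::set<Block> computeLiveToEnd(const std::vector<std::vector<Block>> &Preds,
                                 Block DefBlock,
                                 const std::vector<Block> &UseFeeders) {
  std::set<Block> LiveToEnd;
  std::vector<Block> Worklist(UseFeeders.begin(), UseFeeders.end());
  while (!Worklist.empty()) {
    Block B = Worklist.back();
    Worklist.pop_back();
    if (B == DefBlock)
      continue;                       // liveness stops at the single def
    if (!LiveToEnd.insert(B).second)
      continue;                       // already visited
    for (Block P : Preds[B])          // keep walking towards the def
      Worklist.push_back(P);
  }
  return LiveToEnd;
}

The real pass additionally records kill flags per use block and treats phi uses and the def block specially; the sketch only shows the core propagation.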
diff --git a/llvm/lib/CodeGen/LoopTraversal.cpp b/llvm/lib/CodeGen/LoopTraversal.cpp
index 9490dfc40a82..0d400253c652 100644
--- a/llvm/lib/CodeGen/LoopTraversal.cpp
+++ b/llvm/lib/CodeGen/LoopTraversal.cpp
@@ -39,8 +39,7 @@ LoopTraversal::TraversalOrder LoopTraversal::traverse(MachineFunction &MF) {
bool Primary = true;
Workqueue.push_back(MBB);
while (!Workqueue.empty()) {
- MachineBasicBlock *ActiveMBB = &*Workqueue.back();
- Workqueue.pop_back();
+ MachineBasicBlock *ActiveMBB = Workqueue.pop_back_val();
bool Done = isBlockDone(ActiveMBB);
MBBTraversalOrder.push_back(TraversedMBBInfo(ActiveMBB, Primary, Done));
for (MachineBasicBlock *Succ : ActiveMBB->successors()) {
diff --git a/llvm/lib/CodeGen/LowLevelType.cpp b/llvm/lib/CodeGen/LowLevelType.cpp
index 62e9c6b629d3..dce64ab9f5ca 100644
--- a/llvm/lib/CodeGen/LowLevelType.cpp
+++ b/llvm/lib/CodeGen/LowLevelType.cpp
@@ -52,6 +52,16 @@ MVT llvm::getMVTForLLT(LLT Ty) {
Ty.getNumElements());
}
+EVT llvm::getApproximateEVTForLLT(LLT Ty, const DataLayout &DL,
+ LLVMContext &Ctx) {
+ if (Ty.isVector()) {
+ EVT EltVT = getApproximateEVTForLLT(Ty.getElementType(), DL, Ctx);
+ return EVT::getVectorVT(Ctx, EltVT, Ty.getElementCount());
+ }
+
+ return EVT::getIntegerVT(Ctx, Ty.getSizeInBits());
+}
+
LLT llvm::getLLTForMVT(MVT Ty) {
if (!Ty.isVector())
return LLT::scalar(Ty.getSizeInBits());
diff --git a/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp b/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp
index 8ef6aca602a1..3ec8c627f131 100644
--- a/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp
+++ b/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp
@@ -38,10 +38,6 @@
using namespace llvm;
-namespace llvm {
-extern char &MIRCanonicalizerID;
-} // namespace llvm
-
#define DEBUG_TYPE "mir-canonicalizer"
static cl::opt<unsigned>
@@ -332,8 +328,8 @@ static bool propagateLocalCopies(MachineBasicBlock *MBB) {
continue;
std::vector<MachineOperand *> Uses;
- for (auto UI = MRI.use_begin(Dst); UI != MRI.use_end(); ++UI)
- Uses.push_back(&*UI);
+ for (MachineOperand &MO : MRI.use_operands(Dst))
+ Uses.push_back(&MO);
for (auto *MO : Uses)
MO->setReg(Src);
diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp
index 87fde7d39a60..0ca820f160aa 100644
--- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp
@@ -261,6 +261,8 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) {
.Case("liveout", MIToken::kw_liveout)
.Case("address-taken", MIToken::kw_address_taken)
.Case("landing-pad", MIToken::kw_landing_pad)
+ .Case("inlineasm-br-indirect-target",
+ MIToken::kw_inlineasm_br_indirect_target)
.Case("ehfunclet-entry", MIToken::kw_ehfunclet_entry)
.Case("liveins", MIToken::kw_liveins)
.Case("successors", MIToken::kw_successors)
diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h
index 68425b41c3fb..70d17f819ce3 100644
--- a/llvm/lib/CodeGen/MIRParser/MILexer.h
+++ b/llvm/lib/CodeGen/MIRParser/MILexer.h
@@ -116,6 +116,7 @@ struct MIToken {
kw_liveout,
kw_address_taken,
kw_landing_pad,
+ kw_inlineasm_br_indirect_target,
kw_ehfunclet_entry,
kw_liveins,
kw_successors,
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index 34e1f9225d42..1a04e1ca56a9 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -498,7 +498,7 @@ public:
MachineOperand &Dest,
Optional<unsigned> &TiedDefIdx);
bool parseOffset(int64_t &Offset);
- bool parseAlignment(unsigned &Alignment);
+ bool parseAlignment(uint64_t &Alignment);
bool parseAddrspace(unsigned &Addrspace);
bool parseSectionID(Optional<MBBSectionID> &SID);
bool parseOperandsOffset(MachineOperand &Op);
@@ -674,9 +674,10 @@ bool MIParser::parseBasicBlockDefinition(
lex();
bool HasAddressTaken = false;
bool IsLandingPad = false;
+ bool IsInlineAsmBrIndirectTarget = false;
bool IsEHFuncletEntry = false;
Optional<MBBSectionID> SectionID;
- unsigned Alignment = 0;
+ uint64_t Alignment = 0;
BasicBlock *BB = nullptr;
if (consumeIfPresent(MIToken::lparen)) {
do {
@@ -690,6 +691,10 @@ bool MIParser::parseBasicBlockDefinition(
IsLandingPad = true;
lex();
break;
+ case MIToken::kw_inlineasm_br_indirect_target:
+ IsInlineAsmBrIndirectTarget = true;
+ lex();
+ break;
case MIToken::kw_ehfunclet_entry:
IsEHFuncletEntry = true;
lex();
@@ -737,6 +742,7 @@ bool MIParser::parseBasicBlockDefinition(
if (HasAddressTaken)
MBB->setHasAddressTaken();
MBB->setIsEHPad(IsLandingPad);
+ MBB->setIsInlineAsmBrIndirectTarget(IsInlineAsmBrIndirectTarget);
MBB->setIsEHFuncletEntry(IsEHFuncletEntry);
if (SectionID.hasValue()) {
MBB->setSectionID(SectionID.getValue());
@@ -1011,10 +1017,6 @@ bool MIParser::parse(MachineInstr *&MI) {
Optional<unsigned> TiedDefIdx;
if (parseMachineOperandAndTargetFlags(OpCode, Operands.size(), MO, TiedDefIdx))
return true;
- if ((OpCode == TargetOpcode::DBG_VALUE ||
- OpCode == TargetOpcode::DBG_VALUE_LIST) &&
- MO.isReg())
- MO.setIsDebug();
Operands.push_back(
ParsedMachineOperand(MO, Loc, Token.location(), TiedDefIdx));
if (Token.isNewlineOrEOF() || Token.is(MIToken::coloncolon) ||
@@ -2898,16 +2900,16 @@ bool MIParser::parseOffset(int64_t &Offset) {
return false;
}
-bool MIParser::parseAlignment(unsigned &Alignment) {
+bool MIParser::parseAlignment(uint64_t &Alignment) {
assert(Token.is(MIToken::kw_align) || Token.is(MIToken::kw_basealign));
lex();
if (Token.isNot(MIToken::IntegerLiteral) || Token.integerValue().isSigned())
return error("expected an integer literal after 'align'");
- if (getUnsigned(Alignment))
+ if (getUint64(Alignment))
return true;
lex();
- if (!isPowerOf2_32(Alignment))
+ if (!isPowerOf2_64(Alignment))
return error("expected a power-of-2 literal after 'align'");
return false;
@@ -3261,7 +3263,7 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
if (parseMachinePointerInfo(Ptr))
return true;
}
- unsigned BaseAlignment =
+ uint64_t BaseAlignment =
(Size != MemoryLocation::UnknownSize ? PowerOf2Ceil(Size) : 1);
AAMDNodes AAInfo;
MDNode *Range = nullptr;
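
The parseAlignment change above widens the alignment to uint64_t and switches the check to isPowerOf2_64, so MIR can round-trip alignments larger than what fits in 32 bits. A quick standalone sketch of the power-of-two test involved, assuming the usual bit trick rather than quoting LLVM's MathExtras.h implementation:

#include <cassert>
#include <cstdint>

// Sketch of the power-of-two test the parser relies on; LLVM's own
// isPowerOf2_64 in MathExtras.h is the real implementation.
static bool isPow2_64(uint64_t Value) {
  return Value != 0 && (Value & (Value - 1)) == 0;
}

int main() {
  assert(isPow2_64(1ULL << 32)); // representable only with a 64-bit alignment
  assert(!isPow2_64(3));
  return 0;
}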
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index d77104752880..6221b5929301 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -454,6 +454,9 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF,
MF.getProperties().set(MachineFunctionProperties::Property::Selected);
if (YamlMF.FailedISel)
MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
+ if (YamlMF.FailsVerification)
+ MF.getProperties().set(
+ MachineFunctionProperties::Property::FailsVerification);
PerFunctionMIParsingState PFS(MF, SM, IRSlots, *Target);
if (parseRegisterInfo(PFS, YamlMF))
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index 2a78bb62762a..f1369396e37f 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -217,6 +217,8 @@ void MIRPrinter::print(const MachineFunction &MF) {
MachineFunctionProperties::Property::Selected);
YamlMF.FailedISel = MF.getProperties().hasProperty(
MachineFunctionProperties::Property::FailedISel);
+ YamlMF.FailsVerification = MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailsVerification);
convert(YamlMF, MF.getRegInfo(), MF.getSubtarget().getRegisterInfo());
MachineModuleSlotTracker MST(&MF);
diff --git a/llvm/lib/CodeGen/MIRSampleProfile.cpp b/llvm/lib/CodeGen/MIRSampleProfile.cpp
new file mode 100644
index 000000000000..90ecc6fc68fc
--- /dev/null
+++ b/llvm/lib/CodeGen/MIRSampleProfile.cpp
@@ -0,0 +1,343 @@
+//===-------- MIRSampleProfile.cpp: MIRSampleFDO (For FSAFDO) -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the implementation of the MIRSampleProfile loader, mainly
+// for flow sensitive SampleFDO.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MIRSampleProfile.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h"
+#include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h"
+
+using namespace llvm;
+using namespace sampleprof;
+using namespace llvm::sampleprofutil;
+using ProfileCount = Function::ProfileCount;
+
+#define DEBUG_TYPE "fs-profile-loader"
+
+static cl::opt<bool> ShowFSBranchProb(
+ "show-fs-branchprob", cl::Hidden, cl::init(false),
+ cl::desc("Print setting flow sensitive branch probabilities"));
+static cl::opt<unsigned> FSProfileDebugProbDiffThreshold(
+ "fs-profile-debug-prob-diff-threshold", cl::init(10),
+ cl::desc("Only show debug message if the branch probility is greater than "
+ "this value (in percentage)."));
+
+static cl::opt<unsigned> FSProfileDebugBWThreshold(
+ "fs-profile-debug-bw-threshold", cl::init(10000),
+ cl::desc("Only show debug message if the source branch weight is greater "
+ " than this value."));
+
+static cl::opt<bool> ViewBFIBefore("fs-viewbfi-before", cl::Hidden,
+ cl::init(false),
+ cl::desc("View BFI before MIR loader"));
+static cl::opt<bool> ViewBFIAfter("fs-viewbfi-after", cl::Hidden,
+ cl::init(false),
+ cl::desc("View BFI after MIR loader"));
+
+char MIRProfileLoaderPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(MIRProfileLoaderPass, DEBUG_TYPE,
+ "Load MIR Sample Profile",
+ /* cfg = */ false, /* is_analysis = */ false)
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
+INITIALIZE_PASS_END(MIRProfileLoaderPass, DEBUG_TYPE, "Load MIR Sample Profile",
+ /* cfg = */ false, /* is_analysis = */ false)
+
+char &llvm::MIRProfileLoaderPassID = MIRProfileLoaderPass::ID;
+
+FunctionPass *llvm::createMIRProfileLoaderPass(std::string File,
+ std::string RemappingFile,
+ FSDiscriminatorPass P) {
+ return new MIRProfileLoaderPass(File, RemappingFile, P);
+}
+
+namespace llvm {
+
+// Internal option used to control BFI display only after MBP pass.
+// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
+// -view-block-layout-with-bfi={none | fraction | integer | count}
+extern cl::opt<GVDAGType> ViewBlockLayoutWithBFI;
+
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
+extern cl::opt<std::string> ViewBlockFreqFuncName;
+
+namespace afdo_detail {
+template <> struct IRTraits<MachineBasicBlock> {
+ using InstructionT = MachineInstr;
+ using BasicBlockT = MachineBasicBlock;
+ using FunctionT = MachineFunction;
+ using BlockFrequencyInfoT = MachineBlockFrequencyInfo;
+ using LoopT = MachineLoop;
+ using LoopInfoPtrT = MachineLoopInfo *;
+ using DominatorTreePtrT = MachineDominatorTree *;
+ using PostDominatorTreePtrT = MachinePostDominatorTree *;
+ using PostDominatorTreeT = MachinePostDominatorTree;
+ using OptRemarkEmitterT = MachineOptimizationRemarkEmitter;
+ using OptRemarkAnalysisT = MachineOptimizationRemarkAnalysis;
+ using PredRangeT = iterator_range<std::vector<MachineBasicBlock *>::iterator>;
+ using SuccRangeT = iterator_range<std::vector<MachineBasicBlock *>::iterator>;
+ static Function &getFunction(MachineFunction &F) { return F.getFunction(); }
+ static const MachineBasicBlock *getEntryBB(const MachineFunction *F) {
+ return GraphTraits<const MachineFunction *>::getEntryNode(F);
+ }
+ static PredRangeT getPredecessors(MachineBasicBlock *BB) {
+ return BB->predecessors();
+ }
+ static SuccRangeT getSuccessors(MachineBasicBlock *BB) {
+ return BB->successors();
+ }
+};
+} // namespace afdo_detail
+
+class MIRProfileLoader final
+ : public SampleProfileLoaderBaseImpl<MachineBasicBlock> {
+public:
+ void setInitVals(MachineDominatorTree *MDT, MachinePostDominatorTree *MPDT,
+ MachineLoopInfo *MLI, MachineBlockFrequencyInfo *MBFI,
+ MachineOptimizationRemarkEmitter *MORE) {
+ DT = MDT;
+ PDT = MPDT;
+ LI = MLI;
+ BFI = MBFI;
+ ORE = MORE;
+ }
+ void setFSPass(FSDiscriminatorPass Pass) {
+ P = Pass;
+ LowBit = getFSPassBitBegin(P);
+ HighBit = getFSPassBitEnd(P);
+ assert(LowBit < HighBit && "HighBit needs to be greater than LowBit");
+ }
+
+ MIRProfileLoader(StringRef Name, StringRef RemapName)
+ : SampleProfileLoaderBaseImpl(std::string(Name), std::string(RemapName)) {
+ }
+
+ void setBranchProbs(MachineFunction &F);
+ bool runOnFunction(MachineFunction &F);
+ bool doInitialization(Module &M);
+ bool isValid() const { return ProfileIsValid; }
+
+protected:
+ friend class SampleCoverageTracker;
+
+ /// Hold the information of the basic block frequency.
+ MachineBlockFrequencyInfo *BFI;
+
+ /// PassNum is the sequence number in which this pass is called, starting from 1.
+ FSDiscriminatorPass P;
+
+ // LowBit in the FS discriminator used by this instance. Note the number is
+ // 0-based. The base discriminator uses bit 0 to bit 11.
+ unsigned LowBit;
+ // HighBit in the FS discriminator used by this instance. Note the number
+ // is 0-based.
+ unsigned HighBit;
+
+ bool ProfileIsValid = true;
+};
+
+template <>
+void SampleProfileLoaderBaseImpl<
+ MachineBasicBlock>::computeDominanceAndLoopInfo(MachineFunction &F) {}
+
+void MIRProfileLoader::setBranchProbs(MachineFunction &F) {
+ LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch probs\n");
+ for (auto &BI : F) {
+ MachineBasicBlock *BB = &BI;
+ if (BB->succ_size() < 2)
+ continue;
+ const MachineBasicBlock *EC = EquivalenceClass[BB];
+ uint64_t BBWeight = BlockWeights[EC];
+ uint64_t SumEdgeWeight = 0;
+ for (MachineBasicBlock *Succ : BB->successors()) {
+ Edge E = std::make_pair(BB, Succ);
+ SumEdgeWeight += EdgeWeights[E];
+ }
+
+ if (BBWeight != SumEdgeWeight) {
+ LLVM_DEBUG(dbgs() << "BBweight is not equal to SumEdgeWeight: BBWWeight="
+ << BBWeight << " SumEdgeWeight= " << SumEdgeWeight
+ << "\n");
+ BBWeight = SumEdgeWeight;
+ }
+ if (BBWeight == 0) {
+ LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
+ continue;
+ }
+
+#ifndef NDEBUG
+ uint64_t BBWeightOrig = BBWeight;
+#endif
+ uint32_t MaxWeight = std::numeric_limits<uint32_t>::max();
+ uint32_t Factor = 1;
+ if (BBWeight > MaxWeight) {
+ Factor = BBWeight / MaxWeight + 1;
+ BBWeight /= Factor;
+ LLVM_DEBUG(dbgs() << "Scaling weights by " << Factor << "\n");
+ }
+
+ for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+ SE = BB->succ_end();
+ SI != SE; ++SI) {
+ MachineBasicBlock *Succ = *SI;
+ Edge E = std::make_pair(BB, Succ);
+ uint64_t EdgeWeight = EdgeWeights[E];
+ EdgeWeight /= Factor;
+
+ assert(BBWeight >= EdgeWeight &&
+ "BBweight is larger than EdgeWeight -- should not happen.\n");
+
+ BranchProbability OldProb = BFI->getMBPI()->getEdgeProbability(BB, SI);
+ BranchProbability NewProb(EdgeWeight, BBWeight);
+ if (OldProb == NewProb)
+ continue;
+ BB->setSuccProbability(SI, NewProb);
+#ifndef NDEBUG
+ if (!ShowFSBranchProb)
+ continue;
+ bool Show = false;
+ BranchProbability Diff;
+ if (OldProb > NewProb)
+ Diff = OldProb - NewProb;
+ else
+ Diff = NewProb - OldProb;
+ Show = (Diff >= BranchProbability(FSProfileDebugProbDiffThreshold, 100));
+ Show &= (BBWeightOrig >= FSProfileDebugBWThreshold);
+
+ auto DIL = BB->findBranchDebugLoc();
+ auto SuccDIL = Succ->findBranchDebugLoc();
+ if (Show) {
+ dbgs() << "Set branch fs prob: MBB (" << BB->getNumber() << " -> "
+ << Succ->getNumber() << "): ";
+ if (DIL)
+ dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":"
+ << DIL->getColumn();
+ if (SuccDIL)
+ dbgs() << "-->" << SuccDIL->getFilename() << ":" << SuccDIL->getLine()
+ << ":" << SuccDIL->getColumn();
+ dbgs() << " W=" << BBWeightOrig << " " << OldProb << " --> " << NewProb
+ << "\n";
+ }
+#endif
+ }
+ }
+}
+
+bool MIRProfileLoader::doInitialization(Module &M) {
+ auto &Ctx = M.getContext();
+
+ auto ReaderOrErr = sampleprof::SampleProfileReader::create(Filename, Ctx, P,
+ RemappingFilename);
+ if (std::error_code EC = ReaderOrErr.getError()) {
+ std::string Msg = "Could not open profile: " + EC.message();
+ Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
+ return false;
+ }
+
+ Reader = std::move(ReaderOrErr.get());
+ Reader->setModule(&M);
+ ProfileIsValid = (Reader->read() == sampleprof_error::success);
+ Reader->getSummary();
+
+ return true;
+}
+
+bool MIRProfileLoader::runOnFunction(MachineFunction &MF) {
+ Function &Func = MF.getFunction();
+ clearFunctionData(false);
+ Samples = Reader->getSamplesFor(Func);
+ if (!Samples || Samples->empty())
+ return false;
+
+ if (getFunctionLoc(MF) == 0)
+ return false;
+
+ DenseSet<GlobalValue::GUID> InlinedGUIDs;
+ bool Changed = computeAndPropagateWeights(MF, InlinedGUIDs);
+
+ // Set the new BPI, BFI.
+ setBranchProbs(MF);
+
+ return Changed;
+}
+
+} // namespace llvm
+
+MIRProfileLoaderPass::MIRProfileLoaderPass(std::string FileName,
+ std::string RemappingFileName,
+ FSDiscriminatorPass P)
+ : MachineFunctionPass(ID), ProfileFileName(FileName), P(P),
+ MIRSampleLoader(
+ std::make_unique<MIRProfileLoader>(FileName, RemappingFileName)) {
+ LowBit = getFSPassBitBegin(P);
+ HighBit = getFSPassBitEnd(P);
+ assert(LowBit < HighBit && "HighBit needs to be greater than LowBit");
+}
+
+bool MIRProfileLoaderPass::runOnMachineFunction(MachineFunction &MF) {
+ if (!MIRSampleLoader->isValid())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "MIRProfileLoader pass working on Func: "
+ << MF.getFunction().getName() << "\n");
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ MIRSampleLoader->setInitVals(
+ &getAnalysis<MachineDominatorTree>(),
+ &getAnalysis<MachinePostDominatorTree>(), &getAnalysis<MachineLoopInfo>(),
+ MBFI, &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE());
+
+ MF.RenumberBlocks();
+ if (ViewBFIBefore && ViewBlockLayoutWithBFI != GVDT_None &&
+ (ViewBlockFreqFuncName.empty() ||
+ MF.getFunction().getName().equals(ViewBlockFreqFuncName))) {
+ MBFI->view("MIR_Prof_loader_b." + MF.getName(), false);
+ }
+
+ bool Changed = MIRSampleLoader->runOnFunction(MF);
+
+ if (ViewBFIAfter && ViewBlockLayoutWithBFI != GVDT_None &&
+ (ViewBlockFreqFuncName.empty() ||
+ MF.getFunction().getName().equals(ViewBlockFreqFuncName))) {
+ MBFI->view("MIR_prof_loader_a." + MF.getName(), false);
+ }
+
+ return Changed;
+}
+
+bool MIRProfileLoaderPass::doInitialization(Module &M) {
+ LLVM_DEBUG(dbgs() << "MIRProfileLoader pass working on Module " << M.getName()
+ << "\n");
+
+ MIRSampleLoader->setFSPass(P);
+ return MIRSampleLoader->doInitialization(M);
+}
+
+void MIRProfileLoaderPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequiredTransitive<MachineLoopInfo>();
+ AU.addRequired<MachineOptimizationRemarkEmitterPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
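
setBranchProbs in the new file above has to squeeze 64-bit sample weights into BranchProbability's 32-bit numerator and denominator: when the block weight overflows UINT32_MAX it divides the block weight and every edge weight by the same factor, which keeps the edge/block ratios (the probabilities) intact. A standalone illustration of that scaling, with plain integers standing in for the LLVM types:

#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

// Sketch: scale a block weight and its outgoing edge weights so they fit in
// 32 bits while keeping every edge/block ratio (the branch probability) intact.
int main() {
  uint64_t BBWeight = 10000000000ULL;                      // > UINT32_MAX
  std::vector<uint64_t> EdgeWeights = {7500000000ULL, 2500000000ULL};

  const uint64_t MaxWeight = std::numeric_limits<uint32_t>::max();
  uint64_t Factor = 1;
  if (BBWeight > MaxWeight)
    Factor = BBWeight / MaxWeight + 1;                     // same as the pass

  BBWeight /= Factor;
  for (uint64_t &E : EdgeWeights) {
    E /= Factor;
    // Numerator and denominator now both fit in uint32_t; the ratio survives.
    std::printf("prob ~= %f\n", double(E) / double(BBWeight));
  }
  return 0;
}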
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index c6914dcd0e54..23c511aaa056 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -134,9 +134,8 @@ void ilist_callback_traits<MachineBasicBlock>::addNodeToList(
// Make sure the instructions have their operands in the reginfo lists.
MachineRegisterInfo &RegInfo = MF.getRegInfo();
- for (MachineBasicBlock::instr_iterator
- I = N->instr_begin(), E = N->instr_end(); I != E; ++I)
- I->AddRegOperandsToUseLists(RegInfo);
+ for (MachineInstr &MI : N->instrs())
+ MI.AddRegOperandsToUseLists(RegInfo);
}
void ilist_callback_traits<MachineBasicBlock>::removeNodeFromList(
@@ -281,8 +280,8 @@ MachineBasicBlock::getLastNonDebugInstr(bool SkipPseudoOp) {
}
bool MachineBasicBlock::hasEHPadSuccessor() const {
- for (const_succ_iterator I = succ_begin(), E = succ_end(); I != E; ++I)
- if ((*I)->isEHPad())
+ for (const MachineBasicBlock *Succ : successors())
+ if (Succ->isEHPad())
return true;
return false;
}
@@ -517,6 +516,11 @@ void MachineBasicBlock::printName(raw_ostream &os, unsigned printNameFlags,
os << "landing-pad";
hasAttributes = true;
}
+ if (isInlineAsmBrIndirectTarget()) {
+ os << (hasAttributes ? ", " : " (");
+ os << "inlineasm-br-indirect-target";
+ hasAttributes = true;
+ }
if (isEHFuncletEntry()) {
os << (hasAttributes ? ", " : " (");
os << "ehfunclet-entry";
@@ -1037,17 +1041,16 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(
for (instr_iterator I = getFirstInstrTerminator(), E = instr_end();
I != E; ++I) {
MachineInstr *MI = &*I;
- for (MachineInstr::mop_iterator OI = MI->operands_begin(),
- OE = MI->operands_end(); OI != OE; ++OI) {
- if (!OI->isReg() || OI->getReg() == 0 ||
- !OI->isUse() || !OI->isKill() || OI->isUndef())
+ for (MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg() || MO.getReg() == 0 || !MO.isUse() || !MO.isKill() ||
+ MO.isUndef())
continue;
- Register Reg = OI->getReg();
+ Register Reg = MO.getReg();
if (Register::isPhysicalRegister(Reg) ||
LV->getVarInfo(Reg).removeKill(*MI)) {
KilledRegs.push_back(Reg);
- LLVM_DEBUG(dbgs() << "Removing terminator kill: " << *MI);
- OI->setIsKill(false);
+ LLVM_DEBUG(dbgs() << "Removing terminator kill: " << MI);
+ MO.setIsKill(false);
}
}
}
@@ -1058,12 +1061,11 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(
I != E; ++I) {
MachineInstr *MI = &*I;
- for (MachineInstr::mop_iterator OI = MI->operands_begin(),
- OE = MI->operands_end(); OI != OE; ++OI) {
- if (!OI->isReg() || OI->getReg() == 0)
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg() || MO.getReg() == 0)
continue;
- Register Reg = OI->getReg();
+ Register Reg = MO.getReg();
if (!is_contained(UsedRegs, Reg))
UsedRegs.push_back(Reg);
}
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index f61142d202eb..8a1b4031642d 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -1185,7 +1185,7 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
// The integrated tail duplication is really designed for increasing
// fallthrough from predecessors from Succ to its successors. We may need
// other mechanism to handle different cases.
- if (Succ->succ_size() == 0)
+ if (Succ->succ_empty())
return true;
// Plus the already placed predecessor.
@@ -2050,6 +2050,8 @@ MachineBlockPlacement::findBestLoopTopHelper(
BlockChain &HeaderChain = *BlockToChain[OldTop];
if (!LoopBlockSet.count(*HeaderChain.begin()))
return OldTop;
+ if (OldTop != *HeaderChain.begin())
+ return OldTop;
LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
<< "\n");
diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp
index cb2e18e8c813..0fcb07252d0e 100644
--- a/llvm/lib/CodeGen/MachineCSE.cpp
+++ b/llvm/lib/CodeGen/MachineCSE.cpp
@@ -514,41 +514,38 @@ bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) {
SmallVector<std::pair<unsigned, unsigned>, 8> CSEPairs;
SmallVector<unsigned, 2> ImplicitDefsToUpdate;
SmallVector<unsigned, 2> ImplicitDefs;
- for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ) {
- MachineInstr *MI = &*I;
- ++I;
-
- if (!isCSECandidate(MI))
+ for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) {
+ if (!isCSECandidate(&MI))
continue;
- bool FoundCSE = VNT.count(MI);
+ bool FoundCSE = VNT.count(&MI);
if (!FoundCSE) {
// Using trivial copy propagation to find more CSE opportunities.
- if (PerformTrivialCopyPropagation(MI, MBB)) {
+ if (PerformTrivialCopyPropagation(&MI, MBB)) {
Changed = true;
// After coalescing MI itself may become a copy.
- if (MI->isCopyLike())
+ if (MI.isCopyLike())
continue;
// Try again to see if CSE is possible.
- FoundCSE = VNT.count(MI);
+ FoundCSE = VNT.count(&MI);
}
}
// Commute commutable instructions.
bool Commuted = false;
- if (!FoundCSE && MI->isCommutable()) {
- if (MachineInstr *NewMI = TII->commuteInstruction(*MI)) {
+ if (!FoundCSE && MI.isCommutable()) {
+ if (MachineInstr *NewMI = TII->commuteInstruction(MI)) {
Commuted = true;
FoundCSE = VNT.count(NewMI);
- if (NewMI != MI) {
+ if (NewMI != &MI) {
// New instruction. It doesn't need to be kept.
NewMI->eraseFromParent();
Changed = true;
} else if (!FoundCSE)
// MI was changed but it didn't help, commute it back!
- (void)TII->commuteInstruction(*MI);
+ (void)TII->commuteInstruction(MI);
}
}
@@ -559,8 +556,8 @@ bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) {
SmallSet<MCRegister, 8> PhysRefs;
PhysDefVector PhysDefs;
bool PhysUseDef = false;
- if (FoundCSE && hasLivePhysRegDefUses(MI, MBB, PhysRefs,
- PhysDefs, PhysUseDef)) {
+ if (FoundCSE &&
+ hasLivePhysRegDefUses(&MI, MBB, PhysRefs, PhysDefs, PhysUseDef)) {
FoundCSE = false;
// ... Unless the CS is local or is in the sole predecessor block
@@ -569,23 +566,23 @@ bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) {
// This can never be the case if the instruction both uses and
// defines the same physical register, which was detected above.
if (!PhysUseDef) {
- unsigned CSVN = VNT.lookup(MI);
+ unsigned CSVN = VNT.lookup(&MI);
MachineInstr *CSMI = Exps[CSVN];
- if (PhysRegDefsReach(CSMI, MI, PhysRefs, PhysDefs, CrossMBBPhysDef))
+ if (PhysRegDefsReach(CSMI, &MI, PhysRefs, PhysDefs, CrossMBBPhysDef))
FoundCSE = true;
}
}
if (!FoundCSE) {
- VNT.insert(MI, CurrVN++);
- Exps.push_back(MI);
+ VNT.insert(&MI, CurrVN++);
+ Exps.push_back(&MI);
continue;
}
// Found a common subexpression, eliminate it.
- unsigned CSVN = VNT.lookup(MI);
+ unsigned CSVN = VNT.lookup(&MI);
MachineInstr *CSMI = Exps[CSVN];
- LLVM_DEBUG(dbgs() << "Examining: " << *MI);
+ LLVM_DEBUG(dbgs() << "Examining: " << MI);
LLVM_DEBUG(dbgs() << "*** Found a common subexpression: " << *CSMI);
// Prevent CSE-ing non-local convergent instructions.
@@ -597,20 +594,20 @@ bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) {
// definition, so it's necessary to use `isConvergent` to prevent illegally
// CSE-ing the subset of `isConvergent` instructions which do fall into this
// extended definition.
- if (MI->isConvergent() && MI->getParent() != CSMI->getParent()) {
+ if (MI.isConvergent() && MI.getParent() != CSMI->getParent()) {
LLVM_DEBUG(dbgs() << "*** Convergent MI and subexpression exist in "
"different BBs, avoid CSE!\n");
- VNT.insert(MI, CurrVN++);
- Exps.push_back(MI);
+ VNT.insert(&MI, CurrVN++);
+ Exps.push_back(&MI);
continue;
}
// Check if it's profitable to perform this CSE.
bool DoCSE = true;
- unsigned NumDefs = MI->getNumDefs();
+ unsigned NumDefs = MI.getNumDefs();
- for (unsigned i = 0, e = MI->getNumOperands(); NumDefs && i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (unsigned i = 0, e = MI.getNumOperands(); NumDefs && i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isDef())
continue;
Register OldReg = MO.getReg();
@@ -635,7 +632,7 @@ bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) {
Register::isVirtualRegister(NewReg) &&
"Do not CSE physical register defs!");
- if (!isProfitableToCSE(NewReg, OldReg, CSMI->getParent(), MI)) {
+ if (!isProfitableToCSE(NewReg, OldReg, CSMI->getParent(), &MI)) {
LLVM_DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n");
DoCSE = false;
break;
@@ -674,7 +671,7 @@ bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) {
for (unsigned ImplicitDefToUpdate : ImplicitDefsToUpdate)
CSMI->getOperand(ImplicitDefToUpdate).setIsDead(false);
for (const auto &PhysDef : PhysDefs)
- if (!MI->getOperand(PhysDef.first).isDead())
+ if (!MI.getOperand(PhysDef.first).isDead())
CSMI->getOperand(PhysDef.first).setIsDead(false);
// Go through implicit defs of CSMI and MI, and clear the kill flags on
@@ -687,8 +684,8 @@ bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) {
// Since we eliminated MI, and reused a register imp-def'd by CSMI
// (here %nzcv), that register, if it was killed before MI, should have
// that kill flag removed, because its lifetime was extended.
- if (CSMI->getParent() == MI->getParent()) {
- for (MachineBasicBlock::iterator II = CSMI, IE = MI; II != IE; ++II)
+ if (CSMI->getParent() == MI.getParent()) {
+ for (MachineBasicBlock::iterator II = CSMI, IE = &MI; II != IE; ++II)
for (auto ImplicitDef : ImplicitDefs)
if (MachineOperand *MO = II->findRegisterUseOperand(
ImplicitDef, /*isKill=*/true, TRI))
@@ -711,7 +708,7 @@ bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) {
++NumCrossBBCSEs;
}
- MI->eraseFromParent();
+ MI.eraseFromParent();
++NumCSEs;
if (!PhysRefs.empty())
++NumPhysCSEs;
@@ -719,8 +716,8 @@ bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) {
++NumCommutes;
Changed = true;
} else {
- VNT.insert(MI, CurrVN++);
- Exps.push_back(MI);
+ VNT.insert(&MI, CurrVN++);
+ Exps.push_back(&MI);
}
CSEPairs.clear();
ImplicitDefsToUpdate.clear();
@@ -807,19 +804,16 @@ bool MachineCSE::isPRECandidate(MachineInstr *MI) {
bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT,
MachineBasicBlock *MBB) {
bool Changed = false;
- for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
- MachineInstr *MI = &*I;
- ++I;
-
- if (!isPRECandidate(MI))
+ for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) {
+ if (!isPRECandidate(&MI))
continue;
- if (!PREMap.count(MI)) {
- PREMap[MI] = MBB;
+ if (!PREMap.count(&MI)) {
+ PREMap[&MI] = MBB;
continue;
}
- auto MBB1 = PREMap[MI];
+ auto MBB1 = PREMap[&MI];
assert(
!DT->properlyDominates(MBB, MBB1) &&
"MBB cannot properly dominate MBB1 while DFS through dominators tree!");
@@ -844,17 +838,17 @@ bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT,
// it's necessary to use `isConvergent` to prevent illegally PRE-ing the
// subset of `isConvergent` instructions which do fall into this
// extended definition.
- if (MI->isConvergent() && CMBB != MBB)
+ if (MI.isConvergent() && CMBB != MBB)
continue;
- assert(MI->getOperand(0).isDef() &&
+ assert(MI.getOperand(0).isDef() &&
"First operand of instr with one explicit def must be this def");
- Register VReg = MI->getOperand(0).getReg();
+ Register VReg = MI.getOperand(0).getReg();
Register NewReg = MRI->cloneVirtualRegister(VReg);
- if (!isProfitableToCSE(NewReg, VReg, CMBB, MI))
+ if (!isProfitableToCSE(NewReg, VReg, CMBB, &MI))
continue;
MachineInstr &NewMI =
- TII->duplicate(*CMBB, CMBB->getFirstTerminator(), *MI);
+ TII->duplicate(*CMBB, CMBB->getFirstTerminator(), MI);
// When hoisting, make sure we don't carry the debug location of
// the original instruction, as that's not correct and can cause
@@ -864,7 +858,7 @@ bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT,
NewMI.getOperand(0).setReg(NewReg);
- PREMap[MI] = CMBB;
+ PREMap[&MI] = CMBB;
++NumPREs;
Changed = true;
}
diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 10b74f5f47f5..7c83bacd80d9 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -414,6 +414,31 @@ bool MachineCopyPropagation::isForwardableRegClassCopy(const MachineInstr &Copy,
if (!UseI.isCopy())
return false;
+ const TargetRegisterClass *CopySrcRC =
+ TRI->getMinimalPhysRegClass(CopySrcReg);
+ const TargetRegisterClass *UseDstRC =
+ TRI->getMinimalPhysRegClass(UseI.getOperand(0).getReg());
+ const TargetRegisterClass *CrossCopyRC = TRI->getCrossCopyRegClass(CopySrcRC);
+
+ // If the cross copy register class is not the same as the copy source
+ // register class, then the register cannot be copied directly and a cross
+ // register class copy is required. Forwarding this copy without checking the
+ // register class of UseDst may create additional cross register copies when
+ // expanding the copy instruction in later passes.
+ if (CopySrcRC != CrossCopyRC) {
+ const TargetRegisterClass *CopyDstRC =
+ TRI->getMinimalPhysRegClass(Copy.getOperand(0).getReg());
+
+ // Check if UseDstRC matches the necessary register class to copy from
+ // CopySrc's register class. If so then forwarding the copy will not
+ // introduce any cross-class copies. Else if CopyDstRC matches then keep the
+ // copy and do not forward. If neither UseDstRC nor CopyDstRC matches then
+ // we may need a cross register copy later but we do not worry about it
+ // here.
+ if (UseDstRC != CrossCopyRC && CopyDstRC == CrossCopyRC)
+ return false;
+ }
+
/// COPYs don't have register class constraints, so if the user instruction
/// is a COPY, we just try to avoid introducing additional cross-class
/// COPYs. For example:
@@ -430,9 +455,6 @@ bool MachineCopyPropagation::isForwardableRegClassCopy(const MachineInstr &Copy,
///
/// so we have reduced the number of cross-class COPYs and potentially
/// introduced a nop COPY that can be removed.
- const TargetRegisterClass *UseDstRC =
- TRI->getMinimalPhysRegClass(UseI.getOperand(0).getReg());
-
const TargetRegisterClass *SuperRC = UseDstRC;
for (TargetRegisterClass::sc_iterator SuperRCI = UseDstRC->getSuperClasses();
SuperRC; SuperRC = *SuperRCI++)
@@ -554,6 +576,7 @@ void MachineCopyPropagation::forwardUses(MachineInstr &MI) {
MOUse.setReg(CopySrcReg);
if (!CopySrc.isRenamable())
MOUse.setIsRenamable(false);
+ MOUse.setIsUndef(CopySrc.isUndef());
LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n");
@@ -571,19 +594,16 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
LLVM_DEBUG(dbgs() << "MCP: ForwardCopyPropagateBlock " << MBB.getName()
<< "\n");
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ) {
- MachineInstr *MI = &*I;
- ++I;
-
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
// Analyze copies (which don't overlap themselves).
- if (MI->isCopy() && !TRI->regsOverlap(MI->getOperand(0).getReg(),
- MI->getOperand(1).getReg())) {
- assert(MI->getOperand(0).getReg().isPhysical() &&
- MI->getOperand(1).getReg().isPhysical() &&
+ if (MI.isCopy() && !TRI->regsOverlap(MI.getOperand(0).getReg(),
+ MI.getOperand(1).getReg())) {
+ assert(MI.getOperand(0).getReg().isPhysical() &&
+ MI.getOperand(1).getReg().isPhysical() &&
"MachineCopyPropagation should be run after register allocation!");
- MCRegister Def = MI->getOperand(0).getReg().asMCReg();
- MCRegister Src = MI->getOperand(1).getReg().asMCReg();
+ MCRegister Def = MI.getOperand(0).getReg().asMCReg();
+ MCRegister Src = MI.getOperand(1).getReg().asMCReg();
// The two copies cancel out and the source of the first copy
// hasn't been overridden, eliminate the second one. e.g.
@@ -600,31 +620,31 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
// %ecx = COPY %eax
// =>
// %ecx = COPY %eax
- if (eraseIfRedundant(*MI, Def, Src) || eraseIfRedundant(*MI, Src, Def))
+ if (eraseIfRedundant(MI, Def, Src) || eraseIfRedundant(MI, Src, Def))
continue;
- forwardUses(*MI);
+ forwardUses(MI);
// Src may have been changed by forwardUses()
- Src = MI->getOperand(1).getReg().asMCReg();
+ Src = MI.getOperand(1).getReg().asMCReg();
// If Src is defined by a previous copy, the previous copy cannot be
// eliminated.
- ReadRegister(Src, *MI, RegularUse);
- for (const MachineOperand &MO : MI->implicit_operands()) {
+ ReadRegister(Src, MI, RegularUse);
+ for (const MachineOperand &MO : MI.implicit_operands()) {
if (!MO.isReg() || !MO.readsReg())
continue;
MCRegister Reg = MO.getReg().asMCReg();
if (!Reg)
continue;
- ReadRegister(Reg, *MI, RegularUse);
+ ReadRegister(Reg, MI, RegularUse);
}
- LLVM_DEBUG(dbgs() << "MCP: Copy is a deletion candidate: "; MI->dump());
+ LLVM_DEBUG(dbgs() << "MCP: Copy is a deletion candidate: "; MI.dump());
// Copy is now a candidate for deletion.
if (!MRI->isReserved(Def))
- MaybeDeadCopies.insert(MI);
+ MaybeDeadCopies.insert(&MI);
// If 'Def' is previously source of another copy, then this earlier copy's
// source is no longer available. e.g.
@@ -634,7 +654,7 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
// ...
// %xmm2 = copy %xmm9
Tracker.clobberRegister(Def, *TRI);
- for (const MachineOperand &MO : MI->implicit_operands()) {
+ for (const MachineOperand &MO : MI.implicit_operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
MCRegister Reg = MO.getReg().asMCReg();
@@ -643,29 +663,29 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
Tracker.clobberRegister(Reg, *TRI);
}
- Tracker.trackCopy(MI, *TRI);
+ Tracker.trackCopy(&MI, *TRI);
continue;
}
// Clobber any earlyclobber regs first.
- for (const MachineOperand &MO : MI->operands())
+ for (const MachineOperand &MO : MI.operands())
if (MO.isReg() && MO.isEarlyClobber()) {
MCRegister Reg = MO.getReg().asMCReg();
// If we have a tied earlyclobber, that means it is also read by this
// instruction, so we need to make sure we don't remove it as dead
// later.
if (MO.isTied())
- ReadRegister(Reg, *MI, RegularUse);
+ ReadRegister(Reg, MI, RegularUse);
Tracker.clobberRegister(Reg, *TRI);
}
- forwardUses(*MI);
+ forwardUses(MI);
// Not a copy.
SmallVector<Register, 2> Defs;
const MachineOperand *RegMask = nullptr;
- for (const MachineOperand &MO : MI->operands()) {
+ for (const MachineOperand &MO : MI.operands()) {
if (MO.isRegMask())
RegMask = &MO;
if (!MO.isReg())
@@ -681,7 +701,7 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
Defs.push_back(Reg.asMCReg());
continue;
} else if (MO.readsReg())
- ReadRegister(Reg.asMCReg(), *MI, MO.isDebug() ? DebugUse : RegularUse);
+ ReadRegister(Reg.asMCReg(), MI, MO.isDebug() ? DebugUse : RegularUse);
}
// The instruction has a register mask operand which means that it clobbers
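
The new check in isForwardableRegClassCopy above keeps a copy rather than forwarding it when forwarding would swap a cheap same-class copy for a cross-class one. A schematic of that three-way comparison with placeholder class identifiers; the function name and string-based classes are illustrative only, not the LLVM API:

#include <string>

// Sketch of the forwarding decision. CrossCopyRC stands for the class a value
// of CopySrcRC must pass through when copied across register banks.
bool shouldKeepCopy(const std::string &CopySrcRC, const std::string &CopyDstRC,
                    const std::string &UseDstRC,
                    const std::string &CrossCopyRC) {
  // Copying out of CopySrcRC needs an intermediate cross-class copy.
  if (CopySrcRC != CrossCopyRC) {
    // If the use's destination does not match the cross-copy class but the
    // original copy's destination does, the existing copy is the cheap one:
    // keep it and do not forward.
    if (UseDstRC != CrossCopyRC && CopyDstRC == CrossCopyRC)
      return true;  // keep the copy, do not forward
  }
  return false;     // forwarding is fine (or at least no worse)
}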
diff --git a/llvm/lib/CodeGen/MachineDominators.cpp b/llvm/lib/CodeGen/MachineDominators.cpp
index c8845d838282..28cff2a4f3f3 100644
--- a/llvm/lib/CodeGen/MachineDominators.cpp
+++ b/llvm/lib/CodeGen/MachineDominators.cpp
@@ -73,7 +73,7 @@ void MachineDominatorTree::releaseMemory() {
void MachineDominatorTree::verifyAnalysis() const {
if (DT && VerifyMachineDomInfo)
- if (!DT->verify(DomTreeT::VerificationLevel::Basic)) {
+ if (!DT->verify(MachineDomTree::VerificationLevel::Basic)) {
errs() << "MachineDominatorTree verification failed\n";
abort();
}
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 0a454b68aca3..366d06871245 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -99,6 +99,7 @@ static const char *getPropertyName(MachineFunctionProperties::Property Prop) {
case P::Selected: return "Selected";
case P::TracksLiveness: return "TracksLiveness";
case P::TiedOpsRewritten: return "TiedOpsRewritten";
+ case P::FailsVerification: return "FailsVerification";
}
llvm_unreachable("Invalid machine function property");
}
@@ -129,8 +130,8 @@ void ilist_alloc_traits<MachineBasicBlock>::deleteNode(MachineBasicBlock *MBB) {
static inline unsigned getFnStackAlignment(const TargetSubtargetInfo *STI,
const Function &F) {
- if (F.hasFnAttribute(Attribute::StackAlignment))
- return F.getFnStackAlignment();
+ if (auto MA = F.getFnStackAlign())
+ return MA->value();
return STI->getFrameLowering()->getStackAlign().value();
}
@@ -745,9 +746,8 @@ MCSymbol *MachineFunction::addLandingPad(MachineBasicBlock *LandingPad) {
// Add filters in a list.
auto *CVal = cast<Constant>(Val);
SmallVector<const GlobalValue *, 4> FilterList;
- for (User::op_iterator II = CVal->op_begin(), IE = CVal->op_end();
- II != IE; ++II)
- FilterList.push_back(cast<GlobalValue>((*II)->stripPointerCasts()));
+ for (const Use &U : CVal->operands())
+ FilterList.push_back(cast<GlobalValue>(U->stripPointerCasts()));
addFilterTypeInfo(LandingPad, FilterList);
}
@@ -973,6 +973,9 @@ void MachineFunction::makeDebugValueSubstitution(DebugInstrOperandPair A,
unsigned Subreg) {
// Catch any accidental self-loops.
assert(A.first != B.first);
+ // Don't allow any substitutions _from_ the memory operand number.
+ assert(A.second != DebugOperandMemNumber);
+
DebugValueSubstitutions.push_back({A, B, Subreg});
}
@@ -1148,17 +1151,17 @@ auto MachineFunction::salvageCopySSA(MachineInstr &MI)
// locations.
;
} else {
- // Assert that this is the entry block. If it isn't, then there is some
- // code construct we don't recognise that deals with physregs across
- // blocks.
+ // Assert that this is the entry block, or an EH pad. If it isn't, then
+ // there is some code construct we don't recognise that deals with physregs
+ // across blocks.
assert(!State.first.isVirtual());
- assert(&*InsertBB.getParent()->begin() == &InsertBB);
+ assert(&*InsertBB.getParent()->begin() == &InsertBB || InsertBB.isEHPad());
}
// Create DBG_PHI for specified physreg.
auto Builder = BuildMI(InsertBB, InsertBB.getFirstNonPHI(), DebugLoc(),
TII.get(TargetOpcode::DBG_PHI));
- Builder.addReg(State.first, RegState::Debug);
+ Builder.addReg(State.first);
unsigned NewNum = getNewDebugInstrNum();
Builder.addImm(NewNum);
return ApplySubregisters({NewNum, 0u});
@@ -1171,10 +1174,9 @@ void MachineFunction::finalizeDebugInstrRefs() {
const MCInstrDesc &RefII = TII->get(TargetOpcode::DBG_VALUE);
MI.setDesc(RefII);
MI.getOperand(1).ChangeToRegister(0, false);
- MI.getOperand(0).setIsDebug();
};
- if (!getTarget().Options.ValueTrackingVariableLocations)
+ if (!useDebugInstrRef())
return;
for (auto &MBB : *this) {
@@ -1221,6 +1223,27 @@ void MachineFunction::finalizeDebugInstrRefs() {
}
}
+bool MachineFunction::useDebugInstrRef() const {
+ // Disable instr-ref at -O0: it's very slow (in compile time). We can still
+ // have optimized code inlined into this unoptimized code; however, with
+ // fewer and less aggressive optimizations happening, coverage and accuracy
+ // should not suffer.
+ if (getTarget().getOptLevel() == CodeGenOpt::None)
+ return false;
+
+ // Don't use instr-ref if this function is marked optnone.
+ if (F.hasFnAttribute(Attribute::OptimizeNone))
+ return false;
+
+ if (getTarget().Options.ValueTrackingVariableLocations)
+ return true;
+
+ return false;
+}
+
+// Use one million as a high / reserved number.
+const unsigned MachineFunction::DebugOperandMemNumber = 1000000;
+
/// \}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index 0707945e7fb7..5c4f75e9ceb9 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -294,6 +294,9 @@ void MachineInstr::addOperand(MachineFunction &MF, const MachineOperand &Op) {
if (MCID->getOperandConstraint(OpNo, MCOI::EARLY_CLOBBER) != -1)
NewMO->setIsEarlyClobber(true);
}
+ // Ensure debug instructions set debug flag on register uses.
+ if (NewMO->isUse() && isDebugInstr())
+ NewMO->setIsDebug();
}
}
@@ -2111,11 +2114,11 @@ MachineInstrBuilder llvm::BuildMI(MachineFunction &MF, const DebugLoc &DL,
assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
"Expected inlined-at fields to agree");
- auto MIB = BuildMI(MF, DL, MCID).addReg(Reg, RegState::Debug);
+ auto MIB = BuildMI(MF, DL, MCID).addReg(Reg);
if (IsIndirect)
MIB.addImm(0U);
else
- MIB.addReg(0U, RegState::Debug);
+ MIB.addReg(0U);
return MIB.addMetadata(Variable).addMetadata(Expr);
}
@@ -2134,7 +2137,7 @@ MachineInstrBuilder llvm::BuildMI(MachineFunction &MF, const DebugLoc &DL,
if (IsIndirect)
MIB.addImm(0U);
else
- MIB.addReg(0U, RegState::Debug);
+ MIB.addReg(0U);
return MIB.addMetadata(Variable).addMetadata(Expr);
}
@@ -2153,7 +2156,7 @@ MachineInstrBuilder llvm::BuildMI(MachineFunction &MF, const DebugLoc &DL,
MIB.addMetadata(Variable).addMetadata(Expr);
for (const MachineOperand &MO : MOs)
if (MO.isReg())
- MIB.addReg(MO.getReg(), RegState::Debug);
+ MIB.addReg(MO.getReg());
else
MIB.add(MO);
return MIB;
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 883299c452b7..500cf8e0b79b 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -230,6 +230,9 @@ namespace {
bool IsGuaranteedToExecute(MachineBasicBlock *BB);
+ bool isTriviallyReMaterializable(const MachineInstr &MI,
+ AAResults *AA) const;
+
void EnterScope(MachineBasicBlock *MBB);
void ExitScope(MachineBasicBlock *MBB);
@@ -659,6 +662,23 @@ bool MachineLICMBase::IsGuaranteedToExecute(MachineBasicBlock *BB) {
return true;
}
+/// Check if \p MI is trivially rematerializable and if it does not have any
+/// virtual register uses. Even though it is rematerializable, RA might not
+/// actually rematerialize it in this scenario. In that case we do not want to
+/// hoist such an instruction out of the loop in the belief that RA will sink
+/// it back if needed.
+bool MachineLICMBase::isTriviallyReMaterializable(const MachineInstr &MI,
+ AAResults *AA) const {
+ if (!TII->isTriviallyReMaterializable(MI, AA))
+ return false;
+
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.isUse() && MO.getReg().isVirtual())
+ return false;
+ }
+
+ return true;
+}
+
void MachineLICMBase::EnterScope(MachineBasicBlock *MBB) {
LLVM_DEBUG(dbgs() << "Entering " << printMBBReference(*MBB) << '\n');
@@ -761,15 +781,11 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN) {
// Process the block
SpeculationState = SpeculateUnknown;
- for (MachineBasicBlock::iterator
- MII = MBB->begin(), E = MBB->end(); MII != E; ) {
- MachineBasicBlock::iterator NextMII = MII; ++NextMII;
- MachineInstr *MI = &*MII;
- if (!Hoist(MI, Preheader))
- UpdateRegPressure(MI);
+ for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) {
+ if (!Hoist(&MI, Preheader))
+ UpdateRegPressure(&MI);
// If we have hoisted an instruction that may store, it can only be a
// constant store.
- MII = NextMII;
}
// If it's a leaf node, it's done. Traverse upwards to pop ancestors.
@@ -1156,9 +1172,9 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI) {
return false;
}
- // Rematerializable instructions should always be hoisted since the register
- // allocator can just pull them down again when needed.
- if (TII->isTriviallyReMaterializable(MI, AA))
+ // Rematerializable instructions should always be hoisted, provided the
+ // register allocator can just pull them down again when needed.
+ if (isTriviallyReMaterializable(MI, AA))
return true;
// FIXME: If there are long latency loop-invariant instructions inside the
@@ -1211,7 +1227,7 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI) {
// High register pressure situation, only hoist if the instruction is going
// to be remat'ed.
- if (!TII->isTriviallyReMaterializable(MI, AA) &&
+ if (!isTriviallyReMaterializable(MI, AA) &&
!MI.isDereferenceableInvariantLoad(AA)) {
LLVM_DEBUG(dbgs() << "Can't remat / high reg-pressure: " << MI);
return false;
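The new MachineLICMBase::isTriviallyReMaterializable wrapper only reports an instruction as rematerializable for hoisting when it also has no virtual register uses, since the register allocator may never sink it back otherwise. A minimal sketch of that check under simplified stand-in types (TargetSaysRemat stands in for the TII->isTriviallyReMaterializable query):

```cpp
// Minimal sketch (simplified model, not the LLVM classes): only treat an
// instruction as remat-for-hoisting when the target says it is trivially
// rematerializable *and* it reads no virtual registers.
#include <vector>

struct Op {
  bool IsReg = false;
  bool IsUse = false;
  bool IsVirtual = false;
};

struct Instr {
  bool TargetSaysRemat = false; // stand-in for TII->isTriviallyReMaterializable
  std::vector<Op> Operands;
};

static bool isTriviallyReMaterializableForHoist(const Instr &MI) {
  if (!MI.TargetSaysRemat)
    return false;
  for (const Op &MO : MI.Operands)
    if (MO.IsReg && MO.IsUse && MO.IsVirtual)
      return false; // RA might never sink it back; do not hoist on that hope
  return true;
}
```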
diff --git a/llvm/lib/CodeGen/MachineLoopInfo.cpp b/llvm/lib/CodeGen/MachineLoopInfo.cpp
index 8f91a5b698d0..9b96bc5e5e7f 100644
--- a/llvm/lib/CodeGen/MachineLoopInfo.cpp
+++ b/llvm/lib/CodeGen/MachineLoopInfo.cpp
@@ -18,6 +18,7 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/InitializePasses.h"
@@ -154,7 +155,9 @@ MachineLoopInfo::findLoopPreheader(MachineLoop *L, bool SpeculativePreheader,
bool MachineLoop::isLoopInvariant(MachineInstr &I) const {
MachineFunction *MF = I.getParent()->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
- const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ const TargetSubtargetInfo &ST = MF->getSubtarget();
+ const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+ const TargetInstrInfo *TII = ST.getInstrInfo();
// The instruction is loop invariant if all of its operands are.
for (const MachineOperand &MO : I.operands()) {
@@ -174,7 +177,8 @@ bool MachineLoop::isLoopInvariant(MachineInstr &I) const {
// However, if the physreg is known to always be caller saved/restored
// then this use is safe to hoist.
if (!MRI->isConstantPhysReg(Reg) &&
- !(TRI->isCallerPreservedPhysReg(Reg.asMCReg(), *I.getMF())))
+ !(TRI->isCallerPreservedPhysReg(Reg.asMCReg(), *I.getMF())) &&
+ !TII->isIgnorableUse(MO))
return false;
// Otherwise it's safe to move.
continue;
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index b8ba0453d24c..4d080e1a4f82 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -250,6 +250,11 @@ void MachineOperand::ChangeToRegister(Register Reg, bool isDef, bool isImp,
if (RegInfo && WasReg)
RegInfo->removeRegOperandFromUseList(this);
+ // Ensure debug instructions set debug flag on register uses.
+ const MachineInstr *MI = getParent();
+ if (!isDef && MI && MI->isDebugInstr())
+ isDebug = true;
+
// Change this to a register and set the reg#.
assert(!(isDead && !isDef) && "Dead flag on non-def");
assert(!(isKill && isDef) && "Kill flag on def");
diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index 1d55bd00e033..cfbccebaff3e 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -798,6 +798,7 @@ bool MachineOutliner::outline(Module &M,
Last = std::next(CallInst.getReverse());
Iter != Last; Iter++) {
MachineInstr *MI = &*Iter;
+ SmallSet<Register, 2> InstrUseRegs;
for (MachineOperand &MOP : MI->operands()) {
// Skip over anything that isn't a register.
if (!MOP.isReg())
@@ -806,7 +807,8 @@ bool MachineOutliner::outline(Module &M,
if (MOP.isDef()) {
// Introduce DefRegs set to skip the redundant register.
DefRegs.insert(MOP.getReg());
- if (!MOP.isDead() && UseRegs.count(MOP.getReg()))
+ if (UseRegs.count(MOP.getReg()) &&
+ !InstrUseRegs.count(MOP.getReg()))
// Since the register is modeled as defined,
// it does not need to be put in the use register set.
UseRegs.erase(MOP.getReg());
@@ -814,6 +816,7 @@ bool MachineOutliner::outline(Module &M,
// Any register which is not undefined should
// be put in the use register set.
UseRegs.insert(MOP.getReg());
+ InstrUseRegs.insert(MOP.getReg());
}
}
if (MI->isCandidateForCallSiteEntry())
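The InstrUseRegs set above keeps a register in the pending-use set when the very same instruction both reads and writes it, so a def now only cancels uses recorded by other instructions. A small sketch of the updated bookkeeping with ordinary std::set containers and hypothetical register numbers:

```cpp
// Minimal sketch (hypothetical register ids, simplified operand walk): a def
// only removes a register from the pending-use set if the use came from a
// *different* instruction.
#include <set>
#include <vector>

struct Operand { unsigned Reg; bool IsDef; };
using Instr = std::vector<Operand>;

static void updateUseSet(const Instr &MI, std::set<unsigned> &UseRegs) {
  std::set<unsigned> InstrUseRegs; // uses seen in this instruction only
  for (const Operand &MOP : MI) {
    if (MOP.IsDef) {
      if (UseRegs.count(MOP.Reg) && !InstrUseRegs.count(MOP.Reg))
        UseRegs.erase(MOP.Reg); // the def covers a use from another instruction
    } else {
      UseRegs.insert(MOP.Reg);
      InstrUseRegs.insert(MOP.Reg);
    }
  }
}
```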
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index caa3f8049aeb..e18318386def 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -200,8 +200,7 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
if (!EnableSWP)
return false;
- if (mf.getFunction().getAttributes().hasAttribute(
- AttributeList::FunctionIndex, Attribute::OptimizeForSize) &&
+ if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
!EnableSWPOptSize.getPosition())
return false;
@@ -386,7 +385,7 @@ void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
MachineRegisterInfo &MRI = MF->getRegInfo();
SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
- for (MachineInstr &PI : make_range(B.begin(), B.getFirstNonPHI())) {
+ for (MachineInstr &PI : B.phis()) {
MachineOperand &DefOp = PI.getOperand(0);
assert(DefOp.getSubReg() == 0);
auto *RC = MRI.getRegClass(DefOp.getReg());
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index 3f6b11e072b4..19bf87d3e290 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -383,9 +383,7 @@ void MachineRegisterInfo::replaceRegWith(Register FromReg, Register ToReg) {
const TargetRegisterInfo *TRI = getTargetRegisterInfo();
// TODO: This could be more efficient by bulk changing the operands.
- for (reg_iterator I = reg_begin(FromReg), E = reg_end(); I != E; ) {
- MachineOperand &O = *I;
- ++I;
+ for (MachineOperand &O : llvm::make_early_inc_range(reg_operands(FromReg))) {
if (Register::isPhysicalRegister(ToReg)) {
O.substPhysReg(ToReg, *TRI);
} else {
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 4f42a2c8aeff..47d40f0823c8 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -583,7 +583,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
<< " " << MBB->getName() << "\n From: " << *I
<< " To: ";
if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
- else dbgs() << "End";
+ else dbgs() << "End\n";
dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
if (DumpCriticalPathLength) {
errs() << MF->getName();
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index ec98394dca79..30745c7a5583 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -131,7 +131,7 @@ namespace {
// will be split.
SetVector<std::pair<MachineBasicBlock *, MachineBasicBlock *>> ToSplit;
- SparseBitVector<> RegsToClearKillFlags;
+ DenseSet<Register> RegsToClearKillFlags;
using AllSuccsCache =
std::map<MachineBasicBlock *, SmallVector<MachineBasicBlock *, 4>>;
@@ -476,14 +476,13 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
// of a def-use chain, if there is any.
// TODO: Sort the candidates using a cost-model.
unsigned i = 0;
- for (auto It = Candidates.rbegin(); It != Candidates.rend(); ++It) {
+ for (MachineInstr *I : llvm::reverse(Candidates)) {
if (i++ == SinkIntoLoopLimit) {
LLVM_DEBUG(dbgs() << "LoopSink: Limit reached of instructions to "
"be analysed.");
break;
}
- MachineInstr *I = *It;
if (!SinkIntoLoop(L, *I))
break;
EverMadeChange = true;
@@ -683,13 +682,9 @@ bool MachineSinking::PostponeSplitCriticalEdge(MachineInstr &MI,
// There is no need to do this check if all the uses are PHI nodes. PHI
// sources are only defined on the specific predecessor edges.
if (!BreakPHIEdge) {
- for (MachineBasicBlock::pred_iterator PI = ToBB->pred_begin(),
- E = ToBB->pred_end(); PI != E; ++PI) {
- if (*PI == FromBB)
- continue;
- if (!DT->dominates(ToBB, *PI))
+ for (MachineBasicBlock *Pred : ToBB->predecessors())
+ if (Pred != FromBB && !DT->dominates(ToBB, Pred))
return false;
- }
}
ToSplit.insert(std::make_pair(FromBB, ToBB));
@@ -1329,7 +1324,8 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
// "zombie" define of that preg. E.g., EFLAGS. (<rdar://problem/8030636>)
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
const MachineOperand &MO = MI.getOperand(I);
- if (!MO.isReg()) continue;
+ if (!MO.isReg() || MO.isUse())
+ continue;
Register Reg = MO.getReg();
if (Reg == 0 || !Register::isPhysicalRegister(Reg))
continue;
@@ -1439,7 +1435,7 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
// used registers.
for (MachineOperand &MO : MI.operands()) {
if (MO.isReg() && MO.isUse())
- RegsToClearKillFlags.set(MO.getReg()); // Remember to clear kill flags.
+ RegsToClearKillFlags.insert(MO.getReg()); // Remember to clear kill flags.
}
return true;
@@ -1718,10 +1714,7 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
UsedRegUnits.clear();
SeenDbgInstrs.clear();
- for (auto I = CurBB.rbegin(), E = CurBB.rend(); I != E;) {
- MachineInstr *MI = &*I;
- ++I;
-
+ for (MachineInstr &MI : llvm::make_early_inc_range(llvm::reverse(CurBB))) {
// Track the operand index for use in Copy.
SmallVector<unsigned, 2> UsedOpsInCopy;
// Track the register number defed in Copy.
@@ -1729,14 +1722,14 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
// We must sink this DBG_VALUE if its operand is sunk. To avoid searching
// for DBG_VALUEs later, record them when they're encountered.
- if (MI->isDebugValue()) {
+ if (MI.isDebugValue()) {
SmallDenseMap<MCRegister, SmallVector<unsigned, 2>, 4> MIUnits;
bool IsValid = true;
- for (MachineOperand &MO : MI->debug_operands()) {
+ for (MachineOperand &MO : MI.debug_operands()) {
if (MO.isReg() && Register::isPhysicalRegister(MO.getReg())) {
// Bail if we can already tell the sink would be rejected, rather
// than needlessly accumulating lots of DBG_VALUEs.
- if (hasRegisterDependency(MI, UsedOpsInCopy, DefedRegsInCopy,
+ if (hasRegisterDependency(&MI, UsedOpsInCopy, DefedRegsInCopy,
ModifiedRegUnits, UsedRegUnits)) {
IsValid = false;
break;
@@ -1750,28 +1743,28 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
}
if (IsValid) {
for (auto RegOps : MIUnits)
- SeenDbgInstrs[RegOps.first].push_back({MI, RegOps.second});
+ SeenDbgInstrs[RegOps.first].push_back({&MI, RegOps.second});
}
continue;
}
- if (MI->isDebugOrPseudoInstr())
+ if (MI.isDebugOrPseudoInstr())
continue;
// Do not move any instruction across function call.
- if (MI->isCall())
+ if (MI.isCall())
return false;
- if (!MI->isCopy() || !MI->getOperand(0).isRenamable()) {
- LiveRegUnits::accumulateUsedDefed(*MI, ModifiedRegUnits, UsedRegUnits,
+ if (!MI.isCopy() || !MI.getOperand(0).isRenamable()) {
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
TRI);
continue;
}
// Don't sink the COPY if it would violate a register dependency.
- if (hasRegisterDependency(MI, UsedOpsInCopy, DefedRegsInCopy,
+ if (hasRegisterDependency(&MI, UsedOpsInCopy, DefedRegsInCopy,
ModifiedRegUnits, UsedRegUnits)) {
- LiveRegUnits::accumulateUsedDefed(*MI, ModifiedRegUnits, UsedRegUnits,
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
TRI);
continue;
}
@@ -1782,7 +1775,7 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
// Don't sink if we cannot find a single sinkable successor in which Reg
// is live-in.
if (!SuccBB) {
- LiveRegUnits::accumulateUsedDefed(*MI, ModifiedRegUnits, UsedRegUnits,
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
TRI);
continue;
}
@@ -1793,7 +1786,7 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
// recorded which reg units that DBG_VALUEs read, if this instruction
// writes any of those units then the corresponding DBG_VALUEs must sink.
MapVector<MachineInstr *, MIRegs::second_type> DbgValsToSinkMap;
- for (auto &MO : MI->operands()) {
+ for (auto &MO : MI.operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
@@ -1811,10 +1804,10 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
// Clear the kill flag if SrcReg is killed between MI and the end of the
// block.
- clearKillFlags(MI, CurBB, UsedOpsInCopy, UsedRegUnits, TRI);
+ clearKillFlags(&MI, CurBB, UsedOpsInCopy, UsedRegUnits, TRI);
MachineBasicBlock::iterator InsertPos = SuccBB->getFirstNonPHI();
- performSink(*MI, *SuccBB, InsertPos, DbgValsToSink);
- updateLiveIn(MI, SuccBB, UsedOpsInCopy, DefedRegsInCopy);
+ performSink(MI, *SuccBB, InsertPos, DbgValsToSink);
+ updateLiveIn(&MI, SuccBB, UsedOpsInCopy, DefedRegsInCopy);
Changed = true;
++NumPostRACopySink;
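RegsToClearKillFlags switches from a SparseBitVector to a DenseSet<Register>, which only needs membership and iteration and copes better with large, sparse virtual register numbers. A rough sketch of that bookkeeping, with std::unordered_set standing in for llvm::DenseSet:

```cpp
// Minimal sketch (std::unordered_set standing in for llvm::DenseSet): remember
// which registers had an instruction sunk past a kill, then clear the stale
// kill flags once per register afterwards.
#include <cstdint>
#include <unordered_set>

using Register = uint32_t; // assumption: a plain integer id for illustration

struct KillFlagFixups {
  std::unordered_set<Register> RegsToClearKillFlags;

  void noteSunkUse(Register R) { RegsToClearKillFlags.insert(R); }

  template <typename ClearFn> void flush(ClearFn ClearKillFlags) {
    for (Register R : RegsToClearKillFlags)
      ClearKillFlags(R); // e.g. MRI->clearKillFlags(R) in the real pass
    RegsToClearKillFlags.clear();
  }
};
```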
diff --git a/llvm/lib/CodeGen/MachineSizeOpts.cpp b/llvm/lib/CodeGen/MachineSizeOpts.cpp
index 584d43b42004..28712d1a816b 100644
--- a/llvm/lib/CodeGen/MachineSizeOpts.cpp
+++ b/llvm/lib/CodeGen/MachineSizeOpts.cpp
@@ -82,7 +82,7 @@ bool isFunctionColdInCallGraph(
ProfileSummaryInfo *PSI,
const MachineBlockFrequencyInfo &MBFI) {
if (auto FunctionCount = MF->getFunction().getEntryCount())
- if (!PSI->isColdCount(FunctionCount.getCount()))
+ if (!PSI->isColdCount(FunctionCount->getCount()))
return false;
for (const auto &MBB : *MF)
if (!isColdBlock(&MBB, PSI, &MBFI))
@@ -99,7 +99,7 @@ bool isFunctionHotInCallGraphNthPercentile(
const MachineBlockFrequencyInfo &MBFI) {
if (auto FunctionCount = MF->getFunction().getEntryCount())
if (PSI->isHotCountNthPercentile(PercentileCutoff,
- FunctionCount.getCount()))
+ FunctionCount->getCount()))
return true;
for (const auto &MBB : *MF)
if (isHotBlockNthPercentile(PercentileCutoff, &MBB, PSI, &MBFI))
@@ -112,7 +112,7 @@ bool isFunctionColdInCallGraphNthPercentile(
const MachineBlockFrequencyInfo &MBFI) {
if (auto FunctionCount = MF->getFunction().getEntryCount())
if (!PSI->isColdCountNthPercentile(PercentileCutoff,
- FunctionCount.getCount()))
+ FunctionCount->getCount()))
return false;
for (const auto &MBB : *MF)
if (!isColdBlockNthPercentile(PercentileCutoff, &MBB, PSI, &MBFI))
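The .getCount() to ->getCount() changes follow the entry count now being handed back behind an optional-like wrapper, so the value is only dereferenced when a profile is actually present. A small analogy using std::optional (not the real ProfileCount API, and the cold threshold is made up):

```cpp
// Minimal sketch (std::optional as an analogy, not the LLVM ProfileCount API):
// the entry count is only consulted when present, and members are reached
// through the wrapper with operator->.
#include <cstdint>
#include <optional>

struct ProfileCount {
  uint64_t Count;
  uint64_t getCount() const { return Count; }
};

static bool isColdCount(uint64_t C) { return C < 100; } // hypothetical threshold

static bool entryLooksCold(std::optional<ProfileCount> FunctionCount) {
  if (FunctionCount)
    return isColdCount(FunctionCount->getCount());
  return false; // unknown profile: make no claim
}
```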
diff --git a/llvm/lib/CodeGen/MachineStripDebug.cpp b/llvm/lib/CodeGen/MachineStripDebug.cpp
index a1cb12f91275..86cf4999d4b0 100644
--- a/llvm/lib/CodeGen/MachineStripDebug.cpp
+++ b/llvm/lib/CodeGen/MachineStripDebug.cpp
@@ -50,29 +50,26 @@ struct StripDebugMachineModule : public ModulePass {
continue;
MachineFunction &MF = *MaybeMF;
for (MachineBasicBlock &MBB : MF) {
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E;) {
- if (I->isDebugInstr()) {
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+ if (MI.isDebugInstr()) {
// FIXME: We should remove all of them. However, AArch64 emits an
// invalid `DBG_VALUE $lr` with only one operand instead of
// the usual three and has a test that depends on its
// preservation. Preserve it for now.
- if (I->getNumOperands() > 1) {
- LLVM_DEBUG(dbgs() << "Removing debug instruction " << *I);
- I = MBB.erase(I);
+ if (MI.getNumOperands() > 1) {
+ LLVM_DEBUG(dbgs() << "Removing debug instruction " << MI);
+ MBB.erase(&MI);
Changed |= true;
continue;
}
}
- if (I->getDebugLoc()) {
- LLVM_DEBUG(dbgs() << "Removing location " << *I);
- I->setDebugLoc(DebugLoc());
+ if (MI.getDebugLoc()) {
+ LLVM_DEBUG(dbgs() << "Removing location " << MI);
+ MI.setDebugLoc(DebugLoc());
Changed |= true;
- ++I;
continue;
}
- LLVM_DEBUG(dbgs() << "Keeping " << *I);
- ++I;
+ LLVM_DEBUG(dbgs() << "Keeping " << MI);
}
}
}
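Rewriting the loop over llvm::make_early_inc_range lets the body erase the current instruction without hand-maintaining a saved "next" iterator. The pattern the helper packages up, shown on a plain std::list standing in for a MachineBasicBlock:

```cpp
// Minimal sketch of the erase-while-iterating pattern that
// llvm::make_early_inc_range encapsulates: advance the iterator before the
// current element may be erased.
#include <list>

struct Instr { bool IsDebug = false; };

static void stripDebugInstrs(std::list<Instr> &MBB) {
  for (auto I = MBB.begin(), E = MBB.end(); I != E;) {
    auto Cur = I++;        // "early increment": I already points past Cur
    if (Cur->IsDebug)
      MBB.erase(Cur);      // safe: erasing Cur does not invalidate I
  }
}
```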
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index 7e3198af02cd..d6bb3e7c9e58 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -210,6 +210,11 @@ namespace {
void visitMachineBasicBlockBefore(const MachineBasicBlock *MBB);
void visitMachineBundleBefore(const MachineInstr *MI);
+ /// Verify that all of \p MI's virtual register operands are scalars.
+ /// \returns True if all virtual register operands are scalar. False
+ /// otherwise.
+ bool verifyAllRegOpsScalar(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI);
bool verifyVectorElementMatch(LLT Ty0, LLT Ty1, const MachineInstr *MI);
void verifyPreISelGenericInstruction(const MachineInstr *MI);
void visitMachineInstrBefore(const MachineInstr *MI);
@@ -287,6 +292,13 @@ namespace {
}
bool runOnMachineFunction(MachineFunction &MF) override {
+ // Skip functions that have known verification problems.
+ // FIXME: Remove this mechanism when all problematic passes have been
+ // fixed.
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailsVerification))
+ return false;
+
unsigned FoundErrors = MachineVerifier(this, Banner.c_str()).verify(MF);
if (FoundErrors)
report_fatal_error("Found "+Twine(FoundErrors)+" machine code errors.");
@@ -849,6 +861,21 @@ void MachineVerifier::verifyInlineAsm(const MachineInstr *MI) {
}
}
+bool MachineVerifier::verifyAllRegOpsScalar(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) {
+ if (none_of(MI.explicit_operands(), [&MRI](const MachineOperand &Op) {
+ if (!Op.isReg())
+ return false;
+ const auto Reg = Op.getReg();
+ if (Reg.isPhysical())
+ return false;
+ return !MRI.getType(Reg).isScalar();
+ }))
+ return true;
+ report("All register operands must have scalar types", &MI);
+ return false;
+}
+
/// Check that types are consistent when two operands need to have the same
/// number of vector elements.
/// \return true if the types are valid.
@@ -1392,7 +1419,7 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
AttributeList Attrs
= Intrinsic::getAttributes(MF->getFunction().getContext(),
static_cast<Intrinsic::ID>(IntrID));
- bool DeclHasSideEffects = !Attrs.hasFnAttribute(Attribute::ReadNone);
+ bool DeclHasSideEffects = !Attrs.hasFnAttr(Attribute::ReadNone);
if (NoSideEffects && DeclHasSideEffects) {
report("G_INTRINSIC used with intrinsic that accesses memory", MI);
break;
@@ -1570,11 +1597,8 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
case TargetOpcode::G_VECREDUCE_UMAX:
case TargetOpcode::G_VECREDUCE_UMIN: {
LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
- LLT SrcTy = MRI->getType(MI->getOperand(1).getReg());
if (!DstTy.isScalar())
report("Vector reduction requires a scalar destination type", MI);
- if (!SrcTy.isVector())
- report("Vector reduction requires vector source=", MI);
break;
}
@@ -1598,7 +1622,11 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
}
break;
}
-
+ case TargetOpcode::G_LLROUND:
+ case TargetOpcode::G_LROUND: {
+ verifyAllRegOpsScalar(*MI, *MRI);
+ break;
+ }
default:
break;
}
@@ -1632,6 +1660,8 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
report("Unspillable Terminator does not define a reg", MI);
Register Def = MI->getOperand(0).getReg();
if (Def.isVirtual() &&
+ !MF->getProperties().hasProperty(
+ MachineFunctionProperties::Property::NoPHIs) &&
std::distance(MRI->use_nodbg_begin(Def), MRI->use_nodbg_end()) > 1)
report("Unspillable Terminator expected to have at most one use!", MI);
}
@@ -1866,6 +1896,15 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
switch (MO->getType()) {
case MachineOperand::MO_Register: {
+ // Verify debug flag on debug instructions. Check this first because reg0
+ // indicates an undefined debug value.
+ if (MI->isDebugInstr() && MO->isUse()) {
+ if (!MO->isDebug())
+ report("Register operand must be marked debug", MO, MONum);
+ } else if (MO->isDebug()) {
+ report("Register operand must not be marked debug", MO, MONum);
+ }
+
const Register Reg = MO->getReg();
if (!Reg)
return;
@@ -1932,10 +1971,6 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
return;
}
}
- if (MI->isDebugValue() && MO->isUse() && !MO->isDebug()) {
- report("Use-reg is not IsDebug in a DBG_VALUE", MO, MONum);
- return;
- }
} else {
// Virtual register.
const TargetRegisterClass *RC = MRI->getRegClassOrNull(Reg);
@@ -2182,14 +2217,30 @@ void MachineVerifier::checkLivenessAtDef(const MachineOperand *MO,
void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
const MachineInstr *MI = MO->getParent();
const Register Reg = MO->getReg();
+ const unsigned SubRegIdx = MO->getSubReg();
+
+ const LiveInterval *LI = nullptr;
+ if (LiveInts && Reg.isVirtual()) {
+ if (LiveInts->hasInterval(Reg)) {
+ LI = &LiveInts->getInterval(Reg);
+ if (SubRegIdx != 0 && !LI->empty() && !LI->hasSubRanges() &&
+ MRI->shouldTrackSubRegLiveness(Reg))
+ report("Live interval for subreg operand has no subranges", MO, MONum);
+ } else {
+ report("Virtual register has no live interval", MO, MONum);
+ }
+ }
// Both use and def operands can read a register.
if (MO->readsReg()) {
if (MO->isKill())
addRegWithSubRegs(regsKilled, Reg);
- // Check that LiveVars knows this kill.
- if (LiveVars && Register::isVirtualRegister(Reg) && MO->isKill()) {
+ // Check that LiveVars knows this kill (unless we are inside a bundle, in
+ // which case we have already checked that LiveVars knows any kills on the
+ // bundle header instead).
+ if (LiveVars && Reg.isVirtual() && MO->isKill() &&
+ !MI->isBundledWithPred()) {
LiveVariables::VarInfo &VI = LiveVars->getVarInfo(Reg);
if (!is_contained(VI.Kills, MI))
report("Kill missing from LiveVariables", MO, MONum);
@@ -2209,42 +2260,36 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
}
}
- if (Register::isVirtualRegister(Reg)) {
- if (LiveInts->hasInterval(Reg)) {
- // This is a virtual register interval.
- const LiveInterval &LI = LiveInts->getInterval(Reg);
- checkLivenessAtUse(MO, MONum, UseIdx, LI, Reg);
-
- if (LI.hasSubRanges() && !MO->isDef()) {
- unsigned SubRegIdx = MO->getSubReg();
- LaneBitmask MOMask = SubRegIdx != 0
- ? TRI->getSubRegIndexLaneMask(SubRegIdx)
- : MRI->getMaxLaneMaskForVReg(Reg);
- LaneBitmask LiveInMask;
- for (const LiveInterval::SubRange &SR : LI.subranges()) {
- if ((MOMask & SR.LaneMask).none())
- continue;
- checkLivenessAtUse(MO, MONum, UseIdx, SR, Reg, SR.LaneMask);
- LiveQueryResult LRQ = SR.Query(UseIdx);
- if (LRQ.valueIn())
- LiveInMask |= SR.LaneMask;
- }
- // At least parts of the register has to be live at the use.
- if ((LiveInMask & MOMask).none()) {
- report("No live subrange at use", MO, MONum);
- report_context(LI);
- report_context(UseIdx);
- }
+ if (Reg.isVirtual()) {
+ // This is a virtual register interval.
+ checkLivenessAtUse(MO, MONum, UseIdx, *LI, Reg);
+
+ if (LI->hasSubRanges() && !MO->isDef()) {
+ LaneBitmask MOMask = SubRegIdx != 0
+ ? TRI->getSubRegIndexLaneMask(SubRegIdx)
+ : MRI->getMaxLaneMaskForVReg(Reg);
+ LaneBitmask LiveInMask;
+ for (const LiveInterval::SubRange &SR : LI->subranges()) {
+ if ((MOMask & SR.LaneMask).none())
+ continue;
+ checkLivenessAtUse(MO, MONum, UseIdx, SR, Reg, SR.LaneMask);
+ LiveQueryResult LRQ = SR.Query(UseIdx);
+ if (LRQ.valueIn())
+ LiveInMask |= SR.LaneMask;
+ }
+ // At least parts of the register have to be live at the use.
+ if ((LiveInMask & MOMask).none()) {
+ report("No live subrange at use", MO, MONum);
+ report_context(*LI);
+ report_context(UseIdx);
}
- } else {
- report("Virtual register has no live interval", MO, MONum);
}
}
}
// Use of a dead register.
if (!regsLive.count(Reg)) {
- if (Register::isPhysicalRegister(Reg)) {
+ if (Reg.isPhysical()) {
// Reserved registers may be used even when 'dead'.
bool Bad = !isReserved(Reg);
// We are fine if just any subregister has a defined value.
@@ -2266,7 +2311,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
if (!MOP.isReg() || !MOP.isImplicit())
continue;
- if (!Register::isPhysicalRegister(MOP.getReg()))
+ if (!MOP.getReg().isPhysical())
continue;
if (llvm::is_contained(TRI->subregs(MOP.getReg()), Reg))
@@ -2299,7 +2344,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
addRegWithSubRegs(regsDefined, Reg);
// Verify SSA form.
- if (MRI->isSSA() && Register::isVirtualRegister(Reg) &&
+ if (MRI->isSSA() && Reg.isVirtual() &&
std::next(MRI->def_begin(Reg)) != MRI->def_end())
report("Multiple virtual register defs in SSA form", MO, MONum);
@@ -2308,24 +2353,18 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
SlotIndex DefIdx = LiveInts->getInstructionIndex(*MI);
DefIdx = DefIdx.getRegSlot(MO->isEarlyClobber());
- if (Register::isVirtualRegister(Reg)) {
- if (LiveInts->hasInterval(Reg)) {
- const LiveInterval &LI = LiveInts->getInterval(Reg);
- checkLivenessAtDef(MO, MONum, DefIdx, LI, Reg);
-
- if (LI.hasSubRanges()) {
- unsigned SubRegIdx = MO->getSubReg();
- LaneBitmask MOMask = SubRegIdx != 0
- ? TRI->getSubRegIndexLaneMask(SubRegIdx)
- : MRI->getMaxLaneMaskForVReg(Reg);
- for (const LiveInterval::SubRange &SR : LI.subranges()) {
- if ((SR.LaneMask & MOMask).none())
- continue;
- checkLivenessAtDef(MO, MONum, DefIdx, SR, Reg, true, SR.LaneMask);
- }
+ if (Reg.isVirtual()) {
+ checkLivenessAtDef(MO, MONum, DefIdx, *LI, Reg);
+
+ if (LI->hasSubRanges()) {
+ LaneBitmask MOMask = SubRegIdx != 0
+ ? TRI->getSubRegIndexLaneMask(SubRegIdx)
+ : MRI->getMaxLaneMaskForVReg(Reg);
+ for (const LiveInterval::SubRange &SR : LI->subranges()) {
+ if ((SR.LaneMask & MOMask).none())
+ continue;
+ checkLivenessAtDef(MO, MONum, DefIdx, SR, Reg, true, SR.LaneMask);
}
- } else {
- report("Virtual register has no Live interval", MO, MONum);
}
}
}
@@ -2918,9 +2957,13 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
}
}
- // A live segment can only end at an early-clobber slot if it is being
- // redefined by an early-clobber def.
- if (S.end.isEarlyClobber()) {
+ // After tied operands are rewritten, a live segment can only end at an
+ // early-clobber slot if it is being redefined by an early-clobber def.
+ // TODO: Before tied operands are rewritten, a live segment can only end at an
+ // early-clobber slot if the last use is tied to an early-clobber def.
+ if (MF->getProperties().hasProperty(
+ MachineFunctionProperties::Property::TiedOpsRewritten) &&
+ S.end.isEarlyClobber()) {
if (I+1 == LR.end() || (I+1)->start != S.end) {
report("Live segment ending at early clobber slot must be "
"redefined by an EC def in the same instruction", EndMBB);
diff --git a/llvm/lib/CodeGen/MacroFusion.cpp b/llvm/lib/CodeGen/MacroFusion.cpp
index d2ee21c8720f..b0760322064c 100644
--- a/llvm/lib/CodeGen/MacroFusion.cpp
+++ b/llvm/lib/CodeGen/MacroFusion.cpp
@@ -44,15 +44,15 @@ static SUnit *getPredClusterSU(const SUnit &SU) {
return nullptr;
}
-static bool hasLessThanNumFused(const SUnit &SU, unsigned FuseLimit) {
+bool llvm::hasLessThanNumFused(const SUnit &SU, unsigned FuseLimit) {
unsigned Num = 1;
const SUnit *CurrentSU = &SU;
while ((CurrentSU = getPredClusterSU(*CurrentSU)) && Num < FuseLimit) Num ++;
return Num < FuseLimit;
}
-static bool fuseInstructionPair(ScheduleDAGInstrs &DAG, SUnit &FirstSU,
- SUnit &SecondSU) {
+bool llvm::fuseInstructionPair(ScheduleDAGInstrs &DAG, SUnit &FirstSU,
+ SUnit &SecondSU) {
// Check that neither instr is already paired with another along the edge
// between them.
for (SDep &SI : FirstSU.Succs)
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index b5517c40a28a..8b3cdfab4d42 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -81,10 +81,7 @@ void ModuloScheduleExpander::expand() {
Register Reg = Op.getReg();
unsigned MaxDiff = 0;
bool PhiIsSwapped = false;
- for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(Reg),
- EI = MRI.use_end();
- UI != EI; ++UI) {
- MachineOperand &UseOp = *UI;
+ for (MachineOperand &UseOp : MRI.use_operands(Reg)) {
MachineInstr *UseMI = UseOp.getParent();
int UseStage = Schedule.getStage(UseMI);
unsigned Diff = 0;
@@ -141,13 +138,11 @@ void ModuloScheduleExpander::generatePipelinedLoop() {
// Copy any terminator instructions to the new kernel, and update
// names as needed.
- for (MachineBasicBlock::iterator I = BB->getFirstTerminator(),
- E = BB->instr_end();
- I != E; ++I) {
- MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
+ for (MachineInstr &MI : BB->terminators()) {
+ MachineInstr *NewMI = MF.CloneMachineInstr(&MI);
updateInstruction(NewMI, false, MaxStageCount, 0, VRMap);
KernelBB->push_back(NewMI);
- InstrMap[NewMI] = &*I;
+ InstrMap[NewMI] = &MI;
}
NewKernel = KernelBB;
@@ -334,14 +329,10 @@ static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
MachineBasicBlock *MBB,
MachineRegisterInfo &MRI,
LiveIntervals &LIS) {
- for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg),
- E = MRI.use_end();
- I != E;) {
- MachineOperand &O = *I;
- ++I;
+ for (MachineOperand &O :
+ llvm::make_early_inc_range(MRI.use_operands(FromReg)))
if (O.getParent()->getParent() != MBB)
O.setReg(ToReg);
- }
if (!LIS.hasInterval(ToReg))
LIS.createEmptyInterval(ToReg);
}
@@ -350,10 +341,8 @@ static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
/// specified loop.
static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
MachineRegisterInfo &MRI) {
- for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
- E = MRI.use_end();
- I != E; ++I)
- if (I->getParent()->getParent() != BB)
+ for (const MachineOperand &MO : MRI.use_operands(Reg))
+ if (MO.getParent()->getParent() != BB)
return true;
return false;
}
@@ -702,11 +691,9 @@ void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB,
MBBVectorTy &EpilogBBs) {
// For each epilog block, check that the value defined by each instruction
// is used. If not, delete it.
- for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(),
- MBE = EpilogBBs.rend();
- MBB != MBE; ++MBB)
- for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(),
- ME = (*MBB)->instr_rend();
+ for (MachineBasicBlock *MBB : llvm::reverse(EpilogBBs))
+ for (MachineBasicBlock::reverse_instr_iterator MI = MBB->instr_rbegin(),
+ ME = MBB->instr_rend();
MI != ME;) {
// From DeadMachineInstructionElem. Don't delete inline assembly.
if (MI->isInlineAsm()) {
@@ -721,26 +708,22 @@ void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB,
continue;
}
bool used = true;
- for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
- MOE = MI->operands_end();
- MOI != MOE; ++MOI) {
- if (!MOI->isReg() || !MOI->isDef())
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg() || !MO.isDef())
continue;
- Register reg = MOI->getReg();
+ Register reg = MO.getReg();
// Assume physical registers are used, unless they are marked dead.
if (Register::isPhysicalRegister(reg)) {
- used = !MOI->isDead();
+ used = !MO.isDead();
if (used)
break;
continue;
}
unsigned realUses = 0;
- for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg),
- EI = MRI.use_end();
- UI != EI; ++UI) {
+ for (const MachineOperand &U : MRI.use_operands(reg)) {
// Check if there are any uses that occur only in the original
// loop. If so, that's not a real use.
- if (UI->getParent()->getParent() != BB) {
+ if (U.getParent()->getParent() != BB) {
realUses++;
used = true;
break;
@@ -759,15 +742,11 @@ void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB,
}
// In the kernel block, check if we can remove a Phi that generates a value
// used in an instruction removed in the epilog block.
- for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
- BBE = KernelBB->getFirstNonPHI();
- BBI != BBE;) {
- MachineInstr *MI = &*BBI;
- ++BBI;
- Register reg = MI->getOperand(0).getReg();
+ for (MachineInstr &MI : llvm::make_early_inc_range(KernelBB->phis())) {
+ Register reg = MI.getOperand(0).getReg();
if (MRI.use_begin(reg) == MRI.use_end()) {
- LIS.RemoveMachineInstrFromMaps(*MI);
- MI->eraseFromParent();
+ LIS.RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
}
}
}
@@ -1145,12 +1124,9 @@ void ModuloScheduleExpander::rewriteScheduledInstr(
int StagePhi = Schedule.getStage(Phi) + PhiNum;
// Rewrite uses that have been scheduled already to use the new
// Phi register.
- for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg),
- EI = MRI.use_end();
- UI != EI;) {
- MachineOperand &UseOp = *UI;
+ for (MachineOperand &UseOp :
+ llvm::make_early_inc_range(MRI.use_operands(OldReg))) {
MachineInstr *UseMI = UseOp.getParent();
- ++UI;
if (UseMI->getParent() != BB)
continue;
if (UseMI->isPHI()) {
@@ -1223,8 +1199,7 @@ void EliminateDeadPhis(MachineBasicBlock *MBB, MachineRegisterInfo &MRI,
bool Changed = true;
while (Changed) {
Changed = false;
- for (auto I = MBB->begin(); I != MBB->getFirstNonPHI();) {
- MachineInstr &MI = *I++;
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB->phis())) {
assert(MI.isPHI());
if (MRI.use_empty(MI.getOperand(0).getReg())) {
if (LIS)
@@ -1624,32 +1599,32 @@ void PeelingModuloScheduleExpander::moveStageBetweenBlocks(
MachineBasicBlock *DestBB, MachineBasicBlock *SourceBB, unsigned Stage) {
auto InsertPt = DestBB->getFirstNonPHI();
DenseMap<Register, Register> Remaps;
- for (auto I = SourceBB->getFirstNonPHI(); I != SourceBB->end();) {
- MachineInstr *MI = &*I++;
- if (MI->isPHI()) {
+ for (MachineInstr &MI : llvm::make_early_inc_range(
+ llvm::make_range(SourceBB->getFirstNonPHI(), SourceBB->end()))) {
+ if (MI.isPHI()) {
// This is an illegal PHI. If we move any instructions using an illegal
// PHI, we need to create a legal Phi.
- if (getStage(MI) != Stage) {
+ if (getStage(&MI) != Stage) {
// The legal Phi is not necessary if the illegal phi's stage
// is being moved.
- Register PhiR = MI->getOperand(0).getReg();
+ Register PhiR = MI.getOperand(0).getReg();
auto RC = MRI.getRegClass(PhiR);
Register NR = MRI.createVirtualRegister(RC);
MachineInstr *NI = BuildMI(*DestBB, DestBB->getFirstNonPHI(),
DebugLoc(), TII->get(TargetOpcode::PHI), NR)
.addReg(PhiR)
.addMBB(SourceBB);
- BlockMIs[{DestBB, CanonicalMIs[MI]}] = NI;
- CanonicalMIs[NI] = CanonicalMIs[MI];
+ BlockMIs[{DestBB, CanonicalMIs[&MI]}] = NI;
+ CanonicalMIs[NI] = CanonicalMIs[&MI];
Remaps[PhiR] = NR;
}
}
- if (getStage(MI) != Stage)
+ if (getStage(&MI) != Stage)
continue;
- MI->removeFromParent();
- DestBB->insert(InsertPt, MI);
- auto *KernelMI = CanonicalMIs[MI];
- BlockMIs[{DestBB, KernelMI}] = MI;
+ MI.removeFromParent();
+ DestBB->insert(InsertPt, &MI);
+ auto *KernelMI = CanonicalMIs[&MI];
+ BlockMIs[{DestBB, KernelMI}] = &MI;
BlockMIs.erase({SourceBB, KernelMI});
}
SmallVector<MachineInstr *, 4> PhiToDelete;
@@ -1768,8 +1743,8 @@ void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
// Keep track at which iteration each phi belongs to. We need it to know
// what version of the variable to use during prologue/epilogue stitching.
EliminateDeadPhis(B, MRI, LIS, /*KeepSingleSrcPhi=*/true);
- for (auto Phi = B->begin(), IE = B->getFirstNonPHI(); Phi != IE; ++Phi)
- PhiNodeLoopIteration[&*Phi] = Schedule.getNumStages() - I;
+ for (MachineInstr &Phi : B->phis())
+ PhiNodeLoopIteration[&Phi] = Schedule.getNumStages() - I;
}
for (size_t I = 0; I < Epilogs.size(); I++) {
LS.reset();
diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp
index 54805584dbc1..77a6c37e1362 100644
--- a/llvm/lib/CodeGen/PHIElimination.cpp
+++ b/llvm/lib/CodeGen/PHIElimination.cpp
@@ -107,6 +107,7 @@ namespace {
using BBVRegPair = std::pair<unsigned, Register>;
using VRegPHIUse = DenseMap<BBVRegPair, unsigned>;
+ // Count the number of non-undef PHI uses of each register in each BB.
VRegPHIUse VRegPHIUseCount;
// Defs of PHI sources which are implicit_def.
@@ -426,9 +427,13 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
}
// Adjust the VRegPHIUseCount map to account for the removal of this PHI node.
- for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2)
- --VRegPHIUseCount[BBVRegPair(MPhi->getOperand(i+1).getMBB()->getNumber(),
- MPhi->getOperand(i).getReg())];
+ for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) {
+ if (!MPhi->getOperand(i).isUndef()) {
+ --VRegPHIUseCount[BBVRegPair(
+ MPhi->getOperand(i + 1).getMBB()->getNumber(),
+ MPhi->getOperand(i).getReg())];
+ }
+ }
// Now loop over all of the incoming arguments, changing them to copy into the
// IncomingReg register in the corresponding predecessor basic block.
@@ -461,6 +466,15 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
assert(MRI->use_empty(SrcReg) &&
"Expected a single use from UnspillableTerminator");
SrcRegDef->getOperand(0).setReg(IncomingReg);
+
+ // Update LiveVariables.
+ if (LV) {
+ LiveVariables::VarInfo &SrcVI = LV->getVarInfo(SrcReg);
+ LiveVariables::VarInfo &IncomingVI = LV->getVarInfo(IncomingReg);
+ IncomingVI.AliveBlocks = std::move(SrcVI.AliveBlocks);
+ SrcVI.AliveBlocks.clear();
+ }
+
continue;
}
@@ -515,9 +529,8 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
// case, we should mark the last such terminator as being the killing
// block, not the copy.
MachineBasicBlock::iterator KillInst = opBlock.end();
- MachineBasicBlock::iterator FirstTerm = opBlock.getFirstTerminator();
- for (MachineBasicBlock::iterator Term = FirstTerm;
- Term != opBlock.end(); ++Term) {
+ for (MachineBasicBlock::iterator Term = InsertPos; Term != opBlock.end();
+ ++Term) {
if (Term->readsRegister(SrcReg))
KillInst = Term;
}
@@ -527,7 +540,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
if (reusedIncoming || !IncomingReg) {
// We may have to rewind a bit if we didn't insert a copy this time.
- KillInst = FirstTerm;
+ KillInst = InsertPos;
while (KillInst != opBlock.begin()) {
--KillInst;
if (KillInst->isDebugInstr())
@@ -574,9 +587,8 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
if (!isLiveOut) {
MachineBasicBlock::iterator KillInst = opBlock.end();
- MachineBasicBlock::iterator FirstTerm = opBlock.getFirstTerminator();
- for (MachineBasicBlock::iterator Term = FirstTerm;
- Term != opBlock.end(); ++Term) {
+ for (MachineBasicBlock::iterator Term = InsertPos;
+ Term != opBlock.end(); ++Term) {
if (Term->readsRegister(SrcReg))
KillInst = Term;
}
@@ -586,7 +598,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
if (reusedIncoming || !IncomingReg) {
// We may have to rewind a bit if we didn't just insert a copy.
- KillInst = FirstTerm;
+ KillInst = InsertPos;
while (KillInst != opBlock.begin()) {
--KillInst;
if (KillInst->isDebugInstr())
@@ -623,14 +635,19 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
/// used in a PHI node. We map that to the BB the vreg is coming from. This is
/// used later to determine when the vreg is killed in the BB.
void PHIElimination::analyzePHINodes(const MachineFunction& MF) {
- for (const auto &MBB : MF)
+ for (const auto &MBB : MF) {
for (const auto &BBI : MBB) {
if (!BBI.isPHI())
break;
- for (unsigned i = 1, e = BBI.getNumOperands(); i != e; i += 2)
- ++VRegPHIUseCount[BBVRegPair(BBI.getOperand(i+1).getMBB()->getNumber(),
- BBI.getOperand(i).getReg())];
+ for (unsigned i = 1, e = BBI.getNumOperands(); i != e; i += 2) {
+ if (!BBI.getOperand(i).isUndef()) {
+ ++VRegPHIUseCount[BBVRegPair(
+ BBI.getOperand(i + 1).getMBB()->getNumber(),
+ BBI.getOperand(i).getReg())];
+ }
+ }
}
+ }
}
bool PHIElimination::SplitPHIEdges(MachineFunction &MF,
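Both the initial count in analyzePHINodes and the later decrement in LowerPHINode now skip undef PHI operands, keeping the per-(block, vreg) counters balanced. A sketch of the counting side, with a std::map standing in for the DenseMap:

```cpp
// Minimal sketch (std::map standing in for the DenseMap): count PHI uses per
// (predecessor block, vreg) pair, skipping undef operands so the increment and
// the matching decrement stay balanced.
#include <map>
#include <utility>
#include <vector>

struct PhiIncoming { int PredBB; unsigned Reg; bool IsUndef; };
using BBVRegPair = std::pair<int, unsigned>;

static void countPhiUses(const std::vector<PhiIncoming> &Incoming,
                         std::map<BBVRegPair, unsigned> &VRegPHIUseCount) {
  for (const PhiIncoming &In : Incoming)
    if (!In.IsUndef)
      ++VRegPHIUseCount[{In.PredBB, In.Reg}];
}
```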
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index 49bdba518322..f9b16d2630d6 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -626,7 +626,7 @@ bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr &MI) {
// If this instruction is a comparison against zero and isn't comparing a
// physical register, we can try to optimize it.
Register SrcReg, SrcReg2;
- int CmpMask, CmpValue;
+ int64_t CmpMask, CmpValue;
if (!TII->analyzeCompare(MI, SrcReg, SrcReg2, CmpMask, CmpValue) ||
SrcReg.isPhysical() || SrcReg2.isPhysical())
return false;
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 80c38f3ec341..e3eb3f825851 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -13,6 +13,7 @@
#include "llvm/CodeGen/PreISelIntrinsicLowering.h"
#include "llvm/Analysis/ObjCARCInstKind.h"
+#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
@@ -36,9 +37,8 @@ static bool lowerLoadRelative(Function &F) {
Type *Int32PtrTy = Int32Ty->getPointerTo();
Type *Int8Ty = Type::getInt8Ty(F.getContext());
- for (auto I = F.use_begin(), E = F.use_end(); I != E;) {
- auto CI = dyn_cast<CallInst>(I->getUser());
- ++I;
+ for (Use &U : llvm::make_early_inc_range(F.uses())) {
+ auto CI = dyn_cast<CallInst>(U.getUser());
if (!CI || CI->getCalledOperand() != &F)
continue;
@@ -90,10 +90,22 @@ static bool lowerObjCCall(Function &F, const char *NewFn,
CallInst::TailCallKind OverridingTCK = getOverridingTailCallKind(F);
- for (auto I = F.use_begin(), E = F.use_end(); I != E;) {
- auto *CI = cast<CallInst>(I->getUser());
+ for (Use &U : llvm::make_early_inc_range(F.uses())) {
+ auto *CB = cast<CallBase>(U.getUser());
+
+ if (CB->getCalledFunction() != &F) {
+ objcarc::ARCInstKind Kind = objcarc::getAttachedARCFunctionKind(CB);
+ (void)Kind;
+ assert((Kind == objcarc::ARCInstKind::RetainRV ||
+ Kind == objcarc::ARCInstKind::ClaimRV) &&
+ "use expected to be the argument of operand bundle "
+ "\"clang.arc.attachedcall\"");
+ U.set(FCache.getCallee());
+ continue;
+ }
+
+ auto *CI = cast<CallInst>(CB);
assert(CI->getCalledFunction() && "Cannot lower an indirect call!");
- ++I;
IRBuilder<> Builder(CI->getParent(), CI->getIterator());
SmallVector<Value *, 8> Args(CI->args());
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index 2f65a450fb02..9a4f70a6070f 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -285,7 +285,7 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) {
(void)Failed;
}
if (StackSize > Threshold) {
- DiagnosticInfoStackSize DiagStackSize(F, StackSize, DS_Warning, Threshold);
+ DiagnosticInfoStackSize DiagStackSize(F, StackSize, Threshold, DS_Warning);
F.getContext().diagnose(DiagStackSize);
}
ORE->emit([&]() {
@@ -395,12 +395,28 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F,
const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo();
const MCPhysReg *CSRegs = F.getRegInfo().getCalleeSavedRegs();
+ BitVector CSMask(SavedRegs.size());
+
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ CSMask.set(CSRegs[i]);
std::vector<CalleeSavedInfo> CSI;
for (unsigned i = 0; CSRegs[i]; ++i) {
unsigned Reg = CSRegs[i];
- if (SavedRegs.test(Reg))
- CSI.push_back(CalleeSavedInfo(Reg));
+ if (SavedRegs.test(Reg)) {
+ bool SavedSuper = false;
+ for (const MCPhysReg &SuperReg : RegInfo->superregs(Reg)) {
+ // Some backends set all aliases for some registers as saved, such as
+ // Mips's $fp, so they appear in SavedRegs but not CSRegs.
+ if (SavedRegs.test(SuperReg) && CSMask.test(SuperReg)) {
+ SavedSuper = true;
+ break;
+ }
+ }
+
+ if (!SavedSuper)
+ CSI.push_back(CalleeSavedInfo(Reg));
+ }
}
const TargetFrameLowering *TFI = F.getSubtarget().getFrameLowering();
@@ -1237,7 +1253,6 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &MF,
StackOffset Offset =
TFI->getFrameIndexReference(MF, FrameIdx, Reg);
Op.ChangeToRegister(Reg, false /*isDef*/);
- Op.setIsDebug();
const DIExpression *DIExpr = MI.getDebugExpression();
diff --git a/llvm/lib/CodeGen/PseudoProbeInserter.cpp b/llvm/lib/CodeGen/PseudoProbeInserter.cpp
index a9fb577d5735..5f69f9194125 100644
--- a/llvm/lib/CodeGen/PseudoProbeInserter.cpp
+++ b/llvm/lib/CodeGen/PseudoProbeInserter.cpp
@@ -44,7 +44,14 @@ public:
MachineFunctionPass::getAnalysisUsage(AU);
}
+ bool doInitialization(Module &M) override {
+ ShouldRun = M.getNamedMetadata(PseudoProbeDescMetadataName);
+ return false;
+ }
+
bool runOnMachineFunction(MachineFunction &MF) override {
+ if (!ShouldRun)
+ return false;
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
@@ -129,6 +136,8 @@ private:
Name = SP->getName();
return Function::getGUID(Name);
}
+
+ bool ShouldRun = false;
};
} // namespace
diff --git a/llvm/lib/CodeGen/RDFLiveness.cpp b/llvm/lib/CodeGen/RDFLiveness.cpp
index d92c6a997f31..d704cf7b3213 100644
--- a/llvm/lib/CodeGen/RDFLiveness.cpp
+++ b/llvm/lib/CodeGen/RDFLiveness.cpp
@@ -171,7 +171,7 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
SmallSet<NodeId,32> Defs;
- // Remove all non-phi defs that are not aliased to RefRR, and segregate
+ // Remove all non-phi defs that are not aliased to RefRR, and separate
// the remaining defs into buckets for containing blocks.
std::map<NodeId, NodeAddr<InstrNode*>> Owners;
std::map<MachineBasicBlock*, SmallVector<NodeId,32>> Blocks;
diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
index c850571da2ed..1264e6021b6e 100644
--- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
+++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
@@ -30,16 +30,32 @@ static bool isValidRegUse(const MachineOperand &MO) {
return isValidReg(MO) && MO.isUse();
}
-static bool isValidRegUseOf(const MachineOperand &MO, MCRegister PhysReg) {
- return isValidRegUse(MO) && MO.getReg() == PhysReg;
+static bool isValidRegUseOf(const MachineOperand &MO, MCRegister PhysReg,
+ const TargetRegisterInfo *TRI) {
+ if (!isValidRegUse(MO))
+ return false;
+ if (MO.getReg() == PhysReg)
+ return true;
+ for (MCRegAliasIterator R(PhysReg, TRI, false); R.isValid(); ++R)
+ if (MO.getReg() == *R)
+ return true;
+ return false;
}
static bool isValidRegDef(const MachineOperand &MO) {
return isValidReg(MO) && MO.isDef();
}
-static bool isValidRegDefOf(const MachineOperand &MO, MCRegister PhysReg) {
- return isValidRegDef(MO) && MO.getReg() == PhysReg;
+static bool isValidRegDefOf(const MachineOperand &MO, MCRegister PhysReg,
+ const TargetRegisterInfo *TRI) {
+ if (!isValidRegDef(MO))
+ return false;
+ if (MO.getReg() == PhysReg)
+ return true;
+ for (MCRegAliasIterator R(PhysReg, TRI, false); R.isValid(); ++R)
+ if (MO.getReg() == *R)
+ return true;
+ return false;
}
void ReachingDefAnalysis::enterBasicBlock(MachineBasicBlock *MBB) {
@@ -337,7 +353,7 @@ void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def,
return;
for (auto &MO : MI->operands()) {
- if (!isValidRegUseOf(MO, PhysReg))
+ if (!isValidRegUseOf(MO, PhysReg, TRI))
continue;
Uses.insert(&*MI);
@@ -353,7 +369,7 @@ bool ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB,
for (MachineInstr &MI :
instructionsWithoutDebug(MBB->instr_begin(), MBB->instr_end())) {
for (auto &MO : MI.operands()) {
- if (!isValidRegUseOf(MO, PhysReg))
+ if (!isValidRegUseOf(MO, PhysReg, TRI))
continue;
if (getReachingDef(&MI, PhysReg) >= 0)
return false;
@@ -381,8 +397,7 @@ void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, MCRegister PhysReg,
SmallVector<MachineBasicBlock *, 4> ToVisit(MBB->successors());
SmallPtrSet<MachineBasicBlock*, 4>Visited;
while (!ToVisit.empty()) {
- MachineBasicBlock *MBB = ToVisit.back();
- ToVisit.pop_back();
+ MachineBasicBlock *MBB = ToVisit.pop_back_val();
if (Visited.count(MBB) || !MBB->isLiveIn(PhysReg))
continue;
if (getLiveInUses(MBB, PhysReg, Uses))
@@ -419,7 +434,7 @@ void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB,
VisitedBBs.insert(MBB);
LivePhysRegs LiveRegs(*TRI);
LiveRegs.addLiveOuts(*MBB);
- if (!LiveRegs.contains(PhysReg))
+ if (LiveRegs.available(MBB->getParent()->getRegInfo(), PhysReg))
return;
if (auto *Def = getLocalLiveOutMIDef(MBB, PhysReg))
@@ -469,7 +484,7 @@ bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI,
LiveRegs.addLiveOuts(*MBB);
// Yes if the register is live out of the basic block.
- if (LiveRegs.contains(PhysReg))
+ if (!LiveRegs.available(MBB->getParent()->getRegInfo(), PhysReg))
return true;
// Walk backwards through the block to see if the register is live at some
@@ -477,7 +492,7 @@ bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI,
for (MachineInstr &Last :
instructionsWithoutDebug(MBB->instr_rbegin(), MBB->instr_rend())) {
LiveRegs.stepBackward(Last);
- if (LiveRegs.contains(PhysReg))
+ if (!LiveRegs.available(MBB->getParent()->getRegInfo(), PhysReg))
return InstIds.lookup(&Last) > InstIds.lookup(MI);
}
return false;
@@ -502,7 +517,7 @@ bool ReachingDefAnalysis::isReachingDefLiveOut(MachineInstr *MI,
MachineBasicBlock *MBB = MI->getParent();
LivePhysRegs LiveRegs(*TRI);
LiveRegs.addLiveOuts(*MBB);
- if (!LiveRegs.contains(PhysReg))
+ if (LiveRegs.available(MBB->getParent()->getRegInfo(), PhysReg))
return false;
auto Last = MBB->getLastNonDebugInstr();
@@ -512,7 +527,7 @@ bool ReachingDefAnalysis::isReachingDefLiveOut(MachineInstr *MI,
// Finally check that the last instruction doesn't redefine the register.
for (auto &MO : Last->operands())
- if (isValidRegDefOf(MO, PhysReg))
+ if (isValidRegDefOf(MO, PhysReg, TRI))
return false;
return true;
@@ -523,7 +538,7 @@ ReachingDefAnalysis::getLocalLiveOutMIDef(MachineBasicBlock *MBB,
MCRegister PhysReg) const {
LivePhysRegs LiveRegs(*TRI);
LiveRegs.addLiveOuts(*MBB);
- if (!LiveRegs.contains(PhysReg))
+ if (LiveRegs.available(MBB->getParent()->getRegInfo(), PhysReg))
return nullptr;
auto Last = MBB->getLastNonDebugInstr();
@@ -532,7 +547,7 @@ ReachingDefAnalysis::getLocalLiveOutMIDef(MachineBasicBlock *MBB,
int Def = getReachingDef(&*Last, PhysReg);
for (auto &MO : Last->operands())
- if (isValidRegDefOf(MO, PhysReg))
+ if (isValidRegDefOf(MO, PhysReg, TRI))
return &*Last;
return Def < 0 ? nullptr : getInstFromId(MBB, Def);
@@ -700,7 +715,7 @@ bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI, MCRegister PhysReg,
if (Ignore.count(&*I))
continue;
for (auto &MO : I->operands())
- if (isValidRegDefOf(MO, PhysReg))
+ if (isValidRegDefOf(MO, PhysReg, TRI))
return false;
}
}
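isValidRegUseOf and isValidRegDefOf now also match operands that merely alias the queried physical register. A sketch of that matching with a hypothetical alias table in place of MCRegAliasIterator:

```cpp
// Minimal sketch (hypothetical alias table instead of MCRegAliasIterator): an
// operand matches the queried physreg if it names the register itself or any
// of its aliases.
#include <cstdint>
#include <map>
#include <vector>

using MCRegister = uint16_t;

static const std::map<MCRegister, std::vector<MCRegister>> Aliases = {
    {1, {2, 3}}, // e.g. a full register aliasing its sub-registers
};

static bool matchesRegOrAlias(MCRegister OpReg, MCRegister PhysReg) {
  if (OpReg == PhysReg)
    return true;
  auto It = Aliases.find(PhysReg);
  if (It == Aliases.end())
    return false;
  for (MCRegister A : It->second)
    if (OpReg == A)
      return true;
  return false;
}
```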
diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp
index b65d58077958..a9816b13e798 100644
--- a/llvm/lib/CodeGen/RegAllocBasic.cpp
+++ b/llvm/lib/CodeGen/RegAllocBasic.cpp
@@ -217,9 +217,7 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, MCRegister PhysReg,
// Collect interferences assigned to any alias of the physical register.
for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
- Q.collectInterferingVRegs();
- for (unsigned i = Q.interferingVRegs().size(); i; --i) {
- LiveInterval *Intf = Q.interferingVRegs()[i - 1];
+ for (auto *Intf : reverse(Q.interferingVRegs())) {
if (!Intf->isSpillable() || Intf->weight() > VirtReg.weight())
return false;
Intfs.push_back(Intf);
diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h
new file mode 100644
index 000000000000..85fd3207888b
--- /dev/null
+++ b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h
@@ -0,0 +1,90 @@
+//===- RegAllocEvictionAdvisor.h - Interference resolution ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_REGALLOCEVICTIONADVISOR_H
+#define LLVM_CODEGEN_REGALLOCEVICTIONADVISOR_H
+
+#include "AllocationOrder.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+using SmallVirtRegSet = SmallSet<Register, 16>;
+
+// Live ranges pass through a number of stages as we try to allocate them.
+// Some of the stages may also create new live ranges:
+//
+// - Region splitting.
+// - Per-block splitting.
+// - Local splitting.
+// - Spilling.
+//
+// Ranges produced by one of the stages skip the previous stages when they are
+// dequeued. This improves performance because we can skip interference checks
+// that are unlikely to give any results. It also guarantees that the live
+// range splitting algorithm terminates, something that is otherwise hard to
+// ensure.
+enum LiveRangeStage {
+ /// Newly created live range that has never been queued.
+ RS_New,
+
+ /// Only attempt assignment and eviction. Then requeue as RS_Split.
+ RS_Assign,
+
+ /// Attempt live range splitting if assignment is impossible.
+ RS_Split,
+
+ /// Attempt more aggressive live range splitting that is guaranteed to make
+ /// progress. This is used for split products that may not be making
+ /// progress.
+ RS_Split2,
+
+ /// Live range will be spilled. No more splitting will be attempted.
+ RS_Spill,
+
+ /// Live range is in memory. Because of other evictions, it might get moved
+ /// in a register in the end.
+ RS_Memory,
+
+ /// There is nothing more we can do to this live range. Abort compilation
+ /// if it can't be assigned.
+ RS_Done
+};
+
+/// Cost of evicting interference - used by default advisor, and the eviction
+/// chain heuristic in RegAllocGreedy.
+// FIXME: this can be probably made an implementation detail of the default
+// advisor, if the eviction chain logic can be refactored.
+struct EvictionCost {
+ unsigned BrokenHints = 0; ///< Total number of broken hints.
+ float MaxWeight = 0; ///< Maximum spill weight evicted.
+
+ EvictionCost() = default;
+
+ bool isMax() const { return BrokenHints == ~0u; }
+
+ void setMax() { BrokenHints = ~0u; }
+
+ void setBrokenHints(unsigned NHints) { BrokenHints = NHints; }
+
+ bool operator<(const EvictionCost &O) const {
+ return std::tie(BrokenHints, MaxWeight) <
+ std::tie(O.BrokenHints, O.MaxWeight);
+ }
+};
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_REGALLOCEVICTIONADVISOR_H
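EvictionCost, now shared through the new header, orders candidates lexicographically: fewer broken hints always wins, and MaxWeight only breaks ties, which std::tie expresses directly. A standalone usage example of that comparison:

```cpp
// Standalone illustration of the EvictionCost ordering from the new header:
// std::tie gives a lexicographic comparison, so fewer broken hints always wins
// and MaxWeight only breaks ties.
#include <cassert>
#include <tuple>

struct EvictionCost {
  unsigned BrokenHints = 0;
  float MaxWeight = 0;
  bool operator<(const EvictionCost &O) const {
    return std::tie(BrokenHints, MaxWeight) <
           std::tie(O.BrokenHints, O.MaxWeight);
  }
};

int main() {
  EvictionCost A{/*BrokenHints=*/0, /*MaxWeight=*/5.0f};
  EvictionCost B{/*BrokenHints=*/1, /*MaxWeight=*/0.5f};
  assert(A < B); // breaking no hints beats a lighter eviction that breaks one
  return 0;
}
```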
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 707161d5a8b0..68920e2e50df 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseSet.h"
@@ -432,7 +433,7 @@ void RegAllocFast::spill(MachineBasicBlock::iterator Before, Register VirtReg,
// every definition of it, meaning we can switch all the DBG_VALUEs over
// to just reference the stack slot.
SmallVectorImpl<MachineOperand *> &LRIDbgOperands = LiveDbgValueMap[VirtReg];
- SmallDenseMap<MachineInstr *, SmallVector<const MachineOperand *>>
+ SmallMapVector<MachineInstr *, SmallVector<const MachineOperand *>, 2>
SpilledOperandsMap;
for (MachineOperand *MO : LRIDbgOperands)
SpilledOperandsMap[MO->getParent()].push_back(MO);
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 4eb12aa30ee9..5a93b58e0baf 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -15,6 +15,7 @@
#include "InterferenceCache.h"
#include "LiveDebugVariables.h"
#include "RegAllocBase.h"
+#include "RegAllocEvictionAdvisor.h"
#include "SpillPlacement.h"
#include "SplitKit.h"
#include "llvm/ADT/ArrayRef.h"
@@ -57,6 +58,7 @@
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -69,7 +71,6 @@
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/IR/DebugInfoMetadata.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -148,7 +149,6 @@ class RAGreedy : public MachineFunctionPass,
// Convenient shortcuts.
using PQueue = std::priority_queue<std::pair<unsigned, unsigned>>;
using SmallLISet = SmallPtrSet<LiveInterval *, 4>;
- using SmallVirtRegSet = SmallSet<Register, 16>;
// context
MachineFunction *MF;
@@ -175,47 +175,6 @@ class RAGreedy : public MachineFunctionPass,
unsigned NextCascade;
std::unique_ptr<VirtRegAuxInfo> VRAI;
- // Live ranges pass through a number of stages as we try to allocate them.
- // Some of the stages may also create new live ranges:
- //
- // - Region splitting.
- // - Per-block splitting.
- // - Local splitting.
- // - Spilling.
- //
- // Ranges produced by one of the stages skip the previous stages when they are
- // dequeued. This improves performance because we can skip interference checks
- // that are unlikely to give any results. It also guarantees that the live
- // range splitting algorithm terminates, something that is otherwise hard to
- // ensure.
- enum LiveRangeStage {
- /// Newly created live range that has never been queued.
- RS_New,
-
- /// Only attempt assignment and eviction. Then requeue as RS_Split.
- RS_Assign,
-
- /// Attempt live range splitting if assignment is impossible.
- RS_Split,
-
- /// Attempt more aggressive live range splitting that is guaranteed to make
- /// progress. This is used for split products that may not be making
- /// progress.
- RS_Split2,
-
- /// Live range will be spilled. No more splitting will be attempted.
- RS_Spill,
-
-
- /// Live range is in memory. Because of other evictions, it might get moved
- /// in a register in the end.
- RS_Memory,
-
- /// There is nothing more we can do to this live range. Abort compilation
- /// if it can't be assigned.
- RS_Done
- };
-
// Enum CutOffStage to keep track of whether the register allocation failed
// because of the cutoffs encountered in last chance recoloring.
// Note: This is used as bitmask. New value should be next power of 2.
@@ -267,25 +226,6 @@ class RAGreedy : public MachineFunctionPass,
}
}
- /// Cost of evicting interference.
- struct EvictionCost {
- unsigned BrokenHints = 0; ///< Total number of broken hints.
- float MaxWeight = 0; ///< Maximum spill weight evicted.
-
- EvictionCost() = default;
-
- bool isMax() const { return BrokenHints == ~0u; }
-
- void setMax() { BrokenHints = ~0u; }
-
- void setBrokenHints(unsigned NHints) { BrokenHints = NHints; }
-
- bool operator<(const EvictionCost &O) const {
- return std::tie(BrokenHints, MaxWeight) <
- std::tie(O.BrokenHints, O.MaxWeight);
- }
- };
-
/// EvictionTrack - Keeps track of past evictions in order to optimize region
/// split decision.
class EvictionTrack {
@@ -488,6 +428,8 @@ private:
MCRegister tryAssign(LiveInterval&, AllocationOrder&,
SmallVectorImpl<Register>&,
const SmallVirtRegSet&);
+ MCRegister tryFindEvictionCandidate(LiveInterval &, const AllocationOrder &,
+ uint8_t, const SmallVirtRegSet &) const;
MCRegister tryEvict(LiveInterval &, AllocationOrder &,
SmallVectorImpl<Register> &, uint8_t,
const SmallVirtRegSet &);
@@ -760,10 +702,9 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) {
// Giant live ranges fall back to the global assignment heuristic, which
// prevents excessive spilling in pathological cases.
bool ReverseLocal = TRI->reverseLocalAssignment();
- bool AddPriorityToGlobal = TRI->addAllocPriorityToGlobalRanges();
const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
bool ForceGlobal = !ReverseLocal &&
- (Size / SlotIndex::InstrDist) > (2 * RC.getNumRegs());
+ (Size / SlotIndex::InstrDist) > (2 * RCI.getNumAllocatableRegs(&RC));
if (ExtraRegInfo[Reg].Stage == RS_Assign && !ForceGlobal && !LI->empty() &&
LIS->intervalIsInOneMBB(*LI)) {
@@ -785,8 +726,7 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) {
// interference. Mark a bit to prioritize global above local ranges.
Prio = (1u << 29) + Size;
- if (AddPriorityToGlobal)
- Prio |= RC.AllocationPriority << 24;
+ Prio |= RC.AllocationPriority << 24;
}
// Mark a higher bit to prioritize global and local above RS_Split.
Prio |= (1u << 31);
@@ -860,7 +800,7 @@ MCRegister RAGreedy::tryAssign(LiveInterval &VirtReg,
return PhysReg;
LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << " is available at cost "
- << Cost << '\n');
+ << (unsigned)Cost << '\n');
MCRegister CheapReg = tryEvict(VirtReg, Order, NewVRegs, Cost, FixedRegisters);
return CheapReg ? CheapReg : PhysReg;
}
@@ -957,11 +897,12 @@ bool RAGreedy::canEvictInterference(
for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
// If there are 10 or more interferences, chances are one is heavier.
- if (Q.collectInterferingVRegs(10) >= 10)
+ const auto &Interferences = Q.interferingVRegs(10);
+ if (Interferences.size() >= 10)
return false;
// Check if any interfering live range is heavier than MaxWeight.
- for (LiveInterval *Intf : reverse(Q.interferingVRegs())) {
+ for (LiveInterval *Intf : reverse(Interferences)) {
assert(Register::isVirtualRegister(Intf->reg()) &&
"Only expecting virtual register interference from query");
@@ -1039,7 +980,6 @@ bool RAGreedy::canEvictInterferenceInRange(const LiveInterval &VirtReg,
for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
- Q.collectInterferingVRegs();
// Check if any interfering live range is heavier than MaxWeight.
for (const LiveInterval *Intf : reverse(Q.interferingVRegs())) {
@@ -1129,7 +1069,6 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, MCRegister PhysReg,
// should be fast, we may need to recalculate it when different physregs
// overlap the same register unit, so we would have different SubRanges queried
// against it.
- Q.collectInterferingVRegs();
ArrayRef<LiveInterval*> IVR = Q.interferingVRegs();
Intfs.append(IVR.begin(), IVR.end());
}
@@ -1162,17 +1101,9 @@ bool RAGreedy::isUnusedCalleeSavedReg(MCRegister PhysReg) const {
return !Matrix->isPhysRegUsed(PhysReg);
}
-/// tryEvict - Try to evict all interferences for a physreg.
-/// @param VirtReg Currently unassigned virtual register.
-/// @param Order Physregs to try.
-/// @return Physreg to assign VirtReg, or 0.
-MCRegister RAGreedy::tryEvict(LiveInterval &VirtReg, AllocationOrder &Order,
- SmallVectorImpl<Register> &NewVRegs,
- uint8_t CostPerUseLimit,
- const SmallVirtRegSet &FixedRegisters) {
- NamedRegionTimer T("evict", "Evict", TimerGroupName, TimerGroupDescription,
- TimePassesIsEnabled);
-
+MCRegister RAGreedy::tryFindEvictionCandidate(
+ LiveInterval &VirtReg, const AllocationOrder &Order,
+ uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const {
// Keep track of the cheapest interference seen so far.
EvictionCost BestCost;
BestCost.setMax();
@@ -1230,7 +1161,22 @@ MCRegister RAGreedy::tryEvict(LiveInterval &VirtReg, AllocationOrder &Order,
if (I.isHint())
break;
}
+ return BestPhys;
+}
+/// tryEvict - Try to evict all interferences for a physreg.
+/// @param VirtReg Currently unassigned virtual register.
+/// @param Order Physregs to try.
+/// @return Physreg to assign VirtReg, or 0.
+MCRegister RAGreedy::tryEvict(LiveInterval &VirtReg, AllocationOrder &Order,
+ SmallVectorImpl<Register> &NewVRegs,
+ uint8_t CostPerUseLimit,
+ const SmallVirtRegSet &FixedRegisters) {
+ NamedRegionTimer T("evict", "Evict", TimerGroupName, TimerGroupDescription,
+ TimePassesIsEnabled);
+
+ MCRegister BestPhys =
+ tryFindEvictionCandidate(VirtReg, Order, CostPerUseLimit, FixedRegisters);
if (BestPhys.isValid())
evictInterference(VirtReg, BestPhys, NewVRegs);
return BestPhys;
@@ -2135,7 +2081,7 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
// the constraints on the virtual register.
// Otherwise, splitting just inserts uncoalescable copies that do not help
// the allocation.
- for (const auto &Use : Uses) {
+ for (const SlotIndex Use : Uses) {
if (const MachineInstr *MI = Indexes->getInstructionFromIndex(Use))
if (MI->isFullCopy() ||
SuperRCNumAllocatableRegs ==
@@ -2462,12 +2408,12 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
bool LiveAfter = BestAfter != NumGaps || BI.LiveOut;
unsigned NewGaps = LiveBefore + BestAfter - BestBefore + LiveAfter;
if (NewGaps >= NumGaps) {
- LLVM_DEBUG(dbgs() << "Tagging non-progress ranges: ");
+ LLVM_DEBUG(dbgs() << "Tagging non-progress ranges:");
assert(!ProgressRequired && "Didn't make progress when it was required.");
for (unsigned I = 0, E = IntvMap.size(); I != E; ++I)
if (IntvMap[I] == 1) {
setStage(LIS->getInterval(LREdit.get(I)), RS_Split2);
- LLVM_DEBUG(dbgs() << printReg(LREdit.get(I)));
+ LLVM_DEBUG(dbgs() << ' ' << printReg(LREdit.get(I)));
}
LLVM_DEBUG(dbgs() << '\n');
}
@@ -2506,17 +2452,6 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
SA->analyze(&VirtReg);
- // FIXME: SplitAnalysis may repair broken live ranges coming from the
- // coalescer. That may cause the range to become allocatable which means that
- // tryRegionSplit won't be making progress. This check should be replaced with
- // an assertion when the coalescer is fixed.
- if (SA->didRepairRange()) {
- // VirtReg has changed, so all cached queries are invalid.
- Matrix->invalidateVirtRegs();
- if (Register PhysReg = tryAssign(VirtReg, Order, NewVRegs, FixedRegisters))
- return PhysReg;
- }
-
// First try to split around a region spanning multiple blocks. RS_Split2
// ranges already made dubious progress with region splitting, so they go
// straight to single block splitting.
@@ -2560,8 +2495,9 @@ bool RAGreedy::mayRecolorAllInterferences(
LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
// If there is LastChanceRecoloringMaxInterference or more interferences,
// chances are one would not be recolorable.
- if (Q.collectInterferingVRegs(LastChanceRecoloringMaxInterference) >=
- LastChanceRecoloringMaxInterference && !ExhaustiveSearch) {
+ if (Q.interferingVRegs(LastChanceRecoloringMaxInterference).size() >=
+ LastChanceRecoloringMaxInterference &&
+ !ExhaustiveSearch) {
LLVM_DEBUG(dbgs() << "Early abort: too many interferences.\n");
CutOffInfo |= CO_Interf;
return false;
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 751f79e66b73..c847068bca90 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -932,12 +932,8 @@ RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
// = B
// Update uses of IntA of the specific Val# with IntB.
- for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(IntA.reg()),
- UE = MRI->use_end();
- UI != UE;
- /* ++UI is below because of possible MI removal */) {
- MachineOperand &UseMO = *UI;
- ++UI;
+ for (MachineOperand &UseMO :
+ llvm::make_early_inc_range(MRI->use_operands(IntA.reg()))) {
if (UseMO.isUndef())
continue;
MachineInstr *UseMI = UseMO.getParent();
@@ -1573,9 +1569,8 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
// If the virtual SrcReg is completely eliminated, update all DBG_VALUEs
// to describe DstReg instead.
if (MRI->use_nodbg_empty(SrcReg)) {
- for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(SrcReg);
- UI != MRI->use_end();) {
- MachineOperand &UseMO = *UI++;
+ for (MachineOperand &UseMO :
+ llvm::make_early_inc_range(MRI->use_operands(SrcReg))) {
MachineInstr *UseMI = UseMO.getParent();
if (UseMI->isDebugInstr()) {
if (Register::isPhysicalRegister(DstReg))
@@ -3708,7 +3703,7 @@ void RegisterCoalescer::buildVRegToDbgValueMap(MachineFunction &MF)
// vreg => DbgValueLoc map.
auto CloseNewDVRange = [this, &ToInsert](SlotIndex Slot) {
for (auto *X : ToInsert) {
- for (auto Op : X->debug_operands()) {
+ for (const auto &Op : X->debug_operands()) {
if (Op.isReg() && Op.getReg().isVirtual())
DbgVRegToValues[Op.getReg()].push_back({Slot, X});
}
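Both coalescer loops above are rewritten with llvm::make_early_inc_range, which advances the underlying iterator before handing out the current element, so the loop body may erase or reparent the very thing it is visiting. A standalone sketch of the idiom, assuming LLVM's STLExtras header is available (the std::map is just a stand-in for a use-list):

#include "llvm/ADT/STLExtras.h"
#include <cassert>
#include <map>

int main() {
  std::map<int, int> M = {{1, 1}, {2, 2}, {3, 3}, {4, 4}};
  for (auto &KV : llvm::make_early_inc_range(M))
    if (KV.first % 2 == 0)
      M.erase(KV.first); // safe: the range already moved past this node
  assert(M.size() == 2 && M.count(1) && M.count(3));
  return 0;
}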
diff --git a/llvm/lib/CodeGen/RegisterScavenging.cpp b/llvm/lib/CodeGen/RegisterScavenging.cpp
index e35cf7aa6958..c0a07ec4c91d 100644
--- a/llvm/lib/CodeGen/RegisterScavenging.cpp
+++ b/llvm/lib/CodeGen/RegisterScavenging.cpp
@@ -495,21 +495,20 @@ RegScavenger::spill(Register Reg, const TargetRegisterClass &RC, int SPAdj,
// Spill the scavenged register before \p Before.
int FI = Scavenged[SI].FrameIndex;
if (FI < FIB || FI >= FIE) {
- std::string Msg = std::string("Error while trying to spill ") +
- TRI->getName(Reg) + " from class " + TRI->getRegClassName(&RC) +
- ": Cannot scavenge register without an emergency spill slot!";
- report_fatal_error(Msg.c_str());
+ report_fatal_error(Twine("Error while trying to spill ") +
+ TRI->getName(Reg) + " from class " +
+ TRI->getRegClassName(&RC) +
+ ": Cannot scavenge register without an emergency "
+ "spill slot!");
}
- TII->storeRegToStackSlot(*MBB, Before, Reg, true, Scavenged[SI].FrameIndex,
- &RC, TRI);
+ TII->storeRegToStackSlot(*MBB, Before, Reg, true, FI, &RC, TRI);
MachineBasicBlock::iterator II = std::prev(Before);
unsigned FIOperandNum = getFrameIndexOperandNum(*II);
TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this);
// Restore the scavenged register before its use (or first terminator).
- TII->loadRegFromStackSlot(*MBB, UseMI, Reg, Scavenged[SI].FrameIndex,
- &RC, TRI);
+ TII->loadRegFromStackSlot(*MBB, UseMI, Reg, FI, &RC, TRI);
II = std::prev(UseMI);
FIOperandNum = getFrameIndexOperandNum(*II);
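The scavenger's error path now builds its message as a Twine instead of concatenating std::strings, so the pieces are chained lazily and only materialized when the consumer needs them. A sketch of the same concatenation, assuming the LLVM ADT/Support headers; the register and class names here are made up:

#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
#include <cstdio>

int main() {
  const char *RegName = "x11";   // hypothetical register name
  const char *ClassName = "GPR"; // hypothetical register class
  llvm::SmallString<128> Buf;
  // A Twine must be consumed in the same expression that builds it.
  llvm::StringRef Msg =
      (llvm::Twine("Error while trying to spill ") + RegName +
       " from class " + ClassName +
       ": Cannot scavenge register without an emergency spill slot!")
          .toStringRef(Buf);
  std::printf("%s\n", Msg.str().c_str());
  return 0;
}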
diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
index 1619381967c4..0ff045fa787e 100644
--- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
+++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
@@ -70,7 +70,7 @@ static bool replaceWithTLIFunction(CallInst &CI, const StringRef TLIName) {
// Replace the call to the vector intrinsic with a call
// to the corresponding function from the vector library.
IRBuilder<> IRBuilder(&CI);
- SmallVector<Value *> Args(CI.arg_operands());
+ SmallVector<Value *> Args(CI.args());
// Preserve the operand bundles.
SmallVector<OperandBundleDef, 1> OpBundles;
CI.getOperandBundlesAsDefs(OpBundles);
@@ -106,7 +106,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
// all vector operands have identical vector width.
ElementCount VF = ElementCount::getFixed(0);
SmallVector<Type *> ScalarTypes;
- for (auto Arg : enumerate(CI.arg_operands())) {
+ for (auto Arg : enumerate(CI.args())) {
auto *ArgType = Arg.value()->getType();
// Vector calls to intrinsics can still have
// scalar operands for specific arguments.
diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp
index 94add920f284..50d9d64bfcfd 100644
--- a/llvm/lib/CodeGen/SafeStack.cpp
+++ b/llvm/lib/CodeGen/SafeStack.cpp
@@ -147,7 +147,7 @@ class SafeStack {
///
/// 16 seems like a reasonable upper bound on the alignment of objects that we
/// might expect to appear on the stack on most common targets.
- enum { StackAlignment = 16 };
+ static constexpr uint64_t StackAlignment = 16;
/// Return the value of the stack canary.
Value *getStackGuard(IRBuilder<> &IRB, Function &F);
@@ -221,6 +221,8 @@ public:
bool run();
};
+constexpr uint64_t SafeStack::StackAlignment;
+
uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) {
uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType());
if (AI->isArrayAllocation()) {
@@ -519,7 +521,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
StackLayout SSL(StackAlignment);
if (StackGuardSlot) {
Type *Ty = StackGuardSlot->getAllocatedType();
- unsigned Align =
+ uint64_t Align =
std::max(DL.getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment());
SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot),
Align, SSC.getFullLiveRange());
@@ -532,8 +534,8 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
Size = 1; // Don't create zero-sized stack objects.
// Ensure the object is properly aligned.
- unsigned Align = std::max((unsigned)DL.getPrefTypeAlignment(Ty),
- Arg->getParamAlignment());
+ uint64_t Align =
+ std::max(DL.getPrefTypeAlignment(Ty), Arg->getParamAlignment());
SSL.addObject(Arg, Size, Align, SSC.getFullLiveRange());
}
@@ -544,21 +546,20 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
Size = 1; // Don't create zero-sized stack objects.
// Ensure the object is properly aligned.
- unsigned Align =
- std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment());
+ uint64_t Align = std::max(DL.getPrefTypeAlignment(Ty), AI->getAlignment());
SSL.addObject(AI, Size, Align,
ClColoring ? SSC.getLiveRange(AI) : NoColoringRange);
}
SSL.computeLayout();
- unsigned FrameAlignment = SSL.getFrameAlignment();
+ uint64_t FrameAlignment = SSL.getFrameAlignment();
// FIXME: tell SSL that we start at a less-then-MaxAlignment aligned location
// (AlignmentSkew).
if (FrameAlignment > StackAlignment) {
// Re-align the base pointer according to the max requested alignment.
- assert(isPowerOf2_32(FrameAlignment));
+ assert(isPowerOf2_64(FrameAlignment));
IRB.SetInsertPoint(BasePointer->getNextNode());
BasePointer = cast<Instruction>(IRB.CreateIntToPtr(
IRB.CreateAnd(IRB.CreatePtrToInt(BasePointer, IntPtrTy),
@@ -676,9 +677,9 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
SP = IRB.CreateSub(SP, Size);
// Align the SP value to satisfy the AllocaInst, type and stack alignments.
- unsigned Align = std::max(
- std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment()),
- (unsigned)StackAlignment);
+ uint64_t Align =
+ std::max(std::max(DL.getPrefTypeAlignment(Ty), AI->getAlignment()),
+ StackAlignment);
assert(isPowerOf2_32(Align));
Value *NewTop = IRB.CreateIntToPtr(
@@ -701,9 +702,8 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
if (!DynamicAllocas.empty()) {
// Now go through the instructions again, replacing stacksave/stackrestore.
- for (inst_iterator It = inst_begin(&F), Ie = inst_end(&F); It != Ie;) {
- Instruction *I = &*(It++);
- auto II = dyn_cast<IntrinsicInst>(I);
+ for (Instruction &I : llvm::make_early_inc_range(instructions(&F))) {
+ auto *II = dyn_cast<IntrinsicInst>(&I);
if (!II)
continue;
diff --git a/llvm/lib/CodeGen/SafeStackLayout.cpp b/llvm/lib/CodeGen/SafeStackLayout.cpp
index 5d61b3a146b4..7cdda7743c16 100644
--- a/llvm/lib/CodeGen/SafeStackLayout.cpp
+++ b/llvm/lib/CodeGen/SafeStackLayout.cpp
@@ -37,7 +37,7 @@ LLVM_DUMP_METHOD void StackLayout::print(raw_ostream &OS) {
}
}
-void StackLayout::addObject(const Value *V, unsigned Size, unsigned Alignment,
+void StackLayout::addObject(const Value *V, unsigned Size, uint64_t Alignment,
const StackLifetime::LiveRange &Range) {
StackObjects.push_back({V, Size, Alignment, Range});
ObjectAlignments[V] = Alignment;
@@ -45,7 +45,7 @@ void StackLayout::addObject(const Value *V, unsigned Size, unsigned Alignment,
}
static unsigned AdjustStackOffset(unsigned Offset, unsigned Size,
- unsigned Alignment) {
+ uint64_t Alignment) {
return alignTo(Offset + Size, Alignment) - Size;
}
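AdjustStackOffset rounds the end of each object up to its alignment and derives the start from that. A numeric sketch of the computation, assuming llvm/Support/MathExtras.h for alignTo:

#include "llvm/Support/MathExtras.h"
#include <cassert>

static unsigned AdjustStackOffset(unsigned Offset, unsigned Size,
                                  uint64_t Alignment) {
  return llvm::alignTo(Offset + Size, Alignment) - Size;
}

int main() {
  // 4 bytes already laid out, next object is 8 bytes and 8-byte aligned:
  // its end is rounded from 12 up to 16, so it occupies [8, 16).
  assert(AdjustStackOffset(4, 8, 8) == 8);
  // Already aligned: nothing moves.
  assert(AdjustStackOffset(0, 16, 16) == 0);
  return 0;
}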
diff --git a/llvm/lib/CodeGen/SafeStackLayout.h b/llvm/lib/CodeGen/SafeStackLayout.h
index f0db1b42aa00..b72450e57080 100644
--- a/llvm/lib/CodeGen/SafeStackLayout.h
+++ b/llvm/lib/CodeGen/SafeStackLayout.h
@@ -22,7 +22,7 @@ namespace safestack {
/// Compute the layout of an unsafe stack frame.
class StackLayout {
- unsigned MaxAlignment;
+ uint64_t MaxAlignment;
struct StackRegion {
unsigned Start;
@@ -39,23 +39,24 @@ class StackLayout {
struct StackObject {
const Value *Handle;
- unsigned Size, Alignment;
+ unsigned Size;
+ uint64_t Alignment;
StackLifetime::LiveRange Range;
};
SmallVector<StackObject, 8> StackObjects;
DenseMap<const Value *, unsigned> ObjectOffsets;
- DenseMap<const Value *, unsigned> ObjectAlignments;
+ DenseMap<const Value *, uint64_t> ObjectAlignments;
void layoutObject(StackObject &Obj);
public:
- StackLayout(unsigned StackAlignment) : MaxAlignment(StackAlignment) {}
+ StackLayout(uint64_t StackAlignment) : MaxAlignment(StackAlignment) {}
/// Add an object to the stack frame. Value pointer is opaque and used as a
/// handle to retrieve the object's offset in the frame later.
- void addObject(const Value *V, unsigned Size, unsigned Alignment,
+ void addObject(const Value *V, unsigned Size, uint64_t Alignment,
const StackLifetime::LiveRange &Range);
/// Run the layout computation for all previously added objects.
@@ -65,13 +66,13 @@ public:
unsigned getObjectOffset(const Value *V) { return ObjectOffsets[V]; }
/// Returns the alignment of the object
- unsigned getObjectAlignment(const Value *V) { return ObjectAlignments[V]; }
+ uint64_t getObjectAlignment(const Value *V) { return ObjectAlignments[V]; }
/// Returns the size of the entire frame.
unsigned getFrameSize() { return Regions.empty() ? 0 : Regions.back().End; }
/// Returns the alignment of the frame.
- unsigned getFrameAlignment() { return MaxAlignment; }
+ uint64_t getFrameAlignment() { return MaxAlignment; }
void print(raw_ostream &OS);
};
diff --git a/llvm/lib/CodeGen/ScheduleDAG.cpp b/llvm/lib/CodeGen/ScheduleDAG.cpp
index 60f8eec1b9bc..ef3afab2b730 100644
--- a/llvm/lib/CodeGen/ScheduleDAG.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAG.cpp
@@ -577,8 +577,7 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
SU = WorkList.back();
WorkList.pop_back();
Visited.set(SU->NodeNum);
- for (const SDep &SuccDep
- : make_range(SU->Succs.rbegin(), SU->Succs.rend())) {
+ for (const SDep &SuccDep : llvm::reverse(SU->Succs)) {
unsigned s = SuccDep.getSUnit()->NodeNum;
// Edges to non-SUnits are allowed but ignored (e.g. ExitSU).
if (s >= Node2Index.size())
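llvm::reverse is the shorthand this hunk switches to for make_range(X.rbegin(), X.rend()). A small standalone sketch, assuming the LLVM ADT headers:

#include "llvm/ADT/STLExtras.h"
#include <cassert>
#include <vector>

int main() {
  std::vector<int> Succs = {1, 2, 3};
  std::vector<int> Seen;
  for (int S : llvm::reverse(Succs)) // same traversal as rbegin()/rend()
    Seen.push_back(S);
  assert((Seen == std::vector<int>{3, 2, 1}));
  return 0;
}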
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index daff3af3bc3c..3f013eb6024e 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -271,15 +271,10 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
if (!ImplicitPseudoDef && !ImplicitPseudoUse) {
Dep.setLatency(SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
RegUse, UseOp));
- ST.adjustSchedDependency(SU, OperIdx, UseSU, UseOp, Dep);
} else {
Dep.setLatency(0);
- // FIXME: We could always let target to adjustSchedDependency(), and
- // remove this condition, but that currently asserts in Hexagon BE.
- if (SU->getInstr()->isBundle() || (RegUse && RegUse->isBundle()))
- ST.adjustSchedDependency(SU, OperIdx, UseSU, UseOp, Dep);
}
-
+ ST.adjustSchedDependency(SU, OperIdx, UseSU, UseOp, Dep);
UseSU->addPred(Dep);
}
}
@@ -1117,7 +1112,7 @@ void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) {
LiveRegs.addLiveOuts(MBB);
// Examine block from end to start...
- for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
+ for (MachineInstr &MI : llvm::reverse(MBB)) {
if (MI.isDebugOrPseudoInstr())
continue;
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b104e995019f..ce400ea43f29 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -129,12 +129,12 @@ static cl::opt<unsigned> StoreMergeDependenceLimit(
static cl::opt<bool> EnableReduceLoadOpStoreWidth(
"combiner-reduce-load-op-store-width", cl::Hidden, cl::init(true),
- cl::desc("DAG cominber enable reducing the width of load/op/store "
+ cl::desc("DAG combiner enable reducing the width of load/op/store "
"sequence"));
static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore(
"combiner-shrink-load-replace-store-with-store", cl::Hidden, cl::init(true),
- cl::desc("DAG cominber enable load/<replace bytes>/store with "
+ cl::desc("DAG combiner enable load/<replace bytes>/store with "
"a narrower store"));
namespace {
@@ -319,7 +319,7 @@ namespace {
/// If so, return true.
bool SimplifyDemandedBits(SDValue Op) {
unsigned BitWidth = Op.getScalarValueSizeInBits();
- APInt DemandedBits = APInt::getAllOnesValue(BitWidth);
+ APInt DemandedBits = APInt::getAllOnes(BitWidth);
return SimplifyDemandedBits(Op, DemandedBits);
}
@@ -345,7 +345,7 @@ namespace {
return false;
unsigned NumElts = Op.getValueType().getVectorNumElements();
- APInt DemandedElts = APInt::getAllOnesValue(NumElts);
+ APInt DemandedElts = APInt::getAllOnes(NumElts);
return SimplifyDemandedVectorElts(Op, DemandedElts);
}
@@ -436,7 +436,7 @@ namespace {
SDValue visitOR(SDNode *N);
SDValue visitORLike(SDValue N0, SDValue N1, SDNode *N);
SDValue visitXOR(SDNode *N);
- SDValue SimplifyVBinOp(SDNode *N);
+ SDValue SimplifyVBinOp(SDNode *N, const SDLoc &DL);
SDValue visitSHL(SDNode *N);
SDValue visitSRA(SDNode *N);
SDValue visitSRL(SDNode *N);
@@ -515,6 +515,7 @@ namespace {
SDValue visitFP_TO_FP16(SDNode *N);
SDValue visitFP16_TO_FP(SDNode *N);
SDValue visitVECREDUCE(SDNode *N);
+ SDValue visitVPOp(SDNode *N);
SDValue visitFADDForFMACombine(SDNode *N);
SDValue visitFSUBForFMACombine(SDNode *N);
@@ -615,7 +616,7 @@ namespace {
SmallVectorImpl<SDValue> &Aliases);
/// Return true if there is any possibility that the two addresses overlap.
- bool isAlias(SDNode *Op0, SDNode *Op1) const;
+ bool mayAlias(SDNode *Op0, SDNode *Op1) const;
/// Walk up chain skipping non-aliasing memory nodes, looking for a better
/// chain (aliasing node.)
@@ -1062,21 +1063,22 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
if (N0.getOpcode() != Opc)
return SDValue();
- if (DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
- if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+
+ if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N01))) {
+ if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) {
// Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
- if (SDValue OpNode =
- DAG.FoldConstantArithmetic(Opc, DL, VT, {N0.getOperand(1), N1}))
- return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
+ if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, {N01, N1}))
+ return DAG.getNode(Opc, DL, VT, N00, OpNode);
return SDValue();
}
if (N0.hasOneUse()) {
// Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
// iff (op x, c1) has one use
- SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1);
- if (!OpNode.getNode())
- return SDValue();
- return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1));
+ if (SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1))
+ return DAG.getNode(Opc, DL, VT, OpNode, N01);
+ return SDValue();
}
}
return SDValue();
@@ -1738,6 +1740,9 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::VECREDUCE_UMIN:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN: return visitVECREDUCE(N);
+#define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) case ISD::SDOPC:
+#include "llvm/IR/VPIntrinsics.def"
+ return visitVPOp(N);
}
return SDValue();
}
@@ -2257,7 +2262,7 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
// fold vector ops
if (VT.isVector()) {
- if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
return FoldedVOp;
// fold (add x, 0) -> x, vector edition
@@ -2439,9 +2444,7 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
N0.getOperand(0));
// fold (add (add (xor a, -1), b), 1) -> (sub b, a)
- if (N0.getOpcode() == ISD::ADD ||
- N0.getOpcode() == ISD::UADDO ||
- N0.getOpcode() == ISD::SADDO) {
+ if (N0.getOpcode() == ISD::ADD) {
SDValue A, Xor;
if (isBitwiseNot(N0.getOperand(0))) {
@@ -2783,7 +2786,7 @@ static SDValue extractBooleanFlip(SDValue V, SelectionDAG &DAG,
IsFlip = Const->isOne();
break;
case TargetLowering::ZeroOrNegativeOneBooleanContent:
- IsFlip = Const->isAllOnesValue();
+ IsFlip = Const->isAllOnes();
break;
case TargetLowering::UndefinedBooleanContent:
IsFlip = (Const->getAPIntValue() & 0x01) == 1;
@@ -3259,7 +3262,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
// fold vector ops
if (VT.isVector()) {
- if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
return FoldedVOp;
// fold (sub x, 0) -> x, vector edition
@@ -3317,11 +3320,10 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
}
// Convert 0 - abs(x).
- SDValue Result;
if (N1->getOpcode() == ISD::ABS &&
- !TLI.isOperationLegalOrCustom(ISD::ABS, VT) &&
- TLI.expandABS(N1.getNode(), Result, DAG, true))
- return Result;
+ !TLI.isOperationLegalOrCustom(ISD::ABS, VT))
+ if (SDValue Result = TLI.expandABS(N1.getNode(), DAG, true))
+ return Result;
// Fold neg(splat(neg(x)) -> splat(x)
if (VT.isVector()) {
@@ -3785,7 +3787,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
// fold vector ops
if (VT.isVector()) {
- if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
return FoldedVOp;
N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1);
@@ -3810,18 +3812,18 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0);
// fold (mul x, 0) -> 0
- if (N1IsConst && ConstValue1.isNullValue())
+ if (N1IsConst && ConstValue1.isZero())
return N1;
// fold (mul x, 1) -> x
- if (N1IsConst && ConstValue1.isOneValue())
+ if (N1IsConst && ConstValue1.isOne())
return N0;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// fold (mul x, -1) -> 0-x
- if (N1IsConst && ConstValue1.isAllOnesValue()) {
+ if (N1IsConst && ConstValue1.isAllOnes()) {
SDLoc DL(N);
return DAG.getNode(ISD::SUB, DL, VT,
DAG.getConstant(0, DL, VT), N0);
@@ -3839,7 +3841,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
}
// fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
- if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2()) {
+ if (N1IsConst && !N1IsOpaqueConst && ConstValue1.isNegatedPowerOf2()) {
unsigned Log2Val = (-ConstValue1).logBase2();
SDLoc DL(N);
// FIXME: If the input is something that is easily negated (e.g. a
@@ -3968,7 +3970,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
SmallBitVector ClearMask;
ClearMask.reserve(NumElts);
auto IsClearMask = [&ClearMask](ConstantSDNode *V) {
- if (!V || V->isNullValue()) {
+ if (!V || V->isZero()) {
ClearMask.push_back(true);
return true;
}
@@ -4054,9 +4056,7 @@ SDValue DAGCombiner::useDivRem(SDNode *Node) {
SDValue Op0 = Node->getOperand(0);
SDValue Op1 = Node->getOperand(1);
SDValue combined;
- for (SDNode::use_iterator UI = Op0.getNode()->use_begin(),
- UE = Op0.getNode()->use_end(); UI != UE; ++UI) {
- SDNode *User = *UI;
+ for (SDNode *User : Op0.getNode()->uses()) {
if (User == Node || User->getOpcode() == ISD::DELETED_NODE ||
User->use_empty())
continue;
@@ -4113,7 +4113,7 @@ static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
// 0 / X -> 0
// 0 % X -> 0
ConstantSDNode *N0C = isConstOrConstSplat(N0);
- if (N0C && N0C->isNullValue())
+ if (N0C && N0C->isZero())
return N0;
// X / X -> 1
@@ -4138,21 +4138,20 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT CCVT = getSetCCResultType(VT);
+ SDLoc DL(N);
// fold vector ops
if (VT.isVector())
- if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
return FoldedVOp;
- SDLoc DL(N);
-
// fold (sdiv c1, c2) -> c1/c2
ConstantSDNode *N1C = isConstOrConstSplat(N1);
if (SDValue C = DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, {N0, N1}))
return C;
// fold (sdiv X, -1) -> 0-X
- if (N1C && N1C->isAllOnesValue())
+ if (N1C && N1C->isAllOnes())
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0);
// fold (sdiv X, MIN_SIGNED) -> select(X == MIN_SIGNED, 1, 0)
@@ -4206,11 +4205,11 @@ SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) {
// Helper for determining whether a value is a power-2 constant scalar or a
// vector of such elements.
auto IsPowerOfTwo = [](ConstantSDNode *C) {
- if (C->isNullValue() || C->isOpaque())
+ if (C->isZero() || C->isOpaque())
return false;
if (C->getAPIntValue().isPowerOf2())
return true;
- if ((-C->getAPIntValue()).isPowerOf2())
+ if (C->getAPIntValue().isNegatedPowerOf2())
return true;
return false;
};
@@ -4283,21 +4282,20 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT CCVT = getSetCCResultType(VT);
+ SDLoc DL(N);
// fold vector ops
if (VT.isVector())
- if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
return FoldedVOp;
- SDLoc DL(N);
-
// fold (udiv c1, c2) -> c1/c2
ConstantSDNode *N1C = isConstOrConstSplat(N1);
if (SDValue C = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, {N0, N1}))
return C;
// fold (udiv X, -1) -> select(X == -1, 1, 0)
- if (N1C && N1C->getAPIntValue().isAllOnesValue())
+ if (N1C && N1C->isAllOnes())
return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
DAG.getConstant(1, DL, VT),
DAG.getConstant(0, DL, VT));
@@ -4393,7 +4391,7 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
return C;
// fold (urem X, -1) -> select(X == -1, 0, x)
- if (!isSigned && N1C && N1C->getAPIntValue().isAllOnesValue())
+ if (!isSigned && N1C && N1C->isAllOnes())
return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
DAG.getConstant(0, DL, VT), N0);
@@ -4477,6 +4475,11 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) {
if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHS, DL, VT, {N0, N1}))
return C;
+ // canonicalize constant to RHS.
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(ISD::MULHS, DL, N->getVTList(), N1, N0);
+
// fold (mulhs x, 0) -> 0
if (isNullConstant(N1))
return N1;
@@ -4529,6 +4532,11 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHU, DL, VT, {N0, N1}))
return C;
+ // canonicalize constant to RHS.
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(ISD::MULHU, DL, N->getVTList(), N1, N0);
+
// fold (mulhu x, 0) -> 0
if (isNullConstant(N1))
return N1;
@@ -4569,6 +4577,12 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
}
}
+ // Simplify the operands using demanded-bits information.
+ // We don't have demanded bits support for MULHU so this just enables constant
+ // folding based on known bits.
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
return SDValue();
}
@@ -4770,20 +4784,21 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
unsigned Opcode = N->getOpcode();
+ SDLoc DL(N);
// fold vector ops
if (VT.isVector())
- if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
return FoldedVOp;
// fold operation with constant operands.
- if (SDValue C = DAG.FoldConstantArithmetic(Opcode, SDLoc(N), VT, {N0, N1}))
+ if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
return C;
// canonicalize constant to RHS
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
!DAG.isConstantIntBuildVectorOrConstantInt(N1))
- return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);
+ return DAG.getNode(N->getOpcode(), DL, VT, N1, N0);
// If sign bits are zero, flip between UMIN/UMAX and SMIN/SMAX.
// Only do this if the current op isn't legal and the flipped is.
@@ -4799,7 +4814,7 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
default: llvm_unreachable("Unknown MINMAX opcode");
}
if (TLI.isOperationLegal(AltOpcode, VT))
- return DAG.getNode(AltOpcode, SDLoc(N), VT, N0, N1);
+ return DAG.getNode(AltOpcode, DL, VT, N0, N1);
}
// Simplify the operands using demanded-bits information.
@@ -5135,8 +5150,9 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL))
return V;
+ // TODO: Rewrite this to return a new 'AND' instead of using CombineTo.
if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&
- VT.getSizeInBits() <= 64) {
+ VT.getSizeInBits() <= 64 && N0->hasOneUse()) {
if (ConstantSDNode *ADDI = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
if (ConstantSDNode *SRLI = dyn_cast<ConstantSDNode>(N1.getOperand(1))) {
// Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal
@@ -5608,6 +5624,39 @@ static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) {
return DAG.getZExtOrTrunc(Setcc, DL, VT);
}
+/// For targets that support usubsat, match a bit-hack form of that operation
+/// that ends in 'and' and convert it.
+static SDValue foldAndToUsubsat(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N1.getValueType();
+
+ // Canonicalize SRA as operand 1.
+ if (N0.getOpcode() == ISD::SRA)
+ std::swap(N0, N1);
+
+ // xor/add with SMIN (signmask) are logically equivalent.
+ if (N0.getOpcode() != ISD::XOR && N0.getOpcode() != ISD::ADD)
+ return SDValue();
+
+ if (N1.getOpcode() != ISD::SRA || !N0.hasOneUse() || !N1.hasOneUse() ||
+ N0.getOperand(0) != N1.getOperand(0))
+ return SDValue();
+
+ unsigned BitWidth = VT.getScalarSizeInBits();
+ ConstantSDNode *XorC = isConstOrConstSplat(N0.getOperand(1), true);
+ ConstantSDNode *SraC = isConstOrConstSplat(N1.getOperand(1), true);
+ if (!XorC || !XorC->getAPIntValue().isSignMask() ||
+ !SraC || SraC->getAPIntValue() != BitWidth - 1)
+ return SDValue();
+
+ // (i8 X ^ 128) & (i8 X s>> 7) --> usubsat X, 128
+ // (i8 X + 128) & (i8 X s>> 7) --> usubsat X, 128
+ SDLoc DL(N);
+ SDValue SignMask = DAG.getConstant(XorC->getAPIntValue(), DL, VT);
+ return DAG.getNode(ISD::USUBSAT, DL, VT, N0.getOperand(0), SignMask);
+}
+
SDValue DAGCombiner::visitAND(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
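foldAndToUsubsat relies on a bit-hack identity: xor-ing (or adding) the sign mask and then and-ing with the sign-splat of the same value equals a saturating unsigned subtraction of the sign mask. An exhaustive 8-bit check of that identity in plain C++, assuming the usual two's-complement arithmetic right shift:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned v = 0; v < 256; ++v) {
    uint8_t X = static_cast<uint8_t>(v);
    // 0xFF if the sign bit of X is set, 0x00 otherwise (i8 X s>> 7).
    uint8_t Mask = static_cast<uint8_t>(static_cast<int8_t>(X) >> 7);
    uint8_t ViaXor = static_cast<uint8_t>((X ^ 0x80) & Mask);
    uint8_t ViaAdd = static_cast<uint8_t>((X + 0x80) & Mask);
    uint8_t USubSat = X >= 0x80 ? static_cast<uint8_t>(X - 0x80) : 0;
    assert(ViaXor == USubSat && ViaAdd == USubSat);
  }
  return 0;
}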
@@ -5619,17 +5668,17 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
// fold vector ops
if (VT.isVector()) {
- if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
return FoldedVOp;
// fold (and x, 0) -> 0, vector edition
if (ISD::isConstantSplatVectorAllZeros(N0.getNode()))
// do not return N0, because undef node may exist in N0
- return DAG.getConstant(APInt::getNullValue(N0.getScalarValueSizeInBits()),
+ return DAG.getConstant(APInt::getZero(N0.getScalarValueSizeInBits()),
SDLoc(N), N0.getValueType());
if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
// do not return N1, because undef node may exist in N1
- return DAG.getConstant(APInt::getNullValue(N1.getScalarValueSizeInBits()),
+ return DAG.getConstant(APInt::getZero(N1.getScalarValueSizeInBits()),
SDLoc(N), N1.getValueType());
// fold (and x, -1) -> x, vector edition
@@ -5680,8 +5729,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
// if (and x, c) is known to be zero, return 0
unsigned BitWidth = VT.getScalarSizeInBits();
- if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
- APInt::getAllOnesValue(BitWidth)))
+ if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(BitWidth)))
return DAG.getConstant(0, SDLoc(N), VT);
if (SDValue NewSel = foldBinOpIntoSelect(N))
@@ -5743,7 +5791,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
// Get the constant (if applicable) the zero'th operand is being ANDed with.
// This can be a pure constant or a vector splat, in which case we treat the
// vector as a scalar and use the splat value.
- APInt Constant = APInt::getNullValue(1);
+ APInt Constant = APInt::getZero(1);
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
Constant = C->getAPIntValue();
} else if (BuildVectorSDNode *Vector = dyn_cast<BuildVectorSDNode>(N1)) {
@@ -5774,7 +5822,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
// Make sure that variable 'Constant' is only set if 'SplatBitSize' is a
// multiple of 'BitWidth'. Otherwise, we could propagate a wrong value.
if ((SplatBitSize % EltBitWidth) == 0) {
- Constant = APInt::getAllOnesValue(EltBitWidth);
+ Constant = APInt::getAllOnes(EltBitWidth);
for (unsigned i = 0, n = (SplatBitSize / EltBitWidth); i < n; ++i)
Constant &= SplatValue.extractBits(EltBitWidth, i * EltBitWidth);
}
@@ -5801,7 +5849,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
case ISD::NON_EXTLOAD: B = true; break;
}
- if (B && Constant.isAllOnesValue()) {
+ if (B && Constant.isAllOnes()) {
// If the load type was an EXTLOAD, convert to ZEXTLOAD in order to
// preserve semantics once we get rid of the AND.
SDValue NewLoad(Load, 0);
@@ -5971,6 +6019,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
if (IsAndZeroExtMask(N0, N1))
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0.getOperand(0));
+ if (hasOperation(ISD::USUBSAT, VT))
+ if (SDValue V = foldAndToUsubsat(N, DAG))
+ return V;
+
return SDValue();
}
@@ -6385,7 +6437,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
// fold vector ops
if (VT.isVector()) {
- if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
return FoldedVOp;
// fold (or x, 0) -> x, vector edition
@@ -6926,17 +6978,16 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
// a rot[lr]. This also matches funnel shift patterns, similar to rotation but
// with different shifted sources.
SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
- // Must be a legal type. Expanded 'n promoted things won't work with rotates.
EVT VT = LHS.getValueType();
- if (!TLI.isTypeLegal(VT))
- return SDValue();
// The target must have at least one rotate/funnel flavor.
+ // We still try to match rotate by constant pre-legalization.
+ // TODO: Support pre-legalization funnel-shift by constant.
bool HasROTL = hasOperation(ISD::ROTL, VT);
bool HasROTR = hasOperation(ISD::ROTR, VT);
bool HasFSHL = hasOperation(ISD::FSHL, VT);
bool HasFSHR = hasOperation(ISD::FSHR, VT);
- if (!HasROTL && !HasROTR && !HasFSHL && !HasFSHR)
+ if (LegalOperations && !HasROTL && !HasROTR && !HasFSHL && !HasFSHR)
return SDValue();
// Check for truncated rotate.
@@ -6989,6 +7040,7 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
if (LHSShift.getOpcode() == RHSShift.getOpcode())
return SDValue(); // Shifts must disagree.
+ // TODO: Support pre-legalization funnel-shift by constant.
bool IsRotate = LHSShift.getOperand(0) == RHSShift.getOperand(0);
if (!IsRotate && !(HasFSHL || HasFSHR))
return SDValue(); // Requires funnel shift support.
@@ -7017,12 +7069,15 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
};
if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
SDValue Res;
- if (IsRotate && (HasROTL || HasROTR))
- Res = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg,
- HasROTL ? LHSShiftAmt : RHSShiftAmt);
- else
- Res = DAG.getNode(HasFSHL ? ISD::FSHL : ISD::FSHR, DL, VT, LHSShiftArg,
- RHSShiftArg, HasFSHL ? LHSShiftAmt : RHSShiftAmt);
+ if (IsRotate && (HasROTL || HasROTR || !(HasFSHL || HasFSHR))) {
+ bool UseROTL = !LegalOperations || HasROTL;
+ Res = DAG.getNode(UseROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg,
+ UseROTL ? LHSShiftAmt : RHSShiftAmt);
+ } else {
+ bool UseFSHL = !LegalOperations || HasFSHL;
+ Res = DAG.getNode(UseFSHL ? ISD::FSHL : ISD::FSHR, DL, VT, LHSShiftArg,
+ RHSShiftArg, UseFSHL ? LHSShiftAmt : RHSShiftAmt);
+ }
// If there is an AND of either shifted operand, apply it to the result.
if (LHSMask.getNode() || RHSMask.getNode()) {
@@ -7046,6 +7101,11 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
return Res;
}
+ // Even pre-legalization, we can't easily rotate/funnel-shift by a variable
+ // shift.
+ if (!HasROTL && !HasROTR && !HasFSHL && !HasFSHR)
+ return SDValue();
+
// If there is a mask here, and we have a variable shift, we can't be sure
// that we're masking out the right stuff.
if (LHSMask.getNode() || RHSMask.getNode())
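The relaxed MatchRotate above now forms rotates by constant even before ROTL/ROTR are legal for the type; the underlying identity is that two shifts of the same value whose constant amounts sum to the bit width compose into a rotate. A standalone sketch:

#include <cassert>
#include <cstdint>

static uint32_t TwoShiftRotl(uint32_t X, unsigned C) {
  // Matches the (shl X, C) | (srl X, BW - C) pattern; C must be in (0, 32).
  return (X << C) | (X >> (32 - C));
}

int main() {
  assert(TwoShiftRotl(0x80000001u, 1) == 0x00000003u);
  assert(TwoShiftRotl(0x12345678u, 8) == 0x34567812u);
  return 0;
}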
@@ -7297,7 +7357,7 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) {
// TODO: If there is evidence that running this later would help, this
// limitation could be removed. Legality checks may need to be added
// for the created store and optional bswap/rotate.
- if (LegalOperations)
+ if (LegalOperations || OptLevel == CodeGenOpt::None)
return SDValue();
// We only handle merging simple stores of 1-4 bytes.
@@ -7672,9 +7732,12 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
// | D |
// Into:
// (x & m) | (y & ~m)
-// If y is a constant, and the 'andn' does not work with immediates,
-// we unfold into a different pattern:
+// If y is a constant, m is not a 'not', and the 'andn' does not work with
+// immediates, we unfold into a different pattern:
// ~(~x & m) & (m | y)
+// If x is a constant, m is a 'not', and the 'andn' does not work with
+// immediates, we unfold into a different pattern:
+// (x | ~m) & ~(~m & ~y)
// NOTE: we don't unfold the pattern if 'xor' is actually a 'not', because at
// the very least that breaks andnpd / andnps patterns, and because those
// patterns are simplified in IR and shouldn't be created in the DAG
@@ -7729,8 +7792,9 @@ SDValue DAGCombiner::unfoldMaskedMerge(SDNode *N) {
SDLoc DL(N);
- // If Y is a constant, check that 'andn' works with immediates.
- if (!TLI.hasAndNot(Y)) {
+ // If Y is a constant, check that 'andn' works with immediates. Unless M is
+ // a bitwise not that would already allow ANDN to be used.
+ if (!TLI.hasAndNot(Y) && !isBitwiseNot(M)) {
assert(TLI.hasAndNot(X) && "Only mask is a variable? Unreachable.");
// If not, we need to do a bit more work to make sure andn is still used.
SDValue NotX = DAG.getNOT(DL, X, VT);
@@ -7740,6 +7804,19 @@ SDValue DAGCombiner::unfoldMaskedMerge(SDNode *N) {
return DAG.getNode(ISD::AND, DL, VT, NotLHS, RHS);
}
+ // If X is a constant and M is a bitwise not, check that 'andn' works with
+ // immediates.
+ if (!TLI.hasAndNot(X) && isBitwiseNot(M)) {
+ assert(TLI.hasAndNot(Y) && "Only mask is a variable? Unreachable.");
+ // If not, we need to do a bit more work to make sure andn is still used.
+ SDValue NotM = M.getOperand(0);
+ SDValue LHS = DAG.getNode(ISD::OR, DL, VT, X, NotM);
+ SDValue NotY = DAG.getNOT(DL, Y, VT);
+ SDValue RHS = DAG.getNode(ISD::AND, DL, VT, NotM, NotY);
+ SDValue NotRHS = DAG.getNOT(DL, RHS, VT);
+ return DAG.getNode(ISD::AND, DL, VT, LHS, NotRHS);
+ }
+
SDValue LHS = DAG.getNode(ISD::AND, DL, VT, X, M);
SDValue NotM = DAG.getNOT(DL, M, VT);
SDValue RHS = DAG.getNode(ISD::AND, DL, VT, Y, NotM);
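unfoldMaskedMerge and its new branch rewrite (x & m) | (y & ~m) into forms where the 'not' lands on an operand that ANDN can absorb. An exhaustive 8-bit check in plain C++ that the plain masked merge and both rewritten forms from the comment earlier in this file agree:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned x = 0; x < 256; ++x)
    for (unsigned y = 0; y < 256; ++y)
      for (unsigned m = 0; m < 256; ++m) {
        uint8_t X = x, Y = y, M = m;
        uint8_t Merge = (X & M) | (Y & ~M);
        uint8_t FormA = ~(~X & M) & (M | Y);   // used when Y is a constant
        uint8_t FormB = (X | ~M) & ~(~M & ~Y); // used when X is a constant
        assert(FormA == Merge && FormB == Merge);
      }
  return 0;
}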
@@ -7751,10 +7828,11 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
+ SDLoc DL(N);
// fold vector ops
if (VT.isVector()) {
- if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
return FoldedVOp;
// fold (xor x, 0) -> x, vector edition
@@ -7765,7 +7843,6 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
}
// fold (xor undef, undef) -> 0. This is a common idiom (misuse).
- SDLoc DL(N);
if (N0.isUndef() && N1.isUndef())
return DAG.getConstant(0, DL, VT);
@@ -7900,7 +7977,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
// shift has been simplified to undef.
uint64_t ShiftAmt = ShiftC->getLimitedValue();
if (ShiftAmt < BitWidth) {
- APInt Ones = APInt::getAllOnesValue(BitWidth);
+ APInt Ones = APInt::getAllOnes(BitWidth);
Ones = N0Opcode == ISD::SHL ? Ones.shl(ShiftAmt) : Ones.lshr(ShiftAmt);
if (XorC->getAPIntValue() == Ones) {
// If the xor constant is a shifted -1, do a 'not' before the shift:
@@ -8223,7 +8300,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
// fold vector ops
if (VT.isVector()) {
- if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
return FoldedVOp;
BuildVectorSDNode *N1CV = dyn_cast<BuildVectorSDNode>(N1);
@@ -8256,8 +8333,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
return NewSel;
// if (shl x, c) is known to be zero, return 0
- if (DAG.MaskedValueIsZero(SDValue(N, 0),
- APInt::getAllOnesValue(OpSizeInBits)))
+ if (DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(OpSizeInBits)))
return DAG.getConstant(0, SDLoc(N), VT);
// fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
@@ -8502,28 +8578,43 @@ static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG,
// Both operands must be equivalent extend nodes.
SDValue LeftOp = ShiftOperand.getOperand(0);
SDValue RightOp = ShiftOperand.getOperand(1);
+
bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND;
bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND;
- if ((!(IsSignExt || IsZeroExt)) || LeftOp.getOpcode() != RightOp.getOpcode())
+ if (!IsSignExt && !IsZeroExt)
return SDValue();
- EVT WideVT1 = LeftOp.getValueType();
- EVT WideVT2 = RightOp.getValueType();
- (void)WideVT2;
+ EVT NarrowVT = LeftOp.getOperand(0).getValueType();
+ unsigned NarrowVTSize = NarrowVT.getScalarSizeInBits();
+
+ SDValue MulhRightOp;
+ if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
+ unsigned ActiveBits = IsSignExt
+ ? Constant->getAPIntValue().getMinSignedBits()
+ : Constant->getAPIntValue().getActiveBits();
+ if (ActiveBits > NarrowVTSize)
+ return SDValue();
+ MulhRightOp = DAG.getConstant(
+ Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
+ NarrowVT);
+ } else {
+ if (LeftOp.getOpcode() != RightOp.getOpcode())
+ return SDValue();
+ // Check that the two extend nodes are the same type.
+ if (NarrowVT != RightOp.getOperand(0).getValueType())
+ return SDValue();
+ MulhRightOp = RightOp.getOperand(0);
+ }
+
+ EVT WideVT = LeftOp.getValueType();
// Proceed with the transformation if the wide types match.
- assert((WideVT1 == WideVT2) &&
+ assert((WideVT == RightOp.getValueType()) &&
"Cannot have a multiply node with two different operand types.");
- EVT NarrowVT = LeftOp.getOperand(0).getValueType();
- // Check that the two extend nodes are the same type.
- if (NarrowVT != RightOp.getOperand(0).getValueType())
- return SDValue();
-
// Proceed with the transformation if the wide type is twice as large
// as the narrow type.
- unsigned NarrowVTSize = NarrowVT.getScalarSizeInBits();
- if (WideVT1.getScalarSizeInBits() != 2 * NarrowVTSize)
+ if (WideVT.getScalarSizeInBits() != 2 * NarrowVTSize)
return SDValue();
// Check the shift amount with the narrow type size.
@@ -8541,10 +8632,10 @@ static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG,
if (!TLI.isOperationLegalOrCustom(MulhOpcode, NarrowVT))
return SDValue();
- SDValue Result = DAG.getNode(MulhOpcode, DL, NarrowVT, LeftOp.getOperand(0),
- RightOp.getOperand(0));
- return (N->getOpcode() == ISD::SRA ? DAG.getSExtOrTrunc(Result, DL, WideVT1)
- : DAG.getZExtOrTrunc(Result, DL, WideVT1));
+ SDValue Result =
+ DAG.getNode(MulhOpcode, DL, NarrowVT, LeftOp.getOperand(0), MulhRightOp);
+ return (N->getOpcode() == ISD::SRA ? DAG.getSExtOrTrunc(Result, DL, WideVT)
+ : DAG.getZExtOrTrunc(Result, DL, WideVT));
}
SDValue DAGCombiner::visitSRA(SDNode *N) {
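combineShiftToMULH now also matches a constant RHS whose active bits fit the narrow type, keeping it as a narrow constant instead of requiring a matching extend node. The underlying fact is that the high half of a widened multiply is exactly what a MULHS/MULHU node computes on the narrow type; a scalar sketch, assuming two's-complement arithmetic right shift:

#include <cassert>
#include <cstdint>

static int16_t HighHalfViaWiden(int16_t A, int16_t B) {
  int32_t Wide = int32_t(A) * int32_t(B);  // sext both operands, multiply wide
  return static_cast<int16_t>(Wide >> 16); // shift by the narrow bit width
}

int main() {
  assert(HighHalfViaWiden(-30000, 3) == -2);     // (-90000) >> 16 == -2
  assert(HighHalfViaWiden(0x4000, 0x0004) == 1); // 0x10000 >> 16 == 1
  // A constant such as 3 has fewer active bits than 16, so the combine can
  // keep it as a narrow constant rather than demanding an extend node.
  return 0;
}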
@@ -8564,7 +8655,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
// fold vector ops
if (VT.isVector())
- if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
return FoldedVOp;
ConstantSDNode *N1C = isConstOrConstSplat(N1);
@@ -8762,7 +8853,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
// fold vector ops
if (VT.isVector())
- if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
return FoldedVOp;
ConstantSDNode *N1C = isConstOrConstSplat(N1);
@@ -8775,8 +8866,8 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
return NewSel;
// if (srl x, c) is known to be zero, return 0
- if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
- APInt::getAllOnesValue(OpSizeInBits)))
+ if (N1C &&
+ DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(OpSizeInBits)))
return DAG.getConstant(0, SDLoc(N), VT);
// fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2))
@@ -9358,27 +9449,27 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
// is also a target-independent combine here in DAGCombiner in the other
// direction for (select Cond, -1, 0) when the condition is not i1.
if (CondVT == MVT::i1 && !LegalOperations) {
- if (C1->isNullValue() && C2->isOne()) {
+ if (C1->isZero() && C2->isOne()) {
// select Cond, 0, 1 --> zext (!Cond)
SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
if (VT != MVT::i1)
NotCond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotCond);
return NotCond;
}
- if (C1->isNullValue() && C2->isAllOnesValue()) {
+ if (C1->isZero() && C2->isAllOnes()) {
// select Cond, 0, -1 --> sext (!Cond)
SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
if (VT != MVT::i1)
NotCond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NotCond);
return NotCond;
}
- if (C1->isOne() && C2->isNullValue()) {
+ if (C1->isOne() && C2->isZero()) {
// select Cond, 1, 0 --> zext (Cond)
if (VT != MVT::i1)
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
return Cond;
}
- if (C1->isAllOnesValue() && C2->isNullValue()) {
+ if (C1->isAllOnes() && C2->isZero()) {
// select Cond, -1, 0 --> sext (Cond)
if (VT != MVT::i1)
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
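The renamed predicates above guard the usual boolean-select folds: with an i1 condition, selecting between the constants 0/1 or 0/-1 is just a zero or sign extension of the (possibly inverted) condition. A scalar sketch:

#include <cassert>
#include <cstdint>

int main() {
  for (bool Cond : {false, true}) {
    int32_t ZextNot = static_cast<int32_t>(!Cond);  // select Cond, 0, 1
    int32_t SextNot = -static_cast<int32_t>(!Cond); // select Cond, 0, -1
    int32_t Zext = static_cast<int32_t>(Cond);      // select Cond, 1, 0
    int32_t Sext = -static_cast<int32_t>(Cond);     // select Cond, -1, 0
    assert(ZextNot == (Cond ? 0 : 1));
    assert(SextNot == (Cond ? 0 : -1));
    assert(Zext == (Cond ? 1 : 0));
    assert(Sext == (Cond ? -1 : 0));
  }
  return 0;
}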
@@ -9406,7 +9497,7 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
}
// select Cond, Pow2, 0 --> (zext Cond) << log2(Pow2)
- if (C1Val.isPowerOf2() && C2Val.isNullValue()) {
+ if (C1Val.isPowerOf2() && C2Val.isZero()) {
if (VT != MVT::i1)
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
SDValue ShAmtC = DAG.getConstant(C1Val.exactLogBase2(), DL, VT);
@@ -9434,7 +9525,7 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
TargetLowering::ZeroOrOneBooleanContent &&
TLI.getBooleanContents(/*isVec*/false, /*isFloat*/false) ==
TargetLowering::ZeroOrOneBooleanContent &&
- C1->isNullValue() && C2->isOne()) {
+ C1->isZero() && C2->isOne()) {
SDValue NotCond =
DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT));
if (VT.bitsEq(CondVT))
@@ -9479,6 +9570,64 @@ static SDValue foldBoolSelectToLogic(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue foldVSelectToSignBitSplatMask(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+ if (N0.getOpcode() != ISD::SETCC || !N0.hasOneUse())
+ return SDValue();
+
+ SDValue Cond0 = N0.getOperand(0);
+ SDValue Cond1 = N0.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+ if (VT != Cond0.getValueType())
+ return SDValue();
+
+ // Match a signbit check of Cond0 as "Cond0 s<0". Swap select operands if the
+ // compare is inverted from that pattern ("Cond0 s> -1").
+ if (CC == ISD::SETLT && isNullOrNullSplat(Cond1))
+ ; // This is the pattern we are looking for.
+ else if (CC == ISD::SETGT && isAllOnesOrAllOnesSplat(Cond1))
+ std::swap(N1, N2);
+ else
+ return SDValue();
+
+ // (Cond0 s< 0) ? N1 : 0 --> (Cond0 s>> BW-1) & N1
+ if (isNullOrNullSplat(N2)) {
+ SDLoc DL(N);
+ SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
+ SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
+ return DAG.getNode(ISD::AND, DL, VT, Sra, N1);
+ }
+
+ // (Cond0 s< 0) ? -1 : N2 --> (Cond0 s>> BW-1) | N2
+ if (isAllOnesOrAllOnesSplat(N1)) {
+ SDLoc DL(N);
+ SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
+ SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
+ return DAG.getNode(ISD::OR, DL, VT, Sra, N2);
+ }
+
+ // If we have to invert the sign bit mask, only do that transform if the
+ // target has a bitwise 'and not' instruction (the invert is free).
+ // (Cond0 s< 0) ? 0 : N2 --> ~(Cond0 s>> BW-1) & N2
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (isNullOrNullSplat(N1) && TLI.hasAndNot(N1)) {
+ SDLoc DL(N);
+ SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
+ SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
+ SDValue Not = DAG.getNOT(DL, Sra, VT);
+ return DAG.getNode(ISD::AND, DL, VT, Not, N2);
+ }
+
+ // TODO: There's another pattern in this family, but it may require
+ // implementing hasOrNot() to check for profitability:
+ // (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | N2
+
+ return SDValue();
+}
+
SDValue DAGCombiner::visitSELECT(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
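The new foldVSelectToSignBitSplatMask turns a vector select on a sign-bit compare into shift-and-mask arithmetic. A scalar sketch of the two main patterns, assuming two's-complement arithmetic right shift:

#include <cassert>
#include <cstdint>

int main() {
  int32_t Vals[] = {-7, 0, 42, INT32_MIN};
  for (int32_t X : Vals) {
    int32_t Mask = X >> 31; // all-ones if X s< 0, else zero (sign-bit splat)
    int32_t N1 = 0x1234, N2 = 0x5678;
    assert((Mask & N1) == (X < 0 ? N1 : 0));  // (Cond0 s< 0) ? N1 : 0
    assert((Mask | N2) == (X < 0 ? -1 : N2)); // (Cond0 s< 0) ? -1 : N2
  }
  return 0;
}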
@@ -9703,8 +9852,8 @@ static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
"same value. This should have been addressed before this function.");
return DAG.getNode(
ISD::CONCAT_VECTORS, DL, VT,
- BottomHalf->isNullValue() ? RHS->getOperand(0) : LHS->getOperand(0),
- TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1));
+ BottomHalf->isZero() ? RHS->getOperand(0) : LHS->getOperand(0),
+ TopHalf->isZero() ? RHS->getOperand(1) : LHS->getOperand(1));
}
bool refineUniformBase(SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG) {
@@ -10169,6 +10318,10 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
if (SDValue V = foldVSelectOfConstants(N))
return V;
+ if (hasOperation(ISD::SRA, VT))
+ if (SDValue V = foldVSelectToSignBitSplatMask(N, DAG))
+ return V;
+
return SDValue();
}
@@ -10190,7 +10343,7 @@ SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
AddToWorklist(SCC.getNode());
if (ConstantSDNode *SCCC = dyn_cast<ConstantSDNode>(SCC.getNode())) {
- if (!SCCC->isNullValue())
+ if (!SCCC->isZero())
return N2; // cond always true -> true val
else
return N3; // cond always false -> false val
@@ -10248,13 +10401,13 @@ SDValue DAGCombiner::visitSETCC(SDNode *N) {
// Is 'X Cond C' always true or false?
auto IsAlwaysTrueOrFalse = [](ISD::CondCode Cond, ConstantSDNode *C) {
- bool False = (Cond == ISD::SETULT && C->isNullValue()) ||
+ bool False = (Cond == ISD::SETULT && C->isZero()) ||
(Cond == ISD::SETLT && C->isMinSignedValue()) ||
- (Cond == ISD::SETUGT && C->isAllOnesValue()) ||
+ (Cond == ISD::SETUGT && C->isAllOnes()) ||
(Cond == ISD::SETGT && C->isMaxSignedValue());
- bool True = (Cond == ISD::SETULE && C->isAllOnesValue()) ||
+ bool True = (Cond == ISD::SETULE && C->isAllOnes()) ||
(Cond == ISD::SETLE && C->isMaxSignedValue()) ||
- (Cond == ISD::SETUGE && C->isNullValue()) ||
+ (Cond == ISD::SETUGE && C->isZero()) ||
(Cond == ISD::SETGE && C->isMinSignedValue());
return True || False;
};
@@ -10863,7 +11016,7 @@ static SDValue tryToFoldExtOfMaskedLoad(SelectionDAG &DAG,
if (!Ld || Ld->getExtensionType() != ISD::NON_EXTLOAD)
return SDValue();
- if (!TLI.isLoadExtLegal(ExtLoadType, VT, Ld->getValueType(0)))
+ if (!TLI.isLoadExtLegalOrCustom(ExtLoadType, VT, Ld->getValueType(0)))
return SDValue();
if (!TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
@@ -11257,7 +11410,7 @@ static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op,
Known = DAG.computeKnownBits(Op);
- return (Known.Zero | 1).isAllOnesValue();
+ return (Known.Zero | 1).isAllOnes();
}
/// Given an extending node with a pop-count operand, if the target does not
@@ -12016,7 +12169,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1);
// If the input is already sign extended, just drop the extension.
- if (DAG.ComputeNumSignBits(N0) >= (VTBits - ExtVTBits + 1))
+ if (ExtVTBits >= DAG.ComputeMinSignedBits(N0))
return N0;
// fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
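
ComputeMinSignedBits reports the narrowest width that still represents the value as a signed integer, so the sign_extend_inreg check rewritten above can drop the extension whenever the extension width covers it. An editorial scalar model of that condition (minSignedBits and signExtendInReg are illustrative helpers, not LLVM APIs; assumes two's complement and arithmetic right shift):

    #include <cassert>
    #include <cstdint>

    static unsigned minSignedBits(int32_t X) {
      unsigned Bits = 32;
      // Strip redundant copies of the sign bit from the top.
      while (Bits > 1 && ((X >> (Bits - 2)) == 0 || (X >> (Bits - 2)) == -1))
        --Bits;
      return Bits;
    }

    static int32_t signExtendInReg(int32_t X, unsigned W) {
      // Sign-extend the low W bits over the full 32-bit width.
      return (int32_t)((uint32_t)X << (32 - W)) >> (32 - W);
    }

    int main() {
      const int32_t Vals[] = {0, 1, -1, 127, -128, 255, -32768,
                              INT32_MAX, INT32_MIN};
      for (int32_t X : Vals)
        for (unsigned W = minSignedBits(X); W <= 32; ++W)
          assert(signExtendInReg(X, W) == X); // the extension is a no-op
      return 0;
    }
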
@@ -12032,8 +12185,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) {
SDValue N00 = N0.getOperand(0);
unsigned N00Bits = N00.getScalarValueSizeInBits();
- if ((N00Bits <= ExtVTBits ||
- (N00Bits - DAG.ComputeNumSignBits(N00)) < ExtVTBits) &&
+ if ((N00Bits <= ExtVTBits || DAG.ComputeMinSignedBits(N00) <= ExtVTBits) &&
(!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);
}
@@ -12052,8 +12204,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
APInt DemandedSrcElts = APInt::getLowBitsSet(SrcElts, DstElts);
if ((N00Bits == ExtVTBits ||
(!IsZext && (N00Bits < ExtVTBits ||
- (N00Bits - DAG.ComputeNumSignBits(N00, DemandedSrcElts)) <
- ExtVTBits))) &&
+ DAG.ComputeMinSignedBits(N00) <= ExtVTBits))) &&
(!LegalOperations ||
TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT)))
return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT, N00);
@@ -12290,7 +12441,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
SDValue Amt = N0.getOperand(1);
KnownBits Known = DAG.computeKnownBits(Amt);
unsigned Size = VT.getScalarSizeInBits();
- if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) {
+ if (Known.countMaxActiveBits() <= Log2_32(Size)) {
SDLoc SL(N);
EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
@@ -12538,8 +12689,8 @@ static SDNode *getBuildPairElt(SDNode *N, unsigned i) {
SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
assert(N->getOpcode() == ISD::BUILD_PAIR);
- LoadSDNode *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
- LoadSDNode *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));
+ auto *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
+ auto *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));
// A BUILD_PAIR is always having the least significant part in elt 0 and the
// most significant part in elt 1. So when combining into one large load, we
@@ -12547,22 +12698,20 @@ SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
if (DAG.getDataLayout().isBigEndian())
std::swap(LD1, LD2);
- if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse() ||
+ if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !ISD::isNON_EXTLoad(LD2) ||
+ !LD1->hasOneUse() || !LD2->hasOneUse() ||
LD1->getAddressSpace() != LD2->getAddressSpace())
return SDValue();
+
+ bool LD1Fast = false;
EVT LD1VT = LD1->getValueType(0);
unsigned LD1Bytes = LD1VT.getStoreSize();
- if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() &&
- DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1)) {
- Align Alignment = LD1->getAlign();
- Align NewAlign = DAG.getDataLayout().getABITypeAlign(
- VT.getTypeForEVT(*DAG.getContext()));
-
- if (NewAlign <= Alignment &&
- (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)))
- return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(),
- LD1->getPointerInfo(), Alignment);
- }
+ if ((!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) &&
+ DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1) &&
+ TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ *LD1->getMemOperand(), &LD1Fast) && LD1Fast)
+ return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(),
+ LD1->getPointerInfo(), LD1->getAlign());
return SDValue();
}
@@ -12938,69 +13087,45 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
return ConstantFoldBITCASTofBUILD_VECTOR(Tmp, DstEltVT);
}
- SDLoc DL(BV);
-
// Okay, we know the src/dst types are both integers of differing types.
- // Handling growing first.
assert(SrcEltVT.isInteger() && DstEltVT.isInteger());
- if (SrcBitSize < DstBitSize) {
- unsigned NumInputsPerOutput = DstBitSize/SrcBitSize;
- SmallVector<SDValue, 8> Ops;
- for (unsigned i = 0, e = BV->getNumOperands(); i != e;
- i += NumInputsPerOutput) {
- bool isLE = DAG.getDataLayout().isLittleEndian();
- APInt NewBits = APInt(DstBitSize, 0);
- bool EltIsUndef = true;
- for (unsigned j = 0; j != NumInputsPerOutput; ++j) {
- // Shift the previously computed bits over.
- NewBits <<= SrcBitSize;
- SDValue Op = BV->getOperand(i+ (isLE ? (NumInputsPerOutput-j-1) : j));
- if (Op.isUndef()) continue;
- EltIsUndef = false;
-
- NewBits |= cast<ConstantSDNode>(Op)->getAPIntValue().
- zextOrTrunc(SrcBitSize).zext(DstBitSize);
- }
-
- if (EltIsUndef)
- Ops.push_back(DAG.getUNDEF(DstEltVT));
- else
- Ops.push_back(DAG.getConstant(NewBits, DL, DstEltVT));
- }
+ // TODO: Should ConstantFoldBITCASTofBUILD_VECTOR always take a
+ // BuildVectorSDNode?
+ auto *BVN = cast<BuildVectorSDNode>(BV);
- EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size());
- return DAG.getBuildVector(VT, DL, Ops);
- }
+ // Extract the constant raw bit data.
+ BitVector UndefElements;
+ SmallVector<APInt> RawBits;
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
+ if (!BVN->getConstantRawBits(IsLE, DstBitSize, RawBits, UndefElements))
+ return SDValue();
- // Finally, this must be the case where we are shrinking elements: each input
- // turns into multiple outputs.
- unsigned NumOutputsPerInput = SrcBitSize/DstBitSize;
- EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
- NumOutputsPerInput*BV->getNumOperands());
+ SDLoc DL(BV);
SmallVector<SDValue, 8> Ops;
+ for (unsigned I = 0, E = RawBits.size(); I != E; ++I) {
+ if (UndefElements[I])
+ Ops.push_back(DAG.getUNDEF(DstEltVT));
+ else
+ Ops.push_back(DAG.getConstant(RawBits[I], DL, DstEltVT));
+ }
- for (const SDValue &Op : BV->op_values()) {
- if (Op.isUndef()) {
- Ops.append(NumOutputsPerInput, DAG.getUNDEF(DstEltVT));
- continue;
- }
-
- APInt OpVal = cast<ConstantSDNode>(Op)->
- getAPIntValue().zextOrTrunc(SrcBitSize);
+ EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size());
+ return DAG.getBuildVector(VT, DL, Ops);
+}
- for (unsigned j = 0; j != NumOutputsPerInput; ++j) {
- APInt ThisVal = OpVal.trunc(DstBitSize);
- Ops.push_back(DAG.getConstant(ThisVal, DL, DstEltVT));
- OpVal.lshrInPlace(DstBitSize);
- }
+// Returns true if floating point contraction is allowed on the FMUL-SDValue
+// `N`
+static bool isContractableFMUL(const TargetOptions &Options, SDValue N) {
+ assert(N.getOpcode() == ISD::FMUL);
- // For big endian targets, swap the order of the pieces of each element.
- if (DAG.getDataLayout().isBigEndian())
- std::reverse(Ops.end()-NumOutputsPerInput, Ops.end());
- }
+ return Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
+ N->getFlags().hasAllowContract();
+}
- return DAG.getBuildVector(VT, DL, Ops);
+// Returns true if `N` can assume no infinities involved in its computation.
+static bool hasNoInfs(const TargetOptions &Options, SDValue N) {
+ return Options.NoInfsFPMath || N.getNode()->getFlags().hasNoInfs();
}
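
The constant-fold rewrite earlier in this hunk delegates to BuildVectorSDNode::getConstantRawBits, which packs the element constants into one raw bit string and re-slices it at the destination element width, honouring endianness. A standalone sketch of that repacking for the u8-to-u16 case (repackU8ToU16 is an illustrative helper using plain integers in place of APInt):

    #include <cassert>
    #include <cstdint>
    #include <utility>
    #include <vector>

    static std::vector<uint16_t> repackU8ToU16(const std::vector<uint8_t> &Src,
                                               bool LittleEndian) {
      std::vector<uint16_t> Dst;
      for (size_t I = 0; I + 1 < Src.size(); I += 2) {
        uint8_t Lo = Src[I], Hi = Src[I + 1];
        if (!LittleEndian)
          std::swap(Lo, Hi);              // big-endian: first byte is the MSB
        Dst.push_back(uint16_t(Lo | (unsigned(Hi) << 8)));
      }
      return Dst;
    }

    int main() {
      std::vector<uint8_t> V = {0x11, 0x22, 0x33, 0x44};
      assert((repackU8ToU16(V, /*LittleEndian=*/true) ==
              std::vector<uint16_t>{0x2211, 0x4433}));
      assert((repackU8ToU16(V, /*LittleEndian=*/false) ==
              std::vector<uint16_t>{0x1122, 0x3344}));
      return 0;
    }
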
/// Try to perform FMA combining on a given FADD node.
@@ -13039,6 +13164,11 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
+ auto isFusedOp = [&](SDValue N) {
+ unsigned Opcode = N.getOpcode();
+ return Opcode == ISD::FMA || Opcode == ISD::FMAD;
+ };
+
// Is the node an FMUL and contractable either due to global flags or
// SDNodeFlags.
auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
@@ -13070,12 +13200,12 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
// fadd E, (fma A, B, (fmul C, D)) --> fma A, B, (fma C, D, E)
// This requires reassociation because it changes the order of operations.
SDValue FMA, E;
- if (CanReassociate && N0.getOpcode() == PreferredFusedOpcode &&
+ if (CanReassociate && isFusedOp(N0) &&
N0.getOperand(2).getOpcode() == ISD::FMUL && N0.hasOneUse() &&
N0.getOperand(2).hasOneUse()) {
FMA = N0;
E = N1;
- } else if (CanReassociate && N1.getOpcode() == PreferredFusedOpcode &&
+ } else if (CanReassociate && isFusedOp(N1) &&
N1.getOperand(2).getOpcode() == ISD::FMUL && N1.hasOneUse() &&
N1.getOperand(2).hasOneUse()) {
FMA = N1;
@@ -13131,7 +13261,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
Z));
};
- if (N0.getOpcode() == PreferredFusedOpcode) {
+ if (isFusedOp(N0)) {
SDValue N02 = N0.getOperand(2);
if (N02.getOpcode() == ISD::FP_EXTEND) {
SDValue N020 = N02.getOperand(0);
@@ -13161,7 +13291,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
};
if (N0.getOpcode() == ISD::FP_EXTEND) {
SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == PreferredFusedOpcode) {
+ if (isFusedOp(N00)) {
SDValue N002 = N00.getOperand(2);
if (isContractableFMUL(N002) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
@@ -13175,7 +13305,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
// fold (fadd x, (fma y, z, (fpext (fmul u, v)))
// -> (fma y, z, (fma (fpext u), (fpext v), x))
- if (N1.getOpcode() == PreferredFusedOpcode) {
+ if (isFusedOp(N1)) {
SDValue N12 = N1.getOperand(2);
if (N12.getOpcode() == ISD::FP_EXTEND) {
SDValue N120 = N12.getOperand(0);
@@ -13196,7 +13326,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
// interesting for all targets, especially GPUs.
if (N1.getOpcode() == ISD::FP_EXTEND) {
SDValue N10 = N1.getOperand(0);
- if (N10.getOpcode() == PreferredFusedOpcode) {
+ if (isFusedOp(N10)) {
SDValue N102 = N10.getOperand(2);
if (isContractableFMUL(N102) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
@@ -13392,12 +13522,17 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
return isContractableFMUL(N) && isReassociable(N.getNode());
};
+ auto isFusedOp = [&](SDValue N) {
+ unsigned Opcode = N.getOpcode();
+ return Opcode == ISD::FMA || Opcode == ISD::FMAD;
+ };
+
// More folding opportunities when target permits.
if (Aggressive && isReassociable(N)) {
bool CanFuse = Options.UnsafeFPMath || N->getFlags().hasAllowContract();
// fold (fsub (fma x, y, (fmul u, v)), z)
// -> (fma x, y (fma u, v, (fneg z)))
- if (CanFuse && N0.getOpcode() == PreferredFusedOpcode &&
+ if (CanFuse && isFusedOp(N0) &&
isContractableAndReassociableFMUL(N0.getOperand(2)) &&
N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) {
return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0),
@@ -13410,7 +13545,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// fold (fsub x, (fma y, z, (fmul u, v)))
// -> (fma (fneg y), z, (fma (fneg u), v, x))
- if (CanFuse && N1.getOpcode() == PreferredFusedOpcode &&
+ if (CanFuse && isFusedOp(N1) &&
isContractableAndReassociableFMUL(N1.getOperand(2)) &&
N1->hasOneUse() && NoSignedZero) {
SDValue N20 = N1.getOperand(2).getOperand(0);
@@ -13424,8 +13559,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// fold (fsub (fma x, y, (fpext (fmul u, v))), z)
// -> (fma x, y (fma (fpext u), (fpext v), (fneg z)))
- if (N0.getOpcode() == PreferredFusedOpcode &&
- N0->hasOneUse()) {
+ if (isFusedOp(N0) && N0->hasOneUse()) {
SDValue N02 = N0.getOperand(2);
if (N02.getOpcode() == ISD::FP_EXTEND) {
SDValue N020 = N02.getOperand(0);
@@ -13451,7 +13585,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// interesting for all targets, especially GPUs.
if (N0.getOpcode() == ISD::FP_EXTEND) {
SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == PreferredFusedOpcode) {
+ if (isFusedOp(N00)) {
SDValue N002 = N00.getOperand(2);
if (isContractableAndReassociableFMUL(N002) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
@@ -13471,8 +13605,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// fold (fsub x, (fma y, z, (fpext (fmul u, v))))
// -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x))
- if (N1.getOpcode() == PreferredFusedOpcode &&
- N1.getOperand(2).getOpcode() == ISD::FP_EXTEND &&
+ if (isFusedOp(N1) && N1.getOperand(2).getOpcode() == ISD::FP_EXTEND &&
N1->hasOneUse()) {
SDValue N120 = N1.getOperand(2).getOperand(0);
if (isContractableAndReassociableFMUL(N120) &&
@@ -13496,8 +13629,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// FIXME: This turns two single-precision and one double-precision
// operation into two double-precision operations, which might not be
// interesting for all targets, especially GPUs.
- if (N1.getOpcode() == ISD::FP_EXTEND &&
- N1.getOperand(0).getOpcode() == PreferredFusedOpcode) {
+ if (N1.getOpcode() == ISD::FP_EXTEND && isFusedOp(N1.getOperand(0))) {
SDValue CvtSrc = N1.getOperand(0);
SDValue N100 = CvtSrc.getOperand(0);
SDValue N101 = CvtSrc.getOperand(1);
@@ -13538,12 +13670,13 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
// The transforms below are incorrect when x == 0 and y == inf, because the
// intermediate multiplication produces a nan.
- if (!Options.NoInfsFPMath)
+ SDValue FAdd = N0.getOpcode() == ISD::FADD ? N0 : N1;
+ if (!hasNoInfs(Options, FAdd))
return SDValue();
// Floating-point multiply-add without intermediate rounding.
bool HasFMA =
- (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) &&
+ isContractableFMUL(Options, SDValue(N, 0)) &&
TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
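
HasFMA is now gated on the same contraction test as the FMUL itself, because fusing a multiply-add removes the intermediate rounding and can change results. A small editorial demonstration of that rounding difference (values chosen to expose it; the volatile keeps the compiler from contracting the separate form on its own):

    #include <cassert>
    #include <cmath>

    int main() {
      double A = 1.0 + 0x1p-27;            // chosen so A*B is not exactly
      double B = 1.0 + 0x1p-27;            // representable in a double
      double C = -1.0;
      volatile double Prod = A * B;        // rounded once here...
      double Separate = Prod + C;          // ...and again here
      double Fused = std::fma(A, B, C);    // one rounding only
      assert(Separate == 0x1p-26);         // the 2^-54 term was rounded away
      assert(Fused == 0x1p-26 + 0x1p-54);  // kept by the fused operation
      assert(Fused != Separate);           // so contraction changes results
      return 0;
    }
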
@@ -13633,7 +13766,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
// fold vector ops
if (VT.isVector())
- if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
return FoldedVOp;
// fold (fadd c1, c2) -> c1 + c2
@@ -13841,7 +13974,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
// fold vector ops
if (VT.isVector())
- if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
return FoldedVOp;
// fold (fsub c1, c2) -> c1-c2
@@ -13926,7 +14059,7 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
// fold vector ops
if (VT.isVector()) {
// This just handles C1 * C2 for vectors. Other vector folds are below.
- if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
return FoldedVOp;
}
@@ -13971,10 +14104,13 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
if (N1CFP && N1CFP->isExactlyValue(+2.0))
return DAG.getNode(ISD::FADD, DL, VT, N0, N0);
- // fold (fmul X, -1.0) -> (fneg X)
- if (N1CFP && N1CFP->isExactlyValue(-1.0))
- if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
- return DAG.getNode(ISD::FNEG, DL, VT, N0);
+ // fold (fmul X, -1.0) -> (fsub -0.0, X)
+ if (N1CFP && N1CFP->isExactlyValue(-1.0)) {
+ if (!LegalOperations || TLI.isOperationLegal(ISD::FSUB, VT)) {
+ return DAG.getNode(ISD::FSUB, DL, VT,
+ DAG.getConstantFP(-0.0, DL, VT), N0, Flags);
+ }
+ }
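
The rewritten fold materialises the negation as fsub -0.0, X; the -0.0 constant is what keeps signed zeros matching the multiply. A quick numeric check (editorial, compile without -ffast-math):

    #include <cassert>
    #include <cmath>

    int main() {
      const double Vals[] = {+0.0, -0.0, 1.5, -2.25};
      for (double X : Vals) {
        double Mul = X * -1.0;
        double Sub = -0.0 - X;
        assert(Mul == Sub && std::signbit(Mul) == std::signbit(Sub));
      }
      // The naive constant would be wrong: 0.0 - (+0.0) is +0.0,
      // but (+0.0) * -1.0 is -0.0.
      assert(std::signbit(0.0 * -1.0) && !std::signbit(0.0 - 0.0));
      return 0;
    }
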
// -N0 * -N1 --> N0 * N1
TargetLowering::NegatibleCost CostN0 =
@@ -14260,7 +14396,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
// fold vector ops
if (VT.isVector())
- if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
return FoldedVOp;
// fold (fdiv c1, c2) -> c1/c2
@@ -16245,11 +16381,12 @@ struct LoadedSlice {
return false;
// Check if it will be merged with the load.
- // 1. Check the alignment constraint.
- Align RequiredAlignment = DAG->getDataLayout().getABITypeAlign(
- ResVT.getTypeForEVT(*DAG->getContext()));
-
- if (RequiredAlignment > getAlign())
+ // 1. Check the alignment / fast memory access constraint.
+ bool IsFast = false;
+ if (!TLI.allowsMemoryAccess(*DAG->getContext(), DAG->getDataLayout(), ResVT,
+ Origin->getAddressSpace(), getAlign(),
+ Origin->getMemOperand()->getFlags(), &IsFast) ||
+ !IsFast)
return false;
// 2. Check that the load is a legal operation for that type.
@@ -16270,7 +16407,7 @@ struct LoadedSlice {
/// \p UsedBits looks like 0..0 1..1 0..0.
static bool areUsedBitsDense(const APInt &UsedBits) {
// If all the bits are one, this is dense!
- if (UsedBits.isAllOnesValue())
+ if (UsedBits.isAllOnes())
return true;
// Get rid of the unused bits on the right.
@@ -16279,7 +16416,7 @@ static bool areUsedBitsDense(const APInt &UsedBits) {
if (NarrowedUsedBits.countLeadingZeros())
NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits());
// Check that the chunk of bits is completely used.
- return NarrowedUsedBits.isAllOnesValue();
+ return NarrowedUsedBits.isAllOnes();
}
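
areUsedBitsDense accepts only bit patterns that form one contiguous run. An equivalent standalone check on uint32_t (isDense is an illustrative stand-in for the APInt version):

    #include <cassert>
    #include <cstdint>

    static bool isDense(uint32_t UsedBits) {
      if (UsedBits == 0)
        return false;                          // nothing used
      while ((UsedBits & 1) == 0)
        UsedBits >>= 1;                        // drop the unused low bits
      return (UsedBits & (UsedBits + 1)) == 0; // remaining bits are 0..01..1
    }

    int main() {
      assert(isDense(0x000000F0));  // one contiguous byte
      assert(isDense(0xFFFFFFFF));  // everything used
      assert(isDense(0x00000001));
      assert(!isDense(0x00000101)); // two separate runs
      assert(!isDense(0x0000000A)); // 0b1010, hole in the middle
      return 0;
    }
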
/// Check whether or not \p First and \p Second are next to each other
@@ -16697,8 +16834,8 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
unsigned BitWidth = N1.getValueSizeInBits();
APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
if (Opc == ISD::AND)
- Imm ^= APInt::getAllOnesValue(BitWidth);
- if (Imm == 0 || Imm.isAllOnesValue())
+ Imm ^= APInt::getAllOnes(BitWidth);
+ if (Imm == 0 || Imm.isAllOnes())
return SDValue();
unsigned ShAmt = Imm.countTrailingZeros();
unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
@@ -16725,16 +16862,19 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
if ((Imm & Mask) == Imm) {
APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
if (Opc == ISD::AND)
- NewImm ^= APInt::getAllOnesValue(NewBW);
+ NewImm ^= APInt::getAllOnes(NewBW);
uint64_t PtrOff = ShAmt / 8;
// For big endian targets, we need to adjust the offset to the pointer to
// load the correct bytes.
if (DAG.getDataLayout().isBigEndian())
PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
+ bool IsFast = false;
Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
- Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext());
- if (NewAlign < DAG.getDataLayout().getABITypeAlign(NewVTTy))
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
+ LD->getAddressSpace(), NewAlign,
+ LD->getMemOperand()->getFlags(), &IsFast) ||
+ !IsFast)
return SDValue();
SDValue NewPtr =
@@ -16788,27 +16928,26 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
if (VTSize.isScalable())
return SDValue();
+ bool FastLD = false, FastST = false;
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VTSize.getFixedSize());
if (!TLI.isOperationLegal(ISD::LOAD, IntVT) ||
!TLI.isOperationLegal(ISD::STORE, IntVT) ||
!TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) ||
- !TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT))
- return SDValue();
-
- Align LDAlign = LD->getAlign();
- Align STAlign = ST->getAlign();
- Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext());
- Align ABIAlign = DAG.getDataLayout().getABITypeAlign(IntVTTy);
- if (LDAlign < ABIAlign || STAlign < ABIAlign)
+ !TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT) ||
+ !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), IntVT,
+ *LD->getMemOperand(), &FastLD) ||
+ !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), IntVT,
+ *ST->getMemOperand(), &FastST) ||
+ !FastLD || !FastST)
return SDValue();
SDValue NewLD =
DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(),
- LD->getPointerInfo(), LDAlign);
+ LD->getPointerInfo(), LD->getAlign());
SDValue NewST =
DAG.getStore(ST->getChain(), SDLoc(N), NewLD, ST->getBasePtr(),
- ST->getPointerInfo(), STAlign);
+ ST->getPointerInfo(), ST->getAlign());
AddToWorklist(NewLD.getNode());
AddToWorklist(NewST.getNode());
@@ -16839,8 +16978,10 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,
SDValue &ConstNode) {
APInt Val;
- // If the add only has one use, this would be OK to do.
- if (AddNode.getNode()->hasOneUse())
+ // If the add only has one use, and the target thinks the folding is
+ // profitable or does not lead to worse code, this would be OK to do.
+ if (AddNode.getNode()->hasOneUse() &&
+ TLI.isMulAddWithConstProfitable(AddNode, ConstNode))
return true;
// Walk all the users of the constant with which we're multiplying.
@@ -16932,6 +17073,22 @@ bool DAGCombiner::mergeStoresOfConstantsOrVecElts(
unsigned SizeInBits = NumStores * ElementSizeBits;
unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
+ Optional<MachineMemOperand::Flags> Flags;
+ AAMDNodes AAInfo;
+ for (unsigned I = 0; I != NumStores; ++I) {
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
+ if (!Flags) {
+ Flags = St->getMemOperand()->getFlags();
+ AAInfo = St->getAAInfo();
+ continue;
+ }
+ // Skip merging if there's an inconsistent flag.
+ if (Flags != St->getMemOperand()->getFlags())
+ return false;
+ // Concatenate AA metadata.
+ AAInfo = AAInfo.concat(St->getAAInfo());
+ }
+
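
The loop added above captures the memory-operand flags from the first store and refuses to merge on any mismatch, while AA metadata is combined across all of the stores. A generic sketch of that capture-then-compare shape (canMerge, StoreInfo and the bitwise-AND merge are illustrative stand-ins, not the LLVM types):

    #include <cassert>
    #include <optional>
    #include <vector>

    struct StoreInfo {
      unsigned Flags;   // must match across all merged stores
      unsigned AATags;  // merged; AND stands in for the metadata merge
    };

    static bool canMerge(const std::vector<StoreInfo> &Stores,
                         unsigned &MergedAA) {
      std::optional<unsigned> Flags;
      unsigned AA = ~0u;
      for (const StoreInfo &S : Stores) {
        if (!Flags) {
          Flags = S.Flags;  // first store establishes the required flags
          AA = S.AATags;
          continue;
        }
        if (*Flags != S.Flags)
          return false;     // inconsistent flag -> give up on merging
        AA &= S.AATags;     // keep only metadata common to every store
      }
      MergedAA = AA;
      return true;
    }

    int main() {
      unsigned AA = 0;
      assert(canMerge({{1, 0b110}, {1, 0b011}}, AA) && AA == 0b010);
      assert(!canMerge({{1, 0b111}, {2, 0b111}}, AA)); // flag mismatch
      return 0;
    }
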
EVT StoreTy;
if (UseVector) {
unsigned Elts = NumStores * NumMemElts;
@@ -17049,9 +17206,9 @@ bool DAGCombiner::mergeStoresOfConstantsOrVecElts(
// make sure we use trunc store if it's necessary to be legal.
SDValue NewStore;
if (!UseTrunc) {
- NewStore =
- DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(),
- FirstInChain->getPointerInfo(), FirstInChain->getAlign());
+ NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(),
+ FirstInChain->getPointerInfo(),
+ FirstInChain->getAlign(), Flags.getValue(), AAInfo);
} else { // Must be realized as a trunc store
EVT LegalizedStoredValTy =
TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType());
@@ -17063,7 +17220,7 @@ bool DAGCombiner::mergeStoresOfConstantsOrVecElts(
NewStore = DAG.getTruncStore(
NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(),
FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/,
- FirstInChain->getAlign(), FirstInChain->getMemOperand()->getFlags());
+ FirstInChain->getAlign(), Flags.getValue(), AAInfo);
}
// Replace all merged stores with the new store.
@@ -17360,7 +17517,7 @@ bool DAGCombiner::tryStoreMergeOfConstants(
SDValue StoredVal = ST->getValue();
bool IsElementZero = false;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal))
- IsElementZero = C->isNullValue();
+ IsElementZero = C->isZero();
else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
IsElementZero = C->getConstantFPValue()->isNullValue();
if (IsElementZero) {
@@ -17379,7 +17536,8 @@ bool DAGCombiner::tryStoreMergeOfConstants(
break;
if (TLI.isTypeLegal(StoreTy) &&
- TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
+ TLI.canMergeStoresTo(FirstStoreAS, StoreTy,
+ DAG.getMachineFunction()) &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstInChain->getMemOperand(), &IsFast) &&
IsFast) {
@@ -17391,7 +17549,8 @@ bool DAGCombiner::tryStoreMergeOfConstants(
EVT LegalizedStoredValTy =
TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
- TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) &&
+ TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy,
+ DAG.getMachineFunction()) &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstInChain->getMemOperand(), &IsFast) &&
IsFast) {
@@ -17410,7 +17569,7 @@ bool DAGCombiner::tryStoreMergeOfConstants(
unsigned Elts = (i + 1) * NumMemElts;
EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) &&
- TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
+ TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG.getMachineFunction()) &&
TLI.allowsMemoryAccess(Context, DL, Ty,
*FirstInChain->getMemOperand(), &IsFast) &&
IsFast)
@@ -17486,7 +17645,8 @@ bool DAGCombiner::tryStoreMergeOfExtracts(
if (Ty.getSizeInBits() > MaximumLegalStoreInBits)
break;
- if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
+ if (TLI.isTypeLegal(Ty) &&
+ TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG.getMachineFunction()) &&
TLI.allowsMemoryAccess(Context, DL, Ty,
*FirstInChain->getMemOperand(), &IsFast) &&
IsFast)
@@ -17634,8 +17794,13 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
bool IsFastSt = false;
bool IsFastLd = false;
- if (TLI.isTypeLegal(StoreTy) &&
- TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
+ // Don't try vector types if we need a rotate. We may still fail the
+ // legality checks for the integer type, but we can't handle the rotate
+ // case with vectors.
+ // FIXME: We could use a shuffle in place of the rotate.
+ if (!NeedRotate && TLI.isTypeLegal(StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, StoreTy,
+ DAG.getMachineFunction()) &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstInChain->getMemOperand(), &IsFastSt) &&
IsFastSt &&
@@ -17649,7 +17814,8 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
StoreTy = EVT::getIntegerVT(Context, SizeInBits);
if (TLI.isTypeLegal(StoreTy) &&
- TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
+ TLI.canMergeStoresTo(FirstStoreAS, StoreTy,
+ DAG.getMachineFunction()) &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstInChain->getMemOperand(), &IsFastSt) &&
IsFastSt &&
@@ -17663,7 +17829,8 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
TargetLowering::TypePromoteInteger) {
EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(Context, StoreTy);
if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
- TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) &&
+ TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy,
+ DAG.getMachineFunction()) &&
TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValTy, StoreTy) &&
TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy, StoreTy) &&
TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy) &&
@@ -18215,7 +18382,7 @@ SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) {
case ISD::LIFETIME_END:
// We can forward past any lifetime start/end that can be proven not to
// alias the node.
- if (!isAlias(Chain.getNode(), N))
+ if (!mayAlias(Chain.getNode(), N))
Chains.push_back(Chain.getOperand(0));
break;
case ISD::STORE: {
@@ -18593,32 +18760,35 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
if (!VecEltVT.isByteSized())
return SDValue();
- Align Alignment = OriginalLoad->getAlign();
- Align NewAlign = DAG.getDataLayout().getABITypeAlign(
- VecEltVT.getTypeForEVT(*DAG.getContext()));
-
- if (NewAlign > Alignment ||
- !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
- return SDValue();
-
- ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ?
- ISD::NON_EXTLOAD : ISD::EXTLOAD;
- if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
+ ISD::LoadExtType ExtTy =
+ ResultVT.bitsGT(VecEltVT) ? ISD::NON_EXTLOAD : ISD::EXTLOAD;
+ if (!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT) ||
+ !TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
return SDValue();
- Alignment = NewAlign;
-
+ Align Alignment = OriginalLoad->getAlign();
MachinePointerInfo MPI;
SDLoc DL(EVE);
if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
int Elt = ConstEltNo->getZExtValue();
unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
+ Alignment = commonAlignment(Alignment, PtrOff);
} else {
// Discard the pointer info except the address space because the memory
// operand can't represent this new access since the offset is variable.
MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace());
+ Alignment = commonAlignment(Alignment, VecEltVT.getSizeInBits() / 8);
}
+
+ bool IsFast = false;
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VecEltVT,
+ OriginalLoad->getAddressSpace(), Alignment,
+ OriginalLoad->getMemOperand()->getFlags(),
+ &IsFast) ||
+ !IsFast)
+ return SDValue();
+
SDValue NewPtr = TLI.getVectorElementPointer(DAG, OriginalLoad->getBasePtr(),
InVecVT, EltNo);
@@ -18864,7 +19034,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
Use->getOperand(0) == VecOp &&
isa<ConstantSDNode>(Use->getOperand(1));
})) {
- APInt DemandedElts = APInt::getNullValue(NumElts);
+ APInt DemandedElts = APInt::getZero(NumElts);
for (SDNode *Use : VecOp->uses()) {
auto *CstElt = cast<ConstantSDNode>(Use->getOperand(1));
if (CstElt->getAPIntValue().ult(NumElts))
@@ -18877,7 +19047,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
AddToWorklist(N);
return SDValue(N, 0);
}
- APInt DemandedBits = APInt::getAllOnesValue(VecEltBitWidth);
+ APInt DemandedBits = APInt::getAllOnes(VecEltBitWidth);
if (SimplifyDemandedBits(VecOp, DemandedBits, DemandedElts, true)) {
// We simplified the vector operand of this extract element. If this
// extract is not dead, visit it again so it is folded properly.
@@ -19672,8 +19842,10 @@ SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) {
// Make sure the first element matches
// (zext (extract_vector_elt X, C))
+ // Offset must be a constant multiple of the
+ // known-minimum vector length of the result type.
int64_t Offset = checkElem(Op0);
- if (Offset < 0)
+ if (Offset < 0 || (Offset % VT.getVectorNumElements()) != 0)
return SDValue();
unsigned NumElems = N->getNumOperands();
@@ -19844,6 +20016,44 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
return DAG.getBitcast(VT, DAG.getBuildVector(VecVT, DL, Ops));
}
+// Attempt to merge nested concat_vectors/undefs.
+// Fold concat_vectors(concat_vectors(x,y,z,w),u,u,concat_vectors(a,b,c,d))
+// --> concat_vectors(x,y,z,w,u,u,u,u,u,u,u,u,a,b,c,d)
+static SDValue combineConcatVectorOfConcatVectors(SDNode *N,
+ SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
+ // Ensure we're concatenating UNDEF and CONCAT_VECTORS nodes of similar types.
+ EVT SubVT;
+ SDValue FirstConcat;
+ for (const SDValue &Op : N->ops()) {
+ if (Op.isUndef())
+ continue;
+ if (Op.getOpcode() != ISD::CONCAT_VECTORS)
+ return SDValue();
+ if (!FirstConcat) {
+ SubVT = Op.getOperand(0).getValueType();
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
+ return SDValue();
+ FirstConcat = Op;
+ continue;
+ }
+ if (SubVT != Op.getOperand(0).getValueType())
+ return SDValue();
+ }
+ assert(FirstConcat && "Concat of all-undefs found");
+
+ SmallVector<SDValue> ConcatOps;
+ for (const SDValue &Op : N->ops()) {
+ if (Op.isUndef()) {
+ ConcatOps.append(FirstConcat->getNumOperands(), DAG.getUNDEF(SubVT));
+ continue;
+ }
+ ConcatOps.append(Op->op_begin(), Op->op_end());
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, ConcatOps);
+}
+
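
A scalar model of the new fold: nested concatenations are flattened in place, and a whole-operand undef expands to a run of undefs of the inner subvector width (concatFlatten and the optional-based "undef" are editorial stand-ins for the DAG types):

    #include <cassert>
    #include <cstddef>
    #include <optional>
    #include <vector>

    using Elt = std::optional<int>;      // nullopt plays the role of undef
    using Vec = std::vector<Elt>;

    static Vec concatFlatten(const std::vector<std::optional<Vec>> &Ops,
                             size_t SubNumElts) {
      Vec Out;
      for (const std::optional<Vec> &Op : Ops) {
        if (!Op) {                       // undef operand
          Out.insert(Out.end(), SubNumElts, std::nullopt);
          continue;
        }
        Out.insert(Out.end(), Op->begin(), Op->end());
      }
      return Out;
    }

    int main() {
      Vec XY = {1, 2, 3, 4};             // concat_vectors(x, y, z, w)
      Vec AB = {5, 6, 7, 8};             // concat_vectors(a, b, c, d)
      Vec R = concatFlatten({XY, std::nullopt, AB}, /*SubNumElts=*/4);
      assert(R.size() == 12 && !R[4] && !R[7] && *R[0] == 1 && *R[11] == 8);
      return 0;
    }
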
// Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR
// operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at
// most two distinct vectors the same size as the result, attempt to turn this
@@ -20103,13 +20313,19 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
}
// Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR.
+ // FIXME: Add support for concat_vectors(bitcast(vec0),bitcast(vec1),...).
if (SDValue V = combineConcatVectorOfScalars(N, DAG))
return V;
- // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE.
- if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT))
+ if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) {
+ // Fold CONCAT_VECTORS of CONCAT_VECTORS (or undef) to VECTOR_SHUFFLE.
+ if (SDValue V = combineConcatVectorOfConcatVectors(N, DAG))
+ return V;
+
+ // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE.
if (SDValue V = combineConcatVectorOfExtracts(N, DAG))
return V;
+ }
if (SDValue V = combineConcatVectorOfCasts(N, DAG))
return V;
@@ -20351,9 +20567,7 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
return SDValue();
auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0));
- auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
- if (!Ld || Ld->getExtensionType() || !Ld->isSimple() ||
- !ExtIdx)
+ if (!Ld || Ld->getExtensionType() || !Ld->isSimple())
return SDValue();
// Allow targets to opt-out.
@@ -20363,7 +20577,7 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
if (!VT.isByteSized())
return SDValue();
- unsigned Index = ExtIdx->getZExtValue();
+ unsigned Index = Extract->getConstantOperandVal(1);
unsigned NumElts = VT.getVectorMinNumElements();
// The definition of EXTRACT_SUBVECTOR states that the index must be a
@@ -20492,7 +20706,7 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
// If the concatenated source types match this extract, it's a direct
// simplification:
// extract_subvec (concat V1, V2, ...), i --> Vi
- if (ConcatSrcNumElts == ExtNumElts)
+ if (NVT.getVectorElementCount() == ConcatSrcVT.getVectorElementCount())
return V.getOperand(ConcatOpIdx);
// If the concatenated source vectors are a multiple length of this extract,
@@ -20500,7 +20714,8 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
// concat operand. Example:
// v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y), 14 -->
// v2i8 extract_subvec v8i8 Y, 6
- if (NVT.isFixedLengthVector() && ConcatSrcNumElts % ExtNumElts == 0) {
+ if (NVT.isFixedLengthVector() && ConcatSrcVT.isFixedLengthVector() &&
+ ConcatSrcNumElts % ExtNumElts == 0) {
SDLoc DL(N);
unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts;
assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts &&
@@ -20562,8 +20777,12 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
// otherwise => (extract_subvec V1, ExtIdx)
uint64_t InsIdx = V.getConstantOperandVal(2);
if (InsIdx * SmallVT.getScalarSizeInBits() ==
- ExtIdx * NVT.getScalarSizeInBits())
+ ExtIdx * NVT.getScalarSizeInBits()) {
+ if (LegalOperations && !TLI.isOperationLegal(ISD::BITCAST, NVT))
+ return SDValue();
+
return DAG.getBitcast(NVT, V.getOperand(1));
+ }
return DAG.getNode(
ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT,
DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)),
@@ -21131,15 +21350,9 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
// Canonicalize shuffle v, v -> v, undef
- if (N0 == N1) {
- SmallVector<int, 8> NewMask;
- for (unsigned i = 0; i != NumElts; ++i) {
- int Idx = SVN->getMaskElt(i);
- if (Idx >= (int)NumElts) Idx -= NumElts;
- NewMask.push_back(Idx);
- }
- return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT), NewMask);
- }
+ if (N0 == N1)
+ return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT),
+ createUnaryMask(SVN->getMask(), NumElts));
// Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask.
if (N0.isUndef())
@@ -21290,6 +21503,70 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
}
}
+ // See if we can replace a shuffle with an insert_subvector.
+ // e.g. v2i32 into v8i32:
+ // shuffle(lhs,concat(rhs0,rhs1,rhs2,rhs3),0,1,2,3,10,11,6,7).
+ // --> insert_subvector(lhs,rhs1,4).
+ if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT) &&
+ TLI.isOperationLegalOrCustom(ISD::INSERT_SUBVECTOR, VT)) {
+ auto ShuffleToInsert = [&](SDValue LHS, SDValue RHS, ArrayRef<int> Mask) {
+ // Ensure RHS subvectors are legal.
+ assert(RHS.getOpcode() == ISD::CONCAT_VECTORS && "Can't find subvectors");
+ EVT SubVT = RHS.getOperand(0).getValueType();
+ int NumSubVecs = RHS.getNumOperands();
+ int NumSubElts = SubVT.getVectorNumElements();
+ assert((NumElts % NumSubElts) == 0 && "Subvector mismatch");
+ if (!TLI.isTypeLegal(SubVT))
+ return SDValue();
+
+ // Don't bother if we have a unary shuffle (matches undef + LHS elts).
+ if (all_of(Mask, [NumElts](int M) { return M < (int)NumElts; }))
+ return SDValue();
+
+ // Search [NumSubElts] spans for RHS sequence.
+ // TODO: Can we avoid nested loops to increase performance?
+ SmallVector<int> InsertionMask(NumElts);
+ for (int SubVec = 0; SubVec != NumSubVecs; ++SubVec) {
+ for (int SubIdx = 0; SubIdx != (int)NumElts; SubIdx += NumSubElts) {
+ // Reset mask to identity.
+ std::iota(InsertionMask.begin(), InsertionMask.end(), 0);
+
+ // Add subvector insertion.
+ std::iota(InsertionMask.begin() + SubIdx,
+ InsertionMask.begin() + SubIdx + NumSubElts,
+ NumElts + (SubVec * NumSubElts));
+
+ // See if the shuffle mask matches the reference insertion mask.
+ bool MatchingShuffle = true;
+ for (int i = 0; i != (int)NumElts; ++i) {
+ int ExpectIdx = InsertionMask[i];
+ int ActualIdx = Mask[i];
+ if (0 <= ActualIdx && ExpectIdx != ActualIdx) {
+ MatchingShuffle = false;
+ break;
+ }
+ }
+
+ if (MatchingShuffle)
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, LHS,
+ RHS.getOperand(SubVec),
+ DAG.getVectorIdxConstant(SubIdx, SDLoc(N)));
+ }
+ }
+ return SDValue();
+ };
+ ArrayRef<int> Mask = SVN->getMask();
+ if (N1.getOpcode() == ISD::CONCAT_VECTORS)
+ if (SDValue InsertN1 = ShuffleToInsert(N0, N1, Mask))
+ return InsertN1;
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
+ SmallVector<int> CommuteMask(Mask.begin(), Mask.end());
+ ShuffleVectorSDNode::commuteMask(CommuteMask);
+ if (SDValue InsertN0 = ShuffleToInsert(N1, N0, CommuteMask))
+ return InsertN0;
+ }
+ }
+
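
The lambda above matches a shuffle whose mask is the identity except for one aligned span that reads a whole subvector of the RHS concat. An editorial re-creation of that mask test on plain index vectors (matchInsertSubvector is illustrative; -1 plays the role of an undef lane):

    #include <cassert>
    #include <numeric>
    #include <vector>

    // Returns the insertion index, or -1 if Mask is not such an insertion.
    static int matchInsertSubvector(const std::vector<int> &Mask, int NumElts,
                                    int NumSubElts) {
      for (int SubVec = 0; SubVec * NumSubElts < NumElts; ++SubVec)
        for (int SubIdx = 0; SubIdx + NumSubElts <= NumElts;
             SubIdx += NumSubElts) {
          std::vector<int> Ref(NumElts);
          std::iota(Ref.begin(), Ref.end(), 0);           // identity (LHS)
          std::iota(Ref.begin() + SubIdx,
                    Ref.begin() + SubIdx + NumSubElts,
                    NumElts + SubVec * NumSubElts);       // inserted RHS span
          bool Match = true;
          for (int I = 0; I < NumElts; ++I)
            if (Mask[I] >= 0 && Mask[I] != Ref[I])        // -1 = undef lane
              Match = false;
          if (Match)
            return SubIdx;
        }
      return -1;
    }

    int main() {
      // The example from the comment: v2i32 spans inside v8i32,
      // shuffle(lhs,concat(rhs0..rhs3),0,1,2,3,10,11,6,7) -> insert at 4.
      assert(matchInsertSubvector({0, 1, 2, 3, 10, 11, 6, 7}, 8, 2) == 4);
      // Not an insertion: the RHS elements are not a whole aligned subvector.
      assert(matchInsertSubvector({0, 1, 2, 3, 11, 12, 6, 7}, 8, 2) == -1);
      return 0;
    }
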
// Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
// BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT))
@@ -21859,6 +22136,40 @@ SDValue DAGCombiner::visitVECREDUCE(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitVPOp(SDNode *N) {
+ // VP operations in which all vector elements are disabled - either by
+ // determining that the mask is all false or that the EVL is 0 - can be
+ // eliminated.
+ bool AreAllEltsDisabled = false;
+ if (auto EVLIdx = ISD::getVPExplicitVectorLengthIdx(N->getOpcode()))
+ AreAllEltsDisabled |= isNullConstant(N->getOperand(*EVLIdx));
+ if (auto MaskIdx = ISD::getVPMaskIdx(N->getOpcode()))
+ AreAllEltsDisabled |=
+ ISD::isConstantSplatVectorAllZeros(N->getOperand(*MaskIdx).getNode());
+
+ // This is the only generic VP combine we support for now.
+ if (!AreAllEltsDisabled)
+ return SDValue();
+
+ // Binary operations can be replaced by UNDEF.
+ if (ISD::isVPBinaryOp(N->getOpcode()))
+ return DAG.getUNDEF(N->getValueType(0));
+
+ // VP Memory operations can be replaced by either the chain (stores) or the
+ // chain + undef (loads).
+ if (const auto *MemSD = dyn_cast<MemSDNode>(N)) {
+ if (MemSD->writeMem())
+ return MemSD->getChain();
+ return CombineTo(N, DAG.getUNDEF(N->getValueType(0)), MemSD->getChain());
+ }
+
+ // Reduction operations return the start operand when no elements are active.
+ if (ISD::isVPReduction(N->getOpcode()))
+ return N->getOperand(0);
+
+ return SDValue();
+}
+
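
visitVPOp only fires when no lane is active, i.e. the EVL is zero or the mask is all-false; binary ops then have no observable result and reductions collapse to their start value. A small scalar model of that reasoning (vpReduceAdd is an illustrative stand-in for a VP reduction):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    static int vpReduceAdd(int Start, const std::vector<int> &Vals,
                           const std::vector<bool> &Mask, size_t EVL) {
      int Acc = Start;
      for (size_t I = 0; I < Vals.size() && I < EVL; ++I)
        if (Mask[I])
          Acc += Vals[I];   // only active lanes participate
      return Acc;
    }

    int main() {
      std::vector<int> V = {1, 2, 3, 4};
      std::vector<bool> AllFalse(4, false), AllTrue(4, true);
      assert(vpReduceAdd(10, V, AllTrue, /*EVL=*/0) == 10);  // EVL == 0
      assert(vpReduceAdd(10, V, AllFalse, /*EVL=*/4) == 10); // mask all false
      assert(vpReduceAdd(10, V, AllTrue, /*EVL=*/4) == 20);  // normal case
      return 0;
    }
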
/// Returns a vector_shuffle if it able to transform an AND to a vector_shuffle
/// with the destination vector and a zero vector.
/// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==>
@@ -21915,7 +22226,7 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
else
Bits = Bits.extractBits(NumSubBits, SubIdx * NumSubBits);
- if (Bits.isAllOnesValue())
+ if (Bits.isAllOnes())
Indices.push_back(i);
else if (Bits == 0)
Indices.push_back(i + NumSubElts);
@@ -21950,7 +22261,8 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
/// If a vector binop is performed on splat values, it may be profitable to
/// extract, scalarize, and insert/splat.
-static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG) {
+static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG,
+ const SDLoc &DL) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
unsigned Opcode = N->getOpcode();
@@ -21971,7 +22283,6 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG) {
!TLI.isOperationLegalOrCustom(Opcode, EltVT))
return SDValue();
- SDLoc DL(N);
SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL);
SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src0, IndexC);
SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src1, IndexC);
@@ -21995,20 +22306,19 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG) {
}
/// Visit a binary vector operation, like ADD.
-SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
- assert(N->getValueType(0).isVector() &&
- "SimplifyVBinOp only works on vectors!");
+SDValue DAGCombiner::SimplifyVBinOp(SDNode *N, const SDLoc &DL) {
+ EVT VT = N->getValueType(0);
+ assert(VT.isVector() && "SimplifyVBinOp only works on vectors!");
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
SDValue Ops[] = {LHS, RHS};
- EVT VT = N->getValueType(0);
unsigned Opcode = N->getOpcode();
SDNodeFlags Flags = N->getFlags();
// See if we can constant fold the vector operation.
- if (SDValue Fold = DAG.FoldConstantVectorArithmetic(
- Opcode, SDLoc(LHS), LHS.getValueType(), Ops, N->getFlags()))
+ if (SDValue Fold = DAG.FoldConstantArithmetic(Opcode, SDLoc(LHS),
+ LHS.getValueType(), Ops))
return Fold;
// Move unary shuffles with identical masks after a vector binop:
@@ -22026,7 +22336,6 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
(LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
- SDLoc DL(N);
SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS.getOperand(0),
RHS.getOperand(0), Flags);
SDValue UndefV = LHS.getOperand(1);
@@ -22043,7 +22352,6 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
Shuf0->hasOneUse() && Shuf0->getOperand(1).isUndef() &&
Shuf0->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
// binop (splat X), (splat C) --> splat (binop X, C)
- SDLoc DL(N);
SDValue X = Shuf0->getOperand(0);
SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, X, RHS, Flags);
return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
@@ -22053,7 +22361,6 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
Shuf1->hasOneUse() && Shuf1->getOperand(1).isUndef() &&
Shuf1->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
// binop (splat C), (splat X) --> splat (binop C, X)
- SDLoc DL(N);
SDValue X = Shuf1->getOperand(0);
SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS, X, Flags);
return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
@@ -22077,7 +22384,6 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT,
LegalOperations)) {
// (binop undef, undef) may not return undef, so compute that result.
- SDLoc DL(N);
SDValue VecC =
DAG.getNode(Opcode, DL, VT, DAG.getUNDEF(VT), DAG.getUNDEF(VT));
SDValue NarrowBO = DAG.getNode(Opcode, DL, NarrowVT, X, Y);
@@ -22104,7 +22410,6 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
EVT NarrowVT = LHS.getOperand(0).getValueType();
if (NarrowVT == RHS.getOperand(0).getValueType() &&
TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) {
- SDLoc DL(N);
unsigned NumOperands = LHS.getNumOperands();
SmallVector<SDValue, 4> ConcatOps;
for (unsigned i = 0; i != NumOperands; ++i) {
@@ -22117,7 +22422,7 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
}
}
- if (SDValue V = scalarizeBinOpOfSplats(N, DAG))
+ if (SDValue V = scalarizeBinOpOfSplats(N, DAG, DL))
return V;
return SDValue();
@@ -22431,15 +22736,23 @@ SDValue DAGCombiner::foldSelectOfBinops(SDNode *N) {
if (!TLI.isBinOp(BinOpc) || (N2.getOpcode() != BinOpc))
return SDValue();
- if (!N->isOnlyUserOf(N0.getNode()) || !N->isOnlyUserOf(N1.getNode()))
+ // The use checks are intentionally on SDNode because we may be dealing
+ // with opcodes that produce more than one SDValue.
+ // TODO: Do we really need to check N0 (the condition operand of the select)?
+ // But removing that clause could cause an infinite loop...
+ if (!N0->hasOneUse() || !N1->hasOneUse() || !N2->hasOneUse())
return SDValue();
+ // Binops may include opcodes that return multiple values, so all values
+ // must be created/propagated from the newly created binops below.
+ SDVTList OpVTs = N1->getVTList();
+
// Fold select(cond, binop(x, y), binop(z, y))
// --> binop(select(cond, x, z), y)
if (N1.getOperand(1) == N2.getOperand(1)) {
SDValue NewSel =
DAG.getSelect(DL, VT, N0, N1.getOperand(0), N2.getOperand(0));
- SDValue NewBinOp = DAG.getNode(BinOpc, DL, VT, NewSel, N1.getOperand(1));
+ SDValue NewBinOp = DAG.getNode(BinOpc, DL, OpVTs, NewSel, N1.getOperand(1));
NewBinOp->setFlags(N1->getFlags());
NewBinOp->intersectFlagsWith(N2->getFlags());
return NewBinOp;
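
The fold hoists a common operand out of the two selected binops. The underlying scalar identity, checked over a few values (editorial; the DAG version must additionally respect the use counts and multi-result opcodes discussed in the comments above):

    // select(c, f(x, y), f(z, y)) == f(select(c, x, z), y) for any pure binop f.
    #include <cassert>

    int main() {
      auto f = [](int A, int B) { return A * 7 + B; };   // stand-in binop
      const int Vals[] = {-3, 0, 2, 11};
      for (bool C : {false, true})
        for (int X : Vals)
          for (int Y : Vals)
            for (int Z : Vals)
              assert((C ? f(X, Y) : f(Z, Y)) == f(C ? X : Z, Y));
      return 0;
    }
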
@@ -22453,7 +22766,7 @@ SDValue DAGCombiner::foldSelectOfBinops(SDNode *N) {
VT == N2.getOperand(1).getValueType()) {
SDValue NewSel =
DAG.getSelect(DL, VT, N0, N1.getOperand(1), N2.getOperand(1));
- SDValue NewBinOp = DAG.getNode(BinOpc, DL, VT, N1.getOperand(0), NewSel);
+ SDValue NewBinOp = DAG.getNode(BinOpc, DL, OpVTs, N1.getOperand(0), NewSel);
NewBinOp->setFlags(N1->getFlags());
NewBinOp->intersectFlagsWith(N2->getFlags());
return NewBinOp;
@@ -22581,7 +22894,7 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
if (auto *SCCC = dyn_cast<ConstantSDNode>(SCC)) {
// fold select_cc true, x, y -> x
// fold select_cc false, x, y -> y
- return !(SCCC->isNullValue()) ? N2 : N3;
+ return !(SCCC->isZero()) ? N2 : N3;
}
}
@@ -22680,7 +22993,7 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
// select_cc setne X, 0, ctlz_zero_undef(X), sizeof(X) -> ctlz(X)
// select_cc setne X, 0, cttz(X), sizeof(X) -> cttz(X)
// select_cc setne X, 0, cttz_zero_undef(X), sizeof(X) -> cttz(X)
- if (N1C && N1C->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ if (N1C && N1C->isZero() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
SDValue ValueOnZero = N2;
SDValue Count = N3;
// If the condition is NE instead of E, swap the operands.
@@ -22707,6 +23020,20 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
}
}
+ // Fold select_cc setgt X, -1, C, ~C -> xor (ashr X, BW-1), C
+ // Fold select_cc setlt X, 0, C, ~C -> xor (ashr X, BW-1), ~C
+ if (!NotExtCompare && N1C && N2C && N3C &&
+ N2C->getAPIntValue() == ~N3C->getAPIntValue() &&
+ ((N1C->isAllOnes() && CC == ISD::SETGT) ||
+ (N1C->isZero() && CC == ISD::SETLT)) &&
+ !TLI.shouldAvoidTransformToShift(VT, CmpOpVT.getScalarSizeInBits() - 1)) {
+ SDValue ASR = DAG.getNode(
+ ISD::SRA, DL, CmpOpVT, N0,
+ DAG.getConstant(CmpOpVT.getScalarSizeInBits() - 1, DL, CmpOpVT));
+ return DAG.getNode(ISD::XOR, DL, VT, DAG.getSExtOrTrunc(ASR, DL, VT),
+ DAG.getSExtOrTrunc(CC == ISD::SETLT ? N3 : N2, DL, VT));
+ }
+
return SDValue();
}
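
A standalone check of the two select_cc patterns added above, on 32-bit scalars (assumes arithmetic right shift; editorial, not part of the patch):

    //   X > -1 ? C : ~C  ==  (X >> 31) ^ C
    //   X <  0 ? C : ~C  ==  (X >> 31) ^ ~C
    #include <cassert>
    #include <cstdint>

    int main() {
      const int32_t Xs[] = {INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX};
      const int32_t Cs[] = {0, 1, -1, 0x1234, INT32_MIN};
      for (int32_t X : Xs)
        for (int32_t C : Cs) {
          int32_t Sign = X >> 31;              // 0 if X >= 0, -1 if X < 0
          assert((X > -1 ? C : ~C) == (Sign ^ C));
          assert((X < 0 ? C : ~C) == (Sign ^ ~C));
        }
      return 0;
    }
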
@@ -22747,7 +23074,7 @@ SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) {
return SDValue();
// Avoid division by zero.
- if (C->isNullValue())
+ if (C->isZero())
return SDValue();
SmallVector<SDNode *, 8> Built;
@@ -22792,7 +23119,7 @@ SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) {
/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
/// For the reciprocal, we need to find the zero of the function:
-/// F(X) = A X - 1 [which has a zero at X = 1/A]
+/// F(X) = 1/X - A [which has a zero at X = 1/A]
/// =>
/// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
/// does not require additional intermediate precision]
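
The corrected comment names the function Newton's method is applied to: F(X) = 1/X - A, whose zero is 1/A. Since F'(X) = -1/X^2, the update X_{i+1} = X_i - F(X_i)/F'(X_i) simplifies to X_i + X_i(1 - A*X_i) = X_i(2 - A*X_i), which needs no division. A quick numeric convergence check (editorial):

    #include <cassert>
    #include <cmath>

    int main() {
      double A = 3.0;
      double X = 0.3;                  // crude initial estimate of 1/3
      for (int I = 0; I < 4; ++I)
        X = X * (2.0 - A * X);         // X_{i+1} = X_i * (2 - A * X_i)
      assert(std::fabs(X - 1.0 / A) < 1e-15); // error squares every step
      return 0;
    }
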
@@ -22803,9 +23130,10 @@ SDValue DAGCombiner::BuildDivEstimate(SDValue N, SDValue Op,
if (LegalDAG)
return SDValue();
- // TODO: Handle half and/or extended types?
+ // TODO: Handle extended types?
EVT VT = Op.getValueType();
- if (VT.getScalarType() != MVT::f32 && VT.getScalarType() != MVT::f64)
+ if (VT.getScalarType() != MVT::f16 && VT.getScalarType() != MVT::f32 &&
+ VT.getScalarType() != MVT::f64)
return SDValue();
// If estimates are explicitly disabled for this function, we're done.
@@ -22942,9 +23270,10 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
if (LegalDAG)
return SDValue();
- // TODO: Handle half and/or extended types?
+ // TODO: Handle extended types?
EVT VT = Op.getValueType();
- if (VT.getScalarType() != MVT::f32 && VT.getScalarType() != MVT::f64)
+ if (VT.getScalarType() != MVT::f16 && VT.getScalarType() != MVT::f32 &&
+ VT.getScalarType() != MVT::f64)
return SDValue();
// If estimates are explicitly disabled for this function, we're done.
@@ -22994,7 +23323,7 @@ SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) {
}
/// Return true if there is any possibility that the two addresses overlap.
-bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const {
+bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
struct MemUseCharacteristics {
bool IsVolatile;
@@ -23154,7 +23483,7 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
// TODO: Relax aliasing for unordered atomics (see D66309)
bool IsOpLoad = isa<LoadSDNode>(C.getNode()) &&
cast<LSBaseSDNode>(C.getNode())->isSimple();
- if ((IsLoad && IsOpLoad) || !isAlias(N, C.getNode())) {
+ if ((IsLoad && IsOpLoad) || !mayAlias(N, C.getNode())) {
// Look further up the chain.
C = C.getOperand(0);
return true;
@@ -23172,7 +23501,7 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
case ISD::LIFETIME_END: {
// We can forward past any lifetime start/end that can be proven not to
// alias the memory access.
- if (!isAlias(N, C.getNode())) {
+ if (!mayAlias(N, C.getNode())) {
// Look further up the chain.
C = C.getOperand(0);
return true;
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index 4ca731cfdf62..4d1449bc2751 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -75,6 +75,7 @@
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
@@ -195,10 +196,8 @@ void FastISel::flushLocalValueMap() {
EmitStartPt ? MachineBasicBlock::reverse_iterator(EmitStartPt)
: FuncInfo.MBB->rend();
MachineBasicBlock::reverse_iterator RI(LastLocalValue);
- for (; RI != RE;) {
- MachineInstr &LocalMI = *RI;
- // Increment before erasing what it points to.
- ++RI;
+ for (MachineInstr &LocalMI :
+ llvm::make_early_inc_range(llvm::make_range(RI, RE))) {
Register DefReg = findLocalRegDef(LocalMI);
if (!DefReg)
continue;
@@ -622,7 +621,7 @@ bool FastISel::selectGetElementPtr(const User *I) {
bool FastISel::addStackMapLiveVars(SmallVectorImpl<MachineOperand> &Ops,
const CallInst *CI, unsigned StartIdx) {
- for (unsigned i = StartIdx, e = CI->getNumArgOperands(); i != e; ++i) {
+ for (unsigned i = StartIdx, e = CI->arg_size(); i != e; ++i) {
Value *Val = CI->getArgOperand(i);
// Check for constants and encode them with a StackMaps::ConstantOp prefix.
if (const auto *C = dyn_cast<ConstantInt>(Val)) {
@@ -784,7 +783,7 @@ bool FastISel::selectPatchpoint(const CallInst *I) {
// Skip the four meta args: <id>, <numNopBytes>, <target>, <numArgs>
// This includes all meta-operands up to but not including CC.
unsigned NumMetaOpers = PatchPointOpers::CCPos;
- assert(I->getNumArgOperands() >= NumMetaOpers + NumArgs &&
+ assert(I->arg_size() >= NumMetaOpers + NumArgs &&
"Not enough arguments provided to the patchpoint intrinsic");
// For AnyRegCC the arguments are lowered later on manually.
@@ -1151,6 +1150,8 @@ bool FastISel::lowerCall(const CallInst *CI) {
CLI.setCallee(RetTy, FuncTy, CI->getCalledOperand(), std::move(Args), *CI)
.setTailCall(IsTailCall);
+ diagnoseDontCall(*CI);
+
return lowerCallTo(CLI);
}
@@ -1264,7 +1265,7 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
// If using instruction referencing, mutate this into a DBG_INSTR_REF,
// to be later patched up by finalizeDebugInstrRefs. Tack a deref onto
// the expression, we don't have an "indirect" flag in DBG_INSTR_REF.
- if (TM.Options.ValueTrackingVariableLocations && Op->isReg()) {
+ if (FuncInfo.MF->useDebugInstrRef() && Op->isReg()) {
Builder->setDesc(TII.get(TargetOpcode::DBG_INSTR_REF));
Builder->getOperand(1).ChangeToImmediate(0);
auto *NewExpr =
@@ -1292,18 +1293,22 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, false, 0U,
DI->getVariable(), DI->getExpression());
} else if (const auto *CI = dyn_cast<ConstantInt>(V)) {
+ // See if there's an expression to constant-fold.
+ DIExpression *Expr = DI->getExpression();
+ if (Expr)
+ std::tie(Expr, CI) = Expr->constantFold(CI);
if (CI->getBitWidth() > 64)
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addCImm(CI)
.addImm(0U)
.addMetadata(DI->getVariable())
- .addMetadata(DI->getExpression());
+ .addMetadata(Expr);
else
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addImm(CI->getZExtValue())
.addImm(0U)
.addMetadata(DI->getVariable())
- .addMetadata(DI->getExpression());
+ .addMetadata(Expr);
} else if (const auto *CF = dyn_cast<ConstantFP>(V)) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addFPImm(CF)
@@ -1319,7 +1324,7 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
// If using instruction referencing, mutate this into a DBG_INSTR_REF,
// to be later patched up by finalizeDebugInstrRefs.
- if (TM.Options.ValueTrackingVariableLocations) {
+ if (FuncInfo.MF->useDebugInstrRef()) {
Builder->setDesc(TII.get(TargetOpcode::DBG_INSTR_REF));
Builder->getOperand(1).ChangeToImmediate(0);
}
@@ -2303,8 +2308,7 @@ FastISel::createMachineMemOperandFor(const Instruction *I) const {
bool IsDereferenceable = I->hasMetadata(LLVMContext::MD_dereferenceable);
const MDNode *Ranges = I->getMetadata(LLVMContext::MD_range);
- AAMDNodes AAInfo;
- I->getAAMetadata(AAInfo);
+ AAMDNodes AAInfo = I->getAAMetadata();
if (!Alignment) // Ensure that codegen never sees alignment 0.
Alignment = DL.getABITypeAlign(ValTy);
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 348fad6daf8f..c1bb65409282 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -722,7 +722,7 @@ void InstrEmitter::AddDbgValueLocationOps(
MIB.addFrameIndex(Op.getFrameIx());
break;
case SDDbgOperand::VREG:
- MIB.addReg(Op.getVReg(), RegState::Debug);
+ MIB.addReg(Op.getVReg());
break;
case SDDbgOperand::SDNODE: {
SDValue V = SDValue(Op.getSDNode(), Op.getResNo());
@@ -862,7 +862,7 @@ MachineInstr *InstrEmitter::EmitDbgNoLocation(SDDbgValue *SD) {
DebugLoc DL = SD->getDebugLoc();
auto MIB = BuildMI(*MF, DL, TII->get(TargetOpcode::DBG_VALUE));
MIB.addReg(0U);
- MIB.addReg(0U, RegState::Debug);
+ MIB.addReg(0U);
MIB.addMetadata(Var);
MIB.addMetadata(Expr);
return &*MIB;
@@ -872,22 +872,33 @@ MachineInstr *
InstrEmitter::EmitDbgValueFromSingleOp(SDDbgValue *SD,
DenseMap<SDValue, Register> &VRBaseMap) {
MDNode *Var = SD->getVariable();
- MDNode *Expr = SD->getExpression();
+ DIExpression *Expr = SD->getExpression();
DebugLoc DL = SD->getDebugLoc();
const MCInstrDesc &II = TII->get(TargetOpcode::DBG_VALUE);
assert(SD->getLocationOps().size() == 1 &&
"Non variadic dbg_value should have only one location op");
+ // See about constant-folding the expression.
+ // Copy the location operand in case we replace it.
+ SmallVector<SDDbgOperand, 1> LocationOps(1, SD->getLocationOps()[0]);
+ if (Expr && LocationOps[0].getKind() == SDDbgOperand::CONST) {
+ const Value *V = LocationOps[0].getConst();
+ if (auto *C = dyn_cast<ConstantInt>(V)) {
+ std::tie(Expr, C) = Expr->constantFold(C);
+ LocationOps[0] = SDDbgOperand::fromConst(C);
+ }
+ }
+
// Emit non-variadic dbg_value nodes as DBG_VALUE.
// DBG_VALUE := "DBG_VALUE" loc, isIndirect, var, expr
auto MIB = BuildMI(*MF, DL, II);
- AddDbgValueLocationOps(MIB, II, SD->getLocationOps(), VRBaseMap);
+ AddDbgValueLocationOps(MIB, II, LocationOps, VRBaseMap);
if (SD->isIndirect())
MIB.addImm(0U);
else
- MIB.addReg(0U, RegState::Debug);
+ MIB.addReg(0U);
return MIB.addMetadata(Var).addMetadata(Expr);
}
@@ -1329,5 +1340,5 @@ InstrEmitter::InstrEmitter(const TargetMachine &TM, MachineBasicBlock *mbb,
TRI(MF->getSubtarget().getRegisterInfo()),
TLI(MF->getSubtarget().getTargetLowering()), MBB(mbb),
InsertPos(insertpos) {
- EmitDebugInstrRefs = TM.Options.ValueTrackingVariableLocations;
+ EmitDebugInstrRefs = MF->useDebugInstrRef();
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index d92b23f56e4d..eb9d2286aeb4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1164,6 +1164,16 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
Action = TLI.getOperationAction(Node->getOpcode(),
cast<MaskedStoreSDNode>(Node)->getValue().getValueType());
break;
+ case ISD::VP_SCATTER:
+ Action = TLI.getOperationAction(
+ Node->getOpcode(),
+ cast<VPScatterSDNode>(Node)->getValue().getValueType());
+ break;
+ case ISD::VP_STORE:
+ Action = TLI.getOperationAction(
+ Node->getOpcode(),
+ cast<VPStoreSDNode>(Node)->getValue().getValueType());
+ break;
case ISD::VECREDUCE_FADD:
case ISD::VECREDUCE_FMUL:
case ISD::VECREDUCE_ADD:
@@ -1181,6 +1191,22 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
Node->getOpcode(), Node->getOperand(0).getValueType());
break;
case ISD::VECREDUCE_SEQ_FADD:
+ case ISD::VECREDUCE_SEQ_FMUL:
+ case ISD::VP_REDUCE_FADD:
+ case ISD::VP_REDUCE_FMUL:
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_MUL:
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_SMIN:
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_UMIN:
+ case ISD::VP_REDUCE_FMAX:
+ case ISD::VP_REDUCE_FMIN:
+ case ISD::VP_REDUCE_SEQ_FADD:
+ case ISD::VP_REDUCE_SEQ_FMUL:
Action = TLI.getOperationAction(
Node->getOpcode(), Node->getOperand(1).getValueType());
break;
@@ -1333,9 +1359,7 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
Visited.insert(Op.getNode());
Worklist.push_back(Idx.getNode());
SDValue StackPtr, Ch;
- for (SDNode::use_iterator UI = Vec.getNode()->use_begin(),
- UE = Vec.getNode()->use_end(); UI != UE; ++UI) {
- SDNode *User = *UI;
+ for (SDNode *User : Vec.getNode()->uses()) {
if (StoreSDNode *ST = dyn_cast<StoreSDNode>(User)) {
if (ST->isIndexed() || ST->isTruncatingStore() ||
ST->getValue() != Vec)
@@ -2197,9 +2221,7 @@ static bool useSinCos(SDNode *Node) {
? ISD::FCOS : ISD::FSIN;
SDValue Op0 = Node->getOperand(0);
- for (SDNode::use_iterator UI = Op0.getNode()->use_begin(),
- UE = Op0.getNode()->use_end(); UI != UE; ++UI) {
- SDNode *User = *UI;
+ for (const SDNode *User : Op0.getNode()->uses()) {
if (User == Node)
continue;
// The other user might have been turned into sincos already.
@@ -2636,7 +2658,7 @@ SDValue SelectionDAGLegalize::ExpandPARITY(SDValue Op, const SDLoc &dl) {
// If CTPOP is legal, use it. Otherwise use shifts and xor.
SDValue Result;
- if (TLI.isOperationLegal(ISD::CTPOP, VT)) {
+ if (TLI.isOperationLegalOrPromote(ISD::CTPOP, VT)) {
Result = DAG.getNode(ISD::CTPOP, dl, VT, Op);
} else {
Result = Op;
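
A minimal scalar sketch of the parity expansion this hunk selects between, assuming a 32-bit value: the CTPOP path masks the population count with 1 (not shown in this hunk), and the fallback folds all bits into bit 0 with a logarithmic shift/xor reduction. Names are illustrative, not part of the patch.

    #include <cstdint>

    // Parity of x: 1 if an odd number of bits are set, else 0.
    // Mirrors the shift/xor reduction used when CTPOP is unavailable.
    uint32_t parity32(uint32_t x) {
      x ^= x >> 16;
      x ^= x >> 8;
      x ^= x >> 4;
      x ^= x >> 2;
      x ^= x >> 1;
      return x & 1;   // equivalent to popcount(x) & 1
    }
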
@@ -2658,21 +2680,21 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
bool NeedInvert;
switch (Node->getOpcode()) {
case ISD::ABS:
- if (TLI.expandABS(Node, Tmp1, DAG))
+ if ((Tmp1 = TLI.expandABS(Node, DAG)))
Results.push_back(Tmp1);
break;
case ISD::CTPOP:
- if (TLI.expandCTPOP(Node, Tmp1, DAG))
+ if ((Tmp1 = TLI.expandCTPOP(Node, DAG)))
Results.push_back(Tmp1);
break;
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
- if (TLI.expandCTLZ(Node, Tmp1, DAG))
+ if ((Tmp1 = TLI.expandCTLZ(Node, DAG)))
Results.push_back(Tmp1);
break;
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
- if (TLI.expandCTTZ(Node, Tmp1, DAG))
+ if ((Tmp1 = TLI.expandCTTZ(Node, DAG)))
Results.push_back(Tmp1);
break;
case ISD::BITREVERSE:
@@ -3229,9 +3251,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
assert(TLI.isOperationLegalOrCustom(ISD::ADD, VT) &&
TLI.isOperationLegalOrCustom(ISD::XOR, VT) &&
"Don't know how to expand this subtraction!");
- Tmp1 = DAG.getNode(ISD::XOR, dl, VT, Node->getOperand(1),
- DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
- VT));
+ Tmp1 = DAG.getNOT(dl, Node->getOperand(1), VT);
Tmp1 = DAG.getNode(ISD::ADD, dl, VT, Tmp1, DAG.getConstant(1, dl, VT));
Results.push_back(DAG.getNode(ISD::ADD, dl, VT, Node->getOperand(0), Tmp1));
break;
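
The rewritten expansion relies on the two's-complement identity -b == ~b + 1, so a - b becomes a + ~b + 1 using only XOR/NOT and ADD. A scalar sketch of that identity, for illustration only:

    #include <cstdint>

    // Expand a - b using only NOT (XOR with all-ones) and ADD, as the
    // legalizer does when SUB itself is not legal: a - b == a + (~b + 1).
    uint32_t subViaNotAdd(uint32_t a, uint32_t b) {
      uint32_t notB = ~b;     // corresponds to DAG.getNOT
      return a + notB + 1u;   // two's-complement negation of b, then add
    }
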
@@ -4242,8 +4262,7 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
SDValue Op = Node->getOperand(IsStrict ? 1 : 0);
SDValue Chain = IsStrict ? Node->getOperand(0) : SDValue();
EVT VT = Node->getValueType(0);
- assert(cast<ConstantSDNode>(Node->getOperand(IsStrict ? 2 : 1))
- ->isNullValue() &&
+ assert(cast<ConstantSDNode>(Node->getOperand(IsStrict ? 2 : 1))->isZero() &&
"Unable to expand as libcall if it is not normal rounding");
RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), VT);
@@ -4737,6 +4756,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
break;
case ISD::STRICT_FFLOOR:
case ISD::STRICT_FCEIL:
+ case ISD::STRICT_FROUND:
case ISD::STRICT_FSIN:
case ISD::STRICT_FCOS:
case ISD::STRICT_FLOG:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 3553f9ec16c2..27f9cede1922 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -61,6 +61,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
#endif
llvm_unreachable("Do not know how to soften the result of this operator!");
+ case ISD::ARITH_FENCE: R = SoftenFloatRes_ARITH_FENCE(N); break;
case ISD::MERGE_VALUES:R = SoftenFloatRes_MERGE_VALUES(N, ResNo); break;
case ISD::BITCAST: R = SoftenFloatRes_BITCAST(N); break;
case ISD::BUILD_PAIR: R = SoftenFloatRes_BUILD_PAIR(N); break;
@@ -206,6 +207,13 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FREEZE(SDNode *N) {
GetSoftenedFloat(N->getOperand(0)));
}
+SDValue DAGTypeLegalizer::SoftenFloatRes_ARITH_FENCE(SDNode *N) {
+ EVT Ty = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue NewFence = DAG.getNode(ISD::ARITH_FENCE, SDLoc(N), Ty,
+ GetSoftenedFloat(N->getOperand(0)));
+ return NewFence;
+}
+
SDValue DAGTypeLegalizer::SoftenFloatRes_MERGE_VALUES(SDNode *N,
unsigned ResNo) {
SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
@@ -257,7 +265,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N) {
unsigned Size = NVT.getSizeInBits();
// Mask = ~(1 << (Size-1))
- APInt API = APInt::getAllOnesValue(Size);
+ APInt API = APInt::getAllOnes(Size);
API.clearBit(Size - 1);
SDValue Mask = DAG.getConstant(API, SDLoc(N), NVT);
SDValue Op = GetSoftenedFloat(N->getOperand(0));
@@ -820,6 +828,7 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
case ISD::BITCAST: Res = SoftenFloatOp_BITCAST(N); break;
case ISD::BR_CC: Res = SoftenFloatOp_BR_CC(N); break;
+ case ISD::STRICT_FP_TO_FP16:
case ISD::FP_TO_FP16: // Same as FP_ROUND for softening purposes
case ISD::STRICT_FP_ROUND:
case ISD::FP_ROUND: Res = SoftenFloatOp_FP_ROUND(N); break;
@@ -871,13 +880,17 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) {
// We actually deal with the partially-softened FP_TO_FP16 node too, which
// returns an i16 so doesn't meet the constraints necessary for FP_ROUND.
assert(N->getOpcode() == ISD::FP_ROUND || N->getOpcode() == ISD::FP_TO_FP16 ||
+ N->getOpcode() == ISD::STRICT_FP_TO_FP16 ||
N->getOpcode() == ISD::STRICT_FP_ROUND);
bool IsStrict = N->isStrictFPOpcode();
SDValue Op = N->getOperand(IsStrict ? 1 : 0);
EVT SVT = Op.getValueType();
EVT RVT = N->getValueType(0);
- EVT FloatRVT = N->getOpcode() == ISD::FP_TO_FP16 ? MVT::f16 : RVT;
+ EVT FloatRVT = (N->getOpcode() == ISD::FP_TO_FP16 ||
+ N->getOpcode() == ISD::STRICT_FP_TO_FP16)
+ ? MVT::f16
+ : RVT;
RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, FloatRVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall");
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index b8a3dd014901..1fa4d88fcb4a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -23,6 +23,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
using namespace llvm;
#define DEBUG_TYPE "legalize-types"
@@ -81,15 +82,23 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::STRICT_FSETCCS:
case ISD::SETCC: Res = PromoteIntRes_SETCC(N); break;
case ISD::SMIN:
- case ISD::SMAX: Res = PromoteIntRes_SExtIntBinOp(N); break;
+ case ISD::SMAX:
+ Res = PromoteIntRes_SExtIntBinOp(N, /*IsVP*/ false);
+ break;
case ISD::UMIN:
case ISD::UMAX: Res = PromoteIntRes_UMINUMAX(N); break;
- case ISD::SHL: Res = PromoteIntRes_SHL(N); break;
+ case ISD::SHL:
+ Res = PromoteIntRes_SHL(N, /*IsVP*/ false);
+ break;
case ISD::SIGN_EXTEND_INREG:
Res = PromoteIntRes_SIGN_EXTEND_INREG(N); break;
- case ISD::SRA: Res = PromoteIntRes_SRA(N); break;
- case ISD::SRL: Res = PromoteIntRes_SRL(N); break;
+ case ISD::SRA:
+ Res = PromoteIntRes_SRA(N, /*IsVP*/ false);
+ break;
+ case ISD::SRL:
+ Res = PromoteIntRes_SRL(N, /*IsVP*/ false);
+ break;
case ISD::TRUNCATE: Res = PromoteIntRes_TRUNCATE(N); break;
case ISD::UNDEF: Res = PromoteIntRes_UNDEF(N); break;
case ISD::VAARG: Res = PromoteIntRes_VAARG(N); break;
@@ -144,13 +153,19 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::XOR:
case ISD::ADD:
case ISD::SUB:
- case ISD::MUL: Res = PromoteIntRes_SimpleIntBinOp(N); break;
+ case ISD::MUL:
+ Res = PromoteIntRes_SimpleIntBinOp(N, /*IsVP*/ false);
+ break;
case ISD::SDIV:
- case ISD::SREM: Res = PromoteIntRes_SExtIntBinOp(N); break;
+ case ISD::SREM:
+ Res = PromoteIntRes_SExtIntBinOp(N, /*IsVP*/ false);
+ break;
case ISD::UDIV:
- case ISD::UREM: Res = PromoteIntRes_ZExtIntBinOp(N); break;
+ case ISD::UREM:
+ Res = PromoteIntRes_ZExtIntBinOp(N, /*IsVP*/ false);
+ break;
case ISD::SADDO:
case ISD::SSUBO: Res = PromoteIntRes_SADDSUBO(N, ResNo); break;
@@ -220,6 +235,18 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
Res = PromoteIntRes_VECREDUCE(N);
break;
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_MUL:
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_SMIN:
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_UMIN:
+ Res = PromoteIntRes_VP_REDUCE(N);
+ break;
+
case ISD::FREEZE:
Res = PromoteIntRes_FREEZE(N);
break;
@@ -233,6 +260,32 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::FSHR:
Res = PromoteIntRes_FunnelShift(N);
break;
+
+ case ISD::VP_AND:
+ case ISD::VP_OR:
+ case ISD::VP_XOR:
+ case ISD::VP_ADD:
+ case ISD::VP_SUB:
+ case ISD::VP_MUL:
+ Res = PromoteIntRes_SimpleIntBinOp(N, /*IsVP*/ true);
+ break;
+ case ISD::VP_SDIV:
+ case ISD::VP_SREM:
+ Res = PromoteIntRes_SExtIntBinOp(N, /*IsVP*/ true);
+ break;
+ case ISD::VP_UDIV:
+ case ISD::VP_UREM:
+ Res = PromoteIntRes_ZExtIntBinOp(N, /*IsVP*/ true);
+ break;
+ case ISD::VP_SHL:
+ Res = PromoteIntRes_SHL(N, /*IsVP*/ true);
+ break;
+ case ISD::VP_ASHR:
+ Res = PromoteIntRes_SRA(N, /*IsVP*/ true);
+ break;
+ case ISD::VP_LSHR:
+ Res = PromoteIntRes_SRL(N, /*IsVP*/ true);
+ break;
}
// If the result is null then the sub-method took care of registering it.
@@ -438,19 +491,6 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
CreateStackStoreLoad(InOp, OutVT));
}
-// Helper for BSWAP/BITREVERSE promotion to ensure we can fit any shift amount
-// in the VT returned by getShiftAmountTy and to return a safe VT if we can't.
-static EVT getShiftAmountTyForConstant(EVT VT, const TargetLowering &TLI,
- SelectionDAG &DAG) {
- EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
- // If any possible shift value won't fit in the prefered type, just use
- // something safe. It will be legalized when the shift is expanded.
- if (!ShiftVT.isVector() &&
- ShiftVT.getSizeInBits() < Log2_32_Ceil(VT.getSizeInBits()))
- ShiftVT = MVT::i32;
- return ShiftVT;
-}
-
SDValue DAGTypeLegalizer::PromoteIntRes_FREEZE(SDNode *N) {
SDValue V = GetPromotedInteger(N->getOperand(0));
return DAG.getNode(ISD::FREEZE, SDLoc(N),
@@ -474,7 +514,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) {
}
unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
- EVT ShiftVT = getShiftAmountTyForConstant(NVT, TLI, DAG);
+ EVT ShiftVT = TLI.getShiftAmountTy(NVT, DAG.getDataLayout());
return DAG.getNode(ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op),
DAG.getConstant(DiffBits, dl, ShiftVT));
}
@@ -496,7 +536,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITREVERSE(SDNode *N) {
}
unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
- EVT ShiftVT = getShiftAmountTyForConstant(NVT, TLI, DAG);
+ EVT ShiftVT = TLI.getShiftAmountTy(NVT, DAG.getDataLayout());
return DAG.getNode(ISD::SRL, dl, NVT,
DAG.getNode(ISD::BITREVERSE, dl, NVT, Op),
DAG.getConstant(DiffBits, dl, ShiftVT));
@@ -526,11 +566,24 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) {
}
SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
+ EVT OVT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
+ SDLoc dl(N);
+
+ // If the larger CTLZ isn't supported by the target, try to expand now.
+ // If we expand later we'll end up with more operations since we lost the
+ // original type.
+ if (!OVT.isVector() && TLI.isTypeLegal(NVT) &&
+ !TLI.isOperationLegalOrCustomOrPromote(ISD::CTLZ, NVT) &&
+ !TLI.isOperationLegalOrCustomOrPromote(ISD::CTLZ_ZERO_UNDEF, NVT)) {
+ if (SDValue Result = TLI.expandCTLZ(N, DAG)) {
+ Result = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Result);
+ return Result;
+ }
+ }
+
// Zero extend to the promoted type and do the count there.
SDValue Op = ZExtPromotedInteger(N->getOperand(0));
- SDLoc dl(N);
- EVT OVT = N->getValueType(0);
- EVT NVT = Op.getValueType();
Op = DAG.getNode(N->getOpcode(), dl, NVT, Op);
// Subtract off the extra leading bits in the bigger type.
return DAG.getNode(
@@ -540,6 +593,22 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
}
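
The promoted CTLZ path counts in the wider type and then subtracts the extra leading zero bits introduced by the zero-extension. A scalar sketch of that adjustment, assuming a 16-bit value promoted to 32 bits (the compiler builtin is used only for illustration):

    #include <cstdint>

    // Count leading zeros of a 16-bit value by zero-extending to 32 bits,
    // counting there, and subtracting the 16 extra leading zero bits.
    unsigned ctlz16ViaCtlz32(uint16_t x) {
      if (x == 0) return 16;                       // keep the builtin's input nonzero
      unsigned wide = __builtin_clz((uint32_t)x);  // count in the promoted type
      return wide - (32 - 16);                     // drop the extra leading bits
    }
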
SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP_PARITY(SDNode *N) {
+ EVT OVT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
+
+ // If the larger CTPOP isn't supported by the target, try to expand now.
+ // If we expand later we'll end up with more operations since we lost the
+ // original type.
+ // TODO: Expand ISD::PARITY. Need to move ExpandPARITY from LegalizeDAG to
+ // TargetLowering.
+ if (N->getOpcode() == ISD::CTPOP && !OVT.isVector() && TLI.isTypeLegal(NVT) &&
+ !TLI.isOperationLegalOrCustomOrPromote(ISD::CTPOP, NVT)) {
+ if (SDValue Result = TLI.expandCTPOP(N, DAG)) {
+ Result = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Result);
+ return Result;
+ }
+ }
+
// Zero extend to the promoted type and do the count or parity there.
SDValue Op = ZExtPromotedInteger(N->getOperand(0));
return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op);
@@ -550,6 +619,22 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) {
EVT OVT = N->getValueType(0);
EVT NVT = Op.getValueType();
SDLoc dl(N);
+
+ // If the larger CTTZ isn't supported by the target, try to expand now.
+ // If we expand later we'll end up with more operations since we lost the
+ // original type. Don't expand if we can use CTPOP or CTLZ expansion on the
+ // larger type.
+ if (!OVT.isVector() && TLI.isTypeLegal(NVT) &&
+ !TLI.isOperationLegalOrCustomOrPromote(ISD::CTTZ, NVT) &&
+ !TLI.isOperationLegalOrCustomOrPromote(ISD::CTTZ_ZERO_UNDEF, NVT) &&
+ !TLI.isOperationLegal(ISD::CTPOP, NVT) &&
+ !TLI.isOperationLegal(ISD::CTLZ, NVT)) {
+ if (SDValue Result = TLI.expandCTTZ(N, DAG)) {
+ Result = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Result);
+ return Result;
+ }
+ }
+
if (N->getOpcode() == ISD::CTTZ) {
// The count is the same in the promoted type except if the original
// value was zero. This can be handled by setting the bit just off
@@ -702,11 +787,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue ExtPassThru = GetPromotedInteger(N->getPassThru());
+ ISD::LoadExtType ExtType = N->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ ExtType = ISD::EXTLOAD;
+
SDLoc dl(N);
SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(),
N->getOffset(), N->getMask(), ExtPassThru,
N->getMemoryVT(), N->getMemOperand(),
- N->getAddressingMode(), ISD::EXTLOAD);
+ N->getAddressingMode(), ExtType,
+ N->isExpandingLoad());
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
@@ -792,7 +882,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
unsigned NewBits = PromotedType.getScalarSizeInBits();
if (Opcode == ISD::UADDSAT) {
- APInt MaxVal = APInt::getAllOnesValue(OldBits).zext(NewBits);
+ APInt MaxVal = APInt::getAllOnes(OldBits).zext(NewBits);
SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
SDValue Add =
DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted);
@@ -806,7 +896,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
// Shift cannot use a min/max expansion, we can't detect overflow if all of
// the bits have been shifted out.
- if (IsShift || TLI.isOperationLegalOrCustom(Opcode, PromotedType)) {
+ if (IsShift || TLI.isOperationLegal(Opcode, PromotedType)) {
unsigned ShiftOp;
switch (Opcode) {
case ISD::SADDSAT:
@@ -1103,12 +1193,15 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) {
return DAG.getSExtOrTrunc(SetCC, dl, NVT);
}
-SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) {
+SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N, bool IsVP) {
SDValue LHS = GetPromotedInteger(N->getOperand(0));
SDValue RHS = N->getOperand(1);
if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
RHS = ZExtPromotedInteger(RHS);
- return DAG.getNode(ISD::SHL, SDLoc(N), LHS.getValueType(), LHS, RHS);
+ if (!IsVP)
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
+ N->getOperand(2), N->getOperand(3));
}
SDValue DAGTypeLegalizer::PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N) {
@@ -1117,30 +1210,36 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N) {
Op.getValueType(), Op, N->getOperand(1));
}
-SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) {
+SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N, bool IsVP) {
// The input may have strange things in the top bits of the registers, but
// these operations don't care. They may have weird bits going out, but
// that too is okay if they are integer operations.
SDValue LHS = GetPromotedInteger(N->getOperand(0));
SDValue RHS = GetPromotedInteger(N->getOperand(1));
- return DAG.getNode(N->getOpcode(), SDLoc(N),
- LHS.getValueType(), LHS, RHS);
+ if (!IsVP)
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
+ N->getOperand(2), N->getOperand(3));
}
-SDValue DAGTypeLegalizer::PromoteIntRes_SExtIntBinOp(SDNode *N) {
+SDValue DAGTypeLegalizer::PromoteIntRes_SExtIntBinOp(SDNode *N, bool IsVP) {
// Sign extend the input.
SDValue LHS = SExtPromotedInteger(N->getOperand(0));
SDValue RHS = SExtPromotedInteger(N->getOperand(1));
- return DAG.getNode(N->getOpcode(), SDLoc(N),
- LHS.getValueType(), LHS, RHS);
+ if (!IsVP)
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
+ N->getOperand(2), N->getOperand(3));
}
-SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N) {
+SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N, bool IsVP) {
// Zero extend the input.
SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
SDValue RHS = ZExtPromotedInteger(N->getOperand(1));
- return DAG.getNode(N->getOpcode(), SDLoc(N),
- LHS.getValueType(), LHS, RHS);
+ if (!IsVP)
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
+ N->getOperand(2), N->getOperand(3));
}
SDValue DAGTypeLegalizer::PromoteIntRes_UMINUMAX(SDNode *N) {
@@ -1152,22 +1251,28 @@ SDValue DAGTypeLegalizer::PromoteIntRes_UMINUMAX(SDNode *N) {
LHS.getValueType(), LHS, RHS);
}
-SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) {
+SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N, bool IsVP) {
// The input value must be properly sign extended.
SDValue LHS = SExtPromotedInteger(N->getOperand(0));
SDValue RHS = N->getOperand(1);
if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
RHS = ZExtPromotedInteger(RHS);
- return DAG.getNode(ISD::SRA, SDLoc(N), LHS.getValueType(), LHS, RHS);
+ if (!IsVP)
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
+ N->getOperand(2), N->getOperand(3));
}
-SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) {
+SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N, bool IsVP) {
// The input value must be properly zero extended.
SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
SDValue RHS = N->getOperand(1);
if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
RHS = ZExtPromotedInteger(RHS);
- return DAG.getNode(ISD::SRL, SDLoc(N), LHS.getValueType(), LHS, RHS);
+ if (!IsVP)
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
+ N->getOperand(2), N->getOperand(3));
}
SDValue DAGTypeLegalizer::PromoteIntRes_Rotate(SDNode *N) {
@@ -1383,7 +1488,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) {
if (N->getOpcode() == ISD::UMULO) {
// Unsigned overflow occurred if the high part is non-zero.
unsigned Shift = SmallVT.getScalarSizeInBits();
- EVT ShiftTy = getShiftAmountTyForConstant(Mul.getValueType(), TLI, DAG);
+ EVT ShiftTy = TLI.getShiftAmountTy(Mul.getValueType(), DAG.getDataLayout());
SDValue Hi = DAG.getNode(ISD::SRL, DL, Mul.getValueType(), Mul,
DAG.getConstant(Shift, DL, ShiftTy));
Overflow = DAG.getSetCC(DL, N->getValueType(1), Hi,
@@ -1523,6 +1628,7 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::STRICT_UINT_TO_FP: Res = PromoteIntOp_STRICT_UINT_TO_FP(N); break;
case ISD::ZERO_EXTEND: Res = PromoteIntOp_ZERO_EXTEND(N); break;
case ISD::EXTRACT_SUBVECTOR: Res = PromoteIntOp_EXTRACT_SUBVECTOR(N); break;
+ case ISD::INSERT_SUBVECTOR: Res = PromoteIntOp_INSERT_SUBVECTOR(N); break;
case ISD::SHL:
case ISD::SRA:
@@ -1560,6 +1666,17 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::VECREDUCE_SMIN:
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN: Res = PromoteIntOp_VECREDUCE(N); break;
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_MUL:
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_SMIN:
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_UMIN:
+ Res = PromoteIntOp_VP_REDUCE(N, OpNo);
+ break;
case ISD::SET_ROUNDING: Res = PromoteIntOp_SET_ROUNDING(N); break;
}
@@ -1605,10 +1722,8 @@ void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS,SDValue &NewRHS,
// If the width of OpL/OpR excluding the duplicated sign bits is no greater
// than the width of NewLHS/NewRH, we can avoid inserting real truncate
// instruction, which is redundant eventually.
- unsigned OpLEffectiveBits =
- OpL.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(OpL) + 1;
- unsigned OpREffectiveBits =
- OpR.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(OpR) + 1;
+ unsigned OpLEffectiveBits = DAG.ComputeMinSignedBits(OpL);
+ unsigned OpREffectiveBits = DAG.ComputeMinSignedBits(OpR);
if (OpLEffectiveBits <= NewLHS.getScalarValueSizeInBits() &&
OpREffectiveBits <= NewRHS.getScalarValueSizeInBits()) {
NewLHS = OpL;
@@ -1832,29 +1947,25 @@ SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){
SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N,
unsigned OpNo) {
-
SDValue DataOp = N->getValue();
- EVT DataVT = DataOp.getValueType();
SDValue Mask = N->getMask();
- SDLoc dl(N);
- bool TruncateStore = false;
if (OpNo == 4) {
+ // The Mask. Update in place.
+ EVT DataVT = DataOp.getValueType();
Mask = PromoteTargetBoolean(Mask, DataVT);
- // Update in place.
SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
NewOps[4] = Mask;
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
- } else { // Data operand
- assert(OpNo == 1 && "Unexpected operand for promotion");
- DataOp = GetPromotedInteger(DataOp);
- TruncateStore = true;
}
- return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(),
+ assert(OpNo == 1 && "Unexpected operand for promotion");
+ DataOp = GetPromotedInteger(DataOp);
+
+ return DAG.getMaskedStore(N->getChain(), SDLoc(N), DataOp, N->getBasePtr(),
N->getOffset(), Mask, N->getMemoryVT(),
N->getMemOperand(), N->getAddressingMode(),
- TruncateStore, N->isCompressingStore());
+ /*IsTruncating*/ true, N->isCompressingStore());
}
SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N,
@@ -2023,30 +2134,54 @@ SDValue DAGTypeLegalizer::PromoteIntOp_FPOWI(SDNode *N) {
return SDValue();
}
-SDValue DAGTypeLegalizer::PromoteIntOp_VECREDUCE(SDNode *N) {
- SDLoc dl(N);
- SDValue Op;
+static unsigned getExtendForIntVecReduction(SDNode *N) {
switch (N->getOpcode()) {
- default: llvm_unreachable("Expected integer vector reduction");
+ default:
+ llvm_unreachable("Expected integer vector reduction");
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_MUL:
case ISD::VECREDUCE_AND:
case ISD::VECREDUCE_OR:
case ISD::VECREDUCE_XOR:
- Op = GetPromotedInteger(N->getOperand(0));
- break;
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_MUL:
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ return ISD::ANY_EXTEND;
case ISD::VECREDUCE_SMAX:
case ISD::VECREDUCE_SMIN:
- Op = SExtPromotedInteger(N->getOperand(0));
- break;
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_SMIN:
+ return ISD::SIGN_EXTEND;
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN:
- Op = ZExtPromotedInteger(N->getOperand(0));
- break;
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_UMIN:
+ return ISD::ZERO_EXTEND;
}
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOpVectorReduction(SDNode *N, SDValue V) {
+ switch (getExtendForIntVecReduction(N)) {
+ default:
+ llvm_unreachable("Impossible extension kind for integer reduction");
+ case ISD::ANY_EXTEND:
+ return GetPromotedInteger(V);
+ case ISD::SIGN_EXTEND:
+ return SExtPromotedInteger(V);
+ case ISD::ZERO_EXTEND:
+ return ZExtPromotedInteger(V);
+ }
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_VECREDUCE(SDNode *N) {
+ SDLoc dl(N);
+ SDValue Op = PromoteIntOpVectorReduction(N, N->getOperand(0));
EVT EltVT = Op.getValueType().getVectorElementType();
EVT VT = N->getValueType(0);
+
if (VT.bitsGE(EltVT))
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, Op);
@@ -2056,6 +2191,38 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VECREDUCE(SDNode *N) {
return DAG.getNode(ISD::TRUNCATE, dl, VT, Reduce);
}
+SDValue DAGTypeLegalizer::PromoteIntOp_VP_REDUCE(SDNode *N, unsigned OpNo) {
+ SDLoc DL(N);
+ SDValue Op = N->getOperand(OpNo);
+ SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
+
+ if (OpNo == 2) { // Mask
+ // Update in place.
+ NewOps[2] = PromoteTargetBoolean(Op, N->getOperand(1).getValueType());
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+ }
+
+ assert(OpNo == 1 && "Unexpected operand for promotion");
+
+ Op = PromoteIntOpVectorReduction(N, Op);
+
+ NewOps[OpNo] = Op;
+
+ EVT VT = N->getValueType(0);
+ EVT EltVT = Op.getValueType().getScalarType();
+
+ if (VT.bitsGE(EltVT))
+ return DAG.getNode(N->getOpcode(), SDLoc(N), VT, NewOps);
+
+ // Result size must be >= element/start-value size. If this is not the case
+ // after promotion, also promote both the start value and result type and
+ // then truncate.
+ NewOps[0] =
+ DAG.getNode(getExtendForIntVecReduction(N), DL, EltVT, N->getOperand(0));
+ SDValue Reduce = DAG.getNode(N->getOpcode(), DL, EltVT, NewOps);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Reduce);
+}
+
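
The extension kind chosen per reduction opcode matters because the reduction is evaluated in a wider type: unsigned min/max must zero-extend, signed min/max must sign-extend, and bitwise/add/mul reductions can use any extension. A small scalar illustration of the unsigned-min case, assuming i8 lanes promoted to i32 (not taken from the patch):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // An unsigned min over i8 values done in i32 must zero-extend the lanes
    // and the start value, otherwise the ordering changes in the wider type.
    uint8_t uminReduceViaI32(const std::vector<uint8_t> &vals, uint8_t start) {
      uint32_t acc = start;                  // zero-extended start value
      for (uint8_t v : vals)
        acc = std::min<uint32_t>(acc, v);    // operate in the promoted type
      return (uint8_t)acc;                   // truncate back to the original type
    }
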
SDValue DAGTypeLegalizer::PromoteIntOp_SET_ROUNDING(SDNode *N) {
SDValue Op = ZExtPromotedInteger(N->getOperand(1));
return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op), 0);
@@ -2088,6 +2255,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
report_fatal_error("Do not know how to expand the result of this "
"operator!");
+ case ISD::ARITH_FENCE: SplitRes_ARITH_FENCE(N, Lo, Hi); break;
case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
@@ -2978,7 +3146,7 @@ void DAGTypeLegalizer::ExpandIntRes_ABS(SDNode *N, SDValue &Lo, SDValue &Hi) {
bool HasAddCarry = TLI.isOperationLegalOrCustom(
ISD::ADDCARRY, TLI.getTypeToExpandTo(*DAG.getContext(), NVT));
if (HasAddCarry) {
- EVT ShiftAmtTy = getShiftAmountTyForConstant(NVT, TLI, DAG);
+ EVT ShiftAmtTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout());
SDValue Sign =
DAG.getNode(ISD::SRA, dl, NVT, Hi,
DAG.getConstant(NVT.getSizeInBits() - 1, dl, ShiftAmtTy));
@@ -3087,6 +3255,9 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo,
EVT NFPVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
Op = GetSoftPromotedHalf(Op);
Op = DAG.getNode(ISD::FP16_TO_FP, dl, NFPVT, Op);
+ Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
+ SplitInteger(Op, Lo, Hi);
+ return;
}
RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT);
@@ -3116,6 +3287,9 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo,
EVT NFPVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
Op = GetSoftPromotedHalf(Op);
Op = DAG.getNode(ISD::FP16_TO_FP, dl, NFPVT, Op);
+ Op = DAG.getNode(ISD::FP_TO_UINT, dl, VT, Op);
+ SplitInteger(Op, Lo, Hi);
+ return;
}
RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT);
@@ -3367,11 +3541,6 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
SDValue TL = DAG.getNode(ISD::AND, dl, NVT, T, Mask);
EVT ShiftAmtTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout());
- if (APInt::getMaxValue(ShiftAmtTy.getSizeInBits()).ult(HalfBits)) {
- // The type from TLI is too small to fit the shift amount we want.
- // Override it with i32. The shift will have to be legalized.
- ShiftAmtTy = MVT::i32;
- }
SDValue Shift = DAG.getConstant(HalfBits, dl, ShiftAmtTy);
SDValue TH = DAG.getNode(ISD::SRL, dl, NVT, T, Shift);
SDValue LLH = DAG.getNode(ISD::SRL, dl, NVT, LL, Shift);
@@ -3464,8 +3633,11 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,
SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
SDValue Zero = DAG.getConstant(0, dl, VT);
- SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT);
- Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin);
+ // Xor the inputs; if the resulting sign bit is 0 the product will be
+ // positive, else negative.
+ SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, LHS, RHS);
+ SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Xor, Zero, ISD::SETLT);
+ Result = DAG.getSelect(dl, VT, ProdNeg, SatMin, SatMax);
Result = DAG.getSelect(dl, VT, Overflow, Result, Product);
} else {
// For unsigned multiplication, we only need to check the max since we
@@ -3638,7 +3810,7 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,
// Saturate to signed maximum.
APInt MaxHi = APInt::getSignedMaxValue(NVTSize);
- APInt MaxLo = APInt::getAllOnesValue(NVTSize);
+ APInt MaxLo = APInt::getAllOnes(NVTSize);
Hi = DAG.getSelect(dl, NVT, SatMax, DAG.getConstant(MaxHi, dl, NVT), Hi);
Lo = DAG.getSelect(dl, NVT, SatMax, DAG.getConstant(MaxLo, dl, NVT), Lo);
// Saturate to signed minimum.
@@ -3808,9 +3980,6 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
// the new SHL_PARTS operation would need further legalization.
SDValue ShiftOp = N->getOperand(1);
EVT ShiftTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
- assert(ShiftTy.getScalarSizeInBits() >=
- Log2_32_Ceil(VT.getScalarSizeInBits()) &&
- "ShiftAmountTy is too small to cover the range of this type!");
if (ShiftOp.getValueType() != ShiftTy)
ShiftOp = DAG.getZExtOrTrunc(ShiftOp, dl, ShiftTy);
@@ -3857,7 +4026,10 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
}
if (LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC)) {
- SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ EVT ShAmtTy =
+ EVT::getIntegerVT(*DAG.getContext(), DAG.getLibInfo().getIntSize());
+ SDValue ShAmt = DAG.getZExtOrTrunc(N->getOperand(1), dl, ShAmtTy);
+ SDValue Ops[2] = {N->getOperand(0), ShAmt};
TargetLowering::MakeLibCallOptions CallOptions;
CallOptions.setSExt(isSigned);
SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi);
@@ -4035,7 +4207,25 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
LC = RTLIB::MULO_I64;
else if (VT == MVT::i128)
LC = RTLIB::MULO_I128;
- assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XMULO!");
+
+ if (LC == RTLIB::UNKNOWN_LIBCALL || !TLI.getLibcallName(LC)) {
+ // FIXME: This is not an optimal expansion, but better than crashing.
+ EVT WideVT =
+ EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
+ SDValue LHS = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, N->getOperand(0));
+ SDValue RHS = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, N->getOperand(1));
+ SDValue Mul = DAG.getNode(ISD::MUL, dl, WideVT, LHS, RHS);
+ SDValue MulLo, MulHi;
+ SplitInteger(Mul, MulLo, MulHi);
+ SDValue SRA =
+ DAG.getNode(ISD::SRA, dl, VT, MulLo,
+ DAG.getConstant(VT.getScalarSizeInBits() - 1, dl, VT));
+ SDValue Overflow =
+ DAG.getSetCC(dl, N->getValueType(1), MulHi, SRA, ISD::SETNE);
+ SplitInteger(MulLo, Lo, Hi);
+ ReplaceValueWith(SDValue(N, 1), Overflow);
+ return;
+ }
SDValue Temp = DAG.CreateStackTemporary(PtrVT);
// Temporary for the overflow value, default it to zero.
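
The FIXME expansion above detects signed multiply overflow by doing the multiply in twice the width and checking that the high half is exactly the sign-replication of the low half. A scalar sketch, assuming 32-bit operands widened to 64 bits (names are illustrative):

    #include <cstdint>

    // Signed 32x32 multiply with overflow check via a 64-bit multiply:
    // no overflow iff the high 32 bits equal the sign bits of the low 32.
    bool smul32Overflow(int32_t a, int32_t b, int32_t &lo) {
      int64_t wide = (int64_t)a * (int64_t)b;
      lo = (int32_t)wide;                        // low half of the product
      int32_t hi = (int32_t)(wide >> 32);        // high half
      int32_t signOfLo = lo < 0 ? -1 : 0;        // models the SRA by 31
      return hi != signOfLo;                     // mismatch => overflow
    }
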
@@ -4188,18 +4378,45 @@ void DAGTypeLegalizer::ExpandIntRes_VECREDUCE(SDNode *N,
void DAGTypeLegalizer::ExpandIntRes_Rotate(SDNode *N,
SDValue &Lo, SDValue &Hi) {
- // Lower the rotate to shifts and ORs which can be expanded.
- SDValue Res;
- TLI.expandROT(N, true /*AllowVectorOps*/, Res, DAG);
+ // Delegate to funnel-shift expansion.
+ SDLoc DL(N);
+ unsigned Opcode = N->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR;
+ SDValue Res = DAG.getNode(Opcode, DL, N->getValueType(0), N->getOperand(0),
+ N->getOperand(0), N->getOperand(1));
SplitInteger(Res, Lo, Hi);
}
-void DAGTypeLegalizer::ExpandIntRes_FunnelShift(SDNode *N,
- SDValue &Lo, SDValue &Hi) {
- // Lower the funnel shift to shifts and ORs which can be expanded.
- SDValue Res;
- TLI.expandFunnelShift(N, Res, DAG);
- SplitInteger(Res, Lo, Hi);
+void DAGTypeLegalizer::ExpandIntRes_FunnelShift(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ // Values numbered from least significant to most significant.
+ SDValue In1, In2, In3, In4;
+ GetExpandedInteger(N->getOperand(0), In3, In4);
+ GetExpandedInteger(N->getOperand(1), In1, In2);
+ EVT HalfVT = In1.getValueType();
+
+ SDLoc DL(N);
+ unsigned Opc = N->getOpcode();
+ SDValue ShAmt = N->getOperand(2);
+ EVT ShAmtVT = ShAmt.getValueType();
+ EVT ShAmtCCVT = getSetCCResultType(ShAmtVT);
+
+ // If the shift amount is at least half the bitwidth, swap the inputs.
+ unsigned HalfVTBits = HalfVT.getScalarSizeInBits();
+ SDValue AndNode = DAG.getNode(ISD::AND, DL, ShAmtVT, ShAmt,
+ DAG.getConstant(HalfVTBits, DL, ShAmtVT));
+ SDValue Cond =
+ DAG.getSetCC(DL, ShAmtCCVT, AndNode, DAG.getConstant(0, DL, ShAmtVT),
+ Opc == ISD::FSHL ? ISD::SETNE : ISD::SETEQ);
+
+ // Expand to a pair of funnel shifts.
+ EVT NewShAmtVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout());
+ SDValue NewShAmt = DAG.getAnyExtOrTrunc(ShAmt, DL, NewShAmtVT);
+
+ SDValue Select1 = DAG.getNode(ISD::SELECT, DL, HalfVT, Cond, In1, In2);
+ SDValue Select2 = DAG.getNode(ISD::SELECT, DL, HalfVT, Cond, In2, In3);
+ SDValue Select3 = DAG.getNode(ISD::SELECT, DL, HalfVT, Cond, In3, In4);
+ Lo = DAG.getNode(Opc, DL, HalfVT, Select2, Select1, NewShAmt);
+ Hi = DAG.getNode(Opc, DL, HalfVT, Select3, Select2, NewShAmt);
}
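
Rotates above are rewritten as funnel shifts of the value with itself (rotl(x, s) == fshl(x, x, s)), and a double-width funnel shift is then expanded into two half-width funnel shifts whose input window is shifted down by one half when the shift amount reaches half the bit width. A scalar model of the double-width case, assuming a 64-bit FSHL built from 32-bit halves (names are illustrative, not part of the patch):

    #include <cstdint>

    // 32-bit funnel shift left: top 32 bits of (hi:lo) shifted left by s % 32.
    static uint32_t fshl32(uint32_t hi, uint32_t lo, unsigned s) {
      s &= 31;
      if (s == 0) return hi;
      return (hi << s) | (lo >> (32 - s));
    }

    // 64-bit FSHL from two 32-bit funnel shifts, mirroring the strategy in
    // ExpandIntRes_FunnelShift: number the four 32-bit halves of the
    // concatenation (y, x) from least to most significant, swap the selected
    // window when the shift amount has the "half width" bit set, then funnel
    // shift adjacent halves by s % 32. Shift amounts behave modulo 64.
    uint64_t fshl64(uint64_t x, uint64_t y, unsigned s) {
      uint32_t in1 = (uint32_t)y, in2 = (uint32_t)(y >> 32);
      uint32_t in3 = (uint32_t)x, in4 = (uint32_t)(x >> 32);
      bool swap = (s & 32) != 0;        // shift amount >= half the width
      uint32_t sel1 = swap ? in1 : in2;
      uint32_t sel2 = swap ? in2 : in3;
      uint32_t sel3 = swap ? in3 : in4;
      uint32_t lo = fshl32(sel2, sel1, s);
      uint32_t hi = fshl32(sel3, sel2, s);
      return ((uint64_t)hi << 32) | lo;
    }
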
void DAGTypeLegalizer::ExpandIntRes_VSCALE(SDNode *N, SDValue &Lo,
@@ -4297,7 +4514,7 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS,
if (CCCode == ISD::SETEQ || CCCode == ISD::SETNE) {
if (RHSLo == RHSHi) {
if (ConstantSDNode *RHSCST = dyn_cast<ConstantSDNode>(RHSLo)) {
- if (RHSCST->isAllOnesValue()) {
+ if (RHSCST->isAllOnes()) {
// Equality comparison to -1.
NewLHS = DAG.getNode(ISD::AND, dl,
LHSLo.getValueType(), LHSLo, LHSHi);
@@ -4317,8 +4534,8 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS,
// If this is a comparison of the sign bit, just look at the top part.
// X > -1, x < 0
if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(NewRHS))
- if ((CCCode == ISD::SETLT && CST->isNullValue()) || // X < 0
- (CCCode == ISD::SETGT && CST->isAllOnesValue())) { // X > -1
+ if ((CCCode == ISD::SETLT && CST->isZero()) || // X < 0
+ (CCCode == ISD::SETGT && CST->isAllOnes())) { // X > -1
NewLHS = LHSHi;
NewRHS = RHSHi;
return;
@@ -4369,9 +4586,11 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS,
bool EqAllowed = (CCCode == ISD::SETLE || CCCode == ISD::SETGE ||
CCCode == ISD::SETUGE || CCCode == ISD::SETULE);
- if ((EqAllowed && (HiCmpC && HiCmpC->isNullValue())) ||
- (!EqAllowed && ((HiCmpC && (HiCmpC->getAPIntValue() == 1)) ||
- (LoCmpC && LoCmpC->isNullValue())))) {
+ // FIXME: Is the HiCmpC->isOne() here correct for
+ // ZeroOrNegativeOneBooleanContent?
+ if ((EqAllowed && (HiCmpC && HiCmpC->isZero())) ||
+ (!EqAllowed &&
+ ((HiCmpC && HiCmpC->isOne()) || (LoCmpC && LoCmpC->isZero())))) {
// For LE / GE, if high part is known false, ignore the low part.
// For LT / GT: if low part is known false, return the high part.
// if high part is known true, ignore the low part.
@@ -4706,6 +4925,30 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
SDValue InOp0 = N->getOperand(0);
EVT InVT = InOp0.getValueType();
+ // Try and extract from a smaller type so that it eventually falls
+ // into the promotion code below.
+ if (getTypeAction(InVT) == TargetLowering::TypeSplitVector ||
+ getTypeAction(InVT) == TargetLowering::TypeLegal) {
+ EVT NInVT = InVT.getHalfNumVectorElementsVT(*DAG.getContext());
+ unsigned NElts = NInVT.getVectorMinNumElements();
+ uint64_t IdxVal = cast<ConstantSDNode>(BaseIdx)->getZExtValue();
+
+ SDValue Step1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NInVT, InOp0,
+ DAG.getConstant(alignDown(IdxVal, NElts), dl,
+ BaseIdx.getValueType()));
+ SDValue Step2 = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, OutVT, Step1,
+ DAG.getConstant(IdxVal % NElts, dl, BaseIdx.getValueType()));
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, Step2);
+ }
+
+ // Try and extract from a widened type.
+ if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
+ SDValue Ops[] = {GetWidenedVector(InOp0), BaseIdx};
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), OutVT, Ops);
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, Ext);
+ }
+
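
The two-step extract above relies on decomposing the index as alignDown(Idx, NElts) plus Idx % NElts: the first extract grabs an aligned half-size chunk, the second picks the requested window inside it. A scalar sketch of the index math (illustrative only):

    #include <cstdint>

    // Decompose a subvector index for a two-step EXTRACT_SUBVECTOR:
    // first extract the aligned chunk starting at alignDown(idx, chunkElts),
    // then extract at the remainder offset within that chunk.
    void splitSubvectorIndex(uint64_t idx, uint64_t chunkElts,
                             uint64_t &chunkStart, uint64_t &innerIdx) {
      chunkStart = (idx / chunkElts) * chunkElts;   // alignDown(idx, chunkElts)
      innerIdx = idx % chunkElts;                   // offset inside the chunk
    }
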
// Promote operands and see if this is handled by target lowering,
// Otherwise, use the BUILD_VECTOR approach below
if (getTypeAction(InVT) == TargetLowering::TypePromoteInteger) {
@@ -4873,11 +5116,46 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) {
EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+ unsigned NumOperands = N->getNumOperands();
+ unsigned NumOutElem = NOutVT.getVectorMinNumElements();
EVT OutElemTy = NOutVT.getVectorElementType();
+ if (OutVT.isScalableVector()) {
+ // Find the largest promoted element type for each of the operands.
+ SDUse *MaxSizedValue = std::max_element(
+ N->op_begin(), N->op_end(), [](const SDValue &A, const SDValue &B) {
+ EVT AVT = A.getValueType().getVectorElementType();
+ EVT BVT = B.getValueType().getVectorElementType();
+ return AVT.getScalarSizeInBits() < BVT.getScalarSizeInBits();
+ });
+ EVT MaxElementVT = MaxSizedValue->getValueType().getVectorElementType();
+
+ // Then promote all vectors to the largest element type.
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned I = 0; I < NumOperands; ++I) {
+ SDValue Op = N->getOperand(I);
+ EVT OpVT = Op.getValueType();
+ if (getTypeAction(OpVT) == TargetLowering::TypePromoteInteger)
+ Op = GetPromotedInteger(Op);
+ else
+ assert(getTypeAction(OpVT) == TargetLowering::TypeLegal &&
+ "Unhandled legalization type");
+
+ if (OpVT.getVectorElementType().getScalarSizeInBits() <
+ MaxElementVT.getScalarSizeInBits())
+ Op = DAG.getAnyExtOrTrunc(Op, dl,
+ OpVT.changeVectorElementType(MaxElementVT));
+ Ops.push_back(Op);
+ }
+
+ // Do the CONCAT on the promoted type and finally truncate to (the promoted)
+ // NOutVT.
+ return DAG.getAnyExtOrTrunc(
+ DAG.getNode(ISD::CONCAT_VECTORS, dl,
+ OutVT.changeVectorElementType(MaxElementVT), Ops),
+ dl, NOutVT);
+ }
unsigned NumElem = N->getOperand(0).getValueType().getVectorNumElements();
- unsigned NumOutElem = NOutVT.getVectorNumElements();
- unsigned NumOperands = N->getNumOperands();
assert(NumElem * NumOperands == NumOutElem &&
"Unexpected number of elements");
@@ -4957,7 +5235,17 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VECREDUCE(SDNode *N) {
// we can simply change the result type.
SDLoc dl(N);
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
- return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
+ return DAG.getNode(N->getOpcode(), dl, NVT, N->ops());
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VP_REDUCE(SDNode *N) {
+ // The VP_REDUCE result size may be larger than the element size, so we can
+ // simply change the result type. However, the start value and the result
+ // must have the same type.
+ SDLoc DL(N);
+ SDValue Start = PromoteIntOpVectorReduction(N, N->getOperand(0));
+ return DAG.getNode(N->getOpcode(), DL, Start.getValueType(), Start,
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
}
SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) {
@@ -4974,6 +5262,21 @@ SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) {
return DAG.getAnyExtOrTrunc(Ext, dl, N->getValueType(0));
}
+SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_SUBVECTOR(SDNode *N) {
+ SDLoc dl(N);
+ // The result type is equal to the first input operand's type, so the
+ // type that needs promoting must be the second source vector.
+ SDValue V0 = N->getOperand(0);
+ SDValue V1 = GetPromotedInteger(N->getOperand(1));
+ SDValue Idx = N->getOperand(2);
+ EVT PromVT = EVT::getVectorVT(*DAG.getContext(),
+ V1.getValueType().getVectorElementType(),
+ V0.getValueType().getVectorElementCount());
+ V0 = DAG.getAnyExtOrTrunc(V0, dl, PromVT);
+ SDValue Ext = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, PromVT, V0, V1, Idx);
+ return DAG.getAnyExtOrTrunc(Ext, dl, N->getValueType(0));
+}
+
SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N) {
SDLoc dl(N);
SDValue V0 = GetPromotedInteger(N->getOperand(0));
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 05a974af3b55..1f73c9eea104 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -223,8 +223,7 @@ bool DAGTypeLegalizer::run() {
#endif
PerformExpensiveChecks();
- SDNode *N = Worklist.back();
- Worklist.pop_back();
+ SDNode *N = Worklist.pop_back_val();
assert(N->getNodeId() == ReadyToProcess &&
"Node should be ready if on worklist!");
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 8d17d8fc68b1..da282ecad282 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -289,6 +289,12 @@ private:
return DAG.getZeroExtendInReg(Op, DL, OldVT);
}
+ // Promote the given operand V (vector or scalar) according to N's specific
+ // reduction kind. N must be an integer VECREDUCE_* or VP_REDUCE_*. Returns
+ // the operand promoted with the nominal extension for that reduction kind
+ // (ISD::(ANY|ZERO|SIGN)_EXTEND).
+ SDValue PromoteIntOpVectorReduction(SDNode *N, SDValue V);
+
// Integer Result Promotion.
void PromoteIntegerResult(SDNode *N, unsigned ResNo);
SDValue PromoteIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
@@ -332,14 +338,14 @@ private:
SDValue PromoteIntRes_VSELECT(SDNode *N);
SDValue PromoteIntRes_SELECT_CC(SDNode *N);
SDValue PromoteIntRes_SETCC(SDNode *N);
- SDValue PromoteIntRes_SHL(SDNode *N);
- SDValue PromoteIntRes_SimpleIntBinOp(SDNode *N);
- SDValue PromoteIntRes_ZExtIntBinOp(SDNode *N);
- SDValue PromoteIntRes_SExtIntBinOp(SDNode *N);
+ SDValue PromoteIntRes_SHL(SDNode *N, bool IsVP);
+ SDValue PromoteIntRes_SimpleIntBinOp(SDNode *N, bool IsVP);
+ SDValue PromoteIntRes_ZExtIntBinOp(SDNode *N, bool IsVP);
+ SDValue PromoteIntRes_SExtIntBinOp(SDNode *N, bool IsVP);
SDValue PromoteIntRes_UMINUMAX(SDNode *N);
SDValue PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N);
- SDValue PromoteIntRes_SRA(SDNode *N);
- SDValue PromoteIntRes_SRL(SDNode *N);
+ SDValue PromoteIntRes_SRA(SDNode *N, bool IsVP);
+ SDValue PromoteIntRes_SRL(SDNode *N, bool IsVP);
SDValue PromoteIntRes_TRUNCATE(SDNode *N);
SDValue PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo);
SDValue PromoteIntRes_ADDSUBCARRY(SDNode *N, unsigned ResNo);
@@ -353,6 +359,7 @@ private:
SDValue PromoteIntRes_DIVFIX(SDNode *N);
SDValue PromoteIntRes_FLT_ROUNDS(SDNode *N);
SDValue PromoteIntRes_VECREDUCE(SDNode *N);
+ SDValue PromoteIntRes_VP_REDUCE(SDNode *N);
SDValue PromoteIntRes_ABS(SDNode *N);
SDValue PromoteIntRes_Rotate(SDNode *N);
SDValue PromoteIntRes_FunnelShift(SDNode *N);
@@ -369,6 +376,7 @@ private:
SDValue PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N);
SDValue PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N);
+ SDValue PromoteIntOp_INSERT_SUBVECTOR(SDNode *N);
SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N);
SDValue PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N);
SDValue PromoteIntOp_SPLAT_VECTOR(SDNode *N);
@@ -394,6 +402,7 @@ private:
SDValue PromoteIntOp_FIX(SDNode *N);
SDValue PromoteIntOp_FPOWI(SDNode *N);
SDValue PromoteIntOp_VECREDUCE(SDNode *N);
+ SDValue PromoteIntOp_VP_REDUCE(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_SET_ROUNDING(SDNode *N);
void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
@@ -518,6 +527,7 @@ private:
SDValue SoftenFloatRes_Unary(SDNode *N, RTLIB::Libcall LC);
SDValue SoftenFloatRes_Binary(SDNode *N, RTLIB::Libcall LC);
SDValue SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_ARITH_FENCE(SDNode *N);
SDValue SoftenFloatRes_BITCAST(SDNode *N);
SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N);
SDValue SoftenFloatRes_ConstantFP(SDNode *N);
@@ -816,7 +826,7 @@ private:
// Vector Result Splitting: <128 x ty> -> 2 x <64 x ty>.
void SplitVectorResult(SDNode *N, unsigned ResNo);
- void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi, bool IsVP);
void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -898,6 +908,7 @@ private:
SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N);
SDValue WidenVecRes_EXTEND_VECTOR_INREG(SDNode* N);
SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N);
+ SDValue WidenVecRes_INSERT_SUBVECTOR(SDNode *N);
SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
SDValue WidenVecRes_LOAD(SDNode* N);
SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N);
@@ -912,7 +923,7 @@ private:
SDValue WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N);
SDValue WidenVecRes_Ternary(SDNode *N);
- SDValue WidenVecRes_Binary(SDNode *N);
+ SDValue WidenVecRes_Binary(SDNode *N, bool IsVP);
SDValue WidenVecRes_BinaryCanTrap(SDNode *N);
SDValue WidenVecRes_BinaryWithExtraScalarOp(SDNode *N);
SDValue WidenVecRes_StrictFP(SDNode *N);
@@ -972,10 +983,10 @@ private:
LoadSDNode *LD, ISD::LoadExtType ExtType);
/// Helper function to generate a set of stores to store a widen vector into
- /// non-widen memory.
+ /// non-widen memory. Returns true if successful, false otherwise.
/// StChain: list of chains for the stores we have generated
/// ST: store of a widen value
- void GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain, StoreSDNode *ST);
+ bool GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain, StoreSDNode *ST);
/// Modifies a vector input (widen or narrows) to a vector of NVT. The
/// input vector must have the same element type as NVT.
@@ -1011,6 +1022,7 @@ private:
// Generic Result Splitting.
void SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
SDValue &Lo, SDValue &Hi);
+ void SplitRes_ARITH_FENCE (SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitRes_SELECT (SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitRes_SELECT_CC (SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitRes_UNDEF (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 81cc2bf10d25..3d3c9a2ad837 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -571,3 +571,13 @@ void DAGTypeLegalizer::SplitRes_FREEZE(SDNode *N, SDValue &Lo, SDValue &Hi) {
Lo = DAG.getNode(ISD::FREEZE, dl, L.getValueType(), L);
Hi = DAG.getNode(ISD::FREEZE, dl, H.getValueType(), H);
}
+
+void DAGTypeLegalizer::SplitRes_ARITH_FENCE(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue L, H;
+ SDLoc DL(N);
+ GetSplitOp(N->getOperand(0), L, H);
+
+ Lo = DAG.getNode(ISD::ARITH_FENCE, DL, L.getValueType(), L);
+ Hi = DAG.getNode(ISD::ARITH_FENCE, DL, H.getValueType(), H);
+}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index ebe3bfc4b75a..88a28a3be53e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -538,8 +538,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
return RecursivelyLegalizeResults(Op, ResultVals);
}
-// FIXME: This is very similar to the X86 override of
-// TargetLowering::LowerOperationWrapper. Can we merge them somehow?
+// FIXME: This is very similar to TargetLowering::LowerOperationWrapper. Can we
+// merge them somehow?
bool VectorLegalizer::LowerOperationWrapper(SDNode *Node,
SmallVectorImpl<SDValue> &Results) {
SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
@@ -774,8 +774,8 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
ExpandSETCC(Node, Results);
return;
case ISD::ABS:
- if (TLI.expandABS(Node, Tmp, DAG)) {
- Results.push_back(Tmp);
+ if (SDValue Expanded = TLI.expandABS(Node, DAG)) {
+ Results.push_back(Expanded);
return;
}
break;
@@ -783,22 +783,22 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
ExpandBITREVERSE(Node, Results);
return;
case ISD::CTPOP:
- if (TLI.expandCTPOP(Node, Tmp, DAG)) {
- Results.push_back(Tmp);
+ if (SDValue Expanded = TLI.expandCTPOP(Node, DAG)) {
+ Results.push_back(Expanded);
return;
}
break;
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
- if (TLI.expandCTLZ(Node, Tmp, DAG)) {
- Results.push_back(Tmp);
+ if (SDValue Expanded = TLI.expandCTLZ(Node, DAG)) {
+ Results.push_back(Expanded);
return;
}
break;
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
- if (TLI.expandCTTZ(Node, Tmp, DAG)) {
- Results.push_back(Tmp);
+ if (SDValue Expanded = TLI.expandCTTZ(Node, DAG)) {
+ Results.push_back(Expanded);
return;
}
break;
@@ -943,10 +943,8 @@ SDValue VectorLegalizer::ExpandSELECT(SDNode *Node) {
// What is the size of each element in the vector mask.
EVT BitTy = MaskTy.getScalarType();
- Mask = DAG.getSelect(DL, BitTy, Mask,
- DAG.getConstant(APInt::getAllOnesValue(BitTy.getSizeInBits()), DL,
- BitTy),
- DAG.getConstant(0, DL, BitTy));
+ Mask = DAG.getSelect(DL, BitTy, Mask, DAG.getAllOnesConstant(DL, BitTy),
+ DAG.getConstant(0, DL, BitTy));
// Broadcast the mask so that the entire vector is all one or all zero.
if (VT.isFixedLengthVector())
@@ -960,9 +958,7 @@ SDValue VectorLegalizer::ExpandSELECT(SDNode *Node) {
Op1 = DAG.getNode(ISD::BITCAST, DL, MaskTy, Op1);
Op2 = DAG.getNode(ISD::BITCAST, DL, MaskTy, Op2);
- SDValue AllOnes = DAG.getConstant(
- APInt::getAllOnesValue(BitTy.getSizeInBits()), DL, MaskTy);
- SDValue NotMask = DAG.getNode(ISD::XOR, DL, MaskTy, Mask, AllOnes);
+ SDValue NotMask = DAG.getNOT(DL, Mask, MaskTy);
Op1 = DAG.getNode(ISD::AND, DL, MaskTy, Op1, Mask);
Op2 = DAG.getNode(ISD::AND, DL, MaskTy, Op2, NotMask);
@@ -1099,25 +1095,45 @@ static void createBSWAPShuffleMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) {
SDValue VectorLegalizer::ExpandBSWAP(SDNode *Node) {
EVT VT = Node->getValueType(0);
+ // Scalable vectors can't use shuffle expansion.
+ if (VT.isScalableVector())
+ return TLI.expandBSWAP(Node, DAG);
+
// Generate a byte wise shuffle mask for the BSWAP.
SmallVector<int, 16> ShuffleMask;
createBSWAPShuffleMask(VT, ShuffleMask);
EVT ByteVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, ShuffleMask.size());
// Only emit a shuffle if the mask is legal.
- if (!TLI.isShuffleMaskLegal(ShuffleMask, ByteVT))
- return DAG.UnrollVectorOp(Node);
+ if (TLI.isShuffleMaskLegal(ShuffleMask, ByteVT)) {
+ SDLoc DL(Node);
+ SDValue Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Node->getOperand(0));
+ Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getUNDEF(ByteVT), ShuffleMask);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+ }
- SDLoc DL(Node);
- SDValue Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Node->getOperand(0));
- Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getUNDEF(ByteVT), ShuffleMask);
- return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+ // If we have the appropriate vector bit operations, it is better to use them
+ // than unrolling and expanding each component.
+ if (TLI.isOperationLegalOrCustom(ISD::SHL, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::SRL, VT) &&
+ TLI.isOperationLegalOrCustomOrPromote(ISD::AND, VT) &&
+ TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT))
+ return TLI.expandBSWAP(Node, DAG);
+
+ // Otherwise unroll.
+ return DAG.UnrollVectorOp(Node);
}
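
The shuffle path above treats the vector as bytes and reverses the bytes within each element, so BSWAP becomes a single byte shuffle when the mask is legal. A sketch of how such a mask can be built for a given element size in bytes (this mirrors what createBSWAPShuffleMask produces; names and types are assumptions):

    #include <vector>

    // Build a byte-wise shuffle mask that byte-swaps every element of a
    // vector with numElts elements of eltBytes bytes each: within each
    // element the byte order is reversed, the elements themselves stay put.
    std::vector<int> bswapShuffleMask(unsigned numElts, unsigned eltBytes) {
      std::vector<int> mask;
      for (unsigned e = 0; e != numElts; ++e)
        for (unsigned b = 0; b != eltBytes; ++b)
          mask.push_back(e * eltBytes + (eltBytes - 1 - b));
      return mask;
    }
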
void VectorLegalizer::ExpandBITREVERSE(SDNode *Node,
SmallVectorImpl<SDValue> &Results) {
EVT VT = Node->getValueType(0);
+ // We can't unroll or use shuffles for scalable vectors.
+ if (VT.isScalableVector()) {
+ Results.push_back(TLI.expandBITREVERSE(Node, DAG));
+ return;
+ }
+
// If we have the scalar operation, it's probably cheaper to unroll it.
if (TLI.isOperationLegalOrCustom(ISD::BITREVERSE, VT.getScalarType())) {
SDValue Tmp = DAG.UnrollVectorOp(Node);
@@ -1156,9 +1172,10 @@ void VectorLegalizer::ExpandBITREVERSE(SDNode *Node,
if (TLI.isOperationLegalOrCustom(ISD::SHL, VT) &&
TLI.isOperationLegalOrCustom(ISD::SRL, VT) &&
TLI.isOperationLegalOrCustomOrPromote(ISD::AND, VT) &&
- TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT))
- // Let LegalizeDAG handle this later.
+ TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT)) {
+ Results.push_back(TLI.expandBITREVERSE(Node, DAG));
return;
+ }
// Otherwise unroll.
SDValue Tmp = DAG.UnrollVectorOp(Node);
@@ -1207,9 +1224,7 @@ SDValue VectorLegalizer::ExpandVSELECT(SDNode *Node) {
Op1 = DAG.getNode(ISD::BITCAST, DL, VT, Op1);
Op2 = DAG.getNode(ISD::BITCAST, DL, VT, Op2);
- SDValue AllOnes = DAG.getConstant(
- APInt::getAllOnesValue(VT.getScalarSizeInBits()), DL, VT);
- SDValue NotMask = DAG.getNode(ISD::XOR, DL, VT, Mask, AllOnes);
+ SDValue NotMask = DAG.getNOT(DL, Mask, VT);
Op1 = DAG.getNode(ISD::AND, DL, VT, Op1, Mask);
Op2 = DAG.getNode(ISD::AND, DL, VT, Op2, NotMask);
@@ -1502,9 +1517,8 @@ void VectorLegalizer::UnrollStrictFPOp(SDNode *Node,
if (Node->getOpcode() == ISD::STRICT_FSETCC ||
Node->getOpcode() == ISD::STRICT_FSETCCS)
ScalarResult = DAG.getSelect(dl, EltVT, ScalarResult,
- DAG.getConstant(APInt::getAllOnesValue
- (EltVT.getSizeInBits()), dl, EltVT),
- DAG.getConstant(0, dl, EltVT));
+ DAG.getAllOnesConstant(dl, EltVT),
+ DAG.getConstant(0, dl, EltVT));
OpValues.push_back(ScalarResult);
OpChains.push_back(ScalarChain);
@@ -1536,9 +1550,7 @@ SDValue VectorLegalizer::UnrollVSETCC(SDNode *Node) {
TLI.getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), TmpEltVT),
LHSElem, RHSElem, CC);
- Ops[i] = DAG.getSelect(dl, EltVT, Ops[i],
- DAG.getConstant(APInt::getAllOnesValue
- (EltVT.getSizeInBits()), dl, EltVT),
+ Ops[i] = DAG.getSelect(dl, EltVT, Ops[i], DAG.getAllOnesConstant(dl, EltVT),
DAG.getConstant(0, dl, EltVT));
}
return DAG.getBuildVector(VT, dl, Ops);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 91242bbf866f..539c9cb9c256 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -529,7 +529,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N) {
SDValue Arg = N->getOperand(2).getOperand(0);
if (Arg.isUndef())
return DAG.getUNDEF(N->getValueType(0).getVectorElementType());
- unsigned Op = !cast<ConstantSDNode>(Arg)->isNullValue();
+ unsigned Op = !cast<ConstantSDNode>(Arg)->isZero();
return GetScalarizedVector(N->getOperand(Op));
}
@@ -1045,7 +1045,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::USHLSAT:
case ISD::ROTL:
case ISD::ROTR:
- SplitVecRes_BinOp(N, Lo, Hi);
+ SplitVecRes_BinOp(N, Lo, Hi, /*IsVP*/ false);
break;
case ISD::FMA:
case ISD::FSHL:
@@ -1082,6 +1082,26 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::UDIVFIXSAT:
SplitVecRes_FIX(N, Lo, Hi);
break;
+ case ISD::VP_ADD:
+ case ISD::VP_AND:
+ case ISD::VP_MUL:
+ case ISD::VP_OR:
+ case ISD::VP_SUB:
+ case ISD::VP_XOR:
+ case ISD::VP_SHL:
+ case ISD::VP_LSHR:
+ case ISD::VP_ASHR:
+ case ISD::VP_SDIV:
+ case ISD::VP_UDIV:
+ case ISD::VP_SREM:
+ case ISD::VP_UREM:
+ case ISD::VP_FADD:
+ case ISD::VP_FSUB:
+ case ISD::VP_FMUL:
+ case ISD::VP_FDIV:
+ case ISD::VP_FREM:
+ SplitVecRes_BinOp(N, Lo, Hi, /*IsVP*/ true);
+ break;
}
// If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -1113,8 +1133,8 @@ void DAGTypeLegalizer::IncrementPointer(MemSDNode *N, EVT MemVT,
}
}
-void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo,
- SDValue &Hi) {
+void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi,
+ bool IsVP) {
SDValue LHSLo, LHSHi;
GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
SDValue RHSLo, RHSHi;
@@ -1123,8 +1143,41 @@ void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo,
const SDNodeFlags Flags = N->getFlags();
unsigned Opcode = N->getOpcode();
- Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Flags);
- Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Flags);
+ if (!IsVP) {
+ Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Flags);
+ Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Flags);
+ return;
+ }
+
+ // Split the mask.
+ SDValue MaskLo, MaskHi;
+ SDValue Mask = N->getOperand(2);
+ EVT MaskVT = Mask.getValueType();
+ if (getTypeAction(MaskVT) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, SDLoc(Mask));
+
+ // Split the vector length parameter.
+ // %evl -> umin(%evl, %halfnumelts) and usubsat(%evl, %halfnumelts).
+ SDValue EVL = N->getOperand(3);
+ EVT VecVT = N->getValueType(0);
+ EVT EVLVT = EVL.getValueType();
+ assert(VecVT.getVectorElementCount().isKnownEven() &&
+ "Expecting the mask to be an evenly-sized vector");
+ unsigned HalfMinNumElts = VecVT.getVectorMinNumElements() / 2;
+ SDValue HalfNumElts =
+ VecVT.isFixedLengthVector()
+ ? DAG.getConstant(HalfMinNumElts, dl, EVLVT)
+ : DAG.getVScale(dl, EVLVT,
+ APInt(EVLVT.getScalarSizeInBits(), HalfMinNumElts));
+ SDValue EVLLo = DAG.getNode(ISD::UMIN, dl, EVLVT, EVL, HalfNumElts);
+ SDValue EVLHi = DAG.getNode(ISD::USUBSAT, dl, EVLVT, EVL, HalfNumElts);
+
+ Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(),
+ {LHSLo, RHSLo, MaskLo, EVLLo}, Flags);
+ Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(),
+ {LHSHi, RHSHi, MaskHi, EVLHi}, Flags);
}
void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo,
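The EVL split above maps the single explicit vector length onto the two halves: the low half gets umin(%evl, %halfnumelts) and the high half gets the saturating remainder. A standalone sketch of that arithmetic with plain unsigned values (illustrative only):

#include <algorithm>
#include <cstdio>

// Split an explicit vector length EVL across two halves of Half lanes each:
// lo = umin(EVL, Half), hi = usubsat(EVL, Half).
static void splitEVL(unsigned EVL, unsigned Half, unsigned &Lo, unsigned &Hi) {
  Lo = std::min(EVL, Half);
  Hi = EVL > Half ? EVL - Half : 0; // unsigned saturating subtraction
}

int main() {
  unsigned Lo, Hi;
  splitEVL(5, 4, Lo, Hi); // e.g. a v8i32 VP op with EVL=5 split into two v4i32
  std::printf("lo=%u hi=%u\n", Lo, Hi); // lo=4 hi=1
  return 0;
}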
@@ -2985,6 +3038,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::BITCAST: Res = WidenVecRes_BITCAST(N); break;
case ISD::BUILD_VECTOR: Res = WidenVecRes_BUILD_VECTOR(N); break;
case ISD::CONCAT_VECTORS: Res = WidenVecRes_CONCAT_VECTORS(N); break;
+ case ISD::INSERT_SUBVECTOR:
+ Res = WidenVecRes_INSERT_SUBVECTOR(N);
+ break;
case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break;
case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break;
case ISD::LOAD: Res = WidenVecRes_LOAD(N); break;
@@ -3035,7 +3091,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::USHLSAT:
case ISD::ROTL:
case ISD::ROTR:
- Res = WidenVecRes_Binary(N);
+ Res = WidenVecRes_Binary(N, /*IsVP*/ false);
break;
case ISD::FADD:
@@ -3159,6 +3215,31 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FSHR:
Res = WidenVecRes_Ternary(N);
break;
+ case ISD::VP_ADD:
+ case ISD::VP_AND:
+ case ISD::VP_MUL:
+ case ISD::VP_OR:
+ case ISD::VP_SUB:
+ case ISD::VP_XOR:
+ case ISD::VP_SHL:
+ case ISD::VP_LSHR:
+ case ISD::VP_ASHR:
+ case ISD::VP_SDIV:
+ case ISD::VP_UDIV:
+ case ISD::VP_SREM:
+ case ISD::VP_UREM:
+ case ISD::VP_FADD:
+ case ISD::VP_FSUB:
+ case ISD::VP_FMUL:
+ case ISD::VP_FDIV:
+ case ISD::VP_FREM:
+ // Vector-predicated binary op widening. Note that -- unlike the
+ // unpredicated versions -- we don't have to worry about trapping on
+ // operations like UDIV, FADD, etc., as we pass on the original vector
+ // length parameter. This means the widened elements containing garbage
+ // aren't active.
+ Res = WidenVecRes_Binary(N, /*IsVP*/ true);
+ break;
}
// If Res is null, the sub-method took care of registering the result.
@@ -3176,13 +3257,31 @@ SDValue DAGTypeLegalizer::WidenVecRes_Ternary(SDNode *N) {
return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3);
}
-SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
+SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N, bool IsVP) {
// Binary op widening.
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
- return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, N->getFlags());
+ if (!IsVP)
+ return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2,
+ N->getFlags());
+ // For VP operations, we must also widen the mask. Note that the mask type
+ // may not actually need widening, leading it to be split along with the VP
+ // operation.
+ // FIXME: This could lead to an infinite split/widen loop. We only handle the
+ // case where the mask needs widening to an identically-sized type as the
+ // vector inputs.
+ SDValue Mask = N->getOperand(2);
+ assert(getTypeAction(Mask.getValueType()) ==
+ TargetLowering::TypeWidenVector &&
+ "Unable to widen binary VP op");
+ Mask = GetWidenedVector(Mask);
+ assert(Mask.getValueType().getVectorElementCount() ==
+ WidenVT.getVectorElementCount() &&
+ "Unable to widen binary VP op");
+ return DAG.getNode(N->getOpcode(), dl, WidenVT,
+ {InOp1, InOp2, Mask, N->getOperand(3)}, N->getFlags());
}
SDValue DAGTypeLegalizer::WidenVecRes_BinaryWithExtraScalarOp(SDNode *N) {
@@ -3527,7 +3626,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
SDLoc DL(N);
EVT WidenVT = TLI.getTypeToTransformTo(Ctx, N->getValueType(0));
- unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ ElementCount WidenEC = WidenVT.getVectorElementCount();
EVT InVT = InOp.getValueType();
@@ -3547,14 +3646,14 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
}
EVT InEltVT = InVT.getVectorElementType();
- EVT InWidenVT = EVT::getVectorVT(Ctx, InEltVT, WidenNumElts);
- unsigned InVTNumElts = InVT.getVectorNumElements();
+ EVT InWidenVT = EVT::getVectorVT(Ctx, InEltVT, WidenEC);
+ ElementCount InVTEC = InVT.getVectorElementCount();
if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
InOp = GetWidenedVector(N->getOperand(0));
InVT = InOp.getValueType();
- InVTNumElts = InVT.getVectorNumElements();
- if (InVTNumElts == WidenNumElts) {
+ InVTEC = InVT.getVectorElementCount();
+ if (InVTEC == WidenEC) {
if (N->getNumOperands() == 1)
return DAG.getNode(Opcode, DL, WidenVT, InOp);
return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1), Flags);
@@ -3578,9 +3677,10 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
// it an illegal type that might lead to repeatedly splitting the input
// and then widening it. To avoid this, we widen the input only if
// it results in a legal type.
- if (WidenNumElts % InVTNumElts == 0) {
+ if (WidenEC.isKnownMultipleOf(InVTEC.getKnownMinValue())) {
// Widen the input and call convert on the widened input vector.
- unsigned NumConcat = WidenNumElts/InVTNumElts;
+ unsigned NumConcat =
+ WidenEC.getKnownMinValue() / InVTEC.getKnownMinValue();
SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
Ops[0] = InOp;
SDValue InVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InWidenVT, Ops);
@@ -3589,7 +3689,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
return DAG.getNode(Opcode, DL, WidenVT, InVec, N->getOperand(1), Flags);
}
- if (InVTNumElts % WidenNumElts == 0) {
+ if (InVTEC.isKnownMultipleOf(WidenEC.getKnownMinValue())) {
SDValue InVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InWidenVT, InOp,
DAG.getVectorIdxConstant(0, DL));
// Extract the input and convert the shorten input vector.
@@ -3601,7 +3701,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
// Otherwise unroll into some nasty scalar code and rebuild the vector.
EVT EltVT = WidenVT.getVectorElementType();
- SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
+ SmallVector<SDValue, 16> Ops(WidenEC.getFixedValue(), DAG.getUNDEF(EltVT));
// Use the original element count so we don't do more scalar opts than
// necessary.
unsigned MinElts = N->getValueType(0).getVectorNumElements();
@@ -3962,14 +4062,26 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
return DAG.getBuildVector(WidenVT, dl, Ops);
}
+SDValue DAGTypeLegalizer::WidenVecRes_INSERT_SUBVECTOR(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = N->getOperand(1);
+ SDValue Idx = N->getOperand(2);
+ SDLoc dl(N);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WidenVT, InOp1, InOp2, Idx);
+}
+
SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
EVT VT = N->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
SDValue InOp = N->getOperand(0);
SDValue Idx = N->getOperand(1);
SDLoc dl(N);
- if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
+ auto InOpTypeAction = getTypeAction(InOp.getValueType());
+ if (InOpTypeAction == TargetLowering::TypeWidenVector)
InOp = GetWidenedVector(InOp);
EVT InVT = InOp.getValueType();
@@ -3979,20 +4091,49 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
if (IdxVal == 0 && InVT == WidenVT)
return InOp;
- if (VT.isScalableVector())
- report_fatal_error("Don't know how to widen the result of "
- "EXTRACT_SUBVECTOR for scalable vectors");
-
// Check if we can extract from the vector.
- unsigned WidenNumElts = WidenVT.getVectorNumElements();
- unsigned InNumElts = InVT.getVectorNumElements();
+ unsigned WidenNumElts = WidenVT.getVectorMinNumElements();
+ unsigned InNumElts = InVT.getVectorMinNumElements();
if (IdxVal % WidenNumElts == 0 && IdxVal + WidenNumElts < InNumElts)
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, WidenVT, InOp, Idx);
+ if (VT.isScalableVector()) {
+ // Try to split the operation up into smaller extracts and concat the
+ // results together, e.g.
+ // nxv6i64 extract_subvector(nxv12i64, 6)
+ // <->
+ // nxv8i64 concat(
+ // nxv2i64 extract_subvector(nxv12i64, 6)
+ // nxv2i64 extract_subvector(nxv12i64, 8)
+ // nxv2i64 extract_subvector(nxv12i64, 10)
+ // undef)
+ unsigned VTNElts = VT.getVectorMinNumElements();
+ unsigned GCD = greatestCommonDivisor(VTNElts, WidenNumElts);
+ assert((IdxVal % GCD) == 0 && "Expected Idx to be a multiple of the broken "
+ "down type's element count");
+ EVT PartVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+ ElementCount::getScalable(GCD));
+ // Avoid recursion around e.g. nxv1i8.
+ if (getTypeAction(PartVT) != TargetLowering::TypeWidenVector) {
+ SmallVector<SDValue> Parts;
+ unsigned I = 0;
+ for (; I < VTNElts / GCD; ++I)
+ Parts.push_back(
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, PartVT, InOp,
+ DAG.getVectorIdxConstant(IdxVal + I * GCD, dl)));
+ for (; I < WidenNumElts / GCD; ++I)
+ Parts.push_back(DAG.getUNDEF(PartVT));
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Parts);
+ }
+
+ report_fatal_error("Don't know how to widen the result of "
+ "EXTRACT_SUBVECTOR for scalable vectors");
+ }
+
// We could try widening the input to the right length but for now, extract
// the original elements, fill the rest with undefs and build a vector.
SmallVector<SDValue, 16> Ops(WidenNumElts);
- EVT EltVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
unsigned i;
for (i = 0; i < NumElts; ++i)
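The scalable EXTRACT_SUBVECTOR widening above chops the extract into GCD-sized pieces and pads with undef up to the widened element count, as in the nxv6i64-from-nxv12i64 example in the comment. A standalone sketch of that index arithmetic, numbers only:

#include <cstdio>
#include <numeric>

// For a result with VTNElts (min) elements widened to WidenNumElts, starting
// at IdxVal, list the GCD-sized sub-extracts plus trailing undef parts.
static void planExtractParts(unsigned VTNElts, unsigned WidenNumElts,
                             unsigned IdxVal) {
  unsigned GCD = std::gcd(VTNElts, WidenNumElts);
  for (unsigned I = 0; I < VTNElts / GCD; ++I)
    std::printf("extract %u elts at index %u\n", GCD, IdxVal + I * GCD);
  for (unsigned I = VTNElts / GCD; I < WidenNumElts / GCD; ++I)
    std::printf("undef part of %u elts\n", GCD);
}

int main() {
  planExtractParts(6, 8, 6); // three 2-element extracts at 6, 8, 10, then undef
  return 0;
}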
@@ -4037,20 +4178,55 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
else
Result = GenWidenVectorLoads(LdChain, LD);
- // If we generate a single load, we can use that for the chain. Otherwise,
- // build a factor node to remember the multiple loads are independent and
- // chain to that.
- SDValue NewChain;
- if (LdChain.size() == 1)
- NewChain = LdChain[0];
- else
- NewChain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, LdChain);
+ if (Result) {
+ // If we generate a single load, we can use that for the chain. Otherwise,
+ // build a factor node to remember the multiple loads are independent and
+ // chain to that.
+ SDValue NewChain;
+ if (LdChain.size() == 1)
+ NewChain = LdChain[0];
+ else
+ NewChain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, LdChain);
- // Modified the chain - switch anything that used the old chain to use
- // the new one.
- ReplaceValueWith(SDValue(N, 1), NewChain);
+ // Modified the chain - switch anything that used the old chain to use
+ // the new one.
+ ReplaceValueWith(SDValue(N, 1), NewChain);
- return Result;
+ return Result;
+ }
+
+ // Generate a vector-predicated load if it is custom/legal on the target. To
+ // avoid possible recursion, only do this if the widened mask type is legal.
+ // FIXME: Not all targets may support EVL in VP_LOAD. These will have been
+ // removed from the IR by the ExpandVectorPredication pass but we're
+ // reintroducing them here.
+ EVT LdVT = LD->getMemoryVT();
+ EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), LdVT);
+ EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ WideVT.getVectorElementCount());
+ if (ExtType == ISD::NON_EXTLOAD && WideVT.isScalableVector() &&
+ TLI.isOperationLegalOrCustom(ISD::VP_LOAD, WideVT) &&
+ TLI.isTypeLegal(WideMaskVT)) {
+ SDLoc DL(N);
+ SDValue Mask = DAG.getAllOnesConstant(DL, WideMaskVT);
+ MVT EVLVT = TLI.getVPExplicitVectorLengthTy();
+ unsigned NumVTElts = LdVT.getVectorMinNumElements();
+ SDValue EVL =
+ DAG.getVScale(DL, EVLVT, APInt(EVLVT.getScalarSizeInBits(), NumVTElts));
+ const auto *MMO = LD->getMemOperand();
+ SDValue NewLoad =
+ DAG.getLoadVP(WideVT, DL, LD->getChain(), LD->getBasePtr(), Mask, EVL,
+ MMO->getPointerInfo(), MMO->getAlign(), MMO->getFlags(),
+ MMO->getAAInfo());
+
+ // Modified the chain - switch anything that used the old chain to use
+ // the new one.
+ ReplaceValueWith(SDValue(N, 1), NewLoad.getValue(1));
+
+ return NewLoad;
+ }
+
+ report_fatal_error("Unable to widen vector load");
}
SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
@@ -4351,7 +4527,7 @@ SDValue DAGTypeLegalizer::WidenVSELECTMask(SDNode *N) {
SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
- unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ ElementCount WidenEC = WidenVT.getVectorElementCount();
SDValue Cond1 = N->getOperand(0);
EVT CondVT = Cond1.getValueType();
@@ -4365,8 +4541,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
}
EVT CondEltVT = CondVT.getVectorElementType();
- EVT CondWidenVT = EVT::getVectorVT(*DAG.getContext(),
- CondEltVT, WidenNumElts);
+ EVT CondWidenVT = EVT::getVectorVT(*DAG.getContext(), CondEltVT, WidenEC);
if (getTypeAction(CondVT) == TargetLowering::TypeWidenVector)
Cond1 = GetWidenedVector(Cond1);
@@ -4891,12 +5066,42 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
return TLI.scalarizeVectorStore(ST, DAG);
SmallVector<SDValue, 16> StChain;
- GenWidenVectorStores(StChain, ST);
+ if (GenWidenVectorStores(StChain, ST)) {
+ if (StChain.size() == 1)
+ return StChain[0];
- if (StChain.size() == 1)
- return StChain[0];
- else
return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain);
+ }
+
+ // Generate a vector-predicated store if it is custom/legal on the target.
+ // To avoid possible recursion, only do this if the widened mask type is
+ // legal.
+ // FIXME: Not all targets may support EVL in VP_STORE. These will have been
+ // removed from the IR by the ExpandVectorPredication pass but we're
+ // reintroducing them here.
+ SDValue StVal = ST->getValue();
+ EVT StVT = StVal.getValueType();
+ EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StVT);
+ EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ WideVT.getVectorElementCount());
+ if (WideVT.isScalableVector() &&
+ TLI.isOperationLegalOrCustom(ISD::VP_STORE, WideVT) &&
+ TLI.isTypeLegal(WideMaskVT)) {
+ // Widen the value.
+ SDLoc DL(N);
+ StVal = GetWidenedVector(StVal);
+ SDValue Mask = DAG.getAllOnesConstant(DL, WideMaskVT);
+ MVT EVLVT = TLI.getVPExplicitVectorLengthTy();
+ unsigned NumVTElts = StVT.getVectorMinNumElements();
+ SDValue EVL =
+ DAG.getVScale(DL, EVLVT, APInt(EVLVT.getScalarSizeInBits(), NumVTElts));
+ const auto *MMO = ST->getMemOperand();
+ return DAG.getStoreVP(ST->getChain(), DL, StVal, ST->getBasePtr(), Mask,
+ EVL, MMO->getPointerInfo(), MMO->getAlign(),
+ MMO->getFlags(), MMO->getAAInfo());
+ }
+
+ report_fatal_error("Unable to widen vector store");
}
SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) {
@@ -5147,9 +5352,9 @@ SDValue DAGTypeLegalizer::WidenVecOp_VSELECT(SDNode *N) {
// Align: If 0, don't allow use of a wider type
// WidenEx: If Align is not 0, the amount additional we can load/store from.
-static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
- unsigned Width, EVT WidenVT,
- unsigned Align = 0, unsigned WidenEx = 0) {
+static Optional<EVT> findMemType(SelectionDAG &DAG, const TargetLowering &TLI,
+ unsigned Width, EVT WidenVT,
+ unsigned Align = 0, unsigned WidenEx = 0) {
EVT WidenEltVT = WidenVT.getVectorElementType();
const bool Scalable = WidenVT.isScalableVector();
unsigned WidenWidth = WidenVT.getSizeInBits().getKnownMinSize();
@@ -5204,9 +5409,11 @@ static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
}
}
+ // Using element-wise loads and stores for widening operations is not
+ // supported for scalable vectors.
if (Scalable)
- report_fatal_error("Using element-wise loads and stores for widening "
- "operations is not supported for scalable vectors");
+ return None;
+
return RetVT;
}
@@ -5266,32 +5473,63 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
TypeSize WidthDiff = WidenWidth - LdWidth;
// Allow wider loads if they are sufficiently aligned to avoid memory faults
// and if the original load is simple.
- unsigned LdAlign = (!LD->isSimple()) ? 0 : LD->getAlignment();
+ unsigned LdAlign =
+ (!LD->isSimple() || LdVT.isScalableVector()) ? 0 : LD->getAlignment();
// Find the vector type that can load from.
- EVT NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign,
- WidthDiff.getKnownMinSize());
- TypeSize NewVTWidth = NewVT.getSizeInBits();
- SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(),
+ Optional<EVT> FirstVT =
+ findMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign,
+ WidthDiff.getKnownMinSize());
+
+ if (!FirstVT)
+ return SDValue();
+
+ SmallVector<EVT, 8> MemVTs;
+ TypeSize FirstVTWidth = FirstVT->getSizeInBits();
+
+ // Unless we're able to load in one instruction, we must work out how to load
+ // the remainder.
+ if (!TypeSize::isKnownLE(LdWidth, FirstVTWidth)) {
+ Optional<EVT> NewVT = FirstVT;
+ TypeSize RemainingWidth = LdWidth;
+ TypeSize NewVTWidth = FirstVTWidth;
+ do {
+ RemainingWidth -= NewVTWidth;
+ if (TypeSize::isKnownLT(RemainingWidth, NewVTWidth)) {
+ // The current type we are using is too large. Find a better size.
+ NewVT = findMemType(DAG, TLI, RemainingWidth.getKnownMinSize(), WidenVT,
+ LdAlign, WidthDiff.getKnownMinSize());
+ if (!NewVT)
+ return SDValue();
+ NewVTWidth = NewVT->getSizeInBits();
+ }
+ MemVTs.push_back(*NewVT);
+ } while (TypeSize::isKnownGT(RemainingWidth, NewVTWidth));
+ }
+
+ SDValue LdOp = DAG.getLoad(*FirstVT, dl, Chain, BasePtr, LD->getPointerInfo(),
LD->getOriginalAlign(), MMOFlags, AAInfo);
LdChain.push_back(LdOp.getValue(1));
// Check if we can load the element with one instruction.
- if (TypeSize::isKnownLE(LdWidth, NewVTWidth)) {
- if (!NewVT.isVector()) {
- unsigned NumElts = WidenWidth.getFixedSize() / NewVTWidth.getFixedSize();
- EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
+ if (MemVTs.empty()) {
+ assert(TypeSize::isKnownLE(LdWidth, FirstVTWidth));
+ if (!FirstVT->isVector()) {
+ unsigned NumElts =
+ WidenWidth.getFixedSize() / FirstVTWidth.getFixedSize();
+ EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), *FirstVT, NumElts);
SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp);
return DAG.getNode(ISD::BITCAST, dl, WidenVT, VecOp);
}
- if (NewVT == WidenVT)
+ if (FirstVT == WidenVT)
return LdOp;
// TODO: We don't currently have any tests that exercise this code path.
- assert(WidenWidth.getFixedSize() % NewVTWidth.getFixedSize() == 0);
- unsigned NumConcat = WidenWidth.getFixedSize() / NewVTWidth.getFixedSize();
+ assert(WidenWidth.getFixedSize() % FirstVTWidth.getFixedSize() == 0);
+ unsigned NumConcat =
+ WidenWidth.getFixedSize() / FirstVTWidth.getFixedSize();
SmallVector<SDValue, 16> ConcatOps(NumConcat);
- SDValue UndefVal = DAG.getUNDEF(NewVT);
+ SDValue UndefVal = DAG.getUNDEF(*FirstVT);
ConcatOps[0] = LdOp;
for (unsigned i = 1; i != NumConcat; ++i)
ConcatOps[i] = UndefVal;
@@ -5304,28 +5542,22 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
uint64_t ScaledOffset = 0;
MachinePointerInfo MPI = LD->getPointerInfo();
- do {
- LdWidth -= NewVTWidth;
- IncrementPointer(cast<LoadSDNode>(LdOp), NewVT, MPI, BasePtr,
- &ScaledOffset);
-
- if (TypeSize::isKnownLT(LdWidth, NewVTWidth)) {
- // The current type we are using is too large. Find a better size.
- NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign,
- WidthDiff.getKnownMinSize());
- NewVTWidth = NewVT.getSizeInBits();
- }
+ // First increment past the first load.
+ IncrementPointer(cast<LoadSDNode>(LdOp), *FirstVT, MPI, BasePtr,
+ &ScaledOffset);
+
+ for (EVT MemVT : MemVTs) {
Align NewAlign = ScaledOffset == 0
? LD->getOriginalAlign()
: commonAlignment(LD->getAlign(), ScaledOffset);
SDValue L =
- DAG.getLoad(NewVT, dl, Chain, BasePtr, MPI, NewAlign, MMOFlags, AAInfo);
- LdChain.push_back(L.getValue(1));
+ DAG.getLoad(MemVT, dl, Chain, BasePtr, MPI, NewAlign, MMOFlags, AAInfo);
LdOps.push_back(L);
- LdOp = L;
- } while (TypeSize::isKnownGT(LdWidth, NewVTWidth));
+ LdChain.push_back(L.getValue(1));
+ IncrementPointer(cast<LoadSDNode>(L), MemVT, MPI, BasePtr, &ScaledOffset);
+ }
// Build the vector from the load operations.
unsigned End = LdOps.size();
@@ -5447,7 +5679,7 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
return DAG.getBuildVector(WidenVT, dl, Ops);
}
-void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
+bool DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
StoreSDNode *ST) {
// The strategy assumes that we can efficiently store power-of-two widths.
// The routine chops the vector into the largest vector stores with the same
@@ -5473,9 +5705,30 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
MachinePointerInfo MPI = ST->getPointerInfo();
uint64_t ScaledOffset = 0;
+
+ // A breakdown of how to widen this vector store. Each element of the vector
+ // is a memory VT combined with the number of times it is to be stored,
+ // e.g., v5i32 -> {{v2i32,2},{i32,1}}
+ SmallVector<std::pair<EVT, unsigned>, 4> MemVTs;
+
while (StWidth.isNonZero()) {
// Find the largest vector type we can store with.
- EVT NewVT = FindMemType(DAG, TLI, StWidth.getKnownMinSize(), ValVT);
+ Optional<EVT> NewVT =
+ findMemType(DAG, TLI, StWidth.getKnownMinSize(), ValVT);
+ if (!NewVT)
+ return false;
+ MemVTs.push_back({*NewVT, 0});
+ TypeSize NewVTWidth = NewVT->getSizeInBits();
+
+ do {
+ StWidth -= NewVTWidth;
+ MemVTs.back().second++;
+ } while (StWidth.isNonZero() && TypeSize::isKnownGE(StWidth, NewVTWidth));
+ }
+
+ for (const auto &Pair : MemVTs) {
+ EVT NewVT = Pair.first;
+ unsigned Count = Pair.second;
TypeSize NewVTWidth = NewVT.getSizeInBits();
if (NewVT.isVector()) {
@@ -5490,12 +5743,10 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
MMOFlags, AAInfo);
StChain.push_back(PartStore);
- StWidth -= NewVTWidth;
Idx += NumVTElts;
-
IncrementPointer(cast<StoreSDNode>(PartStore), NewVT, MPI, BasePtr,
&ScaledOffset);
- } while (StWidth.isNonZero() && TypeSize::isKnownGE(StWidth, NewVTWidth));
+ } while (--Count);
} else {
// Cast the vector to the scalar type we can store.
unsigned NumElts = ValWidth.getFixedSize() / NewVTWidth.getFixedSize();
@@ -5511,13 +5762,14 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
MMOFlags, AAInfo);
StChain.push_back(PartStore);
- StWidth -= NewVTWidth;
IncrementPointer(cast<StoreSDNode>(PartStore), NewVT, MPI, BasePtr);
- } while (StWidth.isNonZero() && TypeSize::isKnownGE(StWidth, NewVTWidth));
+ } while (--Count);
// Restore index back to be relative to the original widen element type.
Idx = Idx * NewVTWidth.getFixedSize() / ValEltWidth;
}
}
+
+ return true;
}
/// Modifies a vector input (widen or narrows) to a vector of NVT. The
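GenWidenVectorStores now precomputes its breakdown as (memory VT, repeat count) pairs before emitting any stores, e.g. v5i32 -> {{v2i32,2},{i32,1}} as noted in the hunk above. A standalone sketch of that chopping loop using bit widths only; the candidate widths stand in for whatever findMemType would report as legal on a given target:

#include <cstdio>
#include <utility>
#include <vector>

// Chop StWidth bits into (chunk width, count) pairs, always taking the
// largest candidate chunk that still fits, mirroring the MemVTs loop above.
static std::vector<std::pair<unsigned, unsigned>>
planStores(unsigned StWidth, const std::vector<unsigned> &Chunks) {
  std::vector<std::pair<unsigned, unsigned>> Plan;
  while (StWidth != 0) {
    unsigned W = 0;
    for (unsigned C : Chunks)
      if (C <= StWidth && C > W)
        W = C; // largest chunk that still fits
    if (W == 0)
      break; // nothing fits; the real code gives up (returns false)
    unsigned Count = 0;
    do {
      StWidth -= W;
      ++Count;
    } while (StWidth != 0 && StWidth >= W);
    Plan.push_back({W, Count});
  }
  return Plan;
}

int main() {
  // 160 bits (v5i32) with 64-bit (v2i32) and 32-bit (i32) chunks available:
  for (auto &P : planStores(160, {64, 32}))
    std::printf("%u bits x %u\n", P.first, P.second); // 64x2, then 32x1
  return 0;
}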
diff --git a/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h b/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
index 75b4242a415c..f64b332a7fef 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
@@ -192,7 +192,7 @@ public:
// Returns the SDNodes which this SDDbgValue depends on.
SmallVector<SDNode *> getSDNodes() const {
SmallVector<SDNode *> Dependencies;
- for (SDDbgOperand DbgOp : getLocationOps())
+ for (const SDDbgOperand &DbgOp : getLocationOps())
if (DbgOp.getKind() == SDDbgOperand::SDNODE)
Dependencies.push_back(DbgOp.getSDNode());
for (SDNode *Node : getAdditionalDependencies())
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
index 0022e5ec31f0..1b89864116cb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -56,9 +56,7 @@ namespace {
SUnit *pop() {
if (empty()) return nullptr;
- SUnit *V = Queue.back();
- Queue.pop_back();
- return V;
+ return Queue.pop_back_val();
}
};
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index b2a8c8bdd78c..95f7e43b151d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -384,13 +384,12 @@ void ScheduleDAGSDNodes::BuildSchedUnits() {
// There are either zero or one users of the Glue result.
bool HasGlueUse = false;
- for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
- UI != E; ++UI)
- if (GlueVal.isOperandOf(*UI)) {
+ for (SDNode *U : N->uses())
+ if (GlueVal.isOperandOf(U)) {
HasGlueUse = true;
assert(N->getNodeId() == -1 && "Node already inserted!");
N->setNodeId(NodeSUnit->NodeNum);
- N = *UI;
+ N = U;
if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
NodeSUnit->isCall = true;
break;
@@ -742,7 +741,7 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
/// Returns true if \p DV has any VReg operand locations which don't exist in
/// VRBaseMap.
auto HasUnknownVReg = [&VRBaseMap](SDDbgValue *DV) {
- for (SDDbgOperand L : DV->getLocationOps()) {
+ for (const SDDbgOperand &L : DV->getLocationOps()) {
if (L.getKind() == SDDbgOperand::SDNODE &&
VRBaseMap.count({L.getSDNode(), L.getResNo()}) == 0)
return true;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 2a98464425c4..008665d50233 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -28,6 +28,7 @@
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -175,7 +176,7 @@ bool ISD::isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly) {
if (!BuildVectorOnly && N->getOpcode() == ISD::SPLAT_VECTOR) {
APInt SplatVal;
- return isConstantSplatVector(N, SplatVal) && SplatVal.isAllOnesValue();
+ return isConstantSplatVector(N, SplatVal) && SplatVal.isAllOnes();
}
if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
@@ -224,7 +225,7 @@ bool ISD::isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly) {
if (!BuildVectorOnly && N->getOpcode() == ISD::SPLAT_VECTOR) {
APInt SplatVal;
- return isConstantSplatVector(N, SplatVal) && SplatVal.isNullValue();
+ return isConstantSplatVector(N, SplatVal) && SplatVal.isZero();
}
if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
@@ -412,6 +413,28 @@ bool ISD::isVPOpcode(unsigned Opcode) {
}
}
+bool ISD::isVPBinaryOp(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ return false;
+#define PROPERTY_VP_BINARYOP_SDNODE(SDOPC) \
+ case ISD::SDOPC: \
+ return true;
+#include "llvm/IR/VPIntrinsics.def"
+ }
+}
+
+bool ISD::isVPReduction(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ return false;
+#define PROPERTY_VP_REDUCTION_SDNODE(SDOPC) \
+ case ISD::SDOPC: \
+ return true;
+#include "llvm/IR/VPIntrinsics.def"
+ }
+}
+
/// The operand position of the vector mask.
Optional<unsigned> ISD::getVPMaskIdx(unsigned Opcode) {
switch (Opcode) {
@@ -683,6 +706,34 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
ID.AddInteger(ST->getPointerInfo().getAddrSpace());
break;
}
+ case ISD::VP_LOAD: {
+ const VPLoadSDNode *ELD = cast<VPLoadSDNode>(N);
+ ID.AddInteger(ELD->getMemoryVT().getRawBits());
+ ID.AddInteger(ELD->getRawSubclassData());
+ ID.AddInteger(ELD->getPointerInfo().getAddrSpace());
+ break;
+ }
+ case ISD::VP_STORE: {
+ const VPStoreSDNode *EST = cast<VPStoreSDNode>(N);
+ ID.AddInteger(EST->getMemoryVT().getRawBits());
+ ID.AddInteger(EST->getRawSubclassData());
+ ID.AddInteger(EST->getPointerInfo().getAddrSpace());
+ break;
+ }
+ case ISD::VP_GATHER: {
+ const VPGatherSDNode *EG = cast<VPGatherSDNode>(N);
+ ID.AddInteger(EG->getMemoryVT().getRawBits());
+ ID.AddInteger(EG->getRawSubclassData());
+ ID.AddInteger(EG->getPointerInfo().getAddrSpace());
+ break;
+ }
+ case ISD::VP_SCATTER: {
+ const VPScatterSDNode *ES = cast<VPScatterSDNode>(N);
+ ID.AddInteger(ES->getMemoryVT().getRawBits());
+ ID.AddInteger(ES->getRawSubclassData());
+ ID.AddInteger(ES->getPointerInfo().getAddrSpace());
+ break;
+ }
case ISD::MLOAD: {
const MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
ID.AddInteger(MLD->getMemoryVT().getRawBits());
@@ -1319,10 +1370,7 @@ SDValue SelectionDAG::getPtrExtendInReg(SDValue Op, const SDLoc &DL, EVT VT) {
/// getNOT - Create a bitwise NOT operation as (XOR Val, -1).
SDValue SelectionDAG::getNOT(const SDLoc &DL, SDValue Val, EVT VT) {
- EVT EltVT = VT.getScalarType();
- SDValue NegOne =
- getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, VT);
- return getNode(ISD::XOR, DL, VT, Val, NegOne);
+ return getNode(ISD::XOR, DL, VT, Val, getAllOnesConstant(DL, VT));
}
SDValue SelectionDAG::getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT) {
@@ -1901,7 +1949,7 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1,
if (SameNumElts)
return N1;
if (auto *C = dyn_cast<ConstantSDNode>(Splat))
- if (C->isNullValue())
+ if (C->isZero())
return N1;
}
@@ -2265,19 +2313,8 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2,
if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1)) {
const APInt &C1 = N1C->getAPIntValue();
- switch (Cond) {
- default: llvm_unreachable("Unknown integer setcc!");
- case ISD::SETEQ: return getBoolConstant(C1 == C2, dl, VT, OpVT);
- case ISD::SETNE: return getBoolConstant(C1 != C2, dl, VT, OpVT);
- case ISD::SETULT: return getBoolConstant(C1.ult(C2), dl, VT, OpVT);
- case ISD::SETUGT: return getBoolConstant(C1.ugt(C2), dl, VT, OpVT);
- case ISD::SETULE: return getBoolConstant(C1.ule(C2), dl, VT, OpVT);
- case ISD::SETUGE: return getBoolConstant(C1.uge(C2), dl, VT, OpVT);
- case ISD::SETLT: return getBoolConstant(C1.slt(C2), dl, VT, OpVT);
- case ISD::SETGT: return getBoolConstant(C1.sgt(C2), dl, VT, OpVT);
- case ISD::SETLE: return getBoolConstant(C1.sle(C2), dl, VT, OpVT);
- case ISD::SETGE: return getBoolConstant(C1.sge(C2), dl, VT, OpVT);
- }
+ return getBoolConstant(ICmpInst::compare(C1, C2, getICmpCondCode(Cond)),
+ dl, VT, OpVT);
}
}
@@ -2380,7 +2417,7 @@ SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &DemandedBits) {
return SDValue();
APInt DemandedElts = VT.isVector()
- ? APInt::getAllOnesValue(VT.getVectorNumElements())
+ ? APInt::getAllOnes(VT.getVectorNumElements())
: APInt(1, 1);
return GetDemandedBits(V, DemandedBits, DemandedElts);
}
@@ -2475,7 +2512,7 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
switch (V.getOpcode()) {
case ISD::SPLAT_VECTOR:
UndefElts = V.getOperand(0).isUndef()
- ? APInt::getAllOnesValue(DemandedElts.getBitWidth())
+ ? APInt::getAllOnes(DemandedElts.getBitWidth())
: APInt(DemandedElts.getBitWidth(), 0);
return true;
case ISD::ADD:
@@ -2507,7 +2544,7 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
unsigned NumElts = VT.getVectorNumElements();
assert(NumElts == DemandedElts.getBitWidth() && "Vector size mismatch");
- UndefElts = APInt::getNullValue(NumElts);
+ UndefElts = APInt::getZero(NumElts);
switch (V.getOpcode()) {
case ISD::BUILD_VECTOR: {
@@ -2576,7 +2613,7 @@ bool SelectionDAG::isSplatValue(SDValue V, bool AllowUndefs) {
// For now we don't support this with scalable vectors.
if (!VT.isScalableVector())
- DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+ DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
return isSplatValue(V, DemandedElts, UndefElts) &&
(AllowUndefs || !UndefElts);
}
@@ -2592,7 +2629,7 @@ SDValue SelectionDAG::getSplatSourceVector(SDValue V, int &SplatIdx) {
APInt DemandedElts;
if (!VT.isScalableVector())
- DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+ DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
if (isSplatValue(V, DemandedElts, UndefElts)) {
if (VT.isScalableVector()) {
@@ -2740,7 +2777,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, unsigned Depth) const {
}
APInt DemandedElts = VT.isVector()
- ? APInt::getAllOnesValue(VT.getVectorNumElements())
+ ? APInt::getAllOnes(VT.getVectorNumElements())
: APInt(1, 1);
return computeKnownBits(Op, DemandedElts, Depth);
}
@@ -2878,7 +2915,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
APInt DemandedSrcElts = DemandedElts;
- DemandedSrcElts.insertBits(APInt::getNullValue(NumSubElts), Idx);
+ DemandedSrcElts.insertBits(APInt::getZero(NumSubElts), Idx);
Known.One.setAllBits();
Known.Zero.setAllBits();
@@ -2965,11 +3002,8 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
// bits from the overlapping larger input elements and extracting the
// sub sections we actually care about.
unsigned SubScale = SubBitWidth / BitWidth;
- APInt SubDemandedElts(NumElts / SubScale, 0);
- for (unsigned i = 0; i != NumElts; ++i)
- if (DemandedElts[i])
- SubDemandedElts.setBit(i / SubScale);
-
+ APInt SubDemandedElts =
+ APIntOps::ScaleBitMask(DemandedElts, NumElts / SubScale);
Known2 = computeKnownBits(N0, SubDemandedElts, Depth + 1);
Known.Zero.setAllBits(); Known.One.setAllBits();
@@ -3415,7 +3449,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
// If we know the element index, just demand that vector element, else for
// an unknown element index, ignore DemandedElts and demand them all.
- APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts);
+ APInt DemandedSrcElts = APInt::getAllOnes(NumSrcElts);
auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts))
DemandedSrcElts =
@@ -3647,6 +3681,12 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
}))
return true;
+ // Is the operand of a splat vector a constant power of two?
+ if (Val.getOpcode() == ISD::SPLAT_VECTOR)
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val->getOperand(0)))
+ if (C->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2())
+ return true;
+
// More could be done here, though the above checks are enough
// to handle some common cases.
@@ -3663,7 +3703,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
return 1;
APInt DemandedElts = VT.isVector()
- ? APInt::getAllOnesValue(VT.getVectorNumElements())
+ ? APInt::getAllOnes(VT.getVectorNumElements())
: APInt(1, 1);
return ComputeNumSignBits(Op, DemandedElts, Depth);
}
@@ -3771,10 +3811,8 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
assert(VT.isVector() && "Expected bitcast to vector");
unsigned Scale = SrcBits / VTBits;
- APInt SrcDemandedElts(NumElts / Scale, 0);
- for (unsigned i = 0; i != NumElts; ++i)
- if (DemandedElts[i])
- SrcDemandedElts.setBit(i / Scale);
+ APInt SrcDemandedElts =
+ APIntOps::ScaleBitMask(DemandedElts, NumElts / Scale);
// Fast case - sign splat can be simply split across the small elements.
Tmp = ComputeNumSignBits(N0, SrcDemandedElts, Depth + 1);
@@ -3946,13 +3984,13 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
// Special case decrementing a value (ADD X, -1):
if (ConstantSDNode *CRHS =
isConstOrConstSplat(Op.getOperand(1), DemandedElts))
- if (CRHS->isAllOnesValue()) {
+ if (CRHS->isAllOnes()) {
KnownBits Known =
computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// If the input is known to be 0 or 1, the output is 0/-1, which is all
// sign bits set.
- if ((Known.Zero | 1).isAllOnesValue())
+ if ((Known.Zero | 1).isAllOnes())
return VTBits;
// If we are subtracting one from a positive number, there is no carry
@@ -3971,12 +4009,12 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
// Handle NEG.
if (ConstantSDNode *CLHS =
isConstOrConstSplat(Op.getOperand(0), DemandedElts))
- if (CLHS->isNullValue()) {
+ if (CLHS->isZero()) {
KnownBits Known =
computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
// If the input is known to be 0 or 1, the output is 0/-1, which is all
// sign bits set.
- if ((Known.Zero | 1).isAllOnesValue())
+ if ((Known.Zero | 1).isAllOnes())
return VTBits;
// If the input is known to be positive (the sign bit is known clear),
@@ -4080,7 +4118,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
// If we know the element index, just demand that vector element, else for
// an unknown element index, ignore DemandedElts and demand them all.
- APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts);
+ APInt DemandedSrcElts = APInt::getAllOnes(NumSrcElts);
auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts))
DemandedSrcElts =
@@ -4126,7 +4164,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
APInt DemandedSrcElts = DemandedElts;
- DemandedSrcElts.insertBits(APInt::getNullValue(NumSubElts), Idx);
+ DemandedSrcElts.insertBits(APInt::getZero(NumSubElts), Idx);
Tmp = std::numeric_limits<unsigned>::max();
if (!!DemandedSubElts) {
@@ -4248,6 +4286,18 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
return std::max(FirstAnswer, Mask.countLeadingOnes());
}
+unsigned SelectionDAG::ComputeMinSignedBits(SDValue Op, unsigned Depth) const {
+ unsigned SignBits = ComputeNumSignBits(Op, Depth);
+ return Op.getScalarValueSizeInBits() - SignBits + 1;
+}
+
+unsigned SelectionDAG::ComputeMinSignedBits(SDValue Op,
+ const APInt &DemandedElts,
+ unsigned Depth) const {
+ unsigned SignBits = ComputeNumSignBits(Op, DemandedElts, Depth);
+ return Op.getScalarValueSizeInBits() - SignBits + 1;
+}
+
bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly,
unsigned Depth) const {
// Early out for FREEZE.
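The new ComputeMinSignedBits helpers above simply invert ComputeNumSignBits: if the top SignBits bits are all copies of the sign bit, the value fits in BitWidth - SignBits + 1 signed bits. A standalone illustration for one concrete 32-bit value (C++20 <bit>, not the SelectionDAG API):

#include <bit>
#include <cstdint>
#include <cstdio>

// Number of leading bits equal to the sign bit (what ComputeNumSignBits
// reports when the value is fully known).
static unsigned numSignBits(int32_t X) {
  uint32_t U = static_cast<uint32_t>(X);
  return X < 0 ? std::countl_one(U) : std::countl_zero(U);
}

int main() {
  int32_t V = -5;                     // 0xFFFFFFFB
  unsigned SignBits = numSignBits(V); // 29 leading sign-bit copies
  unsigned MinSigned = 32 - SignBits + 1;
  std::printf("%u sign bits, fits in %u signed bits\n", SignBits, MinSigned);
  // Prints: 29 sign bits, fits in 4 signed bits (-5 is 0b1011 in 4 bits).
  return 0;
}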
@@ -4260,7 +4310,7 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly,
return false;
APInt DemandedElts = VT.isVector()
- ? APInt::getAllOnesValue(VT.getVectorNumElements())
+ ? APInt::getAllOnes(VT.getVectorNumElements())
: APInt(1, 1);
return isGuaranteedNotToBeUndefOrPoison(Op, DemandedElts, PoisonOnly, Depth);
}
@@ -4285,7 +4335,17 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
case ISD::UNDEF:
return PoisonOnly;
- // TODO: ISD::BUILD_VECTOR handling
+ case ISD::BUILD_VECTOR:
+ // NOTE: BUILD_VECTOR has implicit truncation of wider scalar elements -
+ // this shouldn't affect the result.
+ for (unsigned i = 0, e = Op.getNumOperands(); i < e; ++i) {
+ if (!DemandedElts[i])
+ continue;
+ if (!isGuaranteedNotToBeUndefOrPoison(Op.getOperand(i), PoisonOnly,
+ Depth + 1))
+ return false;
+ }
+ return true;
// TODO: Search for noundef attributes from library functions.
@@ -4449,8 +4509,8 @@ bool SelectionDAG::isKnownNeverZero(SDValue Op) const {
"Floating point types unsupported - use isKnownNeverZeroFloat");
// If the value is a constant, we can obviously see if it is a zero or not.
- if (ISD::matchUnaryPredicate(
- Op, [](ConstantSDNode *C) { return !C->isNullValue(); }))
+ if (ISD::matchUnaryPredicate(Op,
+ [](ConstantSDNode *C) { return !C->isZero(); }))
return true;
// TODO: Recognize more cases here.
@@ -4490,7 +4550,7 @@ bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const {
static SDValue FoldSTEP_VECTOR(const SDLoc &DL, EVT VT, SDValue Step,
SelectionDAG &DAG) {
- if (cast<ConstantSDNode>(Step)->isNullValue())
+ if (cast<ConstantSDNode>(Step)->isZero())
return DAG.getConstant(0, DL, VT);
return SDValue();
@@ -4676,7 +4736,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
case ISD::UINT_TO_FP:
case ISD::SINT_TO_FP: {
APFloat apf(EVTToAPFloatSemantics(VT),
- APInt::getNullValue(VT.getSizeInBits()));
+ APInt::getZero(VT.getSizeInBits()));
(void)apf.convertFromAPInt(Val,
Opcode==ISD::SINT_TO_FP,
APFloat::rmNearestTiesToEven);
@@ -4828,7 +4888,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTPOP: {
SDValue Ops = {Operand};
- if (SDValue Fold = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops))
+ if (SDValue Fold = FoldConstantArithmetic(Opcode, DL, VT, Ops))
return Fold;
}
}
@@ -4976,6 +5036,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
}
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
+ if (OpOpcode == ISD::VSCALE && !NewNodesMustHaveLegalTypes)
+ return getVScale(DL, VT, Operand.getConstantOperandAPInt(0));
break;
case ISD::ANY_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG:
@@ -5206,173 +5268,111 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::CONCAT_VECTORS)
return SDValue();
- // For now, the array Ops should only contain two values.
- // This enforcement will be removed once this function is merged with
- // FoldConstantVectorArithmetic
- if (Ops.size() != 2)
+ unsigned NumOps = Ops.size();
+ if (NumOps == 0)
return SDValue();
if (isUndef(Opcode, Ops))
return getUNDEF(VT);
- SDNode *N1 = Ops[0].getNode();
- SDNode *N2 = Ops[1].getNode();
-
// Handle the case of two scalars.
- if (auto *C1 = dyn_cast<ConstantSDNode>(N1)) {
- if (auto *C2 = dyn_cast<ConstantSDNode>(N2)) {
- if (C1->isOpaque() || C2->isOpaque())
- return SDValue();
-
- Optional<APInt> FoldAttempt =
- FoldValue(Opcode, C1->getAPIntValue(), C2->getAPIntValue());
- if (!FoldAttempt)
- return SDValue();
-
- SDValue Folded = getConstant(FoldAttempt.getValue(), DL, VT);
- assert((!Folded || !VT.isVector()) &&
- "Can't fold vectors ops with scalar operands");
- return Folded;
- }
- }
-
- // fold (add Sym, c) -> Sym+c
- if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N1))
- return FoldSymbolOffset(Opcode, VT, GA, N2);
- if (TLI->isCommutativeBinOp(Opcode))
- if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N2))
- return FoldSymbolOffset(Opcode, VT, GA, N1);
-
- // For fixed width vectors, extract each constant element and fold them
- // individually. Either input may be an undef value.
- bool IsBVOrSV1 = N1->getOpcode() == ISD::BUILD_VECTOR ||
- N1->getOpcode() == ISD::SPLAT_VECTOR;
- if (!IsBVOrSV1 && !N1->isUndef())
- return SDValue();
- bool IsBVOrSV2 = N2->getOpcode() == ISD::BUILD_VECTOR ||
- N2->getOpcode() == ISD::SPLAT_VECTOR;
- if (!IsBVOrSV2 && !N2->isUndef())
- return SDValue();
- // If both operands are undef, that's handled the same way as scalars.
- if (!IsBVOrSV1 && !IsBVOrSV2)
- return SDValue();
-
- EVT SVT = VT.getScalarType();
- EVT LegalSVT = SVT;
- if (NewNodesMustHaveLegalTypes && LegalSVT.isInteger()) {
- LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT);
- if (LegalSVT.bitsLT(SVT))
- return SDValue();
- }
-
- SmallVector<SDValue, 4> Outputs;
- unsigned NumOps = 0;
- if (IsBVOrSV1)
- NumOps = std::max(NumOps, N1->getNumOperands());
- if (IsBVOrSV2)
- NumOps = std::max(NumOps, N2->getNumOperands());
- assert(NumOps != 0 && "Expected non-zero operands");
- // Scalable vectors should only be SPLAT_VECTOR or UNDEF here. We only need
- // one iteration for that.
- assert((!VT.isScalableVector() || NumOps == 1) &&
- "Scalable vector should only have one scalar");
-
- for (unsigned I = 0; I != NumOps; ++I) {
- // We can have a fixed length SPLAT_VECTOR and a BUILD_VECTOR so we need
- // to use operand 0 of the SPLAT_VECTOR for each fixed element.
- SDValue V1;
- if (N1->getOpcode() == ISD::BUILD_VECTOR)
- V1 = N1->getOperand(I);
- else if (N1->getOpcode() == ISD::SPLAT_VECTOR)
- V1 = N1->getOperand(0);
- else
- V1 = getUNDEF(SVT);
-
- SDValue V2;
- if (N2->getOpcode() == ISD::BUILD_VECTOR)
- V2 = N2->getOperand(I);
- else if (N2->getOpcode() == ISD::SPLAT_VECTOR)
- V2 = N2->getOperand(0);
- else
- V2 = getUNDEF(SVT);
-
- if (SVT.isInteger()) {
- if (V1.getValueType().bitsGT(SVT))
- V1 = getNode(ISD::TRUNCATE, DL, SVT, V1);
- if (V2.getValueType().bitsGT(SVT))
- V2 = getNode(ISD::TRUNCATE, DL, SVT, V2);
+ if (NumOps == 2) {
+ // TODO: Move foldConstantFPMath here?
+
+ if (auto *C1 = dyn_cast<ConstantSDNode>(Ops[0])) {
+ if (auto *C2 = dyn_cast<ConstantSDNode>(Ops[1])) {
+ if (C1->isOpaque() || C2->isOpaque())
+ return SDValue();
+
+ Optional<APInt> FoldAttempt =
+ FoldValue(Opcode, C1->getAPIntValue(), C2->getAPIntValue());
+ if (!FoldAttempt)
+ return SDValue();
+
+ SDValue Folded = getConstant(FoldAttempt.getValue(), DL, VT);
+ assert((!Folded || !VT.isVector()) &&
+ "Can't fold vectors ops with scalar operands");
+ return Folded;
+ }
}
- if (V1.getValueType() != SVT || V2.getValueType() != SVT)
- return SDValue();
-
- // Fold one vector element.
- SDValue ScalarResult = getNode(Opcode, DL, SVT, V1, V2);
- if (LegalSVT != SVT)
- ScalarResult = getNode(ISD::SIGN_EXTEND, DL, LegalSVT, ScalarResult);
-
- // Scalar folding only succeeded if the result is a constant or UNDEF.
- if (!ScalarResult.isUndef() && ScalarResult.getOpcode() != ISD::Constant &&
- ScalarResult.getOpcode() != ISD::ConstantFP)
- return SDValue();
- Outputs.push_back(ScalarResult);
- }
-
- if (N1->getOpcode() == ISD::BUILD_VECTOR ||
- N2->getOpcode() == ISD::BUILD_VECTOR) {
- assert(VT.getVectorNumElements() == Outputs.size() &&
- "Vector size mismatch!");
-
- // Build a big vector out of the scalar elements we generated.
- return getBuildVector(VT, SDLoc(), Outputs);
+ // fold (add Sym, c) -> Sym+c
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Ops[0]))
+ return FoldSymbolOffset(Opcode, VT, GA, Ops[1].getNode());
+ if (TLI->isCommutativeBinOp(Opcode))
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Ops[1]))
+ return FoldSymbolOffset(Opcode, VT, GA, Ops[0].getNode());
}
- assert((N1->getOpcode() == ISD::SPLAT_VECTOR ||
- N2->getOpcode() == ISD::SPLAT_VECTOR) &&
- "One operand should be a splat vector");
-
- assert(Outputs.size() == 1 && "Vector size mismatch!");
- return getSplatVector(VT, SDLoc(), Outputs[0]);
-}
-
-// TODO: Merge with FoldConstantArithmetic
-SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
- const SDLoc &DL, EVT VT,
- ArrayRef<SDValue> Ops,
- const SDNodeFlags Flags) {
- // If the opcode is a target-specific ISD node, there's nothing we can
- // do here and the operand rules may not line up with the below, so
- // bail early.
- if (Opcode >= ISD::BUILTIN_OP_END)
- return SDValue();
-
- if (isUndef(Opcode, Ops))
- return getUNDEF(VT);
-
- // We can only fold vectors - maybe merge with FoldConstantArithmetic someday?
+ // This is for vector folding only from here on.
if (!VT.isVector())
return SDValue();
ElementCount NumElts = VT.getVectorElementCount();
+ // See if we can fold through bitcasted integer ops.
+ // TODO: Can we handle undef elements?
+ if (NumOps == 2 && VT.isFixedLengthVector() && VT.isInteger() &&
+ Ops[0].getValueType() == VT && Ops[1].getValueType() == VT &&
+ Ops[0].getOpcode() == ISD::BITCAST &&
+ Ops[1].getOpcode() == ISD::BITCAST) {
+ SDValue N1 = peekThroughBitcasts(Ops[0]);
+ SDValue N2 = peekThroughBitcasts(Ops[1]);
+ auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
+ auto *BV2 = dyn_cast<BuildVectorSDNode>(N2);
+ EVT BVVT = N1.getValueType();
+ if (BV1 && BV2 && BVVT.isInteger() && BVVT == N2.getValueType()) {
+ bool IsLE = getDataLayout().isLittleEndian();
+ unsigned EltBits = VT.getScalarSizeInBits();
+ SmallVector<APInt> RawBits1, RawBits2;
+ BitVector UndefElts1, UndefElts2;
+ if (BV1->getConstantRawBits(IsLE, EltBits, RawBits1, UndefElts1) &&
+ BV2->getConstantRawBits(IsLE, EltBits, RawBits2, UndefElts2) &&
+ UndefElts1.none() && UndefElts2.none()) {
+ SmallVector<APInt> RawBits;
+ for (unsigned I = 0, E = NumElts.getFixedValue(); I != E; ++I) {
+ Optional<APInt> Fold = FoldValue(Opcode, RawBits1[I], RawBits2[I]);
+ if (!Fold)
+ break;
+ RawBits.push_back(Fold.getValue());
+ }
+ if (RawBits.size() == NumElts.getFixedValue()) {
+ // We have constant folded, but we need to cast the result back to
+ // the original (possibly legalized) type.
+ SmallVector<APInt> DstBits;
+ BitVector DstUndefs;
+ BuildVectorSDNode::recastRawBits(IsLE, BVVT.getScalarSizeInBits(),
+ DstBits, RawBits, DstUndefs,
+ BitVector(RawBits.size(), false));
+ EVT BVEltVT = BV1->getOperand(0).getValueType();
+ unsigned BVEltBits = BVEltVT.getSizeInBits();
+ SmallVector<SDValue> Ops(DstBits.size(), getUNDEF(BVEltVT));
+ for (unsigned I = 0, E = DstBits.size(); I != E; ++I) {
+ if (DstUndefs[I])
+ continue;
+ Ops[I] = getConstant(DstBits[I].sextOrSelf(BVEltBits), DL, BVEltVT);
+ }
+ return getBitcast(VT, getBuildVector(BVVT, DL, Ops));
+ }
+ }
+ }
+ }
+
auto IsScalarOrSameVectorSize = [NumElts](const SDValue &Op) {
return !Op.getValueType().isVector() ||
Op.getValueType().getVectorElementCount() == NumElts;
};
- auto IsConstantBuildVectorSplatVectorOrUndef = [](const SDValue &Op) {
- APInt SplatVal;
- BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op);
+ auto IsBuildVectorSplatVectorOrUndef = [](const SDValue &Op) {
return Op.isUndef() || Op.getOpcode() == ISD::CONDCODE ||
- (BV && BV->isConstant()) ||
- (Op.getOpcode() == ISD::SPLAT_VECTOR &&
- ISD::isConstantSplatVector(Op.getNode(), SplatVal));
+ Op.getOpcode() == ISD::BUILD_VECTOR ||
+ Op.getOpcode() == ISD::SPLAT_VECTOR;
};
// All operands must be vector types with the same number of elements as
- // the result type and must be either UNDEF or a build vector of constant
+ // the result type and must be either UNDEF or a build/splat vector
// or UNDEF scalars.
- if (!llvm::all_of(Ops, IsConstantBuildVectorSplatVectorOrUndef) ||
+ if (!llvm::all_of(Ops, IsBuildVectorSplatVectorOrUndef) ||
!llvm::all_of(Ops, IsScalarOrSameVectorSize))
return SDValue();
@@ -5392,17 +5392,16 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
// For scalable vector types we know we're dealing with SPLAT_VECTORs. We
// only have one operand to check. For fixed-length vector types we may have
// a combination of BUILD_VECTOR and SPLAT_VECTOR.
- unsigned NumOperands = NumElts.isScalable() ? 1 : NumElts.getFixedValue();
+ unsigned NumVectorElts = NumElts.isScalable() ? 1 : NumElts.getFixedValue();
// Constant fold each scalar lane separately.
SmallVector<SDValue, 4> ScalarResults;
- for (unsigned I = 0; I != NumOperands; I++) {
+ for (unsigned I = 0; I != NumVectorElts; I++) {
SmallVector<SDValue, 4> ScalarOps;
for (SDValue Op : Ops) {
EVT InSVT = Op.getValueType().getScalarType();
if (Op.getOpcode() != ISD::BUILD_VECTOR &&
Op.getOpcode() != ISD::SPLAT_VECTOR) {
- // We've checked that this is UNDEF or a constant of some kind.
if (Op.isUndef())
ScalarOps.push_back(getUNDEF(InSVT));
else
@@ -5423,7 +5422,7 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
}
// Constant fold the scalar operands.
- SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps, Flags);
+ SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps);
// Legalize the (integer) scalar constant if necessary.
if (LegalSVT != SVT)
@@ -5591,9 +5590,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
N1.getValueType() == VT && "Binary operator types must match!");
// (X & 0) -> 0. This commonly occurs when legalizing i64 values, so it's
// worth handling here.
- if (N2C && N2C->isNullValue())
+ if (N2C && N2C->isZero())
return N2;
- if (N2C && N2C->isAllOnesValue()) // X & -1 -> X
+ if (N2C && N2C->isAllOnes()) // X & -1 -> X
return N1;
break;
case ISD::OR:
@@ -5605,7 +5604,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
N1.getValueType() == VT && "Binary operator types must match!");
// (X ^|+- 0) -> X. This commonly occurs when legalizing i64 values, so
// it's worth handling here.
- if (N2C && N2C->isNullValue())
+ if (N2C && N2C->isZero())
return N1;
if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && VT.isVector() &&
VT.getVectorElementType() == MVT::i1)
@@ -5711,7 +5710,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
// size of the value, the shift/rotate count is guaranteed to be zero.
if (VT == MVT::i1)
return N1;
- if (N2C && N2C->isNullValue())
+ if (N2C && N2C->isZero())
return N1;
break;
case ISD::FP_ROUND:
@@ -6086,7 +6085,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
return V;
// Vector constant folding.
SDValue Ops[] = {N1, N2, N3};
- if (SDValue V = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops)) {
+ if (SDValue V = FoldConstantArithmetic(Opcode, DL, VT, Ops)) {
NewSDValueDbgMsg(V, "New node vector constant folding: ", this);
return V;
}
@@ -6099,6 +6098,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
break;
case ISD::VECTOR_SHUFFLE:
llvm_unreachable("should use getVectorShuffle constructor!");
+ case ISD::VECTOR_SPLICE: {
+ if (cast<ConstantSDNode>(N3)->isNullValue())
+ return N1;
+ break;
+ }
case ISD::INSERT_VECTOR_ELT: {
ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N3);
// INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF, except
@@ -6214,9 +6218,8 @@ SDValue SelectionDAG::getStackArgumentTokenFactor(SDValue Chain) {
ArgChains.push_back(Chain);
// Add a chain value for each stack argument.
- for (SDNode::use_iterator U = getEntryNode().getNode()->use_begin(),
- UE = getEntryNode().getNode()->use_end(); U != UE; ++U)
- if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
+ for (SDNode *U : getEntryNode().getNode()->uses())
+ if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
if (FI->getIndex() < 0)
ArgChains.push_back(SDValue(L, 1));
@@ -6720,7 +6723,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
DstAlignCanChange = true;
bool IsZeroVal =
- isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isNullValue();
+ isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isZero();
if (!TLI.findOptimalMemOpLowering(
MemOps, TLI.getMaxStoresPerMemset(OptSize),
MemOp::Set(Size, DstAlignCanChange, Alignment, IsZeroVal, isVol),
@@ -6809,7 +6812,7 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
if (ConstantSize) {
// Memcpy with size zero? Just return the original chain.
- if (ConstantSize->isNullValue())
+ if (ConstantSize->isZero())
return Chain;
SDValue Result = getMemcpyLoadsAndStores(
@@ -6924,7 +6927,7 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
if (ConstantSize) {
// Memmove with size zero? Just return the original chain.
- if (ConstantSize->isNullValue())
+ if (ConstantSize->isZero())
return Chain;
SDValue Result = getMemmoveLoadsAndStores(
@@ -7026,7 +7029,7 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
if (ConstantSize) {
// Memset with size zero? Just return the original chain.
- if (ConstantSize->isNullValue())
+ if (ConstantSize->isZero())
return Chain;
SDValue Result = getMemsetStores(*this, dl, Chain, Dst, Src,
@@ -7618,6 +7621,374 @@ SDValue SelectionDAG::getIndexedStore(SDValue OrigStore, const SDLoc &dl,
return V;
}
+SDValue SelectionDAG::getLoadVP(
+ ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &dl,
+ SDValue Chain, SDValue Ptr, SDValue Offset, SDValue Mask, SDValue EVL,
+ MachinePointerInfo PtrInfo, EVT MemVT, Align Alignment,
+ MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo,
+ const MDNode *Ranges, bool IsExpanding) {
+ assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+
+ MMOFlags |= MachineMemOperand::MOLoad;
+ assert((MMOFlags & MachineMemOperand::MOStore) == 0);
+ // If we don't have a PtrInfo, infer the trivial frame index case to simplify
+ // clients.
+ if (PtrInfo.V.isNull())
+ PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset);
+
+ uint64_t Size = MemoryLocation::getSizeOrUnknown(MemVT.getStoreSize());
+ MachineFunction &MF = getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MMOFlags, Size,
+ Alignment, AAInfo, Ranges);
+ return getLoadVP(AM, ExtType, VT, dl, Chain, Ptr, Offset, Mask, EVL, MemVT,
+ MMO, IsExpanding);
+}
+
+SDValue SelectionDAG::getLoadVP(ISD::MemIndexedMode AM,
+ ISD::LoadExtType ExtType, EVT VT,
+ const SDLoc &dl, SDValue Chain, SDValue Ptr,
+ SDValue Offset, SDValue Mask, SDValue EVL,
+ EVT MemVT, MachineMemOperand *MMO,
+ bool IsExpanding) {
+ if (VT == MemVT) {
+ ExtType = ISD::NON_EXTLOAD;
+ } else if (ExtType == ISD::NON_EXTLOAD) {
+ assert(VT == MemVT && "Non-extending load from different memory type!");
+ } else {
+ // Extending load.
+ assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) &&
+ "Should only be an extending load, not truncating!");
+ assert(VT.isInteger() == MemVT.isInteger() &&
+ "Cannot convert from FP to Int or Int -> FP!");
+ assert(VT.isVector() == MemVT.isVector() &&
+ "Cannot use an ext load to convert to or from a vector!");
+ assert((!VT.isVector() ||
+ VT.getVectorElementCount() == MemVT.getVectorElementCount()) &&
+ "Cannot use an ext load to change the number of vector elements!");
+ }
+
+ bool Indexed = AM != ISD::UNINDEXED;
+ assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!");
+
+ SDVTList VTs = Indexed ? getVTList(VT, Ptr.getValueType(), MVT::Other)
+ : getVTList(VT, MVT::Other);
+ SDValue Ops[] = {Chain, Ptr, Offset, Mask, EVL};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::VP_LOAD, VTs, Ops);
+ ID.AddInteger(VT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<VPLoadSDNode>(
+ dl.getIROrder(), VTs, AM, ExtType, IsExpanding, MemVT, MMO));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<VPLoadSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N = newSDNode<VPLoadSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
+ ExtType, IsExpanding, MemVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getLoadVP(EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, SDValue Mask, SDValue EVL,
+ MachinePointerInfo PtrInfo,
+ MaybeAlign Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo, const MDNode *Ranges,
+ bool IsExpanding) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoadVP(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
+ Mask, EVL, PtrInfo, VT, Alignment, MMOFlags, AAInfo, Ranges,
+ IsExpanding);
+}
+
+SDValue SelectionDAG::getLoadVP(EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, SDValue Mask, SDValue EVL,
+ MachineMemOperand *MMO, bool IsExpanding) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoadVP(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
+ Mask, EVL, VT, MMO, IsExpanding);
+}
+
+SDValue SelectionDAG::getExtLoadVP(ISD::LoadExtType ExtType, const SDLoc &dl,
+ EVT VT, SDValue Chain, SDValue Ptr,
+ SDValue Mask, SDValue EVL,
+ MachinePointerInfo PtrInfo, EVT MemVT,
+ MaybeAlign Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo, bool IsExpanding) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoadVP(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef, Mask,
+ EVL, PtrInfo, MemVT, Alignment, MMOFlags, AAInfo, nullptr,
+ IsExpanding);
+}
+
+SDValue SelectionDAG::getExtLoadVP(ISD::LoadExtType ExtType, const SDLoc &dl,
+ EVT VT, SDValue Chain, SDValue Ptr,
+ SDValue Mask, SDValue EVL, EVT MemVT,
+ MachineMemOperand *MMO, bool IsExpanding) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoadVP(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef, Mask,
+ EVL, MemVT, MMO, IsExpanding);
+}
+
+SDValue SelectionDAG::getIndexedLoadVP(SDValue OrigLoad, const SDLoc &dl,
+ SDValue Base, SDValue Offset,
+ ISD::MemIndexedMode AM) {
+ auto *LD = cast<VPLoadSDNode>(OrigLoad);
+ assert(LD->getOffset().isUndef() && "Load is already an indexed load!");
+ // Don't propagate the invariant or dereferenceable flags.
+ auto MMOFlags =
+ LD->getMemOperand()->getFlags() &
+ ~(MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
+ return getLoadVP(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl,
+ LD->getChain(), Base, Offset, LD->getMask(),
+ LD->getVectorLength(), LD->getPointerInfo(),
+ LD->getMemoryVT(), LD->getAlign(), MMOFlags, LD->getAAInfo(),
+ nullptr, LD->isExpandingLoad());
+}
+
+SDValue SelectionDAG::getStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val,
+ SDValue Ptr, SDValue Mask, SDValue EVL,
+ MachinePointerInfo PtrInfo, Align Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo, bool IsCompressing) {
+ assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+
+ MMOFlags |= MachineMemOperand::MOStore;
+ assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
+
+ if (PtrInfo.V.isNull())
+ PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);
+
+ MachineFunction &MF = getMachineFunction();
+ uint64_t Size =
+ MemoryLocation::getSizeOrUnknown(Val.getValueType().getStoreSize());
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PtrInfo, MMOFlags, Size, Alignment, AAInfo);
+ return getStoreVP(Chain, dl, Val, Ptr, Mask, EVL, MMO, IsCompressing);
+}
+
+SDValue SelectionDAG::getStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val,
+ SDValue Ptr, SDValue Mask, SDValue EVL,
+ MachineMemOperand *MMO, bool IsCompressing) {
+ assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+ EVT VT = Val.getValueType();
+ SDVTList VTs = getVTList(MVT::Other);
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ SDValue Ops[] = {Chain, Val, Ptr, Undef, Mask, EVL};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::VP_STORE, VTs, Ops);
+ ID.AddInteger(VT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<VPStoreSDNode>(
+ dl.getIROrder(), VTs, ISD::UNINDEXED, false, IsCompressing, VT, MMO));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<VPStoreSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N =
+ newSDNode<VPStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ ISD::UNINDEXED, false, IsCompressing, VT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getTruncStoreVP(SDValue Chain, const SDLoc &dl,
+ SDValue Val, SDValue Ptr, SDValue Mask,
+ SDValue EVL, MachinePointerInfo PtrInfo,
+ EVT SVT, Align Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo,
+ bool IsCompressing) {
+ assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+
+ MMOFlags |= MachineMemOperand::MOStore;
+ assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
+
+ if (PtrInfo.V.isNull())
+ PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);
+
+ MachineFunction &MF = getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MMOFlags, MemoryLocation::getSizeOrUnknown(SVT.getStoreSize()),
+ Alignment, AAInfo);
+ return getTruncStoreVP(Chain, dl, Val, Ptr, Mask, EVL, SVT, MMO,
+ IsCompressing);
+}
+
+SDValue SelectionDAG::getTruncStoreVP(SDValue Chain, const SDLoc &dl,
+ SDValue Val, SDValue Ptr, SDValue Mask,
+ SDValue EVL, EVT SVT,
+ MachineMemOperand *MMO,
+ bool IsCompressing) {
+ EVT VT = Val.getValueType();
+
+ assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+ if (VT == SVT)
+ return getStoreVP(Chain, dl, Val, Ptr, Mask, EVL, MMO, IsCompressing);
+
+ assert(SVT.getScalarType().bitsLT(VT.getScalarType()) &&
+ "Should only be a truncating store, not extending!");
+ assert(VT.isInteger() == SVT.isInteger() && "Can't do FP-INT conversion!");
+ assert(VT.isVector() == SVT.isVector() &&
+ "Cannot use trunc store to convert to or from a vector!");
+ assert((!VT.isVector() ||
+ VT.getVectorElementCount() == SVT.getVectorElementCount()) &&
+ "Cannot use trunc store to change the number of vector elements!");
+
+ SDVTList VTs = getVTList(MVT::Other);
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ SDValue Ops[] = {Chain, Val, Ptr, Undef, Mask, EVL};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::VP_STORE, VTs, Ops);
+ ID.AddInteger(SVT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<VPStoreSDNode>(
+ dl.getIROrder(), VTs, ISD::UNINDEXED, true, IsCompressing, SVT, MMO));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<VPStoreSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N =
+ newSDNode<VPStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ ISD::UNINDEXED, true, IsCompressing, SVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getIndexedStoreVP(SDValue OrigStore, const SDLoc &dl,
+ SDValue Base, SDValue Offset,
+ ISD::MemIndexedMode AM) {
+ auto *ST = cast<VPStoreSDNode>(OrigStore);
+ assert(ST->getOffset().isUndef() && "Store is already an indexed store!");
+ SDVTList VTs = getVTList(Base.getValueType(), MVT::Other);
+ SDValue Ops[] = {ST->getChain(), ST->getValue(), Base,
+ Offset, ST->getMask(), ST->getVectorLength()};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::VP_STORE, VTs, Ops);
+ ID.AddInteger(ST->getMemoryVT().getRawBits());
+ ID.AddInteger(ST->getRawSubclassData());
+ ID.AddInteger(ST->getPointerInfo().getAddrSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<VPStoreSDNode>(
+ dl.getIROrder(), dl.getDebugLoc(), VTs, AM, ST->isTruncatingStore(),
+ ST->isCompressingStore(), ST->getMemoryVT(), ST->getMemOperand());
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getGatherVP(SDVTList VTs, EVT VT, const SDLoc &dl,
+ ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
+ ISD::MemIndexType IndexType) {
+ assert(Ops.size() == 6 && "Incompatible number of operands");
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::VP_GATHER, VTs, Ops);
+ ID.AddInteger(VT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<VPGatherSDNode>(
+ dl.getIROrder(), VTs, VT, MMO, IndexType));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<VPGatherSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+
+ auto *N = newSDNode<VPGatherSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ VT, MMO, IndexType);
+ createOperands(N, Ops);
+
+ assert(N->getMask().getValueType().getVectorElementCount() ==
+ N->getValueType(0).getVectorElementCount() &&
+ "Vector width mismatch between mask and data");
+ assert(N->getIndex().getValueType().getVectorElementCount().isScalable() ==
+ N->getValueType(0).getVectorElementCount().isScalable() &&
+ "Scalable flags of index and data do not match");
+ assert(ElementCount::isKnownGE(
+ N->getIndex().getValueType().getVectorElementCount(),
+ N->getValueType(0).getVectorElementCount()) &&
+ "Vector width mismatch between index and data");
+ assert(isa<ConstantSDNode>(N->getScale()) &&
+ cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
+ "Scale should be a constant power of 2");
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getScatterVP(SDVTList VTs, EVT VT, const SDLoc &dl,
+ ArrayRef<SDValue> Ops,
+ MachineMemOperand *MMO,
+ ISD::MemIndexType IndexType) {
+ assert(Ops.size() == 7 && "Incompatible number of operands");
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::VP_SCATTER, VTs, Ops);
+ ID.AddInteger(VT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<VPScatterSDNode>(
+ dl.getIROrder(), VTs, VT, MMO, IndexType));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<VPScatterSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N = newSDNode<VPScatterSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ VT, MMO, IndexType);
+ createOperands(N, Ops);
+
+ assert(N->getMask().getValueType().getVectorElementCount() ==
+ N->getValue().getValueType().getVectorElementCount() &&
+ "Vector width mismatch between mask and data");
+ assert(
+ N->getIndex().getValueType().getVectorElementCount().isScalable() ==
+ N->getValue().getValueType().getVectorElementCount().isScalable() &&
+ "Scalable flags of index and data do not match");
+ assert(ElementCount::isKnownGE(
+ N->getIndex().getValueType().getVectorElementCount(),
+ N->getValue().getValueType().getVectorElementCount()) &&
+ "Vector width mismatch between index and data");
+ assert(isa<ConstantSDNode>(N->getScale()) &&
+ cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
+ "Scale should be a constant power of 2");
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain,
SDValue Base, SDValue Offset, SDValue Mask,
SDValue PassThru, EVT MemVT,
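A minimal usage sketch (illustration only, not part of the patch): wiring the new simple getLoadVP/getStoreVP overloads from the hunk above together to copy one predicated vector. Chain, the two pointers, Mask, EVL and the MachineMemOperands are placeholders the caller is assumed to already have.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue copyVPVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                            SDValue Chain, SDValue SrcPtr, SDValue DstPtr,
                            SDValue Mask, SDValue EVL,
                            MachineMemOperand *LoadMMO,
                            MachineMemOperand *StoreMMO) {
  // VP load: the loaded value is result 0, the updated chain is result 1.
  SDValue Load = DAG.getLoadVP(VT, DL, Chain, SrcPtr, Mask, EVL, LoadMMO,
                               /*IsExpanding=*/false);
  // Chain the VP store on the load so the two accesses stay ordered.
  return DAG.getStoreVP(Load.getValue(1), DL, Load, DstPtr, Mask, EVL,
                        StoreMMO, /*IsCompressing=*/false);
}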
@@ -7818,7 +8189,7 @@ SDValue SelectionDAG::simplifySelect(SDValue Cond, SDValue T, SDValue F) {
// select true, T, F --> T
// select false, T, F --> F
if (auto *CondC = dyn_cast<ConstantSDNode>(Cond))
- return CondC->isNullValue() ? F : T;
+ return CondC->isZero() ? F : T;
// TODO: This should simplify VSELECT with constant condition using something
// like this (but check boolean contents to be complete?):
@@ -9296,7 +9667,7 @@ void SelectionDAG::CreateTopologicalOrder(std::vector<SDNode *> &Order) {
}
#ifndef NDEBUG
-void SelectionDAG::VerifyDAGDiverence() {
+void SelectionDAG::VerifyDAGDivergence() {
std::vector<SDNode *> TopoOrder;
CreateTopologicalOrder(TopoOrder);
for (auto *N : TopoOrder) {
@@ -9384,21 +9755,20 @@ unsigned SelectionDAG::AssignTopologicalOrder() {
// before SortedPos will contain the topological sort index, and the
// Node Id fields for nodes at SortedPos and after will contain the
// count of outstanding operands.
- for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ) {
- SDNode *N = &*I++;
- checkForCycles(N, this);
- unsigned Degree = N->getNumOperands();
+ for (SDNode &N : llvm::make_early_inc_range(allnodes())) {
+ checkForCycles(&N, this);
+ unsigned Degree = N.getNumOperands();
if (Degree == 0) {
// A node with no uses, add it to the result array immediately.
- N->setNodeId(DAGSize++);
- allnodes_iterator Q(N);
+ N.setNodeId(DAGSize++);
+ allnodes_iterator Q(&N);
if (Q != SortedPos)
SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(Q));
assert(SortedPos != AllNodes.end() && "Overran node list");
++SortedPos;
} else {
// Temporarily use the Node Id as scratch space for the degree count.
- N->setNodeId(Degree);
+ N.setNodeId(Degree);
}
}
@@ -9512,12 +9882,9 @@ SDValue SelectionDAG::getSymbolFunctionGlobalAddress(SDValue Op,
std::string ErrorStr;
raw_string_ostream ErrorFormatter(ErrorStr);
-
ErrorFormatter << "Undefined external symbol ";
ErrorFormatter << '"' << Symbol << '"';
- ErrorFormatter.flush();
-
- report_fatal_error(ErrorStr);
+ report_fatal_error(Twine(ErrorFormatter.str()));
}
//===----------------------------------------------------------------------===//
@@ -9526,7 +9893,7 @@ SDValue SelectionDAG::getSymbolFunctionGlobalAddress(SDValue Op,
bool llvm::isNullConstant(SDValue V) {
ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
- return Const != nullptr && Const->isNullValue();
+ return Const != nullptr && Const->isZero();
}
bool llvm::isNullFPConstant(SDValue V) {
@@ -9536,7 +9903,7 @@ bool llvm::isNullFPConstant(SDValue V) {
bool llvm::isAllOnesConstant(SDValue V) {
ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
- return Const != nullptr && Const->isAllOnesValue();
+ return Const != nullptr && Const->isAllOnes();
}
bool llvm::isOneConstant(SDValue V) {
@@ -9670,7 +10037,7 @@ bool llvm::isNullOrNullSplat(SDValue N, bool AllowUndefs) {
// TODO: may want to use peekThroughBitcast() here.
ConstantSDNode *C =
isConstOrConstSplat(N, AllowUndefs, /*AllowTruncation=*/true);
- return C && C->isNullValue();
+ return C && C->isZero();
}
bool llvm::isOneOrOneSplat(SDValue N, bool AllowUndefs) {
@@ -9684,7 +10051,7 @@ bool llvm::isAllOnesOrAllOnesSplat(SDValue N, bool AllowUndefs) {
N = peekThroughBitcasts(N);
unsigned BitWidth = N.getScalarValueSizeInBits();
ConstantSDNode *C = isConstOrConstSplat(N, AllowUndefs);
- return C && C->isAllOnesValue() && C->getValueSizeInBits(0) == BitWidth;
+ return C && C->isAllOnes() && C->getValueSizeInBits(0) == BitWidth;
}
HandleSDNode::~HandleSDNode() {
@@ -9790,8 +10157,7 @@ bool SDNode::hasAnyUseOfValue(unsigned Value) const {
/// isOnlyUserOf - Return true if this node is the only use of N.
bool SDNode::isOnlyUserOf(const SDNode *N) const {
bool Seen = false;
- for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
- SDNode *User = *I;
+ for (const SDNode *User : N->uses()) {
if (User == this)
Seen = true;
else
@@ -9804,8 +10170,7 @@ bool SDNode::isOnlyUserOf(const SDNode *N) const {
/// Return true if the only users of N are contained in Nodes.
bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) {
bool Seen = false;
- for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
- SDNode *User = *I;
+ for (const SDNode *User : N->uses()) {
if (llvm::is_contained(Nodes, User))
Seen = true;
else
@@ -10212,14 +10577,14 @@ SelectionDAG::GetDependentSplitDestVTs(const EVT &VT, const EVT &EnvVT,
"Mixing fixed width and scalable vectors when enveloping a type");
EVT LoVT, HiVT;
if (VTNumElts.getKnownMinValue() > EnvNumElts.getKnownMinValue()) {
- LoVT = EnvVT;
+ LoVT = EVT::getVectorVT(*getContext(), EltTp, EnvNumElts);
HiVT = EVT::getVectorVT(*getContext(), EltTp, VTNumElts - EnvNumElts);
*HiIsEmpty = false;
} else {
// Flag that hi type has zero storage size, but return split envelope type
// (this would be easier if vector types with zero elements were allowed).
LoVT = EVT::getVectorVT(*getContext(), EltTp, VTNumElts);
- HiVT = EnvVT;
+ HiVT = EVT::getVectorVT(*getContext(), EltTp, EnvNumElts);
*HiIsEmpty = true;
}
return std::make_pair(LoVT, HiVT);
@@ -10387,7 +10752,7 @@ SDValue BuildVectorSDNode::getSplatValue(const APInt &DemandedElts,
}
SDValue BuildVectorSDNode::getSplatValue(BitVector *UndefElements) const {
- APInt DemandedElts = APInt::getAllOnesValue(getNumOperands());
+ APInt DemandedElts = APInt::getAllOnes(getNumOperands());
return getSplatValue(DemandedElts, UndefElements);
}
@@ -10439,7 +10804,7 @@ bool BuildVectorSDNode::getRepeatedSequence(const APInt &DemandedElts,
bool BuildVectorSDNode::getRepeatedSequence(SmallVectorImpl<SDValue> &Sequence,
BitVector *UndefElements) const {
- APInt DemandedElts = APInt::getAllOnesValue(getNumOperands());
+ APInt DemandedElts = APInt::getAllOnes(getNumOperands());
return getRepeatedSequence(DemandedElts, Sequence, UndefElements);
}
@@ -10485,6 +10850,97 @@ BuildVectorSDNode::getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements,
return -1;
}
+bool BuildVectorSDNode::getConstantRawBits(
+ bool IsLittleEndian, unsigned DstEltSizeInBits,
+ SmallVectorImpl<APInt> &RawBitElements, BitVector &UndefElements) const {
+ // Early-out if this contains anything but Undef/Constant/ConstantFP.
+ if (!isConstant())
+ return false;
+
+ unsigned NumSrcOps = getNumOperands();
+ unsigned SrcEltSizeInBits = getValueType(0).getScalarSizeInBits();
+ assert(((NumSrcOps * SrcEltSizeInBits) % DstEltSizeInBits) == 0 &&
+ "Invalid bitcast scale");
+
+ // Extract raw src bits.
+ SmallVector<APInt> SrcBitElements(NumSrcOps,
+ APInt::getNullValue(SrcEltSizeInBits));
+ BitVector SrcUndefElements(NumSrcOps, false);
+
+ for (unsigned I = 0; I != NumSrcOps; ++I) {
+ SDValue Op = getOperand(I);
+ if (Op.isUndef()) {
+ SrcUndefElements.set(I);
+ continue;
+ }
+ auto *CInt = dyn_cast<ConstantSDNode>(Op);
+ auto *CFP = dyn_cast<ConstantFPSDNode>(Op);
+ assert((CInt || CFP) && "Unknown constant");
+ SrcBitElements[I] =
+ CInt ? CInt->getAPIntValue().truncOrSelf(SrcEltSizeInBits)
+ : CFP->getValueAPF().bitcastToAPInt();
+ }
+
+ // Recast to dst width.
+ recastRawBits(IsLittleEndian, DstEltSizeInBits, RawBitElements,
+ SrcBitElements, UndefElements, SrcUndefElements);
+ return true;
+}
+
+void BuildVectorSDNode::recastRawBits(bool IsLittleEndian,
+ unsigned DstEltSizeInBits,
+ SmallVectorImpl<APInt> &DstBitElements,
+ ArrayRef<APInt> SrcBitElements,
+ BitVector &DstUndefElements,
+ const BitVector &SrcUndefElements) {
+ unsigned NumSrcOps = SrcBitElements.size();
+ unsigned SrcEltSizeInBits = SrcBitElements[0].getBitWidth();
+ assert(((NumSrcOps * SrcEltSizeInBits) % DstEltSizeInBits) == 0 &&
+ "Invalid bitcast scale");
+ assert(NumSrcOps == SrcUndefElements.size() &&
+ "Vector size mismatch");
+
+ unsigned NumDstOps = (NumSrcOps * SrcEltSizeInBits) / DstEltSizeInBits;
+ DstUndefElements.clear();
+ DstUndefElements.resize(NumDstOps, false);
+ DstBitElements.assign(NumDstOps, APInt::getNullValue(DstEltSizeInBits));
+
+ // Concatenate src elements constant bits together into dst element.
+ if (SrcEltSizeInBits <= DstEltSizeInBits) {
+ unsigned Scale = DstEltSizeInBits / SrcEltSizeInBits;
+ for (unsigned I = 0; I != NumDstOps; ++I) {
+ DstUndefElements.set(I);
+ APInt &DstBits = DstBitElements[I];
+ for (unsigned J = 0; J != Scale; ++J) {
+ unsigned Idx = (I * Scale) + (IsLittleEndian ? J : (Scale - J - 1));
+ if (SrcUndefElements[Idx])
+ continue;
+ DstUndefElements.reset(I);
+ const APInt &SrcBits = SrcBitElements[Idx];
+ assert(SrcBits.getBitWidth() == SrcEltSizeInBits &&
+ "Illegal constant bitwidths");
+ DstBits.insertBits(SrcBits, J * SrcEltSizeInBits);
+ }
+ }
+ return;
+ }
+
+ // Split src element constant bits into dst elements.
+ unsigned Scale = SrcEltSizeInBits / DstEltSizeInBits;
+ for (unsigned I = 0; I != NumSrcOps; ++I) {
+ if (SrcUndefElements[I]) {
+ DstUndefElements.set(I * Scale, (I + 1) * Scale);
+ continue;
+ }
+ const APInt &SrcBits = SrcBitElements[I];
+ for (unsigned J = 0; J != Scale; ++J) {
+ unsigned Idx = (I * Scale) + (IsLittleEndian ? J : (Scale - J - 1));
+ APInt &DstBits = DstBitElements[Idx];
+ DstBits = SrcBits.extractBits(DstEltSizeInBits, J * DstEltSizeInBits);
+ }
+ }
+}
+
bool BuildVectorSDNode::isConstant() const {
for (const SDValue &Op : op_values()) {
unsigned Opc = Op.getOpcode();
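A minimal sketch (illustration only, not part of the patch) of the little-endian wide-to-narrow path in recastRawBits above: splitting one source constant into Scale destination elements using only the public APInt API. The 0xAABBCCDD value is made up for the example.

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Split one wide constant into narrower little-endian elements, low bits
// first, matching the SrcEltSizeInBits > DstEltSizeInBits branch above.
static SmallVector<APInt> splitConstantLE(const APInt &Src, unsigned DstBits) {
  SmallVector<APInt> Dst;
  for (unsigned J = 0, Scale = Src.getBitWidth() / DstBits; J != Scale; ++J)
    Dst.push_back(Src.extractBits(DstBits, J * DstBits));
  return Dst;
}
// splitConstantLE(APInt(32, 0xAABBCCDD), 16) yields {0xCCDD, 0xAABB}.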
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
index 20c7d771bfb6..6d8252046501 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -14,6 +14,7 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/GlobalAlias.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include <cstdint>
@@ -143,13 +144,27 @@ bool BaseIndexOffset::computeAliasing(const SDNode *Op0,
bool IsCV0 = isa<ConstantPoolSDNode>(BasePtr0.getBase());
bool IsCV1 = isa<ConstantPoolSDNode>(BasePtr1.getBase());
- // If of mismatched base types or checkable indices we can check
- // they do not alias.
- if ((BasePtr0.getIndex() == BasePtr1.getIndex() || (IsFI0 != IsFI1) ||
- (IsGV0 != IsGV1) || (IsCV0 != IsCV1)) &&
- (IsFI0 || IsGV0 || IsCV0) && (IsFI1 || IsGV1 || IsCV1)) {
- IsAlias = false;
- return true;
+ if ((IsFI0 || IsGV0 || IsCV0) && (IsFI1 || IsGV1 || IsCV1)) {
+ // We can derive NoAlias in case of mismatched base types.
+ if (IsFI0 != IsFI1 || IsGV0 != IsGV1 || IsCV0 != IsCV1) {
+ IsAlias = false;
+ return true;
+ }
+ if (IsGV0 && IsGV1) {
+ auto *GV0 = cast<GlobalAddressSDNode>(BasePtr0.getBase())->getGlobal();
+ auto *GV1 = cast<GlobalAddressSDNode>(BasePtr1.getBase())->getGlobal();
+ // It doesn't make sense to access one global value using another global
+ // value's address, so we can assume that there is no aliasing between two
+ // different globals (unless we have symbols that may indirectly point to
+ // each other).
+ // FIXME: This is perhaps a bit too defensive. We could try to follow the
+ // aliasee chain of GlobalAlias variables to find out whether the indirect
+ // symbols may alias or not.
+ if (GV0 != GV1 && !isa<GlobalAlias>(GV0) && !isa<GlobalAlias>(GV1)) {
+ IsAlias = false;
+ return true;
+ }
+ }
}
return false; // Cannot determine whether the pointers alias.
}
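A minimal sketch (illustration only, not part of the patch) of the disambiguation rule the new code implements: two distinct globals cannot overlap unless one of them is a GlobalAlias that might ultimately refer to the other symbol.

#include "llvm/IR/GlobalAlias.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// Conservative check mirroring the GV0/GV1 test above: distinct non-alias
// globals occupy disjoint storage, so accesses to them cannot alias.
static bool globalsDefinitelyDontAlias(const GlobalValue *GV0,
                                       const GlobalValue *GV1) {
  return GV0 != GV1 && !isa<GlobalAlias>(GV0) && !isa<GlobalAlias>(GV1);
}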
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index d56d4bcc9169..5d911c165293 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -69,6 +69,7 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/InlineAsm.h"
@@ -399,29 +400,31 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
return Val;
if (PartEVT.isVector()) {
+ // Vector/Vector bitcast.
+ if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits())
+ return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+
// If the element type of the source/dest vectors are the same, but the
// parts vector has more elements than the value vector, then we have a
// vector widening case (e.g. <2 x float> -> <4 x float>). Extract the
// elements we want.
- if (PartEVT.getVectorElementType() == ValueVT.getVectorElementType()) {
+ if (PartEVT.getVectorElementCount() != ValueVT.getVectorElementCount()) {
assert((PartEVT.getVectorElementCount().getKnownMinValue() >
ValueVT.getVectorElementCount().getKnownMinValue()) &&
(PartEVT.getVectorElementCount().isScalable() ==
ValueVT.getVectorElementCount().isScalable()) &&
"Cannot narrow, it would be a lossy transformation");
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
- DAG.getVectorIdxConstant(0, DL));
+ PartEVT =
+ EVT::getVectorVT(*DAG.getContext(), PartEVT.getVectorElementType(),
+ ValueVT.getVectorElementCount());
+ Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, PartEVT, Val,
+ DAG.getVectorIdxConstant(0, DL));
+ if (PartEVT == ValueVT)
+ return Val;
}
- // Vector/Vector bitcast.
- if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits())
- return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
-
- assert(PartEVT.getVectorElementCount() == ValueVT.getVectorElementCount() &&
- "Cannot handle this kind of promotion");
// Promoted vector extract
return DAG.getAnyExtOrTrunc(Val, DL, ValueVT);
-
}
// Trivial bitcast if the types are the same size and the destination
@@ -670,6 +673,17 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
// Promoted vector extract
Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
+ } else if (PartEVT.isVector() &&
+ PartEVT.getVectorElementType() !=
+ ValueVT.getVectorElementType() &&
+ TLI.getTypeAction(*DAG.getContext(), ValueVT) ==
+ TargetLowering::TypeWidenVector) {
+ // Combination of widening and promotion.
+ EVT WidenVT =
+ EVT::getVectorVT(*DAG.getContext(), ValueVT.getVectorElementType(),
+ PartVT.getVectorElementCount());
+ SDValue Widened = widenVectorToPartType(DAG, Val, DL, WidenVT);
+ Val = DAG.getAnyExtOrTrunc(Widened, DL, PartVT);
} else {
if (ValueVT.getVectorElementCount().isScalar()) {
Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
@@ -726,15 +740,19 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
} else if (ValueVT.getSizeInBits() == BuiltVectorTy.getSizeInBits()) {
// Bitconvert vector->vector case.
Val = DAG.getNode(ISD::BITCAST, DL, BuiltVectorTy, Val);
- } else if (SDValue Widened =
- widenVectorToPartType(DAG, Val, DL, BuiltVectorTy)) {
- Val = Widened;
- } else if (BuiltVectorTy.getVectorElementType().bitsGE(
- ValueVT.getVectorElementType()) &&
- BuiltVectorTy.getVectorElementCount() ==
- ValueVT.getVectorElementCount()) {
- // Promoted vector extract
- Val = DAG.getAnyExtOrTrunc(Val, DL, BuiltVectorTy);
+ } else {
+ if (BuiltVectorTy.getVectorElementType().bitsGT(
+ ValueVT.getVectorElementType())) {
+ // Integer promotion.
+ ValueVT = EVT::getVectorVT(*DAG.getContext(),
+ BuiltVectorTy.getVectorElementType(),
+ ValueVT.getVectorElementCount());
+ Val = DAG.getNode(ISD::ANY_EXTEND, DL, ValueVT, Val);
+ }
+
+ if (SDValue Widened = widenVectorToPartType(DAG, Val, DL, BuiltVectorTy)) {
+ Val = Widened;
+ }
}
assert(Val.getValueType() == BuiltVectorTy && "Unexpected vector value type");
@@ -1275,21 +1293,23 @@ void SelectionDAGBuilder::salvageUnresolvedDbgValue(DanglingDebugInfo &DDI) {
while (isa<Instruction>(V)) {
Instruction &VAsInst = *cast<Instruction>(V);
// Temporary "0", awaiting real implementation.
+ SmallVector<uint64_t, 16> Ops;
SmallVector<Value *, 4> AdditionalValues;
- DIExpression *SalvagedExpr =
- salvageDebugInfoImpl(VAsInst, Expr, StackValue, 0, AdditionalValues);
-
+ V = salvageDebugInfoImpl(VAsInst, Expr->getNumLocationOperands(), Ops,
+ AdditionalValues);
// If we cannot salvage any further, and haven't yet found a suitable debug
// expression, bail out.
+ if (!V)
+ break;
+
// TODO: If AdditionalValues isn't empty, then the salvage can only be
// represented with a DBG_VALUE_LIST, so we give up. When we have support
// here for variadic dbg_values, remove that condition.
- if (!SalvagedExpr || !AdditionalValues.empty())
+ if (!AdditionalValues.empty())
break;
// New value and expr now represent this debuginfo.
- V = VAsInst.getOperand(0);
- Expr = SalvagedExpr;
+ Expr = DIExpression::appendOpsToArg(Expr, Ops, 0, StackValue);
// Some kind of simplification occurred: check whether the operand of the
// salvaged debug expression can be encoded in this DAG.
@@ -1400,7 +1420,7 @@ bool SelectionDAGBuilder::handleDebugValue(ArrayRef<const Value *> Values,
BitsToDescribe = *VarSize;
if (auto Fragment = Expr->getFragmentInfo())
BitsToDescribe = Fragment->SizeInBits;
- for (auto RegAndSize : RFV.getRegsAndSizes()) {
+ for (const auto &RegAndSize : RFV.getRegsAndSizes()) {
// Bail out if all bits are described already.
if (Offset >= BitsToDescribe)
break;
@@ -1945,16 +1965,13 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
/*IsVarArg*/ false, DL);
ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
- if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
- Attribute::SExt))
+ if (F->getAttributes().hasRetAttr(Attribute::SExt))
ExtendKind = ISD::SIGN_EXTEND;
- else if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
- Attribute::ZExt))
+ else if (F->getAttributes().hasRetAttr(Attribute::ZExt))
ExtendKind = ISD::ZERO_EXTEND;
LLVMContext &Context = F->getContext();
- bool RetInReg = F->getAttributes().hasAttribute(
- AttributeList::ReturnIndex, Attribute::InReg);
+ bool RetInReg = F->getAttributes().hasRetAttr(Attribute::InReg);
for (unsigned j = 0; j != NumValues; ++j) {
EVT VT = ValueVTs[j];
@@ -1995,7 +2012,8 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
Flags.setZExt();
for (unsigned i = 0; i < NumParts; ++i) {
- Outs.push_back(ISD::OutputArg(Flags, Parts[i].getValueType(),
+ Outs.push_back(ISD::OutputArg(Flags,
+ Parts[i].getValueType().getSimpleVT(),
VT, /*isfixed=*/true, 0, 0));
OutVals.push_back(Parts[i]);
}
@@ -2012,10 +2030,9 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
assert(SwiftError.getFunctionArg() && "Need a swift error argument");
ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
Flags.setSwiftError();
- Outs.push_back(ISD::OutputArg(Flags, EVT(TLI.getPointerTy(DL)) /*vt*/,
- EVT(TLI.getPointerTy(DL)) /*argvt*/,
- true /*isfixed*/, 1 /*origidx*/,
- 0 /*partOffs*/));
+ Outs.push_back(ISD::OutputArg(
+ Flags, /*vt=*/TLI.getPointerTy(DL), /*argvt=*/EVT(TLI.getPointerTy(DL)),
+ /*isfixed=*/true, /*origidx=*/1, /*partOffs=*/0));
// Create SDNode for the swifterror virtual register.
OutVals.push_back(
DAG.getRegister(SwiftError.getOrCreateVRegUseAt(
@@ -2566,7 +2583,7 @@ void SelectionDAGBuilder::visitJumpTableHeader(SwitchCG::JumpTable &JT,
JumpTableReg, SwitchOp);
JT.Reg = JumpTableReg;
- if (!JTH.OmitRangeCheck) {
+ if (!JTH.FallthroughUnreachable) {
// Emit the range check for the jump table, and branch to the default block
// for the switch statement if the value being switched on exceeds the
// largest case in the switch.
@@ -2663,7 +2680,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
TargetLowering::ArgListEntry Entry;
Entry.Node = GuardVal;
Entry.Ty = FnTy->getParamType(0);
- if (GuardCheckFn->hasAttribute(1, Attribute::AttrKind::InReg))
+ if (GuardCheckFn->hasParamAttribute(0, Attribute::AttrKind::InReg))
Entry.IsInReg = true;
Args.push_back(Entry);
@@ -2778,13 +2795,13 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
MachineBasicBlock* MBB = B.Cases[0].ThisBB;
- if (!B.OmitRangeCheck)
+ if (!B.FallthroughUnreachable)
addSuccessorWithProb(SwitchBB, B.Default, B.DefaultProb);
addSuccessorWithProb(SwitchBB, MBB, B.Prob);
SwitchBB->normalizeSuccProbs();
SDValue Root = CopyTo;
- if (!B.OmitRangeCheck) {
+ if (!B.FallthroughUnreachable) {
// Conditional branch to the default block.
SDValue RangeCmp = DAG.getSetCC(dl,
TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
@@ -3140,7 +3157,7 @@ void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) {
// count type has enough bits to represent any shift value, truncate
// it now. This is a common case and it exposes the truncate to
// optimization early.
- else if (ShiftSize >= Log2_32_Ceil(Op2.getValueSizeInBits()))
+ else if (ShiftSize >= Log2_32_Ceil(Op1.getValueSizeInBits()))
Op2 = DAG.getNode(ISD::TRUNCATE, DL, ShiftTy, Op2);
// Otherwise we'll need to temporarily settle for some other convenient
// type. Type legalization will make adjustments once the shiftee is split.
@@ -4057,8 +4074,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
Type *Ty = I.getType();
Align Alignment = I.getAlign();
- AAMDNodes AAInfo;
- I.getAAMetadata(AAInfo);
+ AAMDNodes AAInfo = I.getAAMetadata();
const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
SmallVector<EVT, 4> ValueVTs, MemVTs;
@@ -4185,13 +4201,11 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) {
const Value *SV = I.getOperand(0);
Type *Ty = I.getType();
- AAMDNodes AAInfo;
- I.getAAMetadata(AAInfo);
assert(
(!AA ||
!AA->pointsToConstantMemory(MemoryLocation(
SV, LocationSize::precise(DAG.getDataLayout().getTypeStoreSize(Ty)),
- AAInfo))) &&
+ I.getAAMetadata()))) &&
"load_from_swift_error should not be constant memory");
SmallVector<EVT, 4> ValueVTs;
@@ -4249,8 +4263,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues));
SDLoc dl = getCurSDLoc();
Align Alignment = I.getAlign();
- AAMDNodes AAInfo;
- I.getAAMetadata(AAInfo);
+ AAMDNodes AAInfo = I.getAAMetadata();
auto MMOFlags = TLI.getStoreMemOperandFlags(I, DAG.getDataLayout());
@@ -4321,14 +4334,11 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
if (!Alignment)
Alignment = DAG.getEVTAlign(VT);
- AAMDNodes AAInfo;
- I.getAAMetadata(AAInfo);
-
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore,
// TODO: Make MachineMemOperands aware of scalable
// vectors.
- VT.getStoreSize().getKnownMinSize(), *Alignment, AAInfo);
+ VT.getStoreSize().getKnownMinSize(), *Alignment, I.getAAMetadata());
SDValue StoreNode =
DAG.getMaskedStore(getMemoryRoot(), sdl, Src0, Ptr, Offset, Mask, VT, MMO,
ISD::UNINDEXED, false /* Truncating */, IsCompressing);
@@ -4358,7 +4368,7 @@ static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const DataLayout &DL = DAG.getDataLayout();
- assert(Ptr->getType()->isVectorTy() && "Uexpected pointer type");
+ assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
// Handle splat constant pointer.
if (auto *C = dyn_cast<Constant>(Ptr)) {
@@ -4412,9 +4422,6 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
.getValueOr(DAG.getEVTAlign(VT.getScalarType()));
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- AAMDNodes AAInfo;
- I.getAAMetadata(AAInfo);
-
SDValue Base;
SDValue Index;
ISD::MemIndexType IndexType;
@@ -4427,7 +4434,7 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
MachinePointerInfo(AS), MachineMemOperand::MOStore,
// TODO: Make MachineMemOperands aware of scalable
// vectors.
- MemoryLocation::UnknownSize, Alignment, AAInfo);
+ MemoryLocation::UnknownSize, Alignment, I.getAAMetadata());
if (!UniformBase) {
Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);
@@ -4485,8 +4492,7 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
if (!Alignment)
Alignment = DAG.getEVTAlign(VT);
- AAMDNodes AAInfo;
- I.getAAMetadata(AAInfo);
+ AAMDNodes AAInfo = I.getAAMetadata();
const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
// Do not serialize masked loads of constant memory with anything.
@@ -4529,8 +4535,6 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
->getMaybeAlignValue()
.getValueOr(DAG.getEVTAlign(VT.getScalarType()));
- AAMDNodes AAInfo;
- I.getAAMetadata(AAInfo);
const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
SDValue Root = DAG.getRoot();
@@ -4545,7 +4549,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
MachinePointerInfo(AS), MachineMemOperand::MOLoad,
// TODO: Make MachineMemOperands aware of scalable
// vectors.
- MemoryLocation::UnknownSize, Alignment, AAInfo, Ranges);
+ MemoryLocation::UnknownSize, Alignment, I.getAAMetadata(), Ranges);
if (!UniformBase) {
Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
@@ -4786,7 +4790,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
TLI.getPointerTy(DAG.getDataLayout())));
// Add all operands of the call to the operand list.
- for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
+ for (unsigned i = 0, e = I.arg_size(); i != e; ++i) {
const Value *Arg = I.getArgOperand(i);
if (!I.paramHasAttr(i, Attribute::ImmArg)) {
Ops.push_back(getValue(Arg));
@@ -4823,12 +4827,11 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
SDValue Result;
if (IsTgtIntrinsic) {
// This is target intrinsic that touches memory
- AAMDNodes AAInfo;
- I.getAAMetadata(AAInfo);
Result =
DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops, Info.memVT,
MachinePointerInfo(Info.ptrVal, Info.offset),
- Info.align, Info.flags, Info.size, AAInfo);
+ Info.align, Info.flags, Info.size,
+ I.getAAMetadata());
} else if (!HasChain) {
Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
} else if (!I.getType()->isVoidTy()) {
@@ -5510,12 +5513,12 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
// we've been asked to pursue.
auto MakeVRegDbgValue = [&](Register Reg, DIExpression *FragExpr,
bool Indirect) {
- if (Reg.isVirtual() && TM.Options.ValueTrackingVariableLocations) {
+ if (Reg.isVirtual() && MF.useDebugInstrRef()) {
// For VRegs, in instruction referencing mode, create a DBG_INSTR_REF
// pointing at the VReg, which will be patched up later.
auto &Inst = TII->get(TargetOpcode::DBG_INSTR_REF);
auto MIB = BuildMI(MF, DL, Inst);
- MIB.addReg(Reg, RegState::Debug);
+ MIB.addReg(Reg);
MIB.addImm(0);
MIB.addMetadata(Variable);
auto *NewDIExpr = FragExpr;
@@ -5637,7 +5640,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
auto splitMultiRegDbgValue = [&](ArrayRef<std::pair<unsigned, TypeSize>>
SplitRegs) {
unsigned Offset = 0;
- for (auto RegAndSize : SplitRegs) {
+ for (const auto &RegAndSize : SplitRegs) {
// If the expression is already a fragment, the current register
// offset+size might extend beyond the fragment. In this case, only
// the register bits that are inside the fragment are relevant.
@@ -5866,12 +5869,11 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
// FIXME: Support passing different dest/src alignments to the memcpy DAG
// node.
SDValue Root = isVol ? getRoot() : getMemoryRoot();
- AAMDNodes AAInfo;
- I.getAAMetadata(AAInfo);
SDValue MC = DAG.getMemcpy(Root, sdl, Op1, Op2, Op3, Alignment, isVol,
/* AlwaysInline */ false, isTC,
MachinePointerInfo(I.getArgOperand(0)),
- MachinePointerInfo(I.getArgOperand(1)), AAInfo);
+ MachinePointerInfo(I.getArgOperand(1)),
+ I.getAAMetadata());
updateDAGForMaybeTailCall(MC);
return;
}
@@ -5889,12 +5891,11 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
// FIXME: Support passing different dest/src alignments to the memcpy DAG
// node.
- AAMDNodes AAInfo;
- I.getAAMetadata(AAInfo);
SDValue MC = DAG.getMemcpy(getRoot(), sdl, Dst, Src, Size, Alignment, isVol,
/* AlwaysInline */ true, isTC,
MachinePointerInfo(I.getArgOperand(0)),
- MachinePointerInfo(I.getArgOperand(1)), AAInfo);
+ MachinePointerInfo(I.getArgOperand(1)),
+ I.getAAMetadata());
updateDAGForMaybeTailCall(MC);
return;
}
@@ -5908,10 +5909,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
bool isVol = MSI.isVolatile();
bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
SDValue Root = isVol ? getRoot() : getMemoryRoot();
- AAMDNodes AAInfo;
- I.getAAMetadata(AAInfo);
SDValue MS = DAG.getMemset(Root, sdl, Op1, Op2, Op3, Alignment, isVol, isTC,
- MachinePointerInfo(I.getArgOperand(0)), AAInfo);
+ MachinePointerInfo(I.getArgOperand(0)),
+ I.getAAMetadata());
updateDAGForMaybeTailCall(MS);
return;
}
@@ -5929,11 +5929,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
// FIXME: Support passing different dest/src alignments to the memmove DAG
// node.
SDValue Root = isVol ? getRoot() : getMemoryRoot();
- AAMDNodes AAInfo;
- I.getAAMetadata(AAInfo);
SDValue MM = DAG.getMemmove(Root, sdl, Op1, Op2, Op3, Alignment, isVol,
isTC, MachinePointerInfo(I.getArgOperand(0)),
- MachinePointerInfo(I.getArgOperand(1)), AAInfo);
+ MachinePointerInfo(I.getArgOperand(1)),
+ I.getAAMetadata());
updateDAGForMaybeTailCall(MM);
return;
}
@@ -6124,7 +6123,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
if (Values.empty())
return;
- if (std::count(Values.begin(), Values.end(), nullptr))
+ if (llvm::is_contained(Values, nullptr))
return;
bool IsVariadic = DI.hasArgList();
@@ -6706,9 +6705,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
case Intrinsic::debugtrap:
case Intrinsic::trap: {
StringRef TrapFuncName =
- I.getAttributes()
- .getAttribute(AttributeList::FunctionIndex, "trap-func-name")
- .getValueAsString();
+ I.getAttributes().getFnAttr("trap-func-name").getValueAsString();
if (TrapFuncName.empty()) {
switch (Intrinsic) {
case Intrinsic::trap:
@@ -6888,7 +6885,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
// Directly emit some LOCAL_ESCAPE machine instrs. Label assignment emission
// is the same on all targets.
- for (unsigned Idx = 0, E = I.getNumArgOperands(); Idx < E; ++Idx) {
+ for (unsigned Idx = 0, E = I.arg_size(); Idx < E; ++Idx) {
Value *Arg = I.getArgOperand(Idx)->stripPointerCasts();
if (isa<ConstantPointerNull>(Arg))
continue; // Skip null pointers. They represent a hole in index space.
@@ -7058,7 +7055,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
};
SmallVector<BranchFunnelTarget, 8> Targets;
- for (unsigned Op = 1, N = I.getNumArgOperands(); Op != N; Op += 2) {
+ for (unsigned Op = 1, N = I.arg_size(); Op != N; Op += 2) {
auto *ElemBase = dyn_cast<GlobalObject>(GetPointerBaseWithConstantOffset(
I.getArgOperand(Op), Offset, DAG.getDataLayout()));
if (ElemBase != Base)
@@ -7327,9 +7324,128 @@ static unsigned getISDForVPIntrinsic(const VPIntrinsic &VPIntrin) {
llvm_unreachable(
"Inconsistency: no SDNode available for this VPIntrinsic!");
+ if (*ResOPC == ISD::VP_REDUCE_SEQ_FADD ||
+ *ResOPC == ISD::VP_REDUCE_SEQ_FMUL) {
+ if (VPIntrin.getFastMathFlags().allowReassoc())
+ return *ResOPC == ISD::VP_REDUCE_SEQ_FADD ? ISD::VP_REDUCE_FADD
+ : ISD::VP_REDUCE_FMUL;
+ }
+
return ResOPC.getValue();
}
+void SelectionDAGBuilder::visitVPLoadGather(const VPIntrinsic &VPIntrin, EVT VT,
+ SmallVector<SDValue, 7> &OpValues,
+ bool isGather) {
+ SDLoc DL = getCurSDLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ Value *PtrOperand = VPIntrin.getArgOperand(0);
+ MaybeAlign Alignment = DAG.getEVTAlign(VT);
+ AAMDNodes AAInfo = VPIntrin.getAAMetadata();
+ const MDNode *Ranges = VPIntrin.getMetadata(LLVMContext::MD_range);
+ SDValue LD;
+ bool AddToChain = true;
+ if (!isGather) {
+ // Do not serialize variable-length loads of constant memory with
+ // anything.
+ MemoryLocation ML;
+ if (VT.isScalableVector())
+ ML = MemoryLocation::getAfter(PtrOperand);
+ else
+ ML = MemoryLocation(
+ PtrOperand,
+ LocationSize::precise(
+ DAG.getDataLayout().getTypeStoreSize(VPIntrin.getType())),
+ AAInfo);
+ AddToChain = !AA || !AA->pointsToConstantMemory(ML);
+ SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad,
+ VT.getStoreSize().getKnownMinSize(), *Alignment, AAInfo, Ranges);
+ LD = DAG.getLoadVP(VT, DL, InChain, OpValues[0], OpValues[1], OpValues[2],
+ MMO, false /*IsExpanding */);
+ } else {
+ unsigned AS =
+ PtrOperand->getType()->getScalarType()->getPointerAddressSpace();
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(AS), MachineMemOperand::MOLoad,
+ MemoryLocation::UnknownSize, *Alignment, AAInfo, Ranges);
+ SDValue Base, Index, Scale;
+ ISD::MemIndexType IndexType;
+ bool UniformBase = getUniformBase(PtrOperand, Base, Index, IndexType, Scale,
+ this, VPIntrin.getParent());
+ if (!UniformBase) {
+ Base = DAG.getConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout()));
+ Index = getValue(PtrOperand);
+ IndexType = ISD::SIGNED_UNSCALED;
+ Scale =
+ DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout()));
+ }
+ EVT IdxVT = Index.getValueType();
+ EVT EltTy = IdxVT.getVectorElementType();
+ if (TLI.shouldExtendGSIndex(IdxVT, EltTy)) {
+ EVT NewIdxVT = IdxVT.changeVectorElementType(EltTy);
+ Index = DAG.getNode(ISD::SIGN_EXTEND, DL, NewIdxVT, Index);
+ }
+ LD = DAG.getGatherVP(
+ DAG.getVTList(VT, MVT::Other), VT, DL,
+ {DAG.getRoot(), Base, Index, Scale, OpValues[1], OpValues[2]}, MMO,
+ IndexType);
+ }
+ if (AddToChain)
+ PendingLoads.push_back(LD.getValue(1));
+ setValue(&VPIntrin, LD);
+}
+
+void SelectionDAGBuilder::visitVPStoreScatter(const VPIntrinsic &VPIntrin,
+ SmallVector<SDValue, 7> &OpValues,
+ bool isScatter) {
+ SDLoc DL = getCurSDLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ Value *PtrOperand = VPIntrin.getArgOperand(1);
+ EVT VT = OpValues[0].getValueType();
+ MaybeAlign Alignment = DAG.getEVTAlign(VT);
+ AAMDNodes AAInfo = VPIntrin.getAAMetadata();
+ SDValue ST;
+ if (!isScatter) {
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore,
+ VT.getStoreSize().getKnownMinSize(), *Alignment, AAInfo);
+ ST =
+ DAG.getStoreVP(getMemoryRoot(), DL, OpValues[0], OpValues[1],
+ OpValues[2], OpValues[3], MMO, false /* IsTruncating */);
+ } else {
+ unsigned AS =
+ PtrOperand->getType()->getScalarType()->getPointerAddressSpace();
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(AS), MachineMemOperand::MOStore,
+ MemoryLocation::UnknownSize, *Alignment, AAInfo);
+ SDValue Base, Index, Scale;
+ ISD::MemIndexType IndexType;
+ bool UniformBase = getUniformBase(PtrOperand, Base, Index, IndexType, Scale,
+ this, VPIntrin.getParent());
+ if (!UniformBase) {
+ Base = DAG.getConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout()));
+ Index = getValue(PtrOperand);
+ IndexType = ISD::SIGNED_UNSCALED;
+ Scale =
+ DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout()));
+ }
+ EVT IdxVT = Index.getValueType();
+ EVT EltTy = IdxVT.getVectorElementType();
+ if (TLI.shouldExtendGSIndex(IdxVT, EltTy)) {
+ EVT NewIdxVT = IdxVT.changeVectorElementType(EltTy);
+ Index = DAG.getNode(ISD::SIGN_EXTEND, DL, NewIdxVT, Index);
+ }
+ ST = DAG.getScatterVP(DAG.getVTList(MVT::Other), VT, DL,
+ {getMemoryRoot(), OpValues[0], Base, Index, Scale,
+ OpValues[2], OpValues[3]},
+ MMO, IndexType);
+ }
+ DAG.setRoot(ST);
+ setValue(&VPIntrin, ST);
+}
+
void SelectionDAGBuilder::visitVectorPredicationIntrinsic(
const VPIntrinsic &VPIntrin) {
SDLoc DL = getCurSDLoc();
@@ -7349,15 +7465,29 @@ void SelectionDAGBuilder::visitVectorPredicationIntrinsic(
// Request operands.
SmallVector<SDValue, 7> OpValues;
- for (unsigned I = 0; I < VPIntrin.getNumArgOperands(); ++I) {
+ for (unsigned I = 0; I < VPIntrin.arg_size(); ++I) {
auto Op = getValue(VPIntrin.getArgOperand(I));
if (I == EVLParamPos)
Op = DAG.getNode(ISD::ZERO_EXTEND, DL, EVLParamVT, Op);
OpValues.push_back(Op);
}
- SDValue Result = DAG.getNode(Opcode, DL, VTs, OpValues);
- setValue(&VPIntrin, Result);
+ switch (Opcode) {
+ default: {
+ SDValue Result = DAG.getNode(Opcode, DL, VTs, OpValues);
+ setValue(&VPIntrin, Result);
+ break;
+ }
+ case ISD::VP_LOAD:
+ case ISD::VP_GATHER:
+ visitVPLoadGather(VPIntrin, ValueVTs[0], OpValues,
+ Opcode == ISD::VP_GATHER);
+ break;
+ case ISD::VP_STORE:
+ case ISD::VP_SCATTER:
+ visitVPStoreScatter(VPIntrin, OpValues, Opcode == ISD::VP_SCATTER);
+ break;
+ }
}
SDValue SelectionDAGBuilder::lowerStartEH(SDValue Chain,
@@ -7760,12 +7890,11 @@ bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) {
// because the return pointer needs to be adjusted by the size of
// the copied memory.
SDValue Root = isVol ? getRoot() : getMemoryRoot();
- AAMDNodes AAInfo;
- I.getAAMetadata(AAInfo);
SDValue MC = DAG.getMemcpy(Root, sdl, Dst, Src, Size, Alignment, isVol, false,
/*isTailCall=*/false,
MachinePointerInfo(I.getArgOperand(0)),
- MachinePointerInfo(I.getArgOperand(1)), AAInfo);
+ MachinePointerInfo(I.getArgOperand(1)),
+ I.getAAMetadata());
assert(MC.getNode() != nullptr &&
"** memcpy should not be lowered as TailCall in mempcpy context **");
DAG.setRoot(MC);
@@ -7918,6 +8047,8 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
}
if (Function *F = I.getCalledFunction()) {
+ diagnoseDontCall(I);
+
if (F->isDeclaration()) {
// Is this an LLVM intrinsic or a target-specific intrinsic?
unsigned IID = F->getIntrinsicID();
@@ -8176,7 +8307,7 @@ public:
}
}
- return TLI.getValueType(DL, OpTy, true);
+ return TLI.getAsmOperandValueType(DL, OpTy, true);
}
};
@@ -8261,9 +8392,10 @@ static SDValue getAddressForMemoryInput(SDValue Chain, const SDLoc &Location,
///
/// OpInfo describes the operand
/// RefOpInfo describes the matching operand if any, the operand otherwise
-static void GetRegistersForValue(SelectionDAG &DAG, const SDLoc &DL,
- SDISelAsmOperandInfo &OpInfo,
- SDISelAsmOperandInfo &RefOpInfo) {
+static llvm::Optional<unsigned>
+getRegistersForValue(SelectionDAG &DAG, const SDLoc &DL,
+ SDISelAsmOperandInfo &OpInfo,
+ SDISelAsmOperandInfo &RefOpInfo) {
LLVMContext &Context = *DAG.getContext();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -8273,7 +8405,7 @@ static void GetRegistersForValue(SelectionDAG &DAG, const SDLoc &DL,
// No work to do for memory operations.
if (OpInfo.ConstraintType == TargetLowering::C_Memory)
- return;
+ return None;
// If this is a constraint for a single physreg, or a constraint for a
// register class, find it.
@@ -8283,7 +8415,7 @@ static void GetRegistersForValue(SelectionDAG &DAG, const SDLoc &DL,
&TRI, RefOpInfo.ConstraintCode, RefOpInfo.ConstraintVT);
// RC is unset only on failure. Return immediately.
if (!RC)
- return;
+ return None;
// Get the actual register value type. This is important, because the user
// may have asked for (e.g.) the AX register in i32 type. We need to
@@ -8328,7 +8460,7 @@ static void GetRegistersForValue(SelectionDAG &DAG, const SDLoc &DL,
// No need to allocate a matching input constraint since the constraint it's
// matching to has already been allocated.
if (OpInfo.isMatchingInputConstraint())
- return;
+ return None;
EVT ValueVT = OpInfo.ConstraintVT;
if (OpInfo.ConstraintVT == MVT::Other)
@@ -8351,8 +8483,12 @@ static void GetRegistersForValue(SelectionDAG &DAG, const SDLoc &DL,
// Do not check for single registers.
if (AssignedReg) {
- for (; *I != AssignedReg; ++I)
- assert(I != RC->end() && "AssignedReg should be member of RC");
+ I = std::find(I, RC->end(), AssignedReg);
+ if (I == RC->end()) {
+ // RC does not contain the selected register, which indicates a
+ // mismatch between the register and the required type/bitwidth.
+ return {AssignedReg};
+ }
}
for (; NumRegs; --NumRegs, ++I) {
@@ -8362,6 +8498,7 @@ static void GetRegistersForValue(SelectionDAG &DAG, const SDLoc &DL,
}
OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT);
+ return None;
}
static unsigned
@@ -8452,12 +8589,12 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
// Process the call argument. BasicBlocks are labels, currently appearing
// only in asm's.
if (isa<CallBrInst>(Call) &&
- ArgNo - 1 >= (cast<CallBrInst>(&Call)->getNumArgOperands() -
+ ArgNo - 1 >= (cast<CallBrInst>(&Call)->arg_size() -
cast<CallBrInst>(&Call)->getNumIndirectDests() -
NumMatchingOps) &&
(NumMatchingOps == 0 ||
- ArgNo - 1 < (cast<CallBrInst>(&Call)->getNumArgOperands() -
- NumMatchingOps))) {
+ ArgNo - 1 <
+ (cast<CallBrInst>(&Call)->arg_size() - NumMatchingOps))) {
const auto *BA = cast<BlockAddress>(OpInfo.CallOperandVal);
EVT VT = TLI.getValueType(DAG.getDataLayout(), BA->getType(), true);
OpInfo.CallOperand = DAG.getTargetBlockAddress(BA, VT);
@@ -8479,8 +8616,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
DAG.getDataLayout(), STy->getElementType(ResNo));
} else {
assert(ResNo == 0 && "Asm only has one result!");
- OpInfo.ConstraintVT =
- TLI.getSimpleValueType(DAG.getDataLayout(), Call.getType());
+ OpInfo.ConstraintVT = TLI.getAsmOperandValueType(
+ DAG.getDataLayout(), Call.getType()).getSimpleVT();
}
++ResNo;
} else {
@@ -8595,7 +8732,18 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
OpInfo.isMatchingInputConstraint()
? ConstraintOperands[OpInfo.getMatchedOperand()]
: OpInfo;
- GetRegistersForValue(DAG, getCurSDLoc(), OpInfo, RefOpInfo);
+ const auto RegError =
+ getRegistersForValue(DAG, getCurSDLoc(), OpInfo, RefOpInfo);
+ if (RegError.hasValue()) {
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const char *RegName = TRI.getName(RegError.getValue());
+ emitInlineAsmError(Call, "register '" + Twine(RegName) +
+ "' allocated for constraint '" +
+ Twine(OpInfo.ConstraintCode) +
+ "' does not match required type");
+ return;
+ }
auto DetectWriteToReservedRegister = [&]() {
const MachineFunction &MF = DAG.getMachineFunction();
@@ -8674,11 +8822,13 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
- RegisterSDNode *R = dyn_cast<RegisterSDNode>(AsmNodeOperands[CurOp+1]);
+ auto *R = cast<RegisterSDNode>(AsmNodeOperands[CurOp+1]);
Register TiedReg = R->getReg();
MVT RegVT = R->getSimpleValueType(0);
- const TargetRegisterClass *RC = TiedReg.isVirtual() ?
- MRI.getRegClass(TiedReg) : TRI.getMinimalPhysRegClass(TiedReg);
+ const TargetRegisterClass *RC =
+ TiedReg.isVirtual() ? MRI.getRegClass(TiedReg)
+ : RegVT != MVT::Untyped ? TLI.getRegClassFor(RegVT)
+ : TRI.getMinimalPhysRegClass(TiedReg);
unsigned NumRegs = InlineAsm::getNumOperandRegisters(OpFlag);
for (unsigned i = 0; i != NumRegs; ++i)
Regs.push_back(MRI.createVirtualRegister(RC));
@@ -9317,7 +9467,7 @@ void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2;
- if (I.getNumArgOperands() > 1)
+ if (I.arg_size() > 1)
Op2 = getValue(I.getArgOperand(1));
SDLoc dl = getCurSDLoc();
EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
@@ -9671,9 +9821,10 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
// if it isn't first piece, alignment must be 1
// For scalable vectors the scalable part is currently handled
// by individual targets, so we just use the known minimum size here.
- ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(), VT,
- i < CLI.NumFixedArgs, i,
- j*Parts[j].getValueType().getStoreSize().getKnownMinSize());
+ ISD::OutputArg MyFlags(
+ Flags, Parts[j].getValueType().getSimpleVT(), VT,
+ i < CLI.NumFixedArgs, i,
+ j * Parts[j].getValueType().getStoreSize().getKnownMinSize());
if (NumParts > 1 && j == 0)
MyFlags.Flags.setSplit();
else if (j != 0) {
@@ -9841,10 +9992,10 @@ SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) {
None); // This is not an ABI copy.
SDValue Chain = DAG.getEntryNode();
- ISD::NodeType ExtendType = (FuncInfo.PreferredExtendType.find(V) ==
- FuncInfo.PreferredExtendType.end())
- ? ISD::ANY_EXTEND
- : FuncInfo.PreferredExtendType[V];
+ ISD::NodeType ExtendType = ISD::ANY_EXTEND;
+ auto PreferredExtendIt = FuncInfo.PreferredExtendType.find(V);
+ if (PreferredExtendIt != FuncInfo.PreferredExtendType.end())
+ ExtendType = PreferredExtendIt->second;
RFV.getCopyToRegs(Op, DAG, getCurSDLoc(), Chain, nullptr, V, ExtendType);
PendingExports.push_back(Chain);
}
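The lookup rewritten above follows a common pattern: a single find() both tests for the key and yields the value, and it avoids operator[], which would insert a default-constructed entry on a miss. A minimal stand-alone sketch, using std::map in place of the map kept by FunctionLoweringInfo:

    #include <cassert>
    #include <map>
    #include <string>

    // One lookup, no accidental insertion on a miss.
    int lookupOr(const std::map<std::string, int> &M, const std::string &K,
                 int Default) {
      auto It = M.find(K);
      return It != M.end() ? It->second : Default;
    }

    int main() {
      std::map<std::string, int> M{{"x", 3}};
      assert(lookupOr(M, "x", -1) == 3);
      assert(lookupOr(M, "y", -1) == -1);
      assert(M.size() == 1); // operator[] would have grown the map here
    }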
@@ -10490,27 +10641,6 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
ConstantsOut.clear();
}
-/// Add a successor MBB to ParentMBB< creating a new MachineBB for BB if SuccMBB
-/// is 0.
-MachineBasicBlock *
-SelectionDAGBuilder::StackProtectorDescriptor::
-AddSuccessorMBB(const BasicBlock *BB,
- MachineBasicBlock *ParentMBB,
- bool IsLikely,
- MachineBasicBlock *SuccMBB) {
- // If SuccBB has not been created yet, create it.
- if (!SuccMBB) {
- MachineFunction *MF = ParentMBB->getParent();
- MachineFunction::iterator BBI(ParentMBB);
- SuccMBB = MF->CreateMachineBasicBlock(BB);
- MF->insert(++BBI, SuccMBB);
- }
- // Add it as a successor of ParentMBB.
- ParentMBB->addSuccessor(
- SuccMBB, BranchProbabilityInfo::getBranchProbStackProtector(IsLikely));
- return SuccMBB;
-}
-
MachineBasicBlock *SelectionDAGBuilder::NextBlock(MachineBasicBlock *MBB) {
MachineFunction::iterator I(MBB);
if (++I == FuncInfo.MF->end())
@@ -10675,12 +10805,10 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond,
}
}
- if (FallthroughUnreachable) {
- // Skip the range check if the fallthrough block is unreachable.
- JTH->OmitRangeCheck = true;
- }
+ if (FallthroughUnreachable)
+ JTH->FallthroughUnreachable = true;
- if (!JTH->OmitRangeCheck)
+ if (!JTH->FallthroughUnreachable)
addSuccessorWithProb(CurMBB, Fallthrough, FallthroughProb);
addSuccessorWithProb(CurMBB, JumpMBB, JumpProb);
CurMBB->normalizeSuccProbs();
@@ -10718,10 +10846,8 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond,
BTB->DefaultProb -= DefaultProb / 2;
}
- if (FallthroughUnreachable) {
- // Skip the range check if the fallthrough block is unreachable.
- BTB->OmitRangeCheck = true;
- }
+ if (FallthroughUnreachable)
+ BTB->FallthroughUnreachable = true;
// If we're in the right place, emit the bit test header right now.
if (CurMBB == SwitchMBB) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index df5be156821f..d6122aa0a739 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -18,6 +18,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CodeGenCommonISel.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SwitchLoweringUtils.h"
@@ -180,204 +181,6 @@ private:
SwitchCG::CaseClusterVector &Clusters,
BranchProbability &PeeledCaseProb);
- /// A class which encapsulates all of the information needed to generate a
- /// stack protector check and signals to isel via its state being initialized
- /// that a stack protector needs to be generated.
- ///
- /// *NOTE* The following is a high level documentation of SelectionDAG Stack
- /// Protector Generation. The reason that it is placed here is for a lack of
- /// other good places to stick it.
- ///
- /// High Level Overview of SelectionDAG Stack Protector Generation:
- ///
- /// Previously, generation of stack protectors was done exclusively in the
- /// pre-SelectionDAG Codegen LLVM IR Pass "Stack Protector". This necessitated
- /// splitting basic blocks at the IR level to create the success/failure basic
- /// blocks in the tail of the basic block in question. As a result of this,
- /// calls that would have qualified for the sibling call optimization were no
- /// longer eligible for optimization since said calls were no longer right in
- /// the "tail position" (i.e. the immediate predecessor of a ReturnInst
- /// instruction).
- ///
- /// Then it was noticed that since the sibling call optimization causes the
- /// callee to reuse the caller's stack, if we could delay the generation of
- /// the stack protector check until later in CodeGen after the sibling call
- /// decision was made, we get both the tail call optimization and the stack
- /// protector check!
- ///
- /// A few goals in solving this problem were:
- ///
- /// 1. Preserve the architecture independence of stack protector generation.
- ///
- /// 2. Preserve the normal IR level stack protector check for platforms like
- /// OpenBSD for which we support platform-specific stack protector
- /// generation.
- ///
- /// The main problem that guided the present solution is that one can not
- /// solve this problem in an architecture independent manner at the IR level
- /// only. This is because:
- ///
- /// 1. The decision on whether or not to perform a sibling call on certain
- /// platforms (for instance i386) requires lower level information
- /// related to available registers that can not be known at the IR level.
- ///
- /// 2. Even if the previous point were not true, the decision on whether to
- /// perform a tail call is done in LowerCallTo in SelectionDAG which
- /// occurs after the Stack Protector Pass. As a result, one would need to
- /// put the relevant callinst into the stack protector check success
- /// basic block (where the return inst is placed) and then move it back
- /// later at SelectionDAG/MI time before the stack protector check if the
- /// tail call optimization failed. The MI level option was nixed
- /// immediately since it would require platform-specific pattern
- /// matching. The SelectionDAG level option was nixed because
- /// SelectionDAG only processes one IR level basic block at a time
- /// implying one could not create a DAG Combine to move the callinst.
- ///
- /// To get around this problem a few things were realized:
- ///
- /// 1. While one can not handle multiple IR level basic blocks at the
- /// SelectionDAG Level, one can generate multiple machine basic blocks
- /// for one IR level basic block. This is how we handle bit tests and
- /// switches.
- ///
- /// 2. At the MI level, tail calls are represented via a special return
- /// MIInst called "tcreturn". Thus if we know the basic block in which we
- /// wish to insert the stack protector check, we get the correct behavior
- /// by always inserting the stack protector check right before the return
- /// statement. This is a "magical transformation" since no matter where
- /// the stack protector check intrinsic is, we always insert the stack
- /// protector check code at the end of the BB.
- ///
- /// Given the aforementioned constraints, the following solution was devised:
- ///
- /// 1. On platforms that do not support SelectionDAG stack protector check
- /// generation, allow for the normal IR level stack protector check
- /// generation to continue.
- ///
- /// 2. On platforms that do support SelectionDAG stack protector check
- /// generation:
- ///
- /// a. Use the IR level stack protector pass to decide if a stack
- /// protector is required/which BB we insert the stack protector check
- /// in by reusing the logic already therein. If we wish to generate a
- /// stack protector check in a basic block, we place a special IR
- /// intrinsic called llvm.stackprotectorcheck right before the BB's
- /// returninst or if there is a callinst that could potentially be
- /// sibling call optimized, before the call inst.
- ///
- /// b. Then when a BB with said intrinsic is processed, we codegen the BB
- /// normally via SelectBasicBlock. In said process, when we visit the
- /// stack protector check, we do not actually emit anything into the
- /// BB. Instead, we just initialize the stack protector descriptor
- /// class (which involves stashing information/creating the success
- /// mbbb and the failure mbb if we have not created one for this
- /// function yet) and export the guard variable that we are going to
- /// compare.
- ///
- /// c. After we finish selecting the basic block, in FinishBasicBlock if
- /// the StackProtectorDescriptor attached to the SelectionDAGBuilder is
- /// initialized, we produce the validation code with one of these
- /// techniques:
- /// 1) with a call to a guard check function
- /// 2) with inlined instrumentation
- ///
- /// 1) We insert a call to the check function before the terminator.
- ///
- /// 2) We first find a splice point in the parent basic block
- /// before the terminator and then splice the terminator of said basic
- /// block into the success basic block. Then we code-gen a new tail for
- /// the parent basic block consisting of the two loads, the comparison,
- /// and finally two branches to the success/failure basic blocks. We
- /// conclude by code-gening the failure basic block if we have not
- /// code-gened it already (all stack protector checks we generate in
- /// the same function, use the same failure basic block).
- class StackProtectorDescriptor {
- public:
- StackProtectorDescriptor() = default;
-
- /// Returns true if all fields of the stack protector descriptor are
- /// initialized implying that we should/are ready to emit a stack protector.
- bool shouldEmitStackProtector() const {
- return ParentMBB && SuccessMBB && FailureMBB;
- }
-
- bool shouldEmitFunctionBasedCheckStackProtector() const {
- return ParentMBB && !SuccessMBB && !FailureMBB;
- }
-
- /// Initialize the stack protector descriptor structure for a new basic
- /// block.
- void initialize(const BasicBlock *BB, MachineBasicBlock *MBB,
- bool FunctionBasedInstrumentation) {
- // Make sure we are not initialized yet.
- assert(!shouldEmitStackProtector() && "Stack Protector Descriptor is "
- "already initialized!");
- ParentMBB = MBB;
- if (!FunctionBasedInstrumentation) {
- SuccessMBB = AddSuccessorMBB(BB, MBB, /* IsLikely */ true);
- FailureMBB = AddSuccessorMBB(BB, MBB, /* IsLikely */ false, FailureMBB);
- }
- }
-
- /// Reset state that changes when we handle different basic blocks.
- ///
- /// This currently includes:
- ///
- /// 1. The specific basic block we are generating a
- /// stack protector for (ParentMBB).
- ///
- /// 2. The successor machine basic block that will contain the tail of
- /// parent mbb after we create the stack protector check (SuccessMBB). This
- /// BB is visited only on stack protector check success.
- void resetPerBBState() {
- ParentMBB = nullptr;
- SuccessMBB = nullptr;
- }
-
- /// Reset state that only changes when we switch functions.
- ///
- /// This currently includes:
- ///
- /// 1. FailureMBB since we reuse the failure code path for all stack
- /// protector checks created in an individual function.
- ///
- /// 2.The guard variable since the guard variable we are checking against is
- /// always the same.
- void resetPerFunctionState() {
- FailureMBB = nullptr;
- }
-
- MachineBasicBlock *getParentMBB() { return ParentMBB; }
- MachineBasicBlock *getSuccessMBB() { return SuccessMBB; }
- MachineBasicBlock *getFailureMBB() { return FailureMBB; }
-
- private:
- /// The basic block for which we are generating the stack protector.
- ///
- /// As a result of stack protector generation, we will splice the
- /// terminators of this basic block into the successor mbb SuccessMBB and
- /// replace it with a compare/branch to the successor mbbs
- /// SuccessMBB/FailureMBB depending on whether or not the stack protector
- /// was violated.
- MachineBasicBlock *ParentMBB = nullptr;
-
- /// A basic block visited on stack protector check success that contains the
- /// terminators of ParentMBB.
- MachineBasicBlock *SuccessMBB = nullptr;
-
- /// This basic block visited on stack protector check failure that will
- /// contain a call to __stack_chk_fail().
- MachineBasicBlock *FailureMBB = nullptr;
-
- /// Add a successor machine basic block to ParentMBB. If the successor mbb
- /// has not been created yet (i.e. if SuccMBB = 0), then the machine basic
- /// block will be created. Assign a large weight if IsLikely is true.
- MachineBasicBlock *AddSuccessorMBB(const BasicBlock *BB,
- MachineBasicBlock *ParentMBB,
- bool IsLikely,
- MachineBasicBlock *SuccMBB = nullptr);
- };
-
private:
const TargetMachine &TM;
@@ -764,6 +567,10 @@ private:
void visitIntrinsicCall(const CallInst &I, unsigned Intrinsic);
void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic);
void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI);
+ void visitVPLoadGather(const VPIntrinsic &VPIntrin, EVT VT,
+ SmallVector<SDValue, 7> &OpValues, bool isGather);
+ void visitVPStoreScatter(const VPIntrinsic &VPIntrin,
+ SmallVector<SDValue, 7> &OpValues, bool isScatter);
void visitVectorPredicationIntrinsic(const VPIntrinsic &VPIntrin);
void visitVAStart(const CallInst &I);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 40083c614a6c..77e9e53668f9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -146,9 +146,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
unsigned IID = cast<ConstantSDNode>(getOperand(OpNo))->getZExtValue();
if (IID < Intrinsic::num_intrinsics)
return Intrinsic::getBaseName((Intrinsic::ID)IID).str();
- else if (!G)
+ if (!G)
return "Unknown intrinsic";
- else if (const TargetIntrinsicInfo *TII = G->getTarget().getIntrinsicInfo())
+ if (const TargetIntrinsicInfo *TII = G->getTarget().getIntrinsicInfo())
return TII->getName(IID);
llvm_unreachable("Invalid intrinsic ID");
}
@@ -526,13 +526,13 @@ static void printMemOperand(raw_ostream &OS, const MachineMemOperand &MMO,
if (G) {
const MachineFunction *MF = &G->getMachineFunction();
return printMemOperand(OS, MMO, MF, MF->getFunction().getParent(),
- &MF->getFrameInfo(), G->getSubtarget().getInstrInfo(),
- *G->getContext());
- } else {
- LLVMContext Ctx;
- return printMemOperand(OS, MMO, /*MF=*/nullptr, /*M=*/nullptr,
- /*MFI=*/nullptr, /*TII=*/nullptr, Ctx);
+ &MF->getFrameInfo(),
+ G->getSubtarget().getInstrInfo(), *G->getContext());
}
+
+ LLVMContext Ctx;
+ return printMemOperand(OS, MMO, /*MF=*/nullptr, /*M=*/nullptr,
+ /*MFI=*/nullptr, /*TII=*/nullptr, Ctx);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -948,17 +948,19 @@ static bool printOperand(raw_ostream &OS, const SelectionDAG *G,
if (!Value.getNode()) {
OS << "<null>";
return false;
- } else if (shouldPrintInline(*Value.getNode(), G)) {
+ }
+
+ if (shouldPrintInline(*Value.getNode(), G)) {
OS << Value->getOperationName(G) << ':';
Value->print_types(OS, G);
Value->print_details(OS, G);
return true;
- } else {
- OS << PrintNodeId(*Value.getNode());
- if (unsigned RN = Value.getResNo())
- OS << ':' << RN;
- return false;
}
+
+ OS << PrintNodeId(*Value.getNode());
+ if (unsigned RN = Value.getResNo())
+ OS << ':' << RN;
+ return false;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1012,15 +1014,12 @@ static void printrWithDepthHelper(raw_ostream &OS, const SDNode *N,
N->print(OS, G);
- if (depth < 1)
- return;
-
for (const SDValue &Op : N->op_values()) {
// Don't follow chain operands.
if (Op.getValueType() == MVT::Other)
continue;
OS << '\n';
- printrWithDepthHelper(OS, Op.getNode(), G, depth-1, indent+2);
+ printrWithDepthHelper(OS, Op.getNode(), G, depth - 1, indent + 2);
}
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 1415cce3b1df..c7e37cf8ca14 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -33,6 +33,7 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/CodeGenCommonISel.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GCMetadata.h"
@@ -575,7 +576,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
LiveInMap.insert(LI);
// Insert DBG_VALUE instructions for function arguments to the entry block.
- bool InstrRef = TM.Options.ValueTrackingVariableLocations;
+ bool InstrRef = MF->useDebugInstrRef();
for (unsigned i = 0, e = FuncInfo->ArgDbgValues.size(); i != e; ++i) {
MachineInstr *MI = FuncInfo->ArgDbgValues[e - i - 1];
assert(MI->getOpcode() != TargetOpcode::DBG_VALUE_LIST &&
@@ -699,7 +700,7 @@ static void reportFastISelFailure(MachineFunction &MF,
R << (" (in function: " + MF.getName() + ")").str();
if (ShouldAbort)
- report_fatal_error(R.getMsg());
+ report_fatal_error(Twine(R.getMsg()));
ORE.emit(R);
}
@@ -798,7 +799,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
#ifndef NDEBUG
if (TTI.hasBranchDivergence())
- CurDAG->VerifyDAGDiverence();
+ CurDAG->VerifyDAGDivergence();
#endif
if (ViewDAGCombine1 && MatchFilterBB)
@@ -818,7 +819,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
#ifndef NDEBUG
if (TTI.hasBranchDivergence())
- CurDAG->VerifyDAGDiverence();
+ CurDAG->VerifyDAGDivergence();
#endif
// Second step, hack on the DAG until it only uses operations and types that
@@ -840,7 +841,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
#ifndef NDEBUG
if (TTI.hasBranchDivergence())
- CurDAG->VerifyDAGDiverence();
+ CurDAG->VerifyDAGDivergence();
#endif
// Only allow creation of legal node types.
@@ -864,7 +865,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
#ifndef NDEBUG
if (TTI.hasBranchDivergence())
- CurDAG->VerifyDAGDiverence();
+ CurDAG->VerifyDAGDivergence();
#endif
}
@@ -882,7 +883,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
#ifndef NDEBUG
if (TTI.hasBranchDivergence())
- CurDAG->VerifyDAGDiverence();
+ CurDAG->VerifyDAGDivergence();
#endif
{
@@ -898,7 +899,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
#ifndef NDEBUG
if (TTI.hasBranchDivergence())
- CurDAG->VerifyDAGDiverence();
+ CurDAG->VerifyDAGDivergence();
#endif
if (ViewDAGCombineLT && MatchFilterBB)
@@ -918,7 +919,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
#ifndef NDEBUG
if (TTI.hasBranchDivergence())
- CurDAG->VerifyDAGDiverence();
+ CurDAG->VerifyDAGDivergence();
#endif
}
@@ -938,7 +939,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
#ifndef NDEBUG
if (TTI.hasBranchDivergence())
- CurDAG->VerifyDAGDiverence();
+ CurDAG->VerifyDAGDivergence();
#endif
if (ViewDAGCombine2 && MatchFilterBB)
@@ -958,7 +959,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
#ifndef NDEBUG
if (TTI.hasBranchDivergence())
- CurDAG->VerifyDAGDiverence();
+ CurDAG->VerifyDAGDivergence();
#endif
if (OptLevel != CodeGenOpt::None)
@@ -1045,25 +1046,25 @@ public:
} // end anonymous namespace
// This function is used to enforce the topological node id property
-// property leveraged during Instruction selection. Before selection all
-// nodes are given a non-negative id such that all nodes have a larger id than
+// leveraged during instruction selection. Before the selection process all
+// nodes are given a non-negative id such that all nodes have a greater id than
// their operands. As this holds transitively we can prune checks that a node N
// is a predecessor of M another by not recursively checking through M's
-// operands if N's ID is larger than M's ID. This is significantly improves
-// performance of for various legality checks (e.g. IsLegalToFold /
-// UpdateChains).
-
-// However, when we fuse multiple nodes into a single node
-// during selection we may induce a predecessor relationship between inputs and
-// outputs of distinct nodes being merged violating the topological property.
-// Should a fused node have a successor which has yet to be selected, our
-// legality checks would be incorrect. To avoid this we mark all unselected
-// sucessor nodes, i.e. id != -1 as invalid for pruning by bit-negating (x =>
+// operands if N's ID is larger than M's ID. This significantly improves
+// performance of various legality checks (e.g. IsLegalToFold / UpdateChains).
+
+// However, when we fuse multiple nodes into a single node during the
+// selection we may induce a predecessor relationship between inputs and
+// outputs of distinct nodes being merged, violating the topological property.
+// Should a fused node have a successor which has yet to be selected,
+// our legality checks would be incorrect. To avoid this we mark all unselected
+// successor nodes, i.e. id != -1, as invalid for pruning by bit-negating (x =>
// (-(x+1))) the ids and modify our pruning check to ignore negative Ids of M.
// We use bit-negation to more clearly enforce that node id -1 can only be
-// achieved by selected nodes). As the conversion is reversable the original Id,
-// topological pruning can still be leveraged when looking for unselected nodes.
-// This method is call internally in all ISel replacement calls.
+// achieved by selected nodes. As the conversion is reversible to the original
+// Id, topological pruning can still be leveraged when looking for unselected
+// nodes. This method is called internally in all ISel replacement-related
+// functions.
void SelectionDAGISel::EnforceNodeIdInvariant(SDNode *Node) {
SmallVector<SDNode *, 4> Nodes;
Nodes.push_back(Node);
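A stand-alone check (illustration only) of the reversible encoding the new comment describes: bit-negation maps a non-negative id x to -(x+1), and applying it twice restores the original id, so the topological ordering is never lost:

    #include <cassert>

    int main() {
      for (int Id : {1, 7, 12345}) {
        int Marked = ~Id;            // mark as "not prunable"
        assert(Marked == -(Id + 1)); // the encoding named in the comment
        assert(~Marked == Id);       // and it is reversible
      }
    }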
@@ -1080,7 +1081,7 @@ void SelectionDAGISel::EnforceNodeIdInvariant(SDNode *Node) {
}
}
-// InvalidateNodeId - As discusses in EnforceNodeIdInvariant, mark a
+// InvalidateNodeId - As explained in EnforceNodeIdInvariant, mark a
// NodeId with the equivalent node id which is invalid for topological
// pruning.
void SelectionDAGISel::InvalidateNodeId(SDNode *N) {
@@ -1226,7 +1227,10 @@ static void mapWasmLandingPadIndex(MachineBasicBlock *MBB,
bool IsSingleCatchAllClause =
CPI->getNumArgOperands() == 1 &&
cast<Constant>(CPI->getArgOperand(0))->isNullValue();
- if (!IsSingleCatchAllClause) {
+  // catchpads for longjmp use an empty type list, e.g. catchpad within %0 [],
+  // and they don't need LSDA info
+ bool IsCatchLongjmp = CPI->getNumArgOperands() == 0;
+ if (!IsSingleCatchAllClause && !IsCatchLongjmp) {
// Create a mapping from landing pad label to landing pad index.
bool IntrFound = false;
for (const User *U : CPI->users()) {
@@ -1644,114 +1648,6 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
SDB->SPDescriptor.resetPerFunctionState();
}
-/// Given that the input MI is before a partial terminator sequence TSeq, return
-/// true if M + TSeq also a partial terminator sequence.
-///
-/// A Terminator sequence is a sequence of MachineInstrs which at this point in
-/// lowering copy vregs into physical registers, which are then passed into
-/// terminator instructors so we can satisfy ABI constraints. A partial
-/// terminator sequence is an improper subset of a terminator sequence (i.e. it
-/// may be the whole terminator sequence).
-static bool MIIsInTerminatorSequence(const MachineInstr &MI) {
- // If we do not have a copy or an implicit def, we return true if and only if
- // MI is a debug value.
- if (!MI.isCopy() && !MI.isImplicitDef())
- // Sometimes DBG_VALUE MI sneak in between the copies from the vregs to the
- // physical registers if there is debug info associated with the terminator
- // of our mbb. We want to include said debug info in our terminator
- // sequence, so we return true in that case.
- return MI.isDebugValue();
-
- // We have left the terminator sequence if we are not doing one of the
- // following:
- //
- // 1. Copying a vreg into a physical register.
- // 2. Copying a vreg into a vreg.
- // 3. Defining a register via an implicit def.
-
- // OPI should always be a register definition...
- MachineInstr::const_mop_iterator OPI = MI.operands_begin();
- if (!OPI->isReg() || !OPI->isDef())
- return false;
-
- // Defining any register via an implicit def is always ok.
- if (MI.isImplicitDef())
- return true;
-
- // Grab the copy source...
- MachineInstr::const_mop_iterator OPI2 = OPI;
- ++OPI2;
- assert(OPI2 != MI.operands_end()
- && "Should have a copy implying we should have 2 arguments.");
-
- // Make sure that the copy dest is not a vreg when the copy source is a
- // physical register.
- if (!OPI2->isReg() || (!Register::isPhysicalRegister(OPI->getReg()) &&
- Register::isPhysicalRegister(OPI2->getReg())))
- return false;
-
- return true;
-}
-
-/// Find the split point at which to splice the end of BB into its success stack
-/// protector check machine basic block.
-///
-/// On many platforms, due to ABI constraints, terminators, even before register
-/// allocation, use physical registers. This creates an issue for us since
-/// physical registers at this point can not travel across basic
-/// blocks. Luckily, selectiondag always moves physical registers into vregs
-/// when they enter functions and moves them through a sequence of copies back
-/// into the physical registers right before the terminator creating a
-/// ``Terminator Sequence''. This function is searching for the beginning of the
-/// terminator sequence so that we can ensure that we splice off not just the
-/// terminator, but additionally the copies that move the vregs into the
-/// physical registers.
-static MachineBasicBlock::iterator
-FindSplitPointForStackProtector(MachineBasicBlock *BB,
- const TargetInstrInfo &TII) {
- MachineBasicBlock::iterator SplitPoint = BB->getFirstTerminator();
- if (SplitPoint == BB->begin())
- return SplitPoint;
-
- MachineBasicBlock::iterator Start = BB->begin();
- MachineBasicBlock::iterator Previous = SplitPoint;
- --Previous;
-
- if (TII.isTailCall(*SplitPoint) &&
- Previous->getOpcode() == TII.getCallFrameDestroyOpcode()) {
- // call itself, then we must insert before the sequence even starts. For
- // example:
- // <split point>
- // ADJCALLSTACKDOWN ...
- // <Moves>
- // ADJCALLSTACKUP ...
- // TAILJMP somewhere
- // On the other hand, it could be an unrelated call in which case this tail call
- // has to register moves of its own and should be the split point. For example:
- // ADJCALLSTACKDOWN
- // CALL something_else
- // ADJCALLSTACKUP
- // <split point>
- // TAILJMP somewhere
- do {
- --Previous;
- if (Previous->isCall())
- return SplitPoint;
- } while(Previous->getOpcode() != TII.getCallFrameSetupOpcode());
-
- return Previous;
- }
-
- while (MIIsInTerminatorSequence(*Previous)) {
- SplitPoint = Previous;
- if (Previous == Start)
- break;
- --Previous;
- }
-
- return SplitPoint;
-}
-
void
SelectionDAGISel::FinishBasicBlock() {
LLVM_DEBUG(dbgs() << "Total amount of phi nodes to update: "
@@ -1781,7 +1677,7 @@ SelectionDAGISel::FinishBasicBlock() {
// Add load and check to the basicblock.
FuncInfo->MBB = ParentMBB;
FuncInfo->InsertPt =
- FindSplitPointForStackProtector(ParentMBB, *TII);
+ findSplitPointForStackProtector(ParentMBB, *TII);
SDB->visitSPDescriptorParent(SDB->SPDescriptor, ParentMBB);
CurDAG->setRoot(SDB->getRoot());
SDB->clear();
@@ -1800,7 +1696,7 @@ SelectionDAGISel::FinishBasicBlock() {
// register allocation issues caused by us splitting the parent mbb. The
// register allocator will clean up said virtual copies later on.
MachineBasicBlock::iterator SplitPoint =
- FindSplitPointForStackProtector(ParentMBB, *TII);
+ findSplitPointForStackProtector(ParentMBB, *TII);
// Splice the terminator of ParentMBB into SuccessMBB.
SuccessMBB->splice(SuccessMBB->end(), ParentMBB,
@@ -1861,9 +1757,9 @@ SelectionDAGISel::FinishBasicBlock() {
// test, and delete the last bit test.
MachineBasicBlock *NextMBB;
- if (BTB.ContiguousRange && j + 2 == ej) {
- // Second-to-last bit-test with contiguous range: fall through to the
- // target of the final bit test.
+ if ((BTB.ContiguousRange || BTB.FallthroughUnreachable) && j + 2 == ej) {
+ // Second-to-last bit-test with contiguous range or omitted range
+ // check: fall through to the target of the final bit test.
NextMBB = BTB.Cases[j + 1].TargetBB;
} else if (j + 1 == ej) {
// For the last bit test, fall through to Default.
@@ -1880,7 +1776,7 @@ SelectionDAGISel::FinishBasicBlock() {
SDB->clear();
CodeGenAndEmitDAG();
- if (BTB.ContiguousRange && j + 2 == ej) {
+ if ((BTB.ContiguousRange || BTB.FallthroughUnreachable) && j + 2 == ej) {
// Since we're not going to use the final bit test, remove it.
BTB.Cases.pop_back();
break;
@@ -3800,7 +3696,7 @@ void SelectionDAGISel::CannotYetSelect(SDNode *N) {
else
Msg << "unknown intrinsic #" << iid;
}
- report_fatal_error(Msg.str());
+ report_fatal_error(Twine(Msg.str()));
}
char SelectionDAGISel::ID = 0;
diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index a903c2401264..e2db9633bfb9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -1119,7 +1119,7 @@ void SelectionDAGBuilder::LowerCallSiteWithDeoptBundleImpl(
StatepointLoweringInfo SI(DAG);
unsigned ArgBeginIndex = Call->arg_begin() - Call->op_begin();
populateCallLoweringInfo(
- SI.CLI, Call, ArgBeginIndex, Call->getNumArgOperands(), Callee,
+ SI.CLI, Call, ArgBeginIndex, Call->arg_size(), Callee,
ForceVoidReturnTy ? Type::getVoidTy(*DAG.getContext()) : Call->getType(),
false);
if (!VarArgDisallowed)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 1c1dae8f953f..e4a69adff05b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -26,6 +26,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/DivisionByConstantInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
@@ -537,7 +538,7 @@ bool TargetLowering::ShrinkDemandedConstant(SDValue Op,
TargetLoweringOpt &TLO) const {
EVT VT = Op.getValueType();
APInt DemandedElts = VT.isVector()
- ? APInt::getAllOnesValue(VT.getVectorNumElements())
+ ? APInt::getAllOnes(VT.getVectorNumElements())
: APInt(1, 1);
return ShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO);
}
@@ -621,7 +622,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
}
APInt DemandedElts = VT.isVector()
- ? APInt::getAllOnesValue(VT.getVectorNumElements())
+ ? APInt::getAllOnes(VT.getVectorNumElements())
: APInt(1, 1);
return SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO, Depth,
AssumeSingleUse);
@@ -667,12 +668,12 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
DAG.getDataLayout().isLittleEndian()) {
unsigned Scale = NumDstEltBits / NumSrcEltBits;
unsigned NumSrcElts = SrcVT.getVectorNumElements();
- APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits);
- APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts);
+ APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits);
+ APInt DemandedSrcElts = APInt::getZero(NumSrcElts);
for (unsigned i = 0; i != Scale; ++i) {
unsigned Offset = i * NumSrcEltBits;
APInt Sub = DemandedBits.extractBits(NumSrcEltBits, Offset);
- if (!Sub.isNullValue()) {
+ if (!Sub.isZero()) {
DemandedSrcBits |= Sub;
for (unsigned j = 0; j != NumElts; ++j)
if (DemandedElts[j])
@@ -690,8 +691,8 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
DAG.getDataLayout().isLittleEndian()) {
unsigned Scale = NumSrcEltBits / NumDstEltBits;
unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
- APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits);
- APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts);
+ APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits);
+ APInt DemandedSrcElts = APInt::getZero(NumSrcElts);
for (unsigned i = 0; i != NumElts; ++i)
if (DemandedElts[i]) {
unsigned Offset = (i % Scale) * NumDstEltBits;
@@ -819,13 +820,21 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
break;
}
case ISD::INSERT_SUBVECTOR: {
- // If we don't demand the inserted subvector, return the base vector.
SDValue Vec = Op.getOperand(0);
SDValue Sub = Op.getOperand(1);
uint64_t Idx = Op.getConstantOperandVal(2);
unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
- if (DemandedElts.extractBits(NumSubElts, Idx) == 0)
+ APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
+ // If we don't demand the inserted subvector, return the base vector.
+ if (DemandedSubElts == 0)
return Vec;
+ // If this simply widens the lowest subvector, see if we can do it earlier.
+ if (Idx == 0 && Vec.isUndef()) {
+ if (SDValue NewSub = SimplifyMultipleUseDemandedBits(
+ Sub, DemandedBits, DemandedSubElts, DAG, Depth + 1))
+ return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
+ Op.getOperand(0), NewSub, Op.getOperand(2));
+ }
break;
}
case ISD::VECTOR_SHUFFLE: {
@@ -866,7 +875,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
unsigned Depth) const {
EVT VT = Op.getValueType();
APInt DemandedElts = VT.isVector()
- ? APInt::getAllOnesValue(VT.getVectorNumElements())
+ ? APInt::getAllOnes(VT.getVectorNumElements())
: APInt(1, 1);
return SimplifyMultipleUseDemandedBits(Op, DemandedBits, DemandedElts, DAG,
Depth);
@@ -875,7 +884,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
SDValue TargetLowering::SimplifyMultipleUseDemandedVectorElts(
SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG,
unsigned Depth) const {
- APInt DemandedBits = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
+ APInt DemandedBits = APInt::getAllOnes(Op.getScalarValueSizeInBits());
return SimplifyMultipleUseDemandedBits(Op, DemandedBits, DemandedElts, DAG,
Depth);
}
@@ -942,8 +951,8 @@ bool TargetLowering::SimplifyDemandedBits(
}
// If this is the root being simplified, allow it to have multiple uses,
// just set the DemandedBits/Elts to all bits.
- DemandedBits = APInt::getAllOnesValue(BitWidth);
- DemandedElts = APInt::getAllOnesValue(NumElts);
+ DemandedBits = APInt::getAllOnes(BitWidth);
+ DemandedElts = APInt::getAllOnes(NumElts);
} else if (OriginalDemandedBits == 0 || OriginalDemandedElts == 0) {
// Not demanding any bits/elts from Op.
return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
@@ -1038,7 +1047,7 @@ bool TargetLowering::SimplifyDemandedBits(
unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
APInt DemandedSrcElts = DemandedElts;
- DemandedSrcElts.insertBits(APInt::getNullValue(NumSubElts), Idx);
+ DemandedSrcElts.insertBits(APInt::getZero(NumSubElts), Idx);
KnownBits KnownSub, KnownSrc;
if (SimplifyDemandedBits(Sub, DemandedBits, DemandedSubElts, KnownSub, TLO,
@@ -1056,8 +1065,8 @@ bool TargetLowering::SimplifyDemandedBits(
Known = KnownBits::commonBits(Known, KnownSrc);
// Attempt to avoid multi-use src if we don't need anything from it.
- if (!DemandedBits.isAllOnesValue() || !DemandedSubElts.isAllOnesValue() ||
- !DemandedSrcElts.isAllOnesValue()) {
+ if (!DemandedBits.isAllOnes() || !DemandedSubElts.isAllOnes() ||
+ !DemandedSrcElts.isAllOnes()) {
SDValue NewSub = SimplifyMultipleUseDemandedBits(
Sub, DemandedBits, DemandedSubElts, TLO.DAG, Depth + 1);
SDValue NewSrc = SimplifyMultipleUseDemandedBits(
@@ -1086,7 +1095,7 @@ bool TargetLowering::SimplifyDemandedBits(
return true;
// Attempt to avoid multi-use src if we don't need anything from it.
- if (!DemandedBits.isAllOnesValue() || !DemandedSrcElts.isAllOnesValue()) {
+ if (!DemandedBits.isAllOnes() || !DemandedSrcElts.isAllOnes()) {
SDValue DemandedSrc = SimplifyMultipleUseDemandedBits(
Src, DemandedBits, DemandedSrcElts, TLO.DAG, Depth + 1);
if (DemandedSrc) {
@@ -1216,7 +1225,7 @@ bool TargetLowering::SimplifyDemandedBits(
assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
// Attempt to avoid multi-use ops if we don't need anything from them.
- if (!DemandedBits.isAllOnesValue() || !DemandedElts.isAllOnesValue()) {
+ if (!DemandedBits.isAllOnes() || !DemandedElts.isAllOnes()) {
SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
@@ -1263,7 +1272,7 @@ bool TargetLowering::SimplifyDemandedBits(
assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
// Attempt to avoid multi-use ops if we don't need anything from them.
- if (!DemandedBits.isAllOnesValue() || !DemandedElts.isAllOnesValue()) {
+ if (!DemandedBits.isAllOnes() || !DemandedElts.isAllOnes()) {
SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
@@ -1306,7 +1315,7 @@ bool TargetLowering::SimplifyDemandedBits(
assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
// Attempt to avoid multi-use ops if we don't need anything from them.
- if (!DemandedBits.isAllOnesValue() || !DemandedElts.isAllOnesValue()) {
+ if (!DemandedBits.isAllOnes() || !DemandedElts.isAllOnes()) {
SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
@@ -1351,8 +1360,7 @@ bool TargetLowering::SimplifyDemandedBits(
// If the RHS is a constant, see if we can change it. Don't alter a -1
// constant because that's a 'not' op, and that is better for combining
// and codegen.
- if (!C->isAllOnesValue() &&
- DemandedBits.isSubsetOf(C->getAPIntValue())) {
+ if (!C->isAllOnes() && DemandedBits.isSubsetOf(C->getAPIntValue())) {
// We're flipping all demanded bits. Flip the undemanded bits too.
SDValue New = TLO.DAG.getNOT(dl, Op0, VT);
return TLO.CombineTo(Op, New);
@@ -1360,7 +1368,7 @@ bool TargetLowering::SimplifyDemandedBits(
}
// If we can't turn this into a 'not', try to shrink the constant.
- if (!C || !C->isAllOnesValue())
+ if (!C || !C->isAllOnes())
if (ShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO))
return true;
@@ -1605,7 +1613,7 @@ bool TargetLowering::SimplifyDemandedBits(
// always convert this into a logical shr, even if the shift amount is
// variable. The low bit of the shift cannot be an input sign bit unless
// the shift amount is >= the size of the datatype, which is undefined.
- if (DemandedBits.isOneValue())
+ if (DemandedBits.isOne())
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1));
if (const APInt *SA =
@@ -1655,7 +1663,7 @@ bool TargetLowering::SimplifyDemandedBits(
Known.One.setHighBits(ShAmt);
// Attempt to avoid multi-use ops if we don't need anything from them.
- if (!InDemandedMask.isAllOnesValue() || !DemandedElts.isAllOnesValue()) {
+ if (!InDemandedMask.isAllOnes() || !DemandedElts.isAllOnes()) {
SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
Op0, InDemandedMask, DemandedElts, TLO.DAG, Depth + 1);
if (DemandedOp0) {
@@ -1781,7 +1789,7 @@ bool TargetLowering::SimplifyDemandedBits(
// If only 1 bit is demanded, replace with PARITY as long as we're before
// op legalization.
// FIXME: Limit to scalars for now.
- if (DemandedBits.isOneValue() && !TLO.LegalOps && !VT.isVector())
+ if (DemandedBits.isOne() && !TLO.LegalOps && !VT.isVector())
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::PARITY, dl, VT,
Op.getOperand(0)));
@@ -1795,9 +1803,9 @@ bool TargetLowering::SimplifyDemandedBits(
// If we only care about the highest bit, don't bother shifting right.
if (DemandedBits.isSignMask()) {
- unsigned NumSignBits =
- TLO.DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
- bool AlreadySignExtended = NumSignBits >= BitWidth - ExVTBits + 1;
+ unsigned MinSignedBits =
+ TLO.DAG.ComputeMinSignedBits(Op0, DemandedElts, Depth + 1);
+ bool AlreadySignExtended = ExVTBits >= MinSignedBits;
// However if the input is already sign extended we expect the sign
// extension to be dropped altogether later and do not simplify.
if (!AlreadySignExtended) {
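A small stand-alone illustration of the test above, using an 8-bit value as a stand-in for the in-register type: once the extension width is at least the minimum number of signed bits the value needs, SIGN_EXTEND_INREG is a no-op:

    #include <cassert>
    #include <cstdint>

    // Sign-extend the low `Bits` bits of V, mimicking SIGN_EXTEND_INREG on i8.
    static int8_t signExtendInReg(int8_t V, unsigned Bits) {
      unsigned Shift = 8 - Bits;
      return static_cast<int8_t>(static_cast<uint8_t>(V) << Shift) >> Shift;
    }

    int main() {
      const int8_t X = -6; // 0b11111010: needs 4 signed bits
      for (unsigned Width = 4; Width <= 8; ++Width)
        assert(signExtendInReg(X, Width) == X); // no-op once Width >= 4
    }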
@@ -2071,7 +2079,7 @@ bool TargetLowering::SimplifyDemandedBits(
// Demand the bits from every vector element without a constant index.
unsigned NumSrcElts = SrcEltCnt.getFixedValue();
- APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts);
+ APInt DemandedSrcElts = APInt::getAllOnes(NumSrcElts);
if (auto *CIdx = dyn_cast<ConstantSDNode>(Idx))
if (CIdx->getAPIntValue().ult(NumSrcElts))
DemandedSrcElts = APInt::getOneBitSet(NumSrcElts, CIdx->getZExtValue());
@@ -2087,8 +2095,7 @@ bool TargetLowering::SimplifyDemandedBits(
return true;
// Attempt to avoid multi-use ops if we don't need anything from them.
- if (!DemandedSrcBits.isAllOnesValue() ||
- !DemandedSrcElts.isAllOnesValue()) {
+ if (!DemandedSrcBits.isAllOnes() || !DemandedSrcElts.isAllOnes()) {
if (SDValue DemandedSrc = SimplifyMultipleUseDemandedBits(
Src, DemandedSrcBits, DemandedSrcElts, TLO.DAG, Depth + 1)) {
SDValue NewOp =
@@ -2138,12 +2145,12 @@ bool TargetLowering::SimplifyDemandedBits(
TLO.DAG.getDataLayout().isLittleEndian()) {
unsigned Scale = BitWidth / NumSrcEltBits;
unsigned NumSrcElts = SrcVT.getVectorNumElements();
- APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits);
- APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts);
+ APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits);
+ APInt DemandedSrcElts = APInt::getZero(NumSrcElts);
for (unsigned i = 0; i != Scale; ++i) {
unsigned Offset = i * NumSrcEltBits;
APInt Sub = DemandedBits.extractBits(NumSrcEltBits, Offset);
- if (!Sub.isNullValue()) {
+ if (!Sub.isZero()) {
DemandedSrcBits |= Sub;
for (unsigned j = 0; j != NumElts; ++j)
if (DemandedElts[j])
@@ -2164,8 +2171,8 @@ bool TargetLowering::SimplifyDemandedBits(
TLO.DAG.getDataLayout().isLittleEndian()) {
unsigned Scale = NumSrcEltBits / BitWidth;
unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
- APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits);
- APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts);
+ APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits);
+ APInt DemandedSrcElts = APInt::getZero(NumSrcElts);
for (unsigned i = 0; i != NumElts; ++i)
if (DemandedElts[i]) {
unsigned Offset = (i % Scale) * BitWidth;
@@ -2222,7 +2229,7 @@ bool TargetLowering::SimplifyDemandedBits(
}
// Attempt to avoid multi-use ops if we don't need anything from them.
- if (!LoMask.isAllOnesValue() || !DemandedElts.isAllOnesValue()) {
+ if (!LoMask.isAllOnes() || !DemandedElts.isAllOnes()) {
SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
Op0, LoMask, DemandedElts, TLO.DAG, Depth + 1);
SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
@@ -2245,8 +2252,8 @@ bool TargetLowering::SimplifyDemandedBits(
// is probably not useful (and could be detrimental).
ConstantSDNode *C = isConstOrConstSplat(Op1);
APInt HighMask = APInt::getHighBitsSet(BitWidth, DemandedBitsLZ);
- if (C && !C->isAllOnesValue() && !C->isOne() &&
- (C->getAPIntValue() | HighMask).isAllOnesValue()) {
+ if (C && !C->isAllOnes() && !C->isOne() &&
+ (C->getAPIntValue() | HighMask).isAllOnes()) {
SDValue Neg1 = TLO.DAG.getAllOnesConstant(dl, VT);
// Disable the nsw and nuw flags. We can no longer guarantee that we
// won't wrap after simplification.
@@ -2344,7 +2351,7 @@ static APInt getKnownUndefForVectorBinop(SDValue BO, SelectionDAG &DAG,
return SDValue();
};
- APInt KnownUndef = APInt::getNullValue(NumElts);
+ APInt KnownUndef = APInt::getZero(NumElts);
for (unsigned i = 0; i != NumElts; ++i) {
// If both inputs for this element are either constant or undef and match
// the element type, compute the constant/undef result for this element of
@@ -2371,7 +2378,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
unsigned NumElts = DemandedElts.getBitWidth();
assert(VT.isVector() && "Expected vector op");
- KnownUndef = KnownZero = APInt::getNullValue(NumElts);
+ KnownUndef = KnownZero = APInt::getZero(NumElts);
// TODO: For now we assume we know nothing about scalable vectors.
if (VT.isScalableVector())
@@ -2463,17 +2470,13 @@ bool TargetLowering::SimplifyDemandedVectorElts(
return SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef,
KnownZero, TLO, Depth + 1);
- APInt SrcZero, SrcUndef;
- APInt SrcDemandedElts = APInt::getNullValue(NumSrcElts);
+ APInt SrcDemandedElts, SrcZero, SrcUndef;
// Bitcast from 'large element' src vector to 'small element' vector, we
// must demand a source element if any DemandedElt maps to it.
if ((NumElts % NumSrcElts) == 0) {
unsigned Scale = NumElts / NumSrcElts;
- for (unsigned i = 0; i != NumElts; ++i)
- if (DemandedElts[i])
- SrcDemandedElts.setBit(i / Scale);
-
+ SrcDemandedElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero,
TLO, Depth + 1))
return true;
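APIntOps::ScaleBitMask replaces the two hand-written scaling loops in this function. A self-contained sketch of the assumed behaviour on fixed widths (narrowing OR-reduces each group of bits, widening replicates each bit), using std::bitset as a stand-in for APInt:

    #include <bitset>
    #include <cassert>
    #include <cstddef>

    template <std::size_t From, std::size_t To>
    std::bitset<To> scaleBitMask(const std::bitset<From> &In) {
      std::bitset<To> Out;
      if (From >= To) {
        std::size_t Scale = From / To; // narrowing: OR-reduce each group
        for (std::size_t I = 0; I < From; ++I)
          if (In[I])
            Out.set(I / Scale);
      } else {
        std::size_t Scale = To / From; // widening: replicate each bit
        for (std::size_t I = 0; I < From; ++I)
          if (In[I])
            for (std::size_t J = 0; J < Scale; ++J)
              Out.set(I * Scale + J);
      }
      return Out;
    }

    int main() {
      // <8 x i16> viewed as <4 x i32>: demanding i16 elements {1,6}
      // demands i32 elements {0,3}.
      std::bitset<8> Demanded;
      Demanded.set(1);
      Demanded.set(6);
      assert((scaleBitMask<8, 4>(Demanded) == std::bitset<4>("1001")));
    }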
@@ -2483,7 +2486,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// TODO - bigendian once we have test coverage.
if (TLO.DAG.getDataLayout().isLittleEndian()) {
unsigned SrcEltSizeInBits = SrcVT.getScalarSizeInBits();
- APInt SrcDemandedBits = APInt::getNullValue(SrcEltSizeInBits);
+ APInt SrcDemandedBits = APInt::getZero(SrcEltSizeInBits);
for (unsigned i = 0; i != NumElts; ++i)
if (DemandedElts[i]) {
unsigned Ofs = (i % Scale) * EltSizeInBits;
@@ -2513,10 +2516,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// of this vector.
if ((NumSrcElts % NumElts) == 0) {
unsigned Scale = NumSrcElts / NumElts;
- for (unsigned i = 0; i != NumElts; ++i)
- if (DemandedElts[i])
- SrcDemandedElts.setBits(i * Scale, (i + 1) * Scale);
-
+ SrcDemandedElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero,
TLO, Depth + 1))
return true;
@@ -2525,9 +2525,9 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// the output element will be as well, assuming it was demanded.
for (unsigned i = 0; i != NumElts; ++i) {
if (DemandedElts[i]) {
- if (SrcZero.extractBits(Scale, i * Scale).isAllOnesValue())
+ if (SrcZero.extractBits(Scale, i * Scale).isAllOnes())
KnownZero.setBit(i);
- if (SrcUndef.extractBits(Scale, i * Scale).isAllOnesValue())
+ if (SrcUndef.extractBits(Scale, i * Scale).isAllOnes())
KnownUndef.setBit(i);
}
}
@@ -2536,7 +2536,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
}
case ISD::BUILD_VECTOR: {
// Check all elements and simplify any unused elements with UNDEF.
- if (!DemandedElts.isAllOnesValue()) {
+ if (!DemandedElts.isAllOnes()) {
// Don't simplify BROADCASTS.
if (llvm::any_of(Op->op_values(),
[&](SDValue Elt) { return Op.getOperand(0) != Elt; })) {
@@ -2589,7 +2589,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
APInt DemandedSrcElts = DemandedElts;
- DemandedSrcElts.insertBits(APInt::getNullValue(NumSubElts), Idx);
+ DemandedSrcElts.insertBits(APInt::getZero(NumSubElts), Idx);
APInt SubUndef, SubZero;
if (SimplifyDemandedVectorElts(Sub, DemandedSubElts, SubUndef, SubZero, TLO,
@@ -2609,8 +2609,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
KnownZero.insertBits(SubZero, Idx);
// Attempt to avoid multi-use ops if we don't need anything from them.
- if (!DemandedSrcElts.isAllOnesValue() ||
- !DemandedSubElts.isAllOnesValue()) {
+ if (!DemandedSrcElts.isAllOnes() || !DemandedSubElts.isAllOnes()) {
SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
Src, DemandedSrcElts, TLO.DAG, Depth + 1);
SDValue NewSub = SimplifyMultipleUseDemandedVectorElts(
@@ -2642,7 +2641,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
KnownZero = SrcZero.extractBits(NumElts, Idx);
// Attempt to avoid multi-use ops if we don't need anything from them.
- if (!DemandedElts.isAllOnesValue()) {
+ if (!DemandedElts.isAllOnes()) {
SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
Src, DemandedSrcElts, TLO.DAG, Depth + 1);
if (NewSrc) {
@@ -2810,6 +2809,25 @@ bool TargetLowering::SimplifyDemandedVectorElts(
if (DemandedElts.isSubsetOf(KnownUndef))
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
KnownUndef.clearAllBits();
+
+ // zext - if we just need the bottom element then we can mask:
+ // zext(and(x,c)) -> and(x,c') iff the zext is the only user of the and.
+ if (DemandedSrcElts == 1 && TLO.DAG.getDataLayout().isLittleEndian() &&
+ Src.getOpcode() == ISD::AND && Op->isOnlyUserOf(Src.getNode()) &&
+ Op.getValueSizeInBits() == Src.getValueSizeInBits()) {
+ SDLoc DL(Op);
+ EVT SrcVT = Src.getValueType();
+ EVT SrcSVT = SrcVT.getScalarType();
+ SmallVector<SDValue> MaskElts;
+ MaskElts.push_back(TLO.DAG.getAllOnesConstant(DL, SrcSVT));
+ MaskElts.append(NumSrcElts - 1, TLO.DAG.getConstant(0, DL, SrcSVT));
+ SDValue Mask = TLO.DAG.getBuildVector(SrcVT, DL, MaskElts);
+ if (SDValue Fold = TLO.DAG.FoldConstantArithmetic(
+ ISD::AND, DL, SrcVT, {Src.getOperand(1), Mask})) {
+ Fold = TLO.DAG.getNode(ISD::AND, DL, SrcVT, Src.getOperand(0), Fold);
+ return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Fold));
+ }
+ }
}
break;
}
@@ -2842,7 +2860,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// Attempt to avoid multi-use ops if we don't need anything from them.
// TODO - use KnownUndef to relax the demandedelts?
- if (!DemandedElts.isAllOnesValue())
+ if (!DemandedElts.isAllOnes())
if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
return true;
break;
@@ -2869,7 +2887,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// Attempt to avoid multi-use ops if we don't need anything from them.
// TODO - use KnownUndef to relax the demandedelts?
- if (!DemandedElts.isAllOnesValue())
+ if (!DemandedElts.isAllOnes())
if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
return true;
break;
@@ -2897,7 +2915,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// Attempt to avoid multi-use ops if we don't need anything from them.
// TODO - use KnownUndef to relax the demandedelts?
- if (!DemandedElts.isAllOnesValue())
+ if (!DemandedElts.isAllOnes())
if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
return true;
break;
@@ -2923,7 +2941,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
return true;
} else {
KnownBits Known;
- APInt DemandedBits = APInt::getAllOnesValue(EltSizeInBits);
+ APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
if (SimplifyDemandedBits(Op, DemandedBits, OriginalDemandedElts, Known,
TLO, Depth, AssumeSingleUse))
return true;
@@ -3111,9 +3129,9 @@ bool TargetLowering::isConstTrueVal(const SDNode *N) const {
case UndefinedBooleanContent:
return CVal[0];
case ZeroOrOneBooleanContent:
- return CVal.isOneValue();
+ return CVal.isOne();
case ZeroOrNegativeOneBooleanContent:
- return CVal.isAllOnesValue();
+ return CVal.isAllOnes();
}
llvm_unreachable("Invalid boolean contents");
@@ -3140,7 +3158,7 @@ bool TargetLowering::isConstFalseVal(const SDNode *N) const {
if (getBooleanContents(N->getValueType(0)) == UndefinedBooleanContent)
return !CN->getAPIntValue()[0];
- return CN->isNullValue();
+ return CN->isZero();
}
bool TargetLowering::isExtendedTrueVal(const ConstantSDNode *N, EVT VT,
@@ -3156,7 +3174,7 @@ bool TargetLowering::isExtendedTrueVal(const ConstantSDNode *N, EVT VT,
return (N->isOne() && !SExt) || (SExt && (N->getValueType(0) != MVT::i1));
case TargetLowering::UndefinedBooleanContent:
case TargetLowering::ZeroOrNegativeOneBooleanContent:
- return N->isAllOnesValue() && SExt;
+ return N->isAllOnes() && SExt;
}
llvm_unreachable("Unexpected enumeration.");
}
@@ -3210,7 +3228,7 @@ SDValue TargetLowering::foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1,
// Bail out if the compare operand that we want to turn into a zero is
// already a zero (otherwise, infinite loop).
auto *YConst = dyn_cast<ConstantSDNode>(Y);
- if (YConst && YConst->isNullValue())
+ if (YConst && YConst->isZero())
return SDValue();
// Transform this into: ~X & Y == 0.
@@ -3325,7 +3343,7 @@ SDValue TargetLowering::optimizeSetCCByHoistingAndByConstFromLogicalShift(
EVT SCCVT, SDValue N0, SDValue N1C, ISD::CondCode Cond,
DAGCombinerInfo &DCI, const SDLoc &DL) const {
assert(isConstOrConstSplat(N1C) &&
- isConstOrConstSplat(N1C)->getAPIntValue().isNullValue() &&
+ isConstOrConstSplat(N1C)->getAPIntValue().isZero() &&
"Should be a comparison with 0.");
assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
"Valid only for [in]equality comparisons.");
@@ -3548,7 +3566,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
// If the LHS is '(srl (ctlz x), 5)', the RHS is 0/1, and this is an
// equality comparison, then we're just comparing whether X itself is
// zero.
- if (N0.getOpcode() == ISD::SRL && (C1.isNullValue() || C1.isOneValue()) &&
+ if (N0.getOpcode() == ISD::SRL && (C1.isZero() || C1.isOne()) &&
N0.getOperand(0).getOpcode() == ISD::CTLZ &&
isPowerOf2_32(N0.getScalarValueSizeInBits())) {
if (ConstantSDNode *ShAmt = isConstOrConstSplat(N0.getOperand(1))) {
@@ -3648,8 +3666,8 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
(isConstFalseVal(N1C) ||
isExtendedTrueVal(N1C, N0->getValueType(0), SExt))) {
- bool Inverse = (N1C->isNullValue() && Cond == ISD::SETEQ) ||
- (!N1C->isNullValue() && Cond == ISD::SETNE);
+ bool Inverse = (N1C->isZero() && Cond == ISD::SETEQ) ||
+ (!N1C->isZero() && Cond == ISD::SETNE);
if (!Inverse)
return TopSetCC;
@@ -3800,8 +3818,8 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
// Otherwise, make this a use of a zext.
return DAG.getSetCC(dl, VT, ZextOp,
DAG.getConstant(C1 & Imm, dl, ExtDstTy), Cond);
- } else if ((N1C->isNullValue() || N1C->isOne()) &&
- (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+ } else if ((N1C->isZero() || N1C->isOne()) &&
+ (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
// SETCC (SETCC), [0|1], [EQ|NE] -> SETCC
if (N0.getOpcode() == ISD::SETCC &&
isTypeLegal(VT) && VT.bitsLE(N0.getValueType()) &&
@@ -3894,7 +3912,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
// icmp eq/ne (urem %x, %y), 0
// Iff %x has 0 or 1 bits set, and %y has at least 2 bits set, omit 'urem':
// icmp eq/ne %x, 0
- if (N0.getOpcode() == ISD::UREM && N1C->isNullValue() &&
+ if (N0.getOpcode() == ISD::UREM && N1C->isZero() &&
(Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
KnownBits XKnown = DAG.computeKnownBits(N0.getOperand(0));
KnownBits YKnown = DAG.computeKnownBits(N0.getOperand(1));
@@ -3902,6 +3920,17 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
return DAG.getSetCC(dl, VT, N0.getOperand(0), N1, Cond);
}
+ // Fold set_cc seteq (ashr X, BW-1), -1 -> set_cc setlt X, 0
+ // and set_cc setne (ashr X, BW-1), -1 -> set_cc setge X, 0
+ if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ N0.getOpcode() == ISD::SRA && isa<ConstantSDNode>(N0.getOperand(1)) &&
+ N0.getConstantOperandAPInt(1) == OpVT.getScalarSizeInBits() - 1 &&
+ N1C && N1C->isAllOnes()) {
+ return DAG.getSetCC(dl, VT, N0.getOperand(0),
+ DAG.getConstant(0, dl, OpVT),
+ Cond == ISD::SETEQ ? ISD::SETLT : ISD::SETGE);
+ }
+
if (SDValue V =
optimizeSetCCOfSignedTruncationCheck(VT, N0, N1, Cond, DCI, dl))
return V;
@@ -4001,7 +4030,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
if (Cond == ISD::SETEQ || Cond == ISD::SETNE) {
// (X & (C l>>/<< Y)) ==/!= 0 --> ((X <</l>> Y) & C) ==/!= 0
- if (C1.isNullValue())
+ if (C1.isZero())
if (SDValue CC = optimizeSetCCByHoistingAndByConstFromLogicalShift(
VT, N0, N1, Cond, DCI, dl))
return CC;
@@ -4010,8 +4039,8 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
// For example, when high 32-bits of i64 X are known clear:
// all bits clear: (X | (Y<<32)) == 0 --> (X | Y) == 0
// all bits set: (X | (Y<<32)) == -1 --> (X & Y) == -1
- bool CmpZero = N1C->getAPIntValue().isNullValue();
- bool CmpNegOne = N1C->getAPIntValue().isAllOnesValue();
+ bool CmpZero = N1C->getAPIntValue().isZero();
+ bool CmpNegOne = N1C->getAPIntValue().isAllOnes();
if ((CmpZero || CmpNegOne) && N0.hasOneUse()) {
// Match or(lo,shl(hi,bw/2)) pattern.
auto IsConcat = [&](SDValue V, SDValue &Lo, SDValue &Hi) {
@@ -4140,7 +4169,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
if (auto *AndRHS = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
const APInt &AndRHSC = AndRHS->getAPIntValue();
- if ((-AndRHSC).isPowerOf2() && (AndRHSC & C1) == C1) {
+ if (AndRHSC.isNegatedPowerOf2() && (AndRHSC & C1) == C1) {
unsigned ShiftBits = AndRHSC.countTrailingZeros();
if (!TLI.shouldAvoidTransformToShift(ShValTy, ShiftBits)) {
SDValue Shift =
@@ -4336,7 +4365,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
// When division is cheap or optimizing for minimum size,
// fall through to DIVREM creation by skipping this fold.
- if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttribute(Attribute::MinSize)) {
+ if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttr(Attribute::MinSize)) {
if (N0.getOpcode() == ISD::UREM) {
if (SDValue Folded = buildUREMEqFold(VT, N0, N1, Cond, DCI, dl))
return Folded;
@@ -4687,7 +4716,8 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
getSimpleValueType(DL, STy->getElementType(ResNo));
} else {
assert(ResNo == 0 && "Asm only has one result!");
- OpInfo.ConstraintVT = getSimpleValueType(DL, Call.getType());
+ OpInfo.ConstraintVT =
+ getAsmOperandValueType(DL, Call.getType()).getSimpleVT();
}
++ResNo;
break;
@@ -5049,7 +5079,7 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N,
SmallVector<SDValue, 16> Shifts, Factors;
auto BuildSDIVPattern = [&](ConstantSDNode *C) {
- if (C->isNullValue())
+ if (C->isZero())
return false;
APInt Divisor = C->getAPIntValue();
unsigned Shift = Divisor.countTrailingZeros();
@@ -5151,31 +5181,31 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
SmallVector<SDValue, 16> MagicFactors, Factors, Shifts, ShiftMasks;
auto BuildSDIVPattern = [&](ConstantSDNode *C) {
- if (C->isNullValue())
+ if (C->isZero())
return false;
const APInt &Divisor = C->getAPIntValue();
- APInt::ms magics = Divisor.magic();
+ SignedDivisionByConstantInfo magics = SignedDivisionByConstantInfo::get(Divisor);
int NumeratorFactor = 0;
int ShiftMask = -1;
- if (Divisor.isOneValue() || Divisor.isAllOnesValue()) {
+ if (Divisor.isOne() || Divisor.isAllOnes()) {
// If d is +1/-1, we just multiply the numerator by +1/-1.
NumeratorFactor = Divisor.getSExtValue();
- magics.m = 0;
- magics.s = 0;
+ magics.Magic = 0;
+ magics.ShiftAmount = 0;
ShiftMask = 0;
- } else if (Divisor.isStrictlyPositive() && magics.m.isNegative()) {
+ } else if (Divisor.isStrictlyPositive() && magics.Magic.isNegative()) {
// If d > 0 and m < 0, add the numerator.
NumeratorFactor = 1;
- } else if (Divisor.isNegative() && magics.m.isStrictlyPositive()) {
+ } else if (Divisor.isNegative() && magics.Magic.isStrictlyPositive()) {
// If d < 0 and m > 0, subtract the numerator.
NumeratorFactor = -1;
}
- MagicFactors.push_back(DAG.getConstant(magics.m, dl, SVT));
+ MagicFactors.push_back(DAG.getConstant(magics.Magic, dl, SVT));
Factors.push_back(DAG.getConstant(NumeratorFactor, dl, SVT));
- Shifts.push_back(DAG.getConstant(magics.s, dl, ShSVT));
+ Shifts.push_back(DAG.getConstant(magics.ShiftAmount, dl, ShSVT));
ShiftMasks.push_back(DAG.getConstant(ShiftMask, dl, SVT));
return true;
};
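For orientation, a hedged scalar mirror of the node sequence BuildSDIV emits from these per-lane constants (illustrative C++ only; the helper name is invented, Magic/Factor/Shift/Mask stand for the values gathered by the lambda above, and INT_MIN signed-wrap subtleties that the DAG handles modulo 2^BW are ignored):

#include <cstdint>
int32_t sdivByConstSketch(int32_t X, int32_t Magic, int32_t Factor,
                          unsigned Shift, int32_t Mask) {
  int32_t Q = (int32_t)(((int64_t)X * Magic) >> 32); // MULHS by the magic value
  Q += X * Factor;                                   // optionally add/subtract the numerator
  Q >>= Shift;                                       // arithmetic shift by ShiftAmount
  int32_t T = (int32_t)((uint32_t)Q >> 31) & Mask;   // extract the sign bit, masked
  return Q + T;                                      // round the quotient toward zero
}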
@@ -5296,33 +5326,33 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
auto BuildUDIVPattern = [&](ConstantSDNode *C) {
- if (C->isNullValue())
+ if (C->isZero())
return false;
// FIXME: We should use a narrower constant when the upper
// bits are known to be zero.
const APInt& Divisor = C->getAPIntValue();
- APInt::mu magics = Divisor.magicu();
+ UnsignedDivisonByConstantInfo magics = UnsignedDivisonByConstantInfo::get(Divisor);
unsigned PreShift = 0, PostShift = 0;
// If the divisor is even, we can avoid using the expensive fixup by
// shifting the divided value upfront.
- if (magics.a != 0 && !Divisor[0]) {
+ if (magics.IsAdd != 0 && !Divisor[0]) {
PreShift = Divisor.countTrailingZeros();
// Get magic number for the shifted divisor.
- magics = Divisor.lshr(PreShift).magicu(PreShift);
- assert(magics.a == 0 && "Should use cheap fixup now");
+ magics = UnsignedDivisonByConstantInfo::get(Divisor.lshr(PreShift), PreShift);
+ assert(magics.IsAdd == 0 && "Should use cheap fixup now");
}
- APInt Magic = magics.m;
+ APInt Magic = magics.Magic;
unsigned SelNPQ;
- if (magics.a == 0 || Divisor.isOneValue()) {
- assert(magics.s < Divisor.getBitWidth() &&
+ if (magics.IsAdd == 0 || Divisor.isOne()) {
+ assert(magics.ShiftAmount < Divisor.getBitWidth() &&
"We shouldn't generate an undefined shift!");
- PostShift = magics.s;
+ PostShift = magics.ShiftAmount;
SelNPQ = false;
} else {
- PostShift = magics.s - 1;
+ PostShift = magics.ShiftAmount - 1;
SelNPQ = true;
}
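Similarly, a hedged scalar mirror of the shape BuildUDIV produces from PreShift/Magic/PostShift and the NPQ fixup (illustrative C++ only; the helper name is invented, and the multiply by 2^(BW-1) used for the NPQ factor is written as a plain shift):

#include <cstdint>
uint32_t udivByConstSketch(uint32_t X, uint32_t Magic, unsigned PreShift,
                           unsigned PostShift, bool UseNPQ) {
  uint32_t Q = (uint32_t)(((uint64_t)(X >> PreShift) * Magic) >> 32); // MULHU
  if (UseNPQ)                    // expensive fixup when the magic "add" bit is set
    Q = ((X - Q) >> 1) + Q;      // NPQ = (n - q) / 2 + q
  return Q >> PostShift;
}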
@@ -5330,7 +5360,7 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
MagicFactors.push_back(DAG.getConstant(Magic, dl, SVT));
NPQFactors.push_back(
DAG.getConstant(SelNPQ ? APInt::getOneBitSet(EltBits, EltBits - 1)
- : APInt::getNullValue(EltBits),
+ : APInt::getZero(EltBits),
dl, SVT));
PostShifts.push_back(DAG.getConstant(PostShift, dl, ShSVT));
UseNPQ |= SelNPQ;
@@ -5510,13 +5540,13 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
auto BuildUREMPattern = [&](ConstantSDNode *CDiv, ConstantSDNode *CCmp) {
// Division by 0 is UB. Leave it to be constant-folded elsewhere.
- if (CDiv->isNullValue())
+ if (CDiv->isZero())
return false;
const APInt &D = CDiv->getAPIntValue();
const APInt &Cmp = CCmp->getAPIntValue();
- ComparingWithAllZeros &= Cmp.isNullValue();
+ ComparingWithAllZeros &= Cmp.isZero();
// `x u% C1` is *always* less than C1. So given `x u% C1 == C2`,
// if C2 is not less than C1, the comparison is always false.
@@ -5528,26 +5558,26 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
// If all lanes are tautological (either all divisors are ones, or divisor
// is not greater than the constant we are comparing with),
// we will prefer to avoid the fold.
- bool TautologicalLane = D.isOneValue() || TautologicalInvertedLane;
+ bool TautologicalLane = D.isOne() || TautologicalInvertedLane;
HadTautologicalLanes |= TautologicalLane;
AllLanesAreTautological &= TautologicalLane;
// If we are comparing with non-zero, we'll need to subtract said
// comparison value from the LHS. But there is no point in doing that if
// every lane where we are comparing with non-zero is tautological.
- if (!Cmp.isNullValue())
+ if (!Cmp.isZero())
AllComparisonsWithNonZerosAreTautological &= TautologicalLane;
// Decompose D into D0 * 2^K
unsigned K = D.countTrailingZeros();
- assert((!D.isOneValue() || (K == 0)) && "For divisor '1' we won't rotate.");
+ assert((!D.isOne() || (K == 0)) && "For divisor '1' we won't rotate.");
APInt D0 = D.lshr(K);
// D is even if it has trailing zeros.
HadEvenDivisor |= (K != 0);
// D is a power-of-two if D0 is one.
// If all divisors are power-of-two, we will prefer to avoid the fold.
- AllDivisorsArePowerOfTwo &= D0.isOneValue();
+ AllDivisorsArePowerOfTwo &= D0.isOne();
// P = inv(D0, 2^W)
// 2^W requires W + 1 bits, so we have to extend and then truncate.
@@ -5555,20 +5585,20 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
APInt P = D0.zext(W + 1)
.multiplicativeInverse(APInt::getSignedMinValue(W + 1))
.trunc(W);
- assert(!P.isNullValue() && "No multiplicative inverse!"); // unreachable
- assert((D0 * P).isOneValue() && "Multiplicative inverse sanity check.");
+ assert(!P.isZero() && "No multiplicative inverse!"); // unreachable
+ assert((D0 * P).isOne() && "Multiplicative inverse sanity check.");
// Q = floor((2^W - 1) u/ D)
// R = ((2^W - 1) u% D)
APInt Q, R;
- APInt::udivrem(APInt::getAllOnesValue(W), D, Q, R);
+ APInt::udivrem(APInt::getAllOnes(W), D, Q, R);
// If we are comparing with zero, then that comparison constant is okay,
// else it may need to be one less than that.
if (Cmp.ugt(R))
Q -= 1;
- assert(APInt::getAllOnesValue(ShSVT.getSizeInBits()).ugt(K) &&
+ assert(APInt::getAllOnes(ShSVT.getSizeInBits()).ugt(K) &&
"We are expecting that K is always less than all-ones for ShSVT");
// If the lane is tautological the result can be constant-folded.
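The comments above compress the rotate-based divisibility test; a small self-checking illustration at W = 8 for D = 6 = 3 * 2^1 (illustrative C++ only, names invented; P = 171 is the inverse of 3 mod 256 and Q = floor(255 / 6) = 42):

#include <cassert>
#include <cstdint>
static uint8_t rotr8(uint8_t V, unsigned K) {
  return K ? (uint8_t)((V >> K) | (V << (8 - K))) : V;
}
int main() {
  const unsigned K = 1;
  const uint8_t P = 171, Q = 42;                // inv(3) mod 2^8, (2^8 - 1) / 6
  for (unsigned X = 0; X < 256; ++X)            // x % 6 == 0  <=>  rotr(x * P, K) <= Q
    assert(((X % 6) == 0) == (rotr8((uint8_t)(X * P), K) <= Q));
  return 0;
}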
@@ -5751,7 +5781,7 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
// TODO: Could support comparing with non-zero too.
ConstantSDNode *CompTarget = isConstOrConstSplat(CompTargetNode);
- if (!CompTarget || !CompTarget->isNullValue())
+ if (!CompTarget || !CompTarget->isZero())
return SDValue();
bool HadIntMinDivisor = false;
@@ -5764,7 +5794,7 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
auto BuildSREMPattern = [&](ConstantSDNode *C) {
// Division by 0 is UB. Leave it to be constant-folded elsewhere.
- if (C->isNullValue())
+ if (C->isZero())
return false;
// FIXME: we don't fold `rem %X, -C` to `rem %X, C` in DAGCombine.
@@ -5777,12 +5807,12 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
HadIntMinDivisor |= D.isMinSignedValue();
// If all divisors are ones, we will prefer to avoid the fold.
- HadOneDivisor |= D.isOneValue();
- AllDivisorsAreOnes &= D.isOneValue();
+ HadOneDivisor |= D.isOne();
+ AllDivisorsAreOnes &= D.isOne();
// Decompose D into D0 * 2^K
unsigned K = D.countTrailingZeros();
- assert((!D.isOneValue() || (K == 0)) && "For divisor '1' we won't rotate.");
+ assert((!D.isOne() || (K == 0)) && "For divisor '1' we won't rotate.");
APInt D0 = D.lshr(K);
if (!D.isMinSignedValue()) {
@@ -5793,7 +5823,7 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
// D is a power-of-two if D0 is one. This includes INT_MIN.
// If all divisors are power-of-two, we will prefer to avoid the fold.
- AllDivisorsArePowerOfTwo &= D0.isOneValue();
+ AllDivisorsArePowerOfTwo &= D0.isOne();
// P = inv(D0, 2^W)
// 2^W requires W + 1 bits, so we have to extend and then truncate.
@@ -5801,8 +5831,8 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
APInt P = D0.zext(W + 1)
.multiplicativeInverse(APInt::getSignedMinValue(W + 1))
.trunc(W);
- assert(!P.isNullValue() && "No multiplicative inverse!"); // unreachable
- assert((D0 * P).isOneValue() && "Multiplicative inverse sanity check.");
+ assert(!P.isZero() && "No multiplicative inverse!"); // unreachable
+ assert((D0 * P).isOne() && "Multiplicative inverse sanity check.");
// A = floor((2^(W - 1) - 1) / D0) & -2^K
APInt A = APInt::getSignedMaxValue(W).udiv(D0);
@@ -5817,14 +5847,14 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
// Q = floor((2 * A) / (2^K))
APInt Q = (2 * A).udiv(APInt::getOneBitSet(W, K));
- assert(APInt::getAllOnesValue(SVT.getSizeInBits()).ugt(A) &&
+ assert(APInt::getAllOnes(SVT.getSizeInBits()).ugt(A) &&
"We are expecting that A is always less than all-ones for SVT");
- assert(APInt::getAllOnesValue(ShSVT.getSizeInBits()).ugt(K) &&
+ assert(APInt::getAllOnes(ShSVT.getSizeInBits()).ugt(K) &&
"We are expecting that K is always less than all-ones for ShSVT");
// If the divisor is 1 the result can be constant-folded. Likewise, we
// don't care about INT_MIN lanes, those can be set to undef if appropriate.
- if (D.isOneValue()) {
+ if (D.isOne()) {
// Set P, A and K to bogus values so we can try to splat them.
P = 0;
A = -1;
@@ -5950,7 +5980,7 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
SDValue IntMax = DAG.getConstant(
APInt::getSignedMaxValue(SVT.getScalarSizeInBits()), DL, VT);
SDValue Zero =
- DAG.getConstant(APInt::getNullValue(SVT.getScalarSizeInBits()), DL, VT);
+ DAG.getConstant(APInt::getZero(SVT.getScalarSizeInBits()), DL, VT);
// Which lanes had INT_MIN divisors? Divisor is constant, so const-folded.
SDValue DivisorIsIntMin = DAG.getSetCC(DL, SETCCVT, D, IntMin, ISD::SETEQ);
@@ -6776,7 +6806,7 @@ bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result,
// the destination signmask can't be represented by the float, so we can
// just use FP_TO_SINT directly.
const fltSemantics &APFSem = DAG.EVTToAPFloatSemantics(SrcVT);
- APFloat APF(APFSem, APInt::getNullValue(SrcVT.getScalarSizeInBits()));
+ APFloat APF(APFSem, APInt::getZero(SrcVT.getScalarSizeInBits()));
APInt SignMask = APInt::getSignMask(DstVT.getScalarSizeInBits());
if (APFloat::opOverflow &
APF.convertFromAPInt(SignMask, false, APFloat::rmNearestTiesToEven)) {
@@ -6969,8 +6999,18 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
return SDValue();
}
-bool TargetLowering::expandCTPOP(SDNode *Node, SDValue &Result,
- SelectionDAG &DAG) const {
+// Only expand vector types if we have the appropriate vector bit operations.
+static bool canExpandVectorCTPOP(const TargetLowering &TLI, EVT VT) {
+ assert(VT.isVector() && "Expected vector type");
+ unsigned Len = VT.getScalarSizeInBits();
+ return TLI.isOperationLegalOrCustom(ISD::ADD, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::SUB, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::SRL, VT) &&
+ (Len == 8 || TLI.isOperationLegalOrCustom(ISD::MUL, VT)) &&
+ TLI.isOperationLegalOrCustomOrPromote(ISD::AND, VT);
+}
+
+SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
SDLoc dl(Node);
EVT VT = Node->getValueType(0);
EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
@@ -6980,15 +7020,11 @@ bool TargetLowering::expandCTPOP(SDNode *Node, SDValue &Result,
// TODO: Add support for irregular type lengths.
if (!(Len <= 128 && Len % 8 == 0))
- return false;
+ return SDValue();
// Only expand vector types if we have the appropriate vector bit operations.
- if (VT.isVector() && (!isOperationLegalOrCustom(ISD::ADD, VT) ||
- !isOperationLegalOrCustom(ISD::SUB, VT) ||
- !isOperationLegalOrCustom(ISD::SRL, VT) ||
- (Len != 8 && !isOperationLegalOrCustom(ISD::MUL, VT)) ||
- !isOperationLegalOrCustomOrPromote(ISD::AND, VT)))
- return false;
+ if (VT.isVector() && !canExpandVectorCTPOP(*this, VT))
+ return SDValue();
// This is the "best" algorithm from
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
@@ -7025,12 +7061,10 @@ bool TargetLowering::expandCTPOP(SDNode *Node, SDValue &Result,
DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::MUL, dl, VT, Op, Mask01),
DAG.getConstant(Len - 8, dl, ShVT));
- Result = Op;
- return true;
+ return Op;
}
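For reference, the scalar 32-bit form of the parallel bit-count this expansion builds (illustrative C++ only; the DAG version splats the same 0x55/0x33/0x0F/0x01 patterns to the element width):

#include <cstdint>
uint32_t ctpop32Sketch(uint32_t V) {
  V = V - ((V >> 1) & 0x55555555u);                 // 2-bit sums
  V = (V & 0x33333333u) + ((V >> 2) & 0x33333333u); // 4-bit sums
  V = (V + (V >> 4)) & 0x0F0F0F0Fu;                 // 8-bit sums
  return (V * 0x01010101u) >> 24;                   // add the byte sums together
}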
-bool TargetLowering::expandCTLZ(SDNode *Node, SDValue &Result,
- SelectionDAG &DAG) const {
+SDValue TargetLowering::expandCTLZ(SDNode *Node, SelectionDAG &DAG) const {
SDLoc dl(Node);
EVT VT = Node->getValueType(0);
EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
@@ -7039,10 +7073,8 @@ bool TargetLowering::expandCTLZ(SDNode *Node, SDValue &Result,
// If the non-ZERO_UNDEF version is supported we can use that instead.
if (Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF &&
- isOperationLegalOrCustom(ISD::CTLZ, VT)) {
- Result = DAG.getNode(ISD::CTLZ, dl, VT, Op);
- return true;
- }
+ isOperationLegalOrCustom(ISD::CTLZ, VT))
+ return DAG.getNode(ISD::CTLZ, dl, VT, Op);
// If the ZERO_UNDEF version is supported use that and handle the zero case.
if (isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) {
@@ -7051,17 +7083,18 @@ bool TargetLowering::expandCTLZ(SDNode *Node, SDValue &Result,
SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op);
SDValue Zero = DAG.getConstant(0, dl, VT);
SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
- Result = DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
+ return DAG.getSelect(dl, VT, SrcIsZero,
DAG.getConstant(NumBitsPerElt, dl, VT), CTLZ);
- return true;
}
// Only expand vector types if we have the appropriate vector bit operations.
+ // This includes the operations needed to expand CTPOP if it isn't supported.
if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) ||
- !isOperationLegalOrCustom(ISD::CTPOP, VT) ||
+ (!isOperationLegalOrCustom(ISD::CTPOP, VT) &&
+ !canExpandVectorCTPOP(*this, VT)) ||
!isOperationLegalOrCustom(ISD::SRL, VT) ||
!isOperationLegalOrCustomOrPromote(ISD::OR, VT)))
- return false;
+ return SDValue();
// for now, we do this:
// x = x | (x >> 1);
@@ -7078,12 +7111,10 @@ bool TargetLowering::expandCTLZ(SDNode *Node, SDValue &Result,
DAG.getNode(ISD::SRL, dl, VT, Op, Tmp));
}
Op = DAG.getNOT(dl, Op, VT);
- Result = DAG.getNode(ISD::CTPOP, dl, VT, Op);
- return true;
+ return DAG.getNode(ISD::CTPOP, dl, VT, Op);
}
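A scalar i32 sketch of the fallback path above (illustrative C++ only; __builtin_popcount stands in for the CTPOP node): smearing the leading set bit rightward and popcounting the inverted result yields the leading-zero count.

#include <cstdint>
uint32_t ctlz32Sketch(uint32_t X) {
  X |= X >> 1; X |= X >> 2; X |= X >> 4;
  X |= X >> 8; X |= X >> 16;          // every bit below the MSB is now set
  return __builtin_popcount(~X);      // leading zeros = set bits of the complement
}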
-bool TargetLowering::expandCTTZ(SDNode *Node, SDValue &Result,
- SelectionDAG &DAG) const {
+SDValue TargetLowering::expandCTTZ(SDNode *Node, SelectionDAG &DAG) const {
SDLoc dl(Node);
EVT VT = Node->getValueType(0);
SDValue Op = Node->getOperand(0);
@@ -7091,10 +7122,8 @@ bool TargetLowering::expandCTTZ(SDNode *Node, SDValue &Result,
// If the non-ZERO_UNDEF version is supported we can use that instead.
if (Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF &&
- isOperationLegalOrCustom(ISD::CTTZ, VT)) {
- Result = DAG.getNode(ISD::CTTZ, dl, VT, Op);
- return true;
- }
+ isOperationLegalOrCustom(ISD::CTTZ, VT))
+ return DAG.getNode(ISD::CTTZ, dl, VT, Op);
// If the ZERO_UNDEF version is supported use that and handle the zero case.
if (isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) {
@@ -7103,19 +7132,20 @@ bool TargetLowering::expandCTTZ(SDNode *Node, SDValue &Result,
SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op);
SDValue Zero = DAG.getConstant(0, dl, VT);
SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
- Result = DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
+ return DAG.getSelect(dl, VT, SrcIsZero,
DAG.getConstant(NumBitsPerElt, dl, VT), CTTZ);
- return true;
}
// Only expand vector types if we have the appropriate vector bit operations.
+ // This includes the operations needed to expand CTPOP if it isn't supported.
if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) ||
(!isOperationLegalOrCustom(ISD::CTPOP, VT) &&
- !isOperationLegalOrCustom(ISD::CTLZ, VT)) ||
+ !isOperationLegalOrCustom(ISD::CTLZ, VT) &&
+ !canExpandVectorCTPOP(*this, VT)) ||
!isOperationLegalOrCustom(ISD::SUB, VT) ||
!isOperationLegalOrCustomOrPromote(ISD::AND, VT) ||
!isOperationLegalOrCustomOrPromote(ISD::XOR, VT)))
- return false;
+ return SDValue();
// for now, we use: { return popcount(~x & (x - 1)); }
// unless the target has ctlz but not ctpop, in which case we use:
@@ -7127,18 +7157,15 @@ bool TargetLowering::expandCTTZ(SDNode *Node, SDValue &Result,
// If ISD::CTLZ is legal and CTPOP isn't, then do that instead.
if (isOperationLegal(ISD::CTLZ, VT) && !isOperationLegal(ISD::CTPOP, VT)) {
- Result =
- DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(NumBitsPerElt, dl, VT),
- DAG.getNode(ISD::CTLZ, dl, VT, Tmp));
- return true;
+ return DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(NumBitsPerElt, dl, VT),
+ DAG.getNode(ISD::CTLZ, dl, VT, Tmp));
}
- Result = DAG.getNode(ISD::CTPOP, dl, VT, Tmp);
- return true;
+ return DAG.getNode(ISD::CTPOP, dl, VT, Tmp);
}
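Likewise for the identity used here (illustrative C++ only): ~x & (x - 1) isolates exactly the bits strictly below the lowest set bit, so counting them gives the trailing-zero count.

#include <cstdint>
uint32_t cttz32Sketch(uint32_t X) {
  uint32_t Tmp = ~X & (X - 1);        // ones strictly below the lowest set bit
  return __builtin_popcount(Tmp);     // CTLZ-only path would compute 32 - ctlz(Tmp)
}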
-bool TargetLowering::expandABS(SDNode *N, SDValue &Result,
- SelectionDAG &DAG, bool IsNegative) const {
+SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG,
+ bool IsNegative) const {
SDLoc dl(N);
EVT VT = N->getValueType(0);
EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
@@ -7148,27 +7175,24 @@ bool TargetLowering::expandABS(SDNode *N, SDValue &Result,
if (!IsNegative && isOperationLegal(ISD::SUB, VT) &&
isOperationLegal(ISD::SMAX, VT)) {
SDValue Zero = DAG.getConstant(0, dl, VT);
- Result = DAG.getNode(ISD::SMAX, dl, VT, Op,
- DAG.getNode(ISD::SUB, dl, VT, Zero, Op));
- return true;
+ return DAG.getNode(ISD::SMAX, dl, VT, Op,
+ DAG.getNode(ISD::SUB, dl, VT, Zero, Op));
}
// abs(x) -> umin(x,sub(0,x))
if (!IsNegative && isOperationLegal(ISD::SUB, VT) &&
isOperationLegal(ISD::UMIN, VT)) {
SDValue Zero = DAG.getConstant(0, dl, VT);
- Result = DAG.getNode(ISD::UMIN, dl, VT, Op,
- DAG.getNode(ISD::SUB, dl, VT, Zero, Op));
- return true;
+ return DAG.getNode(ISD::UMIN, dl, VT, Op,
+ DAG.getNode(ISD::SUB, dl, VT, Zero, Op));
}
// 0 - abs(x) -> smin(x, sub(0,x))
if (IsNegative && isOperationLegal(ISD::SUB, VT) &&
isOperationLegal(ISD::SMIN, VT)) {
SDValue Zero = DAG.getConstant(0, dl, VT);
- Result = DAG.getNode(ISD::SMIN, dl, VT, Op,
- DAG.getNode(ISD::SUB, dl, VT, Zero, Op));
- return true;
+ return DAG.getNode(ISD::SMIN, dl, VT, Op,
+ DAG.getNode(ISD::SUB, dl, VT, Zero, Op));
}
// Only expand vector types if we have the appropriate vector operations.
@@ -7177,20 +7201,19 @@ bool TargetLowering::expandABS(SDNode *N, SDValue &Result,
(!IsNegative && !isOperationLegalOrCustom(ISD::ADD, VT)) ||
(IsNegative && !isOperationLegalOrCustom(ISD::SUB, VT)) ||
!isOperationLegalOrCustomOrPromote(ISD::XOR, VT)))
- return false;
+ return SDValue();
SDValue Shift =
DAG.getNode(ISD::SRA, dl, VT, Op,
DAG.getConstant(VT.getScalarSizeInBits() - 1, dl, ShVT));
if (!IsNegative) {
SDValue Add = DAG.getNode(ISD::ADD, dl, VT, Op, Shift);
- Result = DAG.getNode(ISD::XOR, dl, VT, Add, Shift);
- } else {
- // 0 - abs(x) -> Y = sra (X, size(X)-1); sub (Y, xor (X, Y))
- SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, Op, Shift);
- Result = DAG.getNode(ISD::SUB, dl, VT, Shift, Xor);
+ return DAG.getNode(ISD::XOR, dl, VT, Add, Shift);
}
- return true;
+
+ // 0 - abs(x) -> Y = sra (X, size(X)-1); sub (Y, xor (X, Y))
+ SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, Op, Shift);
+ return DAG.getNode(ISD::SUB, dl, VT, Shift, Xor);
}
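The generic path above is the classic sign-splat trick; a scalar i32 sketch (illustrative C++ only, helper names invented, ignoring the INT_MIN wrap case that the DAG nodes handle modulo 2^BW):

#include <cstdint>
int32_t absSketch(int32_t X) {        // abs(x) = (x + s) ^ s
  int32_t S = X >> 31;                // SRA by BW-1: 0 or -1
  return (X + S) ^ S;
}
int32_t negAbsSketch(int32_t X) {     // 0 - abs(x) = s - (x ^ s)
  int32_t S = X >> 31;
  return S - (X ^ S);
}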
SDValue TargetLowering::expandBSWAP(SDNode *N, SelectionDAG &DAG) const {
@@ -7265,34 +7288,31 @@ SDValue TargetLowering::expandBITREVERSE(SDNode *N, SelectionDAG &DAG) const {
// TODO: We can easily support i4/i2 legal types if any target ever does.
if (Sz >= 8 && isPowerOf2_32(Sz)) {
// Create the masks - repeating the pattern every byte.
- APInt MaskHi4 = APInt::getSplat(Sz, APInt(8, 0xF0));
- APInt MaskHi2 = APInt::getSplat(Sz, APInt(8, 0xCC));
- APInt MaskHi1 = APInt::getSplat(Sz, APInt(8, 0xAA));
- APInt MaskLo4 = APInt::getSplat(Sz, APInt(8, 0x0F));
- APInt MaskLo2 = APInt::getSplat(Sz, APInt(8, 0x33));
- APInt MaskLo1 = APInt::getSplat(Sz, APInt(8, 0x55));
+ APInt Mask4 = APInt::getSplat(Sz, APInt(8, 0x0F));
+ APInt Mask2 = APInt::getSplat(Sz, APInt(8, 0x33));
+ APInt Mask1 = APInt::getSplat(Sz, APInt(8, 0x55));
// BSWAP if the type is wider than a single byte.
Tmp = (Sz > 8 ? DAG.getNode(ISD::BSWAP, dl, VT, Op) : Op);
- // swap i4: ((V & 0xF0) >> 4) | ((V & 0x0F) << 4)
- Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi4, dl, VT));
- Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo4, dl, VT));
- Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(4, dl, SHVT));
+ // swap i4: ((V >> 4) & 0x0F) | ((V & 0x0F) << 4)
+ Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp, DAG.getConstant(4, dl, SHVT));
+ Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Mask4, dl, VT));
+ Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(Mask4, dl, VT));
Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(4, dl, SHVT));
Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
- // swap i2: ((V & 0xCC) >> 2) | ((V & 0x33) << 2)
- Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi2, dl, VT));
- Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo2, dl, VT));
- Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(2, dl, SHVT));
+ // swap i2: ((V >> 2) & 0x33) | ((V & 0x33) << 2)
+ Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp, DAG.getConstant(2, dl, SHVT));
+ Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Mask2, dl, VT));
+ Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(Mask2, dl, VT));
Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(2, dl, SHVT));
Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
- // swap i1: ((V & 0xAA) >> 1) | ((V & 0x55) << 1)
- Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi1, dl, VT));
- Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo1, dl, VT));
- Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(1, dl, SHVT));
+ // swap i1: ((V >> 1) & 0x55) | ((V & 0x55) << 1)
+ Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp, DAG.getConstant(1, dl, SHVT));
+ Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Mask1, dl, VT));
+ Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(Mask1, dl, VT));
Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(1, dl, SHVT));
Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
return Tmp;
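A scalar i32 rendering of the revised sequence (illustrative C++ only; __builtin_bswap32 stands in for the BSWAP node): shifting before masking lets all three steps reuse the low-half masks, which is what allows dropping the 0xF0/0xCC/0xAA splats.

#include <cstdint>
uint32_t bitreverse32Sketch(uint32_t V) {
  V = __builtin_bswap32(V);                                  // reverse the bytes
  V = ((V >> 4) & 0x0F0F0F0Fu) | ((V & 0x0F0F0F0Fu) << 4);   // swap nibbles
  V = ((V >> 2) & 0x33333333u) | ((V & 0x33333333u) << 2);   // swap bit pairs
  V = ((V >> 1) & 0x55555555u) | ((V & 0x55555555u) << 1);   // swap adjacent bits
  return V;
}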
@@ -7802,13 +7822,15 @@ TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask,
static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, SDValue Idx,
EVT VecVT, const SDLoc &dl,
- unsigned NumSubElts) {
- if (!VecVT.isScalableVector() && isa<ConstantSDNode>(Idx))
- return Idx;
+ ElementCount SubEC) {
+ assert(!(SubEC.isScalable() && VecVT.isFixedLengthVector()) &&
+ "Cannot index a scalable vector within a fixed-width vector");
- EVT IdxVT = Idx.getValueType();
unsigned NElts = VecVT.getVectorMinNumElements();
- if (VecVT.isScalableVector()) {
+ unsigned NumSubElts = SubEC.getKnownMinValue();
+ EVT IdxVT = Idx.getValueType();
+
+ if (VecVT.isScalableVector() && !SubEC.isScalable()) {
// If this is a constant index and we know the value plus the number of
// elements in the subvector minus one is less than the minimum number of
// elements then it's safe to return Idx.
@@ -7855,16 +7877,16 @@ SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG,
unsigned EltSize = EltVT.getFixedSizeInBits() / 8; // FIXME: should be ABI size.
assert(EltSize * 8 == EltVT.getFixedSizeInBits() &&
"Converting bits to bytes lost precision");
-
- // Scalable vectors don't need clamping as these are checked at compile time
- if (SubVecVT.isFixedLengthVector()) {
- assert(SubVecVT.getVectorElementType() == EltVT &&
- "Sub-vector must be a fixed vector with matching element type");
- Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl,
- SubVecVT.getVectorNumElements());
- }
+ assert(SubVecVT.getVectorElementType() == EltVT &&
+ "Sub-vector must be a vector with matching element type");
+ Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl,
+ SubVecVT.getVectorElementCount());
EVT IdxVT = Index.getValueType();
+ if (SubVecVT.isScalableVector())
+ Index =
+ DAG.getNode(ISD::MUL, dl, IdxVT, Index,
+ DAG.getVScale(dl, IdxVT, APInt(IdxVT.getSizeInBits(), 1)));
Index = DAG.getNode(ISD::MUL, dl, IdxVT, Index,
DAG.getConstant(EltSize, dl, IdxVT));
@@ -7920,7 +7942,7 @@ SDValue TargetLowering::lowerCmpEqZeroToCtlzSrl(SDValue Op,
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SDLoc dl(Op);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
- if (C->isNullValue() && CC == ISD::SETEQ) {
+ if (C->isZero() && CC == ISD::SETEQ) {
EVT VT = Op.getOperand(0).getValueType();
SDValue Zext = Op.getOperand(0);
if (VT.bitsLT(MVT::i32)) {
@@ -7948,10 +7970,8 @@ TargetLowering::getCanonicalIndexType(ISD::MemIndexType IndexType, EVT MemVT,
(IndexType == ISD::SIGNED_SCALED) || (IndexType == ISD::SIGNED_UNSCALED);
// Scaling is unimportant for bytes, canonicalize to unscaled.
- if (IsScaledIndex && MemVT.getScalarType() == MVT::i8) {
- IsScaledIndex = false;
- IndexType = IsSignedIndex ? ISD::SIGNED_UNSCALED : ISD::UNSIGNED_UNSCALED;
- }
+ if (IsScaledIndex && MemVT.getScalarType() == MVT::i8)
+ return IsSignedIndex ? ISD::SIGNED_UNSCALED : ISD::UNSIGNED_UNSCALED;
return IndexType;
}
@@ -8072,14 +8092,12 @@ SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {
return DAG.getSelect(dl, VT, Overflow, Zero, SumDiff);
}
- // SatMax -> Overflow && SumDiff < 0
- // SatMin -> Overflow && SumDiff >= 0
+ // Overflow ? (SumDiff >> (BW - 1)) ^ MinVal : SumDiff
APInt MinVal = APInt::getSignedMinValue(BitWidth);
- APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
- SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
- SDValue SumNeg = DAG.getSetCC(dl, BoolVT, SumDiff, Zero, ISD::SETLT);
- Result = DAG.getSelect(dl, VT, SumNeg, SatMax, SatMin);
+ SDValue Shift = DAG.getNode(ISD::SRA, dl, VT, SumDiff,
+ DAG.getConstant(BitWidth - 1, dl, VT));
+ Result = DAG.getNode(ISD::XOR, dl, VT, Shift, SatMin);
return DAG.getSelect(dl, VT, Overflow, Result, SumDiff);
}
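A worked i32 instance of the new saturation arm (illustrative C++ only; the overflow test shown is the usual sign check, not how the DAG computes it): shifting the wrapped sum right by BW - 1 gives 0 or -1, and XOR with the signed minimum turns that into INT32_MIN or INT32_MAX as required.

#include <cstdint>
int32_t saddSatSketch(int32_t A, int32_t B) {
  int32_t Sum = (int32_t)((uint32_t)A + (uint32_t)B); // wrapping add
  bool Overflow = ((A ^ Sum) & (B ^ Sum)) < 0;        // both inputs disagree with the sum's sign
  int32_t Sat = (Sum >> 31) ^ INT32_MIN;              // INT32_MAX on positive overflow, INT32_MIN on negative
  return Overflow ? Sat : Sum;
}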
@@ -8154,8 +8172,11 @@ TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const {
APInt MaxVal = APInt::getSignedMaxValue(VTSize);
SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
- SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT);
- Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin);
+ // Xor the inputs; if the resulting sign bit is 0 the product will be
+ // positive, else negative.
+ SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, LHS, RHS);
+ SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Xor, Zero, ISD::SETLT);
+ Result = DAG.getSelect(dl, VT, ProdNeg, SatMin, SatMax);
return DAG.getSelect(dl, VT, Overflow, Result, Product);
} else if (!Signed && isOperationLegalOrCustom(ISD::UMULO, VT)) {
SDValue Result =
@@ -8390,7 +8411,7 @@ void TargetLowering::expandSADDSUBO(
// If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
unsigned OpcSat = IsAdd ? ISD::SADDSAT : ISD::SSUBSAT;
- if (isOperationLegalOrCustom(OpcSat, LHS.getValueType())) {
+ if (isOperationLegal(OpcSat, LHS.getValueType())) {
SDValue Sat = DAG.getNode(OpcSat, dl, LHS.getValueType(), LHS, RHS);
SDValue SetCC = DAG.getSetCC(dl, OType, Result, Sat, ISD::SETNE);
Overflow = DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType);
@@ -8443,8 +8464,8 @@ bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result,
EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
if (VT.isVector())
- WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT,
- VT.getVectorNumElements());
+ WideVT =
+ EVT::getVectorVT(*DAG.getContext(), WideVT, VT.getVectorElementCount());
SDValue BottomHalf;
SDValue TopHalf;
diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp
index c70620fd7532..7f9518e4c075 100644
--- a/llvm/lib/CodeGen/SplitKit.cpp
+++ b/llvm/lib/CodeGen/SplitKit.cpp
@@ -50,7 +50,6 @@ STATISTIC(NumFinished, "Number of splits finished");
STATISTIC(NumSimple, "Number of splits that were simple");
STATISTIC(NumCopies, "Number of copies inserted for splitting");
STATISTIC(NumRemats, "Number of rematerialized defs for splitting");
-STATISTIC(NumRepairs, "Number of invalid live ranges repaired");
//===----------------------------------------------------------------------===//
// Last Insert Point Analysis
@@ -160,7 +159,6 @@ void SplitAnalysis::clear() {
UseBlocks.clear();
ThroughBlocks.clear();
CurLI = nullptr;
- DidRepairRange = false;
}
/// analyzeUses - Count instructions, basic blocks, and loops using CurLI.
@@ -188,20 +186,7 @@ void SplitAnalysis::analyzeUses() {
UseSlots.end());
// Compute per-live block info.
- if (!calcLiveBlockInfo()) {
- // FIXME: calcLiveBlockInfo found inconsistencies in the live range.
- // I am looking at you, RegisterCoalescer!
- DidRepairRange = true;
- ++NumRepairs;
- LLVM_DEBUG(dbgs() << "*** Fixing inconsistent live interval! ***\n");
- const_cast<LiveIntervals&>(LIS)
- .shrinkToUses(const_cast<LiveInterval*>(CurLI));
- UseBlocks.clear();
- ThroughBlocks.clear();
- bool fixed = calcLiveBlockInfo();
- (void)fixed;
- assert(fixed && "Couldn't fix broken live interval");
- }
+ calcLiveBlockInfo();
LLVM_DEBUG(dbgs() << "Analyze counted " << UseSlots.size() << " instrs in "
<< UseBlocks.size() << " blocks, through "
@@ -210,11 +195,11 @@ void SplitAnalysis::analyzeUses() {
/// calcLiveBlockInfo - Fill the LiveBlocks array with information about blocks
/// where CurLI is live.
-bool SplitAnalysis::calcLiveBlockInfo() {
+void SplitAnalysis::calcLiveBlockInfo() {
ThroughBlocks.resize(MF.getNumBlockIDs());
NumThroughBlocks = NumGapBlocks = 0;
if (CurLI->empty())
- return true;
+ return;
LiveInterval::const_iterator LVI = CurLI->begin();
LiveInterval::const_iterator LVE = CurLI->end();
@@ -240,8 +225,7 @@ bool SplitAnalysis::calcLiveBlockInfo() {
ThroughBlocks.set(BI.MBB->getNumber());
// The range shouldn't end mid-block if there are no uses. This shouldn't
// happen.
- if (LVI->end < Stop)
- return false;
+ assert(LVI->end >= Stop && "range ends mid block with no uses");
} else {
// This block has uses. Find the first and last uses in the block.
BI.FirstInstr = *UseI;
@@ -312,7 +296,6 @@ bool SplitAnalysis::calcLiveBlockInfo() {
}
assert(getNumLiveBlocks() == countLiveBlocks(CurLI) && "Bad block count");
- return true;
}
unsigned SplitAnalysis::countLiveBlocks(const LiveInterval *cli) const {
@@ -529,19 +512,12 @@ SlotIndex SplitEditor::buildSingleSubRegCopy(Register FromReg, Register ToReg,
| getInternalReadRegState(!FirstCopy), SubIdx)
.addReg(FromReg, 0, SubIdx);
- BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator();
SlotIndexes &Indexes = *LIS.getSlotIndexes();
if (FirstCopy) {
Def = Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot();
} else {
CopyMI->bundleWithPred();
}
- LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubIdx);
- DestLI.refineSubRanges(Allocator, LaneMask,
- [Def, &Allocator](LiveInterval::SubRange &SR) {
- SR.createDeadDef(Def, Allocator);
- },
- Indexes, TRI);
return Def;
}
@@ -549,11 +525,11 @@ SlotIndex SplitEditor::buildCopy(Register FromReg, Register ToReg,
LaneBitmask LaneMask, MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore, bool Late, unsigned RegIdx) {
const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+ SlotIndexes &Indexes = *LIS.getSlotIndexes();
if (LaneMask.all() || LaneMask == MRI.getMaxLaneMaskForVReg(FromReg)) {
// The full vreg is copied.
MachineInstr *CopyMI =
BuildMI(MBB, InsertBefore, DebugLoc(), Desc, ToReg).addReg(FromReg);
- SlotIndexes &Indexes = *LIS.getSlotIndexes();
return Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot();
}
@@ -567,18 +543,26 @@ SlotIndex SplitEditor::buildCopy(Register FromReg, Register ToReg,
const TargetRegisterClass *RC = MRI.getRegClass(FromReg);
assert(RC == MRI.getRegClass(ToReg) && "Should have same reg class");
- SmallVector<unsigned, 8> Indexes;
+ SmallVector<unsigned, 8> SubIndexes;
// Abort if we cannot possibly implement the COPY with the given indexes.
- if (!TRI.getCoveringSubRegIndexes(MRI, RC, LaneMask, Indexes))
+ if (!TRI.getCoveringSubRegIndexes(MRI, RC, LaneMask, SubIndexes))
report_fatal_error("Impossible to implement partial COPY");
SlotIndex Def;
- for (unsigned BestIdx : Indexes) {
+ for (unsigned BestIdx : SubIndexes) {
Def = buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, BestIdx,
DestLI, Late, Def);
}
+ BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator();
+ DestLI.refineSubRanges(
+ Allocator, LaneMask,
+ [Def, &Allocator](LiveInterval::SubRange &SR) {
+ SR.createDeadDef(Def, Allocator);
+ },
+ Indexes, TRI);
+
return Def;
}
diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h
index fbcffacb49ab..902546fe16d8 100644
--- a/llvm/lib/CodeGen/SplitKit.h
+++ b/llvm/lib/CodeGen/SplitKit.h
@@ -160,14 +160,11 @@ private:
/// NumThroughBlocks - Number of live-through blocks.
unsigned NumThroughBlocks;
- /// DidRepairRange - analyze was forced to shrinkToUses().
- bool DidRepairRange;
-
// Summarize statistics by counting instructions using CurLI.
void analyzeUses();
/// calcLiveBlockInfo - Compute per-block information about CurLI.
- bool calcLiveBlockInfo();
+ void calcLiveBlockInfo();
public:
SplitAnalysis(const VirtRegMap &vrm, const LiveIntervals &lis,
@@ -177,11 +174,6 @@ public:
/// split.
void analyze(const LiveInterval *li);
- /// didRepairRange() - Returns true if CurLI was invalid and has been repaired
- /// by analyze(). This really shouldn't happen, but sometimes the coalescer
- /// can create live ranges that end in mid-air.
- bool didRepairRange() const { return DidRepairRange; }
-
/// clear - clear all data structures so SplitAnalysis is ready to analyze a
/// new interval.
void clear();
diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index 162f3aab024d..623d5da9831e 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -687,6 +687,8 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
// Walk the instructions in the block to look for start/end ops.
for (MachineInstr &MI : *MBB) {
+ if (MI.isDebugInstr())
+ continue;
if (MI.getOpcode() == TargetOpcode::LIFETIME_START ||
MI.getOpcode() == TargetOpcode::LIFETIME_END) {
int Slot = getStartOrEndSlot(MI);
diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp
index 9f229d51b985..7445f77c955d 100644
--- a/llvm/lib/CodeGen/StackProtector.cpp
+++ b/llvm/lib/CodeGen/StackProtector.cpp
@@ -148,10 +148,8 @@ bool StackProtector::ContainsProtectableArray(Type *Ty, bool &IsLarge,
return false;
bool NeedsProtector = false;
- for (StructType::element_iterator I = ST->element_begin(),
- E = ST->element_end();
- I != E; ++I)
- if (ContainsProtectableArray(*I, IsLarge, Strong, true)) {
+ for (Type *ET : ST->elements())
+ if (ContainsProtectableArray(ET, IsLarge, Strong, true)) {
// If the element is a protectable array and is large (>= SSPBufferSize)
// then we are done. If the protectable array is not large, then
// keep looking in case a subsequent element is a large array.
@@ -436,13 +434,11 @@ bool StackProtector::InsertStackProtectors() {
// protection in SDAG.
bool SupportsSelectionDAGSP =
TLI->useStackGuardXorFP() ||
- (EnableSelectionDAGSP && !TM->Options.EnableFastISel &&
- !TM->Options.EnableGlobalISel);
- AllocaInst *AI = nullptr; // Place on stack that stores the stack guard.
+ (EnableSelectionDAGSP && !TM->Options.EnableFastISel);
+ AllocaInst *AI = nullptr; // Place on stack that stores the stack guard.
- for (Function::iterator I = F->begin(), E = F->end(); I != E;) {
- BasicBlock *BB = &*I++;
- ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator());
+ for (BasicBlock &BB : llvm::make_early_inc_range(*F)) {
+ ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
if (!RI)
continue;
@@ -530,23 +526,23 @@ bool StackProtector::InsertStackProtectors() {
// Split the basic block before the return instruction.
BasicBlock *NewBB =
- BB->splitBasicBlock(CheckLoc->getIterator(), "SP_return");
+ BB.splitBasicBlock(CheckLoc->getIterator(), "SP_return");
// Update the dominator tree if we need to.
- if (DT && DT->isReachableFromEntry(BB)) {
- DT->addNewBlock(NewBB, BB);
- DT->addNewBlock(FailBB, BB);
+ if (DT && DT->isReachableFromEntry(&BB)) {
+ DT->addNewBlock(NewBB, &BB);
+ DT->addNewBlock(FailBB, &BB);
}
// Remove default branch instruction to the new BB.
- BB->getTerminator()->eraseFromParent();
+ BB.getTerminator()->eraseFromParent();
// Move the newly created basic block to the point right after the old
// basic block so that it's in the "fall through" position.
- NewBB->moveAfter(BB);
+ NewBB->moveAfter(&BB);
// Generate the stack protector instructions in the old basic block.
- IRBuilder<> B(BB);
+ IRBuilder<> B(&BB);
Value *Guard = getStackGuard(TLI, M, B);
LoadInst *LI2 = B.CreateLoad(B.getInt8PtrTy(), AI, true);
Value *Cmp = B.CreateICmpEQ(Guard, LI2);
diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp
index ebe00bd7402f..9aea5a7a8853 100644
--- a/llvm/lib/CodeGen/StackSlotColoring.cpp
+++ b/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -169,7 +169,7 @@ void StackSlotColoring::ScanForSpillSlotRefs(MachineFunction &MF) {
if (!LS->hasInterval(FI))
continue;
LiveInterval &li = LS->getInterval(FI);
- if (!MI.isDebugValue())
+ if (!MI.isDebugInstr())
li.incrementWeight(
LiveIntervals::getSpillWeight(false, true, MBFI, MI));
}
diff --git a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
index dfcec32d9537..36a02d5beb4b 100644
--- a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
+++ b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
@@ -405,7 +405,7 @@ bool SwitchCG::SwitchLowering::buildBitTests(CaseClusterVector &Clusters,
if (Low.isStrictlyPositive() && High.slt(BitWidth)) {
// Optimize the case where all the case values fit in a word without having
// to subtract minValue. In this case, we can optimize away the subtraction.
- LowBound = APInt::getNullValue(Low.getBitWidth());
+ LowBound = APInt::getZero(Low.getBitWidth());
CmpRange = High;
ContiguousRange = false;
} else {
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
index af735f2a0216..943bd18c6c8b 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -70,6 +70,12 @@ static cl::opt<unsigned> TailDupIndirectBranchSize(
"end with indirect branches."), cl::init(20),
cl::Hidden);
+static cl::opt<unsigned> TailDupJmpTableLoopSize(
+ "tail-dup-jmptable-loop-size",
+ cl::desc("Maximum loop latches to consider tail duplication that are "
+ "successors of loop header."),
+ cl::init(128), cl::Hidden);
+
static cl::opt<bool>
TailDupVerify("tail-dup-verify",
cl::desc("Verify sanity of PHI instructions during taildup"),
@@ -100,12 +106,11 @@ void TailDuplicator::initMF(MachineFunction &MFin, bool PreRegAlloc,
}
static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) {
- for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ++I) {
- MachineBasicBlock *MBB = &*I;
- SmallSetVector<MachineBasicBlock *, 8> Preds(MBB->pred_begin(),
- MBB->pred_end());
- MachineBasicBlock::iterator MI = MBB->begin();
- while (MI != MBB->end()) {
+ for (MachineBasicBlock &MBB : llvm::drop_begin(MF)) {
+ SmallSetVector<MachineBasicBlock *, 8> Preds(MBB.pred_begin(),
+ MBB.pred_end());
+ MachineBasicBlock::iterator MI = MBB.begin();
+ while (MI != MBB.end()) {
if (!MI->isPHI())
break;
for (MachineBasicBlock *PredBB : Preds) {
@@ -118,7 +123,7 @@ static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) {
}
}
if (!Found) {
- dbgs() << "Malformed PHI in " << printMBBReference(*MBB) << ": "
+ dbgs() << "Malformed PHI in " << printMBBReference(MBB) << ": "
<< *MI;
dbgs() << " missing input from predecessor "
<< printMBBReference(*PredBB) << '\n';
@@ -129,14 +134,14 @@ static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) {
for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
MachineBasicBlock *PHIBB = MI->getOperand(i + 1).getMBB();
if (CheckExtra && !Preds.count(PHIBB)) {
- dbgs() << "Warning: malformed PHI in " << printMBBReference(*MBB)
+ dbgs() << "Warning: malformed PHI in " << printMBBReference(MBB)
<< ": " << *MI;
dbgs() << " extra input from predecessor "
<< printMBBReference(*PHIBB) << '\n';
llvm_unreachable(nullptr);
}
if (PHIBB->getNumber() < 0) {
- dbgs() << "Malformed PHI in " << printMBBReference(*MBB) << ": "
+ dbgs() << "Malformed PHI in " << printMBBReference(MBB) << ": "
<< *MI;
dbgs() << " non-existing " << printMBBReference(*PHIBB) << '\n';
llvm_unreachable(nullptr);
@@ -279,18 +284,17 @@ bool TailDuplicator::tailDuplicateBlocks() {
VerifyPHIs(*MF, true);
}
- for (MachineFunction::iterator I = ++MF->begin(), E = MF->end(); I != E;) {
- MachineBasicBlock *MBB = &*I++;
-
+ for (MachineBasicBlock &MBB :
+ llvm::make_early_inc_range(llvm::drop_begin(*MF))) {
if (NumTails == TailDupLimit)
break;
- bool IsSimple = isSimpleBB(MBB);
+ bool IsSimple = isSimpleBB(&MBB);
- if (!shouldTailDuplicate(IsSimple, *MBB))
+ if (!shouldTailDuplicate(IsSimple, MBB))
continue;
- MadeChange |= tailDuplicateAndUpdate(IsSimple, MBB, nullptr);
+ MadeChange |= tailDuplicateAndUpdate(IsSimple, &MBB, nullptr);
}
if (PreRegAlloc && TailDupVerify)
@@ -565,6 +569,29 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
if (TailBB.isSuccessor(&TailBB))
return false;
+ // When doing tail-duplication with jumptable loops like:
+ // 1 -> 2 <-> 3 |
+ // \ <-> 4 |
+ // \ <-> 5 |
+ // \ <-> ... |
+ // \---> rest |
+ // a quadratic number of edges and many more loops are added to the CFG. This
+ // may cause a compile-time regression when the jump table is quite large.
+ // So set a limit on the number of jump table cases.
+ auto isLargeJumpTableLoop = [](const MachineBasicBlock &TailBB) {
+ const SmallPtrSet<const MachineBasicBlock *, 8> Preds(TailBB.pred_begin(),
+ TailBB.pred_end());
+ // Check whether the basic block has a large number of successors, all of
+ // which have only one successor, which is the basic block itself.
+ return llvm::count_if(
+ TailBB.successors(), [&](const MachineBasicBlock *SuccBB) {
+ return Preds.count(SuccBB) && SuccBB->succ_size() == 1;
+ }) > TailDupJmpTableLoopSize;
+ };
+
+ if (isLargeJumpTableLoop(TailBB))
+ return false;
+
// Set the limit on the cost to duplicate. When optimizing for size,
// duplicate only one, because one branch instruction can be eliminated to
// compensate for the duplication.
@@ -874,18 +901,15 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
// Clone the contents of TailBB into PredBB.
DenseMap<Register, RegSubRegPair> LocalVRMap;
SmallVector<std::pair<Register, RegSubRegPair>, 4> CopyInfos;
- for (MachineBasicBlock::iterator I = TailBB->begin(), E = TailBB->end();
- I != E; /* empty */) {
- MachineInstr *MI = &*I;
- ++I;
- if (MI->isPHI()) {
+ for (MachineInstr &MI : llvm::make_early_inc_range(*TailBB)) {
+ if (MI.isPHI()) {
// Replace the uses of the def of the PHI with the register coming
// from PredBB.
- processPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, true);
+ processPHI(&MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, true);
} else {
// Replace def of virtual registers with new registers, and update
// uses with PHI source register or the new registers.
- duplicateInstruction(MI, TailBB, PredBB, LocalVRMap, UsedByPhi);
+ duplicateInstruction(&MI, TailBB, PredBB, LocalVRMap, UsedByPhi);
}
}
appendCopies(PredBB, CopyInfos, Copies);
@@ -930,44 +954,56 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
// There may be a branch to the layout successor. This is unlikely but it
// happens. The correct thing to do is to remove the branch before
// duplicating the instructions in all cases.
- TII->removeBranch(*PrevBB);
- if (PreRegAlloc) {
- DenseMap<Register, RegSubRegPair> LocalVRMap;
- SmallVector<std::pair<Register, RegSubRegPair>, 4> CopyInfos;
- MachineBasicBlock::iterator I = TailBB->begin();
- // Process PHI instructions first.
- while (I != TailBB->end() && I->isPHI()) {
- // Replace the uses of the def of the PHI with the register coming
- // from PredBB.
- MachineInstr *MI = &*I++;
- processPHI(MI, TailBB, PrevBB, LocalVRMap, CopyInfos, UsedByPhi, true);
- }
+ bool RemovedBranches = TII->removeBranch(*PrevBB) != 0;
+
+ // If PrevBB still contains terminator instructions, abort the merge.
+ if (PrevBB->getFirstTerminator() == PrevBB->end()) {
+ if (PreRegAlloc) {
+ DenseMap<Register, RegSubRegPair> LocalVRMap;
+ SmallVector<std::pair<Register, RegSubRegPair>, 4> CopyInfos;
+ MachineBasicBlock::iterator I = TailBB->begin();
+ // Process PHI instructions first.
+ while (I != TailBB->end() && I->isPHI()) {
+ // Replace the uses of the def of the PHI with the register coming
+ // from PredBB.
+ MachineInstr *MI = &*I++;
+ processPHI(MI, TailBB, PrevBB, LocalVRMap, CopyInfos, UsedByPhi,
+ true);
+ }
- // Now copy the non-PHI instructions.
- while (I != TailBB->end()) {
- // Replace def of virtual registers with new registers, and update
- // uses with PHI source register or the new registers.
- MachineInstr *MI = &*I++;
- assert(!MI->isBundle() && "Not expecting bundles before regalloc!");
- duplicateInstruction(MI, TailBB, PrevBB, LocalVRMap, UsedByPhi);
- MI->eraseFromParent();
+ // Now copy the non-PHI instructions.
+ while (I != TailBB->end()) {
+ // Replace def of virtual registers with new registers, and update
+ // uses with PHI source register or the new registers.
+ MachineInstr *MI = &*I++;
+ assert(!MI->isBundle() && "Not expecting bundles before regalloc!");
+ duplicateInstruction(MI, TailBB, PrevBB, LocalVRMap, UsedByPhi);
+ MI->eraseFromParent();
+ }
+ appendCopies(PrevBB, CopyInfos, Copies);
+ } else {
+ TII->removeBranch(*PrevBB);
+ // No PHIs to worry about, just splice the instructions over.
+ PrevBB->splice(PrevBB->end(), TailBB, TailBB->begin(), TailBB->end());
}
- appendCopies(PrevBB, CopyInfos, Copies);
- } else {
- TII->removeBranch(*PrevBB);
- // No PHIs to worry about, just splice the instructions over.
- PrevBB->splice(PrevBB->end(), TailBB, TailBB->begin(), TailBB->end());
- }
- PrevBB->removeSuccessor(PrevBB->succ_begin());
- assert(PrevBB->succ_empty());
- PrevBB->transferSuccessors(TailBB);
+ PrevBB->removeSuccessor(PrevBB->succ_begin());
+ assert(PrevBB->succ_empty());
+ PrevBB->transferSuccessors(TailBB);
- // Update branches in PrevBB based on Tail's layout successor.
- if (ShouldUpdateTerminators)
- PrevBB->updateTerminator(TailBB->getNextNode());
+ // Update branches in PrevBB based on Tail's layout successor.
+ if (ShouldUpdateTerminators)
+ PrevBB->updateTerminator(TailBB->getNextNode());
- TDBBs.push_back(PrevBB);
- Changed = true;
+ TDBBs.push_back(PrevBB);
+ Changed = true;
+ } else {
+ LLVM_DEBUG(dbgs() << "Abort merging blocks, the predecessor still "
+ "contains terminator instructions");
+ // Return early if no changes were made
+ if (!Changed)
+ return RemovedBranches;
+ }
+ Changed |= RemovedBranches;
}
// If this is after register allocation, there are no phis to fix.
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 2e4a656ea0c8..e74b3195a130 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -366,7 +366,7 @@ bool TargetInstrInfo::hasLoadFromStackSlot(
oe = MI.memoperands_end();
o != oe; ++o) {
if ((*o)->isLoad() &&
- dyn_cast_or_null<FixedStackPseudoSourceValue>((*o)->getPseudoValue()))
+ isa_and_nonnull<FixedStackPseudoSourceValue>((*o)->getPseudoValue()))
Accesses.push_back(*o);
}
return Accesses.size() != StartSize;
@@ -380,7 +380,7 @@ bool TargetInstrInfo::hasStoreToStackSlot(
oe = MI.memoperands_end();
o != oe; ++o) {
if ((*o)->isStore() &&
- dyn_cast_or_null<FixedStackPseudoSourceValue>((*o)->getPseudoValue()))
+ isa_and_nonnull<FixedStackPseudoSourceValue>((*o)->getPseudoValue()))
Accesses.push_back(*o);
}
return Accesses.size() != StartSize;
@@ -1264,22 +1264,6 @@ int TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
}
-/// If we can determine the operand latency from the def only, without itinerary
-/// lookup, do so. Otherwise return -1.
-int TargetInstrInfo::computeDefOperandLatency(
- const InstrItineraryData *ItinData, const MachineInstr &DefMI) const {
-
- // Let the target hook getInstrLatency handle missing itineraries.
- if (!ItinData)
- return getInstrLatency(ItinData, DefMI);
-
- if(ItinData->isEmpty())
- return defaultDefLatency(ItinData->SchedModel, DefMI);
-
- // ...operand lookup required
- return -1;
-}
-
bool TargetInstrInfo::getRegSequenceInputs(
const MachineInstr &MI, unsigned DefIdx,
SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const {
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 3c5dd29036db..c0a7efff9e98 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -52,6 +52,7 @@
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include <algorithm>
#include <cassert>
@@ -236,6 +237,8 @@ RTLIB::Libcall RTLIB::getFPEXT(EVT OpVT, EVT RetVT) {
return FPEXT_F16_F32;
if (RetVT == MVT::f64)
return FPEXT_F16_F64;
+ if (RetVT == MVT::f80)
+ return FPEXT_F16_F80;
if (RetVT == MVT::f128)
return FPEXT_F16_F128;
} else if (OpVT == MVT::f32) {
@@ -659,7 +662,7 @@ RTLIB::Libcall RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) {
/// InitCmpLibcallCCs - Set default comparison libcall CC.
static void InitCmpLibcallCCs(ISD::CondCode *CCs) {
- memset(CCs, ISD::SETCC_INVALID, sizeof(ISD::CondCode)*RTLIB::UNKNOWN_LIBCALL);
+ std::fill(CCs, CCs + RTLIB::UNKNOWN_LIBCALL, ISD::SETCC_INVALID);
CCs[RTLIB::OEQ_F32] = ISD::SETEQ;
CCs[RTLIB::OEQ_F64] = ISD::SETEQ;
CCs[RTLIB::OEQ_F128] = ISD::SETEQ;
@@ -896,8 +899,6 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FRINT, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
- setOperationAction(ISD::FROUND, VT, Expand);
- setOperationAction(ISD::FROUNDEVEN, VT, Expand);
setOperationAction(ISD::LROUND, VT, Expand);
setOperationAction(ISD::LLROUND, VT, Expand);
setOperationAction(ISD::LRINT, VT, Expand);
@@ -924,8 +925,15 @@ EVT TargetLoweringBase::getShiftAmountTy(EVT LHSTy, const DataLayout &DL,
assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
if (LHSTy.isVector())
return LHSTy;
- return LegalTypes ? getScalarShiftAmountTy(DL, LHSTy)
- : getPointerTy(DL);
+ MVT ShiftVT =
+ LegalTypes ? getScalarShiftAmountTy(DL, LHSTy) : getPointerTy(DL);
+ // If any possible shift value won't fit in the preferred type, just use
+ // something safe. Assume it will be legalized when the shift is expanded.
+ if (ShiftVT.getSizeInBits() < Log2_32_Ceil(LHSTy.getSizeInBits()))
+ ShiftVT = MVT::i32;
+ assert(ShiftVT.getSizeInBits() >= Log2_32_Ceil(LHSTy.getSizeInBits()) &&
+ "ShiftVT is still too small!");
+ return ShiftVT;
}
bool TargetLoweringBase::canOpTrap(unsigned Op, EVT VT) const {
@@ -1556,7 +1564,7 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context,
// Scalable vectors cannot be scalarized, so handle the legalisation of the
// types as is done elsewhere in SelectionDAG.
- if (VT.isScalableVector() && !isPowerOf2_32(EltCnt.getKnownMinValue())) {
+ if (EltCnt.isScalable()) {
LegalizeKind LK;
EVT PartVT = VT;
do {
@@ -1565,16 +1573,14 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context,
PartVT = LK.second;
} while (LK.first != TypeLegal);
- NumIntermediates = VT.getVectorElementCount().getKnownMinValue() /
- PartVT.getVectorElementCount().getKnownMinValue();
+ if (!PartVT.isVector()) {
+ report_fatal_error(
+ "Don't know how to legalize this scalable vector type");
+ }
- // FIXME: This code needs to be extended to handle more complex vector
- // breakdowns, like nxv7i64 -> nxv8i64 -> 4 x nxv2i64. Currently the only
- // supported cases are vectors that are broken down into equal parts
- // such as nxv6i64 -> 3 x nxv2i64.
- assert((PartVT.getVectorElementCount() * NumIntermediates) ==
- VT.getVectorElementCount() &&
- "Expected an integer multiple of PartVT");
+ NumIntermediates =
+ divideCeil(VT.getVectorElementCount().getKnownMinValue(),
+ PartVT.getVectorElementCount().getKnownMinValue());
IntermediateVT = PartVT;
RegisterVT = getRegisterType(Context, IntermediateVT);
return NumIntermediates;
@@ -1657,9 +1663,9 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType,
EVT VT = ValueVTs[j];
ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
- if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
+ if (attr.hasRetAttr(Attribute::SExt))
ExtendKind = ISD::SIGN_EXTEND;
- else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt))
+ else if (attr.hasRetAttr(Attribute::ZExt))
ExtendKind = ISD::ZERO_EXTEND;
// FIXME: C calling convention requires the return type to be promoted to
@@ -1679,13 +1685,13 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType,
// 'inreg' on function refers to return value
ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
- if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::InReg))
+ if (attr.hasRetAttr(Attribute::InReg))
Flags.setInReg();
// Propagate extension type if any
- if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
+ if (attr.hasRetAttr(Attribute::SExt))
Flags.setSExt();
- else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt))
+ else if (attr.hasRetAttr(Attribute::ZExt))
Flags.setZExt();
for (unsigned i = 0; i < NumParts; ++i)
@@ -1696,7 +1702,7 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType,
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. This is the actual
/// alignment, not its logarithm.
-unsigned TargetLoweringBase::getByValTypeAlignment(Type *Ty,
+uint64_t TargetLoweringBase::getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const {
return DL.getABITypeAlign(Ty).value();
}
@@ -1749,8 +1755,9 @@ bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
const DataLayout &DL, LLT Ty,
const MachineMemOperand &MMO,
bool *Fast) const {
- return allowsMemoryAccess(Context, DL, getMVTForLLT(Ty), MMO.getAddrSpace(),
- MMO.getAlign(), MMO.getFlags(), Fast);
+ EVT VT = getApproximateEVTForLLT(Ty, DL, Context);
+ return allowsMemoryAccess(Context, DL, VT, MMO.getAddrSpace(), MMO.getAlign(),
+ MMO.getFlags(), Fast);
}
//===----------------------------------------------------------------------===//
@@ -1849,8 +1856,12 @@ TargetLoweringBase::getTypeLegalizationCost(const DataLayout &DL,
while (true) {
LegalizeKind LK = getTypeConversion(C, MTy);
- if (LK.first == TypeScalarizeScalableVector)
- return std::make_pair(InstructionCost::getInvalid(), MVT::getVT(Ty));
+ if (LK.first == TypeScalarizeScalableVector) {
+ // Ensure we return a sensible simple VT here, since many callers of this
+ // function require it.
+ MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
+ return std::make_pair(InstructionCost::getInvalid(), VT);
+ }
if (LK.first == TypeLegal)
return std::make_pair(Cost, MTy.getSimpleVT());
@@ -1980,8 +1991,11 @@ void TargetLoweringBase::insertSSPDeclarations(Module &M) const {
auto *GV = new GlobalVariable(M, Type::getInt8PtrTy(M.getContext()), false,
GlobalVariable::ExternalLinkage, nullptr,
"__stack_chk_guard");
+
+ // FreeBSD has "__stack_chk_guard" defined externally in libc.so
if (TM.getRelocationModel() == Reloc::Static &&
- !TM.getTargetTriple().isWindowsGNUEnvironment())
+ !TM.getTargetTriple().isWindowsGNUEnvironment() &&
+ !TM.getTargetTriple().isOSFreeBSD())
GV->setDSOLocal(true);
}
}
@@ -2020,6 +2034,12 @@ bool TargetLoweringBase::isJumpTableRelative() const {
return getTargetMachine().isPositionIndependent();
}
+Align TargetLoweringBase::getPrefLoopAlignment(MachineLoop *ML) const {
+ if (TM.Options.LoopAlignment)
+ return Align(TM.Options.LoopAlignment);
+ return PrefLoopAlignment;
+}
+
//===----------------------------------------------------------------------===//
// Reciprocal Estimates
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index add34eccc1f3..1d3bb286c882 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -677,8 +677,9 @@ calcUniqueIDUpdateFlagsAndSize(const GlobalObject *GO, StringRef SectionName,
}
if (Retain) {
- if (Ctx.getAsmInfo()->useIntegratedAssembler() ||
- Ctx.getAsmInfo()->binutilsIsAtLeast(2, 36))
+ if ((Ctx.getAsmInfo()->useIntegratedAssembler() ||
+ Ctx.getAsmInfo()->binutilsIsAtLeast(2, 36)) &&
+ !TM.getTargetTriple().isOSSolaris())
Flags |= ELF::SHF_GNU_RETAIN;
return NextUniqueID++;
}
@@ -855,8 +856,10 @@ static MCSection *selectELFSectionForGlobal(
EmitUniqueSection = true;
Flags |= ELF::SHF_LINK_ORDER;
}
- if (Retain && (Ctx.getAsmInfo()->useIntegratedAssembler() ||
- Ctx.getAsmInfo()->binutilsIsAtLeast(2, 36))) {
+ if (Retain &&
+ (Ctx.getAsmInfo()->useIntegratedAssembler() ||
+ Ctx.getAsmInfo()->binutilsIsAtLeast(2, 36)) &&
+ !TM.getTargetTriple().isOSSolaris()) {
EmitUniqueSection = true;
Flags |= ELF::SHF_GNU_RETAIN;
}
@@ -1492,7 +1495,7 @@ void TargetLoweringObjectFileMachO::getNameWithPrefix(
SmallVectorImpl<char> &OutName, const GlobalValue *GV,
const TargetMachine &TM) const {
bool CannotUsePrivateLabel = true;
- if (auto *GO = GV->getBaseObject()) {
+ if (auto *GO = GV->getAliaseeObject()) {
SectionKind GOKind = TargetLoweringObjectFile::getKindForGlobal(GO, TM);
const MCSection *TheSection = SectionForGlobal(GO, GOKind, TM);
CannotUsePrivateLabel =
@@ -1563,7 +1566,7 @@ static int getSelectionForCOFF(const GlobalValue *GV) {
if (const Comdat *C = GV->getComdat()) {
const GlobalValue *ComdatKey = getComdatGVForCOFF(GV);
if (const auto *GA = dyn_cast<GlobalAlias>(ComdatKey))
- ComdatKey = GA->getBaseObject();
+ ComdatKey = GA->getAliaseeObject();
if (ComdatKey == GV) {
switch (C->getSelectionKind()) {
case Comdat::Any:
@@ -1942,7 +1945,7 @@ static std::string APIntToHexString(const APInt &AI) {
static std::string scalarConstantToHexString(const Constant *C) {
Type *Ty = C->getType();
if (isa<UndefValue>(C)) {
- return APIntToHexString(APInt::getNullValue(Ty->getPrimitiveSizeInBits()));
+ return APIntToHexString(APInt::getZero(Ty->getPrimitiveSizeInBits()));
} else if (const auto *CFP = dyn_cast<ConstantFP>(C)) {
return APIntToHexString(CFP->getValueAPF().bitcastToAPInt());
} else if (const auto *CI = dyn_cast<ConstantInt>(C)) {
@@ -2414,7 +2417,20 @@ bool TargetLoweringObjectFileXCOFF::shouldPutJumpTableInFunctionSection(
MCSection *TargetLoweringObjectFileXCOFF::getSectionForConstant(
const DataLayout &DL, SectionKind Kind, const Constant *C,
Align &Alignment) const {
- //TODO: Enable emiting constant pool to unique sections when we support it.
+ // TODO: Enable emitting constant pool to unique sections when we support it.
+ if (Alignment > Align(16))
+ report_fatal_error("Alignments greater than 16 not yet supported.");
+
+ if (Alignment == Align(8)) {
+ assert(ReadOnly8Section && "Section should always be initialized.");
+ return ReadOnly8Section;
+ }
+
+ if (Alignment == Align(16)) {
+ assert(ReadOnly16Section && "Section should always be initialized.");
+ return ReadOnly16Section;
+ }
+
return ReadOnlySection;
}
@@ -2443,7 +2459,8 @@ MCSection *TargetLoweringObjectFileXCOFF::getStaticDtorSection(
const MCExpr *TargetLoweringObjectFileXCOFF::lowerRelativeReference(
const GlobalValue *LHS, const GlobalValue *RHS,
const TargetMachine &TM) const {
- report_fatal_error("XCOFF not yet implemented.");
+ /* Not implemented yet, but don't crash; just return nullptr. */
+ return nullptr;
}
XCOFF::StorageClass
@@ -2473,12 +2490,12 @@ TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(const GlobalValue *GV) {
MCSymbol *TargetLoweringObjectFileXCOFF::getFunctionEntryPointSymbol(
const GlobalValue *Func, const TargetMachine &TM) const {
- assert(
- (isa<Function>(Func) ||
- (isa<GlobalAlias>(Func) &&
- isa_and_nonnull<Function>(cast<GlobalAlias>(Func)->getBaseObject()))) &&
- "Func must be a function or an alias which has a function as base "
- "object.");
+ assert((isa<Function>(Func) ||
+ (isa<GlobalAlias>(Func) &&
+ isa_and_nonnull<Function>(
+ cast<GlobalAlias>(Func)->getAliaseeObject()))) &&
+ "Func must be a function or an alias which has a function as base "
+ "object.");
SmallString<128> NameStr;
NameStr.push_back('.');
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 4024fd452fc4..402e21d3708b 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -172,6 +172,24 @@ static cl::opt<bool>
FSNoFinalDiscrim("fs-no-final-discrim", cl::init(false), cl::Hidden,
cl::desc("Do not insert FS-AFDO discriminators before "
"emit."));
+// Disable MIRProfileLoader before RegAlloc. This is for debugging and
+// tuning purposes.
+static cl::opt<bool> DisableRAFSProfileLoader(
+ "disable-ra-fsprofile-loader", cl::init(true), cl::Hidden,
+ cl::desc("Disable MIRProfileLoader before RegAlloc"));
+// Disable MIRProfileLoader before BlockPlacement. This is for debugging
+// and tuning purposes.
+static cl::opt<bool> DisableLayoutFSProfileLoader(
+ "disable-layout-fsprofile-loader", cl::init(true), cl::Hidden,
+ cl::desc("Disable MIRProfileLoader before BlockPlacement"));
+// Specify FSProfile file name.
+static cl::opt<std::string>
+ FSProfileFile("fs-profile-file", cl::init(""), cl::value_desc("filename"),
+ cl::desc("Flow Sensitive profile file name."), cl::Hidden);
+// Specify Remapping file for FSProfile.
+static cl::opt<std::string> FSRemappingFile(
+ "fs-remapping-file", cl::init(""), cl::value_desc("filename"),
+ cl::desc("Flow Sensitive profile remapping file name."), cl::Hidden);
// Temporary option to allow experimenting with MachineScheduler as a post-RA
// scheduler. Targets can "properly" enable this with
@@ -308,6 +326,28 @@ static IdentifyingPassPtr overridePass(AnalysisID StandardID,
return TargetID;
}
+// Find the FSProfile file name. The internal option takes precedence
+// over the value from TargetMachine.
+static const std::string getFSProfileFile(const TargetMachine *TM) {
+ if (!FSProfileFile.empty())
+ return FSProfileFile.getValue();
+ const Optional<PGOOptions> &PGOOpt = TM->getPGOOption();
+ if (PGOOpt == None || PGOOpt->Action != PGOOptions::SampleUse)
+ return std::string();
+ return PGOOpt->ProfileFile;
+}
+
+// Find the Profile remapping file name. The internal option takes
+// precedence over the value from TargetMachine.
+static const std::string getFSRemappingFile(const TargetMachine *TM) {
+ if (!FSRemappingFile.empty())
+ return FSRemappingFile.getValue();
+ const Optional<PGOOptions> &PGOOpt = TM->getPGOOption();
+ if (PGOOpt == None || PGOOpt->Action != PGOOptions::SampleUse)
+ return std::string();
+ return PGOOpt->ProfileRemappingFile;
+}
+
//===---------------------------------------------------------------------===//
/// TargetPassConfig
//===---------------------------------------------------------------------===//
@@ -321,12 +361,9 @@ namespace {
struct InsertedPass {
AnalysisID TargetPassID;
IdentifyingPassPtr InsertedPassID;
- bool VerifyAfter;
- InsertedPass(AnalysisID TargetPassID, IdentifyingPassPtr InsertedPassID,
- bool VerifyAfter)
- : TargetPassID(TargetPassID), InsertedPassID(InsertedPassID),
- VerifyAfter(VerifyAfter) {}
+ InsertedPass(AnalysisID TargetPassID, IdentifyingPassPtr InsertedPassID)
+ : TargetPassID(TargetPassID), InsertedPassID(InsertedPassID) {}
Pass *getInsertedPass() const {
assert(InsertedPassID.isValid() && "Illegal Pass ID!");
@@ -601,14 +638,13 @@ CodeGenOpt::Level TargetPassConfig::getOptLevel() const {
/// Insert InsertedPassID pass after TargetPassID.
void TargetPassConfig::insertPass(AnalysisID TargetPassID,
- IdentifyingPassPtr InsertedPassID,
- bool VerifyAfter) {
+ IdentifyingPassPtr InsertedPassID) {
assert(((!InsertedPassID.isInstance() &&
TargetPassID != InsertedPassID.getID()) ||
(InsertedPassID.isInstance() &&
TargetPassID != InsertedPassID.getInstance()->getPassID())) &&
"Insert a pass after itself!");
- Impl->InsertedPasses.emplace_back(TargetPassID, InsertedPassID, VerifyAfter);
+ Impl->InsertedPasses.emplace_back(TargetPassID, InsertedPassID);
}
/// createPassConfig - Create a pass configuration object to be used by
@@ -686,7 +722,7 @@ bool TargetPassConfig::isPassSubstitutedOrOverridden(AnalysisID ID) const {
/// a later pass or that it should stop after an earlier pass, then do not add
/// the pass. Finally, compare the current pass against the StartAfter
/// and StopAfter options and change the Started/Stopped flags accordingly.
-void TargetPassConfig::addPass(Pass *P, bool verifyAfter) {
+void TargetPassConfig::addPass(Pass *P) {
assert(!Initialized && "PassConfig is immutable");
// Cache the Pass ID here in case the pass manager finds this pass is
@@ -704,16 +740,16 @@ void TargetPassConfig::addPass(Pass *P, bool verifyAfter) {
addMachinePrePasses();
std::string Banner;
// Construct banner message before PM->add() as that may delete the pass.
- if (AddingMachinePasses && verifyAfter)
+ if (AddingMachinePasses)
Banner = std::string("After ") + std::string(P->getPassName());
PM->add(P);
if (AddingMachinePasses)
- addMachinePostPasses(Banner, /*AllowVerify*/ verifyAfter);
+ addMachinePostPasses(Banner);
// Add the passes after the pass P if there are any.
for (const auto &IP : Impl->InsertedPasses) {
if (IP.TargetPassID == PassID)
- addPass(IP.getInsertedPass(), IP.VerifyAfter);
+ addPass(IP.getInsertedPass());
}
} else {
delete P;
@@ -733,7 +769,7 @@ void TargetPassConfig::addPass(Pass *P, bool verifyAfter) {
///
/// addPass cannot return a pointer to the pass instance because it is internal
/// to the PassManager and the instance we create here may already be freed.
-AnalysisID TargetPassConfig::addPass(AnalysisID PassID, bool verifyAfter) {
+AnalysisID TargetPassConfig::addPass(AnalysisID PassID) {
IdentifyingPassPtr TargetID = getPassSubstitution(PassID);
IdentifyingPassPtr FinalPtr = overridePass(PassID, TargetID);
if (!FinalPtr.isValid())
@@ -748,7 +784,7 @@ AnalysisID TargetPassConfig::addPass(AnalysisID PassID, bool verifyAfter) {
llvm_unreachable("Pass ID not registered");
}
AnalysisID FinalID = P->getPassID();
- addPass(P, verifyAfter); // Ends the lifetime of P.
+ addPass(P); // Ends the lifetime of P.
return FinalID;
}
@@ -792,8 +828,7 @@ void TargetPassConfig::addMachinePrePasses(bool AllowDebugify) {
addDebugifyPass();
}
-void TargetPassConfig::addMachinePostPasses(const std::string &Banner,
- bool AllowVerify, bool AllowStrip) {
+void TargetPassConfig::addMachinePostPasses(const std::string &Banner) {
if (DebugifyIsSafe) {
if (DebugifyCheckAndStripAll == cl::BOU_TRUE) {
addCheckDebugPass();
@@ -801,8 +836,7 @@ void TargetPassConfig::addMachinePostPasses(const std::string &Banner,
} else if (DebugifyAndStripAll == cl::BOU_TRUE)
addStripDebugPass();
}
- if (AllowVerify)
- addVerifyPass(Banner);
+ addVerifyPass(Banner);
}
/// Add common target configurable passes that perform LLVM IR to IR transforms
@@ -1113,6 +1147,18 @@ void TargetPassConfig::addMachinePasses() {
// where it becomes safe again so stop debugifying here.
DebugifyIsSafe = false;
+ // Add an FSDiscriminator pass right before RA, so that we can get
+ // a more precise SampleFDO profile for RA.
+ if (EnableFSDiscriminator) {
+ addPass(createMIRAddFSDiscriminatorsPass(
+ sampleprof::FSDiscriminatorPass::Pass1));
+ const std::string ProfileFile = getFSProfileFile(TM);
+ if (!ProfileFile.empty() && !DisableRAFSProfileLoader)
+ addPass(
+ createMIRProfileLoaderPass(ProfileFile, getFSRemappingFile(TM),
+ sampleprof::FSDiscriminatorPass::Pass1));
+ }
+
// Run register allocation and passes that are tightly coupled with it,
// including phi elimination and scheduling.
if (getOptimizeRegAlloc())
@@ -1123,7 +1169,7 @@ void TargetPassConfig::addMachinePasses() {
// Run post-ra passes.
addPostRegAlloc();
- addPass(&RemoveRedundantDebugValuesID, false);
+ addPass(&RemoveRedundantDebugValuesID);
addPass(&FixupStatepointCallerSavedID);
@@ -1165,7 +1211,7 @@ void TargetPassConfig::addMachinePasses() {
// GC
if (addGCPasses()) {
if (PrintGCInfo)
- addPass(createGCInfoPrinter(dbgs()), false);
+ addPass(createGCInfoPrinter(dbgs()));
}
// Basic block placement.
@@ -1195,10 +1241,10 @@ void TargetPassConfig::addMachinePasses() {
// FIXME: Some backends are incompatible with running the verifier after
// addPreEmitPass. Maybe only pass "false" here for those targets?
- addPass(&FuncletLayoutID, false);
+ addPass(&FuncletLayoutID);
- addPass(&StackMapLivenessID, false);
- addPass(&LiveDebugValuesID, false);
+ addPass(&StackMapLivenessID);
+ addPass(&LiveDebugValuesID);
if (TM->Options.EnableMachineOutliner && getOptLevel() != CodeGenOpt::None &&
EnableMachineOutliner != RunOutliner::NeverOutline) {
@@ -1224,10 +1270,6 @@ void TargetPassConfig::addMachinePasses() {
// Add passes that directly emit MI after all other MI passes.
addPreEmitPass2();
- // Insert pseudo probe annotation for callsite profiling
- if (TM->Options.PseudoProbeForProfiling)
- addPass(createPseudoProbeInserter());
-
AddingMachinePasses = false;
}
@@ -1369,8 +1411,8 @@ bool TargetPassConfig::usingDefaultRegAlloc() const {
/// Add the minimum set of target-independent passes that are required for
/// register allocation. No coalescing or scheduling.
void TargetPassConfig::addFastRegAlloc() {
- addPass(&PHIEliminationID, false);
- addPass(&TwoAddressInstructionPassID, false);
+ addPass(&PHIEliminationID);
+ addPass(&TwoAddressInstructionPassID);
addRegAssignAndRewriteFast();
}
@@ -1379,9 +1421,9 @@ void TargetPassConfig::addFastRegAlloc() {
/// optimized register allocation, including coalescing, machine instruction
/// scheduling, and register allocation itself.
void TargetPassConfig::addOptimizedRegAlloc() {
- addPass(&DetectDeadLanesID, false);
+ addPass(&DetectDeadLanesID);
- addPass(&ProcessImplicitDefsID, false);
+ addPass(&ProcessImplicitDefsID);
// LiveVariables currently requires pure SSA form.
//
@@ -1393,18 +1435,18 @@ void TargetPassConfig::addOptimizedRegAlloc() {
// When LiveVariables is removed this has to be removed/moved as well.
// Explicit addition of UnreachableMachineBlockElim allows stopping before or
// after it with -stop-before/-stop-after.
- addPass(&UnreachableMachineBlockElimID, false);
- addPass(&LiveVariablesID, false);
+ addPass(&UnreachableMachineBlockElimID);
+ addPass(&LiveVariablesID);
// Edge splitting is smarter with machine loop info.
- addPass(&MachineLoopInfoID, false);
- addPass(&PHIEliminationID, false);
+ addPass(&MachineLoopInfoID);
+ addPass(&PHIEliminationID);
// Eventually, we want to run LiveIntervals before PHI elimination.
if (EarlyLiveIntervals)
- addPass(&LiveIntervalsID, false);
+ addPass(&LiveIntervalsID);
- addPass(&TwoAddressInstructionPassID, false);
+ addPass(&TwoAddressInstructionPassID);
addPass(&RegisterCoalescerID);
// The machine scheduler may accidentally create disconnected components
@@ -1417,9 +1459,6 @@ void TargetPassConfig::addOptimizedRegAlloc() {
if (addRegAssignAndRewriteOptimized()) {
// Perform stack slot coloring and post-ra machine LICM.
- //
- // FIXME: Re-enable coloring with register when it's capable of adding
- // kill markers.
addPass(&StackSlotColoringID);
// Allow targets to expand pseudo instructions depending on the choice of
@@ -1459,12 +1498,21 @@ void TargetPassConfig::addMachineLateOptimization() {
/// Add standard GC passes.
bool TargetPassConfig::addGCPasses() {
- addPass(&GCMachineCodeAnalysisID, false);
+ addPass(&GCMachineCodeAnalysisID);
return true;
}
/// Add standard basic block placement passes.
void TargetPassConfig::addBlockPlacement() {
+ if (EnableFSDiscriminator) {
+ addPass(createMIRAddFSDiscriminatorsPass(
+ sampleprof::FSDiscriminatorPass::Pass2));
+ const std::string ProfileFile = getFSProfileFile(TM);
+ if (!ProfileFile.empty() && !DisableLayoutFSProfileLoader)
+ addPass(
+ createMIRProfileLoaderPass(ProfileFile, getFSRemappingFile(TM),
+ sampleprof::FSDiscriminatorPass::Pass2));
+ }
if (addPass(&MachineBlockPlacementID)) {
// Run a separate pass to collect block placement statistics.
if (EnableBlockPlacementStats)
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 1664b4dadfec..46cec5407565 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -118,6 +118,8 @@ class TwoAddressInstructionPass : public MachineFunctionPass {
// registers. e.g. r1 = move v1024.
DenseMap<Register, Register> DstRegMap;
+ void removeClobberedSrcRegMap(MachineInstr *MI);
+
bool isRevCopyChain(Register FromReg, Register ToReg, int Maxlen);
bool noUseAfterLastDef(Register Reg, unsigned Dist, unsigned &LastDef);
@@ -132,7 +134,7 @@ class TwoAddressInstructionPass : public MachineFunctionPass {
bool convertInstTo3Addr(MachineBasicBlock::iterator &mi,
MachineBasicBlock::iterator &nmi, Register RegA,
- Register RegB, unsigned Dist);
+ Register RegB, unsigned &Dist);
bool isDefTooClose(Register Reg, unsigned Dist, MachineInstr *MI);
@@ -144,7 +146,7 @@ class TwoAddressInstructionPass : public MachineFunctionPass {
bool tryInstructionTransform(MachineBasicBlock::iterator &mi,
MachineBasicBlock::iterator &nmi,
unsigned SrcIdx, unsigned DstIdx,
- unsigned Dist, bool shouldOnlyCommute);
+ unsigned &Dist, bool shouldOnlyCommute);
bool tryInstructionCommute(MachineInstr *MI,
unsigned DstOpIdx,
@@ -380,7 +382,8 @@ findOnlyInterestingUse(Register Reg, MachineBasicBlock *MBB,
if (!MRI->hasOneNonDBGUse(Reg))
// None or more than one use.
return nullptr;
- MachineInstr &UseMI = *MRI->use_instr_nodbg_begin(Reg);
+ MachineOperand &UseOp = *MRI->use_nodbg_begin(Reg);
+ MachineInstr &UseMI = *UseOp.getParent();
if (UseMI.getParent() != MBB)
return nullptr;
Register SrcReg;
@@ -394,6 +397,18 @@ findOnlyInterestingUse(Register Reg, MachineBasicBlock *MBB,
IsDstPhys = DstReg.isPhysical();
return &UseMI;
}
+ if (UseMI.isCommutable()) {
+ unsigned Src1 = TargetInstrInfo::CommuteAnyOperandIndex;
+ unsigned Src2 = UseMI.getOperandNo(&UseOp);
+ if (TII->findCommutedOpIndices(UseMI, Src1, Src2)) {
+ MachineOperand &MO = UseMI.getOperand(Src1);
+ if (MO.isReg() && MO.isUse() &&
+ isTwoAddrUse(UseMI, MO.getReg(), DstReg)) {
+ IsDstPhys = DstReg.isPhysical();
+ return &UseMI;
+ }
+ }
+ }
return nullptr;
}
@@ -422,6 +437,76 @@ static bool regsAreCompatible(Register RegA, Register RegB,
return TRI->regsOverlap(RegA, RegB);
}
+/// From RegMap remove entries mapped to a physical register which overlaps MO.
+static void removeMapRegEntry(const MachineOperand &MO,
+ DenseMap<Register, Register> &RegMap,
+ const TargetRegisterInfo *TRI) {
+ assert(
+ (MO.isReg() || MO.isRegMask()) &&
+ "removeMapRegEntry must be called with a register or regmask operand.");
+
+ SmallVector<Register, 2> Srcs;
+ for (auto SI : RegMap) {
+ Register ToReg = SI.second;
+ if (ToReg.isVirtual())
+ continue;
+
+ if (MO.isReg()) {
+ Register Reg = MO.getReg();
+ if (TRI->regsOverlap(ToReg, Reg))
+ Srcs.push_back(SI.first);
+ } else if (MO.clobbersPhysReg(ToReg))
+ Srcs.push_back(SI.first);
+ }
+
+ for (auto SrcReg : Srcs)
+ RegMap.erase(SrcReg);
+}
+
+/// If a physical register is clobbered, old entries mapped to it should be
+/// deleted. For example
+///
+/// %2:gr64 = COPY killed $rdx
+/// MUL64r %3:gr64, implicit-def $rax, implicit-def $rdx
+///
+/// After the MUL instruction, $rdx contains a different value than in the COPY
+/// instruction. So %2 should not map to $rdx after MUL.
+void TwoAddressInstructionPass::removeClobberedSrcRegMap(MachineInstr *MI) {
+ if (MI->isCopy()) {
+ // If a virtual register is copied to its mapped physical register, it
+ // doesn't change the potential coalescing between them, so we don't remove
+ // entries mapped to the physical register. For example
+ //
+ // %100 = COPY $r8
+ // ...
+ // $r8 = COPY %100
+ //
+ // The first copy constructs SrcRegMap[%100] = $r8, the second copy doesn't
+ // destroy the content of $r8, and should not impact SrcRegMap.
+ Register Dst = MI->getOperand(0).getReg();
+ if (!Dst || Dst.isVirtual())
+ return;
+
+ Register Src = MI->getOperand(1).getReg();
+ if (regsAreCompatible(Dst, getMappedReg(Src, SrcRegMap), TRI))
+ return;
+ }
+
+ for (unsigned i = 0, NumOps = MI->getNumOperands(); i != NumOps; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isRegMask()) {
+ removeMapRegEntry(MO, SrcRegMap, TRI);
+ continue;
+ }
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg || Reg.isVirtual())
+ continue;
+ removeMapRegEntry(MO, SrcRegMap, TRI);
+ }
+}
+
// Returns true if Reg is equal or aliased to at least one register in Set.
static bool regOverlapsSet(const SmallVectorImpl<Register> &Set, Register Reg,
const TargetRegisterInfo *TRI) {
@@ -589,21 +674,15 @@ bool TwoAddressInstructionPass::isProfitableToConv3Addr(Register RegA,
/// Return true if this transformation was successful.
bool TwoAddressInstructionPass::convertInstTo3Addr(
MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi,
- Register RegA, Register RegB, unsigned Dist) {
- // FIXME: Why does convertToThreeAddress() need an iterator reference?
- MachineFunction::iterator MFI = MBB->getIterator();
- MachineInstr *NewMI = TII->convertToThreeAddress(MFI, *mi, LV);
- assert(MBB->getIterator() == MFI &&
- "convertToThreeAddress changed iterator reference");
+ Register RegA, Register RegB, unsigned &Dist) {
+ MachineInstrSpan MIS(mi, MBB);
+ MachineInstr *NewMI = TII->convertToThreeAddress(*mi, LV, LIS);
if (!NewMI)
return false;
LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
- if (LIS)
- LIS->ReplaceMachineInstrInMaps(*mi, *NewMI);
-
// If the old instruction is debug value tracked, an update is required.
if (auto OldInstrNum = mi->peekDebugInstrNum()) {
// Sanity check.
@@ -624,7 +703,9 @@ bool TwoAddressInstructionPass::convertInstTo3Addr(
MBB->erase(mi); // Nuke the old inst.
- DistanceMap.insert(std::make_pair(NewMI, Dist));
+ for (MachineInstr &MI : MIS)
+ DistanceMap.insert(std::make_pair(&MI, Dist++));
+ Dist--;
mi = NewMI;
nmi = std::next(mi);
@@ -656,9 +737,7 @@ void TwoAddressInstructionPass::scanUses(Register DstReg) {
VirtRegPairs.push_back(NewReg);
break;
}
- bool isNew = SrcRegMap.insert(std::make_pair(NewReg, Reg)).second;
- if (!isNew)
- assert(SrcRegMap[NewReg] == Reg && "Can't map to two src registers!");
+ SrcRegMap[NewReg] = Reg;
VirtRegPairs.push_back(NewReg);
Reg = NewReg;
}
@@ -667,8 +746,7 @@ void TwoAddressInstructionPass::scanUses(Register DstReg) {
unsigned ToReg = VirtRegPairs.back();
VirtRegPairs.pop_back();
while (!VirtRegPairs.empty()) {
- unsigned FromReg = VirtRegPairs.back();
- VirtRegPairs.pop_back();
+ unsigned FromReg = VirtRegPairs.pop_back_val();
bool isNew = DstRegMap.insert(std::make_pair(FromReg, ToReg)).second;
if (!isNew)
assert(DstRegMap[FromReg] == ToReg &&"Can't map to two dst registers!");
@@ -857,12 +935,13 @@ bool TwoAddressInstructionPass::rescheduleMIBelowKill(
nmi = End;
MachineBasicBlock::iterator InsertPos = KillPos;
if (LIS) {
- // We have to move the copies first so that the MBB is still well-formed
- // when calling handleMove().
+ // We have to move the copies (and any interleaved debug instructions)
+ // first so that the MBB is still well-formed when calling handleMove().
for (MachineBasicBlock::iterator MBBI = AfterMI; MBBI != End;) {
auto CopyMI = MBBI++;
MBB->splice(InsertPos, MBB, CopyMI);
- LIS->handleMove(*CopyMI);
+ if (!CopyMI->isDebugOrPseudoInstr())
+ LIS->handleMove(*CopyMI);
InsertPos = CopyMI;
}
End = std::next(MachineBasicBlock::iterator(MI));
@@ -1130,7 +1209,7 @@ bool TwoAddressInstructionPass::
tryInstructionTransform(MachineBasicBlock::iterator &mi,
MachineBasicBlock::iterator &nmi,
unsigned SrcIdx, unsigned DstIdx,
- unsigned Dist, bool shouldOnlyCommute) {
+ unsigned &Dist, bool shouldOnlyCommute) {
if (OptLevel == CodeGenOpt::None)
return false;
@@ -1238,6 +1317,8 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi,
// look "normal" to the transformation logic.
MBB->insert(mi, NewMIs[0]);
MBB->insert(mi, NewMIs[1]);
+ DistanceMap.insert(std::make_pair(NewMIs[0], Dist++));
+ DistanceMap.insert(std::make_pair(NewMIs[1], Dist));
LLVM_DEBUG(dbgs() << "2addr: NEW LOAD: " << *NewMIs[0]
<< "2addr: NEW INST: " << *NewMIs[1]);
@@ -1288,9 +1369,12 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi,
if (MO.isReg())
OrigRegs.push_back(MO.getReg());
}
+
+ LIS->RemoveMachineInstrFromMaps(MI);
}
MI.eraseFromParent();
+ DistanceMap.erase(&MI);
// Update LiveIntervals.
if (LIS) {
@@ -1307,6 +1391,9 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi,
LLVM_DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
NewMIs[0]->eraseFromParent();
NewMIs[1]->eraseFromParent();
+ DistanceMap.erase(NewMIs[0]);
+ DistanceMap.erase(NewMIs[1]);
+ Dist--;
}
}
}
@@ -1320,7 +1407,6 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi,
// Return true if any tied operands were found, including the trivial ones.
bool TwoAddressInstructionPass::
collectTiedOperands(MachineInstr *MI, TiedOperandMap &TiedOperands) {
- const MCInstrDesc &MCID = MI->getDesc();
bool AnyOps = false;
unsigned NumOps = MI->getNumOperands();
@@ -1342,10 +1428,10 @@ collectTiedOperands(MachineInstr *MI, TiedOperandMap &TiedOperands) {
// Deal with undef uses immediately - simply rewrite the src operand.
if (SrcMO.isUndef() && !DstMO.getSubReg()) {
// Constrain the DstReg register class if required.
- if (DstReg.isVirtual())
- if (const TargetRegisterClass *RC = TII->getRegClass(MCID, SrcIdx,
- TRI, *MF))
- MRI->constrainRegClass(DstReg, RC);
+ if (DstReg.isVirtual()) {
+ const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
+ MRI->constrainRegClass(DstReg, RC);
+ }
SrcMO.setReg(DstReg);
SrcMO.setSubReg(0);
LLVM_DEBUG(dbgs() << "\t\trewrite undef:\t" << *MI);
@@ -1434,12 +1520,24 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
if (LIS) {
LastCopyIdx = LIS->InsertMachineInstrInMaps(*PrevMI).getRegSlot();
+ SlotIndex endIdx =
+ LIS->getInstructionIndex(*MI).getRegSlot(IsEarlyClobber);
if (RegA.isVirtual()) {
LiveInterval &LI = LIS->getInterval(RegA);
VNInfo *VNI = LI.getNextValue(LastCopyIdx, LIS->getVNInfoAllocator());
- SlotIndex endIdx =
- LIS->getInstructionIndex(*MI).getRegSlot(IsEarlyClobber);
- LI.addSegment(LiveInterval::Segment(LastCopyIdx, endIdx, VNI));
+ LI.addSegment(LiveRange::Segment(LastCopyIdx, endIdx, VNI));
+ for (auto &S : LI.subranges()) {
+ VNI = S.getNextValue(LastCopyIdx, LIS->getVNInfoAllocator());
+ S.addSegment(LiveRange::Segment(LastCopyIdx, endIdx, VNI));
+ }
+ } else {
+ for (MCRegUnitIterator Unit(RegA, TRI); Unit.isValid(); ++Unit) {
+ if (LiveRange *LR = LIS->getCachedRegUnit(*Unit)) {
+ VNInfo *VNI =
+ LR->getNextValue(LastCopyIdx, LIS->getVNInfoAllocator());
+ LR->addSegment(LiveRange::Segment(LastCopyIdx, endIdx, VNI));
+ }
+ }
}
}
@@ -1461,49 +1559,58 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
// by SubRegB is compatible with RegA with no subregister. So regardless of
// whether the dest oper writes a subreg, the source oper should not.
MO.setSubReg(0);
-
- // Propagate SrcRegMap.
- SrcRegMap[RegA] = RegB;
}
if (AllUsesCopied) {
- bool ReplacedAllUntiedUses = true;
- if (!IsEarlyClobber) {
- // Replace other (un-tied) uses of regB with LastCopiedReg.
- for (MachineOperand &MO : MI->operands()) {
- if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) {
- if (MO.getSubReg() == SubRegB) {
- if (MO.isKill()) {
- MO.setIsKill(false);
- RemovedKillFlag = true;
- }
- MO.setReg(LastCopiedReg);
- MO.setSubReg(0);
- } else {
- ReplacedAllUntiedUses = false;
+ LaneBitmask RemainingUses = LaneBitmask::getNone();
+ // Replace other (un-tied) uses of regB with LastCopiedReg.
+ for (MachineOperand &MO : MI->operands()) {
+ if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) {
+ if (MO.getSubReg() == SubRegB && !IsEarlyClobber) {
+ if (MO.isKill()) {
+ MO.setIsKill(false);
+ RemovedKillFlag = true;
}
+ MO.setReg(LastCopiedReg);
+ MO.setSubReg(0);
+ } else {
+ RemainingUses |= TRI->getSubRegIndexLaneMask(MO.getSubReg());
}
}
}
// Update live variables for regB.
- if (RemovedKillFlag && ReplacedAllUntiedUses &&
- LV && LV->getVarInfo(RegB).removeKill(*MI)) {
+ if (RemovedKillFlag && RemainingUses.none() && LV &&
+ LV->getVarInfo(RegB).removeKill(*MI)) {
MachineBasicBlock::iterator PrevMI = MI;
--PrevMI;
LV->addVirtualRegisterKilled(RegB, *PrevMI);
}
+ if (RemovedKillFlag && RemainingUses.none())
+ SrcRegMap[LastCopiedReg] = RegB;
+
// Update LiveIntervals.
if (LIS) {
- LiveInterval &LI = LIS->getInterval(RegB);
- SlotIndex MIIdx = LIS->getInstructionIndex(*MI);
- LiveInterval::const_iterator I = LI.find(MIIdx);
- assert(I != LI.end() && "RegB must be live-in to use.");
+ SlotIndex UseIdx = LIS->getInstructionIndex(*MI);
+ auto Shrink = [=](LiveRange &LR, LaneBitmask LaneMask) {
+ LiveRange::Segment *S = LR.getSegmentContaining(LastCopyIdx);
+ if (!S)
+ return true;
+ if ((LaneMask & RemainingUses).any())
+ return false;
+ if (S->end.getBaseIndex() != UseIdx)
+ return false;
+ S->end = LastCopyIdx;
+ return true;
+ };
- SlotIndex UseIdx = MIIdx.getRegSlot(IsEarlyClobber);
- if (I->end == UseIdx)
- LI.removeSegment(LastCopyIdx, UseIdx);
+ LiveInterval &LI = LIS->getInterval(RegB);
+ bool ShrinkLI = true;
+ for (auto &S : LI.subranges())
+ ShrinkLI &= Shrink(S, S.LaneMask);
+ if (ShrinkLI)
+ Shrink(LI, LaneBitmask::getAll());
}
} else if (RemovedKillFlag) {
// Some tied uses of regB matched their destination registers, so
@@ -1580,6 +1687,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
// First scan through all the tied register uses in this instruction
// and record a list of pairs of tied operands for each register.
if (!collectTiedOperands(&*mi, TiedOperands)) {
+ removeClobberedSrcRegMap(&*mi);
mi = nmi;
continue;
}
@@ -1604,6 +1712,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
// The tied operands have been eliminated or shifted further down
// the block to ease elimination. Continue processing with 'nmi'.
TiedOperands.clear();
+ removeClobberedSrcRegMap(&*mi);
mi = nmi;
continue;
}
@@ -1628,18 +1737,44 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
mi->RemoveOperand(1);
mi->setDesc(TII->get(TargetOpcode::COPY));
LLVM_DEBUG(dbgs() << "\t\tconvert to:\t" << *mi);
+
+ // Update LiveIntervals.
+ if (LIS) {
+ Register Reg = mi->getOperand(0).getReg();
+ LiveInterval &LI = LIS->getInterval(Reg);
+ if (LI.hasSubRanges()) {
+ // The COPY no longer defines subregs of %reg except for
+ // %reg.subidx.
+ LaneBitmask LaneMask =
+ TRI->getSubRegIndexLaneMask(mi->getOperand(0).getSubReg());
+ SlotIndex Idx = LIS->getInstructionIndex(*mi);
+ for (auto &S : LI.subranges()) {
+ if ((S.LaneMask & LaneMask).none()) {
+ LiveRange::iterator UseSeg = S.FindSegmentContaining(Idx);
+ LiveRange::iterator DefSeg = std::next(UseSeg);
+ S.MergeValueNumberInto(DefSeg->valno, UseSeg->valno);
+ }
+ }
+
+ // The COPY no longer has a use of %reg.
+ LIS->shrinkToUses(&LI);
+ } else {
+ // The live interval for Reg did not have subranges but now it needs
+ // them because we have introduced a subreg def. Recompute it.
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+ }
}
// Clear TiedOperands here instead of at the top of the loop
// since most instructions do not have tied operands.
TiedOperands.clear();
+ removeClobberedSrcRegMap(&*mi);
mi = nmi;
}
}
- if (LIS)
- MF->verify(this, "After two-address instruction pass");
-
return MadeChange;
}
@@ -1722,6 +1857,9 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
for (int j = MI.getNumOperands() - 1, ee = 0; j > ee; --j)
MI.RemoveOperand(j);
} else {
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(MI);
+
LLVM_DEBUG(dbgs() << "Eliminated: " << MI);
MI.eraseFromParent();
}
diff --git a/llvm/lib/CodeGen/TypePromotion.cpp b/llvm/lib/CodeGen/TypePromotion.cpp
index 2ce6ea1d4212..d042deefd746 100644
--- a/llvm/lib/CodeGen/TypePromotion.cpp
+++ b/llvm/lib/CodeGen/TypePromotion.cpp
@@ -108,7 +108,7 @@ class IRPromoter {
SetVector<Value*> &Visited;
SetVector<Value*> &Sources;
SetVector<Instruction*> &Sinks;
- SmallVectorImpl<Instruction*> &SafeWrap;
+ SmallPtrSetImpl<Instruction *> &SafeWrap;
IntegerType *ExtTy = nullptr;
SmallPtrSet<Value*, 8> NewInsts;
SmallPtrSet<Instruction*, 4> InstsToRemove;
@@ -116,7 +116,6 @@ class IRPromoter {
SmallPtrSet<Value*, 8> Promoted;
void ReplaceAllUsersOfWith(Value *From, Value *To);
- void PrepareWrappingAdds(void);
void ExtendSources(void);
void ConvertTruncs(void);
void PromoteTree(void);
@@ -125,11 +124,11 @@ class IRPromoter {
public:
IRPromoter(LLVMContext &C, IntegerType *Ty, unsigned Width,
- SetVector<Value*> &visited, SetVector<Value*> &sources,
- SetVector<Instruction*> &sinks,
- SmallVectorImpl<Instruction*> &wrap) :
- Ctx(C), OrigTy(Ty), PromotedWidth(Width), Visited(visited),
- Sources(sources), Sinks(sinks), SafeWrap(wrap) {
+ SetVector<Value *> &visited, SetVector<Value *> &sources,
+ SetVector<Instruction *> &sinks,
+ SmallPtrSetImpl<Instruction *> &wrap)
+ : Ctx(C), OrigTy(Ty), PromotedWidth(Width), Visited(visited),
+ Sources(sources), Sinks(sinks), SafeWrap(wrap) {
ExtTy = IntegerType::get(Ctx, PromotedWidth);
assert(OrigTy->getPrimitiveSizeInBits().getFixedSize() <
ExtTy->getPrimitiveSizeInBits().getFixedSize() &&
@@ -145,7 +144,7 @@ class TypePromotion : public FunctionPass {
unsigned RegisterBitWidth = 0;
SmallPtrSet<Value*, 16> AllVisited;
SmallPtrSet<Instruction*, 8> SafeToPromote;
- SmallVector<Instruction*, 4> SafeWrap;
+ SmallPtrSet<Instruction *, 4> SafeWrap;
// Does V have the same size result type as TypeSize.
bool EqualTypeSize(Value *V);
@@ -183,6 +182,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
}
StringRef getPassName() const override { return PASS_NAME; }
@@ -192,11 +192,8 @@ public:
}
-static bool GenerateSignBits(Value *V) {
- if (!isa<Instruction>(V))
- return false;
-
- unsigned Opc = cast<Instruction>(V)->getOpcode();
+static bool GenerateSignBits(Instruction *I) {
+ unsigned Opc = I->getOpcode();
return Opc == Instruction::AShr || Opc == Instruction::SDiv ||
Opc == Instruction::SRem || Opc == Instruction::SExt;
}
@@ -283,7 +280,7 @@ bool TypePromotion::isSafeWrap(Instruction *I) {
// wrap with respect to itself in the original bitwidth. If it doesn't wrap but
// just underflows the range, the icmp would give the same result whether the
// result has been truncated or not. We calculate this by:
- // - Zero extending both constants, if needed, to 32-bits.
+ // - Zero extending both constants, if needed, to RegisterBitWidth.
// - Take the absolute value of I's constant, adding this to the icmp const.
// - Check that this value is not out of range for small type. If it is, it
// means that it has underflowed enough to wrap around the icmp constant.
@@ -335,53 +332,46 @@ bool TypePromotion::isSafeWrap(Instruction *I) {
if (Opc != Instruction::Add && Opc != Instruction::Sub)
return false;
- if (!I->hasOneUse() ||
- !isa<ICmpInst>(*I->user_begin()) ||
+ if (!I->hasOneUse() || !isa<ICmpInst>(*I->user_begin()) ||
!isa<ConstantInt>(I->getOperand(1)))
return false;
- ConstantInt *OverflowConst = cast<ConstantInt>(I->getOperand(1));
- bool NegImm = OverflowConst->isNegative();
- bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) ||
- ((Opc == Instruction::Add) && NegImm);
- if (!IsDecreasing)
- return false;
-
// Don't support an icmp that deals with sign bits.
auto *CI = cast<ICmpInst>(*I->user_begin());
if (CI->isSigned() || CI->isEquality())
return false;
- ConstantInt *ICmpConst = nullptr;
+ ConstantInt *ICmpConstant = nullptr;
if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(0)))
- ICmpConst = Const;
+ ICmpConstant = Const;
else if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(1)))
- ICmpConst = Const;
+ ICmpConstant = Const;
else
return false;
- // Now check that the result can't wrap on itself.
- APInt Total = ICmpConst->getValue().getBitWidth() < 32 ?
- ICmpConst->getValue().zext(32) : ICmpConst->getValue();
-
- Total += OverflowConst->getValue().getBitWidth() < 32 ?
- OverflowConst->getValue().abs().zext(32) : OverflowConst->getValue().abs();
-
- APInt Max = APInt::getAllOnesValue(TypePromotion::TypeSize);
-
- if (Total.getBitWidth() > Max.getBitWidth()) {
- if (Total.ugt(Max.zext(Total.getBitWidth())))
- return false;
- } else if (Max.getBitWidth() > Total.getBitWidth()) {
- if (Total.zext(Max.getBitWidth()).ugt(Max))
- return false;
- } else if (Total.ugt(Max))
+ const APInt &ICmpConst = ICmpConstant->getValue();
+ APInt OverflowConst = cast<ConstantInt>(I->getOperand(1))->getValue();
+ if (Opc == Instruction::Sub)
+ OverflowConst = -OverflowConst;
+ if (!OverflowConst.isNonPositive())
return false;
- LLVM_DEBUG(dbgs() << "IR Promotion: Allowing safe overflow for "
- << *I << "\n");
- SafeWrap.push_back(I);
- return true;
+ // Using C1 = OverflowConst and C2 = ICmpConst, we can either prove that:
+ // zext(x) + sext(C1) <u zext(C2) if C1 < 0 and C1 >s C2
+ // zext(x) + sext(C1) <u sext(C2) if C1 < 0 and C1 <=s C2
+ if (OverflowConst.sgt(ICmpConst)) {
+ LLVM_DEBUG(dbgs() << "IR Promotion: Allowing safe overflow for sext "
+ << "const of " << *I << "\n");
+ SafeWrap.insert(I);
+ return true;
+ } else {
+ LLVM_DEBUG(dbgs() << "IR Promotion: Allowing safe overflow for sext "
+ << "const of " << *I << " and " << *CI << "\n");
+ SafeWrap.insert(I);
+ SafeWrap.insert(CI);
+ return true;
+ }
+ return false;
}
bool TypePromotion::shouldPromote(Value *V) {
@@ -403,17 +393,14 @@ bool TypePromotion::shouldPromote(Value *V) {
/// Return whether we can safely mutate V's type to ExtTy without having to be
/// concerned with zero extending or truncation.
-static bool isPromotedResultSafe(Value *V) {
- if (GenerateSignBits(V))
+static bool isPromotedResultSafe(Instruction *I) {
+ if (GenerateSignBits(I))
return false;
- if (!isa<Instruction>(V))
+ if (!isa<OverflowingBinaryOperator>(I))
return true;
- if (!isa<OverflowingBinaryOperator>(V))
- return true;
-
- return cast<Instruction>(V)->hasNoUnsignedWrap();
+ return I->hasNoUnsignedWrap();
}
void IRPromoter::ReplaceAllUsersOfWith(Value *From, Value *To) {
@@ -422,7 +409,7 @@ void IRPromoter::ReplaceAllUsersOfWith(Value *From, Value *To) {
bool ReplacedAll = true;
LLVM_DEBUG(dbgs() << "IR Promotion: Replacing " << *From << " with " << *To
- << "\n");
+ << "\n");
for (Use &U : From->uses()) {
auto *User = cast<Instruction>(U.getUser());
@@ -441,39 +428,6 @@ void IRPromoter::ReplaceAllUsersOfWith(Value *From, Value *To) {
InstsToRemove.insert(I);
}
-void IRPromoter::PrepareWrappingAdds() {
- LLVM_DEBUG(dbgs() << "IR Promotion: Prepare wrapping adds.\n");
- IRBuilder<> Builder{Ctx};
-
- // For adds that safely wrap and use a negative immediate as operand 1, we
- // create an equivalent instruction using a positive immediate.
- // That positive immediate can then be zext along with all the other
- // immediates later.
- for (auto *I : SafeWrap) {
- if (I->getOpcode() != Instruction::Add)
- continue;
-
- LLVM_DEBUG(dbgs() << "IR Promotion: Adjusting " << *I << "\n");
- assert((isa<ConstantInt>(I->getOperand(1)) &&
- cast<ConstantInt>(I->getOperand(1))->isNegative()) &&
- "Wrapping should have a negative immediate as the second operand");
-
- auto Const = cast<ConstantInt>(I->getOperand(1));
- auto *NewConst = ConstantInt::get(Ctx, Const->getValue().abs());
- Builder.SetInsertPoint(I);
- Value *NewVal = Builder.CreateSub(I->getOperand(0), NewConst);
- if (auto *NewInst = dyn_cast<Instruction>(NewVal)) {
- NewInst->copyIRFlags(I);
- NewInsts.insert(NewInst);
- }
- InstsToRemove.insert(I);
- I->replaceAllUsesWith(NewVal);
- LLVM_DEBUG(dbgs() << "IR Promotion: New equivalent: " << *NewVal << "\n");
- }
- for (auto *I : NewInsts)
- Visited.insert(I);
-}
-
void IRPromoter::ExtendSources() {
IRBuilder<> Builder{Ctx};
@@ -515,8 +469,6 @@ void IRPromoter::ExtendSources() {
void IRPromoter::PromoteTree() {
LLVM_DEBUG(dbgs() << "IR Promotion: Mutating the tree..\n");
- IRBuilder<> Builder{Ctx};
-
// Mutate the types of the instructions within the tree. Here we handle
// constant operands.
for (auto *V : Visited) {
@@ -533,14 +485,16 @@ void IRPromoter::PromoteTree() {
continue;
if (auto *Const = dyn_cast<ConstantInt>(Op)) {
- Constant *NewConst = ConstantExpr::getZExt(Const, ExtTy);
+ Constant *NewConst = SafeWrap.contains(I)
+ ? ConstantExpr::getSExt(Const, ExtTy)
+ : ConstantExpr::getZExt(Const, ExtTy);
I->setOperand(i, NewConst);
} else if (isa<UndefValue>(Op))
I->setOperand(i, UndefValue::get(ExtTy));
}
- // Mutate the result type, unless this is an icmp.
- if (!isa<ICmpInst>(I)) {
+ // Mutate the result type, unless this is an icmp or switch.
+ if (!isa<ICmpInst>(I) && !isa<SwitchInst>(I)) {
I->mutateType(ExtTy);
Promoted.insert(I);
}
@@ -575,7 +529,7 @@ void IRPromoter::TruncateSinks() {
// Handle calls separately as we need to iterate over arg operands.
if (auto *Call = dyn_cast<CallInst>(I)) {
- for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) {
+ for (unsigned i = 0; i < Call->arg_size(); ++i) {
Value *Arg = Call->getArgOperand(i);
Type *Ty = TruncTysMap[Call][i];
if (Instruction *Trunc = InsertTrunc(Arg, Ty)) {
@@ -678,10 +632,8 @@ void IRPromoter::Mutate() {
// Cache original types of the values that will likely need truncating
for (auto *I : Sinks) {
if (auto *Call = dyn_cast<CallInst>(I)) {
- for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) {
- Value *Arg = Call->getArgOperand(i);
+ for (Value *Arg : Call->args())
TruncTysMap[Call].push_back(Arg->getType());
- }
} else if (auto *Switch = dyn_cast<SwitchInst>(I))
TruncTysMap[I].push_back(Switch->getCondition()->getType());
else {
@@ -696,10 +648,6 @@ void IRPromoter::Mutate() {
TruncTysMap[Trunc].push_back(Trunc->getDestTy());
}
- // Convert adds using negative immediates to equivalent instructions that use
- // positive constants.
- PrepareWrappingAdds();
-
// Insert zext instructions between sources and their users.
ExtendSources();
@@ -798,7 +746,7 @@ bool TypePromotion::isLegalToPromote(Value *V) {
if (SafeToPromote.count(I))
return true;
- if (isPromotedResultSafe(V) || isSafeWrap(I)) {
+ if (isPromotedResultSafe(I) || isSafeWrap(I)) {
SafeToPromote.insert(I);
return true;
}
@@ -815,7 +763,7 @@ bool TypePromotion::TryToPromote(Value *V, unsigned PromotedWidth) {
return false;
LLVM_DEBUG(dbgs() << "IR Promotion: TryToPromote: " << *V << ", from "
- << TypeSize << " bits to " << PromotedWidth << "\n");
+ << TypeSize << " bits to " << PromotedWidth << "\n");
SetVector<Value*> WorkList;
SetVector<Value*> Sources;
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
index 9daebfd9e63d..4876b9e23717 100644
--- a/llvm/lib/CodeGen/ValueTypes.cpp
+++ b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -167,6 +167,7 @@ std::string EVT::getEVTString() const {
case MVT::Glue: return "glue";
case MVT::x86mmx: return "x86mmx";
case MVT::x86amx: return "x86amx";
+ case MVT::i64x8: return "i64x8";
case MVT::Metadata: return "Metadata";
case MVT::Untyped: return "Untyped";
case MVT::funcref: return "funcref";
@@ -198,6 +199,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
case MVT::ppcf128: return Type::getPPC_FP128Ty(Context);
case MVT::x86mmx: return Type::getX86_MMXTy(Context);
case MVT::x86amx: return Type::getX86_AMXTy(Context);
+ case MVT::i64x8: return IntegerType::get(Context, 512);
case MVT::externref:
return PointerType::get(StructType::create(Context), 10);
case MVT::funcref:
diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp
index 0f164e2637a2..069aca742da0 100644
--- a/llvm/lib/CodeGen/VirtRegMap.cpp
+++ b/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -541,15 +541,8 @@ void VirtRegRewriter::rewrite() {
for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
MBBI != MBBE; ++MBBI) {
LLVM_DEBUG(MBBI->print(dbgs(), Indexes));
- for (MachineBasicBlock::instr_iterator
- MII = MBBI->instr_begin(), MIE = MBBI->instr_end(); MII != MIE;) {
- MachineInstr *MI = &*MII;
- ++MII;
-
- for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
- MOE = MI->operands_end(); MOI != MOE; ++MOI) {
- MachineOperand &MO = *MOI;
-
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBBI->instrs())) {
+ for (MachineOperand &MO : MI.operands()) {
// Make sure MRI knows about registers clobbered by regmasks.
if (MO.isRegMask())
MRI->addPhysRegsUsedFromRegMask(MO.getRegMask());
@@ -574,7 +567,7 @@ void VirtRegRewriter::rewrite() {
// have to add implicit killed operands for the super-register. A
// partial redef always kills and redefines the super-register.
if ((MO.readsReg() && (MO.isDef() || MO.isKill())) ||
- (MO.isDef() && subRegLiveThrough(*MI, PhysReg)))
+ (MO.isDef() && subRegLiveThrough(MI, PhysReg)))
SuperKills.push_back(PhysReg);
if (MO.isDef()) {
@@ -619,20 +612,20 @@ void VirtRegRewriter::rewrite() {
// Add any missing super-register kills after rewriting the whole
// instruction.
while (!SuperKills.empty())
- MI->addRegisterKilled(SuperKills.pop_back_val(), TRI, true);
+ MI.addRegisterKilled(SuperKills.pop_back_val(), TRI, true);
while (!SuperDeads.empty())
- MI->addRegisterDead(SuperDeads.pop_back_val(), TRI, true);
+ MI.addRegisterDead(SuperDeads.pop_back_val(), TRI, true);
while (!SuperDefs.empty())
- MI->addRegisterDefined(SuperDefs.pop_back_val(), TRI);
+ MI.addRegisterDefined(SuperDefs.pop_back_val(), TRI);
- LLVM_DEBUG(dbgs() << "> " << *MI);
+ LLVM_DEBUG(dbgs() << "> " << MI);
- expandCopyBundle(*MI);
+ expandCopyBundle(MI);
// We can remove identity copies right now.
- handleIdentityCopy(*MI);
+ handleIdentityCopy(MI);
}
}
diff --git a/llvm/lib/CodeGen/WasmEHPrepare.cpp b/llvm/lib/CodeGen/WasmEHPrepare.cpp
index c4c84cd921fa..c04a7b28eff9 100644
--- a/llvm/lib/CodeGen/WasmEHPrepare.cpp
+++ b/llvm/lib/CodeGen/WasmEHPrepare.cpp
@@ -29,7 +29,7 @@
// __wasm_lpad_context.lpad_index = index;
// __wasm_lpad_context.lsda = wasm.lsda();
// _Unwind_CallPersonality(exn);
-// selector = __wasm.landingpad_context.selector;
+// selector = __wasm_lpad_context.selector;
// ...
//
//
@@ -329,7 +329,7 @@ void WasmEHPrepare::prepareEHPad(BasicBlock *BB, bool NeedPersonality,
OperandBundleDef("funclet", CPI));
PersCI->setDoesNotThrow();
- // Pseudocode: int selector = __wasm.landingpad_context.selector;
+ // Pseudocode: int selector = __wasm_lpad_context.selector;
Instruction *Selector =
IRB.CreateLoad(IRB.getInt32Ty(), SelectorField, "selector");
diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp
index 4449cd8ef555..a3dec6c25e44 100644
--- a/llvm/lib/DWARFLinker/DWARFLinker.cpp
+++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp
@@ -549,6 +549,7 @@ static void updateChildIncompleteness(const DWARFDie &Die, CompileUnit &CU,
switch (Die.getTag()) {
case dwarf::DW_TAG_structure_type:
case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_union_type:
break;
default:
return;
diff --git a/llvm/lib/DWARFLinker/DWARFStreamer.cpp b/llvm/lib/DWARFLinker/DWARFStreamer.cpp
index 3a9f79e47012..46e7457f2368 100644
--- a/llvm/lib/DWARFLinker/DWARFStreamer.cpp
+++ b/llvm/lib/DWARFLinker/DWARFStreamer.cpp
@@ -21,8 +21,8 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/MCTargetOptionsCommandFlags.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/LEB128.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
namespace llvm {
diff --git a/llvm/lib/DebugInfo/CodeView/ContinuationRecordBuilder.cpp b/llvm/lib/DebugInfo/CodeView/ContinuationRecordBuilder.cpp
index 799cffb7116e..c7b1c65f2f9a 100644
--- a/llvm/lib/DebugInfo/CodeView/ContinuationRecordBuilder.cpp
+++ b/llvm/lib/DebugInfo/CodeView/ContinuationRecordBuilder.cpp
@@ -103,7 +103,7 @@ void ContinuationRecordBuilder::writeMemberType(RecordType &Record) {
if (getCurrentSegmentLength() > MaxSegmentLength) {
// We need to inject some bytes before the member we just wrote but after
// the previous member. Save off the length of the member we just wrote so
- // that we can do some sanity checking on it.
+ // that we can validate it.
uint32_t MemberLength = SegmentWriter.getOffset() - OriginalOffset;
(void) MemberLength;
insertSegmentEnd(OriginalOffset);
diff --git a/llvm/lib/DebugInfo/CodeView/SimpleTypeSerializer.cpp b/llvm/lib/DebugInfo/CodeView/SimpleTypeSerializer.cpp
index ac3b30175956..d963e34628db 100644
--- a/llvm/lib/DebugInfo/CodeView/SimpleTypeSerializer.cpp
+++ b/llvm/lib/DebugInfo/CodeView/SimpleTypeSerializer.cpp
@@ -53,7 +53,7 @@ ArrayRef<uint8_t> SimpleTypeSerializer::serialize(T &Record) {
Prefix->RecordKind = CVT.kind();
Prefix->RecordLen = Writer.getOffset() - sizeof(uint16_t);
- return {ScratchBuffer.data(), Writer.getOffset()};
+ return {ScratchBuffer.data(), static_cast<size_t>(Writer.getOffset())};
}
// Explicitly instantiate the member function for each known type so that we can
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
index ee1ff5460b9b..1be5a752453a 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
@@ -147,41 +147,57 @@ DWARFAbbreviationDeclaration::findAttributeIndex(dwarf::Attribute Attr) const {
return None;
}
-Optional<DWARFFormValue> DWARFAbbreviationDeclaration::getAttributeValue(
- const uint64_t DIEOffset, const dwarf::Attribute Attr,
- const DWARFUnit &U) const {
- // Check if this abbreviation has this attribute without needing to skip
- // any data so we can return quickly if it doesn't.
- Optional<uint32_t> MatchAttrIndex = findAttributeIndex(Attr);
- if (!MatchAttrIndex)
- return None;
-
- auto DebugInfoData = U.getDebugInfoExtractor();
+uint64_t DWARFAbbreviationDeclaration::getAttributeOffsetFromIndex(
+ uint32_t AttrIndex, uint64_t DIEOffset, const DWARFUnit &U) const {
+ DWARFDataExtractor DebugInfoData = U.getDebugInfoExtractor();
// Add the byte size of the ULEB for the abbrev Code so we can start
// skipping the attribute data.
uint64_t Offset = DIEOffset + CodeByteSize;
- for (uint32_t CurAttrIdx = 0; CurAttrIdx != *MatchAttrIndex; ++CurAttrIdx)
+ for (uint32_t CurAttrIdx = 0; CurAttrIdx != AttrIndex; ++CurAttrIdx)
// Match Offset along until we get to the attribute we want.
if (auto FixedSize = AttributeSpecs[CurAttrIdx].getByteSize(U))
Offset += *FixedSize;
else
DWARFFormValue::skipValue(AttributeSpecs[CurAttrIdx].Form, DebugInfoData,
&Offset, U.getFormParams());
+ return Offset;
+}
+
+Optional<DWARFFormValue>
+DWARFAbbreviationDeclaration::getAttributeValueFromOffset(
+ uint32_t AttrIndex, uint64_t Offset, const DWARFUnit &U) const {
+ assert(AttributeSpecs.size() > AttrIndex &&
+ "Attribute Index is out of bounds.");
// We have arrived at the attribute to extract; extract it from Offset.
- const AttributeSpec &Spec = AttributeSpecs[*MatchAttrIndex];
+ const AttributeSpec &Spec = AttributeSpecs[AttrIndex];
if (Spec.isImplicitConst())
return DWARFFormValue::createFromSValue(Spec.Form,
Spec.getImplicitConstValue());
DWARFFormValue FormValue(Spec.Form);
+ DWARFDataExtractor DebugInfoData = U.getDebugInfoExtractor();
if (FormValue.extractValue(DebugInfoData, &Offset, U.getFormParams(), &U))
return FormValue;
-
return None;
}
+Optional<DWARFFormValue>
+DWARFAbbreviationDeclaration::getAttributeValue(const uint64_t DIEOffset,
+ const dwarf::Attribute Attr,
+ const DWARFUnit &U) const {
+ // Check if this abbreviation has this attribute without needing to skip
+ // any data so we can return quickly if it doesn't.
+ Optional<uint32_t> MatchAttrIndex = findAttributeIndex(Attr);
+ if (!MatchAttrIndex)
+ return None;
+
+ uint64_t Offset = getAttributeOffsetFromIndex(*MatchAttrIndex, DIEOffset, U);
+
+ return getAttributeValueFromOffset(*MatchAttrIndex, Offset, U);
+}
+
size_t DWARFAbbreviationDeclaration::FixedSizeInfo::getByteSize(
const DWARFUnit &U) const {
size_t ByteSize = NumBytes;
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
index 28d35b609c24..c77d4d4d989c 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
@@ -767,7 +767,7 @@ LLVM_DUMP_METHOD void DWARFDebugNames::NameIndex::dump(ScopedPrinter &W) const {
}
W.startLine() << "Hash table not present\n";
- for (NameTableEntry NTE : *this)
+ for (const NameTableEntry &NTE : *this)
dumpName(W, NTE, None);
}
@@ -799,7 +799,7 @@ DWARFDebugNames::ValueIterator::findEntryOffsetInCurrentIndex() {
const Header &Hdr = CurrentIndex->Hdr;
if (Hdr.BucketCount == 0) {
// No Hash Table; we need to search through all names in the Name Index.
- for (NameTableEntry NTE : *CurrentIndex) {
+ for (const NameTableEntry &NTE : *CurrentIndex) {
if (NTE.getString() == Key)
return NTE.getEntryOffset();
}
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
index 4e1cafeb2126..c8331487f282 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -33,6 +33,7 @@
#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
#include "llvm/DebugInfo/DWARF/DWARFVerifier.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Object/Decompressor.h"
#include "llvm/Object/MachO.h"
#include "llvm/Object/ObjectFile.h"
@@ -44,7 +45,6 @@
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cstdint>
@@ -693,6 +693,18 @@ void DWARFContext::dump(
getDebugNames().dump(OS);
}
+DWARFTypeUnit *DWARFContext::getTypeUnitForHash(uint16_t Version, uint64_t Hash,
+ bool IsDWO) {
+ // FIXME: Check for/use the tu_index here, if there is one.
+ for (const auto &U : IsDWO ? dwo_units() : normal_units()) {
+ if (DWARFTypeUnit *TU = dyn_cast<DWARFTypeUnit>(U.get())) {
+ if (TU->getTypeHash() == Hash)
+ return TU;
+ }
+ }
+ return nullptr;
+}
+
DWARFCompileUnit *DWARFContext::getDWOCompileUnitForHash(uint64_t Hash) {
parseDWOUnits(LazyParse);
@@ -1411,7 +1423,8 @@ DWARFContext::getDWOContext(StringRef AbsolutePath) {
auto S = std::make_shared<DWOFile>();
S->File = std::move(Obj.get());
- S->Context = DWARFContext::create(*S->File.getBinary());
+ S->Context = DWARFContext::create(*S->File.getBinary(),
+ ProcessDebugRelocations::Ignore);
*Entry = S;
auto *Ctxt = S->Context.get();
return std::shared_ptr<DWARFContext>(std::move(S), Ctxt);
@@ -1652,7 +1665,9 @@ public:
}
}
DWARFObjInMemory(const object::ObjectFile &Obj, const LoadedObjectInfo *L,
- function_ref<void(Error)> HandleError, function_ref<void(Error)> HandleWarning )
+ function_ref<void(Error)> HandleError,
+ function_ref<void(Error)> HandleWarning,
+ DWARFContext::ProcessDebugRelocations RelocAction)
: IsLittleEndian(Obj.isLittleEndian()),
AddressSize(Obj.getBytesInAddress()), FileName(Obj.getFileName()),
Obj(&Obj) {
@@ -1735,7 +1750,12 @@ public:
S.Data = Data;
}
- if (RelocatedSection == Obj.section_end())
+ if (RelocatedSection != Obj.section_end() && Name.contains(".dwo"))
+ HandleWarning(
+ createError("Unexpected relocations for dwo section " + Name));
+
+ if (RelocatedSection == Obj.section_end() ||
+ (RelocAction == DWARFContext::ProcessDebugRelocations::Ignore))
continue;
StringRef RelSecName;
@@ -1772,18 +1792,10 @@ public:
if (RelSecName == "debug_info")
Map = &static_cast<DWARFSectionMap &>(InfoSections[*RelocatedSection])
.Relocs;
- else if (RelSecName == "debug_info.dwo")
- Map = &static_cast<DWARFSectionMap &>(
- InfoDWOSections[*RelocatedSection])
- .Relocs;
else if (RelSecName == "debug_types")
Map =
&static_cast<DWARFSectionMap &>(TypesSections[*RelocatedSection])
.Relocs;
- else if (RelSecName == "debug_types.dwo")
- Map = &static_cast<DWARFSectionMap &>(
- TypesDWOSections[*RelocatedSection])
- .Relocs;
else
continue;
}
@@ -1966,12 +1978,13 @@ public:
} // namespace
std::unique_ptr<DWARFContext>
-DWARFContext::create(const object::ObjectFile &Obj, const LoadedObjectInfo *L,
- std::string DWPName,
+DWARFContext::create(const object::ObjectFile &Obj,
+ ProcessDebugRelocations RelocAction,
+ const LoadedObjectInfo *L, std::string DWPName,
std::function<void(Error)> RecoverableErrorHandler,
std::function<void(Error)> WarningHandler) {
- auto DObj =
- std::make_unique<DWARFObjInMemory>(Obj, L, RecoverableErrorHandler, WarningHandler);
+ auto DObj = std::make_unique<DWARFObjInMemory>(
+ Obj, L, RecoverableErrorHandler, WarningHandler, RelocAction);
return std::make_unique<DWARFContext>(std::move(DObj), std::move(DWPName),
RecoverableErrorHandler,
WarningHandler);
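DWARFContext::create now takes a ProcessDebugRelocations argument, and the DWO context built in getDWOContext passes Ignore so split-DWARF sections are not relocated (the loop above also warns if a .dwo section unexpectedly carries relocations). A usage sketch, assuming the declaration in DWARFContext.h defaults the trailing parameters (that header is not part of this hunk):
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/Object/ObjectFile.h"
#include <memory>
using namespace llvm;
// Build a context that skips relocation processing, mirroring the DWO path.
static std::unique_ptr<DWARFContext>
makeUnrelocatedContext(const object::ObjectFile &Obj) {
  return DWARFContext::create(Obj,
                              DWARFContext::ProcessDebugRelocations::Ignore);
}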
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp
index dcf2aefeb39f..5b1c62e6a259 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp
@@ -8,7 +8,7 @@
#include "llvm/DebugInfo/DWARF/DWARFDebugAddr.h"
#include "llvm/BinaryFormat/Dwarf.h"
-#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
using namespace llvm;
@@ -18,12 +18,10 @@ Error DWARFDebugAddrTable::extractAddresses(const DWARFDataExtractor &Data,
assert(EndOffset >= *OffsetPtr);
uint64_t DataSize = EndOffset - *OffsetPtr;
assert(Data.isValidOffsetForDataOfSize(*OffsetPtr, DataSize));
- if (AddrSize != 4 && AddrSize != 8)
- return createStringError(errc::not_supported,
- "address table at offset 0x%" PRIx64
- " has unsupported address size %" PRIu8
- " (4 and 8 are supported)",
- Offset, AddrSize);
+ if (Error SizeErr = DWARFContext::checkAddressSizeSupported(
+ AddrSize, errc::not_supported, "address table at offset 0x%" PRIx64,
+ Offset))
+ return SizeErr;
if (DataSize % AddrSize != 0) {
invalidateLength();
return createStringError(errc::invalid_argument,
@@ -148,8 +146,20 @@ void DWARFDebugAddrTable::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
}
if (Addrs.size() > 0) {
- const char *AddrFmt =
- (AddrSize == 4) ? "0x%8.8" PRIx64 "\n" : "0x%16.16" PRIx64 "\n";
+ const char *AddrFmt;
+ switch (AddrSize) {
+ case 2:
+ AddrFmt = "0x%4.4" PRIx64 "\n";
+ break;
+ case 4:
+ AddrFmt = "0x%8.8" PRIx64 "\n";
+ break;
+ case 8:
+ AddrFmt = "0x%16.16" PRIx64 "\n";
+ break;
+ default:
+ llvm_unreachable("unsupported address size");
+ }
OS << "Addrs: [\n";
for (uint64_t Addr : Addrs)
OS << format(AddrFmt, Addr);
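This hunk and several below replace open-coded "4 or 8" checks with DWARFContext::checkAddressSizeSupported. Its definition is not shown in this diff; a plausible shape, offered only as a sketch of what the call sites rely on:
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include <string>
#include <system_error>
using namespace llvm;
// Turn an unsupported address size into a formatted Error; the prefix message
// (table kind and offset) is supplied by the caller via Fmt/Vals.
template <typename... Ts>
static Error checkAddressSizeSupported(unsigned AddressSize, std::error_code EC,
                                       char const *Fmt, const Ts &...Vals) {
  if (DWARFContext::isAddressSizeSupported(AddressSize))
    return Error::success();
  std::string Buffer;
  raw_string_ostream Stream(Buffer);
  Stream << format(Fmt, Vals...) << " has unsupported address size "
         << AddressSize;
  return make_error<StringError>(Stream.str(), EC);
}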
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
index 598e3ecee30e..c60c9d9d7227 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
@@ -8,6 +8,7 @@
#include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Format.h"
@@ -87,12 +88,10 @@ Error DWARFDebugArangeSet::extract(DWARFDataExtractor data,
"the length of address range table at offset "
"0x%" PRIx64 " exceeds section size",
Offset);
- if (HeaderData.AddrSize != 4 && HeaderData.AddrSize != 8)
- return createStringError(errc::invalid_argument,
- "address range table at offset 0x%" PRIx64
- " has unsupported address size: %d "
- "(4 and 8 supported)",
- Offset, HeaderData.AddrSize);
+ if (Error SizeErr = DWARFContext::checkAddressSizeSupported(
+ HeaderData.AddrSize, errc::invalid_argument,
+ "address range table at offset 0x%" PRIx64, Offset))
+ return SizeErr;
if (HeaderData.SegSize != 0)
return createStringError(errc::not_supported,
"non-zero segment selector size in address range "
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
index 7ebb0092c34a..385bde51e2e7 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
@@ -19,18 +19,11 @@
using namespace llvm;
using namespace dwarf;
-bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U,
- uint64_t *OffsetPtr) {
- DWARFDataExtractor DebugInfoData = U.getDebugInfoExtractor();
- const uint64_t UEndOffset = U.getNextUnitOffset();
- return extractFast(U, OffsetPtr, DebugInfoData, UEndOffset, 0);
-}
-
bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U, uint64_t *OffsetPtr,
const DWARFDataExtractor &DebugInfoData,
- uint64_t UEndOffset, uint32_t D) {
+ uint64_t UEndOffset, uint32_t ParentIdx) {
Offset = *OffsetPtr;
- Depth = D;
+ this->ParentIdx = ParentIdx;
if (Offset >= UEndOffset) {
U.getContext().getWarningHandler()(
createStringError(errc::invalid_argument,
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
index dc7da5d9348f..cad3dcab8a7e 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
@@ -16,6 +16,12 @@
using namespace llvm;
+bool DWARFDebugRangeList::RangeListEntry::isBaseAddressSelectionEntry(
+ uint8_t AddressSize) const {
+ assert(DWARFContext::isAddressSizeSupported(AddressSize));
+ return StartAddress == dwarf::computeTombstoneAddress(AddressSize);
+}
+
void DWARFDebugRangeList::clear() {
Offset = -1ULL;
AddressSize = 0;
@@ -30,9 +36,10 @@ Error DWARFDebugRangeList::extract(const DWARFDataExtractor &data,
"invalid range list offset 0x%" PRIx64, *offset_ptr);
AddressSize = data.getAddressSize();
- if (AddressSize != 4 && AddressSize != 8)
- return createStringError(errc::invalid_argument,
- "invalid address size: %" PRIu8, AddressSize);
+ if (Error SizeErr = DWARFContext::checkAddressSizeSupported(
+ AddressSize, errc::invalid_argument,
+ "range list at offset 0x%" PRIx64, *offset_ptr))
+ return SizeErr;
Offset = *offset_ptr;
while (true) {
RangeListEntry Entry;
@@ -58,12 +65,22 @@ Error DWARFDebugRangeList::extract(const DWARFDataExtractor &data,
}
void DWARFDebugRangeList::dump(raw_ostream &OS) const {
- for (const RangeListEntry &RLE : Entries) {
- const char *format_str =
- (AddressSize == 4 ? "%08" PRIx64 " %08" PRIx64 " %08" PRIx64 "\n"
- : "%08" PRIx64 " %016" PRIx64 " %016" PRIx64 "\n");
- OS << format(format_str, Offset, RLE.StartAddress, RLE.EndAddress);
+ const char *AddrFmt;
+ switch (AddressSize) {
+ case 2:
+ AddrFmt = "%08" PRIx64 " %04" PRIx64 " %04" PRIx64 "\n";
+ break;
+ case 4:
+ AddrFmt = "%08" PRIx64 " %08" PRIx64 " %08" PRIx64 "\n";
+ break;
+ case 8:
+ AddrFmt = "%08" PRIx64 " %016" PRIx64 " %016" PRIx64 "\n";
+ break;
+ default:
+ llvm_unreachable("unsupported address size");
}
+ for (const RangeListEntry &RLE : Entries)
+ OS << format(AddrFmt, Offset, RLE.StartAddress, RLE.EndAddress);
OS << format("%08" PRIx64 " <End of list>\n", Offset);
}
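The new isBaseAddressSelectionEntry above compares an entry's start address against dwarf::computeTombstoneAddress, i.e. the largest representable address for the unit's address size, which is how .debug_ranges encodes a base address selection entry. A small sketch of that value, under the assumption that computeTombstoneAddress returns the all-ones address:
#include <cstdint>
// All-ones address for an N-byte address size, e.g. 0xffffffff when N == 4.
static uint64_t allOnesAddress(uint8_t AddressByteSize) {
  return AddressByteSize >= 8 ? UINT64_MAX
                              : (UINT64_C(1) << (AddressByteSize * 8)) - 1;
}
// An entry whose start address equals allOnesAddress(AddrSize) is a base
// address selection entry; its end address carries the new base address.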
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
index 0501e3ee3f9b..ed50f2635738 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -108,17 +108,41 @@ static void dumpLocationExpr(raw_ostream &OS, const DWARFFormValue &FormValue,
return;
}
-/// Dump the name encoded in the type tag.
-static void dumpTypeTagName(raw_ostream &OS, dwarf::Tag T) {
- StringRef TagStr = TagString(T);
- if (!TagStr.startswith("DW_TAG_") || !TagStr.endswith("_type"))
- return;
- OS << TagStr.substr(7, TagStr.size() - 12) << " ";
+static DWARFDie resolveReferencedType(DWARFDie D,
+ dwarf::Attribute Attr = DW_AT_type) {
+ return D.getAttributeValueAsReferencedDie(Attr).resolveTypeUnitReference();
+}
+static DWARFDie resolveReferencedType(DWARFDie D, DWARFFormValue F) {
+ return D.getAttributeValueAsReferencedDie(F).resolveTypeUnitReference();
}
-static void dumpArrayType(raw_ostream &OS, const DWARFDie &D) {
- for (const DWARFDie &C : D.children())
- if (C.getTag() == DW_TAG_subrange_type) {
+namespace {
+
+// FIXME: We should have pretty printers per language. Currently we print
+// everything as if it was C++ and fall back to the TAG type name.
+struct DWARFTypePrinter {
+ raw_ostream &OS;
+ bool Word = true;
+ bool EndedWithTemplate = false;
+
+ DWARFTypePrinter(raw_ostream &OS) : OS(OS) {}
+
+ /// Dump the name encoded in the type tag.
+ void appendTypeTagName(dwarf::Tag T) {
+ StringRef TagStr = TagString(T);
+ static constexpr StringRef Prefix = "DW_TAG_";
+ static constexpr StringRef Suffix = "_type";
+ if (!TagStr.startswith(Prefix) || !TagStr.endswith(Suffix))
+ return;
+ OS << TagStr.substr(Prefix.size(),
+ TagStr.size() - (Prefix.size() + Suffix.size()))
+ << " ";
+ }
+
+ void appendArrayType(const DWARFDie &D) {
+ for (const DWARFDie &C : D.children()) {
+ if (C.getTag() != DW_TAG_subrange_type)
+ continue;
Optional<uint64_t> LB;
Optional<uint64_t> Count;
Optional<uint64_t> UB;
@@ -159,79 +183,503 @@ static void dumpArrayType(raw_ostream &OS, const DWARFDie &D) {
OS << ")]";
}
}
-}
-
-/// Recursively dump the DIE type name when applicable.
-static void dumpTypeName(raw_ostream &OS, const DWARFDie &D) {
- if (!D.isValid())
- return;
+ EndedWithTemplate = false;
+ }
- if (const char *Name = D.getName(DINameKind::LinkageName)) {
- OS << Name;
- return;
+ DWARFDie skipQualifiers(DWARFDie D) {
+ while (D && (D.getTag() == DW_TAG_const_type ||
+ D.getTag() == DW_TAG_volatile_type))
+ D = resolveReferencedType(D);
+ return D;
}
- // FIXME: We should have pretty printers per language. Currently we print
- // everything as if it was C++ and fall back to the TAG type name.
- const dwarf::Tag T = D.getTag();
- switch (T) {
- case DW_TAG_array_type:
- case DW_TAG_pointer_type:
- case DW_TAG_ptr_to_member_type:
- case DW_TAG_reference_type:
- case DW_TAG_rvalue_reference_type:
- case DW_TAG_subroutine_type:
- break;
- default:
- dumpTypeTagName(OS, T);
+ bool needsParens(DWARFDie D) {
+ D = skipQualifiers(D);
+ return D && (D.getTag() == DW_TAG_subroutine_type || D.getTag() == DW_TAG_array_type);
}
- // Follow the DW_AT_type if possible.
- DWARFDie TypeDie = D.getAttributeValueAsReferencedDie(DW_AT_type);
- dumpTypeName(OS, TypeDie);
+ void appendPointerLikeTypeBefore(DWARFDie D, DWARFDie Inner, StringRef Ptr) {
+ appendQualifiedNameBefore(Inner);
+ if (Word)
+ OS << ' ';
+ if (needsParens(Inner))
+ OS << '(';
+ OS << Ptr;
+ Word = false;
+ EndedWithTemplate = false;
+ }
- switch (T) {
- case DW_TAG_subroutine_type: {
- if (!TypeDie)
+ DWARFDie
+ appendUnqualifiedNameBefore(DWARFDie D,
+ std::string *OriginalFullName = nullptr) {
+ Word = true;
+ if (!D) {
OS << "void";
+ return DWARFDie();
+ }
+ DWARFDie Inner = resolveReferencedType(D);
+ const dwarf::Tag T = D.getTag();
+ switch (T) {
+ case DW_TAG_pointer_type: {
+ appendPointerLikeTypeBefore(D, Inner, "*");
+ break;
+ }
+ case DW_TAG_subroutine_type: {
+ appendQualifiedNameBefore(Inner);
+ if (Word) {
+ OS << ' ';
+ }
+ Word = false;
+ break;
+ }
+ case DW_TAG_array_type: {
+ appendQualifiedNameBefore(Inner);
+ break;
+ }
+ case DW_TAG_reference_type:
+ appendPointerLikeTypeBefore(D, Inner, "&");
+ break;
+ case DW_TAG_rvalue_reference_type:
+ appendPointerLikeTypeBefore(D, Inner, "&&");
+ break;
+ case DW_TAG_ptr_to_member_type: {
+ appendQualifiedNameBefore(Inner);
+ if (needsParens(Inner))
+ OS << '(';
+ else if (Word)
+ OS << ' ';
+ if (DWARFDie Cont = resolveReferencedType(D, DW_AT_containing_type)) {
+ appendQualifiedName(Cont);
+ OS << "::";
+ }
+ OS << "*";
+ Word = false;
+ break;
+ }
+ case DW_TAG_const_type:
+ case DW_TAG_volatile_type:
+ appendConstVolatileQualifierBefore(D);
+ break;
+ case DW_TAG_namespace: {
+ if (const char *Name = dwarf::toString(D.find(DW_AT_name), nullptr))
+ OS << Name;
+ else
+ OS << "(anonymous namespace)";
+ break;
+ }
+ case DW_TAG_unspecified_type: {
+ StringRef TypeName = D.getShortName();
+ if (TypeName == "decltype(nullptr)")
+ TypeName = "std::nullptr_t";
+ Word = true;
+ OS << TypeName;
+ EndedWithTemplate = false;
+ break;
+ }
+ /*
+ case DW_TAG_structure_type:
+ case DW_TAG_class_type:
+ case DW_TAG_enumeration_type:
+ case DW_TAG_base_type:
+ */
+ default: {
+ const char *NamePtr = dwarf::toString(D.find(DW_AT_name), nullptr);
+ if (!NamePtr) {
+ appendTypeTagName(D.getTag());
+ return Inner;
+ }
+ Word = true;
+ StringRef Name = NamePtr;
+ static constexpr StringRef MangledPrefix = "_STN";
+ if (Name.startswith(MangledPrefix)) {
+ Name = Name.drop_front(MangledPrefix.size());
+ auto Separator = Name.find('|');
+ assert(Separator != StringRef::npos);
+ StringRef BaseName = Name.substr(0, Separator);
+ StringRef TemplateArgs = Name.substr(Separator + 1);
+ if (OriginalFullName)
+ *OriginalFullName = (BaseName + TemplateArgs).str();
+ Name = BaseName;
+ } else
+ EndedWithTemplate = Name.endswith(">");
+ OS << Name;
+ // This check would be insufficient for operator overloads like
+ // "operator>>" - but for now Clang doesn't try to simplify them, so this
+ // is OK. Add more nuanced operator overload handling here if/when needed.
+ if (Name.endswith(">"))
+ break;
+ if (!appendTemplateParameters(D))
+ break;
+
+ if (EndedWithTemplate)
+ OS << ' ';
+ OS << '>';
+ EndedWithTemplate = true;
+ Word = true;
+ break;
+ }
+ }
+ return Inner;
+ }
+
+ void appendUnqualifiedNameAfter(DWARFDie D, DWARFDie Inner,
+ bool SkipFirstParamIfArtificial = false) {
+ if (!D)
+ return;
+ switch (D.getTag()) {
+ case DW_TAG_subroutine_type: {
+ appendSubroutineNameAfter(D, Inner, SkipFirstParamIfArtificial, false,
+ false);
+ break;
+ }
+ case DW_TAG_array_type: {
+ appendArrayType(D);
+ break;
+ }
+ case DW_TAG_const_type:
+ case DW_TAG_volatile_type:
+ appendConstVolatileQualifierAfter(D);
+ break;
+ case DW_TAG_ptr_to_member_type:
+ case DW_TAG_reference_type:
+ case DW_TAG_rvalue_reference_type:
+ case DW_TAG_pointer_type: {
+ if (needsParens(Inner))
+ OS << ')';
+ appendUnqualifiedNameAfter(Inner, resolveReferencedType(Inner),
+ /*SkipFirstParamIfArtificial=*/D.getTag() ==
+ DW_TAG_ptr_to_member_type);
+ break;
+ }
+ /*
+ case DW_TAG_structure_type:
+ case DW_TAG_class_type:
+ case DW_TAG_enumeration_type:
+ case DW_TAG_base_type:
+ case DW_TAG_namespace:
+ */
+ default:
+ break;
+ }
+ }
+
+ void appendQualifiedName(DWARFDie D) {
+ if (D)
+ appendScopes(D.getParent());
+ appendUnqualifiedName(D);
+ }
+ DWARFDie appendQualifiedNameBefore(DWARFDie D) {
+ if (D)
+ appendScopes(D.getParent());
+ return appendUnqualifiedNameBefore(D);
+ }
+ bool appendTemplateParameters(DWARFDie D, bool *FirstParameter = nullptr) {
+ bool FirstParameterValue = true;
+ bool IsTemplate = false;
+ if (!FirstParameter)
+ FirstParameter = &FirstParameterValue;
+ for (const DWARFDie &C : D) {
+ auto Sep = [&] {
+ if (*FirstParameter)
+ OS << '<';
+ else
+ OS << ", ";
+ IsTemplate = true;
+ EndedWithTemplate = false;
+ *FirstParameter = false;
+ };
+ if (C.getTag() == dwarf::DW_TAG_GNU_template_parameter_pack) {
+ IsTemplate = true;
+ appendTemplateParameters(C, FirstParameter);
+ }
+ if (C.getTag() == dwarf::DW_TAG_template_value_parameter) {
+ DWARFDie T = resolveReferencedType(C);
+ Sep();
+ if (T.getTag() == DW_TAG_enumeration_type) {
+ auto V = C.find(DW_AT_const_value);
+ bool FoundEnumerator = false;
+ for (const DWARFDie &Enumerator : T) {
+ auto EV = Enumerator.find(DW_AT_const_value);
+ if (V && EV &&
+ V->getAsSignedConstant() == EV->getAsSignedConstant()) {
+ if (T.find(DW_AT_enum_class)) {
+ appendQualifiedName(T);
+ OS << "::";
+ } else
+ appendScopes(T.getParent());
+ OS << Enumerator.getShortName();
+ FoundEnumerator = true;
+ break;
+ }
+ }
+ if (FoundEnumerator)
+ continue;
+ OS << '(';
+ appendQualifiedName(T);
+ OS << ')';
+ OS << to_string(*V->getAsSignedConstant());
+ continue;
+ }
+ // /Maybe/ we could do pointer type parameters, looking for the
+ // symbol in the ELF symbol table to get back to the variable...
+ // but probably not worth it.
+ if (T.getTag() == DW_TAG_pointer_type)
+ continue;
+ const char *RawName = dwarf::toString(T.find(DW_AT_name), nullptr);
+ assert(RawName);
+ StringRef Name = RawName;
+ auto V = C.find(DW_AT_const_value);
+ bool IsQualifiedChar = false;
+ if (Name == "bool") {
+ OS << (*V->getAsUnsignedConstant() ? "true" : "false");
+ } else if (Name == "short") {
+ OS << "(short)";
+ OS << to_string(*V->getAsSignedConstant());
+ } else if (Name == "unsigned short") {
+ OS << "(unsigned short)";
+ OS << to_string(*V->getAsSignedConstant());
+ } else if (Name == "int")
+ OS << to_string(*V->getAsSignedConstant());
+ else if (Name == "long") {
+ OS << to_string(*V->getAsSignedConstant());
+ OS << "L";
+ } else if (Name == "long long") {
+ OS << to_string(*V->getAsSignedConstant());
+ OS << "LL";
+ } else if (Name == "unsigned int") {
+ OS << to_string(*V->getAsUnsignedConstant());
+ OS << "U";
+ } else if (Name == "unsigned long") {
+ OS << to_string(*V->getAsUnsignedConstant());
+ OS << "UL";
+ } else if (Name == "unsigned long long") {
+ OS << to_string(*V->getAsUnsignedConstant());
+ OS << "ULL";
+ } else if (Name == "char" ||
+ (IsQualifiedChar =
+ (Name == "unsigned char" || Name == "signed char"))) {
+ // FIXME: check T's DW_AT_type to see if it's signed or not (since
+ // char signedness is implementation defined).
+ auto Val = *V->getAsSignedConstant();
+ // Copied/hacked up from Clang's CharacterLiteral::print - incomplete
+ // (doesn't actually support different character types/widths, sign
+ // handling's not done, and doesn't correctly test if a character is
+ // printable or needs to use a numeric escape sequence instead)
+ if (IsQualifiedChar) {
+ OS << '(';
+ OS << Name;
+ OS << ')';
+ }
+ switch (Val) {
+ case '\\':
+ OS << "'\\\\'";
+ break;
+ case '\'':
+ OS << "'\\''";
+ break;
+ case '\a':
+ // TODO: K&R: the meaning of '\\a' is different in traditional C
+ OS << "'\\a'";
+ break;
+ case '\b':
+ OS << "'\\b'";
+ break;
+ case '\f':
+ OS << "'\\f'";
+ break;
+ case '\n':
+ OS << "'\\n'";
+ break;
+ case '\r':
+ OS << "'\\r'";
+ break;
+ case '\t':
+ OS << "'\\t'";
+ break;
+ case '\v':
+ OS << "'\\v'";
+ break;
+ default:
+ if ((Val & ~0xFFu) == ~0xFFu)
+ Val &= 0xFFu;
+ if (Val < 127 && Val >= 32) {
+ OS << "'";
+ OS << (char)Val;
+ OS << "'";
+ } else if (Val < 256)
+ OS << to_string(llvm::format("'\\x%02x'", Val));
+ else if (Val <= 0xFFFF)
+ OS << to_string(llvm::format("'\\u%04x'", Val));
+ else
+ OS << to_string(llvm::format("'\\U%08x'", Val));
+ }
+ }
+ continue;
+ }
+ if (C.getTag() == dwarf::DW_TAG_GNU_template_template_param) {
+ const char *RawName =
+ dwarf::toString(C.find(DW_AT_GNU_template_name), nullptr);
+ assert(RawName);
+ StringRef Name = RawName;
+ Sep();
+ OS << Name;
+ continue;
+ }
+ if (C.getTag() != dwarf::DW_TAG_template_type_parameter)
+ continue;
+ auto TypeAttr = C.find(DW_AT_type);
+ Sep();
+ appendQualifiedName(TypeAttr ? resolveReferencedType(C, *TypeAttr)
+ : DWARFDie());
+ }
+ if (IsTemplate && *FirstParameter && FirstParameter == &FirstParameterValue)
+ OS << '<';
+ return IsTemplate;
+ }
+ void decomposeConstVolatile(DWARFDie &N, DWARFDie &T, DWARFDie &C,
+ DWARFDie &V) {
+ (N.getTag() == DW_TAG_const_type ? C : V) = N;
+ T = resolveReferencedType(N);
+ if (T) {
+ auto Tag = T.getTag();
+ if (Tag == DW_TAG_const_type) {
+ C = T;
+ T = resolveReferencedType(T);
+ } else if (Tag == DW_TAG_volatile_type) {
+ V = T;
+ T = resolveReferencedType(T);
+ }
+ }
+ }
+ void appendConstVolatileQualifierAfter(DWARFDie N) {
+ DWARFDie C;
+ DWARFDie V;
+ DWARFDie T;
+ decomposeConstVolatile(N, T, C, V);
+ if (T && T.getTag() == DW_TAG_subroutine_type)
+ appendSubroutineNameAfter(T, resolveReferencedType(T), false, C.isValid(),
+ V.isValid());
+ else
+ appendUnqualifiedNameAfter(T, resolveReferencedType(T));
+ }
+ void appendConstVolatileQualifierBefore(DWARFDie N) {
+ DWARFDie C;
+ DWARFDie V;
+ DWARFDie T;
+ decomposeConstVolatile(N, T, C, V);
+ bool Subroutine = T && T.getTag() == DW_TAG_subroutine_type;
+ DWARFDie A = T;
+ while (A && A.getTag() == DW_TAG_array_type)
+ A = resolveReferencedType(A);
+ bool Leading =
+ (!A || (A.getTag() != DW_TAG_pointer_type &&
+ A.getTag() != llvm::dwarf::DW_TAG_ptr_to_member_type)) &&
+ !Subroutine;
+ if (Leading) {
+ if (C)
+ OS << "const ";
+ if (V)
+ OS << "volatile ";
+ }
+ appendQualifiedNameBefore(T);
+ if (!Leading && !Subroutine) {
+ Word = true;
+ if (C)
+ OS << "const";
+ if (V) {
+ if (C)
+ OS << ' ';
+ OS << "volatile";
+ }
+ }
+ }
+
+ /// Recursively append the DIE type name when applicable.
+ void appendUnqualifiedName(DWARFDie D,
+ std::string *OriginalFullName = nullptr) {
+ // FIXME: We should have pretty printers per language. Currently we print
+ // everything as if it was C++ and fall back to the TAG type name.
+ DWARFDie Inner = appendUnqualifiedNameBefore(D, OriginalFullName);
+ appendUnqualifiedNameAfter(D, Inner);
+ }
+
+ void appendSubroutineNameAfter(DWARFDie D, DWARFDie Inner,
+ bool SkipFirstParamIfArtificial, bool Const,
+ bool Volatile) {
+ DWARFDie FirstParamIfArtificial;
OS << '(';
+ EndedWithTemplate = false;
bool First = true;
- for (const DWARFDie &C : D.children()) {
- if (C.getTag() == DW_TAG_formal_parameter) {
- if (!First)
- OS << ", ";
- First = false;
- dumpTypeName(OS, C.getAttributeValueAsReferencedDie(DW_AT_type));
+ bool RealFirst = true;
+ for (DWARFDie P : D) {
+ if (P.getTag() != DW_TAG_formal_parameter)
+ return;
+ DWARFDie T = resolveReferencedType(P);
+ if (SkipFirstParamIfArtificial && RealFirst && P.find(DW_AT_artificial)) {
+ FirstParamIfArtificial = T;
+ RealFirst = false;
+ continue;
}
+ if (!First) {
+ OS << ", ";
+ }
+ First = false;
+ appendQualifiedName(T);
}
+ EndedWithTemplate = false;
OS << ')';
- break;
- }
- case DW_TAG_array_type: {
- dumpArrayType(OS, D);
- break;
- }
- case DW_TAG_pointer_type:
- OS << '*';
- break;
- case DW_TAG_ptr_to_member_type:
- if (DWARFDie Cont =
- D.getAttributeValueAsReferencedDie(DW_AT_containing_type)) {
- dumpTypeName(OS << ' ', Cont);
- OS << "::";
+ if (FirstParamIfArtificial) {
+ if (DWARFDie P = FirstParamIfArtificial) {
+ if (P.getTag() == DW_TAG_pointer_type) {
+ DWARFDie C;
+ DWARFDie V;
+ auto CVStep = [&](DWARFDie CV) {
+ if (DWARFDie U = resolveReferencedType(CV)) {
+ if (U.getTag() == DW_TAG_const_type)
+ return C = U;
+ if (U.getTag() == DW_TAG_volatile_type)
+ return V = U;
+ }
+ return DWARFDie();
+ };
+ if (DWARFDie CV = CVStep(P)) {
+ CVStep(CV);
+ }
+ if (C)
+ OS << " const";
+ if (V)
+ OS << " volatile";
+ }
+ }
+ } else {
+ if (Const)
+ OS << " const";
+ if (Volatile)
+ OS << " volatile";
}
- OS << '*';
- break;
- case DW_TAG_reference_type:
- OS << '&';
- break;
- case DW_TAG_rvalue_reference_type:
- OS << "&&";
- break;
- default:
- break;
+ if (D.find(DW_AT_reference))
+ OS << " &";
+ if (D.find(DW_AT_rvalue_reference))
+ OS << " &&";
+ appendUnqualifiedNameAfter(Inner, resolveReferencedType(Inner));
}
-}
+ void appendScopes(DWARFDie D) {
+ if (D.getTag() == DW_TAG_compile_unit)
+ return;
+ if (D.getTag() == DW_TAG_type_unit)
+ return;
+ if (D.getTag() == DW_TAG_skeleton_unit)
+ return;
+ if (D.getTag() == DW_TAG_subprogram)
+ return;
+ D = D.resolveTypeUnitReference();
+ if (DWARFDie P = D.getParent())
+ appendScopes(P);
+ appendUnqualifiedName(D);
+ OS << "::";
+ }
+};
+} // anonymous namespace
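DWARFTypePrinter rebuilds a C++-style type name by walking DW_AT_type chains; getFullName and the verifier changes later in this diff additionally rely on the "_STN<base>|<args>" encoding of simplified template names handled in appendUnqualifiedNameBefore. A self-contained sketch of that string convention (the example input "_STNt1|<int>" is illustrative only):
#include <cassert>
#include <cstddef>
#include <string>
#include <utility>
// Split an "_STN"-prefixed DW_AT_name into its base name and argument list.
static std::pair<std::string, std::string> splitSimplifiedName(std::string Name) {
  static const std::string Prefix = "_STN";
  assert(Name.compare(0, Prefix.size(), Prefix) == 0 && "not an _STN name");
  Name.erase(0, Prefix.size());
  const std::size_t Sep = Name.find('|');
  assert(Sep != std::string::npos && "missing '|' separator");
  return {Name.substr(0, Sep), Name.substr(Sep + 1)};
}
// splitSimplifiedName("_STNt1|<int>") yields {"t1", "<int>"}; the reconstituted
// name the verifier compares against is "t1" + "<int>" == "t1<int>".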
static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
const DWARFAttribute &AttrValue, unsigned Indent,
@@ -316,9 +764,12 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
DINameKind::LinkageName))
OS << Space << "\"" << Name << '\"';
} else if (Attr == DW_AT_type) {
- OS << Space << "\"";
- dumpTypeName(OS, Die.getAttributeValueAsReferencedDie(FormValue));
- OS << '"';
+ DWARFDie D = resolveReferencedType(Die, FormValue);
+ if (D && !D.isNULL()) {
+ OS << Space << "\"";
+ DWARFTypePrinter(OS).appendQualifiedName(D);
+ OS << '"';
+ }
} else if (Attr == DW_AT_APPLE_property_attribute) {
if (Optional<uint64_t> OptVal = FormValue.getAsUnsignedConstant())
dumpApplePropertyAttribute(OS, *OptVal);
@@ -345,6 +796,14 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
OS << ")\n";
}
+void DWARFDie::getFullName(raw_string_ostream &OS,
+ std::string *OriginalFullName) const {
+ const char *NamePtr = getShortName();
+ if (!NamePtr)
+ return;
+ DWARFTypePrinter(OS).appendUnqualifiedName(*this, OriginalFullName);
+}
+
bool DWARFDie::isSubprogramDIE() const { return getTag() == DW_TAG_subprogram; }
bool DWARFDie::isSubroutineDIE() const {
@@ -417,13 +876,27 @@ DWARFDie::getAttributeValueAsReferencedDie(dwarf::Attribute Attr) const {
DWARFDie
DWARFDie::getAttributeValueAsReferencedDie(const DWARFFormValue &V) const {
+ DWARFDie Result;
if (auto SpecRef = V.getAsRelativeReference()) {
if (SpecRef->Unit)
- return SpecRef->Unit->getDIEForOffset(SpecRef->Unit->getOffset() + SpecRef->Offset);
- if (auto SpecUnit = U->getUnitVector().getUnitForOffset(SpecRef->Offset))
- return SpecUnit->getDIEForOffset(SpecRef->Offset);
+ Result = SpecRef->Unit->getDIEForOffset(SpecRef->Unit->getOffset() +
+ SpecRef->Offset);
+ else if (auto SpecUnit =
+ U->getUnitVector().getUnitForOffset(SpecRef->Offset))
+ Result = SpecUnit->getDIEForOffset(SpecRef->Offset);
}
- return DWARFDie();
+ return Result;
+}
+
+DWARFDie DWARFDie::resolveTypeUnitReference() const {
+ if (auto Attr = find(DW_AT_signature)) {
+ if (Optional<uint64_t> Sig = Attr->getAsReferenceUVal()) {
+ if (DWARFTypeUnit *TU = U->getContext().getTypeUnitForHash(
+ U->getVersion(), *Sig, U->isDWOUnit()))
+ return TU->getDIEForOffset(TU->getTypeOffset() + TU->getOffset());
+ }
+ }
+ return *this;
}
Optional<uint64_t> DWARFDie::getRangesBaseAttribute() const {
@@ -483,21 +956,6 @@ Expected<DWARFAddressRangesVector> DWARFDie::getAddressRanges() const {
return DWARFAddressRangesVector();
}
-void DWARFDie::collectChildrenAddressRanges(
- DWARFAddressRangesVector &Ranges) const {
- if (isNULL())
- return;
- if (isSubprogramDIE()) {
- if (auto DIERangesOrError = getAddressRanges())
- llvm::append_range(Ranges, DIERangesOrError.get());
- else
- llvm::consumeError(DIERangesOrError.takeError());
- }
-
- for (auto Child : children())
- Child.collectChildrenAddressRanges(Ranges);
-}
-
bool DWARFDie::addressRangeContainsAddress(const uint64_t Address) const {
auto RangesOrError = getAddressRanges();
if (!RangesOrError) {
@@ -581,18 +1039,10 @@ uint64_t DWARFDie::getDeclLine() const {
std::string
DWARFDie::getDeclFile(DILineInfoSpecifier::FileLineInfoKind Kind) const {
- auto D = getAttributeValueAsReferencedDie(DW_AT_abstract_origin);
- if (!D)
- D = *this;
- std::string FileName;
- if (auto DeclFile = toUnsigned(D.find(DW_AT_decl_file))) {
- if (const auto *LineTable =
- getDwarfUnit()->getContext().getLineTableForUnit(
- D.getDwarfUnit()->getLinkedUnit()))
- LineTable->getFileNameByIndex(
- *DeclFile, D.getDwarfUnit()->getCompilationDir(), Kind, FileName);
- }
- return FileName;
+ if (auto FormValue = findRecursively(DW_AT_decl_file))
+ if (auto OptString = FormValue->getAsFile(Kind))
+ return *OptString;
+ return {};
}
void DWARFDie::getCallerFrame(uint32_t &CallFile, uint32_t &CallLine,
@@ -641,9 +1091,13 @@ void DWARFDie::dump(raw_ostream &OS, unsigned Indent,
if (AbbrevDecl) {
WithColor(OS, HighlightColor::Tag).get().indent(Indent)
<< formatv("{0}", getTag());
- if (DumpOpts.Verbose)
+ if (DumpOpts.Verbose) {
OS << format(" [%u] %c", abbrCode,
AbbrevDecl->hasChildren() ? '*' : ' ');
+ if (Optional<uint32_t> ParentIdx = Die->getParentIdx())
+ OS << format(" (0x%8.8" PRIx64 ")",
+ U->getDIEAtIndex(*ParentIdx).getOffset());
+ }
OS << '\n';
// Dump all data in the DIE for the attributes.
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp b/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp
index 4b9be85f6885..d0fbd702e831 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp
@@ -207,7 +207,8 @@ bool DWARFExpression::Operation::extract(DataExtractor Data,
}
static void prettyPrintBaseTypeRef(DWARFUnit *U, raw_ostream &OS,
- DIDumpOptions DumpOpts, uint64_t Operands[2],
+ DIDumpOptions DumpOpts,
+ const uint64_t Operands[2],
unsigned Operand) {
assert(Operand < 2 && "operand out of bounds");
auto Die = U->getDIEForOffset(U->getOffset() + Operands[Operand]);
@@ -226,7 +227,7 @@ static void prettyPrintBaseTypeRef(DWARFUnit *U, raw_ostream &OS,
static bool prettyPrintRegisterOp(DWARFUnit *U, raw_ostream &OS,
DIDumpOptions DumpOpts, uint8_t Opcode,
- uint64_t Operands[2],
+ const uint64_t Operands[2],
const MCRegisterInfo *MRI, bool isEH) {
if (!MRI)
return false;
@@ -262,7 +263,7 @@ static bool prettyPrintRegisterOp(DWARFUnit *U, raw_ostream &OS,
bool DWARFExpression::Operation::print(raw_ostream &OS, DIDumpOptions DumpOpts,
const DWARFExpression *Expr,
const MCRegisterInfo *RegInfo,
- DWARFUnit *U, bool isEH) {
+ DWARFUnit *U, bool isEH) const {
if (Error) {
OS << "<decoding error>";
return false;
@@ -356,10 +357,9 @@ void DWARFExpression::print(raw_ostream &OS, DIDumpOptions DumpOpts,
}
}
-bool DWARFExpression::Operation::verify(DWARFUnit *U) {
-
+bool DWARFExpression::Operation::verify(const Operation &Op, DWARFUnit *U) {
for (unsigned Operand = 0; Operand < 2; ++Operand) {
- unsigned Size = Desc.Op[Operand];
+ unsigned Size = Op.Desc.Op[Operand];
if (Size == Operation::SizeNA)
break;
@@ -369,13 +369,11 @@ bool DWARFExpression::Operation::verify(DWARFUnit *U) {
// the generic type should be done, so don't look up a base type in that
// case. The same holds for DW_OP_reinterpret, which is currently not
// supported.
- if (Opcode == DW_OP_convert && Operands[Operand] == 0)
+ if (Op.Opcode == DW_OP_convert && Op.Operands[Operand] == 0)
continue;
- auto Die = U->getDIEForOffset(U->getOffset() + Operands[Operand]);
- if (!Die || Die.getTag() != dwarf::DW_TAG_base_type) {
- Error = true;
+ auto Die = U->getDIEForOffset(U->getOffset() + Op.Operands[Operand]);
+ if (!Die || Die.getTag() != dwarf::DW_TAG_base_type)
return false;
- }
}
}
@@ -384,7 +382,7 @@ bool DWARFExpression::Operation::verify(DWARFUnit *U) {
bool DWARFExpression::verify(DWARFUnit *U) {
for (auto &Op : *this)
- if (!Op.verify(U))
+ if (!Operation::verify(Op, U))
return false;
return true;
@@ -410,7 +408,7 @@ static bool printCompactDWARFExpr(raw_ostream &OS, DWARFExpression::iterator I,
SmallVector<PrintedExpr, 4> Stack;
while (I != E) {
- DWARFExpression::Operation &Op = *I;
+ const DWARFExpression::Operation &Op = *I;
uint8_t Opcode = Op.getCode();
switch (Opcode) {
case dwarf::DW_OP_regx: {
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp
index 2244a69bc121..cea0f63bbf81 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp
@@ -332,7 +332,7 @@ bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
break;
case DW_FORM_LLVM_addrx_offset:
Value.uval = Data.getULEB128(OffsetPtr, &Err) << 32;
- Value.uval = Data.getU32(OffsetPtr, &Err);
+ Value.uval |= Data.getU32(OffsetPtr, &Err);
break;
case DW_FORM_string:
Value.cstr = Data.getCStr(OffsetPtr, &Err);
@@ -690,7 +690,7 @@ Optional<uint64_t> DWARFFormValue::getAsReference() const {
return R->Unit ? R->Unit->getOffset() + R->Offset : R->Offset;
return None;
}
-
+
Optional<DWARFFormValue::UnitOffset> DWARFFormValue::getAsRelativeReference() const {
if (!isFormClass(FC_Reference))
return None;
@@ -762,3 +762,17 @@ Optional<uint64_t> DWARFFormValue::getAsReferenceUVal() const {
return None;
return Value.uval;
}
+
+Optional<std::string>
+DWARFFormValue::getAsFile(DILineInfoSpecifier::FileLineInfoKind Kind) const {
+ if (U == nullptr || !isFormClass(FC_Constant))
+ return None;
+ DWARFUnit *DLU = const_cast<DWARFUnit *>(U)->getLinkedUnit();
+ if (auto *LT = DLU->getContext().getLineTableForUnit(DLU)) {
+ std::string FileName;
+ if (LT->getFileNameByIndex(Value.uval, DLU->getCompilationDir(), Kind,
+ FileName))
+ return FileName;
+ }
+ return None;
+}
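getAsFile lets DWARFDie::getDeclFile (earlier in this diff) resolve DW_AT_decl_file through the linked unit's line table instead of a hand-rolled lookup. A usage sketch, assuming Die is an already-extracted, valid DWARFDie:
#include "llvm/DebugInfo/DIContext.h"
#include "llvm/DebugInfo/DWARF/DWARFDie.h"
#include <string>
using namespace llvm;
// Internally: findRecursively(DW_AT_decl_file) -> DWARFFormValue::getAsFile().
static std::string declFileOf(const DWARFDie &Die) {
  return Die.getDeclFile(
      DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath);
}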
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp b/llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp
index c876af1e9b51..b73dda3ff9ce 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp
@@ -8,6 +8,7 @@
#include "llvm/DebugInfo/DWARF/DWARFListTable.h"
#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/Format.h"
@@ -54,11 +55,10 @@ Error DWARFListTableHeader::extract(DWARFDataExtractor Data,
"unrecognised %s table version %" PRIu16
" in table at offset 0x%" PRIx64,
SectionName.data(), HeaderData.Version, HeaderOffset);
- if (HeaderData.AddrSize != 4 && HeaderData.AddrSize != 8)
- return createStringError(errc::not_supported,
- "%s table at offset 0x%" PRIx64
- " has unsupported address size %" PRIu8,
- SectionName.data(), HeaderOffset, HeaderData.AddrSize);
+ if (Error SizeErr = DWARFContext::checkAddressSizeSupported(
+ HeaderData.AddrSize, errc::not_supported,
+ "%s table at offset 0x%" PRIx64, SectionName.data(), HeaderOffset))
+ return SizeErr;
if (HeaderData.SegSize != 0)
return createStringError(errc::not_supported,
"%s table at offset 0x%" PRIx64
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
index f17dacfce665..82c34f537036 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -315,15 +315,10 @@ bool DWARFUnitHeader::extract(DWARFContext &Context,
return false;
}
- if (!DWARFContext::isAddressSizeSupported(getAddressByteSize())) {
- SmallVector<std::string, 3> Sizes;
- for (auto Size : DWARFContext::getSupportedAddressSizes())
- Sizes.push_back(std::to_string(Size));
- Context.getWarningHandler()(createStringError(
- errc::invalid_argument,
- "DWARF unit at offset 0x%8.8" PRIx64 " "
- "has unsupported address size %" PRIu8 ", supported are %s",
- Offset, getAddressByteSize(), llvm::join(Sizes, ", ").c_str()));
+ if (Error SizeErr = DWARFContext::checkAddressSizeSupported(
+ getAddressByteSize(), errc::invalid_argument,
+ "DWARF unit at offset 0x%8.8" PRIx64, Offset)) {
+ Context.getWarningHandler()(std::move(SizeErr));
return false;
}
@@ -349,29 +344,6 @@ bool DWARFUnitHeader::applyIndexEntry(const DWARFUnitIndex::Entry *Entry) {
return true;
}
-// Parse the rangelist table header, including the optional array of offsets
-// following it (DWARF v5 and later).
-template<typename ListTableType>
-static Expected<ListTableType>
-parseListTableHeader(DWARFDataExtractor &DA, uint64_t Offset,
- DwarfFormat Format) {
- // We are expected to be called with Offset 0 or pointing just past the table
- // header. Correct Offset in the latter case so that it points to the start
- // of the header.
- if (Offset > 0) {
- uint64_t HeaderSize = DWARFListTableHeader::getHeaderSize(Format);
- if (Offset < HeaderSize)
- return createStringError(errc::invalid_argument, "did not detect a valid"
- " list table with base = 0x%" PRIx64 "\n",
- Offset);
- Offset -= HeaderSize;
- }
- ListTableType Table;
- if (Error E = Table.extractHeaderAndOffsets(DA, &Offset))
- return std::move(E);
- return Table;
-}
-
Error DWARFUnit::extractRangeList(uint64_t RangeListOffset,
DWARFDebugRangeList &RangeList) const {
// Require that compile unit is extracted.
@@ -411,11 +383,39 @@ void DWARFUnit::extractDIEsToVector(
DWARFDataExtractor DebugInfoData = getDebugInfoExtractor();
// The end offset has been already checked by DWARFUnitHeader::extract.
assert(DebugInfoData.isValidOffset(NextCUOffset - 1));
- uint32_t Depth = 0;
+ std::vector<uint32_t> Parents;
+ std::vector<uint32_t> PrevSiblings;
bool IsCUDie = true;
- while (DIE.extractFast(*this, &DIEOffset, DebugInfoData, NextCUOffset,
- Depth)) {
+ assert(
+ ((AppendCUDie && Dies.empty()) || (!AppendCUDie && Dies.size() == 1)) &&
+ "Dies array is not empty");
+
+ // Fill Parents and Siblings stacks with initial value.
+ Parents.push_back(UINT32_MAX);
+ if (!AppendCUDie)
+ Parents.push_back(0);
+ PrevSiblings.push_back(0);
+
+ // Start to extract dies.
+ do {
+ assert(Parents.size() > 0 && "Empty parents stack");
+ assert((Parents.back() == UINT32_MAX || Parents.back() <= Dies.size()) &&
+ "Wrong parent index");
+
+ // Extract the DIE. Stop if any error occurred.
+ if (!DIE.extractFast(*this, &DIEOffset, DebugInfoData, NextCUOffset,
+ Parents.back()))
+ break;
+
+ // If the previous sibling is remembered, update its SiblingIdx field.
+ if (PrevSiblings.back() > 0) {
+ assert(PrevSiblings.back() < Dies.size() &&
+ "Previous sibling index is out of Dies boundaries");
+ Dies[PrevSiblings.back()].setSiblingIdx(Dies.size());
+ }
+
+ // Store die into the Dies vector.
if (IsCUDie) {
if (AppendCUDie)
Dies.push_back(DIE);
@@ -425,26 +425,36 @@ void DWARFUnit::extractDIEsToVector(
// around 14-20 so let's pre-reserve the needed memory for
// our DIE entries accordingly.
Dies.reserve(Dies.size() + getDebugInfoSize() / 14);
- IsCUDie = false;
} else {
+ // Remember last previous sibling.
+ PrevSiblings.back() = Dies.size();
+
Dies.push_back(DIE);
}
+ // Check for new children scope.
if (const DWARFAbbreviationDeclaration *AbbrDecl =
DIE.getAbbreviationDeclarationPtr()) {
- // Normal DIE
- if (AbbrDecl->hasChildren())
- ++Depth;
- else if (Depth == 0)
- break; // This unit has a single DIE with no children.
+ if (AbbrDecl->hasChildren()) {
+ if (AppendCUDie || !IsCUDie) {
+ assert(Dies.size() > 0 && "Dies does not contain any die");
+ Parents.push_back(Dies.size() - 1);
+ PrevSiblings.push_back(0);
+ }
+ } else if (IsCUDie)
+ // Stop if we have a single compile unit DIE with no children.
+ break;
} else {
- // NULL DIE.
- if (Depth > 0)
- --Depth;
- if (Depth == 0)
- break; // We are done with this compile unit!
+ // NULL DIE: finishes current children scope.
+ Parents.pop_back();
+ PrevSiblings.pop_back();
}
- }
+
+ if (IsCUDie)
+ IsCUDie = false;
+
+ // Stop when compile unit die is removed from the parents stack.
+ } while (Parents.size() > 1);
}
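The rewritten extraction records, for every DIE, the index of its parent and, once the next sibling has been parsed, the index of that sibling, replacing the old depth counter. The getParent/getSibling/getPreviousSibling/getLastChild hunks further down then become index lookups rather than linear scans over DieArray. A toy restatement of the scheme (sketch, not the parser itself):
#include <cstdint>
#include <vector>
// A flattened DIE stores its parent's index (UINT32_MAX for the unit DIE) and
// its next sibling's index (0 when none was recorded).
struct FlatDie {
  uint32_t ParentIdx = UINT32_MAX;
  uint32_t SiblingIdx = 0;
};
static const FlatDie *parentOf(const std::vector<FlatDie> &Dies, uint32_t I) {
  const uint32_t P = Dies[I].ParentIdx;
  return P == UINT32_MAX ? nullptr : &Dies[P];
}
static const FlatDie *siblingOf(const std::vector<FlatDie> &Dies, uint32_t I) {
  const uint32_t S = Dies[I].SiblingIdx;
  return S == 0 ? nullptr : &Dies[S];
}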
void DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
@@ -600,10 +610,14 @@ bool DWARFUnit::parseDWO() {
}
void DWARFUnit::clearDIEs(bool KeepCUDie) {
- if (DieArray.size() > (unsigned)KeepCUDie) {
- DieArray.resize((unsigned)KeepCUDie);
- DieArray.shrink_to_fit();
- }
+ // Do not use resize() + shrink_to_fit() to free memory occupied by dies.
+ // shrink_to_fit() is a *non-binding* request to reduce capacity() to size().
+ // It depends on the implementation whether the request is fulfilled.
+ // Create a new vector with a small capacity and assign it to the DieArray to
+ // have previous contents freed.
+ DieArray = (KeepCUDie && !DieArray.empty())
+ ? std::vector<DWARFDebugInfoEntry>({DieArray[0]})
+ : std::vector<DWARFDebugInfoEntry>();
}
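As the comment above notes, shrink_to_fit() is only a non-binding request, so clearDIEs now assigns a freshly built vector, which does release the old allocation when the temporary's buffer is adopted. The generic form of the idiom, as a sketch:
#include <vector>
// Drop every element (optionally keeping the first) and free the old buffer.
template <typename T>
static void releaseAllButFront(std::vector<T> &V, bool KeepFront) {
  V = (KeepFront && !V.empty()) ? std::vector<T>{V.front()} : std::vector<T>();
}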
Expected<DWARFAddressRangesVector>
@@ -750,65 +764,65 @@ const DWARFUnitIndex &llvm::getDWARFUnitIndex(DWARFContext &Context,
DWARFDie DWARFUnit::getParent(const DWARFDebugInfoEntry *Die) {
if (!Die)
return DWARFDie();
- const uint32_t Depth = Die->getDepth();
- // Unit DIEs always have a depth of zero and never have parents.
- if (Depth == 0)
- return DWARFDie();
- // Depth of 1 always means parent is the compile/type unit.
- if (Depth == 1)
- return getUnitDIE();
- // Look for previous DIE with a depth that is one less than the Die's depth.
- const uint32_t ParentDepth = Depth - 1;
- for (uint32_t I = getDIEIndex(Die) - 1; I > 0; --I) {
- if (DieArray[I].getDepth() == ParentDepth)
- return DWARFDie(this, &DieArray[I]);
+
+ if (Optional<uint32_t> ParentIdx = Die->getParentIdx()) {
+ assert(*ParentIdx < DieArray.size() &&
+ "ParentIdx is out of DieArray boundaries");
+ return DWARFDie(this, &DieArray[*ParentIdx]);
}
+
return DWARFDie();
}
DWARFDie DWARFUnit::getSibling(const DWARFDebugInfoEntry *Die) {
if (!Die)
return DWARFDie();
- uint32_t Depth = Die->getDepth();
- // Unit DIEs always have a depth of zero and never have siblings.
- if (Depth == 0)
- return DWARFDie();
- // NULL DIEs don't have siblings.
- if (Die->getAbbreviationDeclarationPtr() == nullptr)
- return DWARFDie();
- // Find the next DIE whose depth is the same as the Die's depth.
- for (size_t I = getDIEIndex(Die) + 1, EndIdx = DieArray.size(); I < EndIdx;
- ++I) {
- if (DieArray[I].getDepth() == Depth)
- return DWARFDie(this, &DieArray[I]);
+ if (Optional<uint32_t> SiblingIdx = Die->getSiblingIdx()) {
+ assert(*SiblingIdx < DieArray.size() &&
+ "SiblingIdx is out of DieArray boundaries");
+ return DWARFDie(this, &DieArray[*SiblingIdx]);
}
+
return DWARFDie();
}
DWARFDie DWARFUnit::getPreviousSibling(const DWARFDebugInfoEntry *Die) {
if (!Die)
return DWARFDie();
- uint32_t Depth = Die->getDepth();
- // Unit DIEs always have a depth of zero and never have siblings.
- if (Depth == 0)
+
+ Optional<uint32_t> ParentIdx = Die->getParentIdx();
+ if (!ParentIdx)
+ // Die is a root die, there is no previous sibling.
return DWARFDie();
- // Find the previous DIE whose depth is the same as the Die's depth.
- for (size_t I = getDIEIndex(Die); I > 0;) {
- --I;
- if (DieArray[I].getDepth() == Depth - 1)
- return DWARFDie();
- if (DieArray[I].getDepth() == Depth)
- return DWARFDie(this, &DieArray[I]);
+ assert(*ParentIdx < DieArray.size() &&
+ "ParentIdx is out of DieArray boundaries");
+ assert(getDIEIndex(Die) > 0 && "Die is a root die");
+
+ uint32_t PrevDieIdx = getDIEIndex(Die) - 1;
+ if (PrevDieIdx == *ParentIdx)
+ // Immediately previous node is parent, there is no previous sibling.
+ return DWARFDie();
+
+ while (DieArray[PrevDieIdx].getParentIdx() != *ParentIdx) {
+ PrevDieIdx = *DieArray[PrevDieIdx].getParentIdx();
+
+ assert(PrevDieIdx < DieArray.size() &&
+ "PrevDieIdx is out of DieArray boundaries");
+ assert(PrevDieIdx >= *ParentIdx &&
+ "PrevDieIdx is not a child of parent of Die");
}
- return DWARFDie();
+
+ return DWARFDie(this, &DieArray[PrevDieIdx]);
}
DWARFDie DWARFUnit::getFirstChild(const DWARFDebugInfoEntry *Die) {
if (!Die->hasChildren())
return DWARFDie();
+ // TODO: Instead of checking here for an invalid DIE, we might reject
+ // invalid DIEs at the parsing stage (DWARFUnit::extractDIEsToVector).
// We do not want access out of bounds when parsing corrupted debug data.
size_t I = getDIEIndex(Die) + 1;
if (I >= DieArray.size())
@@ -820,14 +834,30 @@ DWARFDie DWARFUnit::getLastChild(const DWARFDebugInfoEntry *Die) {
if (!Die->hasChildren())
return DWARFDie();
- uint32_t Depth = Die->getDepth();
- for (size_t I = getDIEIndex(Die) + 1, EndIdx = DieArray.size(); I < EndIdx;
- ++I) {
- if (DieArray[I].getDepth() == Depth + 1 &&
- DieArray[I].getTag() == dwarf::DW_TAG_null)
- return DWARFDie(this, &DieArray[I]);
- assert(DieArray[I].getDepth() > Depth && "Not processing children?");
+ if (Optional<uint32_t> SiblingIdx = Die->getSiblingIdx()) {
+ assert(*SiblingIdx < DieArray.size() &&
+ "SiblingIdx is out of DieArray boundaries");
+ assert(DieArray[*SiblingIdx - 1].getTag() == dwarf::DW_TAG_null &&
+ "Bad end of children marker");
+ return DWARFDie(this, &DieArray[*SiblingIdx - 1]);
+ }
+
+ // If SiblingIdx is set for a non-root DIE, the DWARF is known to be correct
+ // and the "end of children" marker must be present. For the root DIE there is
+ // no such guarantee (parsing of the root DIE may stop early if the "end of
+ // children" marker is missing, and SiblingIdx is always zero for the root
+ // DIE), so we do not assert on the "end of children" marker for the root DIE.
+
+ // TODO: Instead of checking here for an invalid DIE, we might reject
+ // invalid DIEs at the parsing stage (DWARFUnit::extractDIEsToVector).
+ if (getDIEIndex(Die) == 0 && DieArray.size() > 1 &&
+ DieArray.back().getTag() == dwarf::DW_TAG_null) {
+ // For the unit DIE we may take the last item from DieArray.
+ assert(getDIEIndex(Die) == getDIEIndex(getUnitDIE()) && "Bad unit die");
+ return DWARFDie(this, &DieArray.back());
}
+
return DWARFDie();
}
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index ac624ec8b80f..dcabefb9896e 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFVerifier.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
@@ -50,6 +51,9 @@ DWARFVerifier::DieRangeInfo::insert(const DWARFAddressRange &R) {
DWARFVerifier::DieRangeInfo::die_range_info_iterator
DWARFVerifier::DieRangeInfo::insert(const DieRangeInfo &RI) {
+ if (RI.Ranges.empty())
+ return Children.end();
+
auto End = Children.end();
auto Iter = Children.begin();
while (Iter != End) {
@@ -158,7 +162,30 @@ bool DWARFVerifier::verifyUnitHeader(const DWARFDataExtractor DebugInfoData,
return Success;
}
-unsigned DWARFVerifier::verifyUnitContents(DWARFUnit &Unit) {
+bool DWARFVerifier::verifyName(const DWARFDie &Die) {
+ // FIXME Add some kind of record of which DIE names have already failed and
+ // don't bother checking a DIE that uses an already failed DIE.
+
+ std::string ReconstructedName;
+ raw_string_ostream OS(ReconstructedName);
+ std::string OriginalFullName;
+ Die.getFullName(OS, &OriginalFullName);
+ OS.flush();
+ if (OriginalFullName.empty() || OriginalFullName == ReconstructedName)
+ return 0;
+
+ error() << "Simplified template DW_AT_name could not be reconstituted:\n"
+ << formatv(" original: {0}\n"
+ " reconstituted: {1}\n",
+ OriginalFullName, ReconstructedName);
+ dump(Die) << '\n';
+ dump(Die.getDwarfUnit()->getUnitDIE()) << '\n';
+ return 1;
+}
+
+unsigned DWARFVerifier::verifyUnitContents(DWARFUnit &Unit,
+ ReferenceMap &UnitLocalReferences,
+ ReferenceMap &CrossUnitReferences) {
unsigned NumUnitErrors = 0;
unsigned NumDies = Unit.getNumDIEs();
for (unsigned I = 0; I < NumDies; ++I) {
@@ -169,9 +196,12 @@ unsigned DWARFVerifier::verifyUnitContents(DWARFUnit &Unit) {
for (auto AttrValue : Die.attributes()) {
NumUnitErrors += verifyDebugInfoAttribute(Die, AttrValue);
- NumUnitErrors += verifyDebugInfoForm(Die, AttrValue);
+ NumUnitErrors += verifyDebugInfoForm(Die, AttrValue, UnitLocalReferences,
+ CrossUnitReferences);
}
+ NumUnitErrors += verifyName(Die);
+
if (Die.hasChildren()) {
if (Die.getFirstChild().isValid() &&
Die.getFirstChild().getTag() == DW_TAG_null) {
@@ -299,6 +329,10 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S,
bool hasDIE = DebugInfoData.isValidOffset(Offset);
DWARFUnitVector TypeUnitVector;
DWARFUnitVector CompileUnitVector;
+ /// A map that tracks all references (converted absolute references) so we
+ /// can verify each reference points to a valid DIE and not an offset that
+ /// lies between two valid DIEs.
+ ReferenceMap CrossUnitReferences;
while (hasDIE) {
OffsetStart = Offset;
if (!verifyUnitHeader(DebugInfoData, &Offset, UnitIdx, UnitType,
@@ -309,6 +343,7 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S,
} else {
DWARFUnitHeader Header;
Header.extract(DCtx, DebugInfoData, &OffsetStart, SectionKind);
+ ReferenceMap UnitLocalReferences;
DWARFUnit *Unit;
switch (UnitType) {
case dwarf::DW_UT_type:
@@ -337,7 +372,10 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S,
}
default: { llvm_unreachable("Invalid UnitType."); }
}
- NumDebugInfoErrors += verifyUnitContents(*Unit);
+ NumDebugInfoErrors +=
+ verifyUnitContents(*Unit, UnitLocalReferences, CrossUnitReferences);
+ NumDebugInfoErrors += verifyDebugInfoReferences(
+ UnitLocalReferences, [&](uint64_t Offset) { return Unit; });
}
hasDIE = DebugInfoData.isValidOffset(Offset);
++UnitIdx;
@@ -348,7 +386,14 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S,
}
if (!isHeaderChainValid)
++NumDebugInfoErrors;
- NumDebugInfoErrors += verifyDebugInfoReferences();
+ NumDebugInfoErrors += verifyDebugInfoReferences(
+ CrossUnitReferences, [&](uint64_t Offset) -> DWARFUnit * {
+ if (DWARFUnit *U = TypeUnitVector.getUnitForOffset(Offset))
+ return U;
+ if (DWARFUnit *U = CompileUnitVector.getUnitForOffset(Offset))
+ return U;
+ return nullptr;
+ });
return NumDebugInfoErrors;
}
@@ -383,7 +428,7 @@ unsigned DWARFVerifier::verifyDieRanges(const DWARFDie &Die,
return NumErrors;
}
- DWARFAddressRangesVector Ranges = RangesOrError.get();
+ const DWARFAddressRangesVector &Ranges = RangesOrError.get();
// Build RI for this DIE and check that ranges within this DIE do not
// overlap.
DieRangeInfo RI(Die);
@@ -409,7 +454,7 @@ unsigned DWARFVerifier::verifyDieRanges(const DWARFDie &Die,
if (!IsObjectFile || IsMachOObject || Die.getTag() != DW_TAG_compile_unit) {
bool DumpDieAfterError = false;
- for (auto Range : Ranges) {
+ for (const auto &Range : Ranges) {
if (!Range.valid()) {
++NumErrors;
error() << "Invalid address range " << Range << "\n";
@@ -444,7 +489,7 @@ unsigned DWARFVerifier::verifyDieRanges(const DWARFDie &Die,
}
// Verify that ranges are contained within their parent.
- bool ShouldBeContained = !Ranges.empty() && !ParentRI.Ranges.empty() &&
+ bool ShouldBeContained = !RI.Ranges.empty() && !ParentRI.Ranges.empty() &&
!(Die.getTag() == DW_TAG_subprogram &&
ParentRI.Die.getTag() == DW_TAG_subprogram);
if (ShouldBeContained && !ParentRI.contains(RI)) {
@@ -507,9 +552,10 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die,
DataExtractor Data(toStringRef(Entry.Expr), DCtx.isLittleEndian(), 0);
DWARFExpression Expression(Data, U->getAddressByteSize(),
U->getFormParams().Format);
- bool Error = any_of(Expression, [](DWARFExpression::Operation &Op) {
- return Op.isError();
- });
+ bool Error =
+ any_of(Expression, [](const DWARFExpression::Operation &Op) {
+ return Op.isError();
+ });
if (Error || !Expression.verify(U))
ReportError("DIE contains invalid DWARF expression:");
}
@@ -587,7 +633,9 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die,
}
unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die,
- DWARFAttribute &AttrValue) {
+ DWARFAttribute &AttrValue,
+ ReferenceMap &LocalReferences,
+ ReferenceMap &CrossUnitReferences) {
const DWARFObject &DObj = DCtx.getDWARFObj();
auto DieCU = Die.getDwarfUnit();
unsigned NumErrors = 0;
@@ -615,7 +663,7 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die,
} else {
// Valid reference, but we will verify it points to an actual
// DIE later.
- ReferenceToDIEOffsets[*RefVal].insert(Die.getOffset());
+ LocalReferences[*RefVal].insert(Die.getOffset());
}
}
break;
@@ -634,7 +682,7 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die,
} else {
// Valid reference, but we will verify it points to an actual
// DIE later.
- ReferenceToDIEOffsets[*RefVal].insert(Die.getOffset());
+ CrossUnitReferences[*RefVal].insert(Die.getOffset());
}
}
break;
@@ -694,20 +742,24 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die,
return NumErrors;
}
-unsigned DWARFVerifier::verifyDebugInfoReferences() {
- // Take all references and make sure they point to an actual DIE by
- // getting the DIE by offset and emitting an error
- OS << "Verifying .debug_info references...\n";
+unsigned DWARFVerifier::verifyDebugInfoReferences(
+ const ReferenceMap &References,
+ llvm::function_ref<DWARFUnit *(uint64_t)> GetUnitForOffset) {
+ auto GetDIEForOffset = [&](uint64_t Offset) {
+ if (DWARFUnit *U = GetUnitForOffset(Offset))
+ return U->getDIEForOffset(Offset);
+ return DWARFDie();
+ };
unsigned NumErrors = 0;
for (const std::pair<const uint64_t, std::set<uint64_t>> &Pair :
- ReferenceToDIEOffsets) {
- if (DCtx.getDIEForOffset(Pair.first))
+ References) {
+ if (GetDIEForOffset(Pair.first))
continue;
++NumErrors;
error() << "invalid DIE reference " << format("0x%08" PRIx64, Pair.first)
<< ". Offset is in between DIEs:\n";
for (auto Offset : Pair.second)
- dump(DCtx.getDIEForOffset(Offset)) << '\n';
+ dump(GetDIEForOffset(Offset)) << '\n';
OS << "\n";
}
return NumErrors;
@@ -1349,11 +1401,12 @@ static bool isVariableIndexable(const DWARFDie &Die, DWARFContext &DCtx) {
U->getAddressByteSize());
DWARFExpression Expression(Data, U->getAddressByteSize(),
U->getFormParams().Format);
- bool IsInteresting = any_of(Expression, [](DWARFExpression::Operation &Op) {
- return !Op.isError() && (Op.getCode() == DW_OP_addr ||
- Op.getCode() == DW_OP_form_tls_address ||
- Op.getCode() == DW_OP_GNU_push_tls_address);
- });
+ bool IsInteresting =
+ any_of(Expression, [](const DWARFExpression::Operation &Op) {
+ return !Op.isError() && (Op.getCode() == DW_OP_addr ||
+ Op.getCode() == DW_OP_form_tls_address ||
+ Op.getCode() == DW_OP_GNU_push_tls_address);
+ });
if (IsInteresting)
return true;
}
@@ -1488,7 +1541,7 @@ unsigned DWARFVerifier::verifyDebugNames(const DWARFSection &AccelSection,
if (NumErrors > 0)
return NumErrors;
for (const auto &NI : AccelTable)
- for (DWARFDebugNames::NameTableEntry NTE : NI)
+ for (const DWARFDebugNames::NameTableEntry &NTE : NI)
NumErrors += verifyNameIndexEntries(NI, NTE);
if (NumErrors > 0)
diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
index cdea0e39486d..b2c43b893cd3 100644
--- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
+++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
@@ -260,17 +260,15 @@ static void convertFunctionLineTable(raw_ostream &Log, CUInfo &CUI,
if (!CUI.LineTable->lookupAddressRange(SecAddress, RangeSize, RowVector)) {
// If we have a DW_TAG_subprogram but no line entries, fall back to using
// the DW_AT_decl_file and DW_AT_decl_line if we have both attributes.
- if (auto FileIdx =
- dwarf::toUnsigned(Die.findRecursively({dwarf::DW_AT_decl_file}))) {
- if (auto Line =
- dwarf::toUnsigned(Die.findRecursively({dwarf::DW_AT_decl_line}))) {
- LineEntry LE(StartAddress, CUI.DWARFToGSYMFileIndex(Gsym, *FileIdx),
- *Line);
- FI.OptLineTable = LineTable();
- FI.OptLineTable->push(LE);
- // LE.Addr = EndAddress;
- // FI.OptLineTable->push(LE);
- }
+ std::string FilePath = Die.getDeclFile(
+ DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath);
+ if (FilePath.empty())
+ return;
+ if (auto Line =
+ dwarf::toUnsigned(Die.findRecursively({dwarf::DW_AT_decl_line}))) {
+ LineEntry LE(StartAddress, Gsym.insertFile(FilePath), *Line);
+ FI.OptLineTable = LineTable();
+ FI.OptLineTable->push(LE);
}
return;
}
@@ -394,11 +392,11 @@ void DwarfTransformer::handleDie(raw_ostream &OS, CUInfo &CUI, DWARFDie Die) {
if (Range.LowPC != 0) {
if (!Gsym.isQuiet()) {
// Unexpected invalid address, emit a warning
- Log << "warning: DIE has an address range whose start address is "
- "not in any executable sections ("
- << *Gsym.GetValidTextRanges()
- << ") and will not be processed:\n";
- Die.dump(Log, 0, DIDumpOptions::getForSingleDIE());
+ OS << "warning: DIE has an address range whose start address is "
+ "not in any executable sections ("
+ << *Gsym.GetValidTextRanges()
+ << ") and will not be processed:\n";
+ Die.dump(OS, 0, DIDumpOptions::getForSingleDIE());
}
}
break;
diff --git a/llvm/lib/DebugInfo/GSYM/FileWriter.cpp b/llvm/lib/DebugInfo/GSYM/FileWriter.cpp
index 4b30dcb60a7b..b725f3ac74f5 100644
--- a/llvm/lib/DebugInfo/GSYM/FileWriter.cpp
+++ b/llvm/lib/DebugInfo/GSYM/FileWriter.cpp
@@ -1,9 +1,8 @@
//===- FileWriter.cpp -------------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/DebugInfo/GSYM/Range.cpp b/llvm/lib/DebugInfo/GSYM/Range.cpp
index 044ddb8ba1ba..c1e8eccd0daa 100644
--- a/llvm/lib/DebugInfo/GSYM/Range.cpp
+++ b/llvm/lib/DebugInfo/GSYM/Range.cpp
@@ -1,9 +1,8 @@
//===- Range.cpp ------------------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/DebugInfo/MSF/MappedBlockStream.cpp b/llvm/lib/DebugInfo/MSF/MappedBlockStream.cpp
index 5dc9c86b34fd..00fc70ca5a54 100644
--- a/llvm/lib/DebugInfo/MSF/MappedBlockStream.cpp
+++ b/llvm/lib/DebugInfo/MSF/MappedBlockStream.cpp
@@ -35,7 +35,7 @@ public:
} // end anonymous namespace
-using Interval = std::pair<uint32_t, uint32_t>;
+using Interval = std::pair<uint64_t, uint64_t>;
static Interval intersect(const Interval &I1, const Interval &I2) {
return std::make_pair(std::max(I1.first, I2.first),
@@ -85,7 +85,7 @@ MappedBlockStream::createFpmStream(const MSFLayout &Layout,
return createStream(Layout.SB->BlockSize, SL, MsfData, Allocator);
}
-Error MappedBlockStream::readBytes(uint32_t Offset, uint32_t Size,
+Error MappedBlockStream::readBytes(uint64_t Offset, uint64_t Size,
ArrayRef<uint8_t> &Buffer) {
// Make sure we aren't trying to read beyond the end of the stream.
if (auto EC = checkOffsetForRead(Offset, Size))
@@ -138,7 +138,7 @@ Error MappedBlockStream::readBytes(uint32_t Offset, uint32_t Size,
if (Intersection != RequestExtent)
continue;
- uint32_t CacheRangeOffset =
+ uint64_t CacheRangeOffset =
AbsoluteDifference(CachedExtent.first, Intersection.first);
Buffer = CachedAlloc.slice(CacheRangeOffset, Size);
return Error::success();
@@ -163,14 +163,14 @@ Error MappedBlockStream::readBytes(uint32_t Offset, uint32_t Size,
return Error::success();
}
-Error MappedBlockStream::readLongestContiguousChunk(uint32_t Offset,
+Error MappedBlockStream::readLongestContiguousChunk(uint64_t Offset,
ArrayRef<uint8_t> &Buffer) {
// Make sure we aren't trying to read beyond the end of the stream.
if (auto EC = checkOffsetForRead(Offset, 1))
return EC;
- uint32_t First = Offset / BlockSize;
- uint32_t Last = First;
+ uint64_t First = Offset / BlockSize;
+ uint64_t Last = First;
while (Last < getNumBlocks() - 1) {
if (StreamLayout.Blocks[Last] != StreamLayout.Blocks[Last + 1] - 1)
@@ -178,13 +178,13 @@ Error MappedBlockStream::readLongestContiguousChunk(uint32_t Offset,
++Last;
}
- uint32_t OffsetInFirstBlock = Offset % BlockSize;
- uint32_t BytesFromFirstBlock = BlockSize - OffsetInFirstBlock;
- uint32_t BlockSpan = Last - First + 1;
- uint32_t ByteSpan = BytesFromFirstBlock + (BlockSpan - 1) * BlockSize;
+ uint64_t OffsetInFirstBlock = Offset % BlockSize;
+ uint64_t BytesFromFirstBlock = BlockSize - OffsetInFirstBlock;
+ uint64_t BlockSpan = Last - First + 1;
+ uint64_t ByteSpan = BytesFromFirstBlock + (BlockSpan - 1) * BlockSize;
ArrayRef<uint8_t> BlockData;
- uint32_t MsfOffset = blockToOffset(StreamLayout.Blocks[First], BlockSize);
+ uint64_t MsfOffset = blockToOffset(StreamLayout.Blocks[First], BlockSize);
if (auto EC = MsfData.readBytes(MsfOffset, BlockSize, BlockData))
return EC;
@@ -193,9 +193,9 @@ Error MappedBlockStream::readLongestContiguousChunk(uint32_t Offset,
return Error::success();
}
-uint32_t MappedBlockStream::getLength() { return StreamLayout.Length; }
+uint64_t MappedBlockStream::getLength() { return StreamLayout.Length; }
-bool MappedBlockStream::tryReadContiguously(uint32_t Offset, uint32_t Size,
+bool MappedBlockStream::tryReadContiguously(uint64_t Offset, uint64_t Size,
ArrayRef<uint8_t> &Buffer) {
if (Size == 0) {
Buffer = ArrayRef<uint8_t>();
@@ -206,15 +206,15 @@ bool MappedBlockStream::tryReadContiguously(uint32_t Offset, uint32_t Size,
// all subsequent blocks are contiguous. For example, a 10k read with a 4k
// block size can be filled with a reference if, from the starting offset,
// 3 blocks in a row are contiguous.
- uint32_t BlockNum = Offset / BlockSize;
- uint32_t OffsetInBlock = Offset % BlockSize;
- uint32_t BytesFromFirstBlock = std::min(Size, BlockSize - OffsetInBlock);
- uint32_t NumAdditionalBlocks =
+ uint64_t BlockNum = Offset / BlockSize;
+ uint64_t OffsetInBlock = Offset % BlockSize;
+ uint64_t BytesFromFirstBlock = std::min(Size, BlockSize - OffsetInBlock);
+ uint64_t NumAdditionalBlocks =
alignTo(Size - BytesFromFirstBlock, BlockSize) / BlockSize;
- uint32_t RequiredContiguousBlocks = NumAdditionalBlocks + 1;
- uint32_t E = StreamLayout.Blocks[BlockNum];
- for (uint32_t I = 0; I < RequiredContiguousBlocks; ++I, ++E) {
+ uint64_t RequiredContiguousBlocks = NumAdditionalBlocks + 1;
+ uint64_t E = StreamLayout.Blocks[BlockNum];
+ for (uint64_t I = 0; I < RequiredContiguousBlocks; ++I, ++E) {
if (StreamLayout.Blocks[I + BlockNum] != E)
return false;
}
@@ -225,8 +225,8 @@ bool MappedBlockStream::tryReadContiguously(uint32_t Offset, uint32_t Size,
// cross-block span, explicitly resize the ArrayRef to cover the entire
// request length.
ArrayRef<uint8_t> BlockData;
- uint32_t FirstBlockAddr = StreamLayout.Blocks[BlockNum];
- uint32_t MsfOffset = blockToOffset(FirstBlockAddr, BlockSize);
+ uint64_t FirstBlockAddr = StreamLayout.Blocks[BlockNum];
+ uint64_t MsfOffset = blockToOffset(FirstBlockAddr, BlockSize);
if (auto EC = MsfData.readBytes(MsfOffset, BlockSize, BlockData)) {
consumeError(std::move(EC));
return false;
@@ -236,28 +236,28 @@ bool MappedBlockStream::tryReadContiguously(uint32_t Offset, uint32_t Size,
return true;
}
-Error MappedBlockStream::readBytes(uint32_t Offset,
+Error MappedBlockStream::readBytes(uint64_t Offset,
MutableArrayRef<uint8_t> Buffer) {
- uint32_t BlockNum = Offset / BlockSize;
- uint32_t OffsetInBlock = Offset % BlockSize;
+ uint64_t BlockNum = Offset / BlockSize;
+ uint64_t OffsetInBlock = Offset % BlockSize;
// Make sure we aren't trying to read beyond the end of the stream.
if (auto EC = checkOffsetForRead(Offset, Buffer.size()))
return EC;
- uint32_t BytesLeft = Buffer.size();
- uint32_t BytesWritten = 0;
+ uint64_t BytesLeft = Buffer.size();
+ uint64_t BytesWritten = 0;
uint8_t *WriteBuffer = Buffer.data();
while (BytesLeft > 0) {
- uint32_t StreamBlockAddr = StreamLayout.Blocks[BlockNum];
+ uint64_t StreamBlockAddr = StreamLayout.Blocks[BlockNum];
ArrayRef<uint8_t> BlockData;
- uint32_t Offset = blockToOffset(StreamBlockAddr, BlockSize);
+ uint64_t Offset = blockToOffset(StreamBlockAddr, BlockSize);
if (auto EC = MsfData.readBytes(Offset, BlockSize, BlockData))
return EC;
const uint8_t *ChunkStart = BlockData.data() + OffsetInBlock;
- uint32_t BytesInChunk = std::min(BytesLeft, BlockSize - OffsetInBlock);
+ uint64_t BytesInChunk = std::min(BytesLeft, BlockSize - OffsetInBlock);
::memcpy(WriteBuffer + BytesWritten, ChunkStart, BytesInChunk);
BytesWritten += BytesInChunk;
@@ -271,7 +271,7 @@ Error MappedBlockStream::readBytes(uint32_t Offset,
void MappedBlockStream::invalidateCache() { CacheMap.shrink_and_clear(); }
-void MappedBlockStream::fixCacheAfterWrite(uint32_t Offset,
+void MappedBlockStream::fixCacheAfterWrite(uint64_t Offset,
ArrayRef<uint8_t> Data) const {
// If this write overlapped a read which previously came from the pool,
// someone may still be holding a pointer to that alloc which is now invalid.
@@ -297,10 +297,10 @@ void MappedBlockStream::fixCacheAfterWrite(uint32_t Offset,
auto Intersection = intersect(WriteInterval, CachedInterval);
assert(Intersection.first <= Intersection.second);
- uint32_t Length = Intersection.second - Intersection.first;
- uint32_t SrcOffset =
+ uint64_t Length = Intersection.second - Intersection.first;
+ uint64_t SrcOffset =
AbsoluteDifference(WriteInterval.first, Intersection.first);
- uint32_t DestOffset =
+ uint64_t DestOffset =
AbsoluteDifference(CachedInterval.first, Intersection.first);
::memcpy(Alloc.data() + DestOffset, Data.data() + SrcOffset, Length);
}
@@ -370,39 +370,39 @@ WritableMappedBlockStream::createFpmStream(const MSFLayout &Layout,
return createStream(Layout.SB->BlockSize, MinLayout, MsfData, Allocator);
}
-Error WritableMappedBlockStream::readBytes(uint32_t Offset, uint32_t Size,
+Error WritableMappedBlockStream::readBytes(uint64_t Offset, uint64_t Size,
ArrayRef<uint8_t> &Buffer) {
return ReadInterface.readBytes(Offset, Size, Buffer);
}
Error WritableMappedBlockStream::readLongestContiguousChunk(
- uint32_t Offset, ArrayRef<uint8_t> &Buffer) {
+ uint64_t Offset, ArrayRef<uint8_t> &Buffer) {
return ReadInterface.readLongestContiguousChunk(Offset, Buffer);
}
-uint32_t WritableMappedBlockStream::getLength() {
+uint64_t WritableMappedBlockStream::getLength() {
return ReadInterface.getLength();
}
-Error WritableMappedBlockStream::writeBytes(uint32_t Offset,
+Error WritableMappedBlockStream::writeBytes(uint64_t Offset,
ArrayRef<uint8_t> Buffer) {
// Make sure we aren't trying to write beyond the end of the stream.
if (auto EC = checkOffsetForWrite(Offset, Buffer.size()))
return EC;
- uint32_t BlockNum = Offset / getBlockSize();
- uint32_t OffsetInBlock = Offset % getBlockSize();
+ uint64_t BlockNum = Offset / getBlockSize();
+ uint64_t OffsetInBlock = Offset % getBlockSize();
- uint32_t BytesLeft = Buffer.size();
- uint32_t BytesWritten = 0;
+ uint64_t BytesLeft = Buffer.size();
+ uint64_t BytesWritten = 0;
while (BytesLeft > 0) {
- uint32_t StreamBlockAddr = getStreamLayout().Blocks[BlockNum];
- uint32_t BytesToWriteInChunk =
+ uint64_t StreamBlockAddr = getStreamLayout().Blocks[BlockNum];
+ uint64_t BytesToWriteInChunk =
std::min(BytesLeft, getBlockSize() - OffsetInBlock);
const uint8_t *Chunk = Buffer.data() + BytesWritten;
ArrayRef<uint8_t> ChunkData(Chunk, BytesToWriteInChunk);
- uint32_t MsfOffset = blockToOffset(StreamBlockAddr, getBlockSize());
+ uint64_t MsfOffset = blockToOffset(StreamBlockAddr, getBlockSize());
MsfOffset += OffsetInBlock;
if (auto EC = WriteInterface.writeBytes(MsfOffset, ChunkData))
return EC;
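
These widenings all feed the same block arithmetic, which must no longer truncate once stream offsets pass 4 GiB; a standalone sketch of that index math (plain C++, the struct and function names are made up):

    #include <cstdint>

    struct BlockAddr {
      uint64_t BlockNum;       // index into StreamLayout.Blocks
      uint64_t OffsetInBlock;  // byte offset within that block
    };

    // Split a 64-bit stream offset into (block index, offset in block); with
    // uint32_t this silently wrapped for offsets at or beyond 2^32.
    static BlockAddr splitOffset(uint64_t Offset, uint64_t BlockSize) {
      return {Offset / BlockSize, Offset % BlockSize};
    }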
diff --git a/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
index 98a8acaffd60..0584966a98c5 100644
--- a/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
@@ -334,8 +334,6 @@ static uint16_t toSecMapFlags(uint32_t Flags) {
Ret |= static_cast<uint16_t>(OMFSegDescFlags::Write);
if (Flags & COFF::IMAGE_SCN_MEM_EXECUTE)
Ret |= static_cast<uint16_t>(OMFSegDescFlags::Execute);
- if (Flags & COFF::IMAGE_SCN_MEM_EXECUTE)
- Ret |= static_cast<uint16_t>(OMFSegDescFlags::Execute);
if (!(Flags & COFF::IMAGE_SCN_MEM_16BIT))
Ret |= static_cast<uint16_t>(OMFSegDescFlags::AddressIs32Bit);
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp
index 7a258acbd7c0..5e6412275063 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp
@@ -17,8 +17,8 @@ namespace pdb {
namespace {
-Expected<std::string> readStreamData(BinaryStream &Stream, uint32_t Limit) {
- uint32_t Offset = 0, DataLength = std::min(Limit, Stream.getLength());
+Expected<std::string> readStreamData(BinaryStream &Stream, uint64_t Limit) {
+ uint64_t Offset = 0, DataLength = std::min(Limit, Stream.getLength());
std::string Result;
Result.reserve(DataLength);
while (Offset < DataLength) {
diff --git a/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
index a508f163a2d8..f33125474e3a 100644
--- a/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
@@ -103,7 +103,7 @@ void PDBFileBuilder::addInjectedSource(StringRef Name,
// table and the hash value is dependent on the exact contents of the string.
// link.exe lowercases a path and converts / to \, so we must do the same.
SmallString<64> VName;
- sys::path::native(Name.lower(), VName);
+ sys::path::native(Name.lower(), VName, sys::path::Style::windows_backslash);
uint32_t NI = getStringTableBuilder().insert(Name);
uint32_t VNI = getStringTableBuilder().insert(VName);
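
A hedged illustration of why the explicit style matters: without it the separator conversion depends on the host, while link.exe's injected-source hashing always expects lowercase, backslash-separated paths (the path literal below is illustrative):

    SmallString<64> VName;
    sys::path::native(StringRef("Foo/Bar.cpp").lower(), VName,
                      sys::path::Style::windows_backslash);
    // VName == "foo\bar.cpp" on every host, matching the hashing convention.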
diff --git a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
index 72ca72230507..f3f09584fdc9 100644
--- a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
@@ -280,10 +280,7 @@ bool getGNUDebuglinkContents(const ObjectFile *Obj, std::string &DebugName,
return false;
for (const SectionRef &Section : Obj->sections()) {
StringRef Name;
- if (Expected<StringRef> NameOrErr = Section.getName())
- Name = *NameOrErr;
- else
- consumeError(NameOrErr.takeError());
+ consumeError(Section.getName().moveInto(Name));
Name = Name.substr(Name.find_first_not_of("._"));
if (Name == "gnu_debuglink") {
@@ -600,7 +597,9 @@ LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) {
}
}
if (!Context)
- Context = DWARFContext::create(*Objects.second, nullptr, Opts.DWPName);
+ Context = DWARFContext::create(
+ *Objects.second, DWARFContext::ProcessDebugRelocations::Process,
+ nullptr, Opts.DWPName);
return createModuleInfo(Objects.first, std::move(Context), ModuleName);
}
@@ -650,18 +649,9 @@ StringRef demanglePE32ExternCFunc(StringRef SymbolName) {
std::string
LLVMSymbolizer::DemangleName(const std::string &Name,
const SymbolizableModule *DbiModuleDescriptor) {
- // We can spoil names of symbols with C linkage, so use an heuristic
- // approach to check if the name should be demangled.
- if (Name.substr(0, 2) == "_Z") {
- int status = 0;
- char *DemangledName =
- itaniumDemangle(Name.c_str(), nullptr, nullptr, &status);
- if (status != 0)
- return Name;
- std::string Result = DemangledName;
- free(DemangledName);
+ std::string Result;
+ if (nonMicrosoftDemangle(Name.c_str(), Result))
return Result;
- }
if (!Name.empty() && Name.front() == '?') {
// Only do MSVC C++ demangling on symbols starting with '?'.
@@ -672,7 +662,7 @@ LLVMSymbolizer::DemangleName(const std::string &Name,
MSDF_NoMemberType | MSDF_NoReturnType));
if (status != 0)
return Name;
- std::string Result = DemangledName;
+ Result = DemangledName;
free(DemangledName);
return Result;
}
diff --git a/llvm/lib/Demangle/DLangDemangle.cpp b/llvm/lib/Demangle/DLangDemangle.cpp
new file mode 100644
index 000000000000..d2f1bf4323ee
--- /dev/null
+++ b/llvm/lib/Demangle/DLangDemangle.cpp
@@ -0,0 +1,45 @@
+//===--- DLangDemangle.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines a demangler for the D programming language as specified
+/// in the ABI specification, available at:
+/// https://dlang.org/spec/abi.html#name_mangling
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Demangle/Demangle.h"
+#include "llvm/Demangle/Utility.h"
+
+#include <cstring>
+
+using namespace llvm;
+using llvm::itanium_demangle::OutputBuffer;
+
+char *llvm::dlangDemangle(const char *MangledName) {
+ if (MangledName == nullptr || strncmp(MangledName, "_D", 2) != 0)
+ return nullptr;
+
+ OutputBuffer Demangled;
+ if (!initializeOutputBuffer(nullptr, nullptr, Demangled, 1024))
+ return nullptr;
+
+ if (strcmp(MangledName, "_Dmain") == 0)
+ Demangled << "D main";
+
+ // OutputBuffer's internal buffer is not null-terminated, so append a
+ // terminator here to return a valid C string.
+ if (Demangled.getCurrentPosition() > 0) {
+ Demangled << '\0';
+ Demangled.setCurrentPosition(Demangled.getCurrentPosition() - 1);
+ return Demangled.getBuffer();
+ }
+
+ free(Demangled.getBuffer());
+ return nullptr;
+}
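
A hedged usage sketch of the new entry point; the demangler heap-allocates the result, so the caller frees it:

    #include "llvm/Demangle/Demangle.h"
    #include <cstdio>
    #include <cstdlib>

    int main() {
      if (char *Demangled = llvm::dlangDemangle("_Dmain")) {
        std::printf("%s\n", Demangled); // prints "D main"
        std::free(Demangled);
      }
    }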
diff --git a/llvm/lib/Demangle/Demangle.cpp b/llvm/lib/Demangle/Demangle.cpp
index 1851fb77b09e..13aa2864c183 100644
--- a/llvm/lib/Demangle/Demangle.cpp
+++ b/llvm/lib/Demangle/Demangle.cpp
@@ -12,32 +12,53 @@
#include "llvm/Demangle/Demangle.h"
#include <cstdlib>
+#include <cstring>
-static bool isItaniumEncoding(const std::string &MangledName) {
- size_t Pos = MangledName.find_first_not_of('_');
- // A valid Itanium encoding requires 1-4 leading underscores, followed by 'Z'.
- return Pos > 0 && Pos <= 4 && MangledName[Pos] == 'Z';
+static bool isItaniumEncoding(const char *S) {
+ // Itanium encoding requires 1 or 3 leading underscores, followed by 'Z'.
+ return std::strncmp(S, "_Z", 2) == 0 || std::strncmp(S, "___Z", 4) == 0;
}
-static bool isRustEncoding(const std::string &MangledName) {
+static bool isRustEncoding(const char *S) { return S[0] == '_' && S[1] == 'R'; }
+
+static bool isDLangEncoding(const std::string &MangledName) {
return MangledName.size() >= 2 && MangledName[0] == '_' &&
- MangledName[1] == 'R';
+ MangledName[1] == 'D';
}
std::string llvm::demangle(const std::string &MangledName) {
- char *Demangled;
+ std::string Result;
+ const char *S = MangledName.c_str();
+
+ if (nonMicrosoftDemangle(S, Result))
+ return Result;
+
+ if (S[0] == '_' && nonMicrosoftDemangle(S + 1, Result))
+ return Result;
+
+ if (char *Demangled =
+ microsoftDemangle(S, nullptr, nullptr, nullptr, nullptr)) {
+ Result = Demangled;
+ std::free(Demangled);
+ return Result;
+ }
+
+ return MangledName;
+}
+
+bool llvm::nonMicrosoftDemangle(const char *MangledName, std::string &Result) {
+ char *Demangled = nullptr;
if (isItaniumEncoding(MangledName))
- Demangled = itaniumDemangle(MangledName.c_str(), nullptr, nullptr, nullptr);
+ Demangled = itaniumDemangle(MangledName, nullptr, nullptr, nullptr);
else if (isRustEncoding(MangledName))
- Demangled = rustDemangle(MangledName.c_str(), nullptr, nullptr, nullptr);
- else
- Demangled = microsoftDemangle(MangledName.c_str(), nullptr, nullptr,
- nullptr, nullptr);
+ Demangled = rustDemangle(MangledName, nullptr, nullptr, nullptr);
+ else if (isDLangEncoding(MangledName))
+ Demangled = dlangDemangle(MangledName);
if (!Demangled)
- return MangledName;
+ return false;
- std::string Ret = Demangled;
+ Result = Demangled;
std::free(Demangled);
- return Ret;
+ return true;
}
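
With this change llvm::demangle tries the non-Microsoft demanglers first (Itanium, Rust, D), retries once with a leading underscore stripped, and only then falls back to the Microsoft demangler; a short usage sketch with illustrative symbol strings:

    #include "llvm/Demangle/Demangle.h"
    #include <iostream>

    int main() {
      std::cout << llvm::demangle("_Z3foov") << '\n';      // foo()
      std::cout << llvm::demangle("_Dmain") << '\n';       // D main
      std::cout << llvm::demangle("not_mangled") << '\n';  // returned unchanged
    }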
diff --git a/llvm/lib/Demangle/ItaniumDemangle.cpp b/llvm/lib/Demangle/ItaniumDemangle.cpp
index fad9b6b7b63b..3f68f76761ce 100644
--- a/llvm/lib/Demangle/ItaniumDemangle.cpp
+++ b/llvm/lib/Demangle/ItaniumDemangle.cpp
@@ -333,21 +333,21 @@ char *llvm::itaniumDemangle(const char *MangledName, char *Buf,
int InternalStatus = demangle_success;
Demangler Parser(MangledName, MangledName + std::strlen(MangledName));
- OutputStream S;
+ OutputBuffer OB;
Node *AST = Parser.parse();
if (AST == nullptr)
InternalStatus = demangle_invalid_mangled_name;
- else if (!initializeOutputStream(Buf, N, S, 1024))
+ else if (!initializeOutputBuffer(Buf, N, OB, 1024))
InternalStatus = demangle_memory_alloc_failure;
else {
assert(Parser.ForwardTemplateRefs.empty());
- AST->print(S);
- S += '\0';
+ AST->print(OB);
+ OB += '\0';
if (N != nullptr)
- *N = S.getCurrentPosition();
- Buf = S.getBuffer();
+ *N = OB.getCurrentPosition();
+ Buf = OB.getBuffer();
}
if (Status)
@@ -385,14 +385,14 @@ bool ItaniumPartialDemangler::partialDemangle(const char *MangledName) {
}
static char *printNode(const Node *RootNode, char *Buf, size_t *N) {
- OutputStream S;
- if (!initializeOutputStream(Buf, N, S, 128))
+ OutputBuffer OB;
+ if (!initializeOutputBuffer(Buf, N, OB, 128))
return nullptr;
- RootNode->print(S);
- S += '\0';
+ RootNode->print(OB);
+ OB += '\0';
if (N != nullptr)
- *N = S.getCurrentPosition();
- return S.getBuffer();
+ *N = OB.getCurrentPosition();
+ return OB.getBuffer();
}
char *ItaniumPartialDemangler::getFunctionBaseName(char *Buf, size_t *N) const {
@@ -430,8 +430,8 @@ char *ItaniumPartialDemangler::getFunctionDeclContextName(char *Buf,
return nullptr;
const Node *Name = static_cast<const FunctionEncoding *>(RootNode)->getName();
- OutputStream S;
- if (!initializeOutputStream(Buf, N, S, 128))
+ OutputBuffer OB;
+ if (!initializeOutputBuffer(Buf, N, OB, 128))
return nullptr;
KeepGoingLocalFunction:
@@ -449,25 +449,25 @@ char *ItaniumPartialDemangler::getFunctionDeclContextName(char *Buf,
switch (Name->getKind()) {
case Node::KStdQualifiedName:
- S += "std";
+ OB += "std";
break;
case Node::KNestedName:
- static_cast<const NestedName *>(Name)->Qual->print(S);
+ static_cast<const NestedName *>(Name)->Qual->print(OB);
break;
case Node::KLocalName: {
auto *LN = static_cast<const LocalName *>(Name);
- LN->Encoding->print(S);
- S += "::";
+ LN->Encoding->print(OB);
+ OB += "::";
Name = LN->Entity;
goto KeepGoingLocalFunction;
}
default:
break;
}
- S += '\0';
+ OB += '\0';
if (N != nullptr)
- *N = S.getCurrentPosition();
- return S.getBuffer();
+ *N = OB.getCurrentPosition();
+ return OB.getBuffer();
}
char *ItaniumPartialDemangler::getFunctionName(char *Buf, size_t *N) const {
@@ -483,17 +483,17 @@ char *ItaniumPartialDemangler::getFunctionParameters(char *Buf,
return nullptr;
NodeArray Params = static_cast<FunctionEncoding *>(RootNode)->getParams();
- OutputStream S;
- if (!initializeOutputStream(Buf, N, S, 128))
+ OutputBuffer OB;
+ if (!initializeOutputBuffer(Buf, N, OB, 128))
return nullptr;
- S += '(';
- Params.printWithComma(S);
- S += ')';
- S += '\0';
+ OB += '(';
+ Params.printWithComma(OB);
+ OB += ')';
+ OB += '\0';
if (N != nullptr)
- *N = S.getCurrentPosition();
- return S.getBuffer();
+ *N = OB.getCurrentPosition();
+ return OB.getBuffer();
}
char *ItaniumPartialDemangler::getFunctionReturnType(
@@ -501,18 +501,18 @@ char *ItaniumPartialDemangler::getFunctionReturnType(
if (!isFunction())
return nullptr;
- OutputStream S;
- if (!initializeOutputStream(Buf, N, S, 128))
+ OutputBuffer OB;
+ if (!initializeOutputBuffer(Buf, N, OB, 128))
return nullptr;
if (const Node *Ret =
static_cast<const FunctionEncoding *>(RootNode)->getReturnType())
- Ret->print(S);
+ Ret->print(OB);
- S += '\0';
+ OB += '\0';
if (N != nullptr)
- *N = S.getCurrentPosition();
- return S.getBuffer();
+ *N = OB.getCurrentPosition();
+ return OB.getBuffer();
}
char *ItaniumPartialDemangler::finishDemangle(char *Buf, size_t *N) const {
diff --git a/llvm/lib/Demangle/MicrosoftDemangle.cpp b/llvm/lib/Demangle/MicrosoftDemangle.cpp
index 303207176be7..d8da3b48e25b 100644
--- a/llvm/lib/Demangle/MicrosoftDemangle.cpp
+++ b/llvm/lib/Demangle/MicrosoftDemangle.cpp
@@ -965,13 +965,13 @@ NamedIdentifierNode *Demangler::demangleBackRefName(StringView &MangledName) {
void Demangler::memorizeIdentifier(IdentifierNode *Identifier) {
// Render this class template name into a string buffer so that we can
// memorize it for the purpose of back-referencing.
- OutputStream OS;
- if (!initializeOutputStream(nullptr, nullptr, OS, 1024))
+ OutputBuffer OB;
+ if (!initializeOutputBuffer(nullptr, nullptr, OB, 1024))
// FIXME: Propagate out-of-memory as an error?
std::terminate();
- Identifier->output(OS, OF_Default);
- OS << '\0';
- char *Name = OS.getBuffer();
+ Identifier->output(OB, OF_Default);
+ OB << '\0';
+ char *Name = OB.getBuffer();
StringView Owned = copyString(Name);
memorizeString(Owned);
@@ -1107,7 +1107,7 @@ static void writeHexDigit(char *Buffer, uint8_t Digit) {
*Buffer = (Digit < 10) ? ('0' + Digit) : ('A' + Digit - 10);
}
-static void outputHex(OutputStream &OS, unsigned C) {
+static void outputHex(OutputBuffer &OB, unsigned C) {
assert (C != 0);
// It's easier to do the math if we can work from right to left, but we need
@@ -1130,43 +1130,43 @@ static void outputHex(OutputStream &OS, unsigned C) {
TempBuffer[Pos--] = 'x';
assert(Pos >= 0);
TempBuffer[Pos--] = '\\';
- OS << StringView(&TempBuffer[Pos + 1]);
+ OB << StringView(&TempBuffer[Pos + 1]);
}
-static void outputEscapedChar(OutputStream &OS, unsigned C) {
+static void outputEscapedChar(OutputBuffer &OB, unsigned C) {
switch (C) {
case '\0': // nul
- OS << "\\0";
+ OB << "\\0";
return;
case '\'': // single quote
- OS << "\\\'";
+ OB << "\\\'";
return;
case '\"': // double quote
- OS << "\\\"";
+ OB << "\\\"";
return;
case '\\': // backslash
- OS << "\\\\";
+ OB << "\\\\";
return;
case '\a': // bell
- OS << "\\a";
+ OB << "\\a";
return;
case '\b': // backspace
- OS << "\\b";
+ OB << "\\b";
return;
case '\f': // form feed
- OS << "\\f";
+ OB << "\\f";
return;
case '\n': // new line
- OS << "\\n";
+ OB << "\\n";
return;
case '\r': // carriage return
- OS << "\\r";
+ OB << "\\r";
return;
case '\t': // tab
- OS << "\\t";
+ OB << "\\t";
return;
case '\v': // vertical tab
- OS << "\\v";
+ OB << "\\v";
return;
default:
break;
@@ -1174,11 +1174,11 @@ static void outputEscapedChar(OutputStream &OS, unsigned C) {
if (C > 0x1F && C < 0x7F) {
// Standard ascii char.
- OS << (char)C;
+ OB << (char)C;
return;
}
- outputHex(OS, C);
+ outputHex(OB, C);
}
static unsigned countTrailingNullBytes(const uint8_t *StringBytes, int Length) {
@@ -1273,7 +1273,7 @@ FunctionSymbolNode *Demangler::demangleVcallThunkNode(StringView &MangledName) {
EncodedStringLiteralNode *
Demangler::demangleStringLiteral(StringView &MangledName) {
// This function uses goto, so declare all variables up front.
- OutputStream OS;
+ OutputBuffer OB;
StringView CRC;
uint64_t StringByteSize;
bool IsWcharT = false;
@@ -1284,7 +1284,7 @@ Demangler::demangleStringLiteral(StringView &MangledName) {
EncodedStringLiteralNode *Result = Arena.alloc<EncodedStringLiteralNode>();
// Must happen before the first `goto StringLiteralError`.
- if (!initializeOutputStream(nullptr, nullptr, OS, 1024))
+ if (!initializeOutputBuffer(nullptr, nullptr, OB, 1024))
// FIXME: Propagate out-of-memory as an error?
std::terminate();
@@ -1329,7 +1329,7 @@ Demangler::demangleStringLiteral(StringView &MangledName) {
goto StringLiteralError;
wchar_t W = demangleWcharLiteral(MangledName);
if (StringByteSize != 2 || Result->IsTruncated)
- outputEscapedChar(OS, W);
+ outputEscapedChar(OB, W);
StringByteSize -= 2;
if (Error)
goto StringLiteralError;
@@ -1371,19 +1371,19 @@ Demangler::demangleStringLiteral(StringView &MangledName) {
unsigned NextChar =
decodeMultiByteChar(StringBytes, CharIndex, CharBytes);
if (CharIndex + 1 < NumChars || Result->IsTruncated)
- outputEscapedChar(OS, NextChar);
+ outputEscapedChar(OB, NextChar);
}
}
- OS << '\0';
- ResultBuffer = OS.getBuffer();
+ OB << '\0';
+ ResultBuffer = OB.getBuffer();
Result->DecodedString = copyString(ResultBuffer);
std::free(ResultBuffer);
return Result;
StringLiteralError:
Error = true;
- std::free(OS.getBuffer());
+ std::free(OB.getBuffer());
return nullptr;
}
@@ -1447,16 +1447,16 @@ Demangler::demangleLocallyScopedNamePiece(StringView &MangledName) {
return nullptr;
// Render the parent symbol's name into a buffer.
- OutputStream OS;
- if (!initializeOutputStream(nullptr, nullptr, OS, 1024))
+ OutputBuffer OB;
+ if (!initializeOutputBuffer(nullptr, nullptr, OB, 1024))
// FIXME: Propagate out-of-memory as an error?
std::terminate();
- OS << '`';
- Scope->output(OS, OF_Default);
- OS << '\'';
- OS << "::`" << Number << "'";
- OS << '\0';
- char *Result = OS.getBuffer();
+ OB << '`';
+ Scope->output(OB, OF_Default);
+ OB << '\'';
+ OB << "::`" << Number << "'";
+ OB << '\0';
+ char *Result = OB.getBuffer();
Identifier->Name = copyString(Result);
std::free(Result);
return Identifier;
@@ -2313,19 +2313,19 @@ void Demangler::dumpBackReferences() {
(int)Backrefs.FunctionParamCount);
// Create an output stream so we can render each type.
- OutputStream OS;
- if (!initializeOutputStream(nullptr, nullptr, OS, 1024))
+ OutputBuffer OB;
+ if (!initializeOutputBuffer(nullptr, nullptr, OB, 1024))
std::terminate();
for (size_t I = 0; I < Backrefs.FunctionParamCount; ++I) {
- OS.setCurrentPosition(0);
+ OB.setCurrentPosition(0);
TypeNode *T = Backrefs.FunctionParams[I];
- T->output(OS, OF_Default);
+ T->output(OB, OF_Default);
- std::printf(" [%d] - %.*s\n", (int)I, (int)OS.getCurrentPosition(),
- OS.getBuffer());
+ std::printf(" [%d] - %.*s\n", (int)I, (int)OB.getCurrentPosition(),
+ OB.getBuffer());
}
- std::free(OS.getBuffer());
+ std::free(OB.getBuffer());
if (Backrefs.FunctionParamCount > 0)
std::printf("\n");
@@ -2342,7 +2342,7 @@ char *llvm::microsoftDemangle(const char *MangledName, size_t *NMangled,
char *Buf, size_t *N,
int *Status, MSDemangleFlags Flags) {
Demangler D;
- OutputStream S;
+ OutputBuffer OB;
StringView Name{MangledName};
SymbolNode *AST = D.parse(Name);
@@ -2361,18 +2361,20 @@ char *llvm::microsoftDemangle(const char *MangledName, size_t *NMangled,
OF = OutputFlags(OF | OF_NoReturnType);
if (Flags & MSDF_NoMemberType)
OF = OutputFlags(OF | OF_NoMemberType);
+ if (Flags & MSDF_NoVariableType)
+ OF = OutputFlags(OF | OF_NoVariableType);
int InternalStatus = demangle_success;
if (D.Error)
InternalStatus = demangle_invalid_mangled_name;
- else if (!initializeOutputStream(Buf, N, S, 1024))
+ else if (!initializeOutputBuffer(Buf, N, OB, 1024))
InternalStatus = demangle_memory_alloc_failure;
else {
- AST->output(S, OF);
- S += '\0';
+ AST->output(OB, OF);
+ OB += '\0';
if (N != nullptr)
- *N = S.getCurrentPosition();
- Buf = S.getBuffer();
+ *N = OB.getCurrentPosition();
+ Buf = OB.getBuffer();
}
if (Status)
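
A hedged sketch of the new MSDF_NoVariableType flag: with it set, a variable symbol is printed without its type (the mangled name and expected output are assumptions, not taken from the patch):

    int Status = 0;
    char *D = llvm::microsoftDemangle("?x@@3HA", nullptr, nullptr, nullptr,
                                      &Status, llvm::MSDF_NoVariableType);
    // Without the flag this demangles to "int x"; with it, just "x".
    std::free(D);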
diff --git a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp
index 9fe157bf0d2a..32d8dff66c3f 100644
--- a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp
+++ b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp
@@ -21,97 +21,97 @@ using namespace ms_demangle;
#define OUTPUT_ENUM_CLASS_VALUE(Enum, Value, Desc) \
case Enum::Value: \
- OS << Desc; \
+ OB << Desc; \
break;
// Writes a space if the last token does not end with a punctuation.
-static void outputSpaceIfNecessary(OutputStream &OS) {
- if (OS.empty())
+static void outputSpaceIfNecessary(OutputBuffer &OB) {
+ if (OB.empty())
return;
- char C = OS.back();
+ char C = OB.back();
if (std::isalnum(C) || C == '>')
- OS << " ";
+ OB << " ";
}
-static void outputSingleQualifier(OutputStream &OS, Qualifiers Q) {
+static void outputSingleQualifier(OutputBuffer &OB, Qualifiers Q) {
switch (Q) {
case Q_Const:
- OS << "const";
+ OB << "const";
break;
case Q_Volatile:
- OS << "volatile";
+ OB << "volatile";
break;
case Q_Restrict:
- OS << "__restrict";
+ OB << "__restrict";
break;
default:
break;
}
}
-static bool outputQualifierIfPresent(OutputStream &OS, Qualifiers Q,
+static bool outputQualifierIfPresent(OutputBuffer &OB, Qualifiers Q,
Qualifiers Mask, bool NeedSpace) {
if (!(Q & Mask))
return NeedSpace;
if (NeedSpace)
- OS << " ";
+ OB << " ";
- outputSingleQualifier(OS, Mask);
+ outputSingleQualifier(OB, Mask);
return true;
}
-static void outputQualifiers(OutputStream &OS, Qualifiers Q, bool SpaceBefore,
+static void outputQualifiers(OutputBuffer &OB, Qualifiers Q, bool SpaceBefore,
bool SpaceAfter) {
if (Q == Q_None)
return;
- size_t Pos1 = OS.getCurrentPosition();
- SpaceBefore = outputQualifierIfPresent(OS, Q, Q_Const, SpaceBefore);
- SpaceBefore = outputQualifierIfPresent(OS, Q, Q_Volatile, SpaceBefore);
- SpaceBefore = outputQualifierIfPresent(OS, Q, Q_Restrict, SpaceBefore);
- size_t Pos2 = OS.getCurrentPosition();
+ size_t Pos1 = OB.getCurrentPosition();
+ SpaceBefore = outputQualifierIfPresent(OB, Q, Q_Const, SpaceBefore);
+ SpaceBefore = outputQualifierIfPresent(OB, Q, Q_Volatile, SpaceBefore);
+ SpaceBefore = outputQualifierIfPresent(OB, Q, Q_Restrict, SpaceBefore);
+ size_t Pos2 = OB.getCurrentPosition();
if (SpaceAfter && Pos2 > Pos1)
- OS << " ";
+ OB << " ";
}
-static void outputCallingConvention(OutputStream &OS, CallingConv CC) {
- outputSpaceIfNecessary(OS);
+static void outputCallingConvention(OutputBuffer &OB, CallingConv CC) {
+ outputSpaceIfNecessary(OB);
switch (CC) {
case CallingConv::Cdecl:
- OS << "__cdecl";
+ OB << "__cdecl";
break;
case CallingConv::Fastcall:
- OS << "__fastcall";
+ OB << "__fastcall";
break;
case CallingConv::Pascal:
- OS << "__pascal";
+ OB << "__pascal";
break;
case CallingConv::Regcall:
- OS << "__regcall";
+ OB << "__regcall";
break;
case CallingConv::Stdcall:
- OS << "__stdcall";
+ OB << "__stdcall";
break;
case CallingConv::Thiscall:
- OS << "__thiscall";
+ OB << "__thiscall";
break;
case CallingConv::Eabi:
- OS << "__eabi";
+ OB << "__eabi";
break;
case CallingConv::Vectorcall:
- OS << "__vectorcall";
+ OB << "__vectorcall";
break;
case CallingConv::Clrcall:
- OS << "__clrcall";
+ OB << "__clrcall";
break;
case CallingConv::Swift:
- OS << "__attribute__((__swiftcall__)) ";
+ OB << "__attribute__((__swiftcall__)) ";
break;
case CallingConv::SwiftAsync:
- OS << "__attribute__((__swiftasynccall__)) ";
+ OB << "__attribute__((__swiftasynccall__)) ";
break;
default:
break;
@@ -119,16 +119,16 @@ static void outputCallingConvention(OutputStream &OS, CallingConv CC) {
}
std::string Node::toString(OutputFlags Flags) const {
- OutputStream OS;
- initializeOutputStream(nullptr, nullptr, OS, 1024);
- this->output(OS, Flags);
- OS << '\0';
- std::string Owned(OS.getBuffer());
- std::free(OS.getBuffer());
+ OutputBuffer OB;
+ initializeOutputBuffer(nullptr, nullptr, OB, 1024);
+ this->output(OB, Flags);
+ OB << '\0';
+ std::string Owned(OB.getBuffer());
+ std::free(OB.getBuffer());
return Owned;
}
-void PrimitiveTypeNode::outputPre(OutputStream &OS, OutputFlags Flags) const {
+void PrimitiveTypeNode::outputPre(OutputBuffer &OB, OutputFlags Flags) const {
switch (PrimKind) {
OUTPUT_ENUM_CLASS_VALUE(PrimitiveKind, Void, "void");
OUTPUT_ENUM_CLASS_VALUE(PrimitiveKind, Bool, "bool");
@@ -152,107 +152,107 @@ void PrimitiveTypeNode::outputPre(OutputStream &OS, OutputFlags Flags) const {
OUTPUT_ENUM_CLASS_VALUE(PrimitiveKind, Ldouble, "long double");
OUTPUT_ENUM_CLASS_VALUE(PrimitiveKind, Nullptr, "std::nullptr_t");
}
- outputQualifiers(OS, Quals, true, false);
+ outputQualifiers(OB, Quals, true, false);
}
-void NodeArrayNode::output(OutputStream &OS, OutputFlags Flags) const {
- output(OS, Flags, ", ");
+void NodeArrayNode::output(OutputBuffer &OB, OutputFlags Flags) const {
+ output(OB, Flags, ", ");
}
-void NodeArrayNode::output(OutputStream &OS, OutputFlags Flags,
+void NodeArrayNode::output(OutputBuffer &OB, OutputFlags Flags,
StringView Separator) const {
if (Count == 0)
return;
if (Nodes[0])
- Nodes[0]->output(OS, Flags);
+ Nodes[0]->output(OB, Flags);
for (size_t I = 1; I < Count; ++I) {
- OS << Separator;
- Nodes[I]->output(OS, Flags);
+ OB << Separator;
+ Nodes[I]->output(OB, Flags);
}
}
-void EncodedStringLiteralNode::output(OutputStream &OS,
+void EncodedStringLiteralNode::output(OutputBuffer &OB,
OutputFlags Flags) const {
switch (Char) {
case CharKind::Wchar:
- OS << "L\"";
+ OB << "L\"";
break;
case CharKind::Char:
- OS << "\"";
+ OB << "\"";
break;
case CharKind::Char16:
- OS << "u\"";
+ OB << "u\"";
break;
case CharKind::Char32:
- OS << "U\"";
+ OB << "U\"";
break;
}
- OS << DecodedString << "\"";
+ OB << DecodedString << "\"";
if (IsTruncated)
- OS << "...";
+ OB << "...";
}
-void IntegerLiteralNode::output(OutputStream &OS, OutputFlags Flags) const {
+void IntegerLiteralNode::output(OutputBuffer &OB, OutputFlags Flags) const {
if (IsNegative)
- OS << '-';
- OS << Value;
+ OB << '-';
+ OB << Value;
}
-void TemplateParameterReferenceNode::output(OutputStream &OS,
+void TemplateParameterReferenceNode::output(OutputBuffer &OB,
OutputFlags Flags) const {
if (ThunkOffsetCount > 0)
- OS << "{";
+ OB << "{";
else if (Affinity == PointerAffinity::Pointer)
- OS << "&";
+ OB << "&";
if (Symbol) {
- Symbol->output(OS, Flags);
+ Symbol->output(OB, Flags);
if (ThunkOffsetCount > 0)
- OS << ", ";
+ OB << ", ";
}
if (ThunkOffsetCount > 0)
- OS << ThunkOffsets[0];
+ OB << ThunkOffsets[0];
for (int I = 1; I < ThunkOffsetCount; ++I) {
- OS << ", " << ThunkOffsets[I];
+ OB << ", " << ThunkOffsets[I];
}
if (ThunkOffsetCount > 0)
- OS << "}";
+ OB << "}";
}
-void IdentifierNode::outputTemplateParameters(OutputStream &OS,
+void IdentifierNode::outputTemplateParameters(OutputBuffer &OB,
OutputFlags Flags) const {
if (!TemplateParams)
return;
- OS << "<";
- TemplateParams->output(OS, Flags);
- OS << ">";
+ OB << "<";
+ TemplateParams->output(OB, Flags);
+ OB << ">";
}
-void DynamicStructorIdentifierNode::output(OutputStream &OS,
+void DynamicStructorIdentifierNode::output(OutputBuffer &OB,
OutputFlags Flags) const {
if (IsDestructor)
- OS << "`dynamic atexit destructor for ";
+ OB << "`dynamic atexit destructor for ";
else
- OS << "`dynamic initializer for ";
+ OB << "`dynamic initializer for ";
if (Variable) {
- OS << "`";
- Variable->output(OS, Flags);
- OS << "''";
+ OB << "`";
+ Variable->output(OB, Flags);
+ OB << "''";
} else {
- OS << "'";
- Name->output(OS, Flags);
- OS << "''";
+ OB << "'";
+ Name->output(OB, Flags);
+ OB << "''";
}
}
-void NamedIdentifierNode::output(OutputStream &OS, OutputFlags Flags) const {
- OS << Name;
- outputTemplateParameters(OS, Flags);
+void NamedIdentifierNode::output(OutputBuffer &OB, OutputFlags Flags) const {
+ OB << Name;
+ outputTemplateParameters(OB, Flags);
}
-void IntrinsicFunctionIdentifierNode::output(OutputStream &OS,
+void IntrinsicFunctionIdentifierNode::output(OutputBuffer &OB,
OutputFlags Flags) const {
switch (Operator) {
OUTPUT_ENUM_CLASS_VALUE(IntrinsicFunctionKind, New, "operator new");
@@ -350,188 +350,188 @@ void IntrinsicFunctionIdentifierNode::output(OutputStream &OS,
case IntrinsicFunctionKind::None:
break;
}
- outputTemplateParameters(OS, Flags);
+ outputTemplateParameters(OB, Flags);
}
-void LocalStaticGuardIdentifierNode::output(OutputStream &OS,
+void LocalStaticGuardIdentifierNode::output(OutputBuffer &OB,
OutputFlags Flags) const {
if (IsThread)
- OS << "`local static thread guard'";
+ OB << "`local static thread guard'";
else
- OS << "`local static guard'";
+ OB << "`local static guard'";
if (ScopeIndex > 0)
- OS << "{" << ScopeIndex << "}";
+ OB << "{" << ScopeIndex << "}";
}
-void ConversionOperatorIdentifierNode::output(OutputStream &OS,
+void ConversionOperatorIdentifierNode::output(OutputBuffer &OB,
OutputFlags Flags) const {
- OS << "operator";
- outputTemplateParameters(OS, Flags);
- OS << " ";
- TargetType->output(OS, Flags);
+ OB << "operator";
+ outputTemplateParameters(OB, Flags);
+ OB << " ";
+ TargetType->output(OB, Flags);
}
-void StructorIdentifierNode::output(OutputStream &OS, OutputFlags Flags) const {
+void StructorIdentifierNode::output(OutputBuffer &OB, OutputFlags Flags) const {
if (IsDestructor)
- OS << "~";
- Class->output(OS, Flags);
- outputTemplateParameters(OS, Flags);
+ OB << "~";
+ Class->output(OB, Flags);
+ outputTemplateParameters(OB, Flags);
}
-void LiteralOperatorIdentifierNode::output(OutputStream &OS,
+void LiteralOperatorIdentifierNode::output(OutputBuffer &OB,
OutputFlags Flags) const {
- OS << "operator \"\"" << Name;
- outputTemplateParameters(OS, Flags);
+ OB << "operator \"\"" << Name;
+ outputTemplateParameters(OB, Flags);
}
-void FunctionSignatureNode::outputPre(OutputStream &OS,
+void FunctionSignatureNode::outputPre(OutputBuffer &OB,
OutputFlags Flags) const {
if (!(Flags & OF_NoAccessSpecifier)) {
if (FunctionClass & FC_Public)
- OS << "public: ";
+ OB << "public: ";
if (FunctionClass & FC_Protected)
- OS << "protected: ";
+ OB << "protected: ";
if (FunctionClass & FC_Private)
- OS << "private: ";
+ OB << "private: ";
}
if (!(Flags & OF_NoMemberType)) {
if (!(FunctionClass & FC_Global)) {
if (FunctionClass & FC_Static)
- OS << "static ";
+ OB << "static ";
}
if (FunctionClass & FC_Virtual)
- OS << "virtual ";
+ OB << "virtual ";
if (FunctionClass & FC_ExternC)
- OS << "extern \"C\" ";
+ OB << "extern \"C\" ";
}
if (!(Flags & OF_NoReturnType) && ReturnType) {
- ReturnType->outputPre(OS, Flags);
- OS << " ";
+ ReturnType->outputPre(OB, Flags);
+ OB << " ";
}
if (!(Flags & OF_NoCallingConvention))
- outputCallingConvention(OS, CallConvention);
+ outputCallingConvention(OB, CallConvention);
}
-void FunctionSignatureNode::outputPost(OutputStream &OS,
+void FunctionSignatureNode::outputPost(OutputBuffer &OB,
OutputFlags Flags) const {
if (!(FunctionClass & FC_NoParameterList)) {
- OS << "(";
+ OB << "(";
if (Params)
- Params->output(OS, Flags);
+ Params->output(OB, Flags);
else
- OS << "void";
+ OB << "void";
if (IsVariadic) {
- if (OS.back() != '(')
- OS << ", ";
- OS << "...";
+ if (OB.back() != '(')
+ OB << ", ";
+ OB << "...";
}
- OS << ")";
+ OB << ")";
}
if (Quals & Q_Const)
- OS << " const";
+ OB << " const";
if (Quals & Q_Volatile)
- OS << " volatile";
+ OB << " volatile";
if (Quals & Q_Restrict)
- OS << " __restrict";
+ OB << " __restrict";
if (Quals & Q_Unaligned)
- OS << " __unaligned";
+ OB << " __unaligned";
if (IsNoexcept)
- OS << " noexcept";
+ OB << " noexcept";
if (RefQualifier == FunctionRefQualifier::Reference)
- OS << " &";
+ OB << " &";
else if (RefQualifier == FunctionRefQualifier::RValueReference)
- OS << " &&";
+ OB << " &&";
if (!(Flags & OF_NoReturnType) && ReturnType)
- ReturnType->outputPost(OS, Flags);
+ ReturnType->outputPost(OB, Flags);
}
-void ThunkSignatureNode::outputPre(OutputStream &OS, OutputFlags Flags) const {
- OS << "[thunk]: ";
+void ThunkSignatureNode::outputPre(OutputBuffer &OB, OutputFlags Flags) const {
+ OB << "[thunk]: ";
- FunctionSignatureNode::outputPre(OS, Flags);
+ FunctionSignatureNode::outputPre(OB, Flags);
}
-void ThunkSignatureNode::outputPost(OutputStream &OS, OutputFlags Flags) const {
+void ThunkSignatureNode::outputPost(OutputBuffer &OB, OutputFlags Flags) const {
if (FunctionClass & FC_StaticThisAdjust) {
- OS << "`adjustor{" << ThisAdjust.StaticOffset << "}'";
+ OB << "`adjustor{" << ThisAdjust.StaticOffset << "}'";
} else if (FunctionClass & FC_VirtualThisAdjust) {
if (FunctionClass & FC_VirtualThisAdjustEx) {
- OS << "`vtordispex{" << ThisAdjust.VBPtrOffset << ", "
+ OB << "`vtordispex{" << ThisAdjust.VBPtrOffset << ", "
<< ThisAdjust.VBOffsetOffset << ", " << ThisAdjust.VtordispOffset
<< ", " << ThisAdjust.StaticOffset << "}'";
} else {
- OS << "`vtordisp{" << ThisAdjust.VtordispOffset << ", "
+ OB << "`vtordisp{" << ThisAdjust.VtordispOffset << ", "
<< ThisAdjust.StaticOffset << "}'";
}
}
- FunctionSignatureNode::outputPost(OS, Flags);
+ FunctionSignatureNode::outputPost(OB, Flags);
}
-void PointerTypeNode::outputPre(OutputStream &OS, OutputFlags Flags) const {
+void PointerTypeNode::outputPre(OutputBuffer &OB, OutputFlags Flags) const {
if (Pointee->kind() == NodeKind::FunctionSignature) {
// If this is a pointer to a function, don't output the calling convention.
// It needs to go inside the parentheses.
const FunctionSignatureNode *Sig =
static_cast<const FunctionSignatureNode *>(Pointee);
- Sig->outputPre(OS, OF_NoCallingConvention);
+ Sig->outputPre(OB, OF_NoCallingConvention);
} else
- Pointee->outputPre(OS, Flags);
+ Pointee->outputPre(OB, Flags);
- outputSpaceIfNecessary(OS);
+ outputSpaceIfNecessary(OB);
if (Quals & Q_Unaligned)
- OS << "__unaligned ";
+ OB << "__unaligned ";
if (Pointee->kind() == NodeKind::ArrayType) {
- OS << "(";
+ OB << "(";
} else if (Pointee->kind() == NodeKind::FunctionSignature) {
- OS << "(";
+ OB << "(";
const FunctionSignatureNode *Sig =
static_cast<const FunctionSignatureNode *>(Pointee);
- outputCallingConvention(OS, Sig->CallConvention);
- OS << " ";
+ outputCallingConvention(OB, Sig->CallConvention);
+ OB << " ";
}
if (ClassParent) {
- ClassParent->output(OS, Flags);
- OS << "::";
+ ClassParent->output(OB, Flags);
+ OB << "::";
}
switch (Affinity) {
case PointerAffinity::Pointer:
- OS << "*";
+ OB << "*";
break;
case PointerAffinity::Reference:
- OS << "&";
+ OB << "&";
break;
case PointerAffinity::RValueReference:
- OS << "&&";
+ OB << "&&";
break;
default:
assert(false);
}
- outputQualifiers(OS, Quals, false, false);
+ outputQualifiers(OB, Quals, false, false);
}
-void PointerTypeNode::outputPost(OutputStream &OS, OutputFlags Flags) const {
+void PointerTypeNode::outputPost(OutputBuffer &OB, OutputFlags Flags) const {
if (Pointee->kind() == NodeKind::ArrayType ||
Pointee->kind() == NodeKind::FunctionSignature)
- OS << ")";
+ OB << ")";
- Pointee->outputPost(OS, Flags);
+ Pointee->outputPost(OB, Flags);
}
-void TagTypeNode::outputPre(OutputStream &OS, OutputFlags Flags) const {
+void TagTypeNode::outputPre(OutputBuffer &OB, OutputFlags Flags) const {
if (!(Flags & OF_NoTagSpecifier)) {
switch (Tag) {
OUTPUT_ENUM_CLASS_VALUE(TagKind, Class, "class");
@@ -539,59 +539,59 @@ void TagTypeNode::outputPre(OutputStream &OS, OutputFlags Flags) const {
OUTPUT_ENUM_CLASS_VALUE(TagKind, Union, "union");
OUTPUT_ENUM_CLASS_VALUE(TagKind, Enum, "enum");
}
- OS << " ";
+ OB << " ";
}
- QualifiedName->output(OS, Flags);
- outputQualifiers(OS, Quals, true, false);
+ QualifiedName->output(OB, Flags);
+ outputQualifiers(OB, Quals, true, false);
}
-void TagTypeNode::outputPost(OutputStream &OS, OutputFlags Flags) const {}
+void TagTypeNode::outputPost(OutputBuffer &OB, OutputFlags Flags) const {}
-void ArrayTypeNode::outputPre(OutputStream &OS, OutputFlags Flags) const {
- ElementType->outputPre(OS, Flags);
- outputQualifiers(OS, Quals, true, false);
+void ArrayTypeNode::outputPre(OutputBuffer &OB, OutputFlags Flags) const {
+ ElementType->outputPre(OB, Flags);
+ outputQualifiers(OB, Quals, true, false);
}
-void ArrayTypeNode::outputOneDimension(OutputStream &OS, OutputFlags Flags,
+void ArrayTypeNode::outputOneDimension(OutputBuffer &OB, OutputFlags Flags,
Node *N) const {
assert(N->kind() == NodeKind::IntegerLiteral);
IntegerLiteralNode *ILN = static_cast<IntegerLiteralNode *>(N);
if (ILN->Value != 0)
- ILN->output(OS, Flags);
+ ILN->output(OB, Flags);
}
-void ArrayTypeNode::outputDimensionsImpl(OutputStream &OS,
+void ArrayTypeNode::outputDimensionsImpl(OutputBuffer &OB,
OutputFlags Flags) const {
if (Dimensions->Count == 0)
return;
- outputOneDimension(OS, Flags, Dimensions->Nodes[0]);
+ outputOneDimension(OB, Flags, Dimensions->Nodes[0]);
for (size_t I = 1; I < Dimensions->Count; ++I) {
- OS << "][";
- outputOneDimension(OS, Flags, Dimensions->Nodes[I]);
+ OB << "][";
+ outputOneDimension(OB, Flags, Dimensions->Nodes[I]);
}
}
-void ArrayTypeNode::outputPost(OutputStream &OS, OutputFlags Flags) const {
- OS << "[";
- outputDimensionsImpl(OS, Flags);
- OS << "]";
+void ArrayTypeNode::outputPost(OutputBuffer &OB, OutputFlags Flags) const {
+ OB << "[";
+ outputDimensionsImpl(OB, Flags);
+ OB << "]";
- ElementType->outputPost(OS, Flags);
+ ElementType->outputPost(OB, Flags);
}
-void SymbolNode::output(OutputStream &OS, OutputFlags Flags) const {
- Name->output(OS, Flags);
+void SymbolNode::output(OutputBuffer &OB, OutputFlags Flags) const {
+ Name->output(OB, Flags);
}
-void FunctionSymbolNode::output(OutputStream &OS, OutputFlags Flags) const {
- Signature->outputPre(OS, Flags);
- outputSpaceIfNecessary(OS);
- Name->output(OS, Flags);
- Signature->outputPost(OS, Flags);
+void FunctionSymbolNode::output(OutputBuffer &OB, OutputFlags Flags) const {
+ Signature->outputPre(OB, Flags);
+ outputSpaceIfNecessary(OB);
+ Name->output(OB, Flags);
+ Signature->outputPost(OB, Flags);
}
-void VariableSymbolNode::output(OutputStream &OS, OutputFlags Flags) const {
+void VariableSymbolNode::output(OutputBuffer &OB, OutputFlags Flags) const {
const char *AccessSpec = nullptr;
bool IsStatic = true;
switch (SC) {
@@ -609,52 +609,52 @@ void VariableSymbolNode::output(OutputStream &OS, OutputFlags Flags) const {
break;
}
if (!(Flags & OF_NoAccessSpecifier) && AccessSpec)
- OS << AccessSpec << ": ";
+ OB << AccessSpec << ": ";
if (!(Flags & OF_NoMemberType) && IsStatic)
- OS << "static ";
+ OB << "static ";
- if (Type) {
- Type->outputPre(OS, Flags);
- outputSpaceIfNecessary(OS);
+ if (!(Flags & OF_NoVariableType) && Type) {
+ Type->outputPre(OB, Flags);
+ outputSpaceIfNecessary(OB);
}
- Name->output(OS, Flags);
- if (Type)
- Type->outputPost(OS, Flags);
+ Name->output(OB, Flags);
+ if (!(Flags & OF_NoVariableType) && Type)
+ Type->outputPost(OB, Flags);
}
-void CustomTypeNode::outputPre(OutputStream &OS, OutputFlags Flags) const {
- Identifier->output(OS, Flags);
+void CustomTypeNode::outputPre(OutputBuffer &OB, OutputFlags Flags) const {
+ Identifier->output(OB, Flags);
}
-void CustomTypeNode::outputPost(OutputStream &OS, OutputFlags Flags) const {}
+void CustomTypeNode::outputPost(OutputBuffer &OB, OutputFlags Flags) const {}
-void QualifiedNameNode::output(OutputStream &OS, OutputFlags Flags) const {
- Components->output(OS, Flags, "::");
+void QualifiedNameNode::output(OutputBuffer &OB, OutputFlags Flags) const {
+ Components->output(OB, Flags, "::");
}
-void RttiBaseClassDescriptorNode::output(OutputStream &OS,
+void RttiBaseClassDescriptorNode::output(OutputBuffer &OB,
OutputFlags Flags) const {
- OS << "`RTTI Base Class Descriptor at (";
- OS << NVOffset << ", " << VBPtrOffset << ", " << VBTableOffset << ", "
+ OB << "`RTTI Base Class Descriptor at (";
+ OB << NVOffset << ", " << VBPtrOffset << ", " << VBTableOffset << ", "
<< this->Flags;
- OS << ")'";
+ OB << ")'";
}
-void LocalStaticGuardVariableNode::output(OutputStream &OS,
+void LocalStaticGuardVariableNode::output(OutputBuffer &OB,
OutputFlags Flags) const {
- Name->output(OS, Flags);
+ Name->output(OB, Flags);
}
-void VcallThunkIdentifierNode::output(OutputStream &OS,
+void VcallThunkIdentifierNode::output(OutputBuffer &OB,
OutputFlags Flags) const {
- OS << "`vcall'{" << OffsetInVTable << ", {flat}}";
+ OB << "`vcall'{" << OffsetInVTable << ", {flat}}";
}
-void SpecialTableSymbolNode::output(OutputStream &OS, OutputFlags Flags) const {
- outputQualifiers(OS, Quals, false, true);
- Name->output(OS, Flags);
+void SpecialTableSymbolNode::output(OutputBuffer &OB, OutputFlags Flags) const {
+ outputQualifiers(OB, Quals, false, true);
+ Name->output(OB, Flags);
if (TargetName) {
- OS << "{for `";
- TargetName->output(OS, Flags);
- OS << "'}";
+ OB << "{for `";
+ TargetName->output(OB, Flags);
+ OB << "'}";
}
}
diff --git a/llvm/lib/Demangle/RustDemangle.cpp b/llvm/lib/Demangle/RustDemangle.cpp
index f916300835ce..dcac0bd63859 100644
--- a/llvm/lib/Demangle/RustDemangle.cpp
+++ b/llvm/lib/Demangle/RustDemangle.cpp
@@ -23,7 +23,7 @@
using namespace llvm;
-using llvm::itanium_demangle::OutputStream;
+using llvm::itanium_demangle::OutputBuffer;
using llvm::itanium_demangle::StringView;
using llvm::itanium_demangle::SwapAndRestore;
@@ -88,7 +88,7 @@ class Demangler {
public:
// Demangled output.
- OutputStream Output;
+ OutputBuffer Output;
Demangler(size_t MaxRecursionLevel = 500);
@@ -135,6 +135,7 @@ private:
void printDecimalNumber(uint64_t N);
void printBasicType(BasicType);
void printLifetime(uint64_t Index);
+ void printIdentifier(Identifier Ident);
char look() const;
char consume();
@@ -163,7 +164,7 @@ char *llvm::rustDemangle(const char *MangledName, char *Buf, size_t *N,
}
Demangler D;
- if (!initializeOutputStream(nullptr, nullptr, D.Output, 1024)) {
+ if (!initializeOutputBuffer(nullptr, nullptr, D.Output, 1024)) {
if (Status != nullptr)
*Status = demangle_memory_alloc_failure;
return nullptr;
@@ -283,8 +284,7 @@ bool Demangler::demanglePath(IsInType InType, LeaveGenericsOpen LeaveOpen) {
switch (consume()) {
case 'C': {
parseOptionalBase62Number('s');
- Identifier Ident = parseIdentifier();
- print(Ident.Name);
+ printIdentifier(parseIdentifier());
break;
}
case 'M': {
@@ -333,7 +333,7 @@ bool Demangler::demanglePath(IsInType InType, LeaveGenericsOpen LeaveOpen) {
print(NS);
if (!Ident.empty()) {
print(":");
- print(Ident.Name);
+ printIdentifier(Ident);
}
print('#');
printDecimalNumber(Disambiguator);
@@ -342,7 +342,7 @@ bool Demangler::demanglePath(IsInType InType, LeaveGenericsOpen LeaveOpen) {
// Implementation internal namespaces.
if (!Ident.empty()) {
print("::");
- print(Ident.Name);
+ printIdentifier(Ident);
}
}
break;
@@ -669,6 +669,8 @@ void Demangler::demangleFnSig() {
print("C");
} else {
Identifier Ident = parseIdentifier();
+ if (Ident.Punycode)
+ Error = true;
for (char C : Ident.Name) {
// When mangling ABI string, the "-" is replaced with "_".
if (C == '_')
@@ -1078,6 +1080,172 @@ void Demangler::printLifetime(uint64_t Index) {
}
}
+static inline bool decodePunycodeDigit(char C, size_t &Value) {
+ if (isLower(C)) {
+ Value = C - 'a';
+ return true;
+ }
+
+ if (isDigit(C)) {
+ Value = 26 + (C - '0');
+ return true;
+ }
+
+ return false;
+}
+
+static void removeNullBytes(OutputBuffer &Output, size_t StartIdx) {
+ char *Buffer = Output.getBuffer();
+ char *Start = Buffer + StartIdx;
+ char *End = Buffer + Output.getCurrentPosition();
+ Output.setCurrentPosition(std::remove(Start, End, '\0') - Buffer);
+}
+
+// Encodes CodePoint as UTF-8 and stores the result in Output. Returns false
+// if CodePoint is not a valid Unicode scalar value.
+static inline bool encodeUTF8(size_t CodePoint, char *Output) {
+ if (0xD800 <= CodePoint && CodePoint <= 0xDFFF)
+ return false;
+
+ if (CodePoint <= 0x7F) {
+ Output[0] = CodePoint;
+ return true;
+ }
+
+ if (CodePoint <= 0x7FF) {
+ Output[0] = 0xC0 | ((CodePoint >> 6) & 0x3F);
+ Output[1] = 0x80 | (CodePoint & 0x3F);
+ return true;
+ }
+
+ if (CodePoint <= 0xFFFF) {
+ Output[0] = 0xE0 | (CodePoint >> 12);
+ Output[1] = 0x80 | ((CodePoint >> 6) & 0x3F);
+ Output[2] = 0x80 | (CodePoint & 0x3F);
+ return true;
+ }
+
+ if (CodePoint <= 0x10FFFF) {
+ Output[0] = 0xF0 | (CodePoint >> 18);
+ Output[1] = 0x80 | ((CodePoint >> 12) & 0x3F);
+ Output[2] = 0x80 | ((CodePoint >> 6) & 0x3F);
+ Output[3] = 0x80 | (CodePoint & 0x3F);
+ return true;
+ }
+
+ return false;
+}
+
+// Decodes string encoded using punycode and appends results to Output.
+// Returns true if decoding was successful.
+static bool decodePunycode(StringView Input, OutputBuffer &Output) {
+ size_t OutputSize = Output.getCurrentPosition();
+ size_t InputIdx = 0;
+
+ // Rust uses an underscore as a delimiter.
+ size_t DelimiterPos = StringView::npos;
+ for (size_t I = 0; I != Input.size(); ++I)
+ if (Input[I] == '_')
+ DelimiterPos = I;
+
+ if (DelimiterPos != StringView::npos) {
+ // Copy basic code points before the last delimiter to the output.
+ for (; InputIdx != DelimiterPos; ++InputIdx) {
+ char C = Input[InputIdx];
+ if (!isValid(C))
+ return false;
+ // Code points are padded with zeros while decoding is in progress.
+ char UTF8[4] = {C};
+ Output += StringView(UTF8, UTF8 + 4);
+ }
+ // Skip over the delimiter.
+ ++InputIdx;
+ }
+
+ size_t Base = 36;
+ size_t Skew = 38;
+ size_t Bias = 72;
+ size_t N = 0x80;
+ size_t TMin = 1;
+ size_t TMax = 26;
+ size_t Damp = 700;
+
+ auto Adapt = [&](size_t Delta, size_t NumPoints) {
+ Delta /= Damp;
+ Delta += Delta / NumPoints;
+ Damp = 2;
+
+ size_t K = 0;
+ while (Delta > (Base - TMin) * TMax / 2) {
+ Delta /= Base - TMin;
+ K += Base;
+ }
+ return K + (((Base - TMin + 1) * Delta) / (Delta + Skew));
+ };
+
+ // Main decoding loop.
+ for (size_t I = 0; InputIdx != Input.size(); I += 1) {
+ size_t OldI = I;
+ size_t W = 1;
+ size_t Max = std::numeric_limits<size_t>::max();
+ for (size_t K = Base; true; K += Base) {
+ if (InputIdx == Input.size())
+ return false;
+ char C = Input[InputIdx++];
+ size_t Digit = 0;
+ if (!decodePunycodeDigit(C, Digit))
+ return false;
+
+ if (Digit > (Max - I) / W)
+ return false;
+ I += Digit * W;
+
+ size_t T;
+ if (K <= Bias)
+ T = TMin;
+ else if (K >= Bias + TMax)
+ T = TMax;
+ else
+ T = K - Bias;
+
+ if (Digit < T)
+ break;
+
+ if (W > Max / (Base - T))
+ return false;
+ W *= (Base - T);
+ }
+ size_t NumPoints = (Output.getCurrentPosition() - OutputSize) / 4 + 1;
+ Bias = Adapt(I - OldI, NumPoints);
+
+ if (I / NumPoints > Max - N)
+ return false;
+ N += I / NumPoints;
+ I = I % NumPoints;
+
+ // Insert N at position I in the output.
+ char UTF8[4] = {};
+ if (!encodeUTF8(N, UTF8))
+ return false;
+ Output.insert(OutputSize + I * 4, UTF8, 4);
+ }
+
+ removeNullBytes(Output, OutputSize);
+ return true;
+}
+
+void Demangler::printIdentifier(Identifier Ident) {
+ if (Error || !Print)
+ return;
+
+ if (Ident.Punycode) {
+ if (!decodePunycode(Ident.Name, Output))
+ Error = true;
+ } else {
+ print(Ident.Name);
+ }
+}
+
char Demangler::look() const {
if (Error || Position >= Input.size())
return 0;
diff --git a/llvm/lib/ExecutionEngine/ExecutionEngine.cpp b/llvm/lib/ExecutionEngine/ExecutionEngine.cpp
index c8bbf0bcdfda..fe3c433bd2c5 100644
--- a/llvm/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/llvm/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -28,13 +28,13 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/ValueHandle.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Host.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <cmath>
diff --git a/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
index addec6871fa1..672fd7b991c2 100644
--- a/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+++ b/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
@@ -188,8 +188,7 @@ LLVMBool LLVMCreateMCJITCompilerForModule(
for (auto &F : *Mod) {
auto Attrs = F.getAttributes();
StringRef Value = options.NoFramePointerElim ? "all" : "none";
- Attrs = Attrs.addAttribute(F.getContext(), AttributeList::FunctionIndex,
- "frame-pointer", Value);
+ Attrs = Attrs.addFnAttribute(F.getContext(), "frame-pointer", Value);
F.setAttributes(Attrs);
}
diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp
index c85e80b52e5a..4d7d5ce26668 100644
--- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp
@@ -1,9 +1,8 @@
//===-------- JITLink_EHFrameSupport.cpp - JITLink eh-frame utils ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -300,7 +299,7 @@ Error EHFrameEdgeFixer::processCIE(ParseContext &PC, Block &B,
if (auto Err = RecordReader.skip(PC.G.getPointerSize()))
return Err;
- // Read and sanity check the code alignment factor.
+ // Read and validate the code alignment factor.
{
uint64_t CodeAlignmentFactor = 0;
if (auto Err = RecordReader.readULEB128(CodeAlignmentFactor))
@@ -311,7 +310,7 @@ Error EHFrameEdgeFixer::processCIE(ParseContext &PC, Block &B,
" (expected 1)");
}
- // Read and sanity check the data alignment factor.
+ // Read and validate the data alignment factor.
{
int64_t DataAlignmentFactor = 0;
if (auto Err = RecordReader.readSLEB128(DataAlignmentFactor))
@@ -665,7 +664,7 @@ EHFrameEdgeFixer::readEncodedPointer(uint8_t PointerEncoding,
EffectiveType = (PointerSize == 8) ? DW_EH_PE_udata8 : DW_EH_PE_udata4;
JITTargetAddress Addr;
- Edge::Kind PointerEdgeKind;
+ Edge::Kind PointerEdgeKind = Edge::Invalid;
switch (EffectiveType) {
case DW_EH_PE_udata4: {
uint32_t Val;
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp
index 252e44fe4a74..eb98e4ba4041 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp
@@ -1,9 +1,8 @@
//===-------------- ELF.cpp - JIT linker function for ELF -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,6 +13,7 @@
#include "llvm/ExecutionEngine/JITLink/ELF.h"
#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/ExecutionEngine/JITLink/ELF_aarch64.h"
#include "llvm/ExecutionEngine/JITLink/ELF_riscv.h"
#include "llvm/ExecutionEngine/JITLink/ELF_x86_64.h"
#include "llvm/Object/ELF.h"
@@ -65,6 +65,8 @@ createLinkGraphFromELFObject(MemoryBufferRef ObjectBuffer) {
return TargetMachineArch.takeError();
switch (*TargetMachineArch) {
+ case ELF::EM_AARCH64:
+ return createLinkGraphFromELFObject_aarch64(ObjectBuffer);
case ELF::EM_RISCV:
return createLinkGraphFromELFObject_riscv(ObjectBuffer);
case ELF::EM_X86_64:
@@ -79,6 +81,9 @@ createLinkGraphFromELFObject(MemoryBufferRef ObjectBuffer) {
void link_ELF(std::unique_ptr<LinkGraph> G,
std::unique_ptr<JITLinkContext> Ctx) {
switch (G->getTargetTriple().getArch()) {
+ case Triple::aarch64:
+ link_ELF_aarch64(std::move(G), std::move(Ctx));
+ return;
case Triple::riscv32:
case Triple::riscv64:
link_ELF_riscv(std::move(G), std::move(Ctx));
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
index 2b2a1a8db4c1..fdc987751286 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
+++ b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
@@ -36,11 +36,9 @@ protected:
}
Section &getCommonSection() {
- if (!CommonSection) {
- auto Prot = static_cast<sys::Memory::ProtectionFlags>(
- sys::Memory::MF_READ | sys::Memory::MF_WRITE);
- CommonSection = &G->createSection(CommonSectionName, Prot);
- }
+ if (!CommonSection)
+ CommonSection =
+ &G->createSection(CommonSectionName, MemProt::Read | MemProt::Write);
return *CommonSection;
}
@@ -110,6 +108,31 @@ protected:
Error graphifySections();
Error graphifySymbols();
+ /// Traverse all matching relocation records in the given section. The handler
+ /// function Func should be callable with this signature:
+ /// Error(const typename ELFT::Rela &,
+ /// const typename ELFT::Shdr &, Section &)
+ ///
+ template <typename RelocHandlerFunction>
+ Error forEachRelocation(const typename ELFT::Shdr &RelSect,
+ RelocHandlerFunction &&Func,
+ bool ProcessDebugSections = false);
+
+ /// Traverse all matching relocation records in the given section. Convenience
+ /// wrapper to allow passing a member function for the handler.
+ ///
+ template <typename ClassT, typename RelocHandlerMethod>
+ Error forEachRelocation(const typename ELFT::Shdr &RelSect, ClassT *Instance,
+ RelocHandlerMethod &&Method,
+ bool ProcessDebugSections = false) {
+ return forEachRelocation(
+ RelSect,
+ [Instance, Method](const auto &Rel, const auto &Target, auto &GS) {
+ return (Instance->*Method)(Rel, Target, GS);
+ },
+ ProcessDebugSections);
+ }
+
const ELFFile &Obj;
typename ELFFile::Elf_Shdr_Range Sections;
@@ -170,11 +193,14 @@ ELFLinkGraphBuilder<ELFT>::getSymbolLinkageAndScope(
// Nothing to do here.
break;
case ELF::STB_WEAK:
+ case ELF::STB_GNU_UNIQUE:
L = Linkage::Weak;
break;
default:
- return make_error<StringError>("Unrecognized symbol binding for " + Name,
- inconvertibleErrorCode());
+ return make_error<StringError>(
+ "Unrecognized symbol binding " +
+ Twine(static_cast<int>(Sym.getBinding())) + " for " + Name,
+ inconvertibleErrorCode());
}
switch (Sym.getVisibility()) {
@@ -190,8 +216,10 @@ ELFLinkGraphBuilder<ELFT>::getSymbolLinkageAndScope(
S = Scope::Hidden;
break;
case ELF::STV_INTERNAL:
- return make_error<StringError>("Unrecognized symbol visibility for " + Name,
- inconvertibleErrorCode());
+ return make_error<StringError>(
+ "Unrecognized symbol visibility " +
+ Twine(static_cast<int>(Sym.getVisibility())) + " for " + Name,
+ inconvertibleErrorCode());
}
return std::make_pair(L, S);
@@ -265,13 +293,11 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::graphifySections() {
});
// Get the section's memory protection flags.
- sys::Memory::ProtectionFlags Prot;
+ MemProt Prot;
if (Sec.sh_flags & ELF::SHF_EXECINSTR)
- Prot = static_cast<sys::Memory::ProtectionFlags>(sys::Memory::MF_READ |
- sys::Memory::MF_EXEC);
+ Prot = MemProt::Read | MemProt::Exec;
else
- Prot = static_cast<sys::Memory::ProtectionFlags>(sys::Memory::MF_READ |
- sys::Memory::MF_WRITE);
+ Prot = MemProt::Read | MemProt::Write;
// For now we just use this to skip the "undefined" section, probably need
to revisit.
@@ -374,7 +400,7 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::graphifySymbols() {
if (Sym.isDefined() &&
(Sym.getType() == ELF::STT_NOTYPE || Sym.getType() == ELF::STT_FUNC ||
Sym.getType() == ELF::STT_OBJECT ||
- Sym.getType() == ELF::STT_SECTION)) {
+ Sym.getType() == ELF::STT_SECTION || Sym.getType() == ELF::STT_TLS)) {
// FIXME: Handle extended tables.
if (auto *GraphSec = getGraphSection(Sym.st_shndx)) {
@@ -421,6 +447,54 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::graphifySymbols() {
return Error::success();
}
+template <typename ELFT>
+template <typename RelocHandlerFunction>
+Error ELFLinkGraphBuilder<ELFT>::forEachRelocation(
+ const typename ELFT::Shdr &RelSect, RelocHandlerFunction &&Func,
+ bool ProcessDebugSections) {
+
+ // Only look into sections that store relocation entries.
+ if (RelSect.sh_type != ELF::SHT_RELA && RelSect.sh_type != ELF::SHT_REL)
+ return Error::success();
+
+ // sh_info contains the section header index of the target (FixupSection),
+ // which is the section to which all relocations in RelSect apply.
+ auto FixupSection = Obj.getSection(RelSect.sh_info);
+ if (!FixupSection)
+ return FixupSection.takeError();
+
+ // Target sections have names in valid ELF object files.
+ Expected<StringRef> Name = Obj.getSectionName(**FixupSection);
+ if (!Name)
+ return Name.takeError();
+ LLVM_DEBUG(dbgs() << " " << *Name << ":\n");
+
+ // Skip relocations that target debug sections unless explicitly requested.
+ if (!ProcessDebugSections && isDwarfSection(*Name)) {
+ LLVM_DEBUG(dbgs() << " skipped (dwarf section)\n\n");
+ return Error::success();
+ }
+
+ // Look up the link-graph section corresponding to the target section name.
+ Section *GraphSect = G->findSectionByName(*Name);
+ if (!GraphSect)
+ return make_error<StringError>(
+ "Refencing a section that wasn't added to the graph: " + *Name,
+ inconvertibleErrorCode());
+
+ auto RelEntries = Obj.relas(RelSect);
+ if (!RelEntries)
+ return RelEntries.takeError();
+
+ // Let the handler process the relocation entries one by one.
+ for (const typename ELFT::Rela &R : *RelEntries)
+ if (Error Err = Func(R, **FixupSection, *GraphSect))
+ return Err;
+
+ LLVM_DEBUG(dbgs() << "\n");
+ return Error::success();
+}
+
} // end namespace jitlink
} // end namespace llvm
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp
new file mode 100644
index 000000000000..dc183dfddfae
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp
@@ -0,0 +1,185 @@
+//===----- ELF_aarch64.cpp - JIT linker implementation for ELF/aarch64 ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// ELF/aarch64 jit-link implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/JITLink/ELF_aarch64.h"
+#include "ELFLinkGraphBuilder.h"
+#include "JITLinkGeneric.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/ExecutionEngine/JITLink/aarch64.h"
+#include "llvm/Object/ELFObjectFile.h"
+
+#define DEBUG_TYPE "jitlink"
+
+using namespace llvm;
+using namespace llvm::jitlink;
+
+namespace llvm {
+namespace jitlink {
+
+class ELFJITLinker_aarch64 : public JITLinker<ELFJITLinker_aarch64> {
+ friend class JITLinker<ELFJITLinker_aarch64>;
+
+public:
+ ELFJITLinker_aarch64(std::unique_ptr<JITLinkContext> Ctx,
+ std::unique_ptr<LinkGraph> G,
+ PassConfiguration PassConfig)
+ : JITLinker(std::move(Ctx), std::move(G), std::move(PassConfig)) {}
+
+private:
+ Error applyFixup(LinkGraph &G, Block &B, const Edge &E) const {
+ using namespace aarch64;
+ using namespace llvm::support;
+
+ char *BlockWorkingMem = B.getAlreadyMutableContent().data();
+ char *FixupPtr = BlockWorkingMem + E.getOffset();
+ JITTargetAddress FixupAddress = B.getAddress() + E.getOffset();
+ switch (E.getKind()) {
+ case aarch64::R_AARCH64_CALL26: {
+ assert((FixupAddress & 0x3) == 0 && "Call-inst is not 32-bit aligned");
+ int64_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend();
+
+ if (static_cast<uint64_t>(Value) & 0x3)
+ return make_error<JITLinkError>("Call target is not 32-bit aligned");
+
+ if (!fitsRangeSignedInt<27>(Value))
+ return makeTargetOutOfRangeError(G, B, E);
+
+ uint32_t RawInstr = *(little32_t *)FixupPtr;
+ assert((RawInstr & 0x7fffffff) == 0x14000000 &&
+ "RawInstr isn't a B or BR immediate instruction");
+ uint32_t Imm = (static_cast<uint32_t>(Value) & ((1 << 28) - 1)) >> 2;
+ uint32_t FixedInstr = RawInstr | Imm;
+ *(little32_t *)FixupPtr = FixedInstr;
+ break;
+ }
+ }
+ return Error::success();
+ }
+
+ template <uint8_t Bits> static bool fitsRangeSignedInt(int64_t Value) {
+ return Value >= -(1ll << Bits) && Value < (1ll << Bits);
+ }
+};
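As a worked example of the CALL26 fixup above (illustrative only): a call whose target lies 8 bytes past the fixup address patches a bare B opcode into b #8, since the low 26 bits encode the word offset.

#include <cassert>
#include <cstdint>

int main() {
  int64_t Value = 8;              // target - fixup address + addend
  uint32_t RawInstr = 0x14000000; // unrelocated B instruction
  uint32_t Imm = (static_cast<uint32_t>(Value) & ((1 << 28) - 1)) >> 2;
  assert((RawInstr | Imm) == 0x14000002u); // encodes b #8
  return 0;
}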
+
+template <typename ELFT>
+class ELFLinkGraphBuilder_aarch64 : public ELFLinkGraphBuilder<ELFT> {
+private:
+ static Expected<aarch64::EdgeKind_aarch64>
+ getRelocationKind(const uint32_t Type) {
+ using namespace aarch64;
+ switch (Type) {
+ case ELF::R_AARCH64_CALL26:
+ return EdgeKind_aarch64::R_AARCH64_CALL26;
+ }
+
+ return make_error<JITLinkError>("Unsupported aarch64 relocation:" +
+ formatv("{0:d}", Type));
+ }
+
+ Error addRelocations() override {
+ LLVM_DEBUG(dbgs() << "Processing relocations:\n");
+
+ using Base = ELFLinkGraphBuilder<ELFT>;
+ using Self = ELFLinkGraphBuilder_aarch64<ELFT>;
+ for (const auto &RelSect : Base::Sections)
+ if (Error Err = Base::forEachRelocation(RelSect, this,
+ &Self::addSingleRelocation))
+ return Err;
+
+ return Error::success();
+ }
+
+ Error addSingleRelocation(const typename ELFT::Rela &Rel,
+ const typename ELFT::Shdr &FixupSect,
+ Section &GraphSection) {
+ using Base = ELFLinkGraphBuilder<ELFT>;
+
+ uint32_t SymbolIndex = Rel.getSymbol(false);
+ auto ObjSymbol = Base::Obj.getRelocationSymbol(Rel, Base::SymTabSec);
+ if (!ObjSymbol)
+ return ObjSymbol.takeError();
+
+ Symbol *GraphSymbol = Base::getGraphSymbol(SymbolIndex);
+ if (!GraphSymbol)
+ return make_error<StringError>(
+ formatv("Could not find symbol at given index, did you add it to "
+ "JITSymbolTable? index: {0}, shndx: {1} Size of table: {2}",
+ SymbolIndex, (*ObjSymbol)->st_shndx,
+ Base::GraphSymbols.size()),
+ inconvertibleErrorCode());
+
+ uint32_t Type = Rel.getType(false);
+ Expected<aarch64::EdgeKind_aarch64> Kind = getRelocationKind(Type);
+ if (!Kind)
+ return Kind.takeError();
+
+ int64_t Addend = Rel.r_addend;
+ Block *BlockToFix = *(GraphSection.blocks().begin());
+ JITTargetAddress FixupAddress = FixupSect.sh_addr + Rel.r_offset;
+ Edge::OffsetT Offset = FixupAddress - BlockToFix->getAddress();
+ Edge GE(*Kind, Offset, *GraphSymbol, Addend);
+ LLVM_DEBUG({
+ dbgs() << " ";
+ printEdge(dbgs(), *BlockToFix, GE, aarch64::getEdgeKindName(*Kind));
+ dbgs() << "\n";
+ });
+
+ BlockToFix->addEdge(std::move(GE));
+ return Error::success();
+ }
+
+public:
+ ELFLinkGraphBuilder_aarch64(StringRef FileName,
+ const object::ELFFile<ELFT> &Obj, const Triple T)
+ : ELFLinkGraphBuilder<ELFT>(Obj, std::move(T), FileName,
+ aarch64::getEdgeKindName) {}
+};
+
+Expected<std::unique_ptr<LinkGraph>>
+createLinkGraphFromELFObject_aarch64(MemoryBufferRef ObjectBuffer) {
+ LLVM_DEBUG({
+ dbgs() << "Building jitlink graph for new input "
+ << ObjectBuffer.getBufferIdentifier() << "...\n";
+ });
+
+ auto ELFObj = object::ObjectFile::createELFObjectFile(ObjectBuffer);
+ if (!ELFObj)
+ return ELFObj.takeError();
+
+ assert((*ELFObj)->getArch() == Triple::aarch64 &&
+ "Only AArch64 (little endian) is supported for now");
+
+ auto &ELFObjFile = cast<object::ELFObjectFile<object::ELF64LE>>(**ELFObj);
+ return ELFLinkGraphBuilder_aarch64<object::ELF64LE>((*ELFObj)->getFileName(),
+ ELFObjFile.getELFFile(),
+ (*ELFObj)->makeTriple())
+ .buildGraph();
+}
+
+void link_ELF_aarch64(std::unique_ptr<LinkGraph> G,
+ std::unique_ptr<JITLinkContext> Ctx) {
+ PassConfiguration Config;
+ const Triple &TT = G->getTargetTriple();
+ if (Ctx->shouldAddDefaultTargetPasses(TT)) {
+ if (auto MarkLive = Ctx->getMarkLivePass(TT))
+ Config.PrePrunePasses.push_back(std::move(MarkLive));
+ else
+ Config.PrePrunePasses.push_back(markAllSymbolsLive);
+ }
+ if (auto Err = Ctx->modifyPassConfig(*G, Config))
+ return Ctx->notifyFailed(std::move(Err));
+
+ ELFJITLinker_aarch64::link(std::move(Ctx), std::move(G), std::move(Config));
+}
+
+} // namespace jitlink
+} // namespace llvm
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp
index d0e65ef1c3ac..b057788ce3ef 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp
@@ -11,17 +11,117 @@
//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/JITLink/ELF_riscv.h"
+#include "ELFLinkGraphBuilder.h"
+#include "JITLinkGeneric.h"
+#include "PerGraphGOTAndPLTStubsBuilder.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/ExecutionEngine/JITLink/JITLink.h"
#include "llvm/ExecutionEngine/JITLink/riscv.h"
#include "llvm/Object/ELF.h"
#include "llvm/Object/ELFObjectFile.h"
-#include "ELFLinkGraphBuilder.h"
-#include "JITLinkGeneric.h"
-
#define DEBUG_TYPE "jitlink"
using namespace llvm;
+using namespace llvm::jitlink;
+using namespace llvm::jitlink::riscv;
+
+namespace {
+
+class PerGraphGOTAndPLTStubsBuilder_ELF_riscv
+ : public PerGraphGOTAndPLTStubsBuilder<
+ PerGraphGOTAndPLTStubsBuilder_ELF_riscv> {
+public:
+ static constexpr size_t StubEntrySize = 16;
+ static const uint8_t NullGOTEntryContent[8];
+ static const uint8_t RV64StubContent[StubEntrySize];
+ static const uint8_t RV32StubContent[StubEntrySize];
+
+ using PerGraphGOTAndPLTStubsBuilder<
+ PerGraphGOTAndPLTStubsBuilder_ELF_riscv>::PerGraphGOTAndPLTStubsBuilder;
+
+ bool isRV64() const { return G.getPointerSize() == 8; }
+
+ bool isGOTEdgeToFix(Edge &E) const { return E.getKind() == R_RISCV_GOT_HI20; }
+
+ Symbol &createGOTEntry(Symbol &Target) {
+ Block &GOTBlock = G.createContentBlock(
+ getGOTSection(), getGOTEntryBlockContent(), 0, G.getPointerSize(), 0);
+ GOTBlock.addEdge(isRV64() ? R_RISCV_64 : R_RISCV_32, 0, Target, 0);
+ return G.addAnonymousSymbol(GOTBlock, 0, G.getPointerSize(), false, false);
+ }
+
+ Symbol &createPLTStub(Symbol &Target) {
+ Block &StubContentBlock =
+ G.createContentBlock(getStubsSection(), getStubBlockContent(), 0, 4, 0);
+ auto &GOTEntrySymbol = getGOTEntry(Target);
+ StubContentBlock.addEdge(R_RISCV_CALL, 0, GOTEntrySymbol, 0);
+ return G.addAnonymousSymbol(StubContentBlock, 0, StubEntrySize, true,
+ false);
+ }
+
+ void fixGOTEdge(Edge &E, Symbol &GOTEntry) {
+ // Replace the relocation pair (R_RISCV_GOT_HI20, R_RISCV_PCREL_LO12) with
+ // (R_RISCV_PCREL_HI20, R_RISCV_PCREL_LO12): retarget the edge at the GOT
+ // entry and change its kind from R_RISCV_GOT_HI20 to R_RISCV_PCREL_HI20.
+ E.setKind(R_RISCV_PCREL_HI20);
+ E.setTarget(GOTEntry);
+ }
+
+ void fixPLTEdge(Edge &E, Symbol &PLTStubs) {
+ assert(E.getKind() == R_RISCV_CALL_PLT && "Not a R_RISCV_CALL_PLT edge?");
+ E.setKind(R_RISCV_CALL);
+ E.setTarget(PLTStubs);
+ }
+
+ bool isExternalBranchEdge(Edge &E) const {
+ return E.getKind() == R_RISCV_CALL_PLT;
+ }
+
+private:
+ Section &getGOTSection() const {
+ if (!GOTSection)
+ GOTSection = &G.createSection("$__GOT", MemProt::Read);
+ return *GOTSection;
+ }
+
+ Section &getStubsSection() const {
+ if (!StubsSection)
+ StubsSection =
+ &G.createSection("$__STUBS", MemProt::Read | MemProt::Exec);
+ return *StubsSection;
+ }
+
+ ArrayRef<char> getGOTEntryBlockContent() {
+ return {reinterpret_cast<const char *>(NullGOTEntryContent),
+ G.getPointerSize()};
+ }
+
+ ArrayRef<char> getStubBlockContent() {
+ auto StubContent = isRV64() ? RV64StubContent : RV32StubContent;
+ return {reinterpret_cast<const char *>(StubContent), StubEntrySize};
+ }
+
+ mutable Section *GOTSection = nullptr;
+ mutable Section *StubsSection = nullptr;
+};
+const uint8_t PerGraphGOTAndPLTStubsBuilder_ELF_riscv::NullGOTEntryContent[8] =
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+
+const uint8_t
+ PerGraphGOTAndPLTStubsBuilder_ELF_riscv::RV64StubContent[StubEntrySize] = {
+ 0x17, 0x0e, 0x00, 0x00, // auipc t3, literal
+ 0x03, 0x3e, 0x0e, 0x00, // ld t3, literal(t3)
+ 0x67, 0x00, 0x0e, 0x00, // jr t3
+ 0x13, 0x00, 0x00, 0x00}; // nop
+
+const uint8_t
+ PerGraphGOTAndPLTStubsBuilder_ELF_riscv::RV32StubContent[StubEntrySize] = {
+ 0x17, 0x0e, 0x00, 0x00, // auipc t3, literal
+ 0x03, 0x2e, 0x0e, 0x00, // lw t3, literal(t3)
+ 0x67, 0x00, 0x0e, 0x00, // jr t3
+ 0x13, 0x00, 0x00, 0x00}; // nop
+} // namespace
namespace llvm {
namespace jitlink {
@@ -78,6 +178,16 @@ private:
char *FixupPtr = BlockWorkingMem + E.getOffset();
JITTargetAddress FixupAddress = B.getAddress() + E.getOffset();
switch (E.getKind()) {
+ case R_RISCV_32: {
+ int64_t Value = E.getTarget().getAddress() + E.getAddend();
+ *(little32_t *)FixupPtr = static_cast<uint32_t>(Value);
+ break;
+ }
+ case R_RISCV_64: {
+ int64_t Value = E.getTarget().getAddress() + E.getAddend();
+ *(little64_t *)FixupPtr = static_cast<uint64_t>(Value);
+ break;
+ }
case R_RISCV_HI20: {
int64_t Value = E.getTarget().getAddress() + E.getAddend();
int32_t Hi = (Value + 0x800) & 0xFFFFF000;
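The + 0x800 in the R_RISCV_HI20 case just above compensates for the sign extension of the paired LO12 immediate. A small standalone check of the split, using an arbitrary address (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  int64_t Value = 0x12345FFF;                // arbitrary target address
  int32_t Hi = (Value + 0x800) & 0xFFFFF000; // 0x12346000, as computed above
  int32_t Lo = Value - Hi;                   // -1, fits the signed 12-bit field
  assert(Hi == 0x12346000 && Lo == -1 && int64_t(Hi) + Lo == Value);
  return 0;
}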
@@ -163,6 +273,10 @@ private:
return EdgeKind_riscv::R_RISCV_PCREL_LO12_I;
case ELF::R_RISCV_PCREL_LO12_S:
return EdgeKind_riscv::R_RISCV_PCREL_LO12_S;
+ case ELF::R_RISCV_GOT_HI20:
+ return EdgeKind_riscv::R_RISCV_GOT_HI20;
+ case ELF::R_RISCV_CALL_PLT:
+ return EdgeKind_riscv::R_RISCV_CALL_PLT;
}
return make_error<JITLinkError>("Unsupported riscv relocation:" +
@@ -170,93 +284,54 @@ private:
}
Error addRelocations() override {
+ LLVM_DEBUG(dbgs() << "Processing relocations:\n");
+
using Base = ELFLinkGraphBuilder<ELFT>;
- LLVM_DEBUG(dbgs() << "Adding relocations\n");
-
- // TODO a partern is forming of iterate some sections but only give me
- // ones I am interested, I should abstract that concept some where
- for (auto &SecRef : Base::Sections) {
- if (SecRef.sh_type != ELF::SHT_RELA && SecRef.sh_type != ELF::SHT_REL)
- continue;
- auto RelSectName = Base::Obj.getSectionName(SecRef);
- if (!RelSectName)
- return RelSectName.takeError();
-
- LLVM_DEBUG({
- dbgs() << "Adding relocations from section " << *RelSectName << "\n";
- });
-
- auto UpdateSection = Base::Obj.getSection(SecRef.sh_info);
- if (!UpdateSection)
- return UpdateSection.takeError();
-
- auto UpdateSectionName = Base::Obj.getSectionName(**UpdateSection);
- if (!UpdateSectionName)
- return UpdateSectionName.takeError();
- // Don't process relocations for debug sections.
- if (Base::isDwarfSection(*UpdateSectionName)) {
- LLVM_DEBUG({
- dbgs() << " Target is dwarf section " << *UpdateSectionName
- << ". Skipping.\n";
- });
- continue;
- } else
- LLVM_DEBUG({
- dbgs() << " For target section " << *UpdateSectionName << "\n";
- });
-
- auto *JITSection = Base::G->findSectionByName(*UpdateSectionName);
- if (!JITSection)
- return make_error<llvm::StringError>(
- "Refencing a section that wasn't added to graph" +
- *UpdateSectionName,
- llvm::inconvertibleErrorCode());
-
- auto Relocations = Base::Obj.relas(SecRef);
- if (!Relocations)
- return Relocations.takeError();
-
- for (const auto &Rela : *Relocations) {
- auto Type = Rela.getType(false);
-
- LLVM_DEBUG({
- dbgs() << "Relocation Type: " << Type << "\n"
- << "Name: " << Base::Obj.getRelocationTypeName(Type) << "\n";
- });
-
- auto SymbolIndex = Rela.getSymbol(false);
- auto Symbol = Base::Obj.getRelocationSymbol(Rela, Base::SymTabSec);
- if (!Symbol)
- return Symbol.takeError();
-
- auto BlockToFix = *(JITSection->blocks().begin());
- auto *TargetSymbol = Base::getGraphSymbol(SymbolIndex);
-
- if (!TargetSymbol) {
- return make_error<llvm::StringError>(
- "Could not find symbol at given index, did you add it to "
- "JITSymbolTable? index: " +
- std::to_string(SymbolIndex) + ", shndx: " +
- std::to_string((*Symbol)->st_shndx) + " Size of table: " +
- std::to_string(Base::GraphSymbols.size()),
- llvm::inconvertibleErrorCode());
- }
- int64_t Addend = Rela.r_addend;
- JITTargetAddress FixupAddress =
- (*UpdateSection)->sh_addr + Rela.r_offset;
-
- LLVM_DEBUG({
- dbgs() << "Processing relocation at "
- << format("0x%016" PRIx64, FixupAddress) << "\n";
- });
- auto Kind = getRelocationKind(Type);
- if (!Kind)
- return Kind.takeError();
-
- BlockToFix->addEdge(*Kind, FixupAddress - BlockToFix->getAddress(),
- *TargetSymbol, Addend);
- }
- }
+ using Self = ELFLinkGraphBuilder_riscv<ELFT>;
+ for (const auto &RelSect : Base::Sections)
+ if (Error Err = Base::forEachRelocation(RelSect, this,
+ &Self::addSingleRelocation))
+ return Err;
+
+ return Error::success();
+ }
+
+ Error addSingleRelocation(const typename ELFT::Rela &Rel,
+ const typename ELFT::Shdr &FixupSect,
+ Section &GraphSection) {
+ using Base = ELFLinkGraphBuilder<ELFT>;
+
+ uint32_t SymbolIndex = Rel.getSymbol(false);
+ auto ObjSymbol = Base::Obj.getRelocationSymbol(Rel, Base::SymTabSec);
+ if (!ObjSymbol)
+ return ObjSymbol.takeError();
+
+ Symbol *GraphSymbol = Base::getGraphSymbol(SymbolIndex);
+ if (!GraphSymbol)
+ return make_error<StringError>(
+ formatv("Could not find symbol at given index, did you add it to "
+ "JITSymbolTable? index: {0}, shndx: {1} Size of table: {2}",
+ SymbolIndex, (*ObjSymbol)->st_shndx,
+ Base::GraphSymbols.size()),
+ inconvertibleErrorCode());
+
+ uint32_t Type = Rel.getType(false);
+ Expected<riscv::EdgeKind_riscv> Kind = getRelocationKind(Type);
+ if (!Kind)
+ return Kind.takeError();
+
+ int64_t Addend = Rel.r_addend;
+ Block *BlockToFix = *(GraphSection.blocks().begin());
+ JITTargetAddress FixupAddress = FixupSect.sh_addr + Rel.r_offset;
+ Edge::OffsetT Offset = FixupAddress - BlockToFix->getAddress();
+ Edge GE(*Kind, Offset, *GraphSymbol, Addend);
+ LLVM_DEBUG({
+ dbgs() << " ";
+ printEdge(dbgs(), *BlockToFix, GE, riscv::getEdgeKindName(*Kind));
+ dbgs() << "\n";
+ });
+
+ BlockToFix->addEdge(std::move(GE));
return Error::success();
}
@@ -304,6 +379,8 @@ void link_ELF_riscv(std::unique_ptr<LinkGraph> G,
Config.PrePrunePasses.push_back(std::move(MarkLive));
else
Config.PrePrunePasses.push_back(markAllSymbolsLive);
+ Config.PostPrunePasses.push_back(
+ PerGraphGOTAndPLTStubsBuilder_ELF_riscv::asPass);
}
if (auto Err = Ctx->modifyPassConfig(*G, Config))
return Ctx->notifyFailed(std::move(Err));
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
index a5aed6d25200..3ea9ffee6554 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
@@ -12,6 +12,7 @@
#include "llvm/ExecutionEngine/JITLink/ELF_x86_64.h"
#include "llvm/ExecutionEngine/JITLink/JITLink.h"
+#include "llvm/ExecutionEngine/JITLink/TableManager.h"
#include "llvm/ExecutionEngine/JITLink/x86_64.h"
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Support/Endian.h"
@@ -20,7 +21,6 @@
#include "EHFrameSupportImpl.h"
#include "ELFLinkGraphBuilder.h"
#include "JITLinkGeneric.h"
-#include "PerGraphGOTAndPLTStubsBuilder.h"
#define DEBUG_TYPE "jitlink"
@@ -30,196 +30,82 @@ using namespace llvm::jitlink::ELF_x86_64_Edges;
namespace {
-constexpr StringRef ELFGOTSectionName = "$__GOT";
constexpr StringRef ELFGOTSymbolName = "_GLOBAL_OFFSET_TABLE_";
+constexpr StringRef ELFTLSInfoSectionName = "$__TLSINFO";
-class PerGraphGOTAndPLTStubsBuilder_ELF_x86_64
- : public PerGraphGOTAndPLTStubsBuilder<
- PerGraphGOTAndPLTStubsBuilder_ELF_x86_64> {
+class TLSInfoTableManager_ELF_x86_64
+ : public TableManager<TLSInfoTableManager_ELF_x86_64> {
public:
- static const uint8_t NullGOTEntryContent[8];
- static const uint8_t StubContent[6];
-
- using PerGraphGOTAndPLTStubsBuilder<
- PerGraphGOTAndPLTStubsBuilder_ELF_x86_64>::PerGraphGOTAndPLTStubsBuilder;
-
- bool isGOTEdgeToFix(Edge &E) const {
- if (E.getKind() == GOTOFF64) {
- // We need to make sure that the GOT section exists, but don't otherwise
- // need to fix up this edge.
- getGOTSection();
- return false;
- }
-
- return E.getKind() == PCRel32GOT || E.getKind() == PCRel32GOTLoad ||
- E.getKind() == PCRel64GOT || E.getKind() == GOT64;
- }
+ static const uint8_t TLSInfoEntryContent[16];
- Symbol &createGOTEntry(Symbol &Target) {
- auto &GOTEntryBlock = G.createContentBlock(
- getGOTSection(), getGOTEntryBlockContent(), 0, 8, 0);
- GOTEntryBlock.addEdge(Pointer64, 0, Target, 0);
- return G.addAnonymousSymbol(GOTEntryBlock, 0, 8, false, false);
- }
+ static StringRef getSectionName() { return ELFTLSInfoSectionName; }
- void fixGOTEdge(Edge &E, Symbol &GOTEntry) {
- // If this is a PCRel32GOT/PCRel64GOT then change it to an ordinary
- // PCRel32/PCRel64. If it is a PCRel32GOTLoad then leave it as-is for now:
- // We will use the kind to check for GOT optimization opportunities in the
- // optimizeMachO_x86_64_GOTAndStubs pass below.
- // If it's a GOT64 leave it as is.
- switch (E.getKind()) {
- case PCRel32GOT:
- E.setKind(PCRel32);
- break;
- case PCRel64GOT:
- E.setKind(PCRel64);
- break;
- case GOT64:
- break;
- case PCRel32GOTLoad:
- break;
- default:
- llvm_unreachable("Unexpected GOT edge kind");
+ bool visitEdge(LinkGraph &G, Block *B, Edge &E) {
+ if (E.getKind() == x86_64::RequestTLSDescInGOTAndTransformToDelta32) {
+ LLVM_DEBUG({
+ dbgs() << " Fixing " << G.getEdgeKindName(E.getKind()) << " edge at "
+ << formatv("{0:x}", B->getFixupAddress(E)) << " ("
+ << formatv("{0:x}", B->getAddress()) << " + "
+ << formatv("{0:x}", E.getOffset()) << ")\n";
+ });
+ E.setKind(x86_64::Delta32);
+ E.setTarget(getEntryForTarget(G, E.getTarget()));
+ return true;
}
-
- E.setTarget(GOTEntry);
- // Leave the edge addend as-is.
+ return false;
}
- bool isExternalBranchEdge(Edge &E) {
- return E.getKind() == Branch32 && !E.getTarget().isDefined();
- }
-
- Symbol &createPLTStub(Symbol &Target) {
- auto &StubContentBlock =
- G.createContentBlock(getStubsSection(), getStubBlockContent(), 0, 1, 0);
- // Re-use GOT entries for stub targets.
- auto &GOTEntrySymbol = getGOTEntry(Target);
- StubContentBlock.addEdge(PCRel32, 2, GOTEntrySymbol, -4);
- return G.addAnonymousSymbol(StubContentBlock, 0, 6, true, false);
- }
-
- void fixPLTEdge(Edge &E, Symbol &Stub) {
- assert(E.getKind() == Branch32 && "Not a Branch32 edge?");
-
- // Set the edge kind to Branch32ToStub. We will use this to check for stub
- // optimization opportunities in the optimize ELF_x86_64_GOTAndStubs pass
- // below.
- E.setKind(Branch32ToStub);
- E.setTarget(Stub);
+ Symbol &createEntry(LinkGraph &G, Symbol &Target) {
+ // The TLS info entry's key value will be written by the fixTLVSectionByName
+ // pass, so create mutable content.
+ auto &TLSInfoEntry = G.createMutableContentBlock(
+ getTLSInfoSection(G), G.allocateContent(getTLSInfoEntryContent()), 0, 8,
+ 0);
+ TLSInfoEntry.addEdge(x86_64::Pointer64, 8, Target, 0);
+ return G.addAnonymousSymbol(TLSInfoEntry, 0, 16, false, false);
}
private:
- Section &getGOTSection() const {
- if (!GOTSection)
- GOTSection = &G.createSection(ELFGOTSectionName, sys::Memory::MF_READ);
- return *GOTSection;
- }
-
- Section &getStubsSection() const {
- if (!StubsSection) {
- auto StubsProt = static_cast<sys::Memory::ProtectionFlags>(
- sys::Memory::MF_READ | sys::Memory::MF_EXEC);
- StubsSection = &G.createSection("$__STUBS", StubsProt);
- }
- return *StubsSection;
- }
-
- ArrayRef<char> getGOTEntryBlockContent() {
- return {reinterpret_cast<const char *>(NullGOTEntryContent),
- sizeof(NullGOTEntryContent)};
+ Section &getTLSInfoSection(LinkGraph &G) {
+ if (!TLSInfoTable)
+ TLSInfoTable = &G.createSection(ELFTLSInfoSectionName, MemProt::Read);
+ return *TLSInfoTable;
}
- ArrayRef<char> getStubBlockContent() {
- return {reinterpret_cast<const char *>(StubContent), sizeof(StubContent)};
+ ArrayRef<char> getTLSInfoEntryContent() const {
+ return {reinterpret_cast<const char *>(TLSInfoEntryContent),
+ sizeof(TLSInfoEntryContent)};
}
- mutable Section *GOTSection = nullptr;
- mutable Section *StubsSection = nullptr;
+ Section *TLSInfoTable = nullptr;
};
-} // namespace
+const uint8_t TLSInfoTableManager_ELF_x86_64::TLSInfoEntryContent[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /*pthread key */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /*data address*/
+};
-const uint8_t PerGraphGOTAndPLTStubsBuilder_ELF_x86_64::NullGOTEntryContent[8] =
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
-const uint8_t PerGraphGOTAndPLTStubsBuilder_ELF_x86_64::StubContent[6] = {
- 0xFF, 0x25, 0x00, 0x00, 0x00, 0x00};
-
-static Error optimizeELF_x86_64_GOTAndStubs(LinkGraph &G) {
- LLVM_DEBUG(dbgs() << "Optimizing GOT entries and stubs:\n");
-
- for (auto *B : G.blocks())
- for (auto &E : B->edges())
- if (E.getKind() == PCRel32GOTLoad) {
- // Replace GOT load with LEA only for MOVQ instructions.
- constexpr uint8_t MOVQRIPRel[] = {0x48, 0x8b};
- if (E.getOffset() < 3 ||
- strncmp(B->getContent().data() + E.getOffset() - 3,
- reinterpret_cast<const char *>(MOVQRIPRel), 2) != 0)
- continue;
-
- auto &GOTBlock = E.getTarget().getBlock();
- assert(GOTBlock.getSize() == G.getPointerSize() &&
- "GOT entry block should be pointer sized");
- assert(GOTBlock.edges_size() == 1 &&
- "GOT entry should only have one outgoing edge");
-
- auto &GOTTarget = GOTBlock.edges().begin()->getTarget();
- JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset();
- JITTargetAddress TargetAddr = GOTTarget.getAddress();
-
- int64_t Displacement = TargetAddr - EdgeAddr + 4;
- if (Displacement >= std::numeric_limits<int32_t>::min() &&
- Displacement <= std::numeric_limits<int32_t>::max()) {
- // Change the edge kind as we don't go through GOT anymore. This is
- // for formal correctness only. Technically, the two relocation kinds
- // are resolved the same way.
- E.setKind(PCRel32);
- E.setTarget(GOTTarget);
- auto *BlockData = reinterpret_cast<uint8_t *>(
- const_cast<char *>(B->getContent().data()));
- BlockData[E.getOffset() - 2] = 0x8d;
- LLVM_DEBUG({
- dbgs() << " Replaced GOT load wih LEA:\n ";
- printEdge(dbgs(), *B, E, getELFX86RelocationKindName(E.getKind()));
- dbgs() << "\n";
- });
- }
- } else if (E.getKind() == Branch32ToStub) {
- auto &StubBlock = E.getTarget().getBlock();
- assert(
- StubBlock.getSize() ==
- sizeof(PerGraphGOTAndPLTStubsBuilder_ELF_x86_64::StubContent) &&
- "Stub block should be stub sized");
- assert(StubBlock.edges_size() == 1 &&
- "Stub block should only have one outgoing edge");
-
- auto &GOTBlock = StubBlock.edges().begin()->getTarget().getBlock();
- assert(GOTBlock.getSize() == G.getPointerSize() &&
- "GOT block should be pointer sized");
- assert(GOTBlock.edges_size() == 1 &&
- "GOT block should only have one outgoing edge");
-
- auto &GOTTarget = GOTBlock.edges().begin()->getTarget();
- JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset();
- JITTargetAddress TargetAddr = GOTTarget.getAddress();
-
- int64_t Displacement = TargetAddr - EdgeAddr + 4;
- if (Displacement >= std::numeric_limits<int32_t>::min() &&
- Displacement <= std::numeric_limits<int32_t>::max()) {
- E.setKind(Branch32);
- E.setTarget(GOTTarget);
- LLVM_DEBUG({
- dbgs() << " Replaced stub branch with direct branch:\n ";
- printEdge(dbgs(), *B, E, getELFX86RelocationKindName(E.getKind()));
- dbgs() << "\n";
- });
- }
- }
+Error buildTables_ELF_x86_64(LinkGraph &G) {
+ LLVM_DEBUG(dbgs() << "Visiting edges in graph:\n");
+ x86_64::GOTTableManager GOT;
+ x86_64::PLTTableManager PLT(GOT);
+ TLSInfoTableManager_ELF_x86_64 TLSInfo;
+ visitExistingEdges(G, GOT, PLT, TLSInfo);
return Error::success();
}
+} // namespace
+
+static const char *getELFX86_64RelocName(uint32_t Type) {
+ switch (Type) {
+#define ELF_RELOC(Name, Number) \
+ case Number: \
+ return #Name;
+#include "llvm/BinaryFormat/ELFRelocs/x86_64.def"
+#undef ELF_RELOC
+ }
+ return "Unrecognized ELF/x86-64 relocation type";
+}
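getELFX86_64RelocName relies on LLVM's X-macro pattern: ELFRelocs/x86_64.def is a flat list of ELF_RELOC(Name, Number) entries, and including it under the local #define turns each entry into a case label. A self-contained sketch of the same technique, with a hypothetical inline entry list standing in for the .def file:

#include <cstdint>

// Hypothetical two-entry relocation list; the real code re-includes the
// shared .def file instead.
#define EXAMPLE_RELOCS(X)                                                      \
  X(R_EXAMPLE_NONE, 0)                                                         \
  X(R_EXAMPLE_PC32, 2)

static const char *exampleRelocName(uint32_t Type) {
  switch (Type) {
#define ELF_RELOC(Name, Number)                                                \
  case Number:                                                                 \
    return #Name;
    EXAMPLE_RELOCS(ELF_RELOC)
#undef ELF_RELOC
  }
  return "unknown relocation";
}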
namespace llvm {
namespace jitlink {
@@ -228,10 +114,13 @@ namespace jitlink {
// generic
class ELFLinkGraphBuilder_x86_64 : public ELFLinkGraphBuilder<object::ELF64LE> {
private:
+ using ELFT = object::ELF64LE;
static Expected<ELF_x86_64_Edges::ELFX86RelocationKind>
getRelocationKind(const uint32_t Type) {
switch (Type) {
+ case ELF::R_X86_64_32S:
+ return ELF_x86_64_Edges::ELFX86RelocationKind::Pointer32Signed;
case ELF::R_X86_64_PC32:
return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32;
case ELF::R_X86_64_PC64:
@@ -240,9 +129,11 @@ private:
case ELF::R_X86_64_64:
return ELF_x86_64_Edges::ELFX86RelocationKind::Pointer64;
case ELF::R_X86_64_GOTPCREL:
+ return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32GOTLoad;
case ELF::R_X86_64_GOTPCRELX:
+ return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32GOTLoadRelaxable;
case ELF::R_X86_64_REX_GOTPCRELX:
- return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32GOTLoad;
+ return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32REXGOTLoadRelaxable;
case ELF::R_X86_64_GOTPCREL64:
return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel64GOT;
case ELF::R_X86_64_GOT64:
@@ -251,109 +142,121 @@ private:
return ELF_x86_64_Edges::ELFX86RelocationKind::GOTOFF64;
case ELF::R_X86_64_PLT32:
return ELF_x86_64_Edges::ELFX86RelocationKind::Branch32;
+ case ELF::R_X86_64_TLSGD:
+ return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32TLV;
}
- return make_error<JITLinkError>("Unsupported x86-64 relocation:" +
- formatv("{0:d}", Type));
+ return make_error<JITLinkError>("Unsupported x86-64 relocation type " +
+ formatv("{0:d}: ", Type) +
+ getELFX86_64RelocName(Type));
}
Error addRelocations() override {
- LLVM_DEBUG(dbgs() << "Adding relocations\n");
- // TODO a partern is forming of iterate some sections but only give me
- // ones I am interested, i should abstract that concept some where
- for (auto &SecRef : Sections) {
- if (SecRef.sh_type != ELF::SHT_RELA && SecRef.sh_type != ELF::SHT_REL)
- continue;
- // TODO can the elf obj file do this for me?
- if (SecRef.sh_type == ELF::SHT_REL)
- return make_error<llvm::StringError>("Shouldn't have REL in x64",
- llvm::inconvertibleErrorCode());
-
- auto RelSectName = Obj.getSectionName(SecRef);
- if (!RelSectName)
- return RelSectName.takeError();
+ LLVM_DEBUG(dbgs() << "Processing relocations:\n");
+
+ using Base = ELFLinkGraphBuilder<ELFT>;
+ using Self = ELFLinkGraphBuilder_x86_64;
+ for (const auto &RelSect : Base::Sections) {
+ // Validate the section to read relocation entries from.
+ if (RelSect.sh_type == ELF::SHT_REL)
+ return make_error<StringError>(
+ "No SHT_REL in valid x64 ELF object files",
+ inconvertibleErrorCode());
+
+ if (Error Err = Base::forEachRelocation(RelSect, this,
+ &Self::addSingleRelocation))
+ return Err;
+ }
- LLVM_DEBUG({
- dbgs() << "Adding relocations from section " << *RelSectName << "\n";
- });
+ return Error::success();
+ }
- auto UpdateSection = Obj.getSection(SecRef.sh_info);
- if (!UpdateSection)
- return UpdateSection.takeError();
-
- auto UpdateSectionName = Obj.getSectionName(**UpdateSection);
- if (!UpdateSectionName)
- return UpdateSectionName.takeError();
-
- // Don't process relocations for debug sections.
- if (isDwarfSection(*UpdateSectionName)) {
- LLVM_DEBUG({
- dbgs() << " Target is dwarf section " << *UpdateSectionName
- << ". Skipping.\n";
- });
- continue;
- } else
- LLVM_DEBUG({
- dbgs() << " For target section " << *UpdateSectionName << "\n";
- });
-
- auto JITSection = G->findSectionByName(*UpdateSectionName);
- if (!JITSection)
- return make_error<llvm::StringError>(
- "Refencing a a section that wasn't added to graph" +
- *UpdateSectionName,
- llvm::inconvertibleErrorCode());
-
- auto Relocations = Obj.relas(SecRef);
- if (!Relocations)
- return Relocations.takeError();
-
- for (const auto &Rela : *Relocations) {
- auto Type = Rela.getType(false);
-
- LLVM_DEBUG({
- dbgs() << "Relocation Type: " << Type << "\n"
- << "Name: " << Obj.getRelocationTypeName(Type) << "\n";
- });
- auto SymbolIndex = Rela.getSymbol(false);
- auto Symbol = Obj.getRelocationSymbol(Rela, SymTabSec);
- if (!Symbol)
- return Symbol.takeError();
-
- auto BlockToFix = *(JITSection->blocks().begin());
- auto *TargetSymbol = getGraphSymbol(SymbolIndex);
-
- if (!TargetSymbol) {
- return make_error<llvm::StringError>(
- "Could not find symbol at given index, did you add it to "
- "JITSymbolTable? index: " +
- std::to_string(SymbolIndex) +
- ", shndx: " + std::to_string((*Symbol)->st_shndx) +
- " Size of table: " + std::to_string(GraphSymbols.size()),
- llvm::inconvertibleErrorCode());
- }
- uint64_t Addend = Rela.r_addend;
- JITTargetAddress FixupAddress =
- (*UpdateSection)->sh_addr + Rela.r_offset;
-
- LLVM_DEBUG({
- dbgs() << "Processing relocation at "
- << format("0x%016" PRIx64, FixupAddress) << "\n";
- });
- auto Kind = getRelocationKind(Type);
- if (!Kind)
- return Kind.takeError();
-
- LLVM_DEBUG({
- Edge GE(*Kind, FixupAddress - BlockToFix->getAddress(), *TargetSymbol,
- Addend);
- printEdge(dbgs(), *BlockToFix, GE,
- getELFX86RelocationKindName(*Kind));
- dbgs() << "\n";
- });
- BlockToFix->addEdge(*Kind, FixupAddress - BlockToFix->getAddress(),
- *TargetSymbol, Addend);
- }
+ Error addSingleRelocation(const typename ELFT::Rela &Rel,
+ const typename ELFT::Shdr &FixupSection,
+ Section &GraphSection) {
+ using Base = ELFLinkGraphBuilder<ELFT>;
+
+ uint32_t SymbolIndex = Rel.getSymbol(false);
+ auto ObjSymbol = Base::Obj.getRelocationSymbol(Rel, Base::SymTabSec);
+ if (!ObjSymbol)
+ return ObjSymbol.takeError();
+
+ Symbol *GraphSymbol = Base::getGraphSymbol(SymbolIndex);
+ if (!GraphSymbol)
+ return make_error<StringError>(
+ formatv("Could not find symbol at given index, did you add it to "
+ "JITSymbolTable? index: {0}, shndx: {1} Size of table: {2}",
+ SymbolIndex, (*ObjSymbol)->st_shndx,
+ Base::GraphSymbols.size()),
+ inconvertibleErrorCode());
+
+ // Validate the relocation kind.
+ auto ELFRelocKind = getRelocationKind(Rel.getType(false));
+ if (!ELFRelocKind)
+ return ELFRelocKind.takeError();
+
+ int64_t Addend = Rel.r_addend;
+ Edge::Kind Kind = Edge::Invalid;
+ switch (*ELFRelocKind) {
+ case PCRel32:
+ Kind = x86_64::Delta32;
+ break;
+ case Delta64:
+ Kind = x86_64::Delta64;
+ break;
+ case Pointer32Signed:
+ Kind = x86_64::Pointer32Signed;
+ break;
+ case Pointer64:
+ Kind = x86_64::Pointer64;
+ break;
+ case PCRel32GOTLoad: {
+ Kind = x86_64::RequestGOTAndTransformToDelta32;
+ break;
+ }
+ case PCRel32REXGOTLoadRelaxable: {
+ Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable;
+ Addend = 0;
+ break;
+ }
+ case PCRel32TLV: {
+ Kind = x86_64::RequestTLSDescInGOTAndTransformToDelta32;
+ break;
+ }
+ case PCRel32GOTLoadRelaxable: {
+ Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable;
+ Addend = 0;
+ break;
+ }
+ case PCRel64GOT: {
+ Kind = x86_64::RequestGOTAndTransformToDelta64;
+ break;
+ }
+ case GOT64: {
+ Kind = x86_64::RequestGOTAndTransformToDelta64FromGOT;
+ break;
+ }
+ case GOTOFF64: {
+ Kind = x86_64::Delta64FromGOT;
+ break;
+ }
+ case Branch32: {
+ Kind = x86_64::BranchPCRel32;
+ Addend = 0;
+ break;
}
+ }
+
+ Block *BlockToFix = *(GraphSection.blocks().begin());
+ JITTargetAddress FixupAddress = FixupSection.sh_addr + Rel.r_offset;
+ Edge::OffsetT Offset = FixupAddress - BlockToFix->getAddress();
+ Edge GE(Kind, Offset, *GraphSymbol, Addend);
+ LLVM_DEBUG({
+ dbgs() << " ";
+ printEdge(dbgs(), *BlockToFix, GE, getELFX86RelocationKindName(Kind));
+ dbgs() << "\n";
+ });
+
+ BlockToFix->addEdge(std::move(GE));
return Error::success();
}
@@ -361,7 +264,7 @@ public:
ELFLinkGraphBuilder_x86_64(StringRef FileName,
const object::ELFFile<object::ELF64LE> &Obj)
: ELFLinkGraphBuilder(Obj, Triple("x86_64-unknown-linux"), FileName,
- getELFX86RelocationKindName) {}
+ x86_64::getEdgeKindName) {}
};
class ELFJITLinker_x86_64 : public JITLinker<ELFJITLinker_x86_64> {
@@ -384,7 +287,8 @@ private:
createDefineExternalSectionStartAndEndSymbolsPass(
[&](LinkGraph &LG, Symbol &Sym) -> SectionRangeSymbolDesc {
if (Sym.getName() == ELFGOTSymbolName)
- if (auto *GOTSection = G.findSectionByName(ELFGOTSectionName)) {
+ if (auto *GOTSection = G.findSectionByName(
+ x86_64::GOTTableManager::getSectionName())) {
GOTSymbol = &Sym;
return {*GOTSection, true};
}
@@ -403,7 +307,8 @@ private:
// Otherwise look for a GOT section: If it already has a start symbol we'll
// record it, otherwise we'll create our own.
// If there's a GOT section but we didn't find an external GOT symbol...
- if (auto *GOTSection = G.findSectionByName(ELFGOTSectionName)) {
+ if (auto *GOTSection =
+ G.findSectionByName(x86_64::GOTTableManager::getSectionName())) {
// Check for an existing defined symbol.
for (auto *Sym : GOTSection->symbols())
@@ -427,81 +332,7 @@ private:
}
Error applyFixup(LinkGraph &G, Block &B, const Edge &E) const {
- using namespace ELF_x86_64_Edges;
- using namespace llvm::support;
-
- char *BlockWorkingMem = B.getAlreadyMutableContent().data();
- char *FixupPtr = BlockWorkingMem + E.getOffset();
- JITTargetAddress FixupAddress = B.getAddress() + E.getOffset();
- switch (E.getKind()) {
- case ELFX86RelocationKind::Branch32:
- case ELFX86RelocationKind::Branch32ToStub:
- case ELFX86RelocationKind::PCRel32:
- case ELFX86RelocationKind::PCRel32GOTLoad: {
- int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress;
- if (LLVM_LIKELY(x86_64::isInRangeForImmS32(Value)))
- *(little32_t *)FixupPtr = Value;
- else
- return makeTargetOutOfRangeError(G, B, E);
- break;
- }
- case ELFX86RelocationKind::PCRel64: {
- int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress;
- *(little64_t *)FixupPtr = Value;
- break;
- }
- case ELFX86RelocationKind::Pointer64: {
- int64_t Value = E.getTarget().getAddress() + E.getAddend();
- *(ulittle64_t *)FixupPtr = Value;
- break;
- }
- case ELFX86RelocationKind::Delta32: {
- int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress;
- if (LLVM_LIKELY(x86_64::isInRangeForImmS32(Value)))
- *(little32_t *)FixupPtr = Value;
- else
- return makeTargetOutOfRangeError(G, B, E);
- break;
- }
- case ELFX86RelocationKind::Delta64: {
- int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress;
- *(little64_t *)FixupPtr = Value;
- break;
- }
- case ELFX86RelocationKind::NegDelta32: {
- int64_t Value = FixupAddress - E.getTarget().getAddress() + E.getAddend();
- if (LLVM_LIKELY(x86_64::isInRangeForImmS32(Value)))
- *(little32_t *)FixupPtr = Value;
- else
- return makeTargetOutOfRangeError(G, B, E);
- break;
- }
- case ELFX86RelocationKind::NegDelta64: {
- int64_t Value = FixupAddress - E.getTarget().getAddress() + E.getAddend();
- *(little64_t *)FixupPtr = Value;
- break;
- }
- case ELFX86RelocationKind::GOT64:
- case ELFX86RelocationKind::GOTOFF64: {
- // GOT64: Offset of GOT entry within GOT.
- // GOTOFF64: Offset from GOT base to target.
- // The expressions are the same in both cases, but in the GOT64 case the
- // edge will have been fixed to point at the GOT entry, and in the
- // GOTOFF64 case it will still point at the original target.
- assert(GOTSymbol && "No GOT section symbol");
- int64_t Value =
- E.getTarget().getAddress() - GOTSymbol->getAddress() + E.getAddend();
- *(little64_t *)FixupPtr = Value;
- break;
- }
- default:
- LLVM_DEBUG({
- dbgs() << "Bad edge: " << getELFX86RelocationKindName(E.getKind())
- << "\n";
- });
- llvm_unreachable("Unsupported relocation");
- }
- return Error::success();
+ return x86_64::applyFixup(G, B, E, GOTSymbol);
}
};
@@ -547,8 +378,9 @@ void link_ELF_x86_64(std::unique_ptr<LinkGraph> G,
if (Ctx->shouldAddDefaultTargetPasses(G->getTargetTriple())) {
Config.PrePrunePasses.push_back(EHFrameSplitter(".eh_frame"));
- Config.PrePrunePasses.push_back(EHFrameEdgeFixer(
- ".eh_frame", G->getPointerSize(), Delta64, Delta32, NegDelta32));
+ Config.PrePrunePasses.push_back(
+ EHFrameEdgeFixer(".eh_frame", x86_64::PointerSize, x86_64::Delta64,
+ x86_64::Delta32, x86_64::NegDelta32));
Config.PrePrunePasses.push_back(EHFrameNullTerminator(".eh_frame"));
// Construct a JITLinker and run the link function.
@@ -558,9 +390,8 @@ void link_ELF_x86_64(std::unique_ptr<LinkGraph> G,
else
Config.PrePrunePasses.push_back(markAllSymbolsLive);
- // Add an in-place GOT/Stubs pass.
- Config.PostPrunePasses.push_back(
- PerGraphGOTAndPLTStubsBuilder_ELF_x86_64::asPass);
+ // Add an in-place GOT/Stubs/TLSInfoEntry build pass.
+ Config.PostPrunePasses.push_back(buildTables_ELF_x86_64);
// Resolve any external section start / end symbols.
Config.PostAllocationPasses.push_back(
@@ -568,7 +399,7 @@ void link_ELF_x86_64(std::unique_ptr<LinkGraph> G,
identifyELFSectionStartAndEndSymbols));
// Add GOT/Stubs optimizer pass.
- Config.PreFixupPasses.push_back(optimizeELF_x86_64_GOTAndStubs);
+ Config.PreFixupPasses.push_back(x86_64::optimizeGOTAndStubAccesses);
}
if (auto Err = Ctx->modifyPassConfig(*G, Config))
@@ -580,44 +411,26 @@ const char *getELFX86RelocationKindName(Edge::Kind R) {
switch (R) {
case Branch32:
return "Branch32";
- case Branch32ToStub:
- return "Branch32ToStub";
- case Pointer32:
- return "Pointer32";
+ case Pointer32Signed:
+ return "Pointer32Signed";
case Pointer64:
return "Pointer64";
- case Pointer64Anon:
- return "Pointer64Anon";
case PCRel32:
return "PCRel32";
- case PCRel32Minus1:
- return "PCRel32Minus1";
- case PCRel32Minus2:
- return "PCRel32Minus2";
- case PCRel32Minus4:
- return "PCRel32Minus4";
- case PCRel32Anon:
- return "PCRel32Anon";
- case PCRel32Minus1Anon:
- return "PCRel32Minus1Anon";
- case PCRel32Minus2Anon:
- return "PCRel32Minus2Anon";
- case PCRel32Minus4Anon:
- return "PCRel32Minus4Anon";
case PCRel32GOTLoad:
return "PCRel32GOTLoad";
- case PCRel32GOT:
- return "PCRel32GOT";
- case PCRel32TLV:
- return "PCRel32TLV";
- case Delta32:
- return "Delta32";
+ case PCRel32GOTLoadRelaxable:
+ return "PCRel32GOTLoadRelaxable";
+ case PCRel32REXGOTLoadRelaxable:
+ return "PCRel32REXGOTLoad";
+ case PCRel64GOT:
+ return "PCRel64GOT";
case Delta64:
return "Delta64";
- case NegDelta32:
- return "NegDelta32";
- case NegDelta64:
- return "NegDelta64";
+ case GOT64:
+ return "GOT64";
+ case GOTOFF64:
+ return "GOTOFF64";
}
return getGenericEdgeKindName(static_cast<Edge::Kind>(R));
}
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
index a4976f2f3d27..51dcc1c35fad 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
@@ -1,9 +1,8 @@
//===------------- JITLink.cpp - Core Run-time JIT linker APIs ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -214,7 +213,12 @@ Block &LinkGraph::splitBlock(Block &B, size_t SplitIndex,
// Transfer all symbols with offset less than SplitIndex to NewBlock.
while (!BlockSymbols.empty() &&
BlockSymbols.back()->getOffset() < SplitIndex) {
- BlockSymbols.back()->setBlock(NewBlock);
+ auto *Sym = BlockSymbols.back();
+ // If the symbol extends beyond the split, update the size to be within
+ // the new block.
+ if (Sym->getOffset() + Sym->getSize() > SplitIndex)
+ Sym->setSize(SplitIndex - Sym->getOffset());
+ Sym->setBlock(NewBlock);
BlockSymbols.pop_back();
}
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp
index 5b163ab6316d..706688aba4ec 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp
@@ -48,12 +48,21 @@ void JITLinkerBase::linkPhase1(std::unique_ptr<JITLinkerBase> Self) {
if (auto Err = runPasses(Passes.PostPrunePasses))
return Ctx->notifyFailed(std::move(Err));
- // Sort blocks into segments.
- auto Layout = layOutBlocks();
+ Ctx->getMemoryManager().allocate(
+ Ctx->getJITLinkDylib(), *G,
+ [S = std::move(Self)](AllocResult AR) mutable {
+ auto *TmpSelf = S.get();
+ TmpSelf->linkPhase2(std::move(S), std::move(AR));
+ });
+}
- // Allocate memory for segments.
- if (auto Err = allocateSegments(Layout))
- return Ctx->notifyFailed(std::move(Err));
+void JITLinkerBase::linkPhase2(std::unique_ptr<JITLinkerBase> Self,
+ AllocResult AR) {
+
+ if (AR)
+ Alloc = std::move(*AR);
+ else
+ return Ctx->notifyFailed(AR.takeError());
LLVM_DEBUG({
dbgs() << "Link graph \"" << G->getName()
@@ -73,16 +82,16 @@ void JITLinkerBase::linkPhase1(std::unique_ptr<JITLinkerBase> Self) {
auto ExternalSymbols = getExternalSymbolNames();
- // If there are no external symbols then proceed immediately with phase 2.
+ // If there are no external symbols then proceed immediately with phase 3.
if (ExternalSymbols.empty()) {
LLVM_DEBUG({
dbgs() << "No external symbols for " << G->getName()
- << ". Proceeding immediately with link phase 2.\n";
+ << ". Proceeding immediately with link phase 3.\n";
});
// FIXME: Once callee expressions are defined to be sequenced before
// argument expressions (c++17) we can simplify this. See below.
auto &TmpSelf = *Self;
- TmpSelf.linkPhase2(std::move(Self), AsyncLookupResult(), std::move(Layout));
+ TmpSelf.linkPhase3(std::move(Self), AsyncLookupResult());
return;
}
@@ -100,37 +109,31 @@ void JITLinkerBase::linkPhase1(std::unique_ptr<JITLinkerBase> Self) {
//
// Ctx->lookup(std::move(UnresolvedExternals),
// [Self=std::move(Self)](Expected<AsyncLookupResult> Result) {
- // Self->linkPhase2(std::move(Self), std::move(Result));
+ // Self->linkPhase3(std::move(Self), std::move(Result));
// });
- auto *TmpCtx = Ctx.get();
- TmpCtx->lookup(std::move(ExternalSymbols),
- createLookupContinuation(
- [S = std::move(Self), L = std::move(Layout)](
- Expected<AsyncLookupResult> LookupResult) mutable {
- auto &TmpSelf = *S;
- TmpSelf.linkPhase2(std::move(S), std::move(LookupResult),
- std::move(L));
- }));
+ Ctx->lookup(std::move(ExternalSymbols),
+ createLookupContinuation(
+ [S = std::move(Self)](
+ Expected<AsyncLookupResult> LookupResult) mutable {
+ auto &TmpSelf = *S;
+ TmpSelf.linkPhase3(std::move(S), std::move(LookupResult));
+ }));
}
-void JITLinkerBase::linkPhase2(std::unique_ptr<JITLinkerBase> Self,
- Expected<AsyncLookupResult> LR,
- SegmentLayoutMap Layout) {
+void JITLinkerBase::linkPhase3(std::unique_ptr<JITLinkerBase> Self,
+ Expected<AsyncLookupResult> LR) {
LLVM_DEBUG({
- dbgs() << "Starting link phase 2 for graph " << G->getName() << "\n";
+ dbgs() << "Starting link phase 3 for graph " << G->getName() << "\n";
});
// If the lookup failed, bail out.
if (!LR)
- return deallocateAndBailOut(LR.takeError());
+ return abandonAllocAndBailOut(std::move(Self), LR.takeError());
// Assign addresses to external addressables.
applyLookupResult(*LR);
- // Copy block content to working memory.
- copyBlockContentToWorkingMemory(Layout, *Alloc);
-
LLVM_DEBUG({
dbgs() << "Link graph \"" << G->getName()
<< "\" before pre-fixup passes:\n";
@@ -138,7 +141,7 @@ void JITLinkerBase::linkPhase2(std::unique_ptr<JITLinkerBase> Self,
});
if (auto Err = runPasses(Passes.PreFixupPasses))
- return deallocateAndBailOut(std::move(Err));
+ return abandonAllocAndBailOut(std::move(Self), std::move(Err));
LLVM_DEBUG({
dbgs() << "Link graph \"" << G->getName() << "\" before copy-and-fixup:\n";
@@ -147,7 +150,7 @@ void JITLinkerBase::linkPhase2(std::unique_ptr<JITLinkerBase> Self,
// Fix up block content.
if (auto Err = fixUpBlocks(*G))
- return deallocateAndBailOut(std::move(Err));
+ return abandonAllocAndBailOut(std::move(Self), std::move(Err));
LLVM_DEBUG({
dbgs() << "Link graph \"" << G->getName() << "\" after copy-and-fixup:\n";
@@ -155,27 +158,25 @@ void JITLinkerBase::linkPhase2(std::unique_ptr<JITLinkerBase> Self,
});
if (auto Err = runPasses(Passes.PostFixupPasses))
- return deallocateAndBailOut(std::move(Err));
-
- // FIXME: Use move capture once we have c++14.
- auto *UnownedSelf = Self.release();
- auto Phase3Continuation = [UnownedSelf](Error Err) {
- std::unique_ptr<JITLinkerBase> Self(UnownedSelf);
- UnownedSelf->linkPhase3(std::move(Self), std::move(Err));
- };
+ return abandonAllocAndBailOut(std::move(Self), std::move(Err));
- Alloc->finalizeAsync(std::move(Phase3Continuation));
+ Alloc->finalize([S = std::move(Self)](FinalizeResult FR) mutable {
+ auto *TmpSelf = S.get();
+ TmpSelf->linkPhase4(std::move(S), std::move(FR));
+ });
}
-void JITLinkerBase::linkPhase3(std::unique_ptr<JITLinkerBase> Self, Error Err) {
+void JITLinkerBase::linkPhase4(std::unique_ptr<JITLinkerBase> Self,
+ FinalizeResult FR) {
LLVM_DEBUG({
- dbgs() << "Starting link phase 3 for graph " << G->getName() << "\n";
+ dbgs() << "Starting link phase 4 for graph " << G->getName() << "\n";
});
- if (Err)
- return deallocateAndBailOut(std::move(Err));
- Ctx->notifyFinalized(std::move(Alloc));
+ if (!FR)
+ return Ctx->notifyFailed(FR.takeError());
+
+ Ctx->notifyFinalized(std::move(*FR));
LLVM_DEBUG({ dbgs() << "Link of graph " << G->getName() << " complete\n"; });
}
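
Each link phase above hands ownership of the linker to the next one: JITLinkerBase owns itself through a std::unique_ptr that is moved into every continuation, and a reference (TmpSelf) is taken before the move so the call never reads through a moved-from pointer. A minimal standalone sketch of that idiom follows; the names (AsyncStep, phaseA/phaseB) are illustrative only and not JITLink API.

#include <functional>
#include <memory>
#include <utility>

struct AsyncStep {
  // Phase 1: hand off to an externally owned continuation (the patch passes
  // such continuations to Ctx->lookup and Alloc->finalize).
  std::function<void()> phaseA(std::unique_ptr<AsyncStep> Self) {
    return [S = std::move(Self)]() mutable {
      // Take a reference before moving S into the argument list so the call
      // never reads a moved-from unique_ptr (the TmpSelf idiom above).
      auto &TmpSelf = *S;
      TmpSelf.phaseB(std::move(S));
    };
  }

  // Phase 2: the parameter Self is now the sole owner; *this is destroyed
  // when it goes out of scope at the end of this call.
  void phaseB(std::unique_ptr<AsyncStep> Self) { (void)Self; }
};

int main() {
  auto Step = std::make_unique<AsyncStep>();
  auto &S = *Step;
  auto Continuation = S.phaseA(std::move(Step));
  Continuation(); // "later": the asynchronous event fires
  return 0;
}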
@@ -187,131 +188,6 @@ Error JITLinkerBase::runPasses(LinkGraphPassList &Passes) {
return Error::success();
}
-JITLinkerBase::SegmentLayoutMap JITLinkerBase::layOutBlocks() {
-
- SegmentLayoutMap Layout;
-
- /// Partition blocks based on permissions and content vs. zero-fill.
- for (auto *B : G->blocks()) {
- auto &SegLists = Layout[B->getSection().getProtectionFlags()];
- if (!B->isZeroFill())
- SegLists.ContentBlocks.push_back(B);
- else
- SegLists.ZeroFillBlocks.push_back(B);
- }
-
- /// Sort blocks within each list.
- for (auto &KV : Layout) {
-
- auto CompareBlocks = [](const Block *LHS, const Block *RHS) {
- // Sort by section, address and size
- if (LHS->getSection().getOrdinal() != RHS->getSection().getOrdinal())
- return LHS->getSection().getOrdinal() < RHS->getSection().getOrdinal();
- if (LHS->getAddress() != RHS->getAddress())
- return LHS->getAddress() < RHS->getAddress();
- return LHS->getSize() < RHS->getSize();
- };
-
- auto &SegLists = KV.second;
- llvm::sort(SegLists.ContentBlocks, CompareBlocks);
- llvm::sort(SegLists.ZeroFillBlocks, CompareBlocks);
- }
-
- LLVM_DEBUG({
- dbgs() << "Computed segment ordering:\n";
- for (auto &KV : Layout) {
- dbgs() << " Segment "
- << static_cast<sys::Memory::ProtectionFlags>(KV.first) << ":\n";
- auto &SL = KV.second;
- for (auto &SIEntry :
- {std::make_pair(&SL.ContentBlocks, "content block"),
- std::make_pair(&SL.ZeroFillBlocks, "zero-fill block")}) {
- dbgs() << " " << SIEntry.second << ":\n";
- for (auto *B : *SIEntry.first)
- dbgs() << " " << *B << "\n";
- }
- }
- });
-
- return Layout;
-}
-
-Error JITLinkerBase::allocateSegments(const SegmentLayoutMap &Layout) {
-
- // Compute segment sizes and allocate memory.
- LLVM_DEBUG(dbgs() << "JIT linker requesting: { ");
- JITLinkMemoryManager::SegmentsRequestMap Segments;
- for (auto &KV : Layout) {
- auto &Prot = KV.first;
- auto &SegLists = KV.second;
-
- uint64_t SegAlign = 1;
-
- // Calculate segment content size.
- size_t SegContentSize = 0;
- for (auto *B : SegLists.ContentBlocks) {
- SegAlign = std::max(SegAlign, B->getAlignment());
- SegContentSize = alignToBlock(SegContentSize, *B);
- SegContentSize += B->getSize();
- }
-
- uint64_t SegZeroFillStart = SegContentSize;
- uint64_t SegZeroFillEnd = SegZeroFillStart;
-
- for (auto *B : SegLists.ZeroFillBlocks) {
- SegAlign = std::max(SegAlign, B->getAlignment());
- SegZeroFillEnd = alignToBlock(SegZeroFillEnd, *B);
- SegZeroFillEnd += B->getSize();
- }
-
- Segments[Prot] = {SegAlign, SegContentSize,
- SegZeroFillEnd - SegZeroFillStart};
-
- LLVM_DEBUG({
- dbgs() << (&KV == &*Layout.begin() ? "" : "; ")
- << static_cast<sys::Memory::ProtectionFlags>(Prot)
- << ": alignment = " << SegAlign
- << ", content size = " << SegContentSize
- << ", zero-fill size = " << (SegZeroFillEnd - SegZeroFillStart);
- });
- }
- LLVM_DEBUG(dbgs() << " }\n");
-
- if (auto AllocOrErr =
- Ctx->getMemoryManager().allocate(Ctx->getJITLinkDylib(), Segments))
- Alloc = std::move(*AllocOrErr);
- else
- return AllocOrErr.takeError();
-
- LLVM_DEBUG({
- dbgs() << "JIT linker got memory (working -> target):\n";
- for (auto &KV : Layout) {
- auto Prot = static_cast<sys::Memory::ProtectionFlags>(KV.first);
- dbgs() << " " << Prot << ": "
- << (const void *)Alloc->getWorkingMemory(Prot).data() << " -> "
- << formatv("{0:x16}", Alloc->getTargetMemory(Prot)) << "\n";
- }
- });
-
- // Update block target addresses.
- for (auto &KV : Layout) {
- auto &Prot = KV.first;
- auto &SL = KV.second;
-
- JITTargetAddress NextBlockAddr =
- Alloc->getTargetMemory(static_cast<sys::Memory::ProtectionFlags>(Prot));
-
- for (auto *SIList : {&SL.ContentBlocks, &SL.ZeroFillBlocks})
- for (auto *B : *SIList) {
- NextBlockAddr = alignToBlock(NextBlockAddr, *B);
- B->setAddress(NextBlockAddr);
- NextBlockAddr += B->getSize();
- }
- }
-
- return Error::success();
-}
-
JITLinkContext::LookupMap JITLinkerBase::getExternalSymbolNames() const {
// Identify unresolved external symbols.
JITLinkContext::LookupMap UnresolvedExternals;
@@ -351,81 +227,13 @@ void JITLinkerBase::applyLookupResult(AsyncLookupResult Result) {
});
}
-void JITLinkerBase::copyBlockContentToWorkingMemory(
- const SegmentLayoutMap &Layout, JITLinkMemoryManager::Allocation &Alloc) {
-
- LLVM_DEBUG(dbgs() << "Copying block content:\n");
- for (auto &KV : Layout) {
- auto &Prot = KV.first;
- auto &SegLayout = KV.second;
-
- auto SegMem =
- Alloc.getWorkingMemory(static_cast<sys::Memory::ProtectionFlags>(Prot));
- char *LastBlockEnd = SegMem.data();
- char *BlockDataPtr = LastBlockEnd;
-
- LLVM_DEBUG({
- dbgs() << " Processing segment "
- << static_cast<sys::Memory::ProtectionFlags>(Prot) << " [ "
- << (const void *)SegMem.data() << " .. "
- << (const void *)((char *)SegMem.data() + SegMem.size())
- << " ]\n Processing content sections:\n";
- });
-
- for (auto *B : SegLayout.ContentBlocks) {
- LLVM_DEBUG(dbgs() << " " << *B << ":\n");
-
- // Pad to alignment/alignment-offset.
- BlockDataPtr = alignToBlock(BlockDataPtr, *B);
-
- LLVM_DEBUG({
- dbgs() << " Bumped block pointer to " << (const void *)BlockDataPtr
- << " to meet block alignment " << B->getAlignment()
- << " and alignment offset " << B->getAlignmentOffset() << "\n";
- });
-
- // Zero pad up to alignment.
- LLVM_DEBUG({
- if (LastBlockEnd != BlockDataPtr)
- dbgs() << " Zero padding from " << (const void *)LastBlockEnd
- << " to " << (const void *)BlockDataPtr << "\n";
- });
-
- while (LastBlockEnd != BlockDataPtr)
- *LastBlockEnd++ = 0;
-
- // Copy initial block content.
- LLVM_DEBUG({
- dbgs() << " Copying block " << *B << " content, "
- << B->getContent().size() << " bytes, from "
- << (const void *)B->getContent().data() << " to "
- << (const void *)BlockDataPtr << "\n";
- });
- memcpy(BlockDataPtr, B->getContent().data(), B->getContent().size());
-
- // Point the block's content to the fixed up buffer.
- B->setMutableContent({BlockDataPtr, B->getContent().size()});
-
- // Update block end pointer.
- LastBlockEnd = BlockDataPtr + B->getContent().size();
- BlockDataPtr = LastBlockEnd;
- }
-
- // Zero pad the rest of the segment.
- LLVM_DEBUG({
- dbgs() << " Zero padding end of segment from "
- << (const void *)LastBlockEnd << " to "
- << (const void *)((char *)SegMem.data() + SegMem.size()) << "\n";
- });
- while (LastBlockEnd != SegMem.data() + SegMem.size())
- *LastBlockEnd++ = 0;
- }
-}
-
-void JITLinkerBase::deallocateAndBailOut(Error Err) {
+void JITLinkerBase::abandonAllocAndBailOut(std::unique_ptr<JITLinkerBase> Self,
+ Error Err) {
assert(Err && "Should not be bailing out on success value");
- assert(Alloc && "can not call deallocateAndBailOut before allocation");
- Ctx->notifyFailed(joinErrors(std::move(Err), Alloc->deallocate()));
+ assert(Alloc && "can not call abandonAllocAndBailOut before allocation");
+ Alloc->abandon([S = std::move(Self), E1 = std::move(Err)](Error E2) mutable {
+ S->Ctx->notifyFailed(joinErrors(std::move(E1), std::move(E2)));
+ });
}
void prune(LinkGraph &G) {
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
index 6b815fe4fb31..e4fdda0783a4 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
@@ -42,14 +42,9 @@ public:
virtual ~JITLinkerBase();
protected:
- struct SegmentLayout {
- using BlocksList = std::vector<Block *>;
-
- BlocksList ContentBlocks;
- BlocksList ZeroFillBlocks;
- };
-
- using SegmentLayoutMap = DenseMap<unsigned, SegmentLayout>;
+ using InFlightAlloc = JITLinkMemoryManager::InFlightAlloc;
+ using AllocResult = Expected<std::unique_ptr<InFlightAlloc>>;
+ using FinalizeResult = Expected<JITLinkMemoryManager::FinalizedAlloc>;
// Returns the PassConfiguration for this instance. This can be used by
// JITLinkerBase implementations to add late passes that reference their
@@ -61,39 +56,27 @@ protected:
// 1.1: Run pre-prune passes
// 1.2: Prune graph
// 1.3: Run post-prune passes
- // 1.4: Sort blocks into segments
- // 1.5: Allocate segment memory, update node vmaddrs to target vmaddrs
- // 1.6: Run post-allocation passes
- // 1.7: Notify context of final assigned symbol addresses
- // 1.8: Identify external symbols and make an async call to resolve
+ // 1.4: Allocate memory.
void linkPhase1(std::unique_ptr<JITLinkerBase> Self);
// Phase 2:
- // 2.1: Apply resolution results
- // 2.2: Run pre-fixup passes
- // 2.3: Fix up block contents
- // 2.4: Run post-fixup passes
- // 2.5: Make an async call to transfer and finalize memory.
- void linkPhase2(std::unique_ptr<JITLinkerBase> Self,
- Expected<AsyncLookupResult> LookupResult,
- SegmentLayoutMap Layout);
+  // 2.1: Run post-allocation passes
+  // 2.2: Notify context of final assigned symbol addresses
+  // 2.3: Identify external symbols and make an async call to resolve
+ void linkPhase2(std::unique_ptr<JITLinkerBase> Self, AllocResult AR);
// Phase 3:
- // 3.1: Call OnFinalized callback, handing off allocation.
- void linkPhase3(std::unique_ptr<JITLinkerBase> Self, Error Err);
-
- // Align a JITTargetAddress to conform with block alignment requirements.
- static JITTargetAddress alignToBlock(JITTargetAddress Addr, Block &B) {
- uint64_t Delta = (B.getAlignmentOffset() - Addr) % B.getAlignment();
- return Addr + Delta;
- }
-
- // Align a pointer to conform with block alignment requirements.
- static char *alignToBlock(char *P, Block &B) {
- uint64_t PAddr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(P));
- uint64_t Delta = (B.getAlignmentOffset() - PAddr) % B.getAlignment();
- return P + Delta;
- }
+ // 3.1: Apply resolution results
+ // 3.2: Run pre-fixup passes
+ // 3.3: Fix up block contents
+ // 3.4: Run post-fixup passes
+ // 3.5: Make an async call to transfer and finalize memory.
+ void linkPhase3(std::unique_ptr<JITLinkerBase> Self,
+ Expected<AsyncLookupResult> LookupResult);
+
+ // Phase 4:
+ // 4.1: Call OnFinalized callback, handing off allocation.
+ void linkPhase4(std::unique_ptr<JITLinkerBase> Self, FinalizeResult FR);
private:
// Run all passes in the given pass list, bailing out immediately if any pass
@@ -104,18 +87,14 @@ private:
// Implemented in JITLinker.
virtual Error fixUpBlocks(LinkGraph &G) const = 0;
- SegmentLayoutMap layOutBlocks();
- Error allocateSegments(const SegmentLayoutMap &Layout);
JITLinkContext::LookupMap getExternalSymbolNames() const;
void applyLookupResult(AsyncLookupResult LR);
- void copyBlockContentToWorkingMemory(const SegmentLayoutMap &Layout,
- JITLinkMemoryManager::Allocation &Alloc);
- void deallocateAndBailOut(Error Err);
+ void abandonAllocAndBailOut(std::unique_ptr<JITLinkerBase> Self, Error Err);
std::unique_ptr<JITLinkContext> Ctx;
std::unique_ptr<LinkGraph> G;
PassConfiguration Passes;
- std::unique_ptr<JITLinkMemoryManager::Allocation> Alloc;
+ std::unique_ptr<InFlightAlloc> Alloc;
};
template <typename LinkerImpl> class JITLinker : public JITLinkerBase {
@@ -152,6 +131,8 @@ private:
// Copy Block data and apply fixups.
LLVM_DEBUG(dbgs() << " Applying fixups.\n");
+ assert((!B->isZeroFill() || B->edges_size() == 0) &&
+ "Edges in zero-fill block?");
for (auto &E : B->edges()) {
// Skip non-relocation edges.
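
The typedefs above describe the allocation lifecycle the linker now drives: allocate() yields an in-flight allocation, which is either finalized (producing a FinalizedAlloc handle that must later be handed back for deallocation) or abandoned on error. The toy interface below has the same shape and is included purely for orientation; the names and signatures are illustrative, not the real JITLink API.

#include <functional>
#include <string>

// Toy stand-ins: an empty string plays the role of llvm::Error "success".
using ToyError = std::string;
struct ToyFinalizedAlloc {}; // opaque handle; must eventually be deallocated

class ToyInFlightAlloc {
public:
  virtual ~ToyInFlightAlloc() = default;

  // Success path: apply protections, run finalize actions, then hand a
  // finalized handle (or an error) to the continuation.
  virtual void finalize(
      std::function<void(ToyFinalizedAlloc, ToyError)> OnFinalized) = 0;

  // Failure path: release working memory without finalizing.
  virtual void abandon(std::function<void(ToyError)> OnAbandoned) = 0;
};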
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp
index 36067ccf2753..831b9b26d2fd 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp
@@ -1,135 +1,528 @@
//===--- JITLinkMemoryManager.cpp - JITLinkMemoryManager implementation ---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h"
+#include "llvm/ExecutionEngine/JITLink/JITLink.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Process.h"
+#define DEBUG_TYPE "jitlink"
+
+using namespace llvm;
+
+namespace {
+
+// FIXME: Remove this copy of CWrapperFunctionResult as soon as JITLink can
+// depend on shared utils from Orc.
+
+// Must be kept in-sync with compiler-rt/lib/orc/c-api.h.
+union CWrapperFunctionResultDataUnion {
+ char *ValuePtr;
+ char Value[sizeof(ValuePtr)];
+};
+
+// Must be kept in-sync with compiler-rt/lib/orc/c-api.h.
+typedef struct {
+ CWrapperFunctionResultDataUnion Data;
+ size_t Size;
+} CWrapperFunctionResult;
+
+Error toError(CWrapperFunctionResult R) {
+ bool HasError = false;
+ std::string ErrMsg;
+ if (R.Size) {
+ bool Large = R.Size > sizeof(CWrapperFunctionResultDataUnion);
+ char *Content = Large ? R.Data.ValuePtr : R.Data.Value;
+ if (Content[0]) {
+ HasError = true;
+ constexpr unsigned StrStart = 1 + sizeof(uint64_t);
+ ErrMsg.resize(R.Size - StrStart);
+ memcpy(&ErrMsg[0], Content + StrStart, R.Size - StrStart);
+ }
+ if (Large)
+ free(R.Data.ValuePtr);
+ } else if (R.Data.ValuePtr) {
+ HasError = true;
+ ErrMsg = R.Data.ValuePtr;
+ free(R.Data.ValuePtr);
+ }
+
+ if (HasError)
+ return make_error<StringError>(std::move(ErrMsg), inconvertibleErrorCode());
+ return Error::success();
+}
+} // namespace
+
namespace llvm {
namespace jitlink {
JITLinkMemoryManager::~JITLinkMemoryManager() = default;
-JITLinkMemoryManager::Allocation::~Allocation() = default;
-
-Expected<std::unique_ptr<JITLinkMemoryManager::Allocation>>
-InProcessMemoryManager::allocate(const JITLinkDylib *JD,
- const SegmentsRequestMap &Request) {
-
- using AllocationMap = DenseMap<unsigned, sys::MemoryBlock>;
-
- // Local class for allocation.
- class IPMMAlloc : public Allocation {
- public:
- IPMMAlloc(AllocationMap SegBlocks) : SegBlocks(std::move(SegBlocks)) {}
- MutableArrayRef<char> getWorkingMemory(ProtectionFlags Seg) override {
- assert(SegBlocks.count(Seg) && "No allocation for segment");
- return {static_cast<char *>(SegBlocks[Seg].base()),
- SegBlocks[Seg].allocatedSize()};
+JITLinkMemoryManager::InFlightAlloc::~InFlightAlloc() = default;
+
+static Error runAllocAction(JITLinkMemoryManager::AllocActionCall &C) {
+ using WrapperFnTy = CWrapperFunctionResult (*)(const void *, size_t);
+ auto *Fn = jitTargetAddressToPointer<WrapperFnTy>(C.FnAddr);
+
+ return toError(Fn(jitTargetAddressToPointer<const void *>(C.CtxAddr),
+ static_cast<size_t>(C.CtxSize)));
+}
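
runAllocAction above invokes finalize/dealloc actions through the C wrapper-function convention that toError decodes. As a rough illustration, a hypothetical action that always succeeds would return an empty result (Size == 0 and a null ValuePtr); written against the local CWrapperFunctionResult definitions at the top of this file, it might look like the sketch below. Real actions are reached through the FnAddr/CtxAddr fields of an AllocActionCall registered in the graph's alloc actions.

// Hypothetical finalize/dealloc action: reports success by returning an empty
// CWrapperFunctionResult (Size == 0 and a null ValuePtr decode to "no error"
// in toError above). ArgData/ArgSize carry the action's CtxAddr/CtxSize and
// are unused here.
static CWrapperFunctionResult noopAllocAction(const void *ArgData,
                                              size_t ArgSize) {
  (void)ArgData;
  (void)ArgSize;
  CWrapperFunctionResult R;
  R.Data.ValuePtr = nullptr;
  R.Size = 0;
  return R;
}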
+
+BasicLayout::BasicLayout(LinkGraph &G) : G(G) {
+
+ for (auto &Sec : G.sections()) {
+ // Skip empty sections.
+ if (empty(Sec.blocks()))
+ continue;
+
+ auto &Seg = Segments[{Sec.getMemProt(), Sec.getMemDeallocPolicy()}];
+ for (auto *B : Sec.blocks())
+ if (LLVM_LIKELY(!B->isZeroFill()))
+ Seg.ContentBlocks.push_back(B);
+ else
+ Seg.ZeroFillBlocks.push_back(B);
+ }
+
+ // Build Segments map.
+ auto CompareBlocks = [](const Block *LHS, const Block *RHS) {
+ // Sort by section, address and size
+ if (LHS->getSection().getOrdinal() != RHS->getSection().getOrdinal())
+ return LHS->getSection().getOrdinal() < RHS->getSection().getOrdinal();
+ if (LHS->getAddress() != RHS->getAddress())
+ return LHS->getAddress() < RHS->getAddress();
+ return LHS->getSize() < RHS->getSize();
+ };
+
+ LLVM_DEBUG(dbgs() << "Generated BasicLayout for " << G.getName() << ":\n");
+ for (auto &KV : Segments) {
+ auto &Seg = KV.second;
+
+ llvm::sort(Seg.ContentBlocks, CompareBlocks);
+ llvm::sort(Seg.ZeroFillBlocks, CompareBlocks);
+
+ for (auto *B : Seg.ContentBlocks) {
+ Seg.ContentSize = alignToBlock(Seg.ContentSize, *B);
+ Seg.ContentSize += B->getSize();
+ Seg.Alignment = std::max(Seg.Alignment, Align(B->getAlignment()));
}
- JITTargetAddress getTargetMemory(ProtectionFlags Seg) override {
- assert(SegBlocks.count(Seg) && "No allocation for segment");
- return pointerToJITTargetAddress(SegBlocks[Seg].base());
+
+ uint64_t SegEndOffset = Seg.ContentSize;
+ for (auto *B : Seg.ZeroFillBlocks) {
+ SegEndOffset = alignToBlock(SegEndOffset, *B);
+ SegEndOffset += B->getSize();
+ Seg.Alignment = std::max(Seg.Alignment, Align(B->getAlignment()));
}
- void finalizeAsync(FinalizeContinuation OnFinalize) override {
- OnFinalize(applyProtections());
+ Seg.ZeroFillSize = SegEndOffset - Seg.ContentSize;
+
+ LLVM_DEBUG({
+ dbgs() << " Seg " << KV.first
+ << ": content-size=" << formatv("{0:x}", Seg.ContentSize)
+ << ", zero-fill-size=" << formatv("{0:x}", Seg.ZeroFillSize)
+ << ", align=" << formatv("{0:x}", Seg.Alignment.value()) << "\n";
+ });
+ }
+}
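
The alignToBlock calls in this constructor bump a running offset to the next position that satisfies a block's alignment and alignment offset; the helper removed from JITLinkGeneric.h earlier in this patch computed it as (AlignmentOffset - Addr) % Alignment. A self-contained restatement with a worked example:

#include <cassert>
#include <cstdint>

// Minimal restatement of alignToBlock's arithmetic: advance Addr to the next
// value congruent to AlignmentOffset modulo Alignment (alignments are nonzero
// powers of two in practice).
static uint64_t alignToBlock(uint64_t Addr, uint64_t Alignment,
                             uint64_t AlignmentOffset) {
  uint64_t Delta = (AlignmentOffset - Addr) % Alignment;
  return Addr + Delta;
}

int main() {
  // A block with 16-byte alignment and a 4-byte alignment offset placed after
  // 0x21 bytes of content lands at 0x24 (0x24 % 16 == 4).
  assert(alignToBlock(0x21, 16, 4) == 0x24);
  return 0;
}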
+
+Expected<BasicLayout::ContiguousPageBasedLayoutSizes>
+BasicLayout::getContiguousPageBasedLayoutSizes(uint64_t PageSize) {
+ ContiguousPageBasedLayoutSizes SegsSizes;
+
+ for (auto &KV : segments()) {
+ auto &AG = KV.first;
+ auto &Seg = KV.second;
+
+ if (Seg.Alignment > PageSize)
+ return make_error<StringError>("Segment alignment greater than page size",
+ inconvertibleErrorCode());
+
+ uint64_t SegSize = alignTo(Seg.ContentSize + Seg.ZeroFillSize, PageSize);
+ if (AG.getMemDeallocPolicy() == MemDeallocPolicy::Standard)
+ SegsSizes.StandardSegs += SegSize;
+ else
+ SegsSizes.FinalizeSegs += SegSize;
+ }
+
+ return SegsSizes;
+}
+
+Error BasicLayout::apply() {
+ for (auto &KV : Segments) {
+ auto &Seg = KV.second;
+
+ assert(!(Seg.ContentBlocks.empty() && Seg.ZeroFillBlocks.empty()) &&
+ "Empty section recorded?");
+
+ for (auto *B : Seg.ContentBlocks) {
+ // Align addr and working-mem-offset.
+ Seg.Addr = alignToBlock(Seg.Addr, *B);
+ Seg.NextWorkingMemOffset = alignToBlock(Seg.NextWorkingMemOffset, *B);
+
+ // Update block addr.
+ B->setAddress(Seg.Addr);
+ Seg.Addr += B->getSize();
+
+ // Copy content to working memory, then update content to point at working
+ // memory.
+ memcpy(Seg.WorkingMem + Seg.NextWorkingMemOffset, B->getContent().data(),
+ B->getSize());
+ B->setMutableContent(
+ {Seg.WorkingMem + Seg.NextWorkingMemOffset, B->getSize()});
+ Seg.NextWorkingMemOffset += B->getSize();
}
- Error deallocate() override {
- if (SegBlocks.empty())
- return Error::success();
- void *SlabStart = SegBlocks.begin()->second.base();
- char *SlabEnd = (char *)SlabStart;
- for (auto &KV : SegBlocks) {
- SlabStart = std::min(SlabStart, KV.second.base());
- SlabEnd = std::max(SlabEnd, (char *)(KV.second.base()) +
- KV.second.allocatedSize());
- }
- size_t SlabSize = SlabEnd - (char *)SlabStart;
- assert((SlabSize % sys::Process::getPageSizeEstimate()) == 0 &&
- "Slab size is not a multiple of page size");
- sys::MemoryBlock Slab(SlabStart, SlabSize);
- if (auto EC = sys::Memory::releaseMappedMemory(Slab))
+
+ for (auto *B : Seg.ZeroFillBlocks) {
+ // Align addr.
+ Seg.Addr = alignToBlock(Seg.Addr, *B);
+ // Update block addr.
+ B->setAddress(Seg.Addr);
+ Seg.Addr += B->getSize();
+ }
+
+ Seg.ContentBlocks.clear();
+ Seg.ZeroFillBlocks.clear();
+ }
+
+ return Error::success();
+}
+
+JITLinkMemoryManager::AllocActions &BasicLayout::graphAllocActions() {
+ return G.allocActions();
+}
+
+void SimpleSegmentAlloc::Create(JITLinkMemoryManager &MemMgr,
+ const JITLinkDylib *JD, SegmentMap Segments,
+ OnCreatedFunction OnCreated) {
+
+ static_assert(AllocGroup::NumGroups == 16,
+ "AllocGroup has changed. Section names below must be updated");
+ StringRef AGSectionNames[] = {
+ "__---.standard", "__R--.standard", "__-W-.standard", "__RW-.standard",
+ "__--X.standard", "__R-X.standard", "__-WX.standard", "__RWX.standard",
+ "__---.finalize", "__R--.finalize", "__-W-.finalize", "__RW-.finalize",
+ "__--X.finalize", "__R-X.finalize", "__-WX.finalize", "__RWX.finalize"};
+
+ auto G =
+ std::make_unique<LinkGraph>("", Triple(), 0, support::native, nullptr);
+ AllocGroupSmallMap<Block *> ContentBlocks;
+
+ JITTargetAddress NextAddr = 0x100000;
+ for (auto &KV : Segments) {
+ auto &AG = KV.first;
+ auto &Seg = KV.second;
+
+ auto AGSectionName =
+ AGSectionNames[static_cast<unsigned>(AG.getMemProt()) |
+ static_cast<bool>(AG.getMemDeallocPolicy()) << 3];
+
+ auto &Sec = G->createSection(AGSectionName, AG.getMemProt());
+ Sec.setMemDeallocPolicy(AG.getMemDeallocPolicy());
+
+ if (Seg.ContentSize != 0) {
+ NextAddr = alignTo(NextAddr, Seg.ContentAlign);
+ auto &B =
+ G->createMutableContentBlock(Sec, G->allocateBuffer(Seg.ContentSize),
+ NextAddr, Seg.ContentAlign.value(), 0);
+ ContentBlocks[AG] = &B;
+ NextAddr += Seg.ContentSize;
+ }
+ }
+
+ // GRef declared separately since order-of-argument-eval isn't specified.
+ auto &GRef = *G;
+ MemMgr.allocate(JD, GRef,
+ [G = std::move(G), ContentBlocks = std::move(ContentBlocks),
+ OnCreated = std::move(OnCreated)](
+ JITLinkMemoryManager::AllocResult Alloc) mutable {
+ if (!Alloc)
+ OnCreated(Alloc.takeError());
+ else
+ OnCreated(SimpleSegmentAlloc(std::move(G),
+ std::move(ContentBlocks),
+ std::move(*Alloc)));
+ });
+}
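
The AGSectionNames lookup above packs the three protection bits and the dealloc-policy bit into a four-bit table index. A standalone restatement with toy enums (the real MemProt/MemDeallocPolicy enums live in the JITLink headers; the bit values here are inferred from the table ordering):

// Toy enums mirroring the table ordering: bit 0 = Read, bit 1 = Write,
// bit 2 = Exec, bit 3 = finalize-lifetime dealloc policy.
enum ToyProt : unsigned { Read = 1, Write = 2, Exec = 4 };
enum class ToyDealloc : unsigned { Standard = 0, Finalize = 1 };

constexpr unsigned groupIndex(unsigned Prot, ToyDealloc DP) {
  return Prot | (static_cast<unsigned>(DP) << 3);
}

static_assert(groupIndex(Read | Write, ToyDealloc::Standard) == 3,
              "index 3 -> __RW-.standard");
static_assert(groupIndex(Read | Exec, ToyDealloc::Finalize) == 13,
              "index 13 -> __R-X.finalize");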
+
+Expected<SimpleSegmentAlloc>
+SimpleSegmentAlloc::Create(JITLinkMemoryManager &MemMgr, const JITLinkDylib *JD,
+ SegmentMap Segments) {
+ std::promise<MSVCPExpected<SimpleSegmentAlloc>> AllocP;
+ auto AllocF = AllocP.get_future();
+ Create(MemMgr, JD, std::move(Segments),
+ [&](Expected<SimpleSegmentAlloc> Result) {
+ AllocP.set_value(std::move(Result));
+ });
+ return AllocF.get();
+}
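
The blocking overload above is the usual promise/future bridge over a callback-based API. A generic sketch of that idiom (MSVCPExpected in the patch appears to exist only to work around an MSVC std::promise limitation; a plain value type is used here):

#include <future>
#include <string>
#include <utility>

// Start a callback-based operation, fulfil a promise from its completion
// handler, and block on the matching future.
template <typename T, typename AsyncFn>
T runSynchronously(AsyncFn &&Start) {
  std::promise<T> ResultP;
  std::future<T> ResultF = ResultP.get_future();
  std::forward<AsyncFn>(Start)(
      [&ResultP](T Result) { ResultP.set_value(std::move(Result)); });
  return ResultF.get();
}

int main() {
  // "Asynchronous" operation that completes inline for the example.
  auto Answer = runSynchronously<std::string>(
      [](auto OnDone) { OnDone(std::string("done")); });
  return Answer == "done" ? 0 : 1;
}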
+
+SimpleSegmentAlloc::SimpleSegmentAlloc(SimpleSegmentAlloc &&) = default;
+SimpleSegmentAlloc &
+SimpleSegmentAlloc::operator=(SimpleSegmentAlloc &&) = default;
+SimpleSegmentAlloc::~SimpleSegmentAlloc() {}
+
+SimpleSegmentAlloc::SegmentInfo SimpleSegmentAlloc::getSegInfo(AllocGroup AG) {
+ auto I = ContentBlocks.find(AG);
+ if (I != ContentBlocks.end()) {
+ auto &B = *I->second;
+ return {B.getAddress(), B.getAlreadyMutableContent()};
+ }
+ return {};
+}
+
+SimpleSegmentAlloc::SimpleSegmentAlloc(
+ std::unique_ptr<LinkGraph> G, AllocGroupSmallMap<Block *> ContentBlocks,
+ std::unique_ptr<JITLinkMemoryManager::InFlightAlloc> Alloc)
+ : G(std::move(G)), ContentBlocks(std::move(ContentBlocks)),
+ Alloc(std::move(Alloc)) {}
+
+class InProcessMemoryManager::IPInFlightAlloc
+ : public JITLinkMemoryManager::InFlightAlloc {
+public:
+ IPInFlightAlloc(InProcessMemoryManager &MemMgr, LinkGraph &G, BasicLayout BL,
+ sys::MemoryBlock StandardSegments,
+ sys::MemoryBlock FinalizationSegments)
+ : MemMgr(MemMgr), G(G), BL(std::move(BL)),
+ StandardSegments(std::move(StandardSegments)),
+ FinalizationSegments(std::move(FinalizationSegments)) {}
+
+ void finalize(OnFinalizedFunction OnFinalized) override {
+
+ // Apply memory protections to all segments.
+ if (auto Err = applyProtections()) {
+ OnFinalized(std::move(Err));
+ return;
+ }
+
+ // Run finalization actions.
+ // FIXME: Roll back previous successful actions on failure.
+ std::vector<AllocActionCall> DeallocActions;
+ DeallocActions.reserve(G.allocActions().size());
+ for (auto &ActPair : G.allocActions()) {
+ if (ActPair.Finalize.FnAddr)
+ if (auto Err = runAllocAction(ActPair.Finalize)) {
+ OnFinalized(std::move(Err));
+ return;
+ }
+ if (ActPair.Dealloc.FnAddr)
+ DeallocActions.push_back(ActPair.Dealloc);
+ }
+ G.allocActions().clear();
+
+ // Release the finalize segments slab.
+ if (auto EC = sys::Memory::releaseMappedMemory(FinalizationSegments)) {
+ OnFinalized(errorCodeToError(EC));
+ return;
+ }
+
+ // Continue with finalized allocation.
+ OnFinalized(MemMgr.createFinalizedAlloc(std::move(StandardSegments),
+ std::move(DeallocActions)));
+ }
+
+ void abandon(OnAbandonedFunction OnAbandoned) override {
+ Error Err = Error::success();
+ if (auto EC = sys::Memory::releaseMappedMemory(FinalizationSegments))
+ Err = joinErrors(std::move(Err), errorCodeToError(EC));
+ if (auto EC = sys::Memory::releaseMappedMemory(StandardSegments))
+ Err = joinErrors(std::move(Err), errorCodeToError(EC));
+ OnAbandoned(std::move(Err));
+ }
+
+private:
+ Error applyProtections() {
+ for (auto &KV : BL.segments()) {
+ const auto &AG = KV.first;
+ auto &Seg = KV.second;
+
+ auto Prot = toSysMemoryProtectionFlags(AG.getMemProt());
+
+ uint64_t SegSize =
+ alignTo(Seg.ContentSize + Seg.ZeroFillSize, MemMgr.PageSize);
+ sys::MemoryBlock MB(Seg.WorkingMem, SegSize);
+ if (auto EC = sys::Memory::protectMappedMemory(MB, Prot))
return errorCodeToError(EC);
- return Error::success();
+ if (Prot & sys::Memory::MF_EXEC)
+ sys::Memory::InvalidateInstructionCache(MB.base(), MB.allocatedSize());
}
+ return Error::success();
+ }
+
+ InProcessMemoryManager &MemMgr;
+ LinkGraph &G;
+ BasicLayout BL;
+ sys::MemoryBlock StandardSegments;
+ sys::MemoryBlock FinalizationSegments;
+};
+
+Expected<std::unique_ptr<InProcessMemoryManager>>
+InProcessMemoryManager::Create() {
+ if (auto PageSize = sys::Process::getPageSize())
+ return std::make_unique<InProcessMemoryManager>(*PageSize);
+ else
+ return PageSize.takeError();
+}
+
+void InProcessMemoryManager::allocate(const JITLinkDylib *JD, LinkGraph &G,
+ OnAllocatedFunction OnAllocated) {
+
+ // FIXME: Just check this once on startup.
+ if (!isPowerOf2_64((uint64_t)PageSize)) {
+ OnAllocated(make_error<StringError>("Page size is not a power of 2",
+ inconvertibleErrorCode()));
+ return;
+ }
+
+ BasicLayout BL(G);
+
+  // Scan the layout and calculate the contiguous segment sizes.
+  // Check that no segment requires greater than page alignment.
+ auto SegsSizes = BL.getContiguousPageBasedLayoutSizes(PageSize);
+ if (!SegsSizes) {
+ OnAllocated(SegsSizes.takeError());
+ return;
+ }
+
+ /// Check that the total size requested (including zero fill) is not larger
+ /// than a size_t.
+ if (SegsSizes->total() > std::numeric_limits<size_t>::max()) {
+ OnAllocated(make_error<JITLinkError>(
+ "Total requested size " + formatv("{0:x}", SegsSizes->total()) +
+ " for graph " + G.getName() + " exceeds address space"));
+ return;
+ }
+
+ // Allocate one slab for the whole thing (to make sure everything is
+ // in-range), then partition into standard and finalization blocks.
+ //
+ // FIXME: Make two separate allocations in the future to reduce
+ // fragmentation: finalization segments will usually be a single page, and
+ // standard segments are likely to be more than one page. Where multiple
+ // allocations are in-flight at once (likely) the current approach will leave
+ // a lot of single-page holes.
+ sys::MemoryBlock Slab;
+ sys::MemoryBlock StandardSegsMem;
+ sys::MemoryBlock FinalizeSegsMem;
+ {
+ const sys::Memory::ProtectionFlags ReadWrite =
+ static_cast<sys::Memory::ProtectionFlags>(sys::Memory::MF_READ |
+ sys::Memory::MF_WRITE);
+
+ std::error_code EC;
+ Slab = sys::Memory::allocateMappedMemory(SegsSizes->total(), nullptr,
+ ReadWrite, EC);
- private:
- Error applyProtections() {
- for (auto &KV : SegBlocks) {
- auto &Prot = KV.first;
- auto &Block = KV.second;
- if (auto EC = sys::Memory::protectMappedMemory(Block, Prot))
- return errorCodeToError(EC);
- if (Prot & sys::Memory::MF_EXEC)
- sys::Memory::InvalidateInstructionCache(Block.base(),
- Block.allocatedSize());
- }
- return Error::success();
+ if (EC) {
+ OnAllocated(errorCodeToError(EC));
+ return;
}
- AllocationMap SegBlocks;
- };
+ // Zero-fill the whole slab up-front.
+ memset(Slab.base(), 0, Slab.allocatedSize());
+
+ StandardSegsMem = {Slab.base(),
+ static_cast<size_t>(SegsSizes->StandardSegs)};
+ FinalizeSegsMem = {(void *)((char *)Slab.base() + SegsSizes->StandardSegs),
+ static_cast<size_t>(SegsSizes->FinalizeSegs)};
+ }
- if (!isPowerOf2_64((uint64_t)sys::Process::getPageSizeEstimate()))
- return make_error<StringError>("Page size is not a power of 2",
- inconvertibleErrorCode());
+ auto NextStandardSegAddr = pointerToJITTargetAddress(StandardSegsMem.base());
+ auto NextFinalizeSegAddr = pointerToJITTargetAddress(FinalizeSegsMem.base());
- AllocationMap Blocks;
- const sys::Memory::ProtectionFlags ReadWrite =
- static_cast<sys::Memory::ProtectionFlags>(sys::Memory::MF_READ |
- sys::Memory::MF_WRITE);
+ LLVM_DEBUG({
+ dbgs() << "InProcessMemoryManager allocated:\n";
+ if (SegsSizes->StandardSegs)
+ dbgs() << formatv(" [ {0:x16} -- {1:x16} ]", NextStandardSegAddr,
+ NextStandardSegAddr + StandardSegsMem.allocatedSize())
+             << " to standard segs\n";
+ else
+ dbgs() << " no standard segs\n";
+ if (SegsSizes->FinalizeSegs)
+ dbgs() << formatv(" [ {0:x16} -- {1:x16} ]", NextFinalizeSegAddr,
+ NextFinalizeSegAddr + FinalizeSegsMem.allocatedSize())
+ << " to finalize segs\n";
+ else
+ dbgs() << " no finalize segs\n";
+ });
- // Compute the total number of pages to allocate.
- size_t TotalSize = 0;
- for (auto &KV : Request) {
- const auto &Seg = KV.second;
+ // Build ProtMap, assign addresses.
+ for (auto &KV : BL.segments()) {
+ auto &AG = KV.first;
+ auto &Seg = KV.second;
- if (Seg.getAlignment() > sys::Process::getPageSizeEstimate())
- return make_error<StringError>("Cannot request higher than page "
- "alignment",
- inconvertibleErrorCode());
+ auto &SegAddr = (AG.getMemDeallocPolicy() == MemDeallocPolicy::Standard)
+ ? NextStandardSegAddr
+ : NextFinalizeSegAddr;
- TotalSize = alignTo(TotalSize, sys::Process::getPageSizeEstimate());
- TotalSize += Seg.getContentSize();
- TotalSize += Seg.getZeroFillSize();
+ Seg.WorkingMem = jitTargetAddressToPointer<char *>(SegAddr);
+ Seg.Addr = SegAddr;
+
+ SegAddr += alignTo(Seg.ContentSize + Seg.ZeroFillSize, PageSize);
}
- // Allocate one slab to cover all the segments.
- std::error_code EC;
- auto SlabRemaining =
- sys::Memory::allocateMappedMemory(TotalSize, nullptr, ReadWrite, EC);
+ if (auto Err = BL.apply()) {
+ OnAllocated(std::move(Err));
+ return;
+ }
- if (EC)
- return errorCodeToError(EC);
+ OnAllocated(std::make_unique<IPInFlightAlloc>(*this, G, std::move(BL),
+ std::move(StandardSegsMem),
+ std::move(FinalizeSegsMem)));
+}
- // Allocate segment memory from the slab.
- for (auto &KV : Request) {
+void InProcessMemoryManager::deallocate(std::vector<FinalizedAlloc> Allocs,
+ OnDeallocatedFunction OnDeallocated) {
+ std::vector<sys::MemoryBlock> StandardSegmentsList;
+ std::vector<std::vector<AllocActionCall>> DeallocActionsList;
- const auto &Seg = KV.second;
+ {
+ std::lock_guard<std::mutex> Lock(FinalizedAllocsMutex);
+ for (auto &Alloc : Allocs) {
+ auto *FA =
+ jitTargetAddressToPointer<FinalizedAllocInfo *>(Alloc.release());
+ StandardSegmentsList.push_back(std::move(FA->StandardSegments));
+ if (!FA->DeallocActions.empty())
+ DeallocActionsList.push_back(std::move(FA->DeallocActions));
+ FA->~FinalizedAllocInfo();
+ FinalizedAllocInfos.Deallocate(FA);
+ }
+ }
+
+ Error DeallocErr = Error::success();
- uint64_t SegmentSize = alignTo(Seg.getContentSize() + Seg.getZeroFillSize(),
- sys::Process::getPageSizeEstimate());
- assert(SlabRemaining.allocatedSize() >= SegmentSize &&
- "Mapping exceeds allocation");
+ while (!DeallocActionsList.empty()) {
+ auto &DeallocActions = DeallocActionsList.back();
+ auto &StandardSegments = StandardSegmentsList.back();
- sys::MemoryBlock SegMem(SlabRemaining.base(), SegmentSize);
- SlabRemaining = sys::MemoryBlock((char *)SlabRemaining.base() + SegmentSize,
- SlabRemaining.allocatedSize() - SegmentSize);
+ /// Run any deallocate calls.
+ while (!DeallocActions.empty()) {
+ if (auto Err = runAllocAction(DeallocActions.back()))
+ DeallocErr = joinErrors(std::move(DeallocErr), std::move(Err));
+ DeallocActions.pop_back();
+ }
- // Zero out the zero-fill memory.
- memset(static_cast<char *>(SegMem.base()) + Seg.getContentSize(), 0,
- Seg.getZeroFillSize());
+ /// Release the standard segments slab.
+ if (auto EC = sys::Memory::releaseMappedMemory(StandardSegments))
+ DeallocErr = joinErrors(std::move(DeallocErr), errorCodeToError(EC));
- // Record the block for this segment.
- Blocks[KV.first] = std::move(SegMem);
+ DeallocActionsList.pop_back();
+ StandardSegmentsList.pop_back();
}
- return std::unique_ptr<InProcessMemoryManager::Allocation>(
- new IPMMAlloc(std::move(Blocks)));
+ OnDeallocated(std::move(DeallocErr));
+}
+
+JITLinkMemoryManager::FinalizedAlloc
+InProcessMemoryManager::createFinalizedAlloc(
+ sys::MemoryBlock StandardSegments,
+ std::vector<AllocActionCall> DeallocActions) {
+ std::lock_guard<std::mutex> Lock(FinalizedAllocsMutex);
+ auto *FA = FinalizedAllocInfos.Allocate<FinalizedAllocInfo>();
+ new (FA) FinalizedAllocInfo(
+ {std::move(StandardSegments), std::move(DeallocActions)});
+ return FinalizedAlloc(pointerToJITTargetAddress(FA));
}
} // end namespace jitlink
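
Taken together, the pieces in this file give the following client-side flow. This is an illustrative sketch only, wired up by hand with the signatures as they appear in the hunks above (real clients normally go through JITLinkerBase or ORC rather than calling these directly), and the by-reference captures rely on the in-process manager invoking its callbacks synchronously.

#include "llvm/ExecutionEngine/JITLink/JITLink.h"
#include "llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h"
#include "llvm/Support/Error.h"
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::jitlink;

static void allocateFinalizeAndFree(LinkGraph &G) {
  auto MemMgr = cantFail(InProcessMemoryManager::Create());

  MemMgr->allocate(/*JD=*/nullptr, G,
                   [&](JITLinkMemoryManager::AllocResult AR) {
    if (!AR)
      return consumeError(AR.takeError());

    // Working memory and target addresses are assigned at this point; a real
    // linker resolves externals and runs fixups before finalizing.
    (*AR)->finalize([&](Expected<JITLinkMemoryManager::FinalizedAlloc> FA) {
      if (!FA)
        return consumeError(FA.takeError());

      // A FinalizedAlloc must eventually be handed back to the manager, which
      // runs any registered dealloc actions and releases the memory.
      std::vector<JITLinkMemoryManager::FinalizedAlloc> ToFree;
      ToFree.push_back(std::move(*FA));
      MemMgr->deallocate(std::move(ToFree),
                         [](Error Err) { consumeError(std::move(Err)); });
    });
  });
}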
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO.cpp
index eda2b8811deb..e49480c78662 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO.cpp
@@ -1,9 +1,8 @@
//===-------------- MachO.cpp - JIT linker function for MachO -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp
index 03a8b98dff18..d588b63d9e88 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp
@@ -23,7 +23,7 @@ MachOLinkGraphBuilder::~MachOLinkGraphBuilder() {}
Expected<std::unique_ptr<LinkGraph>> MachOLinkGraphBuilder::buildGraph() {
- // Sanity check: we only operate on relocatable objects.
+ // We only operate on relocatable objects.
if (!Obj.isRelocatableObject())
return make_error<JITLinkError>("Object is not a relocatable MachO");
@@ -107,11 +107,9 @@ MachOLinkGraphBuilder::getEndianness(const object::MachOObjectFile &Obj) {
}
Section &MachOLinkGraphBuilder::getCommonSection() {
- if (!CommonSection) {
- auto Prot = static_cast<sys::Memory::ProtectionFlags>(
- sys::Memory::MF_READ | sys::Memory::MF_WRITE);
- CommonSection = &G->createSection(CommonSectionName, Prot);
- }
+ if (!CommonSection)
+ CommonSection =
+ &G->createSection(CommonSectionName, MemProt::Read | MemProt::Write);
return *CommonSection;
}
@@ -176,25 +174,16 @@ Error MachOLinkGraphBuilder::createNormalizedSections() {
// Get prot flags.
// FIXME: Make sure this test is correct (it's probably missing cases
// as-is).
- sys::Memory::ProtectionFlags Prot;
+ MemProt Prot;
if (NSec.Flags & MachO::S_ATTR_PURE_INSTRUCTIONS)
- Prot = static_cast<sys::Memory::ProtectionFlags>(sys::Memory::MF_READ |
- sys::Memory::MF_EXEC);
+ Prot = MemProt::Read | MemProt::Exec;
else
- Prot = static_cast<sys::Memory::ProtectionFlags>(sys::Memory::MF_READ |
- sys::Memory::MF_WRITE);
-
- if (!isDebugSection(NSec)) {
- auto FullyQualifiedName =
- G->allocateString(StringRef(NSec.SegName) + "," + NSec.SectName);
- NSec.GraphSection = &G->createSection(
- StringRef(FullyQualifiedName.data(), FullyQualifiedName.size()),
- Prot);
- } else
- LLVM_DEBUG({
- dbgs() << " " << NSec.SegName << "," << NSec.SectName
- << " is a debug section: No graph section will be created.\n";
- });
+ Prot = MemProt::Read | MemProt::Write;
+
+ auto FullyQualifiedName =
+ G->allocateString(StringRef(NSec.SegName) + "," + NSec.SectName);
+ NSec.GraphSection = &G->createSection(
+ StringRef(FullyQualifiedName.data(), FullyQualifiedName.size()), Prot);
IndexToSection.insert(std::make_pair(SecIndex, std::move(NSec)));
}
@@ -292,15 +281,16 @@ Error MachOLinkGraphBuilder::createNormalizedSymbols() {
dbgs() << "\n";
});
- // If this symbol has a section, sanity check that the addresses line up.
+ // If this symbol has a section, verify that the addresses line up.
if (Sect != 0) {
auto NSec = findSectionByIndex(Sect - 1);
if (!NSec)
return NSec.takeError();
if (Value < NSec->Address || Value > NSec->Address + NSec->Size)
- return make_error<JITLinkError>("Symbol address does not fall within "
- "section");
+ return make_error<JITLinkError>("Address " + formatv("{0:x}", Value) +
+ " for symbol " + *Name +
+ " does not fall within section");
if (!NSec->GraphSection) {
LLVM_DEBUG({
@@ -321,16 +311,19 @@ Error MachOLinkGraphBuilder::createNormalizedSymbols() {
}
void MachOLinkGraphBuilder::addSectionStartSymAndBlock(
- Section &GraphSec, uint64_t Address, const char *Data, uint64_t Size,
- uint32_t Alignment, bool IsLive) {
+ unsigned SecIndex, Section &GraphSec, uint64_t Address, const char *Data,
+ uint64_t Size, uint32_t Alignment, bool IsLive) {
Block &B =
Data ? G->createContentBlock(GraphSec, ArrayRef<char>(Data, Size),
Address, Alignment, 0)
: G->createZeroFillBlock(GraphSec, Size, Address, Alignment, 0);
auto &Sym = G->addAnonymousSymbol(B, 0, Size, false, IsLive);
- assert(!AddrToCanonicalSymbol.count(Sym.getAddress()) &&
+ auto SecI = IndexToSection.find(SecIndex);
+ assert(SecI != IndexToSection.end() && "SecIndex invalid");
+ auto &NSec = SecI->second;
+ assert(!NSec.CanonicalSymbols.count(Sym.getAddress()) &&
"Anonymous block start symbol clashes with existing symbol address");
- AddrToCanonicalSymbol[Sym.getAddress()] = &Sym;
+ NSec.CanonicalSymbols[Sym.getAddress()] = &Sym;
}
Error MachOLinkGraphBuilder::graphifyRegularSymbols() {
@@ -444,8 +437,8 @@ Error MachOLinkGraphBuilder::graphifyRegularSymbols() {
<< formatv("{0:x16}", NSec.Address) << " -- "
<< formatv("{0:x16}", NSec.Address + NSec.Size) << "\n";
});
- addSectionStartSymAndBlock(*NSec.GraphSection, NSec.Address, NSec.Data,
- NSec.Size, NSec.Alignment,
+ addSectionStartSymAndBlock(SecIndex, *NSec.GraphSection, NSec.Address,
+ NSec.Data, NSec.Size, NSec.Alignment,
SectionIsNoDeadStrip);
} else
LLVM_DEBUG({
@@ -483,8 +476,8 @@ Error MachOLinkGraphBuilder::graphifyRegularSymbols() {
<< formatv("{0:x16}", NSec.Address) << " -- "
<< formatv("{0:x16}", NSec.Address + AnonBlockSize) << " ]\n";
});
- addSectionStartSymAndBlock(*NSec.GraphSection, NSec.Address, NSec.Data,
- AnonBlockSize, NSec.Alignment,
+ addSectionStartSymAndBlock(SecIndex, *NSec.GraphSection, NSec.Address,
+ NSec.Data, AnonBlockSize, NSec.Alignment,
SectionIsNoDeadStrip);
}
@@ -583,7 +576,7 @@ Symbol &MachOLinkGraphBuilder::createStandardGraphSymbol(NormalizedSymbol &NSym,
NSym.GraphSymbol = &Sym;
if (IsCanonical)
- setCanonicalSymbol(Sym);
+ setCanonicalSymbol(getSectionByIndex(NSym.Sect - 1), Sym);
return Sym;
}
@@ -610,7 +603,6 @@ Error MachOLinkGraphBuilder::graphifySectionsWithCustomParsers() {
Error MachOLinkGraphBuilder::graphifyCStringSection(
NormalizedSection &NSec, std::vector<NormalizedSymbol *> NSyms) {
-
assert(NSec.GraphSection && "C string literal section missing graph section");
assert(NSec.Data && "C string literal section has no data");
@@ -664,7 +656,7 @@ Error MachOLinkGraphBuilder::graphifyCStringSection(
// If there's no symbol at the start of this block then create one.
if (NSyms.empty() || NSyms.back()->Value != B.getAddress()) {
auto &S = G->addAnonymousSymbol(B, 0, BlockSize, false, false);
- setCanonicalSymbol(S);
+ setCanonicalSymbol(NSec, S);
LLVM_DEBUG({
dbgs() << " Adding anonymous symbol for c-string block "
<< formatv("{0:x16} -- {1:x16}", S.getAddress(),
@@ -700,5 +692,119 @@ Error MachOLinkGraphBuilder::graphifyCStringSection(
return Error::success();
}
+Error CompactUnwindSplitter::operator()(LinkGraph &G) {
+ auto *CUSec = G.findSectionByName(CompactUnwindSectionName);
+ if (!CUSec)
+ return Error::success();
+
+ if (!G.getTargetTriple().isOSBinFormatMachO())
+ return make_error<JITLinkError>(
+ "Error linking " + G.getName() +
+ ": compact unwind splitting not supported on non-macho target " +
+ G.getTargetTriple().str());
+
+ unsigned CURecordSize = 0;
+ unsigned PersonalityEdgeOffset = 0;
+ unsigned LSDAEdgeOffset = 0;
+ switch (G.getTargetTriple().getArch()) {
+ case Triple::aarch64:
+ case Triple::x86_64:
+ // 64-bit compact-unwind record format:
+ // Range start: 8 bytes.
+ // Range size: 4 bytes.
+ // CU encoding: 4 bytes.
+ // Personality: 8 bytes.
+ // LSDA: 8 bytes.
+ CURecordSize = 32;
+ PersonalityEdgeOffset = 16;
+ LSDAEdgeOffset = 24;
+ break;
+ default:
+ return make_error<JITLinkError>(
+ "Error linking " + G.getName() +
+ ": compact unwind splitting not supported on " +
+ G.getTargetTriple().getArchName());
+ }
+
+ std::vector<Block *> OriginalBlocks(CUSec->blocks().begin(),
+ CUSec->blocks().end());
+ LLVM_DEBUG({
+ dbgs() << "In " << G.getName() << " splitting compact unwind section "
+ << CompactUnwindSectionName << " containing "
+ << OriginalBlocks.size() << " initial blocks...\n";
+ });
+
+ while (!OriginalBlocks.empty()) {
+ auto *B = OriginalBlocks.back();
+ OriginalBlocks.pop_back();
+
+ if (B->getSize() == 0) {
+ LLVM_DEBUG({
+ dbgs() << " Skipping empty block at "
+ << formatv("{0:x16}", B->getAddress()) << "\n";
+ });
+ continue;
+ }
+
+ LLVM_DEBUG({
+ dbgs() << " Splitting block at " << formatv("{0:x16}", B->getAddress())
+ << " into " << (B->getSize() / CURecordSize)
+ << " compact unwind record(s)\n";
+ });
+
+ if (B->getSize() % CURecordSize)
+ return make_error<JITLinkError>(
+ "Error splitting compact unwind record in " + G.getName() +
+ ": block at " + formatv("{0:x}", B->getAddress()) + " has size " +
+ formatv("{0:x}", B->getSize()) +
+ " (not a multiple of CU record size of " +
+ formatv("{0:x}", CURecordSize) + ")");
+
+ unsigned NumBlocks = B->getSize() / CURecordSize;
+ LinkGraph::SplitBlockCache C;
+
+ for (unsigned I = 0; I != NumBlocks; ++I) {
+ auto &CURec = G.splitBlock(*B, CURecordSize, &C);
+ bool AddedKeepAlive = false;
+
+ for (auto &E : CURec.edges()) {
+ if (E.getOffset() == 0) {
+ LLVM_DEBUG({
+ dbgs() << " Updating compact unwind record at "
+ << formatv("{0:x16}", CURec.getAddress()) << " to point to "
+ << (E.getTarget().hasName() ? E.getTarget().getName()
+ : StringRef())
+ << " (at " << formatv("{0:x16}", E.getTarget().getAddress())
+ << ")\n";
+ });
+
+ if (E.getTarget().isExternal())
+ return make_error<JITLinkError>(
+ "Error adding keep-alive edge for compact unwind record at " +
+ formatv("{0:x}", CURec.getAddress()) + ": target " +
+ E.getTarget().getName() + " is an external symbol");
+ auto &TgtBlock = E.getTarget().getBlock();
+ auto &CURecSym =
+ G.addAnonymousSymbol(CURec, 0, CURecordSize, 0, false);
+ TgtBlock.addEdge(Edge::KeepAlive, 0, CURecSym, 0);
+ AddedKeepAlive = true;
+ } else if (E.getOffset() != PersonalityEdgeOffset &&
+ E.getOffset() != LSDAEdgeOffset)
+ return make_error<JITLinkError>("Unexpected edge at offset " +
+ formatv("{0:x}", E.getOffset()) +
+ " in compact unwind record at " +
+ formatv("{0:x}", CURec.getAddress()));
+ }
+
+ if (!AddedKeepAlive)
+ return make_error<JITLinkError>(
+ "Error adding keep-alive edge for compact unwind record at " +
+ formatv("{0:x}", CURec.getAddress()) +
+ ": no outgoing target edge at offset 0");
+ }
+ }
+ return Error::success();
+}
+
} // end namespace jitlink
} // end namespace llvm
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h
index 90b14c44ff8a..d29732ebdba8 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h
+++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h
@@ -77,6 +77,7 @@ protected:
uint32_t Flags = 0;
const char *Data = nullptr;
Section *GraphSection = nullptr;
+ std::map<JITTargetAddress, Symbol *> CanonicalSymbols;
};
using SectionParserFunction = std::function<Error(NormalizedSection &S)>;
@@ -125,30 +126,31 @@ protected:
/// given index is out of range, or if no symbol has been added for the given
/// index.
Expected<NormalizedSymbol &> findSymbolByIndex(uint64_t Index) {
- if (Index >= IndexToSymbol.size())
- return make_error<JITLinkError>("Symbol index out of range");
- auto *Sym = IndexToSymbol[Index];
- if (!Sym)
+ auto I = IndexToSymbol.find(Index);
+ if (I == IndexToSymbol.end())
return make_error<JITLinkError>("No symbol at index " +
formatv("{0:d}", Index));
- return *Sym;
+ assert(I->second && "Null symbol at index");
+ return *I->second;
}
/// Returns the symbol with the highest address not greater than the search
/// address, or null if no such symbol exists.
- Symbol *getSymbolByAddress(JITTargetAddress Address) {
- auto I = AddrToCanonicalSymbol.upper_bound(Address);
- if (I == AddrToCanonicalSymbol.begin())
+ Symbol *getSymbolByAddress(NormalizedSection &NSec,
+ JITTargetAddress Address) {
+ auto I = NSec.CanonicalSymbols.upper_bound(Address);
+ if (I == NSec.CanonicalSymbols.begin())
return nullptr;
return std::prev(I)->second;
}
/// Returns the symbol with the highest address not greater than the search
/// address, or an error if no such symbol exists.
- Expected<Symbol &> findSymbolByAddress(JITTargetAddress Address) {
- auto *Sym = getSymbolByAddress(Address);
+ Expected<Symbol &> findSymbolByAddress(NormalizedSection &NSec,
+ JITTargetAddress Address) {
+ auto *Sym = getSymbolByAddress(NSec, Address);
if (Sym)
- if (Address < Sym->getAddress() + Sym->getSize())
+ if (Address <= Sym->getAddress() + Sym->getSize())
return *Sym;
return make_error<JITLinkError>("No symbol covering address " +
formatv("{0:x16}", Address));
@@ -179,8 +181,8 @@ private:
static unsigned getPointerSize(const object::MachOObjectFile &Obj);
static support::endianness getEndianness(const object::MachOObjectFile &Obj);
- void setCanonicalSymbol(Symbol &Sym) {
- auto *&CanonicalSymEntry = AddrToCanonicalSymbol[Sym.getAddress()];
+ void setCanonicalSymbol(NormalizedSection &NSec, Symbol &Sym) {
+ auto *&CanonicalSymEntry = NSec.CanonicalSymbols[Sym.getAddress()];
// There should be no symbol at this address, or, if there is,
// it should be a zero-sized symbol from an empty section (which
// we can safely override).
@@ -190,9 +192,10 @@ private:
}
Section &getCommonSection();
- void addSectionStartSymAndBlock(Section &GraphSec, uint64_t Address,
- const char *Data, uint64_t Size,
- uint32_t Alignment, bool IsLive);
+ void addSectionStartSymAndBlock(unsigned SecIndex, Section &GraphSec,
+ uint64_t Address, const char *Data,
+ uint64_t Size, uint32_t Alignment,
+ bool IsLive);
Error createNormalizedSections();
Error createNormalizedSymbols();
@@ -227,10 +230,20 @@ private:
Section *CommonSection = nullptr;
DenseMap<uint32_t, NormalizedSymbol *> IndexToSymbol;
- std::map<JITTargetAddress, Symbol *> AddrToCanonicalSymbol;
StringMap<SectionParserFunction> CustomSectionParserFunctions;
};
+/// A pass to split up __LD,__compact_unwind sections.
+class CompactUnwindSplitter {
+public:
+ CompactUnwindSplitter(StringRef CompactUnwindSectionName)
+ : CompactUnwindSectionName(CompactUnwindSectionName) {}
+ Error operator()(LinkGraph &G);
+
+private:
+ StringRef CompactUnwindSectionName;
+};
+
} // end namespace jitlink
} // end namespace llvm
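
getSymbolByAddress above uses the standard upper_bound/prev idiom on the per-section CanonicalSymbols map to find the symbol with the greatest address not greater than the query. The same pattern in isolation:

#include <cstdint>
#include <iterator>
#include <map>

// Return the value with the greatest key not greater than Addr, or nullptr if
// every key is greater.
template <typename V>
V *findFloor(std::map<uint64_t, V> &M, uint64_t Addr) {
  auto I = M.upper_bound(Addr);
  if (I == M.begin())
    return nullptr;
  return &std::prev(I)->second;
}

int main() {
  std::map<uint64_t, char> Symbols{{0x1000, 'a'}, {0x1010, 'b'}};
  return (*findFloor(Symbols, 0x100c) == 'a' &&
          findFloor(Symbols, 0xfff) == nullptr)
             ? 0
             : 1;
}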
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
index 169e20a1d1d3..f2a029d35cd5 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
@@ -81,6 +81,14 @@ private:
if (!RI.r_pcrel && !RI.r_extern && RI.r_length == 2)
return PairedAddend;
break;
+ case MachO::ARM64_RELOC_TLVP_LOAD_PAGE21:
+ if (RI.r_pcrel && RI.r_extern && RI.r_length == 2)
+ return TLVPage21;
+ break;
+ case MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12:
+ if (!RI.r_pcrel && RI.r_extern && RI.r_length == 2)
+ return TLVPageOffset12;
+ break;
}
return make_error<JITLinkError>(
@@ -152,7 +160,7 @@ private:
auto ToSymbolSec = findSectionByIndex(UnsignedRI.r_symbolnum - 1);
if (!ToSymbolSec)
return ToSymbolSec.takeError();
- ToSymbol = getSymbolByAddress(ToSymbolSec->Address);
+ ToSymbol = getSymbolByAddress(*ToSymbolSec, ToSymbolSec->Address);
assert(ToSymbol && "No symbol for section");
FixupValue -= ToSymbol->getAddress();
}
@@ -197,14 +205,18 @@ private:
continue;
}
- // Skip relocations for debug symbols.
+ auto NSec =
+ findSectionByIndex(Obj.getSectionIndex(S.getRawDataRefImpl()));
+ if (!NSec)
+ return NSec.takeError();
+
+ // Skip relocations for MachO sections without corresponding graph
+ // sections.
{
- auto &NSec =
- getSectionByIndex(Obj.getSectionIndex(S.getRawDataRefImpl()));
- if (!NSec.GraphSection) {
+ if (!NSec->GraphSection) {
LLVM_DEBUG({
dbgs() << " Skipping relocations for MachO section "
- << NSec.SegName << "/" << NSec.SectName
+ << NSec->SegName << "/" << NSec->SectName
<< " which has no associated graph section\n";
});
continue;
@@ -216,25 +228,22 @@ private:
MachO::relocation_info RI = getRelocationInfo(RelItr);
- // Sanity check the relocation kind.
+ // Validate the relocation kind.
auto Kind = getRelocationKind(RI);
if (!Kind)
return Kind.takeError();
// Find the address of the value to fix up.
JITTargetAddress FixupAddress = SectionAddress + (uint32_t)RI.r_address;
-
LLVM_DEBUG({
- auto &NSec =
- getSectionByIndex(Obj.getSectionIndex(S.getRawDataRefImpl()));
- dbgs() << " " << NSec.SectName << " + "
+ dbgs() << " " << NSec->SectName << " + "
<< formatv("{0:x8}", RI.r_address) << ":\n";
});
// Find the block that the fixup points to.
Block *BlockToFix = nullptr;
{
- auto SymbolToFixOrErr = findSymbolByAddress(FixupAddress);
+ auto SymbolToFixOrErr = findSymbolByAddress(*NSec, FixupAddress);
if (!SymbolToFixOrErr)
return SymbolToFixOrErr.takeError();
BlockToFix = &SymbolToFixOrErr->getBlock();
@@ -316,7 +325,11 @@ private:
break;
case Pointer64Anon: {
JITTargetAddress TargetAddress = *(const ulittle64_t *)FixupContent;
- if (auto TargetSymbolOrErr = findSymbolByAddress(TargetAddress))
+ auto TargetNSec = findSectionByIndex(RI.r_symbolnum - 1);
+ if (!TargetNSec)
+ return TargetNSec.takeError();
+ if (auto TargetSymbolOrErr =
+ findSymbolByAddress(*TargetNSec, TargetAddress))
TargetSymbol = &*TargetSymbolOrErr;
else
return TargetSymbolOrErr.takeError();
@@ -324,6 +337,7 @@ private:
break;
}
case Page21:
+ case TLVPage21:
case GOTPage21: {
if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum))
TargetSymbol = TargetSymbolOrErr->GraphSymbol;
@@ -348,6 +362,7 @@ private:
"encoded addend");
break;
}
+ case TLVPageOffset12:
case GOTPageOffset12: {
if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum))
TargetSymbol = TargetSymbolOrErr->GraphSymbol;
@@ -414,6 +429,7 @@ public:
bool isGOTEdgeToFix(Edge &E) const {
return E.getKind() == GOTPage21 || E.getKind() == GOTPageOffset12 ||
+ E.getKind() == TLVPage21 || E.getKind() == TLVPageOffset12 ||
E.getKind() == PointerToGOT;
}
@@ -425,7 +441,8 @@ public:
}
void fixGOTEdge(Edge &E, Symbol &GOTEntry) {
- if (E.getKind() == GOTPage21 || E.getKind() == GOTPageOffset12) {
+ if (E.getKind() == GOTPage21 || E.getKind() == GOTPageOffset12 ||
+ E.getKind() == TLVPage21 || E.getKind() == TLVPageOffset12) {
// Update the target, but leave the edge addend as-is.
E.setTarget(GOTEntry);
} else if (E.getKind() == PointerToGOT) {
@@ -457,16 +474,14 @@ public:
private:
Section &getGOTSection() {
if (!GOTSection)
- GOTSection = &G.createSection("$__GOT", sys::Memory::MF_READ);
+ GOTSection = &G.createSection("$__GOT", MemProt::Read);
return *GOTSection;
}
Section &getStubsSection() {
- if (!StubsSection) {
- auto StubsProt = static_cast<sys::Memory::ProtectionFlags>(
- sys::Memory::MF_READ | sys::Memory::MF_EXEC);
- StubsSection = &G.createSection("$__STUBS", StubsProt);
- }
+ if (!StubsSection)
+ StubsSection =
+ &G.createSection("$__STUBS", MemProt::Read | MemProt::Exec);
return *StubsSection;
}
@@ -567,6 +582,7 @@ private:
break;
}
case Page21:
+ case TLVPage21:
case GOTPage21: {
assert((E.getKind() != GOTPage21 || E.getAddend() == 0) &&
"GOTPAGE21 with non-zero addend");
@@ -603,6 +619,7 @@ private:
*(ulittle32_t *)FixupPtr = FixedInstr;
break;
}
+ case TLVPageOffset12:
case GOTPageOffset12: {
assert(E.getAddend() == 0 && "GOTPAGEOF12 with non-zero addend");
@@ -629,7 +646,8 @@ private:
if (Delta < -(1 << 20) || Delta > ((1 << 20) - 1))
return makeTargetOutOfRangeError(G, B, E);
- uint32_t EncodedImm = (static_cast<uint32_t>(Delta) >> 2) << 5;
+ uint32_t EncodedImm =
+ ((static_cast<uint32_t>(Delta) >> 2) & 0x7ffff) << 5;
uint32_t FixedInstr = RawInstr | EncodedImm;
*(ulittle32_t *)FixupPtr = FixedInstr;
break;
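
The added "& 0x7ffff" matters for negative deltas: the two's-complement high bits survive the shift and, without the mask, would be OR'd into instruction bits above the 19-bit, 4-byte-scaled immediate that this fixup writes at bits [23:5]. A standalone illustration with a hypothetical delta:

#include <cassert>
#include <cstdint>

int main() {
  int32_t Delta = -8; // in range for a 19-bit, 4-byte-scaled PC-relative field
  uint32_t Unmasked = (static_cast<uint32_t>(Delta) >> 2) << 5;
  uint32_t Masked = ((static_cast<uint32_t>(Delta) >> 2) & 0x7ffff) << 5;
  assert((Unmasked & ~0xffffe0u) != 0); // clobbers bits outside imm19 at [23:5]
  assert((Masked & ~0xffffe0u) == 0);   // confined to the immediate field
  return 0;
}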
@@ -683,6 +701,10 @@ void link_MachO_arm64(std::unique_ptr<LinkGraph> G,
else
Config.PrePrunePasses.push_back(markAllSymbolsLive);
+ // Add compact unwind splitter pass.
+ Config.PrePrunePasses.push_back(
+ CompactUnwindSplitter("__LD,__compact_unwind"));
+
// Add an in-place GOT/Stubs pass.
Config.PostPrunePasses.push_back(
PerGraphGOTAndPLTStubsBuilder_MachO_arm64::asPass);
@@ -711,6 +733,10 @@ const char *getMachOARM64RelocationKindName(Edge::Kind R) {
return "GOTPage21";
case GOTPageOffset12:
return "GOTPageOffset12";
+ case TLVPage21:
+ return "TLVPage21";
+ case TLVPageOffset12:
+ return "TLVPageOffset12";
case PointerToGOT:
return "PointerToGOT";
case PairedAddend:
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
index 61d5c5e21ff1..a4fcd3b9a5f5 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
@@ -170,7 +170,7 @@ private:
auto ToSymbolSec = findSectionByIndex(UnsignedRI.r_symbolnum - 1);
if (!ToSymbolSec)
return ToSymbolSec.takeError();
- ToSymbol = getSymbolByAddress(ToSymbolSec->Address);
+ ToSymbol = getSymbolByAddress(*ToSymbolSec, ToSymbolSec->Address);
assert(ToSymbol && "No symbol for section");
FixupValue -= ToSymbol->getAddress();
}
@@ -216,14 +216,18 @@ private:
continue;
}
- // Skip relocations for debug symbols.
+ auto NSec =
+ findSectionByIndex(Obj.getSectionIndex(S.getRawDataRefImpl()));
+ if (!NSec)
+ return NSec.takeError();
+
+ // Skip relocations for MachO sections without corresponding graph
+ // sections.
{
- auto &NSec =
- getSectionByIndex(Obj.getSectionIndex(S.getRawDataRefImpl()));
- if (!NSec.GraphSection) {
+ if (!NSec->GraphSection) {
LLVM_DEBUG({
dbgs() << " Skipping relocations for MachO section "
- << NSec.SegName << "/" << NSec.SectName
+ << NSec->SegName << "/" << NSec->SectName
<< " which has no associated graph section\n";
});
continue;
@@ -240,16 +244,14 @@ private:
JITTargetAddress FixupAddress = SectionAddress + (uint32_t)RI.r_address;
LLVM_DEBUG({
- auto &NSec =
- getSectionByIndex(Obj.getSectionIndex(S.getRawDataRefImpl()));
- dbgs() << " " << NSec.SectName << " + "
+ dbgs() << " " << NSec->SectName << " + "
<< formatv("{0:x8}", RI.r_address) << ":\n";
});
// Find the block that the fixup points to.
Block *BlockToFix = nullptr;
{
- auto SymbolToFixOrErr = findSymbolByAddress(FixupAddress);
+ auto SymbolToFixOrErr = findSymbolByAddress(*NSec, FixupAddress);
if (!SymbolToFixOrErr)
return SymbolToFixOrErr.takeError();
BlockToFix = &SymbolToFixOrErr->getBlock();
@@ -270,7 +272,7 @@ private:
Symbol *TargetSymbol = nullptr;
uint64_t Addend = 0;
- // Sanity check the relocation kind.
+ // Validate the relocation kind.
auto MachORelocKind = getRelocKind(RI);
if (!MachORelocKind)
return MachORelocKind.takeError();
@@ -300,7 +302,7 @@ private:
else
return TargetSymbolOrErr.takeError();
Addend = *(const little32_t *)FixupContent;
- Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable;
+ Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable;
if (FixupOffset < 3)
return make_error<JITLinkError>("GOTLD at invalid offset " +
formatv("{0}", FixupOffset));
@@ -319,7 +321,10 @@ private:
else
return TargetSymbolOrErr.takeError();
Addend = *(const little32_t *)FixupContent;
- Kind = x86_64::RequestTLVPAndTransformToPCRel32TLVPLoadRelaxable;
+ Kind = x86_64::RequestTLVPAndTransformToPCRel32TLVPLoadREXRelaxable;
+ if (FixupOffset < 3)
+ return make_error<JITLinkError>("TLV at invalid offset " +
+ formatv("{0}", FixupOffset));
break;
case MachOPointer32:
if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum))
@@ -339,7 +344,11 @@ private:
break;
case MachOPointer64Anon: {
JITTargetAddress TargetAddress = *(const ulittle64_t *)FixupContent;
- if (auto TargetSymbolOrErr = findSymbolByAddress(TargetAddress))
+ auto TargetNSec = findSectionByIndex(RI.r_symbolnum - 1);
+ if (!TargetNSec)
+ return TargetNSec.takeError();
+ if (auto TargetSymbolOrErr =
+ findSymbolByAddress(*TargetNSec, TargetAddress))
TargetSymbol = &*TargetSymbolOrErr;
else
return TargetSymbolOrErr.takeError();
@@ -360,7 +369,11 @@ private:
case MachOPCRel32Anon: {
JITTargetAddress TargetAddress =
FixupAddress + 4 + *(const little32_t *)FixupContent;
- if (auto TargetSymbolOrErr = findSymbolByAddress(TargetAddress))
+ auto TargetNSec = findSectionByIndex(RI.r_symbolnum - 1);
+ if (!TargetNSec)
+ return TargetNSec.takeError();
+ if (auto TargetSymbolOrErr =
+ findSymbolByAddress(*TargetNSec, TargetAddress))
TargetSymbol = &*TargetSymbolOrErr;
else
return TargetSymbolOrErr.takeError();
@@ -376,7 +389,11 @@ private:
1ULL << (*MachORelocKind - MachOPCRel32Minus1Anon));
JITTargetAddress TargetAddress =
FixupAddress + Delta + *(const little32_t *)FixupContent;
- if (auto TargetSymbolOrErr = findSymbolByAddress(TargetAddress))
+ auto TargetNSec = findSectionByIndex(RI.r_symbolnum - 1);
+ if (!TargetNSec)
+ return TargetNSec.takeError();
+ if (auto TargetSymbolOrErr =
+ findSymbolByAddress(*TargetNSec, TargetAddress))
TargetSymbol = &*TargetSymbolOrErr;
else
return TargetSymbolOrErr.takeError();
@@ -417,157 +434,15 @@ private:
}
};
-class PerGraphGOTAndPLTStubsBuilder_MachO_x86_64
- : public PerGraphGOTAndPLTStubsBuilder<
- PerGraphGOTAndPLTStubsBuilder_MachO_x86_64> {
-public:
-
- using PerGraphGOTAndPLTStubsBuilder<
- PerGraphGOTAndPLTStubsBuilder_MachO_x86_64>::
- PerGraphGOTAndPLTStubsBuilder;
-
- bool isGOTEdgeToFix(Edge &E) const {
- return E.getKind() == x86_64::RequestGOTAndTransformToDelta32 ||
- E.getKind() ==
- x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable;
- }
-
- Symbol &createGOTEntry(Symbol &Target) {
- return x86_64::createAnonymousPointer(G, getGOTSection(), &Target);
- }
-
- void fixGOTEdge(Edge &E, Symbol &GOTEntry) {
- // Fix the edge kind.
- switch (E.getKind()) {
- case x86_64::RequestGOTAndTransformToDelta32:
- E.setKind(x86_64::Delta32);
- break;
- case x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable:
- E.setKind(x86_64::PCRel32GOTLoadRelaxable);
- break;
- default:
- llvm_unreachable("Not a GOT transform edge");
- }
- // Fix the target, leave the addend as-is.
- E.setTarget(GOTEntry);
- }
-
- bool isExternalBranchEdge(Edge &E) {
- return E.getKind() == x86_64::BranchPCRel32 && E.getTarget().isExternal();
- }
-
- Symbol &createPLTStub(Symbol &Target) {
- return x86_64::createAnonymousPointerJumpStub(G, getStubsSection(),
- getGOTEntry(Target));
- }
-
- void fixPLTEdge(Edge &E, Symbol &Stub) {
- assert(E.getKind() == x86_64::BranchPCRel32 && "Not a Branch32 edge?");
- assert(E.getAddend() == 0 &&
- "BranchPCRel32 edge has unexpected addend value");
-
- // Set the edge kind to BranchPCRel32ToPtrJumpStubRelaxable. We will use
- // this to check for stub optimization opportunities in the
- // optimizeMachO_x86_64_GOTAndStubs pass below.
- E.setKind(x86_64::BranchPCRel32ToPtrJumpStubRelaxable);
- E.setTarget(Stub);
- }
-
-private:
- Section &getGOTSection() {
- if (!GOTSection)
- GOTSection = &G.createSection("$__GOT", sys::Memory::MF_READ);
- return *GOTSection;
- }
-
- Section &getStubsSection() {
- if (!StubsSection) {
- auto StubsProt = static_cast<sys::Memory::ProtectionFlags>(
- sys::Memory::MF_READ | sys::Memory::MF_EXEC);
- StubsSection = &G.createSection("$__STUBS", StubsProt);
- }
- return *StubsSection;
- }
-
- Section *GOTSection = nullptr;
- Section *StubsSection = nullptr;
-};
-
-} // namespace
-
-static Error optimizeMachO_x86_64_GOTAndStubs(LinkGraph &G) {
- LLVM_DEBUG(dbgs() << "Optimizing GOT entries and stubs:\n");
-
- for (auto *B : G.blocks())
- for (auto &E : B->edges())
- if (E.getKind() == x86_64::PCRel32GOTLoadRelaxable) {
- assert(E.getOffset() >= 3 && "GOT edge occurs too early in block");
-
- // Optimize GOT references.
- auto &GOTBlock = E.getTarget().getBlock();
- assert(GOTBlock.getSize() == G.getPointerSize() &&
- "GOT entry block should be pointer sized");
- assert(GOTBlock.edges_size() == 1 &&
- "GOT entry should only have one outgoing edge");
-
- auto &GOTTarget = GOTBlock.edges().begin()->getTarget();
- JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset();
- JITTargetAddress TargetAddr = GOTTarget.getAddress();
-
- // Check that this is a recognized MOV instruction.
- // FIXME: Can we assume this?
- constexpr uint8_t MOVQRIPRel[] = {0x48, 0x8b};
- if (strncmp(B->getContent().data() + E.getOffset() - 3,
- reinterpret_cast<const char *>(MOVQRIPRel), 2) != 0)
- continue;
-
- int64_t Displacement = TargetAddr - EdgeAddr + 4;
- if (Displacement >= std::numeric_limits<int32_t>::min() &&
- Displacement <= std::numeric_limits<int32_t>::max()) {
- E.setTarget(GOTTarget);
- E.setKind(x86_64::Delta32);
- E.setAddend(E.getAddend() - 4);
- char *BlockData = B->getMutableContent(G).data();
- BlockData[E.getOffset() - 2] = (char)0x8d;
- LLVM_DEBUG({
- dbgs() << " Replaced GOT load wih LEA:\n ";
- printEdge(dbgs(), *B, E, x86_64::getEdgeKindName(E.getKind()));
- dbgs() << "\n";
- });
- }
- } else if (E.getKind() == x86_64::BranchPCRel32ToPtrJumpStubRelaxable) {
- auto &StubBlock = E.getTarget().getBlock();
- assert(StubBlock.getSize() == sizeof(x86_64::PointerJumpStubContent) &&
- "Stub block should be stub sized");
- assert(StubBlock.edges_size() == 1 &&
- "Stub block should only have one outgoing edge");
-
- auto &GOTBlock = StubBlock.edges().begin()->getTarget().getBlock();
- assert(GOTBlock.getSize() == G.getPointerSize() &&
- "GOT block should be pointer sized");
- assert(GOTBlock.edges_size() == 1 &&
- "GOT block should only have one outgoing edge");
-
- auto &GOTTarget = GOTBlock.edges().begin()->getTarget();
- JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset();
- JITTargetAddress TargetAddr = GOTTarget.getAddress();
-
- int64_t Displacement = TargetAddr - EdgeAddr + 4;
- if (Displacement >= std::numeric_limits<int32_t>::min() &&
- Displacement <= std::numeric_limits<int32_t>::max()) {
- E.setKind(x86_64::BranchPCRel32);
- E.setTarget(GOTTarget);
- LLVM_DEBUG({
- dbgs() << " Replaced stub branch with direct branch:\n ";
- printEdge(dbgs(), *B, E, x86_64::getEdgeKindName(E.getKind()));
- dbgs() << "\n";
- });
- }
- }
-
+Error buildGOTAndStubs_MachO_x86_64(LinkGraph &G) {
+ x86_64::GOTTableManager GOT;
+ x86_64::PLTTableManager PLT(GOT);
+ visitExistingEdges(G, GOT, PLT);
return Error::success();
}
+} // namespace
+
namespace llvm {
namespace jitlink {
@@ -582,7 +457,7 @@ public:
private:
Error applyFixup(LinkGraph &G, Block &B, const Edge &E) const {
- return x86_64::applyFixup(G, B, E);
+ return x86_64::applyFixup(G, B, E, nullptr);
}
};
@@ -604,6 +479,10 @@ void link_MachO_x86_64(std::unique_ptr<LinkGraph> G,
Config.PrePrunePasses.push_back(createEHFrameSplitterPass_MachO_x86_64());
Config.PrePrunePasses.push_back(createEHFrameEdgeFixerPass_MachO_x86_64());
+ // Add compact unwind splitter pass.
+ Config.PrePrunePasses.push_back(
+ CompactUnwindSplitter("__LD,__compact_unwind"));
+
// Add a mark-live pass.
if (auto MarkLive = Ctx->getMarkLivePass(G->getTargetTriple()))
Config.PrePrunePasses.push_back(std::move(MarkLive));
@@ -611,11 +490,10 @@ void link_MachO_x86_64(std::unique_ptr<LinkGraph> G,
Config.PrePrunePasses.push_back(markAllSymbolsLive);
// Add an in-place GOT/Stubs pass.
- Config.PostPrunePasses.push_back(
- PerGraphGOTAndPLTStubsBuilder_MachO_x86_64::asPass);
+ Config.PostPrunePasses.push_back(buildGOTAndStubs_MachO_x86_64);
// Add GOT/Stubs optimizer pass.
- Config.PreFixupPasses.push_back(optimizeMachO_x86_64_GOTAndStubs);
+ Config.PreFixupPasses.push_back(x86_64::optimizeGOTAndStubAccesses);
}
if (auto Err = Ctx->modifyPassConfig(*G, Config))
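The MachO/x86-64 hunk above deletes the backend-specific PerGraphGOTAndPLTStubsBuilder and its GOT/stub optimizer, and instead drives the shared x86_64::GOTTableManager and x86_64::PLTTableManager over the graph's existing edges. The following is a minimal, LLVM-independent sketch of that table-manager pattern; Edge, GOTManager and the entry-naming scheme are simplified stand-ins for illustration, not the JITLink API.

// Minimal, LLVM-independent sketch of the "table manager" pattern that
// buildGOTAndStubs_MachO_x86_64 now delegates to. Edge and GOTManager are
// simplified stand-ins, not JITLink types.
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Edge {
  enum Kind { Delta32, RequestGOTAndTransformToDelta32 };
  Kind K;
  std::string Target; // name of the symbol the edge currently points at
};

class GOTManager {
public:
  // Returns true if this manager recognized (and rewrote) the edge.
  bool visitEdge(Edge &E) {
    if (E.K != Edge::RequestGOTAndTransformToDelta32)
      return false;
    // Lazily create one GOT entry per target, then point the edge at the
    // entry and downgrade the kind to a plain delta the fixup code knows.
    E.Target = getOrCreateEntry(E.Target);
    E.K = Edge::Delta32;
    return true;
  }

private:
  std::string getOrCreateEntry(const std::string &Target) {
    auto It = Entries.find(Target);
    if (It == Entries.end())
      It = Entries.emplace(Target, "$__GOT." + Target).first;
    return It->second;
  }
  std::map<std::string, std::string> Entries; // one entry per distinct target
};

int main() {
  std::vector<Edge> Edges = {
      {Edge::RequestGOTAndTransformToDelta32, "foo"},
      {Edge::RequestGOTAndTransformToDelta32, "foo"},
      {Edge::Delta32, "bar"}};
  GOTManager GOT;
  for (auto &E : Edges)
    GOT.visitEdge(E); // visitExistingEdges(G, GOT, PLT) plays this role above
  for (auto &E : Edges)
    std::cout << E.Target << " kind=" << E.K << "\n";
}

Because the managers live in the shared x86_64 support code, the MachO backend can reuse the same GOT/PLT construction and the same optimizeGOTAndStubAccesses pass instead of carrying its own copies.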
diff --git a/llvm/lib/ExecutionEngine/JITLink/MemoryFlags.cpp b/llvm/lib/ExecutionEngine/JITLink/MemoryFlags.cpp
new file mode 100644
index 000000000000..b73a310b2910
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/JITLink/MemoryFlags.cpp
@@ -0,0 +1,32 @@
+//===------------- MemoryFlags.cpp - Memory allocation flags --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/JITLink/MemoryFlags.h"
+
+#define DEBUG_TYPE "jitlink"
+
+namespace llvm {
+namespace jitlink {
+
+raw_ostream &operator<<(raw_ostream &OS, MemProt MP) {
+ return OS << (((MP & MemProt::Read) != MemProt::None) ? 'R' : '-')
+ << (((MP & MemProt::Write) != MemProt::None) ? 'W' : '-')
+ << (((MP & MemProt::Exec) != MemProt::None) ? 'X' : '-');
+}
+
+raw_ostream &operator<<(raw_ostream &OS, MemDeallocPolicy MDP) {
+ return OS << (MDP == MemDeallocPolicy::Standard ? "standard" : "finalize");
+}
+
+raw_ostream &operator<<(raw_ostream &OS, AllocGroup AG) {
+ return OS << '(' << AG.getMemProt() << ", " << AG.getMemDeallocPolicy()
+ << ')';
+}
+
+} // end namespace jitlink
+} // end namespace llvm
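MemProt in the new MemoryFlags.cpp is a scoped bitmask enum, and the printer above renders it like a Unix permission triple. Below is a standalone sketch of the same pattern using a local MemProt stand-in rather than the llvm::jitlink type.

// Standalone sketch of the bitmask-enum printing used above. This MemProt is
// a local stand-in, not the llvm::jitlink type.
#include <cstdint>
#include <iostream>

enum class MemProt : uint8_t { None = 0, Read = 1, Write = 2, Exec = 4 };

constexpr MemProt operator&(MemProt A, MemProt B) {
  return MemProt(uint8_t(A) & uint8_t(B));
}
constexpr MemProt operator|(MemProt A, MemProt B) {
  return MemProt(uint8_t(A) | uint8_t(B));
}

std::ostream &operator<<(std::ostream &OS, MemProt MP) {
  // Same rendering as the JITLink printer: one character per permission.
  return OS << (((MP & MemProt::Read) != MemProt::None) ? 'R' : '-')
            << (((MP & MemProt::Write) != MemProt::None) ? 'W' : '-')
            << (((MP & MemProt::Exec) != MemProt::None) ? 'X' : '-');
}

int main() {
  std::cout << (MemProt::Read | MemProt::Exec) << "\n"; // prints "R-X"
}

Keeping the protection flags in a scoped enum with explicit bitwise operators keeps them distinct from plain integers while still allowing cheap mask tests.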
diff --git a/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp b/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp
new file mode 100644
index 000000000000..6dccc4811885
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp
@@ -0,0 +1,30 @@
+//===---- aarch64.cpp - Generic JITLink aarch64 edge kinds, utilities -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Generic utilities for graphs representing aarch64 objects.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/JITLink/aarch64.h"
+
+#define DEBUG_TYPE "jitlink"
+
+namespace llvm {
+namespace jitlink {
+namespace aarch64 {
+
+const char *getEdgeKindName(Edge::Kind K) {
+ switch (K) {
+ case R_AARCH64_CALL26:
+ return "R_AARCH64_CALL26";
+ }
+ return getGenericEdgeKindName(K);
+}
+} // namespace aarch64
+} // namespace jitlink
+} // namespace llvm
diff --git a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp
index c951ed6d95be..48521280059d 100644
--- a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp
@@ -24,6 +24,8 @@ const char *getEdgeKindName(Edge::Kind K) {
return "Pointer64";
case Pointer32:
return "Pointer32";
+ case Pointer32Signed:
+ return "Pointer32Signed";
case Delta64:
return "Delta64";
case Delta32:
@@ -32,22 +34,32 @@ const char *getEdgeKindName(Edge::Kind K) {
return "NegDelta64";
case NegDelta32:
return "NegDelta32";
+ case Delta64FromGOT:
+ return "Delta64FromGOT";
case BranchPCRel32:
return "BranchPCRel32";
case BranchPCRel32ToPtrJumpStub:
return "BranchPCRel32ToPtrJumpStub";
- case BranchPCRel32ToPtrJumpStubRelaxable:
- return "BranchPCRel32ToPtrJumpStubRelaxable";
+ case BranchPCRel32ToPtrJumpStubBypassable:
+ return "BranchPCRel32ToPtrJumpStubBypassable";
case RequestGOTAndTransformToDelta32:
return "RequestGOTAndTransformToDelta32";
+ case RequestGOTAndTransformToDelta64:
+ return "RequestGOTAndTransformToDelta64";
+ case RequestGOTAndTransformToDelta64FromGOT:
+ return "RequestGOTAndTransformToDelta64FromGOT";
+ case PCRel32GOTLoadREXRelaxable:
+ return "PCRel32GOTLoadREXRelaxable";
+ case RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable:
+ return "RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable";
case PCRel32GOTLoadRelaxable:
return "PCRel32GOTLoadRelaxable";
case RequestGOTAndTransformToPCRel32GOTLoadRelaxable:
return "RequestGOTAndTransformToPCRel32GOTLoadRelaxable";
- case PCRel32TLVPLoadRelaxable:
- return "PCRel32TLVPLoadRelaxable";
- case RequestTLVPAndTransformToPCRel32TLVPLoadRelaxable:
- return "RequestTLVPAndTransformToPCRel32TLVPLoadRelaxable";
+ case PCRel32TLVPLoadREXRelaxable:
+ return "PCRel32TLVPLoadREXRelaxable";
+ case RequestTLVPAndTransformToPCRel32TLVPLoadREXRelaxable:
+ return "RequestTLVPAndTransformToPCRel32TLVPLoadREXRelaxable";
default:
return getGenericEdgeKindName(static_cast<Edge::Kind>(K));
}
@@ -59,6 +71,119 @@ const char NullPointerContent[PointerSize] = {0x00, 0x00, 0x00, 0x00,
const char PointerJumpStubContent[6] = {
static_cast<char>(0xFFu), 0x25, 0x00, 0x00, 0x00, 0x00};
+Error optimizeGOTAndStubAccesses(LinkGraph &G) {
+ LLVM_DEBUG(dbgs() << "Optimizing GOT entries and stubs:\n");
+
+ for (auto *B : G.blocks())
+ for (auto &E : B->edges()) {
+ if (E.getKind() == x86_64::PCRel32GOTLoadRelaxable ||
+ E.getKind() == x86_64::PCRel32GOTLoadREXRelaxable) {
+#ifndef NDEBUG
+ bool REXPrefix = E.getKind() == x86_64::PCRel32GOTLoadREXRelaxable;
+ assert(E.getOffset() >= (REXPrefix ? 3u : 2u) &&
+ "GOT edge occurs too early in block");
+#endif
+ auto *FixupData = reinterpret_cast<uint8_t *>(
+ const_cast<char *>(B->getContent().data())) +
+ E.getOffset();
+ const uint8_t Op = FixupData[-2];
+ const uint8_t ModRM = FixupData[-1];
+
+ auto &GOTEntryBlock = E.getTarget().getBlock();
+ assert(GOTEntryBlock.getSize() == G.getPointerSize() &&
+ "GOT entry block should be pointer sized");
+ assert(GOTEntryBlock.edges_size() == 1 &&
+ "GOT entry should only have one outgoing edge");
+ auto &GOTTarget = GOTEntryBlock.edges().begin()->getTarget();
+ JITTargetAddress TargetAddr = GOTTarget.getAddress();
+ JITTargetAddress EdgeAddr = B->getFixupAddress(E);
+ int64_t Displacement = TargetAddr - EdgeAddr + 4;
+ bool TargetInRangeForImmU32 = isInRangeForImmU32(TargetAddr);
+ bool DisplacementInRangeForImmS32 = isInRangeForImmS32(Displacement);
+
+        // If both the target address and the displacement are out of range,
+        // there is no optimization opportunity.
+ if (!(TargetInRangeForImmU32 || DisplacementInRangeForImmS32))
+ continue;
+
+ // Transform "mov foo@GOTPCREL(%rip),%reg" to "lea foo(%rip),%reg".
+ if (Op == 0x8b && DisplacementInRangeForImmS32) {
+ FixupData[-2] = 0x8d;
+ E.setKind(x86_64::Delta32);
+ E.setTarget(GOTTarget);
+ E.setAddend(E.getAddend() - 4);
+ LLVM_DEBUG({
+          dbgs() << "  Replaced GOT load with LEA:\n    ";
+ printEdge(dbgs(), *B, E, getEdgeKindName(E.getKind()));
+ dbgs() << "\n";
+ });
+ continue;
+ }
+
+ // Transform call/jmp instructions
+ if (Op == 0xff && TargetInRangeForImmU32) {
+ if (ModRM == 0x15) {
+            // The ABI says we can convert "call *foo@GOTPCREL(%rip)" to
+            // "nop; call foo", but lld converts it to "addr32 call foo"
+            // because that keeps the result a single instruction.
+ FixupData[-2] = 0x67;
+ FixupData[-1] = 0xe8;
+ LLVM_DEBUG({
+              dbgs() << "  replaced call instruction's memory operand with imm "
+ "operand:\n ";
+ printEdge(dbgs(), *B, E, getEdgeKindName(E.getKind()));
+ dbgs() << "\n";
+ });
+ } else {
+ // Transform "jmp *foo@GOTPCREL(%rip)" to "jmp foo; nop"
+ assert(ModRM == 0x25 && "Invalid ModRm for call/jmp instructions");
+ FixupData[-2] = 0xe9;
+ FixupData[3] = 0x90;
+ E.setOffset(E.getOffset() - 1);
+ LLVM_DEBUG({
+              dbgs() << "  replaced jmp instruction's memory operand with imm "
+ "operand:\n ";
+ printEdge(dbgs(), *B, E, getEdgeKindName(E.getKind()));
+ dbgs() << "\n";
+ });
+ }
+ E.setKind(x86_64::Pointer32);
+ E.setTarget(GOTTarget);
+ continue;
+ }
+ } else if (E.getKind() == x86_64::BranchPCRel32ToPtrJumpStubBypassable) {
+ auto &StubBlock = E.getTarget().getBlock();
+ assert(StubBlock.getSize() == sizeof(PointerJumpStubContent) &&
+ "Stub block should be stub sized");
+ assert(StubBlock.edges_size() == 1 &&
+ "Stub block should only have one outgoing edge");
+
+ auto &GOTBlock = StubBlock.edges().begin()->getTarget().getBlock();
+ assert(GOTBlock.getSize() == G.getPointerSize() &&
+ "GOT block should be pointer sized");
+ assert(GOTBlock.edges_size() == 1 &&
+ "GOT block should only have one outgoing edge");
+
+ auto &GOTTarget = GOTBlock.edges().begin()->getTarget();
+ JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset();
+ JITTargetAddress TargetAddr = GOTTarget.getAddress();
+
+ int64_t Displacement = TargetAddr - EdgeAddr + 4;
+ if (isInRangeForImmS32(Displacement)) {
+ E.setKind(x86_64::BranchPCRel32);
+ E.setTarget(GOTTarget);
+ LLVM_DEBUG({
+ dbgs() << " Replaced stub branch with direct branch:\n ";
+ printEdge(dbgs(), *B, E, getEdgeKindName(E.getKind()));
+ dbgs() << "\n";
+ });
+ }
+ }
+ }
+
+ return Error::success();
+}
+
} // end namespace x86_64
} // end namespace jitlink
} // end namespace llvm
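The new optimizeGOTAndStubAccesses pass patches instruction bytes in place: a GOT load (opcode 0x8b) becomes an LEA (0x8d) when the displacement to the real target fits a signed 32-bit immediate, and a call or jump through the GOT (opcode 0xff with ModRM 0x15 or 0x25) becomes "addr32 call" or "jmp; nop" against a Pointer32 target when the target address fits an unsigned 32-bit immediate. The sketch below replays those three byte-level rewrites; Fixup points at the 4-byte displacement field, mirroring FixupData above, and the instruction buffer in main is a hand-written example, not data taken from a real link.

// Byte-level sketch of the three relaxations applied above.
#include <cstdint>
#include <cstdio>

// mov foo@GOTPCREL(%rip), %rax  ->  lea foo(%rip), %rax
void relaxGOTLoadToLEA(uint8_t *Fixup) {
  // The opcode byte sits two bytes before the displacement.
  if (Fixup[-2] == 0x8b)
    Fixup[-2] = 0x8d;
}

// call *foo@GOTPCREL(%rip)  ->  addr32 call foo
void relaxGOTCall(uint8_t *Fixup) {
  if (Fixup[-2] == 0xff && Fixup[-1] == 0x15) {
    Fixup[-2] = 0x67; // addr32 prefix keeps the instruction length at 6
    Fixup[-1] = 0xe8; // direct call with a 32-bit immediate
  }
}

// jmp *foo@GOTPCREL(%rip)  ->  jmp foo; nop
void relaxGOTJmp(uint8_t *Fixup) {
  if (Fixup[-2] == 0xff && Fixup[-1] == 0x25) {
    Fixup[-2] = 0xe9; // direct jmp; its immediate starts one byte earlier,
                      // which is why the edge offset moves back by 1 above
    Fixup[3] = 0x90;  // pad the now-unused trailing byte with a nop
  }
}

int main() {
  uint8_t MovInsn[] = {0x48, 0x8b, 0x05, 0, 0, 0, 0}; // movq 0(%rip), %rax
  relaxGOTLoadToLEA(MovInsn + 3);                     // Fixup = disp32 field
  std::printf("opcode after relaxation: %#x\n", MovInsn[1]); // 0x8d (lea)
}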
diff --git a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index 144329aa8bea..200f42aec067 100644
--- a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -218,8 +218,7 @@ void MCJIT::generateCodeForModule(Module *M) {
std::string Buf;
raw_string_ostream OS(Buf);
logAllUnhandledErrors(LoadedObject.takeError(), OS);
- OS.flush();
- report_fatal_error(Buf);
+ report_fatal_error(Twine(OS.str()));
}
std::unique_ptr<RuntimeDyld::LoadedObjectInfo> L =
Dyld.loadObject(*LoadedObject.get());
diff --git a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
index 5b73c0e2fbc8..9ff6cec8c6c5 100644
--- a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
@@ -184,6 +184,8 @@ void CompileOnDemandLayer::emit(
CompileOnDemandLayer::PerDylibResources &
CompileOnDemandLayer::getPerDylibResources(JITDylib &TargetD) {
+ std::lock_guard<std::mutex> Lock(CODLayerMutex);
+
auto I = DylibResources.find(&TargetD);
if (I == DylibResources.end()) {
auto &ImplD =
diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp
index 12a501f7f98c..64e5090e4c53 100644
--- a/llvm/lib/ExecutionEngine/Orc/Core.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp
@@ -29,7 +29,6 @@ char SymbolsNotFound::ID = 0;
char SymbolsCouldNotBeRemoved::ID = 0;
char MissingSymbolDefinitions::ID = 0;
char UnexpectedSymbolDefinitions::ID = 0;
-char Task::ID = 0;
char MaterializationTask::ID = 0;
RegisterDependenciesFunction NoDependenciesToRegister =
@@ -90,14 +89,17 @@ void FailedToMaterialize::log(raw_ostream &OS) const {
OS << "Failed to materialize symbols: " << *Symbols;
}
-SymbolsNotFound::SymbolsNotFound(SymbolNameSet Symbols) {
+SymbolsNotFound::SymbolsNotFound(std::shared_ptr<SymbolStringPool> SSP,
+ SymbolNameSet Symbols)
+ : SSP(std::move(SSP)) {
for (auto &Sym : Symbols)
this->Symbols.push_back(Sym);
assert(!this->Symbols.empty() && "Can not fail to resolve an empty set");
}
-SymbolsNotFound::SymbolsNotFound(SymbolNameVector Symbols)
- : Symbols(std::move(Symbols)) {
+SymbolsNotFound::SymbolsNotFound(std::shared_ptr<SymbolStringPool> SSP,
+ SymbolNameVector Symbols)
+ : SSP(std::move(SSP)), Symbols(std::move(Symbols)) {
assert(!this->Symbols.empty() && "Can not fail to resolve an empty set");
}
@@ -109,8 +111,9 @@ void SymbolsNotFound::log(raw_ostream &OS) const {
OS << "Symbols not found: " << Symbols;
}
-SymbolsCouldNotBeRemoved::SymbolsCouldNotBeRemoved(SymbolNameSet Symbols)
- : Symbols(std::move(Symbols)) {
+SymbolsCouldNotBeRemoved::SymbolsCouldNotBeRemoved(
+ std::shared_ptr<SymbolStringPool> SSP, SymbolNameSet Symbols)
+ : SSP(std::move(SSP)), Symbols(std::move(Symbols)) {
assert(!this->Symbols.empty() && "Can not fail to resolve an empty set");
}
@@ -1333,11 +1336,13 @@ Error JITDylib::remove(const SymbolNameSet &Names) {
// If any of the symbols are not defined, return an error.
if (!Missing.empty())
- return make_error<SymbolsNotFound>(std::move(Missing));
+ return make_error<SymbolsNotFound>(ES.getSymbolStringPool(),
+ std::move(Missing));
// If any of the symbols are currently materializing, return an error.
if (!Materializing.empty())
- return make_error<SymbolsCouldNotBeRemoved>(std::move(Materializing));
+ return make_error<SymbolsCouldNotBeRemoved>(ES.getSymbolStringPool(),
+ std::move(Materializing));
// Remove the symbols.
for (auto &SymbolMaterializerItrPair : SymbolsToRemove) {
@@ -1793,8 +1798,6 @@ void Platform::lookupInitSymbolsAsync(
}
}
-void Task::anchor() {}
-
void MaterializationTask::printDescription(raw_ostream &OS) {
OS << "Materialization task: " << MU->getName() << " in "
<< MR->getTargetJITDylib().getName();
@@ -2086,8 +2089,8 @@ Error ExecutionSession::registerJITDispatchHandlers(
}
void ExecutionSession::runJITDispatchHandler(
- ExecutorProcessControl::SendResultFunction SendResult,
- JITTargetAddress HandlerFnTagAddr, ArrayRef<char> ArgBuffer) {
+ SendResultFunction SendResult, JITTargetAddress HandlerFnTagAddr,
+ ArrayRef<char> ArgBuffer) {
std::shared_ptr<JITDispatchHandlerFunction> F;
{
@@ -2234,7 +2237,8 @@ Error ExecutionSession::IL_updateCandidatesFor(
// weakly referenced" specific error here to reduce confusion.
if (SymI->second.getFlags().hasMaterializationSideEffectsOnly() &&
SymLookupFlags != SymbolLookupFlags::WeaklyReferencedSymbol)
- return make_error<SymbolsNotFound>(SymbolNameVector({Name}));
+ return make_error<SymbolsNotFound>(getSymbolStringPool(),
+ SymbolNameVector({Name}));
// If we matched against this symbol but it is in the error state
// then bail out and treat it as a failure to materialize.
@@ -2422,7 +2426,7 @@ void ExecutionSession::OL_applyQueryPhase1(
} else {
LLVM_DEBUG(dbgs() << "Phase 1 failed with unresolved symbols.\n");
IPLS->fail(make_error<SymbolsNotFound>(
- IPLS->DefGeneratorCandidates.getSymbolNames()));
+ getSymbolStringPool(), IPLS->DefGeneratorCandidates.getSymbolNames()));
}
}
@@ -2492,7 +2496,8 @@ void ExecutionSession::OL_completeLookup(
dbgs() << "error: "
"required, but symbol is has-side-effects-only\n";
});
- return make_error<SymbolsNotFound>(SymbolNameVector({Name}));
+ return make_error<SymbolsNotFound>(getSymbolStringPool(),
+ SymbolNameVector({Name}));
}
// If we matched against this symbol but it is in the error state
@@ -2594,7 +2599,7 @@ void ExecutionSession::OL_completeLookup(
}
}
- LLVM_DEBUG(dbgs() << "Stripping unmatched weakly-refererced symbols\n");
+ LLVM_DEBUG(dbgs() << "Stripping unmatched weakly-referenced symbols\n");
IPLS->LookupSet.forEachWithRemoval(
[&](const SymbolStringPtr &Name, SymbolLookupFlags SymLookupFlags) {
if (SymLookupFlags == SymbolLookupFlags::WeaklyReferencedSymbol) {
@@ -2606,7 +2611,8 @@ void ExecutionSession::OL_completeLookup(
if (!IPLS->LookupSet.empty()) {
LLVM_DEBUG(dbgs() << "Failing due to unresolved symbols\n");
- return make_error<SymbolsNotFound>(IPLS->LookupSet.getSymbolNames());
+ return make_error<SymbolsNotFound>(getSymbolStringPool(),
+ IPLS->LookupSet.getSymbolNames());
}
// Record whether the query completed.
@@ -2733,7 +2739,8 @@ void ExecutionSession::OL_completeLookupFlags(
if (!IPLS->LookupSet.empty()) {
LLVM_DEBUG(dbgs() << "Failing due to unresolved symbols\n");
- return make_error<SymbolsNotFound>(IPLS->LookupSet.getSymbolNames());
+ return make_error<SymbolsNotFound>(getSymbolStringPool(),
+ IPLS->LookupSet.getSymbolNames());
}
LLVM_DEBUG(dbgs() << "Succeded, result = " << Result << "\n");
@@ -2911,6 +2918,7 @@ void ExecutionSession::dumpDispatchInfo(Task &T) {
runSessionLocked([&]() {
dbgs() << "Dispatching: ";
T.printDescription(dbgs());
+ dbgs() << "\n";
});
}
#endif // NDEBUG
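The Core.cpp changes thread a std::shared_ptr<SymbolStringPool> into SymbolsNotFound and SymbolsCouldNotBeRemoved, presumably so the pooled strings an error refers to remain valid even if the error outlives the session that interned them. A minimal keep-alive sketch of that ownership pattern, using plain stand-in types rather than the ORC classes:

// Keep-alive sketch of the pattern behind the new SymbolsNotFound
// constructors: the error object co-owns the pool that backs its strings.
#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Pool {
  std::vector<std::string> Interned; // backing storage for pooled names
  ~Pool() { std::cout << "pool destroyed\n"; }
};

class NotFoundError {
public:
  NotFoundError(std::shared_ptr<Pool> SSP, std::vector<std::string> Symbols)
      : SSP(std::move(SSP)), Symbols(std::move(Symbols)) {}
  void log() const {
    std::cout << "Symbols not found:";
    for (auto &S : Symbols)
      std::cout << ' ' << S;
    std::cout << '\n';
  }

private:
  std::shared_ptr<Pool> SSP; // keeps the pool alive as long as the error lives
  std::vector<std::string> Symbols;
};

int main() {
  auto SSP = std::make_shared<Pool>();
  NotFoundError Err(SSP, {"foo", "bar"});
  SSP.reset(); // the session drops its reference...
  Err.log();   // ...but the error still owns the pool
}              // "pool destroyed" prints only here, after the error dies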
diff --git a/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp
index 36efc744bf30..fcfe389f82a8 100644
--- a/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp
@@ -1,10 +1,15 @@
-//===---- DebugObjectManagerPlugin.h - JITLink debug objects ---*- C++ -*-===//
+//===------- DebugObjectManagerPlugin.cpp - JITLink debug objects ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+//
+// FIXME: Update Plugin to poke the debug object into a new JITLink section,
+// rather than creating a new allocation.
+//
+//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h"
@@ -108,70 +113,77 @@ void ELFDebugObjectSection<ELFT>::dump(raw_ostream &OS, StringRef Name) {
}
}
-static constexpr sys::Memory::ProtectionFlags ReadOnly =
- static_cast<sys::Memory::ProtectionFlags>(sys::Memory::MF_READ);
-
enum class Requirement {
// Request final target memory load-addresses for all sections.
ReportFinalSectionLoadAddresses,
};
-/// The plugin creates a debug object from JITLinkContext when JITLink starts
-/// processing the corresponding LinkGraph. It provides access to the pass
-/// configuration of the LinkGraph and calls the finalization function, once
-/// the resulting link artifact was emitted.
+/// The plugin creates a debug object when JITLink starts processing the
+/// corresponding LinkGraph. It provides access to the pass configuration of
+/// the LinkGraph and calls the finalization function once the resulting link
+/// artifact has been emitted.
///
class DebugObject {
public:
- DebugObject(JITLinkContext &Ctx, ExecutionSession &ES) : Ctx(Ctx), ES(ES) {}
+ DebugObject(JITLinkMemoryManager &MemMgr, const JITLinkDylib *JD,
+ ExecutionSession &ES)
+ : MemMgr(MemMgr), JD(JD), ES(ES) {}
void set(Requirement Req) { Reqs.insert(Req); }
bool has(Requirement Req) const { return Reqs.count(Req) > 0; }
- using FinalizeContinuation = std::function<void(Expected<sys::MemoryBlock>)>;
+ using FinalizeContinuation = std::function<void(Expected<ExecutorAddrRange>)>;
+
void finalizeAsync(FinalizeContinuation OnFinalize);
virtual ~DebugObject() {
- if (Alloc)
- if (Error Err = Alloc->deallocate())
+ if (Alloc) {
+ std::vector<FinalizedAlloc> Allocs;
+ Allocs.push_back(std::move(Alloc));
+ if (Error Err = MemMgr.deallocate(std::move(Allocs)))
ES.reportError(std::move(Err));
+ }
}
virtual void reportSectionTargetMemoryRange(StringRef Name,
SectionRange TargetMem) {}
protected:
- using Allocation = JITLinkMemoryManager::Allocation;
+ using InFlightAlloc = JITLinkMemoryManager::InFlightAlloc;
+ using FinalizedAlloc = JITLinkMemoryManager::FinalizedAlloc;
- virtual Expected<std::unique_ptr<Allocation>>
- finalizeWorkingMemory(JITLinkContext &Ctx) = 0;
+ virtual Expected<SimpleSegmentAlloc> finalizeWorkingMemory() = 0;
+
+ JITLinkMemoryManager &MemMgr;
+ const JITLinkDylib *JD = nullptr;
private:
- JITLinkContext &Ctx;
ExecutionSession &ES;
std::set<Requirement> Reqs;
- std::unique_ptr<Allocation> Alloc{nullptr};
+ FinalizedAlloc Alloc;
};
// Finalize working memory and take ownership of the resulting allocation. Start
// copying memory over to the target and pass on the result once we're done.
// Ownership of the allocation remains with us for the rest of our lifetime.
void DebugObject::finalizeAsync(FinalizeContinuation OnFinalize) {
- assert(Alloc == nullptr && "Cannot finalize more than once");
-
- auto AllocOrErr = finalizeWorkingMemory(Ctx);
- if (!AllocOrErr)
- OnFinalize(AllocOrErr.takeError());
- Alloc = std::move(*AllocOrErr);
-
- Alloc->finalizeAsync([this, OnFinalize](Error Err) {
- if (Err)
- OnFinalize(std::move(Err));
- else
- OnFinalize(sys::MemoryBlock(
- jitTargetAddressToPointer<void *>(Alloc->getTargetMemory(ReadOnly)),
- Alloc->getWorkingMemory(ReadOnly).size()));
- });
+ assert(!Alloc && "Cannot finalize more than once");
+
+ if (auto SimpleSegAlloc = finalizeWorkingMemory()) {
+ auto ROSeg = SimpleSegAlloc->getSegInfo(MemProt::Read);
+ ExecutorAddrRange DebugObjRange(ExecutorAddr(ROSeg.Addr),
+ ExecutorAddrDiff(ROSeg.WorkingMem.size()));
+ SimpleSegAlloc->finalize(
+ [this, DebugObjRange,
+ OnFinalize = std::move(OnFinalize)](Expected<FinalizedAlloc> FA) {
+ if (FA) {
+ Alloc = std::move(*FA);
+ OnFinalize(DebugObjRange);
+ } else
+ OnFinalize(FA.takeError());
+ });
+ } else
+ OnFinalize(SimpleSegAlloc.takeError());
}
/// The current implementation of ELFDebugObject replicates the approach used in
@@ -190,8 +202,7 @@ public:
StringRef getBuffer() const { return Buffer->getMemBufferRef().getBuffer(); }
protected:
- Expected<std::unique_ptr<Allocation>>
- finalizeWorkingMemory(JITLinkContext &Ctx) override;
+ Expected<SimpleSegmentAlloc> finalizeWorkingMemory() override;
template <typename ELFT>
Error recordSection(StringRef Name,
@@ -201,15 +212,16 @@ protected:
private:
template <typename ELFT>
static Expected<std::unique_ptr<ELFDebugObject>>
- CreateArchType(MemoryBufferRef Buffer, JITLinkContext &Ctx,
- ExecutionSession &ES);
+ CreateArchType(MemoryBufferRef Buffer, JITLinkMemoryManager &MemMgr,
+ const JITLinkDylib *JD, ExecutionSession &ES);
static std::unique_ptr<WritableMemoryBuffer>
CopyBuffer(MemoryBufferRef Buffer, Error &Err);
ELFDebugObject(std::unique_ptr<WritableMemoryBuffer> Buffer,
- JITLinkContext &Ctx, ExecutionSession &ES)
- : DebugObject(Ctx, ES), Buffer(std::move(Buffer)) {
+ JITLinkMemoryManager &MemMgr, const JITLinkDylib *JD,
+ ExecutionSession &ES)
+ : DebugObject(MemMgr, JD, ES), Buffer(std::move(Buffer)) {
set(Requirement::ReportFinalSectionLoadAddresses);
}
@@ -244,13 +256,14 @@ ELFDebugObject::CopyBuffer(MemoryBufferRef Buffer, Error &Err) {
template <typename ELFT>
Expected<std::unique_ptr<ELFDebugObject>>
-ELFDebugObject::CreateArchType(MemoryBufferRef Buffer, JITLinkContext &Ctx,
- ExecutionSession &ES) {
+ELFDebugObject::CreateArchType(MemoryBufferRef Buffer,
+ JITLinkMemoryManager &MemMgr,
+ const JITLinkDylib *JD, ExecutionSession &ES) {
using SectionHeader = typename ELFT::Shdr;
Error Err = Error::success();
std::unique_ptr<ELFDebugObject> DebugObj(
- new ELFDebugObject(CopyBuffer(Buffer, Err), Ctx, ES));
+ new ELFDebugObject(CopyBuffer(Buffer, Err), MemMgr, JD, ES));
if (Err)
return std::move(Err);
@@ -299,23 +312,26 @@ ELFDebugObject::Create(MemoryBufferRef Buffer, JITLinkContext &Ctx,
if (Class == ELF::ELFCLASS32) {
if (Endian == ELF::ELFDATA2LSB)
- return CreateArchType<ELF32LE>(Buffer, Ctx, ES);
+ return CreateArchType<ELF32LE>(Buffer, Ctx.getMemoryManager(),
+ Ctx.getJITLinkDylib(), ES);
if (Endian == ELF::ELFDATA2MSB)
- return CreateArchType<ELF32BE>(Buffer, Ctx, ES);
+ return CreateArchType<ELF32BE>(Buffer, Ctx.getMemoryManager(),
+ Ctx.getJITLinkDylib(), ES);
return nullptr;
}
if (Class == ELF::ELFCLASS64) {
if (Endian == ELF::ELFDATA2LSB)
- return CreateArchType<ELF64LE>(Buffer, Ctx, ES);
+ return CreateArchType<ELF64LE>(Buffer, Ctx.getMemoryManager(),
+ Ctx.getJITLinkDylib(), ES);
if (Endian == ELF::ELFDATA2MSB)
- return CreateArchType<ELF64BE>(Buffer, Ctx, ES);
+ return CreateArchType<ELF64BE>(Buffer, Ctx.getMemoryManager(),
+ Ctx.getJITLinkDylib(), ES);
return nullptr;
}
return nullptr;
}
-Expected<std::unique_ptr<DebugObject::Allocation>>
-ELFDebugObject::finalizeWorkingMemory(JITLinkContext &Ctx) {
+Expected<SimpleSegmentAlloc> ELFDebugObject::finalizeWorkingMemory() {
LLVM_DEBUG({
dbgs() << "Section load-addresses in debug object for \""
<< Buffer->getBufferIdentifier() << "\":\n";
@@ -324,28 +340,21 @@ ELFDebugObject::finalizeWorkingMemory(JITLinkContext &Ctx) {
});
// TODO: This works, but what actual alignment requirements do we have?
- unsigned Alignment = sys::Process::getPageSizeEstimate();
- JITLinkMemoryManager &MemMgr = Ctx.getMemoryManager();
- const JITLinkDylib *JD = Ctx.getJITLinkDylib();
+ unsigned PageSize = sys::Process::getPageSizeEstimate();
size_t Size = Buffer->getBufferSize();
// Allocate working memory for debug object in read-only segment.
- JITLinkMemoryManager::SegmentsRequestMap SingleReadOnlySegment;
- SingleReadOnlySegment[ReadOnly] =
- JITLinkMemoryManager::SegmentRequest(Alignment, Size, 0);
-
- auto AllocOrErr = MemMgr.allocate(JD, SingleReadOnlySegment);
- if (!AllocOrErr)
- return AllocOrErr.takeError();
+ auto Alloc = SimpleSegmentAlloc::Create(
+ MemMgr, JD, {{MemProt::Read, {Size, Align(PageSize)}}});
+ if (!Alloc)
+ return Alloc;
// Initialize working memory with a copy of our object buffer.
- // TODO: Use our buffer as working memory directly.
- std::unique_ptr<Allocation> Alloc = std::move(*AllocOrErr);
- MutableArrayRef<char> WorkingMem = Alloc->getWorkingMemory(ReadOnly);
- memcpy(WorkingMem.data(), Buffer->getBufferStart(), Size);
+ auto SegInfo = Alloc->getSegInfo(MemProt::Read);
+ memcpy(SegInfo.WorkingMem.data(), Buffer->getBufferStart(), Size);
Buffer.reset();
- return std::move(Alloc);
+ return Alloc;
}
void ELFDebugObject::reportSectionTargetMemoryRange(StringRef Name,
@@ -447,7 +456,7 @@ Error DebugObjectManagerPlugin::notifyEmitted(
std::future<MSVCPError> FinalizeErr = FinalizePromise.get_future();
It->second->finalizeAsync(
- [this, &FinalizePromise, &MR](Expected<sys::MemoryBlock> TargetMem) {
+ [this, &FinalizePromise, &MR](Expected<ExecutorAddrRange> TargetMem) {
// Any failure here will fail materialization.
if (!TargetMem) {
FinalizePromise.set_value(TargetMem.takeError());
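The DebugObjectManagerPlugin rewrite moves from the old Allocation interface to SimpleSegmentAlloc: allocate one read-only segment, copy the debug object into its working memory, finalize asynchronously, and keep the resulting FinalizedAlloc for later deallocation. Below is a simplified, LLVM-independent sketch of that allocate/copy/finalize-async flow; Allocator, Segment and FinalizedAlloc here are illustrative stand-ins, not JITLink types.

// Simplified allocate -> copy -> finalize-async flow mirroring the rewritten
// ELFDebugObject::finalizeWorkingMemory / DebugObject::finalizeAsync.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

struct FinalizedAlloc {
  uint64_t TargetAddr = 0; // handle retained until deallocation
};

struct Segment {
  std::vector<char> WorkingMem; // host-side buffer we can write into
  uint64_t TargetAddr;          // where the bytes will live in the executor

  // JITLink finalization is asynchronous; model it with a callback.
  void finalizeAsync(std::function<void(FinalizedAlloc)> OnFinalize) {
    OnFinalize(FinalizedAlloc{TargetAddr});
  }
};

struct Allocator {
  Segment allocateReadOnly(std::size_t Size) {
    return {std::vector<char>(Size), 0x1000};
  }
};

int main() {
  std::string DebugObj = "ELF...debug-object-bytes...";
  Allocator MemMgr;

  // 1. Allocate read-only working memory sized to the buffer.
  Segment Seg = MemMgr.allocateReadOnly(DebugObj.size());

  // 2. Copy the debug object into the working memory.
  std::memcpy(Seg.WorkingMem.data(), DebugObj.data(), DebugObj.size());

  // 3. Finalize asynchronously and hand the target range to the continuation.
  Seg.finalizeAsync([&](FinalizedAlloc FA) {
    std::cout << "debug object finalized at 0x" << std::hex << FA.TargetAddr
              << std::dec << ", size " << DebugObj.size() << "\n";
  });
}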
diff --git a/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp
new file mode 100644
index 000000000000..8479495623b8
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp
@@ -0,0 +1,450 @@
+//===------- DebuggerSupportPlugin.cpp - Utils for debugger support -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/DebuggerSupportPlugin.h"
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/BinaryFormat/MachO.h"
+
+#define DEBUG_TYPE "orc"
+
+using namespace llvm;
+using namespace llvm::jitlink;
+using namespace llvm::orc;
+
+static const char *SynthDebugSectionName = "__jitlink_synth_debug_object";
+
+namespace {
+
+struct MachO64LE {
+ using UIntPtr = uint64_t;
+
+ using Header = MachO::mach_header_64;
+ using SegmentLC = MachO::segment_command_64;
+ using Section = MachO::section_64;
+ using NList = MachO::nlist_64;
+
+ static constexpr support::endianness Endianness = support::little;
+ static constexpr const uint32_t Magic = MachO::MH_MAGIC_64;
+ static constexpr const uint32_t SegmentCmd = MachO::LC_SEGMENT_64;
+};
+
+class MachODebugObjectSynthesizerBase
+ : public GDBJITDebugInfoRegistrationPlugin::DebugSectionSynthesizer {
+public:
+ static bool isDebugSection(Section &Sec) {
+ return Sec.getName().startswith("__DWARF,");
+ }
+
+ MachODebugObjectSynthesizerBase(LinkGraph &G, ExecutorAddr RegisterActionAddr)
+ : G(G), RegisterActionAddr(RegisterActionAddr) {}
+ virtual ~MachODebugObjectSynthesizerBase() {}
+
+ Error preserveDebugSections() {
+ if (G.findSectionByName(SynthDebugSectionName)) {
+ LLVM_DEBUG({
+ dbgs() << "MachODebugObjectSynthesizer skipping graph " << G.getName()
+ << " which contains an unexpected existing "
+ << SynthDebugSectionName << " section.\n";
+ });
+ return Error::success();
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "MachODebugObjectSynthesizer visiting graph " << G.getName()
+ << "\n";
+ });
+ for (auto &Sec : G.sections()) {
+ if (!isDebugSection(Sec))
+ continue;
+ // Preserve blocks in this debug section by marking one existing symbol
+ // live for each block, and introducing a new live, anonymous symbol for
+ // each currently unreferenced block.
+ LLVM_DEBUG({
+ dbgs() << " Preserving debug section " << Sec.getName() << "\n";
+ });
+ SmallSet<Block *, 8> PreservedBlocks;
+ for (auto *Sym : Sec.symbols()) {
+ bool NewPreservedBlock =
+ PreservedBlocks.insert(&Sym->getBlock()).second;
+ if (NewPreservedBlock)
+ Sym->setLive(true);
+ }
+ for (auto *B : Sec.blocks())
+ if (!PreservedBlocks.count(B))
+ G.addAnonymousSymbol(*B, 0, 0, false, true);
+ }
+ return Error::success();
+ }
+
+protected:
+ LinkGraph &G;
+ ExecutorAddr RegisterActionAddr;
+};
+
+template <typename MachOTraits>
+class MachODebugObjectSynthesizer : public MachODebugObjectSynthesizerBase {
+private:
+ class MachOStructWriter {
+ public:
+ MachOStructWriter(MutableArrayRef<char> Buffer) : Buffer(Buffer) {}
+
+ size_t getOffset() const { return Offset; }
+
+ template <typename MachOStruct> void write(MachOStruct S) {
+ assert(Offset + sizeof(S) <= Buffer.size() &&
+ "Container block overflow while constructing debug MachO");
+ if (MachOTraits::Endianness != support::endian::system_endianness())
+ MachO::swapStruct(S);
+ memcpy(Buffer.data() + Offset, &S, sizeof(S));
+ Offset += sizeof(S);
+ }
+
+ private:
+ MutableArrayRef<char> Buffer;
+ size_t Offset = 0;
+ };
+
+public:
+ using MachODebugObjectSynthesizerBase::MachODebugObjectSynthesizerBase;
+
+ Error startSynthesis() override {
+ LLVM_DEBUG({
+ dbgs() << "Creating " << SynthDebugSectionName << " for " << G.getName()
+ << "\n";
+ });
+ auto &SDOSec = G.createSection(SynthDebugSectionName, MemProt::Read);
+
+ struct DebugSectionInfo {
+ Section *Sec = nullptr;
+ StringRef SegName;
+ StringRef SecName;
+ JITTargetAddress Alignment = 0;
+ JITTargetAddress StartAddr = 0;
+ uint64_t Size = 0;
+ };
+
+ SmallVector<DebugSectionInfo, 12> DebugSecInfos;
+ size_t NumSections = 0;
+ for (auto &Sec : G.sections()) {
+ if (llvm::empty(Sec.blocks()))
+ continue;
+
+ ++NumSections;
+ if (isDebugSection(Sec)) {
+ size_t SepPos = Sec.getName().find(',');
+ if (SepPos > 16 || (Sec.getName().size() - (SepPos + 1) > 16)) {
+ LLVM_DEBUG({
+ dbgs() << "Skipping debug object synthesis for graph "
+ << G.getName()
+ << ": encountered non-standard DWARF section name \""
+ << Sec.getName() << "\"\n";
+ });
+ return Error::success();
+ }
+ DebugSecInfos.push_back({&Sec, Sec.getName().substr(0, SepPos),
+ Sec.getName().substr(SepPos + 1), 0, 0});
+ } else
+ NonDebugSections.push_back(&Sec);
+ }
+
+ // Create container block.
+ size_t SectionsCmdSize =
+ sizeof(typename MachOTraits::Section) * NumSections;
+ size_t SegmentLCSize =
+ sizeof(typename MachOTraits::SegmentLC) + SectionsCmdSize;
+ size_t ContainerBlockSize =
+ sizeof(typename MachOTraits::Header) + SegmentLCSize;
+ auto ContainerBlockContent = G.allocateBuffer(ContainerBlockSize);
+ MachOContainerBlock =
+ &G.createMutableContentBlock(SDOSec, ContainerBlockContent, 0, 8, 0);
+
+ // Copy debug section blocks and symbols.
+ JITTargetAddress NextBlockAddr = MachOContainerBlock->getSize();
+ for (auto &SI : DebugSecInfos) {
+ assert(!llvm::empty(SI.Sec->blocks()) && "Empty debug info section?");
+
+ // Update addresses in debug section.
+ LLVM_DEBUG({
+ dbgs() << " Appending " << SI.Sec->getName() << " ("
+ << SI.Sec->blocks_size() << " block(s)) at "
+ << formatv("{0:x8}", NextBlockAddr) << "\n";
+ });
+ for (auto *B : SI.Sec->blocks()) {
+ NextBlockAddr = alignToBlock(NextBlockAddr, *B);
+ B->setAddress(NextBlockAddr);
+ NextBlockAddr += B->getSize();
+ }
+
+ auto &FirstBlock = **SI.Sec->blocks().begin();
+ if (FirstBlock.getAlignmentOffset() != 0)
+ return make_error<StringError>(
+ "First block in " + SI.Sec->getName() +
+ " section has non-zero alignment offset",
+ inconvertibleErrorCode());
+ if (FirstBlock.getAlignment() > std::numeric_limits<uint32_t>::max())
+ return make_error<StringError>("First block in " + SI.Sec->getName() +
+ " has alignment >4Gb",
+ inconvertibleErrorCode());
+
+ SI.Alignment = FirstBlock.getAlignment();
+ SI.StartAddr = FirstBlock.getAddress();
+ SI.Size = NextBlockAddr - SI.StartAddr;
+ G.mergeSections(SDOSec, *SI.Sec);
+ SI.Sec = nullptr;
+ }
+ size_t DebugSectionsSize = NextBlockAddr - MachOContainerBlock->getSize();
+
+ // Write MachO header and debug section load commands.
+ MachOStructWriter Writer(MachOContainerBlock->getAlreadyMutableContent());
+ typename MachOTraits::Header Hdr;
+ memset(&Hdr, 0, sizeof(Hdr));
+ Hdr.magic = MachOTraits::Magic;
+ switch (G.getTargetTriple().getArch()) {
+ case Triple::x86_64:
+ Hdr.cputype = MachO::CPU_TYPE_X86_64;
+ Hdr.cpusubtype = MachO::CPU_SUBTYPE_X86_64_ALL;
+ break;
+ case Triple::aarch64:
+ Hdr.cputype = MachO::CPU_TYPE_ARM64;
+ Hdr.cpusubtype = MachO::CPU_SUBTYPE_ARM64_ALL;
+ break;
+ default:
+ llvm_unreachable("Unsupported architecture");
+ }
+ Hdr.filetype = MachO::MH_OBJECT;
+ Hdr.ncmds = 1;
+ Hdr.sizeofcmds = SegmentLCSize;
+ Hdr.flags = 0;
+ Writer.write(Hdr);
+
+ typename MachOTraits::SegmentLC SegLC;
+ memset(&SegLC, 0, sizeof(SegLC));
+ SegLC.cmd = MachOTraits::SegmentCmd;
+ SegLC.cmdsize = SegmentLCSize;
+ SegLC.vmaddr = ContainerBlockSize;
+ SegLC.vmsize = DebugSectionsSize;
+ SegLC.fileoff = ContainerBlockSize;
+ SegLC.filesize = DebugSectionsSize;
+ SegLC.maxprot =
+ MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE;
+ SegLC.initprot =
+ MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE;
+ SegLC.nsects = NumSections;
+ SegLC.flags = 0;
+ Writer.write(SegLC);
+
+ StringSet<> ExistingLongNames;
+ for (auto &SI : DebugSecInfos) {
+ typename MachOTraits::Section Sec;
+ memset(&Sec, 0, sizeof(Sec));
+ memcpy(Sec.sectname, SI.SecName.data(), SI.SecName.size());
+ memcpy(Sec.segname, SI.SegName.data(), SI.SegName.size());
+ Sec.addr = SI.StartAddr;
+ Sec.size = SI.Size;
+ Sec.offset = SI.StartAddr;
+ Sec.align = SI.Alignment;
+ Sec.reloff = 0;
+ Sec.nreloc = 0;
+ Sec.flags = MachO::S_ATTR_DEBUG;
+ Writer.write(Sec);
+ }
+
+ // Set MachOContainerBlock to indicate success to
+ // completeSynthesisAndRegister.
+ NonDebugSectionsStart = Writer.getOffset();
+ return Error::success();
+ }
+
+ Error completeSynthesisAndRegister() override {
+ if (!MachOContainerBlock) {
+ LLVM_DEBUG({
+ dbgs() << "Not writing MachO debug object header for " << G.getName()
+               << " since startSynthesis failed\n";
+ });
+ return Error::success();
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "Writing MachO debug object header for " << G.getName() << "\n";
+ });
+
+ MachOStructWriter Writer(
+ MachOContainerBlock->getAlreadyMutableContent().drop_front(
+ NonDebugSectionsStart));
+
+ unsigned LongSectionNameIdx = 0;
+ for (auto *Sec : NonDebugSections) {
+ size_t SepPos = Sec->getName().find(',');
+ StringRef SegName, SecName;
+ std::string CustomSecName;
+
+ if ((SepPos == StringRef::npos && Sec->getName().size() <= 16)) {
+ // No embedded segment name, short section name.
+ SegName = "__JITLINK_CUSTOM";
+ SecName = Sec->getName();
+ } else if (SepPos < 16 && (Sec->getName().size() - (SepPos + 1) <= 16)) {
+ // Canonical embedded segment and section name.
+ SegName = Sec->getName().substr(0, SepPos);
+ SecName = Sec->getName().substr(SepPos + 1);
+ } else {
+ // Long section name that needs to be truncated.
+ assert(Sec->getName().size() > 16 &&
+ "Short section name should have been handled above");
+ SegName = "__JITLINK_CUSTOM";
+ auto IdxStr = std::to_string(++LongSectionNameIdx);
+ CustomSecName = Sec->getName().substr(0, 15 - IdxStr.size()).str();
+ CustomSecName += ".";
+ CustomSecName += IdxStr;
+ SecName = StringRef(CustomSecName.data(), 16);
+ }
+
+ SectionRange R(*Sec);
+ if (R.getFirstBlock()->getAlignmentOffset() != 0)
+ return make_error<StringError>(
+ "While building MachO debug object for " + G.getName() +
+ " first block has non-zero alignment offset",
+ inconvertibleErrorCode());
+
+ typename MachOTraits::Section SecCmd;
+ memset(&SecCmd, 0, sizeof(SecCmd));
+ memcpy(SecCmd.sectname, SecName.data(), SecName.size());
+ memcpy(SecCmd.segname, SegName.data(), SegName.size());
+ SecCmd.addr = R.getStart();
+ SecCmd.size = R.getSize();
+ SecCmd.offset = 0;
+ SecCmd.align = R.getFirstBlock()->getAlignment();
+ SecCmd.reloff = 0;
+ SecCmd.nreloc = 0;
+ SecCmd.flags = 0;
+ Writer.write(SecCmd);
+ }
+
+ SectionRange R(MachOContainerBlock->getSection());
+ G.allocActions().push_back(
+ {{RegisterActionAddr.getValue(), R.getStart(), R.getSize()}, {}});
+ return Error::success();
+ }
+
+private:
+ Block *MachOContainerBlock = nullptr;
+ SmallVector<Section *, 16> NonDebugSections;
+ size_t NonDebugSectionsStart = 0;
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+namespace orc {
+
+Expected<std::unique_ptr<GDBJITDebugInfoRegistrationPlugin>>
+GDBJITDebugInfoRegistrationPlugin::Create(ExecutionSession &ES,
+ JITDylib &ProcessJD,
+ const Triple &TT) {
+ auto RegisterActionAddr =
+ TT.isOSBinFormatMachO()
+ ? ES.intern("_llvm_orc_registerJITLoaderGDBAllocAction")
+ : ES.intern("llvm_orc_registerJITLoaderGDBAllocAction");
+
+ if (auto Addr = ES.lookup({&ProcessJD}, RegisterActionAddr))
+ return std::make_unique<GDBJITDebugInfoRegistrationPlugin>(
+ ExecutorAddr(Addr->getAddress()));
+ else
+ return Addr.takeError();
+}
+
+Error GDBJITDebugInfoRegistrationPlugin::notifyFailed(
+ MaterializationResponsibility &MR) {
+ return Error::success();
+}
+
+Error GDBJITDebugInfoRegistrationPlugin::notifyRemovingResources(
+ ResourceKey K) {
+ return Error::success();
+}
+
+void GDBJITDebugInfoRegistrationPlugin::notifyTransferringResources(
+ ResourceKey DstKey, ResourceKey SrcKey) {}
+
+void GDBJITDebugInfoRegistrationPlugin::modifyPassConfig(
+ MaterializationResponsibility &MR, LinkGraph &LG,
+ PassConfiguration &PassConfig) {
+
+ if (LG.getTargetTriple().getObjectFormat() == Triple::MachO)
+ modifyPassConfigForMachO(MR, LG, PassConfig);
+ else {
+ LLVM_DEBUG({
+      dbgs() << "GDBJITDebugInfoRegistrationPlugin skipping unsupported graph "
+             << LG.getName() << " (triple = " << LG.getTargetTriple().str()
+             << ")\n";
+ });
+ }
+}
+
+void GDBJITDebugInfoRegistrationPlugin::modifyPassConfigForMachO(
+ MaterializationResponsibility &MR, jitlink::LinkGraph &LG,
+ jitlink::PassConfiguration &PassConfig) {
+
+ switch (LG.getTargetTriple().getArch()) {
+ case Triple::x86_64:
+ case Triple::aarch64:
+ // Supported, continue.
+ assert(LG.getPointerSize() == 8 && "Graph has incorrect pointer size");
+ assert(LG.getEndianness() == support::little &&
+ "Graph has incorrect endianness");
+ break;
+ default:
+ // Unsupported.
+ LLVM_DEBUG({
+ dbgs() << "GDBJITDebugInfoRegistrationPlugin skipping unsupported "
+ << "MachO graph " << LG.getName()
+ << "(triple = " << LG.getTargetTriple().str()
+ << ", pointer size = " << LG.getPointerSize() << ", endianness = "
+ << (LG.getEndianness() == support::big ? "big" : "little")
+ << ")\n";
+ });
+ return;
+ }
+
+ // Scan for debug sections. If we find one then install passes.
+ bool HasDebugSections = false;
+ for (auto &Sec : LG.sections())
+ if (MachODebugObjectSynthesizerBase::isDebugSection(Sec)) {
+ HasDebugSections = true;
+ break;
+ }
+
+ if (HasDebugSections) {
+ LLVM_DEBUG({
+ dbgs() << "GDBJITDebugInfoRegistrationPlugin: Graph " << LG.getName()
+ << " contains debug info. Installing debugger support passes.\n";
+ });
+
+ auto MDOS = std::make_shared<MachODebugObjectSynthesizer<MachO64LE>>(
+ LG, RegisterActionAddr);
+ PassConfig.PrePrunePasses.push_back(
+ [=](LinkGraph &G) { return MDOS->preserveDebugSections(); });
+ PassConfig.PostPrunePasses.push_back(
+ [=](LinkGraph &G) { return MDOS->startSynthesis(); });
+ PassConfig.PreFixupPasses.push_back(
+ [=](LinkGraph &G) { return MDOS->completeSynthesisAndRegister(); });
+ } else {
+ LLVM_DEBUG({
+ dbgs() << "GDBJITDebugInfoRegistrationPlugin: Graph " << LG.getName()
+ << " contains no debug info. Skipping.\n";
+ });
+ }
+}
+
+} // namespace orc
+} // namespace llvm
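The MachO synthesizer above reserves a container block and then streams fixed-size MachO structs (header, segment load command, section commands) into it through MachOStructWriter. A standalone sketch of that struct-writer idea follows, with tiny placeholder structs instead of the real MachO::mach_header_64 and segment_command_64 layouts; the magic 0xfeedfacf and command 0x19 correspond to MH_MAGIC_64 and LC_SEGMENT_64.

// Standalone sketch of the MachOStructWriter pattern: append fixed-size POD
// structs to a byte buffer at a running offset, with a bounds check.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

class StructWriter {
public:
  explicit StructWriter(std::vector<char> &Buffer) : Buffer(Buffer) {}

  template <typename T> void write(const T &S) {
    assert(Offset + sizeof(S) <= Buffer.size() && "container block overflow");
    std::memcpy(Buffer.data() + Offset, &S, sizeof(S));
    Offset += sizeof(S);
  }

  size_t getOffset() const { return Offset; }

private:
  std::vector<char> &Buffer;
  size_t Offset = 0;
};

// Placeholder layouts; the real writer emits full mach_header_64 /
// segment_command_64 / section_64 structs, byte-swapped if needed.
struct Header { uint32_t magic, ncmds, sizeofcmds; };
struct SegmentLC { uint32_t cmd, cmdsize, nsects; };

int main() {
  std::vector<char> Block(sizeof(Header) + sizeof(SegmentLC));
  StructWriter W(Block);
  W.write(Header{0xfeedfacf, 1, uint32_t(sizeof(SegmentLC))}); // header first
  W.write(SegmentLC{0x19, uint32_t(sizeof(SegmentLC)), 0});    // then the LC
  std::cout << "wrote " << W.getOffset() << " bytes\n";
}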
diff --git a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp
new file mode 100644
index 000000000000..b17d196f01b6
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp
@@ -0,0 +1,818 @@
+//===------- ELFNixPlatform.cpp - Utilities for executing ELF in Orc ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/ELFNixPlatform.h"
+
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/ExecutionEngine/JITLink/ELF_x86_64.h"
+#include "llvm/ExecutionEngine/JITLink/x86_64.h"
+#include "llvm/ExecutionEngine/Orc/DebugUtils.h"
+#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "orc"
+
+using namespace llvm;
+using namespace llvm::orc;
+using namespace llvm::orc::shared;
+
+namespace {
+
+class DSOHandleMaterializationUnit : public MaterializationUnit {
+public:
+ DSOHandleMaterializationUnit(ELFNixPlatform &ENP,
+ const SymbolStringPtr &DSOHandleSymbol)
+ : MaterializationUnit(createDSOHandleSectionSymbols(ENP, DSOHandleSymbol),
+ DSOHandleSymbol),
+ ENP(ENP) {}
+
+ StringRef getName() const override { return "DSOHandleMU"; }
+
+ void materialize(std::unique_ptr<MaterializationResponsibility> R) override {
+ unsigned PointerSize;
+ support::endianness Endianness;
+ jitlink::Edge::Kind EdgeKind;
+ const auto &TT =
+ ENP.getExecutionSession().getExecutorProcessControl().getTargetTriple();
+
+ switch (TT.getArch()) {
+ case Triple::x86_64:
+ PointerSize = 8;
+ Endianness = support::endianness::little;
+ EdgeKind = jitlink::x86_64::Pointer64;
+ break;
+ default:
+ llvm_unreachable("Unrecognized architecture");
+ }
+
+ // void *__dso_handle = &__dso_handle;
+ auto G = std::make_unique<jitlink::LinkGraph>(
+ "<DSOHandleMU>", TT, PointerSize, Endianness,
+ jitlink::getGenericEdgeKindName);
+ auto &DSOHandleSection =
+ G->createSection(".data.__dso_handle", jitlink::MemProt::Read);
+ auto &DSOHandleBlock = G->createContentBlock(
+ DSOHandleSection, getDSOHandleContent(PointerSize), 0, 8, 0);
+ auto &DSOHandleSymbol = G->addDefinedSymbol(
+ DSOHandleBlock, 0, *R->getInitializerSymbol(), DSOHandleBlock.getSize(),
+ jitlink::Linkage::Strong, jitlink::Scope::Default, false, true);
+ DSOHandleBlock.addEdge(EdgeKind, 0, DSOHandleSymbol, 0);
+
+ ENP.getObjectLinkingLayer().emit(std::move(R), std::move(G));
+ }
+
+ void discard(const JITDylib &JD, const SymbolStringPtr &Sym) override {}
+
+private:
+ static SymbolFlagsMap
+ createDSOHandleSectionSymbols(ELFNixPlatform &ENP,
+ const SymbolStringPtr &DSOHandleSymbol) {
+ SymbolFlagsMap SymbolFlags;
+ SymbolFlags[DSOHandleSymbol] = JITSymbolFlags::Exported;
+ return SymbolFlags;
+ }
+
+ ArrayRef<char> getDSOHandleContent(size_t PointerSize) {
+ static const char Content[8] = {0};
+ assert(PointerSize <= sizeof Content);
+ return {Content, PointerSize};
+ }
+
+ ELFNixPlatform &ENP;
+};
+
+StringRef EHFrameSectionName = ".eh_frame";
+StringRef InitArrayFuncSectionName = ".init_array";
+
+StringRef ThreadBSSSectionName = ".tbss";
+StringRef ThreadDataSectionName = ".tdata";
+
+StringRef InitSectionNames[] = {InitArrayFuncSectionName};
+
+} // end anonymous namespace
+
+namespace llvm {
+namespace orc {
+
+Expected<std::unique_ptr<ELFNixPlatform>>
+ELFNixPlatform::Create(ExecutionSession &ES,
+ ObjectLinkingLayer &ObjLinkingLayer,
+ JITDylib &PlatformJD, const char *OrcRuntimePath,
+ Optional<SymbolAliasMap> RuntimeAliases) {
+
+ auto &EPC = ES.getExecutorProcessControl();
+
+ // If the target is not supported then bail out immediately.
+ if (!supportedTarget(EPC.getTargetTriple()))
+ return make_error<StringError>("Unsupported ELFNixPlatform triple: " +
+ EPC.getTargetTriple().str(),
+ inconvertibleErrorCode());
+
+ // Create default aliases if the caller didn't supply any.
+ if (!RuntimeAliases)
+ RuntimeAliases = standardPlatformAliases(ES);
+
+ // Define the aliases.
+ if (auto Err = PlatformJD.define(symbolAliases(std::move(*RuntimeAliases))))
+ return std::move(Err);
+
+ // Add JIT-dispatch function support symbols.
+ if (auto Err = PlatformJD.define(absoluteSymbols(
+ {{ES.intern("__orc_rt_jit_dispatch"),
+ {EPC.getJITDispatchInfo().JITDispatchFunction.getValue(),
+ JITSymbolFlags::Exported}},
+ {ES.intern("__orc_rt_jit_dispatch_ctx"),
+ {EPC.getJITDispatchInfo().JITDispatchContext.getValue(),
+ JITSymbolFlags::Exported}}})))
+ return std::move(Err);
+
+ // Create a generator for the ORC runtime archive.
+ auto OrcRuntimeArchiveGenerator = StaticLibraryDefinitionGenerator::Load(
+ ObjLinkingLayer, OrcRuntimePath, EPC.getTargetTriple());
+ if (!OrcRuntimeArchiveGenerator)
+ return OrcRuntimeArchiveGenerator.takeError();
+
+ // Create the instance.
+ Error Err = Error::success();
+ auto P = std::unique_ptr<ELFNixPlatform>(
+ new ELFNixPlatform(ES, ObjLinkingLayer, PlatformJD,
+ std::move(*OrcRuntimeArchiveGenerator), Err));
+ if (Err)
+ return std::move(Err);
+ return std::move(P);
+}
+
+Error ELFNixPlatform::setupJITDylib(JITDylib &JD) {
+ return JD.define(
+ std::make_unique<DSOHandleMaterializationUnit>(*this, DSOHandleSymbol));
+}
+
+Error ELFNixPlatform::notifyAdding(ResourceTracker &RT,
+ const MaterializationUnit &MU) {
+ auto &JD = RT.getJITDylib();
+ const auto &InitSym = MU.getInitializerSymbol();
+ if (!InitSym)
+ return Error::success();
+
+ RegisteredInitSymbols[&JD].add(InitSym,
+ SymbolLookupFlags::WeaklyReferencedSymbol);
+ LLVM_DEBUG({
+ dbgs() << "ELFNixPlatform: Registered init symbol " << *InitSym
+ << " for MU " << MU.getName() << "\n";
+ });
+ return Error::success();
+}
+
+Error ELFNixPlatform::notifyRemoving(ResourceTracker &RT) {
+ llvm_unreachable("Not supported yet");
+}
+
+static void addAliases(ExecutionSession &ES, SymbolAliasMap &Aliases,
+ ArrayRef<std::pair<const char *, const char *>> AL) {
+ for (auto &KV : AL) {
+ auto AliasName = ES.intern(KV.first);
+ assert(!Aliases.count(AliasName) && "Duplicate symbol name in alias map");
+ Aliases[std::move(AliasName)] = {ES.intern(KV.second),
+ JITSymbolFlags::Exported};
+ }
+}
+
+SymbolAliasMap ELFNixPlatform::standardPlatformAliases(ExecutionSession &ES) {
+ SymbolAliasMap Aliases;
+ addAliases(ES, Aliases, requiredCXXAliases());
+ addAliases(ES, Aliases, standardRuntimeUtilityAliases());
+ return Aliases;
+}
+
+ArrayRef<std::pair<const char *, const char *>>
+ELFNixPlatform::requiredCXXAliases() {
+ static const std::pair<const char *, const char *> RequiredCXXAliases[] = {
+ {"__cxa_atexit", "__orc_rt_elfnix_cxa_atexit"},
+ {"atexit", "__orc_rt_elfnix_atexit"}};
+
+ return ArrayRef<std::pair<const char *, const char *>>(RequiredCXXAliases);
+}
+
+ArrayRef<std::pair<const char *, const char *>>
+ELFNixPlatform::standardRuntimeUtilityAliases() {
+ static const std::pair<const char *, const char *>
+ StandardRuntimeUtilityAliases[] = {
+ {"__orc_rt_run_program", "__orc_rt_elfnix_run_program"},
+ {"__orc_rt_log_error", "__orc_rt_log_error_to_stderr"}};
+
+ return ArrayRef<std::pair<const char *, const char *>>(
+ StandardRuntimeUtilityAliases);
+}
+
+bool ELFNixPlatform::isInitializerSection(StringRef SecName) {
+ for (auto &Name : InitSectionNames) {
+ if (Name.equals(SecName))
+ return true;
+ }
+ return false;
+}
+
+bool ELFNixPlatform::supportedTarget(const Triple &TT) {
+ switch (TT.getArch()) {
+ case Triple::x86_64:
+ return true;
+ default:
+ return false;
+ }
+}
+
+ELFNixPlatform::ELFNixPlatform(
+ ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer,
+ JITDylib &PlatformJD,
+ std::unique_ptr<DefinitionGenerator> OrcRuntimeGenerator, Error &Err)
+ : ES(ES), ObjLinkingLayer(ObjLinkingLayer),
+ DSOHandleSymbol(ES.intern("__dso_handle")) {
+ ErrorAsOutParameter _(&Err);
+
+ ObjLinkingLayer.addPlugin(std::make_unique<ELFNixPlatformPlugin>(*this));
+
+ PlatformJD.addGenerator(std::move(OrcRuntimeGenerator));
+
+ // PlatformJD hasn't been 'set-up' by the platform yet (since we're creating
+ // the platform now), so set it up.
+ if (auto E2 = setupJITDylib(PlatformJD)) {
+ Err = std::move(E2);
+ return;
+ }
+
+ RegisteredInitSymbols[&PlatformJD].add(
+ DSOHandleSymbol, SymbolLookupFlags::WeaklyReferencedSymbol);
+
+ // Associate wrapper function tags with JIT-side function implementations.
+ if (auto E2 = associateRuntimeSupportFunctions(PlatformJD)) {
+ Err = std::move(E2);
+ return;
+ }
+
+ // Lookup addresses of runtime functions callable by the platform,
+ // call the platform bootstrap function to initialize the platform-state
+ // object in the executor.
+ if (auto E2 = bootstrapELFNixRuntime(PlatformJD)) {
+ Err = std::move(E2);
+ return;
+ }
+}
+
+Error ELFNixPlatform::associateRuntimeSupportFunctions(JITDylib &PlatformJD) {
+ ExecutionSession::JITDispatchHandlerAssociationMap WFs;
+
+ using GetInitializersSPSSig =
+ SPSExpected<SPSELFNixJITDylibInitializerSequence>(SPSString);
+ WFs[ES.intern("__orc_rt_elfnix_get_initializers_tag")] =
+ ES.wrapAsyncWithSPS<GetInitializersSPSSig>(
+ this, &ELFNixPlatform::rt_getInitializers);
+
+ using GetDeinitializersSPSSig =
+ SPSExpected<SPSELFJITDylibDeinitializerSequence>(SPSExecutorAddr);
+ WFs[ES.intern("__orc_rt_elfnix_get_deinitializers_tag")] =
+ ES.wrapAsyncWithSPS<GetDeinitializersSPSSig>(
+ this, &ELFNixPlatform::rt_getDeinitializers);
+
+ using LookupSymbolSPSSig =
+ SPSExpected<SPSExecutorAddr>(SPSExecutorAddr, SPSString);
+ WFs[ES.intern("__orc_rt_elfnix_symbol_lookup_tag")] =
+ ES.wrapAsyncWithSPS<LookupSymbolSPSSig>(this,
+ &ELFNixPlatform::rt_lookupSymbol);
+
+ return ES.registerJITDispatchHandlers(PlatformJD, std::move(WFs));
+}
+
+void ELFNixPlatform::getInitializersBuildSequencePhase(
+ SendInitializerSequenceFn SendResult, JITDylib &JD,
+ std::vector<JITDylibSP> DFSLinkOrder) {
+ ELFNixJITDylibInitializerSequence FullInitSeq;
+ {
+ std::lock_guard<std::mutex> Lock(PlatformMutex);
+ for (auto &InitJD : reverse(DFSLinkOrder)) {
+ LLVM_DEBUG({
+ dbgs() << "ELFNixPlatform: Appending inits for \"" << InitJD->getName()
+ << "\" to sequence\n";
+ });
+ auto ISItr = InitSeqs.find(InitJD.get());
+ if (ISItr != InitSeqs.end()) {
+ FullInitSeq.emplace_back(std::move(ISItr->second));
+ InitSeqs.erase(ISItr);
+ }
+ }
+ }
+
+ SendResult(std::move(FullInitSeq));
+}
+
+void ELFNixPlatform::getInitializersLookupPhase(
+ SendInitializerSequenceFn SendResult, JITDylib &JD) {
+
+ auto DFSLinkOrder = JD.getDFSLinkOrder();
+ DenseMap<JITDylib *, SymbolLookupSet> NewInitSymbols;
+ ES.runSessionLocked([&]() {
+ for (auto &InitJD : DFSLinkOrder) {
+ auto RISItr = RegisteredInitSymbols.find(InitJD.get());
+ if (RISItr != RegisteredInitSymbols.end()) {
+ NewInitSymbols[InitJD.get()] = std::move(RISItr->second);
+ RegisteredInitSymbols.erase(RISItr);
+ }
+ }
+ });
+
+ // If there are no further init symbols to look up then move on to the next
+ // phase.
+ if (NewInitSymbols.empty()) {
+ getInitializersBuildSequencePhase(std::move(SendResult), JD,
+ std::move(DFSLinkOrder));
+ return;
+ }
+
+ // Otherwise issue a lookup and re-run this phase when it completes.
+ lookupInitSymbolsAsync(
+ [this, SendResult = std::move(SendResult), &JD](Error Err) mutable {
+ if (Err)
+ SendResult(std::move(Err));
+ else
+ getInitializersLookupPhase(std::move(SendResult), JD);
+ },
+ ES, std::move(NewInitSymbols));
+}
+
+void ELFNixPlatform::rt_getInitializers(SendInitializerSequenceFn SendResult,
+ StringRef JDName) {
+ LLVM_DEBUG({
+ dbgs() << "ELFNixPlatform::rt_getInitializers(\"" << JDName << "\")\n";
+ });
+
+ JITDylib *JD = ES.getJITDylibByName(JDName);
+ if (!JD) {
+ LLVM_DEBUG({
+ dbgs() << " No such JITDylib \"" << JDName << "\". Sending error.\n";
+ });
+ SendResult(make_error<StringError>("No JITDylib named " + JDName,
+ inconvertibleErrorCode()));
+ return;
+ }
+
+ getInitializersLookupPhase(std::move(SendResult), *JD);
+}
+
+void ELFNixPlatform::rt_getDeinitializers(
+ SendDeinitializerSequenceFn SendResult, ExecutorAddr Handle) {
+ LLVM_DEBUG({
+ dbgs() << "ELFNixPlatform::rt_getDeinitializers(\""
+ << formatv("{0:x}", Handle.getValue()) << "\")\n";
+ });
+
+ JITDylib *JD = nullptr;
+
+ {
+ std::lock_guard<std::mutex> Lock(PlatformMutex);
+ auto I = HandleAddrToJITDylib.find(Handle.getValue());
+ if (I != HandleAddrToJITDylib.end())
+ JD = I->second;
+ }
+
+ if (!JD) {
+ LLVM_DEBUG({
+ dbgs() << " No JITDylib for handle "
+ << formatv("{0:x}", Handle.getValue()) << "\n";
+ });
+ SendResult(make_error<StringError>("No JITDylib associated with handle " +
+ formatv("{0:x}", Handle.getValue()),
+ inconvertibleErrorCode()));
+ return;
+ }
+
+ SendResult(ELFNixJITDylibDeinitializerSequence());
+}
+
+void ELFNixPlatform::rt_lookupSymbol(SendSymbolAddressFn SendResult,
+ ExecutorAddr Handle,
+ StringRef SymbolName) {
+ LLVM_DEBUG({
+ dbgs() << "ELFNixPlatform::rt_lookupSymbol(\""
+ << formatv("{0:x}", Handle.getValue()) << "\")\n";
+ });
+
+ JITDylib *JD = nullptr;
+
+ {
+ std::lock_guard<std::mutex> Lock(PlatformMutex);
+ auto I = HandleAddrToJITDylib.find(Handle.getValue());
+ if (I != HandleAddrToJITDylib.end())
+ JD = I->second;
+ }
+
+ if (!JD) {
+ LLVM_DEBUG({
+ dbgs() << " No JITDylib for handle "
+ << formatv("{0:x}", Handle.getValue()) << "\n";
+ });
+ SendResult(make_error<StringError>("No JITDylib associated with handle " +
+ formatv("{0:x}", Handle.getValue()),
+ inconvertibleErrorCode()));
+ return;
+ }
+
+ // Use functor class to work around XL build compiler issue on AIX.
+ class RtLookupNotifyComplete {
+ public:
+ RtLookupNotifyComplete(SendSymbolAddressFn &&SendResult)
+ : SendResult(std::move(SendResult)) {}
+ void operator()(Expected<SymbolMap> Result) {
+ if (Result) {
+ assert(Result->size() == 1 && "Unexpected result map count");
+ SendResult(ExecutorAddr(Result->begin()->second.getAddress()));
+ } else {
+ SendResult(Result.takeError());
+ }
+ }
+
+ private:
+ SendSymbolAddressFn SendResult;
+ };
+
+ ES.lookup(
+ LookupKind::DLSym, {{JD, JITDylibLookupFlags::MatchExportedSymbolsOnly}},
+ SymbolLookupSet(ES.intern(SymbolName)), SymbolState::Ready,
+ RtLookupNotifyComplete(std::move(SendResult)), NoDependenciesToRegister);
+}
+
+Error ELFNixPlatform::bootstrapELFNixRuntime(JITDylib &PlatformJD) {
+
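+  // Runtime entry points required by the platform. Their addresses are looked
+  // up below and recorded in the corresponding ExecutorAddr members.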
+ std::pair<const char *, ExecutorAddr *> Symbols[] = {
+ {"__orc_rt_elfnix_platform_bootstrap", &orc_rt_elfnix_platform_bootstrap},
+ {"__orc_rt_elfnix_platform_shutdown", &orc_rt_elfnix_platform_shutdown},
+ {"__orc_rt_elfnix_register_object_sections",
+ &orc_rt_elfnix_register_object_sections},
+ {"__orc_rt_elfnix_create_pthread_key",
+ &orc_rt_elfnix_create_pthread_key}};
+
+ SymbolLookupSet RuntimeSymbols;
+ std::vector<std::pair<SymbolStringPtr, ExecutorAddr *>> AddrsToRecord;
+ for (const auto &KV : Symbols) {
+ auto Name = ES.intern(KV.first);
+ RuntimeSymbols.add(Name);
+ AddrsToRecord.push_back({std::move(Name), KV.second});
+ }
+
+ auto RuntimeSymbolAddrs = ES.lookup(
+ {{&PlatformJD, JITDylibLookupFlags::MatchAllSymbols}}, RuntimeSymbols);
+ if (!RuntimeSymbolAddrs)
+ return RuntimeSymbolAddrs.takeError();
+
+ for (const auto &KV : AddrsToRecord) {
+ auto &Name = KV.first;
+ assert(RuntimeSymbolAddrs->count(Name) && "Missing runtime symbol?");
+ KV.second->setValue((*RuntimeSymbolAddrs)[Name].getAddress());
+ }
+
+ auto PJDDSOHandle = ES.lookup(
+ {{&PlatformJD, JITDylibLookupFlags::MatchAllSymbols}}, DSOHandleSymbol);
+ if (!PJDDSOHandle)
+ return PJDDSOHandle.takeError();
+
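+  // Call the runtime's bootstrap entry point, passing the platform JITDylib's
+  // __dso_handle address so the executor can set up its platform state.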
+ if (auto Err = ES.callSPSWrapper<void(uint64_t)>(
+ orc_rt_elfnix_platform_bootstrap, PJDDSOHandle->getAddress()))
+ return Err;
+
+ // FIXME: Ordering is fuzzy here. We're probably best off saying
+ // "behavior is undefined if code that uses the runtime is added before
+ // the platform constructor returns", then move all this to the constructor.
+ RuntimeBootstrapped = true;
+ std::vector<ELFPerObjectSectionsToRegister> DeferredPOSRs;
+ {
+ std::lock_guard<std::mutex> Lock(PlatformMutex);
+ DeferredPOSRs = std::move(BootstrapPOSRs);
+ }
+
+ for (auto &D : DeferredPOSRs)
+ if (auto Err = registerPerObjectSections(D))
+ return Err;
+
+ return Error::success();
+}
+
+Error ELFNixPlatform::registerInitInfo(
+ JITDylib &JD, ArrayRef<jitlink::Section *> InitSections) {
+
+ std::unique_lock<std::mutex> Lock(PlatformMutex);
+
+ ELFNixJITDylibInitializers *InitSeq = nullptr;
+ {
+ auto I = InitSeqs.find(&JD);
+ if (I == InitSeqs.end()) {
+ // If there's no init sequence entry yet then we need to look up the
+ // header symbol to force creation of one.
+ Lock.unlock();
+
+ auto SearchOrder =
+ JD.withLinkOrderDo([](const JITDylibSearchOrder &SO) { return SO; });
+ if (auto Err = ES.lookup(SearchOrder, DSOHandleSymbol).takeError())
+ return Err;
+
+ Lock.lock();
+ I = InitSeqs.find(&JD);
+ assert(I != InitSeqs.end() &&
+ "Entry missing after header symbol lookup?");
+ }
+ InitSeq = &I->second;
+ }
+
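+  // Record the address range of each init section under its section name.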
+ for (auto *Sec : InitSections) {
+ // FIXME: Avoid copy here.
+ jitlink::SectionRange R(*Sec);
+ InitSeq->InitSections[Sec->getName()].push_back(
+ {ExecutorAddr(R.getStart()), ExecutorAddr(R.getEnd())});
+ }
+
+ return Error::success();
+}
+
+Error ELFNixPlatform::registerPerObjectSections(
+ const ELFPerObjectSectionsToRegister &POSR) {
+
+ if (!orc_rt_elfnix_register_object_sections)
+ return make_error<StringError>("Attempting to register per-object "
+ "sections, but runtime support has not "
+ "been loaded yet",
+ inconvertibleErrorCode());
+
+ Error ErrResult = Error::success();
+ if (auto Err = ES.callSPSWrapper<shared::SPSError(
+ SPSELFPerObjectSectionsToRegister)>(
+ orc_rt_elfnix_register_object_sections, ErrResult, POSR))
+ return Err;
+ return ErrResult;
+}
+
+Expected<uint64_t> ELFNixPlatform::createPThreadKey() {
+ if (!orc_rt_elfnix_create_pthread_key)
+ return make_error<StringError>(
+ "Attempting to create pthread key in target, but runtime support has "
+ "not been loaded yet",
+ inconvertibleErrorCode());
+
+ Expected<uint64_t> Result(0);
+ if (auto Err = ES.callSPSWrapper<SPSExpected<uint64_t>(void)>(
+ orc_rt_elfnix_create_pthread_key, Result))
+ return std::move(Err);
+ return Result;
+}
+
+void ELFNixPlatform::ELFNixPlatformPlugin::modifyPassConfig(
+ MaterializationResponsibility &MR, jitlink::LinkGraph &LG,
+ jitlink::PassConfiguration &Config) {
+
+ // If the initializer symbol is the __dso_handle symbol then just add
+ // the DSO handle support passes.
+ if (MR.getInitializerSymbol() == MP.DSOHandleSymbol) {
+ addDSOHandleSupportPasses(MR, Config);
+ // The DSOHandle materialization unit doesn't require any other
+ // support, so we can bail out early.
+ return;
+ }
+
+ // If the object contains initializers then add passes to record them.
+ if (MR.getInitializerSymbol())
+ addInitializerSupportPasses(MR, Config);
+
+ // Add passes for eh-frame and TLV support.
+ addEHAndTLVSupportPasses(MR, Config);
+}
+
+ObjectLinkingLayer::Plugin::SyntheticSymbolDependenciesMap
+ELFNixPlatform::ELFNixPlatformPlugin::getSyntheticSymbolDependencies(
+ MaterializationResponsibility &MR) {
+ std::lock_guard<std::mutex> Lock(PluginMutex);
+ auto I = InitSymbolDeps.find(&MR);
+ if (I != InitSymbolDeps.end()) {
+ SyntheticSymbolDependenciesMap Result;
+ Result[MR.getInitializerSymbol()] = std::move(I->second);
+ InitSymbolDeps.erase(&MR);
+ return Result;
+ }
+ return SyntheticSymbolDependenciesMap();
+}
+
+void ELFNixPlatform::ELFNixPlatformPlugin::addInitializerSupportPasses(
+ MaterializationResponsibility &MR, jitlink::PassConfiguration &Config) {
+
+  // Preserve init sections.
+ Config.PrePrunePasses.push_back([this, &MR](jitlink::LinkGraph &G) -> Error {
+ if (auto Err = preserveInitSections(G, MR))
+ return Err;
+ return Error::success();
+ });
+
+ Config.PostFixupPasses.push_back(
+ [this, &JD = MR.getTargetJITDylib()](jitlink::LinkGraph &G) {
+ return registerInitSections(G, JD);
+ });
+}
+
+void ELFNixPlatform::ELFNixPlatformPlugin::addDSOHandleSupportPasses(
+ MaterializationResponsibility &MR, jitlink::PassConfiguration &Config) {
+
+ Config.PostAllocationPasses.push_back([this, &JD = MR.getTargetJITDylib()](
+ jitlink::LinkGraph &G) -> Error {
+ auto I = llvm::find_if(G.defined_symbols(), [this](jitlink::Symbol *Sym) {
+ return Sym->getName() == *MP.DSOHandleSymbol;
+ });
+ assert(I != G.defined_symbols().end() && "Missing DSO handle symbol");
+ {
+ std::lock_guard<std::mutex> Lock(MP.PlatformMutex);
+ JITTargetAddress HandleAddr = (*I)->getAddress();
+ MP.HandleAddrToJITDylib[HandleAddr] = &JD;
+ assert(!MP.InitSeqs.count(&JD) && "InitSeq entry for JD already exists");
+ MP.InitSeqs.insert(std::make_pair(
+ &JD,
+ ELFNixJITDylibInitializers(JD.getName(), ExecutorAddr(HandleAddr))));
+ }
+ return Error::success();
+ });
+}
+
+void ELFNixPlatform::ELFNixPlatformPlugin::addEHAndTLVSupportPasses(
+ MaterializationResponsibility &MR, jitlink::PassConfiguration &Config) {
+
+ // Insert TLV lowering at the start of the PostPrunePasses, since we want
+ // it to run before GOT/PLT lowering.
+
+  // TODO: Check that the GOT/PLT build pass has run before the
+  // fixTLVSectionsAndEdges pass, since the TLS descriptor needs to be
+  // allocated in the GOT.
+ Config.PostPrunePasses.push_back(
+ [this, &JD = MR.getTargetJITDylib()](jitlink::LinkGraph &G) {
+ return fixTLVSectionsAndEdges(G, JD);
+ });
+
+ // Add a pass to register the final addresses of the eh-frame and TLV sections
+ // with the runtime.
+ Config.PostFixupPasses.push_back([this](jitlink::LinkGraph &G) -> Error {
+ ELFPerObjectSectionsToRegister POSR;
+
+ if (auto *EHFrameSection = G.findSectionByName(EHFrameSectionName)) {
+ jitlink::SectionRange R(*EHFrameSection);
+ if (!R.empty())
+ POSR.EHFrameSection = {ExecutorAddr(R.getStart()),
+ ExecutorAddr(R.getEnd())};
+ }
+
+ // Get a pointer to the thread data section if there is one. It will be used
+ // below.
+ jitlink::Section *ThreadDataSection =
+ G.findSectionByName(ThreadDataSectionName);
+
+ // Handle thread BSS section if there is one.
+ if (auto *ThreadBSSSection = G.findSectionByName(ThreadBSSSectionName)) {
+ // If there's already a thread data section in this graph then merge the
+ // thread BSS section content into it, otherwise just treat the thread
+ // BSS section as the thread data section.
+ if (ThreadDataSection)
+ G.mergeSections(*ThreadDataSection, *ThreadBSSSection);
+ else
+ ThreadDataSection = ThreadBSSSection;
+ }
+
+ // Having merged thread BSS (if present) and thread data (if present),
+ // record the resulting section range.
+ if (ThreadDataSection) {
+ jitlink::SectionRange R(*ThreadDataSection);
+ if (!R.empty())
+ POSR.ThreadDataSection = {ExecutorAddr(R.getStart()),
+ ExecutorAddr(R.getEnd())};
+ }
+
+ if (POSR.EHFrameSection.Start || POSR.ThreadDataSection.Start) {
+
+ // If we're still bootstrapping the runtime then just record this
+ // frame for now.
+ if (!MP.RuntimeBootstrapped) {
+ std::lock_guard<std::mutex> Lock(MP.PlatformMutex);
+ MP.BootstrapPOSRs.push_back(POSR);
+ return Error::success();
+ }
+
+ // Otherwise register it immediately.
+ if (auto Err = MP.registerPerObjectSections(POSR))
+ return Err;
+ }
+
+ return Error::success();
+ });
+}
+
+Error ELFNixPlatform::ELFNixPlatformPlugin::preserveInitSections(
+ jitlink::LinkGraph &G, MaterializationResponsibility &MR) {
+
+ JITLinkSymbolSet InitSectionSymbols;
+ for (auto &InitSectionName : InitSectionNames) {
+ // Skip non-init sections.
+ auto *InitSection = G.findSectionByName(InitSectionName);
+ if (!InitSection)
+ continue;
+
+ // Make a pass over live symbols in the section: those blocks are already
+ // preserved.
+ DenseSet<jitlink::Block *> AlreadyLiveBlocks;
+ for (auto &Sym : InitSection->symbols()) {
+ auto &B = Sym->getBlock();
+ if (Sym->isLive() && Sym->getOffset() == 0 &&
+ Sym->getSize() == B.getSize() && !AlreadyLiveBlocks.count(&B)) {
+ InitSectionSymbols.insert(Sym);
+ AlreadyLiveBlocks.insert(&B);
+ }
+ }
+
+ // Add anonymous symbols to preserve any not-already-preserved blocks.
+ for (auto *B : InitSection->blocks())
+ if (!AlreadyLiveBlocks.count(B))
+ InitSectionSymbols.insert(
+ &G.addAnonymousSymbol(*B, 0, B->getSize(), false, true));
+ }
+
+ if (!InitSectionSymbols.empty()) {
+ std::lock_guard<std::mutex> Lock(PluginMutex);
+ InitSymbolDeps[&MR] = std::move(InitSectionSymbols);
+ }
+
+ return Error::success();
+}
+
+Error ELFNixPlatform::ELFNixPlatformPlugin::registerInitSections(
+ jitlink::LinkGraph &G, JITDylib &JD) {
+
+ SmallVector<jitlink::Section *> InitSections;
+
+ LLVM_DEBUG({ dbgs() << "ELFNixPlatform::registerInitSections\n"; });
+
+ for (auto InitSectionName : InitSectionNames) {
+ if (auto *Sec = G.findSectionByName(InitSectionName)) {
+ InitSections.push_back(Sec);
+ }
+ }
+
+ // Dump the scraped inits.
+ LLVM_DEBUG({
+ dbgs() << "ELFNixPlatform: Scraped " << G.getName() << " init sections:\n";
+ for (auto *Sec : InitSections) {
+ jitlink::SectionRange R(*Sec);
+ dbgs() << " " << Sec->getName() << ": "
+ << formatv("[ {0:x} -- {1:x} ]", R.getStart(), R.getEnd()) << "\n";
+ }
+ });
+
+ return MP.registerInitInfo(JD, InitSections);
+}
+
+Error ELFNixPlatform::ELFNixPlatformPlugin::fixTLVSectionsAndEdges(
+ jitlink::LinkGraph &G, JITDylib &JD) {
+
+  // TODO: Implement full TLV support. For now, redirect __tls_get_addr calls
+  // to the ORC runtime's implementation.
+ for (auto *Sym : G.external_symbols())
+ if (Sym->getName() == "__tls_get_addr") {
+ Sym->setName("___orc_rt_elfnix_tls_get_addr");
+ }
+
+ auto *TLSInfoEntrySection = G.findSectionByName("$__TLSINFO");
+
+ if (TLSInfoEntrySection) {
+ Optional<uint64_t> Key;
+ {
+ std::lock_guard<std::mutex> Lock(MP.PlatformMutex);
+ auto I = MP.JITDylibToPThreadKey.find(&JD);
+ if (I != MP.JITDylibToPThreadKey.end())
+ Key = I->second;
+ }
+ if (!Key) {
+ if (auto KeyOrErr = MP.createPThreadKey())
+ Key = *KeyOrErr;
+ else
+ return KeyOrErr.takeError();
+ }
+
+ uint64_t PlatformKeyBits =
+ support::endian::byte_swap(*Key, G.getEndianness());
+
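+    // Write the platform's pthread key into the first word of each TLS
+    // descriptor.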
+ for (auto *B : TLSInfoEntrySection->blocks()) {
+      // FIXME: The TLS descriptor byte length may differ between ISAs.
+      assert(B->getSize() == (G.getPointerSize() * 2) &&
+             "TLS descriptor must be two words long");
+ auto TLSInfoEntryContent = B->getMutableContent(G);
+ memcpy(TLSInfoEntryContent.data(), &PlatformKeyBits, G.getPointerSize());
+ }
+ }
+
+ return Error::success();
+}
+
+} // End namespace orc.
+} // End namespace llvm.
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp b/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp
index 5715eda71eee..f3fe0555fa75 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp
@@ -39,13 +39,13 @@ createJITLoaderGDBRegistrar(ExecutionSession &ES) {
assert((*Result)[0].size() == 1 &&
"Unexpected number of addresses in result");
- return std::make_unique<EPCDebugObjectRegistrar>(ES, (*Result)[0][0]);
+ return std::make_unique<EPCDebugObjectRegistrar>(
+ ES, ExecutorAddr((*Result)[0][0]));
}
-Error EPCDebugObjectRegistrar::registerDebugObject(sys::MemoryBlock TargetMem) {
- return ES.callSPSWrapper<void(SPSExecutorAddress, uint64_t)>(
- RegisterFn, ExecutorAddress::fromPtr(TargetMem.base()),
- static_cast<uint64_t>(TargetMem.allocatedSize()));
+Error EPCDebugObjectRegistrar::registerDebugObject(
+ ExecutorAddrRange TargetMem) {
+ return ES.callSPSWrapper<void(SPSExecutorAddrRange)>(RegisterFn, TargetMem);
}
} // namespace orc
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCEHFrameRegistrar.cpp b/llvm/lib/ExecutionEngine/Orc/EPCEHFrameRegistrar.cpp
index 8cdda9ab5a15..4c0fab8aa9fa 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCEHFrameRegistrar.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCEHFrameRegistrar.cpp
@@ -51,21 +51,22 @@ EPCEHFrameRegistrar::Create(ExecutionSession &ES) {
auto RegisterEHFrameWrapperFnAddr = (*Result)[0][0];
auto DeregisterEHFrameWrapperFnAddr = (*Result)[0][1];
- return std::make_unique<EPCEHFrameRegistrar>(ES, RegisterEHFrameWrapperFnAddr,
- DeregisterEHFrameWrapperFnAddr);
+ return std::make_unique<EPCEHFrameRegistrar>(
+ ES, ExecutorAddr(RegisterEHFrameWrapperFnAddr),
+ ExecutorAddr(DeregisterEHFrameWrapperFnAddr));
}
Error EPCEHFrameRegistrar::registerEHFrames(JITTargetAddress EHFrameSectionAddr,
size_t EHFrameSectionSize) {
- return ES.callSPSWrapper<void(SPSExecutorAddress, uint64_t)>(
- RegisterEHFrameWrapperFnAddr, EHFrameSectionAddr,
+ return ES.callSPSWrapper<void(SPSExecutorAddr, uint64_t)>(
+ RegisterEHFrameWrapperFnAddr, ExecutorAddr(EHFrameSectionAddr),
static_cast<uint64_t>(EHFrameSectionSize));
}
Error EPCEHFrameRegistrar::deregisterEHFrames(
JITTargetAddress EHFrameSectionAddr, size_t EHFrameSectionSize) {
- return ES.callSPSWrapper<void(SPSExecutorAddress, uint64_t)>(
- DeregisterEHFrameWrapperFnAddr, EHFrameSectionAddr,
+ return ES.callSPSWrapper<void(SPSExecutorAddr, uint64_t)>(
+ DeregisterEHFrameWrapperFnAddr, ExecutorAddr(EHFrameSectionAddr),
static_cast<uint64_t>(EHFrameSectionSize));
}
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericDylibManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericDylibManager.cpp
new file mode 100644
index 000000000000..6c47c5c5f7bb
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericDylibManager.cpp
@@ -0,0 +1,107 @@
+//===------- EPCGenericDylibManager.cpp -- Dylib management via EPC -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/EPCGenericDylibManager.h"
+
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+#include "llvm/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.h"
+
+namespace llvm {
+namespace orc {
+namespace shared {
+
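+// Serialize SymbolLookupSet elements as (name, is-required-symbol) pairs so
+// that lookup requests can be sent to the executor via SPS.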
+template <>
+class SPSSerializationTraits<SPSRemoteSymbolLookupSetElement,
+ SymbolLookupSet::value_type> {
+public:
+ static size_t size(const SymbolLookupSet::value_type &V) {
+ return SPSArgList<SPSString, bool>::size(
+ *V.first, V.second == SymbolLookupFlags::RequiredSymbol);
+ }
+
+ static bool serialize(SPSOutputBuffer &OB,
+ const SymbolLookupSet::value_type &V) {
+ return SPSArgList<SPSString, bool>::serialize(
+ OB, *V.first, V.second == SymbolLookupFlags::RequiredSymbol);
+ }
+};
+
+template <>
+class TrivialSPSSequenceSerialization<SPSRemoteSymbolLookupSetElement,
+ SymbolLookupSet> {
+public:
+ static constexpr bool available = true;
+};
+
+template <>
+class SPSSerializationTraits<SPSRemoteSymbolLookup,
+ ExecutorProcessControl::LookupRequest> {
+ using MemberSerialization =
+ SPSArgList<SPSExecutorAddr, SPSRemoteSymbolLookupSet>;
+
+public:
+ static size_t size(const ExecutorProcessControl::LookupRequest &LR) {
+ return MemberSerialization::size(ExecutorAddr(LR.Handle), LR.Symbols);
+ }
+
+ static bool serialize(SPSOutputBuffer &OB,
+ const ExecutorProcessControl::LookupRequest &LR) {
+ return MemberSerialization::serialize(OB, ExecutorAddr(LR.Handle),
+ LR.Symbols);
+ }
+};
+
+} // end namespace shared
+
+Expected<EPCGenericDylibManager>
+EPCGenericDylibManager::CreateWithDefaultBootstrapSymbols(
+ ExecutorProcessControl &EPC) {
+ SymbolAddrs SAs;
+ if (auto Err = EPC.getBootstrapSymbols(
+ {{SAs.Instance, rt::SimpleExecutorDylibManagerInstanceName},
+ {SAs.Open, rt::SimpleExecutorDylibManagerOpenWrapperName},
+ {SAs.Lookup, rt::SimpleExecutorDylibManagerLookupWrapperName}}))
+ return std::move(Err);
+ return EPCGenericDylibManager(EPC, std::move(SAs));
+}
+
+Expected<tpctypes::DylibHandle> EPCGenericDylibManager::open(StringRef Path,
+ uint64_t Mode) {
+ Expected<tpctypes::DylibHandle> H(0);
+ if (auto Err =
+ EPC.callSPSWrapper<rt::SPSSimpleExecutorDylibManagerOpenSignature>(
+ SAs.Open, H, SAs.Instance, Path, Mode))
+ return std::move(Err);
+ return H;
+}
+
+Expected<std::vector<ExecutorAddr>>
+EPCGenericDylibManager::lookup(tpctypes::DylibHandle H,
+ const SymbolLookupSet &Lookup) {
+ Expected<std::vector<ExecutorAddr>> Result((std::vector<ExecutorAddr>()));
+ if (auto Err =
+ EPC.callSPSWrapper<rt::SPSSimpleExecutorDylibManagerLookupSignature>(
+ SAs.Lookup, Result, SAs.Instance, H, Lookup))
+ return std::move(Err);
+ return Result;
+}
+
+Expected<std::vector<ExecutorAddr>>
+EPCGenericDylibManager::lookup(tpctypes::DylibHandle H,
+ const RemoteSymbolLookupSet &Lookup) {
+ Expected<std::vector<ExecutorAddr>> Result((std::vector<ExecutorAddr>()));
+ if (auto Err =
+ EPC.callSPSWrapper<rt::SPSSimpleExecutorDylibManagerLookupSignature>(
+ SAs.Lookup, Result, SAs.Instance, H, Lookup))
+ return std::move(Err);
+ return Result;
+}
+
+} // end namespace orc
+} // end namespace llvm
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp
new file mode 100644
index 000000000000..9b712cb8f7ca
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp
@@ -0,0 +1,184 @@
+//===---- EPCGenericJITLinkMemoryManager.cpp -- Mem management via EPC ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.h"
+
+#include "llvm/ExecutionEngine/JITLink/JITLink.h"
+#include "llvm/ExecutionEngine/Orc/LookupAndRecordAddrs.h"
+#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+
+#include <limits>
+
+using namespace llvm::jitlink;
+
+namespace llvm {
+namespace orc {
+
+class EPCGenericJITLinkMemoryManager::InFlightAlloc
+ : public jitlink::JITLinkMemoryManager::InFlightAlloc {
+public:
+
+ // FIXME: The C++98 initializer is an attempt to work around compile failures
+ // due to http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1397.
+ // We should be able to switch this back to member initialization once that
+ // issue is fixed.
+ struct SegInfo {
+ SegInfo() : WorkingMem(nullptr), ContentSize(0), ZeroFillSize(0) {}
+
+ char *WorkingMem;
+ ExecutorAddr Addr;
+ uint64_t ContentSize;
+ uint64_t ZeroFillSize;
+ };
+
+ using SegInfoMap = AllocGroupSmallMap<SegInfo>;
+
+ InFlightAlloc(EPCGenericJITLinkMemoryManager &Parent, LinkGraph &G,
+ ExecutorAddr AllocAddr, SegInfoMap Segs)
+ : Parent(Parent), G(G), AllocAddr(AllocAddr), Segs(std::move(Segs)) {}
+
+ void finalize(OnFinalizedFunction OnFinalize) override {
+ tpctypes::FinalizeRequest FR;
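+    // Describe each segment for the executor: protections, target address,
+    // total size (content plus zero-fill, rounded up to the page size), and
+    // the working memory holding its content.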
+ for (auto &KV : Segs) {
+ assert(KV.second.ContentSize <= std::numeric_limits<size_t>::max());
+ FR.Segments.push_back(tpctypes::SegFinalizeRequest{
+ tpctypes::toWireProtectionFlags(
+ toSysMemoryProtectionFlags(KV.first.getMemProt())),
+ KV.second.Addr,
+ alignTo(KV.second.ContentSize + KV.second.ZeroFillSize,
+ Parent.EPC.getPageSize()),
+ {KV.second.WorkingMem, static_cast<size_t>(KV.second.ContentSize)}});
+ }
+
+ // Transfer allocation actions.
+ // FIXME: Merge JITLink and ORC SupportFunctionCall and Action list types,
+ // turn this into a std::swap.
+ FR.Actions.reserve(G.allocActions().size());
+ for (auto &ActPair : G.allocActions())
+ FR.Actions.push_back({{ExecutorAddr(ActPair.Finalize.FnAddr),
+ {ExecutorAddr(ActPair.Finalize.CtxAddr),
+ ExecutorAddrDiff(ActPair.Finalize.CtxSize)}},
+ {ExecutorAddr(ActPair.Dealloc.FnAddr),
+ {ExecutorAddr(ActPair.Dealloc.CtxAddr),
+ ExecutorAddrDiff(ActPair.Dealloc.CtxSize)}}});
+ G.allocActions().clear();
+
+ Parent.EPC.callSPSWrapperAsync<
+ rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>(
+ Parent.SAs.Finalize,
+ [OnFinalize = std::move(OnFinalize), AllocAddr = this->AllocAddr](
+ Error SerializationErr, Error FinalizeErr) mutable {
+ // FIXME: Release abandoned alloc.
+ if (SerializationErr) {
+ cantFail(std::move(FinalizeErr));
+ OnFinalize(std::move(SerializationErr));
+ } else if (FinalizeErr)
+ OnFinalize(std::move(FinalizeErr));
+ else
+ OnFinalize(FinalizedAlloc(AllocAddr.getValue()));
+ },
+ Parent.SAs.Allocator, std::move(FR));
+ }
+
+ void abandon(OnAbandonedFunction OnAbandoned) override {
+ // FIXME: Return memory to pool instead.
+ Parent.EPC.callSPSWrapperAsync<
+ rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>(
+ Parent.SAs.Deallocate,
+ [OnAbandoned = std::move(OnAbandoned)](Error SerializationErr,
+ Error DeallocateErr) mutable {
+ if (SerializationErr) {
+ cantFail(std::move(DeallocateErr));
+ OnAbandoned(std::move(SerializationErr));
+ } else
+ OnAbandoned(std::move(DeallocateErr));
+ },
+ Parent.SAs.Allocator, ArrayRef<ExecutorAddr>(AllocAddr));
+ }
+
+private:
+ EPCGenericJITLinkMemoryManager &Parent;
+ LinkGraph &G;
+ ExecutorAddr AllocAddr;
+ SegInfoMap Segs;
+};
+
+void EPCGenericJITLinkMemoryManager::allocate(const JITLinkDylib *JD,
+ LinkGraph &G,
+ OnAllocatedFunction OnAllocated) {
+ BasicLayout BL(G);
+
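+  // Reserve a contiguous block of executor memory large enough for the whole
+  // graph, then complete the allocation once the reservation address is known.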
+ auto Pages = BL.getContiguousPageBasedLayoutSizes(EPC.getPageSize());
+ if (!Pages)
+ return OnAllocated(Pages.takeError());
+
+ EPC.callSPSWrapperAsync<rt::SPSSimpleExecutorMemoryManagerReserveSignature>(
+ SAs.Reserve,
+ [this, BL = std::move(BL), OnAllocated = std::move(OnAllocated)](
+ Error SerializationErr, Expected<ExecutorAddr> AllocAddr) mutable {
+ if (SerializationErr) {
+ cantFail(AllocAddr.takeError());
+ return OnAllocated(std::move(SerializationErr));
+ }
+ if (!AllocAddr)
+ return OnAllocated(AllocAddr.takeError());
+
+ completeAllocation(*AllocAddr, std::move(BL), std::move(OnAllocated));
+ },
+ SAs.Allocator, Pages->total());
+}
+
+void EPCGenericJITLinkMemoryManager::deallocate(
+ std::vector<FinalizedAlloc> Allocs, OnDeallocatedFunction OnDeallocated) {
+ EPC.callSPSWrapperAsync<
+ rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>(
+ SAs.Deallocate,
+ [OnDeallocated = std::move(OnDeallocated)](Error SerErr,
+ Error DeallocErr) mutable {
+ if (SerErr) {
+ cantFail(std::move(DeallocErr));
+ OnDeallocated(std::move(SerErr));
+ } else
+ OnDeallocated(std::move(DeallocErr));
+ },
+ SAs.Allocator, Allocs);
+ for (auto &A : Allocs)
+ A.release();
+}
+
+void EPCGenericJITLinkMemoryManager::completeAllocation(
+ ExecutorAddr AllocAddr, BasicLayout BL, OnAllocatedFunction OnAllocated) {
+
+ InFlightAlloc::SegInfoMap SegInfos;
+
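+  // Lay the segments out contiguously within the reserved range, rounding
+  // each segment up to a multiple of the executor's page size.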
+ ExecutorAddr NextSegAddr = AllocAddr;
+ for (auto &KV : BL.segments()) {
+ const auto &AG = KV.first;
+ auto &Seg = KV.second;
+
+ Seg.Addr = NextSegAddr.getValue();
+ KV.second.WorkingMem = BL.getGraph().allocateBuffer(Seg.ContentSize).data();
+ NextSegAddr += ExecutorAddrDiff(
+ alignTo(Seg.ContentSize + Seg.ZeroFillSize, EPC.getPageSize()));
+
+ auto &SegInfo = SegInfos[AG];
+ SegInfo.ContentSize = Seg.ContentSize;
+ SegInfo.ZeroFillSize = Seg.ZeroFillSize;
+ SegInfo.Addr = ExecutorAddr(Seg.Addr);
+ SegInfo.WorkingMem = Seg.WorkingMem;
+ }
+
+ if (auto Err = BL.apply())
+ return OnAllocated(std::move(Err));
+
+ OnAllocated(std::make_unique<InFlightAlloc>(*this, BL.getGraph(), AllocAddr,
+ std::move(SegInfos)));
+}
+
+} // end namespace orc
+} // end namespace llvm
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp
new file mode 100644
index 000000000000..1d98e104a4d7
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp
@@ -0,0 +1,317 @@
+//===----- EPCGenericRTDyldMemoryManager.cpp - EPC-based MemMgr -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h"
+#include "llvm/ExecutionEngine/Orc/EPCGenericMemoryAccess.h"
+#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/FormatVariadic.h"
+
+#define DEBUG_TYPE "orc"
+
+namespace llvm {
+namespace orc {
+
+Expected<std::unique_ptr<EPCGenericRTDyldMemoryManager>>
+EPCGenericRTDyldMemoryManager::CreateWithDefaultBootstrapSymbols(
+ ExecutorProcessControl &EPC) {
+ SymbolAddrs SAs;
+ if (auto Err = EPC.getBootstrapSymbols(
+ {{SAs.Instance, rt::SimpleExecutorMemoryManagerInstanceName},
+ {SAs.Reserve, rt::SimpleExecutorMemoryManagerReserveWrapperName},
+ {SAs.Finalize, rt::SimpleExecutorMemoryManagerFinalizeWrapperName},
+ {SAs.Deallocate,
+ rt::SimpleExecutorMemoryManagerDeallocateWrapperName},
+ {SAs.RegisterEHFrame,
+ rt::RegisterEHFrameSectionCustomDirectWrapperName},
+ {SAs.DeregisterEHFrame,
+ rt::DeregisterEHFrameSectionCustomDirectWrapperName}}))
+ return std::move(Err);
+ return std::make_unique<EPCGenericRTDyldMemoryManager>(EPC, std::move(SAs));
+}
+
+EPCGenericRTDyldMemoryManager::EPCGenericRTDyldMemoryManager(
+ ExecutorProcessControl &EPC, SymbolAddrs SAs)
+ : EPC(EPC), SAs(std::move(SAs)) {
+ LLVM_DEBUG(dbgs() << "Created remote allocator " << (void *)this << "\n");
+}
+
+EPCGenericRTDyldMemoryManager::~EPCGenericRTDyldMemoryManager() {
+ LLVM_DEBUG(dbgs() << "Destroyed remote allocator " << (void *)this << "\n");
+ if (!ErrMsg.empty())
+ errs() << "Destroying with existing errors:\n" << ErrMsg << "\n";
+
+ Error Err = Error::success();
+ if (auto Err2 = EPC.callSPSWrapper<
+ rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>(
+          SAs.Deallocate, Err, SAs.Instance, FinalizedAllocs)) {
+ // FIXME: Report errors through EPC once that functionality is available.
+ logAllUnhandledErrors(std::move(Err2), errs(), "");
+ return;
+ }
+
+ if (Err)
+ logAllUnhandledErrors(std::move(Err), errs(), "");
+}
+
+uint8_t *EPCGenericRTDyldMemoryManager::allocateCodeSection(
+ uintptr_t Size, unsigned Alignment, unsigned SectionID,
+ StringRef SectionName) {
+ std::lock_guard<std::mutex> Lock(M);
+ LLVM_DEBUG({
+ dbgs() << "Allocator " << (void *)this << " allocating code section "
+ << SectionName << ": size = " << formatv("{0:x}", Size)
+ << " bytes, alignment = " << Alignment << "\n";
+ });
+ auto &Seg = Unmapped.back().CodeAllocs;
+ Seg.emplace_back(Size, Alignment);
+ return reinterpret_cast<uint8_t *>(
+ alignAddr(Seg.back().Contents.get(), Align(Alignment)));
+}
+
+uint8_t *EPCGenericRTDyldMemoryManager::allocateDataSection(
+ uintptr_t Size, unsigned Alignment, unsigned SectionID,
+ StringRef SectionName, bool IsReadOnly) {
+ std::lock_guard<std::mutex> Lock(M);
+ LLVM_DEBUG({
+ dbgs() << "Allocator " << (void *)this << " allocating "
+ << (IsReadOnly ? "ro" : "rw") << "-data section " << SectionName
+ << ": size = " << formatv("{0:x}", Size) << " bytes, alignment "
+ << Alignment << ")\n";
+ });
+
+ auto &Seg =
+ IsReadOnly ? Unmapped.back().RODataAllocs : Unmapped.back().RWDataAllocs;
+
+ Seg.emplace_back(Size, Alignment);
+ return reinterpret_cast<uint8_t *>(
+ alignAddr(Seg.back().Contents.get(), Align(Alignment)));
+}
+
+void EPCGenericRTDyldMemoryManager::reserveAllocationSpace(
+ uintptr_t CodeSize, uint32_t CodeAlign, uintptr_t RODataSize,
+ uint32_t RODataAlign, uintptr_t RWDataSize, uint32_t RWDataAlign) {
+
+ {
+ std::lock_guard<std::mutex> Lock(M);
+ // If there's already an error then bail out.
+ if (!ErrMsg.empty())
+ return;
+
+ if (!isPowerOf2_32(CodeAlign) || CodeAlign > EPC.getPageSize()) {
+ ErrMsg = "Invalid code alignment in reserveAllocationSpace";
+ return;
+ }
+ if (!isPowerOf2_32(RODataAlign) || RODataAlign > EPC.getPageSize()) {
+ ErrMsg = "Invalid ro-data alignment in reserveAllocationSpace";
+ return;
+ }
+ if (!isPowerOf2_32(RWDataAlign) || RWDataAlign > EPC.getPageSize()) {
+ ErrMsg = "Invalid rw-data alignment in reserveAllocationSpace";
+ return;
+ }
+ }
+
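+  // Reserve a single contiguous block in the executor, large enough to hold
+  // the code, ro-data, and rw-data segments rounded up to page boundaries.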
+ uint64_t TotalSize = 0;
+ TotalSize += alignTo(CodeSize, EPC.getPageSize());
+ TotalSize += alignTo(RODataSize, EPC.getPageSize());
+ TotalSize += alignTo(RWDataSize, EPC.getPageSize());
+
+ LLVM_DEBUG({
+ dbgs() << "Allocator " << (void *)this << " reserving "
+ << formatv("{0:x}", TotalSize) << " bytes.\n";
+ });
+
+ Expected<ExecutorAddr> TargetAllocAddr((ExecutorAddr()));
+ if (auto Err = EPC.callSPSWrapper<
+ rt::SPSSimpleExecutorMemoryManagerReserveSignature>(
+ SAs.Reserve, TargetAllocAddr, SAs.Instance, TotalSize)) {
+ std::lock_guard<std::mutex> Lock(M);
+ ErrMsg = toString(std::move(Err));
+ return;
+ }
+ if (!TargetAllocAddr) {
+ std::lock_guard<std::mutex> Lock(M);
+ ErrMsg = toString(TargetAllocAddr.takeError());
+ return;
+ }
+
+ std::lock_guard<std::mutex> Lock(M);
+ Unmapped.push_back(AllocGroup());
+ Unmapped.back().RemoteCode = {
+ *TargetAllocAddr, ExecutorAddrDiff(alignTo(CodeSize, EPC.getPageSize()))};
+ Unmapped.back().RemoteROData = {
+ Unmapped.back().RemoteCode.End,
+ ExecutorAddrDiff(alignTo(RODataSize, EPC.getPageSize()))};
+ Unmapped.back().RemoteRWData = {
+ Unmapped.back().RemoteROData.End,
+ ExecutorAddrDiff(alignTo(RWDataSize, EPC.getPageSize()))};
+}
+
+bool EPCGenericRTDyldMemoryManager::needsToReserveAllocationSpace() {
+ return true;
+}
+
+void EPCGenericRTDyldMemoryManager::registerEHFrames(uint8_t *Addr,
+ uint64_t LoadAddr,
+ size_t Size) {
+ LLVM_DEBUG({
+ dbgs() << "Allocator " << (void *)this << " added unfinalized eh-frame "
+ << formatv("[ {0:x} {1:x} ]", LoadAddr, LoadAddr + Size) << "\n";
+ });
+ std::lock_guard<std::mutex> Lock(M);
+ // Bail out early if there's already an error.
+ if (!ErrMsg.empty())
+ return;
+
+ ExecutorAddr LA(LoadAddr);
+ for (auto &Alloc : llvm::reverse(Unfinalized)) {
+ if (Alloc.RemoteCode.contains(LA) || Alloc.RemoteROData.contains(LA) ||
+ Alloc.RemoteRWData.contains(LA)) {
+ Alloc.UnfinalizedEHFrames.push_back({LA, Size});
+ return;
+ }
+ }
+ ErrMsg = "eh-frame does not lie inside unfinalized alloc";
+}
+
+void EPCGenericRTDyldMemoryManager::deregisterEHFrames() {
+ // This is a no-op for us: We've registered a deallocation action for it.
+}
+
+void EPCGenericRTDyldMemoryManager::notifyObjectLoaded(
+ RuntimeDyld &Dyld, const object::ObjectFile &Obj) {
+ std::lock_guard<std::mutex> Lock(M);
+ LLVM_DEBUG(dbgs() << "Allocator " << (void *)this << " applied mappings:\n");
+ for (auto &ObjAllocs : Unmapped) {
+ mapAllocsToRemoteAddrs(Dyld, ObjAllocs.CodeAllocs,
+ ObjAllocs.RemoteCode.Start);
+ mapAllocsToRemoteAddrs(Dyld, ObjAllocs.RODataAllocs,
+ ObjAllocs.RemoteROData.Start);
+ mapAllocsToRemoteAddrs(Dyld, ObjAllocs.RWDataAllocs,
+ ObjAllocs.RemoteRWData.Start);
+ Unfinalized.push_back(std::move(ObjAllocs));
+ }
+ Unmapped.clear();
+}
+
+bool EPCGenericRTDyldMemoryManager::finalizeMemory(std::string *ErrMsg) {
+ LLVM_DEBUG(dbgs() << "Allocator " << (void *)this << " finalizing:\n");
+
+ // If there's an error then bail out here.
+ std::vector<AllocGroup> Allocs;
+ {
+ std::lock_guard<std::mutex> Lock(M);
+ if (ErrMsg && !this->ErrMsg.empty()) {
+ *ErrMsg = std::move(this->ErrMsg);
+ return true;
+ }
+ std::swap(Allocs, Unfinalized);
+ }
+
+ // Loop over unfinalized objects to make finalization requests.
+ for (auto &ObjAllocs : Allocs) {
+
+ tpctypes::WireProtectionFlags SegProts[3] = {
+ tpctypes::toWireProtectionFlags(
+ static_cast<sys::Memory::ProtectionFlags>(sys::Memory::MF_READ |
+ sys::Memory::MF_EXEC)),
+ tpctypes::toWireProtectionFlags(sys::Memory::MF_READ),
+ tpctypes::toWireProtectionFlags(
+ static_cast<sys::Memory::ProtectionFlags>(sys::Memory::MF_READ |
+ sys::Memory::MF_WRITE))};
+
+ ExecutorAddrRange *RemoteAddrs[3] = {&ObjAllocs.RemoteCode,
+ &ObjAllocs.RemoteROData,
+ &ObjAllocs.RemoteRWData};
+
+ std::vector<Alloc> *SegSections[3] = {&ObjAllocs.CodeAllocs,
+ &ObjAllocs.RODataAllocs,
+ &ObjAllocs.RWDataAllocs};
+
+ tpctypes::FinalizeRequest FR;
+ std::unique_ptr<char[]> AggregateContents[3];
+
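+    // For each segment kind (code, ro-data, rw-data), compute the aligned
+    // total size, then copy the individual section allocations into a single
+    // aggregate buffer to send with the finalize request.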
+ for (unsigned I = 0; I != 3; ++I) {
+ FR.Segments.push_back({});
+ auto &Seg = FR.Segments.back();
+ Seg.Prot = SegProts[I];
+ Seg.Addr = RemoteAddrs[I]->Start;
+ for (auto &SecAlloc : *SegSections[I]) {
+ Seg.Size = alignTo(Seg.Size, SecAlloc.Align);
+ Seg.Size += SecAlloc.Size;
+ }
+ AggregateContents[I] = std::make_unique<char[]>(Seg.Size);
+ size_t SecOffset = 0;
+ for (auto &SecAlloc : *SegSections[I]) {
+ SecOffset = alignTo(SecOffset, SecAlloc.Align);
+ memcpy(&AggregateContents[I][SecOffset],
+ reinterpret_cast<const char *>(
+ alignAddr(SecAlloc.Contents.get(), Align(SecAlloc.Align))),
+ SecAlloc.Size);
+ SecOffset += SecAlloc.Size;
+ // FIXME: Can we reset SecAlloc.Content here, now that it's copied into
+ // the aggregated content?
+ }
+ Seg.Content = {AggregateContents[I].get(), SecOffset};
+ }
+
+ for (auto &Frame : ObjAllocs.UnfinalizedEHFrames)
+ FR.Actions.push_back(
+ {{SAs.RegisterEHFrame,
+ {ExecutorAddr(Frame.Addr), ExecutorAddrDiff(Frame.Size)}},
+ {SAs.DeregisterEHFrame,
+ {ExecutorAddr(Frame.Addr), ExecutorAddrDiff(Frame.Size)}}});
+
+ // We'll also need to make an extra allocation for the eh-frame wrapper call
+ // arguments.
+ Error FinalizeErr = Error::success();
+ if (auto Err = EPC.callSPSWrapper<
+ rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>(
+ SAs.Finalize, FinalizeErr, SAs.Instance, std::move(FR))) {
+ std::lock_guard<std::mutex> Lock(M);
+ this->ErrMsg = toString(std::move(Err));
+ dbgs() << "Serialization error: " << this->ErrMsg << "\n";
+ if (ErrMsg)
+ *ErrMsg = this->ErrMsg;
+ return true;
+ }
+ if (FinalizeErr) {
+ std::lock_guard<std::mutex> Lock(M);
+ this->ErrMsg = toString(std::move(FinalizeErr));
+ dbgs() << "Finalization error: " << this->ErrMsg << "\n";
+ if (ErrMsg)
+ *ErrMsg = this->ErrMsg;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void EPCGenericRTDyldMemoryManager::mapAllocsToRemoteAddrs(
+ RuntimeDyld &Dyld, std::vector<Alloc> &Allocs, ExecutorAddr NextAddr) {
+ for (auto &Alloc : Allocs) {
+ NextAddr.setValue(alignTo(NextAddr.getValue(), Alloc.Align));
+ LLVM_DEBUG({
+ dbgs() << " " << static_cast<void *>(Alloc.Contents.get()) << " -> "
+ << format("0x%016" PRIx64, NextAddr.getValue()) << "\n";
+ });
+ Dyld.mapSectionAddress(reinterpret_cast<const void *>(alignAddr(
+ Alloc.Contents.get(), Align(Alloc.Align))),
+ NextAddr.getValue());
+ Alloc.RemoteAddr = NextAddr;
+ // Only advance NextAddr if it was non-null to begin with,
+ // otherwise leave it as null.
+ if (NextAddr)
+ NextAddr += ExecutorAddrDiff(Alloc.Size);
+ }
+}
+
+} // end namespace orc
+} // end namespace llvm
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp
index b9c70b0aeb3c..818b6b52ff83 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp
@@ -43,12 +43,12 @@ public:
protected:
Error grow() override;
- using Allocation = jitlink::JITLinkMemoryManager::Allocation;
+ using FinalizedAlloc = jitlink::JITLinkMemoryManager::FinalizedAlloc;
EPCIndirectionUtils &EPCIU;
unsigned TrampolineSize = 0;
unsigned TrampolinesPerPage = 0;
- std::vector<std::unique_ptr<Allocation>> TrampolineBlocks;
+ std::vector<FinalizedAlloc> TrampolineBlocks;
};
class EPCIndirectStubsManager : public IndirectStubsManager,
@@ -89,12 +89,19 @@ EPCTrampolinePool::EPCTrampolinePool(EPCIndirectionUtils &EPCIU)
Error EPCTrampolinePool::deallocatePool() {
Error Err = Error::success();
- for (auto &Alloc : TrampolineBlocks)
- Err = joinErrors(std::move(Err), Alloc->deallocate());
- return Err;
+ std::promise<MSVCPError> DeallocResultP;
+ auto DeallocResultF = DeallocResultP.get_future();
+
+ EPCIU.getExecutorProcessControl().getMemMgr().deallocate(
+ std::move(TrampolineBlocks),
+ [&](Error Err) { DeallocResultP.set_value(std::move(Err)); });
+
+ return DeallocResultF.get();
}
Error EPCTrampolinePool::grow() {
+ using namespace jitlink;
+
assert(AvailableTrampolines.empty() &&
"Grow called with trampolines still available");
@@ -102,34 +109,26 @@ Error EPCTrampolinePool::grow() {
assert(ResolverAddress && "Resolver address can not be null");
auto &EPC = EPCIU.getExecutorProcessControl();
- constexpr auto TrampolinePagePermissions =
- static_cast<sys::Memory::ProtectionFlags>(sys::Memory::MF_READ |
- sys::Memory::MF_EXEC);
auto PageSize = EPC.getPageSize();
- jitlink::JITLinkMemoryManager::SegmentsRequestMap Request;
- Request[TrampolinePagePermissions] = {PageSize, static_cast<size_t>(PageSize),
- 0};
- auto Alloc = EPC.getMemMgr().allocate(nullptr, Request);
-
+ auto Alloc = SimpleSegmentAlloc::Create(
+ EPC.getMemMgr(), nullptr,
+ {{MemProt::Read | MemProt::Exec, {PageSize, Align(PageSize)}}});
if (!Alloc)
return Alloc.takeError();
unsigned NumTrampolines = TrampolinesPerPage;
- auto WorkingMemory = (*Alloc)->getWorkingMemory(TrampolinePagePermissions);
- auto TargetAddress = (*Alloc)->getTargetMemory(TrampolinePagePermissions);
-
- EPCIU.getABISupport().writeTrampolines(WorkingMemory.data(), TargetAddress,
- ResolverAddress, NumTrampolines);
-
- auto TargetAddr = (*Alloc)->getTargetMemory(TrampolinePagePermissions);
+ auto SegInfo = Alloc->getSegInfo(MemProt::Read | MemProt::Exec);
+ EPCIU.getABISupport().writeTrampolines(
+ SegInfo.WorkingMem.data(), SegInfo.Addr, ResolverAddress, NumTrampolines);
for (unsigned I = 0; I < NumTrampolines; ++I)
- AvailableTrampolines.push_back(TargetAddr + (I * TrampolineSize));
+ AvailableTrampolines.push_back(SegInfo.Addr + (I * TrampolineSize));
- if (auto Err = (*Alloc)->finalize())
- return Err;
+ auto FA = Alloc->finalize();
+ if (!FA)
+ return FA.takeError();
- TrampolineBlocks.push_back(std::move(*Alloc));
+ TrampolineBlocks.push_back(std::move(*FA));
return Error::success();
}
@@ -162,16 +161,18 @@ Error EPCIndirectStubsManager::createStubs(const StubInitsMap &StubInits) {
unsigned ASIdx = 0;
std::vector<tpctypes::UInt32Write> PtrUpdates;
for (auto &SI : StubInits)
- PtrUpdates.push_back({(*AvailableStubInfos)[ASIdx++].PointerAddress,
- static_cast<uint32_t>(SI.second.first)});
+ PtrUpdates.push_back(
+ {ExecutorAddr((*AvailableStubInfos)[ASIdx++].PointerAddress),
+ static_cast<uint32_t>(SI.second.first)});
return MemAccess.writeUInt32s(PtrUpdates);
}
case 8: {
unsigned ASIdx = 0;
std::vector<tpctypes::UInt64Write> PtrUpdates;
for (auto &SI : StubInits)
- PtrUpdates.push_back({(*AvailableStubInfos)[ASIdx++].PointerAddress,
- static_cast<uint64_t>(SI.second.first)});
+ PtrUpdates.push_back(
+ {ExecutorAddr((*AvailableStubInfos)[ASIdx++].PointerAddress),
+ static_cast<uint64_t>(SI.second.first)});
return MemAccess.writeUInt64s(PtrUpdates);
}
default:
@@ -213,11 +214,11 @@ Error EPCIndirectStubsManager::updatePointer(StringRef Name,
auto &MemAccess = EPCIU.getExecutorProcessControl().getMemoryAccess();
switch (EPCIU.getABISupport().getPointerSize()) {
case 4: {
- tpctypes::UInt32Write PUpdate(PtrAddr, NewAddr);
+ tpctypes::UInt32Write PUpdate(ExecutorAddr(PtrAddr), NewAddr);
return MemAccess.writeUInt32s(PUpdate);
}
case 8: {
- tpctypes::UInt64Write PUpdate(PtrAddr, NewAddr);
+ tpctypes::UInt64Write PUpdate(ExecutorAddr(PtrAddr), NewAddr);
return MemAccess.writeUInt64s(PUpdate);
}
default:
@@ -267,17 +268,17 @@ EPCIndirectionUtils::Create(ExecutorProcessControl &EPC) {
}
Error EPCIndirectionUtils::cleanup() {
- Error Err = Error::success();
- for (auto &A : IndirectStubAllocs)
- Err = joinErrors(std::move(Err), A->deallocate());
+ auto &MemMgr = EPC.getMemMgr();
+ auto Err = MemMgr.deallocate(std::move(IndirectStubAllocs));
if (TP)
Err = joinErrors(std::move(Err),
static_cast<EPCTrampolinePool &>(*TP).deallocatePool());
if (ResolverBlock)
- Err = joinErrors(std::move(Err), ResolverBlock->deallocate());
+ Err =
+ joinErrors(std::move(Err), MemMgr.deallocate(std::move(ResolverBlock)));
return Err;
}
@@ -285,29 +286,29 @@ Error EPCIndirectionUtils::cleanup() {
Expected<JITTargetAddress>
EPCIndirectionUtils::writeResolverBlock(JITTargetAddress ReentryFnAddr,
JITTargetAddress ReentryCtxAddr) {
+ using namespace jitlink;
+
assert(ABI && "ABI can not be null");
- constexpr auto ResolverBlockPermissions =
- static_cast<sys::Memory::ProtectionFlags>(sys::Memory::MF_READ |
- sys::Memory::MF_EXEC);
auto ResolverSize = ABI->getResolverCodeSize();
- jitlink::JITLinkMemoryManager::SegmentsRequestMap Request;
- Request[ResolverBlockPermissions] = {EPC.getPageSize(),
- static_cast<size_t>(ResolverSize), 0};
- auto Alloc = EPC.getMemMgr().allocate(nullptr, Request);
+ auto Alloc =
+ SimpleSegmentAlloc::Create(EPC.getMemMgr(), nullptr,
+ {{MemProt::Read | MemProt::Exec,
+ {ResolverSize, Align(EPC.getPageSize())}}});
+
if (!Alloc)
return Alloc.takeError();
- auto WorkingMemory = (*Alloc)->getWorkingMemory(ResolverBlockPermissions);
- ResolverBlockAddr = (*Alloc)->getTargetMemory(ResolverBlockPermissions);
- ABI->writeResolverCode(WorkingMemory.data(), ResolverBlockAddr, ReentryFnAddr,
+ auto SegInfo = Alloc->getSegInfo(MemProt::Read | MemProt::Exec);
+ ABI->writeResolverCode(SegInfo.WorkingMem.data(), SegInfo.Addr, ReentryFnAddr,
ReentryCtxAddr);
- if (auto Err = (*Alloc)->finalize())
- return std::move(Err);
+ auto FA = Alloc->finalize();
+ if (!FA)
+ return FA.takeError();
- ResolverBlock = std::move(*Alloc);
- return ResolverBlockAddr;
+ ResolverBlock = std::move(*FA);
+ return SegInfo.Addr;
}
std::unique_ptr<IndirectStubsManager>
@@ -341,6 +342,7 @@ EPCIndirectionUtils::EPCIndirectionUtils(ExecutorProcessControl &EPC,
Expected<EPCIndirectionUtils::IndirectStubInfoVector>
EPCIndirectionUtils::getIndirectStubs(unsigned NumStubs) {
+ using namespace jitlink;
std::lock_guard<std::mutex> Lock(EPCUIMutex);
@@ -350,42 +352,40 @@ EPCIndirectionUtils::getIndirectStubs(unsigned NumStubs) {
auto PageSize = EPC.getPageSize();
auto StubBytes = alignTo(NumStubsToAllocate * ABI->getStubSize(), PageSize);
NumStubsToAllocate = StubBytes / ABI->getStubSize();
- auto PointerBytes =
+ auto PtrBytes =
alignTo(NumStubsToAllocate * ABI->getPointerSize(), PageSize);
- constexpr auto StubPagePermissions =
- static_cast<sys::Memory::ProtectionFlags>(sys::Memory::MF_READ |
- sys::Memory::MF_EXEC);
- constexpr auto PointerPagePermissions =
- static_cast<sys::Memory::ProtectionFlags>(sys::Memory::MF_READ |
- sys::Memory::MF_WRITE);
-
- jitlink::JITLinkMemoryManager::SegmentsRequestMap Request;
- Request[StubPagePermissions] = {PageSize, static_cast<size_t>(StubBytes),
- 0};
- Request[PointerPagePermissions] = {PageSize, 0, PointerBytes};
- auto Alloc = EPC.getMemMgr().allocate(nullptr, Request);
+ auto StubProt = MemProt::Read | MemProt::Exec;
+ auto PtrProt = MemProt::Read | MemProt::Write;
+
+ auto Alloc = SimpleSegmentAlloc::Create(
+ EPC.getMemMgr(), nullptr,
+ {{StubProt, {static_cast<size_t>(StubBytes), Align(PageSize)}},
+ {PtrProt, {static_cast<size_t>(PtrBytes), Align(PageSize)}}});
+
if (!Alloc)
return Alloc.takeError();
- auto StubTargetAddr = (*Alloc)->getTargetMemory(StubPagePermissions);
- auto PointerTargetAddr = (*Alloc)->getTargetMemory(PointerPagePermissions);
+ auto StubSeg = Alloc->getSegInfo(StubProt);
+ auto PtrSeg = Alloc->getSegInfo(PtrProt);
+
+ ABI->writeIndirectStubsBlock(StubSeg.WorkingMem.data(), StubSeg.Addr,
+ PtrSeg.Addr, NumStubsToAllocate);
- ABI->writeIndirectStubsBlock(
- (*Alloc)->getWorkingMemory(StubPagePermissions).data(), StubTargetAddr,
- PointerTargetAddr, NumStubsToAllocate);
+ auto FA = Alloc->finalize();
+ if (!FA)
+ return FA.takeError();
- if (auto Err = (*Alloc)->finalize())
- return std::move(Err);
+ IndirectStubAllocs.push_back(std::move(*FA));
+ auto StubExecutorAddr = StubSeg.Addr;
+ auto PtrExecutorAddr = PtrSeg.Addr;
for (unsigned I = 0; I != NumStubsToAllocate; ++I) {
AvailableIndirectStubs.push_back(
- IndirectStubInfo(StubTargetAddr, PointerTargetAddr));
- StubTargetAddr += ABI->getStubSize();
- PointerTargetAddr += ABI->getPointerSize();
+ IndirectStubInfo(StubExecutorAddr, PtrExecutorAddr));
+ StubExecutorAddr += ABI->getStubSize();
+ PtrExecutorAddr += ABI->getPointerSize();
}
-
- IndirectStubAllocs.push_back(std::move(*Alloc));
}
assert(NumStubs <= AvailableIndirectStubs.size() &&
diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
index 7a76a6ccc122..2ab9ed4f856b 100644
--- a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
@@ -12,9 +12,9 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Object/MachOUniversal.h"
#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include <string>
diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
index 7d86d125d1db..2eb835551adb 100644
--- a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp
@@ -24,20 +24,22 @@ ExecutorProcessControl::MemoryAccess::~MemoryAccess() {}
ExecutorProcessControl::~ExecutorProcessControl() {}
SelfExecutorProcessControl::SelfExecutorProcessControl(
- std::shared_ptr<SymbolStringPool> SSP, Triple TargetTriple,
- unsigned PageSize, std::unique_ptr<jitlink::JITLinkMemoryManager> MemMgr)
- : ExecutorProcessControl(std::move(SSP)) {
+ std::shared_ptr<SymbolStringPool> SSP, std::unique_ptr<TaskDispatcher> D,
+ Triple TargetTriple, unsigned PageSize,
+ std::unique_ptr<jitlink::JITLinkMemoryManager> MemMgr)
+ : ExecutorProcessControl(std::move(SSP), std::move(D)) {
OwnedMemMgr = std::move(MemMgr);
if (!OwnedMemMgr)
- OwnedMemMgr = std::make_unique<jitlink::InProcessMemoryManager>();
+ OwnedMemMgr = std::make_unique<jitlink::InProcessMemoryManager>(
+ sys::Process::getPageSizeEstimate());
this->TargetTriple = std::move(TargetTriple);
this->PageSize = PageSize;
this->MemMgr = OwnedMemMgr.get();
this->MemAccess = this;
- this->JDI = {ExecutorAddress::fromPtr(jitDispatchViaWrapperFunctionManager),
- ExecutorAddress::fromPtr(this)};
+ this->JDI = {ExecutorAddr::fromPtr(jitDispatchViaWrapperFunctionManager),
+ ExecutorAddr::fromPtr(this)};
if (this->TargetTriple.isOSBinFormatMachO())
GlobalManglingPrefix = '_';
}
@@ -45,11 +47,20 @@ SelfExecutorProcessControl::SelfExecutorProcessControl(
Expected<std::unique_ptr<SelfExecutorProcessControl>>
SelfExecutorProcessControl::Create(
std::shared_ptr<SymbolStringPool> SSP,
+ std::unique_ptr<TaskDispatcher> D,
std::unique_ptr<jitlink::JITLinkMemoryManager> MemMgr) {
if (!SSP)
SSP = std::make_shared<SymbolStringPool>();
+ if (!D) {
+#if LLVM_ENABLE_THREADS
+ D = std::make_unique<DynamicThreadPoolTaskDispatcher>();
+#else
+ D = std::make_unique<InPlaceTaskDispatcher>();
+#endif
+ }
+
auto PageSize = sys::Process::getPageSize();
if (!PageSize)
return PageSize.takeError();
@@ -57,7 +68,8 @@ SelfExecutorProcessControl::Create(
Triple TT(sys::getProcessTriple());
return std::make_unique<SelfExecutorProcessControl>(
- std::move(SSP), std::move(TT), *PageSize, std::move(MemMgr));
+ std::move(SSP), std::move(D), std::move(TT), *PageSize,
+ std::move(MemMgr));
}
Expected<tpctypes::DylibHandle>
@@ -93,7 +105,7 @@ SelfExecutorProcessControl::lookupSymbols(ArrayRef<LookupRequest> Request) {
// FIXME: Collect all failing symbols before erroring out.
SymbolNameVector MissingSymbols;
MissingSymbols.push_back(Sym);
- return make_error<SymbolsNotFound>(std::move(MissingSymbols));
+ return make_error<SymbolsNotFound>(SSP, std::move(MissingSymbols));
}
R.back().push_back(pointerToJITTargetAddress(Addr));
}
@@ -103,60 +115,62 @@ SelfExecutorProcessControl::lookupSymbols(ArrayRef<LookupRequest> Request) {
}
Expected<int32_t>
-SelfExecutorProcessControl::runAsMain(JITTargetAddress MainFnAddr,
+SelfExecutorProcessControl::runAsMain(ExecutorAddr MainFnAddr,
ArrayRef<std::string> Args) {
using MainTy = int (*)(int, char *[]);
- return orc::runAsMain(jitTargetAddressToFunction<MainTy>(MainFnAddr), Args);
+ return orc::runAsMain(MainFnAddr.toPtr<MainTy>(), Args);
}
-void SelfExecutorProcessControl::callWrapperAsync(
- SendResultFunction SendResult, JITTargetAddress WrapperFnAddr,
- ArrayRef<char> ArgBuffer) {
+void SelfExecutorProcessControl::callWrapperAsync(ExecutorAddr WrapperFnAddr,
+ IncomingWFRHandler SendResult,
+ ArrayRef<char> ArgBuffer) {
using WrapperFnTy =
- shared::detail::CWrapperFunctionResult (*)(const char *Data, size_t Size);
- auto *WrapperFn = jitTargetAddressToFunction<WrapperFnTy>(WrapperFnAddr);
+ shared::CWrapperFunctionResult (*)(const char *Data, size_t Size);
+ auto *WrapperFn = WrapperFnAddr.toPtr<WrapperFnTy>();
SendResult(WrapperFn(ArgBuffer.data(), ArgBuffer.size()));
}
-Error SelfExecutorProcessControl::disconnect() { return Error::success(); }
+Error SelfExecutorProcessControl::disconnect() {
+ D->shutdown();
+ return Error::success();
+}
-void SelfExecutorProcessControl::writeUInt8s(ArrayRef<tpctypes::UInt8Write> Ws,
- WriteResultFn OnWriteComplete) {
+void SelfExecutorProcessControl::writeUInt8sAsync(
+ ArrayRef<tpctypes::UInt8Write> Ws, WriteResultFn OnWriteComplete) {
for (auto &W : Ws)
- *jitTargetAddressToPointer<uint8_t *>(W.Address) = W.Value;
+ *W.Addr.toPtr<uint8_t *>() = W.Value;
OnWriteComplete(Error::success());
}
-void SelfExecutorProcessControl::writeUInt16s(
+void SelfExecutorProcessControl::writeUInt16sAsync(
ArrayRef<tpctypes::UInt16Write> Ws, WriteResultFn OnWriteComplete) {
for (auto &W : Ws)
- *jitTargetAddressToPointer<uint16_t *>(W.Address) = W.Value;
+ *W.Addr.toPtr<uint16_t *>() = W.Value;
OnWriteComplete(Error::success());
}
-void SelfExecutorProcessControl::writeUInt32s(
+void SelfExecutorProcessControl::writeUInt32sAsync(
ArrayRef<tpctypes::UInt32Write> Ws, WriteResultFn OnWriteComplete) {
for (auto &W : Ws)
- *jitTargetAddressToPointer<uint32_t *>(W.Address) = W.Value;
+ *W.Addr.toPtr<uint32_t *>() = W.Value;
OnWriteComplete(Error::success());
}
-void SelfExecutorProcessControl::writeUInt64s(
+void SelfExecutorProcessControl::writeUInt64sAsync(
ArrayRef<tpctypes::UInt64Write> Ws, WriteResultFn OnWriteComplete) {
for (auto &W : Ws)
- *jitTargetAddressToPointer<uint64_t *>(W.Address) = W.Value;
+ *W.Addr.toPtr<uint64_t *>() = W.Value;
OnWriteComplete(Error::success());
}
-void SelfExecutorProcessControl::writeBuffers(
+void SelfExecutorProcessControl::writeBuffersAsync(
ArrayRef<tpctypes::BufferWrite> Ws, WriteResultFn OnWriteComplete) {
for (auto &W : Ws)
- memcpy(jitTargetAddressToPointer<char *>(W.Address), W.Buffer.data(),
- W.Buffer.size());
+ memcpy(W.Addr.toPtr<char *>(), W.Buffer.data(), W.Buffer.size());
OnWriteComplete(Error::success());
}
-shared::detail::CWrapperFunctionResult
+shared::CWrapperFunctionResult
SelfExecutorProcessControl::jitDispatchViaWrapperFunctionManager(
void *Ctx, const void *FnTag, const char *Data, size_t Size) {
diff --git a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
index e8dd1bb90c9a..ee1630a2ffa8 100644
--- a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
@@ -9,12 +9,17 @@
#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/ExecutionEngine/JITLink/x86_64.h"
#include "llvm/ExecutionEngine/Orc/OrcABISupport.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/Support/Format.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <sstream>
+#define DEBUG_TYPE "orc"
+
using namespace llvm;
using namespace llvm::orc;
@@ -372,5 +377,77 @@ void cloneModuleFlagsMetadata(Module &Dst, const Module &Src,
Dst.addModuleFlag(MapMetadata(MF, VMap));
}
+Error addFunctionPointerRelocationsToCurrentSymbol(jitlink::Symbol &Sym,
+ jitlink::LinkGraph &G,
+ MCDisassembler &Disassembler,
+ MCInstrAnalysis &MIA) {
+ // AArch64 appears to already come with the necessary relocations. Among other
+ // architectures, only x86_64 is currently implemented here.
+ if (G.getTargetTriple().getArch() != Triple::x86_64)
+ return Error::success();
+
+ raw_null_ostream CommentStream;
+ auto &STI = Disassembler.getSubtargetInfo();
+
+ // Determine the function bounds
+ auto &B = Sym.getBlock();
+ assert(!B.isZeroFill() && "expected content block");
+ auto SymAddress = Sym.getAddress();
+ auto SymStartInBlock =
+ (const uint8_t *)B.getContent().data() + Sym.getOffset();
+ auto SymSize = Sym.getSize() ? Sym.getSize() : B.getSize() - Sym.getOffset();
+ auto Content = makeArrayRef(SymStartInBlock, SymSize);
+
+ LLVM_DEBUG(dbgs() << "Adding self-relocations to " << Sym.getName() << "\n");
+
+ SmallDenseSet<uintptr_t, 8> ExistingRelocations;
+ for (auto &E : B.edges()) {
+ if (E.isRelocation())
+ ExistingRelocations.insert(E.getOffset());
+ }
+
+ size_t I = 0;
+ while (I < Content.size()) {
+ MCInst Instr;
+ uint64_t InstrSize = 0;
+ uint64_t InstrStart = SymAddress + I;
+ auto DecodeStatus = Disassembler.getInstruction(
+ Instr, InstrSize, Content.drop_front(I), InstrStart, CommentStream);
+ if (DecodeStatus != MCDisassembler::Success) {
+ LLVM_DEBUG(dbgs() << "Aborting due to disassembly failure at address "
+ << InstrStart);
+ return make_error<StringError>(
+ formatv("failed to disassemble at address {0:x16}", InstrStart),
+ inconvertibleErrorCode());
+ }
+ // Advance to the next instruction.
+ I += InstrSize;
+
+ // Check for a PC-relative address equal to the symbol itself.
+ auto PCRelAddr =
+ MIA.evaluateMemoryOperandAddress(Instr, &STI, InstrStart, InstrSize);
+ if (!PCRelAddr.hasValue() || PCRelAddr.getValue() != SymAddress)
+ continue;
+
+ auto RelocOffInInstr =
+ MIA.getMemoryOperandRelocationOffset(Instr, InstrSize);
+ if (!RelocOffInInstr.hasValue() ||
+ InstrSize - RelocOffInInstr.getValue() != 4) {
+ LLVM_DEBUG(dbgs() << "Skipping unknown self-relocation at "
+ << InstrStart);
+ continue;
+ }
+
+ auto RelocOffInBlock =
+ InstrStart + *RelocOffInInstr - SymAddress + Sym.getOffset();
+ if (ExistingRelocations.contains(RelocOffInBlock))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Adding delta32 self-relocation at " << InstrStart);
+ B.addEdge(jitlink::x86_64::Delta32, RelocOffInBlock, Sym, /*Addend=*/-4);
+ }
+ return Error::success();
+}
+
} // End namespace orc.
} // End namespace llvm.
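The new addFunctionPointerRelocationsToCurrentSymbol pass emits jitlink::x86_64::Delta32 edges with an addend of -4 for instructions whose 4-byte PC-relative displacement field ends at the end of the instruction. A standalone arithmetic sketch (an illustration, not code from the patch; it relies on JITLink's documented Delta32 rule, Fixup <- Target - Fixup + Addend) showing why the -4 addend reproduces the x86-64 RIP-relative displacement:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Hypothetical layout: a 7-byte instruction at 0x1000 whose final four
      // bytes are a RIP-relative displacement referring back to the symbol,
      // which also starts at 0x1000.
      const uint64_t SymAddr = 0x1000;
      const uint64_t InstrStart = 0x1000;
      const uint64_t InstrSize = 7;
      const uint64_t FixupAddr = InstrStart + InstrSize - 4; // disp. field

      // What the CPU computes: target minus end-of-instruction.
      const int64_t RIPRelDisp =
          int64_t(SymAddr) - int64_t(InstrStart + InstrSize);

      // What a Delta32 edge with Addend = -4 resolves to:
      // Target - Fixup + Addend.
      const int64_t Delta32Value =
          int64_t(SymAddr) - int64_t(FixupAddr) + (-4);

      assert(RIPRelDisp == Delta32Value);
      return 0;
    }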
diff --git a/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp b/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp
index 4257137a2212..0fbf79b8a56d 100644
--- a/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp
@@ -8,8 +8,8 @@
#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Host.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
index 2ac32293e4db..0ab0d7d2e2b6 100644
--- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -105,16 +105,18 @@ private:
/// llvm.global_ctors.
class GlobalCtorDtorScraper {
public:
-
GlobalCtorDtorScraper(GenericLLVMIRPlatformSupport &PS,
- StringRef InitFunctionPrefix)
- : PS(PS), InitFunctionPrefix(InitFunctionPrefix) {}
+ StringRef InitFunctionPrefix,
+ StringRef DeInitFunctionPrefix)
+ : PS(PS), InitFunctionPrefix(InitFunctionPrefix),
+ DeInitFunctionPrefix(DeInitFunctionPrefix) {}
Expected<ThreadSafeModule> operator()(ThreadSafeModule TSM,
MaterializationResponsibility &R);
private:
GenericLLVMIRPlatformSupport &PS;
StringRef InitFunctionPrefix;
+ StringRef DeInitFunctionPrefix;
};
/// Generic IR Platform Support
@@ -125,12 +127,14 @@ private:
class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport {
public:
GenericLLVMIRPlatformSupport(LLJIT &J)
- : J(J), InitFunctionPrefix(J.mangle("__orc_init_func.")) {
+ : J(J), InitFunctionPrefix(J.mangle("__orc_init_func.")),
+ DeInitFunctionPrefix(J.mangle("__orc_deinit_func.")) {
getExecutionSession().setPlatform(
std::make_unique<GenericLLVMIRPlatform>(*this));
- setInitTransform(J, GlobalCtorDtorScraper(*this, InitFunctionPrefix));
+ setInitTransform(J, GlobalCtorDtorScraper(*this, InitFunctionPrefix,
+ DeInitFunctionPrefix));
SymbolMap StdInterposes;
@@ -203,6 +207,8 @@ public:
InitSymbols[&JD].add(KV.first,
SymbolLookupFlags::WeaklyReferencedSymbol);
InitFunctions[&JD].add(KV.first);
+ } else if ((*KV.first).startswith(DeInitFunctionPrefix)) {
+ DeInitFunctions[&JD].add(KV.first);
}
}
return Error::success();
@@ -256,6 +262,11 @@ public:
});
}
+ void registerDeInitFunc(JITDylib &JD, SymbolStringPtr DeInitName) {
+ getExecutionSession().runSessionLocked(
+ [&]() { DeInitFunctions[&JD].add(DeInitName); });
+ }
+
private:
Expected<std::vector<JITTargetAddress>> getInitializers(JITDylib &JD) {
@@ -438,6 +449,7 @@ private:
LLJIT &J;
std::string InitFunctionPrefix;
+ std::string DeInitFunctionPrefix;
DenseMap<JITDylib *, SymbolLookupSet> InitSymbols;
DenseMap<JITDylib *, SymbolLookupSet> InitFunctions;
DenseMap<JITDylib *, SymbolLookupSet> DeInitFunctions;
@@ -459,40 +471,63 @@ GlobalCtorDtorScraper::operator()(ThreadSafeModule TSM,
auto Err = TSM.withModuleDo([&](Module &M) -> Error {
auto &Ctx = M.getContext();
auto *GlobalCtors = M.getNamedGlobal("llvm.global_ctors");
-
- // If there's no llvm.global_ctors or it's just a decl then skip.
- if (!GlobalCtors || GlobalCtors->isDeclaration())
+ auto *GlobalDtors = M.getNamedGlobal("llvm.global_dtors");
+
+ auto RegisterCOrDtors = [&](GlobalVariable *GlobalCOrDtors,
+ bool isCtor) -> Error {
+ // If there's no llvm.global_ctors/dtors, or it's just a declaration, skip.
+ if (!GlobalCOrDtors || GlobalCOrDtors->isDeclaration())
+ return Error::success();
+ std::string InitOrDeInitFunctionName;
+ if (isCtor)
+ raw_string_ostream(InitOrDeInitFunctionName)
+ << InitFunctionPrefix << M.getModuleIdentifier();
+ else
+ raw_string_ostream(InitOrDeInitFunctionName)
+ << DeInitFunctionPrefix << M.getModuleIdentifier();
+
+ MangleAndInterner Mangle(PS.getExecutionSession(), M.getDataLayout());
+ auto InternedInitOrDeInitName = Mangle(InitOrDeInitFunctionName);
+ if (auto Err = R.defineMaterializing(
+ {{InternedInitOrDeInitName, JITSymbolFlags::Callable}}))
+ return Err;
+
+ auto *InitOrDeInitFunc = Function::Create(
+ FunctionType::get(Type::getVoidTy(Ctx), {}, false),
+ GlobalValue::ExternalLinkage, InitOrDeInitFunctionName, &M);
+ InitOrDeInitFunc->setVisibility(GlobalValue::HiddenVisibility);
+ std::vector<std::pair<Function *, unsigned>> InitsOrDeInits;
+ auto COrDtors = isCtor ? getConstructors(M) : getDestructors(M);
+
+ for (auto E : COrDtors)
+ InitsOrDeInits.push_back(std::make_pair(E.Func, E.Priority));
+ llvm::sort(InitsOrDeInits,
+ [](const std::pair<Function *, unsigned> &LHS,
+ const std::pair<Function *, unsigned> &RHS) {
+ return LHS.first < RHS.first;
+ });
+
+ auto *InitOrDeInitFuncEntryBlock =
+ BasicBlock::Create(Ctx, "entry", InitOrDeInitFunc);
+ IRBuilder<> IB(InitOrDeInitFuncEntryBlock);
+ for (auto &KV : InitsOrDeInits)
+ IB.CreateCall(KV.first);
+ IB.CreateRetVoid();
+
+ if (isCtor)
+ PS.registerInitFunc(R.getTargetJITDylib(), InternedInitOrDeInitName);
+ else
+ PS.registerDeInitFunc(R.getTargetJITDylib(), InternedInitOrDeInitName);
+
+ GlobalCOrDtors->eraseFromParent();
return Error::success();
+ };
- std::string InitFunctionName;
- raw_string_ostream(InitFunctionName)
- << InitFunctionPrefix << M.getModuleIdentifier();
-
- MangleAndInterner Mangle(PS.getExecutionSession(), M.getDataLayout());
- auto InternedName = Mangle(InitFunctionName);
- if (auto Err =
- R.defineMaterializing({{InternedName, JITSymbolFlags::Callable}}))
+ if (auto Err = RegisterCOrDtors(GlobalCtors, true))
+ return Err;
+ if (auto Err = RegisterCOrDtors(GlobalDtors, false))
return Err;
- auto *InitFunc =
- Function::Create(FunctionType::get(Type::getVoidTy(Ctx), {}, false),
- GlobalValue::ExternalLinkage, InitFunctionName, &M);
- InitFunc->setVisibility(GlobalValue::HiddenVisibility);
- std::vector<std::pair<Function *, unsigned>> Inits;
- for (auto E : getConstructors(M))
- Inits.push_back(std::make_pair(E.Func, E.Priority));
- llvm::sort(Inits, [](const std::pair<Function *, unsigned> &LHS,
- const std::pair<Function *, unsigned> &RHS) {
- return LHS.first < RHS.first;
- });
- auto *EntryBlock = BasicBlock::Create(Ctx, "entry", InitFunc);
- IRBuilder<> IB(EntryBlock);
- for (auto &KV : Inits)
- IB.CreateCall(KV.first);
- IB.CreateRetVoid();
-
- PS.registerInitFunc(R.getTargetJITDylib(), InternedName);
- GlobalCtors->eraseFromParent();
return Error::success();
});
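With the GlobalCtorDtorScraper changes above, llvm.global_dtors entries are gathered into a hidden __orc_deinit_func.<module> wrapper alongside the existing __orc_init_func.<module> wrapper. A hedged sketch of the client-side effect, assuming the standard LLJIT API and that the native target has already been initialized:

    #include "llvm/ExecutionEngine/Orc/LLJIT.h"
    #include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h"
    #include "llvm/Support/Error.h"

    using namespace llvm;
    using namespace llvm::orc;

    // Sketch only: TSM is assumed to define llvm.global_ctors/llvm.global_dtors.
    Error runWithCtorsAndDtors(ThreadSafeModule TSM) {
      auto J = LLJITBuilder().create();
      if (!J)
        return J.takeError();

      if (auto Err = (*J)->addIRModule(std::move(TSM)))
        return Err;

      // Runs constructors via the __orc_init_func.<module> wrapper.
      if (auto Err = (*J)->initialize((*J)->getMainJITDylib()))
        return Err;

      // ... look up and run JIT'd code here ...

      // Runs destructors via the new __orc_deinit_func.<module> wrapper.
      return (*J)->deinitialize((*J)->getMainJITDylib());
    }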
diff --git a/llvm/lib/ExecutionEngine/Orc/LookupAndRecordAddrs.cpp b/llvm/lib/ExecutionEngine/Orc/LookupAndRecordAddrs.cpp
new file mode 100644
index 000000000000..44cb78c773c9
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/LookupAndRecordAddrs.cpp
@@ -0,0 +1,82 @@
+//===------- LookupAndRecordAddrs.cpp - Symbol lookup support utility -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/LookupAndRecordAddrs.h"
+
+#include <future>
+
+namespace llvm {
+namespace orc {
+
+void lookupAndRecordAddrs(
+ unique_function<void(Error)> OnRecorded, ExecutionSession &ES, LookupKind K,
+ const JITDylibSearchOrder &SearchOrder,
+ std::vector<std::pair<SymbolStringPtr, ExecutorAddr *>> Pairs,
+ SymbolLookupFlags LookupFlags) {
+
+ SymbolLookupSet Symbols;
+ for (auto &KV : Pairs)
+ Symbols.add(KV.first, LookupFlags);
+
+ ES.lookup(
+ K, SearchOrder, Symbols, SymbolState::Ready,
+ [Pairs = std::move(Pairs),
+ OnRec = std::move(OnRecorded)](Expected<SymbolMap> Result) mutable {
+ if (!Result)
+ return OnRec(Result.takeError());
+ for (auto &KV : Pairs) {
+ auto I = Result->find(KV.first);
+ KV.second->setValue((I != Result->end()) ? I->second.getAddress()
+ : 0);
+ }
+ OnRec(Error::success());
+ },
+ NoDependenciesToRegister);
+}
+
+Error lookupAndRecordAddrs(
+ ExecutionSession &ES, LookupKind K, const JITDylibSearchOrder &SearchOrder,
+ std::vector<std::pair<SymbolStringPtr, ExecutorAddr *>> Pairs,
+ SymbolLookupFlags LookupFlags) {
+
+ std::promise<MSVCPError> ResultP;
+ auto ResultF = ResultP.get_future();
+ lookupAndRecordAddrs([&](Error Err) { ResultP.set_value(std::move(Err)); },
+ ES, K, SearchOrder, Pairs, LookupFlags);
+ return ResultF.get();
+}
+
+Error lookupAndRecordAddrs(
+ ExecutorProcessControl &EPC, tpctypes::DylibHandle H,
+ std::vector<std::pair<SymbolStringPtr, ExecutorAddr *>> Pairs,
+ SymbolLookupFlags LookupFlags) {
+
+ SymbolLookupSet Symbols;
+ for (auto &KV : Pairs)
+ Symbols.add(KV.first, LookupFlags);
+
+ ExecutorProcessControl::LookupRequest LR(H, Symbols);
+ auto Result = EPC.lookupSymbols(LR);
+ if (!Result)
+ return Result.takeError();
+
+ if (Result->size() != 1)
+ return make_error<StringError>("Error in lookup result",
+ inconvertibleErrorCode());
+ if (Result->front().size() != Pairs.size())
+ return make_error<StringError>("Error in lookup result elements",
+ inconvertibleErrorCode());
+
+ for (unsigned I = 0; I != Pairs.size(); ++I)
+ Pairs[I].second->setValue(Result->front()[I]);
+
+ return Error::success();
+}
+
+} // End namespace orc.
+} // End namespace llvm.
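The blocking overload above simply wraps the asynchronous one in a promise/future pair. A sketch of the typical call pattern (mirroring the MachOPlatform use later in this patch; the helper name and symbol choice are illustrative only):

    #include "llvm/ExecutionEngine/Orc/Core.h"
    #include "llvm/ExecutionEngine/Orc/LookupAndRecordAddrs.h"

    using namespace llvm;
    using namespace llvm::orc;

    // Populate two ExecutorAddr members with a single blocking lookup.
    static Error recordRuntimeAddrs(ExecutionSession &ES, JITDylib &PlatformJD,
                                    ExecutorAddr &Bootstrap,
                                    ExecutorAddr &Shutdown) {
      return lookupAndRecordAddrs(
          ES, LookupKind::Static, makeJITDylibSearchOrder(&PlatformJD),
          {{ES.intern("___orc_rt_macho_platform_bootstrap"), &Bootstrap},
           {ES.intern("___orc_rt_macho_platform_shutdown"), &Shutdown}});
    }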
diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
index 66ef835dc34d..46c915dfea9e 100644
--- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
@@ -12,6 +12,7 @@
#include "llvm/ExecutionEngine/JITLink/x86_64.h"
#include "llvm/ExecutionEngine/Orc/DebugUtils.h"
#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
+#include "llvm/ExecutionEngine/Orc/LookupAndRecordAddrs.h"
#include "llvm/Support/BinaryByteStream.h"
#include "llvm/Support/Debug.h"
@@ -52,7 +53,7 @@ public:
auto G = std::make_unique<jitlink::LinkGraph>(
"<MachOHeaderMU>", TT, PointerSize, Endianness,
jitlink::getGenericEdgeKindName);
- auto &HeaderSection = G->createSection("__header", sys::Memory::MF_READ);
+ auto &HeaderSection = G->createSection("__header", jitlink::MemProt::Read);
auto &HeaderBlock = createHeaderBlock(*G, HeaderSection);
// Init symbol is header-start symbol.
@@ -135,13 +136,14 @@ StringRef ObjCImageInfoSectionName = "__DATA,__objc_image_info";
StringRef ObjCSelRefsSectionName = "__DATA,__objc_selrefs";
StringRef Swift5ProtoSectionName = "__TEXT,__swift5_proto";
StringRef Swift5ProtosSectionName = "__TEXT,__swift5_protos";
+StringRef Swift5TypesSectionName = "__TEXT,__swift5_types";
StringRef ThreadBSSSectionName = "__DATA,__thread_bss";
StringRef ThreadDataSectionName = "__DATA,__thread_data";
StringRef ThreadVarsSectionName = "__DATA,__thread_vars";
StringRef InitSectionNames[] = {
ModInitFuncSectionName, ObjCSelRefsSectionName, ObjCClassListSectionName,
- Swift5ProtosSectionName, Swift5ProtoSectionName};
+ Swift5ProtosSectionName, Swift5ProtoSectionName, Swift5TypesSectionName};
} // end anonymous namespace
@@ -172,10 +174,10 @@ MachOPlatform::Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer,
// Add JIT-dispatch function support symbols.
if (auto Err = PlatformJD.define(absoluteSymbols(
{{ES.intern("___orc_rt_jit_dispatch"),
- {EPC.getJITDispatchInfo().JITDispatchFunctionAddress.getValue(),
+ {EPC.getJITDispatchInfo().JITDispatchFunction.getValue(),
JITSymbolFlags::Exported}},
{ES.intern("___orc_rt_jit_dispatch_ctx"),
- {EPC.getJITDispatchInfo().JITDispatchContextAddress.getValue(),
+ {EPC.getJITDispatchInfo().JITDispatchContext.getValue(),
JITSymbolFlags::Exported}}})))
return std::move(Err);
@@ -267,6 +269,7 @@ bool MachOPlatform::isInitializerSection(StringRef SegName,
bool MachOPlatform::supportedTarget(const Triple &TT) {
switch (TT.getArch()) {
+ case Triple::aarch64:
case Triple::x86_64:
return true;
default:
@@ -286,6 +289,19 @@ MachOPlatform::MachOPlatform(
PlatformJD.addGenerator(std::move(OrcRuntimeGenerator));
+ // Force linking of eh-frame registration functions.
+ if (auto Err2 = lookupAndRecordAddrs(
+ ES, LookupKind::Static, makeJITDylibSearchOrder(&PlatformJD),
+ {{ES.intern("___orc_rt_macho_register_ehframe_section"),
+ &orc_rt_macho_register_ehframe_section},
+ {ES.intern("___orc_rt_macho_deregister_ehframe_section"),
+ &orc_rt_macho_deregister_ehframe_section}})) {
+ Err = std::move(Err2);
+ return;
+ }
+
+ State = BootstrapPhase2;
+
// PlatformJD hasn't been 'set-up' by the platform yet (since we're creating
// the platform now), so set it up.
if (auto E2 = setupJITDylib(PlatformJD)) {
@@ -309,6 +325,8 @@ MachOPlatform::MachOPlatform(
Err = std::move(E2);
return;
}
+
+ State = Initialized;
}
Error MachOPlatform::associateRuntimeSupportFunctions(JITDylib &PlatformJD) {
@@ -321,13 +339,13 @@ Error MachOPlatform::associateRuntimeSupportFunctions(JITDylib &PlatformJD) {
this, &MachOPlatform::rt_getInitializers);
using GetDeinitializersSPSSig =
- SPSExpected<SPSMachOJITDylibDeinitializerSequence>(SPSExecutorAddress);
+ SPSExpected<SPSMachOJITDylibDeinitializerSequence>(SPSExecutorAddr);
WFs[ES.intern("___orc_rt_macho_get_deinitializers_tag")] =
ES.wrapAsyncWithSPS<GetDeinitializersSPSSig>(
this, &MachOPlatform::rt_getDeinitializers);
using LookupSymbolSPSSig =
- SPSExpected<SPSExecutorAddress>(SPSExecutorAddress, SPSString);
+ SPSExpected<SPSExecutorAddr>(SPSExecutorAddr, SPSString);
WFs[ES.intern("___orc_rt_macho_symbol_lookup_tag")] =
ES.wrapAsyncWithSPS<LookupSymbolSPSSig>(this,
&MachOPlatform::rt_lookupSymbol);
@@ -411,7 +429,7 @@ void MachOPlatform::rt_getInitializers(SendInitializerSequenceFn SendResult,
}
void MachOPlatform::rt_getDeinitializers(SendDeinitializerSequenceFn SendResult,
- ExecutorAddress Handle) {
+ ExecutorAddr Handle) {
LLVM_DEBUG({
dbgs() << "MachOPlatform::rt_getDeinitializers(\""
<< formatv("{0:x}", Handle.getValue()) << "\")\n";
@@ -441,8 +459,7 @@ void MachOPlatform::rt_getDeinitializers(SendDeinitializerSequenceFn SendResult,
}
void MachOPlatform::rt_lookupSymbol(SendSymbolAddressFn SendResult,
- ExecutorAddress Handle,
- StringRef SymbolName) {
+ ExecutorAddr Handle, StringRef SymbolName) {
LLVM_DEBUG({
dbgs() << "MachOPlatform::rt_lookupSymbol(\""
<< formatv("{0:x}", Handle.getValue()) << "\")\n";
@@ -476,7 +493,7 @@ void MachOPlatform::rt_lookupSymbol(SendSymbolAddressFn SendResult,
void operator()(Expected<SymbolMap> Result) {
if (Result) {
assert(Result->size() == 1 && "Unexpected result map count");
- SendResult(ExecutorAddress(Result->begin()->second.getAddress()));
+ SendResult(ExecutorAddr(Result->begin()->second.getAddress()));
} else {
SendResult(Result.takeError());
}
@@ -495,56 +512,25 @@ void MachOPlatform::rt_lookupSymbol(SendSymbolAddressFn SendResult,
}
Error MachOPlatform::bootstrapMachORuntime(JITDylib &PlatformJD) {
-
- std::pair<const char *, ExecutorAddress *> Symbols[] = {
- {"___orc_rt_macho_platform_bootstrap", &orc_rt_macho_platform_bootstrap},
- {"___orc_rt_macho_platform_shutdown", &orc_rt_macho_platform_shutdown},
- {"___orc_rt_macho_register_object_sections",
- &orc_rt_macho_register_object_sections},
- {"___orc_rt_macho_create_pthread_key", &orc_rt_macho_create_pthread_key}};
-
- SymbolLookupSet RuntimeSymbols;
- std::vector<std::pair<SymbolStringPtr, ExecutorAddress *>> AddrsToRecord;
- for (const auto &KV : Symbols) {
- auto Name = ES.intern(KV.first);
- RuntimeSymbols.add(Name);
- AddrsToRecord.push_back({std::move(Name), KV.second});
- }
-
- auto RuntimeSymbolAddrs = ES.lookup(
- {{&PlatformJD, JITDylibLookupFlags::MatchAllSymbols}}, RuntimeSymbols);
- if (!RuntimeSymbolAddrs)
- return RuntimeSymbolAddrs.takeError();
-
- for (const auto &KV : AddrsToRecord) {
- auto &Name = KV.first;
- assert(RuntimeSymbolAddrs->count(Name) && "Missing runtime symbol?");
- KV.second->setValue((*RuntimeSymbolAddrs)[Name].getAddress());
- }
-
- if (auto Err =
- ES.callSPSWrapper<void()>(orc_rt_macho_platform_bootstrap.getValue()))
+ if (auto Err = lookupAndRecordAddrs(
+ ES, LookupKind::Static, makeJITDylibSearchOrder(&PlatformJD),
+ {{ES.intern("___orc_rt_macho_platform_bootstrap"),
+ &orc_rt_macho_platform_bootstrap},
+ {ES.intern("___orc_rt_macho_platform_shutdown"),
+ &orc_rt_macho_platform_shutdown},
+ {ES.intern("___orc_rt_macho_register_thread_data_section"),
+ &orc_rt_macho_register_thread_data_section},
+ {ES.intern("___orc_rt_macho_deregister_thread_data_section"),
+ &orc_rt_macho_deregister_thread_data_section},
+ {ES.intern("___orc_rt_macho_create_pthread_key"),
+ &orc_rt_macho_create_pthread_key}}))
return Err;
- // FIXME: Ordering is fuzzy here. We're probably best off saying
- // "behavior is undefined if code that uses the runtime is added before
- // the platform constructor returns", then move all this to the constructor.
- RuntimeBootstrapped = true;
- std::vector<MachOPerObjectSectionsToRegister> DeferredPOSRs;
- {
- std::lock_guard<std::mutex> Lock(PlatformMutex);
- DeferredPOSRs = std::move(BootstrapPOSRs);
- }
-
- for (auto &D : DeferredPOSRs)
- if (auto Err = registerPerObjectSections(D))
- return Err;
-
- return Error::success();
+ return ES.callSPSWrapper<void()>(orc_rt_macho_platform_bootstrap);
}
Error MachOPlatform::registerInitInfo(
- JITDylib &JD, ExecutorAddress ObjCImageInfoAddr,
+ JITDylib &JD, ExecutorAddr ObjCImageInfoAddr,
ArrayRef<jitlink::Section *> InitSections) {
std::unique_lock<std::mutex> Lock(PlatformMutex);
@@ -576,29 +562,12 @@ Error MachOPlatform::registerInitInfo(
// FIXME: Avoid copy here.
jitlink::SectionRange R(*Sec);
InitSeq->InitSections[Sec->getName()].push_back(
- {ExecutorAddress(R.getStart()), ExecutorAddress(R.getEnd())});
+ {ExecutorAddr(R.getStart()), ExecutorAddr(R.getEnd())});
}
return Error::success();
}
-Error MachOPlatform::registerPerObjectSections(
- const MachOPerObjectSectionsToRegister &POSR) {
-
- if (!orc_rt_macho_register_object_sections)
- return make_error<StringError>("Attempting to register per-object "
- "sections, but runtime support has not "
- "been loaded yet",
- inconvertibleErrorCode());
-
- Error ErrResult = Error::success();
- if (auto Err = ES.callSPSWrapper<shared::SPSError(
- SPSMachOPerObjectSectionsToRegister)>(
- orc_rt_macho_register_object_sections.getValue(), ErrResult, POSR))
- return Err;
- return ErrResult;
-}
-
Expected<uint64_t> MachOPlatform::createPThreadKey() {
if (!orc_rt_macho_create_pthread_key)
return make_error<StringError>(
@@ -608,7 +577,7 @@ Expected<uint64_t> MachOPlatform::createPThreadKey() {
Expected<uint64_t> Result(0);
if (auto Err = ES.callSPSWrapper<SPSExpected<uint64_t>(void)>(
- orc_rt_macho_create_pthread_key.getValue(), Result))
+ orc_rt_macho_create_pthread_key, Result))
return std::move(Err);
return Result;
}
@@ -617,21 +586,55 @@ void MachOPlatform::MachOPlatformPlugin::modifyPassConfig(
MaterializationResponsibility &MR, jitlink::LinkGraph &LG,
jitlink::PassConfiguration &Config) {
- // If the initializer symbol is the MachOHeader start symbol then just add
- // the macho header support passes.
- if (MR.getInitializerSymbol() == MP.MachOHeaderStartSymbol) {
- addMachOHeaderSupportPasses(MR, Config);
- // The header materialization unit doesn't require any other support, so we
- // can bail out early.
+ auto PS = MP.State.load();
+
+ // --- Handle Initializers ---
+ if (auto InitSymbol = MR.getInitializerSymbol()) {
+
+ // If the initializer symbol is the MachOHeader start symbol then just
+ // register it and then bail out -- the header materialization unit
+ // definitely doesn't need any other passes.
+ if (InitSymbol == MP.MachOHeaderStartSymbol) {
+ Config.PostAllocationPasses.push_back([this, &MR](jitlink::LinkGraph &G) {
+ return associateJITDylibHeaderSymbol(G, MR);
+ });
+ return;
+ }
+
+ // If the object contains an init symbol other than the header start symbol
+ // then add passes to preserve, process and register the init
+ // sections/symbols.
+ Config.PrePrunePasses.push_back([this, &MR](jitlink::LinkGraph &G) {
+ if (auto Err = preserveInitSections(G, MR))
+ return Err;
+ return processObjCImageInfo(G, MR);
+ });
+
+ Config.PostFixupPasses.push_back(
+ [this, &JD = MR.getTargetJITDylib()](jitlink::LinkGraph &G) {
+ return registerInitSections(G, JD);
+ });
+ }
+
+ // --- Add passes for eh-frame and TLV support ---
+ if (PS == MachOPlatform::BootstrapPhase1) {
+ Config.PostFixupPasses.push_back(
+ [this](jitlink::LinkGraph &G) { return registerEHSectionsPhase1(G); });
return;
}
- // If the object contains initializers then add passes to record them.
- if (MR.getInitializerSymbol())
- addInitializerSupportPasses(MR, Config);
+ // Insert TLV lowering at the start of the PostPrunePasses, since we want
+ // it to run before GOT/PLT lowering.
+ Config.PostPrunePasses.insert(
+ Config.PostPrunePasses.begin(),
+ [this, &JD = MR.getTargetJITDylib()](jitlink::LinkGraph &G) {
+ return fixTLVSectionsAndEdges(G, JD);
+ });
- // Add passes for eh-frame and TLV support.
- addEHAndTLVSupportPasses(MR, Config);
+ // Add a pass to register the final addresses of the eh-frame and TLV sections
+ // with the runtime.
+ Config.PostFixupPasses.push_back(
+ [this](jitlink::LinkGraph &G) { return registerEHAndTLVSections(G); });
}
ObjectLinkingLayer::Plugin::SyntheticSymbolDependenciesMap
@@ -648,111 +651,22 @@ MachOPlatform::MachOPlatformPlugin::getSyntheticSymbolDependencies(
return SyntheticSymbolDependenciesMap();
}
-void MachOPlatform::MachOPlatformPlugin::addInitializerSupportPasses(
- MaterializationResponsibility &MR, jitlink::PassConfiguration &Config) {
-
- /// Preserve init sections.
- Config.PrePrunePasses.push_back([this, &MR](jitlink::LinkGraph &G) {
- if (auto Err = preserveInitSections(G, MR))
- return Err;
- return processObjCImageInfo(G, MR);
- });
-
- Config.PostFixupPasses.push_back(
- [this, &JD = MR.getTargetJITDylib()](jitlink::LinkGraph &G) {
- return registerInitSections(G, JD);
- });
-}
-
-void MachOPlatform::MachOPlatformPlugin::addMachOHeaderSupportPasses(
- MaterializationResponsibility &MR, jitlink::PassConfiguration &Config) {
-
- Config.PostAllocationPasses.push_back([this, &JD = MR.getTargetJITDylib()](
- jitlink::LinkGraph &G) -> Error {
- auto I = llvm::find_if(G.defined_symbols(), [this](jitlink::Symbol *Sym) {
- return Sym->getName() == *MP.MachOHeaderStartSymbol;
- });
- assert(I != G.defined_symbols().end() &&
- "Missing MachO header start symbol");
- {
- std::lock_guard<std::mutex> Lock(MP.PlatformMutex);
- JITTargetAddress HeaderAddr = (*I)->getAddress();
- MP.HeaderAddrToJITDylib[HeaderAddr] = &JD;
- assert(!MP.InitSeqs.count(&JD) && "InitSeq entry for JD already exists");
- MP.InitSeqs.insert(
- std::make_pair(&JD, MachOJITDylibInitializers(
- JD.getName(), ExecutorAddress(HeaderAddr))));
- }
- return Error::success();
- });
-}
-
-void MachOPlatform::MachOPlatformPlugin::addEHAndTLVSupportPasses(
- MaterializationResponsibility &MR, jitlink::PassConfiguration &Config) {
-
- // Insert TLV lowering at the start of the PostPrunePasses, since we want
- // it to run before GOT/PLT lowering.
- Config.PostPrunePasses.insert(
- Config.PostPrunePasses.begin(),
- [this, &JD = MR.getTargetJITDylib()](jitlink::LinkGraph &G) {
- return fixTLVSectionsAndEdges(G, JD);
- });
-
- // Add a pass to register the final addresses of the eh-frame and TLV sections
- // with the runtime.
- Config.PostFixupPasses.push_back([this](jitlink::LinkGraph &G) -> Error {
- MachOPerObjectSectionsToRegister POSR;
-
- if (auto *EHFrameSection = G.findSectionByName(EHFrameSectionName)) {
- jitlink::SectionRange R(*EHFrameSection);
- if (!R.empty())
- POSR.EHFrameSection = {ExecutorAddress(R.getStart()),
- ExecutorAddress(R.getEnd())};
- }
-
- // Get a pointer to the thread data section if there is one. It will be used
- // below.
- jitlink::Section *ThreadDataSection =
- G.findSectionByName(ThreadDataSectionName);
-
- // Handle thread BSS section if there is one.
- if (auto *ThreadBSSSection = G.findSectionByName(ThreadBSSSectionName)) {
- // If there's already a thread data section in this graph then merge the
- // thread BSS section content into it, otherwise just treat the thread
- // BSS section as the thread data section.
- if (ThreadDataSection)
- G.mergeSections(*ThreadDataSection, *ThreadBSSSection);
- else
- ThreadDataSection = ThreadBSSSection;
- }
-
- // Having merged thread BSS (if present) and thread data (if present),
- // record the resulting section range.
- if (ThreadDataSection) {
- jitlink::SectionRange R(*ThreadDataSection);
- if (!R.empty())
- POSR.ThreadDataSection = {ExecutorAddress(R.getStart()),
- ExecutorAddress(R.getEnd())};
- }
-
- if (POSR.EHFrameSection.StartAddress ||
- POSR.ThreadDataSection.StartAddress) {
-
- // If we're still bootstrapping the runtime then just record this
- // frame for now.
- if (!MP.RuntimeBootstrapped) {
- std::lock_guard<std::mutex> Lock(MP.PlatformMutex);
- MP.BootstrapPOSRs.push_back(POSR);
- return Error::success();
- }
-
- // Otherwise register it immediately.
- if (auto Err = MP.registerPerObjectSections(POSR))
- return Err;
- }
+Error MachOPlatform::MachOPlatformPlugin::associateJITDylibHeaderSymbol(
+ jitlink::LinkGraph &G, MaterializationResponsibility &MR) {
- return Error::success();
+ auto I = llvm::find_if(G.defined_symbols(), [this](jitlink::Symbol *Sym) {
+ return Sym->getName() == *MP.MachOHeaderStartSymbol;
});
+ assert(I != G.defined_symbols().end() && "Missing MachO header start symbol");
+
+ auto &JD = MR.getTargetJITDylib();
+ std::lock_guard<std::mutex> Lock(MP.PlatformMutex);
+ JITTargetAddress HeaderAddr = (*I)->getAddress();
+ MP.HeaderAddrToJITDylib[HeaderAddr] = &JD;
+ assert(!MP.InitSeqs.count(&JD) && "InitSeq entry for JD already exists");
+ MP.InitSeqs.insert(std::make_pair(
+ &JD, MachOJITDylibInitializers(JD.getName(), ExecutorAddr(HeaderAddr))));
+ return Error::success();
}
Error MachOPlatform::MachOPlatformPlugin::preserveInitSections(
@@ -873,7 +787,7 @@ Error MachOPlatform::MachOPlatformPlugin::processObjCImageInfo(
Error MachOPlatform::MachOPlatformPlugin::registerInitSections(
jitlink::LinkGraph &G, JITDylib &JD) {
- ExecutorAddress ObjCImageInfoAddr;
+ ExecutorAddr ObjCImageInfoAddr;
SmallVector<jitlink::Section *> InitSections;
if (auto *ObjCImageInfoSec = G.findSectionByName(ObjCImageInfoSectionName)) {
@@ -950,9 +864,109 @@ Error MachOPlatform::MachOPlatformPlugin::fixTLVSectionsAndEdges(
for (auto *B : G.blocks())
for (auto &E : B->edges())
if (E.getKind() ==
- jitlink::x86_64::RequestTLVPAndTransformToPCRel32TLVPLoadRelaxable)
- E.setKind(
- jitlink::x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable);
+ jitlink::x86_64::RequestTLVPAndTransformToPCRel32TLVPLoadREXRelaxable)
+ E.setKind(jitlink::x86_64::
+ RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable);
+
+ return Error::success();
+}
+
+Error MachOPlatform::MachOPlatformPlugin::registerEHAndTLVSections(
+ jitlink::LinkGraph &G) {
+
+ // Register the final addresses of the eh-frame and TLV sections with the
+ // runtime.
+ if (auto *EHFrameSection = G.findSectionByName(EHFrameSectionName)) {
+ jitlink::SectionRange R(*EHFrameSection);
+ if (!R.empty())
+ G.allocActions().push_back(
+ {{MP.orc_rt_macho_register_ehframe_section.getValue(), R.getStart(),
+ R.getSize()},
+ {MP.orc_rt_macho_deregister_ehframe_section.getValue(), R.getStart(),
+ R.getSize()}});
+ }
+
+ // Get a pointer to the thread data section if there is one. It will be used
+ // below.
+ jitlink::Section *ThreadDataSection =
+ G.findSectionByName(ThreadDataSectionName);
+
+ // Handle thread BSS section if there is one.
+ if (auto *ThreadBSSSection = G.findSectionByName(ThreadBSSSectionName)) {
+ // If there's already a thread data section in this graph then merge the
+ // thread BSS section content into it, otherwise just treat the thread
+ // BSS section as the thread data section.
+ if (ThreadDataSection)
+ G.mergeSections(*ThreadDataSection, *ThreadBSSSection);
+ else
+ ThreadDataSection = ThreadBSSSection;
+ }
+
+ // Having merged thread BSS (if present) and thread data (if present),
+ // record the resulting section range.
+ if (ThreadDataSection) {
+ jitlink::SectionRange R(*ThreadDataSection);
+ if (!R.empty()) {
+ if (MP.State != MachOPlatform::Initialized)
+ return make_error<StringError>("__thread_data section encountered, but "
+ "MachOPlatform has not finished booting",
+ inconvertibleErrorCode());
+
+ G.allocActions().push_back(
+ {{MP.orc_rt_macho_register_thread_data_section.getValue(),
+ R.getStart(), R.getSize()},
+ {MP.orc_rt_macho_deregister_thread_data_section.getValue(),
+ R.getStart(), R.getSize()}});
+ }
+ }
+ return Error::success();
+}
+
+Error MachOPlatform::MachOPlatformPlugin::registerEHSectionsPhase1(
+ jitlink::LinkGraph &G) {
+
+ // If there's no eh-frame there's nothing to do.
+ auto *EHFrameSection = G.findSectionByName(EHFrameSectionName);
+ if (!EHFrameSection)
+ return Error::success();
+
+ // If the eh-frame section is empty there's nothing to do.
+ jitlink::SectionRange R(*EHFrameSection);
+ if (R.empty())
+ return Error::success();
+
+ // Since we're linking the object that contains the registration code now,
+ // the addresses won't be available in the platform yet. We'll have to find
+ // them in this graph instead.
+ ExecutorAddr orc_rt_macho_register_ehframe_section;
+ ExecutorAddr orc_rt_macho_deregister_ehframe_section;
+ for (auto *Sym : G.defined_symbols()) {
+ if (!Sym->hasName())
+ continue;
+ if (Sym->getName() == "___orc_rt_macho_register_ehframe_section")
+ orc_rt_macho_register_ehframe_section = ExecutorAddr(Sym->getAddress());
+ else if (Sym->getName() == "___orc_rt_macho_deregister_ehframe_section")
+ orc_rt_macho_deregister_ehframe_section = ExecutorAddr(Sym->getAddress());
+
+ if (orc_rt_macho_register_ehframe_section &&
+ orc_rt_macho_deregister_ehframe_section)
+ break;
+ }
+
+ // If we failed to find the required functions then bail out.
+ if (!orc_rt_macho_register_ehframe_section ||
+ !orc_rt_macho_deregister_ehframe_section)
+ return make_error<StringError>("Could not find eh-frame registration "
+ "functions during platform bootstrap",
+ inconvertibleErrorCode());
+
+ // Otherwise, add allocation actions to the graph to register eh-frames for
+ // this object.
+ G.allocActions().push_back(
+ {{orc_rt_macho_register_ehframe_section.getValue(), R.getStart(),
+ R.getSize()},
+ {orc_rt_macho_deregister_ehframe_section.getValue(), R.getStart(),
+ R.getSize()}});
return Error::success();
}
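MachOPlatform now tracks an explicit bootstrap state (BootstrapPhase1, BootstrapPhase2, Initialized) and the plugin consults it when building the pass pipeline: phase-1 graphs take the registerEHSectionsPhase1 path, and __thread_data sections are rejected until the platform is fully initialized. A standalone sketch of that gating, with invented names, added purely to make the control flow explicit:

    #include <atomic>
    #include <cassert>

    // Invented mirror of the MachOPlatform state machine used above.
    enum class PlatformState { BootstrapPhase1, BootstrapPhase2, Initialized };

    struct PassPlan {
      bool UsePhase1EHRegistration = false;
      bool AllowThreadData = false;
    };

    static PassPlan planPasses(const std::atomic<PlatformState> &State) {
      PassPlan Plan;
      switch (State.load()) {
      case PlatformState::BootstrapPhase1:
        Plan.UsePhase1EHRegistration = true; // registerEHSectionsPhase1 path
        break;
      case PlatformState::BootstrapPhase2:
        break; // normal eh-frame path; __thread_data is still an error
      case PlatformState::Initialized:
        Plan.AllowThreadData = true; // registerEHAndTLVSections path
        break;
      }
      return Plan;
    }

    int main() {
      std::atomic<PlatformState> State{PlatformState::BootstrapPhase1};
      assert(planPasses(State).UsePhase1EHRegistration);
      State = PlatformState::Initialized;
      assert(planPasses(State).AllowThreadData);
      return 0;
    }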
diff --git a/llvm/lib/ExecutionEngine/Orc/Mangling.cpp b/llvm/lib/ExecutionEngine/Orc/Mangling.cpp
index 14b22880ab7e..7b21e6a684ca 100644
--- a/llvm/lib/ExecutionEngine/Orc/Mangling.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Mangling.cpp
@@ -7,9 +7,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/Orc/Mangling.h"
+#include "llvm/ExecutionEngine/Orc/ELFNixPlatform.h"
#include "llvm/ExecutionEngine/Orc/MachOPlatform.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Mangler.h"
+#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Object/MachO.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Debug.h"
@@ -83,17 +85,29 @@ void IRSymbolMapper::add(ExecutionSession &ES, const ManglingOptions &MO,
}
}
-Expected<std::pair<SymbolFlagsMap, SymbolStringPtr>>
-getObjectSymbolInfo(ExecutionSession &ES, MemoryBufferRef ObjBuffer) {
- auto Obj = object::ObjectFile::createObjectFile(ObjBuffer);
+static SymbolStringPtr addInitSymbol(SymbolFlagsMap &SymbolFlags,
+ ExecutionSession &ES,
+ StringRef ObjFileName) {
+ SymbolStringPtr InitSymbol;
+ size_t Counter = 0;
- if (!Obj)
- return Obj.takeError();
+ do {
+ std::string InitSymString;
+ raw_string_ostream(InitSymString)
+ << "$." << ObjFileName << ".__inits." << Counter++;
+ InitSymbol = ES.intern(InitSymString);
+ } while (SymbolFlags.count(InitSymbol));
- bool IsMachO = isa<object::MachOObjectFile>(Obj->get());
+ SymbolFlags[InitSymbol] = JITSymbolFlags::MaterializationSideEffectsOnly;
+ return InitSymbol;
+}
+static Expected<std::pair<SymbolFlagsMap, SymbolStringPtr>>
+getMachOObjectFileSymbolInfo(ExecutionSession &ES,
+ const object::MachOObjectFile &Obj) {
SymbolFlagsMap SymbolFlags;
- for (auto &Sym : (*Obj)->symbols()) {
+
+ for (auto &Sym : Obj.symbols()) {
Expected<uint32_t> SymFlagsOrErr = Sym.getFlags();
if (!SymFlagsOrErr)
// TODO: Test this error.
@@ -123,48 +137,135 @@ getObjectSymbolInfo(ExecutionSession &ES, MemoryBufferRef ObjBuffer) {
return SymFlags.takeError();
// Strip the 'exported' flag from MachO linker-private symbols.
- if (IsMachO && Name->startswith("l"))
+ if (Name->startswith("l"))
*SymFlags &= ~JITSymbolFlags::Exported;
SymbolFlags[InternedName] = std::move(*SymFlags);
}
SymbolStringPtr InitSymbol;
+ for (auto &Sec : Obj.sections()) {
+ auto SecType = Obj.getSectionType(Sec);
+ if ((SecType & MachO::SECTION_TYPE) == MachO::S_MOD_INIT_FUNC_POINTERS) {
+ InitSymbol = addInitSymbol(SymbolFlags, ES, Obj.getFileName());
+ break;
+ }
+ auto SegName = Obj.getSectionFinalSegmentName(Sec.getRawDataRefImpl());
+ auto SecName = cantFail(Obj.getSectionName(Sec.getRawDataRefImpl()));
+ if (MachOPlatform::isInitializerSection(SegName, SecName)) {
+ InitSymbol = addInitSymbol(SymbolFlags, ES, Obj.getFileName());
+ break;
+ }
+ }
- size_t Counter = 0;
- auto AddInitSymbol = [&]() {
- while (true) {
- std::string InitSymString;
- raw_string_ostream(InitSymString)
- << "$." << ObjBuffer.getBufferIdentifier() << ".__inits."
- << Counter++;
- InitSymbol = ES.intern(InitSymString);
- if (SymbolFlags.count(InitSymbol))
+ return std::make_pair(std::move(SymbolFlags), std::move(InitSymbol));
+}
+
+static Expected<std::pair<SymbolFlagsMap, SymbolStringPtr>>
+getELFObjectFileSymbolInfo(ExecutionSession &ES,
+ const object::ELFObjectFileBase &Obj) {
+ SymbolFlagsMap SymbolFlags;
+ for (auto &Sym : Obj.symbols()) {
+ Expected<uint32_t> SymFlagsOrErr = Sym.getFlags();
+ if (!SymFlagsOrErr)
+ // TODO: Test this error.
+ return SymFlagsOrErr.takeError();
+
+ // Skip symbols not defined in this object file.
+ if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined)
+ continue;
+
+ // Skip symbols that are not global.
+ if (!(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global))
+ continue;
+
+ // Skip symbols that have type SF_File.
+ if (auto SymType = Sym.getType()) {
+ if (*SymType == object::SymbolRef::ST_File)
continue;
- SymbolFlags[InitSymbol] = JITSymbolFlags::MaterializationSideEffectsOnly;
- return;
- }
- };
-
- if (IsMachO) {
- auto &MachOObj = cast<object::MachOObjectFile>(*Obj->get());
- for (auto &Sec : MachOObj.sections()) {
- auto SecType = MachOObj.getSectionType(Sec);
- if ((SecType & MachO::SECTION_TYPE) == MachO::S_MOD_INIT_FUNC_POINTERS) {
- AddInitSymbol();
- break;
- }
- auto SegName =
- MachOObj.getSectionFinalSegmentName(Sec.getRawDataRefImpl());
- auto SecName = cantFail(MachOObj.getSectionName(Sec.getRawDataRefImpl()));
- if (MachOPlatform::isInitializerSection(SegName, SecName)) {
- AddInitSymbol();
+ } else
+ return SymType.takeError();
+
+ auto Name = Sym.getName();
+ if (!Name)
+ return Name.takeError();
+ auto InternedName = ES.intern(*Name);
+ auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym);
+ if (!SymFlags)
+ return SymFlags.takeError();
+
+ // ELF STB_GNU_UNIQUE should map to Weak for ORC.
+ if (Sym.getBinding() == ELF::STB_GNU_UNIQUE)
+ *SymFlags |= JITSymbolFlags::Weak;
+
+ SymbolFlags[InternedName] = std::move(*SymFlags);
+ }
+
+ SymbolStringPtr InitSymbol;
+ for (auto &Sec : Obj.sections()) {
+ if (auto SecName = Sec.getName()) {
+ if (ELFNixPlatform::isInitializerSection(*SecName)) {
+ InitSymbol = addInitSymbol(SymbolFlags, ES, Obj.getFileName());
break;
}
}
}
- return std::make_pair(std::move(SymbolFlags), std::move(InitSymbol));
+ return std::make_pair(std::move(SymbolFlags), InitSymbol);
+}
+
+Expected<std::pair<SymbolFlagsMap, SymbolStringPtr>>
+getGenericObjectFileSymbolInfo(ExecutionSession &ES,
+ const object::ObjectFile &Obj) {
+ SymbolFlagsMap SymbolFlags;
+ for (auto &Sym : Obj.symbols()) {
+ Expected<uint32_t> SymFlagsOrErr = Sym.getFlags();
+ if (!SymFlagsOrErr)
+ // TODO: Test this error.
+ return SymFlagsOrErr.takeError();
+
+ // Skip symbols not defined in this object file.
+ if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined)
+ continue;
+
+ // Skip symbols that are not global.
+ if (!(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global))
+ continue;
+
+ // Skip symbols that have type SF_File.
+ if (auto SymType = Sym.getType()) {
+ if (*SymType == object::SymbolRef::ST_File)
+ continue;
+ } else
+ return SymType.takeError();
+
+ auto Name = Sym.getName();
+ if (!Name)
+ return Name.takeError();
+ auto InternedName = ES.intern(*Name);
+ auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym);
+ if (!SymFlags)
+ return SymFlags.takeError();
+
+ SymbolFlags[InternedName] = std::move(*SymFlags);
+ }
+
+ return std::make_pair(std::move(SymbolFlags), nullptr);
+}
+
+Expected<std::pair<SymbolFlagsMap, SymbolStringPtr>>
+getObjectSymbolInfo(ExecutionSession &ES, MemoryBufferRef ObjBuffer) {
+ auto Obj = object::ObjectFile::createObjectFile(ObjBuffer);
+
+ if (!Obj)
+ return Obj.takeError();
+
+ if (auto *MachOObj = dyn_cast<object::MachOObjectFile>(Obj->get()))
+ return getMachOObjectFileSymbolInfo(ES, *MachOObj);
+ else if (auto *ELFObj = dyn_cast<object::ELFObjectFileBase>(Obj->get()))
+ return getELFObjectFileSymbolInfo(ES, *ELFObj);
+
+ return getGenericObjectFileSymbolInfo(ES, **Obj);
}
} // End namespace orc.
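getObjectSymbolInfo is now a thin dispatcher over MachO-, ELF- and generic-object helpers, with ELF gaining init-section detection. A sketch of calling it directly, assuming its declaration in Mangling.h at this revision:

    #include "llvm/ExecutionEngine/Orc/Core.h"
    #include "llvm/ExecutionEngine/Orc/Mangling.h"
    #include "llvm/Support/MemoryBuffer.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;
    using namespace llvm::orc;

    // Sketch only: report the symbol flags and the synthetic init symbol, if
    // any, that the JIT would derive for an object buffer. The MachO / ELF /
    // generic dispatch happens inside getObjectSymbolInfo, as shown above.
    static Error describeObject(ExecutionSession &ES, MemoryBufferRef ObjBuffer) {
      auto Info = getObjectSymbolInfo(ES, ObjBuffer);
      if (!Info)
        return Info.takeError();

      errs() << ObjBuffer.getBufferIdentifier() << ": " << Info->first.size()
             << " symbols";
      if (Info->second)
        errs() << ", init symbol " << *Info->second;
      errs() << "\n";
      return Error::success();
    }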
diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
index fd260089c04b..6f840a079dd1 100644
--- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
@@ -64,9 +64,9 @@ private:
LGI.SymbolFlags[ES.intern(Sym->getName())] = Flags;
}
- if (G.getTargetTriple().isOSBinFormatMachO())
- if (hasMachOInitSection(G))
- LGI.InitSymbol = makeInitSymbol(ES, G);
+ if ((G.getTargetTriple().isOSBinFormatMachO() && hasMachOInitSection(G)) ||
+ (G.getTargetTriple().isOSBinFormatELF() && hasELFInitSection(G)))
+ LGI.InitSymbol = makeInitSymbol(ES, G);
return LGI;
}
@@ -77,11 +77,19 @@ private:
Sec.getName() == "__DATA,__objc_classlist" ||
Sec.getName() == "__TEXT,__swift5_protos" ||
Sec.getName() == "__TEXT,__swift5_proto" ||
+ Sec.getName() == "__TEXT,__swift5_types" ||
Sec.getName() == "__DATA,__mod_init_func")
return true;
return false;
}
+ static bool hasELFInitSection(LinkGraph &G) {
+ for (auto &Sec : G.sections())
+ if (Sec.getName() == ".init_array")
+ return true;
+ return false;
+ }
+
static SymbolStringPtr makeInitSymbol(ExecutionSession &ES, LinkGraph &G) {
std::string InitSymString;
raw_string_ostream(InitSymString)
@@ -272,8 +280,9 @@ public:
// If there were missing symbols then report the error.
if (!MissingSymbols.empty())
- return make_error<MissingSymbolDefinitions>(G.getName(),
- std::move(MissingSymbols));
+ return make_error<MissingSymbolDefinitions>(
+ Layer.getExecutionSession().getSymbolStringPool(), G.getName(),
+ std::move(MissingSymbols));
// If there are more definitions than expected, add them to the
// ExtraSymbols vector.
@@ -286,8 +295,9 @@ public:
// If there were extra definitions then report the error.
if (!ExtraSymbols.empty())
- return make_error<UnexpectedSymbolDefinitions>(G.getName(),
- std::move(ExtraSymbols));
+ return make_error<UnexpectedSymbolDefinitions>(
+ Layer.getExecutionSession().getSymbolStringPool(), G.getName(),
+ std::move(ExtraSymbols));
}
if (auto Err = MR->notifyResolved(InternedResult))
@@ -297,8 +307,7 @@ public:
return Error::success();
}
- void notifyFinalized(
- std::unique_ptr<JITLinkMemoryManager::Allocation> A) override {
+ void notifyFinalized(JITLinkMemoryManager::FinalizedAlloc A) override {
if (auto Err = Layer.notifyEmitted(*MR, std::move(A))) {
Layer.getExecutionSession().reportError(std::move(Err));
MR->failMaterialization();
@@ -414,7 +423,8 @@ private:
std::vector<std::pair<SymbolStringPtr, Symbol *>> NameToSym;
auto ProcessSymbol = [&](Symbol *Sym) {
- if (Sym->hasName() && Sym->getLinkage() == Linkage::Weak) {
+ if (Sym->hasName() && Sym->getLinkage() == Linkage::Weak &&
+ Sym->getScope() != Scope::Local) {
auto Name = ES.intern(Sym->getName());
if (!MR->getSymbols().count(ES.intern(Sym->getName()))) {
JITSymbolFlags SF = JITSymbolFlags::Weak;
@@ -543,8 +553,7 @@ private:
// Propagate block-level dependencies through the block-dependence graph.
while (!WorkList.empty()) {
- auto *B = WorkList.back();
- WorkList.pop_back();
+ auto *B = WorkList.pop_back_val();
auto &BI = BlockInfos[B];
assert(BI.DependenciesChanged &&
@@ -672,7 +681,7 @@ void ObjectLinkingLayer::notifyLoaded(MaterializationResponsibility &MR) {
}
Error ObjectLinkingLayer::notifyEmitted(MaterializationResponsibility &MR,
- AllocPtr Alloc) {
+ FinalizedAlloc FA) {
Error Err = Error::success();
for (auto &P : Plugins)
Err = joinErrors(std::move(Err), P->notifyEmitted(MR));
@@ -681,17 +690,20 @@ Error ObjectLinkingLayer::notifyEmitted(MaterializationResponsibility &MR,
return Err;
return MR.withResourceKeyDo(
- [&](ResourceKey K) { Allocs[K].push_back(std::move(Alloc)); });
+ [&](ResourceKey K) { Allocs[K].push_back(std::move(FA)); });
}
Error ObjectLinkingLayer::handleRemoveResources(ResourceKey K) {
- Error Err = Error::success();
-
- for (auto &P : Plugins)
- Err = joinErrors(std::move(Err), P->notifyRemovingResources(K));
+ {
+ Error Err = Error::success();
+ for (auto &P : Plugins)
+ Err = joinErrors(std::move(Err), P->notifyRemovingResources(K));
+ if (Err)
+ return Err;
+ }
- std::vector<AllocPtr> AllocsToRemove;
+ std::vector<FinalizedAlloc> AllocsToRemove;
getExecutionSession().runSessionLocked([&] {
auto I = Allocs.find(K);
if (I != Allocs.end()) {
@@ -700,12 +712,10 @@ Error ObjectLinkingLayer::handleRemoveResources(ResourceKey K) {
}
});
- while (!AllocsToRemove.empty()) {
- Err = joinErrors(std::move(Err), AllocsToRemove.back()->deallocate());
- AllocsToRemove.pop_back();
- }
+ if (AllocsToRemove.empty())
+ return Error::success();
- return Err;
+ return MemMgr.deallocate(std::move(AllocsToRemove));
}
void ObjectLinkingLayer::handleTransferResources(ResourceKey DstKey,
diff --git a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp
index d6f73a8b0864..673f7394450f 100644
--- a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp
@@ -619,6 +619,61 @@ LLVMErrorRef LLVMOrcCreateDynamicLibrarySearchGeneratorForProcess(
return LLVMErrorSuccess;
}
+LLVMErrorRef LLVMOrcCreateDynamicLibrarySearchGeneratorForPath(
+ LLVMOrcDefinitionGeneratorRef *Result, const char *FileName,
+ char GlobalPrefix, LLVMOrcSymbolPredicate Filter, void *FilterCtx) {
+ assert(Result && "Result can not be null");
+ assert(FileName && "FileName can not be null");
+ assert((Filter || !FilterCtx) &&
+ "if Filter is null then FilterCtx must also be null");
+
+ DynamicLibrarySearchGenerator::SymbolPredicate Pred;
+ if (Filter)
+ Pred = [=](const SymbolStringPtr &Name) -> bool {
+ return Filter(FilterCtx, wrap(OrcV2CAPIHelper::getRawPoolEntryPtr(Name)));
+ };
+
+ auto LibrarySymsGenerator =
+ DynamicLibrarySearchGenerator::Load(FileName, GlobalPrefix, Pred);
+
+ if (!LibrarySymsGenerator) {
+ *Result = 0;
+ return wrap(LibrarySymsGenerator.takeError());
+ }
+
+ *Result = wrap(LibrarySymsGenerator->release());
+ return LLVMErrorSuccess;
+}
+
+LLVMErrorRef LLVMOrcCreateStaticLibrarySearchGeneratorForPath(
+ LLVMOrcDefinitionGeneratorRef *Result, LLVMOrcObjectLayerRef ObjLayer,
+ const char *FileName, const char *TargetTriple) {
+ assert(Result && "Result can not be null");
+ assert(FileName && "Filename can not be null");
+ assert(ObjLayer && "ObjectLayer can not be null");
+
+ if (TargetTriple) {
+ auto TT = Triple(TargetTriple);
+ auto LibrarySymsGenerator =
+ StaticLibraryDefinitionGenerator::Load(*unwrap(ObjLayer), FileName, TT);
+ if (!LibrarySymsGenerator) {
+ *Result = 0;
+ return wrap(LibrarySymsGenerator.takeError());
+ }
+ *Result = wrap(LibrarySymsGenerator->release());
+ return LLVMErrorSuccess;
+ } else {
+ auto LibrarySymsGenerator =
+ StaticLibraryDefinitionGenerator::Load(*unwrap(ObjLayer), FileName);
+ if (!LibrarySymsGenerator) {
+ *Result = 0;
+ return wrap(LibrarySymsGenerator.takeError());
+ }
+ *Result = wrap(LibrarySymsGenerator->release());
+ return LLVMErrorSuccess;
+ }
+}
+
LLVMOrcThreadSafeContextRef LLVMOrcCreateNewThreadSafeContext(void) {
return wrap(new ThreadSafeContext(std::make_unique<LLVMContext>()));
}
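The two new C API entry points wrap DynamicLibrarySearchGenerator::Load and StaticLibraryDefinitionGenerator::Load. A hedged usage sketch from C++, assuming the matching declarations land in llvm-c/Orc.h alongside the existing LLJIT C API; the library paths are placeholders:

    #include "llvm-c/Error.h"
    #include "llvm-c/LLJIT.h"
    #include "llvm-c/Orc.h"

    // Sketch only: attach generators for a shared library and a static
    // archive to an LLJIT's main JITDylib.
    static LLVMErrorRef addLibraryGenerators(LLVMOrcLLJITRef J) {
      LLVMOrcJITDylibRef MainJD = LLVMOrcLLJITGetMainJITDylib(J);
      char Prefix = LLVMOrcLLJITGetGlobalPrefix(J);

      LLVMOrcDefinitionGeneratorRef DylibGen;
      if (LLVMErrorRef Err = LLVMOrcCreateDynamicLibrarySearchGeneratorForPath(
              &DylibGen, "libfoo.so", Prefix, /*Filter=*/nullptr,
              /*FilterCtx=*/nullptr))
        return Err;
      LLVMOrcJITDylibAddGenerator(MainJD, DylibGen);

      LLVMOrcDefinitionGeneratorRef ArchiveGen;
      if (LLVMErrorRef Err = LLVMOrcCreateStaticLibrarySearchGeneratorForPath(
              &ArchiveGen, LLVMOrcLLJITGetObjLinkingLayer(J), "libbar.a",
              /*TargetTriple=*/nullptr))
        return Err;
      LLVMOrcJITDylibAddGenerator(MainJD, ArchiveGen);

      return LLVMErrorSuccess;
    }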
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
new file mode 100644
index 000000000000..02044e4af29a
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
@@ -0,0 +1,47 @@
+//===------ OrcRTBridge.cpp - Executor functions for bootstrap -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+
+namespace llvm {
+namespace orc {
+namespace rt {
+
+const char *SimpleExecutorDylibManagerInstanceName =
+ "__llvm_orc_SimpleExecutorDylibManager_Instance";
+const char *SimpleExecutorDylibManagerOpenWrapperName =
+ "__llvm_orc_SimpleExecutorDylibManager_open_wrapper";
+const char *SimpleExecutorDylibManagerLookupWrapperName =
+ "__llvm_orc_SimpleExecutorDylibManager_lookup_wrapper";
+const char *SimpleExecutorMemoryManagerInstanceName =
+ "__llvm_orc_SimpleExecutorMemoryManager_Instance";
+const char *SimpleExecutorMemoryManagerReserveWrapperName =
+ "__llvm_orc_SimpleExecutorMemoryManager_reserve_wrapper";
+const char *SimpleExecutorMemoryManagerFinalizeWrapperName =
+ "__llvm_orc_SimpleExecutorMemoryManager_finalize_wrapper";
+const char *SimpleExecutorMemoryManagerDeallocateWrapperName =
+ "__llvm_orc_SimpleExecutorMemoryManager_deallocate_wrapper";
+const char *MemoryWriteUInt8sWrapperName =
+ "__llvm_orc_bootstrap_mem_write_uint8s_wrapper";
+const char *MemoryWriteUInt16sWrapperName =
+ "__llvm_orc_bootstrap_mem_write_uint16s_wrapper";
+const char *MemoryWriteUInt32sWrapperName =
+ "__llvm_orc_bootstrap_mem_write_uint32s_wrapper";
+const char *MemoryWriteUInt64sWrapperName =
+ "__llvm_orc_bootstrap_mem_write_uint64s_wrapper";
+const char *MemoryWriteBuffersWrapperName =
+ "__llvm_orc_bootstrap_mem_write_buffers_wrapper";
+const char *RegisterEHFrameSectionCustomDirectWrapperName =
+ "__llvm_orc_bootstrap_register_ehframe_section_custom_direct_wrapper";
+const char *DeregisterEHFrameSectionCustomDirectWrapperName =
+ "__llvm_orc_bootstrap_deregister_ehframe_section_custom_direct_wrapper";
+const char *RunAsMainWrapperName = "__llvm_orc_bootstrap_run_as_main_wrapper";
+
+} // end namespace rt
+} // end namespace orc
+} // end namespace llvm
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/RPCError.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/RPCError.cpp
deleted file mode 100644
index a55cb220f218..000000000000
--- a/llvm/lib/ExecutionEngine/Orc/Shared/RPCError.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-//===--------------- RPCError.cpp - RPCERror implementation ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// RPC Error type implmentations.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ExecutionEngine/Orc/Shared/RPCUtils.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/raw_ostream.h"
-
-#include <string>
-#include <system_error>
-
-char llvm::orc::shared::RPCFatalError::ID = 0;
-char llvm::orc::shared::ConnectionClosed::ID = 0;
-char llvm::orc::shared::ResponseAbandoned::ID = 0;
-char llvm::orc::shared::CouldNotNegotiate::ID = 0;
-
-namespace llvm {
-namespace orc {
-namespace shared {
-
-std::error_code ConnectionClosed::convertToErrorCode() const {
- return orcError(OrcErrorCode::RPCConnectionClosed);
-}
-
-void ConnectionClosed::log(raw_ostream &OS) const {
- OS << "RPC connection already closed";
-}
-
-std::error_code ResponseAbandoned::convertToErrorCode() const {
- return orcError(OrcErrorCode::RPCResponseAbandoned);
-}
-
-void ResponseAbandoned::log(raw_ostream &OS) const {
- OS << "RPC response abandoned";
-}
-
-CouldNotNegotiate::CouldNotNegotiate(std::string Signature)
- : Signature(std::move(Signature)) {}
-
-std::error_code CouldNotNegotiate::convertToErrorCode() const {
- return orcError(OrcErrorCode::RPCCouldNotNegotiateFunction);
-}
-
-void CouldNotNegotiate::log(raw_ostream &OS) const {
- OS << "Could not negotiate RPC function " << Signature;
-}
-
-} // end namespace shared
-} // end namespace orc
-} // end namespace llvm
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.cpp
new file mode 100644
index 000000000000..64fc717b7b56
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.cpp
@@ -0,0 +1,250 @@
+//===------ SimpleRemoteEPCUtils.cpp - Utils for Simple Remote EPC --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Message definitions and other utilities for SimpleRemoteEPC and
+// SimpleRemoteEPCServer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/FormatVariadic.h"
+
+#if !defined(_MSC_VER) && !defined(__MINGW32__)
+#include <unistd.h>
+#else
+#include <io.h>
+#endif
+
+namespace {
+
+struct FDMsgHeader {
+ static constexpr unsigned MsgSizeOffset = 0;
+ static constexpr unsigned OpCOffset = MsgSizeOffset + sizeof(uint64_t);
+ static constexpr unsigned SeqNoOffset = OpCOffset + sizeof(uint64_t);
+ static constexpr unsigned TagAddrOffset = SeqNoOffset + sizeof(uint64_t);
+ static constexpr unsigned Size = TagAddrOffset + sizeof(uint64_t);
+};
+
+} // namespace
+
+namespace llvm {
+namespace orc {
+namespace SimpleRemoteEPCDefaultBootstrapSymbolNames {
+
+const char *ExecutorSessionObjectName =
+ "__llvm_orc_SimpleRemoteEPC_dispatch_ctx";
+const char *DispatchFnName = "__llvm_orc_SimpleRemoteEPC_dispatch_fn";
+
+} // end namespace SimpleRemoteEPCDefaultBootstrapSymbolNames
+
+SimpleRemoteEPCTransportClient::~SimpleRemoteEPCTransportClient() {}
+SimpleRemoteEPCTransport::~SimpleRemoteEPCTransport() {}
+
+Expected<std::unique_ptr<FDSimpleRemoteEPCTransport>>
+FDSimpleRemoteEPCTransport::Create(SimpleRemoteEPCTransportClient &C, int InFD,
+ int OutFD) {
+#if LLVM_ENABLE_THREADS
+ if (InFD == -1)
+ return make_error<StringError>("Invalid input file descriptor " +
+ Twine(InFD),
+ inconvertibleErrorCode());
+ if (OutFD == -1)
+ return make_error<StringError>("Invalid output file descriptor " +
+ Twine(OutFD),
+ inconvertibleErrorCode());
+ std::unique_ptr<FDSimpleRemoteEPCTransport> FDT(
+ new FDSimpleRemoteEPCTransport(C, InFD, OutFD));
+ return std::move(FDT);
+#else
+ return make_error<StringError>("FD-based SimpleRemoteEPC transport requires "
+ "thread support, but llvm was built with "
+ "LLVM_ENABLE_THREADS=Off",
+ inconvertibleErrorCode());
+#endif
+}
+
+FDSimpleRemoteEPCTransport::~FDSimpleRemoteEPCTransport() {
+#if LLVM_ENABLE_THREADS
+ ListenerThread.join();
+#endif
+}
+
+Error FDSimpleRemoteEPCTransport::start() {
+#if LLVM_ENABLE_THREADS
+ ListenerThread = std::thread([this]() { listenLoop(); });
+ return Error::success();
+#endif
+ llvm_unreachable("Should not be called with LLVM_ENABLE_THREADS=Off");
+}
+
+Error FDSimpleRemoteEPCTransport::sendMessage(SimpleRemoteEPCOpcode OpC,
+ uint64_t SeqNo,
+ ExecutorAddr TagAddr,
+ ArrayRef<char> ArgBytes) {
+ char HeaderBuffer[FDMsgHeader::Size];
+
+ *((support::ulittle64_t *)(HeaderBuffer + FDMsgHeader::MsgSizeOffset)) =
+ FDMsgHeader::Size + ArgBytes.size();
+ *((support::ulittle64_t *)(HeaderBuffer + FDMsgHeader::OpCOffset)) =
+ static_cast<uint64_t>(OpC);
+ *((support::ulittle64_t *)(HeaderBuffer + FDMsgHeader::SeqNoOffset)) = SeqNo;
+ *((support::ulittle64_t *)(HeaderBuffer + FDMsgHeader::TagAddrOffset)) =
+ TagAddr.getValue();
+
+ std::lock_guard<std::mutex> Lock(M);
+ if (Disconnected)
+ return make_error<StringError>("FD-transport disconnected",
+ inconvertibleErrorCode());
+ if (int ErrNo = writeBytes(HeaderBuffer, FDMsgHeader::Size))
+ return errorCodeToError(std::error_code(ErrNo, std::generic_category()));
+ if (int ErrNo = writeBytes(ArgBytes.data(), ArgBytes.size()))
+ return errorCodeToError(std::error_code(ErrNo, std::generic_category()));
+ return Error::success();
+}
+
+void FDSimpleRemoteEPCTransport::disconnect() {
+ if (Disconnected)
+ return; // Return if already disconnected.
+
+ Disconnected = true;
+ bool CloseOutFD = InFD != OutFD;
+
+ // Close InFD.
+ while (close(InFD) == -1) {
+ if (errno == EBADF)
+ break;
+ }
+
+ // Close OutFD.
+ if (CloseOutFD) {
+ while (close(OutFD) == -1) {
+ if (errno == EBADF)
+ break;
+ }
+ }
+}
+
+static Error makeUnexpectedEOFError() {
+ return make_error<StringError>("Unexpected end-of-file",
+ inconvertibleErrorCode());
+}
+
+Error FDSimpleRemoteEPCTransport::readBytes(char *Dst, size_t Size,
+ bool *IsEOF) {
+ assert(Dst && "Attempt to read into null.");
+ ssize_t Completed = 0;
+ while (Completed < static_cast<ssize_t>(Size)) {
+ ssize_t Read = ::read(InFD, Dst + Completed, Size - Completed);
+ if (Read <= 0) {
+ auto ErrNo = errno;
+ if (Read == 0) {
+ if (Completed == 0 && IsEOF) {
+ *IsEOF = true;
+ return Error::success();
+ } else
+ return makeUnexpectedEOFError();
+ } else if (ErrNo == EAGAIN || ErrNo == EINTR)
+ continue;
+ else {
+ std::lock_guard<std::mutex> Lock(M);
+ if (Disconnected && IsEOF) { // disconnect called, pretend this is EOF.
+ *IsEOF = true;
+ return Error::success();
+ }
+ return errorCodeToError(
+ std::error_code(ErrNo, std::generic_category()));
+ }
+ }
+ Completed += Read;
+ }
+ return Error::success();
+}
+
+int FDSimpleRemoteEPCTransport::writeBytes(const char *Src, size_t Size) {
+ assert(Src && "Attempt to append from null.");
+ ssize_t Completed = 0;
+ while (Completed < static_cast<ssize_t>(Size)) {
+ ssize_t Written = ::write(OutFD, Src + Completed, Size - Completed);
+ if (Written < 0) {
+ auto ErrNo = errno;
+ if (ErrNo == EAGAIN || ErrNo == EINTR)
+ continue;
+ else
+ return ErrNo;
+ }
+ Completed += Written;
+ }
+ return 0;
+}
+
+void FDSimpleRemoteEPCTransport::listenLoop() {
+ Error Err = Error::success();
+ do {
+
+ char HeaderBuffer[FDMsgHeader::Size];
+ // Read the header buffer.
+ {
+ bool IsEOF = false;
+ if (auto Err2 = readBytes(HeaderBuffer, FDMsgHeader::Size, &IsEOF)) {
+ Err = joinErrors(std::move(Err), std::move(Err2));
+ break;
+ }
+ if (IsEOF)
+ break;
+ }
+
+ // Decode header buffer.
+ uint64_t MsgSize;
+ SimpleRemoteEPCOpcode OpC;
+ uint64_t SeqNo;
+ ExecutorAddr TagAddr;
+
+ MsgSize =
+ *((support::ulittle64_t *)(HeaderBuffer + FDMsgHeader::MsgSizeOffset));
+ OpC = static_cast<SimpleRemoteEPCOpcode>(static_cast<uint64_t>(
+ *((support::ulittle64_t *)(HeaderBuffer + FDMsgHeader::OpCOffset))));
+ SeqNo =
+ *((support::ulittle64_t *)(HeaderBuffer + FDMsgHeader::SeqNoOffset));
+ TagAddr.setValue(
+ *((support::ulittle64_t *)(HeaderBuffer + FDMsgHeader::TagAddrOffset)));
+
+ if (MsgSize < FDMsgHeader::Size) {
+ Err = joinErrors(std::move(Err),
+ make_error<StringError>("Message size too small",
+ inconvertibleErrorCode()));
+ break;
+ }
+
+ // Read the argument bytes.
+ SimpleRemoteEPCArgBytesVector ArgBytes;
+ ArgBytes.resize(MsgSize - FDMsgHeader::Size);
+ if (auto Err2 = readBytes(ArgBytes.data(), ArgBytes.size())) {
+ Err = joinErrors(std::move(Err), std::move(Err2));
+ break;
+ }
+
+ if (auto Action = C.handleMessage(OpC, SeqNo, TagAddr, ArgBytes)) {
+ if (*Action == SimpleRemoteEPCTransportClient::EndSession)
+ break;
+ } else {
+ Err = joinErrors(std::move(Err), Action.takeError());
+ break;
+ }
+ } while (true);
+
+ // Attempt to close FDs, set Disconnected to true so that subsequent
+ // sendMessage calls fail.
+ disconnect();
+
+ // Call up to the client to handle the disconnection.
+ C.handleDisconnect(std::move(Err));
+}
+
+} // end namespace orc
+} // end namespace llvm
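
The FD transport above frames every message with a small fixed-size little-endian header (message size, opcode, sequence number, tag address) followed by the raw argument bytes. As a rough standalone sketch of that framing, assuming the same four 8-byte fields that FDMsgHeader lays out (the struct name and helper below are hypothetical):

#include <cstdint>
#include <cstring>

// Hypothetical mirror of the FDMsgHeader wire layout: four unsigned 64-bit
// little-endian fields, followed directly by the argument bytes.
struct WireHeader {
  uint64_t MsgSize; // Header size plus argument bytes.
  uint64_t OpC;     // SimpleRemoteEPCOpcode value.
  uint64_t SeqNo;   // Sequence number (zero for Setup and Hangup).
  uint64_t TagAddr; // Wrapper-function tag address (zero unless CallWrapper).
};

// Assumes a little-endian host; the real code uses support::ulittle64_t so
// the encoding is endian-safe regardless of host byte order.
inline void encodeHeader(char (&Buf)[sizeof(WireHeader)], const WireHeader &H) {
  std::memcpy(Buf + 0, &H.MsgSize, sizeof(uint64_t));
  std::memcpy(Buf + 8, &H.OpC, sizeof(uint64_t));
  std::memcpy(Buf + 16, &H.SeqNo, sizeof(uint64_t));
  std::memcpy(Buf + 24, &H.TagAddr, sizeof(uint64_t));
}
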
diff --git a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp
new file mode 100644
index 000000000000..47364a92a451
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp
@@ -0,0 +1,406 @@
+//===------- SimpleRemoteEPC.cpp -- Simple remote executor control --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h"
+#include "llvm/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.h"
+#include "llvm/ExecutionEngine/Orc/EPCGenericMemoryAccess.h"
+#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+#include "llvm/Support/FormatVariadic.h"
+
+#define DEBUG_TYPE "orc"
+
+namespace llvm {
+namespace orc {
+
+SimpleRemoteEPC::~SimpleRemoteEPC() {
+#ifndef NDEBUG
+ std::lock_guard<std::mutex> Lock(SimpleRemoteEPCMutex);
+ assert(Disconnected && "Destroyed without disconnection");
+#endif // NDEBUG
+}
+
+Expected<tpctypes::DylibHandle>
+SimpleRemoteEPC::loadDylib(const char *DylibPath) {
+ return DylibMgr->open(DylibPath, 0);
+}
+
+Expected<std::vector<tpctypes::LookupResult>>
+SimpleRemoteEPC::lookupSymbols(ArrayRef<LookupRequest> Request) {
+ std::vector<tpctypes::LookupResult> Result;
+
+ for (auto &Element : Request) {
+ if (auto R = DylibMgr->lookup(Element.Handle, Element.Symbols)) {
+ Result.push_back({});
+ Result.back().reserve(R->size());
+ for (auto Addr : *R)
+ Result.back().push_back(Addr.getValue());
+ } else
+ return R.takeError();
+ }
+ return std::move(Result);
+}
+
+Expected<int32_t> SimpleRemoteEPC::runAsMain(ExecutorAddr MainFnAddr,
+ ArrayRef<std::string> Args) {
+ int64_t Result = 0;
+ if (auto Err = callSPSWrapper<rt::SPSRunAsMainSignature>(
+ RunAsMainAddr, Result, ExecutorAddr(MainFnAddr), Args))
+ return std::move(Err);
+ return Result;
+}
+
+void SimpleRemoteEPC::callWrapperAsync(ExecutorAddr WrapperFnAddr,
+ IncomingWFRHandler OnComplete,
+ ArrayRef<char> ArgBuffer) {
+ uint64_t SeqNo;
+ {
+ std::lock_guard<std::mutex> Lock(SimpleRemoteEPCMutex);
+ SeqNo = getNextSeqNo();
+ assert(!PendingCallWrapperResults.count(SeqNo) && "SeqNo already in use");
+ PendingCallWrapperResults[SeqNo] = std::move(OnComplete);
+ }
+
+ if (auto Err = sendMessage(SimpleRemoteEPCOpcode::CallWrapper, SeqNo,
+ WrapperFnAddr, ArgBuffer)) {
+ IncomingWFRHandler H;
+
+    // We just registered OnComplete, but there may be a race between this
+    // thread returning from sendMessage and handleDisconnect being called
+    // from the transport's listener thread. If handleDisconnect gets to the
+    // pending-results map first (or has already run) then it will have
+    // failed the handler for us. If we get there first then we need to take
+    // care of it below.
+ {
+ std::lock_guard<std::mutex> Lock(SimpleRemoteEPCMutex);
+ auto I = PendingCallWrapperResults.find(SeqNo);
+ if (I != PendingCallWrapperResults.end()) {
+ H = std::move(I->second);
+ PendingCallWrapperResults.erase(I);
+ }
+ }
+
+ if (H)
+ H(shared::WrapperFunctionResult::createOutOfBandError("disconnecting"));
+
+ getExecutionSession().reportError(std::move(Err));
+ }
+}
+
+Error SimpleRemoteEPC::disconnect() {
+ T->disconnect();
+ D->shutdown();
+ std::unique_lock<std::mutex> Lock(SimpleRemoteEPCMutex);
+ DisconnectCV.wait(Lock, [this] { return Disconnected; });
+ return std::move(DisconnectErr);
+}
+
+Expected<SimpleRemoteEPCTransportClient::HandleMessageAction>
+SimpleRemoteEPC::handleMessage(SimpleRemoteEPCOpcode OpC, uint64_t SeqNo,
+ ExecutorAddr TagAddr,
+ SimpleRemoteEPCArgBytesVector ArgBytes) {
+
+ LLVM_DEBUG({
+ dbgs() << "SimpleRemoteEPC::handleMessage: opc = ";
+ switch (OpC) {
+ case SimpleRemoteEPCOpcode::Setup:
+ dbgs() << "Setup";
+ assert(SeqNo == 0 && "Non-zero SeqNo for Setup?");
+ assert(TagAddr.getValue() == 0 && "Non-zero TagAddr for Setup?");
+ break;
+ case SimpleRemoteEPCOpcode::Hangup:
+ dbgs() << "Hangup";
+ assert(SeqNo == 0 && "Non-zero SeqNo for Hangup?");
+ assert(TagAddr.getValue() == 0 && "Non-zero TagAddr for Hangup?");
+ break;
+ case SimpleRemoteEPCOpcode::Result:
+ dbgs() << "Result";
+ assert(TagAddr.getValue() == 0 && "Non-zero TagAddr for Result?");
+ break;
+ case SimpleRemoteEPCOpcode::CallWrapper:
+ dbgs() << "CallWrapper";
+ break;
+ }
+ dbgs() << ", seqno = " << SeqNo
+ << ", tag-addr = " << formatv("{0:x}", TagAddr.getValue())
+ << ", arg-buffer = " << formatv("{0:x}", ArgBytes.size())
+ << " bytes\n";
+ });
+
+ using UT = std::underlying_type_t<SimpleRemoteEPCOpcode>;
+ if (static_cast<UT>(OpC) > static_cast<UT>(SimpleRemoteEPCOpcode::LastOpC))
+ return make_error<StringError>("Unexpected opcode",
+ inconvertibleErrorCode());
+
+ switch (OpC) {
+ case SimpleRemoteEPCOpcode::Setup:
+ if (auto Err = handleSetup(SeqNo, TagAddr, std::move(ArgBytes)))
+ return std::move(Err);
+ break;
+ case SimpleRemoteEPCOpcode::Hangup:
+ T->disconnect();
+ if (auto Err = handleHangup(std::move(ArgBytes)))
+ return std::move(Err);
+ return EndSession;
+ case SimpleRemoteEPCOpcode::Result:
+ if (auto Err = handleResult(SeqNo, TagAddr, std::move(ArgBytes)))
+ return std::move(Err);
+ break;
+ case SimpleRemoteEPCOpcode::CallWrapper:
+ handleCallWrapper(SeqNo, TagAddr, std::move(ArgBytes));
+ break;
+ }
+ return ContinueSession;
+}
+
+void SimpleRemoteEPC::handleDisconnect(Error Err) {
+ LLVM_DEBUG({
+ dbgs() << "SimpleRemoteEPC::handleDisconnect: "
+ << (Err ? "failure" : "success") << "\n";
+ });
+
+ PendingCallWrapperResultsMap TmpPending;
+
+ {
+ std::lock_guard<std::mutex> Lock(SimpleRemoteEPCMutex);
+ std::swap(TmpPending, PendingCallWrapperResults);
+ }
+
+ for (auto &KV : TmpPending)
+ KV.second(
+ shared::WrapperFunctionResult::createOutOfBandError("disconnecting"));
+
+ std::lock_guard<std::mutex> Lock(SimpleRemoteEPCMutex);
+ DisconnectErr = joinErrors(std::move(DisconnectErr), std::move(Err));
+ Disconnected = true;
+ DisconnectCV.notify_all();
+}
+
+Expected<std::unique_ptr<jitlink::JITLinkMemoryManager>>
+SimpleRemoteEPC::createDefaultMemoryManager(SimpleRemoteEPC &SREPC) {
+ EPCGenericJITLinkMemoryManager::SymbolAddrs SAs;
+ if (auto Err = SREPC.getBootstrapSymbols(
+ {{SAs.Allocator, rt::SimpleExecutorMemoryManagerInstanceName},
+ {SAs.Reserve, rt::SimpleExecutorMemoryManagerReserveWrapperName},
+ {SAs.Finalize, rt::SimpleExecutorMemoryManagerFinalizeWrapperName},
+ {SAs.Deallocate,
+ rt::SimpleExecutorMemoryManagerDeallocateWrapperName}}))
+ return std::move(Err);
+
+ return std::make_unique<EPCGenericJITLinkMemoryManager>(SREPC, SAs);
+}
+
+Expected<std::unique_ptr<ExecutorProcessControl::MemoryAccess>>
+SimpleRemoteEPC::createDefaultMemoryAccess(SimpleRemoteEPC &SREPC) {
+ return nullptr;
+}
+
+Error SimpleRemoteEPC::sendMessage(SimpleRemoteEPCOpcode OpC, uint64_t SeqNo,
+ ExecutorAddr TagAddr,
+ ArrayRef<char> ArgBytes) {
+ assert(OpC != SimpleRemoteEPCOpcode::Setup &&
+ "SimpleRemoteEPC sending Setup message? That's the wrong direction.");
+
+ LLVM_DEBUG({
+ dbgs() << "SimpleRemoteEPC::sendMessage: opc = ";
+ switch (OpC) {
+ case SimpleRemoteEPCOpcode::Hangup:
+ dbgs() << "Hangup";
+ assert(SeqNo == 0 && "Non-zero SeqNo for Hangup?");
+ assert(TagAddr.getValue() == 0 && "Non-zero TagAddr for Hangup?");
+ break;
+ case SimpleRemoteEPCOpcode::Result:
+ dbgs() << "Result";
+ assert(TagAddr.getValue() == 0 && "Non-zero TagAddr for Result?");
+ break;
+ case SimpleRemoteEPCOpcode::CallWrapper:
+ dbgs() << "CallWrapper";
+ break;
+ default:
+ llvm_unreachable("Invalid opcode");
+ }
+ dbgs() << ", seqno = " << SeqNo
+ << ", tag-addr = " << formatv("{0:x}", TagAddr.getValue())
+ << ", arg-buffer = " << formatv("{0:x}", ArgBytes.size())
+ << " bytes\n";
+ });
+ auto Err = T->sendMessage(OpC, SeqNo, TagAddr, ArgBytes);
+ LLVM_DEBUG({
+ if (Err)
+ dbgs() << " \\--> SimpleRemoteEPC::sendMessage failed\n";
+ });
+ return Err;
+}
+
+Error SimpleRemoteEPC::handleSetup(uint64_t SeqNo, ExecutorAddr TagAddr,
+ SimpleRemoteEPCArgBytesVector ArgBytes) {
+ if (SeqNo != 0)
+ return make_error<StringError>("Setup packet SeqNo not zero",
+ inconvertibleErrorCode());
+
+ if (TagAddr)
+ return make_error<StringError>("Setup packet TagAddr not zero",
+ inconvertibleErrorCode());
+
+ std::lock_guard<std::mutex> Lock(SimpleRemoteEPCMutex);
+ auto I = PendingCallWrapperResults.find(0);
+ assert(PendingCallWrapperResults.size() == 1 &&
+ I != PendingCallWrapperResults.end() &&
+ "Setup message handler not connectly set up");
+ auto SetupMsgHandler = std::move(I->second);
+ PendingCallWrapperResults.erase(I);
+
+ auto WFR =
+ shared::WrapperFunctionResult::copyFrom(ArgBytes.data(), ArgBytes.size());
+ SetupMsgHandler(std::move(WFR));
+ return Error::success();
+}
+
+Error SimpleRemoteEPC::setup(Setup S) {
+ using namespace SimpleRemoteEPCDefaultBootstrapSymbolNames;
+
+ std::promise<MSVCPExpected<SimpleRemoteEPCExecutorInfo>> EIP;
+ auto EIF = EIP.get_future();
+
+ // Prepare a handler for the setup packet.
+ PendingCallWrapperResults[0] =
+ RunInPlace()(
+ [&](shared::WrapperFunctionResult SetupMsgBytes) {
+ if (const char *ErrMsg = SetupMsgBytes.getOutOfBandError()) {
+ EIP.set_value(
+ make_error<StringError>(ErrMsg, inconvertibleErrorCode()));
+ return;
+ }
+ using SPSSerialize =
+ shared::SPSArgList<shared::SPSSimpleRemoteEPCExecutorInfo>;
+ shared::SPSInputBuffer IB(SetupMsgBytes.data(), SetupMsgBytes.size());
+ SimpleRemoteEPCExecutorInfo EI;
+ if (SPSSerialize::deserialize(IB, EI))
+ EIP.set_value(EI);
+ else
+ EIP.set_value(make_error<StringError>(
+ "Could not deserialize setup message", inconvertibleErrorCode()));
+ });
+
+ // Start the transport.
+ if (auto Err = T->start())
+ return Err;
+
+ // Wait for setup packet to arrive.
+ auto EI = EIF.get();
+ if (!EI) {
+ T->disconnect();
+ return EI.takeError();
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "SimpleRemoteEPC received setup message:\n"
+ << " Triple: " << EI->TargetTriple << "\n"
+ << " Page size: " << EI->PageSize << "\n"
+ << " Bootstrap symbols:\n";
+ for (const auto &KV : EI->BootstrapSymbols)
+ dbgs() << " " << KV.first() << ": "
+ << formatv("{0:x16}", KV.second.getValue()) << "\n";
+ });
+ TargetTriple = Triple(EI->TargetTriple);
+ PageSize = EI->PageSize;
+ BootstrapSymbols = std::move(EI->BootstrapSymbols);
+
+ if (auto Err = getBootstrapSymbols(
+ {{JDI.JITDispatchContext, ExecutorSessionObjectName},
+ {JDI.JITDispatchFunction, DispatchFnName},
+ {RunAsMainAddr, rt::RunAsMainWrapperName}}))
+ return Err;
+
+ if (auto DM =
+ EPCGenericDylibManager::CreateWithDefaultBootstrapSymbols(*this))
+ DylibMgr = std::make_unique<EPCGenericDylibManager>(std::move(*DM));
+ else
+ return DM.takeError();
+
+ // Set a default CreateMemoryManager if none is specified.
+ if (!S.CreateMemoryManager)
+ S.CreateMemoryManager = createDefaultMemoryManager;
+
+ if (auto MemMgr = S.CreateMemoryManager(*this)) {
+ OwnedMemMgr = std::move(*MemMgr);
+ this->MemMgr = OwnedMemMgr.get();
+ } else
+ return MemMgr.takeError();
+
+ // Set a default CreateMemoryAccess if none is specified.
+ if (!S.CreateMemoryAccess)
+ S.CreateMemoryAccess = createDefaultMemoryAccess;
+
+ if (auto MemAccess = S.CreateMemoryAccess(*this)) {
+ OwnedMemAccess = std::move(*MemAccess);
+ this->MemAccess = OwnedMemAccess.get();
+ } else
+ return MemAccess.takeError();
+
+ return Error::success();
+}
+
+Error SimpleRemoteEPC::handleResult(uint64_t SeqNo, ExecutorAddr TagAddr,
+ SimpleRemoteEPCArgBytesVector ArgBytes) {
+ IncomingWFRHandler SendResult;
+
+ if (TagAddr)
+ return make_error<StringError>("Unexpected TagAddr in result message",
+ inconvertibleErrorCode());
+
+ {
+ std::lock_guard<std::mutex> Lock(SimpleRemoteEPCMutex);
+ auto I = PendingCallWrapperResults.find(SeqNo);
+ if (I == PendingCallWrapperResults.end())
+ return make_error<StringError>("No call for sequence number " +
+ Twine(SeqNo),
+ inconvertibleErrorCode());
+ SendResult = std::move(I->second);
+ PendingCallWrapperResults.erase(I);
+ releaseSeqNo(SeqNo);
+ }
+
+ auto WFR =
+ shared::WrapperFunctionResult::copyFrom(ArgBytes.data(), ArgBytes.size());
+ SendResult(std::move(WFR));
+ return Error::success();
+}
+
+void SimpleRemoteEPC::handleCallWrapper(
+ uint64_t RemoteSeqNo, ExecutorAddr TagAddr,
+ SimpleRemoteEPCArgBytesVector ArgBytes) {
+ assert(ES && "No ExecutionSession attached");
+ D->dispatch(makeGenericNamedTask(
+ [this, RemoteSeqNo, TagAddr, ArgBytes = std::move(ArgBytes)]() {
+ ES->runJITDispatchHandler(
+ [this, RemoteSeqNo](shared::WrapperFunctionResult WFR) {
+ if (auto Err =
+ sendMessage(SimpleRemoteEPCOpcode::Result, RemoteSeqNo,
+ ExecutorAddr(), {WFR.data(), WFR.size()}))
+ getExecutionSession().reportError(std::move(Err));
+ },
+ TagAddr.getValue(), ArgBytes);
+ },
+ "callWrapper task"));
+}
+
+Error SimpleRemoteEPC::handleHangup(SimpleRemoteEPCArgBytesVector ArgBytes) {
+ using namespace llvm::orc::shared;
+ auto WFR = WrapperFunctionResult::copyFrom(ArgBytes.data(), ArgBytes.size());
+ if (const char *ErrMsg = WFR.getOutOfBandError())
+ return make_error<StringError>(ErrMsg, inconvertibleErrorCode());
+
+ detail::SPSSerializableError Info;
+ SPSInputBuffer IB(WFR.data(), WFR.size());
+ if (!SPSArgList<SPSError>::deserialize(IB, Info))
+ return make_error<StringError>("Could not deserialize hangup info",
+ inconvertibleErrorCode());
+ return fromSPSSerializable(std::move(Info));
+}
+
+} // end namespace orc
+} // end namespace llvm
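
A note on the sequence-number bookkeeping above: callWrapperAsync stores the completion handler under a fresh SeqNo before sending, and handleResult retrieves and erases it when the executor's Result message arrives, always invoking the handler outside the lock. The same pattern in isolation, with generic stand-ins rather than the ORC types:

#include <cstdint>
#include <functional>
#include <map>
#include <mutex>

// Generic stand-in for the SeqNo -> handler bookkeeping used by
// SimpleRemoteEPC::callWrapperAsync and SimpleRemoteEPC::handleResult.
class PendingCallTable {
public:
  using Handler = std::function<void(const char *Data, size_t Size)>;

  // Record a handler before sending the request.
  uint64_t registerCall(Handler H) {
    std::lock_guard<std::mutex> Lock(M);
    uint64_t SeqNo = NextSeqNo++;
    Pending[SeqNo] = std::move(H);
    return SeqNo;
  }

  // Deliver a result; returns false if no call was registered for SeqNo.
  bool complete(uint64_t SeqNo, const char *Data, size_t Size) {
    Handler H;
    {
      std::lock_guard<std::mutex> Lock(M);
      auto I = Pending.find(SeqNo);
      if (I == Pending.end())
        return false;
      H = std::move(I->second);
      Pending.erase(I);
    }
    H(Data, Size); // Run the handler outside the lock, as the real code does.
    return true;
  }

private:
  std::mutex M;
  uint64_t NextSeqNo = 1;
  std::map<uint64_t, Handler> Pending;
};
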
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp
index 43c2a44835fd..4c15e25b1d89 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp
@@ -10,6 +10,7 @@
#include "llvm/ExecutionEngine/JITSymbol.h"
#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/ManagedStatic.h"
#include <cstdint>
@@ -64,14 +65,23 @@ LLVM_ATTRIBUTE_NOINLINE void __jit_debug_register_code() {
}
using namespace llvm;
+using namespace llvm::orc;
// Serialize rendezvous with the debugger as well as access to shared data.
ManagedStatic<std::mutex> JITDebugLock;
// Register debug object, return error message or null for success.
-static void registerJITLoaderGDBImpl(JITTargetAddress Addr, uint64_t Size) {
+static void registerJITLoaderGDBImpl(const char *ObjAddr, size_t Size) {
+ LLVM_DEBUG({
+ dbgs() << "Registering debug object with GDB JIT interface "
+ << formatv("([{0:x16} -- {1:x16}])",
+ reinterpret_cast<uintptr_t>(ObjAddr),
+ reinterpret_cast<uintptr_t>(ObjAddr + Size))
+ << "\n";
+ });
+
jit_code_entry *E = new jit_code_entry;
- E->symfile_addr = jitTargetAddressToPointer<const char *>(Addr);
+ E->symfile_addr = ObjAddr;
E->symfile_size = Size;
E->prev_entry = nullptr;
@@ -92,10 +102,26 @@ static void registerJITLoaderGDBImpl(JITTargetAddress Addr, uint64_t Size) {
__jit_debug_register_code();
}
-extern "C" orc::shared::detail::CWrapperFunctionResult
+extern "C" orc::shared::CWrapperFunctionResult
+llvm_orc_registerJITLoaderGDBAllocAction(const char *Data, size_t Size) {
+ using namespace orc::shared;
+ return WrapperFunction<SPSError()>::handle(nullptr, 0,
+ [=]() -> Error {
+ registerJITLoaderGDBImpl(Data,
+ Size);
+ return Error::success();
+ })
+ .release();
+}
+
+extern "C" orc::shared::CWrapperFunctionResult
llvm_orc_registerJITLoaderGDBWrapper(const char *Data, uint64_t Size) {
using namespace orc::shared;
- return WrapperFunction<void(SPSExecutorAddress, uint64_t)>::handle(
- Data, Size, registerJITLoaderGDBImpl)
+ return WrapperFunction<void(SPSExecutorAddrRange)>::handle(
+ Data, Size,
+ [](ExecutorAddrRange R) {
+ registerJITLoaderGDBImpl(R.Start.toPtr<char *>(),
+ R.size().getValue());
+ })
.release();
}
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.cpp
new file mode 100644
index 000000000000..82aa62a0c0d9
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.cpp
@@ -0,0 +1,84 @@
+//===------------------------ OrcRTBootstrap.cpp --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "OrcRTBootstrap.h"
+
+#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+#include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/TargetExecutionUtils.h"
+
+#define DEBUG_TYPE "orc"
+
+using namespace llvm::orc::shared;
+
+namespace llvm {
+namespace orc {
+namespace rt_bootstrap {
+
+template <typename WriteT, typename SPSWriteT>
+static llvm::orc::shared::CWrapperFunctionResult
+writeUIntsWrapper(const char *ArgData, size_t ArgSize) {
+ return WrapperFunction<void(SPSSequence<SPSWriteT>)>::handle(
+ ArgData, ArgSize,
+ [](std::vector<WriteT> Ws) {
+ for (auto &W : Ws)
+ *W.Addr.template toPtr<decltype(W.Value) *>() = W.Value;
+ })
+ .release();
+}
+
+static llvm::orc::shared::CWrapperFunctionResult
+writeBuffersWrapper(const char *ArgData, size_t ArgSize) {
+ return WrapperFunction<void(SPSSequence<SPSMemoryAccessBufferWrite>)>::handle(
+ ArgData, ArgSize,
+ [](std::vector<tpctypes::BufferWrite> Ws) {
+ for (auto &W : Ws)
+ memcpy(W.Addr.template toPtr<char *>(), W.Buffer.data(),
+ W.Buffer.size());
+ })
+ .release();
+}
+
+static llvm::orc::shared::CWrapperFunctionResult
+runAsMainWrapper(const char *ArgData, size_t ArgSize) {
+ return WrapperFunction<rt::SPSRunAsMainSignature>::handle(
+ ArgData, ArgSize,
+ [](ExecutorAddr MainAddr,
+ std::vector<std::string> Args) -> int64_t {
+ return runAsMain(MainAddr.toPtr<int (*)(int, char *[])>(), Args);
+ })
+ .release();
+}
+
+void addTo(StringMap<ExecutorAddr> &M) {
+ M[rt::MemoryWriteUInt8sWrapperName] = ExecutorAddr::fromPtr(
+ &writeUIntsWrapper<tpctypes::UInt8Write,
+ shared::SPSMemoryAccessUInt8Write>);
+ M[rt::MemoryWriteUInt16sWrapperName] = ExecutorAddr::fromPtr(
+ &writeUIntsWrapper<tpctypes::UInt16Write,
+ shared::SPSMemoryAccessUInt16Write>);
+ M[rt::MemoryWriteUInt32sWrapperName] = ExecutorAddr::fromPtr(
+ &writeUIntsWrapper<tpctypes::UInt32Write,
+ shared::SPSMemoryAccessUInt32Write>);
+ M[rt::MemoryWriteUInt64sWrapperName] = ExecutorAddr::fromPtr(
+ &writeUIntsWrapper<tpctypes::UInt64Write,
+ shared::SPSMemoryAccessUInt64Write>);
+ M[rt::MemoryWriteBuffersWrapperName] =
+ ExecutorAddr::fromPtr(&writeBuffersWrapper);
+ M[rt::RegisterEHFrameSectionCustomDirectWrapperName] = ExecutorAddr::fromPtr(
+ &llvm_orc_registerEHFrameSectionCustomDirectWrapper);
+ M[rt::DeregisterEHFrameSectionCustomDirectWrapperName] =
+ ExecutorAddr::fromPtr(
+ &llvm_orc_deregisterEHFrameSectionCustomDirectWrapper);
+ M[rt::RunAsMainWrapperName] = ExecutorAddr::fromPtr(&runAsMainWrapper);
+}
+
+} // end namespace rt_bootstrap
+} // end namespace orc
+} // end namespace llvm
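
addTo simply registers the wrapper entry points under their well-known rt:: names; a server builds its bootstrap symbol map this way and ships it to the controller in the Setup message. A minimal lookup sketch using only names that appear in this patch (error handling omitted; OrcRTBootstrap.h is an in-tree private header, included here the same way SimpleRemoteEPCServer.cpp includes it):

#include "OrcRTBootstrap.h" // Private in-tree header providing rt_bootstrap::addTo.
#include "llvm/ADT/StringMap.h"
#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"

using namespace llvm;
using namespace llvm::orc;

ExecutorAddr findRunAsMainWrapper() {
  // Populate the executor-side bootstrap symbol map...
  StringMap<ExecutorAddr> BootstrapSymbols;
  rt_bootstrap::addTo(BootstrapSymbols);

  // ...then look up the run-as-main wrapper by its well-known name. The
  // controller later does the same via getBootstrapSymbols.
  auto I = BootstrapSymbols.find(rt::RunAsMainWrapperName);
  return I != BootstrapSymbols.end() ? I->second : ExecutorAddr();
}
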
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.h b/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.h
new file mode 100644
index 000000000000..6b7ff79a3efc
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.h
@@ -0,0 +1,36 @@
+//===----------------------- OrcRTBootstrap.h -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// OrcRTBootstrap provides functions that should be linked into the executor
+// to bootstrap common JIT functionality (e.g. memory allocation and memory
+// access).
+//
+// Call rt_bootstrap::addTo to add these functions to a bootstrap symbols map.
+//
+// FIXME: The functionality in this file should probably be moved to an ORC
+// runtime bootstrap library in compiler-rt.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIB_EXECUTIONENGINE_ORC_TARGETPROCESS_ORCRTBOOTSTRAP_H
+#define LIB_EXECUTIONENGINE_ORC_TARGETPROCESS_ORCRTBOOTSTRAP_H
+
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
+
+namespace llvm {
+namespace orc {
+namespace rt_bootstrap {
+
+void addTo(StringMap<ExecutorAddr> &M);
+
+} // namespace rt_bootstrap
+} // end namespace orc
+} // end namespace llvm
+
+#endif // LIB_EXECUTIONENGINE_ORC_TARGETPROCESS_ORCRTBOOTSTRAP_H
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp
index 4a408d61ee38..e331bad84200 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp
@@ -1,9 +1,8 @@
//===--------- RegisterEHFrames.cpp - Register EH frame sections ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -86,11 +85,11 @@ static Error deregisterFrameWrapper(const void *P) {
}
#endif
-#ifdef __APPLE__
+#if defined(HAVE_UNW_ADD_DYNAMIC_FDE) || defined(__APPLE__)
template <typename HandleFDEFn>
-Error walkAppleEHFrameSection(const char *const SectionStart,
- size_t SectionSize, HandleFDEFn HandleFDE) {
+Error walkLibunwindEHFrameSection(const char *const SectionStart,
+ size_t SectionSize, HandleFDEFn HandleFDE) {
const char *CurCFIRecord = SectionStart;
const char *End = SectionStart + SectionSize;
uint64_t Size = *reinterpret_cast<const uint32_t *>(CurCFIRecord);
@@ -124,16 +123,19 @@ Error walkAppleEHFrameSection(const char *const SectionStart,
return Error::success();
}
-#endif // __APPLE__
+#endif // HAVE_UNW_ADD_DYNAMIC_FDE || __APPLE__
Error registerEHFrameSection(const void *EHFrameSectionAddr,
size_t EHFrameSectionSize) {
-#ifdef __APPLE__
- // On Darwin __register_frame has to be called for each FDE entry.
- return walkAppleEHFrameSection(static_cast<const char *>(EHFrameSectionAddr),
- EHFrameSectionSize, registerFrameWrapper);
+ /* libgcc and libunwind __register_frame behave differently. We use the
+ * presence of __unw_add_dynamic_fde to detect libunwind. */
+#if defined(HAVE_UNW_ADD_DYNAMIC_FDE) || defined(__APPLE__)
+ // With libunwind, __register_frame has to be called for each FDE entry.
+ return walkLibunwindEHFrameSection(
+ static_cast<const char *>(EHFrameSectionAddr), EHFrameSectionSize,
+ registerFrameWrapper);
#else
- // On Linux __register_frame takes a single argument:
+ // With libgcc, __register_frame takes a single argument:
// a pointer to the start of the .eh_frame section.
// How can it find the end? Because crtendS.o is linked
@@ -144,9 +146,10 @@ Error registerEHFrameSection(const void *EHFrameSectionAddr,
Error deregisterEHFrameSection(const void *EHFrameSectionAddr,
size_t EHFrameSectionSize) {
-#ifdef __APPLE__
- return walkAppleEHFrameSection(static_cast<const char *>(EHFrameSectionAddr),
- EHFrameSectionSize, deregisterFrameWrapper);
+#if defined(HAVE_UNW_ADD_DYNAMIC_FDE) || defined(__APPLE__)
+ return walkLibunwindEHFrameSection(
+ static_cast<const char *>(EHFrameSectionAddr), EHFrameSectionSize,
+ deregisterFrameWrapper);
#else
return deregisterFrameWrapper(EHFrameSectionAddr);
#endif
@@ -155,26 +158,42 @@ Error deregisterEHFrameSection(const void *EHFrameSectionAddr,
} // end namespace orc
} // end namespace llvm
-static Error registerEHFrameWrapper(JITTargetAddress Addr, uint64_t Size) {
- return llvm::orc::registerEHFrameSection(
- jitTargetAddressToPointer<const void *>(Addr), Size);
+extern "C" llvm::orc::shared::CWrapperFunctionResult
+llvm_orc_registerEHFrameSectionCustomDirectWrapper(
+ const char *EHFrameSectionAddr, uint64_t Size) {
+ if (auto Err = registerEHFrameSection(EHFrameSectionAddr, Size))
+ return WrapperFunctionResult::createOutOfBandError(toString(std::move(Err)))
+ .release();
+ return llvm::orc::shared::CWrapperFunctionResult();
+}
+
+extern "C" llvm::orc::shared::CWrapperFunctionResult
+llvm_orc_deregisterEHFrameSectionCustomDirectWrapper(
+ const char *EHFrameSectionAddr, uint64_t Size) {
+ if (auto Err = deregisterEHFrameSection(EHFrameSectionAddr, Size))
+ return WrapperFunctionResult::createOutOfBandError(toString(std::move(Err)))
+ .release();
+ return llvm::orc::shared::CWrapperFunctionResult();
+}
+
+static Error registerEHFrameWrapper(ExecutorAddr Addr, uint64_t Size) {
+ return llvm::orc::registerEHFrameSection(Addr.toPtr<const void *>(), Size);
}
-static Error deregisterEHFrameWrapper(JITTargetAddress Addr, uint64_t Size) {
- return llvm::orc::deregisterEHFrameSection(
- jitTargetAddressToPointer<const void *>(Addr), Size);
+static Error deregisterEHFrameWrapper(ExecutorAddr Addr, uint64_t Size) {
+ return llvm::orc::deregisterEHFrameSection(Addr.toPtr<const void *>(), Size);
}
-extern "C" orc::shared::detail::CWrapperFunctionResult
+extern "C" orc::shared::CWrapperFunctionResult
llvm_orc_registerEHFrameSectionWrapper(const char *Data, uint64_t Size) {
- return WrapperFunction<SPSError(SPSExecutorAddress, uint64_t)>::handle(
+ return WrapperFunction<SPSError(SPSExecutorAddr, uint64_t)>::handle(
Data, Size, registerEHFrameWrapper)
.release();
}
-extern "C" orc::shared::detail::CWrapperFunctionResult
+extern "C" orc::shared::CWrapperFunctionResult
llvm_orc_deregisterEHFrameSectionWrapper(const char *Data, uint64_t Size) {
- return WrapperFunction<SPSError(SPSExecutorAddress, uint64_t)>::handle(
+ return WrapperFunction<SPSError(SPSExecutorAddr, uint64_t)>::handle(
Data, Size, deregisterEHFrameWrapper)
.release();
}
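
The reason for the branch above: libgcc's __register_frame keeps its own object registry and accepts a pointer to the whole .eh_frame section, while libunwind (and the Darwin unwinder) expects one call per CFI record. A rough sketch of the two call shapes, assuming __register_frame is supplied by whichever unwinder is linked in; the record walk ignores the 64-bit extended-length form, and unlike walkLibunwindEHFrameSection it does not distinguish CIEs from FDEs:

#include <cstddef>
#include <cstdint>

extern "C" void __register_frame(void *); // Provided by the unwinder.

// libunwind / Darwin: walk the section and register records one by one.
// Each record begins with a 32-bit length field; a zero length terminates
// the section.
inline void registerWithLibunwind(const char *Section, size_t Size) {
  const char *Cur = Section;
  const char *End = Section + Size;
  while (Cur != End) {
    uint32_t RecordSize = *reinterpret_cast<const uint32_t *>(Cur);
    if (RecordSize == 0)
      break;
    __register_frame(const_cast<char *>(Cur));
    Cur += RecordSize + 4; // The length field does not count itself.
  }
}

// libgcc: a single call covers the whole section; the zero terminator
// emitted by crtendS.o marks its end.
inline void registerWithLibgcc(const char *Section) {
  __register_frame(const_cast<char *>(Section));
}
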
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.cpp
new file mode 100644
index 000000000000..3c9dd21b0832
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.cpp
@@ -0,0 +1,129 @@
+//===--- SimpleExecutorDylibManager.cpp - Executor-side dylib management --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.h"
+
+#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+#include "llvm/Support/FormatVariadic.h"
+
+#define DEBUG_TYPE "orc"
+
+namespace llvm {
+namespace orc {
+namespace rt_bootstrap {
+
+SimpleExecutorDylibManager::~SimpleExecutorDylibManager() {
+ assert(Dylibs.empty() && "shutdown not called?");
+}
+
+Expected<tpctypes::DylibHandle>
+SimpleExecutorDylibManager::open(const std::string &Path, uint64_t Mode) {
+ if (Mode != 0)
+ return make_error<StringError>("open: non-zero mode bits not yet supported",
+ inconvertibleErrorCode());
+
+ const char *PathCStr = Path.empty() ? nullptr : Path.c_str();
+ std::string ErrMsg;
+
+ auto DL = sys::DynamicLibrary::getPermanentLibrary(PathCStr, &ErrMsg);
+ if (!DL.isValid())
+ return make_error<StringError>(std::move(ErrMsg), inconvertibleErrorCode());
+
+ std::lock_guard<std::mutex> Lock(M);
+ Dylibs[NextId] = std::move(DL);
+ return NextId++;
+}
+
+Expected<std::vector<ExecutorAddr>>
+SimpleExecutorDylibManager::lookup(tpctypes::DylibHandle H,
+ const RemoteSymbolLookupSet &L) {
+ std::vector<ExecutorAddr> Result;
+
+ std::lock_guard<std::mutex> Lock(M);
+ auto I = Dylibs.find(H);
+ if (I == Dylibs.end())
+ return make_error<StringError>("No dylib for handle " + formatv("{0:x}", H),
+ inconvertibleErrorCode());
+ auto &DL = I->second;
+
+ for (const auto &E : L) {
+
+ if (E.Name.empty()) {
+ if (E.Required)
+ return make_error<StringError>("Required address for empty symbol \"\"",
+ inconvertibleErrorCode());
+ else
+ Result.push_back(ExecutorAddr());
+ } else {
+
+ const char *DemangledSymName = E.Name.c_str();
+#ifdef __APPLE__
+ if (E.Name.front() != '_')
+ return make_error<StringError>(Twine("MachO symbol \"") + E.Name +
+ "\" missing leading '_'",
+ inconvertibleErrorCode());
+ ++DemangledSymName;
+#endif
+
+ void *Addr = DL.getAddressOfSymbol(DemangledSymName);
+ if (!Addr && E.Required)
+ return make_error<StringError>(Twine("Missing definition for ") +
+ DemangledSymName,
+ inconvertibleErrorCode());
+
+ Result.push_back(ExecutorAddr::fromPtr(Addr));
+ }
+ }
+
+ return Result;
+}
+
+Error SimpleExecutorDylibManager::shutdown() {
+
+ DylibsMap DM;
+ {
+ std::lock_guard<std::mutex> Lock(M);
+ std::swap(DM, Dylibs);
+ }
+
+ // There is no removal of dylibs at the moment, so nothing to do here.
+ return Error::success();
+}
+
+void SimpleExecutorDylibManager::addBootstrapSymbols(
+ StringMap<ExecutorAddr> &M) {
+ M[rt::SimpleExecutorDylibManagerInstanceName] = ExecutorAddr::fromPtr(this);
+ M[rt::SimpleExecutorDylibManagerOpenWrapperName] =
+ ExecutorAddr::fromPtr(&openWrapper);
+ M[rt::SimpleExecutorDylibManagerLookupWrapperName] =
+ ExecutorAddr::fromPtr(&lookupWrapper);
+}
+
+llvm::orc::shared::CWrapperFunctionResult
+SimpleExecutorDylibManager::openWrapper(const char *ArgData, size_t ArgSize) {
+ return shared::
+ WrapperFunction<rt::SPSSimpleExecutorDylibManagerOpenSignature>::handle(
+ ArgData, ArgSize,
+ shared::makeMethodWrapperHandler(
+ &SimpleExecutorDylibManager::open))
+ .release();
+}
+
+llvm::orc::shared::CWrapperFunctionResult
+SimpleExecutorDylibManager::lookupWrapper(const char *ArgData, size_t ArgSize) {
+ return shared::
+ WrapperFunction<rt::SPSSimpleExecutorDylibManagerLookupSignature>::handle(
+ ArgData, ArgSize,
+ shared::makeMethodWrapperHandler(
+ &SimpleExecutorDylibManager::lookup))
+ .release();
+}
+
+} // namespace rt_bootstrap
+} // end namespace orc
+} // end namespace llvm
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
new file mode 100644
index 000000000000..232340c22a32
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
@@ -0,0 +1,261 @@
+//===- SimpleExecutorMemoryManager.cpp - Simple executor-side memory mgmt -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h"
+
+#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+#include "llvm/Support/FormatVariadic.h"
+
+#define DEBUG_TYPE "orc"
+
+namespace llvm {
+namespace orc {
+namespace rt_bootstrap {
+
+SimpleExecutorMemoryManager::~SimpleExecutorMemoryManager() {
+ assert(Allocations.empty() && "shutdown not called?");
+}
+
+Expected<ExecutorAddr> SimpleExecutorMemoryManager::allocate(uint64_t Size) {
+ std::error_code EC;
+ auto MB = sys::Memory::allocateMappedMemory(
+ Size, 0, sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC);
+ if (EC)
+ return errorCodeToError(EC);
+ std::lock_guard<std::mutex> Lock(M);
+ assert(!Allocations.count(MB.base()) && "Duplicate allocation addr");
+ Allocations[MB.base()].Size = Size;
+ return ExecutorAddr::fromPtr(MB.base());
+}
+
+Error SimpleExecutorMemoryManager::finalize(tpctypes::FinalizeRequest &FR) {
+ ExecutorAddr Base(~0ULL);
+ std::vector<tpctypes::WrapperFunctionCall> DeallocationActions;
+ size_t SuccessfulFinalizationActions = 0;
+
+ if (FR.Segments.empty()) {
+ // NOTE: Finalizing nothing is currently a no-op. Should it be an error?
+ if (FR.Actions.empty())
+ return Error::success();
+ else
+ return make_error<StringError>("Finalization actions attached to empty "
+ "finalization request",
+ inconvertibleErrorCode());
+ }
+
+ for (auto &Seg : FR.Segments)
+ Base = std::min(Base, Seg.Addr);
+
+ for (auto &ActPair : FR.Actions)
+ if (ActPair.Deallocate.Func)
+ DeallocationActions.push_back(ActPair.Deallocate);
+
+ // Get the Allocation for this finalization.
+ size_t AllocSize = 0;
+ {
+ std::lock_guard<std::mutex> Lock(M);
+ auto I = Allocations.find(Base.toPtr<void *>());
+ if (I == Allocations.end())
+ return make_error<StringError>("Attempt to finalize unrecognized "
+ "allocation " +
+ formatv("{0:x}", Base.getValue()),
+ inconvertibleErrorCode());
+ AllocSize = I->second.Size;
+ I->second.DeallocationActions = std::move(DeallocationActions);
+ }
+ ExecutorAddr AllocEnd = Base + ExecutorAddrDiff(AllocSize);
+
+ // Bail-out function: this will run deallocation actions corresponding to any
+ // completed finalization actions, then deallocate memory.
+ auto BailOut = [&](Error Err) {
+ std::pair<void *, Allocation> AllocToDestroy;
+
+    // Get the allocation to destroy.
+ {
+ std::lock_guard<std::mutex> Lock(M);
+ auto I = Allocations.find(Base.toPtr<void *>());
+
+      // Check for a missing allocation (effectively a double free).
+ if (I == Allocations.end())
+ return joinErrors(
+ std::move(Err),
+ make_error<StringError>("No allocation entry found "
+ "for " +
+ formatv("{0:x}", Base.getValue()),
+ inconvertibleErrorCode()));
+ AllocToDestroy = std::move(*I);
+ Allocations.erase(I);
+ }
+
+ // Run deallocation actions for all completed finalization actions.
+ while (SuccessfulFinalizationActions)
+ Err =
+ joinErrors(std::move(Err), FR.Actions[--SuccessfulFinalizationActions]
+ .Deallocate.runWithSPSRet());
+
+ // Deallocate memory.
+ sys::MemoryBlock MB(AllocToDestroy.first, AllocToDestroy.second.Size);
+ if (auto EC = sys::Memory::releaseMappedMemory(MB))
+ Err = joinErrors(std::move(Err), errorCodeToError(EC));
+
+ return Err;
+ };
+
+ // Copy content and apply permissions.
+ for (auto &Seg : FR.Segments) {
+
+ // Check segment ranges.
+ if (LLVM_UNLIKELY(Seg.Size < Seg.Content.size()))
+ return BailOut(make_error<StringError>(
+ formatv("Segment {0:x} content size ({1:x} bytes) "
+ "exceeds segment size ({2:x} bytes)",
+ Seg.Addr.getValue(), Seg.Content.size(), Seg.Size),
+ inconvertibleErrorCode()));
+ ExecutorAddr SegEnd = Seg.Addr + ExecutorAddrDiff(Seg.Size);
+ if (LLVM_UNLIKELY(Seg.Addr < Base || SegEnd > AllocEnd))
+ return BailOut(make_error<StringError>(
+ formatv("Segment {0:x} -- {1:x} crosses boundary of "
+ "allocation {2:x} -- {3:x}",
+ Seg.Addr.getValue(), SegEnd.getValue(), Base.getValue(),
+ AllocEnd.getValue()),
+ inconvertibleErrorCode()));
+
+ char *Mem = Seg.Addr.toPtr<char *>();
+ memcpy(Mem, Seg.Content.data(), Seg.Content.size());
+ memset(Mem + Seg.Content.size(), 0, Seg.Size - Seg.Content.size());
+ assert(Seg.Size <= std::numeric_limits<size_t>::max());
+ if (auto EC = sys::Memory::protectMappedMemory(
+ {Mem, static_cast<size_t>(Seg.Size)},
+ tpctypes::fromWireProtectionFlags(Seg.Prot)))
+ return BailOut(errorCodeToError(EC));
+ if (Seg.Prot & tpctypes::WPF_Exec)
+ sys::Memory::InvalidateInstructionCache(Mem, Seg.Size);
+ }
+
+ // Run finalization actions.
+ for (auto &ActPair : FR.Actions) {
+ if (auto Err = ActPair.Finalize.runWithSPSRet())
+ return BailOut(std::move(Err));
+ ++SuccessfulFinalizationActions;
+ }
+
+ return Error::success();
+}
+
+Error SimpleExecutorMemoryManager::deallocate(
+ const std::vector<ExecutorAddr> &Bases) {
+ std::vector<std::pair<void *, Allocation>> AllocPairs;
+ AllocPairs.reserve(Bases.size());
+
+  // Get the allocations to destroy.
+ Error Err = Error::success();
+ {
+ std::lock_guard<std::mutex> Lock(M);
+ for (auto &Base : Bases) {
+ auto I = Allocations.find(Base.toPtr<void *>());
+
+      // Check for a missing allocation (effectively a double free).
+ if (I != Allocations.end()) {
+ AllocPairs.push_back(std::move(*I));
+ Allocations.erase(I);
+ } else
+ Err = joinErrors(
+ std::move(Err),
+ make_error<StringError>("No allocation entry found "
+ "for " +
+ formatv("{0:x}", Base.getValue()),
+ inconvertibleErrorCode()));
+ }
+ }
+
+ while (!AllocPairs.empty()) {
+ auto &P = AllocPairs.back();
+ Err = joinErrors(std::move(Err), deallocateImpl(P.first, P.second));
+ AllocPairs.pop_back();
+ }
+
+ return Err;
+}
+
+Error SimpleExecutorMemoryManager::shutdown() {
+
+ AllocationsMap AM;
+ {
+ std::lock_guard<std::mutex> Lock(M);
+ AM = std::move(Allocations);
+ }
+
+ Error Err = Error::success();
+ for (auto &KV : AM)
+ Err = joinErrors(std::move(Err), deallocateImpl(KV.first, KV.second));
+ return Err;
+}
+
+void SimpleExecutorMemoryManager::addBootstrapSymbols(
+ StringMap<ExecutorAddr> &M) {
+ M[rt::SimpleExecutorMemoryManagerInstanceName] = ExecutorAddr::fromPtr(this);
+ M[rt::SimpleExecutorMemoryManagerReserveWrapperName] =
+ ExecutorAddr::fromPtr(&reserveWrapper);
+ M[rt::SimpleExecutorMemoryManagerFinalizeWrapperName] =
+ ExecutorAddr::fromPtr(&finalizeWrapper);
+ M[rt::SimpleExecutorMemoryManagerDeallocateWrapperName] =
+ ExecutorAddr::fromPtr(&deallocateWrapper);
+}
+
+Error SimpleExecutorMemoryManager::deallocateImpl(void *Base, Allocation &A) {
+ Error Err = Error::success();
+
+ while (!A.DeallocationActions.empty()) {
+ Err = joinErrors(std::move(Err),
+ A.DeallocationActions.back().runWithSPSRet());
+ A.DeallocationActions.pop_back();
+ }
+
+ sys::MemoryBlock MB(Base, A.Size);
+ if (auto EC = sys::Memory::releaseMappedMemory(MB))
+ Err = joinErrors(std::move(Err), errorCodeToError(EC));
+
+ return Err;
+}
+
+llvm::orc::shared::CWrapperFunctionResult
+SimpleExecutorMemoryManager::reserveWrapper(const char *ArgData,
+ size_t ArgSize) {
+ return shared::WrapperFunction<
+ rt::SPSSimpleExecutorMemoryManagerReserveSignature>::
+ handle(ArgData, ArgSize,
+ shared::makeMethodWrapperHandler(
+ &SimpleExecutorMemoryManager::allocate))
+ .release();
+}
+
+llvm::orc::shared::CWrapperFunctionResult
+SimpleExecutorMemoryManager::finalizeWrapper(const char *ArgData,
+ size_t ArgSize) {
+ return shared::WrapperFunction<
+ rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>::
+ handle(ArgData, ArgSize,
+ shared::makeMethodWrapperHandler(
+ &SimpleExecutorMemoryManager::finalize))
+ .release();
+}
+
+llvm::orc::shared::CWrapperFunctionResult
+SimpleExecutorMemoryManager::deallocateWrapper(const char *ArgData,
+ size_t ArgSize) {
+ return shared::WrapperFunction<
+ rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>::
+ handle(ArgData, ArgSize,
+ shared::makeMethodWrapperHandler(
+ &SimpleExecutorMemoryManager::deallocate))
+ .release();
+}
+
+} // namespace rt_bootstrap
+} // end namespace orc
+} // end namespace llvm
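
finalize above follows a rollback discipline: it counts the finalize actions that succeed and, if anything fails, runs the matching deallocation actions in reverse order before releasing the memory. The same idea in miniature, with generic stand-ins rather than the ORC finalize/deallocate action types:

#include <cstddef>
#include <functional>
#include <vector>

// Generic stand-in for the finalize/deallocate action pairs: on failure,
// undo the actions that already succeeded, most recent first.
struct ActionPair {
  std::function<bool()> Run;  // Finalize step; false means failure.
  std::function<void()> Undo; // Matching deallocation step.
};

bool runWithRollback(const std::vector<ActionPair> &Actions) {
  size_t Succeeded = 0;
  for (const auto &A : Actions) {
    if (!A.Run())
      break;
    ++Succeeded;
  }
  if (Succeeded == Actions.size())
    return true;
  // Roll back the completed steps in reverse order.
  while (Succeeded)
    Actions[--Succeeded].Undo();
  return false;
}
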
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.cpp
new file mode 100644
index 000000000000..b6b21bde1182
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.cpp
@@ -0,0 +1,293 @@
+//===---- SimpleRemoteEPCServer.cpp - EPC over simple abstract channel ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.h"
+
+#include "llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/Process.h"
+
+#include "OrcRTBootstrap.h"
+
+#define DEBUG_TYPE "orc"
+
+using namespace llvm::orc::shared;
+
+namespace llvm {
+namespace orc {
+
+ExecutorBootstrapService::~ExecutorBootstrapService() {}
+
+SimpleRemoteEPCServer::Dispatcher::~Dispatcher() {}
+
+#if LLVM_ENABLE_THREADS
+void SimpleRemoteEPCServer::ThreadDispatcher::dispatch(
+ unique_function<void()> Work) {
+ {
+ std::lock_guard<std::mutex> Lock(DispatchMutex);
+ if (!Running)
+ return;
+ ++Outstanding;
+ }
+
+ std::thread([this, Work = std::move(Work)]() mutable {
+ Work();
+ std::lock_guard<std::mutex> Lock(DispatchMutex);
+ --Outstanding;
+ OutstandingCV.notify_all();
+ }).detach();
+}
+
+void SimpleRemoteEPCServer::ThreadDispatcher::shutdown() {
+ std::unique_lock<std::mutex> Lock(DispatchMutex);
+ Running = false;
+ OutstandingCV.wait(Lock, [this]() { return Outstanding == 0; });
+}
+#endif
+
+StringMap<ExecutorAddr> SimpleRemoteEPCServer::defaultBootstrapSymbols() {
+ StringMap<ExecutorAddr> DBS;
+ rt_bootstrap::addTo(DBS);
+ return DBS;
+}
+
+Expected<SimpleRemoteEPCTransportClient::HandleMessageAction>
+SimpleRemoteEPCServer::handleMessage(SimpleRemoteEPCOpcode OpC, uint64_t SeqNo,
+ ExecutorAddr TagAddr,
+ SimpleRemoteEPCArgBytesVector ArgBytes) {
+
+ LLVM_DEBUG({
+ dbgs() << "SimpleRemoteEPCServer::handleMessage: opc = ";
+ switch (OpC) {
+ case SimpleRemoteEPCOpcode::Setup:
+ dbgs() << "Setup";
+ assert(SeqNo == 0 && "Non-zero SeqNo for Setup?");
+ assert(TagAddr.getValue() == 0 && "Non-zero TagAddr for Setup?");
+ break;
+ case SimpleRemoteEPCOpcode::Hangup:
+ dbgs() << "Hangup";
+ assert(SeqNo == 0 && "Non-zero SeqNo for Hangup?");
+ assert(TagAddr.getValue() == 0 && "Non-zero TagAddr for Hangup?");
+ break;
+ case SimpleRemoteEPCOpcode::Result:
+ dbgs() << "Result";
+ assert(TagAddr.getValue() == 0 && "Non-zero TagAddr for Result?");
+ break;
+ case SimpleRemoteEPCOpcode::CallWrapper:
+ dbgs() << "CallWrapper";
+ break;
+ }
+ dbgs() << ", seqno = " << SeqNo
+ << ", tag-addr = " << formatv("{0:x}", TagAddr.getValue())
+ << ", arg-buffer = " << formatv("{0:x}", ArgBytes.size())
+ << " bytes\n";
+ });
+
+ using UT = std::underlying_type_t<SimpleRemoteEPCOpcode>;
+ if (static_cast<UT>(OpC) > static_cast<UT>(SimpleRemoteEPCOpcode::LastOpC))
+ return make_error<StringError>("Unexpected opcode",
+ inconvertibleErrorCode());
+
+ // TODO: Clean detach message?
+ switch (OpC) {
+ case SimpleRemoteEPCOpcode::Setup:
+ return make_error<StringError>("Unexpected Setup opcode",
+ inconvertibleErrorCode());
+ case SimpleRemoteEPCOpcode::Hangup:
+ return SimpleRemoteEPCTransportClient::EndSession;
+ case SimpleRemoteEPCOpcode::Result:
+ if (auto Err = handleResult(SeqNo, TagAddr, std::move(ArgBytes)))
+ return std::move(Err);
+ break;
+ case SimpleRemoteEPCOpcode::CallWrapper:
+ handleCallWrapper(SeqNo, TagAddr, std::move(ArgBytes));
+ break;
+ }
+ return ContinueSession;
+}
+
+Error SimpleRemoteEPCServer::waitForDisconnect() {
+ std::unique_lock<std::mutex> Lock(ServerStateMutex);
+ ShutdownCV.wait(Lock, [this]() { return RunState == ServerShutDown; });
+ return std::move(ShutdownErr);
+}
+
+void SimpleRemoteEPCServer::handleDisconnect(Error Err) {
+ PendingJITDispatchResultsMap TmpPending;
+
+ {
+ std::lock_guard<std::mutex> Lock(ServerStateMutex);
+ std::swap(TmpPending, PendingJITDispatchResults);
+ RunState = ServerShuttingDown;
+ }
+
+ // Send out-of-band errors to any waiting threads.
+ for (auto &KV : TmpPending)
+ KV.second->set_value(
+ shared::WrapperFunctionResult::createOutOfBandError("disconnecting"));
+
+ // Wait for dispatcher to clear.
+ D->shutdown();
+
+ // Shut down services.
+ while (!Services.empty()) {
+ ShutdownErr =
+ joinErrors(std::move(ShutdownErr), Services.back()->shutdown());
+ Services.pop_back();
+ }
+
+ std::lock_guard<std::mutex> Lock(ServerStateMutex);
+ ShutdownErr = joinErrors(std::move(ShutdownErr), std::move(Err));
+ RunState = ServerShutDown;
+ ShutdownCV.notify_all();
+}
+
+Error SimpleRemoteEPCServer::sendMessage(SimpleRemoteEPCOpcode OpC,
+ uint64_t SeqNo, ExecutorAddr TagAddr,
+ ArrayRef<char> ArgBytes) {
+
+ LLVM_DEBUG({
+ dbgs() << "SimpleRemoteEPCServer::sendMessage: opc = ";
+ switch (OpC) {
+ case SimpleRemoteEPCOpcode::Setup:
+ dbgs() << "Setup";
+ assert(SeqNo == 0 && "Non-zero SeqNo for Setup?");
+ assert(TagAddr.getValue() == 0 && "Non-zero TagAddr for Setup?");
+ break;
+ case SimpleRemoteEPCOpcode::Hangup:
+ dbgs() << "Hangup";
+ assert(SeqNo == 0 && "Non-zero SeqNo for Hangup?");
+ assert(TagAddr.getValue() == 0 && "Non-zero TagAddr for Hangup?");
+ break;
+ case SimpleRemoteEPCOpcode::Result:
+ dbgs() << "Result";
+ assert(TagAddr.getValue() == 0 && "Non-zero TagAddr for Result?");
+ break;
+ case SimpleRemoteEPCOpcode::CallWrapper:
+ dbgs() << "CallWrapper";
+ break;
+ }
+ dbgs() << ", seqno = " << SeqNo
+ << ", tag-addr = " << formatv("{0:x}", TagAddr.getValue())
+ << ", arg-buffer = " << formatv("{0:x}", ArgBytes.size())
+ << " bytes\n";
+ });
+ auto Err = T->sendMessage(OpC, SeqNo, TagAddr, ArgBytes);
+ LLVM_DEBUG({
+ if (Err)
+ dbgs() << " \\--> SimpleRemoteEPC::sendMessage failed\n";
+ });
+ return Err;
+}
+
+Error SimpleRemoteEPCServer::sendSetupMessage(
+ StringMap<ExecutorAddr> BootstrapSymbols) {
+
+ using namespace SimpleRemoteEPCDefaultBootstrapSymbolNames;
+
+ std::vector<char> SetupPacket;
+ SimpleRemoteEPCExecutorInfo EI;
+ EI.TargetTriple = sys::getProcessTriple();
+ if (auto PageSize = sys::Process::getPageSize())
+ EI.PageSize = *PageSize;
+ else
+ return PageSize.takeError();
+ EI.BootstrapSymbols = std::move(BootstrapSymbols);
+
+ assert(!EI.BootstrapSymbols.count(ExecutorSessionObjectName) &&
+ "Dispatch context name should not be set");
+ assert(!EI.BootstrapSymbols.count(DispatchFnName) &&
+ "Dispatch function name should not be set");
+ EI.BootstrapSymbols[ExecutorSessionObjectName] = ExecutorAddr::fromPtr(this);
+ EI.BootstrapSymbols[DispatchFnName] = ExecutorAddr::fromPtr(jitDispatchEntry);
+
+ using SPSSerialize =
+ shared::SPSArgList<shared::SPSSimpleRemoteEPCExecutorInfo>;
+ auto SetupPacketBytes =
+ shared::WrapperFunctionResult::allocate(SPSSerialize::size(EI));
+ shared::SPSOutputBuffer OB(SetupPacketBytes.data(), SetupPacketBytes.size());
+ if (!SPSSerialize::serialize(OB, EI))
+ return make_error<StringError>("Could not send setup packet",
+ inconvertibleErrorCode());
+
+ return sendMessage(SimpleRemoteEPCOpcode::Setup, 0, ExecutorAddr(),
+ {SetupPacketBytes.data(), SetupPacketBytes.size()});
+}
+
+Error SimpleRemoteEPCServer::handleResult(
+ uint64_t SeqNo, ExecutorAddr TagAddr,
+ SimpleRemoteEPCArgBytesVector ArgBytes) {
+ std::promise<shared::WrapperFunctionResult> *P = nullptr;
+ {
+ std::lock_guard<std::mutex> Lock(ServerStateMutex);
+ auto I = PendingJITDispatchResults.find(SeqNo);
+ if (I == PendingJITDispatchResults.end())
+ return make_error<StringError>("No call for sequence number " +
+ Twine(SeqNo),
+ inconvertibleErrorCode());
+ P = I->second;
+ PendingJITDispatchResults.erase(I);
+ releaseSeqNo(SeqNo);
+ }
+ auto R = shared::WrapperFunctionResult::allocate(ArgBytes.size());
+ memcpy(R.data(), ArgBytes.data(), ArgBytes.size());
+ P->set_value(std::move(R));
+ return Error::success();
+}
+
+void SimpleRemoteEPCServer::handleCallWrapper(
+ uint64_t RemoteSeqNo, ExecutorAddr TagAddr,
+ SimpleRemoteEPCArgBytesVector ArgBytes) {
+ D->dispatch([this, RemoteSeqNo, TagAddr, ArgBytes = std::move(ArgBytes)]() {
+ using WrapperFnTy =
+ shared::CWrapperFunctionResult (*)(const char *, size_t);
+ auto *Fn = TagAddr.toPtr<WrapperFnTy>();
+ shared::WrapperFunctionResult ResultBytes(
+ Fn(ArgBytes.data(), ArgBytes.size()));
+ if (auto Err = sendMessage(SimpleRemoteEPCOpcode::Result, RemoteSeqNo,
+ ExecutorAddr(),
+ {ResultBytes.data(), ResultBytes.size()}))
+ ReportError(std::move(Err));
+ });
+}
+
+shared::WrapperFunctionResult
+SimpleRemoteEPCServer::doJITDispatch(const void *FnTag, const char *ArgData,
+ size_t ArgSize) {
+ uint64_t SeqNo;
+ std::promise<shared::WrapperFunctionResult> ResultP;
+ auto ResultF = ResultP.get_future();
+ {
+ std::lock_guard<std::mutex> Lock(ServerStateMutex);
+ if (RunState != ServerRunning)
+ return shared::WrapperFunctionResult::createOutOfBandError(
+ "jit_dispatch not available (EPC server shut down)");
+
+ SeqNo = getNextSeqNo();
+ assert(!PendingJITDispatchResults.count(SeqNo) && "SeqNo already in use");
+ PendingJITDispatchResults[SeqNo] = &ResultP;
+ }
+
+ if (auto Err = sendMessage(SimpleRemoteEPCOpcode::CallWrapper, SeqNo,
+ ExecutorAddr::fromPtr(FnTag), {ArgData, ArgSize}))
+ ReportError(std::move(Err));
+
+ return ResultF.get();
+}
+
+shared::CWrapperFunctionResult
+SimpleRemoteEPCServer::jitDispatchEntry(void *DispatchCtx, const void *FnTag,
+ const char *ArgData, size_t ArgSize) {
+ return reinterpret_cast<SimpleRemoteEPCServer *>(DispatchCtx)
+ ->doJITDispatch(FnTag, ArgData, ArgSize)
+ .release();
+}
+
+} // end namespace orc
+} // end namespace llvm
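
doJITDispatch above illustrates the executor-side blocking pattern: the thread that initiated the jit_dispatch parks on a future keyed by its sequence number, and the transport's listener thread fulfils the matching promise when the Result message arrives. Reduced to its essentials with generic types (the real code additionally fails all pending promises with an out-of-band error on disconnect):

#include <cstdint>
#include <future>
#include <map>
#include <mutex>
#include <string>

// Hypothetical stand-in for PendingJITDispatchResults: the dispatching
// thread blocks on a future, the message-handling thread sets the value.
class BlockingResultMap {
public:
  std::string callAndWait(uint64_t SeqNo) {
    std::promise<std::string> P;
    auto F = P.get_future();
    {
      std::lock_guard<std::mutex> Lock(M);
      Pending[SeqNo] = &P;
    }
    // ... the CallWrapper message would be sent here ...
    return F.get(); // Parks until deliverResult runs for SeqNo.
  }

  void deliverResult(uint64_t SeqNo, std::string Bytes) {
    std::promise<std::string> *P = nullptr;
    {
      std::lock_guard<std::mutex> Lock(M);
      auto I = Pending.find(SeqNo);
      if (I == Pending.end())
        return;
      P = I->second;
      Pending.erase(I);
    }
    P->set_value(std::move(Bytes)); // Wakes the thread parked in callAndWait.
  }

private:
  std::mutex M;
  std::map<uint64_t, std::promise<std::string> *> Pending;
};
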
diff --git a/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp b/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp
new file mode 100644
index 000000000000..111c84ec87ed
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp
@@ -0,0 +1,48 @@
+//===------------ TaskDispatch.cpp - ORC task dispatch utils --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TaskDispatch.h"
+
+namespace llvm {
+namespace orc {
+
+char Task::ID = 0;
+char GenericNamedTask::ID = 0;
+const char *GenericNamedTask::DefaultDescription = "Generic Task";
+
+void Task::anchor() {}
+TaskDispatcher::~TaskDispatcher() {}
+
+void InPlaceTaskDispatcher::dispatch(std::unique_ptr<Task> T) { T->run(); }
+
+void InPlaceTaskDispatcher::shutdown() {}
+
+#if LLVM_ENABLE_THREADS
+void DynamicThreadPoolTaskDispatcher::dispatch(std::unique_ptr<Task> T) {
+ {
+ std::lock_guard<std::mutex> Lock(DispatchMutex);
+ ++Outstanding;
+ }
+
+ std::thread([this, T = std::move(T)]() mutable {
+ T->run();
+ std::lock_guard<std::mutex> Lock(DispatchMutex);
+ --Outstanding;
+ OutstandingCV.notify_all();
+ }).detach();
+}
+
+void DynamicThreadPoolTaskDispatcher::shutdown() {
+ std::unique_lock<std::mutex> Lock(DispatchMutex);
+ Running = false;
+ OutstandingCV.wait(Lock, [this]() { return Outstanding == 0; });
+}
+#endif
+
+} // namespace orc
+} // namespace llvm
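
A minimal usage sketch for the dispatchers above, assuming the declarations in TaskDispatch.h that this file implements (DynamicThreadPoolTaskDispatcher is only available when LLVM is built with LLVM_ENABLE_THREADS):

#include "llvm/ExecutionEngine/Orc/TaskDispatch.h"

using namespace llvm::orc;

void dispatchExample() {
  // Hand work to a detached thread, then block until all outstanding
  // tasks have completed.
  DynamicThreadPoolTaskDispatcher D;
  D.dispatch(makeGenericNamedTask([]() { /* do some work */ },
                                  "example task"));
  D.shutdown(); // Waits for the Outstanding count to reach zero.
}
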
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/JITSymbol.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/JITSymbol.cpp
index 0f6f9efe1102..210fbf6e43e3 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/JITSymbol.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/JITSymbol.cpp
@@ -84,7 +84,7 @@ llvm::JITSymbolFlags::fromObjectSymbol(const object::SymbolRef &Symbol) {
if (!SymbolType)
return SymbolType.takeError();
- if (*SymbolType & object::SymbolRef::ST_Function)
+ if (*SymbolType == object::SymbolRef::ST_Function)
Flags |= JITSymbolFlags::Callable;
return Flags;
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
index b6ccd02405c1..9c8d402364c6 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
@@ -67,7 +67,9 @@ static void __deregister_frame(void *p) {
}
#endif
-#ifdef __APPLE__
+/* libgcc and libunwind __register_frame behave differently. We use the presence
+ * of __unw_add_dynamic_fde to detect libunwind. */
+#if defined(HAVE_UNW_ADD_DYNAMIC_FDE) || defined(__APPLE__)
static const char *processFDE(const char *Entry, bool isDeregister) {
const char *P = Entry;
@@ -284,7 +286,7 @@ void *RTDyldMemoryManager::getPointerToNamedFunction(const std::string &Name,
uint64_t Addr = getSymbolAddress(Name);
if (!Addr && AbortOnFailure)
- report_fatal_error("Program used external function '" + Name +
+ report_fatal_error(Twine("Program used external function '") + Name +
"' which could not be resolved!");
return (void*)Addr;
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index 687fd839805f..f16c6bdbfa4f 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -520,6 +520,13 @@ static bool isZeroInit(const SectionRef Section) {
SectionType == MachO::S_GB_ZEROFILL;
}
+static bool isTLS(const SectionRef Section) {
+ const ObjectFile *Obj = Section.getObject();
+ if (isa<object::ELFObjectFileBase>(Obj))
+ return ELFSectionRef(Section).getFlags() & ELF::SHF_TLS;
+ return false;
+}
+
// Compute an upper bound of the memory size that is required to load all
// sections
Error RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
@@ -549,6 +556,7 @@ Error RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
unsigned Alignment = (unsigned)Alignment64 & 0xffffffffL;
bool IsCode = Section.isText();
bool IsReadOnly = isReadOnlyData(Section);
+ bool IsTLS = isTLS(Section);
Expected<StringRef> NameOrErr = Section.getName();
if (!NameOrErr)
@@ -582,7 +590,7 @@ Error RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
} else if (IsReadOnly) {
RODataAlign = std::max(RODataAlign, Alignment);
ROSectionSizes.push_back(SectionSize);
- } else {
+ } else if (!IsTLS) {
RWDataAlign = std::max(RWDataAlign, Alignment);
RWSectionSizes.push_back(SectionSize);
}
@@ -672,7 +680,7 @@ unsigned RuntimeDyldImpl::computeSectionStubBufSize(const ObjectFile &Obj,
Expected<section_iterator> RelSecOrErr = SI->getRelocatedSection();
if (!RelSecOrErr)
- report_fatal_error(toString(RelSecOrErr.takeError()));
+ report_fatal_error(Twine(toString(RelSecOrErr.takeError())));
section_iterator RelSecI = *RelSecOrErr;
if (!(RelSecI == Section))
@@ -800,6 +808,7 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
bool IsVirtual = Section.isVirtual();
bool IsZeroInit = isZeroInit(Section);
bool IsReadOnly = isReadOnlyData(Section);
+ bool IsTLS = isTLS(Section);
uint64_t DataSize = Section.getSize();
// An alignment of 0 (at least with ELF) is identical to an alignment of 1,
@@ -823,6 +832,7 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
uintptr_t Allocate;
unsigned SectionID = Sections.size();
uint8_t *Addr;
+ uint64_t LoadAddress = 0;
const char *pData = nullptr;
// If this section contains any bits (i.e. isn't a virtual or bss section),
@@ -851,10 +861,17 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
Allocate = DataSize + PaddingSize + StubBufSize;
if (!Allocate)
Allocate = 1;
- Addr = IsCode ? MemMgr.allocateCodeSection(Allocate, Alignment, SectionID,
- Name)
- : MemMgr.allocateDataSection(Allocate, Alignment, SectionID,
- Name, IsReadOnly);
+ if (IsTLS) {
+ auto TLSSection =
+ MemMgr.allocateTLSSection(Allocate, Alignment, SectionID, Name);
+ Addr = TLSSection.InitializationImage;
+ LoadAddress = TLSSection.Offset;
+ } else if (IsCode) {
+ Addr = MemMgr.allocateCodeSection(Allocate, Alignment, SectionID, Name);
+ } else {
+ Addr = MemMgr.allocateDataSection(Allocate, Alignment, SectionID, Name,
+ IsReadOnly);
+ }
if (!Addr)
report_fatal_error("Unable to allocate section memory!");
@@ -897,6 +914,10 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
Sections.push_back(
SectionEntry(Name, Addr, DataSize, Allocate, (uintptr_t)pData));
+ // The load address of a TLS section is not equal to the address of its
+ // initialization image
+ if (IsTLS)
+ Sections.back().setLoadAddress(LoadAddress);
// Debug info sections are linked as if their load address was zero
if (!IsRequired)
Sections.back().setLoadAddress(0);
@@ -1118,7 +1139,7 @@ void RuntimeDyldImpl::applyExternalSymbolRelocations(
// FIXME: Implement error handling that doesn't kill the host program!
if (!Addr && !Resolver.allowsZeroSymbols())
- report_fatal_error("Program used external function '" + Name +
+ report_fatal_error(Twine("Program used external function '") + Name +
"' which could not be resolved!");
// If Resolver returned UINT64_MAX, the client wants to handle this symbol
@@ -1261,6 +1282,14 @@ uint64_t RuntimeDyld::LoadedObjectInfo::getSectionLoadAddress(
return 0;
}
+RuntimeDyld::MemoryManager::TLSSection
+RuntimeDyld::MemoryManager::allocateTLSSection(uintptr_t Size,
+ unsigned Alignment,
+ unsigned SectionID,
+ StringRef SectionName) {
+ report_fatal_error("allocation of TLS not implemented");
+}
+
void RuntimeDyld::MemoryManager::anchor() {}
void JITSymbolResolver::anchor() {}
void LegacyJITSymbolResolver::anchor() {}
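A minimal sketch (not part of the patch) of how a JIT client could implement the new hook in its memory manager, assuming allocateTLSSection is a virtual member of RuntimeDyld::MemoryManager with the TLSSection fields (InitializationImage, Offset) used above; the class name and allocation strategy are illustrative, and a real client must return the thread-pointer-relative offset it actually reserved for the block:

#include <cstdint>
#include "llvm/ExecutionEngine/SectionMemoryManager.h"

// Hypothetical client-side memory manager; only the TLS hook is shown.
class TLSAwareMemoryManager : public llvm::SectionMemoryManager {
public:
  TLSSection allocateTLSSection(uintptr_t Size, unsigned Alignment,
                                unsigned SectionID,
                                llvm::StringRef SectionName) override {
    TLSSection Sec;
    // RuntimeDyld copies the section contents into InitializationImage and
    // applies relocations there, so ordinary RW data memory is sufficient.
    Sec.InitializationImage = allocateDataSection(
        Size, Alignment, SectionID, SectionName, /*IsReadOnly=*/false);
    // Placeholder: must be the offset of this block relative to the thread
    // pointer (%fs on x86-64) so that TPOFF relocations resolve correctly.
    Sec.Offset = 0;
    return Sec;
  }
};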
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
index a3005f786cf9..2b88c481dab0 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
@@ -232,6 +232,26 @@ private:
EvalResult(("Cannot decode unknown symbol '" + Symbol + "'").str()),
"");
+ // If an offset expression follows the symbol, parse it.
+ int64_t Offset = 0;
+ BinOpToken BinOp;
+ std::tie(BinOp, RemainingExpr) = parseBinOpToken(RemainingExpr);
+ switch (BinOp) {
+ case BinOpToken::Add: {
+ EvalResult Number;
+ std::tie(Number, RemainingExpr) = evalNumberExpr(RemainingExpr);
+ Offset = Number.getValue();
+ break;
+ }
+ case BinOpToken::Invalid:
+ break;
+ default:
+ return std::make_pair(
+ unexpectedToken(RemainingExpr, RemainingExpr,
+ "expected '+' for offset or ',' if no offset"),
+ "");
+ }
+
if (!RemainingExpr.startswith(","))
return std::make_pair(
unexpectedToken(RemainingExpr, RemainingExpr, "expected ','"), "");
@@ -249,7 +269,7 @@ private:
MCInst Inst;
uint64_t Size;
- if (!decodeInst(Symbol, Inst, Size))
+ if (!decodeInst(Symbol, Inst, Size, Offset))
return std::make_pair(
EvalResult(("Couldn't decode instruction at '" + Symbol + "'").str()),
"");
@@ -307,7 +327,7 @@ private:
MCInst Inst;
uint64_t InstSize;
- if (!decodeInst(Symbol, Inst, InstSize))
+ if (!decodeInst(Symbol, Inst, InstSize, 0))
return std::make_pair(
EvalResult(("Couldn't decode instruction at '" + Symbol + "'").str()),
"");
@@ -664,10 +684,12 @@ private:
return evalComplexExpr(std::make_pair(ThisResult, RemainingExpr), PCtx);
}
- bool decodeInst(StringRef Symbol, MCInst &Inst, uint64_t &Size) const {
+ bool decodeInst(StringRef Symbol, MCInst &Inst, uint64_t &Size,
+ int64_t Offset) const {
MCDisassembler *Dis = Checker.Disassembler;
StringRef SymbolMem = Checker.getSymbolContent(Symbol);
- ArrayRef<uint8_t> SymbolBytes(SymbolMem.bytes_begin(), SymbolMem.size());
+ ArrayRef<uint8_t> SymbolBytes(SymbolMem.bytes_begin() + Offset,
+ SymbolMem.size() - Offset);
MCDisassembler::DecodeStatus S =
Dis->getInstruction(Inst, Size, SymbolBytes, 0, nulls());
@@ -675,7 +697,7 @@ private:
return (S == MCDisassembler::Success);
}
};
-}
+} // namespace llvm
RuntimeDyldCheckerImpl::RuntimeDyldCheckerImpl(
IsSymbolValidFunction IsSymbolValid, GetSymbolInfoFunction GetSymbolInfo,
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index efe0b9cd61cd..1b7fdb588275 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -345,6 +345,32 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
support::ulittle64_t::ref(Section.getAddressWithOffset(Offset)) = GOTOffset;
break;
}
+ case ELF::R_X86_64_DTPMOD64: {
+ // We only have one DSO, so the module id is always 1.
+ support::ulittle64_t::ref(Section.getAddressWithOffset(Offset)) = 1;
+ break;
+ }
+ case ELF::R_X86_64_DTPOFF64:
+ case ELF::R_X86_64_TPOFF64: {
+ // DTPOFF64 should resolve to the offset in the TLS block, TPOFF64 to the
+ // offset in the *initial* TLS block. Since we are statically linking, all
+ // TLS blocks already exist in the initial block, so resolve both
+ // relocations equally.
+ support::ulittle64_t::ref(Section.getAddressWithOffset(Offset)) =
+ Value + Addend;
+ break;
+ }
+ case ELF::R_X86_64_DTPOFF32:
+ case ELF::R_X86_64_TPOFF32: {
+ // As for the (D)TPOFF64 relocations above, both DTPOFF32 and TPOFF32 can
+ // be resolved equally.
+ int64_t RealValue = Value + Addend;
+ assert(RealValue >= INT32_MIN && RealValue <= INT32_MAX);
+ int32_t TruncValue = RealValue;
+ support::ulittle32_t::ref(Section.getAddressWithOffset(Offset)) =
+ TruncValue;
+ break;
+ }
}
}
@@ -674,7 +700,7 @@ Error RuntimeDyldELF::findOPDEntrySection(const ELFObjectFileBase &Obj,
Expected<section_iterator> RelSecOrErr = si->getRelocatedSection();
if (!RelSecOrErr)
- report_fatal_error(toString(RelSecOrErr.takeError()));
+ report_fatal_error(Twine(toString(RelSecOrErr.takeError())));
section_iterator RelSecI = *RelSecOrErr;
if (RelSecI == Obj.section_end())
@@ -1210,8 +1236,7 @@ RuntimeDyldELF::processRelocationRef(
std::string Buf;
raw_string_ostream OS(Buf);
logAllUnhandledErrors(SymTypeOrErr.takeError(), OS);
- OS.flush();
- report_fatal_error(Buf);
+ report_fatal_error(Twine(OS.str()));
}
SymType = *SymTypeOrErr;
}
@@ -1231,8 +1256,7 @@ RuntimeDyldELF::processRelocationRef(
std::string Buf;
raw_string_ostream OS(Buf);
logAllUnhandledErrors(SectionOrErr.takeError(), OS);
- OS.flush();
- report_fatal_error(Buf);
+ report_fatal_error(Twine(OS.str()));
}
section_iterator si = *SectionOrErr;
if (si == Obj.section_end())
@@ -1813,11 +1837,14 @@ RuntimeDyldELF::processRelocationRef(
addRelocationForSymbol(RE, Value.SymbolName);
else
addRelocationForSection(RE, Value.SectionID);
- } else if (RelType == ELF::R_X86_64_GOTPC64) {
+ } else if (RelType == ELF::R_X86_64_GOTPC32) {
// Materialize the address of the base of the GOT relative to the PC.
// This doesn't create a GOT entry, but it does mean we need a GOT
// section.
(void)allocateGOTEntries(0);
+ resolveGOTOffsetRelocation(SectionID, Offset, Addend, ELF::R_X86_64_PC32);
+ } else if (RelType == ELF::R_X86_64_GOTPC64) {
+ (void)allocateGOTEntries(0);
resolveGOTOffsetRelocation(SectionID, Offset, Addend, ELF::R_X86_64_PC64);
} else if (RelType == ELF::R_X86_64_GOTOFF64) {
// GOTOFF relocations ultimately require a section difference relocation.
@@ -1829,6 +1856,15 @@ RuntimeDyldELF::processRelocationRef(
} else if (RelType == ELF::R_X86_64_PC64) {
Value.Addend += support::ulittle64_t::ref(computePlaceholderAddress(SectionID, Offset));
processSimpleRelocation(SectionID, Offset, RelType, Value);
+ } else if (RelType == ELF::R_X86_64_GOTTPOFF) {
+ processX86_64GOTTPOFFRelocation(SectionID, Offset, Value, Addend);
+ } else if (RelType == ELF::R_X86_64_TLSGD ||
+ RelType == ELF::R_X86_64_TLSLD) {
+ // The next relocation must be the relocation for __tls_get_addr.
+ ++RelI;
+ auto &GetAddrRelocation = *RelI;
+ processX86_64TLSRelocation(SectionID, Offset, RelType, Value, Addend,
+ GetAddrRelocation);
} else {
processSimpleRelocation(SectionID, Offset, RelType, Value);
}
@@ -1841,6 +1877,330 @@ RuntimeDyldELF::processRelocationRef(
return ++RelI;
}
+void RuntimeDyldELF::processX86_64GOTTPOFFRelocation(unsigned SectionID,
+ uint64_t Offset,
+ RelocationValueRef Value,
+ int64_t Addend) {
+ // Use the approach from "x86-64 Linker Optimizations" from the TLS spec
+ // to replace the GOTTPOFF relocation with a TPOFF relocation. The spec
+ // only mentions one optimization even though there are two different
+ // code sequences for the Initial Exec TLS Model. We match the code to
+ // find out which one was used.
+
+ // A possible TLS code sequence and its replacement
+ struct CodeSequence {
+ // The expected code sequence
+ ArrayRef<uint8_t> ExpectedCodeSequence;
+ // The offset of the GOTTPOFF relocation from the start of the sequence
+ // (subtract this from the relocation offset to get the sequence start)
+ uint64_t TLSSequenceOffset;
+ // The new code sequence
+ ArrayRef<uint8_t> NewCodeSequence;
+ // The offset of the new TPOFF relocation
+ uint64_t TpoffRelocationOffset;
+ };
+
+ std::array<CodeSequence, 2> CodeSequences;
+
+ // Initial Exec TLS Model Sequence
+ {
+ static const std::initializer_list<uint8_t> ExpectedCodeSequenceList = {
+ 0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00,
+ 0x00, // mov %fs:0, %rax
+ 0x48, 0x03, 0x05, 0x00, 0x00, 0x00, 0x00 // add x@gottpoff(%rip),
+ // %rax
+ };
+ CodeSequences[0].ExpectedCodeSequence =
+ ArrayRef<uint8_t>(ExpectedCodeSequenceList);
+ CodeSequences[0].TLSSequenceOffset = 12;
+
+ static const std::initializer_list<uint8_t> NewCodeSequenceList = {
+ 0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00, // mov %fs:0, %rax
+ 0x48, 0x8d, 0x80, 0x00, 0x00, 0x00, 0x00 // lea x@tpoff(%rax), %rax
+ };
+ CodeSequences[0].NewCodeSequence = ArrayRef<uint8_t>(NewCodeSequenceList);
+ CodeSequences[0].TpoffRelocationOffset = 12;
+ }
+
+ // Initial Exec TLS Model Sequence, II
+ {
+ static const std::initializer_list<uint8_t> ExpectedCodeSequenceList = {
+ 0x48, 0x8b, 0x05, 0x00, 0x00, 0x00, 0x00, // mov x@gottpoff(%rip), %rax
+ 0x64, 0x48, 0x8b, 0x00, 0x00, 0x00, 0x00 // mov %fs:(%rax), %rax
+ };
+ CodeSequences[1].ExpectedCodeSequence =
+ ArrayRef<uint8_t>(ExpectedCodeSequenceList);
+ CodeSequences[1].TLSSequenceOffset = 3;
+
+ static const std::initializer_list<uint8_t> NewCodeSequenceList = {
+ 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00, // 6 byte nop
+ 0x64, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00, // mov %fs:x@tpoff, %rax
+ };
+ CodeSequences[1].NewCodeSequence = ArrayRef<uint8_t>(NewCodeSequenceList);
+ CodeSequences[1].TpoffRelocationOffset = 10;
+ }
+
+ bool Resolved = false;
+ auto &Section = Sections[SectionID];
+ for (const auto &C : CodeSequences) {
+ assert(C.ExpectedCodeSequence.size() == C.NewCodeSequence.size() &&
+ "Old and new code sequences must have the same size");
+
+ if (Offset < C.TLSSequenceOffset ||
+ (Offset - C.TLSSequenceOffset + C.NewCodeSequence.size()) >
+ Section.getSize()) {
+ // This can't be a matching sequence as it doesn't fit in the current
+ // section
+ continue;
+ }
+
+ auto TLSSequenceStartOffset = Offset - C.TLSSequenceOffset;
+ auto *TLSSequence = Section.getAddressWithOffset(TLSSequenceStartOffset);
+ if (ArrayRef<uint8_t>(TLSSequence, C.ExpectedCodeSequence.size()) !=
+ C.ExpectedCodeSequence) {
+ continue;
+ }
+
+ memcpy(TLSSequence, C.NewCodeSequence.data(), C.NewCodeSequence.size());
+
+ // The original GOTTPOFF relocation has an addend as it is PC relative,
+ // so it needs to be corrected. The TPOFF32 relocation is used as an
+ // absolute value (which is an offset from %fs:0), so remove the addend
+ // again.
+ RelocationEntry RE(SectionID,
+ TLSSequenceStartOffset + C.TpoffRelocationOffset,
+ ELF::R_X86_64_TPOFF32, Value.Addend - Addend);
+
+ if (Value.SymbolName)
+ addRelocationForSymbol(RE, Value.SymbolName);
+ else
+ addRelocationForSection(RE, Value.SectionID);
+
+ Resolved = true;
+ break;
+ }
+
+ if (!Resolved) {
+ // The GOTTPOFF relocation was not used in one of the sequences
+ // described in the spec, so we can't optimize it to a TPOFF
+ // relocation.
+ uint64_t GOTOffset = allocateGOTEntries(1);
+ resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend,
+ ELF::R_X86_64_PC32);
+ RelocationEntry RE =
+ computeGOTOffsetRE(GOTOffset, Value.Offset, ELF::R_X86_64_TPOFF64);
+ if (Value.SymbolName)
+ addRelocationForSymbol(RE, Value.SymbolName);
+ else
+ addRelocationForSection(RE, Value.SectionID);
+ }
+}
+
+void RuntimeDyldELF::processX86_64TLSRelocation(
+ unsigned SectionID, uint64_t Offset, uint64_t RelType,
+ RelocationValueRef Value, int64_t Addend,
+ const RelocationRef &GetAddrRelocation) {
+ // Since we are statically linking and have no additional DSOs, we can resolve
+ // the relocation directly without using __tls_get_addr.
+ // Use the approach from "x86-64 Linker Optimizations" from the TLS spec
+ // to replace it with the Local Exec relocation variant.
+
+ // Find out whether the code was compiled with the large or small memory
+ // model by looking at the next relocation, which is the relocation for the
+ // call to __tls_get_addr. A 32-bit relocation means the small code model;
+ // a 64-bit relocation means the large code model.
+ bool IsSmallCodeModel;
+ // Is the relocation for the __tls_get_addr a PC-relative GOT relocation?
+ bool IsGOTPCRel = false;
+
+ switch (GetAddrRelocation.getType()) {
+ case ELF::R_X86_64_GOTPCREL:
+ case ELF::R_X86_64_REX_GOTPCRELX:
+ case ELF::R_X86_64_GOTPCRELX:
+ IsGOTPCRel = true;
+ LLVM_FALLTHROUGH;
+ case ELF::R_X86_64_PLT32:
+ IsSmallCodeModel = true;
+ break;
+ case ELF::R_X86_64_PLTOFF64:
+ IsSmallCodeModel = false;
+ break;
+ default:
+ report_fatal_error(
+ "invalid TLS relocations for General/Local Dynamic TLS Model: "
+ "expected PLT or GOT relocation for __tls_get_addr function");
+ }
+
+ // The offset of the TLSGD/TLSLD relocation from the start of the TLS code
+ // sequence (subtract this from the relocation offset to get the sequence start)
+ uint64_t TLSSequenceOffset;
+ // The expected start of the code sequence
+ ArrayRef<uint8_t> ExpectedCodeSequence;
+ // The new TLS code sequence that will replace the existing code
+ ArrayRef<uint8_t> NewCodeSequence;
+
+ if (RelType == ELF::R_X86_64_TLSGD) {
+ // The offset of the new TPOFF32 relocation (offset starting from the
+ // beginning of the whole TLS sequence)
+ uint64_t TpoffRelocOffset;
+
+ if (IsSmallCodeModel) {
+ if (!IsGOTPCRel) {
+ static const std::initializer_list<uint8_t> CodeSequence = {
+ 0x66, // data16 (no-op prefix)
+ 0x48, 0x8d, 0x3d, 0x00, 0x00,
+ 0x00, 0x00, // lea <disp32>(%rip), %rdi
+ 0x66, 0x66, // two data16 prefixes
+ 0x48, // rex64 (no-op prefix)
+ 0xe8, 0x00, 0x00, 0x00, 0x00 // call __tls_get_addr@plt
+ };
+ ExpectedCodeSequence = ArrayRef<uint8_t>(CodeSequence);
+ TLSSequenceOffset = 4;
+ } else {
+ // This code sequence is not described in the TLS spec but gcc
+ // generates it sometimes.
+ static const std::initializer_list<uint8_t> CodeSequence = {
+ 0x66, // data16 (no-op prefix)
+ 0x48, 0x8d, 0x3d, 0x00, 0x00,
+ 0x00, 0x00, // lea <disp32>(%rip), %rdi
+ 0x66, // data16 prefix (no-op prefix)
+ 0x48, // rex64 (no-op prefix)
+ 0xff, 0x15, 0x00, 0x00, 0x00,
+ 0x00 // call *__tls_get_addr@gotpcrel(%rip)
+ };
+ ExpectedCodeSequence = ArrayRef<uint8_t>(CodeSequence);
+ TLSSequenceOffset = 4;
+ }
+
+ // The replacement code for the small code model. It's the same for
+ // both sequences.
+ static const std::initializer_list<uint8_t> SmallSequence = {
+ 0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00,
+ 0x00, // mov %fs:0, %rax
+ 0x48, 0x8d, 0x80, 0x00, 0x00, 0x00, 0x00 // lea x@tpoff(%rax),
+ // %rax
+ };
+ NewCodeSequence = ArrayRef<uint8_t>(SmallSequence);
+ TpoffRelocOffset = 12;
+ } else {
+ static const std::initializer_list<uint8_t> CodeSequence = {
+ 0x48, 0x8d, 0x3d, 0x00, 0x00, 0x00, 0x00, // lea <disp32>(%rip),
+ // %rdi
+ 0x48, 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, // movabs $__tls_get_addr@pltoff, %rax
+ 0x48, 0x01, 0xd8, // add %rbx, %rax
+ 0xff, 0xd0 // call *%rax
+ };
+ ExpectedCodeSequence = ArrayRef<uint8_t>(CodeSequence);
+ TLSSequenceOffset = 3;
+
+ // The replacement code for the large code model
+ static const std::initializer_list<uint8_t> LargeSequence = {
+ 0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00,
+ 0x00, // mov %fs:0, %rax
+ 0x48, 0x8d, 0x80, 0x00, 0x00, 0x00, 0x00, // lea x@tpoff(%rax),
+ // %rax
+ 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00 // nopw 0x0(%rax,%rax,1)
+ };
+ NewCodeSequence = ArrayRef<uint8_t>(LargeSequence);
+ TpoffRelocOffset = 12;
+ }
+
+ // The TLSGD/TLSLD relocations are PC-relative, so they have an addend.
+ // The new TPOFF32 relocation is used as an absolute offset from
+ // %fs:0, so remove the TLSGD/TLSLD addend again.
+ RelocationEntry RE(SectionID, Offset - TLSSequenceOffset + TpoffRelocOffset,
+ ELF::R_X86_64_TPOFF32, Value.Addend - Addend);
+ if (Value.SymbolName)
+ addRelocationForSymbol(RE, Value.SymbolName);
+ else
+ addRelocationForSection(RE, Value.SectionID);
+ } else if (RelType == ELF::R_X86_64_TLSLD) {
+ if (IsSmallCodeModel) {
+ if (!IsGOTPCRel) {
+ static const std::initializer_list<uint8_t> CodeSequence = {
+ 0x48, 0x8d, 0x3d, 0x00, 0x00, 0x00, 0x00, // leaq <disp32>(%rip), %rdi
+ 0xe8, 0x00, 0x00, 0x00, 0x00 // call __tls_get_addr@plt
+ };
+ ExpectedCodeSequence = ArrayRef<uint8_t>(CodeSequence);
+ TLSSequenceOffset = 3;
+
+ // The replacement code for the small code model
+ static const std::initializer_list<uint8_t> SmallSequence = {
+ 0x66, 0x66, 0x66, // three data16 prefixes (no-op)
+ 0x64, 0x48, 0x8b, 0x04, 0x25,
+ 0x00, 0x00, 0x00, 0x00 // mov %fs:0, %rax
+ };
+ NewCodeSequence = ArrayRef<uint8_t>(SmallSequence);
+ } else {
+ // This code sequence is not described in the TLS spec but gcc
+ // generates it sometimes.
+ static const std::initializer_list<uint8_t> CodeSequence = {
+ 0x48, 0x8d, 0x3d, 0x00,
+ 0x00, 0x00, 0x00, // leaq <disp32>(%rip), %rdi
+ 0xff, 0x15, 0x00, 0x00,
+ 0x00, 0x00 // call
+ // *__tls_get_addr@gotpcrel(%rip)
+ };
+ ExpectedCodeSequence = ArrayRef<uint8_t>(CodeSequence);
+ TLSSequenceOffset = 3;
+
+ // The replacement code is just like the one above, but it needs to be
+ // one byte longer.
+ static const std::initializer_list<uint8_t> SmallSequence = {
+ 0x0f, 0x1f, 0x40, 0x00, // 4 byte nop
+ 0x64, 0x48, 0x8b, 0x04, 0x25,
+ 0x00, 0x00, 0x00, 0x00 // mov %fs:0, %rax
+ };
+ NewCodeSequence = ArrayRef<uint8_t>(SmallSequence);
+ }
+ } else {
+ // This is the same sequence as the TLSGD sequence for the large memory
+ // model above.
+ static const std::initializer_list<uint8_t> CodeSequence = {
+ 0x48, 0x8d, 0x3d, 0x00, 0x00, 0x00, 0x00, // lea <disp32>(%rip),
+ // %rdi
+ 0x48, 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, // movabs $__tls_get_addr@pltoff, %rax
+ 0x48, 0x01, 0xd8, // add %rbx, %rax
+ 0xff, 0xd0 // call *%rax
+ };
+ ExpectedCodeSequence = ArrayRef<uint8_t>(CodeSequence);
+ TLSSequenceOffset = 3;
+
+ // The replacement code for the large code model
+ static const std::initializer_list<uint8_t> LargeSequence = {
+ 0x66, 0x66, 0x66, // three data16 prefixes (no-op)
+ 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00,
+ 0x00, // 10 byte nop
+ 0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00 // mov %fs:0,%rax
+ };
+ NewCodeSequence = ArrayRef<uint8_t>(LargeSequence);
+ }
+ } else {
+ llvm_unreachable("both TLS relocations handled above");
+ }
+
+ assert(ExpectedCodeSequence.size() == NewCodeSequence.size() &&
+ "Old and new code sequences must have the same size");
+
+ auto &Section = Sections[SectionID];
+ if (Offset < TLSSequenceOffset ||
+ (Offset - TLSSequenceOffset + NewCodeSequence.size()) >
+ Section.getSize()) {
+ report_fatal_error("unexpected end of section in TLS sequence");
+ }
+
+ auto *TLSSequence = Section.getAddressWithOffset(Offset - TLSSequenceOffset);
+ if (ArrayRef<uint8_t>(TLSSequence, ExpectedCodeSequence.size()) !=
+ ExpectedCodeSequence) {
+ report_fatal_error(
+ "invalid TLS sequence for Global/Local Dynamic TLS Model");
+ }
+
+ memcpy(TLSSequence, NewCodeSequence.data(), NewCodeSequence.size());
+}
+
size_t RuntimeDyldELF::getGOTEntrySize() {
// We don't use the GOT in all of these cases, but it's essentially free
// to put them all here.
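For illustration only (not code from the patch), a translation unit such as the following, compiled for x86-64 with -fPIC, typically produces the General Dynamic access sequence (lea x@tlsgd(%rip),%rdi; call __tls_get_addr@plt) that processX86_64TLSRelocation above rewrites into the Local Exec form (mov %fs:0,%rax; lea x@tpoff(%rax),%rax); the rewrite is valid here because the JIT links everything into a single static image. Names are arbitrary:

// Each access to Counter goes through a TLS access sequence that the
// code above patches while resolving relocations.
thread_local int Counter = 0;

int bump() { return ++Counter; }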
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index 31892b7466e6..1251036f4caa 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -161,6 +161,18 @@ private:
bool relocationNeedsGot(const RelocationRef &R) const override;
bool relocationNeedsStub(const RelocationRef &R) const override;
+ // Process a GOTTPOFF TLS relocation for x86-64
+ // NOLINTNEXTLINE(readability-identifier-naming)
+ void processX86_64GOTTPOFFRelocation(unsigned SectionID, uint64_t Offset,
+ RelocationValueRef Value,
+ int64_t Addend);
+ // Process a TLSLD/TLSGD relocation for x86-64
+ // NOLINTNEXTLINE(readability-identifier-naming)
+ void processX86_64TLSRelocation(unsigned SectionID, uint64_t Offset,
+ uint64_t RelType, RelocationValueRef Value,
+ int64_t Addend,
+ const RelocationRef &GetAddrRelocation);
+
public:
RuntimeDyldELF(RuntimeDyld::MemoryManager &MemMgr,
JITSymbolResolver &Resolver);
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
index 721f2b14829a..dd66ff7ecf70 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
@@ -29,8 +29,7 @@ static bool isThumbFunc(object::symbol_iterator Symbol,
std::string Buf;
raw_string_ostream OS(Buf);
logAllUnhandledErrors(SymTypeOrErr.takeError(), OS);
- OS.flush();
- report_fatal_error(Buf);
+ report_fatal_error(Twine(OS.str()));
}
if (*SymTypeOrErr != object::SymbolRef::ST_Function)
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.h b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.h
index 14fb36f070f8..f03acb41d670 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.h
@@ -10,7 +10,6 @@
#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDELFMIPS_H
#include "../RuntimeDyldELF.h"
-#include <string>
#define DEBUG_TYPE "dyld"
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
index a76958a9e2c2..fcf723aaea28 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
@@ -10,7 +10,6 @@
#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDMACHOARM_H
#include "../RuntimeDyldMachO.h"
-#include <string>
#define DEBUG_TYPE "dyld"
@@ -141,7 +140,7 @@ public:
return ++RelI;
}
- // Sanity check relocation type.
+ // Validate the relocation type.
switch (RelType) {
UNIMPLEMENTED_RELOC(MachO::ARM_RELOC_PAIR);
UNIMPLEMENTED_RELOC(MachO::ARM_RELOC_SECTDIFF);
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h
index 523deb29b723..d029d3266f79 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h
@@ -10,7 +10,6 @@
#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDMACHOI386_H
#include "../RuntimeDyldMachO.h"
-#include <string>
#define DEBUG_TYPE "dyld"
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
index 28febbdb948c..a4d91cf338cb 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
@@ -10,7 +10,6 @@
#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDMACHOX86_64_H
#include "../RuntimeDyldMachO.h"
-#include <string>
#define DEBUG_TYPE "dyld"
diff --git a/llvm/lib/ExecutionEngine/TargetSelect.cpp b/llvm/lib/ExecutionEngine/TargetSelect.cpp
index 28ea04be1a5e..c67a1a7661d6 100644
--- a/llvm/lib/ExecutionEngine/TargetSelect.cpp
+++ b/llvm/lib/ExecutionEngine/TargetSelect.cpp
@@ -17,8 +17,8 @@
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Host.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp
index 04476d999336..c962231cbdc1 100644
--- a/llvm/lib/FileCheck/FileCheck.cpp
+++ b/llvm/lib/FileCheck/FileCheck.cpp
@@ -954,8 +954,8 @@ bool Pattern::parsePattern(StringRef PatternStr, StringRef Prefix,
// Check to see if this is a fixed string, or if it has regex pieces.
if (!MatchFullLinesHere &&
- (PatternStr.size() < 2 || (PatternStr.find("{{") == StringRef::npos &&
- PatternStr.find("[[") == StringRef::npos))) {
+ (PatternStr.size() < 2 ||
+ (!PatternStr.contains("{{") && !PatternStr.contains("[[")))) {
FixedStr = PatternStr;
return false;
}
@@ -1034,7 +1034,8 @@ bool Pattern::parsePattern(StringRef PatternStr, StringRef Prefix,
bool IsLegacyLineExpr = false;
StringRef DefName;
StringRef SubstStr;
- std::string MatchRegexp;
+ StringRef MatchRegexp;
+ std::string WildcardRegexp;
size_t SubstInsertIdx = RegExStr.size();
// Parse string variable or legacy @LINE expression.
@@ -1078,7 +1079,7 @@ bool Pattern::parsePattern(StringRef PatternStr, StringRef Prefix,
return true;
}
DefName = Name;
- MatchRegexp = MatchStr.str();
+ MatchRegexp = MatchStr;
} else {
if (IsPseudo) {
MatchStr = OrigMatchStr;
@@ -1117,7 +1118,8 @@ bool Pattern::parsePattern(StringRef PatternStr, StringRef Prefix,
SubstStr = MatchStr;
else {
ExpressionFormat Format = ExpressionPointer->getFormat();
- MatchRegexp = cantFail(Format.getWildcardRegex());
+ WildcardRegexp = cantFail(Format.getWildcardRegex());
+ MatchRegexp = WildcardRegexp;
}
}
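The FileCheck change above makes MatchRegexp a StringRef and keeps the owning std::string in the named local WildcardRegexp so the reference cannot dangle. A stand-alone sketch of the same lifetime pattern, with illustrative names that are not from FileCheck:

#include <string>
#include "llvm/ADT/StringRef.h"

static std::string makeWildcardRegex() { return "[0-9]+"; }

static void example() {
  // Binding a StringRef directly to makeWildcardRegex() would leave it
  // pointing into a destroyed temporary; keep the string in a named owner
  // whose lifetime covers every use of the StringRef.
  std::string Owned = makeWildcardRegex();
  llvm::StringRef Ref = Owned; // valid while Owned is alive
  (void)Ref;
}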
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 76954f9a37e1..ce998df757ec 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -13,18 +13,29 @@
//===----------------------------------------------------------------------===//
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
-
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
+#include "llvm/Transforms/Utils/LoopPeel.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
#include <sstream>
@@ -39,16 +50,22 @@ static cl::opt<bool>
"'as-if' properties of runtime calls."),
cl::init(false));
+static cl::opt<double> UnrollThresholdFactor(
+ "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
+ cl::desc("Factor for the unroll threshold to account for code "
+ "simplifications still taking place"),
+ cl::init(1.5));
+
void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
LLVMContext &Ctx = Fn.getContext();
// Get the function's current attributes.
auto Attrs = Fn.getAttributes();
- auto FnAttrs = Attrs.getFnAttributes();
- auto RetAttrs = Attrs.getRetAttributes();
+ auto FnAttrs = Attrs.getFnAttrs();
+ auto RetAttrs = Attrs.getRetAttrs();
SmallVector<AttributeSet, 4> ArgAttrs;
for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
- ArgAttrs.emplace_back(Attrs.getParamAttributes(ArgNo));
+ ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
@@ -228,6 +245,16 @@ OpenMPIRBuilder::~OpenMPIRBuilder() {
assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}
+GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
+ IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
+ auto *GV =
+ new GlobalVariable(M, I32Ty,
+ /* isConstant = */ true, GlobalValue::WeakODRLinkage,
+ ConstantInt::get(I32Ty, Value), Name);
+
+ return GV;
+}
+
Value *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
IdentFlag LocFlags,
unsigned Reserve2Flags) {
@@ -241,32 +268,29 @@ Value *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
Constant *IdentData[] = {
I32Null, ConstantInt::get(Int32, uint32_t(LocFlags)),
ConstantInt::get(Int32, Reserve2Flags), I32Null, SrcLocStr};
- Constant *Initializer = ConstantStruct::get(
- cast<StructType>(IdentPtr->getPointerElementType()), IdentData);
+ Constant *Initializer =
+ ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
// Look for existing encoding of the location + flags, not needed but
// minimizes the difference to the existing solution while we transition.
for (GlobalVariable &GV : M.getGlobalList())
- if (GV.getType() == IdentPtr && GV.hasInitializer())
+ if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
if (GV.getInitializer() == Initializer)
- return Ident = &GV;
-
- auto *GV = new GlobalVariable(M, IdentPtr->getPointerElementType(),
- /* isConstant = */ true,
- GlobalValue::PrivateLinkage, Initializer);
- GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- GV->setAlignment(Align(8));
- Ident = GV;
+ Ident = &GV;
+
+ if (!Ident) {
+ auto *GV = new GlobalVariable(
+ M, OpenMPIRBuilder::Ident,
+ /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
+ nullptr, GlobalValue::NotThreadLocal,
+ M.getDataLayout().getDefaultGlobalsAddressSpace());
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ GV->setAlignment(Align(8));
+ Ident = GV;
+ }
}
- return Builder.CreatePointerCast(Ident, IdentPtr);
-}
-Type *OpenMPIRBuilder::getLanemaskType() {
- LLVMContext &Ctx = M.getContext();
- Triple triple(M.getTargetTriple());
-
- // This test is adequate until deviceRTL has finer grained lane widths
- return triple.isAMDGCN() ? Type::getInt64Ty(Ctx) : Type::getInt32Ty(Ctx);
+ return Builder.CreatePointerCast(Ident, IdentPtr);
}
Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr) {
@@ -310,9 +334,8 @@ Constant *OpenMPIRBuilder::getOrCreateDefaultSrcLocStr() {
return getOrCreateSrcLocStr(";unknown;unknown;0;0;;");
}
-Constant *
-OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc) {
- DILocation *DIL = Loc.DL.get();
+Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL, Function *F) {
+ DILocation *DIL = DL.get();
if (!DIL)
return getOrCreateDefaultSrcLocStr();
StringRef FileName = M.getName();
@@ -320,12 +343,17 @@ OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc) {
if (Optional<StringRef> Source = DIF->getSource())
FileName = *Source;
StringRef Function = DIL->getScope()->getSubprogram()->getName();
- Function =
- !Function.empty() ? Function : Loc.IP.getBlock()->getParent()->getName();
+ if (Function.empty() && F)
+ Function = F->getName();
return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
DIL->getColumn());
}
+Constant *
+OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc) {
+ return getOrCreateSrcLocStr(Loc.DL, Loc.IP.getBlock()->getParent());
+}
+
Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
return Builder.CreateCall(
getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
@@ -581,8 +609,8 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
// Add some fake uses for OpenMP provided arguments.
ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
- Instruction *ZeroAddrUse = Builder.CreateLoad(Int32, ZeroAddr,
- "zero.addr.use");
+ Instruction *ZeroAddrUse =
+ Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
ToBeDeleted.push_back(ZeroAddrUse);
// ThenBB
@@ -965,8 +993,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections(
Value *ST = ConstantInt::get(I32Ty, 1);
llvm::CanonicalLoopInfo *LoopInfo = createCanonicalLoop(
Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
- LoopInfo = createStaticWorkshareLoop(Loc, LoopInfo, AllocaIP, true);
- BasicBlock *LoopAfterBB = LoopInfo->getAfter();
+ InsertPointTy AfterIP =
+ applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, true);
+ BasicBlock *LoopAfterBB = AfterIP.getBlock();
Instruction *SplitPos = LoopAfterBB->getTerminator();
if (!isa_and_nonnull<BranchInst>(SplitPos))
SplitPos = new UnreachableInst(Builder.getContext(), LoopAfterBB);
@@ -1022,6 +1051,179 @@ OpenMPIRBuilder::createSection(const LocationDescription &Loc,
/*IsCancellable*/ true);
}
+/// Create a function with a unique name and a "void (i8*, i8*)" signature in
+/// the given module and return it.
+Function *getFreshReductionFunc(Module &M) {
+ Type *VoidTy = Type::getVoidTy(M.getContext());
+ Type *Int8PtrTy = Type::getInt8PtrTy(M.getContext());
+ auto *FuncTy =
+ FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
+ return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+ M.getDataLayout().getDefaultGlobalsAddressSpace(),
+ ".omp.reduction.func", &M);
+}
+
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
+ const LocationDescription &Loc, InsertPointTy AllocaIP,
+ ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait) {
+ for (const ReductionInfo &RI : ReductionInfos) {
+ (void)RI;
+ assert(RI.Variable && "expected non-null variable");
+ assert(RI.PrivateVariable && "expected non-null private variable");
+ assert(RI.ReductionGen && "expected non-null reduction generator callback");
+ assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
+ "expected variables and their private equivalents to have the same "
+ "type");
+ assert(RI.Variable->getType()->isPointerTy() &&
+ "expected variables to be pointers");
+ }
+
+ if (!updateToLocation(Loc))
+ return InsertPointTy();
+
+ BasicBlock *InsertBlock = Loc.IP.getBlock();
+ BasicBlock *ContinuationBlock =
+ InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
+ InsertBlock->getTerminator()->eraseFromParent();
+
+ // Create and populate array of type-erased pointers to private reduction
+ // values.
+ unsigned NumReductions = ReductionInfos.size();
+ Type *RedArrayTy = ArrayType::get(Builder.getInt8PtrTy(), NumReductions);
+ Builder.restoreIP(AllocaIP);
+ Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
+
+ Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
+
+ for (auto En : enumerate(ReductionInfos)) {
+ unsigned Index = En.index();
+ const ReductionInfo &RI = En.value();
+ Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
+ RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
+ Value *Casted =
+ Builder.CreateBitCast(RI.PrivateVariable, Builder.getInt8PtrTy(),
+ "private.red.var." + Twine(Index) + ".casted");
+ Builder.CreateStore(Casted, RedArrayElemPtr);
+ }
+
+ // Emit a call to the runtime function that orchestrates the reduction.
+ // Declare the reduction function in the process.
+ Function *Func = Builder.GetInsertBlock()->getParent();
+ Module *Module = Func->getParent();
+ Value *RedArrayPtr =
+ Builder.CreateBitCast(RedArray, Builder.getInt8PtrTy(), "red.array.ptr");
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+ bool CanGenerateAtomic =
+ llvm::all_of(ReductionInfos, [](const ReductionInfo &RI) {
+ return RI.AtomicReductionGen;
+ });
+ Value *Ident = getOrCreateIdent(
+ SrcLocStr, CanGenerateAtomic ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
+ : IdentFlag(0));
+ Value *ThreadId = getOrCreateThreadID(Ident);
+ Constant *NumVariables = Builder.getInt32(NumReductions);
+ const DataLayout &DL = Module->getDataLayout();
+ unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
+ Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
+ Function *ReductionFunc = getFreshReductionFunc(*Module);
+ Value *Lock = getOMPCriticalRegionLock(".reduction");
+ Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
+ IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
+ : RuntimeFunction::OMPRTL___kmpc_reduce);
+ CallInst *ReduceCall =
+ Builder.CreateCall(ReduceFunc,
+ {Ident, ThreadId, NumVariables, RedArraySize,
+ RedArrayPtr, ReductionFunc, Lock},
+ "reduce");
+
+ // Create final reduction entry blocks for the atomic and non-atomic case.
+ // Emit IR that dispatches control flow to one of the blocks based on the
+ // reduction supporting the atomic mode.
+ BasicBlock *NonAtomicRedBlock =
+ BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
+ BasicBlock *AtomicRedBlock =
+ BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
+ SwitchInst *Switch =
+ Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
+ Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
+ Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
+
+ // Populate the non-atomic reduction using the elementwise reduction function.
+ // This loads the elements from the global and private variables and reduces
+ // them before storing back the result to the global variable.
+ Builder.SetInsertPoint(NonAtomicRedBlock);
+ for (auto En : enumerate(ReductionInfos)) {
+ const ReductionInfo &RI = En.value();
+ Type *ValueType = RI.getElementType();
+ Value *RedValue = Builder.CreateLoad(ValueType, RI.Variable,
+ "red.value." + Twine(En.index()));
+ Value *PrivateRedValue =
+ Builder.CreateLoad(ValueType, RI.PrivateVariable,
+ "red.private.value." + Twine(En.index()));
+ Value *Reduced;
+ Builder.restoreIP(
+ RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced));
+ if (!Builder.GetInsertBlock())
+ return InsertPointTy();
+ Builder.CreateStore(Reduced, RI.Variable);
+ }
+ Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
+ IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
+ : RuntimeFunction::OMPRTL___kmpc_end_reduce);
+ Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
+ Builder.CreateBr(ContinuationBlock);
+
+ // Populate the atomic reduction using the atomic elementwise reduction
+ // function. There are no loads/stores here because they happen inside the
+ // atomic elementwise reduction.
+ Builder.SetInsertPoint(AtomicRedBlock);
+ if (CanGenerateAtomic) {
+ for (const ReductionInfo &RI : ReductionInfos) {
+ Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.Variable,
+ RI.PrivateVariable));
+ if (!Builder.GetInsertBlock())
+ return InsertPointTy();
+ }
+ Builder.CreateBr(ContinuationBlock);
+ } else {
+ Builder.CreateUnreachable();
+ }
+
+ // Populate the outlined reduction function using the elementwise reduction
+ // function. Partial values are extracted from the type-erased array of
+ // pointers to private variables.
+ BasicBlock *ReductionFuncBlock =
+ BasicBlock::Create(Module->getContext(), "", ReductionFunc);
+ Builder.SetInsertPoint(ReductionFuncBlock);
+ Value *LHSArrayPtr = Builder.CreateBitCast(ReductionFunc->getArg(0),
+ RedArrayTy->getPointerTo());
+ Value *RHSArrayPtr = Builder.CreateBitCast(ReductionFunc->getArg(1),
+ RedArrayTy->getPointerTo());
+ for (auto En : enumerate(ReductionInfos)) {
+ const ReductionInfo &RI = En.value();
+ Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+ RedArrayTy, LHSArrayPtr, 0, En.index());
+ Value *LHSI8Ptr = Builder.CreateLoad(Builder.getInt8PtrTy(), LHSI8PtrPtr);
+ Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
+ Value *LHS = Builder.CreateLoad(RI.getElementType(), LHSPtr);
+ Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+ RedArrayTy, RHSArrayPtr, 0, En.index());
+ Value *RHSI8Ptr = Builder.CreateLoad(Builder.getInt8PtrTy(), RHSI8PtrPtr);
+ Value *RHSPtr =
+ Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
+ Value *RHS = Builder.CreateLoad(RI.getElementType(), RHSPtr);
+ Value *Reduced;
+ Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
+ if (!Builder.GetInsertBlock())
+ return InsertPointTy();
+ Builder.CreateStore(Reduced, LHSPtr);
+ }
+ Builder.CreateRetVoid();
+
+ Builder.SetInsertPoint(ContinuationBlock);
+ return Builder.saveIP();
+}
+
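For context, createReductions emits the __kmpc_reduce / __kmpc_end_reduce handshake and the outlined elementwise reduction function that a frontend needs when lowering a reduction clause. A consumer-level view of the construct it supports (illustrative program, not code from the patch):

#include <cstdio>

int main() {
  int Sum = 0;
  // Each thread reduces into a private copy of Sum; the private copies are
  // combined through the runtime protocol generated by createReductions.
#pragma omp parallel for reduction(+ : Sum)
  for (int I = 0; I < 1000; ++I)
    Sum += I;
  std::printf("Sum = %d\n", Sum);
  return 0;
}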
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
BodyGenCallbackTy BodyGenCB,
@@ -1133,8 +1335,6 @@ CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
CL->Exit = Exit;
CL->After = After;
- CL->IsValid = true;
-
#ifndef NDEBUG
CL->assertOK();
#endif
@@ -1271,14 +1471,17 @@ void setCanonicalLoopTripCount(CanonicalLoopInfo *CLI, Value *TripCount) {
CLI->assertOK();
}
-CanonicalLoopInfo *OpenMPIRBuilder::createStaticWorkshareLoop(
- const LocationDescription &Loc, CanonicalLoopInfo *CLI,
- InsertPointTy AllocaIP, bool NeedsBarrier, Value *Chunk) {
+OpenMPIRBuilder::InsertPointTy
+OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
+ InsertPointTy AllocaIP,
+ bool NeedsBarrier, Value *Chunk) {
+ assert(CLI->isValid() && "Requires a valid canonical loop");
+
// Set up the source location value for OpenMP runtime.
- if (!updateToLocation(Loc))
- return nullptr;
+ Builder.restoreIP(CLI->getPreheaderIP());
+ Builder.SetCurrentDebugLocation(DL);
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+ Constant *SrcLocStr = getOrCreateSrcLocStr(DL);
Value *SrcLoc = getOrCreateIdent(SrcLocStr);
// Declare useful OpenMP runtime functions.
@@ -1308,6 +1511,7 @@ CanonicalLoopInfo *OpenMPIRBuilder::createStaticWorkshareLoop(
Builder.CreateStore(UpperBound, PUpperBound);
Builder.CreateStore(One, PStride);
+ // FIXME: schedule(static) is NOT the same as schedule(static,1)
if (!Chunk)
Chunk = One;
@@ -1348,19 +1552,21 @@ CanonicalLoopInfo *OpenMPIRBuilder::createStaticWorkshareLoop(
// Add the barrier if requested.
if (NeedsBarrier)
- createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
+ createBarrier(LocationDescription(Builder.saveIP(), DL),
omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
/* CheckCancelFlag */ false);
- CLI->assertOK();
- return CLI;
+ InsertPointTy AfterIP = CLI->getAfterIP();
+ CLI->invalidate();
+
+ return AfterIP;
}
-CanonicalLoopInfo *OpenMPIRBuilder::createWorkshareLoop(
- const LocationDescription &Loc, CanonicalLoopInfo *CLI,
- InsertPointTy AllocaIP, bool NeedsBarrier) {
+OpenMPIRBuilder::InsertPointTy
+OpenMPIRBuilder::applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
+ InsertPointTy AllocaIP, bool NeedsBarrier) {
// Currently only supports static schedules.
- return createStaticWorkshareLoop(Loc, CLI, AllocaIP, NeedsBarrier);
+ return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
}
/// Returns an LLVM function to call for initializing loop bounds using OpenMP
@@ -1395,14 +1601,15 @@ getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
llvm_unreachable("unknown OpenMP loop iterator bitwidth");
}
-OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createDynamicWorkshareLoop(
- const LocationDescription &Loc, CanonicalLoopInfo *CLI,
- InsertPointTy AllocaIP, OMPScheduleType SchedType, bool NeedsBarrier,
- Value *Chunk) {
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop(
+ DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
+ OMPScheduleType SchedType, bool NeedsBarrier, Value *Chunk) {
+ assert(CLI->isValid() && "Requires a valid canonical loop");
+
// Set up the source location value for OpenMP runtime.
- Builder.SetCurrentDebugLocation(Loc.DL);
+ Builder.SetCurrentDebugLocation(DL);
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+ Constant *SrcLocStr = getOrCreateSrcLocStr(DL);
Value *SrcLoc = getOrCreateIdent(SrcLocStr);
// Declare useful OpenMP runtime functions.
@@ -1496,11 +1703,12 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createDynamicWorkshareLoop(
// Add the barrier if requested.
if (NeedsBarrier) {
Builder.SetInsertPoint(&Exit->back());
- createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
+ createBarrier(LocationDescription(Builder.saveIP(), DL),
omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
/* CheckCancelFlag */ false);
}
+ CLI->invalidate();
return AfterIP;
}
@@ -1592,6 +1800,8 @@ OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
// TODO: Find common/largest indvar type.
Value *CollapsedTripCount = nullptr;
for (CanonicalLoopInfo *L : Loops) {
+ assert(L->isValid() &&
+ "All loops to collapse must be valid canonical loops");
Value *OrigTripCount = L->getTripCount();
if (!CollapsedTripCount) {
CollapsedTripCount = OrigTripCount;
@@ -1680,6 +1890,9 @@ OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
Loop->collectControlBlocks(OldControlBBs);
removeUnusedBlocksFromParent(OldControlBBs);
+ for (CanonicalLoopInfo *L : Loops)
+ L->invalidate();
+
#ifndef NDEBUG
Result->assertOK();
#endif
@@ -1706,6 +1919,7 @@ OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
// any original CanonicalLoopInfo.
SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
for (CanonicalLoopInfo *L : Loops) {
+ assert(L->isValid() && "All input loops must be valid canonical loops");
OrigTripCounts.push_back(L->getTripCount());
OrigIndVars.push_back(L->getIndVar());
}
@@ -1864,6 +2078,9 @@ OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
Loop->collectControlBlocks(OldControlBBs);
removeUnusedBlocksFromParent(OldControlBBs);
+ for (CanonicalLoopInfo *L : Loops)
+ L->invalidate();
+
#ifndef NDEBUG
for (CanonicalLoopInfo *GenL : Result)
GenL->assertOK();
@@ -1871,6 +2088,287 @@ OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
return Result;
}
+/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
+/// loop already has metadata, the loop properties are appended.
+static void addLoopMetadata(CanonicalLoopInfo *Loop,
+ ArrayRef<Metadata *> Properties) {
+ assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
+
+ // Nothing to do if no property to attach.
+ if (Properties.empty())
+ return;
+
+ LLVMContext &Ctx = Loop->getFunction()->getContext();
+ SmallVector<Metadata *> NewLoopProperties;
+ NewLoopProperties.push_back(nullptr);
+
+ // If the loop already has metadata, prepend it to the new metadata.
+ BasicBlock *Latch = Loop->getLatch();
+ assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
+ MDNode *Existing = Latch->getTerminator()->getMetadata(LLVMContext::MD_loop);
+ if (Existing)
+ append_range(NewLoopProperties, drop_begin(Existing->operands(), 1));
+
+ append_range(NewLoopProperties, Properties);
+ MDNode *LoopID = MDNode::getDistinct(Ctx, NewLoopProperties);
+ LoopID->replaceOperandWith(0, LoopID);
+
+ Latch->getTerminator()->setMetadata(LLVMContext::MD_loop, LoopID);
+}
+
+void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
+ LLVMContext &Ctx = Builder.getContext();
+ addLoopMetadata(
+ Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
+ MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
+}
+
+void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
+ LLVMContext &Ctx = Builder.getContext();
+ addLoopMetadata(
+ Loop, {
+ MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
+ });
+}
+
+/// Create the TargetMachine object to query the backend for optimization
+/// preferences.
+///
+/// Ideally, this would be passed from the front-end to the OpenMPIRBuilder, but
+/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
+/// needed for the LLVM pass pipeline. We use some default options to avoid
+/// having to pass too many settings from the frontend that probably do not
+/// matter.
+///
+/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
+/// method. If we are going to use TargetMachine for more purposes, especially
+/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
+/// might be worth requiring front-ends to pass on their TargetMachine,
+/// or at least cache it between methods. Note that while frontends such as Clang
+/// have just a single main TargetMachine per translation unit, "target-cpu" and
+/// "target-features" that determine the TargetMachine are per-function and can
+/// be overridden using __attribute__((target("OPTIONS"))).
+static std::unique_ptr<TargetMachine>
+createTargetMachine(Function *F, CodeGenOpt::Level OptLevel) {
+ Module *M = F->getParent();
+
+ StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
+ StringRef Features = F->getFnAttribute("target-features").getValueAsString();
+ const std::string &Triple = M->getTargetTriple();
+
+ std::string Error;
+ const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
+ if (!TheTarget)
+ return {};
+
+ llvm::TargetOptions Options;
+ return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
+ Triple, CPU, Features, Options, /*RelocModel=*/None, /*CodeModel=*/None,
+ OptLevel));
+}
+
+/// Heuristically determine the best-performing unroll factor for \p CLI. This
+/// depends on the target processor. We are re-using the same heuristics as the
+/// LoopUnrollPass.
+static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
+ Function *F = CLI->getFunction();
+
+ // Assume the user requests the most aggressive unrolling, even if the rest of
+ // the code is optimized using a lower setting.
+ CodeGenOpt::Level OptLevel = CodeGenOpt::Aggressive;
+ std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
+
+ FunctionAnalysisManager FAM;
+ FAM.registerPass([]() { return TargetLibraryAnalysis(); });
+ FAM.registerPass([]() { return AssumptionAnalysis(); });
+ FAM.registerPass([]() { return DominatorTreeAnalysis(); });
+ FAM.registerPass([]() { return LoopAnalysis(); });
+ FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
+ FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
+ TargetIRAnalysis TIRA;
+ if (TM)
+ TIRA = TargetIRAnalysis(
+ [&](const Function &F) { return TM->getTargetTransformInfo(F); });
+ FAM.registerPass([&]() { return TIRA; });
+
+ TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
+ ScalarEvolutionAnalysis SEA;
+ ScalarEvolution &&SE = SEA.run(*F, FAM);
+ DominatorTreeAnalysis DTA;
+ DominatorTree &&DT = DTA.run(*F, FAM);
+ LoopAnalysis LIA;
+ LoopInfo &&LI = LIA.run(*F, FAM);
+ AssumptionAnalysis ACT;
+ AssumptionCache &&AC = ACT.run(*F, FAM);
+ OptimizationRemarkEmitter ORE{F};
+
+ Loop *L = LI.getLoopFor(CLI->getHeader());
+ assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
+
+ TargetTransformInfo::UnrollingPreferences UP =
+ gatherUnrollingPreferences(L, SE, TTI,
+ /*BlockFrequencyInfo=*/nullptr,
+ /*ProfileSummaryInfo=*/nullptr, ORE, OptLevel,
+ /*UserThreshold=*/None,
+ /*UserCount=*/None,
+ /*UserAllowPartial=*/true,
+ /*UserAllowRuntime=*/true,
+ /*UserUpperBound=*/None,
+ /*UserFullUnrollMaxCount=*/None);
+
+ UP.Force = true;
+
+ // Account for additional optimizations taking place before the LoopUnrollPass
+ // would unroll the loop.
+ UP.Threshold *= UnrollThresholdFactor;
+ UP.PartialThreshold *= UnrollThresholdFactor;
+
+ // Use normal unroll factors even if the rest of the code is optimized for
+ // size.
+ UP.OptSizeThreshold = UP.Threshold;
+ UP.PartialOptSizeThreshold = UP.PartialThreshold;
+
+ LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
+ << " Threshold=" << UP.Threshold << "\n"
+ << " PartialThreshold=" << UP.PartialThreshold << "\n"
+ << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
+ << " PartialOptSizeThreshold="
+ << UP.PartialOptSizeThreshold << "\n");
+
+ // Disable peeling.
+ TargetTransformInfo::PeelingPreferences PP =
+ gatherPeelingPreferences(L, SE, TTI,
+ /*UserAllowPeeling=*/false,
+ /*UserAllowProfileBasedPeeling=*/false,
+ /*UserUnrollingSpecficValues=*/false);
+
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+
+ // Assume that reads and writes to stack variables can be eliminated by
+ // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
+ // size.
+ for (BasicBlock *BB : L->blocks()) {
+ for (Instruction &I : *BB) {
+ Value *Ptr;
+ if (auto *Load = dyn_cast<LoadInst>(&I)) {
+ Ptr = Load->getPointerOperand();
+ } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
+ Ptr = Store->getPointerOperand();
+ } else
+ continue;
+
+ Ptr = Ptr->stripPointerCasts();
+
+ if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
+ if (Alloca->getParent() == &F->getEntryBlock())
+ EphValues.insert(&I);
+ }
+ }
+ }
+
+ unsigned NumInlineCandidates;
+ bool NotDuplicatable;
+ bool Convergent;
+ unsigned LoopSize =
+ ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
+ TTI, EphValues, UP.BEInsns);
+ LLVM_DEBUG(dbgs() << "Estimated loop size is " << LoopSize << "\n");
+
+ // Loop is not unrollable if the loop contains certain instructions.
+ if (NotDuplicatable || Convergent) {
+ LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
+ return 1;
+ }
+
+ // TODO: Determine the trip count of \p CLI if it is constant;
+ // computeUnrollCount might be able to use it.
+ int TripCount = 0;
+ int MaxTripCount = 0;
+ bool MaxOrZero = false;
+ unsigned TripMultiple = 0;
+
+ bool UseUpperBound = false;
+ computeUnrollCount(L, TTI, DT, &LI, SE, EphValues, &ORE, TripCount,
+ MaxTripCount, MaxOrZero, TripMultiple, LoopSize, UP, PP,
+ UseUpperBound);
+ unsigned Factor = UP.Count;
+ LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
+
+ // A return value of 1 signals that the loop should not be unrolled.
+ if (Factor == 0)
+ return 1;
+ return Factor;
+}
+
+void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
+ int32_t Factor,
+ CanonicalLoopInfo **UnrolledCLI) {
+ assert(Factor >= 0 && "Unroll factor must not be negative");
+
+ Function *F = Loop->getFunction();
+ LLVMContext &Ctx = F->getContext();
+
+ // If the unrolled loop is not used for another loop-associated directive, it
+ // is sufficient to add metadata for the LoopUnrollPass.
+ if (!UnrolledCLI) {
+ SmallVector<Metadata *, 2> LoopMetadata;
+ LoopMetadata.push_back(
+ MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
+
+ if (Factor >= 1) {
+ ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
+ LoopMetadata.push_back(MDNode::get(
+ Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
+ }
+
+ addLoopMetadata(Loop, LoopMetadata);
+ return;
+ }
+
+ // Heuristically determine the unroll factor.
+ if (Factor == 0)
+ Factor = computeHeuristicUnrollFactor(Loop);
+
+ // No change required with unroll factor 1.
+ if (Factor == 1) {
+ *UnrolledCLI = Loop;
+ return;
+ }
+
+ assert(Factor >= 2 &&
+ "unrolling only makes sense with a factor of 2 or larger");
+
+ Type *IndVarTy = Loop->getIndVarType();
+
+ // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
+ // unroll the inner loop.
+ Value *FactorVal =
+ ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
+ /*isSigned=*/false));
+ std::vector<CanonicalLoopInfo *> LoopNest =
+ tileLoops(DL, {Loop}, {FactorVal});
+ assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
+ *UnrolledCLI = LoopNest[0];
+ CanonicalLoopInfo *InnerLoop = LoopNest[1];
+
+ // LoopUnrollPass can only fully unroll loops with constant trip count.
+ // Unroll by the unroll factor with a fallback epilog for the remainder
+ // iterations if necessary.
+ ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
+ addLoopMetadata(
+ InnerLoop,
+ {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
+ MDNode::get(
+ Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
+
+#ifndef NDEBUG
+ (*UnrolledCLI)->assertOK();
+#endif
+}
+
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
llvm::Value *BufSize, llvm::Value *CpyBuf,
@@ -1960,6 +2458,74 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCritical(
/*Conditional*/ false, /*hasFinalize*/ true);
}
+OpenMPIRBuilder::InsertPointTy
+OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
+ InsertPointTy AllocaIP, unsigned NumLoops,
+ ArrayRef<llvm::Value *> StoreValues,
+ const Twine &Name, bool IsDependSource) {
+ if (!updateToLocation(Loc))
+ return Loc.IP;
+
+ // Allocate space for the depend vector and generate the alloca instruction.
+ auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
+ Builder.restoreIP(AllocaIP);
+ AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
+ ArgsBase->setAlignment(Align(8));
+ Builder.restoreIP(Loc.IP);
+
+ // Store each index value at its offset in the depend vector.
+ for (unsigned I = 0; I < NumLoops; ++I) {
+ Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
+ ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
+ Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
+ }
+
+ Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
+ ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
+
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+ Value *Ident = getOrCreateIdent(SrcLocStr);
+ Value *ThreadId = getOrCreateThreadID(Ident);
+ Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
+
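+ // A 'source' dependence posts this iteration's vector (__kmpc_doacross_post);
+ // a 'sink' dependence waits for it (__kmpc_doacross_wait).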
+ Function *RTLFn = nullptr;
+ if (IsDependSource)
+ RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
+ else
+ RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
+ Builder.CreateCall(RTLFn, Args);
+
+ return Builder.saveIP();
+}
+
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createOrderedThreadsSimd(
+ const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
+ FinalizeCallbackTy FiniCB, bool IsThreads) {
+ if (!updateToLocation(Loc))
+ return Loc.IP;
+
+ Directive OMPD = Directive::OMPD_ordered;
+ Instruction *EntryCall = nullptr;
+ Instruction *ExitCall = nullptr;
+
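+ // For 'ordered simd' (!IsThreads) no runtime calls are needed; the region
+ // body is emitted without entry/exit calls.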
+ if (IsThreads) {
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+ Value *Ident = getOrCreateIdent(SrcLocStr);
+ Value *ThreadId = getOrCreateThreadID(Ident);
+ Value *Args[] = {Ident, ThreadId};
+
+ Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
+ EntryCall = Builder.CreateCall(EntryRTLFn, Args);
+
+ Function *ExitRTLFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
+ ExitCall = Builder.CreateCall(ExitRTLFn, Args);
+ }
+
+ return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
+ /*Conditional*/ false, /*hasFinalize*/ true);
+}
+
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::EmitOMPInlinedRegion(
Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
@@ -2193,25 +2759,30 @@ CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
}
OpenMPIRBuilder::InsertPointTy
-OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD, bool RequiresFullRuntime) {
+OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
+ bool RequiresFullRuntime) {
if (!updateToLocation(Loc))
return Loc.IP;
Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
Value *Ident = getOrCreateIdent(SrcLocStr);
- ConstantInt *IsSPMDVal = ConstantInt::getBool(Int32->getContext(), IsSPMD);
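+ // The runtime expects an i8 execution-mode value (SPMD vs. generic) here
+ // instead of a boolean flag.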
+ ConstantInt *IsSPMDVal = ConstantInt::getSigned(
+ IntegerType::getInt8Ty(Int8->getContext()),
+ IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
ConstantInt *UseGenericStateMachine =
ConstantInt::getBool(Int32->getContext(), !IsSPMD);
- ConstantInt *RequiresFullRuntimeVal = ConstantInt::getBool(Int32->getContext(), RequiresFullRuntime);
+ ConstantInt *RequiresFullRuntimeVal =
+ ConstantInt::getBool(Int32->getContext(), RequiresFullRuntime);
Function *Fn = getOrCreateRuntimeFunctionPtr(
omp::RuntimeFunction::OMPRTL___kmpc_target_init);
- CallInst *ThreadKind =
- Builder.CreateCall(Fn, {Ident, IsSPMDVal, UseGenericStateMachine, RequiresFullRuntimeVal});
+ CallInst *ThreadKind = Builder.CreateCall(
+ Fn, {Ident, IsSPMDVal, UseGenericStateMachine, RequiresFullRuntimeVal});
Value *ExecUserCode = Builder.CreateICmpEQ(
- ThreadKind, ConstantInt::get(ThreadKind->getType(), -1), "exec_user_code");
+ ThreadKind, ConstantInt::get(ThreadKind->getType(), -1),
+ "exec_user_code");
// ThreadKind = __kmpc_target_init(...)
// if (ThreadKind == -1)
@@ -2241,14 +2812,18 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD, b
}
void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
- bool IsSPMD, bool RequiresFullRuntime) {
+ bool IsSPMD,
+ bool RequiresFullRuntime) {
if (!updateToLocation(Loc))
return;
Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
Value *Ident = getOrCreateIdent(SrcLocStr);
- ConstantInt *IsSPMDVal = ConstantInt::getBool(Int32->getContext(), IsSPMD);
- ConstantInt *RequiresFullRuntimeVal = ConstantInt::getBool(Int32->getContext(), RequiresFullRuntime);
+ ConstantInt *IsSPMDVal = ConstantInt::getSigned(
+ IntegerType::getInt8Ty(Int8->getContext()),
+ IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
+ ConstantInt *RequiresFullRuntimeVal =
+ ConstantInt::getBool(Int32->getContext(), RequiresFullRuntime);
Function *Fn = getOrCreateRuntimeFunctionPtr(
omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
@@ -2749,7 +3324,8 @@ void CanonicalLoopInfo::collectControlBlocks(
void CanonicalLoopInfo::assertOK() const {
#ifndef NDEBUG
- if (!IsValid)
+ // No constraints if this object currently does not describe a loop.
+ if (!isValid())
return;
// Verify standard control-flow we use for OpenMP loops.
@@ -2835,3 +3411,13 @@ void CanonicalLoopInfo::assertOK() const {
"Exit condition must compare with the trip count");
#endif
}
+
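+// Clear the stored blocks so this object no longer describes a loop; isValid()
+// then returns false and assertOK() becomes a no-op.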
+void CanonicalLoopInfo::invalidate() {
+ Preheader = nullptr;
+ Header = nullptr;
+ Cond = nullptr;
+ Body = nullptr;
+ Latch = nullptr;
+ Exit = nullptr;
+ After = nullptr;
+}
diff --git a/llvm/lib/IR/AbstractCallSite.cpp b/llvm/lib/IR/AbstractCallSite.cpp
index 6504e566ba4b..2e41799e13e9 100644
--- a/llvm/lib/IR/AbstractCallSite.cpp
+++ b/llvm/lib/IR/AbstractCallSite.cpp
@@ -121,7 +121,7 @@ AbstractCallSite::AbstractCallSite(const Use *U)
assert(CallbackEncMD->getNumOperands() >= 2 && "Incomplete !callback metadata");
- unsigned NumCallOperands = CB->getNumArgOperands();
+ unsigned NumCallOperands = CB->arg_size();
// Skip the var-arg flag at the end when reading the metadata.
for (unsigned u = 0, e = CallbackEncMD->getNumOperands() - 1; u < e; u++) {
Metadata *OpAsM = CallbackEncMD->getOperand(u).get();
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 69e2d85e58fe..7734c0a8de58 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
@@ -44,7 +45,6 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalIFunc.h"
-#include "llvm/IR/GlobalIndirectSymbol.h"
#include "llvm/IR/GlobalObject.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
@@ -72,6 +72,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/SaveAndRestore.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
@@ -554,16 +555,13 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) {
FunctionType *FTy = cast<FunctionType>(Ty);
print(FTy->getReturnType(), OS);
OS << " (";
- for (FunctionType::param_iterator I = FTy->param_begin(),
- E = FTy->param_end(); I != E; ++I) {
- if (I != FTy->param_begin())
- OS << ", ";
- print(*I, OS);
- }
- if (FTy->isVarArg()) {
- if (FTy->getNumParams()) OS << ", ";
- OS << "...";
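+ // ListSeparator prints nothing before the first element and ", " afterwards.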
+ ListSeparator LS;
+ for (Type *Ty : FTy->params()) {
+ OS << LS;
+ print(Ty, OS);
}
+ if (FTy->isVarArg())
+ OS << LS << "...";
OS << ')';
return;
}
@@ -633,12 +631,11 @@ void TypePrinting::printStructBody(StructType *STy, raw_ostream &OS) {
if (STy->getNumElements() == 0) {
OS << "{}";
} else {
- StructType::element_iterator I = STy->element_begin();
OS << "{ ";
- print(*I++, OS);
- for (StructType::element_iterator E = STy->element_end(); I != E; ++I) {
- OS << ", ";
- print(*I, OS);
+ ListSeparator LS;
+ for (Type *Ty : STy->elements()) {
+ OS << LS;
+ print(Ty, OS);
}
OS << " }";
@@ -988,7 +985,7 @@ void SlotTracker::processModule() {
// Add all the function attributes to the table.
// FIXME: Add attributes of other objects?
- AttributeSet FnAttrs = F.getAttributes().getFnAttributes();
+ AttributeSet FnAttrs = F.getAttributes().getFnAttrs();
if (FnAttrs.hasAttributes())
CreateAttributeSetSlot(FnAttrs);
}
@@ -1029,7 +1026,7 @@ void SlotTracker::processFunction() {
// target may not be linked into the optimizer.
if (const auto *Call = dyn_cast<CallBase>(&I)) {
// Add all the call attributes to the table.
- AttributeSet Attrs = Call->getAttributes().getFnAttributes();
+ AttributeSet Attrs = Call->getAttributes().getFnAttrs();
if (Attrs.hasAttributes())
CreateAttributeSetSlot(Attrs);
}
@@ -1277,18 +1274,38 @@ void SlotTracker::CreateTypeIdSlot(StringRef Id) {
TypeIdMap[Id] = TypeIdNext++;
}
+namespace {
+/// Common state (type printer, slot tracker, module) shared by most of the
+/// printer functions.
+struct AsmWriterContext {
+ TypePrinting *TypePrinter = nullptr;
+ SlotTracker *Machine = nullptr;
+ const Module *Context = nullptr;
+
+ AsmWriterContext(TypePrinting *TP, SlotTracker *ST, const Module *M = nullptr)
+ : TypePrinter(TP), Machine(ST), Context(M) {}
+
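+ // A shared context that carries no TypePrinting or SlotTracker.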
+ static AsmWriterContext &getEmpty() {
+ static AsmWriterContext EmptyCtx(nullptr, nullptr);
+ return EmptyCtx;
+ }
+
+ /// A callback that will be triggered when the underlying printer
+ /// prints a Metadata as an operand.
+ virtual void onWriteMetadataAsOperand(const Metadata *) {}
+
+ virtual ~AsmWriterContext() {}
+};
+} // end anonymous namespace
+
//===----------------------------------------------------------------------===//
// AsmWriter Implementation
//===----------------------------------------------------------------------===//
static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
- TypePrinting *TypePrinter,
- SlotTracker *Machine,
- const Module *Context);
+ AsmWriterContext &WriterCtx);
static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD,
- TypePrinting *TypePrinter,
- SlotTracker *Machine, const Module *Context,
+ AsmWriterContext &WriterCtx,
bool FromValue = false);
static void WriteOptimizationInfo(raw_ostream &Out, const User *U) {
@@ -1331,9 +1348,7 @@ static void WriteOptimizationInfo(raw_ostream &Out, const User *U) {
}
static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
- TypePrinting &TypePrinter,
- SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
if (CI->getType()->isIntegerTy(1)) {
Out << (CI->getZExtValue() ? "true" : "false");
@@ -1442,36 +1457,30 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV)) {
Out << "blockaddress(";
- WriteAsOperandInternal(Out, BA->getFunction(), &TypePrinter, Machine,
- Context);
+ WriteAsOperandInternal(Out, BA->getFunction(), WriterCtx);
Out << ", ";
- WriteAsOperandInternal(Out, BA->getBasicBlock(), &TypePrinter, Machine,
- Context);
+ WriteAsOperandInternal(Out, BA->getBasicBlock(), WriterCtx);
Out << ")";
return;
}
if (const auto *Equiv = dyn_cast<DSOLocalEquivalent>(CV)) {
Out << "dso_local_equivalent ";
- WriteAsOperandInternal(Out, Equiv->getGlobalValue(), &TypePrinter, Machine,
- Context);
+ WriteAsOperandInternal(Out, Equiv->getGlobalValue(), WriterCtx);
return;
}
if (const ConstantArray *CA = dyn_cast<ConstantArray>(CV)) {
Type *ETy = CA->getType()->getElementType();
Out << '[';
- TypePrinter.print(ETy, Out);
+ WriterCtx.TypePrinter->print(ETy, Out);
Out << ' ';
- WriteAsOperandInternal(Out, CA->getOperand(0),
- &TypePrinter, Machine,
- Context);
+ WriteAsOperandInternal(Out, CA->getOperand(0), WriterCtx);
for (unsigned i = 1, e = CA->getNumOperands(); i != e; ++i) {
Out << ", ";
- TypePrinter.print(ETy, Out);
+ WriterCtx.TypePrinter->print(ETy, Out);
Out << ' ';
- WriteAsOperandInternal(Out, CA->getOperand(i), &TypePrinter, Machine,
- Context);
+ WriteAsOperandInternal(Out, CA->getOperand(i), WriterCtx);
}
Out << ']';
return;
@@ -1489,17 +1498,14 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
Type *ETy = CA->getType()->getElementType();
Out << '[';
- TypePrinter.print(ETy, Out);
+ WriterCtx.TypePrinter->print(ETy, Out);
Out << ' ';
- WriteAsOperandInternal(Out, CA->getElementAsConstant(0),
- &TypePrinter, Machine,
- Context);
+ WriteAsOperandInternal(Out, CA->getElementAsConstant(0), WriterCtx);
for (unsigned i = 1, e = CA->getNumElements(); i != e; ++i) {
Out << ", ";
- TypePrinter.print(ETy, Out);
+ WriterCtx.TypePrinter->print(ETy, Out);
Out << ' ';
- WriteAsOperandInternal(Out, CA->getElementAsConstant(i), &TypePrinter,
- Machine, Context);
+ WriteAsOperandInternal(Out, CA->getElementAsConstant(i), WriterCtx);
}
Out << ']';
return;
@@ -1512,19 +1518,17 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
unsigned N = CS->getNumOperands();
if (N) {
Out << ' ';
- TypePrinter.print(CS->getOperand(0)->getType(), Out);
+ WriterCtx.TypePrinter->print(CS->getOperand(0)->getType(), Out);
Out << ' ';
- WriteAsOperandInternal(Out, CS->getOperand(0), &TypePrinter, Machine,
- Context);
+ WriteAsOperandInternal(Out, CS->getOperand(0), WriterCtx);
for (unsigned i = 1; i < N; i++) {
Out << ", ";
- TypePrinter.print(CS->getOperand(i)->getType(), Out);
+ WriterCtx.TypePrinter->print(CS->getOperand(i)->getType(), Out);
Out << ' ';
- WriteAsOperandInternal(Out, CS->getOperand(i), &TypePrinter, Machine,
- Context);
+ WriteAsOperandInternal(Out, CS->getOperand(i), WriterCtx);
}
Out << ' ';
}
@@ -1539,16 +1543,14 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
auto *CVVTy = cast<FixedVectorType>(CV->getType());
Type *ETy = CVVTy->getElementType();
Out << '<';
- TypePrinter.print(ETy, Out);
+ WriterCtx.TypePrinter->print(ETy, Out);
Out << ' ';
- WriteAsOperandInternal(Out, CV->getAggregateElement(0U), &TypePrinter,
- Machine, Context);
+ WriteAsOperandInternal(Out, CV->getAggregateElement(0U), WriterCtx);
for (unsigned i = 1, e = CVVTy->getNumElements(); i != e; ++i) {
Out << ", ";
- TypePrinter.print(ETy, Out);
+ WriterCtx.TypePrinter->print(ETy, Out);
Out << ' ';
- WriteAsOperandInternal(Out, CV->getAggregateElement(i), &TypePrinter,
- Machine, Context);
+ WriteAsOperandInternal(Out, CV->getAggregateElement(i), WriterCtx);
}
Out << '>';
return;
@@ -1584,7 +1586,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
Optional<unsigned> InRangeOp;
if (const GEPOperator *GEP = dyn_cast<GEPOperator>(CE)) {
- TypePrinter.print(GEP->getSourceElementType(), Out);
+ WriterCtx.TypePrinter->print(GEP->getSourceElementType(), Out);
Out << ", ";
InRangeOp = GEP->getInRangeIndex();
if (InRangeOp)
@@ -1594,9 +1596,9 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
for (User::const_op_iterator OI=CE->op_begin(); OI != CE->op_end(); ++OI) {
if (InRangeOp && unsigned(OI - CE->op_begin()) == *InRangeOp)
Out << "inrange ";
- TypePrinter.print((*OI)->getType(), Out);
+ WriterCtx.TypePrinter->print((*OI)->getType(), Out);
Out << ' ';
- WriteAsOperandInternal(Out, *OI, &TypePrinter, Machine, Context);
+ WriteAsOperandInternal(Out, *OI, WriterCtx);
if (OI+1 != CE->op_end())
Out << ", ";
}
@@ -1609,7 +1611,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
if (CE->isCast()) {
Out << " to ";
- TypePrinter.print(CE->getType(), Out);
+ WriterCtx.TypePrinter->print(CE->getType(), Out);
}
if (CE->getOpcode() == Instruction::ShuffleVector)
@@ -1623,8 +1625,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
}
static void writeMDTuple(raw_ostream &Out, const MDTuple *Node,
- TypePrinting *TypePrinter, SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!{";
for (unsigned mi = 0, me = Node->getNumOperands(); mi != me; ++mi) {
const Metadata *MD = Node->getOperand(mi);
@@ -1632,11 +1633,12 @@ static void writeMDTuple(raw_ostream &Out, const MDTuple *Node,
Out << "null";
else if (auto *MDV = dyn_cast<ValueAsMetadata>(MD)) {
Value *V = MDV->getValue();
- TypePrinter->print(V->getType(), Out);
+ WriterCtx.TypePrinter->print(V->getType(), Out);
Out << ' ';
- WriteAsOperandInternal(Out, V, TypePrinter, Machine, Context);
+ WriteAsOperandInternal(Out, V, WriterCtx);
} else {
- WriteAsOperandInternal(Out, MD, TypePrinter, Machine, Context);
+ WriteAsOperandInternal(Out, MD, WriterCtx);
+ WriterCtx.onWriteMetadataAsOperand(MD);
}
if (mi + 1 != me)
Out << ", ";
@@ -1665,15 +1667,12 @@ raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
struct MDFieldPrinter {
raw_ostream &Out;
FieldSeparator FS;
- TypePrinting *TypePrinter = nullptr;
- SlotTracker *Machine = nullptr;
- const Module *Context = nullptr;
+ AsmWriterContext &WriterCtx;
- explicit MDFieldPrinter(raw_ostream &Out) : Out(Out) {}
- MDFieldPrinter(raw_ostream &Out, TypePrinting *TypePrinter,
- SlotTracker *Machine, const Module *Context)
- : Out(Out), TypePrinter(TypePrinter), Machine(Machine), Context(Context) {
- }
+ explicit MDFieldPrinter(raw_ostream &Out)
+ : Out(Out), WriterCtx(AsmWriterContext::getEmpty()) {}
+ MDFieldPrinter(raw_ostream &Out, AsmWriterContext &Ctx)
+ : Out(Out), WriterCtx(Ctx) {}
void printTag(const DINode *N);
void printMacinfoType(const DIMacroNode *N);
@@ -1734,14 +1733,13 @@ void MDFieldPrinter::printString(StringRef Name, StringRef Value,
}
static void writeMetadataAsOperand(raw_ostream &Out, const Metadata *MD,
- TypePrinting *TypePrinter,
- SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
if (!MD) {
Out << "null";
return;
}
- WriteAsOperandInternal(Out, MD, TypePrinter, Machine, Context);
+ WriteAsOperandInternal(Out, MD, WriterCtx);
+ WriterCtx.onWriteMetadataAsOperand(MD);
}
void MDFieldPrinter::printMetadata(StringRef Name, const Metadata *MD,
@@ -1750,7 +1748,7 @@ void MDFieldPrinter::printMetadata(StringRef Name, const Metadata *MD,
return;
Out << FS << Name << ": ";
- writeMetadataAsOperand(Out, MD, TypePrinter, Machine, Context);
+ writeMetadataAsOperand(Out, MD, WriterCtx);
}
template <class IntTy>
@@ -1763,7 +1761,7 @@ void MDFieldPrinter::printInt(StringRef Name, IntTy Int, bool ShouldSkipZero) {
void MDFieldPrinter::printAPInt(StringRef Name, const APInt &Int,
bool IsUnsigned, bool ShouldSkipZero) {
- if (ShouldSkipZero && Int.isNullValue())
+ if (ShouldSkipZero && Int.isZero())
return;
Out << FS << Name << ": ";
@@ -1847,10 +1845,9 @@ void MDFieldPrinter::printDwarfEnum(StringRef Name, IntTy Value,
}
static void writeGenericDINode(raw_ostream &Out, const GenericDINode *N,
- TypePrinting *TypePrinter, SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!GenericDINode(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printTag(N);
Printer.printString("header", N->getHeader());
if (N->getNumDwarfOperands()) {
@@ -1858,7 +1855,7 @@ static void writeGenericDINode(raw_ostream &Out, const GenericDINode *N,
FieldSeparator IFS;
for (auto &I : N->dwarf_operands()) {
Out << IFS;
- writeMetadataAsOperand(Out, I, TypePrinter, Machine, Context);
+ writeMetadataAsOperand(Out, I, WriterCtx);
}
Out << "}";
}
@@ -1866,10 +1863,9 @@ static void writeGenericDINode(raw_ostream &Out, const GenericDINode *N,
}
static void writeDILocation(raw_ostream &Out, const DILocation *DL,
- TypePrinting *TypePrinter, SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DILocation(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
// Always output the line, since 0 is a relevant and important value for it.
Printer.printInt("line", DL->getLine(), /* ShouldSkipZero */ false);
Printer.printInt("column", DL->getColumn());
@@ -1881,10 +1877,9 @@ static void writeDILocation(raw_ostream &Out, const DILocation *DL,
}
static void writeDISubrange(raw_ostream &Out, const DISubrange *N,
- TypePrinting *TypePrinter, SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DISubrange(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
auto *Count = N->getRawCountNode();
if (auto *CE = dyn_cast_or_null<ConstantAsMetadata>(Count)) {
@@ -1923,18 +1918,15 @@ static void writeDISubrange(raw_ostream &Out, const DISubrange *N,
}
static void writeDIGenericSubrange(raw_ostream &Out, const DIGenericSubrange *N,
- TypePrinting *TypePrinter,
- SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DIGenericSubrange(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
auto IsConstant = [&](Metadata *Bound) -> bool {
if (auto *BE = dyn_cast_or_null<DIExpression>(Bound)) {
- return BE->isConstant()
- ? DIExpression::SignedOrUnsignedConstant::SignedConstant ==
- *BE->isConstant()
- : false;
+ return BE->isConstant() &&
+ DIExpression::SignedOrUnsignedConstant::SignedConstant ==
+ *BE->isConstant();
}
return false;
};
@@ -1977,7 +1969,7 @@ static void writeDIGenericSubrange(raw_ostream &Out, const DIGenericSubrange *N,
}
static void writeDIEnumerator(raw_ostream &Out, const DIEnumerator *N,
- TypePrinting *, SlotTracker *, const Module *) {
+ AsmWriterContext &) {
Out << "!DIEnumerator(";
MDFieldPrinter Printer(Out);
Printer.printString("name", N->getName(), /* ShouldSkipEmpty */ false);
@@ -1989,7 +1981,7 @@ static void writeDIEnumerator(raw_ostream &Out, const DIEnumerator *N,
}
static void writeDIBasicType(raw_ostream &Out, const DIBasicType *N,
- TypePrinting *, SlotTracker *, const Module *) {
+ AsmWriterContext &) {
Out << "!DIBasicType(";
MDFieldPrinter Printer(Out);
if (N->getTag() != dwarf::DW_TAG_base_type)
@@ -2004,10 +1996,9 @@ static void writeDIBasicType(raw_ostream &Out, const DIBasicType *N,
}
static void writeDIStringType(raw_ostream &Out, const DIStringType *N,
- TypePrinting *TypePrinter, SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DIStringType(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
if (N->getTag() != dwarf::DW_TAG_string_type)
Printer.printTag(N);
Printer.printString("name", N->getName());
@@ -2021,10 +2012,9 @@ static void writeDIStringType(raw_ostream &Out, const DIStringType *N,
}
static void writeDIDerivedType(raw_ostream &Out, const DIDerivedType *N,
- TypePrinting *TypePrinter, SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DIDerivedType(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printTag(N);
Printer.printString("name", N->getName());
Printer.printMetadata("scope", N->getRawScope());
@@ -2040,14 +2030,14 @@ static void writeDIDerivedType(raw_ostream &Out, const DIDerivedType *N,
if (const auto &DWARFAddressSpace = N->getDWARFAddressSpace())
Printer.printInt("dwarfAddressSpace", *DWARFAddressSpace,
/* ShouldSkipZero */ false);
+ Printer.printMetadata("annotations", N->getRawAnnotations());
Out << ")";
}
static void writeDICompositeType(raw_ostream &Out, const DICompositeType *N,
- TypePrinting *TypePrinter,
- SlotTracker *Machine, const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DICompositeType(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printTag(N);
Printer.printString("name", N->getName());
Printer.printMetadata("scope", N->getRawScope());
@@ -2073,14 +2063,14 @@ static void writeDICompositeType(raw_ostream &Out, const DICompositeType *N,
/* ShouldSkipZero */ false);
else
Printer.printMetadata("rank", N->getRawRank(), /*ShouldSkipNull */ true);
+ Printer.printMetadata("annotations", N->getRawAnnotations());
Out << ")";
}
static void writeDISubroutineType(raw_ostream &Out, const DISubroutineType *N,
- TypePrinting *TypePrinter,
- SlotTracker *Machine, const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DISubroutineType(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printDIFlags("flags", N->getFlags());
Printer.printDwarfEnum("cc", N->getCC(), dwarf::ConventionString);
Printer.printMetadata("types", N->getRawTypeArray(),
@@ -2088,8 +2078,7 @@ static void writeDISubroutineType(raw_ostream &Out, const DISubroutineType *N,
Out << ")";
}
-static void writeDIFile(raw_ostream &Out, const DIFile *N, TypePrinting *,
- SlotTracker *, const Module *) {
+static void writeDIFile(raw_ostream &Out, const DIFile *N, AsmWriterContext &) {
Out << "!DIFile(";
MDFieldPrinter Printer(Out);
Printer.printString("filename", N->getFilename(),
@@ -2105,10 +2094,9 @@ static void writeDIFile(raw_ostream &Out, const DIFile *N, TypePrinting *,
}
static void writeDICompileUnit(raw_ostream &Out, const DICompileUnit *N,
- TypePrinting *TypePrinter, SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DICompileUnit(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printDwarfEnum("language", N->getSourceLanguage(),
dwarf::LanguageString, /* ShouldSkipZero */ false);
Printer.printMetadata("file", N->getRawFile(), /* ShouldSkipNull */ false);
@@ -2136,10 +2124,9 @@ static void writeDICompileUnit(raw_ostream &Out, const DICompileUnit *N,
}
static void writeDISubprogram(raw_ostream &Out, const DISubprogram *N,
- TypePrinting *TypePrinter, SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DISubprogram(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printString("name", N->getName());
Printer.printString("linkageName", N->getLinkageName());
Printer.printMetadata("scope", N->getRawScope(), /* ShouldSkipNull */ false);
@@ -2159,14 +2146,14 @@ static void writeDISubprogram(raw_ostream &Out, const DISubprogram *N,
Printer.printMetadata("declaration", N->getRawDeclaration());
Printer.printMetadata("retainedNodes", N->getRawRetainedNodes());
Printer.printMetadata("thrownTypes", N->getRawThrownTypes());
+ Printer.printMetadata("annotations", N->getRawAnnotations());
Out << ")";
}
static void writeDILexicalBlock(raw_ostream &Out, const DILexicalBlock *N,
- TypePrinting *TypePrinter, SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DILexicalBlock(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printMetadata("scope", N->getRawScope(), /* ShouldSkipNull */ false);
Printer.printMetadata("file", N->getRawFile());
Printer.printInt("line", N->getLine());
@@ -2176,11 +2163,9 @@ static void writeDILexicalBlock(raw_ostream &Out, const DILexicalBlock *N,
static void writeDILexicalBlockFile(raw_ostream &Out,
const DILexicalBlockFile *N,
- TypePrinting *TypePrinter,
- SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DILexicalBlockFile(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printMetadata("scope", N->getRawScope(), /* ShouldSkipNull */ false);
Printer.printMetadata("file", N->getRawFile());
Printer.printInt("discriminator", N->getDiscriminator(),
@@ -2189,10 +2174,9 @@ static void writeDILexicalBlockFile(raw_ostream &Out,
}
static void writeDINamespace(raw_ostream &Out, const DINamespace *N,
- TypePrinting *TypePrinter, SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DINamespace(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printString("name", N->getName());
Printer.printMetadata("scope", N->getRawScope(), /* ShouldSkipNull */ false);
Printer.printBool("exportSymbols", N->getExportSymbols(), false);
@@ -2200,10 +2184,9 @@ static void writeDINamespace(raw_ostream &Out, const DINamespace *N,
}
static void writeDICommonBlock(raw_ostream &Out, const DICommonBlock *N,
- TypePrinting *TypePrinter, SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DICommonBlock(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printMetadata("scope", N->getRawScope(), false);
Printer.printMetadata("declaration", N->getRawDecl(), false);
Printer.printString("name", N->getName());
@@ -2213,10 +2196,9 @@ static void writeDICommonBlock(raw_ostream &Out, const DICommonBlock *N,
}
static void writeDIMacro(raw_ostream &Out, const DIMacro *N,
- TypePrinting *TypePrinter, SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DIMacro(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printMacinfoType(N);
Printer.printInt("line", N->getLine());
Printer.printString("name", N->getName());
@@ -2225,10 +2207,9 @@ static void writeDIMacro(raw_ostream &Out, const DIMacro *N,
}
static void writeDIMacroFile(raw_ostream &Out, const DIMacroFile *N,
- TypePrinting *TypePrinter, SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DIMacroFile(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printInt("line", N->getLine());
Printer.printMetadata("file", N->getRawFile(), /* ShouldSkipNull */ false);
Printer.printMetadata("nodes", N->getRawElements());
@@ -2236,10 +2217,9 @@ static void writeDIMacroFile(raw_ostream &Out, const DIMacroFile *N,
}
static void writeDIModule(raw_ostream &Out, const DIModule *N,
- TypePrinting *TypePrinter, SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DIModule(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printMetadata("scope", N->getRawScope(), /* ShouldSkipNull */ false);
Printer.printString("name", N->getName());
Printer.printString("configMacros", N->getConfigurationMacros());
@@ -2251,14 +2231,11 @@ static void writeDIModule(raw_ostream &Out, const DIModule *N,
Out << ")";
}
-
static void writeDITemplateTypeParameter(raw_ostream &Out,
const DITemplateTypeParameter *N,
- TypePrinting *TypePrinter,
- SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DITemplateTypeParameter(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printString("name", N->getName());
Printer.printMetadata("type", N->getRawType(), /* ShouldSkipNull */ false);
Printer.printBool("defaulted", N->isDefault(), /* Default= */ false);
@@ -2267,11 +2244,9 @@ static void writeDITemplateTypeParameter(raw_ostream &Out,
static void writeDITemplateValueParameter(raw_ostream &Out,
const DITemplateValueParameter *N,
- TypePrinting *TypePrinter,
- SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DITemplateValueParameter(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
if (N->getTag() != dwarf::DW_TAG_template_value_parameter)
Printer.printTag(N);
Printer.printString("name", N->getName());
@@ -2282,10 +2257,9 @@ static void writeDITemplateValueParameter(raw_ostream &Out,
}
static void writeDIGlobalVariable(raw_ostream &Out, const DIGlobalVariable *N,
- TypePrinting *TypePrinter,
- SlotTracker *Machine, const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DIGlobalVariable(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printString("name", N->getName());
Printer.printString("linkageName", N->getLinkageName());
Printer.printMetadata("scope", N->getRawScope(), /* ShouldSkipNull */ false);
@@ -2297,14 +2271,14 @@ static void writeDIGlobalVariable(raw_ostream &Out, const DIGlobalVariable *N,
Printer.printMetadata("declaration", N->getRawStaticDataMemberDeclaration());
Printer.printMetadata("templateParams", N->getRawTemplateParams());
Printer.printInt("align", N->getAlignInBits());
+ Printer.printMetadata("annotations", N->getRawAnnotations());
Out << ")";
}
static void writeDILocalVariable(raw_ostream &Out, const DILocalVariable *N,
- TypePrinting *TypePrinter,
- SlotTracker *Machine, const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DILocalVariable(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printString("name", N->getName());
Printer.printInt("arg", N->getArg());
Printer.printMetadata("scope", N->getRawScope(), /* ShouldSkipNull */ false);
@@ -2313,14 +2287,14 @@ static void writeDILocalVariable(raw_ostream &Out, const DILocalVariable *N,
Printer.printMetadata("type", N->getRawType());
Printer.printDIFlags("flags", N->getFlags());
Printer.printInt("align", N->getAlignInBits());
+ Printer.printMetadata("annotations", N->getRawAnnotations());
Out << ")";
}
static void writeDILabel(raw_ostream &Out, const DILabel *N,
- TypePrinting *TypePrinter,
- SlotTracker *Machine, const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DILabel(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printMetadata("scope", N->getRawScope(), /* ShouldSkipNull */ false);
Printer.printString("name", N->getName());
Printer.printMetadata("file", N->getRawFile());
@@ -2329,8 +2303,7 @@ static void writeDILabel(raw_ostream &Out, const DILabel *N,
}
static void writeDIExpression(raw_ostream &Out, const DIExpression *N,
- TypePrinting *TypePrinter, SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DIExpression(";
FieldSeparator FS;
if (N->isValid()) {
@@ -2355,37 +2328,34 @@ static void writeDIExpression(raw_ostream &Out, const DIExpression *N,
}
static void writeDIArgList(raw_ostream &Out, const DIArgList *N,
- TypePrinting *TypePrinter, SlotTracker *Machine,
- const Module *Context, bool FromValue = false) {
+ AsmWriterContext &WriterCtx,
+ bool FromValue = false) {
assert(FromValue &&
"Unexpected DIArgList metadata outside of value argument");
Out << "!DIArgList(";
FieldSeparator FS;
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
for (Metadata *Arg : N->getArgs()) {
Out << FS;
- WriteAsOperandInternal(Out, Arg, TypePrinter, Machine, Context, true);
+ WriteAsOperandInternal(Out, Arg, WriterCtx, true);
}
Out << ")";
}
static void writeDIGlobalVariableExpression(raw_ostream &Out,
const DIGlobalVariableExpression *N,
- TypePrinting *TypePrinter,
- SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DIGlobalVariableExpression(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printMetadata("var", N->getVariable());
Printer.printMetadata("expr", N->getExpression());
Out << ")";
}
static void writeDIObjCProperty(raw_ostream &Out, const DIObjCProperty *N,
- TypePrinting *TypePrinter, SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DIObjCProperty(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printString("name", N->getName());
Printer.printMetadata("file", N->getRawFile());
Printer.printInt("line", N->getLine());
@@ -2397,23 +2367,21 @@ static void writeDIObjCProperty(raw_ostream &Out, const DIObjCProperty *N,
}
static void writeDIImportedEntity(raw_ostream &Out, const DIImportedEntity *N,
- TypePrinting *TypePrinter,
- SlotTracker *Machine, const Module *Context) {
+ AsmWriterContext &WriterCtx) {
Out << "!DIImportedEntity(";
- MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+ MDFieldPrinter Printer(Out, WriterCtx);
Printer.printTag(N);
Printer.printString("name", N->getName());
Printer.printMetadata("scope", N->getRawScope(), /* ShouldSkipNull */ false);
Printer.printMetadata("entity", N->getRawEntity());
Printer.printMetadata("file", N->getRawFile());
Printer.printInt("line", N->getLine());
+ Printer.printMetadata("elements", N->getRawElements());
Out << ")";
}
static void WriteMDNodeBodyInternal(raw_ostream &Out, const MDNode *Node,
- TypePrinting *TypePrinter,
- SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &Ctx) {
if (Node->isDistinct())
Out << "distinct ";
else if (Node->isTemporary())
@@ -2424,7 +2392,7 @@ static void WriteMDNodeBodyInternal(raw_ostream &Out, const MDNode *Node,
llvm_unreachable("Expected uniquable MDNode");
#define HANDLE_MDNODE_LEAF(CLASS) \
case Metadata::CLASS##Kind: \
- write##CLASS(Out, cast<CLASS>(Node), TypePrinter, Machine, Context); \
+ write##CLASS(Out, cast<CLASS>(Node), Ctx); \
break;
#include "llvm/IR/Metadata.def"
}
@@ -2433,9 +2401,7 @@ static void WriteMDNodeBodyInternal(raw_ostream &Out, const MDNode *Node,
// Full implementation of printing a Value as an operand with support for
// TypePrinting, etc.
static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
- TypePrinting *TypePrinter,
- SlotTracker *Machine,
- const Module *Context) {
+ AsmWriterContext &WriterCtx) {
if (V->hasName()) {
PrintLLVMName(Out, V);
return;
@@ -2443,8 +2409,8 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
const Constant *CV = dyn_cast<Constant>(V);
if (CV && !isa<GlobalValue>(CV)) {
- assert(TypePrinter && "Constants require TypePrinting!");
- WriteConstantInternal(Out, CV, *TypePrinter, Machine, Context);
+ assert(WriterCtx.TypePrinter && "Constants require TypePrinting!");
+ WriteConstantInternal(Out, CV, WriterCtx);
return;
}
@@ -2468,13 +2434,14 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
}
if (auto *MD = dyn_cast<MetadataAsValue>(V)) {
- WriteAsOperandInternal(Out, MD->getMetadata(), TypePrinter, Machine,
- Context, /* FromValue */ true);
+ WriteAsOperandInternal(Out, MD->getMetadata(), WriterCtx,
+ /* FromValue */ true);
return;
}
char Prefix = '%';
int Slot;
+ auto *Machine = WriterCtx.Machine;
// If we have a SlotTracker, use it.
if (Machine) {
if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
@@ -2513,30 +2480,30 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
}
static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD,
- TypePrinting *TypePrinter,
- SlotTracker *Machine, const Module *Context,
+ AsmWriterContext &WriterCtx,
bool FromValue) {
// Write DIExpressions and DIArgLists inline when used as a value. Improves
// readability of debug info intrinsics.
if (const DIExpression *Expr = dyn_cast<DIExpression>(MD)) {
- writeDIExpression(Out, Expr, TypePrinter, Machine, Context);
+ writeDIExpression(Out, Expr, WriterCtx);
return;
}
if (const DIArgList *ArgList = dyn_cast<DIArgList>(MD)) {
- writeDIArgList(Out, ArgList, TypePrinter, Machine, Context, FromValue);
+ writeDIArgList(Out, ArgList, WriterCtx, FromValue);
return;
}
if (const MDNode *N = dyn_cast<MDNode>(MD)) {
std::unique_ptr<SlotTracker> MachineStorage;
- if (!Machine) {
- MachineStorage = std::make_unique<SlotTracker>(Context);
- Machine = MachineStorage.get();
+ SaveAndRestore<SlotTracker *> SARMachine(WriterCtx.Machine);
+ if (!WriterCtx.Machine) {
+ MachineStorage = std::make_unique<SlotTracker>(WriterCtx.Context);
+ WriterCtx.Machine = MachineStorage.get();
}
- int Slot = Machine->getMetadataSlot(N);
+ int Slot = WriterCtx.Machine->getMetadataSlot(N);
if (Slot == -1) {
if (const DILocation *Loc = dyn_cast<DILocation>(N)) {
- writeDILocation(Out, Loc, TypePrinter, Machine, Context);
+ writeDILocation(Out, Loc, WriterCtx);
return;
}
// Give the pointer value instead of "badref", since this comes up all
@@ -2555,13 +2522,13 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD,
}
auto *V = cast<ValueAsMetadata>(MD);
- assert(TypePrinter && "TypePrinter required for metadata values");
+ assert(WriterCtx.TypePrinter && "TypePrinter required for metadata values");
assert((FromValue || !isa<LocalAsMetadata>(V)) &&
"Unexpected function-local metadata outside of value argument");
- TypePrinter->print(V->getValue()->getType(), Out);
+ WriterCtx.TypePrinter->print(V->getValue()->getType(), Out);
Out << ' ';
- WriteAsOperandInternal(Out, V->getValue(), TypePrinter, Machine, Context);
+ WriteAsOperandInternal(Out, V->getValue(), WriterCtx);
}
namespace {
@@ -2592,6 +2559,10 @@ public:
AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac,
const ModuleSummaryIndex *Index, bool IsForDebug);
+ AsmWriterContext getContext() {
+ return AsmWriterContext(&TypePrinter, &Machine, TheModule);
+ }
+
void printMDNodeBody(const MDNode *MD);
void printNamedMDNode(const NamedMDNode *NMD);
@@ -2618,7 +2589,8 @@ public:
void printTypeIdentities();
void printGlobal(const GlobalVariable *GV);
- void printIndirectSymbol(const GlobalIndirectSymbol *GIS);
+ void printAlias(const GlobalAlias *GA);
+ void printIFunc(const GlobalIFunc *GI);
void printComdat(const Comdat *C);
void printFunction(const Function *F);
void printArgument(const Argument *FA, AttributeSet Attrs);
@@ -2693,7 +2665,8 @@ void AssemblyWriter::writeOperand(const Value *Operand, bool PrintType) {
TypePrinter.print(Operand->getType(), Out);
Out << ' ';
}
- WriteAsOperandInternal(Out, Operand, &TypePrinter, &Machine, TheModule);
+ auto WriterCtx = getContext();
+ WriteAsOperandInternal(Out, Operand, WriterCtx);
}
void AssemblyWriter::writeSyncScope(const LLVMContext &Context,
@@ -2752,7 +2725,8 @@ void AssemblyWriter::writeParamOperand(const Value *Operand,
}
Out << ' ';
// Print the operand
- WriteAsOperandInternal(Out, Operand, &TypePrinter, &Machine, TheModule);
+ auto WriterCtx = getContext();
+ WriteAsOperandInternal(Out, Operand, WriterCtx);
}
void AssemblyWriter::writeOperandBundles(const CallBase *Call) {
@@ -2776,6 +2750,7 @@ void AssemblyWriter::writeOperandBundles(const CallBase *Call) {
Out << '(';
bool FirstInput = true;
+ auto WriterCtx = getContext();
for (const auto &Input : BU.Inputs) {
if (!FirstInput)
Out << ", ";
@@ -2783,7 +2758,7 @@ void AssemblyWriter::writeOperandBundles(const CallBase *Call) {
TypePrinter.print(Input->getType(), Out);
Out << " ";
- WriteAsOperandInternal(Out, Input, &TypePrinter, &Machine, TheModule);
+ WriteAsOperandInternal(Out, Input, WriterCtx);
}
Out << ')';
@@ -2853,12 +2828,12 @@ void AssemblyWriter::printModule(const Module *M) {
// Output all aliases.
if (!M->alias_empty()) Out << "\n";
for (const GlobalAlias &GA : M->aliases())
- printIndirectSymbol(&GA);
+ printAlias(&GA);
// Output all ifuncs.
if (!M->ifunc_empty()) Out << "\n";
for (const GlobalIFunc &GI : M->ifuncs())
- printIndirectSymbol(&GI);
+ printIFunc(&GI);
// Output all of the functions.
for (const Function &F : *M) {
@@ -3198,19 +3173,9 @@ static const char *getVisibilityName(GlobalValue::VisibilityTypes Vis) {
void AssemblyWriter::printFunctionSummary(const FunctionSummary *FS) {
Out << ", insts: " << FS->instCount();
+ if (FS->fflags().anyFlagSet())
+ Out << ", " << FS->fflags();
- FunctionSummary::FFlags FFlags = FS->fflags();
- if (FFlags.ReadNone | FFlags.ReadOnly | FFlags.NoRecurse |
- FFlags.ReturnDoesNotAlias | FFlags.NoInline | FFlags.AlwaysInline) {
- Out << ", funcFlags: (";
- Out << "readNone: " << FFlags.ReadNone;
- Out << ", readOnly: " << FFlags.ReadOnly;
- Out << ", noRecurse: " << FFlags.NoRecurse;
- Out << ", returnDoesNotAlias: " << FFlags.ReturnDoesNotAlias;
- Out << ", noInline: " << FFlags.NoInline;
- Out << ", alwaysInline: " << FFlags.AlwaysInline;
- Out << ")";
- }
if (!FS->calls().empty()) {
Out << ", calls: (";
FieldSeparator IFS;
@@ -3453,7 +3418,7 @@ void AssemblyWriter::printNamedMDNode(const NamedMDNode *NMD) {
assert(!isa<DIArgList>(Op) &&
"DIArgLists should not appear in NamedMDNodes");
if (auto *Expr = dyn_cast<DIExpression>(Op)) {
- writeDIExpression(Out, Expr, nullptr, nullptr, nullptr);
+ writeDIExpression(Out, Expr, AsmWriterContext::getEmpty());
continue;
}
@@ -3544,7 +3509,8 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
if (GV->isMaterializable())
Out << "; Materializable\n";
- WriteAsOperandInternal(Out, GV, &TypePrinter, &Machine, GV->getParent());
+ AsmWriterContext WriterCtx(&TypePrinter, &Machine, GV->getParent());
+ WriteAsOperandInternal(Out, GV, WriterCtx);
Out << " = ";
if (!GV->hasInitializer() && GV->hasExternalLinkage())
@@ -3596,49 +3562,76 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
printInfoComment(*GV);
}
-void AssemblyWriter::printIndirectSymbol(const GlobalIndirectSymbol *GIS) {
- if (GIS->isMaterializable())
+void AssemblyWriter::printAlias(const GlobalAlias *GA) {
+ if (GA->isMaterializable())
Out << "; Materializable\n";
- WriteAsOperandInternal(Out, GIS, &TypePrinter, &Machine, GIS->getParent());
+ AsmWriterContext WriterCtx(&TypePrinter, &Machine, GA->getParent());
+ WriteAsOperandInternal(Out, GA, WriterCtx);
Out << " = ";
- Out << getLinkageNameWithSpace(GIS->getLinkage());
- PrintDSOLocation(*GIS, Out);
- PrintVisibility(GIS->getVisibility(), Out);
- PrintDLLStorageClass(GIS->getDLLStorageClass(), Out);
- PrintThreadLocalModel(GIS->getThreadLocalMode(), Out);
- StringRef UA = getUnnamedAddrEncoding(GIS->getUnnamedAddr());
+ Out << getLinkageNameWithSpace(GA->getLinkage());
+ PrintDSOLocation(*GA, Out);
+ PrintVisibility(GA->getVisibility(), Out);
+ PrintDLLStorageClass(GA->getDLLStorageClass(), Out);
+ PrintThreadLocalModel(GA->getThreadLocalMode(), Out);
+ StringRef UA = getUnnamedAddrEncoding(GA->getUnnamedAddr());
if (!UA.empty())
Out << UA << ' ';
- if (isa<GlobalAlias>(GIS))
- Out << "alias ";
- else if (isa<GlobalIFunc>(GIS))
- Out << "ifunc ";
- else
- llvm_unreachable("Not an alias or ifunc!");
-
- TypePrinter.print(GIS->getValueType(), Out);
+ Out << "alias ";
+ TypePrinter.print(GA->getValueType(), Out);
Out << ", ";
- const Constant *IS = GIS->getIndirectSymbol();
-
- if (!IS) {
- TypePrinter.print(GIS->getType(), Out);
+ if (const Constant *Aliasee = GA->getAliasee()) {
+ writeOperand(Aliasee, !isa<ConstantExpr>(Aliasee));
+ } else {
+ TypePrinter.print(GA->getType(), Out);
Out << " <<NULL ALIASEE>>";
+ }
+
+ if (GA->hasPartition()) {
+ Out << ", partition \"";
+ printEscapedString(GA->getPartition(), Out);
+ Out << '"';
+ }
+
+ printInfoComment(*GA);
+ Out << '\n';
+}
+
+void AssemblyWriter::printIFunc(const GlobalIFunc *GI) {
+ if (GI->isMaterializable())
+ Out << "; Materializable\n";
+
+ AsmWriterContext WriterCtx(&TypePrinter, &Machine, GI->getParent());
+ WriteAsOperandInternal(Out, GI, WriterCtx);
+ Out << " = ";
+
+ Out << getLinkageNameWithSpace(GI->getLinkage());
+ PrintDSOLocation(*GI, Out);
+ PrintVisibility(GI->getVisibility(), Out);
+
+ Out << "ifunc ";
+
+ TypePrinter.print(GI->getValueType(), Out);
+ Out << ", ";
+
+ if (const Constant *Resolver = GI->getResolver()) {
+ writeOperand(Resolver, !isa<ConstantExpr>(Resolver));
} else {
- writeOperand(IS, !isa<ConstantExpr>(IS));
+ TypePrinter.print(GI->getType(), Out);
+ Out << " <<NULL RESOLVER>>";
}
- if (GIS->hasPartition()) {
+ if (GI->hasPartition()) {
Out << ", partition \"";
- printEscapedString(GIS->getPartition(), Out);
+ printEscapedString(GI->getPartition(), Out);
Out << '"';
}
- printInfoComment(*GIS);
+ printInfoComment(*GI);
Out << '\n';
}
@@ -3683,8 +3676,8 @@ void AssemblyWriter::printFunction(const Function *F) {
Out << "; Materializable\n";
const AttributeList &Attrs = F->getAttributes();
- if (Attrs.hasAttributes(AttributeList::FunctionIndex)) {
- AttributeSet AS = Attrs.getFnAttributes();
+ if (Attrs.hasFnAttrs()) {
+ AttributeSet AS = Attrs.getFnAttrs();
std::string AttrStr;
for (const Attribute &Attr : AS) {
@@ -3721,11 +3714,12 @@ void AssemblyWriter::printFunction(const Function *F) {
}
FunctionType *FT = F->getFunctionType();
- if (Attrs.hasAttributes(AttributeList::ReturnIndex))
+ if (Attrs.hasRetAttrs())
Out << Attrs.getAsString(AttributeList::ReturnIndex) << ' ';
TypePrinter.print(F->getReturnType(), Out);
+ AsmWriterContext WriterCtx(&TypePrinter, &Machine, F->getParent());
Out << ' ';
- WriteAsOperandInternal(Out, F, &TypePrinter, &Machine, F->getParent());
+ WriteAsOperandInternal(Out, F, WriterCtx);
Out << '(';
// Loop over the arguments, printing them...
@@ -3738,7 +3732,7 @@ void AssemblyWriter::printFunction(const Function *F) {
// Output type...
TypePrinter.print(FT->getParamType(I), Out);
- AttributeSet ArgAttrs = Attrs.getParamAttributes(I);
+ AttributeSet ArgAttrs = Attrs.getParamAttrs(I);
if (ArgAttrs.hasAttributes()) {
Out << ' ';
writeAttributeSet(ArgAttrs);
@@ -3750,7 +3744,7 @@ void AssemblyWriter::printFunction(const Function *F) {
// Insert commas as we go... the first arg doesn't get a comma
if (Arg.getArgNo() != 0)
Out << ", ";
- printArgument(&Arg, Attrs.getParamAttributes(Arg.getArgNo()));
+ printArgument(&Arg, Attrs.getParamAttrs(Arg.getArgNo()));
}
}
@@ -3770,8 +3764,8 @@ void AssemblyWriter::printFunction(const Function *F) {
if (F->getAddressSpace() != 0 || !Mod ||
Mod->getDataLayout().getProgramAddressSpace() != 0)
Out << " addrspace(" << F->getAddressSpace() << ")";
- if (Attrs.hasAttributes(AttributeList::FunctionIndex))
- Out << " #" << Machine.getAttributeGroupSlot(Attrs.getFnAttributes());
+ if (Attrs.hasFnAttrs())
+ Out << " #" << Machine.getAttributeGroupSlot(Attrs.getFnAttrs());
if (F->hasSection()) {
Out << " section \"";
printEscapedString(F->getSection(), Out);
@@ -4127,7 +4121,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Type *RetTy = FTy->getReturnType();
const AttributeList &PAL = CI->getAttributes();
- if (PAL.hasAttributes(AttributeList::ReturnIndex))
+ if (PAL.hasRetAttrs())
Out << ' ' << PAL.getAsString(AttributeList::ReturnIndex);
// Only print addrspace(N) if necessary:
@@ -4142,10 +4136,10 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Out << ' ';
writeOperand(Operand, false);
Out << '(';
- for (unsigned op = 0, Eop = CI->getNumArgOperands(); op < Eop; ++op) {
+ for (unsigned op = 0, Eop = CI->arg_size(); op < Eop; ++op) {
if (op > 0)
Out << ", ";
- writeParamOperand(CI->getArgOperand(op), PAL.getParamAttributes(op));
+ writeParamOperand(CI->getArgOperand(op), PAL.getParamAttrs(op));
}
// Emit an ellipsis if this is a musttail call in a vararg function. This
@@ -4156,8 +4150,8 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Out << ", ...";
Out << ')';
- if (PAL.hasAttributes(AttributeList::FunctionIndex))
- Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes());
+ if (PAL.hasFnAttrs())
+ Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttrs());
writeOperandBundles(CI);
} else if (const InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
@@ -4172,7 +4166,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
PrintCallingConv(II->getCallingConv(), Out);
}
- if (PAL.hasAttributes(AttributeList::ReturnIndex))
+ if (PAL.hasRetAttrs())
Out << ' ' << PAL.getAsString(AttributeList::ReturnIndex);
// Only print addrspace(N) if necessary:
@@ -4187,15 +4181,15 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Out << ' ';
writeOperand(Operand, false);
Out << '(';
- for (unsigned op = 0, Eop = II->getNumArgOperands(); op < Eop; ++op) {
+ for (unsigned op = 0, Eop = II->arg_size(); op < Eop; ++op) {
if (op)
Out << ", ";
- writeParamOperand(II->getArgOperand(op), PAL.getParamAttributes(op));
+ writeParamOperand(II->getArgOperand(op), PAL.getParamAttrs(op));
}
Out << ')';
- if (PAL.hasAttributes(AttributeList::FunctionIndex))
- Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes());
+ if (PAL.hasFnAttrs())
+ Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttrs());
writeOperandBundles(II);
@@ -4215,7 +4209,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
PrintCallingConv(CBI->getCallingConv(), Out);
}
- if (PAL.hasAttributes(AttributeList::ReturnIndex))
+ if (PAL.hasRetAttrs())
Out << ' ' << PAL.getAsString(AttributeList::ReturnIndex);
// If possible, print out the short form of the callbr instruction. We can
@@ -4227,15 +4221,15 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Out << ' ';
writeOperand(Operand, false);
Out << '(';
- for (unsigned op = 0, Eop = CBI->getNumArgOperands(); op < Eop; ++op) {
+ for (unsigned op = 0, Eop = CBI->arg_size(); op < Eop; ++op) {
if (op)
Out << ", ";
- writeParamOperand(CBI->getArgOperand(op), PAL.getParamAttributes(op));
+ writeParamOperand(CBI->getArgOperand(op), PAL.getParamAttrs(op));
}
Out << ')';
- if (PAL.hasAttributes(AttributeList::FunctionIndex))
- Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes());
+ if (PAL.hasFnAttrs())
+ Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttrs());
writeOperandBundles(CBI);
@@ -4375,6 +4369,7 @@ void AssemblyWriter::printMetadataAttachments(
if (MDNames.empty())
MDs[0].second->getContext().getMDKindNames(MDNames);
+ auto WriterCtx = getContext();
for (const auto &I : MDs) {
unsigned Kind = I.first;
Out << Separator;
@@ -4384,7 +4379,7 @@ void AssemblyWriter::printMetadataAttachments(
} else
Out << "!<unknown kind #" << Kind << ">";
Out << ' ';
- WriteAsOperandInternal(Out, I.second, &TypePrinter, &Machine, TheModule);
+ WriteAsOperandInternal(Out, I.second, WriterCtx);
}
}
@@ -4406,7 +4401,8 @@ void AssemblyWriter::writeAllMDNodes() {
}
void AssemblyWriter::printMDNodeBody(const MDNode *Node) {
- WriteMDNodeBodyInternal(Out, Node, &TypePrinter, &Machine, TheModule);
+ auto WriterCtx = getContext();
+ WriteMDNodeBodyInternal(Out, Node, WriterCtx);
}
void AssemblyWriter::writeAttribute(const Attribute &Attr, bool InAttrGroup) {
@@ -4626,15 +4622,20 @@ void Value::print(raw_ostream &ROS, ModuleSlotTracker &MST,
W.printGlobal(V);
else if (const Function *F = dyn_cast<Function>(GV))
W.printFunction(F);
+ else if (const GlobalAlias *A = dyn_cast<GlobalAlias>(GV))
+ W.printAlias(A);
+ else if (const GlobalIFunc *I = dyn_cast<GlobalIFunc>(GV))
+ W.printIFunc(I);
else
- W.printIndirectSymbol(cast<GlobalIndirectSymbol>(GV));
+ llvm_unreachable("Unknown GlobalValue to print out!");
} else if (const MetadataAsValue *V = dyn_cast<MetadataAsValue>(this)) {
V->getMetadata()->print(ROS, MST, getModuleFromVal(V));
} else if (const Constant *C = dyn_cast<Constant>(this)) {
TypePrinting TypePrinter;
TypePrinter.print(C->getType(), OS);
OS << ' ';
- WriteConstantInternal(OS, C, TypePrinter, MST.getMachine(), nullptr);
+ AsmWriterContext WriterCtx(&TypePrinter, MST.getMachine());
+ WriteConstantInternal(OS, C, WriterCtx);
} else if (isa<InlineAsm>(this) || isa<Argument>(this)) {
this->printAsOperand(OS, /* PrintType */ true, MST);
} else {
@@ -4649,7 +4650,8 @@ static bool printWithoutType(const Value &V, raw_ostream &O,
SlotTracker *Machine, const Module *M) {
if (V.hasName() || isa<GlobalValue>(V) ||
(!isa<Constant>(V) && !isa<MetadataAsValue>(V))) {
- WriteAsOperandInternal(O, &V, nullptr, Machine, M);
+ AsmWriterContext WriterCtx(nullptr, Machine, M);
+ WriteAsOperandInternal(O, &V, WriterCtx);
return true;
}
return false;
@@ -4663,8 +4665,8 @@ static void printAsOperandImpl(const Value &V, raw_ostream &O, bool PrintType,
O << ' ';
}
- WriteAsOperandInternal(O, &V, &TypePrinter, MST.getMachine(),
- MST.getModule());
+ AsmWriterContext WriterCtx(&TypePrinter, MST.getMachine(), MST.getModule());
+ WriteAsOperandInternal(O, &V, WriterCtx);
}
void Value::printAsOperand(raw_ostream &O, bool PrintType,
@@ -4691,22 +4693,87 @@ void Value::printAsOperand(raw_ostream &O, bool PrintType,
printAsOperandImpl(*this, O, PrintType, MST);
}
+/// Recursive version of printMetadataImpl.
+static void printMetadataImplRec(raw_ostream &ROS, const Metadata &MD,
+ AsmWriterContext &WriterCtx) {
+ formatted_raw_ostream OS(ROS);
+ WriteAsOperandInternal(OS, &MD, WriterCtx, /* FromValue */ true);
+
+ auto *N = dyn_cast<MDNode>(&MD);
+ if (!N || isa<DIExpression>(MD) || isa<DIArgList>(MD))
+ return;
+
+ OS << " = ";
+ WriteMDNodeBodyInternal(OS, N, WriterCtx);
+}
+
+namespace {
+struct MDTreeAsmWriterContext : public AsmWriterContext {
+ unsigned Level;
+ // {Level, Printed string}
+ using EntryTy = std::pair<unsigned, std::string>;
+ SmallVector<EntryTy, 4> Buffer;
+
+  // Used to break cycles, if there are any.
+ SmallPtrSet<const Metadata *, 4> Visited;
+
+ raw_ostream &MainOS;
+
+ MDTreeAsmWriterContext(TypePrinting *TP, SlotTracker *ST, const Module *M,
+ raw_ostream &OS, const Metadata *InitMD)
+ : AsmWriterContext(TP, ST, M), Level(0U), Visited({InitMD}), MainOS(OS) {}
+
+ void onWriteMetadataAsOperand(const Metadata *MD) override {
+ if (Visited.count(MD))
+ return;
+ Visited.insert(MD);
+
+ std::string Str;
+ raw_string_ostream SS(Str);
+ ++Level;
+    // A placeholder entry to remember the correct
+    // position in the buffer.
+ Buffer.emplace_back(std::make_pair(Level, ""));
+ unsigned InsertIdx = Buffer.size() - 1;
+
+ printMetadataImplRec(SS, *MD, *this);
+ Buffer[InsertIdx].second = std::move(SS.str());
+ --Level;
+ }
+
+ ~MDTreeAsmWriterContext() {
+ for (const auto &Entry : Buffer) {
+ MainOS << "\n";
+ unsigned NumIndent = Entry.first * 2U;
+ MainOS.indent(NumIndent) << Entry.second;
+ }
+ }
+};
+} // end anonymous namespace
+
static void printMetadataImpl(raw_ostream &ROS, const Metadata &MD,
ModuleSlotTracker &MST, const Module *M,
- bool OnlyAsOperand) {
+ bool OnlyAsOperand, bool PrintAsTree = false) {
formatted_raw_ostream OS(ROS);
TypePrinting TypePrinter(M);
- WriteAsOperandInternal(OS, &MD, &TypePrinter, MST.getMachine(), M,
- /* FromValue */ true);
+ std::unique_ptr<AsmWriterContext> WriterCtx;
+ if (PrintAsTree && !OnlyAsOperand)
+ WriterCtx = std::make_unique<MDTreeAsmWriterContext>(
+ &TypePrinter, MST.getMachine(), M, OS, &MD);
+ else
+ WriterCtx =
+ std::make_unique<AsmWriterContext>(&TypePrinter, MST.getMachine(), M);
+
+ WriteAsOperandInternal(OS, &MD, *WriterCtx, /* FromValue */ true);
auto *N = dyn_cast<MDNode>(&MD);
if (OnlyAsOperand || !N || isa<DIExpression>(MD) || isa<DIArgList>(MD))
return;
OS << " = ";
- WriteMDNodeBodyInternal(OS, N, &TypePrinter, MST.getMachine(), M);
+ WriteMDNodeBodyInternal(OS, N, *WriterCtx);
}
void Metadata::printAsOperand(raw_ostream &OS, const Module *M) const {
@@ -4730,6 +4797,18 @@ void Metadata::print(raw_ostream &OS, ModuleSlotTracker &MST,
printMetadataImpl(OS, *this, MST, M, /* OnlyAsOperand */ false);
}
+void MDNode::printTree(raw_ostream &OS, const Module *M) const {
+ ModuleSlotTracker MST(M, true);
+ printMetadataImpl(OS, *this, MST, M, /* OnlyAsOperand */ false,
+ /*PrintAsTree=*/true);
+}
+
+void MDNode::printTree(raw_ostream &OS, ModuleSlotTracker &MST,
+ const Module *M) const {
+ printMetadataImpl(OS, *this, MST, M, /* OnlyAsOperand */ false,
+ /*PrintAsTree=*/true);
+}
+
void ModuleSummaryIndex::print(raw_ostream &ROS, bool IsForDebug) const {
SlotTracker SlotTable(this);
formatted_raw_ostream OS(ROS);
@@ -4781,6 +4860,15 @@ void Metadata::dump(const Module *M) const {
dbgs() << '\n';
}
+LLVM_DUMP_METHOD
+void MDNode::dumpTree() const { dumpTree(nullptr); }
+
+LLVM_DUMP_METHOD
+void MDNode::dumpTree(const Module *M) const {
+ printTree(dbgs(), M);
+ dbgs() << '\n';
+}
+
// Allow printing of ModuleSummaryIndex from the debugger.
LLVM_DUMP_METHOD
void ModuleSummaryIndex::dump() const { print(dbgs(), /*IsForDebug=*/true); }
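A minimal usage sketch of the MDNode::printTree()/dumpTree() interface added above; the wrapper function and the choice of errs() are illustrative, not part of the patch:

#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

// Print a metadata node followed by the bodies of the nodes it references,
// one per indented line, instead of only their !N slot numbers.
static void debugPrintMDTree(const llvm::MDNode *N, const llvm::Module *M) {
  N->printTree(llvm::errs(), M); // dumpTree(M) does the same to dbgs(), plus a newline
  llvm::errs() << '\n';
}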
diff --git a/llvm/lib/IR/Assumptions.cpp b/llvm/lib/IR/Assumptions.cpp
index 6498114cd60d..3d24ae062841 100644
--- a/llvm/lib/IR/Assumptions.cpp
+++ b/llvm/lib/IR/Assumptions.cpp
@@ -6,17 +6,23 @@
//
//===----------------------------------------------------------------------===//
//
+// This file implements helper functions for accessing assumption information
+// inside of the "llvm.assume" metadata.
+//
//===----------------------------------------------------------------------===//
#include "llvm/IR/Assumptions.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
using namespace llvm;
-bool llvm::hasAssumption(Function &F,
- const KnownAssumptionString &AssumptionStr) {
- const Attribute &A = F.getFnAttribute(AssumptionAttrKey);
+namespace {
+bool hasAssumption(const Attribute &A,
+ const KnownAssumptionString &AssumptionStr) {
if (!A.isValid())
return false;
assert(A.isStringAttribute() && "Expected a string attribute!");
@@ -24,9 +30,76 @@ bool llvm::hasAssumption(Function &F,
SmallVector<StringRef, 8> Strings;
A.getValueAsString().split(Strings, ",");
- return llvm::any_of(Strings, [=](StringRef Assumption) {
- return Assumption == AssumptionStr;
- });
+ return llvm::is_contained(Strings, AssumptionStr);
+}
+
+DenseSet<StringRef> getAssumptions(const Attribute &A) {
+ if (!A.isValid())
+ return DenseSet<StringRef>();
+ assert(A.isStringAttribute() && "Expected a string attribute!");
+
+ DenseSet<StringRef> Assumptions;
+ SmallVector<StringRef, 8> Strings;
+ A.getValueAsString().split(Strings, ",");
+
+ for (StringRef Str : Strings)
+ Assumptions.insert(Str);
+ return Assumptions;
+}
+
+template <typename AttrSite>
+bool addAssumptionsImpl(AttrSite &Site,
+ const DenseSet<StringRef> &Assumptions) {
+ if (Assumptions.empty())
+ return false;
+
+ DenseSet<StringRef> CurAssumptions = getAssumptions(Site);
+
+ if (!set_union(CurAssumptions, Assumptions))
+ return false;
+
+ LLVMContext &Ctx = Site.getContext();
+ Site.addFnAttr(llvm::Attribute::get(
+ Ctx, llvm::AssumptionAttrKey,
+ llvm::join(CurAssumptions.begin(), CurAssumptions.end(), ",")));
+
+ return true;
+}
+} // namespace
+
+bool llvm::hasAssumption(const Function &F,
+ const KnownAssumptionString &AssumptionStr) {
+ const Attribute &A = F.getFnAttribute(AssumptionAttrKey);
+ return ::hasAssumption(A, AssumptionStr);
+}
+
+bool llvm::hasAssumption(const CallBase &CB,
+ const KnownAssumptionString &AssumptionStr) {
+ if (Function *F = CB.getCalledFunction())
+ if (hasAssumption(*F, AssumptionStr))
+ return true;
+
+ const Attribute &A = CB.getFnAttr(AssumptionAttrKey);
+ return ::hasAssumption(A, AssumptionStr);
+}
+
+DenseSet<StringRef> llvm::getAssumptions(const Function &F) {
+ const Attribute &A = F.getFnAttribute(AssumptionAttrKey);
+ return ::getAssumptions(A);
+}
+
+DenseSet<StringRef> llvm::getAssumptions(const CallBase &CB) {
+ const Attribute &A = CB.getFnAttr(AssumptionAttrKey);
+ return ::getAssumptions(A);
+}
+
+bool llvm::addAssumptions(Function &F, const DenseSet<StringRef> &Assumptions) {
+ return ::addAssumptionsImpl(F, Assumptions);
+}
+
+bool llvm::addAssumptions(CallBase &CB,
+ const DenseSet<StringRef> &Assumptions) {
+ return ::addAssumptionsImpl(CB, Assumptions);
}
StringSet<> llvm::KnownAssumptionStrings({
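A hedged sketch of how the extended assumption helpers might be used; the wrapper and the choice of "omp_no_openmp" are illustrative (any string registered in KnownAssumptionStrings works the same way):

#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Assumptions.h"
#include "llvm/IR/Function.h"

// Returns true if the "llvm.assume" attribute on F actually changed.
static bool addNoOpenMPAssumption(llvm::Function &F) {
  if (llvm::hasAssumption(F, llvm::KnownAssumptionString("omp_no_openmp")))
    return false;
  // getAssumptions()/hasAssumption() read the comma-separated attribute
  // value; addAssumptions() unions new strings into it.
  llvm::DenseSet<llvm::StringRef> ToAdd;
  ToAdd.insert("omp_no_openmp");
  return llvm::addAssumptions(F, ToAdd);
}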
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
index 5cd1bafccc47..f81a446d6e46 100644
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -812,42 +812,13 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C, const AttrBuilder &B) {
if (!B.contains(Kind))
continue;
- if (Attribute::isTypeAttrKind(Kind)) {
- Attrs.push_back(Attribute::get(C, Kind, B.getTypeAttr(Kind)));
- continue;
- }
-
Attribute Attr;
- switch (Kind) {
- case Attribute::Alignment:
- assert(B.getAlignment() && "Alignment must be set");
- Attr = Attribute::getWithAlignment(C, *B.getAlignment());
- break;
- case Attribute::StackAlignment:
- assert(B.getStackAlignment() && "StackAlignment must be set");
- Attr = Attribute::getWithStackAlignment(C, *B.getStackAlignment());
- break;
- case Attribute::Dereferenceable:
- Attr = Attribute::getWithDereferenceableBytes(
- C, B.getDereferenceableBytes());
- break;
- case Attribute::DereferenceableOrNull:
- Attr = Attribute::getWithDereferenceableOrNullBytes(
- C, B.getDereferenceableOrNullBytes());
- break;
- case Attribute::AllocSize: {
- auto A = B.getAllocSizeArgs();
- Attr = Attribute::getWithAllocSizeArgs(C, A.first, A.second);
- break;
- }
- case Attribute::VScaleRange: {
- auto A = B.getVScaleRangeArgs();
- Attr = Attribute::getWithVScaleRangeArgs(C, A.first, A.second);
- break;
- }
- default:
+ if (Attribute::isTypeAttrKind(Kind))
+ Attr = Attribute::get(C, Kind, B.getTypeAttr(Kind));
+ else if (Attribute::isIntAttrKind(Kind))
+ Attr = Attribute::get(C, Kind, B.getRawIntAttr(Kind));
+ else
Attr = Attribute::get(C, Kind);
- }
Attrs.push_back(Attr);
}
@@ -1209,33 +1180,36 @@ AttributeList AttributeList::get(LLVMContext &C,
return getImpl(C, NewAttrSets);
}
-AttributeList AttributeList::addAttribute(LLVMContext &C, unsigned Index,
- Attribute::AttrKind Kind) const {
- if (hasAttribute(Index, Kind)) return *this;
+AttributeList
+AttributeList::addAttributeAtIndex(LLVMContext &C, unsigned Index,
+ Attribute::AttrKind Kind) const {
+ if (hasAttributeAtIndex(Index, Kind))
+ return *this;
AttributeSet Attrs = getAttributes(Index);
// TODO: Insert at correct position and avoid sort.
SmallVector<Attribute, 8> NewAttrs(Attrs.begin(), Attrs.end());
NewAttrs.push_back(Attribute::get(C, Kind));
- return setAttributes(C, Index, AttributeSet::get(C, NewAttrs));
+ return setAttributesAtIndex(C, Index, AttributeSet::get(C, NewAttrs));
}
-AttributeList AttributeList::addAttribute(LLVMContext &C, unsigned Index,
- StringRef Kind,
- StringRef Value) const {
+AttributeList AttributeList::addAttributeAtIndex(LLVMContext &C, unsigned Index,
+ StringRef Kind,
+ StringRef Value) const {
AttrBuilder B;
B.addAttribute(Kind, Value);
- return addAttributes(C, Index, B);
+ return addAttributesAtIndex(C, Index, B);
}
-AttributeList AttributeList::addAttribute(LLVMContext &C, unsigned Index,
- Attribute A) const {
+AttributeList AttributeList::addAttributeAtIndex(LLVMContext &C, unsigned Index,
+ Attribute A) const {
AttrBuilder B;
B.addAttribute(A);
- return addAttributes(C, Index, B);
+ return addAttributesAtIndex(C, Index, B);
}
-AttributeList AttributeList::setAttributes(LLVMContext &C, unsigned Index,
- AttributeSet Attrs) const {
+AttributeList AttributeList::setAttributesAtIndex(LLVMContext &C,
+ unsigned Index,
+ AttributeSet Attrs) const {
Index = attrIdxToArrayIdx(Index);
SmallVector<AttributeSet, 4> AttrSets(this->begin(), this->end());
if (Index >= AttrSets.size())
@@ -1244,8 +1218,9 @@ AttributeList AttributeList::setAttributes(LLVMContext &C, unsigned Index,
return AttributeList::getImpl(C, AttrSets);
}
-AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index,
- const AttrBuilder &B) const {
+AttributeList AttributeList::addAttributesAtIndex(LLVMContext &C,
+ unsigned Index,
+ const AttrBuilder &B) const {
if (!B.hasAttributes())
return *this;
@@ -1263,7 +1238,7 @@ AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index,
AttrBuilder Merged(getAttributes(Index));
Merged.merge(B);
- return setAttributes(C, Index, AttributeSet::get(C, Merged));
+ return setAttributesAtIndex(C, Index, AttributeSet::get(C, Merged));
}
AttributeList AttributeList::addParamAttribute(LLVMContext &C,
@@ -1286,9 +1261,11 @@ AttributeList AttributeList::addParamAttribute(LLVMContext &C,
return getImpl(C, AttrSets);
}
-AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index,
- Attribute::AttrKind Kind) const {
- if (!hasAttribute(Index, Kind)) return *this;
+AttributeList
+AttributeList::removeAttributeAtIndex(LLVMContext &C, unsigned Index,
+ Attribute::AttrKind Kind) const {
+ if (!hasAttributeAtIndex(Index, Kind))
+ return *this;
Index = attrIdxToArrayIdx(Index);
SmallVector<AttributeSet, 4> AttrSets(this->begin(), this->end());
@@ -1299,9 +1276,11 @@ AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index,
return getImpl(C, AttrSets);
}
-AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index,
- StringRef Kind) const {
- if (!hasAttribute(Index, Kind)) return *this;
+AttributeList AttributeList::removeAttributeAtIndex(LLVMContext &C,
+ unsigned Index,
+ StringRef Kind) const {
+ if (!hasAttributeAtIndex(Index, Kind))
+ return *this;
Index = attrIdxToArrayIdx(Index);
SmallVector<AttributeSet, 4> AttrSets(this->begin(), this->end());
@@ -1313,18 +1292,19 @@ AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index,
}
AttributeList
-AttributeList::removeAttributes(LLVMContext &C, unsigned Index,
- const AttrBuilder &AttrsToRemove) const {
+AttributeList::removeAttributesAtIndex(LLVMContext &C, unsigned Index,
+ const AttrBuilder &AttrsToRemove) const {
AttributeSet Attrs = getAttributes(Index);
AttributeSet NewAttrs = Attrs.removeAttributes(C, AttrsToRemove);
// If nothing was removed, return the original list.
if (Attrs == NewAttrs)
return *this;
- return setAttributes(C, Index, NewAttrs);
+ return setAttributesAtIndex(C, Index, NewAttrs);
}
-AttributeList AttributeList::removeAttributes(LLVMContext &C,
- unsigned WithoutIndex) const {
+AttributeList
+AttributeList::removeAttributesAtIndex(LLVMContext &C,
+ unsigned WithoutIndex) const {
if (!pImpl)
return {};
WithoutIndex = attrIdxToArrayIdx(WithoutIndex);
@@ -1335,79 +1315,73 @@ AttributeList AttributeList::removeAttributes(LLVMContext &C,
return getImpl(C, AttrSets);
}
-AttributeList AttributeList::addDereferenceableAttr(LLVMContext &C,
- unsigned Index,
- uint64_t Bytes) const {
+AttributeList AttributeList::addDereferenceableRetAttr(LLVMContext &C,
+ uint64_t Bytes) const {
AttrBuilder B;
B.addDereferenceableAttr(Bytes);
- return addAttributes(C, Index, B);
+ return addRetAttributes(C, B);
}
-AttributeList
-AttributeList::addDereferenceableOrNullAttr(LLVMContext &C, unsigned Index,
- uint64_t Bytes) const {
+AttributeList AttributeList::addDereferenceableParamAttr(LLVMContext &C,
+ unsigned Index,
+ uint64_t Bytes) const {
AttrBuilder B;
- B.addDereferenceableOrNullAttr(Bytes);
- return addAttributes(C, Index, B);
+ B.addDereferenceableAttr(Bytes);
+ return addParamAttributes(C, Index, B);
}
AttributeList
-AttributeList::addAllocSizeAttr(LLVMContext &C, unsigned Index,
- unsigned ElemSizeArg,
- const Optional<unsigned> &NumElemsArg) {
+AttributeList::addDereferenceableOrNullParamAttr(LLVMContext &C, unsigned Index,
+ uint64_t Bytes) const {
AttrBuilder B;
- B.addAllocSizeAttr(ElemSizeArg, NumElemsArg);
- return addAttributes(C, Index, B);
+ B.addDereferenceableOrNullAttr(Bytes);
+ return addParamAttributes(C, Index, B);
}
-AttributeList AttributeList::addVScaleRangeAttr(LLVMContext &C, unsigned Index,
- unsigned MinValue,
- unsigned MaxValue) {
+AttributeList
+AttributeList::addAllocSizeParamAttr(LLVMContext &C, unsigned Index,
+ unsigned ElemSizeArg,
+ const Optional<unsigned> &NumElemsArg) {
AttrBuilder B;
- B.addVScaleRangeAttr(MinValue, MaxValue);
- return addAttributes(C, Index, B);
+ B.addAllocSizeAttr(ElemSizeArg, NumElemsArg);
+ return addParamAttributes(C, Index, B);
}
//===----------------------------------------------------------------------===//
// AttributeList Accessor Methods
//===----------------------------------------------------------------------===//
-AttributeSet AttributeList::getParamAttributes(unsigned ArgNo) const {
+AttributeSet AttributeList::getParamAttrs(unsigned ArgNo) const {
return getAttributes(ArgNo + FirstArgIndex);
}
-AttributeSet AttributeList::getRetAttributes() const {
+AttributeSet AttributeList::getRetAttrs() const {
return getAttributes(ReturnIndex);
}
-AttributeSet AttributeList::getFnAttributes() const {
+AttributeSet AttributeList::getFnAttrs() const {
return getAttributes(FunctionIndex);
}
-bool AttributeList::hasAttribute(unsigned Index,
- Attribute::AttrKind Kind) const {
+bool AttributeList::hasAttributeAtIndex(unsigned Index,
+ Attribute::AttrKind Kind) const {
return getAttributes(Index).hasAttribute(Kind);
}
-bool AttributeList::hasAttribute(unsigned Index, StringRef Kind) const {
+bool AttributeList::hasAttributeAtIndex(unsigned Index, StringRef Kind) const {
return getAttributes(Index).hasAttribute(Kind);
}
-bool AttributeList::hasAttributes(unsigned Index) const {
+bool AttributeList::hasAttributesAtIndex(unsigned Index) const {
return getAttributes(Index).hasAttributes();
}
-bool AttributeList::hasFnAttribute(Attribute::AttrKind Kind) const {
+bool AttributeList::hasFnAttr(Attribute::AttrKind Kind) const {
return pImpl && pImpl->hasFnAttribute(Kind);
}
-bool AttributeList::hasFnAttribute(StringRef Kind) const {
- return hasAttribute(AttributeList::FunctionIndex, Kind);
-}
-
-bool AttributeList::hasParamAttribute(unsigned ArgNo,
- Attribute::AttrKind Kind) const {
- return hasAttribute(ArgNo + FirstArgIndex, Kind);
+bool AttributeList::hasFnAttr(StringRef Kind) const {
+ return hasAttributeAtIndex(AttributeList::FunctionIndex, Kind);
}
bool AttributeList::hasAttrSomewhere(Attribute::AttrKind Attr,
@@ -1415,12 +1389,13 @@ bool AttributeList::hasAttrSomewhere(Attribute::AttrKind Attr,
return pImpl && pImpl->hasAttrSomewhere(Attr, Index);
}
-Attribute AttributeList::getAttribute(unsigned Index,
- Attribute::AttrKind Kind) const {
+Attribute AttributeList::getAttributeAtIndex(unsigned Index,
+ Attribute::AttrKind Kind) const {
return getAttributes(Index).getAttribute(Kind);
}
-Attribute AttributeList::getAttribute(unsigned Index, StringRef Kind) const {
+Attribute AttributeList::getAttributeAtIndex(unsigned Index,
+ StringRef Kind) const {
return getAttributes(Index).getAttribute(Kind);
}
@@ -1460,26 +1435,29 @@ Type *AttributeList::getParamElementType(unsigned Index) const {
return getAttributes(Index + FirstArgIndex).getElementType();
}
-MaybeAlign AttributeList::getStackAlignment(unsigned Index) const {
- return getAttributes(Index).getStackAlignment();
+MaybeAlign AttributeList::getFnStackAlignment() const {
+ return getFnAttrs().getStackAlignment();
}
-uint64_t AttributeList::getDereferenceableBytes(unsigned Index) const {
- return getAttributes(Index).getDereferenceableBytes();
+MaybeAlign AttributeList::getRetStackAlignment() const {
+ return getRetAttrs().getStackAlignment();
}
-uint64_t AttributeList::getDereferenceableOrNullBytes(unsigned Index) const {
- return getAttributes(Index).getDereferenceableOrNullBytes();
+uint64_t AttributeList::getRetDereferenceableBytes() const {
+ return getRetAttrs().getDereferenceableBytes();
}
-std::pair<unsigned, Optional<unsigned>>
-AttributeList::getAllocSizeArgs(unsigned Index) const {
- return getAttributes(Index).getAllocSizeArgs();
+uint64_t AttributeList::getParamDereferenceableBytes(unsigned Index) const {
+ return getParamAttrs(Index).getDereferenceableBytes();
}
-std::pair<unsigned, unsigned>
-AttributeList::getVScaleRangeArgs(unsigned Index) const {
- return getAttributes(Index).getVScaleRangeArgs();
+uint64_t AttributeList::getRetDereferenceableOrNullBytes() const {
+ return getRetAttrs().getDereferenceableOrNullBytes();
+}
+
+uint64_t
+AttributeList::getParamDereferenceableOrNullBytes(unsigned Index) const {
+ return getParamAttrs(Index).getDereferenceableOrNullBytes();
}
std::string AttributeList::getAsString(unsigned Index, bool InAttrGrp) const {
@@ -1520,7 +1498,7 @@ unsigned AttributeList::getNumAttrSets() const {
void AttributeList::print(raw_ostream &O) const {
O << "AttributeList[\n";
- for (unsigned i = index_begin(), e = index_end(); i != e; ++i) {
+ for (unsigned i : indexes()) {
if (!getAttributes(i).hasAttributes())
continue;
O << " { ";
@@ -1563,15 +1541,18 @@ AttrBuilder::AttrBuilder(AttributeSet AS) {
void AttrBuilder::clear() {
Attrs.reset();
TargetDepAttrs.clear();
- Alignment.reset();
- StackAlignment.reset();
- DerefBytes = DerefOrNullBytes = 0;
- AllocSizeArgs = 0;
- VScaleRangeArgs = 0;
+ IntAttrs = {};
TypeAttrs = {};
}
Optional<unsigned>
+AttrBuilder::kindToIntIndex(Attribute::AttrKind Kind) const {
+ if (Attribute::isIntAttrKind(Kind))
+ return Kind - Attribute::FirstIntAttr;
+ return None;
+}
+
+Optional<unsigned>
AttrBuilder::kindToTypeIndex(Attribute::AttrKind Kind) const {
if (Attribute::isTypeAttrKind(Kind))
return Kind - Attribute::FirstTypeAttr;
@@ -1589,18 +1570,8 @@ AttrBuilder &AttrBuilder::addAttribute(Attribute Attr) {
if (Optional<unsigned> TypeIndex = kindToTypeIndex(Kind))
TypeAttrs[*TypeIndex] = Attr.getValueAsType();
- else if (Kind == Attribute::Alignment)
- Alignment = Attr.getAlignment();
- else if (Kind == Attribute::StackAlignment)
- StackAlignment = Attr.getStackAlignment();
- else if (Kind == Attribute::Dereferenceable)
- DerefBytes = Attr.getDereferenceableBytes();
- else if (Kind == Attribute::DereferenceableOrNull)
- DerefOrNullBytes = Attr.getDereferenceableOrNullBytes();
- else if (Kind == Attribute::AllocSize)
- AllocSizeArgs = Attr.getValueAsInt();
- else if (Kind == Attribute::VScaleRange)
- VScaleRangeArgs = Attr.getValueAsInt();
+ else if (Optional<unsigned> IntIndex = kindToIntIndex(Kind))
+ IntAttrs[*IntIndex] = Attr.getValueAsInt();
return *this;
}
@@ -1616,18 +1587,8 @@ AttrBuilder &AttrBuilder::removeAttribute(Attribute::AttrKind Val) {
if (Optional<unsigned> TypeIndex = kindToTypeIndex(Val))
TypeAttrs[*TypeIndex] = nullptr;
- else if (Val == Attribute::Alignment)
- Alignment.reset();
- else if (Val == Attribute::StackAlignment)
- StackAlignment.reset();
- else if (Val == Attribute::Dereferenceable)
- DerefBytes = 0;
- else if (Val == Attribute::DereferenceableOrNull)
- DerefOrNullBytes = 0;
- else if (Val == Attribute::AllocSize)
- AllocSizeArgs = 0;
- else if (Val == Attribute::VScaleRange)
- VScaleRangeArgs = 0;
+ else if (Optional<unsigned> IntIndex = kindToIntIndex(Val))
+ IntAttrs[*IntIndex] = 0;
return *this;
}
@@ -1638,18 +1599,32 @@ AttrBuilder &AttrBuilder::removeAttributes(AttributeList A, uint64_t Index) {
}
AttrBuilder &AttrBuilder::removeAttribute(StringRef A) {
- auto I = TargetDepAttrs.find(A);
- if (I != TargetDepAttrs.end())
- TargetDepAttrs.erase(I);
+ TargetDepAttrs.erase(A);
+ return *this;
+}
+
+uint64_t AttrBuilder::getRawIntAttr(Attribute::AttrKind Kind) const {
+ Optional<unsigned> IntIndex = kindToIntIndex(Kind);
+ assert(IntIndex && "Not an int attribute");
+ return IntAttrs[*IntIndex];
+}
+
+AttrBuilder &AttrBuilder::addRawIntAttr(Attribute::AttrKind Kind,
+ uint64_t Value) {
+ Optional<unsigned> IntIndex = kindToIntIndex(Kind);
+ assert(IntIndex && "Not an int attribute");
+ assert(Value && "Value cannot be zero");
+ Attrs[Kind] = true;
+ IntAttrs[*IntIndex] = Value;
return *this;
}
std::pair<unsigned, Optional<unsigned>> AttrBuilder::getAllocSizeArgs() const {
- return unpackAllocSizeArgs(AllocSizeArgs);
+ return unpackAllocSizeArgs(getRawIntAttr(Attribute::AllocSize));
}
std::pair<unsigned, unsigned> AttrBuilder::getVScaleRangeArgs() const {
- return unpackVScaleRangeArgs(VScaleRangeArgs);
+ return unpackVScaleRangeArgs(getRawIntAttr(Attribute::VScaleRange));
}
AttrBuilder &AttrBuilder::addAlignmentAttr(MaybeAlign Align) {
@@ -1657,10 +1632,7 @@ AttrBuilder &AttrBuilder::addAlignmentAttr(MaybeAlign Align) {
return *this;
assert(*Align <= llvm::Value::MaximumAlignment && "Alignment too large.");
-
- Attrs[Attribute::Alignment] = true;
- Alignment = Align;
- return *this;
+ return addRawIntAttr(Attribute::Alignment, Align->value());
}
AttrBuilder &AttrBuilder::addStackAlignmentAttr(MaybeAlign Align) {
@@ -1669,27 +1641,20 @@ AttrBuilder &AttrBuilder::addStackAlignmentAttr(MaybeAlign Align) {
return *this;
assert(*Align <= 0x100 && "Alignment too large.");
-
- Attrs[Attribute::StackAlignment] = true;
- StackAlignment = Align;
- return *this;
+ return addRawIntAttr(Attribute::StackAlignment, Align->value());
}
AttrBuilder &AttrBuilder::addDereferenceableAttr(uint64_t Bytes) {
if (Bytes == 0) return *this;
- Attrs[Attribute::Dereferenceable] = true;
- DerefBytes = Bytes;
- return *this;
+ return addRawIntAttr(Attribute::Dereferenceable, Bytes);
}
AttrBuilder &AttrBuilder::addDereferenceableOrNullAttr(uint64_t Bytes) {
if (Bytes == 0)
return *this;
- Attrs[Attribute::DereferenceableOrNull] = true;
- DerefOrNullBytes = Bytes;
- return *this;
+ return addRawIntAttr(Attribute::DereferenceableOrNull, Bytes);
}
AttrBuilder &AttrBuilder::addAllocSizeAttr(unsigned ElemSize,
@@ -1700,12 +1665,7 @@ AttrBuilder &AttrBuilder::addAllocSizeAttr(unsigned ElemSize,
AttrBuilder &AttrBuilder::addAllocSizeAttrFromRawRepr(uint64_t RawArgs) {
// (0, 0) is our "not present" value, so we need to check for it here.
assert(RawArgs && "Invalid allocsize arguments -- given allocsize(0, 0)");
-
- Attrs[Attribute::AllocSize] = true;
- // Reuse existing machinery to store this as a single 64-bit integer so we can
- // save a few bytes over using a pair<unsigned, Optional<unsigned>>.
- AllocSizeArgs = RawArgs;
- return *this;
+ return addRawIntAttr(Attribute::AllocSize, RawArgs);
}
AttrBuilder &AttrBuilder::addVScaleRangeAttr(unsigned MinValue,
@@ -1718,11 +1678,7 @@ AttrBuilder &AttrBuilder::addVScaleRangeAttrFromRawRepr(uint64_t RawArgs) {
if (RawArgs == 0)
return *this;
- Attrs[Attribute::VScaleRange] = true;
- // Reuse existing machinery to store this as a single 64-bit integer so we can
- // save a few bytes over using a pair<unsigned, unsigned>.
- VScaleRangeArgs = RawArgs;
- return *this;
+ return addRawIntAttr(Attribute::VScaleRange, RawArgs);
}
Type *AttrBuilder::getTypeAttr(Attribute::AttrKind Kind) const {
@@ -1760,24 +1716,10 @@ AttrBuilder &AttrBuilder::addInAllocaAttr(Type *Ty) {
}
AttrBuilder &AttrBuilder::merge(const AttrBuilder &B) {
- // FIXME: What if both have alignments, but they don't match?!
- if (!Alignment)
- Alignment = B.Alignment;
-
- if (!StackAlignment)
- StackAlignment = B.StackAlignment;
-
- if (!DerefBytes)
- DerefBytes = B.DerefBytes;
-
- if (!DerefOrNullBytes)
- DerefOrNullBytes = B.DerefOrNullBytes;
-
- if (!AllocSizeArgs)
- AllocSizeArgs = B.AllocSizeArgs;
-
- if (!VScaleRangeArgs)
- VScaleRangeArgs = B.VScaleRangeArgs;
+ // FIXME: What if both have an int/type attribute, but they don't match?!
+ for (unsigned Index = 0; Index < Attribute::NumIntAttrKinds; ++Index)
+ if (!IntAttrs[Index])
+ IntAttrs[Index] = B.IntAttrs[Index];
for (unsigned Index = 0; Index < Attribute::NumTypeAttrKinds; ++Index)
if (!TypeAttrs[Index])
@@ -1792,24 +1734,10 @@ AttrBuilder &AttrBuilder::merge(const AttrBuilder &B) {
}
AttrBuilder &AttrBuilder::remove(const AttrBuilder &B) {
- // FIXME: What if both have alignments, but they don't match?!
- if (B.Alignment)
- Alignment.reset();
-
- if (B.StackAlignment)
- StackAlignment.reset();
-
- if (B.DerefBytes)
- DerefBytes = 0;
-
- if (B.DerefOrNullBytes)
- DerefOrNullBytes = 0;
-
- if (B.AllocSizeArgs)
- AllocSizeArgs = 0;
-
- if (B.VScaleRangeArgs)
- VScaleRangeArgs = 0;
+ // FIXME: What if both have an int/type attribute, but they don't match?!
+ for (unsigned Index = 0; Index < Attribute::NumIntAttrKinds; ++Index)
+ if (B.IntAttrs[Index])
+ IntAttrs[Index] = 0;
for (unsigned Index = 0; Index < Attribute::NumTypeAttrKinds; ++Index)
if (B.TypeAttrs[Index])
@@ -1861,7 +1789,7 @@ bool AttrBuilder::hasAttributes(AttributeList AL, uint64_t Index) const {
}
bool AttrBuilder::hasAlignmentAttr() const {
- return Alignment != 0;
+ return getRawIntAttr(Attribute::Alignment) != 0;
}
bool AttrBuilder::operator==(const AttrBuilder &B) const {
@@ -1872,9 +1800,7 @@ bool AttrBuilder::operator==(const AttrBuilder &B) const {
if (B.TargetDepAttrs.find(TDA.first) == B.TargetDepAttrs.end())
return false;
- return Alignment == B.Alignment && StackAlignment == B.StackAlignment &&
- DerefBytes == B.DerefBytes && TypeAttrs == B.TypeAttrs &&
- VScaleRangeArgs == B.VScaleRangeArgs;
+ return IntAttrs == B.IntAttrs && TypeAttrs == B.TypeAttrs;
}
//===----------------------------------------------------------------------===//
@@ -1966,11 +1892,11 @@ static void adjustCallerSSPLevel(Function &Caller, const Function &Callee) {
.addAttribute(Attribute::StackProtectReq);
if (Callee.hasFnAttribute(Attribute::StackProtectReq)) {
- Caller.removeAttributes(AttributeList::FunctionIndex, OldSSPAttr);
+ Caller.removeFnAttrs(OldSSPAttr);
Caller.addFnAttr(Attribute::StackProtectReq);
} else if (Callee.hasFnAttribute(Attribute::StackProtectStrong) &&
!Caller.hasFnAttribute(Attribute::StackProtectReq)) {
- Caller.removeAttributes(AttributeList::FunctionIndex, OldSSPAttr);
+ Caller.removeFnAttrs(OldSSPAttr);
Caller.addFnAttr(Attribute::StackProtectStrong);
} else if (Callee.hasFnAttribute(Attribute::StackProtect) &&
!Caller.hasFnAttribute(Attribute::StackProtectReq) &&
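A rough before/after sketch of the AttributeList renames in this file; the query helper below is invented and assumes an arbitrary call site CB:

#include <cstdint>
#include "llvm/IR/Attributes.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

static void queryAttrs(const CallBase &CB) {
  AttributeList AL = CB.getAttributes();
  AttributeSet FnAttrs = AL.getFnAttrs();   // was getFnAttributes()
  AttributeSet RetAttrs = AL.getRetAttrs(); // was getRetAttributes()
  if (CB.arg_size() > 0) {
    AttributeSet Arg0 = AL.getParamAttrs(0); // was getParamAttributes(0)
    (void)Arg0;
  }
  // Raw-index queries now carry an explicit AtIndex suffix...
  bool HasIdx =
      AL.hasAttributeAtIndex(AttributeList::FunctionIndex, Attribute::NoUnwind);
  // ...while the common positions get dedicated wrappers.
  bool HasFn = AL.hasFnAttr(Attribute::NoUnwind); // was hasFnAttribute()
  uint64_t RetDeref = AL.getRetDereferenceableBytes();
  (void)FnAttrs; (void)RetAttrs; (void)HasIdx; (void)HasFn; (void)RetDeref;
}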
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 6271385183eb..d73d1e9c20b3 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -583,8 +583,10 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
// Can't use Intrinsic::getDeclaration here as the return types might
// then only be structurally equal.
FunctionType* fType = FunctionType::get(F->getReturnType(), Tys, false);
+ StringRef Suffix =
+ F->getContext().supportsTypedPointers() ? "p0i8" : "p0";
NewFn = Function::Create(fType, F->getLinkage(), F->getAddressSpace(),
- "llvm." + Name + ".p0i8", F->getParent());
+ "llvm." + Name + "." + Suffix, F->getParent());
return true;
}
static const Regex vstRegex("^arm\\.neon\\.vst([1234]|[234]lane)\\.v[a-z0-9]*$");
@@ -601,7 +603,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
auto fArgs = F->getFunctionType()->params();
Type *Tys[] = {fArgs[0], fArgs[1]};
- if (Name.find("lane") == StringRef::npos)
+ if (!Name.contains("lane"))
NewFn = Intrinsic::getDeclaration(F->getParent(),
StoreInts[fArgs.size() - 3], Tys);
else
@@ -1273,7 +1275,7 @@ static Value *UpgradeX86BinaryIntrinsics(IRBuilder<> &Builder, CallInst &CI,
Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID, Ty);
Value *Res = Builder.CreateCall(Intrin, {Op0, Op1});
- if (CI.getNumArgOperands() == 4) { // For masked intrinsics.
+ if (CI.arg_size() == 4) { // For masked intrinsics.
Value *VecSrc = CI.getOperand(2);
Value *Mask = CI.getOperand(3);
Res = EmitX86Select(Builder, Mask, Res, VecSrc);
@@ -1300,7 +1302,7 @@ static Value *upgradeX86Rotate(IRBuilder<> &Builder, CallInst &CI,
Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID, Ty);
Value *Res = Builder.CreateCall(Intrin, {Src, Src, Amt});
- if (CI.getNumArgOperands() == 4) { // For masked intrinsics.
+ if (CI.arg_size() == 4) { // For masked intrinsics.
Value *VecSrc = CI.getOperand(2);
Value *Mask = CI.getOperand(3);
Res = EmitX86Select(Builder, Mask, Res, VecSrc);
@@ -1370,7 +1372,7 @@ static Value *upgradeX86ConcatShift(IRBuilder<> &Builder, CallInst &CI,
Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID, Ty);
Value *Res = Builder.CreateCall(Intrin, {Op0, Op1, Amt});
- unsigned NumArgs = CI.getNumArgOperands();
+ unsigned NumArgs = CI.arg_size();
if (NumArgs >= 4) { // For masked intrinsics.
Value *VecSrc = NumArgs == 5 ? CI.getArgOperand(3) :
ZeroMask ? ConstantAggregateZero::get(CI.getType()) :
@@ -1431,7 +1433,7 @@ static Value *upgradeAbs(IRBuilder<> &Builder, CallInst &CI) {
Value *Op0 = CI.getArgOperand(0);
Function *F = Intrinsic::getDeclaration(CI.getModule(), Intrinsic::abs, Ty);
Value *Res = Builder.CreateCall(F, {Op0, Builder.getInt1(false)});
- if (CI.getNumArgOperands() == 3)
+ if (CI.arg_size() == 3)
Res = EmitX86Select(Builder, CI.getArgOperand(2), Res, CI.getArgOperand(1));
return Res;
}
@@ -1459,7 +1461,7 @@ static Value *upgradePMULDQ(IRBuilder<> &Builder, CallInst &CI, bool IsSigned) {
Value *Res = Builder.CreateMul(LHS, RHS);
- if (CI.getNumArgOperands() == 4)
+ if (CI.arg_size() == 4)
Res = EmitX86Select(Builder, CI.getArgOperand(3), Res, CI.getArgOperand(2));
return Res;
@@ -1514,7 +1516,7 @@ static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallInst &CI,
Cmp = Builder.CreateICmp(Pred, Op0, CI.getArgOperand(1));
}
- Value *Mask = CI.getArgOperand(CI.getNumArgOperands() - 1);
+ Value *Mask = CI.getArgOperand(CI.arg_size() - 1);
return ApplyX86MaskOn1BitsVec(Builder, Cmp, Mask);
}
@@ -1779,13 +1781,12 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder,
} else
return false;
- SmallVector<Value *, 4> Args(CI.arg_operands().begin(),
- CI.arg_operands().end());
+ SmallVector<Value *, 4> Args(CI.args());
Args.pop_back();
Args.pop_back();
Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI.getModule(), IID),
Args);
- unsigned NumArgs = CI.getNumArgOperands();
+ unsigned NumArgs = CI.arg_size();
Rep = EmitX86Select(Builder, CI.getArgOperand(NumArgs - 1), Rep,
CI.getArgOperand(NumArgs - 2));
return true;
@@ -1964,7 +1965,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
CI->getType()),
{CI->getArgOperand(0)});
} else if (IsX86 && (Name.startswith("avx512.mask.sqrt.p"))) {
- if (CI->getNumArgOperands() == 4 &&
+ if (CI->arg_size() == 4 &&
(!isa<ConstantInt>(CI->getArgOperand(3)) ||
cast<ConstantInt>(CI->getArgOperand(3))->getZExtValue() != 4)) {
Intrinsic::ID IID = Name[18] == 's' ? Intrinsic::x86_avx512_sqrt_ps_512
@@ -2124,8 +2125,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
{ CI->getOperand(0), CI->getArgOperand(1) });
Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2));
} else if (IsX86 && Name.startswith("avx512.cmp.p")) {
- SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
- CI->arg_operands().end());
+ SmallVector<Value *, 4> Args(CI->args());
Type *OpTy = Args[0]->getType();
unsigned VecWidth = OpTy->getPrimitiveSizeInBits();
unsigned EltWidth = OpTy->getScalarSizeInBits();
@@ -2257,7 +2257,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
bool IsUnsigned = (StringRef::npos != Name.find("cvtu"));
if (IsPS2PD)
Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd");
- else if (CI->getNumArgOperands() == 4 &&
+ else if (CI->arg_size() == 4 &&
(!isa<ConstantInt>(CI->getArgOperand(3)) ||
cast<ConstantInt>(CI->getArgOperand(3))->getZExtValue() != 4)) {
Intrinsic::ID IID = IsUnsigned ? Intrinsic::x86_avx512_uitofp_round
@@ -2270,7 +2270,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
: Builder.CreateSIToFP(Rep, DstTy, "cvt");
}
- if (CI->getNumArgOperands() >= 3)
+ if (CI->arg_size() >= 3)
Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
CI->getArgOperand(1));
} else if (IsX86 && (Name.startswith("avx512.mask.vcvtph2ps.") ||
@@ -2286,7 +2286,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateBitCast(
Rep, FixedVectorType::get(Type::getHalfTy(C), NumDstElts));
Rep = Builder.CreateFPExt(Rep, DstTy, "cvtph2ps");
- if (CI->getNumArgOperands() >= 3)
+ if (CI->arg_size() >= 3)
Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
CI->getArgOperand(1));
} else if (IsX86 && (Name.startswith("avx512.mask.loadu."))) {
@@ -2353,7 +2353,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
llvm_unreachable("Unknown suffix");
unsigned Imm;
- if (CI->getNumArgOperands() == 3) {
+ if (CI->arg_size() == 3) {
Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
} else {
Name = Name.substr(9); // strip off "xop.vpcom"
@@ -2417,7 +2417,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
EltTy->getPointerTo());
Value *Load = Builder.CreateLoad(EltTy, Cast);
Type *I32Ty = Type::getInt32Ty(C);
- Rep = UndefValue::get(VecTy);
+ Rep = PoisonValue::get(VecTy);
for (unsigned I = 0; I < EltNum; ++I)
Rep = Builder.CreateInsertElement(Rep, Load,
ConstantInt::get(I32Ty, I));
@@ -2442,7 +2442,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = DoSext ? Builder.CreateSExt(SV, DstTy)
: Builder.CreateZExt(SV, DstTy);
// If there are 3 arguments, it's a masked intrinsic so we need a select.
- if (CI->getNumArgOperands() == 3)
+ if (CI->arg_size() == 3)
Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
CI->getArgOperand(1));
} else if (Name == "avx512.mask.pmov.qd.256" ||
@@ -2518,7 +2518,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
ShuffleVectorInst::getShuffleMask(Constant::getNullValue(MaskTy), M);
Rep = Builder.CreateShuffleVector(Op, M);
- if (CI->getNumArgOperands() == 3)
+ if (CI->arg_size() == 3)
Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
CI->getArgOperand(1));
} else if (IsX86 && (Name.startswith("sse2.padds.") ||
@@ -2636,7 +2636,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateShuffleVector(Op0, Rep, Idxs);
// If the intrinsic has a mask operand, handle that.
- if (CI->getNumArgOperands() == 5)
+ if (CI->arg_size() == 5)
Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep,
CI->getArgOperand(3));
} else if (IsX86 && (Name.startswith("avx.vextractf128.") ||
@@ -2661,7 +2661,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);
// If the intrinsic has a mask operand, handle that.
- if (CI->getNumArgOperands() == 4)
+ if (CI->arg_size() == 4)
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
} else if (!IsX86 && Name == "stackprotectorcheck") {
@@ -2679,7 +2679,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);
- if (CI->getNumArgOperands() == 4)
+ if (CI->arg_size() == 4)
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
} else if (IsX86 && (Name.startswith("avx.vperm2f128.") ||
@@ -2739,7 +2739,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);
- if (CI->getNumArgOperands() == 4)
+ if (CI->arg_size() == 4)
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
} else if (IsX86 && (Name == "sse2.pshufl.w" ||
@@ -2758,7 +2758,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);
- if (CI->getNumArgOperands() == 4)
+ if (CI->arg_size() == 4)
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
} else if (IsX86 && (Name == "sse2.pshufh.w" ||
@@ -2777,7 +2777,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);
- if (CI->getNumArgOperands() == 4)
+ if (CI->arg_size() == 4)
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
} else if (IsX86 && Name.startswith("avx512.mask.shuf.p")) {
@@ -3346,7 +3346,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
if (NegAcc)
C = Builder.CreateFNeg(C);
- if (CI->getNumArgOperands() == 5 &&
+ if (CI->arg_size() == 5 &&
(!isa<ConstantInt>(CI->getArgOperand(4)) ||
cast<ConstantInt>(CI->getArgOperand(4))->getZExtValue() != 4)) {
Intrinsic::ID IID;
@@ -3399,7 +3399,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
// Drop the "avx512.mask." to make it easier.
Name = Name.drop_front(IsMask3 || IsMaskZ ? 13 : 12);
bool IsSubAdd = Name[3] == 's';
- if (CI->getNumArgOperands() == 5) {
+ if (CI->arg_size() == 5) {
Intrinsic::ID IID;
// Check the character before ".512" in string.
if (Name[Name.size()-5] == 's')
@@ -3686,8 +3686,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
case Intrinsic::arm_neon_vst2lane:
case Intrinsic::arm_neon_vst3lane:
case Intrinsic::arm_neon_vst4lane: {
- SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
- CI->arg_operands().end());
+ SmallVector<Value *, 4> Args(CI->args());
NewCall = Builder.CreateCall(NewFn, Args);
break;
}
@@ -3701,14 +3700,14 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
case Intrinsic::aarch64_neon_bfmlalb:
case Intrinsic::aarch64_neon_bfmlalt: {
SmallVector<Value *, 3> Args;
- assert(CI->getNumArgOperands() == 3 &&
+ assert(CI->arg_size() == 3 &&
"Mismatch between function args and call args");
size_t OperandWidth =
CI->getArgOperand(1)->getType()->getPrimitiveSizeInBits();
assert((OperandWidth == 64 || OperandWidth == 128) &&
"Unexpected operand width");
Type *NewTy = FixedVectorType::get(Type::getBFloatTy(C), OperandWidth / 16);
- auto Iter = CI->arg_operands().begin();
+ auto Iter = CI->args().begin();
Args.push_back(*Iter++);
Args.push_back(Builder.CreateBitCast(*Iter++, NewTy));
Args.push_back(Builder.CreateBitCast(*Iter++, NewTy));
@@ -3722,18 +3721,17 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
case Intrinsic::ctlz:
case Intrinsic::cttz:
- assert(CI->getNumArgOperands() == 1 &&
+ assert(CI->arg_size() == 1 &&
"Mismatch between function args and call args");
NewCall =
Builder.CreateCall(NewFn, {CI->getArgOperand(0), Builder.getFalse()});
break;
case Intrinsic::objectsize: {
- Value *NullIsUnknownSize = CI->getNumArgOperands() == 2
- ? Builder.getFalse()
- : CI->getArgOperand(2);
+ Value *NullIsUnknownSize =
+ CI->arg_size() == 2 ? Builder.getFalse() : CI->getArgOperand(2);
Value *Dynamic =
- CI->getNumArgOperands() < 4 ? Builder.getFalse() : CI->getArgOperand(3);
+ CI->arg_size() < 4 ? Builder.getFalse() : CI->getArgOperand(3);
NewCall = Builder.CreateCall(
NewFn, {CI->getArgOperand(0), CI->getArgOperand(1), NullIsUnknownSize, Dynamic});
break;
@@ -3749,7 +3747,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
case Intrinsic::dbg_value:
// Upgrade from the old version that had an extra offset argument.
- assert(CI->getNumArgOperands() == 4);
+ assert(CI->arg_size() == 4);
// Drop nonzero offsets instead of attempting to upgrade them.
if (auto *Offset = dyn_cast_or_null<Constant>(CI->getArgOperand(1)))
if (Offset->isZeroValue()) {
@@ -3763,7 +3761,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
case Intrinsic::ptr_annotation:
// Upgrade from versions that lacked the annotation attribute argument.
- assert(CI->getNumArgOperands() == 4 &&
+ assert(CI->arg_size() == 4 &&
"Before LLVM 12.0 this intrinsic took four arguments");
// Create a new call with an added null annotation attribute argument.
NewCall = Builder.CreateCall(
@@ -3777,7 +3775,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
case Intrinsic::var_annotation:
// Upgrade from versions that lacked the annotation attribute argument.
- assert(CI->getNumArgOperands() == 4 &&
+ assert(CI->arg_size() == 4 &&
"Before LLVM 12.0 this intrinsic took four arguments");
// Create a new call with an added null annotation attribute argument.
NewCall = Builder.CreateCall(
@@ -3796,8 +3794,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
case Intrinsic::x86_xop_vpermil2ps:
case Intrinsic::x86_xop_vpermil2pd_256:
case Intrinsic::x86_xop_vpermil2ps_256: {
- SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
- CI->arg_operands().end());
+ SmallVector<Value *, 4> Args(CI->args());
VectorType *FltIdxTy = cast<VectorType>(Args[2]->getType());
VectorType *IntIdxTy = VectorType::getInteger(FltIdxTy);
Args[2] = Builder.CreateBitCast(Args[2], IntIdxTy);
@@ -3858,8 +3855,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
case Intrinsic::x86_avx2_mpsadbw: {
// Need to truncate the last argument from i32 to i8 -- this argument models
// an inherently 8-bit immediate operand to these x86 instructions.
- SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
- CI->arg_operands().end());
+ SmallVector<Value *, 4> Args(CI->args());
// Replace the last argument with a trunc.
Args.back() = Builder.CreateTrunc(Args.back(), Type::getInt8Ty(C), "trunc");
@@ -3873,8 +3869,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
case Intrinsic::x86_avx512_mask_cmp_ps_128:
case Intrinsic::x86_avx512_mask_cmp_ps_256:
case Intrinsic::x86_avx512_mask_cmp_ps_512: {
- SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
- CI->arg_operands().end());
+ SmallVector<Value *, 4> Args(CI->args());
unsigned NumElts =
cast<FixedVectorType>(Args[0]->getType())->getNumElements();
Args[3] = getX86MaskVec(Builder, Args[3], NumElts);
@@ -3895,8 +3890,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
case Intrinsic::invariant_start:
case Intrinsic::invariant_end: {
- SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
- CI->arg_operands().end());
+ SmallVector<Value *, 4> Args(CI->args());
NewCall = Builder.CreateCall(NewFn, Args);
break;
}
@@ -3904,8 +3898,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
case Intrinsic::masked_store:
case Intrinsic::masked_gather:
case Intrinsic::masked_scatter: {
- SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
- CI->arg_operands().end());
+ SmallVector<Value *, 4> Args(CI->args());
NewCall = Builder.CreateCall(NewFn, Args);
NewCall->copyMetadata(*CI);
break;
@@ -3921,7 +3914,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
// @llvm.memset...(i8*, i8, i[32|64], i32, i1)
// -> @llvm.memset...(i8*, i8, i[32|64], i1)
// Note: i8*'s in the above can be any pointer type
- if (CI->getNumArgOperands() != 5) {
+ if (CI->arg_size() != 5) {
DefaultCase();
return;
}
@@ -4111,7 +4104,7 @@ void llvm::UpgradeARCRuntime(Module &M) {
bool InvalidCast = false;
- for (unsigned I = 0, E = CI->getNumArgOperands(); I != E; ++I) {
+ for (unsigned I = 0, E = CI->arg_size(); I != E; ++I) {
Value *Arg = CI->getArgOperand(I);
// Bitcast argument to the parameter type of the new function if it's
@@ -4361,8 +4354,8 @@ struct StrictFPUpgradeVisitor : public InstVisitor<StrictFPUpgradeVisitor> {
return;
// If we get here, the caller doesn't have the strictfp attribute
// but this callsite does. Replace the strictfp attribute with nobuiltin.
- Call.removeAttribute(AttributeList::FunctionIndex, Attribute::StrictFP);
- Call.addAttribute(AttributeList::FunctionIndex, Attribute::NoBuiltin);
+ Call.removeFnAttr(Attribute::StrictFP);
+ Call.addFnAttr(Attribute::NoBuiltin);
}
};
} // namespace
@@ -4383,8 +4376,7 @@ void llvm::UpgradeFunctionAttributes(Function &F) {
}
// Remove all incompatible attributes from function.
- F.removeAttributes(AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(F.getReturnType()));
+ F.removeRetAttrs(AttributeFuncs::typeIncompatible(F.getReturnType()));
for (auto &Arg : F.args())
Arg.removeAttrs(AttributeFuncs::typeIncompatible(Arg.getType()));
}
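A small sketch of the CallBase argument accessors this file migrates to; the helper is illustrative:

#include <cassert>
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/InstrTypes.h"

// Collect the call arguments. args() replaces the old arg_operands() range and
// arg_size() replaces getNumArgOperands(); both exclude the callee and any
// operand-bundle operands.
static void collectArgs(llvm::CallBase &CB,
                        llvm::SmallVectorImpl<llvm::Value *> &Out) {
  Out.assign(CB.args().begin(), CB.args().end());
  assert(Out.size() == CB.arg_size() && "ranges agree on the argument count");
}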
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index d14abafdef2e..ed1956e0f7e9 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -12,6 +12,7 @@
#include "llvm/IR/BasicBlock.h"
#include "SymbolTableListTraitsImpl.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -23,6 +24,9 @@
using namespace llvm;
+#define DEBUG_TYPE "ir"
+STATISTIC(NumInstrRenumberings, "Number of renumberings across all blocks");
+
ValueSymbolTable *BasicBlock::getValueSymbolTable() {
if (Function *F = getParent())
return F->getValueSymbolTable();
@@ -505,6 +509,8 @@ void BasicBlock::renumberInstructions() {
BasicBlockBits Bits = getBasicBlockBits();
Bits.InstrOrderValid = true;
setBasicBlockBits(Bits);
+
+ NumInstrRenumberings++;
}
#ifndef NDEBUG
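The new counter uses the generic Statistic machinery; a stand-alone sketch of the same pattern, with an invented DEBUG_TYPE and counter name:

#include "llvm/ADT/Statistic.h"

#define DEBUG_TYPE "my-pass"
// Registers a named counter that is reported when the tool runs with -stats
// (in builds where statistics are enabled).
STATISTIC(NumThingsProcessed, "Number of things processed");

static void noteThingProcessed() { ++NumThingsProcessed; }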
diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index 5f05aa2e94e7..437fd0558447 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -349,200 +349,6 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
}
}
-/// Wrapper around getFoldedSizeOfImpl() that adds caching.
-static Constant *getFoldedSizeOf(Type *Ty, Type *DestTy, bool Folded,
- DenseMap<Type *, Constant *> &Cache);
-
-/// Return a ConstantExpr with type DestTy for sizeof on Ty, with any known
-/// factors factored out. If Folded is false, return null if no factoring was
-/// possible, to avoid endlessly bouncing an unfoldable expression back into the
-/// top-level folder.
-static Constant *getFoldedSizeOfImpl(Type *Ty, Type *DestTy, bool Folded,
- DenseMap<Type *, Constant *> &Cache) {
- // This is the actual implementation of getFoldedSizeOf(). To get the caching
- // behavior, we need to call getFoldedSizeOf() when we recurse.
-
- if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
- Constant *N = ConstantInt::get(DestTy, ATy->getNumElements());
- Constant *E = getFoldedSizeOf(ATy->getElementType(), DestTy, true, Cache);
- return ConstantExpr::getNUWMul(E, N);
- }
-
- if (StructType *STy = dyn_cast<StructType>(Ty))
- if (!STy->isPacked()) {
- unsigned NumElems = STy->getNumElements();
- // An empty struct has size zero.
- if (NumElems == 0)
- return ConstantExpr::getNullValue(DestTy);
- // Check for a struct with all members having the same size.
- Constant *MemberSize =
- getFoldedSizeOf(STy->getElementType(0), DestTy, true, Cache);
- bool AllSame = true;
- for (unsigned i = 1; i != NumElems; ++i)
- if (MemberSize !=
- getFoldedSizeOf(STy->getElementType(i), DestTy, true, Cache)) {
- AllSame = false;
- break;
- }
- if (AllSame) {
- Constant *N = ConstantInt::get(DestTy, NumElems);
- return ConstantExpr::getNUWMul(MemberSize, N);
- }
- }
-
- // Pointer size doesn't depend on the pointee type, so canonicalize them
- // to an arbitrary pointee.
- if (PointerType *PTy = dyn_cast<PointerType>(Ty))
- if (!PTy->getElementType()->isIntegerTy(1))
- return getFoldedSizeOf(
- PointerType::get(IntegerType::get(PTy->getContext(), 1),
- PTy->getAddressSpace()),
- DestTy, true, Cache);
-
- // If there's no interesting folding happening, bail so that we don't create
- // a constant that looks like it needs folding but really doesn't.
- if (!Folded)
- return nullptr;
-
- // Base case: Get a regular sizeof expression.
- Constant *C = ConstantExpr::getSizeOf(Ty);
- C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
- DestTy, false),
- C, DestTy);
- return C;
-}
-
-static Constant *getFoldedSizeOf(Type *Ty, Type *DestTy, bool Folded,
- DenseMap<Type *, Constant *> &Cache) {
- // Check for previously generated folded size constant.
- auto It = Cache.find(Ty);
- if (It != Cache.end())
- return It->second;
- return Cache[Ty] = getFoldedSizeOfImpl(Ty, DestTy, Folded, Cache);
-}
-
-static Constant *getFoldedSizeOf(Type *Ty, Type *DestTy, bool Folded) {
- DenseMap<Type *, Constant *> Cache;
- return getFoldedSizeOf(Ty, DestTy, Folded, Cache);
-}
-
-/// Return a ConstantExpr with type DestTy for alignof on Ty, with any known
-/// factors factored out. If Folded is false, return null if no factoring was
-/// possible, to avoid endlessly bouncing an unfoldable expression back into the
-/// top-level folder.
-static Constant *getFoldedAlignOf(Type *Ty, Type *DestTy, bool Folded) {
- // The alignment of an array is equal to the alignment of the
- // array element. Note that this is not always true for vectors.
- if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
- Constant *C = ConstantExpr::getAlignOf(ATy->getElementType());
- C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
- DestTy,
- false),
- C, DestTy);
- return C;
- }
-
- if (StructType *STy = dyn_cast<StructType>(Ty)) {
- // Packed structs always have an alignment of 1.
- if (STy->isPacked())
- return ConstantInt::get(DestTy, 1);
-
- // Otherwise, struct alignment is the maximum alignment of any member.
- // Without target data, we can't compare much, but we can check to see
- // if all the members have the same alignment.
- unsigned NumElems = STy->getNumElements();
- // An empty struct has minimal alignment.
- if (NumElems == 0)
- return ConstantInt::get(DestTy, 1);
- // Check for a struct with all members having the same alignment.
- Constant *MemberAlign =
- getFoldedAlignOf(STy->getElementType(0), DestTy, true);
- bool AllSame = true;
- for (unsigned i = 1; i != NumElems; ++i)
- if (MemberAlign != getFoldedAlignOf(STy->getElementType(i), DestTy, true)) {
- AllSame = false;
- break;
- }
- if (AllSame)
- return MemberAlign;
- }
-
- // Pointer alignment doesn't depend on the pointee type, so canonicalize them
- // to an arbitrary pointee.
- if (PointerType *PTy = dyn_cast<PointerType>(Ty))
- if (!PTy->getElementType()->isIntegerTy(1))
- return
- getFoldedAlignOf(PointerType::get(IntegerType::get(PTy->getContext(),
- 1),
- PTy->getAddressSpace()),
- DestTy, true);
-
- // If there's no interesting folding happening, bail so that we don't create
- // a constant that looks like it needs folding but really doesn't.
- if (!Folded)
- return nullptr;
-
- // Base case: Get a regular alignof expression.
- Constant *C = ConstantExpr::getAlignOf(Ty);
- C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
- DestTy, false),
- C, DestTy);
- return C;
-}
-
-/// Return a ConstantExpr with type DestTy for offsetof on Ty and FieldNo, with
-/// any known factors factored out. If Folded is false, return null if no
-/// factoring was possible, to avoid endlessly bouncing an unfoldable expression
-/// back into the top-level folder.
-static Constant *getFoldedOffsetOf(Type *Ty, Constant *FieldNo, Type *DestTy,
- bool Folded) {
- if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
- Constant *N = ConstantExpr::getCast(CastInst::getCastOpcode(FieldNo, false,
- DestTy, false),
- FieldNo, DestTy);
- Constant *E = getFoldedSizeOf(ATy->getElementType(), DestTy, true);
- return ConstantExpr::getNUWMul(E, N);
- }
-
- if (StructType *STy = dyn_cast<StructType>(Ty))
- if (!STy->isPacked()) {
- unsigned NumElems = STy->getNumElements();
- // An empty struct has no members.
- if (NumElems == 0)
- return nullptr;
- // Check for a struct with all members having the same size.
- Constant *MemberSize =
- getFoldedSizeOf(STy->getElementType(0), DestTy, true);
- bool AllSame = true;
- for (unsigned i = 1; i != NumElems; ++i)
- if (MemberSize !=
- getFoldedSizeOf(STy->getElementType(i), DestTy, true)) {
- AllSame = false;
- break;
- }
- if (AllSame) {
- Constant *N = ConstantExpr::getCast(CastInst::getCastOpcode(FieldNo,
- false,
- DestTy,
- false),
- FieldNo, DestTy);
- return ConstantExpr::getNUWMul(MemberSize, N);
- }
- }
-
- // If there's no interesting folding happening, bail so that we don't create
- // a constant that looks like it needs folding but really doesn't.
- if (!Folded)
- return nullptr;
-
- // Base case: Get a regular offsetof expression.
- Constant *C = ConstantExpr::getOffsetOf(Ty, FieldNo);
- C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
- DestTy, false),
- C, DestTy);
- return C;
-}
-
Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
Type *DestTy) {
if (isa<PoisonValue>(V))
@@ -666,53 +472,6 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
// Is it a null pointer value?
if (V->isNullValue())
return ConstantInt::get(DestTy, 0);
- // If this is a sizeof-like expression, pull out multiplications by
- // known factors to expose them to subsequent folding. If it's an
- // alignof-like expression, factor out known factors.
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
- if (CE->getOpcode() == Instruction::GetElementPtr &&
- CE->getOperand(0)->isNullValue()) {
- // FIXME: Looks like getFoldedSizeOf(), getFoldedOffsetOf() and
- // getFoldedAlignOf() don't handle the case when DestTy is a vector of
- // pointers yet. We end up in asserts in CastInst::getCastOpcode (see
- // test/Analysis/ConstantFolding/cast-vector.ll). I've only seen this
- // happen in one "real" C-code test case, so it does not seem to be an
- // important optimization to handle vectors here. For now, simply bail
- // out.
- if (DestTy->isVectorTy())
- return nullptr;
- GEPOperator *GEPO = cast<GEPOperator>(CE);
- Type *Ty = GEPO->getSourceElementType();
- if (CE->getNumOperands() == 2) {
- // Handle a sizeof-like expression.
- Constant *Idx = CE->getOperand(1);
- bool isOne = isa<ConstantInt>(Idx) && cast<ConstantInt>(Idx)->isOne();
- if (Constant *C = getFoldedSizeOf(Ty, DestTy, !isOne)) {
- Idx = ConstantExpr::getCast(CastInst::getCastOpcode(Idx, true,
- DestTy, false),
- Idx, DestTy);
- return ConstantExpr::getMul(C, Idx);
- }
- } else if (CE->getNumOperands() == 3 &&
- CE->getOperand(1)->isNullValue()) {
- // Handle an alignof-like expression.
- if (StructType *STy = dyn_cast<StructType>(Ty))
- if (!STy->isPacked()) {
- ConstantInt *CI = cast<ConstantInt>(CE->getOperand(2));
- if (CI->isOne() &&
- STy->getNumElements() == 2 &&
- STy->getElementType(0)->isIntegerTy(1)) {
- return getFoldedAlignOf(STy->getElementType(1), DestTy, false);
- }
- }
- // Handle an offsetof-like expression.
- if (Ty->isStructTy() || Ty->isArrayTy()) {
- if (Constant *C = getFoldedOffsetOf(Ty, CE->getOperand(2),
- DestTy, false))
- return C;
- }
- }
- }
// Other pointer types cannot be casted
return nullptr;
case Instruction::UIToFP:
@@ -720,7 +479,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
const APInt &api = CI->getValue();
APFloat apf(DestTy->getFltSemantics(),
- APInt::getNullValue(DestTy->getPrimitiveSizeInBits()));
+ APInt::getZero(DestTy->getPrimitiveSizeInBits()));
apf.convertFromAPInt(api, opc==Instruction::SIToFP,
APFloat::rmNearestTiesToEven);
return ConstantFP::get(V->getContext(), apf);
@@ -908,13 +667,16 @@ Constant *llvm::ConstantFoldExtractElementInstruction(Constant *Val,
}
}
+ if (Constant *C = Val->getAggregateElement(CIdx))
+ return C;
+
// Lane < Splat minimum vector width => extractelt Splat(x), Lane -> x
if (CIdx->getValue().ult(ValVTy->getElementCount().getKnownMinValue())) {
if (Constant *SplatVal = Val->getSplatValue())
return SplatVal;
}
- return Val->getAggregateElement(CIdx);
+ return nullptr;
}
Constant *llvm::ConstantFoldInsertElementInstruction(Constant *Val,
@@ -969,12 +731,16 @@ Constant *llvm::ConstantFoldShuffleVectorInstruction(Constant *V1, Constant *V2,
// If the mask is all zeros this is a splat, no need to go through all
// elements.
- if (all_of(Mask, [](int Elt) { return Elt == 0; }) &&
- !MaskEltCount.isScalable()) {
+ if (all_of(Mask, [](int Elt) { return Elt == 0; })) {
Type *Ty = IntegerType::get(V1->getContext(), 32);
Constant *Elt =
ConstantExpr::getExtractElement(V1, ConstantInt::get(Ty, 0));
- return ConstantVector::getSplat(MaskEltCount, Elt);
+
+ if (Elt->isNullValue()) {
+ auto *VTy = VectorType::get(EltTy, MaskEltCount);
+ return ConstantAggregateZero::get(VTy);
+ } else if (!MaskEltCount.isScalable())
+ return ConstantVector::getSplat(MaskEltCount, Elt);
}
// Do not iterate on scalable vector. The num of elements is unknown at
// compile-time.
@@ -1379,7 +1145,7 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
return ConstantInt::get(CI1->getContext(), C1V.udiv(C2V));
case Instruction::SDiv:
assert(!CI2->isZero() && "Div by zero handled above");
- if (C2V.isAllOnesValue() && C1V.isMinSignedValue())
+ if (C2V.isAllOnes() && C1V.isMinSignedValue())
return PoisonValue::get(CI1->getType()); // MIN_INT / -1 -> poison
return ConstantInt::get(CI1->getContext(), C1V.sdiv(C2V));
case Instruction::URem:
@@ -1387,7 +1153,7 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
return ConstantInt::get(CI1->getContext(), C1V.urem(C2V));
case Instruction::SRem:
assert(!CI2->isZero() && "Div by zero handled above");
- if (C2V.isAllOnesValue() && C1V.isMinSignedValue())
+ if (C2V.isAllOnes() && C1V.isMinSignedValue())
return PoisonValue::get(CI1->getType()); // MIN_INT % -1 -> poison
return ConstantInt::get(CI1->getContext(), C1V.srem(C2V));
case Instruction::And:
@@ -2030,19 +1796,8 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
if (isa<ConstantInt>(C1) && isa<ConstantInt>(C2)) {
const APInt &V1 = cast<ConstantInt>(C1)->getValue();
const APInt &V2 = cast<ConstantInt>(C2)->getValue();
- switch (pred) {
- default: llvm_unreachable("Invalid ICmp Predicate");
- case ICmpInst::ICMP_EQ: return ConstantInt::get(ResultTy, V1 == V2);
- case ICmpInst::ICMP_NE: return ConstantInt::get(ResultTy, V1 != V2);
- case ICmpInst::ICMP_SLT: return ConstantInt::get(ResultTy, V1.slt(V2));
- case ICmpInst::ICMP_SGT: return ConstantInt::get(ResultTy, V1.sgt(V2));
- case ICmpInst::ICMP_SLE: return ConstantInt::get(ResultTy, V1.sle(V2));
- case ICmpInst::ICMP_SGE: return ConstantInt::get(ResultTy, V1.sge(V2));
- case ICmpInst::ICMP_ULT: return ConstantInt::get(ResultTy, V1.ult(V2));
- case ICmpInst::ICMP_UGT: return ConstantInt::get(ResultTy, V1.ugt(V2));
- case ICmpInst::ICMP_ULE: return ConstantInt::get(ResultTy, V1.ule(V2));
- case ICmpInst::ICMP_UGE: return ConstantInt::get(ResultTy, V1.uge(V2));
- }
+ return ConstantInt::get(
+ ResultTy, ICmpInst::compare(V1, V2, (ICmpInst::Predicate)pred));
} else if (isa<ConstantFP>(C1) && isa<ConstantFP>(C2)) {
const APFloat &C1V = cast<ConstantFP>(C1)->getValueAPF();
const APFloat &C2V = cast<ConstantFP>(C2)->getValueAPF();
@@ -2564,7 +2319,7 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
if (isIndexInRangeOfArrayType(STy->getNumElements(), CI))
// It's in range, skip to the next index.
continue;
- if (CI->getSExtValue() < 0) {
+ if (CI->isNegative()) {
// It's out of range and negative, don't try to factor it.
Unknown = true;
continue;
@@ -2575,7 +2330,7 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
for (unsigned I = 0, E = CV->getNumElements(); I != E; ++I) {
auto *CI = cast<ConstantInt>(CV->getElementAsConstant(I));
InRange &= isIndexInRangeOfArrayType(STy->getNumElements(), CI);
- if (CI->getSExtValue() < 0) {
+ if (CI->isNegative()) {
Unknown = true;
break;
}
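A note on the index checks above: CI->isNegative() inspects only the sign bit, whereas the old CI->getSExtValue() < 0 requires the index to fit in 64 bits and asserts otherwise. A minimal standalone sketch of the distinction, using a toy wide integer rather than LLVM's APInt:

#include <cassert>
#include <cstdint>
#include <vector>

// Toy arbitrary-width integer; Words[0] holds the least significant 64 bits.
struct ToyInt {
  unsigned BitWidth;
  std::vector<uint64_t> Words;

  // Sign check: looks only at the top bit, valid at any width.
  bool isNegative() const {
    unsigned TopBit = (BitWidth - 1) % 64;
    return (Words.back() >> TopBit) & 1;
  }

  // Sign-extended conversion: only defined when the value fits in 64 bits.
  int64_t getSExtValue() const {
    assert(BitWidth <= 64 && "value does not fit in int64_t");
    unsigned Shift = 64 - BitWidth;
    return static_cast<int64_t>(Words[0] << Shift) >> Shift;
  }
};

int main() {
  ToyInt Small{8, {0x80}};                      // i8 -128
  assert(Small.isNegative() && Small.getSExtValue() < 0);

  ToyInt Wide{128, {0, 0x8000000000000000ULL}}; // i128 with the sign bit set
  assert(Wide.isNegative());                    // fine at any width
  // Wide.getSExtValue() would assert; this is why isNegative() is preferred.
  return 0;
}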
diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp
index 0649776dbc22..a0f2179bddb4 100644
--- a/llvm/lib/IR/ConstantRange.cpp
+++ b/llvm/lib/IR/ConstantRange.cpp
@@ -110,7 +110,7 @@ ConstantRange ConstantRange::makeAllowedICmpRegion(CmpInst::Predicate Pred,
APInt UMin(CR.getUnsignedMin());
if (UMin.isMaxValue())
return getEmpty(W);
- return ConstantRange(std::move(UMin) + 1, APInt::getNullValue(W));
+ return ConstantRange(std::move(UMin) + 1, APInt::getZero(W));
}
case CmpInst::ICMP_SGT: {
APInt SMin(CR.getSignedMin());
@@ -119,7 +119,7 @@ ConstantRange ConstantRange::makeAllowedICmpRegion(CmpInst::Predicate Pred,
return ConstantRange(std::move(SMin) + 1, APInt::getSignedMinValue(W));
}
case CmpInst::ICMP_UGE:
- return getNonEmpty(CR.getUnsignedMin(), APInt::getNullValue(W));
+ return getNonEmpty(CR.getUnsignedMin(), APInt::getZero(W));
case CmpInst::ICMP_SGE:
return getNonEmpty(CR.getSignedMin(), APInt::getSignedMinValue(W));
}
@@ -147,38 +147,77 @@ ConstantRange ConstantRange::makeExactICmpRegion(CmpInst::Predicate Pred,
return makeAllowedICmpRegion(Pred, C);
}
-bool ConstantRange::getEquivalentICmp(CmpInst::Predicate &Pred,
- APInt &RHS) const {
- bool Success = false;
+bool ConstantRange::areInsensitiveToSignednessOfICmpPredicate(
+ const ConstantRange &CR1, const ConstantRange &CR2) {
+ if (CR1.isEmptySet() || CR2.isEmptySet())
+ return true;
+
+ return (CR1.isAllNonNegative() && CR2.isAllNonNegative()) ||
+ (CR1.isAllNegative() && CR2.isAllNegative());
+}
+
+bool ConstantRange::areInsensitiveToSignednessOfInvertedICmpPredicate(
+ const ConstantRange &CR1, const ConstantRange &CR2) {
+ if (CR1.isEmptySet() || CR2.isEmptySet())
+ return true;
+
+ return (CR1.isAllNonNegative() && CR2.isAllNegative()) ||
+ (CR1.isAllNegative() && CR2.isAllNonNegative());
+}
+
+CmpInst::Predicate ConstantRange::getEquivalentPredWithFlippedSignedness(
+ CmpInst::Predicate Pred, const ConstantRange &CR1,
+ const ConstantRange &CR2) {
+ assert(CmpInst::isIntPredicate(Pred) && CmpInst::isRelational(Pred) &&
+ "Only for relational integer predicates!");
+ CmpInst::Predicate FlippedSignednessPred =
+ CmpInst::getFlippedSignednessPredicate(Pred);
+
+ if (areInsensitiveToSignednessOfICmpPredicate(CR1, CR2))
+ return FlippedSignednessPred;
+
+ if (areInsensitiveToSignednessOfInvertedICmpPredicate(CR1, CR2))
+ return CmpInst::getInversePredicate(FlippedSignednessPred);
+
+ return CmpInst::Predicate::BAD_ICMP_PREDICATE;
+}
+
+void ConstantRange::getEquivalentICmp(CmpInst::Predicate &Pred,
+ APInt &RHS, APInt &Offset) const {
+ Offset = APInt(getBitWidth(), 0);
if (isFullSet() || isEmptySet()) {
Pred = isEmptySet() ? CmpInst::ICMP_ULT : CmpInst::ICMP_UGE;
RHS = APInt(getBitWidth(), 0);
- Success = true;
} else if (auto *OnlyElt = getSingleElement()) {
Pred = CmpInst::ICMP_EQ;
RHS = *OnlyElt;
- Success = true;
} else if (auto *OnlyMissingElt = getSingleMissingElement()) {
Pred = CmpInst::ICMP_NE;
RHS = *OnlyMissingElt;
- Success = true;
} else if (getLower().isMinSignedValue() || getLower().isMinValue()) {
Pred =
getLower().isMinSignedValue() ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
RHS = getUpper();
- Success = true;
} else if (getUpper().isMinSignedValue() || getUpper().isMinValue()) {
Pred =
getUpper().isMinSignedValue() ? CmpInst::ICMP_SGE : CmpInst::ICMP_UGE;
RHS = getLower();
- Success = true;
+ } else {
+ Pred = CmpInst::ICMP_ULT;
+ RHS = getUpper() - getLower();
+ Offset = -getLower();
}
- assert((!Success || ConstantRange::makeExactICmpRegion(Pred, RHS) == *this) &&
+ assert(ConstantRange::makeExactICmpRegion(Pred, RHS) == add(Offset) &&
"Bad result!");
+}
- return Success;
+bool ConstantRange::getEquivalentICmp(CmpInst::Predicate &Pred,
+ APInt &RHS) const {
+ APInt Offset;
+ getEquivalentICmp(Pred, RHS, Offset);
+ return Offset.isZero();
}
bool ConstantRange::icmp(CmpInst::Predicate Pred,
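The three-argument getEquivalentICmp above can express any non-degenerate range, including wrapped ones, as (X + Offset) u< RHS with Offset = -Lower and RHS = Upper - Lower. A standalone exhaustive check of that identity on 8-bit values, assuming the same modular arithmetic as APInt:

#include <cassert>
#include <cstdint>

// Membership in a (possibly wrapped) half-open range [Lo, Hi) over u8.
static bool inRange(uint8_t X, uint8_t Lo, uint8_t Hi) {
  return Lo <= Hi ? (X >= Lo && X < Hi) : (X >= Lo || X < Hi);
}

int main() {
  // Wrapped range [200, 20): neither bound is a signed or unsigned minimum,
  // so the old bool-returning overload had no equivalent icmp. With an
  // offset it becomes (X + Offset) u< RHS.
  const uint8_t Lo = 200, Hi = 20;
  const uint8_t Offset = static_cast<uint8_t>(-Lo); // 56
  const uint8_t RHS = static_cast<uint8_t>(Hi - Lo); // 76

  for (int V = 0; V < 256; ++V) {
    uint8_t X = static_cast<uint8_t>(V);
    bool ViaIcmp = static_cast<uint8_t>(X + Offset) < RHS;
    assert(ViaIcmp == inRange(X, Lo, Hi));
  }
  return 0;
}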
@@ -204,13 +243,13 @@ static ConstantRange makeExactMulNSWRegion(const APInt &V) {
// Handle special case for 0, -1 and 1. See the last for reason why we
// specialize -1 and 1.
unsigned BitWidth = V.getBitWidth();
- if (V == 0 || V.isOneValue())
+ if (V == 0 || V.isOne())
return ConstantRange::getFull(BitWidth);
APInt MinValue = APInt::getSignedMinValue(BitWidth);
APInt MaxValue = APInt::getSignedMaxValue(BitWidth);
// e.g. Returning [-127, 127], represented as [-127, -128).
- if (V.isAllOnesValue())
+ if (V.isAllOnes())
return ConstantRange(-MaxValue, MinValue);
APInt Lower, Upper;
@@ -248,8 +287,7 @@ ConstantRange::makeGuaranteedNoWrapRegion(Instruction::BinaryOps BinOp,
case Instruction::Add: {
if (Unsigned)
- return getNonEmpty(APInt::getNullValue(BitWidth),
- -Other.getUnsignedMax());
+ return getNonEmpty(APInt::getZero(BitWidth), -Other.getUnsignedMax());
APInt SignedMinVal = APInt::getSignedMinValue(BitWidth);
APInt SMin = Other.getSignedMin(), SMax = Other.getSignedMax();
@@ -291,7 +329,7 @@ ConstantRange::makeGuaranteedNoWrapRegion(Instruction::BinaryOps BinOp,
// to be at most bitwidth-1, which results in most conservative range.
APInt ShAmtUMax = ShAmt.getUnsignedMax();
if (Unsigned)
- return getNonEmpty(APInt::getNullValue(BitWidth),
+ return getNonEmpty(APInt::getZero(BitWidth),
APInt::getMaxValue(BitWidth).lshr(ShAmtUMax) + 1);
return getNonEmpty(APInt::getSignedMinValue(BitWidth).ashr(ShAmtUMax),
APInt::getSignedMaxValue(BitWidth).ashr(ShAmtUMax) + 1);
@@ -316,7 +354,7 @@ bool ConstantRange::isEmptySet() const {
}
bool ConstantRange::isWrappedSet() const {
- return Lower.ugt(Upper) && !Upper.isNullValue();
+ return Lower.ugt(Upper) && !Upper.isZero();
}
bool ConstantRange::isUpperWrapped() const {
@@ -343,11 +381,10 @@ ConstantRange::isSizeStrictlySmallerThan(const ConstantRange &Other) const {
bool
ConstantRange::isSizeLargerThan(uint64_t MaxSize) const {
- assert(MaxSize && "MaxSize can't be 0.");
  // If this is a full set, we need special handling to avoid needing an extra bit
// to represent the size.
if (isFullSet())
- return APInt::getMaxValue(getBitWidth()).ugt(MaxSize - 1);
+ return MaxSize == 0 || APInt::getMaxValue(getBitWidth()).ugt(MaxSize - 1);
return (Upper - Lower).ugt(MaxSize);
}
@@ -595,7 +632,7 @@ ConstantRange ConstantRange::unionWith(const ConstantRange &CR,
APInt L = CR.Lower.ult(Lower) ? CR.Lower : Lower;
APInt U = (CR.Upper - 1).ugt(Upper - 1) ? CR.Upper : Upper;
- if (L.isNullValue() && U.isNullValue())
+ if (L.isZero() && U.isZero())
return getFull();
return ConstantRange(std::move(L), std::move(U));
@@ -644,6 +681,24 @@ ConstantRange ConstantRange::unionWith(const ConstantRange &CR,
return ConstantRange(std::move(L), std::move(U));
}
+Optional<ConstantRange>
+ConstantRange::exactIntersectWith(const ConstantRange &CR) const {
+ // TODO: This can be implemented more efficiently.
+ ConstantRange Result = intersectWith(CR);
+ if (Result == inverse().unionWith(CR.inverse()).inverse())
+ return Result;
+ return None;
+}
+
+Optional<ConstantRange>
+ConstantRange::exactUnionWith(const ConstantRange &CR) const {
+ // TODO: This can be implemented more efficiently.
+ ConstantRange Result = unionWith(CR);
+ if (Result == inverse().intersectWith(CR.inverse()).inverse())
+ return Result;
+ return None;
+}
+
ConstantRange ConstantRange::castOp(Instruction::CastOps CastOp,
uint32_t ResultBitWidth) const {
switch (CastOp) {
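exactUnionWith and exactIntersectWith above hand back a result only when it adds no extra elements, which they verify by rebuilding the operation from the inverses. A standalone model of why [0,2) u [2,7) has an exact single-range representation while [0,2) u [5,7) does not, treating ranges as explicit element sets; this sketches the semantics only, not the library's implementation:

#include <cstdint>
#include <cstdio>
#include <set>

using Set = std::set<uint8_t>;

// Expand a half-open, possibly wrapped range [Lo, Hi) into its element set.
static Set expand(uint8_t Lo, uint8_t Hi) {
  Set S;
  for (uint8_t X = Lo; X != Hi; ++X)
    S.insert(X);
  return S;
}

// The union is "exact" only if some single [Lo, Hi) holds exactly its elements.
static bool hasExactUnion(const Set &A, const Set &B) {
  Set U = A;
  U.insert(B.begin(), B.end());
  for (int Lo = 0; Lo < 256; ++Lo)
    for (int Hi = 0; Hi < 256; ++Hi)
      if (expand(static_cast<uint8_t>(Lo), static_cast<uint8_t>(Hi)) == U)
        return true;
  return false;
}

int main() {
  std::printf("[0,2) u [2,7): %d\n", hasExactUnion(expand(0, 2), expand(2, 7))); // 1
  std::printf("[0,2) u [5,7): %d\n", hasExactUnion(expand(0, 2), expand(5, 7))); // 0
  return 0;
}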
@@ -1055,6 +1110,25 @@ ConstantRange::multiply(const ConstantRange &Other) const {
return UR.isSizeStrictlySmallerThan(SR) ? UR : SR;
}
+ConstantRange ConstantRange::smul_fast(const ConstantRange &Other) const {
+ if (isEmptySet() || Other.isEmptySet())
+ return getEmpty();
+
+ APInt Min = getSignedMin();
+ APInt Max = getSignedMax();
+ APInt OtherMin = Other.getSignedMin();
+ APInt OtherMax = Other.getSignedMax();
+
+ bool O1, O2, O3, O4;
+ auto Muls = {Min.smul_ov(OtherMin, O1), Min.smul_ov(OtherMax, O2),
+ Max.smul_ov(OtherMin, O3), Max.smul_ov(OtherMax, O4)};
+ if (O1 || O2 || O3 || O4)
+ return getFull();
+
+ auto Compare = [](const APInt &A, const APInt &B) { return A.slt(B); };
+ return getNonEmpty(std::min(Muls, Compare), std::max(Muls, Compare) + 1);
+}
+
ConstantRange
ConstantRange::smax(const ConstantRange &Other) const {
// X smax Y is: range(smax(X_smin, Y_smin),
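smul_fast above only looks at the four signed corner products: if none of them overflows, the result is [min, max + 1); any overflow makes it give up and return the full set. The same shape on 8-bit values, as a standalone sketch with illustrative helper names:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Signed i8 multiply, reporting whether the true product overflows i8.
static bool smulOverflows(int8_t A, int8_t B, int8_t &Res) {
  int Wide = static_cast<int>(A) * static_cast<int>(B);
  Res = static_cast<int8_t>(Wide);
  return Wide < INT8_MIN || Wide > INT8_MAX;
}

// [Min, Max] x [OtherMin, OtherMax]: corner products only, bail on overflow.
static bool smulFast(int8_t Min, int8_t Max, int8_t OtherMin, int8_t OtherMax,
                     int8_t &Lo, int8_t &Hi) {
  int8_t P1, P2, P3, P4;
  bool O1 = smulOverflows(Min, OtherMin, P1);
  bool O2 = smulOverflows(Min, OtherMax, P2);
  bool O3 = smulOverflows(Max, OtherMin, P3);
  bool O4 = smulOverflows(Max, OtherMax, P4);
  if (O1 || O2 || O3 || O4)
    return false; // caller treats this as the full range
  Lo = std::min({P1, P2, P3, P4});
  Hi = std::max({P1, P2, P3, P4});
  return true;
}

int main() {
  int8_t Lo, Hi;
  if (smulFast(-3, 4, -2, 5, Lo, Hi))
    std::printf("[%d, %d]\n", Lo, Hi); // corners 6, -15, -8, 20 -> [-15, 20]
  return 0;
}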
@@ -1113,13 +1187,13 @@ ConstantRange::umin(const ConstantRange &Other) const {
ConstantRange
ConstantRange::udiv(const ConstantRange &RHS) const {
- if (isEmptySet() || RHS.isEmptySet() || RHS.getUnsignedMax().isNullValue())
+ if (isEmptySet() || RHS.isEmptySet() || RHS.getUnsignedMax().isZero())
return getEmpty();
APInt Lower = getUnsignedMin().udiv(RHS.getUnsignedMax());
APInt RHS_umin = RHS.getUnsignedMin();
- if (RHS_umin.isNullValue()) {
+ if (RHS_umin.isZero()) {
// We want the lowest value in RHS excluding zero. Usually that would be 1
// except for a range in the form of [X, 1) in which case it would be X.
if (RHS.getUpper() == 1)
@@ -1136,7 +1210,7 @@ ConstantRange ConstantRange::sdiv(const ConstantRange &RHS) const {
// We split up the LHS and RHS into positive and negative components
// and then also compute the positive and negative components of the result
// separately by combining division results with the appropriate signs.
- APInt Zero = APInt::getNullValue(getBitWidth());
+ APInt Zero = APInt::getZero(getBitWidth());
APInt SignedMin = APInt::getSignedMinValue(getBitWidth());
ConstantRange PosFilter(APInt(getBitWidth(), 1), SignedMin);
ConstantRange NegFilter(SignedMin, Zero);
@@ -1159,12 +1233,12 @@ ConstantRange ConstantRange::sdiv(const ConstantRange &RHS) const {
// (For APInts the operation is well-defined and yields SignedMin.) We
// handle this by dropping either SignedMin from the LHS or -1 from the RHS.
APInt Lo = (NegL.Upper - 1).sdiv(NegR.Lower);
- if (NegL.Lower.isMinSignedValue() && NegR.Upper.isNullValue()) {
+ if (NegL.Lower.isMinSignedValue() && NegR.Upper.isZero()) {
// Remove -1 from the LHS. Skip if it's the only element, as this would
// leave us with an empty set.
- if (!NegR.Lower.isAllOnesValue()) {
+ if (!NegR.Lower.isAllOnes()) {
APInt AdjNegRUpper;
- if (RHS.Lower.isAllOnesValue())
+ if (RHS.Lower.isAllOnes())
// Negative part of [-1, X] without -1 is [SignedMin, X].
AdjNegRUpper = RHS.Upper;
else
@@ -1218,12 +1292,12 @@ ConstantRange ConstantRange::sdiv(const ConstantRange &RHS) const {
}
ConstantRange ConstantRange::urem(const ConstantRange &RHS) const {
- if (isEmptySet() || RHS.isEmptySet() || RHS.getUnsignedMax().isNullValue())
+ if (isEmptySet() || RHS.isEmptySet() || RHS.getUnsignedMax().isZero())
return getEmpty();
if (const APInt *RHSInt = RHS.getSingleElement()) {
// UREM by null is UB.
- if (RHSInt->isNullValue())
+ if (RHSInt->isZero())
return getEmpty();
// Use APInt's implementation of UREM for single element ranges.
if (const APInt *LHSInt = getSingleElement())
@@ -1236,7 +1310,7 @@ ConstantRange ConstantRange::urem(const ConstantRange &RHS) const {
// L % R is <= L and < R.
APInt Upper = APIntOps::umin(getUnsignedMax(), RHS.getUnsignedMax() - 1) + 1;
- return getNonEmpty(APInt::getNullValue(getBitWidth()), std::move(Upper));
+ return getNonEmpty(APInt::getZero(getBitWidth()), std::move(Upper));
}
ConstantRange ConstantRange::srem(const ConstantRange &RHS) const {
@@ -1245,7 +1319,7 @@ ConstantRange ConstantRange::srem(const ConstantRange &RHS) const {
if (const APInt *RHSInt = RHS.getSingleElement()) {
// SREM by null is UB.
- if (RHSInt->isNullValue())
+ if (RHSInt->isZero())
return getEmpty();
// Use APInt's implementation of SREM for single element ranges.
if (const APInt *LHSInt = getSingleElement())
@@ -1257,10 +1331,10 @@ ConstantRange ConstantRange::srem(const ConstantRange &RHS) const {
APInt MaxAbsRHS = AbsRHS.getUnsignedMax();
// Modulus by zero is UB.
- if (MaxAbsRHS.isNullValue())
+ if (MaxAbsRHS.isZero())
return getEmpty();
- if (MinAbsRHS.isNullValue())
+ if (MinAbsRHS.isZero())
++MinAbsRHS;
APInt MinLHS = getSignedMin(), MaxLHS = getSignedMax();
@@ -1272,7 +1346,7 @@ ConstantRange ConstantRange::srem(const ConstantRange &RHS) const {
// L % R is <= L and < R.
APInt Upper = APIntOps::umin(MaxLHS, MaxAbsRHS - 1) + 1;
- return ConstantRange(APInt::getNullValue(getBitWidth()), std::move(Upper));
+ return ConstantRange(APInt::getZero(getBitWidth()), std::move(Upper));
}
// Same basic logic as above, but the result is negative.
@@ -1291,7 +1365,7 @@ ConstantRange ConstantRange::srem(const ConstantRange &RHS) const {
}
ConstantRange ConstantRange::binaryNot() const {
- return ConstantRange(APInt::getAllOnesValue(getBitWidth())).sub(*this);
+ return ConstantRange(APInt::getAllOnes(getBitWidth())).sub(*this);
}
ConstantRange
@@ -1306,7 +1380,7 @@ ConstantRange::binaryAnd(const ConstantRange &Other) const {
// TODO: replace this with something less conservative
APInt umin = APIntOps::umin(Other.getUnsignedMax(), getUnsignedMax());
- return getNonEmpty(APInt::getNullValue(getBitWidth()), std::move(umin) + 1);
+ return getNonEmpty(APInt::getZero(getBitWidth()), std::move(umin) + 1);
}
ConstantRange
@@ -1321,7 +1395,7 @@ ConstantRange::binaryOr(const ConstantRange &Other) const {
// TODO: replace this with something less conservative
APInt umax = APIntOps::umax(getUnsignedMin(), Other.getUnsignedMin());
- return getNonEmpty(std::move(umax), APInt::getNullValue(getBitWidth()));
+ return getNonEmpty(std::move(umax), APInt::getZero(getBitWidth()));
}
ConstantRange ConstantRange::binaryXor(const ConstantRange &Other) const {
@@ -1333,9 +1407,9 @@ ConstantRange ConstantRange::binaryXor(const ConstantRange &Other) const {
return {*getSingleElement() ^ *Other.getSingleElement()};
// Special-case binary complement, since we can give a precise answer.
- if (Other.isSingleElement() && Other.getSingleElement()->isAllOnesValue())
+ if (Other.isSingleElement() && Other.getSingleElement()->isAllOnes())
return binaryNot();
- if (isSingleElement() && getSingleElement()->isAllOnesValue())
+ if (isSingleElement() && getSingleElement()->isAllOnes())
return Other.binaryNot();
// TODO: replace this with something less conservative
@@ -1347,24 +1421,33 @@ ConstantRange::shl(const ConstantRange &Other) const {
if (isEmptySet() || Other.isEmptySet())
return getEmpty();
- APInt max = getUnsignedMax();
- APInt Other_umax = Other.getUnsignedMax();
+ APInt Min = getUnsignedMin();
+ APInt Max = getUnsignedMax();
+ if (const APInt *RHS = Other.getSingleElement()) {
+ unsigned BW = getBitWidth();
+ if (RHS->uge(BW))
+ return getEmpty();
- // If we are shifting by maximum amount of
- // zero return return the original range.
- if (Other_umax.isNullValue())
- return *this;
- // there's overflow!
- if (Other_umax.ugt(max.countLeadingZeros()))
+ unsigned EqualLeadingBits = (Min ^ Max).countLeadingZeros();
+ if (RHS->ule(EqualLeadingBits))
+ return getNonEmpty(Min << *RHS, (Max << *RHS) + 1);
+
+ return getNonEmpty(APInt::getZero(BW),
+ APInt::getBitsSetFrom(BW, RHS->getZExtValue()) + 1);
+ }
+
+ APInt OtherMax = Other.getUnsignedMax();
+
+ // There's overflow!
+ if (OtherMax.ugt(Max.countLeadingZeros()))
return getFull();
// FIXME: implement the other tricky cases
- APInt min = getUnsignedMin();
- min <<= Other.getUnsignedMin();
- max <<= Other_umax;
+ Min <<= Other.getUnsignedMin();
+ Max <<= OtherMax;
- return ConstantRange(std::move(min), std::move(max) + 1);
+ return ConstantRange::getNonEmpty(std::move(Min), std::move(Max) + 1);
}
ConstantRange
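The new constant-shift path in shl above stays exact as long as the shift amount does not exceed the number of leading bits in which Min and Max agree, which is exactly (Min ^ Max).countLeadingZeros(); otherwise it falls back to [0, all bits set from the shift amount]. A standalone restatement on 8-bit values, assuming C++20 <bit> and the same unsigned wrap-around semantics:

#include <bit>
#include <cassert>
#include <cstdint>

// Shift a non-wrapped unsigned range [Min, Max] left by a constant amount,
// following the single-element case of ConstantRange::shl.
static void shlRange(uint8_t Min, uint8_t Max, unsigned Amt, uint8_t &Lo,
                     uint8_t &Hi) {
  assert(Amt < 8 && "shift by >= bitwidth is handled as an empty range");
  unsigned EqualLeadingBits =
      std::countl_zero(static_cast<uint8_t>(Min ^ Max));
  if (Amt <= EqualLeadingBits) {
    // No bit in which the bounds differ is shifted out: the result is exact.
    Lo = static_cast<uint8_t>(Min << Amt);
    Hi = static_cast<uint8_t>(Max << Amt);
  } else {
    // Differing bits wrap: conservatively report every value up to the largest
    // result that still has its low Amt bits clear.
    Lo = 0;
    Hi = static_cast<uint8_t>(0xFF << Amt);
  }
}

int main() {
  uint8_t Lo, Hi;
  shlRange(0x30, 0x33, 2, Lo, Hi); // Min ^ Max = 0x03: 6 equal leading bits
  assert(Lo == 0xC0 && Hi == 0xCC);
  shlRange(0x30, 0x70, 3, Lo, Hi); // only 1 equal leading bit: conservative
  assert(Lo == 0x00 && Hi == 0xF8);
  return 0;
}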
@@ -1483,20 +1566,15 @@ ConstantRange ConstantRange::smul_sat(const ConstantRange &Other) const {
// [-1,4) * [-2,3) = min(-1*-2, -1*2, 3*-2, 3*2) = -6.
// Similarly for the upper bound, swapping min for max.
- APInt this_min = getSignedMin().sext(getBitWidth() * 2);
- APInt this_max = getSignedMax().sext(getBitWidth() * 2);
- APInt Other_min = Other.getSignedMin().sext(getBitWidth() * 2);
- APInt Other_max = Other.getSignedMax().sext(getBitWidth() * 2);
+ APInt Min = getSignedMin();
+ APInt Max = getSignedMax();
+ APInt OtherMin = Other.getSignedMin();
+ APInt OtherMax = Other.getSignedMax();
- auto L = {this_min * Other_min, this_min * Other_max, this_max * Other_min,
- this_max * Other_max};
+ auto L = {Min.smul_sat(OtherMin), Min.smul_sat(OtherMax),
+ Max.smul_sat(OtherMin), Max.smul_sat(OtherMax)};
auto Compare = [](const APInt &A, const APInt &B) { return A.slt(B); };
-
- // Note that we wanted to perform signed saturating multiplication,
- // so since we performed plain multiplication in twice the bitwidth,
- // we need to perform signed saturating truncation.
- return getNonEmpty(std::min(L, Compare).truncSSat(getBitWidth()),
- std::max(L, Compare).truncSSat(getBitWidth()) + 1);
+ return getNonEmpty(std::min(L, Compare), std::max(L, Compare) + 1);
}
ConstantRange ConstantRange::ushl_sat(const ConstantRange &Other) const {
@@ -1535,7 +1613,7 @@ ConstantRange ConstantRange::abs(bool IntMinIsPoison) const {
APInt Lo;
// Check whether the range crosses zero.
if (Upper.isStrictlyPositive() || !Lower.isStrictlyPositive())
- Lo = APInt::getNullValue(getBitWidth());
+ Lo = APInt::getZero(getBitWidth());
else
Lo = APIntOps::umin(Lower, -Upper + 1);
@@ -1565,7 +1643,7 @@ ConstantRange ConstantRange::abs(bool IntMinIsPoison) const {
return ConstantRange(-SMax, -SMin + 1);
// Range crosses zero.
- return ConstantRange(APInt::getNullValue(getBitWidth()),
+ return ConstantRange(APInt::getZero(getBitWidth()),
APIntOps::umax(-SMin, SMax) + 1);
}
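For reference, binaryNot() earlier in this file relies on the identity ~X == all-ones - X, which is why it is written as the all-ones singleton minus the input range. A quick exhaustive check of the scalar identity on 8-bit values:

#include <cassert>
#include <cstdint>

int main() {
  // ~X == all-ones - X, the identity behind ConstantRange::binaryNot().
  for (int V = 0; V < 256; ++V) {
    uint8_t X = static_cast<uint8_t>(V);
    assert(static_cast<uint8_t>(~X) == static_cast<uint8_t>(0xFF - X));
  }
  return 0;
}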
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index 6c75085a6678..c66cfb6e9ac1 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -95,7 +95,7 @@ bool Constant::isAllOnesValue() const {
// Check for FP which are bitcasted from -1 integers
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
- return CFP->getValueAPF().bitcastToAPInt().isAllOnesValue();
+ return CFP->getValueAPF().bitcastToAPInt().isAllOnes();
// Check for constant splat vectors of 1 values.
if (getType()->isVectorTy())
@@ -112,7 +112,7 @@ bool Constant::isOneValue() const {
// Check for FP which are bitcasted from 1 integers
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
- return CFP->getValueAPF().bitcastToAPInt().isOneValue();
+ return CFP->getValueAPF().bitcastToAPInt().isOne();
// Check for constant splat vectors of 1 values.
if (getType()->isVectorTy())
@@ -129,7 +129,7 @@ bool Constant::isNotOneValue() const {
// Check for FP which are bitcasted from 1 integers
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
- return !CFP->getValueAPF().bitcastToAPInt().isOneValue();
+ return !CFP->getValueAPF().bitcastToAPInt().isOne();
// Check that vectors don't contain 1
if (auto *VTy = dyn_cast<FixedVectorType>(getType())) {
@@ -315,9 +315,11 @@ containsUndefinedElement(const Constant *C,
return false;
for (unsigned i = 0, e = cast<FixedVectorType>(VTy)->getNumElements();
- i != e; ++i)
- if (HasFn(C->getAggregateElement(i)))
- return true;
+ i != e; ++i) {
+ if (Constant *Elem = C->getAggregateElement(i))
+ if (HasFn(Elem))
+ return true;
+ }
}
return false;
@@ -366,9 +368,8 @@ Constant *Constant::getNullValue(Type *Ty) {
return ConstantFP::get(Ty->getContext(),
APFloat::getZero(APFloat::IEEEquad()));
case Type::PPC_FP128TyID:
- return ConstantFP::get(Ty->getContext(),
- APFloat(APFloat::PPCDoubleDouble(),
- APInt::getNullValue(128)));
+ return ConstantFP::get(Ty->getContext(), APFloat(APFloat::PPCDoubleDouble(),
+ APInt::getZero(128)));
case Type::PointerTyID:
return ConstantPointerNull::get(cast<PointerType>(Ty));
case Type::StructTyID:
@@ -404,11 +405,10 @@ Constant *Constant::getIntegerValue(Type *Ty, const APInt &V) {
Constant *Constant::getAllOnesValue(Type *Ty) {
if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
return ConstantInt::get(Ty->getContext(),
- APInt::getAllOnesValue(ITy->getBitWidth()));
+ APInt::getAllOnes(ITy->getBitWidth()));
if (Ty->isFloatingPointTy()) {
- APFloat FL = APFloat::getAllOnesValue(Ty->getFltSemantics(),
- Ty->getPrimitiveSizeInBits());
+ APFloat FL = APFloat::getAllOnesValue(Ty->getFltSemantics());
return ConstantFP::get(Ty->getContext(), FL);
}
@@ -714,29 +714,41 @@ Constant::PossibleRelocationsTy Constant::getRelocationInfo() const {
return Result;
}
-/// If the specified constantexpr is dead, remove it. This involves recursively
-/// eliminating any dead users of the constantexpr.
-static bool removeDeadUsersOfConstant(const Constant *C) {
+/// Return true if the specified constantexpr is dead. This involves
+/// recursively traversing users of the constantexpr.
+/// If RemoveDeadUsers is true, also remove dead users at the same time.
+static bool constantIsDead(const Constant *C, bool RemoveDeadUsers) {
if (isa<GlobalValue>(C)) return false; // Cannot remove this
- while (!C->use_empty()) {
- const Constant *User = dyn_cast<Constant>(C->user_back());
+ Value::const_user_iterator I = C->user_begin(), E = C->user_end();
+ while (I != E) {
+ const Constant *User = dyn_cast<Constant>(*I);
if (!User) return false; // Non-constant usage;
- if (!removeDeadUsersOfConstant(User))
+ if (!constantIsDead(User, RemoveDeadUsers))
return false; // Constant wasn't dead
+
+ // Just removed User, so the iterator was invalidated.
+ // Since we return immediately upon finding a live user, we can always
+ // restart from user_begin().
+ if (RemoveDeadUsers)
+ I = C->user_begin();
+ else
+ ++I;
}
- // If C is only used by metadata, it should not be preserved but should have
- // its uses replaced.
- if (C->isUsedByMetadata()) {
- const_cast<Constant *>(C)->replaceAllUsesWith(
- UndefValue::get(C->getType()));
+ if (RemoveDeadUsers) {
+ // If C is only used by metadata, it should not be preserved but should
+ // have its uses replaced.
+ if (C->isUsedByMetadata()) {
+ const_cast<Constant *>(C)->replaceAllUsesWith(
+ UndefValue::get(C->getType()));
+ }
+ const_cast<Constant *>(C)->destroyConstant();
}
- const_cast<Constant*>(C)->destroyConstant();
+
return true;
}
-
void Constant::removeDeadConstantUsers() const {
Value::const_user_iterator I = user_begin(), E = user_end();
Value::const_user_iterator LastNonDeadUser = E;
@@ -748,7 +760,7 @@ void Constant::removeDeadConstantUsers() const {
continue;
}
- if (!removeDeadUsersOfConstant(User)) {
+ if (!constantIsDead(User, /* RemoveDeadUsers= */ true)) {
// If the constant wasn't dead, remember that this was the last live use
// and move on to the next constant.
LastNonDeadUser = I;
@@ -764,6 +776,20 @@ void Constant::removeDeadConstantUsers() const {
}
}
+bool Constant::hasOneLiveUse() const {
+ unsigned NumUses = 0;
+ for (const Use &use : uses()) {
+ const Constant *User = dyn_cast<Constant>(use.getUser());
+ if (!User || !constantIsDead(User, /* RemoveDeadUsers= */ false)) {
+ ++NumUses;
+
+ if (NumUses > 1)
+ return false;
+ }
+ }
+ return NumUses == 1;
+}
+
Constant *Constant::replaceUndefsWith(Constant *C, Constant *Replacement) {
assert(C && Replacement && "Expected non-nullptr constant arguments");
Type *Ty = C->getType();
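hasOneLiveUse above counts a use as live only when its user is not itself a transitively dead constant, reusing the traversal of constantIsDead without the mutation. A standalone model of that rule on a toy use graph; the Node type and its fields are purely illustrative:

#include <cassert>
#include <vector>

struct Node {
  bool IsGlobal = false;   // globals are never treated as dead
  bool IsConstant = true;  // a non-constant user keeps the value alive
  std::vector<Node *> Users;
};

// Mirrors constantIsDead(C, /*RemoveDeadUsers=*/false): dead iff every user
// is itself a dead constant.
static bool isDead(const Node *C) {
  if (C->IsGlobal)
    return false;
  for (const Node *U : C->Users)
    if (!U->IsConstant || !isDead(U))
      return false;
  return true;
}

// Mirrors Constant::hasOneLiveUse(): exactly one use survives the dead check.
static bool hasOneLiveUse(const Node *C) {
  unsigned Live = 0;
  for (const Node *U : C->Users)
    if (!U->IsConstant || !isDead(U))
      if (++Live > 1)
        return false;
  return Live == 1;
}

int main() {
  Node DeadExpr;                       // constant with no users: dead
  Node Inst;
  Inst.IsConstant = false;             // an instruction use is always live
  Node C;
  C.Users = {&DeadExpr, &Inst};
  assert(hasOneLiveUse(&C));           // only the instruction use is counted
  return 0;
}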
@@ -1430,12 +1456,12 @@ Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) {
Type *I32Ty = Type::getInt32Ty(VTy->getContext());
// Move scalar into vector.
- Constant *UndefV = UndefValue::get(VTy);
- V = ConstantExpr::getInsertElement(UndefV, V, ConstantInt::get(I32Ty, 0));
+ Constant *PoisonV = PoisonValue::get(VTy);
+ V = ConstantExpr::getInsertElement(PoisonV, V, ConstantInt::get(I32Ty, 0));
// Build shuffle mask to perform the splat.
SmallVector<int, 8> Zeros(EC.getKnownMinValue(), 0);
// Splat.
- return ConstantExpr::getShuffleVector(V, UndefV, Zeros);
+ return ConstantExpr::getShuffleVector(V, PoisonV, Zeros);
}
ConstantTokenNone *ConstantTokenNone::get(LLVMContext &Context) {
@@ -1508,20 +1534,6 @@ Constant *ConstantExpr::getShuffleMaskForBitcode() const {
return cast<ShuffleVectorConstantExpr>(this)->ShuffleMaskForBitcode;
}
-Constant *
-ConstantExpr::getWithOperandReplaced(unsigned OpNo, Constant *Op) const {
- assert(Op->getType() == getOperand(OpNo)->getType() &&
- "Replacing operand with value of different type!");
- if (getOperand(OpNo) == Op)
- return const_cast<ConstantExpr*>(this);
-
- SmallVector<Constant*, 8> NewOps;
- for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
- NewOps.push_back(i == OpNo ? Op : getOperand(i));
-
- return getWithOperands(NewOps);
-}
-
Constant *ConstantExpr::getWithOperands(ArrayRef<Constant *> Ops, Type *Ty,
bool OnlyIfReduced, Type *SrcTy) const {
assert(Ops.size() == getNumOperands() && "Operand count mismatch!");
@@ -3282,7 +3294,7 @@ bool ConstantDataSequential::isCString() const {
if (Str.back() != 0) return false;
// Other elements must be non-nul.
- return Str.drop_back().find(0) == StringRef::npos;
+ return !Str.drop_back().contains(0);
}
bool ConstantDataVector::isSplatData() const {
@@ -3480,7 +3492,7 @@ Value *ConstantExpr::handleOperandChangeImpl(Value *From, Value *ToV) {
NewOps, this, From, To, NumUpdated, OperandNo);
}
-Instruction *ConstantExpr::getAsInstruction() const {
+Instruction *ConstantExpr::getAsInstruction(Instruction *InsertBefore) const {
SmallVector<Value *, 4> ValueOperands(operands());
ArrayRef<Value*> Ops(ValueOperands);
@@ -3498,40 +3510,43 @@ Instruction *ConstantExpr::getAsInstruction() const {
case Instruction::IntToPtr:
case Instruction::BitCast:
case Instruction::AddrSpaceCast:
- return CastInst::Create((Instruction::CastOps)getOpcode(),
- Ops[0], getType());
+ return CastInst::Create((Instruction::CastOps)getOpcode(), Ops[0],
+ getType(), "", InsertBefore);
case Instruction::Select:
- return SelectInst::Create(Ops[0], Ops[1], Ops[2]);
+ return SelectInst::Create(Ops[0], Ops[1], Ops[2], "", InsertBefore);
case Instruction::InsertElement:
- return InsertElementInst::Create(Ops[0], Ops[1], Ops[2]);
+ return InsertElementInst::Create(Ops[0], Ops[1], Ops[2], "", InsertBefore);
case Instruction::ExtractElement:
- return ExtractElementInst::Create(Ops[0], Ops[1]);
+ return ExtractElementInst::Create(Ops[0], Ops[1], "", InsertBefore);
case Instruction::InsertValue:
- return InsertValueInst::Create(Ops[0], Ops[1], getIndices());
+ return InsertValueInst::Create(Ops[0], Ops[1], getIndices(), "",
+ InsertBefore);
case Instruction::ExtractValue:
- return ExtractValueInst::Create(Ops[0], getIndices());
+ return ExtractValueInst::Create(Ops[0], getIndices(), "", InsertBefore);
case Instruction::ShuffleVector:
- return new ShuffleVectorInst(Ops[0], Ops[1], getShuffleMask());
+ return new ShuffleVectorInst(Ops[0], Ops[1], getShuffleMask(), "",
+ InsertBefore);
case Instruction::GetElementPtr: {
const auto *GO = cast<GEPOperator>(this);
if (GO->isInBounds())
- return GetElementPtrInst::CreateInBounds(GO->getSourceElementType(),
- Ops[0], Ops.slice(1));
+ return GetElementPtrInst::CreateInBounds(
+ GO->getSourceElementType(), Ops[0], Ops.slice(1), "", InsertBefore);
return GetElementPtrInst::Create(GO->getSourceElementType(), Ops[0],
- Ops.slice(1));
+ Ops.slice(1), "", InsertBefore);
}
case Instruction::ICmp:
case Instruction::FCmp:
return CmpInst::Create((Instruction::OtherOps)getOpcode(),
- (CmpInst::Predicate)getPredicate(), Ops[0], Ops[1]);
+ (CmpInst::Predicate)getPredicate(), Ops[0], Ops[1],
+ "", InsertBefore);
case Instruction::FNeg:
- return UnaryOperator::Create((Instruction::UnaryOps)getOpcode(), Ops[0]);
+ return UnaryOperator::Create((Instruction::UnaryOps)getOpcode(), Ops[0], "",
+ InsertBefore);
default:
assert(getNumOperands() == 2 && "Must be binary operator?");
- BinaryOperator *BO =
- BinaryOperator::Create((Instruction::BinaryOps)getOpcode(),
- Ops[0], Ops[1]);
+ BinaryOperator *BO = BinaryOperator::Create(
+ (Instruction::BinaryOps)getOpcode(), Ops[0], Ops[1], "", InsertBefore);
if (isa<OverflowingBinaryOperator>(BO)) {
BO->setHasNoUnsignedWrap(SubclassOptionalData &
OverflowingBinaryOperator::NoUnsignedWrap);
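getAsInstruction gains an optional insertion point above, so callers can place the materialized instruction directly instead of inserting it afterwards. A hedged fragment of the intended call shape; the helper name is mine and only the signature shown in this hunk is assumed:

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Materialize a constant expression as a real instruction right before
// InsertPt. Previously this took two steps:
//   Instruction *I = CE->getAsInstruction();
//   I->insertBefore(InsertPt);
Instruction *materializeBefore(ConstantExpr *CE, Instruction *InsertPt) {
  return CE->getAsInstruction(InsertPt);
}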
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index 8a7060c148c9..905372982dc2 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -2460,7 +2460,7 @@ void LLVMSetGC(LLVMValueRef Fn, const char *GC) {
void LLVMAddAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
LLVMAttributeRef A) {
- unwrap<Function>(F)->addAttribute(Idx, unwrap(A));
+ unwrap<Function>(F)->addAttributeAtIndex(Idx, unwrap(A));
}
unsigned LLVMGetAttributeCountAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx) {
@@ -2478,31 +2478,32 @@ void LLVMGetAttributesAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
LLVMAttributeRef LLVMGetEnumAttributeAtIndex(LLVMValueRef F,
LLVMAttributeIndex Idx,
unsigned KindID) {
- return wrap(unwrap<Function>(F)->getAttribute(Idx,
- (Attribute::AttrKind)KindID));
+ return wrap(unwrap<Function>(F)->getAttributeAtIndex(
+ Idx, (Attribute::AttrKind)KindID));
}
LLVMAttributeRef LLVMGetStringAttributeAtIndex(LLVMValueRef F,
LLVMAttributeIndex Idx,
const char *K, unsigned KLen) {
- return wrap(unwrap<Function>(F)->getAttribute(Idx, StringRef(K, KLen)));
+ return wrap(
+ unwrap<Function>(F)->getAttributeAtIndex(Idx, StringRef(K, KLen)));
}
void LLVMRemoveEnumAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
unsigned KindID) {
- unwrap<Function>(F)->removeAttribute(Idx, (Attribute::AttrKind)KindID);
+ unwrap<Function>(F)->removeAttributeAtIndex(Idx, (Attribute::AttrKind)KindID);
}
void LLVMRemoveStringAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
const char *K, unsigned KLen) {
- unwrap<Function>(F)->removeAttribute(Idx, StringRef(K, KLen));
+ unwrap<Function>(F)->removeAttributeAtIndex(Idx, StringRef(K, KLen));
}
void LLVMAddTargetDependentFunctionAttr(LLVMValueRef Fn, const char *A,
const char *V) {
Function *Func = unwrap<Function>(Fn);
Attribute Attr = Attribute::get(Func->getContext(), A, V);
- Func->addAttribute(AttributeList::FunctionIndex, Attr);
+ Func->addFnAttr(Attr);
}
/*--.. Operations on parameters ............................................--*/
@@ -2843,7 +2844,7 @@ unsigned LLVMGetNumArgOperands(LLVMValueRef Instr) {
if (FuncletPadInst *FPI = dyn_cast<FuncletPadInst>(unwrap(Instr))) {
return FPI->getNumArgOperands();
}
- return unwrap<CallBase>(Instr)->getNumArgOperands();
+ return unwrap<CallBase>(Instr)->arg_size();
}
/*--.. Call and invoke instructions ........................................--*/
@@ -2857,17 +2858,17 @@ void LLVMSetInstructionCallConv(LLVMValueRef Instr, unsigned CC) {
static_cast<CallingConv::ID>(CC));
}
-void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index,
+void LLVMSetInstrParamAlignment(LLVMValueRef Instr, LLVMAttributeIndex Idx,
unsigned align) {
auto *Call = unwrap<CallBase>(Instr);
Attribute AlignAttr =
Attribute::getWithAlignment(Call->getContext(), Align(align));
- Call->addAttribute(index, AlignAttr);
+ Call->addAttributeAtIndex(Idx, AlignAttr);
}
void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
LLVMAttributeRef A) {
- unwrap<CallBase>(C)->addAttribute(Idx, unwrap(A));
+ unwrap<CallBase>(C)->addAttributeAtIndex(Idx, unwrap(A));
}
unsigned LLVMGetCallSiteAttributeCount(LLVMValueRef C,
@@ -2888,24 +2889,25 @@ void LLVMGetCallSiteAttributes(LLVMValueRef C, LLVMAttributeIndex Idx,
LLVMAttributeRef LLVMGetCallSiteEnumAttribute(LLVMValueRef C,
LLVMAttributeIndex Idx,
unsigned KindID) {
- return wrap(
- unwrap<CallBase>(C)->getAttribute(Idx, (Attribute::AttrKind)KindID));
+ return wrap(unwrap<CallBase>(C)->getAttributeAtIndex(
+ Idx, (Attribute::AttrKind)KindID));
}
LLVMAttributeRef LLVMGetCallSiteStringAttribute(LLVMValueRef C,
LLVMAttributeIndex Idx,
const char *K, unsigned KLen) {
- return wrap(unwrap<CallBase>(C)->getAttribute(Idx, StringRef(K, KLen)));
+ return wrap(
+ unwrap<CallBase>(C)->getAttributeAtIndex(Idx, StringRef(K, KLen)));
}
void LLVMRemoveCallSiteEnumAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
unsigned KindID) {
- unwrap<CallBase>(C)->removeAttribute(Idx, (Attribute::AttrKind)KindID);
+ unwrap<CallBase>(C)->removeAttributeAtIndex(Idx, (Attribute::AttrKind)KindID);
}
void LLVMRemoveCallSiteStringAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
const char *K, unsigned KLen) {
- unwrap<CallBase>(C)->removeAttribute(Idx, StringRef(K, KLen));
+ unwrap<CallBase>(C)->removeAttributeAtIndex(Idx, StringRef(K, KLen));
}
LLVMValueRef LLVMGetCalledValue(LLVMValueRef Instr) {
@@ -3131,6 +3133,10 @@ void LLVMSetInstDebugLocation(LLVMBuilderRef Builder, LLVMValueRef Inst) {
unwrap(Builder)->SetInstDebugLocation(unwrap<Instruction>(Inst));
}
+void LLVMAddMetadataToInst(LLVMBuilderRef Builder, LLVMValueRef Inst) {
+ unwrap(Builder)->AddMetadataToInst(unwrap<Instruction>(Inst));
+}
+
void LLVMBuilderSetDefaultFPMathTag(LLVMBuilderRef Builder,
LLVMMetadataRef FPMathTag) {
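The C bindings above keep their existing names; only the C++ methods they forward to were renamed (addAttributeAtIndex, getAttributeAtIndex, removeAttributeAtIndex, addFnAttr, arg_size). A short hedged fragment of the new C++ spellings, assuming F and CB come from existing IR:

#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"

using namespace llvm;

void tagCall(Function &F, CallBase &CB) {
  // Function-level string attribute (previously addAttribute(FunctionIndex, ...)).
  F.addFnAttr(Attribute::get(F.getContext(), "frame-pointer", "all"));
  // Attribute at an explicit index on a call site (previously CB.addAttribute(...)).
  CB.addAttributeAtIndex(AttributeList::ReturnIndex,
                         Attribute::get(F.getContext(), Attribute::NonNull));
  // Argument count (previously CB.getNumArgOperands()).
  (void)CB.arg_size();
}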
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 61d3b5e69e9e..ca7dafc814ce 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -32,8 +32,8 @@ static cl::opt<bool>
cl::init(false), cl::Hidden);
DIBuilder::DIBuilder(Module &m, bool AllowUnresolvedNodes, DICompileUnit *CU)
- : M(m), VMContext(M.getContext()), CUNode(CU),
- DeclareFn(nullptr), ValueFn(nullptr), LabelFn(nullptr),
+ : M(m), VMContext(M.getContext()), CUNode(CU), DeclareFn(nullptr),
+ ValueFn(nullptr), LabelFn(nullptr),
AllowUnresolvedNodes(AllowUnresolvedNodes) {}
void DIBuilder::trackIfUnresolved(MDNode *N) {
@@ -73,7 +73,8 @@ void DIBuilder::finalize() {
return;
}
- CUNode->replaceEnumTypes(MDTuple::get(VMContext, AllEnumTypes));
+ if (!AllEnumTypes.empty())
+ CUNode->replaceEnumTypes(MDTuple::get(VMContext, AllEnumTypes));
SmallVector<Metadata *, 16> RetainValues;
// Declarations and definitions of the same type may be retained. Some
@@ -164,12 +165,13 @@ DICompileUnit *DIBuilder::createCompileUnit(
static DIImportedEntity *
createImportedModule(LLVMContext &C, dwarf::Tag Tag, DIScope *Context,
Metadata *NS, DIFile *File, unsigned Line, StringRef Name,
+ DINodeArray Elements,
SmallVectorImpl<TrackingMDNodeRef> &AllImportedModules) {
if (Line)
assert(File && "Source location has line number but no file");
unsigned EntitiesCount = C.pImpl->DIImportedEntitys.size();
auto *M = DIImportedEntity::get(C, Tag, Context, cast_or_null<DINode>(NS),
- File, Line, Name);
+ File, Line, Name, Elements);
if (EntitiesCount < C.pImpl->DIImportedEntitys.size())
// A new Imported Entity was just added to the context.
// Add it to the Imported Modules list.
@@ -179,36 +181,38 @@ createImportedModule(LLVMContext &C, dwarf::Tag Tag, DIScope *Context,
DIImportedEntity *DIBuilder::createImportedModule(DIScope *Context,
DINamespace *NS, DIFile *File,
- unsigned Line) {
+ unsigned Line,
+ DINodeArray Elements) {
return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_module,
- Context, NS, File, Line, StringRef(),
+ Context, NS, File, Line, StringRef(), Elements,
AllImportedModules);
}
DIImportedEntity *DIBuilder::createImportedModule(DIScope *Context,
DIImportedEntity *NS,
- DIFile *File, unsigned Line) {
+ DIFile *File, unsigned Line,
+ DINodeArray Elements) {
return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_module,
- Context, NS, File, Line, StringRef(),
+ Context, NS, File, Line, StringRef(), Elements,
AllImportedModules);
}
DIImportedEntity *DIBuilder::createImportedModule(DIScope *Context, DIModule *M,
- DIFile *File, unsigned Line) {
+ DIFile *File, unsigned Line,
+ DINodeArray Elements) {
return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_module,
- Context, M, File, Line, StringRef(),
+ Context, M, File, Line, StringRef(), Elements,
AllImportedModules);
}
-DIImportedEntity *DIBuilder::createImportedDeclaration(DIScope *Context,
- DINode *Decl,
- DIFile *File,
- unsigned Line,
- StringRef Name) {
+DIImportedEntity *
+DIBuilder::createImportedDeclaration(DIScope *Context, DINode *Decl,
+ DIFile *File, unsigned Line,
+ StringRef Name, DINodeArray Elements) {
// Make sure to use the unique identifier based metadata reference for
// types that have one.
return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_declaration,
- Context, Decl, File, Line, Name,
+ Context, Decl, File, Line, Name, Elements,
AllImportedModules);
}
@@ -250,7 +254,7 @@ DIEnumerator *DIBuilder::createEnumerator(StringRef Name, uint64_t Val,
Name);
}
-DIEnumerator *DIBuilder::createEnumerator(StringRef Name, APSInt Value) {
+DIEnumerator *DIBuilder::createEnumerator(StringRef Name, const APSInt &Value) {
assert(!Name.empty() && "Unable to create enumerator without name");
return DIEnumerator::get(VMContext, APInt(Value), Value.isUnsigned(), Name);
}
@@ -283,17 +287,16 @@ DIDerivedType *DIBuilder::createQualifiedType(unsigned Tag, DIType *FromTy) {
0, 0, None, DINode::FlagZero);
}
-DIDerivedType *DIBuilder::createPointerType(
- DIType *PointeeTy,
- uint64_t SizeInBits,
- uint32_t AlignInBits,
- Optional<unsigned> DWARFAddressSpace,
- StringRef Name) {
+DIDerivedType *
+DIBuilder::createPointerType(DIType *PointeeTy, uint64_t SizeInBits,
+ uint32_t AlignInBits,
+ Optional<unsigned> DWARFAddressSpace,
+ StringRef Name, DINodeArray Annotations) {
// FIXME: Why is there a name here?
return DIDerivedType::get(VMContext, dwarf::DW_TAG_pointer_type, Name,
nullptr, 0, nullptr, PointeeTy, SizeInBits,
- AlignInBits, 0, DWARFAddressSpace,
- DINode::FlagZero);
+ AlignInBits, 0, DWARFAddressSpace, DINode::FlagZero,
+ nullptr, Annotations);
}
DIDerivedType *DIBuilder::createMemberPointerType(DIType *PointeeTy,
@@ -306,11 +309,10 @@ DIDerivedType *DIBuilder::createMemberPointerType(DIType *PointeeTy,
AlignInBits, 0, None, Flags, Base);
}
-DIDerivedType *DIBuilder::createReferenceType(
- unsigned Tag, DIType *RTy,
- uint64_t SizeInBits,
- uint32_t AlignInBits,
- Optional<unsigned> DWARFAddressSpace) {
+DIDerivedType *
+DIBuilder::createReferenceType(unsigned Tag, DIType *RTy, uint64_t SizeInBits,
+ uint32_t AlignInBits,
+ Optional<unsigned> DWARFAddressSpace) {
assert(RTy && "Unable to create reference type");
return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr, RTy,
SizeInBits, AlignInBits, 0, DWARFAddressSpace,
@@ -319,11 +321,12 @@ DIDerivedType *DIBuilder::createReferenceType(
DIDerivedType *DIBuilder::createTypedef(DIType *Ty, StringRef Name,
DIFile *File, unsigned LineNo,
- DIScope *Context,
- uint32_t AlignInBits) {
+ DIScope *Context, uint32_t AlignInBits,
+ DINodeArray Annotations) {
return DIDerivedType::get(VMContext, dwarf::DW_TAG_typedef, Name, File,
LineNo, getNonCompileUnitScope(Context), Ty, 0,
- AlignInBits, 0, None, DINode::FlagZero);
+ AlignInBits, 0, None, DINode::FlagZero, nullptr,
+ Annotations);
}
DIDerivedType *DIBuilder::createFriend(DIType *Ty, DIType *FriendTy) {
@@ -341,19 +344,18 @@ DIDerivedType *DIBuilder::createInheritance(DIType *Ty, DIType *BaseTy,
Metadata *ExtraData = ConstantAsMetadata::get(
ConstantInt::get(IntegerType::get(VMContext, 32), VBPtrOffset));
return DIDerivedType::get(VMContext, dwarf::DW_TAG_inheritance, "", nullptr,
- 0, Ty, BaseTy, 0, 0, BaseOffset, None,
- Flags, ExtraData);
+ 0, Ty, BaseTy, 0, 0, BaseOffset, None, Flags,
+ ExtraData);
}
-DIDerivedType *DIBuilder::createMemberType(DIScope *Scope, StringRef Name,
- DIFile *File, unsigned LineNumber,
- uint64_t SizeInBits,
- uint32_t AlignInBits,
- uint64_t OffsetInBits,
- DINode::DIFlags Flags, DIType *Ty) {
+DIDerivedType *DIBuilder::createMemberType(
+ DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber,
+ uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits,
+ DINode::DIFlags Flags, DIType *Ty, DINodeArray Annotations) {
return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
LineNumber, getNonCompileUnitScope(Scope), Ty,
- SizeInBits, AlignInBits, OffsetInBits, None, Flags);
+ SizeInBits, AlignInBits, OffsetInBits, None, Flags,
+ nullptr, Annotations);
}
static ConstantAsMetadata *getConstantOrNull(Constant *C) {
@@ -375,14 +377,15 @@ DIDerivedType *DIBuilder::createVariantMemberType(
DIDerivedType *DIBuilder::createBitFieldMemberType(
DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber,
uint64_t SizeInBits, uint64_t OffsetInBits, uint64_t StorageOffsetInBits,
- DINode::DIFlags Flags, DIType *Ty) {
+ DINode::DIFlags Flags, DIType *Ty, DINodeArray Annotations) {
Flags |= DINode::FlagBitField;
return DIDerivedType::get(
VMContext, dwarf::DW_TAG_member, Name, File, LineNumber,
- getNonCompileUnitScope(Scope), Ty, SizeInBits, /* AlignInBits */ 0,
+ getNonCompileUnitScope(Scope), Ty, SizeInBits, /*AlignInBits=*/0,
OffsetInBits, None, Flags,
ConstantAsMetadata::get(ConstantInt::get(IntegerType::get(VMContext, 64),
- StorageOffsetInBits)));
+ StorageOffsetInBits)),
+ Annotations);
}
DIDerivedType *
@@ -498,10 +501,12 @@ DICompositeType *DIBuilder::createUnionType(
return R;
}
-DICompositeType *DIBuilder::createVariantPart(
- DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber,
- uint64_t SizeInBits, uint32_t AlignInBits, DINode::DIFlags Flags,
- DIDerivedType *Discriminator, DINodeArray Elements, StringRef UniqueIdentifier) {
+DICompositeType *
+DIBuilder::createVariantPart(DIScope *Scope, StringRef Name, DIFile *File,
+ unsigned LineNumber, uint64_t SizeInBits,
+ uint32_t AlignInBits, DINode::DIFlags Flags,
+ DIDerivedType *Discriminator, DINodeArray Elements,
+ StringRef UniqueIdentifier) {
auto *R = DICompositeType::get(
VMContext, dwarf::DW_TAG_variant_part, Name, File, LineNumber,
getNonCompileUnitScope(Scope), nullptr, SizeInBits, AlignInBits, 0, Flags,
@@ -542,16 +547,17 @@ DIDerivedType *DIBuilder::createSetType(DIScope *Scope, StringRef Name,
return R;
}
-DICompositeType *DIBuilder::createArrayType(
- uint64_t Size, uint32_t AlignInBits, DIType *Ty, DINodeArray Subscripts,
- PointerUnion<DIExpression *, DIVariable *> DL,
- PointerUnion<DIExpression *, DIVariable *> AS,
- PointerUnion<DIExpression *, DIVariable *> AL,
- PointerUnion<DIExpression *, DIVariable *> RK) {
+DICompositeType *
+DIBuilder::createArrayType(uint64_t Size, uint32_t AlignInBits, DIType *Ty,
+ DINodeArray Subscripts,
+ PointerUnion<DIExpression *, DIVariable *> DL,
+ PointerUnion<DIExpression *, DIVariable *> AS,
+ PointerUnion<DIExpression *, DIVariable *> AL,
+ PointerUnion<DIExpression *, DIVariable *> RK) {
auto *R = DICompositeType::get(
- VMContext, dwarf::DW_TAG_array_type, "", nullptr, 0,
- nullptr, Ty, Size, AlignInBits, 0, DINode::FlagZero,
- Subscripts, 0, nullptr, nullptr, "", nullptr,
+ VMContext, dwarf::DW_TAG_array_type, "", nullptr, 0, nullptr, Ty, Size,
+ AlignInBits, 0, DINode::FlagZero, Subscripts, 0, nullptr, nullptr, "",
+ nullptr,
DL.is<DIExpression *>() ? (Metadata *)DL.get<DIExpression *>()
: (Metadata *)DL.get<DIVariable *>(),
AS.is<DIExpression *>() ? (Metadata *)AS.get<DIExpression *>()
@@ -628,12 +634,14 @@ DIBuilder::createForwardDecl(unsigned Tag, StringRef Name, DIScope *Scope,
DICompositeType *DIBuilder::createReplaceableCompositeType(
unsigned Tag, StringRef Name, DIScope *Scope, DIFile *F, unsigned Line,
unsigned RuntimeLang, uint64_t SizeInBits, uint32_t AlignInBits,
- DINode::DIFlags Flags, StringRef UniqueIdentifier) {
+ DINode::DIFlags Flags, StringRef UniqueIdentifier,
+ DINodeArray Annotations) {
auto *RetTy =
DICompositeType::getTemporary(
VMContext, Tag, Name, F, Line, getNonCompileUnitScope(Scope), nullptr,
SizeInBits, AlignInBits, 0, Flags, nullptr, RuntimeLang, nullptr,
- nullptr, UniqueIdentifier)
+ nullptr, UniqueIdentifier, nullptr, nullptr, nullptr, nullptr,
+ nullptr, Annotations)
.release();
trackIfUnresolved(RetTy);
return RetTy;
@@ -701,15 +709,16 @@ static void checkGlobalVariableScope(DIScope *Context) {
DIGlobalVariableExpression *DIBuilder::createGlobalVariableExpression(
DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *F,
- unsigned LineNumber, DIType *Ty, bool IsLocalToUnit,
- bool isDefined, DIExpression *Expr,
- MDNode *Decl, MDTuple *TemplateParams, uint32_t AlignInBits) {
+ unsigned LineNumber, DIType *Ty, bool IsLocalToUnit, bool isDefined,
+ DIExpression *Expr, MDNode *Decl, MDTuple *TemplateParams,
+ uint32_t AlignInBits, DINodeArray Annotations) {
checkGlobalVariableScope(Context);
auto *GV = DIGlobalVariable::getDistinct(
VMContext, cast_or_null<DIScope>(Context), Name, LinkageName, F,
- LineNumber, Ty, IsLocalToUnit, isDefined, cast_or_null<DIDerivedType>(Decl),
- TemplateParams, AlignInBits);
+ LineNumber, Ty, IsLocalToUnit, isDefined,
+ cast_or_null<DIDerivedType>(Decl), TemplateParams, AlignInBits,
+ Annotations);
if (!Expr)
Expr = createExpression();
auto *N = DIGlobalVariableExpression::get(VMContext, GV, Expr);
@@ -726,7 +735,8 @@ DIGlobalVariable *DIBuilder::createTempGlobalVariableFwdDecl(
return DIGlobalVariable::getTemporary(
VMContext, cast_or_null<DIScope>(Context), Name, LinkageName, F,
LineNumber, Ty, IsLocalToUnit, false,
- cast_or_null<DIDerivedType>(Decl), TemplateParams, AlignInBits)
+ cast_or_null<DIDerivedType>(Decl), TemplateParams, AlignInBits,
+ nullptr)
.release();
}
@@ -735,16 +745,16 @@ static DILocalVariable *createLocalVariable(
DenseMap<MDNode *, SmallVector<TrackingMDNodeRef, 1>> &PreservedVariables,
DIScope *Scope, StringRef Name, unsigned ArgNo, DIFile *File,
unsigned LineNo, DIType *Ty, bool AlwaysPreserve, DINode::DIFlags Flags,
- uint32_t AlignInBits) {
+ uint32_t AlignInBits, DINodeArray Annotations = nullptr) {
// FIXME: Why getNonCompileUnitScope()?
// FIXME: Why is "!Context" okay here?
// FIXME: Why doesn't this check for a subprogram or lexical block (AFAICT
// the only valid scopes)?
DIScope *Context = getNonCompileUnitScope(Scope);
- auto *Node =
- DILocalVariable::get(VMContext, cast_or_null<DILocalScope>(Context), Name,
- File, LineNo, Ty, ArgNo, Flags, AlignInBits);
+ auto *Node = DILocalVariable::get(
+ VMContext, cast_or_null<DILocalScope>(Context), Name, File, LineNo, Ty,
+ ArgNo, Flags, AlignInBits, Annotations);
if (AlwaysPreserve) {
// The optimizer may remove local variables. If there is an interest
// to preserve variable info in such situation then stash it in a
@@ -768,21 +778,20 @@ DILocalVariable *DIBuilder::createAutoVariable(DIScope *Scope, StringRef Name,
DILocalVariable *DIBuilder::createParameterVariable(
DIScope *Scope, StringRef Name, unsigned ArgNo, DIFile *File,
- unsigned LineNo, DIType *Ty, bool AlwaysPreserve, DINode::DIFlags Flags) {
+ unsigned LineNo, DIType *Ty, bool AlwaysPreserve, DINode::DIFlags Flags,
+ DINodeArray Annotations) {
assert(ArgNo && "Expected non-zero argument number for parameter");
return createLocalVariable(VMContext, PreservedVariables, Scope, Name, ArgNo,
File, LineNo, Ty, AlwaysPreserve, Flags,
- /* AlignInBits */0);
+ /*AlignInBits=*/0, Annotations);
}
-DILabel *DIBuilder::createLabel(
- DIScope *Scope, StringRef Name, DIFile *File,
- unsigned LineNo, bool AlwaysPreserve) {
+DILabel *DIBuilder::createLabel(DIScope *Scope, StringRef Name, DIFile *File,
+ unsigned LineNo, bool AlwaysPreserve) {
DIScope *Context = getNonCompileUnitScope(Scope);
- auto *Node =
- DILabel::get(VMContext, cast_or_null<DILocalScope>(Context), Name,
- File, LineNo);
+ auto *Node = DILabel::get(VMContext, cast_or_null<DILocalScope>(Context),
+ Name, File, LineNo);
if (AlwaysPreserve) {
/// The optimizer may remove labels. If there is an interest
@@ -806,7 +815,7 @@ DIExpression *DIBuilder::createExpression(ArrayRef<int64_t> Signed) {
}
template <class... Ts>
-static DISubprogram *getSubprogram(bool IsDistinct, Ts &&... Args) {
+static DISubprogram *getSubprogram(bool IsDistinct, Ts &&...Args) {
if (IsDistinct)
return DISubprogram::getDistinct(std::forward<Ts>(Args)...);
return DISubprogram::get(std::forward<Ts>(Args)...);
@@ -817,13 +826,14 @@ DISubprogram *DIBuilder::createFunction(
unsigned LineNo, DISubroutineType *Ty, unsigned ScopeLine,
DINode::DIFlags Flags, DISubprogram::DISPFlags SPFlags,
DITemplateParameterArray TParams, DISubprogram *Decl,
- DITypeArray ThrownTypes) {
+ DITypeArray ThrownTypes, DINodeArray Annotations) {
bool IsDefinition = SPFlags & DISubprogram::SPFlagDefinition;
auto *Node = getSubprogram(
/*IsDistinct=*/IsDefinition, VMContext, getNonCompileUnitScope(Context),
Name, LinkageName, File, LineNo, Ty, ScopeLine, nullptr, 0, 0, Flags,
SPFlags, IsDefinition ? CUNode : nullptr, TParams, Decl,
- MDTuple::getTemporary(VMContext, None).release(), ThrownTypes);
+ MDTuple::getTemporary(VMContext, None).release(), ThrownTypes,
+ Annotations);
if (IsDefinition)
AllSubprograms.push_back(Node);
@@ -869,11 +879,11 @@ DISubprogram *DIBuilder::createMethod(
return SP;
}
-DICommonBlock *DIBuilder::createCommonBlock(
- DIScope *Scope, DIGlobalVariable *Decl, StringRef Name, DIFile *File,
- unsigned LineNo) {
- return DICommonBlock::get(
- VMContext, Scope, Decl, Name, File, LineNo);
+DICommonBlock *DIBuilder::createCommonBlock(DIScope *Scope,
+ DIGlobalVariable *Decl,
+ StringRef Name, DIFile *File,
+ unsigned LineNo) {
+ return DICommonBlock::get(VMContext, Scope, Decl, Name, File, LineNo);
}
DINamespace *DIBuilder::createNameSpace(DIScope *Scope, StringRef Name,
@@ -929,9 +939,9 @@ Instruction *DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
Instruction *DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
Instruction *InsertBefore) {
- return insertLabel(
- LabelInfo, DL, InsertBefore ? InsertBefore->getParent() : nullptr,
- InsertBefore);
+ return insertLabel(LabelInfo, DL,
+ InsertBefore ? InsertBefore->getParent() : nullptr,
+ InsertBefore);
}
Instruction *DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
@@ -980,7 +990,8 @@ static Function *getDeclareIntrin(Module &M) {
Instruction *DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
DIExpression *Expr, const DILocation *DL,
- BasicBlock *InsertBB, Instruction *InsertBefore) {
+ BasicBlock *InsertBB,
+ Instruction *InsertBefore) {
assert(VarInfo && "empty or invalid DILocalVariable* passed to dbg.declare");
assert(DL && "Expected debug loc");
assert(DL->getScope()->getSubprogram() ==
@@ -1023,9 +1034,9 @@ Instruction *DIBuilder::insertDbgValueIntrinsic(
return B.CreateCall(ValueFn, Args);
}
-Instruction *DIBuilder::insertLabel(
- DILabel *LabelInfo, const DILocation *DL,
- BasicBlock *InsertBB, Instruction *InsertBefore) {
+Instruction *DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
+ BasicBlock *InsertBB,
+ Instruction *InsertBefore) {
assert(LabelInfo && "empty or invalid DILabel* passed to dbg.label");
assert(DL && "Expected debug loc");
assert(DL->getScope()->getSubprogram() ==
@@ -1042,8 +1053,7 @@ Instruction *DIBuilder::insertLabel(
return B.CreateCall(LabelFn, Args);
}
-void DIBuilder::replaceVTableHolder(DICompositeType *&T,
- DIType *VTableHolder) {
+void DIBuilder::replaceVTableHolder(DICompositeType *&T, DIType *VTableHolder) {
{
TypedTrackingMDRef<DICompositeType> N(T);
N->replaceVTableHolder(VTableHolder);
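The DIBuilder changes above thread a DINodeArray of elements or annotations through the imported-entity and variable creation paths. A hedged fragment using only the signatures visible in this hunk; the surrounding DIB, Scope, NS, File and Ty are assumed to be set up elsewhere:

#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfoMetadata.h"

using namespace llvm;

void emitAnnotatedDebugInfo(DIBuilder &DIB, DIScope *Scope, DINamespace *NS,
                            DIFile *File, DIType *Ty) {
  DINodeArray Elements = DIB.getOrCreateArray({});
  DINodeArray Annotations = DIB.getOrCreateArray({});

  // Imported module with explicit elements (new trailing parameter).
  DIB.createImportedModule(Scope, NS, File, /*Line=*/1, Elements);

  // Parameter variable carrying annotations (new trailing parameter).
  DIB.createParameterVariable(Scope, "arg", /*ArgNo=*/1, File, /*LineNo=*/1,
                              Ty, /*AlwaysPreserve=*/true, DINode::FlagZero,
                              Annotations);
}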
diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp
index ecd74449dc38..2ace18048262 100644
--- a/llvm/lib/IR/DataLayout.cpp
+++ b/llvm/lib/IR/DataLayout.cpp
@@ -151,6 +151,8 @@ PointerAlignElem::operator==(const PointerAlignElem &rhs) const {
//===----------------------------------------------------------------------===//
const char *DataLayout::getManglingComponent(const Triple &T) {
+ if (T.isOSBinFormatGOFF())
+ return "-m:l";
if (T.isOSBinFormatMachO())
return "-m:o";
if (T.isOSWindows() && T.isOSBinFormatCOFF())
@@ -258,12 +260,12 @@ Error DataLayout::parseSpecifier(StringRef Desc) {
while (!Desc.empty()) {
// Split at '-'.
std::pair<StringRef, StringRef> Split;
- if (Error Err = split(Desc, '-', Split))
+ if (Error Err = ::split(Desc, '-', Split))
return Err;
Desc = Split.second;
// Split at ':'.
- if (Error Err = split(Split.first, ':', Split))
+ if (Error Err = ::split(Split.first, ':', Split))
return Err;
// Aliases used below.
@@ -272,7 +274,7 @@ Error DataLayout::parseSpecifier(StringRef Desc) {
if (Tok == "ni") {
do {
- if (Error Err = split(Rest, ':', Split))
+ if (Error Err = ::split(Rest, ':', Split))
return Err;
Rest = Split.second;
unsigned AS;
@@ -313,7 +315,7 @@ Error DataLayout::parseSpecifier(StringRef Desc) {
if (Rest.empty())
return reportError(
"Missing size specification for pointer in datalayout string");
- if (Error Err = split(Rest, ':', Split))
+ if (Error Err = ::split(Rest, ':', Split))
return Err;
unsigned PointerMemSize;
if (Error Err = getIntInBytes(Tok, PointerMemSize))
@@ -325,7 +327,7 @@ Error DataLayout::parseSpecifier(StringRef Desc) {
if (Rest.empty())
return reportError(
"Missing alignment specification for pointer in datalayout string");
- if (Error Err = split(Rest, ':', Split))
+ if (Error Err = ::split(Rest, ':', Split))
return Err;
unsigned PointerABIAlign;
if (Error Err = getIntInBytes(Tok, PointerABIAlign))
@@ -340,7 +342,7 @@ Error DataLayout::parseSpecifier(StringRef Desc) {
// Preferred alignment.
unsigned PointerPrefAlign = PointerABIAlign;
if (!Rest.empty()) {
- if (Error Err = split(Rest, ':', Split))
+ if (Error Err = ::split(Rest, ':', Split))
return Err;
if (Error Err = getIntInBytes(Tok, PointerPrefAlign))
return Err;
@@ -350,7 +352,7 @@ Error DataLayout::parseSpecifier(StringRef Desc) {
// Now read the index. It is the second optional parameter here.
if (!Rest.empty()) {
- if (Error Err = split(Rest, ':', Split))
+ if (Error Err = ::split(Rest, ':', Split))
return Err;
if (Error Err = getIntInBytes(Tok, IndexSize))
return Err;
@@ -391,7 +393,7 @@ Error DataLayout::parseSpecifier(StringRef Desc) {
if (Rest.empty())
return reportError(
"Missing alignment specification in datalayout string");
- if (Error Err = split(Rest, ':', Split))
+ if (Error Err = ::split(Rest, ':', Split))
return Err;
unsigned ABIAlign;
if (Error Err = getIntInBytes(Tok, ABIAlign))
@@ -408,7 +410,7 @@ Error DataLayout::parseSpecifier(StringRef Desc) {
// Preferred alignment.
unsigned PrefAlign = ABIAlign;
if (!Rest.empty()) {
- if (Error Err = split(Rest, ':', Split))
+ if (Error Err = ::split(Rest, ':', Split))
return Err;
if (Error Err = getIntInBytes(Tok, PrefAlign))
return Err;
@@ -437,7 +439,7 @@ Error DataLayout::parseSpecifier(StringRef Desc) {
LegalIntWidths.push_back(Width);
if (Rest.empty())
break;
- if (Error Err = split(Rest, ':', Split))
+ if (Error Err = ::split(Rest, ':', Split))
return Err;
}
break;
@@ -500,6 +502,9 @@ Error DataLayout::parseSpecifier(StringRef Desc) {
case 'e':
ManglingMode = MM_ELF;
break;
+ case 'l':
+ ManglingMode = MM_GOFF;
+ break;
case 'o':
ManglingMode = MM_MachO;
break;
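The GOFF hunks above teach DataLayout about GOFF object files: getManglingComponent now returns "-m:l" for GOFF targets, and the specifier parser accepts the matching 'l' mangling mode. A minimal, hedged sketch of what this enables, assuming a triple whose default object format is GOFF (z/OS is the assumed example here):

  Triple T("s390x-ibm-zos");
  const char *Mangling = DataLayout::getManglingComponent(T); // "-m:l" when the triple is GOFF-based
  DataLayout DL("e-m:l-i64:64");  // "m:l" now parses into the new MM_GOFF mode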
@@ -702,12 +707,12 @@ unsigned DataLayout::getPointerSize(unsigned AS) const {
return getPointerAlignElem(AS).TypeByteWidth;
}
-unsigned DataLayout::getMaxPointerSize() const {
- unsigned MaxPointerSize = 0;
+unsigned DataLayout::getMaxIndexSize() const {
+ unsigned MaxIndexSize = 0;
for (auto &P : Pointers)
- MaxPointerSize = std::max(MaxPointerSize, P.TypeByteWidth);
+ MaxIndexSize = std::max(MaxIndexSize, P.IndexWidth);
- return MaxPointerSize;
+ return MaxIndexSize;
}
unsigned DataLayout::getPointerTypeSizeInBits(Type *Ty) const {
@@ -800,15 +805,11 @@ Align DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const {
// By default, use natural alignment for vector types. This is consistent
// with what clang and llvm-gcc do.
- // TODO: This should probably not be using the alloc size.
- unsigned Alignment =
- getTypeAllocSize(cast<VectorType>(Ty)->getElementType());
+ //
// We're only calculating a natural alignment, so it doesn't have to be
// based on the full size for scalable vectors. Using the minimum element
// count should be enough here.
- Alignment *= cast<VectorType>(Ty)->getElementCount().getKnownMinValue();
- Alignment = PowerOf2Ceil(Alignment);
- return Align(Alignment);
+ return Align(PowerOf2Ceil(getTypeStoreSize(Ty).getKnownMinSize()));
}
case Type::X86_AMXTyID:
return Align(64);
@@ -818,7 +819,7 @@ Align DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const {
}
/// TODO: Remove this function once the transition to Align is over.
-unsigned DataLayout::getABITypeAlignment(Type *Ty) const {
+uint64_t DataLayout::getABITypeAlignment(Type *Ty) const {
return getABITypeAlign(Ty).value();
}
@@ -827,7 +828,7 @@ Align DataLayout::getABITypeAlign(Type *Ty) const {
}
/// TODO: Remove this function once the transition to Align is over.
-unsigned DataLayout::getPrefTypeAlignment(Type *Ty) const {
+uint64_t DataLayout::getPrefTypeAlignment(Type *Ty) const {
return getPrefTypeAlign(Ty).value();
}
@@ -900,6 +901,72 @@ int64_t DataLayout::getIndexedOffsetInType(Type *ElemTy,
return Result;
}
+static void addElementIndex(SmallVectorImpl<APInt> &Indices, TypeSize ElemSize,
+ APInt &Offset) {
+ // Skip over scalable or zero size elements. Also skip element sizes larger
+ // than the positive index space, because the arithmetic below may not be
+ // correct in that case.
+ unsigned BitWidth = Offset.getBitWidth();
+ if (ElemSize.isScalable() || ElemSize == 0 ||
+ !isUIntN(BitWidth - 1, ElemSize)) {
+ Indices.push_back(APInt::getZero(BitWidth));
+ return;
+ }
+
+ APInt Index = Offset.sdiv(ElemSize);
+ Offset -= Index * ElemSize;
+ if (Offset.isNegative()) {
+ // Prefer a positive remaining offset to allow struct indexing.
+ --Index;
+ Offset += ElemSize;
+ assert(Offset.isNonNegative() && "Remaining offset shouldn't be negative");
+ }
+ Indices.push_back(Index);
+}
+
+SmallVector<APInt> DataLayout::getGEPIndicesForOffset(Type *&ElemTy,
+ APInt &Offset) const {
+ assert(ElemTy->isSized() && "Element type must be sized");
+ SmallVector<APInt> Indices;
+ addElementIndex(Indices, getTypeAllocSize(ElemTy), Offset);
+ while (Offset != 0) {
+ if (auto *ArrTy = dyn_cast<ArrayType>(ElemTy)) {
+ ElemTy = ArrTy->getElementType();
+ addElementIndex(Indices, getTypeAllocSize(ElemTy), Offset);
+ continue;
+ }
+
+ if (auto *VecTy = dyn_cast<VectorType>(ElemTy)) {
+ ElemTy = VecTy->getElementType();
+ unsigned ElemSizeInBits = getTypeSizeInBits(ElemTy).getFixedSize();
+ // GEPs over non-multiple of 8 size vector elements are invalid.
+ if (ElemSizeInBits % 8 != 0)
+ break;
+
+ addElementIndex(Indices, TypeSize::Fixed(ElemSizeInBits / 8), Offset);
+ continue;
+ }
+
+ if (auto *STy = dyn_cast<StructType>(ElemTy)) {
+ const StructLayout *SL = getStructLayout(STy);
+ uint64_t IntOffset = Offset.getZExtValue();
+ if (IntOffset >= SL->getSizeInBytes())
+ break;
+
+ unsigned Index = SL->getElementContainingOffset(IntOffset);
+ Offset -= SL->getElementOffset(Index);
+ ElemTy = STy->getElementType(Index);
+ Indices.push_back(APInt(32, Index));
+ continue;
+ }
+
+ // Can't index into non-aggregate type.
+ break;
+ }
+
+ return Indices;
+}
+
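getGEPIndicesForOffset added above walks arrays, vectors and structs to turn a byte offset into a chain of GEP indices, updating the offset and element type in place. A minimal sketch of the intended call pattern, assuming an LLVMContext Ctx and a DataLayout DL with natural layout are in scope:

  StructType *STy =
      StructType::get(Type::getInt32Ty(Ctx), Type::getInt64Ty(Ctx)); // {i32, i64}
  Type *Ty = STy;
  APInt Offset(DL.getIndexSizeInBits(/*AS=*/0), 8);                  // byte offset of the i64 field
  SmallVector<APInt> Indices = DL.getGEPIndicesForOffset(Ty, Offset);
  // With the assumed layout this yields indices {0, 1}, leaves Offset == 0,
  // and Ty now points at the i64 element type.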
/// getPreferredAlign - Return the preferred alignment of the specified global.
/// This includes an explicitly requested alignment (if the global has one).
Align DataLayout::getPreferredAlign(const GlobalVariable *GV) const {
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 06c511f8530a..7c69fbf7085d 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -447,8 +447,7 @@ bool llvm::stripDebugInfo(Function &F) {
DenseMap<MDNode *, MDNode *> LoopIDsMap;
for (BasicBlock &BB : F) {
- for (auto II = BB.begin(), End = BB.end(); II != End;) {
- Instruction &I = *II++; // We may delete the instruction, increment now.
+ for (Instruction &I : llvm::make_early_inc_range(BB)) {
if (isa<DbgInfoIntrinsic>(&I)) {
I.eraseFromParent();
Changed = true;
@@ -909,6 +908,11 @@ void LLVMDIBuilderFinalize(LLVMDIBuilderRef Builder) {
unwrap(Builder)->finalize();
}
+void LLVMDIBuilderFinalizeSubprogram(LLVMDIBuilderRef Builder,
+ LLVMMetadataRef subprogram) {
+ unwrap(Builder)->finalizeSubprogram(unwrapDI<DISubprogram>(subprogram));
+}
+
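LLVMDIBuilderFinalizeSubprogram exposes DIBuilder::finalizeSubprogram through the C API, so clients that emit functions incrementally (e.g. JIT frontends) can finalize one subprogram's debug info early. Hedged sketch, with DIB and SP assumed to be handles created earlier via the C API:

  LLVMDIBuilderFinalizeSubprogram(DIB, SP); // per-function finalize
  /* ... later, once the module is complete ... */
  LLVMDIBuilderFinalize(DIB);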
LLVMMetadataRef LLVMDIBuilderCreateCompileUnit(
LLVMDIBuilderRef Builder, LLVMDWARFSourceLanguage Lang,
LLVMMetadataRef FileRef, const char *Producer, size_t ProducerLen,
@@ -1003,41 +1007,43 @@ LLVMDIBuilderCreateImportedModuleFromNamespace(LLVMDIBuilderRef Builder,
Line));
}
-LLVMMetadataRef
-LLVMDIBuilderCreateImportedModuleFromAlias(LLVMDIBuilderRef Builder,
- LLVMMetadataRef Scope,
- LLVMMetadataRef ImportedEntity,
- LLVMMetadataRef File,
- unsigned Line) {
+LLVMMetadataRef LLVMDIBuilderCreateImportedModuleFromAlias(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope,
+ LLVMMetadataRef ImportedEntity, LLVMMetadataRef File, unsigned Line,
+ LLVMMetadataRef *Elements, unsigned NumElements) {
+ auto Elts =
+ (NumElements > 0)
+ ? unwrap(Builder)->getOrCreateArray({unwrap(Elements), NumElements})
+ : nullptr;
return wrap(unwrap(Builder)->createImportedModule(
- unwrapDI<DIScope>(Scope),
- unwrapDI<DIImportedEntity>(ImportedEntity),
- unwrapDI<DIFile>(File), Line));
-}
-
-LLVMMetadataRef
-LLVMDIBuilderCreateImportedModuleFromModule(LLVMDIBuilderRef Builder,
- LLVMMetadataRef Scope,
- LLVMMetadataRef M,
- LLVMMetadataRef File,
- unsigned Line) {
- return wrap(unwrap(Builder)->createImportedModule(unwrapDI<DIScope>(Scope),
- unwrapDI<DIModule>(M),
- unwrapDI<DIFile>(File),
- Line));
-}
-
-LLVMMetadataRef
-LLVMDIBuilderCreateImportedDeclaration(LLVMDIBuilderRef Builder,
- LLVMMetadataRef Scope,
- LLVMMetadataRef Decl,
- LLVMMetadataRef File,
- unsigned Line,
- const char *Name, size_t NameLen) {
+ unwrapDI<DIScope>(Scope), unwrapDI<DIImportedEntity>(ImportedEntity),
+ unwrapDI<DIFile>(File), Line, Elts));
+}
+
+LLVMMetadataRef LLVMDIBuilderCreateImportedModuleFromModule(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, LLVMMetadataRef M,
+ LLVMMetadataRef File, unsigned Line, LLVMMetadataRef *Elements,
+ unsigned NumElements) {
+ auto Elts =
+ (NumElements > 0)
+ ? unwrap(Builder)->getOrCreateArray({unwrap(Elements), NumElements})
+ : nullptr;
+ return wrap(unwrap(Builder)->createImportedModule(
+ unwrapDI<DIScope>(Scope), unwrapDI<DIModule>(M), unwrapDI<DIFile>(File),
+ Line, Elts));
+}
+
+LLVMMetadataRef LLVMDIBuilderCreateImportedDeclaration(
+ LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, LLVMMetadataRef Decl,
+ LLVMMetadataRef File, unsigned Line, const char *Name, size_t NameLen,
+ LLVMMetadataRef *Elements, unsigned NumElements) {
+ auto Elts =
+ (NumElements > 0)
+ ? unwrap(Builder)->getOrCreateArray({unwrap(Elements), NumElements})
+ : nullptr;
return wrap(unwrap(Builder)->createImportedDeclaration(
- unwrapDI<DIScope>(Scope),
- unwrapDI<DINode>(Decl),
- unwrapDI<DIFile>(File), Line, {Name, NameLen}));
+ unwrapDI<DIScope>(Scope), unwrapDI<DINode>(Decl), unwrapDI<DIFile>(File),
+ Line, {Name, NameLen}, Elts));
}
LLVMMetadataRef
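The three imported-entity wrappers above grow Elements/NumElements parameters so a frontend can attach an element list (e.g. renamed declarations) to the import through the C API. Hedged sketch with hypothetical handles:

  LLVMMetadataRef Elts[] = {AliasedDecl};   // AliasedDecl: a hypothetical DINode handle
  LLVMMetadataRef Imported = LLVMDIBuilderCreateImportedDeclaration(
      DIB, Scope, Decl, File, /*Line=*/42, "renamed", /*NameLen=*/7,
      Elts, /*NumElements=*/1);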
diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp
index 7b0dab799e1a..b20e581d283a 100644
--- a/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -82,8 +82,8 @@ DILocation *DILocation::getImpl(LLVMContext &Context, unsigned Line,
Storage, Context.pImpl->DILocations);
}
-const
-DILocation *DILocation::getMergedLocations(ArrayRef<const DILocation *> Locs) {
+const DILocation *
+DILocation::getMergedLocations(ArrayRef<const DILocation *> Locs) {
if (Locs.empty())
return nullptr;
if (Locs.size() == 1)
@@ -139,7 +139,8 @@ const DILocation *DILocation::getMergedLocation(const DILocation *LocA,
return DILocation::get(Result->getContext(), 0, 0, S, L);
}
-Optional<unsigned> DILocation::encodeDiscriminator(unsigned BD, unsigned DF, unsigned CI) {
+Optional<unsigned> DILocation::encodeDiscriminator(unsigned BD, unsigned DF,
+ unsigned CI) {
std::array<unsigned, 3> Components = {BD, DF, CI};
uint64_t RemainingWork = 0U;
// We use RemainingWork to figure out if we have no remaining components to
@@ -147,7 +148,8 @@ Optional<unsigned> DILocation::encodeDiscriminator(unsigned BD, unsigned DF, uns
// encode anything for the latter 2.
// Since any of the input components is at most 32 bits, their sum will be
// less than 34 bits, and thus RemainingWork won't overflow.
- RemainingWork = std::accumulate(Components.begin(), Components.end(), RemainingWork);
+ RemainingWork =
+ std::accumulate(Components.begin(), Components.end(), RemainingWork);
int I = 0;
unsigned Ret = 0;
@@ -179,7 +181,6 @@ void DILocation::decodeDiscriminator(unsigned D, unsigned &BD, unsigned &DF,
getNextComponentInDiscriminator(getNextComponentInDiscriminator(D)));
}
-
DINode::DIFlags DINode::getFlag(StringRef Flag) {
return StringSwitch<DIFlags>(Flag)
#define HANDLE_DI_FLAG(ID, NAME) .Case("DIFlag" #NAME, Flag##NAME)
@@ -546,8 +547,8 @@ DIBasicType *DIBasicType::getImpl(LLVMContext &Context, unsigned Tag,
DEFINE_GETIMPL_LOOKUP(DIBasicType,
(Tag, Name, SizeInBits, AlignInBits, Encoding, Flags));
Metadata *Ops[] = {nullptr, nullptr, Name};
- DEFINE_GETIMPL_STORE(DIBasicType, (Tag, SizeInBits, AlignInBits, Encoding,
- Flags), Ops);
+ DEFINE_GETIMPL_STORE(DIBasicType,
+ (Tag, SizeInBits, AlignInBits, Encoding, Flags), Ops);
}
Optional<DIBasicType::Signedness> DIBasicType::getSignedness() const {
@@ -582,16 +583,17 @@ DIDerivedType *DIDerivedType::getImpl(
unsigned Line, Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
uint32_t AlignInBits, uint64_t OffsetInBits,
Optional<unsigned> DWARFAddressSpace, DIFlags Flags, Metadata *ExtraData,
- StorageType Storage, bool ShouldCreate) {
+ Metadata *Annotations, StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
DEFINE_GETIMPL_LOOKUP(DIDerivedType,
(Tag, Name, File, Line, Scope, BaseType, SizeInBits,
AlignInBits, OffsetInBits, DWARFAddressSpace, Flags,
- ExtraData));
- Metadata *Ops[] = {File, Scope, Name, BaseType, ExtraData};
- DEFINE_GETIMPL_STORE(
- DIDerivedType, (Tag, Line, SizeInBits, AlignInBits, OffsetInBits,
- DWARFAddressSpace, Flags), Ops);
+ ExtraData, Annotations));
+ Metadata *Ops[] = {File, Scope, Name, BaseType, ExtraData, Annotations};
+ DEFINE_GETIMPL_STORE(DIDerivedType,
+ (Tag, Line, SizeInBits, AlignInBits, OffsetInBits,
+ DWARFAddressSpace, Flags),
+ Ops);
}
DICompositeType *DICompositeType::getImpl(
@@ -601,22 +603,25 @@ DICompositeType *DICompositeType::getImpl(
Metadata *Elements, unsigned RuntimeLang, Metadata *VTableHolder,
Metadata *TemplateParams, MDString *Identifier, Metadata *Discriminator,
Metadata *DataLocation, Metadata *Associated, Metadata *Allocated,
- Metadata *Rank, StorageType Storage, bool ShouldCreate) {
+ Metadata *Rank, Metadata *Annotations, StorageType Storage,
+ bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
// Keep this in sync with buildODRType.
- DEFINE_GETIMPL_LOOKUP(
- DICompositeType,
- (Tag, Name, File, Line, Scope, BaseType, SizeInBits, AlignInBits,
- OffsetInBits, Flags, Elements, RuntimeLang, VTableHolder, TemplateParams,
- Identifier, Discriminator, DataLocation, Associated, Allocated, Rank));
+ DEFINE_GETIMPL_LOOKUP(DICompositeType,
+ (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+ AlignInBits, OffsetInBits, Flags, Elements,
+ RuntimeLang, VTableHolder, TemplateParams, Identifier,
+ Discriminator, DataLocation, Associated, Allocated,
+ Rank, Annotations));
Metadata *Ops[] = {File, Scope, Name, BaseType,
Elements, VTableHolder, TemplateParams, Identifier,
Discriminator, DataLocation, Associated, Allocated,
- Rank};
- DEFINE_GETIMPL_STORE(DICompositeType, (Tag, Line, RuntimeLang, SizeInBits,
- AlignInBits, OffsetInBits, Flags),
- Ops);
+ Rank, Annotations};
+ DEFINE_GETIMPL_STORE(
+ DICompositeType,
+ (Tag, Line, RuntimeLang, SizeInBits, AlignInBits, OffsetInBits, Flags),
+ Ops);
}
DICompositeType *DICompositeType::buildODRType(
@@ -626,7 +631,7 @@ DICompositeType *DICompositeType::buildODRType(
DIFlags Flags, Metadata *Elements, unsigned RuntimeLang,
Metadata *VTableHolder, Metadata *TemplateParams, Metadata *Discriminator,
Metadata *DataLocation, Metadata *Associated, Metadata *Allocated,
- Metadata *Rank) {
+ Metadata *Rank, Metadata *Annotations) {
assert(!Identifier.getString().empty() && "Expected valid identifier");
if (!Context.isODRUniquingDebugTypes())
return nullptr;
@@ -636,7 +641,10 @@ DICompositeType *DICompositeType::buildODRType(
Context, Tag, Name, File, Line, Scope, BaseType, SizeInBits,
AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
VTableHolder, TemplateParams, &Identifier, Discriminator,
- DataLocation, Associated, Allocated, Rank);
+ DataLocation, Associated, Allocated, Rank, Annotations);
+
+ if (CT->getTag() != Tag)
+ return nullptr;
// Only mutate CT if it's a forward declaration and the new operands aren't.
assert(CT->getRawIdentifier() == &Identifier && "Wrong ODR identifier?");
@@ -649,7 +657,7 @@ DICompositeType *DICompositeType::buildODRType(
Metadata *Ops[] = {File, Scope, Name, BaseType,
Elements, VTableHolder, TemplateParams, &Identifier,
Discriminator, DataLocation, Associated, Allocated,
- Rank};
+ Rank, Annotations};
assert((std::end(Ops) - std::begin(Ops)) == (int)CT->getNumOperands() &&
"Mismatched number of operands");
for (unsigned I = 0, E = CT->getNumOperands(); I != E; ++I)
@@ -665,17 +673,21 @@ DICompositeType *DICompositeType::getODRType(
DIFlags Flags, Metadata *Elements, unsigned RuntimeLang,
Metadata *VTableHolder, Metadata *TemplateParams, Metadata *Discriminator,
Metadata *DataLocation, Metadata *Associated, Metadata *Allocated,
- Metadata *Rank) {
+ Metadata *Rank, Metadata *Annotations) {
assert(!Identifier.getString().empty() && "Expected valid identifier");
if (!Context.isODRUniquingDebugTypes())
return nullptr;
auto *&CT = (*Context.pImpl->DITypeMap)[&Identifier];
- if (!CT)
+ if (!CT) {
CT = DICompositeType::getDistinct(
Context, Tag, Name, File, Line, Scope, BaseType, SizeInBits,
AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang, VTableHolder,
TemplateParams, &Identifier, Discriminator, DataLocation, Associated,
- Allocated, Rank);
+ Allocated, Rank, Annotations);
+ } else {
+ if (CT->getTag() != Tag)
+ return nullptr;
+ }
return CT;
}
@@ -789,10 +801,14 @@ DICompileUnit::getNameTableKind(StringRef Str) {
const char *DICompileUnit::emissionKindString(DebugEmissionKind EK) {
switch (EK) {
- case NoDebug: return "NoDebug";
- case FullDebug: return "FullDebug";
- case LineTablesOnly: return "LineTablesOnly";
- case DebugDirectivesOnly: return "DebugDirectivesOnly";
+ case NoDebug:
+ return "NoDebug";
+ case FullDebug:
+ return "FullDebug";
+ case LineTablesOnly:
+ return "LineTablesOnly";
+ case DebugDirectivesOnly:
+ return "DebugDirectivesOnly";
}
return nullptr;
}
@@ -862,23 +878,28 @@ DISubprogram *DISubprogram::getImpl(
unsigned ScopeLine, Metadata *ContainingType, unsigned VirtualIndex,
int ThisAdjustment, DIFlags Flags, DISPFlags SPFlags, Metadata *Unit,
Metadata *TemplateParams, Metadata *Declaration, Metadata *RetainedNodes,
- Metadata *ThrownTypes, StorageType Storage, bool ShouldCreate) {
+ Metadata *ThrownTypes, Metadata *Annotations, StorageType Storage,
+ bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
assert(isCanonical(LinkageName) && "Expected canonical MDString");
DEFINE_GETIMPL_LOOKUP(DISubprogram,
(Scope, Name, LinkageName, File, Line, Type, ScopeLine,
ContainingType, VirtualIndex, ThisAdjustment, Flags,
SPFlags, Unit, TemplateParams, Declaration,
- RetainedNodes, ThrownTypes));
- SmallVector<Metadata *, 11> Ops = {
- File, Scope, Name, LinkageName, Type, Unit,
- Declaration, RetainedNodes, ContainingType, TemplateParams, ThrownTypes};
- if (!ThrownTypes) {
+ RetainedNodes, ThrownTypes, Annotations));
+ SmallVector<Metadata *, 12> Ops = {
+ File, Scope, Name, LinkageName,
+ Type, Unit, Declaration, RetainedNodes,
+ ContainingType, TemplateParams, ThrownTypes, Annotations};
+ if (!Annotations) {
Ops.pop_back();
- if (!TemplateParams) {
+ if (!ThrownTypes) {
Ops.pop_back();
- if (!ContainingType)
+ if (!TemplateParams) {
Ops.pop_back();
+ if (!ContainingType)
+ Ops.pop_back();
+ }
}
}
DEFINE_GETIMPL_STORE_N(
@@ -977,13 +998,14 @@ DIGlobalVariable::getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name,
Metadata *Type, bool IsLocalToUnit, bool IsDefinition,
Metadata *StaticDataMemberDeclaration,
Metadata *TemplateParams, uint32_t AlignInBits,
- StorageType Storage, bool ShouldCreate) {
+ Metadata *Annotations, StorageType Storage,
+ bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
assert(isCanonical(LinkageName) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DIGlobalVariable, (Scope, Name, LinkageName, File, Line,
- Type, IsLocalToUnit, IsDefinition,
- StaticDataMemberDeclaration,
- TemplateParams, AlignInBits));
+ DEFINE_GETIMPL_LOOKUP(
+ DIGlobalVariable,
+ (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, IsDefinition,
+ StaticDataMemberDeclaration, TemplateParams, AlignInBits, Annotations));
Metadata *Ops[] = {Scope,
Name,
File,
@@ -991,27 +1013,26 @@ DIGlobalVariable::getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name,
Name,
LinkageName,
StaticDataMemberDeclaration,
- TemplateParams};
+ TemplateParams,
+ Annotations};
DEFINE_GETIMPL_STORE(DIGlobalVariable,
(Line, IsLocalToUnit, IsDefinition, AlignInBits), Ops);
}
-DILocalVariable *DILocalVariable::getImpl(LLVMContext &Context, Metadata *Scope,
- MDString *Name, Metadata *File,
- unsigned Line, Metadata *Type,
- unsigned Arg, DIFlags Flags,
- uint32_t AlignInBits,
- StorageType Storage,
- bool ShouldCreate) {
+DILocalVariable *
+DILocalVariable::getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name,
+ Metadata *File, unsigned Line, Metadata *Type,
+ unsigned Arg, DIFlags Flags, uint32_t AlignInBits,
+ Metadata *Annotations, StorageType Storage,
+ bool ShouldCreate) {
// 64K ought to be enough for any frontend.
assert(Arg <= UINT16_MAX && "Expected argument number to fit in 16-bits");
assert(Scope && "Expected scope");
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DILocalVariable,
- (Scope, Name, File, Line, Type, Arg, Flags,
- AlignInBits));
- Metadata *Ops[] = {Scope, Name, File, Type};
+ DEFINE_GETIMPL_LOOKUP(DILocalVariable, (Scope, Name, File, Line, Type, Arg,
+ Flags, AlignInBits, Annotations));
+ Metadata *Ops[] = {Scope, Name, File, Type, Annotations};
DEFINE_GETIMPL_STORE(DILocalVariable, (Line, Arg, Flags, AlignInBits), Ops);
}
@@ -1038,14 +1059,12 @@ Optional<uint64_t> DIVariable::getSizeInBits() const {
return None;
}
-DILabel *DILabel::getImpl(LLVMContext &Context, Metadata *Scope,
- MDString *Name, Metadata *File, unsigned Line,
- StorageType Storage,
+DILabel *DILabel::getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name,
+ Metadata *File, unsigned Line, StorageType Storage,
bool ShouldCreate) {
assert(Scope && "Expected scope");
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DILabel,
- (Scope, Name, File, Line));
+ DEFINE_GETIMPL_LOOKUP(DILabel, (Scope, Name, File, Line));
Metadata *Ops[] = {Scope, Name, File};
DEFINE_GETIMPL_STORE(DILabel, (Line), Ops);
}
@@ -1194,10 +1213,11 @@ bool DIExpression::isComplex() const {
// kind of complex computation occurs.
for (const auto &It : expr_ops()) {
switch (It.getOp()) {
- case dwarf::DW_OP_LLVM_tag_offset:
- case dwarf::DW_OP_LLVM_fragment:
- continue;
- default: return true;
+ case dwarf::DW_OP_LLVM_tag_offset:
+ case dwarf::DW_OP_LLVM_fragment:
+ continue;
+ default:
+ return true;
}
}
@@ -1346,8 +1366,7 @@ DIExpression *DIExpression::replaceArg(const DIExpression *Expr,
DIExpression *DIExpression::prependOpcodes(const DIExpression *Expr,
SmallVectorImpl<uint64_t> &Ops,
- bool StackValue,
- bool EntryValue) {
+ bool StackValue, bool EntryValue) {
assert(Expr && "Can't prepend ops to this expression");
if (EntryValue) {
@@ -1442,7 +1461,8 @@ Optional<DIExpression *> DIExpression::createFragmentExpression(
if (Expr) {
for (auto Op : Expr->expr_ops()) {
switch (Op.getOp()) {
- default: break;
+ default:
+ break;
case dwarf::DW_OP_shr:
case dwarf::DW_OP_shra:
case dwarf::DW_OP_shl:
@@ -1476,6 +1496,45 @@ Optional<DIExpression *> DIExpression::createFragmentExpression(
return DIExpression::get(Expr->getContext(), Ops);
}
+std::pair<DIExpression *, const ConstantInt *>
+DIExpression::constantFold(const ConstantInt *CI) {
+ // Copy the APInt so we can modify it.
+ APInt NewInt = CI->getValue();
+ SmallVector<uint64_t, 8> Ops;
+
+ // Fold operators only at the beginning of the expression.
+ bool First = true;
+ bool Changed = false;
+ for (auto Op : expr_ops()) {
+ switch (Op.getOp()) {
+ default:
+ // We fold only the leading part of the expression; if we get to a part
+ // that we're going to copy unchanged, and haven't done any folding,
+ // then the entire expression is unchanged and we can return early.
+ if (!Changed)
+ return {this, CI};
+ First = false;
+ break;
+ case dwarf::DW_OP_LLVM_convert:
+ if (!First)
+ break;
+ Changed = true;
+ if (Op.getArg(1) == dwarf::DW_ATE_signed)
+ NewInt = NewInt.sextOrTrunc(Op.getArg(0));
+ else {
+ assert(Op.getArg(1) == dwarf::DW_ATE_unsigned && "Unexpected operand");
+ NewInt = NewInt.zextOrTrunc(Op.getArg(0));
+ }
+ continue;
+ }
+ Op.appendToVector(Ops);
+ }
+ if (!Changed)
+ return {this, CI};
+ return {DIExpression::get(getContext(), Ops),
+ ConstantInt::get(getContext(), NewInt)};
+}
+
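constantFold above only rewrites a leading run of DW_OP_LLVM_convert operations, folding the sign/zero extension into the constant and dropping the convert from the expression; anything else leaves both operands untouched. A hedged sketch of the call pattern, with Expr and CI an existing DIExpression* and ConstantInt*:

  // If Expr begins with DW_OP_LLVM_convert, 64, DW_ATE_signed, an i32 constant
  // is sign-extended to 64 bits and the convert op disappears from the result.
  std::pair<DIExpression *, const ConstantInt *> Folded = Expr->constantFold(CI);
  DIExpression *NewExpr = Folded.first;
  const ConstantInt *NewCI = Folded.second;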
uint64_t DIExpression::getNumLocationOperands() const {
uint64_t Result = 0;
for (auto ExprOp : expr_ops())
@@ -1552,21 +1611,22 @@ DIObjCProperty *DIObjCProperty::getImpl(
DIImportedEntity *DIImportedEntity::getImpl(LLVMContext &Context, unsigned Tag,
Metadata *Scope, Metadata *Entity,
Metadata *File, unsigned Line,
- MDString *Name, StorageType Storage,
+ MDString *Name, Metadata *Elements,
+ StorageType Storage,
bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
DEFINE_GETIMPL_LOOKUP(DIImportedEntity,
- (Tag, Scope, Entity, File, Line, Name));
- Metadata *Ops[] = {Scope, Entity, Name, File};
+ (Tag, Scope, Entity, File, Line, Name, Elements));
+ Metadata *Ops[] = {Scope, Entity, Name, File, Elements};
DEFINE_GETIMPL_STORE(DIImportedEntity, (Tag, Line), Ops);
}
-DIMacro *DIMacro::getImpl(LLVMContext &Context, unsigned MIType,
- unsigned Line, MDString *Name, MDString *Value,
- StorageType Storage, bool ShouldCreate) {
+DIMacro *DIMacro::getImpl(LLVMContext &Context, unsigned MIType, unsigned Line,
+ MDString *Name, MDString *Value, StorageType Storage,
+ bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
DEFINE_GETIMPL_LOOKUP(DIMacro, (MIType, Line, Name, Value));
- Metadata *Ops[] = { Name, Value };
+ Metadata *Ops[] = {Name, Value};
DEFINE_GETIMPL_STORE(DIMacro, (MIType, Line), Ops);
}
@@ -1574,9 +1634,8 @@ DIMacroFile *DIMacroFile::getImpl(LLVMContext &Context, unsigned MIType,
unsigned Line, Metadata *File,
Metadata *Elements, StorageType Storage,
bool ShouldCreate) {
- DEFINE_GETIMPL_LOOKUP(DIMacroFile,
- (MIType, Line, File, Elements));
- Metadata *Ops[] = { File, Elements };
+ DEFINE_GETIMPL_LOOKUP(DIMacroFile, (MIType, Line, File, Elements));
+ Metadata *Ops[] = {File, Elements};
DEFINE_GETIMPL_STORE(DIMacroFile, (MIType, Line), Ops);
}
@@ -1592,6 +1651,12 @@ void DIArgList::handleChangedOperand(void *Ref, Metadata *New) {
assert((!New || isa<ValueAsMetadata>(New)) &&
"DIArgList must be passed a ValueAsMetadata");
untrack();
+ bool Uniq = isUniqued();
+ if (Uniq) {
+ // We need to update the uniqueness once the Args are updated since they
+ // form the key to the DIArgLists store.
+ eraseFromStore();
+ }
ValueAsMetadata *NewVM = cast_or_null<ValueAsMetadata>(New);
for (ValueAsMetadata *&VM : Args) {
if (&VM == OldVMPtr) {
@@ -1601,6 +1666,10 @@ void DIArgList::handleChangedOperand(void *Ref, Metadata *New) {
VM = ValueAsMetadata::get(UndefValue::get(VM->getValue()->getType()));
}
}
+ if (Uniq) {
+ if (uniquify() != this)
+ storeDistinctInContext();
+ }
track();
}
void DIArgList::track() {
diff --git a/llvm/lib/IR/DiagnosticHandler.cpp b/llvm/lib/IR/DiagnosticHandler.cpp
index 2fe634803894..7b40728a34e8 100644
--- a/llvm/lib/IR/DiagnosticHandler.cpp
+++ b/llvm/lib/IR/DiagnosticHandler.cpp
@@ -30,7 +30,7 @@ struct PassRemarksOpt {
Pattern = std::make_shared<Regex>(Val);
std::string RegexError;
if (!Pattern->isValid(RegexError))
- report_fatal_error("Invalid regular expression '" + Val +
+ report_fatal_error(Twine("Invalid regular expression '") + Val +
"' in -pass-remarks: " + RegexError,
false);
}
diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp
index f92138274801..0a872a81f911 100644
--- a/llvm/lib/IR/DiagnosticInfo.cpp
+++ b/llvm/lib/IR/DiagnosticInfo.cpp
@@ -1,4 +1,4 @@
-//===- llvm/Support/DiagnosticInfo.cpp - Diagnostic Definitions -*- C++ -*-===//
+//===- llvm/IR/DiagnosticInfo.cpp - Diagnostic Definitions ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -70,10 +70,8 @@ void DiagnosticInfoInlineAsm::print(DiagnosticPrinter &DP) const {
}
void DiagnosticInfoResourceLimit::print(DiagnosticPrinter &DP) const {
- DP << getResourceName() << " (" << getResourceSize() << ") exceeds limit";
- if (getResourceLimit() != 0)
- DP << " (" << getResourceLimit() << ')';
- DP << " in function '" << getFunction() << '\'';
+ DP << getResourceName() << " (" << getResourceSize() << ") exceeds limit ("
+ << getResourceLimit() << ") in function '" << getFunction() << '\'';
}
void DiagnosticInfoDebugMetadataVersion::print(DiagnosticPrinter &DP) const {
@@ -401,3 +399,35 @@ std::string DiagnosticInfoOptimizationBase::getMsg() const {
void OptimizationRemarkAnalysisFPCommute::anchor() {}
void OptimizationRemarkAnalysisAliasing::anchor() {}
+
+void llvm::diagnoseDontCall(const CallInst &CI) {
+ auto *F = CI.getCalledFunction();
+ if (!F)
+ return;
+
+ for (int i = 0; i != 2; ++i) {
+ auto AttrName = i == 0 ? "dontcall-error" : "dontcall-warn";
+ auto Sev = i == 0 ? DS_Error : DS_Warning;
+
+ if (F->hasFnAttribute(AttrName)) {
+ unsigned LocCookie = 0;
+ auto A = F->getFnAttribute(AttrName);
+ if (MDNode *MD = CI.getMetadata("srcloc"))
+ LocCookie =
+ mdconst::extract<ConstantInt>(MD->getOperand(0))->getZExtValue();
+ DiagnosticInfoDontCall D(F->getName(), A.getValueAsString(), Sev,
+ LocCookie);
+ F->getContext().diagnose(D);
+ }
+ }
+}
+
+void DiagnosticInfoDontCall::print(DiagnosticPrinter &DP) const {
+ DP << "call to " << getFunctionName() << " marked \"dontcall-";
+ if (getSeverity() == DiagnosticSeverity::DS_Error)
+ DP << "error\"";
+ else
+ DP << "warn\"";
+ if (!getNote().empty())
+ DP << ": " << getNote();
+}
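diagnoseDontCall turns the "dontcall-error"/"dontcall-warn" string attributes on a callee into error or warning diagnostics at the call site. A hedged sketch (F and CI are an existing Function* and CallInst*; the attribute value is illustrative):

  F->addFnAttr("dontcall-error", "use the checked wrapper instead");
  // When a call CI whose callee is F gets lowered:
  diagnoseDontCall(*CI);
  // Emits: call to <F's name> marked "dontcall-error": use the checked wrapper instead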
diff --git a/llvm/lib/IR/DiagnosticPrinter.cpp b/llvm/lib/IR/DiagnosticPrinter.cpp
index 496bd18e78e2..49b8bbae53be 100644
--- a/llvm/lib/IR/DiagnosticPrinter.cpp
+++ b/llvm/lib/IR/DiagnosticPrinter.cpp
@@ -1,4 +1,4 @@
-//===- llvm/Support/DiagnosticInfo.cpp - Diagnostic Definitions -*- C++ -*-===//
+//===- llvm/IR/DiagnosticPrinter.cpp - Diagnostic Printer -------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/IR/FPEnv.cpp b/llvm/lib/IR/FPEnv.cpp
index 516c702acec7..c6e0938e71a6 100644
--- a/llvm/lib/IR/FPEnv.cpp
+++ b/llvm/lib/IR/FPEnv.cpp
@@ -17,7 +17,7 @@
namespace llvm {
-Optional<RoundingMode> StrToRoundingMode(StringRef RoundingArg) {
+Optional<RoundingMode> convertStrToRoundingMode(StringRef RoundingArg) {
// For dynamic rounding mode, we use round to nearest but we will set the
// 'exact' SDNodeFlag so that the value will not be rounded.
return StringSwitch<Optional<RoundingMode>>(RoundingArg)
@@ -30,7 +30,7 @@ Optional<RoundingMode> StrToRoundingMode(StringRef RoundingArg) {
.Default(None);
}
-Optional<StringRef> RoundingModeToStr(RoundingMode UseRounding) {
+Optional<StringRef> convertRoundingModeToStr(RoundingMode UseRounding) {
Optional<StringRef> RoundingStr = None;
switch (UseRounding) {
case RoundingMode::Dynamic:
@@ -57,7 +57,8 @@ Optional<StringRef> RoundingModeToStr(RoundingMode UseRounding) {
return RoundingStr;
}
-Optional<fp::ExceptionBehavior> StrToExceptionBehavior(StringRef ExceptionArg) {
+Optional<fp::ExceptionBehavior>
+convertStrToExceptionBehavior(StringRef ExceptionArg) {
return StringSwitch<Optional<fp::ExceptionBehavior>>(ExceptionArg)
.Case("fpexcept.ignore", fp::ebIgnore)
.Case("fpexcept.maytrap", fp::ebMayTrap)
@@ -65,7 +66,8 @@ Optional<fp::ExceptionBehavior> StrToExceptionBehavior(StringRef ExceptionArg) {
.Default(None);
}
-Optional<StringRef> ExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) {
+Optional<StringRef>
+convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) {
Optional<StringRef> ExceptStr = None;
switch (UseExcept) {
case fp::ebStrict:
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 4034b1505bd0..82b20a8af91b 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -140,25 +140,25 @@ bool Argument::hasPreallocatedAttr() const {
bool Argument::hasPassPointeeByValueCopyAttr() const {
if (!getType()->isPointerTy()) return false;
AttributeList Attrs = getParent()->getAttributes();
- return Attrs.hasParamAttribute(getArgNo(), Attribute::ByVal) ||
- Attrs.hasParamAttribute(getArgNo(), Attribute::InAlloca) ||
- Attrs.hasParamAttribute(getArgNo(), Attribute::Preallocated);
+ return Attrs.hasParamAttr(getArgNo(), Attribute::ByVal) ||
+ Attrs.hasParamAttr(getArgNo(), Attribute::InAlloca) ||
+ Attrs.hasParamAttr(getArgNo(), Attribute::Preallocated);
}
bool Argument::hasPointeeInMemoryValueAttr() const {
if (!getType()->isPointerTy())
return false;
AttributeList Attrs = getParent()->getAttributes();
- return Attrs.hasParamAttribute(getArgNo(), Attribute::ByVal) ||
- Attrs.hasParamAttribute(getArgNo(), Attribute::StructRet) ||
- Attrs.hasParamAttribute(getArgNo(), Attribute::InAlloca) ||
- Attrs.hasParamAttribute(getArgNo(), Attribute::Preallocated) ||
- Attrs.hasParamAttribute(getArgNo(), Attribute::ByRef);
+ return Attrs.hasParamAttr(getArgNo(), Attribute::ByVal) ||
+ Attrs.hasParamAttr(getArgNo(), Attribute::StructRet) ||
+ Attrs.hasParamAttr(getArgNo(), Attribute::InAlloca) ||
+ Attrs.hasParamAttr(getArgNo(), Attribute::Preallocated) ||
+ Attrs.hasParamAttr(getArgNo(), Attribute::ByRef);
}
/// For a byval, sret, inalloca, or preallocated parameter, get the in-memory
/// parameter type.
-static Type *getMemoryParamAllocType(AttributeSet ParamAttrs, Type *ArgTy) {
+static Type *getMemoryParamAllocType(AttributeSet ParamAttrs) {
// FIXME: All the type carrying attributes are mutually exclusive, so there
// should be a single query to get the stored type that handles any of them.
if (Type *ByValTy = ParamAttrs.getByValType())
@@ -177,19 +177,19 @@ static Type *getMemoryParamAllocType(AttributeSet ParamAttrs, Type *ArgTy) {
uint64_t Argument::getPassPointeeByValueCopySize(const DataLayout &DL) const {
AttributeSet ParamAttrs =
- getParent()->getAttributes().getParamAttributes(getArgNo());
- if (Type *MemTy = getMemoryParamAllocType(ParamAttrs, getType()))
+ getParent()->getAttributes().getParamAttrs(getArgNo());
+ if (Type *MemTy = getMemoryParamAllocType(ParamAttrs))
return DL.getTypeAllocSize(MemTy);
return 0;
}
Type *Argument::getPointeeInMemoryValueType() const {
AttributeSet ParamAttrs =
- getParent()->getAttributes().getParamAttributes(getArgNo());
- return getMemoryParamAllocType(ParamAttrs, getType());
+ getParent()->getAttributes().getParamAttrs(getArgNo());
+ return getMemoryParamAllocType(ParamAttrs);
}
-unsigned Argument::getParamAlignment() const {
+uint64_t Argument::getParamAlignment() const {
assert(getType()->isPointerTy() && "Only pointers have alignments");
return getParent()->getParamAlignment(getArgNo());
}
@@ -278,8 +278,8 @@ bool Argument::hasSExtAttr() const {
bool Argument::onlyReadsMemory() const {
AttributeList Attrs = getParent()->getAttributes();
- return Attrs.hasParamAttribute(getArgNo(), Attribute::ReadOnly) ||
- Attrs.hasParamAttribute(getArgNo(), Attribute::ReadNone);
+ return Attrs.hasParamAttr(getArgNo(), Attribute::ReadOnly) ||
+ Attrs.hasParamAttr(getArgNo(), Attribute::ReadNone);
}
void Argument::addAttrs(AttrBuilder &B) {
@@ -354,7 +354,7 @@ Function *Function::createWithDefaultAttr(FunctionType *Ty,
B.addAttribute("frame-pointer", "all");
break;
}
- F->addAttributes(AttributeList::FunctionIndex, B);
+ F->addFnAttrs(B);
return F;
}
@@ -529,101 +529,144 @@ void Function::dropAllReferences() {
clearMetadata();
}
-void Function::addAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addAttribute(getContext(), i, Kind);
- setAttributes(PAL);
+void Function::addAttributeAtIndex(unsigned i, Attribute Attr) {
+ AttributeSets = AttributeSets.addAttributeAtIndex(getContext(), i, Attr);
}
-void Function::addAttribute(unsigned i, Attribute Attr) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addAttribute(getContext(), i, Attr);
- setAttributes(PAL);
+void Function::addFnAttr(Attribute::AttrKind Kind) {
+ AttributeSets = AttributeSets.addFnAttribute(getContext(), Kind);
}
-void Function::addAttributes(unsigned i, const AttrBuilder &Attrs) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addAttributes(getContext(), i, Attrs);
- setAttributes(PAL);
+void Function::addFnAttr(StringRef Kind, StringRef Val) {
+ AttributeSets = AttributeSets.addFnAttribute(getContext(), Kind, Val);
+}
+
+void Function::addFnAttr(Attribute Attr) {
+ AttributeSets = AttributeSets.addFnAttribute(getContext(), Attr);
+}
+
+void Function::addFnAttrs(const AttrBuilder &Attrs) {
+ AttributeSets = AttributeSets.addFnAttributes(getContext(), Attrs);
+}
+
+void Function::addRetAttr(Attribute::AttrKind Kind) {
+ AttributeSets = AttributeSets.addRetAttribute(getContext(), Kind);
+}
+
+void Function::addRetAttr(Attribute Attr) {
+ AttributeSets = AttributeSets.addRetAttribute(getContext(), Attr);
+}
+
+void Function::addRetAttrs(const AttrBuilder &Attrs) {
+ AttributeSets = AttributeSets.addRetAttributes(getContext(), Attrs);
}
void Function::addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addParamAttribute(getContext(), ArgNo, Kind);
- setAttributes(PAL);
+ AttributeSets = AttributeSets.addParamAttribute(getContext(), ArgNo, Kind);
}
void Function::addParamAttr(unsigned ArgNo, Attribute Attr) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addParamAttribute(getContext(), ArgNo, Attr);
- setAttributes(PAL);
+ AttributeSets = AttributeSets.addParamAttribute(getContext(), ArgNo, Attr);
}
void Function::addParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addParamAttributes(getContext(), ArgNo, Attrs);
- setAttributes(PAL);
+ AttributeSets = AttributeSets.addParamAttributes(getContext(), ArgNo, Attrs);
}
-void Function::removeAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeList PAL = getAttributes();
- PAL = PAL.removeAttribute(getContext(), i, Kind);
- setAttributes(PAL);
+void Function::removeAttributeAtIndex(unsigned i, Attribute::AttrKind Kind) {
+ AttributeSets = AttributeSets.removeAttributeAtIndex(getContext(), i, Kind);
}
-void Function::removeAttribute(unsigned i, StringRef Kind) {
- AttributeList PAL = getAttributes();
- PAL = PAL.removeAttribute(getContext(), i, Kind);
- setAttributes(PAL);
+void Function::removeAttributeAtIndex(unsigned i, StringRef Kind) {
+ AttributeSets = AttributeSets.removeAttributeAtIndex(getContext(), i, Kind);
}
-void Function::removeAttributes(unsigned i, const AttrBuilder &Attrs) {
- AttributeList PAL = getAttributes();
- PAL = PAL.removeAttributes(getContext(), i, Attrs);
- setAttributes(PAL);
+void Function::removeFnAttr(Attribute::AttrKind Kind) {
+ AttributeSets = AttributeSets.removeFnAttribute(getContext(), Kind);
+}
+
+void Function::removeFnAttr(StringRef Kind) {
+ AttributeSets = AttributeSets.removeFnAttribute(getContext(), Kind);
+}
+
+void Function::removeFnAttrs(const AttrBuilder &Attrs) {
+ AttributeSets = AttributeSets.removeFnAttributes(getContext(), Attrs);
+}
+
+void Function::removeRetAttr(Attribute::AttrKind Kind) {
+ AttributeSets = AttributeSets.removeRetAttribute(getContext(), Kind);
+}
+
+void Function::removeRetAttr(StringRef Kind) {
+ AttributeSets = AttributeSets.removeRetAttribute(getContext(), Kind);
+}
+
+void Function::removeRetAttrs(const AttrBuilder &Attrs) {
+ AttributeSets = AttributeSets.removeRetAttributes(getContext(), Attrs);
}
void Function::removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
- AttributeList PAL = getAttributes();
- PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind);
- setAttributes(PAL);
+ AttributeSets = AttributeSets.removeParamAttribute(getContext(), ArgNo, Kind);
}
void Function::removeParamAttr(unsigned ArgNo, StringRef Kind) {
- AttributeList PAL = getAttributes();
- PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind);
- setAttributes(PAL);
+ AttributeSets = AttributeSets.removeParamAttribute(getContext(), ArgNo, Kind);
}
void Function::removeParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs) {
- AttributeList PAL = getAttributes();
- PAL = PAL.removeParamAttributes(getContext(), ArgNo, Attrs);
- setAttributes(PAL);
+ AttributeSets =
+ AttributeSets.removeParamAttributes(getContext(), ArgNo, Attrs);
+}
+
+void Function::addDereferenceableParamAttr(unsigned ArgNo, uint64_t Bytes) {
+ AttributeSets =
+ AttributeSets.addDereferenceableParamAttr(getContext(), ArgNo, Bytes);
}
-void Function::addDereferenceableAttr(unsigned i, uint64_t Bytes) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
- setAttributes(PAL);
+bool Function::hasFnAttribute(Attribute::AttrKind Kind) const {
+ return AttributeSets.hasFnAttr(Kind);
}
-void Function::addDereferenceableParamAttr(unsigned ArgNo, uint64_t Bytes) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addDereferenceableParamAttr(getContext(), ArgNo, Bytes);
- setAttributes(PAL);
+bool Function::hasFnAttribute(StringRef Kind) const {
+ return AttributeSets.hasFnAttr(Kind);
+}
+
+bool Function::hasRetAttribute(Attribute::AttrKind Kind) const {
+ return AttributeSets.hasRetAttr(Kind);
+}
+
+bool Function::hasParamAttribute(unsigned ArgNo,
+ Attribute::AttrKind Kind) const {
+ return AttributeSets.hasParamAttr(ArgNo, Kind);
+}
+
+Attribute Function::getAttributeAtIndex(unsigned i,
+ Attribute::AttrKind Kind) const {
+ return AttributeSets.getAttributeAtIndex(i, Kind);
+}
+
+Attribute Function::getAttributeAtIndex(unsigned i, StringRef Kind) const {
+ return AttributeSets.getAttributeAtIndex(i, Kind);
}
-void Function::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes);
- setAttributes(PAL);
+Attribute Function::getFnAttribute(Attribute::AttrKind Kind) const {
+ return AttributeSets.getFnAttr(Kind);
+}
+
+Attribute Function::getFnAttribute(StringRef Kind) const {
+ return AttributeSets.getFnAttr(Kind);
+}
+
+/// gets the specified attribute from the list of attributes.
+Attribute Function::getParamAttribute(unsigned ArgNo,
+ Attribute::AttrKind Kind) const {
+ return AttributeSets.getParamAttr(ArgNo, Kind);
}
void Function::addDereferenceableOrNullParamAttr(unsigned ArgNo,
uint64_t Bytes) {
- AttributeList PAL = getAttributes();
- PAL = PAL.addDereferenceableOrNullParamAttr(getContext(), ArgNo, Bytes);
- setAttributes(PAL);
+ AttributeSets = AttributeSets.addDereferenceableOrNullParamAttr(getContext(),
+ ArgNo, Bytes);
}
DenormalMode Function::getDenormalMode(const fltSemantics &FPType) const {
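The block above replaces the index-based Function attribute mutators with purpose-named helpers that assign straight into AttributeSets. A hedged before/after sketch (F is an existing Function*):

  F->addFnAttr(Attribute::NoUnwind);   // was addAttribute(AttributeList::FunctionIndex, ...)
  F->addRetAttr(Attribute::NonNull);   // was addAttribute(AttributeList::ReturnIndex, ...)
  F->removeFnAttr("frame-pointer");    // was removeAttribute(FunctionIndex, "frame-pointer")
  bool RO = F->getAttributes().hasParamAttr(0, Attribute::ReadOnly); // was hasParamAttribute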
@@ -936,7 +979,8 @@ enum IIT_Info {
IIT_BF16 = 48,
IIT_STRUCT9 = 49,
IIT_V256 = 50,
- IIT_AMX = 51
+ IIT_AMX = 51,
+ IIT_PPCF128 = 52
};
static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
@@ -983,6 +1027,9 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
case IIT_F128:
OutputTable.push_back(IITDescriptor::get(IITDescriptor::Quad, 0));
return;
+ case IIT_PPCF128:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::PPCQuad, 0));
+ return;
case IIT_I1:
OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 1));
return;
@@ -1207,6 +1254,7 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
case IITDescriptor::Float: return Type::getFloatTy(Context);
case IITDescriptor::Double: return Type::getDoubleTy(Context);
case IITDescriptor::Quad: return Type::getFP128Ty(Context);
+ case IITDescriptor::PPCQuad: return Type::getPPC_FP128Ty(Context);
case IITDescriptor::Integer:
return IntegerType::get(Context, D.Integer_Width);
@@ -1389,6 +1437,7 @@ static bool matchIntrinsicType(
case IITDescriptor::Float: return !Ty->isFloatTy();
case IITDescriptor::Double: return !Ty->isDoubleTy();
case IITDescriptor::Quad: return !Ty->isFP128Ty();
+ case IITDescriptor::PPCQuad: return !Ty->isPPC_FP128Ty();
case IITDescriptor::Integer: return !Ty->isIntegerTy(D.Integer_Width);
case IITDescriptor::Vector: {
VectorType *VT = dyn_cast<VectorType>(Ty);
@@ -1403,11 +1452,6 @@ static bool matchIntrinsicType(
if (!PT->isOpaque())
return matchIntrinsicType(PT->getElementType(), Infos, ArgTys,
DeferredChecks, IsDeferredCheck);
- // If typed pointers are supported, do not allow using opaque pointer in
- // place of fixed pointer type. This would make the intrinsic signature
- // non-unique.
- if (Ty->getContext().supportsTypedPointers())
- return true;
// Consume IIT descriptors relating to the pointer element type.
while (Infos.front().Kind == IITDescriptor::Pointer)
Infos = Infos.slice(1);
@@ -1525,11 +1569,8 @@ static bool matchIntrinsicType(
if (!ThisArgType || !ReferenceType)
return true;
- if (!ThisArgType->isOpaque())
- return ThisArgType->getElementType() != ReferenceType->getElementType();
- // If typed pointers are supported, do not allow opaque pointer to ensure
- // uniqueness.
- return Ty->getContext().supportsTypedPointers();
+ return !ThisArgType->isOpaqueOrPointeeTypeMatches(
+ ReferenceType->getElementType());
}
case IITDescriptor::VecOfAnyPtrsToElt: {
unsigned RefArgNumber = D.getRefArgNumber();
@@ -1702,8 +1743,8 @@ Optional<Function *> Intrinsic::remangleIntrinsicFunction(Function *F) {
/// and llvm.compiler.used variables.
bool Function::hasAddressTaken(const User **PutOffender,
bool IgnoreCallbackUses,
- bool IgnoreAssumeLikeCalls,
- bool IgnoreLLVMUsed) const {
+ bool IgnoreAssumeLikeCalls, bool IgnoreLLVMUsed,
+ bool IgnoreARCAttachedCall) const {
for (const Use &U : uses()) {
const User *FU = U.getUser();
if (isa<BlockAddress>(FU))
@@ -1747,6 +1788,11 @@ bool Function::hasAddressTaken(const User **PutOffender,
return true;
}
if (!Call->isCallee(&U)) {
+ if (IgnoreARCAttachedCall &&
+ Call->isOperandBundleOfType(LLVMContext::OB_clang_arc_attachedcall,
+ U.getOperandNo()))
+ continue;
+
if (PutOffender)
*PutOffender = FU;
return true;
@@ -1846,10 +1892,9 @@ void Function::setValueSubclassDataBit(unsigned Bit, bool On) {
void Function::setEntryCount(ProfileCount Count,
const DenseSet<GlobalValue::GUID> *S) {
- assert(Count.hasValue());
#if !defined(NDEBUG)
auto PrevCount = getEntryCount();
- assert(!PrevCount.hasValue() || PrevCount.getType() == Count.getType());
+ assert(!PrevCount.hasValue() || PrevCount->getType() == Count.getType());
#endif
auto ImportGUIDs = getImportGUIDs();
@@ -1867,7 +1912,7 @@ void Function::setEntryCount(uint64_t Count, Function::ProfileCountType Type,
setEntryCount(ProfileCount(Count, Type), Imports);
}
-ProfileCount Function::getEntryCount(bool AllowSynthetic) const {
+Optional<ProfileCount> Function::getEntryCount(bool AllowSynthetic) const {
MDNode *MD = getMetadata(LLVMContext::MD_prof);
if (MD && MD->getOperand(0))
if (MDString *MDS = dyn_cast<MDString>(MD->getOperand(0))) {
@@ -1877,7 +1922,7 @@ ProfileCount Function::getEntryCount(bool AllowSynthetic) const {
// A value of -1 is used for SamplePGO when there were no samples.
// Treat this the same as unknown.
if (Count == (uint64_t)-1)
- return ProfileCount::getInvalid();
+ return None;
return ProfileCount(Count, PCT_Real);
} else if (AllowSynthetic &&
MDS->getString().equals("synthetic_function_entry_count")) {
@@ -1886,7 +1931,7 @@ ProfileCount Function::getEntryCount(bool AllowSynthetic) const {
return ProfileCount(Count, PCT_Synthetic);
}
}
- return ProfileCount::getInvalid();
+ return None;
}
DenseSet<GlobalValue::GUID> Function::getImportGUIDs() const {
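getEntryCount now reports a missing or invalid profile count as None instead of an "invalid" ProfileCount sentinel, so callers test the Optional directly. Hedged sketch:

  if (Optional<Function::ProfileCount> Count = F->getEntryCount())
    errs() << F->getName() << " entry count: " << Count->getCount() << "\n";
  // else: no profile data (or the SamplePGO -1 marker)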
diff --git a/llvm/lib/IR/GCStrategy.cpp b/llvm/lib/IR/GCStrategy.cpp
index 25dad5bec9ef..f3bc5b74f8fd 100644
--- a/llvm/lib/IR/GCStrategy.cpp
+++ b/llvm/lib/IR/GCStrategy.cpp
@@ -18,3 +18,21 @@ using namespace llvm;
LLVM_INSTANTIATE_REGISTRY(GCRegistry)
GCStrategy::GCStrategy() = default;
+
+std::unique_ptr<GCStrategy> llvm::getGCStrategy(const StringRef Name) {
+ for (auto &S : GCRegistry::entries())
+ if (S.getName() == Name)
+ return S.instantiate();
+
+ if (GCRegistry::begin() == GCRegistry::end()) {
+ // In normal operation, the registry should not be empty. There should
+ // be the builtin GCs if nothing else. The most likely scenario here is
+ // that we got here without running the initializers used by the Registry
+ // itself and its registration mechanism.
+ const std::string error =
+ std::string("unsupported GC: ") + Name.str() +
+ " (did you remember to link and initialize the library?)";
+ report_fatal_error(error);
+ } else
+ report_fatal_error(std::string("unsupported GC: ") + Name.str());
+}
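llvm::getGCStrategy resolves a GC name, typically the value of a function's "gc" attribute, against the GCRegistry and reports a fatal error for unknown names. Hedged sketch:

  // "statepoint-example" is assumed here to be one of the registered built-in
  // strategies; an unknown name takes the report_fatal_error paths above.
  std::unique_ptr<GCStrategy> GC = getGCStrategy("statepoint-example");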
diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp
index b1c6dcc6672d..9f38288095e3 100644
--- a/llvm/lib/IR/Globals.cpp
+++ b/llvm/lib/IR/Globals.cpp
@@ -162,7 +162,7 @@ std::string GlobalValue::getGlobalIdentifier() const {
StringRef GlobalValue::getSection() const {
if (auto *GA = dyn_cast<GlobalAlias>(this)) {
// In general we cannot compute this at the IR level, but we try.
- if (const GlobalObject *GO = GA->getBaseObject())
+ if (const GlobalObject *GO = GA->getAliaseeObject())
return GO->getSection();
return "";
}
@@ -172,7 +172,7 @@ StringRef GlobalValue::getSection() const {
const Comdat *GlobalValue::getComdat() const {
if (auto *GA = dyn_cast<GlobalAlias>(this)) {
// In general we cannot compute this at the IR level, but we try.
- if (const GlobalObject *GO = GA->getBaseObject())
+ if (const GlobalObject *GO = GA->getAliaseeObject())
return const_cast<GlobalObject *>(GO)->getComdat();
return nullptr;
}
@@ -235,7 +235,7 @@ bool GlobalValue::isDeclaration() const {
return F->empty() && !F->isMaterializable();
// Aliases and ifuncs are always definitions.
- assert(isa<GlobalIndirectSymbol>(this));
+ assert(isa<GlobalAlias>(this) || isa<GlobalIFunc>(this));
return false;
}
@@ -280,14 +280,44 @@ bool GlobalObject::canIncreaseAlignment() const {
return true;
}
-const GlobalObject *GlobalValue::getBaseObject() const {
- if (auto *GO = dyn_cast<GlobalObject>(this))
+static const GlobalObject *
+findBaseObject(const Constant *C, DenseSet<const GlobalAlias *> &Aliases) {
+ if (auto *GO = dyn_cast<GlobalObject>(C))
return GO;
- if (auto *GA = dyn_cast<GlobalIndirectSymbol>(this))
- return GA->getBaseObject();
+ if (auto *GA = dyn_cast<GlobalAlias>(C))
+ if (Aliases.insert(GA).second)
+ return findBaseObject(GA->getOperand(0), Aliases);
+ if (auto *CE = dyn_cast<ConstantExpr>(C)) {
+ switch (CE->getOpcode()) {
+ case Instruction::Add: {
+ auto *LHS = findBaseObject(CE->getOperand(0), Aliases);
+ auto *RHS = findBaseObject(CE->getOperand(1), Aliases);
+ if (LHS && RHS)
+ return nullptr;
+ return LHS ? LHS : RHS;
+ }
+ case Instruction::Sub: {
+ if (findBaseObject(CE->getOperand(1), Aliases))
+ return nullptr;
+ return findBaseObject(CE->getOperand(0), Aliases);
+ }
+ case Instruction::IntToPtr:
+ case Instruction::PtrToInt:
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ return findBaseObject(CE->getOperand(0), Aliases);
+ default:
+ break;
+ }
+ }
return nullptr;
}
+const GlobalObject *GlobalValue::getAliaseeObject() const {
+ DenseSet<const GlobalAlias *> Aliases;
+ return findBaseObject(this, Aliases);
+}
+
bool GlobalValue::isAbsoluteSymbolRef() const {
auto *GO = dyn_cast<GlobalObject>(this);
if (!GO)
@@ -421,63 +451,15 @@ void GlobalVariable::dropAllReferences() {
}
//===----------------------------------------------------------------------===//
-// GlobalIndirectSymbol Implementation
-//===----------------------------------------------------------------------===//
-
-GlobalIndirectSymbol::GlobalIndirectSymbol(Type *Ty, ValueTy VTy,
- unsigned AddressSpace, LinkageTypes Linkage, const Twine &Name,
- Constant *Symbol)
- : GlobalValue(Ty, VTy, &Op<0>(), 1, Linkage, Name, AddressSpace) {
- Op<0>() = Symbol;
-}
-
-static const GlobalObject *
-findBaseObject(const Constant *C, DenseSet<const GlobalAlias *> &Aliases) {
- if (auto *GO = dyn_cast<GlobalObject>(C))
- return GO;
- if (auto *GA = dyn_cast<GlobalAlias>(C))
- if (Aliases.insert(GA).second)
- return findBaseObject(GA->getOperand(0), Aliases);
- if (auto *CE = dyn_cast<ConstantExpr>(C)) {
- switch (CE->getOpcode()) {
- case Instruction::Add: {
- auto *LHS = findBaseObject(CE->getOperand(0), Aliases);
- auto *RHS = findBaseObject(CE->getOperand(1), Aliases);
- if (LHS && RHS)
- return nullptr;
- return LHS ? LHS : RHS;
- }
- case Instruction::Sub: {
- if (findBaseObject(CE->getOperand(1), Aliases))
- return nullptr;
- return findBaseObject(CE->getOperand(0), Aliases);
- }
- case Instruction::IntToPtr:
- case Instruction::PtrToInt:
- case Instruction::BitCast:
- case Instruction::GetElementPtr:
- return findBaseObject(CE->getOperand(0), Aliases);
- default:
- break;
- }
- }
- return nullptr;
-}
-
-const GlobalObject *GlobalIndirectSymbol::getBaseObject() const {
- DenseSet<const GlobalAlias *> Aliases;
- return findBaseObject(getOperand(0), Aliases);
-}
-
-//===----------------------------------------------------------------------===//
// GlobalAlias Implementation
//===----------------------------------------------------------------------===//
GlobalAlias::GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Link,
const Twine &Name, Constant *Aliasee,
Module *ParentModule)
- : GlobalIndirectSymbol(Ty, Value::GlobalAliasVal, AddressSpace, Link, Name,
- Aliasee) {
+ : GlobalValue(Ty, Value::GlobalAliasVal, &Op<0>(), 1, Link, Name,
+ AddressSpace) {
+ setAliasee(Aliasee);
if (ParentModule)
ParentModule->getAliasList().push_back(this);
}
@@ -521,7 +503,12 @@ void GlobalAlias::eraseFromParent() {
void GlobalAlias::setAliasee(Constant *Aliasee) {
assert((!Aliasee || Aliasee->getType() == getType()) &&
"Alias and aliasee types should match!");
- setIndirectSymbol(Aliasee);
+ Op<0>().set(Aliasee);
+}
+
+const GlobalObject *GlobalAlias::getAliaseeObject() const {
+ DenseSet<const GlobalAlias *> Aliases;
+ return findBaseObject(getOperand(0), Aliases);
}
//===----------------------------------------------------------------------===//
@@ -531,8 +518,9 @@ void GlobalAlias::setAliasee(Constant *Aliasee) {
GlobalIFunc::GlobalIFunc(Type *Ty, unsigned AddressSpace, LinkageTypes Link,
const Twine &Name, Constant *Resolver,
Module *ParentModule)
- : GlobalIndirectSymbol(Ty, Value::GlobalIFuncVal, AddressSpace, Link, Name,
- Resolver) {
+ : GlobalObject(Ty, Value::GlobalIFuncVal, &Op<0>(), 1, Link, Name,
+ AddressSpace) {
+ setResolver(Resolver);
if (ParentModule)
ParentModule->getIFuncList().push_back(this);
}
@@ -550,3 +538,8 @@ void GlobalIFunc::removeFromParent() {
void GlobalIFunc::eraseFromParent() {
getParent()->getIFuncList().erase(getIterator());
}
+
+const Function *GlobalIFunc::getResolverFunction() const {
+ DenseSet<const GlobalAlias *> Aliases;
+ return dyn_cast<Function>(findBaseObject(getResolver(), Aliases));
+}
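With GlobalIndirectSymbol removed, aliases and ifuncs are plain GlobalValue/GlobalObject subclasses and their targets are queried directly. Hedged sketch (GA and IF are an existing GlobalAlias* and GlobalIFunc*):

  if (const GlobalObject *GO = GA->getAliaseeObject())       // replaces getBaseObject()
    errs() << "alias target: " << GO->getName() << "\n";
  if (const Function *Resolver = IF->getResolverFunction())  // looks through casts and aliases
    errs() << "ifunc resolver: " << Resolver->getName() << "\n";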
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 0f4945bad5ab..98f6ccf81973 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -94,11 +94,22 @@ Value *IRBuilderBase::CreateVScale(Constant *Scaling, const Twine &Name) {
}
Value *IRBuilderBase::CreateStepVector(Type *DstType, const Twine &Name) {
- if (isa<ScalableVectorType>(DstType))
- return CreateIntrinsic(Intrinsic::experimental_stepvector, {DstType}, {},
- nullptr, Name);
-
Type *STy = DstType->getScalarType();
+ if (isa<ScalableVectorType>(DstType)) {
+ Type *StepVecType = DstType;
+ // TODO: We expect this special case (element type < 8 bits) to be
+ // temporary - once the intrinsic properly supports < 8 bits this code
+ // can be removed.
+ if (STy->getScalarSizeInBits() < 8)
+ StepVecType =
+ VectorType::get(getInt8Ty(), cast<ScalableVectorType>(DstType));
+ Value *Res = CreateIntrinsic(Intrinsic::experimental_stepvector,
+ {StepVecType}, {}, nullptr, Name);
+ if (StepVecType != DstType)
+ Res = CreateTrunc(Res, DstType);
+ return Res;
+ }
+
unsigned NumEls = cast<FixedVectorType>(DstType)->getNumElements();
// Create a vector of consecutive numbers from zero to VF.
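The scalable-vector branch above works around the stepvector intrinsic's lack of sub-byte element support by building an i8 step vector and truncating. Hedged sketch (Builder is an existing IRBuilder<>):

  // For <vscale x 16 x i1> this now emits an i8 llvm.experimental.stepvector
  // followed by a trunc back to i1.
  Value *Steps = Builder.CreateStepVector(
      ScalableVectorType::get(Builder.getInt1Ty(), /*MinNumElts=*/16));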
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 937dc6957806..a4659da7e807 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -141,6 +141,10 @@ bool Instruction::hasNoSignedWrap() const {
return cast<OverflowingBinaryOperator>(this)->hasNoSignedWrap();
}
+bool Instruction::hasPoisonGeneratingFlags() const {
+ return cast<Operator>(this)->hasPoisonGeneratingFlags();
+}
+
void Instruction::dropPoisonGeneratingFlags() {
switch (getOpcode()) {
case Instruction::Add:
@@ -163,6 +167,8 @@ void Instruction::dropPoisonGeneratingFlags() {
break;
}
// TODO: FastMathFlags!
+
+ assert(!hasPoisonGeneratingFlags() && "must be kept in sync");
}
void Instruction::dropUndefImplyingAttrsAndUnknownMetadata(
@@ -178,9 +184,9 @@ void Instruction::dropUndefImplyingAttrsAndUnknownMetadata(
if (AL.isEmpty())
return;
AttrBuilder UBImplyingAttributes = AttributeFuncs::getUBImplyingAttributes();
- for (unsigned ArgNo = 0; ArgNo < CB->getNumArgOperands(); ArgNo++)
+ for (unsigned ArgNo = 0; ArgNo < CB->arg_size(); ArgNo++)
CB->removeParamAttrs(ArgNo, UBImplyingAttributes);
- CB->removeAttributes(AttributeList::ReturnIndex, UBImplyingAttributes);
+ CB->removeRetAttrs(UBImplyingAttributes);
}
bool Instruction::isExact() const {
@@ -307,20 +313,20 @@ void Instruction::copyIRFlags(const Value *V, bool IncludeWrapFlags) {
if (auto *SrcGEP = dyn_cast<GetElementPtrInst>(V))
if (auto *DestGEP = dyn_cast<GetElementPtrInst>(this))
- DestGEP->setIsInBounds(SrcGEP->isInBounds() | DestGEP->isInBounds());
+ DestGEP->setIsInBounds(SrcGEP->isInBounds() || DestGEP->isInBounds());
}
void Instruction::andIRFlags(const Value *V) {
if (auto *OB = dyn_cast<OverflowingBinaryOperator>(V)) {
if (isa<OverflowingBinaryOperator>(this)) {
- setHasNoSignedWrap(hasNoSignedWrap() & OB->hasNoSignedWrap());
- setHasNoUnsignedWrap(hasNoUnsignedWrap() & OB->hasNoUnsignedWrap());
+ setHasNoSignedWrap(hasNoSignedWrap() && OB->hasNoSignedWrap());
+ setHasNoUnsignedWrap(hasNoUnsignedWrap() && OB->hasNoUnsignedWrap());
}
}
if (auto *PE = dyn_cast<PossiblyExactOperator>(V))
if (isa<PossiblyExactOperator>(this))
- setIsExact(isExact() & PE->isExact());
+ setIsExact(isExact() && PE->isExact());
if (auto *FP = dyn_cast<FPMathOperator>(V)) {
if (isa<FPMathOperator>(this)) {
@@ -332,7 +338,7 @@ void Instruction::andIRFlags(const Value *V) {
if (auto *SrcGEP = dyn_cast<GetElementPtrInst>(V))
if (auto *DestGEP = dyn_cast<GetElementPtrInst>(this))
- DestGEP->setIsInBounds(SrcGEP->isInBounds() & DestGEP->isInBounds());
+ DestGEP->setIsInBounds(SrcGEP->isInBounds() && DestGEP->isInBounds());
}
const char *Instruction::getOpcodeName(unsigned OpCode) {
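The new Instruction::hasPoisonGeneratingFlags() mirrors dropPoisonGeneratingFlags(), and the assert keeps the two in sync; andIRFlags()/copyIRFlags() now combine the boolean flags with logical rather than bitwise operators. A small illustrative sketch of the intended usage pattern when speculating an instruction:

    #include <cassert>
    #include "llvm/IR/Instruction.h"

    using namespace llvm;

    // Before hoisting I to a point where its operands may take other values,
    // strip nuw/nsw/exact/inbounds so the move cannot introduce new poison.
    static void hoistWithoutPoisonFlags(Instruction *I, Instruction *InsertPt) {
      if (I->hasPoisonGeneratingFlags())
        I->dropPoisonGeneratingFlags();
      assert(!I->hasPoisonGeneratingFlags() && "flags should be gone");
      I->moveBefore(InsertPt);
    }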
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 5b01c70dec8d..c42df49d97ea 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -318,9 +318,8 @@ bool CallBase::isReturnNonNull() const {
if (hasRetAttr(Attribute::NonNull))
return true;
- if (getDereferenceableBytes(AttributeList::ReturnIndex) > 0 &&
- !NullPointerIsDefined(getCaller(),
- getType()->getPointerAddressSpace()))
+ if (getRetDereferenceableBytes() > 0 &&
+ !NullPointerIsDefined(getCaller(), getType()->getPointerAddressSpace()))
return true;
return false;
@@ -329,11 +328,10 @@ bool CallBase::isReturnNonNull() const {
Value *CallBase::getReturnedArgOperand() const {
unsigned Index;
- if (Attrs.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
+ if (Attrs.hasAttrSomewhere(Attribute::Returned, &Index))
return getArgOperand(Index - AttributeList::FirstArgIndex);
if (const Function *F = getCalledFunction())
- if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) &&
- Index)
+ if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index))
return getArgOperand(Index - AttributeList::FirstArgIndex);
return nullptr;
@@ -341,24 +339,36 @@ Value *CallBase::getReturnedArgOperand() const {
/// Determine whether the argument or parameter has the given attribute.
bool CallBase::paramHasAttr(unsigned ArgNo, Attribute::AttrKind Kind) const {
- assert(ArgNo < getNumArgOperands() && "Param index out of bounds!");
+ assert(ArgNo < arg_size() && "Param index out of bounds!");
- if (Attrs.hasParamAttribute(ArgNo, Kind))
+ if (Attrs.hasParamAttr(ArgNo, Kind))
return true;
if (const Function *F = getCalledFunction())
- return F->getAttributes().hasParamAttribute(ArgNo, Kind);
+ return F->getAttributes().hasParamAttr(ArgNo, Kind);
return false;
}
bool CallBase::hasFnAttrOnCalledFunction(Attribute::AttrKind Kind) const {
- if (const Function *F = getCalledFunction())
- return F->getAttributes().hasFnAttribute(Kind);
+ Value *V = getCalledOperand();
+ if (auto *CE = dyn_cast<ConstantExpr>(V))
+ if (CE->getOpcode() == BitCast)
+ V = CE->getOperand(0);
+
+ if (auto *F = dyn_cast<Function>(V))
+ return F->getAttributes().hasFnAttr(Kind);
+
return false;
}
bool CallBase::hasFnAttrOnCalledFunction(StringRef Kind) const {
- if (const Function *F = getCalledFunction())
- return F->getAttributes().hasFnAttribute(Kind);
+ Value *V = getCalledOperand();
+ if (auto *CE = dyn_cast<ConstantExpr>(V))
+ if (CE->getOpcode() == BitCast)
+ V = CE->getOperand(0);
+
+ if (auto *F = dyn_cast<Function>(V))
+ return F->getAttributes().hasFnAttr(Kind);
+
return false;
}
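Both overloads of hasFnAttrOnCalledFunction() now look through a bitcast constant expression on the called operand, so attribute queries keep working for calls made through bitcast callees. The idiom in isolation, as a sketch rather than the patch itself:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/InstrTypes.h"

    using namespace llvm;

    // Return the callee as a Function even when the call goes through a
    // "bitcast (FnTy* @f to OtherTy*)" constant expression.
    static const Function *getCalleeThroughBitcast(const CallBase &CB) {
      const Value *V = CB.getCalledOperand();
      if (auto *CE = dyn_cast<ConstantExpr>(V))
        if (CE->getOpcode() == Instruction::BitCast)
          V = CE->getOperand(0);
      return dyn_cast<Function>(V);
    }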
@@ -933,7 +943,7 @@ void CallBrInst::updateArgBlockAddresses(unsigned i, BasicBlock *B) {
if (BasicBlock *OldBB = getIndirectDest(i)) {
BlockAddress *Old = BlockAddress::get(OldBB);
BlockAddress *New = BlockAddress::get(B);
- for (unsigned ArgNo = 0, e = getNumArgOperands(); ArgNo != e; ++ArgNo)
+ for (unsigned ArgNo = 0, e = arg_size(); ArgNo != e; ++ArgNo)
if (dyn_cast<BlockAddress>(getArgOperand(ArgNo)) == Old)
setArgOperand(ArgNo, New);
}
@@ -1909,6 +1919,32 @@ bool InsertElementInst::isValidOperands(const Value *Vec, const Value *Elt,
// ShuffleVectorInst Implementation
//===----------------------------------------------------------------------===//
+static Value *createPlaceholderForShuffleVector(Value *V) {
+ assert(V && "Cannot create placeholder of nullptr V");
+ return PoisonValue::get(V->getType());
+}
+
+ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *Mask, const Twine &Name,
+ Instruction *InsertBefore)
+ : ShuffleVectorInst(V1, createPlaceholderForShuffleVector(V1), Mask, Name,
+ InsertBefore) {}
+
+ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *Mask, const Twine &Name,
+ BasicBlock *InsertAtEnd)
+ : ShuffleVectorInst(V1, createPlaceholderForShuffleVector(V1), Mask, Name,
+ InsertAtEnd) {}
+
+ShuffleVectorInst::ShuffleVectorInst(Value *V1, ArrayRef<int> Mask,
+ const Twine &Name,
+ Instruction *InsertBefore)
+ : ShuffleVectorInst(V1, createPlaceholderForShuffleVector(V1), Mask, Name,
+ InsertBefore) {}
+
+ShuffleVectorInst::ShuffleVectorInst(Value *V1, ArrayRef<int> Mask,
+ const Twine &Name, BasicBlock *InsertAtEnd)
+ : ShuffleVectorInst(V1, createPlaceholderForShuffleVector(V1), Mask, Name,
+ InsertAtEnd) {}
+
ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, Value *Mask,
const Twine &Name,
Instruction *InsertBefore)
@@ -2259,6 +2295,80 @@ bool ShuffleVectorInst::isExtractSubvectorMask(ArrayRef<int> Mask,
return false;
}
+bool ShuffleVectorInst::isInsertSubvectorMask(ArrayRef<int> Mask,
+ int NumSrcElts, int &NumSubElts,
+ int &Index) {
+ int NumMaskElts = Mask.size();
+
+ // Don't try to match if we're shuffling to a smaller size.
+ if (NumMaskElts < NumSrcElts)
+ return false;
+
+ // TODO: We don't recognize self-insertion/widening.
+ if (isSingleSourceMaskImpl(Mask, NumSrcElts))
+ return false;
+
+ // Determine which mask elements are attributed to which source.
+ APInt UndefElts = APInt::getZero(NumMaskElts);
+ APInt Src0Elts = APInt::getZero(NumMaskElts);
+ APInt Src1Elts = APInt::getZero(NumMaskElts);
+ bool Src0Identity = true;
+ bool Src1Identity = true;
+
+ for (int i = 0; i != NumMaskElts; ++i) {
+ int M = Mask[i];
+ if (M < 0) {
+ UndefElts.setBit(i);
+ continue;
+ }
+ if (M < NumSrcElts) {
+ Src0Elts.setBit(i);
+ Src0Identity &= (M == i);
+ continue;
+ }
+ Src1Elts.setBit(i);
+ Src1Identity &= (M == (i + NumSrcElts));
+ continue;
+ }
+ assert((Src0Elts | Src1Elts | UndefElts).isAllOnes() &&
+ "unknown shuffle elements");
+ assert(!Src0Elts.isZero() && !Src1Elts.isZero() &&
+ "2-source shuffle not found");
+
+ // Determine lo/hi span ranges.
+ // TODO: How should we handle undefs at the start of subvector insertions?
+ int Src0Lo = Src0Elts.countTrailingZeros();
+ int Src1Lo = Src1Elts.countTrailingZeros();
+ int Src0Hi = NumMaskElts - Src0Elts.countLeadingZeros();
+ int Src1Hi = NumMaskElts - Src1Elts.countLeadingZeros();
+
+  // If src0 is in place, see if the src1 elements are in place within their
+  // own span.
+ if (Src0Identity) {
+ int NumSub1Elts = Src1Hi - Src1Lo;
+ ArrayRef<int> Sub1Mask = Mask.slice(Src1Lo, NumSub1Elts);
+ if (isIdentityMaskImpl(Sub1Mask, NumSrcElts)) {
+ NumSubElts = NumSub1Elts;
+ Index = Src1Lo;
+ return true;
+ }
+ }
+
+  // If src1 is in place, see if the src0 elements are in place within their
+  // own span.
+ if (Src1Identity) {
+ int NumSub0Elts = Src0Hi - Src0Lo;
+ ArrayRef<int> Sub0Mask = Mask.slice(Src0Lo, NumSub0Elts);
+ if (isIdentityMaskImpl(Sub0Mask, NumSrcElts)) {
+ NumSubElts = NumSub0Elts;
+ Index = Src0Lo;
+ return true;
+ }
+ }
+
+ return false;
+}
+
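A quick usage sketch for the new matcher (assuming its declaration lands in Instructions.h with this change): over two 8-element sources, the mask <0,1,8,9,4,5,6,7> keeps source 0 in place and drops a 2-element slice of source 1 in at index 2.

    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    static bool demoInsertSubvectorMatch() {
      int Mask[] = {0, 1, 8, 9, 4, 5, 6, 7};
      int NumSubElts = 0, Index = 0;
      bool Matched = ShuffleVectorInst::isInsertSubvectorMask(
          Mask, /*NumSrcElts=*/8, NumSubElts, Index);
      // Expected: Matched == true, NumSubElts == 2, Index == 2.
      return Matched && NumSubElts == 2 && Index == 2;
    }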
bool ShuffleVectorInst::isIdentityWithPadding() const {
if (isa<UndefValue>(Op<2>()))
return false;
@@ -2326,6 +2436,87 @@ bool ShuffleVectorInst::isConcat() const {
return isIdentityMaskImpl(getShuffleMask(), NumMaskElts);
}
+static bool isReplicationMaskWithParams(ArrayRef<int> Mask,
+ int ReplicationFactor, int VF) {
+ assert(Mask.size() == (unsigned)ReplicationFactor * VF &&
+ "Unexpected mask size.");
+
+ for (int CurrElt : seq(0, VF)) {
+ ArrayRef<int> CurrSubMask = Mask.take_front(ReplicationFactor);
+ assert(CurrSubMask.size() == (unsigned)ReplicationFactor &&
+ "Run out of mask?");
+ Mask = Mask.drop_front(ReplicationFactor);
+ if (!all_of(CurrSubMask, [CurrElt](int MaskElt) {
+ return MaskElt == UndefMaskElem || MaskElt == CurrElt;
+ }))
+ return false;
+ }
+ assert(Mask.empty() && "Did not consume the whole mask?");
+
+ return true;
+}
+
+bool ShuffleVectorInst::isReplicationMask(ArrayRef<int> Mask,
+ int &ReplicationFactor, int &VF) {
+ // undef-less case is trivial.
+ if (none_of(Mask, [](int MaskElt) { return MaskElt == UndefMaskElem; })) {
+ ReplicationFactor =
+ Mask.take_while([](int MaskElt) { return MaskElt == 0; }).size();
+ if (ReplicationFactor == 0 || Mask.size() % ReplicationFactor != 0)
+ return false;
+ VF = Mask.size() / ReplicationFactor;
+ return isReplicationMaskWithParams(Mask, ReplicationFactor, VF);
+ }
+
+  // However, if the mask contains undefs, we have to enumerate possible tuples
+  // and pick one. There are bounds on the replication factor: [1, mask size]
+  // (where RF=1 is an identity shuffle and RF=mask size is a broadcast shuffle).
+  // Additionally, the mask size is the replication factor multiplied by the
+  // vector size, which further significantly reduces the search space.
+
+  // Before doing that, perform a basic sanity check first.
+ int Largest = -1;
+ for (int MaskElt : Mask) {
+ if (MaskElt == UndefMaskElem)
+ continue;
+ // Elements must be in non-decreasing order.
+ if (MaskElt < Largest)
+ return false;
+ Largest = std::max(Largest, MaskElt);
+ }
+
+ // Prefer larger replication factor if all else equal.
+ for (int PossibleReplicationFactor :
+ reverse(seq_inclusive<unsigned>(1, Mask.size()))) {
+ if (Mask.size() % PossibleReplicationFactor != 0)
+ continue;
+ int PossibleVF = Mask.size() / PossibleReplicationFactor;
+ if (!isReplicationMaskWithParams(Mask, PossibleReplicationFactor,
+ PossibleVF))
+ continue;
+ ReplicationFactor = PossibleReplicationFactor;
+ VF = PossibleVF;
+ return true;
+ }
+
+ return false;
+}
+
+bool ShuffleVectorInst::isReplicationMask(int &ReplicationFactor,
+ int &VF) const {
+ // Not possible to express a shuffle mask for a scalable vector for this
+ // case.
+ if (isa<ScalableVectorType>(getType()))
+ return false;
+
+ VF = cast<FixedVectorType>(Op<0>()->getType())->getNumElements();
+ if (ShuffleMask.size() % VF != 0)
+ return false;
+ ReplicationFactor = ShuffleMask.size() / VF;
+
+ return isReplicationMaskWithParams(ShuffleMask, ReplicationFactor, VF);
+}
+
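Usage sketch for the replication-mask matcher added above (again assuming the matching declarations in Instructions.h): <0,0,1,1,2,2,3,3> replicates each of 4 elements twice, so it reports ReplicationFactor == 2 and VF == 4.

    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    static bool demoReplicationMatch() {
      int Mask[] = {0, 0, 1, 1, 2, 2, 3, 3};
      int ReplicationFactor = 0, VF = 0;
      return ShuffleVectorInst::isReplicationMask(Mask, ReplicationFactor, VF) &&
             ReplicationFactor == 2 && VF == 4;
    }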
//===----------------------------------------------------------------------===//
// InsertValueInst Class
//===----------------------------------------------------------------------===//
@@ -3945,6 +4136,35 @@ bool CmpInst::isSigned(Predicate predicate) {
}
}
+bool ICmpInst::compare(const APInt &LHS, const APInt &RHS,
+ ICmpInst::Predicate Pred) {
+ assert(ICmpInst::isIntPredicate(Pred) && "Only for integer predicates!");
+ switch (Pred) {
+ case ICmpInst::Predicate::ICMP_EQ:
+ return LHS.eq(RHS);
+ case ICmpInst::Predicate::ICMP_NE:
+ return LHS.ne(RHS);
+ case ICmpInst::Predicate::ICMP_UGT:
+ return LHS.ugt(RHS);
+ case ICmpInst::Predicate::ICMP_UGE:
+ return LHS.uge(RHS);
+ case ICmpInst::Predicate::ICMP_ULT:
+ return LHS.ult(RHS);
+ case ICmpInst::Predicate::ICMP_ULE:
+ return LHS.ule(RHS);
+ case ICmpInst::Predicate::ICMP_SGT:
+ return LHS.sgt(RHS);
+ case ICmpInst::Predicate::ICMP_SGE:
+ return LHS.sge(RHS);
+ case ICmpInst::Predicate::ICMP_SLT:
+ return LHS.slt(RHS);
+ case ICmpInst::Predicate::ICMP_SLE:
+ return LHS.sle(RHS);
+ default:
+ llvm_unreachable("Unexpected non-integer predicate.");
+ };
+}
+
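ICmpInst::compare() gives constant folders a single entry point for evaluating an integer predicate on two APInts instead of open-coding the switch. A minimal sketch:

    #include "llvm/ADT/APInt.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    // Evaluate a signed less-than on two 64-bit values via the new helper;
    // it asserts if handed a floating-point predicate.
    static bool signedLess(uint64_t A, uint64_t B) {
      APInt LHS(/*numBits=*/64, A), RHS(/*numBits=*/64, B);
      return ICmpInst::compare(LHS, RHS, ICmpInst::ICMP_SLT);
    }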
CmpInst::Predicate CmpInst::getFlippedSignednessPredicate(Predicate pred) {
assert(CmpInst::isRelational(pred) &&
"Call only with non-equality predicates!");
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp
index 19942fa187fd..7552906fd07a 100644
--- a/llvm/lib/IR/IntrinsicInst.cpp
+++ b/llvm/lib/IR/IntrinsicInst.cpp
@@ -188,26 +188,26 @@ Value *InstrProfIncrementInst::getStep() const {
}
Optional<RoundingMode> ConstrainedFPIntrinsic::getRoundingMode() const {
- unsigned NumOperands = getNumArgOperands();
+ unsigned NumOperands = arg_size();
Metadata *MD = nullptr;
auto *MAV = dyn_cast<MetadataAsValue>(getArgOperand(NumOperands - 2));
if (MAV)
MD = MAV->getMetadata();
if (!MD || !isa<MDString>(MD))
return None;
- return StrToRoundingMode(cast<MDString>(MD)->getString());
+ return convertStrToRoundingMode(cast<MDString>(MD)->getString());
}
Optional<fp::ExceptionBehavior>
ConstrainedFPIntrinsic::getExceptionBehavior() const {
- unsigned NumOperands = getNumArgOperands();
+ unsigned NumOperands = arg_size();
Metadata *MD = nullptr;
auto *MAV = dyn_cast<MetadataAsValue>(getArgOperand(NumOperands - 1));
if (MAV)
MD = MAV->getMetadata();
if (!MD || !isa<MDString>(MD))
return None;
- return StrToExceptionBehavior(cast<MDString>(MD)->getString());
+ return convertStrToExceptionBehavior(cast<MDString>(MD)->getString());
}
bool ConstrainedFPIntrinsic::isDefaultFPEnvironment() const {
@@ -473,8 +473,17 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID,
assert(isVPIntrinsic(VPID) && "not a VP intrinsic");
Function *VPFunc;
switch (VPID) {
- default:
- VPFunc = Intrinsic::getDeclaration(M, VPID, Params[0]->getType());
+ default: {
+ Type *OverloadTy = Params[0]->getType();
+ if (VPReductionIntrinsic::isVPReduction(VPID))
+ OverloadTy =
+ Params[*VPReductionIntrinsic::getVectorParamPos(VPID)]->getType();
+
+ VPFunc = Intrinsic::getDeclaration(M, VPID, OverloadTy);
+ break;
+ }
+ case Intrinsic::vp_select:
+ VPFunc = Intrinsic::getDeclaration(M, VPID, {Params[1]->getType()});
break;
case Intrinsic::vp_load:
VPFunc = Intrinsic::getDeclaration(
@@ -504,6 +513,48 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID,
return VPFunc;
}
+bool VPReductionIntrinsic::isVPReduction(Intrinsic::ID ID) {
+ switch (ID) {
+ default:
+ return false;
+#define HANDLE_VP_REDUCTION(VPID, STARTPOS, VECTORPOS) \
+ case Intrinsic::VPID: \
+ break;
+#include "llvm/IR/VPIntrinsics.def"
+ }
+ return true;
+}
+
+unsigned VPReductionIntrinsic::getVectorParamPos() const {
+ return *VPReductionIntrinsic::getVectorParamPos(getIntrinsicID());
+}
+
+unsigned VPReductionIntrinsic::getStartParamPos() const {
+ return *VPReductionIntrinsic::getStartParamPos(getIntrinsicID());
+}
+
+Optional<unsigned> VPReductionIntrinsic::getVectorParamPos(Intrinsic::ID ID) {
+ switch (ID) {
+#define HANDLE_VP_REDUCTION(VPID, STARTPOS, VECTORPOS) \
+ case Intrinsic::VPID: \
+ return VECTORPOS;
+#include "llvm/IR/VPIntrinsics.def"
+ default:
+ return None;
+ }
+}
+
+Optional<unsigned> VPReductionIntrinsic::getStartParamPos(Intrinsic::ID ID) {
+ switch (ID) {
+#define HANDLE_VP_REDUCTION(VPID, STARTPOS, VECTORPOS) \
+ case Intrinsic::VPID: \
+ return STARTPOS;
+#include "llvm/IR/VPIntrinsics.def"
+ default:
+ return None;
+ }
+}
+
Instruction::BinaryOps BinaryOpIntrinsic::getBinaryOp() const {
switch (getIntrinsicID()) {
case Intrinsic::uadd_with_overflow:
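With the HANDLE_VP_REDUCTION entries in VPIntrinsics.def, the start-value and vector operand positions of a VP reduction can be queried generically. A hedged sketch, assuming the VPReductionIntrinsic subclass declared alongside these definitions:

    #include "llvm/IR/IntrinsicInst.h"

    using namespace llvm;

    // For a call that is a VP reduction, fetch its vector operand by position;
    // return null for anything else.
    static Value *getVPReductionVectorOperand(CallInst &CI) {
      if (auto *VPR = dyn_cast<VPReductionIntrinsic>(&CI))
        return VPR->getArgOperand(VPR->getVectorParamPos());
      return nullptr;
    }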
diff --git a/llvm/lib/IR/LLVMContext.cpp b/llvm/lib/IR/LLVMContext.cpp
index c4a713db455b..90716d9c81a6 100644
--- a/llvm/lib/IR/LLVMContext.cpp
+++ b/llvm/lib/IR/LLVMContext.cpp
@@ -348,6 +348,12 @@ std::unique_ptr<DiagnosticHandler> LLVMContext::getDiagnosticHandler() {
return std::move(pImpl->DiagHandler);
}
+void LLVMContext::enableOpaquePointers() const {
+ assert(pImpl->PointerTypes.empty() && pImpl->ASPointerTypes.empty() &&
+ "Must be called before creating any pointer types");
+ pImpl->setOpaquePointers(true);
+}
+
bool LLVMContext::supportsTypedPointers() const {
- return !pImpl->ForceOpaquePointers;
+ return !pImpl->getOpaquePointers();
}
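enableOpaquePointers() switches a context into opaque-pointer mode; per the assertion it must run before the first pointer type is created, after which supportsTypedPointers() reports false. A short sketch:

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/LLVMContext.h"

    using namespace llvm;

    static void buildOpaquePointerContext() {
      LLVMContext Ctx;
      Ctx.enableOpaquePointers();          // before any pointer type exists
      PointerType *P = PointerType::get(Ctx, /*AddressSpace=*/0); // "ptr"
      (void)P;
      // Ctx.supportsTypedPointers() now returns false.
    }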
diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp
index 99819602c545..ebbf382aea38 100644
--- a/llvm/lib/IR/LLVMContextImpl.cpp
+++ b/llvm/lib/IR/LLVMContextImpl.cpp
@@ -23,9 +23,8 @@
using namespace llvm;
static cl::opt<bool>
- ForceOpaquePointersCL("force-opaque-pointers",
- cl::desc("Force all pointers to be opaque pointers"),
- cl::init(false));
+ OpaquePointersCL("opaque-pointers", cl::desc("Use opaque pointers"),
+ cl::init(false));
LLVMContextImpl::LLVMContextImpl(LLVMContext &C)
: DiagHandler(std::make_unique<DiagnosticHandler>()),
@@ -36,8 +35,7 @@ LLVMContextImpl::LLVMContextImpl(LLVMContext &C)
X86_FP80Ty(C, Type::X86_FP80TyID), FP128Ty(C, Type::FP128TyID),
PPC_FP128Ty(C, Type::PPC_FP128TyID), X86_MMXTy(C, Type::X86_MMXTyID),
X86_AMXTy(C, Type::X86_AMXTyID), Int1Ty(C, 1), Int8Ty(C, 8),
- Int16Ty(C, 16), Int32Ty(C, 32), Int64Ty(C, 64), Int128Ty(C, 128),
- ForceOpaquePointers(ForceOpaquePointersCL) {}
+ Int16Ty(C, 16), Int32Ty(C, 32), Int64Ty(C, 64), Int128Ty(C, 128) {}
LLVMContextImpl::~LLVMContextImpl() {
// NOTE: We need to delete the contents of OwnedModules, but Module's dtor
@@ -55,8 +53,15 @@ LLVMContextImpl::~LLVMContextImpl() {
// Drop references for MDNodes. Do this before Values get deleted to avoid
// unnecessary RAUW when nodes are still unresolved.
- for (auto *I : DistinctMDNodes)
+ for (auto *I : DistinctMDNodes) {
+ // We may have DIArgList that were uniqued, and as it has a custom
+ // implementation of dropAllReferences, it needs to be explicitly invoked.
+ if (auto *AL = dyn_cast<DIArgList>(I)) {
+ AL->dropAllReferences();
+ continue;
+ }
I->dropAllReferences();
+ }
#define HANDLE_MDNODE_LEAF_UNIQUABLE(CLASS) \
for (auto *I : CLASS##s) \
I->dropAllReferences();
@@ -227,3 +232,11 @@ OptPassGate &LLVMContextImpl::getOptPassGate() const {
void LLVMContextImpl::setOptPassGate(OptPassGate& OPG) {
this->OPG = &OPG;
}
+
+bool LLVMContextImpl::getOpaquePointers() {
+ if (LLVM_UNLIKELY(!(OpaquePointers.hasValue())))
+ OpaquePointers = OpaquePointersCL;
+ return *OpaquePointers;
+}
+
+void LLVMContextImpl::setOpaquePointers(bool OP) { OpaquePointers = OP; }
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index 2ae23fdc95a8..b2909c425846 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -61,7 +61,9 @@ using DenseMapAPIntKeyInfo = DenseMapInfo<APInt>;
struct DenseMapAPFloatKeyInfo {
static inline APFloat getEmptyKey() { return APFloat(APFloat::Bogus(), 1); }
- static inline APFloat getTombstoneKey() { return APFloat(APFloat::Bogus(), 2); }
+ static inline APFloat getTombstoneKey() {
+ return APFloat(APFloat::Bogus(), 2);
+ }
static unsigned getHashValue(const APFloat &Key) {
return static_cast<unsigned>(hash_value(Key));
@@ -74,46 +76,42 @@ struct DenseMapAPFloatKeyInfo {
struct AnonStructTypeKeyInfo {
struct KeyTy {
- ArrayRef<Type*> ETypes;
+ ArrayRef<Type *> ETypes;
bool isPacked;
- KeyTy(const ArrayRef<Type*>& E, bool P) :
- ETypes(E), isPacked(P) {}
+ KeyTy(const ArrayRef<Type *> &E, bool P) : ETypes(E), isPacked(P) {}
KeyTy(const StructType *ST)
: ETypes(ST->elements()), isPacked(ST->isPacked()) {}
- bool operator==(const KeyTy& that) const {
+ bool operator==(const KeyTy &that) const {
if (isPacked != that.isPacked)
return false;
if (ETypes != that.ETypes)
return false;
return true;
}
- bool operator!=(const KeyTy& that) const {
- return !this->operator==(that);
- }
+ bool operator!=(const KeyTy &that) const { return !this->operator==(that); }
};
- static inline StructType* getEmptyKey() {
- return DenseMapInfo<StructType*>::getEmptyKey();
+ static inline StructType *getEmptyKey() {
+ return DenseMapInfo<StructType *>::getEmptyKey();
}
- static inline StructType* getTombstoneKey() {
- return DenseMapInfo<StructType*>::getTombstoneKey();
+ static inline StructType *getTombstoneKey() {
+ return DenseMapInfo<StructType *>::getTombstoneKey();
}
- static unsigned getHashValue(const KeyTy& Key) {
- return hash_combine(hash_combine_range(Key.ETypes.begin(),
- Key.ETypes.end()),
- Key.isPacked);
+ static unsigned getHashValue(const KeyTy &Key) {
+ return hash_combine(
+ hash_combine_range(Key.ETypes.begin(), Key.ETypes.end()), Key.isPacked);
}
static unsigned getHashValue(const StructType *ST) {
return getHashValue(KeyTy(ST));
}
- static bool isEqual(const KeyTy& LHS, const StructType *RHS) {
+ static bool isEqual(const KeyTy &LHS, const StructType *RHS) {
if (RHS == getEmptyKey() || RHS == getTombstoneKey())
return false;
return LHS == KeyTy(RHS);
@@ -127,16 +125,16 @@ struct AnonStructTypeKeyInfo {
struct FunctionTypeKeyInfo {
struct KeyTy {
const Type *ReturnType;
- ArrayRef<Type*> Params;
+ ArrayRef<Type *> Params;
bool isVarArg;
- KeyTy(const Type* R, const ArrayRef<Type*>& P, bool V) :
- ReturnType(R), Params(P), isVarArg(V) {}
+ KeyTy(const Type *R, const ArrayRef<Type *> &P, bool V)
+ : ReturnType(R), Params(P), isVarArg(V) {}
KeyTy(const FunctionType *FT)
: ReturnType(FT->getReturnType()), Params(FT->params()),
isVarArg(FT->isVarArg()) {}
- bool operator==(const KeyTy& that) const {
+ bool operator==(const KeyTy &that) const {
if (ReturnType != that.ReturnType)
return false;
if (isVarArg != that.isVarArg)
@@ -145,31 +143,28 @@ struct FunctionTypeKeyInfo {
return false;
return true;
}
- bool operator!=(const KeyTy& that) const {
- return !this->operator==(that);
- }
+ bool operator!=(const KeyTy &that) const { return !this->operator==(that); }
};
- static inline FunctionType* getEmptyKey() {
- return DenseMapInfo<FunctionType*>::getEmptyKey();
+ static inline FunctionType *getEmptyKey() {
+ return DenseMapInfo<FunctionType *>::getEmptyKey();
}
- static inline FunctionType* getTombstoneKey() {
- return DenseMapInfo<FunctionType*>::getTombstoneKey();
+ static inline FunctionType *getTombstoneKey() {
+ return DenseMapInfo<FunctionType *>::getTombstoneKey();
}
- static unsigned getHashValue(const KeyTy& Key) {
- return hash_combine(Key.ReturnType,
- hash_combine_range(Key.Params.begin(),
- Key.Params.end()),
- Key.isVarArg);
+ static unsigned getHashValue(const KeyTy &Key) {
+ return hash_combine(
+ Key.ReturnType,
+ hash_combine_range(Key.Params.begin(), Key.Params.end()), Key.isVarArg);
}
static unsigned getHashValue(const FunctionType *FT) {
return getHashValue(KeyTy(FT));
}
- static bool isEqual(const KeyTy& LHS, const FunctionType *RHS) {
+ static bool isEqual(const KeyTy &LHS, const FunctionType *RHS) {
if (RHS == getEmptyKey() || RHS == getTombstoneKey())
return false;
return LHS == KeyTy(RHS);
@@ -412,14 +407,14 @@ template <> struct MDNodeKeyImpl<DIBasicType> {
Encoding(Encoding), Flags(Flags) {}
MDNodeKeyImpl(const DIBasicType *N)
: Tag(N->getTag()), Name(N->getRawName()), SizeInBits(N->getSizeInBits()),
- AlignInBits(N->getAlignInBits()), Encoding(N->getEncoding()), Flags(N->getFlags()) {}
+ AlignInBits(N->getAlignInBits()), Encoding(N->getEncoding()),
+ Flags(N->getFlags()) {}
bool isKeyOf(const DIBasicType *RHS) const {
return Tag == RHS->getTag() && Name == RHS->getRawName() &&
SizeInBits == RHS->getSizeInBits() &&
AlignInBits == RHS->getAlignInBits() &&
- Encoding == RHS->getEncoding() &&
- Flags == RHS->getFlags();
+ Encoding == RHS->getEncoding() && Flags == RHS->getFlags();
}
unsigned getHashValue() const {
@@ -471,23 +466,24 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
Optional<unsigned> DWARFAddressSpace;
unsigned Flags;
Metadata *ExtraData;
+ Metadata *Annotations;
MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *File, unsigned Line,
Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
uint32_t AlignInBits, uint64_t OffsetInBits,
Optional<unsigned> DWARFAddressSpace, unsigned Flags,
- Metadata *ExtraData)
+ Metadata *ExtraData, Metadata *Annotations)
: Tag(Tag), Name(Name), File(File), Line(Line), Scope(Scope),
BaseType(BaseType), SizeInBits(SizeInBits), OffsetInBits(OffsetInBits),
AlignInBits(AlignInBits), DWARFAddressSpace(DWARFAddressSpace),
- Flags(Flags), ExtraData(ExtraData) {}
+ Flags(Flags), ExtraData(ExtraData), Annotations(Annotations) {}
MDNodeKeyImpl(const DIDerivedType *N)
: Tag(N->getTag()), Name(N->getRawName()), File(N->getRawFile()),
Line(N->getLine()), Scope(N->getRawScope()),
BaseType(N->getRawBaseType()), SizeInBits(N->getSizeInBits()),
OffsetInBits(N->getOffsetInBits()), AlignInBits(N->getAlignInBits()),
DWARFAddressSpace(N->getDWARFAddressSpace()), Flags(N->getFlags()),
- ExtraData(N->getRawExtraData()) {}
+ ExtraData(N->getRawExtraData()), Annotations(N->getRawAnnotations()) {}
bool isKeyOf(const DIDerivedType *RHS) const {
return Tag == RHS->getTag() && Name == RHS->getRawName() &&
@@ -497,8 +493,8 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
AlignInBits == RHS->getAlignInBits() &&
OffsetInBits == RHS->getOffsetInBits() &&
DWARFAddressSpace == RHS->getDWARFAddressSpace() &&
- Flags == RHS->getFlags() &&
- ExtraData == RHS->getRawExtraData();
+ Flags == RHS->getFlags() && ExtraData == RHS->getRawExtraData() &&
+ Annotations == RHS->getRawAnnotations();
}
unsigned getHashValue() const {
@@ -525,7 +521,8 @@ template <> struct MDNodeSubsetEqualImpl<DIDerivedType> {
return isODRMember(LHS.Tag, LHS.Scope, LHS.Name, RHS);
}
- static bool isSubsetEqual(const DIDerivedType *LHS, const DIDerivedType *RHS) {
+ static bool isSubsetEqual(const DIDerivedType *LHS,
+ const DIDerivedType *RHS) {
return isODRMember(LHS->getTag(), LHS->getRawScope(), LHS->getRawName(),
RHS);
}
@@ -569,6 +566,7 @@ template <> struct MDNodeKeyImpl<DICompositeType> {
Metadata *Associated;
Metadata *Allocated;
Metadata *Rank;
+ Metadata *Annotations;
MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *File, unsigned Line,
Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
@@ -577,14 +575,15 @@ template <> struct MDNodeKeyImpl<DICompositeType> {
Metadata *VTableHolder, Metadata *TemplateParams,
MDString *Identifier, Metadata *Discriminator,
Metadata *DataLocation, Metadata *Associated,
- Metadata *Allocated, Metadata *Rank)
+ Metadata *Allocated, Metadata *Rank, Metadata *Annotations)
: Tag(Tag), Name(Name), File(File), Line(Line), Scope(Scope),
BaseType(BaseType), SizeInBits(SizeInBits), OffsetInBits(OffsetInBits),
AlignInBits(AlignInBits), Flags(Flags), Elements(Elements),
RuntimeLang(RuntimeLang), VTableHolder(VTableHolder),
TemplateParams(TemplateParams), Identifier(Identifier),
Discriminator(Discriminator), DataLocation(DataLocation),
- Associated(Associated), Allocated(Allocated), Rank(Rank) {}
+ Associated(Associated), Allocated(Allocated), Rank(Rank),
+ Annotations(Annotations) {}
MDNodeKeyImpl(const DICompositeType *N)
: Tag(N->getTag()), Name(N->getRawName()), File(N->getRawFile()),
Line(N->getLine()), Scope(N->getRawScope()),
@@ -597,7 +596,7 @@ template <> struct MDNodeKeyImpl<DICompositeType> {
Discriminator(N->getRawDiscriminator()),
DataLocation(N->getRawDataLocation()),
Associated(N->getRawAssociated()), Allocated(N->getRawAllocated()),
- Rank(N->getRawRank()) {}
+ Rank(N->getRawRank()), Annotations(N->getRawAnnotations()) {}
bool isKeyOf(const DICompositeType *RHS) const {
return Tag == RHS->getTag() && Name == RHS->getRawName() &&
@@ -614,7 +613,8 @@ template <> struct MDNodeKeyImpl<DICompositeType> {
Discriminator == RHS->getRawDiscriminator() &&
DataLocation == RHS->getRawDataLocation() &&
Associated == RHS->getRawAssociated() &&
- Allocated == RHS->getRawAllocated() && Rank == RHS->getRawRank();
+ Allocated == RHS->getRawAllocated() && Rank == RHS->getRawRank() &&
+ Annotations == RHS->getRawAnnotations();
}
unsigned getHashValue() const {
@@ -623,7 +623,7 @@ template <> struct MDNodeKeyImpl<DICompositeType> {
// collision "most of the time". There is no correctness issue in case of
// collision because of the full check above.
return hash_combine(Name, File, Line, BaseType, Scope, Elements,
- TemplateParams);
+ TemplateParams, Annotations);
}
};
@@ -663,14 +663,13 @@ template <> struct MDNodeKeyImpl<DIFile> {
bool isKeyOf(const DIFile *RHS) const {
return Filename == RHS->getRawFilename() &&
Directory == RHS->getRawDirectory() &&
- Checksum == RHS->getRawChecksum() &&
- Source == RHS->getRawSource();
+ Checksum == RHS->getRawChecksum() && Source == RHS->getRawSource();
}
unsigned getHashValue() const {
- return hash_combine(
- Filename, Directory, Checksum ? Checksum->Kind : 0,
- Checksum ? Checksum->Value : nullptr, Source.getValueOr(nullptr));
+ return hash_combine(Filename, Directory, Checksum ? Checksum->Kind : 0,
+ Checksum ? Checksum->Value : nullptr,
+ Source.getValueOr(nullptr));
}
};
@@ -692,6 +691,7 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
Metadata *Declaration;
Metadata *RetainedNodes;
Metadata *ThrownTypes;
+ Metadata *Annotations;
MDNodeKeyImpl(Metadata *Scope, MDString *Name, MDString *LinkageName,
Metadata *File, unsigned Line, Metadata *Type,
@@ -699,13 +699,14 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
unsigned VirtualIndex, int ThisAdjustment, unsigned Flags,
unsigned SPFlags, Metadata *Unit, Metadata *TemplateParams,
Metadata *Declaration, Metadata *RetainedNodes,
- Metadata *ThrownTypes)
+ Metadata *ThrownTypes, Metadata *Annotations)
: Scope(Scope), Name(Name), LinkageName(LinkageName), File(File),
Line(Line), Type(Type), ScopeLine(ScopeLine),
ContainingType(ContainingType), VirtualIndex(VirtualIndex),
ThisAdjustment(ThisAdjustment), Flags(Flags), SPFlags(SPFlags),
Unit(Unit), TemplateParams(TemplateParams), Declaration(Declaration),
- RetainedNodes(RetainedNodes), ThrownTypes(ThrownTypes) {}
+ RetainedNodes(RetainedNodes), ThrownTypes(ThrownTypes),
+ Annotations(Annotations) {}
MDNodeKeyImpl(const DISubprogram *N)
: Scope(N->getRawScope()), Name(N->getRawName()),
LinkageName(N->getRawLinkageName()), File(N->getRawFile()),
@@ -717,7 +718,8 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
TemplateParams(N->getRawTemplateParams()),
Declaration(N->getRawDeclaration()),
RetainedNodes(N->getRawRetainedNodes()),
- ThrownTypes(N->getRawThrownTypes()) {}
+ ThrownTypes(N->getRawThrownTypes()),
+ Annotations(N->getRawAnnotations()) {}
bool isKeyOf(const DISubprogram *RHS) const {
return Scope == RHS->getRawScope() && Name == RHS->getRawName() &&
@@ -732,7 +734,8 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
TemplateParams == RHS->getRawTemplateParams() &&
Declaration == RHS->getRawDeclaration() &&
RetainedNodes == RHS->getRawRetainedNodes() &&
- ThrownTypes == RHS->getRawThrownTypes();
+ ThrownTypes == RHS->getRawThrownTypes() &&
+ Annotations == RHS->getRawAnnotations();
}
bool isDefinition() const { return SPFlags & DISubprogram::SPFlagDefinition; }
@@ -853,9 +856,7 @@ template <> struct MDNodeKeyImpl<DINamespace> {
ExportSymbols == RHS->getExportSymbols();
}
- unsigned getHashValue() const {
- return hash_combine(Scope, Name);
- }
+ unsigned getHashValue() const { return hash_combine(Scope, Name); }
};
template <> struct MDNodeKeyImpl<DICommonBlock> {
@@ -865,8 +866,8 @@ template <> struct MDNodeKeyImpl<DICommonBlock> {
Metadata *File;
unsigned LineNo;
- MDNodeKeyImpl(Metadata *Scope, Metadata *Decl, MDString *Name,
- Metadata *File, unsigned LineNo)
+ MDNodeKeyImpl(Metadata *Scope, Metadata *Decl, MDString *Name, Metadata *File,
+ unsigned LineNo)
: Scope(Scope), Decl(Decl), Name(Name), File(File), LineNo(LineNo) {}
MDNodeKeyImpl(const DICommonBlock *N)
: Scope(N->getRawScope()), Decl(N->getRawDecl()), Name(N->getRawName()),
@@ -874,8 +875,8 @@ template <> struct MDNodeKeyImpl<DICommonBlock> {
bool isKeyOf(const DICommonBlock *RHS) const {
return Scope == RHS->getRawScope() && Decl == RHS->getRawDecl() &&
- Name == RHS->getRawName() && File == RHS->getRawFile() &&
- LineNo == RHS->getLineNo();
+ Name == RHS->getRawName() && File == RHS->getRawFile() &&
+ LineNo == RHS->getLineNo();
}
unsigned getHashValue() const {
@@ -976,17 +977,19 @@ template <> struct MDNodeKeyImpl<DIGlobalVariable> {
Metadata *StaticDataMemberDeclaration;
Metadata *TemplateParams;
uint32_t AlignInBits;
+ Metadata *Annotations;
MDNodeKeyImpl(Metadata *Scope, MDString *Name, MDString *LinkageName,
Metadata *File, unsigned Line, Metadata *Type,
bool IsLocalToUnit, bool IsDefinition,
Metadata *StaticDataMemberDeclaration, Metadata *TemplateParams,
- uint32_t AlignInBits)
+ uint32_t AlignInBits, Metadata *Annotations)
: Scope(Scope), Name(Name), LinkageName(LinkageName), File(File),
Line(Line), Type(Type), IsLocalToUnit(IsLocalToUnit),
IsDefinition(IsDefinition),
StaticDataMemberDeclaration(StaticDataMemberDeclaration),
- TemplateParams(TemplateParams), AlignInBits(AlignInBits) {}
+ TemplateParams(TemplateParams), AlignInBits(AlignInBits),
+ Annotations(Annotations) {}
MDNodeKeyImpl(const DIGlobalVariable *N)
: Scope(N->getRawScope()), Name(N->getRawName()),
LinkageName(N->getRawLinkageName()), File(N->getRawFile()),
@@ -994,7 +997,7 @@ template <> struct MDNodeKeyImpl<DIGlobalVariable> {
IsLocalToUnit(N->isLocalToUnit()), IsDefinition(N->isDefinition()),
StaticDataMemberDeclaration(N->getRawStaticDataMemberDeclaration()),
TemplateParams(N->getRawTemplateParams()),
- AlignInBits(N->getAlignInBits()) {}
+ AlignInBits(N->getAlignInBits()), Annotations(N->getRawAnnotations()) {}
bool isKeyOf(const DIGlobalVariable *RHS) const {
return Scope == RHS->getRawScope() && Name == RHS->getRawName() &&
@@ -1005,7 +1008,8 @@ template <> struct MDNodeKeyImpl<DIGlobalVariable> {
StaticDataMemberDeclaration ==
RHS->getRawStaticDataMemberDeclaration() &&
TemplateParams == RHS->getRawTemplateParams() &&
- AlignInBits == RHS->getAlignInBits();
+ AlignInBits == RHS->getAlignInBits() &&
+ Annotations == RHS->getRawAnnotations();
}
unsigned getHashValue() const {
@@ -1018,7 +1022,7 @@ template <> struct MDNodeKeyImpl<DIGlobalVariable> {
// TODO: make hashing work fine with such situations
return hash_combine(Scope, Name, LinkageName, File, Line, Type,
IsLocalToUnit, IsDefinition, /* AlignInBits, */
- StaticDataMemberDeclaration);
+ StaticDataMemberDeclaration, Annotations);
}
};
@@ -1031,22 +1035,25 @@ template <> struct MDNodeKeyImpl<DILocalVariable> {
unsigned Arg;
unsigned Flags;
uint32_t AlignInBits;
+ Metadata *Annotations;
MDNodeKeyImpl(Metadata *Scope, MDString *Name, Metadata *File, unsigned Line,
Metadata *Type, unsigned Arg, unsigned Flags,
- uint32_t AlignInBits)
+ uint32_t AlignInBits, Metadata *Annotations)
: Scope(Scope), Name(Name), File(File), Line(Line), Type(Type), Arg(Arg),
- Flags(Flags), AlignInBits(AlignInBits) {}
+ Flags(Flags), AlignInBits(AlignInBits), Annotations(Annotations) {}
MDNodeKeyImpl(const DILocalVariable *N)
: Scope(N->getRawScope()), Name(N->getRawName()), File(N->getRawFile()),
Line(N->getLine()), Type(N->getRawType()), Arg(N->getArg()),
- Flags(N->getFlags()), AlignInBits(N->getAlignInBits()) {}
+ Flags(N->getFlags()), AlignInBits(N->getAlignInBits()),
+ Annotations(N->getRawAnnotations()) {}
bool isKeyOf(const DILocalVariable *RHS) const {
return Scope == RHS->getRawScope() && Name == RHS->getRawName() &&
File == RHS->getRawFile() && Line == RHS->getLine() &&
Type == RHS->getRawType() && Arg == RHS->getArg() &&
- Flags == RHS->getFlags() && AlignInBits == RHS->getAlignInBits();
+ Flags == RHS->getFlags() && AlignInBits == RHS->getAlignInBits() &&
+ Annotations == RHS->getRawAnnotations();
}
unsigned getHashValue() const {
@@ -1057,7 +1064,7 @@ template <> struct MDNodeKeyImpl<DILocalVariable> {
// clang/test/CodeGen/debug-info-257-args.c is an example of this problem,
// generated IR is random for each run and test fails with Align included.
// TODO: make hashing work fine with such situations
- return hash_combine(Scope, Name, File, Line, Type, Arg, Flags);
+ return hash_combine(Scope, Name, File, Line, Type, Arg, Flags, Annotations);
}
};
@@ -1079,9 +1086,7 @@ template <> struct MDNodeKeyImpl<DILabel> {
}
/// Using name and line to get hash value. It should already be mostly unique.
- unsigned getHashValue() const {
- return hash_combine(Scope, Name, Line);
- }
+ unsigned getHashValue() const { return hash_combine(Scope, Name, Line); }
};
template <> struct MDNodeKeyImpl<DIExpression> {
@@ -1155,23 +1160,26 @@ template <> struct MDNodeKeyImpl<DIImportedEntity> {
Metadata *File;
unsigned Line;
MDString *Name;
+ Metadata *Elements;
MDNodeKeyImpl(unsigned Tag, Metadata *Scope, Metadata *Entity, Metadata *File,
- unsigned Line, MDString *Name)
+ unsigned Line, MDString *Name, Metadata *Elements)
: Tag(Tag), Scope(Scope), Entity(Entity), File(File), Line(Line),
- Name(Name) {}
+ Name(Name), Elements(Elements) {}
MDNodeKeyImpl(const DIImportedEntity *N)
: Tag(N->getTag()), Scope(N->getRawScope()), Entity(N->getRawEntity()),
- File(N->getRawFile()), Line(N->getLine()), Name(N->getRawName()) {}
+ File(N->getRawFile()), Line(N->getLine()), Name(N->getRawName()),
+ Elements(N->getRawElements()) {}
bool isKeyOf(const DIImportedEntity *RHS) const {
return Tag == RHS->getTag() && Scope == RHS->getRawScope() &&
Entity == RHS->getRawEntity() && File == RHS->getFile() &&
- Line == RHS->getLine() && Name == RHS->getRawName();
+ Line == RHS->getLine() && Name == RHS->getRawName() &&
+ Elements == RHS->getRawElements();
}
unsigned getHashValue() const {
- return hash_combine(Tag, Scope, Entity, File, Line, Name);
+ return hash_combine(Tag, Scope, Entity, File, Line, Name, Elements);
}
};
@@ -1325,7 +1333,7 @@ class LLVMContextImpl {
public:
/// OwnedModules - The set of modules instantiated in this context, and which
/// will be automatically deleted if this context is deleted.
- SmallPtrSet<Module*, 4> OwnedModules;
+ SmallPtrSet<Module *, 4> OwnedModules;
/// The main remark streamer used by all the other streamers (e.g. IR, MIR,
/// frontends, etc.). This should only be used by the specific streamers, and
@@ -1377,7 +1385,7 @@ public:
DenseMap<Value *, ValueAsMetadata *> ValuesAsMetadata;
DenseMap<Metadata *, MetadataAsValue *> MetadataAsValues;
- DenseMap<const Value*, ValueName*> ValueNames;
+ DenseMap<const Value *, ValueName *> ValueNames;
#define HANDLE_MDNODE_LEAF_UNIQUABLE(CLASS) \
DenseSet<CLASS *, CLASS##Info> CLASS##s;
@@ -1412,7 +1420,7 @@ public:
StringMap<std::unique_ptr<ConstantDataSequential>> CDSConstants;
DenseMap<std::pair<const Function *, const BasicBlock *>, BlockAddress *>
- BlockAddresses;
+ BlockAddresses;
DenseMap<const GlobalValue *, DSOLocalEquivalent *> DSOLocalEquivalents;
@@ -1434,22 +1442,19 @@ public:
BumpPtrAllocator Alloc;
UniqueStringSaver Saver{Alloc};
- DenseMap<unsigned, IntegerType*> IntegerTypes;
+ DenseMap<unsigned, IntegerType *> IntegerTypes;
using FunctionTypeSet = DenseSet<FunctionType *, FunctionTypeKeyInfo>;
FunctionTypeSet FunctionTypes;
using StructTypeSet = DenseSet<StructType *, AnonStructTypeKeyInfo>;
StructTypeSet AnonStructTypes;
- StringMap<StructType*> NamedStructTypes;
+ StringMap<StructType *> NamedStructTypes;
unsigned NamedStructTypesUniqueID = 0;
- DenseMap<std::pair<Type *, uint64_t>, ArrayType*> ArrayTypes;
- DenseMap<std::pair<Type *, ElementCount>, VectorType*> VectorTypes;
- // TODO: clean up the following after we no longer support non-opaque pointer
- // types.
- bool ForceOpaquePointers;
- DenseMap<Type*, PointerType*> PointerTypes; // Pointers in AddrSpace = 0
- DenseMap<std::pair<Type*, unsigned>, PointerType*> ASPointerTypes;
+ DenseMap<std::pair<Type *, uint64_t>, ArrayType *> ArrayTypes;
+ DenseMap<std::pair<Type *, ElementCount>, VectorType *> VectorTypes;
+ DenseMap<Type *, PointerType *> PointerTypes; // Pointers in AddrSpace = 0
+ DenseMap<std::pair<Type *, unsigned>, PointerType *> ASPointerTypes;
/// ValueHandles - This map keeps track of all of the value handles that are
/// watching a Value*. The Value::HasValueHandle bit is used to know
@@ -1503,7 +1508,7 @@ public:
/// This saves allocating an additional word in Function for programs which
/// do not use GC (i.e., most programs) at the cost of increased overhead for
/// clients which do use GC.
- DenseMap<const Function*, std::string> GCNames;
+ DenseMap<const Function *, std::string> GCNames;
/// Flag to indicate if Value (other than GlobalValue) retains their name or
/// not.
@@ -1526,7 +1531,15 @@ public:
///
/// The lifetime of the object must be guaranteed to extend as long as the
/// LLVMContext is used by compilation.
- void setOptPassGate(OptPassGate&);
+ void setOptPassGate(OptPassGate &);
+
+ // TODO: clean up the following after we no longer support non-opaque pointer
+ // types.
+ bool getOpaquePointers();
+ void setOpaquePointers(bool OP);
+
+private:
+ Optional<bool> OpaquePointers;
};
} // end namespace llvm
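Most of the substantive churn in this header follows one pattern: when a debug-info node gains a new operand (Annotations, Elements), its uniquing key must carry that operand in three places at once, namely the stored fields, isKeyOf(), and getHashValue(), or the uniquing maps would conflate nodes that differ only in the new operand. A toy illustration of the shape, not part of the patch:

    #include "llvm/ADT/Hashing.h"

    using namespace llvm;

    // Toy key: the new field participates in storage, equality, and hashing.
    struct ToyVariableKey {
      const void *Scope = nullptr;
      const void *Name = nullptr;
      const void *Annotations = nullptr; // newly tracked operand

      bool isKeyOf(const ToyVariableKey &RHS) const {
        return Scope == RHS.Scope && Name == RHS.Name &&
               Annotations == RHS.Annotations;
      }
      unsigned getHashValue() const {
        return hash_combine(Scope, Name, Annotations);
      }
    };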
diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp
index 32840fdeddf7..7bccf09012ca 100644
--- a/llvm/lib/IR/LegacyPassManager.cpp
+++ b/llvm/lib/IR/LegacyPassManager.cpp
@@ -1351,7 +1351,7 @@ void FunctionPassManager::add(Pass *P) {
///
bool FunctionPassManager::run(Function &F) {
handleAllErrors(F.materialize(), [&](ErrorInfoBase &EIB) {
- report_fatal_error("Error reading bitcode file: " + EIB.message());
+ report_fatal_error(Twine("Error reading bitcode file: ") + EIB.message());
});
return FPM->run(F);
}
diff --git a/llvm/lib/IR/Mangler.cpp b/llvm/lib/IR/Mangler.cpp
index bbdde586e6e0..2399ea27ee9d 100644
--- a/llvm/lib/IR/Mangler.cpp
+++ b/llvm/lib/IR/Mangler.cpp
@@ -99,6 +99,11 @@ static void addByteCountSuffix(raw_ostream &OS, const Function *F,
const unsigned PtrSize = DL.getPointerSize();
for (const Argument &A : F->args()) {
+ // For the purposes of the byte count suffix, structs returned by pointer
+ // do not count as function arguments.
+ if (A.hasStructRetAttr())
+ continue;
+
// 'Dereference' type in case of byval or inalloca parameter attribute.
uint64_t AllocSize = A.hasPassPointeeByValueCopyAttr() ?
A.getPassPointeeByValueCopySize(DL) :
@@ -186,7 +191,7 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
// Check if the name needs quotes to be safe for the linker to interpret.
static bool canBeUnquotedInDirective(char C) {
- return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '@';
+ return isAlnum(C) || C == '_' || C == '@';
}
static bool canBeUnquotedInDirective(StringRef Name) {
diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp
index 4f87ef537765..ebcc493407cc 100644
--- a/llvm/lib/IR/Metadata.cpp
+++ b/llvm/lib/IR/Metadata.cpp
@@ -345,7 +345,7 @@ ReplaceableMetadataImpl *ReplaceableMetadataImpl::getIfExists(Metadata &MD) {
bool ReplaceableMetadataImpl::isReplaceable(const Metadata &MD) {
if (auto *N = dyn_cast<MDNode>(&MD))
return !N->isResolved();
- return dyn_cast<ValueAsMetadata>(&MD);
+ return isa<ValueAsMetadata>(&MD);
}
static DISubprogram *getLocalFunctionMetadata(Value *V) {
@@ -1367,6 +1367,15 @@ void Instruction::addAnnotationMetadata(StringRef Name) {
setMetadata(LLVMContext::MD_annotation, MD);
}
+AAMDNodes Instruction::getAAMetadata() const {
+ AAMDNodes Result;
+ Result.TBAA = getMetadata(LLVMContext::MD_tbaa);
+ Result.TBAAStruct = getMetadata(LLVMContext::MD_tbaa_struct);
+ Result.Scope = getMetadata(LLVMContext::MD_alias_scope);
+ Result.NoAlias = getMetadata(LLVMContext::MD_noalias);
+ return Result;
+}
+
void Instruction::setAAMetadata(const AAMDNodes &N) {
setMetadata(LLVMContext::MD_tbaa, N.TBAA);
setMetadata(LLVMContext::MD_tbaa_struct, N.TBAAStruct);
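The new Instruction::getAAMetadata() collects the four alias-analysis nodes in one call, which makes copying them between memory operations trivial. Sketch:

    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/Metadata.h"

    using namespace llvm;

    // Propagate tbaa, tbaa.struct, alias.scope and noalias from one memory
    // instruction to another.
    static void copyAliasInfo(const Instruction &From, Instruction &To) {
      To.setAAMetadata(From.getAAMetadata());
    }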
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 7c18dc0ed299..63ea41fba89a 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -114,6 +114,10 @@ GlobalValue *Module::getNamedValue(StringRef Name) const {
return cast_or_null<GlobalValue>(getValueSymbolTable().lookup(Name));
}
+unsigned Module::getNumNamedValues() const {
+ return getValueSymbolTable().size();
+}
+
/// getMDKindID - Return a unique non-zero ID for the specified metadata kind.
/// This ID is uniqued across modules in the current LLVMContext.
unsigned Module::getMDKindID(StringRef Name) const {
diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp
index f4ac6caf4f93..31c5cd938d03 100644
--- a/llvm/lib/IR/ModuleSummaryIndex.cpp
+++ b/llvm/lib/IR/ModuleSummaryIndex.cpp
@@ -251,12 +251,13 @@ void ModuleSummaryIndex::propagateAttributes(
bool IsDSOLocal = true;
for (auto &S : P.second.SummaryList) {
if (!isGlobalValueLive(S.get())) {
- // computeDeadSymbols should have marked all copies live. Note that
- // it is possible that there is a GUID collision between internal
- // symbols with the same name in different files of the same name but
- // not enough distinguishing path. Because computeDeadSymbols should
- // conservatively mark all copies live we can assert here that all are
- // dead if any copy is dead.
+ // computeDeadSymbolsAndUpdateIndirectCalls should have marked all
+      // copies live. Note that a GUID collision is possible between internal
+      // symbols that share a name and live in files with the same base name
+      // but not enough distinguishing path. Because
+ // computeDeadSymbolsAndUpdateIndirectCalls should conservatively mark
+ // all copies live we can assert here that all are dead if any copy is
+ // dead.
assert(llvm::none_of(
P.second.SummaryList,
[&](const std::unique_ptr<GlobalValueSummary> &Summary) {
@@ -446,9 +447,11 @@ static std::string linkageToString(GlobalValue::LinkageTypes LT) {
static std::string fflagsToString(FunctionSummary::FFlags F) {
auto FlagValue = [](unsigned V) { return V ? '1' : '0'; };
- char FlagRep[] = {FlagValue(F.ReadNone), FlagValue(F.ReadOnly),
- FlagValue(F.NoRecurse), FlagValue(F.ReturnDoesNotAlias),
- FlagValue(F.NoInline), FlagValue(F.AlwaysInline), 0};
+ char FlagRep[] = {FlagValue(F.ReadNone), FlagValue(F.ReadOnly),
+ FlagValue(F.NoRecurse), FlagValue(F.ReturnDoesNotAlias),
+ FlagValue(F.NoInline), FlagValue(F.AlwaysInline),
+ FlagValue(F.NoUnwind), FlagValue(F.MayThrow),
+ FlagValue(F.HasUnknownCall), 0};
return FlagRep;
}
diff --git a/llvm/lib/IR/Operator.cpp b/llvm/lib/IR/Operator.cpp
index 18a1c84933e0..cf309ffd6212 100644
--- a/llvm/lib/IR/Operator.cpp
+++ b/llvm/lib/IR/Operator.cpp
@@ -19,6 +19,31 @@
#include "ConstantsContext.h"
namespace llvm {
+bool Operator::hasPoisonGeneratingFlags() const {
+ switch (getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::Shl: {
+ auto *OBO = cast<OverflowingBinaryOperator>(this);
+ return OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap();
+ }
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ return cast<PossiblyExactOperator>(this)->isExact();
+ case Instruction::GetElementPtr: {
+ auto *GEP = cast<GEPOperator>(this);
+ // Note: inrange exists on constexpr only
+ return GEP->isInBounds() || GEP->getInRangeIndex() != None;
+ }
+ default:
+ return false;
+ }
+ // TODO: FastMathFlags! (On instructions, but not constexpr)
+}
+
Type *GEPOperator::getSourceElementType() const {
if (auto *I = dyn_cast<GetElementPtrInst>(this))
return I->getSourceElementType();
@@ -190,12 +215,14 @@ bool GEPOperator::collectOffset(
if (STy || ScalableType)
return false;
- // Insert an initial offset of 0 for V iff none exists already, then
- // increment the offset by IndexedSize.
- VariableOffsets.insert({V, APInt(BitWidth, 0)});
APInt IndexedSize =
APInt(BitWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
- VariableOffsets[V] += IndexedSize;
+ // Insert an initial offset of 0 for V iff none exists already, then
+ // increment the offset by IndexedSize.
+ if (!IndexedSize.isZero()) {
+ VariableOffsets.insert({V, APInt(BitWidth, 0)});
+ VariableOffsets[V] += IndexedSize;
+ }
}
return true;
}
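Two behavioural notes fall out of the Operator.cpp changes: hasPoisonGeneratingFlags() now also answers for constant expressions (where fast-math flags cannot occur), and GEPOperator::collectOffset() no longer records variable indices whose scaled element size is zero. A usage sketch for the latter:

    #include "llvm/ADT/APInt.h"
    #include "llvm/ADT/MapVector.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Operator.h"

    using namespace llvm;

    // Split a GEP into a constant byte offset plus per-variable contributions;
    // variables scaled by a zero-sized type never appear in VarOffsets.
    static bool splitGEPOffset(GEPOperator &GEP, const DataLayout &DL,
                               MapVector<Value *, APInt> &VarOffsets,
                               APInt &ConstOffset) {
      unsigned BW = DL.getIndexSizeInBits(GEP.getPointerAddressSpace());
      ConstOffset = APInt(BW, 0);
      return GEP.collectOffset(DL, BW, VarOffsets, ConstOffset);
    }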
diff --git a/llvm/lib/IR/OptBisect.cpp b/llvm/lib/IR/OptBisect.cpp
index 2cf2298e0005..55c0dbad5aab 100644
--- a/llvm/lib/IR/OptBisect.cpp
+++ b/llvm/lib/IR/OptBisect.cpp
@@ -22,14 +22,12 @@
using namespace llvm;
static cl::opt<int> OptBisectLimit("opt-bisect-limit", cl::Hidden,
- cl::init(std::numeric_limits<int>::max()),
- cl::Optional,
+ cl::init(OptBisect::Disabled), cl::Optional,
+ cl::cb<void, int>([](int Limit) {
+ llvm::OptBisector->setLimit(Limit);
+ }),
cl::desc("Maximum optimization to perform"));
-OptBisect::OptBisect() : OptPassGate() {
- BisectEnabled = OptBisectLimit != std::numeric_limits<int>::max();
-}
-
static void printPassMessage(const StringRef &Name, int PassNum,
StringRef TargetDesc, bool Running) {
StringRef Status = Running ? "" : "NOT ";
@@ -38,19 +36,21 @@ static void printPassMessage(const StringRef &Name, int PassNum,
}
bool OptBisect::shouldRunPass(const Pass *P, StringRef IRDescription) {
- assert(BisectEnabled);
+ assert(isEnabled());
return checkPass(P->getPassName(), IRDescription);
}
bool OptBisect::checkPass(const StringRef PassName,
const StringRef TargetDesc) {
- assert(BisectEnabled);
+ assert(isEnabled());
int CurBisectNum = ++LastBisectNum;
- bool ShouldRun = (OptBisectLimit == -1 || CurBisectNum <= OptBisectLimit);
+ bool ShouldRun = (BisectLimit == -1 || CurBisectNum <= BisectLimit);
printPassMessage(PassName, CurBisectNum, TargetDesc, ShouldRun);
return ShouldRun;
}
+const int OptBisect::Disabled;
+
ManagedStatic<OptBisect> llvm::OptBisector;
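OptBisect now receives its limit through a cl::cb callback instead of comparing the raw option against INT_MAX, so setLimit() runs exactly when -opt-bisect-limit is parsed. A toy sketch of the same cl::cb pattern (option and variable names are made up):

    #include "llvm/Support/CommandLine.h"

    namespace {
    int DemoLimitValue = -1;
    } // namespace

    static llvm::cl::opt<int> DemoLimit(
        "demo-limit", llvm::cl::init(-1), llvm::cl::Optional,
        llvm::cl::cb<void, int>([](int L) { DemoLimitValue = L; }),
        llvm::cl::desc("Example of a callback-backed integer option"));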
diff --git a/llvm/lib/IR/PassManager.cpp b/llvm/lib/IR/PassManager.cpp
index 4cf7ab2a602b..d933003ccdf7 100644
--- a/llvm/lib/IR/PassManager.cpp
+++ b/llvm/lib/IR/PassManager.cpp
@@ -10,12 +10,13 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/PassManagerImpl.h"
+#include "llvm/Support/CommandLine.h"
using namespace llvm;
+namespace llvm {
// Explicit template instantiations and specialization definitions for core
// template typedefs.
-namespace llvm {
template class AllAnalysesOn<Module>;
template class AllAnalysesOn<Function>;
template class PassManager<Module>;
@@ -91,6 +92,16 @@ bool FunctionAnalysisManagerModuleProxy::Result::invalidate(
}
} // namespace llvm
+void ModuleToFunctionPassAdaptor::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ OS << "function";
+ if (EagerlyInvalidate)
+ OS << "<eager-inv>";
+ OS << "(";
+ Pass->printPipeline(OS, MapClassName2PassName);
+ OS << ")";
+}
+
PreservedAnalyses ModuleToFunctionPassAdaptor::run(Module &M,
ModuleAnalysisManager &AM) {
FunctionAnalysisManager &FAM =
@@ -122,7 +133,7 @@ PreservedAnalyses ModuleToFunctionPassAdaptor::run(Module &M,
// We know that the function pass couldn't have invalidated any other
// function's analyses (that's the contract of a function pass), so
// directly handle the function analysis manager's invalidation here.
- FAM.invalidate(F, PassPA);
+ FAM.invalidate(F, EagerlyInvalidate ? PreservedAnalyses::none() : PassPA);
// Then intersect the preserved set so that invalidation of module
// analyses will eventually occur when the module pass completes.
diff --git a/llvm/lib/IR/ProfileSummary.cpp b/llvm/lib/IR/ProfileSummary.cpp
index 453a278a7f3f..05d5ac2c5ddf 100644
--- a/llvm/lib/IR/ProfileSummary.cpp
+++ b/llvm/lib/IR/ProfileSummary.cpp
@@ -249,7 +249,7 @@ ProfileSummary *ProfileSummary::getFromMD(Metadata *MD) {
PartialProfileRatio);
}
-void ProfileSummary::printSummary(raw_ostream &OS) {
+void ProfileSummary::printSummary(raw_ostream &OS) const {
OS << "Total functions: " << NumFunctions << "\n";
OS << "Maximum function count: " << MaxFunctionCount << "\n";
OS << "Maximum block count: " << MaxCount << "\n";
@@ -257,7 +257,7 @@ void ProfileSummary::printSummary(raw_ostream &OS) {
OS << "Total count: " << TotalCount << "\n";
}
-void ProfileSummary::printDetailedSummary(raw_ostream &OS) {
+void ProfileSummary::printDetailedSummary(raw_ostream &OS) const {
OS << "Detailed summary:\n";
for (const auto &Entry : DetailedSummary) {
OS << Entry.NumCounts << " blocks with count >= " << Entry.MinCount
diff --git a/llvm/lib/IR/PseudoProbe.cpp b/llvm/lib/IR/PseudoProbe.cpp
index bd92c604da2c..101cada77ff9 100644
--- a/llvm/lib/IR/PseudoProbe.cpp
+++ b/llvm/lib/IR/PseudoProbe.cpp
@@ -98,12 +98,4 @@ void setProbeDistributionFactor(Instruction &Inst, float Factor) {
}
}
-void addPseudoProbeAttribute(PseudoProbeInst &Inst,
- PseudoProbeAttributes Attr) {
- IRBuilder<> Builder(&Inst);
- uint32_t OldAttr = Inst.getAttributes()->getZExtValue();
- uint32_t NewAttr = OldAttr | (uint32_t)Attr;
- if (OldAttr != NewAttr)
- Inst.replaceUsesOfWith(Inst.getAttributes(), Builder.getInt32(NewAttr));
-}
} // namespace llvm
diff --git a/llvm/lib/IR/ReplaceConstant.cpp b/llvm/lib/IR/ReplaceConstant.cpp
index fd73a1a8e5af..cfd8deba5a53 100644
--- a/llvm/lib/IR/ReplaceConstant.cpp
+++ b/llvm/lib/IR/ReplaceConstant.cpp
@@ -15,15 +15,9 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/ValueMap.h"
namespace llvm {
-// Replace a constant expression by instructions with equivalent operations at
-// a specified location.
-Instruction *createReplacementInstr(ConstantExpr *CE, Instruction *Instr) {
- auto *CEInstr = CE->getAsInstruction();
- CEInstr->insertBefore(Instr);
- return CEInstr;
-}
void convertConstantExprsToInstructions(Instruction *I, ConstantExpr *CE,
SmallPtrSetImpl<Instruction *> *Insts) {
@@ -40,7 +34,8 @@ void convertConstantExprsToInstructions(
Instruction *I,
std::map<Use *, std::vector<std::vector<ConstantExpr *>>> &CEPaths,
SmallPtrSetImpl<Instruction *> *Insts) {
- SmallPtrSet<ConstantExpr *, 8> Visited;
+ ValueMap<ConstantExpr *, Instruction *> Visited;
+
for (Use &U : I->operands()) {
// The operand U is either not a constant expression operand or the
// constant expression paths do not belong to U, ignore U.
@@ -55,24 +50,47 @@ void convertConstantExprsToInstructions(
BI = &(*(BB->getFirstInsertionPt()));
}
- // Go through the paths associated with operand U, and convert all the
- // constant expressions along all paths to corresponding instructions.
+ // Go through all the paths associated with operand U, and convert all the
+ // constant expressions along all the paths to corresponding instructions.
auto *II = I;
auto &Paths = CEPaths[&U];
for (auto &Path : Paths) {
for (auto *CE : Path) {
- if (!Visited.insert(CE).second)
- continue;
- auto *NI = CE->getAsInstruction();
- NI->insertBefore(BI);
+ // Instruction which is equivalent to CE.
+ Instruction *NI = nullptr;
+
+ if (!Visited.count(CE)) {
+        // CE is encountered for the first time: convert it into a
+        // corresponding instruction NI and insert NI before the parent
+        // instruction.
+ NI = CE->getAsInstruction(BI);
+
+ // Mark CE as visited by mapping CE to NI.
+ Visited[CE] = NI;
+
+ // If required collect NI.
+ if (Insts)
+ Insts->insert(NI);
+ } else {
+        // We have already encountered CE; the corresponding instruction
+        // already exists, so use it to replace CE.
+ NI = Visited[CE];
+ }
+
+ assert(NI && "Expected an instruction corresponding to constant "
+ "expression.");
+
+ // Replace all uses of constant expression CE by the corresponding
+ // instruction NI within the current parent instruction.
II->replaceUsesOfWith(CE, NI);
- CE->removeDeadConstantUsers();
BI = II = NI;
- if (Insts)
- Insts->insert(NI);
}
}
}
+
+ // Remove all converted constant expressions which are dead by now.
+ for (auto Item : Visited)
+ Item.first->removeDeadConstantUsers();
}
void collectConstantExprPaths(
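convertConstantExprsToInstructions() now materializes each distinct constant expression at most once per instruction (via the Visited map) and defers removeDeadConstantUsers() until the end. The calling convention is unchanged; a sketch:

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/ReplaceConstant.h"

    using namespace llvm;

    // Expand every constant-expression path from I's operands rooted at CE
    // into real instructions, collecting whatever was created.
    static void expandConstantExpr(Instruction *I, ConstantExpr *CE) {
      SmallPtrSet<Instruction *, 8> NewInsts;
      convertConstantExprsToInstructions(I, CE, &NewInsts);
      // Repeated uses of the same ConstantExpr now share one instruction.
    }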
diff --git a/llvm/lib/IR/Statepoint.cpp b/llvm/lib/IR/Statepoint.cpp
index bbfbbe489bae..b5916e4937c6 100644
--- a/llvm/lib/IR/Statepoint.cpp
+++ b/llvm/lib/IR/Statepoint.cpp
@@ -26,16 +26,14 @@ StatepointDirectives
llvm::parseStatepointDirectivesFromAttrs(AttributeList AS) {
StatepointDirectives Result;
- Attribute AttrID =
- AS.getAttribute(AttributeList::FunctionIndex, "statepoint-id");
+ Attribute AttrID = AS.getFnAttr("statepoint-id");
uint64_t StatepointID;
if (AttrID.isStringAttribute())
if (!AttrID.getValueAsString().getAsInteger(10, StatepointID))
Result.StatepointID = StatepointID;
uint32_t NumPatchBytes;
- Attribute AttrNumPatchBytes = AS.getAttribute(AttributeList::FunctionIndex,
- "statepoint-num-patch-bytes");
+ Attribute AttrNumPatchBytes = AS.getFnAttr("statepoint-num-patch-bytes");
if (AttrNumPatchBytes.isStringAttribute())
if (!AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes))
Result.NumPatchBytes = NumPatchBytes;
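Statepoint directive parsing now goes through AttributeList::getFnAttr() rather than indexing with AttributeList::FunctionIndex. The string-to-integer idiom, isolated as a sketch:

    #include "llvm/ADT/Optional.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/IR/Attributes.h"

    using namespace llvm;

    // Read an integer-valued string function attribute such as
    // "statepoint-id"; getAsInteger() returns true on parse failure.
    static Optional<uint64_t> readIntFnAttr(AttributeList AL, StringRef Name) {
      Attribute A = AL.getFnAttr(Name);
      uint64_t V;
      if (A.isStringAttribute() && !A.getValueAsString().getAsInteger(10, V))
        return V;
      return None;
    }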
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index a21998976066..d59d87ad631b 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -66,6 +66,44 @@ bool Type::isOpaquePointerTy() const {
return false;
}
+const fltSemantics &Type::getFltSemantics() const {
+ switch (getTypeID()) {
+ case HalfTyID: return APFloat::IEEEhalf();
+ case BFloatTyID: return APFloat::BFloat();
+ case FloatTyID: return APFloat::IEEEsingle();
+ case DoubleTyID: return APFloat::IEEEdouble();
+ case X86_FP80TyID: return APFloat::x87DoubleExtended();
+ case FP128TyID: return APFloat::IEEEquad();
+ case PPC_FP128TyID: return APFloat::PPCDoubleDouble();
+ default: llvm_unreachable("Invalid floating type");
+ }
+}
+
+bool Type::isIEEE() const {
+ return APFloat::getZero(getFltSemantics()).isIEEE();
+}
+
+Type *Type::getFloatingPointTy(LLVMContext &C, const fltSemantics &S) {
+ Type *Ty;
+ if (&S == &APFloat::IEEEhalf())
+ Ty = Type::getHalfTy(C);
+ else if (&S == &APFloat::BFloat())
+ Ty = Type::getBFloatTy(C);
+ else if (&S == &APFloat::IEEEsingle())
+ Ty = Type::getFloatTy(C);
+ else if (&S == &APFloat::IEEEdouble())
+ Ty = Type::getDoubleTy(C);
+ else if (&S == &APFloat::x87DoubleExtended())
+ Ty = Type::getX86_FP80Ty(C);
+ else if (&S == &APFloat::IEEEquad())
+ Ty = Type::getFP128Ty(C);
+ else {
+ assert(&S == &APFloat::PPCDoubleDouble() && "Unknown FP format");
+ Ty = Type::getPPC_FP128Ty(C);
+ }
+ return Ty;
+}
+
bool Type::canLosslesslyBitCastTo(Type *Ty) const {
// Identity cast means no change so return true
if (this == Ty)
@@ -296,9 +334,7 @@ IntegerType *IntegerType::get(LLVMContext &C, unsigned NumBits) {
return Entry;
}
-APInt IntegerType::getMask() const {
- return APInt::getAllOnesValue(getBitWidth());
-}
+APInt IntegerType::getMask() const { return APInt::getAllOnes(getBitWidth()); }
//===----------------------------------------------------------------------===//
// FunctionType Implementation
@@ -696,8 +732,8 @@ PointerType *PointerType::get(Type *EltTy, unsigned AddressSpace) {
LLVMContextImpl *CImpl = EltTy->getContext().pImpl;
- // Create opaque pointer for pointer to opaque pointer.
- if (CImpl->ForceOpaquePointers || EltTy->isOpaquePointerTy())
+ // Automatically convert typed pointers to opaque pointers.
+ if (CImpl->getOpaquePointers())
return get(EltTy->getContext(), AddressSpace);
// Since AddressSpace #0 is the common case, we special case it.
@@ -711,6 +747,8 @@ PointerType *PointerType::get(Type *EltTy, unsigned AddressSpace) {
PointerType *PointerType::get(LLVMContext &C, unsigned AddressSpace) {
LLVMContextImpl *CImpl = C.pImpl;
+ assert(CImpl->getOpaquePointers() &&
+ "Can only create opaque pointers in opaque pointer mode");
// Since AddressSpace #0 is the common case, we special case it.
PointerType *&Entry =
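The two new Type helpers are inverses of each other for the seven supported floating-point formats. A quick sketch, assuming the llvm namespace and an LLVMContext Ctx:

Type *F32 = Type::getFloatTy(Ctx);
const fltSemantics &Sem = F32->getFltSemantics();   // APFloat::IEEEsingle()
Type *RoundTrip = Type::getFloatingPointTy(Ctx, Sem);
assert(RoundTrip == F32 && "same context, so the same uniqued type");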
diff --git a/llvm/lib/IR/TypeFinder.cpp b/llvm/lib/IR/TypeFinder.cpp
index 724b8f6b6ad2..1f757d7dbf4e 100644
--- a/llvm/lib/IR/TypeFinder.cpp
+++ b/llvm/lib/IR/TypeFinder.cpp
@@ -106,11 +106,9 @@ void TypeFinder::incorporateType(Type *Ty) {
StructTypes.push_back(STy);
// Add all unvisited subtypes to worklist for processing
- for (Type::subtype_reverse_iterator I = Ty->subtype_rbegin(),
- E = Ty->subtype_rend();
- I != E; ++I)
- if (VisitedTypes.insert(*I).second)
- TypeWorklist.push_back(*I);
+ for (Type *SubTy : llvm::reverse(Ty->subtypes()))
+ if (VisitedTypes.insert(SubTy).second)
+ TypeWorklist.push_back(SubTy);
} while (!TypeWorklist.empty());
}
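llvm::reverse from ADT/STLExtras.h adapts any range with rbegin/rend (or builds reverse iterators for plain ranges), which is what lets the explicit subtype_reverse_iterator loop collapse into a range-for. A trivial sketch:

#include "llvm/ADT/STLExtras.h"
#include <vector>

void visitReversed(const std::vector<int> &V) {
  for (int X : llvm::reverse(V)) {
    // Visits elements back to front, e.g. 3, 2, 1 for {1, 2, 3}.
    (void)X;
  }
}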
diff --git a/llvm/lib/IR/User.cpp b/llvm/lib/IR/User.cpp
index 8837151f2e18..68489075cd88 100644
--- a/llvm/lib/IR/User.cpp
+++ b/llvm/lib/IR/User.cpp
@@ -107,7 +107,7 @@ MutableArrayRef<uint8_t> User::getDescriptor() {
}
bool User::isDroppable() const {
- return isa<AssumeInst>(this);
+ return isa<AssumeInst>(this) || isa<PseudoProbeInst>(this);
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp
index 1c595651b3d7..b475c8327874 100644
--- a/llvm/lib/IR/Value.cpp
+++ b/llvm/lib/IR/Value.cpp
@@ -176,6 +176,18 @@ Use *Value::getSingleUndroppableUse() {
return Result;
}
+User *Value::getUniqueUndroppableUser() {
+ User *Result = nullptr;
+ for (auto *U : users()) {
+ if (!U->isDroppable()) {
+ if (Result && Result != U)
+ return nullptr;
+ Result = U;
+ }
+ }
+ return Result;
+}
+
bool Value::hasNUndroppableUses(unsigned int N) const {
return hasNItems(user_begin(), user_end(), N, isUnDroppableUser);
}
@@ -534,9 +546,7 @@ void Value::replaceUsesWithIf(Value *New,
SmallVector<TrackingVH<Constant>, 8> Consts;
SmallPtrSet<Constant *, 8> Visited;
- for (use_iterator UI = use_begin(), E = use_end(); UI != E;) {
- Use &U = *UI;
- ++UI;
+ for (Use &U : llvm::make_early_inc_range(uses())) {
if (!ShouldReplace(U))
continue;
// Must handle Constants specially, we cannot call replaceUsesOfWith on a
@@ -694,6 +704,7 @@ const Value *Value::stripPointerCastsForAliasAnalysis() const {
const Value *Value::stripAndAccumulateConstantOffsets(
const DataLayout &DL, APInt &Offset, bool AllowNonInbounds,
+ bool AllowInvariantGroup,
function_ref<bool(Value &, APInt &)> ExternalAnalysis) const {
if (!getType()->isPtrOrPtrVectorTy())
return this;
@@ -753,6 +764,8 @@ const Value *Value::stripAndAccumulateConstantOffsets(
} else if (const auto *Call = dyn_cast<CallBase>(V)) {
if (const Value *RV = Call->getReturnedArgOperand())
V = RV;
+ if (AllowInvariantGroup && Call->isLaunderOrStripInvariantGroup())
+ V = Call->getArgOperand(0);
}
assert(V->getType()->isPtrOrPtrVectorTy() && "Unexpected operand type!");
} while (Visited.insert(V).second);
@@ -852,10 +865,9 @@ uint64_t Value::getPointerDereferenceableBytes(const DataLayout &DL,
CanBeNull = true;
}
} else if (const auto *Call = dyn_cast<CallBase>(this)) {
- DerefBytes = Call->getDereferenceableBytes(AttributeList::ReturnIndex);
+ DerefBytes = Call->getRetDereferenceableBytes();
if (DerefBytes == 0) {
- DerefBytes =
- Call->getDereferenceableOrNullBytes(AttributeList::ReturnIndex);
+ DerefBytes = Call->getRetDereferenceableOrNullBytes();
CanBeNull = true;
}
} else if (const LoadInst *LI = dyn_cast<LoadInst>(this)) {
@@ -1014,8 +1026,7 @@ bool Value::isTransitiveUsedByMetadataOnly() const {
llvm::SmallPtrSet<const User *, 32> Visited;
WorkList.insert(WorkList.begin(), user_begin(), user_end());
while (!WorkList.empty()) {
- const User *U = WorkList.back();
- WorkList.pop_back();
+ const User *U = WorkList.pop_back_val();
Visited.insert(U);
// If it is transitively used by a global value or a non-constant value,
// it's obviously not only used by metadata.
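llvm::make_early_inc_range, used in replaceUsesWithIf above, advances the iterator before each loop body runs, so the current Use can be replaced (which unlinks it from the use list) without invalidating the traversal. A sketch, where V, NewV and ShouldReplace are placeholders:

for (Use &U : llvm::make_early_inc_range(V->uses()))
  if (ShouldReplace(U))
    U.set(NewV); // safe: the range already moved past U before the body ran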
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 758205a39eb3..dc4370d4b6ed 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -415,15 +415,18 @@ public:
for (const GlobalAlias &GA : M.aliases())
visitGlobalAlias(GA);
+ for (const GlobalIFunc &GI : M.ifuncs())
+ visitGlobalIFunc(GI);
+
for (const NamedMDNode &NMD : M.named_metadata())
visitNamedMDNode(NMD);
for (const StringMapEntry<Comdat> &SMEC : M.getComdatSymbolTable())
visitComdat(SMEC.getValue());
- visitModuleFlags(M);
- visitModuleIdents(M);
- visitModuleCommandLines(M);
+ visitModuleFlags();
+ visitModuleIdents();
+ visitModuleCommandLines();
verifyCompileUnits();
@@ -440,6 +443,7 @@ private:
void visitGlobalValue(const GlobalValue &GV);
void visitGlobalVariable(const GlobalVariable &GV);
void visitGlobalAlias(const GlobalAlias &GA);
+ void visitGlobalIFunc(const GlobalIFunc &GI);
void visitAliaseeSubExpr(const GlobalAlias &A, const Constant &C);
void visitAliaseeSubExpr(SmallPtrSetImpl<const GlobalAlias *> &Visited,
const GlobalAlias &A, const Constant &C);
@@ -448,9 +452,9 @@ private:
void visitMetadataAsValue(const MetadataAsValue &MD, Function *F);
void visitValueAsMetadata(const ValueAsMetadata &MD, Function *F);
void visitComdat(const Comdat &C);
- void visitModuleIdents(const Module &M);
- void visitModuleCommandLines(const Module &M);
- void visitModuleFlags(const Module &M);
+ void visitModuleIdents();
+ void visitModuleCommandLines();
+ void visitModuleFlags();
void visitModuleFlag(const MDNode *Op,
DenseMap<const MDString *, const MDNode *> &SeenIDs,
SmallVectorImpl<const MDNode *> &Requirements);
@@ -461,6 +465,8 @@ private:
void visitDereferenceableMetadata(Instruction &I, MDNode *MD);
void visitProfMetadata(Instruction &I, MDNode *MD);
void visitAnnotationMetadata(MDNode *Annotation);
+ void visitAliasScopeMetadata(const MDNode *MD);
+ void visitAliasScopeListMetadata(const MDNode *MD);
template <class Ty> bool isValidMetadataArray(const MDTuple &N);
#define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS) void visit##CLASS(const CLASS &N);
@@ -547,6 +553,8 @@ private:
void verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
const Value *V, bool IsIntrinsic);
void verifyFunctionMetadata(ArrayRef<std::pair<unsigned, MDNode *>> MDs);
+ template <typename T>
+ void verifyODRTypeAsScopeOperand(const MDNode &MD, T * = nullptr);
void visitConstantExprsRecursively(const Constant *EntryC);
void visitConstantExpr(const ConstantExpr *CE);
@@ -569,6 +577,9 @@ private:
/// declarations share the same calling convention.
void verifyDeoptimizeCallingConvs();
+ void verifyAttachedCallBundle(const CallBase &Call,
+ const OperandBundleUse &BU);
+
/// Verify all-or-nothing property of DIFile source attribute within a CU.
void verifySourceDebugInfo(const DICompileUnit &U, const DIFile &F);
@@ -816,6 +827,21 @@ void Verifier::visitGlobalAlias(const GlobalAlias &GA) {
visitGlobalValue(GA);
}
+void Verifier::visitGlobalIFunc(const GlobalIFunc &GI) {
+ // Pierce through ConstantExprs and GlobalAliases and check that the resolver
+ // has a Function
+ const Function *Resolver = GI.getResolverFunction();
+ Assert(Resolver, "IFunc must have a Function resolver", &GI);
+
+ // Check that the immediate resolver operand (prior to any bitcasts) has the
+ // correct type
+ const Type *ResolverTy = GI.getResolver()->getType();
+ const Type *ResolverFuncTy =
+ GlobalIFunc::getResolverFunctionType(GI.getValueType());
+ Assert(ResolverTy == ResolverFuncTy->getPointerTo(),
+ "IFunc resolver has incorrect type", &GI);
+}
+
void Verifier::visitNamedMDNode(const NamedMDNode &NMD) {
// There used to be various other llvm.dbg.* nodes, but we don't support
// upgrading them and we want to reserve the namespace for future uses.
@@ -834,6 +860,19 @@ void Verifier::visitNamedMDNode(const NamedMDNode &NMD) {
}
}
+template <typename T>
+void Verifier::verifyODRTypeAsScopeOperand(const MDNode &MD, T *) {
+ if (isa<T>(MD)) {
+ if (auto *N = dyn_cast_or_null<DICompositeType>(cast<T>(MD).getScope()))
+ // Of all the tags supported for DICompositeType (see visitDICompositeType),
+ // we know that an enum type cannot be a scope.
+ AssertDI(N->getTag() != dwarf::DW_TAG_enumeration_type,
+ "enum type is not a scope; check enum type ODR "
+ "violation",
+ N, &MD);
+ }
+}
+
void Verifier::visitMDNode(const MDNode &MD, AreDebugLocsAllowed AllowLocs) {
// Only visit each node once. Metadata can be mutually recursive, so this
// avoids infinite recursion here, as well as being an optimization.
@@ -843,6 +882,12 @@ void Verifier::visitMDNode(const MDNode &MD, AreDebugLocsAllowed AllowLocs) {
Assert(&MD.getContext() == &Context,
"MDNode context does not match Module context!", &MD);
+ // Make sure that when a scope operand is an ODR type, ODR type uniquing does
+ // not create invalid debug metadata.
+ // TODO: check that the non-ODR-type scope operand is valid.
+ verifyODRTypeAsScopeOperand<DIType>(MD);
+ verifyODRTypeAsScopeOperand<DILocalScope>(MD);
+
switch (MD.getMetadataID()) {
default:
llvm_unreachable("Invalid MDNode subclass");
@@ -1091,7 +1136,8 @@ void Verifier::visitDICompositeType(const DICompositeType &N) {
N.getTag() == dwarf::DW_TAG_union_type ||
N.getTag() == dwarf::DW_TAG_enumeration_type ||
N.getTag() == dwarf::DW_TAG_class_type ||
- N.getTag() == dwarf::DW_TAG_variant_part,
+ N.getTag() == dwarf::DW_TAG_variant_part ||
+ N.getTag() == dwarf::DW_TAG_namelist,
"invalid tag", &N);
AssertDI(isScope(N.getRawScope()), "invalid scope", &N, N.getRawScope());
@@ -1470,7 +1516,7 @@ void Verifier::visitComdat(const Comdat &C) {
"comdat global value has private linkage", GV);
}
-void Verifier::visitModuleIdents(const Module &M) {
+void Verifier::visitModuleIdents() {
const NamedMDNode *Idents = M.getNamedMetadata("llvm.ident");
if (!Idents)
return;
@@ -1487,7 +1533,7 @@ void Verifier::visitModuleIdents(const Module &M) {
}
}
-void Verifier::visitModuleCommandLines(const Module &M) {
+void Verifier::visitModuleCommandLines() {
const NamedMDNode *CommandLines = M.getNamedMetadata("llvm.commandline");
if (!CommandLines)
return;
@@ -1505,7 +1551,7 @@ void Verifier::visitModuleCommandLines(const Module &M) {
}
}
-void Verifier::visitModuleFlags(const Module &M) {
+void Verifier::visitModuleFlags() {
const NamedMDNode *Flags = M.getModuleFlagsMetadata();
if (!Flags) return;
@@ -1824,9 +1870,8 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty,
void Verifier::checkUnsignedBaseTenFuncAttr(AttributeList Attrs, StringRef Attr,
const Value *V) {
- if (Attrs.hasFnAttribute(Attr)) {
- StringRef S = Attrs.getAttribute(AttributeList::FunctionIndex, Attr)
- .getValueAsString();
+ if (Attrs.hasFnAttr(Attr)) {
+ StringRef S = Attrs.getFnAttr(Attr).getValueAsString();
unsigned N;
if (S.getAsInteger(10, N))
CheckFailed("\"" + Attr + "\" takes an unsigned integer: " + S, V);
@@ -1861,7 +1906,7 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
bool SawSwiftError = false;
// Verify return value attributes.
- AttributeSet RetAttrs = Attrs.getRetAttributes();
+ AttributeSet RetAttrs = Attrs.getRetAttrs();
for (Attribute RetAttr : RetAttrs)
Assert(RetAttr.isStringAttribute() ||
Attribute::canUseAsRetAttr(RetAttr.getKindAsEnum()),
@@ -1874,7 +1919,7 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
// Verify parameter attributes.
for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) {
Type *Ty = FT->getParamType(i);
- AttributeSet ArgAttrs = Attrs.getParamAttributes(i);
+ AttributeSet ArgAttrs = Attrs.getParamAttrs(i);
if (!IsIntrinsic) {
Assert(!ArgAttrs.hasAttribute(Attribute::ImmArg),
@@ -1928,63 +1973,63 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
}
}
- if (!Attrs.hasAttributes(AttributeList::FunctionIndex))
+ if (!Attrs.hasFnAttrs())
return;
- verifyAttributeTypes(Attrs.getFnAttributes(), V);
- for (Attribute FnAttr : Attrs.getFnAttributes())
+ verifyAttributeTypes(Attrs.getFnAttrs(), V);
+ for (Attribute FnAttr : Attrs.getFnAttrs())
Assert(FnAttr.isStringAttribute() ||
Attribute::canUseAsFnAttr(FnAttr.getKindAsEnum()),
"Attribute '" + FnAttr.getAsString() +
"' does not apply to functions!",
V);
- Assert(!(Attrs.hasFnAttribute(Attribute::ReadNone) &&
- Attrs.hasFnAttribute(Attribute::ReadOnly)),
+ Assert(!(Attrs.hasFnAttr(Attribute::ReadNone) &&
+ Attrs.hasFnAttr(Attribute::ReadOnly)),
"Attributes 'readnone and readonly' are incompatible!", V);
- Assert(!(Attrs.hasFnAttribute(Attribute::ReadNone) &&
- Attrs.hasFnAttribute(Attribute::WriteOnly)),
+ Assert(!(Attrs.hasFnAttr(Attribute::ReadNone) &&
+ Attrs.hasFnAttr(Attribute::WriteOnly)),
"Attributes 'readnone and writeonly' are incompatible!", V);
- Assert(!(Attrs.hasFnAttribute(Attribute::ReadOnly) &&
- Attrs.hasFnAttribute(Attribute::WriteOnly)),
+ Assert(!(Attrs.hasFnAttr(Attribute::ReadOnly) &&
+ Attrs.hasFnAttr(Attribute::WriteOnly)),
"Attributes 'readonly and writeonly' are incompatible!", V);
- Assert(!(Attrs.hasFnAttribute(Attribute::ReadNone) &&
- Attrs.hasFnAttribute(Attribute::InaccessibleMemOrArgMemOnly)),
+ Assert(!(Attrs.hasFnAttr(Attribute::ReadNone) &&
+ Attrs.hasFnAttr(Attribute::InaccessibleMemOrArgMemOnly)),
"Attributes 'readnone and inaccessiblemem_or_argmemonly' are "
"incompatible!",
V);
- Assert(!(Attrs.hasFnAttribute(Attribute::ReadNone) &&
- Attrs.hasFnAttribute(Attribute::InaccessibleMemOnly)),
+ Assert(!(Attrs.hasFnAttr(Attribute::ReadNone) &&
+ Attrs.hasFnAttr(Attribute::InaccessibleMemOnly)),
"Attributes 'readnone and inaccessiblememonly' are incompatible!", V);
- Assert(!(Attrs.hasFnAttribute(Attribute::NoInline) &&
- Attrs.hasFnAttribute(Attribute::AlwaysInline)),
+ Assert(!(Attrs.hasFnAttr(Attribute::NoInline) &&
+ Attrs.hasFnAttr(Attribute::AlwaysInline)),
"Attributes 'noinline and alwaysinline' are incompatible!", V);
- if (Attrs.hasFnAttribute(Attribute::OptimizeNone)) {
- Assert(Attrs.hasFnAttribute(Attribute::NoInline),
+ if (Attrs.hasFnAttr(Attribute::OptimizeNone)) {
+ Assert(Attrs.hasFnAttr(Attribute::NoInline),
"Attribute 'optnone' requires 'noinline'!", V);
- Assert(!Attrs.hasFnAttribute(Attribute::OptimizeForSize),
+ Assert(!Attrs.hasFnAttr(Attribute::OptimizeForSize),
"Attributes 'optsize and optnone' are incompatible!", V);
- Assert(!Attrs.hasFnAttribute(Attribute::MinSize),
+ Assert(!Attrs.hasFnAttr(Attribute::MinSize),
"Attributes 'minsize and optnone' are incompatible!", V);
}
- if (Attrs.hasFnAttribute(Attribute::JumpTable)) {
+ if (Attrs.hasFnAttr(Attribute::JumpTable)) {
const GlobalValue *GV = cast<GlobalValue>(V);
Assert(GV->hasGlobalUnnamedAddr(),
"Attribute 'jumptable' requires 'unnamed_addr'", V);
}
- if (Attrs.hasFnAttribute(Attribute::AllocSize)) {
+ if (Attrs.hasFnAttr(Attribute::AllocSize)) {
std::pair<unsigned, Optional<unsigned>> Args =
- Attrs.getAllocSizeArgs(AttributeList::FunctionIndex);
+ Attrs.getFnAttrs().getAllocSizeArgs();
auto CheckParam = [&](StringRef Name, unsigned ParamNo) {
if (ParamNo >= FT->getNumParams()) {
@@ -2009,17 +2054,16 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
return;
}
- if (Attrs.hasFnAttribute(Attribute::VScaleRange)) {
+ if (Attrs.hasFnAttr(Attribute::VScaleRange)) {
std::pair<unsigned, unsigned> Args =
- Attrs.getVScaleRangeArgs(AttributeList::FunctionIndex);
+ Attrs.getFnAttrs().getVScaleRangeArgs();
if (Args.first > Args.second && Args.second != 0)
CheckFailed("'vscale_range' minimum cannot be greater than maximum", V);
}
- if (Attrs.hasFnAttribute("frame-pointer")) {
- StringRef FP = Attrs.getAttribute(AttributeList::FunctionIndex,
- "frame-pointer").getValueAsString();
+ if (Attrs.hasFnAttr("frame-pointer")) {
+ StringRef FP = Attrs.getFnAttr("frame-pointer").getValueAsString();
if (FP != "all" && FP != "non-leaf" && FP != "none")
CheckFailed("invalid value for 'frame-pointer' attribute: " + FP, V);
}
@@ -2168,7 +2212,7 @@ void Verifier::verifyStatepoint(const CallBase &Call) {
Call);
if (TargetFuncType->isVarArg()) {
- AttributeSet ArgAttrs = Attrs.getParamAttributes(5 + i);
+ AttributeSet ArgAttrs = Attrs.getParamAttrs(5 + i);
Assert(!ArgAttrs.hasAttribute(Attribute::StructRet),
"Attribute 'sret' cannot be used for vararg call arguments!",
Call);
@@ -2334,7 +2378,7 @@ void Verifier::visitFunction(const Function &F) {
// On function declarations/definitions, we do not support the builtin
// attribute. We do not check this in VerifyFunctionAttrs since that is
// checking for Attributes that can/can not ever be on functions.
- Assert(!Attrs.hasFnAttribute(Attribute::Builtin),
+ Assert(!Attrs.hasFnAttr(Attribute::Builtin),
"Attribute 'builtin' can only be applied to a callsite.", &F);
Assert(!Attrs.hasAttrSomewhere(Attribute::ElementType),
@@ -2348,7 +2392,7 @@ void Verifier::visitFunction(const Function &F) {
case CallingConv::C:
break;
case CallingConv::X86_INTR: {
- Assert(F.arg_empty() || Attrs.hasParamAttribute(0, Attribute::ByVal),
+ Assert(F.arg_empty() || Attrs.hasParamAttr(0, Attribute::ByVal),
"Calling convention parameter requires byval", &F);
break;
}
@@ -2368,14 +2412,14 @@ void Verifier::visitFunction(const Function &F) {
const unsigned StackAS = DL.getAllocaAddrSpace();
unsigned i = 0;
for (const Argument &Arg : F.args()) {
- Assert(!Attrs.hasParamAttribute(i, Attribute::ByVal),
+ Assert(!Attrs.hasParamAttr(i, Attribute::ByVal),
"Calling convention disallows byval", &F);
- Assert(!Attrs.hasParamAttribute(i, Attribute::Preallocated),
+ Assert(!Attrs.hasParamAttr(i, Attribute::Preallocated),
"Calling convention disallows preallocated", &F);
- Assert(!Attrs.hasParamAttribute(i, Attribute::InAlloca),
+ Assert(!Attrs.hasParamAttr(i, Attribute::InAlloca),
"Calling convention disallows inalloca", &F);
- if (Attrs.hasParamAttribute(i, Attribute::ByRef)) {
+ if (Attrs.hasParamAttr(i, Attribute::ByRef)) {
// FIXME: Should also disallow LDS and GDS, but we don't have the enum
// value here.
Assert(Arg.getType()->getPointerAddressSpace() != StackAS,
@@ -2416,7 +2460,7 @@ void Verifier::visitFunction(const Function &F) {
}
// Check that swifterror argument is only used by loads and stores.
- if (Attrs.hasParamAttribute(i, Attribute::SwiftError)) {
+ if (Attrs.hasParamAttr(i, Attribute::SwiftError)) {
verifySwiftErrorValue(&Arg);
}
++i;
@@ -2523,7 +2567,8 @@ void Verifier::visitFunction(const Function &F) {
// uses.
if (F.isIntrinsic() && F.getParent()->isMaterialized()) {
const User *U;
- if (F.hasAddressTaken(&U))
+ if (F.hasAddressTaken(&U, false, true, false,
+ /*IgnoreARCAttachedCall=*/true))
Assert(false, "Invalid user of intrinsic instruction!", U);
}
@@ -2693,6 +2738,7 @@ void Verifier::visitReturnInst(ReturnInst &RI) {
}
void Verifier::visitSwitchInst(SwitchInst &SI) {
+ Assert(SI.getType()->isVoidTy(), "Switch must have void result type!", &SI);
// Check to make sure that all of the constants in the switch instruction
// have the same type as the switched-on value.
Type *SwitchTy = SI.getCondition()->getType();
@@ -2726,7 +2772,7 @@ void Verifier::visitCallBrInst(CallBrInst &CBI) {
Assert(CBI.getSuccessor(i)->getType()->isLabelTy(),
"Callbr successors must all have pointer type!", &CBI);
for (unsigned i = 0, e = CBI.getNumOperands(); i != e; ++i) {
- Assert(i >= CBI.getNumArgOperands() || !isa<BasicBlock>(CBI.getOperand(i)),
+ Assert(i >= CBI.arg_size() || !isa<BasicBlock>(CBI.getOperand(i)),
"Using an unescaped label as a callbr argument!", &CBI);
if (isa<BasicBlock>(CBI.getOperand(i)))
for (unsigned j = i + 1; j != e; ++j)
@@ -3071,14 +3117,14 @@ void Verifier::visitCallBase(CallBase &Call) {
Assert(Callee->getValueType() == FTy,
"Intrinsic called with incompatible signature", Call);
- if (Attrs.hasFnAttribute(Attribute::Speculatable)) {
+ if (Attrs.hasFnAttr(Attribute::Speculatable)) {
// Don't allow speculatable on call sites, unless the underlying function
// declaration is also speculatable.
Assert(Callee && Callee->isSpeculatable(),
"speculatable attribute may not apply to call sites", Call);
}
- if (Attrs.hasFnAttribute(Attribute::Preallocated)) {
+ if (Attrs.hasFnAttr(Attribute::Preallocated)) {
Assert(Call.getCalledFunction()->getIntrinsicID() ==
Intrinsic::call_preallocated_arg,
"preallocated as a call site attribute can only be on "
@@ -3118,7 +3164,7 @@ void Verifier::visitCallBase(CallBase &Call) {
Call);
}
- if (Attrs.hasParamAttribute(i, Attribute::ImmArg)) {
+ if (Attrs.hasParamAttr(i, Attribute::ImmArg)) {
// Don't allow immarg on call sites, unless the underlying declaration
// also has the matching immarg.
Assert(Callee && Callee->hasParamAttribute(i, Attribute::ImmArg),
@@ -3150,16 +3196,16 @@ void Verifier::visitCallBase(CallBase &Call) {
bool SawReturned = false;
for (unsigned Idx = 0; Idx < FTy->getNumParams(); ++Idx) {
- if (Attrs.hasParamAttribute(Idx, Attribute::Nest))
+ if (Attrs.hasParamAttr(Idx, Attribute::Nest))
SawNest = true;
- if (Attrs.hasParamAttribute(Idx, Attribute::Returned))
+ if (Attrs.hasParamAttr(Idx, Attribute::Returned))
SawReturned = true;
}
// Check attributes on the varargs part.
for (unsigned Idx = FTy->getNumParams(); Idx < Call.arg_size(); ++Idx) {
Type *Ty = Call.getArgOperand(Idx)->getType();
- AttributeSet ArgAttrs = Attrs.getParamAttributes(Idx);
+ AttributeSet ArgAttrs = Attrs.getParamAttrs(Idx);
verifyParameterAttrs(ArgAttrs, Ty, &Call);
if (ArgAttrs.hasAttribute(Attribute::Nest)) {
@@ -3265,17 +3311,10 @@ void Verifier::visitCallBase(CallBase &Call) {
Assert(!FoundAttachedCallBundle,
"Multiple \"clang.arc.attachedcall\" operand bundles", Call);
FoundAttachedCallBundle = true;
+ verifyAttachedCallBundle(Call, BU);
}
}
- if (FoundAttachedCallBundle)
- Assert((FTy->getReturnType()->isPointerTy() ||
- (Call.doesNotReturn() && FTy->getReturnType()->isVoidTy())),
- "a call with operand bundle \"clang.arc.attachedcall\" must call a "
- "function returning a pointer or a non-returning function that has "
- "a void return type",
- Call);
-
// Verify that each inlinable callsite of a debug-info-bearing function in a
// debug-info-bearing function has a debug location attached to it. Failure to
// do so causes assertion failures when the inliner sets up inline scope info.
@@ -3315,7 +3354,7 @@ static bool isTypeCongruent(Type *L, Type *R) {
return PL->getAddressSpace() == PR->getAddressSpace();
}
-static AttrBuilder getParameterABIAttributes(int I, AttributeList Attrs) {
+static AttrBuilder getParameterABIAttributes(unsigned I, AttributeList Attrs) {
static const Attribute::AttrKind ABIAttrs[] = {
Attribute::StructRet, Attribute::ByVal, Attribute::InAlloca,
Attribute::InReg, Attribute::StackAlignment, Attribute::SwiftSelf,
@@ -3323,15 +3362,15 @@ static AttrBuilder getParameterABIAttributes(int I, AttributeList Attrs) {
Attribute::ByRef};
AttrBuilder Copy;
for (auto AK : ABIAttrs) {
- Attribute Attr = Attrs.getParamAttributes(I).getAttribute(AK);
+ Attribute Attr = Attrs.getParamAttrs(I).getAttribute(AK);
if (Attr.isValid())
Copy.addAttribute(Attr);
}
// `align` is ABI-affecting only in combination with `byval` or `byref`.
- if (Attrs.hasParamAttribute(I, Attribute::Alignment) &&
- (Attrs.hasParamAttribute(I, Attribute::ByVal) ||
- Attrs.hasParamAttribute(I, Attribute::ByRef)))
+ if (Attrs.hasParamAttr(I, Attribute::Alignment) &&
+ (Attrs.hasParamAttr(I, Attribute::ByVal) ||
+ Attrs.hasParamAttr(I, Attribute::ByRef)))
Copy.addAlignmentAttr(Attrs.getParamAlignment(I));
return Copy;
}
@@ -3383,12 +3422,12 @@ void Verifier::verifyMustTailCall(CallInst &CI) {
// - Only sret, byval, swiftself, and swiftasync ABI-impacting attributes
// are allowed in swifttailcc call
- for (int I = 0, E = CallerTy->getNumParams(); I != E; ++I) {
+ for (unsigned I = 0, E = CallerTy->getNumParams(); I != E; ++I) {
AttrBuilder ABIAttrs = getParameterABIAttributes(I, CallerAttrs);
SmallString<32> Context{CCName, StringRef(" musttail caller")};
verifyTailCCMustTailAttrs(ABIAttrs, Context);
}
- for (int I = 0, E = CalleeTy->getNumParams(); I != E; ++I) {
+ for (unsigned I = 0, E = CalleeTy->getNumParams(); I != E; ++I) {
AttrBuilder ABIAttrs = getParameterABIAttributes(I, CalleeAttrs);
SmallString<32> Context{CCName, StringRef(" musttail callee")};
verifyTailCCMustTailAttrs(ABIAttrs, Context);
@@ -3406,7 +3445,7 @@ void Verifier::verifyMustTailCall(CallInst &CI) {
Assert(CallerTy->getNumParams() == CalleeTy->getNumParams(),
"cannot guarantee tail call due to mismatched parameter counts",
&CI);
- for (int I = 0, E = CallerTy->getNumParams(); I != E; ++I) {
+ for (unsigned I = 0, E = CallerTy->getNumParams(); I != E; ++I) {
Assert(
isTypeCongruent(CallerTy->getParamType(I), CalleeTy->getParamType(I)),
"cannot guarantee tail call due to mismatched parameter types", &CI);
@@ -3415,7 +3454,7 @@ void Verifier::verifyMustTailCall(CallInst &CI) {
// - All ABI-impacting function attributes, such as sret, byval, inreg,
// returned, preallocated, and inalloca, must match.
- for (int I = 0, E = CallerTy->getNumParams(); I != E; ++I) {
+ for (unsigned I = 0, E = CallerTy->getNumParams(); I != E; ++I) {
AttrBuilder CallerABIAttrs = getParameterABIAttributes(I, CallerAttrs);
AttrBuilder CalleeABIAttrs = getParameterABIAttributes(I, CalleeAttrs);
Assert(CallerABIAttrs == CalleeABIAttrs,
@@ -4347,6 +4386,38 @@ void Verifier::visitAnnotationMetadata(MDNode *Annotation) {
Assert(isa<MDString>(Op.get()), "operands must be strings");
}
+void Verifier::visitAliasScopeMetadata(const MDNode *MD) {
+ unsigned NumOps = MD->getNumOperands();
+ Assert(NumOps >= 2 && NumOps <= 3, "scope must have two or three operands",
+ MD);
+ Assert(MD->getOperand(0).get() == MD || isa<MDString>(MD->getOperand(0)),
+ "first scope operand must be self-referential or string", MD);
+ if (NumOps == 3)
+ Assert(isa<MDString>(MD->getOperand(2)),
+ "third scope operand must be string (if used)", MD);
+
+ MDNode *Domain = dyn_cast<MDNode>(MD->getOperand(1));
+ Assert(Domain != nullptr, "second scope operand must be MDNode", MD);
+
+ unsigned NumDomainOps = Domain->getNumOperands();
+ Assert(NumDomainOps >= 1 && NumDomainOps <= 2,
+ "domain must have one or two operands", Domain);
+ Assert(Domain->getOperand(0).get() == Domain ||
+ isa<MDString>(Domain->getOperand(0)),
+ "first domain operand must be self-referential or string", Domain);
+ if (NumDomainOps == 2)
+ Assert(isa<MDString>(Domain->getOperand(1)),
+ "second domain operand must be string (if used)", Domain);
+}
+
+void Verifier::visitAliasScopeListMetadata(const MDNode *MD) {
+ for (const MDOperand &Op : MD->operands()) {
+ const MDNode *OpMD = dyn_cast<MDNode>(Op);
+ Assert(OpMD != nullptr, "scope list must consist of MDNodes", MD);
+ visitAliasScopeMetadata(OpMD);
+ }
+}
+
/// verifyInstruction - Verify that an instruction is well formed.
///
void Verifier::visitInstruction(Instruction &I) {
@@ -4403,10 +4474,21 @@ void Verifier::visitInstruction(Instruction &I) {
}
if (Function *F = dyn_cast<Function>(I.getOperand(i))) {
+ // This code checks whether the function is used as the operand of a
+ // clang_arc_attachedcall operand bundle.
+ auto IsAttachedCallOperand = [](Function *F, const CallBase *CBI,
+ int Idx) {
+ return CBI && CBI->isOperandBundleOfType(
+ LLVMContext::OB_clang_arc_attachedcall, Idx);
+ };
+
// Check to make sure that the "address of" an intrinsic function is never
- // taken.
- Assert(!F->isIntrinsic() ||
- (CBI && &CBI->getCalledOperandUse() == &I.getOperandUse(i)),
+ // taken. Ignore cases where the address of the intrinsic function is used
+ // as the argument of operand bundle "clang.arc.attachedcall" as those
+ // cases are handled in verifyAttachedCallBundle.
+ Assert((!F->isIntrinsic() ||
+ (CBI && &CBI->getCalledOperandUse() == &I.getOperandUse(i)) ||
+ IsAttachedCallOperand(F, CBI, i)),
"Cannot take the address of an intrinsic!", &I);
Assert(
!F->isIntrinsic() || isa<CallInst>(I) ||
@@ -4420,9 +4502,10 @@ void Verifier::visitInstruction(Instruction &I) {
F->getIntrinsicID() == Intrinsic::experimental_patchpoint_void ||
F->getIntrinsicID() == Intrinsic::experimental_patchpoint_i64 ||
F->getIntrinsicID() == Intrinsic::experimental_gc_statepoint ||
- F->getIntrinsicID() == Intrinsic::wasm_rethrow,
+ F->getIntrinsicID() == Intrinsic::wasm_rethrow ||
+ IsAttachedCallOperand(F, CBI, i),
"Cannot invoke an intrinsic other than donothing, patchpoint, "
- "statepoint, coro_resume or coro_destroy",
+ "statepoint, coro_resume, coro_destroy or clang.arc.attachedcall",
&I);
Assert(F->getParent() == &M, "Referencing function in another module!",
&I, &M, F, F->getParent());
@@ -4471,6 +4554,11 @@ void Verifier::visitInstruction(Instruction &I) {
visitRangeMetadata(I, Range, I.getType());
}
+ if (I.hasMetadata(LLVMContext::MD_invariant_group)) {
+ Assert(isa<LoadInst>(I) || isa<StoreInst>(I),
+ "invariant.group metadata is only for loads and stores", &I);
+ }
+
if (I.getMetadata(LLVMContext::MD_nonnull)) {
Assert(I.getType()->isPointerTy(), "nonnull applies only to pointer types",
&I);
@@ -4489,6 +4577,11 @@ void Verifier::visitInstruction(Instruction &I) {
if (MDNode *TBAA = I.getMetadata(LLVMContext::MD_tbaa))
TBAAVerifyHelper.visitTBAAMetadata(I, TBAA);
+ if (MDNode *MD = I.getMetadata(LLVMContext::MD_noalias))
+ visitAliasScopeListMetadata(MD);
+ if (MDNode *MD = I.getMetadata(LLVMContext::MD_alias_scope))
+ visitAliasScopeListMetadata(MD);
+
if (MDNode *AlignMD = I.getMetadata(LLVMContext::MD_align)) {
Assert(I.getType()->isPointerTy(), "align applies only to pointer types",
&I);
@@ -4599,33 +4692,34 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
for (auto &Elem : Call.bundle_op_infos()) {
Assert(Elem.Tag->getKey() == "ignore" ||
Attribute::isExistingAttribute(Elem.Tag->getKey()),
- "tags must be valid attribute names");
+ "tags must be valid attribute names", Call);
Attribute::AttrKind Kind =
Attribute::getAttrKindFromName(Elem.Tag->getKey());
unsigned ArgCount = Elem.End - Elem.Begin;
if (Kind == Attribute::Alignment) {
Assert(ArgCount <= 3 && ArgCount >= 2,
- "alignment assumptions should have 2 or 3 arguments");
+ "alignment assumptions should have 2 or 3 arguments", Call);
Assert(Call.getOperand(Elem.Begin)->getType()->isPointerTy(),
- "first argument should be a pointer");
+ "first argument should be a pointer", Call);
Assert(Call.getOperand(Elem.Begin + 1)->getType()->isIntegerTy(),
- "second argument should be an integer");
+ "second argument should be an integer", Call);
if (ArgCount == 3)
Assert(Call.getOperand(Elem.Begin + 2)->getType()->isIntegerTy(),
- "third argument should be an integer if present");
+ "third argument should be an integer if present", Call);
return;
}
- Assert(ArgCount <= 2, "to many arguments");
+ Assert(ArgCount <= 2, "too many arguments", Call);
if (Kind == Attribute::None)
break;
if (Attribute::isIntAttrKind(Kind)) {
- Assert(ArgCount == 2, "this attribute should have 2 arguments");
+ Assert(ArgCount == 2, "this attribute should have 2 arguments", Call);
Assert(isa<ConstantInt>(Call.getOperand(Elem.Begin + 1)),
- "the second argument should be a constant integral value");
+ "the second argument should be a constant integral value", Call);
} else if (Attribute::canUseAsParamAttr(Kind)) {
- Assert((ArgCount) == 1, "this attribute should have one argument");
+ Assert((ArgCount) == 1, "this attribute should have one argument",
+ Call);
} else if (Attribute::canUseAsFnAttr(Kind)) {
- Assert((ArgCount) == 0, "this attribute has no argument");
+ Assert((ArgCount) == 0, "this attribute has no argument", Call);
}
}
break;
@@ -4736,7 +4830,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"llvm.call.preallocated.setup");
FoundCall = true;
size_t NumPreallocatedArgs = 0;
- for (unsigned i = 0; i < UseCall->getNumArgOperands(); i++) {
+ for (unsigned i = 0; i < UseCall->arg_size(); i++) {
if (UseCall->paramHasAttr(i, Attribute::Preallocated)) {
++NumPreallocatedArgs;
}
@@ -4834,7 +4928,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
Assert(AI && AI->isStaticAlloca(),
"llvm.localescape only accepts static allocas", Call);
}
- FrameEscapeInfo[BB->getParent()].first = Call.getNumArgOperands();
+ FrameEscapeInfo[BB->getParent()].first = Call.arg_size();
SawFrameEscape = true;
break;
}
@@ -4883,7 +4977,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
break;
}
case Intrinsic::experimental_gc_relocate: {
- Assert(Call.getNumArgOperands() == 3, "wrong number of arguments", Call);
+ Assert(Call.arg_size() == 3, "wrong number of arguments", Call);
Assert(isa<PointerType>(Call.getType()->getScalarType()),
"gc.relocate must return a pointer or a vector of pointers", Call);
@@ -5017,14 +5111,14 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
case Intrinsic::masked_gather: {
const APInt &Alignment =
cast<ConstantInt>(Call.getArgOperand(1))->getValue();
- Assert(Alignment.isNullValue() || Alignment.isPowerOf2(),
+ Assert(Alignment.isZero() || Alignment.isPowerOf2(),
"masked_gather: alignment must be 0 or a power of 2", Call);
break;
}
case Intrinsic::masked_scatter: {
const APInt &Alignment =
cast<ConstantInt>(Call.getArgOperand(2))->getValue();
- Assert(Alignment.isNullValue() || Alignment.isPowerOf2(),
+ Assert(Alignment.isZero() || Alignment.isPowerOf2(),
"masked_scatter: alignment must be 0 or a power of 2", Call);
break;
}
@@ -5340,7 +5434,7 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
// Compare intrinsics carry an extra predicate metadata operand.
if (isa<ConstrainedFPCmpIntrinsic>(FPI))
NumOperands += 1;
- Assert((FPI.getNumArgOperands() == NumOperands),
+ Assert((FPI.arg_size() == NumOperands),
"invalid arguments for constrained FP intrinsic", &FPI);
switch (FPI.getIntrinsicID()) {
@@ -5643,6 +5737,41 @@ void Verifier::verifyDeoptimizeCallingConvs() {
}
}
+void Verifier::verifyAttachedCallBundle(const CallBase &Call,
+ const OperandBundleUse &BU) {
+ FunctionType *FTy = Call.getFunctionType();
+
+ Assert((FTy->getReturnType()->isPointerTy() ||
+ (Call.doesNotReturn() && FTy->getReturnType()->isVoidTy())),
+ "a call with operand bundle \"clang.arc.attachedcall\" must call a "
+ "function returning a pointer or a non-returning function that has a "
+ "void return type",
+ Call);
+
+ Assert((BU.Inputs.empty() ||
+ (BU.Inputs.size() == 1 && isa<Function>(BU.Inputs.front()))),
+ "operand bundle \"clang.arc.attachedcall\" can take either no "
+ "arguments or one function as an argument",
+ Call);
+
+ if (BU.Inputs.empty())
+ return;
+
+ auto *Fn = cast<Function>(BU.Inputs.front());
+ Intrinsic::ID IID = Fn->getIntrinsicID();
+
+ if (IID) {
+ Assert((IID == Intrinsic::objc_retainAutoreleasedReturnValue ||
+ IID == Intrinsic::objc_unsafeClaimAutoreleasedReturnValue),
+ "invalid function argument", Call);
+ } else {
+ StringRef FnName = Fn->getName();
+ Assert((FnName == "objc_retainAutoreleasedReturnValue" ||
+ FnName == "objc_unsafeClaimAutoreleasedReturnValue"),
+ "invalid function argument", Call);
+ }
+}
+
void Verifier::verifySourceDebugInfo(const DICompileUnit &U, const DIFile &F) {
bool HasSource = F.getSource().hasValue();
if (!HasSourceDebugInfo.count(&U))
@@ -5671,6 +5800,7 @@ void Verifier::verifyNoAliasScopeDecl() {
II);
Assert(ScopeListMD->getNumOperands() == 1,
"!id.scope.list must point to a list with a single scope", II);
+ visitAliasScopeListMetadata(ScopeListMD);
}
// Only check the domination rule when requested. Once all passes have been
@@ -6036,11 +6166,7 @@ static bool isNewFormatTBAATypeNode(llvm::MDNode *Type) {
// In the new format type nodes shall have a reference to the parent type as
// its first operand.
- MDNode *Parent = dyn_cast_or_null<MDNode>(Type->getOperand(0));
- if (!Parent)
- return false;
-
- return true;
+ return isa_and_nonnull<MDNode>(Type->getOperand(0));
}
bool TBAAVerifier::visitTBAAMetadata(Instruction &I, const MDNode *MD) {
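The shape enforced by visitAliasScopeMetadata and visitAliasScopeListMetadata (a scope node pointing at a domain node, each headed by a self-reference or a string) matches what MDBuilder already produces. A sketch, assuming an LLVMContext Ctx and the usual llvm namespace:

#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

MDNode *buildScopeList(LLVMContext &Ctx) {
  MDBuilder MDB(Ctx);
  MDNode *Domain = MDB.createAnonymousAliasScopeDomain("example.domain");
  MDNode *Scope = MDB.createAnonymousAliasScope(Domain, "example.scope");
  // A list of scopes is what !alias.scope / !noalias attachments point to.
  Metadata *Ops[] = {Scope};
  return MDNode::get(Ctx, Ops);
}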
diff --git a/llvm/lib/InterfaceStub/ELFObjHandler.cpp b/llvm/lib/InterfaceStub/ELFObjHandler.cpp
index 112c1cea354a..d41c7d3217d7 100644
--- a/llvm/lib/InterfaceStub/ELFObjHandler.cpp
+++ b/llvm/lib/InterfaceStub/ELFObjHandler.cpp
@@ -367,7 +367,7 @@ Error appendToError(Error Err, StringRef After) {
Stream << Err;
Stream << " " << After;
consumeError(std::move(Err));
- return createError(Stream.str().c_str());
+ return createError(Stream.str());
}
/// This function populates a DynamicEntries struct using an ELFT::DynRange.
diff --git a/llvm/lib/InterfaceStub/IFSHandler.cpp b/llvm/lib/InterfaceStub/IFSHandler.cpp
index d3d351fa2ed4..e6bf09232ce2 100644
--- a/llvm/lib/InterfaceStub/IFSHandler.cpp
+++ b/llvm/lib/InterfaceStub/IFSHandler.cpp
@@ -163,7 +163,7 @@ bool usesTriple(StringRef Buf) {
for (line_iterator I(MemoryBufferRef(Buf, "ELFStub")); !I.is_at_eof(); ++I) {
StringRef Line = (*I).trim();
if (Line.startswith("Target:")) {
- if (Line == "Target:" || (Line.find("{") != Line.npos)) {
+ if (Line == "Target:" || Line.contains("{")) {
return false;
}
}
@@ -327,3 +327,13 @@ void ifs::stripIFSTarget(IFSStub &Stub, bool StripTriple, bool StripArch,
Stub.Target.ObjectFormat.reset();
}
}
+
+void ifs::stripIFSUndefinedSymbols(IFSStub &Stub) {
+ for (auto Iter = Stub.Symbols.begin(); Iter != Stub.Symbols.end();) {
+ if (Iter->Undefined) {
+ Iter = Stub.Symbols.erase(Iter);
+ } else {
+ Iter++;
+ }
+ }
+}
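The new stripIFSUndefinedSymbols is a plain erase loop; since the container is a std::vector, the same effect could also be expressed with llvm::erase_if from STLExtras. This is only an equivalent sketch, not part of the patch, and assumes IFSSymbol is the element type of IFSStub::Symbols:

llvm::erase_if(Stub.Symbols,
               [](const IFSSymbol &Sym) { return Sym.Undefined; });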
diff --git a/llvm/lib/InterfaceStub/IFSStub.cpp b/llvm/lib/InterfaceStub/IFSStub.cpp
index bbc91ada1ded..008263f8db9f 100644
--- a/llvm/lib/InterfaceStub/IFSStub.cpp
+++ b/llvm/lib/InterfaceStub/IFSStub.cpp
@@ -29,7 +29,7 @@ IFSStub::IFSStub(IFSStub &&Stub) {
Symbols = std::move(Stub.Symbols);
}
-IFSStubTriple::IFSStubTriple(IFSStubTriple const &Stub) {
+IFSStubTriple::IFSStubTriple(IFSStubTriple const &Stub) : IFSStub() {
IfsVersion = Stub.IfsVersion;
Target = Stub.Target;
SoName = Stub.SoName;
@@ -37,7 +37,7 @@ IFSStubTriple::IFSStubTriple(IFSStubTriple const &Stub) {
Symbols = Stub.Symbols;
}
-IFSStubTriple::IFSStubTriple(IFSStub const &Stub) {
+IFSStubTriple::IFSStubTriple(IFSStub const &Stub) : IFSStub() {
IfsVersion = Stub.IfsVersion;
Target = Stub.Target;
SoName = Stub.SoName;
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 69d500ba9bce..6ce2ed265739 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/LTO/LTO.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
@@ -32,6 +33,7 @@
#include "llvm/LTO/LTOBackend.h"
#include "llvm/LTO/SummaryBasedOptimizations.h"
#include "llvm/Linker/IRMover.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Object/IRObjectFile.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
@@ -41,7 +43,6 @@
#include "llvm/Support/Path.h"
#include "llvm/Support/SHA1.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/TimeProfiler.h"
@@ -536,12 +537,12 @@ void LTO::addModuleToGlobalRes(ArrayRef<InputFile::Symbol> Syms,
auto *ResI = Res.begin();
auto *ResE = Res.end();
(void)ResE;
+ const Triple TT(RegularLTO.CombinedModule->getTargetTriple());
for (const InputFile::Symbol &Sym : Syms) {
assert(ResI != ResE);
SymbolResolution Res = *ResI++;
StringRef Name = Sym.getName();
- Triple TT(RegularLTO.CombinedModule->getTargetTriple());
// Strip the __imp_ prefix from COFF dllimport symbols (similar to the
// way they are handled by lld), otherwise we can end up with two
// global resolutions (one with and one for a copy of the symbol without).
@@ -732,7 +733,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
DenseSet<GlobalObject *> AliasedGlobals;
for (auto &GA : M.aliases())
- if (GlobalObject *GO = GA.getBaseObject())
+ if (GlobalObject *GO = GA.getAliaseeObject())
AliasedGlobals.insert(GO);
// In this function we need IR GlobalValues matching the symbols in Syms
@@ -856,10 +857,14 @@ Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod,
for (GlobalValue *GV : Mod.Keep) {
if (LivenessFromIndex && !ThinLTO.CombinedIndex.isGUIDLive(GV->getGUID())) {
if (Function *F = dyn_cast<Function>(GV)) {
- OptimizationRemarkEmitter ORE(F, nullptr);
- ORE.emit(OptimizationRemark(DEBUG_TYPE, "deadfunction", F)
- << ore::NV("Function", F)
- << " not added to the combined module ");
+ if (DiagnosticOutputFile) {
+ if (Error Err = F->materialize())
+ return Err;
+ OptimizationRemarkEmitter ORE(F, nullptr);
+ ORE.emit(OptimizationRemark(DEBUG_TYPE, "deadfunction", F)
+ << ore::NV("Function", F)
+ << " not added to the combined module ");
+ }
}
continue;
}
@@ -992,7 +997,7 @@ Error LTO::checkPartiallySplit() {
return Error::success();
}
-Error LTO::run(AddStreamFn AddStream, NativeObjectCache Cache) {
+Error LTO::run(AddStreamFn AddStream, FileCache Cache) {
// Compute "dead" symbols, we don't want to import/export these!
DenseSet<GlobalValue::GUID> GUIDPreservedSymbols;
DenseMap<GlobalValue::GUID, PrevailingType> GUIDPrevailingResolutions;
@@ -1048,6 +1053,7 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
Conf.RemarksHotnessThreshold);
if (!DiagFileOrErr)
return DiagFileOrErr.takeError();
+ DiagnosticOutputFile = std::move(*DiagFileOrErr);
// Finalize linking of regular LTO modules containing summaries now that
// we have computed liveness information.
@@ -1136,7 +1142,7 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
return Err;
}
- return finalizeOptimizationRemarks(std::move(*DiagFileOrErr));
+ return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
}
static const char *libcallRoutineNames[] = {
@@ -1177,7 +1183,7 @@ namespace {
class InProcessThinBackend : public ThinBackendProc {
ThreadPool BackendThreadPool;
AddStreamFn AddStream;
- NativeObjectCache Cache;
+ FileCache Cache;
std::set<GlobalValue::GUID> CfiFunctionDefs;
std::set<GlobalValue::GUID> CfiFunctionDecls;
@@ -1189,7 +1195,7 @@ public:
const Config &Conf, ModuleSummaryIndex &CombinedIndex,
ThreadPoolStrategy ThinLTOParallelism,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
- AddStreamFn AddStream, NativeObjectCache Cache)
+ AddStreamFn AddStream, FileCache Cache)
: ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
BackendThreadPool(ThinLTOParallelism), AddStream(std::move(AddStream)),
Cache(std::move(Cache)) {
@@ -1202,8 +1208,8 @@ public:
}
Error runThinLTOBackendThread(
- AddStreamFn AddStream, NativeObjectCache Cache, unsigned Task,
- BitcodeModule BM, ModuleSummaryIndex &CombinedIndex,
+ AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
+ ModuleSummaryIndex &CombinedIndex,
const FunctionImporter::ImportMapTy &ImportList,
const FunctionImporter::ExportSetTy &ExportList,
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
@@ -1233,7 +1239,11 @@ public:
computeLTOCacheKey(Key, Conf, CombinedIndex, ModuleID, ImportList,
ExportList, ResolvedODR, DefinedGlobals, CfiFunctionDefs,
CfiFunctionDecls);
- if (AddStreamFn CacheAddStream = Cache(Task, Key))
+ Expected<AddStreamFn> CacheAddStreamOrErr = Cache(Task, Key);
+ if (Error Err = CacheAddStreamOrErr.takeError())
+ return Err;
+ AddStreamFn &CacheAddStream = *CacheAddStreamOrErr;
+ if (CacheAddStream)
return RunThinBackend(CacheAddStream);
return Error::success();
@@ -1295,7 +1305,7 @@ public:
ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism) {
return [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
- AddStreamFn AddStream, NativeObjectCache Cache) {
+ AddStreamFn AddStream, FileCache Cache) {
return std::make_unique<InProcessThinBackend>(
Conf, CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, AddStream,
Cache);
@@ -1389,15 +1399,20 @@ ThinBackend lto::createWriteIndexesThinBackend(
raw_fd_ostream *LinkedObjectsFile, IndexWriteCallback OnWrite) {
return [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
- AddStreamFn AddStream, NativeObjectCache Cache) {
+ AddStreamFn AddStream, FileCache Cache) {
return std::make_unique<WriteIndexesThinBackend>(
Conf, CombinedIndex, ModuleToDefinedGVSummaries, OldPrefix, NewPrefix,
ShouldEmitImportsFiles, LinkedObjectsFile, OnWrite);
};
}
-Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
+Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
+ timeTraceProfilerBegin("ThinLink", StringRef(""));
+ auto TimeTraceScopeExit = llvm::make_scope_exit([]() {
+ if (llvm::timeTraceProfilerEnabled())
+ llvm::timeTraceProfilerEnd();
+ });
if (ThinLTO.ModuleMap.empty())
return Error::success();
@@ -1510,8 +1525,15 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
thinLTOResolvePrevailingInIndex(Conf, ThinLTO.CombinedIndex, isPrevailing,
recordNewLinkage, GUIDPreservedSymbols);
+ thinLTOPropagateFunctionAttrs(ThinLTO.CombinedIndex, isPrevailing);
+
generateParamAccessSummary(ThinLTO.CombinedIndex);
+ if (llvm::timeTraceProfilerEnabled())
+ llvm::timeTraceProfilerEnd();
+
+ TimeTraceScopeExit.release();
+
std::unique_ptr<ThinBackendProc> BackendProc =
ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
AddStream, Cache);
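With NativeObjectCache/NativeObjectStream renamed to FileCache/CachedFileStream and made Error-aware, a client's AddStreamFn now returns Expected<std::unique_ptr<CachedFileStream>>. A hypothetical client callback under that assumption (the file-naming scheme is illustrative only, and the usual raw_ostream, StringExtras and Error includes plus the llvm namespace are assumed):

auto AddStream =
    [&](size_t Task) -> Expected<std::unique_ptr<CachedFileStream>> {
  std::error_code EC;
  auto OS =
      std::make_unique<raw_fd_ostream>("lto.out." + utostr(Task), EC);
  if (EC)
    return errorCodeToError(EC); // surface I/O failures as llvm::Error
  return std::make_unique<CachedFileStream>(std::move(OS));
};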
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 4e4ba4f3a58e..be06556b0c3b 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -27,6 +27,7 @@
#include "llvm/IR/Verifier.h"
#include "llvm/LTO/LTO.h"
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Object/ModuleSymbolTable.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Passes/PassPlugin.h"
@@ -37,7 +38,6 @@
#include "llvm/Support/Path.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/SmallVectorMemoryBuffer.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -74,7 +74,11 @@ static cl::opt<bool> ThinLTOAssumeMerged(
cl::desc("Assume the input has already undergone ThinLTO function "
"importing and the other pre-optimization pipeline changes."));
-LLVM_ATTRIBUTE_NORETURN static void reportOpenError(StringRef Path, Twine Msg) {
+namespace llvm {
+extern cl::opt<bool> NoPGOWarnMismatch;
+}
+
+[[noreturn]] static void reportOpenError(StringRef Path, Twine Msg) {
errs() << "failed to open " << Path << ": " << Msg << '\n';
errs().flush();
exit(1);
@@ -221,10 +225,13 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
PGOOpt = PGOOptions(Conf.CSIRProfile, "", Conf.ProfileRemapping,
PGOOptions::IRUse, PGOOptions::CSIRUse,
Conf.AddFSDiscriminator);
+ NoPGOWarnMismatch = !Conf.PGOWarnMismatch;
} else if (Conf.AddFSDiscriminator) {
PGOOpt = PGOOptions("", "", "", PGOOptions::NoAction,
PGOOptions::NoCSAction, true);
}
+ if (TM)
+ TM->setPGOOption(PGOOpt);
LoopAnalysisManager LAM;
FunctionAnalysisManager FAM;
@@ -244,18 +251,16 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
TLII->disableAllFunctions();
FAM.registerPass([&] { return TargetLibraryAnalysis(*TLII); });
- AAManager AA;
// Parse a custom AA pipeline if asked to.
if (!Conf.AAPipeline.empty()) {
+ AAManager AA;
if (auto Err = PB.parseAAPipeline(AA, Conf.AAPipeline)) {
- report_fatal_error("unable to parse AA pipeline description '" +
+ report_fatal_error(Twine("unable to parse AA pipeline description '") +
Conf.AAPipeline + "': " + toString(std::move(Err)));
}
- } else {
- AA = PB.buildDefaultAAPipeline();
+ // Register the AA manager first so that our version is the one used.
+ FAM.registerPass([&] { return std::move(AA); });
}
- // Register the AA manager first so that our version is the one used.
- FAM.registerPass([&] { return std::move(AA); });
// Register all the basic analyses with the managers.
PB.registerModuleAnalyses(MAM);
@@ -269,29 +274,29 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
if (!Conf.DisableVerify)
MPM.addPass(VerifierPass());
- PassBuilder::OptimizationLevel OL;
+ OptimizationLevel OL;
switch (OptLevel) {
default:
llvm_unreachable("Invalid optimization level");
case 0:
- OL = PassBuilder::OptimizationLevel::O0;
+ OL = OptimizationLevel::O0;
break;
case 1:
- OL = PassBuilder::OptimizationLevel::O1;
+ OL = OptimizationLevel::O1;
break;
case 2:
- OL = PassBuilder::OptimizationLevel::O2;
+ OL = OptimizationLevel::O2;
break;
case 3:
- OL = PassBuilder::OptimizationLevel::O3;
+ OL = OptimizationLevel::O3;
break;
}
// Parse a custom pipeline if asked to.
if (!Conf.OptPipeline.empty()) {
if (auto Err = PB.parsePassPipeline(MPM, Conf.OptPipeline)) {
- report_fatal_error("unable to parse pass pipeline description '" +
+ report_fatal_error(Twine("unable to parse pass pipeline description '") +
Conf.OptPipeline + "': " + toString(std::move(Err)));
}
} else if (IsThinLTO) {
@@ -387,8 +392,8 @@ static void codegen(const Config &Conf, TargetMachine *TM,
if (!Conf.DwoDir.empty()) {
std::error_code EC;
if (auto EC = llvm::sys::fs::create_directories(Conf.DwoDir))
- report_fatal_error("Failed to create directory " + Conf.DwoDir + ": " +
- EC.message());
+ report_fatal_error(Twine("Failed to create directory ") + Conf.DwoDir +
+ ": " + EC.message());
DwoFile = Conf.DwoDir;
sys::path::append(DwoFile, std::to_string(Task) + ".dwo");
@@ -400,10 +405,14 @@ static void codegen(const Config &Conf, TargetMachine *TM,
std::error_code EC;
DwoOut = std::make_unique<ToolOutputFile>(DwoFile, EC, sys::fs::OF_None);
if (EC)
- report_fatal_error("Failed to open " + DwoFile + ": " + EC.message());
+ report_fatal_error(Twine("Failed to open ") + DwoFile + ": " +
+ EC.message());
}
- auto Stream = AddStream(Task);
+ Expected<std::unique_ptr<CachedFileStream>> StreamOrErr = AddStream(Task);
+ if (Error Err = StreamOrErr.takeError())
+ report_fatal_error(std::move(Err));
+ std::unique_ptr<CachedFileStream> &Stream = *StreamOrErr;
legacy::PassManager CodeGenPasses;
CodeGenPasses.add(
createImmutableModuleSummaryIndexWrapperPass(&CombinedIndex));
@@ -599,7 +608,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
dropDeadSymbols(Mod, DefinedGlobals, CombinedIndex);
- thinLTOResolvePrevailingInModule(Mod, DefinedGlobals);
+ thinLTOFinalizeInModule(Mod, DefinedGlobals, /*PropagateAttrs=*/true);
if (Conf.PostPromoteModuleHook && !Conf.PostPromoteModuleHook(Task, Mod))
return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp
index 7bffcbf01b03..088e45c9e8dc 100644
--- a/llvm/lib/LTO/LTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/LTOCodeGenerator.cpp
@@ -44,13 +44,13 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Remarks/HotnessThresholdParser.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Signals.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Support/YAMLTraits.h"
@@ -245,8 +245,7 @@ bool LTOCodeGenerator::compileOptimizedToFile(const char **Name) {
// make unique temp output file to put generated code
SmallString<128> Filename;
- auto AddStream =
- [&](size_t Task) -> std::unique_ptr<lto::NativeObjectStream> {
+ auto AddStream = [&](size_t Task) -> std::unique_ptr<CachedFileStream> {
StringRef Extension(Config.CGFileType == CGFT_AssemblyFile ? "s" : "o");
int FD;
@@ -255,7 +254,7 @@ bool LTOCodeGenerator::compileOptimizedToFile(const char **Name) {
if (EC)
emitError(EC.message());
- return std::make_unique<lto::NativeObjectStream>(
+ return std::make_unique<CachedFileStream>(
std::make_unique<llvm::raw_fd_ostream>(FD, true));
};
@@ -557,7 +556,7 @@ bool LTOCodeGenerator::optimize() {
return true;
}
-bool LTOCodeGenerator::compileOptimized(lto::AddStreamFn AddStream,
+bool LTOCodeGenerator::compileOptimized(AddStreamFn AddStream,
unsigned ParallelismLevel) {
if (!this->determineTarget())
return false;
diff --git a/llvm/lib/LTO/LTOModule.cpp b/llvm/lib/LTO/LTOModule.cpp
index 155790041a75..4cc1b307c553 100644
--- a/llvm/lib/LTO/LTOModule.cpp
+++ b/llvm/lib/LTO/LTOModule.cpp
@@ -27,6 +27,7 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Object/IRObjectFile.h"
#include "llvm/Object/MachO.h"
#include "llvm/Object/ObjectFile.h"
@@ -35,7 +36,6 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
@@ -688,3 +688,16 @@ Expected<uint32_t> LTOModule::getMachOCPUType() const {
Expected<uint32_t> LTOModule::getMachOCPUSubType() const {
return MachO::getCPUSubType(Triple(Mod->getTargetTriple()));
}
+
+bool LTOModule::hasCtorDtor() const {
+ for (auto Sym : SymTab.symbols()) {
+ if (auto *GV = Sym.dyn_cast<GlobalValue *>()) {
+ StringRef Name = GV->getName();
+ if (Name.consume_front("llvm.global_")) {
+ if (Name.equals("ctors") || Name.equals("dtors"))
+ return true;
+ }
+ }
+ }
+ return false;
+}
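hasCtorDtor relies on StringRef::consume_front, which strips a prefix in place and reports whether it was present. A tiny sketch of that idiom:

static bool isGlobalCtorOrDtorName(StringRef Name) {
  if (Name.consume_front("llvm.global_"))       // strips the prefix in place
    return Name == "ctors" || Name == "dtors";  // e.g. "llvm.global_ctors"
  return false;
}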
diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
index 8f0fa933a6a1..9474d8c9dafb 100644
--- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -14,6 +14,7 @@
#include "llvm/LTO/legacy/ThinLTOCodeGenerator.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -37,6 +38,7 @@
#include "llvm/LTO/LTO.h"
#include "llvm/LTO/SummaryBasedOptimizations.h"
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Object/IRObjectFile.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Passes/StandardInstrumentations.h"
@@ -48,12 +50,12 @@
#include "llvm/Support/Path.h"
#include "llvm/Support/SHA1.h"
#include "llvm/Support/SmallVectorMemoryBuffer.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
#include "llvm/Transforms/IPO/FunctionImport.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
@@ -289,11 +291,6 @@ static void optimizeModuleNewPM(Module &TheModule, TargetMachine &TM,
TLII->disableAllFunctions();
FAM.registerPass([&] { return TargetLibraryAnalysis(*TLII); });
- AAManager AA = PB.buildDefaultAAPipeline();
-
- // Register the AA manager first so that our version is the one used.
- FAM.registerPass([&] { return std::move(AA); });
-
// Register all the basic analyses with the managers.
PB.registerModuleAnalyses(MAM);
PB.registerCGSCCAnalyses(CGAM);
@@ -303,22 +300,22 @@ static void optimizeModuleNewPM(Module &TheModule, TargetMachine &TM,
ModulePassManager MPM;
- PassBuilder::OptimizationLevel OL;
+ OptimizationLevel OL;
switch (OptLevel) {
default:
llvm_unreachable("Invalid optimization level");
case 0:
- OL = PassBuilder::OptimizationLevel::O0;
+ OL = OptimizationLevel::O0;
break;
case 1:
- OL = PassBuilder::OptimizationLevel::O1;
+ OL = OptimizationLevel::O1;
break;
case 2:
- OL = PassBuilder::OptimizationLevel::O2;
+ OL = OptimizationLevel::O2;
break;
case 3:
- OL = PassBuilder::OptimizationLevel::O3;
+ OL = OptimizationLevel::O3;
break;
}
@@ -503,7 +500,7 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
promoteModule(TheModule, Index, ClearDSOLocalOnDeclarations);
// Apply summary-based prevailing-symbol resolution decisions.
- thinLTOResolvePrevailingInModule(TheModule, DefinedGlobals);
+ thinLTOFinalizeInModule(TheModule, DefinedGlobals, /*PropagateAttrs=*/true);
// Save temps: after promotion.
saveTempBitcode(TheModule, SaveTempsDir, count, ".1.promoted.bc");
@@ -607,7 +604,7 @@ void ThinLTOCodeGenerator::addModule(StringRef Identifier, StringRef Data) {
auto InputOrError = lto::InputFile::create(Buffer);
if (!InputOrError)
- report_fatal_error("ThinLTO cannot create input file: " +
+ report_fatal_error(Twine("ThinLTO cannot create input file: ") +
toString(InputOrError.takeError()));
auto TripleStr = (*InputOrError)->getTargetTriple();
@@ -642,7 +639,7 @@ std::unique_ptr<TargetMachine> TargetMachineBuilder::create() const {
const Target *TheTarget =
TargetRegistry::lookupTarget(TheTriple.str(), ErrMsg);
if (!TheTarget) {
- report_fatal_error("Can't load target for this Triple: " + ErrMsg);
+ report_fatal_error(Twine("Can't load target for this Triple: ") + ErrMsg);
}
// Use MAttr as the default set of features.
@@ -762,8 +759,9 @@ void ThinLTOCodeGenerator::promote(Module &TheModule, ModuleSummaryIndex &Index,
resolvePrevailingInIndex(Index, ResolvedODR, GUIDPreservedSymbols,
PrevailingCopy);
- thinLTOResolvePrevailingInModule(
- TheModule, ModuleToDefinedGVSummaries[ModuleIdentifier]);
+ thinLTOFinalizeInModule(TheModule,
+ ModuleToDefinedGVSummaries[ModuleIdentifier],
+ /*PropagateAttrs=*/false);
// Promote the exported values in the index, so that they are promoted
// in the module.
@@ -937,8 +935,9 @@ void ThinLTOCodeGenerator::internalize(Module &TheModule,
promoteModule(TheModule, Index, /*ClearDSOLocalOnDeclarations=*/false);
// Internalization
- thinLTOResolvePrevailingInModule(
- TheModule, ModuleToDefinedGVSummaries[ModuleIdentifier]);
+ thinLTOFinalizeInModule(TheModule,
+ ModuleToDefinedGVSummaries[ModuleIdentifier],
+ /*PropagateAttrs=*/false);
thinLTOInternalizeModule(TheModule,
ModuleToDefinedGVSummaries[ModuleIdentifier]);
@@ -989,13 +988,18 @@ ThinLTOCodeGenerator::writeGeneratedObject(int count, StringRef CacheEntryPath,
std::error_code Err;
raw_fd_ostream OS(OutputPath, Err, sys::fs::OF_None);
if (Err)
- report_fatal_error("Can't open output '" + OutputPath + "'\n");
+ report_fatal_error(Twine("Can't open output '") + OutputPath + "'\n");
OS << OutputBuffer.getBuffer();
return std::string(OutputPath.str());
}
// Main entry point for the ThinLTO processing
void ThinLTOCodeGenerator::run() {
+ timeTraceProfilerBegin("ThinLink", StringRef(""));
+ auto TimeTraceScopeExit = llvm::make_scope_exit([]() {
+ if (llvm::timeTraceProfilerEnabled())
+ llvm::timeTraceProfilerEnd();
+ });
// Prepare the resulting object vector
assert(ProducedBinaries.empty() && "The generator should not be reused");
if (SavedObjectsDirectoryPath.empty())
@@ -1005,7 +1009,7 @@ void ThinLTOCodeGenerator::run() {
bool IsDir;
sys::fs::is_directory(SavedObjectsDirectoryPath, IsDir);
if (!IsDir)
- report_fatal_error("Unexistent dir: '" + SavedObjectsDirectoryPath + "'");
+ report_fatal_error(Twine("Unexistent dir: '") + SavedObjectsDirectoryPath + "'");
ProducedBinaryFiles.resize(Modules.size());
}
@@ -1124,6 +1128,8 @@ void ThinLTOCodeGenerator::run() {
*Index, IsExported(ExportLists, GUIDPreservedSymbols),
IsPrevailing(PrevailingCopy));
+ thinLTOPropagateFunctionAttrs(*Index, IsPrevailing(PrevailingCopy));
+
// Make sure that every module has an entry in the ExportLists, ImportList,
// GVSummary and ResolvedODR maps to enable threaded access to these maps
// below.
@@ -1141,6 +1147,11 @@ void ThinLTOCodeGenerator::run() {
ModulesVec.push_back(&Mod->getSingleBitcodeModule());
std::vector<int> ModulesOrdering = lto::generateModulesOrdering(ModulesVec);
+ if (llvm::timeTraceProfilerEnabled())
+ llvm::timeTraceProfilerEnd();
+
+ TimeTraceScopeExit.release();
+
// Parallel optimizer + codegen
{
ThreadPool Pool(heavyweight_hardware_concurrency(ThreadCount));
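
The run() changes above pair an unconditional timeTraceProfilerBegin("ThinLink", ...) with a make_scope_exit guard, then end the section explicitly and release the guard right before the parallel codegen phase. A small sketch of that pattern in isolation follows; runThinLinkPhase is a hypothetical stand-in for the thin-link portion of run().

  #include "llvm/ADT/ScopeExit.h"
  #include "llvm/Support/TimeProfiler.h"

  void runThinLinkPhase() {
    llvm::timeTraceProfilerBegin("ThinLink", llvm::StringRef(""));
    // The guard guarantees the section is closed on every early return.
    auto EndGuard = llvm::make_scope_exit([] {
      if (llvm::timeTraceProfilerEnabled())
        llvm::timeTraceProfilerEnd();
    });

    // ... thin-link work: index computation, promotion, internalization ...

    // Close the section before handing off to the thread pool, then disarm
    // the guard so the section is not ended twice.
    if (llvm::timeTraceProfilerEnabled())
      llvm::timeTraceProfilerEnd();
    EndGuard.release();
  }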
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index 7bc6f0585921..bad483be197d 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -20,6 +20,7 @@
#include "llvm/IR/TypeFinder.h"
#include "llvm/Object/ModuleSymbolTable.h"
#include "llvm/Support/Error.h"
+#include "llvm/Support/Path.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <utility>
using namespace llvm;
@@ -491,8 +492,8 @@ class IRLinker {
void linkGlobalVariable(GlobalVariable &Dst, GlobalVariable &Src);
Error linkFunctionBody(Function &Dst, Function &Src);
- void linkIndirectSymbolBody(GlobalIndirectSymbol &Dst,
- GlobalIndirectSymbol &Src);
+ void linkAliasAliasee(GlobalAlias &Dst, GlobalAlias &Src);
+ void linkIFuncResolver(GlobalIFunc &Dst, GlobalIFunc &Src);
Error linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src);
/// Replace all types in the source AttributeList with the
@@ -503,7 +504,7 @@ class IRLinker {
/// into the destination module.
GlobalVariable *copyGlobalVariableProto(const GlobalVariable *SGVar);
Function *copyFunctionProto(const Function *SF);
- GlobalValue *copyGlobalIndirectSymbolProto(const GlobalIndirectSymbol *SGIS);
+ GlobalValue *copyIndirectSymbolProto(const GlobalValue *SGV);
/// Perform "replace all uses with" operations. These work items need to be
/// performed as part of materialization, but we postpone them to happen after
@@ -605,10 +606,14 @@ Value *IRLinker::materialize(Value *V, bool ForIndirectSymbol) {
} else if (auto *V = dyn_cast<GlobalVariable>(New)) {
if (V->hasInitializer() || V->hasAppendingLinkage())
return New;
- } else {
- auto *IS = cast<GlobalIndirectSymbol>(New);
- if (IS->getIndirectSymbol())
+ } else if (auto *GA = dyn_cast<GlobalAlias>(New)) {
+ if (GA->getAliasee())
+ return New;
+ } else if (auto *GI = dyn_cast<GlobalIFunc>(New)) {
+ if (GI->getResolver())
return New;
+ } else {
+ llvm_unreachable("Invalid GlobalValue type");
}
// If the global is being linked for an indirect symbol, it may have already
@@ -648,12 +653,14 @@ GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) {
AttributeList IRLinker::mapAttributeTypes(LLVMContext &C, AttributeList Attrs) {
for (unsigned i = 0; i < Attrs.getNumAttrSets(); ++i) {
- for (Attribute::AttrKind TypedAttr :
- {Attribute::ByVal, Attribute::StructRet, Attribute::ByRef,
- Attribute::InAlloca}) {
- if (Attrs.hasAttribute(i, TypedAttr)) {
- if (Type *Ty = Attrs.getAttribute(i, TypedAttr).getValueAsType()) {
- Attrs = Attrs.replaceAttributeType(C, i, TypedAttr, TypeMap.get(Ty));
+ for (int AttrIdx = Attribute::FirstTypeAttr;
+ AttrIdx <= Attribute::LastTypeAttr; AttrIdx++) {
+ Attribute::AttrKind TypedAttr = (Attribute::AttrKind)AttrIdx;
+ if (Attrs.hasAttributeAtIndex(i, TypedAttr)) {
+ if (Type *Ty =
+ Attrs.getAttributeAtIndex(i, TypedAttr).getValueAsType()) {
+ Attrs = Attrs.replaceAttributeTypeAtIndex(C, i, TypedAttr,
+ TypeMap.get(Ty));
break;
}
}
@@ -677,22 +684,28 @@ Function *IRLinker::copyFunctionProto(const Function *SF) {
/// Set up prototypes for any indirect symbols that come over from the source
/// module.
-GlobalValue *
-IRLinker::copyGlobalIndirectSymbolProto(const GlobalIndirectSymbol *SGIS) {
+GlobalValue *IRLinker::copyIndirectSymbolProto(const GlobalValue *SGV) {
// If there is no linkage to be performed or we're linking from the source,
// bring over SGA.
- auto *Ty = TypeMap.get(SGIS->getValueType());
- GlobalIndirectSymbol *GIS;
- if (isa<GlobalAlias>(SGIS))
- GIS = GlobalAlias::create(Ty, SGIS->getAddressSpace(),
- GlobalValue::ExternalLinkage, SGIS->getName(),
- &DstM);
- else
- GIS = GlobalIFunc::create(Ty, SGIS->getAddressSpace(),
- GlobalValue::ExternalLinkage, SGIS->getName(),
- nullptr, &DstM);
- GIS->copyAttributesFrom(SGIS);
- return GIS;
+ auto *Ty = TypeMap.get(SGV->getValueType());
+
+ if (auto *GA = dyn_cast<GlobalAlias>(SGV)) {
+ auto *DGA = GlobalAlias::create(Ty, SGV->getAddressSpace(),
+ GlobalValue::ExternalLinkage,
+ SGV->getName(), &DstM);
+ DGA->copyAttributesFrom(GA);
+ return DGA;
+ }
+
+ if (auto *GI = dyn_cast<GlobalIFunc>(SGV)) {
+ auto *DGI = GlobalIFunc::create(Ty, SGV->getAddressSpace(),
+ GlobalValue::ExternalLinkage,
+ SGV->getName(), nullptr, &DstM);
+ DGI->copyAttributesFrom(GI);
+ return DGI;
+ }
+
+ llvm_unreachable("Invalid source global value type");
}
GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV,
@@ -704,7 +717,7 @@ GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV,
NewGV = copyFunctionProto(SF);
} else {
if (ForDefinition)
- NewGV = copyGlobalIndirectSymbolProto(cast<GlobalIndirectSymbol>(SGV));
+ NewGV = copyIndirectSymbolProto(SGV);
else if (SGV->getValueType()->isFunctionTy())
NewGV =
Function::Create(cast<FunctionType>(TypeMap.get(SGV->getValueType())),
@@ -1108,10 +1121,12 @@ Error IRLinker::linkFunctionBody(Function &Dst, Function &Src) {
return Error::success();
}
-void IRLinker::linkIndirectSymbolBody(GlobalIndirectSymbol &Dst,
- GlobalIndirectSymbol &Src) {
- Mapper.scheduleMapGlobalIndirectSymbol(Dst, *Src.getIndirectSymbol(),
- IndirectSymbolMCID);
+void IRLinker::linkAliasAliasee(GlobalAlias &Dst, GlobalAlias &Src) {
+ Mapper.scheduleMapGlobalAlias(Dst, *Src.getAliasee(), IndirectSymbolMCID);
+}
+
+void IRLinker::linkIFuncResolver(GlobalIFunc &Dst, GlobalIFunc &Src) {
+ Mapper.scheduleMapGlobalIFunc(Dst, *Src.getResolver(), IndirectSymbolMCID);
}
Error IRLinker::linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src) {
@@ -1121,7 +1136,11 @@ Error IRLinker::linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src) {
linkGlobalVariable(cast<GlobalVariable>(Dst), *GVar);
return Error::success();
}
- linkIndirectSymbolBody(cast<GlobalIndirectSymbol>(Dst), cast<GlobalIndirectSymbol>(Src));
+ if (auto *GA = dyn_cast<GlobalAlias>(&Src)) {
+ linkAliasAliasee(cast<GlobalAlias>(Dst), *GA);
+ return Error::success();
+ }
+ linkIFuncResolver(cast<GlobalIFunc>(Dst), cast<GlobalIFunc>(Src));
return Error::success();
}
@@ -1443,7 +1462,39 @@ Error IRLinker::run() {
if (DstM.getDataLayout().isDefault())
DstM.setDataLayout(SrcM->getDataLayout());
- if (SrcM->getDataLayout() != DstM.getDataLayout()) {
+ // Copy the target triple from the source to dest if the dest's is empty.
+ if (DstM.getTargetTriple().empty() && !SrcM->getTargetTriple().empty())
+ DstM.setTargetTriple(SrcM->getTargetTriple());
+
+ Triple SrcTriple(SrcM->getTargetTriple()), DstTriple(DstM.getTargetTriple());
+
+ // During CUDA compilation we have to link with the bitcode supplied with
+ // CUDA. libdevice bitcode either has no data layout set (pre-CUDA-11), or has
+ // the layout that is different from the one used by LLVM/clang (it does not
+ // include i128). Issuing a warning is not very helpful as there's not much
+ // the user can do about it.
+ bool EnableDLWarning = true;
+ bool EnableTripleWarning = true;
+ if (SrcTriple.isNVPTX() && DstTriple.isNVPTX()) {
+ std::string ModuleId = SrcM->getModuleIdentifier();
+ StringRef FileName = llvm::sys::path::filename(ModuleId);
+ bool SrcIsLibDevice =
+ FileName.startswith("libdevice") && FileName.endswith(".10.bc");
+ bool SrcHasLibDeviceDL =
+ (SrcM->getDataLayoutStr().empty() ||
+ SrcM->getDataLayoutStr() == "e-i64:64-v16:16-v32:32-n16:32:64");
+ // libdevice bitcode uses nvptx64-nvidia-gpulibs or just
+ // 'nvptx-unknown-unknown' triple (before CUDA-10.x) and is compatible with
+ // all NVPTX variants.
+ bool SrcHasLibDeviceTriple = (SrcTriple.getVendor() == Triple::NVIDIA &&
+ SrcTriple.getOSName() == "gpulibs") ||
+ (SrcTriple.getVendorName() == "unknown" &&
+ SrcTriple.getOSName() == "unknown");
+ EnableTripleWarning = !(SrcIsLibDevice && SrcHasLibDeviceTriple);
+ EnableDLWarning = !(SrcIsLibDevice && SrcHasLibDeviceDL);
+ }
+
+ if (EnableDLWarning && (SrcM->getDataLayout() != DstM.getDataLayout())) {
emitWarning("Linking two modules of different data layouts: '" +
SrcM->getModuleIdentifier() + "' is '" +
SrcM->getDataLayoutStr() + "' whereas '" +
@@ -1451,13 +1502,7 @@ Error IRLinker::run() {
DstM.getDataLayoutStr() + "'\n");
}
- // Copy the target triple from the source to dest if the dest's is empty.
- if (DstM.getTargetTriple().empty() && !SrcM->getTargetTriple().empty())
- DstM.setTargetTriple(SrcM->getTargetTriple());
-
- Triple SrcTriple(SrcM->getTargetTriple()), DstTriple(DstM.getTargetTriple());
-
- if (!SrcM->getTargetTriple().empty()&&
+ if (EnableTripleWarning && !SrcM->getTargetTriple().empty() &&
!SrcTriple.isCompatibleWith(DstTriple))
emitWarning("Linking two modules of different target triples: '" +
SrcM->getModuleIdentifier() + "' is '" +
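
The NVPTX-specific logic above suppresses the data-layout and triple warnings only when the source module is CUDA's libdevice. A minimal sketch of the filename test on its own (the path in main() is a made-up example):

  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/Path.h"

  // A module is treated as libdevice when its identifier's filename looks like
  // "libdevice*.10.bc", exactly as in the hunk above.
  static bool looksLikeLibDevice(llvm::StringRef ModuleId) {
    llvm::StringRef FileName = llvm::sys::path::filename(ModuleId);
    return FileName.startswith("libdevice") && FileName.endswith(".10.bc");
  }

  int main() {
    return looksLikeLibDevice("/usr/local/cuda/nvvm/libdevice/libdevice.10.bc") ? 0 : 1;
  }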
diff --git a/llvm/lib/Linker/LinkModules.cpp b/llvm/lib/Linker/LinkModules.cpp
index 97d6f8cd8075..f9f51bf17d95 100644
--- a/llvm/lib/Linker/LinkModules.cpp
+++ b/llvm/lib/Linker/LinkModules.cpp
@@ -24,6 +24,8 @@ using namespace llvm;
namespace {
+enum class LinkFrom { Dst, Src, Both };
+
/// This is an implementation class for the LinkModules function, which is the
/// entrypoint for this file.
class ModuleLinker {
@@ -67,11 +69,11 @@ class ModuleLinker {
Comdat::SelectionKind Src,
Comdat::SelectionKind Dst,
Comdat::SelectionKind &Result,
- bool &LinkFromSrc);
- std::map<const Comdat *, std::pair<Comdat::SelectionKind, bool>>
+ LinkFrom &From);
+ DenseMap<const Comdat *, std::pair<Comdat::SelectionKind, LinkFrom>>
ComdatsChosen;
bool getComdatResult(const Comdat *SrcC, Comdat::SelectionKind &SK,
- bool &LinkFromSrc);
+ LinkFrom &From);
// Keep track of the lazy linked global members of each comdat in source.
DenseMap<const Comdat *, std::vector<GlobalValue *>> LazyComdatMembers;
@@ -103,7 +105,7 @@ class ModuleLinker {
void dropReplacedComdat(GlobalValue &GV,
const DenseSet<const Comdat *> &ReplacedDstComdats);
- bool linkIfNeeded(GlobalValue &GV);
+ bool linkIfNeeded(GlobalValue &GV, SmallVectorImpl<GlobalValue *> &GVToClone);
public:
ModuleLinker(IRMover &Mover, std::unique_ptr<Module> SrcM, unsigned Flags,
@@ -114,7 +116,7 @@ public:
bool run();
};
-}
+} // namespace
static GlobalValue::VisibilityTypes
getMinVisibility(GlobalValue::VisibilityTypes A,
@@ -131,7 +133,7 @@ bool ModuleLinker::getComdatLeader(Module &M, StringRef ComdatName,
const GlobalVariable *&GVar) {
const GlobalValue *GVal = M.getNamedValue(ComdatName);
if (const auto *GA = dyn_cast_or_null<GlobalAlias>(GVal)) {
- GVal = GA->getBaseObject();
+ GVal = GA->getAliaseeObject();
if (!GVal)
// We cannot resolve the size of the aliasee yet.
return emitError("Linking COMDATs named '" + ComdatName +
@@ -151,7 +153,7 @@ bool ModuleLinker::computeResultingSelectionKind(StringRef ComdatName,
Comdat::SelectionKind Src,
Comdat::SelectionKind Dst,
Comdat::SelectionKind &Result,
- bool &LinkFromSrc) {
+ LinkFrom &From) {
Module &DstM = Mover.getModule();
// The ability to mix Comdat::SelectionKind::Any with
// Comdat::SelectionKind::Largest is a behavior that comes from COFF.
@@ -175,11 +177,11 @@ bool ModuleLinker::computeResultingSelectionKind(StringRef ComdatName,
switch (Result) {
case Comdat::SelectionKind::Any:
// Go with Dst.
- LinkFromSrc = false;
+ From = LinkFrom::Dst;
break;
case Comdat::SelectionKind::NoDeduplicate:
- return emitError("Linking COMDATs named '" + ComdatName +
- "': nodeduplicate has been violated!");
+ From = LinkFrom::Both;
+ break;
case Comdat::SelectionKind::ExactMatch:
case Comdat::SelectionKind::Largest:
case Comdat::SelectionKind::SameSize: {
@@ -197,14 +199,14 @@ bool ModuleLinker::computeResultingSelectionKind(StringRef ComdatName,
if (SrcGV->getInitializer() != DstGV->getInitializer())
return emitError("Linking COMDATs named '" + ComdatName +
"': ExactMatch violated!");
- LinkFromSrc = false;
+ From = LinkFrom::Dst;
} else if (Result == Comdat::SelectionKind::Largest) {
- LinkFromSrc = SrcSize > DstSize;
+ From = SrcSize > DstSize ? LinkFrom::Src : LinkFrom::Dst;
} else if (Result == Comdat::SelectionKind::SameSize) {
if (SrcSize != DstSize)
return emitError("Linking COMDATs named '" + ComdatName +
"': SameSize violated!");
- LinkFromSrc = false;
+ From = LinkFrom::Dst;
} else {
llvm_unreachable("unknown selection kind");
}
@@ -217,7 +219,7 @@ bool ModuleLinker::computeResultingSelectionKind(StringRef ComdatName,
bool ModuleLinker::getComdatResult(const Comdat *SrcC,
Comdat::SelectionKind &Result,
- bool &LinkFromSrc) {
+ LinkFrom &From) {
Module &DstM = Mover.getModule();
Comdat::SelectionKind SSK = SrcC->getSelectionKind();
StringRef ComdatName = SrcC->getName();
@@ -226,15 +228,14 @@ bool ModuleLinker::getComdatResult(const Comdat *SrcC,
if (DstCI == ComdatSymTab.end()) {
// Use the comdat if it is only available in one of the modules.
- LinkFromSrc = true;
+ From = LinkFrom::Src;
Result = SSK;
return false;
}
const Comdat *DstC = &DstCI->second;
Comdat::SelectionKind DSK = DstC->getSelectionKind();
- return computeResultingSelectionKind(ComdatName, SSK, DSK, Result,
- LinkFromSrc);
+ return computeResultingSelectionKind(ComdatName, SSK, DSK, Result, From);
}
bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc,
@@ -325,7 +326,8 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc,
"': symbol multiply defined!");
}
-bool ModuleLinker::linkIfNeeded(GlobalValue &GV) {
+bool ModuleLinker::linkIfNeeded(GlobalValue &GV,
+ SmallVectorImpl<GlobalValue *> &GVToClone) {
GlobalValue *DGV = getLinkedToGlobal(&GV);
if (shouldLinkOnlyNeeded()) {
@@ -377,17 +379,18 @@ bool ModuleLinker::linkIfNeeded(GlobalValue &GV) {
if (GV.isDeclaration())
return false;
+ LinkFrom ComdatFrom = LinkFrom::Dst;
if (const Comdat *SC = GV.getComdat()) {
- bool LinkFromSrc;
- Comdat::SelectionKind SK;
- std::tie(SK, LinkFromSrc) = ComdatsChosen[SC];
- if (!LinkFromSrc)
+ std::tie(std::ignore, ComdatFrom) = ComdatsChosen[SC];
+ if (ComdatFrom == LinkFrom::Dst)
return false;
}
bool LinkFromSrc = true;
if (DGV && shouldLinkFromSource(LinkFromSrc, *DGV, GV))
return true;
+ if (DGV && ComdatFrom == LinkFrom::Both)
+ GVToClone.push_back(LinkFromSrc ? DGV : &GV);
if (LinkFromSrc)
ValuesToLink.insert(&GV);
return false;
@@ -462,12 +465,12 @@ bool ModuleLinker::run() {
if (ComdatsChosen.count(&C))
continue;
Comdat::SelectionKind SK;
- bool LinkFromSrc;
- if (getComdatResult(&C, SK, LinkFromSrc))
+ LinkFrom From;
+ if (getComdatResult(&C, SK, From))
return true;
- ComdatsChosen[&C] = std::make_pair(SK, LinkFromSrc);
+ ComdatsChosen[&C] = std::make_pair(SK, From);
- if (!LinkFromSrc)
+ if (From != LinkFrom::Src)
continue;
Module::ComdatSymTabType &ComdatSymTab = DstM.getComdatSymbolTable();
@@ -482,20 +485,14 @@ bool ModuleLinker::run() {
// Alias have to go first, since we are not able to find their comdats
// otherwise.
- for (auto I = DstM.alias_begin(), E = DstM.alias_end(); I != E;) {
- GlobalAlias &GV = *I++;
+ for (GlobalAlias &GV : llvm::make_early_inc_range(DstM.aliases()))
dropReplacedComdat(GV, ReplacedDstComdats);
- }
- for (auto I = DstM.global_begin(), E = DstM.global_end(); I != E;) {
- GlobalVariable &GV = *I++;
+ for (GlobalVariable &GV : llvm::make_early_inc_range(DstM.globals()))
dropReplacedComdat(GV, ReplacedDstComdats);
- }
- for (auto I = DstM.begin(), E = DstM.end(); I != E;) {
- Function &GV = *I++;
+ for (Function &GV : llvm::make_early_inc_range(DstM))
dropReplacedComdat(GV, ReplacedDstComdats);
- }
for (GlobalVariable &GV : SrcM->globals())
if (GV.hasLinkOnceLinkage())
@@ -514,18 +511,45 @@ bool ModuleLinker::run() {
// Insert all of the globals in src into the DstM module... without linking
// initializers (which could refer to functions not yet mapped over).
+ SmallVector<GlobalValue *, 0> GVToClone;
for (GlobalVariable &GV : SrcM->globals())
- if (linkIfNeeded(GV))
+ if (linkIfNeeded(GV, GVToClone))
return true;
for (Function &SF : *SrcM)
- if (linkIfNeeded(SF))
+ if (linkIfNeeded(SF, GVToClone))
return true;
for (GlobalAlias &GA : SrcM->aliases())
- if (linkIfNeeded(GA))
+ if (linkIfNeeded(GA, GVToClone))
+ return true;
+
+ for (GlobalIFunc &GI : SrcM->ifuncs())
+ if (linkIfNeeded(GI, GVToClone))
return true;
+ // For a variable in a comdat nodeduplicate, its initializer should be
+ // preserved (its content may be implicitly used by other members) even if
+ // symbol resolution does not pick it. Clone it into an unnamed private
+ // variable.
+ for (GlobalValue *GV : GVToClone) {
+ if (auto *Var = dyn_cast<GlobalVariable>(GV)) {
+ auto *NewVar = new GlobalVariable(*Var->getParent(), Var->getValueType(),
+ Var->isConstant(), Var->getLinkage(),
+ Var->getInitializer());
+ NewVar->copyAttributesFrom(Var);
+ NewVar->setVisibility(GlobalValue::DefaultVisibility);
+ NewVar->setLinkage(GlobalValue::PrivateLinkage);
+ NewVar->setDSOLocal(true);
+ NewVar->setComdat(Var->getComdat());
+ if (Var->getParent() != &Mover.getModule())
+ ValuesToLink.insert(NewVar);
+ } else {
+ emitError("linking '" + GV->getName() +
+ "': non-variables in comdat nodeduplicate are not handled");
+ }
+ }
+
for (unsigned I = 0; I < ValuesToLink.size(); ++I) {
GlobalValue *GV = ValuesToLink[I];
const Comdat *SC = GV->getComdat();
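
The GVToClone loop above keeps the initializer of a nodeduplicate comdat member alive by cloning it into a private variable even when symbol resolution picks the other copy. Below is a sketch of just the cloning step, lifted out of ModuleLinker; clonePrivateCopy is a hypothetical helper name, and a real caller would still need to add the clone to ValuesToLink as the patch does.

  #include "llvm/IR/GlobalVariable.h"
  #include "llvm/IR/Module.h"

  static llvm::GlobalVariable *clonePrivateCopy(llvm::GlobalVariable *Var) {
    // Same type, constness, linkage and initializer as the original member.
    auto *NewVar = new llvm::GlobalVariable(
        *Var->getParent(), Var->getValueType(), Var->isConstant(),
        Var->getLinkage(), Var->getInitializer());
    NewVar->copyAttributesFrom(Var);
    // Demote the clone to an unnamed, private, dso_local copy that stays in
    // the same comdat, so its contents survive without competing in symbol
    // resolution.
    NewVar->setVisibility(llvm::GlobalValue::DefaultVisibility);
    NewVar->setLinkage(llvm::GlobalValue::PrivateLinkage);
    NewVar->setDSOLocal(true);
    NewVar->setComdat(Var->getComdat());
    return NewVar;
  }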
diff --git a/llvm/lib/MC/ConstantPools.cpp b/llvm/lib/MC/ConstantPools.cpp
index d4199025ad77..d8a08a4bd439 100644
--- a/llvm/lib/MC/ConstantPools.cpp
+++ b/llvm/lib/MC/ConstantPools.cpp
@@ -28,7 +28,7 @@ void ConstantPool::emitEntries(MCStreamer &Streamer) {
return;
Streamer.emitDataRegion(MCDR_DataRegion);
for (const ConstantPoolEntry &Entry : Entries) {
- Streamer.emitCodeAlignment(Entry.Size); // align naturally
+ Streamer.emitValueToAlignment(Entry.Size); // align naturally
Streamer.emitLabel(Entry.Label);
Streamer.emitValue(Entry.Value, Entry.Size, Entry.Loc);
}
diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp
index e0ea44626b7f..883735fcc293 100644
--- a/llvm/lib/MC/ELFObjectWriter.cpp
+++ b/llvm/lib/MC/ELFObjectWriter.cpp
@@ -796,7 +796,7 @@ MCSectionELF *ELFWriter::createRelocationSection(MCContext &Ctx,
else
EntrySize = is64Bit() ? sizeof(ELF::Elf64_Rel) : sizeof(ELF::Elf32_Rel);
- unsigned Flags = 0;
+ unsigned Flags = ELF::SHF_INFO_LINK;
if (Sec.getFlags() & ELF::SHF_GROUP)
Flags = ELF::SHF_GROUP;
@@ -1311,6 +1311,7 @@ bool ELFObjectWriter::shouldRelocateWithSymbol(const MCAssembler &Asm,
case MCSymbolRefExpr::VK_GOT:
case MCSymbolRefExpr::VK_PLT:
case MCSymbolRefExpr::VK_GOTPCREL:
+ case MCSymbolRefExpr::VK_GOTPCREL_NORELAX:
case MCSymbolRefExpr::VK_PPC_GOT_LO:
case MCSymbolRefExpr::VK_PPC_GOT_HI:
case MCSymbolRefExpr::VK_PPC_GOT_HA:
diff --git a/llvm/lib/MC/MCAsmInfoGOFF.cpp b/llvm/lib/MC/MCAsmInfoGOFF.cpp
new file mode 100644
index 000000000000..81704ffe4b24
--- /dev/null
+++ b/llvm/lib/MC/MCAsmInfoGOFF.cpp
@@ -0,0 +1,27 @@
+//===- MCAsmInfoGOFF.cpp - MCGOFFAsmInfo properties -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines certain target specific asm properties for GOFF (z/OS)
+/// based targets.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmInfoGOFF.h"
+
+using namespace llvm;
+
+void MCAsmInfoGOFF::anchor() {}
+
+MCAsmInfoGOFF::MCAsmInfoGOFF() {
+ Data64bitsDirective = "\t.quad\t";
+ HasDotTypeDotSizeDirective = false;
+ PrivateGlobalPrefix = "@@";
+ PrivateLabelPrefix = "@";
+ ZeroDirective = "\t.space\t";
+}
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index 72f4ee3f33be..154b2d051f34 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -30,13 +30,13 @@
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbolXCOFF.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/TargetRegistry.h"
#include <cctype>
using namespace llvm;
@@ -245,7 +245,7 @@ public:
unsigned ValueSize = 1,
unsigned MaxBytesToEmit = 0) override;
- void emitCodeAlignment(unsigned ByteAlignment,
+ void emitCodeAlignment(unsigned ByteAlignment, const MCSubtargetInfo *STI,
unsigned MaxBytesToEmit = 0) override;
void emitValueToOffset(const MCExpr *Offset,
@@ -1429,6 +1429,7 @@ void MCAsmStreamer::emitValueToAlignment(unsigned ByteAlignment, int64_t Value,
}
void MCAsmStreamer::emitCodeAlignment(unsigned ByteAlignment,
+ const MCSubtargetInfo *STI,
unsigned MaxBytesToEmit) {
// Emit with a text fill value.
emitValueToAlignment(ByteAlignment, MAI->getTextAlignFillValue(),
diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp
index 9ed8d1083a40..d5e9f4fc66bc 100644
--- a/llvm/lib/MC/MCAssembler.cpp
+++ b/llvm/lib/MC/MCAssembler.cpp
@@ -483,6 +483,7 @@ void MCAssembler::writeFragmentPadding(raw_ostream &OS,
"Writing bundle padding for a fragment without instructions");
unsigned TotalLength = BundlePadding + static_cast<unsigned>(FSize);
+ const MCSubtargetInfo *STI = EF.getSubtargetInfo();
if (EF.alignToBundleEnd() && TotalLength > getBundleAlignSize()) {
// If the padding itself crosses a bundle boundary, it must be emitted
// in 2 pieces, since even nop instructions must not cross boundaries.
@@ -493,12 +494,12 @@ void MCAssembler::writeFragmentPadding(raw_ostream &OS,
// ----------------------------
// ^-------------------^ <- TotalLength
unsigned DistanceToBoundary = TotalLength - getBundleAlignSize();
- if (!getBackend().writeNopData(OS, DistanceToBoundary))
+ if (!getBackend().writeNopData(OS, DistanceToBoundary, STI))
report_fatal_error("unable to write NOP sequence of " +
Twine(DistanceToBoundary) + " bytes");
BundlePadding -= DistanceToBoundary;
}
- if (!getBackend().writeNopData(OS, BundlePadding))
+ if (!getBackend().writeNopData(OS, BundlePadding, STI))
report_fatal_error("unable to write NOP sequence of " +
Twine(BundlePadding) + " bytes");
}
@@ -544,7 +545,7 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm,
// bytes left to fill use the Value and ValueSize to fill the rest.
// If we are aligning with nops, ask that target to emit the right data.
if (AF.hasEmitNops()) {
- if (!Asm.getBackend().writeNopData(OS, Count))
+ if (!Asm.getBackend().writeNopData(OS, Count, AF.getSubtargetInfo()))
report_fatal_error("unable to write nop sequence of " +
Twine(Count) + " bytes");
break;
@@ -621,9 +622,11 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm,
case MCFragment::FT_Nops: {
++stats::EmittedNopsFragments;
const MCNopsFragment &NF = cast<MCNopsFragment>(F);
+
int64_t NumBytes = NF.getNumBytes();
int64_t ControlledNopLength = NF.getControlledNopLength();
- int64_t MaximumNopLength = Asm.getBackend().getMaximumNopSize();
+ int64_t MaximumNopLength =
+ Asm.getBackend().getMaximumNopSize(*NF.getSubtargetInfo());
assert(NumBytes > 0 && "Expected positive NOPs fragment size");
assert(ControlledNopLength >= 0 && "Expected non-negative NOP size");
@@ -647,7 +650,8 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm,
uint64_t NumBytesToEmit =
(uint64_t)std::min(NumBytes, ControlledNopLength);
assert(NumBytesToEmit && "try to emit empty NOP instruction");
- if (!Asm.getBackend().writeNopData(OS, NumBytesToEmit)) {
+ if (!Asm.getBackend().writeNopData(OS, NumBytesToEmit,
+ NF.getSubtargetInfo())) {
report_fatal_error("unable to write nop sequence of the remaining " +
Twine(NumBytesToEmit) + " bytes");
break;
@@ -664,7 +668,8 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm,
}
case MCFragment::FT_BoundaryAlign: {
- if (!Asm.getBackend().writeNopData(OS, FragmentSize))
+ const MCBoundaryAlignFragment &BF = cast<MCBoundaryAlignFragment>(F);
+ if (!Asm.getBackend().writeNopData(OS, FragmentSize, BF.getSubtargetInfo()))
report_fatal_error("unable to write nop sequence of " +
Twine(FragmentSize) + " bytes");
break;
diff --git a/llvm/lib/MC/MCDisassembler/Disassembler.cpp b/llvm/lib/MC/MCDisassembler/Disassembler.cpp
index 52ab0b41f539..aaa3b747682c 100644
--- a/llvm/lib/MC/MCDisassembler/Disassembler.cpp
+++ b/llvm/lib/MC/MCDisassembler/Disassembler.cpp
@@ -25,9 +25,9 @@
#include "llvm/MC/MCSchedule.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstddef>
diff --git a/llvm/lib/MC/MCDisassembler/MCRelocationInfo.cpp b/llvm/lib/MC/MCDisassembler/MCRelocationInfo.cpp
index 64e216e0051d..735be23206e4 100644
--- a/llvm/lib/MC/MCDisassembler/MCRelocationInfo.cpp
+++ b/llvm/lib/MC/MCDisassembler/MCRelocationInfo.cpp
@@ -8,7 +8,7 @@
#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
#include "llvm-c/Disassembler.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/MC/MCDwarf.cpp b/llvm/lib/MC/MCDwarf.cpp
index 27bb7a103165..1c9cfb9042e2 100644
--- a/llvm/lib/MC/MCDwarf.cpp
+++ b/llvm/lib/MC/MCDwarf.cpp
@@ -27,7 +27,6 @@
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
@@ -66,29 +65,6 @@ MCSymbol *mcdwarf::emitListsTableHeaderStart(MCStreamer &S) {
return End;
}
-/// Manage the .debug_line_str section contents, if we use it.
-class llvm::MCDwarfLineStr {
- MCSymbol *LineStrLabel = nullptr;
- StringTableBuilder LineStrings{StringTableBuilder::DWARF};
- bool UseRelocs = false;
-
-public:
- /// Construct an instance that can emit .debug_line_str (for use in a normal
- /// v5 line table).
- explicit MCDwarfLineStr(MCContext &Ctx) {
- UseRelocs = Ctx.getAsmInfo()->doesDwarfUseRelocationsAcrossSections();
- if (UseRelocs)
- LineStrLabel =
- Ctx.getObjectFileInfo()->getDwarfLineStrSection()->getBeginSymbol();
- }
-
- /// Emit a reference to the string.
- void emitRef(MCStreamer *MCOS, StringRef Path);
-
- /// Emit the .debug_line_str section if appropriate.
- void emitSection(MCStreamer *MCOS);
-};
-
static inline uint64_t ScaleAddrDelta(MCContext &Context, uint64_t AddrDelta) {
unsigned MinInsnLength = Context.getAsmInfo()->getMinInstAlignment();
if (MinInsnLength == 1)
@@ -100,6 +76,13 @@ static inline uint64_t ScaleAddrDelta(MCContext &Context, uint64_t AddrDelta) {
return AddrDelta / MinInsnLength;
}
+MCDwarfLineStr::MCDwarfLineStr(MCContext &Ctx) {
+ UseRelocs = Ctx.getAsmInfo()->doesDwarfUseRelocationsAcrossSections();
+ if (UseRelocs)
+ LineStrLabel =
+ Ctx.getObjectFileInfo()->getDwarfLineStrSection()->getBeginSymbol();
+}
+
//
// This is called when an instruction is assembled into the specified section
// and if there is information from the last .loc directive that has yet to have
@@ -158,23 +141,58 @@ makeStartPlusIntExpr(MCContext &Ctx, const MCSymbol &Start, int IntVal) {
return Res;
}
+void MCLineSection::addEndEntry(MCSymbol *EndLabel) {
+ auto *Sec = &EndLabel->getSection();
+ // The line table may be empty, in which case we should skip adding an end entry.
+ // There are two cases:
+ // (1) MCAsmStreamer - emitDwarfLocDirective emits a location directive in
+ // place instead of adding a line entry if the target has
+ // usesDwarfFileAndLocDirectives.
+ // (2) MCObjectStreamer - if a function has incomplete debug info where
+ // instructions don't have DILocations, the line entries are missing.
+ auto I = MCLineDivisions.find(Sec);
+ if (I != MCLineDivisions.end()) {
+ auto &Entries = I->second;
+ auto EndEntry = Entries.back();
+ EndEntry.setEndLabel(EndLabel);
+ Entries.push_back(EndEntry);
+ }
+}
+
//
// This emits the Dwarf line table for the specified section from the entries
// in the LineSection.
//
-static inline void emitDwarfLineTable(
+void MCDwarfLineTable::emitOne(
MCStreamer *MCOS, MCSection *Section,
const MCLineSection::MCDwarfLineEntryCollection &LineEntries) {
- unsigned FileNum = 1;
- unsigned LastLine = 1;
- unsigned Column = 0;
- unsigned Flags = DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0;
- unsigned Isa = 0;
- unsigned Discriminator = 0;
- MCSymbol *LastLabel = nullptr;
+
+ unsigned FileNum, LastLine, Column, Flags, Isa, Discriminator;
+ MCSymbol *LastLabel;
+ auto init = [&]() {
+ FileNum = 1;
+ LastLine = 1;
+ Column = 0;
+ Flags = DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0;
+ Isa = 0;
+ Discriminator = 0;
+ LastLabel = nullptr;
+ };
+ init();
// Loop through each MCDwarfLineEntry and encode the dwarf line number table.
+ bool EndEntryEmitted = false;
for (const MCDwarfLineEntry &LineEntry : LineEntries) {
+ MCSymbol *Label = LineEntry.getLabel();
+ const MCAsmInfo *asmInfo = MCOS->getContext().getAsmInfo();
+ if (LineEntry.IsEndEntry) {
+ MCOS->emitDwarfAdvanceLineAddr(INT64_MAX, LastLabel, Label,
+ asmInfo->getCodePointerSize());
+ init();
+ EndEntryEmitted = true;
+ continue;
+ }
+
int64_t LineDelta = static_cast<int64_t>(LineEntry.getLine()) - LastLine;
if (FileNum != LineEntry.getFileNum()) {
@@ -212,12 +230,9 @@ static inline void emitDwarfLineTable(
if (LineEntry.getFlags() & DWARF2_FLAG_EPILOGUE_BEGIN)
MCOS->emitInt8(dwarf::DW_LNS_set_epilogue_begin);
- MCSymbol *Label = LineEntry.getLabel();
-
// At this point we want to emit/create the sequence to encode the delta in
// line numbers and the increment of the address from the previous Label
// and the current Label.
- const MCAsmInfo *asmInfo = MCOS->getContext().getAsmInfo();
MCOS->emitDwarfAdvanceLineAddr(LineDelta, LastLabel, Label,
asmInfo->getCodePointerSize());
@@ -227,7 +242,12 @@ static inline void emitDwarfLineTable(
}
// Generate DWARF line end entry.
- MCOS->emitDwarfLineEndEntry(Section, LastLabel);
+ // We do not need this for DwarfDebug, which explicitly terminates the line
+ // table using ranges whenever the CU or section changes. However, the MC path
+ // does not track ranges nor terminate the line table. In that case,
+ // conservatively use the section end symbol to end the line table.
+ if (!EndEntryEmitted)
+ MCOS->emitDwarfLineEndEntry(Section, LastLabel);
}
//
@@ -522,7 +542,7 @@ void MCDwarfLineTable::emitCU(MCStreamer *MCOS, MCDwarfLineTableParams Params,
// Put out the line tables.
for (const auto &LineSec : MCLineSections.getMCLineEntries())
- emitDwarfLineTable(MCOS, LineSec.first, LineSec.second);
+ emitOne(MCOS, LineSec.first, LineSec.second);
// This is the end of the section, so set the value of the symbol at the end
// of this section (that was used in a previous expression).
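
MCDwarfLineTable::emitOne above resets the line-table state machine through a small init lambda whenever an end entry is seen, instead of duplicating the default values. A tiny standalone sketch of that reset pattern follows; the variable set is trimmed down from the real one.

  #include <cstdio>

  int main() {
    unsigned FileNum, LastLine, Column;
    auto init = [&] { FileNum = 1; LastLine = 1; Column = 0; };
    init();        // establish the DWARF row defaults once
    LastLine = 42; // ... rows are emitted and the state advances ...
    init();        // an end entry restarts the sequence from the defaults
    std::printf("%u %u %u\n", FileNum, LastLine, Column);
    return 0;
  }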
diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp
index 784d66805d63..1ba999a63113 100644
--- a/llvm/lib/MC/MCELFStreamer.cpp
+++ b/llvm/lib/MC/MCELFStreamer.cpp
@@ -29,10 +29,10 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
@@ -88,10 +88,10 @@ void MCELFStreamer::mergeFragment(MCDataFragment *DF,
DF->getContents().append(EF->getContents().begin(), EF->getContents().end());
}
-void MCELFStreamer::InitSections(bool NoExecStack) {
+void MCELFStreamer::initSections(bool NoExecStack, const MCSubtargetInfo &STI) {
MCContext &Ctx = getContext();
SwitchSection(Ctx.getObjectFileInfo()->getTextSection());
- emitCodeAlignment(4);
+ emitCodeAlignment(Ctx.getObjectFileInfo()->getTextSectionAlignment(), &STI);
if (NoExecStack)
SwitchSection(Ctx.getAsmInfo()->getNonexecutableStackSection(Ctx));
@@ -224,6 +224,7 @@ bool MCELFStreamer::emitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
case MCSA_ELF_TypeGnuUniqueObject:
Symbol->setType(CombineSymbolTypes(Symbol->getType(), ELF::STT_OBJECT));
Symbol->setBinding(ELF::STB_GNU_UNIQUE);
+ getAssembler().getWriter().markGnuAbi();
break;
case MCSA_Global:
@@ -325,7 +326,7 @@ void MCELFStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size,
SwitchSection(P.first, P.second);
} else {
if(Symbol->declareCommon(Size, ByteAlignment))
- report_fatal_error("Symbol: " + Symbol->getName() +
+ report_fatal_error(Twine("Symbol: ") + Symbol->getName() +
" redeclared as different type");
}
@@ -500,7 +501,7 @@ void MCELFStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE,
*MCOffset, "BFD_RELOC_NONE", SRE, SRE->getLoc(),
*getContext().getSubtargetInfo()))
report_fatal_error("Relocation for CG Profile could not be created: " +
- Err->second);
+ Twine(Err->second));
}
void MCELFStreamer::finalizeCGProfile() {
diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index 84ec0f6bb57b..10d494b5ac61 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -230,6 +230,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_GOTREL: return "GOTREL";
case VK_PCREL: return "PCREL";
case VK_GOTPCREL: return "GOTPCREL";
+ case VK_GOTPCREL_NORELAX: return "GOTPCREL_NORELAX";
case VK_GOTTPOFF: return "GOTTPOFF";
case VK_INDNTPOFF: return "INDNTPOFF";
case VK_NTPOFF: return "NTPOFF";
@@ -358,6 +359,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_WASM_MBREL: return "MBREL";
case VK_WASM_TLSREL: return "TLSREL";
case VK_WASM_TBREL: return "TBREL";
+ case VK_WASM_GOT_TLS: return "GOT@TLS";
case VK_AMDGPU_GOTPCREL32_LO: return "gotpcrel32@lo";
case VK_AMDGPU_GOTPCREL32_HI: return "gotpcrel32@hi";
case VK_AMDGPU_REL32_LO: return "rel32@lo";
@@ -393,6 +395,7 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
.Case("gotrel", VK_GOTREL)
.Case("pcrel", VK_PCREL)
.Case("gotpcrel", VK_GOTPCREL)
+ .Case("gotpcrel_norelax", VK_GOTPCREL_NORELAX)
.Case("gottpoff", VK_GOTTPOFF)
.Case("indntpoff", VK_INDNTPOFF)
.Case("ntpoff", VK_NTPOFF)
@@ -499,6 +502,7 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
.Case("tbrel", VK_WASM_TBREL)
.Case("mbrel", VK_WASM_MBREL)
.Case("tlsrel", VK_WASM_TLSREL)
+ .Case("got@tls", VK_WASM_GOT_TLS)
.Case("gotpcrel32@lo", VK_AMDGPU_GOTPCREL32_LO)
.Case("gotpcrel32@hi", VK_AMDGPU_GOTPCREL32_HI)
.Case("rel32@lo", VK_AMDGPU_REL32_LO)
diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp
index 0f8543f51096..4634de863b2f 100644
--- a/llvm/lib/MC/MCFragment.cpp
+++ b/llvm/lib/MC/MCFragment.cpp
@@ -128,7 +128,11 @@ static bool getSymbolOffsetImpl(const MCAsmLayout &Layout, const MCSymbol &S,
const MCSymbolRefExpr *A = Target.getSymA();
if (A) {
uint64_t ValA;
- if (!getLabelOffset(Layout, A->getSymbol(), ReportError, ValA))
+ // FIXME: On most platforms, `Target`'s component symbols are labels from
+ // having been simplified during evaluation, but on Mach-O they can be
+ // variables due to PR19203. This, and the line below for `B`, can be
+ // restored to call `getLabelOffset` when PR19203 is fixed.
+ if (!getSymbolOffsetImpl(Layout, A->getSymbol(), ReportError, ValA))
return false;
Offset += ValA;
}
@@ -136,7 +140,7 @@ static bool getSymbolOffsetImpl(const MCAsmLayout &Layout, const MCSymbol &S,
const MCSymbolRefExpr *B = Target.getSymB();
if (B) {
uint64_t ValB;
- if (!getLabelOffset(Layout, B->getSymbol(), ReportError, ValB))
+ if (!getSymbolOffsetImpl(Layout, B->getSymbol(), ReportError, ValB))
return false;
Offset -= ValB;
}
diff --git a/llvm/lib/MC/MCInstrAnalysis.cpp b/llvm/lib/MC/MCInstrAnalysis.cpp
index a7dc0626d0ab..52b59185c6fc 100644
--- a/llvm/lib/MC/MCInstrAnalysis.cpp
+++ b/llvm/lib/MC/MCInstrAnalysis.cpp
@@ -29,8 +29,14 @@ bool MCInstrAnalysis::evaluateBranch(const MCInst & /*Inst*/, uint64_t /*Addr*/,
return false;
}
-Optional<uint64_t>
-MCInstrAnalysis::evaluateMemoryOperandAddress(const MCInst &Inst, uint64_t Addr,
- uint64_t Size) const {
+Optional<uint64_t> MCInstrAnalysis::evaluateMemoryOperandAddress(
+ const MCInst &Inst, const MCSubtargetInfo *STI, uint64_t Addr,
+ uint64_t Size) const {
return None;
}
+
+Optional<uint64_t>
+MCInstrAnalysis::getMemoryOperandRelocationOffset(const MCInst &Inst,
+ uint64_t Size) const {
+ return None;
+} \ No newline at end of file
diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp
index f0948a184598..aa94b141d8be 100644
--- a/llvm/lib/MC/MCMachOStreamer.cpp
+++ b/llvm/lib/MC/MCMachOStreamer.cpp
@@ -30,9 +30,9 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolMachO.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <vector>
diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp
index 7ea1106068b7..d7f85f793c55 100644
--- a/llvm/lib/MC/MCObjectFileInfo.cpp
+++ b/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -896,6 +896,19 @@ void MCObjectFileInfo::initXCOFFMCObjectFileInfo(const Triple &T) {
".rodata", SectionKind::getReadOnly(),
XCOFF::CsectProperties(XCOFF::StorageMappingClass::XMC_RO, XCOFF::XTY_SD),
/* MultiSymbolsAllowed*/ true);
+ ReadOnlySection->setAlignment(Align(4));
+
+ ReadOnly8Section = Ctx->getXCOFFSection(
+ ".rodata.8", SectionKind::getReadOnly(),
+ XCOFF::CsectProperties(XCOFF::StorageMappingClass::XMC_RO, XCOFF::XTY_SD),
+ /* MultiSymbolsAllowed*/ true);
+ ReadOnly8Section->setAlignment(Align(8));
+
+ ReadOnly16Section = Ctx->getXCOFFSection(
+ ".rodata.16", SectionKind::getReadOnly(),
+ XCOFF::CsectProperties(XCOFF::StorageMappingClass::XMC_RO, XCOFF::XTY_SD),
+ /* MultiSymbolsAllowed*/ true);
+ ReadOnly16Section->setAlignment(Align(16));
TLSDataSection = Ctx->getXCOFFSection(
".tdata", SectionKind::getThreadData(),
@@ -968,6 +981,8 @@ void MCObjectFileInfo::initXCOFFMCObjectFileInfo(const Triple &T) {
/* MultiSymbolsAllowed */ true, ".dwmac", XCOFF::SSUBTYP_DWMAC);
}
+MCObjectFileInfo::~MCObjectFileInfo() {}
+
void MCObjectFileInfo::initMCObjectFileInfo(MCContext &MCCtx, bool PIC,
bool LargeCodeModel) {
PositionIndependent = PIC;
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index 2865a2ad80a9..9c86fcc86bcb 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -368,7 +368,7 @@ void MCObjectStreamer::emitInstruction(const MCInst &Inst,
"' cannot have instructions");
return;
}
- getAssembler().getBackend().emitInstructionBegin(*this, Inst);
+ getAssembler().getBackend().emitInstructionBegin(*this, Inst, STI);
emitInstructionImpl(Inst, STI);
getAssembler().getBackend().emitInstructionEnd(*this, Inst);
}
@@ -609,9 +609,10 @@ void MCObjectStreamer::emitValueToAlignment(unsigned ByteAlignment,
}
void MCObjectStreamer::emitCodeAlignment(unsigned ByteAlignment,
+ const MCSubtargetInfo *STI,
unsigned MaxBytesToEmit) {
emitValueToAlignment(ByteAlignment, 0, 1, MaxBytesToEmit);
- cast<MCAlignFragment>(getCurrentFragment())->setEmitNops(true);
+ cast<MCAlignFragment>(getCurrentFragment())->setEmitNops(true, STI);
}
void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset,
@@ -835,13 +836,14 @@ void MCObjectStreamer::emitFill(const MCExpr &NumValues, int64_t Size,
}
void MCObjectStreamer::emitNops(int64_t NumBytes, int64_t ControlledNopLength,
- SMLoc Loc) {
+ SMLoc Loc, const MCSubtargetInfo &STI) {
// Emit an NOP fragment.
MCDataFragment *DF = getOrCreateDataFragment();
flushPendingLabels(DF, DF->getContents().size());
assert(getCurrentSectionOnly() && "need a section");
- insert(new MCNopsFragment(NumBytes, ControlledNopLength, Loc));
+
+ insert(new MCNopsFragment(NumBytes, ControlledNopLength, Loc, STI));
}
void MCObjectStreamer::emitFileDirective(StringRef Filename) {
diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp
index e328ba5315af..bf9b9e916d6f 100644
--- a/llvm/lib/MC/MCParser/AsmLexer.cpp
+++ b/llvm/lib/MC/MCParser/AsmLexer.cpp
@@ -228,6 +228,7 @@ AsmToken AsmLexer::LexLineComment() {
int CurChar = getNextChar();
while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
CurChar = getNextChar();
+ const char *NewlinePtr = CurPtr;
if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
++CurPtr;
@@ -235,7 +236,7 @@ AsmToken AsmLexer::LexLineComment() {
if (CommentConsumer) {
CommentConsumer->HandleComment(
SMLoc::getFromPointer(CommentTextStart),
- StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
+ StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart));
}
IsAtStartOfLine = true;
diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index d3cb5ca59bf3..ed9f2066dc20 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -749,6 +749,7 @@ namespace llvm {
extern MCAsmParserExtension *createDarwinAsmParser();
extern MCAsmParserExtension *createELFAsmParser();
extern MCAsmParserExtension *createCOFFAsmParser();
+extern MCAsmParserExtension *createGOFFAsmParser();
extern MCAsmParserExtension *createXCOFFAsmParser();
extern MCAsmParserExtension *createWasmAsmParser();
@@ -783,7 +784,8 @@ AsmParser::AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
PlatformParser.reset(createELFAsmParser());
break;
case MCContext::IsGOFF:
- report_fatal_error("GOFFAsmParser support not implemented yet");
+ PlatformParser.reset(createGOFFAsmParser());
+ break;
case MCContext::IsWasm:
PlatformParser.reset(createWasmAsmParser());
break;
@@ -950,7 +952,7 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
// Create the initial section, if requested.
if (!NoInitialTextSection)
- Out.InitSections(false);
+ Out.initSections(false, getTargetParser().getSTI());
// Prime the lexer.
Lex();
@@ -1052,18 +1054,21 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
}
}
}
-
// Finalize the output stream if there are no errors and if the client wants
// us to.
- if (!HadError && !NoFinalize)
+ if (!HadError && !NoFinalize) {
+ if (auto *TS = Out.getTargetStreamer())
+ TS->emitConstantPools();
+
Out.Finish(Lexer.getLoc());
+ }
return HadError || getContext().hadError();
}
bool AsmParser::checkForValidSection() {
if (!ParsingMSInlineAsm && !getStreamer().getCurrentSectionOnly()) {
- Out.InitSections(false);
+ Out.initSections(false, getTargetParser().getSTI());
return Error(getTok().getLoc(),
"expected section directive before assembly directive");
}
@@ -3451,7 +3456,8 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
bool UseCodeAlign = Section->UseCodeAlign();
if ((!HasFillExpr || Lexer.getMAI().getTextAlignFillValue() == FillExpr) &&
ValueSize == 1 && UseCodeAlign) {
- getStreamer().emitCodeAlignment(Alignment, MaxBytesToFill);
+ getStreamer().emitCodeAlignment(Alignment, &getTargetParser().getSTI(),
+ MaxBytesToFill);
} else {
// FIXME: Target specific behavior about how the "extra" bytes are filled.
getStreamer().emitValueToAlignment(Alignment, FillExpr, ValueSize,
diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
index 70d69fc8dd32..ddc41d0a08ab 100644
--- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
@@ -502,6 +502,23 @@ static bool hasPrefix(StringRef SectionName, StringRef Prefix) {
return SectionName.startswith(Prefix) || SectionName == Prefix.drop_back();
}
+static bool allowSectionTypeMismatch(const Triple &TT, StringRef SectionName,
+ unsigned Type) {
+ if (TT.getArch() == Triple::x86_64) {
+ // x86-64 psABI names SHT_X86_64_UNWIND as the canonical type for .eh_frame,
+ // but GNU as emits SHT_PROGBITS .eh_frame for .cfi_* directives. Don't
+ // error for SHT_PROGBITS .eh_frame
+ return SectionName == ".eh_frame" && Type == ELF::SHT_PROGBITS;
+ }
+ if (TT.isMIPS()) {
+ // MIPS .debug_* sections should have SHT_MIPS_DWARF section type to
+ // distinguish among sections containing DWARF and ECOFF debug formats,
+ // but in assembly files these sections have SHT_PROGBITS type.
+ return hasPrefix(SectionName, ".debug_") && Type == ELF::SHT_PROGBITS;
+ }
+ return false;
+}
+
bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
StringRef SectionName;
@@ -659,11 +676,9 @@ EndStmt:
getContext().getELFSection(SectionName, Type, Flags, Size, GroupName,
IsComdat, UniqueID, LinkedToSym);
getStreamer().SwitchSection(Section, Subsection);
- // x86-64 psABI names SHT_X86_64_UNWIND as the canonical type for .eh_frame,
- // but GNU as emits SHT_PROGBITS .eh_frame for .cfi_* directives. Don't error
- // for SHT_PROGBITS .eh_frame
if (Section->getType() != Type &&
- !(SectionName == ".eh_frame" && Type == ELF::SHT_PROGBITS))
+ !allowSectionTypeMismatch(getContext().getTargetTriple(), SectionName,
+ Type))
Error(loc, "changed section type for " + SectionName + ", expected: 0x" +
utohexstr(Section->getType()));
// Check that flags are used consistently. However, the GNU assembler permits
@@ -815,7 +830,7 @@ bool ELFAsmParser::ParseDirectiveSymver(StringRef, SMLoc) {
if (getParser().parseIdentifier(Name))
return TokError("expected identifier in directive");
- if (Name.find('@') == StringRef::npos)
+ if (!Name.contains('@'))
return TokError("expected a '@' in the name");
bool KeepOriginalSym = !Name.contains("@@@");
if (parseOptionalToken(AsmToken::Comma)) {
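
allowSectionTypeMismatch() above tolerates a declared SHT_PROGBITS type in two cases despite the section's canonical type. Here is a slightly simplified, self-contained restatement as a sketch (it drops the exact-".debug" spelling that hasPrefix also accepts):

  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/Triple.h"
  #include "llvm/BinaryFormat/ELF.h"

  static bool tolerateMismatch(const llvm::Triple &TT, llvm::StringRef Name,
                               unsigned Type) {
    if (Type != llvm::ELF::SHT_PROGBITS)
      return false;
    if (TT.getArch() == llvm::Triple::x86_64)
      return Name == ".eh_frame";        // GNU as compatibility for .cfi_*
    if (TT.isMIPS())
      return Name.startswith(".debug_"); // assembly files use SHT_PROGBITS here
    return false;
  }

  int main() {
    llvm::Triple X86("x86_64-unknown-linux-gnu");
    return tolerateMismatch(X86, ".eh_frame", llvm::ELF::SHT_PROGBITS) ? 0 : 1;
  }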
diff --git a/llvm/lib/MC/MCParser/GOFFAsmParser.cpp b/llvm/lib/MC/MCParser/GOFFAsmParser.cpp
new file mode 100644
index 000000000000..c2a7eaee8029
--- /dev/null
+++ b/llvm/lib/MC/MCParser/GOFFAsmParser.cpp
@@ -0,0 +1,48 @@
+//===- GOFFAsmParser.cpp - GOFF Assembly Parser ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/MC/MCSectionGOFF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbolGOFF.h"
+
+using namespace llvm;
+
+namespace {
+
+class GOFFAsmParser : public MCAsmParserExtension {
+ template <bool (GOFFAsmParser::*HandlerMethod)(StringRef, SMLoc)>
+ void addDirectiveHandler(StringRef Directive) {
+ MCAsmParser::ExtensionDirectiveHandler Handler =
+ std::make_pair(this, HandleDirective<GOFFAsmParser, HandlerMethod>);
+
+ getParser().addDirectiveHandler(Directive, Handler);
+ }
+
+public:
+ GOFFAsmParser() {}
+
+ void Initialize(MCAsmParser &Parser) override {
+ // Call the base implementation.
+ this->MCAsmParserExtension::Initialize(Parser);
+ }
+};
+
+} // namespace
+
+namespace llvm {
+
+MCAsmParserExtension *createGOFFAsmParser() { return new GOFFAsmParser; }
+
+} // namespace llvm
diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp
index 7b4d6e529cc2..f1704cef46ac 100644
--- a/llvm/lib/MC/MCParser/MasmParser.cpp
+++ b/llvm/lib/MC/MCParser/MasmParser.cpp
@@ -1319,7 +1319,7 @@ bool MasmParser::enabledGenDwarfForAssembly() {
bool MasmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
// Create the initial section, if requested.
if (!NoInitialTextSection)
- Out.InitSections(false);
+ Out.initSections(false, getTargetParser().getSTI());
// Prime the lexer.
Lex();
@@ -1437,7 +1437,7 @@ bool MasmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
bool MasmParser::checkForValidSection() {
if (!ParsingMSInlineAsm && !getStreamer().getCurrentSectionOnly()) {
- Out.InitSections(false);
+ Out.initSections(false, getTargetParser().getSTI());
return Error(getTok().getLoc(),
"expected section directive before assembly directive");
}
@@ -4772,7 +4772,8 @@ bool MasmParser::emitAlignTo(int64_t Alignment) {
const MCSection *Section = getStreamer().getCurrentSectionOnly();
assert(Section && "must have section to emit alignment");
if (Section->UseCodeAlign()) {
- getStreamer().emitCodeAlignment(Alignment, /*MaxBytesToEmit=*/0);
+ getStreamer().emitCodeAlignment(Alignment, &getTargetParser().getSTI(),
+ /*MaxBytesToEmit=*/0);
} else {
// FIXME: Target specific behavior about how the "extra" bytes are filled.
getStreamer().emitValueToAlignment(Alignment, /*Value=*/0,
diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp
index 731831d3bce3..e35bcec8fe75 100644
--- a/llvm/lib/MC/MCPseudoProbe.cpp
+++ b/llvm/lib/MC/MCPseudoProbe.cpp
@@ -12,10 +12,17 @@
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/raw_ostream.h"
+#include <limits>
+#include <memory>
+#include <sstream>
#define DEBUG_TYPE "mcpseudoprobe"
using namespace llvm;
+using namespace support;
#ifndef NDEBUG
int MCPseudoProbeTable::DdgPrintIndent = 0;
@@ -69,23 +76,6 @@ void MCPseudoProbe::emit(MCObjectStreamer *MCOS,
});
}
-MCPseudoProbeInlineTree::~MCPseudoProbeInlineTree() {
- for (auto &Inlinee : Inlinees)
- delete Inlinee.second;
-}
-
-MCPseudoProbeInlineTree *
-MCPseudoProbeInlineTree::getOrAddNode(InlineSite Site) {
- auto Iter = Inlinees.find(Site);
- if (Iter == Inlinees.end()) {
- auto *Node = new MCPseudoProbeInlineTree(std::get<0>(Site));
- Inlinees[Site] = Node;
- return Node;
- } else {
- return Iter->second;
- }
-}
-
void MCPseudoProbeInlineTree::addPseudoProbe(
const MCPseudoProbe &Probe, const MCPseudoProbeInlineStack &InlineStack) {
// The function should not be called on the root.
@@ -147,7 +137,7 @@ void MCPseudoProbeInlineTree::emit(MCObjectStreamer *MCOS,
// Emit number of probes in this node
MCOS->emitULEB128IntValue(Probes.size());
// Emit number of direct inlinees
- MCOS->emitULEB128IntValue(Inlinees.size());
+ MCOS->emitULEB128IntValue(Children.size());
// Emit probes in this group
for (const auto &Probe : Probes) {
Probe.emit(MCOS, LastProbe);
@@ -157,7 +147,13 @@ void MCPseudoProbeInlineTree::emit(MCObjectStreamer *MCOS,
assert(Probes.empty() && "Root should not have probes");
}
- // Emit descendent
+ // Emit descendants in sorted order.
+ // InlineSite is unique for each child, so sorting by it gives a
+ // deterministic order instead of one based on the MCPseudoProbeInlineTree*.
+ std::map<InlineSite, MCPseudoProbeInlineTree *> Inlinees;
+ for (auto Child = Children.begin(); Child != Children.end(); ++Child)
+ Inlinees[Child->first] = Child->second.get();
+
for (const auto &Inlinee : Inlinees) {
if (Guid) {
// Emit probe index
@@ -211,3 +207,361 @@ void MCPseudoProbeTable::emit(MCObjectStreamer *MCOS) {
// Put out the probe.
ProbeSections.emit(MCOS);
}
+
+static StringRef getProbeFNameForGUID(const GUIDProbeFunctionMap &GUID2FuncMAP,
+ uint64_t GUID) {
+ auto It = GUID2FuncMAP.find(GUID);
+ assert(It != GUID2FuncMAP.end() &&
+ "Probe function must exist for a valid GUID");
+ return It->second.FuncName;
+}
+
+void MCPseudoProbeFuncDesc::print(raw_ostream &OS) {
+ OS << "GUID: " << FuncGUID << " Name: " << FuncName << "\n";
+ OS << "Hash: " << FuncHash << "\n";
+}
+
+void MCDecodedPseudoProbe::getInlineContext(
+ SmallVectorImpl<MCPseduoProbeFrameLocation> &ContextStack,
+ const GUIDProbeFunctionMap &GUID2FuncMAP) const {
+ uint32_t Begin = ContextStack.size();
+ MCDecodedPseudoProbeInlineTree *Cur = InlineTree;
+ // It will add the string of each node's inline site during iteration.
+ // Note that it won't include the function the probe belongs to (the leaf location).
+ while (Cur->hasInlineSite()) {
+ StringRef FuncName =
+ getProbeFNameForGUID(GUID2FuncMAP, std::get<0>(Cur->ISite));
+ ContextStack.emplace_back(
+ MCPseduoProbeFrameLocation(FuncName, std::get<1>(Cur->ISite)));
+ Cur = static_cast<MCDecodedPseudoProbeInlineTree *>(Cur->Parent);
+ }
+ // Make the ContextStack in caller-callee order
+ std::reverse(ContextStack.begin() + Begin, ContextStack.end());
+}
+
+std::string MCDecodedPseudoProbe::getInlineContextStr(
+ const GUIDProbeFunctionMap &GUID2FuncMAP) const {
+ std::ostringstream OContextStr;
+ SmallVector<MCPseduoProbeFrameLocation, 16> ContextStack;
+ getInlineContext(ContextStack, GUID2FuncMAP);
+ for (auto &Cxt : ContextStack) {
+ if (OContextStr.str().size())
+ OContextStr << " @ ";
+ OContextStr << Cxt.first.str() << ":" << Cxt.second;
+ }
+ return OContextStr.str();
+}
+
+static const char *PseudoProbeTypeStr[3] = {"Block", "IndirectCall",
+ "DirectCall"};
+
+void MCDecodedPseudoProbe::print(raw_ostream &OS,
+ const GUIDProbeFunctionMap &GUID2FuncMAP,
+ bool ShowName) const {
+ OS << "FUNC: ";
+ if (ShowName) {
+ StringRef FuncName = getProbeFNameForGUID(GUID2FuncMAP, Guid);
+ OS << FuncName.str() << " ";
+ } else {
+ OS << Guid << " ";
+ }
+ OS << "Index: " << Index << " ";
+ OS << "Type: " << PseudoProbeTypeStr[static_cast<uint8_t>(Type)] << " ";
+ std::string InlineContextStr = getInlineContextStr(GUID2FuncMAP);
+ if (InlineContextStr.size()) {
+ OS << "Inlined: @ ";
+ OS << InlineContextStr;
+ }
+ OS << "\n";
+}
+
+template <typename T> ErrorOr<T> MCPseudoProbeDecoder::readUnencodedNumber() {
+ if (Data + sizeof(T) > End) {
+ return std::error_code();
+ }
+ T Val = endian::readNext<T, little, unaligned>(Data);
+ return ErrorOr<T>(Val);
+}
+
+template <typename T> ErrorOr<T> MCPseudoProbeDecoder::readUnsignedNumber() {
+ unsigned NumBytesRead = 0;
+ uint64_t Val = decodeULEB128(Data, &NumBytesRead);
+ if (Val > std::numeric_limits<T>::max() || (Data + NumBytesRead > End)) {
+ return std::error_code();
+ }
+ Data += NumBytesRead;
+ return ErrorOr<T>(static_cast<T>(Val));
+}
+
+template <typename T> ErrorOr<T> MCPseudoProbeDecoder::readSignedNumber() {
+ unsigned NumBytesRead = 0;
+ int64_t Val = decodeSLEB128(Data, &NumBytesRead);
+ if (Val > std::numeric_limits<T>::max() || (Data + NumBytesRead > End)) {
+ return std::error_code();
+ }
+ Data += NumBytesRead;
+ return ErrorOr<T>(static_cast<T>(Val));
+}
+
+ErrorOr<StringRef> MCPseudoProbeDecoder::readString(uint32_t Size) {
+ StringRef Str(reinterpret_cast<const char *>(Data), Size);
+ if (Data + Size > End) {
+ return std::error_code();
+ }
+ Data += Size;
+ return ErrorOr<StringRef>(Str);
+}
+
+bool MCPseudoProbeDecoder::buildGUID2FuncDescMap(const uint8_t *Start,
+ std::size_t Size) {
+ // The pseudo_probe_desc section has a format like:
+ // .section .pseudo_probe_desc,"",@progbits
+ // .quad -5182264717993193164 // GUID
+ // .quad 4294967295 // Hash
+ // .uleb 3 // Name size
+ // .ascii "foo" // Name
+ // .quad -2624081020897602054
+ // .quad 174696971957
+ // .uleb 34
+ // .ascii "main"
+
+ Data = Start;
+ End = Data + Size;
+
+ while (Data < End) {
+ auto ErrorOrGUID = readUnencodedNumber<uint64_t>();
+ if (!ErrorOrGUID)
+ return false;
+
+ auto ErrorOrHash = readUnencodedNumber<uint64_t>();
+ if (!ErrorOrHash)
+ return false;
+
+ auto ErrorOrNameSize = readUnsignedNumber<uint32_t>();
+ if (!ErrorOrNameSize)
+ return false;
+ uint32_t NameSize = std::move(*ErrorOrNameSize);
+
+ auto ErrorOrName = readString(NameSize);
+ if (!ErrorOrName)
+ return false;
+
+ uint64_t GUID = std::move(*ErrorOrGUID);
+ uint64_t Hash = std::move(*ErrorOrHash);
+ StringRef Name = std::move(*ErrorOrName);
+
+    // Build an MCPseudoProbeFuncDesc and insert it into GUID2FuncDescMap.
+ GUID2FuncDescMap.emplace(GUID, MCPseudoProbeFuncDesc(GUID, Hash, Name));
+ }
+ assert(Data == End && "Have unprocessed data in pseudo_probe_desc section");
+ return true;
+}
+
+bool MCPseudoProbeDecoder::buildAddress2ProbeMap(const uint8_t *Start,
+ std::size_t Size) {
+ // The pseudo_probe section encodes an inline forest and each tree has a
+ // format like:
+ // FUNCTION BODY (one for each uninlined function present in the text
+ // section)
+ // GUID (uint64)
+ // GUID of the function
+ // NPROBES (ULEB128)
+ // Number of probes originating from this function.
+ // NUM_INLINED_FUNCTIONS (ULEB128)
+ // Number of callees inlined into this function, aka number of
+ // first-level inlinees
+ // PROBE RECORDS
+ // A list of NPROBES entries. Each entry contains:
+ // INDEX (ULEB128)
+ // TYPE (uint4)
+ // 0 - block probe, 1 - indirect call, 2 - direct call
+ // ATTRIBUTE (uint3)
+ // 1 - tail call, 2 - dangling
+ // ADDRESS_TYPE (uint1)
+ // 0 - code address, 1 - address delta
+ // CODE_ADDRESS (uint64 or ULEB128)
+ // code address or address delta, depending on Flag
+ // INLINED FUNCTION RECORDS
+ // A list of NUM_INLINED_FUNCTIONS entries describing each of the
+ // inlined callees. Each record contains:
+ // INLINE SITE
+ // Index of the callsite probe (ULEB128)
+ // FUNCTION BODY
+ // A FUNCTION BODY entry describing the inlined function.
+
+ Data = Start;
+ End = Data + Size;
+
+ MCDecodedPseudoProbeInlineTree *Root = &DummyInlineRoot;
+ MCDecodedPseudoProbeInlineTree *Cur = &DummyInlineRoot;
+ uint64_t LastAddr = 0;
+ uint32_t Index = 0;
+ // A DFS-based decoding
+ while (Data < End) {
+ if (Root == Cur) {
+      // Use a sequential id for the top-level inliner.
+ Index = Root->getChildren().size();
+ } else {
+ // Read inline site for inlinees
+ auto ErrorOrIndex = readUnsignedNumber<uint32_t>();
+ if (!ErrorOrIndex)
+ return false;
+ Index = std::move(*ErrorOrIndex);
+ }
+    // Switch to (or add) a new tree node (inlinee).
+ Cur = Cur->getOrAddNode(std::make_tuple(Cur->Guid, Index));
+ // Read guid
+ auto ErrorOrCurGuid = readUnencodedNumber<uint64_t>();
+ if (!ErrorOrCurGuid)
+ return false;
+ Cur->Guid = std::move(*ErrorOrCurGuid);
+ // Read number of probes in the current node.
+ auto ErrorOrNodeCount = readUnsignedNumber<uint32_t>();
+ if (!ErrorOrNodeCount)
+ return false;
+ uint32_t NodeCount = std::move(*ErrorOrNodeCount);
+ // Read number of direct inlinees
+ auto ErrorOrCurChildrenToProcess = readUnsignedNumber<uint32_t>();
+ if (!ErrorOrCurChildrenToProcess)
+ return false;
+ Cur->ChildrenToProcess = std::move(*ErrorOrCurChildrenToProcess);
+ // Read all probes in this node
+ for (std::size_t I = 0; I < NodeCount; I++) {
+ // Read index
+ auto ErrorOrIndex = readUnsignedNumber<uint32_t>();
+ if (!ErrorOrIndex)
+ return false;
+ uint32_t Index = std::move(*ErrorOrIndex);
+ // Read type | flag.
+ auto ErrorOrValue = readUnencodedNumber<uint8_t>();
+ if (!ErrorOrValue)
+ return false;
+ uint8_t Value = std::move(*ErrorOrValue);
+ uint8_t Kind = Value & 0xf;
+ uint8_t Attr = (Value & 0x70) >> 4;
+ // Read address
+ uint64_t Addr = 0;
+ if (Value & 0x80) {
+ auto ErrorOrOffset = readSignedNumber<int64_t>();
+ if (!ErrorOrOffset)
+ return false;
+ int64_t Offset = std::move(*ErrorOrOffset);
+ Addr = LastAddr + Offset;
+ } else {
+ auto ErrorOrAddr = readUnencodedNumber<int64_t>();
+ if (!ErrorOrAddr)
+ return false;
+ Addr = std::move(*ErrorOrAddr);
+ }
+ // Populate Address2ProbesMap
+ auto &Probes = Address2ProbesMap[Addr];
+ Probes.emplace_back(Addr, Cur->Guid, Index, PseudoProbeType(Kind), Attr,
+ Cur);
+ Cur->addProbes(&Probes.back());
+ LastAddr = Addr;
+ }
+
+    // Find the parent for the next node: walk up from each finished node,
+    // decrementing its parent's count of unprocessed children. The first
+    // ancestor that still has unprocessed children (or the root) is where the
+    // next record attaches.
+ while (Cur != Root) {
+ if (Cur->ChildrenToProcess == 0) {
+ Cur = static_cast<MCDecodedPseudoProbeInlineTree *>(Cur->Parent);
+ if (Cur != Root) {
+ assert(Cur->ChildrenToProcess > 0 &&
+ "Should have some unprocessed nodes");
+ Cur->ChildrenToProcess -= 1;
+ }
+ } else {
+ break;
+ }
+ }
+ }
+
+ assert(Data == End && "Have unprocessed data in pseudo_probe section");
+ assert(Cur == Root &&
+ " Cur should point to root when the forest is fully built up");
+ return true;
+}
+
+void MCPseudoProbeDecoder::printGUID2FuncDescMap(raw_ostream &OS) {
+ OS << "Pseudo Probe Desc:\n";
+ // Make the output deterministic
+ std::map<uint64_t, MCPseudoProbeFuncDesc> OrderedMap(GUID2FuncDescMap.begin(),
+ GUID2FuncDescMap.end());
+ for (auto &I : OrderedMap) {
+ I.second.print(OS);
+ }
+}
+
+void MCPseudoProbeDecoder::printProbeForAddress(raw_ostream &OS,
+ uint64_t Address) {
+ auto It = Address2ProbesMap.find(Address);
+ if (It != Address2ProbesMap.end()) {
+ for (auto &Probe : It->second) {
+ OS << " [Probe]:\t";
+ Probe.print(OS, GUID2FuncDescMap, true);
+ }
+ }
+}
+
+void MCPseudoProbeDecoder::printProbesForAllAddresses(raw_ostream &OS) {
+ std::vector<uint64_t> Addresses;
+ for (auto Entry : Address2ProbesMap)
+ Addresses.push_back(Entry.first);
+ std::sort(Addresses.begin(), Addresses.end());
+ for (auto K : Addresses) {
+ OS << "Address:\t";
+ OS << K;
+ OS << "\n";
+ printProbeForAddress(OS, K);
+ }
+}
+
+const MCDecodedPseudoProbe *
+MCPseudoProbeDecoder::getCallProbeForAddr(uint64_t Address) const {
+ auto It = Address2ProbesMap.find(Address);
+ if (It == Address2ProbesMap.end())
+ return nullptr;
+ const auto &Probes = It->second;
+
+ const MCDecodedPseudoProbe *CallProbe = nullptr;
+ for (const auto &Probe : Probes) {
+ if (Probe.isCall()) {
+ assert(!CallProbe &&
+ "There should be only one call probe corresponding to address "
+ "which is a callsite.");
+ CallProbe = &Probe;
+ }
+ }
+ return CallProbe;
+}
+
+const MCPseudoProbeFuncDesc *
+MCPseudoProbeDecoder::getFuncDescForGUID(uint64_t GUID) const {
+ auto It = GUID2FuncDescMap.find(GUID);
+ assert(It != GUID2FuncDescMap.end() && "Function descriptor doesn't exist");
+ return &It->second;
+}
+
+void MCPseudoProbeDecoder::getInlineContextForProbe(
+ const MCDecodedPseudoProbe *Probe,
+ SmallVectorImpl<MCPseduoProbeFrameLocation> &InlineContextStack,
+ bool IncludeLeaf) const {
+ Probe->getInlineContext(InlineContextStack, GUID2FuncDescMap);
+ if (!IncludeLeaf)
+ return;
+  // The context from the probe does not include the leaf frame, so retrieve it
+  // and append it here if requested.
+ const auto *FuncDesc = getFuncDescForGUID(Probe->getGuid());
+ InlineContextStack.emplace_back(
+ MCPseduoProbeFrameLocation(FuncDesc->FuncName, Probe->getIndex()));
+}
+
+const MCPseudoProbeFuncDesc *MCPseudoProbeDecoder::getInlinerDescForProbe(
+ const MCDecodedPseudoProbe *Probe) const {
+ MCDecodedPseudoProbeInlineTree *InlinerNode = Probe->getInlineTreeNode();
+ if (!InlinerNode->hasInlineSite())
+ return nullptr;
+ return getFuncDescForGUID(std::get<0>(InlinerNode->ISite));
+}
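
The record layout documented in buildAddress2ProbeMap above packs the probe
type, attribute and address-encoding flag into a single byte that follows the
ULEB128 probe index. A minimal, self-contained sketch of that unpacking
(illustrative only; readULEB128 is a local helper for this example, not the
decoder's own readUnsignedNumber):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Decode one unsigned LEB128 value and advance the cursor.
    static uint64_t readULEB128(const uint8_t *&P) {
      uint64_t Value = 0;
      unsigned Shift = 0;
      uint8_t Byte;
      do {
        Byte = *P++;
        Value |= uint64_t(Byte & 0x7f) << Shift;
        Shift += 7;
      } while (Byte & 0x80);
      return Value;
    }

    int main() {
      // INDEX = 3 (ULEB128), then one packed byte:
      //   bits 0-3 type, bits 4-6 attribute, bit 7 address-is-delta flag.
      const std::vector<uint8_t> Buf = {0x03, 0x92}; // 0x92 = 0b1001'0010
      const uint8_t *P = Buf.data();
      uint64_t Index = readULEB128(P);
      uint8_t Packed = *P++;
      uint8_t Kind = Packed & 0xf;         // 2 -> direct call
      uint8_t Attr = (Packed & 0x70) >> 4; // 1 -> tail call
      bool IsDelta = (Packed & 0x80) != 0; // address encoded as a delta
      std::printf("index=%llu kind=%u attr=%u delta=%d\n",
                  (unsigned long long)Index, unsigned(Kind), unsigned(Attr),
                  int(IsDelta));
      return 0;
    }
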
diff --git a/llvm/lib/MC/MCSectionXCOFF.cpp b/llvm/lib/MC/MCSectionXCOFF.cpp
index 648efc14da06..7f7380bf810d 100644
--- a/llvm/lib/MC/MCSectionXCOFF.cpp
+++ b/llvm/lib/MC/MCSectionXCOFF.cpp
@@ -118,6 +118,10 @@ void MCSectionXCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
bool MCSectionXCOFF::UseCodeAlign() const { return getKind().isText(); }
bool MCSectionXCOFF::isVirtualSection() const {
- assert(isCsect() && "Only csect section can be virtual!");
+  // DWARF sections are never virtual.
+ if (isDwarfSect())
+ return false;
+ assert(isCsect() &&
+ "Handling for isVirtualSection not implemented for this section!");
return XCOFF::XTY_CM == CsectProp->Type;
}
diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp
index fc7fb555f0b9..f4e64b42c817 100644
--- a/llvm/lib/MC/MCStreamer.cpp
+++ b/llvm/lib/MC/MCStreamer.cpp
@@ -53,6 +53,8 @@ void MCTargetStreamer::emitLabel(MCSymbol *Symbol) {}
void MCTargetStreamer::finish() {}
+void MCTargetStreamer::emitConstantPools() {}
+
void MCTargetStreamer::changeSection(const MCSection *CurSection,
MCSection *Section,
const MCExpr *Subsection,
@@ -218,7 +220,7 @@ void MCStreamer::emitFill(uint64_t NumBytes, uint8_t FillValue) {
}
void llvm::MCStreamer::emitNops(int64_t NumBytes, int64_t ControlledNopLen,
- llvm::SMLoc) {}
+ llvm::SMLoc, const MCSubtargetInfo& STI) {}
/// The implementation in this class just redirects to emitFill.
void MCStreamer::emitZeros(uint64_t NumBytes) { emitFill(NumBytes, 0); }
@@ -397,7 +399,7 @@ void MCStreamer::emitEHSymAttributes(const MCSymbol *Symbol,
MCSymbol *EHSymbol) {
}
-void MCStreamer::InitSections(bool NoExecStack) {
+void MCStreamer::initSections(bool NoExecStack, const MCSubtargetInfo &STI) {
SwitchSection(getContext().getObjectFileInfo()->getTextSection());
}
@@ -1198,6 +1200,7 @@ void MCStreamer::emitValueToAlignment(unsigned ByteAlignment, int64_t Value,
unsigned ValueSize,
unsigned MaxBytesToEmit) {}
void MCStreamer::emitCodeAlignment(unsigned ByteAlignment,
+ const MCSubtargetInfo *STI,
unsigned MaxBytesToEmit) {}
void MCStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value,
SMLoc Loc) {}
diff --git a/llvm/lib/MC/MCWasmStreamer.cpp b/llvm/lib/MC/MCWasmStreamer.cpp
index e3d2439cef81..90249fb7380a 100644
--- a/llvm/lib/MC/MCWasmStreamer.cpp
+++ b/llvm/lib/MC/MCWasmStreamer.cpp
@@ -26,10 +26,10 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -49,6 +49,27 @@ void MCWasmStreamer::mergeFragment(MCDataFragment *DF, MCDataFragment *EF) {
DF->getContents().append(EF->getContents().begin(), EF->getContents().end());
}
+void MCWasmStreamer::emitLabel(MCSymbol *S, SMLoc Loc) {
+ auto *Symbol = cast<MCSymbolWasm>(S);
+ MCObjectStreamer::emitLabel(Symbol, Loc);
+
+ const MCSectionWasm &Section =
+ static_cast<const MCSectionWasm &>(*getCurrentSectionOnly());
+ if (Section.getSegmentFlags() & wasm::WASM_SEG_FLAG_TLS)
+ Symbol->setTLS();
+}
+
+void MCWasmStreamer::emitLabelAtPos(MCSymbol *S, SMLoc Loc, MCFragment *F,
+ uint64_t Offset) {
+ auto *Symbol = cast<MCSymbolWasm>(S);
+ MCObjectStreamer::emitLabelAtPos(Symbol, Loc, F, Offset);
+
+ const MCSectionWasm &Section =
+ static_cast<const MCSectionWasm &>(*getCurrentSectionOnly());
+ if (Section.getSegmentFlags() & wasm::WASM_SEG_FLAG_TLS)
+ Symbol->setTLS();
+}
+
void MCWasmStreamer::emitAssemblerFlag(MCAssemblerFlag Flag) {
// Let the target do whatever target specific stuff it needs to do.
getAssembler().getBackend().handleAssemblerFlag(Flag);
@@ -117,6 +138,10 @@ bool MCWasmStreamer::emitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
Symbol->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
break;
+ case MCSA_ELF_TypeTLS:
+ Symbol->setTLS();
+ break;
+
case MCSA_ELF_TypeObject:
case MCSA_Cold:
break;
@@ -156,6 +181,10 @@ void MCWasmStreamer::emitIdent(StringRef IdentString) {
void MCWasmStreamer::emitInstToFragment(const MCInst &Inst,
const MCSubtargetInfo &STI) {
this->MCObjectStreamer::emitInstToFragment(Inst, STI);
+ MCRelaxableFragment &F = *cast<MCRelaxableFragment>(getCurrentFragment());
+
+ for (auto &Fixup : F.getFixups())
+ fixSymbolsInTLSFixups(Fixup.getValue());
}
void MCWasmStreamer::emitInstToData(const MCInst &Inst,
@@ -166,6 +195,9 @@ void MCWasmStreamer::emitInstToData(const MCInst &Inst,
raw_svector_ostream VecOS(Code);
Assembler.getEmitter().encodeInstruction(Inst, VecOS, Fixups, STI);
+ for (auto &Fixup : Fixups)
+ fixSymbolsInTLSFixups(Fixup.getValue());
+
// Append the encoded instruction to the current data fragment (or create a
// new such fragment if the current fragment is not a data fragment).
MCDataFragment *DF = getOrCreateDataFragment();
@@ -185,16 +217,37 @@ void MCWasmStreamer::finishImpl() {
this->MCObjectStreamer::finishImpl();
}
-MCStreamer *llvm::createWasmStreamer(MCContext &Context,
- std::unique_ptr<MCAsmBackend> &&MAB,
- std::unique_ptr<MCObjectWriter> &&OW,
- std::unique_ptr<MCCodeEmitter> &&CE,
- bool RelaxAll) {
- MCWasmStreamer *S =
- new MCWasmStreamer(Context, std::move(MAB), std::move(OW), std::move(CE));
- if (RelaxAll)
- S->getAssembler().setRelaxAll(true);
- return S;
+void MCWasmStreamer::fixSymbolsInTLSFixups(const MCExpr *expr) {
+ switch (expr->getKind()) {
+ case MCExpr::Target:
+ case MCExpr::Constant:
+ break;
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr *be = cast<MCBinaryExpr>(expr);
+ fixSymbolsInTLSFixups(be->getLHS());
+ fixSymbolsInTLSFixups(be->getRHS());
+ break;
+ }
+
+ case MCExpr::SymbolRef: {
+ const MCSymbolRefExpr &symRef = *cast<MCSymbolRefExpr>(expr);
+ switch (symRef.getKind()) {
+ case MCSymbolRefExpr::VK_WASM_TLSREL:
+ case MCSymbolRefExpr::VK_WASM_GOT_TLS:
+ getAssembler().registerSymbol(symRef.getSymbol());
+ cast<MCSymbolWasm>(symRef.getSymbol()).setTLS();
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+
+ case MCExpr::Unary:
+ fixSymbolsInTLSFixups(cast<MCUnaryExpr>(expr)->getSubExpr());
+ break;
+ }
}
void MCWasmStreamer::emitThumbFunc(MCSymbol *Func) {
@@ -215,3 +268,15 @@ void MCWasmStreamer::emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment) {
llvm_unreachable("Wasm doesn't support this directive");
}
+
+MCStreamer *llvm::createWasmStreamer(MCContext &Context,
+ std::unique_ptr<MCAsmBackend> &&MAB,
+ std::unique_ptr<MCObjectWriter> &&OW,
+ std::unique_ptr<MCCodeEmitter> &&CE,
+ bool RelaxAll) {
+ MCWasmStreamer *S =
+ new MCWasmStreamer(Context, std::move(MAB), std::move(OW), std::move(CE));
+ if (RelaxAll)
+ S->getAssembler().setRelaxAll(true);
+ return S;
+}
diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp
index de1b0fd3c742..7773d8828931 100644
--- a/llvm/lib/MC/MCWin64EH.cpp
+++ b/llvm/lib/MC/MCWin64EH.cpp
@@ -144,8 +144,8 @@ static void EmitRuntimeFunction(MCStreamer &streamer,
MCContext &context = streamer.getContext();
streamer.emitValueToAlignment(4);
- EmitSymbolRefWithOfs(streamer, info->Function, info->Begin);
- EmitSymbolRefWithOfs(streamer, info->Function, info->End);
+ EmitSymbolRefWithOfs(streamer, info->Begin, info->Begin);
+ EmitSymbolRefWithOfs(streamer, info->Begin, info->End);
streamer.emitValue(MCSymbolRefExpr::create(info->Symbol,
MCSymbolRefExpr::VK_COFF_IMGREL32,
context), 4);
@@ -1073,7 +1073,7 @@ static void ARM64EmitRuntimeFunction(MCStreamer &streamer,
MCContext &context = streamer.getContext();
streamer.emitValueToAlignment(4);
- EmitSymbolRefWithOfs(streamer, info->Function, info->Begin);
+ EmitSymbolRefWithOfs(streamer, info->Begin, info->Begin);
if (info->PackedInfo)
streamer.emitInt32(info->PackedInfo);
else
diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp
index 69dc71b39fd1..0dfe5a5c2bdb 100644
--- a/llvm/lib/MC/MCWinCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp
@@ -66,18 +66,19 @@ void MCWinCOFFStreamer::emitInstToData(const MCInst &Inst,
DF->getContents().append(Code.begin(), Code.end());
}
-void MCWinCOFFStreamer::InitSections(bool NoExecStack) {
+void MCWinCOFFStreamer::initSections(bool NoExecStack,
+ const MCSubtargetInfo &STI) {
// FIXME: this is identical to the ELF one.
// This emulates the same behavior of GNU as. This makes it easier
// to compare the output as the major sections are in the same order.
SwitchSection(getContext().getObjectFileInfo()->getTextSection());
- emitCodeAlignment(4);
+ emitCodeAlignment(4, &STI);
SwitchSection(getContext().getObjectFileInfo()->getDataSection());
- emitCodeAlignment(4);
+ emitCodeAlignment(4, &STI);
SwitchSection(getContext().getObjectFileInfo()->getBSSSection());
- emitCodeAlignment(4);
+ emitCodeAlignment(4, &STI);
SwitchSection(getContext().getObjectFileInfo()->getTextSection());
}
diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp
index ec9e89fac416..90604782de13 100644
--- a/llvm/lib/MC/MCXCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCXCOFFStreamer.cpp
@@ -18,7 +18,7 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSectionXCOFF.h"
#include "llvm/MC/MCSymbolXCOFF.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp
index 10ae27c2acc2..277d88cf1cd2 100644
--- a/llvm/lib/MC/MachObjectWriter.cpp
+++ b/llvm/lib/MC/MachObjectWriter.cpp
@@ -965,7 +965,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm,
// Write the section relocation entries, in reverse order to match 'as'
// (approximately, the exact algorithm is more complicated than this).
std::vector<RelAndSymbol> &Relocs = Relocations[&Sec];
- for (const RelAndSymbol &Rel : make_range(Relocs.rbegin(), Relocs.rend())) {
+ for (const RelAndSymbol &Rel : llvm::reverse(Relocs)) {
W.write<uint32_t>(Rel.MRE.r_word0);
W.write<uint32_t>(Rel.MRE.r_word1);
}
diff --git a/llvm/lib/Support/TargetRegistry.cpp b/llvm/lib/MC/TargetRegistry.cpp
index 1f9c3bbf8229..0948a6b9f1a1 100644
--- a/llvm/lib/Support/TargetRegistry.cpp
+++ b/llvm/lib/MC/TargetRegistry.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp
index 0dc5c9111db2..636c1d238932 100644
--- a/llvm/lib/MC/WasmObjectWriter.cpp
+++ b/llvm/lib/MC/WasmObjectWriter.cpp
@@ -292,6 +292,8 @@ private:
W->OS << Str;
}
+ void writeStringWithAlignment(const StringRef Str, unsigned Alignment);
+
void writeI32(int32_t val) {
char Buffer[4];
support::endian::write32le(Buffer, val);
@@ -317,7 +319,7 @@ private:
uint32_t writeCodeSection(const MCAssembler &Asm, const MCAsmLayout &Layout,
ArrayRef<WasmFunction> Functions);
uint32_t writeDataSection(const MCAsmLayout &Layout);
- void writeTagSection(ArrayRef<wasm::WasmTagType> Tags);
+ void writeTagSection(ArrayRef<uint32_t> TagTypes);
void writeGlobalSection(ArrayRef<wasm::WasmGlobal> Globals);
void writeTableSection(ArrayRef<wasm::WasmTable> Tables);
void writeRelocSection(uint32_t SectionIndex, StringRef Name,
@@ -362,6 +364,28 @@ void WasmObjectWriter::startSection(SectionBookkeeping &Section,
Section.Index = SectionCount++;
}
+// Write a string, padded so that the data following it starts at the
+// requested alignment.
+// TODO: support alignment at asm and llvm level?
+void WasmObjectWriter::writeStringWithAlignment(const StringRef Str,
+ unsigned Alignment) {
+
+  // Calculate the encoded size of the string length and the padding needed to
+  // reach the requested alignment.
+ raw_null_ostream NullOS;
+ uint64_t StrSizeLength = encodeULEB128(Str.size(), NullOS);
+ uint64_t Offset = W->OS.tell() + StrSizeLength + Str.size();
+ uint64_t Paddings = offsetToAlignment(Offset, Align(Alignment));
+ Offset += Paddings;
+
+  // A LEB128 encoding longer than 5 bytes is invalid for a 32-bit size.
+ assert((StrSizeLength + Paddings) <= 5 && "too long string to align");
+
+ encodeSLEB128(Str.size(), W->OS, StrSizeLength + Paddings);
+ W->OS << Str;
+
+ assert(W->OS.tell() == Offset && "invalid padding");
+}
+
void WasmObjectWriter::startCustomSection(SectionBookkeeping &Section,
StringRef Name) {
LLVM_DEBUG(dbgs() << "startCustomSection " << Name << "\n");
@@ -371,7 +395,12 @@ void WasmObjectWriter::startCustomSection(SectionBookkeeping &Section,
Section.PayloadOffset = W->OS.tell();
// Custom sections in wasm also have a string identifier.
- writeString(Name);
+ if (Name != "__clangast") {
+ writeString(Name);
+ } else {
+    // The on-disk hashtable in the clangast section needs 4-byte alignment.
+ writeStringWithAlignment(Name, 4);
+ }
// The position where the custom section starts.
Section.ContentsOffset = W->OS.tell();
@@ -565,8 +594,14 @@ void WasmObjectWriter::recordRelocation(MCAssembler &Asm,
SymA->setUsedInReloc();
}
- if (RefA->getKind() == MCSymbolRefExpr::VK_GOT)
+ switch (RefA->getKind()) {
+ case MCSymbolRefExpr::VK_GOT:
+ case MCSymbolRefExpr::VK_WASM_GOT_TLS:
SymA->setUsedInGOT();
+ break;
+ default:
+ break;
+ }
WasmRelocationEntry Rec(FixupOffset, SymA, C, Type, &FixupSection);
LLVM_DEBUG(dbgs() << "WasmReloc: " << Rec << "\n");
@@ -825,8 +860,8 @@ void WasmObjectWriter::writeImportSection(ArrayRef<wasm::WasmImport> Imports,
encodeULEB128(NumElements, W->OS); // initial
break;
case wasm::WASM_EXTERNAL_TAG:
- W->OS << char(Import.Tag.Attribute);
- encodeULEB128(Import.Tag.SigIndex, W->OS);
+ W->OS << char(0); // Reserved 'attribute' field
+ encodeULEB128(Import.SigIndex, W->OS);
break;
default:
llvm_unreachable("unsupported import kind");
@@ -850,17 +885,17 @@ void WasmObjectWriter::writeFunctionSection(ArrayRef<WasmFunction> Functions) {
endSection(Section);
}
-void WasmObjectWriter::writeTagSection(ArrayRef<wasm::WasmTagType> Tags) {
- if (Tags.empty())
+void WasmObjectWriter::writeTagSection(ArrayRef<uint32_t> TagTypes) {
+ if (TagTypes.empty())
return;
SectionBookkeeping Section;
startSection(Section, wasm::WASM_SEC_TAG);
- encodeULEB128(Tags.size(), W->OS);
- for (const wasm::WasmTagType &Tag : Tags) {
- W->OS << char(Tag.Attribute);
- encodeULEB128(Tag.SigIndex, W->OS);
+ encodeULEB128(TagTypes.size(), W->OS);
+ for (uint32_t Index : TagTypes) {
+ W->OS << char(0); // Reserved 'attribute' field
+ encodeULEB128(Index, W->OS);
}
endSection(Section);
@@ -1052,7 +1087,7 @@ uint32_t WasmObjectWriter::writeDataSection(const MCAsmLayout &Layout) {
void WasmObjectWriter::writeRelocSection(
uint32_t SectionIndex, StringRef Name,
std::vector<WasmRelocationEntry> &Relocs) {
- // See: https://github.com/WebAssembly/tool-conventions/blob/master/Linking.md
+ // See: https://github.com/WebAssembly/tool-conventions/blob/main/Linking.md
// for descriptions of the reloc sections.
if (Relocs.empty())
@@ -1340,8 +1375,7 @@ void WasmObjectWriter::prepareImports(
Import.Module = WS.getImportModule();
Import.Field = WS.getImportName();
Import.Kind = wasm::WASM_EXTERNAL_TAG;
- Import.Tag.Attribute = wasm::WASM_TAG_ATTRIBUTE_EXCEPTION;
- Import.Tag.SigIndex = getTagType(WS);
+ Import.SigIndex = getTagType(WS);
Imports.push_back(Import);
assert(WasmIndices.count(&WS) == 0);
WasmIndices[&WS] = NumTagImports++;
@@ -1409,7 +1443,7 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm,
SmallVector<uint32_t, 4> TableElems;
SmallVector<wasm::WasmImport, 4> Imports;
SmallVector<wasm::WasmExport, 4> Exports;
- SmallVector<wasm::WasmTagType, 1> Tags;
+ SmallVector<uint32_t, 2> TagTypes;
SmallVector<wasm::WasmGlobal, 1> Globals;
SmallVector<wasm::WasmTable, 1> Tables;
SmallVector<wasm::WasmSymbolInfo, 4> SymbolInfos;
@@ -1644,16 +1678,15 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm,
LLVM_DEBUG(dbgs() << " -> table index: "
<< WasmIndices.find(&WS)->second << "\n");
} else if (WS.isTag()) {
- // C++ exception symbol (__cpp_exception)
+ // C++ exception symbol (__cpp_exception) or longjmp symbol
+ // (__c_longjmp)
unsigned Index;
if (WS.isDefined()) {
- Index = NumTagImports + Tags.size();
- wasm::WasmTagType Tag;
- Tag.SigIndex = getTagType(WS);
- Tag.Attribute = wasm::WASM_TAG_ATTRIBUTE_EXCEPTION;
+ Index = NumTagImports + TagTypes.size();
+ uint32_t SigIndex = getTagType(WS);
assert(WasmIndices.count(&WS) == 0);
WasmIndices[&WS] = Index;
- Tags.push_back(Tag);
+ TagTypes.push_back(SigIndex);
} else {
// An import; the index was assigned above.
assert(WasmIndices.count(&WS) > 0);
@@ -1747,6 +1780,8 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm,
Flags |= wasm::WASM_SYMBOL_EXPLICIT_NAME;
if (WS.hasExportName())
Flags |= wasm::WASM_SYMBOL_EXPORTED;
+ if (WS.isTLS())
+ Flags |= wasm::WASM_SYMBOL_TLS;
wasm::WasmSymbolInfo Info;
Info.Name = WS.getName();
@@ -1869,7 +1904,7 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm,
writeFunctionSection(Functions);
writeTableSection(Tables);
// Skip the "memory" section; we import the memory instead.
- writeTagSection(Tags);
+ writeTagSection(TagTypes);
writeGlobalSection(Globals);
writeExportSection(Exports);
const MCSymbol *IndirectFunctionTable =
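
To see why writeStringWithAlignment above folds the padding into the
LEB128-encoded string length, here is a small worked sketch. The stream offsets
are made up, and offsetToAlign merely mirrors what llvm::offsetToAlignment
computes for this illustration:

    #include <cstdint>
    #include <cstdio>
    #include <string>

    // Bytes needed to move Offset up to the next multiple of Align.
    static uint64_t offsetToAlign(uint64_t Offset, uint64_t Align) {
      return (Align - (Offset % Align)) % Align;
    }

    int main() {
      const std::string Name = "__clangast"; // 10 bytes; length fits in 1 byte
      const uint64_t StrSizeLength = 1;      // encoded size of the length field
      const uint64_t Tells[] = {0x24, 0x26}; // hypothetical stream positions
      for (uint64_t Tell : Tells) {
        uint64_t End = Tell + StrSizeLength + Name.size();
        uint64_t Pad = offsetToAlign(End, 4);
        // The writer realizes Pad by emitting the length as a LEB128 padded to
        // StrSizeLength + Pad bytes, so the section payload starts aligned.
        std::printf("tell=0x%llx -> pad=%llu, length field uses %llu byte(s)\n",
                    (unsigned long long)Tell, (unsigned long long)Pad,
                    (unsigned long long)(StrSizeLength + Pad));
      }
      return 0;
    }
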
diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp
index adf0d3eb443c..177253d7a9d7 100644
--- a/llvm/lib/MC/XCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/XCOFFObjectWriter.cpp
@@ -168,6 +168,24 @@ struct CsectSectionEntry : public SectionEntry {
virtual ~CsectSectionEntry() {}
};
+struct DwarfSectionEntry : public SectionEntry {
+ // For DWARF section entry.
+ std::unique_ptr<XCOFFSection> DwarfSect;
+
+ DwarfSectionEntry(StringRef N, int32_t Flags,
+ std::unique_ptr<XCOFFSection> Sect)
+ : SectionEntry(N, Flags | XCOFF::STYP_DWARF), DwarfSect(std::move(Sect)) {
+ assert(DwarfSect->MCSec->isDwarfSect() &&
+ "This should be a DWARF section!");
+ assert(N.size() <= XCOFF::NameSize && "section name too long");
+ memcpy(Name, N.data(), N.size());
+ }
+
+ DwarfSectionEntry(DwarfSectionEntry &&s) = default;
+
+ virtual ~DwarfSectionEntry() {}
+};
+
class XCOFFObjectWriter : public MCObjectWriter {
uint32_t SymbolTableEntryCount = 0;
@@ -213,6 +231,8 @@ class XCOFFObjectWriter : public MCObjectWriter {
std::array<CsectSectionEntry *const, 5> Sections{
{&Text, &Data, &BSS, &TData, &TBSS}};
+ std::vector<DwarfSectionEntry> DwarfSections;
+
CsectGroup &getCsectGroup(const MCSectionXCOFF *MCSec);
virtual void reset() override;
@@ -231,12 +251,21 @@ class XCOFFObjectWriter : public MCObjectWriter {
uint64_t);
void writeSymbolTableEntryForControlSection(const XCOFFSection &, int16_t,
XCOFF::StorageClass);
+ void writeSymbolTableEntryForDwarfSection(const XCOFFSection &, int16_t);
void writeFileHeader();
void writeSectionHeaderTable();
void writeSections(const MCAssembler &Asm, const MCAsmLayout &Layout);
+ void writeSectionForControlSectionEntry(const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const CsectSectionEntry &CsectEntry,
+ uint32_t &CurrentAddressLocation);
+ void writeSectionForDwarfSectionEntry(const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const DwarfSectionEntry &DwarfEntry,
+ uint32_t &CurrentAddressLocation);
void writeSymbolTable(const MCAsmLayout &Layout);
void writeRelocations();
- void writeRelocation(XCOFFRelocation Reloc, const XCOFFSection &CSection);
+ void writeRelocation(XCOFFRelocation Reloc, const XCOFFSection &Section);
// Called after all the csects and symbols have been processed by
// `executePostLayoutBinding`, this function handles building up the majority
@@ -290,6 +319,8 @@ void XCOFFObjectWriter::reset() {
// Reset any sections we have written to, and empty the section header table.
for (auto *Sec : Sections)
Sec->reset();
+ for (auto &DwarfSec : DwarfSections)
+ DwarfSec.reset();
// Reset states in XCOFFObjectWriter.
SymbolTableEntryCount = 0;
@@ -372,17 +403,32 @@ void XCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
const auto *MCSec = cast<const MCSectionXCOFF>(&S);
assert(SectionMap.find(MCSec) == SectionMap.end() &&
"Cannot add a section twice.");
- assert(XCOFF::XTY_ER != MCSec->getCSectType() &&
- "An undefined csect should not get registered.");
// If the name does not fit in the storage provided in the symbol table
// entry, add it to the string table.
if (nameShouldBeInStringTable(MCSec->getSymbolTableName()))
Strings.add(MCSec->getSymbolTableName());
-
- CsectGroup &Group = getCsectGroup(MCSec);
- Group.emplace_back(MCSec);
- SectionMap[MCSec] = &Group.back();
+ if (MCSec->isCsect()) {
+    // A new control section. Its CsectSectionEntry should already be statically
+ // generated as Text/Data/BSS/TDATA/TBSS. Add this section to the group of
+ // the CsectSectionEntry.
+ assert(XCOFF::XTY_ER != MCSec->getCSectType() &&
+ "An undefined csect should not get registered.");
+ CsectGroup &Group = getCsectGroup(MCSec);
+ Group.emplace_back(MCSec);
+ SectionMap[MCSec] = &Group.back();
+ } else if (MCSec->isDwarfSect()) {
+ // A new DwarfSectionEntry.
+ std::unique_ptr<XCOFFSection> DwarfSec =
+ std::make_unique<XCOFFSection>(MCSec);
+ SectionMap[MCSec] = DwarfSec.get();
+
+ DwarfSectionEntry SecEntry(MCSec->getName(),
+ MCSec->getDwarfSubtypeFlags().getValue(),
+ std::move(DwarfSec));
+ DwarfSections.push_back(std::move(SecEntry));
+ } else
+    llvm_unreachable("unsupported section type!");
}
for (const MCSymbol &S : Asm.symbols()) {
@@ -443,13 +489,20 @@ void XCOFFObjectWriter::recordRelocation(MCAssembler &Asm,
: SymbolIndexMap[ContainingCsect->getQualNameSymbol()];
};
- auto getVirtualAddress = [this,
- &Layout](const MCSymbol *Sym,
- const MCSectionXCOFF *ContainingCsect) {
- // If Sym is a csect, return csect's address.
- // If Sym is a label, return csect's address + label's offset from the csect.
- return SectionMap[ContainingCsect]->Address +
- (Sym->isDefined() ? Layout.getSymbolOffset(*Sym) : 0);
+ auto getVirtualAddress =
+ [this, &Layout](const MCSymbol *Sym,
+ const MCSectionXCOFF *ContainingSect) -> uint64_t {
+ // A DWARF section.
+ if (ContainingSect->isDwarfSect())
+ return Layout.getSymbolOffset(*Sym);
+
+ // A csect.
+ if (!Sym->isDefined())
+ return SectionMap[ContainingSect]->Address;
+
+ // A label.
+    assert(Sym->isDefined() && "not a valid object that has an address!");
+ return SectionMap[ContainingSect]->Address + Layout.getSymbolOffset(*Sym);
};
const MCSymbol *const SymA = &Target.getSymA()->getSymbol();
@@ -538,41 +591,12 @@ void XCOFFObjectWriter::recordRelocation(MCAssembler &Asm,
void XCOFFObjectWriter::writeSections(const MCAssembler &Asm,
const MCAsmLayout &Layout) {
uint32_t CurrentAddressLocation = 0;
- for (const auto *Section : Sections) {
- // Nothing to write for this Section.
- if (Section->Index == SectionEntry::UninitializedIndex ||
- Section->IsVirtual)
- continue;
-
- // There could be a gap (without corresponding zero padding) between
- // sections.
- assert(((CurrentAddressLocation <= Section->Address) ||
- (Section->Flags == XCOFF::STYP_TDATA) ||
- (Section->Flags == XCOFF::STYP_TBSS)) &&
- "CurrentAddressLocation should be less than or equal to section "
- "address if the section is not TData or TBSS.");
-
- CurrentAddressLocation = Section->Address;
-
- for (const auto *Group : Section->Groups) {
- for (const auto &Csect : *Group) {
- if (uint32_t PaddingSize = Csect.Address - CurrentAddressLocation)
- W.OS.write_zeros(PaddingSize);
- if (Csect.Size)
- Asm.writeSectionData(W.OS, Csect.MCSec, Layout);
- CurrentAddressLocation = Csect.Address + Csect.Size;
- }
- }
-
- // The size of the tail padding in a section is the end virtual address of
- // the current section minus the the end virtual address of the last csect
- // in that section.
- if (uint32_t PaddingSize =
- Section->Address + Section->Size - CurrentAddressLocation) {
- W.OS.write_zeros(PaddingSize);
- CurrentAddressLocation += PaddingSize;
- }
- }
+ for (const auto *Section : Sections)
+ writeSectionForControlSectionEntry(Asm, Layout, *Section,
+ CurrentAddressLocation);
+ for (const auto &DwarfSection : DwarfSections)
+ writeSectionForDwarfSectionEntry(Asm, Layout, DwarfSection,
+ CurrentAddressLocation);
}
uint64_t XCOFFObjectWriter::writeObject(MCAssembler &Asm,
@@ -654,6 +678,36 @@ void XCOFFObjectWriter::writeSymbolTableEntryForCsectMemberLabel(
W.write<uint16_t>(0);
}
+void XCOFFObjectWriter::writeSymbolTableEntryForDwarfSection(
+ const XCOFFSection &DwarfSectionRef, int16_t SectionIndex) {
+ assert(DwarfSectionRef.MCSec->isDwarfSect() && "Not a DWARF section!");
+
+ // n_name, n_zeros, n_offset
+ writeSymbolName(DwarfSectionRef.getSymbolTableName());
+ // n_value
+ W.write<uint32_t>(0);
+ // n_scnum
+ W.write<int16_t>(SectionIndex);
+ // n_type
+ W.write<uint16_t>(0);
+ // n_sclass
+ W.write<uint8_t>(XCOFF::C_DWARF);
+ // Always 1 aux entry for now.
+ W.write<uint8_t>(1);
+
+ // Now output the auxiliary entry.
+ // x_scnlen
+ W.write<uint32_t>(DwarfSectionRef.Size);
+ // Reserved
+ W.write<uint32_t>(0);
+ // x_nreloc. Set to 0 for now.
+ W.write<uint32_t>(0);
+ // Reserved
+ W.write<uint32_t>(0);
+ // Reserved
+ W.write<uint16_t>(0);
+}
+
void XCOFFObjectWriter::writeSymbolTableEntryForControlSection(
const XCOFFSection &CSectionRef, int16_t SectionIndex,
XCOFF::StorageClass StorageClass) {
@@ -711,10 +765,10 @@ void XCOFFObjectWriter::writeFileHeader() {
}
void XCOFFObjectWriter::writeSectionHeaderTable() {
- for (const auto *Sec : Sections) {
+ auto writeSectionHeader = [&](const SectionEntry *Sec, bool IsDwarf) {
// Nothing to write for this Section.
if (Sec->Index == SectionEntry::UninitializedIndex)
- continue;
+ return false;
// Write Name.
ArrayRef<char> NameRef(Sec->Name, XCOFF::NameSize);
@@ -722,8 +776,14 @@ void XCOFFObjectWriter::writeSectionHeaderTable() {
// Write the Physical Address and Virtual Address. In an object file these
// are the same.
- W.write<uint32_t>(Sec->Address);
- W.write<uint32_t>(Sec->Address);
+ // We use 0 for DWARF sections' Physical and Virtual Addresses.
+ if (!IsDwarf) {
+ W.write<uint32_t>(Sec->Address);
+ W.write<uint32_t>(Sec->Address);
+ } else {
+ W.write<uint32_t>(0);
+ W.write<uint32_t>(0);
+ }
W.write<uint32_t>(Sec->Size);
W.write<uint32_t>(Sec->FileOffsetToData);
@@ -738,12 +798,25 @@ void XCOFFObjectWriter::writeSectionHeaderTable() {
W.write<uint16_t>(0);
W.write<int32_t>(Sec->Flags);
- }
+
+ return true;
+ };
+
+ for (const auto *CsectSec : Sections)
+ writeSectionHeader(CsectSec, /* IsDwarf */ false);
+ for (const auto &DwarfSec : DwarfSections)
+ writeSectionHeader(&DwarfSec, /* IsDwarf */ true);
}
void XCOFFObjectWriter::writeRelocation(XCOFFRelocation Reloc,
- const XCOFFSection &CSection) {
- W.write<uint32_t>(CSection.Address + Reloc.FixupOffsetInCsect);
+ const XCOFFSection &Section) {
+ if (Section.MCSec->isCsect())
+ W.write<uint32_t>(Section.Address + Reloc.FixupOffsetInCsect);
+ else {
+ // DWARF sections' address is set to 0.
+    assert(Section.MCSec->isDwarfSect() && "unsupported section type!");
+ W.write<uint32_t>(Reloc.FixupOffsetInCsect);
+ }
W.write<uint32_t>(Reloc.SymbolTableIndex);
W.write<uint8_t>(Reloc.SignAndSize);
W.write<uint8_t>(Reloc.Type);
@@ -765,6 +838,10 @@ void XCOFFObjectWriter::writeRelocations() {
}
}
}
+
+ for (const auto &DwarfSection : DwarfSections)
+ for (const auto &Reloc : DwarfSection.DwarfSect->Relocations)
+ writeRelocation(Reloc, *DwarfSection.DwarfSect);
}
void XCOFFObjectWriter::writeSymbolTable(const MCAsmLayout &Layout) {
@@ -819,6 +896,10 @@ void XCOFFObjectWriter::writeSymbolTable(const MCAsmLayout &Layout) {
}
}
}
+
+ for (const auto &DwarfSection : DwarfSections)
+ writeSymbolTableEntryForDwarfSection(*DwarfSection.DwarfSect,
+ DwarfSection.Index);
}
void XCOFFObjectWriter::finalizeSectionInfo() {
@@ -844,11 +925,17 @@ void XCOFFObjectWriter::finalizeSectionInfo() {
}
}
+ for (auto &DwarfSection : DwarfSections)
+ DwarfSection.RelocationCount = DwarfSection.DwarfSect->Relocations.size();
+
// Calculate the file offset to the relocation entries.
uint64_t RawPointer = RelocationEntryOffset;
- for (auto Sec : Sections) {
- if (Sec->Index == SectionEntry::UninitializedIndex || !Sec->RelocationCount)
- continue;
+ auto calcOffsetToRelocations = [&](SectionEntry *Sec, bool IsDwarf) {
+ if (!IsDwarf && Sec->Index == SectionEntry::UninitializedIndex)
+ return false;
+
+ if (!Sec->RelocationCount)
+ return false;
Sec->FileOffsetToRelocations = RawPointer;
const uint32_t RelocationSizeInSec =
@@ -856,7 +943,15 @@ void XCOFFObjectWriter::finalizeSectionInfo() {
RawPointer += RelocationSizeInSec;
if (RawPointer > UINT32_MAX)
report_fatal_error("Relocation data overflowed this object file.");
- }
+
+ return true;
+ };
+
+ for (auto *Sec : Sections)
+ calcOffsetToRelocations(Sec, /* IsDwarf */ false);
+
+ for (auto &DwarfSec : DwarfSections)
+ calcOffsetToRelocations(&DwarfSec, /* IsDwarf */ true);
// TODO Error check that the number of symbol table entries fits in 32-bits
// signed ...
@@ -944,6 +1039,37 @@ void XCOFFObjectWriter::assignAddressesAndIndices(const MCAsmLayout &Layout) {
Section->Size = Address - Section->Address;
}
+ for (auto &DwarfSection : DwarfSections) {
+ assert((SectionIndex <= MaxSectionIndex) && "Section index overflow!");
+
+ XCOFFSection &DwarfSect = *DwarfSection.DwarfSect;
+ const MCSectionXCOFF *MCSec = DwarfSect.MCSec;
+
+ // Section index.
+ DwarfSection.Index = SectionIndex++;
+ SectionCount++;
+
+ // Symbol index.
+ DwarfSect.SymbolTableIndex = SymbolTableIndex;
+ SymbolIndexMap[MCSec->getQualNameSymbol()] = DwarfSect.SymbolTableIndex;
+    // 1 main and 1 auxiliary symbol table entry for the DWARF section.
+ SymbolTableIndex += 2;
+
+    // Section address. Align it to the section alignment.
+    // The section header uses 0 for DWARF sections' physical and virtual
+    // addresses; this internal address only records where the section data is
+    // placed in the final object.
+ // See writeSectionForDwarfSectionEntry().
+ DwarfSection.Address = DwarfSect.Address =
+ alignTo(Address, MCSec->getAlignment());
+
+ // Section size.
+    // For DWARF sections we must use the real size, which may not be aligned.
+ DwarfSection.Size = DwarfSect.Size = Layout.getSectionAddressSize(MCSec);
+
+    // Align Address to the default alignment for the following section.
+ Address = alignTo(DwarfSect.Address + DwarfSect.Size, DefaultSectionAlign);
+ }
+
SymbolTableEntryCount = SymbolTableIndex;
// Calculate the RawPointer value for each section.
@@ -959,9 +1085,102 @@ void XCOFFObjectWriter::assignAddressesAndIndices(const MCAsmLayout &Layout) {
report_fatal_error("Section raw data overflowed this object file.");
}
+ for (auto &DwarfSection : DwarfSections) {
+    // Addresses of csect sections are always aligned to DefaultSectionAlign,
+    // but a DWARF section's address is aligned to its own section alignment,
+    // which may be larger than DefaultSectionAlign, so account for the extra
+    // padding here.
+ RawPointer =
+ alignTo(RawPointer, DwarfSection.DwarfSect->MCSec->getAlignment());
+
+ DwarfSection.FileOffsetToData = RawPointer;
+    // Some sections, such as DWARF sections, have sizes that are not aligned,
+    // so RawPointer may not be aligned afterwards.
+ RawPointer += DwarfSection.Size;
+ // Make sure RawPointer is aligned.
+ RawPointer = alignTo(RawPointer, DefaultSectionAlign);
+
+ assert(RawPointer <= UINT32_MAX &&
+ "Section raw data overflowed this object file.");
+ }
+
RelocationEntryOffset = RawPointer;
}
+void XCOFFObjectWriter::writeSectionForControlSectionEntry(
+ const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const CsectSectionEntry &CsectEntry, uint32_t &CurrentAddressLocation) {
+ // Nothing to write for this Section.
+ if (CsectEntry.Index == SectionEntry::UninitializedIndex)
+ return;
+
+ // There could be a gap (without corresponding zero padding) between
+ // sections.
+ assert(((CurrentAddressLocation <= CsectEntry.Address) ||
+ (CsectEntry.Flags == XCOFF::STYP_TDATA) ||
+ (CsectEntry.Flags == XCOFF::STYP_TBSS)) &&
+ "CurrentAddressLocation should be less than or equal to section "
+ "address if the section is not TData or TBSS.");
+
+ CurrentAddressLocation = CsectEntry.Address;
+
+  // For virtual sections there is nothing to write, but CurrentAddressLocation
+  // still needs to advance so that later sections (e.g. DWARF sections) are
+  // written at the correct location.
+ if (CsectEntry.IsVirtual) {
+ CurrentAddressLocation += CsectEntry.Size;
+ return;
+ }
+
+ for (const auto &Group : CsectEntry.Groups) {
+ for (const auto &Csect : *Group) {
+ if (uint32_t PaddingSize = Csect.Address - CurrentAddressLocation)
+ W.OS.write_zeros(PaddingSize);
+ if (Csect.Size)
+ Asm.writeSectionData(W.OS, Csect.MCSec, Layout);
+ CurrentAddressLocation = Csect.Address + Csect.Size;
+ }
+ }
+
+ // The size of the tail padding in a section is the end virtual address of
+  // the current section minus the end virtual address of the last csect
+ // in that section.
+ if (uint32_t PaddingSize =
+ CsectEntry.Address + CsectEntry.Size - CurrentAddressLocation) {
+ W.OS.write_zeros(PaddingSize);
+ CurrentAddressLocation += PaddingSize;
+ }
+}
+
+void XCOFFObjectWriter::writeSectionForDwarfSectionEntry(
+ const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const DwarfSectionEntry &DwarfEntry, uint32_t &CurrentAddressLocation) {
+ // There could be a gap (without corresponding zero padding) between
+  // sections, for example when a DWARF section's alignment is bigger than
+  // DefaultSectionAlign.
+ assert(CurrentAddressLocation <= DwarfEntry.Address &&
+ "CurrentAddressLocation should be less than or equal to section "
+ "address.");
+
+ if (uint32_t PaddingSize = DwarfEntry.Address - CurrentAddressLocation)
+ W.OS.write_zeros(PaddingSize);
+
+ if (DwarfEntry.Size)
+ Asm.writeSectionData(W.OS, DwarfEntry.DwarfSect->MCSec, Layout);
+
+ CurrentAddressLocation = DwarfEntry.Address + DwarfEntry.Size;
+
+ // DWARF section size is not aligned to DefaultSectionAlign.
+ // Make sure CurrentAddressLocation is aligned to DefaultSectionAlign.
+ uint32_t Mod = CurrentAddressLocation % DefaultSectionAlign;
+ uint32_t TailPaddingSize = Mod ? DefaultSectionAlign - Mod : 0;
+ if (TailPaddingSize)
+ W.OS.write_zeros(TailPaddingSize);
+
+ CurrentAddressLocation += TailPaddingSize;
+}
+
// Takes the log base 2 of the alignment and shifts the result into the 5 most
// significant bits of a byte, then or's in the csect type into the least
// significant 3 bits.
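
The packing described in the comment above amounts to one shift and one OR. A
hedged, standalone sketch (the csect type value 1 is chosen only for
illustration; the real value comes from the XCOFF symbol type enum):

    #include <cstdint>
    #include <cstdio>

    // Pack log2(alignment) into the top 5 bits and the csect type into the
    // low 3 bits of a single byte.
    static uint8_t packAlignAndType(unsigned Log2Align, uint8_t CsectType) {
      return uint8_t((Log2Align << 3) | (CsectType & 0x7));
    }

    int main() {
      // A 16-byte aligned csect: log2(16) = 4.
      uint8_t B = packAlignAndType(4, 1);
      std::printf("packed byte = 0x%02x (align bits = %u, type bits = %u)\n",
                  unsigned(B), unsigned(B >> 3), unsigned(B & 0x7));
      return 0;
    }
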
diff --git a/llvm/lib/MCA/Context.cpp b/llvm/lib/MCA/Context.cpp
index 99d2373588ac..c21ec9e62dff 100644
--- a/llvm/lib/MCA/Context.cpp
+++ b/llvm/lib/MCA/Context.cpp
@@ -74,14 +74,17 @@ Context::createInOrderPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr,
CustomBehaviour &CB) {
const MCSchedModel &SM = STI.getSchedModel();
auto PRF = std::make_unique<RegisterFile>(SM, MRI, Opts.RegisterFileSize);
+ auto LSU = std::make_unique<LSUnit>(SM, Opts.LoadQueueSize,
+ Opts.StoreQueueSize, Opts.AssumeNoAlias);
// Create the pipeline stages.
auto Entry = std::make_unique<EntryStage>(SrcMgr);
- auto InOrderIssue = std::make_unique<InOrderIssueStage>(STI, *PRF, CB);
+ auto InOrderIssue = std::make_unique<InOrderIssueStage>(STI, *PRF, CB, *LSU);
auto StagePipeline = std::make_unique<Pipeline>();
// Pass the ownership of all the hardware units to this Context.
addHardwareUnit(std::move(PRF));
+ addHardwareUnit(std::move(LSU));
// Build the pipeline.
StagePipeline->appendStage(std::move(Entry));
diff --git a/llvm/lib/MCA/CustomBehaviour.cpp b/llvm/lib/MCA/CustomBehaviour.cpp
index 23211f402927..a9ea8edff059 100644
--- a/llvm/lib/MCA/CustomBehaviour.cpp
+++ b/llvm/lib/MCA/CustomBehaviour.cpp
@@ -24,5 +24,23 @@ unsigned CustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
return 0;
}
+std::vector<std::unique_ptr<View>>
+CustomBehaviour::getStartViews(llvm::MCInstPrinter &IP,
+ llvm::ArrayRef<llvm::MCInst> Insts) {
+ return std::vector<std::unique_ptr<View>>();
+}
+
+std::vector<std::unique_ptr<View>>
+CustomBehaviour::getPostInstrInfoViews(llvm::MCInstPrinter &IP,
+ llvm::ArrayRef<llvm::MCInst> Insts) {
+ return std::vector<std::unique_ptr<View>>();
+}
+
+std::vector<std::unique_ptr<View>>
+CustomBehaviour::getEndViews(llvm::MCInstPrinter &IP,
+ llvm::ArrayRef<llvm::MCInst> Insts) {
+ return std::vector<std::unique_ptr<View>>();
+}
+
} // namespace mca
} // namespace llvm
diff --git a/llvm/lib/MCA/HardwareUnits/RegisterFile.cpp b/llvm/lib/MCA/HardwareUnits/RegisterFile.cpp
index 81c4f682f63d..474bf84cf891 100644
--- a/llvm/lib/MCA/HardwareUnits/RegisterFile.cpp
+++ b/llvm/lib/MCA/HardwareUnits/RegisterFile.cpp
@@ -288,6 +288,19 @@ void RegisterFile::addRegisterWrite(WriteRef Write,
// If this move has been eliminated, then method tryEliminateMoveOrSwap should
// have already updated all the register mappings.
if (!IsEliminated) {
+ // Check if this is one of multiple writes performed by this
+ // instruction to register RegID.
+ const WriteRef &OtherWrite = RegisterMappings[RegID].first;
+ const WriteState *OtherWS = OtherWrite.getWriteState();
+ if (OtherWS && OtherWrite.getSourceIndex() == Write.getSourceIndex()) {
+ if (OtherWS->getLatency() > WS.getLatency()) {
+ // Conservatively keep the slowest write on RegID.
+ if (ShouldAllocatePhysRegs)
+ allocatePhysRegs(RegisterMappings[RegID].second, UsedPhysRegs);
+ return;
+ }
+ }
+
// Update the mapping for register RegID including its sub-registers.
RegisterMappings[RegID].first = Write;
RegisterMappings[RegID].second.AliasRegID = 0U;
diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp
index 4067d86930d1..0ab845a4c28f 100644
--- a/llvm/lib/MCA/InstrBuilder.cpp
+++ b/llvm/lib/MCA/InstrBuilder.cpp
@@ -687,7 +687,7 @@ InstrBuilder::createInstruction(const MCInst &MCI) {
if (IsDepBreaking) {
// A mask of all zeroes means: explicit input operands are not
// independent.
- if (Mask.isNullValue()) {
+ if (Mask.isZero()) {
if (!RD.isImplicitRead())
RS.setIndependentFromDef();
} else {
diff --git a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
index ccf6f20a6737..fa5c0fc66b9e 100644
--- a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
+++ b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/MCA/Stages/InOrderIssueStage.h"
+#include "llvm/MCA/HardwareUnits/LSUnit.h"
#include "llvm/MCA/HardwareUnits/RegisterFile.h"
#include "llvm/MCA/HardwareUnits/RetireControlUnit.h"
#include "llvm/MCA/Instruction.h"
@@ -43,9 +44,10 @@ void StallInfo::cycleEnd() {
}
InOrderIssueStage::InOrderIssueStage(const MCSubtargetInfo &STI,
- RegisterFile &PRF, CustomBehaviour &CB)
- : STI(STI), PRF(PRF), RM(STI.getSchedModel()), CB(CB), NumIssued(), SI(),
- CarryOver(), Bandwidth(), LastWriteBackCycle() {}
+ RegisterFile &PRF, CustomBehaviour &CB,
+ LSUnit &LSU)
+ : STI(STI), PRF(PRF), RM(STI.getSchedModel()), CB(CB), LSU(LSU),
+ NumIssued(), SI(), CarryOver(), Bandwidth(), LastWriteBackCycle() {}
unsigned InOrderIssueStage::getIssueWidth() const {
return STI.getSchedModel().IssueWidth;
@@ -125,6 +127,13 @@ bool InOrderIssueStage::canExecute(const InstRef &IR) {
return false;
}
+ if (IR.getInstruction()->isMemOp() && !LSU.isReady(IR)) {
+ // This load (store) aliases with a preceding store (load). Delay
+ // it until the depenency is cleared.
+ SI.update(IR, /* delay */ 1, StallInfo::StallKind::LOAD_STORE);
+ return false;
+ }
+
if (unsigned CustomStallCycles = CB.checkCustomHazard(IssuedInst, IR)) {
SI.update(IR, CustomStallCycles, StallInfo::StallKind::CUSTOM_STALL);
return false;
@@ -188,6 +197,10 @@ void InOrderIssueStage::notifyInstructionRetired(const InstRef &IR,
}
llvm::Error InOrderIssueStage::execute(InstRef &IR) {
+ Instruction &IS = *IR.getInstruction();
+ if (IS.isMemOp())
+ IS.setLSUTokenID(LSU.dispatch(IR));
+
if (llvm::Error E = tryIssue(IR))
return E;
@@ -222,6 +235,9 @@ llvm::Error InOrderIssueStage::tryIssue(InstRef &IR) {
RM.issueInstruction(Desc, UsedResources);
IS.execute(SourceIndex);
+ if (IS.isMemOp())
+ LSU.onInstructionIssued(IR);
+
// Replace resource masks with valid resource processor IDs.
for (ResourceUse &Use : UsedResources) {
uint64_t Mask = Use.first.first;
@@ -245,6 +261,7 @@ llvm::Error InOrderIssueStage::tryIssue(InstRef &IR) {
// the execution and retirement now.
if (IS.isExecuted()) {
PRF.onInstructionExecuted(&IS);
+ LSU.onInstructionExecuted(IR);
notifyEvent<HWInstructionEvent>(
HWInstructionEvent(HWInstructionEvent::Executed, IR));
LLVM_DEBUG(dbgs() << "[E] Instruction #" << IR << " is executed\n");
@@ -279,6 +296,7 @@ void InOrderIssueStage::updateIssuedInst() {
}
PRF.onInstructionExecuted(&IS);
+ LSU.onInstructionExecuted(IR);
notifyInstructionExecuted(IR);
++NumExecuted;
@@ -324,6 +342,9 @@ void InOrderIssueStage::retireInstruction(InstRef &IR) {
for (const WriteState &WS : IS.getDefs())
PRF.removeRegisterWrite(WS, FreedRegs);
+ if (IS.isMemOp())
+ LSU.onInstructionRetired(IR);
+
notifyInstructionRetired(IR, FreedRegs);
}
@@ -363,6 +384,7 @@ llvm::Error InOrderIssueStage::cycleStart() {
Bandwidth = getIssueWidth();
PRF.cycleStart();
+ LSU.cycleEvent();
// Release consumed resources.
SmallVector<ResourceRef, 4> Freed;
diff --git a/llvm/lib/MCA/Stages/InstructionTables.cpp b/llvm/lib/MCA/Stages/InstructionTables.cpp
index 93e368123066..a842b52dcd39 100644
--- a/llvm/lib/MCA/Stages/InstructionTables.cpp
+++ b/llvm/lib/MCA/Stages/InstructionTables.cpp
@@ -24,7 +24,7 @@ Error InstructionTables::execute(InstRef &IR) {
UsedResources.clear();
// Identify the resources consumed by this instruction.
- for (const std::pair<const uint64_t, ResourceUsage> Resource :
+ for (const std::pair<uint64_t, ResourceUsage> &Resource :
Desc.Resources) {
// Skip zero-cycle resources (i.e., unused resources).
if (!Resource.second.size())
diff --git a/llvm/tools/llvm-mca/Views/View.cpp b/llvm/lib/MCA/View.cpp
index 09d08d3ae007..a56d3a124934 100644
--- a/llvm/tools/llvm-mca/Views/View.cpp
+++ b/llvm/lib/MCA/View.cpp
@@ -11,7 +11,7 @@
///
//===----------------------------------------------------------------------===//
-#include "Views/View.h"
+#include "llvm/MCA/View.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSubtargetInfo.h"
diff --git a/llvm/lib/Object/Archive.cpp b/llvm/lib/Object/Archive.cpp
index 6ff896cf347e..5492692445e7 100644
--- a/llvm/lib/Object/Archive.cpp
+++ b/llvm/lib/Object/Archive.cpp
@@ -418,7 +418,7 @@ Expected<bool> Archive::Child::isThinMember() const {
if (!NameOrErr)
return NameOrErr.takeError();
StringRef Name = NameOrErr.get();
- return Parent->IsThin && Name != "/" && Name != "//";
+ return Parent->IsThin && Name != "/" && Name != "//" && Name != "/SYM64/";
}
Expected<std::string> Archive::Child::getFullName() const {
diff --git a/llvm/lib/Object/COFFModuleDefinition.cpp b/llvm/lib/Object/COFFModuleDefinition.cpp
index 8f29f7a658fd..55ddd3baca2b 100644
--- a/llvm/lib/Object/COFFModuleDefinition.cpp
+++ b/llvm/lib/Object/COFFModuleDefinition.cpp
@@ -80,11 +80,6 @@ static bool isDecorated(StringRef Sym, bool MingwDef) {
(!MingwDef && Sym.contains('@'));
}
-static Error createError(const Twine &Err) {
- return make_error<StringError>(StringRef(Err.str()),
- object_error::parse_failed);
-}
-
class Lexer {
public:
Lexer(StringRef S) : Buf(S) {}
diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp
index ca2ed4449120..84181ae5e501 100644
--- a/llvm/lib/Object/ELF.cpp
+++ b/llvm/lib/Object/ELF.cpp
@@ -246,6 +246,9 @@ StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) {
STRINGIFY_ENUM_CASE(ELF, SHT_MIPS_ABIFLAGS);
}
break;
+ case ELF::EM_MSP430:
+ switch (Type) { STRINGIFY_ENUM_CASE(ELF, SHT_MSP430_ATTRIBUTES); }
+ break;
case ELF::EM_RISCV:
switch (Type) { STRINGIFY_ENUM_CASE(ELF, SHT_RISCV_ATTRIBUTES); }
break;
@@ -333,40 +336,26 @@ ELFFile<ELFT>::decode_relrs(Elf_Relr_Range relrs) const {
std::vector<Elf_Rel> Relocs;
// Word type: uint32_t for Elf32, and uint64_t for Elf64.
- typedef typename ELFT::uint Word;
-
- // Word size in number of bytes.
- const size_t WordSize = sizeof(Word);
+ using Addr = typename ELFT::uint;
- // Number of bits used for the relocation offsets bitmap.
- // These many relative relocations can be encoded in a single entry.
- const size_t NBits = 8*WordSize - 1;
-
- Word Base = 0;
- for (const Elf_Relr &R : relrs) {
- Word Entry = R;
- if ((Entry&1) == 0) {
+ Addr Base = 0;
+ for (Elf_Relr R : relrs) {
+ typename ELFT::uint Entry = R;
+ if ((Entry & 1) == 0) {
// Even entry: encodes the offset for next relocation.
Rel.r_offset = Entry;
Relocs.push_back(Rel);
// Set base offset for subsequent bitmap entries.
- Base = Entry + WordSize;
- continue;
- }
-
- // Odd entry: encodes bitmap for relocations starting at base.
- Word Offset = Base;
- while (Entry != 0) {
- Entry >>= 1;
- if ((Entry&1) != 0) {
- Rel.r_offset = Offset;
- Relocs.push_back(Rel);
- }
- Offset += WordSize;
+ Base = Entry + sizeof(Addr);
+ } else {
+ // Odd entry: encodes bitmap for relocations starting at base.
+ for (Addr Offset = Base; (Entry >>= 1) != 0; Offset += sizeof(Addr))
+ if ((Entry & 1) != 0) {
+ Rel.r_offset = Offset;
+ Relocs.push_back(Rel);
+ }
+ Base += (CHAR_BIT * sizeof(Entry) - 1) * sizeof(Addr);
}
-
- // Advance base offset by NBits words.
- Base += NBits * WordSize;
}
return Relocs;
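
The rewritten decode_relrs keeps the usual SHT_RELR semantics: even entries are
explicit relocation offsets, odd entries are 63-bit bitmaps relative to a
running base. A small, self-contained trace of that logic with made-up input
(not the template code itself):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      const uint64_t Word = 8; // 64-bit target
      const std::vector<uint64_t> Relr = {0x1000, 0x5}; // 0x5 = 0b101
      std::vector<uint64_t> Offsets;
      uint64_t Base = 0;
      for (uint64_t Entry : Relr) {
        if ((Entry & 1) == 0) {
          Offsets.push_back(Entry); // even entry: explicit offset
          Base = Entry + Word;      // bitmap entries start one word later
        } else {
          // Odd entry: bit k (k >= 1) set means a relocation at
          // Base + (k - 1) * Word.
          for (uint64_t Offset = Base; (Entry >>= 1) != 0; Offset += Word)
            if (Entry & 1)
              Offsets.push_back(Offset);
          Base += 63 * Word; // each bitmap entry covers 63 words
        }
      }
      for (uint64_t O : Offsets)
        std::printf("reloc at 0x%llx\n", (unsigned long long)O);
      // Prints 0x1000 (from the even entry) and 0x1010 (bit 2 of 0b101).
      return 0;
    }
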
@@ -474,6 +463,14 @@ std::string ELFFile<ELFT>::getDynamicTagAsString(unsigned Arch,
}
break;
+ case ELF::EM_PPC:
+ switch (Type) {
+#define PPC_DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value)
+#include "llvm/BinaryFormat/DynamicTags.def"
+#undef PPC_DYNAMIC_TAG
+ }
+ break;
+
case ELF::EM_PPC64:
switch (Type) {
#define PPC64_DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value)
@@ -481,6 +478,14 @@ std::string ELFFile<ELFT>::getDynamicTagAsString(unsigned Arch,
#undef PPC64_DYNAMIC_TAG
}
break;
+
+ case ELF::EM_RISCV:
+ switch (Type) {
+#define RISCV_DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value)
+#include "llvm/BinaryFormat/DynamicTags.def"
+#undef RISCV_DYNAMIC_TAG
+ }
+ break;
}
#undef DYNAMIC_TAG
switch (Type) {
@@ -488,7 +493,9 @@ std::string ELFFile<ELFT>::getDynamicTagAsString(unsigned Arch,
#define AARCH64_DYNAMIC_TAG(name, value)
#define MIPS_DYNAMIC_TAG(name, value)
#define HEXAGON_DYNAMIC_TAG(name, value)
+#define PPC_DYNAMIC_TAG(name, value)
#define PPC64_DYNAMIC_TAG(name, value)
+#define RISCV_DYNAMIC_TAG(name, value)
// Also ignore marker tags such as DT_HIOS (maps to DT_VERNEEDNUM), etc.
#define DYNAMIC_TAG_MARKER(name, value)
#define DYNAMIC_TAG(name, value) case value: return #name;
@@ -497,7 +504,9 @@ std::string ELFFile<ELFT>::getDynamicTagAsString(unsigned Arch,
#undef AARCH64_DYNAMIC_TAG
#undef MIPS_DYNAMIC_TAG
#undef HEXAGON_DYNAMIC_TAG
+#undef PPC_DYNAMIC_TAG
#undef PPC64_DYNAMIC_TAG
+#undef RISCV_DYNAMIC_TAG
#undef DYNAMIC_TAG_MARKER
#undef DYNAMIC_STRINGIFY_ENUM
default:
@@ -613,14 +622,14 @@ ELFFile<ELFT>::toMappedAddr(uint64_t VAddr, WarningHandler WarnHandler) const {
}
template <class ELFT>
-Expected<std::vector<typename ELFT::BBAddrMap>>
+Expected<std::vector<BBAddrMap>>
ELFFile<ELFT>::decodeBBAddrMap(const Elf_Shdr &Sec) const {
Expected<ArrayRef<uint8_t>> ContentsOrErr = getSectionContents(Sec);
if (!ContentsOrErr)
return ContentsOrErr.takeError();
ArrayRef<uint8_t> Content = *ContentsOrErr;
DataExtractor Data(Content, isLE(), ELFT::Is64Bits ? 8 : 4);
- std::vector<Elf_BBAddrMap> FunctionEntries;
+ std::vector<BBAddrMap> FunctionEntries;
DataExtractor::Cursor Cur(0);
Error ULEBSizeErr = Error::success();
@@ -647,7 +656,7 @@ ELFFile<ELFT>::decodeBBAddrMap(const Elf_Shdr &Sec) const {
while (!ULEBSizeErr && Cur && Cur.tell() < Content.size()) {
uintX_t Address = static_cast<uintX_t>(Data.getAddress(Cur));
uint32_t NumBlocks = ReadULEB128AsUInt32();
- std::vector<typename Elf_BBAddrMap::BBEntry> BBEntries;
+ std::vector<BBAddrMap::BBEntry> BBEntries;
for (uint32_t BlockID = 0; !ULEBSizeErr && Cur && (BlockID < NumBlocks);
++BlockID) {
uint32_t Offset = ReadULEB128AsUInt32();
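decodeBBAddrMap now builds the target-independent BBAddrMap type; the entries themselves are ULEB128 values read through a DataExtractor cursor, so a single error check covers a run of reads. The same cursor idiom in isolation (a sketch, assuming the buffer holds two ULEB128 values):

    #include "llvm/Support/DataExtractor.h"
    #include "llvm/Support/Error.h"
    using namespace llvm;

    // Read two ULEB128 values; the cursor is checked once at the end.
    static bool readTwoULEBs(ArrayRef<uint8_t> Bytes, uint64_t &A, uint64_t &B) {
      DataExtractor Data(Bytes, /*IsLittleEndian=*/true, /*AddressSize=*/8);
      DataExtractor::Cursor Cur(0);
      A = Data.getULEB128(Cur);
      B = Data.getULEB128(Cur);
      if (!Cur) {
        consumeError(Cur.takeError());
        return false;
      }
      return true;
    }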
diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp
index 6613d79ab3d0..50035d6c7523 100644
--- a/llvm/lib/Object/ELFObjectFile.cpp
+++ b/llvm/lib/Object/ELFObjectFile.cpp
@@ -15,6 +15,7 @@
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Object/ELF.h"
#include "llvm/Object/ELFTypes.h"
#include "llvm/Object/Error.h"
@@ -25,7 +26,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/RISCVAttributeParser.h"
#include "llvm/Support/RISCVAttributes.h"
-#include "llvm/Support/TargetRegistry.h"
#include <algorithm>
#include <cstddef>
#include <cstdint>
@@ -538,9 +538,16 @@ void ELFObjectFileBase::setARMSubArch(Triple &TheTriple) const {
case ARMBuildAttrs::v6K:
Triple += "v6k";
break;
- case ARMBuildAttrs::v7:
- Triple += "v7";
+ case ARMBuildAttrs::v7: {
+ Optional<unsigned> ArchProfileAttr =
+ Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch_profile);
+ if (ArchProfileAttr.hasValue() &&
+ ArchProfileAttr.getValue() == ARMBuildAttrs::MicroControllerProfile)
+ Triple += "v7m";
+ else
+ Triple += "v7";
break;
+ }
case ARMBuildAttrs::v6_M:
Triple += "v6m";
break;
@@ -647,3 +654,72 @@ ELFObjectFileBase::getPltAddresses() const {
}
return Result;
}
+
+template <class ELFT>
+static Expected<std::vector<VersionEntry>>
+readDynsymVersionsImpl(const ELFFile<ELFT> &EF,
+ ELFObjectFileBase::elf_symbol_iterator_range Symbols) {
+ using Elf_Shdr = typename ELFT::Shdr;
+ const Elf_Shdr *VerSec = nullptr;
+ const Elf_Shdr *VerNeedSec = nullptr;
+ const Elf_Shdr *VerDefSec = nullptr;
+ // The user should ensure sections() can't fail here.
+ for (const Elf_Shdr &Sec : cantFail(EF.sections())) {
+ if (Sec.sh_type == ELF::SHT_GNU_versym)
+ VerSec = &Sec;
+ else if (Sec.sh_type == ELF::SHT_GNU_verdef)
+ VerDefSec = &Sec;
+ else if (Sec.sh_type == ELF::SHT_GNU_verneed)
+ VerNeedSec = &Sec;
+ }
+ if (!VerSec)
+ return std::vector<VersionEntry>();
+
+ Expected<SmallVector<Optional<VersionEntry>, 0>> MapOrErr =
+ EF.loadVersionMap(VerNeedSec, VerDefSec);
+ if (!MapOrErr)
+ return MapOrErr.takeError();
+
+ std::vector<VersionEntry> Ret;
+ size_t I = 0;
+ for (auto It = Symbols.begin(), E = Symbols.end(); It != E; ++It) {
+ ++I;
+ Expected<const typename ELFT::Versym *> VerEntryOrErr =
+ EF.template getEntry<typename ELFT::Versym>(*VerSec, I);
+ if (!VerEntryOrErr)
+ return createError("unable to read an entry with index " + Twine(I) +
+ " from " + describe(EF, *VerSec) + ": " +
+ toString(VerEntryOrErr.takeError()));
+
+ Expected<uint32_t> FlagsOrErr = It->getFlags();
+ if (!FlagsOrErr)
+ return createError("unable to read flags for symbol with index " +
+ Twine(I) + ": " + toString(FlagsOrErr.takeError()));
+
+ bool IsDefault;
+ Expected<StringRef> VerOrErr = EF.getSymbolVersionByIndex(
+ (*VerEntryOrErr)->vs_index, IsDefault, *MapOrErr,
+ (*FlagsOrErr) & SymbolRef::SF_Undefined);
+ if (!VerOrErr)
+ return createError("unable to get a version for entry " + Twine(I) +
+ " of " + describe(EF, *VerSec) + ": " +
+ toString(VerOrErr.takeError()));
+
+ Ret.push_back({(*VerOrErr).str(), IsDefault});
+ }
+
+ return Ret;
+}
+
+Expected<std::vector<VersionEntry>>
+ELFObjectFileBase::readDynsymVersions() const {
+ elf_symbol_iterator_range Symbols = getDynamicSymbolIterators();
+ if (const auto *Obj = dyn_cast<ELF32LEObjectFile>(this))
+ return readDynsymVersionsImpl(Obj->getELFFile(), Symbols);
+ if (const auto *Obj = dyn_cast<ELF32BEObjectFile>(this))
+ return readDynsymVersionsImpl(Obj->getELFFile(), Symbols);
+ if (const auto *Obj = dyn_cast<ELF64LEObjectFile>(this))
+ return readDynsymVersionsImpl(Obj->getELFFile(), Symbols);
+ return readDynsymVersionsImpl(cast<ELF64BEObjectFile>(this)->getELFFile(),
+ Symbols);
+}
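readDynsymVersions dispatches on the concrete ELF specialization and yields one VersionEntry per dynamic symbol. A hedged usage sketch (error handling trimmed to the minimum; assumes the caller already has an ELFObjectFileBase):

    #include "llvm/Object/ELFObjectFile.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;
    using namespace llvm::object;

    // Print the version string attached to each dynamic symbol, if any.
    static Error dumpDynsymVersions(const ELFObjectFileBase &Obj) {
      Expected<std::vector<VersionEntry>> VersOrErr = Obj.readDynsymVersions();
      if (!VersOrErr)
        return VersOrErr.takeError();
      for (const VersionEntry &V : *VersOrErr)
        outs() << V.Name << "\n";
      return Error::success();
    }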
diff --git a/llvm/lib/Object/IRObjectFile.cpp b/llvm/lib/Object/IRObjectFile.cpp
index befba5d57127..c653262791cc 100644
--- a/llvm/lib/Object/IRObjectFile.cpp
+++ b/llvm/lib/Object/IRObjectFile.cpp
@@ -18,9 +18,9 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace object;
diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp
index 746b00867157..093ae1bbc267 100644
--- a/llvm/lib/Object/IRSymtab.cpp
+++ b/llvm/lib/Object/IRSymtab.cpp
@@ -41,10 +41,15 @@
using namespace llvm;
using namespace irsymtab;
-static const char *LibcallRoutineNames[] = {
+static const char *PreservedSymbols[] = {
#define HANDLE_LIBCALL(code, name) name,
#include "llvm/IR/RuntimeLibcalls.def"
#undef HANDLE_LIBCALL
+ // These are global variables, so they are listed here instead of in
+ // RuntimeLibcalls.def.
+ // TODO: Are there other such variables?
+ "__ssp_canary_word",
+ "__stack_chk_guard",
};
namespace {
@@ -261,9 +266,9 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab,
setStr(Sym.IRName, GV->getName());
- bool IsBuiltinFunc = llvm::is_contained(LibcallRoutineNames, GV->getName());
+ bool IsPreservedSymbol = llvm::is_contained(PreservedSymbols, GV->getName());
- if (Used.count(GV) || IsBuiltinFunc)
+ if (Used.count(GV) || IsPreservedSymbol)
Sym.Flags |= 1 << storage::Symbol::FB_used;
if (GV->isThreadLocal())
Sym.Flags |= 1 << storage::Symbol::FB_tls;
@@ -283,11 +288,15 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab,
Uncommon().CommonAlign = GVar->getAlignment();
}
- const GlobalObject *Base = GV->getBaseObject();
- if (!Base)
- return make_error<StringError>("Unable to determine comdat of alias!",
- inconvertibleErrorCode());
- if (const Comdat *C = Base->getComdat()) {
+ const GlobalObject *GO = GV->getAliaseeObject();
+ if (!GO) {
+ if (isa<GlobalIFunc>(GV))
+ GO = cast<GlobalIFunc>(GV)->getResolverFunction();
+ if (!GO)
+ return make_error<StringError>("Unable to determine comdat of alias!",
+ inconvertibleErrorCode());
+ }
+ if (const Comdat *C = GO->getComdat()) {
Expected<int> ComdatIndexOrErr = getComdatIndex(C, GV->getParent());
if (!ComdatIndexOrErr)
return ComdatIndexOrErr.takeError();
@@ -312,8 +321,8 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab,
}
}
- if (!Base->getSection().empty())
- setStr(Uncommon().SectionName, Saver.save(Base->getSection()));
+ if (!GO->getSection().empty())
+ setStr(Uncommon().SectionName, Saver.save(GO->getSection()));
return Error::success();
}
diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp
index 177314a9a790..7501661591f0 100644
--- a/llvm/lib/Object/MachOObjectFile.cpp
+++ b/llvm/lib/Object/MachOObjectFile.cpp
@@ -246,8 +246,8 @@ static Error checkOverlappingElement(std::list<MachOElement> &Elements,
if (Size == 0)
return Error::success();
- for (auto it=Elements.begin() ; it != Elements.end(); ++it) {
- auto E = *it;
+ for (auto it = Elements.begin(); it != Elements.end(); ++it) {
+ const auto &E = *it;
if ((Offset >= E.Offset && Offset < E.Offset + E.Size) ||
(Offset + Size > E.Offset && Offset + Size < E.Offset + E.Size) ||
(Offset <= E.Offset && Offset + Size >= E.Offset + E.Size))
@@ -258,7 +258,7 @@ static Error checkOverlappingElement(std::list<MachOElement> &Elements,
auto nt = it;
nt++;
if (nt != Elements.end()) {
- auto N = *nt;
+ const auto &N = *nt;
if (Offset + Size <= N.Offset) {
Elements.insert(nt, {Offset, Size, Name});
return Error::success();
@@ -2048,6 +2048,46 @@ bool MachOObjectFile::isDebugSection(DataRefImpl Sec) const {
SectionName == "__swift_ast";
}
+namespace {
+template <typename LoadCommandType>
+ArrayRef<uint8_t> getSegmentContents(const MachOObjectFile &Obj,
+ MachOObjectFile::LoadCommandInfo LoadCmd,
+ StringRef SegmentName) {
+ auto SegmentOrErr = getStructOrErr<LoadCommandType>(Obj, LoadCmd.Ptr);
+ if (!SegmentOrErr) {
+ consumeError(SegmentOrErr.takeError());
+ return {};
+ }
+ auto &Segment = SegmentOrErr.get();
+ if (StringRef(Segment.segname, 16).startswith(SegmentName))
+ return arrayRefFromStringRef(Obj.getData().slice(
+ Segment.fileoff, Segment.fileoff + Segment.filesize));
+ return {};
+}
+} // namespace
+
+ArrayRef<uint8_t>
+MachOObjectFile::getSegmentContents(StringRef SegmentName) const {
+ for (auto LoadCmd : load_commands()) {
+ ArrayRef<uint8_t> Contents;
+ switch (LoadCmd.C.cmd) {
+ case MachO::LC_SEGMENT:
+ Contents = ::getSegmentContents<MachO::segment_command>(*this, LoadCmd,
+ SegmentName);
+ break;
+ case MachO::LC_SEGMENT_64:
+ Contents = ::getSegmentContents<MachO::segment_command_64>(*this, LoadCmd,
+ SegmentName);
+ break;
+ default:
+ continue;
+ }
+ if (!Contents.empty())
+ return Contents;
+ }
+ return {};
+}
+
unsigned MachOObjectFile::getSectionID(SectionRef Sec) const {
return Sec.getRawDataRefImpl().d.a;
}
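The getSegmentContents helper added above returns the raw bytes of the first segment whose name matches, or an empty ArrayRef when none does. A minimal usage sketch (the segment name is just an example):

    #include "llvm/Object/MachO.h"
    using namespace llvm;
    using namespace llvm::object;

    // Size of the raw __LINKEDIT payload, or 0 if the segment is absent/empty.
    static size_t linkEditSize(const MachOObjectFile &Obj) {
      ArrayRef<uint8_t> Bytes = Obj.getSegmentContents("__LINKEDIT");
      return Bytes.size();
    }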
diff --git a/llvm/lib/Object/ModuleSymbolTable.cpp b/llvm/lib/Object/ModuleSymbolTable.cpp
index 9a79de77af16..954d1f09f4e9 100644
--- a/llvm/lib/Object/ModuleSymbolTable.cpp
+++ b/llvm/lib/Object/ModuleSymbolTable.cpp
@@ -36,6 +36,7 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Object/SymbolicFile.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
@@ -43,7 +44,6 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
@@ -204,8 +204,9 @@ uint32_t ModuleSymbolTable::getSymbolFlags(Symbol S) const {
if (GVar->isConstant())
Res |= BasicSymbolRef::SF_Const;
}
- if (dyn_cast_or_null<Function>(GV->getBaseObject()))
- Res |= BasicSymbolRef::SF_Executable;
+ if (const GlobalObject *GO = GV->getAliaseeObject())
+ if (isa<Function>(GO) || isa<GlobalIFunc>(GO))
+ Res |= BasicSymbolRef::SF_Executable;
if (isa<GlobalAlias>(GV))
Res |= BasicSymbolRef::SF_Indirect;
if (GV->hasPrivateLinkage())
diff --git a/llvm/lib/Object/Object.cpp b/llvm/lib/Object/Object.cpp
index b486e9f5c9a8..0659cf6a2d41 100644
--- a/llvm/lib/Object/Object.cpp
+++ b/llvm/lib/Object/Object.cpp
@@ -222,8 +222,7 @@ void LLVMMoveToContainingSection(LLVMSectionIteratorRef Sect,
std::string Buf;
raw_string_ostream OS(Buf);
logAllUnhandledErrors(SecOrErr.takeError(), OS);
- OS.flush();
- report_fatal_error(Buf);
+ report_fatal_error(Twine(OS.str()));
}
*unwrap(Sect) = *SecOrErr;
}
@@ -304,8 +303,7 @@ const char *LLVMGetSymbolName(LLVMSymbolIteratorRef SI) {
std::string Buf;
raw_string_ostream OS(Buf);
logAllUnhandledErrors(Ret.takeError(), OS);
- OS.flush();
- report_fatal_error(Buf);
+ report_fatal_error(Twine(OS.str()));
}
return Ret->data();
}
@@ -316,8 +314,7 @@ uint64_t LLVMGetSymbolAddress(LLVMSymbolIteratorRef SI) {
std::string Buf;
raw_string_ostream OS(Buf);
logAllUnhandledErrors(Ret.takeError(), OS);
- OS.flush();
- report_fatal_error(Buf);
+ report_fatal_error(Twine(OS.str()));
}
return *Ret;
}
diff --git a/llvm/lib/Object/ObjectFile.cpp b/llvm/lib/Object/ObjectFile.cpp
index 5c894439ff67..6fd02f3b9592 100644
--- a/llvm/lib/Object/ObjectFile.cpp
+++ b/llvm/lib/Object/ObjectFile.cpp
@@ -55,14 +55,15 @@ bool SectionRef::containsSymbol(SymbolRef S) const {
}
Expected<uint64_t> ObjectFile::getSymbolValue(DataRefImpl Ref) const {
- if (Expected<uint32_t> FlagsOrErr = getSymbolFlags(Ref)) {
- if (*FlagsOrErr & SymbolRef::SF_Undefined)
- return 0;
- if (*FlagsOrErr & SymbolRef::SF_Common)
- return getCommonSymbolSize(Ref);
- } else
+ uint32_t Flags;
+ if (Error E = getSymbolFlags(Ref).moveInto(Flags))
// TODO: Test this error.
- return FlagsOrErr.takeError();
+ return std::move(E);
+
+ if (Flags & SymbolRef::SF_Undefined)
+ return 0;
+ if (Flags & SymbolRef::SF_Common)
+ return getCommonSymbolSize(Ref);
return getSymbolValueImpl(Ref);
}
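The rewrite above uses Expected<T>::moveInto to unwrap the flags or forward the error in a single statement, keeping the success path unindented. The same idiom in isolation (computeValue is a hypothetical producer):

    #include "llvm/Support/Error.h"
    #include <cstdint>
    using namespace llvm;

    Expected<uint32_t> computeValue(); // hypothetical, may fail

    static Expected<uint32_t> doubled() {
      uint32_t V;
      if (Error E = computeValue().moveInto(V))
        return std::move(E); // Forward the failure unchanged.
      return 2 * V;          // Success path stays at the top level.
    }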
diff --git a/llvm/lib/Object/RelocationResolver.cpp b/llvm/lib/Object/RelocationResolver.cpp
index ab98a2dd2ac1..00a45e2c5d4e 100644
--- a/llvm/lib/Object/RelocationResolver.cpp
+++ b/llvm/lib/Object/RelocationResolver.cpp
@@ -18,7 +18,7 @@ namespace object {
static int64_t getELFAddend(RelocationRef R) {
Expected<int64_t> AddendOrErr = ELFRelocationRef(R).getAddend();
handleAllErrors(AddendOrErr.takeError(), [](const ErrorInfoBase &EI) {
- report_fatal_error(EI.message());
+ report_fatal_error(Twine(EI.message()));
});
return *AddendOrErr;
}
diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp
index a08c648358c0..6a19b159f3d5 100644
--- a/llvm/lib/Object/WasmObjectFile.cpp
+++ b/llvm/lib/Object/WasmObjectFile.cpp
@@ -286,9 +286,9 @@ WasmObjectFile::WasmObjectFile(MemoryBufferRef Buffer, Error &Err)
return;
}
- WasmSection Sec;
WasmSectionOrderChecker Checker;
while (Ctx.Ptr < Ctx.End) {
+ WasmSection Sec;
if ((Err = readSection(Sec, Ctx, Checker)))
return;
if ((Err = parseSection(Sec)))
@@ -339,7 +339,8 @@ Error WasmObjectFile::parseSection(WasmSection &Sec) {
}
Error WasmObjectFile::parseDylinkSection(ReadContext &Ctx) {
- // See https://github.com/WebAssembly/tool-conventions/blob/master/DynamicLinking.md
+ // Legacy "dylink" section support.
+ // See parseDylink0Section for the current "dylink.0" section parsing.
HasDylinkSection = true;
DylinkInfo.MemorySize = readVaruint32(Ctx);
DylinkInfo.MemoryAlignment = readVaruint32(Ctx);
@@ -349,17 +350,77 @@ Error WasmObjectFile::parseDylinkSection(ReadContext &Ctx) {
while (Count--) {
DylinkInfo.Needed.push_back(readString(Ctx));
}
+
if (Ctx.Ptr != Ctx.End)
return make_error<GenericBinaryError>("dylink section ended prematurely",
object_error::parse_failed);
return Error::success();
}
+Error WasmObjectFile::parseDylink0Section(ReadContext &Ctx) {
+ // See
+ // https://github.com/WebAssembly/tool-conventions/blob/main/DynamicLinking.md
+ HasDylinkSection = true;
+
+ const uint8_t *OrigEnd = Ctx.End;
+ while (Ctx.Ptr < OrigEnd) {
+ Ctx.End = OrigEnd;
+ uint8_t Type = readUint8(Ctx);
+ uint32_t Size = readVaruint32(Ctx);
+ LLVM_DEBUG(dbgs() << "readSubsection type=" << int(Type) << " size=" << Size
+ << "\n");
+ Ctx.End = Ctx.Ptr + Size;
+ uint32_t Count;
+ switch (Type) {
+ case wasm::WASM_DYLINK_MEM_INFO:
+ DylinkInfo.MemorySize = readVaruint32(Ctx);
+ DylinkInfo.MemoryAlignment = readVaruint32(Ctx);
+ DylinkInfo.TableSize = readVaruint32(Ctx);
+ DylinkInfo.TableAlignment = readVaruint32(Ctx);
+ break;
+ case wasm::WASM_DYLINK_NEEDED:
+ Count = readVaruint32(Ctx);
+ while (Count--) {
+ DylinkInfo.Needed.push_back(readString(Ctx));
+ }
+ break;
+ case wasm::WASM_DYLINK_EXPORT_INFO: {
+ uint32_t Count = readVaruint32(Ctx);
+ while (Count--) {
+ DylinkInfo.ExportInfo.push_back({readString(Ctx), readVaruint32(Ctx)});
+ }
+ break;
+ }
+ case wasm::WASM_DYLINK_IMPORT_INFO: {
+ uint32_t Count = readVaruint32(Ctx);
+ while (Count--) {
+ DylinkInfo.ImportInfo.push_back(
+ {readString(Ctx), readString(Ctx), readVaruint32(Ctx)});
+ }
+ break;
+ }
+ default:
+ LLVM_DEBUG(dbgs() << "unknown dylink.0 sub-section: " << int(Type) << "\n");
+ Ctx.Ptr += Size;
+ break;
+ }
+ if (Ctx.Ptr != Ctx.End) {
+ return make_error<GenericBinaryError>(
+ "dylink.0 sub-section ended prematurely", object_error::parse_failed);
+ }
+ }
+
+ if (Ctx.Ptr != Ctx.End)
+ return make_error<GenericBinaryError>("dylink.0 section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
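Each dylink.0 sub-section parsed above is framed as a type byte followed by a ULEB128 payload size, which is what lets unknown types be skipped with Ctx.Ptr += Size. A small writer-side sketch of that framing (illustrative, not LLVM's emitter):

    #include <cstdint>
    #include <vector>

    // Append an unsigned LEB128 value.
    static void writeULEB(std::vector<uint8_t> &Out, uint64_t V) {
      do {
        uint8_t Byte = V & 0x7f;
        V >>= 7;
        Out.push_back(Byte | (V ? 0x80 : 0));
      } while (V);
    }

    // Emit one dylink.0 sub-section: type byte, payload size, payload bytes.
    static void writeSubsection(std::vector<uint8_t> &Out, uint8_t Type,
                                const std::vector<uint8_t> &Payload) {
      Out.push_back(Type);
      writeULEB(Out, Payload.size());
      Out.insert(Out.end(), Payload.begin(), Payload.end());
    }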
Error WasmObjectFile::parseNameSection(ReadContext &Ctx) {
llvm::DenseSet<uint64_t> SeenFunctions;
llvm::DenseSet<uint64_t> SeenGlobals;
llvm::DenseSet<uint64_t> SeenSegments;
- if (FunctionTypes.size() && !SeenCodeSection) {
+ if (Functions.size() && !SeenCodeSection) {
return make_error<GenericBinaryError>("names must come after code section",
object_error::parse_failed);
}
@@ -427,7 +488,7 @@ Error WasmObjectFile::parseNameSection(ReadContext &Ctx) {
Error WasmObjectFile::parseLinkingSection(ReadContext &Ctx) {
HasLinkingSection = true;
- if (FunctionTypes.size() && !SeenCodeSection) {
+ if (Functions.size() && !SeenCodeSection) {
return make_error<GenericBinaryError>(
"linking data must come after code section",
object_error::parse_failed);
@@ -529,7 +590,6 @@ Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) {
const wasm::WasmSignature *Signature = nullptr;
const wasm::WasmGlobalType *GlobalType = nullptr;
const wasm::WasmTableType *TableType = nullptr;
- const wasm::WasmTagType *TagType = nullptr;
Info.Kind = readUint8(Ctx);
Info.Flags = readVaruint32(Ctx);
@@ -545,8 +605,8 @@ Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) {
if (IsDefined) {
Info.Name = readString(Ctx);
unsigned FuncIndex = Info.ElementIndex - NumImportedFunctions;
- Signature = &Signatures[FunctionTypes[FuncIndex]];
wasm::WasmFunction &Function = Functions[FuncIndex];
+ Signature = &Signatures[Function.SigIndex];
if (Function.SymbolName.empty())
Function.SymbolName = Info.Name;
} else {
@@ -674,8 +734,7 @@ Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) {
Info.Name = readString(Ctx);
unsigned TagIndex = Info.ElementIndex - NumImportedTags;
wasm::WasmTag &Tag = Tags[TagIndex];
- Signature = &Signatures[Tag.Type.SigIndex];
- TagType = &Tag.Type;
+ Signature = &Signatures[Tag.SigIndex];
if (Tag.SymbolName.empty())
Tag.SymbolName = Info.Name;
@@ -687,8 +746,7 @@ Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) {
} else {
Info.Name = Import.Field;
}
- TagType = &Import.Tag;
- Signature = &Signatures[TagType->SigIndex];
+ Signature = &Signatures[Import.SigIndex];
if (!Import.Module.empty()) {
Info.ImportModule = Import.Module;
}
@@ -710,7 +768,7 @@ Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) {
object_error::parse_failed);
LinkingData.SymbolTable.emplace_back(Info);
Symbols.emplace_back(LinkingData.SymbolTable.back(), GlobalType, TableType,
- TagType, Signature);
+ Signature);
LLVM_DEBUG(dbgs() << "Adding symbol: " << Symbols.back() << "\n");
}
@@ -984,6 +1042,9 @@ Error WasmObjectFile::parseCustomSection(WasmSection &Sec, ReadContext &Ctx) {
if (Sec.Name == "dylink") {
if (Error Err = parseDylinkSection(Ctx))
return Err;
+ } else if (Sec.Name == "dylink.0") {
+ if (Error Err = parseDylink0Section(Ctx))
+ return Err;
} else if (Sec.Name == "name") {
if (Error Err = parseNameSection(Ctx))
return Err;
@@ -1034,6 +1095,7 @@ Error WasmObjectFile::parseTypeSection(ReadContext &Ctx) {
Error WasmObjectFile::parseImportSection(ReadContext &Ctx) {
uint32_t Count = readVaruint32(Ctx);
+ uint32_t NumTypes = Signatures.size();
Imports.reserve(Count);
for (uint32_t I = 0; I < Count; I++) {
wasm::WasmImport Im;
@@ -1044,6 +1106,9 @@ Error WasmObjectFile::parseImportSection(ReadContext &Ctx) {
case wasm::WASM_EXTERNAL_FUNCTION:
NumImportedFunctions++;
Im.SigIndex = readVaruint32(Ctx);
+ if (Im.SigIndex >= NumTypes)
+ return make_error<GenericBinaryError>("invalid function type",
+ object_error::parse_failed);
break;
case wasm::WASM_EXTERNAL_GLOBAL:
NumImportedGlobals++;
@@ -1067,8 +1132,13 @@ Error WasmObjectFile::parseImportSection(ReadContext &Ctx) {
}
case wasm::WASM_EXTERNAL_TAG:
NumImportedTags++;
- Im.Tag.Attribute = readUint8(Ctx);
- Im.Tag.SigIndex = readVarint32(Ctx);
+ if (readUint8(Ctx) != 0) // Reserved 'attribute' field
+ return make_error<GenericBinaryError>("invalid attribute",
+ object_error::parse_failed);
+ Im.SigIndex = readVaruint32(Ctx);
+ if (Im.SigIndex >= NumTypes)
+ return make_error<GenericBinaryError>("invalid tag type",
+ object_error::parse_failed);
break;
default:
return make_error<GenericBinaryError>("unexpected import kind",
@@ -1084,15 +1154,16 @@ Error WasmObjectFile::parseImportSection(ReadContext &Ctx) {
Error WasmObjectFile::parseFunctionSection(ReadContext &Ctx) {
uint32_t Count = readVaruint32(Ctx);
- FunctionTypes.reserve(Count);
- Functions.resize(Count);
+ Functions.reserve(Count);
uint32_t NumTypes = Signatures.size();
while (Count--) {
uint32_t Type = readVaruint32(Ctx);
if (Type >= NumTypes)
return make_error<GenericBinaryError>("invalid function type",
object_error::parse_failed);
- FunctionTypes.push_back(Type);
+ wasm::WasmFunction F;
+ F.SigIndex = Type;
+ Functions.push_back(F);
}
if (Ctx.Ptr != Ctx.End)
return make_error<GenericBinaryError>("function section ended prematurely",
@@ -1141,11 +1212,18 @@ Error WasmObjectFile::parseTagSection(ReadContext &Ctx) {
TagSection = Sections.size();
uint32_t Count = readVaruint32(Ctx);
Tags.reserve(Count);
+ uint32_t NumTypes = Signatures.size();
while (Count--) {
+ if (readUint8(Ctx) != 0) // Reserved 'attribute' field
+ return make_error<GenericBinaryError>("invalid attribute",
+ object_error::parse_failed);
+ uint32_t Type = readVaruint32(Ctx);
+ if (Type >= NumTypes)
+ return make_error<GenericBinaryError>("invalid tag type",
+ object_error::parse_failed);
wasm::WasmTag Tag;
Tag.Index = NumImportedTags + Tags.size();
- Tag.Type.Attribute = readUint8(Ctx);
- Tag.Type.SigIndex = readVaruint32(Ctx);
+ Tag.SigIndex = Type;
Tags.push_back(Tag);
}
@@ -1216,7 +1294,7 @@ Error WasmObjectFile::parseExportSection(ReadContext &Ctx) {
}
bool WasmObjectFile::isValidFunctionIndex(uint32_t Index) const {
- return Index < NumImportedFunctions + FunctionTypes.size();
+ return Index < NumImportedFunctions + Functions.size();
}
bool WasmObjectFile::isDefinedFunctionIndex(uint32_t Index) const {
@@ -1304,7 +1382,7 @@ Error WasmObjectFile::parseCodeSection(ReadContext &Ctx) {
SeenCodeSection = true;
CodeSection = Sections.size();
uint32_t FunctionCount = readVaruint32(Ctx);
- if (FunctionCount != FunctionTypes.size()) {
+ if (FunctionCount != Functions.size()) {
return make_error<GenericBinaryError>("invalid function count",
object_error::parse_failed);
}
@@ -1793,6 +1871,7 @@ int WasmSectionOrderChecker::getSectionOrder(unsigned ID,
case wasm::WASM_SEC_CUSTOM:
return StringSwitch<unsigned>(CustomSectionName)
.Case("dylink", WASM_SEC_ORDER_DYLINK)
+ .Case("dylink.0", WASM_SEC_ORDER_DYLINK)
.Case("linking", WASM_SEC_ORDER_LINKING)
.StartsWith("reloc.", WASM_SEC_ORDER_RELOC)
.Case("name", WASM_SEC_ORDER_NAME)
diff --git a/llvm/lib/Object/XCOFFObjectFile.cpp b/llvm/lib/Object/XCOFFObjectFile.cpp
index 53447d0c97b2..9b0a5efacba7 100644
--- a/llvm/lib/Object/XCOFFObjectFile.cpp
+++ b/llvm/lib/Object/XCOFFObjectFile.cpp
@@ -69,15 +69,18 @@ bool XCOFFSectionHeader<T>::isReservedSectionType() const {
return getSectionType() & SectionFlagsReservedMask;
}
-bool XCOFFRelocation32::isRelocationSigned() const {
+template <typename AddressType>
+bool XCOFFRelocation<AddressType>::isRelocationSigned() const {
return Info & XR_SIGN_INDICATOR_MASK;
}
-bool XCOFFRelocation32::isFixupIndicated() const {
+template <typename AddressType>
+bool XCOFFRelocation<AddressType>::isFixupIndicated() const {
return Info & XR_FIXUP_INDICATOR_MASK;
}
-uint8_t XCOFFRelocation32::getRelocatedLength() const {
+template <typename AddressType>
+uint8_t XCOFFRelocation<AddressType>::getRelocatedLength() const {
// The relocation encodes the bit length being relocated minus 1. Add back
// the 1 to get the actual length being relocated.
return (Info & XR_BIASED_LENGTH_MASK) + 1;
@@ -146,6 +149,20 @@ const XCOFFFileHeader64 *XCOFFObjectFile::fileHeader64() const {
return static_cast<const XCOFFFileHeader64 *>(FileHeader);
}
+const XCOFFAuxiliaryHeader32 *XCOFFObjectFile::auxiliaryHeader32() const {
+ assert(!is64Bit() && "32-bit interface called on 64-bit object file.");
+ return static_cast<const XCOFFAuxiliaryHeader32 *>(AuxiliaryHeader);
+}
+
+const XCOFFAuxiliaryHeader64 *XCOFFObjectFile::auxiliaryHeader64() const {
+ assert(is64Bit() && "64-bit interface called on a 32-bit object file.");
+ return static_cast<const XCOFFAuxiliaryHeader64 *>(AuxiliaryHeader);
+}
+
+template <typename T> const T *XCOFFObjectFile::sectionHeaderTable() const {
+ return static_cast<const T *>(SectionHeaderTable);
+}
+
const XCOFFSectionHeader32 *
XCOFFObjectFile::sectionHeaderTable32() const {
assert(!is64Bit() && "32-bit interface called on 64-bit object file.");
@@ -183,12 +200,16 @@ XCOFFObjectFile::getStringTableEntry(uint32_t Offset) const {
if (StringTable.Data != nullptr && StringTable.Size > Offset)
return (StringTable.Data + Offset);
- return make_error<GenericBinaryError>("Bad offset for string table entry",
- object_error::parse_failed);
+ return createError("entry with offset 0x" + Twine::utohexstr(Offset) +
+ " in a string table with size 0x" +
+ Twine::utohexstr(StringTable.Size) + " is invalid");
}
StringRef XCOFFObjectFile::getStringTable() const {
- return StringRef(StringTable.Data, StringTable.Size);
+ // If the size is less than or equal to 4, then the string table contains no
+ // string data.
+ return StringRef(StringTable.Data,
+ StringTable.Size <= 4 ? 0 : StringTable.Size);
}
Expected<StringRef>
@@ -210,15 +231,85 @@ uint64_t XCOFFObjectFile::getSymbolValueImpl(DataRefImpl Symb) const {
return toSymbolRef(Symb).getValue();
}
+uint32_t XCOFFObjectFile::getSymbolAlignment(DataRefImpl Symb) const {
+ uint64_t Result = 0;
+ XCOFFSymbolRef XCOFFSym = toSymbolRef(Symb);
+ if (XCOFFSym.isCsectSymbol()) {
+ Expected<XCOFFCsectAuxRef> CsectAuxRefOrError =
+ XCOFFSym.getXCOFFCsectAuxRef();
+ if (!CsectAuxRefOrError)
+ // TODO: report the error up the stack.
+ consumeError(CsectAuxRefOrError.takeError());
+ else
+ Result = 1ULL << CsectAuxRefOrError.get().getAlignmentLog2();
+ }
+ return Result;
+}
+
uint64_t XCOFFObjectFile::getCommonSymbolSizeImpl(DataRefImpl Symb) const {
uint64_t Result = 0;
- llvm_unreachable("Not yet implemented!");
+ XCOFFSymbolRef XCOFFSym = toSymbolRef(Symb);
+ if (XCOFFSym.isCsectSymbol()) {
+ Expected<XCOFFCsectAuxRef> CsectAuxRefOrError =
+ XCOFFSym.getXCOFFCsectAuxRef();
+ if (!CsectAuxRefOrError)
+ // TODO: report the error up the stack.
+ consumeError(CsectAuxRefOrError.takeError());
+ else {
+ XCOFFCsectAuxRef CsectAuxRef = CsectAuxRefOrError.get();
+ assert(CsectAuxRef.getSymbolType() == XCOFF::XTY_CM);
+ Result = CsectAuxRef.getSectionOrLength();
+ }
+ }
return Result;
}
Expected<SymbolRef::Type>
XCOFFObjectFile::getSymbolType(DataRefImpl Symb) const {
- // TODO: Return the correct symbol type.
+ XCOFFSymbolRef XCOFFSym = toSymbolRef(Symb);
+
+ if (XCOFFSym.isFunction())
+ return SymbolRef::ST_Function;
+
+ if (XCOFF::C_FILE == XCOFFSym.getStorageClass())
+ return SymbolRef::ST_File;
+
+ int16_t SecNum = XCOFFSym.getSectionNumber();
+ if (SecNum <= 0)
+ return SymbolRef::ST_Other;
+
+ Expected<DataRefImpl> SecDRIOrErr =
+ getSectionByNum(XCOFFSym.getSectionNumber());
+
+ if (!SecDRIOrErr)
+ return SecDRIOrErr.takeError();
+
+ DataRefImpl SecDRI = SecDRIOrErr.get();
+
+ Expected<StringRef> SymNameOrError = XCOFFSym.getName();
+ if (SymNameOrError) {
+ // The "TOC" symbol is treated as SymbolRef::ST_Other.
+ if (SymNameOrError.get() == "TOC")
+ return SymbolRef::ST_Other;
+
+ // The symbol for a section name is treated as SymbolRef::ST_Other.
+ StringRef SecName;
+ if (is64Bit())
+ SecName = XCOFFObjectFile::toSection64(SecDRIOrErr.get())->getName();
+ else
+ SecName = XCOFFObjectFile::toSection32(SecDRIOrErr.get())->getName();
+
+ if (SecName == SymNameOrError.get())
+ return SymbolRef::ST_Other;
+ } else
+ return SymNameOrError.takeError();
+
+ if (isSectionData(SecDRI) || isSectionBSS(SecDRI))
+ return SymbolRef::ST_Data;
+
+ if (isDebugSection(SecDRI))
+ return SymbolRef::ST_Debug;
+
return SymbolRef::ST_Other;
}
@@ -285,8 +376,12 @@ XCOFFObjectFile::getSectionContents(DataRefImpl Sec) const {
const uint8_t * ContentStart = base() + OffsetToRaw;
uint64_t SectionSize = getSectionSize(Sec);
- if (checkOffset(Data, reinterpret_cast<uintptr_t>(ContentStart), SectionSize))
- return make_error<BinaryError>();
+ if (Error E = Binary::checkOffset(
+ Data, reinterpret_cast<uintptr_t>(ContentStart), SectionSize))
+ return createError(
+ toString(std::move(E)) + ": section data with offset 0x" +
+ Twine::utohexstr(OffsetToRaw) + " and size 0x" +
+ Twine::utohexstr(SectionSize) + " goes past the end of the file");
return makeArrayRef(ContentStart,SectionSize);
}
@@ -297,6 +392,43 @@ uint64_t XCOFFObjectFile::getSectionAlignment(DataRefImpl Sec) const {
return Result;
}
+Expected<uintptr_t> XCOFFObjectFile::getLoaderSectionAddress() const {
+ uint64_t OffsetToLoaderSection = 0;
+ uint64_t SizeOfLoaderSection = 0;
+
+ if (is64Bit()) {
+ for (const auto &Sec64 : sections64())
+ if (Sec64.getSectionType() == XCOFF::STYP_LOADER) {
+ OffsetToLoaderSection = Sec64.FileOffsetToRawData;
+ SizeOfLoaderSection = Sec64.SectionSize;
+ break;
+ }
+ } else {
+ for (const auto &Sec32 : sections32())
+ if (Sec32.getSectionType() == XCOFF::STYP_LOADER) {
+ OffsetToLoaderSection = Sec32.FileOffsetToRawData;
+ SizeOfLoaderSection = Sec32.SectionSize;
+ break;
+ }
+ }
+
+ // The absence of a loader section is not an error.
+ if (!SizeOfLoaderSection)
+ return 0;
+
+ uintptr_t LoaderSectionStart =
+ reinterpret_cast<uintptr_t>(base() + OffsetToLoaderSection);
+ if (Error E =
+ Binary::checkOffset(Data, LoaderSectionStart, SizeOfLoaderSection))
+ return createError(toString(std::move(E)) +
+ ": loader section with offset 0x" +
+ Twine::utohexstr(OffsetToLoaderSection) +
+ " and size 0x" + Twine::utohexstr(SizeOfLoaderSection) +
+ " goes past the end of the file");
+
+ return LoaderSectionStart;
+}
+
bool XCOFFObjectFile::isSectionCompressed(DataRefImpl Sec) const {
return false;
}
@@ -326,61 +458,112 @@ bool XCOFFObjectFile::isSectionVirtual(DataRefImpl Sec) const {
}
relocation_iterator XCOFFObjectFile::section_rel_begin(DataRefImpl Sec) const {
- if (is64Bit())
- report_fatal_error("64-bit support not implemented yet");
- const XCOFFSectionHeader32 *SectionEntPtr = toSection32(Sec);
- auto RelocationsOrErr = relocations(*SectionEntPtr);
- if (Error E = RelocationsOrErr.takeError())
- return relocation_iterator(RelocationRef());
DataRefImpl Ret;
- Ret.p = reinterpret_cast<uintptr_t>(&*RelocationsOrErr.get().begin());
+ if (is64Bit()) {
+ const XCOFFSectionHeader64 *SectionEntPtr = toSection64(Sec);
+ auto RelocationsOrErr =
+ relocations<XCOFFSectionHeader64, XCOFFRelocation64>(*SectionEntPtr);
+ if (Error E = RelocationsOrErr.takeError()) {
+ // TODO: report the error up the stack.
+ consumeError(std::move(E));
+ return relocation_iterator(RelocationRef());
+ }
+ Ret.p = reinterpret_cast<uintptr_t>(&*RelocationsOrErr.get().begin());
+ } else {
+ const XCOFFSectionHeader32 *SectionEntPtr = toSection32(Sec);
+ auto RelocationsOrErr =
+ relocations<XCOFFSectionHeader32, XCOFFRelocation32>(*SectionEntPtr);
+ if (Error E = RelocationsOrErr.takeError()) {
+ // TODO: report the error up the stack.
+ consumeError(std::move(E));
+ return relocation_iterator(RelocationRef());
+ }
+ Ret.p = reinterpret_cast<uintptr_t>(&*RelocationsOrErr.get().begin());
+ }
return relocation_iterator(RelocationRef(Ret, this));
}
relocation_iterator XCOFFObjectFile::section_rel_end(DataRefImpl Sec) const {
- if (is64Bit())
- report_fatal_error("64-bit support not implemented yet");
- const XCOFFSectionHeader32 *SectionEntPtr = toSection32(Sec);
- auto RelocationsOrErr = relocations(*SectionEntPtr);
- if (Error E = RelocationsOrErr.takeError())
- return relocation_iterator(RelocationRef());
DataRefImpl Ret;
- Ret.p = reinterpret_cast<uintptr_t>(&*RelocationsOrErr.get().end());
+ if (is64Bit()) {
+ const XCOFFSectionHeader64 *SectionEntPtr = toSection64(Sec);
+ auto RelocationsOrErr =
+ relocations<XCOFFSectionHeader64, XCOFFRelocation64>(*SectionEntPtr);
+ if (Error E = RelocationsOrErr.takeError()) {
+ // TODO: report the error up the stack.
+ consumeError(std::move(E));
+ return relocation_iterator(RelocationRef());
+ }
+ Ret.p = reinterpret_cast<uintptr_t>(&*RelocationsOrErr.get().end());
+ } else {
+ const XCOFFSectionHeader32 *SectionEntPtr = toSection32(Sec);
+ auto RelocationsOrErr =
+ relocations<XCOFFSectionHeader32, XCOFFRelocation32>(*SectionEntPtr);
+ if (Error E = RelocationsOrErr.takeError()) {
+ // TODO: report the error up the stack.
+ consumeError(std::move(E));
+ return relocation_iterator(RelocationRef());
+ }
+ Ret.p = reinterpret_cast<uintptr_t>(&*RelocationsOrErr.get().end());
+ }
return relocation_iterator(RelocationRef(Ret, this));
}
void XCOFFObjectFile::moveRelocationNext(DataRefImpl &Rel) const {
- Rel.p = reinterpret_cast<uintptr_t>(viewAs<XCOFFRelocation32>(Rel.p) + 1);
+ if (is64Bit())
+ Rel.p = reinterpret_cast<uintptr_t>(viewAs<XCOFFRelocation64>(Rel.p) + 1);
+ else
+ Rel.p = reinterpret_cast<uintptr_t>(viewAs<XCOFFRelocation32>(Rel.p) + 1);
}
uint64_t XCOFFObjectFile::getRelocationOffset(DataRefImpl Rel) const {
- if (is64Bit())
- report_fatal_error("64-bit support not implemented yet");
- const XCOFFRelocation32 *Reloc = viewAs<XCOFFRelocation32>(Rel.p);
- const XCOFFSectionHeader32 *Sec32 = sectionHeaderTable32();
- const uint32_t RelocAddress = Reloc->VirtualAddress;
- const uint16_t NumberOfSections = getNumberOfSections();
- for (uint16_t i = 0; i < NumberOfSections; ++i) {
- // Find which section this relocation is belonging to, and get the
- // relocation offset relative to the start of the section.
- if (Sec32->VirtualAddress <= RelocAddress &&
- RelocAddress < Sec32->VirtualAddress + Sec32->SectionSize) {
- return RelocAddress - Sec32->VirtualAddress;
+ if (is64Bit()) {
+ const XCOFFRelocation64 *Reloc = viewAs<XCOFFRelocation64>(Rel.p);
+ const XCOFFSectionHeader64 *Sec64 = sectionHeaderTable64();
+ const uint64_t RelocAddress = Reloc->VirtualAddress;
+ const uint16_t NumberOfSections = getNumberOfSections();
+ for (uint16_t I = 0; I < NumberOfSections; ++I) {
+ // Find which section this relocation belongs to, and get the
+ // relocation offset relative to the start of the section.
+ if (Sec64->VirtualAddress <= RelocAddress &&
+ RelocAddress < Sec64->VirtualAddress + Sec64->SectionSize) {
+ return RelocAddress - Sec64->VirtualAddress;
+ }
+ ++Sec64;
+ }
+ } else {
+ const XCOFFRelocation32 *Reloc = viewAs<XCOFFRelocation32>(Rel.p);
+ const XCOFFSectionHeader32 *Sec32 = sectionHeaderTable32();
+ const uint32_t RelocAddress = Reloc->VirtualAddress;
+ const uint16_t NumberOfSections = getNumberOfSections();
+ for (uint16_t I = 0; I < NumberOfSections; ++I) {
+ // Find which section this relocation belongs to, and get the
+ // relocation offset relative to the start of the section.
+ if (Sec32->VirtualAddress <= RelocAddress &&
+ RelocAddress < Sec32->VirtualAddress + Sec32->SectionSize) {
+ return RelocAddress - Sec32->VirtualAddress;
+ }
+ ++Sec32;
}
- ++Sec32;
}
return InvalidRelocOffset;
}
symbol_iterator XCOFFObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
- if (is64Bit())
- report_fatal_error("64-bit support not implemented yet");
- const XCOFFRelocation32 *Reloc = viewAs<XCOFFRelocation32>(Rel.p);
- const uint32_t Index = Reloc->SymbolIndex;
-
- if (Index >= getLogicalNumberOfSymbolTableEntries32())
- return symbol_end();
-
+ uint32_t Index;
+ if (is64Bit()) {
+ const XCOFFRelocation64 *Reloc = viewAs<XCOFFRelocation64>(Rel.p);
+ Index = Reloc->SymbolIndex;
+
+ if (Index >= getNumberOfSymbolTableEntries64())
+ return symbol_end();
+ } else {
+ const XCOFFRelocation32 *Reloc = viewAs<XCOFFRelocation32>(Rel.p);
+ Index = Reloc->SymbolIndex;
+
+ if (Index >= getLogicalNumberOfSymbolTableEntries32())
+ return symbol_end();
+ }
DataRefImpl SymDRI;
SymDRI.p = getSymbolEntryAddressByIndex(Index);
return symbol_iterator(SymbolRef(SymDRI, this));
@@ -388,22 +571,50 @@ symbol_iterator XCOFFObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
uint64_t XCOFFObjectFile::getRelocationType(DataRefImpl Rel) const {
if (is64Bit())
- report_fatal_error("64-bit support not implemented yet");
+ return viewAs<XCOFFRelocation64>(Rel.p)->Type;
return viewAs<XCOFFRelocation32>(Rel.p)->Type;
}
void XCOFFObjectFile::getRelocationTypeName(
DataRefImpl Rel, SmallVectorImpl<char> &Result) const {
- if (is64Bit())
- report_fatal_error("64-bit support not implemented yet");
- const XCOFFRelocation32 *Reloc = viewAs<XCOFFRelocation32>(Rel.p);
- StringRef Res = XCOFF::getRelocationTypeString(Reloc->Type);
+ StringRef Res;
+ if (is64Bit()) {
+ const XCOFFRelocation64 *Reloc = viewAs<XCOFFRelocation64>(Rel.p);
+ Res = XCOFF::getRelocationTypeString(Reloc->Type);
+ } else {
+ const XCOFFRelocation32 *Reloc = viewAs<XCOFFRelocation32>(Rel.p);
+ Res = XCOFF::getRelocationTypeString(Reloc->Type);
+ }
Result.append(Res.begin(), Res.end());
}
Expected<uint32_t> XCOFFObjectFile::getSymbolFlags(DataRefImpl Symb) const {
- uint32_t Result = 0;
- // TODO: Return correct symbol flags.
+ XCOFFSymbolRef XCOFFSym = toSymbolRef(Symb);
+ uint32_t Result = SymbolRef::SF_None;
+
+ if (XCOFFSym.getSectionNumber() == XCOFF::N_ABS)
+ Result |= SymbolRef::SF_Absolute;
+
+ XCOFF::StorageClass SC = XCOFFSym.getStorageClass();
+ if (XCOFF::C_EXT == SC || XCOFF::C_WEAKEXT == SC)
+ Result |= SymbolRef::SF_Global;
+
+ if (XCOFF::C_WEAKEXT == SC)
+ Result |= SymbolRef::SF_Weak;
+
+ if (XCOFFSym.isCsectSymbol()) {
+ Expected<XCOFFCsectAuxRef> CsectAuxEntOrErr =
+ XCOFFSym.getXCOFFCsectAuxRef();
+ if (CsectAuxEntOrErr) {
+ if (CsectAuxEntOrErr.get().getSymbolType() == XCOFF::XTY_CM)
+ Result |= SymbolRef::SF_Common;
+ } else
+ return CsectAuxEntOrErr.takeError();
+ }
+
+ if (XCOFFSym.getSectionNumber() == XCOFF::N_UNDEF)
+ Result |= SymbolRef::SF_Undefined;
+
return Result;
}
@@ -494,7 +705,9 @@ uint16_t XCOFFObjectFile::getMagic() const {
Expected<DataRefImpl> XCOFFObjectFile::getSectionByNum(int16_t Num) const {
if (Num <= 0 || Num > getNumberOfSections())
- return errorCodeToError(object_error::invalid_section_index);
+ return createStringError(object_error::invalid_section_index,
+ "the section index (" + Twine(Num) +
+ ") is invalid");
DataRefImpl DRI;
DRI.p = getWithOffset(getSectionHeaderTableAddress(),
@@ -602,6 +815,25 @@ uint32_t XCOFFObjectFile::getSymbolIndex(uintptr_t SymbolEntPtr) const {
XCOFF::SymbolTableEntrySize;
}
+uint64_t XCOFFObjectFile::getSymbolSize(DataRefImpl Symb) const {
+ uint64_t Result = 0;
+ XCOFFSymbolRef XCOFFSym = toSymbolRef(Symb);
+ if (XCOFFSym.isCsectSymbol()) {
+ Expected<XCOFFCsectAuxRef> CsectAuxRefOrError =
+ XCOFFSym.getXCOFFCsectAuxRef();
+ if (!CsectAuxRefOrError)
+ // TODO: report the error up the stack.
+ consumeError(CsectAuxRefOrError.takeError());
+ else {
+ XCOFFCsectAuxRef CsectAuxRef = CsectAuxRefOrError.get();
+ uint8_t SymType = CsectAuxRef.getSymbolType();
+ if (SymType == XCOFF::XTY_SD || SymType == XCOFF::XTY_CM)
+ Result = CsectAuxRef.getSectionOrLength();
+ }
+ }
+ return Result;
+}
+
uintptr_t XCOFFObjectFile::getSymbolEntryAddressByIndex(uint32_t Index) const {
return getAdvancedSymbolEntryAddress(
reinterpret_cast<uintptr_t>(getPointerToSymbolTable()), Index);
@@ -612,7 +844,9 @@ XCOFFObjectFile::getSymbolNameByIndex(uint32_t Index) const {
const uint32_t NumberOfSymTableEntries = getNumberOfSymbolTableEntries();
if (Index >= NumberOfSymTableEntries)
- return errorCodeToError(object_error::invalid_symbol_index);
+ return createError("symbol index " + Twine(Index) +
+ " exceeds symbol count " +
+ Twine(NumberOfSymTableEntries));
DataRefImpl SymDRI;
SymDRI.p = getSymbolEntryAddressByIndex(Index);
@@ -658,13 +892,16 @@ ArrayRef<XCOFFSectionHeader32> XCOFFObjectFile::sections32() const {
// section header contains the actual count of relocation entries in the s_paddr
// field. STYP_OVRFLO headers contain the section index of their corresponding
// sections as their raw "NumberOfRelocations" field value.
-Expected<uint32_t> XCOFFObjectFile::getLogicalNumberOfRelocationEntries(
- const XCOFFSectionHeader32 &Sec) const {
-
- uint16_t SectionIndex = &Sec - sectionHeaderTable32() + 1;
+template <typename T>
+Expected<uint32_t> XCOFFObjectFile::getNumberOfRelocationEntries(
+ const XCOFFSectionHeader<T> &Sec) const {
+ const T &Section = static_cast<const T &>(Sec);
+ if (is64Bit())
+ return Section.NumberOfRelocations;
- if (Sec.NumberOfRelocations < XCOFF::RelocOverflow)
- return Sec.NumberOfRelocations;
+ uint16_t SectionIndex = &Section - sectionHeaderTable<T>() + 1;
+ if (Section.NumberOfRelocations < XCOFF::RelocOverflow)
+ return Section.NumberOfRelocations;
for (const auto &Sec : sections32()) {
if (Sec.Flags == XCOFF::STYP_OVRFLO &&
Sec.NumberOfRelocations == SectionIndex)
@@ -673,27 +910,31 @@ Expected<uint32_t> XCOFFObjectFile::getLogicalNumberOfRelocationEntries(
return errorCodeToError(object_error::parse_failed);
}
-Expected<ArrayRef<XCOFFRelocation32>>
-XCOFFObjectFile::relocations(const XCOFFSectionHeader32 &Sec) const {
+template <typename Shdr, typename Reloc>
+Expected<ArrayRef<Reloc>> XCOFFObjectFile::relocations(const Shdr &Sec) const {
uintptr_t RelocAddr = getWithOffset(reinterpret_cast<uintptr_t>(FileHeader),
Sec.FileOffsetToRelocationInfo);
- auto NumRelocEntriesOrErr = getLogicalNumberOfRelocationEntries(Sec);
+ auto NumRelocEntriesOrErr = getNumberOfRelocationEntries(Sec);
if (Error E = NumRelocEntriesOrErr.takeError())
return std::move(E);
uint32_t NumRelocEntries = NumRelocEntriesOrErr.get();
-
- static_assert(
- sizeof(XCOFFRelocation32) == XCOFF::RelocationSerializationSize32, "");
+ static_assert((sizeof(Reloc) == XCOFF::RelocationSerializationSize64 ||
+ sizeof(Reloc) == XCOFF::RelocationSerializationSize32),
+ "Relocation structure is incorrect");
auto RelocationOrErr =
- getObject<XCOFFRelocation32>(Data, reinterpret_cast<void *>(RelocAddr),
- NumRelocEntries * sizeof(XCOFFRelocation32));
- if (Error E = RelocationOrErr.takeError())
- return std::move(E);
+ getObject<Reloc>(Data, reinterpret_cast<void *>(RelocAddr),
+ NumRelocEntries * sizeof(Reloc));
+ if (!RelocationOrErr)
+ return createError(
+ toString(RelocationOrErr.takeError()) + ": relocations with offset 0x" +
+ Twine::utohexstr(Sec.FileOffsetToRelocationInfo) + " and size 0x" +
+ Twine::utohexstr(NumRelocEntries * sizeof(Reloc)) +
+ " go past the end of the file");
- const XCOFFRelocation32 *StartReloc = RelocationOrErr.get();
+ const Reloc *StartReloc = RelocationOrErr.get();
- return ArrayRef<XCOFFRelocation32>(StartReloc, StartReloc + NumRelocEntries);
+ return ArrayRef<Reloc>(StartReloc, StartReloc + NumRelocEntries);
}
Expected<XCOFFStringTable>
@@ -716,8 +957,12 @@ XCOFFObjectFile::parseStringTable(const XCOFFObjectFile *Obj, uint64_t Offset) {
auto StringTableOrErr =
getObject<char>(Obj->Data, Obj->base() + Offset, Size);
- if (Error E = StringTableOrErr.takeError())
- return std::move(E);
+ if (!StringTableOrErr)
+ return createError(toString(StringTableOrErr.takeError()) +
+ ": string table with offset 0x" +
+ Twine::utohexstr(Offset) + " and size 0x" +
+ Twine::utohexstr(Size) +
+ " goes past the end of the file");
const char *StringTablePtr = StringTableOrErr.get();
if (StringTablePtr[Size - 1] != '\0')
@@ -726,6 +971,54 @@ XCOFFObjectFile::parseStringTable(const XCOFFObjectFile *Obj, uint64_t Offset) {
return XCOFFStringTable{Size, StringTablePtr};
}
+// This function returns the import file table. Each entry in the import file
+// table consists of: "path_name\0base_name\0archive_member_name\0".
+Expected<StringRef> XCOFFObjectFile::getImportFileTable() const {
+ Expected<uintptr_t> LoaderSectionAddrOrError = getLoaderSectionAddress();
+ if (!LoaderSectionAddrOrError)
+ return LoaderSectionAddrOrError.takeError();
+
+ uintptr_t LoaderSectionAddr = LoaderSectionAddrOrError.get();
+ if (!LoaderSectionAddr)
+ return StringRef();
+
+ uint64_t OffsetToImportFileTable = 0;
+ uint64_t LengthOfImportFileTable = 0;
+ if (is64Bit()) {
+ const LoaderSectionHeader64 *LoaderSec64 =
+ viewAs<LoaderSectionHeader64>(LoaderSectionAddr);
+ OffsetToImportFileTable = LoaderSec64->OffsetToImpid;
+ LengthOfImportFileTable = LoaderSec64->LengthOfImpidStrTbl;
+ } else {
+ const LoaderSectionHeader32 *LoaderSec32 =
+ viewAs<LoaderSectionHeader32>(LoaderSectionAddr);
+ OffsetToImportFileTable = LoaderSec32->OffsetToImpid;
+ LengthOfImportFileTable = LoaderSec32->LengthOfImpidStrTbl;
+ }
+
+ auto ImportTableOrErr = getObject<char>(
+ Data,
+ reinterpret_cast<void *>(LoaderSectionAddr + OffsetToImportFileTable),
+ LengthOfImportFileTable);
+ if (!ImportTableOrErr)
+ return createError(
+ toString(ImportTableOrErr.takeError()) +
+ ": import file table with offset 0x" +
+ Twine::utohexstr(LoaderSectionAddr + OffsetToImportFileTable) +
+ " and size 0x" + Twine::utohexstr(LengthOfImportFileTable) +
+ " goes past the end of the file");
+
+ const char *ImportTablePtr = ImportTableOrErr.get();
+ if (ImportTablePtr[LengthOfImportFileTable - 1] != '\0')
+ return createError(
+ "import file name table with offset 0x" +
+ Twine::utohexstr(LoaderSectionAddr + OffsetToImportFileTable) +
+ " and size 0x" + Twine::utohexstr(LengthOfImportFileTable) +
+ " must end with a null terminator");
+
+ return StringRef(ImportTablePtr, LengthOfImportFileTable);
+}
+
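The import file table returned above is a flat run of NUL-terminated strings (path, base name, archive member, repeated per entry). A minimal sketch of splitting such a blob into its components (consumer-side illustration only):

    #include <string>
    #include <vector>

    // Split "a\0b\0c\0..." into its NUL-terminated components.
    static std::vector<std::string> splitNulSeparated(const std::string &Table) {
      std::vector<std::string> Parts;
      for (size_t Pos = 0; Pos < Table.size();) {
        size_t End = Table.find('\0', Pos);
        if (End == std::string::npos)
          break;
        Parts.push_back(Table.substr(Pos, End - Pos));
        Pos = End + 1;
      }
      return Parts;
    }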
Expected<std::unique_ptr<XCOFFObjectFile>>
XCOFFObjectFile::create(unsigned Type, MemoryBufferRef MBR) {
// Can't use std::make_unique because of the private constructor.
@@ -744,17 +1037,30 @@ XCOFFObjectFile::create(unsigned Type, MemoryBufferRef MBR) {
Obj->FileHeader = FileHeaderOrErr.get();
CurOffset += Obj->getFileHeaderSize();
- // TODO FIXME we don't have support for an optional header yet, so just skip
- // past it.
+
+ if (Obj->getOptionalHeaderSize()) {
+ auto AuxiliaryHeaderOrErr =
+ getObject<void>(Data, Base + CurOffset, Obj->getOptionalHeaderSize());
+ if (Error E = AuxiliaryHeaderOrErr.takeError())
+ return std::move(E);
+ Obj->AuxiliaryHeader = AuxiliaryHeaderOrErr.get();
+ }
+
CurOffset += Obj->getOptionalHeaderSize();
// Parse the section header table if it is present.
if (Obj->getNumberOfSections()) {
- auto SecHeadersOrErr = getObject<void>(Data, Base + CurOffset,
- Obj->getNumberOfSections() *
- Obj->getSectionHeaderSize());
- if (Error E = SecHeadersOrErr.takeError())
- return std::move(E);
+ uint64_t SectionHeadersSize =
+ Obj->getNumberOfSections() * Obj->getSectionHeaderSize();
+ auto SecHeadersOrErr =
+ getObject<void>(Data, Base + CurOffset, SectionHeadersSize);
+ if (!SecHeadersOrErr)
+ return createError(toString(SecHeadersOrErr.takeError()) +
+ ": section headers with offset 0x" +
+ Twine::utohexstr(CurOffset) + " and size 0x" +
+ Twine::utohexstr(SectionHeadersSize) +
+ " go past the end of the file");
+
Obj->SectionHeaderTable = SecHeadersOrErr.get();
}
@@ -773,8 +1079,12 @@ XCOFFObjectFile::create(unsigned Type, MemoryBufferRef MBR) {
NumberOfSymbolTableEntries;
auto SymTableOrErr =
getObject<void *>(Data, Base + CurOffset, SymbolTableSize);
- if (Error E = SymTableOrErr.takeError())
- return std::move(E);
+ if (!SymTableOrErr)
+ return createError(
+ toString(SymTableOrErr.takeError()) + ": symbol table with offset 0x" +
+ Twine::utohexstr(CurOffset) + " and size 0x" +
+ Twine::utohexstr(SymbolTableSize) + " goes past the end of the file");
+
Obj->SymbolTblPtr = SymTableOrErr.get();
CurOffset += SymbolTableSize;
@@ -844,10 +1154,10 @@ Expected<XCOFFCsectAuxRef> XCOFFSymbolRef::getXCOFFCsectAuxRef() const {
if (auto Err = NameOrErr.takeError())
return std::move(Err);
+ uint32_t SymbolIdx = OwningObjectPtr->getSymbolIndex(getEntryAddress());
if (!NumberOfAuxEntries) {
- return createStringError(object_error::parse_failed,
- "csect symbol \"" + *NameOrErr +
- "\" contains no auxiliary entry");
+ return createError("csect symbol \"" + *NameOrErr + "\" with index " +
+ Twine(SymbolIdx) + " contains no auxiliary entry");
}
if (!OwningObjectPtr->is64Bit()) {
@@ -872,9 +1182,9 @@ Expected<XCOFFCsectAuxRef> XCOFFSymbolRef::getXCOFFCsectAuxRef() const {
}
}
- return createStringError(
- object_error::parse_failed,
- "a csect auxiliary entry is not found for symbol \"" + *NameOrErr + "\"");
+ return createError(
+ "a csect auxiliary entry has not been found for symbol \"" + *NameOrErr +
+ "\" with index " + Twine(SymbolIdx));
}
Expected<StringRef> XCOFFSymbolRef::getName() const {
@@ -897,6 +1207,18 @@ Expected<StringRef> XCOFFSymbolRef::getName() const {
template struct XCOFFSectionHeader<XCOFFSectionHeader32>;
template struct XCOFFSectionHeader<XCOFFSectionHeader64>;
+template struct XCOFFRelocation<llvm::support::ubig32_t>;
+template struct XCOFFRelocation<llvm::support::ubig64_t>;
+
+template llvm::Expected<llvm::ArrayRef<llvm::object::XCOFFRelocation64>>
+llvm::object::XCOFFObjectFile::relocations<llvm::object::XCOFFSectionHeader64,
+ llvm::object::XCOFFRelocation64>(
+ llvm::object::XCOFFSectionHeader64 const &) const;
+template llvm::Expected<llvm::ArrayRef<llvm::object::XCOFFRelocation32>>
+llvm::object::XCOFFObjectFile::relocations<llvm::object::XCOFFSectionHeader32,
+ llvm::object::XCOFFRelocation32>(
+ llvm::object::XCOFFSectionHeader32 const &) const;
+
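The explicit instantiations above let the templated relocations<>() body stay in the .cpp file while remaining usable for the 32-bit and 64-bit header/relocation pairs. The pattern in miniature:

    // Definition lives in one translation unit...
    template <typename T> T twice(T V) { return V + V; }

    // ...and is explicitly instantiated for the types callers are allowed to use.
    template int twice<int>(int);
    template long twice<long>(long);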
bool doesXCOFFTracebackTableBegin(ArrayRef<uint8_t> Bytes) {
if (Bytes.size() < 4)
return false;
diff --git a/llvm/lib/ObjectYAML/COFFEmitter.cpp b/llvm/lib/ObjectYAML/COFFEmitter.cpp
index 06ce93affd38..5f38ca13cfc2 100644
--- a/llvm/lib/ObjectYAML/COFFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/COFFEmitter.cpp
@@ -170,8 +170,8 @@ static bool layoutOptionalHeader(COFFParser &CP) {
unsigned PEHeaderSize = CP.is64Bit() ? sizeof(object::pe32plus_header)
: sizeof(object::pe32_header);
CP.Obj.Header.SizeOfOptionalHeader =
- PEHeaderSize +
- sizeof(object::data_directory) * (COFF::NUM_DATA_DIRECTORIES + 1);
+ PEHeaderSize + sizeof(object::data_directory) *
+ CP.Obj.OptionalHeader->Header.NumberOfRvaAndSize;
return true;
}
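With this change the optional-header size scales with NumberOfRvaAndSize instead of always assuming the full set of data directories. The arithmetic, as a sketch (112 bytes is the fixed PE32+ portion before the directories; each directory entry is 8 bytes):

    #include <cstdint>

    // PE32+ optional header size for a given data-directory count.
    constexpr uint64_t pe32PlusOptionalHeaderSize(uint32_t NumberOfRvaAndSize) {
      return 112 + 8ull * NumberOfRvaAndSize;
    }
    static_assert(pe32PlusOptionalHeaderSize(16) == 240,
                  "16 directories give the classic 240-byte header");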
@@ -397,7 +397,7 @@ static uint32_t initializeOptionalHeader(COFFParser &CP, uint16_t Magic,
Header->SizeOfStackCommit = CP.Obj.OptionalHeader->Header.SizeOfStackCommit;
Header->SizeOfHeapReserve = CP.Obj.OptionalHeader->Header.SizeOfHeapReserve;
Header->SizeOfHeapCommit = CP.Obj.OptionalHeader->Header.SizeOfHeapCommit;
- Header->NumberOfRvaAndSize = COFF::NUM_DATA_DIRECTORIES + 1;
+ Header->NumberOfRvaAndSize = CP.Obj.OptionalHeader->Header.NumberOfRvaAndSize;
return BaseOfData;
}
@@ -458,18 +458,20 @@ static bool writeCOFF(COFFParser &CP, raw_ostream &OS) {
PEH.BaseOfData = BaseOfData;
OS.write(reinterpret_cast<char *>(&PEH), sizeof(PEH));
}
- for (const Optional<COFF::DataDirectory> &DD :
- CP.Obj.OptionalHeader->DataDirectories) {
- if (!DD.hasValue()) {
+ for (uint32_t I = 0; I < CP.Obj.OptionalHeader->Header.NumberOfRvaAndSize;
+ ++I) {
+ const Optional<COFF::DataDirectory> *DataDirectories =
+ CP.Obj.OptionalHeader->DataDirectories;
+ uint32_t NumDataDir = sizeof(CP.Obj.OptionalHeader->DataDirectories) /
+ sizeof(Optional<COFF::DataDirectory>);
+ if (I >= NumDataDir || !DataDirectories[I].hasValue()) {
OS << zeros(uint32_t(0));
OS << zeros(uint32_t(0));
} else {
- OS << binary_le(DD->RelativeVirtualAddress);
- OS << binary_le(DD->Size);
+ OS << binary_le(DataDirectories[I]->RelativeVirtualAddress);
+ OS << binary_le(DataDirectories[I]->Size);
}
}
- OS << zeros(uint32_t(0));
- OS << zeros(uint32_t(0));
}
assert(OS.tell() == CP.SectionTableStart);
diff --git a/llvm/lib/ObjectYAML/COFFYAML.cpp b/llvm/lib/ObjectYAML/COFFYAML.cpp
index 96069c0c590f..6e5cdce89060 100644
--- a/llvm/lib/ObjectYAML/COFFYAML.cpp
+++ b/llvm/lib/ObjectYAML/COFFYAML.cpp
@@ -448,25 +448,27 @@ void MappingTraits<COFFYAML::PEHeader>::mapping(IO &IO,
MappingNormalization<NDLLCharacteristics, uint16_t> NDC(
IO, PH.Header.DLLCharacteristics);
- IO.mapRequired("AddressOfEntryPoint", PH.Header.AddressOfEntryPoint);
- IO.mapRequired("ImageBase", PH.Header.ImageBase);
- IO.mapRequired("SectionAlignment", PH.Header.SectionAlignment);
- IO.mapRequired("FileAlignment", PH.Header.FileAlignment);
- IO.mapRequired("MajorOperatingSystemVersion",
+ IO.mapOptional("AddressOfEntryPoint", PH.Header.AddressOfEntryPoint);
+ IO.mapOptional("ImageBase", PH.Header.ImageBase);
+ IO.mapOptional("SectionAlignment", PH.Header.SectionAlignment, 1);
+ IO.mapOptional("FileAlignment", PH.Header.FileAlignment, 1);
+ IO.mapOptional("MajorOperatingSystemVersion",
PH.Header.MajorOperatingSystemVersion);
- IO.mapRequired("MinorOperatingSystemVersion",
+ IO.mapOptional("MinorOperatingSystemVersion",
PH.Header.MinorOperatingSystemVersion);
- IO.mapRequired("MajorImageVersion", PH.Header.MajorImageVersion);
- IO.mapRequired("MinorImageVersion", PH.Header.MinorImageVersion);
- IO.mapRequired("MajorSubsystemVersion", PH.Header.MajorSubsystemVersion);
- IO.mapRequired("MinorSubsystemVersion", PH.Header.MinorSubsystemVersion);
- IO.mapRequired("Subsystem", NWS->Subsystem);
- IO.mapRequired("DLLCharacteristics", NDC->Characteristics);
- IO.mapRequired("SizeOfStackReserve", PH.Header.SizeOfStackReserve);
- IO.mapRequired("SizeOfStackCommit", PH.Header.SizeOfStackCommit);
- IO.mapRequired("SizeOfHeapReserve", PH.Header.SizeOfHeapReserve);
- IO.mapRequired("SizeOfHeapCommit", PH.Header.SizeOfHeapCommit);
-
+ IO.mapOptional("MajorImageVersion", PH.Header.MajorImageVersion);
+ IO.mapOptional("MinorImageVersion", PH.Header.MinorImageVersion);
+ IO.mapOptional("MajorSubsystemVersion", PH.Header.MajorSubsystemVersion);
+ IO.mapOptional("MinorSubsystemVersion", PH.Header.MinorSubsystemVersion);
+ IO.mapOptional("Subsystem", NWS->Subsystem);
+ IO.mapOptional("DLLCharacteristics", NDC->Characteristics);
+ IO.mapOptional("SizeOfStackReserve", PH.Header.SizeOfStackReserve);
+ IO.mapOptional("SizeOfStackCommit", PH.Header.SizeOfStackCommit);
+ IO.mapOptional("SizeOfHeapReserve", PH.Header.SizeOfHeapReserve);
+ IO.mapOptional("SizeOfHeapCommit", PH.Header.SizeOfHeapCommit);
+
+ IO.mapOptional("NumberOfRvaAndSize", PH.Header.NumberOfRvaAndSize,
+ COFF::NUM_DATA_DIRECTORIES + 1);
IO.mapOptional("ExportTable", PH.DataDirectories[COFF::EXPORT_TABLE]);
IO.mapOptional("ImportTable", PH.DataDirectories[COFF::IMPORT_TABLE]);
IO.mapOptional("ResourceTable", PH.DataDirectories[COFF::RESOURCE_TABLE]);
diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp
index f8f2f0c12020..e378be3892fe 100644
--- a/llvm/lib/ObjectYAML/ELFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp
@@ -1380,9 +1380,6 @@ void ELFState<ELFT>::writeSectionContent(
if (!Section.Entries)
return;
- if (!Section.Entries)
- return;
-
for (const ELFYAML::StackSizeEntry &E : *Section.Entries) {
CBA.write<uintX_t>(E.Address, ELFT::TargetEndianness);
SHeader.sh_size += sizeof(uintX_t) + CBA.writeULEB128(E.Size);
@@ -1488,9 +1485,6 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
if (!Section.Bucket)
return;
- if (!Section.Bucket)
- return;
-
CBA.write<uint32_t>(
Section.NBucket.getValueOr(llvm::yaml::Hex64(Section.Bucket->size())),
ELFT::TargetEndianness);
@@ -1663,9 +1657,6 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
if (!Section.Symbols)
return;
- if (!Section.Symbols)
- return;
-
for (StringRef Sym : *Section.Symbols)
SHeader.sh_size +=
CBA.writeULEB128(toSymbolIndex(Sym, Section.Name, /*IsDynamic=*/false));
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 50821544a687..fdf9aeae1622 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -155,6 +155,13 @@ void ScalarEnumerationTraits<ELFYAML::ELF_NT>::enumeration(
ECase(NT_FREEBSD_PROCSTAT_OSREL);
ECase(NT_FREEBSD_PROCSTAT_PSSTRINGS);
ECase(NT_FREEBSD_PROCSTAT_AUXV);
+ // OpenBSD core note types.
+ ECase(NT_OPENBSD_PROCINFO);
+ ECase(NT_OPENBSD_AUXV);
+ ECase(NT_OPENBSD_REGS);
+ ECase(NT_OPENBSD_FPREGS);
+ ECase(NT_OPENBSD_XFPREGS);
+ ECase(NT_OPENBSD_WCOOKIE);
// AMD specific notes. (Code Object V2)
ECase(NT_AMD_HSA_CODE_OBJECT_VERSION);
ECase(NT_AMD_HSA_HSAIL);
@@ -655,6 +662,9 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration(
case ELF::EM_RISCV:
ECase(SHT_RISCV_ATTRIBUTES);
break;
+ case ELF::EM_MSP430:
+ ECase(SHT_MSP430_ATTRIBUTES);
+ break;
default:
// Nothing to do.
break;
@@ -887,6 +897,13 @@ void ScalarEnumerationTraits<ELFYAML::ELF_DYNTAG>::enumeration(
#undef PPC64_DYNAMIC_TAG
#define PPC64_DYNAMIC_TAG(name, value)
break;
+ case ELF::EM_RISCV:
+#undef RISCV_DYNAMIC_TAG
+#define RISCV_DYNAMIC_TAG(name, value) DYNAMIC_TAG(name, value)
+#include "llvm/BinaryFormat/DynamicTags.def"
+#undef RISCV_DYNAMIC_TAG
+#define RISCV_DYNAMIC_TAG(name, value)
+ break;
default:
#include "llvm/BinaryFormat/DynamicTags.def"
break;
@@ -1165,6 +1182,8 @@ struct NormalizedOther {
if (EMachine == ELF::EM_AARCH64)
Map["STO_AARCH64_VARIANT_PCS"] = ELF::STO_AARCH64_VARIANT_PCS;
+ if (EMachine == ELF::EM_RISCV)
+ Map["STO_RISCV_VARIANT_CC"] = ELF::STO_RISCV_VARIANT_CC;
return Map;
}
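The RISC-V dynamic-tag hunk above uses the usual .def trick: redefine RISCV_DYNAMIC_TAG, re-include DynamicTags.def, then restore the empty definition. The snippet below is a self-contained sketch of that X-macro pattern; the DEMO_* names and values are invented.

#include <cstdio>

// Hypothetical stand-in for llvm/BinaryFormat/DynamicTags.def: every entry
// expands through whatever DEMO_TAG means at the point of expansion.
#define DEMO_TAG_LIST                                                          \
  DEMO_TAG(DT_DEMO_NEEDED, 1)                                                  \
  DEMO_TAG(DT_DEMO_RISCV_VARIANT_CC, 0x70000001)

// First expansion: build the enum.
enum DemoTag : unsigned {
#define DEMO_TAG(name, value) name = value,
  DEMO_TAG_LIST
#undef DEMO_TAG
};

// Second expansion of the same list: build a value-to-name lookup.
static const char *demoTagName(unsigned Value) {
  switch (Value) {
#define DEMO_TAG(name, value)                                                  \
  case value:                                                                  \
    return #name;
    DEMO_TAG_LIST
#undef DEMO_TAG
  }
  return "unknown";
}

int main() {
  std::printf("%s\n", demoTagName(0x70000001)); // DT_DEMO_RISCV_VARIANT_CC
  return 0;
}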
diff --git a/llvm/lib/ObjectYAML/MachOEmitter.cpp b/llvm/lib/ObjectYAML/MachOEmitter.cpp
index 46e4dd05a737..c653c29ec9a7 100644
--- a/llvm/lib/ObjectYAML/MachOEmitter.cpp
+++ b/llvm/lib/ObjectYAML/MachOEmitter.cpp
@@ -184,6 +184,30 @@ size_t writeLoadCommandData<MachO::rpath_command>(MachOYAML::LoadCommand &LC,
}
template <>
+size_t writeLoadCommandData<MachO::sub_framework_command>(
+ MachOYAML::LoadCommand &LC, raw_ostream &OS, bool IsLittleEndian) {
+ return writePayloadString(LC, OS);
+}
+
+template <>
+size_t writeLoadCommandData<MachO::sub_umbrella_command>(
+ MachOYAML::LoadCommand &LC, raw_ostream &OS, bool IsLittleEndian) {
+ return writePayloadString(LC, OS);
+}
+
+template <>
+size_t writeLoadCommandData<MachO::sub_client_command>(
+ MachOYAML::LoadCommand &LC, raw_ostream &OS, bool IsLittleEndian) {
+ return writePayloadString(LC, OS);
+}
+
+template <>
+size_t writeLoadCommandData<MachO::sub_library_command>(
+ MachOYAML::LoadCommand &LC, raw_ostream &OS, bool IsLittleEndian) {
+ return writePayloadString(LC, OS);
+}
+
+template <>
size_t writeLoadCommandData<MachO::build_version_command>(
MachOYAML::LoadCommand &LC, raw_ostream &OS, bool IsLittleEndian) {
size_t BytesWritten = 0;
@@ -264,6 +288,7 @@ void MachOWriter::writeLoadCommands(raw_ostream &OS) {
}
Error MachOWriter::writeSectionData(raw_ostream &OS) {
+ uint64_t LinkEditOff = 0;
for (auto &LC : Obj.LoadCommands) {
switch (LC.Data.load_command_data.cmd) {
case MachO::LC_SEGMENT:
@@ -273,6 +298,9 @@ Error MachOWriter::writeSectionData(raw_ostream &OS) {
if (0 ==
strncmp(&LC.Data.segment_command_data.segname[0], "__LINKEDIT", 16)) {
FoundLinkEditSeg = true;
+ LinkEditOff = segOff;
+ if (Obj.RawLinkEditSegment)
+ continue;
writeLinkEditData(OS);
}
for (auto &Sec : LC.Sections) {
@@ -320,6 +348,13 @@ Error MachOWriter::writeSectionData(raw_ostream &OS) {
}
}
+ if (Obj.RawLinkEditSegment) {
+ ZeroToOffset(OS, LinkEditOff);
+ if (OS.tell() - fileStart > LinkEditOff || !LinkEditOff)
+ return createStringError(errc::invalid_argument,
+ "section offsets don't line up");
+ Obj.RawLinkEditSegment->writeAsBinary(OS);
+ }
return Error::success();
}
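The writeSectionData change records the __LINKEDIT file offset, zero-fills up to it, and rejects the raw payload if earlier output already ran past it. Below is a reduced sketch of that pad-then-verify idea using an in-memory stream (the check is done before padding here, and the offsets are made up).

#include "llvm/ADT/SmallString.h"
#include "llvm/Support/raw_ostream.h"

// Zero-pad OS up to Target unless earlier writes already passed it, in which
// case the caller reports an error.
static bool padToOffset(llvm::raw_ostream &OS, uint64_t Target) {
  if (OS.tell() > Target)
    return false;
  OS.write_zeros(Target - OS.tell());
  return true;
}

int main() {
  llvm::SmallString<64> Buf;
  llvm::raw_svector_ostream OS(Buf);
  OS << "header";                // 6 bytes of earlier output.
  if (!padToOffset(OS, 16))      // Zero-fill bytes 6..15.
    return 1;
  OS << "raw linkedit payload";  // The verbatim segment goes here.
  llvm::outs() << "total bytes: " << Buf.size() << "\n"; // 16 + 20 = 36
  return 0;
}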
diff --git a/llvm/lib/ObjectYAML/MachOYAML.cpp b/llvm/lib/ObjectYAML/MachOYAML.cpp
index dce82ab1cada..c9562bd72258 100644
--- a/llvm/lib/ObjectYAML/MachOYAML.cpp
+++ b/llvm/lib/ObjectYAML/MachOYAML.cpp
@@ -110,6 +110,9 @@ void MappingTraits<MachOYAML::Object>::mapping(IO &IO,
Object.DWARF.Is64BitAddrSize = Object.Header.magic == MachO::MH_MAGIC_64 ||
Object.Header.magic == MachO::MH_CIGAM_64;
IO.mapOptional("LoadCommands", Object.LoadCommands);
+
+ if (Object.RawLinkEditSegment || !IO.outputting())
+ IO.mapOptional("__LINKEDIT", Object.RawLinkEditSegment);
if(!Object.LinkEdit.isEmpty() || !IO.outputting())
IO.mapOptional("LinkEditData", Object.LinkEdit);
@@ -234,6 +237,30 @@ void mapLoadCommandData<MachO::dylinker_command>(
}
template <>
+void mapLoadCommandData<MachO::sub_framework_command>(
+ IO &IO, MachOYAML::LoadCommand &LoadCommand) {
+ IO.mapOptional("Content", LoadCommand.Content);
+}
+
+template <>
+void mapLoadCommandData<MachO::sub_umbrella_command>(
+ IO &IO, MachOYAML::LoadCommand &LoadCommand) {
+ IO.mapOptional("Content", LoadCommand.Content);
+}
+
+template <>
+void mapLoadCommandData<MachO::sub_client_command>(
+ IO &IO, MachOYAML::LoadCommand &LoadCommand) {
+ IO.mapOptional("Content", LoadCommand.Content);
+}
+
+template <>
+void mapLoadCommandData<MachO::sub_library_command>(
+ IO &IO, MachOYAML::LoadCommand &LoadCommand) {
+ IO.mapOptional("Content", LoadCommand.Content);
+}
+
+template <>
void mapLoadCommandData<MachO::build_version_command>(
IO &IO, MachOYAML::LoadCommand &LoadCommand) {
IO.mapOptional("Tools", LoadCommand.Tools);
diff --git a/llvm/lib/ObjectYAML/WasmEmitter.cpp b/llvm/lib/ObjectYAML/WasmEmitter.cpp
index 888ba115e2d9..80a8c56f6912 100644
--- a/llvm/lib/ObjectYAML/WasmEmitter.cpp
+++ b/llvm/lib/ObjectYAML/WasmEmitter.cpp
@@ -157,13 +157,24 @@ void WasmWriter::writeInitExpr(raw_ostream &OS,
void WasmWriter::writeSectionContent(raw_ostream &OS,
WasmYAML::DylinkSection &Section) {
writeStringRef(Section.Name, OS);
- encodeULEB128(Section.MemorySize, OS);
- encodeULEB128(Section.MemoryAlignment, OS);
- encodeULEB128(Section.TableSize, OS);
- encodeULEB128(Section.TableAlignment, OS);
- encodeULEB128(Section.Needed.size(), OS);
- for (StringRef Needed : Section.Needed)
- writeStringRef(Needed, OS);
+
+ writeUint8(OS, wasm::WASM_DYLINK_MEM_INFO);
+ SubSectionWriter SubSection(OS);
+ raw_ostream &SubOS = SubSection.getStream();
+ encodeULEB128(Section.MemorySize, SubOS);
+ encodeULEB128(Section.MemoryAlignment, SubOS);
+ encodeULEB128(Section.TableSize, SubOS);
+ encodeULEB128(Section.TableAlignment, SubOS);
+ SubSection.done();
+
+ if (Section.Needed.size()) {
+ writeUint8(OS, wasm::WASM_DYLINK_NEEDED);
+ raw_ostream &SubOS = SubSection.getStream();
+ encodeULEB128(Section.Needed.size(), SubOS);
+ for (StringRef Needed : Section.Needed)
+ writeStringRef(Needed, SubOS);
+ SubSection.done();
+ }
}
void WasmWriter::writeSectionContent(raw_ostream &OS,
@@ -386,8 +397,8 @@ void WasmWriter::writeSectionContent(raw_ostream &OS,
NumImportedGlobals++;
break;
case wasm::WASM_EXTERNAL_TAG:
- writeUint32(OS, Import.TagImport.Attribute);
- writeUint32(OS, Import.TagImport.SigIndex);
+ writeUint8(OS, 0); // Reserved 'attribute' field
+ encodeULEB128(Import.SigIndex, OS);
NumImportedTags++;
break;
case wasm::WASM_EXTERNAL_MEMORY:
@@ -451,16 +462,10 @@ void WasmWriter::writeSectionContent(raw_ostream &OS,
void WasmWriter::writeSectionContent(raw_ostream &OS,
WasmYAML::TagSection &Section) {
- encodeULEB128(Section.Tags.size(), OS);
- uint32_t ExpectedIndex = NumImportedTags;
- for (auto &Tag : Section.Tags) {
- if (Tag.Index != ExpectedIndex) {
- reportError("unexpected tag index: " + Twine(Tag.Index));
- return;
- }
- ++ExpectedIndex;
- encodeULEB128(Tag.Attribute, OS);
- encodeULEB128(Tag.SigIndex, OS);
+ encodeULEB128(Section.TagTypes.size(), OS);
+ for (uint32_t TagType : Section.TagTypes) {
+ writeUint8(OS, 0); // Reserved 'attribute' field
+ encodeULEB128(TagType, OS);
}
}
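The dylink.0 writer above adopts the subsection layout: a one-byte subsection type, a ULEB128 byte count, then the payload. The sketch below assembles one length-prefixed blob with LLVM's LEB128 helper; the type value and field values are arbitrary.

#include "llvm/ADT/SmallString.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  // Build the payload first so its size is known.
  llvm::SmallString<32> Payload;
  llvm::raw_svector_ostream PayloadOS(Payload);
  llvm::encodeULEB128(/*MemorySize=*/65536, PayloadOS);
  llvm::encodeULEB128(/*MemoryAlignment=*/16, PayloadOS);

  // Then emit: type byte, ULEB128 size, payload bytes.
  llvm::SmallString<64> Section;
  llvm::raw_svector_ostream OS(Section);
  OS << static_cast<char>(1); // Hypothetical subsection type.
  llvm::encodeULEB128(Payload.size(), OS);
  OS << Payload.str();

  llvm::outs() << "subsection bytes: " << Section.size() << "\n"; // 6
  return 0;
}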
diff --git a/llvm/lib/ObjectYAML/WasmYAML.cpp b/llvm/lib/ObjectYAML/WasmYAML.cpp
index 752654ddbbaf..3f0172ebf361 100644
--- a/llvm/lib/ObjectYAML/WasmYAML.cpp
+++ b/llvm/lib/ObjectYAML/WasmYAML.cpp
@@ -55,6 +55,8 @@ static void sectionMapping(IO &IO, WasmYAML::DylinkSection &Section) {
IO.mapRequired("TableSize", Section.TableSize);
IO.mapRequired("TableAlignment", Section.TableAlignment);
IO.mapRequired("Needed", Section.Needed);
+ IO.mapOptional("ImportInfo", Section.ImportInfo);
+ IO.mapOptional("ExportInfo", Section.ExportInfo);
}
static void sectionMapping(IO &IO, WasmYAML::NameSection &Section) {
@@ -122,7 +124,7 @@ static void sectionMapping(IO &IO, WasmYAML::MemorySection &Section) {
static void sectionMapping(IO &IO, WasmYAML::TagSection &Section) {
commonSectionMapping(IO, Section);
- IO.mapOptional("Tags", Section.Tags);
+ IO.mapOptional("TagTypes", Section.TagTypes);
}
static void sectionMapping(IO &IO, WasmYAML::GlobalSection &Section) {
@@ -177,7 +179,7 @@ void MappingTraits<std::unique_ptr<WasmYAML::Section>>::mapping(
} else {
IO.mapRequired("Name", SectionName);
}
- if (SectionName == "dylink") {
+ if (SectionName == "dylink" || SectionName == "dylink.0") {
if (!IO.outputting())
Section.reset(new WasmYAML::DylinkSection());
sectionMapping(IO, *cast<WasmYAML::DylinkSection>(Section.get()));
@@ -391,14 +393,12 @@ void MappingTraits<WasmYAML::Import>::mapping(IO &IO,
IO.mapRequired("Module", Import.Module);
IO.mapRequired("Field", Import.Field);
IO.mapRequired("Kind", Import.Kind);
- if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION) {
+ if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION ||
+ Import.Kind == wasm::WASM_EXTERNAL_TAG) {
IO.mapRequired("SigIndex", Import.SigIndex);
} else if (Import.Kind == wasm::WASM_EXTERNAL_GLOBAL) {
IO.mapRequired("GlobalType", Import.GlobalImport.Type);
IO.mapRequired("GlobalMutable", Import.GlobalImport.Mutable);
- } else if (Import.Kind == wasm::WASM_EXTERNAL_TAG) {
- IO.mapRequired("TagAttribute", Import.TagImport.Attribute);
- IO.mapRequired("TagSigIndex", Import.TagImport.SigIndex);
} else if (Import.Kind == wasm::WASM_EXTERNAL_TABLE) {
IO.mapRequired("Table", Import.TableImport);
} else if (Import.Kind == wasm::WASM_EXTERNAL_MEMORY) {
@@ -525,10 +525,17 @@ void MappingTraits<WasmYAML::SymbolInfo>::mapping(IO &IO,
}
}
-void MappingTraits<WasmYAML::Tag>::mapping(IO &IO, WasmYAML::Tag &Tag) {
- IO.mapRequired("Index", Tag.Index);
- IO.mapRequired("Attribute", Tag.Attribute);
- IO.mapRequired("SigIndex", Tag.SigIndex);
+void MappingTraits<WasmYAML::DylinkImportInfo>::mapping(
+ IO &IO, WasmYAML::DylinkImportInfo &Info) {
+ IO.mapRequired("Module", Info.Module);
+ IO.mapRequired("Field", Info.Field);
+ IO.mapRequired("Flags", Info.Flags);
+}
+
+void MappingTraits<WasmYAML::DylinkExportInfo>::mapping(
+ IO &IO, WasmYAML::DylinkExportInfo &Info) {
+ IO.mapRequired("Name", Info.Name);
+ IO.mapRequired("Flags", Info.Flags);
}
void ScalarBitSetTraits<WasmYAML::LimitFlags>::bitset(
@@ -561,6 +568,7 @@ void ScalarBitSetTraits<WasmYAML::SymbolFlags>::bitset(
BCaseMask(EXPORTED, EXPORTED);
BCaseMask(EXPLICIT_NAME, EXPLICIT_NAME);
BCaseMask(NO_STRIP, NO_STRIP);
+ BCaseMask(TLS, TLS);
#undef BCaseMask
}
diff --git a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp
index 14fea5437a32..85d1f82bfafc 100644
--- a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp
@@ -18,8 +18,9 @@
#include "llvm/ObjectYAML/ObjectYAML.h"
#include "llvm/ObjectYAML/yaml2obj.h"
#include "llvm/Support/EndianStream.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/LEB128.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -33,7 +34,7 @@ class XCOFFWriter {
public:
XCOFFWriter(XCOFFYAML::Object &Obj, raw_ostream &OS, yaml::ErrorHandler EH)
: Obj(Obj), W(OS, support::big), ErrHandler(EH),
- Strings(StringTableBuilder::XCOFF) {
+ StrTblBuilder(StringTableBuilder::XCOFF) {
Is64Bit = Obj.Header.Magic == (llvm::yaml::Hex16)XCOFF::XCOFF64;
}
bool writeXCOFF();
@@ -41,20 +42,24 @@ public:
private:
bool nameShouldBeInStringTable(StringRef SymbolName);
bool initFileHeader(uint64_t CurrentOffset);
+ void initAuxFileHeader();
bool initSectionHeader(uint64_t &CurrentOffset);
bool initRelocations(uint64_t &CurrentOffset);
+ bool initStringTable();
bool assignAddressesAndIndices();
void writeFileHeader();
+ void writeAuxFileHeader();
void writeSectionHeader();
bool writeSectionData();
bool writeRelocations();
bool writeSymbols();
+ void writeStringTable();
XCOFFYAML::Object &Obj;
bool Is64Bit = false;
support::endian::Writer W;
yaml::ErrorHandler ErrHandler;
- StringTableBuilder Strings;
+ StringTableBuilder StrTblBuilder;
uint64_t StartOffset;
// Map the section name to its corresponding section index.
DenseMap<StringRef, int16_t> SectionIndexMap = {
@@ -62,6 +67,7 @@ private:
{StringRef("N_ABS"), XCOFF::N_ABS},
{StringRef("N_UNDEF"), XCOFF::N_UNDEF}};
XCOFFYAML::FileHeader InitFileHdr = Obj.Header;
+ XCOFFYAML::AuxiliaryHeader InitAuxFileHdr;
std::vector<XCOFFYAML::Section> InitSections = Obj.Sections;
};
@@ -75,7 +81,8 @@ static void writeName(StringRef StrName, support::endian::Writer W) {
}
bool XCOFFWriter::nameShouldBeInStringTable(StringRef SymbolName) {
- return SymbolName.size() > XCOFF::NameSize;
+ // For XCOFF64: The symbol name is always in the string table.
+ return (SymbolName.size() > XCOFF::NameSize) || Is64Bit;
}
bool XCOFFWriter::initRelocations(uint64_t &CurrentOffset) {
@@ -83,8 +90,9 @@ bool XCOFFWriter::initRelocations(uint64_t &CurrentOffset) {
if (!InitSections[I].Relocations.empty()) {
InitSections[I].NumberOfRelocations = InitSections[I].Relocations.size();
InitSections[I].FileOffsetToRelocations = CurrentOffset;
- CurrentOffset += InitSections[I].NumberOfRelocations *
- XCOFF::RelocationSerializationSize32;
+ uint64_t RelSize = Is64Bit ? XCOFF::RelocationSerializationSize64
+ : XCOFF::RelocationSerializationSize32;
+ CurrentOffset += InitSections[I].NumberOfRelocations * RelSize;
if (CurrentOffset > MaxRawDataSize) {
ErrHandler("maximum object size of" + Twine(MaxRawDataSize) +
"exceeded when writing relocation data");
@@ -138,20 +146,79 @@ bool XCOFFWriter::initSectionHeader(uint64_t &CurrentOffset) {
return initRelocations(CurrentOffset);
}
+bool XCOFFWriter::initStringTable() {
+ if (Obj.StrTbl.RawContent) {
+ size_t RawSize = Obj.StrTbl.RawContent->binary_size();
+ if (Obj.StrTbl.Strings || Obj.StrTbl.Length) {
+ ErrHandler(
+ "can't specify Strings or Length when RawContent is specified");
+ return false;
+ }
+ if (Obj.StrTbl.ContentSize && *Obj.StrTbl.ContentSize < RawSize) {
+ ErrHandler("specified ContentSize (" + Twine(*Obj.StrTbl.ContentSize) +
+ ") is less than the RawContent data size (" + Twine(RawSize) +
+ ")");
+ return false;
+ }
+ return true;
+ }
+ if (Obj.StrTbl.ContentSize && *Obj.StrTbl.ContentSize <= 3) {
+ ErrHandler("ContentSize shouldn't be less than 4 without RawContent");
+ return false;
+ }
+
+ // Build the string table.
+ StrTblBuilder.clear();
+
+ if (Obj.StrTbl.Strings) {
+ // All specified strings should be added to the string table.
+ for (StringRef StringEnt : *Obj.StrTbl.Strings)
+ StrTblBuilder.add(StringEnt);
+
+ size_t StrTblIdx = 0;
+ size_t NumOfStrings = Obj.StrTbl.Strings->size();
+ for (XCOFFYAML::Symbol &YamlSym : Obj.Symbols) {
+ if (nameShouldBeInStringTable(YamlSym.SymbolName)) {
+ if (StrTblIdx < NumOfStrings) {
+ // Overwrite the symbol name with the specified string.
+ YamlSym.SymbolName = (*Obj.StrTbl.Strings)[StrTblIdx];
+ ++StrTblIdx;
+ } else
+ // Names that are not overwritten are still stored in the string
+ // table.
+ StrTblBuilder.add(YamlSym.SymbolName);
+ }
+ }
+ } else {
+ for (XCOFFYAML::Symbol &YamlSym : Obj.Symbols) {
+ if (nameShouldBeInStringTable(YamlSym.SymbolName))
+ StrTblBuilder.add(YamlSym.SymbolName);
+ }
+ }
+
+ StrTblBuilder.finalize();
+
+ size_t StrTblSize = StrTblBuilder.getSize();
+ if (Obj.StrTbl.ContentSize && *Obj.StrTbl.ContentSize < StrTblSize) {
+ ErrHandler("specified ContentSize (" + Twine(*Obj.StrTbl.ContentSize) +
+ ") is less than the size of the data that would otherwise be "
+ "written (" +
+ Twine(StrTblSize) + ")");
+ return false;
+ }
+
+ return true;
+}
+
bool XCOFFWriter::initFileHeader(uint64_t CurrentOffset) {
// The default format of the object file is XCOFF32.
InitFileHdr.Magic = XCOFF::XCOFF32;
InitFileHdr.NumberOfSections = Obj.Sections.size();
InitFileHdr.NumberOfSymTableEntries = Obj.Symbols.size();
- for (const XCOFFYAML::Symbol &YamlSym : Obj.Symbols) {
+ for (const XCOFFYAML::Symbol &YamlSym : Obj.Symbols)
// Add the number of auxiliary symbols to the total number.
InitFileHdr.NumberOfSymTableEntries += YamlSym.NumberOfAuxEntries;
- if (nameShouldBeInStringTable(YamlSym.SymbolName))
- Strings.add(YamlSym.SymbolName);
- }
- // Finalize the string table.
- Strings.finalize();
// Calculate SymbolTableOffset for the file header.
if (InitFileHdr.NumberOfSymTableEntries) {
@@ -168,17 +235,87 @@ bool XCOFFWriter::initFileHeader(uint64_t CurrentOffset) {
return true;
}
+void XCOFFWriter::initAuxFileHeader() {
+ InitAuxFileHdr = *Obj.AuxHeader;
+ // In general, an object file might contain multiple sections of a given type,
+ // but in a loadable module, there must be exactly one .text, .data, .bss, and
+ // .loader section. A loadable object might also have one .tdata section and
+ // one .tbss section.
+ // Set these section-related values if not set explicitly. We assume that the
+ // input YAML matches the format of the loadable object, but if multiple input
+ // sections still have the same type, the first section with that type
+ // prevails.
+ for (uint16_t I = 0, E = InitSections.size(); I < E; ++I) {
+ switch (InitSections[I].Flags) {
+ case XCOFF::STYP_TEXT:
+ if (!InitAuxFileHdr.TextSize)
+ InitAuxFileHdr.TextSize = InitSections[I].Size;
+ if (!InitAuxFileHdr.TextStartAddr)
+ InitAuxFileHdr.TextStartAddr = InitSections[I].Address;
+ if (!InitAuxFileHdr.SecNumOfText)
+ InitAuxFileHdr.SecNumOfText = I + 1;
+ break;
+ case XCOFF::STYP_DATA:
+ if (!InitAuxFileHdr.InitDataSize)
+ InitAuxFileHdr.InitDataSize = InitSections[I].Size;
+ if (!InitAuxFileHdr.DataStartAddr)
+ InitAuxFileHdr.DataStartAddr = InitSections[I].Address;
+ if (!InitAuxFileHdr.SecNumOfData)
+ InitAuxFileHdr.SecNumOfData = I + 1;
+ break;
+ case XCOFF::STYP_BSS:
+ if (!InitAuxFileHdr.BssDataSize)
+ InitAuxFileHdr.BssDataSize = InitSections[I].Size;
+ if (!InitAuxFileHdr.SecNumOfBSS)
+ InitAuxFileHdr.SecNumOfBSS = I + 1;
+ break;
+ case XCOFF::STYP_TDATA:
+ if (!InitAuxFileHdr.SecNumOfTData)
+ InitAuxFileHdr.SecNumOfTData = I + 1;
+ break;
+ case XCOFF::STYP_TBSS:
+ if (!InitAuxFileHdr.SecNumOfTBSS)
+ InitAuxFileHdr.SecNumOfTBSS = I + 1;
+ break;
+ case XCOFF::STYP_LOADER:
+ if (!InitAuxFileHdr.SecNumOfLoader)
+ InitAuxFileHdr.SecNumOfLoader = I + 1;
+ break;
+ default:
+ break;
+ }
+ }
+}
+
bool XCOFFWriter::assignAddressesAndIndices() {
- Strings.clear();
+ uint64_t FileHdrSize =
+ Is64Bit ? XCOFF::FileHeaderSize64 : XCOFF::FileHeaderSize32;
+ uint64_t AuxFileHdrSize = 0;
+ if (Obj.AuxHeader)
+ AuxFileHdrSize = Obj.Header.AuxHeaderSize
+ ? Obj.Header.AuxHeaderSize
+ : (Is64Bit ? XCOFF::AuxFileHeaderSize64
+ : XCOFF::AuxFileHeaderSize32);
+ uint64_t SecHdrSize =
+ Is64Bit ? XCOFF::SectionHeaderSize64 : XCOFF::SectionHeaderSize32;
uint64_t CurrentOffset =
- XCOFF::FileHeaderSize32 /* TODO: + auxiliaryHeaderSize() */ +
- InitSections.size() * XCOFF::SectionHeaderSize32;
+ FileHdrSize + AuxFileHdrSize + InitSections.size() * SecHdrSize;
// Calculate section header info.
if (!initSectionHeader(CurrentOffset))
return false;
+ InitFileHdr.AuxHeaderSize = AuxFileHdrSize;
+
// Calculate file header info.
- return initFileHeader(CurrentOffset);
+ if (!initFileHeader(CurrentOffset))
+ return false;
+
+ // Initialize the auxiliary file header.
+ if (Obj.AuxHeader)
+ initAuxFileHeader();
+
+ // Initialize the string table.
+ return initStringTable();
}
void XCOFFWriter::writeFileHeader() {
@@ -186,14 +323,86 @@ void XCOFFWriter::writeFileHeader() {
W.write<uint16_t>(Obj.Header.NumberOfSections ? Obj.Header.NumberOfSections
: InitFileHdr.NumberOfSections);
W.write<int32_t>(Obj.Header.TimeStamp);
- W.write<uint32_t>(Obj.Header.SymbolTableOffset
- ? Obj.Header.SymbolTableOffset
- : InitFileHdr.SymbolTableOffset);
- W.write<int32_t>(Obj.Header.NumberOfSymTableEntries
- ? Obj.Header.NumberOfSymTableEntries
- : InitFileHdr.NumberOfSymTableEntries);
- W.write<uint16_t>(Obj.Header.AuxHeaderSize);
- W.write<uint16_t>(Obj.Header.Flags);
+ if (Is64Bit) {
+ W.write<uint64_t>(Obj.Header.SymbolTableOffset
+ ? Obj.Header.SymbolTableOffset
+ : InitFileHdr.SymbolTableOffset);
+ W.write<uint16_t>(InitFileHdr.AuxHeaderSize);
+ W.write<uint16_t>(Obj.Header.Flags);
+ W.write<int32_t>(Obj.Header.NumberOfSymTableEntries
+ ? Obj.Header.NumberOfSymTableEntries
+ : InitFileHdr.NumberOfSymTableEntries);
+ } else {
+ W.write<uint32_t>(Obj.Header.SymbolTableOffset
+ ? Obj.Header.SymbolTableOffset
+ : InitFileHdr.SymbolTableOffset);
+ W.write<int32_t>(Obj.Header.NumberOfSymTableEntries
+ ? Obj.Header.NumberOfSymTableEntries
+ : InitFileHdr.NumberOfSymTableEntries);
+ W.write<uint16_t>(InitFileHdr.AuxHeaderSize);
+ W.write<uint16_t>(Obj.Header.Flags);
+ }
+}
+
+void XCOFFWriter::writeAuxFileHeader() {
+ W.write<uint16_t>(InitAuxFileHdr.Magic.getValueOr(yaml::Hex16(1)));
+ W.write<uint16_t>(InitAuxFileHdr.Version.getValueOr(yaml::Hex16(1)));
+ if (Is64Bit) {
+ W.OS.write_zeros(4); // Reserved for debugger.
+ W.write<uint64_t>(InitAuxFileHdr.TextStartAddr.getValueOr(yaml::Hex64(0)));
+ W.write<uint64_t>(InitAuxFileHdr.DataStartAddr.getValueOr(yaml::Hex64(0)));
+ W.write<uint64_t>(InitAuxFileHdr.TOCAnchorAddr.getValueOr(yaml::Hex64(0)));
+ } else {
+ W.write<uint32_t>(InitAuxFileHdr.TextSize.getValueOr(yaml::Hex64(0)));
+ W.write<uint32_t>(InitAuxFileHdr.InitDataSize.getValueOr(yaml::Hex64(0)));
+ W.write<uint32_t>(InitAuxFileHdr.BssDataSize.getValueOr(yaml::Hex64(0)));
+ W.write<uint32_t>(InitAuxFileHdr.EntryPointAddr.getValueOr(yaml::Hex64(0)));
+ W.write<uint32_t>(InitAuxFileHdr.TextStartAddr.getValueOr(yaml::Hex64(0)));
+ W.write<uint32_t>(InitAuxFileHdr.DataStartAddr.getValueOr(yaml::Hex64(0)));
+ W.write<uint32_t>(InitAuxFileHdr.TOCAnchorAddr.getValueOr(yaml::Hex64(0)));
+ }
+ W.write<uint16_t>(InitAuxFileHdr.SecNumOfEntryPoint.getValueOr(0));
+ W.write<uint16_t>(InitAuxFileHdr.SecNumOfText.getValueOr(0));
+ W.write<uint16_t>(InitAuxFileHdr.SecNumOfData.getValueOr(0));
+ W.write<uint16_t>(InitAuxFileHdr.SecNumOfTOC.getValueOr(0));
+ W.write<uint16_t>(InitAuxFileHdr.SecNumOfLoader.getValueOr(0));
+ W.write<uint16_t>(InitAuxFileHdr.SecNumOfBSS.getValueOr(0));
+ W.write<uint16_t>(InitAuxFileHdr.MaxAlignOfText.getValueOr(yaml::Hex16(0)));
+ W.write<uint16_t>(InitAuxFileHdr.MaxAlignOfData.getValueOr(yaml::Hex16(0)));
+ W.write<uint16_t>(InitAuxFileHdr.ModuleType.getValueOr(yaml::Hex16(0)));
+ W.write<uint8_t>(InitAuxFileHdr.CpuFlag.getValueOr(yaml::Hex8(0)));
+ W.write<uint8_t>(0); // Reserved for CPU type.
+ if (Is64Bit) {
+ W.write<uint8_t>(InitAuxFileHdr.TextPageSize.getValueOr(yaml::Hex8(0)));
+ W.write<uint8_t>(InitAuxFileHdr.DataPageSize.getValueOr(yaml::Hex8(0)));
+ W.write<uint8_t>(InitAuxFileHdr.StackPageSize.getValueOr(yaml::Hex8(0)));
+ W.write<uint8_t>(
+ InitAuxFileHdr.FlagAndTDataAlignment.getValueOr(yaml::Hex8(0x80)));
+ W.write<uint64_t>(InitAuxFileHdr.TextSize.getValueOr(yaml::Hex64(0)));
+ W.write<uint64_t>(InitAuxFileHdr.InitDataSize.getValueOr(yaml::Hex64(0)));
+ W.write<uint64_t>(InitAuxFileHdr.BssDataSize.getValueOr(yaml::Hex64(0)));
+ W.write<uint64_t>(InitAuxFileHdr.EntryPointAddr.getValueOr(yaml::Hex64(0)));
+ W.write<uint64_t>(InitAuxFileHdr.MaxStackSize.getValueOr(yaml::Hex64(0)));
+ W.write<uint64_t>(InitAuxFileHdr.MaxDataSize.getValueOr(yaml::Hex64(0)));
+ } else {
+ W.write<uint32_t>(InitAuxFileHdr.MaxStackSize.getValueOr(yaml::Hex64(0)));
+ W.write<uint32_t>(InitAuxFileHdr.MaxDataSize.getValueOr(yaml::Hex64(0)));
+ W.OS.write_zeros(4); // Reserved for debugger.
+ W.write<uint8_t>(InitAuxFileHdr.TextPageSize.getValueOr(yaml::Hex8(0)));
+ W.write<uint8_t>(InitAuxFileHdr.DataPageSize.getValueOr(yaml::Hex8(0)));
+ W.write<uint8_t>(InitAuxFileHdr.StackPageSize.getValueOr(yaml::Hex8(0)));
+ W.write<uint8_t>(
+ InitAuxFileHdr.FlagAndTDataAlignment.getValueOr(yaml::Hex8(0)));
+ }
+ W.write<uint16_t>(InitAuxFileHdr.SecNumOfTData.getValueOr(0));
+ W.write<uint16_t>(InitAuxFileHdr.SecNumOfTBSS.getValueOr(0));
+ if (Is64Bit) {
+ W.write<uint16_t>(InitAuxFileHdr.Flag.getValueOr(yaml::Hex16(XCOFF::SHR_SYMTAB)));
+ if (InitFileHdr.AuxHeaderSize > XCOFF::AuxFileHeaderSize64)
+ W.OS.write_zeros(InitFileHdr.AuxHeaderSize - XCOFF::AuxFileHeaderSize64);
+ } else if (InitFileHdr.AuxHeaderSize > XCOFF::AuxFileHeaderSize32) {
+ W.OS.write_zeros(InitFileHdr.AuxHeaderSize - XCOFF::AuxFileHeaderSize32);
+ }
}
void XCOFFWriter::writeSectionHeader() {
@@ -202,22 +411,40 @@ void XCOFFWriter::writeSectionHeader() {
XCOFFYAML::Section DerivedSec = InitSections[I];
writeName(YamlSec.SectionName, W);
// Virtual address is the same as physical address.
- uint32_t SectionAddress =
+ uint64_t SectionAddress =
YamlSec.Address ? YamlSec.Address : DerivedSec.Address;
- W.write<uint32_t>(SectionAddress); // Physical address
- W.write<uint32_t>(SectionAddress); // Virtual address
- W.write<uint32_t>(YamlSec.Size ? YamlSec.Size : DerivedSec.Size);
- W.write<uint32_t>(YamlSec.FileOffsetToData ? YamlSec.FileOffsetToData
- : DerivedSec.FileOffsetToData);
- W.write<uint32_t>(YamlSec.FileOffsetToRelocations
- ? YamlSec.FileOffsetToRelocations
- : DerivedSec.FileOffsetToRelocations);
- W.write<uint32_t>(YamlSec.FileOffsetToLineNumbers);
- W.write<uint16_t>(YamlSec.NumberOfRelocations
- ? YamlSec.NumberOfRelocations
- : DerivedSec.NumberOfRelocations);
- W.write<uint16_t>(YamlSec.NumberOfLineNumbers);
- W.write<int32_t>(YamlSec.Flags);
+ if (Is64Bit) {
+ W.write<uint64_t>(SectionAddress); // Physical address
+ W.write<uint64_t>(SectionAddress); // Virtual address
+ W.write<uint64_t>(YamlSec.Size ? YamlSec.Size : DerivedSec.Size);
+ W.write<uint64_t>(YamlSec.FileOffsetToData ? YamlSec.FileOffsetToData
+ : DerivedSec.FileOffsetToData);
+ W.write<uint64_t>(YamlSec.FileOffsetToRelocations
+ ? YamlSec.FileOffsetToRelocations
+ : DerivedSec.FileOffsetToRelocations);
+ W.write<uint64_t>(YamlSec.FileOffsetToLineNumbers);
+ W.write<uint32_t>(YamlSec.NumberOfRelocations
+ ? YamlSec.NumberOfRelocations
+ : DerivedSec.NumberOfRelocations);
+ W.write<uint32_t>(YamlSec.NumberOfLineNumbers);
+ W.write<int32_t>(YamlSec.Flags);
+ W.OS.write_zeros(4);
+ } else {
+ W.write<uint32_t>(SectionAddress); // Physical address
+ W.write<uint32_t>(SectionAddress); // Virtual address
+ W.write<uint32_t>(YamlSec.Size ? YamlSec.Size : DerivedSec.Size);
+ W.write<uint32_t>(YamlSec.FileOffsetToData ? YamlSec.FileOffsetToData
+ : DerivedSec.FileOffsetToData);
+ W.write<uint32_t>(YamlSec.FileOffsetToRelocations
+ ? YamlSec.FileOffsetToRelocations
+ : DerivedSec.FileOffsetToRelocations);
+ W.write<uint32_t>(YamlSec.FileOffsetToLineNumbers);
+ W.write<uint16_t>(YamlSec.NumberOfRelocations
+ ? YamlSec.NumberOfRelocations
+ : DerivedSec.NumberOfRelocations);
+ W.write<uint16_t>(YamlSec.NumberOfLineNumbers);
+ W.write<int32_t>(YamlSec.Flags);
+ }
}
}
@@ -232,8 +459,7 @@ bool XCOFFWriter::writeSectionData() {
ErrHandler("redundant data was written before section data");
return false;
}
- if (PaddingSize > 0)
- W.OS.write_zeros(PaddingSize);
+ W.OS.write_zeros(PaddingSize);
YamlSec.SectionData.writeAsBinary(W.OS);
}
}
@@ -250,10 +476,12 @@ bool XCOFFWriter::writeRelocations() {
ErrHandler("redundant data was written before relocations");
return false;
}
- if (PaddingSize > 0)
- W.OS.write_zeros(PaddingSize);
+ W.OS.write_zeros(PaddingSize);
for (const XCOFFYAML::Relocation &YamlRel : YamlSec.Relocations) {
- W.write<uint32_t>(YamlRel.VirtualAddress);
+ if (Is64Bit)
+ W.write<uint64_t>(YamlRel.VirtualAddress);
+ else
+ W.write<uint32_t>(YamlRel.VirtualAddress);
W.write<uint32_t>(YamlRel.SymbolIndex);
W.write<uint8_t>(YamlRel.Info);
W.write<uint8_t>(YamlRel.Type);
@@ -270,20 +498,39 @@ bool XCOFFWriter::writeSymbols() {
ErrHandler("redundant data was written before symbols");
return false;
}
- if (PaddingSize > 0)
- W.OS.write_zeros(PaddingSize);
+ W.OS.write_zeros(PaddingSize);
for (const XCOFFYAML::Symbol &YamlSym : Obj.Symbols) {
- if (nameShouldBeInStringTable(YamlSym.SymbolName)) {
- // For XCOFF32: A value of 0 indicates that the symbol name is in the
- // string table.
- W.write<int32_t>(0);
- W.write<uint32_t>(Strings.getOffset(YamlSym.SymbolName));
+ if (Is64Bit) {
+ W.write<uint64_t>(YamlSym.Value);
+ W.write<uint32_t>(StrTblBuilder.getOffset(YamlSym.SymbolName));
+ } else {
+ if (nameShouldBeInStringTable(YamlSym.SymbolName)) {
+ // For XCOFF32: A value of 0 indicates that the symbol name is in the
+ // string table.
+ W.write<int32_t>(0);
+ W.write<uint32_t>(StrTblBuilder.getOffset(YamlSym.SymbolName));
+ } else {
+ writeName(YamlSym.SymbolName, W);
+ }
+ W.write<uint32_t>(YamlSym.Value);
+ }
+ if (YamlSym.SectionName) {
+ if (!SectionIndexMap.count(*YamlSym.SectionName)) {
+ ErrHandler("the SectionName " + *YamlSym.SectionName +
+ " specified in the symbol does not exist");
+ return false;
+ }
+ if (YamlSym.SectionIndex &&
+ SectionIndexMap[*YamlSym.SectionName] != *YamlSym.SectionIndex) {
+ ErrHandler("the SectionName " + *YamlSym.SectionName +
+ " and the SectionIndex (" + Twine(*YamlSym.SectionIndex) +
+ ") refer to different sections");
+ return false;
+ }
+ W.write<int16_t>(SectionIndexMap[*YamlSym.SectionName]);
} else {
- writeName(YamlSym.SymbolName, W);
+ W.write<int16_t>(YamlSym.SectionIndex ? *YamlSym.SectionIndex : 0);
}
- W.write<uint32_t>(YamlSym.Value);
- W.write<int16_t>(
- YamlSym.SectionName.size() ? SectionIndexMap[YamlSym.SectionName] : 0);
W.write<uint16_t>(YamlSym.Type);
W.write<uint8_t>(YamlSym.StorageClass);
W.write<uint8_t>(YamlSym.NumberOfAuxEntries);
@@ -295,21 +542,61 @@ bool XCOFFWriter::writeSymbols() {
// length of each auxiliary entry is the same as a symbol table entry (18
// bytes). The format and quantity of auxiliary entries depend on the
// storage class (n_sclass) and type (n_type) of the symbol table entry.
- W.OS.write_zeros(18);
+ W.OS.write_zeros(XCOFF::SymbolTableEntrySize);
}
}
return true;
}
-bool XCOFFWriter::writeXCOFF() {
- if (Is64Bit) {
- ErrHandler("only XCOFF32 is currently supported");
- return false;
+void XCOFFWriter::writeStringTable() {
+ if (Obj.StrTbl.RawContent) {
+ Obj.StrTbl.RawContent->writeAsBinary(W.OS);
+ if (Obj.StrTbl.ContentSize) {
+ assert(*Obj.StrTbl.ContentSize >= Obj.StrTbl.RawContent->binary_size() &&
+ "Specified ContentSize is less than the RawContent size.");
+ W.OS.write_zeros(*Obj.StrTbl.ContentSize -
+ Obj.StrTbl.RawContent->binary_size());
+ }
+ return;
+ }
+
+ size_t StrTblBuilderSize = StrTblBuilder.getSize();
+ // If neither Length nor ContentSize is specified, write the StrTblBuilder
+ // directly, which contains the auto-generated Length value.
+ if (!Obj.StrTbl.Length && !Obj.StrTbl.ContentSize) {
+ if (StrTblBuilderSize <= 4)
+ return;
+ StrTblBuilder.write(W.OS);
+ return;
+ }
+
+ // Serialize the string table's content to a temporary buffer.
+ std::unique_ptr<WritableMemoryBuffer> Buf =
+ WritableMemoryBuffer::getNewMemBuffer(StrTblBuilderSize);
+ uint8_t *Ptr = reinterpret_cast<uint8_t *>(Buf->getBufferStart());
+ StrTblBuilder.write(Ptr);
+ // Replace the first 4 bytes, which contain the auto-generated Length value,
+ // with the specified value.
+ memset(Ptr, 0, 4);
+ support::endian::write32be(Ptr, Obj.StrTbl.Length ? *Obj.StrTbl.Length
+ : *Obj.StrTbl.ContentSize);
+ // Copy the buffer content to the actual output stream.
+ W.OS.write(Buf->getBufferStart(), Buf->getBufferSize());
+ // Add zeros as padding after strings.
+ if (Obj.StrTbl.ContentSize) {
+ assert(*Obj.StrTbl.ContentSize >= StrTblBuilderSize &&
+ "Specified ContentSize is less than the StringTableBuilder size.");
+ W.OS.write_zeros(*Obj.StrTbl.ContentSize - StrTblBuilderSize);
}
+}
+
+bool XCOFFWriter::writeXCOFF() {
if (!assignAddressesAndIndices())
return false;
StartOffset = W.OS.tell();
writeFileHeader();
+ if (Obj.AuxHeader)
+ writeAuxFileHeader();
if (!Obj.Sections.empty()) {
writeSectionHeader();
if (!writeSectionData())
@@ -319,9 +606,7 @@ bool XCOFFWriter::writeXCOFF() {
}
if (!Obj.Symbols.empty() && !writeSymbols())
return false;
- // Write the string table.
- if (Strings.getSize() > 4)
- Strings.write(W.OS);
+ writeStringTable();
return true;
}
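Much of the XCOFF emitter rework funnels symbol names through a StringTableBuilder and defers the decision of how the table is written. A small stand-alone sketch of the builder's add/finalize/getOffset/write flow follows; the symbol names are arbitrary.

#include "llvm/ADT/SmallString.h"
#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::StringTableBuilder Builder(llvm::StringTableBuilder::XCOFF);

  // Add every name that must live in the table (long names, or all symbol
  // names for XCOFF64), then finalize before querying any offsets.
  Builder.add("a_rather_long_symbol_name");
  Builder.add(".text_helper");
  Builder.finalize();

  llvm::outs() << "table size: " << Builder.getSize() << "\n";
  llvm::outs() << "offset of .text_helper: "
               << Builder.getOffset(".text_helper") << "\n";

  // Serialize to a buffer; per the writeStringTable comments above, the XCOFF
  // flavour keeps the table length in the first four bytes.
  llvm::SmallString<128> Blob;
  llvm::raw_svector_ostream OS(Blob);
  Builder.write(OS);
  llvm::outs() << "serialized bytes: " << Blob.size() << "\n";
  return 0;
}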
diff --git a/llvm/lib/ObjectYAML/XCOFFYAML.cpp b/llvm/lib/ObjectYAML/XCOFFYAML.cpp
index 73d188e274b1..221cf3b064c0 100644
--- a/llvm/lib/ObjectYAML/XCOFFYAML.cpp
+++ b/llvm/lib/ObjectYAML/XCOFFYAML.cpp
@@ -118,6 +118,37 @@ void MappingTraits<XCOFFYAML::FileHeader>::mapping(
IO.mapOptional("Flags", FileHdr.Flags);
}
+void MappingTraits<XCOFFYAML::AuxiliaryHeader>::mapping(
+ IO &IO, XCOFFYAML::AuxiliaryHeader &AuxHdr) {
+ IO.mapOptional("Magic", AuxHdr.Magic);
+ IO.mapOptional("Version", AuxHdr.Version);
+ IO.mapOptional("TextStartAddr", AuxHdr.TextStartAddr);
+ IO.mapOptional("DataStartAddr", AuxHdr.DataStartAddr);
+ IO.mapOptional("TOCAnchorAddr", AuxHdr.TOCAnchorAddr);
+ IO.mapOptional("TextSectionSize", AuxHdr.TextSize);
+ IO.mapOptional("DataSectionSize", AuxHdr.InitDataSize);
+ IO.mapOptional("BssSectionSize", AuxHdr.BssDataSize);
+ IO.mapOptional("SecNumOfEntryPoint", AuxHdr.SecNumOfEntryPoint);
+ IO.mapOptional("SecNumOfText", AuxHdr.SecNumOfText);
+ IO.mapOptional("SecNumOfData", AuxHdr.SecNumOfData);
+ IO.mapOptional("SecNumOfTOC", AuxHdr.SecNumOfTOC);
+ IO.mapOptional("SecNumOfLoader", AuxHdr.SecNumOfLoader);
+ IO.mapOptional("SecNumOfBSS", AuxHdr.SecNumOfBSS);
+ IO.mapOptional("MaxAlignOfText", AuxHdr.MaxAlignOfText);
+ IO.mapOptional("MaxAlignOfData", AuxHdr.MaxAlignOfData);
+ IO.mapOptional("ModuleType", AuxHdr.CpuFlag);
+ IO.mapOptional("TextPageSize", AuxHdr.TextPageSize);
+ IO.mapOptional("DataPageSize", AuxHdr.DataPageSize);
+ IO.mapOptional("StackPageSize", AuxHdr.StackPageSize);
+ IO.mapOptional("FlagAndTDataAlignment", AuxHdr.FlagAndTDataAlignment);
+ IO.mapOptional("EntryPointAddr", AuxHdr.EntryPointAddr);
+ IO.mapOptional("MaxStackSize", AuxHdr.MaxStackSize);
+ IO.mapOptional("MaxDataSize", AuxHdr.MaxDataSize);
+ IO.mapOptional("SecNumOfTData", AuxHdr.SecNumOfTData);
+ IO.mapOptional("SecNumOfTBSS", AuxHdr.SecNumOfTBSS);
+ IO.mapOptional("Flag", AuxHdr.Flag);
+}
+
void MappingTraits<XCOFFYAML::Relocation>::mapping(IO &IO,
XCOFFYAML::Relocation &R) {
IO.mapOptional("Address", R.VirtualAddress);
@@ -143,19 +174,29 @@ void MappingTraits<XCOFFYAML::Section>::mapping(IO &IO,
}
void MappingTraits<XCOFFYAML::Symbol>::mapping(IO &IO, XCOFFYAML::Symbol &S) {
- IO.mapRequired("Name", S.SymbolName);
+ IO.mapOptional("Name", S.SymbolName);
IO.mapOptional("Value", S.Value);
IO.mapOptional("Section", S.SectionName);
+ IO.mapOptional("SectionIndex", S.SectionIndex);
IO.mapOptional("Type", S.Type);
IO.mapOptional("StorageClass", S.StorageClass);
IO.mapOptional("NumberOfAuxEntries", S.NumberOfAuxEntries);
}
+void MappingTraits<XCOFFYAML::StringTable>::mapping(IO &IO, XCOFFYAML::StringTable &Str) {
+ IO.mapOptional("ContentSize", Str.ContentSize);
+ IO.mapOptional("Length", Str.Length);
+ IO.mapOptional("Strings", Str.Strings);
+ IO.mapOptional("RawContent", Str.RawContent);
+}
+
void MappingTraits<XCOFFYAML::Object>::mapping(IO &IO, XCOFFYAML::Object &Obj) {
IO.mapTag("!XCOFF", true);
IO.mapRequired("FileHeader", Obj.Header);
+ IO.mapOptional("AuxiliaryHeader", Obj.AuxHeader);
IO.mapOptional("Sections", Obj.Sections);
IO.mapOptional("Symbols", Obj.Symbols);
+ IO.mapOptional("StringTable", Obj.StrTbl);
}
} // namespace yaml
diff --git a/llvm/lib/Option/OptTable.cpp b/llvm/lib/Option/OptTable.cpp
index f5bf166e9e65..37c2fcbab181 100644
--- a/llvm/lib/Option/OptTable.cpp
+++ b/llvm/lib/Option/OptTable.cpp
@@ -104,11 +104,11 @@ OptTable::OptTable(ArrayRef<Info> OptionInfos, bool IgnoreCase)
for (unsigned i = 0, e = getNumOptions(); i != e; ++i) {
unsigned Kind = getInfo(i + 1).Kind;
if (Kind == Option::InputClass) {
- assert(!TheInputOptionID && "Cannot have multiple input options!");
- TheInputOptionID = getInfo(i + 1).ID;
+ assert(!InputOptionID && "Cannot have multiple input options!");
+ InputOptionID = getInfo(i + 1).ID;
} else if (Kind == Option::UnknownClass) {
- assert(!TheUnknownOptionID && "Cannot have multiple unknown options!");
- TheUnknownOptionID = getInfo(i + 1).ID;
+ assert(!UnknownOptionID && "Cannot have multiple unknown options!");
+ UnknownOptionID = getInfo(i + 1).ID;
} else if (Kind != Option::GroupClass) {
FirstSearchableIndex = i;
break;
@@ -337,13 +337,14 @@ bool OptTable::addValues(const char *Option, const char *Values) {
// GroupedShortOptions is true, -a matches "-abc" and the argument in Args will
// be updated to "-bc". This overload does not support
// FlagsToInclude/FlagsToExclude or case insensitive options.
-Arg *OptTable::parseOneArgGrouped(InputArgList &Args, unsigned &Index) const {
+std::unique_ptr<Arg> OptTable::parseOneArgGrouped(InputArgList &Args,
+ unsigned &Index) const {
// Anything that doesn't start with PrefixesUnion is an input, as is '-'
// itself.
const char *CStr = Args.getArgString(Index);
StringRef Str(CStr);
if (isInput(PrefixesUnion, Str))
- return new Arg(getOption(TheInputOptionID), Str, Index++, CStr);
+ return std::make_unique<Arg>(getOption(InputOptionID), Str, Index++, CStr);
const Info *End = OptionInfos.data() + OptionInfos.size();
StringRef Name = Str.ltrim(PrefixChars);
@@ -359,8 +360,9 @@ Arg *OptTable::parseOneArgGrouped(InputArgList &Args, unsigned &Index) const {
continue;
Option Opt(Start, this);
- if (Arg *A = Opt.accept(Args, StringRef(Args.getArgString(Index), ArgSize),
- false, Index))
+ if (std::unique_ptr<Arg> A =
+ Opt.accept(Args, StringRef(Args.getArgString(Index), ArgSize),
+ /*GroupedShortOption=*/false, Index))
return A;
// If Opt is a Flag of length 2 (e.g. "-a"), we know it is a prefix of
@@ -375,28 +377,39 @@ Arg *OptTable::parseOneArgGrouped(InputArgList &Args, unsigned &Index) const {
}
if (Fallback) {
Option Opt(Fallback, this);
- if (Arg *A = Opt.accept(Args, Str.substr(0, 2), true, Index)) {
- if (Str.size() == 2)
- ++Index;
- else
- Args.replaceArgString(Index, Twine('-') + Str.substr(2));
+ // Check that the last option isn't a flag wrongly given an argument.
+ if (Str[2] == '=')
+ return std::make_unique<Arg>(getOption(UnknownOptionID), Str, Index++,
+ CStr);
+
+ if (std::unique_ptr<Arg> A = Opt.accept(
+ Args, Str.substr(0, 2), /*GroupedShortOption=*/true, Index)) {
+ Args.replaceArgString(Index, Twine('-') + Str.substr(2));
return A;
}
}
- return new Arg(getOption(TheUnknownOptionID), Str, Index++, CStr);
+ // In the case of an incorrect short option, extract the character and move to
+ // the next one.
+ if (Str[1] != '-') {
+ CStr = Args.MakeArgString(Str.substr(0, 2));
+ Args.replaceArgString(Index, Twine('-') + Str.substr(2));
+ return std::make_unique<Arg>(getOption(UnknownOptionID), CStr, Index, CStr);
+ }
+
+ return std::make_unique<Arg>(getOption(UnknownOptionID), Str, Index++, CStr);
}
-Arg *OptTable::ParseOneArg(const ArgList &Args, unsigned &Index,
- unsigned FlagsToInclude,
- unsigned FlagsToExclude) const {
+std::unique_ptr<Arg> OptTable::ParseOneArg(const ArgList &Args, unsigned &Index,
+ unsigned FlagsToInclude,
+ unsigned FlagsToExclude) const {
unsigned Prev = Index;
const char *Str = Args.getArgString(Index);
// Anything that doesn't start with PrefixesUnion is an input, as is '-'
// itself.
if (isInput(PrefixesUnion, Str))
- return new Arg(getOption(TheInputOptionID), Str, Index++, Str);
+ return std::make_unique<Arg>(getOption(InputOptionID), Str, Index++, Str);
const Info *Start = OptionInfos.data() + FirstSearchableIndex;
const Info *End = OptionInfos.data() + OptionInfos.size();
@@ -430,8 +443,9 @@ Arg *OptTable::ParseOneArg(const ArgList &Args, unsigned &Index,
continue;
// See if this option matches.
- if (Arg *A = Opt.accept(Args, StringRef(Args.getArgString(Index), ArgSize),
- false, Index))
+ if (std::unique_ptr<Arg> A =
+ Opt.accept(Args, StringRef(Args.getArgString(Index), ArgSize),
+ /*GroupedShortOption=*/false, Index))
return A;
// Otherwise, see if this argument was missing values.
@@ -442,9 +456,9 @@ Arg *OptTable::ParseOneArg(const ArgList &Args, unsigned &Index,
// If we failed to find an option and this arg started with /, then it's
// probably an input path.
if (Str[0] == '/')
- return new Arg(getOption(TheInputOptionID), Str, Index++, Str);
+ return std::make_unique<Arg>(getOption(InputOptionID), Str, Index++, Str);
- return new Arg(getOption(TheUnknownOptionID), Str, Index++, Str);
+ return std::make_unique<Arg>(getOption(UnknownOptionID), Str, Index++, Str);
}
InputArgList OptTable::ParseArgs(ArrayRef<const char *> ArgArr,
@@ -472,7 +486,7 @@ InputArgList OptTable::ParseArgs(ArrayRef<const char *> ArgArr,
}
unsigned Prev = Index;
- Arg *A = GroupedShortOptions
+ std::unique_ptr<Arg> A = GroupedShortOptions
? parseOneArgGrouped(Args, Index)
: ParseOneArg(Args, Index, FlagsToInclude, FlagsToExclude);
assert((Index > Prev || GroupedShortOptions) &&
@@ -487,7 +501,7 @@ InputArgList OptTable::ParseArgs(ArrayRef<const char *> ArgArr,
break;
}
- Args.append(A);
+ Args.append(A.release());
}
return Args;
@@ -654,7 +668,7 @@ void OptTable::printHelp(raw_ostream &OS, const char *Usage, const char *Title,
HelpText = getOptionHelpText(Alias.getID());
}
- if (HelpText) {
+ if (HelpText && (strlen(HelpText) != 0)) {
const char *HelpGroup = getOptionHelpGroup(*this, Id);
const std::string &OptName = getOptionHelpName(*this, Id);
GroupedOptionHelp[HelpGroup].push_back({OptName, HelpText});
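The OptTable changes make the single-argument parsers hand back std::unique_ptr<Arg> and call release() only at the point where InputArgList, which still stores raw pointers, takes ownership. The sketch below reduces that handoff to a self-contained form; Token and TokenList are invented stand-ins.

#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Token {
  std::string Spelling;
  explicit Token(std::string S) : Spelling(std::move(S)) {}
};

// Stand-in for a container that, like InputArgList, owns raw pointers and
// deletes them itself.
class TokenList {
  std::vector<Token *> Tokens;

public:
  void append(Token *T) { Tokens.push_back(T); }
  size_t size() const { return Tokens.size(); }
  ~TokenList() {
    for (Token *T : Tokens)
      delete T;
  }
};

// The parser owns its result until the caller decides what to do with it, so
// early returns and error paths cannot leak.
static std::unique_ptr<Token> parseOne(const std::string &Raw) {
  if (Raw.empty())
    return nullptr; // Nothing leaks on the failure path.
  return std::make_unique<Token>(Raw);
}

int main() {
  TokenList List;
  const char *Raws[] = {"-a", "-b"};
  for (const char *Raw : Raws)
    if (std::unique_ptr<Token> T = parseOne(Raw))
      List.append(T.release()); // Ownership transfers to the list.
  std::cout << List.size() << " tokens\n"; // 2 tokens
  return 0;
}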
diff --git a/llvm/lib/Option/Option.cpp b/llvm/lib/Option/Option.cpp
index 68d074b2702e..ebdba8949223 100644
--- a/llvm/lib/Option/Option.cpp
+++ b/llvm/lib/Option/Option.cpp
@@ -106,23 +106,24 @@ bool Option::matches(OptSpecifier Opt) const {
return false;
}
-Arg *Option::acceptInternal(const ArgList &Args, StringRef Spelling,
- unsigned &Index) const {
+std::unique_ptr<Arg> Option::acceptInternal(const ArgList &Args,
+ StringRef Spelling,
+ unsigned &Index) const {
size_t ArgSize = Spelling.size();
switch (getKind()) {
case FlagClass: {
if (ArgSize != strlen(Args.getArgString(Index)))
return nullptr;
- return new Arg(*this, Spelling, Index++);
+ return std::make_unique<Arg>(*this, Spelling, Index++);
}
case JoinedClass: {
const char *Value = Args.getArgString(Index) + ArgSize;
- return new Arg(*this, Spelling, Index++, Value);
+ return std::make_unique<Arg>(*this, Spelling, Index++, Value);
}
case CommaJoinedClass: {
// Always matches.
const char *Str = Args.getArgString(Index) + ArgSize;
- Arg *A = new Arg(*this, Spelling, Index++);
+ auto A = std::make_unique<Arg>(*this, Spelling, Index++);
// Parse out the comma separated values.
const char *Prev = Str;
@@ -158,7 +159,8 @@ Arg *Option::acceptInternal(const ArgList &Args, StringRef Spelling,
Args.getArgString(Index - 1) == nullptr)
return nullptr;
- return new Arg(*this, Spelling, Index - 2, Args.getArgString(Index - 1));
+ return std::make_unique<Arg>(*this, Spelling, Index - 2,
+ Args.getArgString(Index - 1));
case MultiArgClass: {
// Matches iff this is an exact match.
// FIXME: Avoid strlen.
@@ -169,8 +171,8 @@ Arg *Option::acceptInternal(const ArgList &Args, StringRef Spelling,
if (Index > Args.getNumInputArgStrings())
return nullptr;
- Arg *A = new Arg(*this, Spelling, Index - 1 - getNumArgs(),
- Args.getArgString(Index - getNumArgs()));
+ auto A = std::make_unique<Arg>(*this, Spelling, Index - 1 - getNumArgs(),
+ Args.getArgString(Index - getNumArgs()));
for (unsigned i = 1; i != getNumArgs(); ++i)
A->getValues().push_back(Args.getArgString(Index - getNumArgs() + i));
return A;
@@ -180,7 +182,7 @@ Arg *Option::acceptInternal(const ArgList &Args, StringRef Spelling,
// FIXME: Avoid strlen.
if (ArgSize != strlen(Args.getArgString(Index))) {
const char *Value = Args.getArgString(Index) + ArgSize;
- return new Arg(*this, Spelling, Index++, Value);
+ return std::make_unique<Arg>(*this, Spelling, Index++, Value);
}
// Otherwise it must be separate.
@@ -189,7 +191,8 @@ Arg *Option::acceptInternal(const ArgList &Args, StringRef Spelling,
Args.getArgString(Index - 1) == nullptr)
return nullptr;
- return new Arg(*this, Spelling, Index - 2, Args.getArgString(Index - 1));
+ return std::make_unique<Arg>(*this, Spelling, Index - 2,
+ Args.getArgString(Index - 1));
}
case JoinedAndSeparateClass:
// Always matches.
@@ -198,22 +201,22 @@ Arg *Option::acceptInternal(const ArgList &Args, StringRef Spelling,
Args.getArgString(Index - 1) == nullptr)
return nullptr;
- return new Arg(*this, Spelling, Index - 2,
- Args.getArgString(Index - 2) + ArgSize,
- Args.getArgString(Index - 1));
+ return std::make_unique<Arg>(*this, Spelling, Index - 2,
+ Args.getArgString(Index - 2) + ArgSize,
+ Args.getArgString(Index - 1));
case RemainingArgsClass: {
// Matches iff this is an exact match.
// FIXME: Avoid strlen.
if (ArgSize != strlen(Args.getArgString(Index)))
return nullptr;
- Arg *A = new Arg(*this, Spelling, Index++);
+ auto A = std::make_unique<Arg>(*this, Spelling, Index++);
while (Index < Args.getNumInputArgStrings() &&
Args.getArgString(Index) != nullptr)
A->getValues().push_back(Args.getArgString(Index++));
return A;
}
case RemainingArgsJoinedClass: {
- Arg *A = new Arg(*this, Spelling, Index);
+ auto A = std::make_unique<Arg>(*this, Spelling, Index);
if (ArgSize != strlen(Args.getArgString(Index))) {
// An inexact match means there is a joined arg.
A->getValues().push_back(Args.getArgString(Index) + ArgSize);
@@ -230,17 +233,18 @@ Arg *Option::acceptInternal(const ArgList &Args, StringRef Spelling,
}
}
-Arg *Option::accept(const ArgList &Args, StringRef CurArg,
- bool GroupedShortOption, unsigned &Index) const {
- std::unique_ptr<Arg> A(GroupedShortOption && getKind() == FlagClass
- ? new Arg(*this, CurArg, Index)
+std::unique_ptr<Arg> Option::accept(const ArgList &Args, StringRef CurArg,
+ bool GroupedShortOption,
+ unsigned &Index) const {
+ auto A(GroupedShortOption && getKind() == FlagClass
+ ? std::make_unique<Arg>(*this, CurArg, Index)
: acceptInternal(Args, CurArg, Index));
if (!A)
return nullptr;
const Option &UnaliasedOption = getUnaliasedOption();
if (getID() == UnaliasedOption.getID())
- return A.release();
+ return A;
// "A" is an alias for a different flag. For most clients it's more convenient
// if this function returns unaliased Args, so create an unaliased arg for
@@ -259,7 +263,8 @@ Arg *Option::accept(const ArgList &Args, StringRef CurArg,
// Due to this, ArgList::getArgString(A->getIndex()) will return the spelling
// of the aliased arg always, while A->getSpelling() returns either the
// unaliased or the aliased arg, depending on which Arg object it's called on.
- Arg *UnaliasedA = new Arg(UnaliasedOption, UnaliasedSpelling, A->getIndex());
+ auto UnaliasedA =
+ std::make_unique<Arg>(UnaliasedOption, UnaliasedSpelling, A->getIndex());
Arg *RawA = A.get();
UnaliasedA->setAlias(std::move(A));
diff --git a/llvm/lib/Passes/OptimizationLevel.cpp b/llvm/lib/Passes/OptimizationLevel.cpp
new file mode 100644
index 000000000000..a1f8c1e14b1f
--- /dev/null
+++ b/llvm/lib/Passes/OptimizationLevel.cpp
@@ -0,0 +1,30 @@
+//===- OptimizationLevel.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Passes/OptimizationLevel.h"
+
+using namespace llvm;
+
+const OptimizationLevel OptimizationLevel::O0 = {
+ /*SpeedLevel*/ 0,
+ /*SizeLevel*/ 0};
+const OptimizationLevel OptimizationLevel::O1 = {
+ /*SpeedLevel*/ 1,
+ /*SizeLevel*/ 0};
+const OptimizationLevel OptimizationLevel::O2 = {
+ /*SpeedLevel*/ 2,
+ /*SizeLevel*/ 0};
+const OptimizationLevel OptimizationLevel::O3 = {
+ /*SpeedLevel*/ 3,
+ /*SizeLevel*/ 0};
+const OptimizationLevel OptimizationLevel::Os = {
+ /*SpeedLevel*/ 2,
+ /*SizeLevel*/ 1};
+const OptimizationLevel OptimizationLevel::Oz = {
+ /*SpeedLevel*/ 2,
+ /*SizeLevel*/ 2};
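With OptimizationLevel promoted to its own header and source file, and with registerFunctionAnalyses now installing the default alias-analysis pipeline itself (see the PassBuilder hunk below), a typical new-pass-manager driver looks roughly like this generic sketch, which is not code from the patch.

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx); // Empty module, just to have something to run on.

  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  // Register all analyses; after this patch the function-level registration
  // also sets up the default AA pipeline, so no separate alias-analysis
  // registration is needed here.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  // OptimizationLevel::O2 is now the free-standing class defined above,
  // not PassBuilder::OptimizationLevel.
  ModulePassManager MPM =
      PB.buildPerModuleDefaultPipeline(OptimizationLevel::O2);
  MPM.run(M, MAM);
  return 0;
}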
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 79fcc8569b6d..561a881bab0c 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1,4 +1,4 @@
-//===- Parsing, selection, and construction of pass pipelines -------------===//
+//===- Parsing and selection of pass pipelines ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -27,6 +27,7 @@
#include "llvm/Analysis/CFLSteensAliasAnalysis.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CostModel.h"
#include "llvm/Analysis/DDG.h"
#include "llvm/Analysis/DDGPrinter.h"
#include "llvm/Analysis/Delinearization.h"
@@ -109,6 +110,7 @@
#include "llvm/Transforms/IPO/LoopExtractor.h"
#include "llvm/Transforms/IPO/LowerTypeTests.h"
#include "llvm/Transforms/IPO/MergeFunctions.h"
+#include "llvm/Transforms/IPO/ModuleInliner.h"
#include "llvm/Transforms/IPO/OpenMPOpt.h"
#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/Transforms/IPO/SCCP.h"
@@ -241,103 +243,16 @@
using namespace llvm;
-static cl::opt<InliningAdvisorMode> UseInlineAdvisor(
- "enable-ml-inliner", cl::init(InliningAdvisorMode::Default), cl::Hidden,
- cl::desc("Enable ML policy for inliner. Currently trained for -Oz only"),
- cl::values(clEnumValN(InliningAdvisorMode::Default, "default",
- "Heuristics-based inliner version."),
- clEnumValN(InliningAdvisorMode::Development, "development",
- "Use development mode (runtime-loadable model)."),
- clEnumValN(InliningAdvisorMode::Release, "release",
- "Use release mode (AOT-compiled model).")));
-
-static cl::opt<bool> EnableSyntheticCounts(
- "enable-npm-synthetic-counts", cl::init(false), cl::Hidden, cl::ZeroOrMore,
- cl::desc("Run synthetic function entry count generation "
- "pass"));
-
static const Regex DefaultAliasRegex(
"^(default|thinlto-pre-link|thinlto|lto-pre-link|lto)<(O[0123sz])>$");
-/// Flag to enable inline deferral during PGO.
-static cl::opt<bool>
- EnablePGOInlineDeferral("enable-npm-pgo-inline-deferral", cl::init(true),
- cl::Hidden,
- cl::desc("Enable inline deferral during PGO"));
-
-static cl::opt<bool> EnableMemProfiler("enable-mem-prof", cl::init(false),
- cl::Hidden, cl::ZeroOrMore,
- cl::desc("Enable memory profiler"));
-
-static cl::opt<bool> PerformMandatoryInliningsFirst(
- "mandatory-inlining-first", cl::init(true), cl::Hidden, cl::ZeroOrMore,
- cl::desc("Perform mandatory inlinings module-wide, before performing "
- "inlining."));
-
-static cl::opt<bool> EnableO3NonTrivialUnswitching(
- "enable-npm-O3-nontrivial-unswitch", cl::init(true), cl::Hidden,
- cl::ZeroOrMore, cl::desc("Enable non-trivial loop unswitching for -O3"));
-
-PipelineTuningOptions::PipelineTuningOptions() {
- LoopInterleaving = true;
- LoopVectorization = true;
- SLPVectorization = false;
- LoopUnrolling = true;
- ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll;
- LicmMssaOptCap = SetLicmMssaOptCap;
- LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap;
- CallGraphProfile = true;
- MergeFunctions = false;
-}
-
namespace llvm {
-extern cl::opt<unsigned> MaxDevirtIterations;
-extern cl::opt<bool> EnableConstraintElimination;
-extern cl::opt<bool> EnableFunctionSpecialization;
-extern cl::opt<bool> EnableGVNHoist;
-extern cl::opt<bool> EnableGVNSink;
-extern cl::opt<bool> EnableHotColdSplit;
-extern cl::opt<bool> EnableIROutliner;
-extern cl::opt<bool> EnableOrderFileInstrumentation;
-extern cl::opt<bool> EnableCHR;
-extern cl::opt<bool> EnableLoopInterchange;
-extern cl::opt<bool> EnableUnrollAndJam;
-extern cl::opt<bool> EnableLoopFlatten;
-extern cl::opt<bool> EnableDFAJumpThreading;
-extern cl::opt<bool> RunNewGVN;
-extern cl::opt<bool> RunPartialInlining;
-extern cl::opt<bool> ExtraVectorizerPasses;
-
-extern cl::opt<bool> FlattenedProfileUsed;
-
-extern cl::opt<AttributorRunOption> AttributorRun;
-extern cl::opt<bool> EnableKnowledgeRetention;
-
-extern cl::opt<bool> EnableMatrix;
-
-extern cl::opt<bool> DisablePreInliner;
-extern cl::opt<int> PreInlineThreshold;
+cl::opt<bool> PrintPipelinePasses(
+ "print-pipeline-passes",
+ cl::desc("Print a '-passes' compatible string describing the pipeline "
+ "(best-effort only)."));
} // namespace llvm
-const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::O0 = {
- /*SpeedLevel*/ 0,
- /*SizeLevel*/ 0};
-const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::O1 = {
- /*SpeedLevel*/ 1,
- /*SizeLevel*/ 0};
-const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::O2 = {
- /*SpeedLevel*/ 2,
- /*SizeLevel*/ 0};
-const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::O3 = {
- /*SpeedLevel*/ 3,
- /*SizeLevel*/ 0};
-const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::Os = {
- /*SpeedLevel*/ 2,
- /*SizeLevel*/ 1};
-const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::Oz = {
- /*SpeedLevel*/ 2,
- /*SizeLevel*/ 2};
-
namespace {
// The following passes/analyses have custom names, otherwise their name will
@@ -405,6 +320,15 @@ public:
static StringRef name() { return "NoOpFunctionAnalysis"; }
};
+/// No-op loop nest pass which does nothing.
+struct NoOpLoopNestPass : PassInfoMixin<NoOpLoopNestPass> {
+ PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &,
+ LoopStandardAnalysisResults &, LPMUpdater &) {
+ return PreservedAnalyses::all();
+ }
+ static StringRef name() { return "NoOpLoopNestPass"; }
+};
+
/// No-op loop pass which does nothing.
struct NoOpLoopPass : PassInfoMixin<NoOpLoopPass> {
PreservedAnalyses run(Loop &L, LoopAnalysisManager &,
@@ -439,7 +363,8 @@ AnalysisKey NoOpLoopAnalysis::Key;
/// it. This should be updated if new pass instrumentation wants to use the map.
/// We currently only use this for --print-before/after.
bool shouldPopulateClassToPassNames() {
- return !printBeforePasses().empty() || !printAfterPasses().empty();
+ return PrintPipelinePasses || !printBeforePasses().empty() ||
+ !printAfterPasses().empty();
}
} // namespace
@@ -453,6 +378,8 @@ PassBuilder::PassBuilder(TargetMachine *TM, PipelineTuningOptions PTO,
if (PIC && shouldPopulateClassToPassNames()) {
#define MODULE_PASS(NAME, CREATE_PASS) \
PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
+#define MODULE_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \
+ PIC->addClassToPassName(CLASS, NAME);
#define MODULE_ANALYSIS(NAME, CREATE_PASS) \
PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
#define FUNCTION_PASS(NAME, CREATE_PASS) \
@@ -461,6 +388,8 @@ PassBuilder::PassBuilder(TargetMachine *TM, PipelineTuningOptions PTO,
PIC->addClassToPassName(CLASS, NAME);
#define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
+#define LOOPNEST_PASS(NAME, CREATE_PASS) \
+ PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
#define LOOP_PASS(NAME, CREATE_PASS) \
PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
#define LOOP_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \
@@ -469,18 +398,14 @@ PassBuilder::PassBuilder(TargetMachine *TM, PipelineTuningOptions PTO,
PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
#define CGSCC_PASS(NAME, CREATE_PASS) \
PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
+#define CGSCC_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \
+ PIC->addClassToPassName(CLASS, NAME);
#define CGSCC_ANALYSIS(NAME, CREATE_PASS) \
PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
#include "PassRegistry.def"
}
}
-void PassBuilder::invokePeepholeEPCallbacks(
- FunctionPassManager &FPM, PassBuilder::OptimizationLevel Level) {
- for (auto &C : PeepholeEPCallbacks)
- C(FPM, Level);
-}
-
void PassBuilder::registerModuleAnalyses(ModuleAnalysisManager &MAM) {
#define MODULE_ANALYSIS(NAME, CREATE_PASS) \
MAM.registerPass([&] { return CREATE_PASS; });
@@ -500,6 +425,11 @@ void PassBuilder::registerCGSCCAnalyses(CGSCCAnalysisManager &CGAM) {
}
void PassBuilder::registerFunctionAnalyses(FunctionAnalysisManager &FAM) {
+ // We almost always want the default alias analysis pipeline.
+ // If a user wants a different one, they can register their own before calling
+ // registerFunctionAnalyses().
+ FAM.registerPass([&] { return buildDefaultAAPipeline(); });
+
#define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
FAM.registerPass([&] { return CREATE_PASS; });
#include "PassRegistry.def"
@@ -517,1518 +447,6 @@ void PassBuilder::registerLoopAnalyses(LoopAnalysisManager &LAM) {
C(LAM);
}
-// Helper to add AnnotationRemarksPass.
-static void addAnnotationRemarksPass(ModulePassManager &MPM) {
- FunctionPassManager FPM;
- FPM.addPass(AnnotationRemarksPass());
- MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
-}
-
-// Helper to check if the current compilation phase is preparing for LTO
-static bool isLTOPreLink(ThinOrFullLTOPhase Phase) {
- return Phase == ThinOrFullLTOPhase::ThinLTOPreLink ||
- Phase == ThinOrFullLTOPhase::FullLTOPreLink;
-}
-
-// TODO: Investigate the cost/benefit of tail call elimination on debugging.
-FunctionPassManager
-PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
- ThinOrFullLTOPhase Phase) {
-
- FunctionPassManager FPM;
-
- // Form SSA out of local memory accesses after breaking apart aggregates into
- // scalars.
- FPM.addPass(SROA());
-
- // Catch trivial redundancies
- FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */));
-
- // Hoisting of scalars and load expressions.
- FPM.addPass(SimplifyCFGPass());
- FPM.addPass(InstCombinePass());
-
- FPM.addPass(LibCallsShrinkWrapPass());
-
- invokePeepholeEPCallbacks(FPM, Level);
-
- FPM.addPass(SimplifyCFGPass());
-
- // Form canonically associated expression trees, and simplify the trees using
- // basic mathematical properties. For example, this will form (nearly)
- // minimal multiplication trees.
- FPM.addPass(ReassociatePass());
-
- // Add the primary loop simplification pipeline.
- // FIXME: Currently this is split into two loop pass pipelines because we run
- // some function passes in between them. These can and should be removed
- // and/or replaced by scheduling the loop pass equivalents in the correct
- // positions. But those equivalent passes aren't powerful enough yet.
- // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
- // used. We have `LoopSimplifyCFGPass` which isn't powerful enough yet to
- // fully replace `SimplifyCFGPass`, and the closest to the other we have is
- // `LoopInstSimplify`.
- LoopPassManager LPM1, LPM2;
-
- // Simplify the loop body. We do this initially to clean up after other loop
- // passes run, either when iterating on a loop or on inner loops with
- // implications on the outer loop.
- LPM1.addPass(LoopInstSimplifyPass());
- LPM1.addPass(LoopSimplifyCFGPass());
-
- // Try to remove as much code from the loop header as possible,
- // to reduce amount of IR that will have to be duplicated.
- // TODO: Investigate promotion cap for O1.
- LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
-
- LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true,
- isLTOPreLink(Phase)));
- // TODO: Investigate promotion cap for O1.
- LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
- LPM1.addPass(SimpleLoopUnswitchPass());
-
- LPM2.addPass(LoopIdiomRecognizePass());
- LPM2.addPass(IndVarSimplifyPass());
-
- for (auto &C : LateLoopOptimizationsEPCallbacks)
- C(LPM2, Level);
-
- LPM2.addPass(LoopDeletionPass());
-
- if (EnableLoopInterchange)
- LPM2.addPass(LoopInterchangePass());
-
- // Do not enable unrolling in PreLinkThinLTO phase during sample PGO
- // because it changes the IR, making profile annotation in the backend
- // compile inaccurate. The normal unroller doesn't pay attention to forced
- // full unroll attributes, so we need to make sure to allow the full unroll
- // pass to pay attention to it.
- if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
- PGOOpt->Action != PGOOptions::SampleUse)
- LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
- /* OnlyWhenForced= */ !PTO.LoopUnrolling,
- PTO.ForgetAllSCEVInLoopUnroll));
-
- for (auto &C : LoopOptimizerEndEPCallbacks)
- C(LPM2, Level);
-
- // We provide the opt remark emitter pass for LICM to use. We only need to do
- // this once as it is immutable.
- FPM.addPass(
- RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
- FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
- EnableMSSALoopDependency,
- /*UseBlockFrequencyInfo=*/true));
- FPM.addPass(SimplifyCFGPass());
- FPM.addPass(InstCombinePass());
- if (EnableLoopFlatten)
- FPM.addPass(createFunctionToLoopPassAdaptor(LoopFlattenPass()));
- // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA.
- // *All* loop passes must preserve it, in order to be able to use it.
- FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
- /*UseMemorySSA=*/false,
- /*UseBlockFrequencyInfo=*/false));
-
- // Delete small array after loop unroll.
- FPM.addPass(SROA());
-
- // Specially optimize memory movement as it doesn't look like dataflow in SSA.
- FPM.addPass(MemCpyOptPass());
-
- // Sparse conditional constant propagation.
- // FIXME: It isn't clear why we do this *after* loop passes rather than
- // before...
- FPM.addPass(SCCPPass());
-
- // Delete dead bit computations (instcombine runs after to fold away the dead
- // computations, and then ADCE will run later to exploit any new DCE
- // opportunities that creates).
- FPM.addPass(BDCEPass());
-
- // Run instcombine after redundancy and dead bit elimination to exploit
- // opportunities opened up by them.
- FPM.addPass(InstCombinePass());
- invokePeepholeEPCallbacks(FPM, Level);
-
- FPM.addPass(CoroElidePass());
-
- for (auto &C : ScalarOptimizerLateEPCallbacks)
- C(FPM, Level);
-
- // Finally, do an expensive DCE pass to catch all the dead code exposed by
- // the simplifications and basic cleanup after all the simplifications.
- // TODO: Investigate if this is too expensive.
- FPM.addPass(ADCEPass());
- FPM.addPass(SimplifyCFGPass());
- FPM.addPass(InstCombinePass());
- invokePeepholeEPCallbacks(FPM, Level);
-
- return FPM;
-}
-
-FunctionPassManager
-PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
- ThinOrFullLTOPhase Phase) {
- assert(Level != OptimizationLevel::O0 && "Must request optimizations!");
-
- // The O1 pipeline has a separate pipeline creation function to simplify
- // construction readability.
- if (Level.getSpeedupLevel() == 1)
- return buildO1FunctionSimplificationPipeline(Level, Phase);
-
- FunctionPassManager FPM;
-
- // Form SSA out of local memory accesses after breaking apart aggregates into
- // scalars.
- FPM.addPass(SROA());
-
- // Catch trivial redundancies
- FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */));
- if (EnableKnowledgeRetention)
- FPM.addPass(AssumeSimplifyPass());
-
- // Hoisting of scalars and load expressions.
- if (EnableGVNHoist)
- FPM.addPass(GVNHoistPass());
-
- // Global value numbering based sinking.
- if (EnableGVNSink) {
- FPM.addPass(GVNSinkPass());
- FPM.addPass(SimplifyCFGPass());
- }
-
- if (EnableConstraintElimination)
- FPM.addPass(ConstraintEliminationPass());
-
- // Speculative execution if the target has divergent branches; otherwise nop.
- FPM.addPass(SpeculativeExecutionPass(/* OnlyIfDivergentTarget =*/true));
-
- // Optimize based on known information about branches, and cleanup afterward.
- FPM.addPass(JumpThreadingPass());
- FPM.addPass(CorrelatedValuePropagationPass());
-
- FPM.addPass(SimplifyCFGPass());
- if (Level == OptimizationLevel::O3)
- FPM.addPass(AggressiveInstCombinePass());
- FPM.addPass(InstCombinePass());
-
- if (!Level.isOptimizingForSize())
- FPM.addPass(LibCallsShrinkWrapPass());
-
- invokePeepholeEPCallbacks(FPM, Level);
-
- // For PGO use pipeline, try to optimize memory intrinsics such as memcpy
- // using the size value profile. Don't perform this when optimizing for size.
- if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse &&
- !Level.isOptimizingForSize())
- FPM.addPass(PGOMemOPSizeOpt());
-
- FPM.addPass(TailCallElimPass());
- FPM.addPass(SimplifyCFGPass());
-
- // Form canonically associated expression trees, and simplify the trees using
- // basic mathematical properties. For example, this will form (nearly)
- // minimal multiplication trees.
- FPM.addPass(ReassociatePass());
-
- // Add the primary loop simplification pipeline.
- // FIXME: Currently this is split into two loop pass pipelines because we run
- // some function passes in between them. These can and should be removed
- // and/or replaced by scheduling the loop pass equivalents in the correct
- // positions. But those equivalent passes aren't powerful enough yet.
- // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
- // used. We have `LoopSimplifyCFGPass` which isn't powerful enough yet to
- // fully replace `SimplifyCFGPass`, and the closest to the other we have is
- // `LoopInstSimplify`.
- LoopPassManager LPM1, LPM2;
-
- // Simplify the loop body. We do this initially to clean up after other loop
- // passes run, either when iterating on a loop or on inner loops with
- // implications on the outer loop.
- LPM1.addPass(LoopInstSimplifyPass());
- LPM1.addPass(LoopSimplifyCFGPass());
-
- // Try to remove as much code from the loop header as possible,
- // to reduce amount of IR that will have to be duplicated.
- // TODO: Investigate promotion cap for O1.
- LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
-
- // Disable header duplication in loop rotation at -Oz.
- LPM1.addPass(
- LoopRotatePass(Level != OptimizationLevel::Oz, isLTOPreLink(Phase)));
- // TODO: Investigate promotion cap for O1.
- LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
- LPM1.addPass(
- SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3 &&
- EnableO3NonTrivialUnswitching));
- LPM2.addPass(LoopIdiomRecognizePass());
- LPM2.addPass(IndVarSimplifyPass());
-
- for (auto &C : LateLoopOptimizationsEPCallbacks)
- C(LPM2, Level);
-
- LPM2.addPass(LoopDeletionPass());
-
- if (EnableLoopInterchange)
- LPM2.addPass(LoopInterchangePass());
-
- // Do not enable unrolling in PreLinkThinLTO phase during sample PGO
- // because it changes the IR, making profile annotation in the backend
- // compile inaccurate. The normal unroller doesn't pay attention to forced
- // full unroll attributes, so we need to make sure to allow the full unroll
- // pass to pay attention to it.
- if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
- PGOOpt->Action != PGOOptions::SampleUse)
- LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
- /* OnlyWhenForced= */ !PTO.LoopUnrolling,
- PTO.ForgetAllSCEVInLoopUnroll));
-
- for (auto &C : LoopOptimizerEndEPCallbacks)
- C(LPM2, Level);
-
- // We provide the opt remark emitter pass for LICM to use. We only need to do
- // this once as it is immutable.
- FPM.addPass(
- RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
- FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
- EnableMSSALoopDependency,
- /*UseBlockFrequencyInfo=*/true));
- FPM.addPass(SimplifyCFGPass());
- FPM.addPass(InstCombinePass());
- if (EnableLoopFlatten)
- FPM.addPass(createFunctionToLoopPassAdaptor(LoopFlattenPass()));
- // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass,
- // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA.
- // *All* loop passes must preserve it, in order to be able to use it.
- FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
- /*UseMemorySSA=*/false,
- /*UseBlockFrequencyInfo=*/false));
-
- // Delete small array after loop unroll.
- FPM.addPass(SROA());
-
- // Eliminate redundancies.
- FPM.addPass(MergedLoadStoreMotionPass());
- if (RunNewGVN)
- FPM.addPass(NewGVNPass());
- else
- FPM.addPass(GVN());
-
- // Sparse conditional constant propagation.
- // FIXME: It isn't clear why we do this *after* loop passes rather than
- // before...
- FPM.addPass(SCCPPass());
-
- // Delete dead bit computations (instcombine runs after to fold away the dead
- // computations, and then ADCE will run later to exploit any new DCE
- // opportunities that creates).
- FPM.addPass(BDCEPass());
-
- // Run instcombine after redundancy and dead bit elimination to exploit
- // opportunities opened up by them.
- FPM.addPass(InstCombinePass());
- invokePeepholeEPCallbacks(FPM, Level);
-
- // Re-consider control flow based optimizations after redundancy elimination,
- // redo DCE, etc.
- if (EnableDFAJumpThreading && Level.getSizeLevel() == 0)
- FPM.addPass(DFAJumpThreadingPass());
-
- FPM.addPass(JumpThreadingPass());
- FPM.addPass(CorrelatedValuePropagationPass());
-
- // Finally, do an expensive DCE pass to catch all the dead code exposed by
- // the simplifications and basic cleanup after all the simplifications.
- // TODO: Investigate if this is too expensive.
- FPM.addPass(ADCEPass());
-
- // Specially optimize memory movement as it doesn't look like dataflow in SSA.
- FPM.addPass(MemCpyOptPass());
-
- FPM.addPass(DSEPass());
- FPM.addPass(createFunctionToLoopPassAdaptor(
- LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
- EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true));
-
- FPM.addPass(CoroElidePass());
-
- for (auto &C : ScalarOptimizerLateEPCallbacks)
- C(FPM, Level);
-
- FPM.addPass(SimplifyCFGPass(
- SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true)));
- FPM.addPass(InstCombinePass());
- invokePeepholeEPCallbacks(FPM, Level);
-
- if (EnableCHR && Level == OptimizationLevel::O3 && PGOOpt &&
- (PGOOpt->Action == PGOOptions::IRUse ||
- PGOOpt->Action == PGOOptions::SampleUse))
- FPM.addPass(ControlHeightReductionPass());
-
- return FPM;
-}
-
-void PassBuilder::addRequiredLTOPreLinkPasses(ModulePassManager &MPM) {
- MPM.addPass(CanonicalizeAliasesPass());
- MPM.addPass(NameAnonGlobalPass());
-}
-
-void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM,
- PassBuilder::OptimizationLevel Level,
- bool RunProfileGen, bool IsCS,
- std::string ProfileFile,
- std::string ProfileRemappingFile) {
- assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!");
- if (!IsCS && !DisablePreInliner) {
- InlineParams IP;
-
- IP.DefaultThreshold = PreInlineThreshold;
-
- // FIXME: The hint threshold has the same value used by the regular inliner
- // when not optimizing for size. This should probably be lowered after
- // performance testing.
- // FIXME: this comment is cargo-culted from the old pass manager; revisit.
- IP.HintThreshold = Level.isOptimizingForSize() ? PreInlineThreshold : 325;
- ModuleInlinerWrapperPass MIWP(IP);
- CGSCCPassManager &CGPipeline = MIWP.getPM();
-
- FunctionPassManager FPM;
- FPM.addPass(SROA());
- FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies.
- FPM.addPass(SimplifyCFGPass()); // Merge & remove basic blocks.
- FPM.addPass(InstCombinePass()); // Combine silly sequences.
- invokePeepholeEPCallbacks(FPM, Level);
-
- CGPipeline.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
-
- MPM.addPass(std::move(MIWP));
-
- // Delete anything that is now dead to make sure that we don't instrument
- // dead code. Instrumentation can end up keeping dead code around and
- // dramatically increase code size.
- MPM.addPass(GlobalDCEPass());
- }
-
- if (!RunProfileGen) {
- assert(!ProfileFile.empty() && "Profile use expecting a profile file!");
- MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS));
- // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
- // RequireAnalysisPass for PSI before subsequent non-module passes.
- MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
- return;
- }
-
- // Perform PGO instrumentation.
- MPM.addPass(PGOInstrumentationGen(IsCS));
-
- FunctionPassManager FPM;
- // Disable header duplication in loop rotation at -Oz.
- FPM.addPass(createFunctionToLoopPassAdaptor(
- LoopRotatePass(Level != OptimizationLevel::Oz), EnableMSSALoopDependency,
- /*UseBlockFrequencyInfo=*/false));
- MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
-
- // Add the profile lowering pass.
- InstrProfOptions Options;
- if (!ProfileFile.empty())
- Options.InstrProfileOutput = ProfileFile;
- // Do counter promotion at Level greater than O0.
- Options.DoCounterPromotion = true;
- Options.UseBFIInPromotion = IsCS;
- MPM.addPass(InstrProfiling(Options, IsCS));
-}
-
-void PassBuilder::addPGOInstrPassesForO0(ModulePassManager &MPM,
- bool RunProfileGen, bool IsCS,
- std::string ProfileFile,
- std::string ProfileRemappingFile) {
- if (!RunProfileGen) {
- assert(!ProfileFile.empty() && "Profile use expecting a profile file!");
- MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS));
- // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
- // RequireAnalysisPass for PSI before subsequent non-module passes.
- MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
- return;
- }
-
- // Perform PGO instrumentation.
- MPM.addPass(PGOInstrumentationGen(IsCS));
- // Add the profile lowering pass.
- InstrProfOptions Options;
- if (!ProfileFile.empty())
- Options.InstrProfileOutput = ProfileFile;
- // Do not do counter promotion at O0.
- Options.DoCounterPromotion = false;
- Options.UseBFIInPromotion = IsCS;
- MPM.addPass(InstrProfiling(Options, IsCS));
-}
-
-static InlineParams
-getInlineParamsFromOptLevel(PassBuilder::OptimizationLevel Level) {
- return getInlineParams(Level.getSpeedupLevel(), Level.getSizeLevel());
-}
-
-ModuleInlinerWrapperPass
-PassBuilder::buildInlinerPipeline(OptimizationLevel Level,
- ThinOrFullLTOPhase Phase) {
- InlineParams IP = getInlineParamsFromOptLevel(Level);
- if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt &&
- PGOOpt->Action == PGOOptions::SampleUse)
- IP.HotCallSiteThreshold = 0;
-
- if (PGOOpt)
- IP.EnableDeferral = EnablePGOInlineDeferral;
-
- ModuleInlinerWrapperPass MIWP(IP, PerformMandatoryInliningsFirst,
- UseInlineAdvisor, MaxDevirtIterations);
-
- // Require the GlobalsAA analysis for the module so we can query it within
- // the CGSCC pipeline.
- MIWP.addModulePass(RequireAnalysisPass<GlobalsAA, Module>());
- // Invalidate AAManager so it can be recreated and pick up the newly available
- // GlobalsAA.
- MIWP.addModulePass(
- createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>()));
-
- // Require the ProfileSummaryAnalysis for the module so we can query it within
- // the inliner pass.
- MIWP.addModulePass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
-
- // Now begin the main postorder CGSCC pipeline.
- // FIXME: The current CGSCC pipeline has its origins in the legacy pass
- // manager and trying to emulate its precise behavior. Much of this doesn't
- // make a lot of sense and we should revisit the core CGSCC structure.
- CGSCCPassManager &MainCGPipeline = MIWP.getPM();
-
- // Note: historically, the PruneEH pass was run first to deduce nounwind and
- // generally clean up exception handling overhead. It isn't clear this is
- // valuable as the inliner doesn't currently care whether it is inlining an
- // invoke or a call.
-
- if (AttributorRun & AttributorRunOption::CGSCC)
- MainCGPipeline.addPass(AttributorCGSCCPass());
-
- // Now deduce any function attributes based on the current code.
- MainCGPipeline.addPass(PostOrderFunctionAttrsPass());
-
- // When at O3 add argument promotion to the pass pipeline.
- // FIXME: It isn't at all clear why this should be limited to O3.
- if (Level == OptimizationLevel::O3)
- MainCGPipeline.addPass(ArgumentPromotionPass());
-
- // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if
- // there are no OpenMP runtime calls present in the module.
- if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3)
- MainCGPipeline.addPass(OpenMPOptCGSCCPass());
-
- for (auto &C : CGSCCOptimizerLateEPCallbacks)
- C(MainCGPipeline, Level);
-
- // Lastly, add the core function simplification pipeline nested inside the
- // CGSCC walk.
- MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
- buildFunctionSimplificationPipeline(Level, Phase)));
-
- MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0));
-
- return MIWP;
-}
-
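For the CGSCCOptimizerLateEPCallbacks loop used in the inliner pipeline above, clients hook in through the matching registration method on PassBuilder. A sketch under the assumption that a PassBuilder PB is in scope; MyCGSCCCleanupPass is a hypothetical client pass, not an in-tree one:

PB.registerCGSCCOptimizerLateEPCallback(
    [](CGSCCPassManager &CGPM, PassBuilder::OptimizationLevel Level) {
      CGPM.addPass(MyCGSCCCleanupPass()); // hypothetical client pass
    });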
-ModulePassManager
-PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
- ThinOrFullLTOPhase Phase) {
- ModulePassManager MPM;
-
- // Place pseudo probe instrumentation as the first pass of the pipeline to
- // minimize the impact of optimization changes.
- if (PGOOpt && PGOOpt->PseudoProbeForProfiling &&
- Phase != ThinOrFullLTOPhase::ThinLTOPostLink)
- MPM.addPass(SampleProfileProbePass(TM));
-
- bool HasSampleProfile = PGOOpt && (PGOOpt->Action == PGOOptions::SampleUse);
-
- // In ThinLTO mode, when a flattened profile is used, all the available
- // profile information will be annotated in PreLink phase so there is
- // no need to load the profile again in PostLink.
- bool LoadSampleProfile =
- HasSampleProfile &&
- !(FlattenedProfileUsed && Phase == ThinOrFullLTOPhase::ThinLTOPostLink);
-
- // During the ThinLTO backend phase we perform early indirect call promotion
- // here, before globalopt. Otherwise imported available_externally functions
- // look unreferenced and are removed. If we are going to load the sample
- // profile then defer until later.
- // TODO: See if we can move later and consolidate with the location where
- // we perform ICP when we are loading a sample profile.
- // TODO: We pass HasSampleProfile (whether there was a sample profile file
- // passed to the compile) to the SamplePGO flag of ICP. This is used to
- // determine whether the new direct calls are annotated with prof metadata.
- // Ideally this should be determined from whether the IR is annotated with
- // sample profile, and not whether a sample profile was provided on the
- // command line. E.g. for flattened profiles where we will not be reloading
- // the sample profile in the ThinLTO backend, we ideally shouldn't have to
- // provide the sample profile file.
- if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink && !LoadSampleProfile)
- MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */, HasSampleProfile));
-
- // Do basic inference of function attributes from known properties of system
- // libraries and other oracles.
- MPM.addPass(InferFunctionAttrsPass());
-
- // Create an early function pass manager to cleanup the output of the
- // frontend.
- FunctionPassManager EarlyFPM;
- // Lower llvm.expect to metadata before attempting transforms.
- // Compare/branch metadata may alter the behavior of passes like SimplifyCFG.
- EarlyFPM.addPass(LowerExpectIntrinsicPass());
- EarlyFPM.addPass(SimplifyCFGPass());
- EarlyFPM.addPass(SROA());
- EarlyFPM.addPass(EarlyCSEPass());
- EarlyFPM.addPass(CoroEarlyPass());
- if (Level == OptimizationLevel::O3)
- EarlyFPM.addPass(CallSiteSplittingPass());
-
- // In SamplePGO ThinLTO backend, we need instcombine before profile annotation
- // to convert bitcast to direct calls so that they can be inlined during the
- // profile annotation preparation step.
- // More details about SamplePGO design can be found in:
- // https://research.google.com/pubs/pub45290.html
- // FIXME: revisit how SampleProfileLoad/Inliner/ICP is structured.
- if (LoadSampleProfile)
- EarlyFPM.addPass(InstCombinePass());
- MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM)));
-
- if (LoadSampleProfile) {
- // Annotate sample profile right after early FPM to ensure freshness of
- // the debug info.
- MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile,
- PGOOpt->ProfileRemappingFile, Phase));
- // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
- // RequireAnalysisPass for PSI before subsequent non-module passes.
- MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
- // Do not invoke ICP in the LTOPreLink phase as it makes it hard
- // for the profile annotation to be accurate in the LTO backend.
- if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink &&
- Phase != ThinOrFullLTOPhase::FullLTOPreLink)
- // We perform early indirect call promotion here, before globalopt.
- // This is important for the ThinLTO backend phase because otherwise
- // imported available_externally functions look unreferenced and are
- // removed.
- MPM.addPass(
- PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */));
- }
-
- // Try to perform OpenMP specific optimizations on the module. This is a
- // (quick!) no-op if there are no OpenMP runtime calls present in the module.
- if (Level != OptimizationLevel::O0)
- MPM.addPass(OpenMPOptPass());
-
- if (AttributorRun & AttributorRunOption::MODULE)
- MPM.addPass(AttributorPass());
-
- // Lower type metadata and the type.test intrinsic in the ThinLTO
- // post link pipeline after ICP. This is to enable usage of the type
- // tests in ICP sequences.
- if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink)
- MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
-
- for (auto &C : PipelineEarlySimplificationEPCallbacks)
- C(MPM, Level);
-
- // Specialize functions with IPSCCP.
- if (EnableFunctionSpecialization)
- MPM.addPass(FunctionSpecializationPass());
-
- // Interprocedural constant propagation now that basic cleanup has occurred
- // and prior to optimizing globals.
- // FIXME: This position in the pipeline hasn't been carefully considered in
- // years, it should be re-analyzed.
- MPM.addPass(IPSCCPPass());
-
- // Attach metadata to indirect call sites indicating the set of functions
- // they may target at run-time. This should follow IPSCCP.
- MPM.addPass(CalledValuePropagationPass());
-
- // Optimize globals to try and fold them into constants.
- MPM.addPass(GlobalOptPass());
-
- // Promote any localized globals to SSA registers.
- // FIXME: Should this instead be a run of SROA?
- // FIXME: We should probably run instcombine and simplifycfg afterward to
- // delete control flows that are dead once globals have been folded to
- // constants.
- MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass()));
-
- // Remove any dead arguments exposed by cleanups and constant folding
- // globals.
- MPM.addPass(DeadArgumentEliminationPass());
-
- // Create a small function pass pipeline to cleanup after all the global
- // optimizations.
- FunctionPassManager GlobalCleanupPM;
- GlobalCleanupPM.addPass(InstCombinePass());
- invokePeepholeEPCallbacks(GlobalCleanupPM, Level);
-
- GlobalCleanupPM.addPass(SimplifyCFGPass());
- MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM)));
-
- // Add all the requested passes for instrumentation PGO, if requested.
- if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink &&
- (PGOOpt->Action == PGOOptions::IRInstr ||
- PGOOpt->Action == PGOOptions::IRUse)) {
- addPGOInstrPasses(MPM, Level,
- /* RunProfileGen */ PGOOpt->Action == PGOOptions::IRInstr,
- /* IsCS */ false, PGOOpt->ProfileFile,
- PGOOpt->ProfileRemappingFile);
- MPM.addPass(PGOIndirectCallPromotion(false, false));
- }
- if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink &&
- PGOOpt->CSAction == PGOOptions::CSIRInstr)
- MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile));
-
- // Synthesize function entry counts for non-PGO compilation.
- if (EnableSyntheticCounts && !PGOOpt)
- MPM.addPass(SyntheticCountsPropagation());
-
- MPM.addPass(buildInlinerPipeline(Level, Phase));
-
- if (EnableMemProfiler && Phase != ThinOrFullLTOPhase::ThinLTOPreLink) {
- MPM.addPass(createModuleToFunctionPassAdaptor(MemProfilerPass()));
- MPM.addPass(ModuleMemProfilerPass());
- }
-
- return MPM;
-}
-
-/// TODO: Should LTO cause any differences to this set of passes?
-void PassBuilder::addVectorPasses(OptimizationLevel Level,
- FunctionPassManager &FPM, bool IsFullLTO) {
- FPM.addPass(LoopVectorizePass(
- LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
-
- if (IsFullLTO) {
- // The vectorizer may have significantly shortened a loop body; unroll
- // again. Unroll small loops to hide loop backedge latency and saturate any
- // parallel execution resources of an out-of-order processor. We also then
- // need to clean up redundancies and loop invariant code.
- // FIXME: It would be really good to use a loop-integrated instruction
- // combiner for cleanup here so that the unrolling and LICM can be pipelined
- // across the loop nests.
- // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
- if (EnableUnrollAndJam && PTO.LoopUnrolling)
- FPM.addPass(createFunctionToLoopPassAdaptor(
- LoopUnrollAndJamPass(Level.getSpeedupLevel())));
- FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
- Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
- PTO.ForgetAllSCEVInLoopUnroll)));
- FPM.addPass(WarnMissedTransformationsPass());
- }
-
- if (!IsFullLTO) {
- // Eliminate loads by forwarding stores from the previous iteration to loads
- // of the current iteration.
- FPM.addPass(LoopLoadEliminationPass());
- }
- // Cleanup after the loop optimization passes.
- FPM.addPass(InstCombinePass());
-
- if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
- // At higher optimization levels, try to clean up any runtime overlap and
- // alignment checks inserted by the vectorizer. We want to track correlated
- // runtime checks for two inner loops in the same outer loop, fold any
- // common computations, hoist loop-invariant aspects out of any outer loop,
- // and unswitch the runtime checks if possible. Once hoisted, we may have
- // dead (or speculatable) control flows or more combining opportunities.
- FPM.addPass(EarlyCSEPass());
- FPM.addPass(CorrelatedValuePropagationPass());
- FPM.addPass(InstCombinePass());
- LoopPassManager LPM;
- LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
- LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
- OptimizationLevel::O3));
- FPM.addPass(
- RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
- FPM.addPass(createFunctionToLoopPassAdaptor(
- std::move(LPM), EnableMSSALoopDependency,
- /*UseBlockFrequencyInfo=*/true));
- FPM.addPass(SimplifyCFGPass());
- FPM.addPass(InstCombinePass());
- }
-
- // Now that we've formed fast to execute loop structures, we do further
- // optimizations. These are run afterward as they might block doing complex
- // analyses and transforms such as what are needed for loop vectorization.
-
- // Cleanup after loop vectorization, etc. Simplification passes like CVP and
- // GVN, loop transforms, and others have already run, so it's now better to
- // convert to more optimized IR using more aggressive simplify CFG options.
- // The extra sinking transform can create larger basic blocks, so do this
- // before SLP vectorization.
- FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
- .forwardSwitchCondToPhi(true)
- .convertSwitchToLookupTable(true)
- .needCanonicalLoops(false)
- .hoistCommonInsts(true)
- .sinkCommonInsts(true)));
-
- if (IsFullLTO) {
- FPM.addPass(SCCPPass());
- FPM.addPass(InstCombinePass());
- FPM.addPass(BDCEPass());
- }
-
- // Optimize parallel scalar instruction chains into SIMD instructions.
- if (PTO.SLPVectorization) {
- FPM.addPass(SLPVectorizerPass());
- if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
- FPM.addPass(EarlyCSEPass());
- }
- }
- // Enhance/cleanup vector code.
- FPM.addPass(VectorCombinePass());
-
- if (!IsFullLTO) {
- FPM.addPass(InstCombinePass());
- // Unroll small loops to hide loop backedge latency and saturate any
- // parallel execution resources of an out-of-order processor. We also then
- // need to clean up redundancies and loop invariant code.
- // FIXME: It would be really good to use a loop-integrated instruction
- // combiner for cleanup here so that the unrolling and LICM can be pipelined
- // across the loop nests.
- // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
- if (EnableUnrollAndJam && PTO.LoopUnrolling) {
- FPM.addPass(createFunctionToLoopPassAdaptor(
- LoopUnrollAndJamPass(Level.getSpeedupLevel())));
- }
- FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
- Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
- PTO.ForgetAllSCEVInLoopUnroll)));
- FPM.addPass(WarnMissedTransformationsPass());
- FPM.addPass(InstCombinePass());
- FPM.addPass(
- RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
- FPM.addPass(createFunctionToLoopPassAdaptor(
- LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
- EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true));
- }
-
- // Now that we've vectorized and unrolled loops, we may have more refined
- // alignment information, try to re-derive it here.
- FPM.addPass(AlignmentFromAssumptionsPass());
-
- if (IsFullLTO)
- FPM.addPass(InstCombinePass());
-}
-
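The PTO.* knobs consulted in addVectorPasses above (LoopVectorization, SLPVectorization, LoopUnrolling, LoopInterleaving) are plain members of PipelineTuningOptions set before the PassBuilder is constructed. A minimal sketch, assuming a TargetMachine *TM is available; the particular settings are only an example:

PipelineTuningOptions PTO;
PTO.LoopVectorization = true;
PTO.SLPVectorization = true;
PTO.LoopUnrolling = false; // leaves only forced (OnlyWhenForced) unrolling above
PassBuilder PB(TM, PTO);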
-ModulePassManager
-PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
- bool LTOPreLink) {
- ModulePassManager MPM;
-
- // Optimize globals now that the module is fully simplified.
- MPM.addPass(GlobalOptPass());
- MPM.addPass(GlobalDCEPass());
-
- // Run partial inlining pass to partially inline functions that have
- // large bodies.
- if (RunPartialInlining)
- MPM.addPass(PartialInlinerPass());
-
- // Remove avail extern fns and globals definitions since we aren't compiling
- // an object file for later LTO. For LTO we want to preserve these so they
- // are eligible for inlining at link-time. Note if they are unreferenced they
- // will be removed by GlobalDCE later, so this only impacts referenced
- // available externally globals. Eventually they will be suppressed during
- // codegen, but eliminating here enables more opportunity for GlobalDCE as it
- // may make globals referenced by available external functions dead and saves
- // running remaining passes on the eliminated functions. These should be
- // preserved during prelinking for link-time inlining decisions.
- if (!LTOPreLink)
- MPM.addPass(EliminateAvailableExternallyPass());
-
- if (EnableOrderFileInstrumentation)
- MPM.addPass(InstrOrderFilePass());
-
- // Do RPO function attribute inference across the module to forward-propagate
- // attributes where applicable.
- // FIXME: Is this really an optimization rather than a canonicalization?
- MPM.addPass(ReversePostOrderFunctionAttrsPass());
-
- // Do a post inline PGO instrumentation and use pass. This is a context
- // sensitive PGO pass. We don't want to do this in the LTOPreLink phase as
- // cross-module inline has not been done yet. The context sensitive
- // instrumentation is after all the inlines are done.
- if (!LTOPreLink && PGOOpt) {
- if (PGOOpt->CSAction == PGOOptions::CSIRInstr)
- addPGOInstrPasses(MPM, Level, /* RunProfileGen */ true,
- /* IsCS */ true, PGOOpt->CSProfileGenFile,
- PGOOpt->ProfileRemappingFile);
- else if (PGOOpt->CSAction == PGOOptions::CSIRUse)
- addPGOInstrPasses(MPM, Level, /* RunProfileGen */ false,
- /* IsCS */ true, PGOOpt->ProfileFile,
- PGOOpt->ProfileRemappingFile);
- }
-
- // Re-require GlobalsAA here prior to function passes. This is particularly
- // useful as the above will have inlined, DCE'ed, and function-attr
- // propagated everything. We should at this point have a reasonably minimal
- // and richly annotated call graph. By computing aliasing and mod/ref
- // information for all local globals here, the late loop passes and notably
- // the vectorizer will be able to use them to help recognize vectorizable
- // memory operations.
- MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>());
-
- FunctionPassManager OptimizePM;
- OptimizePM.addPass(Float2IntPass());
- OptimizePM.addPass(LowerConstantIntrinsicsPass());
-
- if (EnableMatrix) {
- OptimizePM.addPass(LowerMatrixIntrinsicsPass());
- OptimizePM.addPass(EarlyCSEPass());
- }
-
- // FIXME: We need to run some loop optimizations to re-rotate loops after
- // simplifycfg and others undo their rotation.
-
- // Optimize the loop execution. These passes operate on entire loop nests
- // rather than on each loop in an inside-out manner, and so they are actually
- // function passes.
-
- for (auto &C : VectorizerStartEPCallbacks)
- C(OptimizePM, Level);
-
- // First rotate loops that may have been un-rotated by prior passes.
- // Disable header duplication at -Oz.
- OptimizePM.addPass(createFunctionToLoopPassAdaptor(
- LoopRotatePass(Level != OptimizationLevel::Oz, LTOPreLink),
- EnableMSSALoopDependency,
- /*UseBlockFrequencyInfo=*/false));
-
- // Distribute loops to allow partial vectorization. I.e. isolate dependences
- // into a separate loop that would otherwise inhibit vectorization. This is
- // currently only performed for loops marked with the metadata
- // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
- OptimizePM.addPass(LoopDistributePass());
-
- // Populates the VFABI attribute with the scalar-to-vector mappings
- // from the TargetLibraryInfo.
- OptimizePM.addPass(InjectTLIMappings());
-
- addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false);
-
- // Split out cold code. Splitting is done late to avoid hiding context from
- // other optimizations and inadvertently regressing performance. The tradeoff
- // is that this has a higher code size cost than splitting early.
- if (EnableHotColdSplit && !LTOPreLink)
- MPM.addPass(HotColdSplittingPass());
-
- // Search the code for similar regions of code. If enough similar regions can
- // be found where extracting the regions into their own function will decrease
- // the size of the program, we extract the regions and deduplicate the
- // structurally similar regions.
- if (EnableIROutliner)
- MPM.addPass(IROutlinerPass());
-
- // Merge functions if requested.
- if (PTO.MergeFunctions)
- MPM.addPass(MergeFunctionsPass());
-
- // LoopSink pass sinks instructions hoisted by LICM, which serves as a
- // canonicalization pass that enables other optimizations. As a result,
- // LoopSink pass needs to be a very late IR pass to avoid undoing LICM
- // result too early.
- OptimizePM.addPass(LoopSinkPass());
-
- // And finally clean up LCSSA form before generating code.
- OptimizePM.addPass(InstSimplifyPass());
-
- // This hoists/decomposes div/rem ops. It should run after other sink/hoist
- // passes to avoid re-sinking, but before SimplifyCFG because it can allow
- // flattening of blocks.
- OptimizePM.addPass(DivRemPairsPass());
-
- // LoopSink (and other loop passes since the last simplifyCFG) might have
- // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
- OptimizePM.addPass(SimplifyCFGPass());
-
- OptimizePM.addPass(CoroCleanupPass());
-
- // Add the core optimizing pipeline.
- MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM)));
-
- for (auto &C : OptimizerLastEPCallbacks)
- C(MPM, Level);
-
- if (PTO.CallGraphProfile)
- MPM.addPass(CGProfilePass());
-
- // Now we need to do some global optimization transforms.
- // FIXME: It would seem like these should come first in the optimization
- // pipeline and maybe be the bottom of the canonicalization pipeline? Weird
- // ordering here.
- MPM.addPass(GlobalDCEPass());
- MPM.addPass(ConstantMergePass());
-
- // TODO: Relative lookup table converter pass caused an issue when full LTO
- // is enabled. See https://reviews.llvm.org/D94355 for more details.
- // Until the issue is fixed, disable this pass during the pre-linking phase.
- if (!LTOPreLink)
- MPM.addPass(RelLookupTableConverterPass());
-
- return MPM;
-}
-
-ModulePassManager
-PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
- bool LTOPreLink) {
- assert(Level != OptimizationLevel::O0 &&
- "Must request optimizations for the default pipeline!");
-
- ModulePassManager MPM;
-
- // Convert @llvm.global.annotations to !annotation metadata.
- MPM.addPass(Annotation2MetadataPass());
-
- // Force any function attributes we want the rest of the pipeline to observe.
- MPM.addPass(ForceFunctionAttrsPass());
-
- // Apply module pipeline start EP callback.
- for (auto &C : PipelineStartEPCallbacks)
- C(MPM, Level);
-
- if (PGOOpt && PGOOpt->DebugInfoForProfiling)
- MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));
-
- // Add the core simplification pipeline.
- MPM.addPass(buildModuleSimplificationPipeline(
- Level, LTOPreLink ? ThinOrFullLTOPhase::FullLTOPreLink
- : ThinOrFullLTOPhase::None));
-
- // Now add the optimization pipeline.
- MPM.addPass(buildModuleOptimizationPipeline(Level, LTOPreLink));
-
- if (PGOOpt && PGOOpt->PseudoProbeForProfiling)
- MPM.addPass(PseudoProbeUpdatePass());
-
- // Emit annotation remarks.
- addAnnotationRemarksPass(MPM);
-
- if (LTOPreLink)
- addRequiredLTOPreLinkPasses(MPM);
-
- return MPM;
-}
-
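For context on how buildPerModuleDefaultPipeline is typically consumed, here is a hedged end-to-end sketch of an embedder driving it at -O2. optimizeModule is an invented name, error handling is elided, and the caller is assumed to own the Module and TargetMachine:

#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;

// Hypothetical helper: run the default -O2 pipeline over M.
void optimizeModule(Module &M, TargetMachine *TM) {
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB(TM);
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM =
      PB.buildPerModuleDefaultPipeline(PassBuilder::OptimizationLevel::O2);
  MPM.run(M, MAM);
}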
-ModulePassManager
-PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
- assert(Level != OptimizationLevel::O0 &&
- "Must request optimizations for the default pipeline!");
-
- ModulePassManager MPM;
-
- // Convert @llvm.global.annotations to !annotation metadata.
- MPM.addPass(Annotation2MetadataPass());
-
- // Force any function attributes we want the rest of the pipeline to observe.
- MPM.addPass(ForceFunctionAttrsPass());
-
- if (PGOOpt && PGOOpt->DebugInfoForProfiling)
- MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));
-
- // Apply module pipeline start EP callback.
- for (auto &C : PipelineStartEPCallbacks)
- C(MPM, Level);
-
- // If we are planning to perform ThinLTO later, we don't bloat the code with
- // unrolling/vectorization/... now. Just simplify the module as much as we
- // can.
- MPM.addPass(buildModuleSimplificationPipeline(
- Level, ThinOrFullLTOPhase::ThinLTOPreLink));
-
- // Run partial inlining pass to partially inline functions that have
- // large bodies.
- // FIXME: It isn't clear whether this is really the right place to run this
- // in ThinLTO. Because there is another canonicalization and simplification
- // phase that will run after the thin link, running this here ends up with
- // less information than will be available later and it may grow functions in
- // ways that aren't beneficial.
- if (RunPartialInlining)
- MPM.addPass(PartialInlinerPass());
-
- // Reduce the size of the IR as much as possible.
- MPM.addPass(GlobalOptPass());
-
- // Module simplification splits coroutines, but does not fully clean up
- // coroutine intrinsics. To ensure ThinLTO optimization passes don't trip up
- // on these, we schedule the cleanup here.
- MPM.addPass(createModuleToFunctionPassAdaptor(CoroCleanupPass()));
-
- if (PGOOpt && PGOOpt->PseudoProbeForProfiling)
- MPM.addPass(PseudoProbeUpdatePass());
-
- // Handle OptimizerLastEPCallbacks added by clang on PreLink. Actual
- // optimization is going to be done in PostLink stage, but clang can't
- // add callbacks there in case of in-process ThinLTO called by linker.
- for (auto &C : OptimizerLastEPCallbacks)
- C(MPM, Level);
-
- // Emit annotation remarks.
- addAnnotationRemarksPass(MPM);
-
- addRequiredLTOPreLinkPasses(MPM);
-
- return MPM;
-}
-
-ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
- OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) {
- ModulePassManager MPM;
-
- // Convert @llvm.global.annotations to !annotation metadata.
- MPM.addPass(Annotation2MetadataPass());
-
- if (ImportSummary) {
- // These passes import type identifier resolutions for whole-program
- // devirtualization and CFI. They must run early because other passes may
- // disturb the specific instruction patterns that these passes look for,
- // creating dependencies on resolutions that may not appear in the summary.
- //
- // For example, GVN may transform the pattern assume(type.test) appearing in
- // two basic blocks into assume(phi(type.test, type.test)), which would
- // transform a dependency on a WPD resolution into a dependency on a type
- // identifier resolution for CFI.
- //
- // Also, WPD has access to more precise information than ICP and can
- // devirtualize more effectively, so it should operate on the IR first.
- //
- // The WPD and LowerTypeTest passes need to run at -O0 to lower type
- // metadata and intrinsics.
- MPM.addPass(WholeProgramDevirtPass(nullptr, ImportSummary));
- MPM.addPass(LowerTypeTestsPass(nullptr, ImportSummary));
- }
-
- if (Level == OptimizationLevel::O0) {
- // Run a second time to clean up any type tests left behind by WPD for use
- // in ICP.
- MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
- // Drop available_externally and unreferenced globals. This is necessary
- // with ThinLTO in order to avoid leaving undefined references to dead
- // globals in the object file.
- MPM.addPass(EliminateAvailableExternallyPass());
- MPM.addPass(GlobalDCEPass());
- return MPM;
- }
-
- // Force any function attributes we want the rest of the pipeline to observe.
- MPM.addPass(ForceFunctionAttrsPass());
-
- // Add the core simplification pipeline.
- MPM.addPass(buildModuleSimplificationPipeline(
- Level, ThinOrFullLTOPhase::ThinLTOPostLink));
-
- // Now add the optimization pipeline.
- MPM.addPass(buildModuleOptimizationPipeline(Level));
-
- // Emit annotation remarks.
- addAnnotationRemarksPass(MPM);
-
- return MPM;
-}
-
-ModulePassManager
-PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
- assert(Level != OptimizationLevel::O0 &&
- "Must request optimizations for the default pipeline!");
- // FIXME: We should use a customized pre-link pipeline!
- return buildPerModuleDefaultPipeline(Level,
- /* LTOPreLink */ true);
-}
-
-ModulePassManager
-PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
- ModuleSummaryIndex *ExportSummary) {
- ModulePassManager MPM;
-
- // Convert @llvm.global.annotations to !annotation metadata.
- MPM.addPass(Annotation2MetadataPass());
-
- // Create a function that performs CFI checks for cross-DSO calls with targets
- // in the current module.
- MPM.addPass(CrossDSOCFIPass());
-
- if (Level == OptimizationLevel::O0) {
- // The WPD and LowerTypeTest passes need to run at -O0 to lower type
- // metadata and intrinsics.
- MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));
- MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
- // Run a second time to clean up any type tests left behind by WPD for use
- // in ICP.
- MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
-
- // Emit annotation remarks.
- addAnnotationRemarksPass(MPM);
-
- return MPM;
- }
-
- if (PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) {
- // Load sample profile before running the LTO optimization pipeline.
- MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile,
- PGOOpt->ProfileRemappingFile,
- ThinOrFullLTOPhase::FullLTOPostLink));
- // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
- // RequireAnalysisPass for PSI before subsequent non-module passes.
- MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
- }
-
- // Remove unused virtual tables to improve the quality of code generated by
- // whole-program devirtualization and bitset lowering.
- MPM.addPass(GlobalDCEPass());
-
- // Force any function attributes we want the rest of the pipeline to observe.
- MPM.addPass(ForceFunctionAttrsPass());
-
- // Do basic inference of function attributes from known properties of system
- // libraries and other oracles.
- MPM.addPass(InferFunctionAttrsPass());
-
- if (Level.getSpeedupLevel() > 1) {
- FunctionPassManager EarlyFPM;
- EarlyFPM.addPass(CallSiteSplittingPass());
- MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM)));
-
- // Indirect call promotion. This should promote all the targets that are
- // left by the earlier promotion pass that promotes intra-module targets.
- // This two-step promotion is to save compile time. For LTO, it should
- // produce the same result as if we only do promotion here.
- MPM.addPass(PGOIndirectCallPromotion(
- true /* InLTO */, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse));
-
- if (EnableFunctionSpecialization)
- MPM.addPass(FunctionSpecializationPass());
- // Propagate constants at call sites into the functions they call. This
- // opens opportunities for globalopt (and inlining) by substituting function
- // pointers passed as arguments to direct uses of functions.
- MPM.addPass(IPSCCPPass());
-
- // Attach metadata to indirect call sites indicating the set of functions
- // they may target at run-time. This should follow IPSCCP.
- MPM.addPass(CalledValuePropagationPass());
- }
-
- // Now deduce any function attributes based on the current code.
- MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
- PostOrderFunctionAttrsPass()));
-
- // Do RPO function attribute inference across the module to forward-propagate
- // attributes where applicable.
- // FIXME: Is this really an optimization rather than a canonicalization?
- MPM.addPass(ReversePostOrderFunctionAttrsPass());
-
- // Use in-range annotations on GEP indices to split globals where beneficial.
- MPM.addPass(GlobalSplitPass());
-
- // Run whole program optimization of virtual call when the list of callees
- // is fixed.
- MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));
-
- // Stop here at -O1.
- if (Level == OptimizationLevel::O1) {
- // The LowerTypeTestsPass needs to run to lower type metadata and the
- // type.test intrinsics. The pass does nothing if CFI is disabled.
- MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
- // Run a second time to clean up any type tests left behind by WPD for use
- // in ICP (which is performed earlier than this in the regular LTO
- // pipeline).
- MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
-
- // Emit annotation remarks.
- addAnnotationRemarksPass(MPM);
-
- return MPM;
- }
-
- // Optimize globals to try and fold them into constants.
- MPM.addPass(GlobalOptPass());
-
- // Promote any localized globals to SSA registers.
- MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass()));
-
- // Linking modules together can lead to duplicate global constants; only
- // keep one copy of each constant.
- MPM.addPass(ConstantMergePass());
-
- // Remove unused arguments from functions.
- MPM.addPass(DeadArgumentEliminationPass());
-
- // Reduce the code after globalopt and ipsccp. Both can open up significant
- // simplification opportunities, and both can propagate functions through
- // function pointers. When this happens, we often have to resolve varargs
- // calls, etc, so let instcombine do this.
- FunctionPassManager PeepholeFPM;
- if (Level == OptimizationLevel::O3)
- PeepholeFPM.addPass(AggressiveInstCombinePass());
- PeepholeFPM.addPass(InstCombinePass());
- invokePeepholeEPCallbacks(PeepholeFPM, Level);
-
- MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM)));
-
- // Note: historically, the PruneEH pass was run first to deduce nounwind and
- // generally clean up exception handling overhead. It isn't clear this is
- // valuable as the inliner doesn't currently care whether it is inlining an
- // invoke or a call.
- // Run the inliner now.
- MPM.addPass(ModuleInlinerWrapperPass(getInlineParamsFromOptLevel(Level)));
-
- // Optimize globals again after we ran the inliner.
- MPM.addPass(GlobalOptPass());
-
- // Garbage collect dead functions.
- // FIXME: Add ArgumentPromotion pass after once it's ported.
- MPM.addPass(GlobalDCEPass());
-
- FunctionPassManager FPM;
- // The IPO Passes may leave cruft around. Clean up after them.
- FPM.addPass(InstCombinePass());
- invokePeepholeEPCallbacks(FPM, Level);
-
- FPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true));
-
- // Do a post inline PGO instrumentation and use pass. This is a context
- // sensitive PGO pass.
- if (PGOOpt) {
- if (PGOOpt->CSAction == PGOOptions::CSIRInstr)
- addPGOInstrPasses(MPM, Level, /* RunProfileGen */ true,
- /* IsCS */ true, PGOOpt->CSProfileGenFile,
- PGOOpt->ProfileRemappingFile);
- else if (PGOOpt->CSAction == PGOOptions::CSIRUse)
- addPGOInstrPasses(MPM, Level, /* RunProfileGen */ false,
- /* IsCS */ true, PGOOpt->ProfileFile,
- PGOOpt->ProfileRemappingFile);
- }
-
- // Break up allocas
- FPM.addPass(SROA());
-
- // LTO provides additional opportunities for tailcall elimination due to
- // link-time inlining, and visibility of nocapture attribute.
- FPM.addPass(TailCallElimPass());
-
- // Run a few AA driver optimizations here and now to cleanup the code.
- MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
-
- MPM.addPass(
- createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass()));
-
- // Require the GlobalsAA analysis for the module so we can query it within
- // MainFPM.
- MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>());
- // Invalidate AAManager so it can be recreated and pick up the newly available
- // GlobalsAA.
- MPM.addPass(
- createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>()));
-
- FunctionPassManager MainFPM;
- MainFPM.addPass(createFunctionToLoopPassAdaptor(
- LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
- EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true));
-
- if (RunNewGVN)
- MainFPM.addPass(NewGVNPass());
- else
- MainFPM.addPass(GVN());
-
- // Remove dead memcpy()'s.
- MainFPM.addPass(MemCpyOptPass());
-
- // Nuke dead stores.
- MainFPM.addPass(DSEPass());
- MainFPM.addPass(MergedLoadStoreMotionPass());
-
- // More loops are countable; try to optimize them.
- if (EnableLoopFlatten && Level.getSpeedupLevel() > 1)
- MainFPM.addPass(createFunctionToLoopPassAdaptor(LoopFlattenPass()));
-
- if (EnableConstraintElimination)
- MainFPM.addPass(ConstraintEliminationPass());
-
- LoopPassManager LPM;
- LPM.addPass(IndVarSimplifyPass());
- LPM.addPass(LoopDeletionPass());
- // FIXME: Add loop interchange.
-
- // Unroll small loops and perform peeling.
- LPM.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
- /* OnlyWhenForced= */ !PTO.LoopUnrolling,
- PTO.ForgetAllSCEVInLoopUnroll));
- // The loop passes in LPM (LoopFullUnrollPass) do not preserve MemorySSA.
- // *All* loop passes must preserve it, in order to be able to use it.
- MainFPM.addPass(createFunctionToLoopPassAdaptor(
- std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true));
-
- MainFPM.addPass(LoopDistributePass());
-
- addVectorPasses(Level, MainFPM, /* IsFullLTO */ true);
-
- invokePeepholeEPCallbacks(MainFPM, Level);
- MainFPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true));
- MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM)));
-
- // Lower type metadata and the type.test intrinsic. This pass supports
- // clang's control flow integrity mechanisms (-fsanitize=cfi*) and needs
- // to be run at link time if CFI is enabled. This pass does nothing if
- // CFI is disabled.
- MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
- // Run a second time to clean up any type tests left behind by WPD for use
- // in ICP (which is performed earlier than this in the regular LTO pipeline).
- MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
-
- // Enable splitting late in the FullLTO post-link pipeline. This is done in
- // the same stage in the old pass manager (\ref addLateLTOOptimizationPasses).
- if (EnableHotColdSplit)
- MPM.addPass(HotColdSplittingPass());
-
- // Add late LTO optimization passes.
- // Delete basic blocks, which optimization passes may have killed.
- MPM.addPass(createModuleToFunctionPassAdaptor(
- SimplifyCFGPass(SimplifyCFGOptions().hoistCommonInsts(true))));
-
- // Drop bodies of available_externally objects to improve GlobalDCE.
- MPM.addPass(EliminateAvailableExternallyPass());
-
- // Now that we have optimized the program, discard unreachable functions.
- MPM.addPass(GlobalDCEPass());
-
- if (PTO.MergeFunctions)
- MPM.addPass(MergeFunctionsPass());
-
- // Emit annotation remarks.
- addAnnotationRemarksPass(MPM);
-
- return MPM;
-}
-
-ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level,
- bool LTOPreLink) {
- assert(Level == OptimizationLevel::O0 &&
- "buildO0DefaultPipeline should only be used with O0");
-
- ModulePassManager MPM;
-
- if (PGOOpt && (PGOOpt->Action == PGOOptions::IRInstr ||
- PGOOpt->Action == PGOOptions::IRUse))
- addPGOInstrPassesForO0(
- MPM,
- /* RunProfileGen */ (PGOOpt->Action == PGOOptions::IRInstr),
- /* IsCS */ false, PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile);
-
- for (auto &C : PipelineStartEPCallbacks)
- C(MPM, Level);
-
- if (PGOOpt && PGOOpt->DebugInfoForProfiling)
- MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));
-
- for (auto &C : PipelineEarlySimplificationEPCallbacks)
- C(MPM, Level);
-
- // Build a minimal pipeline based on the semantics required by LLVM,
- // which is just that always inlining occurs. Further, disable generating
- // lifetime intrinsics to avoid enabling further optimizations during
- // code generation.
- MPM.addPass(AlwaysInlinerPass(
- /*InsertLifetimeIntrinsics=*/false));
-
- if (PTO.MergeFunctions)
- MPM.addPass(MergeFunctionsPass());
-
- if (EnableMatrix)
- MPM.addPass(
- createModuleToFunctionPassAdaptor(LowerMatrixIntrinsicsPass(true)));
-
- if (!CGSCCOptimizerLateEPCallbacks.empty()) {
- CGSCCPassManager CGPM;
- for (auto &C : CGSCCOptimizerLateEPCallbacks)
- C(CGPM, Level);
- if (!CGPM.isEmpty())
- MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
- }
- if (!LateLoopOptimizationsEPCallbacks.empty()) {
- LoopPassManager LPM;
- for (auto &C : LateLoopOptimizationsEPCallbacks)
- C(LPM, Level);
- if (!LPM.isEmpty()) {
- MPM.addPass(createModuleToFunctionPassAdaptor(
- createFunctionToLoopPassAdaptor(std::move(LPM))));
- }
- }
- if (!LoopOptimizerEndEPCallbacks.empty()) {
- LoopPassManager LPM;
- for (auto &C : LoopOptimizerEndEPCallbacks)
- C(LPM, Level);
- if (!LPM.isEmpty()) {
- MPM.addPass(createModuleToFunctionPassAdaptor(
- createFunctionToLoopPassAdaptor(std::move(LPM))));
- }
- }
- if (!ScalarOptimizerLateEPCallbacks.empty()) {
- FunctionPassManager FPM;
- for (auto &C : ScalarOptimizerLateEPCallbacks)
- C(FPM, Level);
- if (!FPM.isEmpty())
- MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
- }
- if (!VectorizerStartEPCallbacks.empty()) {
- FunctionPassManager FPM;
- for (auto &C : VectorizerStartEPCallbacks)
- C(FPM, Level);
- if (!FPM.isEmpty())
- MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
- }
-
- MPM.addPass(createModuleToFunctionPassAdaptor(CoroEarlyPass()));
- CGSCCPassManager CGPM;
- CGPM.addPass(CoroSplitPass());
- MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
- MPM.addPass(createModuleToFunctionPassAdaptor(CoroCleanupPass()));
-
- for (auto &C : OptimizerLastEPCallbacks)
- C(MPM, Level);
-
- if (LTOPreLink)
- addRequiredLTOPreLinkPasses(MPM);
-
- return MPM;
-}
-
-AAManager PassBuilder::buildDefaultAAPipeline() {
- AAManager AA;
-
- // The order in which these are registered determines their priority when
- // being queried.
-
- // First we register the basic alias analysis that provides the majority of
- // per-function local AA logic. This is a stateless, on-demand local set of
- // AA techniques.
- AA.registerFunctionAnalysis<BasicAA>();
-
- // Next we query fast, specialized alias analyses that wrap IR-embedded
- // information about aliasing.
- AA.registerFunctionAnalysis<ScopedNoAliasAA>();
- AA.registerFunctionAnalysis<TypeBasedAA>();
-
- // Add support for querying global aliasing information when available.
- // Because the `AAManager` is a function analysis and `GlobalsAA` is a module
- // analysis, all that the `AAManager` can do is query for any *cached*
- // results from `GlobalsAA` through a readonly proxy.
- AA.registerModuleAnalysis<GlobalsAA>();
-
- // Add target-specific alias analyses.
- if (TM)
- TM->registerDefaultAliasAnalyses(AA);
-
- return AA;
-}
-
static Optional<int> parseRepeatPassName(StringRef Name) {
if (!Name.consume_front("repeat<") || !Name.consume_back(">"))
return None;
@@ -2140,6 +558,83 @@ Expected<LoopUnrollOptions> parseLoopUnrollOptions(StringRef Params) {
return UnrollOpts;
}
+Expected<bool> parseSinglePassOption(StringRef Params, StringRef OptionName,
+ StringRef PassName) {
+ bool Result = false;
+ while (!Params.empty()) {
+ StringRef ParamName;
+ std::tie(ParamName, Params) = Params.split(';');
+
+ if (ParamName == OptionName) {
+ Result = true;
+ } else {
+ return make_error<StringError>(
+ formatv("invalid {1} pass parameter '{0}' ", ParamName, PassName)
+ .str(),
+ inconvertibleErrorCode());
+ }
+ }
+ return Result;
+}
+
+Expected<bool> parseInlinerPassOptions(StringRef Params) {
+ return parseSinglePassOption(Params, "only-mandatory", "InlinerPass");
+}
+
+Expected<bool> parseEarlyCSEPassOptions(StringRef Params) {
+ return parseSinglePassOption(Params, "memssa", "EarlyCSE");
+}
+
+Expected<bool> parseEntryExitInstrumenterPassOptions(StringRef Params) {
+ return parseSinglePassOption(Params, "post-inline", "EntryExitInstrumenter");
+}
+
+Expected<bool> parseLoopExtractorPassOptions(StringRef Params) {
+ return parseSinglePassOption(Params, "single", "LoopExtractor");
+}
+
+Expected<bool> parseLowerMatrixIntrinsicsPassOptions(StringRef Params) {
+ return parseSinglePassOption(Params, "minimal", "LowerMatrixIntrinsics");
+}
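+
+// The parsers above back the textual 'pass<option>' pipeline syntax. A sketch
+// of how these options are spelled on an opt command line (pass names assumed
+// to match their PassRegistry.def registrations):
+//
+//   opt -passes='early-cse<memssa>' ...
+//   opt -passes='inline<only-mandatory>' ...
+//   opt -passes='lower-matrix-intrinsics<minimal>' ...
+//
+// An empty parameter list (plain 'early-cse') yields false; any other
+// parameter name is rejected with the "invalid ... pass parameter" error.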
+
+Expected<AddressSanitizerOptions> parseASanPassOptions(StringRef Params) {
+ AddressSanitizerOptions Result;
+ while (!Params.empty()) {
+ StringRef ParamName;
+ std::tie(ParamName, Params) = Params.split(';');
+
+ if (ParamName == "kernel") {
+ Result.CompileKernel = true;
+ } else {
+ return make_error<StringError>(
+ formatv("invalid AddressSanitizer pass parameter '{0}' ", ParamName)
+ .str(),
+ inconvertibleErrorCode());
+ }
+ }
+ return Result;
+}
+
+Expected<HWAddressSanitizerOptions> parseHWASanPassOptions(StringRef Params) {
+ HWAddressSanitizerOptions Result;
+ while (!Params.empty()) {
+ StringRef ParamName;
+ std::tie(ParamName, Params) = Params.split(';');
+
+ if (ParamName == "recover") {
+ Result.Recover = true;
+ } else if (ParamName == "kernel") {
+ Result.CompileKernel = true;
+ } else {
+ return make_error<StringError>(
+ formatv("invalid HWAddressSanitizer pass parameter '{0}' ", ParamName)
+ .str(),
+ inconvertibleErrorCode());
+ }
+ }
+ return Result;
+}
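+
+// Sanitizer parameters are ';'-separated, so several options can be combined
+// on one pass instance, e.g. (a sketch, assuming the usual pass
+// registrations):
+//
+//   opt -passes='asan<kernel>' ...
+//   opt -passes='hwasan<kernel;recover>' ...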
+
Expected<MemorySanitizerOptions> parseMSanPassOptions(StringRef Params) {
MemorySanitizerOptions Result;
while (!Params.empty()) {
@@ -2349,7 +844,7 @@ static bool isModulePassName(StringRef Name, CallbacksT &Callbacks) {
return true;
if (Name == "cgscc")
return true;
- if (Name == "function")
+ if (Name == "function" || Name == "function<eager-inv>")
return true;
// Explicitly handle custom-parsed pass names.
@@ -2359,6 +854,9 @@ static bool isModulePassName(StringRef Name, CallbacksT &Callbacks) {
#define MODULE_PASS(NAME, CREATE_PASS) \
if (Name == NAME) \
return true;
+#define MODULE_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \
+ if (checkParametrizedPassName(Name, NAME)) \
+ return true;
#define MODULE_ANALYSIS(NAME, CREATE_PASS) \
if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">") \
return true;
@@ -2372,7 +870,7 @@ static bool isCGSCCPassName(StringRef Name, CallbacksT &Callbacks) {
// Explicitly handle pass manager names.
if (Name == "cgscc")
return true;
- if (Name == "function")
+ if (Name == "function" || Name == "function<eager-inv>")
return true;
// Explicitly handle custom-parsed pass names.
@@ -2384,6 +882,9 @@ static bool isCGSCCPassName(StringRef Name, CallbacksT &Callbacks) {
#define CGSCC_PASS(NAME, CREATE_PASS) \
if (Name == NAME) \
return true;
+#define CGSCC_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \
+ if (checkParametrizedPassName(Name, NAME)) \
+ return true;
#define CGSCC_ANALYSIS(NAME, CREATE_PASS) \
if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">") \
return true;
@@ -2395,7 +896,7 @@ static bool isCGSCCPassName(StringRef Name, CallbacksT &Callbacks) {
template <typename CallbacksT>
static bool isFunctionPassName(StringRef Name, CallbacksT &Callbacks) {
// Explicitly handle pass manager names.
- if (Name == "function")
+ if (Name == "function" || Name == "function<eager-inv>")
return true;
if (Name == "loop" || Name == "loop-mssa")
return true;
@@ -2419,15 +920,41 @@ static bool isFunctionPassName(StringRef Name, CallbacksT &Callbacks) {
}
template <typename CallbacksT>
-static bool isLoopPassName(StringRef Name, CallbacksT &Callbacks) {
- // Explicitly handle pass manager names.
- if (Name == "loop" || Name == "loop-mssa")
+static bool isLoopNestPassName(StringRef Name, CallbacksT &Callbacks,
+ bool &UseMemorySSA) {
+ UseMemorySSA = false;
+
+ // Explicitly handle custom-parsed pass names.
+ if (parseRepeatPassName(Name))
return true;
+ if (Name == "lnicm") {
+ UseMemorySSA = true;
+ return true;
+ }
+
+#define LOOPNEST_PASS(NAME, CREATE_PASS) \
+ if (Name == NAME) \
+ return true;
+#include "PassRegistry.def"
+
+ return callbacksAcceptPassName<LoopPassManager>(Name, Callbacks);
+}
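+
+// Both this predicate and isLoopPassName below report, via UseMemorySSA,
+// whether the named pass needs a MemorySSA-enabled loop adaptor. When a bare
+// 'licm' (or 'lnicm') is the first pass of a top-level pipeline string, the
+// parser therefore wraps it as (sketch):
+//
+//   opt -passes=licm ...      // parsed as function(loop-mssa(licm))
+//
+// rather than the plain 'loop(...)' nesting used for other loop passes.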
+
+template <typename CallbacksT>
+static bool isLoopPassName(StringRef Name, CallbacksT &Callbacks,
+ bool &UseMemorySSA) {
+ UseMemorySSA = false;
+
// Explicitly handle custom-parsed pass names.
if (parseRepeatPassName(Name))
return true;
+ if (Name == "licm") {
+ UseMemorySSA = true;
+ return true;
+ }
+
#define LOOP_PASS(NAME, CREATE_PASS) \
if (Name == NAME) \
return true;
@@ -2520,11 +1047,12 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
return Error::success();
}
- if (Name == "function") {
+ if (Name == "function" || Name == "function<eager-inv>") {
FunctionPassManager FPM;
if (auto Err = parseFunctionPassPipeline(FPM, InnerPipeline))
return Err;
- MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM),
+ Name != "function"));
return Error::success();
}
if (auto Count = parseRepeatPassName(Name)) {
@@ -2599,6 +1127,14 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
MPM.addPass(CREATE_PASS); \
return Error::success(); \
}
+#define MODULE_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \
+ if (checkParametrizedPassName(Name, NAME)) { \
+ auto Params = parsePassParameters(PARSER, Name, NAME); \
+ if (!Params) \
+ return Params.takeError(); \
+ MPM.addPass(CREATE_PASS(Params.get())); \
+ return Error::success(); \
+ }
#define MODULE_ANALYSIS(NAME, CREATE_PASS) \
if (Name == "require<" NAME ">") { \
MPM.addPass( \
@@ -2616,6 +1152,15 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(CREATE_PASS)); \
return Error::success(); \
}
+#define CGSCC_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \
+ if (checkParametrizedPassName(Name, NAME)) { \
+ auto Params = parsePassParameters(PARSER, Name, NAME); \
+ if (!Params) \
+ return Params.takeError(); \
+ MPM.addPass( \
+ createModuleToPostOrderCGSCCPassAdaptor(CREATE_PASS(Params.get()))); \
+ return Error::success(); \
+ }
#define FUNCTION_PASS(NAME, CREATE_PASS) \
if (Name == NAME) { \
MPM.addPass(createModuleToFunctionPassAdaptor(CREATE_PASS)); \
@@ -2629,6 +1174,12 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
MPM.addPass(createModuleToFunctionPassAdaptor(CREATE_PASS(Params.get()))); \
return Error::success(); \
}
+#define LOOPNEST_PASS(NAME, CREATE_PASS) \
+ if (Name == NAME) { \
+ MPM.addPass(createModuleToFunctionPassAdaptor( \
+ createFunctionToLoopPassAdaptor(CREATE_PASS, false, false))); \
+ return Error::success(); \
+ }
#define LOOP_PASS(NAME, CREATE_PASS) \
if (Name == NAME) { \
MPM.addPass(createModuleToFunctionPassAdaptor( \
@@ -2670,12 +1221,13 @@ Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
CGPM.addPass(std::move(NestedCGPM));
return Error::success();
}
- if (Name == "function") {
+ if (Name == "function" || Name == "function<eager-inv>") {
FunctionPassManager FPM;
if (auto Err = parseFunctionPassPipeline(FPM, InnerPipeline))
return Err;
// Add the nested pass manager with the appropriate adaptor.
- CGPM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
+ CGPM.addPass(
+ createCGSCCToFunctionPassAdaptor(std::move(FPM), Name != "function"));
return Error::success();
}
if (auto Count = parseRepeatPassName(Name)) {
@@ -2710,6 +1262,14 @@ Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
CGPM.addPass(CREATE_PASS); \
return Error::success(); \
}
+#define CGSCC_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \
+ if (checkParametrizedPassName(Name, NAME)) { \
+ auto Params = parsePassParameters(PARSER, Name, NAME); \
+ if (!Params) \
+ return Params.takeError(); \
+ CGPM.addPass(CREATE_PASS(Params.get())); \
+ return Error::success(); \
+ }
#define CGSCC_ANALYSIS(NAME, CREATE_PASS) \
if (Name == "require<" NAME ">") { \
CGPM.addPass(RequireAnalysisPass< \
@@ -2736,6 +1296,12 @@ Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
CGPM.addPass(createCGSCCToFunctionPassAdaptor(CREATE_PASS(Params.get()))); \
return Error::success(); \
}
+#define LOOPNEST_PASS(NAME, CREATE_PASS) \
+ if (Name == NAME) { \
+ CGPM.addPass(createCGSCCToFunctionPassAdaptor( \
+ createFunctionToLoopPassAdaptor(CREATE_PASS, false, false))); \
+ return Error::success(); \
+ }
#define LOOP_PASS(NAME, CREATE_PASS) \
if (Name == NAME) { \
CGPM.addPass(createCGSCCToFunctionPassAdaptor( \
@@ -2785,8 +1351,11 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
bool UseMemorySSA = (Name == "loop-mssa");
bool UseBFI = llvm::any_of(
InnerPipeline, [](auto Pipeline) { return Pipeline.Name == "licm"; });
+ bool UseBPI = llvm::any_of(InnerPipeline, [](auto Pipeline) {
+ return Pipeline.Name == "loop-predication";
+ });
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM), UseMemorySSA,
- UseBFI));
+ UseBFI, UseBPI));
return Error::success();
}
if (auto Count = parseRepeatPassName(Name)) {
@@ -2837,6 +1406,11 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
// bool UseMemorySSA = !("canon-freeze" || "loop-predication" ||
// "guard-widening");
// The risk is that it may become obsolete if we're not careful.
+#define LOOPNEST_PASS(NAME, CREATE_PASS) \
+ if (Name == NAME) { \
+ FPM.addPass(createFunctionToLoopPassAdaptor(CREATE_PASS, false, false)); \
+ return Error::success(); \
+ }
#define LOOP_PASS(NAME, CREATE_PASS) \
if (Name == NAME) { \
FPM.addPass(createFunctionToLoopPassAdaptor(CREATE_PASS, false, false)); \
@@ -2895,6 +1469,11 @@ Error PassBuilder::parseLoopPass(LoopPassManager &LPM,
}
// Now expand the basic registered passes from the .inc file.
+#define LOOPNEST_PASS(NAME, CREATE_PASS) \
+ if (Name == NAME) { \
+ LPM.addPass(CREATE_PASS); \
+ return Error::success(); \
+ }
#define LOOP_PASS(NAME, CREATE_PASS) \
if (Name == NAME) { \
LPM.addPass(CREATE_PASS); \
@@ -3016,13 +1595,20 @@ Error PassBuilder::parsePassPipeline(ModulePassManager &MPM,
StringRef FirstName = Pipeline->front().Name;
if (!isModulePassName(FirstName, ModulePipelineParsingCallbacks)) {
+ bool UseMemorySSA;
if (isCGSCCPassName(FirstName, CGSCCPipelineParsingCallbacks)) {
Pipeline = {{"cgscc", std::move(*Pipeline)}};
} else if (isFunctionPassName(FirstName,
FunctionPipelineParsingCallbacks)) {
Pipeline = {{"function", std::move(*Pipeline)}};
- } else if (isLoopPassName(FirstName, LoopPipelineParsingCallbacks)) {
- Pipeline = {{"function", {{"loop", std::move(*Pipeline)}}}};
+ } else if (isLoopNestPassName(FirstName, LoopPipelineParsingCallbacks,
+ UseMemorySSA)) {
+ Pipeline = {{"function", {{UseMemorySSA ? "loop-mssa" : "loop",
+ std::move(*Pipeline)}}}};
+ } else if (isLoopPassName(FirstName, LoopPipelineParsingCallbacks,
+ UseMemorySSA)) {
+ Pipeline = {{"function", {{UseMemorySSA ? "loop-mssa" : "loop",
+ std::move(*Pipeline)}}}};
} else {
for (auto &C : TopLevelPipelineParsingCallbacks)
if (C(MPM, *Pipeline))
@@ -3172,6 +1758,11 @@ void PassBuilder::printPassNames(raw_ostream &OS) {
#define MODULE_PASS(NAME, CREATE_PASS) printPassName(NAME, OS);
#include "PassRegistry.def"
+ OS << "Module passes with params:\n";
+#define MODULE_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \
+ printPassName(NAME, PARAMS, OS);
+#include "PassRegistry.def"
+
OS << "Module analyses:\n";
#define MODULE_ANALYSIS(NAME, CREATE_PASS) printPassName(NAME, OS);
#include "PassRegistry.def"
@@ -3184,6 +1775,11 @@ void PassBuilder::printPassNames(raw_ostream &OS) {
#define CGSCC_PASS(NAME, CREATE_PASS) printPassName(NAME, OS);
#include "PassRegistry.def"
+ OS << "CGSCC passes with params:\n";
+#define CGSCC_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \
+ printPassName(NAME, PARAMS, OS);
+#include "PassRegistry.def"
+
OS << "CGSCC analyses:\n";
#define CGSCC_ANALYSIS(NAME, CREATE_PASS) printPassName(NAME, OS);
#include "PassRegistry.def"
@@ -3205,6 +1801,10 @@ void PassBuilder::printPassNames(raw_ostream &OS) {
#define FUNCTION_ALIAS_ANALYSIS(NAME, CREATE_PASS) printPassName(NAME, OS);
#include "PassRegistry.def"
+ OS << "LoopNest passes:\n";
+#define LOOPNEST_PASS(NAME, CREATE_PASS) printPassName(NAME, OS);
+#include "PassRegistry.def"
+
OS << "Loop passes:\n";
#define LOOP_PASS(NAME, CREATE_PASS) printPassName(NAME, OS);
#include "PassRegistry.def"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
new file mode 100644
index 000000000000..ac5dfdbdd540
--- /dev/null
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -0,0 +1,1798 @@
+//===- Construction of pass pipelines -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file provides the implementation of the PassBuilder based on our
+/// static pass registry as well as related functionality. It also provides
+/// helpers to aid in analyzing, debugging, and testing passes and pass
+/// pipelines.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CGSCCPassManager.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InlineAdvisor.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
+#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Passes/OptimizationLevel.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/PGOOptions.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
+#include "llvm/Transforms/Coroutines/CoroCleanup.h"
+#include "llvm/Transforms/Coroutines/CoroEarly.h"
+#include "llvm/Transforms/Coroutines/CoroElide.h"
+#include "llvm/Transforms/Coroutines/CoroSplit.h"
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/Annotation2Metadata.h"
+#include "llvm/Transforms/IPO/ArgumentPromotion.h"
+#include "llvm/Transforms/IPO/Attributor.h"
+#include "llvm/Transforms/IPO/CalledValuePropagation.h"
+#include "llvm/Transforms/IPO/ConstantMerge.h"
+#include "llvm/Transforms/IPO/CrossDSOCFI.h"
+#include "llvm/Transforms/IPO/DeadArgumentElimination.h"
+#include "llvm/Transforms/IPO/ElimAvailExtern.h"
+#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
+#include "llvm/Transforms/IPO/GlobalDCE.h"
+#include "llvm/Transforms/IPO/GlobalOpt.h"
+#include "llvm/Transforms/IPO/GlobalSplit.h"
+#include "llvm/Transforms/IPO/HotColdSplitting.h"
+#include "llvm/Transforms/IPO/IROutliner.h"
+#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
+#include "llvm/Transforms/IPO/Inliner.h"
+#include "llvm/Transforms/IPO/LowerTypeTests.h"
+#include "llvm/Transforms/IPO/MergeFunctions.h"
+#include "llvm/Transforms/IPO/ModuleInliner.h"
+#include "llvm/Transforms/IPO/OpenMPOpt.h"
+#include "llvm/Transforms/IPO/PartialInlining.h"
+#include "llvm/Transforms/IPO/SCCP.h"
+#include "llvm/Transforms/IPO/SampleProfile.h"
+#include "llvm/Transforms/IPO/SampleProfileProbe.h"
+#include "llvm/Transforms/IPO/SyntheticCountsPropagation.h"
+#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
+#include "llvm/Transforms/InstCombine/InstCombine.h"
+#include "llvm/Transforms/Instrumentation/CGProfile.h"
+#include "llvm/Transforms/Instrumentation/ControlHeightReduction.h"
+#include "llvm/Transforms/Instrumentation/InstrOrderFile.h"
+#include "llvm/Transforms/Instrumentation/InstrProfiling.h"
+#include "llvm/Transforms/Instrumentation/MemProfiler.h"
+#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
+#include "llvm/Transforms/Scalar/ADCE.h"
+#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
+#include "llvm/Transforms/Scalar/AnnotationRemarks.h"
+#include "llvm/Transforms/Scalar/BDCE.h"
+#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
+#include "llvm/Transforms/Scalar/ConstraintElimination.h"
+#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
+#include "llvm/Transforms/Scalar/DFAJumpThreading.h"
+#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
+#include "llvm/Transforms/Scalar/DivRemPairs.h"
+#include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/Transforms/Scalar/Float2Int.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/IndVarSimplify.h"
+#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
+#include "llvm/Transforms/Scalar/JumpThreading.h"
+#include "llvm/Transforms/Scalar/LICM.h"
+#include "llvm/Transforms/Scalar/LoopDeletion.h"
+#include "llvm/Transforms/Scalar/LoopDistribute.h"
+#include "llvm/Transforms/Scalar/LoopFlatten.h"
+#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
+#include "llvm/Transforms/Scalar/LoopInstSimplify.h"
+#include "llvm/Transforms/Scalar/LoopInterchange.h"
+#include "llvm/Transforms/Scalar/LoopLoadElimination.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Scalar/LoopRotation.h"
+#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
+#include "llvm/Transforms/Scalar/LoopSink.h"
+#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
+#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
+#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
+#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
+#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
+#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
+#include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
+#include "llvm/Transforms/Scalar/NewGVN.h"
+#include "llvm/Transforms/Scalar/Reassociate.h"
+#include "llvm/Transforms/Scalar/SCCP.h"
+#include "llvm/Transforms/Scalar/SROA.h"
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
+#include "llvm/Transforms/Scalar/SpeculativeExecution.h"
+#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
+#include "llvm/Transforms/Scalar/WarnMissedTransforms.h"
+#include "llvm/Transforms/Utils/AddDiscriminators.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
+#include "llvm/Transforms/Utils/InjectTLIMappings.h"
+#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
+#include "llvm/Transforms/Utils/Mem2Reg.h"
+#include "llvm/Transforms/Utils/NameAnonGlobals.h"
+#include "llvm/Transforms/Utils/RelLookupTableConverter.h"
+#include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
+#include "llvm/Transforms/Vectorize/LoopVectorize.h"
+#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
+#include "llvm/Transforms/Vectorize/VectorCombine.h"
+
+using namespace llvm;
+
+static cl::opt<InliningAdvisorMode> UseInlineAdvisor(
+ "enable-ml-inliner", cl::init(InliningAdvisorMode::Default), cl::Hidden,
+ cl::desc("Enable ML policy for inliner. Currently trained for -Oz only"),
+ cl::values(clEnumValN(InliningAdvisorMode::Default, "default",
+ "Heuristics-based inliner version."),
+ clEnumValN(InliningAdvisorMode::Development, "development",
+ "Use development mode (runtime-loadable model)."),
+ clEnumValN(InliningAdvisorMode::Release, "release",
+ "Use release mode (AOT-compiled model).")));
+
+static cl::opt<bool> EnableSyntheticCounts(
+ "enable-npm-synthetic-counts", cl::init(false), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Run synthetic function entry count generation "
+ "pass"));
+
+/// Flag to enable inline deferral during PGO.
+static cl::opt<bool>
+ EnablePGOInlineDeferral("enable-npm-pgo-inline-deferral", cl::init(true),
+ cl::Hidden,
+ cl::desc("Enable inline deferral during PGO"));
+
+static cl::opt<bool> EnableMemProfiler("enable-mem-prof", cl::init(false),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Enable memory profiler"));
+
+static cl::opt<bool> EnableModuleInliner("enable-module-inliner",
+ cl::init(false), cl::Hidden,
+ cl::desc("Enable module inliner"));
+
+static cl::opt<bool> PerformMandatoryInliningsFirst(
+ "mandatory-inlining-first", cl::init(true), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Perform mandatory inlinings module-wide, before performing "
+ "inlining."));
+
+static cl::opt<bool> EnableO3NonTrivialUnswitching(
+ "enable-npm-O3-nontrivial-unswitch", cl::init(true), cl::Hidden,
+ cl::ZeroOrMore, cl::desc("Enable non-trivial loop unswitching for -O3"));
+
+static cl::opt<bool> EnableEagerlyInvalidateAnalyses(
+ "eagerly-invalidate-analyses", cl::init(true), cl::Hidden,
+ cl::desc("Eagerly invalidate more analyses in default pipelines"));
+
+static cl::opt<bool> EnableNoRerunSimplificationPipeline(
+ "enable-no-rerun-simplification-pipeline", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Prevent running the simplification pipeline on a function more "
+ "than once in the case that SCC mutations cause a function to be "
+ "visited multiple times as long as the function has not been changed"));
+
+PipelineTuningOptions::PipelineTuningOptions() {
+ LoopInterleaving = true;
+ LoopVectorization = true;
+ SLPVectorization = false;
+ LoopUnrolling = true;
+ ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll;
+ LicmMssaOptCap = SetLicmMssaOptCap;
+ LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap;
+ CallGraphProfile = true;
+ MergeFunctions = false;
+ EagerlyInvalidateAnalyses = EnableEagerlyInvalidateAnalyses;
+}
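+
+// A minimal sketch of how a front end might override these defaults before
+// building a pipeline (assuming the usual PassBuilder constructor that takes
+// a TargetMachine and PipelineTuningOptions):
+//
+//   PipelineTuningOptions PTO;
+//   PTO.SLPVectorization = true; // off by default above
+//   PTO.MergeFunctions = true;   // off by default above
+//   PassBuilder PB(TM, PTO);
+//   ModulePassManager MPM =
+//       PB.buildPerModuleDefaultPipeline(OptimizationLevel::O2);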
+
+namespace llvm {
+
+extern cl::opt<unsigned> MaxDevirtIterations;
+extern cl::opt<bool> EnableConstraintElimination;
+extern cl::opt<bool> EnableFunctionSpecialization;
+extern cl::opt<bool> EnableGVNHoist;
+extern cl::opt<bool> EnableGVNSink;
+extern cl::opt<bool> EnableHotColdSplit;
+extern cl::opt<bool> EnableIROutliner;
+extern cl::opt<bool> EnableOrderFileInstrumentation;
+extern cl::opt<bool> EnableCHR;
+extern cl::opt<bool> EnableLoopInterchange;
+extern cl::opt<bool> EnableUnrollAndJam;
+extern cl::opt<bool> EnableLoopFlatten;
+extern cl::opt<bool> EnableDFAJumpThreading;
+extern cl::opt<bool> RunNewGVN;
+extern cl::opt<bool> RunPartialInlining;
+extern cl::opt<bool> ExtraVectorizerPasses;
+
+extern cl::opt<bool> FlattenedProfileUsed;
+
+extern cl::opt<AttributorRunOption> AttributorRun;
+extern cl::opt<bool> EnableKnowledgeRetention;
+
+extern cl::opt<bool> EnableMatrix;
+
+extern cl::opt<bool> DisablePreInliner;
+extern cl::opt<int> PreInlineThreshold;
+} // namespace llvm
+
+void PassBuilder::invokePeepholeEPCallbacks(FunctionPassManager &FPM,
+ OptimizationLevel Level) {
+ for (auto &C : PeepholeEPCallbacks)
+ C(FPM, Level);
+}
+
+// Helper to add AnnotationRemarksPass.
+static void addAnnotationRemarksPass(ModulePassManager &MPM) {
+ FunctionPassManager FPM;
+ FPM.addPass(AnnotationRemarksPass());
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+}
+
+// Helper to check if the current compilation phase is preparing for LTO
+static bool isLTOPreLink(ThinOrFullLTOPhase Phase) {
+ return Phase == ThinOrFullLTOPhase::ThinLTOPreLink ||
+ Phase == ThinOrFullLTOPhase::FullLTOPreLink;
+}
+
+// TODO: Investigate the cost/benefit of tail call elimination on debugging.
+FunctionPassManager
+PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
+ ThinOrFullLTOPhase Phase) {
+
+ FunctionPassManager FPM;
+
+ // Form SSA out of local memory accesses after breaking apart aggregates into
+ // scalars.
+ FPM.addPass(SROAPass());
+
+ // Catch trivial redundancies
+ FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */));
+
+ // Hoisting of scalars and load expressions.
+ FPM.addPass(SimplifyCFGPass());
+ FPM.addPass(InstCombinePass());
+
+ FPM.addPass(LibCallsShrinkWrapPass());
+
+ invokePeepholeEPCallbacks(FPM, Level);
+
+ FPM.addPass(SimplifyCFGPass());
+
+ // Form canonically associated expression trees, and simplify the trees using
+ // basic mathematical properties. For example, this will form (nearly)
+ // minimal multiplication trees.
+ FPM.addPass(ReassociatePass());
+
+ // Add the primary loop simplification pipeline.
+ // FIXME: Currently this is split into two loop pass pipelines because we run
+ // some function passes in between them. These can and should be removed
+ // and/or replaced by scheduling the loop pass equivalents in the correct
+ // positions. But those equivalent passes aren't powerful enough yet.
+ // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
+  // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough to
+ // fully replace `SimplifyCFGPass`, and the closest to the other we have is
+ // `LoopInstSimplify`.
+ LoopPassManager LPM1, LPM2;
+
+ // Simplify the loop body. We do this initially to clean up after other loop
+ // passes run, either when iterating on a loop or on inner loops with
+ // implications on the outer loop.
+ LPM1.addPass(LoopInstSimplifyPass());
+ LPM1.addPass(LoopSimplifyCFGPass());
+
+ // Try to remove as much code from the loop header as possible,
+  // to reduce the amount of IR that will have to be duplicated.
+ // TODO: Investigate promotion cap for O1.
+ LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
+
+ LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true,
+ isLTOPreLink(Phase)));
+ // TODO: Investigate promotion cap for O1.
+ LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
+ LPM1.addPass(SimpleLoopUnswitchPass());
+
+ LPM2.addPass(LoopIdiomRecognizePass());
+ LPM2.addPass(IndVarSimplifyPass());
+
+ for (auto &C : LateLoopOptimizationsEPCallbacks)
+ C(LPM2, Level);
+
+ LPM2.addPass(LoopDeletionPass());
+
+ if (EnableLoopInterchange)
+ LPM2.addPass(LoopInterchangePass());
+
+  // Do not enable unrolling in the PreLinkThinLTO phase during sample PGO
+  // because it changes the IR and makes profile annotation in the backend
+  // compile inaccurate. The normal unroller doesn't pay attention to forced
+  // full unroll attributes, so we need to make sure to let the full unroll
+  // pass pay attention to them.
+ if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
+ PGOOpt->Action != PGOOptions::SampleUse)
+ LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
+ /* OnlyWhenForced= */ !PTO.LoopUnrolling,
+ PTO.ForgetAllSCEVInLoopUnroll));
+
+ for (auto &C : LoopOptimizerEndEPCallbacks)
+ C(LPM2, Level);
+
+ // We provide the opt remark emitter pass for LICM to use. We only need to do
+ // this once as it is immutable.
+ FPM.addPass(
+ RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
+ FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
+ /*UseMemorySSA=*/true,
+ /*UseBlockFrequencyInfo=*/true));
+ FPM.addPass(SimplifyCFGPass());
+ FPM.addPass(InstCombinePass());
+ if (EnableLoopFlatten)
+ FPM.addPass(createFunctionToLoopPassAdaptor(LoopFlattenPass()));
+ // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA.
+ // *All* loop passes must preserve it, in order to be able to use it.
+ FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
+ /*UseMemorySSA=*/false,
+ /*UseBlockFrequencyInfo=*/false));
+
+ // Delete small array after loop unroll.
+ FPM.addPass(SROAPass());
+
+ // Specially optimize memory movement as it doesn't look like dataflow in SSA.
+ FPM.addPass(MemCpyOptPass());
+
+ // Sparse conditional constant propagation.
+ // FIXME: It isn't clear why we do this *after* loop passes rather than
+ // before...
+ FPM.addPass(SCCPPass());
+
+ // Delete dead bit computations (instcombine runs after to fold away the dead
+ // computations, and then ADCE will run later to exploit any new DCE
+ // opportunities that creates).
+ FPM.addPass(BDCEPass());
+
+ // Run instcombine after redundancy and dead bit elimination to exploit
+ // opportunities opened up by them.
+ FPM.addPass(InstCombinePass());
+ invokePeepholeEPCallbacks(FPM, Level);
+
+ FPM.addPass(CoroElidePass());
+
+ for (auto &C : ScalarOptimizerLateEPCallbacks)
+ C(FPM, Level);
+
+ // Finally, do an expensive DCE pass to catch all the dead code exposed by
+ // the simplifications and basic cleanup after all the simplifications.
+ // TODO: Investigate if this is too expensive.
+ FPM.addPass(ADCEPass());
+ FPM.addPass(SimplifyCFGPass());
+ FPM.addPass(InstCombinePass());
+ invokePeepholeEPCallbacks(FPM, Level);
+
+ return FPM;
+}
+
+FunctionPassManager
+PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
+ ThinOrFullLTOPhase Phase) {
+ assert(Level != OptimizationLevel::O0 && "Must request optimizations!");
+
+ // The O1 pipeline has a separate pipeline creation function to simplify
+ // construction readability.
+ if (Level.getSpeedupLevel() == 1)
+ return buildO1FunctionSimplificationPipeline(Level, Phase);
+
+ FunctionPassManager FPM;
+
+ // Form SSA out of local memory accesses after breaking apart aggregates into
+ // scalars.
+ FPM.addPass(SROAPass());
+
+ // Catch trivial redundancies
+ FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */));
+ if (EnableKnowledgeRetention)
+ FPM.addPass(AssumeSimplifyPass());
+
+ // Hoisting of scalars and load expressions.
+ if (EnableGVNHoist)
+ FPM.addPass(GVNHoistPass());
+
+ // Global value numbering based sinking.
+ if (EnableGVNSink) {
+ FPM.addPass(GVNSinkPass());
+ FPM.addPass(SimplifyCFGPass());
+ }
+
+ if (EnableConstraintElimination)
+ FPM.addPass(ConstraintEliminationPass());
+
+ // Speculative execution if the target has divergent branches; otherwise nop.
+ FPM.addPass(SpeculativeExecutionPass(/* OnlyIfDivergentTarget =*/true));
+
+ // Optimize based on known information about branches, and cleanup afterward.
+ FPM.addPass(JumpThreadingPass());
+ FPM.addPass(CorrelatedValuePropagationPass());
+
+ FPM.addPass(SimplifyCFGPass());
+ if (Level == OptimizationLevel::O3)
+ FPM.addPass(AggressiveInstCombinePass());
+ FPM.addPass(InstCombinePass());
+
+ if (!Level.isOptimizingForSize())
+ FPM.addPass(LibCallsShrinkWrapPass());
+
+ invokePeepholeEPCallbacks(FPM, Level);
+
+ // For PGO use pipeline, try to optimize memory intrinsics such as memcpy
+ // using the size value profile. Don't perform this when optimizing for size.
+ if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse &&
+ !Level.isOptimizingForSize())
+ FPM.addPass(PGOMemOPSizeOpt());
+
+ FPM.addPass(TailCallElimPass());
+ FPM.addPass(SimplifyCFGPass());
+
+ // Form canonically associated expression trees, and simplify the trees using
+ // basic mathematical properties. For example, this will form (nearly)
+ // minimal multiplication trees.
+ FPM.addPass(ReassociatePass());
+
+ // Add the primary loop simplification pipeline.
+ // FIXME: Currently this is split into two loop pass pipelines because we run
+ // some function passes in between them. These can and should be removed
+ // and/or replaced by scheduling the loop pass equivalents in the correct
+ // positions. But those equivalent passes aren't powerful enough yet.
+ // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
+ // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to
+  // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough to
+ // `LoopInstSimplify`.
+ LoopPassManager LPM1, LPM2;
+
+ // Simplify the loop body. We do this initially to clean up after other loop
+ // passes run, either when iterating on a loop or on inner loops with
+ // implications on the outer loop.
+ LPM1.addPass(LoopInstSimplifyPass());
+ LPM1.addPass(LoopSimplifyCFGPass());
+
+ // Try to remove as much code from the loop header as possible,
+  // to reduce the amount of IR that will have to be duplicated.
+ // TODO: Investigate promotion cap for O1.
+ LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
+
+ // Disable header duplication in loop rotation at -Oz.
+ LPM1.addPass(
+ LoopRotatePass(Level != OptimizationLevel::Oz, isLTOPreLink(Phase)));
+ // TODO: Investigate promotion cap for O1.
+ LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
+ LPM1.addPass(
+ SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3 &&
+ EnableO3NonTrivialUnswitching));
+ LPM2.addPass(LoopIdiomRecognizePass());
+ LPM2.addPass(IndVarSimplifyPass());
+
+ for (auto &C : LateLoopOptimizationsEPCallbacks)
+ C(LPM2, Level);
+
+ LPM2.addPass(LoopDeletionPass());
+
+ if (EnableLoopInterchange)
+ LPM2.addPass(LoopInterchangePass());
+
+  // Do not enable unrolling in the PreLinkThinLTO phase during sample PGO
+  // because it changes the IR and makes profile annotation in the backend
+  // compile inaccurate. The normal unroller doesn't pay attention to forced
+  // full unroll attributes, so we need to make sure to let the full unroll
+  // pass pay attention to them.
+ if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
+ PGOOpt->Action != PGOOptions::SampleUse)
+ LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
+ /* OnlyWhenForced= */ !PTO.LoopUnrolling,
+ PTO.ForgetAllSCEVInLoopUnroll));
+
+ for (auto &C : LoopOptimizerEndEPCallbacks)
+ C(LPM2, Level);
+
+ // We provide the opt remark emitter pass for LICM to use. We only need to do
+ // this once as it is immutable.
+ FPM.addPass(
+ RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
+ FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
+ /*UseMemorySSA=*/true,
+ /*UseBlockFrequencyInfo=*/true));
+ FPM.addPass(SimplifyCFGPass());
+ FPM.addPass(InstCombinePass());
+ if (EnableLoopFlatten)
+ FPM.addPass(createFunctionToLoopPassAdaptor(LoopFlattenPass()));
+ // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass,
+ // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA.
+ // *All* loop passes must preserve it, in order to be able to use it.
+ FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
+ /*UseMemorySSA=*/false,
+ /*UseBlockFrequencyInfo=*/false));
+
+ // Delete small array after loop unroll.
+ FPM.addPass(SROAPass());
+
+ // The matrix extension can introduce large vector operations early, which can
+ // benefit from running vector-combine early on.
+ if (EnableMatrix)
+ FPM.addPass(VectorCombinePass(/*ScalarizationOnly=*/true));
+
+ // Eliminate redundancies.
+ FPM.addPass(MergedLoadStoreMotionPass());
+ if (RunNewGVN)
+ FPM.addPass(NewGVNPass());
+ else
+ FPM.addPass(GVNPass());
+
+ // Sparse conditional constant propagation.
+ // FIXME: It isn't clear why we do this *after* loop passes rather than
+ // before...
+ FPM.addPass(SCCPPass());
+
+ // Delete dead bit computations (instcombine runs after to fold away the dead
+ // computations, and then ADCE will run later to exploit any new DCE
+ // opportunities that creates).
+ FPM.addPass(BDCEPass());
+
+ // Run instcombine after redundancy and dead bit elimination to exploit
+ // opportunities opened up by them.
+ FPM.addPass(InstCombinePass());
+ invokePeepholeEPCallbacks(FPM, Level);
+
+ // Re-consider control flow based optimizations after redundancy elimination,
+ // redo DCE, etc.
+ if (EnableDFAJumpThreading && Level.getSizeLevel() == 0)
+ FPM.addPass(DFAJumpThreadingPass());
+
+ FPM.addPass(JumpThreadingPass());
+ FPM.addPass(CorrelatedValuePropagationPass());
+
+ // Finally, do an expensive DCE pass to catch all the dead code exposed by
+ // the simplifications and basic cleanup after all the simplifications.
+ // TODO: Investigate if this is too expensive.
+ FPM.addPass(ADCEPass());
+
+ // Specially optimize memory movement as it doesn't look like dataflow in SSA.
+ FPM.addPass(MemCpyOptPass());
+
+ FPM.addPass(DSEPass());
+ FPM.addPass(createFunctionToLoopPassAdaptor(
+ LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
+ /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true));
+
+ FPM.addPass(CoroElidePass());
+
+ for (auto &C : ScalarOptimizerLateEPCallbacks)
+ C(FPM, Level);
+
+ FPM.addPass(SimplifyCFGPass(
+ SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true)));
+ FPM.addPass(InstCombinePass());
+ invokePeepholeEPCallbacks(FPM, Level);
+
+ if (EnableCHR && Level == OptimizationLevel::O3 && PGOOpt &&
+ (PGOOpt->Action == PGOOptions::IRUse ||
+ PGOOpt->Action == PGOOptions::SampleUse))
+ FPM.addPass(ControlHeightReductionPass());
+
+ return FPM;
+}
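+
+// The exact sequence scheduled above can be observed without reading the
+// code, e.g. via the new pass manager's debug output (a sketch):
+//
+//   opt -passes='default<O2>' -debug-pass-manager -disable-output input.ll
+//
+// which prints each pass as it runs, including this function simplification
+// pipeline nested inside the inliner's CGSCC walk.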
+
+void PassBuilder::addRequiredLTOPreLinkPasses(ModulePassManager &MPM) {
+ MPM.addPass(CanonicalizeAliasesPass());
+ MPM.addPass(NameAnonGlobalPass());
+}
+
+void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM,
+ OptimizationLevel Level, bool RunProfileGen,
+ bool IsCS, std::string ProfileFile,
+ std::string ProfileRemappingFile) {
+ assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!");
+ if (!IsCS && !DisablePreInliner) {
+ InlineParams IP;
+
+ IP.DefaultThreshold = PreInlineThreshold;
+
+    // FIXME: The hint threshold has the same value used by the regular inliner
+    // when not optimizing for size. This should probably be lowered after
+    // performance testing.
+    // FIXME: this comment is cargo-culted from the old pass manager; revisit.
+ IP.HintThreshold = Level.isOptimizingForSize() ? PreInlineThreshold : 325;
+ ModuleInlinerWrapperPass MIWP(IP);
+ CGSCCPassManager &CGPipeline = MIWP.getPM();
+
+ FunctionPassManager FPM;
+ FPM.addPass(SROAPass());
+ FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies.
+ FPM.addPass(SimplifyCFGPass()); // Merge & remove basic blocks.
+ FPM.addPass(InstCombinePass()); // Combine silly sequences.
+ invokePeepholeEPCallbacks(FPM, Level);
+
+ CGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
+ std::move(FPM), PTO.EagerlyInvalidateAnalyses));
+
+ MPM.addPass(std::move(MIWP));
+
+ // Delete anything that is now dead to make sure that we don't instrument
+ // dead code. Instrumentation can end up keeping dead code around and
+ // dramatically increase code size.
+ MPM.addPass(GlobalDCEPass());
+ }
+
+ if (!RunProfileGen) {
+ assert(!ProfileFile.empty() && "Profile use expecting a profile file!");
+ MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS));
+ // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
+ // RequireAnalysisPass for PSI before subsequent non-module passes.
+ MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
+ return;
+ }
+
+ // Perform PGO instrumentation.
+ MPM.addPass(PGOInstrumentationGen(IsCS));
+
+ FunctionPassManager FPM;
+ // Disable header duplication in loop rotation at -Oz.
+ FPM.addPass(createFunctionToLoopPassAdaptor(
+ LoopRotatePass(Level != OptimizationLevel::Oz), /*UseMemorySSA=*/false,
+ /*UseBlockFrequencyInfo=*/false));
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM),
+ PTO.EagerlyInvalidateAnalyses));
+
+ // Add the profile lowering pass.
+ InstrProfOptions Options;
+ if (!ProfileFile.empty())
+ Options.InstrProfileOutput = ProfileFile;
+ // Do counter promotion at Level greater than O0.
+ Options.DoCounterPromotion = true;
+ Options.UseBFIInPromotion = IsCS;
+ MPM.addPass(InstrProfiling(Options, IsCS));
+}
+
+void PassBuilder::addPGOInstrPassesForO0(ModulePassManager &MPM,
+ bool RunProfileGen, bool IsCS,
+ std::string ProfileFile,
+ std::string ProfileRemappingFile) {
+ if (!RunProfileGen) {
+ assert(!ProfileFile.empty() && "Profile use expecting a profile file!");
+ MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS));
+ // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
+ // RequireAnalysisPass for PSI before subsequent non-module passes.
+ MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
+ return;
+ }
+
+ // Perform PGO instrumentation.
+ MPM.addPass(PGOInstrumentationGen(IsCS));
+ // Add the profile lowering pass.
+ InstrProfOptions Options;
+ if (!ProfileFile.empty())
+ Options.InstrProfileOutput = ProfileFile;
+ // Do not do counter promotion at O0.
+ Options.DoCounterPromotion = false;
+ Options.UseBFIInPromotion = IsCS;
+ MPM.addPass(InstrProfiling(Options, IsCS));
+}
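+
+// RunProfileGen / IsCS / ProfileFile are derived from the PGOOptions handed to
+// the PassBuilder; a typical (assumed) front-end mapping is
+// -fprofile-generate  -> PGOOptions::IRInstr (RunProfileGen == true) and
+// -fprofile-use=FILE  -> PGOOptions::IRUse   (RunProfileGen == false), which
+// is how buildO0DefaultPipeline ends up calling this helper.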
+
+static InlineParams getInlineParamsFromOptLevel(OptimizationLevel Level) {
+ return getInlineParams(Level.getSpeedupLevel(), Level.getSizeLevel());
+}
+
+ModuleInlinerWrapperPass
+PassBuilder::buildInlinerPipeline(OptimizationLevel Level,
+ ThinOrFullLTOPhase Phase) {
+ InlineParams IP = getInlineParamsFromOptLevel(Level);
+ if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt &&
+ PGOOpt->Action == PGOOptions::SampleUse)
+ IP.HotCallSiteThreshold = 0;
+
+ if (PGOOpt)
+ IP.EnableDeferral = EnablePGOInlineDeferral;
+
+ ModuleInlinerWrapperPass MIWP(IP, PerformMandatoryInliningsFirst,
+ UseInlineAdvisor, MaxDevirtIterations);
+
+ // Require the GlobalsAA analysis for the module so we can query it within
+ // the CGSCC pipeline.
+ MIWP.addModulePass(RequireAnalysisPass<GlobalsAA, Module>());
+ // Invalidate AAManager so it can be recreated and pick up the newly available
+ // GlobalsAA.
+ MIWP.addModulePass(
+ createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>()));
+
+ // Require the ProfileSummaryAnalysis for the module so we can query it within
+ // the inliner pass.
+ MIWP.addModulePass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
+
+ // Now begin the main postorder CGSCC pipeline.
+ // FIXME: The current CGSCC pipeline has its origins in the legacy pass
+ // manager and trying to emulate its precise behavior. Much of this doesn't
+ // make a lot of sense and we should revisit the core CGSCC structure.
+ CGSCCPassManager &MainCGPipeline = MIWP.getPM();
+
+ // Note: historically, the PruneEH pass was run first to deduce nounwind and
+ // generally clean up exception handling overhead. It isn't clear this is
+ // valuable as the inliner doesn't currently care whether it is inlining an
+ // invoke or a call.
+
+ if (AttributorRun & AttributorRunOption::CGSCC)
+ MainCGPipeline.addPass(AttributorCGSCCPass());
+
+  // Now deduce any function attributes based on the current code.
+ MainCGPipeline.addPass(PostOrderFunctionAttrsPass());
+
+ // When at O3 add argument promotion to the pass pipeline.
+ // FIXME: It isn't at all clear why this should be limited to O3.
+ if (Level == OptimizationLevel::O3)
+ MainCGPipeline.addPass(ArgumentPromotionPass());
+
+ // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if
+ // there are no OpenMP runtime calls present in the module.
+ if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3)
+ MainCGPipeline.addPass(OpenMPOptCGSCCPass());
+
+ for (auto &C : CGSCCOptimizerLateEPCallbacks)
+ C(MainCGPipeline, Level);
+
+ // Lastly, add the core function simplification pipeline nested inside the
+ // CGSCC walk.
+ MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
+ buildFunctionSimplificationPipeline(Level, Phase),
+ PTO.EagerlyInvalidateAnalyses, EnableNoRerunSimplificationPipeline));
+
+ MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0));
+
+ if (EnableNoRerunSimplificationPipeline)
+ MIWP.addLateModulePass(createModuleToFunctionPassAdaptor(
+ InvalidateAnalysisPass<ShouldNotRunFunctionPassesAnalysis>()));
+
+ return MIWP;
+}
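+
+// The advisor driving the inliner above is selected with -enable-ml-inliner;
+// for example (a sketch, assuming LLVM was built with the release-mode model):
+//
+//   opt -passes='default<Oz>' -enable-ml-inliner=release input.ll -o out.bc
+//
+// Leaving the flag at 'default' keeps the heuristics-based advisor.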
+
+ModuleInlinerPass
+PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level,
+ ThinOrFullLTOPhase Phase) {
+ InlineParams IP = getInlineParamsFromOptLevel(Level);
+ if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt &&
+ PGOOpt->Action == PGOOptions::SampleUse)
+ IP.HotCallSiteThreshold = 0;
+
+ if (PGOOpt)
+ IP.EnableDeferral = EnablePGOInlineDeferral;
+
+  // The inline deferral logic is used to avoid losing inlining chances in the
+  // future. It is helpful in the SCC inliner, where inlining is processed in
+  // bottom-up order. In the module inliner the inlining order is
+  // priority-based by default, so inline deferral is unnecessary there and we
+  // disable it.
+ IP.EnableDeferral = false;
+
+ return ModuleInlinerPass(IP, UseInlineAdvisor);
+}
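+
+// The module inliner is opt-in: with the flag defined above, e.g.
+//
+//   opt -passes='default<O2>' -enable-module-inliner ...
+//
+// it replaces the CGSCC-based ModuleInlinerWrapperPass that
+// buildModuleSimplificationPipeline below would otherwise schedule.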
+
+ModulePassManager
+PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
+ ThinOrFullLTOPhase Phase) {
+ ModulePassManager MPM;
+
+ // Place pseudo probe instrumentation as the first pass of the pipeline to
+ // minimize the impact of optimization changes.
+ if (PGOOpt && PGOOpt->PseudoProbeForProfiling &&
+ Phase != ThinOrFullLTOPhase::ThinLTOPostLink)
+ MPM.addPass(SampleProfileProbePass(TM));
+
+ bool HasSampleProfile = PGOOpt && (PGOOpt->Action == PGOOptions::SampleUse);
+
+  // In ThinLTO mode, when a flattened profile is used, all the available
+  // profile information will be annotated in the PreLink phase, so there is
+  // no need to load the profile again in the PostLink phase.
+ bool LoadSampleProfile =
+ HasSampleProfile &&
+ !(FlattenedProfileUsed && Phase == ThinOrFullLTOPhase::ThinLTOPostLink);
+
+ // During the ThinLTO backend phase we perform early indirect call promotion
+ // here, before globalopt. Otherwise imported available_externally functions
+ // look unreferenced and are removed. If we are going to load the sample
+ // profile then defer until later.
+ // TODO: See if we can move later and consolidate with the location where
+ // we perform ICP when we are loading a sample profile.
+ // TODO: We pass HasSampleProfile (whether there was a sample profile file
+ // passed to the compile) to the SamplePGO flag of ICP. This is used to
+ // determine whether the new direct calls are annotated with prof metadata.
+ // Ideally this should be determined from whether the IR is annotated with
+  // sample profile, and not whether a sample profile was provided on the
+ // command line. E.g. for flattened profiles where we will not be reloading
+ // the sample profile in the ThinLTO backend, we ideally shouldn't have to
+ // provide the sample profile file.
+ if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink && !LoadSampleProfile)
+ MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */, HasSampleProfile));
+
+ // Do basic inference of function attributes from known properties of system
+ // libraries and other oracles.
+ MPM.addPass(InferFunctionAttrsPass());
+
+ // Create an early function pass manager to cleanup the output of the
+ // frontend.
+ FunctionPassManager EarlyFPM;
+ // Lower llvm.expect to metadata before attempting transforms.
+ // Compare/branch metadata may alter the behavior of passes like SimplifyCFG.
+ EarlyFPM.addPass(LowerExpectIntrinsicPass());
+ EarlyFPM.addPass(SimplifyCFGPass());
+ EarlyFPM.addPass(SROAPass());
+ EarlyFPM.addPass(EarlyCSEPass());
+ EarlyFPM.addPass(CoroEarlyPass());
+ if (Level == OptimizationLevel::O3)
+ EarlyFPM.addPass(CallSiteSplittingPass());
+
+  // In the SamplePGO ThinLTO backend, we need instcombine before profile
+  // annotation to convert bitcasts to direct calls so that they can be inlined
+  // during the profile annotation preparation step.
+ // More details about SamplePGO design can be found in:
+ // https://research.google.com/pubs/pub45290.html
+ // FIXME: revisit how SampleProfileLoad/Inliner/ICP is structured.
+ if (LoadSampleProfile)
+ EarlyFPM.addPass(InstCombinePass());
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM),
+ PTO.EagerlyInvalidateAnalyses));
+
+ if (LoadSampleProfile) {
+ // Annotate sample profile right after early FPM to ensure freshness of
+ // the debug info.
+ MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile,
+ PGOOpt->ProfileRemappingFile, Phase));
+ // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
+ // RequireAnalysisPass for PSI before subsequent non-module passes.
+ MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
+ // Do not invoke ICP in the LTOPrelink phase as it makes it hard
+ // for the profile annotation to be accurate in the LTO backend.
+ if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink &&
+ Phase != ThinOrFullLTOPhase::FullLTOPreLink)
+ // We perform early indirect call promotion here, before globalopt.
+ // This is important for the ThinLTO backend phase because otherwise
+ // imported available_externally functions look unreferenced and are
+ // removed.
+ MPM.addPass(
+ PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */));
+ }
+
+ // Try to perform OpenMP specific optimizations on the module. This is a
+ // (quick!) no-op if there are no OpenMP runtime calls present in the module.
+ if (Level != OptimizationLevel::O0)
+ MPM.addPass(OpenMPOptPass());
+
+ if (AttributorRun & AttributorRunOption::MODULE)
+ MPM.addPass(AttributorPass());
+
+ // Lower type metadata and the type.test intrinsic in the ThinLTO
+ // post link pipeline after ICP. This is to enable usage of the type
+ // tests in ICP sequences.
+ if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink)
+ MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
+
+ for (auto &C : PipelineEarlySimplificationEPCallbacks)
+ C(MPM, Level);
+
+ // Specialize functions with IPSCCP.
+ if (EnableFunctionSpecialization && Level == OptimizationLevel::O3)
+ MPM.addPass(FunctionSpecializationPass());
+
+ // Interprocedural constant propagation now that basic cleanup has occurred
+ // and prior to optimizing globals.
+ // FIXME: This position in the pipeline hasn't been carefully considered in
+ // years, it should be re-analyzed.
+ MPM.addPass(IPSCCPPass());
+
+ // Attach metadata to indirect call sites indicating the set of functions
+ // they may target at run-time. This should follow IPSCCP.
+ MPM.addPass(CalledValuePropagationPass());
+
+ // Optimize globals to try and fold them into constants.
+ MPM.addPass(GlobalOptPass());
+
+ // Promote any localized globals to SSA registers.
+  // FIXME: Should this instead be a run of SROA?
+ // FIXME: We should probably run instcombine and simplifycfg afterward to
+ // delete control flows that are dead once globals have been folded to
+ // constants.
+ MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass()));
+
+ // Remove any dead arguments exposed by cleanups and constant folding
+ // globals.
+ MPM.addPass(DeadArgumentEliminationPass());
+
+ // Create a small function pass pipeline to cleanup after all the global
+ // optimizations.
+ FunctionPassManager GlobalCleanupPM;
+ GlobalCleanupPM.addPass(InstCombinePass());
+ invokePeepholeEPCallbacks(GlobalCleanupPM, Level);
+
+ GlobalCleanupPM.addPass(SimplifyCFGPass());
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM),
+ PTO.EagerlyInvalidateAnalyses));
+
+ // Add all the requested passes for instrumentation PGO, if requested.
+ if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink &&
+ (PGOOpt->Action == PGOOptions::IRInstr ||
+ PGOOpt->Action == PGOOptions::IRUse)) {
+ addPGOInstrPasses(MPM, Level,
+ /* RunProfileGen */ PGOOpt->Action == PGOOptions::IRInstr,
+ /* IsCS */ false, PGOOpt->ProfileFile,
+ PGOOpt->ProfileRemappingFile);
+ MPM.addPass(PGOIndirectCallPromotion(false, false));
+ }
+ if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink &&
+ PGOOpt->CSAction == PGOOptions::CSIRInstr)
+ MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile));
+
+ // Synthesize function entry counts for non-PGO compilation.
+ if (EnableSyntheticCounts && !PGOOpt)
+ MPM.addPass(SyntheticCountsPropagation());
+
+ if (EnableModuleInliner)
+ MPM.addPass(buildModuleInlinerPipeline(Level, Phase));
+ else
+ MPM.addPass(buildInlinerPipeline(Level, Phase));
+
+ if (EnableMemProfiler && Phase != ThinOrFullLTOPhase::ThinLTOPreLink) {
+ MPM.addPass(createModuleToFunctionPassAdaptor(MemProfilerPass()));
+ MPM.addPass(ModuleMemProfilerPass());
+ }
+
+ return MPM;
+}
+
+/// TODO: Should LTO cause any differences to this set of passes?
+void PassBuilder::addVectorPasses(OptimizationLevel Level,
+ FunctionPassManager &FPM, bool IsFullLTO) {
+ FPM.addPass(LoopVectorizePass(
+ LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
+
+ if (IsFullLTO) {
+ // The vectorizer may have significantly shortened a loop body; unroll
+ // again. Unroll small loops to hide loop backedge latency and saturate any
+ // parallel execution resources of an out-of-order processor. We also then
+ // need to clean up redundancies and loop invariant code.
+ // FIXME: It would be really good to use a loop-integrated instruction
+ // combiner for cleanup here so that the unrolling and LICM can be pipelined
+ // across the loop nests.
+ // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
+ if (EnableUnrollAndJam && PTO.LoopUnrolling)
+ FPM.addPass(createFunctionToLoopPassAdaptor(
+ LoopUnrollAndJamPass(Level.getSpeedupLevel())));
+ FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
+ Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
+ PTO.ForgetAllSCEVInLoopUnroll)));
+ FPM.addPass(WarnMissedTransformationsPass());
+ }
+
+ if (!IsFullLTO) {
+ // Eliminate loads by forwarding stores from the previous iteration to loads
+ // of the current iteration.
+ FPM.addPass(LoopLoadEliminationPass());
+ }
+ // Cleanup after the loop optimization passes.
+ FPM.addPass(InstCombinePass());
+
+ if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
+ // At higher optimization levels, try to clean up any runtime overlap and
+ // alignment checks inserted by the vectorizer. We want to track correlated
+ // runtime checks for two inner loops in the same outer loop, fold any
+ // common computations, hoist loop-invariant aspects out of any outer loop,
+ // and unswitch the runtime checks if possible. Once hoisted, we may have
+ // dead (or speculatable) control flows or more combining opportunities.
+ FPM.addPass(EarlyCSEPass());
+ FPM.addPass(CorrelatedValuePropagationPass());
+ FPM.addPass(InstCombinePass());
+ LoopPassManager LPM;
+ LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
+ LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
+ OptimizationLevel::O3));
+ FPM.addPass(
+ RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
+ FPM.addPass(
+ createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true,
+ /*UseBlockFrequencyInfo=*/true));
+ FPM.addPass(SimplifyCFGPass());
+ FPM.addPass(InstCombinePass());
+ }
+
+  // Now that we've formed fast-to-execute loop structures, we do further
+  // optimizations. These are run afterward as they might block doing complex
+  // analyses and transforms such as those needed for loop vectorization.
+
+ // Cleanup after loop vectorization, etc. Simplification passes like CVP and
+ // GVN, loop transforms, and others have already run, so it's now better to
+ // convert to more optimized IR using more aggressive simplify CFG options.
+ // The extra sinking transform can create larger basic blocks, so do this
+ // before SLP vectorization.
+ FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
+ .forwardSwitchCondToPhi(true)
+ .convertSwitchToLookupTable(true)
+ .needCanonicalLoops(false)
+ .hoistCommonInsts(true)
+ .sinkCommonInsts(true)));
+
+ if (IsFullLTO) {
+ FPM.addPass(SCCPPass());
+ FPM.addPass(InstCombinePass());
+ FPM.addPass(BDCEPass());
+ }
+
+ // Optimize parallel scalar instruction chains into SIMD instructions.
+ if (PTO.SLPVectorization) {
+ FPM.addPass(SLPVectorizerPass());
+ if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
+ FPM.addPass(EarlyCSEPass());
+ }
+ }
+ // Enhance/cleanup vector code.
+ FPM.addPass(VectorCombinePass());
+
+ if (!IsFullLTO) {
+ FPM.addPass(InstCombinePass());
+ // Unroll small loops to hide loop backedge latency and saturate any
+ // parallel execution resources of an out-of-order processor. We also then
+ // need to clean up redundancies and loop invariant code.
+ // FIXME: It would be really good to use a loop-integrated instruction
+ // combiner for cleanup here so that the unrolling and LICM can be pipelined
+ // across the loop nests.
+ // We do UnrollAndJam in a separate LPM to ensure it happens before unrolling.
+ if (EnableUnrollAndJam && PTO.LoopUnrolling) {
+ FPM.addPass(createFunctionToLoopPassAdaptor(
+ LoopUnrollAndJamPass(Level.getSpeedupLevel())));
+ }
+ FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
+ Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
+ PTO.ForgetAllSCEVInLoopUnroll)));
+ FPM.addPass(WarnMissedTransformationsPass());
+ FPM.addPass(InstCombinePass());
+ FPM.addPass(
+ RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
+ FPM.addPass(createFunctionToLoopPassAdaptor(
+ LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
+ /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true));
+ }
+
+ // Now that we've vectorized and unrolled loops, we may have more refined
+ // alignment information, try to re-derive it here.
+ FPM.addPass(AlignmentFromAssumptionsPass());
+
+ if (IsFullLTO)
+ FPM.addPass(InstCombinePass());
+}
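For context, a rough sketch of how the non-LTO flavor of this stage could be reproduced by an out-of-tree client from textual pass names (illustrative only; the concrete SimplifyCFG/unroll options used above are dropped, and the helper name is made up):

#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/Error.h"
using namespace llvm;

// Builds an approximation of addVectorPasses(Level, FPM, /*IsFullLTO=*/false)
// from registered pass names; loop passes such as "licm" are implicitly
// wrapped in a function-to-loop adaptor by the pipeline parser.
static FunctionPassManager makeApproxVectorStage(PassBuilder &PB) {
  FunctionPassManager FPM;
  cantFail(PB.parsePassPipeline(
      FPM, "loop-vectorize,loop-load-elim,instcombine,simplifycfg,"
           "slp-vectorizer,vector-combine,instcombine,loop-unroll,"
           "transform-warning,instcombine,licm,alignment-from-assumptions"));
  return FPM;
}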
+
+ModulePassManager
+PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
+ bool LTOPreLink) {
+ ModulePassManager MPM;
+
+ // Optimize globals now that the module is fully simplified.
+ MPM.addPass(GlobalOptPass());
+ MPM.addPass(GlobalDCEPass());
+
+ // Run partial inlining pass to partially inline functions that have
+ // large bodies.
+ if (RunPartialInlining)
+ MPM.addPass(PartialInlinerPass());
+
+ // Remove available externally function and global definitions since we
+ // aren't compiling an object file for later LTO. For LTO we want to preserve
+ // these so they are eligible for inlining at link time. Note that if they are
+ // unreferenced they will be removed by GlobalDCE later, so this only impacts
+ // referenced available externally globals. Eventually they will be suppressed
+ // during codegen, but eliminating them here enables more opportunity for
+ // GlobalDCE, as it may make globals referenced by available external
+ // functions dead, and it saves running the remaining passes on the eliminated
+ // functions. These should be preserved during prelinking for link-time
+ // inlining decisions.
+ if (!LTOPreLink)
+ MPM.addPass(EliminateAvailableExternallyPass());
+
+ if (EnableOrderFileInstrumentation)
+ MPM.addPass(InstrOrderFilePass());
+
+ // Do RPO function attribute inference across the module to forward-propagate
+ // attributes where applicable.
+ // FIXME: Is this really an optimization rather than a canonicalization?
+ MPM.addPass(ReversePostOrderFunctionAttrsPass());
+
+ // Do a post-inline PGO instrumentation and use pass. This is a context
+ // sensitive PGO pass. We don't want to do this in the LTOPreLink phase as
+ // cross-module inline has not been done yet. The context sensitive
+ // instrumentation is after all the inlines are done.
+ if (!LTOPreLink && PGOOpt) {
+ if (PGOOpt->CSAction == PGOOptions::CSIRInstr)
+ addPGOInstrPasses(MPM, Level, /* RunProfileGen */ true,
+ /* IsCS */ true, PGOOpt->CSProfileGenFile,
+ PGOOpt->ProfileRemappingFile);
+ else if (PGOOpt->CSAction == PGOOptions::CSIRUse)
+ addPGOInstrPasses(MPM, Level, /* RunProfileGen */ false,
+ /* IsCS */ true, PGOOpt->ProfileFile,
+ PGOOpt->ProfileRemappingFile);
+ }
+
+ // Re-require GlobalsAA here prior to the function passes. This is particularly
+ // useful as the above will have inlined, DCE'ed, and function-attr
+ // propagated everything. We should at this point have a reasonably minimal
+ // and richly annotated call graph. By computing aliasing and mod/ref
+ // information for all local globals here, the late loop passes and notably
+ // the vectorizer will be able to use them to help recognize vectorizable
+ // memory operations.
+ MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>());
+
+ FunctionPassManager OptimizePM;
+ OptimizePM.addPass(Float2IntPass());
+ OptimizePM.addPass(LowerConstantIntrinsicsPass());
+
+ if (EnableMatrix) {
+ OptimizePM.addPass(LowerMatrixIntrinsicsPass());
+ OptimizePM.addPass(EarlyCSEPass());
+ }
+
+ // FIXME: We need to run some loop optimizations to re-rotate loops after
+ // simplifycfg and others undo their rotation.
+
+ // Optimize the loop execution. These passes operate on entire loop nests
+ // rather than on each loop in an inside-out manner, and so they are actually
+ // function passes.
+
+ for (auto &C : VectorizerStartEPCallbacks)
+ C(OptimizePM, Level);
+
+ LoopPassManager LPM;
+ // First rotate loops that may have been un-rotated by prior passes.
+ // Disable header duplication at -Oz.
+ LPM.addPass(LoopRotatePass(Level != OptimizationLevel::Oz, LTOPreLink));
+ // Some loops may have become dead by now. Try to delete them.
+ // FIXME: see discussion in https://reviews.llvm.org/D112851;
+ // this may need to be revisited once GVN is more powerful.
+ LPM.addPass(LoopDeletionPass());
+ OptimizePM.addPass(createFunctionToLoopPassAdaptor(
+ std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false));
+
+ // Distribute loops to allow partial vectorization, i.e. isolate dependences
+ // that would otherwise inhibit vectorization into a separate loop. This is
+ // currently only performed for loops marked with the metadata
+ // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
+ OptimizePM.addPass(LoopDistributePass());
+
+ // Populates the VFABI attribute with the scalar-to-vector mappings
+ // from the TargetLibraryInfo.
+ OptimizePM.addPass(InjectTLIMappings());
+
+ addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false);
+
+ // Split out cold code. Splitting is done late to avoid hiding context from
+ // other optimizations and inadvertently regressing performance. The tradeoff
+ // is that this has a higher code size cost than splitting early.
+ if (EnableHotColdSplit && !LTOPreLink)
+ MPM.addPass(HotColdSplittingPass());
+
+ // Search the code for structurally similar regions of code. If enough
+ // similar regions can be found such that extracting them into their own
+ // functions will decrease the size of the program, we extract the regions
+ // and deduplicate the structurally similar regions.
+ if (EnableIROutliner)
+ MPM.addPass(IROutlinerPass());
+
+ // Merge functions if requested.
+ if (PTO.MergeFunctions)
+ MPM.addPass(MergeFunctionsPass());
+
+ // The LoopSink pass sinks instructions hoisted by LICM, which serves as a
+ // canonicalization pass that enables other optimizations. As a result,
+ // LoopSink needs to be a very late IR pass to avoid undoing LICM results
+ // too early.
+ OptimizePM.addPass(LoopSinkPass());
+
+ // And finally clean up LCSSA form before generating code.
+ OptimizePM.addPass(InstSimplifyPass());
+
+ // This hoists/decomposes div/rem ops. It should run after other sink/hoist
+ // passes to avoid re-sinking, but before SimplifyCFG because it can allow
+ // flattening of blocks.
+ OptimizePM.addPass(DivRemPairsPass());
+
+ // LoopSink (and other loop passes since the last simplifyCFG) might have
+ // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
+ OptimizePM.addPass(SimplifyCFGPass());
+
+ OptimizePM.addPass(CoroCleanupPass());
+
+ // Add the core optimizing pipeline.
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM),
+ PTO.EagerlyInvalidateAnalyses));
+
+ for (auto &C : OptimizerLastEPCallbacks)
+ C(MPM, Level);
+
+ if (PTO.CallGraphProfile)
+ MPM.addPass(CGProfilePass());
+
+ // Now we need to do some global optimization transforms.
+ // FIXME: It would seem like these should come first in the optimization
+ // pipeline and maybe be the bottom of the canonicalization pipeline? Weird
+ // ordering here.
+ MPM.addPass(GlobalDCEPass());
+ MPM.addPass(ConstantMergePass());
+
+ // TODO: The relative lookup table converter pass caused an issue when full
+ // LTO is enabled. See https://reviews.llvm.org/D94355 for more details.
+ // Until the issue is fixed, disable this pass during the pre-linking phase.
+ if (!LTOPreLink)
+ MPM.addPass(RelLookupTableConverterPass());
+
+ return MPM;
+}
+
+ModulePassManager
+PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
+ bool LTOPreLink) {
+ assert(Level != OptimizationLevel::O0 &&
+ "Must request optimizations for the default pipeline!");
+
+ ModulePassManager MPM;
+
+ // Convert @llvm.global.annotations to !annotation metadata.
+ MPM.addPass(Annotation2MetadataPass());
+
+ // Force any function attributes we want the rest of the pipeline to observe.
+ MPM.addPass(ForceFunctionAttrsPass());
+
+ // Apply module pipeline start EP callback.
+ for (auto &C : PipelineStartEPCallbacks)
+ C(MPM, Level);
+
+ if (PGOOpt && PGOOpt->DebugInfoForProfiling)
+ MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));
+
+ // Add the core simplification pipeline.
+ MPM.addPass(buildModuleSimplificationPipeline(
+ Level, LTOPreLink ? ThinOrFullLTOPhase::FullLTOPreLink
+ : ThinOrFullLTOPhase::None));
+
+ // Now add the optimization pipeline.
+ MPM.addPass(buildModuleOptimizationPipeline(Level, LTOPreLink));
+
+ if (PGOOpt && PGOOpt->PseudoProbeForProfiling &&
+ PGOOpt->Action == PGOOptions::SampleUse)
+ MPM.addPass(PseudoProbeUpdatePass());
+
+ // Emit annotation remarks.
+ addAnnotationRemarksPass(MPM);
+
+ if (LTOPreLink)
+ addRequiredLTOPreLinkPasses(MPM);
+
+ return MPM;
+}
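buildPerModuleDefaultPipeline is the entry point most frontends call. The following is a minimal, illustrative driver using the standard new-pass-manager setup; the function name and the O2 choice are assumptions, not taken from this change:

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
using namespace llvm;

void optimizeModule(Module &M) {
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB;
  // Register all analyses and wire the four managers together.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  // Equivalent to `opt -passes='default<O2>'`.
  ModulePassManager MPM =
      PB.buildPerModuleDefaultPipeline(OptimizationLevel::O2);
  MPM.run(M, MAM);
}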
+
+ModulePassManager
+PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
+ assert(Level != OptimizationLevel::O0 &&
+ "Must request optimizations for the default pipeline!");
+
+ ModulePassManager MPM;
+
+ // Convert @llvm.global.annotations to !annotation metadata.
+ MPM.addPass(Annotation2MetadataPass());
+
+ // Force any function attributes we want the rest of the pipeline to observe.
+ MPM.addPass(ForceFunctionAttrsPass());
+
+ if (PGOOpt && PGOOpt->DebugInfoForProfiling)
+ MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));
+
+ // Apply module pipeline start EP callback.
+ for (auto &C : PipelineStartEPCallbacks)
+ C(MPM, Level);
+
+ // If we are planning to perform ThinLTO later, we don't bloat the code with
+ // unrolling/vectorization/... now. Just simplify the module as much as we
+ // can.
+ MPM.addPass(buildModuleSimplificationPipeline(
+ Level, ThinOrFullLTOPhase::ThinLTOPreLink));
+
+ // Run partial inlining pass to partially inline functions that have
+ // large bodies.
+ // FIXME: It isn't clear whether this is really the right place to run this
+ // in ThinLTO. Because there is another canonicalization and simplification
+ // phase that will run after the thin link, running this here ends up with
+ // less information than will be available later and it may grow functions in
+ // ways that aren't beneficial.
+ if (RunPartialInlining)
+ MPM.addPass(PartialInlinerPass());
+
+ // Reduce the size of the IR as much as possible.
+ MPM.addPass(GlobalOptPass());
+
+ // Module simplification splits coroutines, but does not fully clean up
+ // coroutine intrinsics. To ensure ThinLTO optimization passes don't trip up
+ // on these, we schedule the cleanup here.
+ MPM.addPass(createModuleToFunctionPassAdaptor(CoroCleanupPass()));
+
+ if (PGOOpt && PGOOpt->PseudoProbeForProfiling &&
+ PGOOpt->Action == PGOOptions::SampleUse)
+ MPM.addPass(PseudoProbeUpdatePass());
+
+ // Handle OptimizerLastEPCallbacks added by clang on PreLink. The actual
+ // optimization is going to be done in the PostLink stage, but clang can't
+ // add callbacks there in the case of in-process ThinLTO called by the linker.
+ for (auto &C : OptimizerLastEPCallbacks)
+ C(MPM, Level);
+
+ // Emit annotation remarks.
+ addAnnotationRemarksPass(MPM);
+
+ addRequiredLTOPreLinkPasses(MPM);
+
+ return MPM;
+}
+
+ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
+ OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) {
+ ModulePassManager MPM;
+
+ // Convert @llvm.global.annotations to !annotation metadata.
+ MPM.addPass(Annotation2MetadataPass());
+
+ if (ImportSummary) {
+ // These passes import type identifier resolutions for whole-program
+ // devirtualization and CFI. They must run early because other passes may
+ // disturb the specific instruction patterns that these passes look for,
+ // creating dependencies on resolutions that may not appear in the summary.
+ //
+ // For example, GVN may transform the pattern assume(type.test) appearing in
+ // two basic blocks into assume(phi(type.test, type.test)), which would
+ // transform a dependency on a WPD resolution into a dependency on a type
+ // identifier resolution for CFI.
+ //
+ // Also, WPD has access to more precise information than ICP and can
+ // devirtualize more effectively, so it should operate on the IR first.
+ //
+ // The WPD and LowerTypeTest passes need to run at -O0 to lower type
+ // metadata and intrinsics.
+ MPM.addPass(WholeProgramDevirtPass(nullptr, ImportSummary));
+ MPM.addPass(LowerTypeTestsPass(nullptr, ImportSummary));
+ }
+
+ if (Level == OptimizationLevel::O0) {
+ // Run a second time to clean up any type tests left behind by WPD for use
+ // in ICP.
+ MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
+ // Drop available_externally and unreferenced globals. This is necessary
+ // with ThinLTO in order to avoid leaving undefined references to dead
+ // globals in the object file.
+ MPM.addPass(EliminateAvailableExternallyPass());
+ MPM.addPass(GlobalDCEPass());
+ return MPM;
+ }
+
+ // Force any function attributes we want the rest of the pipeline to observe.
+ MPM.addPass(ForceFunctionAttrsPass());
+
+ // Add the core simplification pipeline.
+ MPM.addPass(buildModuleSimplificationPipeline(
+ Level, ThinOrFullLTOPhase::ThinLTOPostLink));
+
+ // Now add the optimization pipeline.
+ MPM.addPass(buildModuleOptimizationPipeline(Level));
+
+ // Emit annotation remarks.
+ addAnnotationRemarksPass(MPM);
+
+ return MPM;
+}
+
+ModulePassManager
+PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
+ assert(Level != OptimizationLevel::O0 &&
+ "Must request optimizations for the default pipeline!");
+ // FIXME: We should use a customized pre-link pipeline!
+ return buildPerModuleDefaultPipeline(Level,
+ /* LTOPreLink */ true);
+}
+
+ModulePassManager
+PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
+ ModuleSummaryIndex *ExportSummary) {
+ ModulePassManager MPM;
+
+ // Convert @llvm.global.annotations to !annotation metadata.
+ MPM.addPass(Annotation2MetadataPass());
+
+ // Create a function that performs CFI checks for cross-DSO calls with targets
+ // in the current module.
+ MPM.addPass(CrossDSOCFIPass());
+
+ if (Level == OptimizationLevel::O0) {
+ // The WPD and LowerTypeTest passes need to run at -O0 to lower type
+ // metadata and intrinsics.
+ MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));
+ MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
+ // Run a second time to clean up any type tests left behind by WPD for use
+ // in ICP.
+ MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
+
+ // Emit annotation remarks.
+ addAnnotationRemarksPass(MPM);
+
+ return MPM;
+ }
+
+ if (PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) {
+ // Load sample profile before running the LTO optimization pipeline.
+ MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile,
+ PGOOpt->ProfileRemappingFile,
+ ThinOrFullLTOPhase::FullLTOPostLink));
+ // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
+ // RequireAnalysisPass for PSI before subsequent non-module passes.
+ MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
+ }
+
+ // Remove unused virtual tables to improve the quality of code generated by
+ // whole-program devirtualization and bitset lowering.
+ MPM.addPass(GlobalDCEPass());
+
+ // Force any function attributes we want the rest of the pipeline to observe.
+ MPM.addPass(ForceFunctionAttrsPass());
+
+ // Do basic inference of function attributes from known properties of system
+ // libraries and other oracles.
+ MPM.addPass(InferFunctionAttrsPass());
+
+ if (Level.getSpeedupLevel() > 1) {
+ FunctionPassManager EarlyFPM;
+ EarlyFPM.addPass(CallSiteSplittingPass());
+ MPM.addPass(createModuleToFunctionPassAdaptor(
+ std::move(EarlyFPM), PTO.EagerlyInvalidateAnalyses));
+
+ // Indirect call promotion. This should promote all the targets that are
+ // left by the earlier promotion pass that promotes intra-module targets.
+ // This two-step promotion saves compile time. For LTO, it should produce
+ // the same result as if we only did promotion here.
+ MPM.addPass(PGOIndirectCallPromotion(
+ true /* InLTO */, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse));
+
+ if (EnableFunctionSpecialization && Level == OptimizationLevel::O3)
+ MPM.addPass(FunctionSpecializationPass());
+ // Propagate constants at call sites into the functions they call. This
+ // opens opportunities for globalopt (and inlining) by substituting function
+ // pointers passed as arguments to direct uses of functions.
+ MPM.addPass(IPSCCPPass());
+
+ // Attach metadata to indirect call sites indicating the set of functions
+ // they may target at run-time. This should follow IPSCCP.
+ MPM.addPass(CalledValuePropagationPass());
+ }
+
+ // Now deduce any function attributes based on the current code.
+ MPM.addPass(
+ createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass()));
+
+ // Do RPO function attribute inference across the module to forward-propagate
+ // attributes where applicable.
+ // FIXME: Is this really an optimization rather than a canonicalization?
+ MPM.addPass(ReversePostOrderFunctionAttrsPass());
+
+ // Use in-range annotations on GEP indices to split globals where beneficial.
+ MPM.addPass(GlobalSplitPass());
+
+ // Run whole-program optimization of virtual calls when the list of callees
+ // is fixed.
+ MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));
+
+ // Stop here at -O1.
+ if (Level == OptimizationLevel::O1) {
+ // The LowerTypeTestsPass needs to run to lower type metadata and the
+ // type.test intrinsics. The pass does nothing if CFI is disabled.
+ MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
+ // Run a second time to clean up any type tests left behind by WPD for use
+ // in ICP (which is performed earlier than this in the regular LTO
+ // pipeline).
+ MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
+
+ // Emit annotation remarks.
+ addAnnotationRemarksPass(MPM);
+
+ return MPM;
+ }
+
+ // Optimize globals to try and fold them into constants.
+ MPM.addPass(GlobalOptPass());
+
+ // Promote any localized globals to SSA registers.
+ MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass()));
+
+ // Linking modules together can lead to duplicate global constants; only
+ // keep one copy of each constant.
+ MPM.addPass(ConstantMergePass());
+
+ // Remove unused arguments from functions.
+ MPM.addPass(DeadArgumentEliminationPass());
+
+ // Reduce the code after globalopt and ipsccp. Both can open up significant
+ // simplification opportunities, and both can propagate functions through
+ // function pointers. When this happens, we often have to resolve varargs
+ // calls, etc, so let instcombine do this.
+ FunctionPassManager PeepholeFPM;
+ if (Level == OptimizationLevel::O3)
+ PeepholeFPM.addPass(AggressiveInstCombinePass());
+ PeepholeFPM.addPass(InstCombinePass());
+ invokePeepholeEPCallbacks(PeepholeFPM, Level);
+
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM),
+ PTO.EagerlyInvalidateAnalyses));
+
+ // Note: historically, the PruneEH pass was run first to deduce nounwind and
+ // generally clean up exception handling overhead. It isn't clear this is
+ // valuable as the inliner doesn't currently care whether it is inlining an
+ // invoke or a call.
+ // Run the inliner now.
+ MPM.addPass(ModuleInlinerWrapperPass(getInlineParamsFromOptLevel(Level)));
+
+ // Optimize globals again after we ran the inliner.
+ MPM.addPass(GlobalOptPass());
+
+ // Garbage collect dead functions.
+ MPM.addPass(GlobalDCEPass());
+
+ // If we didn't decide to inline a function, check to see if we can
+ // transform it to pass arguments by value instead of by reference.
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(ArgumentPromotionPass()));
+
+ FunctionPassManager FPM;
+ // The IPO Passes may leave cruft around. Clean up after them.
+ FPM.addPass(InstCombinePass());
+ invokePeepholeEPCallbacks(FPM, Level);
+
+ FPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true));
+
+ // Do a post-inline PGO instrumentation and use pass. This is a context
+ // sensitive PGO pass.
+ if (PGOOpt) {
+ if (PGOOpt->CSAction == PGOOptions::CSIRInstr)
+ addPGOInstrPasses(MPM, Level, /* RunProfileGen */ true,
+ /* IsCS */ true, PGOOpt->CSProfileGenFile,
+ PGOOpt->ProfileRemappingFile);
+ else if (PGOOpt->CSAction == PGOOptions::CSIRUse)
+ addPGOInstrPasses(MPM, Level, /* RunProfileGen */ false,
+ /* IsCS */ true, PGOOpt->ProfileFile,
+ PGOOpt->ProfileRemappingFile);
+ }
+
+ // Break up allocas
+ FPM.addPass(SROAPass());
+
+ // LTO provides additional opportunities for tail call elimination due to
+ // link-time inlining and visibility of the nocapture attribute.
+ FPM.addPass(TailCallElimPass());
+
+ // Run a few AA-driven optimizations here and now to clean up the code.
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM),
+ PTO.EagerlyInvalidateAnalyses));
+
+ MPM.addPass(
+ createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass()));
+
+ // Require the GlobalsAA analysis for the module so we can query it within
+ // MainFPM.
+ MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>());
+ // Invalidate AAManager so it can be recreated and pick up the newly available
+ // GlobalsAA.
+ MPM.addPass(
+ createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>()));
+
+ FunctionPassManager MainFPM;
+ MainFPM.addPass(createFunctionToLoopPassAdaptor(
+ LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
+ /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true));
+
+ if (RunNewGVN)
+ MainFPM.addPass(NewGVNPass());
+ else
+ MainFPM.addPass(GVNPass());
+
+ // Remove dead memcpy calls.
+ MainFPM.addPass(MemCpyOptPass());
+
+ // Nuke dead stores.
+ MainFPM.addPass(DSEPass());
+ MainFPM.addPass(MergedLoadStoreMotionPass());
+
+ // More loops are countable; try to optimize them.
+ if (EnableLoopFlatten && Level.getSpeedupLevel() > 1)
+ MainFPM.addPass(createFunctionToLoopPassAdaptor(LoopFlattenPass()));
+
+ if (EnableConstraintElimination)
+ MainFPM.addPass(ConstraintEliminationPass());
+
+ LoopPassManager LPM;
+ LPM.addPass(IndVarSimplifyPass());
+ LPM.addPass(LoopDeletionPass());
+ // FIXME: Add loop interchange.
+
+ // Unroll small loops and perform peeling.
+ LPM.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
+ /* OnlyWhenForced= */ !PTO.LoopUnrolling,
+ PTO.ForgetAllSCEVInLoopUnroll));
+ // The loop passes in LPM (LoopFullUnrollPass) do not preserve MemorySSA.
+ // *All* loop passes must preserve it for the adaptor to be able to use it.
+ MainFPM.addPass(createFunctionToLoopPassAdaptor(
+ std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true));
+
+ MainFPM.addPass(LoopDistributePass());
+
+ addVectorPasses(Level, MainFPM, /* IsFullLTO */ true);
+
+ invokePeepholeEPCallbacks(MainFPM, Level);
+ MainFPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true));
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM),
+ PTO.EagerlyInvalidateAnalyses));
+
+ // Lower type metadata and the type.test intrinsic. This pass supports
+ // clang's control flow integrity mechanisms (-fsanitize=cfi*) and needs
+ // to be run at link time if CFI is enabled. This pass does nothing if
+ // CFI is disabled.
+ MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
+ // Run a second time to clean up any type tests left behind by WPD for use
+ // in ICP (which is performed earlier than this in the regular LTO pipeline).
+ MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
+
+ // Enable splitting late in the FullLTO post-link pipeline. This is done in
+ // the same stage in the old pass manager (\ref addLateLTOOptimizationPasses).
+ if (EnableHotColdSplit)
+ MPM.addPass(HotColdSplittingPass());
+
+ // Add late LTO optimization passes.
+ // Delete basic blocks, which optimization passes may have killed.
+ MPM.addPass(createModuleToFunctionPassAdaptor(
+ SimplifyCFGPass(SimplifyCFGOptions().hoistCommonInsts(true))));
+
+ // Drop bodies of available externally objects to improve GlobalDCE.
+ MPM.addPass(EliminateAvailableExternallyPass());
+
+ // Now that we have optimized the program, discard unreachable functions.
+ MPM.addPass(GlobalDCEPass());
+
+ if (PTO.MergeFunctions)
+ MPM.addPass(MergeFunctionsPass());
+
+ // Emit annotation remarks.
+ addAnnotationRemarksPass(MPM);
+
+ return MPM;
+}
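A short, illustrative sketch of how a regular (full) LTO backend would drive this pipeline, reusing an analysis-manager setup like the one sketched after buildPerModuleDefaultPipeline; the function and parameter names here are assumptions:

#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Passes/PassBuilder.h"
using namespace llvm;

void runFullLTOPipeline(Module &MergedModule,
                        ModuleSummaryIndex &CombinedIndex, PassBuilder &PB,
                        ModuleAnalysisManager &MAM) {
  // Build and run the post-link full-LTO pipeline over the merged module.
  ModulePassManager MPM =
      PB.buildLTODefaultPipeline(OptimizationLevel::O2, &CombinedIndex);
  MPM.run(MergedModule, MAM);
}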
+
+ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level,
+ bool LTOPreLink) {
+ assert(Level == OptimizationLevel::O0 &&
+ "buildO0DefaultPipeline should only be used with O0");
+
+ ModulePassManager MPM;
+
+ // Perform pseudo probe instrumentation in O0 mode. This is for
+ // consistency between different build modes. For example, an LTO build can be
+ // mixed with an O0 prelink and an O2 postlink. Loading a sample profile in
+ // the postlink will require pseudo probe instrumentation in the prelink.
+ if (PGOOpt && PGOOpt->PseudoProbeForProfiling)
+ MPM.addPass(SampleProfileProbePass(TM));
+
+ if (PGOOpt && (PGOOpt->Action == PGOOptions::IRInstr ||
+ PGOOpt->Action == PGOOptions::IRUse))
+ addPGOInstrPassesForO0(
+ MPM,
+ /* RunProfileGen */ (PGOOpt->Action == PGOOptions::IRInstr),
+ /* IsCS */ false, PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile);
+
+ for (auto &C : PipelineStartEPCallbacks)
+ C(MPM, Level);
+
+ if (PGOOpt && PGOOpt->DebugInfoForProfiling)
+ MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));
+
+ for (auto &C : PipelineEarlySimplificationEPCallbacks)
+ C(MPM, Level);
+
+ // Build a minimal pipeline based on the semantics required by LLVM,
+ // which is just that always inlining occurs. Further, disable generating
+ // lifetime intrinsics to avoid enabling further optimizations during
+ // code generation.
+ MPM.addPass(AlwaysInlinerPass(
+ /*InsertLifetimeIntrinsics=*/false));
+
+ if (PTO.MergeFunctions)
+ MPM.addPass(MergeFunctionsPass());
+
+ if (EnableMatrix)
+ MPM.addPass(
+ createModuleToFunctionPassAdaptor(LowerMatrixIntrinsicsPass(true)));
+
+ if (!CGSCCOptimizerLateEPCallbacks.empty()) {
+ CGSCCPassManager CGPM;
+ for (auto &C : CGSCCOptimizerLateEPCallbacks)
+ C(CGPM, Level);
+ if (!CGPM.isEmpty())
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
+ }
+ if (!LateLoopOptimizationsEPCallbacks.empty()) {
+ LoopPassManager LPM;
+ for (auto &C : LateLoopOptimizationsEPCallbacks)
+ C(LPM, Level);
+ if (!LPM.isEmpty()) {
+ MPM.addPass(createModuleToFunctionPassAdaptor(
+ createFunctionToLoopPassAdaptor(std::move(LPM))));
+ }
+ }
+ if (!LoopOptimizerEndEPCallbacks.empty()) {
+ LoopPassManager LPM;
+ for (auto &C : LoopOptimizerEndEPCallbacks)
+ C(LPM, Level);
+ if (!LPM.isEmpty()) {
+ MPM.addPass(createModuleToFunctionPassAdaptor(
+ createFunctionToLoopPassAdaptor(std::move(LPM))));
+ }
+ }
+ if (!ScalarOptimizerLateEPCallbacks.empty()) {
+ FunctionPassManager FPM;
+ for (auto &C : ScalarOptimizerLateEPCallbacks)
+ C(FPM, Level);
+ if (!FPM.isEmpty())
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+ }
+ if (!VectorizerStartEPCallbacks.empty()) {
+ FunctionPassManager FPM;
+ for (auto &C : VectorizerStartEPCallbacks)
+ C(FPM, Level);
+ if (!FPM.isEmpty())
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+ }
+
+ MPM.addPass(createModuleToFunctionPassAdaptor(CoroEarlyPass()));
+ CGSCCPassManager CGPM;
+ CGPM.addPass(CoroSplitPass());
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
+ MPM.addPass(createModuleToFunctionPassAdaptor(CoroCleanupPass()));
+
+ for (auto &C : OptimizerLastEPCallbacks)
+ C(MPM, Level);
+
+ if (LTOPreLink)
+ addRequiredLTOPreLinkPasses(MPM);
+
+ return MPM;
+}
+
+AAManager PassBuilder::buildDefaultAAPipeline() {
+ AAManager AA;
+
+ // The order in which these are registered determines their priority when
+ // being queried.
+
+ // First we register the basic alias analysis that provides the majority of
+ // per-function local AA logic. This is a stateless, on-demand local set of
+ // AA techniques.
+ AA.registerFunctionAnalysis<BasicAA>();
+
+ // Next we query fast, specialized alias analyses that wrap IR-embedded
+ // information about aliasing.
+ AA.registerFunctionAnalysis<ScopedNoAliasAA>();
+ AA.registerFunctionAnalysis<TypeBasedAA>();
+
+ // Add support for querying global aliasing information when available.
+ // Because the `AAManager` is a function analysis and `GlobalsAA` is a module
+ // analysis, all that the `AAManager` can do is query for any *cached*
+ // results from `GlobalsAA` through a readonly proxy.
+ AA.registerModuleAnalysis<GlobalsAA>();
+
+ // Add target-specific alias analyses.
+ if (TM)
+ TM->registerDefaultAliasAnalyses(AA);
+
+ return AA;
+}
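buildDefaultAAPipeline only constructs the AAManager; it takes effect through the FunctionAnalysisManager. A client that wants a different AA stack registers its own AAManager before PassBuilder::registerFunctionAnalyses(), which otherwise installs the default pipeline. An illustrative sketch follows; the minimal BasicAA+TBAA stack is an assumption, not something this change prescribes:

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
#include "llvm/Passes/PassBuilder.h"
using namespace llvm;

void registerCustomAA(FunctionAnalysisManager &FAM, PassBuilder &PB) {
  // Registering an AAManager first means registerFunctionAnalyses() keeps it
  // instead of installing buildDefaultAAPipeline().
  FAM.registerPass([&] {
    AAManager AA;
    AA.registerFunctionAnalysis<BasicAA>();     // stateless local base layer
    AA.registerFunctionAnalysis<TypeBasedAA>(); // !tbaa metadata
    return AA;
  });
  PB.registerFunctionAnalyses(FAM);
}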
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 7525d59f94a5..c2032b5b8276 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -60,8 +60,6 @@ MODULE_PASS("globaldce", GlobalDCEPass())
MODULE_PASS("globalopt", GlobalOptPass())
MODULE_PASS("globalsplit", GlobalSplitPass())
MODULE_PASS("hotcoldsplit", HotColdSplittingPass())
-MODULE_PASS("hwasan", HWAddressSanitizerPass(false, false))
-MODULE_PASS("khwasan", HWAddressSanitizerPass(true, true))
MODULE_PASS("inferattrs", InferFunctionAttrsPass())
MODULE_PASS("inliner-wrapper", ModuleInlinerWrapperPass())
MODULE_PASS("inliner-wrapper-no-mandatory-first", ModuleInlinerWrapperPass(
@@ -75,7 +73,6 @@ MODULE_PASS("invalidate<all>", InvalidateAllAnalysesPass())
MODULE_PASS("ipsccp", IPSCCPPass())
MODULE_PASS("iroutliner", IROutlinerPass())
MODULE_PASS("print-ir-similarity", IRSimilarityAnalysisPrinterPass(dbgs()))
-MODULE_PASS("loop-extract", LoopExtractorPass())
MODULE_PASS("lowertypetests", LowerTypeTestsPass())
MODULE_PASS("metarenamer", MetaRenamerPass())
MODULE_PASS("mergefunc", MergeFunctionsPass())
@@ -101,7 +98,6 @@ MODULE_PASS("rpo-function-attrs", ReversePostOrderFunctionAttrsPass())
MODULE_PASS("sample-profile", SampleProfileLoaderPass())
MODULE_PASS("scc-oz-module-inliner",
buildInlinerPipeline(OptimizationLevel::Oz, ThinOrFullLTOPhase::None))
-MODULE_PASS("loop-extract-single", LoopExtractorPass(1))
MODULE_PASS("strip", StripSymbolsPass())
MODULE_PASS("strip-dead-debug-info", StripDeadDebugInfoPass())
MODULE_PASS("pseudo-probe", SampleProfileProbePass(TM))
@@ -113,16 +109,43 @@ MODULE_PASS("synthetic-counts-propagation", SyntheticCountsPropagation())
MODULE_PASS("verify", VerifierPass())
MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass())
MODULE_PASS("dfsan", DataFlowSanitizerPass())
-MODULE_PASS("asan-module", ModuleAddressSanitizerPass(/*CompileKernel=*/false, false, true, false))
-MODULE_PASS("msan-module", MemorySanitizerPass({}))
-MODULE_PASS("tsan-module", ThreadSanitizerPass())
-MODULE_PASS("kasan-module", ModuleAddressSanitizerPass(/*CompileKernel=*/true, false, true, false))
+MODULE_PASS("msan-module", ModuleMemorySanitizerPass({}))
+MODULE_PASS("module-inline", ModuleInlinerPass())
+MODULE_PASS("tsan-module", ModuleThreadSanitizerPass())
MODULE_PASS("sancov-module", ModuleSanitizerCoveragePass())
MODULE_PASS("memprof-module", ModuleMemProfilerPass())
MODULE_PASS("poison-checking", PoisonCheckingPass())
MODULE_PASS("pseudo-probe-update", PseudoProbeUpdatePass())
#undef MODULE_PASS
+#ifndef MODULE_PASS_WITH_PARAMS
+#define MODULE_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS)
+#endif
+MODULE_PASS_WITH_PARAMS("loop-extract",
+ "LoopExtractorPass",
+ [](bool Single) {
+ if (Single)
+ return LoopExtractorPass(1);
+ return LoopExtractorPass();
+ },
+ parseLoopExtractorPassOptions,
+ "single")
+MODULE_PASS_WITH_PARAMS("hwasan",
+ "HWAddressSanitizerPass",
+ [](HWAddressSanitizerOptions Opts) {
+ return HWAddressSanitizerPass(Opts);
+ },
+ parseHWASanPassOptions,
+ "kernel;recover")
+MODULE_PASS_WITH_PARAMS("asan-module",
+ "ModuleAddressSanitizerPass",
+ [](AddressSanitizerOptions Opts) {
+ return ModuleAddressSanitizerPass(Opts);
+ },
+ parseASanPassOptions,
+ "kernel")
+#undef MODULE_PASS_WITH_PARAMS
+
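Each *_PASS_WITH_PARAMS entry pairs a factory lambda with a parser that turns the text inside the angle brackets of, e.g., -passes='inline<only-mandatory>' into the lambda's argument. Below is a sketch of that parser shape; it is an illustration only, not the actual parseInlinerPassOptions from PassBuilder.cpp:

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"
#include <tuple>
using namespace llvm;

// Splits the semicolon-separated parameter list and maps each recognized
// option onto the value consumed by the factory lambda above.
static Expected<bool> parseInlinerPassOptionsSketch(StringRef Params) {
  bool OnlyMandatory = false;
  while (!Params.empty()) {
    StringRef Name;
    std::tie(Name, Params) = Params.split(';');
    if (Name == "only-mandatory")
      OnlyMandatory = true;
    else
      return make_error<StringError>(
          formatv("invalid InlinerPass parameter '{0}'", Name).str(),
          inconvertibleErrorCode());
  }
  return OnlyMandatory;
}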
#ifndef CGSCC_ANALYSIS
#define CGSCC_ANALYSIS(NAME, CREATE_PASS)
#endif
@@ -138,12 +161,23 @@ CGSCC_PASS("argpromotion", ArgumentPromotionPass())
CGSCC_PASS("invalidate<all>", InvalidateAllAnalysesPass())
CGSCC_PASS("function-attrs", PostOrderFunctionAttrsPass())
CGSCC_PASS("attributor-cgscc", AttributorCGSCCPass())
-CGSCC_PASS("inline", InlinerPass())
CGSCC_PASS("openmp-opt-cgscc", OpenMPOptCGSCCPass())
CGSCC_PASS("coro-split", CoroSplitPass())
CGSCC_PASS("no-op-cgscc", NoOpCGSCCPass())
#undef CGSCC_PASS
+#ifndef CGSCC_PASS_WITH_PARAMS
+#define CGSCC_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS)
+#endif
+CGSCC_PASS_WITH_PARAMS("inline",
+ "InlinerPass",
+ [](bool OnlyMandatory) {
+ return InlinerPass(OnlyMandatory);
+ },
+ parseInlinerPassOptions,
+ "only-mandatory")
+#undef CGSCC_PASS_WITH_PARAMS
+
#ifndef FUNCTION_ANALYSIS
#define FUNCTION_ANALYSIS(NAME, CREATE_PASS)
#endif
@@ -167,6 +201,7 @@ FUNCTION_ANALYSIS("regions", RegionInfoAnalysis())
FUNCTION_ANALYSIS("no-op-function", NoOpFunctionAnalysis())
FUNCTION_ANALYSIS("opt-remark-emit", OptimizationRemarkEmitterAnalysis())
FUNCTION_ANALYSIS("scalar-evolution", ScalarEvolutionAnalysis())
+FUNCTION_ANALYSIS("should-not-run-function-passes", ShouldNotRunFunctionPassesAnalysis())
FUNCTION_ANALYSIS("stack-safety-local", StackSafetyAnalysis())
FUNCTION_ANALYSIS("targetlibinfo", TargetLibraryAnalysis())
FUNCTION_ANALYSIS("targetir",
@@ -217,12 +252,8 @@ FUNCTION_PASS("div-rem-pairs", DivRemPairsPass())
FUNCTION_PASS("dse", DSEPass())
FUNCTION_PASS("dot-cfg", CFGPrinterPass())
FUNCTION_PASS("dot-cfg-only", CFGOnlyPrinterPass())
-FUNCTION_PASS("early-cse", EarlyCSEPass(/*UseMemorySSA=*/false))
-FUNCTION_PASS("early-cse-memssa", EarlyCSEPass(/*UseMemorySSA=*/true))
-FUNCTION_PASS("ee-instrument", EntryExitInstrumenterPass(/*PostInlining=*/false))
FUNCTION_PASS("fix-irreducible", FixIrreduciblePass())
FUNCTION_PASS("make-guards-explicit", MakeGuardsExplicitPass())
-FUNCTION_PASS("post-inline-ee-instrument", EntryExitInstrumenterPass(/*PostInlining=*/true))
FUNCTION_PASS("gvn-hoist", GVNHoistPass())
FUNCTION_PASS("gvn-sink", GVNSinkPass())
FUNCTION_PASS("helloworld", HelloWorldPass())
@@ -242,8 +273,6 @@ FUNCTION_PASS("loweratomic", LowerAtomicPass())
FUNCTION_PASS("lower-expect", LowerExpectIntrinsicPass())
FUNCTION_PASS("lower-guard-intrinsic", LowerGuardIntrinsicPass())
FUNCTION_PASS("lower-constant-intrinsics", LowerConstantIntrinsicsPass())
-FUNCTION_PASS("lower-matrix-intrinsics", LowerMatrixIntrinsicsPass())
-FUNCTION_PASS("lower-matrix-intrinsics-minimal", LowerMatrixIntrinsicsPass(true))
FUNCTION_PASS("lower-widenable-condition", LowerWidenableConditionPass())
FUNCTION_PASS("guard-widening", GuardWideningPass())
FUNCTION_PASS("load-store-vectorizer", LoadStoreVectorizerPass())
@@ -273,6 +302,7 @@ FUNCTION_PASS("print", PrintFunctionPass(dbgs()))
FUNCTION_PASS("print<assumptions>", AssumptionPrinterPass(dbgs()))
FUNCTION_PASS("print<block-freq>", BlockFrequencyPrinterPass(dbgs()))
FUNCTION_PASS("print<branch-prob>", BranchProbabilityPrinterPass(dbgs()))
+FUNCTION_PASS("print<cost-model>", CostModelPrinterPass(dbgs()))
FUNCTION_PASS("print<da>", DependenceAnalysisPrinterPass(dbgs()))
FUNCTION_PASS("print<divergence>", DivergenceAnalysisPrinterPass(dbgs()))
FUNCTION_PASS("print<domtree>", DominatorTreePrinterPass(dbgs()))
@@ -286,6 +316,7 @@ FUNCTION_PASS("print<inliner-size-estimator>",
InlineSizeEstimatorAnalysisPrinterPass(dbgs()))
FUNCTION_PASS("print<loops>", LoopPrinterPass(dbgs()))
FUNCTION_PASS("print<memoryssa>", MemorySSAPrinterPass(dbgs()))
+FUNCTION_PASS("print<memoryssa-walker>", MemorySSAWalkerPrinterPass(dbgs()))
FUNCTION_PASS("print<phi-values>", PhiValuesPrinterPass(dbgs()))
FUNCTION_PASS("print<regions>", RegionInfoPrinterPass(dbgs()))
FUNCTION_PASS("print<scalar-evolution>", ScalarEvolutionPrinterPass(dbgs()))
@@ -306,7 +337,7 @@ FUNCTION_PASS("sink", SinkingPass())
FUNCTION_PASS("slp-vectorizer", SLPVectorizerPass())
FUNCTION_PASS("slsr", StraightLineStrengthReducePass())
FUNCTION_PASS("speculative-execution", SpeculativeExecutionPass())
-FUNCTION_PASS("sroa", SROA())
+FUNCTION_PASS("sroa", SROAPass())
FUNCTION_PASS("strip-gc-relocates", StripGCRelocates())
FUNCTION_PASS("structurizecfg", StructurizeCFGPass())
FUNCTION_PASS("tailcallelim", TailCallElimPass())
@@ -322,10 +353,6 @@ FUNCTION_PASS("verify<scalar-evolution>", ScalarEvolutionVerifierPass())
FUNCTION_PASS("view-cfg", CFGViewerPass())
FUNCTION_PASS("view-cfg-only", CFGOnlyViewerPass())
FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass())
-FUNCTION_PASS("asan", AddressSanitizerPass(false, false, false))
-FUNCTION_PASS("kasan", AddressSanitizerPass(true, false, false))
-FUNCTION_PASS("msan", MemorySanitizerPass({}))
-FUNCTION_PASS("kmsan", MemorySanitizerPass({0, false, /*Kernel=*/true}))
FUNCTION_PASS("tsan", ThreadSanitizerPass())
FUNCTION_PASS("memprof", MemProfilerPass())
#undef FUNCTION_PASS
@@ -333,6 +360,27 @@ FUNCTION_PASS("memprof", MemProfilerPass())
#ifndef FUNCTION_PASS_WITH_PARAMS
#define FUNCTION_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS)
#endif
+FUNCTION_PASS_WITH_PARAMS("early-cse",
+ "EarlyCSEPass",
+ [](bool UseMemorySSA) {
+ return EarlyCSEPass(UseMemorySSA);
+ },
+ parseEarlyCSEPassOptions,
+ "memssa")
+FUNCTION_PASS_WITH_PARAMS("ee-instrument",
+ "EntryExitInstrumenterPass",
+ [](bool PostInlining) {
+ return EntryExitInstrumenterPass(PostInlining);
+ },
+ parseEntryExitInstrumenterPassOptions,
+ "post-inline")
+FUNCTION_PASS_WITH_PARAMS("lower-matrix-intrinsics",
+ "LowerMatrixIntrinsicsPass",
+ [](bool Minimal) {
+ return LowerMatrixIntrinsicsPass(Minimal);
+ },
+ parseLowerMatrixIntrinsicsPassOptions,
+ "minimal")
FUNCTION_PASS_WITH_PARAMS("loop-unroll",
"LoopUnrollPass",
[](LoopUnrollOptions Opts) {
@@ -345,6 +393,13 @@ FUNCTION_PASS_WITH_PARAMS("loop-unroll",
"no-profile-peeling;profile-peeling;"
"no-runtime;runtime;"
"no-upperbound;upperbound")
+FUNCTION_PASS_WITH_PARAMS("asan",
+ "AddressSanitizerPass",
+ [](AddressSanitizerOptions Opts) {
+ return AddressSanitizerPass(Opts);
+ },
+ parseASanPassOptions,
+ "kernel")
FUNCTION_PASS_WITH_PARAMS("msan",
"MemorySanitizerPass",
[](MemorySanitizerOptions Opts) {
@@ -381,9 +436,9 @@ FUNCTION_PASS_WITH_PARAMS("mldst-motion",
parseMergedLoadStoreMotionOptions,
"no-split-footer-bb;split-footer-bb")
FUNCTION_PASS_WITH_PARAMS("gvn",
- "GVN",
+ "GVNPass",
[](GVNOptions Opts) {
- return GVN(Opts);
+ return GVNPass(Opts);
},
parseGVNOptions,
"no-pre;pre;"
@@ -399,6 +454,16 @@ FUNCTION_PASS_WITH_PARAMS("print<stack-lifetime>",
"may;must")
#undef FUNCTION_PASS_WITH_PARAMS
+#ifndef LOOPNEST_PASS
+#define LOOPNEST_PASS(NAME, CREATE_PASS)
+#endif
+LOOPNEST_PASS("lnicm", LNICMPass())
+LOOPNEST_PASS("loop-flatten", LoopFlattenPass())
+LOOPNEST_PASS("loop-interchange", LoopInterchangePass())
+LOOPNEST_PASS("loop-unroll-and-jam", LoopUnrollAndJamPass())
+LOOPNEST_PASS("no-op-loopnest", NoOpLoopNestPass())
+#undef LOOPNEST_PASS
+
#ifndef LOOP_ANALYSIS
#define LOOP_ANALYSIS(NAME, CREATE_PASS)
#endif
@@ -416,11 +481,8 @@ LOOP_PASS("canon-freeze", CanonicalizeFreezeInLoopsPass())
LOOP_PASS("dot-ddg", DDGDotPrinterPass())
LOOP_PASS("invalidate<all>", InvalidateAllAnalysesPass())
LOOP_PASS("licm", LICMPass())
-LOOP_PASS("lnicm", LNICMPass())
-LOOP_PASS("loop-flatten", LoopFlattenPass())
LOOP_PASS("loop-idiom", LoopIdiomRecognizePass())
LOOP_PASS("loop-instsimplify", LoopInstSimplifyPass())
-LOOP_PASS("loop-interchange", LoopInterchangePass())
LOOP_PASS("loop-rotate", LoopRotatePass())
LOOP_PASS("no-op-loop", NoOpLoopPass())
LOOP_PASS("print", PrintLoopPass(dbgs()))
@@ -428,7 +490,6 @@ LOOP_PASS("loop-deletion", LoopDeletionPass())
LOOP_PASS("loop-simplifycfg", LoopSimplifyCFGPass())
LOOP_PASS("loop-reduce", LoopStrengthReducePass())
LOOP_PASS("indvars", IndVarSimplifyPass())
-LOOP_PASS("loop-unroll-and-jam", LoopUnrollAndJamPass())
LOOP_PASS("loop-unroll-full", LoopFullUnrollPass())
LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs()))
LOOP_PASS("print<ddg>", DDGAnalysisPrinterPass(dbgs()))
diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp
index 5a48923bce8a..8e6be6730ea4 100644
--- a/llvm/lib/Passes/StandardInstrumentations.cpp
+++ b/llvm/lib/Passes/StandardInstrumentations.cpp
@@ -29,10 +29,14 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Program.h"
+#include "llvm/Support/Regex.h"
#include "llvm/Support/raw_ostream.h"
+#include <unordered_map>
#include <unordered_set>
+#include <utility>
#include <vector>
using namespace llvm;
@@ -40,10 +44,11 @@ using namespace llvm;
cl::opt<bool> PreservedCFGCheckerInstrumentation::VerifyPreservedCFG(
"verify-cfg-preserved", cl::Hidden,
#ifdef NDEBUG
- cl::init(false));
+ cl::init(false)
#else
- cl::init(true));
+ cl::init(true)
#endif
+ );
// An option that prints out the IR after passes, similar to
// -print-after-all except that it only prints the IR after passes that
@@ -79,7 +84,9 @@ enum class ChangePrinter {
PrintChangedDiffVerbose,
PrintChangedDiffQuiet,
PrintChangedColourDiffVerbose,
- PrintChangedColourDiffQuiet
+ PrintChangedColourDiffQuiet,
+ PrintChangedDotCfgVerbose,
+ PrintChangedDotCfgQuiet
};
static cl::opt<ChangePrinter> PrintChanged(
"print-changed", cl::desc("Print changed IRs"), cl::Hidden,
@@ -95,6 +102,10 @@ static cl::opt<ChangePrinter> PrintChanged(
"Display patch-like changes with color"),
clEnumValN(ChangePrinter::PrintChangedColourDiffQuiet, "cdiff-quiet",
"Display patch-like changes in quiet mode with color"),
+ clEnumValN(ChangePrinter::PrintChangedDotCfgVerbose, "dot-cfg",
+ "Create a website with graphical changes"),
+ clEnumValN(ChangePrinter::PrintChangedDotCfgQuiet, "dot-cfg-quiet",
+ "Create a website with graphical changes in quiet mode"),
// Sentinel value for unspecified option.
clEnumValN(ChangePrinter::PrintChangedVerbose, "", "")));
@@ -119,6 +130,40 @@ static cl::opt<std::string>
DiffBinary("print-changed-diff-path", cl::Hidden, cl::init("diff"),
cl::desc("system diff used by change reporters"));
+// An option for specifying the dot used by
+// print-changed=[dot-cfg | dot-cfg-quiet]
+static cl::opt<std::string>
+ DotBinary("print-changed-dot-path", cl::Hidden, cl::init("dot"),
+ cl::desc("system dot used by change reporters"));
+
+// An option that determines the colour used for elements that are only
+// in the before part. Must be a colour named in appendix J of
+// https://graphviz.org/pdf/dotguide.pdf
+cl::opt<std::string>
+ BeforeColour("dot-cfg-before-color",
+ cl::desc("Color for dot-cfg before elements."), cl::Hidden,
+ cl::init("red"));
+// An option that determines the colour used for elements that are only
+// in the after part. Must be a colour named in appendix J of
+// https://graphviz.org/pdf/dotguide.pdf
+cl::opt<std::string> AfterColour("dot-cfg-after-color",
+ cl::desc("Color for dot-cfg after elements."),
+ cl::Hidden, cl::init("forestgreen"));
+// An option that determines the colour used for elements that are in both
+// the before and after parts. Must be a colour named in appendix J of
+// https://graphviz.org/pdf/dotguide.pdf
+cl::opt<std::string>
+ CommonColour("dot-cfg-common-color",
+ cl::desc("Color for dot-cfg common elements."), cl::Hidden,
+ cl::init("black"));
+
+// An option that determines where the generated website file (named
+// passes.html) and the associated pdf files (named diff_*.pdf) are saved.
+static cl::opt<std::string> DotCfgDir(
+ "dot-cfg-dir",
+ cl::desc("Generate dot files into specified directory for changed IRs"),
+ cl::Hidden, cl::init("./"));
+
namespace {
// Perform a system based diff between \p Before and \p After, using
@@ -166,7 +211,8 @@ std::string doSystemDiff(StringRef Before, StringRef After,
SmallString<128> ULF =
formatv("--unchanged-line-format={0}", UnchangedLineFormat);
- StringRef Args[] = {"-w", "-d", OLF, NLF, ULF, FileName[0], FileName[1]};
+ StringRef Args[] = {DiffBinary, "-w", "-d", OLF,
+ NLF, ULF, FileName[0], FileName[1]};
Optional<StringRef> Redirects[] = {None, StringRef(FileName[2]), None};
int Result = sys::ExecuteAndWait(*DiffExe, Args, None, Redirects);
if (Result < 0)
@@ -230,10 +276,9 @@ void printIR(raw_ostream &OS, const Function *F) {
OS << *F;
}
-void printIR(raw_ostream &OS, const Module *M,
- bool ShouldPreserveUseListOrder = false) {
+void printIR(raw_ostream &OS, const Module *M) {
if (isFunctionInPrintList("*") || forcePrintModuleIR()) {
- M->print(OS, nullptr, ShouldPreserveUseListOrder);
+ M->print(OS, nullptr);
} else {
for (const auto &F : M->functions()) {
printIR(OS, &F);
@@ -323,21 +368,20 @@ bool shouldPrintIR(Any IR) {
/// Generic IR-printing helper that unpacks a pointer to IRUnit wrapped into
/// llvm::Any and does actual print job.
-void unwrapAndPrint(raw_ostream &OS, Any IR,
- bool ShouldPreserveUseListOrder = false) {
+void unwrapAndPrint(raw_ostream &OS, Any IR) {
if (!shouldPrintIR(IR))
return;
if (forcePrintModuleIR()) {
auto *M = unwrapModule(IR);
assert(M && "should have unwrapped module");
- printIR(OS, M, ShouldPreserveUseListOrder);
+ printIR(OS, M);
return;
}
if (any_isa<const Module *>(IR)) {
const Module *M = any_cast<const Module *>(IR);
- printIR(OS, M, ShouldPreserveUseListOrder);
+ printIR(OS, M);
return;
}
@@ -368,20 +412,46 @@ bool isIgnored(StringRef PassID) {
"DevirtSCCRepeatedPass", "ModuleInlinerWrapperPass"});
}
+std::string makeHTMLReady(StringRef SR) {
+ std::string S;
+ while (true) {
+ StringRef Clean =
+ SR.take_until([](char C) { return C == '<' || C == '>'; });
+ S.append(Clean.str());
+ SR = SR.drop_front(Clean.size());
+ if (SR.size() == 0)
+ return S;
+ S.append(SR[0] == '<' ? "&lt;" : "&gt;");
+ SR = SR.drop_front();
+ }
+ llvm_unreachable("problems converting string to HTML");
+}
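A quick usage sketch of the helper above, escaping a symbol name before it is embedded in the generated passes.html page (illustrative, and it relies on makeHTMLReady() as defined above):

#include <string>

std::string Cell = makeHTMLReady("llvm::SmallVector<int, 4>");
// Angle brackets become HTML entities: "llvm::SmallVector&lt;int, 4&gt;".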
+
+// Return the module when that is the appropriate level of comparison for \p IR.
+const Module *getModuleForComparison(Any IR) {
+ if (any_isa<const Module *>(IR))
+ return any_cast<const Module *>(IR);
+ if (any_isa<const LazyCallGraph::SCC *>(IR))
+ return any_cast<const LazyCallGraph::SCC *>(IR)
+ ->begin()
+ ->getFunction()
+ .getParent();
+ return nullptr;
+}
+
} // namespace
-template <typename IRUnitT>
-ChangeReporter<IRUnitT>::~ChangeReporter<IRUnitT>() {
+template <typename T> ChangeReporter<T>::~ChangeReporter<T>() {
assert(BeforeStack.empty() && "Problem with Change Printer stack.");
}
-template <typename IRUnitT>
-bool ChangeReporter<IRUnitT>::isInterestingFunction(const Function &F) {
+template <typename T>
+bool ChangeReporter<T>::isInterestingFunction(const Function &F) {
return isFunctionInPrintList(F.getName());
}
-template <typename IRUnitT>
-bool ChangeReporter<IRUnitT>::isInterestingPass(StringRef PassID) {
+template <typename T>
+bool ChangeReporter<T>::isInterestingPass(StringRef PassID) {
if (isIgnored(PassID))
return false;
@@ -392,8 +462,8 @@ bool ChangeReporter<IRUnitT>::isInterestingPass(StringRef PassID) {
// Return true when this is a pass on IR for which printing
// of changes is desired.
-template <typename IRUnitT>
-bool ChangeReporter<IRUnitT>::isInteresting(Any IR, StringRef PassID) {
+template <typename T>
+bool ChangeReporter<T>::isInteresting(Any IR, StringRef PassID) {
if (!isInterestingPass(PassID))
return false;
if (any_isa<const Function *>(IR))
@@ -401,8 +471,8 @@ bool ChangeReporter<IRUnitT>::isInteresting(Any IR, StringRef PassID) {
return true;
}
-template <typename IRUnitT>
-void ChangeReporter<IRUnitT>::saveIRBeforePass(Any IR, StringRef PassID) {
+template <typename T>
+void ChangeReporter<T>::saveIRBeforePass(Any IR, StringRef PassID) {
// Always need to place something on the stack because invalidated passes
// are not given the IR so it cannot be determined whether the pass was for
// something that was filtered out.
@@ -418,12 +488,12 @@ void ChangeReporter<IRUnitT>::saveIRBeforePass(Any IR, StringRef PassID) {
}
// Save the IR representation on the stack.
- IRUnitT &Data = BeforeStack.back();
+ T &Data = BeforeStack.back();
generateIRRepresentation(IR, PassID, Data);
}
-template <typename IRUnitT>
-void ChangeReporter<IRUnitT>::handleIRAfterPass(Any IR, StringRef PassID) {
+template <typename T>
+void ChangeReporter<T>::handleIRAfterPass(Any IR, StringRef PassID) {
assert(!BeforeStack.empty() && "Unexpected empty stack encountered.");
std::string Name = getIRName(IR);
@@ -436,13 +506,13 @@ void ChangeReporter<IRUnitT>::handleIRAfterPass(Any IR, StringRef PassID) {
handleFiltered(PassID, Name);
} else {
// Get the before rep from the stack
- IRUnitT &Before = BeforeStack.back();
+ T &Before = BeforeStack.back();
// Create the after rep
- IRUnitT After;
+ T After;
generateIRRepresentation(IR, PassID, After);
// Was there a change in IR?
- if (same(Before, After)) {
+ if (Before == After) {
if (VerboseMode)
omitAfter(PassID, Name);
} else
@@ -451,8 +521,8 @@ void ChangeReporter<IRUnitT>::handleIRAfterPass(Any IR, StringRef PassID) {
BeforeStack.pop_back();
}
-template <typename IRUnitT>
-void ChangeReporter<IRUnitT>::handleInvalidatedPass(StringRef PassID) {
+template <typename T>
+void ChangeReporter<T>::handleInvalidatedPass(StringRef PassID) {
assert(!BeforeStack.empty() && "Unexpected empty stack encountered.");
// Always flag it as invalidated as we cannot determine when
@@ -464,8 +534,8 @@ void ChangeReporter<IRUnitT>::handleInvalidatedPass(StringRef PassID) {
BeforeStack.pop_back();
}
-template <typename IRUnitT>
-void ChangeReporter<IRUnitT>::registerRequiredCallbacks(
+template <typename T>
+void ChangeReporter<T>::registerRequiredCallbacks(
PassInstrumentationCallbacks &PIC) {
PIC.registerBeforeNonSkippedPassCallback(
[this](StringRef P, Any IR) { saveIRBeforePass(IR, P); });
@@ -480,50 +550,40 @@ void ChangeReporter<IRUnitT>::registerRequiredCallbacks(
});
}
-ChangedBlockData::ChangedBlockData(const BasicBlock &B)
- : Label(B.getName().str()) {
- raw_string_ostream SS(Body);
- B.print(SS, nullptr, true, true);
-}
-
-template <typename IRUnitT>
-TextChangeReporter<IRUnitT>::TextChangeReporter(bool Verbose)
- : ChangeReporter<IRUnitT>(Verbose), Out(dbgs()) {}
+template <typename T>
+TextChangeReporter<T>::TextChangeReporter(bool Verbose)
+ : ChangeReporter<T>(Verbose), Out(dbgs()) {}
-template <typename IRUnitT>
-void TextChangeReporter<IRUnitT>::handleInitialIR(Any IR) {
+template <typename T> void TextChangeReporter<T>::handleInitialIR(Any IR) {
// Always print the module.
// Unwrap and print directly to avoid filtering problems in general routines.
auto *M = unwrapModule(IR, /*Force=*/true);
assert(M && "Expected module to be unwrapped when forced.");
Out << "*** IR Dump At Start ***\n";
- M->print(Out, nullptr,
- /*ShouldPreserveUseListOrder=*/true);
+ M->print(Out, nullptr);
}
-template <typename IRUnitT>
-void TextChangeReporter<IRUnitT>::omitAfter(StringRef PassID,
- std::string &Name) {
+template <typename T>
+void TextChangeReporter<T>::omitAfter(StringRef PassID, std::string &Name) {
Out << formatv("*** IR Dump After {0} on {1} omitted because no change ***\n",
PassID, Name);
}
-template <typename IRUnitT>
-void TextChangeReporter<IRUnitT>::handleInvalidated(StringRef PassID) {
+template <typename T>
+void TextChangeReporter<T>::handleInvalidated(StringRef PassID) {
Out << formatv("*** IR Pass {0} invalidated ***\n", PassID);
}
-template <typename IRUnitT>
-void TextChangeReporter<IRUnitT>::handleFiltered(StringRef PassID,
- std::string &Name) {
+template <typename T>
+void TextChangeReporter<T>::handleFiltered(StringRef PassID,
+ std::string &Name) {
SmallString<20> Banner =
formatv("*** IR Dump After {0} on {1} filtered out ***\n", PassID, Name);
Out << Banner;
}
-template <typename IRUnitT>
-void TextChangeReporter<IRUnitT>::handleIgnored(StringRef PassID,
- std::string &Name) {
+template <typename T>
+void TextChangeReporter<T>::handleIgnored(StringRef PassID, std::string &Name) {
Out << formatv("*** IR Pass {0} on {1} ignored ***\n", PassID, Name);
}
@@ -538,8 +598,7 @@ void IRChangedPrinter::registerCallbacks(PassInstrumentationCallbacks &PIC) {
void IRChangedPrinter::generateIRRepresentation(Any IR, StringRef PassID,
std::string &Output) {
raw_string_ostream OS(Output);
- unwrapAndPrint(OS, IR,
- /*ShouldPreserveUseListOrder=*/true);
+ unwrapAndPrint(OS, IR);
OS.str();
}
@@ -561,14 +620,10 @@ void IRChangedPrinter::handleAfter(StringRef PassID, std::string &Name,
Out << "*** IR Dump After " << PassID << " on " << Name << " ***\n" << After;
}
-bool IRChangedPrinter::same(const std::string &S1, const std::string &S2) {
- return S1 == S2;
-}
-
-template <typename IRData>
-void OrderedChangedData<IRData>::report(
+template <typename T>
+void OrderedChangedData<T>::report(
const OrderedChangedData &Before, const OrderedChangedData &After,
- function_ref<void(const IRData *, const IRData *)> HandlePair) {
+ function_ref<void(const T *, const T *)> HandlePair) {
const auto &BFD = Before.getData();
const auto &AFD = After.getData();
std::vector<std::string>::const_iterator BI = Before.getOrder().begin();
@@ -576,21 +631,21 @@ void OrderedChangedData<IRData>::report(
std::vector<std::string>::const_iterator AI = After.getOrder().begin();
std::vector<std::string>::const_iterator AE = After.getOrder().end();
- auto handlePotentiallyRemovedIRData = [&](std::string S) {
+ auto HandlePotentiallyRemovedData = [&](std::string S) {
// The order in LLVM may have changed so check if still exists.
if (!AFD.count(S)) {
// This has been removed.
HandlePair(&BFD.find(*BI)->getValue(), nullptr);
}
};
- auto handleNewIRData = [&](std::vector<const IRData *> &Q) {
+ auto HandleNewData = [&](std::vector<const T *> &Q) {
// Print out any queued up new sections
- for (const IRData *NBI : Q)
+ for (const T *NBI : Q)
HandlePair(nullptr, NBI);
Q.clear();
};
- // Print out the IRData in the after order, with before ones interspersed
+ // Print out the data in the after order, with before ones interspersed
// appropriately (ie, somewhere near where they were in the before list).
// Start at the beginning of both lists. Loop through the
// after list. If an element is common, then advance in the before list
@@ -599,26 +654,26 @@ void OrderedChangedData<IRData>::report(
// common, then enqueue it for reporting. When the after list is exhausted,
// loop through the before list, reporting any removed ones. Finally,
// report the rest of the enqueued new ones.
- std::vector<const IRData *> NewIRDataQueue;
+ std::vector<const T *> NewDataQueue;
while (AI != AE) {
if (!BFD.count(*AI)) {
// This section is new so place it in the queue. This will cause it
// to be reported after deleted sections.
- NewIRDataQueue.emplace_back(&AFD.find(*AI)->getValue());
+ NewDataQueue.emplace_back(&AFD.find(*AI)->getValue());
++AI;
continue;
}
// This section is in both; advance and print out any before-only
// until we get to it.
while (*BI != *AI) {
- handlePotentiallyRemovedIRData(*BI);
+ HandlePotentiallyRemovedData(*BI);
++BI;
}
// Report any new sections that were queued up and waiting.
- handleNewIRData(NewIRDataQueue);
+ HandleNewData(NewDataQueue);
- const IRData &AData = AFD.find(*AI)->getValue();
- const IRData &BData = BFD.find(*AI)->getValue();
+ const T &AData = AFD.find(*AI)->getValue();
+ const T &BData = BFD.find(*AI)->getValue();
HandlePair(&BData, &AData);
++BI;
++AI;
@@ -626,39 +681,42 @@ void OrderedChangedData<IRData>::report(
// Check any remaining before sections to see if they have been removed
while (BI != BE) {
- handlePotentiallyRemovedIRData(*BI);
+ HandlePotentiallyRemovedData(*BI);
++BI;
}
- handleNewIRData(NewIRDataQueue);
+ HandleNewData(NewDataQueue);
}
-void ChangedIRComparer::compare(Any IR, StringRef Prefix, StringRef PassID,
- StringRef Name) {
- if (!getModuleForComparison(IR)) {
- // Not a module so just handle the single function.
- assert(Before.getData().size() == 1 && "Expected only one function.");
- assert(After.getData().size() == 1 && "Expected only one function.");
- handleFunctionCompare(Name, Prefix, PassID, false,
- Before.getData().begin()->getValue(),
- After.getData().begin()->getValue());
+template <typename T>
+void IRComparer<T>::compare(
+ bool CompareModule,
+ std::function<void(bool InModule, unsigned Minor,
+ const FuncDataT<T> &Before, const FuncDataT<T> &After)>
+ CompareFunc) {
+ if (!CompareModule) {
+ // Just handle the single function.
+ assert(Before.getData().size() == 1 && After.getData().size() == 1 &&
+ "Expected only one function.");
+ CompareFunc(false, 0, Before.getData().begin()->getValue(),
+ After.getData().begin()->getValue());
return;
}
- ChangedIRData::report(
- Before, After, [&](const ChangedFuncData *B, const ChangedFuncData *A) {
- ChangedFuncData Missing;
- if (!B)
- B = &Missing;
- else if (!A)
- A = &Missing;
- assert(B != &Missing && A != &Missing &&
- "Both functions cannot be missing.");
- handleFunctionCompare(Name, Prefix, PassID, true, *B, *A);
- });
+ unsigned Minor = 0;
+ FuncDataT<T> Missing("");
+ IRDataT<T>::report(Before, After,
+ [&](const FuncDataT<T> *B, const FuncDataT<T> *A) {
+ assert((B || A) && "Both functions cannot be missing.");
+ if (!B)
+ B = &Missing;
+ else if (!A)
+ A = &Missing;
+ CompareFunc(true, Minor++, *B, *A);
+ });
}
-void ChangedIRComparer::analyzeIR(Any IR, ChangedIRData &Data) {
+template <typename T> void IRComparer<T>::analyzeIR(Any IR, IRDataT<T> &Data) {
if (const Module *M = getModuleForComparison(IR)) {
// Create data for each existing/interesting function in the module.
for (const Function &F : *M)
@@ -678,27 +736,16 @@ void ChangedIRComparer::analyzeIR(Any IR, ChangedIRData &Data) {
generateFunctionData(Data, *F);
}
-const Module *ChangedIRComparer::getModuleForComparison(Any IR) {
- if (any_isa<const Module *>(IR))
- return any_cast<const Module *>(IR);
- if (any_isa<const LazyCallGraph::SCC *>(IR))
- return any_cast<const LazyCallGraph::SCC *>(IR)
- ->begin()
- ->getFunction()
- .getParent();
- return nullptr;
-}
-
-bool ChangedIRComparer::generateFunctionData(ChangedIRData &Data,
- const Function &F) {
+template <typename T>
+bool IRComparer<T>::generateFunctionData(IRDataT<T> &Data, const Function &F) {
if (!F.isDeclaration() && isFunctionInPrintList(F.getName())) {
- ChangedFuncData CFD;
+ FuncDataT<T> FD(F.getEntryBlock().getName().str());
for (const auto &B : F) {
- CFD.getOrder().emplace_back(B.getName());
- CFD.getData().insert({B.getName(), B});
+ FD.getOrder().emplace_back(B.getName());
+ FD.getData().insert({B.getName(), B});
}
Data.getOrder().emplace_back(F.getName());
- Data.getData().insert({F.getName(), CFD});
+ Data.getData().insert({F.getName(), FD});
return true;
}
return false;
@@ -792,7 +839,7 @@ bool PrintIRInstrumentation::shouldPrintBeforePass(StringRef PassID) {
return true;
StringRef PassName = PIC->getPassNameForClassName(PassID);
- return llvm::is_contained(printBeforePasses(), PassName);
+ return is_contained(printBeforePasses(), PassName);
}
bool PrintIRInstrumentation::shouldPrintAfterPass(StringRef PassID) {
@@ -800,7 +847,7 @@ bool PrintIRInstrumentation::shouldPrintAfterPass(StringRef PassID) {
return true;
StringRef PassName = PIC->getPassNameForClassName(PassID);
- return llvm::is_contained(printAfterPasses(), PassName);
+ return is_contained(printAfterPasses(), PassName);
}
void PrintIRInstrumentation::registerCallbacks(
@@ -874,14 +921,13 @@ void PrintPassInstrumentation::registerCallbacks(
SpecialPasses.emplace_back("PassAdaptor");
}
- PIC.registerBeforeSkippedPassCallback(
- [this, SpecialPasses](StringRef PassID, Any IR) {
- assert(!isSpecialPass(PassID, SpecialPasses) &&
- "Unexpectedly skipping special pass");
+ PIC.registerBeforeSkippedPassCallback([this, SpecialPasses](StringRef PassID,
+ Any IR) {
+ assert(!isSpecialPass(PassID, SpecialPasses) &&
+ "Unexpectedly skipping special pass");
- print() << "Skipping pass: " << PassID << " on " << getIRName(IR)
- << "\n";
- });
+ print() << "Skipping pass: " << PassID << " on " << getIRName(IR) << "\n";
+ });
PIC.registerBeforeNonSkippedPassCallback([this, SpecialPasses](
StringRef PassID, Any IR) {
if (isSpecialPass(PassID, SpecialPasses))
@@ -1079,19 +1125,18 @@ void PreservedCFGCheckerInstrumentation::registerCallbacks(
report_fatal_error(Twine("CFG unexpectedly changed by ", Pass));
};
- PIC.registerBeforeNonSkippedPassCallback(
- [this, &FAM](StringRef P, Any IR) {
+ PIC.registerBeforeNonSkippedPassCallback([this, &FAM](StringRef P, Any IR) {
#ifdef LLVM_ENABLE_ABI_BREAKING_CHECKS
- assert(&PassStack.emplace_back(P));
+ assert(&PassStack.emplace_back(P));
#endif
- (void)this;
- if (!any_isa<const Function *>(IR))
- return;
+ (void)this;
+ if (!any_isa<const Function *>(IR))
+ return;
- const auto *F = any_cast<const Function *>(IR);
- // Make sure a fresh CFG snapshot is available before the pass.
- FAM.getResult<PreservedCFGCheckerAnalysis>(*const_cast<Function *>(F));
- });
+ const auto *F = any_cast<const Function *>(IR);
+ // Make sure a fresh CFG snapshot is available before the pass.
+ FAM.getResult<PreservedCFGCheckerAnalysis>(*const_cast<Function *>(F));
+ });
PIC.registerAfterPassInvalidatedCallback(
[this](StringRef P, const PreservedAnalyses &PassPA) {
@@ -1165,36 +1210,38 @@ void VerifyInstrumentation::registerCallbacks(
InLineChangePrinter::~InLineChangePrinter() {}
void InLineChangePrinter::generateIRRepresentation(Any IR, StringRef PassID,
- ChangedIRData &D) {
- ChangedIRComparer::analyzeIR(IR, D);
+ IRDataT<EmptyData> &D) {
+ IRComparer<EmptyData>::analyzeIR(IR, D);
}
void InLineChangePrinter::handleAfter(StringRef PassID, std::string &Name,
- const ChangedIRData &Before,
- const ChangedIRData &After, Any IR) {
+ const IRDataT<EmptyData> &Before,
+ const IRDataT<EmptyData> &After, Any IR) {
SmallString<20> Banner =
formatv("*** IR Dump After {0} on {1} ***\n", PassID, Name);
Out << Banner;
- ChangedIRComparer(Out, Before, After, UseColour)
- .compare(IR, "", PassID, Name);
+ IRComparer<EmptyData>(Before, After)
+ .compare(getModuleForComparison(IR),
+ [&](bool InModule, unsigned Minor,
+ const FuncDataT<EmptyData> &Before,
+ const FuncDataT<EmptyData> &After) -> void {
+ handleFunctionCompare(Name, "", PassID, " on ", InModule,
+ Minor, Before, After);
+ });
Out << "\n";
}
-bool InLineChangePrinter::same(const ChangedIRData &D1,
- const ChangedIRData &D2) {
- return D1 == D2;
-}
-
-void ChangedIRComparer::handleFunctionCompare(StringRef Name, StringRef Prefix,
- StringRef PassID, bool InModule,
- const ChangedFuncData &Before,
- const ChangedFuncData &After) {
+void InLineChangePrinter::handleFunctionCompare(
+ StringRef Name, StringRef Prefix, StringRef PassID, StringRef Divider,
+ bool InModule, unsigned Minor, const FuncDataT<EmptyData> &Before,
+ const FuncDataT<EmptyData> &After) {
// Print a banner when this is being shown in the context of a module
if (InModule)
Out << "\n*** IR for function " << Name << " ***\n";
- ChangedFuncData::report(
- Before, After, [&](const ChangedBlockData *B, const ChangedBlockData *A) {
+ FuncDataT<EmptyData>::report(
+ Before, After,
+ [&](const BlockDataT<EmptyData> *B, const BlockDataT<EmptyData> *A) {
StringRef BStr = B ? B->getBody() : "\n";
StringRef AStr = A ? A->getBody() : "\n";
const std::string Removed =
@@ -1210,7 +1257,863 @@ void InLineChangePrinter::registerCallbacks(PassInstrumentationCallbacks &PIC) {
PrintChanged == ChangePrinter::PrintChangedDiffQuiet ||
PrintChanged == ChangePrinter::PrintChangedColourDiffVerbose ||
PrintChanged == ChangePrinter::PrintChangedColourDiffQuiet)
- TextChangeReporter<ChangedIRData>::registerRequiredCallbacks(PIC);
+ TextChangeReporter<IRDataT<EmptyData>>::registerRequiredCallbacks(PIC);
+}
+
+namespace {
+
+enum IRChangeDiffType { InBefore, InAfter, IsCommon, NumIRChangeDiffTypes };
+
+// Describe where a given element exists.
+std::string Colours[NumIRChangeDiffTypes];
+
+class DisplayNode;
+class DotCfgDiffDisplayGraph;
+
+// Base class for a node or edge in the dot-cfg-changes graph.
+class DisplayElement {
+public:
+ // Is this in before, after, or both?
+ IRChangeDiffType getType() const { return Type; }
+
+protected:
+ DisplayElement(IRChangeDiffType T) : Type(T) {}
+ const IRChangeDiffType Type;
+};
+
+// An edge representing a transition between basic blocks in the
+// dot-cfg-changes graph.
+class DisplayEdge : public DisplayElement {
+public:
+ DisplayEdge(std::string V, DisplayNode &Node, IRChangeDiffType T)
+ : DisplayElement(T), Value(V), Node(Node) {}
+ // The value on which the transition is made.
+ std::string getValue() const { return Value; }
+ // The node (representing a basic block) reached by this transition.
+ const DisplayNode &getDestinationNode() const { return Node; }
+
+protected:
+ std::string Value;
+ const DisplayNode &Node;
+};
+
+// A node in the dot-cfg-changes graph which represents a basic block.
+class DisplayNode : public DisplayElement {
+public:
+ // \p C is the content for the node; \p T indicates the colour for the
+ // outline of the node.
+ DisplayNode(std::string C, IRChangeDiffType T)
+ : DisplayElement(T), Content(C) {}
+
+ // Iterator to the child nodes. Required by GraphWriter.
+ using ChildIterator = std::unordered_set<DisplayNode *>::const_iterator;
+ ChildIterator children_begin() const { return Children.cbegin(); }
+ ChildIterator children_end() const { return Children.cend(); }
+
+ // Iterator for the edges. Required by GraphWriter.
+ using EdgeIterator = std::vector<DisplayEdge *>::const_iterator;
+ EdgeIterator edges_begin() const { return EdgePtrs.cbegin(); }
+ EdgeIterator edges_end() const { return EdgePtrs.cend(); }
+
+ // Create an edge to \p Node on value \p V, with type \p T.
+ void createEdge(StringRef V, DisplayNode &Node, IRChangeDiffType T);
+
+ // Return the content of this node.
+ std::string getContent() const { return Content; }
+
+ // Return the edge to node \p To.
+ const DisplayEdge &getEdge(const DisplayNode &To) const {
+ assert(EdgeMap.find(&To) != EdgeMap.end() && "Expected to find edge.");
+ return *EdgeMap.find(&To)->second;
+ }
+
+ // Return the value for the transition to basic block \p Sink.
+ // Required by GraphWriter.
+ std::string getEdgeSourceLabel(const DisplayNode &Sink) const {
+ return getEdge(Sink).getValue();
+ }
+
+ void createEdgeMap();
+
+protected:
+ const std::string Content;
+
+ // Place to collect all of the edges. Once they are all in the vector,
+ // the vector will not reallocate, so we can then use pointers to them,
+ // which are required by the graph writing routines.
+ std::vector<DisplayEdge> Edges;
+
+ std::vector<DisplayEdge *> EdgePtrs;
+ std::unordered_set<DisplayNode *> Children;
+ std::unordered_map<const DisplayNode *, const DisplayEdge *> EdgeMap;
+
+ // Safeguard adding of edges.
+ bool AllEdgesCreated = false;
+};
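The comment above relies on the standard vector-invalidation rule: pointers into a std::vector stay valid only while no further growth happens. A tiny standalone illustration of the "fill first, then take pointers" pattern used here:

    #include <cassert>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> Edges;
      Edges.push_back("a");
      Edges.push_back("b");
      Edges.push_back("c");
      // Only after all push_backs is it safe to keep pointers into Edges;
      // any further growth could reallocate the storage and invalidate them.
      std::vector<const std::string *> EdgePtrs;
      for (const std::string &E : Edges)
        EdgePtrs.push_back(&E);
      assert(*EdgePtrs[1] == "b");
    }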
+
+// Class representing a difference display (corresponds to a pdf file).
+class DotCfgDiffDisplayGraph {
+public:
+ DotCfgDiffDisplayGraph(std::string Name) : GraphName(Name) {}
+
+ // Generate the file into \p DotFile.
+ void generateDotFile(StringRef DotFile);
+
+ // Iterator to the nodes. Required by GraphWriter.
+ using NodeIterator = std::vector<DisplayNode *>::const_iterator;
+ NodeIterator nodes_begin() const {
+ assert(NodeGenerationComplete && "Unexpected children iterator creation");
+ return NodePtrs.cbegin();
+ }
+ NodeIterator nodes_end() const {
+ assert(NodeGenerationComplete && "Unexpected children iterator creation");
+ return NodePtrs.cend();
+ }
+
+ // Record the index of the entry node. At this point, we can build up
+ // vectors of pointers that are required by the graph routines.
+ void setEntryNode(unsigned N) {
+ // At this point, there will be no new nodes.
+ assert(!NodeGenerationComplete && "Unexpected node creation");
+ NodeGenerationComplete = true;
+ for (auto &N : Nodes)
+ NodePtrs.emplace_back(&N);
+
+ EntryNode = NodePtrs[N];
+ }
+
+ // Create a node.
+ void createNode(std::string C, IRChangeDiffType T) {
+ assert(!NodeGenerationComplete && "Unexpected node creation");
+ Nodes.emplace_back(C, T);
+ }
+ // Return the node at index \p N; indices are used instead of pointers to
+ // avoid problems when the vector reallocates.
+ DisplayNode &getNode(unsigned N) {
+ assert(N < Nodes.size() && "Node is out of bounds");
+ return Nodes[N];
+ }
+ unsigned size() const {
+ assert(NodeGenerationComplete && "Unexpected size query");
+ return Nodes.size();
+ }
+
+ // Return the name of the graph. Required by GraphWriter.
+ std::string getGraphName() const { return GraphName; }
+
+ // Return the string representing the differences for basic block \p Node.
+ // Required by GraphWriter.
+ std::string getNodeLabel(const DisplayNode &Node) const {
+ return Node.getContent();
+ }
+
+ // Return a string with colour information for Dot. Required by GraphWriter.
+ std::string getNodeAttributes(const DisplayNode &Node) const {
+ return attribute(Node.getType());
+ }
+
+ // Return a string with colour information for Dot. Required by GraphWriter.
+ std::string getEdgeColorAttr(const DisplayNode &From,
+ const DisplayNode &To) const {
+ return attribute(From.getEdge(To).getType());
+ }
+
+ // Get the starting basic block. Required by GraphWriter.
+ DisplayNode *getEntryNode() const {
+ assert(NodeGenerationComplete && "Unexpected entry node access");
+ return EntryNode;
+ }
+
+protected:
+ // Return the string containing the colour to use as a Dot attribute.
+ std::string attribute(IRChangeDiffType T) const;
+
+ bool NodeGenerationComplete = false;
+ const std::string GraphName;
+ std::vector<DisplayNode> Nodes;
+ std::vector<DisplayNode *> NodePtrs;
+ DisplayNode *EntryNode = nullptr;
+};
+
+void DisplayNode::createEdge(StringRef V, DisplayNode &Node,
+ IRChangeDiffType T) {
+ assert(!AllEdgesCreated && "Expected to be able to still create edges.");
+ Edges.emplace_back(V.str(), Node, T);
+ Children.insert(&Node);
+}
+
+void DisplayNode::createEdgeMap() {
+ // No more edges will be added so we can now use pointers to the edges
+ // as the vector will not grow and reallocate.
+ AllEdgesCreated = true;
+ for (auto &E : Edges)
+ EdgeMap.insert({&E.getDestinationNode(), &E});
+}
+
+class DotCfgDiffNode;
+class DotCfgDiff;
+
+// A class representing a basic block in the Dot difference graph.
+class DotCfgDiffNode {
+public:
+ DotCfgDiffNode() = delete;
+
+ // Create a node in Dot difference graph \p G representing the basic block
+ // represented by \p BD with type \p T (where it exists).
+ DotCfgDiffNode(DotCfgDiff &G, unsigned N, const BlockDataT<DCData> &BD,
+ IRChangeDiffType T)
+ : Graph(G), N(N), Data{&BD, nullptr}, Type(T) {}
+ DotCfgDiffNode(const DotCfgDiffNode &DN)
+ : Graph(DN.Graph), N(DN.N), Data{DN.Data[0], DN.Data[1]}, Type(DN.Type),
+ EdgesMap(DN.EdgesMap), Children(DN.Children), Edges(DN.Edges) {}
+
+ unsigned getIndex() const { return N; }
+
+ // The label of the basic block
+ StringRef getLabel() const {
+ assert(Data[0] && "Expected Data[0] to be set.");
+ return Data[0]->getLabel();
+ }
+ // Return where this block exists.
+ IRChangeDiffType getType() const { return Type; }
+ // Change this basic block from being only in before to being common.
+ // Save the pointer to \p Other.
+ void setCommon(const BlockDataT<DCData> &Other) {
+ assert(!Data[1] && "Expected only one block datum");
+ Data[1] = &Other;
+ Type = IsCommon;
+ }
+ // Add an edge to \p E of type {\p Value, \p T}.
+ void addEdge(unsigned E, StringRef Value, IRChangeDiffType T) {
+ // This is a new edge or it is an edge being made common.
+ assert((EdgesMap.count(E) == 0 || T == IsCommon) &&
+ "Unexpected edge count and type.");
+ EdgesMap[E] = {Value.str(), T};
+ }
+ // Record the children and create edges.
+ void finalize(DotCfgDiff &G);
+
+ // Return the value and type of the edge to node \p S.
+ std::pair<std::string, IRChangeDiffType> getEdge(const unsigned S) const {
+ assert(EdgesMap.count(S) == 1 && "Expected to find edge.");
+ return EdgesMap.at(S);
+ }
+
+ // Return the string representing the basic block.
+ std::string getBodyContent() const;
+
+ void createDisplayEdges(DotCfgDiffDisplayGraph &Graph, unsigned DisplayNode,
+ std::map<const unsigned, unsigned> &NodeMap) const;
+
+protected:
+ DotCfgDiff &Graph;
+ const unsigned N;
+ const BlockDataT<DCData> *Data[2];
+ IRChangeDiffType Type;
+ std::map<const unsigned, std::pair<std::string, IRChangeDiffType>> EdgesMap;
+ std::vector<unsigned> Children;
+ std::vector<unsigned> Edges;
+};
+
+// Class representing the difference graph between two functions.
+class DotCfgDiff {
+public:
+ // \p Title is the title given to the graph. \p Before and \p After are
+ // the before and after versions of the function, respectively.
+ DotCfgDiff(StringRef Title, const FuncDataT<DCData> &Before,
+ const FuncDataT<DCData> &After);
+
+ DotCfgDiff(const DotCfgDiff &) = delete;
+ DotCfgDiff &operator=(const DotCfgDiff &) = delete;
+
+ DotCfgDiffDisplayGraph createDisplayGraph(StringRef Title,
+ StringRef EntryNodeName);
+
+ // Return a string consisting of the labels for \p Source and \p Sink.
+ // The combination allows distinguishing changing transitions on the
+ // same value (i.e., a transition went to X before and goes to Y after).
+ // Required by GraphWriter.
+ StringRef getEdgeSourceLabel(const unsigned &Source,
+ const unsigned &Sink) const {
+ std::string S =
+ getNode(Source).getLabel().str() + " " + getNode(Sink).getLabel().str();
+ assert(EdgeLabels.count(S) == 1 && "Expected to find edge label.");
+ return EdgeLabels.find(S)->getValue();
+ }
+
+ // Return the number of basic blocks (nodes). Required by GraphWriter.
+ unsigned size() const { return Nodes.size(); }
+
+ const DotCfgDiffNode &getNode(unsigned N) const {
+ assert(N < Nodes.size() && "Unexpected index for node reference");
+ return Nodes[N];
+ }
+
+protected:
+ // Return the string surrounded by HTML to make it the appropriate colour.
+ std::string colourize(std::string S, IRChangeDiffType T) const;
+
+ void createNode(StringRef Label, const BlockDataT<DCData> &BD,
+ IRChangeDiffType T) {
+ unsigned Pos = Nodes.size();
+ Nodes.emplace_back(*this, Pos, BD, T);
+ NodePosition.insert({Label, Pos});
+ }
+
+ // TODO Nodes should probably be a StringMap<DotCfgDiffNode> after the
+ // display graph is separated out, which would remove the need for
+ // NodePosition.
+ std::vector<DotCfgDiffNode> Nodes;
+ StringMap<unsigned> NodePosition;
+ const std::string GraphName;
+
+ StringMap<std::string> EdgeLabels;
+};
+
+std::string DotCfgDiffNode::getBodyContent() const {
+ if (Type == IsCommon) {
+ assert(Data[1] && "Expected Data[1] to be set.");
+
+ StringRef SR[2];
+ for (unsigned I = 0; I < 2; ++I) {
+ SR[I] = Data[I]->getBody();
+ // drop initial '\n' if present
+ if (SR[I][0] == '\n')
+ SR[I] = SR[I].drop_front();
+ // drop predecessors as they can be big and are redundant
+ SR[I] = SR[I].drop_until([](char C) { return C == '\n'; }).drop_front();
+ }
+
+ SmallString<80> OldLineFormat = formatv(
+ "<FONT COLOR=\"{0}\">%l</FONT><BR align=\"left\"/>", Colours[InBefore]);
+ SmallString<80> NewLineFormat = formatv(
+ "<FONT COLOR=\"{0}\">%l</FONT><BR align=\"left\"/>", Colours[InAfter]);
+ SmallString<80> UnchangedLineFormat = formatv(
+ "<FONT COLOR=\"{0}\">%l</FONT><BR align=\"left\"/>", Colours[IsCommon]);
+ std::string Diff = Data[0]->getLabel().str();
+ Diff += ":\n<BR align=\"left\"/>" +
+ doSystemDiff(makeHTMLReady(SR[0]), makeHTMLReady(SR[1]),
+ OldLineFormat, NewLineFormat, UnchangedLineFormat);
+
+ // Diff adds in some empty colour changes which are not valid HTML
+ // so remove them. Colours are all lowercase alpha characters (as
+ // listed in https://graphviz.org/pdf/dotguide.pdf).
+ Regex R("<FONT COLOR=\"\\w+\"></FONT>");
+ while (true) {
+ std::string Error;
+ std::string S = R.sub("", Diff, &Error);
+ if (Error != "")
+ return Error;
+ if (S == Diff)
+ return Diff;
+ Diff = S;
+ }
+ llvm_unreachable("Should not get here");
+ }
+
+ // Put node out in the appropriate colour.
+ assert(!Data[1] && "Data[1] is set unexpectedly.");
+ std::string Body = makeHTMLReady(Data[0]->getBody());
+ const StringRef BS = Body;
+ StringRef BS1 = BS;
+ // Drop leading newline, if present.
+ if (BS.front() == '\n')
+ BS1 = BS1.drop_front(1);
+ // Get label.
+ StringRef Label = BS1.take_until([](char C) { return C == ':'; });
+ // drop predecessors as they can be big and are redundant
+ BS1 = BS1.drop_until([](char C) { return C == '\n'; }).drop_front();
+
+ std::string S = "<FONT COLOR=\"" + Colours[Type] + "\">" + Label.str() + ":";
+
+ // align each line to the left.
+ while (BS1.size()) {
+ S.append("<BR align=\"left\"/>");
+ StringRef Line = BS1.take_until([](char C) { return C == '\n'; });
+ S.append(Line.str());
+ BS1 = BS1.drop_front(Line.size() + 1);
+ }
+ S.append("<BR align=\"left\"/></FONT>");
+ return S;
+}
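For reference, the empty-FONT cleanup above can be expressed as a single substitution; here is a standalone equivalent using std::regex (the pass itself uses llvm::Regex in a loop so that substitution errors can be reported):

    #include <cassert>
    #include <regex>
    #include <string>

    int main() {
      std::string Diff = "<FONT COLOR=\"red\"></FONT>foo"
                         "<FONT COLOR=\"black\">bar</FONT>";
      // Strip the empty colour spans that the line-based diff leaves behind.
      const std::regex EmptyFont("<FONT COLOR=\"\\w+\"></FONT>");
      Diff = std::regex_replace(Diff, EmptyFont, "");
      assert(Diff == "foo<FONT COLOR=\"black\">bar</FONT>");
    }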
+
+std::string DotCfgDiff::colourize(std::string S, IRChangeDiffType T) const {
+ if (S.length() == 0)
+ return S;
+ return "<FONT COLOR=\"" + Colours[T] + "\">" + S + "</FONT>";
+}
+
+std::string DotCfgDiffDisplayGraph::attribute(IRChangeDiffType T) const {
+ return "color=" + Colours[T];
+}
+
+DotCfgDiff::DotCfgDiff(StringRef Title, const FuncDataT<DCData> &Before,
+ const FuncDataT<DCData> &After)
+ : GraphName(Title.str()) {
+ StringMap<IRChangeDiffType> EdgesMap;
+
+ // Handle each basic block in the before IR.
+ for (auto &B : Before.getData()) {
+ StringRef Label = B.getKey();
+ const BlockDataT<DCData> &BD = B.getValue();
+ createNode(Label, BD, InBefore);
+
+ // Create transitions with names made up of the from-block label, the value
+ // on which the transition is made, and the to-block label.
+ for (StringMap<std::string>::const_iterator Sink = BD.getData().begin(),
+ E = BD.getData().end();
+ Sink != E; ++Sink) {
+ std::string Key = (Label + " " + Sink->getKey().str()).str() + " " +
+ BD.getData().getSuccessorLabel(Sink->getKey()).str();
+ EdgesMap.insert({Key, InBefore});
+ }
+ }
+
+ // Handle each basic block in the after IR
+ for (auto &A : After.getData()) {
+ StringRef Label = A.getKey();
+ const BlockDataT<DCData> &BD = A.getValue();
+ unsigned C = NodePosition.count(Label);
+ if (C == 0)
+ // This only exists in the after IR. Create the node.
+ createNode(Label, BD, InAfter);
+ else {
+ assert(C == 1 && "Unexpected multiple nodes.");
+ Nodes[NodePosition[Label]].setCommon(BD);
+ }
+ // Add in the edges between the nodes (as common or only in after).
+ for (StringMap<std::string>::const_iterator Sink = BD.getData().begin(),
+ E = BD.getData().end();
+ Sink != E; ++Sink) {
+ std::string Key = (Label + " " + Sink->getKey().str()).str() + " " +
+ BD.getData().getSuccessorLabel(Sink->getKey()).str();
+ unsigned C = EdgesMap.count(Key);
+ if (C == 0)
+ EdgesMap.insert({Key, InAfter});
+ else {
+ EdgesMap[Key] = IsCommon;
+ }
+ }
+ }
+
+ // Now go through the map of edges and add them to the node.
+ for (auto &E : EdgesMap) {
+ // Extract the source, sink and value from the edge key.
+ StringRef S = E.getKey();
+ auto SP1 = S.rsplit(' ');
+ auto &SourceSink = SP1.first;
+ auto SP2 = SourceSink.split(' ');
+ StringRef Source = SP2.first;
+ StringRef Sink = SP2.second;
+ StringRef Value = SP1.second;
+
+ assert(NodePosition.count(Source) == 1 && "Expected to find node.");
+ DotCfgDiffNode &SourceNode = Nodes[NodePosition[Source]];
+ assert(NodePosition.count(Sink) == 1 && "Expected to find node.");
+ unsigned SinkNode = NodePosition[Sink];
+ IRChangeDiffType T = E.second;
+
+ // Look for an edge from Source to Sink
+ if (EdgeLabels.count(SourceSink) == 0)
+ EdgeLabels.insert({SourceSink, colourize(Value.str(), T)});
+ else {
+ StringRef V = EdgeLabels.find(SourceSink)->getValue();
+ std::string NV = colourize(V.str() + " " + Value.str(), T);
+ T = IsCommon;
+ EdgeLabels[SourceSink] = NV;
+ }
+ SourceNode.addEdge(SinkNode, Value, T);
+ }
+ for (auto &I : Nodes)
+ I.finalize(*this);
+}
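The edge bookkeeping above depends on a flat key of the form "<source> <sink> <value>" that is later taken apart again with rsplit/split. A standalone sketch with hypothetical labels (this works because LLVM block labels and branch values contain no spaces):

    #include <cassert>
    #include <string>

    int main() {
      // Edge-map key format used above: "<source label> <sink label> <value>".
      const std::string Source = "entry", Sink = "if.then", Value = "true";
      const std::string Key = Source + " " + Sink + " " + Value;

      // rsplit(' ') equivalent: the value is everything after the last space,
      // "<source> <sink>" is everything before it.
      const size_t LastSpace = Key.rfind(' ');
      const std::string SourceSink = Key.substr(0, LastSpace);
      const std::string ParsedValue = Key.substr(LastSpace + 1);

      // split(' ') equivalent: source and sink around the first space.
      const size_t FirstSpace = SourceSink.find(' ');
      assert(SourceSink.substr(0, FirstSpace) == Source);
      assert(SourceSink.substr(FirstSpace + 1) == Sink);
      assert(ParsedValue == Value);
    }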
+
+DotCfgDiffDisplayGraph DotCfgDiff::createDisplayGraph(StringRef Title,
+ StringRef EntryNodeName) {
+ assert(NodePosition.count(EntryNodeName) == 1 &&
+ "Expected to find entry block in map.");
+ unsigned Entry = NodePosition[EntryNodeName];
+ assert(Entry < Nodes.size() && "Expected to find entry node");
+ DotCfgDiffDisplayGraph G(Title.str());
+
+ std::map<const unsigned, unsigned> NodeMap;
+
+ int EntryIndex = -1;
+ unsigned Index = 0;
+ for (auto &I : Nodes) {
+ if (I.getIndex() == Entry)
+ EntryIndex = Index;
+ G.createNode(I.getBodyContent(), I.getType());
+ NodeMap.insert({I.getIndex(), Index++});
+ }
+ assert(EntryIndex >= 0 && "Expected entry node index to be set.");
+ G.setEntryNode(EntryIndex);
+
+ for (auto &I : NodeMap) {
+ unsigned SourceNode = I.first;
+ unsigned DisplayNode = I.second;
+ getNode(SourceNode).createDisplayEdges(G, DisplayNode, NodeMap);
+ }
+ return G;
+}
+
+void DotCfgDiffNode::createDisplayEdges(
+ DotCfgDiffDisplayGraph &DisplayGraph, unsigned DisplayNodeIndex,
+ std::map<const unsigned, unsigned> &NodeMap) const {
+
+ DisplayNode &SourceDisplayNode = DisplayGraph.getNode(DisplayNodeIndex);
+
+ for (auto I : Edges) {
+ unsigned SinkNodeIndex = I;
+ IRChangeDiffType Type = getEdge(SinkNodeIndex).second;
+ const DotCfgDiffNode *SinkNode = &Graph.getNode(SinkNodeIndex);
+
+ StringRef Label = Graph.getEdgeSourceLabel(getIndex(), SinkNodeIndex);
+ DisplayNode &SinkDisplayNode = DisplayGraph.getNode(SinkNode->getIndex());
+ SourceDisplayNode.createEdge(Label, SinkDisplayNode, Type);
+ }
+ SourceDisplayNode.createEdgeMap();
+}
+
+void DotCfgDiffNode::finalize(DotCfgDiff &G) {
+ for (auto E : EdgesMap) {
+ Children.emplace_back(E.first);
+ Edges.emplace_back(E.first);
+ }
+}
+
+} // namespace
+
+namespace llvm {
+
+template <> struct GraphTraits<DotCfgDiffDisplayGraph *> {
+ using NodeRef = const DisplayNode *;
+ using ChildIteratorType = DisplayNode::ChildIterator;
+ using nodes_iterator = DotCfgDiffDisplayGraph::NodeIterator;
+ using EdgeRef = const DisplayEdge *;
+ using ChildEdgeIterator = DisplayNode::EdgeIterator;
+
+ static NodeRef getEntryNode(const DotCfgDiffDisplayGraph *G) {
+ return G->getEntryNode();
+ }
+ static ChildIteratorType child_begin(NodeRef N) {
+ return N->children_begin();
+ }
+ static ChildIteratorType child_end(NodeRef N) { return N->children_end(); }
+ static nodes_iterator nodes_begin(const DotCfgDiffDisplayGraph *G) {
+ return G->nodes_begin();
+ }
+ static nodes_iterator nodes_end(const DotCfgDiffDisplayGraph *G) {
+ return G->nodes_end();
+ }
+ static ChildEdgeIterator child_edge_begin(NodeRef N) {
+ return N->edges_begin();
+ }
+ static ChildEdgeIterator child_edge_end(NodeRef N) { return N->edges_end(); }
+ static NodeRef edge_dest(EdgeRef E) { return &E->getDestinationNode(); }
+ static unsigned size(const DotCfgDiffDisplayGraph *G) { return G->size(); }
+};
+
+template <>
+struct DOTGraphTraits<DotCfgDiffDisplayGraph *> : public DefaultDOTGraphTraits {
+ explicit DOTGraphTraits(bool Simple = false)
+ : DefaultDOTGraphTraits(Simple) {}
+
+ static bool renderNodesUsingHTML() { return true; }
+ static std::string getGraphName(const DotCfgDiffDisplayGraph *DiffData) {
+ return DiffData->getGraphName();
+ }
+ static std::string
+ getGraphProperties(const DotCfgDiffDisplayGraph *DiffData) {
+ return "\tsize=\"190, 190\";\n";
+ }
+ static std::string getNodeLabel(const DisplayNode *Node,
+ const DotCfgDiffDisplayGraph *DiffData) {
+ return DiffData->getNodeLabel(*Node);
+ }
+ static std::string getNodeAttributes(const DisplayNode *Node,
+ const DotCfgDiffDisplayGraph *DiffData) {
+ return DiffData->getNodeAttributes(*Node);
+ }
+ static std::string getEdgeSourceLabel(const DisplayNode *From,
+ DisplayNode::ChildIterator &To) {
+ return From->getEdgeSourceLabel(**To);
+ }
+ static std::string getEdgeAttributes(const DisplayNode *From,
+ DisplayNode::ChildIterator &To,
+ const DotCfgDiffDisplayGraph *DiffData) {
+ return DiffData->getEdgeColorAttr(*From, **To);
+ }
+};
+
+} // namespace llvm
+
+namespace {
+
+void DotCfgDiffDisplayGraph::generateDotFile(StringRef DotFile) {
+ std::error_code EC;
+ raw_fd_ostream OutStream(DotFile, EC);
+ if (EC) {
+ errs() << "Error: " << EC.message() << "\n";
+ return;
+ }
+ WriteGraph(OutStream, this, false);
+ OutStream.flush();
+ OutStream.close();
+}
+
+} // namespace
+
+namespace llvm {
+
+DCData::DCData(const BasicBlock &B) {
+ // Build up transition labels.
+ const Instruction *Term = B.getTerminator();
+ if (const BranchInst *Br = dyn_cast<const BranchInst>(Term))
+ if (Br->isUnconditional())
+ addSuccessorLabel(Br->getSuccessor(0)->getName().str(), "");
+ else {
+ addSuccessorLabel(Br->getSuccessor(0)->getName().str(), "true");
+ addSuccessorLabel(Br->getSuccessor(1)->getName().str(), "false");
+ }
+ else if (const SwitchInst *Sw = dyn_cast<const SwitchInst>(Term)) {
+ addSuccessorLabel(Sw->case_default()->getCaseSuccessor()->getName().str(),
+ "default");
+ for (auto &C : Sw->cases()) {
+ assert(C.getCaseValue() && "Expected to find case value.");
+ SmallString<20> Value = formatv("{0}", C.getCaseValue()->getSExtValue());
+ addSuccessorLabel(C.getCaseSuccessor()->getName().str(), Value);
+ }
+ } else
+ for (const_succ_iterator I = succ_begin(&B), E = succ_end(&B); I != E; ++I)
+ addSuccessorLabel((*I)->getName().str(), "");
+}
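As a concrete illustration of the labels this constructor produces (block names here are hypothetical), the resulting successor maps for the three terminator kinds look roughly like this:

    #include <cassert>
    #include <map>
    #include <string>

    int main() {
      // Conditional branch, e.g. "br i1 %c, label %if.then, label %if.else":
      // each successor is labelled with the condition value it is taken on.
      std::map<std::string, std::string> CondBr = {{"if.then", "true"},
                                                   {"if.else", "false"}};
      // Switch: the default successor is labelled "default" and each case
      // successor is labelled with its case value.
      std::map<std::string, std::string> Switch = {
          {"sw.default", "default"}, {"sw.bb1", "0"}, {"sw.bb2", "1"}};
      // Unconditional branches and other terminators get an empty label.
      std::map<std::string, std::string> Other = {{"next", ""}};
      assert(CondBr.at("if.then") == "true");
      (void)Switch;
      (void)Other;
    }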
+
+DotCfgChangeReporter::DotCfgChangeReporter(bool Verbose)
+ : ChangeReporter<IRDataT<DCData>>(Verbose) {
+ // Set up the colours based on the hidden options.
+ Colours[InBefore] = BeforeColour;
+ Colours[InAfter] = AfterColour;
+ Colours[IsCommon] = CommonColour;
+}
+
+void DotCfgChangeReporter::handleFunctionCompare(
+ StringRef Name, StringRef Prefix, StringRef PassID, StringRef Divider,
+ bool InModule, unsigned Minor, const FuncDataT<DCData> &Before,
+ const FuncDataT<DCData> &After) {
+ assert(HTML && "Expected outstream to be set");
+ SmallString<8> Extender;
+ SmallString<8> Number;
+ // Handle numbering and file names.
+ if (InModule) {
+ Extender = formatv("{0}_{1}", N, Minor);
+ Number = formatv("{0}.{1}", N, Minor);
+ } else {
+ Extender = formatv("{0}", N);
+ Number = formatv("{0}", N);
+ }
+ // Create a temporary file name for the dot file.
+ SmallVector<char, 128> SV;
+ sys::fs::createUniquePath("cfgdot-%%%%%%.dot", SV, true);
+ std::string DotFile = Twine(SV).str();
+
+ SmallString<20> PDFFileName = formatv("diff_{0}.pdf", Extender);
+ SmallString<200> Text;
+
+ Text = formatv("{0}.{1}{2}{3}{4}", Number, Prefix, makeHTMLReady(PassID),
+ Divider, Name);
+
+ DotCfgDiff Diff(Text, Before, After);
+ std::string EntryBlockName = After.getEntryBlockName();
+ // Use the before entry block if the after entry block was removed.
+ if (EntryBlockName == "")
+ EntryBlockName = Before.getEntryBlockName();
+ assert(EntryBlockName != "" && "Expected to find entry block");
+
+ DotCfgDiffDisplayGraph DG = Diff.createDisplayGraph(Text, EntryBlockName);
+ DG.generateDotFile(DotFile);
+
+ *HTML << genHTML(Text, DotFile, PDFFileName);
+ std::error_code EC = sys::fs::remove(DotFile);
+ if (EC)
+ errs() << "Error: " << EC.message() << "\n";
+}
+
+std::string DotCfgChangeReporter::genHTML(StringRef Text, StringRef DotFile,
+ StringRef PDFFileName) {
+ SmallString<20> PDFFile = formatv("{0}/{1}", DotCfgDir, PDFFileName);
+ // Create the PDF file.
+ static ErrorOr<std::string> DotExe = sys::findProgramByName(DotBinary);
+ if (!DotExe)
+ return "Unable to find dot executable.";
+
+ StringRef Args[] = {DotBinary, "-Tpdf", "-o", PDFFile, DotFile};
+ int Result = sys::ExecuteAndWait(*DotExe, Args, None);
+ if (Result < 0)
+ return "Error executing system dot.";
+
+ // Create the HTML tag referring to the PDF file.
+ SmallString<200> S = formatv(
+ " <a href=\"{0}\" target=\"_blank\">{1}</a><br/>\n", PDFFileName, Text);
+ return S.c_str();
+}
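genHTML shells out to the Graphviz dot tool to render the graph; a standalone sketch of the equivalent command line, using std::system instead of llvm::sys::ExecuteAndWait and hypothetical file names (it assumes dot is installed and on PATH):

    #include <cstdlib>
    #include <string>

    int main() {
      // Equivalent of the Args built above: dot -Tpdf -o <PDFFile> <DotFile>
      const std::string Cmd = "dot -Tpdf -o diff_1.pdf cfgdot-abc123.dot";
      return std::system(Cmd.c_str()) == 0 ? 0 : 1;
    }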
+
+void DotCfgChangeReporter::handleInitialIR(Any IR) {
+ assert(HTML && "Expected outstream to be set");
+ *HTML << "<button type=\"button\" class=\"collapsible\">0. "
+ << "Initial IR (by function)</button>\n"
+ << "<div class=\"content\">\n"
+ << " <p>\n";
+ // Create representation of IR
+ IRDataT<DCData> Data;
+ IRComparer<DCData>::analyzeIR(IR, Data);
+ // Now compare it against itself, which will have everything the
+ // same and will generate the files.
+ IRComparer<DCData>(Data, Data)
+ .compare(getModuleForComparison(IR),
+ [&](bool InModule, unsigned Minor,
+ const FuncDataT<DCData> &Before,
+ const FuncDataT<DCData> &After) -> void {
+ handleFunctionCompare("", " ", "Initial IR", "", InModule,
+ Minor, Before, After);
+ });
+ *HTML << " </p>\n"
+ << "</div><br/>\n";
+ ++N;
+}
+
+void DotCfgChangeReporter::generateIRRepresentation(Any IR, StringRef PassID,
+ IRDataT<DCData> &Data) {
+ IRComparer<DCData>::analyzeIR(IR, Data);
+}
+
+void DotCfgChangeReporter::omitAfter(StringRef PassID, std::string &Name) {
+ assert(HTML && "Expected outstream to be set");
+ SmallString<20> Banner =
+ formatv(" <a>{0}. Pass {1} on {2} omitted because no change</a><br/>\n",
+ N, makeHTMLReady(PassID), Name);
+ *HTML << Banner;
+ ++N;
+}
+
+void DotCfgChangeReporter::handleAfter(StringRef PassID, std::string &Name,
+ const IRDataT<DCData> &Before,
+ const IRDataT<DCData> &After, Any IR) {
+ assert(HTML && "Expected outstream to be set");
+ IRComparer<DCData>(Before, After)
+ .compare(getModuleForComparison(IR),
+ [&](bool InModule, unsigned Minor,
+ const FuncDataT<DCData> &Before,
+ const FuncDataT<DCData> &After) -> void {
+ handleFunctionCompare(Name, " Pass ", PassID, " on ", InModule,
+ Minor, Before, After);
+ });
+ *HTML << " </p></div>\n";
+ ++N;
+}
+
+void DotCfgChangeReporter::handleInvalidated(StringRef PassID) {
+ assert(HTML && "Expected outstream to be set");
+ SmallString<20> Banner =
+ formatv(" <a>{0}. {1} invalidated</a><br/>\n", N, makeHTMLReady(PassID));
+ *HTML << Banner;
+ ++N;
+}
+
+void DotCfgChangeReporter::handleFiltered(StringRef PassID, std::string &Name) {
+ assert(HTML && "Expected outstream to be set");
+ SmallString<20> Banner =
+ formatv(" <a>{0}. Pass {1} on {2} filtered out</a><br/>\n", N,
+ makeHTMLReady(PassID), Name);
+ *HTML << Banner;
+ ++N;
+}
+
+void DotCfgChangeReporter::handleIgnored(StringRef PassID, std::string &Name) {
+ assert(HTML && "Expected outstream to be set");
+ SmallString<20> Banner = formatv(" <a>{0}. {1} on {2} ignored</a><br/>\n", N,
+ makeHTMLReady(PassID), Name);
+ *HTML << Banner;
+ ++N;
+}
+
+bool DotCfgChangeReporter::initializeHTML() {
+ std::error_code EC;
+ HTML = std::make_unique<raw_fd_ostream>(DotCfgDir + "/passes.html", EC);
+ if (EC) {
+ HTML = nullptr;
+ return false;
+ }
+
+ *HTML << "<!doctype html>"
+ << "<html>"
+ << "<head>"
+ << "<style>.collapsible { "
+ << "background-color: #777;"
+ << " color: white;"
+ << " cursor: pointer;"
+ << " padding: 18px;"
+ << " width: 100%;"
+ << " border: none;"
+ << " text-align: left;"
+ << " outline: none;"
+ << " font-size: 15px;"
+ << "} .active, .collapsible:hover {"
+ << " background-color: #555;"
+ << "} .content {"
+ << " padding: 0 18px;"
+ << " display: none;"
+ << " overflow: hidden;"
+ << " background-color: #f1f1f1;"
+ << "}"
+ << "</style>"
+ << "<title>passes.html</title>"
+ << "</head>\n"
+ << "<body>";
+ return true;
+}
+
+DotCfgChangeReporter::~DotCfgChangeReporter() {
+ if (!HTML)
+ return;
+ *HTML
+ << "<script>var coll = document.getElementsByClassName(\"collapsible\");"
+ << "var i;"
+ << "for (i = 0; i < coll.length; i++) {"
+ << "coll[i].addEventListener(\"click\", function() {"
+ << " this.classList.toggle(\"active\");"
+ << " var content = this.nextElementSibling;"
+ << " if (content.style.display === \"block\"){"
+ << " content.style.display = \"none\";"
+ << " }"
+ << " else {"
+ << " content.style.display= \"block\";"
+ << " }"
+ << " });"
+ << " }"
+ << "</script>"
+ << "</body>"
+ << "</html>\n";
+ HTML->flush();
+ HTML->close();
+}
+
+void DotCfgChangeReporter::registerCallbacks(
+ PassInstrumentationCallbacks &PIC) {
+ if ((PrintChanged == ChangePrinter::PrintChangedDotCfgVerbose ||
+ PrintChanged == ChangePrinter::PrintChangedDotCfgQuiet)) {
+ SmallString<128> OutputDir;
+ sys::fs::expand_tilde(DotCfgDir, OutputDir);
+ sys::fs::make_absolute(OutputDir);
+ assert(!OutputDir.empty() && "expected output dir to be non-empty");
+ DotCfgDir = OutputDir.c_str();
+ if (initializeHTML()) {
+ ChangeReporter<IRDataT<DCData>>::registerRequiredCallbacks(PIC);
+ return;
+ }
+ dbgs() << "Unable to open output stream for -cfg-dot-changed\n";
+ }
}
StandardInstrumentations::StandardInstrumentations(
@@ -1222,6 +2125,8 @@ StandardInstrumentations::StandardInstrumentations(
PrintChanged == ChangePrinter::PrintChangedColourDiffVerbose,
PrintChanged == ChangePrinter::PrintChangedColourDiffVerbose ||
PrintChanged == ChangePrinter::PrintChangedColourDiffQuiet),
+ WebsiteChangeReporter(PrintChanged ==
+ ChangePrinter::PrintChangedDotCfgVerbose),
Verify(DebugLogging), VerifyEach(VerifyEach) {}
void StandardInstrumentations::registerCallbacks(
@@ -1238,14 +2143,17 @@ void StandardInstrumentations::registerCallbacks(
if (VerifyEach)
Verify.registerCallbacks(PIC);
PrintChangedDiff.registerCallbacks(PIC);
+ WebsiteChangeReporter.registerCallbacks(PIC);
}
-namespace llvm {
-
template class ChangeReporter<std::string>;
template class TextChangeReporter<std::string>;
-template class ChangeReporter<ChangedIRData>;
-template class TextChangeReporter<ChangedIRData>;
+template class BlockDataT<EmptyData>;
+template class FuncDataT<EmptyData>;
+template class IRDataT<EmptyData>;
+template class ChangeReporter<IRDataT<EmptyData>>;
+template class TextChangeReporter<IRDataT<EmptyData>>;
+template class IRComparer<EmptyData>;
} // namespace llvm
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
index 9fffb249e72d..94bd4807041d 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
@@ -567,7 +567,8 @@ class VersionedCovMapFuncRecordReader : public CovMapFuncRecordReader {
if (Error Err = CFR->template getFuncName<Endian>(ProfileNames, FuncName))
return Err;
if (FuncName.empty())
- return make_error<InstrProfError>(instrprof_error::malformed);
+ return make_error<InstrProfError>(instrprof_error::malformed,
+ "function name is empty");
++CovMapNumUsedRecords;
Records.emplace_back(Version, FuncName, FuncHash, Mapping,
FileRange.StartingIndex, FileRange.Length);
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index a83b56ed67f1..1168ad27fe52 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -74,53 +74,82 @@ static cl::opt<unsigned> StaticFuncStripDirNamePrefix(
cl::desc("Strip specified level of directory name from source path in "
"the profile counter name for static functions."));
-static std::string getInstrProfErrString(instrprof_error Err) {
+static std::string getInstrProfErrString(instrprof_error Err,
+ const std::string &ErrMsg = "") {
+ std::string Msg;
+ raw_string_ostream OS(Msg);
+
switch (Err) {
case instrprof_error::success:
- return "success";
+ OS << "success";
+ break;
case instrprof_error::eof:
- return "end of File";
+ OS << "end of File";
+ break;
case instrprof_error::unrecognized_format:
- return "unrecognized instrumentation profile encoding format";
+ OS << "unrecognized instrumentation profile encoding format";
+ break;
case instrprof_error::bad_magic:
- return "invalid instrumentation profile data (bad magic)";
+ OS << "invalid instrumentation profile data (bad magic)";
+ break;
case instrprof_error::bad_header:
- return "invalid instrumentation profile data (file header is corrupt)";
+ OS << "invalid instrumentation profile data (file header is corrupt)";
+ break;
case instrprof_error::unsupported_version:
- return "unsupported instrumentation profile format version";
+ OS << "unsupported instrumentation profile format version";
+ break;
case instrprof_error::unsupported_hash_type:
- return "unsupported instrumentation profile hash type";
+ OS << "unsupported instrumentation profile hash type";
+ break;
case instrprof_error::too_large:
- return "too much profile data";
+ OS << "too much profile data";
+ break;
case instrprof_error::truncated:
- return "truncated profile data";
+ OS << "truncated profile data";
+ break;
case instrprof_error::malformed:
- return "malformed instrumentation profile data";
+ OS << "malformed instrumentation profile data";
+ break;
case instrprof_error::invalid_prof:
- return "invalid profile created. Please file a bug "
- "at: " BUG_REPORT_URL
- " and include the profraw files that caused this error.";
+ OS << "invalid profile created. Please file a bug "
+ "at: " BUG_REPORT_URL
+ " and include the profraw files that caused this error.";
+ break;
case instrprof_error::unknown_function:
- return "no profile data available for function";
+ OS << "no profile data available for function";
+ break;
case instrprof_error::hash_mismatch:
- return "function control flow change detected (hash mismatch)";
+ OS << "function control flow change detected (hash mismatch)";
+ break;
case instrprof_error::count_mismatch:
- return "function basic block count change detected (counter mismatch)";
+ OS << "function basic block count change detected (counter mismatch)";
+ break;
case instrprof_error::counter_overflow:
- return "counter overflow";
+ OS << "counter overflow";
+ break;
case instrprof_error::value_site_count_mismatch:
- return "function value site count change detected (counter mismatch)";
+ OS << "function value site count change detected (counter mismatch)";
+ break;
case instrprof_error::compress_failed:
- return "failed to compress data (zlib)";
+ OS << "failed to compress data (zlib)";
+ break;
case instrprof_error::uncompress_failed:
- return "failed to uncompress data (zlib)";
+ OS << "failed to uncompress data (zlib)";
+ break;
case instrprof_error::empty_raw_profile:
- return "empty raw profile file";
+ OS << "empty raw profile file";
+ break;
case instrprof_error::zlib_unavailable:
- return "profile uses zlib compression but the profile reader was built "
- "without zlib support";
+ OS << "profile uses zlib compression but the profile reader was built "
+ "without zlib support";
+ break;
}
- llvm_unreachable("A value of instrprof_error has no message.");
+
+ // If optional error message is not empty, append it to the message.
+ if (!ErrMsg.empty())
+ OS << ": " << ErrMsg;
+
+ return OS.str();
}
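The effect of the new optional argument is simply that the detail text is appended after the generic enum message; a minimal standalone sketch of the same pattern (std::ostringstream standing in for raw_string_ostream, with the message text taken from the malformed case):

    #include <cassert>
    #include <sstream>
    #include <string>

    static std::string describeMalformed(const std::string &ErrMsg = "") {
      std::ostringstream OS;
      OS << "malformed instrumentation profile data";
      if (!ErrMsg.empty())
        OS << ": " << ErrMsg; // optional detail appended to the generic text
      return OS.str();
    }

    int main() {
      assert(describeMalformed() == "malformed instrumentation profile data");
      assert(describeMalformed("number of counters is zero") ==
             "malformed instrumentation profile data: "
             "number of counters is zero");
    }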
namespace {
@@ -217,7 +246,7 @@ void SoftInstrProfErrors::addError(instrprof_error IE) {
}
std::string InstrProfError::message() const {
- return getInstrProfErrString(Err);
+ return getInstrProfErrString(Err, Msg);
}
char InstrProfError::ID = 0;
@@ -878,18 +907,23 @@ static std::unique_ptr<ValueProfData> allocValueProfData(uint32_t TotalSize) {
Error ValueProfData::checkIntegrity() {
if (NumValueKinds > IPVK_Last + 1)
- return make_error<InstrProfError>(instrprof_error::malformed);
- // Total size needs to be mulltiple of quadword size.
+ return make_error<InstrProfError>(
+ instrprof_error::malformed, "number of value profile kinds is invalid");
+ // Total size needs to be a multiple of quadword size.
if (TotalSize % sizeof(uint64_t))
- return make_error<InstrProfError>(instrprof_error::malformed);
+ return make_error<InstrProfError>(
+ instrprof_error::malformed, "total size is not multiples of quardword");
ValueProfRecord *VR = getFirstValueProfRecord(this);
for (uint32_t K = 0; K < this->NumValueKinds; K++) {
if (VR->Kind > IPVK_Last)
- return make_error<InstrProfError>(instrprof_error::malformed);
+ return make_error<InstrProfError>(instrprof_error::malformed,
+ "value kind is invalid");
VR = getValueProfRecordNext(VR);
if ((char *)VR - (char *)this > (ptrdiff_t)TotalSize)
- return make_error<InstrProfError>(instrprof_error::malformed);
+ return make_error<InstrProfError>(
+ instrprof_error::malformed,
+ "value profile address is greater than total size");
}
return Error::success();
}
@@ -1098,10 +1132,14 @@ bool needsComdatForCounter(const Function &F, const Module &M) {
bool isIRPGOFlagSet(const Module *M) {
auto IRInstrVar =
M->getNamedGlobal(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR));
- if (!IRInstrVar || IRInstrVar->isDeclaration() ||
- IRInstrVar->hasLocalLinkage())
+ if (!IRInstrVar || IRInstrVar->hasLocalLinkage())
return false;
+ // For CSPGO+LTO, this variable might be marked as non-prevailing and we only
+ // have the decl.
+ if (IRInstrVar->isDeclaration())
+ return true;
+
// Check if the flag is set.
if (!IRInstrVar->hasInitializer())
return false;
@@ -1137,8 +1175,8 @@ bool canRenameComdatFunc(const Function &F, bool CheckAddressTaken) {
// Create a COMDAT variable INSTR_PROF_RAW_VERSION_VAR to make the runtime
// aware this is an ir_level profile so it can set the version flag.
-void createIRLevelProfileFlagVar(Module &M, bool IsCS,
- bool InstrEntryBBEnabled) {
+GlobalVariable *createIRLevelProfileFlagVar(Module &M, bool IsCS,
+ bool InstrEntryBBEnabled) {
const StringRef VarName(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR));
Type *IntTy64 = Type::getInt64Ty(M.getContext());
uint64_t ProfileVersion = (INSTR_PROF_RAW_VERSION | VARIANT_MASK_IR_PROF);
@@ -1155,6 +1193,7 @@ void createIRLevelProfileFlagVar(Module &M, bool IsCS,
IRLevelVersionVariable->setLinkage(GlobalValue::ExternalLinkage);
IRLevelVersionVariable->setComdat(M.getOrInsertComdat(VarName));
}
+ return IRLevelVersionVariable;
}
// Create the variable for the profile file name.
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index 8a4470ae207d..b4e8025dbef9 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -204,13 +204,15 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) {
return success();
}
if (NumValueKinds == 0 || NumValueKinds > IPVK_Last + 1)
- return error(instrprof_error::malformed);
+ return error(instrprof_error::malformed,
+ "number of value kinds is invalid");
Line++;
for (uint32_t VK = 0; VK < NumValueKinds; VK++) {
VP_READ_ADVANCE(ValueKind);
if (ValueKind > IPVK_Last)
- return error(instrprof_error::malformed);
+ return error(instrprof_error::malformed, "value kind is invalid");
VP_READ_ADVANCE(NumValueSites);
if (!NumValueSites)
continue;
@@ -268,16 +270,18 @@ Error TextInstrProfReader::readNextRecord(NamedInstrProfRecord &Record) {
if (Line.is_at_end())
return error(instrprof_error::truncated);
if ((Line++)->getAsInteger(0, Record.Hash))
- return error(instrprof_error::malformed);
+ return error(instrprof_error::malformed,
+ "function hash is not a valid integer");
// Read the number of counters.
uint64_t NumCounters;
if (Line.is_at_end())
return error(instrprof_error::truncated);
if ((Line++)->getAsInteger(10, NumCounters))
- return error(instrprof_error::malformed);
+ return error(instrprof_error::malformed,
+ "number of counters is not a valid integer");
if (NumCounters == 0)
- return error(instrprof_error::malformed);
+ return error(instrprof_error::malformed, "number of counters is zero");
// Read each counter and fill our internal storage with the values.
Record.Clear();
@@ -287,7 +291,7 @@ Error TextInstrProfReader::readNextRecord(NamedInstrProfRecord &Record) {
return error(instrprof_error::truncated);
uint64_t Count;
if ((Line++)->getAsInteger(10, Count))
- return error(instrprof_error::malformed);
+ return error(instrprof_error::malformed, "count is invalid");
Record.Counts.push_back(Count);
}
@@ -332,10 +336,12 @@ Error RawInstrProfReader<IntPtrT>::readNextHeader(const char *CurrentPos) {
// If there isn't enough space for another header, this is probably just
// garbage at the end of the file.
if (CurrentPos + sizeof(RawInstrProf::Header) > End)
- return make_error<InstrProfError>(instrprof_error::malformed);
+ return make_error<InstrProfError>(instrprof_error::malformed,
+ "not enough space for another header");
// The writer ensures each profile is padded to start at an aligned address.
if (reinterpret_cast<size_t>(CurrentPos) % alignof(uint64_t))
- return make_error<InstrProfError>(instrprof_error::malformed);
+ return make_error<InstrProfError>(instrprof_error::malformed,
+ "insufficient padding");
// The magic should have the same byte order as in the previous header.
uint64_t Magic = *reinterpret_cast<const uint64_t *>(CurrentPos);
if (Magic != swap(RawInstrProf::getMagic<IntPtrT>()))
@@ -366,6 +372,10 @@ Error RawInstrProfReader<IntPtrT>::readHeader(
if (GET_VERSION(Version) != RawInstrProf::Version)
return error(instrprof_error::unsupported_version);
+ BinaryIdsSize = swap(Header.BinaryIdsSize);
+ if (BinaryIdsSize % sizeof(uint64_t))
+ return error(instrprof_error::bad_header);
+
CountersDelta = swap(Header.CountersDelta);
NamesDelta = swap(Header.NamesDelta);
auto DataSize = swap(Header.DataSize);
@@ -374,7 +384,6 @@ Error RawInstrProfReader<IntPtrT>::readHeader(
auto PaddingBytesAfterCounters = swap(Header.PaddingBytesAfterCounters);
NamesSize = swap(Header.NamesSize);
ValueKindLast = swap(Header.ValueKindLast);
- BinaryIdsSize = swap(Header.BinaryIdsSize);
auto DataSizeInBytes = DataSize * sizeof(RawInstrProf::ProfileData<IntPtrT>);
auto PaddingSize = getNumPaddingBytes(NamesSize);
@@ -402,6 +411,10 @@ Error RawInstrProfReader<IntPtrT>::readHeader(
NamesStart = Start + NamesOffset;
ValueDataStart = reinterpret_cast<const uint8_t *>(Start + ValueDataOffset);
+ const uint8_t *BufferEnd = (const uint8_t *)DataBuffer->getBufferEnd();
+ if (BinaryIdsStart + BinaryIdsSize > BufferEnd)
+ return error(instrprof_error::bad_header);
+
std::unique_ptr<InstrProfSymtab> NewSymtab = std::make_unique<InstrProfSymtab>();
if (Error E = createSymtab(*NewSymtab.get()))
return E;
@@ -426,21 +439,46 @@ template <class IntPtrT>
Error RawInstrProfReader<IntPtrT>::readRawCounts(
InstrProfRecord &Record) {
uint32_t NumCounters = swap(Data->NumCounters);
- IntPtrT CounterPtr = Data->CounterPtr;
if (NumCounters == 0)
- return error(instrprof_error::malformed);
+ return error(instrprof_error::malformed, "number of counters is zero");
+ IntPtrT CounterPtr = Data->CounterPtr;
auto *NamesStartAsCounter = reinterpret_cast<const uint64_t *>(NamesStart);
ptrdiff_t MaxNumCounters = NamesStartAsCounter - CountersStart;
// Check bounds. Note that the counter pointer embedded in the data record
// may itself be corrupt.
if (MaxNumCounters < 0 || NumCounters > (uint32_t)MaxNumCounters)
- return error(instrprof_error::malformed);
+ return error(instrprof_error::malformed,
+ "counter pointer is out of bounds");
+
+ // We need to compute the in-buffer counter offset from the in-memory address
+ // distance. The initial CountersDelta is the in-memory address difference
+ // start(__llvm_prf_cnts)-start(__llvm_prf_data), so SrcData->CounterPtr -
+ // CountersDelta computes the offset into the in-buffer counter section.
+ //
+ // CountersDelta decreases as we advance to the next data record.
ptrdiff_t CounterOffset = getCounterOffset(CounterPtr);
- if (CounterOffset < 0 || CounterOffset > MaxNumCounters ||
- ((uint32_t)CounterOffset + NumCounters) > (uint32_t)MaxNumCounters)
- return error(instrprof_error::malformed);
+ CountersDelta -= sizeof(*Data);
+ if (CounterOffset < 0)
+ return error(
+ instrprof_error::malformed,
+ ("counter offset " + Twine(CounterOffset) + " is negative").str());
+
+ if (CounterOffset > MaxNumCounters)
+ return error(instrprof_error::malformed,
+ ("counter offset " + Twine(CounterOffset) +
+ " is greater than the maximum number of counters " +
+ Twine((uint32_t)MaxNumCounters))
+ .str());
+
+ if (((uint32_t)CounterOffset + NumCounters) > (uint32_t)MaxNumCounters)
+ return error(instrprof_error::malformed,
+ ("number of counters " +
+ Twine(((uint32_t)CounterOffset + NumCounters)) +
+ " is greater than the maximum number of counters " +
+ Twine((uint32_t)MaxNumCounters))
+ .str());
auto RawCounts = makeArrayRef(getCounter(CounterOffset), NumCounters);
@@ -512,6 +550,10 @@ Error RawInstrProfReader<IntPtrT>::readNextRecord(NamedInstrProfRecord &Record)
return success();
}
+static size_t RoundUp(size_t size, size_t align) {
+ return (size + align - 1) & ~(align - 1);
+}
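RoundUp is the standard power-of-two alignment idiom, used below to skip the zero padding that follows each build ID; a quick check of its behaviour with illustrative sizes:

    #include <cassert>
    #include <cstddef>

    static size_t RoundUp(size_t size, size_t align) {
      // Only valid when align is a power of two, which holds for
      // sizeof(uint64_t).
      return (size + align - 1) & ~(align - 1);
    }

    int main() {
      assert(RoundUp(0, 8) == 0);
      assert(RoundUp(1, 8) == 8);
      assert(RoundUp(8, 8) == 8);
      assert(RoundUp(20, 8) == 24); // a 20-byte build ID is padded to 24 bytes
    }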
+
template <class IntPtrT>
Error RawInstrProfReader<IntPtrT>::printBinaryIds(raw_ostream &OS) {
if (BinaryIdsSize == 0)
@@ -519,19 +561,38 @@ Error RawInstrProfReader<IntPtrT>::printBinaryIds(raw_ostream &OS) {
OS << "Binary IDs: \n";
const uint8_t *BI = BinaryIdsStart;
- while (BI < BinaryIdsStart + BinaryIdsSize) {
+ const uint8_t *BIEnd = BinaryIdsStart + BinaryIdsSize;
+ while (BI < BIEnd) {
+ size_t Remaining = BIEnd - BI;
+
+ // There should be enough left to read the binary ID size field.
+ if (Remaining < sizeof(uint64_t))
+ return make_error<InstrProfError>(
+ instrprof_error::malformed,
+ "not enough data to read binary id length");
+
uint64_t BinaryIdLen = swap(*reinterpret_cast<const uint64_t *>(BI));
+
+ // There should be enough left to read the binary ID size field, and the
+ // binary ID.
+ if (Remaining < sizeof(BinaryIdLen) + BinaryIdLen)
+ return make_error<InstrProfError>(
+ instrprof_error::malformed, "not enough data to read binary id data");
+
// Increment by binary id length data type size.
BI += sizeof(BinaryIdLen);
if (BI > (const uint8_t *)DataBuffer->getBufferEnd())
- return make_error<InstrProfError>(instrprof_error::malformed);
+ return make_error<InstrProfError>(
+ instrprof_error::malformed,
+ "binary id that is read is bigger than buffer size");
for (uint64_t I = 0; I < BinaryIdLen; I++)
OS << format("%02x", BI[I]);
OS << "\n";
- // Increment by binary id data length.
- BI += BinaryIdLen;
+ // Increment by binary id data length, rounded to the next 8 bytes. This
+ // accounts for the zero-padding after each build ID.
+ BI += RoundUp(BinaryIdLen, sizeof(uint64_t));
if (BI > (const uint8_t *)DataBuffer->getBufferEnd())
return make_error<InstrProfError>(instrprof_error::malformed);
}
@@ -624,7 +685,8 @@ Error InstrProfReaderIndex<HashTableImpl>::getRecords(
Data = (*Iter);
if (Data.empty())
- return make_error<InstrProfError>(instrprof_error::malformed);
+ return make_error<InstrProfError>(instrprof_error::malformed,
+ "profile data is empty");
return Error::success();
}
@@ -638,7 +700,8 @@ Error InstrProfReaderIndex<HashTableImpl>::getRecords(
Data = *RecordIterator;
if (Data.empty())
- return make_error<InstrProfError>(instrprof_error::malformed);
+ return make_error<InstrProfError>(instrprof_error::malformed,
+ "profile data is empty");
return Error::success();
}
@@ -669,7 +732,7 @@ public:
return Underlying.getRecords(FuncName, Data);
}
};
-}
+} // namespace
/// A remapper that applies remappings based on a symbol remapping file.
template <typename HashTableImpl>
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index 987c0b175d3c..492e3541cb5a 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -215,8 +215,7 @@ void InstrProfWriter::overlapRecord(NamedInstrProfRecord &&Other,
InstrProfRecord &Dest = Where->second;
uint64_t ValueCutoff = FuncFilter.ValueCutoff;
- if (!FuncFilter.NameFilter.empty() &&
- Name.find(FuncFilter.NameFilter) != Name.npos)
+ if (!FuncFilter.NameFilter.empty() && Name.contains(FuncFilter.NameFilter))
ValueCutoff = 0;
Dest.overlap(Other, Overlap, FuncLevelOverlap, ValueCutoff);
@@ -272,7 +271,7 @@ static void setSummary(IndexedInstrProf::Summary *TheSummary,
ProfileSummary &PS) {
using namespace IndexedInstrProf;
- std::vector<ProfileSummaryEntry> &Res = PS.getDetailedSummary();
+ const std::vector<ProfileSummaryEntry> &Res = PS.getDetailedSummary();
TheSummary->NumSummaryFields = Summary::NumKinds;
TheSummary->NumCutoffEntries = Res.size();
TheSummary->set(Summary::MaxFunctionCount, PS.getMaxFunctionCount());
diff --git a/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp b/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp
index 2ab0f0cbc17a..f54df7b295e3 100644
--- a/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp
+++ b/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp
@@ -80,7 +80,7 @@ const ArrayRef<uint32_t> ProfileSummaryBuilder::DefaultCutoffs =
DefaultCutoffsData;
const ProfileSummaryEntry &
-ProfileSummaryBuilder::getEntryForPercentile(SummaryEntryVector &DS,
+ProfileSummaryBuilder::getEntryForPercentile(const SummaryEntryVector &DS,
uint64_t Percentile) {
auto It = partition_point(DS, [=](const ProfileSummaryEntry &Entry) {
return Entry.Cutoff < Percentile;
@@ -154,7 +154,8 @@ void ProfileSummaryBuilder::computeDetailedSummary() {
}
}
-uint64_t ProfileSummaryBuilder::getHotCountThreshold(SummaryEntryVector &DS) {
+uint64_t
+ProfileSummaryBuilder::getHotCountThreshold(const SummaryEntryVector &DS) {
auto &HotEntry =
ProfileSummaryBuilder::getEntryForPercentile(DS, ProfileSummaryCutoffHot);
uint64_t HotCountThreshold = HotEntry.MinCount;
@@ -163,7 +164,8 @@ uint64_t ProfileSummaryBuilder::getHotCountThreshold(SummaryEntryVector &DS) {
return HotCountThreshold;
}
-uint64_t ProfileSummaryBuilder::getColdCountThreshold(SummaryEntryVector &DS) {
+uint64_t
+ProfileSummaryBuilder::getColdCountThreshold(const SummaryEntryVector &DS) {
auto &ColdEntry = ProfileSummaryBuilder::getEntryForPercentile(
DS, ProfileSummaryCutoffCold);
uint64_t ColdCountThreshold = ColdEntry.MinCount;
@@ -181,17 +183,17 @@ std::unique_ptr<ProfileSummary> SampleProfileSummaryBuilder::getSummary() {
std::unique_ptr<ProfileSummary>
SampleProfileSummaryBuilder::computeSummaryForProfiles(
- const StringMap<sampleprof::FunctionSamples> &Profiles) {
+ const SampleProfileMap &Profiles) {
assert(NumFunctions == 0 &&
"This can only be called on an empty summary builder");
- StringMap<sampleprof::FunctionSamples> ContextLessProfiles;
- const StringMap<sampleprof::FunctionSamples> *ProfilesToUse = &Profiles;
+ sampleprof::SampleProfileMap ContextLessProfiles;
+ const sampleprof::SampleProfileMap *ProfilesToUse = &Profiles;
// For CSSPGO, context-sensitive profile effectively split a function profile
// into many copies each representing the CFG profile of a particular calling
// context. That makes the count distribution looks more flat as we now have
// more function profiles each with lower counts, which in turn leads to lower
- // hot thresholds. To compensate for that, by defauly we merge context
- // profiles before coumputing profile summary.
+ // hot thresholds. To compensate for that, by default we merge context
+ // profiles before computing profile summary.
if (UseContextLessSummary || (sampleprof::FunctionSamples::ProfileIsCS &&
!UseContextLessSummary.getNumOccurrences())) {
for (const auto &I : Profiles) {
diff --git a/llvm/lib/ProfileData/SampleProf.cpp b/llvm/lib/ProfileData/SampleProf.cpp
index 60e707b146d5..fd8fd3b675b7 100644
--- a/llvm/lib/ProfileData/SampleProf.cpp
+++ b/llvm/lib/ProfileData/SampleProf.cpp
@@ -198,6 +198,21 @@ raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS,
return OS;
}
+void sampleprof::sortFuncProfiles(
+ const SampleProfileMap &ProfileMap,
+ std::vector<NameFunctionSamples> &SortedProfiles) {
+ for (const auto &I : ProfileMap) {
+ assert(I.first == I.second.getContext() && "Inconsistent profile map");
+ SortedProfiles.push_back(std::make_pair(I.second.getContext(), &I.second));
+ }
+ llvm::stable_sort(SortedProfiles, [](const NameFunctionSamples &A,
+ const NameFunctionSamples &B) {
+ if (A.second->getTotalSamples() == B.second->getTotalSamples())
+ return A.first < B.first;
+ return A.second->getTotalSamples() > B.second->getTotalSamples();
+ });
+}
+
unsigned FunctionSamples::getOffset(const DILocation *DIL) {
return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
0xffff;
@@ -230,9 +245,13 @@ const FunctionSamples *FunctionSamples::findFunctionSamples(
else
Discriminator = DIL->getBaseDiscriminator();
+ // Use C++ linkage name if possible.
+ StringRef Name = PrevDIL->getScope()->getSubprogram()->getLinkageName();
+ if (Name.empty())
+ Name = PrevDIL->getScope()->getSubprogram()->getName();
+
S.push_back(
- std::make_pair(LineLocation(getOffset(DIL), Discriminator),
- PrevDIL->getScope()->getSubprogram()->getLinkageName()));
+ std::make_pair(LineLocation(getOffset(DIL), Discriminator), Name));
PrevDIL = DIL;
}
if (S.size() == 0)
@@ -245,7 +264,7 @@ const FunctionSamples *FunctionSamples::findFunctionSamples(
}
void FunctionSamples::findAllNames(DenseSet<StringRef> &NameSet) const {
- NameSet.insert(Name);
+ NameSet.insert(getName());
for (const auto &BS : BodySamples)
for (const auto &TS : BS.second.getCallTargets())
NameSet.insert(TS.getKey());
@@ -316,7 +335,7 @@ std::error_code ProfileSymbolList::read(const uint8_t *Data,
void SampleContextTrimmer::trimAndMergeColdContextProfiles(
uint64_t ColdCountThreshold, bool TrimColdContext, bool MergeColdContext,
- uint32_t ColdContextFrameLength) {
+ uint32_t ColdContextFrameLength, bool TrimBaseProfileOnly) {
if (!TrimColdContext && !MergeColdContext)
return;
@@ -324,25 +343,32 @@ void SampleContextTrimmer::trimAndMergeColdContextProfiles(
if (ColdCountThreshold == 0)
return;
+  // Trimming base profiles only is mainly to honor the preinliner decision.
+  // When MergeColdContext is true, the preinliner decision is not honored
+  // anyway, so turn off TrimBaseProfileOnly.
+ if (MergeColdContext)
+ TrimBaseProfileOnly = false;
+
// Filter the cold profiles from ProfileMap and move them into a tmp
// container
- std::vector<std::pair<StringRef, const FunctionSamples *>> ColdProfiles;
+ std::vector<std::pair<SampleContext, const FunctionSamples *>> ColdProfiles;
for (const auto &I : ProfileMap) {
+ const SampleContext &Context = I.first;
const FunctionSamples &FunctionProfile = I.second;
- if (FunctionProfile.getTotalSamples() >= ColdCountThreshold)
- continue;
- ColdProfiles.emplace_back(I.getKey(), &I.second);
+ if (FunctionProfile.getTotalSamples() < ColdCountThreshold &&
+ (!TrimBaseProfileOnly || Context.isBaseContext()))
+ ColdProfiles.emplace_back(Context, &I.second);
}
// Remove the cold profile from ProfileMap and merge them into
// MergedProfileMap by the last K frames of context
- StringMap<FunctionSamples> MergedProfileMap;
+ SampleProfileMap MergedProfileMap;
for (const auto &I : ColdProfiles) {
if (MergeColdContext) {
- auto Ret = MergedProfileMap.try_emplace(
- I.second->getContext().getContextWithLastKFrames(
- ColdContextFrameLength),
- FunctionSamples());
+ auto MergedContext = I.second->getContext().getContextFrames();
+ if (ColdContextFrameLength < MergedContext.size())
+ MergedContext = MergedContext.take_back(ColdContextFrameLength);
+ auto Ret = MergedProfileMap.emplace(MergedContext, FunctionSamples());
FunctionSamples &MergedProfile = Ret.first->second;
MergedProfile.merge(*I.second);
}
@@ -353,16 +379,15 @@ void SampleContextTrimmer::trimAndMergeColdContextProfiles(
for (const auto &I : MergedProfileMap) {
// Filter the cold merged profile
if (TrimColdContext && I.second.getTotalSamples() < ColdCountThreshold &&
- ProfileMap.find(I.getKey()) == ProfileMap.end())
+ ProfileMap.find(I.first) == ProfileMap.end())
continue;
// Merge the profile if the original profile exists, otherwise just insert
// as a new profile
- auto Ret = ProfileMap.try_emplace(I.getKey(), FunctionSamples());
+ auto Ret = ProfileMap.emplace(I.first, FunctionSamples());
if (Ret.second) {
- SampleContext FContext(Ret.first->first(), RawContext);
+ SampleContext FContext(Ret.first->first, RawContext);
FunctionSamples &FProfile = Ret.first->second;
FProfile.setContext(FContext);
- FProfile.setName(FContext.getNameWithoutContext());
}
FunctionSamples &OrigProfile = Ret.first->second;
OrigProfile.merge(I.second);
@@ -370,12 +395,12 @@ void SampleContextTrimmer::trimAndMergeColdContextProfiles(
}
void SampleContextTrimmer::canonicalizeContextProfiles() {
- std::vector<StringRef> ProfilesToBeRemoved;
- StringMap<FunctionSamples> ProfilesToBeAdded;
+ std::vector<SampleContext> ProfilesToBeRemoved;
+ SampleProfileMap ProfilesToBeAdded;
for (auto &I : ProfileMap) {
FunctionSamples &FProfile = I.second;
- StringRef ContextStr = FProfile.getNameWithContext();
- if (I.first() == ContextStr)
+ SampleContext &Context = FProfile.getContext();
+ if (I.first == Context)
continue;
// Use the context string from FunctionSamples to update the keys of
@@ -390,10 +415,10 @@ void SampleContextTrimmer::canonicalizeContextProfiles() {
// with different profiles) from the map can cause a conflict if they are
// not handled in a right order. This can be solved by just caching the
// profiles to be added.
- auto Ret = ProfilesToBeAdded.try_emplace(ContextStr, FProfile);
+ auto Ret = ProfilesToBeAdded.emplace(Context, FProfile);
(void)Ret;
assert(Ret.second && "Context conflict during canonicalization");
- ProfilesToBeRemoved.push_back(I.first());
+ ProfilesToBeRemoved.push_back(I.first);
}
for (auto &I : ProfilesToBeRemoved) {
@@ -401,7 +426,7 @@ void SampleContextTrimmer::canonicalizeContextProfiles() {
}
for (auto &I : ProfilesToBeAdded) {
- ProfileMap.try_emplace(I.first(), I.second);
+ ProfileMap.emplace(I.first, I.second);
}
}
diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp
index 6058eddb13dc..c99a19020511 100644
--- a/llvm/lib/ProfileData/SampleProfReader.cpp
+++ b/llvm/lib/ProfileData/SampleProfReader.cpp
@@ -53,21 +53,23 @@ using namespace sampleprof;
// For ext-binary format profiles, the flag is set in the summary.
static cl::opt<bool> ProfileIsFSDisciminator(
"profile-isfs", cl::Hidden, cl::init(false),
- cl::desc("Profile uses flow senstive discriminators"));
+ cl::desc("Profile uses flow sensitive discriminators"));
/// Dump the function profile for \p FName.
///
-/// \param FName Name of the function to print.
+/// \param FContext Name + context of the function to print.
/// \param OS Stream to emit the output to.
-void SampleProfileReader::dumpFunctionProfile(StringRef FName,
+void SampleProfileReader::dumpFunctionProfile(SampleContext FContext,
raw_ostream &OS) {
- OS << "Function: " << FName << ": " << Profiles[FName];
+ OS << "Function: " << FContext.toString() << ": " << Profiles[FContext];
}
/// Dump all the function profiles found on stream \p OS.
void SampleProfileReader::dump(raw_ostream &OS) {
- for (const auto &I : Profiles)
- dumpFunctionProfile(I.getKey(), OS);
+ std::vector<NameFunctionSamples> V;
+ sortFuncProfiles(Profiles, V);
+ for (const auto &I : V)
+ dumpFunctionProfile(I.first, OS);
}
/// Parse \p Input as function head.
@@ -249,6 +251,7 @@ std::error_code SampleProfileReaderText::readImpl() {
bool SeenMetadata = false;
ProfileIsFS = ProfileIsFSDisciminator;
+ FunctionSamples::ProfileIsFS = ProfileIsFS;
for (; !LineIt.is_at_eof(); ++LineIt) {
if ((*LineIt)[(*LineIt).find_first_not_of(' ')] == '#')
continue;
@@ -273,12 +276,11 @@ std::error_code SampleProfileReaderText::readImpl() {
return sampleprof_error::malformed;
}
SeenMetadata = false;
- SampleContext FContext(FName);
+ SampleContext FContext(FName, CSNameTable);
if (FContext.hasContext())
++CSProfileCount;
Profiles[FContext] = FunctionSamples();
FunctionSamples &FProfile = Profiles[FContext];
- FProfile.setName(FContext.getNameWithoutContext());
FProfile.setContext(FContext);
MergeResult(Result, FProfile.addTotalSamples(NumSamples));
MergeResult(Result, FProfile.addHeadSamples(NumHeadSamples));
@@ -450,6 +452,13 @@ ErrorOr<StringRef> SampleProfileReaderBinary::readStringFromTable() {
return NameTable[*Idx];
}
+ErrorOr<SampleContext> SampleProfileReaderBinary::readSampleContextFromTable() {
+ auto FName(readStringFromTable());
+ if (std::error_code EC = FName.getError())
+ return EC;
+ return SampleContext(*FName);
+}
+
ErrorOr<StringRef> SampleProfileReaderExtBinaryBase::readStringFromTable() {
if (!FixedLengthMD5)
return SampleProfileReaderBinary::readStringFromTable();
@@ -576,18 +585,16 @@ SampleProfileReaderBinary::readFuncProfile(const uint8_t *Start) {
if (std::error_code EC = NumHeadSamples.getError())
return EC;
- auto FName(readStringFromTable());
- if (std::error_code EC = FName.getError())
+ ErrorOr<SampleContext> FContext(readSampleContextFromTable());
+ if (std::error_code EC = FContext.getError())
return EC;
- SampleContext FContext(*FName);
- Profiles[FContext] = FunctionSamples();
- FunctionSamples &FProfile = Profiles[FContext];
- FProfile.setName(FContext.getNameWithoutContext());
- FProfile.setContext(FContext);
+ Profiles[*FContext] = FunctionSamples();
+ FunctionSamples &FProfile = Profiles[*FContext];
+ FProfile.setContext(*FContext);
FProfile.addHeadSamples(*NumHeadSamples);
- if (FContext.hasContext())
+ if (FContext->hasContext())
CSProfileCount++;
if (std::error_code EC = readProfile(FProfile))
@@ -597,6 +604,7 @@ SampleProfileReaderBinary::readFuncProfile(const uint8_t *Start) {
std::error_code SampleProfileReaderBinary::readImpl() {
ProfileIsFS = ProfileIsFSDisciminator;
+ FunctionSamples::ProfileIsFS = ProfileIsFS;
while (!at_eof()) {
if (std::error_code EC = readFuncProfile(Data))
return EC;
@@ -605,6 +613,31 @@ std::error_code SampleProfileReaderBinary::readImpl() {
return sampleprof_error::success;
}
+ErrorOr<SampleContextFrames>
+SampleProfileReaderExtBinaryBase::readContextFromTable() {
+ auto ContextIdx = readNumber<uint32_t>();
+ if (std::error_code EC = ContextIdx.getError())
+ return EC;
+ if (*ContextIdx >= CSNameTable->size())
+ return sampleprof_error::truncated_name_table;
+ return (*CSNameTable)[*ContextIdx];
+}
+
+ErrorOr<SampleContext>
+SampleProfileReaderExtBinaryBase::readSampleContextFromTable() {
+ if (ProfileIsCS) {
+ auto FContext(readContextFromTable());
+ if (std::error_code EC = FContext.getError())
+ return EC;
+ return SampleContext(*FContext);
+ } else {
+ auto FName(readStringFromTable());
+ if (std::error_code EC = FName.getError())
+ return EC;
+ return SampleContext(*FName);
+ }
+}
+
std::error_code SampleProfileReaderExtBinaryBase::readOneSection(
const uint8_t *Start, uint64_t Size, const SecHdrTableEntry &Entry) {
Data = Start;
@@ -632,11 +665,17 @@ std::error_code SampleProfileReaderExtBinaryBase::readOneSection(
return EC;
break;
}
+ case SecCSNameTable: {
+ if (std::error_code EC = readCSNameTableSec())
+ return EC;
+ break;
+ }
case SecLBRProfile:
if (std::error_code EC = readFuncProfiles())
return EC;
break;
case SecFuncOffsetTable:
+ FuncOffsetsOrdered = hasSecFlag(Entry, SecFuncOffsetFlags::SecFlagOrdered);
if (std::error_code EC = readFuncOffsetTable())
return EC;
break;
@@ -682,17 +721,27 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncOffsetTable() {
return EC;
FuncOffsetTable.reserve(*Size);
+
+ if (FuncOffsetsOrdered) {
+ OrderedFuncOffsets =
+ std::make_unique<std::vector<std::pair<SampleContext, uint64_t>>>();
+ OrderedFuncOffsets->reserve(*Size);
+ }
+
for (uint32_t I = 0; I < *Size; ++I) {
- auto FName(readStringFromTable());
- if (std::error_code EC = FName.getError())
+ auto FContext(readSampleContextFromTable());
+ if (std::error_code EC = FContext.getError())
return EC;
auto Offset = readNumber<uint64_t>();
if (std::error_code EC = Offset.getError())
return EC;
- FuncOffsetTable[*FName] = *Offset;
+ FuncOffsetTable[*FContext] = *Offset;
+ if (FuncOffsetsOrdered)
+ OrderedFuncOffsets->emplace_back(*FContext, *Offset);
}
+
return sampleprof_error::success;
}
@@ -721,75 +770,77 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncProfiles() {
}
}
- if (useMD5()) {
- for (auto Name : FuncsToUse) {
- auto GUID = std::to_string(MD5Hash(Name));
- auto iter = FuncOffsetTable.find(StringRef(GUID));
- if (iter == FuncOffsetTable.end())
- continue;
- const uint8_t *FuncProfileAddr = Start + iter->second;
- assert(FuncProfileAddr < End && "out of LBRProfile section");
- if (std::error_code EC = readFuncProfile(FuncProfileAddr))
- return EC;
+ if (ProfileIsCS) {
+ DenseSet<uint64_t> FuncGuidsToUse;
+ if (useMD5()) {
+ for (auto Name : FuncsToUse)
+ FuncGuidsToUse.insert(Function::getGUID(Name));
}
- } else if (FunctionSamples::ProfileIsCS) {
- // Compute the ordered set of names, so we can
- // get all context profiles under a subtree by
- // iterating through the ordered names.
- struct Comparer {
- // Ignore the closing ']' when ordering context
- bool operator()(const StringRef &L, const StringRef &R) const {
- return L.substr(0, L.size() - 1) < R.substr(0, R.size() - 1);
+
+      // For each function in the current module, load all context profiles
+      // for the function as well as their callee contexts, which can help
+      // profile-guided importing for ThinLTO. This can be achieved by
+      // walking through an ordered context container, where contexts are
+      // laid out as if they were walked in preorder of a context trie.
+      // While traversing the trie, a link to the highest common ancestor
+      // node is kept so that all of its descendants will be loaded.
+ assert(OrderedFuncOffsets.get() &&
+ "func offset table should always be sorted in CS profile");
+ const SampleContext *CommonContext = nullptr;
+ for (const auto &NameOffset : *OrderedFuncOffsets) {
+ const auto &FContext = NameOffset.first;
+ auto FName = FContext.getName();
+      // For a function in the current module, keep its farthest ancestor
+      // context. This can be used to load the function itself and its child
+      // and sibling contexts.
+ if ((useMD5() && FuncGuidsToUse.count(std::stoull(FName.data()))) ||
+ (!useMD5() && (FuncsToUse.count(FName) ||
+ (Remapper && Remapper->exist(FName))))) {
+ if (!CommonContext || !CommonContext->IsPrefixOf(FContext))
+ CommonContext = &FContext;
}
- };
- std::set<StringRef, Comparer> OrderedNames;
- for (auto Name : FuncOffsetTable) {
- OrderedNames.insert(Name.first);
- }
- // For each function in current module, load all
- // context profiles for the function.
- for (auto NameOffset : FuncOffsetTable) {
- StringRef ContextName = NameOffset.first;
- SampleContext FContext(ContextName);
- auto FuncName = FContext.getNameWithoutContext();
- if (!FuncsToUse.count(FuncName) &&
- (!Remapper || !Remapper->exist(FuncName)))
- continue;
-
- // For each context profile we need, try to load
- // all context profile in the subtree. This can
- // help profile guided importing for ThinLTO.
- auto It = OrderedNames.find(ContextName);
- while (It != OrderedNames.end() &&
- It->startswith(ContextName.substr(0, ContextName.size() - 1))) {
- const uint8_t *FuncProfileAddr = Start + FuncOffsetTable[*It];
+ if (CommonContext == &FContext ||
+ (CommonContext && CommonContext->IsPrefixOf(FContext))) {
+ // Load profile for the current context which originated from
+ // the common ancestor.
+ const uint8_t *FuncProfileAddr = Start + NameOffset.second;
assert(FuncProfileAddr < End && "out of LBRProfile section");
if (std::error_code EC = readFuncProfile(FuncProfileAddr))
return EC;
- // Remove loaded context profile so we won't
- // load it repeatedly.
- It = OrderedNames.erase(It);
}
}
} else {
- for (auto NameOffset : FuncOffsetTable) {
- SampleContext FContext(NameOffset.first);
- auto FuncName = FContext.getNameWithoutContext();
- if (!FuncsToUse.count(FuncName) &&
- (!Remapper || !Remapper->exist(FuncName)))
- continue;
- const uint8_t *FuncProfileAddr = Start + NameOffset.second;
- assert(FuncProfileAddr < End && "out of LBRProfile section");
- if (std::error_code EC = readFuncProfile(FuncProfileAddr))
- return EC;
+ if (useMD5()) {
+ for (auto Name : FuncsToUse) {
+ auto GUID = std::to_string(MD5Hash(Name));
+ auto iter = FuncOffsetTable.find(StringRef(GUID));
+ if (iter == FuncOffsetTable.end())
+ continue;
+ const uint8_t *FuncProfileAddr = Start + iter->second;
+ assert(FuncProfileAddr < End && "out of LBRProfile section");
+ if (std::error_code EC = readFuncProfile(FuncProfileAddr))
+ return EC;
+ }
+ } else {
+ for (auto NameOffset : FuncOffsetTable) {
+ SampleContext FContext(NameOffset.first);
+ auto FuncName = FContext.getName();
+ if (!FuncsToUse.count(FuncName) &&
+ (!Remapper || !Remapper->exist(FuncName)))
+ continue;
+ const uint8_t *FuncProfileAddr = Start + NameOffset.second;
+ assert(FuncProfileAddr < End && "out of LBRProfile section");
+ if (std::error_code EC = readFuncProfile(FuncProfileAddr))
+ return EC;
+ }
}
}
Data = End;
}
assert((CSProfileCount == 0 || CSProfileCount == Profiles.size()) &&
"Cannot have both context-sensitive and regular profile");
- assert(ProfileIsCS == (CSProfileCount > 0) &&
+ assert((!CSProfileCount || ProfileIsCS) &&
"Section flag should be consistent with actual profile");
return sampleprof_error::success;
}
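
To see why the preorder-ordered offset table walked above lets a single ancestor link pull in a whole subtree of contexts, here is a minimal standalone sketch of the prefix relation; it models frames as plain strings and is not the SampleContext API:

#include <cassert>
#include <string>
#include <vector>

using Frames = std::vector<std::string>;

// Hypothetical, simplified version of the "is ancestor" test: context A is a
// prefix of context B if A's frames lead B's frames. With contexts sorted by
// their frames, every descendant of A sits contiguously right after A, so one
// pointer to the farthest matching ancestor is enough to load A and all
// contexts underneath it in a single forward pass.
static bool isPrefixOfSketch(const Frames &A, const Frames &B) {
  if (A.size() > B.size())
    return false;
  for (size_t I = 0; I < A.size(); ++I)
    if (A[I] != B[I])
      return false;
  return true;
}

int main() {
  Frames MainFoo = {"main", "foo"};
  Frames MainFooBar = {"main", "foo", "bar"};
  Frames MainGoo = {"main", "goo"};
  assert(isPrefixOfSketch(MainFoo, MainFooBar)); // callee context gets loaded
  assert(!isPrefixOfSketch(MainFoo, MainGoo));   // sibling subtree does not
  return 0;
}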
@@ -885,6 +936,7 @@ std::error_code SampleProfileReaderCompactBinary::readImpl() {
// given a module.
bool LoadFuncsToBeUsed = collectFuncsFromModule();
ProfileIsFS = ProfileIsFSDisciminator;
+ FunctionSamples::ProfileIsFS = ProfileIsFS;
std::vector<uint64_t> OffsetsToUse;
if (!LoadFuncsToBeUsed) {
// load all the function profiles.
@@ -983,22 +1035,62 @@ std::error_code SampleProfileReaderExtBinaryBase::readNameTableSec(bool IsMD5) {
return SampleProfileReaderBinary::readNameTable();
}
+// Read in the CS name table section, which contains a list of context
+// vectors. Each element of a context vector, aka a frame, refers to an
+// underlying raw function name stored in the name table, as well as a
+// callsite location that is only meaningful for non-leaf frames.
+std::error_code SampleProfileReaderExtBinaryBase::readCSNameTableSec() {
+ auto Size = readNumber<uint32_t>();
+ if (std::error_code EC = Size.getError())
+ return EC;
+
+ std::vector<SampleContextFrameVector> *PNameVec =
+ new std::vector<SampleContextFrameVector>();
+ PNameVec->reserve(*Size);
+ for (uint32_t I = 0; I < *Size; ++I) {
+ PNameVec->emplace_back(SampleContextFrameVector());
+ auto ContextSize = readNumber<uint32_t>();
+ if (std::error_code EC = ContextSize.getError())
+ return EC;
+ for (uint32_t J = 0; J < *ContextSize; ++J) {
+ auto FName(readStringFromTable());
+ if (std::error_code EC = FName.getError())
+ return EC;
+ auto LineOffset = readNumber<uint64_t>();
+ if (std::error_code EC = LineOffset.getError())
+ return EC;
+
+ if (!isOffsetLegal(*LineOffset))
+ return std::error_code();
+
+ auto Discriminator = readNumber<uint64_t>();
+ if (std::error_code EC = Discriminator.getError())
+ return EC;
+
+ PNameVec->back().emplace_back(
+ FName.get(), LineLocation(LineOffset.get(), Discriminator.get()));
+ }
+ }
+
+ // From this point the underlying object of CSNameTable should be immutable.
+ CSNameTable.reset(PNameVec);
+ return sampleprof_error::success;
+}
+
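For orientation, the nested structure decoded by readCSNameTableSec can be mirrored with the illustrative types below; the names are hypothetical and every integer in the real section is ULEB128-encoded rather than fixed-width:

#include <cstdint>
#include <vector>

// Hypothetical in-memory mirror of one decoded CS name table section.
struct FrameSketch {
  uint32_t NameTableIndex; // index of the raw function name in the name table
  uint64_t LineOffset;     // callsite line offset (non-leaf frames)
  uint64_t Discriminator;  // callsite discriminator
};
// One calling context is a vector of frames; the section is a vector of
// contexts, indexed by the ContextIdx read in readContextFromTable().
using ContextSketch = std::vector<FrameSketch>;
using CSNameTableSketch = std::vector<ContextSketch>;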
std::error_code
SampleProfileReaderExtBinaryBase::readFuncMetadata(bool ProfileHasAttribute) {
while (Data < End) {
- auto FName(readStringFromTable());
- if (std::error_code EC = FName.getError())
+ auto FContext(readSampleContextFromTable());
+ if (std::error_code EC = FContext.getError())
return EC;
- SampleContext FContext(*FName);
- bool ProfileInMap = Profiles.count(FContext);
-
+ bool ProfileInMap = Profiles.count(*FContext);
if (ProfileIsProbeBased) {
auto Checksum = readNumber<uint64_t>();
if (std::error_code EC = Checksum.getError())
return EC;
if (ProfileInMap)
- Profiles[FContext].setFunctionHash(*Checksum);
+ Profiles[*FContext].setFunctionHash(*Checksum);
}
if (ProfileHasAttribute) {
@@ -1006,7 +1098,7 @@ SampleProfileReaderExtBinaryBase::readFuncMetadata(bool ProfileHasAttribute) {
if (std::error_code EC = Attributes.getError())
return EC;
if (ProfileInMap)
- Profiles[FContext].getContext().setAllAttributes(*Attributes);
+ Profiles[*FContext].getContext().setAllAttributes(*Attributes);
}
}
@@ -1132,6 +1224,16 @@ static std::string getSecFlagsStr(const SecHdrTableEntry &Entry) {
if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFSDiscriminator))
Flags.append("fs-discriminator,");
break;
+ case SecFuncOffsetTable:
+ if (hasSecFlag(Entry, SecFuncOffsetFlags::SecFlagOrdered))
+ Flags.append("ordered,");
+ break;
+ case SecFuncMetadata:
+ if (hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagIsProbeBased))
+ Flags.append("probe,");
+ if (hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagHasAttribute))
+ Flags.append("attr,");
+ break;
default:
break;
}
diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp
index 535f87968104..78006aab1541 100644
--- a/llvm/lib/ProfileData/SampleProfWriter.cpp
+++ b/llvm/lib/ProfileData/SampleProfWriter.cpp
@@ -41,23 +41,10 @@
using namespace llvm;
using namespace sampleprof;
-std::error_code SampleProfileWriter::writeFuncProfiles(
- const StringMap<FunctionSamples> &ProfileMap) {
- // Sort the ProfileMap by total samples.
- typedef std::pair<StringRef, const FunctionSamples *> NameFunctionSamples;
+std::error_code
+SampleProfileWriter::writeFuncProfiles(const SampleProfileMap &ProfileMap) {
std::vector<NameFunctionSamples> V;
- for (const auto &I : ProfileMap) {
- assert(I.getKey() == I.second.getNameWithContext() &&
- "Inconsistent profile map");
- V.push_back(std::make_pair(I.second.getNameWithContext(), &I.second));
- }
- llvm::stable_sort(
- V, [](const NameFunctionSamples &A, const NameFunctionSamples &B) {
- if (A.second->getTotalSamples() == B.second->getTotalSamples())
- return A.first > B.first;
- return A.second->getTotalSamples() > B.second->getTotalSamples();
- });
-
+ sortFuncProfiles(ProfileMap, V);
for (const auto &I : V) {
if (std::error_code EC = writeSample(*I.second))
return EC;
@@ -65,8 +52,7 @@ std::error_code SampleProfileWriter::writeFuncProfiles(
return sampleprof_error::success;
}
-std::error_code
-SampleProfileWriter::write(const StringMap<FunctionSamples> &ProfileMap) {
+std::error_code SampleProfileWriter::write(const SampleProfileMap &ProfileMap) {
if (std::error_code EC = writeHeader(ProfileMap))
return EC;
@@ -130,8 +116,8 @@ std::error_code SampleProfileWriterExtBinaryBase::addNewSection(
return sampleprof_error::success;
}
-std::error_code SampleProfileWriterExtBinaryBase::write(
- const StringMap<FunctionSamples> &ProfileMap) {
+std::error_code
+SampleProfileWriterExtBinaryBase::write(const SampleProfileMap &ProfileMap) {
if (std::error_code EC = writeHeader(ProfileMap))
return EC;
@@ -146,11 +132,28 @@ std::error_code SampleProfileWriterExtBinaryBase::write(
return sampleprof_error::success;
}
+std::error_code SampleProfileWriterExtBinaryBase::writeContextIdx(
+ const SampleContext &Context) {
+ if (Context.hasContext())
+ return writeCSNameIdx(Context);
+ else
+ return SampleProfileWriterBinary::writeNameIdx(Context.getName());
+}
+
+std::error_code
+SampleProfileWriterExtBinaryBase::writeCSNameIdx(const SampleContext &Context) {
+ const auto &Ret = CSNameTable.find(Context);
+ if (Ret == CSNameTable.end())
+ return sampleprof_error::truncated_name_table;
+ encodeULEB128(Ret->second, *OutputStream);
+ return sampleprof_error::success;
+}
+
std::error_code
SampleProfileWriterExtBinaryBase::writeSample(const FunctionSamples &S) {
uint64_t Offset = OutputStream->tell();
- StringRef Name = S.getNameWithContext();
- FuncOffsetTable[Name] = Offset - SecLBRProfileStart;
+ auto &Context = S.getContext();
+ FuncOffsetTable[Context] = Offset - SecLBRProfileStart;
encodeULEB128(S.getHeadSamples(), *OutputStream);
return writeBody(S);
}
@@ -162,24 +165,42 @@ std::error_code SampleProfileWriterExtBinaryBase::writeFuncOffsetTable() {
encodeULEB128(FuncOffsetTable.size(), OS);
// Write out FuncOffsetTable.
- for (auto Entry : FuncOffsetTable) {
- if (std::error_code EC =
- writeNameIdx(Entry.first, FunctionSamples::ProfileIsCS))
+ auto WriteItem = [&](const SampleContext &Context, uint64_t Offset) {
+ if (std::error_code EC = writeContextIdx(Context))
return EC;
- encodeULEB128(Entry.second, OS);
+ encodeULEB128(Offset, OS);
+ return (std::error_code)sampleprof_error::success;
+ };
+
+ if (FunctionSamples::ProfileIsCS) {
+    // Sort the contexts before writing them out. This lets the reader quickly
+    // load all context profiles for a function as well as their callee
+    // contexts, which helps profile-guided importing for ThinLTO.
+ std::map<SampleContext, uint64_t> OrderedFuncOffsetTable(
+ FuncOffsetTable.begin(), FuncOffsetTable.end());
+ for (const auto &Entry : OrderedFuncOffsetTable) {
+ if (std::error_code EC = WriteItem(Entry.first, Entry.second))
+ return EC;
+ }
+ addSectionFlag(SecFuncOffsetTable, SecFuncOffsetFlags::SecFlagOrdered);
+ } else {
+ for (const auto &Entry : FuncOffsetTable) {
+ if (std::error_code EC = WriteItem(Entry.first, Entry.second))
+ return EC;
+ }
}
+
FuncOffsetTable.clear();
return sampleprof_error::success;
}
std::error_code SampleProfileWriterExtBinaryBase::writeFuncMetadata(
- const StringMap<FunctionSamples> &Profiles) {
+ const SampleProfileMap &Profiles) {
if (!FunctionSamples::ProfileIsProbeBased && !FunctionSamples::ProfileIsCS)
return sampleprof_error::success;
auto &OS = *OutputStream;
for (const auto &Entry : Profiles) {
- if (std::error_code EC = writeNameIdx(Entry.second.getNameWithContext(),
- FunctionSamples::ProfileIsCS))
+ if (std::error_code EC = writeContextIdx(Entry.second.getContext()))
return EC;
if (FunctionSamples::ProfileIsProbeBased)
encodeULEB128(Entry.second.getFunctionHash(), OS);
@@ -195,7 +216,7 @@ std::error_code SampleProfileWriterExtBinaryBase::writeNameTable() {
auto &OS = *OutputStream;
std::set<StringRef> V;
- stablizeNameTable(V);
+ stablizeNameTable(NameTable, V);
// Write out the MD5 name table. We wrote unencoded MD5 so reader can
// retrieve the name using the name index without having to read the
@@ -208,11 +229,10 @@ std::error_code SampleProfileWriterExtBinaryBase::writeNameTable() {
}
std::error_code SampleProfileWriterExtBinaryBase::writeNameTableSection(
- const StringMap<FunctionSamples> &ProfileMap) {
+ const SampleProfileMap &ProfileMap) {
for (const auto &I : ProfileMap) {
- assert(I.first() == I.second.getNameWithContext() &&
- "Inconsistent profile map");
- addName(I.second.getNameWithContext(), FunctionSamples::ProfileIsCS);
+ assert(I.first == I.second.getContext() && "Inconsistent profile map");
+ addContext(I.second.getContext());
addNames(I.second);
}
@@ -220,7 +240,7 @@ std::error_code SampleProfileWriterExtBinaryBase::writeNameTableSection(
// so compiler won't strip the suffix during profile matching after
// seeing the flag in the profile.
for (const auto &I : NameTable) {
- if (I.first.find(FunctionSamples::UniqSuffix) != StringRef::npos) {
+ if (I.first.contains(FunctionSamples::UniqSuffix)) {
addSectionFlag(SecNameTable, SecNameTableFlags::SecFlagUniqSuffix);
break;
}
@@ -231,6 +251,34 @@ std::error_code SampleProfileWriterExtBinaryBase::writeNameTableSection(
return sampleprof_error::success;
}
+std::error_code SampleProfileWriterExtBinaryBase::writeCSNameTableSection() {
+  // Sort the contexts to make CSNameTable deterministic.
+ std::set<SampleContext> OrderedContexts;
+ for (const auto &I : CSNameTable)
+ OrderedContexts.insert(I.first);
+ assert(OrderedContexts.size() == CSNameTable.size() &&
+ "Unmatched ordered and unordered contexts");
+ uint64_t I = 0;
+ for (auto &Context : OrderedContexts)
+ CSNameTable[Context] = I++;
+
+ auto &OS = *OutputStream;
+ encodeULEB128(OrderedContexts.size(), OS);
+ support::endian::Writer Writer(OS, support::little);
+ for (auto Context : OrderedContexts) {
+ auto Frames = Context.getContextFrames();
+ encodeULEB128(Frames.size(), OS);
+ for (auto &Callsite : Frames) {
+ if (std::error_code EC = writeNameIdx(Callsite.FuncName))
+ return EC;
+ encodeULEB128(Callsite.Location.LineOffset, OS);
+ encodeULEB128(Callsite.Location.Discriminator, OS);
+ }
+ }
+
+ return sampleprof_error::success;
+}
+
std::error_code
SampleProfileWriterExtBinaryBase::writeProfileSymbolListSection() {
if (ProfSymList && ProfSymList->size() > 0)
@@ -241,8 +289,7 @@ SampleProfileWriterExtBinaryBase::writeProfileSymbolListSection() {
}
std::error_code SampleProfileWriterExtBinaryBase::writeOneSection(
- SecType Type, uint32_t LayoutIdx,
- const StringMap<FunctionSamples> &ProfileMap) {
+ SecType Type, uint32_t LayoutIdx, const SampleProfileMap &ProfileMap) {
// The setting of SecFlagCompress should happen before markSectionStart.
if (Type == SecProfileSymbolList && ProfSymList && ProfSymList->toCompress())
setToCompressSection(SecProfileSymbolList);
@@ -266,6 +313,10 @@ std::error_code SampleProfileWriterExtBinaryBase::writeOneSection(
if (auto EC = writeNameTableSection(ProfileMap))
return EC;
break;
+ case SecCSNameTable:
+ if (auto EC = writeCSNameTableSection())
+ return EC;
+ break;
case SecLBRProfile:
SecLBRProfileStart = OutputStream->tell();
if (std::error_code EC = writeFuncProfiles(ProfileMap))
@@ -294,7 +345,7 @@ std::error_code SampleProfileWriterExtBinaryBase::writeOneSection(
}
std::error_code SampleProfileWriterExtBinary::writeDefaultLayout(
- const StringMap<FunctionSamples> &ProfileMap) {
+ const SampleProfileMap &ProfileMap) {
// The const indices passed to writeOneSection below are specifying the
// positions of the sections in SectionHdrLayout. Look at
// initSectionHdrLayout to find out where each section is located in
@@ -303,32 +354,33 @@ std::error_code SampleProfileWriterExtBinary::writeDefaultLayout(
return EC;
if (auto EC = writeOneSection(SecNameTable, 1, ProfileMap))
return EC;
- if (auto EC = writeOneSection(SecLBRProfile, 3, ProfileMap))
+ if (auto EC = writeOneSection(SecCSNameTable, 2, ProfileMap))
+ return EC;
+ if (auto EC = writeOneSection(SecLBRProfile, 4, ProfileMap))
return EC;
- if (auto EC = writeOneSection(SecProfileSymbolList, 4, ProfileMap))
+ if (auto EC = writeOneSection(SecProfileSymbolList, 5, ProfileMap))
return EC;
- if (auto EC = writeOneSection(SecFuncOffsetTable, 2, ProfileMap))
+ if (auto EC = writeOneSection(SecFuncOffsetTable, 3, ProfileMap))
return EC;
- if (auto EC = writeOneSection(SecFuncMetadata, 5, ProfileMap))
+ if (auto EC = writeOneSection(SecFuncMetadata, 6, ProfileMap))
return EC;
return sampleprof_error::success;
}
-static void
-splitProfileMapToTwo(const StringMap<FunctionSamples> &ProfileMap,
- StringMap<FunctionSamples> &ContextProfileMap,
- StringMap<FunctionSamples> &NoContextProfileMap) {
+static void splitProfileMapToTwo(const SampleProfileMap &ProfileMap,
+ SampleProfileMap &ContextProfileMap,
+ SampleProfileMap &NoContextProfileMap) {
for (const auto &I : ProfileMap) {
if (I.second.getCallsiteSamples().size())
- ContextProfileMap.insert({I.first(), I.second});
+ ContextProfileMap.insert({I.first, I.second});
else
- NoContextProfileMap.insert({I.first(), I.second});
+ NoContextProfileMap.insert({I.first, I.second});
}
}
std::error_code SampleProfileWriterExtBinary::writeCtxSplitLayout(
- const StringMap<FunctionSamples> &ProfileMap) {
- StringMap<FunctionSamples> ContextProfileMap, NoContextProfileMap;
+ const SampleProfileMap &ProfileMap) {
+ SampleProfileMap ContextProfileMap, NoContextProfileMap;
splitProfileMapToTwo(ProfileMap, ContextProfileMap, NoContextProfileMap);
if (auto EC = writeOneSection(SecProfSummary, 0, ProfileMap))
@@ -358,7 +410,7 @@ std::error_code SampleProfileWriterExtBinary::writeCtxSplitLayout(
}
std::error_code SampleProfileWriterExtBinary::writeSections(
- const StringMap<FunctionSamples> &ProfileMap) {
+ const SampleProfileMap &ProfileMap) {
std::error_code EC;
if (SecLayout == DefaultLayout)
EC = writeDefaultLayout(ProfileMap);
@@ -369,8 +421,8 @@ std::error_code SampleProfileWriterExtBinary::writeSections(
return EC;
}
-std::error_code SampleProfileWriterCompactBinary::write(
- const StringMap<FunctionSamples> &ProfileMap) {
+std::error_code
+SampleProfileWriterCompactBinary::write(const SampleProfileMap &ProfileMap) {
if (std::error_code EC = SampleProfileWriter::write(ProfileMap))
return EC;
if (std::error_code EC = writeFuncOffsetTable())
@@ -389,7 +441,7 @@ std::error_code SampleProfileWriterCompactBinary::write(
std::error_code SampleProfileWriterText::writeSample(const FunctionSamples &S) {
auto &OS = *OutputStream;
if (FunctionSamples::ProfileIsCS)
- OS << "[" << S.getNameWithContext() << "]:" << S.getTotalSamples();
+ OS << "[" << S.getContext().toString() << "]:" << S.getTotalSamples();
else
OS << S.getName() << ":" << S.getTotalSamples();
@@ -445,27 +497,28 @@ std::error_code SampleProfileWriterText::writeSample(const FunctionSamples &S) {
return sampleprof_error::success;
}
-std::error_code SampleProfileWriterBinary::writeNameIdx(StringRef FName,
- bool IsContextName) {
- std::string BracketedName;
- if (IsContextName) {
- BracketedName = "[" + FName.str() + "]";
- FName = StringRef(BracketedName);
- }
+std::error_code
+SampleProfileWriterBinary::writeContextIdx(const SampleContext &Context) {
+ assert(!Context.hasContext() && "cs profile is not supported");
+ return writeNameIdx(Context.getName());
+}
- const auto &Ret = NameTable.find(FName);
- if (Ret == NameTable.end())
+std::error_code SampleProfileWriterBinary::writeNameIdx(StringRef FName) {
+ auto &NTable = getNameTable();
+ const auto &Ret = NTable.find(FName);
+ if (Ret == NTable.end())
return sampleprof_error::truncated_name_table;
encodeULEB128(Ret->second, *OutputStream);
return sampleprof_error::success;
}
-void SampleProfileWriterBinary::addName(StringRef FName, bool IsContextName) {
- if (IsContextName) {
- auto It = BracketedContextStr.insert("[" + FName.str() + "]");
- FName = StringRef(*It.first);
- }
- NameTable.insert(std::make_pair(FName, 0));
+void SampleProfileWriterBinary::addName(StringRef FName) {
+ auto &NTable = getNameTable();
+ NTable.insert(std::make_pair(FName, 0));
+}
+
+void SampleProfileWriterBinary::addContext(const SampleContext &Context) {
+ addName(Context.getName());
}
void SampleProfileWriterBinary::addNames(const FunctionSamples &S) {
@@ -485,7 +538,19 @@ void SampleProfileWriterBinary::addNames(const FunctionSamples &S) {
}
}
-void SampleProfileWriterBinary::stablizeNameTable(std::set<StringRef> &V) {
+void SampleProfileWriterExtBinaryBase::addContext(
+ const SampleContext &Context) {
+ if (Context.hasContext()) {
+ for (auto &Callsite : Context.getContextFrames())
+ SampleProfileWriterBinary::addName(Callsite.FuncName);
+ CSNameTable.insert(std::make_pair(Context, 0));
+ } else {
+ SampleProfileWriterBinary::addName(Context.getName());
+ }
+}
+
+void SampleProfileWriterBinary::stablizeNameTable(
+ MapVector<StringRef, uint32_t> &NameTable, std::set<StringRef> &V) {
// Sort the names to make NameTable deterministic.
for (const auto &I : NameTable)
V.insert(I.first);
@@ -497,7 +562,7 @@ void SampleProfileWriterBinary::stablizeNameTable(std::set<StringRef> &V) {
std::error_code SampleProfileWriterBinary::writeNameTable() {
auto &OS = *OutputStream;
std::set<StringRef> V;
- stablizeNameTable(V);
+ stablizeNameTable(NameTable, V);
// Write out the name table.
encodeULEB128(NameTable.size(), OS);
@@ -526,8 +591,7 @@ std::error_code SampleProfileWriterCompactBinary::writeFuncOffsetTable() {
// Write out FuncOffsetTable.
for (auto Entry : FuncOffsetTable) {
- if (std::error_code EC =
- writeNameIdx(Entry.first, FunctionSamples::ProfileIsCS))
+ if (std::error_code EC = writeNameIdx(Entry.first))
return EC;
encodeULEB128(Entry.second, OS);
}
@@ -537,7 +601,7 @@ std::error_code SampleProfileWriterCompactBinary::writeFuncOffsetTable() {
std::error_code SampleProfileWriterCompactBinary::writeNameTable() {
auto &OS = *OutputStream;
std::set<StringRef> V;
- stablizeNameTable(V);
+ stablizeNameTable(NameTable, V);
// Write out the name table.
encodeULEB128(NameTable.size(), OS);
@@ -556,8 +620,8 @@ SampleProfileWriterBinary::writeMagicIdent(SampleProfileFormat Format) {
return sampleprof_error::success;
}
-std::error_code SampleProfileWriterBinary::writeHeader(
- const StringMap<FunctionSamples> &ProfileMap) {
+std::error_code
+SampleProfileWriterBinary::writeHeader(const SampleProfileMap &ProfileMap) {
writeMagicIdent(Format);
computeSummary(ProfileMap);
@@ -566,9 +630,8 @@ std::error_code SampleProfileWriterBinary::writeHeader(
// Generate the name table for all the functions referenced in the profile.
for (const auto &I : ProfileMap) {
- assert(I.first() == I.second.getNameWithContext() &&
- "Inconsistent profile map");
- addName(I.first(), FunctionSamples::ProfileIsCS);
+ assert(I.first == I.second.getContext() && "Inconsistent profile map");
+ addContext(I.first);
addNames(I.second);
}
@@ -642,7 +705,7 @@ std::error_code SampleProfileWriterExtBinaryBase::writeSecHdrTable() {
}
std::error_code SampleProfileWriterExtBinaryBase::writeHeader(
- const StringMap<FunctionSamples> &ProfileMap) {
+ const SampleProfileMap &ProfileMap) {
auto &OS = *OutputStream;
FileStart = OS.tell();
writeMagicIdent(Format);
@@ -652,7 +715,7 @@ std::error_code SampleProfileWriterExtBinaryBase::writeHeader(
}
std::error_code SampleProfileWriterCompactBinary::writeHeader(
- const StringMap<FunctionSamples> &ProfileMap) {
+ const SampleProfileMap &ProfileMap) {
support::endian::Writer Writer(*OutputStream, support::little);
if (auto EC = SampleProfileWriterBinary::writeHeader(ProfileMap))
return EC;
@@ -671,7 +734,8 @@ std::error_code SampleProfileWriterBinary::writeSummary() {
encodeULEB128(Summary->getMaxFunctionCount(), OS);
encodeULEB128(Summary->getNumCounts(), OS);
encodeULEB128(Summary->getNumFunctions(), OS);
- std::vector<ProfileSummaryEntry> &Entries = Summary->getDetailedSummary();
+ const std::vector<ProfileSummaryEntry> &Entries =
+ Summary->getDetailedSummary();
encodeULEB128(Entries.size(), OS);
for (auto Entry : Entries) {
encodeULEB128(Entry.Cutoff, OS);
@@ -682,9 +746,7 @@ std::error_code SampleProfileWriterBinary::writeSummary() {
}
std::error_code SampleProfileWriterBinary::writeBody(const FunctionSamples &S) {
auto &OS = *OutputStream;
-
- if (std::error_code EC =
- writeNameIdx(S.getNameWithContext(), FunctionSamples::ProfileIsCS))
+ if (std::error_code EC = writeContextIdx(S.getContext()))
return EC;
encodeULEB128(S.getTotalSamples(), OS);
@@ -803,8 +865,7 @@ SampleProfileWriter::create(std::unique_ptr<raw_ostream> &OS,
return std::move(Writer);
}
-void SampleProfileWriter::computeSummary(
- const StringMap<FunctionSamples> &ProfileMap) {
+void SampleProfileWriter::computeSummary(const SampleProfileMap &ProfileMap) {
SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
Summary = Builder.computeSummaryForProfiles(ProfileMap);
}
diff --git a/llvm/lib/Support/AArch64TargetParser.cpp b/llvm/lib/Support/AArch64TargetParser.cpp
index 2993892097e7..b3136a91e7f5 100644
--- a/llvm/lib/Support/AArch64TargetParser.cpp
+++ b/llvm/lib/Support/AArch64TargetParser.cpp
@@ -98,6 +98,8 @@ bool AArch64::getExtensionFeatures(uint64_t Extensions,
Features.push_back("+sve2-sha3");
if (Extensions & AEK_SVE2BITPERM)
Features.push_back("+sve2-bitperm");
+ if (Extensions & AArch64::AEK_TME)
+ Features.push_back("+tme");
if (Extensions & AEK_RCPC)
Features.push_back("+rcpc");
if (Extensions & AEK_BRBE)
@@ -118,6 +120,8 @@ bool AArch64::getExtensionFeatures(uint64_t Extensions,
bool AArch64::getArchFeatures(AArch64::ArchKind AK,
std::vector<StringRef> &Features) {
+ if (AK == ArchKind::ARMV8A)
+ Features.push_back("+v8a");
if (AK == ArchKind::ARMV8_1A)
Features.push_back("+v8.1a");
if (AK == ArchKind::ARMV8_2A)
@@ -132,6 +136,12 @@ bool AArch64::getArchFeatures(AArch64::ArchKind AK,
Features.push_back("+v8.6a");
if (AK == AArch64::ArchKind::ARMV8_7A)
Features.push_back("+v8.7a");
+ if (AK == AArch64::ArchKind::ARMV9A)
+ Features.push_back("+v9a");
+ if (AK == AArch64::ArchKind::ARMV9_1A)
+ Features.push_back("+v9.1a");
+ if (AK == AArch64::ArchKind::ARMV9_2A)
+ Features.push_back("+v9.2a");
if(AK == AArch64::ArchKind::ARMV8R)
Features.push_back("+v8r");
diff --git a/llvm/lib/Support/APFixedPoint.cpp b/llvm/lib/Support/APFixedPoint.cpp
index 9764dd51f572..61b30b5c5c60 100644
--- a/llvm/lib/Support/APFixedPoint.cpp
+++ b/llvm/lib/Support/APFixedPoint.cpp
@@ -306,7 +306,7 @@ APFixedPoint APFixedPoint::div(const APFixedPoint &Other,
APInt::sdivrem(ThisVal, OtherVal, Result, Rem);
// If the quotient is negative and the remainder is nonzero, round
// towards negative infinity by subtracting epsilon from the result.
- if (ThisVal.isNegative() != OtherVal.isNegative() && !Rem.isNullValue())
+ if (ThisVal.isNegative() != OtherVal.isNegative() && !Rem.isZero())
Result = Result - 1;
} else
Result = ThisVal.udiv(OtherVal);
@@ -381,7 +381,7 @@ void APFixedPoint::toString(SmallVectorImpl<char> &Str) const {
// Add 4 digits to hold the value after multiplying 10 (the radix)
unsigned Width = Val.getBitWidth() + 4;
APInt FractPart = Val.zextOrTrunc(Scale).zext(Width);
- APInt FractPartMask = APInt::getAllOnesValue(Scale).zext(Width);
+ APInt FractPartMask = APInt::getAllOnes(Scale).zext(Width);
APInt RadixInt = APInt(Width, 10);
IntPart.toString(Str, /*Radix=*/10);
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 7abca8391f70..4b75c9db8526 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -92,7 +92,7 @@ namespace llvm {
Note: we need to make the value different from semBogus as otherwise
an unsafe optimization may collapse both values to a single address,
and we heavily rely on them having distinct addresses. */
- static const fltSemantics semPPCDoubleDouble = {-1, 0, 0, 0};
+ static const fltSemantics semPPCDoubleDouble = {-1, 0, 0, 128};
/* These are legacy semantics for the fallback, inaccrurate implementation of
IBM double-double, if the accurate semPPCDoubleDouble doesn't handle the
@@ -1288,6 +1288,23 @@ IEEEFloat::compareAbsoluteValue(const IEEEFloat &rhs) const {
return cmpEqual;
}
+/* Set the least significant BITS bits of a bignum, clear the
+ rest. */
+static void tcSetLeastSignificantBits(APInt::WordType *dst, unsigned parts,
+ unsigned bits) {
+ unsigned i = 0;
+ while (bits > APInt::APINT_BITS_PER_WORD) {
+ dst[i++] = ~(APInt::WordType)0;
+ bits -= APInt::APINT_BITS_PER_WORD;
+ }
+
+ if (bits)
+ dst[i++] = ~(APInt::WordType)0 >> (APInt::APINT_BITS_PER_WORD - bits);
+
+ while (i < parts)
+ dst[i++] = 0;
+}
+
/* Handle overflow. Sign is preserved. We either become infinity or
the largest finite number. */
IEEEFloat::opStatus IEEEFloat::handleOverflow(roundingMode rounding_mode) {
@@ -1303,8 +1320,8 @@ IEEEFloat::opStatus IEEEFloat::handleOverflow(roundingMode rounding_mode) {
/* Otherwise we become the largest finite number. */
category = fcNormal;
exponent = semantics->maxExponent;
- APInt::tcSetLeastSignificantBits(significandParts(), partCount(),
- semantics->precision);
+ tcSetLeastSignificantBits(significandParts(), partCount(),
+ semantics->precision);
return opInexact;
}
@@ -2412,7 +2429,7 @@ IEEEFloat::convertToInteger(MutableArrayRef<integerPart> parts,
else
bits = width - isSigned;
- APInt::tcSetLeastSignificantBits(parts.data(), dstPartsCount, bits);
+ tcSetLeastSignificantBits(parts.data(), dstPartsCount, bits);
if (sign && isSigned)
APInt::tcShiftLeft(parts.data(), dstPartsCount, width - 1);
}
@@ -3379,7 +3396,6 @@ double IEEEFloat::convertToDouble() const {
/// exponent = 0, integer bit 1 ("pseudodenormal")
/// At the moment, the first three are treated as NaNs, the last one as Normal.
void IEEEFloat::initFromF80LongDoubleAPInt(const APInt &api) {
- assert(api.getBitWidth()==80);
uint64_t i1 = api.getRawData()[0];
uint64_t i2 = api.getRawData()[1];
uint64_t myexponent = (i2 & 0x7fff);
@@ -3411,7 +3427,6 @@ void IEEEFloat::initFromF80LongDoubleAPInt(const APInt &api) {
}
void IEEEFloat::initFromPPCDoubleDoubleAPInt(const APInt &api) {
- assert(api.getBitWidth()==128);
uint64_t i1 = api.getRawData()[0];
uint64_t i2 = api.getRawData()[1];
opStatus fs;
@@ -3435,7 +3450,6 @@ void IEEEFloat::initFromPPCDoubleDoubleAPInt(const APInt &api) {
}
void IEEEFloat::initFromQuadrupleAPInt(const APInt &api) {
- assert(api.getBitWidth()==128);
uint64_t i1 = api.getRawData()[0];
uint64_t i2 = api.getRawData()[1];
uint64_t myexponent = (i2 >> 48) & 0x7fff;
@@ -3471,7 +3485,6 @@ void IEEEFloat::initFromQuadrupleAPInt(const APInt &api) {
}
void IEEEFloat::initFromDoubleAPInt(const APInt &api) {
- assert(api.getBitWidth()==64);
uint64_t i = *api.getRawData();
uint64_t myexponent = (i >> 52) & 0x7ff;
uint64_t mysignificand = i & 0xfffffffffffffLL;
@@ -3500,7 +3513,6 @@ void IEEEFloat::initFromDoubleAPInt(const APInt &api) {
}
void IEEEFloat::initFromFloatAPInt(const APInt &api) {
- assert(api.getBitWidth()==32);
uint32_t i = (uint32_t)*api.getRawData();
uint32_t myexponent = (i >> 23) & 0xff;
uint32_t mysignificand = i & 0x7fffff;
@@ -3529,7 +3541,6 @@ void IEEEFloat::initFromFloatAPInt(const APInt &api) {
}
void IEEEFloat::initFromBFloatAPInt(const APInt &api) {
- assert(api.getBitWidth() == 16);
uint32_t i = (uint32_t)*api.getRawData();
uint32_t myexponent = (i >> 7) & 0xff;
uint32_t mysignificand = i & 0x7f;
@@ -3558,7 +3569,6 @@ void IEEEFloat::initFromBFloatAPInt(const APInt &api) {
}
void IEEEFloat::initFromHalfAPInt(const APInt &api) {
- assert(api.getBitWidth()==16);
uint32_t i = (uint32_t)*api.getRawData();
uint32_t myexponent = (i >> 10) & 0x1f;
uint32_t mysignificand = i & 0x3ff;
@@ -3591,6 +3601,7 @@ void IEEEFloat::initFromHalfAPInt(const APInt &api) {
/// isIEEE argument distinguishes between PPC128 and IEEE128 (not meaningful
/// when the size is anything else).
void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) {
+ assert(api.getBitWidth() == Sem->sizeInBits);
if (Sem == &semIEEEhalf)
return initFromHalfAPInt(api);
if (Sem == &semBFloat)
@@ -4847,9 +4858,8 @@ APFloat::opStatus APFloat::convert(const fltSemantics &ToSemantics,
llvm_unreachable("Unexpected semantics");
}
-APFloat APFloat::getAllOnesValue(const fltSemantics &Semantics,
- unsigned BitWidth) {
- return APFloat(Semantics, APInt::getAllOnesValue(BitWidth));
+APFloat APFloat::getAllOnesValue(const fltSemantics &Semantics) {
+ return APFloat(Semantics, APInt::getAllOnes(Semantics.sizeInBits));
}
void APFloat::print(raw_ostream &OS) const {
diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp
index a8a950f09747..4940b61602d1 100644
--- a/llvm/lib/Support/APInt.cpp
+++ b/llvm/lib/Support/APInt.cpp
@@ -89,7 +89,6 @@ void APInt::initSlowCase(const APInt& that) {
}
void APInt::initFromArray(ArrayRef<uint64_t> bigVal) {
- assert(BitWidth && "Bitwidth too small");
assert(bigVal.data() && "Null pointer detected!");
if (isSingleWord())
U.VAL = bigVal[0];
@@ -105,19 +104,17 @@ void APInt::initFromArray(ArrayRef<uint64_t> bigVal) {
clearUnusedBits();
}
-APInt::APInt(unsigned numBits, ArrayRef<uint64_t> bigVal)
- : BitWidth(numBits) {
+APInt::APInt(unsigned numBits, ArrayRef<uint64_t> bigVal) : BitWidth(numBits) {
initFromArray(bigVal);
}
APInt::APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[])
- : BitWidth(numBits) {
+ : BitWidth(numBits) {
initFromArray(makeArrayRef(bigVal, numWords));
}
APInt::APInt(unsigned numbits, StringRef Str, uint8_t radix)
- : BitWidth(numbits) {
- assert(BitWidth && "Bitwidth too small");
+ : BitWidth(numbits) {
fromString(numbits, Str, radix);
}
@@ -140,7 +137,7 @@ void APInt::reallocate(unsigned NewBitWidth) {
U.pVal = getMemory(getNumWords());
}
-void APInt::AssignSlowCase(const APInt& RHS) {
+void APInt::assignSlowCase(const APInt &RHS) {
// Don't do anything for X = X
if (this == &RHS)
return;
@@ -233,27 +230,30 @@ APInt APInt::operator*(const APInt& RHS) const {
return APInt(BitWidth, U.VAL * RHS.U.VAL);
APInt Result(getMemory(getNumWords()), getBitWidth());
-
tcMultiply(Result.U.pVal, U.pVal, RHS.U.pVal, getNumWords());
-
Result.clearUnusedBits();
return Result;
}
-void APInt::AndAssignSlowCase(const APInt& RHS) {
- tcAnd(U.pVal, RHS.U.pVal, getNumWords());
+void APInt::andAssignSlowCase(const APInt &RHS) {
+ WordType *dst = U.pVal, *rhs = RHS.U.pVal;
+ for (size_t i = 0, e = getNumWords(); i != e; ++i)
+ dst[i] &= rhs[i];
}
-void APInt::OrAssignSlowCase(const APInt& RHS) {
- tcOr(U.pVal, RHS.U.pVal, getNumWords());
+void APInt::orAssignSlowCase(const APInt &RHS) {
+ WordType *dst = U.pVal, *rhs = RHS.U.pVal;
+ for (size_t i = 0, e = getNumWords(); i != e; ++i)
+ dst[i] |= rhs[i];
}
-void APInt::XorAssignSlowCase(const APInt& RHS) {
- tcXor(U.pVal, RHS.U.pVal, getNumWords());
+void APInt::xorAssignSlowCase(const APInt &RHS) {
+ WordType *dst = U.pVal, *rhs = RHS.U.pVal;
+ for (size_t i = 0, e = getNumWords(); i != e; ++i)
+ dst[i] ^= rhs[i];
}
-APInt& APInt::operator*=(const APInt& RHS) {
- assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+APInt &APInt::operator*=(const APInt &RHS) {
*this = *this * RHS;
return *this;
}
@@ -268,7 +268,7 @@ APInt& APInt::operator*=(uint64_t RHS) {
return clearUnusedBits();
}
-bool APInt::EqualSlowCase(const APInt& RHS) const {
+bool APInt::equalSlowCase(const APInt &RHS) const {
return std::equal(U.pVal, U.pVal + getNumWords(), RHS.U.pVal);
}
@@ -327,12 +327,29 @@ void APInt::setBitsSlowCase(unsigned loBit, unsigned hiBit) {
U.pVal[word] = WORDTYPE_MAX;
}
+// Complement a bignum in-place.
+static void tcComplement(APInt::WordType *dst, unsigned parts) {
+ for (unsigned i = 0; i < parts; i++)
+ dst[i] = ~dst[i];
+}
+
/// Toggle every bit to its opposite value.
void APInt::flipAllBitsSlowCase() {
tcComplement(U.pVal, getNumWords());
clearUnusedBits();
}
+/// Concatenate the bits from "NewLSB" onto the bottom of *this. This is
+/// equivalent to:
+/// (this->zext(NewWidth) << NewLSB.getBitWidth()) | NewLSB.zext(NewWidth)
+/// In the slow case, we know the result is large.
+APInt APInt::concatSlowCase(const APInt &NewLSB) const {
+ unsigned NewWidth = getBitWidth() + NewLSB.getBitWidth();
+ APInt Result = NewLSB.zextOrSelf(NewWidth);
+ Result.insertBits(*this, NewLSB.getBitWidth());
+ return Result;
+}
+
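A brief usage sketch for the concatenation semantics documented above, assuming the APInt::concat entry point in the header dispatches to this slow path for multi-word results:

#include "llvm/ADT/APInt.h"
#include <cassert>

// Concatenating 0b101 (width 3) on top of 0b01 (width 2) yields 0b10101
// (width 5), matching (Hi.zext(5) << 2) | Lo.zext(5).
void concatExampleSketch() {
  llvm::APInt Hi(3, 0b101), Lo(2, 0b01);
  llvm::APInt Cat = Hi.concat(Lo);
  assert(Cat.getBitWidth() == 5);
  assert(Cat == llvm::APInt(5, 0b10101));
}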
/// Toggle a given bit to its opposite value whose position is given
/// as "bitPosition".
/// Toggles a given bit to its opposite value.
@@ -343,8 +360,11 @@ void APInt::flipBit(unsigned bitPosition) {
void APInt::insertBits(const APInt &subBits, unsigned bitPosition) {
unsigned subBitWidth = subBits.getBitWidth();
- assert(0 < subBitWidth && (subBitWidth + bitPosition) <= BitWidth &&
- "Illegal bit insertion");
+ assert((subBitWidth + bitPosition) <= BitWidth && "Illegal bit insertion");
+
+  // Inserting no bits is a no-op.
+ if (subBitWidth == 0)
+ return;
// Insertion is a direct copy.
if (subBitWidth == BitWidth) {
@@ -424,7 +444,6 @@ void APInt::insertBits(uint64_t subBits, unsigned bitPosition, unsigned numBits)
}
APInt APInt::extractBits(unsigned numBits, unsigned bitPosition) const {
- assert(numBits > 0 && "Can't extract zero bits");
assert(bitPosition < BitWidth && (numBits + bitPosition) <= BitWidth &&
"Illegal bit extraction");
@@ -550,7 +569,7 @@ hash_code llvm::hash_value(const APInt &Arg) {
hash_combine_range(Arg.U.pVal, Arg.U.pVal + Arg.getNumWords()));
}
-unsigned DenseMapInfo<APInt>::getHashValue(const APInt &Key) {
+unsigned DenseMapInfo<APInt, void>::getHashValue(const APInt &Key) {
return static_cast<unsigned>(hash_value(Key));
}
@@ -702,6 +721,8 @@ APInt APInt::reverseBits() const {
return APInt(BitWidth, llvm::reverseBits<uint16_t>(U.VAL));
case 8:
return APInt(BitWidth, llvm::reverseBits<uint8_t>(U.VAL));
+ case 0:
+ return *this;
default:
break;
}
@@ -861,7 +882,6 @@ double APInt::roundToDouble(bool isSigned) const {
// Truncate to new width.
APInt APInt::trunc(unsigned width) const {
assert(width < BitWidth && "Invalid APInt Truncate request");
- assert(width && "Can't truncate to 0 bits");
if (width <= APINT_BITS_PER_WORD)
return APInt(width, getRawData()[0]);
@@ -884,7 +904,6 @@ APInt APInt::trunc(unsigned width) const {
// Truncate to new width with unsigned saturation.
APInt APInt::truncUSat(unsigned width) const {
assert(width < BitWidth && "Invalid APInt Truncate request");
- assert(width && "Can't truncate to 0 bits");
// Can we just losslessly truncate it?
if (isIntN(width))
@@ -896,7 +915,6 @@ APInt APInt::truncUSat(unsigned width) const {
// Truncate to new width with signed saturation.
APInt APInt::truncSSat(unsigned width) const {
assert(width < BitWidth && "Invalid APInt Truncate request");
- assert(width && "Can't truncate to 0 bits");
// Can we just losslessly truncate it?
if (isSignedIntN(width))
@@ -1059,6 +1077,8 @@ void APInt::shlSlowCase(unsigned ShiftAmt) {
// Calculate the rotate amount modulo the bit width.
static unsigned rotateModulo(unsigned BitWidth, const APInt &rotateAmt) {
+ if (LLVM_UNLIKELY(BitWidth == 0))
+ return 0;
unsigned rotBitWidth = rotateAmt.getBitWidth();
APInt rot = rotateAmt;
if (rotBitWidth < BitWidth) {
@@ -1075,6 +1095,8 @@ APInt APInt::rotl(const APInt &rotateAmt) const {
}
APInt APInt::rotl(unsigned rotateAmt) const {
+ if (LLVM_UNLIKELY(BitWidth == 0))
+ return *this;
rotateAmt %= BitWidth;
if (rotateAmt == 0)
return *this;
@@ -1086,12 +1108,43 @@ APInt APInt::rotr(const APInt &rotateAmt) const {
}
APInt APInt::rotr(unsigned rotateAmt) const {
+ if (BitWidth == 0)
+ return *this;
rotateAmt %= BitWidth;
if (rotateAmt == 0)
return *this;
return lshr(rotateAmt) | shl(BitWidth - rotateAmt);
}
+/// \returns the nearest log base 2 of this APInt. Ties round up.
+///
+/// NOTE: When we have a BitWidth of 1, we define:
+///
+/// log2(0) = UINT32_MAX
+/// log2(1) = 0
+///
+/// to get around any mathematical concerns resulting from
+/// referencing 2 in a space where 2 does not exist.
+unsigned APInt::nearestLogBase2() const {
+ // Special case when we have a bitwidth of 1. If VAL is 1, then we
+ // get 0. If VAL is 0, we get WORDTYPE_MAX which gets truncated to
+ // UINT32_MAX.
+ if (BitWidth == 1)
+ return U.VAL - 1;
+
+ // Handle the zero case.
+ if (isZero())
+ return UINT32_MAX;
+
+ // The non-zero case is handled by computing:
+ //
+ // nearestLogBase2(x) = logBase2(x) + x[logBase2(x)-1].
+ //
+ // where x[i] is referring to the value of the ith bit of x.
+ unsigned lg = logBase2();
+ return lg + unsigned((*this)[lg - 1]);
+}
+
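A short usage sketch of the rounding behavior described above, assuming this patch is applied:

#include "llvm/ADT/APInt.h"
#include <cassert>
#include <cstdint>

// The result is logBase2(x) plus the bit just below the leading one, so
// 5 (0b101) stays at 2 while 6 (0b110) rounds up to 3; zero is the
// documented special case.
void nearestLogBase2ExampleSketch() {
  assert(llvm::APInt(8, 5).nearestLogBase2() == 2);
  assert(llvm::APInt(8, 6).nearestLogBase2() == 3);
  assert(llvm::APInt(8, 0).nearestLogBase2() == UINT32_MAX);
}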
// Square Root - this method computes and returns the square root of "this".
// Three mechanisms are used for computation. For small values (<= 5 bits),
// a table lookup is done. This gets some performance for common cases. For
@@ -1222,98 +1275,6 @@ APInt APInt::multiplicativeInverse(const APInt& modulo) const {
return std::move(t[i]);
}
-/// Calculate the magic numbers required to implement a signed integer division
-/// by a constant as a sequence of multiplies, adds and shifts. Requires that
-/// the divisor not be 0, 1, or -1. Taken from "Hacker's Delight", Henry S.
-/// Warren, Jr., chapter 10.
-APInt::ms APInt::magic() const {
- const APInt& d = *this;
- unsigned p;
- APInt ad, anc, delta, q1, r1, q2, r2, t;
- APInt signedMin = APInt::getSignedMinValue(d.getBitWidth());
- struct ms mag;
-
- ad = d.abs();
- t = signedMin + (d.lshr(d.getBitWidth() - 1));
- anc = t - 1 - t.urem(ad); // absolute value of nc
- p = d.getBitWidth() - 1; // initialize p
- q1 = signedMin.udiv(anc); // initialize q1 = 2p/abs(nc)
- r1 = signedMin - q1*anc; // initialize r1 = rem(2p,abs(nc))
- q2 = signedMin.udiv(ad); // initialize q2 = 2p/abs(d)
- r2 = signedMin - q2*ad; // initialize r2 = rem(2p,abs(d))
- do {
- p = p + 1;
- q1 = q1<<1; // update q1 = 2p/abs(nc)
- r1 = r1<<1; // update r1 = rem(2p/abs(nc))
- if (r1.uge(anc)) { // must be unsigned comparison
- q1 = q1 + 1;
- r1 = r1 - anc;
- }
- q2 = q2<<1; // update q2 = 2p/abs(d)
- r2 = r2<<1; // update r2 = rem(2p/abs(d))
- if (r2.uge(ad)) { // must be unsigned comparison
- q2 = q2 + 1;
- r2 = r2 - ad;
- }
- delta = ad - r2;
- } while (q1.ult(delta) || (q1 == delta && r1 == 0));
-
- mag.m = q2 + 1;
- if (d.isNegative()) mag.m = -mag.m; // resulting magic number
- mag.s = p - d.getBitWidth(); // resulting shift
- return mag;
-}
-
-/// Calculate the magic numbers required to implement an unsigned integer
-/// division by a constant as a sequence of multiplies, adds and shifts.
-/// Requires that the divisor not be 0. Taken from "Hacker's Delight", Henry
-/// S. Warren, Jr., chapter 10.
-/// LeadingZeros can be used to simplify the calculation if the upper bits
-/// of the divided value are known zero.
-APInt::mu APInt::magicu(unsigned LeadingZeros) const {
- const APInt& d = *this;
- unsigned p;
- APInt nc, delta, q1, r1, q2, r2;
- struct mu magu;
- magu.a = 0; // initialize "add" indicator
- APInt allOnes = APInt::getAllOnesValue(d.getBitWidth()).lshr(LeadingZeros);
- APInt signedMin = APInt::getSignedMinValue(d.getBitWidth());
- APInt signedMax = APInt::getSignedMaxValue(d.getBitWidth());
-
- nc = allOnes - (allOnes - d).urem(d);
- p = d.getBitWidth() - 1; // initialize p
- q1 = signedMin.udiv(nc); // initialize q1 = 2p/nc
- r1 = signedMin - q1*nc; // initialize r1 = rem(2p,nc)
- q2 = signedMax.udiv(d); // initialize q2 = (2p-1)/d
- r2 = signedMax - q2*d; // initialize r2 = rem((2p-1),d)
- do {
- p = p + 1;
- if (r1.uge(nc - r1)) {
- q1 = q1 + q1 + 1; // update q1
- r1 = r1 + r1 - nc; // update r1
- }
- else {
- q1 = q1+q1; // update q1
- r1 = r1+r1; // update r1
- }
- if ((r2 + 1).uge(d - r2)) {
- if (q2.uge(signedMax)) magu.a = 1;
- q2 = q2+q2 + 1; // update q2
- r2 = r2+r2 + 1 - d; // update r2
- }
- else {
- if (q2.uge(signedMin)) magu.a = 1;
- q2 = q2+q2; // update q2
- r2 = r2+r2 + 1; // update r2
- }
- delta = d - 1 - r2;
- } while (p < d.getBitWidth()*2 &&
- (q1.ult(delta) || (q1 == delta && r1 == 0)));
- magu.m = q2 + 1; // resulting magic number
- magu.s = p - d.getBitWidth(); // resulting shift
- return magu;
-}
-
/// Implementation of Knuth's Algorithm D (Division of nonnegative integers)
/// from "Art of Computer Programming, Volume 2", section 4.3.1, p. 272. The
/// variables here have the same names as in the algorithm. Comments explain
@@ -1984,15 +1945,16 @@ APInt APInt::usub_ov(const APInt &RHS, bool &Overflow) const {
APInt APInt::sdiv_ov(const APInt &RHS, bool &Overflow) const {
// MININT/-1 --> overflow.
- Overflow = isMinSignedValue() && RHS.isAllOnesValue();
+ Overflow = isMinSignedValue() && RHS.isAllOnes();
return sdiv(RHS);
}
APInt APInt::smul_ov(const APInt &RHS, bool &Overflow) const {
APInt Res = *this * RHS;
- if (*this != 0 && RHS != 0)
- Overflow = Res.sdiv(RHS) != *this || Res.sdiv(*this) != RHS;
+ if (RHS != 0)
+ Overflow = Res.sdiv(RHS) != *this ||
+ (isMinSignedValue() && RHS.isAllOnes());
else
Overflow = false;
return Res;
@@ -2196,7 +2158,7 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
}
// First, check for a zero value and just short circuit the logic below.
- if (*this == 0) {
+ if (isZero()) {
while (*Prefix) {
Str.push_back(*Prefix);
++Prefix;
@@ -2305,55 +2267,51 @@ void APInt::print(raw_ostream &OS, bool isSigned) const {
static_assert(APInt::APINT_BITS_PER_WORD % 2 == 0,
"Part width must be divisible by 2!");
-/* Some handy functions local to this file. */
-
-/* Returns the integer part with the least significant BITS set.
- BITS cannot be zero. */
+// Returns the integer part with the least significant BITS set.
+// BITS cannot be zero.
static inline APInt::WordType lowBitMask(unsigned bits) {
assert(bits != 0 && bits <= APInt::APINT_BITS_PER_WORD);
-
return ~(APInt::WordType) 0 >> (APInt::APINT_BITS_PER_WORD - bits);
}
-/* Returns the value of the lower half of PART. */
+/// Returns the value of the lower half of PART.
static inline APInt::WordType lowHalf(APInt::WordType part) {
return part & lowBitMask(APInt::APINT_BITS_PER_WORD / 2);
}
-/* Returns the value of the upper half of PART. */
+/// Returns the value of the upper half of PART.
static inline APInt::WordType highHalf(APInt::WordType part) {
return part >> (APInt::APINT_BITS_PER_WORD / 2);
}
-/* Returns the bit number of the most significant set bit of a part.
- If the input number has no bits set -1U is returned. */
+/// Returns the bit number of the most significant set bit of a part.
+/// If the input number has no bits set -1U is returned.
static unsigned partMSB(APInt::WordType value) {
return findLastSet(value, ZB_Max);
}
-/* Returns the bit number of the least significant set bit of a
- part. If the input number has no bits set -1U is returned. */
+/// Returns the bit number of the least significant set bit of a part. If the
+/// input number has no bits set -1U is returned.
static unsigned partLSB(APInt::WordType value) {
return findFirstSet(value, ZB_Max);
}
-/* Sets the least significant part of a bignum to the input value, and
- zeroes out higher parts. */
+/// Sets the least significant part of a bignum to the input value, and zeroes
+/// out higher parts.
void APInt::tcSet(WordType *dst, WordType part, unsigned parts) {
assert(parts > 0);
-
dst[0] = part;
for (unsigned i = 1; i < parts; i++)
dst[i] = 0;
}
-/* Assign one bignum to another. */
+/// Assign one bignum to another.
void APInt::tcAssign(WordType *dst, const WordType *src, unsigned parts) {
for (unsigned i = 0; i < parts; i++)
dst[i] = src[i];
}
-/* Returns true if a bignum is zero, false otherwise. */
+/// Returns true if a bignum is zero, false otherwise.
bool APInt::tcIsZero(const WordType *src, unsigned parts) {
for (unsigned i = 0; i < parts; i++)
if (src[i])
@@ -2362,28 +2320,27 @@ bool APInt::tcIsZero(const WordType *src, unsigned parts) {
return true;
}
-/* Extract the given bit of a bignum; returns 0 or 1. */
+/// Extract the given bit of a bignum; returns 0 or 1.
int APInt::tcExtractBit(const WordType *parts, unsigned bit) {
return (parts[whichWord(bit)] & maskBit(bit)) != 0;
}
-/* Set the given bit of a bignum. */
+/// Set the given bit of a bignum.
void APInt::tcSetBit(WordType *parts, unsigned bit) {
parts[whichWord(bit)] |= maskBit(bit);
}
-/* Clears the given bit of a bignum. */
+/// Clears the given bit of a bignum.
void APInt::tcClearBit(WordType *parts, unsigned bit) {
parts[whichWord(bit)] &= ~maskBit(bit);
}
-/* Returns the bit number of the least significant set bit of a
- number. If the input number has no bits set -1U is returned. */
+/// Returns the bit number of the least significant set bit of a number. If the
+/// input number has no bits set -1U is returned.
unsigned APInt::tcLSB(const WordType *parts, unsigned n) {
for (unsigned i = 0; i < n; i++) {
if (parts[i] != 0) {
unsigned lsb = partLSB(parts[i]);
-
return lsb + i * APINT_BITS_PER_WORD;
}
}
@@ -2391,8 +2348,8 @@ unsigned APInt::tcLSB(const WordType *parts, unsigned n) {
return -1U;
}
-/* Returns the bit number of the most significant set bit of a number.
- If the input number has no bits set -1U is returned. */
+/// Returns the bit number of the most significant set bit of a number.
+/// If the input number has no bits set -1U is returned.
unsigned APInt::tcMSB(const WordType *parts, unsigned n) {
do {
--n;
@@ -2407,10 +2364,10 @@ unsigned APInt::tcMSB(const WordType *parts, unsigned n) {
return -1U;
}
-/* Copy the bit vector of width srcBITS from SRC, starting at bit
- srcLSB, to DST, of dstCOUNT parts, such that the bit srcLSB becomes
- the least significant bit of DST. All high bits above srcBITS in
- DST are zero-filled. */
+/// Copy the bit vector of width srcBITS from SRC, starting at bit srcLSB, to
+/// DST, of dstCOUNT parts, such that the bit srcLSB becomes the least
+/// significant bit of DST. All high bits above srcBITS in DST are zero-filled.
void
APInt::tcExtract(WordType *dst, unsigned dstCount, const WordType *src,
unsigned srcBits, unsigned srcLSB) {
@@ -2418,14 +2375,14 @@ APInt::tcExtract(WordType *dst, unsigned dstCount, const WordType *src,
assert(dstParts <= dstCount);
unsigned firstSrcPart = srcLSB / APINT_BITS_PER_WORD;
- tcAssign (dst, src + firstSrcPart, dstParts);
+ tcAssign(dst, src + firstSrcPart, dstParts);
unsigned shift = srcLSB % APINT_BITS_PER_WORD;
- tcShiftRight (dst, dstParts, shift);
+ tcShiftRight(dst, dstParts, shift);
- /* We now have (dstParts * APINT_BITS_PER_WORD - shift) bits from SRC
- in DST. If this is less that srcBits, append the rest, else
- clear the high bits. */
+ // We now have (dstParts * APINT_BITS_PER_WORD - shift) bits from SRC
+  // in DST. If this is less than srcBits, append the rest, else
+ // clear the high bits.
unsigned n = dstParts * APINT_BITS_PER_WORD - shift;
if (n < srcBits) {
WordType mask = lowBitMask (srcBits - n);
@@ -2436,12 +2393,12 @@ APInt::tcExtract(WordType *dst, unsigned dstCount, const WordType *src,
dst[dstParts - 1] &= lowBitMask (srcBits % APINT_BITS_PER_WORD);
}
- /* Clear high parts. */
+ // Clear high parts.
while (dstParts < dstCount)
dst[dstParts++] = 0;
}
-/* DST += RHS + C where C is zero or one. Returns the carry flag. */
+/// DST += RHS + C where C is zero or one. Returns the carry flag.
APInt::WordType APInt::tcAdd(WordType *dst, const WordType *rhs,
WordType c, unsigned parts) {
assert(c <= 1);
@@ -2476,7 +2433,7 @@ APInt::WordType APInt::tcAddPart(WordType *dst, WordType src,
return 1;
}
-/* DST -= RHS + C where C is zero or one. Returns the carry flag. */
+/// DST -= RHS + C where C is zero or one. Returns the carry flag.
APInt::WordType APInt::tcSubtract(WordType *dst, const WordType *rhs,
WordType c, unsigned parts) {
assert(c <= 1);
@@ -2515,47 +2472,39 @@ APInt::WordType APInt::tcSubtractPart(WordType *dst, WordType src,
return 1;
}
-/* Negate a bignum in-place. */
+/// Negate a bignum in-place.
void APInt::tcNegate(WordType *dst, unsigned parts) {
tcComplement(dst, parts);
tcIncrement(dst, parts);
}
-/* DST += SRC * MULTIPLIER + CARRY if add is true
- DST = SRC * MULTIPLIER + CARRY if add is false
-
- Requires 0 <= DSTPARTS <= SRCPARTS + 1. If DST overlaps SRC
- they must start at the same point, i.e. DST == SRC.
-
- If DSTPARTS == SRCPARTS + 1 no overflow occurs and zero is
- returned. Otherwise DST is filled with the least significant
- DSTPARTS parts of the result, and if all of the omitted higher
- parts were zero return zero, otherwise overflow occurred and
- return one. */
+/// DST += SRC * MULTIPLIER + CARRY if add is true
+/// DST = SRC * MULTIPLIER + CARRY if add is false
+/// Requires 0 <= DSTPARTS <= SRCPARTS + 1. If DST overlaps SRC
+/// they must start at the same point, i.e. DST == SRC.
+/// If DSTPARTS == SRCPARTS + 1 no overflow occurs and zero is
+/// returned. Otherwise DST is filled with the least significant
+/// DSTPARTS parts of the result, and if all of the omitted higher
+/// parts were zero return zero, otherwise overflow occurred and
+/// return one.
int APInt::tcMultiplyPart(WordType *dst, const WordType *src,
WordType multiplier, WordType carry,
unsigned srcParts, unsigned dstParts,
bool add) {
- /* Otherwise our writes of DST kill our later reads of SRC. */
+ // Otherwise our writes of DST kill our later reads of SRC.
assert(dst <= src || dst >= src + srcParts);
assert(dstParts <= srcParts + 1);
- /* N loops; minimum of dstParts and srcParts. */
+ // N loops; minimum of dstParts and srcParts.
unsigned n = std::min(dstParts, srcParts);
for (unsigned i = 0; i < n; i++) {
- WordType low, mid, high, srcPart;
-
- /* [ LOW, HIGH ] = MULTIPLIER * SRC[i] + DST[i] + CARRY.
-
- This cannot overflow, because
-
- (n - 1) * (n - 1) + 2 (n - 1) = (n - 1) * (n + 1)
-
- which is less than n^2. */
-
- srcPart = src[i];
-
+ // [LOW, HIGH] = MULTIPLIER * SRC[i] + DST[i] + CARRY.
+ // This cannot overflow, because:
+ // (n - 1) * (n - 1) + 2 (n - 1) = (n - 1) * (n + 1)
+ // which is less than n^2.
+ WordType srcPart = src[i];
+ WordType low, mid, high;
if (multiplier == 0 || srcPart == 0) {
low = carry;
high = 0;
@@ -2577,14 +2526,14 @@ int APInt::tcMultiplyPart(WordType *dst, const WordType *src,
high++;
low += mid;
- /* Now add carry. */
+ // Now add carry.
if (low + carry < low)
high++;
low += carry;
}
if (add) {
- /* And now DST[i], and store the new low part there. */
+ // And now DST[i], and store the new low part there.
if (low + dst[i] < low)
high++;
dst[i] += low;
@@ -2595,32 +2544,32 @@ int APInt::tcMultiplyPart(WordType *dst, const WordType *src,
}
if (srcParts < dstParts) {
- /* Full multiplication, there is no overflow. */
+ // Full multiplication, there is no overflow.
assert(srcParts + 1 == dstParts);
dst[srcParts] = carry;
return 0;
}
- /* We overflowed if there is carry. */
+ // We overflowed if there is carry.
if (carry)
return 1;
- /* We would overflow if any significant unwritten parts would be
- non-zero. This is true if any remaining src parts are non-zero
- and the multiplier is non-zero. */
+ // We would overflow if any significant unwritten parts would be
+ // non-zero. This is true if any remaining src parts are non-zero
+ // and the multiplier is non-zero.
if (multiplier)
for (unsigned i = dstParts; i < srcParts; i++)
if (src[i])
return 1;
- /* We fitted in the narrow destination. */
+ // We fitted in the narrow destination.
return 0;
}
-/* DST = LHS * RHS, where DST has the same width as the operands and
- is filled with the least significant parts of the result. Returns
- one if overflow occurred, otherwise zero. DST must be disjoint
- from both operands. */
+/// DST = LHS * RHS, where DST has the same width as the operands and
+/// is filled with the least significant parts of the result. Returns
+/// one if overflow occurred, otherwise zero. DST must be disjoint
+/// from both operands.
int APInt::tcMultiply(WordType *dst, const WordType *lhs,
const WordType *rhs, unsigned parts) {
assert(dst != lhs && dst != rhs);
@@ -2640,7 +2589,7 @@ int APInt::tcMultiply(WordType *dst, const WordType *lhs,
void APInt::tcFullMultiply(WordType *dst, const WordType *lhs,
const WordType *rhs, unsigned lhsParts,
unsigned rhsParts) {
- /* Put the narrower number on the LHS for less loops below. */
+  // Put the narrower number on the LHS for fewer loops below.
if (lhsParts > rhsParts)
return tcFullMultiply (dst, rhs, lhs, rhsParts, lhsParts);
@@ -2652,16 +2601,15 @@ void APInt::tcFullMultiply(WordType *dst, const WordType *lhs,
tcMultiplyPart(&dst[i], rhs, lhs[i], 0, rhsParts, rhsParts + 1, true);
}
-/* If RHS is zero LHS and REMAINDER are left unchanged, return one.
- Otherwise set LHS to LHS / RHS with the fractional part discarded,
- set REMAINDER to the remainder, return zero. i.e.
-
- OLD_LHS = RHS * LHS + REMAINDER
-
- SCRATCH is a bignum of the same size as the operands and result for
- use by the routine; its contents need not be initialized and are
- destroyed. LHS, REMAINDER and SCRATCH must be distinct.
-*/
+// If RHS is zero LHS and REMAINDER are left unchanged, return one.
+// Otherwise set LHS to LHS / RHS with the fractional part discarded,
+// set REMAINDER to the remainder, return zero. i.e.
+//
+// OLD_LHS = RHS * LHS + REMAINDER
+//
+// SCRATCH is a bignum of the same size as the operands and result for
+// use by the routine; its contents need not be initialized and are
+// destroyed. LHS, REMAINDER and SCRATCH must be distinct.
int APInt::tcDivide(WordType *lhs, const WordType *rhs,
WordType *remainder, WordType *srhs,
unsigned parts) {
@@ -2680,8 +2628,8 @@ int APInt::tcDivide(WordType *lhs, const WordType *rhs,
tcAssign(remainder, lhs, parts);
tcSet(lhs, 0, parts);
- /* Loop, subtracting SRHS if REMAINDER is greater and adding that to
- the total. */
+ // Loop, subtracting SRHS if REMAINDER is greater and adding that to the
+ // total.
for (;;) {
int compare = tcCompare(remainder, srhs, parts);
if (compare >= 0) {
@@ -2756,31 +2704,7 @@ void APInt::tcShiftRight(WordType *Dst, unsigned Words, unsigned Count) {
std::memset(Dst + WordsToMove, 0, WordShift * APINT_WORD_SIZE);
}
-/* Bitwise and of two bignums. */
-void APInt::tcAnd(WordType *dst, const WordType *rhs, unsigned parts) {
- for (unsigned i = 0; i < parts; i++)
- dst[i] &= rhs[i];
-}
-
-/* Bitwise inclusive or of two bignums. */
-void APInt::tcOr(WordType *dst, const WordType *rhs, unsigned parts) {
- for (unsigned i = 0; i < parts; i++)
- dst[i] |= rhs[i];
-}
-
-/* Bitwise exclusive or of two bignums. */
-void APInt::tcXor(WordType *dst, const WordType *rhs, unsigned parts) {
- for (unsigned i = 0; i < parts; i++)
- dst[i] ^= rhs[i];
-}
-
-/* Complement a bignum in-place. */
-void APInt::tcComplement(WordType *dst, unsigned parts) {
- for (unsigned i = 0; i < parts; i++)
- dst[i] = ~dst[i];
-}
-
-/* Comparison (unsigned) of two bignums. */
+// Comparison (unsigned) of two bignums.
int APInt::tcCompare(const WordType *lhs, const WordType *rhs,
unsigned parts) {
while (parts) {
@@ -2792,23 +2716,6 @@ int APInt::tcCompare(const WordType *lhs, const WordType *rhs,
return 0;
}
-/* Set the least significant BITS bits of a bignum, clear the
- rest. */
-void APInt::tcSetLeastSignificantBits(WordType *dst, unsigned parts,
- unsigned bits) {
- unsigned i = 0;
- while (bits > APINT_BITS_PER_WORD) {
- dst[i++] = ~(WordType) 0;
- bits -= APINT_BITS_PER_WORD;
- }
-
- if (bits)
- dst[i++] = ~(WordType) 0 >> (APINT_BITS_PER_WORD - bits);
-
- while (i < parts)
- dst[i++] = 0;
-}
-
APInt llvm::APIntOps::RoundingUDiv(const APInt &A, const APInt &B,
APInt::Rounding RM) {
// Currently udivrem always rounds down.
@@ -2819,7 +2726,7 @@ APInt llvm::APIntOps::RoundingUDiv(const APInt &A, const APInt &B,
case APInt::Rounding::UP: {
APInt Quo, Rem;
APInt::udivrem(A, B, Quo, Rem);
- if (Rem == 0)
+ if (Rem.isZero())
return Quo;
return Quo + 1;
}
@@ -2834,7 +2741,7 @@ APInt llvm::APIntOps::RoundingSDiv(const APInt &A, const APInt &B,
case APInt::Rounding::UP: {
APInt Quo, Rem;
APInt::sdivrem(A, B, Quo, Rem);
- if (Rem == 0)
+ if (Rem.isZero())
return Quo;
// This algorithm deals with arbitrary rounding mode used by sdivrem.
// We want to check whether the non-integer part of the mathematical value
@@ -2870,7 +2777,7 @@ llvm::APIntOps::SolveQuadraticEquationWrap(APInt A, APInt B, APInt C,
<< "x + " << C << ", rw:" << RangeWidth << '\n');
// Identify 0 as a (non)solution immediately.
- if (C.sextOrTrunc(RangeWidth).isNullValue() ) {
+ if (C.sextOrTrunc(RangeWidth).isZero()) {
LLVM_DEBUG(dbgs() << __func__ << ": zero solution\n");
return APInt(CoeffWidth, 0);
}
@@ -2932,7 +2839,7 @@ llvm::APIntOps::SolveQuadraticEquationWrap(APInt A, APInt B, APInt C,
auto RoundUp = [] (const APInt &V, const APInt &A) -> APInt {
assert(A.isStrictlyPositive());
APInt T = V.abs().urem(A);
- if (T.isNullValue())
+ if (T.isZero())
return V;
return V.isNegative() ? V+T : V+(A-T);
};
@@ -3016,7 +2923,7 @@ llvm::APIntOps::SolveQuadraticEquationWrap(APInt A, APInt B, APInt C,
// can be 0, but cannot be negative.
assert(X.isNonNegative() && "Solution should be non-negative");
- if (!InexactSQ && Rem.isNullValue()) {
+ if (!InexactSQ && Rem.isZero()) {
LLVM_DEBUG(dbgs() << __func__ << ": solution (root): " << X << '\n');
return X;
}
@@ -3032,8 +2939,8 @@ llvm::APIntOps::SolveQuadraticEquationWrap(APInt A, APInt B, APInt C,
APInt VX = (A*X + B)*X + C;
APInt VY = VX + TwoA*X + A + B;
- bool SignChange = VX.isNegative() != VY.isNegative() ||
- VX.isNullValue() != VY.isNullValue();
+ bool SignChange =
+ VX.isNegative() != VY.isNegative() || VX.isZero() != VY.isZero();
// If the sign did not change between X and X+1, X is not a valid solution.
// This could happen when the actual (exact) roots don't have an integer
// between them, so they would both be contained between X and X+1.
@@ -3055,6 +2962,40 @@ llvm::APIntOps::GetMostSignificantDifferentBit(const APInt &A, const APInt &B) {
return A.getBitWidth() - ((A ^ B).countLeadingZeros() + 1);
}
+APInt llvm::APIntOps::ScaleBitMask(const APInt &A, unsigned NewBitWidth) {
+ unsigned OldBitWidth = A.getBitWidth();
+ assert((((OldBitWidth % NewBitWidth) == 0) ||
+ ((NewBitWidth % OldBitWidth) == 0)) &&
+ "One size should be a multiple of the other one. "
+ "Can't do fractional scaling.");
+
+ // Check for matching bitwidths.
+ if (OldBitWidth == NewBitWidth)
+ return A;
+
+ APInt NewA = APInt::getZero(NewBitWidth);
+
+ // Check for null input.
+ if (A.isZero())
+ return NewA;
+
+ if (NewBitWidth > OldBitWidth) {
+ // Repeat bits.
+ unsigned Scale = NewBitWidth / OldBitWidth;
+ for (unsigned i = 0; i != OldBitWidth; ++i)
+ if (A[i])
+ NewA.setBits(i * Scale, (i + 1) * Scale);
+ } else {
+    // Merge bits - if any old bit is set, then set the scale-equivalent new bit.
+ unsigned Scale = OldBitWidth / NewBitWidth;
+ for (unsigned i = 0; i != NewBitWidth; ++i)
+ if (!A.extractBits(Scale, i * Scale).isZero())
+ NewA.setBit(i);
+ }
+
+ return NewA;
+}
+
/// StoreIntToMemory - Fills the StoreBytes bytes of memory starting from Dst
/// with the integer held in IntVal.
void llvm::StoreIntToMemory(const APInt &IntVal, uint8_t *Dst,
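
A minimal usage sketch of the APInt helpers added above (assuming a using-namespace-llvm context; the commented values follow from the ScaleBitMask and nearestLogBase2 code in this hunk):

  APInt Mask(4, 0b1010);
  APInt Wide = APIntOps::ScaleBitMask(Mask, 8);   // 0b11001100: each source bit repeated twice
  APInt Narrow = APIntOps::ScaleBitMask(Mask, 2); // 0b11: each half of Mask contains a set bit
  unsigned Lg = APInt(8, 6).nearestLogBase2();    // 3: logBase2(6) == 2 and bit 1 of 6 is set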
diff --git a/llvm/lib/Support/ARMTargetParser.cpp b/llvm/lib/Support/ARMTargetParser.cpp
index 94b48df27993..4405ed176fe2 100644
--- a/llvm/lib/Support/ARMTargetParser.cpp
+++ b/llvm/lib/Support/ARMTargetParser.cpp
@@ -82,6 +82,10 @@ unsigned ARM::parseArchVersion(StringRef Arch) {
case ArchKind::ARMV8MMainline:
case ArchKind::ARMV8_1MMainline:
return 8;
+ case ArchKind::ARMV9A:
+ case ArchKind::ARMV9_1A:
+ case ArchKind::ARMV9_2A:
+ return 9;
case ArchKind::INVALID:
return 0;
}
@@ -113,6 +117,9 @@ ARM::ProfileKind ARM::parseArchProfile(StringRef Arch) {
case ArchKind::ARMV8_5A:
case ArchKind::ARMV8_6A:
case ArchKind::ARMV8_7A:
+ case ArchKind::ARMV9A:
+ case ArchKind::ARMV9_1A:
+ case ArchKind::ARMV9_2A:
return ProfileKind::A;
case ArchKind::ARMV2:
case ArchKind::ARMV2A:
@@ -158,6 +165,9 @@ StringRef ARM::getArchSynonym(StringRef Arch) {
.Case("v8.6a", "v8.6-a")
.Case("v8.7a", "v8.7-a")
.Case("v8r", "v8-r")
+ .Cases("v9", "v9a", "v9-a")
+ .Case("v9.1a", "v9.1-a")
+ .Case("v9.2a", "v9.2-a")
.Case("v8m.base", "v8-m.base")
.Case("v8m.main", "v8-m.main")
.Case("v8.1m.main", "v8.1-m.main")
@@ -297,7 +307,7 @@ StringRef ARM::getCanonicalArchName(StringRef Arch) {
else if (A.startswith("aarch64")) {
offset = 7;
// AArch64 uses "_be", not "eb" suffix.
- if (A.find("eb") != StringRef::npos)
+ if (A.contains("eb"))
return Error;
if (A.substr(offset, 3) == "_be")
offset += 3;
@@ -323,7 +333,7 @@ StringRef ARM::getCanonicalArchName(StringRef Arch) {
if (A.size() >= 2 && (A[0] != 'v' || !std::isdigit(A[1])))
return Error;
// Can't have an extra 'eb'.
- if (A.find("eb") != StringRef::npos)
+ if (A.contains("eb"))
return Error;
}
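
A quick sketch of the new Armv9 spellings recognized above (assuming llvm/Support/ARMTargetParser.h and a using-namespace-llvm context; only getArchSynonym is exercised here):

  assert(ARM::getArchSynonym("v9a") == "v9-a");
  assert(ARM::getArchSynonym("v9.1a") == "v9.1-a");
  assert(ARM::getArchSynonym("v9.2a") == "v9.2-a");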
diff --git a/llvm/lib/Support/BinaryStreamReader.cpp b/llvm/lib/Support/BinaryStreamReader.cpp
index a0434bdc6115..2fe450db11dd 100644
--- a/llvm/lib/Support/BinaryStreamReader.cpp
+++ b/llvm/lib/Support/BinaryStreamReader.cpp
@@ -72,10 +72,10 @@ Error BinaryStreamReader::readSLEB128(int64_t &Dest) {
}
Error BinaryStreamReader::readCString(StringRef &Dest) {
- uint32_t OriginalOffset = getOffset();
- uint32_t FoundOffset = 0;
+ uint64_t OriginalOffset = getOffset();
+ uint64_t FoundOffset = 0;
while (true) {
- uint32_t ThisOffset = getOffset();
+ uint64_t ThisOffset = getOffset();
ArrayRef<uint8_t> Buffer;
if (auto EC = readLongestContiguousChunk(Buffer))
return EC;
@@ -100,8 +100,8 @@ Error BinaryStreamReader::readCString(StringRef &Dest) {
}
Error BinaryStreamReader::readWideString(ArrayRef<UTF16> &Dest) {
- uint32_t Length = 0;
- uint32_t OriginalOffset = getOffset();
+ uint64_t Length = 0;
+ uint64_t OriginalOffset = getOffset();
const UTF16 *C;
while (true) {
if (auto EC = readObject(C))
@@ -110,7 +110,7 @@ Error BinaryStreamReader::readWideString(ArrayRef<UTF16> &Dest) {
break;
++Length;
}
- uint32_t NewOffset = getOffset();
+ uint64_t NewOffset = getOffset();
setOffset(OriginalOffset);
if (auto EC = readArray(Dest, Length))
@@ -145,7 +145,7 @@ Error BinaryStreamReader::readSubstream(BinarySubstreamRef &Ref,
return readStreamRef(Ref.StreamData, Length);
}
-Error BinaryStreamReader::skip(uint32_t Amount) {
+Error BinaryStreamReader::skip(uint64_t Amount) {
if (Amount > bytesRemaining())
return make_error<BinaryStreamError>(stream_error_code::stream_too_short);
Offset += Amount;
@@ -166,7 +166,7 @@ uint8_t BinaryStreamReader::peek() const {
}
std::pair<BinaryStreamReader, BinaryStreamReader>
-BinaryStreamReader::split(uint32_t Off) const {
+BinaryStreamReader::split(uint64_t Off) const {
assert(getLength() >= Off);
BinaryStreamRef First = Stream.drop_front(Offset);
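
The widened offsets are source-compatible with existing readers; a small sketch, assuming the usual BinaryByteStream setup from llvm/Support/BinaryByteStream.h (not shown in this hunk):

  const uint8_t Raw[] = {'h', 'i', 0, 42};
  BinaryByteStream Stream(ArrayRef<uint8_t>(Raw), llvm::support::little);
  BinaryStreamReader Reader(Stream);
  StringRef Name;
  if (Error E = Reader.readCString(Name)) // Name == "hi" on success
    consumeError(std::move(E));
  uint64_t Left = Reader.bytesRemaining(); // offsets and lengths are now 64-bit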
diff --git a/llvm/lib/Support/BinaryStreamRef.cpp b/llvm/lib/Support/BinaryStreamRef.cpp
index 53e71baad57a..6d79d95e1bf0 100644
--- a/llvm/lib/Support/BinaryStreamRef.cpp
+++ b/llvm/lib/Support/BinaryStreamRef.cpp
@@ -21,15 +21,15 @@ public:
llvm::support::endianness getEndian() const override {
return BBS.getEndian();
}
- Error readBytes(uint32_t Offset, uint32_t Size,
+ Error readBytes(uint64_t Offset, uint64_t Size,
ArrayRef<uint8_t> &Buffer) override {
return BBS.readBytes(Offset, Size, Buffer);
}
- Error readLongestContiguousChunk(uint32_t Offset,
+ Error readLongestContiguousChunk(uint64_t Offset,
ArrayRef<uint8_t> &Buffer) override {
return BBS.readLongestContiguousChunk(Offset, Buffer);
}
- uint32_t getLength() override { return BBS.getLength(); }
+ uint64_t getLength() override { return BBS.getLength(); }
private:
BinaryByteStream BBS;
@@ -44,17 +44,17 @@ public:
llvm::support::endianness getEndian() const override {
return BBS.getEndian();
}
- Error readBytes(uint32_t Offset, uint32_t Size,
+ Error readBytes(uint64_t Offset, uint64_t Size,
ArrayRef<uint8_t> &Buffer) override {
return BBS.readBytes(Offset, Size, Buffer);
}
- Error readLongestContiguousChunk(uint32_t Offset,
+ Error readLongestContiguousChunk(uint64_t Offset,
ArrayRef<uint8_t> &Buffer) override {
return BBS.readLongestContiguousChunk(Offset, Buffer);
}
- uint32_t getLength() override { return BBS.getLength(); }
+ uint64_t getLength() override { return BBS.getLength(); }
- Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Data) override {
+ Error writeBytes(uint64_t Offset, ArrayRef<uint8_t> Data) override {
return BBS.writeBytes(Offset, Data);
}
Error commit() override { return BBS.commit(); }
@@ -66,8 +66,8 @@ private:
BinaryStreamRef::BinaryStreamRef(BinaryStream &Stream)
: BinaryStreamRefBase(Stream) {}
-BinaryStreamRef::BinaryStreamRef(BinaryStream &Stream, uint32_t Offset,
- Optional<uint32_t> Length)
+BinaryStreamRef::BinaryStreamRef(BinaryStream &Stream, uint64_t Offset,
+ Optional<uint64_t> Length)
: BinaryStreamRefBase(Stream, Offset, Length) {}
BinaryStreamRef::BinaryStreamRef(ArrayRef<uint8_t> Data, endianness Endian)
: BinaryStreamRefBase(std::make_shared<ArrayRefImpl>(Data, Endian), 0,
@@ -76,7 +76,7 @@ BinaryStreamRef::BinaryStreamRef(StringRef Data, endianness Endian)
: BinaryStreamRef(makeArrayRef(Data.bytes_begin(), Data.bytes_end()),
Endian) {}
-Error BinaryStreamRef::readBytes(uint32_t Offset, uint32_t Size,
+Error BinaryStreamRef::readBytes(uint64_t Offset, uint64_t Size,
ArrayRef<uint8_t> &Buffer) const {
if (auto EC = checkOffsetForRead(Offset, Size))
return EC;
@@ -84,7 +84,7 @@ Error BinaryStreamRef::readBytes(uint32_t Offset, uint32_t Size,
}
Error BinaryStreamRef::readLongestContiguousChunk(
- uint32_t Offset, ArrayRef<uint8_t> &Buffer) const {
+ uint64_t Offset, ArrayRef<uint8_t> &Buffer) const {
if (auto EC = checkOffsetForRead(Offset, 1))
return EC;
@@ -94,7 +94,7 @@ Error BinaryStreamRef::readLongestContiguousChunk(
// This StreamRef might refer to a smaller window over a larger stream. In
// that case we will have read out more bytes than we should return, because
// we should not read past the end of the current view.
- uint32_t MaxLength = getLength() - Offset;
+ uint64_t MaxLength = getLength() - Offset;
if (Buffer.size() > MaxLength)
Buffer = Buffer.slice(0, MaxLength);
return Error::success();
@@ -104,8 +104,8 @@ WritableBinaryStreamRef::WritableBinaryStreamRef(WritableBinaryStream &Stream)
: BinaryStreamRefBase(Stream) {}
WritableBinaryStreamRef::WritableBinaryStreamRef(WritableBinaryStream &Stream,
- uint32_t Offset,
- Optional<uint32_t> Length)
+ uint64_t Offset,
+ Optional<uint64_t> Length)
: BinaryStreamRefBase(Stream, Offset, Length) {}
WritableBinaryStreamRef::WritableBinaryStreamRef(MutableArrayRef<uint8_t> Data,
@@ -113,8 +113,7 @@ WritableBinaryStreamRef::WritableBinaryStreamRef(MutableArrayRef<uint8_t> Data,
: BinaryStreamRefBase(std::make_shared<MutableArrayRefImpl>(Data, Endian),
0, Data.size()) {}
-
-Error WritableBinaryStreamRef::writeBytes(uint32_t Offset,
+Error WritableBinaryStreamRef::writeBytes(uint64_t Offset,
ArrayRef<uint8_t> Data) const {
if (auto EC = checkOffsetForWrite(Offset, Data.size()))
return EC;
diff --git a/llvm/lib/Support/BinaryStreamWriter.cpp b/llvm/lib/Support/BinaryStreamWriter.cpp
index 986e18da281d..8c9efa0ed9a9 100644
--- a/llvm/lib/Support/BinaryStreamWriter.cpp
+++ b/llvm/lib/Support/BinaryStreamWriter.cpp
@@ -62,7 +62,7 @@ Error BinaryStreamWriter::writeStreamRef(BinaryStreamRef Ref) {
return writeStreamRef(Ref, Ref.getLength());
}
-Error BinaryStreamWriter::writeStreamRef(BinaryStreamRef Ref, uint32_t Length) {
+Error BinaryStreamWriter::writeStreamRef(BinaryStreamRef Ref, uint64_t Length) {
BinaryStreamReader SrcReader(Ref.slice(0, Length));
// This is a bit tricky. If we just call readBytes, we are requiring that it
// return us the entire stream as a contiguous buffer. There is no guarantee
@@ -80,7 +80,7 @@ Error BinaryStreamWriter::writeStreamRef(BinaryStreamRef Ref, uint32_t Length) {
}
std::pair<BinaryStreamWriter, BinaryStreamWriter>
-BinaryStreamWriter::split(uint32_t Off) const {
+BinaryStreamWriter::split(uint64_t Off) const {
assert(getLength() >= Off);
WritableBinaryStreamRef First = Stream.drop_front(Offset);
@@ -93,7 +93,7 @@ BinaryStreamWriter::split(uint32_t Off) const {
}
Error BinaryStreamWriter::padToAlignment(uint32_t Align) {
- uint32_t NewOffset = alignTo(Offset, Align);
+ uint64_t NewOffset = alignTo(Offset, Align);
if (NewOffset > getLength())
return make_error<BinaryStreamError>(stream_error_code::stream_too_short);
while (Offset < NewOffset)
diff --git a/llvm/lib/LTO/Caching.cpp b/llvm/lib/Support/Caching.cpp
index 75a89e729f43..a2fe37a26617 100644
--- a/llvm/lib/LTO/Caching.cpp
+++ b/llvm/lib/Support/Caching.cpp
@@ -1,4 +1,4 @@
-//===-Caching.cpp - LLVM Link Time Optimizer Cache Handling ---------------===//
+//===-Caching.cpp - LLVM Local File Cache ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,18 +6,17 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements the Caching for ThinLTO.
+// This file implements the localCache function, which simplifies creating,
+// adding to, and querying a local file system cache. localCache takes care of
+// periodically pruning older files from the cache using a CachePruningPolicy.
//
//===----------------------------------------------------------------------===//
-#include "llvm/LTO/Caching.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Caching.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/Process.h"
-#include "llvm/Support/raw_ostream.h"
#if !defined(_MSC_VER) && !defined(__MINGW32__)
#include <unistd.h>
@@ -26,14 +25,21 @@
#endif
using namespace llvm;
-using namespace llvm::lto;
-Expected<NativeObjectCache> lto::localCache(StringRef CacheDirectoryPath,
- AddBufferFn AddBuffer) {
- if (std::error_code EC = sys::fs::create_directories(CacheDirectoryPath))
+Expected<FileCache> llvm::localCache(Twine CacheNameRef,
+ Twine TempFilePrefixRef,
+ Twine CacheDirectoryPathRef,
+ AddBufferFn AddBuffer) {
+ if (std::error_code EC = sys::fs::create_directories(CacheDirectoryPathRef))
return errorCodeToError(EC);
- return [=](unsigned Task, StringRef Key) -> AddStreamFn {
+  // Create local copies that are safely captured by copy in the lambdas below.
+ SmallString<64> CacheName, TempFilePrefix, CacheDirectoryPath;
+ CacheNameRef.toVector(CacheName);
+ TempFilePrefixRef.toVector(TempFilePrefix);
+ CacheDirectoryPathRef.toVector(CacheDirectoryPath);
+
+ return [=](unsigned Task, StringRef Key) -> Expected<AddStreamFn> {
// This choice of file name allows the cache to be pruned (see pruneCache()
// in include/llvm/Support/CachePruning.h).
SmallString<64> EntryPath;
@@ -65,12 +71,12 @@ Expected<NativeObjectCache> lto::localCache(StringRef CacheDirectoryPath,
// Since the file is probably being deleted we handle it in the same way as
// if the file did not exist at all.
if (EC != errc::no_such_file_or_directory && EC != errc::permission_denied)
- report_fatal_error(Twine("Failed to open cache file ") + EntryPath +
- ": " + EC.message() + "\n");
+ return createStringError(EC, Twine("Failed to open cache file ") +
+ EntryPath + ": " + EC.message() + "\n");
- // This native object stream is responsible for commiting the resulting
- // file to the cache and calling AddBuffer to add it to the link.
- struct CacheStream : NativeObjectStream {
+    // This file stream is responsible for committing the resulting file to the
+ // cache and calling AddBuffer to add it to the link.
+ struct CacheStream : CachedFileStream {
AddBufferFn AddBuffer;
sys::fs::TempFile TempFile;
std::string EntryPath;
@@ -79,11 +85,14 @@ Expected<NativeObjectCache> lto::localCache(StringRef CacheDirectoryPath,
CacheStream(std::unique_ptr<raw_pwrite_stream> OS, AddBufferFn AddBuffer,
sys::fs::TempFile TempFile, std::string EntryPath,
unsigned Task)
- : NativeObjectStream(std::move(OS)), AddBuffer(std::move(AddBuffer)),
+ : CachedFileStream(std::move(OS)), AddBuffer(std::move(AddBuffer)),
TempFile(std::move(TempFile)), EntryPath(std::move(EntryPath)),
Task(Task) {}
~CacheStream() {
+      // TODO: Manually commit rather than using a non-trivial destructor,
+      // allowing the report_fatal_error calls to be replaced by returning an Error.
+
// Make sure the stream is closed before committing it.
OS.reset();
@@ -131,16 +140,17 @@ Expected<NativeObjectCache> lto::localCache(StringRef CacheDirectoryPath,
}
};
- return [=](size_t Task) -> std::unique_ptr<NativeObjectStream> {
+ return [=](size_t Task) -> Expected<std::unique_ptr<CachedFileStream>> {
// Write to a temporary to avoid race condition
SmallString<64> TempFilenameModel;
- sys::path::append(TempFilenameModel, CacheDirectoryPath, "Thin-%%%%%%.tmp.o");
+ sys::path::append(TempFilenameModel, CacheDirectoryPath,
+ TempFilePrefix + "-%%%%%%.tmp.o");
Expected<sys::fs::TempFile> Temp = sys::fs::TempFile::create(
TempFilenameModel, sys::fs::owner_read | sys::fs::owner_write);
- if (!Temp) {
- errs() << "Error: " << toString(Temp.takeError()) << "\n";
- report_fatal_error("ThinLTO: Can't get a temporary file");
- }
+ if (!Temp)
+ return createStringError(errc::io_error,
+ toString(Temp.takeError()) + ": " + CacheName +
+ ": Can't get a temporary file");
// This CacheStream will move the temporary file into the cache when done.
return std::make_unique<CacheStream>(
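
A sketch of driving the relocated cache API; the localCache signature and the FileCache/AddStreamFn callable shapes are taken from this hunk, while the exact AddBufferFn parameter list is an assumption carried over from the LTO-era callback:

  Expected<FileCache> CacheOrErr =
      localCache("MyToolCache", "Tmp", "/tmp/mytool.cache",
                 [](unsigned Task, std::unique_ptr<MemoryBuffer> MB) {
                   // Consume the finished, cached artifact for this task.
                 });
  if (!CacheOrErr)
    report_fatal_error(Twine(toString(CacheOrErr.takeError())));
  FileCache Cache = std::move(*CacheOrErr);
  Expected<AddStreamFn> AddStream = Cache(/*Task=*/0, /*Key=*/"0123abcd");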
diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index 4ae3ad4c2453..e64934aa90cc 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -1321,12 +1321,20 @@ bool cl::ParseCommandLineOptions(int argc, const char *const *argv,
Errs, LongOptionsUseDoubleDash);
}
+/// Reset all options at least once, so that we can parse different options.
void CommandLineParser::ResetAllOptionOccurrences() {
- // So that we can parse different command lines multiple times in succession
- // we reset all option values to look like they have never been seen before.
+ // Reset all option values to look like they have never been seen before.
+  // Options might be reset twice (they can be referenced in both OptionsMap
+  // and one of the other members), but that does no harm.
for (auto *SC : RegisteredSubCommands) {
for (auto &O : SC->OptionsMap)
O.second->reset();
+ for (Option *O : SC->PositionalOpts)
+ O->reset();
+ for (Option *O : SC->SinkOpts)
+ O->reset();
+ if (SC->ConsumeAfterOpt)
+ SC->ConsumeAfterOpt->reset();
}
}
@@ -2633,6 +2641,7 @@ void cl::AddExtraVersionPrinter(VersionPrinterTy func) {
}
StringMap<Option *> &cl::getRegisteredOptions(SubCommand &Sub) {
+ initCommonOptions();
auto &Subs = GlobalParser->RegisteredSubCommands;
(void)Subs;
assert(is_contained(Subs, &Sub));
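
With positional, sink, and consume-after options now reset as well, re-parsing in the same process becomes straightforward; a sketch (the argc/argv pairs are placeholders):

  cl::ParseCommandLineOptions(Argc1, Argv1, "first pass");
  cl::ResetAllOptionOccurrences(); // now also clears positional/sink/consume-after options
  cl::ParseCommandLineOptions(Argc2, Argv2, "second pass");
  auto &Opts = cl::getRegisteredOptions(*cl::TopLevelSubCommand); // common options are registered first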
diff --git a/llvm/lib/Support/CrashRecoveryContext.cpp b/llvm/lib/Support/CrashRecoveryContext.cpp
index 433d99df5932..b6aaf373a522 100644
--- a/llvm/lib/Support/CrashRecoveryContext.cpp
+++ b/llvm/lib/Support/CrashRecoveryContext.cpp
@@ -428,8 +428,7 @@ bool CrashRecoveryContext::RunSafely(function_ref<void()> Fn) {
#endif // !_MSC_VER
-LLVM_ATTRIBUTE_NORETURN
-void CrashRecoveryContext::HandleExit(int RetCode) {
+[[noreturn]] void CrashRecoveryContext::HandleExit(int RetCode) {
#if defined(_WIN32)
// SEH and VEH
::RaiseException(0xE0000000 | RetCode, 0, 0, NULL);
diff --git a/llvm/lib/Support/DebugOptions.h b/llvm/lib/Support/DebugOptions.h
index 4d5250649f6a..75e557d7d8d7 100644
--- a/llvm/lib/Support/DebugOptions.h
+++ b/llvm/lib/Support/DebugOptions.h
@@ -26,4 +26,4 @@ void initWithColorOptions();
void initDebugOptions();
void initRandomSeedOptions();
-} // namespace llvm \ No newline at end of file
+} // namespace llvm
diff --git a/llvm/lib/Support/DivisionByConstantInfo.cpp b/llvm/lib/Support/DivisionByConstantInfo.cpp
new file mode 100644
index 000000000000..077629670e40
--- /dev/null
+++ b/llvm/lib/Support/DivisionByConstantInfo.cpp
@@ -0,0 +1,107 @@
+//===----- DivisionByConstantInfo.cpp - division by constant -*- C++ -*----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file implements support for optimizing divisions by a constant.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/DivisionByConstantInfo.h"
+
+using namespace llvm;
+
+/// Calculate the magic numbers required to implement a signed integer division
+/// by a constant as a sequence of multiplies, adds and shifts. Requires that
+/// the divisor not be 0, 1, or -1. Taken from "Hacker's Delight", Henry S.
+/// Warren, Jr., Chapter 10.
+SignedDivisionByConstantInfo SignedDivisionByConstantInfo::get(const APInt &D) {
+ unsigned P;
+ APInt AD, ANC, Delta, Q1, R1, Q2, R2, T;
+ APInt SignedMin = APInt::getSignedMinValue(D.getBitWidth());
+ struct SignedDivisionByConstantInfo Retval;
+
+ AD = D.abs();
+ T = SignedMin + (D.lshr(D.getBitWidth() - 1));
+ ANC = T - 1 - T.urem(AD); // absolute value of NC
+ P = D.getBitWidth() - 1; // initialize P
+ Q1 = SignedMin.udiv(ANC); // initialize Q1 = 2P/abs(NC)
+ R1 = SignedMin - Q1 * ANC; // initialize R1 = rem(2P,abs(NC))
+ Q2 = SignedMin.udiv(AD); // initialize Q2 = 2P/abs(D)
+ R2 = SignedMin - Q2 * AD; // initialize R2 = rem(2P,abs(D))
+ do {
+ P = P + 1;
+ Q1 = Q1 << 1; // update Q1 = 2P/abs(NC)
+ R1 = R1 << 1; // update R1 = rem(2P/abs(NC))
+ if (R1.uge(ANC)) { // must be unsigned comparison
+ Q1 = Q1 + 1;
+ R1 = R1 - ANC;
+ }
+ Q2 = Q2 << 1; // update Q2 = 2P/abs(D)
+ R2 = R2 << 1; // update R2 = rem(2P/abs(D))
+ if (R2.uge(AD)) { // must be unsigned comparison
+ Q2 = Q2 + 1;
+ R2 = R2 - AD;
+ }
+ Delta = AD - R2;
+ } while (Q1.ult(Delta) || (Q1 == Delta && R1 == 0));
+
+ Retval.Magic = Q2 + 1;
+ if (D.isNegative())
+ Retval.Magic = -Retval.Magic; // resulting magic number
+ Retval.ShiftAmount = P - D.getBitWidth(); // resulting shift
+ return Retval;
+}
+
+/// Calculate the magic numbers required to implement an unsigned integer
+/// division by a constant as a sequence of multiplies, adds and shifts.
+/// Requires that the divisor not be 0. Taken from "Hacker's Delight", Henry
+/// S. Warren, Jr., chapter 10.
+/// LeadingZeros can be used to simplify the calculation if the upper bits
+/// of the divided value are known zero.
+UnsignedDivisonByConstantInfo
+UnsignedDivisonByConstantInfo::get(const APInt &D, unsigned LeadingZeros) {
+ unsigned P;
+ APInt NC, Delta, Q1, R1, Q2, R2;
+ struct UnsignedDivisonByConstantInfo Retval;
+ Retval.IsAdd = 0; // initialize "add" indicator
+ APInt AllOnes = APInt::getAllOnes(D.getBitWidth()).lshr(LeadingZeros);
+ APInt SignedMin = APInt::getSignedMinValue(D.getBitWidth());
+ APInt SignedMax = APInt::getSignedMaxValue(D.getBitWidth());
+
+ NC = AllOnes - (AllOnes - D).urem(D);
+ P = D.getBitWidth() - 1; // initialize P
+ Q1 = SignedMin.udiv(NC); // initialize Q1 = 2P/NC
+ R1 = SignedMin - Q1 * NC; // initialize R1 = rem(2P,NC)
+ Q2 = SignedMax.udiv(D); // initialize Q2 = (2P-1)/D
+ R2 = SignedMax - Q2 * D; // initialize R2 = rem((2P-1),D)
+ do {
+ P = P + 1;
+ if (R1.uge(NC - R1)) {
+ Q1 = Q1 + Q1 + 1; // update Q1
+ R1 = R1 + R1 - NC; // update R1
+ } else {
+ Q1 = Q1 + Q1; // update Q1
+ R1 = R1 + R1; // update R1
+ }
+ if ((R2 + 1).uge(D - R2)) {
+ if (Q2.uge(SignedMax))
+ Retval.IsAdd = 1;
+ Q2 = Q2 + Q2 + 1; // update Q2
+ R2 = R2 + R2 + 1 - D; // update R2
+ } else {
+ if (Q2.uge(SignedMin))
+ Retval.IsAdd = 1;
+ Q2 = Q2 + Q2; // update Q2
+ R2 = R2 + R2 + 1; // update R2
+ }
+ Delta = D - 1 - R2;
+ } while (P < D.getBitWidth() * 2 &&
+ (Q1.ult(Delta) || (Q1 == Delta && R1 == 0)));
+ Retval.Magic = Q2 + 1; // resulting magic number
+ Retval.ShiftAmount = P - D.getBitWidth(); // resulting shift
+ return Retval;
+}
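
These two entry points take over from the removed APInt::magic()/magicu(); a usage sketch with an arbitrary divisor (field names as declared in llvm/Support/DivisionByConstantInfo.h):

  APInt D(32, 7);
  SignedDivisionByConstantInfo Signed = SignedDivisionByConstantInfo::get(D);
  // Signed.Magic and Signed.ShiftAmount drive the multiply+shift lowering of x / 7.
  UnsignedDivisonByConstantInfo Unsigned =
      UnsignedDivisonByConstantInfo::get(D, /*LeadingZeros=*/0);
  // Unsigned.Magic, Unsigned.ShiftAmount and Unsigned.IsAdd cover the unsigned case.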
diff --git a/llvm/lib/Support/Error.cpp b/llvm/lib/Support/Error.cpp
index e7ab4387dfd1..8bfc8ee7a8cc 100644
--- a/llvm/lib/Support/Error.cpp
+++ b/llvm/lib/Support/Error.cpp
@@ -80,8 +80,11 @@ std::error_code inconvertibleErrorCode() {
}
std::error_code FileError::convertToErrorCode() const {
- return std::error_code(static_cast<int>(ErrorErrorCode::FileError),
- *ErrorErrorCat);
+ std::error_code NestedEC = Err->convertToErrorCode();
+ if (NestedEC == inconvertibleErrorCode())
+ return std::error_code(static_cast<int>(ErrorErrorCode::FileError),
+ *ErrorErrorCat);
+ return NestedEC;
}
Error errorCodeToError(std::error_code EC) {
@@ -96,7 +99,7 @@ std::error_code errorToErrorCode(Error Err) {
EC = EI.convertToErrorCode();
});
if (EC == inconvertibleErrorCode())
- report_fatal_error(EC.message());
+ report_fatal_error(Twine(EC.message()));
return EC;
}
@@ -144,7 +147,7 @@ void report_fatal_error(Error Err, bool GenCrashDiag) {
raw_string_ostream ErrStream(ErrMsg);
logAllUnhandledErrors(std::move(Err), ErrStream);
}
- report_fatal_error(ErrMsg);
+ report_fatal_error(Twine(ErrMsg));
}
} // end namespace llvm
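
The FileError change lets a wrapped error code survive the round trip through errorToErrorCode; a sketch (file name and error value chosen for illustration):

  Error E = createFileError("config.json",
                            errorCodeToError(make_error_code(errc::no_such_file_or_directory)));
  std::error_code EC = errorToErrorCode(std::move(E));
  // EC now reflects no_such_file_or_directory instead of the opaque FileError code.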
diff --git a/llvm/lib/Support/ErrorHandling.cpp b/llvm/lib/Support/ErrorHandling.cpp
index ce6344284f06..80c0e00439a5 100644
--- a/llvm/lib/Support/ErrorHandling.cpp
+++ b/llvm/lib/Support/ErrorHandling.cpp
@@ -83,10 +83,6 @@ void llvm::report_fatal_error(const char *Reason, bool GenCrashDiag) {
report_fatal_error(Twine(Reason), GenCrashDiag);
}
-void llvm::report_fatal_error(const std::string &Reason, bool GenCrashDiag) {
- report_fatal_error(Twine(Reason), GenCrashDiag);
-}
-
void llvm::report_fatal_error(StringRef Reason, bool GenCrashDiag) {
report_fatal_error(Twine(Reason), GenCrashDiag);
}
@@ -105,7 +101,7 @@ void llvm::report_fatal_error(const Twine &Reason, bool GenCrashDiag) {
}
if (handler) {
- handler(handlerData, Reason.str(), GenCrashDiag);
+ handler(handlerData, Reason.str().c_str(), GenCrashDiag);
} else {
// Blast the result out to stderr. We don't try hard to make sure this
// succeeds (e.g. handling EINTR) and we can't use errs() here because
@@ -218,11 +214,11 @@ void llvm::llvm_unreachable_internal(const char *msg, const char *file,
#endif
}
-static void bindingsErrorHandler(void *user_data, const std::string& reason,
+static void bindingsErrorHandler(void *user_data, const char *reason,
bool gen_crash_diag) {
LLVMFatalErrorHandler handler =
LLVM_EXTENSION reinterpret_cast<LLVMFatalErrorHandler>(user_data);
- handler(reason.c_str());
+ handler(reason);
}
void LLVMInstallFatalErrorHandler(LLVMFatalErrorHandler Handler) {
@@ -247,7 +243,10 @@ std::error_code llvm::mapWindowsError(unsigned EV) {
switch (EV) {
MAP_ERR_TO_COND(ERROR_ACCESS_DENIED, permission_denied);
MAP_ERR_TO_COND(ERROR_ALREADY_EXISTS, file_exists);
+ MAP_ERR_TO_COND(ERROR_BAD_NETPATH, no_such_file_or_directory);
+ MAP_ERR_TO_COND(ERROR_BAD_PATHNAME, no_such_file_or_directory);
MAP_ERR_TO_COND(ERROR_BAD_UNIT, no_such_device);
+ MAP_ERR_TO_COND(ERROR_BROKEN_PIPE, broken_pipe);
MAP_ERR_TO_COND(ERROR_BUFFER_OVERFLOW, filename_too_long);
MAP_ERR_TO_COND(ERROR_BUSY, device_or_resource_busy);
MAP_ERR_TO_COND(ERROR_BUSY_DRIVE, device_or_resource_busy);
@@ -269,18 +268,20 @@ std::error_code llvm::mapWindowsError(unsigned EV) {
MAP_ERR_TO_COND(ERROR_INVALID_FUNCTION, function_not_supported);
MAP_ERR_TO_COND(ERROR_INVALID_HANDLE, invalid_argument);
MAP_ERR_TO_COND(ERROR_INVALID_NAME, invalid_argument);
+ MAP_ERR_TO_COND(ERROR_INVALID_PARAMETER, invalid_argument);
MAP_ERR_TO_COND(ERROR_LOCK_VIOLATION, no_lock_available);
MAP_ERR_TO_COND(ERROR_LOCKED, no_lock_available);
MAP_ERR_TO_COND(ERROR_NEGATIVE_SEEK, invalid_argument);
MAP_ERR_TO_COND(ERROR_NOACCESS, permission_denied);
MAP_ERR_TO_COND(ERROR_NOT_ENOUGH_MEMORY, not_enough_memory);
MAP_ERR_TO_COND(ERROR_NOT_READY, resource_unavailable_try_again);
+ MAP_ERR_TO_COND(ERROR_NOT_SUPPORTED, not_supported);
MAP_ERR_TO_COND(ERROR_OPEN_FAILED, io_error);
MAP_ERR_TO_COND(ERROR_OPEN_FILES, device_or_resource_busy);
MAP_ERR_TO_COND(ERROR_OUTOFMEMORY, not_enough_memory);
MAP_ERR_TO_COND(ERROR_PATH_NOT_FOUND, no_such_file_or_directory);
- MAP_ERR_TO_COND(ERROR_BAD_NETPATH, no_such_file_or_directory);
MAP_ERR_TO_COND(ERROR_READ_FAULT, io_error);
+ MAP_ERR_TO_COND(ERROR_REPARSE_TAG_INVALID, invalid_argument);
MAP_ERR_TO_COND(ERROR_RETRY, resource_unavailable_try_again);
MAP_ERR_TO_COND(ERROR_SEEK, io_error);
MAP_ERR_TO_COND(ERROR_SHARING_VIOLATION, permission_denied);
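
Handlers written against the old std::string-based signature need a small update; a sketch assuming the updated fatal_error_handler_t typedef that this change implies:

  static void MyFatalHandler(void *UserData, const char *Reason, bool GenCrashDiag) {
    fprintf(stderr, "llvm fatal error: %s\n", Reason);
  }
  // ... during tool initialization:
  install_fatal_error_handler(MyFatalHandler);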
diff --git a/llvm/lib/Support/ExtensibleRTTI.cpp b/llvm/lib/Support/ExtensibleRTTI.cpp
index 1c98d1bb8feb..a6a5c196fb35 100644
--- a/llvm/lib/Support/ExtensibleRTTI.cpp
+++ b/llvm/lib/Support/ExtensibleRTTI.cpp
@@ -1,9 +1,8 @@
//===----- lib/Support/ExtensibleRTTI.cpp - ExtensibleRTTI utilities ------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Support/FileUtilities.cpp b/llvm/lib/Support/FileUtilities.cpp
index e4a86bb69de4..dbe28e56b2c3 100644
--- a/llvm/lib/Support/FileUtilities.cpp
+++ b/llvm/lib/Support/FileUtilities.cpp
@@ -300,8 +300,7 @@ llvm::Error llvm::writeFileAtomically(
std::function<llvm::Error(llvm::raw_ostream &)> Writer) {
SmallString<128> GeneratedUniqPath;
int TempFD;
- if (sys::fs::createUniqueFile(TempPathModel.str(), TempFD,
- GeneratedUniqPath)) {
+ if (sys::fs::createUniqueFile(TempPathModel, TempFD, GeneratedUniqPath)) {
return llvm::make_error<AtomicFileWriteError>(
atomic_write_error::failed_to_create_uniq_file);
}
@@ -319,8 +318,7 @@ llvm::Error llvm::writeFileAtomically(
atomic_write_error::output_stream_error);
}
- if (sys::fs::rename(/*from=*/GeneratedUniqPath.c_str(),
- /*to=*/FinalPath.str().c_str())) {
+ if (sys::fs::rename(/*from=*/GeneratedUniqPath, /*to=*/FinalPath)) {
return llvm::make_error<AtomicFileWriteError>(
atomic_write_error::failed_to_rename_temp_file);
}
diff --git a/llvm/lib/Support/GraphWriter.cpp b/llvm/lib/Support/GraphWriter.cpp
index b41869aba95f..696e6b7a99d8 100644
--- a/llvm/lib/Support/GraphWriter.cpp
+++ b/llvm/lib/Support/GraphWriter.cpp
@@ -23,11 +23,12 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
-#include <system_error>
#include <string>
+#include <system_error>
#include <vector>
using namespace llvm;
@@ -94,11 +95,8 @@ StringRef llvm::DOT::getColorString(unsigned ColorNumber) {
static std::string replaceIllegalFilenameChars(std::string Filename,
const char ReplacementChar) {
-#ifdef _WIN32
- std::string IllegalChars = "\\/:?\"<>|";
-#else
- std::string IllegalChars = "/";
-#endif
+ std::string IllegalChars =
+ is_style_windows(sys::path::Style::native) ? "\\/:?\"<>|" : "/";
for (char IllegalChar : IllegalChars) {
std::replace(Filename.begin(), Filename.end(), IllegalChar,
diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp
index f873ff06f1f7..7b14616f6fea 100644
--- a/llvm/lib/Support/Host.cpp
+++ b/llvm/lib/Support/Host.cpp
@@ -772,6 +772,22 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
*Subtype = X86::INTEL_COREI7_ICELAKE_CLIENT;
break;
+ // Tigerlake:
+ case 0x8c:
+ case 0x8d:
+ CPU = "tigerlake";
+ *Type = X86::INTEL_COREI7;
+ *Subtype = X86::INTEL_COREI7_TIGERLAKE;
+ break;
+
+ // Alderlake:
+ case 0x97:
+ case 0x9a:
+ CPU = "alderlake";
+ *Type = X86::INTEL_COREI7;
+ *Subtype = X86::INTEL_COREI7_ALDERLAKE;
+ break;
+
// Icelake Xeon:
case 0x6a:
case 0x6c:
@@ -1055,8 +1071,10 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
setFeature(X86::FEATURE_FMA);
if ((ECX >> 19) & 1)
setFeature(X86::FEATURE_SSE4_1);
- if ((ECX >> 20) & 1)
+ if ((ECX >> 20) & 1) {
setFeature(X86::FEATURE_SSE4_2);
+ setFeature(X86::FEATURE_CRC32);
+ }
if ((ECX >> 23) & 1)
setFeature(X86::FEATURE_POPCNT);
if ((ECX >> 25) & 1)
@@ -1338,6 +1356,16 @@ StringRef sys::getHostCPUName() {
return "generic";
}
}
+#elif defined(__riscv)
+StringRef sys::getHostCPUName() {
+#if __riscv_xlen == 64
+ return "generic-rv64";
+#elif __riscv_xlen == 32
+ return "generic-rv32";
+#else
+#error "Unhandled value of __riscv_xlen"
+#endif
+}
#else
StringRef sys::getHostCPUName() { return "generic"; }
namespace llvm {
@@ -1502,6 +1530,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["cx16"] = (ECX >> 13) & 1;
Features["sse4.1"] = (ECX >> 19) & 1;
Features["sse4.2"] = (ECX >> 20) & 1;
+ Features["crc32"] = Features["sse4.2"];
Features["movbe"] = (ECX >> 22) & 1;
Features["popcnt"] = (ECX >> 23) & 1;
Features["aes"] = (ECX >> 25) & 1;
@@ -1617,6 +1646,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
// For more info, see X86 ISA docs.
Features["pconfig"] = HasLeaf7 && ((EDX >> 18) & 1);
Features["amx-bf16"] = HasLeaf7 && ((EDX >> 22) & 1) && HasAMXSave;
+ Features["avx512fp16"] = HasLeaf7 && ((EDX >> 23) & 1) && HasAVX512Save;
Features["amx-tile"] = HasLeaf7 && ((EDX >> 24) & 1) && HasAMXSave;
Features["amx-int8"] = HasLeaf7 && ((EDX >> 25) & 1) && HasAMXSave;
bool HasLeaf7Subleaf1 =
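
The new feature keys are visible through the usual host-detection entry points; a sketch:

  StringMap<bool> Features;
  if (sys::getHostCPUFeatures(Features)) {
    bool HasCRC32 = Features.lookup("crc32");    // mirrors sse4.2 on x86
    bool HasFP16 = Features.lookup("avx512fp16");
    (void)HasCRC32;
    (void)HasFP16;
  }
  outs() << "host cpu: " << sys::getHostCPUName() << "\n"; // "generic-rv64" on 64-bit RISC-V hosts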
diff --git a/llvm/lib/Support/JSON.cpp b/llvm/lib/Support/JSON.cpp
index dbfd673553f4..17b36ed51850 100644
--- a/llvm/lib/Support/JSON.cpp
+++ b/llvm/lib/Support/JSON.cpp
@@ -109,6 +109,7 @@ void Value::copyFrom(const Value &M) {
case T_Boolean:
case T_Double:
case T_Integer:
+ case T_UINT64:
memcpy(&Union, &M.Union, sizeof(Union));
break;
case T_StringRef:
@@ -133,6 +134,7 @@ void Value::moveFrom(const Value &&M) {
case T_Boolean:
case T_Double:
case T_Integer:
+ case T_UINT64:
memcpy(&Union, &M.Union, sizeof(Union));
break;
case T_StringRef:
@@ -159,6 +161,7 @@ void Value::destroy() {
case T_Boolean:
case T_Double:
case T_Integer:
+ case T_UINT64:
break;
case T_StringRef:
as<StringRef>().~StringRef();
@@ -750,6 +753,8 @@ void llvm::json::OStream::value(const Value &V) {
valueBegin();
if (V.Type == Value::T_Integer)
OS << *V.getAsInteger();
+ else if (V.Type == Value::T_UINT64)
+ OS << *V.getAsUINT64();
else
OS << format("%.*g", std::numeric_limits<double>::max_digits10,
*V.getAsNumber());
diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp
index d997bd85f1e0..90483817c302 100644
--- a/llvm/lib/Support/KnownBits.cpp
+++ b/llvm/lib/Support/KnownBits.cpp
@@ -404,7 +404,7 @@ KnownBits KnownBits::abs(bool IntMinIsPoison) const {
// We only know that the absolute value's MSB will be zero if INT_MIN is
// poison, or there is a set bit that isn't the sign bit (otherwise it could
// be INT_MIN).
- if (IntMinIsPoison || (!One.isNullValue() && !One.isMinSignedValue()))
+ if (IntMinIsPoison || (!One.isZero() && !One.isMinSignedValue()))
KnownAbs.Zero.setSignBit();
// FIXME: Handle known negative input?
@@ -412,10 +412,13 @@ KnownBits KnownBits::abs(bool IntMinIsPoison) const {
return KnownAbs;
}
-KnownBits KnownBits::mul(const KnownBits &LHS, const KnownBits &RHS) {
+KnownBits KnownBits::mul(const KnownBits &LHS, const KnownBits &RHS,
+ bool SelfMultiply) {
unsigned BitWidth = LHS.getBitWidth();
assert(BitWidth == RHS.getBitWidth() && !LHS.hasConflict() &&
!RHS.hasConflict() && "Operand mismatch");
+ assert((!SelfMultiply || (LHS.One == RHS.One && LHS.Zero == RHS.Zero)) &&
+ "Self multiplication knownbits mismatch");
// Compute a conservative estimate for high known-0 bits.
unsigned LeadZ =
@@ -489,6 +492,14 @@ KnownBits KnownBits::mul(const KnownBits &LHS, const KnownBits &RHS) {
Res.Zero.setHighBits(LeadZ);
Res.Zero |= (~BottomKnown).getLoBits(ResultBitsKnown);
Res.One = BottomKnown.getLoBits(ResultBitsKnown);
+
+ // If we're self-multiplying then bit[1] is guaranteed to be zero.
+ if (SelfMultiply && BitWidth > 1) {
+ assert(Res.One[1] == 0 &&
+ "Self-multiplication failed Quadratic Reciprocity!");
+ Res.Zero.setBit(1);
+ }
+
return Res;
}
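
The SelfMultiply flag encodes that squares are congruent to 0 or 1 mod 4, so bit 1 of x*x is always clear; a sketch:

  KnownBits X(8); // nothing known about X
  KnownBits Sq = KnownBits::mul(X, X, /*SelfMultiply=*/true);
  // Sq.Zero[1] is set even though X is fully unknown.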
diff --git a/llvm/lib/Support/LockFileManager.cpp b/llvm/lib/Support/LockFileManager.cpp
index a2b56ab295c4..5fd52999adb5 100644
--- a/llvm/lib/Support/LockFileManager.cpp
+++ b/llvm/lib/Support/LockFileManager.cpp
@@ -35,7 +35,7 @@
#include <unistd.h>
#endif
-#if defined(__APPLE__) && defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && (__MAC_OS_X_VERSION_MIN_REQUIRED > 1050)
+#if defined(__APPLE__) && defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ > 1050)
#define USE_OSX_GETHOSTUUID 1
#else
#define USE_OSX_GETHOSTUUID 0
diff --git a/llvm/lib/Support/MD5.cpp b/llvm/lib/Support/MD5.cpp
index 5e0b076f176e..9dceb4d418cd 100644
--- a/llvm/lib/Support/MD5.cpp
+++ b/llvm/lib/Support/MD5.cpp
@@ -67,11 +67,11 @@
// SET reads 4 input bytes in little-endian byte order and stores them
// in a properly aligned word in host byte order.
#define SET(n) \
- (block[(n)] = \
- (MD5_u32plus) ptr[(n) * 4] | ((MD5_u32plus) ptr[(n) * 4 + 1] << 8) | \
- ((MD5_u32plus) ptr[(n) * 4 + 2] << 16) | \
- ((MD5_u32plus) ptr[(n) * 4 + 3] << 24))
-#define GET(n) (block[(n)])
+ (InternalState.block[(n)] = (MD5_u32plus)ptr[(n)*4] | \
+ ((MD5_u32plus)ptr[(n)*4 + 1] << 8) | \
+ ((MD5_u32plus)ptr[(n)*4 + 2] << 16) | \
+ ((MD5_u32plus)ptr[(n)*4 + 3] << 24))
+#define GET(n) (InternalState.block[(n)])
using namespace llvm;
@@ -85,10 +85,10 @@ const uint8_t *MD5::body(ArrayRef<uint8_t> Data) {
ptr = Data.data();
- a = this->a;
- b = this->b;
- c = this->c;
- d = this->d;
+ a = InternalState.a;
+ b = InternalState.b;
+ c = InternalState.c;
+ d = InternalState.d;
do {
saved_a = a;
@@ -176,10 +176,10 @@ const uint8_t *MD5::body(ArrayRef<uint8_t> Data) {
ptr += 64;
} while (Size -= 64);
- this->a = a;
- this->b = b;
- this->c = c;
- this->d = d;
+ InternalState.a = a;
+ InternalState.b = b;
+ InternalState.c = c;
+ InternalState.d = d;
return ptr;
}
@@ -193,10 +193,10 @@ void MD5::update(ArrayRef<uint8_t> Data) {
const uint8_t *Ptr = Data.data();
unsigned long Size = Data.size();
- saved_lo = lo;
- if ((lo = (saved_lo + Size) & 0x1fffffff) < saved_lo)
- hi++;
- hi += Size >> 29;
+ saved_lo = InternalState.lo;
+ if ((InternalState.lo = (saved_lo + Size) & 0x1fffffff) < saved_lo)
+ InternalState.hi++;
+ InternalState.hi += Size >> 29;
used = saved_lo & 0x3f;
@@ -204,14 +204,14 @@ void MD5::update(ArrayRef<uint8_t> Data) {
free = 64 - used;
if (Size < free) {
- memcpy(&buffer[used], Ptr, Size);
+ memcpy(&InternalState.buffer[used], Ptr, Size);
return;
}
- memcpy(&buffer[used], Ptr, free);
+ memcpy(&InternalState.buffer[used], Ptr, free);
Ptr = Ptr + free;
Size -= free;
- body(makeArrayRef(buffer, 64));
+ body(makeArrayRef(InternalState.buffer, 64));
}
if (Size >= 64) {
@@ -219,7 +219,7 @@ void MD5::update(ArrayRef<uint8_t> Data) {
Size &= 0x3f;
}
- memcpy(buffer, Ptr, Size);
+ memcpy(InternalState.buffer, Ptr, Size);
}
/// Add the bytes in the StringRef \p Str to the hash.
@@ -235,31 +235,48 @@ void MD5::update(StringRef Str) {
void MD5::final(MD5Result &Result) {
unsigned long used, free;
- used = lo & 0x3f;
+ used = InternalState.lo & 0x3f;
- buffer[used++] = 0x80;
+ InternalState.buffer[used++] = 0x80;
free = 64 - used;
if (free < 8) {
- memset(&buffer[used], 0, free);
- body(makeArrayRef(buffer, 64));
+ memset(&InternalState.buffer[used], 0, free);
+ body(makeArrayRef(InternalState.buffer, 64));
used = 0;
free = 64;
}
- memset(&buffer[used], 0, free - 8);
+ memset(&InternalState.buffer[used], 0, free - 8);
- lo <<= 3;
- support::endian::write32le(&buffer[56], lo);
- support::endian::write32le(&buffer[60], hi);
+ InternalState.lo <<= 3;
+ support::endian::write32le(&InternalState.buffer[56], InternalState.lo);
+ support::endian::write32le(&InternalState.buffer[60], InternalState.hi);
- body(makeArrayRef(buffer, 64));
+ body(makeArrayRef(InternalState.buffer, 64));
- support::endian::write32le(&Result[0], a);
- support::endian::write32le(&Result[4], b);
- support::endian::write32le(&Result[8], c);
- support::endian::write32le(&Result[12], d);
+ support::endian::write32le(&Result[0], InternalState.a);
+ support::endian::write32le(&Result[4], InternalState.b);
+ support::endian::write32le(&Result[8], InternalState.c);
+ support::endian::write32le(&Result[12], InternalState.d);
+}
+
+StringRef MD5::final() {
+ final(Result);
+ return StringRef(reinterpret_cast<char *>(Result.Bytes.data()),
+ Result.Bytes.size());
+}
+
+StringRef MD5::result() {
+ auto StateToRestore = InternalState;
+
+ auto Hash = final();
+
+ // Restore the state
+ InternalState = StateToRestore;
+
+ return Hash;
}
SmallString<32> MD5::MD5Result::digest() const {
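The InternalState split is what makes the new MD5::result() possible: it finalizes a copy of the running state, so a caller can peek at the digest of the bytes seen so far and keep hashing, while final() remains terminal. A hedged usage sketch against the interface visible in this patch (the one-argument final() and MD5Result::digest() are pre-existing API; output text here is illustrative):

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/MD5.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  MD5 Hash;
  Hash.update("hello ");

  // result() finalizes a copy of the internal state, so hashing can continue.
  StringRef Intermediate = Hash.result();
  outs() << "intermediate digest is " << Intermediate.size() << " raw bytes\n";

  Hash.update("world");

  // final() is terminal: it pads and flushes the real internal state.
  MD5::MD5Result Digest;
  Hash.final(Digest);
  outs() << "md5(\"hello world\") = " << Digest.digest().str() << "\n";
  return 0;
}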
diff --git a/llvm/lib/Support/MSP430AttributeParser.cpp b/llvm/lib/Support/MSP430AttributeParser.cpp
new file mode 100644
index 000000000000..a9948a158fc0
--- /dev/null
+++ b/llvm/lib/Support/MSP430AttributeParser.cpp
@@ -0,0 +1,53 @@
+//===-- MSP430AttributeParser.cpp - MSP430 Attribute Parser ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/MSP430AttributeParser.h"
+#include "llvm/ADT/StringExtras.h"
+
+using namespace llvm;
+using namespace llvm::MSP430Attrs;
+
+constexpr std::array<MSP430AttributeParser::DisplayHandler, 4>
+ MSP430AttributeParser::DisplayRoutines{
+ {{MSP430Attrs::TagISA, &MSP430AttributeParser::parseISA},
+ {MSP430Attrs::TagCodeModel, &MSP430AttributeParser::parseCodeModel},
+ {MSP430Attrs::TagDataModel, &MSP430AttributeParser::parseDataModel},
+ {MSP430Attrs::TagEnumSize, &MSP430AttributeParser::parseEnumSize}}};
+
+Error MSP430AttributeParser::parseISA(AttrType Tag) {
+ static const char *StringVals[] = {"None", "MSP430", "MSP430X"};
+ return parseStringAttribute("ISA", Tag, makeArrayRef(StringVals));
+}
+
+Error MSP430AttributeParser::parseCodeModel(AttrType Tag) {
+ static const char *StringVals[] = {"None", "Small", "Large"};
+ return parseStringAttribute("Code Model", Tag, makeArrayRef(StringVals));
+}
+
+Error MSP430AttributeParser::parseDataModel(AttrType Tag) {
+ static const char *StringVals[] = {"None", "Small", "Large", "Restricted"};
+ return parseStringAttribute("Data Model", Tag, makeArrayRef(StringVals));
+}
+
+Error MSP430AttributeParser::parseEnumSize(AttrType Tag) {
+ static const char *StringVals[] = {"None", "Small", "Integer", "Don't Care"};
+ return parseStringAttribute("Enum Size", Tag, makeArrayRef(StringVals));
+}
+
+Error MSP430AttributeParser::handler(uint64_t Tag, bool &Handled) {
+ Handled = false;
+ for (const DisplayHandler &Disp : DisplayRoutines) {
+ if (uint64_t(Disp.Attribute) != Tag)
+ continue;
+ if (Error E = (this->*Disp.Routine)(static_cast<AttrType>(Tag)))
+ return E;
+ Handled = true;
+ break;
+ }
+ return Error::success();
+}
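handler() above is a small table-driven dispatcher: a constexpr array maps each known tag to a member-function parser, and unknown tags are reported as unhandled rather than as errors. A standalone sketch of the same pattern in plain C++ (the tag values and names below are illustrative, not the real MSP430 ones):

#include <array>
#include <cstdint>
#include <cstdio>

// Minimal version of the tag -> member-function dispatch table used by
// MSP430AttributeParser::handler. Illustrative only.
struct Parser {
  using Routine = void (Parser::*)(uint64_t);
  struct DisplayHandler {
    uint64_t Attribute;
    Routine Handler;
  };

  void parseISA(uint64_t Tag) { std::printf("ISA tag %llu\n", (unsigned long long)Tag); }
  void parseCodeModel(uint64_t Tag) { std::printf("Code model tag %llu\n", (unsigned long long)Tag); }

  static constexpr std::array<DisplayHandler, 2> Routines{
      {{4, &Parser::parseISA}, {6, &Parser::parseCodeModel}}};

  bool handle(uint64_t Tag) {
    for (const DisplayHandler &D : Routines) {
      if (D.Attribute != Tag)
        continue;
      (this->*D.Handler)(Tag);
      return true;
    }
    return false; // unknown tag; the caller decides what to do
  }
};

int main() {
  Parser P;
  P.handle(4);  // dispatches to parseISA
  P.handle(99); // not handled
  return 0;
}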
diff --git a/llvm/lib/Support/MSP430Attributes.cpp b/llvm/lib/Support/MSP430Attributes.cpp
new file mode 100644
index 000000000000..4483a6872559
--- /dev/null
+++ b/llvm/lib/Support/MSP430Attributes.cpp
@@ -0,0 +1,22 @@
+//===-- MSP430Attributes.cpp - MSP430 Attributes --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/MSP430Attributes.h"
+
+using namespace llvm;
+using namespace llvm::MSP430Attrs;
+
+static constexpr TagNameItem TagData[] = {{TagISA, "Tag_ISA"},
+ {TagCodeModel, "Tag_Code_Model"},
+ {TagDataModel, "Tag_Data_Model"},
+ {TagEnumSize, "Tag_Enum_Size"}};
+
+constexpr TagNameMap MSP430AttributeTags{TagData};
+const TagNameMap &llvm::MSP430Attrs::getMSP430AttributeTags() {
+ return MSP430AttributeTags;
+}
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
index 9a2e1003da5a..71e3a1362f7e 100644
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -151,7 +151,12 @@ static std::atomic<int> TaskGroupInstances;
// lock, only allow the first TaskGroup to run tasks in parallel. In the scenario
// of nested parallel_for_each(), only the outermost one runs in parallel.
TaskGroup::TaskGroup() : Parallel(TaskGroupInstances++ == 0) {}
-TaskGroup::~TaskGroup() { --TaskGroupInstances; }
+TaskGroup::~TaskGroup() {
+ // We must ensure that all the workloads have finished before decrementing the
+ // instances count.
+ L.sync();
+ --TaskGroupInstances;
+}
void TaskGroup::spawn(std::function<void()> F) {
if (Parallel) {
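The destructor change matters because TaskGroupInstances gates whether the next TaskGroup runs its work in parallel: if the count drops to zero while spawned tasks are still in flight, a TaskGroup constructed immediately afterwards could believe it is the outermost one. A standalone sketch of the ordering the patch enforces (illustrative only, not LLVM's executor):

#include <atomic>
#include <functional>
#include <thread>
#include <vector>

// Join all spawned work *before* decrementing the shared instance counter,
// so a TaskGroup constructed right afterwards cannot observe a zero count
// while tasks from the old group are still running.
static std::atomic<int> TaskGroupInstances{0};

class TaskGroup {
  std::vector<std::thread> Workers;
  bool Parallel;

public:
  TaskGroup() : Parallel(TaskGroupInstances++ == 0) {}

  ~TaskGroup() {
    sync();               // must happen first
    --TaskGroupInstances; // only now may a new group go parallel
  }

  void spawn(std::function<void()> F) {
    if (Parallel)
      Workers.emplace_back(std::move(F));
    else
      F(); // nested groups run their work inline
  }

  void sync() {
    for (std::thread &T : Workers)
      if (T.joinable())
        T.join();
    Workers.clear();
  }
};

int main() {
  TaskGroup TG;
  for (int I = 0; I < 4; ++I)
    TG.spawn([I] { (void)I; /* per-item work */ });
  // ~TaskGroup joins the workers before decrementing the counter.
  return 0;
}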
diff --git a/llvm/lib/Support/Path.cpp b/llvm/lib/Support/Path.cpp
index a724ba2faf93..3957547dfaaa 100644
--- a/llvm/lib/Support/Path.cpp
+++ b/llvm/lib/Support/Path.cpp
@@ -12,6 +12,7 @@
#include "llvm/Support/Path.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Config/config.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Errc.h"
@@ -37,15 +38,16 @@ namespace {
using llvm::sys::path::Style;
inline Style real_style(Style style) {
-#ifdef _WIN32
- return (style == Style::posix) ? Style::posix : Style::windows;
-#else
- return (style == Style::windows) ? Style::windows : Style::posix;
-#endif
+ if (style != Style::native)
+ return style;
+ if (is_style_posix(style))
+ return Style::posix;
+ return LLVM_WINDOWS_PREFER_FORWARD_SLASH ? Style::windows_slash
+ : Style::windows_backslash;
}
inline const char *separators(Style style) {
- if (real_style(style) == Style::windows)
+ if (is_style_windows(style))
return "\\/";
return "/";
}
@@ -66,7 +68,7 @@ namespace {
if (path.empty())
return path;
- if (real_style(style) == Style::windows) {
+ if (is_style_windows(style)) {
// C:
if (path.size() >= 2 &&
std::isalpha(static_cast<unsigned char>(path[0])) && path[1] == ':')
@@ -98,7 +100,7 @@ namespace {
size_t pos = str.find_last_of(separators(style), str.size() - 1);
- if (real_style(style) == Style::windows) {
+ if (is_style_windows(style)) {
if (pos == StringRef::npos)
pos = str.find_last_of(':', str.size() - 2);
}
@@ -113,7 +115,7 @@ namespace {
// directory in str, it returns StringRef::npos.
size_t root_dir_start(StringRef str, Style style) {
// case "c:/"
- if (real_style(style) == Style::windows) {
+ if (is_style_windows(style)) {
if (str.size() > 2 && str[1] == ':' && is_separator(str[2], style))
return 2;
}
@@ -259,7 +261,7 @@ const_iterator &const_iterator::operator++() {
// Root dir.
if (was_net ||
// c:/
- (real_style(S) == Style::windows && Component.endswith(":"))) {
+ (is_style_windows(S) && Component.endswith(":"))) {
Component = Path.substr(Position, 1);
return *this;
}
@@ -348,7 +350,7 @@ StringRef root_path(StringRef path, Style style) {
if (b != e) {
bool has_net =
b->size() > 2 && is_separator((*b)[0], style) && (*b)[1] == (*b)[0];
- bool has_drive = (real_style(style) == Style::windows) && b->endswith(":");
+ bool has_drive = is_style_windows(style) && b->endswith(":");
if (has_net || has_drive) {
if ((++pos != e) && is_separator((*pos)[0], style)) {
@@ -373,7 +375,7 @@ StringRef root_name(StringRef path, Style style) {
if (b != e) {
bool has_net =
b->size() > 2 && is_separator((*b)[0], style) && (*b)[1] == (*b)[0];
- bool has_drive = (real_style(style) == Style::windows) && b->endswith(":");
+ bool has_drive = is_style_windows(style) && b->endswith(":");
if (has_net || has_drive) {
// just {C:,//net}, return the first component.
@@ -390,7 +392,7 @@ StringRef root_directory(StringRef path, Style style) {
if (b != e) {
bool has_net =
b->size() > 2 && is_separator((*b)[0], style) && (*b)[1] == (*b)[0];
- bool has_drive = (real_style(style) == Style::windows) && b->endswith(":");
+ bool has_drive = is_style_windows(style) && b->endswith(":");
if ((has_net || has_drive) &&
// {C:,//net}, skip to the next component.
@@ -497,7 +499,7 @@ void replace_extension(SmallVectorImpl<char> &path, const Twine &extension,
static bool starts_with(StringRef Path, StringRef Prefix,
Style style = Style::native) {
// Windows prefix matching : case and separator insensitive
- if (real_style(style) == Style::windows) {
+ if (is_style_windows(style)) {
if (Path.size() < Prefix.size())
return false;
for (size_t I = 0, E = Prefix.size(); I != E; ++I) {
@@ -548,8 +550,10 @@ void native(const Twine &path, SmallVectorImpl<char> &result, Style style) {
void native(SmallVectorImpl<char> &Path, Style style) {
if (Path.empty())
return;
- if (real_style(style) == Style::windows) {
- std::replace(Path.begin(), Path.end(), '/', '\\');
+ if (is_style_windows(style)) {
+ for (char &Ch : Path)
+ if (is_separator(Ch, style))
+ Ch = preferred_separator(style);
if (Path[0] == '~' && (Path.size() == 1 || is_separator(Path[1], style))) {
SmallString<128> PathHome;
home_directory(PathHome);
@@ -557,14 +561,12 @@ void native(SmallVectorImpl<char> &Path, Style style) {
Path = PathHome;
}
} else {
- for (auto PI = Path.begin(), PE = Path.end(); PI < PE; ++PI)
- if (*PI == '\\')
- *PI = '/';
+ std::replace(Path.begin(), Path.end(), '\\', '/');
}
}
std::string convert_to_slash(StringRef path, Style style) {
- if (real_style(style) != Style::windows)
+ if (is_style_posix(style))
return std::string(path);
std::string s = path.str();
@@ -599,7 +601,7 @@ StringRef extension(StringRef path, Style style) {
bool is_separator(char value, Style style) {
if (value == '/')
return true;
- if (real_style(style) == Style::windows)
+ if (is_style_windows(style))
return value == '\\';
return false;
}
@@ -671,8 +673,7 @@ bool is_absolute(const Twine &path, Style style) {
StringRef p = path.toStringRef(path_storage);
bool rootDir = has_root_directory(p, style);
- bool rootName =
- (real_style(style) != Style::windows) || has_root_name(p, style);
+ bool rootName = is_style_posix(style) || has_root_name(p, style);
return rootDir && rootName;
}
@@ -686,7 +687,7 @@ bool is_absolute_gnu(const Twine &path, Style style) {
if (!p.empty() && is_separator(p.front(), style))
return true;
- if (real_style(style) == Style::windows) {
+ if (is_style_windows(style)) {
// Handle drive letter pattern (a character followed by ':') on Windows.
if (p.size() >= 2 && (p[0] && p[1] == ':'))
return true;
@@ -906,8 +907,7 @@ void make_absolute(const Twine &current_directory,
bool rootName = path::has_root_name(p);
// Already absolute.
- if ((rootName || real_style(Style::native) != Style::windows) &&
- rootDirectory)
+ if ((rootName || is_style_posix(Style::native)) && rootDirectory)
return;
// All of the following conditions will need the current directory.
@@ -1190,6 +1190,10 @@ TempFile &TempFile::operator=(TempFile &&Other) {
FD = Other.FD;
Other.Done = true;
Other.FD = -1;
+#ifdef _WIN32
+ RemoveOnClose = Other.RemoveOnClose;
+ Other.RemoveOnClose = false;
+#endif
return *this;
}
@@ -1204,20 +1208,23 @@ Error TempFile::discard() {
FD = -1;
#ifdef _WIN32
- // On windows closing will remove the file.
- TmpName = "";
- return Error::success();
+ // On Windows, closing will remove the file, if we set the delete
+ // disposition. If not, remove it manually.
+ bool Remove = RemoveOnClose;
#else
- // Always try to close and remove.
+ // Always try to remove the file.
+ bool Remove = true;
+#endif
std::error_code RemoveEC;
- if (!TmpName.empty()) {
+ if (Remove && !TmpName.empty()) {
RemoveEC = fs::remove(TmpName);
sys::DontRemoveFileOnSignal(TmpName);
if (!RemoveEC)
TmpName = "";
+ } else {
+ TmpName = "";
}
return errorCodeToError(RemoveEC);
-#endif
}
Error TempFile::keep(const Twine &Name) {
@@ -1228,19 +1235,26 @@ Error TempFile::keep(const Twine &Name) {
// If we can't cancel the delete don't rename.
auto H = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
std::error_code RenameEC = setDeleteDisposition(H, false);
+ bool ShouldDelete = false;
if (!RenameEC) {
RenameEC = rename_handle(H, Name);
// If rename failed because it's cross-device, copy instead
if (RenameEC ==
std::error_code(ERROR_NOT_SAME_DEVICE, std::system_category())) {
RenameEC = copy_file(TmpName, Name);
- setDeleteDisposition(H, true);
+ ShouldDelete = true;
}
}
- // If we can't rename, discard the temporary file.
+ // If we can't rename or copy, discard the temporary file.
if (RenameEC)
- setDeleteDisposition(H, true);
+ ShouldDelete = true;
+ if (ShouldDelete) {
+ if (!RemoveOnClose)
+ setDeleteDisposition(H, true);
+ else
+ remove(TmpName);
+ }
#else
std::error_code RenameEC = fs::rename(TmpName, Name);
if (RenameEC) {
@@ -1250,8 +1264,8 @@ Error TempFile::keep(const Twine &Name) {
if (RenameEC)
remove(TmpName);
}
- sys::DontRemoveFileOnSignal(TmpName);
#endif
+ sys::DontRemoveFileOnSignal(TmpName);
if (!RenameEC)
TmpName = "";
@@ -1273,9 +1287,8 @@ Error TempFile::keep() {
auto H = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
if (std::error_code EC = setDeleteDisposition(H, false))
return errorCodeToError(EC);
-#else
- sys::DontRemoveFileOnSignal(TmpName);
#endif
+ sys::DontRemoveFileOnSignal(TmpName);
TmpName = "";
@@ -1297,14 +1310,22 @@ Expected<TempFile> TempFile::create(const Twine &Model, unsigned Mode,
return errorCodeToError(EC);
TempFile Ret(ResultPath, FD);
-#ifndef _WIN32
- if (sys::RemoveFileOnSignal(ResultPath)) {
+#ifdef _WIN32
+ auto H = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
+ bool SetSignalHandler = false;
+ if (std::error_code EC = setDeleteDisposition(H, true)) {
+ Ret.RemoveOnClose = true;
+ SetSignalHandler = true;
+ }
+#else
+ bool SetSignalHandler = true;
+#endif
+ if (SetSignalHandler && sys::RemoveFileOnSignal(ResultPath)) {
// Make sure we delete the file when RemoveFileOnSignal fails.
consumeError(Ret.discard());
std::error_code EC(errc::operation_not_permitted);
return errorCodeToError(EC);
}
-#endif
return std::move(Ret);
}
} // namespace fs
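Most of the TempFile churn above (RemoveOnClose, the reordered DontRemoveFileOnSignal calls) exists to keep one invariant: a temporary is always deleted unless keep() succeeds. A hedged usage sketch of that lifecycle, using only the create()/keep()/discard() interface visible in this diff; the file names are illustrative:

#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  Expected<sys::fs::TempFile> Tmp =
      sys::fs::TempFile::create("output-%%%%%%.tmp");
  if (!Tmp) {
    errs() << toString(Tmp.takeError()) << "\n";
    return 1;
  }

  // Write through the still-open descriptor; the temporary is armed for
  // deletion (delete-on-close on Windows, a signal handler elsewhere).
  raw_fd_ostream OS(Tmp->FD, /*shouldClose=*/false);
  OS << "some intermediate data\n";
  OS.flush();

  bool Commit = true; // flip to false to exercise the discard() path
  Error E = Commit ? Tmp->keep("output.txt") // publish under the real name
                   : Tmp->discard();         // drop the temporary entirely
  if (E) {
    errs() << toString(std::move(E)) << "\n";
    return 1;
  }
  return 0;
}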
diff --git a/llvm/lib/Support/Process.cpp b/llvm/lib/Support/Process.cpp
index e7e9a8b56f74..547b3b73eec2 100644
--- a/llvm/lib/Support/Process.cpp
+++ b/llvm/lib/Support/Process.cpp
@@ -92,8 +92,7 @@ static bool coreFilesPrevented = !LLVM_ENABLE_CRASH_DUMPS;
bool Process::AreCoreFilesPrevented() { return coreFilesPrevented; }
-LLVM_ATTRIBUTE_NORETURN
-void Process::Exit(int RetCode, bool NoCleanup) {
+[[noreturn]] void Process::Exit(int RetCode, bool NoCleanup) {
if (CrashRecoveryContext *CRC = CrashRecoveryContext::GetCurrent())
CRC->HandleExit(RetCode);
diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp
new file mode 100644
index 000000000000..8e984002f90d
--- /dev/null
+++ b/llvm/lib/Support/RISCVISAInfo.cpp
@@ -0,0 +1,718 @@
+//===-- RISCVISAInfo.cpp - RISCV Arch String Parser --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/RISCVISAInfo.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <array>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+namespace {
+/// Represents the major and minor version number components of a RISC-V extension
+struct RISCVExtensionVersion {
+ unsigned Major;
+ unsigned Minor;
+};
+
+struct RISCVSupportedExtension {
+ const char *Name;
+ /// Supported version.
+ RISCVExtensionVersion Version;
+};
+
+} // end anonymous namespace
+
+static constexpr StringLiteral AllStdExts = "mafdqlcbjtpvn";
+
+static const RISCVSupportedExtension SupportedExtensions[] = {
+ {"i", RISCVExtensionVersion{2, 0}},
+ {"e", RISCVExtensionVersion{1, 9}},
+ {"m", RISCVExtensionVersion{2, 0}},
+ {"a", RISCVExtensionVersion{2, 0}},
+ {"f", RISCVExtensionVersion{2, 0}},
+ {"d", RISCVExtensionVersion{2, 0}},
+ {"c", RISCVExtensionVersion{2, 0}},
+};
+
+static const RISCVSupportedExtension SupportedExperimentalExtensions[] = {
+ {"v", RISCVExtensionVersion{0, 10}},
+ {"zba", RISCVExtensionVersion{1, 0}},
+ {"zbb", RISCVExtensionVersion{1, 0}},
+ {"zbc", RISCVExtensionVersion{1, 0}},
+ {"zbe", RISCVExtensionVersion{0, 93}},
+ {"zbf", RISCVExtensionVersion{0, 93}},
+ {"zbm", RISCVExtensionVersion{0, 93}},
+ {"zbp", RISCVExtensionVersion{0, 93}},
+ {"zbr", RISCVExtensionVersion{0, 93}},
+ {"zbs", RISCVExtensionVersion{1, 0}},
+ {"zbt", RISCVExtensionVersion{0, 93}},
+
+ {"zvamo", RISCVExtensionVersion{0, 10}},
+ {"zvlsseg", RISCVExtensionVersion{0, 10}},
+
+ {"zfhmin", RISCVExtensionVersion{0, 1}},
+ {"zfh", RISCVExtensionVersion{0, 1}},
+};
+
+static bool stripExperimentalPrefix(StringRef &Ext) {
+ return Ext.consume_front("experimental-");
+}
+
+struct FindByName {
+ FindByName(StringRef Ext) : Ext(Ext){};
+ StringRef Ext;
+ bool operator()(const RISCVSupportedExtension &ExtInfo) {
+ return ExtInfo.Name == Ext;
+ }
+};
+
+static Optional<RISCVExtensionVersion> findDefaultVersion(StringRef ExtName) {
+ // Find default version of an extension.
+ // TODO: We might set default version based on profile or ISA spec.
+ for (auto &ExtInfo : {makeArrayRef(SupportedExtensions),
+ makeArrayRef(SupportedExperimentalExtensions)}) {
+ auto ExtensionInfoIterator = llvm::find_if(ExtInfo, FindByName(ExtName));
+
+ if (ExtensionInfoIterator == ExtInfo.end()) {
+ continue;
+ }
+ return ExtensionInfoIterator->Version;
+ }
+ return None;
+}
+
+void RISCVISAInfo::addExtension(StringRef ExtName, unsigned MajorVersion,
+ unsigned MinorVersion) {
+ RISCVExtensionInfo Ext;
+ Ext.ExtName = ExtName.str();
+ Ext.MajorVersion = MajorVersion;
+ Ext.MinorVersion = MinorVersion;
+ Exts[ExtName.str()] = Ext;
+}
+
+static StringRef getExtensionTypeDesc(StringRef Ext) {
+ if (Ext.startswith("sx"))
+ return "non-standard supervisor-level extension";
+ if (Ext.startswith("s"))
+ return "standard supervisor-level extension";
+ if (Ext.startswith("x"))
+ return "non-standard user-level extension";
+ if (Ext.startswith("z"))
+ return "standard user-level extension";
+ return StringRef();
+}
+
+static StringRef getExtensionType(StringRef Ext) {
+ if (Ext.startswith("sx"))
+ return "sx";
+ if (Ext.startswith("s"))
+ return "s";
+ if (Ext.startswith("x"))
+ return "x";
+ if (Ext.startswith("z"))
+ return "z";
+ return StringRef();
+}
+
+static Optional<RISCVExtensionVersion> isExperimentalExtension(StringRef Ext) {
+ auto ExtIterator =
+ llvm::find_if(SupportedExperimentalExtensions, FindByName(Ext));
+ if (ExtIterator == std::end(SupportedExperimentalExtensions))
+ return None;
+
+ return ExtIterator->Version;
+}
+
+bool RISCVISAInfo::isSupportedExtensionFeature(StringRef Ext) {
+ bool IsExperimental = stripExperimentalPrefix(Ext);
+
+ if (IsExperimental)
+ return llvm::any_of(SupportedExperimentalExtensions, FindByName(Ext));
+ else
+ return llvm::any_of(SupportedExtensions, FindByName(Ext));
+}
+
+bool RISCVISAInfo::isSupportedExtension(StringRef Ext) {
+ return llvm::any_of(SupportedExtensions, FindByName(Ext)) ||
+ llvm::any_of(SupportedExperimentalExtensions, FindByName(Ext));
+}
+
+bool RISCVISAInfo::isSupportedExtension(StringRef Ext, unsigned MajorVersion,
+ unsigned MinorVersion) {
+ auto FindByNameAndVersion = [=](const RISCVSupportedExtension &ExtInfo) {
+ return ExtInfo.Name == Ext && (MajorVersion == ExtInfo.Version.Major) &&
+ (MinorVersion == ExtInfo.Version.Minor);
+ };
+ return llvm::any_of(SupportedExtensions, FindByNameAndVersion) ||
+ llvm::any_of(SupportedExperimentalExtensions, FindByNameAndVersion);
+}
+
+bool RISCVISAInfo::hasExtension(StringRef Ext) const {
+ stripExperimentalPrefix(Ext);
+
+ if (!isSupportedExtension(Ext))
+ return false;
+
+ return Exts.count(Ext.str()) != 0;
+}
+
+// Get the rank for a single-letter extension; a lower value means a higher
+// priority.
+static int singleLetterExtensionRank(char Ext) {
+ switch (Ext) {
+ case 'i':
+ return -2;
+ case 'e':
+ return -1;
+ default:
+ break;
+ }
+
+ size_t Pos = AllStdExts.find(Ext);
+ int Rank;
+ if (Pos == StringRef::npos)
+ // If we got an unknown extension letter, then give it an alphabetical
+ // order, but after all known standard extensions.
+ Rank = AllStdExts.size() + (Ext - 'a');
+ else
+ Rank = Pos;
+
+ return Rank;
+}
+
+// Get the rank for a multi-letter extension; a lower value means a higher
+// priority/earlier position in the canonical order.
+static int multiLetterExtensionRank(const std::string &ExtName) {
+ assert(ExtName.length() >= 2);
+ int HighOrder;
+ int LowOrder = 0;
+ // The order between multi-char extensions: s -> h -> z -> x.
+ char ExtClass = ExtName[0];
+ switch (ExtClass) {
+ case 's':
+ HighOrder = 0;
+ break;
+ case 'h':
+ HighOrder = 1;
+ break;
+ case 'z':
+ HighOrder = 2;
+ // `z` extension must be sorted by canonical order of second letter.
+ // e.g. zmx has higher rank than zax.
+ LowOrder = singleLetterExtensionRank(ExtName[1]);
+ break;
+ case 'x':
+ HighOrder = 3;
+ break;
+ default:
+ llvm_unreachable("Unknown prefix for multi-char extension");
+ return -1;
+ }
+
+ return (HighOrder << 8) + LowOrder;
+}
+
+// Compare function for extension.
+// Only compare the extension name, ignore version comparison.
+bool RISCVISAInfo::compareExtension(const std::string &LHS,
+ const std::string &RHS) {
+ size_t LHSLen = LHS.length();
+ size_t RHSLen = RHS.length();
+ if (LHSLen == 1 && RHSLen != 1)
+ return true;
+
+ if (LHSLen != 1 && RHSLen == 1)
+ return false;
+
+ if (LHSLen == 1 && RHSLen == 1)
+ return singleLetterExtensionRank(LHS[0]) <
+ singleLetterExtensionRank(RHS[0]);
+
+ // Both are multi-char ext here.
+ int LHSRank = multiLetterExtensionRank(LHS);
+ int RHSRank = multiLetterExtensionRank(RHS);
+ if (LHSRank != RHSRank)
+ return LHSRank < RHSRank;
+
+ // If the rank is same, it must be sorted by lexicographic order.
+ return LHS < RHS;
+}
+
+void RISCVISAInfo::toFeatures(
+ std::vector<StringRef> &Features,
+ std::function<StringRef(const Twine &)> StrAlloc) const {
+ for (auto &Ext : Exts) {
+ StringRef ExtName = Ext.first;
+
+ if (ExtName == "i")
+ continue;
+
+ if (ExtName == "zvlsseg") {
+ Features.push_back("+experimental-v");
+ Features.push_back("+experimental-zvlsseg");
+ } else if (ExtName == "zvamo") {
+ Features.push_back("+experimental-v");
+ Features.push_back("+experimental-zvlsseg");
+ Features.push_back("+experimental-zvamo");
+ } else if (isExperimentalExtension(ExtName)) {
+ Features.push_back(StrAlloc("+experimental-" + ExtName));
+ } else {
+ Features.push_back(StrAlloc("+" + ExtName));
+ }
+ }
+}
+
+// Extensions may have a version number, and may be separated by
+// an underscore '_' e.g.: rv32i2_m2.
+// Version number is divided into major and minor version numbers,
+// separated by a 'p'. If the minor version is 0 then 'p0' can be
+// omitted from the version string. E.g., rv32i2p0, rv32i2, rv32i2p1.
+static Error getExtensionVersion(StringRef Ext, StringRef In, unsigned &Major,
+ unsigned &Minor, unsigned &ConsumeLength,
+ bool EnableExperimentalExtension,
+ bool ExperimentalExtensionVersionCheck) {
+ StringRef MajorStr, MinorStr;
+ Major = 0;
+ Minor = 0;
+ ConsumeLength = 0;
+ MajorStr = In.take_while(isDigit);
+ In = In.substr(MajorStr.size());
+
+ if (!MajorStr.empty() && In.consume_front("p")) {
+ MinorStr = In.take_while(isDigit);
+ In = In.substr(MajorStr.size() + 1);
+
+ // Expected 'p' to be followed by minor version number.
+ if (MinorStr.empty()) {
+ return createStringError(
+ errc::invalid_argument,
+ "minor version number missing after 'p' for extension '" + Ext + "'");
+ }
+ }
+
+ if (!MajorStr.empty() && MajorStr.getAsInteger(10, Major))
+ return createStringError(
+ errc::invalid_argument,
+ "Failed to parse major version number for extension '" + Ext + "'");
+
+ if (!MinorStr.empty() && MinorStr.getAsInteger(10, Minor))
+ return createStringError(
+ errc::invalid_argument,
+ "Failed to parse minor version number for extension '" + Ext + "'");
+
+ ConsumeLength = MajorStr.size();
+
+ if (!MinorStr.empty())
+ ConsumeLength += MinorStr.size() + 1 /*'p'*/;
+
+ // Expected multi-character extension with version number to have no
+ // subsequent characters (i.e. must either end string or be followed by
+ // an underscore).
+ if (Ext.size() > 1 && In.size()) {
+ std::string Error =
+ "multi-character extensions must be separated by underscores";
+ return createStringError(errc::invalid_argument, Error);
+ }
+
+ // If experimental extension, require use of the current version number
+ if (auto ExperimentalExtension = isExperimentalExtension(Ext)) {
+ if (!EnableExperimentalExtension) {
+ std::string Error = "requires '-menable-experimental-extensions' for "
+ "experimental extension '" +
+ Ext.str() + "'";
+ return createStringError(errc::invalid_argument, Error);
+ }
+
+ if (ExperimentalExtensionVersionCheck &&
+ (MajorStr.empty() && MinorStr.empty())) {
+ std::string Error =
+ "experimental extension requires explicit version number `" +
+ Ext.str() + "`";
+ return createStringError(errc::invalid_argument, Error);
+ }
+
+ auto SupportedVers = *ExperimentalExtension;
+ if (ExperimentalExtensionVersionCheck &&
+ (Major != SupportedVers.Major || Minor != SupportedVers.Minor)) {
+ std::string Error = "unsupported version number " + MajorStr.str();
+ if (!MinorStr.empty())
+ Error += "." + MinorStr.str();
+ Error += " for experimental extension '" + Ext.str() +
+ "'(this compiler supports " + utostr(SupportedVers.Major) + "." +
+ utostr(SupportedVers.Minor) + ")";
+ return createStringError(errc::invalid_argument, Error);
+ }
+ return Error::success();
+ }
+
+ // Exception rule for `g`: we don't have a clear version scheme for it in
+ // the ISA spec.
+ if (Ext == "g")
+ return Error::success();
+
+ if (MajorStr.empty() && MinorStr.empty()) {
+ if (auto DefaultVersion = findDefaultVersion(Ext)) {
+ Major = DefaultVersion->Major;
+ Minor = DefaultVersion->Minor;
+ }
+ // Whether or not a default version is found, return success; other places
+ // will verify the extension.
+ return Error::success();
+ }
+
+ if (RISCVISAInfo::isSupportedExtension(Ext, Major, Minor))
+ return Error::success();
+
+ std::string Error = "unsupported version number " + std::string(MajorStr);
+ if (!MinorStr.empty())
+ Error += "." + MinorStr.str();
+ Error += " for extension '" + Ext.str() + "'";
+ return createStringError(errc::invalid_argument, Error);
+}
+
+llvm::Expected<std::unique_ptr<RISCVISAInfo>>
+RISCVISAInfo::parseFeatures(unsigned XLen,
+ const std::vector<std::string> &Features) {
+ assert(XLen == 32 || XLen == 64);
+ std::unique_ptr<RISCVISAInfo> ISAInfo(new RISCVISAInfo(XLen));
+
+ bool HasE = false;
+ for (auto &Feature : Features) {
+ StringRef ExtName = Feature;
+ bool Experimental = false;
+ assert(ExtName.size() > 1 && (ExtName[0] == '+' || ExtName[0] == '-'));
+ bool Add = ExtName[0] == '+';
+ ExtName = ExtName.drop_front(1); // Drop '+' or '-'
+ Experimental = stripExperimentalPrefix(ExtName);
+ auto ExtensionInfos = Experimental
+ ? makeArrayRef(SupportedExperimentalExtensions)
+ : makeArrayRef(SupportedExtensions);
+ auto ExtensionInfoIterator =
+ llvm::find_if(ExtensionInfos, FindByName(ExtName));
+
+ // Not all features are related to ISA extensions, like `relax` or
+ // `save-restore`; skip those features.
+ if (ExtensionInfoIterator == ExtensionInfos.end())
+ continue;
+
+ if (Add) {
+ if (ExtName == "e") {
+ if (XLen != 32)
+ return createStringError(
+ errc::invalid_argument,
+ "standard user-level extension 'e' requires 'rv32'");
+ HasE = true;
+ }
+
+ ISAInfo->addExtension(ExtName, ExtensionInfoIterator->Version.Major,
+ ExtensionInfoIterator->Version.Minor);
+ } else
+ ISAInfo->Exts.erase(ExtName.str());
+ }
+ if (!HasE) {
+ if (auto Version = findDefaultVersion("i"))
+ ISAInfo->addExtension("i", Version->Major, Version->Minor);
+ else
+ llvm_unreachable("Default extension version for 'i' not found?");
+ }
+
+ ISAInfo->updateFLen();
+
+ return std::move(ISAInfo);
+}
+
+llvm::Expected<std::unique_ptr<RISCVISAInfo>>
+RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension,
+ bool ExperimentalExtensionVersionCheck) {
+ // RISC-V ISA strings must be lowercase.
+ if (llvm::any_of(Arch, isupper)) {
+ return createStringError(errc::invalid_argument,
+ "string must be lowercase");
+ }
+
+ bool HasRV64 = Arch.startswith("rv64");
+ // ISA string must begin with rv32 or rv64.
+ if (!(Arch.startswith("rv32") || HasRV64) || (Arch.size() < 5)) {
+ return createStringError(errc::invalid_argument,
+ "string must begin with rv32{i,e,g} or rv64{i,g}");
+ }
+
+ unsigned XLen = HasRV64 ? 64 : 32;
+ std::unique_ptr<RISCVISAInfo> ISAInfo(new RISCVISAInfo(XLen));
+
+ // The canonical order specified in ISA manual.
+ // Ref: Table 22.1 in RISC-V User-Level ISA V2.2
+ StringRef StdExts = AllStdExts;
+ bool HasF = false, HasD = false;
+ char Baseline = Arch[4];
+
+ // First letter should be 'e', 'i' or 'g'.
+ switch (Baseline) {
+ default:
+ return createStringError(errc::invalid_argument,
+ "first letter should be 'e', 'i' or 'g'");
+ case 'e': {
+ // Extension 'e' is not allowed in rv64.
+ if (HasRV64)
+ return createStringError(
+ errc::invalid_argument,
+ "standard user-level extension 'e' requires 'rv32'");
+ break;
+ }
+ case 'i':
+ break;
+ case 'g':
+ // g = imafd
+ StdExts = StdExts.drop_front(4);
+ HasF = true;
+ HasD = true;
+ break;
+ }
+
+ // Skip rvxxx
+ StringRef Exts = Arch.substr(5);
+
+ // Remove multi-letter standard extensions, non-standard extensions and
+ // supervisor-level extensions. They have 'z', 'x', 's', 'sx' prefixes.
+ // Parse them at the end.
+ // Find the very first occurrence of 's', 'x' or 'z'.
+ StringRef OtherExts;
+ size_t Pos = Exts.find_first_of("zsx");
+ if (Pos != StringRef::npos) {
+ OtherExts = Exts.substr(Pos);
+ Exts = Exts.substr(0, Pos);
+ }
+
+ unsigned Major, Minor, ConsumeLength;
+ if (auto E = getExtensionVersion(std::string(1, Baseline), Exts, Major, Minor,
+ ConsumeLength, EnableExperimentalExtension,
+ ExperimentalExtensionVersionCheck))
+ return std::move(E);
+
+ if (Baseline == 'g') {
+ // No matter which version is given to `g`, we always set imafd to the
+ // default version, since we don't have a clear version scheme for that in
+ // the ISA spec.
+ for (auto Ext : {"i", "m", "a", "f", "d"})
+ if (auto Version = findDefaultVersion(Ext))
+ ISAInfo->addExtension(Ext, Version->Major, Version->Minor);
+ else
+ llvm_unreachable("Default extension version not found?");
+ } else
+ // Baseline is `i` or `e`
+ ISAInfo->addExtension(std::string(1, Baseline), Major, Minor);
+
+ // Consume the base ISA version number and any '_' between rvxxx and the
+ // first extension
+ Exts = Exts.drop_front(ConsumeLength);
+ Exts.consume_front("_");
+
+ // TODO: Use version number when setting target features
+
+ auto StdExtsItr = StdExts.begin();
+ auto StdExtsEnd = StdExts.end();
+ for (auto I = Exts.begin(), E = Exts.end(); I != E;) {
+ char C = *I;
+
+ // Check ISA extensions are specified in the canonical order.
+ while (StdExtsItr != StdExtsEnd && *StdExtsItr != C)
+ ++StdExtsItr;
+
+ if (StdExtsItr == StdExtsEnd) {
+ // Either C contains a valid extension that was not given in canonical
+ // order, or it is an invalid extension.
+ if (StdExts.contains(C)) {
+ return createStringError(
+ errc::invalid_argument,
+ "standard user-level extension not given in canonical order '%c'",
+ C);
+ }
+
+ return createStringError(errc::invalid_argument,
+ "invalid standard user-level extension '%c'", C);
+ }
+
+ // Move to next char to prevent repeated letter.
+ ++StdExtsItr;
+
+ std::string Next;
+ unsigned Major, Minor, ConsumeLength;
+ if (std::next(I) != E)
+ Next = std::string(std::next(I), E);
+ if (auto E = getExtensionVersion(std::string(1, C), Next, Major, Minor,
+ ConsumeLength, EnableExperimentalExtension,
+ ExperimentalExtensionVersionCheck))
+ return std::move(E);
+
+ // The order is OK, then push it into features.
+ // TODO: Use version number when setting target features
+ switch (C) {
+ default:
+ // Currently LLVM supports only "mafdcbv".
+ return createStringError(errc::invalid_argument,
+ "unsupported standard user-level extension '%c'",
+ C);
+ case 'm':
+ ISAInfo->addExtension("m", Major, Minor);
+ break;
+ case 'a':
+ ISAInfo->addExtension("a", Major, Minor);
+ break;
+ case 'f':
+ ISAInfo->addExtension("f", Major, Minor);
+ HasF = true;
+ break;
+ case 'd':
+ ISAInfo->addExtension("d", Major, Minor);
+ HasD = true;
+ break;
+ case 'c':
+ ISAInfo->addExtension("c", Major, Minor);
+ break;
+ case 'v':
+ ISAInfo->addExtension("v", Major, Minor);
+ ISAInfo->addExtension("zvlsseg", Major, Minor);
+ break;
+ }
+ // Consume full extension name and version, including any optional '_'
+ // between this extension and the next
+ ++I;
+ I += ConsumeLength;
+ if (*I == '_')
+ ++I;
+ }
+ // Dependency check.
+ // It's illegal to specify the 'd' (double-precision floating point)
+ // extension without also specifying the 'f' (single precision
+ // floating-point) extension.
+ // TODO: This has been removed in later specs, which specify that D implies F
+ if (HasD && !HasF)
+ return createStringError(errc::invalid_argument,
+ "d requires f extension to also be specified");
+
+ // Additional dependency checks.
+ // TODO: The 'q' extension requires rv64.
+ // TODO: It is illegal to specify 'e' extensions with 'f' and 'd'.
+
+ if (OtherExts.empty())
+ return std::move(ISAInfo);
+
+ // Handle extensions other than the standard general-purpose and
+ // standard user-level extensions.
+ // Parse the ISA string containing non-standard user-level
+ // extensions, standard supervisor-level extensions and
+ // non-standard supervisor-level extensions.
+ // These extensions start with 'z', 'x', 's', 'sx' prefixes, follow a
+ // canonical order, might have a version number (major, minor)
+ // and are separated by a single underscore '_'.
+ // Set the hardware features for the extensions that are supported.
+
+ // Multi-letter extensions are separated by a single underscore,
+ // as described in RISC-V User-Level ISA V2.2.
+ SmallVector<StringRef, 8> Split;
+ OtherExts.split(Split, '_');
+
+ SmallVector<StringRef, 8> AllExts;
+ std::array<StringRef, 4> Prefix{"z", "x", "s", "sx"};
+ auto I = Prefix.begin();
+ auto E = Prefix.end();
+
+ for (StringRef Ext : Split) {
+ if (Ext.empty())
+ return createStringError(errc::invalid_argument,
+ "extension name missing after separator '_'");
+
+ StringRef Type = getExtensionType(Ext);
+ StringRef Desc = getExtensionTypeDesc(Ext);
+ auto Pos = Ext.find_if(isDigit);
+ StringRef Name(Ext.substr(0, Pos));
+ StringRef Vers(Ext.substr(Pos));
+
+ if (Type.empty())
+ return createStringError(errc::invalid_argument,
+ "invalid extension prefix '" + Ext + "'");
+
+ // Check ISA extensions are specified in the canonical order.
+ while (I != E && *I != Type)
+ ++I;
+
+ if (I == E)
+ return createStringError(errc::invalid_argument,
+ "%s not given in canonical order '%s'",
+ Desc.str().c_str(), Ext.str().c_str());
+
+ if (Name.size() == Type.size()) {
+ return createStringError(errc::invalid_argument,
+ "%s name missing after '%s'", Desc.str().c_str(),
+ Type.str().c_str());
+ }
+
+ unsigned Major, Minor, ConsumeLength;
+ if (auto E = getExtensionVersion(Name, Vers, Major, Minor, ConsumeLength,
+ EnableExperimentalExtension,
+ ExperimentalExtensionVersionCheck))
+ return std::move(E);
+
+ // Check if duplicated extension.
+ if (llvm::is_contained(AllExts, Name))
+ return createStringError(errc::invalid_argument, "duplicated %s '%s'",
+ Desc.str().c_str(), Name.str().c_str());
+
+ ISAInfo->addExtension(Name, Major, Minor);
+ // Extension format is correct, keep parsing the extensions.
+ // TODO: Save Type, Name, Major, Minor to avoid parsing them later.
+ AllExts.push_back(Name);
+ }
+
+ for (auto Ext : AllExts) {
+ if (!isSupportedExtension(Ext)) {
+ StringRef Desc = getExtensionTypeDesc(getExtensionType(Ext));
+ return createStringError(errc::invalid_argument, "unsupported %s '%s'",
+ Desc.str().c_str(), Ext.str().c_str());
+ }
+ }
+
+ ISAInfo->updateFLen();
+
+ return std::move(ISAInfo);
+}
+
+void RISCVISAInfo::updateFLen() {
+ FLen = 0;
+ // TODO: Handle q extension.
+ if (Exts.count("d"))
+ FLen = 64;
+ else if (Exts.count("f"))
+ FLen = 32;
+}
+
+std::string RISCVISAInfo::toString() const {
+ std::string Buffer;
+ raw_string_ostream Arch(Buffer);
+
+ Arch << "rv" << XLen;
+
+ ListSeparator LS("_");
+ for (auto &Ext : Exts) {
+ StringRef ExtName = Ext.first;
+ auto ExtInfo = Ext.second;
+ Arch << LS << ExtName;
+ Arch << ExtInfo.MajorVersion << "p" << ExtInfo.MinorVersion;
+ }
+
+ return Arch.str();
+}
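For readers who only care about the entry points of this new parser: parseArchString() validates ordering, versions, and dependencies and returns an RISCVISAInfo on success, which can then be queried or re-serialized. A hedged usage sketch against the functions defined in this file (the sample arch string and printed output are illustrative):

#include "llvm/Support/Error.h"
#include "llvm/Support/RISCVISAInfo.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>

using namespace llvm;

int main() {
  auto ParseResult = RISCVISAInfo::parseArchString(
      "rv64imafdc", /*EnableExperimentalExtension=*/false,
      /*ExperimentalExtensionVersionCheck=*/true);
  if (!ParseResult) {
    errs() << "invalid arch string: " << toString(ParseResult.takeError())
           << "\n";
    return 1;
  }

  const std::unique_ptr<RISCVISAInfo> &ISAInfo = *ParseResult;
  // Default versions are filled in, e.g. rv64i2p0_m2p0_a2p0_f2p0_d2p0_c2p0.
  outs() << "canonical form: " << ISAInfo->toString() << "\n";
  outs() << "has 'd': " << (ISAInfo->hasExtension("d") ? "yes" : "no") << "\n";
  return 0;
}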
diff --git a/llvm/lib/Support/Signposts.cpp b/llvm/lib/Support/Signposts.cpp
index 49a0b16baa02..58fafb26cdf3 100644
--- a/llvm/lib/Support/Signposts.cpp
+++ b/llvm/lib/Support/Signposts.cpp
@@ -1,23 +1,27 @@
//===-- Signposts.cpp - Interval debug annotations ------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "llvm/Support/Signposts.h"
#include "llvm/Support/Timer.h"
+#include "llvm/Config/config.h"
#if LLVM_SUPPORT_XCODE_SIGNPOSTS
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/Mutex.h"
-#endif
+#include <Availability.h>
+#include <os/signpost.h>
+#endif // if LLVM_SUPPORT_XCODE_SIGNPOSTS
using namespace llvm;
#if LLVM_SUPPORT_XCODE_SIGNPOSTS
+#define SIGNPOSTS_AVAILABLE() \
+ __builtin_available(macos 10.14, iOS 12, tvOS 12, watchOS 5, *)
namespace {
os_log_t *LogCreator() {
os_log_t *X = new os_log_t;
@@ -35,13 +39,13 @@ struct LogDeleter {
namespace llvm {
class SignpostEmitterImpl {
using LogPtrTy = std::unique_ptr<os_log_t, LogDeleter>;
+ using LogTy = LogPtrTy::element_type;
LogPtrTy SignpostLog;
DenseMap<const void *, os_signpost_id_t> Signposts;
sys::SmartMutex<true> Mutex;
-public:
- os_log_t &getLogger() const { return *SignpostLog; }
+ LogTy &getLogger() const { return *SignpostLog; }
os_signpost_id_t getSignpostForObject(const void *O) {
sys::SmartScopedLock<true> Lock(Mutex);
const auto &I = Signposts.find(O);
@@ -55,6 +59,7 @@ public:
return Inserted.first->second;
}
+public:
SignpostEmitterImpl() : SignpostLog(LogCreator()) {}
bool isEnabled() const {
@@ -73,7 +78,7 @@ public:
}
}
- void endInterval(const void *O) {
+ void endInterval(const void *O, llvm::StringRef Name) {
if (isEnabled()) {
if (SIGNPOSTS_AVAILABLE()) {
// Both strings used here are required to be constant literal strings.
@@ -119,17 +124,10 @@ void SignpostEmitter::startInterval(const void *O, StringRef Name) {
#endif // if !HAVE_ANY_SIGNPOST_IMPL
}
-#if HAVE_ANY_SIGNPOST_IMPL
-os_log_t &SignpostEmitter::getLogger() const { return Impl->getLogger(); }
-os_signpost_id_t SignpostEmitter::getSignpostForObject(const void *O) {
- return Impl->getSignpostForObject(O);
-}
-#endif
-
-void SignpostEmitter::endInterval(const void *O) {
+void SignpostEmitter::endInterval(const void *O, StringRef Name) {
#if HAVE_ANY_SIGNPOST_IMPL
if (Impl == nullptr)
return;
- Impl->endInterval(O);
+ Impl->endInterval(O, Name);
#endif // if !HAVE_ANY_SIGNPOST_IMPL
}
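The interface change here threads the interval name through to endInterval(), so callers now pass the same name to both ends of an interval (Timer.cpp below is adapted accordingly). A hedged sketch of how a caller pairs the two; on non-Darwin hosts these calls compile to no-ops:

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Signposts.h"

using namespace llvm;

static SignpostEmitter Signposts;

void doTracedWork() {
  static int Anchor; // any stable object address identifies the interval
  StringRef Name = "doTracedWork";
  Signposts.startInterval(&Anchor, Name);
  // ... the work being measured ...
  Signposts.endInterval(&Anchor, Name);
}

int main() {
  doTracedWork();
  return 0;
}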
diff --git a/llvm/lib/Support/SmallVector.cpp b/llvm/lib/Support/SmallVector.cpp
index 0005f7840912..2d7721e4e1fb 100644
--- a/llvm/lib/Support/SmallVector.cpp
+++ b/llvm/lib/Support/SmallVector.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
#include <cstdint>
#ifdef LLVM_ENABLE_EXCEPTIONS
#include <stdexcept>
@@ -19,12 +20,21 @@ using namespace llvm;
// Check that no bytes are wasted and everything is well-aligned.
namespace {
+// These structures may cause binary compat warnings on AIX. Suppress the
+// warning since we are only using these types for the static assertions below.
+#if defined(_AIX)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Waix-compat"
+#endif
struct Struct16B {
alignas(16) void *X;
};
struct Struct32B {
alignas(32) void *X;
};
+#if defined(_AIX)
+#pragma GCC diagnostic pop
+#endif
}
static_assert(sizeof(SmallVector<void *, 0>) ==
sizeof(unsigned) * 2 + sizeof(void *),
@@ -47,8 +57,7 @@ static_assert(sizeof(SmallVector<char, 0>) ==
/// Report that MinSize doesn't fit into this vector's size type. Throws
/// std::length_error or calls report_fatal_error.
-LLVM_ATTRIBUTE_NORETURN
-static void report_size_overflow(size_t MinSize, size_t MaxSize);
+[[noreturn]] static void report_size_overflow(size_t MinSize, size_t MaxSize);
static void report_size_overflow(size_t MinSize, size_t MaxSize) {
std::string Reason = "SmallVector unable to grow. Requested capacity (" +
std::to_string(MinSize) +
@@ -57,13 +66,13 @@ static void report_size_overflow(size_t MinSize, size_t MaxSize) {
#ifdef LLVM_ENABLE_EXCEPTIONS
throw std::length_error(Reason);
#else
- report_fatal_error(Reason);
+ report_fatal_error(Twine(Reason));
#endif
}
/// Report that this vector is already at maximum capacity. Throws
/// std::length_error or calls report_fatal_error.
-LLVM_ATTRIBUTE_NORETURN static void report_at_maximum_capacity(size_t MaxSize);
+[[noreturn]] static void report_at_maximum_capacity(size_t MaxSize);
static void report_at_maximum_capacity(size_t MaxSize) {
std::string Reason =
"SmallVector capacity unable to grow. Already at maximum size " +
@@ -71,7 +80,7 @@ static void report_at_maximum_capacity(size_t MaxSize) {
#ifdef LLVM_ENABLE_EXCEPTIONS
throw std::length_error(Reason);
#else
- report_fatal_error(Reason);
+ report_fatal_error(Twine(Reason));
#endif
}
diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp
index 73f852624a69..1939ed9e9547 100644
--- a/llvm/lib/Support/SpecialCaseList.cpp
+++ b/llvm/lib/Support/SpecialCaseList.cpp
@@ -64,7 +64,7 @@ unsigned SpecialCaseList::Matcher::match(StringRef Query) const {
return It->second;
if (Trigrams.isDefinitelyOut(Query))
return false;
- for (auto& RegExKV : RegExes)
+ for (const auto &RegExKV : RegExes)
if (RegExKV.first->match(Query))
return RegExKV.second;
return 0;
@@ -93,7 +93,7 @@ SpecialCaseList::createOrDie(const std::vector<std::string> &Paths,
std::string Error;
if (auto SCL = create(Paths, FS, Error))
return SCL;
- report_fatal_error(Error);
+ report_fatal_error(Twine(Error));
}
bool SpecialCaseList::createInternal(const std::vector<std::string> &Paths,
@@ -209,7 +209,7 @@ bool SpecialCaseList::inSection(StringRef Section, StringRef Prefix,
unsigned SpecialCaseList::inSectionBlame(StringRef Section, StringRef Prefix,
StringRef Query,
StringRef Category) const {
- for (auto &SectionIter : Sections)
+ for (const auto &SectionIter : Sections)
if (SectionIter.SectionMatcher->match(Section)) {
unsigned Blame =
inSectionBlame(SectionIter.Entries, Prefix, Query, Category);
diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 8f2544e9e26d..2b094a4983a0 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -110,9 +110,8 @@ struct llvm::TimeTraceProfiler {
// templates from within, we only want to add the topmost one. "topmost"
// happens to be the ones that don't have any currently open entries above
// itself.
- if (std::find_if(++Stack.rbegin(), Stack.rend(), [&](const Entry &Val) {
- return Val.Name == E.Name;
- }) == Stack.rend()) {
+ if (llvm::none_of(llvm::drop_begin(llvm::reverse(Stack)),
+ [&](const Entry &Val) { return Val.Name == E.Name; })) {
auto &CountAndTotal = CountAndTotalPerName[E.Name];
CountAndTotal.first++;
CountAndTotal.second += Duration;
@@ -272,8 +271,9 @@ void llvm::timeTraceProfilerInitialize(unsigned TimeTraceGranularity,
// Called from main thread.
void llvm::timeTraceProfilerCleanup() {
delete TimeTraceProfilerInstance;
+ TimeTraceProfilerInstance = nullptr;
std::lock_guard<std::mutex> Lock(Mu);
- for (auto TTP : *ThreadTimeTraceProfilerInstances)
+ for (auto *TTP : *ThreadTimeTraceProfilerInstances)
delete TTP;
ThreadTimeTraceProfilerInstances->clear();
}
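The TimeProfiler rewrite replaces a manual std::find_if over a reverse iterator range with llvm::none_of over drop_begin(reverse(Stack)). A standalone check, on a plain vector, that the two forms of the condition ("no entry below the top of the stack has the same name as the top entry") agree:

#include "llvm/ADT/STLExtras.h"
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> Stack = {"outer", "instantiate<T>", "instantiate<T>"};
  const std::string &Top = Stack.back();

  bool Old = std::find_if(++Stack.rbegin(), Stack.rend(),
                          [&](const std::string &S) { return S == Top; }) ==
             Stack.rend();
  bool New = llvm::none_of(llvm::drop_begin(llvm::reverse(Stack)),
                           [&](const std::string &S) { return S == Top; });

  // Both are false here: an earlier frame has the same name as the top one.
  assert(Old == New && !New);
  return 0;
}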
diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp
index f025ecd3d45c..08e1a8a0e0aa 100644
--- a/llvm/lib/Support/Timer.cpp
+++ b/llvm/lib/Support/Timer.cpp
@@ -199,7 +199,7 @@ void Timer::stopTimer() {
Running = false;
Time += TimeRecord::getCurrentTime(false);
Time -= StartTime;
- Signposts->endInterval(this);
+ Signposts->endInterval(this, getName());
}
void Timer::clear() {
@@ -393,8 +393,7 @@ void TimerGroup::PrintQueuedTimers(raw_ostream &OS) {
OS << " --- Name ---\n";
// Loop through all of the timing data, printing it out.
- for (const PrintRecord &Record : make_range(TimersToPrint.rbegin(),
- TimersToPrint.rend())) {
+ for (const PrintRecord &Record : llvm::reverse(TimersToPrint)) {
Record.Time.print(Total, OS);
OS << Record.Description << '\n';
}
diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp
index 88311546354b..b9a92e280576 100644
--- a/llvm/lib/Support/Triple.cpp
+++ b/llvm/lib/Support/Triple.cpp
@@ -67,6 +67,8 @@ StringRef Triple::getArchTypeName(ArchType Kind) {
case sparcv9: return "sparcv9";
case spir64: return "spir64";
case spir: return "spir";
+ case spirv32: return "spirv32";
+ case spirv64: return "spirv64";
case systemz: return "s390x";
case tce: return "tce";
case tcele: return "tcele";
@@ -147,6 +149,10 @@ StringRef Triple::getArchTypePrefix(ArchType Kind) {
case spir:
case spir64: return "spir";
+
+ case spirv32:
+ case spirv64: return "spirv";
+
case kalimba: return "kalimba";
case lanai: return "lanai";
case shave: return "shave";
@@ -323,6 +329,8 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) {
.Case("hsail64", hsail64)
.Case("spir", spir)
.Case("spir64", spir64)
+ .Case("spirv32", spirv32)
+ .Case("spirv64", spirv64)
.Case("kalimba", kalimba)
.Case("lanai", lanai)
.Case("shave", shave)
@@ -456,6 +464,8 @@ static Triple::ArchType parseArch(StringRef ArchName) {
.Case("hsail64", Triple::hsail64)
.Case("spir", Triple::spir)
.Case("spir64", Triple::spir64)
+ .Case("spirv32", Triple::spirv32)
+ .Case("spirv64", Triple::spirv64)
.StartsWith("kalimba", Triple::kalimba)
.Case("lanai", Triple::lanai)
.Case("renderscript32", Triple::renderscript32)
@@ -653,6 +663,12 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) {
return Triple::ARMSubArch_v8_6a;
case ARM::ArchKind::ARMV8_7A:
return Triple::ARMSubArch_v8_7a;
+ case ARM::ArchKind::ARMV9A:
+ return Triple::ARMSubArch_v9;
+ case ARM::ArchKind::ARMV9_1A:
+ return Triple::ARMSubArch_v9_1a;
+ case ARM::ArchKind::ARMV9_2A:
+ return Triple::ARMSubArch_v9_2a;
case ARM::ArchKind::ARMV8R:
return Triple::ARMSubArch_v8r;
case ARM::ArchKind::ARMV8MBaseline:
@@ -753,6 +769,11 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) {
case Triple::wasm32:
case Triple::wasm64:
return Triple::Wasm;
+
+ case Triple::spirv32:
+ case Triple::spirv64:
+ // TODO: In future this will be Triple::SPIRV.
+ return Triple::UnknownObjectFormat;
}
llvm_unreachable("unknown architecture");
}
@@ -1024,6 +1045,30 @@ StringRef Triple::getArchName() const {
return StringRef(Data).split('-').first; // Isolate first component
}
+StringRef Triple::getArchName(ArchType Kind, SubArchType SubArch) const {
+ switch (Kind) {
+ case Triple::mips:
+ if (SubArch == MipsSubArch_r6)
+ return "mipsisa32r6";
+ break;
+ case Triple::mipsel:
+ if (SubArch == MipsSubArch_r6)
+ return "mipsisa32r6el";
+ break;
+ case Triple::mips64:
+ if (SubArch == MipsSubArch_r6)
+ return "mipsisa64r6";
+ break;
+ case Triple::mips64el:
+ if (SubArch == MipsSubArch_r6)
+ return "mipsisa64r6el";
+ break;
+ default:
+ break;
+ }
+ return getArchTypeName(Kind);
+}
+
StringRef Triple::getVendorName() const {
StringRef Tmp = StringRef(Data).split('-').second; // Strip first component
return Tmp.split('-').first; // Isolate second component
@@ -1205,8 +1250,8 @@ void Triple::setTriple(const Twine &Str) {
*this = Triple(Str);
}
-void Triple::setArch(ArchType Kind) {
- setArchName(getArchTypeName(Kind));
+void Triple::setArch(ArchType Kind, SubArchType SubArch) {
+ setArchName(getArchName(Kind, SubArch));
}
void Triple::setVendor(VendorType Kind) {
@@ -1298,6 +1343,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
case llvm::Triple::sparc:
case llvm::Triple::sparcel:
case llvm::Triple::spir:
+ case llvm::Triple::spirv32:
case llvm::Triple::tce:
case llvm::Triple::tcele:
case llvm::Triple::thumb:
@@ -1324,6 +1370,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
case llvm::Triple::riscv64:
case llvm::Triple::sparcv9:
case llvm::Triple::spir64:
+ case llvm::Triple::spirv64:
case llvm::Triple::systemz:
case llvm::Triple::ve:
case llvm::Triple::wasm64:
@@ -1383,6 +1430,7 @@ Triple Triple::get32BitArchVariant() const {
case Triple::sparc:
case Triple::sparcel:
case Triple::spir:
+ case Triple::spirv32:
case Triple::tce:
case Triple::tcele:
case Triple::thumb:
@@ -1398,8 +1446,12 @@ Triple Triple::get32BitArchVariant() const {
case Triple::amdil64: T.setArch(Triple::amdil); break;
case Triple::hsail64: T.setArch(Triple::hsail); break;
case Triple::le64: T.setArch(Triple::le32); break;
- case Triple::mips64: T.setArch(Triple::mips); break;
- case Triple::mips64el: T.setArch(Triple::mipsel); break;
+ case Triple::mips64:
+ T.setArch(Triple::mips, getSubArch());
+ break;
+ case Triple::mips64el:
+ T.setArch(Triple::mipsel, getSubArch());
+ break;
case Triple::nvptx64: T.setArch(Triple::nvptx); break;
case Triple::ppc64: T.setArch(Triple::ppc); break;
case Triple::ppc64le: T.setArch(Triple::ppcle); break;
@@ -1407,6 +1459,7 @@ Triple Triple::get32BitArchVariant() const {
case Triple::riscv64: T.setArch(Triple::riscv32); break;
case Triple::sparcv9: T.setArch(Triple::sparc); break;
case Triple::spir64: T.setArch(Triple::spir); break;
+ case Triple::spirv64: T.setArch(Triple::spirv32); break;
case Triple::wasm64: T.setArch(Triple::wasm32); break;
case Triple::x86_64: T.setArch(Triple::x86); break;
}
@@ -1451,6 +1504,7 @@ Triple Triple::get64BitArchVariant() const {
case Triple::riscv64:
case Triple::sparcv9:
case Triple::spir64:
+ case Triple::spirv64:
case Triple::systemz:
case Triple::ve:
case Triple::wasm64:
@@ -1464,8 +1518,12 @@ Triple Triple::get64BitArchVariant() const {
case Triple::armeb: T.setArch(Triple::aarch64_be); break;
case Triple::hsail: T.setArch(Triple::hsail64); break;
case Triple::le32: T.setArch(Triple::le64); break;
- case Triple::mips: T.setArch(Triple::mips64); break;
- case Triple::mipsel: T.setArch(Triple::mips64el); break;
+ case Triple::mips:
+ T.setArch(Triple::mips64, getSubArch());
+ break;
+ case Triple::mipsel:
+ T.setArch(Triple::mips64el, getSubArch());
+ break;
case Triple::nvptx: T.setArch(Triple::nvptx64); break;
case Triple::ppc: T.setArch(Triple::ppc64); break;
case Triple::ppcle: T.setArch(Triple::ppc64le); break;
@@ -1473,6 +1531,7 @@ Triple Triple::get64BitArchVariant() const {
case Triple::riscv32: T.setArch(Triple::riscv64); break;
case Triple::sparc: T.setArch(Triple::sparcv9); break;
case Triple::spir: T.setArch(Triple::spir64); break;
+ case Triple::spirv32: T.setArch(Triple::spirv64); break;
case Triple::thumb: T.setArch(Triple::aarch64); break;
case Triple::thumbeb: T.setArch(Triple::aarch64_be); break;
case Triple::wasm32: T.setArch(Triple::wasm64); break;
@@ -1509,6 +1568,8 @@ Triple Triple::getBigEndianArchVariant() const {
case Triple::shave:
case Triple::spir64:
case Triple::spir:
+ case Triple::spirv32:
+ case Triple::spirv64:
case Triple::wasm32:
case Triple::wasm64:
case Triple::x86:
@@ -1526,8 +1587,12 @@ Triple Triple::getBigEndianArchVariant() const {
case Triple::aarch64: T.setArch(Triple::aarch64_be); break;
case Triple::bpfel: T.setArch(Triple::bpfeb); break;
- case Triple::mips64el:T.setArch(Triple::mips64); break;
- case Triple::mipsel: T.setArch(Triple::mips); break;
+ case Triple::mips64el:
+ T.setArch(Triple::mips64, getSubArch());
+ break;
+ case Triple::mipsel:
+ T.setArch(Triple::mips, getSubArch());
+ break;
case Triple::ppcle: T.setArch(Triple::ppc); break;
case Triple::ppc64le: T.setArch(Triple::ppc64); break;
case Triple::sparcel: T.setArch(Triple::sparc); break;
@@ -1559,8 +1624,12 @@ Triple Triple::getLittleEndianArchVariant() const {
case Triple::aarch64_be: T.setArch(Triple::aarch64); break;
case Triple::bpfeb: T.setArch(Triple::bpfel); break;
- case Triple::mips64: T.setArch(Triple::mips64el); break;
- case Triple::mips: T.setArch(Triple::mipsel); break;
+ case Triple::mips64:
+ T.setArch(Triple::mips64el, getSubArch());
+ break;
+ case Triple::mips:
+ T.setArch(Triple::mipsel, getSubArch());
+ break;
case Triple::ppc: T.setArch(Triple::ppcle); break;
case Triple::ppc64: T.setArch(Triple::ppc64le); break;
case Triple::sparc: T.setArch(Triple::sparcel); break;
@@ -1604,6 +1673,8 @@ bool Triple::isLittleEndian() const {
case Triple::sparcel:
case Triple::spir64:
case Triple::spir:
+ case Triple::spirv32:
+ case Triple::spirv64:
case Triple::tcele:
case Triple::thumb:
case Triple::ve:
@@ -1709,6 +1780,7 @@ StringRef Triple::getARMCPUForArch(StringRef MArch) const {
switch (getOS()) {
case llvm::Triple::FreeBSD:
case llvm::Triple::NetBSD:
+ case llvm::Triple::OpenBSD:
if (!MArch.empty() && MArch == "v6")
return "arm1176jzf-s";
if (!MArch.empty() && MArch == "v7")
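The MIPS changes in this file all serve one purpose: preserve the r6 sub-architecture when a triple is converted between bit widths or endiannesses, instead of collapsing it to plain mips/mips64. A hedged sketch of the observable effect (the expected output assumes the mipsisa32r6 spelling is parsed into MipsSubArch_r6, as the existing parser does):

#include "llvm/ADT/Triple.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  Triple T("mipsisa32r6-unknown-linux-gnu");
  Triple T64 = T.get64BitArchVariant();
  // Should print something like: mipsisa64r6-unknown-linux-gnu
  outs() << T64.str() << "\n";
  return 0;
}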
diff --git a/llvm/lib/Support/Unix/Memory.inc b/llvm/lib/Support/Unix/Memory.inc
index be88e7db1400..b83477e0e4cc 100644
--- a/llvm/lib/Support/Unix/Memory.inc
+++ b/llvm/lib/Support/Unix/Memory.inc
@@ -29,14 +29,6 @@
#include <zircon/syscalls.h>
#endif
-#if defined(__mips__)
-# if defined(__OpenBSD__)
-# include <mips64/sysarch.h>
-# elif !defined(__FreeBSD__)
-# include <sys/cachectl.h>
-# endif
-#endif
-
#if defined(__APPLE__)
extern "C" void sys_icache_invalidate(const void *Addr, size_t len);
#else
diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc
index c37b3a54644a..19d89db55627 100644
--- a/llvm/lib/Support/Unix/Path.inc
+++ b/llvm/lib/Support/Unix/Path.inc
@@ -39,6 +39,9 @@
#include <mach-o/dyld.h>
#include <sys/attr.h>
#include <copyfile.h>
+#if __has_include(<sys/clonefile.h>)
+#include <sys/clonefile.h>
+#endif
#elif defined(__FreeBSD__)
#include <osreldate.h>
#if __FreeBSD_version >= 1300057
@@ -125,7 +128,8 @@ const file_t kInvalidFile = -1;
#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
defined(__minix) || defined(__FreeBSD_kernel__) || defined(__linux__) || \
- defined(__CYGWIN__) || defined(__DragonFly__) || defined(_AIX) || defined(__GNU__)
+ defined(__CYGWIN__) || defined(__DragonFly__) || defined(_AIX) || defined(__GNU__) || \
+ (defined(__sun__) && defined(__svr4__))
static int
test_dir(char ret[PATH_MAX], const char *dir, const char *bin)
{
@@ -283,6 +287,20 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
// Fall back to the classical detection.
if (getprogpath(exe_path, argv0))
return exe_path;
+#elif defined(__sun__) && defined(__svr4__)
+ char exe_path[PATH_MAX];
+ const char *aPath = "/proc/self/execname";
+ if (sys::fs::exists(aPath)) {
+ int fd = open(aPath, O_RDONLY);
+ if (fd == -1)
+ return "";
+ if (read(fd, exe_path, sizeof(exe_path)) < 0)
+ return "";
+ return exe_path;
+ }
+ // Fall back to the classical detection.
+ if (getprogpath(exe_path, argv0) != NULL)
+ return exe_path;
#elif defined(__MVS__)
int token = 0;
W_PSPROC buf;
@@ -1442,22 +1460,37 @@ namespace fs {
/// file descriptor variant of this function still uses the default
/// implementation.
std::error_code copy_file(const Twine &From, const Twine &To) {
- uint32_t Flag = COPYFILE_DATA;
-#if __has_builtin(__builtin_available) && defined(COPYFILE_CLONE)
+ std::string FromS = From.str();
+ std::string ToS = To.str();
+#if __has_builtin(__builtin_available)
if (__builtin_available(macos 10.12, *)) {
- bool IsSymlink;
- if (std::error_code Error = is_symlink_file(From, IsSymlink))
- return Error;
- // COPYFILE_CLONE clones the symlink instead of following it
- // and returns EEXISTS if the target file already exists.
- if (!IsSymlink && !exists(To))
- Flag = COPYFILE_CLONE;
+ // Optimistically try to use clonefile() and handle errors, rather than
+ // calling stat() to see if it'll work.
+ //
+ // Note: It's okay if From is a symlink. In contrast to the behaviour of
+ // copyfile() with COPYFILE_CLONE, clonefile() clones targets (not the
+ // symlink itself) unless the flag CLONE_NOFOLLOW is passed.
+ if (!clonefile(FromS.c_str(), ToS.c_str(), 0))
+ return std::error_code();
+
+ auto Errno = errno;
+ switch (Errno) {
+ case EEXIST: // To already exists.
+ case ENOTSUP: // Device does not support cloning.
+ case EXDEV: // From and To are on different devices.
+ break;
+ default:
+ // Anything else will also break copyfile().
+ return std::error_code(Errno, std::generic_category());
+ }
+
+ // TODO: For EEXIST, profile calling fs::generateUniqueName() and
+ // clonefile() in a retry loop (then rename() on success) before falling
+ // back to copyfile(). Depending on the size of the file this could be
+ // cheaper.
}
#endif
- int Status =
- copyfile(From.str().c_str(), To.str().c_str(), /* State */ NULL, Flag);
-
- if (Status == 0)
+ if (!copyfile(FromS.c_str(), ToS.c_str(), /*State=*/NULL, COPYFILE_DATA))
return std::error_code();
return std::error_code(errno, std::generic_category());
}
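The copy_file() rewrite above boils down to: attempt clonefile() first and only fall back to a plain copyfile() when cloning cannot work. A standalone macOS-only sketch of that strategy (illustrative helper name cloneOrCopy, not LLVM code; the __builtin_available(macos 10.12) guard is omitted here):

#include <cerrno>
#include <copyfile.h>
#include <sys/clonefile.h>
#include <system_error>

static std::error_code cloneOrCopy(const char *From, const char *To) {
  if (::clonefile(From, To, /*flags=*/0) == 0)   // cheap APFS clone
    return std::error_code();
  switch (errno) {
  case EEXIST:   // destination already exists
  case ENOTSUP:  // filesystem cannot clone
  case EXDEV:    // source and destination on different devices
    break;       // fall back to a byte-for-byte copy
  default:       // anything else would break copyfile() too
    return std::error_code(errno, std::generic_category());
  }
  if (::copyfile(From, To, /*State=*/nullptr, COPYFILE_DATA) == 0)
    return std::error_code();
  return std::error_code(errno, std::generic_category());
}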
diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc
index 30b957e6a1c4..d3d9fb7d7187 100644
--- a/llvm/lib/Support/Unix/Process.inc
+++ b/llvm/lib/Support/Unix/Process.inc
@@ -461,5 +461,4 @@ unsigned llvm::sys::Process::GetRandomNumber() {
#endif
}
-LLVM_ATTRIBUTE_NORETURN
-void Process::ExitNoCleanup(int RetCode) { _Exit(RetCode); }
+[[noreturn]] void Process::ExitNoCleanup(int RetCode) { _Exit(RetCode); }
diff --git a/llvm/lib/Support/Unix/Program.inc b/llvm/lib/Support/Unix/Program.inc
index be59bb0232de..089342030b97 100644
--- a/llvm/lib/Support/Unix/Program.inc
+++ b/llvm/lib/Support/Unix/Program.inc
@@ -71,7 +71,8 @@ ErrorOr<std::string> sys::findProgramByName(StringRef Name,
assert(!Name.empty() && "Must have a name!");
// Use the given path verbatim if it contains any slashes; this matches
// the behavior of sh(1) and friends.
- if (Name.find('/') != StringRef::npos) return std::string(Name);
+ if (Name.contains('/'))
+ return std::string(Name);
SmallVector<StringRef, 16> EnvironmentPaths;
if (Paths.empty())
diff --git a/llvm/lib/Support/Unix/Unix.h b/llvm/lib/Support/Unix/Unix.h
index 60929139598b..1599241a344a 100644
--- a/llvm/lib/Support/Unix/Unix.h
+++ b/llvm/lib/Support/Unix/Unix.h
@@ -67,11 +67,10 @@ static inline bool MakeErrMsg(
}
// Include StrError(errnum) in a fatal error message.
-LLVM_ATTRIBUTE_NORETURN static inline void ReportErrnumFatal(const char *Msg,
- int errnum) {
+[[noreturn]] static inline void ReportErrnumFatal(const char *Msg, int errnum) {
std::string ErrMsg;
MakeErrMsg(&ErrMsg, Msg, errnum);
- llvm::report_fatal_error(ErrMsg);
+ llvm::report_fatal_error(llvm::Twine(ErrMsg));
}
namespace llvm {
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp
index 15bb54e61817..9bf0384b5f1b 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -32,6 +32,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FileSystem/UniqueID.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
@@ -193,6 +194,7 @@ public:
bool RequiresNullTerminator,
bool IsVolatile) override;
std::error_code close() override;
+ void setPath(const Twine &Path) override;
};
} // namespace
@@ -228,6 +230,12 @@ std::error_code RealFile::close() {
return EC;
}
+void RealFile::setPath(const Twine &Path) {
+ RealName = Path.str();
+ if (auto Status = status())
+ S = Status.get().copyWithNewName(Status.get(), Path);
+}
+
namespace {
/// A file system according to your operating system.
@@ -442,7 +450,7 @@ std::error_code OverlayFileSystem::isLocal(const Twine &Path, bool &Result) {
std::error_code
OverlayFileSystem::getRealPath(const Twine &Path,
SmallVectorImpl<char> &Output) const {
- for (auto &FS : FSList)
+ for (const auto &FS : FSList)
if (FS->exists(Path))
return FS->getRealPath(Path, Output);
return errc::no_such_file_or_directory;
@@ -638,6 +646,8 @@ public:
}
std::error_code close() override { return {}; }
+
+ void setPath(const Twine &Path) override { RequestedName = Path.str(); }
};
} // namespace
@@ -655,6 +665,9 @@ public:
Status getStatus(const Twine &RequestedName) const {
return Status::copyWithNewName(Stat, RequestedName);
}
+
+ UniqueID getUniqueID() const { return Stat.getUniqueID(); }
+
InMemoryNode *getChild(StringRef Name) {
auto I = Entries.find(Name);
if (I != Entries.end())
@@ -698,10 +711,28 @@ Status getNodeStatus(const InMemoryNode *Node, const Twine &RequestedName) {
} // namespace
} // namespace detail
+// The UniqueID of in-memory files is derived from path and content.
+// This avoids difficulties in creating exactly equivalent in-memory FSes,
+// as often needed in multithreaded programs.
+static sys::fs::UniqueID getUniqueID(hash_code Hash) {
+ return sys::fs::UniqueID(std::numeric_limits<uint64_t>::max(),
+ uint64_t(size_t(Hash)));
+}
+static sys::fs::UniqueID getFileID(sys::fs::UniqueID Parent,
+ llvm::StringRef Name,
+ llvm::StringRef Contents) {
+ return getUniqueID(llvm::hash_combine(Parent.getFile(), Name, Contents));
+}
+static sys::fs::UniqueID getDirectoryID(sys::fs::UniqueID Parent,
+ llvm::StringRef Name) {
+ return getUniqueID(llvm::hash_combine(Parent.getFile(), Name));
+}
+
InMemoryFileSystem::InMemoryFileSystem(bool UseNormalizedPaths)
: Root(new detail::InMemoryDirectory(
- Status("", getNextVirtualUniqueID(), llvm::sys::TimePoint<>(), 0, 0,
- 0, llvm::sys::fs::file_type::directory_file,
+ Status("", getDirectoryID(llvm::sys::fs::UniqueID(), ""),
+ llvm::sys::TimePoint<>(), 0, 0, 0,
+ llvm::sys::fs::file_type::directory_file,
llvm::sys::fs::perms::all_all))),
UseNormalizedPaths(UseNormalizedPaths) {}
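With the getFileID()/getDirectoryID() helpers above, independently built in-memory file systems report identical UniqueIDs for identical paths and contents. A small sketch of that property (assumes the usual vfs::InMemoryFileSystem and MemoryBuffer APIs):

#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/VirtualFileSystem.h"
#include <cassert>

static void checkStableInMemoryUniqueIDs() {
  llvm::vfs::InMemoryFileSystem A, B;
  A.addFile("/dir/f.txt", /*ModificationTime=*/0,
            llvm::MemoryBuffer::getMemBuffer("hello"));
  B.addFile("/dir/f.txt", /*ModificationTime=*/0,
            llvm::MemoryBuffer::getMemBuffer("hello"));
  auto SA = A.status("/dir/f.txt");
  auto SB = B.status("/dir/f.txt");
  // Same path and contents => same UniqueID, even across separate instances.
  assert(SA && SB && SA->getUniqueID() == SB->getUniqueID());
}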
@@ -754,10 +785,14 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime,
Child.reset(new detail::InMemoryHardLink(P.str(), *HardLinkTarget));
else {
// Create a new file or directory.
- Status Stat(P.str(), getNextVirtualUniqueID(),
- llvm::sys::toTimePoint(ModificationTime), ResolvedUser,
- ResolvedGroup, Buffer->getBufferSize(), ResolvedType,
- ResolvedPerms);
+ Status Stat(
+ P.str(),
+ (ResolvedType == sys::fs::file_type::directory_file)
+ ? getDirectoryID(Dir->getUniqueID(), Name)
+ : getFileID(Dir->getUniqueID(), Name, Buffer->getBuffer()),
+ llvm::sys::toTimePoint(ModificationTime), ResolvedUser,
+ ResolvedGroup, Buffer->getBufferSize(), ResolvedType,
+ ResolvedPerms);
if (ResolvedType == sys::fs::file_type::directory_file) {
Child.reset(new detail::InMemoryDirectory(std::move(Stat)));
} else {
@@ -772,9 +807,9 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime,
// Create a new directory. Use the path up to here.
Status Stat(
StringRef(Path.str().begin(), Name.end() - Path.str().begin()),
- getNextVirtualUniqueID(), llvm::sys::toTimePoint(ModificationTime),
- ResolvedUser, ResolvedGroup, 0, sys::fs::file_type::directory_file,
- NewDirectoryPerms);
+ getDirectoryID(Dir->getUniqueID(), Name),
+ llvm::sys::toTimePoint(ModificationTime), ResolvedUser, ResolvedGroup,
+ 0, sys::fs::file_type::directory_file, NewDirectoryPerms);
Dir = cast<detail::InMemoryDirectory>(Dir->addChild(
Name, std::make_unique<detail::InMemoryDirectory>(std::move(Stat))));
continue;
@@ -1015,9 +1050,10 @@ static llvm::sys::path::Style getExistingStyle(llvm::StringRef Path) {
// Detect the path style in use by checking the first separator.
llvm::sys::path::Style style = llvm::sys::path::Style::native;
const size_t n = Path.find_first_of("/\\");
+ // Can't distinguish between posix and windows_slash here.
if (n != static_cast<size_t>(-1))
style = (Path[n] == '/') ? llvm::sys::path::Style::posix
- : llvm::sys::path::Style::windows;
+ : llvm::sys::path::Style::windows_backslash;
return style;
}
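getExistingStyle() above now answers windows_backslash instead of the old windows alias, and a lone forward slash is still reported as posix because it cannot be told apart from windows_slash. An illustrative standalone helper mirroring that detection (not LLVM API):

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Path.h"

static llvm::sys::path::Style classifyBySeparator(llvm::StringRef Path) {
  const size_t N = Path.find_first_of("/\\");
  if (N == llvm::StringRef::npos)
    return llvm::sys::path::Style::native;   // no separator: assume native
  // A '/' could equally mean windows_slash; posix is the best guess that the
  // separator alone allows.
  return Path[N] == '/' ? llvm::sys::path::Style::posix
                        : llvm::sys::path::Style::windows_backslash;
}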
@@ -1091,6 +1127,7 @@ public:
}
};
+namespace {
/// Directory iterator implementation for \c RedirectingFileSystem's
/// directory remap entries that maps the paths reported by the external
/// file system's directory iterator back to the virtual directory's path.
@@ -1129,6 +1166,7 @@ public:
return EC;
}
};
+} // namespace
llvm::ErrorOr<std::string>
RedirectingFileSystem::getCurrentWorkingDirectory() const {
@@ -1161,8 +1199,10 @@ std::error_code RedirectingFileSystem::isLocal(const Twine &Path_,
}
std::error_code RedirectingFileSystem::makeAbsolute(SmallVectorImpl<char> &Path) const {
+ // is_absolute(..., Style::windows_*) accepts paths with both slash types.
if (llvm::sys::path::is_absolute(Path, llvm::sys::path::Style::posix) ||
- llvm::sys::path::is_absolute(Path, llvm::sys::path::Style::windows))
+ llvm::sys::path::is_absolute(Path,
+ llvm::sys::path::Style::windows_backslash))
return {};
auto WorkingDir = getCurrentWorkingDirectory();
@@ -1173,9 +1213,15 @@ std::error_code RedirectingFileSystem::makeAbsolute(SmallVectorImpl<char> &Path)
// is native and there is no way to override that. Since we know WorkingDir
// is absolute, we can use it to determine which style we actually have and
// append Path ourselves.
- sys::path::Style style = sys::path::Style::windows;
+ sys::path::Style style = sys::path::Style::windows_backslash;
if (sys::path::is_absolute(WorkingDir.get(), sys::path::Style::posix)) {
style = sys::path::Style::posix;
+ } else {
+ // Distinguish between windows_backslash and windows_slash; getExistingStyle
+ // returns posix for a path with windows_slash.
+ if (getExistingStyle(WorkingDir.get()) !=
+ sys::path::Style::windows_backslash)
+ style = sys::path::Style::windows_slash;
}
std::string Result = WorkingDir.get();
@@ -1207,7 +1253,7 @@ directory_iterator RedirectingFileSystem::dir_begin(const Twine &Dir,
}
// Use status to make sure the path exists and refers to a directory.
- ErrorOr<Status> S = status(Path, *Result);
+ ErrorOr<Status> S = status(Path, Dir, *Result);
if (!S) {
if (shouldFallBackToExternalFS(S.getError(), Result->E))
return ExternalFS->dir_begin(Dir, EC);
@@ -1593,8 +1639,9 @@ private:
// which style we have, and use it consistently.
if (sys::path::is_absolute(Name, sys::path::Style::posix)) {
path_style = sys::path::Style::posix;
- } else if (sys::path::is_absolute(Name, sys::path::Style::windows)) {
- path_style = sys::path::Style::windows;
+ } else if (sys::path::is_absolute(Name,
+ sys::path::Style::windows_backslash)) {
+ path_style = sys::path::Style::windows_backslash;
} else {
assert(NameValueNode && "Name presence should be checked earlier");
error(NameValueNode,
@@ -1933,47 +1980,68 @@ RedirectingFileSystem::lookupPathImpl(
return make_error_code(llvm::errc::no_such_file_or_directory);
}
-static Status getRedirectedFileStatus(const Twine &Path, bool UseExternalNames,
+static Status getRedirectedFileStatus(const Twine &OriginalPath,
+ bool UseExternalNames,
Status ExternalStatus) {
Status S = ExternalStatus;
if (!UseExternalNames)
- S = Status::copyWithNewName(S, Path);
+ S = Status::copyWithNewName(S, OriginalPath);
S.IsVFSMapped = true;
return S;
}
ErrorOr<Status> RedirectingFileSystem::status(
- const Twine &Path, const RedirectingFileSystem::LookupResult &Result) {
+ const Twine &CanonicalPath, const Twine &OriginalPath,
+ const RedirectingFileSystem::LookupResult &Result) {
if (Optional<StringRef> ExtRedirect = Result.getExternalRedirect()) {
- ErrorOr<Status> S = ExternalFS->status(*ExtRedirect);
+ SmallString<256> CanonicalRemappedPath((*ExtRedirect).str());
+ if (std::error_code EC = makeCanonical(CanonicalRemappedPath))
+ return EC;
+
+ ErrorOr<Status> S = ExternalFS->status(CanonicalRemappedPath);
if (!S)
return S;
+ S = Status::copyWithNewName(*S, *ExtRedirect);
auto *RE = cast<RedirectingFileSystem::RemapEntry>(Result.E);
- return getRedirectedFileStatus(Path, RE->useExternalName(UseExternalNames),
- *S);
+ return getRedirectedFileStatus(OriginalPath,
+ RE->useExternalName(UseExternalNames), *S);
}
auto *DE = cast<RedirectingFileSystem::DirectoryEntry>(Result.E);
- return Status::copyWithNewName(DE->getStatus(), Path);
+ return Status::copyWithNewName(DE->getStatus(), CanonicalPath);
}
-ErrorOr<Status> RedirectingFileSystem::status(const Twine &Path_) {
- SmallString<256> Path;
- Path_.toVector(Path);
+ErrorOr<Status>
+RedirectingFileSystem::getExternalStatus(const Twine &CanonicalPath,
+ const Twine &OriginalPath) const {
+ if (auto Result = ExternalFS->status(CanonicalPath)) {
+ return Result.get().copyWithNewName(Result.get(), OriginalPath);
+ } else {
+ return Result.getError();
+ }
+}
- if (std::error_code EC = makeCanonical(Path))
+ErrorOr<Status> RedirectingFileSystem::status(const Twine &OriginalPath) {
+ SmallString<256> CanonicalPath;
+ OriginalPath.toVector(CanonicalPath);
+
+ if (std::error_code EC = makeCanonical(CanonicalPath))
return EC;
- ErrorOr<RedirectingFileSystem::LookupResult> Result = lookupPath(Path);
+ ErrorOr<RedirectingFileSystem::LookupResult> Result =
+ lookupPath(CanonicalPath);
if (!Result) {
- if (shouldFallBackToExternalFS(Result.getError()))
- return ExternalFS->status(Path);
+ if (shouldFallBackToExternalFS(Result.getError())) {
+ return getExternalStatus(CanonicalPath, OriginalPath);
+ }
return Result.getError();
}
- ErrorOr<Status> S = status(Path, *Result);
- if (!S && shouldFallBackToExternalFS(S.getError(), Result->E))
- S = ExternalFS->status(Path);
+ ErrorOr<Status> S = status(CanonicalPath, OriginalPath, *Result);
+ if (!S && shouldFallBackToExternalFS(S.getError(), Result->E)) {
+ return getExternalStatus(CanonicalPath, OriginalPath);
+ }
+
return S;
}
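The status() split above keeps two spellings of the same path around: the canonical one used for lookup and the spelling the caller originally used, which is what the returned Status reports unless external names are in effect. The re-labelling mechanism is simply Status::copyWithNewName(); a minimal sketch (not internal LLVM code):

#include "llvm/ADT/Twine.h"
#include "llvm/Support/VirtualFileSystem.h"

static llvm::vfs::Status relabel(const llvm::vfs::Status &External,
                                 const llvm::Twine &OriginalPath) {
  // Keeps size, permissions, UniqueID, etc., and only swaps the reported
  // name, so callers see the path exactly as they spelled it.
  return llvm::vfs::Status::copyWithNewName(External, OriginalPath);
}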
@@ -1998,22 +2066,39 @@ public:
}
std::error_code close() override { return InnerFile->close(); }
+
+ void setPath(const Twine &Path) override { S = S.copyWithNewName(S, Path); }
};
} // namespace
ErrorOr<std::unique_ptr<File>>
-RedirectingFileSystem::openFileForRead(const Twine &Path_) {
- SmallString<256> Path;
- Path_.toVector(Path);
+File::getWithPath(ErrorOr<std::unique_ptr<File>> Result, const Twine &P) {
+ if (!Result)
+ return Result;
- if (std::error_code EC = makeCanonical(Path))
+ ErrorOr<std::unique_ptr<File>> F = std::move(*Result);
+ auto Name = F->get()->getName();
+ if (Name && Name.get() != P.str())
+ F->get()->setPath(P);
+ return F;
+}
+
+ErrorOr<std::unique_ptr<File>>
+RedirectingFileSystem::openFileForRead(const Twine &OriginalPath) {
+ SmallString<256> CanonicalPath;
+ OriginalPath.toVector(CanonicalPath);
+
+ if (std::error_code EC = makeCanonical(CanonicalPath))
return EC;
- ErrorOr<RedirectingFileSystem::LookupResult> Result = lookupPath(Path);
+ ErrorOr<RedirectingFileSystem::LookupResult> Result =
+ lookupPath(CanonicalPath);
if (!Result) {
if (shouldFallBackToExternalFS(Result.getError()))
- return ExternalFS->openFileForRead(Path);
+ return File::getWithPath(ExternalFS->openFileForRead(CanonicalPath),
+ OriginalPath);
+
return Result.getError();
}
@@ -2021,12 +2106,18 @@ RedirectingFileSystem::openFileForRead(const Twine &Path_) {
return make_error_code(llvm::errc::invalid_argument);
StringRef ExtRedirect = *Result->getExternalRedirect();
+ SmallString<256> CanonicalRemappedPath(ExtRedirect.str());
+ if (std::error_code EC = makeCanonical(CanonicalRemappedPath))
+ return EC;
+
auto *RE = cast<RedirectingFileSystem::RemapEntry>(Result->E);
- auto ExternalFile = ExternalFS->openFileForRead(ExtRedirect);
+ auto ExternalFile = File::getWithPath(
+ ExternalFS->openFileForRead(CanonicalRemappedPath), ExtRedirect);
if (!ExternalFile) {
if (shouldFallBackToExternalFS(ExternalFile.getError(), Result->E))
- return ExternalFS->openFileForRead(Path);
+ return File::getWithPath(ExternalFS->openFileForRead(CanonicalPath),
+ OriginalPath);
return ExternalFile;
}
@@ -2036,7 +2127,7 @@ RedirectingFileSystem::openFileForRead(const Twine &Path_) {
// FIXME: Update the status with the name and VFSMapped.
Status S = getRedirectedFileStatus(
- Path, RE->useExternalName(UseExternalNames), *ExternalStatus);
+ OriginalPath, RE->useExternalName(UseExternalNames), *ExternalStatus);
return std::unique_ptr<File>(
std::make_unique<FileWithFixedStatus>(std::move(*ExternalFile), S));
}
diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc
index c1d291731a88..b15e71a9ce2a 100644
--- a/llvm/lib/Support/Windows/Path.inc
+++ b/llvm/lib/Support/Windows/Path.inc
@@ -74,6 +74,11 @@ std::error_code widenPath(const Twine &Path8, SmallVectorImpl<wchar_t> &Path16,
SmallString<MAX_PATH> Path8Str;
Path8.toVector(Path8Str);
+ // If the path is a long path, mangled into forward slashes, normalize
+ // back to backslashes here.
+ if (Path8Str.startswith("//?/"))
+ llvm::sys::path::native(Path8Str, path::Style::windows_backslash);
+
if (std::error_code EC = UTF8ToUTF16(Path8Str, Path16))
return EC;
@@ -100,8 +105,10 @@ std::error_code widenPath(const Twine &Path8, SmallVectorImpl<wchar_t> &Path16,
}
// Remove '.' and '..' because long paths treat these as real path components.
+ // Explicitly use the backslash form here, as we're prepending the \\?\
+ // prefix.
llvm::sys::path::native(Path8Str, path::Style::windows);
- llvm::sys::path::remove_dots(Path8Str, true);
+ llvm::sys::path::remove_dots(Path8Str, true, path::Style::windows);
const StringRef RootName = llvm::sys::path::root_name(Path8Str);
assert(!RootName.empty() &&
@@ -145,6 +152,7 @@ std::string getMainExecutable(const char *argv0, void *MainExecAddr) {
if (UTF16ToUTF8(PathName.data(), PathName.size(), PathNameUTF8))
return "";
+ llvm::sys::path::make_preferred(PathNameUTF8);
return std::string(PathNameUTF8.data());
}
@@ -207,7 +215,13 @@ std::error_code current_path(SmallVectorImpl<char> &result) {
// On success, GetCurrentDirectoryW returns the number of characters not
// including the null-terminator.
cur_path.set_size(len);
- return UTF16ToUTF8(cur_path.begin(), cur_path.size(), result);
+
+ if (std::error_code EC =
+ UTF16ToUTF8(cur_path.begin(), cur_path.size(), result))
+ return EC;
+
+ llvm::sys::path::make_preferred(result);
+ return std::error_code();
}
std::error_code set_current_path(const Twine &path) {
@@ -388,7 +402,11 @@ static std::error_code realPathFromHandle(HANDLE H,
}
// Convert the result from UTF-16 to UTF-8.
- return UTF16ToUTF8(Data, CountChars, RealPath);
+ if (std::error_code EC = UTF16ToUTF8(Data, CountChars, RealPath))
+ return EC;
+
+ llvm::sys::path::make_preferred(RealPath);
+ return std::error_code();
}
std::error_code is_local(int FD, bool &Result) {
@@ -416,8 +434,7 @@ static std::error_code setDeleteDisposition(HANDLE Handle, bool Delete) {
// Check if the file is on a network (non-local) drive. If so, don't
// continue when DeleteFile is true, since it prevents opening the file for
- // writes. Note -- this will leak temporary files on disk, but only when the
- // target file is on a network drive.
+ // writes.
SmallVector<wchar_t, 128> FinalPath;
if (std::error_code EC = realPathFromHandle(Handle, FinalPath))
return EC;
@@ -427,7 +444,7 @@ static std::error_code setDeleteDisposition(HANDLE Handle, bool Delete) {
return EC;
if (!IsLocal)
- return std::error_code();
+ return errc::not_supported;
// The file is on a local drive, we can safely set FILE_DISPOSITION_INFO's
// flag.
@@ -1183,12 +1200,6 @@ Expected<file_t> openNativeFile(const Twine &Name, CreationDisposition Disp,
}
}
- if (Flags & OF_Delete) {
- if ((EC = setDeleteDisposition(Result, true))) {
- ::CloseHandle(Result);
- return errorCodeToError(EC);
- }
- }
return Result;
}
@@ -1414,6 +1425,8 @@ static bool getKnownFolderPath(KNOWNFOLDERID folderId,
bool ok = !UTF16ToUTF8(path, ::wcslen(path), result);
::CoTaskMemFree(path);
+ if (ok)
+ llvm::sys::path::make_preferred(result);
return ok;
}
@@ -1474,6 +1487,7 @@ void system_temp_directory(bool ErasedOnReboot, SmallVectorImpl<char> &Result) {
// Fall back to a system default.
const char *DefaultResult = "C:\\Temp";
Result.append(DefaultResult, DefaultResult + strlen(DefaultResult));
+ llvm::sys::path::make_preferred(Result);
}
} // end namespace path
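The recurring make_preferred() calls above normalize whatever the Windows APIs return to the preferred backslash separators before a path is handed back to callers. A tiny usage sketch (assumes the one-argument overload used throughout this patch):

#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Path.h"

static void normalizeSeparators() {
  llvm::SmallString<64> P("C:/foo/bar");
  llvm::sys::path::make_preferred(P);
  // On Windows, P now reads "C:\foo\bar".
}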
diff --git a/llvm/lib/Support/Windows/Process.inc b/llvm/lib/Support/Windows/Process.inc
index 6f58c52e0746..6732063b562e 100644
--- a/llvm/lib/Support/Windows/Process.inc
+++ b/llvm/lib/Support/Windows/Process.inc
@@ -261,6 +261,7 @@ windows::GetCommandLineArguments(SmallVectorImpl<const char *> &Args,
EC = GetExecutableName(Filename);
if (EC)
return EC;
+ sys::path::make_preferred(Arg0);
sys::path::append(Arg0, Filename);
Args[0] = Saver.save(Arg0).data();
return std::error_code();
@@ -504,8 +505,7 @@ bool llvm::RunningWindows8OrGreater() {
return GetWindowsOSVersion() >= llvm::VersionTuple(6, 2, 0, 0);
}
-LLVM_ATTRIBUTE_NORETURN
-void Process::ExitNoCleanup(int RetCode) {
+[[noreturn]] void Process::ExitNoCleanup(int RetCode) {
TerminateProcess(GetCurrentProcess(), RetCode);
llvm_unreachable("TerminateProcess doesn't return");
}
diff --git a/llvm/lib/Support/Windows/Program.inc b/llvm/lib/Support/Windows/Program.inc
index 824834c1cbbe..a9cf2db7ec72 100644
--- a/llvm/lib/Support/Windows/Program.inc
+++ b/llvm/lib/Support/Windows/Program.inc
@@ -103,6 +103,7 @@ ErrorOr<std::string> sys::findProgramByName(StringRef Name,
if (U8Result.empty())
return mapWindowsError(::GetLastError());
+ llvm::sys::path::make_preferred(U8Result);
return std::string(U8Result.begin(), U8Result.end());
}
diff --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp
index c9530659caad..ab49ac548f89 100644
--- a/llvm/lib/Support/X86TargetParser.cpp
+++ b/llvm/lib/Support/X86TargetParser.cpp
@@ -11,7 +11,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/X86TargetParser.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
+#include <numeric>
using namespace llvm;
using namespace llvm::X86;
@@ -137,8 +139,8 @@ constexpr FeatureBitset FeaturesNocona =
// Basic 64-bit capable CPU.
constexpr FeatureBitset FeaturesX86_64 = FeaturesPentium4 | Feature64BIT;
constexpr FeatureBitset FeaturesX86_64_V2 = FeaturesX86_64 | FeatureSAHF |
- FeaturePOPCNT | FeatureSSE4_2 |
- FeatureCMPXCHG16B;
+ FeaturePOPCNT | FeatureCRC32 |
+ FeatureSSE4_2 | FeatureCMPXCHG16B;
constexpr FeatureBitset FeaturesX86_64_V3 =
FeaturesX86_64_V2 | FeatureAVX2 | FeatureBMI | FeatureBMI2 | FeatureF16C |
FeatureFMA | FeatureLZCNT | FeatureMOVBE | FeatureXSAVE;
@@ -151,7 +153,7 @@ constexpr FeatureBitset FeaturesCore2 =
FeaturesNocona | FeatureSAHF | FeatureSSSE3;
constexpr FeatureBitset FeaturesPenryn = FeaturesCore2 | FeatureSSE4_1;
constexpr FeatureBitset FeaturesNehalem =
- FeaturesPenryn | FeaturePOPCNT | FeatureSSE4_2;
+ FeaturesPenryn | FeaturePOPCNT | FeatureCRC32 | FeatureSSE4_2;
constexpr FeatureBitset FeaturesWestmere = FeaturesNehalem | FeaturePCLMUL;
constexpr FeatureBitset FeaturesSandyBridge =
FeaturesWestmere | FeatureAVX | FeatureXSAVE | FeatureXSAVEOPT;
@@ -201,11 +203,11 @@ constexpr FeatureBitset FeaturesTigerlake =
FeaturesICLClient | FeatureAVX512VP2INTERSECT | FeatureMOVDIR64B |
FeatureCLWB | FeatureMOVDIRI | FeatureSHSTK | FeatureKL | FeatureWIDEKL;
constexpr FeatureBitset FeaturesSapphireRapids =
- FeaturesICLServer | FeatureAMX_TILE | FeatureAMX_INT8 | FeatureAMX_BF16 |
- FeatureAVX512BF16 | FeatureAVX512VP2INTERSECT | FeatureCLDEMOTE |
- FeatureENQCMD | FeatureMOVDIR64B | FeatureMOVDIRI | FeaturePTWRITE |
- FeatureSERIALIZE | FeatureSHSTK | FeatureTSXLDTRK | FeatureUINTR |
- FeatureWAITPKG | FeatureAVXVNNI;
+ FeaturesICLServer | FeatureAMX_BF16 | FeatureAMX_INT8 | FeatureAMX_TILE |
+ FeatureAVX512BF16 | FeatureAVX512FP16 | FeatureAVX512VP2INTERSECT |
+ FeatureAVXVNNI | FeatureCLDEMOTE | FeatureENQCMD | FeatureMOVDIR64B |
+ FeatureMOVDIRI | FeaturePTWRITE | FeatureSERIALIZE | FeatureSHSTK |
+ FeatureTSXLDTRK | FeatureUINTR | FeatureWAITPKG;
// Intel Atom processors.
// Bonnell has feature parity with Core2 and adds MOVBE.
@@ -254,16 +256,17 @@ constexpr FeatureBitset FeaturesBTVER1 =
FeatureSSE | FeatureSSE2 | FeatureSSE3 | FeatureSSSE3 | FeatureSSE4_A |
FeatureSAHF;
constexpr FeatureBitset FeaturesBTVER2 =
- FeaturesBTVER1 | FeatureAES | FeatureAVX | FeatureBMI | FeatureF16C |
- FeatureMOVBE | FeaturePCLMUL | FeatureXSAVE | FeatureXSAVEOPT;
+ FeaturesBTVER1 | FeatureAES | FeatureAVX | FeatureBMI | FeatureCRC32 |
+ FeatureF16C | FeatureMOVBE | FeaturePCLMUL | FeatureXSAVE | FeatureXSAVEOPT;
// AMD Bulldozer architecture processors.
constexpr FeatureBitset FeaturesBDVER1 =
FeatureX87 | FeatureAES | FeatureAVX | FeatureCMPXCHG8B |
- FeatureCMPXCHG16B | Feature64BIT | FeatureFMA4 | FeatureFXSR | FeatureLWP |
- FeatureLZCNT | FeatureMMX | FeaturePCLMUL | FeaturePOPCNT | FeaturePRFCHW |
- FeatureSAHF | FeatureSSE | FeatureSSE2 | FeatureSSE3 | FeatureSSSE3 |
- FeatureSSE4_1 | FeatureSSE4_2 | FeatureSSE4_A | FeatureXOP | FeatureXSAVE;
+ FeatureCMPXCHG16B | FeatureCRC32 | Feature64BIT | FeatureFMA4 |
+ FeatureFXSR | FeatureLWP | FeatureLZCNT | FeatureMMX | FeaturePCLMUL |
+ FeaturePOPCNT | FeaturePRFCHW | FeatureSAHF | FeatureSSE | FeatureSSE2 |
+ FeatureSSE3 | FeatureSSSE3 | FeatureSSE4_1 | FeatureSSE4_2 | FeatureSSE4_A |
+ FeatureXOP | FeatureXSAVE;
constexpr FeatureBitset FeaturesBDVER2 =
FeaturesBDVER1 | FeatureBMI | FeatureFMA | FeatureF16C | FeatureTBM;
constexpr FeatureBitset FeaturesBDVER3 =
@@ -276,9 +279,9 @@ constexpr FeatureBitset FeaturesBDVER4 = FeaturesBDVER3 | FeatureAVX2 |
constexpr FeatureBitset FeaturesZNVER1 =
FeatureX87 | FeatureADX | FeatureAES | FeatureAVX | FeatureAVX2 |
FeatureBMI | FeatureBMI2 | FeatureCLFLUSHOPT | FeatureCLZERO |
- FeatureCMPXCHG8B | FeatureCMPXCHG16B | Feature64BIT | FeatureF16C |
- FeatureFMA | FeatureFSGSBASE | FeatureFXSR | FeatureLZCNT | FeatureMMX |
- FeatureMOVBE | FeatureMWAITX | FeaturePCLMUL | FeaturePOPCNT |
+ FeatureCMPXCHG8B | FeatureCMPXCHG16B | FeatureCRC32 | Feature64BIT |
+ FeatureF16C | FeatureFMA | FeatureFSGSBASE | FeatureFXSR | FeatureLZCNT |
+ FeatureMMX | FeatureMOVBE | FeatureMWAITX | FeaturePCLMUL | FeaturePOPCNT |
FeaturePRFCHW | FeatureRDRND | FeatureRDSEED | FeatureSAHF | FeatureSHA |
FeatureSSE | FeatureSSE2 | FeatureSSE3 | FeatureSSSE3 | FeatureSSE4_1 |
FeatureSSE4_2 | FeatureSSE4_A | FeatureXSAVE | FeatureXSAVEC |
@@ -470,6 +473,7 @@ constexpr FeatureBitset ImpliedFeaturesCLZERO = {};
constexpr FeatureBitset ImpliedFeaturesCMOV = {};
constexpr FeatureBitset ImpliedFeaturesCMPXCHG16B = {};
constexpr FeatureBitset ImpliedFeaturesCMPXCHG8B = {};
+constexpr FeatureBitset ImpliedFeaturesCRC32 = {};
constexpr FeatureBitset ImpliedFeaturesENQCMD = {};
constexpr FeatureBitset ImpliedFeaturesFSGSBASE = {};
constexpr FeatureBitset ImpliedFeaturesFXSR = {};
@@ -576,6 +580,8 @@ constexpr FeatureBitset ImpliedFeaturesAMX_BF16 = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesHRESET = {};
+static constexpr FeatureBitset ImpliedFeaturesAVX512FP16 =
+ FeatureAVX512BW | FeatureAVX512DQ | FeatureAVX512VL;
// Key Locker Features
constexpr FeatureBitset ImpliedFeaturesKL = FeatureSSE2;
constexpr FeatureBitset ImpliedFeaturesWIDEKL = FeatureKL;
@@ -660,3 +666,45 @@ void llvm::X86::updateImpliedFeatures(
if (ImpliedBits[i] && !FeatureInfos[i].Name.empty())
Features[FeatureInfos[i].Name] = Enabled;
}
+
+uint64_t llvm::X86::getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs) {
+ // Processor features and mapping to processor feature value.
+ uint64_t FeaturesMask = 0;
+ for (const StringRef &FeatureStr : FeatureStrs) {
+ unsigned Feature = StringSwitch<unsigned>(FeatureStr)
+#define X86_FEATURE_COMPAT(ENUM, STR, PRIORITY) \
+ .Case(STR, llvm::X86::FEATURE_##ENUM)
+#include "llvm/Support/X86TargetParser.def"
+ ;
+ FeaturesMask |= (1ULL << Feature);
+ }
+ return FeaturesMask;
+}
+
+unsigned llvm::X86::getFeaturePriority(ProcessorFeatures Feat) {
+#ifndef NDEBUG
+ // Check that priorities are set properly in the .def file. We expect that
+ // "compat" features are assigned non-duplicate consecutive priorities
+ // starting from zero (0, 1, ..., num_features - 1).
+#define X86_FEATURE_COMPAT(ENUM, STR, PRIORITY) PRIORITY,
+ unsigned Priorities[] = {
+#include "llvm/Support/X86TargetParser.def"
+ std::numeric_limits<unsigned>::max() // Need to consume last comma.
+ };
+ std::array<unsigned, array_lengthof(Priorities) - 1> HelperList;
+ std::iota(HelperList.begin(), HelperList.end(), 0);
+ assert(std::is_permutation(HelperList.begin(), HelperList.end(),
+ std::begin(Priorities),
+ std::prev(std::end(Priorities))) &&
+ "Priorities don't form consecutive range!");
+#endif
+
+ switch (Feat) {
+#define X86_FEATURE_COMPAT(ENUM, STR, PRIORITY) \
+ case X86::FEATURE_##ENUM: \
+ return PRIORITY;
+#include "llvm/Support/X86TargetParser.def"
+ default:
+ llvm_unreachable("No Feature Priority for non-CPUSupports Features");
+ }
+}
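getCpuSupportsMask() above folds a list of "compat" feature names into a bitmask over the X86::FEATURE_* values used for runtime CPU dispatch. A usage sketch (the spellings "sse4.2" and "avx2" are assumed to be compat names from X86TargetParser.def):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/X86TargetParser.h"
#include <cstdint>

static uint64_t maskForSse42AndAvx2() {
  const llvm::StringRef Features[] = {"sse4.2", "avx2"};
  return llvm::X86::getCpuSupportsMask(Features);
}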
diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp
index d4e1c884d125..4590a3d19b0d 100644
--- a/llvm/lib/Support/raw_ostream.cpp
+++ b/llvm/lib/Support/raw_ostream.cpp
@@ -185,7 +185,7 @@ raw_ostream &raw_ostream::write_escaped(StringRef Str,
// Write out the escaped representation.
if (UseHexEscapes) {
*this << '\\' << 'x';
- *this << hexdigit((c >> 4 & 0xF));
+ *this << hexdigit((c >> 4) & 0xF);
*this << hexdigit((c >> 0) & 0xF);
} else {
// Always use a full 3-character octal escape.
@@ -679,7 +679,8 @@ raw_fd_ostream::~raw_fd_ostream() {
// has_error() and clear the error flag with clear_error() before
// destructing raw_ostream objects which may have errors.
if (has_error())
- report_fatal_error("IO failure on output stream: " + error().message(),
+ report_fatal_error(Twine("IO failure on output stream: ") +
+ error().message(),
/*gen_crash_diag=*/false);
}
diff --git a/llvm/lib/TableGen/Main.cpp b/llvm/lib/TableGen/Main.cpp
index 0b1024648b66..762255b43136 100644
--- a/llvm/lib/TableGen/Main.cpp
+++ b/llvm/lib/TableGen/Main.cpp
@@ -55,6 +55,10 @@ WriteIfChanged("write-if-changed", cl::desc("Only write output if it changed"));
static cl::opt<bool>
TimePhases("time-phases", cl::desc("Time phases of parser and backend"));
+static cl::opt<bool> NoWarnOnUnusedTemplateArgs(
+ "no-warn-on-unused-template-args",
+ cl::desc("Disable unused template argument warnings."));
+
static int reportError(const char *ProgName, Twine Msg) {
errs() << ProgName << ": " << Msg;
errs().flush();
@@ -107,7 +111,7 @@ int llvm::TableGenMain(const char *argv0, TableGenMainFn *MainFn) {
// it later.
SrcMgr.setIncludeDirs(IncludeDirs);
- TGParser Parser(SrcMgr, MacroNames, Records);
+ TGParser Parser(SrcMgr, MacroNames, Records, NoWarnOnUnusedTemplateArgs);
if (Parser.ParseFile())
return 1;
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index 8663863d968f..eb7d4838a9f6 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -25,6 +25,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Error.h"
@@ -41,24 +42,70 @@ using namespace llvm;
#define DEBUG_TYPE "tblgen-records"
-static BumpPtrAllocator Allocator;
+//===----------------------------------------------------------------------===//
+// Context
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+namespace detail {
+/// This class contains all of the contextual static state of the Record
+/// classes. This allows for better lifetime management and control of the used
+/// static data.
+struct RecordContext {
+ RecordContext()
+ : AnyRecord(0), TrueBitInit(true, &SharedBitRecTy),
+ FalseBitInit(false, &SharedBitRecTy), StringInitStringPool(Allocator),
+ StringInitCodePool(Allocator), LastRecordID(0) {}
+
+ BumpPtrAllocator Allocator;
+ std::vector<BitsRecTy *> SharedBitsRecTys;
+ BitRecTy SharedBitRecTy;
+ IntRecTy SharedIntRecTy;
+ StringRecTy SharedStringRecTy;
+ DagRecTy SharedDagRecTy;
+
+ RecordRecTy AnyRecord;
+ UnsetInit TheUnsetInit;
+ BitInit TrueBitInit;
+ BitInit FalseBitInit;
+
+ FoldingSet<BitsInit> TheBitsInitPool;
+ std::map<int64_t, IntInit *> TheIntInitPool;
+ StringMap<StringInit *, BumpPtrAllocator &> StringInitStringPool;
+ StringMap<StringInit *, BumpPtrAllocator &> StringInitCodePool;
+ FoldingSet<ListInit> TheListInitPool;
+ FoldingSet<UnOpInit> TheUnOpInitPool;
+ FoldingSet<BinOpInit> TheBinOpInitPool;
+ FoldingSet<TernOpInit> TheTernOpInitPool;
+ FoldingSet<FoldOpInit> TheFoldOpInitPool;
+ FoldingSet<IsAOpInit> TheIsAOpInitPool;
+ DenseMap<std::pair<RecTy *, Init *>, VarInit *> TheVarInitPool;
+ DenseMap<std::pair<TypedInit *, unsigned>, VarBitInit *> TheVarBitInitPool;
+ DenseMap<std::pair<TypedInit *, unsigned>, VarListElementInit *>
+ TheVarListElementInitPool;
+ FoldingSet<VarDefInit> TheVarDefInitPool;
+ DenseMap<std::pair<Init *, StringInit *>, FieldInit *> TheFieldInitPool;
+ FoldingSet<CondOpInit> TheCondOpInitPool;
+ FoldingSet<DagInit> TheDagInitPool;
+
+ unsigned LastRecordID;
+};
+} // namespace detail
+} // namespace llvm
+
+ManagedStatic<detail::RecordContext> Context;
//===----------------------------------------------------------------------===//
// Type implementations
//===----------------------------------------------------------------------===//
-BitRecTy BitRecTy::Shared;
-IntRecTy IntRecTy::Shared;
-StringRecTy StringRecTy::Shared;
-DagRecTy DagRecTy::Shared;
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void RecTy::dump() const { print(errs()); }
#endif
ListRecTy *RecTy::getListTy() {
if (!ListTy)
- ListTy = new(Allocator) ListRecTy(this);
+ ListTy = new(Context->Allocator) ListRecTy(this);
return ListTy;
}
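The RecordContext above replaces a scattering of function-local statics with one lazily constructed, ManagedStatic-owned object. A generic sketch of that pattern (illustrative names, not TableGen code):

#include "llvm/Support/Allocator.h"
#include "llvm/Support/ManagedStatic.h"

namespace {
struct MyContext {
  llvm::BumpPtrAllocator Allocator;
  unsigned LastID = 0;
};
} // namespace

static llvm::ManagedStatic<MyContext> Ctx;

static unsigned nextID() {
  // The context is constructed on first dereference and destroyed by
  // llvm_shutdown(), giving the shared state a well-defined lifetime.
  return Ctx->LastID++;
}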
@@ -69,6 +116,8 @@ bool RecTy::typeIsConvertibleTo(const RecTy *RHS) const {
bool RecTy::typeIsA(const RecTy *RHS) const { return this == RHS; }
+BitRecTy *BitRecTy::get() { return &Context->SharedBitRecTy; }
+
bool BitRecTy::typeIsConvertibleTo(const RecTy *RHS) const{
if (RecTy::typeIsConvertibleTo(RHS) || RHS->getRecTyKind() == IntRecTyKind)
return true;
@@ -78,12 +127,11 @@ bool BitRecTy::typeIsConvertibleTo(const RecTy *RHS) const{
}
BitsRecTy *BitsRecTy::get(unsigned Sz) {
- static std::vector<BitsRecTy*> Shared;
- if (Sz >= Shared.size())
- Shared.resize(Sz + 1);
- BitsRecTy *&Ty = Shared[Sz];
+ if (Sz >= Context->SharedBitsRecTys.size())
+ Context->SharedBitsRecTys.resize(Sz + 1);
+ BitsRecTy *&Ty = Context->SharedBitsRecTys[Sz];
if (!Ty)
- Ty = new(Allocator) BitsRecTy(Sz);
+ Ty = new (Context->Allocator) BitsRecTy(Sz);
return Ty;
}
@@ -104,11 +152,15 @@ bool BitsRecTy::typeIsA(const RecTy *RHS) const {
return false;
}
+IntRecTy *IntRecTy::get() { return &Context->SharedIntRecTy; }
+
bool IntRecTy::typeIsConvertibleTo(const RecTy *RHS) const {
RecTyKind kind = RHS->getRecTyKind();
return kind==BitRecTyKind || kind==BitsRecTyKind || kind==IntRecTyKind;
}
+StringRecTy *StringRecTy::get() { return &Context->SharedStringRecTy; }
+
std::string StringRecTy::getAsString() const {
return "string";
}
@@ -134,6 +186,8 @@ bool ListRecTy::typeIsA(const RecTy *RHS) const {
return false;
}
+DagRecTy *DagRecTy::get() { return &Context->SharedDagRecTy; }
+
std::string DagRecTy::getAsString() const {
return "dag";
}
@@ -146,10 +200,8 @@ static void ProfileRecordRecTy(FoldingSetNodeID &ID,
}
RecordRecTy *RecordRecTy::get(ArrayRef<Record *> UnsortedClasses) {
- if (UnsortedClasses.empty()) {
- static RecordRecTy AnyRecord(0);
- return &AnyRecord;
- }
+ if (UnsortedClasses.empty())
+ return &Context->AnyRecord;
FoldingSet<RecordRecTy> &ThePool =
UnsortedClasses[0]->getRecords().RecordTypePool;
@@ -177,8 +229,8 @@ RecordRecTy *RecordRecTy::get(ArrayRef<Record *> UnsortedClasses) {
}
#endif
- void *Mem = Allocator.Allocate(totalSizeToAlloc<Record *>(Classes.size()),
- alignof(RecordRecTy));
+ void *Mem = Context->Allocator.Allocate(
+ totalSizeToAlloc<Record *>(Classes.size()), alignof(RecordRecTy));
RecordRecTy *Ty = new(Mem) RecordRecTy(Classes.size());
std::uninitialized_copy(Classes.begin(), Classes.end(),
Ty->getTrailingObjects<Record *>());
@@ -283,10 +335,7 @@ void Init::anchor() {}
LLVM_DUMP_METHOD void Init::dump() const { return print(errs()); }
#endif
-UnsetInit *UnsetInit::get() {
- static UnsetInit TheInit;
- return &TheInit;
-}
+UnsetInit *UnsetInit::get() { return &Context->TheUnsetInit; }
Init *UnsetInit::getCastTo(RecTy *Ty) const {
return const_cast<UnsetInit *>(this);
@@ -297,10 +346,7 @@ Init *UnsetInit::convertInitializerTo(RecTy *Ty) const {
}
BitInit *BitInit::get(bool V) {
- static BitInit True(true);
- static BitInit False(false);
-
- return V ? &True : &False;
+ return V ? &Context->TrueBitInit : &Context->FalseBitInit;
}
Init *BitInit::convertInitializerTo(RecTy *Ty) const {
@@ -328,21 +374,19 @@ ProfileBitsInit(FoldingSetNodeID &ID, ArrayRef<Init *> Range) {
}
BitsInit *BitsInit::get(ArrayRef<Init *> Range) {
- static FoldingSet<BitsInit> ThePool;
-
FoldingSetNodeID ID;
ProfileBitsInit(ID, Range);
void *IP = nullptr;
- if (BitsInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ if (BitsInit *I = Context->TheBitsInitPool.FindNodeOrInsertPos(ID, IP))
return I;
- void *Mem = Allocator.Allocate(totalSizeToAlloc<Init *>(Range.size()),
- alignof(BitsInit));
+ void *Mem = Context->Allocator.Allocate(
+ totalSizeToAlloc<Init *>(Range.size()), alignof(BitsInit));
BitsInit *I = new(Mem) BitsInit(Range.size());
std::uninitialized_copy(Range.begin(), Range.end(),
I->getTrailingObjects<Init *>());
- ThePool.InsertNode(I, IP);
+ Context->TheBitsInitPool.InsertNode(I, IP);
return I;
}
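BitsInit::get() above, like the other get() factories in this file, now interns nodes in a context-owned FoldingSet instead of a function-local static; the uniquing idiom itself is unchanged. A generic sketch with an illustrative node type:

#include "llvm/ADT/FoldingSet.h"
#include "llvm/Support/Allocator.h"

namespace {
struct PairNode : llvm::FoldingSetNode {
  int A, B;
  PairNode(int A, int B) : A(A), B(B) {}
  void Profile(llvm::FoldingSetNodeID &ID) const {
    ID.AddInteger(A);
    ID.AddInteger(B);
  }
};
} // namespace

static llvm::FoldingSet<PairNode> Pool;
static llvm::BumpPtrAllocator Alloc;

static PairNode *getPair(int A, int B) {
  llvm::FoldingSetNodeID ID;
  ID.AddInteger(A);
  ID.AddInteger(B);
  void *IP = nullptr;
  if (PairNode *N = Pool.FindNodeOrInsertPos(ID, IP))
    return N;                      // already interned
  PairNode *N = new (Alloc) PairNode(A, B);
  Pool.InsertNode(N, IP);          // record it at the remembered position
  return N;
}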
@@ -446,10 +490,9 @@ Init *BitsInit::resolveReferences(Resolver &R) const {
}
IntInit *IntInit::get(int64_t V) {
- static std::map<int64_t, IntInit*> ThePool;
-
- IntInit *&I = ThePool[V];
- if (!I) I = new(Allocator) IntInit(V);
+ IntInit *&I = Context->TheIntInitPool[V];
+ if (!I)
+ I = new (Context->Allocator) IntInit(V);
return I;
}
@@ -503,7 +546,7 @@ IntInit::convertInitializerBitRange(ArrayRef<unsigned> Bits) const {
}
AnonymousNameInit *AnonymousNameInit::get(unsigned V) {
- return new (Allocator) AnonymousNameInit(V);
+ return new (Context->Allocator) AnonymousNameInit(V);
}
StringInit *AnonymousNameInit::getNameInit() const {
@@ -525,20 +568,12 @@ Init *AnonymousNameInit::resolveReferences(Resolver &R) const {
}
StringInit *StringInit::get(StringRef V, StringFormat Fmt) {
- static StringMap<StringInit*, BumpPtrAllocator &> StringPool(Allocator);
- static StringMap<StringInit*, BumpPtrAllocator &> CodePool(Allocator);
-
- if (Fmt == SF_String) {
- auto &Entry = *StringPool.insert(std::make_pair(V, nullptr)).first;
- if (!Entry.second)
- Entry.second = new (Allocator) StringInit(Entry.getKey(), Fmt);
- return Entry.second;
- } else {
- auto &Entry = *CodePool.insert(std::make_pair(V, nullptr)).first;
- if (!Entry.second)
- Entry.second = new (Allocator) StringInit(Entry.getKey(), Fmt);
- return Entry.second;
- }
+ auto &InitMap = Fmt == SF_String ? Context->StringInitStringPool
+ : Context->StringInitCodePool;
+ auto &Entry = *InitMap.insert(std::make_pair(V, nullptr)).first;
+ if (!Entry.second)
+ Entry.second = new (Context->Allocator) StringInit(Entry.getKey(), Fmt);
+ return Entry.second;
}
Init *StringInit::convertInitializerTo(RecTy *Ty) const {
@@ -559,24 +594,22 @@ static void ProfileListInit(FoldingSetNodeID &ID,
}
ListInit *ListInit::get(ArrayRef<Init *> Range, RecTy *EltTy) {
- static FoldingSet<ListInit> ThePool;
-
FoldingSetNodeID ID;
ProfileListInit(ID, Range, EltTy);
void *IP = nullptr;
- if (ListInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ if (ListInit *I = Context->TheListInitPool.FindNodeOrInsertPos(ID, IP))
return I;
assert(Range.empty() || !isa<TypedInit>(Range[0]) ||
cast<TypedInit>(Range[0])->getType()->typeIsConvertibleTo(EltTy));
- void *Mem = Allocator.Allocate(totalSizeToAlloc<Init *>(Range.size()),
- alignof(ListInit));
- ListInit *I = new(Mem) ListInit(Range.size(), EltTy);
+ void *Mem = Context->Allocator.Allocate(
+ totalSizeToAlloc<Init *>(Range.size()), alignof(ListInit));
+ ListInit *I = new (Mem) ListInit(Range.size(), EltTy);
std::uninitialized_copy(Range.begin(), Range.end(),
I->getTrailingObjects<Init *>());
- ThePool.InsertNode(I, IP);
+ Context->TheListInitPool.InsertNode(I, IP);
return I;
}
@@ -696,17 +729,15 @@ ProfileUnOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *Op, RecTy *Type) {
}
UnOpInit *UnOpInit::get(UnaryOp Opc, Init *LHS, RecTy *Type) {
- static FoldingSet<UnOpInit> ThePool;
-
FoldingSetNodeID ID;
ProfileUnOpInit(ID, Opc, LHS, Type);
void *IP = nullptr;
- if (UnOpInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ if (UnOpInit *I = Context->TheUnOpInitPool.FindNodeOrInsertPos(ID, IP))
return I;
- UnOpInit *I = new(Allocator) UnOpInit(Opc, LHS, Type);
- ThePool.InsertNode(I, IP);
+ UnOpInit *I = new (Context->Allocator) UnOpInit(Opc, LHS, Type);
+ Context->TheUnOpInitPool.InsertNode(I, IP);
return I;
}
@@ -860,19 +891,16 @@ ProfileBinOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *LHS, Init *RHS,
ID.AddPointer(Type);
}
-BinOpInit *BinOpInit::get(BinaryOp Opc, Init *LHS,
- Init *RHS, RecTy *Type) {
- static FoldingSet<BinOpInit> ThePool;
-
+BinOpInit *BinOpInit::get(BinaryOp Opc, Init *LHS, Init *RHS, RecTy *Type) {
FoldingSetNodeID ID;
ProfileBinOpInit(ID, Opc, LHS, RHS, Type);
void *IP = nullptr;
- if (BinOpInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ if (BinOpInit *I = Context->TheBinOpInitPool.FindNodeOrInsertPos(ID, IP))
return I;
- BinOpInit *I = new(Allocator) BinOpInit(Opc, LHS, RHS, Type);
- ThePool.InsertNode(I, IP);
+ BinOpInit *I = new (Context->Allocator) BinOpInit(Opc, LHS, RHS, Type);
+ Context->TheBinOpInitPool.InsertNode(I, IP);
return I;
}
@@ -884,7 +912,7 @@ static StringInit *ConcatStringInits(const StringInit *I0,
const StringInit *I1) {
SmallString<80> Concat(I0->getValue());
Concat.append(I1->getValue());
- return StringInit::get(Concat,
+ return StringInit::get(Concat,
StringInit::determineFormat(I0->getFormat(),
I1->getFormat()));
}
@@ -1189,17 +1217,15 @@ ProfileTernOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *LHS, Init *MHS,
TernOpInit *TernOpInit::get(TernaryOp Opc, Init *LHS, Init *MHS, Init *RHS,
RecTy *Type) {
- static FoldingSet<TernOpInit> ThePool;
-
FoldingSetNodeID ID;
ProfileTernOpInit(ID, Opc, LHS, MHS, RHS, Type);
void *IP = nullptr;
- if (TernOpInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ if (TernOpInit *I = Context->TheTernOpInitPool.FindNodeOrInsertPos(ID, IP))
return I;
- TernOpInit *I = new(Allocator) TernOpInit(Opc, LHS, MHS, RHS, Type);
- ThePool.InsertNode(I, IP);
+ TernOpInit *I = new (Context->Allocator) TernOpInit(Opc, LHS, MHS, RHS, Type);
+ Context->TheTernOpInitPool.InsertNode(I, IP);
return I;
}
@@ -1273,8 +1299,8 @@ static Init *FilterHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type,
if (!Include)
return nullptr;
if (IntInit *IncludeInt = dyn_cast_or_null<IntInit>(
- Include->convertInitializerTo(IntRecTy::get()))) {
- if (IncludeInt->getValue())
+ Include->convertInitializerTo(IntRecTy::get()))) {
+ if (IncludeInt->getValue())
NewList.push_back(Item);
} else {
return nullptr;
@@ -1482,17 +1508,17 @@ static void ProfileFoldOpInit(FoldingSetNodeID &ID, Init *Start, Init *List,
FoldOpInit *FoldOpInit::get(Init *Start, Init *List, Init *A, Init *B,
Init *Expr, RecTy *Type) {
- static FoldingSet<FoldOpInit> ThePool;
FoldingSetNodeID ID;
ProfileFoldOpInit(ID, Start, List, A, B, Expr, Type);
void *IP = nullptr;
- if (FoldOpInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ if (FoldOpInit *I = Context->TheFoldOpInitPool.FindNodeOrInsertPos(ID, IP))
return I;
- FoldOpInit *I = new (Allocator) FoldOpInit(Start, List, A, B, Expr, Type);
- ThePool.InsertNode(I, IP);
+ FoldOpInit *I =
+ new (Context->Allocator) FoldOpInit(Start, List, A, B, Expr, Type);
+ Context->TheFoldOpInitPool.InsertNode(I, IP);
return I;
}
@@ -1547,17 +1573,16 @@ static void ProfileIsAOpInit(FoldingSetNodeID &ID, RecTy *CheckType,
}
IsAOpInit *IsAOpInit::get(RecTy *CheckType, Init *Expr) {
- static FoldingSet<IsAOpInit> ThePool;
FoldingSetNodeID ID;
ProfileIsAOpInit(ID, CheckType, Expr);
void *IP = nullptr;
- if (IsAOpInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ if (IsAOpInit *I = Context->TheIsAOpInitPool.FindNodeOrInsertPos(ID, IP))
return I;
- IsAOpInit *I = new (Allocator) IsAOpInit(CheckType, Expr);
- ThePool.InsertNode(I, IP);
+ IsAOpInit *I = new (Context->Allocator) IsAOpInit(CheckType, Expr);
+ Context->TheIsAOpInitPool.InsertNode(I, IP);
return I;
}
@@ -1680,14 +1705,9 @@ VarInit *VarInit::get(StringRef VN, RecTy *T) {
}
VarInit *VarInit::get(Init *VN, RecTy *T) {
- using Key = std::pair<RecTy *, Init *>;
- static DenseMap<Key, VarInit*> ThePool;
-
- Key TheKey(std::make_pair(T, VN));
-
- VarInit *&I = ThePool[TheKey];
+ VarInit *&I = Context->TheVarInitPool[std::make_pair(T, VN)];
if (!I)
- I = new(Allocator) VarInit(VN, T);
+ I = new (Context->Allocator) VarInit(VN, T);
return I;
}
@@ -1709,14 +1729,9 @@ Init *VarInit::resolveReferences(Resolver &R) const {
}
VarBitInit *VarBitInit::get(TypedInit *T, unsigned B) {
- using Key = std::pair<TypedInit *, unsigned>;
- static DenseMap<Key, VarBitInit*> ThePool;
-
- Key TheKey(std::make_pair(T, B));
-
- VarBitInit *&I = ThePool[TheKey];
+ VarBitInit *&I = Context->TheVarBitInitPool[std::make_pair(T, B)];
if (!I)
- I = new(Allocator) VarBitInit(T, B);
+ I = new(Context->Allocator) VarBitInit(T, B);
return I;
}
@@ -1732,15 +1747,11 @@ Init *VarBitInit::resolveReferences(Resolver &R) const {
return const_cast<VarBitInit*>(this);
}
-VarListElementInit *VarListElementInit::get(TypedInit *T,
- unsigned E) {
- using Key = std::pair<TypedInit *, unsigned>;
- static DenseMap<Key, VarListElementInit*> ThePool;
-
- Key TheKey(std::make_pair(T, E));
-
- VarListElementInit *&I = ThePool[TheKey];
- if (!I) I = new(Allocator) VarListElementInit(T, E);
+VarListElementInit *VarListElementInit::get(TypedInit *T, unsigned E) {
+ VarListElementInit *&I =
+ Context->TheVarListElementInitPool[std::make_pair(T, E)];
+ if (!I)
+ I = new (Context->Allocator) VarListElementInit(T, E);
return I;
}
@@ -1800,21 +1811,19 @@ static void ProfileVarDefInit(FoldingSetNodeID &ID,
}
VarDefInit *VarDefInit::get(Record *Class, ArrayRef<Init *> Args) {
- static FoldingSet<VarDefInit> ThePool;
-
FoldingSetNodeID ID;
ProfileVarDefInit(ID, Class, Args);
void *IP = nullptr;
- if (VarDefInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ if (VarDefInit *I = Context->TheVarDefInitPool.FindNodeOrInsertPos(ID, IP))
return I;
- void *Mem = Allocator.Allocate(totalSizeToAlloc<Init *>(Args.size()),
- alignof(VarDefInit));
- VarDefInit *I = new(Mem) VarDefInit(Class, Args.size());
+ void *Mem = Context->Allocator.Allocate(totalSizeToAlloc<Init *>(Args.size()),
+ alignof(VarDefInit));
+ VarDefInit *I = new (Mem) VarDefInit(Class, Args.size());
std::uninitialized_copy(Args.begin(), Args.end(),
I->getTrailingObjects<Init *>());
- ThePool.InsertNode(I, IP);
+ Context->TheVarDefInitPool.InsertNode(I, IP);
return I;
}
@@ -1920,13 +1929,9 @@ std::string VarDefInit::getAsString() const {
}
FieldInit *FieldInit::get(Init *R, StringInit *FN) {
- using Key = std::pair<Init *, StringInit *>;
- static DenseMap<Key, FieldInit*> ThePool;
-
- Key TheKey(std::make_pair(R, FN));
-
- FieldInit *&I = ThePool[TheKey];
- if (!I) I = new(Allocator) FieldInit(R, FN);
+ FieldInit *&I = Context->TheFieldInitPool[std::make_pair(R, FN)];
+ if (!I)
+ I = new (Context->Allocator) FieldInit(R, FN);
return I;
}
@@ -1995,23 +2000,22 @@ CondOpInit::get(ArrayRef<Init *> CondRange,
assert(CondRange.size() == ValRange.size() &&
"Number of conditions and values must match!");
- static FoldingSet<CondOpInit> ThePool;
FoldingSetNodeID ID;
ProfileCondOpInit(ID, CondRange, ValRange, Ty);
void *IP = nullptr;
- if (CondOpInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ if (CondOpInit *I = Context->TheCondOpInitPool.FindNodeOrInsertPos(ID, IP))
return I;
- void *Mem = Allocator.Allocate(totalSizeToAlloc<Init *>(2*CondRange.size()),
- alignof(BitsInit));
+ void *Mem = Context->Allocator.Allocate(
+ totalSizeToAlloc<Init *>(2 * CondRange.size()), alignof(BitsInit));
CondOpInit *I = new(Mem) CondOpInit(CondRange.size(), Ty);
std::uninitialized_copy(CondRange.begin(), CondRange.end(),
I->getTrailingObjects<Init *>());
std::uninitialized_copy(ValRange.begin(), ValRange.end(),
I->getTrailingObjects<Init *>()+CondRange.size());
- ThePool.InsertNode(I, IP);
+ Context->TheCondOpInitPool.InsertNode(I, IP);
return I;
}
@@ -2113,25 +2117,24 @@ static void ProfileDagInit(FoldingSetNodeID &ID, Init *V, StringInit *VN,
assert(Name == NameRange.end() && "Arg name overflow!");
}
-DagInit *
-DagInit::get(Init *V, StringInit *VN, ArrayRef<Init *> ArgRange,
- ArrayRef<StringInit *> NameRange) {
- static FoldingSet<DagInit> ThePool;
-
+DagInit *DagInit::get(Init *V, StringInit *VN, ArrayRef<Init *> ArgRange,
+ ArrayRef<StringInit *> NameRange) {
FoldingSetNodeID ID;
ProfileDagInit(ID, V, VN, ArgRange, NameRange);
void *IP = nullptr;
- if (DagInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ if (DagInit *I = Context->TheDagInitPool.FindNodeOrInsertPos(ID, IP))
return I;
- void *Mem = Allocator.Allocate(totalSizeToAlloc<Init *, StringInit *>(ArgRange.size(), NameRange.size()), alignof(BitsInit));
- DagInit *I = new(Mem) DagInit(V, VN, ArgRange.size(), NameRange.size());
+ void *Mem = Context->Allocator.Allocate(
+ totalSizeToAlloc<Init *, StringInit *>(ArgRange.size(), NameRange.size()),
+ alignof(BitsInit));
+ DagInit *I = new (Mem) DagInit(V, VN, ArgRange.size(), NameRange.size());
std::uninitialized_copy(ArgRange.begin(), ArgRange.end(),
I->getTrailingObjects<Init *>());
std::uninitialized_copy(NameRange.begin(), NameRange.end(),
I->getTrailingObjects<StringInit *>());
- ThePool.InsertNode(I, IP);
+ Context->TheDagInitPool.InsertNode(I, IP);
return I;
}
@@ -2301,8 +2304,6 @@ void RecordVal::print(raw_ostream &OS, bool PrintSem) const {
if (PrintSem) OS << ";\n";
}
-unsigned Record::LastID = 0;
-
void Record::checkName() {
// Ensure the record name has string type.
const TypedInit *TypedName = cast<const TypedInit>(Name);
@@ -2319,10 +2320,12 @@ RecordRecTy *Record::getType() {
DefInit *Record::getDefInit() {
if (!CorrespondingDefInit)
- CorrespondingDefInit = new (Allocator) DefInit(this);
+ CorrespondingDefInit = new (Context->Allocator) DefInit(this);
return CorrespondingDefInit;
}
+unsigned Record::getNewUID() { return Context->LastRecordID++; }
+
void Record::setName(Init *NewName) {
Name = NewName;
checkName();
@@ -2501,7 +2504,7 @@ BitsInit *Record::getValueAsBitsInit(StringRef FieldName) const {
if (BitsInit *BI = dyn_cast<BitsInit>(R->getValue()))
return BI;
- PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName +
+ PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName +
"' exists but does not have a bits value");
}
@@ -2513,7 +2516,7 @@ ListInit *Record::getValueAsListInit(StringRef FieldName) const {
if (ListInit *LI = dyn_cast<ListInit>(R->getValue()))
return LI;
- PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName +
+ PrintFatalError(getLoc(), "Record `" + getName() + "', field `" + FieldName +
"' exists but does not have a list value");
}
@@ -2653,13 +2656,23 @@ void Record::checkRecordAssertions() {
RecordResolver R(*this);
R.setFinal(true);
- for (auto Assertion : getAssertions()) {
+ for (const auto &Assertion : getAssertions()) {
Init *Condition = Assertion.Condition->resolveReferences(R);
Init *Message = Assertion.Message->resolveReferences(R);
CheckAssert(Assertion.Loc, Condition, Message);
}
}
+// Report a warning if the record has unused template arguments.
+void Record::checkUnusedTemplateArgs() {
+ for (const Init *TA : getTemplateArgs()) {
+ const RecordVal *Arg = getValue(TA);
+ if (!Arg->isUsed())
+ PrintWarning(Arg->getLoc(),
+ "unused template argument: " + Twine(Arg->getName()));
+ }
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void RecordKeeper::dump() const { errs() << *this; }
#endif
diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp
index ed7963031b24..6ccca4d69f40 100644
--- a/llvm/lib/TableGen/TGParser.cpp
+++ b/llvm/lib/TableGen/TGParser.cpp
@@ -874,8 +874,9 @@ Init *TGParser::ParseIDValue(Record *CurRec, StringInit *Name, SMLoc NameLoc,
Record *TemplateRec = CurMultiClass ? &CurMultiClass->Rec : CurRec;
if (TemplateRec->isTemplateArg(TemplateArgName)) {
- const RecordVal *RV = TemplateRec->getValue(TemplateArgName);
+ RecordVal *RV = TemplateRec->getValue(TemplateArgName);
assert(RV && "Template arg doesn't exist??");
+ RV->setUsed(true);
return VarInit::get(TemplateArgName, RV->getType());
} else if (Name->getValue() == "NAME") {
return VarInit::get(TemplateArgName, StringRecTy::get());
@@ -3346,7 +3347,12 @@ bool TGParser::ParseClass() {
if (ParseTemplateArgList(CurRec))
return true;
- return ParseObjectBody(CurRec);
+ if (ParseObjectBody(CurRec))
+ return true;
+
+ if (!NoWarnOnUnusedTemplateArgs)
+ CurRec->checkUnusedTemplateArgs();
+ return false;
}
/// ParseLetList - Parse a non-empty list of assignment expressions into a list
@@ -3541,6 +3547,9 @@ bool TGParser::ParseMultiClass() {
PopLocalScope(MulticlassScope);
}
+ if (!NoWarnOnUnusedTemplateArgs)
+ CurMultiClass->Rec.checkUnusedTemplateArgs();
+
CurMultiClass = nullptr;
return false;
}
diff --git a/llvm/lib/TableGen/TGParser.h b/llvm/lib/TableGen/TGParser.h
index 6e3c5186e4f6..00883c858d58 100644
--- a/llvm/lib/TableGen/TGParser.h
+++ b/llvm/lib/TableGen/TGParser.h
@@ -160,10 +160,13 @@ class TGParser {
// exist.
};
+ bool NoWarnOnUnusedTemplateArgs = false;
+
public:
- TGParser(SourceMgr &SM, ArrayRef<std::string> Macros,
- RecordKeeper &records)
- : Lex(SM, Macros), CurMultiClass(nullptr), Records(records) {}
+ TGParser(SourceMgr &SM, ArrayRef<std::string> Macros, RecordKeeper &records,
+ const bool NoWarnOnUnusedTemplateArgs = false)
+ : Lex(SM, Macros), CurMultiClass(nullptr), Records(records),
+ NoWarnOnUnusedTemplateArgs(NoWarnOnUnusedTemplateArgs) {}
/// ParseFile - Main entrypoint for parsing a tblgen file. These parser
/// routines return true on error, or false on success.
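The new constructor parameter above is what llvm/lib/TableGen/Main.cpp wires the -no-warn-on-unused-template-args option into. A small sketch of driving the parser directly with the warning suppressed (TGParser.h is a private header under llvm/lib/TableGen, shown purely for illustration):

#include "TGParser.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/TableGen/Record.h"

static bool parseWithoutUnusedArgWarnings(llvm::SourceMgr &SrcMgr,
                                          llvm::RecordKeeper &Records) {
  llvm::TGParser Parser(SrcMgr, /*Macros=*/{}, Records,
                        /*NoWarnOnUnusedTemplateArgs=*/true);
  return Parser.ParseFile();   // true on error, false on success
}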
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index 658d44771e8d..b0dd30c13137 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -51,6 +51,7 @@ FunctionPass *createAArch64A53Fix835769();
FunctionPass *createFalkorHWPFFixPass();
FunctionPass *createFalkorMarkStridedAccessesPass();
FunctionPass *createAArch64BranchTargetsPass();
+FunctionPass *createAArch64MIPeepholeOptPass();
FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
@@ -82,6 +83,7 @@ void initializeAArch64SLSHardeningPass(PassRegistry&);
void initializeAArch64SpeculationHardeningPass(PassRegistry&);
void initializeAArch64LoadStoreOptPass(PassRegistry&);
void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);
+void initializeAArch64MIPeepholeOptPass(PassRegistry &);
void initializeAArch64SIMDInstrOptPass(PassRegistry&);
void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &);
void initializeAArch64PreLegalizerCombinerPass(PassRegistry&);
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index d8dd9d1b2f91..548e4e0c9389 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -61,6 +61,9 @@ def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true",
"Enable ARMv8.1 Large System Extension (LSE) atomic instructions">;
+def FeatureLSE2 : SubtargetFeature<"lse2", "HasLSE2", "true",
+ "Enable ARMv8.4 Large System Extension 2 (LSE2) atomicity rules">;
+
def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics", "true",
"Enable out of line atomics to support LSE instructions">;
@@ -126,8 +129,12 @@ def FeatureExperimentalZeroingPseudos
"merged with destructive operations",
[]>;
+def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl",
+ "UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">;
+
def FeatureSVE2 : SubtargetFeature<"sve2", "HasSVE2", "true",
- "Enable Scalable Vector Extension 2 (SVE2) instructions", [FeatureSVE]>;
+ "Enable Scalable Vector Extension 2 (SVE2) instructions",
+ [FeatureSVE, FeatureUseScalarIncVL]>;
def FeatureSVE2AES : SubtargetFeature<"sve2-aes", "HasSVE2AES", "true",
"Enable AES SVE2 instructions", [FeatureSVE2, FeatureAES]>;
@@ -309,10 +316,6 @@ def FeatureSEL2 : SubtargetFeature<
"sel2", "HasSEL2", "true",
"Enable v8.4-A Secure Exception Level 2 extension">;
-def FeaturePMU : SubtargetFeature<
- "pmu", "HasPMU", "true",
- "Enable v8.4-A PMU extension">;
-
def FeatureTLB_RMI : SubtargetFeature<
"tlb-rmi", "HasTLB_RMI", "true",
"Enable v8.4-A TLB Range and Maintenance Instructions">;
@@ -429,10 +432,13 @@ def FeatureEnhancedCounterVirtualization :
def FeatureRME : SubtargetFeature<"rme", "HasRME",
"true", "Enable Realm Management Extension">;
-// FIXME: SME should only imply the subset of SVE(2) instructions that are
-// legal in streaming mode.
+// A subset of SVE(2) instructions are legal in Streaming SVE execution mode
+// defined by SME.
+def FeatureStreamingSVE : SubtargetFeature<"streaming-sve",
+ "HasStreamingSVE", "true",
+ "Enable subset of SVE(2) instructions for Streaming SVE execution mode">;
def FeatureSME : SubtargetFeature<"sme", "HasSME", "true",
- "Enable Scalable Matrix Extension (SME)", [FeatureSVE2, FeatureBF16]>;
+ "Enable Scalable Matrix Extension (SME)", [FeatureStreamingSVE, FeatureBF16]>;
def FeatureSMEF64 : SubtargetFeature<"sme-f64", "HasSMEF64", "true",
"Enable Scalable Matrix Extension (SME) F64F64 instructions", [FeatureSME]>;
@@ -440,13 +446,24 @@ def FeatureSMEF64 : SubtargetFeature<"sme-f64", "HasSMEF64", "true",
def FeatureSMEI64 : SubtargetFeature<"sme-i64", "HasSMEI64", "true",
"Enable Scalable Matrix Extension (SME) I16I64 instructions", [FeatureSME]>;
+def FeatureAppleA7SysReg : SubtargetFeature<"apple-a7-sysreg", "HasAppleA7SysReg", "true",
+ "Apple A7 (the CPU formerly known as Cyclone)">;
+
+def FeatureEL2VMSA : SubtargetFeature<"el2vmsa", "HasEL2VMSA", "true",
+ "Enable Exception Level 2 Virtual Memory System Architecture">;
+
+def FeatureEL3 : SubtargetFeature<"el3", "HasEL3", "true",
+ "Enable Exception Level 3">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
+def HasV8_0aOps : SubtargetFeature<"v8a", "HasV8_0aOps", "true",
+ "Support ARM v8.0a instructions", [FeatureEL2VMSA, FeatureEL3]>;
def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
- "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE, FeatureRDM,
- FeaturePAN, FeatureLOR, FeatureVH]>;
+ "Support ARM v8.1a instructions", [HasV8_0aOps, FeatureCRC, FeatureLSE,
+ FeatureRDM, FeaturePAN, FeatureLOR, FeatureVH]>;
def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
"Support ARM v8.2a instructions", [HasV8_1aOps, FeaturePsUAO,
@@ -459,8 +476,8 @@ def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true",
def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true",
"Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd,
FeatureNV, FeatureMPAM, FeatureDIT,
- FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeaturePMU, FeatureTLB_RMI,
- FeatureFlagM, FeatureRCPC_IMMO]>;
+ FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeatureTLB_RMI,
+ FeatureFlagM, FeatureRCPC_IMMO, FeatureLSE2]>;
def HasV8_5aOps : SubtargetFeature<
"v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions",
@@ -477,6 +494,18 @@ def HasV8_7aOps : SubtargetFeature<
"v8.7a", "HasV8_7aOps", "true", "Support ARM v8.7a instructions",
[HasV8_6aOps, FeatureXS, FeatureWFxT, FeatureHCX]>;
+def HasV9_0aOps : SubtargetFeature<
+ "v9a", "HasV9_0aOps", "true", "Support ARM v9a instructions",
+ [HasV8_5aOps, FeatureSVE2]>;
+
+def HasV9_1aOps : SubtargetFeature<
+ "v9.1a", "HasV9_1aOps", "true", "Support ARM v9.1a instructions",
+ [HasV8_6aOps, HasV9_0aOps]>;
+
+def HasV9_2aOps : SubtargetFeature<
+ "v9.2a", "HasV9_2aOps", "true", "Support ARM v9.2a instructions",
+ [HasV8_7aOps, HasV9_1aOps]>;
+
def HasV8_0rOps : SubtargetFeature<
"v8r", "HasV8_0rOps", "true", "Support ARM v8r instructions",
[//v8.1
@@ -553,7 +582,7 @@ class AArch64Unsupported { list<Predicate> F; }
def SVEUnsupported : AArch64Unsupported {
let F = [HasSVE, HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3,
- HasSVE2BitPerm];
+ HasSVE2BitPerm, HasSVEorStreamingSVE, HasSVE2orStreamingSVE];
}
def PAUnsupported : AArch64Unsupported {
@@ -579,660 +608,553 @@ include "AArch64SchedA64FX.td"
include "AArch64SchedThunderX3T110.td"
include "AArch64SchedTSV110.td"
-def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
- "Cortex-A35 ARM processors", [
- FeatureCRC,
- FeatureCrypto,
- FeatureFPARMv8,
- FeatureNEON,
- FeaturePerfMon
- ]>;
+def TuneA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
+ "Cortex-A35 ARM processors">;
-def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
+def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
"Cortex-A53 ARM processors", [
+ FeatureFuseAES,
FeatureBalanceFPOps,
- FeatureCRC,
- FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
- FeatureFPARMv8,
- FeatureFuseAES,
- FeatureNEON,
- FeaturePerfMon,
- FeaturePostRAScheduler,
- ]>;
+ FeaturePostRAScheduler]>;
-def ProcA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55",
+def TuneA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55",
"Cortex-A55 ARM processors", [
- HasV8_2aOps,
- FeatureCrypto,
- FeatureFPARMv8,
FeatureFuseAES,
- FeatureNEON,
- FeatureFullFP16,
- FeatureDotProd,
- FeatureRCPC,
- FeaturePerfMon,
FeaturePostRAScheduler,
+ FeatureFuseAddress]>;
+
+def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510",
+ "Cortex-A510 ARM processors", [
+ FeatureFuseAES,
+ FeaturePostRAScheduler
]>;
-def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
+def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
"Cortex-A57 ARM processors", [
+ FeatureFuseAES,
FeatureBalanceFPOps,
- FeatureCRC,
- FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
- FeatureFPARMv8,
- FeatureFuseAES,
FeatureFuseLiterals,
- FeatureNEON,
- FeaturePerfMon,
FeaturePostRAScheduler,
- FeaturePredictableSelectIsExpensive
- ]>;
+ FeaturePredictableSelectIsExpensive]>;
-def ProcA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65",
+def TuneA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65",
"Cortex-A65 ARM processors", [
- HasV8_2aOps,
- FeatureCrypto,
- FeatureDotProd,
- FeatureFPARMv8,
- FeatureFullFP16,
- FeatureFuseAddress,
FeatureFuseAES,
- FeatureFuseLiterals,
- FeatureNEON,
- FeatureRAS,
- FeatureRCPC,
- FeatureSSBS,
- ]>;
+ FeatureFuseAddress,
+ FeatureFuseLiterals]>;
-def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
+def TuneA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
"Cortex-A72 ARM processors", [
- FeatureCRC,
- FeatureCrypto,
- FeatureFPARMv8,
FeatureFuseAES,
- FeatureFuseLiterals,
- FeatureNEON,
- FeaturePerfMon
- ]>;
+ FeatureFuseLiterals]>;
-def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
+def TuneA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
"Cortex-A73 ARM processors", [
- FeatureCRC,
- FeatureCrypto,
- FeatureFPARMv8,
- FeatureFuseAES,
- FeatureNEON,
- FeaturePerfMon
- ]>;
+ FeatureFuseAES]>;
-def ProcA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75",
+def TuneA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75",
"Cortex-A75 ARM processors", [
- HasV8_2aOps,
- FeatureCrypto,
- FeatureFPARMv8,
- FeatureFuseAES,
- FeatureNEON,
- FeatureFullFP16,
- FeatureDotProd,
- FeatureRCPC,
- FeaturePerfMon
- ]>;
+ FeatureFuseAES]>;
-def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
+def TuneA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
"Cortex-A76 ARM processors", [
- HasV8_2aOps,
- FeatureFPARMv8,
- FeatureFuseAES,
- FeatureNEON,
- FeatureRCPC,
- FeatureCrypto,
- FeatureFullFP16,
- FeatureDotProd,
- FeatureSSBS
- ]>;
-
-def ProcA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
+ FeatureFuseAES]>;
+
+def TuneA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
"Cortex-A77 ARM processors", [
- HasV8_2aOps,
- FeatureCmpBccFusion,
- FeatureFPARMv8,
- FeatureFuseAES,
- FeatureNEON, FeatureRCPC,
- FeatureCrypto,
- FeatureFullFP16,
- FeatureDotProd
- ]>;
-
-def ProcA78 : SubtargetFeature<"cortex-a78", "ARMProcFamily",
- "CortexA78",
+ FeatureCmpBccFusion,
+ FeatureFuseAES]>;
+
+def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78",
"Cortex-A78 ARM processors", [
- HasV8_2aOps,
FeatureCmpBccFusion,
- FeatureCrypto,
- FeatureFPARMv8,
FeatureFuseAES,
- FeatureNEON,
- FeatureRCPC,
- FeaturePerfMon,
- FeaturePostRAScheduler,
- FeatureSPE,
- FeatureFullFP16,
- FeatureSSBS,
- FeatureDotProd]>;
-
-def ProcA78C : SubtargetFeature<"cortex-a78c", "ARMProcFamily",
+ FeaturePostRAScheduler]>;
+
+def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily",
"CortexA78C",
"Cortex-A78C ARM processors", [
- HasV8_2aOps,
FeatureCmpBccFusion,
- FeatureCrypto,
- FeatureDotProd,
- FeatureFlagM,
- FeatureFP16FML,
- FeatureFPARMv8,
- FeatureFullFP16,
FeatureFuseAES,
- FeatureNEON,
- FeaturePAuth,
- FeaturePerfMon,
- FeaturePostRAScheduler,
- FeatureRCPC,
- FeatureSPE,
- FeatureSSBS]>;
-
-def ProcR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily",
+ FeaturePostRAScheduler]>;
+
+def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710",
+ "Cortex-A710 ARM processors", [
+ FeatureFuseAES,
+ FeaturePostRAScheduler,
+ FeatureCmpBccFusion]>;
+
+def TuneR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily",
"CortexR82",
- "Cortex-R82 ARM Processors", [
- FeaturePostRAScheduler,
- // All other features are implied by v8_0r ops:
- HasV8_0rOps,
- ]>;
+ "Cortex-R82 ARM processors", [
+ FeaturePostRAScheduler]>;
-def ProcX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
+def TuneX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
"Cortex-X1 ARM processors", [
- HasV8_2aOps,
FeatureCmpBccFusion,
- FeatureCrypto,
- FeatureFPARMv8,
FeatureFuseAES,
- FeatureNEON,
- FeatureRCPC,
- FeaturePerfMon,
+ FeaturePostRAScheduler]>;
+
+def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2",
+ "Cortex-X2 ARM processors", [
+ FeatureFuseAES,
FeaturePostRAScheduler,
- FeatureSPE,
- FeatureFullFP16,
- FeatureDotProd]>;
+ FeatureCmpBccFusion]>;
-def ProcA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
+def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
"Fujitsu A64FX processors", [
- HasV8_2aOps,
- FeatureFPARMv8,
- FeatureNEON,
- FeatureSHA2,
- FeaturePerfMon,
- FeatureFullFP16,
- FeatureSVE,
- FeaturePostRAScheduler,
- FeatureComplxNum,
- FeatureAggressiveFMA,
- FeatureArithmeticBccFusion,
- FeaturePredictableSelectIsExpensive
- ]>;
-
-def ProcCarmel : SubtargetFeature<"carmel", "ARMProcFamily", "Carmel",
- "Nvidia Carmel processors", [
- HasV8_2aOps,
- FeatureNEON,
- FeatureCrypto,
- FeatureFullFP16
- ]>;
+ FeaturePostRAScheduler,
+ FeatureAggressiveFMA,
+ FeatureArithmeticBccFusion,
+ FeaturePredictableSelectIsExpensive
+ ]>;
+
+def TuneCarmel : SubtargetFeature<"carmel", "ARMProcFamily", "Carmel",
+ "Nvidia Carmel processors">;
// Note that cyclone does not fuse AES instructions, but newer apple chips do
// perform the fusion and cyclone is used by default when targeting apple OSes.
-def ProcAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
- "Apple A7 (the CPU formerly known as Cyclone)", [
- FeatureAlternateSExtLoadCVTF32Pattern,
- FeatureArithmeticBccFusion,
- FeatureArithmeticCbzFusion,
- FeatureCrypto,
- FeatureDisableLatencySchedHeuristic,
- FeatureFPARMv8,
- FeatureFuseAES,
- FeatureFuseCryptoEOR,
- FeatureNEON,
- FeaturePerfMon,
- FeatureZCRegMove,
- FeatureZCZeroing,
- FeatureZCZeroingFPWorkaround
- ]>;
+def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
+ "Apple A7 (the CPU formerly known as Cyclone)", [
+ FeatureAlternateSExtLoadCVTF32Pattern,
+ FeatureArithmeticBccFusion,
+ FeatureArithmeticCbzFusion,
+ FeatureDisableLatencySchedHeuristic,
+ FeatureFuseAES, FeatureFuseCryptoEOR,
+ FeatureZCRegMove,
+ FeatureZCZeroing,
+ FeatureZCZeroingFPWorkaround]
+ >;
-def ProcAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
+def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
"Apple A10", [
FeatureAlternateSExtLoadCVTF32Pattern,
FeatureArithmeticBccFusion,
FeatureArithmeticCbzFusion,
- FeatureCrypto,
FeatureDisableLatencySchedHeuristic,
- FeatureFPARMv8,
FeatureFuseAES,
FeatureFuseCryptoEOR,
- FeatureNEON,
- FeaturePerfMon,
FeatureZCRegMove,
- FeatureZCZeroing,
- FeatureCRC,
- FeatureRDM,
- FeaturePAN,
- FeatureLOR,
- FeatureVH,
- ]>;
-
-def ProcAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
+ FeatureZCZeroing]
+ >;
+
+def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
"Apple A11", [
FeatureAlternateSExtLoadCVTF32Pattern,
FeatureArithmeticBccFusion,
FeatureArithmeticCbzFusion,
- FeatureCrypto,
FeatureDisableLatencySchedHeuristic,
- FeatureFPARMv8,
FeatureFuseAES,
FeatureFuseCryptoEOR,
- FeatureNEON,
- FeaturePerfMon,
FeatureZCRegMove,
- FeatureZCZeroing,
- FeatureFullFP16,
- HasV8_2aOps
- ]>;
+ FeatureZCZeroing]
+ >;
-def ProcAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
+def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
"Apple A12", [
FeatureAlternateSExtLoadCVTF32Pattern,
FeatureArithmeticBccFusion,
FeatureArithmeticCbzFusion,
- FeatureCrypto,
FeatureDisableLatencySchedHeuristic,
- FeatureFPARMv8,
FeatureFuseAES,
FeatureFuseCryptoEOR,
- FeatureNEON,
- FeaturePerfMon,
FeatureZCRegMove,
- FeatureZCZeroing,
- FeatureFullFP16,
- HasV8_3aOps
- ]>;
-
-def ProcAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
- "Apple A13", [
- FeatureAlternateSExtLoadCVTF32Pattern,
- FeatureArithmeticBccFusion,
- FeatureArithmeticCbzFusion,
- FeatureCrypto,
- FeatureDisableLatencySchedHeuristic,
- FeatureFPARMv8,
- FeatureFuseAES,
- FeatureFuseCryptoEOR,
- FeatureNEON,
- FeaturePerfMon,
- FeatureZCRegMove,
- FeatureZCZeroing,
- FeatureFullFP16,
- FeatureFP16FML,
- FeatureSHA3,
- HasV8_4aOps
- ]>;
-
-def ProcAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
- "Apple A14", [
- FeatureAggressiveFMA,
- FeatureAlternateSExtLoadCVTF32Pattern,
- FeatureAltFPCmp,
- FeatureArithmeticBccFusion,
- FeatureArithmeticCbzFusion,
- FeatureCrypto,
- FeatureDisableLatencySchedHeuristic,
- FeatureFPARMv8,
- FeatureFRInt3264,
- FeatureFuseAddress,
- FeatureFuseAES,
- FeatureFuseArithmeticLogic,
- FeatureFuseCCSelect,
- FeatureFuseCryptoEOR,
- FeatureFuseLiterals,
- FeatureNEON,
- FeaturePerfMon,
- FeatureSpecRestrict,
- FeatureSSBS,
- FeatureSB,
- FeaturePredRes,
- FeatureCacheDeepPersist,
- FeatureZCRegMove,
- FeatureZCZeroing,
- FeatureFullFP16,
- FeatureFP16FML,
- FeatureSHA3,
- HasV8_4aOps
- ]>;
-
-def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
+ FeatureZCZeroing]
+ >;
+
+def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
+ "Apple A13", [
+ FeatureAlternateSExtLoadCVTF32Pattern,
+ FeatureArithmeticBccFusion,
+ FeatureArithmeticCbzFusion,
+ FeatureDisableLatencySchedHeuristic,
+ FeatureFuseAES,
+ FeatureFuseCryptoEOR,
+ FeatureZCRegMove,
+ FeatureZCZeroing]
+ >;
+
+def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
+ "Apple A14", [
+ FeatureAggressiveFMA,
+ FeatureAlternateSExtLoadCVTF32Pattern,
+ FeatureArithmeticBccFusion,
+ FeatureArithmeticCbzFusion,
+ FeatureDisableLatencySchedHeuristic,
+ FeatureFuseAddress,
+ FeatureFuseAES,
+ FeatureFuseArithmeticLogic,
+ FeatureFuseCCSelect,
+ FeatureFuseCryptoEOR,
+ FeatureFuseLiterals,
+ FeatureZCRegMove,
+ FeatureZCZeroing]>;
+
+def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
"Samsung Exynos-M3 processors",
- [FeatureCRC,
- FeatureCrypto,
- FeatureExynosCheapAsMoveHandling,
+ [FeatureExynosCheapAsMoveHandling,
FeatureForce32BitJumpTables,
FeatureFuseAddress,
FeatureFuseAES,
FeatureFuseCCSelect,
FeatureFuseLiterals,
FeatureLSLFast,
- FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive]>;
-def ProcExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
- "Samsung Exynos-M4 processors",
- [HasV8_2aOps,
- FeatureArithmeticBccFusion,
+def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
+                                     "Samsung Exynos-M4 processors",
+ [FeatureArithmeticBccFusion,
FeatureArithmeticCbzFusion,
- FeatureCrypto,
- FeatureDotProd,
FeatureExynosCheapAsMoveHandling,
FeatureForce32BitJumpTables,
- FeatureFullFP16,
FeatureFuseAddress,
FeatureFuseAES,
FeatureFuseArithmeticLogic,
FeatureFuseCCSelect,
FeatureFuseLiterals,
FeatureLSLFast,
- FeaturePerfMon,
FeaturePostRAScheduler,
FeatureZCZeroing]>;
-def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
+def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
"Qualcomm Kryo processors", [
- FeatureCRC,
- FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
- FeatureFPARMv8,
- FeatureNEON,
- FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
- FeatureLSLFast
- ]>;
+ FeatureLSLFast]
+ >;
-def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
+def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
"Qualcomm Falkor processors", [
- FeatureCRC,
- FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
- FeatureFPARMv8,
- FeatureNEON,
- FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
- FeatureRDM,
FeatureZCZeroing,
FeatureLSLFast,
FeatureSlowSTRQro
]>;
-def ProcNeoverseE1 : SubtargetFeature<"neoversee1", "ARMProcFamily",
- "NeoverseE1",
+def TuneNeoverseE1 : SubtargetFeature<"neoversee1", "ARMProcFamily", "NeoverseE1",
"Neoverse E1 ARM processors", [
- HasV8_2aOps,
- FeatureCrypto,
- FeatureDotProd,
- FeatureFPARMv8,
- FeatureFullFP16,
- FeatureNEON,
- FeatureRCPC,
- FeatureSSBS,
FeaturePostRAScheduler,
- FeatureFuseAES,
+ FeatureFuseAES
]>;
-def ProcNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily",
- "NeoverseN1",
+def TuneNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", "NeoverseN1",
"Neoverse N1 ARM processors", [
- HasV8_2aOps,
- FeatureCrypto,
- FeatureDotProd,
- FeatureFPARMv8,
- FeatureFullFP16,
- FeatureNEON,
- FeatureRCPC,
- FeatureSPE,
- FeatureSSBS,
FeaturePostRAScheduler,
- FeatureFuseAES,
+ FeatureFuseAES
]>;
-def ProcNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily",
- "NeoverseN2",
+def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2",
"Neoverse N2 ARM processors", [
- HasV8_5aOps,
- FeatureBF16,
- FeatureETE,
- FeatureMatMulInt8,
- FeatureMTE,
- FeatureSVE2,
- FeatureSVE2BitPerm,
- FeatureTRBE,
FeaturePostRAScheduler,
- FeatureCrypto,
- FeatureFuseAES,
+ FeatureFuseAES
+ ]>;
+def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Neoverse512TVB",
+ "Neoverse 512-TVB ARM processors", [
+ FeaturePostRAScheduler,
+ FeatureFuseAES
]>;
-def ProcNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily",
- "NeoverseV1",
+def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1",
"Neoverse V1 ARM processors", [
- HasV8_4aOps,
- FeatureBF16,
- FeatureCacheDeepPersist,
- FeatureCrypto,
- FeatureFPARMv8,
- FeatureFP16FML,
- FeatureFullFP16,
FeatureFuseAES,
- FeatureMatMulInt8,
- FeatureNEON,
- FeaturePerfMon,
- FeaturePostRAScheduler,
- FeatureRandGen,
- FeatureSPE,
- FeatureSSBS,
- FeatureSVE]>;
+ FeaturePostRAScheduler]>;
-def ProcSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
+def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
"Qualcomm Saphira processors", [
- FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
- FeatureFPARMv8,
- FeatureNEON,
- FeatureSPE,
- FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
- FeatureLSLFast,
- HasV8_4aOps]>;
+ FeatureLSLFast]>;
-def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily",
- "ThunderX2T99",
+def TuneThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "ThunderX2T99",
"Cavium ThunderX2 processors", [
FeatureAggressiveFMA,
- FeatureCRC,
- FeatureCrypto,
- FeatureFPARMv8,
- FeatureArithmeticBccFusion,
- FeatureNEON,
- FeaturePostRAScheduler,
- FeaturePredictableSelectIsExpensive,
- FeatureLSE,
- HasV8_1aOps]>;
-
-def ProcThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily",
- "ThunderX3T110",
- "Marvell ThunderX3 processors", [
- FeatureAggressiveFMA,
- FeatureCRC,
- FeatureCrypto,
- FeatureFPARMv8,
FeatureArithmeticBccFusion,
- FeatureNEON,
FeaturePostRAScheduler,
- FeaturePredictableSelectIsExpensive,
- FeatureLSE,
- FeaturePAuth,
- FeatureBalanceFPOps,
- FeaturePerfMon,
- FeatureStrictAlign,
- HasV8_3aOps]>;
-
-def ProcThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX",
+ FeaturePredictableSelectIsExpensive]>;
+
+def TuneThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily",
+ "ThunderX3T110",
+ "Marvell ThunderX3 processors", [
+ FeatureAggressiveFMA,
+ FeatureArithmeticBccFusion,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureBalanceFPOps,
+ FeatureStrictAlign]>;
+
+def TuneThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX",
"Cavium ThunderX processors", [
- FeatureCRC,
- FeatureCrypto,
- FeatureFPARMv8,
- FeaturePerfMon,
FeaturePostRAScheduler,
- FeaturePredictableSelectIsExpensive,
- FeatureNEON]>;
+ FeaturePredictableSelectIsExpensive]>;
-def ProcThunderXT88 : SubtargetFeature<"thunderxt88", "ARMProcFamily",
+def TuneThunderXT88 : SubtargetFeature<"thunderxt88", "ARMProcFamily",
"ThunderXT88",
"Cavium ThunderX processors", [
- FeatureCRC,
- FeatureCrypto,
- FeatureFPARMv8,
- FeaturePerfMon,
FeaturePostRAScheduler,
- FeaturePredictableSelectIsExpensive,
- FeatureNEON]>;
+ FeaturePredictableSelectIsExpensive]>;
-def ProcThunderXT81 : SubtargetFeature<"thunderxt81", "ARMProcFamily",
+def TuneThunderXT81 : SubtargetFeature<"thunderxt81", "ARMProcFamily",
"ThunderXT81",
"Cavium ThunderX processors", [
- FeatureCRC,
- FeatureCrypto,
- FeatureFPARMv8,
- FeaturePerfMon,
FeaturePostRAScheduler,
- FeaturePredictableSelectIsExpensive,
- FeatureNEON]>;
+ FeaturePredictableSelectIsExpensive]>;
-def ProcThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily",
+def TuneThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily",
"ThunderXT83",
"Cavium ThunderX processors", [
- FeatureCRC,
- FeatureCrypto,
- FeatureFPARMv8,
- FeaturePerfMon,
FeaturePostRAScheduler,
- FeaturePredictableSelectIsExpensive,
- FeatureNEON]>;
+ FeaturePredictableSelectIsExpensive]>;
-def ProcTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110",
+def TuneTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110",
"HiSilicon TS-V110 processors", [
- HasV8_2aOps,
- FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
- FeatureFPARMv8,
FeatureFuseAES,
- FeatureNEON,
- FeaturePerfMon,
- FeaturePostRAScheduler,
- FeatureSPE,
- FeatureFullFP16,
- FeatureFP16FML,
- FeatureDotProd]>;
-
-def : ProcessorModel<"generic", NoSchedModel, [
- FeatureFPARMv8,
- FeatureFuseAES,
- FeatureNEON,
- FeaturePerfMon,
- FeaturePostRAScheduler,
-// ETE and TRBE are future architecture extensions. We temporarily enable them
-// by default for users targeting generic AArch64, until it is decided in which
-// armv8.x-a architecture revision they will end up. The extensions do not
-// affect code generated by the compiler and can be used only by explicitly
-// mentioning the new system register names in assembly.
- FeatureETE
- ]>;
-
-def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
-def : ProcessorModel<"cortex-a34", CortexA53Model, [ProcA35]>;
-def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
-def : ProcessorModel<"cortex-a55", CortexA55Model, [ProcA55]>;
-def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
-def : ProcessorModel<"cortex-a65", CortexA53Model, [ProcA65]>;
-def : ProcessorModel<"cortex-a65ae", CortexA53Model, [ProcA65]>;
-def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>;
-def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>;
-def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>;
-def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>;
-def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>;
-def : ProcessorModel<"cortex-a77", CortexA57Model, [ProcA77]>;
-def : ProcessorModel<"cortex-a78", CortexA57Model, [ProcA78]>;
-def : ProcessorModel<"cortex-a78c", CortexA57Model, [ProcA78C]>;
-def : ProcessorModel<"cortex-r82", CortexA55Model, [ProcR82]>;
-def : ProcessorModel<"cortex-x1", CortexA57Model, [ProcX1]>;
-def : ProcessorModel<"neoverse-e1", CortexA53Model, [ProcNeoverseE1]>;
-def : ProcessorModel<"neoverse-n1", CortexA57Model, [ProcNeoverseN1]>;
-def : ProcessorModel<"neoverse-n2", CortexA57Model, [ProcNeoverseN2]>;
-def : ProcessorModel<"neoverse-v1", CortexA57Model, [ProcNeoverseV1]>;
-def : ProcessorModel<"exynos-m3", ExynosM3Model, [ProcExynosM3]>;
-def : ProcessorModel<"exynos-m4", ExynosM4Model, [ProcExynosM4]>;
-def : ProcessorModel<"exynos-m5", ExynosM5Model, [ProcExynosM4]>;
-def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>;
-def : ProcessorModel<"saphira", FalkorModel, [ProcSaphira]>;
-def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;
+ FeaturePostRAScheduler]>;
+
+
+def ProcessorFeatures {
+ list<SubtargetFeature> A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
+ FeatureFPARMv8, FeatureNEON, FeaturePerfMon];
+ list<SubtargetFeature> A55 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ FeatureNEON, FeatureFullFP16, FeatureDotProd,
+ FeatureRCPC, FeaturePerfMon];
+ list<SubtargetFeature> A510 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon,
+ FeatureMatMulInt8, FeatureBF16, FeatureAM,
+ FeatureMTE, FeatureETE, FeatureSVE2BitPerm,
+ FeatureFP16FML];
+ list<SubtargetFeature> A65 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ FeatureNEON, FeatureFullFP16, FeatureDotProd,
+ FeatureRCPC, FeatureSSBS, FeatureRAS];
+ list<SubtargetFeature> A76 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ FeatureNEON, FeatureFullFP16, FeatureDotProd,
+ FeatureRCPC, FeatureSSBS];
+ list<SubtargetFeature> A77 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ FeatureNEON, FeatureFullFP16, FeatureDotProd,
+ FeatureRCPC];
+ list<SubtargetFeature> A78 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ FeatureNEON, FeatureFullFP16, FeatureDotProd,
+ FeatureRCPC, FeaturePerfMon, FeatureSPE,
+ FeatureSSBS];
+ list<SubtargetFeature> A78C = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ FeatureNEON, FeatureFullFP16, FeatureDotProd,
+ FeatureFlagM, FeatureFP16FML, FeaturePAuth,
+ FeaturePerfMon, FeatureRCPC, FeatureSPE,
+ FeatureSSBS];
+ list<SubtargetFeature> A710 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon,
+ FeatureETE, FeatureMTE, FeatureFP16FML,
+ FeatureSVE2BitPerm, FeatureBF16, FeatureMatMulInt8];
+ list<SubtargetFeature> R82 = [HasV8_0rOps];
+ list<SubtargetFeature> X1 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ FeatureNEON, FeatureRCPC, FeaturePerfMon,
+ FeatureSPE, FeatureFullFP16, FeatureDotProd];
+ list<SubtargetFeature> X2 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon,
+ FeatureMatMulInt8, FeatureBF16, FeatureAM,
+ FeatureMTE, FeatureETE, FeatureSVE2BitPerm,
+ FeatureFP16FML];
+ list<SubtargetFeature> A64FX = [HasV8_2aOps, FeatureFPARMv8, FeatureNEON,
+ FeatureSHA2, FeaturePerfMon, FeatureFullFP16,
+ FeatureSVE, FeatureComplxNum];
+ list<SubtargetFeature> Carmel = [HasV8_2aOps, FeatureNEON, FeatureCrypto,
+ FeatureFullFP16];
+ list<SubtargetFeature> AppleA7 = [HasV8_0aOps, FeatureCrypto, FeatureFPARMv8,
+                                    FeatureNEON, FeaturePerfMon, FeatureAppleA7SysReg];
+ list<SubtargetFeature> AppleA10 = [HasV8_0aOps, FeatureCrypto, FeatureFPARMv8,
+ FeatureNEON, FeaturePerfMon, FeatureCRC,
+ FeatureRDM, FeaturePAN, FeatureLOR, FeatureVH];
+ list<SubtargetFeature> AppleA11 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ FeatureNEON, FeaturePerfMon, FeatureFullFP16];
+ list<SubtargetFeature> AppleA12 = [HasV8_3aOps, FeatureCrypto, FeatureFPARMv8,
+ FeatureNEON, FeaturePerfMon, FeatureFullFP16];
+ list<SubtargetFeature> AppleA13 = [HasV8_4aOps, FeatureCrypto, FeatureFPARMv8,
+ FeatureNEON, FeaturePerfMon, FeatureFullFP16,
+ FeatureFP16FML, FeatureSHA3];
+ list<SubtargetFeature> AppleA14 = [HasV8_4aOps, FeatureCrypto, FeatureFPARMv8,
+ FeatureNEON, FeaturePerfMon, FeatureFRInt3264,
+ FeatureSpecRestrict, FeatureSSBS, FeatureSB,
+ FeaturePredRes, FeatureCacheDeepPersist,
+ FeatureFullFP16, FeatureFP16FML, FeatureSHA3,
+ FeatureAltFPCmp];
+ list<SubtargetFeature> ExynosM3 = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
+ FeaturePerfMon];
+ list<SubtargetFeature> ExynosM4 = [HasV8_2aOps, FeatureCrypto, FeatureDotProd,
+ FeatureFullFP16, FeaturePerfMon];
+ list<SubtargetFeature> Falkor = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
+ FeatureFPARMv8, FeatureNEON, FeaturePerfMon,
+ FeatureRDM];
+ list<SubtargetFeature> NeoverseE1 = [HasV8_2aOps, FeatureCrypto, FeatureDotProd,
+ FeatureFPARMv8, FeatureFullFP16, FeatureNEON,
+ FeatureRCPC, FeatureSSBS];
+ list<SubtargetFeature> NeoverseN1 = [HasV8_2aOps, FeatureCrypto, FeatureDotProd,
+ FeatureFPARMv8, FeatureFullFP16, FeatureNEON,
+ FeatureRCPC, FeatureSPE, FeatureSSBS];
+ list<SubtargetFeature> NeoverseN2 = [HasV8_5aOps, FeatureBF16, FeatureETE,
+ FeatureMatMulInt8, FeatureMTE, FeatureSVE2,
+ FeatureSVE2BitPerm, FeatureTRBE, FeatureCrypto];
+ list<SubtargetFeature> Neoverse512TVB = [HasV8_4aOps, FeatureBF16, FeatureCacheDeepPersist,
+ FeatureCrypto, FeatureFPARMv8, FeatureFP16FML,
+ FeatureFullFP16, FeatureMatMulInt8, FeatureNEON,
+ FeaturePerfMon, FeatureRandGen, FeatureSPE,
+ FeatureSSBS, FeatureSVE];
+ list<SubtargetFeature> NeoverseV1 = [HasV8_4aOps, FeatureBF16, FeatureCacheDeepPersist,
+ FeatureCrypto, FeatureFPARMv8, FeatureFP16FML,
+ FeatureFullFP16, FeatureMatMulInt8, FeatureNEON,
+ FeaturePerfMon, FeatureRandGen, FeatureSPE,
+ FeatureSSBS, FeatureSVE];
+ list<SubtargetFeature> Saphira = [HasV8_4aOps, FeatureCrypto, FeatureFPARMv8,
+ FeatureNEON, FeatureSPE, FeaturePerfMon];
+ list<SubtargetFeature> ThunderX = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
+ FeatureFPARMv8, FeaturePerfMon, FeatureNEON];
+ list<SubtargetFeature> ThunderX2T99 = [HasV8_1aOps, FeatureCRC, FeatureCrypto,
+ FeatureFPARMv8, FeatureNEON, FeatureLSE];
+ list<SubtargetFeature> ThunderX3T110 = [HasV8_3aOps, FeatureCRC, FeatureCrypto,
+ FeatureFPARMv8, FeatureNEON, FeatureLSE,
+ FeaturePAuth, FeaturePerfMon];
+ list<SubtargetFeature> TSV110 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
+ FeatureNEON, FeaturePerfMon, FeatureSPE,
+ FeatureFullFP16, FeatureFP16FML, FeatureDotProd];
+
+ // ETE and TRBE are future architecture extensions. We temporarily enable them
+ // by default for users targeting generic AArch64. The extensions do not
+ // affect code generated by the compiler and can be used only by explicitly
+ // mentioning the new system register names in assembly.
+ list<SubtargetFeature> Generic = [FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureETE];
+}
+
+
+def : ProcessorModel<"generic", CortexA55Model, ProcessorFeatures.Generic,
+ [FeatureFuseAES, FeaturePostRAScheduler]>;
+def : ProcessorModel<"cortex-a35", CortexA53Model, ProcessorFeatures.A53,
+ [TuneA35]>;
+def : ProcessorModel<"cortex-a34", CortexA53Model, ProcessorFeatures.A53,
+ [TuneA35]>;
+def : ProcessorModel<"cortex-a53", CortexA53Model, ProcessorFeatures.A53,
+ [TuneA53]>;
+def : ProcessorModel<"cortex-a55", CortexA55Model, ProcessorFeatures.A55,
+ [TuneA55]>;
+def : ProcessorModel<"cortex-a510", CortexA55Model, ProcessorFeatures.A510,
+ [TuneA510]>;
+def : ProcessorModel<"cortex-a57", CortexA57Model, ProcessorFeatures.A53,
+ [TuneA57]>;
+def : ProcessorModel<"cortex-a65", CortexA53Model, ProcessorFeatures.A65,
+ [TuneA65]>;
+def : ProcessorModel<"cortex-a65ae", CortexA53Model, ProcessorFeatures.A65,
+ [TuneA65]>;
+def : ProcessorModel<"cortex-a72", CortexA57Model, ProcessorFeatures.A53,
+ [TuneA72]>;
+def : ProcessorModel<"cortex-a73", CortexA57Model, ProcessorFeatures.A53,
+ [TuneA73]>;
+def : ProcessorModel<"cortex-a75", CortexA57Model, ProcessorFeatures.A55,
+ [TuneA75]>;
+def : ProcessorModel<"cortex-a76", CortexA57Model, ProcessorFeatures.A76,
+ [TuneA76]>;
+def : ProcessorModel<"cortex-a76ae", CortexA57Model, ProcessorFeatures.A76,
+ [TuneA76]>;
+def : ProcessorModel<"cortex-a77", CortexA57Model, ProcessorFeatures.A77,
+ [TuneA77]>;
+def : ProcessorModel<"cortex-a78", CortexA57Model, ProcessorFeatures.A78,
+ [TuneA78]>;
+def : ProcessorModel<"cortex-a78c", CortexA57Model, ProcessorFeatures.A78C,
+ [TuneA78C]>;
+def : ProcessorModel<"cortex-a710", CortexA57Model, ProcessorFeatures.A710,
+ [TuneA710]>;
+def : ProcessorModel<"cortex-r82", CortexA55Model, ProcessorFeatures.R82,
+ [TuneR82]>;
+def : ProcessorModel<"cortex-x1", CortexA57Model, ProcessorFeatures.X1,
+ [TuneX1]>;
+def : ProcessorModel<"cortex-x2", CortexA57Model, ProcessorFeatures.X2,
+ [TuneX2]>;
+def : ProcessorModel<"neoverse-e1", CortexA53Model,
+ ProcessorFeatures.NeoverseE1, [TuneNeoverseE1]>;
+def : ProcessorModel<"neoverse-n1", CortexA57Model,
+ ProcessorFeatures.NeoverseN1, [TuneNeoverseN1]>;
+def : ProcessorModel<"neoverse-n2", CortexA57Model,
+ ProcessorFeatures.NeoverseN2, [TuneNeoverseN2]>;
+def : ProcessorModel<"neoverse-512tvb", CortexA57Model,
+ ProcessorFeatures.Neoverse512TVB, [TuneNeoverse512TVB]>;
+def : ProcessorModel<"neoverse-v1", CortexA57Model,
+ ProcessorFeatures.NeoverseV1, [TuneNeoverseV1]>;
+def : ProcessorModel<"exynos-m3", ExynosM3Model, ProcessorFeatures.ExynosM3,
+ [TuneExynosM3]>;
+def : ProcessorModel<"exynos-m4", ExynosM4Model, ProcessorFeatures.ExynosM4,
+ [TuneExynosM4]>;
+def : ProcessorModel<"exynos-m5", ExynosM5Model, ProcessorFeatures.ExynosM4,
+ [TuneExynosM4]>;
+def : ProcessorModel<"falkor", FalkorModel, ProcessorFeatures.Falkor,
+ [TuneFalkor]>;
+def : ProcessorModel<"saphira", FalkorModel, ProcessorFeatures.Saphira,
+ [TuneSaphira]>;
+def : ProcessorModel<"kryo", KryoModel, ProcessorFeatures.A53, [TuneKryo]>;
+
// Cavium ThunderX/ThunderX T8X Processors
-def : ProcessorModel<"thunderx", ThunderXT8XModel, [ProcThunderX]>;
-def : ProcessorModel<"thunderxt88", ThunderXT8XModel, [ProcThunderXT88]>;
-def : ProcessorModel<"thunderxt81", ThunderXT8XModel, [ProcThunderXT81]>;
-def : ProcessorModel<"thunderxt83", ThunderXT8XModel, [ProcThunderXT83]>;
+def : ProcessorModel<"thunderx", ThunderXT8XModel, ProcessorFeatures.ThunderX,
+ [TuneThunderX]>;
+def : ProcessorModel<"thunderxt88", ThunderXT8XModel,
+ ProcessorFeatures.ThunderX, [TuneThunderXT88]>;
+def : ProcessorModel<"thunderxt81", ThunderXT8XModel,
+ ProcessorFeatures.ThunderX, [TuneThunderXT81]>;
+def : ProcessorModel<"thunderxt83", ThunderXT8XModel,
+ ProcessorFeatures.ThunderX, [TuneThunderXT83]>;
// Cavium ThunderX2T9X Processors. Formerly Broadcom Vulcan.
-def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>;
+def : ProcessorModel<"thunderx2t99", ThunderX2T99Model,
+ ProcessorFeatures.ThunderX2T99, [TuneThunderX2T99]>;
// Marvell ThunderX3T110 Processors.
-def : ProcessorModel<"thunderx3t110", ThunderX3T110Model, [ProcThunderX3T110]>;
-def : ProcessorModel<"tsv110", TSV110Model, [ProcTSV110]>;
+def : ProcessorModel<"thunderx3t110", ThunderX3T110Model,
+ ProcessorFeatures.ThunderX3T110, [TuneThunderX3T110]>;
+def : ProcessorModel<"tsv110", TSV110Model, ProcessorFeatures.TSV110,
+ [TuneTSV110]>;
// Support cyclone as an alias for apple-a7 so we can still LTO old bitcode.
-def : ProcessorModel<"cyclone", CycloneModel, [ProcAppleA7]>;
+def : ProcessorModel<"cyclone", CycloneModel, ProcessorFeatures.AppleA7,
+ [TuneAppleA7]>;
// iPhone and iPad CPUs
-def : ProcessorModel<"apple-a7", CycloneModel, [ProcAppleA7]>;
-def : ProcessorModel<"apple-a8", CycloneModel, [ProcAppleA7]>;
-def : ProcessorModel<"apple-a9", CycloneModel, [ProcAppleA7]>;
-def : ProcessorModel<"apple-a10", CycloneModel, [ProcAppleA10]>;
-def : ProcessorModel<"apple-a11", CycloneModel, [ProcAppleA11]>;
-def : ProcessorModel<"apple-a12", CycloneModel, [ProcAppleA12]>;
-def : ProcessorModel<"apple-a13", CycloneModel, [ProcAppleA13]>;
-def : ProcessorModel<"apple-a14", CycloneModel, [ProcAppleA14]>;
+def : ProcessorModel<"apple-a7", CycloneModel, ProcessorFeatures.AppleA7,
+ [TuneAppleA7]>;
+def : ProcessorModel<"apple-a8", CycloneModel, ProcessorFeatures.AppleA7,
+ [TuneAppleA7]>;
+def : ProcessorModel<"apple-a9", CycloneModel, ProcessorFeatures.AppleA7,
+ [TuneAppleA7]>;
+def : ProcessorModel<"apple-a10", CycloneModel, ProcessorFeatures.AppleA10,
+ [TuneAppleA10]>;
+def : ProcessorModel<"apple-a11", CycloneModel, ProcessorFeatures.AppleA11,
+ [TuneAppleA11]>;
+def : ProcessorModel<"apple-a12", CycloneModel, ProcessorFeatures.AppleA12,
+ [TuneAppleA12]>;
+def : ProcessorModel<"apple-a13", CycloneModel, ProcessorFeatures.AppleA13,
+ [TuneAppleA13]>;
+def : ProcessorModel<"apple-a14", CycloneModel, ProcessorFeatures.AppleA14,
+ [TuneAppleA14]>;
// Mac CPUs
-def : ProcessorModel<"apple-m1", CycloneModel, [ProcAppleA14]>;
+def : ProcessorModel<"apple-m1", CycloneModel, ProcessorFeatures.AppleA14,
+ [TuneAppleA14]>;
// watch CPUs.
-def : ProcessorModel<"apple-s4", CycloneModel, [ProcAppleA12]>;
-def : ProcessorModel<"apple-s5", CycloneModel, [ProcAppleA12]>;
+def : ProcessorModel<"apple-s4", CycloneModel, ProcessorFeatures.AppleA12,
+ [TuneAppleA12]>;
+def : ProcessorModel<"apple-s5", CycloneModel, ProcessorFeatures.AppleA12,
+ [TuneAppleA12]>;
// Alias for the latest Apple processor model supported by LLVM.
-def : ProcessorModel<"apple-latest", CycloneModel, [ProcAppleA14]>;
+def : ProcessorModel<"apple-latest", CycloneModel, ProcessorFeatures.AppleA14,
+ [TuneAppleA14]>;
// Fujitsu A64FX
-def : ProcessorModel<"a64fx", A64FXModel, [ProcA64FX]>;
+def : ProcessorModel<"a64fx", A64FXModel, ProcessorFeatures.A64FX,
+ [TuneA64FX]>;
// Nvidia Carmel
-def : ProcessorModel<"carmel", NoSchedModel, [ProcCarmel]>;
+def : ProcessorModel<"carmel", NoSchedModel, ProcessorFeatures.Carmel,
+ [TuneCarmel]>;
//===----------------------------------------------------------------------===//
// Assembly parser
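The restructuring above separates what a CPU implements from how code should be tuned for it: architecture and ISA features now come from the ProcessorFeatures lists, while the Tune* records carry only scheduling and fusion heuristics. A minimal sketch of how a new CPU would be registered under this split (the "future-cpu" name and the TuneFutureCPU record are hypothetical, for illustration only):
// Tuning-only record; it reuses an existing ARMProcFamily value so no
// C++ enum change is needed for this illustration.
def TuneFutureCPU : SubtargetFeature<"future-cpu", "ARMProcFamily", "CortexA55",
                                     "Hypothetical future CPU", [
                                     FeatureFuseAES,
                                     FeaturePostRAScheduler]>;
// Architecture features come from an existing ProcessorFeatures list;
// tuning features are passed separately as the fourth argument.
def : ProcessorModel<"future-cpu", CortexA55Model, ProcessorFeatures.A55,
                     [TuneFutureCPU]>;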
diff --git a/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp b/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp
index e80fe2cada09..7fd51a98ad94 100644
--- a/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp
+++ b/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp
@@ -159,7 +159,7 @@ static MachineInstr *getLastNonPseudo(MachineBasicBlock &MBB,
// If there is no non-pseudo in the current block, loop back around and try
// the previous block (if there is one).
while ((FMBB = getBBFallenThrough(FMBB, TII))) {
- for (MachineInstr &I : make_range(FMBB->rbegin(), FMBB->rend()))
+ for (MachineInstr &I : llvm::reverse(*FMBB))
if (!I.isPseudo())
return &I;
}
diff --git a/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index c996d2df8c38..cd67e058a9c1 100644
--- a/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -377,8 +377,7 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
// processMachineBasicBlock - Main optimization loop.
bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
bool Changed = false;
- for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
- MachineInstr &MI = *I++;
+ for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) {
if (isProfitableToTransform(MI)) {
transformInstruction(MI);
Changed = true;
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index adefe3b37ee0..9f527a17d390 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -50,9 +50,9 @@
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
@@ -293,7 +293,7 @@ void AArch64AsmPrinter::emitSled(const MachineInstr &MI, SledKind Kind) {
// ;DATA: higher 32 bits of the address of the trampoline
// LDP X0, X30, [SP], #16 ; pop X0 and the link register from the stack
//
- OutStreamer->emitCodeAlignment(4);
+ OutStreamer->emitCodeAlignment(4, &getSubtargetInfo());
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
OutStreamer->emitLabel(CurSled);
auto Target = OutContext.createTempSymbol();
@@ -653,6 +653,9 @@ bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
case 'x':
Reg = getXRegFromWReg(Reg);
break;
+ case 't':
+ Reg = getXRegFromXRegTuple(Reg);
+ break;
}
O << AArch64InstPrinter::getRegisterName(Reg);
@@ -749,6 +752,10 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
AArch64::GPR64allRegClass.contains(Reg))
return printAsmMRegister(MO, 'x', O);
+ // If this is an x register tuple, print an x register.
+ if (AArch64::GPR64x8ClassRegClass.contains(Reg))
+ return printAsmMRegister(MO, 't', O);
+
unsigned AltName = AArch64::NoRegAltName;
const TargetRegisterClass *RegClass;
if (AArch64::ZPRRegClass.contains(Reg)) {
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 4b7ce565eb1e..c90601443934 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -50,9 +50,9 @@ def CC_AArch64_AAPCS : CallingConv<[
// "sret" on argument 1 means instance methods.
CCIfInReg<CCIfType<[i64],
- CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1], [W0, W1]>>>>>,
+ CCIfSRet<CCIfType<[i64], CCAssignToReg<[X0, X1]>>>>>,
- CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>,
+ CCIfSRet<CCIfType<[i64], CCAssignToReg<[X8]>>>,
// Put ByVal arguments directly on the stack. Minimum size and alignment of a
// slot is 64-bit.
@@ -64,14 +64,14 @@ def CC_AArch64_AAPCS : CallingConv<[
CCIfNest<CCAssignToReg<[X18]>>,
// Pass SwiftSelf in a callee saved register.
- CCIfSwiftSelf<CCIfType<[i64], CCAssignToRegWithShadow<[X20], [W20]>>>,
+ CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[X20]>>>,
// A SwiftError is passed in X21.
- CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X21], [W21]>>>,
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[X21]>>>,
// Pass SwiftAsync in an otherwise callee saved register so that it will be
// preserved for normal function calls.
- CCIfSwiftAsync<CCIfType<[i64], CCAssignToRegWithShadow<[X22], [W22]>>>,
+ CCIfSwiftAsync<CCIfType<[i64], CCAssignToReg<[X22]>>>,
CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
@@ -90,8 +90,7 @@ def CC_AArch64_AAPCS : CallingConv<[
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
// up to eight each of GPR and FPR.
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
- CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
- [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+ CCIfType<[i32], CCAssignToReg<[W0, W1, W2, W3, W4, W5, W6, W7]>>,
// i128 is split to two i64s, we can't fit half to register X7.
CCIfType<[i64], CCIfSplit<CCAssignToRegWithShadow<[X0, X2, X4, X6],
[X0, X1, X3, X5]>>>,
@@ -99,19 +98,13 @@ def CC_AArch64_AAPCS : CallingConv<[
// i128 is split to two i64s, and its stack alignment is 16 bytes.
CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [X7]>>>,
- CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
- [W0, W1, W2, W3, W4, W5, W6, W7]>>,
- CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
- [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
- [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
- [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
- [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[i64], CCAssignToReg<[X0, X1, X2, X3, X4, X5, X6, X7]>>,
+ CCIfType<[f16], CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>,
+ CCIfType<[bf16], CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>,
+ CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>,
+ CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
- CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
- [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
@@ -131,7 +124,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[
CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
- CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X21], [W21]>>>,
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[X21]>>>,
// Big endian vectors must be passed as if they were 1-element vectors so that
// their lanes are in a consistent order.
@@ -141,21 +134,14 @@ def RetCC_AArch64_AAPCS : CallingConv<[
CCBitConvertToType<f128>>>,
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
- CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
- [X0, X1, X2, X3, X4, X5, X6, X7]>>,
- CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
- [W0, W1, W2, W3, W4, W5, W6, W7]>>,
- CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
- [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
- [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
- [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
- [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[i32], CCAssignToReg<[W0, W1, W2, W3, W4, W5, W6, W7]>>,
+ CCIfType<[i64], CCAssignToReg<[X0, X1, X2, X3, X4, X5, X6, X7]>>,
+ CCIfType<[f16], CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>,
+ CCIfType<[bf16], CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>,
+ CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>,
+ CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
- CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
- [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
@@ -195,49 +181,41 @@ def CC_AArch64_DarwinPCS : CallingConv<[
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
// An SRet is passed in X8, not X0 like a normal pointer parameter.
- CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>,
+ CCIfSRet<CCIfType<[i64], CCAssignToReg<[X8]>>>,
// Put ByVal arguments directly on the stack. Minimum size and alignment of a
// slot is 64-bit.
CCIfByVal<CCPassByVal<8, 8>>,
// Pass SwiftSelf in a callee saved register.
- CCIfSwiftSelf<CCIfType<[i64], CCAssignToRegWithShadow<[X20], [W20]>>>,
+ CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[X20]>>>,
// A SwiftError is passed in X21.
- CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X21], [W21]>>>,
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[X21]>>>,
// Pass SwiftAsync in an otherwise callee saved register so that it will be
// preserved for normal function calls.
- CCIfSwiftAsync<CCIfType<[i64], CCAssignToRegWithShadow<[X22], [W22]>>>,
+ CCIfSwiftAsync<CCIfType<[i64], CCAssignToReg<[X22]>>>,
CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
// up to eight each of GPR and FPR.
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
- CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
- [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+ CCIfType<[i32], CCAssignToReg<[W0, W1, W2, W3, W4, W5, W6, W7]>>,
// i128 is split to two i64s, we can't fit half to register X7.
CCIfType<[i64],
- CCIfSplit<CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6],
- [W0, W1, W2, W3, W4, W5, W6]>>>,
+ CCIfSplit<CCAssignToReg<[X0, X1, X2, X3, X4, X5, X6]>>>,
// i128 is split to two i64s, and its stack alignment is 16 bytes.
CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [X7]>>>,
- CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
- [W0, W1, W2, W3, W4, W5, W6, W7]>>,
- CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
- [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
- [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
- [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
- [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[i64], CCAssignToReg<[X0, X1, X2, X3, X4, X5, X6, X7]>>,
+ CCIfType<[f16], CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>,
+ CCIfType<[bf16], CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>,
+ CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>,
+ CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
- CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
- [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
@@ -310,8 +288,8 @@ let Entry = 1 in
def CC_AArch64_WebKit_JS : CallingConv<[
// Handle i1, i8, i16, i32, and i64 passing in register X0 (W0).
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
- CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>,
- CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>,
+ CCIfType<[i32], CCAssignToReg<[W0]>>,
+ CCIfType<[i64], CCAssignToReg<[X0]>>,
// Pass the remaining arguments on the stack instead.
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
@@ -320,14 +298,10 @@ def CC_AArch64_WebKit_JS : CallingConv<[
let Entry = 1 in
def RetCC_AArch64_WebKit_JS : CallingConv<[
- CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
- [X0, X1, X2, X3, X4, X5, X6, X7]>>,
- CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
- [W0, W1, W2, W3, W4, W5, W6, W7]>>,
- CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
- [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
- [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
+ CCIfType<[i32], CCAssignToReg<[W0, W1, W2, W3, W4, W5, W6, W7]>>,
+ CCIfType<[i64], CCAssignToReg<[X0, X1, X2, X3, X4, X5, X6, X7]>>,
+ CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>,
+ CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>
]>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index d938008a1e07..d2097f7e6ee3 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -189,6 +189,13 @@ def fold_merge_to_zext : GICombineRule<
(apply [{ applyFoldMergeToZext(*${d}, MRI, B, Observer); }])
>;
+def mutate_anyext_to_zext : GICombineRule<
+ (defs root:$d),
+ (match (wip_match_opcode G_ANYEXT):$d,
+ [{ return matchMutateAnyExtToZExt(*${d}, MRI); }]),
+ (apply [{ applyMutateAnyExtToZExt(*${d}, MRI, B, Observer); }])
+>;
+
// Post-legalization combines which should happen at all optimization levels.
// (E.g. ones that facilitate matching for the selector) For example, matching
// pseudos.
@@ -204,7 +211,7 @@ def AArch64PostLegalizerLoweringHelper
def AArch64PostLegalizerCombinerHelper
: GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper",
[copy_prop, erase_undef_store, combines_for_extload,
- sext_trunc_sextload,
+ sext_trunc_sextload, mutate_anyext_to_zext,
hoist_logic_op_with_same_opcode_hands,
redundant_and, xor_of_and_with_same_reg,
extractvecelt_pairwise_add, redundant_or,
@@ -212,6 +219,7 @@ def AArch64PostLegalizerCombinerHelper
form_bitfield_extract, rotate_out_of_range,
icmp_to_true_false_known_bits, merge_unmerge,
select_combines, fold_merge_to_zext,
- constant_fold, identity_combines]> {
+ constant_fold, identity_combines,
+ ptr_add_immed_chain, overlapping_and]> {
let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule";
}
diff --git a/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp b/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp
index e90e8e3da057..533ab3b05de9 100644
--- a/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp
@@ -295,10 +295,7 @@ bool AArch64CondBrTuning::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
bool LocalChange = false;
- for (MachineBasicBlock::iterator I = MBB.getFirstTerminator(),
- E = MBB.end();
- I != E; ++I) {
- MachineInstr &MI = *I;
+ for (MachineInstr &MI : MBB.terminators()) {
switch (MI.getOpcode()) {
default:
break;
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index b2eee2845ba9..4c04e04a7d3c 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -937,12 +937,16 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case AArch64::ORRWrr: Opcode = AArch64::ORRWrs; break;
case AArch64::ORRXrr: Opcode = AArch64::ORRXrs; break;
}
- MachineInstrBuilder MIB1 =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode),
- MI.getOperand(0).getReg())
- .add(MI.getOperand(1))
- .add(MI.getOperand(2))
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+ MachineFunction &MF = *MBB.getParent();
+ // Try to create new inst without implicit operands added.
+ MachineInstr *NewMI = MF.CreateMachineInstr(
+ TII->get(Opcode), MI.getDebugLoc(), /*NoImplicit=*/true);
+ MBB.insert(MBBI, NewMI);
+ MachineInstrBuilder MIB1(MF, NewMI);
+ MIB1.addReg(MI.getOperand(0).getReg(), RegState::Define)
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
transferImpOps(MI, MIB1, MIB1);
MI.eraseFromParent();
return true;
@@ -1049,6 +1053,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case AArch64::MOVaddrEXT: {
// Expand into ADRP + ADD.
Register DstReg = MI.getOperand(0).getReg();
+ assert(DstReg != AArch64::XZR);
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg)
.add(MI.getOperand(1));
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 9acda17b816f..3dc694df509d 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -3483,7 +3483,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
return false;
const char *IntrMemName = isa<MemCpyInst>(II) ? "memcpy" : "memmove";
- return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 1);
+ return lowerCallTo(II, IntrMemName, II->arg_size() - 1);
}
case Intrinsic::memset: {
const MemSetInst *MSI = cast<MemSetInst>(II);
@@ -3499,7 +3499,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// address spaces.
return false;
- return lowerCallTo(II, "memset", II->getNumArgOperands() - 1);
+ return lowerCallTo(II, "memset", II->arg_size() - 1);
}
case Intrinsic::sin:
case Intrinsic::cos:
@@ -3533,10 +3533,10 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
}
ArgListTy Args;
- Args.reserve(II->getNumArgOperands());
+ Args.reserve(II->arg_size());
// Populate the argument list.
- for (auto &Arg : II->arg_operands()) {
+ for (auto &Arg : II->args()) {
ArgListEntry Entry;
Entry.Val = Arg;
Entry.Ty = Arg->getType();
@@ -4806,7 +4806,7 @@ bool AArch64FastISel::selectSDiv(const Instruction *I) {
const APInt &C = cast<ConstantInt>(I->getOperand(1))->getValue();
if ((VT != MVT::i32 && VT != MVT::i64) || !C ||
- !(C.isPowerOf2() || (-C).isPowerOf2()))
+ !(C.isPowerOf2() || C.isNegatedPowerOf2()))
return selectBinaryOp(I, ISD::SDIV);
unsigned Lg2 = C.countTrailingZeros();
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index f6a528c0e6fd..b630f4f0df5f 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1158,11 +1158,33 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// ORR is sufficient; it is assumed a Swift kernel would initialize the TBI
// bits so that is still true.
if (HasFP && AFI->hasSwiftAsyncContext()) {
- // ORR x29, x29, #0x1000_0000_0000_0000
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
- .addUse(AArch64::FP)
- .addImm(0x1100)
- .setMIFlag(MachineInstr::FrameSetup);
+ switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
+ case SwiftAsyncFramePointerMode::DeploymentBased:
+ if (Subtarget.swiftAsyncContextIsDynamicallySet()) {
+ // The special symbol below is absolute and has a *value* that can be
+ // combined with the frame pointer to signal an extended frame.
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::LOADgot), AArch64::X16)
+ .addExternalSymbol("swift_async_extendedFramePointerFlags",
+ AArch64II::MO_GOT);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrs), AArch64::FP)
+ .addUse(AArch64::FP)
+ .addUse(AArch64::X16)
+ .addImm(Subtarget.isTargetILP32() ? 32 : 0);
+ break;
+ }
+ LLVM_FALLTHROUGH;
+
+ case SwiftAsyncFramePointerMode::Always:
+ // ORR x29, x29, #0x1000_0000_0000_0000
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
+ .addUse(AArch64::FP)
+ .addImm(0x1100)
+ .setMIFlag(MachineInstr::FrameSetup);
+ break;
+
+ case SwiftAsyncFramePointerMode::Never:
+ break;
+ }
}
// All calls are tail calls in GHC calling conv, and functions have no
@@ -1205,7 +1227,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(-NumBytes), TII,
MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
- if (!NeedsWinCFI && needsFrameMoves) {
+ if (needsFrameMoves) {
// Label used to tie together the PROLOG_LABEL and the MachineMoves.
MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
// Encode the stack size of the leaf function.
@@ -1631,7 +1653,8 @@ static void InsertReturnAddressAuth(MachineFunction &MF,
// The AUTIASP instruction assembles to a hint instruction before v8.3a so
// this instruction can safely be used for any v8a architecture.
// From v8.3a onwards there are optimised authenticate LR and return
- // instructions, namely RETA{A,B}, that can be used instead.
+ // instructions, namely RETA{A,B}, that can be used instead. In this case the
+ // DW_CFA_AARCH64_negate_ra_state can't be emitted.
if (Subtarget.hasPAuth() && MBBI != MBB.end() &&
MBBI->getOpcode() == AArch64::RET_ReallyLR) {
BuildMI(MBB, MBBI, DL,
@@ -1643,6 +1666,12 @@ static void InsertReturnAddressAuth(MachineFunction &MF,
MBB, MBBI, DL,
TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP))
.setMIFlag(MachineInstr::FrameDestroy);
+
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameDestroy);
}
}
@@ -2472,22 +2501,20 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop))
.setMIFlag(MachineInstr::FrameSetup);
- if (!MF.getFunction().hasFnAttribute(Attribute::NoUnwind)) {
- // Emit a CFI instruction that causes 8 to be subtracted from the value of
- // x18 when unwinding past this frame.
- static const char CFIInst[] = {
- dwarf::DW_CFA_val_expression,
- 18, // register
- 2, // length
- static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
- static_cast<char>(-8) & 0x7f, // addend (sleb128)
- };
- unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
- nullptr, StringRef(CFIInst, sizeof(CFIInst))));
- BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlag(MachineInstr::FrameSetup);
- }
+ // Emit a CFI instruction that causes 8 to be subtracted from the value of
+ // x18 when unwinding past this frame.
+ static const char CFIInst[] = {
+ dwarf::DW_CFA_val_expression,
+ 18, // register
+ 2, // length
+ static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
+ static_cast<char>(-8) & 0x7f, // addend (sleb128)
+ };
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
+ nullptr, StringRef(CFIInst, sizeof(CFIInst))));
+ BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlag(MachineInstr::FrameSetup);
// This instruction also makes x18 live-in to the entry block.
MBB.addLiveIn(AArch64::X18);
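As an aside on the CFI escape emitted above: the -8 addend is stored as a single SLEB128 byte, which is why the code masks the cast char with 0x7f. A minimal, self-contained sketch of that single-byte encoding (the helper name is ours, not LLVM's):

#include <cassert>
#include <cstdint>

// Encode a value in [-64, 63] as a single SLEB128 byte: the low 7 bits of the
// two's-complement value, with the continuation bit (0x80) clear.
static uint8_t encodeSleb128SingleByte(int8_t Value) {
  assert(Value >= -64 && Value <= 63 && "needs more than one byte");
  return static_cast<uint8_t>(Value) & 0x7f;
}

int main() {
  // Matches the addend byte built above: static_cast<char>(-8) & 0x7f == 0x78.
  assert(encodeSleb128SingleByte(-8) == 0x78);
  return 0;
}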
@@ -2509,9 +2536,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
}
return true;
}
- for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
- ++RPII) {
- RegPairInfo RPI = *RPII;
+ for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
unsigned StrOpc;
@@ -3512,7 +3537,14 @@ StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
return StackOffset::getFixed(MFI.getObjectOffset(FI));
}
- return getFrameIndexReference(MF, FI, FrameReg);
+ // Go to common code if we cannot provide sp + offset.
+ if (MFI.hasVarSizedObjects() ||
+ MF.getInfo<AArch64FunctionInfo>()->getStackSizeSVE() ||
+ MF.getSubtarget().getRegisterInfo()->hasStackRealignment(MF))
+ return getFrameIndexReference(MF, FI, FrameReg);
+
+ FrameReg = AArch64::SP;
+ return getStackOffset(MF, MFI.getObjectOffset(FI));
}
/// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index f8adaf36db84..e6d997f91b47 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -67,8 +67,6 @@ public:
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
- bool hasSwiftExtendedFrame(const MachineFunction &MF) const;
-
bool assignCalleeSavedSpillSlots(MachineFunction &MF,
const TargetRegisterInfo *TRI,
std::vector<CalleeSavedInfo> &CSI,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 17e530a4641d..fe9b2f8883b9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -167,7 +167,7 @@ public:
case ISD::SPLAT_VECTOR: {
auto Opnd0 = N->getOperand(0);
if (auto CN = dyn_cast<ConstantSDNode>(Opnd0))
- if (CN->isNullValue())
+ if (CN->isZero())
return true;
if (auto CN = dyn_cast<ConstantFPSDNode>(Opnd0))
if (CN->isZero())
@@ -187,7 +187,7 @@ public:
case ISD::SPLAT_VECTOR: {
auto Opnd0 = N->getOperand(0);
if (auto CN = dyn_cast<ConstantSDNode>(Opnd0))
- if (CN->isNullValue())
+ if (CN->isZero())
return true;
if (auto CN = dyn_cast<ConstantFPSDNode>(Opnd0))
if (CN->isZero())
@@ -286,7 +286,8 @@ public:
void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, unsigned Scale,
- unsigned Opc_rr, unsigned Opc_ri);
+ unsigned Opc_rr, unsigned Opc_ri,
+ bool IsIntr = false);
bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
/// SVE Reg+Imm addressing mode.
@@ -1487,7 +1488,7 @@ AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr,
void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
unsigned Scale, unsigned Opc_ri,
- unsigned Opc_rr) {
+ unsigned Opc_rr, bool IsIntr) {
assert(Scale < 4 && "Invalid scaling value.");
SDLoc DL(N);
EVT VT = N->getValueType(0);
@@ -1497,11 +1498,11 @@ void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
SDValue Base, Offset;
unsigned Opc;
std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore(
- N, Opc_rr, Opc_ri, N->getOperand(2),
+ N, Opc_rr, Opc_ri, N->getOperand(IsIntr ? 3 : 2),
CurDAG->getTargetConstant(0, DL, MVT::i64), Scale);
- SDValue Ops[] = {N->getOperand(1), // Predicate
- Base, // Memory operand
+ SDValue Ops[] = {N->getOperand(IsIntr ? 2 : 1), // Predicate
+ Base, // Memory operand
Offset, Chain};
const EVT ResTys[] = {MVT::Untyped, MVT::Other};
@@ -2167,7 +2168,7 @@ static bool isBitfieldDstMask(uint64_t DstMask, const APInt &BitsToBeInserted,
APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth);
return (SignificantDstMask & SignificantBitsToBeInserted) == 0 &&
- (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue();
+ (SignificantDstMask | SignificantBitsToBeInserted).isAllOnes();
}
// Look for bits that will be useful for later uses.
@@ -2965,8 +2966,8 @@ static int getIntOperandFromRegisterString(StringRef RegString) {
// form described in getIntOperandFromRegisterString) or is a named register
// known by the MRS SysReg mapper.
bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) {
- const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
- const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
+ const auto *MD = cast<MDNodeSDNode>(N->getOperand(1));
+ const auto *RegString = cast<MDString>(MD->getMD()->getOperand(0));
SDLoc DL(N);
int Reg = getIntOperandFromRegisterString(RegString->getString());
@@ -3011,8 +3012,8 @@ bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) {
// form described in getIntOperandFromRegisterString) or is a named register
// known by the MSR SysReg mapper.
bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
- const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
- const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
+ const auto *MD = cast<MDNodeSDNode>(N->getOperand(1));
+ const auto *RegString = cast<MDString>(MD->getMD()->getOperand(0));
SDLoc DL(N);
int Reg = getIntOperandFromRegisterString(RegString->getString());
@@ -3152,7 +3153,6 @@ bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SD
Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
return true;
} else if ((ImmVal & 0xFF) == 0) {
- assert((ImmVal >= -32768) && (ImmVal <= 32512));
Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
Imm = CurDAG->getTargetConstant((ImmVal >> 8) & 0xFF, DL, MVT::i32);
return true;
@@ -3521,7 +3521,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
// Materialize zero constants as copies from WZR/XZR. This allows
// the coalescer to propagate these into other instructions.
ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
- if (ConstNode->isNullValue()) {
+ if (ConstNode->isZero()) {
if (VT == MVT::i32) {
SDValue New = CurDAG->getCopyFromReg(
CurDAG->getEntryNode(), SDLoc(Node), AArch64::WZR, MVT::i32);
@@ -3895,6 +3895,69 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::aarch64_ld64b:
SelectLoad(Node, 8, AArch64::LD64B, AArch64::x8sub_0);
return;
+ case Intrinsic::aarch64_sve_ld2_sret: {
+ if (VT == MVT::nxv16i8) {
+ SelectPredicatedLoad(Node, 2, 0, AArch64::LD2B_IMM, AArch64::LD2B,
+ true);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ SelectPredicatedLoad(Node, 2, 1, AArch64::LD2H_IMM, AArch64::LD2H,
+ true);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectPredicatedLoad(Node, 2, 2, AArch64::LD2W_IMM, AArch64::LD2W,
+ true);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectPredicatedLoad(Node, 2, 3, AArch64::LD2D_IMM, AArch64::LD2D,
+ true);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_sve_ld3_sret: {
+ if (VT == MVT::nxv16i8) {
+ SelectPredicatedLoad(Node, 3, 0, AArch64::LD3B_IMM, AArch64::LD3B,
+ true);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ SelectPredicatedLoad(Node, 3, 1, AArch64::LD3H_IMM, AArch64::LD3H,
+ true);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectPredicatedLoad(Node, 3, 2, AArch64::LD3W_IMM, AArch64::LD3W,
+ true);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectPredicatedLoad(Node, 3, 3, AArch64::LD3D_IMM, AArch64::LD3D,
+ true);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_sve_ld4_sret: {
+ if (VT == MVT::nxv16i8) {
+ SelectPredicatedLoad(Node, 4, 0, AArch64::LD4B_IMM, AArch64::LD4B,
+ true);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ SelectPredicatedLoad(Node, 4, 1, AArch64::LD4H_IMM, AArch64::LD4H,
+ true);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectPredicatedLoad(Node, 4, 2, AArch64::LD4W_IMM, AArch64::LD4W,
+ true);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectPredicatedLoad(Node, 4, 3, AArch64::LD4D_IMM, AArch64::LD4D,
+ true);
+ return;
+ }
+ break;
+ }
}
} break;
case ISD::INTRINSIC_WO_CHAIN: {
@@ -4987,6 +5050,14 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
SDValue &Base,
SDValue &OffImm) {
const EVT MemVT = getMemVTFromNode(*(CurDAG->getContext()), Root);
+ const DataLayout &DL = CurDAG->getDataLayout();
+
+ if (N.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+ OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64);
+ return true;
+ }
if (MemVT == EVT())
return false;
@@ -5010,6 +5081,11 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
return false;
Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+ }
+
OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64);
return true;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e7282aad05e2..6e9e61c8e7ac 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -246,6 +246,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
+ if (Subtarget->hasLS64()) {
+ addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
+ setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
+ setOperationAction(ISD::STORE, MVT::i64x8, Custom);
+ }
+
if (Subtarget->hasFPARMv8()) {
addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
@@ -779,6 +785,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::i128, Custom);
setOperationAction(ISD::STORE, MVT::i128, Custom);
+ // Aligned 128-bit loads and stores are single-copy atomic according to the
+ // v8.4a spec.
+ if (Subtarget->hasLSE2()) {
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
+ }
+
// 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
// custom lowering, as there are no un-paired non-temporal stores and
// legalization will break up 256 bit inputs.
@@ -882,9 +895,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
- // TODO: Do the same for FP_TO_*INT_SAT.
setTargetDAGCombine(ISD::FP_TO_SINT);
setTargetDAGCombine(ISD::FP_TO_UINT);
+ setTargetDAGCombine(ISD::FP_TO_SINT_SAT);
+ setTargetDAGCombine(ISD::FP_TO_UINT_SAT);
setTargetDAGCombine(ISD::FDIV);
// Try and combine setcc with csel
@@ -899,6 +913,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
+ setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::STORE);
if (Subtarget->supportsAddressTopByteIgnored())
setTargetDAGCombine(ISD::LOAD);
@@ -991,16 +1006,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
+ setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v1i64, Expand);
+ setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v1i64, Expand);
+
setOperationAction(ISD::MUL, MVT::v1i64, Expand);
// AArch64 doesn't have direct vector ->f32 conversion instructions for
// elements smaller than i32, so promote the input to i32 first.
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
- setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
- setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
- setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
- setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
// Similarly, there is no direct i32 -> f64 vector conversion instruction.
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
@@ -1013,6 +1027,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
if (Subtarget->hasFullFP16()) {
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
@@ -1020,6 +1038,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
} else {
// when AArch64 doesn't have fullfp16 support, promote the input
// to i32 first.
+ setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
+ setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
+ setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
+ setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
@@ -1034,6 +1056,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);
+ for (auto VT : {MVT::v1i64, MVT::v2i64}) {
+ setOperationAction(ISD::UMAX, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, Custom);
+ setOperationAction(ISD::UMIN, VT, Custom);
+ setOperationAction(ISD::SMIN, VT, Custom);
+ }
// AArch64 doesn't have MUL.2d:
setOperationAction(ISD::MUL, MVT::v2i64, Expand);
@@ -1260,6 +1288,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::FADD, VT, Custom);
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
setOperationAction(ISD::FDIV, VT, Custom);
setOperationAction(ISD::FMA, VT, Custom);
setOperationAction(ISD::FMAXIMUM, VT, Custom);
@@ -1447,6 +1476,8 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
if (!VT.isFloatingPoint())
setOperationAction(ISD::ABS, VT, Legal);
@@ -1502,6 +1533,8 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
MVT InnerVT = VT.changeVectorElementType(MVT::i8);
while (InnerVT != VT) {
setTruncStoreAction(VT, InnerVT, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
InnerVT = InnerVT.changeVectorElementType(
MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
}
@@ -1771,6 +1804,11 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
Known.Zero = APInt::getHighBitsSet(64, 32);
break;
}
+ case AArch64ISD::ASSERT_ZEXT_BOOL: {
+ Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+ Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
+ break;
+ }
case ISD::INTRINSIC_W_CHAIN: {
ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
@@ -2023,6 +2061,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::LASTA)
MAKE_CASE(AArch64ISD::LASTB)
MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
+ MAKE_CASE(AArch64ISD::LS64_BUILD)
+ MAKE_CASE(AArch64ISD::LS64_EXTRACT)
MAKE_CASE(AArch64ISD::TBL)
MAKE_CASE(AArch64ISD::FADD_PRED)
MAKE_CASE(AArch64ISD::FADDA_PRED)
@@ -2160,6 +2200,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::INDEX_VECTOR)
MAKE_CASE(AArch64ISD::UADDLP)
MAKE_CASE(AArch64ISD::CALL_RVMARKER)
+ MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL)
}
#undef MAKE_CASE
return nullptr;
@@ -2245,9 +2286,15 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
case AArch64::F128CSEL:
return EmitF128CSEL(MI, BB);
+ case TargetOpcode::STATEPOINT:
+ // STATEPOINT is a pseudo instruction with no implicit defs/uses, while the
+ // BL call instruction it is eventually lowered to has an implicit def of LR.
+ // Add that def here as a dead implicit operand as a workaround.
+ MI.addOperand(*MI.getMF(), MachineOperand::CreateReg(AArch64::LR, true,
+ true, false, true));
+ LLVM_FALLTHROUGH;
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
- case TargetOpcode::STATEPOINT:
return emitPatchPoint(MI, BB);
case AArch64::CATCHRET:
@@ -2285,7 +2332,7 @@ static bool isZerosVector(const SDNode *N) {
auto Opnd0 = N->getOperand(0);
auto *CINT = dyn_cast<ConstantSDNode>(Opnd0);
auto *CFP = dyn_cast<ConstantFPSDNode>(Opnd0);
- return (CINT && CINT->isNullValue()) || (CFP && CFP->isZero());
+ return (CINT && CINT->isZero()) || (CFP && CFP->isZero());
}
/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
@@ -2967,9 +3014,9 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
}
}
- if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
+ if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
- if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
+ if ((CC == ISD::SETNE) ^ RHSC->isZero())
AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
}
}
@@ -3134,14 +3181,14 @@ SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
// We can commute the SELECT_CC by inverting the condition. This
// might be needed to make this fit into a CSINV pattern.
- if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
+ if (CTVal->isAllOnes() && CFVal->isZero()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, LHS.getValueType());
}
// If the constants line up, perform the transform!
- if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
+ if (CTVal->isZero() && CFVal->isAllOnes()) {
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
@@ -3364,42 +3411,132 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
return SDValue();
}
+SDValue
+AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
+ SelectionDAG &DAG) const {
+ // AArch64 FP-to-int conversions saturate to the destination element size, so
+ // we can lower common saturating conversions to simple instructions.
+ SDValue SrcVal = Op.getOperand(0);
+ EVT SrcVT = SrcVal.getValueType();
+ EVT DstVT = Op.getValueType();
+ EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+
+ uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
+ uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
+ uint64_t SatWidth = SatVT.getScalarSizeInBits();
+ assert(SatWidth <= DstElementWidth &&
+ "Saturation width cannot exceed result width");
+
+ // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
+ // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
+ // types, so this is hard to reach.
+ if (DstVT.isScalableVector())
+ return SDValue();
+
+ EVT SrcElementVT = SrcVT.getVectorElementType();
+
+ // In the absence of FP16 support, promote f16 to f32 and saturate the result.
+ if (SrcElementVT == MVT::f16 &&
+ (!Subtarget->hasFullFP16() || DstElementWidth > 16)) {
+ MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
+ SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
+ SrcVT = F32VT;
+ SrcElementVT = MVT::f32;
+ SrcElementWidth = 32;
+ } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
+ SrcElementVT != MVT::f16)
+ return SDValue();
+
+ SDLoc DL(Op);
+ // Cases that we can emit directly.
+ if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
+ return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
+ DAG.getValueType(DstVT.getScalarType()));
+
+ // Otherwise emit a conversion that saturates to a wider bitwidth and clamp
+ // the result afterwards. This is only valid if the legal conversion is wider
+ // than the requested saturation width. For f64, since there are no 64-bit
+ // vector MIN/MAX instructions, it can be simpler to scalarize (until SQXTN is selected).
+ if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
+ return SDValue();
+
+ EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
+ SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
+ DAG.getValueType(IntVT.getScalarType()));
+ SDValue Sat;
+ if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
+ SDValue MinC = DAG.getConstant(
+ APInt::getSignedMaxValue(SatWidth).sextOrSelf(SrcElementWidth), DL,
+ IntVT);
+ SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
+ SDValue MaxC = DAG.getConstant(
+ APInt::getSignedMinValue(SatWidth).sextOrSelf(SrcElementWidth), DL,
+ IntVT);
+ Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
+ } else {
+ SDValue MinC = DAG.getConstant(
+ APInt::getAllOnesValue(SatWidth).zextOrSelf(SrcElementWidth), DL,
+ IntVT);
+ Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
+ }
+
+ return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
+}
+
SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
SelectionDAG &DAG) const {
// AArch64 FP-to-int conversions saturate to the destination register size, so
// we can lower common saturating conversions to simple instructions.
SDValue SrcVal = Op.getOperand(0);
-
EVT SrcVT = SrcVal.getValueType();
- EVT DstVT = Op.getValueType();
+ if (SrcVT.isVector())
+ return LowerVectorFP_TO_INT_SAT(Op, DAG);
+
+ EVT DstVT = Op.getValueType();
EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
uint64_t SatWidth = SatVT.getScalarSizeInBits();
uint64_t DstWidth = DstVT.getScalarSizeInBits();
assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
- // TODO: Support lowering of NEON and SVE conversions.
- if (SrcVT.isVector())
- return SDValue();
-
- // TODO: Saturate to SatWidth explicitly.
- if (SatWidth != DstWidth)
+ // In the absence of FP16 support, promote f16 to f32 and saturate the result.
+ if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) {
+ SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
+ SrcVT = MVT::f32;
+ } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16)
return SDValue();
- // In the absence of FP16 support, promote f32 to f16, like LowerFP_TO_INT().
- if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16())
- return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
- DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal),
- Op.getOperand(1));
-
+ SDLoc DL(Op);
// Cases that we can emit directly.
if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
(SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
- (DstVT == MVT::i64 || DstVT == MVT::i32))
- return Op;
+ DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
+ return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
+ DAG.getValueType(DstVT));
+
+ // Otherwise emit a conversion that saturates to a wider bitwidth and clamp
+ // the result afterwards. This is only valid if the legal conversion is wider
+ // than the requested saturation width.
+ if (DstWidth < SatWidth)
+ return SDValue();
- // For all other cases, fall back on the expanded form.
- return SDValue();
+ SDValue NativeCvt =
+ DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
+ SDValue Sat;
+ if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
+ SDValue MinC = DAG.getConstant(
+ APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth), DL, DstVT);
+ SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
+ SDValue MaxC = DAG.getConstant(
+ APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth), DL, DstVT);
+ Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
+ } else {
+ SDValue MinC = DAG.getConstant(
+ APInt::getAllOnesValue(SatWidth).zextOrSelf(DstWidth), DL, DstVT);
+ Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
+ }
+
+ return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
}
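Both saturating lowerings above share the same shape: convert at a legal width, clamp to the saturation bounds with min/max, then truncate. A scalar stand-in for that idea in plain C++ (illustrative only; the helper name and the f32-to-i8 choice are ours, and the explicit pre-clamp replaces the hardware FCVT's built-in saturation):

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

// Saturating f32 -> i8 built from a wider (i32) conversion plus a clamp,
// mirroring the "convert, SMIN/SMAX, truncate" sequence above.
static int8_t fptosiSat8(float X) {
  if (std::isnan(X))
    return 0;                      // fptosi.sat returns 0 for NaN.
  // Keep the wider conversion itself in range (the hardware FCVT saturates;
  // a plain C++ cast does not), then convert at the legal 32-bit width.
  float Clamped = std::min(std::max(X, -2147483648.0f), 2147483520.0f);
  int32_t Wide = static_cast<int32_t>(Clamped);
  // Clamp to the requested saturation width, then truncate.
  Wide = std::max(std::min(Wide, int32_t(INT8_MAX)), int32_t(INT8_MIN));
  return static_cast<int8_t>(Wide);
}

int main() {
  assert(fptosiSat8(1000.0f) == 127);
  assert(fptosiSat8(-1000.0f) == -128);
  assert(fptosiSat8(42.5f) == 42);
  return 0;
}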
SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
@@ -3938,8 +4075,8 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::aarch64_sve_ptrue:
- return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
- Op.getOperand(1));
+ return getPTrue(DAG, dl, Op.getValueType(),
+ cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
case Intrinsic::aarch64_sve_clz:
return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
@@ -4004,6 +4141,18 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::aarch64_sve_frecpx:
return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ case Intrinsic::aarch64_sve_frecpe_x:
+ return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_frecps_x:
+ return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_sve_frsqrte_x:
+ return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_frsqrts_x:
+ return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_fabs:
return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
@@ -4153,14 +4302,17 @@ bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
if (VT.getVectorElementType() == MVT::i32 &&
- VT.getVectorElementCount().getKnownMinValue() >= 4)
+ VT.getVectorElementCount().getKnownMinValue() >= 4 &&
+ !VT.isFixedLengthVector())
return true;
return false;
}
bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
- return ExtVal.getValueType().isScalableVector();
+ return ExtVal.getValueType().isScalableVector() ||
+ useSVEForFixedLengthVectorVT(ExtVal.getValueType(),
+ /*OverrideNEON=*/true);
}
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
@@ -4345,9 +4497,17 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
if (IsFixedLength) {
assert(Subtarget->useSVEForFixedLengthVectors() &&
"Cannot lower when not using SVE for fixed vectors");
- IndexVT = getContainerForFixedLengthVector(DAG, IndexVT);
- MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType());
+ if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) {
+ IndexVT = getContainerForFixedLengthVector(DAG, IndexVT);
+ MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType());
+ } else {
+ MemVT = getContainerForFixedLengthVector(DAG, MemVT);
+ IndexVT = MemVT.changeTypeToInteger();
+ }
InputVT = DAG.getValueType(MemVT.changeTypeToInteger());
+ Mask = DAG.getNode(
+ ISD::ZERO_EXTEND, DL,
+ VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask);
}
if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
@@ -4442,8 +4602,13 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
if (IsFixedLength) {
assert(Subtarget->useSVEForFixedLengthVectors() &&
"Cannot lower when not using SVE for fixed vectors");
- IndexVT = getContainerForFixedLengthVector(DAG, IndexVT);
- MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType());
+ if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) {
+ IndexVT = getContainerForFixedLengthVector(DAG, IndexVT);
+ MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType());
+ } else {
+ MemVT = getContainerForFixedLengthVector(DAG, MemVT);
+ IndexVT = MemVT.changeTypeToInteger();
+ }
InputVT = DAG.getValueType(MemVT.changeTypeToInteger());
StoreVal =
@@ -4452,6 +4617,9 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
ISD::ANY_EXTEND, DL,
VT.changeVectorElementType(IndexVT.getVectorElementType()), StoreVal);
StoreVal = convertToScalableVector(DAG, IndexVT, StoreVal);
+ Mask = DAG.getNode(
+ ISD::ZERO_EXTEND, DL,
+ VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask);
} else if (VT.isFloatingPoint()) {
// Handle FP data by casting the data so an integer scatter can be used.
EVT StoreValVT = getPackedSVEVectorVT(VT.getVectorElementCount());
@@ -4593,29 +4761,77 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
return Result;
}
} else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
- assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
- SDValue Lo =
- DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
- DAG.getConstant(0, Dl, MVT::i64));
- SDValue Hi =
- DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
- DAG.getConstant(1, Dl, MVT::i64));
- SDValue Result = DAG.getMemIntrinsicNode(
- AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other),
- {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
- StoreNode->getMemoryVT(), StoreNode->getMemOperand());
- return Result;
+ return LowerStore128(Op, DAG);
+ } else if (MemVT == MVT::i64x8) {
+ SDValue Value = StoreNode->getValue();
+ assert(Value->getValueType(0) == MVT::i64x8);
+ SDValue Chain = StoreNode->getChain();
+ SDValue Base = StoreNode->getBasePtr();
+ EVT PtrVT = Base.getValueType();
+ for (unsigned i = 0; i < 8; i++) {
+ SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
+ Value, DAG.getConstant(i, Dl, MVT::i32));
+ SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
+ DAG.getConstant(i * 8, Dl, PtrVT));
+ Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
+ StoreNode->getOriginalAlign());
+ }
+ return Chain;
}
return SDValue();
}
-// Custom lowering for extending v4i8 vector loads.
+/// Lower atomic or volatile 128-bit stores to a single STP instruction.
+SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
+ SelectionDAG &DAG) const {
+ MemSDNode *StoreNode = cast<MemSDNode>(Op);
+ assert(StoreNode->getMemoryVT() == MVT::i128);
+ assert(StoreNode->isVolatile() || StoreNode->isAtomic());
+ assert(!StoreNode->isAtomic() ||
+ StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
+ StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
+
+ SDValue Value = StoreNode->getOpcode() == ISD::STORE
+ ? StoreNode->getOperand(1)
+ : StoreNode->getOperand(2);
+ SDLoc DL(Op);
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
+ DAG.getConstant(0, DL, MVT::i64));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
+ DAG.getConstant(1, DL, MVT::i64));
+ SDValue Result = DAG.getMemIntrinsicNode(
+ AArch64ISD::STP, DL, DAG.getVTList(MVT::Other),
+ {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
+ StoreNode->getMemoryVT(), StoreNode->getMemOperand());
+ return Result;
+}
+
SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
assert(LoadNode && "Expected custom lowering of a load node");
+
+ if (LoadNode->getMemoryVT() == MVT::i64x8) {
+ SmallVector<SDValue, 8> Ops;
+ SDValue Base = LoadNode->getBasePtr();
+ SDValue Chain = LoadNode->getChain();
+ EVT PtrVT = Base.getValueType();
+ for (unsigned i = 0; i < 8; i++) {
+ SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
+ DAG.getConstant(i * 8, DL, PtrVT));
+ SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
+ LoadNode->getPointerInfo(),
+ LoadNode->getOriginalAlign());
+ Ops.push_back(Part);
+ Chain = SDValue(Part.getNode(), 1);
+ }
+ SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
+ return DAG.getMergeValues({Loaded, Chain}, DL);
+ }
+
+ // Custom lowering for extending v4i8 vector loads.
EVT VT = Op->getValueType(0);
assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
@@ -4777,17 +4993,10 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::UDIV:
return LowerDIV(Op, DAG);
case ISD::SMIN:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED,
- /*OverrideNEON=*/true);
case ISD::UMIN:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED,
- /*OverrideNEON=*/true);
case ISD::SMAX:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED,
- /*OverrideNEON=*/true);
case ISD::UMAX:
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED,
- /*OverrideNEON=*/true);
+ return LowerMinMax(Op, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL:
@@ -4835,6 +5044,12 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
/*OverrideNEON=*/true);
case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::ATOMIC_STORE:
+ if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
+ assert(Subtarget->hasLSE2());
+ return LowerStore128(Op, DAG);
+ }
+ return SDValue();
case ISD::STORE:
return LowerSTORE(Op, DAG);
case ISD::MSTORE:
@@ -5025,8 +5240,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
DenseMap<unsigned, SDValue> CopiedRegs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext());
+ CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
// At this point, Ins[].VT may already be promoted to i32. To correctly
// handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
@@ -5186,10 +5400,9 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
break;
}
- ArgValue = DAG.getExtLoad(
- ExtType, DL, VA.getLocVT(), Chain, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
- MemVT);
+ ArgValue =
+ DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
+ MachinePointerInfo::getFixedStack(MF, FI), MemVT);
}
if (VA.getLocInfo() == CCValAssign::Indirect) {
@@ -5229,6 +5442,19 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
ArgValue, DAG.getValueType(MVT::i32));
+
+ // i1 arguments are zero-extended to i8 by the caller. Emit a
+ // hint to reflect this.
+ if (Ins[i].isOrigArg()) {
+ Argument *OrigArg = MF.getFunction().getArg(Ins[i].getOrigArgIndex());
+ if (OrigArg->getType()->isIntegerTy(1)) {
+ if (!Ins[i].Flags.isZExt()) {
+ ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
+ ArgValue.getValueType(), ArgValue);
+ }
+ }
+ }
+
InVals.push_back(ArgValue);
}
}
@@ -5350,13 +5576,11 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
- SDValue Store = DAG.getStore(
- Val.getValue(1), DL, Val, FIN,
- IsWin64
- ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
- GPRIdx,
- (i - FirstVariadicGPR) * 8)
- : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), DL, Val, FIN,
+ IsWin64 ? MachinePointerInfo::getFixedStack(
+ MF, GPRIdx, (i - FirstVariadicGPR) * 8)
+ : MachinePointerInfo::getStack(MF, i * 8));
MemOps.push_back(Store);
FIN =
DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
@@ -5383,9 +5607,8 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
- SDValue Store = DAG.getStore(
- Val.getValue(1), DL, Val, FIN,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
+ SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
+ MachinePointerInfo::getStack(MF, i * 16));
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
DAG.getConstant(16, DL, PtrVT));
@@ -5645,10 +5868,8 @@ SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
ArgChains.push_back(Chain);
// Add a chain value for each stack argument corresponding
- for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
- UE = DAG.getEntryNode().getNode()->use_end();
- U != UE; ++U)
- if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
+ for (SDNode *U : DAG.getEntryNode().getNode()->uses())
+ if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
if (FI->getIndex() < 0) {
int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
@@ -5670,6 +5891,19 @@ bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
}
+// Check if the value is zero-extended from i1 to i8
+static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
+ unsigned SizeInBits = Arg.getValueType().getSizeInBits();
+ if (SizeInBits < 8)
+ return false;
+
+ APInt LowBits(SizeInBits, 0xFF);
+ APInt RequiredZero(SizeInBits, 0xFE);
+ KnownBits Bits = DAG.computeKnownBits(Arg, LowBits, 4);
+ bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
+ return ZExtBool;
+}
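checkZExtBool above boils down to asking whether bits [7:1] of the argument are already known to be zero, in which case the explicit trunc/zext pair in the caller is redundant. A self-contained sketch of that test, with a plain known-zero bitmask standing in for KnownBits (the helper name is illustrative, not LLVM's):

#include <cassert>
#include <cstdint>

// A value is a properly zero-extended i1-in-i8 if bits 1..7 are known zero,
// i.e. the known-zero mask covers 0xFE.
static bool isZExtBool(uint64_t KnownZeroMask) {
  const uint64_t RequiredZero = 0xFE;
  return (KnownZeroMask & RequiredZero) == RequiredZero;
}

int main() {
  // e.g. the result of a setcc has every bit except bit 0 known zero.
  assert(isZExtBool(~UINT64_C(1)));
  // A value with an unknown low byte does not qualify.
  assert(!isZExtBool(0));
  return 0;
}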
+
/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
/// and add input and output parameter nodes.
SDValue
@@ -5730,8 +5964,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext());
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
if (IsVarArg) {
// Handle fixed and variable vector arguments differently.
@@ -5868,8 +6101,22 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
case CCValAssign::AExt:
if (Outs[i].ArgVT == MVT::i1) {
// AAPCS requires i1 to be zero-extended to 8 bits by the caller.
- Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
- Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
+ //
+ // Check if we actually have to do this, because the value may
+ // already be zero-extended.
+ //
+ // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
+ // and rely on DAGCombiner to fold this, because the following
+ // (anyext i32) is combined with (zext i8) in DAG.getNode:
+ //
+ // (ext (zext x)) -> (zext x)
+ //
+ // This will give us (zext i32), which we cannot remove, so
+ // try to check this beforehand.
+ if (!checkZExtBool(Arg, DAG)) {
+ Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
+ }
}
Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
break;
@@ -5902,14 +6149,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
StoreSize *= NumParts;
}
- MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
MFI.setStackID(FI, TargetStackID::ScalableVector);
- MachinePointerInfo MPI =
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+ MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
SDValue Ptr = DAG.getFrameIndex(
FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
SDValue SpillSlot = Ptr;
@@ -6004,8 +6250,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
DstAddr = DAG.getFrameIndex(FI, PtrVT);
- DstInfo =
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+ DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
// Make sure any stack arguments overlapping with where we're storing
// are loaded before this eventual operation. Otherwise they'll be
@@ -6015,8 +6260,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
- DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
- LocMemOffset);
+ DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
}
if (Outs[i].Flags.isByVal()) {
@@ -6196,8 +6440,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
- *DAG.getContext());
+ CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC);
// Copy the result values into the output registers.
@@ -6274,8 +6517,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
}
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
- const MCPhysReg *I =
- TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+ const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
if (I) {
for (; *I; ++I) {
if (AArch64::GPR64RegClass.contains(*I))
@@ -6938,6 +7180,30 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
SDValue In2 = Op.getOperand(1);
EVT SrcVT = In2.getValueType();
+ if (VT.isScalableVector()) {
+ if (VT != SrcVT)
+ return SDValue();
+
+ // copysign(x,y) -> (y & SIGN_MASK) | (x & ~SIGN_MASK)
+ //
+ // A possible alternative sequence involves using FNEG_MERGE_PASSTHRU;
+ // maybe useful for copysign operations with mismatched VTs.
+ //
+ // IntVT here is chosen so it's a legal type with the same element width
+ // as the input.
+ EVT IntVT =
+ getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
+ unsigned NumBits = VT.getScalarSizeInBits();
+ SDValue SignMask = DAG.getConstant(APInt::getSignMask(NumBits), DL, IntVT);
+ SDValue InvSignMask = DAG.getNOT(DL, SignMask, IntVT);
+ SDValue Sign = DAG.getNode(ISD::AND, DL, IntVT, SignMask,
+ getSVESafeBitCast(IntVT, In2, DAG));
+ SDValue Magnitude = DAG.getNode(ISD::AND, DL, IntVT, InvSignMask,
+ getSVESafeBitCast(IntVT, In1, DAG));
+ SDValue IntResult = DAG.getNode(ISD::OR, DL, IntVT, Sign, Magnitude);
+ return getSVESafeBitCast(VT, IntResult, DAG);
+ }
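The scalable-vector path above implements copysign with pure integer masking: the sign bit comes from the second operand and every other bit from the first. The same identity on a scalar double, as a standalone sketch (the helper name is ours):

#include <cassert>
#include <cstdint>
#include <cstring>

// copysign(x, y) == (y & SIGN_MASK) | (x & ~SIGN_MASK), operating on the raw
// bit pattern of an IEEE-754 double.
static double copySignBits(double X, double Y) {
  uint64_t XBits, YBits;
  std::memcpy(&XBits, &X, 8);
  std::memcpy(&YBits, &Y, 8);
  const uint64_t SignMask = UINT64_C(1) << 63;
  uint64_t ResultBits = (YBits & SignMask) | (XBits & ~SignMask);
  double Result;
  std::memcpy(&Result, &ResultBits, 8);
  return Result;
}

int main() {
  assert(copySignBits(3.5, -1.0) == -3.5);
  assert(copySignBits(-2.0, 1.0) == 2.0);
  return 0;
}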
+
if (SrcVT.bitsLT(VT))
In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
else if (SrcVT.bitsGT(VT))
@@ -7083,6 +7349,56 @@ SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
}
+SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ unsigned Opcode = Op.getOpcode();
+ ISD::CondCode CC;
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Wrong instruction");
+ case ISD::SMAX:
+ CC = ISD::SETGT;
+ break;
+ case ISD::SMIN:
+ CC = ISD::SETLT;
+ break;
+ case ISD::UMAX:
+ CC = ISD::SETUGT;
+ break;
+ case ISD::UMIN:
+ CC = ISD::SETULT;
+ break;
+ }
+
+ if (VT.isScalableVector() ||
+ useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) {
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Wrong instruction");
+ case ISD::SMAX:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED,
+ /*OverrideNEON=*/true);
+ case ISD::SMIN:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED,
+ /*OverrideNEON=*/true);
+ case ISD::UMAX:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED,
+ /*OverrideNEON=*/true);
+ case ISD::UMIN:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED,
+ /*OverrideNEON=*/true);
+ }
+ }
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
+ return DAG.getSelect(DL, VT, Cond, Op0, Op1);
+}
+
SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
@@ -7255,8 +7571,8 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
// Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
// into (OR (ASR lhs, N-1), 1), which requires less instructions for the
// supported types.
- if (CC == ISD::SETGT && RHSC && RHSC->isAllOnesValue() && CTVal && CFVal &&
- CTVal->isOne() && CFVal->isAllOnesValue() &&
+ if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
+ CTVal->isOne() && CFVal->isAllOnes() &&
LHS.getValueType() == TVal.getValueType()) {
EVT VT = LHS.getValueType();
SDValue Shift =
@@ -7269,11 +7585,11 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
// If both the TVal and the FVal are constants, see if we can swap them in
// order to for a CSINV or CSINC out of them.
- if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
+ if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, LHS.getValueType());
- } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
+ } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, LHS.getValueType());
@@ -7352,7 +7668,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
// FVal, respectively.
ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
- !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
+ !RHSVal->isZero() && !RHSVal->isAllOnes()) {
AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
// Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
// "a != C ? x : a" to avoid materializing C.
@@ -7425,11 +7741,14 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
SelectionDAG &DAG) const {
-
EVT Ty = Op.getValueType();
auto Idx = Op.getConstantOperandAPInt(2);
- if (Idx.sge(-1) && Idx.slt(Ty.getVectorMinNumElements()))
+
+ // This will select to an EXT instruction, whose immediate is a byte offset of
+ // at most 255, so only splice indices within 2048 bits can be lowered this way.
+ if (Idx.sge(-1) && Idx.slt(2048 / Ty.getVectorElementType().getSizeInBits()))
return Op;
+
return SDValue();
}
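To make that bound concrete: an EXT byte offset of at most 255 means any element starting within the first 2048 bits (256 bytes) is reachable, and dividing by the element width gives the exclusive limit on the splice index. A quick check of that arithmetic (the helper name is illustrative):

#include <cassert>
#include <cstdint>

// Exclusive upper bound on the splice index for a given element width: the
// EXT immediate is a byte offset in [0, 255], so it can address any element
// that starts within the first 2048 bits.
static uint64_t spliceIndexLimit(uint64_t ElementBits) {
  return 2048 / ElementBits;
}

int main() {
  assert(spliceIndexLimit(8) == 256);  // i8 elements: indices 0..255.
  assert(spliceIndexLimit(32) == 64);  // i32/f32 elements: indices 0..63.
  assert(spliceIndexLimit(64) == 32);  // i64/f64 elements: indices 0..31.
  return 0;
}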
@@ -7937,10 +8256,12 @@ static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
SDValue Operand, SelectionDAG &DAG,
int &ExtraSteps) {
EVT VT = Operand.getValueType();
- if (ST->hasNEON() &&
- (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
- VT == MVT::f32 || VT == MVT::v1f32 ||
- VT == MVT::v2f32 || VT == MVT::v4f32)) {
+ if ((ST->hasNEON() &&
+ (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
+ VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
+ VT == MVT::v4f32)) ||
+ (ST->hasSVE() &&
+ (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
// For the reciprocal estimates, convergence is quadratic, so the number
// of digits is doubled after each iteration. In ARMv8, the accuracy of
@@ -8173,6 +8494,8 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
case 'r':
if (VT.isScalableVector())
return std::make_pair(0U, nullptr);
+ if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
+ return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
if (VT.getFixedSizeInBits() == 64)
return std::make_pair(0U, &AArch64::GPR64commonRegClass);
return std::make_pair(0U, &AArch64::GPR32commonRegClass);
@@ -8260,6 +8583,15 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
return Res;
}
+EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
+ llvm::Type *Ty,
+ bool AllowUnknown) const {
+ if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
+ return EVT(MVT::i64x8);
+
+ return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
+}
+
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void AArch64TargetLowering::LowerAsmOperandForConstraint(
@@ -8618,7 +8950,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
Src.WindowBase *= Src.WindowScale;
}
- // Final sanity check before we try to actually produce a shuffle.
+ // Final check before we try to actually produce a shuffle.
LLVM_DEBUG(for (auto Src
: Sources)
assert(Src.ShuffleVec.getValueType() == ShuffleVT););
@@ -9250,8 +9582,11 @@ static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
} else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
// The lane is incremented by the index of the extract.
// Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
- Lane += V.getConstantOperandVal(1);
- V = V.getOperand(0);
+ auto VecVT = V.getOperand(0).getValueType();
+ if (VecVT.isFixedLengthVector() && VecVT.getFixedSizeInBits() <= 128) {
+ Lane += V.getConstantOperandVal(1);
+ V = V.getOperand(0);
+ }
} else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
// The lane is decremented if we are splatting from the 2nd operand.
// Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
@@ -9265,6 +9600,86 @@ static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
}
+// Return true if the mask can be widened: every pair of adjacent mask values
+// must be contiguous and start at an even index (undef entries are allowed),
+// in which case NewMask receives the halved indices.
+static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
+ SmallVectorImpl<int> &NewMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts % 2 != 0)
+ return false;
+
+ NewMask.clear();
+ for (unsigned i = 0; i < NumElts; i += 2) {
+ int M0 = M[i];
+ int M1 = M[i + 1];
+
+ // If both elements are undef, new mask is undef too.
+ if (M0 == -1 && M1 == -1) {
+ NewMask.push_back(-1);
+ continue;
+ }
+
+ if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
+ NewMask.push_back(M1 / 2);
+ continue;
+ }
+
+ if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
+ NewMask.push_back(M0 / 2);
+ continue;
+ }
+
+ NewMask.clear();
+ return false;
+ }
+
+ assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
+ return true;
+}
+
+// Try to widen element type to get a new mask value for a better permutation
+// sequence, so that we can use NEON shuffle instructions, such as ZIP1/2,
+// UZP1/2, TRN1/2, REV, INS, etc.
+// For example:
+// shufflevector <4 x i32> %a, <4 x i32> %b,
+// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+// is equivalent to:
+// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
+// Finally, we can get:
+// mov v0.d[0], v1.d[1]
+static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ EVT ScalarVT = VT.getVectorElementType();
+ unsigned ElementSize = ScalarVT.getFixedSizeInBits();
+ SDValue V0 = Op.getOperand(0);
+ SDValue V1 = Op.getOperand(1);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+ // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
+ // We need to make sure the wider element type is legal. Thus, ElementSize
+ // should not be larger than 32 bits, and the i1 type is also excluded.
+ if (ElementSize > 32 || ElementSize == 1)
+ return SDValue();
+
+ SmallVector<int, 8> NewMask;
+ if (isWideTypeMask(Mask, VT, NewMask)) {
+ MVT NewEltVT = VT.isFloatingPoint()
+ ? MVT::getFloatingPointVT(ElementSize * 2)
+ : MVT::getIntegerVT(ElementSize * 2);
+ MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
+ if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
+ V0 = DAG.getBitcast(NewVT, V0);
+ V1 = DAG.getBitcast(NewVT, V1);
+ return DAG.getBitcast(VT,
+ DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
+ }
+ }
+
+ return SDValue();
+}
+
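For illustration, here is a minimal standalone sketch of the pair-merging rule used by isWideTypeMask/tryWidenMaskForShuffle above, written against plain std::vector<int> rather than LLVM's types; widenMask is a hypothetical helper name, not part of the patch.

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Merge each adjacent pair of lane indices (i, i+1) into one wide-lane index
    // when the pair is contiguous and starts at an even lane; -1 means undef.
    static bool widenMask(const std::vector<int> &M, std::vector<int> &NewMask) {
      if (M.size() % 2 != 0)
        return false;
      NewMask.clear();
      for (size_t I = 0; I < M.size(); I += 2) {
        int M0 = M[I], M1 = M[I + 1];
        if (M0 == -1 && M1 == -1) {        // both lanes undef -> undef wide lane
          NewMask.push_back(-1);
        } else if (M0 == -1 && (M1 % 2) == 1) {
          NewMask.push_back(M1 / 2);       // odd half of a wide lane, low half undef
        } else if (M0 != -1 && (M0 % 2) == 0 && (M1 == M0 + 1 || M1 == -1)) {
          NewMask.push_back(M0 / 2);       // even-aligned contiguous pair
        } else {
          NewMask.clear();
          return false;
        }
      }
      return true;
    }

    int main() {
      // <6, 7, 2, 3> on v4i32 widens to <3, 1> on v2i64, as in the comment above.
      std::vector<int> Mask = {6, 7, 2, 3};
      std::vector<int> Expected = {3, 1};
      std::vector<int> NewMask;
      assert(widenMask(Mask, NewMask) && NewMask == Expected);

      // <0, 2, 1, 3> cannot be widened: the pair (0, 2) is not contiguous.
      std::vector<int> BadMask = {0, 2, 1, 3};
      assert(!widenMask(BadMask, NewMask));
      return 0;
    }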
SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -9412,6 +9827,9 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
DstLaneV);
}
+ if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
+ return NewSD;
+
// If the shuffle is not directly supported and it has 4 elements, use
// the PerfectShuffle-generated table to synthesize it from other shuffles.
unsigned NumElts = VT.getVectorNumElements();
@@ -9454,9 +9872,10 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
// The only legal i1 vectors are SVE vectors, so we can use SVE-specific
// lowering code.
if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) {
+ if (ConstVal->isZero())
+ return SDValue(DAG.getMachineNode(AArch64::PFALSE, dl, VT), 0);
if (ConstVal->isOne())
return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all);
- // TODO: Add special case for constant false
}
// The general case of i1. There isn't any natural way to do this,
// so we use some trickery with whilelo.
@@ -10007,7 +10426,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
unsigned BitSize = VT.getVectorElementType().getSizeInBits();
APInt Val(BitSize,
Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
- if (Val.isNullValue() || Val.isAllOnesValue())
+ if (Val.isZero() || Val.isAllOnes())
return Op;
}
}
@@ -10311,8 +10730,29 @@ SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
isTypeLegal(Op.getValueType()) &&
"Expected legal scalable vector type!");
- if (isTypeLegal(Op.getOperand(0).getValueType()) && Op.getNumOperands() == 2)
- return Op;
+ if (isTypeLegal(Op.getOperand(0).getValueType())) {
+ unsigned NumOperands = Op->getNumOperands();
+ assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
+ "Unexpected number of operands in CONCAT_VECTORS");
+
+ if (NumOperands == 2)
+ return Op;
+
+ // Concat each pair of subvectors and pack into the lower half of the array.
+ SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
+ while (ConcatOps.size() > 1) {
+ for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
+ SDValue V1 = ConcatOps[I];
+ SDValue V2 = ConcatOps[I + 1];
+ EVT SubVT = V1.getValueType();
+ EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
+ ConcatOps[I / 2] =
+ DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
+ }
+ ConcatOps.resize(ConcatOps.size() / 2);
+ }
+ return ConcatOps[0];
+ }
return SDValue();
}
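As a rough illustration of the pairwise reduction loop above (not part of the patch), the same shape can be modelled with strings standing in for SDValues, where "(A+B)" plays the role of CONCAT_VECTORS(A, B):

    #include <cassert>
    #include <cstddef>
    #include <string>
    #include <vector>

    // Repeatedly concatenate adjacent pairs in place, halving the operand list
    // until a single value remains (the operand count is assumed a power of two).
    static std::string concatPairwise(std::vector<std::string> Ops) {
      assert(!Ops.empty() && (Ops.size() & (Ops.size() - 1)) == 0);
      while (Ops.size() > 1) {
        for (size_t I = 0, E = Ops.size(); I != E; I += 2)
          Ops[I / 2] = "(" + Ops[I] + "+" + Ops[I + 1] + ")";
        Ops.resize(Ops.size() / 2);
      }
      return Ops[0];
    }

    int main() {
      // Four subvectors are combined in two rounds, each round concatenating
      // pairs into a type with twice the element count.
      std::vector<std::string> Ops = {"A", "B", "C", "D"};
      assert(concatPairwise(Ops) == "((A+B)+(C+D))");
      return 0;
    }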
@@ -10432,6 +10872,10 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
unsigned Size = Op.getValueSizeInBits();
+ // If we don't have legal types yet, do nothing
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
+ return SDValue();
+
if (InVT.isScalableVector()) {
// This will be matched by custom code during ISelDAGToDAG.
if (Idx == 0 && isPackedVectorType(InVT, DAG))
@@ -10450,6 +10894,18 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
InVT.getSizeInBits() == 128)
return Op;
+ if (useSVEForFixedLengthVectorVT(InVT)) {
+ SDLoc DL(Op);
+
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
+ SDValue NewInVec =
+ convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
+
+ SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
+ NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
+ return convertFromScalableVector(DAG, Op.getValueType(), Splice);
+ }
+
return SDValue();
}
@@ -10465,7 +10921,7 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
SDLoc DL(Op);
EVT VT = Op.getValueType();
- if (!isTypeLegal(VT) || !VT.isInteger())
+ if (!isTypeLegal(VT))
return SDValue();
SDValue Vec0 = Op.getOperand(0);
@@ -10475,9 +10931,19 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
return SDValue();
- // Extend elements of smaller vector...
- EVT WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext()));
- SDValue ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
+ EVT WideVT;
+ SDValue ExtVec;
+
+ if (VT.isFloatingPoint()) {
+ // The InVT type should be legal. We can safely cast the unpacked
+ // subvector from InVT -> VT.
+ WideVT = VT;
+ ExtVec = getSVESafeBitCast(VT, Vec1, DAG);
+ } else {
+ // Extend elements of smaller vector...
+ WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext()));
+ ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
+ }
if (Idx == 0) {
SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
@@ -11085,7 +11551,7 @@ setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
// memVT is `NumVecs * VT`.
Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
EC * NumVecs);
- Info.ptrVal = CI.getArgOperand(CI.getNumArgOperands() - 1);
+ Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
Info.offset = 0;
Info.align.reset();
Info.flags = MachineMemOperand::MOStore;
@@ -11123,7 +11589,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
// Conservatively set memVT to the entire set of vectors loaded.
uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
- Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
+ Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
Info.offset = 0;
Info.align.reset();
// volatile loads with NEON intrinsics not supported
@@ -11142,14 +11608,14 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.opc = ISD::INTRINSIC_VOID;
// Conservatively set memVT to the entire set of vectors stored.
unsigned NumElts = 0;
- for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
- Type *ArgTy = I.getArgOperand(ArgI)->getType();
+ for (const Value *Arg : I.args()) {
+ Type *ArgTy = Arg->getType();
if (!ArgTy->isVectorTy())
break;
NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
}
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
- Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
+ Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
Info.offset = 0;
Info.align.reset();
// volatile stores with NEON intrinsics not supported
@@ -11203,9 +11669,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
Info.align = DL.getABITypeAlign(PtrTy->getElementType());
- Info.flags = MachineMemOperand::MOLoad;
- if (Intrinsic == Intrinsic::aarch64_sve_ldnt1)
- Info.flags |= MachineMemOperand::MONonTemporal;
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
return true;
}
case Intrinsic::aarch64_sve_stnt1: {
@@ -11215,9 +11679,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(2);
Info.offset = 0;
Info.align = DL.getABITypeAlign(PtrTy->getElementType());
- Info.flags = MachineMemOperand::MOStore;
- if (Intrinsic == Intrinsic::aarch64_sve_stnt1)
- Info.flags |= MachineMemOperand::MONonTemporal;
+ Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
return true;
}
default:
@@ -11502,7 +11964,7 @@ bool AArch64TargetLowering::shouldSinkOperands(
// can sink them too.
auto Ext1 = cast<Instruction>(I->getOperand(0));
auto Ext2 = cast<Instruction>(I->getOperand(1));
- if (areExtractShuffleVectors(Ext1, Ext2)) {
+ if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
Ops.push_back(&Ext1->getOperandUse(0));
Ops.push_back(&Ext2->getOperandUse(0));
}
@@ -11568,10 +12030,10 @@ bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
-unsigned
-AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
- const DataLayout &DL) const {
- return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
+unsigned AArch64TargetLowering::getNumInterleavedAccesses(
+ VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
+ unsigned VecSize = UseScalable ? Subtarget->getMinSVEVectorSizeInBits() : 128;
+ return std::max<unsigned>(1, (DL.getTypeSizeInBits(VecTy) + 127) / VecSize);
}
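For reference, a small standalone model of the computation above; the 128 constant is the NEON access width, and the scalable path substitutes the subtarget's minimum SVE vector width. numInterleavedAccesses is a hypothetical name used only here.

    #include <cassert>
    #include <cstdint>

    // Mirrors the expression above: (VecBits + 127) / AccessBits, clamped to at
    // least one access.
    static unsigned numInterleavedAccesses(uint64_t VecBits, unsigned AccessBits) {
      unsigned N = (VecBits + 127) / AccessBits;
      return N > 1 ? N : 1;
    }

    int main() {
      assert(numInterleavedAccesses(256, 128) == 2); // e.g. v8i32 -> two NEON accesses
      assert(numInterleavedAccesses(64, 128) == 1);  // e.g. v2i32 -> one access
      assert(numInterleavedAccesses(512, 256) == 2); // 256-bit SVE -> two accesses
      return 0;
    }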
MachineMemOperand::Flags
@@ -11583,24 +12045,63 @@ AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
}
bool AArch64TargetLowering::isLegalInterleavedAccessType(
- VectorType *VecTy, const DataLayout &DL) const {
+ VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
unsigned VecSize = DL.getTypeSizeInBits(VecTy);
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+ unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
+
+ UseScalable = false;
// Ensure the number of vector elements is greater than 1.
- if (cast<FixedVectorType>(VecTy)->getNumElements() < 2)
+ if (NumElements < 2)
return false;
// Ensure the element type is legal.
if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
return false;
+ if (Subtarget->useSVEForFixedLengthVectors() &&
+ (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
+ (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
+ isPowerOf2_32(NumElements) && VecSize > 128))) {
+ UseScalable = true;
+ return true;
+ }
+
// Ensure the total vector size is 64 or a multiple of 128. Types larger than
// 128 will be split into multiple interleaved accesses.
return VecSize == 64 || VecSize % 128 == 0;
}
+static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
+ if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
+ return ScalableVectorType::get(VTy->getElementType(), 2);
+
+ if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
+ return ScalableVectorType::get(VTy->getElementType(), 4);
+
+ if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
+ return ScalableVectorType::get(VTy->getElementType(), 8);
+
+ if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
+ return ScalableVectorType::get(VTy->getElementType(), 8);
+
+ if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
+ return ScalableVectorType::get(VTy->getElementType(), 2);
+
+ if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
+ return ScalableVectorType::get(VTy->getElementType(), 4);
+
+ if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
+ return ScalableVectorType::get(VTy->getElementType(), 8);
+
+ if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
+ return ScalableVectorType::get(VTy->getElementType(), 16);
+
+ llvm_unreachable("Cannot handle input vector type");
+}
+
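The mapping above always picks the narrowest scalable container, i.e. a vector whose known-minimum size is 128 bits; a tiny standalone check of that relationship (illustrative helper name only):

    #include <cassert>

    // Lane count of the minimum SVE container for a fixed element type:
    // 128 / element-bits (f64/i64 -> 2, f32/i32 -> 4, f16/bf16/i16 -> 8, i8 -> 16).
    static unsigned containerLaneCount(unsigned ElementBits) {
      assert(ElementBits != 0 && 128 % ElementBits == 0);
      return 128 / ElementBits;
    }

    int main() {
      assert(containerLaneCount(64) == 2);
      assert(containerLaneCount(32) == 4);
      assert(containerLaneCount(16) == 8);
      assert(containerLaneCount(8) == 16);
      return 0;
    }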
/// Lower an interleaved load into a ldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
@@ -11628,10 +12129,12 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
// Skip if we do not have NEON and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector types are divisible by 128.
- if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL))
+ bool UseScalable;
+ if (!Subtarget->hasNEON() ||
+ !isLegalInterleavedAccessType(VTy, DL, UseScalable))
return false;
- unsigned NumLoads = getNumInterleavedAccesses(VTy, DL);
+ unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
auto *FVTy = cast<FixedVectorType>(VTy);
@@ -11642,48 +12145,84 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
FVTy =
FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
+ // If we're going to generate more than one load, reset the sub-vector type
+ // to something legal.
+ FVTy = FixedVectorType::get(FVTy->getElementType(),
+ FVTy->getNumElements() / NumLoads);
+
+ auto *LDVTy =
+ UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
+
IRBuilder<> Builder(LI);
// The base address of the load.
Value *BaseAddr = LI->getPointerOperand();
if (NumLoads > 1) {
- // If we're going to generate more than one load, reset the sub-vector type
- // to something legal.
- FVTy = FixedVectorType::get(FVTy->getElementType(),
- FVTy->getNumElements() / NumLoads);
-
// We will compute the pointer operand of each load from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
BaseAddr = Builder.CreateBitCast(
BaseAddr,
- FVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
- }
-
- Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace());
- Type *Tys[2] = {FVTy, PtrTy};
- static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
- Intrinsic::aarch64_neon_ld3,
- Intrinsic::aarch64_neon_ld4};
- Function *LdNFunc =
- Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
+ LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
+ }
+
+ Type *PtrTy =
+ UseScalable
+ ? LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())
+ : LDVTy->getPointerTo(LI->getPointerAddressSpace());
+ Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
+ LDVTy->getElementCount());
+
+ static const Intrinsic::ID SVELoadIntrs[3] = {
+ Intrinsic::aarch64_sve_ld2_sret, Intrinsic::aarch64_sve_ld3_sret,
+ Intrinsic::aarch64_sve_ld4_sret};
+ static const Intrinsic::ID NEONLoadIntrs[3] = {Intrinsic::aarch64_neon_ld2,
+ Intrinsic::aarch64_neon_ld3,
+ Intrinsic::aarch64_neon_ld4};
+ Function *LdNFunc;
+ if (UseScalable)
+ LdNFunc = Intrinsic::getDeclaration(LI->getModule(),
+ SVELoadIntrs[Factor - 2], {LDVTy});
+ else
+ LdNFunc = Intrinsic::getDeclaration(
+ LI->getModule(), NEONLoadIntrs[Factor - 2], {LDVTy, PtrTy});
// Holds sub-vectors extracted from the load intrinsic return values. The
// sub-vectors are associated with the shufflevector instructions they will
// replace.
DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
+ Value *PTrue = nullptr;
+ if (UseScalable) {
+ unsigned PgPattern =
+ getSVEPredPatternFromNumElements(FVTy->getNumElements());
+ if (Subtarget->getMinSVEVectorSizeInBits() ==
+ Subtarget->getMaxSVEVectorSizeInBits() &&
+ Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
+ PgPattern = AArch64SVEPredPattern::all;
+
+ auto *PTruePat =
+ ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), PgPattern);
+ PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
+ {PTruePat});
+ }
+
for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
// If we're generating more than one load, compute the base address of
// subsequent loads as an offset from the previous.
if (LoadCount > 0)
- BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr,
+ BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
FVTy->getNumElements() * Factor);
- CallInst *LdN = Builder.CreateCall(
- LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
+ CallInst *LdN;
+ if (UseScalable)
+ LdN = Builder.CreateCall(
+ LdNFunc, {PTrue, Builder.CreateBitCast(BaseAddr, PtrTy)}, "ldN");
+ else
+ LdN = Builder.CreateCall(LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy),
+ "ldN");
// Extract and store the sub-vectors returned by the load intrinsic.
for (unsigned i = 0; i < Shuffles.size(); i++) {
@@ -11692,11 +12231,17 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
Value *SubVec = Builder.CreateExtractValue(LdN, Index);
+ if (UseScalable)
+ SubVec = Builder.CreateExtractVector(
+ FVTy, SubVec,
+ ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
+
// Convert the integer vector to pointer vector if the element is pointer.
if (EltTy->isPointerTy())
SubVec = Builder.CreateIntToPtr(
SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
FVTy->getNumElements()));
+
SubVecs[SVI].push_back(SubVec);
}
}
@@ -11755,14 +12300,16 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
const DataLayout &DL = SI->getModule()->getDataLayout();
+ bool UseScalable;
// Skip if we do not have NEON and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector types are divisible by 128.
- if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
+ if (!Subtarget->hasNEON() ||
+ !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
return false;
- unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
+ unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
Value *Op0 = SVI->getOperand(0);
Value *Op1 = SVI->getOperand(1);
@@ -11783,15 +12330,18 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
SubVecTy = FixedVectorType::get(IntTy, LaneLen);
}
+ // If we're going to generate more than one store, reset the lane length
+ // and sub-vector type to something legal.
+ LaneLen /= NumStores;
+ SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
+
+ auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
+ : SubVecTy;
+
// The base address of the store.
Value *BaseAddr = SI->getPointerOperand();
if (NumStores > 1) {
- // If we're going to generate more than one store, reset the lane length
- // and sub-vector type to something legal.
- LaneLen /= NumStores;
- SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
-
// We will compute the pointer operand of each store from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
@@ -11802,13 +12352,42 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
auto Mask = SVI->getShuffleMask();
- Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
- Type *Tys[2] = {SubVecTy, PtrTy};
- static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
- Intrinsic::aarch64_neon_st3,
- Intrinsic::aarch64_neon_st4};
- Function *StNFunc =
- Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
+ Type *PtrTy =
+ UseScalable
+ ? STVTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())
+ : STVTy->getPointerTo(SI->getPointerAddressSpace());
+ Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
+ STVTy->getElementCount());
+
+ static const Intrinsic::ID SVEStoreIntrs[3] = {Intrinsic::aarch64_sve_st2,
+ Intrinsic::aarch64_sve_st3,
+ Intrinsic::aarch64_sve_st4};
+ static const Intrinsic::ID NEONStoreIntrs[3] = {Intrinsic::aarch64_neon_st2,
+ Intrinsic::aarch64_neon_st3,
+ Intrinsic::aarch64_neon_st4};
+ Function *StNFunc;
+ if (UseScalable)
+ StNFunc = Intrinsic::getDeclaration(SI->getModule(),
+ SVEStoreIntrs[Factor - 2], {STVTy});
+ else
+ StNFunc = Intrinsic::getDeclaration(
+ SI->getModule(), NEONStoreIntrs[Factor - 2], {STVTy, PtrTy});
+
+ Value *PTrue = nullptr;
+ if (UseScalable) {
+ unsigned PgPattern =
+ getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
+ if (Subtarget->getMinSVEVectorSizeInBits() ==
+ Subtarget->getMaxSVEVectorSizeInBits() &&
+ Subtarget->getMinSVEVectorSizeInBits() ==
+ DL.getTypeSizeInBits(SubVecTy))
+ PgPattern = AArch64SVEPredPattern::all;
+
+ auto *PTruePat =
+ ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), PgPattern);
+ PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
+ {PTruePat});
+ }
for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
@@ -11816,10 +12395,11 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
// Split the shufflevector operands into sub vectors for the new stN call.
for (unsigned i = 0; i < Factor; i++) {
+ Value *Shuffle;
unsigned IdxI = StoreCount * LaneLen * Factor + i;
if (Mask[IdxI] >= 0) {
- Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
+ Shuffle = Builder.CreateShuffleVector(
+ Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
} else {
unsigned StartMask = 0;
for (unsigned j = 1; j < LaneLen; j++) {
@@ -11834,11 +12414,21 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
// In the case of all undefs we're defaulting to using elems from 0
// Note: StartMask cannot be negative, it's checked in
// isReInterleaveMask
- Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
+ Shuffle = Builder.CreateShuffleVector(
+ Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
}
+
+ if (UseScalable)
+ Shuffle = Builder.CreateInsertVector(
+ STVTy, UndefValue::get(STVTy), Shuffle,
+ ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
+
+ Ops.push_back(Shuffle);
}
+ if (UseScalable)
+ Ops.push_back(PTrue);
+
// If we're generating more than one store, we compute the base address of
// subsequent stores as an offset from the previous.
if (StoreCount > 0)
@@ -11905,8 +12495,7 @@ SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
EVT AArch64TargetLowering::getOptimalMemOpType(
const MemOp &Op, const AttributeList &FuncAttributes) const {
- bool CanImplicitFloat =
- !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
+ bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
// Only use AdvSIMD to implement memset of 32-byte and above. It would have
@@ -11923,8 +12512,8 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
};
if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
- AlignmentIsAcceptable(MVT::v2i64, Align(16)))
- return MVT::v2i64;
+ AlignmentIsAcceptable(MVT::v16i8, Align(16)))
+ return MVT::v16i8;
if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
return MVT::f128;
if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
@@ -11936,8 +12525,7 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
LLT AArch64TargetLowering::getOptimalMemOpLLT(
const MemOp &Op, const AttributeList &FuncAttributes) const {
- bool CanImplicitFloat =
- !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
+ bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
// Only use AdvSIMD to implement memset of 32-byte and above. It would have
@@ -11981,6 +12569,33 @@ bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
return IsLegal;
}
+// Return false to prevent folding
+// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
+// if the folding leads to worse code.
+bool AArch64TargetLowering::isMulAddWithConstProfitable(
+ const SDValue &AddNode, const SDValue &ConstNode) const {
+ // Let the DAGCombiner decide for vector types and large types.
+ const EVT VT = AddNode.getValueType();
+ if (VT.isVector() || VT.getScalarSizeInBits() > 64)
+ return true;
+
+ // The fold is worse if c1 is a legal add immediate while c1*c2 is not and
+ // has to be materialized with at least two instructions.
+ const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
+ const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
+ const int64_t C1 = C1Node->getSExtValue();
+ const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
+ if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
+ return true;
+ SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+ AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), VT.getSizeInBits(), Insn);
+ if (Insn.size() > 1)
+ return false;
+
+ // Default to true and let the DAGCombiner decide.
+ return true;
+}
+
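A rough standalone sketch of the profitability check above, under the simplifying assumptions that ADD/SUB immediates are 12 bits (optionally shifted left by 12) and that constants are materialized with one MOVZ plus one MOVK per extra non-zero 16-bit chunk (ORR bitmask immediates are ignored); both helper names are hypothetical:

    #include <cassert>
    #include <cstdint>

    // Count MOVZ/MOVK instructions needed for Imm: one per non-zero 16-bit chunk
    // (at least one instruction for zero).
    static unsigned movImmInsnCount(uint64_t Imm) {
      unsigned Count = 0;
      for (unsigned Shift = 0; Shift < 64; Shift += 16)
        Count += ((Imm >> Shift) & 0xffff) != 0;
      return Count ? Count : 1;
    }

    // ADD/SUB accept a 12-bit unsigned immediate, optionally shifted left by 12.
    static bool isLegalAddImm(int64_t Imm) {
      uint64_t U = Imm < 0 ? -static_cast<uint64_t>(Imm) : Imm;
      return (U >> 12) == 0 || ((U & 0xfff) == 0 && (U >> 24) == 0);
    }

    int main() {
      // (mul (add x, 1), 0x12345678): 1 is a legal ADD immediate, but the product
      // 0x12345678 is not and needs MOVZ+MOVK, so the fold would be rejected.
      assert(isLegalAddImm(1));
      assert(!isLegalAddImm(0x12345678) && movImmInsnCount(0x12345678) == 2);
      // (mul (add x, 3), 5): the product 15 is itself a legal ADD immediate, so
      // the decision is left to the generic DAGCombiner.
      assert(isLegalAddImm(15));
      return 0;
    }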
// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
// immediates is the same as for an add or a sub.
bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
@@ -12100,7 +12715,8 @@ bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
bool AArch64TargetLowering::generateFMAsInMachineCombiner(
EVT VT, CodeGenOpt::Level OptLevel) const {
- return (OptLevel >= CodeGenOpt::Aggressive) && !VT.isScalableVector();
+ return (OptLevel >= CodeGenOpt::Aggressive) && !VT.isScalableVector() &&
+ !useSVEForFixedLengthVectorVT(VT);
}
const MCPhysReg *
@@ -12348,7 +12964,7 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
// fold (sdiv X, pow2)
EVT VT = N->getValueType(0);
if ((VT != MVT::i32 && VT != MVT::i64) ||
- !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
+ !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
return SDValue();
SDLoc DL(N);
@@ -12505,7 +13121,7 @@ static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle,
DAG.getAnyExtOrTrunc(Extend.getOperand(0), DL, PreExtendType),
DAG.getConstant(0, DL, MVT::i64));
- std::vector<int> ShuffleMask(TargetType.getVectorElementCount().getValue());
+ std::vector<int> ShuffleMask(TargetType.getVectorNumElements());
SDValue VectorShuffleNode =
DAG.getVectorShuffle(PreExtendVT, DL, InsertVectorNode,
@@ -12547,12 +13163,44 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y so that the add+mul
+ // can later be combined into madd by the MachineCombiner pass.
+ // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue MulOper;
+ unsigned AddSubOpc;
+
+ auto IsAddSubWith1 = [&](SDValue V) -> bool {
+ AddSubOpc = V->getOpcode();
+ if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
+ SDValue Opnd = V->getOperand(1);
+ MulOper = V->getOperand(0);
+ if (AddSubOpc == ISD::SUB)
+ std::swap(Opnd, MulOper);
+ if (auto C = dyn_cast<ConstantSDNode>(Opnd))
+ return C->isOne();
+ }
+ return false;
+ };
+
+ if (IsAddSubWith1(N0)) {
+ SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
+ return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
+ }
+
+ if (IsAddSubWith1(N1)) {
+ SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
+ return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
+ }
+
// The below optimizations require a constant RHS.
- if (!isa<ConstantSDNode>(N->getOperand(1)))
+ if (!isa<ConstantSDNode>(N1))
return SDValue();
- SDValue N0 = N->getOperand(0);
- ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
+ ConstantSDNode *C = cast<ConstantSDNode>(N1);
const APInt &ConstValue = C->getAPIntValue();
// Allow the scaling to be folded into the `cnt` instruction by preventing
@@ -12593,7 +13241,7 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
// and shift+add+shift.
APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
- unsigned ShiftAmt, AddSubOpc;
+ unsigned ShiftAmt;
// Is the shifted value the LHS operand of the add/sub?
bool ShiftValUseIsN0 = true;
// Do we need to negate the result?
@@ -12630,8 +13278,6 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
DAG.getConstant(ShiftAmt, DL, MVT::i64));
@@ -12757,7 +13403,8 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
uint32_t FloatBits = FloatTy.getSizeInBits();
- if (FloatBits != 32 && FloatBits != 64)
+ if (FloatBits != 32 && FloatBits != 64 &&
+ (FloatBits != 16 || !Subtarget->hasFullFP16()))
return SDValue();
MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
@@ -12776,27 +13423,20 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
if (C == -1 || C == 0 || C > Bits)
return SDValue();
- MVT ResTy;
- unsigned NumLanes = Op.getValueType().getVectorNumElements();
- switch (NumLanes) {
- default:
- return SDValue();
- case 2:
- ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
- break;
- case 4:
- ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
- break;
- }
-
- if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
+ EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
return SDValue();
- assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
- "Illegal vector type after legalization");
+ if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
+ N->getOpcode() == ISD::FP_TO_UINT_SAT) {
+ EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+ if (SatVT.getScalarSizeInBits() != IntBits)
+ return SDValue();
+ }
SDLoc DL(N);
- bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+ bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
+ N->getOpcode() == ISD::FP_TO_SINT_SAT);
unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
: Intrinsic::aarch64_neon_vcvtfp2fxu;
SDValue FixConv =
@@ -13097,6 +13737,9 @@ static SDValue performSVEAndCombine(SDNode *N,
SDLoc DL(N);
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
+ if (!C)
+ return SDValue();
+
uint64_t ExtVal = C->getZExtValue();
// If the mask is fully covered by the unpack, we don't need to push
@@ -13289,7 +13932,7 @@ performVectorTruncateCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
unsigned ElemSizeInBits = VT.getScalarSizeInBits();
APInt CAsAPInt(ElemSizeInBits, C);
- if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits))
+ if (CAsAPInt != APInt::getAllOnes(ElemSizeInBits))
return SDValue();
ExtendOpA = Xor.getOperand(0);
@@ -13475,7 +14118,7 @@ static SDValue performConcatVectorsCombine(SDNode *N,
// If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
// splat. The indexed instructions are going to be expecting a DUPLANE64, so
// canonicalise to that.
- if (N0 == N1 && VT.getVectorNumElements() == 2) {
+ if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
assert(VT.getScalarSizeInBits() == 64);
return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
DAG.getConstant(0, dl, MVT::i64));
@@ -13490,7 +14133,7 @@ static SDValue performConcatVectorsCombine(SDNode *N,
// becomes
// (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
- if (N1Opc != ISD::BITCAST)
+ if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
return SDValue();
SDValue RHS = N1->getOperand(0);
MVT RHSTy = RHS.getValueType().getSimpleVT();
@@ -13509,6 +14152,48 @@ static SDValue performConcatVectorsCombine(SDNode *N,
RHS));
}
+static SDValue
+performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ SDValue Vec = N->getOperand(0);
+ SDValue SubVec = N->getOperand(1);
+ uint64_t IdxVal = N->getConstantOperandVal(2);
+ EVT VecVT = Vec.getValueType();
+ EVT SubVT = SubVec.getValueType();
+
+ // Only do this for legal fixed vector types.
+ if (!VecVT.isFixedLengthVector() ||
+ !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
+ !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
+ return SDValue();
+
+ // Ignore widening patterns.
+ if (IdxVal == 0 && Vec.isUndef())
+ return SDValue();
+
+ // Subvector must be half the width and an "aligned" insertion.
+ unsigned NumSubElts = SubVT.getVectorNumElements();
+ if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
+ (IdxVal != 0 && IdxVal != NumSubElts))
+ return SDValue();
+
+ // Fold insert_subvector -> concat_vectors
+ // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
+ // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
+ SDLoc DL(N);
+ SDValue Lo, Hi;
+ if (IdxVal == 0) {
+ Lo = SubVec;
+ Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
+ DAG.getVectorIdxConstant(NumSubElts, DL));
+ } else {
+ Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
+ DAG.getVectorIdxConstant(0, DL));
+ Hi = SubVec;
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
+}
+
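A small standalone model of the fold above on plain arrays (illustration only; insertSubvector is a hypothetical helper): inserting a half-width subvector at index 0 or at the midpoint yields the same result as concatenating it with the untouched half of the original vector.

    #include <array>
    #include <cassert>
    #include <cstddef>

    // Insert an N-element subvector into a 2N-element vector at element Idx.
    template <size_t N>
    static std::array<int, 2 * N> insertSubvector(std::array<int, 2 * N> Vec,
                                                  const std::array<int, N> &Sub,
                                                  size_t Idx) {
      for (size_t I = 0; I != N; ++I)
        Vec[Idx + I] = Sub[I];
      return Vec;
    }

    int main() {
      std::array<int, 4> Vec = {0, 1, 2, 3};
      std::array<int, 2> Sub = {8, 9};

      // Insert at the low half == concat(Sub, hi half of Vec).
      std::array<int, 4> Lo = {8, 9, 2, 3};
      assert(insertSubvector<2>(Vec, Sub, 0) == Lo);

      // Insert at the high half == concat(lo half of Vec, Sub).
      std::array<int, 4> Hi = {0, 1, 8, 9};
      assert(insertSubvector<2>(Vec, Sub, 2) == Hi);
      return 0;
    }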
static SDValue tryCombineFixedPointConvert(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -13611,6 +14296,8 @@ static bool isEssentiallyExtractHighSubvector(SDValue N) {
N = N.getOperand(0);
if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
return false;
+ if (N.getOperand(0).getValueType().isScalableVector())
+ return false;
return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
N.getOperand(0).getValueType().getVectorNumElements() / 2;
}
@@ -13687,7 +14374,7 @@ static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
SetCCInfo.Info.AArch64.CC =
AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
}
- return TValue->isOne() && FValue->isNullValue();
+ return TValue->isOne() && FValue->isZero();
}
// Returns true if Op is setcc or zext of setcc.
@@ -13765,7 +14452,7 @@ static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
- if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isNullValue())
+ if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
return SDValue();
SDValue Op1 = LHS->getOperand(0);
@@ -14237,20 +14924,20 @@ static bool isAllActivePredicate(SDValue N) {
// or unpredicated operation, which potentially allows better isel (perhaps
// using immediate forms) or relaxing register reuse requirements.
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
- SelectionDAG &DAG,
- bool UnpredOp = false) {
+ SelectionDAG &DAG, bool UnpredOp = false,
+ bool SwapOperands = false) {
assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
SDValue Pg = N->getOperand(1);
+ SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
+ SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
// ISD way to specify an all active predicate.
if (isAllActivePredicate(Pg)) {
if (UnpredOp)
- return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), N->getOperand(2),
- N->getOperand(3));
- else
- return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg,
- N->getOperand(2), N->getOperand(3));
+ return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
+
+ return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
}
// FUTURE: SplatVector(true)
@@ -14372,6 +15059,8 @@ static SDValue performIntrinsicCombine(SDNode *N,
return convertMergedOpToPredOp(N, ISD::ADD, DAG, true);
case Intrinsic::aarch64_sve_sub:
return convertMergedOpToPredOp(N, ISD::SUB, DAG, true);
+ case Intrinsic::aarch64_sve_subr:
+ return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
case Intrinsic::aarch64_sve_and:
return convertMergedOpToPredOp(N, ISD::AND, DAG, true);
case Intrinsic::aarch64_sve_bic:
@@ -14927,6 +15616,18 @@ static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG) {
+ assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
+ N->getOpcode() == AArch64ISD::UUNPKLO) &&
+ "Unexpected Opcode!");
+
+ // uunpklo/hi undef -> undef
+ if (N->getOperand(0).isUndef())
+ return DAG.getUNDEF(N->getValueType(0));
+
+ return SDValue();
+}
+
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Op0 = N->getOperand(0);
@@ -15169,11 +15870,10 @@ static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
ExtOpCode != ISD::ANY_EXTEND)
return SDValue();
SDValue Orig = Ext->getOperand(0);
- if (Store->getMemoryVT() != Orig->getValueType(0))
+ if (Store->getMemoryVT() != Orig.getValueType())
return SDValue();
return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
- Store->getBasePtr(), Store->getPointerInfo(),
- Store->getAlign());
+ Store->getBasePtr(), Store->getMemOperand());
}
return SDValue();
@@ -15844,7 +16544,7 @@ static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
MVT::v2i32, MVT::v4i32, MVT::v2i64}),
VT.getSimpleVT().SimpleTy) &&
ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
- SplatLHSVal.isOneValue() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
+ SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
unsigned NumElts = VT.getVectorNumElements();
SmallVector<SDValue, 8> Ops(
@@ -16544,6 +17244,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performIntToFpCombine(N, DAG, Subtarget);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
return performFpToIntCombine(N, DAG, DCI, Subtarget);
case ISD::FDIV:
return performFDivCombine(N, DAG, DCI, Subtarget);
@@ -16565,6 +17267,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performVectorTruncateCombine(N, DCI, DAG);
case ISD::CONCAT_VECTORS:
return performConcatVectorsCombine(N, DCI, DAG);
+ case ISD::INSERT_SUBVECTOR:
+ return performInsertSubvectorCombine(N, DCI, DAG);
case ISD::SELECT:
return performSelectCombine(N, DCI);
case ISD::VSELECT:
@@ -16592,6 +17296,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performNVCASTCombine(N);
case AArch64ISD::SPLICE:
return performSpliceCombine(N, DAG);
+ case AArch64ISD::UUNPKLO:
+ case AArch64ISD::UUNPKHI:
+ return performUnpackCombine(N, DAG);
case AArch64ISD::UZP1:
return performUzpCombine(N, DAG);
case AArch64ISD::SETCC_MERGE_ZERO:
@@ -17212,18 +17919,22 @@ void AArch64TargetLowering::ReplaceNodeResults(
return;
case ISD::FP_TO_UINT:
case ISD::FP_TO_SINT:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
// Let normal code take care of it by not adding anything to Results.
return;
case ISD::ATOMIC_CMP_SWAP:
ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
return;
+ case ISD::ATOMIC_LOAD:
case ISD::LOAD: {
assert(SDValue(N, 0).getValueType() == MVT::i128 &&
"unexpected load's value type");
- LoadSDNode *LoadNode = cast<LoadSDNode>(N);
- if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) {
- // Non-volatile loads are optimized later in AArch64's load/store
+ MemSDNode *LoadNode = cast<MemSDNode>(N);
+ if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
+ LoadNode->getMemoryVT() != MVT::i128) {
+ // Non-volatile, non-atomic loads are optimized later in AArch64's load/store
// optimizer.
return;
}
@@ -17314,12 +18025,37 @@ AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
+// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
+// provided the address is 16-byte aligned.
+bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
+ if (!Subtarget->hasLSE2())
+ return false;
+
+ if (auto LI = dyn_cast<LoadInst>(I))
+ return LI->getType()->getPrimitiveSizeInBits() == 128 &&
+ LI->getAlignment() >= 16;
+
+ if (auto SI = dyn_cast<StoreInst>(I))
+ return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
+ SI->getAlignment() >= 16;
+
+ return false;
+}
+
+bool AArch64TargetLowering::shouldInsertFencesForAtomic(
+ const Instruction *I) const {
+ return isOpSuitableForLDPSTP(I);
+}
+
// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
- return Size == 128;
+ if (Size != 128)
+ return false;
+
+ return !isOpSuitableForLDPSTP(SI);
}
// Loads and stores less than 128-bits are already atomic; ones above that
@@ -17328,7 +18064,19 @@ bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
unsigned Size = LI->getType()->getPrimitiveSizeInBits();
- return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
+
+ if (Size != 128 || isOpSuitableForLDPSTP(LI))
+ return AtomicExpansionKind::None;
+
+ // At -O0, fast-regalloc cannot cope with the live vregs necessary to
+ // implement atomicrmw without spilling. If the target address is also on the
+ // stack and close enough to the spill slot, this can lead to a situation
+ // where the monitor always gets cleared and the atomic operation can never
+ // succeed. So at -O0 lower this operation to a CAS loop.
+ if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+ return AtomicExpansionKind::CmpXChg;
+
+ return AtomicExpansionKind::LLSC;
}
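To summarize the 128-bit atomic-load policy above as a standalone sketch (names are illustrative, not LLVM's): the load stays as-is and becomes LDP when LSE2 guarantees single-copy atomicity for a 16-byte-aligned address, a CAS loop is used at -O0 to keep register pressure manageable, and LL/SC is used otherwise.

    #include <cassert>

    enum class ExpansionKind { None, LLSC, CmpXChg };

    // Decision for a 128-bit atomic load, mirroring the code above.
    static ExpansionKind expandAtomicLoad128(bool HasLSE2, unsigned AlignBytes,
                                             bool OptNone) {
      if (HasLSE2 && AlignBytes >= 16)
        return ExpansionKind::None;     // keep the load, select to LDP
      if (OptNone)
        return ExpansionKind::CmpXChg;  // CAS loop at -O0
      return ExpansionKind::LLSC;       // LDXP/STXP loop otherwise
    }

    int main() {
      assert(expandAtomicLoad128(true, 16, false) == ExpansionKind::None);
      assert(expandAtomicLoad128(false, 16, true) == ExpansionKind::CmpXChg);
      assert(expandAtomicLoad128(false, 16, false) == ExpansionKind::LLSC);
      return 0;
    }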
// For the real atomic operations, we have ldxr/stxr up to 128 bits,
@@ -17531,7 +18279,7 @@ void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
Type::getInt8PtrTy(M.getContext()));
if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
F->setCallingConv(CallingConv::Win64);
- F->addAttribute(1, Attribute::AttrKind::InReg);
+ F->addParamAttr(0, Attribute::AttrKind::InReg);
}
return;
}
@@ -17657,7 +18405,7 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
- bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
+ bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
return OptSize && !VT.isVector();
}
@@ -17759,42 +18507,20 @@ static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
"Expected legal fixed length vector!");
- int PgPattern;
- switch (VT.getVectorNumElements()) {
- default:
- llvm_unreachable("unexpected element count for SVE predicate");
- case 1:
- PgPattern = AArch64SVEPredPattern::vl1;
- break;
- case 2:
- PgPattern = AArch64SVEPredPattern::vl2;
- break;
- case 4:
- PgPattern = AArch64SVEPredPattern::vl4;
- break;
- case 8:
- PgPattern = AArch64SVEPredPattern::vl8;
- break;
- case 16:
- PgPattern = AArch64SVEPredPattern::vl16;
- break;
- case 32:
- PgPattern = AArch64SVEPredPattern::vl32;
- break;
- case 64:
- PgPattern = AArch64SVEPredPattern::vl64;
- break;
- case 128:
- PgPattern = AArch64SVEPredPattern::vl128;
- break;
- case 256:
- PgPattern = AArch64SVEPredPattern::vl256;
- break;
- }
+ unsigned PgPattern =
+ getSVEPredPatternFromNumElements(VT.getVectorNumElements());
+ assert(PgPattern && "Unexpected element count for SVE predicate");
- // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can
- // use AArch64SVEPredPattern::all, which can enable the use of unpredicated
+ // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
+ // AArch64SVEPredPattern::all, which can enable the use of unpredicated
// variants of instructions when available.
+ const auto &Subtarget =
+ static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
+ unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
+ unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
+ if (MaxSVESize && MinSVESize == MaxSVESize &&
+ MaxSVESize == VT.getSizeInBits())
+ PgPattern = AArch64SVEPredPattern::all;
MVT MaskVT;
switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
@@ -17817,8 +18543,7 @@ static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
break;
}
- return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
- DAG.getTargetConstant(PgPattern, DL, MVT::i64));
+ return getPTrue(DAG, DL, MaskVT, PgPattern);
}
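The new predicate-selection logic can be summarized by a small standalone check (illustrative names only): a vlN pattern is chosen from the element count, but when the SVE register size is known exactly and the fixed-length vector fills it, PTRUE all is used so unpredicated instruction forms become available.

    #include <cassert>

    // True when the register size is known exactly (min == max) and the
    // fixed-length vector occupies the whole register.
    static bool canUseAllPattern(unsigned MinSVEBits, unsigned MaxSVEBits,
                                 unsigned FixedVTBits) {
      return MaxSVEBits != 0 && MinSVEBits == MaxSVEBits &&
             MaxSVEBits == FixedVTBits;
    }

    int main() {
      assert(canUseAllPattern(256, 256, 256));   // e.g. -msve-vector-bits=256 with a 256-bit VT
      assert(!canUseAllPattern(128, 2048, 256)); // register size not known exactly
      assert(!canUseAllPattern(256, 256, 128));  // vector fills only half the register
      return 0;
    }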
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
@@ -17898,9 +18623,6 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
SDValue Op, SelectionDAG &DAG) const {
auto Load = cast<MaskedLoadSDNode>(Op);
- if (Load->getExtensionType() != ISD::LoadExtType::NON_EXTLOAD)
- return SDValue();
-
SDLoc DL(Op);
EVT VT = Op.getValueType();
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 386e1c2d8400..392e22b68366 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -330,6 +330,10 @@ enum NodeType : unsigned {
// Cast between vectors of the same element type but differ in length.
REINTERPRET_CAST,
+ // Nodes to build an LD64B / ST64B 64-bit quantity out of i64, and vice versa
+ LS64_BUILD,
+ LS64_EXTRACT,
+
LD1_MERGE_ZERO,
LD1S_MERGE_ZERO,
LDNF1_MERGE_ZERO,
@@ -401,6 +405,10 @@ enum NodeType : unsigned {
SSTNT1_PRED,
SSTNT1_INDEX_PRED,
+ // Asserts that a function argument (i32) is zero-extended to i8 by
+ // the caller
+ ASSERT_ZEXT_BOOL,
+
// Strict (exception-raising) floating point comparison
STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
STRICT_FCMPE,
@@ -591,6 +599,9 @@ public:
bool isLegalAddImmediate(int64_t) const override;
bool isLegalICmpImmediate(int64_t) const override;
+ bool isMulAddWithConstProfitable(const SDValue &AddNode,
+ const SDValue &ConstNode) const override;
+
bool shouldConsiderGEPOffsetSplit() const override;
EVT getOptimalMemOpType(const MemOp &Op,
@@ -653,6 +664,9 @@ public:
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;
+ bool isOpSuitableForLDPSTP(const Instruction *I) const;
+ bool shouldInsertFencesForAtomic(const Instruction *I) const override;
+
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
@@ -697,12 +711,11 @@ public:
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
- const SelectionDAG &DAG) const override {
+ const MachineFunction &MF) const override {
// Do not merge to float value size (128 bytes) if no implicit
// float attribute is set.
- bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
- Attribute::NoImplicitFloat);
+ bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
if (NoFloat)
return (MemVT.getSizeInBits() <= 64);
@@ -730,7 +743,9 @@ public:
if (!VT.isVector())
return hasAndNotCompare(Y);
- return VT.getSizeInBits() >= 64; // vector 'bic'
+ TypeSize TS = VT.getSizeInBits();
+ // TODO: We should be able to use bic/bif too for SVE.
+ return !TS.isScalable() && TS.getFixedValue() >= 64; // vector 'bic'
}
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
@@ -786,13 +801,13 @@ public:
/// Returns true if \p VecTy is a legal interleaved access type. This
/// function checks the vector element type and the overall width of the
/// vector.
- bool isLegalInterleavedAccessType(VectorType *VecTy,
- const DataLayout &DL) const;
+ bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL,
+ bool &UseScalable) const;
/// Returns the number of interleaved accesses that will be generated when
/// lowering accesses of the given type.
- unsigned getNumInterleavedAccesses(VectorType *VecTy,
- const DataLayout &DL) const;
+ unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL,
+ bool UseScalable) const;
MachineMemOperand::Flags getTargetMMOFlags(
const Instruction &I) const override;
@@ -824,6 +839,9 @@ public:
bool isAllActivePredicate(SDValue N) const;
EVT getPromotedVTForPredicate(EVT VT) const;
+ EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty,
+ bool AllowUnknown = false) const override;
+
private:
/// Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
@@ -854,6 +872,7 @@ private:
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerStore128(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMGATHER(SDValue Op, SelectionDAG &DAG) const;
@@ -959,10 +978,12 @@ private:
SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBitreverse(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVectorFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 9bc2539e95f0..cd4bc8a61a8a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -721,6 +721,7 @@ def tvecshiftR64 : Operand<i32>, TImmLeaf<i32, [{
let ParserMatchClass = Imm1_64Operand;
}
+def Imm0_0Operand : AsmImmRange<0, 0>;
def Imm0_1Operand : AsmImmRange<0, 1>;
def Imm0_3Operand : AsmImmRange<0, 3>;
def Imm0_7Operand : AsmImmRange<0, 7>;
@@ -845,13 +846,13 @@ def logical_imm64_not : Operand<i64> {
let ParserMatchClass = LogicalImm64NotOperand;
}
-// iXX_imm0_65535 predicates - True if the immediate is in the range [0,65535].
+// immXX_0_65535 predicates - True if the immediate is in the range [0,65535].
let ParserMatchClass = AsmImmRange<0, 65535>, PrintMethod = "printImmHex" in {
-def i32_imm0_65535 : Operand<i32>, TImmLeaf<i32, [{
+def timm32_0_65535 : Operand<i32>, TImmLeaf<i32, [{
return ((uint32_t)Imm) < 65536;
}]>;
-def i64_imm0_65535 : Operand<i64>, TImmLeaf<i64, [{
+def timm64_0_65535 : Operand<i64>, TImmLeaf<i64, [{
return ((uint64_t)Imm) < 65536;
}]>;
}
@@ -955,8 +956,8 @@ def imm0_3 : Operand<i64>, ImmLeaf<i64, [{
let ParserMatchClass = Imm0_3Operand;
}
-// imm32_0_7 predicate - True if the 32-bit immediate is in the range [0,7]
-def imm32_0_7 : Operand<i32>, TImmLeaf<i32, [{
+// timm32_0_7 predicate - True if the 32-bit immediate is in the range [0,7]
+def timm32_0_7 : Operand<i32>, TImmLeaf<i32, [{
return ((uint32_t)Imm) < 8;
}]> {
let ParserMatchClass = Imm0_7Operand;
@@ -1215,6 +1216,18 @@ def fpimm0 : FPImmLeaf<fAny, [{
return Imm.isExactlyValue(+0.0);
}]>;
+def fpimm_half : FPImmLeaf<fAny, [{
+ return Imm.isExactlyValue(+0.5);
+}]>;
+
+def fpimm_one : FPImmLeaf<fAny, [{
+ return Imm.isExactlyValue(+1.0);
+}]>;
+
+def fpimm_two : FPImmLeaf<fAny, [{
+ return Imm.isExactlyValue(+2.0);
+}]>;
+
def gi_fpimm16 : GICustomOperandRenderer<"renderFPImm16">,
GISDNodeXFormEquiv<fpimm16XForm>;
def gi_fpimm32 : GICustomOperandRenderer<"renderFPImm32">,
@@ -1241,12 +1254,15 @@ multiclass VectorIndex<ValueType ty, AsmOperandClass mc, code pred> {
def _timm : AsmVectorIndexOpnd<ty, mc>, TImmLeaf<ty, pred>;
}
+def VectorIndex0Operand : AsmVectorIndex<0, 0>;
def VectorIndex1Operand : AsmVectorIndex<1, 1>;
def VectorIndexBOperand : AsmVectorIndex<0, 15>;
def VectorIndexHOperand : AsmVectorIndex<0, 7>;
def VectorIndexSOperand : AsmVectorIndex<0, 3>;
def VectorIndexDOperand : AsmVectorIndex<0, 1>;
+defm VectorIndex0 : VectorIndex<i64, VectorIndex0Operand,
+ [{ return ((uint64_t)Imm) == 0; }]>;
defm VectorIndex1 : VectorIndex<i64, VectorIndex1Operand,
[{ return ((uint64_t)Imm) == 1; }]>;
defm VectorIndexB : VectorIndex<i64, VectorIndexBOperand,
@@ -1291,6 +1307,37 @@ defm sve_elm_idx_extdup_q
: VectorIndex<i64, SVEVectorIndexExtDupQOperand,
[{ return ((uint64_t)Imm) < 4; }]>;
+def sme_elm_idx0_0 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) == 0;
+}]> {
+ let ParserMatchClass = Imm0_0Operand;
+ let PrintMethod = "printMatrixIndex";
+}
+def sme_elm_idx0_1 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) <= 1;
+}]> {
+ let ParserMatchClass = Imm0_1Operand;
+ let PrintMethod = "printMatrixIndex";
+}
+def sme_elm_idx0_3 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) <= 3;
+}]> {
+ let ParserMatchClass = Imm0_3Operand;
+ let PrintMethod = "printMatrixIndex";
+}
+def sme_elm_idx0_7 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) <= 7;
+}]> {
+ let ParserMatchClass = Imm0_7Operand;
+ let PrintMethod = "printMatrixIndex";
+}
+def sme_elm_idx0_15 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) <= 15;
+}]> {
+ let ParserMatchClass = Imm0_15Operand;
+ let PrintMethod = "printMatrixIndex";
+}
+
// 8-bit immediate for AdvSIMD where 64-bit values of the form:
// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
// are encoded as the eight bit value 'abcdefgh'.
@@ -1379,7 +1426,7 @@ class TMSystemINoOperand<bits<4> CRm, string asm, list<dag> pattern>
// System instructions for exit from transactions
class TMSystemException<bits<3> op1, string asm, list<dag> pattern>
- : I<(outs), (ins i64_imm0_65535:$imm), asm, "\t$imm", "", pattern>,
+ : I<(outs), (ins timm64_0_65535:$imm), asm, "\t$imm", "", pattern>,
Sched<[WriteSys]> {
bits<16> imm;
let Inst{31-24} = 0b11010100;
@@ -1703,7 +1750,7 @@ class AuthReturn<bits<3> op, bits<1> M, string asm>
let mayLoad = 1 in
class BaseAuthLoad<bit M, bit W, dag oops, dag iops, string asm,
- string operands, string cstr, Operand opr>
+ string operands, string cstr>
: I<oops, iops, asm, operands, cstr, []>, Sched<[]> {
bits<10> offset;
bits<5> Rn;
@@ -1725,11 +1772,11 @@ class BaseAuthLoad<bit M, bit W, dag oops, dag iops, string asm,
multiclass AuthLoad<bit M, string asm, Operand opr> {
def indexed : BaseAuthLoad<M, 0, (outs GPR64:$Rt),
(ins GPR64sp:$Rn, opr:$offset),
- asm, "\t$Rt, [$Rn, $offset]", "", opr>;
+ asm, "\t$Rt, [$Rn, $offset]", "">;
def writeback : BaseAuthLoad<M, 1, (outs GPR64sp:$wback, GPR64:$Rt),
(ins GPR64sp:$Rn, opr:$offset),
asm, "\t$Rt, [$Rn, $offset]!",
- "$Rn = $wback,@earlyclobber $wback", opr>;
+ "$Rn = $wback,@earlyclobber $wback">;
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "indexed") GPR64:$Rt, GPR64sp:$Rn, 0)>;
@@ -1965,10 +2012,10 @@ class OneXRegData<bits<3> opc, string asm, SDPatternOperator node>
let Inst{31} = 1;
}
-class SignAuthOneData<bits<3> opcode_prefix, bits<2> opcode, string asm>
- : I<(outs GPR64:$Rd), (ins GPR64:$src, GPR64sp:$Rn), asm, "\t$Rd, $Rn",
- "$Rd = $src",
- []>,
+class SignAuthOneData<bits<3> opcode_prefix, bits<2> opcode, string asm,
+ SDPatternOperator op>
+ : I<(outs GPR64:$dst), (ins GPR64:$Rd, GPR64sp:$Rn), asm, "\t$Rd, $Rn",
+ "$dst = $Rd", [(set GPR64:$dst, (op GPR64:$Rd, opcode, GPR64sp:$Rn))]>,
Sched<[WriteI, ReadI]> {
bits<5> Rd;
bits<5> Rn;
@@ -1979,9 +2026,11 @@ class SignAuthOneData<bits<3> opcode_prefix, bits<2> opcode, string asm>
let Inst{4-0} = Rd;
}
-class SignAuthZero<bits<3> opcode_prefix, bits<2> opcode, string asm>
- : I<(outs GPR64:$Rd), (ins GPR64:$src), asm, "\t$Rd", "$Rd = $src",
- []>, Sched<[]> {
+class SignAuthZero<bits<3> opcode_prefix, bits<2> opcode, string asm,
+ SDPatternOperator op>
+ : I<(outs GPR64:$dst), (ins GPR64:$Rd), asm, "\t$Rd", "$dst = $Rd",
+ [(set GPR64:$dst, (op GPR64:$Rd, opcode, (i64 0)))]>,
+ Sched<[]> {
bits<5> Rd;
let Inst{31-15} = 0b11011010110000010;
let Inst{14-12} = opcode_prefix;
@@ -2193,16 +2242,14 @@ class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype,
let Inst{4-0} = Rd;
}
-multiclass MulAccum<bit isSub, string asm, SDNode AccNode> {
+multiclass MulAccum<bit isSub, string asm> {
// MADD/MSUB generation is decided by MachineCombiner.cpp
- def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm,
- [/*(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))*/]>,
+ def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm, []>,
Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> {
let Inst{31} = 0;
}
- def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm,
- [/*(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))*/]>,
+ def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm, []>,
Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> {
let Inst{31} = 1;
}
@@ -3421,8 +3468,8 @@ def ro64 : ROAddrMode<ro_Windexed64, ro_Xindexed64, ro_Wextend64, ro_Xextend64>;
def ro128 : ROAddrMode<ro_Windexed128, ro_Xindexed128, ro_Wextend128,
ro_Xextend128>;
-class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
- string asm, dag ins, dag outs, list<dag> pat>
+class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins,
+ dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
bits<5> Rn;
@@ -3450,7 +3497,7 @@ class ROInstAlias<string asm, DAGOperand regtype, Instruction INST>
multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10 in
- def roW : LoadStore8RO<sz, V, opc, regtype, asm,
+ def roW : LoadStore8RO<sz, V, opc, asm,
(outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
[(set (Ty regtype:$Rt),
@@ -3461,7 +3508,7 @@ multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
}
let AddedComplexity = 10 in
- def roX : LoadStore8RO<sz, V, opc, regtype, asm,
+ def roX : LoadStore8RO<sz, V, opc, asm,
(outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
[(set (Ty regtype:$Rt),
@@ -3477,30 +3524,30 @@ multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10 in
- def roW : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
+ def roW : LoadStore8RO<sz, V, opc, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend8:$extend))]>,
- Sched<[WriteSTIdx, ReadAdrBase]> {
+ Sched<[WriteSTIdx, ReadST, ReadAdrBase]> {
let Inst{13} = 0b0;
}
let AddedComplexity = 10 in
- def roX : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
+ def roX : LoadStore8RO<sz, V, opc, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
ro_Xextend8:$extend))]>,
- Sched<[WriteSTIdx, ReadAdrBase]> {
+ Sched<[WriteSTIdx, ReadST, ReadAdrBase]> {
let Inst{13} = 0b1;
}
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
- string asm, dag ins, dag outs, list<dag> pat>
+class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins,
+ dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
bits<5> Rn;
@@ -3524,7 +3571,7 @@ class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10 in
- def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ def roW : LoadStore16RO<sz, V, opc, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
@@ -3534,7 +3581,7 @@ multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
}
let AddedComplexity = 10 in
- def roX : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ def roX : LoadStore16RO<sz, V, opc, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
@@ -3549,30 +3596,30 @@ multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10 in
- def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
+ def roW : LoadStore16RO<sz, V, opc, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend16:$extend))]>,
- Sched<[WriteSTIdx, ReadAdrBase]> {
+ Sched<[WriteSTIdx, ReadST, ReadAdrBase]> {
let Inst{13} = 0b0;
}
let AddedComplexity = 10 in
- def roX : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
+ def roX : LoadStore16RO<sz, V, opc, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
ro_Xextend16:$extend))]>,
- Sched<[WriteSTIdx, ReadAdrBase]> {
+ Sched<[WriteSTIdx, ReadST, ReadAdrBase]> {
let Inst{13} = 0b1;
}
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
- string asm, dag ins, dag outs, list<dag> pat>
+class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins,
+ dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
bits<5> Rn;
@@ -3596,7 +3643,7 @@ class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10 in
- def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ def roW : LoadStore32RO<sz, V, opc, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
@@ -3606,7 +3653,7 @@ multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
}
let AddedComplexity = 10 in
- def roX : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ def roX : LoadStore32RO<sz, V, opc, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
@@ -3621,30 +3668,30 @@ multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10 in
- def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
+ def roW : LoadStore32RO<sz, V, opc, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend32:$extend))]>,
- Sched<[WriteSTIdx, ReadAdrBase]> {
+ Sched<[WriteSTIdx, ReadST, ReadAdrBase]> {
let Inst{13} = 0b0;
}
let AddedComplexity = 10 in
- def roX : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
+ def roX : LoadStore32RO<sz, V, opc, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
ro_Xextend32:$extend))]>,
- Sched<[WriteSTIdx, ReadAdrBase]> {
+ Sched<[WriteSTIdx, ReadST, ReadAdrBase]> {
let Inst{13} = 0b1;
}
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
- string asm, dag ins, dag outs, list<dag> pat>
+class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins,
+ dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
bits<5> Rn;
@@ -3668,7 +3715,7 @@ class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
- def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ def roW : LoadStore64RO<sz, V, opc, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
@@ -3678,7 +3725,7 @@ multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
}
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
- def roX : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ def roX : LoadStore64RO<sz, V, opc, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
@@ -3693,30 +3740,30 @@ multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
- def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
+ def roW : LoadStore64RO<sz, V, opc, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend64:$extend))]>,
- Sched<[WriteSTIdx, ReadAdrBase]> {
+ Sched<[WriteSTIdx, ReadST, ReadAdrBase]> {
let Inst{13} = 0b0;
}
let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
- def roX : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
+ def roX : LoadStore64RO<sz, V, opc, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
ro_Xextend64:$extend))]>,
- Sched<[WriteSTIdx, ReadAdrBase]> {
+ Sched<[WriteSTIdx, ReadST, ReadAdrBase]> {
let Inst{13} = 0b1;
}
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
}
-class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
- string asm, dag ins, dag outs, list<dag> pat>
+class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, string asm, dag ins,
+ dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
bits<5> Rn;
@@ -3740,7 +3787,7 @@ class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
- def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ def roW : LoadStore128RO<sz, V, opc, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
@@ -3750,7 +3797,7 @@ multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
}
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
- def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+ def roX : LoadStore128RO<sz, V, opc, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
@@ -3763,20 +3810,20 @@ multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
}
multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
- string asm, ValueType Ty, SDPatternOperator storeop> {
+ string asm> {
let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
- def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
+ def roW : LoadStore128RO<sz, V, opc, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend),
[]>,
- Sched<[WriteSTIdx, ReadAdrBase]> {
+ Sched<[WriteSTIdx, ReadST, ReadAdrBase]> {
let Inst{13} = 0b0;
}
let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
- def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
+ def roX : LoadStore128RO<sz, V, opc, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend),
[]>,
- Sched<[WriteSTIdx, ReadAdrBase]> {
+ Sched<[WriteSTIdx, ReadST, ReadAdrBase]> {
let Inst{13} = 0b1;
}
@@ -4466,7 +4513,7 @@ multiclass MemTagStore<bits<2> opc1, string insn> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm>
- : I<(outs), (ins i32_imm0_65535:$imm), asm, "\t$imm", "", []>,
+ : I<(outs), (ins timm32_0_65535:$imm), asm, "\t$imm", "", []>,
Sched<[WriteSys]> {
bits<16> imm;
let Inst{31-24} = 0b11010100;
@@ -5309,7 +5356,7 @@ class BaseSIMDThreeSameVector<bit Q, bit U, bits<3> size, bits<5> opcode,
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
"{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
"|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>,
- Sched<[WriteV]> {
+ Sched<[!if(Q, WriteVq, WriteVd)]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
@@ -5332,7 +5379,7 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode,
: I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm,
"{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
"|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
- Sched<[WriteV]> {
+ Sched<[!if(Q, WriteVq, WriteVd)]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
@@ -5351,7 +5398,7 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode,
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDThreeSameVectorPseudo<RegisterOperand regtype, list<dag> pattern>
: Pseudo<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), pattern>,
- Sched<[WriteV]>;
+ Sched<[!if(!eq(regtype, V128), WriteVq, WriteVd)]>;
multiclass SIMDLogicalThreeVectorPseudo<SDPatternOperator OpNode> {
def v8i8 : BaseSIMDThreeSameVectorPseudo<V64,
@@ -5704,7 +5751,7 @@ class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
: I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
"{\t$Rd" # dstkind # ", $Rn" # srckind #
"|" # dstkind # "\t$Rd, $Rn}", "", pattern>,
- Sched<[WriteV]> {
+ Sched<[!if(Q, WriteVq, WriteVd)]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
@@ -5729,7 +5776,7 @@ class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
: I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm,
"{\t$Rd" # dstkind # ", $Rn" # srckind #
"|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
- Sched<[WriteV]> {
+ Sched<[!if(Q, WriteVq, WriteVd)]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
@@ -5775,7 +5822,7 @@ class BaseSIMDVectorLShiftLongBySize<bit Q, bits<2> size,
: I<(outs V128:$Rd), (ins regtype:$Rn), asm,
"{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount #
"|" # dstkind # "\t$Rd, $Rn, #" # amount # "}", "", []>,
- Sched<[WriteV]> {
+ Sched<[WriteVq]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
@@ -6032,7 +6079,7 @@ multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm,
[(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
}
-
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDMixedTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand inreg, RegisterOperand outreg,
string asm, string outkind, string inkind,
@@ -6040,7 +6087,7 @@ class BaseSIMDMixedTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
: I<(outs outreg:$Rd), (ins inreg:$Rn), asm,
"{\t$Rd" # outkind # ", $Rn" # inkind #
"|" # outkind # "\t$Rd, $Rn}", "", pattern>,
- Sched<[WriteV]> {
+ Sched<[WriteVq]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
@@ -6055,6 +6102,7 @@ class BaseSIMDMixedTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
let Inst{4-0} = Rd;
}
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDMixedTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand inreg, RegisterOperand outreg,
string asm, string outkind, string inkind,
@@ -6062,7 +6110,7 @@ class BaseSIMDMixedTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
: I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm,
"{\t$Rd" # outkind # ", $Rn" # inkind #
"|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
- Sched<[WriteV]> {
+ Sched<[WriteVq]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
@@ -6114,7 +6162,7 @@ class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2,
"{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero #
"|" # kind # "\t$Rd, $Rn, #" # zero # "}", "",
[(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>,
- Sched<[WriteV]> {
+ Sched<[!if(Q, WriteVq, WriteVd)]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
@@ -6212,7 +6260,7 @@ class BaseSIMDFPCvtTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
list<dag> pattern>
: I<(outs outtype:$Rd), (ins intype:$Rn), asm,
!strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>,
- Sched<[WriteV]> {
+ Sched<[WriteVq]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
@@ -6227,13 +6275,14 @@ class BaseSIMDFPCvtTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
let Inst{4-0} = Rd;
}
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDFPCvtTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand outtype, RegisterOperand intype,
string asm, string VdTy, string VnTy,
list<dag> pattern>
: I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm,
!strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>,
- Sched<[WriteV]> {
+ Sched<[WriteVq]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
@@ -6296,7 +6345,7 @@ class BaseSIMDDifferentThreeVector<bit U, bits<3> size, bits<4> opcode,
: I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm,
"{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
"|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>,
- Sched<[WriteV]> {
+ Sched<[WriteVq]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
@@ -6322,7 +6371,7 @@ class BaseSIMDDifferentThreeVectorTied<bit U, bits<3> size, bits<4> opcode,
: I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm,
"{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
"|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
- Sched<[WriteV]> {
+ Sched<[WriteVq]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
@@ -6662,7 +6711,7 @@ class BaseSIMDBitwiseExtract<bit size, RegisterOperand regtype, ValueType vty,
"|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "",
[(set (vty regtype:$Rd),
(AArch64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>,
- Sched<[WriteV]> {
+ Sched<[!if(size, WriteVq, WriteVd)]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
@@ -6696,7 +6745,7 @@ class BaseSIMDZipVector<bits<3> size, bits<3> opc, RegisterOperand regtype,
"{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
"|" # kind # "\t$Rd, $Rn, $Rm}", "",
[(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>,
- Sched<[WriteV]> {
+ Sched<[!if(!eq(regtype, V128), WriteVq, WriteVd)]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
@@ -6752,7 +6801,7 @@ class BaseSIMDThreeScalar<bit U, bits<3> size, bits<5> opcode,
list<dag> pattern>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
"\t$Rd, $Rn, $Rm", "", pattern>,
- Sched<[WriteV]> {
+ Sched<[WriteVd]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
@@ -6772,7 +6821,7 @@ class BaseSIMDThreeScalarTied<bit U, bits<2> size, bit R, bits<5> opcode,
dag oops, dag iops, string asm,
list<dag> pattern>
: I<oops, iops, asm, "\t$Rd, $Rn, $Rm", "$Rd = $dst", pattern>,
- Sched<[WriteV]> {
+ Sched<[WriteVd]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
@@ -6815,8 +6864,7 @@ multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm,
def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>;
}
-multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm,
- SDPatternOperator OpNode = null_frag> {
+multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm> {
def v1i32: BaseSIMDThreeScalarTied<U, 0b10, R, opc, (outs FPR32:$dst),
(ins FPR32:$Rd, FPR32:$Rn, FPR32:$Rm),
asm, []>;
@@ -6826,16 +6874,19 @@ multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm,
}
multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm,
- SDPatternOperator OpNode = null_frag> {
+ SDPatternOperator OpNode = null_frag,
+ Predicate pred = HasNEON> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+ let Predicates = [pred] in {
def NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
[(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
def NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
[(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
- let Predicates = [HasNEON, HasFullFP16] in {
+ }
+ let Predicates = [pred, HasFullFP16] in {
def NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
[(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]>;
- } // Predicates = [HasNEON, HasFullFP16]
+ }
}
def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
@@ -6863,7 +6914,7 @@ class BaseSIMDThreeScalarMixed<bit U, bits<2> size, bits<5> opcode,
dag oops, dag iops, string asm, string cstr, list<dag> pat>
: I<oops, iops, asm,
"\t$Rd, $Rn, $Rm", cstr, pat>,
- Sched<[WriteV]> {
+ Sched<[WriteVd]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
@@ -6916,7 +6967,7 @@ class BaseSIMDTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode,
string asm, list<dag> pat>
: I<(outs regtype:$Rd), (ins regtype2:$Rn), asm,
"\t$Rd, $Rn", "", pat>,
- Sched<[WriteV]> {
+ Sched<[WriteVd]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-30} = 0b01;
@@ -6938,7 +6989,7 @@ class BaseSIMDTwoScalarTied<bit U, bits<2> size, bits<5> opcode,
string asm, list<dag> pat>
: I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm,
"\t$Rd, $Rn", "$Rd = $dst", pat>,
- Sched<[WriteV]> {
+ Sched<[WriteVd]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-30} = 0b01;
@@ -6958,7 +7009,7 @@ class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode,
RegisterClass regtype, string asm, string zero>
: I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
"\t$Rd, $Rn, #" # zero, "", []>,
- Sched<[WriteV]> {
+ Sched<[WriteVd]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-30} = 0b01;
@@ -6977,7 +7028,7 @@ class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode,
class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm>
: I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "",
[(set (f32 FPR32:$Rd), (int_aarch64_sisd_fcvtxn (f64 FPR64:$Rn)))]>,
- Sched<[WriteV]> {
+ Sched<[WriteVd]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-17} = 0b011111100110000;
@@ -7025,10 +7076,13 @@ multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm,
(!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>;
}
-multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm> {
+multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm,
+ Predicate pred = HasNEON> {
+ let Predicates = [pred] in {
def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,[]>;
def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,[]>;
- let Predicates = [HasNEON, HasFullFP16] in {
+ }
+ let Predicates = [pred, HasFullFP16] in {
def v1f16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,[]>;
}
}
@@ -7096,7 +7150,7 @@ class BaseSIMDPairwiseScalar<bit U, bits<2> size, bits<5> opcode,
string asm, string kind>
: I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
"{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>,
- Sched<[WriteV]> {
+ Sched<[WriteVd]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-30} = 0b01;
@@ -7136,7 +7190,7 @@ class BaseSIMDAcrossLanes<bit Q, bit U, bits<2> size, bits<5> opcode,
string asm, string kind, list<dag> pattern>
: I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
"{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>,
- Sched<[WriteV]> {
+ Sched<[!if(Q, WriteVq, WriteVd)]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
@@ -7202,7 +7256,7 @@ multiclass SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm,
class BaseSIMDInsDup<bit Q, bit op, dag outs, dag ins, string asm,
string operands, string constraints, list<dag> pattern>
: I<outs, ins, asm, operands, constraints, pattern>,
- Sched<[WriteV]> {
+ Sched<[!if(Q, WriteVq, WriteVd)]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
@@ -7228,7 +7282,7 @@ class SIMDDupFromMain<bit Q, bits<5> imm5, string size, ValueType vectype,
class SIMDDupFromElement<bit Q, string dstkind, string srckind,
ValueType vectype, ValueType insreg,
RegisterOperand vecreg, Operand idxtype,
- ValueType elttype, SDNode OpNode>
+ SDNode OpNode>
: BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins V128:$Rn, idxtype:$idx), "dup",
"{\t$Rd" # dstkind # ", $Rn" # srckind # "$idx" #
"|" # dstkind # "\t$Rd, $Rn$idx}", "",
@@ -7239,7 +7293,7 @@ class SIMDDupFromElement<bit Q, string dstkind, string srckind,
class SIMDDup64FromElement
: SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128,
- VectorIndexD, i64, AArch64duplane64> {
+ VectorIndexD, AArch64duplane64> {
bits<1> idx;
let Inst{20} = idx;
let Inst{19-16} = 0b1000;
@@ -7248,7 +7302,7 @@ class SIMDDup64FromElement
class SIMDDup32FromElement<bit Q, string size, ValueType vectype,
RegisterOperand vecreg>
: SIMDDupFromElement<Q, size, ".s", vectype, v4i32, vecreg,
- VectorIndexS, i64, AArch64duplane32> {
+ VectorIndexS, AArch64duplane32> {
bits<2> idx;
let Inst{20-19} = idx;
let Inst{18-16} = 0b100;
@@ -7257,7 +7311,7 @@ class SIMDDup32FromElement<bit Q, string size, ValueType vectype,
class SIMDDup16FromElement<bit Q, string size, ValueType vectype,
RegisterOperand vecreg>
: SIMDDupFromElement<Q, size, ".h", vectype, v8i16, vecreg,
- VectorIndexH, i64, AArch64duplane16> {
+ VectorIndexH, AArch64duplane16> {
bits<3> idx;
let Inst{20-18} = idx;
let Inst{17-16} = 0b10;
@@ -7266,7 +7320,7 @@ class SIMDDup16FromElement<bit Q, string size, ValueType vectype,
class SIMDDup8FromElement<bit Q, string size, ValueType vectype,
RegisterOperand vecreg>
: SIMDDupFromElement<Q, size, ".b", vectype, v16i8, vecreg,
- VectorIndexB, i64, AArch64duplane8> {
+ VectorIndexB, AArch64duplane8> {
bits<4> idx;
let Inst{20-17} = idx;
let Inst{16} = 1;
@@ -7295,6 +7349,25 @@ class SIMDMovAlias<string asm, string size, Instruction inst,
(inst regtype:$dst, V128:$src, idxtype:$idx)>;
multiclass SMov {
+ // SMOV with vector index of 0 is legal in Scalable Matrix Extension (SME)
+ // streaming mode.
+ let Predicates = [HasNEONorStreamingSVE] in {
+ def vi8to32_idx0 : SIMDSMov<0, ".b", GPR32, VectorIndex0> {
+ let Inst{20-16} = 0b00001;
+ }
+ def vi8to64_idx0 : SIMDSMov<1, ".b", GPR64, VectorIndex0> {
+ let Inst{20-16} = 0b00001;
+ }
+ def vi16to32_idx0 : SIMDSMov<0, ".h", GPR32, VectorIndex0> {
+ let Inst{20-16} = 0b00010;
+ }
+ def vi16to64_idx0 : SIMDSMov<1, ".h", GPR64, VectorIndex0> {
+ let Inst{20-16} = 0b00010;
+ }
+ def vi32to64_idx0 : SIMDSMov<1, ".s", GPR64, VectorIndex0> {
+ let Inst{20-16} = 0b00100;
+ }
+ }
def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> {
bits<4> idx;
let Inst{20-17} = idx;
@@ -7323,6 +7396,28 @@ multiclass SMov {
}
multiclass UMov {
+ // UMOV with vector index of 0 is legal in Scalable Matrix Extension (SME)
+ // streaming mode.
+ let Predicates = [HasNEONorStreamingSVE] in {
+ def vi8_idx0 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndex0> {
+ let Inst{20-16} = 0b00001;
+ }
+ def vi16_idx0 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndex0> {
+ let Inst{20-16} = 0b00010;
+ }
+ def vi32_idx0 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndex0> {
+ let Inst{20-16} = 0b00100;
+ }
+ def vi64_idx0 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndex0> {
+ let Inst{20-16} = 0b01000;
+ }
+ def : SIMDMovAlias<"mov", ".s",
+ !cast<Instruction>(NAME # vi32_idx0),
+ GPR32, VectorIndex0>;
+ def : SIMDMovAlias<"mov", ".d",
+ !cast<Instruction>(NAME # vi64_idx0),
+ GPR64, VectorIndex0>;
+ }
def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> {
bits<4> idx;
let Inst{20-17} = idx;
@@ -7473,7 +7568,7 @@ class BaseSIMDTableLookup<bit Q, bits<2> len, bit op, RegisterOperand vectype,
RegisterOperand listtype, string asm, string kind>
: I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm,
"\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>,
- Sched<[WriteV]> {
+ Sched<[!if(Q, WriteVq, WriteVd)]> {
bits<5> Vd;
bits<5> Vn;
bits<5> Vm;
@@ -7494,7 +7589,7 @@ class BaseSIMDTableLookupTied<bit Q, bits<2> len, bit op, RegisterOperand vectyp
RegisterOperand listtype, string asm, string kind>
: I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm,
"\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>,
- Sched<[WriteV]> {
+ Sched<[!if(Q, WriteVq, WriteVd)]> {
bits<5> Vd;
bits<5> Vn;
bits<5> Vm;
@@ -7609,11 +7704,11 @@ multiclass SIMDTableLookupTied<bit op, string asm> {
//----------------------------------------------------------------------------
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype,
- string kind, Operand idxtype>
- : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov",
+ string asm, string kind, Operand idxtype>
+ : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), asm,
"{\t$dst, $src" # kind # "$idx" #
"|\t$dst, $src$idx}", "", []>,
- Sched<[WriteV]> {
+ Sched<[WriteVd]> {
bits<5> dst;
bits<5> src;
let Inst{31-21} = 0b01011110000;
@@ -7630,22 +7725,22 @@ class SIMDScalarCPYAlias<string asm, string size, Instruction inst,
multiclass SIMDScalarCPY<string asm> {
- def i8 : BaseSIMDScalarCPY<FPR8, V128, ".b", VectorIndexB> {
+ def i8 : BaseSIMDScalarCPY<FPR8, V128, asm, ".b", VectorIndexB> {
bits<4> idx;
let Inst{20-17} = idx;
let Inst{16} = 1;
}
- def i16 : BaseSIMDScalarCPY<FPR16, V128, ".h", VectorIndexH> {
+ def i16 : BaseSIMDScalarCPY<FPR16, V128, asm, ".h", VectorIndexH> {
bits<3> idx;
let Inst{20-18} = idx;
let Inst{17-16} = 0b10;
}
- def i32 : BaseSIMDScalarCPY<FPR32, V128, ".s", VectorIndexS> {
+ def i32 : BaseSIMDScalarCPY<FPR32, V128, asm, ".s", VectorIndexS> {
bits<2> idx;
let Inst{20-19} = idx;
let Inst{18-16} = 0b100;
}
- def i64 : BaseSIMDScalarCPY<FPR64, V128, ".d", VectorIndexD> {
+ def i64 : BaseSIMDScalarCPY<FPR64, V128, asm, ".d", VectorIndexD> {
bits<1> idx;
let Inst{20} = idx;
let Inst{19-16} = 0b1000;
@@ -7678,7 +7773,7 @@ class BaseSIMDModifiedImm<bit Q, bit op, bit op2, dag oops, dag iops,
string asm, string op_string,
string cstr, list<dag> pattern>
: I<oops, iops, asm, op_string, cstr, pattern>,
- Sched<[WriteV]> {
+ Sched<[!if(Q, WriteVq, WriteVd)]> {
bits<5> Rd;
bits<8> imm8;
let Inst{31} = 0;
@@ -7848,7 +7943,7 @@ class BaseSIMDIndexed<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
asm,
"{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
"|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>,
- Sched<[WriteV]> {
+ Sched<[WriteVd]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
@@ -7878,7 +7973,7 @@ class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
(ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm,
"{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
"|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>,
- Sched<[WriteV]> {
+ Sched<[WriteVd]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
@@ -7971,7 +8066,7 @@ class SIMDBF16MLALIndex<bit Q, string asm, SDPatternOperator OpNode>
(v8bf16
(AArch64duplane16 (v8bf16 V128_lo:$Rm),
VectorIndexH:$idx)))))]>,
- Sched<[WriteV]> {
+ Sched<[WriteVq]> {
bits<5> Rd;
bits<5> Rn;
bits<4> Rm;
@@ -8892,7 +8987,7 @@ class BaseSIMDScalarShift<bit U, bits<5> opc, bits<7> fixed_imm,
Operand immtype, string asm, list<dag> pattern>
: I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm),
asm, "\t$Rd, $Rn, $imm", "", pattern>,
- Sched<[WriteV]> {
+ Sched<[WriteVd]> {
bits<5> Rd;
bits<5> Rn;
bits<7> imm;
@@ -8912,7 +9007,7 @@ class BaseSIMDScalarShiftTied<bit U, bits<5> opc, bits<7> fixed_imm,
Operand immtype, string asm, list<dag> pattern>
: I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm),
asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>,
- Sched<[WriteV]> {
+ Sched<[WriteVd]> {
bits<5> Rd;
bits<5> Rn;
bits<7> imm;
@@ -9076,7 +9171,7 @@ class BaseSIMDVectorShift<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
: I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm),
asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #
"|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>,
- Sched<[WriteV]> {
+ Sched<[!if(Q, WriteVq, WriteVd)]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
@@ -9099,7 +9194,7 @@ class BaseSIMDVectorShiftTied<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
: I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm),
asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #
"|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>,
- Sched<[WriteV]> {
+ Sched<[!if(Q, WriteVq, WriteVd)]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
@@ -10646,7 +10741,7 @@ class BaseSIMDThreeSameVectorComplex<bit Q, bit U, bits<2> size, bits<3> opcode,
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, rottype:$rot), asm,
"{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $rot"
"|" # kind # "\t$Rd, $Rn, $Rm, $rot}", "", pattern>,
- Sched<[WriteV]> {
+ Sched<[!if(Q, WriteVq, WriteVd)]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
@@ -10720,7 +10815,7 @@ class BaseSIMDThreeSameVectorTiedComplex<bit Q, bit U, bits<2> size,
(ins regtype:$Rd, regtype:$Rn, regtype:$Rm, rottype:$rot), asm,
"{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $rot"
"|" # kind # "\t$Rd, $Rn, $Rm, $rot}", "$Rd = $dst", pattern>,
- Sched<[WriteV]> {
+ Sched<[!if(Q, WriteVq, WriteVd)]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
@@ -10796,7 +10891,7 @@ class BaseSIMDIndexedTiedComplex<bit Q, bit U, bit Scalar, bits<2> size,
"{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind #
"$idx, $rot" # "|" # apple_kind #
"\t$Rd, $Rn, $Rm$idx, $rot}", "$Rd = $dst", pattern>,
- Sched<[WriteV]> {
+ Sched<[!if(Q, WriteVq, WriteVd)]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
@@ -10822,8 +10917,8 @@ class BaseSIMDIndexedTiedComplex<bit Q, bit U, bit Scalar, bits<2> size,
// The complex instructions index by pairs of elements, so the VectorIndexes
// don't match the lane types, and the index bits are different to the other
// classes.
-multiclass SIMDIndexedTiedComplexHSD<bit U, bit opc1, bit opc2, Operand rottype,
- string asm, SDPatternOperator OpNode> {
+multiclass SIMDIndexedTiedComplexHSD<bit opc1, bit opc2, Operand rottype,
+ string asm> {
let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
def v4f16_indexed : BaseSIMDIndexedTiedComplex<0, 1, 0, 0b01, opc1, opc2, V64,
V64, V128, VectorIndexD, rottype, asm, ".4h", ".4h",
@@ -10861,7 +10956,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class AESBase<bits<4> opc, string asm, dag outs, dag ins, string cstr,
list<dag> pat>
: I<outs, ins, asm, "{\t$Rd.16b, $Rn.16b|.16b\t$Rd, $Rn}", cstr, pat>,
- Sched<[WriteV]>{
+ Sched<[WriteVq]>{
bits<5> Rd;
bits<5> Rn;
let Inst{31-16} = 0b0100111000101000;
@@ -10887,7 +10982,7 @@ class SHA3OpTiedInst<bits<3> opc, string asm, string dst_lhs_kind,
: I<oops, iops, asm,
"{\t$Rd" # dst_lhs_kind # ", $Rn" # dst_lhs_kind # ", $Rm.4s" #
"|.4s\t$Rd, $Rn, $Rm}", "$Rd = $dst", pat>,
- Sched<[WriteV]>{
+ Sched<[WriteVq]>{
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
@@ -10927,7 +11022,7 @@ class SHA2OpInst<bits<4> opc, string asm, string kind,
list<dag> pat>
: I<oops, iops, asm, "{\t$Rd" # kind # ", $Rn" # kind #
"|" # kind # "\t$Rd, $Rn}", cstr, pat>,
- Sched<[WriteV]>{
+ Sched<[WriteVq]>{
bits<5> Rd;
bits<5> Rn;
let Inst{31-16} = 0b0101111000101000;
@@ -10950,7 +11045,7 @@ class SHAInstSS<bits<4> opc, string asm, Intrinsic OpNode>
// Armv8.2-A Crypto extensions
class BaseCryptoV82<dag oops, dag iops, string asm, string asmops, string cst,
list<dag> pattern>
- : I <oops, iops, asm, asmops, cst, pattern>, Sched<[WriteV]> {
+ : I <oops, iops, asm, asmops, cst, pattern>, Sched<[WriteVq]> {
bits<5> Vd;
bits<5> Vn;
let Inst{31-25} = 0b1100111;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index b03d421d3e6d..f8f8ee3f1e6c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1112,8 +1112,8 @@ bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
- Register &SrcReg2, int &CmpMask,
- int &CmpValue) const {
+ Register &SrcReg2, int64_t &CmpMask,
+ int64_t &CmpValue) const {
// The first operand can be a frame index where we'd normally expect a
// register.
assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
@@ -1155,8 +1155,7 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
- // FIXME: In order to convert CmpValue to 0 or 1
- CmpValue = MI.getOperand(2).getImm() != 0;
+ CmpValue = MI.getOperand(2).getImm();
return true;
case AArch64::ANDSWri:
case AArch64::ANDSXri:
@@ -1165,14 +1164,9 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
- // FIXME:The return val type of decodeLogicalImmediate is uint64_t,
- // while the type of CmpValue is int. When converting uint64_t to int,
- // the high 32 bits of uint64_t will be lost.
- // In fact it causes a bug in spec2006-483.xalancbmk
- // CmpValue is only used to compare with zero in OptimizeCompareInstr
CmpValue = AArch64_AM::decodeLogicalImmediate(
MI.getOperand(2).getImm(),
- MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
+ MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
return true;
}
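The FIXME removed above is the motivation for widening CmpMask and CmpValue to int64_t: decodeLogicalImmediate returns a uint64_t, and funnelling that through a 32-bit int drops the high bits, so a nonzero 64-bit ANDS mask could be mistaken for zero (the deleted comment cites spec2006-483.xalancbmk). A standalone sketch of that failure mode, not part of the patch, with an illustrative mask value rather than one taken from the original bug report:

// Not part of the patch: shows why CmpValue must be 64 bits wide.
#include <cstdint>
#include <iostream>

int main() {
  uint64_t Decoded = 0x0000FFFF00000000ULL; // a 64-bit logical-immediate style mask
  int OldCmpValue = static_cast<int>(Decoded);          // truncates to the low 32 bits (0 here)
  int64_t NewCmpValue = static_cast<int64_t>(Decoded);  // the full value survives
  std::cout << "old: " << OldCmpValue << "  new: " << NewCmpValue << '\n';
  return 0;
}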
@@ -1433,8 +1427,8 @@ bool AArch64InstrInfo::optimizePTestInstr(
/// instruction.
/// Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
- MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask,
- int CmpValue, const MachineRegisterInfo *MRI) const {
+ MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
+ int64_t CmpValue, const MachineRegisterInfo *MRI) const {
assert(CmpInstr.getParent());
assert(MRI);
@@ -1462,10 +1456,6 @@ bool AArch64InstrInfo::optimizeCompareInstr(
if (CmpInstr.getOpcode() == AArch64::PTEST_PP)
return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
- // Continue only if we have a "ri" where immediate is zero.
- // FIXME:CmpValue has already been converted to 0 or 1 in analyzeCompare
- // function.
- assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
if (SrcReg2 != 0)
return false;
@@ -1473,9 +1463,10 @@ bool AArch64InstrInfo::optimizeCompareInstr(
if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
return false;
- if (!CmpValue && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
+ if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
return true;
- return removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
+ return (CmpValue == 0 || CmpValue == 1) &&
+ removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
}
/// Get opcode of S version of Instr.
@@ -2099,10 +2090,8 @@ bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
default:
break;
case TargetOpcode::COPY: {
- // FPR64 copies will by lowered to ORR.16b
Register DstReg = MI.getOperand(0).getReg();
- return (AArch64::FPR64RegClass.contains(DstReg) ||
- AArch64::FPR128RegClass.contains(DstReg));
+ return AArch64::FPR128RegClass.contains(DstReg);
}
case AArch64::ORRv16i8:
if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
@@ -2274,32 +2263,35 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
case AArch64::STNPSi:
case AArch64::LDG:
case AArch64::STGPi:
+
case AArch64::LD1B_IMM:
- case AArch64::LD1H_IMM:
- case AArch64::LD1W_IMM:
- case AArch64::LD1D_IMM:
- case AArch64::ST1B_IMM:
- case AArch64::ST1H_IMM:
- case AArch64::ST1W_IMM:
- case AArch64::ST1D_IMM:
case AArch64::LD1B_H_IMM:
+ case AArch64::LD1B_S_IMM:
+ case AArch64::LD1B_D_IMM:
case AArch64::LD1SB_H_IMM:
+ case AArch64::LD1SB_S_IMM:
+ case AArch64::LD1SB_D_IMM:
+ case AArch64::LD1H_IMM:
case AArch64::LD1H_S_IMM:
+ case AArch64::LD1H_D_IMM:
case AArch64::LD1SH_S_IMM:
+ case AArch64::LD1SH_D_IMM:
+ case AArch64::LD1W_IMM:
case AArch64::LD1W_D_IMM:
case AArch64::LD1SW_D_IMM:
+ case AArch64::LD1D_IMM:
+
+ case AArch64::ST1B_IMM:
case AArch64::ST1B_H_IMM:
- case AArch64::ST1H_S_IMM:
- case AArch64::ST1W_D_IMM:
- case AArch64::LD1B_S_IMM:
- case AArch64::LD1SB_S_IMM:
- case AArch64::LD1H_D_IMM:
- case AArch64::LD1SH_D_IMM:
case AArch64::ST1B_S_IMM:
- case AArch64::ST1H_D_IMM:
- case AArch64::LD1B_D_IMM:
- case AArch64::LD1SB_D_IMM:
case AArch64::ST1B_D_IMM:
+ case AArch64::ST1H_IMM:
+ case AArch64::ST1H_S_IMM:
+ case AArch64::ST1H_D_IMM:
+ case AArch64::ST1W_IMM:
+ case AArch64::ST1W_D_IMM:
+ case AArch64::ST1D_IMM:
+
case AArch64::LD1RB_IMM:
case AArch64::LD1RB_H_IMM:
case AArch64::LD1RB_S_IMM:
@@ -2316,6 +2308,32 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
case AArch64::LD1RW_D_IMM:
case AArch64::LD1RSW_IMM:
case AArch64::LD1RD_IMM:
+
+ case AArch64::LDNT1B_ZRI:
+ case AArch64::LDNT1H_ZRI:
+ case AArch64::LDNT1W_ZRI:
+ case AArch64::LDNT1D_ZRI:
+ case AArch64::STNT1B_ZRI:
+ case AArch64::STNT1H_ZRI:
+ case AArch64::STNT1W_ZRI:
+ case AArch64::STNT1D_ZRI:
+
+ case AArch64::LDNF1B_IMM:
+ case AArch64::LDNF1B_H_IMM:
+ case AArch64::LDNF1B_S_IMM:
+ case AArch64::LDNF1B_D_IMM:
+ case AArch64::LDNF1SB_H_IMM:
+ case AArch64::LDNF1SB_S_IMM:
+ case AArch64::LDNF1SB_D_IMM:
+ case AArch64::LDNF1H_IMM:
+ case AArch64::LDNF1H_S_IMM:
+ case AArch64::LDNF1H_D_IMM:
+ case AArch64::LDNF1SH_S_IMM:
+ case AArch64::LDNF1SH_D_IMM:
+ case AArch64::LDNF1W_IMM:
+ case AArch64::LDNF1W_D_IMM:
+ case AArch64::LDNF1SW_D_IMM:
+ case AArch64::LDNF1D_IMM:
return 3;
case AArch64::ADDG:
case AArch64::STGOffset:
@@ -2866,10 +2884,22 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
case AArch64::LD1H_IMM:
case AArch64::LD1W_IMM:
case AArch64::LD1D_IMM:
+ case AArch64::LDNT1B_ZRI:
+ case AArch64::LDNT1H_ZRI:
+ case AArch64::LDNT1W_ZRI:
+ case AArch64::LDNT1D_ZRI:
case AArch64::ST1B_IMM:
case AArch64::ST1H_IMM:
case AArch64::ST1W_IMM:
case AArch64::ST1D_IMM:
+ case AArch64::STNT1B_ZRI:
+ case AArch64::STNT1H_ZRI:
+ case AArch64::STNT1W_ZRI:
+ case AArch64::STNT1D_ZRI:
+ case AArch64::LDNF1B_IMM:
+ case AArch64::LDNF1H_IMM:
+ case AArch64::LDNF1W_IMM:
+ case AArch64::LDNF1D_IMM:
// A full vectors worth of data
// Width = mbytes * elements
Scale = TypeSize::Scalable(16);
@@ -2886,6 +2916,12 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
case AArch64::ST1B_H_IMM:
case AArch64::ST1H_S_IMM:
case AArch64::ST1W_D_IMM:
+ case AArch64::LDNF1B_H_IMM:
+ case AArch64::LDNF1SB_H_IMM:
+ case AArch64::LDNF1H_S_IMM:
+ case AArch64::LDNF1SH_S_IMM:
+ case AArch64::LDNF1W_D_IMM:
+ case AArch64::LDNF1SW_D_IMM:
// A half vector worth of data
// Width = mbytes * elements
Scale = TypeSize::Scalable(8);
@@ -2899,6 +2935,10 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
case AArch64::LD1SH_D_IMM:
case AArch64::ST1B_S_IMM:
case AArch64::ST1H_D_IMM:
+ case AArch64::LDNF1B_S_IMM:
+ case AArch64::LDNF1SB_S_IMM:
+ case AArch64::LDNF1H_D_IMM:
+ case AArch64::LDNF1SH_D_IMM:
// A quarter vector worth of data
// Width = mbytes * elements
Scale = TypeSize::Scalable(4);
@@ -2909,6 +2949,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
case AArch64::LD1B_D_IMM:
case AArch64::LD1SB_D_IMM:
case AArch64::ST1B_D_IMM:
+ case AArch64::LDNF1B_D_IMM:
+ case AArch64::LDNF1SB_D_IMM:
// A eighth vector worth of data
// Width = mbytes * elements
Scale = TypeSize::Scalable(2);
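The Scale values assigned in this function are scalable quantities: the byte width of these SVE fills is a known minimum (16, 8, 4 or 2 bytes for full, half, quarter and eighth fills) multiplied by the implementation's vscale at run time. A plain-C++ model of that relationship, independent of LLVM's TypeSize class and using illustrative vscale values:

// Scalar model of the scalable access widths used above.
#include <cstdio>

static unsigned widthInBytes(unsigned KnownMinBytes, unsigned VScale) {
  return KnownMinBytes * VScale; // corresponds to TypeSize::Scalable(KnownMinBytes)
}

int main() {
  const unsigned VScales[] = {1, 2, 4}; // 128-, 256- and 512-bit SVE implementations
  for (unsigned VS : VScales)
    std::printf("vscale=%u  full=%u  half=%u  quarter=%u  eighth=%u bytes\n",
                VS, widthInBytes(16, VS), widthInBytes(8, VS),
                widthInBytes(4, VS), widthInBytes(2, VS));
  return 0;
}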
@@ -3503,77 +3545,37 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR64RegClass.contains(DestReg) &&
AArch64::FPR64RegClass.contains(SrcReg)) {
- if (Subtarget.hasNEON()) {
- DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
- &AArch64::FPR128RegClass);
- SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
- &AArch64::FPR128RegClass);
- BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
- .addReg(SrcReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- } else {
- BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- }
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
return;
}
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::FPR32RegClass.contains(SrcReg)) {
- if (Subtarget.hasNEON()) {
- DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
- &AArch64::FPR128RegClass);
- SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
- &AArch64::FPR128RegClass);
- BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
- .addReg(SrcReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- } else {
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- }
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
return;
}
if (AArch64::FPR16RegClass.contains(DestReg) &&
AArch64::FPR16RegClass.contains(SrcReg)) {
- if (Subtarget.hasNEON()) {
- DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
- &AArch64::FPR128RegClass);
- SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
- &AArch64::FPR128RegClass);
- BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
- .addReg(SrcReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- } else {
- DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
- &AArch64::FPR32RegClass);
- SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
- &AArch64::FPR32RegClass);
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- }
+ DestReg =
+ RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
+ SrcReg =
+ RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
return;
}
if (AArch64::FPR8RegClass.contains(DestReg) &&
AArch64::FPR8RegClass.contains(SrcReg)) {
- if (Subtarget.hasNEON()) {
- DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
- &AArch64::FPR128RegClass);
- SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
- &AArch64::FPR128RegClass);
- BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
- .addReg(SrcReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- } else {
- DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
- &AArch64::FPR32RegClass);
- SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
- &AArch64::FPR32RegClass);
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- }
+ DestReg =
+ RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
+ SrcReg =
+ RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
return;
}
@@ -4339,6 +4341,10 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
case AArch64::ST1Twov1d:
case AArch64::ST1Threev1d:
case AArch64::ST1Fourv1d:
+ case AArch64::ST1i8:
+ case AArch64::ST1i16:
+ case AArch64::ST1i32:
+ case AArch64::ST1i64:
case AArch64::IRG:
case AArch64::IRGstack:
case AArch64::STGloop:
@@ -4911,6 +4917,55 @@ static bool getFMAPatterns(MachineInstr &Root,
return Found;
}
+static bool getFMULPatterns(MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+ MachineBasicBlock &MBB = *Root.getParent();
+ bool Found = false;
+
+ auto Match = [&](unsigned Opcode, int Operand,
+ MachineCombinerPattern Pattern) -> bool {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineOperand &MO = Root.getOperand(Operand);
+ MachineInstr *MI = nullptr;
+ if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
+ MI = MRI.getUniqueVRegDef(MO.getReg());
+ if (MI && MI->getOpcode() == Opcode) {
+ Patterns.push_back(Pattern);
+ return true;
+ }
+ return false;
+ };
+
+ typedef MachineCombinerPattern MCP;
+
+ switch (Root.getOpcode()) {
+ default:
+ return false;
+ case AArch64::FMULv2f32:
+ Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
+ Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
+ break;
+ case AArch64::FMULv2f64:
+ Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
+ Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
+ break;
+ case AArch64::FMULv4f16:
+ Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
+ Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
+ break;
+ case AArch64::FMULv4f32:
+ Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
+ Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
+ break;
+ case AArch64::FMULv8f16:
+ Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
+ Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
+ break;
+ }
+
+ return Found;
+}
+
/// Return true when a code sequence can improve throughput. It
/// should be called only for instructions in loops.
/// \param Pattern - combiner pattern
@@ -4974,6 +5029,16 @@ bool AArch64InstrInfo::isThroughputPattern(
case MachineCombinerPattern::FMLSv2f64_OP2:
case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
case MachineCombinerPattern::FMLSv4f32_OP2:
+ case MachineCombinerPattern::FMULv2i32_indexed_OP1:
+ case MachineCombinerPattern::FMULv2i32_indexed_OP2:
+ case MachineCombinerPattern::FMULv2i64_indexed_OP1:
+ case MachineCombinerPattern::FMULv2i64_indexed_OP2:
+ case MachineCombinerPattern::FMULv4i16_indexed_OP1:
+ case MachineCombinerPattern::FMULv4i16_indexed_OP2:
+ case MachineCombinerPattern::FMULv4i32_indexed_OP1:
+ case MachineCombinerPattern::FMULv4i32_indexed_OP2:
+ case MachineCombinerPattern::FMULv8i16_indexed_OP1:
+ case MachineCombinerPattern::FMULv8i16_indexed_OP2:
case MachineCombinerPattern::MULADDv8i8_OP1:
case MachineCombinerPattern::MULADDv8i8_OP2:
case MachineCombinerPattern::MULADDv16i8_OP1:
@@ -5030,6 +5095,8 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
if (getMaddPatterns(Root, Patterns))
return true;
// Floating point patterns
+ if (getFMULPatterns(Root, Patterns))
+ return true;
if (getFMAPatterns(Root, Patterns))
return true;
@@ -5118,6 +5185,42 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
return MUL;
}
+/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
+static MachineInstr *
+genIndexedMultiply(MachineInstr &Root,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ unsigned IdxDupOp, unsigned MulOpc,
+ const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
+ assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
+ "Invalid index of FMUL operand");
+
+ MachineFunction &MF = *Root.getMF();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+ MachineInstr *Dup =
+ MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
+
+ Register DupSrcReg = Dup->getOperand(1).getReg();
+ MRI.clearKillFlags(DupSrcReg);
+ MRI.constrainRegClass(DupSrcReg, RC);
+
+ unsigned DupSrcLane = Dup->getOperand(2).getImm();
+
+ unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
+ MachineOperand &MulOp = Root.getOperand(IdxMulOp);
+
+ Register ResultReg = Root.getOperand(0).getReg();
+
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MulOpc), ResultReg)
+ .add(MulOp)
+ .addReg(DupSrcReg)
+ .addImm(DupSrcLane);
+
+ InsInstrs.push_back(MIB);
+ return &Root;
+}
+
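genIndexedMultiply relies on a simple arithmetic identity: multiplying by a register produced by DUP of one lane of y gives the same result as a lane-indexed FMUL that reads that lane of y directly, so the DUP can be dropped. A scalar model of the equivalence in plain C++ (the 4-lane shape and the lane number are illustrative, not taken from the patch):

// fmul(x, dup(y, lane)) == fmul_indexed(x, y, lane), modelled on plain floats.
#include <array>
#include <cassert>

int main() {
  std::array<float, 4> X{1.f, 2.f, 3.f, 4.f}, Y{5.f, 6.f, 7.f, 8.f};
  const unsigned Lane = 2;

  std::array<float, 4> Dup, ViaDup, Indexed;
  Dup.fill(Y[Lane]); // DUPv4i32lane: broadcast one lane of Y
  for (unsigned I = 0; I < 4; ++I) {
    ViaDup[I] = X[I] * Dup[I];   // FMULv4f32 against the broadcast register
    Indexed[I] = X[I] * Y[Lane]; // FMULv4i32_indexed reads the lane directly
  }
  assert(ViaDup == Indexed); // identical results, one fewer instruction
  return 0;
}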
/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
/// instructions.
///
@@ -5329,15 +5432,15 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
uint64_t UImm = SignExtend64(Imm, BitSize);
uint64_t Encoding;
- if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
- MachineInstrBuilder MIB1 =
- BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
- .addReg(ZeroReg)
- .addImm(Encoding);
- InsInstrs.push_back(MIB1);
- InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
- MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
- }
+ if (!AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding))
+ return;
+ MachineInstrBuilder MIB1 =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
+ .addReg(ZeroReg)
+ .addImm(Encoding);
+ InsInstrs.push_back(MIB1);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+ MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
break;
}
case MachineCombinerPattern::MULSUBW_OP1:
@@ -5420,15 +5523,15 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
uint64_t UImm = SignExtend64(-Imm, BitSize);
uint64_t Encoding;
- if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
- MachineInstrBuilder MIB1 =
- BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
- .addReg(ZeroReg)
- .addImm(Encoding);
- InsInstrs.push_back(MIB1);
- InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
- MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
- }
+ if (!AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding))
+ return;
+ MachineInstrBuilder MIB1 =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
+ .addReg(ZeroReg)
+ .addImm(Encoding);
+ InsInstrs.push_back(MIB1);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+ MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
break;
}
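The two hunks above make the same structural change: when processLogicalImmediate cannot encode the immediate, the code now returns early instead of wrapping the BuildMI sequence in a conditional. A minimal guard-clause sketch with stand-in names (tryEncode is illustrative, not an LLVM API):

// Guard-clause refactor: fail fast, then build unconditionally.
#include <cstdint>
#include <cstdio>

// Stand-in for AArch64_AM::processLogicalImmediate (illustrative only).
static bool tryEncode(uint64_t Imm, uint64_t &Encoding) {
  if (Imm == 0 || Imm == ~0ULL)
    return false; // all-zero and all-one masks have no logical-immediate encoding
  Encoding = Imm; // a real encoder would compute the N:immr:imms fields here
  return true;
}

static void emitOrr(uint64_t Imm) {
  uint64_t Encoding;
  if (!tryEncode(Imm, Encoding)) // early return replaces the nested if
    return;
  std::printf("ORR xN, xzr, #0x%llx\n", (unsigned long long)Encoding);
}

int main() {
  emitOrr(0xFF);
  emitOrr(0); // silently skipped, as in the refactored code path
  return 0;
}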
@@ -6076,12 +6179,50 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
break;
}
+ case MachineCombinerPattern::FMULv2i32_indexed_OP1:
+ case MachineCombinerPattern::FMULv2i32_indexed_OP2: {
+ unsigned IdxDupOp =
+ (Pattern == MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1 : 2;
+ genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
+ &AArch64::FPR128RegClass, MRI);
+ break;
+ }
+ case MachineCombinerPattern::FMULv2i64_indexed_OP1:
+ case MachineCombinerPattern::FMULv2i64_indexed_OP2: {
+ unsigned IdxDupOp =
+ (Pattern == MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1 : 2;
+ genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
+ &AArch64::FPR128RegClass, MRI);
+ break;
+ }
+ case MachineCombinerPattern::FMULv4i16_indexed_OP1:
+ case MachineCombinerPattern::FMULv4i16_indexed_OP2: {
+ unsigned IdxDupOp =
+ (Pattern == MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1 : 2;
+ genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
+ &AArch64::FPR128_loRegClass, MRI);
+ break;
+ }
+ case MachineCombinerPattern::FMULv4i32_indexed_OP1:
+ case MachineCombinerPattern::FMULv4i32_indexed_OP2: {
+ unsigned IdxDupOp =
+ (Pattern == MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1 : 2;
+ genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
+ &AArch64::FPR128RegClass, MRI);
+ break;
+ }
+ case MachineCombinerPattern::FMULv8i16_indexed_OP1:
+ case MachineCombinerPattern::FMULv8i16_indexed_OP2: {
+ unsigned IdxDupOp =
+ (Pattern == MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1 : 2;
+ genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
+ &AArch64::FPR128_loRegClass, MRI);
+ break;
+ }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
- // FIXME: This assertion fails in CodeGen/AArch64/tailmerging_in_mbp.ll and
- // CodeGen/AArch64/urem-seteq-nonzero.ll.
- // assert(MUL && "MUL was never set");
- DelInstrs.push_back(MUL);
+ if (MUL)
+ DelInstrs.push_back(MUL);
DelInstrs.push_back(&Root);
}
@@ -6624,13 +6765,8 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front();
for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx();
Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) {
- const std::vector<MCCFIInstruction> &CFIInstructions =
- RepeatedSequenceLocs[0].getMF()->getFrameInstructions();
- if (MBBI->isCFIInstruction()) {
- unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex();
- MCCFIInstruction CFI = CFIInstructions[CFIIndex];
+ if (MBBI->isCFIInstruction())
CFICount++;
- }
MBBI++;
}
@@ -7212,7 +7348,8 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
.setMIFlags(MachineInstr::FrameSetup);
// If v8.3a features are available we can replace a RET instruction by
- // RETAA or RETAB and omit the AUT instructions
+ // RETAA or RETAB and omit the AUT instructions. In this case the
+ // DW_CFA_AARCH64_negate_ra_state can't be emitted.
if (Subtarget.hasPAuth() && MBBAUT != MBB.end() &&
MBBAUT->getOpcode() == AArch64::RET) {
BuildMI(MBB, MBBAUT, DL,
@@ -7225,6 +7362,11 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
TII->get(ShouldSignReturnAddrWithAKey ? AArch64::AUTIASP
: AArch64::AUTIBSP))
.setMIFlag(MachineInstr::FrameDestroy);
+ unsigned CFIIndexAuth =
+ MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
+ BuildMI(MBB, MBBAUT, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndexAuth)
+ .setMIFlags(MachineInstr::FrameDestroy);
}
}
}
@@ -7401,7 +7543,11 @@ MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
unsigned Reg = findRegisterToSaveLRTo(C);
assert(Reg != 0 && "No callee-saved register available?");
- // Save and restore LR from that register.
+ // LR has to be a live-in so that we can save it.
+ if (!MBB.isLiveIn(AArch64::LR))
+ MBB.addLiveIn(AArch64::LR);
+
+ // Save and restore LR from Reg.
Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
.addReg(AArch64::XZR)
.addReg(AArch64::LR)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index e25189e409a3..b2f9e82a7e8b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -227,12 +227,12 @@ public:
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
- Register &SrcReg2, int &CmpMask,
- int &CmpValue) const override;
+ Register &SrcReg2, int64_t &CmpMask,
+ int64_t &CmpValue) const override;
/// optimizeCompareInstr - Convert the instruction supplying the argument to
/// the comparison into one that sets the zero bit in the flags register.
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
- Register SrcReg2, int CmpMask, int CmpValue,
+ Register SrcReg2, int64_t CmpMask, int64_t CmpValue,
const MachineRegisterInfo *MRI) const override;
bool optimizeCondBranch(MachineInstr &MI) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 682cec361728..db8e0c5dac4a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -27,6 +27,21 @@ def HasV8_6a : Predicate<"Subtarget->hasV8_6aOps()">,
AssemblerPredicate<(all_of HasV8_6aOps), "armv8.6a">;
def HasV8_7a : Predicate<"Subtarget->hasV8_7aOps()">,
AssemblerPredicate<(all_of HasV8_7aOps), "armv8.7a">;
+def HasV9_0a : Predicate<"Subtarget->hasV9_0aOps()">,
+ AssemblerPredicate<(all_of HasV9_0aOps), "armv9-a">;
+def HasV9_1a : Predicate<"Subtarget->hasV9_1aOps()">,
+ AssemblerPredicate<(all_of HasV9_1aOps), "armv9.1a">;
+def HasV9_2a : Predicate<"Subtarget->hasV9_2aOps()">,
+ AssemblerPredicate<(all_of HasV9_2aOps), "armv9.2a">;
+def HasV8_0r : Predicate<"Subtarget->hasV8_0rOps()">,
+ AssemblerPredicate<(all_of HasV8_0rOps), "armv8-r">;
+
+def HasEL2VMSA : Predicate<"Subtarget->hasEL2VMSA()">,
+ AssemblerPredicate<(all_of FeatureEL2VMSA), "el2vmsa">;
+
+def HasEL3 : Predicate<"Subtarget->hasEL3()">,
+ AssemblerPredicate<(all_of FeatureEL3), "el3">;
+
def HasVH : Predicate<"Subtarget->hasVH()">,
AssemblerPredicate<(all_of FeatureVH), "vh">;
@@ -63,9 +78,6 @@ def HasAM : Predicate<"Subtarget->hasAM()">,
def HasSEL2 : Predicate<"Subtarget->hasSEL2()">,
AssemblerPredicate<(all_of FeatureSEL2), "sel2">;
-def HasPMU : Predicate<"Subtarget->hasPMU()">,
- AssemblerPredicate<(all_of FeaturePMU), "pmu">;
-
def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">,
AssemblerPredicate<(all_of FeatureTLB_RMI), "tlb-rmi">;
@@ -128,6 +140,24 @@ def HasSMEF64 : Predicate<"Subtarget->hasSMEF64()">,
AssemblerPredicate<(all_of FeatureSMEF64), "sme-f64">;
def HasSMEI64 : Predicate<"Subtarget->hasSMEI64()">,
AssemblerPredicate<(all_of FeatureSMEI64), "sme-i64">;
+def HasStreamingSVE : Predicate<"Subtarget->hasStreamingSVE()">,
+ AssemblerPredicate<(all_of FeatureStreamingSVE), "streaming-sve">;
+// A subset of SVE(2) instructions are legal in Streaming SVE execution mode,
+// so they should be enabled if either feature has been specified.
+def HasSVEorStreamingSVE
+ : Predicate<"Subtarget->hasSVE() || Subtarget->hasStreamingSVE()">,
+ AssemblerPredicate<(any_of FeatureSVE, FeatureStreamingSVE),
+ "streaming-sve or sve">;
+def HasSVE2orStreamingSVE
+ : Predicate<"Subtarget->hasSVE2() || Subtarget->hasStreamingSVE()">,
+ AssemblerPredicate<(any_of FeatureSVE2, FeatureStreamingSVE),
+ "streaming-sve or sve2">;
+// A subset of NEON instructions are legal in Streaming SVE execution mode,
+// so they should be enabled if either feature has been specified.
+def HasNEONorStreamingSVE
+ : Predicate<"Subtarget->hasNEON() || Subtarget->hasStreamingSVE()">,
+ AssemblerPredicate<(any_of FeatureNEON, FeatureStreamingSVE),
+ "streaming-sve or neon">;
def HasRCPC : Predicate<"Subtarget->hasRCPC()">,
AssemblerPredicate<(all_of FeatureRCPC), "rcpc">;
def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">,
@@ -180,6 +210,8 @@ def UseNegativeImmediates
: Predicate<"false">, AssemblerPredicate<(all_of (not FeatureNoNegativeImmediates)),
"NegativeImmediates">;
+def UseScalarIncVL : Predicate<"Subtarget->useScalarIncVL()">;
+
def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisInt<1>]>>;
@@ -673,40 +705,40 @@ let isReMaterializable = 1, isCodeGenOnly = 1 in {
// removed, along with the AArch64Wrapper node.
let AddedComplexity = 10 in
-def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr),
- [(set GPR64:$dst, (AArch64LOADgot tglobaladdr:$addr))]>,
+def LOADgot : Pseudo<(outs GPR64common:$dst), (ins i64imm:$addr),
+ [(set GPR64common:$dst, (AArch64LOADgot tglobaladdr:$addr))]>,
Sched<[WriteLDAdr]>;
// The MOVaddr instruction should match only when the add is not folded
// into a load or store address.
def MOVaddr
- : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
- [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaladdr:$hi),
+ : Pseudo<(outs GPR64common:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64common:$dst, (AArch64addlow (AArch64adrp tglobaladdr:$hi),
tglobaladdr:$low))]>,
Sched<[WriteAdrAdr]>;
def MOVaddrJT
- : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
- [(set GPR64:$dst, (AArch64addlow (AArch64adrp tjumptable:$hi),
+ : Pseudo<(outs GPR64common:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64common:$dst, (AArch64addlow (AArch64adrp tjumptable:$hi),
tjumptable:$low))]>,
Sched<[WriteAdrAdr]>;
def MOVaddrCP
- : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
- [(set GPR64:$dst, (AArch64addlow (AArch64adrp tconstpool:$hi),
+ : Pseudo<(outs GPR64common:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64common:$dst, (AArch64addlow (AArch64adrp tconstpool:$hi),
tconstpool:$low))]>,
Sched<[WriteAdrAdr]>;
def MOVaddrBA
- : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
- [(set GPR64:$dst, (AArch64addlow (AArch64adrp tblockaddress:$hi),
+ : Pseudo<(outs GPR64common:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64common:$dst, (AArch64addlow (AArch64adrp tblockaddress:$hi),
tblockaddress:$low))]>,
Sched<[WriteAdrAdr]>;
def MOVaddrTLS
- : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
- [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaltlsaddr:$hi),
+ : Pseudo<(outs GPR64common:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64common:$dst, (AArch64addlow (AArch64adrp tglobaltlsaddr:$hi),
tglobaltlsaddr:$low))]>,
Sched<[WriteAdrAdr]>;
def MOVaddrEXT
- : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
- [(set GPR64:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi),
+ : Pseudo<(outs GPR64common:$dst), (ins i64imm:$hi, i64imm:$low),
+ [(set GPR64common:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi),
texternalsym:$low))]>,
Sched<[WriteAdrAdr]>;
// Normally AArch64addlow either gets folded into a following ldr/str,
@@ -714,8 +746,8 @@ def MOVaddrEXT
// might appear without either of them, so allow lowering it into a plain
// add.
def ADDlowTLS
- : Pseudo<(outs GPR64:$dst), (ins GPR64:$src, i64imm:$low),
- [(set GPR64:$dst, (AArch64addlow GPR64:$src,
+ : Pseudo<(outs GPR64sp:$dst), (ins GPR64sp:$src, i64imm:$low),
+ [(set GPR64sp:$dst, (AArch64addlow GPR64sp:$src,
tglobaltlsaddr:$low))]>,
Sched<[WriteAdr]>;
@@ -855,7 +887,7 @@ defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", AArch64udot>;
}
// ARMv8.6-A BFloat
-let Predicates = [HasBF16] in {
+let Predicates = [HasNEON, HasBF16] in {
defm BFDOT : SIMDThreeSameVectorBFDot<1, "bfdot">;
defm BF16DOTlane : SIMDThreeSameVectorBF16DotI<0, "bfdot">;
def BFMMLA : SIMDThreeSameVectorBF16MatrixMul<"bfmmla">;
@@ -865,7 +897,6 @@ def BFMLALBIdx : SIMDBF16MLALIndex<0, "bfmlalb", int_aarch64_neon_bfmlalb>;
def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
def BFCVTN : SIMD_BFCVTN;
def BFCVTN2 : SIMD_BFCVTN2;
-def BFCVT : BF16ToSinglePrecision<"bfcvt">;
// Vector-scalar BFDOT:
// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit
@@ -885,6 +916,10 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot
VectorIndexS:$idx)>;
}
+let Predicates = [HasNEONorStreamingSVE, HasBF16] in {
+def BFCVT : BF16ToSinglePrecision<"bfcvt">;
+}
+
// ARMv8.6A AArch64 matrix multiplication
let Predicates = [HasMatMulInt8] in {
def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>;
@@ -958,6 +993,15 @@ def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v8i16>;
def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v4i32>;
def : SHA3_pattern<EOR3, int_aarch64_crypto_eor3u, v2i64>;
+class EOR3_pattern<ValueType VecTy>
+ : Pat<(xor (xor (VecTy V128:$Vn), (VecTy V128:$Vm)), (VecTy V128:$Va)),
+ (EOR3 (VecTy V128:$Vn), (VecTy V128:$Vm), (VecTy V128:$Va))>;
+
+def : EOR3_pattern<v16i8>;
+def : EOR3_pattern<v8i16>;
+def : EOR3_pattern<v4i32>;
+def : EOR3_pattern<v2i64>;
+
def : SHA3_pattern<BCAX, int_aarch64_crypto_bcaxu, v16i8>;
def : SHA3_pattern<BCAX, int_aarch64_crypto_bcaxu, v8i16>;
def : SHA3_pattern<BCAX, int_aarch64_crypto_bcaxu, v4i32>;
@@ -1034,8 +1078,7 @@ defm FCMLA : SIMDThreeSameVectorTiedComplexHSD<1, 0b110, complexrotateop,
"fcmla", null_frag>;
defm FCADD : SIMDThreeSameVectorComplexHSD<1, 0b111, complexrotateopodd,
"fcadd", null_frag>;
-defm FCMLA : SIMDIndexedTiedComplexHSD<1, 0, 1, complexrotateop, "fcmla",
- null_frag>;
+defm FCMLA : SIMDIndexedTiedComplexHSD<0, 1, complexrotateop, "fcmla">;
let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
def : Pat<(v4f16 (int_aarch64_neon_vcadd_rot90 (v4f16 V64:$Rn), (v4f16 V64:$Rm))),
@@ -1172,23 +1215,25 @@ let Predicates = [HasPAuth] in {
def : InstAlias<"autib1716", (AUTIB1716), 1>;
def : InstAlias<"xpaclri", (XPACLRI), 1>;
- multiclass SignAuth<bits<3> prefix, bits<3> prefix_z, string asm> {
- def IA : SignAuthOneData<prefix, 0b00, !strconcat(asm, "ia")>;
- def IB : SignAuthOneData<prefix, 0b01, !strconcat(asm, "ib")>;
- def DA : SignAuthOneData<prefix, 0b10, !strconcat(asm, "da")>;
- def DB : SignAuthOneData<prefix, 0b11, !strconcat(asm, "db")>;
- def IZA : SignAuthZero<prefix_z, 0b00, !strconcat(asm, "iza")>;
- def DZA : SignAuthZero<prefix_z, 0b10, !strconcat(asm, "dza")>;
- def IZB : SignAuthZero<prefix_z, 0b01, !strconcat(asm, "izb")>;
- def DZB : SignAuthZero<prefix_z, 0b11, !strconcat(asm, "dzb")>;
+ multiclass SignAuth<bits<3> prefix, bits<3> prefix_z, string asm,
+ SDPatternOperator op> {
+ def IA : SignAuthOneData<prefix, 0b00, !strconcat(asm, "ia"), op>;
+ def IB : SignAuthOneData<prefix, 0b01, !strconcat(asm, "ib"), op>;
+ def DA : SignAuthOneData<prefix, 0b10, !strconcat(asm, "da"), op>;
+ def DB : SignAuthOneData<prefix, 0b11, !strconcat(asm, "db"), op>;
+ def IZA : SignAuthZero<prefix_z, 0b00, !strconcat(asm, "iza"), op>;
+ def DZA : SignAuthZero<prefix_z, 0b10, !strconcat(asm, "dza"), op>;
+ def IZB : SignAuthZero<prefix_z, 0b01, !strconcat(asm, "izb"), op>;
+ def DZB : SignAuthZero<prefix_z, 0b11, !strconcat(asm, "dzb"), op>;
}
- defm PAC : SignAuth<0b000, 0b010, "pac">;
- defm AUT : SignAuth<0b001, 0b011, "aut">;
+ defm PAC : SignAuth<0b000, 0b010, "pac", int_ptrauth_sign>;
+ defm AUT : SignAuth<0b001, 0b011, "aut", null_frag>;
def XPACI : ClearAuth<0, "xpaci">;
def XPACD : ClearAuth<1, "xpacd">;
- def PACGA : SignAuthTwoOperand<0b1100, "pacga", null_frag>;
+
+ def PACGA : SignAuthTwoOperand<0b1100, "pacga", int_ptrauth_sign_generic>;
// Combined Instructions
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
@@ -1272,6 +1317,7 @@ def : InstAlias<"clrex", (CLREX 0xf)>;
def : InstAlias<"isb", (ISB 0xf)>;
def : InstAlias<"ssbb", (DSB 0)>;
def : InstAlias<"pssbb", (DSB 4)>;
+def : InstAlias<"dfb", (DSB 0b1100)>, Requires<[HasV8_0r]>;
def MRS : MRSI;
def MSR : MSRI;
@@ -1325,7 +1371,7 @@ def TSTART : TMSystemI<0b0000, "tstart",
def TCOMMIT : TMSystemINoOperand<0b0000, "tcommit", [(int_aarch64_tcommit)]>;
def TCANCEL : TMSystemException<0b011, "tcancel",
- [(int_aarch64_tcancel i64_imm0_65535:$imm)]>;
+ [(int_aarch64_tcancel timm64_0_65535:$imm)]>;
def TTEST : TMSystemI<0b0001, "ttest", [(set GPR64:$Rt, (int_aarch64_ttest))]> {
let mayLoad = 0;
@@ -1344,12 +1390,12 @@ let PostEncoderMethod = "fixMOVZ" in
defm MOVZ : MoveImmediate<0b10, "movz">;
// First group of aliases covers an implicit "lsl #0".
-def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, i32_imm0_65535:$imm, 0), 0>;
-def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, i32_imm0_65535:$imm, 0), 0>;
-def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, i32_imm0_65535:$imm, 0)>;
-def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, i32_imm0_65535:$imm, 0)>;
-def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, i32_imm0_65535:$imm, 0)>;
-def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, i32_imm0_65535:$imm, 0)>;
+def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, timm32_0_65535:$imm, 0), 0>;
+def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, timm32_0_65535:$imm, 0), 0>;
+def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, timm32_0_65535:$imm, 0)>;
+def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, timm32_0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, timm32_0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, timm32_0_65535:$imm, 0)>;
// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax.
def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g3:$sym, 48)>;
@@ -1620,8 +1666,8 @@ def : ShiftAlias<"rorv", RORVXr, GPR64>;
// Multiply-add
let AddedComplexity = 5 in {
-defm MADD : MulAccum<0, "madd", add>;
-defm MSUB : MulAccum<1, "msub", sub>;
+defm MADD : MulAccum<0, "madd">;
+defm MSUB : MulAccum<1, "msub">;
def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)),
(MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
@@ -2334,16 +2380,16 @@ def BRK : ExceptionGeneration<0b001, 0b00, "brk">;
}
def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">;
def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">;
-def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">;
+def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">, Requires<[HasEL3]>;
def HLT : ExceptionGeneration<0b010, 0b00, "hlt">;
def HVC : ExceptionGeneration<0b000, 0b10, "hvc">;
-def SMC : ExceptionGeneration<0b000, 0b11, "smc">;
+def SMC : ExceptionGeneration<0b000, 0b11, "smc">, Requires<[HasEL3]>;
def SVC : ExceptionGeneration<0b000, 0b01, "svc">;
// DCPSn defaults to an immediate operand of zero if unspecified.
def : InstAlias<"dcps1", (DCPS1 0)>;
def : InstAlias<"dcps2", (DCPS2 0)>;
-def : InstAlias<"dcps3", (DCPS3 0)>;
+def : InstAlias<"dcps3", (DCPS3 0)>, Requires<[HasEL3]>;
def UDF : UDFType<0, "udf">;
@@ -3114,7 +3160,7 @@ defm STRB : Store8RO< 0b00, 1, 0b00, FPR8Op, "str", untyped, store>;
defm STRH : Store16RO<0b01, 1, 0b00, FPR16Op, "str", f16, store>;
defm STRS : Store32RO<0b10, 1, 0b00, FPR32Op, "str", f32, store>;
defm STRD : Store64RO<0b11, 1, 0b00, FPR64Op, "str", f64, store>;
-defm STRQ : Store128RO<0b00, 1, 0b10, FPR128Op, "str", f128, store>;
+defm STRQ : Store128RO<0b00, 1, 0b10, FPR128Op, "str">;
let Predicates = [UseSTRQro], AddedComplexity = 10 in {
def : Pat<(store (f128 FPR128:$Rt),
@@ -3710,35 +3756,56 @@ defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>;
// AArch64's FCVT instructions saturate when out of range.
multiclass FPToIntegerSatPats<SDNode to_int_sat, string INST> {
+ let Predicates = [HasFullFP16] in {
def : Pat<(i32 (to_int_sat f16:$Rn, i32)),
(!cast<Instruction>(INST # UWHr) f16:$Rn)>;
- def : Pat<(i32 (to_int_sat f32:$Rn, i32)),
- (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
- def : Pat<(i32 (to_int_sat f64:$Rn, i32)),
- (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
def : Pat<(i64 (to_int_sat f16:$Rn, i64)),
(!cast<Instruction>(INST # UXHr) f16:$Rn)>;
+ }
+ def : Pat<(i32 (to_int_sat f32:$Rn, i32)),
+ (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
def : Pat<(i64 (to_int_sat f32:$Rn, i64)),
(!cast<Instruction>(INST # UXSr) f32:$Rn)>;
+ def : Pat<(i32 (to_int_sat f64:$Rn, i32)),
+ (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
def : Pat<(i64 (to_int_sat f64:$Rn, i64)),
(!cast<Instruction>(INST # UXDr) f64:$Rn)>;
+
+ let Predicates = [HasFullFP16] in {
+ def : Pat<(i32 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i32:$scale), i32)),
+ (!cast<Instruction>(INST # SWHri) $Rn, $scale)>;
+ def : Pat<(i64 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i64:$scale), i64)),
+ (!cast<Instruction>(INST # SXHri) $Rn, $scale)>;
+ }
+ def : Pat<(i32 (to_int_sat (fmul f32:$Rn, fixedpoint_f32_i32:$scale), i32)),
+ (!cast<Instruction>(INST # SWSri) $Rn, $scale)>;
+ def : Pat<(i64 (to_int_sat (fmul f32:$Rn, fixedpoint_f32_i64:$scale), i64)),
+ (!cast<Instruction>(INST # SXSri) $Rn, $scale)>;
+ def : Pat<(i32 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i32:$scale), i32)),
+ (!cast<Instruction>(INST # SWDri) $Rn, $scale)>;
+ def : Pat<(i64 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i64:$scale), i64)),
+ (!cast<Instruction>(INST # SXDri) $Rn, $scale)>;
}
defm : FPToIntegerSatPats<fp_to_sint_sat, "FCVTZS">;
defm : FPToIntegerSatPats<fp_to_uint_sat, "FCVTZU">;
multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
+ let Predicates = [HasFullFP16] in {
def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # UWHr) $Rn)>;
def : Pat<(i64 (round f16:$Rn)), (!cast<Instruction>(INST # UXHr) $Rn)>;
+ }
def : Pat<(i32 (round f32:$Rn)), (!cast<Instruction>(INST # UWSr) $Rn)>;
def : Pat<(i64 (round f32:$Rn)), (!cast<Instruction>(INST # UXSr) $Rn)>;
def : Pat<(i32 (round f64:$Rn)), (!cast<Instruction>(INST # UWDr) $Rn)>;
def : Pat<(i64 (round f64:$Rn)), (!cast<Instruction>(INST # UXDr) $Rn)>;
+ let Predicates = [HasFullFP16] in {
def : Pat<(i32 (round (fmul f16:$Rn, fixedpoint_f16_i32:$scale))),
(!cast<Instruction>(INST # SWHri) $Rn, $scale)>;
def : Pat<(i64 (round (fmul f16:$Rn, fixedpoint_f16_i64:$scale))),
(!cast<Instruction>(INST # SXHri) $Rn, $scale)>;
+ }
def : Pat<(i32 (round (fmul f32:$Rn, fixedpoint_f32_i32:$scale))),
(!cast<Instruction>(INST # SWSri) $Rn, $scale)>;
def : Pat<(i64 (round (fmul f32:$Rn, fixedpoint_f32_i64:$scale))),
@@ -3763,10 +3830,12 @@ multiclass FPToIntegerPats<SDNode to_int, SDNode to_int_sat, SDNode round, strin
(!cast<Instruction>(INST # UXDr) f64:$Rn)>;
// These instructions saturate like fp_to_[su]int_sat.
+ let Predicates = [HasFullFP16] in {
def : Pat<(i32 (to_int_sat (round f16:$Rn), i32)),
(!cast<Instruction>(INST # UWHr) f16:$Rn)>;
def : Pat<(i64 (to_int_sat (round f16:$Rn), i64)),
(!cast<Instruction>(INST # UXHr) f16:$Rn)>;
+ }
def : Pat<(i32 (to_int_sat (round f32:$Rn), i32)),
(!cast<Instruction>(INST # UWSr) f32:$Rn)>;
def : Pat<(i64 (to_int_sat (round f32:$Rn), i64)),
@@ -4127,6 +4196,22 @@ defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>;
defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>;
+// AArch64's FCVT instructions saturate when out of range.
+multiclass SIMDTwoVectorFPToIntSatPats<SDNode to_int_sat, string INST> {
+ def : Pat<(v4i16 (to_int_sat v4f16:$Rn, i16)),
+ (!cast<Instruction>(INST # v4f16) v4f16:$Rn)>;
+ def : Pat<(v8i16 (to_int_sat v8f16:$Rn, i16)),
+ (!cast<Instruction>(INST # v8f16) v8f16:$Rn)>;
+ def : Pat<(v2i32 (to_int_sat v2f32:$Rn, i32)),
+ (!cast<Instruction>(INST # v2f32) v2f32:$Rn)>;
+ def : Pat<(v4i32 (to_int_sat v4f32:$Rn, i32)),
+ (!cast<Instruction>(INST # v4f32) v4f32:$Rn)>;
+ def : Pat<(v2i64 (to_int_sat v2f64:$Rn, i64)),
+ (!cast<Instruction>(INST # v2f64) v2f64:$Rn)>;
+}
+defm : SIMDTwoVectorFPToIntSatPats<fp_to_sint_sat, "FCVTZS">;
+defm : SIMDTwoVectorFPToIntSatPats<fp_to_uint_sat, "FCVTZU">;
+
def : Pat<(v4i16 (int_aarch64_neon_fcvtzs v4f16:$Rn)), (FCVTZSv4f16 $Rn)>;
def : Pat<(v8i16 (int_aarch64_neon_fcvtzs v8f16:$Rn)), (FCVTZSv8f16 $Rn)>;
def : Pat<(v2i32 (int_aarch64_neon_fcvtzs v2f32:$Rn)), (FCVTZSv2f32 $Rn)>;
@@ -4606,9 +4691,9 @@ defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
-defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx>;
-defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps>;
-defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts>;
+defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONorStreamingSVE>;
+defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONorStreamingSVE>;
+defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONorStreamingSVE>;
defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
@@ -4707,9 +4792,9 @@ defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">;
def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">;
defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">;
-defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">;
-defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">;
-defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">;
+defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe", HasNEONorStreamingSVE>;
+defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx", HasNEONorStreamingSVE>;
+defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte", HasNEONorStreamingSVE>;
defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg",
UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>;
@@ -5211,7 +5296,7 @@ def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd),
// AdvSIMD scalar CPY instruction
//----------------------------------------------------------------------------
-defm CPY : SIMDScalarCPY<"cpy">;
+defm CPY : SIMDScalarCPY<"mov">;
//----------------------------------------------------------------------------
// AdvSIMD scalar pairwise instructions
@@ -5693,7 +5778,7 @@ def : Pat<(i32 (vector_extract (v8i16 (insert_subvector undef,
(v4i16 (AArch64uaddv (v4i16 (AArch64uaddlp (v8i8 V64:$op))))),
(i64 0))), (i64 0))),
(EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
- (UADDLVv4i16v V64:$op), ssub), ssub)>;
+ (UADDLVv8i8v V64:$op), hsub), ssub)>;
def : Pat<(i32 (vector_extract (v8i16 (AArch64uaddv (v8i16 (AArch64uaddlp
(v16i8 V128:$op))))), (i64 0))),
(EXTRACT_SUBREG (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
@@ -6964,9 +7049,9 @@ def AESIMCrr : AESInst< 0b0111, "aesimc", int_aarch64_crypto_aesimc>;
// for AES fusion on some CPUs.
let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in {
def AESMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">,
- Sched<[WriteV]>;
+ Sched<[WriteVq]>;
def AESIMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">,
- Sched<[WriteV]>;
+ Sched<[WriteVq]>;
}
// Only use constrained versions of AES(I)MC instructions if they are paired with
@@ -8092,6 +8177,20 @@ let AddedComplexity = 10 in {
// FIXME: add SVE dot-product patterns.
}
+// Custom DAG nodes and isel rules to make a 64-byte block out of eight GPRs,
+// so that it can be used as input to inline asm, and vice versa.
+def LS64_BUILD : SDNode<"AArch64ISD::LS64_BUILD", SDTypeProfile<1, 8, []>>;
+def LS64_EXTRACT : SDNode<"AArch64ISD::LS64_EXTRACT", SDTypeProfile<1, 2, []>>;
+def : Pat<(i64x8 (LS64_BUILD GPR64:$x0, GPR64:$x1, GPR64:$x2, GPR64:$x3,
+ GPR64:$x4, GPR64:$x5, GPR64:$x6, GPR64:$x7)),
+ (REG_SEQUENCE GPR64x8Class,
+ $x0, x8sub_0, $x1, x8sub_1, $x2, x8sub_2, $x3, x8sub_3,
+ $x4, x8sub_4, $x5, x8sub_5, $x6, x8sub_6, $x7, x8sub_7)>;
+foreach i = 0-7 in {
+ def : Pat<(i64 (LS64_EXTRACT (i64x8 GPR64x8:$val), (i32 i))),
+ (EXTRACT_SUBREG $val, !cast<SubRegIndex>("x8sub_"#i))>;
+}
+
let Predicates = [HasLS64] in {
def LD64B: LoadStore64B<0b101, "ld64b", (ins GPR64sp:$Rn),
(outs GPR64x8:$Rt)>;
@@ -8114,6 +8213,10 @@ def StoreSwiftAsyncContext
: Pseudo<(outs), (ins GPR64:$ctx, GPR64sp:$base, simm9:$offset),
[]>, Sched<[]>;
+def AArch64AssertZExtBool : SDNode<"AArch64ISD::ASSERT_ZEXT_BOOL", SDT_assert>;
+def : Pat<(AArch64AssertZExtBool GPR32:$op),
+ (i32 GPR32:$op)>;
+
include "AArch64InstrAtomics.td"
include "AArch64SVEInstrInfo.td"
include "AArch64SMEInstrInfo.td"
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index bf042c83294a..3a836ac33064 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -1613,8 +1613,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// If the stored value and the address of the second instruction is
// the same, it needs to be using the updated register and therefore
// it must not be folded.
- bool IsMIRegTheSame =
- getLdStRegOp(MI).getReg() == getLdStBaseOp(MI).getReg();
+ bool IsMIRegTheSame = TRI->regsOverlap(getLdStRegOp(MI).getReg(),
+ getLdStBaseOp(MI).getReg());
if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
IsMIRegTheSame) {
LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
diff --git a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
index be19d4953857..487e1f6162b9 100644
--- a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
@@ -363,7 +363,7 @@ static bool shouldUseFrameHelper(MachineBasicBlock &MBB,
int InstCount = RegCount / 2;
// Do not use a helper call when not saving LR.
- if (std::find(Regs.begin(), Regs.end(), AArch64::LR) == Regs.end())
+ if (!llvm::is_contained(Regs, AArch64::LR))
return false;
switch (Type) {
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
new file mode 100644
index 000000000000..42db18332f1c
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -0,0 +1,293 @@
+//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs the following peephole optimizations at the MIR level.
+//
+// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
+// MOVi64imm + ANDXrr ==> ANDXri + ANDXri
+//
+// The mov pseudo instruction could be expanded to multiple mov instructions
+// later. In this case, we could try to split the constant operand of the mov
+// instruction into two bitmask immediates. This yields two AND instructions
+// instead of a multi-instruction `mov` + `and` sequence.
+//
+// 2. Remove redundant ORRWrs which is generated by zero-extend.
+//
+// %3:gpr32 = ORRWrs $wzr, %2, 0
+// %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
+//
+// If the source operand of the ORRWrs is defined by a 32-bit form of an
+// AArch64 instruction, the ORRWrs can be removed because the upper 32 bits of
+// the source operand are already set to zero.
+//
+//===----------------------------------------------------------------------===//
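
A minimal standalone sketch of the bitmask-split arithmetic behind optimization 1, shown here for illustration only and not part of this patch. It uses the example constant from splitBitmaskImm below, with GCC/Clang bit-scan builtins standing in for LLVM's countTrailingZeros/Log2_64:

  #include <cassert>
  #include <cstdint>

  int main() {
    // 0x00200400 has only bits 21 and 10 set; it is not a valid logical immediate.
    uint32_t Imm = 0x00200400;
    unsigned LowestBitSet = __builtin_ctz(Imm);        // 10
    unsigned HighestBitSet = 31 - __builtin_clz(Imm);  // 21
    // A contiguous run of ones covering bits [10, 21]: 0x003FFC00.
    uint32_t Mask1 = (2u << HighestBitSet) - (1u << LowestBitSet);
    // Ones everywhere outside that run, plus the original bits: 0xFFE007FF.
    uint32_t Mask2 = Imm | ~Mask1;
    // Both masks are (rotated) runs of ones, so each is encodable as a single
    // ANDWri immediate, and ANDing them together reproduces the original constant.
    assert((Mask1 & Mask2) == Imm);
    return 0;
  }
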
+
+#include "AArch64ExpandImm.h"
+#include "AArch64InstrInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-mi-peephole-opt"
+
+namespace {
+
+struct AArch64MIPeepholeOpt : public MachineFunctionPass {
+ static char ID;
+
+ AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
+ initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
+ }
+
+ const AArch64InstrInfo *TII;
+ MachineLoopInfo *MLI;
+ MachineRegisterInfo *MRI;
+
+ template <typename T>
+ bool visitAND(MachineInstr &MI,
+ SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
+ bool visitORR(MachineInstr &MI,
+ SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "AArch64 MI Peephole Optimization pass";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+char AArch64MIPeepholeOpt::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
+ "AArch64 MI Peephole Optimization", false, false)
+
+template <typename T>
+static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
+ T UImm = static_cast<T>(Imm);
+ if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
+ return false;
+
+ // If this immediate can be handled by one instruction, do not split it.
+ SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+ AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
+ if (Insn.size() == 1)
+ return false;
+
+ // A bitmask immediate consists of consecutive ones. For example, the
+ // constant 0b00000000001000000000010000000000 does not consist of
+ // consecutive ones, but it can be split into two bitmask immediates such as
+ // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
+ // ANDing these two bitmask immediates together reproduces the original one.
+ unsigned LowestBitSet = countTrailingZeros(UImm);
+ unsigned HighestBitSet = Log2_64(UImm);
+
+ // Create a mask filled with ones from the position of the lowest set bit
+ // to the position of the highest set bit.
+ T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
+ (static_cast<T>(1) << LowestBitSet);
+ // Create a mask filled with ones everywhere outside the range from the
+ // lowest set bit to the highest set bit.
+ T NewImm2 = UImm | ~NewImm1;
+
+ // If the split value is not a valid bitmask immediate, do not split this
+ // constant.
+ if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
+ return false;
+
+ Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
+ Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
+ return true;
+}
+
+template <typename T>
+bool AArch64MIPeepholeOpt::visitAND(
+ MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
+ // Try the transformation below.
+ //
+ // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
+ // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
+ //
+ // The mov pseudo instruction could be expanded to multiple mov instructions
+ // later. Try to split the constant operand of the mov instruction into two
+ // bitmask immediates, which results in only two AND instructions instead of
+ // a multi-instruction mov + and sequence.
+
+ unsigned RegSize = sizeof(T) * 8;
+ assert((RegSize == 32 || RegSize == 64) &&
+ "Invalid RegSize for AND bitmask peephole optimization");
+
+ // If the AND's block is inside a loop, only do the transform when the AND is
+ // loop invariant.
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineLoop *L = MLI->getLoopFor(MBB);
+ if (L && !L->isLoopInvariant(MI))
+ return false;
+
+ // Check whether the AND's source operand is a MOV with an immediate.
+ MachineInstr *MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
+ if (!MovMI)
+ return false;
+
+ MachineInstr *SubregToRegMI = nullptr;
+ // If it is SUBREG_TO_REG, check its operand.
+ if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
+ SubregToRegMI = MovMI;
+ MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
+ if (!MovMI)
+ return false;
+ }
+
+ if (MovMI->getOpcode() != AArch64::MOVi32imm &&
+ MovMI->getOpcode() != AArch64::MOVi64imm)
+ return false;
+
+ // If the MOV has multiple uses, do not split the immediate because doing so
+ // would create more instructions.
+ if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
+ return false;
+
+ if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
+ return false;
+
+ // Split the constant operand of the MOV into two bitmask immediates.
+ T UImm = static_cast<T>(MovMI->getOperand(1).getImm());
+ // For the 32-bit form of an instruction, the upper 32 bits of the destination
+ // register are set to zero. If there is a SUBREG_TO_REG, clear the upper 32
+ // bits of UImm accordingly.
+ if (SubregToRegMI)
+ UImm &= 0xFFFFFFFF;
+ T Imm1Enc;
+ T Imm2Enc;
+ if (!splitBitmaskImm(UImm, RegSize, Imm1Enc, Imm2Enc))
+ return false;
+
+ // Create new AND MIs.
+ DebugLoc DL = MI.getDebugLoc();
+ const TargetRegisterClass *ANDImmRC =
+ (RegSize == 32) ? &AArch64::GPR32spRegClass : &AArch64::GPR64spRegClass;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register NewTmpReg = MRI->createVirtualRegister(ANDImmRC);
+ Register NewDstReg = MRI->createVirtualRegister(ANDImmRC);
+ unsigned Opcode = (RegSize == 32) ? AArch64::ANDWri : AArch64::ANDXri;
+
+ MRI->constrainRegClass(NewTmpReg, MRI->getRegClass(SrcReg));
+ BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
+ .addReg(SrcReg)
+ .addImm(Imm1Enc);
+
+ MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));
+ BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
+ .addReg(NewTmpReg)
+ .addImm(Imm2Enc);
+
+ MRI->replaceRegWith(DstReg, NewDstReg);
+ // replaceRegWith also changed MI's definition register; restore it so that
+ // SSA form is preserved until MI is deleted.
+ MI.getOperand(0).setReg(DstReg);
+
+ ToBeRemoved.insert(&MI);
+ if (SubregToRegMI)
+ ToBeRemoved.insert(SubregToRegMI);
+ ToBeRemoved.insert(MovMI);
+
+ return true;
+}
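
For illustration, a hypothetical and simplified MIR fragment for the rewrite visitAND performs on the constant 0x00200400. The ANDWri immediate operands are the logical-immediate encodings of the two masks and are written symbolically here; the pass actually creates fresh virtual registers and redirects the uses of the old destination:

    %1:gpr32 = MOVi32imm 2098176
    %2:gpr32 = ANDWrr %0:gpr32, %1:gpr32

becomes

    %3:gpr32 = ANDWri %0:gpr32, <enc(0x003FFC00)>
    %2:gpr32 = ANDWri %3:gpr32, <enc(0xFFE007FF)>

with the MOVi32imm and the original ANDWrr removed (2098176 is 0x00200400, which would otherwise be expanded to a MOVZ + MOVK pair).
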
+
+bool AArch64MIPeepholeOpt::visitORR(
+ MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
+ // Check whether this ORR comes from the zero-extend pattern below.
+ //
+ // def : Pat<(i64 (zext GPR32:$src)),
+ // (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
+ if (MI.getOperand(3).getImm() != 0)
+ return false;
+
+ if (MI.getOperand(1).getReg() != AArch64::WZR)
+ return false;
+
+ MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
+ if (!SrcMI)
+ return false;
+
+ // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
+ //
+ // When you use the 32-bit form of an instruction, the upper 32 bits of the
+ // source registers are ignored and the upper 32 bits of the destination
+ // register are set to zero.
+ //
+ // If the source operand of the zero-extend is defined by a 32-bit form of an
+ // AArch64 instruction, the zero-extend is not needed. Check that SrcMI's
+ // opcode is a real AArch64 instruction; if it is not, conservatively do not
+ // process it.
+ if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
+ return false;
+
+ Register DefReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(2).getReg();
+ MRI->replaceRegWith(DefReg, SrcReg);
+ MRI->clearKillFlags(SrcReg);
+ // replaceRegWith also changed MI's definition register; restore it so that
+ // SSA form is preserved until MI is deleted.
+ MI.getOperand(0).setReg(DefReg);
+ ToBeRemoved.insert(&MI);
+
+ LLVM_DEBUG({ dbgs() << "Removed: " << MI << "\n"; });
+
+ return true;
+}
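
Likewise, a hypothetical MIR fragment for the ORR case; ADDWrr stands in here for any real 32-bit AArch64 instruction, whose 32-bit form already zeroes the upper 32 bits of its destination:

    %2:gpr32 = ADDWrr %0:gpr32, %1:gpr32
    %3:gpr32 = ORRWrs $wzr, %2:gpr32, 0
    %4:gpr64 = SUBREG_TO_REG 0, %3:gpr32, %subreg.sub_32

After visitORR, the uses of %3 are redirected to %2 and the ORRWrs is deleted, so the SUBREG_TO_REG consumes %2 directly.
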
+
+bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ MLI = &getAnalysis<MachineLoopInfo>();
+ MRI = &MF.getRegInfo();
+
+ if (!MRI->isSSA())
+ return false;
+
+ bool Changed = false;
+ SmallSetVector<MachineInstr *, 8> ToBeRemoved;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case AArch64::ANDWrr:
+ Changed = visitAND<uint32_t>(MI, ToBeRemoved);
+ break;
+ case AArch64::ANDXrr:
+ Changed = visitAND<uint64_t>(MI, ToBeRemoved);
+ break;
+ case AArch64::ORRWrs:
+ Changed = visitORR(MI, ToBeRemoved);
+ }
+ }
+ }
+
+ for (MachineInstr *MI : ToBeRemoved)
+ MI->eraseFromParent();
+
+ return Changed;
+}
+
+FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
+ return new AArch64MIPeepholeOpt();
+}
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index 07dee3ce1fbc..70daf5abf81d 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -732,7 +732,9 @@ def Tuples8X : RegisterTuples<
!foreach(i, [0,1,2,3,4,5,6,7], !cast<SubRegIndex>("x8sub_"#i)),
!foreach(i, [0,1,2,3,4,5,6,7], (trunc (decimate (rotl GPR64, i), 2), 12))>;
-def GPR64x8Class : RegisterClass<"AArch64", [i64], 64, (trunc Tuples8X, 12)>;
+def GPR64x8Class : RegisterClass<"AArch64", [i64x8], 512, (trunc Tuples8X, 12)> {
+ let Size = 512;
+}
def GPR64x8AsmOp : AsmOperandClass {
let Name = "GPR64x8";
let ParserMethod = "tryParseGPR64x8";
@@ -899,16 +901,8 @@ def PPR32 : PPRRegOp<"s", PPRAsmOp32, ElementSizeS, PPR>;
def PPR64 : PPRRegOp<"d", PPRAsmOp64, ElementSizeD, PPR>;
def PPRAsmOp3bAny : PPRAsmOperand<"Predicate3bAny", "PPR_3b", 0>;
-def PPRAsmOp3b8 : PPRAsmOperand<"Predicate3bB", "PPR_3b", 8>;
-def PPRAsmOp3b16 : PPRAsmOperand<"Predicate3bH", "PPR_3b", 16>;
-def PPRAsmOp3b32 : PPRAsmOperand<"Predicate3bS", "PPR_3b", 32>;
-def PPRAsmOp3b64 : PPRAsmOperand<"Predicate3bD", "PPR_3b", 64>;
def PPR3bAny : PPRRegOp<"", PPRAsmOp3bAny, ElementSizeNone, PPR_3b>;
-def PPR3b8 : PPRRegOp<"b", PPRAsmOp3b8, ElementSizeB, PPR_3b>;
-def PPR3b16 : PPRRegOp<"h", PPRAsmOp3b16, ElementSizeH, PPR_3b>;
-def PPR3b32 : PPRRegOp<"s", PPRAsmOp3b32, ElementSizeS, PPR_3b>;
-def PPR3b64 : PPRRegOp<"d", PPRAsmOp3b64, ElementSizeD, PPR_3b>;
//******************************************************************************
diff --git a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
index 03b32967a212..80d98d17e1d6 100644
--- a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
@@ -641,7 +641,7 @@ bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
StReg[i] = DefiningMI->getOperand(2*i+1).getReg();
StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill());
- // Sanity check for the other arguments.
+ // Validation check for the other arguments.
if (DefiningMI->getOperand(2*i+2).isImm()) {
switch (DefiningMI->getOperand(2*i+2).getImm()) {
default:
@@ -711,9 +711,7 @@ bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
if (!shouldExitEarly(&MF, OptimizationKind)) {
SmallVector<MachineInstr *, 8> RemoveMIs;
for (MachineBasicBlock &MBB : MF) {
- for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
- MII != MIE;) {
- MachineInstr &MI = *MII;
+ for (MachineInstr &MI : MBB) {
bool InstRewrite;
if (OptimizationKind == VectorElem)
InstRewrite = optimizeVectElement(MI) ;
@@ -725,7 +723,6 @@ bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
RemoveMIs.push_back(&MI);
Changed = true;
}
- ++MII;
}
}
for (MachineInstr *MI : RemoveMIs)
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 6a0fa2fc4f4e..aacace64e998 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -138,6 +138,6 @@ def REVD_ZPmZ : sve2_int_perm_revd<"revd">;
defm SCLAMP_ZZZ : sve2_clamp<"sclamp", 0b0>;
defm UCLAMP_ZZZ : sve2_clamp<"uclamp", 0b1>;
-defm DUP_PPzPRI : sve2_int_perm_dup_p<"dup">;
+defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel">;
} // End let Predicates = [HasSME]
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 91c3aec30a15..67d8fbb45cf5 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -261,11 +261,6 @@ def AArch64dup_mt : SDNode<"AArch64ISD::DUP_MERGE_PASSTHRU", SDT_AArch64DUP_PRED
def AArch64splice : SDNode<"AArch64ISD::SPLICE", SDT_AArch64Arith>;
-def step_vector_oneuse : PatFrag<(ops node:$idx),
- (step_vector node:$idx), [{
- return N->hasOneUse();
-}]>;
-
def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>;
def AArch64mul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2),
@@ -286,7 +281,9 @@ let Predicates = [HasSVE] in {
defm RDFFR_P : sve_int_rdffr_unpred<"rdffr", int_aarch64_sve_rdffr>;
def SETFFR : sve_int_setffr<"setffr", int_aarch64_sve_setffr>;
def WRFFR : sve_int_wrffr<"wrffr", int_aarch64_sve_wrffr>;
+} // End HasSVE
+let Predicates = [HasSVEorStreamingSVE] in {
defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add>;
defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub>;
defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat>;
@@ -305,13 +302,15 @@ let Predicates = [HasSVE] in {
defm ADD_ZPZZ : sve_int_bin_pred_bhsd<AArch64add_p>;
defm SUB_ZPZZ : sve_int_bin_pred_bhsd<AArch64sub_p>;
+} // End HasSVEorStreamingSVE
- let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
- defm ADD_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_add>;
- defm SUB_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_sub>;
- defm SUBR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_subr>;
- }
+let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in {
+ defm ADD_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_add>;
+ defm SUB_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_sub>;
+ defm SUBR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_subr>;
+} // End HasSVEorStreamingSVE, UseExperimentalZeroingPseudos
+let Predicates = [HasSVEorStreamingSVE] in {
defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", int_aarch64_sve_orr>;
defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", int_aarch64_sve_eor>;
defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>;
@@ -403,17 +402,37 @@ let Predicates = [HasSVE] in {
defm SMIN_ZPZZ : sve_int_bin_pred_bhsd<AArch64smin_p>;
defm UMIN_ZPZZ : sve_int_bin_pred_bhsd<AArch64umin_p>;
- defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe", int_aarch64_sve_frecpe_x>;
- defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte", int_aarch64_sve_frsqrte_x>;
+ defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe", AArch64frecpe>;
+ defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte", AArch64frsqrte>;
+
+ defm FADD_ZPmI : sve_fp_2op_i_p_zds<0b000, "fadd", "FADD_ZPZI", sve_fpimm_half_one, fpimm_half, fpimm_one, int_aarch64_sve_fadd>;
+ defm FSUB_ZPmI : sve_fp_2op_i_p_zds<0b001, "fsub", "FSUB_ZPZI", sve_fpimm_half_one, fpimm_half, fpimm_one, int_aarch64_sve_fsub>;
+ defm FMUL_ZPmI : sve_fp_2op_i_p_zds<0b010, "fmul", "FMUL_ZPZI", sve_fpimm_half_two, fpimm_half, fpimm_two, int_aarch64_sve_fmul>;
+ defm FSUBR_ZPmI : sve_fp_2op_i_p_zds<0b011, "fsubr", "FSUBR_ZPZI", sve_fpimm_half_one, fpimm_half, fpimm_one, int_aarch64_sve_fsubr>;
+ defm FMAXNM_ZPmI : sve_fp_2op_i_p_zds<0b100, "fmaxnm", "FMAXNM_ZPZI", sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fmaxnm>;
+ defm FMINNM_ZPmI : sve_fp_2op_i_p_zds<0b101, "fminnm", "FMINNM_ZPZI", sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fminnm>;
+ defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", "FMAX_ZPZI", sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fmax>;
+ defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", "FMIN_ZPZI", sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fmin>;
+
+ defm FADD_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_half_one, fpimm_half, fpimm_one, AArch64fadd_p>;
+ defm FSUB_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_half_one, fpimm_half, fpimm_one, AArch64fsub_p>;
+ defm FMUL_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_half_two, fpimm_half, fpimm_two, AArch64fmul_p>;
+ defm FSUBR_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_half_one, fpimm_half, fpimm_one>;
+ defm FMAXNM_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, AArch64fmaxnm_p>;
+ defm FMINNM_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, AArch64fminnm_p>;
+ defm FMAX_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, AArch64fmax_p>;
+ defm FMIN_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, AArch64fmin_p>;
- defm FADD_ZPmI : sve_fp_2op_i_p_zds<0b000, "fadd", sve_fpimm_half_one>;
- defm FSUB_ZPmI : sve_fp_2op_i_p_zds<0b001, "fsub", sve_fpimm_half_one>;
- defm FMUL_ZPmI : sve_fp_2op_i_p_zds<0b010, "fmul", sve_fpimm_half_two>;
- defm FSUBR_ZPmI : sve_fp_2op_i_p_zds<0b011, "fsubr", sve_fpimm_half_one>;
- defm FMAXNM_ZPmI : sve_fp_2op_i_p_zds<0b100, "fmaxnm", sve_fpimm_zero_one>;
- defm FMINNM_ZPmI : sve_fp_2op_i_p_zds<0b101, "fminnm", sve_fpimm_zero_one>;
- defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", sve_fpimm_zero_one>;
- defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", sve_fpimm_zero_one>;
+ let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
+ defm FADD_ZPZI : sve_fp_2op_i_p_zds_zeroing_hfd<sve_fpimm_half_one, fpimm_half, fpimm_one, int_aarch64_sve_fadd>;
+ defm FSUB_ZPZI : sve_fp_2op_i_p_zds_zeroing_hfd<sve_fpimm_half_one, fpimm_half, fpimm_one, int_aarch64_sve_fsub>;
+ defm FMUL_ZPZI : sve_fp_2op_i_p_zds_zeroing_hfd<sve_fpimm_half_two, fpimm_half, fpimm_two, int_aarch64_sve_fmul>;
+ defm FSUBR_ZPZI : sve_fp_2op_i_p_zds_zeroing_hfd<sve_fpimm_half_one, fpimm_half, fpimm_one, int_aarch64_sve_fsubr>;
+ defm FMAXNM_ZPZI : sve_fp_2op_i_p_zds_zeroing_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fmaxnm>;
+ defm FMINNM_ZPZI : sve_fp_2op_i_p_zds_zeroing_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fminnm>;
+ defm FMAX_ZPZI : sve_fp_2op_i_p_zds_zeroing_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fmax>;
+ defm FMIN_ZPZI : sve_fp_2op_i_p_zds_zeroing_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fmin>;
+ }
defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPZZ", int_aarch64_sve_fadd, DestructiveBinaryComm>;
defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", "FSUB_ZPZZ", int_aarch64_sve_fsub, DestructiveBinaryCommWithRev, "FSUBR_ZPmZ">;
@@ -437,31 +456,43 @@ let Predicates = [HasSVE] in {
defm FMAX_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmax_p>;
defm FMIN_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmin_p>;
defm FDIV_ZPZZ : sve_fp_bin_pred_hfd<AArch64fdiv_p>;
-
- let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
- defm FADD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fadd>;
- defm FSUB_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsub>;
- defm FMUL_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmul>;
- defm FSUBR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsubr>;
- defm FMAXNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmaxnm>;
- defm FMINNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fminnm>;
- defm FMAX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmax>;
- defm FMIN_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmin>;
- defm FABD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fabd>;
- defm FMULX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmulx>;
- defm FDIVR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdivr>;
- defm FDIV_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdiv>;
- }
-
+} // End HasSVEorStreamingSVE
+
+let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in {
+ defm FADD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fadd>;
+ defm FSUB_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsub>;
+ defm FMUL_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmul>;
+ defm FSUBR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsubr>;
+ defm FMAXNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmaxnm>;
+ defm FMINNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fminnm>;
+ defm FMAX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmax>;
+ defm FMIN_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmin>;
+ defm FABD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fabd>;
+ defm FMULX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmulx>;
+ defm FDIVR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdivr>;
+ defm FDIV_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdiv>;
+} // End HasSVEorStreamingSVE, UseExperimentalZeroingPseudos
+
+let Predicates = [HasSVEorStreamingSVE] in {
defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd, AArch64fadd_p>;
defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub, AArch64fsub_p>;
defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", fmul, AArch64fmul_p>;
+} // End HasSVEorStreamingSVE
+
+let Predicates = [HasSVE] in {
defm FTSMUL_ZZZ : sve_fp_3op_u_zd_ftsmul<0b011, "ftsmul", int_aarch64_sve_ftsmul_x>;
- defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", int_aarch64_sve_frecps_x>;
- defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", int_aarch64_sve_frsqrts_x>;
+} // End HasSVE
+let Predicates = [HasSVEorStreamingSVE] in {
+ defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", AArch64frecps>;
+ defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", AArch64frsqrts>;
+} // End HasSVEorStreamingSVE
+
+let Predicates = [HasSVE] in {
defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel", int_aarch64_sve_ftssel_x>;
+} // End HasSVE
+let Predicates = [HasSVEorStreamingSVE] in {
defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd", int_aarch64_sve_fcadd>;
defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla", int_aarch64_sve_fcmla>;
@@ -475,10 +506,10 @@ let Predicates = [HasSVE] in {
defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad, "FNMLA_ZPmZZ", /*isReverseInstr*/ 1>;
defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb, "FNMLS_ZPmZZ", /*isReverseInstr*/ 1>;
- defm FMLA_ZPZZZ : sve_fp_3op_p_zds_zx<int_aarch64_sve_fmla, int_aarch64_sve_fmad>;
- defm FMLS_ZPZZZ : sve_fp_3op_p_zds_zx<int_aarch64_sve_fmls, int_aarch64_sve_fmsb>;
- defm FNMLA_ZPZZZ : sve_fp_3op_p_zds_zx<int_aarch64_sve_fnmla, int_aarch64_sve_fnmad>;
- defm FNMLS_ZPZZZ : sve_fp_3op_p_zds_zx<int_aarch64_sve_fnmls, int_aarch64_sve_fnmsb>;
+ defm FMLA_ZPZZZ : sve_fp_3op_p_zds_zx;
+ defm FMLS_ZPZZZ : sve_fp_3op_p_zds_zx;
+ defm FNMLA_ZPZZZ : sve_fp_3op_p_zds_zx;
+ defm FNMLS_ZPZZZ : sve_fp_3op_p_zds_zx;
multiclass fma<ValueType Ty, ValueType PredTy, string Suffix> {
// Zd = Za + Zn * Zm
@@ -516,17 +547,26 @@ let Predicates = [HasSVE] in {
defm : fma<nxv4f32, nxv4i1, "S">;
defm : fma<nxv2f32, nxv2i1, "S">;
defm : fma<nxv2f64, nxv2i1, "D">;
+} // End HasSVEorStreamingSVE
+let Predicates = [HasSVE] in {
defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>;
+} // End HasSVE
+let Predicates = [HasSVEorStreamingSVE] in {
defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla", int_aarch64_sve_fmla_lane>;
defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b1, "fmls", int_aarch64_sve_fmls_lane>;
defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla", int_aarch64_sve_fcmla_lane>;
defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul", int_aarch64_sve_fmul_lane>;
+} // End HasSVEorStreamingSVE
+let Predicates = [HasSVE] in {
// SVE floating point reductions.
defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", AArch64fadda_p>;
+} // End HasSVE
+
+let Predicates = [HasSVEorStreamingSVE] in {
defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", AArch64faddv_p>;
defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", AArch64fmaxnmv_p>;
defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", AArch64fminnmv_p>;
@@ -614,8 +654,13 @@ let Predicates = [HasSVE] in {
defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>;
defm SPLICE_ZPZ : sve_int_perm_splice<"splice", AArch64splice>;
+} // End HasSVEorStreamingSVE
+let Predicates = [HasSVE] in {
defm COMPACT_ZPZ : sve_int_perm_compact<"compact", int_aarch64_sve_compact>;
+} // End HasSVE
+
+let Predicates = [HasSVEorStreamingSVE] in {
defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>;
defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>;
defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>;
@@ -639,8 +684,13 @@ let Predicates = [HasSVE] in {
defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">;
defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">;
def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>;
+} // End HasSVEorStreamingSVE
+
+let Predicates = [HasSVE] in {
defm FEXPA_ZZ : sve_int_bin_cons_misc_0_c_fexpa<"fexpa", int_aarch64_sve_fexpa_x>;
+} // End HasSVE
+let Predicates = [HasSVEorStreamingSVE] in {
defm BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa", int_aarch64_sve_brkpa_z>;
defm BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas", null_frag>;
defm BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb", int_aarch64_sve_brkpb_z>;
@@ -752,7 +802,9 @@ let Predicates = [HasSVE] in {
defm LD1SB_S : sve_mem_cld_ss<0b1101, "ld1sb", Z_s, ZPR32, GPR64NoXZRshifted8>;
defm LD1SB_H : sve_mem_cld_ss<0b1110, "ld1sb", Z_h, ZPR16, GPR64NoXZRshifted8>;
defm LD1D : sve_mem_cld_ss<0b1111, "ld1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
+} // End HasSVEorStreamingSVE
+let Predicates = [HasSVE] in {
// non-faulting continuous load with reg+immediate
defm LDNF1B_IMM : sve_mem_cldnf_si<0b0000, "ldnf1b", Z_b, ZPR8>;
defm LDNF1B_H_IMM : sve_mem_cldnf_si<0b0001, "ldnf1b", Z_h, ZPR16>;
@@ -788,7 +840,9 @@ let Predicates = [HasSVE] in {
defm LDFF1SB_S : sve_mem_cldff_ss<0b1101, "ldff1sb", Z_s, ZPR32, GPR64shifted8>;
defm LDFF1SB_H : sve_mem_cldff_ss<0b1110, "ldff1sb", Z_h, ZPR16, GPR64shifted8>;
defm LDFF1D : sve_mem_cldff_ss<0b1111, "ldff1d", Z_d, ZPR64, GPR64shifted64>;
+} // End HasSVE
+let Predicates = [HasSVEorStreamingSVE] in {
// LD(2|3|4) structured loads with reg+immediate
defm LD2B_IMM : sve_mem_eld_si<0b00, 0b01, ZZ_b, "ld2b", simm4s2>;
defm LD3B_IMM : sve_mem_eld_si<0b00, 0b10, ZZZ_b, "ld3b", simm4s3>;
@@ -816,7 +870,9 @@ let Predicates = [HasSVE] in {
def LD2D : sve_mem_eld_ss<0b11, 0b01, ZZ_d, "ld2d", GPR64NoXZRshifted64>;
def LD3D : sve_mem_eld_ss<0b11, 0b10, ZZZ_d, "ld3d", GPR64NoXZRshifted64>;
def LD4D : sve_mem_eld_ss<0b11, 0b11, ZZZZ_d, "ld4d", GPR64NoXZRshifted64>;
+} // End HasSVEorStreamingSVE
+let Predicates = [HasSVE] in {
// Gathers using unscaled 32-bit offsets, e.g.
// ld1h z0.s, p0/z, [x0, z0.s, uxtw]
defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
@@ -928,7 +984,9 @@ let Predicates = [HasSVE] in {
defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
+} // End HasSVE
+let Predicates = [HasSVEorStreamingSVE] in {
// Non-temporal contiguous loads (register + immediate)
defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>;
defm LDNT1H_ZRI : sve_mem_cldnt_si<0b01, "ldnt1h", Z_h, ZPR16>;
@@ -964,7 +1022,9 @@ let Predicates = [HasSVE] in {
defm ST1W : sve_mem_cst_ss<0b1010, "st1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm ST1W_D : sve_mem_cst_ss<0b1011, "st1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
defm ST1D : sve_mem_cst_ss<0b1111, "st1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
+} // End HasSVEorStreamingSVE
+let Predicates = [HasSVE] in {
// Scatters using unpacked, unscaled 32-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d, uxtw]
defm SST1B_D : sve_mem_64b_sst_sv_32_unscaled<0b000, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
@@ -1014,7 +1074,9 @@ let Predicates = [HasSVE] in {
defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", AArch64st1_scatter_scaled, ZPR64ExtLSL16, nxv2i16>;
defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", AArch64st1_scatter_scaled, ZPR64ExtLSL32, nxv2i32>;
defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", AArch64st1_scatter_scaled, ZPR64ExtLSL64, nxv2i64>;
+} // End HasSVE
+let Predicates = [HasSVEorStreamingSVE] in {
// ST(2|3|4) structured stores (register + immediate)
defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b, "st2b", simm4s2>;
defm ST3B_IMM : sve_mem_est_si<0b00, 0b10, ZZZ_b, "st3b", simm4s3>;
@@ -1073,7 +1135,7 @@ let Predicates = [HasSVE] in {
def PRFS_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>;
def PRFD_PRR : sve_mem_prfm_ss<0b111, "prfd", GPR64NoXZRshifted64>;
- multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instruction RegImmInst, Instruction RegRegInst, int scale, ComplexPattern AddrCP> {
+ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instruction RegImmInst, Instruction RegRegInst, ComplexPattern AddrCP> {
// reg + imm
let AddedComplexity = 2 in {
def _reg_imm : Pat<(prefetch (PredTy PPR_3b:$gp), (am_sve_indexed_s6 GPR64sp:$base, simm6s1:$offset), (i32 sve_prfop:$prfop)),
@@ -1091,11 +1153,13 @@ let Predicates = [HasSVE] in {
(RegImmInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, (i64 0))>;
}
- defm : sve_prefetch<int_aarch64_sve_prf, nxv16i1, PRFB_PRI, PRFB_PRR, 0, am_sve_regreg_lsl0>;
- defm : sve_prefetch<int_aarch64_sve_prf, nxv8i1, PRFH_PRI, PRFH_PRR, 1, am_sve_regreg_lsl1>;
- defm : sve_prefetch<int_aarch64_sve_prf, nxv4i1, PRFW_PRI, PRFS_PRR, 2, am_sve_regreg_lsl2>;
- defm : sve_prefetch<int_aarch64_sve_prf, nxv2i1, PRFD_PRI, PRFD_PRR, 3, am_sve_regreg_lsl3>;
+ defm : sve_prefetch<int_aarch64_sve_prf, nxv16i1, PRFB_PRI, PRFB_PRR, am_sve_regreg_lsl0>;
+ defm : sve_prefetch<int_aarch64_sve_prf, nxv8i1, PRFH_PRI, PRFH_PRR, am_sve_regreg_lsl1>;
+ defm : sve_prefetch<int_aarch64_sve_prf, nxv4i1, PRFW_PRI, PRFS_PRR, am_sve_regreg_lsl2>;
+ defm : sve_prefetch<int_aarch64_sve_prf, nxv2i1, PRFD_PRI, PRFD_PRR, am_sve_regreg_lsl3>;
+} // End HasSVEorStreamingSVE
+let Predicates = [HasSVE] in {
// Gather prefetch using scaled 32-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.s, uxtw #1]
defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, int_aarch64_sve_prfb_gather_sxtw_index, int_aarch64_sve_prfb_gather_uxtw_index>;
@@ -1153,6 +1217,53 @@ let Predicates = [HasSVE] in {
def : Pat<(nxv2i64 (int_aarch64_sve_adrd nxv2i64:$Op1, nxv2i64:$Op2)),
(ADR_LSL_ZZZ_D_3 $Op1, $Op2)>;
+ // Patterns to generate adr instruction.
+ // adr z0.d, [z0.d, z0.d, uxtw]
+ def : Pat<(add nxv2i64:$Op1,
+ (nxv2i64 (and nxv2i64:$Op2, (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))))),
+ (ADR_UXTW_ZZZ_D_0 $Op1, $Op2)>;
+ // adr z0.d, [z0.d, z0.d, sxtw]
+ def : Pat<(add nxv2i64:$Op1,
+ (nxv2i64 (sext_inreg nxv2i64:$Op2, nxv2i32))),
+ (ADR_SXTW_ZZZ_D_0 $Op1, $Op2)>;
+
+ // adr z0.s, [z0.s, z0.s, lsl #<shift>]
+ // adr z0.d, [z0.d, z0.d, lsl #<shift>]
+ multiclass adrShiftPat<ValueType Ty, ValueType PredTy, ValueType ShiftTy, Instruction DestAdrIns, int ShiftAmt> {
+ def : Pat<(add Ty:$Op1,
+ (Ty (AArch64lsl_p (PredTy (SVEAllActive)),
+ Ty:$Op2,
+ (Ty (AArch64dup (ShiftTy ShiftAmt)))))),
+ (DestAdrIns $Op1, $Op2)>;
+ }
+ defm : adrShiftPat<nxv2i64, nxv2i1, i64, ADR_LSL_ZZZ_D_1, 1>;
+ defm : adrShiftPat<nxv2i64, nxv2i1, i64, ADR_LSL_ZZZ_D_2, 2>;
+ defm : adrShiftPat<nxv2i64, nxv2i1, i64, ADR_LSL_ZZZ_D_3, 3>;
+ defm : adrShiftPat<nxv4i32, nxv4i1, i32, ADR_LSL_ZZZ_S_1, 1>;
+ defm : adrShiftPat<nxv4i32, nxv4i1, i32, ADR_LSL_ZZZ_S_2, 2>;
+ defm : adrShiftPat<nxv4i32, nxv4i1, i32, ADR_LSL_ZZZ_S_3, 3>;
+
+ // adr z0.d, [z0.d, z0.d, uxtw #<shift>]
+ // adr z0.d, [z0.d, z0.d, sxtw #<shift>]
+ multiclass adrXtwShiftPat<ValueType Ty, ValueType PredTy, int ShiftAmt> {
+ def : Pat<(add Ty:$Op1,
+ (Ty (AArch64lsl_p (PredTy (SVEAllActive)),
+ (Ty (and Ty:$Op2, (Ty (AArch64dup (i64 0xFFFFFFFF))))),
+ (Ty (AArch64dup (i64 ShiftAmt)))))),
+ (!cast<Instruction>("ADR_UXTW_ZZZ_D_"#ShiftAmt) $Op1, $Op2)>;
+
+ def : Pat<(add Ty:$Op1,
+ (Ty (AArch64lsl_p (PredTy (SVEAllActive)),
+ (Ty (sext_inreg Ty:$Op2, nxv2i32)),
+ (Ty (AArch64dup (i64 ShiftAmt)))))),
+ (!cast<Instruction>("ADR_SXTW_ZZZ_D_"#ShiftAmt) $Op1, $Op2)>;
+ }
+ defm : adrXtwShiftPat<nxv2i64, nxv2i1, 1>;
+ defm : adrXtwShiftPat<nxv2i64, nxv2i1, 2>;
+ defm : adrXtwShiftPat<nxv2i64, nxv2i1, 3>;
+} // End HasSVE
+
+let Predicates = [HasSVEorStreamingSVE] in {
defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>;
defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>;
@@ -1171,17 +1282,52 @@ let Predicates = [HasSVE] in {
// Extract lo/hi halves of legal predicate types.
def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))),
- (ZIP1_PPP_S PPR:$Ps, (PFALSE))>;
+ (PUNPKLO_PP PPR:$Ps)>;
def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))),
- (ZIP2_PPP_S PPR:$Ps, (PFALSE))>;
+ (PUNPKHI_PP PPR:$Ps)>;
def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))),
- (ZIP1_PPP_H PPR:$Ps, (PFALSE))>;
+ (PUNPKLO_PP PPR:$Ps)>;
def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))),
- (ZIP2_PPP_H PPR:$Ps, (PFALSE))>;
+ (PUNPKHI_PP PPR:$Ps)>;
def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))),
- (ZIP1_PPP_B PPR:$Ps, (PFALSE))>;
+ (PUNPKLO_PP PPR:$Ps)>;
def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))),
- (ZIP2_PPP_B PPR:$Ps, (PFALSE))>;
+ (PUNPKHI_PP PPR:$Ps)>;
+
+ def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))),
+ (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>;
+ def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 2))),
+ (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))>;
+ def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))),
+ (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))>;
+ def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 6))),
+ (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>;
+
+ def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))),
+ (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>;
+ def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 4))),
+ (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))>;
+ def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))),
+ (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))>;
+ def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 12))),
+ (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>;
+
+ def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))),
+ (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>;
+ def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 2))),
+ (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>;
+ def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 4))),
+ (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps)))>;
+ def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 6))),
+ (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps)))>;
+ def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))),
+ (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps)))>;
+ def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 10))),
+ (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps)))>;
+ def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 12))),
+ (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>;
+ def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 14))),
+ (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>;
// Extract subvectors from FP SVE vectors
def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 0))),
@@ -1206,6 +1352,24 @@ let Predicates = [HasSVE] in {
def : Pat<(nxv4bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 4))),
(UUNPKHI_ZZ_S ZPR:$Zs)>;
+ def : Pat<(nxv2f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 0))),
+ (UUNPKLO_ZZ_D (UUNPKLO_ZZ_S ZPR:$Zs))>;
+ def : Pat<(nxv2f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 2))),
+ (UUNPKHI_ZZ_D (UUNPKLO_ZZ_S ZPR:$Zs))>;
+ def : Pat<(nxv2f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 4))),
+ (UUNPKLO_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>;
+ def : Pat<(nxv2f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 6))),
+ (UUNPKHI_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>;
+
+ def : Pat<(nxv2bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 0))),
+ (UUNPKLO_ZZ_D (UUNPKLO_ZZ_S ZPR:$Zs))>;
+ def : Pat<(nxv2bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 2))),
+ (UUNPKHI_ZZ_D (UUNPKLO_ZZ_S ZPR:$Zs))>;
+ def : Pat<(nxv2bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 4))),
+ (UUNPKLO_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>;
+ def : Pat<(nxv2bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 6))),
+ (UUNPKHI_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>;
+
// Concatenate two predicates.
def : Pat<(nxv4i1 (concat_vectors nxv2i1:$p1, nxv2i1:$p2)),
(UZP1_PPP_S $p1, $p2)>;
@@ -1308,16 +1472,18 @@ let Predicates = [HasSVE] in {
defm CNTW_XPiI : sve_int_count<0b100, "cntw", int_aarch64_sve_cntw>;
defm CNTD_XPiI : sve_int_count<0b110, "cntd", int_aarch64_sve_cntd>;
defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp", int_aarch64_sve_cntp>;
+}
- defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb">;
- defm DECB_XPiI : sve_int_pred_pattern_a<0b001, "decb">;
- defm INCH_XPiI : sve_int_pred_pattern_a<0b010, "inch">;
- defm DECH_XPiI : sve_int_pred_pattern_a<0b011, "dech">;
- defm INCW_XPiI : sve_int_pred_pattern_a<0b100, "incw">;
- defm DECW_XPiI : sve_int_pred_pattern_a<0b101, "decw">;
- defm INCD_XPiI : sve_int_pred_pattern_a<0b110, "incd">;
- defm DECD_XPiI : sve_int_pred_pattern_a<0b111, "decd">;
+ defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb", add, int_aarch64_sve_cntb>;
+ defm DECB_XPiI : sve_int_pred_pattern_a<0b001, "decb", sub, int_aarch64_sve_cntb>;
+ defm INCH_XPiI : sve_int_pred_pattern_a<0b010, "inch", add, int_aarch64_sve_cnth>;
+ defm DECH_XPiI : sve_int_pred_pattern_a<0b011, "dech", sub, int_aarch64_sve_cnth>;
+ defm INCW_XPiI : sve_int_pred_pattern_a<0b100, "incw", add, int_aarch64_sve_cntw>;
+ defm DECW_XPiI : sve_int_pred_pattern_a<0b101, "decw", sub, int_aarch64_sve_cntw>;
+ defm INCD_XPiI : sve_int_pred_pattern_a<0b110, "incd", add, int_aarch64_sve_cntd>;
+ defm DECD_XPiI : sve_int_pred_pattern_a<0b111, "decd", sub, int_aarch64_sve_cntd>;
+let Predicates = [HasSVEorStreamingSVE] in {
defm SQINCB_XPiWdI : sve_int_pred_pattern_b_s32<0b00000, "sqincb", int_aarch64_sve_sqincb_n32>;
defm UQINCB_WPiI : sve_int_pred_pattern_b_u32<0b00001, "uqincb", int_aarch64_sve_uqincb_n32>;
defm SQDECB_XPiWdI : sve_int_pred_pattern_b_s32<0b00010, "sqdecb", int_aarch64_sve_sqdecb_n32>;
@@ -1391,10 +1557,10 @@ let Predicates = [HasSVE] in {
defm INCP_ZP : sve_int_count_v<0b10000, "incp">;
defm DECP_ZP : sve_int_count_v<0b10100, "decp">;
- defm INDEX_RR : sve_int_index_rr<"index", step_vector, step_vector_oneuse, AArch64mul_p_oneuse>;
- defm INDEX_IR : sve_int_index_ir<"index", step_vector, step_vector_oneuse, AArch64mul_p, AArch64mul_p_oneuse>;
- defm INDEX_RI : sve_int_index_ri<"index", step_vector, step_vector_oneuse>;
- defm INDEX_II : sve_int_index_ii<"index", step_vector, step_vector_oneuse>;
+ defm INDEX_RR : sve_int_index_rr<"index", AArch64mul_p_oneuse>;
+ defm INDEX_IR : sve_int_index_ir<"index", AArch64mul_p, AArch64mul_p_oneuse>;
+ defm INDEX_RI : sve_int_index_ri<"index">;
+ defm INDEX_II : sve_int_index_ii<"index">;
// Unpredicated shifts
defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", AArch64asr_p>;
@@ -1414,14 +1580,16 @@ let Predicates = [HasSVE] in {
defm ASR_ZPZI : sve_int_shift_pred_bhsd<AArch64asr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>;
defm LSR_ZPZI : sve_int_shift_pred_bhsd<AArch64lsr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>;
defm LSL_ZPZI : sve_int_shift_pred_bhsd<AArch64lsl_p, SVEShiftImmL8, SVEShiftImmL16, SVEShiftImmL32, SVEShiftImmL64>;
+} // End HasSVEorStreamingSVE
- let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
- defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_asr>;
- defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsr>;
- defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsl>;
- defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_asrd>;
- }
+let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in {
+ defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_asr>;
+ defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsr>;
+ defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsl>;
+ defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_asrd>;
+} // End HasSVEorStreamingSVE, UseExperimentalZeroingPseudos
+let Predicates = [HasSVEorStreamingSVE] in {
defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", int_aarch64_sve_asr, "ASRR_ZPmZ">;
defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", int_aarch64_sve_lsr, "LSRR_ZPmZ">;
defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", int_aarch64_sve_lsl, "LSLR_ZPmZ">;
@@ -1536,19 +1704,27 @@ let Predicates = [HasSVE] in {
defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti", AArch64frinti_mt>;
defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", AArch64frecpx_mt>;
defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", AArch64fsqrt_mt>;
-
- let Predicates = [HasBF16, HasSVE] in {
- defm BFDOT_ZZZ : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>;
- defm BFDOT_ZZI : sve_bfloat_dot_indexed<"bfdot", int_aarch64_sve_bfdot_lane>;
- defm BFMMLA_ZZZ : sve_bfloat_matmul<"bfmmla", int_aarch64_sve_bfmmla>;
- defm BFMMLA_B_ZZZ : sve_bfloat_matmul_longvecl<0b0, "bfmlalb", int_aarch64_sve_bfmlalb>;
- defm BFMMLA_T_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt", int_aarch64_sve_bfmlalt>;
- defm BFMMLA_B_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>;
- defm BFMMLA_T_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt", int_aarch64_sve_bfmlalt_lane>;
- defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32>;
- defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>;
- }
-
+} // End HasSVEorStreamingSVE
+
+let Predicates = [HasBF16, HasSVEorStreamingSVE] in {
+ defm BFDOT_ZZZ : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>;
+ defm BFDOT_ZZI : sve_bfloat_dot_indexed<"bfdot", int_aarch64_sve_bfdot_lane>;
+} // End HasBF16, HasSVEorStreamingSVE
+
+let Predicates = [HasBF16, HasSVE] in {
+ defm BFMMLA_ZZZ : sve_bfloat_matmul<"bfmmla", int_aarch64_sve_bfmmla>;
+} // End HasBF16, HasSVE
+
+let Predicates = [HasBF16, HasSVEorStreamingSVE] in {
+ defm BFMMLA_B_ZZZ : sve_bfloat_matmul_longvecl<0b0, "bfmlalb", int_aarch64_sve_bfmlalb>;
+ defm BFMMLA_T_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt", int_aarch64_sve_bfmlalt>;
+ defm BFMMLA_B_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>;
+ defm BFMMLA_T_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt", int_aarch64_sve_bfmlalt_lane>;
+ defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32>;
+ defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>;
+} // End HasBF16, HasSVEorStreamingSVE
+
+let Predicates = [HasSVEorStreamingSVE] in {
// InstAliases
def : InstAlias<"mov $Zd, $Zn",
(ORR_ZZZ ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zn), 1>;
@@ -1739,6 +1915,72 @@ let Predicates = [HasSVE] in {
def : Pat<(vscale (sve_cntd_imm_neg i32:$imm)), (SUBXrs XZR, (CNTD_XPiI 31, $imm), 0)>;
}
+ let AddedComplexity = 5 in {
+ def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
+ (ADDVL_XXI GPR64:$op, $imm)>;
+
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))),
+ (i32 (EXTRACT_SUBREG (ADDVL_XXI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ GPR32:$op, sub_32), $imm),
+ sub_32))>;
+
+ def : Pat<(nxv8i16 (add ZPR:$op, (nxv8i16 (AArch64dup (i32 (trunc (vscale (sve_cnth_imm i32:$imm)))))))),
+ (INCH_ZPiI ZPR:$op, 31, $imm)>;
+ def : Pat<(nxv4i32 (add ZPR:$op, (nxv4i32 (AArch64dup (i32 (trunc (vscale (sve_cntw_imm i32:$imm)))))))),
+ (INCW_ZPiI ZPR:$op, 31, $imm)>;
+ def : Pat<(nxv2i64 (add ZPR:$op, (nxv2i64 (AArch64dup (i64 (vscale (sve_cntd_imm i32:$imm))))))),
+ (INCD_ZPiI ZPR:$op, 31, $imm)>;
+
+ def : Pat<(nxv8i16 (sub ZPR:$op, (nxv8i16 (AArch64dup (i32 (trunc (vscale (sve_cnth_imm i32:$imm)))))))),
+ (DECH_ZPiI ZPR:$op, 31, $imm)>;
+ def : Pat<(nxv4i32 (sub ZPR:$op, (nxv4i32 (AArch64dup (i32 (trunc (vscale (sve_cntw_imm i32:$imm)))))))),
+ (DECW_ZPiI ZPR:$op, 31, $imm)>;
+ def : Pat<(nxv2i64 (sub ZPR:$op, (nxv2i64 (AArch64dup (i64 (vscale (sve_cntd_imm i32:$imm))))))),
+ (DECD_ZPiI ZPR:$op, 31, $imm)>;
+ }
+
+ let Predicates = [HasSVEorStreamingSVE, UseScalarIncVL], AddedComplexity = 5 in {
+ def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm i32:$imm))),
+ (INCH_XPiI GPR64:$op, 31, $imm)>;
+ def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm i32:$imm))),
+ (INCW_XPiI GPR64:$op, 31, $imm)>;
+ def : Pat<(add GPR64:$op, (vscale (sve_cntd_imm i32:$imm))),
+ (INCD_XPiI GPR64:$op, 31, $imm)>;
+
+ def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm_neg i32:$imm))),
+ (DECH_XPiI GPR64:$op, 31, $imm)>;
+ def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm_neg i32:$imm))),
+ (DECW_XPiI GPR64:$op, 31, $imm)>;
+ def : Pat<(add GPR64:$op, (vscale (sve_cntd_imm_neg i32:$imm))),
+ (DECD_XPiI GPR64:$op, 31, $imm)>;
+
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm i32:$imm))))),
+ (i32 (EXTRACT_SUBREG (INCH_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ GPR32:$op, sub_32), 31, $imm),
+ sub_32))>;
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntw_imm i32:$imm))))),
+ (i32 (EXTRACT_SUBREG (INCW_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ GPR32:$op, sub_32), 31, $imm),
+ sub_32))>;
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntd_imm i32:$imm))))),
+ (i32 (EXTRACT_SUBREG (INCD_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ GPR32:$op, sub_32), 31, $imm),
+ sub_32))>;
+
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm_neg i32:$imm))))),
+ (i32 (EXTRACT_SUBREG (DECH_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ GPR32:$op, sub_32), 31, $imm),
+ sub_32))>;
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntw_imm_neg i32:$imm))))),
+ (i32 (EXTRACT_SUBREG (DECW_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ GPR32:$op, sub_32), 31, $imm),
+ sub_32))>;
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntd_imm_neg i32:$imm))))),
+ (i32 (EXTRACT_SUBREG (DECD_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ GPR32:$op, sub_32), 31, $imm),
+ sub_32))>;
+ }
+
def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
(ADDVL_XXI GPR64:$op, $imm)>;
@@ -1864,25 +2106,27 @@ let Predicates = [HasSVE] in {
}
// 2-element contiguous loads
- defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D, LD1B_D_IMM, am_sve_regreg_lsl0>;
- defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D, LD1SB_D_IMM, am_sve_regreg_lsl0>;
- defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
- defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D, LD1SH_D_IMM, am_sve_regreg_lsl1>;
- defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
- defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D, LD1SW_D_IMM, am_sve_regreg_lsl2>;
- defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
- defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
- defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
- defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
+ defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D, LD1B_D_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D, LD1SB_D_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D, LD1SH_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D, LD1SW_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
+ defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv2bf16, nxv2i1, nonext_masked_load, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
// 4-element contiguous loads
- defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S, LD1B_S_IMM, am_sve_regreg_lsl0>;
- defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S, LD1SB_S_IMM, am_sve_regreg_lsl0>;
- defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
- defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S, LD1SH_S_IMM, am_sve_regreg_lsl1>;
- defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
- defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
- defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S, LD1B_S_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S, LD1SB_S_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S, LD1SH_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv4bf16, nxv4i1, nonext_masked_load, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
// 8-element contiguous loads
defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H, LD1B_H_IMM, am_sve_regreg_lsl0>;
@@ -1909,20 +2153,22 @@ let Predicates = [HasSVE] in {
}
// 2-element contiguous stores
- defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D, ST1B_D_IMM, am_sve_regreg_lsl0>;
- defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
- defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
- defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
- defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
- defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
- defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
+ defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D, ST1B_D_IMM, am_sve_regreg_lsl0>;
+ defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
+ defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv2bf16, nxv2i1, nontrunc_masked_store, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
// 4-element contiguous stores
- defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S, ST1B_S_IMM, am_sve_regreg_lsl0>;
- defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
- defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
- defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
- defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
+ defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S, ST1B_S_IMM, am_sve_regreg_lsl0>;
+ defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
+ defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv4bf16, nxv4i1, nontrunc_masked_store, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
// 8-element contiguous stores
defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H, ST1B_H_IMM, am_sve_regreg_lsl0>;
@@ -1954,32 +2200,30 @@ let Predicates = [HasSVE] in {
def _imm : Pat<(Store (Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)),
(RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
}
- let AddedComplexity = 3 in {
- def _fi : Pat<(Store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)),
- (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
- }
def : Pat<(Store (Ty ZPR:$val), GPR64:$base),
(RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
}
- defm : unpred_store< store, nxv16i8, ST1B, ST1B_IMM, PTRUE_B, am_sve_regreg_lsl0>;
- defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H, ST1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
- defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S, ST1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
- defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D, ST1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
- defm : unpred_store< store, nxv8i16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
- defm : unpred_store<truncstorevi16, nxv4i32, ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
- defm : unpred_store<truncstorevi16, nxv2i64, ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
- defm : unpred_store< store, nxv4i32, ST1W, ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
- defm : unpred_store<truncstorevi32, nxv2i64, ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
- defm : unpred_store< store, nxv2i64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
- defm : unpred_store< store, nxv8f16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
- defm : unpred_store< store, nxv8bf16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
- defm : unpred_store< store, nxv4f16, ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
- defm : unpred_store< store, nxv2f16, ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
- defm : unpred_store< store, nxv4f32, ST1W, ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
- defm : unpred_store< store, nxv2f32, ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
- defm : unpred_store< store, nxv2f64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
+ defm : unpred_store< store, nxv16i8, ST1B, ST1B_IMM, PTRUE_B, am_sve_regreg_lsl0>;
+ defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H, ST1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
+ defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S, ST1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
+ defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D, ST1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
+ defm : unpred_store< store, nxv8i16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+ defm : unpred_store<truncstorevi16, nxv4i32, ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+ defm : unpred_store<truncstorevi16, nxv2i64, ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+ defm : unpred_store< store, nxv4i32, ST1W, ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
+ defm : unpred_store<truncstorevi32, nxv2i64, ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+ defm : unpred_store< store, nxv2i64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
+ defm : unpred_store< store, nxv8f16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+ defm : unpred_store< store, nxv8bf16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
+ defm : unpred_store< store, nxv4f16, ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+ defm : unpred_store< store, nxv4bf16, ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+ defm : unpred_store< store, nxv2f16, ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+ defm : unpred_store< store, nxv2bf16, ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+ defm : unpred_store< store, nxv4f32, ST1W, ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
+ defm : unpred_store< store, nxv2f32, ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
+ defm : unpred_store< store, nxv2f64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegRegInst,
Instruction RegImmInst, Instruction PTrue,
@@ -1992,10 +2236,6 @@ let Predicates = [HasSVE] in {
def _imm: Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))),
(RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
}
- let AddedComplexity = 3 in {
- def _fi : Pat<(Ty (Load (am_sve_fi GPR64sp:$base, simm4s1:$offset))),
- (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
- }
def : Pat<(Ty (Load GPR64:$base)),
(RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
@@ -2026,7 +2266,9 @@ let Predicates = [HasSVE] in {
defm : unpred_load< load, nxv8f16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
defm : unpred_load< load, nxv8bf16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
defm : unpred_load< load, nxv4f16, LD1H_S, LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
+ defm : unpred_load< load, nxv4bf16, LD1H_S, LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
defm : unpred_load< load, nxv2f16, LD1H_D, LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
+ defm : unpred_load< load, nxv2bf16, LD1H_D, LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
defm : unpred_load< load, nxv4f32, LD1W, LD1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
defm : unpred_load< load, nxv2f32, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
defm : unpred_load< load, nxv2f64, LD1D, LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
@@ -2059,9 +2301,6 @@ let Predicates = [HasSVE] in {
}
defm Pat_Store_P16 : unpred_store_predicate<nxv16i1, STR_PXI>;
- defm Pat_Store_P8 : unpred_store_predicate<nxv8i1, STR_PXI>;
- defm Pat_Store_P4 : unpred_store_predicate<nxv4i1, STR_PXI>;
- defm Pat_Store_P2 : unpred_store_predicate<nxv2i1, STR_PXI>;
multiclass unpred_load_predicate<ValueType Ty, Instruction Load> {
def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm9:$offset))),
@@ -2072,9 +2311,6 @@ let Predicates = [HasSVE] in {
}
defm Pat_Load_P16 : unpred_load_predicate<nxv16i1, LDR_PXI>;
- defm Pat_Load_P8 : unpred_load_predicate<nxv8i1, LDR_PXI>;
- defm Pat_Load_P4 : unpred_load_predicate<nxv4i1, LDR_PXI>;
- defm Pat_Load_P2 : unpred_load_predicate<nxv2i1, LDR_PXI>;
multiclass ld1<Instruction RegRegInst, Instruction RegImmInst, ValueType Ty,
SDPatternOperator Load, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
@@ -2122,7 +2358,9 @@ let Predicates = [HasSVE] in {
// 16-element contiguous loads
defm : ld1<LD1B, LD1B_IMM, nxv16i8, AArch64ld1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
+} // End HasSVEorStreamingSVE
+let Predicates = [HasSVE] in {
multiclass ldnf1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT> {
// scalar + immediate (mul vl)
let AddedComplexity = 1 in {
@@ -2203,7 +2441,9 @@ let Predicates = [HasSVE] in {
// 16-element contiguous first faulting loads
defm : ldff1<LDFF1B, nxv16i8, AArch64ldff1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
+} // End HasSVE
+let Predicates = [HasSVEorStreamingSVE] in {
multiclass st1<Instruction RegRegInst, Instruction RegImmInst, ValueType Ty,
SDPatternOperator Store, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
// reg + reg
@@ -2400,6 +2640,19 @@ let Predicates = [HasSVE] in {
(i64 (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index))>;
}
+ def : Pat<(sext_inreg (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index), i8),
+ (i32 (SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
+ def : Pat<(sext_inreg (anyext (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index)), i8),
+ (i64 (SMOVvi8to64 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
+
+ def : Pat<(sext_inreg (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index), i16),
+ (i32 (SMOVvi16to32 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>;
+ def : Pat<(sext_inreg (anyext (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index)), i16),
+ (i64 (SMOVvi16to64 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>;
+
+ def : Pat<(sext (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index)),
+ (i64 (SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index))>;
+
// Extract first element from vector.
let AddedComplexity = 2 in {
def : Pat<(vector_extract (nxv16i8 ZPR:$Zs), (i64 0)),
@@ -2425,28 +2678,32 @@ let Predicates = [HasSVE] in {
}
// Splice with lane bigger or equal to 0
- def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), (i64 (sve_ext_imm_0_15 i32:$index)))),
- (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_15:$index)>;
- def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), (i64 (sve_ext_imm_0_7 i32:$index)))),
- (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_7:$index)>;
- def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), (i64 (sve_ext_imm_0_3 i32:$index)))),
- (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_3:$index)>;
- def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), (i64 (sve_ext_imm_0_1 i32:$index)))),
- (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_1:$index)>;
-}
+ def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), (i64 (sve_ext_imm_0_255 i32:$index)))),
+ (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_255:$index)>;
+ def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), (i64 (sve_ext_imm_0_127 i32:$index)))),
+ (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_127:$index)>;
+ def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), (i64 (sve_ext_imm_0_63 i32:$index)))),
+ (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_63:$index)>;
+ def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), (i64 (sve_ext_imm_0_31 i32:$index)))),
+ (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_31:$index)>;
+
+} // End HasSVEorStreamingSVE
let Predicates = [HasSVE, HasMatMulInt8] in {
defm SMMLA_ZZZ : sve_int_matmul<0b00, "smmla", int_aarch64_sve_smmla>;
defm UMMLA_ZZZ : sve_int_matmul<0b11, "ummla", int_aarch64_sve_ummla>;
defm USMMLA_ZZZ : sve_int_matmul<0b10, "usmmla", int_aarch64_sve_usmmla>;
+} // End HasSVE, HasMatMulInt8
+
+let Predicates = [HasSVEorStreamingSVE, HasMatMulInt8] in {
defm USDOT_ZZZ : sve_int_dot_mixed<"usdot", int_aarch64_sve_usdot>;
defm USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot", int_aarch64_sve_usdot_lane>;
defm SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot", int_aarch64_sve_sudot_lane>;
-}
+} // End HasSVEorStreamingSVE, HasMatMulInt8
let Predicates = [HasSVE, HasMatMulFP32] in {
defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0, "fmmla", ZPR32, int_aarch64_sve_fmmla, nxv4f32>;
-}
+} // End HasSVE, HasMatMulFP32
let Predicates = [HasSVE, HasMatMulFP64] in {
defm FMMLA_ZZZ_D : sve_fp_matrix_mla<1, "fmmla", ZPR64, int_aarch64_sve_fmmla, nxv2f64>;
@@ -2458,15 +2715,18 @@ let Predicates = [HasSVE, HasMatMulFP64] in {
defm LD1RO_H : sve_mem_ldor_ss<0b01, "ld1roh", Z_h, ZPR16, GPR64NoXZRshifted16, nxv8i16, nxv8i1, AArch64ld1ro_z, am_sve_regreg_lsl1>;
defm LD1RO_W : sve_mem_ldor_ss<0b10, "ld1row", Z_s, ZPR32, GPR64NoXZRshifted32, nxv4i32, nxv4i1, AArch64ld1ro_z, am_sve_regreg_lsl2>;
defm LD1RO_D : sve_mem_ldor_ss<0b11, "ld1rod", Z_d, ZPR64, GPR64NoXZRshifted64, nxv2i64, nxv2i1, AArch64ld1ro_z, am_sve_regreg_lsl3>;
+} // End HasSVE, HasMatMulFP64
+
+let Predicates = [HasSVEorStreamingSVE, HasMatMulFP64] in {
defm ZIP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 0, "zip1", int_aarch64_sve_zip1q>;
defm ZIP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 1, "zip2", int_aarch64_sve_zip2q>;
defm UZP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 0, "uzp1", int_aarch64_sve_uzp1q>;
defm UZP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 1, "uzp2", int_aarch64_sve_uzp2q>;
defm TRN1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 0, "trn1", int_aarch64_sve_trn1q>;
defm TRN2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 1, "trn2", int_aarch64_sve_trn2q>;
-}
+} // End HasSVEorStreamingSVE, HasMatMulFP64
-let Predicates = [HasSVE2] in {
+let Predicates = [HasSVE2orStreamingSVE] in {
// SVE2 integer multiply-add (indexed)
defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla", int_aarch64_sve_mla_lane>;
defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls", int_aarch64_sve_mls_lane>;
@@ -2614,15 +2874,17 @@ let Predicates = [HasSVE2] in {
defm UQSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_uqshl>;
defm SQRSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_sqrshl>;
defm UQRSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_uqrshl>;
+} // End HasSVE2orStreamingSVE
- let Predicates = [HasSVE2, UseExperimentalZeroingPseudos] in {
- defm SQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>;
- defm UQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>;
- defm SRSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_srshr>;
- defm URSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_urshr>;
- defm SQSHLU_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<int_aarch64_sve_sqshlu>;
- }
+let Predicates = [HasSVE2orStreamingSVE, UseExperimentalZeroingPseudos] in {
+ defm SQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>;
+ defm UQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>;
+ defm SRSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_srshr>;
+ defm URSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_urshr>;
+ defm SQSHLU_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<int_aarch64_sve_sqshlu>;
+} // End HasSVE2orStreamingSVE, UseExperimentalZeroingPseudos
+let Predicates = [HasSVE2orStreamingSVE] in {
// SVE2 predicated shifts
defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left_dup<0b0110, "sqshl", "SQSHL_ZPZI", int_aarch64_sve_sqshl>;
defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left_dup<0b0111, "uqshl", "UQSHL_ZPZI", int_aarch64_sve_uqshl>;
@@ -2735,11 +2997,15 @@ let Predicates = [HasSVE2] in {
defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt", int_aarch64_sve_sqxtnt>;
defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt", int_aarch64_sve_uqxtnt>;
defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt", int_aarch64_sve_sqxtunt>;
+} // End HasSVE2orStreamingSVE
+let Predicates = [HasSVE2] in {
// SVE2 character match
defm MATCH_PPzZZ : sve2_char_match<0b0, "match", int_aarch64_sve_match>;
defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch", int_aarch64_sve_nmatch>;
+} // End HasSVE2
+let Predicates = [HasSVE2orStreamingSVE] in {
// SVE2 bitwise exclusive-or interleaved
defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt", int_aarch64_sve_eorbt>;
defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb", int_aarch64_sve_eortb>;
@@ -2754,13 +3020,17 @@ let Predicates = [HasSVE2] in {
defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt", int_aarch64_sve_saddlbt>;
defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt", int_aarch64_sve_ssublbt>;
defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb", int_aarch64_sve_ssubltb>;
+} // End HasSVE2orStreamingSVE
+let Predicates = [HasSVE2] in {
// SVE2 histogram generation (segment)
def HISTSEG_ZZZ : sve2_hist_gen_segment<"histseg", int_aarch64_sve_histseg>;
// SVE2 histogram generation (vector)
defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt", int_aarch64_sve_histcnt>;
+} // End HasSVE2
+let Predicates = [HasSVE2orStreamingSVE] in {
// SVE2 floating-point base 2 logarithm as integer
defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb", int_aarch64_sve_flogb>;
@@ -2802,7 +3072,9 @@ let Predicates = [HasSVE2] in {
// SVE2 extract vector (immediate offset, constructive)
def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;
+} // End HasSVE2orStreamingSVE
+let Predicates = [HasSVE2] in {
// SVE2 non-temporal gather loads
defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00000, "ldnt1sb", AArch64ldnt1s_gather_z, nxv4i8>;
defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00001, "ldnt1b", AArch64ldnt1_gather_z, nxv4i8>;
@@ -2817,10 +3089,14 @@ let Predicates = [HasSVE2] in {
defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11000, "ldnt1sw", AArch64ldnt1s_gather_z, nxv2i32>;
defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11010, "ldnt1w", AArch64ldnt1_gather_z, nxv2i32>;
defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather_z, nxv2i64>;
+} // End HasSVE2
+let Predicates = [HasSVE2orStreamingSVE] in {
// SVE2 vector splice (constructive)
defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;
+} // End HasSVE2orStreamingSVE
+let Predicates = [HasSVE2] in {
// SVE2 non-temporal scatter stores
defm STNT1B_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b001, "stnt1b", AArch64stnt1_scatter, nxv4i8>;
defm STNT1H_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b011, "stnt1h", AArch64stnt1_scatter, nxv4i16>;
@@ -2830,7 +3106,9 @@ let Predicates = [HasSVE2] in {
defm STNT1H_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b010, "stnt1h", AArch64stnt1_scatter, nxv2i16>;
defm STNT1W_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b100, "stnt1w", AArch64stnt1_scatter, nxv2i32>;
defm STNT1D_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>;
+} // End HasSVE2
+let Predicates = [HasSVE2orStreamingSVE] in {
// SVE2 table lookup (three sources)
defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", int_aarch64_sve_tbl2>;
defm TBX_ZZZ : sve2_int_perm_tbx<"tbx", int_aarch64_sve_tbx>;
@@ -2849,7 +3127,7 @@ let Predicates = [HasSVE2] in {
// SVE2 pointer conflict compare
defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">;
defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">;
-}
+} // End HasSVE2orStreamingSVE
let Predicates = [HasSVE2AES] in {
// SVE2 crypto destructive binary operations
@@ -2865,23 +3143,23 @@ let Predicates = [HasSVE2AES] in {
// to NEON PMULL2 instruction.
defm PMULLB_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11010, "pmullb", int_aarch64_sve_pmullb_pair>;
defm PMULLT_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11011, "pmullt", int_aarch64_sve_pmullt_pair>;
-}
+} // End HasSVE2AES
let Predicates = [HasSVE2SM4] in {
// SVE2 crypto constructive binary operations
defm SM4EKEY_ZZZ_S : sve2_crypto_cons_bin_op<0b0, "sm4ekey", ZPR32, int_aarch64_sve_sm4ekey, nxv4i32>;
// SVE2 crypto destructive binary operations
defm SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32, int_aarch64_sve_sm4e, nxv4i32>;
-}
+} // End HasSVE2SM4
let Predicates = [HasSVE2SHA3] in {
// SVE2 crypto constructive binary operations
defm RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64, int_aarch64_sve_rax1, nxv2i64>;
-}
+} // End HasSVE2SHA3
let Predicates = [HasSVE2BitPerm] in {
// SVE2 bitwise permute
defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext", int_aarch64_sve_bext_x>;
defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep", int_aarch64_sve_bdep_x>;
defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp", int_aarch64_sve_bgrp_x>;
-}
+} // End HasSVE2BitPerm
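
The SVE hunks above are mostly mechanical re-grouping: existing defm/def lines move from a let Predicates = [HasSVE] block into a HasSVEorStreamingSVE block (or from HasSVE2 into HasSVE2orStreamingSVE) when the instruction is also legal in streaming mode, while gathers, scatters, first-faulting and non-temporal forms stay gated on plain SVE. The standalone sketch below is illustrative only and is not part of the patch; Predicate and ExampleInst are simplified stand-ins for the real target infrastructure, the EXAMPLE_* records and condition strings are hypothetical, and the point is just how the outer let pushes the predicate list into every record defined inside its braces (running llvm-tblgen on the file prints the expanded records).

// Illustrative sketch, not part of the patch. Simplified stand-ins for the
// real Predicate and instruction classes; llvm-tblgen sketch.td prints the
// expanded records.
class Predicate<string cond> { string CondString = cond; }
def HasSVE               : Predicate<"Subtarget->hasSVE()">;
def HasSVEorStreamingSVE : Predicate<"Subtarget->hasSVEorStreamingSVE()">;

class ExampleInst<string asm> {
  string AsmString = asm;
  list<Predicate> Predicates = [];   // filled in by the enclosing let
}

let Predicates = [HasSVEorStreamingSVE] in {
  def EXAMPLE_SHARED : ExampleInst<"shared_op">;     // legal with SVE or streaming SVE
} // End HasSVEorStreamingSVE

let Predicates = [HasSVE] in {
  def EXAMPLE_SVE_ONLY : ExampleInst<"sve_only_op">; // e.g. gathers, first-faulting loads
} // End HasSVE
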
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA53.td b/llvm/lib/Target/AArch64/AArch64SchedA53.td
index 65c84b1f39c0..d18a05fda191 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA53.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA53.td
@@ -127,7 +127,8 @@ def : WriteRes<WriteFCmp, [A53UnitFPALU]> { let Latency = 6; }
def : WriteRes<WriteFCvt, [A53UnitFPALU]> { let Latency = 6; }
def : WriteRes<WriteFCopy, [A53UnitFPALU]> { let Latency = 6; }
def : WriteRes<WriteFImm, [A53UnitFPALU]> { let Latency = 6; }
-def : WriteRes<WriteV, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteVd, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteVq, [A53UnitFPALU]> { let Latency = 6; }
// FP Mul, Div, Sqrt
def : WriteRes<WriteFMul, [A53UnitFPMDS]> { let Latency = 6; }
@@ -149,6 +150,7 @@ def A53WriteFSqrtDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32;
// No forwarding for these reads.
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadST, 0>;
def : ReadAdvance<ReadVLD, 0>;
// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td
index 0e680078c348..877c4d2ced41 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA55.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td
@@ -149,9 +149,11 @@ def : WriteRes<WriteFCmp, [CortexA55UnitFPALU]> { let Latency = 3; }
def : WriteRes<WriteFCvt, [CortexA55UnitFPALU]> { let Latency = 4; }
def : WriteRes<WriteFCopy, [CortexA55UnitFPALU]> { let Latency = 3; }
def : WriteRes<WriteFImm, [CortexA55UnitFPALU]> { let Latency = 3; }
-def : WriteRes<WriteV, [CortexA55UnitFPALU]> { let Latency = 4; }
+def : WriteRes<WriteVd, [CortexA55UnitFPALU]> { let Latency = 4; }
+def : WriteRes<WriteVq, [CortexA55UnitFPALU,CortexA55UnitFPALU]> { let Latency = 4; let BeginGroup = 1; }
// FP ALU specific new schedwrite definitions
+def CortexA55WriteFPALU_F2 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 2;}
def CortexA55WriteFPALU_F3 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 3;}
def CortexA55WriteFPALU_F4 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 4;}
def CortexA55WriteFPALU_F5 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 5;}
@@ -182,6 +184,7 @@ def CortexA55WriteFSqrtDP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency =
def : ReadAdvance<ReadVLD, 0>;
def : ReadAdvance<ReadExtrHi, 1>;
def : ReadAdvance<ReadAdrBase, 1>;
+def : ReadAdvance<ReadST, 1>;
// ALU - ALU input operands are generally needed in EX1. An operand produced
// in, say, EX2 can be forwarded for consumption to ALU in EX1, thereby
@@ -330,6 +333,8 @@ def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16
//---
// Floating Point Conversions, MAC, DIV, SQRT
//---
+def : InstRW<[CortexA55WriteFPALU_F2], (instregex "^DUP(v2i64|v4i32|v8i16|v16i8)")>;
+def : InstRW<[CortexA55WriteFPALU_F2], (instregex "^XTN")>;
def : InstRW<[CortexA55WriteFPALU_F3], (instregex "^FCVT[ALMNPZ][SU](S|U)?(W|X)")>;
def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^FCVT(X)?[ALMNPXZ](S|U|N)?v")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td
index c1eacca8cc1f..168a762241ca 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA57.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td
@@ -96,7 +96,8 @@ def : SchedAlias<WriteFCopy, A57Write_5cyc_1L>;
def : SchedAlias<WriteFImm, A57Write_3cyc_1V>;
def : WriteRes<WriteFMul, [A57UnitV]> { let Latency = 5;}
def : SchedAlias<WriteFDiv, A57Write_17cyc_1W>;
-def : SchedAlias<WriteV, A57Write_3cyc_1V>;
+def : SchedAlias<WriteVd, A57Write_3cyc_1V>;
+def : SchedAlias<WriteVq, A57Write_3cyc_1V>;
def : SchedAlias<WriteVLD, A57Write_5cyc_1L>;
def : SchedAlias<WriteVST, A57Write_1cyc_1S>;
@@ -116,6 +117,7 @@ def : ReadAdvance<ReadIM, 0>;
def : ReadAdvance<ReadIMA, 2, [WriteIM32, WriteIM64]>;
def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadST, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
def : ReadAdvance<ReadVLD, 0>;
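
The scheduling-model edits above and in the remaining AArch64Sched*.td files below follow one recipe per CPU model: each definition or alias of the old WriteV write class is duplicated into WriteVd and WriteVq (going by the names and by the Cortex-A55 entry that charges WriteVq two FP ALU micro-ops, these appear to be the 64-bit and 128-bit vector forms), and a ReadAdvance is added for the new ReadST read class so the stored-data operand has an explicit forwarding latency. A minimal sketch of that recipe follows; it is illustrative only, ExampleUnitFP is a hypothetical processor resource standing in for each model's own FP/ASIMD unit, and the latency is a placeholder.

// Illustrative fragment, not part of the patch; it mirrors the per-model edits.
// ExampleUnitFP is a hypothetical ProcResource; the latency is a placeholder.
def : WriteRes<WriteVd, [ExampleUnitFP]> { let Latency = 4; } // 64-bit vector ops
def : WriteRes<WriteVq, [ExampleUnitFP]> { let Latency = 4; } // 128-bit vector ops
def : ReadAdvance<ReadST, 0>; // stored data gets no early-forwarding credit here
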
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
index b6741d418ef0..1d25a6c00f95 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
@@ -21,7 +21,8 @@ def A64FXModel : SchedMachineModel {
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures =
- [HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3, HasSVE2BitPerm, HasPAuth];
+ [HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3, HasSVE2BitPerm, HasPAuth,
+ HasSVE2orStreamingSVE];
let FullInstRWOverlapCheck = 0;
}
@@ -760,6 +761,7 @@ def : ReadAdvance<ReadIMA, 0>;
def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadST, 0>;
def : ReadAdvance<ReadVLD, 0>;
//===----------------------------------------------------------------------===//
@@ -1625,7 +1627,11 @@ def : InstRW<[A64FXWrite_FMOV_VG14], (instrs FMOVDXHighr)>;
// ASIMD shift by register, basic, Q-form
// ASIMD shift by register, complex, D-form
// ASIMD shift by register, complex, Q-form
-def : WriteRes<WriteV, [A64FXGI03]> {
+def : WriteRes<WriteVd, [A64FXGI03]> {
+ let Latency = 4;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteVq, [A64FXGI03]> {
let Latency = 4;
let ResourceCycles = [1];
}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
index 11df304a974c..9fbb46919427 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -258,6 +258,7 @@ def CyReadAdrBase : SchedReadVariant<[
SchedVar<ScaledIdxPred, [ReadBaseRS]>, // Read base reg after shifting offset.
SchedVar<NoSchedPred, [ReadDefault]>]>; // Read base reg with no shift.
def : SchedAlias<ReadAdrBase, CyReadAdrBase>; // Map AArch64->Cyclone type.
+def : ReadAdvance<ReadST, 0>;
//---
// 7.8.9,7.8.11. Load/Store, paired
@@ -303,7 +304,8 @@ def : WriteRes<WriteSys, []> {let Latency = -1;}
// 7.9 Vector Unit Instructions
// Simple vector operations take 2 cycles.
-def : WriteRes<WriteV, [CyUnitV]> {let Latency = 2;}
+def : WriteRes<WriteVd, [CyUnitV]> {let Latency = 2;}
+def : WriteRes<WriteVq, [CyUnitV]> {let Latency = 2;}
// Define some longer latency vector op types for Cyclone.
def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
@@ -334,7 +336,7 @@ def : WriteRes<WriteFImm, [CyUnitV]> {let Latency = 2;}
// COPY is handled above in the WriteMov Variant.
def WriteVMov : SchedWriteVariant<[
SchedVar<WriteVMovPred, [WriteX]>,
- SchedVar<NoSchedPred, [WriteV]>]>;
+ SchedVar<NoSchedPred, [WriteVq]>]>;
def : InstRW<[WriteVMov], (instrs ORRv16i8)>;
// FMOVSr,FMOVDr are WriteF.
@@ -354,7 +356,7 @@ def : WriteRes<WriteFCopy, [CyUnitLS]> {
def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;
// INS V[x],R
-def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
+def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteVq]>;
def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;
// SMOV,UMOV R,V[x]
@@ -570,7 +572,7 @@ def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;
//---
// FCVT lengthen f16/s32
-def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;
+def : InstRW<[WriteVq], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;
// FCVT,FCVTN,FCVTXN
// SCVTF,UCVTF V,V
@@ -680,61 +682,61 @@ def : InstRW<[WriteVLDShuffle],
def : InstRW<[WriteVLDShuffle, WriteAdr],
(instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[WriteVLDShuffle, WriteV],
+def : InstRW<[WriteVLDShuffle, WriteVq],
(instregex "LD2Twov(8b|4h|2s)$")>;
-def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVq],
(instregex "LD2Twov(8b|4h|2s)_POST$")>;
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle],
(instregex "LD2Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
(instregex "LD2Twov(16b|8h|4s|2d)_POST")>;
-def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVq],
(instregex "LD2i(8|16|32)$")>;
-def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVq],
(instregex "LD2i(8|16|32)_POST")>;
-def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVq],
(instregex "LD2i64$")>;
-def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVq],
(instregex "LD2i64_POST")>;
-def : InstRW<[WriteVLDShuffle, WriteV],
+def : InstRW<[WriteVLDShuffle, WriteVq],
(instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVq],
(instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
-def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVq],
(instregex "LD3Threev(8b|4h|2s)$")>;
-def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVq],
(instregex "LD3Threev(8b|4h|2s)_POST")>;
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
(instregex "LD3Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
(instregex "LD3Threev(16b|8h|4s|2d)_POST")>;
-def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV],
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVq, WriteVq],
(instregex "LD3i(8|16|32)$")>;
-def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVq, WriteVq],
(instregex "LD3i(8|16|32)_POST")>;
-def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteVq],
(instregex "LD3i64$")>;
-def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteVq],
(instregex "LD3i64_POST")>;
-def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
+def : InstRW<[WriteVLDShuffle, WriteVq, WriteVq],
(instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
-def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVq, WriteVq],
(instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;
-def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVq],
(instrs LD3Rv1d,LD3Rv2d)>;
-def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVq],
(instrs LD3Rv1d_POST,LD3Rv2d_POST)>;
-def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVq, WriteVq],
(instregex "LD4Fourv(8b|4h|2s)$")>;
-def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVq, WriteVq],
(instregex "LD4Fourv(8b|4h|2s)_POST")>;
def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
WriteVLDPairShuffle, WriteVLDPairShuffle],
@@ -743,25 +745,25 @@ def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
WriteVLDPairShuffle, WriteVLDPairShuffle],
(instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;
-def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVq, WriteVq, WriteVq],
(instregex "LD4i(8|16|32)$")>;
-def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVq, WriteVq, WriteVq],
(instregex "LD4i(8|16|32)_POST")>;
-def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteVq, WriteVq],
(instrs LD4i64)>;
-def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteVq],
(instrs LD4i64_POST)>;
-def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
+def : InstRW<[WriteVLDShuffle, WriteVq, WriteVq, WriteVq],
(instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
-def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVq, WriteVq, WriteVq],
(instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;
-def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVq, WriteVq],
(instrs LD4Rv1d,LD4Rv2d)>;
-def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVq, WriteVq],
(instrs LD4Rv1d_POST,LD4Rv2d_POST)>;
//---
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
index 6a33258be02c..14df8236504b 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
@@ -254,7 +254,8 @@ def : WriteRes<WriteVST, [M3UnitS,
let NumMicroOps = 1; }
// ASIMD FP instructions.
-def : WriteRes<WriteV, [M3UnitNALU]> { let Latency = 3; }
+def : WriteRes<WriteVd, [M3UnitNALU]> { let Latency = 3; }
+def : WriteRes<WriteVq, [M3UnitNALU]> { let Latency = 3; }
// Other miscellaneous instructions.
def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
@@ -277,6 +278,7 @@ def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
def : ReadAdvance<ReadVLD, 0>;
+def : ReadAdvance<ReadST, 0>;
//===----------------------------------------------------------------------===//
// Finer scheduling model.
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
index db066a19b0b6..8f740a9a0d35 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
@@ -558,7 +558,8 @@ def : SchedAlias<WriteVLD, M4WriteL5>;
def : SchedAlias<WriteVST, M4WriteVST1>;
// ASIMD FP instructions.
-def : SchedAlias<WriteV, M4WriteNALU1>;
+def : SchedAlias<WriteVd, M4WriteNALU1>;
+def : SchedAlias<WriteVq, M4WriteNALU1>;
// Other miscellaneous instructions.
def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
@@ -581,6 +582,7 @@ def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
def : ReadAdvance<ReadVLD, 0>;
+def : ReadAdvance<ReadST, 0>;
//===----------------------------------------------------------------------===//
// Finer scheduling model.
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
index 0429b6ab2ee2..93e1b66bea03 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
@@ -594,7 +594,8 @@ def : SchedAlias<WriteVLD, M5WriteL6>;
def : SchedAlias<WriteVST, M5WriteVST1>;
// ASIMD FP instructions.
-def : SchedAlias<WriteV, M5WriteNALU1>;
+def : SchedAlias<WriteVd, M5WriteNALU1>;
+def : SchedAlias<WriteVq, M5WriteNALU1>;
// Other miscellaneous instructions.
def : WriteRes<WriteBarrier, []> { let Latency = 1; }
@@ -616,6 +617,7 @@ def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
def : ReadAdvance<ReadVLD, 0>;
+def : ReadAdvance<ReadST, 0>;
//===----------------------------------------------------------------------===//
// Finer scheduling model.
diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
index 8bb95e442249..7c9b0afdd169 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
@@ -92,7 +92,8 @@ def : WriteRes<WriteFCopy, []> { let Unsupported = 1; }
def : WriteRes<WriteFImm, []> { let Unsupported = 1; }
def : WriteRes<WriteFMul, []> { let Unsupported = 1; }
def : WriteRes<WriteFDiv, []> { let Unsupported = 1; }
-def : WriteRes<WriteV, []> { let Unsupported = 1; }
+def : WriteRes<WriteVd, []> { let Unsupported = 1; }
+def : WriteRes<WriteVq, []> { let Unsupported = 1; }
def : WriteRes<WriteVLD, []> { let Unsupported = 1; }
def : WriteRes<WriteVST, []> { let Unsupported = 1; }
def : WriteRes<WriteSys, []> { let Unsupported = 1; }
@@ -111,6 +112,7 @@ def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
def : ReadAdvance<ReadVLD, 0>;
+def : ReadAdvance<ReadST, 0>;
// Detailed Refinements
// -----------------------------------------------------------------------------
diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryo.td b/llvm/lib/Target/AArch64/AArch64SchedKryo.td
index 45964e1ed6de..cc568a2f2f17 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedKryo.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedKryo.td
@@ -95,7 +95,8 @@ def : WriteRes<WriteFMul, [KryoUnitX, KryoUnitX]>
{ let Latency = 6; let NumMicroOps = 2; }
def : WriteRes<WriteFDiv, [KryoUnitXA, KryoUnitY]>
{ let Latency = 12; let NumMicroOps = 2; } // Fragent -1 / NoRSV +1
-def : WriteRes<WriteV, [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteVd, [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteVq, [KryoUnitXY]> { let Latency = 6; }
def : WriteRes<WriteVLD, [KryoUnitLS]> { let Latency = 4; }
def : WriteRes<WriteVST, [KryoUnitLS]> { let Latency = 4; }
@@ -117,6 +118,7 @@ def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
def : ReadAdvance<ReadVLD, 0>;
+def : ReadAdvance<ReadST, 0>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
index 438371c1b6a8..77fca22a5f55 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
@@ -90,7 +90,8 @@ def : WriteRes<WriteFMul, [TSV110UnitF]> { let Latency = 5; }
// FP Div, Sqrt
def : WriteRes<WriteFDiv, [TSV110UnitFSU1]> { let Latency = 18; }
-def : WriteRes<WriteV, [TSV110UnitF]> { let Latency = 4; }
+def : WriteRes<WriteVd, [TSV110UnitF]> { let Latency = 4; }
+def : WriteRes<WriteVq, [TSV110UnitF]> { let Latency = 4; }
def : WriteRes<WriteVLD, [TSV110UnitFLdSt]> { let Latency = 5; }
def : WriteRes<WriteVST, [TSV110UnitF]> { let Latency = 1; }
@@ -113,6 +114,7 @@ def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
def : ReadAdvance<ReadVLD, 0>;
+def : ReadAdvance<ReadST, 0>;
def : InstRW<[WriteI], (instrs COPY)>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
index 125eb284cfd1..ff34c0ce9a0c 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
@@ -154,7 +154,8 @@ def : WriteRes<WriteFCmp, [THXT8XUnitFPALU]> { let Latency = 6; }
def : WriteRes<WriteFCvt, [THXT8XUnitFPALU]> { let Latency = 6; }
def : WriteRes<WriteFCopy, [THXT8XUnitFPALU]> { let Latency = 6; }
def : WriteRes<WriteFImm, [THXT8XUnitFPALU]> { let Latency = 6; }
-def : WriteRes<WriteV, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteVd, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteVq, [THXT8XUnitFPALU]> { let Latency = 6; }
// FP Mul, Div, Sqrt
def : WriteRes<WriteFMul, [THXT8XUnitFPMDS]> { let Latency = 6; }
@@ -192,6 +193,7 @@ def THXT8XWriteFSqrtDP : SchedWriteRes<[THXT8XUnitFPMDS]> {
def : ReadAdvance<ReadExtrHi, 1>;
def : ReadAdvance<ReadAdrBase, 2>;
def : ReadAdvance<ReadVLD, 2>;
+def : ReadAdvance<ReadST, 2>;
// FIXME: This needs more targeted benchmarking.
// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable
diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
index 8d8675b7ac6f..e4cae97b5524 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
@@ -362,6 +362,7 @@ def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
def : ReadAdvance<ReadVLD, 0>;
+def : ReadAdvance<ReadST, 0>;
//===----------------------------------------------------------------------===//
// 3. Instruction Tables.
@@ -1249,7 +1250,12 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
// ASIMD shift by register, basic, Q-form
// ASIMD shift by register, complex, D-form
// ASIMD shift by register, complex, Q-form
-def : WriteRes<WriteV, [THX2T99F01]> {
+def : WriteRes<WriteVd, [THX2T99F01]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [4];
+}
+def : WriteRes<WriteVq, [THX2T99F01]> {
let Latency = 7;
let NumMicroOps = 4;
let ResourceCycles = [4];
diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td
index 00838cc4b9bd..08be2b3a55b3 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td
@@ -621,6 +621,7 @@ def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
def : ReadAdvance<ReadVLD, 0>;
+def : ReadAdvance<ReadST, 0>;
//===----------------------------------------------------------------------===//
// 3. Instruction Tables.
@@ -1356,7 +1357,12 @@ def : InstRW<[THX3T110Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
// ASIMD shift by register, basic, Q-form
// ASIMD shift by register, complex, D-form
// ASIMD shift by register, complex, Q-form
-def : WriteRes<WriteV, [THX3T110FP0123]> {
+def : WriteRes<WriteVd, [THX3T110FP0123]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [4];
+}
+def : WriteRes<WriteVq, [THX3T110FP0123]> {
let Latency = 5;
let NumMicroOps = 4;
let ResourceCycles = [4];
diff --git a/llvm/lib/Target/AArch64/AArch64Schedule.td b/llvm/lib/Target/AArch64/AArch64Schedule.td
index 49c0c1782236..b8572c9b4572 100644
--- a/llvm/lib/Target/AArch64/AArch64Schedule.td
+++ b/llvm/lib/Target/AArch64/AArch64Schedule.td
@@ -47,6 +47,7 @@ def WriteAdr : SchedWrite; // Address pre/post increment.
def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled).
def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled).
+def ReadST : SchedRead; // Read the stored value.
def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST.
// Serialized two-level address load.
@@ -76,7 +77,8 @@ def WriteFImm : SchedWrite; // Floating-point immediate.
def WriteFMul : SchedWrite; // Floating-point multiply.
def WriteFDiv : SchedWrite; // Floating-point division.
-def WriteV : SchedWrite; // Vector ops.
+def WriteVd : SchedWrite; // 64bit Vector D ops.
+def WriteVq : SchedWrite; // 128bit Vector Q ops.
def WriteVLD : SchedWrite; // Vector loads.
def WriteVST : SchedWrite; // Vector stores.
@@ -86,9 +88,9 @@ def WriteAtomic : SchedWrite; // Atomic memory operations (CAS, Swap, LDOP)
def ReadVLD : SchedRead;
// Sequential vector load and shuffle.
-def WriteVLDShuffle : WriteSequence<[WriteVLD, WriteV]>;
-def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>;
+def WriteVLDShuffle : WriteSequence<[WriteVLD, WriteVq]>;
+def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteVq, WriteVq]>;
// Store a shuffled vector.
-def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>;
-def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>;
+def WriteVSTShuffle : WriteSequence<[WriteVq, WriteVST]>;
+def WriteVSTPairShuffle : WriteSequence<[WriteVq, WriteVq, WriteVST]>;
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 3eb4c04570de..d2d84b2a3f6d 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -24,8 +24,10 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size);
const AArch64Subtarget &STI =
DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
- const char *bzeroName = (V && V->isNullValue())
- ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO) : nullptr;
+ const char *bzeroName =
+ (V && V->isZero())
+ ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO)
+ : nullptr;
// For small size (< 256), it is not beneficial to use bzero
// instead of memset.
if (bzeroName && (!SizeValue || SizeValue->getZExtValue() > 256)) {
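The rewritten condition keeps the existing heuristic: memset becomes a bzero call only when the stored value is zero, the platform actually exposes a bzero libcall, and the size is unknown or above 256 bytes. A minimal sketch of that decision (illustration only, not part of this patch), with llvm::Optional standing in for the possibly-unknown constant size:

  #include "llvm/ADT/Optional.h"
  #include <cstdint>

  // Illustration of the heuristic above: prefer bzero over memset only when
  // the value being stored is zero and the size is unknown or > 256 bytes.
  // (The real code additionally requires the target to provide bzero.)
  static bool shouldPreferBzero(bool ValueIsZero, llvm::Optional<uint64_t> Size) {
    if (!ValueIsZero)
      return false;
    return !Size.hasValue() || Size.getValue() > 256;
  }
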
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index f37fedd50378..5cec4cb66339 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -1,9 +1,8 @@
//===- AArch64StackTagging.cpp - Stack tagging in IR --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
@@ -652,7 +651,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
auto TagEnd = [&](Instruction *Node) { untagAlloca(AI, Node, Size); };
if (!DT || !PDT ||
- !forAllReachableExits(*DT, *PDT, Start, End, RetVec, TagEnd))
+ !forAllReachableExits(*DT, *PDT, Start, Info.LifetimeEnd, RetVec,
+ TagEnd))
End->eraseFromParent();
} else {
uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8;
diff --git a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
index 076ed9b13c99..d2488f61eb4b 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
@@ -1,9 +1,8 @@
//===-- AArch64StackTaggingPreRA.cpp --- Stack Tagging for AArch64 -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -177,20 +176,19 @@ bool AArch64StackTaggingPreRA::mayUseUncheckedLoadStore() {
}
void AArch64StackTaggingPreRA::uncheckUsesOf(unsigned TaggedReg, int FI) {
- for (auto UI = MRI->use_instr_begin(TaggedReg), E = MRI->use_instr_end();
- UI != E;) {
- MachineInstr *UseI = &*(UI++);
- if (isUncheckedLoadOrStoreOpcode(UseI->getOpcode())) {
+ for (MachineInstr &UseI :
+ llvm::make_early_inc_range(MRI->use_instructions(TaggedReg))) {
+ if (isUncheckedLoadOrStoreOpcode(UseI.getOpcode())) {
// FI operand is always the one before the immediate offset.
- unsigned OpIdx = TII->getLoadStoreImmIdx(UseI->getOpcode()) - 1;
- if (UseI->getOperand(OpIdx).isReg() &&
- UseI->getOperand(OpIdx).getReg() == TaggedReg) {
- UseI->getOperand(OpIdx).ChangeToFrameIndex(FI);
- UseI->getOperand(OpIdx).setTargetFlags(AArch64II::MO_TAGGED);
+ unsigned OpIdx = TII->getLoadStoreImmIdx(UseI.getOpcode()) - 1;
+ if (UseI.getOperand(OpIdx).isReg() &&
+ UseI.getOperand(OpIdx).getReg() == TaggedReg) {
+ UseI.getOperand(OpIdx).ChangeToFrameIndex(FI);
+ UseI.getOperand(OpIdx).setTargetFlags(AArch64II::MO_TAGGED);
}
- } else if (UseI->isCopy() &&
- Register::isVirtualRegister(UseI->getOperand(0).getReg())) {
- uncheckUsesOf(UseI->getOperand(0).getReg(), FI);
+ } else if (UseI.isCopy() &&
+ Register::isVirtualRegister(UseI.getOperand(0).getReg())) {
+ uncheckUsesOf(UseI.getOperand(0).getReg(), FI);
}
}
}
@@ -277,8 +275,7 @@ Optional<int> AArch64StackTaggingPreRA::findFirstSlotCandidate() {
WorkList.push_back(RetagReg);
while (!WorkList.empty()) {
- Register UseReg = WorkList.back();
- WorkList.pop_back();
+ Register UseReg = WorkList.pop_back_val();
for (auto &UseI : MRI->use_instructions(UseReg)) {
unsigned Opcode = UseI.getOpcode();
if (Opcode == AArch64::STGOffset || Opcode == AArch64::ST2GOffset ||
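The loop rewrites above lean on two LLVM ADT idioms: llvm::make_early_inc_range, which advances the underlying iterator before yielding the current element so the loop body may delete or re-link that element, and SmallVector::pop_back_val, which reads and pops the worklist entry in one call. A minimal standalone sketch of the first idiom (illustration only, not code from this patch), with an ordinary std::map standing in for the machine-IR use list:

  #include "llvm/ADT/STLExtras.h"
  #include <cstdio>
  #include <map>

  int main() {
    std::map<int, const char *> Uses = {{1, "a"}, {2, "b"}, {3, "c"}, {4, "d"}};
    // Erasing the current element mid-loop is safe because the wrapped
    // iterator has already been incremented past it.
    for (auto &KV : llvm::make_early_inc_range(Uses))
      if (KV.first % 2 == 0)
        Uses.erase(KV.first);
    for (auto &KV : Uses)
      std::printf("%d:%s ", KV.first, KV.second); // prints: 1:a 3:c
    return 0;
  }
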
diff --git a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index a94856ef4fba..64f13eab0413 100644
--- a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -119,7 +119,7 @@ bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) {
}
bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
- if (skipFunction(MF.getFunction()))
+ if (skipFunction(MF.getFunction()) || MF.getFunction().hasOptSize())
return false;
const TargetSubtargetInfo &ST = MF.getSubtarget();
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index b22eb3b154f5..d782d6352cbe 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -50,15 +50,17 @@ static cl::opt<bool>
static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
cl::desc("Enable the use of AA during codegen."));
-AArch64Subtarget &
-AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
- StringRef CPUString) {
+AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
+ StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
// Determine default and user-specified characteristics
if (CPUString.empty())
CPUString = "generic";
- ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FS);
+ if (TuneCPUString.empty())
+ TuneCPUString = CPUString;
+
+ ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
initializeProperties();
return *this;
@@ -98,6 +100,12 @@ void AArch64Subtarget::initializeProperties() {
case CortexX1:
PrefFunctionLogAlignment = 4;
break;
+ case CortexA510:
+ case CortexA710:
+ case CortexX2:
+ PrefFunctionLogAlignment = 4;
+ VScaleForTuning = 1;
+ break;
case A64FX:
CacheLineSize = 256;
PrefFunctionLogAlignment = 3;
@@ -106,6 +114,7 @@ void AArch64Subtarget::initializeProperties() {
PrefetchDistance = 128;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 4;
+ VScaleForTuning = 4;
break;
case AppleA7:
case AppleA10:
@@ -147,9 +156,20 @@ void AArch64Subtarget::initializeProperties() {
PrefFunctionLogAlignment = 3;
break;
case NeoverseN1:
+ PrefFunctionLogAlignment = 4;
+ break;
case NeoverseN2:
+ PrefFunctionLogAlignment = 4;
+ VScaleForTuning = 1;
+ break;
case NeoverseV1:
PrefFunctionLogAlignment = 4;
+ VScaleForTuning = 2;
+ break;
+ case Neoverse512TVB:
+ PrefFunctionLogAlignment = 4;
+ VScaleForTuning = 1;
+ MaxInterleaveFactor = 4;
break;
case Saphira:
MaxInterleaveFactor = 4;
@@ -197,18 +217,20 @@ void AArch64Subtarget::initializeProperties() {
}
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
+ const std::string &TuneCPU,
const std::string &FS,
const TargetMachine &TM, bool LittleEndian,
unsigned MinSVEVectorSizeInBitsOverride,
unsigned MaxSVEVectorSizeInBitsOverride)
- : AArch64GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
+ : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
IsLittle(LittleEndian),
MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
- FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS, CPU)),
- TSInfo(), TLInfo(TM, *this) {
+ FrameLowering(),
+ InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)), TSInfo(),
+ TLInfo(TM, *this) {
if (AArch64::isX18ReservedByDefault(TT))
ReserveXRegister.set(18);
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index e0ef8df6fca9..19db774ccd7b 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -50,6 +50,7 @@ public:
CortexA35,
CortexA53,
CortexA55,
+ CortexA510,
CortexA57,
CortexA65,
CortexA72,
@@ -59,14 +60,17 @@ public:
CortexA77,
CortexA78,
CortexA78C,
+ CortexA710,
CortexR82,
CortexX1,
+ CortexX2,
ExynosM3,
Falkor,
Kryo,
NeoverseE1,
NeoverseN1,
NeoverseN2,
+ Neoverse512TVB,
NeoverseV1,
Saphira,
ThunderX2T99,
@@ -82,6 +86,7 @@ protected:
/// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
ARMProcFamilyEnum ARMProcFamily = Others;
+ bool HasV8_0aOps = false;
bool HasV8_1aOps = false;
bool HasV8_2aOps = false;
bool HasV8_3aOps = false;
@@ -89,16 +94,21 @@ protected:
bool HasV8_5aOps = false;
bool HasV8_6aOps = false;
bool HasV8_7aOps = false;
-
+ bool HasV9_0aOps = false;
+ bool HasV9_1aOps = false;
+ bool HasV9_2aOps = false;
bool HasV8_0rOps = false;
- bool HasCONTEXTIDREL2 = false;
+ bool HasCONTEXTIDREL2 = false;
+ bool HasEL2VMSA = false;
+ bool HasEL3 = false;
bool HasFPARMv8 = false;
bool HasNEON = false;
bool HasCrypto = false;
bool HasDotProd = false;
bool HasCRC = false;
bool HasLSE = false;
+ bool HasLSE2 = false;
bool HasRAS = false;
bool HasRDM = false;
bool HasPerfMon = false;
@@ -119,6 +129,7 @@ protected:
// SVE extensions
bool HasSVE = false;
bool UseExperimentalZeroingPseudos = false;
+ bool UseScalarIncVL = false;
// Armv8.2 Crypto extensions
bool HasSM4 = false;
@@ -139,7 +150,6 @@ protected:
bool HasTRACEV8_4 = false;
bool HasAM = false;
bool HasSEL2 = false;
- bool HasPMU = false;
bool HasTLB_RMI = false;
bool HasFlagM = false;
bool HasRCPC_IMMO = false;
@@ -190,6 +200,10 @@ protected:
bool HasSME = false;
bool HasSMEF64 = false;
bool HasSMEI64 = false;
+ bool HasStreamingSVE = false;
+
+ // AppleA7 system register.
+ bool HasAppleA7SysReg = false;
// Future architecture extensions.
bool HasETE = false;
@@ -271,6 +285,7 @@ protected:
unsigned MinSVEVectorSizeInBits;
unsigned MaxSVEVectorSizeInBits;
+ unsigned VScaleForTuning = 2;
/// TargetTriple - What processor and OS we're targeting.
Triple TargetTriple;
@@ -292,7 +307,8 @@ private:
/// passed in feature string so that we can use initializer lists for
/// subtarget initialization.
AArch64Subtarget &initializeSubtargetDependencies(StringRef FS,
- StringRef CPUString);
+ StringRef CPUString,
+ StringRef TuneCPUString);
/// Initialize properties based on the selected processor family.
void initializeProperties();
@@ -301,8 +317,8 @@ public:
/// This constructor initializes the data members to match that
/// of the specified triple.
AArch64Subtarget(const Triple &TT, const std::string &CPU,
- const std::string &FS, const TargetMachine &TM,
- bool LittleEndian,
+ const std::string &TuneCPU, const std::string &FS,
+ const TargetMachine &TM, bool LittleEndian,
unsigned MinSVEVectorSizeInBitsOverride = 0,
unsigned MaxSVEVectorSizeInBitsOverride = 0);
@@ -338,11 +354,15 @@ public:
return ARMProcFamily;
}
+ bool hasV8_0aOps() const { return HasV8_0aOps; }
bool hasV8_1aOps() const { return HasV8_1aOps; }
bool hasV8_2aOps() const { return HasV8_2aOps; }
bool hasV8_3aOps() const { return HasV8_3aOps; }
bool hasV8_4aOps() const { return HasV8_4aOps; }
bool hasV8_5aOps() const { return HasV8_5aOps; }
+ bool hasV9_0aOps() const { return HasV9_0aOps; }
+ bool hasV9_1aOps() const { return HasV9_1aOps; }
+ bool hasV9_2aOps() const { return HasV9_2aOps; }
bool hasV8_0rOps() const { return HasV8_0rOps; }
bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
@@ -375,6 +395,7 @@ public:
bool hasDotProd() const { return HasDotProd; }
bool hasCRC() const { return HasCRC; }
bool hasLSE() const { return HasLSE; }
+ bool hasLSE2() const { return HasLSE2; }
bool hasRAS() const { return HasRAS; }
bool hasRDM() const { return HasRDM; }
bool hasSM4() const { return HasSM4; }
@@ -449,6 +470,8 @@ public:
return UseExperimentalZeroingPseudos;
}
+ bool useScalarIncVL() const { return UseScalarIncVL; }
+
/// CPU has TBI (top byte of addresses is ignored during HW address
/// translation) and OS enables it.
bool supportsAddressTopByteIgnored() const;
@@ -494,6 +517,7 @@ public:
bool hasSME() const { return HasSME; }
bool hasSMEF64() const { return HasSMEF64; }
bool hasSMEI64() const { return HasSMEI64; }
+ bool hasStreamingSVE() const { return HasStreamingSVE; }
bool isLittleEndian() const { return IsLittle; }
@@ -541,10 +565,11 @@ public:
bool hasHCX() const { return HasHCX; }
bool hasLS64() const { return HasLS64; }
bool hasSEL2() const { return HasSEL2; }
- bool hasPMU() const { return HasPMU; }
bool hasTLB_RMI() const { return HasTLB_RMI; }
bool hasFlagM() const { return HasFlagM; }
bool hasRCPC_IMMO() const { return HasRCPC_IMMO; }
+ bool hasEL2VMSA() const { return HasEL2VMSA; }
+ bool hasEL3() const { return HasEL3; }
bool addrSinkUsingGEPs() const override {
// Keeping GEPs inbounds is important for exploiting AArch64
@@ -598,6 +623,31 @@ public:
}
}
+ /// Return whether FrameLowering should always set the "extended frame
+ /// present" bit in FP, or set it based on a symbol in the runtime.
+ bool swiftAsyncContextIsDynamicallySet() const {
+ // Older OS versions (particularly system unwinders) are confused by the
+ // Swift extended frame, so when building code that might be run on them we
+ // must dynamically query the concurrency library to determine whether
+ // extended frames should be flagged as present.
+ const Triple &TT = getTargetTriple();
+
+ unsigned Major, Minor, Micro;
+ TT.getOSVersion(Major, Minor, Micro);
+ switch(TT.getOS()) {
+ default:
+ return false;
+ case Triple::IOS:
+ case Triple::TvOS:
+ return Major < 15;
+ case Triple::WatchOS:
+ return Major < 8;
+ case Triple::MacOSX:
+ case Triple::Darwin:
+ return Major < 12;
+ }
+ }
+
void mirFileLoaded(MachineFunction &MF) const override;
// Return the known range for the bit length of SVE data registers. A value
@@ -614,6 +664,8 @@ public:
}
bool useSVEForFixedLengthVectors() const;
+
+ unsigned getVScaleForTuning() const { return VScaleForTuning; }
};
} // End llvm namespace
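The new VScaleForTuning field gives the cost model a per-core estimate of the runtime vscale: 2 for Neoverse V1, 4 for A64FX, 1 for Neoverse N2, Neoverse 512TVB and the newly added Cortex-A510/A710/X2, with a default of 2. A rough sketch of how such a tuning value turns a scalable VF into an expected lane count (the helper name here is illustrative; the in-tree use is getMaxNumElements further down in the TTI changes):

  #include <cstdio>

  // Illustration only: translate the known-minimum element count of a
  // scalable VF (<vscale x KnownMin x elt>) into the lane count the cost
  // model expects on a core with the given vscale-for-tuning value.
  static unsigned expectedLanes(unsigned KnownMinVF, unsigned VScaleForTuning) {
    return KnownMinVF * VScaleForTuning;
  }

  int main() {
    std::printf("%u\n", expectedLanes(4, 2)); // Neoverse V1: 8 lanes expected
    std::printf("%u\n", expectedLanes(4, 4)); // A64FX: 16 lanes expected
    return 0;
  }
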
diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index f400916c97c9..f9fe804865a5 100644
--- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -586,6 +586,7 @@ class SysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
let EnumValueField = "Encoding";
string Name = name;
+ string AltName = name;
bits<16> Encoding;
let Encoding{15-14} = op0;
let Encoding{13-11} = op1;
@@ -912,13 +913,19 @@ def : RWSysReg<"HSTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b011>;
def : RWSysReg<"HACR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b111>;
def : RWSysReg<"MDCR_EL3", 0b11, 0b110, 0b0001, 0b0011, 0b001>;
def : RWSysReg<"TTBR0_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b000>;
-def : RWSysReg<"TTBR0_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b000>;
def : RWSysReg<"TTBR0_EL3", 0b11, 0b110, 0b0010, 0b0000, 0b000>;
+
+let Requires = [{ {AArch64::FeatureEL2VMSA} }] in {
+def : RWSysReg<"TTBR0_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b000> {
+ let AltName = "VSCTLR_EL2";
+}
+def : RWSysReg<"VTTBR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b000>;
+}
+
def : RWSysReg<"TTBR1_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b001>;
def : RWSysReg<"TCR_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b010>;
def : RWSysReg<"TCR_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b010>;
def : RWSysReg<"TCR_EL3", 0b11, 0b110, 0b0010, 0b0000, 0b010>;
-def : RWSysReg<"VTTBR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b000>;
def : RWSysReg<"VTCR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b010>;
def : RWSysReg<"DACR32_EL2", 0b11, 0b100, 0b0011, 0b0000, 0b000>;
def : RWSysReg<"SPSR_EL1", 0b11, 0b000, 0b0100, 0b0000, 0b000>;
@@ -970,6 +977,7 @@ def : RWSysReg<"PMUSERENR_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b000>;
def : RWSysReg<"PMINTENSET_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b001>;
def : RWSysReg<"PMINTENCLR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b010>;
def : RWSysReg<"PMOVSSET_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b011>;
+def : RWSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>;
def : RWSysReg<"MAIR_EL1", 0b11, 0b000, 0b1010, 0b0010, 0b000>;
def : RWSysReg<"MAIR_EL2", 0b11, 0b100, 0b1010, 0b0010, 0b000>;
def : RWSysReg<"MAIR_EL3", 0b11, 0b110, 0b1010, 0b0010, 0b000>;
@@ -1292,6 +1300,57 @@ def : RWSysReg<"ICH_LR13_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b101>;
def : RWSysReg<"ICH_LR14_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b110>;
def : RWSysReg<"ICH_LR15_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b111>;
+// v8r system registers
+let Requires = [{ {AArch64::HasV8_0rOps} }] in {
+//Virtualization System Control Register
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"VSCTLR_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b000> {
+ let AltName = "TTBR0_EL2";
+}
+
+//MPU Type Register
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"MPUIR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b100>;
+def : RWSysReg<"MPUIR_EL2", 0b11, 0b100, 0b0000, 0b0000, 0b100>;
+
+//Protection Region Enable Register
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"PRENR_EL1", 0b11, 0b000, 0b0110, 0b0001, 0b001>;
+def : RWSysReg<"PRENR_EL2", 0b11, 0b100, 0b0110, 0b0001, 0b001>;
+
+//Protection Region Selection Register
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"PRSELR_EL1", 0b11, 0b000, 0b0110, 0b0010, 0b001>;
+def : RWSysReg<"PRSELR_EL2", 0b11, 0b100, 0b0110, 0b0010, 0b001>;
+
+//Protection Region Base Address Register
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"PRBAR_EL1", 0b11, 0b000, 0b0110, 0b1000, 0b000>;
+def : RWSysReg<"PRBAR_EL2", 0b11, 0b100, 0b0110, 0b1000, 0b000>;
+
+//Protection Region Limit Address Register
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"PRLAR_EL1", 0b11, 0b000, 0b0110, 0b1000, 0b001>;
+def : RWSysReg<"PRLAR_EL2", 0b11, 0b100, 0b0110, 0b1000, 0b001>;
+
+foreach n = 0-15 in {
+foreach x = 1-2 in {
+//Direct access to Protection Region Base Address Register for the nth MPU region

+ def : RWSysReg<!strconcat("PRBAR"#n, "_EL"#x),
+ 0b11, 0b000, 0b0110, 0b1000, 0b000>{
+ let Encoding{5-2} = n;
+ let Encoding{13} = !add(x,-1);
+ }
+
+ def : RWSysReg<!strconcat("PRLAR"#n, "_EL"#x),
+ 0b11, 0b000, 0b0110, 0b1000, 0b001>{
+ let Encoding{5-2} = n;
+ let Encoding{13} = !add(x,-1);
+ }
+} //foreach x = 1-2 in
+} //foreach n = 0-15 in
+} //let Requires = [{ {AArch64::HasV8_0rOps} }] in
+
// v8.1a "Privileged Access Never" extension-specific system registers
let Requires = [{ {AArch64::FeaturePAN} }] in
def : RWSysReg<"PAN", 0b11, 0b000, 0b0100, 0b0010, 0b011>;
@@ -1395,7 +1454,9 @@ let Requires = [{ {AArch64::FeatureSEL2} }] in {
// v8.4a "Virtualization secure second stage translation" registers
// Op0 Op1 CRn CRm Op2
def : RWSysReg<"VSTCR_EL2" , 0b11, 0b100, 0b0010, 0b0110, 0b010>;
-def : RWSysReg<"VSTTBR_EL2", 0b11, 0b100, 0b0010, 0b0110, 0b000>;
+def : RWSysReg<"VSTTBR_EL2", 0b11, 0b100, 0b0010, 0b0110, 0b000> {
+ let Requires = [{ {AArch64::HasV8_0aOps} }];
+}
// v8.4a "Virtualization timer" registers
// Op0 Op1 CRn CRm Op2
@@ -1411,12 +1472,6 @@ def : RWSysReg<"CNTHPS_CTL_EL2", 0b11, 0b100, 0b1110, 0b0101, 0b001>;
def : RWSysReg<"SDER32_EL2", 0b11, 0b100, 0b0001, 0b0011, 0b001>;
} // FeatureSEL2
-// v8.4a PMU registers
-// Op0 Op1 CRn CRm Op2
-let Requires = [{ {AArch64::FeaturePMU} }] in {
-def : RWSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>;
-} // FeaturePMU
-
// v8.4a RAS registers
// Op0 Op1 CRn CRm Op2
def : RWSysReg<"ERXPFGCTL_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b101>;
@@ -1640,7 +1695,7 @@ def : RWSysReg<"PMSNEVFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b001>;
// Cyclone specific system registers
// Op0 Op1 CRn CRm Op2
-let Requires = [{ {AArch64::ProcAppleA7} }] in
+let Requires = [{ {AArch64::FeatureAppleA7SysReg} }] in
def : RWSysReg<"CPM_IOACC_CTL_EL3", 0b11, 0b111, 0b1111, 0b0010, 0b000>;
// Scalable Matrix Extension (SME)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 99bcb2f4649a..ce26c62af61a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -25,6 +25,7 @@
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/LoadStoreOpt.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
@@ -36,10 +37,10 @@
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/CFGuard.h"
@@ -175,6 +176,16 @@ static cl::opt<unsigned> SVEVectorBitsMinOpt(
extern cl::opt<bool> EnableHomogeneousPrologEpilog;
+static cl::opt<bool> EnableGISelLoadStoreOptPreLegal(
+ "aarch64-enable-gisel-ldst-prelegal",
+ cl::desc("Enable GlobalISel's pre-legalizer load/store optimization pass"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool> EnableGISelLoadStoreOptPostLegal(
+ "aarch64-enable-gisel-ldst-postlegal",
+ cl::desc("Enable GlobalISel's post-legalizer load/store optimization pass"),
+ cl::init(false), cl::Hidden);
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
// Register the target.
RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
@@ -195,6 +206,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
initializeAArch64DeadRegisterDefinitionsPass(*PR);
initializeAArch64ExpandPseudoPass(*PR);
initializeAArch64LoadStoreOptPass(*PR);
+ initializeAArch64MIPeepholeOptPass(*PR);
initializeAArch64SIMDInstrOptPass(*PR);
initializeAArch64O0PreLegalizerCombinerPass(*PR);
initializeAArch64PreLegalizerCombinerPass(*PR);
@@ -354,10 +366,13 @@ AArch64TargetMachine::~AArch64TargetMachine() = default;
const AArch64Subtarget *
AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
+ Attribute TuneAttr = F.getFnAttribute("tune-cpu");
Attribute FSAttr = F.getFnAttribute("target-features");
std::string CPU =
CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
+ std::string TuneCPU =
+ TuneAttr.isValid() ? TuneAttr.getValueAsString().str() : CPU;
std::string FS =
FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
@@ -398,6 +413,7 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
Key += "SVEMax";
Key += std::to_string(MaxSVEVectorSize);
Key += CPU;
+ Key += TuneCPU;
Key += FS;
auto &I = SubtargetMap[Key];
@@ -406,8 +422,8 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = std::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
- isLittle, MinSVEVectorSize,
+ I = std::make_unique<AArch64Subtarget>(TargetTriple, CPU, TuneCPU, FS,
+ *this, isLittle, MinSVEVectorSize,
MaxSVEVectorSize);
}
return I.get();
@@ -471,6 +487,7 @@ public:
void addIRPasses() override;
bool addPreISel() override;
+ void addCodeGenPrepare() override;
bool addInstSelector() override;
bool addIRTranslator() override;
void addPreLegalizeMachineIR() override;
@@ -479,6 +496,7 @@ public:
bool addRegBankSelect() override;
void addPreGlobalInstructionSelect() override;
bool addGlobalInstructionSelect() override;
+ void addMachineSSAOptimization() override;
bool addILPOpts() override;
void addPreRegAlloc() override;
void addPostRegAlloc() override;
@@ -597,6 +615,12 @@ bool AArch64PassConfig::addPreISel() {
return false;
}
+void AArch64PassConfig::addCodeGenPrepare() {
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createTypePromotionPass());
+ TargetPassConfig::addCodeGenPrepare();
+}
+
bool AArch64PassConfig::addInstSelector() {
addPass(createAArch64ISelDag(getAArch64TargetMachine(), getOptLevel()));
@@ -617,8 +641,11 @@ bool AArch64PassConfig::addIRTranslator() {
void AArch64PassConfig::addPreLegalizeMachineIR() {
if (getOptLevel() == CodeGenOpt::None)
addPass(createAArch64O0PreLegalizerCombiner());
- else
+ else {
addPass(createAArch64PreLegalizerCombiner());
+ if (EnableGISelLoadStoreOptPreLegal)
+ addPass(new LoadStoreOpt());
+ }
}
bool AArch64PassConfig::addLegalizeMachineIR() {
@@ -628,8 +655,11 @@ bool AArch64PassConfig::addLegalizeMachineIR() {
void AArch64PassConfig::addPreRegBankSelect() {
bool IsOptNone = getOptLevel() == CodeGenOpt::None;
- if (!IsOptNone)
+ if (!IsOptNone) {
addPass(createAArch64PostLegalizerCombiner(IsOptNone));
+ if (EnableGISelLoadStoreOptPostLegal)
+ addPass(new LoadStoreOpt());
+ }
addPass(createAArch64PostLegalizerLowering());
}
@@ -649,6 +679,14 @@ bool AArch64PassConfig::addGlobalInstructionSelect() {
return false;
}
+void AArch64PassConfig::addMachineSSAOptimization() {
+ // Run default MachineSSAOptimization first.
+ TargetPassConfig::addMachineSSAOptimization();
+
+ if (TM->getOptLevel() != CodeGenOpt::None)
+ addPass(createAArch64MIPeepholeOptPass());
+}
+
bool AArch64PassConfig::addILPOpts() {
if (EnableCondOpt)
addPass(createAArch64ConditionOptimizerPass());
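getSubtargetImpl now honours a per-function "tune-cpu" attribute alongside "target-cpu", falling back to the target CPU when the attribute is absent, and keys the subtarget cache on both strings. A short sketch of how a front end or pass might tag a function (illustration only; the CPU names are just examples):

  #include "llvm/IR/Function.h"

  // Illustration only (not part of this patch): select features for one CPU
  // while tuning heuristics for another. getSubtargetImpl() above reads
  // exactly these two string attributes; an absent "tune-cpu" falls back to
  // the target CPU.
  static void tagForTuning(llvm::Function &F) {
    F.addFnAttr("target-cpu", "generic");
    F.addFnAttr("tune-cpu", "neoverse-v1"); // example CPU names
  }
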
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 01236aa6b527..63d6fa5bbb26 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -9,11 +9,13 @@
#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
@@ -220,19 +222,15 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
auto *RetTy = ICA.getReturnType();
switch (ICA.getID()) {
case Intrinsic::umin:
- case Intrinsic::umax: {
- auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
- // umin(x,y) -> sub(x,usubsat(x,y))
- // umax(x,y) -> add(x,usubsat(y,x))
- if (LT.second == MVT::v2i64)
- return LT.first * 2;
- LLVM_FALLTHROUGH;
- }
+ case Intrinsic::umax:
case Intrinsic::smin:
case Intrinsic::smax: {
static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
MVT::v8i16, MVT::v2i32, MVT::v4i32};
auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
+ // v2i64 types get converted to cmp+bif hence the cost of 2
+ if (LT.second == MVT::v2i64)
+ return LT.first * 2;
if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
return LT.first;
break;
@@ -291,13 +289,15 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy);
const auto *Entry =
CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
- // Cost Model is using the legal type(i32) that i8 and i16 will be converted
- // to +1 so that we match the actual lowering cost
- if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
- TLI->getValueType(DL, RetTy, true) == MVT::i16)
- return LegalisationCost.first * Entry->Cost + 1;
- if (Entry)
+ if (Entry) {
+ // Cost Model is using the legal type(i32) that i8 and i16 will be
+ // converted to +1 so that we match the actual lowering cost
+ if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
+ TLI->getValueType(DL, RetTy, true) == MVT::i16)
+ return LegalisationCost.first * Entry->Cost + 1;
+
return LegalisationCost.first * Entry->Cost;
+ }
break;
}
case Intrinsic::ctpop: {
@@ -440,6 +440,18 @@ static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
return IC.replaceInstUsesWith(II, Insert);
}
+static Optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
+ IntrinsicInst &II) {
+ // Replace DupX with a regular IR splat.
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
+ auto *RetTy = cast<ScalableVectorType>(II.getType());
+ Value *Splat =
+ Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0));
+ Splat->takeName(&II);
+ return IC.replaceInstUsesWith(II, Splat);
+}
+
static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
IntrinsicInst &II) {
LLVMContext &Ctx = II.getContext();
@@ -457,12 +469,9 @@ static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
return None;
// Check that we have a compare of zero..
- auto *DupX = dyn_cast<IntrinsicInst>(II.getArgOperand(2));
- if (!DupX || DupX->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
- return None;
-
- auto *DupXArg = dyn_cast<ConstantInt>(DupX->getArgOperand(0));
- if (!DupXArg || !DupXArg->isZero())
+ auto *SplatValue =
+ dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
+ if (!SplatValue || !SplatValue->isZero())
return None;
// ..against a dupq
@@ -547,14 +556,34 @@ static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
IntrinsicInst &II) {
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
Value *Pg = II.getArgOperand(0);
Value *Vec = II.getArgOperand(1);
- bool IsAfter = II.getIntrinsicID() == Intrinsic::aarch64_sve_lasta;
+ auto IntrinsicID = II.getIntrinsicID();
+ bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
// lastX(splat(X)) --> X
if (auto *SplatVal = getSplatValue(Vec))
return IC.replaceInstUsesWith(II, SplatVal);
+ // If x and/or y is a splat value then:
+ // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
+ Value *LHS, *RHS;
+ if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
+ if (isSplatValue(LHS) || isSplatValue(RHS)) {
+ auto *OldBinOp = cast<BinaryOperator>(Vec);
+ auto OpC = OldBinOp->getOpcode();
+ auto *NewLHS =
+ Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
+ auto *NewRHS =
+ Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
+ auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
+ OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
+ return IC.replaceInstUsesWith(II, NewBinOp);
+ }
+ }
+
auto *C = dyn_cast<Constant>(Pg);
if (IsAfter && C && C->isNullValue()) {
// The intrinsic is extracting lane 0 so use an extract instead.
@@ -576,39 +605,11 @@ static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
// Can the intrinsic's predicate be converted to a known constant index?
- unsigned Idx;
- switch (PTruePattern) {
- default:
+ unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
+ if (!MinNumElts)
return None;
- case AArch64SVEPredPattern::vl1:
- Idx = 0;
- break;
- case AArch64SVEPredPattern::vl2:
- Idx = 1;
- break;
- case AArch64SVEPredPattern::vl3:
- Idx = 2;
- break;
- case AArch64SVEPredPattern::vl4:
- Idx = 3;
- break;
- case AArch64SVEPredPattern::vl5:
- Idx = 4;
- break;
- case AArch64SVEPredPattern::vl6:
- Idx = 5;
- break;
- case AArch64SVEPredPattern::vl7:
- Idx = 6;
- break;
- case AArch64SVEPredPattern::vl8:
- Idx = 7;
- break;
- case AArch64SVEPredPattern::vl16:
- Idx = 15;
- break;
- }
+ unsigned Idx = MinNumElts - 1;
// Increment the index if extracting the element after the last active
// predicate element.
if (IsAfter)
@@ -661,26 +662,9 @@ instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
return IC.replaceInstUsesWith(II, VScale);
}
- unsigned MinNumElts = 0;
- switch (Pattern) {
- default:
- return None;
- case AArch64SVEPredPattern::vl1:
- case AArch64SVEPredPattern::vl2:
- case AArch64SVEPredPattern::vl3:
- case AArch64SVEPredPattern::vl4:
- case AArch64SVEPredPattern::vl5:
- case AArch64SVEPredPattern::vl6:
- case AArch64SVEPredPattern::vl7:
- case AArch64SVEPredPattern::vl8:
- MinNumElts = Pattern;
- break;
- case AArch64SVEPredPattern::vl16:
- MinNumElts = 16;
- break;
- }
+ unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
- return NumElts >= MinNumElts
+ return MinNumElts && NumElts >= MinNumElts
? Optional<Instruction *>(IC.replaceInstUsesWith(
II, ConstantInt::get(II.getType(), MinNumElts)))
: None;
@@ -711,6 +695,116 @@ static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
return None;
}
+static Optional<Instruction *> instCombineSVEVectorFMLA(InstCombiner &IC,
+ IntrinsicInst &II) {
+ // fold (fadd p a (fmul p b c)) -> (fma p a b c)
+ Value *P = II.getOperand(0);
+ Value *A = II.getOperand(1);
+ auto FMul = II.getOperand(2);
+ Value *B, *C;
+ if (!match(FMul, m_Intrinsic<Intrinsic::aarch64_sve_fmul>(
+ m_Specific(P), m_Value(B), m_Value(C))))
+ return None;
+
+ if (!FMul->hasOneUse())
+ return None;
+
+ llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
+ // Stop the combine when the flags on the inputs differ in case dropping flags
+ // would lead to us missing out on more beneficial optimizations.
+ if (FAddFlags != cast<CallInst>(FMul)->getFastMathFlags())
+ return None;
+ if (!FAddFlags.allowContract())
+ return None;
+
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
+ auto FMLA = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla,
+ {II.getType()}, {P, A, B, C}, &II);
+ FMLA->setFastMathFlags(FAddFlags);
+ return IC.replaceInstUsesWith(II, FMLA);
+}
+
+static Optional<Instruction *>
+instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
+
+ Value *Pred = II.getOperand(0);
+ Value *PtrOp = II.getOperand(1);
+ Type *VecTy = II.getType();
+ Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo());
+
+ if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
+ m_ConstantInt<AArch64SVEPredPattern::all>()))) {
+ LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr);
+ return IC.replaceInstUsesWith(II, Load);
+ }
+
+ CallInst *MaskedLoad =
+ Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL),
+ Pred, ConstantAggregateZero::get(VecTy));
+ return IC.replaceInstUsesWith(II, MaskedLoad);
+}
+
+static Optional<Instruction *>
+instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
+
+ Value *VecOp = II.getOperand(0);
+ Value *Pred = II.getOperand(1);
+ Value *PtrOp = II.getOperand(2);
+ Value *VecPtr =
+ Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo());
+
+ if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
+ m_ConstantInt<AArch64SVEPredPattern::all>()))) {
+ Builder.CreateStore(VecOp, VecPtr);
+ return IC.eraseInstFromFunction(II);
+ }
+
+ Builder.CreateMaskedStore(VecOp, VecPtr, PtrOp->getPointerAlignment(DL),
+ Pred);
+ return IC.eraseInstFromFunction(II);
+}
+
+static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
+ switch (Intrinsic) {
+ case Intrinsic::aarch64_sve_fmul:
+ return Instruction::BinaryOps::FMul;
+ case Intrinsic::aarch64_sve_fadd:
+ return Instruction::BinaryOps::FAdd;
+ case Intrinsic::aarch64_sve_fsub:
+ return Instruction::BinaryOps::FSub;
+ default:
+ return Instruction::BinaryOpsEnd;
+ }
+}
+
+static Optional<Instruction *> instCombineSVEVectorBinOp(InstCombiner &IC,
+ IntrinsicInst &II) {
+ auto *OpPredicate = II.getOperand(0);
+ auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
+ if (BinOpCode == Instruction::BinaryOpsEnd ||
+ !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
+ m_ConstantInt<AArch64SVEPredPattern::all>())))
+ return None;
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
+ Builder.setFastMathFlags(II.getFastMathFlags());
+ auto BinOp =
+ Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
+ return IC.replaceInstUsesWith(II, BinOp);
+}
+
+static Optional<Instruction *> instCombineSVEVectorFAdd(InstCombiner &IC,
+ IntrinsicInst &II) {
+ if (auto FMLA = instCombineSVEVectorFMLA(IC, II))
+ return FMLA;
+ return instCombineSVEVectorBinOp(IC, II);
+}
+
static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
IntrinsicInst &II) {
auto *OpPredicate = II.getOperand(0);
@@ -720,14 +814,11 @@ static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
IRBuilder<> Builder(II.getContext());
Builder.SetInsertPoint(&II);
- // Return true if a given instruction is an aarch64_sve_dup_x intrinsic call
- // with a unit splat value, false otherwise.
- auto IsUnitDupX = [](auto *I) {
- auto *IntrI = dyn_cast<IntrinsicInst>(I);
- if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
+ // Return true if a given instruction is a unit splat value, false otherwise.
+ auto IsUnitSplat = [](auto *I) {
+ auto *SplatValue = getSplatValue(I);
+ if (!SplatValue)
return false;
-
- auto *SplatValue = IntrI->getOperand(0);
return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
};
@@ -744,10 +835,10 @@ static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
// The OpMultiplier variable should always point to the dup (if any), so
// swap if necessary.
- if (IsUnitDup(OpMultiplicand) || IsUnitDupX(OpMultiplicand))
+ if (IsUnitDup(OpMultiplicand) || IsUnitSplat(OpMultiplicand))
std::swap(OpMultiplier, OpMultiplicand);
- if (IsUnitDupX(OpMultiplier)) {
+ if (IsUnitSplat(OpMultiplier)) {
// [f]mul pg (dupx 1) %n => %n
OpMultiplicand->takeName(&II);
return IC.replaceInstUsesWith(II, OpMultiplicand);
@@ -763,22 +854,40 @@ static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
}
}
- return None;
+ return instCombineSVEVectorBinOp(IC, II);
}
+static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
+ IntrinsicInst &II) {
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
+ Value *UnpackArg = II.getArgOperand(0);
+ auto *RetTy = cast<ScalableVectorType>(II.getType());
+ bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
+ II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
+
+ // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
+ // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
+ if (auto *ScalarArg = getSplatValue(UnpackArg)) {
+ ScalarArg =
+ Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
+ Value *NewVal =
+ Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
+ NewVal->takeName(&II);
+ return IC.replaceInstUsesWith(II, NewVal);
+ }
+
+ return None;
+}
static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
IntrinsicInst &II) {
auto *OpVal = II.getOperand(0);
auto *OpIndices = II.getOperand(1);
VectorType *VTy = cast<VectorType>(II.getType());
- // Check whether OpIndices is an aarch64_sve_dup_x intrinsic call with
- // constant splat value < minimal element count of result.
- auto *DupXIntrI = dyn_cast<IntrinsicInst>(OpIndices);
- if (!DupXIntrI || DupXIntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
- return None;
-
- auto *SplatValue = dyn_cast<ConstantInt>(DupXIntrI->getOperand(0));
+ // Check whether OpIndices is a constant splat value < minimal element count
+ // of result.
+ auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
if (!SplatValue ||
SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
return None;
@@ -795,6 +904,115 @@ static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
return IC.replaceInstUsesWith(II, VectorSplat);
}
+static Optional<Instruction *> instCombineSVETupleGet(InstCombiner &IC,
+ IntrinsicInst &II) {
+ // Try to remove sequences of tuple get/set.
+ Value *SetTuple, *SetIndex, *SetValue;
+ auto *GetTuple = II.getArgOperand(0);
+ auto *GetIndex = II.getArgOperand(1);
+ // Check that we have tuple_get(GetTuple, GetIndex) where GetTuple is a
+ // call to tuple_set i.e. tuple_set(SetTuple, SetIndex, SetValue).
+ // Make sure that the types of the current intrinsic and SetValue match
+ // in order to safely remove the sequence.
+ if (!match(GetTuple,
+ m_Intrinsic<Intrinsic::aarch64_sve_tuple_set>(
+ m_Value(SetTuple), m_Value(SetIndex), m_Value(SetValue))) ||
+ SetValue->getType() != II.getType())
+ return None;
+ // Case where we get the same index right after setting it.
+ // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex) --> SetValue
+ if (GetIndex == SetIndex)
+ return IC.replaceInstUsesWith(II, SetValue);
+ // If we are getting a different index than what was set in the tuple_set
+ // intrinsic. We can just set the input tuple to the one up in the chain.
+ // tuple_get(tuple_set(SetTuple, SetIndex, SetValue), GetIndex)
+ // --> tuple_get(SetTuple, GetIndex)
+ return IC.replaceOperand(II, 0, SetTuple);
+}
+
+static Optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
+ IntrinsicInst &II) {
+ // zip1(uzp1(A, B), uzp2(A, B)) --> A
+ // zip2(uzp1(A, B), uzp2(A, B)) --> B
+ Value *A, *B;
+ if (match(II.getArgOperand(0),
+ m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
+ match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
+ m_Specific(A), m_Specific(B))))
+ return IC.replaceInstUsesWith(
+ II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
+
+ return None;
+}
+
+static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC,
+ IntrinsicInst &II) {
+ Value *Mask = II.getOperand(0);
+ Value *BasePtr = II.getOperand(1);
+ Value *Index = II.getOperand(2);
+ Type *Ty = II.getType();
+ Type *BasePtrTy = BasePtr->getType();
+ Value *PassThru = ConstantAggregateZero::get(Ty);
+
+ // Contiguous gather => masked load.
+ // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
+ // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
+ Value *IndexBase;
+ if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
+ m_Value(IndexBase), m_SpecificInt(1)))) {
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
+
+ Align Alignment =
+ BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
+
+ Type *VecPtrTy = PointerType::getUnqual(Ty);
+ Value *Ptr = Builder.CreateGEP(BasePtrTy->getPointerElementType(), BasePtr,
+ IndexBase);
+ Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
+ CallInst *MaskedLoad =
+ Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
+ MaskedLoad->takeName(&II);
+ return IC.replaceInstUsesWith(II, MaskedLoad);
+ }
+
+ return None;
+}
+
+static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC,
+ IntrinsicInst &II) {
+ Value *Val = II.getOperand(0);
+ Value *Mask = II.getOperand(1);
+ Value *BasePtr = II.getOperand(2);
+ Value *Index = II.getOperand(3);
+ Type *Ty = Val->getType();
+ Type *BasePtrTy = BasePtr->getType();
+
+ // Contiguous scatter => masked store.
+ // (sve.ld1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
+ // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
+ Value *IndexBase;
+ if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
+ m_Value(IndexBase), m_SpecificInt(1)))) {
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
+
+ Align Alignment =
+ BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
+
+ Value *Ptr = Builder.CreateGEP(BasePtrTy->getPointerElementType(), BasePtr,
+ IndexBase);
+ Type *VecPtrTy = PointerType::getUnqual(Ty);
+ Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
+
+ (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
+
+ return IC.eraseInstFromFunction(II);
+ }
+
+ return None;
+}
+
Optional<Instruction *>
AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) const {
@@ -806,6 +1024,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
return instCombineConvertFromSVBool(IC, II);
case Intrinsic::aarch64_sve_dup:
return instCombineSVEDup(IC, II);
+ case Intrinsic::aarch64_sve_dup_x:
+ return instCombineSVEDupX(IC, II);
case Intrinsic::aarch64_sve_cmpne:
case Intrinsic::aarch64_sve_cmpne_wide:
return instCombineSVECmpNE(IC, II);
@@ -829,8 +1049,30 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
case Intrinsic::aarch64_sve_mul:
case Intrinsic::aarch64_sve_fmul:
return instCombineSVEVectorMul(IC, II);
+ case Intrinsic::aarch64_sve_fadd:
+ return instCombineSVEVectorFAdd(IC, II);
+ case Intrinsic::aarch64_sve_fsub:
+ return instCombineSVEVectorBinOp(IC, II);
case Intrinsic::aarch64_sve_tbl:
return instCombineSVETBL(IC, II);
+ case Intrinsic::aarch64_sve_uunpkhi:
+ case Intrinsic::aarch64_sve_uunpklo:
+ case Intrinsic::aarch64_sve_sunpkhi:
+ case Intrinsic::aarch64_sve_sunpklo:
+ return instCombineSVEUnpack(IC, II);
+ case Intrinsic::aarch64_sve_tuple_get:
+ return instCombineSVETupleGet(IC, II);
+ case Intrinsic::aarch64_sve_zip1:
+ case Intrinsic::aarch64_sve_zip2:
+ return instCombineSVEZip(IC, II);
+ case Intrinsic::aarch64_sve_ld1_gather_index:
+ return instCombineLD1GatherIndex(IC, II);
+ case Intrinsic::aarch64_sve_st1_scatter_index:
+ return instCombineST1ScatterIndex(IC, II);
+ case Intrinsic::aarch64_sve_ld1:
+ return instCombineSVELD1(IC, II, DL);
+ case Intrinsic::aarch64_sve_st1:
+ return instCombineSVEST1(IC, II, DL);
}
return None;
@@ -1393,9 +1635,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
return (Cost + 1) * LT.first;
case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FNEG:
// These nodes are marked as 'custom' just to lower them to SVE.
// We know said lowering will incur no additional cost.
- if (isa<FixedVectorType>(Ty) && !Ty->getScalarType()->isFP128Ty())
+ if (!Ty->getScalarType()->isFP128Ty())
return (Cost + 2) * LT.first;
return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
@@ -1525,8 +1771,7 @@ AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
-
- if (!isa<ScalableVectorType>(DataTy))
+ if (useNeonVector(DataTy))
return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
Alignment, CostKind, I);
auto *VT = cast<VectorType>(DataTy);
@@ -1623,9 +1868,10 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
// ldN/stN only support legal vector types of size 64 or 128 in bits.
// Accesses having vector types that are a multiple of 128 bits can be
// matched to more than one ldN/stN instruction.
+ bool UseScalable;
if (NumElts % Factor == 0 &&
- TLI->isLegalInterleavedAccessType(SubVecTy, DL))
- return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
+ TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
+ return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
@@ -1705,9 +1951,12 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP) {
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE) {
// Enable partial unrolling and runtime unrolling.
- BaseT::getUnrollingPreferences(L, SE, UP);
+ BaseT::getUnrollingPreferences(L, SE, UP, ORE);
+
+ UP.UpperBound = true;
// For inner loop, it is more likely to be a hot one, and the runtime check
// can be promoted out from LICM pass, so the overhead is less, let's try
@@ -1749,7 +1998,6 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
!ST->getSchedModel().isOutOfOrder()) {
UP.Runtime = true;
UP.Partial = true;
- UP.UpperBound = true;
UP.UnrollRemainder = true;
UP.DefaultUnrollRuntimeCount = 4;
@@ -1775,7 +2023,7 @@ Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
StructType *ST = dyn_cast<StructType>(ExpectedType);
if (!ST)
return nullptr;
- unsigned NumElts = Inst->getNumArgOperands() - 1;
+ unsigned NumElts = Inst->arg_size() - 1;
if (ST->getNumElements() != NumElts)
return nullptr;
for (unsigned i = 0, e = NumElts; i != e; ++i) {
@@ -1816,7 +2064,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
case Intrinsic::aarch64_neon_st4:
Info.ReadMem = false;
Info.WriteMem = true;
- Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
+ Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
break;
}
@@ -1892,6 +2140,8 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
case RecurKind::UMax:
case RecurKind::FMin:
case RecurKind::FMax:
+ case RecurKind::SelectICmp:
+ case RecurKind::SelectFCmp:
return true;
default:
return false;
@@ -1902,23 +2152,23 @@ InstructionCost
AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
bool IsUnsigned,
TTI::TargetCostKind CostKind) {
- if (!isa<ScalableVectorType>(Ty))
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+
+ if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
- assert((isa<ScalableVectorType>(Ty) && isa<ScalableVectorType>(CondTy)) &&
- "Both vector needs to be scalable");
- std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+  assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) &&
+         "Both vectors need to be equally scalable");
+
InstructionCost LegalizationCost = 0;
if (LT.first > 1) {
Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
- unsigned CmpOpcode =
- Ty->isFPOrFPVectorTy() ? Instruction::FCmp : Instruction::ICmp;
- LegalizationCost =
- getCmpSelInstrCost(CmpOpcode, LegalVTy, LegalVTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind) +
- getCmpSelInstrCost(Instruction::Select, LegalVTy, LegalVTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
- LegalizationCost *= LT.first - 1;
+ unsigned MinMaxOpcode =
+ Ty->isFPOrFPVectorTy()
+ ? Intrinsic::maxnum
+ : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin);
+ IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy});
+ LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
}
return LegalizationCost + /*Cost of horizontal reduction*/ 2;
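
The legalization overhead is now priced as one min/max intrinsic per extra legal chunk instead of a compare plus a select, with the trailing 2 covering the final horizontal reduction. The arithmetic as a standalone sketch, with plain integers standing in for the cost values:

    #include <cstdint>

    // Hypothetical inputs: NumLegalParts is LT.first, MinMaxCost the cost of
    // one umin/smin/maxnum intrinsic on the legalized type.
    uint64_t minMaxReductionCost(uint64_t NumLegalParts, uint64_t MinMaxCost) {
      uint64_t LegalizationCost = 0;
      if (NumLegalParts > 1)
        LegalizationCost = MinMaxCost * (NumLegalParts - 1);
      return LegalizationCost + 2; // cost of the final horizontal reduction
    }
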
@@ -1954,8 +2204,13 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
Optional<FastMathFlags> FMF,
TTI::TargetCostKind CostKind) {
if (TTI::requiresOrderedReduction(FMF)) {
- if (!isa<ScalableVectorType>(ValTy))
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
+ if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
+ InstructionCost BaseCost =
+ BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
+ // Add on extra cost to reflect the extra overhead on some CPUs. We still
+ // end up vectorizing for more computationally intensive loops.
+ return BaseCost + FixedVTy->getNumElements();
+ }
if (Opcode != Instruction::FAdd)
return InstructionCost::getInvalid();
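
For strict (in-order) FP reductions, fixed-width vectors now take the generic estimate plus one unit per element, biasing the vectorizer towards the more computationally intensive loops, while non-FAdd scalable reductions remain invalid. A hedged sketch of that branch; the trailing scalable-FAdd case is a placeholder since the rest of the function is not shown in this hunk:

    #include <cstdint>
    #include <optional>

    std::optional<uint64_t> orderedReductionCost(bool IsFixedVector,
                                                 uint64_t NumElements,
                                                 uint64_t GenericCost,
                                                 bool IsFAdd) {
      if (IsFixedVector)
        return GenericCost + NumElements; // extra per-element overhead on some CPUs
      if (!IsFAdd)
        return std::nullopt;              // mirrors InstructionCost::getInvalid()
      return GenericCost;                 // placeholder: SVE strict-FAdd costing continues below
    }
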
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index d55fd5b4f815..d1e8cd204b3a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -125,10 +125,8 @@ public:
return ST->getMinVectorRegisterBitWidth();
}
- Optional<unsigned> getMaxVScale() const {
- if (ST->hasSVE())
- return AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
- return BaseT::getMaxVScale();
+ Optional<unsigned> getVScaleForTuning() const {
+ return ST->getVScaleForTuning();
}
/// Try to return an estimate cost factor that can be used as a multiplier
@@ -138,9 +136,8 @@ public:
unsigned getMaxNumElements(ElementCount VF) const {
if (!VF.isScalable())
return VF.getFixedValue();
- Optional<unsigned> MaxNumVScale = getMaxVScale();
- assert(MaxNumVScale && "Expected valid max vscale value");
- return *MaxNumVScale * VF.getKnownMinValue();
+
+ return VF.getKnownMinValue() * ST->getVScaleForTuning();
}
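
getMaxNumElements now scales a scalable VF by the subtarget's vscale-for-tuning value rather than the architectural maximum vscale. A standalone sketch of the calculation, with a small struct standing in for ElementCount:

    #include <cstdint>

    // Hypothetical ElementCount stand-in.
    struct ElemCount {
      uint64_t KnownMin;
      bool Scalable;
    };

    uint64_t maxNumElements(ElemCount VF, uint64_t VScaleForTuning) {
      if (!VF.Scalable)
        return VF.KnownMin;                 // fixed VF: the count is exact
      return VF.KnownMin * VScaleForTuning; // scalable VF: estimate via tuning vscale
    }
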
unsigned getMaxInterleaveFactor(unsigned VF);
@@ -180,8 +177,7 @@ public:
InstructionCost getSpliceCost(VectorType *Tp, int Index);
InstructionCost getArithmeticInstrCost(
- unsigned Opcode, Type *Ty,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -209,7 +205,8 @@ public:
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP);
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE);
void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP);
@@ -229,7 +226,7 @@ public:
if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())
return true;
- if (Ty->isIntegerTy(1) || Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
+ if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
Ty->isIntegerTy(32) || Ty->isIntegerTy(64))
return true;
@@ -244,8 +241,7 @@ public:
if (isa<FixedVectorType>(DataType) && !ST->useSVEForFixedLengthVectors())
return false; // Fall back to scalarization of masked operations.
- return !DataType->getScalarType()->isIntegerTy(1) &&
- isElementTypeLegalForScalableVector(DataType->getScalarType());
+ return isElementTypeLegalForScalableVector(DataType->getScalarType());
}
bool isLegalMaskedLoad(Type *DataType, Align Alignment) {
@@ -266,8 +262,7 @@ public:
DataTypeFVTy->getNumElements() < 2))
return false;
- return !DataType->getScalarType()->isIntegerTy(1) &&
- isElementTypeLegalForScalableVector(DataType->getScalarType());
+ return isElementTypeLegalForScalableVector(DataType->getScalarType());
}
bool isLegalMaskedGather(Type *DataType, Align Alignment) const {
@@ -295,10 +290,11 @@ public:
return BaseT::isLegalNTStore(DataType, Alignment);
}
+ bool enableOrderedReductions() const { return true; }
+
InstructionCost getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
- Align Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
bool
@@ -316,9 +312,9 @@ public:
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc,
ElementCount VF) const;
- InstructionCost getArithmeticReductionCost(
- unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
+ InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ Optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind);
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask, int Index,
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index f27e9b2ef0f0..6d3aea2721de 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -6,13 +6,13 @@
//
//===----------------------------------------------------------------------===//
+#include "AArch64InstrInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64InstPrinter.h"
#include "MCTargetDesc/AArch64MCExpr.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "MCTargetDesc/AArch64TargetStreamer.h"
#include "TargetInfo/AArch64TargetInfo.h"
-#include "AArch64InstrInfo.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
@@ -40,15 +40,15 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCTargetOptions.h"
-#include "llvm/MC/SubtargetFeature.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/TargetParser.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cctype>
@@ -1511,7 +1511,7 @@ public:
}
bool isAdrpLabel() const {
- // Validation was handled during parsing, so we just sanity check that
+ // Validation was handled during parsing, so we just verify that
// something didn't go haywire.
if (!isImm())
return false;
@@ -1527,7 +1527,7 @@ public:
}
bool isAdrLabel() const {
- // Validation was handled during parsing, so we just sanity check that
+ // Validation was handled during parsing, so we just verify that
// something didn't go haywire.
if (!isImm())
return false;
@@ -2672,8 +2672,7 @@ unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,
/// the register is added to the operand list.
OperandMatchResultTy
AArch64AsmParser::tryParseScalarRegister(unsigned &RegNum) {
- MCAsmParser &Parser = getParser();
- const AsmToken &Tok = Parser.getTok();
+ const AsmToken &Tok = getTok();
if (Tok.isNot(AsmToken::Identifier))
return MatchOperand_NoMatch;
@@ -2683,22 +2682,21 @@ AArch64AsmParser::tryParseScalarRegister(unsigned &RegNum) {
return MatchOperand_NoMatch;
RegNum = Reg;
- Parser.Lex(); // Eat identifier token.
+ Lex(); // Eat identifier token.
return MatchOperand_Success;
}
/// tryParseSysCROperand - Try to parse a system instruction CR operand name.
OperandMatchResultTy
AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
- if (Parser.getTok().isNot(AsmToken::Identifier)) {
+ if (getTok().isNot(AsmToken::Identifier)) {
Error(S, "Expected cN operand where 0 <= N <= 15");
return MatchOperand_ParseFail;
}
- StringRef Tok = Parser.getTok().getIdentifier();
+ StringRef Tok = getTok().getIdentifier();
if (Tok[0] != 'c' && Tok[0] != 'C') {
Error(S, "Expected cN operand where 0 <= N <= 15");
return MatchOperand_ParseFail;
@@ -2711,7 +2709,7 @@ AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- Parser.Lex(); // Eat identifier token.
+ Lex(); // Eat identifier token.
Operands.push_back(
AArch64Operand::CreateSysCR(CRNum, S, getLoc(), getContext()));
return MatchOperand_Success;
@@ -2721,9 +2719,8 @@ AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
template <bool IsSVEPrefetch>
OperandMatchResultTy
AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
- const AsmToken &Tok = Parser.getTok();
+ const AsmToken &Tok = getTok();
auto LookupByName = [](StringRef N) {
if (IsSVEPrefetch) {
@@ -2783,16 +2780,15 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
Operands.push_back(AArch64Operand::CreatePrefetch(
*PRFM, Tok.getString(), S, getContext()));
- Parser.Lex(); // Eat identifier token.
+ Lex(); // Eat identifier token.
return MatchOperand_Success;
}
/// tryParsePSBHint - Try to parse a PSB operand, mapped to Hint command
OperandMatchResultTy
AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
- const AsmToken &Tok = Parser.getTok();
+ const AsmToken &Tok = getTok();
if (Tok.isNot(AsmToken::Identifier)) {
TokError("invalid operand for instruction");
return MatchOperand_ParseFail;
@@ -2806,16 +2802,15 @@ AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) {
Operands.push_back(AArch64Operand::CreatePSBHint(
PSB->Encoding, Tok.getString(), S, getContext()));
- Parser.Lex(); // Eat identifier token.
+ Lex(); // Eat identifier token.
return MatchOperand_Success;
}
/// tryParseBTIHint - Try to parse a BTI operand, mapped to Hint command
OperandMatchResultTy
AArch64AsmParser::tryParseBTIHint(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
- const AsmToken &Tok = Parser.getTok();
+ const AsmToken &Tok = getTok();
if (Tok.isNot(AsmToken::Identifier)) {
TokError("invalid operand for instruction");
return MatchOperand_ParseFail;
@@ -2829,7 +2824,7 @@ AArch64AsmParser::tryParseBTIHint(OperandVector &Operands) {
Operands.push_back(AArch64Operand::CreateBTIHint(
BTI->Encoding, Tok.getString(), S, getContext()));
- Parser.Lex(); // Eat identifier token.
+ Lex(); // Eat identifier token.
return MatchOperand_Success;
}
@@ -2837,12 +2832,11 @@ AArch64AsmParser::tryParseBTIHint(OperandVector &Operands) {
/// instruction.
OperandMatchResultTy
AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
const MCExpr *Expr = nullptr;
- if (Parser.getTok().is(AsmToken::Hash)) {
- Parser.Lex(); // Eat hash token.
+ if (getTok().is(AsmToken::Hash)) {
+ Lex(); // Eat hash token.
}
if (parseSymbolicImmVal(Expr))
@@ -2894,11 +2888,11 @@ AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
const MCExpr *Expr = nullptr;
// Leave anything with a bracket to the default for SVE
- if (getParser().getTok().is(AsmToken::LBrac))
+ if (getTok().is(AsmToken::LBrac))
return MatchOperand_NoMatch;
- if (getParser().getTok().is(AsmToken::Hash))
- getParser().Lex(); // Eat hash token.
+ if (getTok().is(AsmToken::Hash))
+ Lex(); // Eat hash token.
if (parseSymbolicImmVal(Expr))
return MatchOperand_ParseFail;
@@ -2927,7 +2921,6 @@ AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
template<bool AddFPZeroAsLiteral>
OperandMatchResultTy
AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
bool Hash = parseOptionalToken(AsmToken::Hash);
@@ -2935,7 +2928,7 @@ AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
// Handle negation, as that still comes through as a separate token.
bool isNegative = parseOptionalToken(AsmToken::Minus);
- const AsmToken &Tok = Parser.getTok();
+ const AsmToken &Tok = getTok();
if (!Tok.is(AsmToken::Real) && !Tok.is(AsmToken::Integer)) {
if (!Hash)
return MatchOperand_NoMatch;
@@ -2974,7 +2967,7 @@ AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
RealVal, *StatusOrErr == APFloat::opOK, S, getContext()));
}
- Parser.Lex(); // Eat the token.
+ Lex(); // Eat the token.
return MatchOperand_Success;
}
@@ -2983,51 +2976,50 @@ AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
/// a shift suffix, for example '#1, lsl #12'.
OperandMatchResultTy
AArch64AsmParser::tryParseImmWithOptionalShift(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
- if (Parser.getTok().is(AsmToken::Hash))
- Parser.Lex(); // Eat '#'
- else if (Parser.getTok().isNot(AsmToken::Integer))
+ if (getTok().is(AsmToken::Hash))
+ Lex(); // Eat '#'
+ else if (getTok().isNot(AsmToken::Integer))
// Operand should start from # or should be integer, emit error otherwise.
return MatchOperand_NoMatch;
const MCExpr *Imm = nullptr;
if (parseSymbolicImmVal(Imm))
return MatchOperand_ParseFail;
- else if (Parser.getTok().isNot(AsmToken::Comma)) {
+ else if (getTok().isNot(AsmToken::Comma)) {
Operands.push_back(
AArch64Operand::CreateImm(Imm, S, getLoc(), getContext()));
return MatchOperand_Success;
}
// Eat ','
- Parser.Lex();
+ Lex();
// The optional operand must be "lsl #N" where N is non-negative.
- if (!Parser.getTok().is(AsmToken::Identifier) ||
- !Parser.getTok().getIdentifier().equals_insensitive("lsl")) {
+ if (!getTok().is(AsmToken::Identifier) ||
+ !getTok().getIdentifier().equals_insensitive("lsl")) {
Error(getLoc(), "only 'lsl #+N' valid after immediate");
return MatchOperand_ParseFail;
}
// Eat 'lsl'
- Parser.Lex();
+ Lex();
parseOptionalToken(AsmToken::Hash);
- if (Parser.getTok().isNot(AsmToken::Integer)) {
+ if (getTok().isNot(AsmToken::Integer)) {
Error(getLoc(), "only 'lsl #+N' valid after immediate");
return MatchOperand_ParseFail;
}
- int64_t ShiftAmount = Parser.getTok().getIntVal();
+ int64_t ShiftAmount = getTok().getIntVal();
if (ShiftAmount < 0) {
Error(getLoc(), "positive shift amount required");
return MatchOperand_ParseFail;
}
- Parser.Lex(); // Eat the number
+ Lex(); // Eat the number
// Just in case the optional lsl #0 is used for immediates other than zero.
if (ShiftAmount == 0 && Imm != nullptr) {
@@ -3085,16 +3077,15 @@ AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) {
/// parseCondCode - Parse a Condition Code operand.
bool AArch64AsmParser::parseCondCode(OperandVector &Operands,
bool invertCondCode) {
- MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
- const AsmToken &Tok = Parser.getTok();
+ const AsmToken &Tok = getTok();
assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
StringRef Cond = Tok.getString();
AArch64CC::CondCode CC = parseCondCodeString(Cond);
if (CC == AArch64CC::Invalid)
return TokError("invalid condition code");
- Parser.Lex(); // Eat identifier token.
+ Lex(); // Eat identifier token.
if (invertCondCode) {
if (CC == AArch64CC::AL || CC == AArch64CC::NV)
@@ -3109,8 +3100,7 @@ bool AArch64AsmParser::parseCondCode(OperandVector &Operands,
OperandMatchResultTy
AArch64AsmParser::tryParseSVCR(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
- const AsmToken &Tok = Parser.getTok();
+ const AsmToken &Tok = getTok();
SMLoc S = getLoc();
if (Tok.isNot(AsmToken::Identifier)) {
@@ -3125,20 +3115,19 @@ AArch64AsmParser::tryParseSVCR(OperandVector &Operands) {
Operands.push_back(
AArch64Operand::CreateSVCR(PStateImm, Tok.getString(), S, getContext()));
- Parser.Lex(); // Eat identifier token.
+ Lex(); // Eat identifier token.
return MatchOperand_Success;
}
OperandMatchResultTy
AArch64AsmParser::tryParseMatrixRegister(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
- const AsmToken &Tok = Parser.getTok();
+ const AsmToken &Tok = getTok();
SMLoc S = getLoc();
StringRef Name = Tok.getString();
if (Name.equals_insensitive("za")) {
- Parser.Lex(); // eat "za"
+ Lex(); // eat "za"
Operands.push_back(AArch64Operand::CreateMatrixRegister(
AArch64::ZA, /*ElementWidth=*/0, MatrixKind::Array, S, getLoc(),
getContext()));
@@ -3176,7 +3165,7 @@ AArch64AsmParser::tryParseMatrixRegister(OperandVector &Operands) {
}
unsigned ElementWidth = KindRes->second;
- Parser.Lex();
+ Lex();
Operands.push_back(AArch64Operand::CreateMatrixRegister(
Reg, ElementWidth, Kind, S, getLoc(), getContext()));
@@ -3194,8 +3183,7 @@ AArch64AsmParser::tryParseMatrixRegister(OperandVector &Operands) {
/// them if present.
OperandMatchResultTy
AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
- const AsmToken &Tok = Parser.getTok();
+ const AsmToken &Tok = getTok();
std::string LowerID = Tok.getString().lower();
AArch64_AM::ShiftExtendType ShOp =
StringSwitch<AArch64_AM::ShiftExtendType>(LowerID)
@@ -3218,7 +3206,7 @@ AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
return MatchOperand_NoMatch;
SMLoc S = Tok.getLoc();
- Parser.Lex();
+ Lex();
bool Hash = parseOptionalToken(AsmToken::Hash);
@@ -3241,9 +3229,8 @@ AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
// Make sure we do actually have a number, identifier or a parenthesized
// expression.
SMLoc E = getLoc();
- if (!Parser.getTok().is(AsmToken::Integer) &&
- !Parser.getTok().is(AsmToken::LParen) &&
- !Parser.getTok().is(AsmToken::Identifier)) {
+ if (!getTok().is(AsmToken::Integer) && !getTok().is(AsmToken::LParen) &&
+ !getTok().is(AsmToken::Identifier)) {
Error(E, "expected integer shift amount");
return MatchOperand_ParseFail;
}
@@ -3309,6 +3296,8 @@ static const struct Extension {
};
static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
+ if (FBS[AArch64::HasV8_0aOps])
+ Str += "ARMv8a";
if (FBS[AArch64::HasV8_1aOps])
Str += "ARMv8.1a";
else if (FBS[AArch64::HasV8_2aOps])
@@ -3323,6 +3312,14 @@ static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
Str += "ARMv8.6a";
else if (FBS[AArch64::HasV8_7aOps])
Str += "ARMv8.7a";
+ else if (FBS[AArch64::HasV9_0aOps])
+ Str += "ARMv9-a";
+ else if (FBS[AArch64::HasV9_1aOps])
+ Str += "ARMv9.1a";
+ else if (FBS[AArch64::HasV9_2aOps])
+ Str += "ARMv9.2a";
+ else if (FBS[AArch64::HasV8_0rOps])
+ Str += "ARMv8r";
else {
SmallVector<std::string, 2> ExtMatches;
for (const auto& Ext : ExtensionMap) {
@@ -3358,14 +3355,13 @@ void AArch64AsmParser::createSysAlias(uint16_t Encoding, OperandVector &Operands
/// the SYS instruction. Parse them specially so that we create a SYS MCInst.
bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
OperandVector &Operands) {
- if (Name.find('.') != StringRef::npos)
+ if (Name.contains('.'))
return TokError("invalid operand");
Mnemonic = Name;
Operands.push_back(AArch64Operand::CreateToken("sys", NameLoc, getContext()));
- MCAsmParser &Parser = getParser();
- const AsmToken &Tok = Parser.getTok();
+ const AsmToken &Tok = getTok();
StringRef Op = Tok.getString();
SMLoc S = Tok.getLoc();
@@ -3376,7 +3372,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
else if (!IC->haveFeatures(getSTI().getFeatureBits())) {
std::string Str("IC " + std::string(IC->Name) + " requires: ");
setRequiredFeatureString(IC->getRequiredFeatures(), Str);
- return TokError(Str.c_str());
+ return TokError(Str);
}
createSysAlias(IC->Encoding, Operands, S);
} else if (Mnemonic == "dc") {
@@ -3386,7 +3382,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
else if (!DC->haveFeatures(getSTI().getFeatureBits())) {
std::string Str("DC " + std::string(DC->Name) + " requires: ");
setRequiredFeatureString(DC->getRequiredFeatures(), Str);
- return TokError(Str.c_str());
+ return TokError(Str);
}
createSysAlias(DC->Encoding, Operands, S);
} else if (Mnemonic == "at") {
@@ -3396,7 +3392,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
else if (!AT->haveFeatures(getSTI().getFeatureBits())) {
std::string Str("AT " + std::string(AT->Name) + " requires: ");
setRequiredFeatureString(AT->getRequiredFeatures(), Str);
- return TokError(Str.c_str());
+ return TokError(Str);
}
createSysAlias(AT->Encoding, Operands, S);
} else if (Mnemonic == "tlbi") {
@@ -3406,7 +3402,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
else if (!TLBI->haveFeatures(getSTI().getFeatureBits())) {
std::string Str("TLBI " + std::string(TLBI->Name) + " requires: ");
setRequiredFeatureString(TLBI->getRequiredFeatures(), Str);
- return TokError(Str.c_str());
+ return TokError(Str);
}
createSysAlias(TLBI->Encoding, Operands, S);
} else if (Mnemonic == "cfp" || Mnemonic == "dvp" || Mnemonic == "cpp") {
@@ -3417,7 +3413,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
std::string Str(
Mnemonic.upper() + std::string(PRCTX->Name) + " requires: ");
setRequiredFeatureString(PRCTX->getRequiredFeatures(), Str);
- return TokError(Str.c_str());
+ return TokError(Str);
}
uint16_t PRCTX_Op2 =
Mnemonic == "cfp" ? 4 :
@@ -3428,7 +3424,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
createSysAlias(PRCTX->Encoding << 3 | PRCTX_Op2 , Operands, S);
}
- Parser.Lex(); // Eat operand.
+ Lex(); // Eat operand.
bool ExpectRegister = (Op.lower().find("all") == StringRef::npos);
bool HasRegister = false;
@@ -3454,7 +3450,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
OperandMatchResultTy
AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
- const AsmToken &Tok = Parser.getTok();
+ const AsmToken &Tok = getTok();
if (Mnemonic == "tsb" && Tok.isNot(AsmToken::Identifier)) {
TokError("'csync' operand expected");
@@ -3519,15 +3515,14 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
Operands.push_back(AArch64Operand::CreateBarrier(
DB ? DB->Encoding : TSB->Encoding, Tok.getString(), getLoc(),
getContext(), false /*hasnXSModifier*/));
- Parser.Lex(); // Consume the option
+ Lex(); // Consume the option
return MatchOperand_Success;
}
OperandMatchResultTy
AArch64AsmParser::tryParseBarriernXSOperand(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
- const AsmToken &Tok = Parser.getTok();
+ const AsmToken &Tok = getTok();
assert(Mnemonic == "dsb" && "Instruction does not accept nXS operands");
if (Mnemonic != "dsb")
@@ -3574,15 +3569,14 @@ AArch64AsmParser::tryParseBarriernXSOperand(OperandVector &Operands) {
Operands.push_back(
AArch64Operand::CreateBarrier(DB->Encoding, Tok.getString(), getLoc(),
getContext(), true /*hasnXSModifier*/));
- Parser.Lex(); // Consume the option
+ Lex(); // Consume the option
return MatchOperand_Success;
}
OperandMatchResultTy
AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
- const AsmToken &Tok = Parser.getTok();
+ const AsmToken &Tok = getTok();
if (Tok.isNot(AsmToken::Identifier))
return MatchOperand_NoMatch;
@@ -3606,15 +3600,14 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
Operands.push_back(
AArch64Operand::CreateSysReg(Tok.getString(), getLoc(), MRSReg, MSRReg,
PStateImm, getContext()));
- Parser.Lex(); // Eat identifier
+ Lex(); // Eat identifier
return MatchOperand_Success;
}
/// tryParseNeonVectorRegister - Parse a vector register operand.
bool AArch64AsmParser::tryParseNeonVectorRegister(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
- if (Parser.getTok().isNot(AsmToken::Identifier))
+ if (getTok().isNot(AsmToken::Identifier))
return true;
SMLoc S = getLoc();
@@ -3675,8 +3668,7 @@ AArch64AsmParser::tryParseVectorIndex(OperandVector &Operands) {
OperandMatchResultTy
AArch64AsmParser::tryParseVectorRegister(unsigned &Reg, StringRef &Kind,
RegKind MatchKind) {
- MCAsmParser &Parser = getParser();
- const AsmToken &Tok = Parser.getTok();
+ const AsmToken &Tok = getTok();
if (Tok.isNot(AsmToken::Identifier))
return MatchOperand_NoMatch;
@@ -3696,7 +3688,7 @@ AArch64AsmParser::tryParseVectorRegister(unsigned &Reg, StringRef &Kind,
return MatchOperand_ParseFail;
}
}
- Parser.Lex(); // Eat the register token.
+ Lex(); // Eat the register token.
Reg = RegNum;
return MatchOperand_Success;
@@ -3733,8 +3725,7 @@ AArch64AsmParser::tryParseSVEPredicateVector(OperandVector &Operands) {
}
// Not all predicates are followed by a '/m' or '/z'.
- MCAsmParser &Parser = getParser();
- if (Parser.getTok().isNot(AsmToken::Slash))
+ if (getTok().isNot(AsmToken::Slash))
return MatchOperand_Success;
// But when they do they shouldn't have an element type suffix.
@@ -3746,10 +3737,10 @@ AArch64AsmParser::tryParseSVEPredicateVector(OperandVector &Operands) {
// Add a literal slash as operand
Operands.push_back(AArch64Operand::CreateToken("/", getLoc(), getContext()));
- Parser.Lex(); // Eat the slash.
+ Lex(); // Eat the slash.
// Zeroing or merging?
- auto Pred = Parser.getTok().getString().lower();
+ auto Pred = getTok().getString().lower();
if (Pred != "z" && Pred != "m") {
Error(getLoc(), "expecting 'm' or 'z' predication");
return MatchOperand_ParseFail;
@@ -3759,7 +3750,7 @@ AArch64AsmParser::tryParseSVEPredicateVector(OperandVector &Operands) {
const char *ZM = Pred == "z" ? "z" : "m";
Operands.push_back(AArch64Operand::CreateToken(ZM, getLoc(), getContext()));
- Parser.Lex(); // Eat zero/merge token.
+ Lex(); // Eat zero/merge token.
return MatchOperand_Success;
}
@@ -3777,17 +3768,16 @@ bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
}
bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
- MCAsmParser &Parser = getParser();
bool HasELFModifier = false;
AArch64MCExpr::VariantKind RefKind;
if (parseOptionalToken(AsmToken::Colon)) {
HasELFModifier = true;
- if (Parser.getTok().isNot(AsmToken::Identifier))
+ if (getTok().isNot(AsmToken::Identifier))
return TokError("expect relocation specifier in operand after ':'");
- std::string LowerCase = Parser.getTok().getIdentifier().lower();
+ std::string LowerCase = getTok().getIdentifier().lower();
RefKind = StringSwitch<AArch64MCExpr::VariantKind>(LowerCase)
.Case("lo12", AArch64MCExpr::VK_LO12)
.Case("abs_g3", AArch64MCExpr::VK_ABS_G3)
@@ -3840,7 +3830,7 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
if (RefKind == AArch64MCExpr::VK_INVALID)
return TokError("expect relocation specifier in operand after ':'");
- Parser.Lex(); // Eat identifier
+ Lex(); // Eat identifier
if (parseToken(AsmToken::Colon, "expect ':' after relocation specifier"))
return true;
@@ -3857,14 +3847,11 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
OperandMatchResultTy
AArch64AsmParser::tryParseMatrixTileList(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
-
- if (Parser.getTok().isNot(AsmToken::LCurly))
+ if (getTok().isNot(AsmToken::LCurly))
return MatchOperand_NoMatch;
- auto ParseMatrixTile = [this, &Parser](unsigned &Reg,
- unsigned &ElementWidth) {
- StringRef Name = Parser.getTok().getString();
+ auto ParseMatrixTile = [this](unsigned &Reg, unsigned &ElementWidth) {
+ StringRef Name = getTok().getString();
size_t DotPosition = Name.find('.');
if (DotPosition == StringRef::npos)
return MatchOperand_NoMatch;
@@ -3882,13 +3869,13 @@ AArch64AsmParser::tryParseMatrixTileList(OperandVector &Operands) {
}
ElementWidth = KindRes->second;
Reg = RegNum;
- Parser.Lex(); // Eat the register.
+ Lex(); // Eat the register.
return MatchOperand_Success;
};
SMLoc S = getLoc();
- auto LCurly = Parser.getTok();
- Parser.Lex(); // Eat left bracket token.
+ auto LCurly = getTok();
+ Lex(); // Eat left bracket token.
// Empty matrix list
if (parseOptionalToken(AsmToken::RCurly)) {
@@ -3898,8 +3885,8 @@ AArch64AsmParser::tryParseMatrixTileList(OperandVector &Operands) {
}
// Try parse {za} alias early
- if (Parser.getTok().getString().equals_insensitive("za")) {
- Parser.Lex(); // Eat 'za'
+ if (getTok().getString().equals_insensitive("za")) {
+ Lex(); // Eat 'za'
if (parseToken(AsmToken::RCurly, "'}' expected"))
return MatchOperand_ParseFail;
@@ -3914,7 +3901,7 @@ AArch64AsmParser::tryParseMatrixTileList(OperandVector &Operands) {
unsigned FirstReg, ElementWidth;
auto ParseRes = ParseMatrixTile(FirstReg, ElementWidth);
if (ParseRes != MatchOperand_Success) {
- Parser.getLexer().UnLex(LCurly);
+ getLexer().UnLex(LCurly);
return ParseRes;
}
@@ -3974,13 +3961,13 @@ OperandMatchResultTy
AArch64AsmParser::tryParseVectorList(OperandVector &Operands,
bool ExpectMatch) {
MCAsmParser &Parser = getParser();
- if (!Parser.getTok().is(AsmToken::LCurly))
+ if (!getTok().is(AsmToken::LCurly))
return MatchOperand_NoMatch;
// Wrapper around parse function
- auto ParseVector = [this, &Parser](unsigned &Reg, StringRef &Kind, SMLoc Loc,
- bool NoMatchIsError) {
- auto RegTok = Parser.getTok();
+ auto ParseVector = [this](unsigned &Reg, StringRef &Kind, SMLoc Loc,
+ bool NoMatchIsError) {
+ auto RegTok = getTok();
auto ParseRes = tryParseVectorRegister(Reg, Kind, VectorKind);
if (ParseRes == MatchOperand_Success) {
if (parseVectorKind(Kind, VectorKind))
@@ -4000,8 +3987,8 @@ AArch64AsmParser::tryParseVectorList(OperandVector &Operands,
};
SMLoc S = getLoc();
- auto LCurly = Parser.getTok();
- Parser.Lex(); // Eat left bracket token.
+ auto LCurly = getTok();
+ Lex(); // Eat left bracket token.
StringRef Kind;
unsigned FirstReg;
@@ -4117,7 +4104,7 @@ AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
parseOptionalToken(AsmToken::Hash);
- if (getParser().getTok().isNot(AsmToken::Integer)) {
+ if (getTok().isNot(AsmToken::Integer)) {
Error(getLoc(), "index must be absent or #0");
return MatchOperand_ParseFail;
}
@@ -4145,14 +4132,14 @@ AArch64AsmParser::tryParseGPROperand(OperandVector &Operands) {
return Res;
// No shift/extend is the default.
- if (!ParseShiftExtend || getParser().getTok().isNot(AsmToken::Comma)) {
+ if (!ParseShiftExtend || getTok().isNot(AsmToken::Comma)) {
Operands.push_back(AArch64Operand::CreateReg(
RegNum, RegKind::Scalar, StartLoc, getLoc(), getContext(), EqTy));
return MatchOperand_Success;
}
// Eat the comma
- getParser().Lex();
+ Lex();
// Match the shift
SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> ExtOpnd;
@@ -4178,23 +4165,23 @@ bool AArch64AsmParser::parseOptionalMulOperand(OperandVector &Operands) {
bool NextIsVL =
Parser.getLexer().peekTok().getString().equals_insensitive("vl");
bool NextIsHash = Parser.getLexer().peekTok().is(AsmToken::Hash);
- if (!Parser.getTok().getString().equals_insensitive("mul") ||
+ if (!getTok().getString().equals_insensitive("mul") ||
!(NextIsVL || NextIsHash))
return true;
Operands.push_back(
AArch64Operand::CreateToken("mul", getLoc(), getContext()));
- Parser.Lex(); // Eat the "mul"
+ Lex(); // Eat the "mul"
if (NextIsVL) {
Operands.push_back(
AArch64Operand::CreateToken("vl", getLoc(), getContext()));
- Parser.Lex(); // Eat the "vl"
+ Lex(); // Eat the "vl"
return false;
}
if (NextIsHash) {
- Parser.Lex(); // Eat the #
+ Lex(); // Eat the #
SMLoc S = getLoc();
// Parse immediate operand.
@@ -4212,8 +4199,7 @@ bool AArch64AsmParser::parseOptionalMulOperand(OperandVector &Operands) {
}
bool AArch64AsmParser::parseKeywordOperand(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
- auto Tok = Parser.getTok();
+ auto Tok = getTok();
if (Tok.isNot(AsmToken::Identifier))
return true;
@@ -4225,7 +4211,7 @@ bool AArch64AsmParser::parseKeywordOperand(OperandVector &Operands) {
Operands.push_back(
AArch64Operand::CreateToken(Keyword, Tok.getLoc(), getContext()));
- Parser.Lex();
+ Lex();
return false;
}
@@ -4264,7 +4250,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
case AsmToken::LBrac: {
Operands.push_back(
AArch64Operand::CreateToken("[", getLoc(), getContext()));
- Parser.Lex(); // Eat '['
+ Lex(); // Eat '['
// There's no comma after a '[', so we can parse the next operand
// immediately.
@@ -4276,7 +4262,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
Operands.push_back(
AArch64Operand::CreateToken("{", getLoc(), getContext()));
- Parser.Lex(); // Eat '{'
+ Lex(); // Eat '{'
// There's no comma after a '{', so we can parse the next operand
// immediately.
@@ -4332,18 +4318,18 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
// Parse a negative sign
bool isNegative = false;
- if (Parser.getTok().is(AsmToken::Minus)) {
+ if (getTok().is(AsmToken::Minus)) {
isNegative = true;
// We need to consume this token only when we have a Real, otherwise
// we let parseSymbolicImmVal take care of it
if (Parser.getLexer().peekTok().is(AsmToken::Real))
- Parser.Lex();
+ Lex();
}
// The only Real that should come through here is a literal #0.0 for
// the fcmp[e] r, #0.0 instructions. They expect raw token operands,
// so convert the value.
- const AsmToken &Tok = Parser.getTok();
+ const AsmToken &Tok = getTok();
if (Tok.is(AsmToken::Real)) {
APFloat RealVal(APFloat::IEEEdouble(), Tok.getString());
uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
@@ -4353,7 +4339,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
return TokError("unexpected floating point literal");
else if (IntVal != 0 || isNegative)
return TokError("expected floating-point constant #0.0");
- Parser.Lex(); // Eat the token.
+ Lex(); // Eat the token.
Operands.push_back(AArch64Operand::CreateToken("#0", S, getContext()));
Operands.push_back(AArch64Operand::CreateToken(".0", S, getContext()));
@@ -4372,7 +4358,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
SMLoc Loc = getLoc();
if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. ldr r0, =val)
return TokError("unexpected token in operand");
- Parser.Lex(); // Eat '='
+ Lex(); // Eat '='
const MCExpr *SubExprVal;
if (getParser().parseExpression(SubExprVal))
return true;
@@ -4431,11 +4417,10 @@ bool AArch64AsmParser::parseImmExpr(int64_t &Out) {
}
bool AArch64AsmParser::parseComma() {
- if (check(getParser().getTok().isNot(AsmToken::Comma), getLoc(),
- "expected comma"))
+ if (check(getTok().isNot(AsmToken::Comma), getLoc(), "expected comma"))
return true;
// Eat the comma
- getParser().Lex();
+ Lex();
return false;
}
@@ -4507,7 +4492,6 @@ bool AArch64AsmParser::regsEqual(const MCParsedAsmOperand &Op1,
bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
StringRef Name, SMLoc NameLoc,
OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
Name = StringSwitch<StringRef>(Name.lower())
.Case("beq", "b.eq")
.Case("bne", "b.ne")
@@ -4530,8 +4514,8 @@ bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
.Default(Name);
// First check for the AArch64-specific .req directive.
- if (Parser.getTok().is(AsmToken::Identifier) &&
- Parser.getTok().getIdentifier().lower() == ".req") {
+ if (getTok().is(AsmToken::Identifier) &&
+ getTok().getIdentifier().lower() == ".req") {
parseDirectiveReq(Name, NameLoc);
// We always return 'error' for this, as we're done with this
// statement and don't need to match the 'instruction."
@@ -5084,6 +5068,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
return Error(Loc, "index must be a multiple of 8 in range [0, 32760].");
case Match_InvalidMemoryIndexed16:
return Error(Loc, "index must be a multiple of 16 in range [0, 65520].");
+ case Match_InvalidImm0_0:
+ return Error(Loc, "immediate must be 0.");
case Match_InvalidImm0_1:
return Error(Loc, "immediate must be an integer in range [0, 1].");
case Match_InvalidImm0_3:
@@ -5128,6 +5114,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
case Match_InvalidSVECpyImm64:
return Error(Loc, "immediate must be an integer in range [-128, 127] or a "
"multiple of 256 in range [-32768, 32512]");
+ case Match_InvalidIndexRange0_0:
+ return Error(Loc, "expected lane specifier '[0]'");
case Match_InvalidIndexRange1_1:
return Error(Loc, "expected lane specifier '[1]'");
case Match_InvalidIndexRange0_15:
@@ -5256,14 +5244,6 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
return Error(Loc, "invalid predicate register.");
case Match_InvalidSVEPredicate3bAnyReg:
return Error(Loc, "invalid restricted predicate register, expected p0..p7 (without element suffix)");
- case Match_InvalidSVEPredicate3bBReg:
- return Error(Loc, "invalid restricted predicate register, expected p0.b..p7.b");
- case Match_InvalidSVEPredicate3bHReg:
- return Error(Loc, "invalid restricted predicate register, expected p0.h..p7.h");
- case Match_InvalidSVEPredicate3bSReg:
- return Error(Loc, "invalid restricted predicate register, expected p0.s..p7.s");
- case Match_InvalidSVEPredicate3bDReg:
- return Error(Loc, "invalid restricted predicate register, expected p0.d..p7.d");
case Match_InvalidSVEExactFPImmOperandHalfOne:
return Error(Loc, "Invalid floating point constant, expected 0.5 or 1.0.");
case Match_InvalidSVEExactFPImmOperandHalfTwo:
@@ -5724,6 +5704,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidMemoryIndexedSImm9:
case Match_InvalidMemoryIndexed16SImm9:
case Match_InvalidMemoryIndexed8SImm10:
+ case Match_InvalidImm0_0:
case Match_InvalidImm0_1:
case Match_InvalidImm0_3:
case Match_InvalidImm0_7:
@@ -5745,6 +5726,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidSVECpyImm16:
case Match_InvalidSVECpyImm32:
case Match_InvalidSVECpyImm64:
+ case Match_InvalidIndexRange0_0:
case Match_InvalidIndexRange1_1:
case Match_InvalidIndexRange0_15:
case Match_InvalidIndexRange0_7:
@@ -5811,10 +5793,6 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidSVEPredicateSReg:
case Match_InvalidSVEPredicateDReg:
case Match_InvalidSVEPredicate3bAnyReg:
- case Match_InvalidSVEPredicate3bBReg:
- case Match_InvalidSVEPredicate3bHReg:
- case Match_InvalidSVEPredicate3bSReg:
- case Match_InvalidSVEPredicate3bDReg:
case Match_InvalidSVEExactFPImmOperandHalfOne:
case Match_InvalidSVEExactFPImmOperandHalfTwo:
case Match_InvalidSVEExactFPImmOperandZeroOne:
@@ -5958,6 +5936,9 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind,
case AArch64::ArchKind::ARMV8_5A:
case AArch64::ArchKind::ARMV8_6A:
case AArch64::ArchKind::ARMV8_7A:
+ case AArch64::ArchKind::ARMV9A:
+ case AArch64::ArchKind::ARMV9_1A:
+ case AArch64::ArchKind::ARMV9_2A:
case AArch64::ArchKind::ARMV8R:
RequestedExtensions.push_back("sm4");
RequestedExtensions.push_back("sha3");
@@ -5980,6 +5961,9 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind,
case AArch64::ArchKind::ARMV8_5A:
case AArch64::ArchKind::ARMV8_6A:
case AArch64::ArchKind::ARMV8_7A:
+ case AArch64::ArchKind::ARMV9A:
+ case AArch64::ArchKind::ARMV9_1A:
+ case AArch64::ArchKind::ARMV9_2A:
RequestedExtensions.push_back("nosm4");
RequestedExtensions.push_back("nosha3");
RequestedExtensions.push_back("nosha2");
@@ -6206,12 +6190,12 @@ bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
/// The number of arguments depends on the loh identifier.
bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
MCLOHType Kind;
- if (getParser().getTok().isNot(AsmToken::Identifier)) {
- if (getParser().getTok().isNot(AsmToken::Integer))
+ if (getTok().isNot(AsmToken::Identifier)) {
+ if (getTok().isNot(AsmToken::Integer))
return TokError("expected an identifier or a number in directive");
// We successfully get a numeric value for the identifier.
// Check if it is valid.
- int64_t Id = getParser().getTok().getIntVal();
+ int64_t Id = getTok().getIntVal();
if (Id <= -1U && !isValidMCLOHType(Id))
return TokError("invalid numeric identifier in directive");
Kind = (MCLOHType)Id;
@@ -6265,8 +6249,7 @@ bool AArch64AsmParser::parseDirectiveLtorg(SMLoc L) {
/// parseDirectiveReq
/// ::= name .req registername
bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
- MCAsmParser &Parser = getParser();
- Parser.Lex(); // Eat the '.req' token.
+ Lex(); // Eat the '.req' token.
SMLoc SRegLoc = getLoc();
RegKind RegisterKind = RegKind::Scalar;
unsigned RegNum;
@@ -6329,11 +6312,10 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
/// parseDirectiveUneq
/// ::= .unreq registername
bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) {
- MCAsmParser &Parser = getParser();
if (getTok().isNot(AsmToken::Identifier))
return TokError("unexpected input in .unreq directive.");
- RegisterReqs.erase(Parser.getTok().getIdentifier().lower());
- Parser.Lex(); // Eat the identifier.
+ RegisterReqs.erase(getTok().getIdentifier().lower());
+ Lex(); // Eat the identifier.
return parseToken(AsmToken::EndOfStatement);
}
@@ -6357,9 +6339,7 @@ bool AArch64AsmParser::parseDirectiveCFIBKeyFrame() {
/// parseDirectiveVariantPCS
/// ::= .variant_pcs symbolname
bool AArch64AsmParser::parseDirectiveVariantPCS(SMLoc L) {
- MCAsmParser &Parser = getParser();
-
- const AsmToken &Tok = Parser.getTok();
+ const AsmToken &Tok = getTok();
if (Tok.isNot(AsmToken::Identifier))
return TokError("expected symbol name");
@@ -6369,7 +6349,7 @@ bool AArch64AsmParser::parseDirectiveVariantPCS(SMLoc L) {
if (!Sym)
return TokError("unknown symbol");
- Parser.Lex(); // Eat the symbol
+ Lex(); // Eat the symbol
if (parseEOL())
return true;
@@ -6741,7 +6721,7 @@ AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) {
SMLoc S = getLoc();
- if (getParser().getTok().isNot(AsmToken::Identifier)) {
+ if (getTok().isNot(AsmToken::Identifier)) {
Error(S, "expected register");
return MatchOperand_ParseFail;
}
@@ -6773,12 +6753,12 @@ AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- if (getParser().getTok().isNot(AsmToken::Comma)) {
+ if (getTok().isNot(AsmToken::Comma)) {
Error(getLoc(), "expected comma");
return MatchOperand_ParseFail;
}
// Eat the comma
- getParser().Lex();
+ Lex();
SMLoc E = getLoc();
unsigned SecondReg;
@@ -6833,7 +6813,7 @@ AArch64AsmParser::tryParseSVEDataVector(OperandVector &Operands) {
unsigned ElementWidth = KindRes->second;
// No shift/extend is the default.
- if (!ParseShiftExtend || getParser().getTok().isNot(AsmToken::Comma)) {
+ if (!ParseShiftExtend || getTok().isNot(AsmToken::Comma)) {
Operands.push_back(AArch64Operand::CreateVectorReg(
RegNum, RegKind::SVEDataVector, ElementWidth, S, S, getContext()));
@@ -6844,7 +6824,7 @@ AArch64AsmParser::tryParseSVEDataVector(OperandVector &Operands) {
}
// Eat the comma
- getParser().Lex();
+ Lex();
// Match the shift
SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> ExtOpnd;
@@ -6866,7 +6846,7 @@ AArch64AsmParser::tryParseSVEPattern(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
SMLoc SS = getLoc();
- const AsmToken &TokE = Parser.getTok();
+ const AsmToken &TokE = getTok();
bool IsHash = TokE.is(AsmToken::Hash);
if (!IsHash && TokE.isNot(AsmToken::Identifier))
@@ -6874,7 +6854,7 @@ AArch64AsmParser::tryParseSVEPattern(OperandVector &Operands) {
int64_t Pattern;
if (IsHash) {
- Parser.Lex(); // Eat hash
+ Lex(); // Eat hash
// Parse the immediate operand.
const MCExpr *ImmVal;
@@ -6893,7 +6873,7 @@ AArch64AsmParser::tryParseSVEPattern(OperandVector &Operands) {
if (!Pat)
return MatchOperand_NoMatch;
- Parser.Lex();
+ Lex();
Pattern = Pat->Encoding;
assert(Pattern >= 0 && Pattern < 32);
}
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 1ed8a80a4600..96d410e42be2 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -21,10 +21,10 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include <algorithm>
#include <memory>
@@ -225,13 +225,12 @@ static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Addr,
const void *Decoder);
-static DecodeStatus DecodeSVELogicalImmInstruction(llvm::MCInst &Inst,
- uint32_t insn,
+static DecodeStatus DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn,
uint64_t Address,
const void *Decoder);
-template<int Bits>
-static DecodeStatus DecodeSImm(llvm::MCInst &Inst, uint64_t Imm,
- uint64_t Address, const void *Decoder);
+template <int Bits>
+static DecodeStatus DecodeSImm(MCInst &Inst, uint64_t Imm, uint64_t Address,
+ const void *Decoder);
template <int ElementWidth>
static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm,
uint64_t Addr, const void *Decoder);
@@ -324,6 +323,33 @@ DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
// ^ insert implicit 8-bit element tile
MI.insert(MI.begin()+2, MCOperand::createReg(AArch64::ZAB0));
break;
+ case AArch64::LD1_MXIPXX_H_Q:
+ case AArch64::LD1_MXIPXX_V_Q:
+ case AArch64::ST1_MXIPXX_H_Q:
+ case AArch64::ST1_MXIPXX_V_Q:
+    // 128-bit loads/stores have an implicit zero vector index.
+ MI.insert(MI.begin()+2, MCOperand::createImm(0));
+ break;
+  // 128-bit mova instructions have an implicit zero vector index.
+ case AArch64::INSERT_MXIPZ_H_Q:
+ case AArch64::INSERT_MXIPZ_V_Q:
+ MI.insert(MI.begin()+2, MCOperand::createImm(0));
+ break;
+ case AArch64::EXTRACT_ZPMXI_H_Q:
+ case AArch64::EXTRACT_ZPMXI_V_Q:
+ MI.addOperand(MCOperand::createImm(0));
+ break;
+ case AArch64::SMOVvi8to32_idx0:
+ case AArch64::SMOVvi8to64_idx0:
+ case AArch64::SMOVvi16to32_idx0:
+ case AArch64::SMOVvi16to64_idx0:
+ case AArch64::SMOVvi32to64_idx0:
+ case AArch64::UMOVvi8_idx0:
+ case AArch64::UMOVvi16_idx0:
+ case AArch64::UMOVvi32_idx0:
+ case AArch64::UMOVvi64_idx0:
+ MI.addOperand(MCOperand::createImm(0));
+ break;
}
if (Result != MCDisassembler::Fail)
@@ -366,23 +392,14 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Disassembler() {
createAArch64ExternalSymbolizer);
}
-static const unsigned FPR128DecoderTable[] = {
- AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4,
- AArch64::Q5, AArch64::Q6, AArch64::Q7, AArch64::Q8, AArch64::Q9,
- AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14,
- AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19,
- AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24,
- AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29,
- AArch64::Q30, AArch64::Q31
-};
-
static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
const void *Decoder) {
if (RegNo > 31)
return Fail;
- unsigned Register = FPR128DecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::FPR128RegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
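
The disassembler hunks in this file drop the hand-written per-class register tables and instead look the register up through AArch64MCRegisterClasses[...].getRegister(RegNo); each decoder keeps its bounds check and only the table source changes. A generic sketch of that pattern, with a hypothetical RegClass type standing in for MCRegisterClass:

    #include <optional>

    // Hypothetical stand-in for an MC register class: an ordered list of
    // physical-register numbers addressable by encoding index.
    struct RegClass {
      const unsigned *Regs;
      unsigned NumRegs;
      unsigned getRegister(unsigned Idx) const { return Regs[Idx]; }
    };

    // Mirrors the DecodeFPR128RegisterClass shape above: reject out-of-range
    // encodings, otherwise map the encoding through the class.
    std::optional<unsigned> decodeRegOperand(const RegClass &RC, unsigned RegNo,
                                             unsigned MaxEncoding) {
      if (RegNo > MaxEncoding || RegNo >= RC.NumRegs)
        return std::nullopt; // corresponds to returning Fail
      return RC.getRegister(RegNo);
    }
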
@@ -395,107 +412,63 @@ static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo,
return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder);
}
-static const unsigned FPR64DecoderTable[] = {
- AArch64::D0, AArch64::D1, AArch64::D2, AArch64::D3, AArch64::D4,
- AArch64::D5, AArch64::D6, AArch64::D7, AArch64::D8, AArch64::D9,
- AArch64::D10, AArch64::D11, AArch64::D12, AArch64::D13, AArch64::D14,
- AArch64::D15, AArch64::D16, AArch64::D17, AArch64::D18, AArch64::D19,
- AArch64::D20, AArch64::D21, AArch64::D22, AArch64::D23, AArch64::D24,
- AArch64::D25, AArch64::D26, AArch64::D27, AArch64::D28, AArch64::D29,
- AArch64::D30, AArch64::D31
-};
-
static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
const void *Decoder) {
if (RegNo > 31)
return Fail;
- unsigned Register = FPR64DecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::FPR64RegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
-static const unsigned FPR32DecoderTable[] = {
- AArch64::S0, AArch64::S1, AArch64::S2, AArch64::S3, AArch64::S4,
- AArch64::S5, AArch64::S6, AArch64::S7, AArch64::S8, AArch64::S9,
- AArch64::S10, AArch64::S11, AArch64::S12, AArch64::S13, AArch64::S14,
- AArch64::S15, AArch64::S16, AArch64::S17, AArch64::S18, AArch64::S19,
- AArch64::S20, AArch64::S21, AArch64::S22, AArch64::S23, AArch64::S24,
- AArch64::S25, AArch64::S26, AArch64::S27, AArch64::S28, AArch64::S29,
- AArch64::S30, AArch64::S31
-};
-
static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
const void *Decoder) {
if (RegNo > 31)
return Fail;
- unsigned Register = FPR32DecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::FPR32RegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
-static const unsigned FPR16DecoderTable[] = {
- AArch64::H0, AArch64::H1, AArch64::H2, AArch64::H3, AArch64::H4,
- AArch64::H5, AArch64::H6, AArch64::H7, AArch64::H8, AArch64::H9,
- AArch64::H10, AArch64::H11, AArch64::H12, AArch64::H13, AArch64::H14,
- AArch64::H15, AArch64::H16, AArch64::H17, AArch64::H18, AArch64::H19,
- AArch64::H20, AArch64::H21, AArch64::H22, AArch64::H23, AArch64::H24,
- AArch64::H25, AArch64::H26, AArch64::H27, AArch64::H28, AArch64::H29,
- AArch64::H30, AArch64::H31
-};
-
static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
const void *Decoder) {
if (RegNo > 31)
return Fail;
- unsigned Register = FPR16DecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::FPR16RegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
-static const unsigned FPR8DecoderTable[] = {
- AArch64::B0, AArch64::B1, AArch64::B2, AArch64::B3, AArch64::B4,
- AArch64::B5, AArch64::B6, AArch64::B7, AArch64::B8, AArch64::B9,
- AArch64::B10, AArch64::B11, AArch64::B12, AArch64::B13, AArch64::B14,
- AArch64::B15, AArch64::B16, AArch64::B17, AArch64::B18, AArch64::B19,
- AArch64::B20, AArch64::B21, AArch64::B22, AArch64::B23, AArch64::B24,
- AArch64::B25, AArch64::B26, AArch64::B27, AArch64::B28, AArch64::B29,
- AArch64::B30, AArch64::B31
-};
-
static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
const void *Decoder) {
if (RegNo > 31)
return Fail;
- unsigned Register = FPR8DecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::FPR8RegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
-static const unsigned GPR64DecoderTable[] = {
- AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4,
- AArch64::X5, AArch64::X6, AArch64::X7, AArch64::X8, AArch64::X9,
- AArch64::X10, AArch64::X11, AArch64::X12, AArch64::X13, AArch64::X14,
- AArch64::X15, AArch64::X16, AArch64::X17, AArch64::X18, AArch64::X19,
- AArch64::X20, AArch64::X21, AArch64::X22, AArch64::X23, AArch64::X24,
- AArch64::X25, AArch64::X26, AArch64::X27, AArch64::X28, AArch64::FP,
- AArch64::LR, AArch64::XZR
-};
-
static DecodeStatus DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
const void *Decoder) {
if (RegNo > 30)
return Fail;
- unsigned Register = GPR64DecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::GPR64commonRegClassID].getRegister(
+ RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
@@ -506,26 +479,12 @@ static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
if (RegNo > 31)
return Fail;
- unsigned Register = GPR64DecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::GPR64RegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
-static const unsigned GPR64x8DecoderTable[] = {
- AArch64::X0_X1_X2_X3_X4_X5_X6_X7,
- AArch64::X2_X3_X4_X5_X6_X7_X8_X9,
- AArch64::X4_X5_X6_X7_X8_X9_X10_X11,
- AArch64::X6_X7_X8_X9_X10_X11_X12_X13,
- AArch64::X8_X9_X10_X11_X12_X13_X14_X15,
- AArch64::X10_X11_X12_X13_X14_X15_X16_X17,
- AArch64::X12_X13_X14_X15_X16_X17_X18_X19,
- AArch64::X14_X15_X16_X17_X18_X19_X20_X21,
- AArch64::X16_X17_X18_X19_X20_X21_X22_X23,
- AArch64::X18_X19_X20_X21_X22_X23_X24_X25,
- AArch64::X20_X21_X22_X23_X24_X25_X26_X27,
- AArch64::X22_X23_X24_X25_X26_X27_X28_FP,
-};
-
static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Address,
@@ -535,7 +494,9 @@ static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst,
if (RegNo & 1)
return Fail;
- unsigned Register = GPR64x8DecoderTable[RegNo >> 1];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::GPR64x8ClassRegClassID].getRegister(
+ RegNo >> 1);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
@@ -545,17 +506,12 @@ static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo,
const void *Decoder) {
if (RegNo > 31)
return Fail;
- unsigned Register = GPR64DecoderTable[RegNo];
- if (Register == AArch64::XZR)
- Register = AArch64::SP;
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::GPR64spRegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
-static const unsigned MatrixIndexGPR32_12_15DecoderTable[] = {
- AArch64::W12, AArch64::W13, AArch64::W14, AArch64::W15
-};
-
static DecodeStatus DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Addr,
@@ -563,28 +519,21 @@ static DecodeStatus DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst,
if (RegNo > 3)
return Fail;
- unsigned Register = MatrixIndexGPR32_12_15DecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::MatrixIndexGPR32_12_15RegClassID]
+ .getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
-static const unsigned GPR32DecoderTable[] = {
- AArch64::W0, AArch64::W1, AArch64::W2, AArch64::W3, AArch64::W4,
- AArch64::W5, AArch64::W6, AArch64::W7, AArch64::W8, AArch64::W9,
- AArch64::W10, AArch64::W11, AArch64::W12, AArch64::W13, AArch64::W14,
- AArch64::W15, AArch64::W16, AArch64::W17, AArch64::W18, AArch64::W19,
- AArch64::W20, AArch64::W21, AArch64::W22, AArch64::W23, AArch64::W24,
- AArch64::W25, AArch64::W26, AArch64::W27, AArch64::W28, AArch64::W29,
- AArch64::W30, AArch64::WZR
-};
-
static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
const void *Decoder) {
if (RegNo > 31)
return Fail;
- unsigned Register = GPR32DecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::GPR32RegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
@@ -595,22 +544,11 @@ static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo,
if (RegNo > 31)
return Fail;
- unsigned Register = GPR32DecoderTable[RegNo];
- if (Register == AArch64::WZR)
- Register = AArch64::WSP;
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::GPR32spRegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
-static const unsigned ZPRDecoderTable[] = {
- AArch64::Z0, AArch64::Z1, AArch64::Z2, AArch64::Z3,
- AArch64::Z4, AArch64::Z5, AArch64::Z6, AArch64::Z7,
- AArch64::Z8, AArch64::Z9, AArch64::Z10, AArch64::Z11,
- AArch64::Z12, AArch64::Z13, AArch64::Z14, AArch64::Z15,
- AArch64::Z16, AArch64::Z17, AArch64::Z18, AArch64::Z19,
- AArch64::Z20, AArch64::Z21, AArch64::Z22, AArch64::Z23,
- AArch64::Z24, AArch64::Z25, AArch64::Z26, AArch64::Z27,
- AArch64::Z28, AArch64::Z29, AArch64::Z30, AArch64::Z31
-};
static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
@@ -618,7 +556,8 @@ static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo,
if (RegNo > 31)
return Fail;
- unsigned Register = ZPRDecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::ZPRRegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
@@ -639,71 +578,35 @@ static DecodeStatus DecodeZPR_3bRegisterClass(MCInst &Inst, unsigned RegNo,
return DecodeZPRRegisterClass(Inst, RegNo, Address, Decoder);
}
-static const unsigned ZZDecoderTable[] = {
- AArch64::Z0_Z1, AArch64::Z1_Z2, AArch64::Z2_Z3, AArch64::Z3_Z4,
- AArch64::Z4_Z5, AArch64::Z5_Z6, AArch64::Z6_Z7, AArch64::Z7_Z8,
- AArch64::Z8_Z9, AArch64::Z9_Z10, AArch64::Z10_Z11, AArch64::Z11_Z12,
- AArch64::Z12_Z13, AArch64::Z13_Z14, AArch64::Z14_Z15, AArch64::Z15_Z16,
- AArch64::Z16_Z17, AArch64::Z17_Z18, AArch64::Z18_Z19, AArch64::Z19_Z20,
- AArch64::Z20_Z21, AArch64::Z21_Z22, AArch64::Z22_Z23, AArch64::Z23_Z24,
- AArch64::Z24_Z25, AArch64::Z25_Z26, AArch64::Z26_Z27, AArch64::Z27_Z28,
- AArch64::Z28_Z29, AArch64::Z29_Z30, AArch64::Z30_Z31, AArch64::Z31_Z0
-};
-
static DecodeStatus DecodeZPR2RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void* Decoder) {
if (RegNo > 31)
return Fail;
- unsigned Register = ZZDecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::ZPR2RegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
-static const unsigned ZZZDecoderTable[] = {
- AArch64::Z0_Z1_Z2, AArch64::Z1_Z2_Z3, AArch64::Z2_Z3_Z4,
- AArch64::Z3_Z4_Z5, AArch64::Z4_Z5_Z6, AArch64::Z5_Z6_Z7,
- AArch64::Z6_Z7_Z8, AArch64::Z7_Z8_Z9, AArch64::Z8_Z9_Z10,
- AArch64::Z9_Z10_Z11, AArch64::Z10_Z11_Z12, AArch64::Z11_Z12_Z13,
- AArch64::Z12_Z13_Z14, AArch64::Z13_Z14_Z15, AArch64::Z14_Z15_Z16,
- AArch64::Z15_Z16_Z17, AArch64::Z16_Z17_Z18, AArch64::Z17_Z18_Z19,
- AArch64::Z18_Z19_Z20, AArch64::Z19_Z20_Z21, AArch64::Z20_Z21_Z22,
- AArch64::Z21_Z22_Z23, AArch64::Z22_Z23_Z24, AArch64::Z23_Z24_Z25,
- AArch64::Z24_Z25_Z26, AArch64::Z25_Z26_Z27, AArch64::Z26_Z27_Z28,
- AArch64::Z27_Z28_Z29, AArch64::Z28_Z29_Z30, AArch64::Z29_Z30_Z31,
- AArch64::Z30_Z31_Z0, AArch64::Z31_Z0_Z1
-};
-
static DecodeStatus DecodeZPR3RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void* Decoder) {
if (RegNo > 31)
return Fail;
- unsigned Register = ZZZDecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::ZPR3RegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
-static const unsigned ZZZZDecoderTable[] = {
- AArch64::Z0_Z1_Z2_Z3, AArch64::Z1_Z2_Z3_Z4, AArch64::Z2_Z3_Z4_Z5,
- AArch64::Z3_Z4_Z5_Z6, AArch64::Z4_Z5_Z6_Z7, AArch64::Z5_Z6_Z7_Z8,
- AArch64::Z6_Z7_Z8_Z9, AArch64::Z7_Z8_Z9_Z10, AArch64::Z8_Z9_Z10_Z11,
- AArch64::Z9_Z10_Z11_Z12, AArch64::Z10_Z11_Z12_Z13, AArch64::Z11_Z12_Z13_Z14,
- AArch64::Z12_Z13_Z14_Z15, AArch64::Z13_Z14_Z15_Z16, AArch64::Z14_Z15_Z16_Z17,
- AArch64::Z15_Z16_Z17_Z18, AArch64::Z16_Z17_Z18_Z19, AArch64::Z17_Z18_Z19_Z20,
- AArch64::Z18_Z19_Z20_Z21, AArch64::Z19_Z20_Z21_Z22, AArch64::Z20_Z21_Z22_Z23,
- AArch64::Z21_Z22_Z23_Z24, AArch64::Z22_Z23_Z24_Z25, AArch64::Z23_Z24_Z25_Z26,
- AArch64::Z24_Z25_Z26_Z27, AArch64::Z25_Z26_Z27_Z28, AArch64::Z26_Z27_Z28_Z29,
- AArch64::Z27_Z28_Z29_Z30, AArch64::Z28_Z29_Z30_Z31, AArch64::Z29_Z30_Z31_Z0,
- AArch64::Z30_Z31_Z0_Z1, AArch64::Z31_Z0_Z1_Z2
-};
-
static DecodeStatus DecodeZPR4RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void* Decoder) {
if (RegNo > 31)
return Fail;
- unsigned Register = ZZZZDecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::ZPR4RegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
@@ -741,19 +644,13 @@ static DecodeStatus DecodeMatrixTile(MCInst &Inst, unsigned RegNo,
return Success;
}
-static const unsigned PPRDecoderTable[] = {
- AArch64::P0, AArch64::P1, AArch64::P2, AArch64::P3,
- AArch64::P4, AArch64::P5, AArch64::P6, AArch64::P7,
- AArch64::P8, AArch64::P9, AArch64::P10, AArch64::P11,
- AArch64::P12, AArch64::P13, AArch64::P14, AArch64::P15
-};
-
static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr, const void *Decoder) {
if (RegNo > 15)
return Fail;
- unsigned Register = PPRDecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::PPRRegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
@@ -768,157 +665,64 @@ static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo,
return DecodePPRRegisterClass(Inst, RegNo, Addr, Decoder);
}
-static const unsigned VectorDecoderTable[] = {
- AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4,
- AArch64::Q5, AArch64::Q6, AArch64::Q7, AArch64::Q8, AArch64::Q9,
- AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14,
- AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19,
- AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24,
- AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29,
- AArch64::Q30, AArch64::Q31
-};
-
-static DecodeStatus DecodeVectorRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Addr,
- const void *Decoder) {
- if (RegNo > 31)
- return Fail;
-
- unsigned Register = VectorDecoderTable[RegNo];
- Inst.addOperand(MCOperand::createReg(Register));
- return Success;
-}
-
-static const unsigned QQDecoderTable[] = {
- AArch64::Q0_Q1, AArch64::Q1_Q2, AArch64::Q2_Q3, AArch64::Q3_Q4,
- AArch64::Q4_Q5, AArch64::Q5_Q6, AArch64::Q6_Q7, AArch64::Q7_Q8,
- AArch64::Q8_Q9, AArch64::Q9_Q10, AArch64::Q10_Q11, AArch64::Q11_Q12,
- AArch64::Q12_Q13, AArch64::Q13_Q14, AArch64::Q14_Q15, AArch64::Q15_Q16,
- AArch64::Q16_Q17, AArch64::Q17_Q18, AArch64::Q18_Q19, AArch64::Q19_Q20,
- AArch64::Q20_Q21, AArch64::Q21_Q22, AArch64::Q22_Q23, AArch64::Q23_Q24,
- AArch64::Q24_Q25, AArch64::Q25_Q26, AArch64::Q26_Q27, AArch64::Q27_Q28,
- AArch64::Q28_Q29, AArch64::Q29_Q30, AArch64::Q30_Q31, AArch64::Q31_Q0
-};
-
static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr, const void *Decoder) {
if (RegNo > 31)
return Fail;
- unsigned Register = QQDecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::QQRegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
-static const unsigned QQQDecoderTable[] = {
- AArch64::Q0_Q1_Q2, AArch64::Q1_Q2_Q3, AArch64::Q2_Q3_Q4,
- AArch64::Q3_Q4_Q5, AArch64::Q4_Q5_Q6, AArch64::Q5_Q6_Q7,
- AArch64::Q6_Q7_Q8, AArch64::Q7_Q8_Q9, AArch64::Q8_Q9_Q10,
- AArch64::Q9_Q10_Q11, AArch64::Q10_Q11_Q12, AArch64::Q11_Q12_Q13,
- AArch64::Q12_Q13_Q14, AArch64::Q13_Q14_Q15, AArch64::Q14_Q15_Q16,
- AArch64::Q15_Q16_Q17, AArch64::Q16_Q17_Q18, AArch64::Q17_Q18_Q19,
- AArch64::Q18_Q19_Q20, AArch64::Q19_Q20_Q21, AArch64::Q20_Q21_Q22,
- AArch64::Q21_Q22_Q23, AArch64::Q22_Q23_Q24, AArch64::Q23_Q24_Q25,
- AArch64::Q24_Q25_Q26, AArch64::Q25_Q26_Q27, AArch64::Q26_Q27_Q28,
- AArch64::Q27_Q28_Q29, AArch64::Q28_Q29_Q30, AArch64::Q29_Q30_Q31,
- AArch64::Q30_Q31_Q0, AArch64::Q31_Q0_Q1
-};
-
static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr, const void *Decoder) {
if (RegNo > 31)
return Fail;
- unsigned Register = QQQDecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::QQQRegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
-static const unsigned QQQQDecoderTable[] = {
- AArch64::Q0_Q1_Q2_Q3, AArch64::Q1_Q2_Q3_Q4, AArch64::Q2_Q3_Q4_Q5,
- AArch64::Q3_Q4_Q5_Q6, AArch64::Q4_Q5_Q6_Q7, AArch64::Q5_Q6_Q7_Q8,
- AArch64::Q6_Q7_Q8_Q9, AArch64::Q7_Q8_Q9_Q10, AArch64::Q8_Q9_Q10_Q11,
- AArch64::Q9_Q10_Q11_Q12, AArch64::Q10_Q11_Q12_Q13, AArch64::Q11_Q12_Q13_Q14,
- AArch64::Q12_Q13_Q14_Q15, AArch64::Q13_Q14_Q15_Q16, AArch64::Q14_Q15_Q16_Q17,
- AArch64::Q15_Q16_Q17_Q18, AArch64::Q16_Q17_Q18_Q19, AArch64::Q17_Q18_Q19_Q20,
- AArch64::Q18_Q19_Q20_Q21, AArch64::Q19_Q20_Q21_Q22, AArch64::Q20_Q21_Q22_Q23,
- AArch64::Q21_Q22_Q23_Q24, AArch64::Q22_Q23_Q24_Q25, AArch64::Q23_Q24_Q25_Q26,
- AArch64::Q24_Q25_Q26_Q27, AArch64::Q25_Q26_Q27_Q28, AArch64::Q26_Q27_Q28_Q29,
- AArch64::Q27_Q28_Q29_Q30, AArch64::Q28_Q29_Q30_Q31, AArch64::Q29_Q30_Q31_Q0,
- AArch64::Q30_Q31_Q0_Q1, AArch64::Q31_Q0_Q1_Q2
-};
-
static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
const void *Decoder) {
if (RegNo > 31)
return Fail;
- unsigned Register = QQQQDecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::QQQQRegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
-static const unsigned DDDecoderTable[] = {
- AArch64::D0_D1, AArch64::D1_D2, AArch64::D2_D3, AArch64::D3_D4,
- AArch64::D4_D5, AArch64::D5_D6, AArch64::D6_D7, AArch64::D7_D8,
- AArch64::D8_D9, AArch64::D9_D10, AArch64::D10_D11, AArch64::D11_D12,
- AArch64::D12_D13, AArch64::D13_D14, AArch64::D14_D15, AArch64::D15_D16,
- AArch64::D16_D17, AArch64::D17_D18, AArch64::D18_D19, AArch64::D19_D20,
- AArch64::D20_D21, AArch64::D21_D22, AArch64::D22_D23, AArch64::D23_D24,
- AArch64::D24_D25, AArch64::D25_D26, AArch64::D26_D27, AArch64::D27_D28,
- AArch64::D28_D29, AArch64::D29_D30, AArch64::D30_D31, AArch64::D31_D0
-};
-
static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr, const void *Decoder) {
if (RegNo > 31)
return Fail;
- unsigned Register = DDDecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::DDRegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
-static const unsigned DDDDecoderTable[] = {
- AArch64::D0_D1_D2, AArch64::D1_D2_D3, AArch64::D2_D3_D4,
- AArch64::D3_D4_D5, AArch64::D4_D5_D6, AArch64::D5_D6_D7,
- AArch64::D6_D7_D8, AArch64::D7_D8_D9, AArch64::D8_D9_D10,
- AArch64::D9_D10_D11, AArch64::D10_D11_D12, AArch64::D11_D12_D13,
- AArch64::D12_D13_D14, AArch64::D13_D14_D15, AArch64::D14_D15_D16,
- AArch64::D15_D16_D17, AArch64::D16_D17_D18, AArch64::D17_D18_D19,
- AArch64::D18_D19_D20, AArch64::D19_D20_D21, AArch64::D20_D21_D22,
- AArch64::D21_D22_D23, AArch64::D22_D23_D24, AArch64::D23_D24_D25,
- AArch64::D24_D25_D26, AArch64::D25_D26_D27, AArch64::D26_D27_D28,
- AArch64::D27_D28_D29, AArch64::D28_D29_D30, AArch64::D29_D30_D31,
- AArch64::D30_D31_D0, AArch64::D31_D0_D1
-};
-
static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr, const void *Decoder) {
if (RegNo > 31)
return Fail;
- unsigned Register = DDDDecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::DDDRegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
-static const unsigned DDDDDecoderTable[] = {
- AArch64::D0_D1_D2_D3, AArch64::D1_D2_D3_D4, AArch64::D2_D3_D4_D5,
- AArch64::D3_D4_D5_D6, AArch64::D4_D5_D6_D7, AArch64::D5_D6_D7_D8,
- AArch64::D6_D7_D8_D9, AArch64::D7_D8_D9_D10, AArch64::D8_D9_D10_D11,
- AArch64::D9_D10_D11_D12, AArch64::D10_D11_D12_D13, AArch64::D11_D12_D13_D14,
- AArch64::D12_D13_D14_D15, AArch64::D13_D14_D15_D16, AArch64::D14_D15_D16_D17,
- AArch64::D15_D16_D17_D18, AArch64::D16_D17_D18_D19, AArch64::D17_D18_D19_D20,
- AArch64::D18_D19_D20_D21, AArch64::D19_D20_D21_D22, AArch64::D20_D21_D22_D23,
- AArch64::D21_D22_D23_D24, AArch64::D22_D23_D24_D25, AArch64::D23_D24_D25_D26,
- AArch64::D24_D25_D26_D27, AArch64::D25_D26_D27_D28, AArch64::D26_D27_D28_D29,
- AArch64::D27_D28_D29_D30, AArch64::D28_D29_D30_D31, AArch64::D29_D30_D31_D0,
- AArch64::D30_D31_D0_D1, AArch64::D31_D0_D1_D2
-};
-
static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Addr,
const void *Decoder) {
if (RegNo > 31)
return Fail;
- unsigned Register = DDDDDecoderTable[RegNo];
+ unsigned Register =
+ AArch64MCRegisterClasses[AArch64::DDDDRegClassID].getRegister(RegNo);
Inst.addOperand(MCOperand::createReg(Register));
return Success;
}
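
The hunks above all make essentially the same mechanical change: each hand-maintained decoder table is deleted, and the decoder instead asks the generated register-class description (AArch64MCRegisterClasses) for the register at a given encoding index. A minimal standalone sketch of the idea, using hypothetical types rather than the real LLVM MC API:

    #include <cassert>
    #include <vector>

    // Hypothetical stand-in for an MC register class description; in LLVM the
    // equivalent data is generated by TableGen and reached through
    // AArch64MCRegisterClasses[<class ID>].
    struct MockRegClass {
      std::vector<unsigned> Regs; // registers in encoding order
      unsigned getRegister(unsigned Idx) const {
        assert(Idx < Regs.size() && "encoding index out of range");
        return Regs[Idx];
      }
    };

    // Sketch of a decoder that no longer needs its own table: bounds-check the
    // encoded register number, then defer the mapping to the class description.
    bool decodeGPR32(const MockRegClass &GPR32, unsigned RegNo, unsigned &Reg) {
      if (RegNo > 31)
        return false; // Fail
      Reg = GPR32.getRegister(RegNo);
      return true;    // Success
    }
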
@@ -1776,7 +1580,7 @@ static DecodeStatus DecodeModImmInstruction(MCInst &Inst, uint32_t insn,
if (Inst.getOpcode() == AArch64::MOVID)
DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder);
else
- DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeFPR128RegisterClass(Inst, Rd, Addr, Decoder);
Inst.addOperand(MCOperand::createImm(imm));
@@ -1813,8 +1617,8 @@ static DecodeStatus DecodeModImmTiedInstruction(MCInst &Inst, uint32_t insn,
imm |= fieldFromInstruction(insn, 5, 5);
// Tied operands added twice.
- DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
- DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeFPR128RegisterClass(Inst, Rd, Addr, Decoder);
+ DecodeFPR128RegisterClass(Inst, Rd, Addr, Decoder);
Inst.addOperand(MCOperand::createImm(imm));
Inst.addOperand(MCOperand::createImm((cmode & 6) << 2));
@@ -1980,8 +1784,7 @@ static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst,
RegNo, Addr, Decoder);
}
-static DecodeStatus DecodeSVELogicalImmInstruction(llvm::MCInst &Inst,
- uint32_t insn,
+static DecodeStatus DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn,
uint64_t Addr,
const void *Decoder) {
unsigned Zdn = fieldFromInstruction(insn, 0, 5);
@@ -1997,9 +1800,9 @@ static DecodeStatus DecodeSVELogicalImmInstruction(llvm::MCInst &Inst,
return Success;
}
-template<int Bits>
-static DecodeStatus DecodeSImm(llvm::MCInst &Inst, uint64_t Imm,
- uint64_t Address, const void *Decoder) {
+template <int Bits>
+static DecodeStatus DecodeSImm(MCInst &Inst, uint64_t Imm, uint64_t Address,
+ const void *Decoder) {
if (Imm & ~((1LL << Bits) - 1))
return Fail;
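
The mask test visible above rejects any encoding with bits set outside the Bits-wide field; the rest of DecodeSImm, which this hunk truncates, presumably sign-extends the field before adding it as an operand. A hedged, standalone sketch of that common decode pattern:

    #include <cstdint>

    // Hypothetical helper showing the usual decode of a Bits-wide signed field:
    // reject out-of-range encodings (the Fail path above), then sign-extend.
    template <int Bits>
    bool decodeSImmField(uint64_t Imm, int64_t &Out) {
      if (Imm & ~((1LL << Bits) - 1))
        return false;
      if (Imm & (1ULL << (Bits - 1)))  // top bit of the field set?
        Imm |= ~((1ULL << Bits) - 1);  // fill the upper bits with ones
      Out = static_cast<int64_t>(Imm);
      return true;
    }
    // e.g. decodeSImmField<4>(0xF, Out) leaves Out == -1.
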
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
index 3f815ac8c3d0..5b6f06f8dbb4 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
@@ -34,7 +34,9 @@ getVariant(uint64_t LLVMDisassembler_VariantKind) {
case LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF:
return MCSymbolRefExpr::VK_GOTPAGEOFF;
case LLVMDisassembler_VariantKind_ARM64_TLVP:
+ return MCSymbolRefExpr::VK_TLVPPAGE;
case LLVMDisassembler_VariantKind_ARM64_TLVOFF:
+ return MCSymbolRefExpr::VK_TLVPPAGEOFF;
default:
llvm_unreachable("bad LLVMDisassembler_VariantKind");
}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 28b234b180fc..ac08ee8ae8dd 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -156,7 +156,7 @@ struct IncomingArgHandler : public CallLowering::IncomingValueHandler {
}
void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign &VA) override {
+ CCValAssign VA) override {
markPhysRegUsed(PhysReg);
IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
}
@@ -181,7 +181,18 @@ struct IncomingArgHandler : public CallLowering::IncomingValueHandler {
auto MMO = MF.getMachineMemOperand(
MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, LocTy,
inferAlignFromPtrInfo(MF, MPO));
- MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
+
+ switch (VA.getLocInfo()) {
+ case CCValAssign::LocInfo::ZExt:
+ MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, ValVReg, Addr, *MMO);
+ return;
+ case CCValAssign::LocInfo::SExt:
+ MIRBuilder.buildLoadInstr(TargetOpcode::G_SEXTLOAD, ValVReg, Addr, *MMO);
+ return;
+ default:
+ MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
+ return;
+ }
}
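
The switch added above makes the kind of load match how the value was promoted: arguments recorded as zero-extended are reloaded with G_ZEXTLOAD and sign-extended ones with G_SEXTLOAD, with a plain G_LOAD otherwise. A small standalone illustration of why the flavor of extension matters (hypothetical helper, not part of the patch):

    #include <cstdint>

    // The same narrow byte pattern re-extends to different wide values.
    int64_t reloadByte(uint8_t StoredByte, bool WasSignExtended) {
      if (WasSignExtended)                      // G_SEXTLOAD case
        return static_cast<int64_t>(static_cast<int8_t>(StoredByte));
      return static_cast<int64_t>(StoredByte);  // G_ZEXTLOAD case
    }
    // reloadByte(0xFF, true) == -1, reloadByte(0xFF, false) == 255.
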
/// How the physical register gets marked varies between formal
@@ -270,7 +281,7 @@ struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler {
}
void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign &VA) override {
+ CCValAssign VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
Register ExtReg = extendRegister(ValVReg, VA);
MIRBuilder.buildCopy(PhysReg, ExtReg);
@@ -376,11 +387,9 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
MVT NewVT = TLI.getRegisterTypeForCallingConv(Ctx, CC, SplitEVTs[i]);
if (EVT(NewVT) != SplitEVTs[i]) {
unsigned ExtendOp = TargetOpcode::G_ANYEXT;
- if (F.getAttributes().hasAttribute(AttributeList::ReturnIndex,
- Attribute::SExt))
+ if (F.getAttributes().hasRetAttr(Attribute::SExt))
ExtendOp = TargetOpcode::G_SEXT;
- else if (F.getAttributes().hasAttribute(AttributeList::ReturnIndex,
- Attribute::ZExt))
+ else if (F.getAttributes().hasRetAttr(Attribute::ZExt))
ExtendOp = TargetOpcode::G_ZEXT;
LLT NewLLT(NewVT);
@@ -522,6 +531,7 @@ bool AArch64CallLowering::lowerFormalArguments(
auto &DL = F.getParent()->getDataLayout();
SmallVector<ArgInfo, 8> SplitArgs;
+ SmallVector<std::pair<Register, Register>> BoolArgs;
unsigned i = 0;
for (auto &Arg : F.args()) {
if (DL.getTypeStoreSize(Arg.getType()).isZero())
@@ -530,6 +540,22 @@ bool AArch64CallLowering::lowerFormalArguments(
ArgInfo OrigArg{VRegs[i], Arg, i};
setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, F);
+ // i1 arguments are zero-extended to i8 by the caller. Emit a
+ // hint to reflect this.
+ if (OrigArg.Ty->isIntegerTy(1)) {
+ assert(OrigArg.Regs.size() == 1 &&
+ MRI.getType(OrigArg.Regs[0]).getSizeInBits() == 1 &&
+ "Unexpected registers used for i1 arg");
+
+ if (!OrigArg.Flags[0].isZExt()) {
+ // Lower i1 argument as i8, and insert AssertZExt + Trunc later.
+ Register OrigReg = OrigArg.Regs[0];
+ Register WideReg = MRI.createGenericVirtualRegister(LLT::scalar(8));
+ OrigArg.Regs[0] = WideReg;
+ BoolArgs.push_back({OrigReg, WideReg});
+ }
+ }
+
if (Arg.hasAttribute(Attribute::SwiftAsync))
MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
@@ -550,6 +576,18 @@ bool AArch64CallLowering::lowerFormalArguments(
F.getCallingConv(), F.isVarArg()))
return false;
+ if (!BoolArgs.empty()) {
+ for (auto &KV : BoolArgs) {
+ Register OrigReg = KV.first;
+ Register WideReg = KV.second;
+ LLT WideTy = MRI.getType(WideReg);
+ assert(MRI.getType(OrigReg).getScalarSizeInBits() == 1 &&
+ "Unexpected bit size of a bool arg");
+ MIRBuilder.buildTrunc(
+ OrigReg, MIRBuilder.buildAssertZExt(WideTy, WideReg, 1).getReg(0));
+ }
+ }
+
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
uint64_t StackOffset = Assigner.StackOffset;
if (F.isVarArg()) {
@@ -1042,8 +1080,19 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
for (auto &OrigArg : Info.OrigArgs) {
splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv);
// AAPCS requires that we zero-extend i1 to 8 bits by the caller.
- if (OrigArg.Ty->isIntegerTy(1))
- OutArgs.back().Flags[0].setZExt();
+ if (OrigArg.Ty->isIntegerTy(1)) {
+ ArgInfo &OutArg = OutArgs.back();
+ assert(OutArg.Regs.size() == 1 &&
+ MRI.getType(OutArg.Regs[0]).getSizeInBits() == 1 &&
+ "Unexpected registers used for i1 arg");
+
+ // We cannot use a ZExt ArgInfo flag here, because it will
+ // zero-extend the argument to i32 instead of just i8.
+ OutArg.Regs[0] =
+ MIRBuilder.buildZExt(LLT::scalar(8), OutArg.Regs[0]).getReg(0);
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ OutArg.Ty = Type::getInt8Ty(Ctx);
+ }
}
SmallVector<ArgInfo, 8> InArgs;
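
Both i1 hunks in this file implement the rule stated in the comments above: AAPCS makes the caller responsible for zero-extending an i1 argument to 8 bits, so the caller-side code widens the value to s8 before assignment, and the callee-side code receives s8, asserts the zero-extension, and truncates back to s1. A trivial standalone view of that contract (hypothetical helpers, illustrative only):

    #include <cstdint>

    // Caller side: an i1 (bool) argument travels in the low byte, zero-extended,
    // so bits [7:1] are guaranteed to be zero.
    uint8_t packBoolArg(bool B) { return B ? 1u : 0u; }

    // Callee side: after asserting the zero-extension it is safe to truncate the
    // byte back down to a single bit.
    bool unpackBoolArg(uint8_t Arg) { return (Arg & 0x1) != 0; }
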
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp
index 08d1c987dc3b..38afc5deb42f 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp
@@ -26,7 +26,7 @@ AArch64GISelUtils::getAArch64VectorSplat(const MachineInstr &MI,
return None;
Register Src = MI.getOperand(1).getReg();
if (auto ValAndVReg =
- getConstantVRegValWithLookThrough(MI.getOperand(1).getReg(), MRI))
+ getAnyConstantVRegValWithLookThrough(MI.getOperand(1).getReg(), MRI))
return RegOrConstant(ValAndVReg->Value.getSExtValue());
return RegOrConstant(Src);
}
@@ -56,7 +56,7 @@ bool AArch64GISelUtils::isCMN(const MachineInstr *MaybeSub,
!CmpInst::isEquality(Pred))
return false;
auto MaybeZero =
- getConstantVRegValWithLookThrough(MaybeSub->getOperand(1).getReg(), MRI);
+ getIConstantVRegValWithLookThrough(MaybeSub->getOperand(1).getReg(), MRI);
return MaybeZero && MaybeZero->Value.getZExtValue() == 0;
}
@@ -68,7 +68,8 @@ bool AArch64GISelUtils::tryEmitBZero(MachineInstr &MI,
auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
if (!TLI.getLibcallName(RTLIB::BZERO))
return false;
- auto Zero = getConstantVRegValWithLookThrough(MI.getOperand(1).getReg(), MRI);
+ auto Zero =
+ getIConstantVRegValWithLookThrough(MI.getOperand(1).getReg(), MRI);
if (!Zero || Zero->Value.getSExtValue() != 0)
return false;
@@ -78,8 +79,8 @@ bool AArch64GISelUtils::tryEmitBZero(MachineInstr &MI,
if (!MinSize) {
// If the size is known, check it. If it is not known, assume using bzero is
// better.
- if (auto Size =
- getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI)) {
+ if (auto Size = getIConstantVRegValWithLookThrough(
+ MI.getOperand(2).getReg(), MRI)) {
if (Size->Value.getSExtValue() <= 256)
return false;
}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index a98248438e40..e090d87d59a2 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -22,6 +22,7 @@
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
@@ -163,6 +164,9 @@ private:
bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI);
bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
MachineRegisterInfo &MRI);
+ /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
+ /// SUBREG_TO_REG.
+ bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
@@ -171,6 +175,14 @@ private:
bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
+
+ /// Helper function to select vector load intrinsics like
+ /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
+ /// \p Opc is the opcode that the selected instruction should use.
+ /// \p NumVecs is the number of vector destinations for the instruction.
+ /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
+ bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
+ MachineInstr &I);
bool selectIntrinsicWithSideEffects(MachineInstr &I,
MachineRegisterInfo &MRI);
bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
@@ -181,6 +193,7 @@ private:
bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
+ bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
unsigned emitConstantPoolEntry(const Constant *CPVal,
MachineFunction &MF) const;
@@ -263,13 +276,9 @@ private:
const RegisterBank &DstRB, LLT ScalarTy,
Register VecReg, unsigned LaneIdx,
MachineIRBuilder &MIRBuilder) const;
-
- /// Emit a CSet for an integer compare.
- ///
- /// \p DefReg and \p SrcReg are expected to be 32-bit scalar registers.
- MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred,
- MachineIRBuilder &MIRBuilder,
- Register SrcReg = AArch64::WZR) const;
+ MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
+ AArch64CC::CondCode Pred,
+ MachineIRBuilder &MIRBuilder) const;
/// Emit a CSet for a FP compare.
///
/// \p Dst is expected to be a 32-bit scalar register.
@@ -367,18 +376,15 @@ private:
return selectAddrModeWRO(Root, Width / 8);
}
- ComplexRendererFns selectShiftedRegister(MachineOperand &Root) const;
+ ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
+ bool AllowROR = false) const;
ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
return selectShiftedRegister(Root);
}
ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
- // TODO: selectShiftedRegister should allow for rotates on logical shifts.
- // For now, make them the same. The only difference between the two is that
- // logical shifts are allowed to fold in rotates. Otherwise, these are
- // functionally the same.
- return selectShiftedRegister(Root);
+ return selectShiftedRegister(Root, true);
}
/// Given an extend instruction, determine the correct shift-extend type for
@@ -496,14 +502,18 @@ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
}
if (RB.getID() == AArch64::FPRRegBankID) {
- if (Ty.getSizeInBits() <= 16)
+ switch (Ty.getSizeInBits()) {
+ case 8:
+ return &AArch64::FPR8RegClass;
+ case 16:
return &AArch64::FPR16RegClass;
- if (Ty.getSizeInBits() == 32)
+ case 32:
return &AArch64::FPR32RegClass;
- if (Ty.getSizeInBits() == 64)
+ case 64:
return &AArch64::FPR64RegClass;
- if (Ty.getSizeInBits() == 128)
+ case 128:
return &AArch64::FPR128RegClass;
+ }
return nullptr;
}
@@ -652,7 +662,7 @@ static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
Immed = Root.getCImm()->getZExtValue();
else if (Root.isReg()) {
auto ValAndVReg =
- getConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
+ getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
if (!ValAndVReg)
return None;
Immed = ValAndVReg->Value.getSExtValue();
@@ -810,6 +820,8 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
return isStore ? AArch64::STRSui : AArch64::LDRSui;
case 64:
return isStore ? AArch64::STRDui : AArch64::LDRDui;
+ case 128:
+ return isStore ? AArch64::STRQui : AArch64::LDRQui;
}
break;
}
@@ -1195,8 +1207,8 @@ AArch64InstructionSelector::emitSelect(Register Dst, Register True,
&Optimized]() {
if (Optimized)
return false;
- auto TrueCst = getConstantVRegValWithLookThrough(True, MRI);
- auto FalseCst = getConstantVRegValWithLookThrough(False, MRI);
+ auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI);
+ auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI);
if (!TrueCst && !FalseCst)
return false;
@@ -1301,6 +1313,7 @@ static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
MachineRegisterInfo &MRI) {
assert(Reg.isValid() && "Expected valid register!");
+ bool HasZext = false;
while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
unsigned Opc = MI->getOpcode();
@@ -1314,6 +1327,9 @@ static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
// on the truncated x is the same as the bit number on x.
if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
Opc == TargetOpcode::G_TRUNC) {
+ if (Opc == TargetOpcode::G_ZEXT)
+ HasZext = true;
+
Register NextReg = MI->getOperand(1).getReg();
// Did we find something worth folding?
if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
@@ -1334,16 +1350,20 @@ static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
case TargetOpcode::G_XOR: {
TestReg = MI->getOperand(1).getReg();
Register ConstantReg = MI->getOperand(2).getReg();
- auto VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI);
+ auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
if (!VRegAndVal) {
// AND commutes, check the other side for a constant.
// FIXME: Can we canonicalize the constant so that it's always on the
// same side at some point earlier?
std::swap(ConstantReg, TestReg);
- VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI);
+ VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
+ }
+ if (VRegAndVal) {
+ if (HasZext)
+ C = VRegAndVal->Value.getZExtValue();
+ else
+ C = VRegAndVal->Value.getSExtValue();
}
- if (VRegAndVal)
- C = VRegAndVal->Value.getSExtValue();
break;
}
case TargetOpcode::G_ASHR:
@@ -1351,7 +1371,7 @@ static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
case TargetOpcode::G_SHL: {
TestReg = MI->getOperand(1).getReg();
auto VRegAndVal =
- getConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
+ getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
if (VRegAndVal)
C = VRegAndVal->Value.getSExtValue();
break;
@@ -1479,7 +1499,7 @@ bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
// Check if the AND has a constant on its RHS which we can use as a mask.
// If it's a power of 2, then it's the same as checking a specific bit.
// (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
- auto MaybeBit = getConstantVRegValWithLookThrough(
+ auto MaybeBit = getIConstantVRegValWithLookThrough(
AndInst.getOperand(2).getReg(), *MIB.getMRI());
if (!MaybeBit)
return false;
@@ -1555,7 +1575,7 @@ bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
Register RHS = ICmp.getOperand(3).getReg();
// We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
- auto VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
+ auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
// When we can emit a TB(N)Z, prefer that.
@@ -1590,7 +1610,7 @@ bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
if (ICmpInst::isEquality(Pred)) {
if (!VRegAndVal) {
std::swap(RHS, LHS);
- VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
+ VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
}
@@ -2049,7 +2069,7 @@ bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
// selector which will match the register variant.
assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
const auto &MO = I.getOperand(2);
- auto VRegAndVal = getConstantVRegVal(MO.getReg(), MRI);
+ auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI);
if (!VRegAndVal)
return false;
@@ -2131,7 +2151,7 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
// Before selecting a DUP instruction, check if it is better selected as a
// MOV or load from a constant pool.
Register Src = I.getOperand(1).getReg();
- auto ValAndVReg = getConstantVRegValWithLookThrough(Src, MRI);
+ auto ValAndVReg = getIConstantVRegValWithLookThrough(Src, MRI);
if (!ValAndVReg)
return false;
LLVMContext &Ctx = MF.getFunction().getContext();
@@ -2145,17 +2165,14 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
I.eraseFromParent();
return true;
}
- case TargetOpcode::G_BR: {
- // If the branch jumps to the fallthrough block, don't bother emitting it.
- // Only do this for -O0 for a good code size improvement, because when
- // optimizations are enabled we want to leave this choice to
- // MachineBlockPlacement.
- bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOpt::None;
- if (EnableOpt || !MBB.isLayoutSuccessor(I.getOperand(0).getMBB()))
- return false;
- I.eraseFromParent();
- return true;
- }
+ case TargetOpcode::G_SEXT:
+ // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV
+ // over a normal extend.
+ if (selectUSMovFromExtend(I, MRI))
+ return true;
+ return false;
+ case TargetOpcode::G_BR:
+ return false;
case TargetOpcode::G_SHL:
return earlySelectSHL(I, MRI);
case TargetOpcode::G_CONSTANT: {
@@ -2192,27 +2209,55 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
// fold the add into the cset for the cmp by using cinc.
//
// FIXME: This would probably be a lot nicer in PostLegalizerLowering.
- Register X = I.getOperand(1).getReg();
-
- // Only handle scalars. Scalar G_ICMP is only legal for s32, so bail out
- // early if we see it.
- LLT Ty = MRI.getType(X);
- if (Ty.isVector() || Ty.getSizeInBits() != 32)
+ Register AddDst = I.getOperand(0).getReg();
+ Register AddLHS = I.getOperand(1).getReg();
+ Register AddRHS = I.getOperand(2).getReg();
+ // Only handle scalars.
+ LLT Ty = MRI.getType(AddLHS);
+ if (Ty.isVector())
return false;
-
- Register CmpReg = I.getOperand(2).getReg();
- MachineInstr *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, CmpReg, MRI);
+ // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64
+ // bits.
+ unsigned Size = Ty.getSizeInBits();
+ if (Size != 32 && Size != 64)
+ return false;
+ auto MatchCmp = [&](Register Reg) -> MachineInstr * {
+ if (!MRI.hasOneNonDBGUse(Reg))
+ return nullptr;
+ // If the LHS of the add is 32 bits, then we want to fold a 32-bit
+ // compare.
+ if (Size == 32)
+ return getOpcodeDef(TargetOpcode::G_ICMP, Reg, MRI);
+ // We model scalar compares using 32-bit destinations right now.
+ // If it's a 64-bit compare, it'll have 64-bit sources.
+ Register ZExt;
+ if (!mi_match(Reg, MRI,
+ m_OneNonDBGUse(m_GZExt(m_OneNonDBGUse(m_Reg(ZExt))))))
+ return nullptr;
+ auto *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, ZExt, MRI);
+ if (!Cmp ||
+ MRI.getType(Cmp->getOperand(2).getReg()).getSizeInBits() != 64)
+ return nullptr;
+ return Cmp;
+ };
+ // Try to match
+ // z + (cmp pred, x, y)
+ MachineInstr *Cmp = MatchCmp(AddRHS);
if (!Cmp) {
- std::swap(X, CmpReg);
- Cmp = getOpcodeDef(TargetOpcode::G_ICMP, CmpReg, MRI);
+ // (cmp pred, x, y) + z
+ std::swap(AddLHS, AddRHS);
+ Cmp = MatchCmp(AddRHS);
if (!Cmp)
return false;
}
- auto Pred =
- static_cast<CmpInst::Predicate>(Cmp->getOperand(1).getPredicate());
- emitIntegerCompare(Cmp->getOperand(2), Cmp->getOperand(3),
- Cmp->getOperand(1), MIB);
- emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIB, X);
+ auto &PredOp = Cmp->getOperand(1);
+ auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
+ const AArch64CC::CondCode InvCC =
+ changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
+ MIB.setInstrAndDebugLoc(I);
+ emitIntegerCompare(/*LHS=*/Cmp->getOperand(2),
+ /*RHS=*/Cmp->getOperand(3), PredOp, MIB);
+ emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB);
I.eraseFromParent();
return true;
}
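
For reference, the source-level shape this G_ADD early-selection targets, and why an inverted condition plus a repeated source register amounts to a conditional increment (a hand-written sketch, not compiler output):

    // Pattern being folded: the boolean result of a compare added to a value.
    unsigned addCompare(unsigned Z, unsigned X, unsigned Y) {
      return Z + (X == Y); // ideally: cmp x, y ; cinc w0, z, eq
    }
    // CINC Wd, Wn, cc is the alias CSINC Wd, Wn, Wn, invert(cc): CSINC yields
    // Wn when its condition holds and Wm + 1 otherwise, so passing AddLHS for
    // both sources together with the inverted predicate adds exactly one when
    // the original compare is true.
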
@@ -2352,10 +2397,10 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
unsigned Size = Ty.getSizeInBits();
unsigned Opc = OpcTable[IsSigned][Size == 64];
auto Cst1 =
- getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
+ getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
assert(Cst1 && "Should have gotten a constant for src 1?");
auto Cst2 =
- getConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
+ getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
assert(Cst2 && "Should have gotten a constant for src 2?");
auto LSB = Cst1->Value.getZExtValue();
auto Width = Cst2->Value.getZExtValue();
@@ -2456,10 +2501,10 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
// FIXME: Redundant check, but even less readable when factored out.
if (isFP) {
- if (Ty != s32 && Ty != s64 && Ty != s128) {
+ if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) {
LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
- << " constant, expected: " << s32 << " or " << s64
- << " or " << s128 << '\n');
+ << " constant, expected: " << s16 << " or " << s32
+ << " or " << s64 << " or " << s128 << '\n');
return false;
}
@@ -2493,23 +2538,20 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
}
}
- // We allow G_CONSTANT of types < 32b.
- const unsigned MovOpc =
- DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
-
if (isFP) {
- // Either emit a FMOV, or emit a copy to emit a normal mov.
- const TargetRegisterClass &GPRRC =
- DefSize == 32 ? AArch64::GPR32RegClass : AArch64::GPR64RegClass;
- const TargetRegisterClass &FPRRC =
- DefSize == 32 ? AArch64::FPR32RegClass
- : (DefSize == 64 ? AArch64::FPR64RegClass
- : AArch64::FPR128RegClass);
-
- // For 64b values, emit a constant pool load instead.
- // For s32, use a cp load if we have optsize/minsize.
- if (DefSize == 64 || DefSize == 128 ||
- (DefSize == 32 && shouldOptForSize(&MF))) {
+ const TargetRegisterClass &FPRRC = *getMinClassForRegBank(RB, DefSize);
+ // For 16, 64, and 128b values, emit a constant pool load.
+ switch (DefSize) {
+ default:
+ llvm_unreachable("Unexpected destination size for G_FCONSTANT?");
+ case 32:
+ // For s32, use a cp load if we have optsize/minsize.
+ if (!shouldOptForSize(&MF))
+ break;
+ LLVM_FALLTHROUGH;
+ case 16:
+ case 64:
+ case 128: {
auto *FPImm = I.getOperand(1).getFPImm();
auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
if (!LoadMI) {
@@ -2520,9 +2562,13 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
I.eraseFromParent();
return RBI.constrainGenericRegister(DefReg, FPRRC, MRI);
}
+ }
- // Nope. Emit a copy and use a normal mov instead.
- const Register DefGPRReg = MRI.createVirtualRegister(&GPRRC);
+ // Either emit a FMOV, or emit a copy to emit a normal mov.
+ assert(DefSize == 32 &&
+ "Expected constant pool loads for all sizes other than 32!");
+ const Register DefGPRReg =
+ MRI.createVirtualRegister(&AArch64::GPR32RegClass);
MachineOperand &RegOp = I.getOperand(0);
RegOp.setReg(DefGPRReg);
MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
@@ -2545,6 +2591,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
I.getOperand(1).ChangeToImmediate(Val);
}
+ const unsigned MovOpc =
+ DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
I.setDesc(TII.get(MovOpc));
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
return true;
@@ -2693,8 +2741,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ZEXTLOAD:
case TargetOpcode::G_LOAD:
case TargetOpcode::G_STORE: {
+ GLoadStore &LdSt = cast<GLoadStore>(I);
bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
- LLT PtrTy = MRI.getType(I.getOperand(1).getReg());
+ LLT PtrTy = MRI.getType(LdSt.getPointerReg());
if (PtrTy != LLT::pointer(0, 64)) {
LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
@@ -2702,26 +2751,33 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
return false;
}
- auto &MemOp = **I.memoperands_begin();
- uint64_t MemSizeInBytes = MemOp.getSize();
- unsigned MemSizeInBits = MemSizeInBytes * 8;
- AtomicOrdering Order = MemOp.getSuccessOrdering();
+ uint64_t MemSizeInBytes = LdSt.getMemSize();
+ unsigned MemSizeInBits = LdSt.getMemSizeInBits();
+ AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
// Need special instructions for atomics that affect ordering.
if (Order != AtomicOrdering::NotAtomic &&
Order != AtomicOrdering::Unordered &&
Order != AtomicOrdering::Monotonic) {
- assert(I.getOpcode() != TargetOpcode::G_ZEXTLOAD);
+ assert(!isa<GZExtLoad>(LdSt));
if (MemSizeInBytes > 64)
return false;
- if (I.getOpcode() == TargetOpcode::G_LOAD) {
+ if (isa<GLoad>(LdSt)) {
static unsigned Opcodes[] = {AArch64::LDARB, AArch64::LDARH,
AArch64::LDARW, AArch64::LDARX};
I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
} else {
static unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
AArch64::STLRW, AArch64::STLRX};
+ Register ValReg = LdSt.getReg(0);
+ if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) {
+ // Emit a subreg copy of 32 bits.
+ Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+ MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {})
+ .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32);
+ I.getOperand(0).setReg(NewVal);
+ }
I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
}
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
@@ -2729,22 +2785,64 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
}
#ifndef NDEBUG
- const Register PtrReg = I.getOperand(1).getReg();
+ const Register PtrReg = LdSt.getPointerReg();
const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
- // Sanity-check the pointer register.
+ // Check that the pointer register is valid.
assert(PtrRB.getID() == AArch64::GPRRegBankID &&
"Load/Store pointer operand isn't a GPR");
assert(MRI.getType(PtrReg).isPointer() &&
"Load/Store pointer operand isn't a pointer");
#endif
- const Register ValReg = I.getOperand(0).getReg();
+ const Register ValReg = LdSt.getReg(0);
+ const LLT ValTy = MRI.getType(ValReg);
const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
+ // The code below doesn't support truncating stores, so we need to split it
+ // again.
+ if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
+ unsigned SubReg;
+ LLT MemTy = LdSt.getMMO().getMemoryType();
+ auto *RC = getRegClassForTypeOnBank(MemTy, RB, RBI);
+ if (!getSubRegForClass(RC, TRI, SubReg))
+ return false;
+
+ // Generate a subreg copy.
+ auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {})
+ .addReg(ValReg, 0, SubReg)
+ .getReg(0);
+ RBI.constrainGenericRegister(Copy, *RC, MRI);
+ LdSt.getOperand(0).setReg(Copy);
+ } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
+ // If this is an any-extending load from the FPR bank, split it into a regular
+ // load + extend.
+ if (RB.getID() == AArch64::FPRRegBankID) {
+ unsigned SubReg;
+ LLT MemTy = LdSt.getMMO().getMemoryType();
+ auto *RC = getRegClassForTypeOnBank(MemTy, RB, RBI);
+ if (!getSubRegForClass(RC, TRI, SubReg))
+ return false;
+ Register OldDst = LdSt.getReg(0);
+ Register NewDst =
+ MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType());
+ LdSt.getOperand(0).setReg(NewDst);
+ MRI.setRegBank(NewDst, RB);
+ // Generate a SUBREG_TO_REG to extend it.
+ MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator()));
+ MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {})
+ .addImm(0)
+ .addUse(NewDst)
+ .addImm(SubReg);
+ auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB, RBI);
+ RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI);
+ MIB.setInstr(LdSt);
+ }
+ }
+
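
The two branches above exist because the selected load/store opcodes never change the value's width themselves: a truncating G_STORE first narrows the value with a sub-register copy, and an any-extending FPR G_LOAD is rebuilt as a narrow load whose result is widened with SUBREG_TO_REG. The store half of that, in ordinary C++ terms (illustrative only):

    #include <cstdint>
    #include <cstring>

    // Conceptual effect of the truncating-store split: narrow the value to the
    // memory width first, then perform a plain store of that width.
    void storeLow32(uint64_t Val, uint32_t *Mem) {
      uint32_t Narrow = static_cast<uint32_t>(Val); // the "sub-register copy"
      std::memcpy(Mem, &Narrow, sizeof(Narrow));    // plain 32-bit store
    }
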
// Helper lambda for partially selecting I. Either returns the original
// instruction with an updated opcode, or a new instruction.
auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
- bool IsStore = I.getOpcode() == TargetOpcode::G_STORE;
+ bool IsStore = isa<GStore>(I);
const unsigned NewOpc =
selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
if (NewOpc == I.getOpcode())
@@ -2761,7 +2859,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
// Folded something. Create a new instruction and return it.
auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
- IsStore ? NewInst.addUse(ValReg) : NewInst.addDef(ValReg);
+ Register CurValReg = I.getOperand(0).getReg();
+ IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
NewInst.cloneMemRefs(I);
for (auto &Fn : *AddrModeFns)
Fn(NewInst);
@@ -2775,9 +2874,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
// If we're storing a 0, use WZR/XZR.
if (Opcode == TargetOpcode::G_STORE) {
- auto CVal = getConstantVRegValWithLookThrough(
- LoadStore->getOperand(0).getReg(), MRI, /*LookThroughInstrs = */ true,
- /*HandleFConstants = */ false);
+ auto CVal = getIConstantVRegValWithLookThrough(
+ LoadStore->getOperand(0).getReg(), MRI);
if (CVal && CVal->Value == 0) {
switch (LoadStore->getOpcode()) {
case AArch64::STRWui:
@@ -2897,17 +2995,15 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
// false, so to get the increment when it's true, we need to use the
// inverse. In this case, we want to increment when carry is set.
Register ZReg = AArch64::WZR;
- auto CsetMI = MIB.buildInstr(AArch64::CSINCWr, {I.getOperand(1).getReg()},
- {ZReg, ZReg})
- .addImm(getInvertedCondCode(OpAndCC.second));
- constrainSelectedInstRegOperands(*CsetMI, TII, TRI, RBI);
+ emitCSINC(/*Dst=*/I.getOperand(1).getReg(), /*Src1=*/ZReg, /*Src2=*/ZReg,
+ getInvertedCondCode(OpAndCC.second), MIB);
I.eraseFromParent();
return true;
}
case TargetOpcode::G_PTRMASK: {
Register MaskReg = I.getOperand(2).getReg();
- Optional<int64_t> MaskVal = getConstantVRegSExtVal(MaskReg, MRI);
+ Optional<int64_t> MaskVal = getIConstantVRegSExtVal(MaskReg, MRI);
// TODO: Implement arbitrary cases
if (!MaskVal || !isShiftedMask_64(*MaskVal))
return false;
@@ -2991,7 +3087,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
if (Opcode == TargetOpcode::G_PTRTOINT) {
assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
I.setDesc(TII.get(TargetOpcode::COPY));
- return true;
+ return selectCopy(I, TII, MRI, TRI, RBI);
}
}
@@ -2999,6 +3095,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
}
case TargetOpcode::G_ANYEXT: {
+ if (selectUSMovFromExtend(I, MRI))
+ return true;
+
const Register DstReg = I.getOperand(0).getReg();
const Register SrcReg = I.getOperand(1).getReg();
@@ -3045,6 +3144,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ZEXT:
case TargetOpcode::G_SEXT_INREG:
case TargetOpcode::G_SEXT: {
+ if (selectUSMovFromExtend(I, MRI))
+ return true;
+
unsigned Opcode = I.getOpcode();
const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
const Register DefReg = I.getOperand(0).getReg();
@@ -3231,9 +3333,11 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
}
auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
- emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1),
- MIB);
- emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIB);
+ const AArch64CC::CondCode InvCC =
+ changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
+ emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB);
+ emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR,
+ /*Src2=*/AArch64::WZR, InvCC, MIB);
I.eraseFromParent();
return true;
}
@@ -3839,6 +3943,10 @@ static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
// Choose a lane copy opcode and subregister based off of the size of the
// vector's elements.
switch (EltSize) {
+ case 8:
+ CopyOpc = AArch64::CPYi8;
+ ExtractSubReg = AArch64::bsub;
+ break;
case 16:
CopyOpc = AArch64::CPYi16;
ExtractSubReg = AArch64::hsub;
@@ -3942,7 +4050,7 @@ bool AArch64InstructionSelector::selectExtractElt(
}
// Find the index to extract from.
- auto VRegAndVal = getConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
+ auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
if (!VRegAndVal)
return false;
unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
@@ -4164,6 +4272,13 @@ MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
.addConstantPoolIndex(CPIdx, 0,
AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
break;
+ case 2:
+ LoadMI =
+ &*MIRBuilder
+ .buildInstr(AArch64::LDRHui, {&AArch64::FPR16RegClass}, {Adrp})
+ .addConstantPoolIndex(CPIdx, 0,
+ AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ break;
default:
LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
<< *CPVal->getType());
@@ -4326,7 +4441,7 @@ AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
{AArch64::ANDSXrr, AArch64::ANDSWrr}};
// ANDS needs a logical immediate for its immediate form. Check if we can
// fold one in.
- if (auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI)) {
+ if (auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI)) {
int64_t Imm = ValAndVReg->Value.getSExtValue();
if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) {
@@ -4368,25 +4483,19 @@ MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
"Expected a 32-bit scalar register?");
#endif
- const Register ZeroReg = AArch64::WZR;
- auto EmitCSet = [&](Register CsetDst, AArch64CC::CondCode CC) {
- auto CSet =
- MIRBuilder.buildInstr(AArch64::CSINCWr, {CsetDst}, {ZeroReg, ZeroReg})
- .addImm(getInvertedCondCode(CC));
- constrainSelectedInstRegOperands(*CSet, TII, TRI, RBI);
- return &*CSet;
- };
-
+ const Register ZReg = AArch64::WZR;
AArch64CC::CondCode CC1, CC2;
changeFCMPPredToAArch64CC(Pred, CC1, CC2);
+ auto InvCC1 = AArch64CC::getInvertedCondCode(CC1);
if (CC2 == AArch64CC::AL)
- return EmitCSet(Dst, CC1);
-
+ return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1,
+ MIRBuilder);
const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
Register Def1Reg = MRI.createVirtualRegister(RC);
Register Def2Reg = MRI.createVirtualRegister(RC);
- EmitCSet(Def1Reg, CC1);
- EmitCSet(Def2Reg, CC2);
+ auto InvCC2 = AArch64CC::getInvertedCondCode(CC2);
+ emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder);
+ emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC2, MIRBuilder);
auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
return &*OrMI;
@@ -4495,16 +4604,25 @@ MachineInstr *AArch64InstructionSelector::emitVectorConcat(
}
MachineInstr *
-AArch64InstructionSelector::emitCSetForICMP(Register DefReg, unsigned Pred,
- MachineIRBuilder &MIRBuilder,
- Register SrcReg) const {
- // CSINC increments the result when the predicate is false. Invert it.
- const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
- CmpInst::getInversePredicate((CmpInst::Predicate)Pred));
- auto I = MIRBuilder.buildInstr(AArch64::CSINCWr, {DefReg}, {SrcReg, SrcReg})
- .addImm(InvCC);
- constrainSelectedInstRegOperands(*I, TII, TRI, RBI);
- return &*I;
+AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
+ Register Src2, AArch64CC::CondCode Pred,
+ MachineIRBuilder &MIRBuilder) const {
+ auto &MRI = *MIRBuilder.getMRI();
+ const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Dst);
+ // If we used a register class, then this won't necessarily have an LLT.
+ // Compute the size based off whether or not we have a class or bank.
+ unsigned Size;
+ if (const auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
+ Size = TRI.getRegSizeInBits(*RC);
+ else
+ Size = MRI.getType(Dst).getSizeInBits();
+ // Some opcodes use s1.
+ assert(Size <= 64 && "Expected 64 bits or less only!");
+ static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr};
+ unsigned Opc = OpcTable[Size == 64];
+ auto CSINC = MIRBuilder.buildInstr(Opc, {Dst}, {Src1, Src2}).addImm(Pred);
+ constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI);
+ return &*CSINC;
}
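
emitCSINC above can replace the old emitCSetForICMP because CSET is itself a CSINC alias, so one helper serves integer compares, FP compares, and the add-of-compare fold: CSET Wd, cc is CSINC Wd, WZR, WZR, invert(cc), and CINC Wd, Wn, cc is CSINC Wd, Wn, Wn, invert(cc). A small sketch of the size-based opcode choice the helper makes (hypothetical function; the real helper builds a MachineInstr):

    #include <cassert>
    #include <string>

    // Mirrors OpcTable[Size == 64] above: anything narrower than 64 bits
    // (including the s1 case the comment mentions) uses the 32-bit form.
    std::string pickCsincOpcode(unsigned SizeInBits) {
      assert(SizeInBits <= 64 && "Expected 64 bits or less only!");
      return SizeInBits == 64 ? "CSINCXr" : "CSINCWr";
    }
    // Call sites that want a plain CSet pass WZR for both sources along with
    // the inverted condition code.
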
std::pair<MachineInstr *, AArch64CC::CondCode>
@@ -4671,7 +4789,7 @@ MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
if (!CmpInst::isUnsigned(P) && LHSDef &&
LHSDef->getOpcode() == TargetOpcode::G_AND) {
// Make sure that the RHS is 0.
- auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI);
+ auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
if (!ValAndVReg || ValAndVReg->Value != 0)
return nullptr;
@@ -4792,6 +4910,71 @@ MachineInstr *AArch64InstructionSelector::emitLaneInsert(
return InsElt;
}
+bool AArch64InstructionSelector::selectUSMovFromExtend(
+ MachineInstr &MI, MachineRegisterInfo &MRI) {
+ if (MI.getOpcode() != TargetOpcode::G_SEXT &&
+ MI.getOpcode() != TargetOpcode::G_ZEXT &&
+ MI.getOpcode() != TargetOpcode::G_ANYEXT)
+ return false;
+ bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT;
+ const Register DefReg = MI.getOperand(0).getReg();
+ const LLT DstTy = MRI.getType(DefReg);
+ unsigned DstSize = DstTy.getSizeInBits();
+
+ if (DstSize != 32 && DstSize != 64)
+ return false;
+
+ MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT,
+ MI.getOperand(1).getReg(), MRI);
+ int64_t Lane;
+ if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane)))
+ return false;
+ Register Src0 = Extract->getOperand(1).getReg();
+
+ const LLT &VecTy = MRI.getType(Src0);
+
+ if (VecTy.getSizeInBits() != 128) {
+ const MachineInstr *ScalarToVector = emitScalarToVector(
+ VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB);
+ assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!");
+ Src0 = ScalarToVector->getOperand(0).getReg();
+ }
+
+ unsigned Opcode;
+ if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32)
+ Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32;
+ else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16)
+ Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16;
+ else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8)
+ Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8;
+ else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16)
+ Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16;
+ else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8)
+ Opcode = IsSigned ? AArch64::SMOVvi8to32 : AArch64::UMOVvi8;
+ else
+ llvm_unreachable("Unexpected type combo for S/UMov!");
+
+ // We may need to generate one of these, depending on the type and sign of the
+ // input:
+ // DstReg = SMOV Src0, Lane;
+ // NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32;
+ MachineInstr *ExtI = nullptr;
+ if (DstSize == 64 && !IsSigned) {
+ Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+ MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane);
+ ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
+ .addImm(0)
+ .addUse(NewReg)
+ .addImm(AArch64::sub_32);
+ RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
+ } else
+ ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane);
+
+ constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
+ MI.eraseFromParent();
+ return true;
+}
+
bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
@@ -4811,7 +4994,7 @@ bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
// Find the definition of the index. Bail out if it's not defined by a
// G_CONSTANT.
Register IdxReg = I.getOperand(3).getReg();
- auto VRegAndVal = getConstantVRegValWithLookThrough(IdxReg, MRI);
+ auto VRegAndVal = getIConstantVRegValWithLookThrough(IdxReg, MRI);
if (!VRegAndVal)
return false;
unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
@@ -4936,6 +5119,47 @@ bool AArch64InstructionSelector::tryOptConstantBuildVec(
return true;
}
+bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
+ MachineInstr &I, MachineRegisterInfo &MRI) {
+ // Given:
+ // %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef
+ //
+ // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt.
+ Register Dst = I.getOperand(0).getReg();
+ Register EltReg = I.getOperand(1).getReg();
+ LLT EltTy = MRI.getType(EltReg);
+ // If the index isn't on the same bank as its elements, then this can't be a
+ // SUBREG_TO_REG.
+ const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
+ const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI);
+ if (EltRB != DstRB)
+ return false;
+ if (any_of(make_range(I.operands_begin() + 2, I.operands_end()),
+ [&MRI](const MachineOperand &Op) {
+ return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(),
+ MRI);
+ }))
+ return false;
+ unsigned SubReg;
+ const TargetRegisterClass *EltRC =
+ getMinClassForRegBank(EltRB, EltTy.getSizeInBits());
+ if (!EltRC)
+ return false;
+ const TargetRegisterClass *DstRC =
+ getMinClassForRegBank(DstRB, MRI.getType(Dst).getSizeInBits());
+ if (!DstRC)
+ return false;
+ if (!getSubRegForClass(EltRC, TRI, SubReg))
+ return false;
+ auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {})
+ .addImm(0)
+ .addUse(EltReg)
+ .addImm(SubReg);
+ I.eraseFromParent();
+ constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI);
+ return RBI.constrainGenericRegister(Dst, *DstRC, MRI);
+}
+
bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
@@ -4947,6 +5171,9 @@ bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
if (tryOptConstantBuildVec(I, DstTy, MRI))
return true;
+ if (tryOptBuildVecToSubregToReg(I, MRI))
+ return true;
+
if (EltSize < 16 || EltSize > 64)
return false; // Don't support all element types yet.
const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
@@ -5013,24 +5240,45 @@ bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
return true;
}
-/// Helper function to find an intrinsic ID on an a MachineInstr. Returns the
-/// ID if it exists, and 0 otherwise.
-static unsigned findIntrinsicID(MachineInstr &I) {
- auto IntrinOp = find_if(I.operands(), [&](const MachineOperand &Op) {
- return Op.isIntrinsicID();
- });
- if (IntrinOp == I.operands_end())
- return 0;
- return IntrinOp->getIntrinsicID();
+bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
+ unsigned NumVecs,
+ MachineInstr &I) {
+ assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
+ assert(Opc && "Expected an opcode?");
+ assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
+ auto &MRI = *MIB.getMRI();
+ LLT Ty = MRI.getType(I.getOperand(0).getReg());
+ unsigned Size = Ty.getSizeInBits();
+ assert((Size == 64 || Size == 128) &&
+ "Destination must be 64 bits or 128 bits?");
+ unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
+ auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg();
+ assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
+ auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr});
+ Load.cloneMemRefs(I);
+ constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
+ Register SelectedLoadDst = Load->getOperand(0).getReg();
+ for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
+ auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {})
+ .addReg(SelectedLoadDst, 0, SubReg + Idx);
+ // Emit the subreg copies and immediately select them.
+ // FIXME: We should refactor our copy code into an emitCopy helper and
+ // clean up uses of this pattern elsewhere in the selector.
+ selectCopy(*Vec, TII, MRI, TRI, RBI);
+ }
+ return true;
}
bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
MachineInstr &I, MachineRegisterInfo &MRI) {
// Find the intrinsic ID.
- unsigned IntrinID = findIntrinsicID(I);
- if (!IntrinID)
- return false;
+ unsigned IntrinID = I.getIntrinsicID();
+ const LLT S8 = LLT::scalar(8);
+ const LLT S16 = LLT::scalar(16);
+ const LLT S32 = LLT::scalar(32);
+ const LLT S64 = LLT::scalar(64);
+ const LLT P0 = LLT::pointer(0, 64);
// Select the instruction.
switch (IntrinID) {
default:
@@ -5055,16 +5303,59 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
MIB.buildInstr(AArch64::BRK, {}, {})
.addImm(I.getOperand(1).getImm() | ('U' << 8));
break;
+ case Intrinsic::aarch64_neon_ld2: {
+ LLT Ty = MRI.getType(I.getOperand(0).getReg());
+ unsigned Opc = 0;
+ if (Ty == LLT::fixed_vector(8, S8))
+ Opc = AArch64::LD2Twov8b;
+ else if (Ty == LLT::fixed_vector(16, S8))
+ Opc = AArch64::LD2Twov16b;
+ else if (Ty == LLT::fixed_vector(4, S16))
+ Opc = AArch64::LD2Twov4h;
+ else if (Ty == LLT::fixed_vector(8, S16))
+ Opc = AArch64::LD2Twov8h;
+ else if (Ty == LLT::fixed_vector(2, S32))
+ Opc = AArch64::LD2Twov2s;
+ else if (Ty == LLT::fixed_vector(4, S32))
+ Opc = AArch64::LD2Twov4s;
+ else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+ Opc = AArch64::LD2Twov2d;
+ else if (Ty == S64 || Ty == P0)
+ Opc = AArch64::LD1Twov1d;
+ else
+ llvm_unreachable("Unexpected type for ld2!");
+ selectVectorLoadIntrinsic(Opc, 2, I);
+ break;
+ }
+ case Intrinsic::aarch64_neon_ld4: {
+ LLT Ty = MRI.getType(I.getOperand(0).getReg());
+ unsigned Opc = 0;
+ if (Ty == LLT::fixed_vector(8, S8))
+ Opc = AArch64::LD4Fourv8b;
+ else if (Ty == LLT::fixed_vector(16, S8))
+ Opc = AArch64::LD4Fourv16b;
+ else if (Ty == LLT::fixed_vector(4, S16))
+ Opc = AArch64::LD4Fourv4h;
+ else if (Ty == LLT::fixed_vector(8, S16))
+ Opc = AArch64::LD4Fourv8h;
+ else if (Ty == LLT::fixed_vector(2, S32))
+ Opc = AArch64::LD4Fourv2s;
+ else if (Ty == LLT::fixed_vector(4, S32))
+ Opc = AArch64::LD4Fourv4s;
+ else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+ Opc = AArch64::LD4Fourv2d;
+ else if (Ty == S64 || Ty == P0)
+ Opc = AArch64::LD1Fourv1d;
+ else
+ llvm_unreachable("Unexpected type for ld4!");
+ selectVectorLoadIntrinsic(Opc, 4, I);
+ break;
+ }
case Intrinsic::aarch64_neon_st2: {
Register Src1 = I.getOperand(1).getReg();
Register Src2 = I.getOperand(2).getReg();
Register Ptr = I.getOperand(3).getReg();
LLT Ty = MRI.getType(Src1);
- const LLT S8 = LLT::scalar(8);
- const LLT S16 = LLT::scalar(16);
- const LLT S32 = LLT::scalar(32);
- const LLT S64 = LLT::scalar(64);
- const LLT P0 = LLT::pointer(0, 64);
unsigned Opc;
if (Ty == LLT::fixed_vector(8, S8))
Opc = AArch64::ST2Twov8b;
@@ -5100,9 +5391,7 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
MachineRegisterInfo &MRI) {
- unsigned IntrinID = findIntrinsicID(I);
- if (!IntrinID)
- return false;
+ unsigned IntrinID = I.getIntrinsicID();
switch (IntrinID) {
default:
@@ -5146,6 +5435,33 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
I.eraseFromParent();
return true;
}
+ case Intrinsic::ptrauth_sign: {
+ Register DstReg = I.getOperand(0).getReg();
+ Register ValReg = I.getOperand(2).getReg();
+ uint64_t Key = I.getOperand(3).getImm();
+ Register DiscReg = I.getOperand(4).getReg();
+ auto DiscVal = getIConstantVRegVal(DiscReg, MRI);
+ bool IsDiscZero = DiscVal.hasValue() && DiscVal->isNullValue();
+
+ if (Key > 3)
+ return false;
+
+ unsigned Opcodes[][4] = {
+ {AArch64::PACIA, AArch64::PACIB, AArch64::PACDA, AArch64::PACDB},
+ {AArch64::PACIZA, AArch64::PACIZB, AArch64::PACDZA, AArch64::PACDZB}};
+ unsigned Opcode = Opcodes[IsDiscZero][Key];
+
+ auto PAC = MIB.buildInstr(Opcode, {DstReg}, {ValReg});
+
+ if (!IsDiscZero) {
+ PAC.addUse(DiscReg);
+ RBI.constrainGenericRegister(DiscReg, AArch64::GPR64spRegClass, MRI);
+ }
+
+ RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
+ I.eraseFromParent();
+ return true;
+ }
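The opcode table in the ptrauth_sign case encodes two independent choices: the key (IA/IB/DA/DB) and whether the discriminator is the constant zero, which selects the "Z" forms that take no discriminator register. A minimal sketch of the same two-dimensional lookup, using placeholder strings rather than real AArch64 opcode enums:

    #include <cstdint>
    #include <cstdio>

    // Rows: discriminator is a register (0) vs. constant zero (1).
    // Columns: key 0..3 = IA, IB, DA, DB (mirrors the Opcodes table in the patch).
    static const char *const SignOpcodes[2][4] = {
        {"PACIA", "PACIB", "PACDA", "PACDB"},
        {"PACIZA", "PACIZB", "PACDZA", "PACDZB"}};

    static const char *pickSignOpcode(uint64_t Key, bool DiscIsZero) {
      if (Key > 3)
        return nullptr; // unsupported key, mirrors the early return in the patch
      return SignOpcodes[DiscIsZero][Key];
    }

    int main() {
      std::printf("%s\n", pickSignOpcode(0, false)); // PACIA (needs disc register)
      std::printf("%s\n", pickSignOpcode(2, true));  // PACDZA (zero discriminator)
      return 0;
    }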
case Intrinsic::frameaddress:
case Intrinsic::returnaddress: {
MachineFunction &MF = *I.getParent()->getParent();
@@ -5403,7 +5719,7 @@ AArch64InstructionSelector::selectExtendedSHL(
// constant is the RHS.
Register OffsetReg = OffsetInst->getOperand(1).getReg();
Register ConstantReg = OffsetInst->getOperand(2).getReg();
- auto ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI);
+ auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
if (!ValAndVReg) {
// We didn't get a constant on the RHS. If the opcode is a shift, then
// we're done.
@@ -5412,7 +5728,7 @@ AArch64InstructionSelector::selectExtendedSHL(
// If we have a G_MUL, we can use either register. Try looking at the RHS.
std::swap(OffsetReg, ConstantReg);
- ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI);
+ ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
if (!ValAndVReg)
return None;
}
@@ -5580,7 +5896,7 @@ AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
// mov x0, wide
// ldr x2, [base, x0]
auto ValAndVReg =
- getConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
+ getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
if (ValAndVReg) {
unsigned Scale = Log2_32(SizeInBytes);
int64_t ImmOff = ValAndVReg->Value.getSExtValue();
@@ -5839,7 +6155,6 @@ AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
/// Given a shift instruction, return the correct shift type for that
/// instruction.
static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
- // TODO: Handle AArch64_AM::ROR
switch (MI.getOpcode()) {
default:
return AArch64_AM::InvalidShiftExtend;
@@ -5849,15 +6164,16 @@ static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
return AArch64_AM::LSR;
case TargetOpcode::G_ASHR:
return AArch64_AM::ASR;
+ case TargetOpcode::G_ROTR:
+ return AArch64_AM::ROR;
}
}
/// Select a "shifted register" operand. If the value is not shifted, set the
/// shift operand to a default value of "lsl 0".
-///
-/// TODO: Allow shifted register to be rotated in logical instructions.
InstructionSelector::ComplexRendererFns
-AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root) const {
+AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
+ bool AllowROR) const {
if (!Root.isReg())
return None;
MachineRegisterInfo &MRI =
@@ -5865,14 +6181,14 @@ AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root) const {
// Check if the operand is defined by an instruction which corresponds to
// a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
- //
- // TODO: Handle AArch64_AM::ROR for logical instructions.
MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
if (!ShiftInst)
return None;
AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
if (ShType == AArch64_AM::InvalidShiftExtend)
return None;
+ if (ShType == AArch64_AM::ROR && !AllowROR)
+ return None;
if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI))
return None;
@@ -6045,7 +6361,7 @@ void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
"Expected G_CONSTANT");
Optional<int64_t> CstVal =
- getConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
+ getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
assert(CstVal && "Expected constant value");
MIB.addImm(CstVal.getValue());
}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 08e4a119127c..1524aa5eb0ec 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -16,6 +16,7 @@
#include "AArch64Subtarget.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -23,6 +24,7 @@
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/MathExtras.h"
@@ -34,6 +36,7 @@ using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
+using namespace MIPatternMatch;
AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
: ST(&ST) {
@@ -45,7 +48,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
const LLT s128 = LLT::scalar(128);
- const LLT s256 = LLT::scalar(256);
const LLT v16s8 = LLT::fixed_vector(16, 8);
const LLT v8s8 = LLT::fixed_vector(8, 8);
const LLT v4s8 = LLT::fixed_vector(4, 8);
@@ -80,8 +82,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
.legalFor({p0, s1, s8, s16, s32, s64})
.legalFor(PackedVectorAllTypeList)
+ .widenScalarToNextPow2(0)
.clampScalar(0, s8, s64)
- .widenScalarToNextPow2(0, 8)
.fewerElementsIf(
[=](const LegalityQuery &Query) {
return Query.Types[0].isVector() &&
@@ -95,16 +97,22 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
return std::make_pair(0, EltTy);
});
- getActionDefinitionsBuilder(G_PHI).legalFor({p0, s16, s32, s64})
+ getActionDefinitionsBuilder(G_PHI)
+ .legalFor({p0, s16, s32, s64})
.legalFor(PackedVectorAllTypeList)
+ .widenScalarToNextPow2(0)
.clampScalar(0, s16, s64)
- .widenScalarToNextPow2(0);
+ // Maximum: sN * k = 128
+ .clampMaxNumElements(0, s8, 16)
+ .clampMaxNumElements(0, s16, 8)
+ .clampMaxNumElements(0, s32, 4)
+ .clampMaxNumElements(0, s64, 2)
+ .clampMaxNumElements(0, p0, 2);
getActionDefinitionsBuilder(G_BSWAP)
.legalFor({s32, s64, v4s32, v2s32, v2s64})
- .clampScalar(0, s32, s64)
.widenScalarToNextPow2(0)
- .customIf(typeIs(0, v2s16)); // custom lower as G_REV32 + G_LSHR
+ .clampScalar(0, s32, s64);
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
.legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8})
@@ -114,8 +122,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
},
0)
.legalFor({v2s64})
- .clampScalar(0, s32, s64)
.widenScalarToNextPow2(0)
+ .clampScalar(0, s32, s64)
.clampNumElements(0, v2s32, v4s32)
.clampNumElements(0, v2s64, v2s64)
.moreElementsToNextPow2(0);
@@ -161,11 +169,22 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.scalarize(0);
getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
- .lowerFor({s1, s8, s16, s32, s64});
+ .lowerFor({s1, s8, s16, s32, s64, v2s64, v4s32, v2s32})
+ .widenScalarOrEltToNextPow2(0)
+ .clampScalarOrElt(0, s32, s64)
+ .clampNumElements(0, v2s32, v4s32)
+ .clampNumElements(0, v2s64, v2s64)
+ .moreElementsToNextPow2(0);
+
- getActionDefinitionsBuilder({G_SMULO, G_UMULO}).lowerFor({{s64, s1}});
+ getActionDefinitionsBuilder({G_SMULO, G_UMULO})
+ .widenScalarToNextPow2(0, /*Min = */ 32)
+ .clampScalar(0, s32, s64)
+ .lowerIf(typeIs(1, s1));
- getActionDefinitionsBuilder({G_SMULH, G_UMULH}).legalFor({s32, s64});
+ getActionDefinitionsBuilder({G_SMULH, G_UMULH})
+ .legalFor({s64, v8s16, v16s8, v4s32})
+ .lower();
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
.legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
@@ -184,7 +203,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.widenScalarToNextPow2(0);
getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG})
- .legalFor({s32, s64, v2s64, v4s32, v2s32})
+ .legalFor({MinFPScalar, s32, s64, v2s64, v4s32, v2s32})
+ .clampScalar(0, MinFPScalar, s64)
.clampNumElements(0, v2s32, v4s32)
.clampNumElements(0, v2s64, v2s64);
@@ -220,42 +240,25 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.libcallFor({s32, s64, v2s32, v4s32, v2s64});
getActionDefinitionsBuilder(G_INSERT)
- .unsupportedIf([=](const LegalityQuery &Query) {
- return Query.Types[0].getSizeInBits() <= Query.Types[1].getSizeInBits();
- })
- .legalIf([=](const LegalityQuery &Query) {
- const LLT &Ty0 = Query.Types[0];
- const LLT &Ty1 = Query.Types[1];
- if (Ty0 != s32 && Ty0 != s64 && Ty0 != p0)
- return false;
- return isPowerOf2_32(Ty1.getSizeInBits()) &&
- (Ty1.getSizeInBits() == 1 || Ty1.getSizeInBits() >= 8);
- })
- .clampScalar(0, s32, s64)
+ .legalIf(all(typeInSet(0, {s32, s64, p0}),
+ typeInSet(1, {s1, s8, s16, s32}), smallerThan(1, 0)))
.widenScalarToNextPow2(0)
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(1)
+ .minScalar(1, s8)
.maxScalarIf(typeInSet(0, {s32}), 1, s16)
- .maxScalarIf(typeInSet(0, {s64}), 1, s32)
- .widenScalarToNextPow2(1);
+ .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);
getActionDefinitionsBuilder(G_EXTRACT)
- .unsupportedIf([=](const LegalityQuery &Query) {
- return Query.Types[0].getSizeInBits() >= Query.Types[1].getSizeInBits();
- })
- .legalIf([=](const LegalityQuery &Query) {
- const LLT &Ty0 = Query.Types[0];
- const LLT &Ty1 = Query.Types[1];
- if (Ty1 != s32 && Ty1 != s64 && Ty1 != s128)
- return false;
- if (Ty1 == p0)
- return true;
- return isPowerOf2_32(Ty0.getSizeInBits()) &&
- (Ty0.getSizeInBits() == 1 || Ty0.getSizeInBits() >= 8);
- })
- .clampScalar(1, s32, s128)
+ .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
+ typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
.widenScalarToNextPow2(1)
+ .clampScalar(1, s32, s128)
+ .widenScalarToNextPow2(0)
+ .minScalar(0, s16)
.maxScalarIf(typeInSet(1, {s32}), 0, s16)
- .maxScalarIf(typeInSet(1, {s64}), 0, s32)
- .widenScalarToNextPow2(0);
+ .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
+ .maxScalarIf(typeInSet(1, {s128}), 0, s64);
getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
@@ -268,8 +271,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
{s64, p0, s64, 8},
{p0, p0, s64, 8},
{v2s32, p0, s64, 8}})
- .clampScalar(0, s32, s64)
.widenScalarToNextPow2(0)
+ .clampScalar(0, s32, s64)
// TODO: We could support sum-of-pow2's but the lowering code doesn't know
// how to do that yet.
.unsupportedIfMemSizeNotPow2()
@@ -285,6 +288,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
};
getActionDefinitionsBuilder(G_LOAD)
+ .customIf([=](const LegalityQuery &Query) {
+ return Query.Types[0] == s128 &&
+ Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
+ })
.legalForTypesWithMemDesc({{s8, p0, s8, 8},
{s16, p0, s16, 8},
{s32, p0, s32, 8},
@@ -300,9 +307,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
{v2s64, p0, s128, 8}})
// These extends are also legal
.legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}})
- .clampScalar(0, s8, s64)
+ .widenScalarToNextPow2(0, /* MinSize = */8)
.lowerIfMemSizeNotPow2()
- .widenScalarToNextPow2(0)
+ .clampScalar(0, s8, s64)
.narrowScalarIf([=](const LegalityQuery &Query) {
// Clamp extending load results to 32-bits.
return Query.Types[0].isScalar() &&
@@ -318,10 +325,15 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampMaxNumElements(0, s16, 8)
.clampMaxNumElements(0, s32, 4)
.clampMaxNumElements(0, s64, 2)
+ .clampMaxNumElements(0, p0, 2)
.customIf(IsPtrVecPred)
.scalarizeIf(typeIs(0, v2s16), 0);
getActionDefinitionsBuilder(G_STORE)
+ .customIf([=](const LegalityQuery &Query) {
+ return Query.Types[0] == s128 &&
+ Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
+ })
.legalForTypesWithMemDesc({{s8, p0, s8, 8},
{s16, p0, s8, 8}, // truncstorei8 from s16
{s32, p0, s8, 8}, // truncstorei8 from s32
@@ -353,6 +365,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampMaxNumElements(0, s16, 8)
.clampMaxNumElements(0, s32, 4)
.clampMaxNumElements(0, s64, 2)
+ .clampMaxNumElements(0, p0, 2)
.lowerIfMemSizeNotPow2()
.customIf(IsPtrVecPred)
.scalarizeIf(typeIs(0, v2s16), 0);
@@ -360,8 +373,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
// Constants
getActionDefinitionsBuilder(G_CONSTANT)
.legalFor({p0, s8, s16, s32, s64})
- .clampScalar(0, s8, s64)
- .widenScalarToNextPow2(0);
+ .widenScalarToNextPow2(0)
+ .clampScalar(0, s8, s64);
getActionDefinitionsBuilder(G_FCONSTANT)
.legalIf([=](const LegalityQuery &Query) {
const auto &Ty = Query.Types[0];
@@ -383,6 +396,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
{v8s16, v8s16},
{v8s8, v8s8},
{v16s8, v16s8}})
+ .widenScalarOrEltToNextPow2(1)
.clampScalar(1, s32, s64)
.clampScalar(0, s32, s32)
.minScalarEltSameAsIf(
@@ -399,7 +413,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.minScalarOrEltIf(
[=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
s64)
- .widenScalarOrEltToNextPow2(1)
.clampNumElements(0, v2s32, v4s32);
// Extensions
@@ -459,10 +472,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
// Conversions
getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
.legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
- .clampScalar(0, s32, s64)
.widenScalarToNextPow2(0)
- .clampScalar(1, s32, s64)
- .widenScalarToNextPow2(1);
+ .clampScalar(0, s32, s64)
+ .widenScalarToNextPow2(1)
+ .clampScalar(1, s32, s64);
getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
.legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
@@ -477,8 +490,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder(G_SELECT)
.legalFor({{s32, s1}, {s64, s1}, {p0, s1}})
- .clampScalar(0, s32, s64)
.widenScalarToNextPow2(0)
+ .clampScalar(0, s32, s64)
.minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
.lowerIf(isVector(0));
@@ -492,6 +505,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder(G_PTRTOINT)
.legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0})
+ .legalFor({{v2s64, v2p0}})
.maxScalar(0, s64)
.widenScalarToNextPow2(0, /*Min*/ 8);
@@ -544,76 +558,30 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
-
- auto notValidElt = [](const LegalityQuery &Query, unsigned TypeIdx) {
- const LLT &Ty = Query.Types[TypeIdx];
- if (Ty.isVector()) {
- const LLT &EltTy = Ty.getElementType();
- if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
- return true;
- if (!isPowerOf2_32(EltTy.getSizeInBits()))
- return true;
- }
- return false;
- };
-
- // FIXME: This rule is horrible, but specifies the same as what we had
- // before with the particularly strange definitions removed (e.g.
- // s8 = G_MERGE_VALUES s32, s32).
- // Part of the complexity comes from these ops being extremely flexible. For
- // example, you can build/decompose vectors with it, concatenate vectors,
- // etc. and in addition to this you can also bitcast with it at the same
- // time. We've been considering breaking it up into multiple ops to make it
- // more manageable throughout the backend.
getActionDefinitionsBuilder(Op)
- // Break up vectors with weird elements into scalars
- .fewerElementsIf(
- [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
- scalarize(0))
- .fewerElementsIf(
- [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
- scalarize(1))
- // Clamp the big scalar to s8-s128 and make it a power of 2.
- .clampScalar(BigTyIdx, s8, s128)
- .widenScalarIf(
- [=](const LegalityQuery &Query) {
- const LLT &Ty = Query.Types[BigTyIdx];
- return !isPowerOf2_32(Ty.getSizeInBits()) &&
- Ty.getSizeInBits() % 64 != 0;
- },
- [=](const LegalityQuery &Query) {
- // Pick the next power of 2, or a multiple of 64 over 128.
- // Whichever is smaller.
- const LLT &Ty = Query.Types[BigTyIdx];
- unsigned NewSizeInBits = 1
- << Log2_32_Ceil(Ty.getSizeInBits() + 1);
- if (NewSizeInBits >= 256) {
- unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
- if (RoundedTo < NewSizeInBits)
- NewSizeInBits = RoundedTo;
- }
- return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
- })
- // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
- // worth considering the multiples of 64 since 2*192 and 2*384 are not
- // valid.
- .clampScalar(LitTyIdx, s8, s256)
- .widenScalarToNextPow2(LitTyIdx, /*Min*/ 8)
- // So at this point, we have s8, s16, s32, s64, s128, s192, s256, s384,
- // s512, <X x s8>, <X x s16>, <X x s32>, or <X x s64>.
- // At this point it's simple enough to accept the legal types.
- .legalIf([=](const LegalityQuery &Query) {
- const LLT &BigTy = Query.Types[BigTyIdx];
- const LLT &LitTy = Query.Types[LitTyIdx];
- if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
+ .widenScalarToNextPow2(LitTyIdx, 8)
+ .widenScalarToNextPow2(BigTyIdx, 32)
+ .clampScalar(LitTyIdx, s8, s64)
+ .clampScalar(BigTyIdx, s32, s128)
+ .legalIf([=](const LegalityQuery &Q) {
+ switch (Q.Types[BigTyIdx].getSizeInBits()) {
+ case 32:
+ case 64:
+ case 128:
+ break;
+ default:
return false;
- if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
+ }
+ switch (Q.Types[LitTyIdx].getSizeInBits()) {
+ case 8:
+ case 16:
+ case 32:
+ case 64:
+ return true;
+ default:
return false;
- return BigTy.getSizeInBits() % LitTy.getSizeInBits() == 0;
- })
- // Any vectors left are the wrong size. Scalarize them.
- .scalarize(0)
- .scalarize(1);
+ }
+ });
}
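The rewritten G_MERGE_VALUES/G_UNMERGE_VALUES rule reduces the old ad-hoc predicate to a size check: after the preceding widen/clamp steps, the big type must be 32, 64, or 128 bits and the small type 8, 16, 32, or 64 bits. A standalone sketch of that final legality predicate, assuming clamping has already been applied; isLegalMergeUnmerge is an illustrative name:

    #include <cassert>

    // Mirrors the legalIf() predicate: accept only the size combinations the
    // earlier widenScalarToNextPow2/clampScalar steps can produce.
    static bool isLegalMergeUnmerge(unsigned BigBits, unsigned LitBits) {
      switch (BigBits) {
      case 32:
      case 64:
      case 128:
        break;
      default:
        return false;
      }
      switch (LitBits) {
      case 8:
      case 16:
      case 32:
      case 64:
        return true;
      default:
        return false;
      }
    }

    int main() {
      assert(isLegalMergeUnmerge(64, 32));   // s64 = G_MERGE_VALUES s32, s32
      assert(isLegalMergeUnmerge(128, 64));  // s128 = G_MERGE_VALUES s64, s64
      assert(!isLegalMergeUnmerge(256, 64)); // too wide, must be broken up first
      return 0;
    }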
getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
@@ -626,7 +594,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
const LLT &VecTy = Query.Types[1];
return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
- VecTy == v16s8 || VecTy == v2s32 || VecTy == v2p0;
+ VecTy == v8s8 || VecTy == v16s8 || VecTy == v2s32 ||
+ VecTy == v2p0;
})
.minScalarOrEltIf(
[=](const LegalityQuery &Query) {
@@ -671,6 +640,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
{v2s64, s64}})
.clampNumElements(0, v4s32, v4s32)
.clampNumElements(0, v2s64, v2s64)
+ .minScalarOrElt(0, s8)
.minScalarSameAs(1, 0);
getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();
@@ -682,7 +652,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();
// TODO: Custom lowering for v2s32, v4s32, v2s64.
- getActionDefinitionsBuilder(G_BITREVERSE).legalFor({s32, s64, v8s8, v16s8});
+ getActionDefinitionsBuilder(G_BITREVERSE)
+ .legalFor({s32, s64, v8s8, v16s8})
+ .widenScalarToNextPow2(0, /*Min = */ 32)
+ .clampScalar(0, s32, s64);
getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();
@@ -716,7 +689,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampNumElements(0, v2s64, v2s64);
getActionDefinitionsBuilder(G_CONCAT_VECTORS)
- .legalFor({{v4s32, v2s32}, {v8s16, v4s16}});
+ .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}});
getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({{p0}, {s64}});
@@ -748,6 +721,28 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampMaxNumElements(1, s32, 4)
.lower();
+ getActionDefinitionsBuilder(
+ {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
+ // Try to break down into smaller vectors as long as they're at least 64
+ // bits. This lets us use vector operations for some parts of the
+ // reduction.
+ .fewerElementsIf(
+ [=](const LegalityQuery &Q) {
+ LLT SrcTy = Q.Types[1];
+ if (SrcTy.isScalar())
+ return false;
+ if (!isPowerOf2_32(SrcTy.getNumElements()))
+ return false;
+ // We can usually perform 64b vector operations.
+ return SrcTy.getSizeInBits() > 64;
+ },
+ [=](const LegalityQuery &Q) {
+ LLT SrcTy = Q.Types[1];
+ return std::make_pair(1, SrcTy.divide(2));
+ })
+ .scalarize(1)
+ .lower();
+
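The fewerElementsIf rule above keeps halving the reduction source while it is wider than 64 bits and has a power-of-two element count, so e.g. a v16s8 OR-reduction becomes a v8s8 one fed by a vector OR. A small sketch of the type sequence this mutation produces; printReductionSplits is a hypothetical helper, not LLVM API:

    #include <cstdio>

    // Repeatedly apply the "halve the element count while wider than 64 bits"
    // mutation from the patch and print each intermediate vector type.
    static void printReductionSplits(unsigned NumElts, unsigned EltBits) {
      // A power-of-two element count is a precondition of the rule.
      if (NumElts == 0 || (NumElts & (NumElts - 1)) != 0)
        return;
      while (NumElts * EltBits > 64 && NumElts > 1) {
        NumElts /= 2;
        std::printf("split to v%us%u\n", NumElts, EltBits);
      }
    }

    int main() {
      printReductionSplits(16, 8); // v16s8 -> v8s8 (64 bits, stop)
      printReductionSplits(4, 32); // v4s32 -> v2s32
      return 0;
    }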
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
.lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); });
@@ -764,7 +759,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder({G_SBFX, G_UBFX})
.customFor({{s32, s32}, {s64, s64}});
- // TODO: Custom legalization for s128
// TODO: Use generic lowering when custom lowering is not possible.
auto always = [=](const LegalityQuery &Q) { return true; };
getActionDefinitionsBuilder(G_CTPOP)
@@ -775,12 +769,27 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.maxScalarEltSameAsIf(always, 1, 0)
.customFor({{s32, s32},
{s64, s64},
+ {s128, s128},
{v2s64, v2s64},
{v2s32, v2s32},
{v4s32, v4s32},
{v4s16, v4s16},
{v8s16, v8s16}});
+ // TODO: Vector types.
+ getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}).lowerIf(isScalar(0));
+
+ // TODO: Vector types.
+ getActionDefinitionsBuilder({G_FMAXNUM, G_FMINNUM})
+ .legalFor({MinFPScalar, s32, s64})
+ .libcallFor({s128})
+ .minScalar(0, MinFPScalar);
+
+ // TODO: Libcall support for s128.
+ // TODO: s16 should be legal with full FP16 support.
+ getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
+ .legalFor({{s64, s32}, {s64, s64}});
+
getLegacyLegalizerInfo().computeTables();
verify(*ST.getInstrInfo());
}
@@ -799,8 +808,6 @@ bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
case TargetOpcode::G_LOAD:
case TargetOpcode::G_STORE:
return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
- case TargetOpcode::G_BSWAP:
- return legalizeBSwap(MI, MRI, MIRBuilder);
case TargetOpcode::G_SHL:
case TargetOpcode::G_ASHR:
case TargetOpcode::G_LSHR:
@@ -948,6 +955,37 @@ bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const {
+ switch (MI.getIntrinsicID()) {
+ case Intrinsic::vacopy: {
+ unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
+ unsigned VaListSize =
+ (ST->isTargetDarwin() || ST->isTargetWindows())
+ ? PtrSize
+ : ST->isTargetILP32() ? 20 : 32;
+
+ MachineFunction &MF = *MI.getMF();
+ auto Val = MF.getRegInfo().createGenericVirtualRegister(
+ LLT::scalar(VaListSize * 8));
+ MachineIRBuilder MIB(MI);
+ MIB.buildLoad(Val, MI.getOperand(2),
+ *MF.getMachineMemOperand(MachinePointerInfo(),
+ MachineMemOperand::MOLoad,
+ VaListSize, Align(PtrSize)));
+ MIB.buildStore(Val, MI.getOperand(1),
+ *MF.getMachineMemOperand(MachinePointerInfo(),
+ MachineMemOperand::MOStore,
+ VaListSize, Align(PtrSize)));
+ MI.eraseFromParent();
+ return true;
+ }
+ case Intrinsic::get_dynamic_area_offset: {
+ MachineIRBuilder &MIB = Helper.MIRBuilder;
+ MIB.buildConstant(MI.getOperand(0).getReg(), 0);
+ MI.eraseFromParent();
+ return true;
+ }
+ }
+
return true;
}
@@ -960,7 +998,7 @@ bool AArch64LegalizerInfo::legalizeShlAshrLshr(
// If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
// imported patterns can select it later. Either way, it will be legal.
Register AmtReg = MI.getOperand(2).getReg();
- auto VRegAndVal = getConstantVRegValWithLookThrough(AmtReg, MRI);
+ auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
if (!VRegAndVal)
return true;
// Check the shift amount is in range for an immediate form.
@@ -974,6 +1012,20 @@ bool AArch64LegalizerInfo::legalizeShlAshrLshr(
return true;
}
+static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
+ MachineRegisterInfo &MRI) {
+ Base = Root;
+ Offset = 0;
+
+ Register NewBase;
+ int64_t NewOffset;
+ if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
+ isShiftedInt<7, 3>(NewOffset)) {
+ Base = NewBase;
+ Offset = NewOffset;
+ }
+}
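isShiftedInt<7, 3>(NewOffset) accepts byte offsets that are a multiple of 8 and whose scaled value fits a signed 7-bit field, i.e. offsets in [-512, 504]; the selected LDP/STP later encodes NewOffset / 8 as its immediate. A self-contained sketch of that range check under those assumptions (not the LLVM implementation):

    #include <cassert>
    #include <cstdint>

    // Roughly what isShiftedInt<7, 3> checks: the offset is 8-byte aligned and
    // the scaled immediate Offset/8 fits in a signed 7-bit field, i.e. [-64, 63].
    static bool isValidLdpStpOffset(int64_t Offset) {
      return (Offset % 8) == 0 && (Offset / 8) >= -64 && (Offset / 8) <= 63;
    }

    int main() {
      assert(isValidLdpStpOffset(0));
      assert(isValidLdpStpOffset(504));  // encoded as imm 63
      assert(isValidLdpStpOffset(-512)); // encoded as imm -64
      assert(!isValidLdpStpOffset(512)); // out of range
      assert(!isValidLdpStpOffset(4));   // not 8-byte aligned
      return 0;
    }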
+
// FIXME: This should be removed and replaced with the generic bitcast legalize
// action.
bool AArch64LegalizerInfo::legalizeLoadStore(
@@ -993,6 +1045,36 @@ bool AArch64LegalizerInfo::legalizeLoadStore(
Register ValReg = MI.getOperand(0).getReg();
const LLT ValTy = MRI.getType(ValReg);
+ if (ValTy == LLT::scalar(128)) {
+ assert((*MI.memoperands_begin())->getSuccessOrdering() ==
+ AtomicOrdering::Monotonic ||
+ (*MI.memoperands_begin())->getSuccessOrdering() ==
+ AtomicOrdering::Unordered);
+ assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
+ LLT s64 = LLT::scalar(64);
+ MachineInstrBuilder NewI;
+ if (MI.getOpcode() == TargetOpcode::G_LOAD) {
+ NewI = MIRBuilder.buildInstr(AArch64::LDPXi, {s64, s64}, {});
+ MIRBuilder.buildMerge(ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
+ } else {
+ auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
+ NewI = MIRBuilder.buildInstr(
+ AArch64::STPXi, {}, {Split->getOperand(0), Split->getOperand(1)});
+ }
+ Register Base;
+ int Offset;
+ matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
+ NewI.addUse(Base);
+ NewI.addImm(Offset / 8);
+
+ NewI.cloneMemRefs(MI);
+ constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
+ *MRI.getTargetRegisterInfo(),
+ *ST->getRegBankInfo());
+ MI.eraseFromParent();
+ return true;
+ }
+
if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
ValTy.getElementType().getAddressSpace() != 0) {
LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
@@ -1015,46 +1097,6 @@ bool AArch64LegalizerInfo::legalizeLoadStore(
return true;
}
-bool AArch64LegalizerInfo::legalizeBSwap(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const {
- assert(MI.getOpcode() == TargetOpcode::G_BSWAP);
-
- // The <2 x half> case needs special lowering because there isn't an
- // instruction that does that directly. Instead, we widen to <8 x i8>
- // and emit a G_REV32 followed by a G_LSHR knowing that instruction selection
- // will later match them as:
- //
- // rev32.8b v0, v0
- // ushr.2s v0, v0, #16
- //
- // We could emit those here directly, but it seems better to keep things as
- // generic as possible through legalization, and avoid committing layering
- // violations by legalizing & selecting here at the same time.
-
- Register ValReg = MI.getOperand(1).getReg();
- assert(LLT::fixed_vector(2, 16) == MRI.getType(ValReg));
- const LLT v2s32 = LLT::fixed_vector(2, 32);
- const LLT v8s8 = LLT::fixed_vector(8, 8);
- const LLT s32 = LLT::scalar(32);
-
- auto Undef = MIRBuilder.buildUndef(v8s8);
- auto Insert =
- MIRBuilder
- .buildInstr(TargetOpcode::INSERT_SUBREG, {v8s8}, {Undef, ValReg})
- .addImm(AArch64::ssub);
- auto Rev32 = MIRBuilder.buildInstr(AArch64::G_REV32, {v8s8}, {Insert});
- auto Bitcast = MIRBuilder.buildBitcast(v2s32, Rev32);
- auto Amt = MIRBuilder.buildConstant(v2s32, 16);
- auto UShr =
- MIRBuilder.buildInstr(TargetOpcode::G_LSHR, {v2s32}, {Bitcast, Amt});
- auto Zero = MIRBuilder.buildConstant(s32, 0);
- auto Extract = MIRBuilder.buildExtractVectorElement(s32, UShr, Zero);
- MIRBuilder.buildBitcast({MI.getOperand(0).getReg()}, Extract);
- MI.eraseFromParent();
- return true;
-}
-
bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder) const {
@@ -1107,8 +1149,8 @@ bool AArch64LegalizerInfo::legalizeBitfieldExtract(
MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
// Only legal if we can select immediate forms.
// TODO: Lower this otherwise.
- return getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
- getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
+ return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
+ getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
}
bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
@@ -1151,8 +1193,7 @@ bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
// v8s16,v4s32,v2s64 -> v16i8
LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
if (Ty.isScalar()) {
- // TODO: Handle s128.
- assert((Size == 32 || Size == 64) && "Expected only 32 or 64 bit scalars!");
+  assert((Size == 32 || Size == 64 || Size == 128) &&
+         "Expected only 32, 64, or 128 bit scalars!");
if (Size == 32) {
Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
}
@@ -1198,7 +1239,7 @@ bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
}
// Post-conditioning.
- if (Ty.isScalar() && Size == 64)
+ if (Ty.isScalar() && (Size == 64 || Size == 128))
MIRBuilder.buildZExt(Dst, UADD);
else
UADD->getOperand(0).setReg(Dst);
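With the s128 case allowed, the scalar CTPOP path also zero-extends the small UADDLV result back up to the 128-bit destination. Conceptually, a 128-bit population count is just the sum of the counts of its two 64-bit halves and always fits in 8 bits; a quick sketch confirms this using C++20 std::popcount (unrelated to the LLVM helpers above):

    #include <bit>
    #include <cassert>
    #include <cstdint>

    // Population count of a 128-bit value represented as two 64-bit halves; the
    // result always fits in 8 bits, which is why the legalizer can zero-extend
    // the narrow count back to the full 128-bit destination.
    static unsigned popcount128(uint64_t Hi, uint64_t Lo) {
      return std::popcount(Hi) + std::popcount(Lo);
    }

    int main() {
      assert(popcount128(0, 0) == 0);
      assert(popcount128(~0ULL, ~0ULL) == 128);
      assert(popcount128(0x1ULL, 0xF0ULL) == 5);
      return 0;
    }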
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index 78fc24559d71..35456d95dc2b 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -35,8 +35,6 @@ public:
MachineInstr &MI) const override;
private:
- bool legalizeBSwap(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const;
bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder) const;
bool legalizeLoadStore(MachineInstr &MI, MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index b700c3760a58..a9b3792e0118 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -55,7 +55,7 @@ bool matchExtractVecEltPairwiseAdd(
Register Src2 = MI.getOperand(2).getReg();
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
- auto Cst = getConstantVRegValWithLookThrough(Src2, MRI);
+ auto Cst = getIConstantVRegValWithLookThrough(Src2, MRI);
if (!Cst || Cst->Value != 0)
return false;
// SDAG also checks for FullFP16, but this looks to be beneficial anyway.
@@ -129,7 +129,7 @@ bool matchAArch64MulConstCombine(
const LLT Ty = MRI.getType(LHS);
// The below optimizations require a constant RHS.
- auto Const = getConstantVRegValWithLookThrough(RHS, MRI);
+ auto Const = getIConstantVRegValWithLookThrough(RHS, MRI);
if (!Const)
return false;
@@ -262,6 +262,33 @@ void applyFoldMergeToZext(MachineInstr &MI, MachineRegisterInfo &MRI,
Observer.changedInstr(MI);
}
+/// \returns True if a G_ANYEXT instruction \p MI should be mutated to a G_ZEXT
+/// instruction.
+static bool matchMutateAnyExtToZExt(MachineInstr &MI, MachineRegisterInfo &MRI) {
+ // If this is coming from a scalar compare then we can use a G_ZEXT instead of
+ // a G_ANYEXT:
+ //
+ // %cmp:_(s32) = G_[I|F]CMP ... <-- produces 0/1.
+ // %ext:_(s64) = G_ANYEXT %cmp(s32)
+ //
+ // By doing this, we can leverage more KnownBits combines.
+ assert(MI.getOpcode() == TargetOpcode::G_ANYEXT);
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ return MRI.getType(Dst).isScalar() &&
+ mi_match(Src, MRI,
+ m_any_of(m_GICmp(m_Pred(), m_Reg(), m_Reg()),
+ m_GFCmp(m_Pred(), m_Reg(), m_Reg())));
+}
+
+static void applyMutateAnyExtToZExt(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B,
+ GISelChangeObserver &Observer) {
+ Observer.changingInstr(MI);
+ MI.setDesc(B.getTII().get(TargetOpcode::G_ZEXT));
+ Observer.changedInstr(MI);
+}
+
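The combine above relies on integer and FP compares materializing only 0 or 1, so any-extending and zero-extending the compare result are interchangeable, and the zero-extend exposes known-zero high bits to later combines. A trivial standalone check of that property:

    #include <cassert>
    #include <cstdint>

    int main() {
      // A scalar compare produces exactly 0 or 1, so widening it with a
      // zero-extend never changes its value: the high bits are already zero.
      for (int a = -2; a <= 2; ++a)
        for (int b = -2; b <= 2; ++b) {
          uint32_t Cmp = (a < b) ? 1u : 0u; // what G_ICMP/G_FCMP produce (0/1)
          uint64_t ZExt = static_cast<uint64_t>(Cmp);
          assert((ZExt >> 1) == 0 && "high bits of a compare result are zero");
          assert(ZExt == Cmp);
        }
      return 0;
    }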
#define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 84ecb4ba6964..3ff67d188822 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -527,7 +527,7 @@ tryAdjustICmpImmAndPred(Register RHS, CmpInst::Predicate P,
// If the RHS is not a constant, or the RHS is already a valid arithmetic
// immediate, then there is nothing to change.
- auto ValAndVReg = getConstantVRegValWithLookThrough(RHS, MRI);
+ auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS, MRI);
if (!ValAndVReg)
return None;
uint64_t C = ValAndVReg->Value.getZExtValue();
@@ -757,7 +757,7 @@ static unsigned getCmpOperandFoldingProfit(Register CmpOp,
if (MI.getOpcode() != TargetOpcode::G_AND)
return false;
auto ValAndVReg =
- getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+ getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
if (!ValAndVReg)
return false;
uint64_t Mask = ValAndVReg->Value.getZExtValue();
@@ -774,7 +774,7 @@ static unsigned getCmpOperandFoldingProfit(Register CmpOp,
return 0;
auto MaybeShiftAmt =
- getConstantVRegValWithLookThrough(Def->getOperand(2).getReg(), MRI);
+ getIConstantVRegValWithLookThrough(Def->getOperand(2).getReg(), MRI);
if (!MaybeShiftAmt)
return 0;
uint64_t ShiftAmt = MaybeShiftAmt->Value.getZExtValue();
@@ -814,7 +814,7 @@ static bool trySwapICmpOperands(MachineInstr &MI,
// Don't swap if there's a constant on the RHS, because we know we can fold
// that.
Register RHS = MI.getOperand(3).getReg();
- auto RHSCst = getConstantVRegValWithLookThrough(RHS, MRI);
+ auto RHSCst = getIConstantVRegValWithLookThrough(RHS, MRI);
if (RHSCst && isLegalArithImmed(RHSCst->Value.getSExtValue()))
return false;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index 9efbcbb0065b..d3f4130d2ba1 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -146,8 +146,8 @@ static bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
for (auto &UseInstr : MRI.use_nodbg_instructions(Dst)) {
if (UseInstr.getOpcode() != TargetOpcode::G_PTR_ADD)
return false;
- auto Cst =
- getConstantVRegValWithLookThrough(UseInstr.getOperand(2).getReg(), MRI);
+ auto Cst = getIConstantVRegValWithLookThrough(
+ UseInstr.getOperand(2).getReg(), MRI);
if (!Cst)
return false;
MinOffset = std::min(MinOffset, Cst->Value.getZExtValue());
@@ -220,6 +220,121 @@ static bool applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
return true;
}
+static bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
+ CombinerHelper &Helper,
+ GISelChangeObserver &Observer) {
+ // Try to simplify G_UADDO with 8 or 16 bit operands to wide G_ADD and TBNZ
+ // if the result is only used in the no-overflow case. It is restricted to
+ // cases where we know that the high bits of the operands are 0. If there's
+ // an overflow, then the 9th or 17th bit must be set, which can be checked
+ // using TBNZ.
+ //
+ // Change (for UADDOs on 8 and 16 bits):
+ //
+ // %z0 = G_ASSERT_ZEXT _
+ // %op0 = G_TRUNC %z0
+ // %z1 = G_ASSERT_ZEXT _
+ // %op1 = G_TRUNC %z1
+ // %val, %cond = G_UADDO %op0, %op1
+ // G_BRCOND %cond, %error.bb
+ //
+ // error.bb:
+ // (no successors and no uses of %val)
+ //
+ // To:
+ //
+ // %z0 = G_ASSERT_ZEXT _
+ // %z1 = G_ASSERT_ZEXT _
+ // %add = G_ADD %z0, %z1
+ // %val = G_TRUNC %add
+ // %bit = G_AND %add, 1 << scalar-size-in-bits(%op1)
+ // %cond = G_ICMP NE, %bit, 0
+ // G_BRCOND %cond, %error.bb
+
+ auto &MRI = *B.getMRI();
+
+ MachineOperand *DefOp0 = MRI.getOneDef(MI.getOperand(2).getReg());
+ MachineOperand *DefOp1 = MRI.getOneDef(MI.getOperand(3).getReg());
+ Register Op0Wide;
+ Register Op1Wide;
+ if (!mi_match(DefOp0->getParent(), MRI, m_GTrunc(m_Reg(Op0Wide))) ||
+ !mi_match(DefOp1->getParent(), MRI, m_GTrunc(m_Reg(Op1Wide))))
+ return false;
+ LLT WideTy0 = MRI.getType(Op0Wide);
+ LLT WideTy1 = MRI.getType(Op1Wide);
+ Register ResVal = MI.getOperand(0).getReg();
+ LLT OpTy = MRI.getType(ResVal);
+ MachineInstr *Op0WideDef = MRI.getVRegDef(Op0Wide);
+ MachineInstr *Op1WideDef = MRI.getVRegDef(Op1Wide);
+
+ unsigned OpTySize = OpTy.getScalarSizeInBits();
+ // First check that the G_TRUNCs feeding the G_UADDO are no-ops, because the
+ // inputs have been zero-extended.
+ if (Op0WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
+ Op1WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
+ OpTySize != Op0WideDef->getOperand(2).getImm() ||
+ OpTySize != Op1WideDef->getOperand(2).getImm())
+ return false;
+
+ // Only scalar UADDO with either 8 or 16 bit operands is handled.
+ if (!WideTy0.isScalar() || !WideTy1.isScalar() || WideTy0 != WideTy1 ||
+ OpTySize >= WideTy0.getScalarSizeInBits() ||
+ (OpTySize != 8 && OpTySize != 16))
+ return false;
+
+ // The overflow-status result must be used by a branch only.
+ Register ResStatus = MI.getOperand(1).getReg();
+ if (!MRI.hasOneNonDBGUse(ResStatus))
+ return false;
+ MachineInstr *CondUser = &*MRI.use_instr_nodbg_begin(ResStatus);
+ if (CondUser->getOpcode() != TargetOpcode::G_BRCOND)
+ return false;
+
+ // Make sure the computed result is only used in the no-overflow blocks.
+ MachineBasicBlock *CurrentMBB = MI.getParent();
+ MachineBasicBlock *FailMBB = CondUser->getOperand(1).getMBB();
+ if (!FailMBB->succ_empty() || CondUser->getParent() != CurrentMBB)
+ return false;
+ if (any_of(MRI.use_nodbg_instructions(ResVal),
+ [&MI, FailMBB, CurrentMBB](MachineInstr &I) {
+ return &MI != &I &&
+ (I.getParent() == FailMBB || I.getParent() == CurrentMBB);
+ }))
+ return false;
+
+ // Remove G_ADDO.
+ B.setInstrAndDebugLoc(*MI.getNextNode());
+ MI.eraseFromParent();
+
+ // Emit wide add.
+ Register AddDst = MRI.cloneVirtualRegister(Op0Wide);
+ B.buildInstr(TargetOpcode::G_ADD, {AddDst}, {Op0Wide, Op1Wide});
+
+ // Emit check of the 9th or 17th bit and update users (the branch). This will
+ // later be folded to TBNZ.
+ Register CondBit = MRI.cloneVirtualRegister(Op0Wide);
+ B.buildAnd(
+ CondBit, AddDst,
+ B.buildConstant(LLT::scalar(32), OpTySize == 8 ? 1 << 8 : 1 << 16));
+ B.buildICmp(CmpInst::ICMP_NE, ResStatus, CondBit,
+ B.buildConstant(LLT::scalar(32), 0));
+
+ // Update ZEXts users of the result value. Because all uses are in the
+ // no-overflow case, we know that the top bits are 0 and we can ignore ZExts.
+ B.buildZExtOrTrunc(ResVal, AddDst);
+ for (MachineOperand &U : make_early_inc_range(MRI.use_operands(ResVal))) {
+ Register WideReg;
+ if (mi_match(U.getParent(), MRI, m_GZExt(m_Reg(WideReg)))) {
+ auto OldR = U.getParent()->getOperand(0).getReg();
+ Observer.erasingInstr(*U.getParent());
+ U.getParent()->eraseFromParent();
+ Helper.replaceRegWith(MRI, OldR, AddDst);
+ }
+ }
+
+ return true;
+}
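The transformation hinges on a simple arithmetic fact: if two N-bit values are zero-extended into a wider register and added there, the narrow addition overflows exactly when bit N of the wide sum is set. A standalone sketch for the 8-bit case in ordinary C++, with no LLVM types; uaddo8 is an illustrative name:

    #include <cassert>
    #include <cstdint>

    // For zero-extended 8-bit inputs, the wide sum fits in 9 bits, so the
    // overflow flag of the narrow G_UADDO is exactly bit 8 of the wide G_ADD
    // result -- the bit the TBNZ will test.
    static bool uaddo8(uint8_t A, uint8_t B, uint8_t &Result) {
      uint32_t Wide = static_cast<uint32_t>(A) + static_cast<uint32_t>(B);
      Result = static_cast<uint8_t>(Wide); // truncate back, as the combine does
      return (Wide & (1u << 8)) != 0;
    }

    int main() {
      uint8_t R;
      for (unsigned A = 0; A < 256; ++A)
        for (unsigned B = 0; B < 256; ++B) {
          bool Overflow =
              uaddo8(static_cast<uint8_t>(A), static_cast<uint8_t>(B), R);
          assert(Overflow == (A + B > 0xFF));
          assert(R == static_cast<uint8_t>(A + B));
        }
      return 0;
    }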
+
class AArch64PreLegalizerCombinerHelperState {
protected:
CombinerHelper &Helper;
@@ -272,6 +387,8 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
return Helper.tryCombineConcatVectors(MI);
case TargetOpcode::G_SHUFFLE_VECTOR:
return Helper.tryCombineShuffleVector(MI);
+ case TargetOpcode::G_UADDO:
+ return tryToSimplifyUADDO(MI, B, Helper, Observer);
case TargetOpcode::G_MEMCPY_INLINE:
return Helper.tryEmitMemcpyInline(MI);
case TargetOpcode::G_MEMCPY:
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 8c34027f7bb3..40ddf6a94f73 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -13,8 +13,11 @@
#include "AArch64RegisterBankInfo.h"
#include "AArch64InstrInfo.h"
+#include "AArch64RegisterInfo.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
@@ -271,6 +274,7 @@ AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
case AArch64::WSeqPairsClassRegClassID:
case AArch64::XSeqPairsClassRegClassID:
case AArch64::MatrixIndexGPR32_12_15RegClassID:
+ case AArch64::GPR64_with_sub_32_in_MatrixIndexGPR32_12_15RegClassID:
return getRegBank(AArch64::GPRRegBankID);
case AArch64::CCRRegClassID:
return getRegBank(AArch64::CCRegBankID);
@@ -424,6 +428,8 @@ static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) {
case TargetOpcode::G_FRINT:
case TargetOpcode::G_INTRINSIC_TRUNC:
case TargetOpcode::G_INTRINSIC_ROUND:
+ case TargetOpcode::G_FMAXNUM:
+ case TargetOpcode::G_FMINNUM:
return true;
}
return false;
@@ -529,6 +535,8 @@ bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI,
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_FPTOUI:
case TargetOpcode::G_FCMP:
+ case TargetOpcode::G_LROUND:
+ case TargetOpcode::G_LLROUND:
return true;
default:
break;
@@ -747,24 +755,33 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// for the greedy mode the cost of the cross bank copy will
// offset this number.
// FIXME: Should be derived from the scheduling model.
- if (OpRegBankIdx[0] != PMI_FirstGPR)
+ if (OpRegBankIdx[0] != PMI_FirstGPR) {
Cost = 2;
- else
- // Check if that load feeds fp instructions.
- // In that case, we want the default mapping to be on FPR
- // instead of blind map every scalar to GPR.
- for (const MachineInstr &UseMI :
- MRI.use_nodbg_instructions(MI.getOperand(0).getReg())) {
- // If we have at least one direct use in a FP instruction,
- // assume this was a floating point load in the IR.
- // If it was not, we would have had a bitcast before
- // reaching that instruction.
- // Int->FP conversion operations are also captured in onlyDefinesFP().
- if (onlyUsesFP(UseMI, MRI, TRI) || onlyDefinesFP(UseMI, MRI, TRI)) {
- OpRegBankIdx[0] = PMI_FirstFPR;
- break;
- }
- }
+ break;
+ }
+
+ if (cast<GLoad>(MI).isAtomic()) {
+ // Atomics always use GPR destinations. Don't refine any further.
+ OpRegBankIdx[0] = PMI_FirstGPR;
+ break;
+ }
+
+ // Check if that load feeds fp instructions.
+ // In that case, we want the default mapping to be on FPR
+ // instead of blindly mapping every scalar to GPR.
+ if (any_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()),
+ [&](const MachineInstr &UseMI) {
+ // If we have at least one direct use in a FP instruction,
+ // assume this was a floating point load in the IR. If it was
+ // not, we would have had a bitcast before reaching that
+ // instruction.
+ //
+ // Int->FP conversion operations are also captured in
+ // onlyDefinesFP().
+ return onlyUsesFP(UseMI, MRI, TRI) ||
+ onlyDefinesFP(UseMI, MRI, TRI);
+ }))
+ OpRegBankIdx[0] = PMI_FirstFPR;
break;
case TargetOpcode::G_STORE:
// Check if that store is fed by fp instructions.
@@ -957,6 +974,12 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
break;
}
+ case TargetOpcode::G_LROUND:
+ case TargetOpcode::G_LLROUND: {
+ // Source is always floating point and destination is always integer.
+ OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR};
+ break;
+ }
}
// Finally construct the computed mapping.
diff --git a/llvm/lib/Target/AArch64/GISel/select-saddo.mir b/llvm/lib/Target/AArch64/GISel/select-saddo.mir
deleted file mode 100644
index 6f05bd7ac838..000000000000
--- a/llvm/lib/Target/AArch64/GISel/select-saddo.mir
+++ /dev/null
@@ -1,158 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -verify-machineinstrs -mtriple aarch64-unknown-uknown -global-isel -run-pass=instruction-select %s -o - | FileCheck %s
-
-...
----
-name: saddo_s32
-alignment: 4
-legalized: true
-regBankSelected: true
-tracksRegLiveness: true
-body: |
- bb.1.entry:
- liveins: $w0, $w1, $x2
-
- ; CHECK-LABEL: name: saddo_s32
- ; CHECK: liveins: $w0, $w1, $x2
- ; CHECK: %reg0:gpr32 = COPY $w0
- ; CHECK: %reg1:gpr32 = COPY $w1
- ; CHECK: %saddo:gpr32 = ADDSWrr %reg0, %reg1, implicit-def $nzcv
- ; CHECK: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
- ; CHECK: $w0 = COPY %saddo
- ; CHECK: RET_ReallyLR implicit $w0
- %reg0:gpr(s32) = COPY $w0
- %reg1:gpr(s32) = COPY $w1
- %saddo:gpr(s32), %4:gpr(s1) = G_SADDO %reg0, %reg1
- $w0 = COPY %saddo(s32)
- RET_ReallyLR implicit $w0
-
-...
----
-name: saddo_s64
-alignment: 4
-legalized: true
-regBankSelected: true
-tracksRegLiveness: true
-body: |
- bb.1.entry:
- liveins: $x0, $x1, $x2
-
- ; CHECK-LABEL: name: saddo_s64
- ; CHECK: liveins: $x0, $x1, $x2
- ; CHECK: %reg0:gpr64 = COPY $x0
- ; CHECK: %reg1:gpr64 = COPY $x1
- ; CHECK: %saddo:gpr64 = ADDSXrr %reg0, %reg1, implicit-def $nzcv
- ; CHECK: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
- ; CHECK: $x0 = COPY %saddo
- ; CHECK: RET_ReallyLR implicit $x0
- %reg0:gpr(s64) = COPY $x0
- %reg1:gpr(s64) = COPY $x1
- %saddo:gpr(s64), %4:gpr(s1) = G_SADDO %reg0, %reg1
- $x0 = COPY %saddo(s64)
- RET_ReallyLR implicit $x0
-
-...
----
-name: saddo_s32_imm
-alignment: 4
-legalized: true
-regBankSelected: true
-tracksRegLiveness: true
-body: |
- bb.1.entry:
- liveins: $w0, $w1, $x2
- ; Check that we get ADDSWri when we can fold in a constant.
- ;
- ; CHECK-LABEL: name: saddo_s32_imm
- ; CHECK: liveins: $w0, $w1, $x2
- ; CHECK: %copy:gpr32sp = COPY $w0
- ; CHECK: %saddo:gpr32 = ADDSWri %copy, 16, 0, implicit-def $nzcv
- ; CHECK: %overflow:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
- ; CHECK: $w0 = COPY %saddo
- ; CHECK: RET_ReallyLR implicit $w0
- %copy:gpr(s32) = COPY $w0
- %constant:gpr(s32) = G_CONSTANT i32 16
- %saddo:gpr(s32), %overflow:gpr(s1) = G_SADDO %copy, %constant
- $w0 = COPY %saddo(s32)
- RET_ReallyLR implicit $w0
-
-...
----
-name: saddo_s32_shifted
-alignment: 4
-legalized: true
-regBankSelected: true
-tracksRegLiveness: true
-body: |
- bb.1.entry:
- liveins: $w0, $w1, $x2
- ; Check that we get ADDSWrs when we can fold in a shift.
- ;
- ; CHECK-LABEL: name: saddo_s32_shifted
- ; CHECK: liveins: $w0, $w1, $x2
- ; CHECK: %reg0:gpr32 = COPY $w0
- ; CHECK: %reg1:gpr32 = COPY $w1
- ; CHECK: %add:gpr32 = ADDSWrs %reg0, %reg1, 16, implicit-def $nzcv
- ; CHECK: %overflow:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
- ; CHECK: $w0 = COPY %add
- ; CHECK: RET_ReallyLR implicit $w0
- %reg0:gpr(s32) = COPY $w0
- %reg1:gpr(s32) = COPY $w1
- %constant:gpr(s32) = G_CONSTANT i32 16
- %shift:gpr(s32) = G_SHL %reg1(s32), %constant(s32)
- %add:gpr(s32), %overflow:gpr(s1) = G_SADDO %reg0, %shift
- $w0 = COPY %add(s32)
- RET_ReallyLR implicit $w0
-
-...
----
-name: saddo_s32_neg_imm
-alignment: 4
-legalized: true
-regBankSelected: true
-tracksRegLiveness: true
-body: |
- bb.1.entry:
- liveins: $w0, $w1, $x2
- ; Check that we get SUBSWri when we can fold in a negative constant.
- ;
- ; CHECK-LABEL: name: saddo_s32_neg_imm
- ; CHECK: liveins: $w0, $w1, $x2
- ; CHECK: %copy:gpr32sp = COPY $w0
- ; CHECK: %add:gpr32 = SUBSWri %copy, 16, 0, implicit-def $nzcv
- ; CHECK: %overflow:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
- ; CHECK: $w0 = COPY %add
- ; CHECK: RET_ReallyLR implicit $w0
- %copy:gpr(s32) = COPY $w0
- %constant:gpr(s32) = G_CONSTANT i32 -16
- %add:gpr(s32), %overflow:gpr(s1) = G_SADDO %copy, %constant
- $w0 = COPY %add(s32)
- RET_ReallyLR implicit $w0
-
-...
----
-name: saddo_arith_extended
-alignment: 4
-legalized: true
-regBankSelected: true
-tracksRegLiveness: true
-body: |
- bb.1.entry:
- liveins: $w0, $x0
- ; Check that we get ADDSXrx.
- ; CHECK-LABEL: name: saddo_arith_extended
- ; CHECK: liveins: $w0, $x0
- ; CHECK: %reg0:gpr64sp = COPY $x0
- ; CHECK: %reg1:gpr32 = COPY $w0
- ; CHECK: %add:gpr64 = ADDSXrx %reg0, %reg1, 18, implicit-def $nzcv
- ; CHECK: %flags:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
- ; CHECK: $x0 = COPY %add
- ; CHECK: RET_ReallyLR implicit $x0
- %reg0:gpr(s64) = COPY $x0
- %reg1:gpr(s32) = COPY $w0
- %ext:gpr(s64) = G_ZEXT %reg1(s32)
- %cst:gpr(s64) = G_CONSTANT i64 2
- %shift:gpr(s64) = G_SHL %ext, %cst(s64)
- %add:gpr(s64), %flags:gpr(s1) = G_SADDO %reg0, %shift
- $x0 = COPY %add(s64)
- RET_ReallyLR implicit $x0
diff --git a/llvm/lib/Target/AArch64/GISel/select-ssubo.mir b/llvm/lib/Target/AArch64/GISel/select-ssubo.mir
deleted file mode 100644
index f6b1794645f7..000000000000
--- a/llvm/lib/Target/AArch64/GISel/select-ssubo.mir
+++ /dev/null
@@ -1,158 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -verify-machineinstrs -mtriple aarch64-unknown-uknown -global-isel -run-pass=instruction-select %s -o - | FileCheck %s
-
-...
----
-name: ssubo_s32
-alignment: 4
-legalized: true
-regBankSelected: true
-tracksRegLiveness: true
-body: |
- bb.1.entry:
- liveins: $w0, $w1, $x2
-
- ; CHECK-LABEL: name: ssubo_s32
- ; CHECK: liveins: $w0, $w1, $x2
- ; CHECK: %reg0:gpr32 = COPY $w0
- ; CHECK: %reg1:gpr32 = COPY $w1
- ; CHECK: %ssubo:gpr32 = SUBSWrr %reg0, %reg1, implicit-def $nzcv
- ; CHECK: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
- ; CHECK: $w0 = COPY %ssubo
- ; CHECK: RET_ReallyLR implicit $w0
- %reg0:gpr(s32) = COPY $w0
- %reg1:gpr(s32) = COPY $w1
- %ssubo:gpr(s32), %4:gpr(s1) = G_SSUBO %reg0, %reg1
- $w0 = COPY %ssubo(s32)
- RET_ReallyLR implicit $w0
-
-...
----
-name: ssubo_s64
-alignment: 4
-legalized: true
-regBankSelected: true
-tracksRegLiveness: true
-body: |
- bb.1.entry:
- liveins: $x0, $x1, $x2
-
- ; CHECK-LABEL: name: ssubo_s64
- ; CHECK: liveins: $x0, $x1, $x2
- ; CHECK: %reg0:gpr64 = COPY $x0
- ; CHECK: %reg1:gpr64 = COPY $x1
- ; CHECK: %ssubo:gpr64 = SUBSXrr %reg0, %reg1, implicit-def $nzcv
- ; CHECK: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
- ; CHECK: $x0 = COPY %ssubo
- ; CHECK: RET_ReallyLR implicit $x0
- %reg0:gpr(s64) = COPY $x0
- %reg1:gpr(s64) = COPY $x1
- %ssubo:gpr(s64), %4:gpr(s1) = G_SSUBO %reg0, %reg1
- $x0 = COPY %ssubo(s64)
- RET_ReallyLR implicit $x0
-
-...
----
-name: ssubo_s32_imm
-alignment: 4
-legalized: true
-regBankSelected: true
-tracksRegLiveness: true
-body: |
- bb.1.entry:
- liveins: $w0, $w1, $x2
- ; Check that we get SUBSWri when we can fold in a constant.
- ;
- ; CHECK-LABEL: name: ssubo_s32_imm
- ; CHECK: liveins: $w0, $w1, $x2
- ; CHECK: %copy:gpr32sp = COPY $w0
- ; CHECK: %ssubo:gpr32 = SUBSWri %copy, 16, 0, implicit-def $nzcv
- ; CHECK: %overflow:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
- ; CHECK: $w0 = COPY %ssubo
- ; CHECK: RET_ReallyLR implicit $w0
- %copy:gpr(s32) = COPY $w0
- %constant:gpr(s32) = G_CONSTANT i32 16
- %ssubo:gpr(s32), %overflow:gpr(s1) = G_SSUBO %copy, %constant
- $w0 = COPY %ssubo(s32)
- RET_ReallyLR implicit $w0
-
-...
----
-name: ssubo_s32_shifted
-alignment: 4
-legalized: true
-regBankSelected: true
-tracksRegLiveness: true
-body: |
- bb.1.entry:
- liveins: $w0, $w1, $x2
- ; Check that we get SUBSWrs when we can fold in a shift.
- ;
- ; CHECK-LABEL: name: ssubo_s32_shifted
- ; CHECK: liveins: $w0, $w1, $x2
- ; CHECK: %reg0:gpr32 = COPY $w0
- ; CHECK: %reg1:gpr32 = COPY $w1
- ; CHECK: %sub:gpr32 = SUBSWrs %reg0, %reg1, 16, implicit-def $nzcv
- ; CHECK: %overflow:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
- ; CHECK: $w0 = COPY %sub
- ; CHECK: RET_ReallyLR implicit $w0
- %reg0:gpr(s32) = COPY $w0
- %reg1:gpr(s32) = COPY $w1
- %constant:gpr(s32) = G_CONSTANT i32 16
- %shift:gpr(s32) = G_SHL %reg1(s32), %constant(s32)
- %sub:gpr(s32), %overflow:gpr(s1) = G_SSUBO %reg0, %shift
- $w0 = COPY %sub(s32)
- RET_ReallyLR implicit $w0
-
-...
----
-name: ssubo_s32_neg_imm
-alignment: 4
-legalized: true
-regBankSelected: true
-tracksRegLiveness: true
-body: |
- bb.1.entry:
- liveins: $w0, $w1, $x2
- ; Check that we get ADDSWri when we can fold in a negative constant.
- ;
- ; CHECK-LABEL: name: ssubo_s32_neg_imm
- ; CHECK: liveins: $w0, $w1, $x2
- ; CHECK: %copy:gpr32sp = COPY $w0
- ; CHECK: %sub:gpr32 = ADDSWri %copy, 16, 0, implicit-def $nzcv
- ; CHECK: %overflow:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
- ; CHECK: $w0 = COPY %sub
- ; CHECK: RET_ReallyLR implicit $w0
- %copy:gpr(s32) = COPY $w0
- %constant:gpr(s32) = G_CONSTANT i32 -16
- %sub:gpr(s32), %overflow:gpr(s1) = G_SSUBO %copy, %constant
- $w0 = COPY %sub(s32)
- RET_ReallyLR implicit $w0
-
-...
----
-name: ssubo_arith_extended
-alignment: 4
-legalized: true
-regBankSelected: true
-tracksRegLiveness: true
-body: |
- bb.1.entry:
- liveins: $w0, $x0
- ; Check that we get SUBSXrx.
- ; CHECK-LABEL: name: ssubo_arith_extended
- ; CHECK: liveins: $w0, $x0
- ; CHECK: %reg0:gpr64sp = COPY $x0
- ; CHECK: %reg1:gpr32 = COPY $w0
- ; CHECK: %sub:gpr64 = SUBSXrx %reg0, %reg1, 18, implicit-def $nzcv
- ; CHECK: %flags:gpr32 = CSINCWr $wzr, $wzr, 7, implicit $nzcv
- ; CHECK: $x0 = COPY %sub
- ; CHECK: RET_ReallyLR implicit $x0
- %reg0:gpr(s64) = COPY $x0
- %reg1:gpr(s32) = COPY $w0
- %ext:gpr(s64) = G_ZEXT %reg1(s32)
- %cst:gpr(s64) = G_CONSTANT i64 2
- %shift:gpr(s64) = G_SHL %ext, %cst(s64)
- %sub:gpr(s64), %flags:gpr(s1) = G_SSUBO %reg0, %shift
- $x0 = COPY %sub(s64)
- RET_ReallyLR implicit $x0
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index c3e74757675b..876526093591 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
+#include "AArch64ExpandImm.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/bit.h"
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 290fe88a8cec..dbb8e85713cb 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -24,9 +24,9 @@
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
namespace {
@@ -92,7 +92,8 @@ public:
const MCAsmLayout &Layout) const override;
void relaxInstruction(MCInst &Inst,
const MCSubtargetInfo &STI) const override;
- bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const override;
unsigned getFixupKindContainereSizeInBytes(unsigned Kind) const;
@@ -159,8 +160,11 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
return AdrImmBits(Value & 0x1fffffULL);
case AArch64::fixup_aarch64_pcrel_adrp_imm21:
assert(!IsResolved);
- if (TheTriple.isOSBinFormatCOFF())
+ if (TheTriple.isOSBinFormatCOFF()) {
+ if (!isInt<21>(SignedValue))
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
return AdrImmBits(Value & 0x1fffffULL);
+ }
return AdrImmBits((Value & 0x1fffff000ULL) >> 12);
case AArch64::fixup_aarch64_ldr_pcrel_imm19:
case AArch64::fixup_aarch64_pcrel_branch19:
@@ -456,7 +460,8 @@ void AArch64AsmBackend::relaxInstruction(MCInst &Inst,
llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented");
}
-bool AArch64AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+bool AArch64AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const {
// If the count is not 4-byte aligned, we must be writing data into the text
// section (otherwise we have unaligned instructions, and thus have far
// bigger problems), so just write zeros instead.
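The COFF branch of the adrp fixup above now rejects offsets that do not fit in the signed 21-bit immediate before truncating them with AdrImmBits. A minimal standalone sketch of the same range check (plain C++, no LLVM headers; fitsInSignedBits is an illustrative stand-in for llvm::isInt<21>):

#include <cstdint>

// True iff Value is representable as a signed immediate of the given width,
// mirroring the isInt<21>(SignedValue) guard added in the patch.
constexpr bool fitsInSignedBits(int64_t Value, unsigned Bits) {
  const int64_t Min = -(int64_t(1) << (Bits - 1));
  const int64_t Max = (int64_t(1) << (Bits - 1)) - 1;
  return Value >= Min && Value <= Max;
}

static_assert(fitsInSignedBits((1 << 20) - 1, 21), "2^20 - 1 fits in 21 bits");
static_assert(!fitsInSignedBits(1 << 20, 21), "2^20 overflows a signed 21-bit field");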
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index cd1bfed9d40d..ee0870d9ef7a 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -1026,11 +1026,11 @@ void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum,
unsigned Shift =
AArch64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm());
O << '#' << formatImm(Val);
- if (Shift != 0)
+ if (Shift != 0) {
printShifter(MI, OpNum + 1, STI, O);
-
- if (CommentStream)
- *CommentStream << '=' << formatImm(Val << Shift) << '\n';
+ if (CommentStream)
+ *CommentStream << '=' << formatImm(Val << Shift) << '\n';
+ }
} else {
assert(MO.isExpr() && "Unexpected operand type!");
MO.getExpr()->print(O, &MAI);
@@ -1450,6 +1450,12 @@ void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
O << "[" << MI->getOperand(OpNum).getImm() << "]";
}
+void AArch64InstPrinter::printMatrixIndex(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ O << MI->getOperand(OpNum).getImm();
+}
+
void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, uint64_t Address,
unsigned OpNum,
const MCSubtargetInfo &STI,
@@ -1539,6 +1545,28 @@ void AArch64InstPrinter::printBarriernXSOption(const MCInst *MI, unsigned OpNo,
O << "#" << Val;
}
+static bool isValidSysReg(const AArch64SysReg::SysReg *Reg, bool Read,
+ const MCSubtargetInfo &STI) {
+ return (Reg && (Read ? Reg->Readable : Reg->Writeable) &&
+ Reg->haveFeatures(STI.getFeatureBits()));
+}
+
+// Looks up a system register either by encoding or by name. Some system
+// registers share the same encoding between different architectures;
+// therefore, a tablegen lookup by encoding will return an entry regardless
+// of the register's predication on a specific subtarget feature. To work
+// around this problem, we keep an alternative name for such registers and
+// look them up by that name if the first lookup was unsuccessful.
+static const AArch64SysReg::SysReg *lookupSysReg(unsigned Val, bool Read,
+ const MCSubtargetInfo &STI) {
+ const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val);
+
+ if (Reg && !isValidSysReg(Reg, Read, STI))
+ Reg = AArch64SysReg::lookupSysRegByName(Reg->AltName);
+
+ return Reg;
+}
+
void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -1558,8 +1586,9 @@ void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
return;
}
- const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val);
- if (Reg && Reg->Readable && Reg->haveFeatures(STI.getFeatureBits()))
+ const AArch64SysReg::SysReg *Reg = lookupSysReg(Val, true /*Read*/, STI);
+
+ if (isValidSysReg(Reg, true /*Read*/, STI))
O << Reg->Name;
else
O << AArch64SysReg::genericRegisterString(Val);
@@ -1584,8 +1613,9 @@ void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo,
return;
}
- const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val);
- if (Reg && Reg->Writeable && Reg->haveFeatures(STI.getFeatureBits()))
+ const AArch64SysReg::SysReg *Reg = lookupSysReg(Val, false /*Read*/, STI);
+
+ if (isValidSysReg(Reg, false /*Read*/, STI))
O << Reg->Name;
else
O << AArch64SysReg::genericRegisterString(Val);
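The printMRS/printMSR changes above both funnel through the new lookupSysReg helper: look the register up by encoding first, and if the entry found is not readable/writable for this operation or not enabled on this subtarget, retry the lookup via the entry's alternative name. A hedged, self-contained sketch of that fallback shape (Entry and the two lookup callbacks are illustrative stand-ins for the TableGen-generated AArch64SysReg tables):

#include <functional>

struct Entry {
  const char *Name;
  const char *AltName;
  bool ValidForSubtarget; // stands in for the Readable/Writeable + feature check
};

const Entry *lookupWithFallback(
    unsigned Encoding,
    const std::function<const Entry *(unsigned)> &lookupByEncoding,
    const std::function<const Entry *(const char *)> &lookupByName) {
  const Entry *E = lookupByEncoding(Encoding);
  // The encoding may be shared with a register from another architecture
  // variant; if the first hit is not usable here, retry via its alternative
  // name, as lookupSysReg does above.
  if (E && !E->ValidForSubtarget)
    E = lookupByName(E->AltName);
  return E;
}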
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
index 9ec74a1bc7b6..d36fb30a0ce6 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
@@ -161,6 +161,8 @@ protected:
void printVectorIndex(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printMatrixIndex(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printAdrpLabel(const MCInst *MI, uint64_t Address, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printBarrierOption(const MCInst *MI, unsigned OpNum,
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 3c2df1621e11..90688f1a3e83 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -26,9 +26,9 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -57,7 +57,16 @@ createAArch64MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
CPU = "apple-a12";
}
- return createAArch64MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
+ // Most of the NEON instruction set isn't supported in streaming mode on SME
+  // targets, so disable NEON unless it was explicitly requested.
+ bool RequestedNEON = FS.contains("neon");
+ bool RequestedStreamingSVE = FS.contains("streaming-sve");
+ MCSubtargetInfo *STI =
+ createAArch64MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
+ if (RequestedStreamingSVE && !RequestedNEON &&
+ STI->hasFeature(AArch64::FeatureNEON))
+ STI->ToggleFeature(AArch64::FeatureNEON);
+ return STI;
}
void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) {
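createAArch64MCSubtargetInfo above only toggles NEON off when streaming SVE was asked for and NEON was not named in the feature string. A small sketch of just that decision, with std::string substituted for StringRef and the subtarget object elided (both substitutions are assumptions made for the sake of a standalone example):

#include <string>

// Returns true when the default-on NEON feature should be switched off:
// the user asked for streaming SVE but never mentioned NEON explicitly.
bool shouldDisableNEON(const std::string &FS, bool NEONEnabledByDefault) {
  const bool RequestedNEON = FS.find("neon") != std::string::npos;
  const bool RequestedStreamingSVE = FS.find("streaming-sve") != std::string::npos;
  return RequestedStreamingSVE && !RequestedNEON && NEONEnabledByDefault;
}

// shouldDisableNEON("+streaming-sve", true)       -> true  (NEON toggled off)
// shouldDisableNEON("+streaming-sve,+neon", true) -> false (kept, as requested)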
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index c84c313c1db0..941226b83e44 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -62,9 +62,6 @@ MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
MCInstPrinter *InstPrint,
bool isVerboseAsm);
-MCTargetStreamer *createAArch64ObjectTargetStreamer(MCStreamer &S,
- const MCSubtargetInfo &STI);
-
namespace AArch64_MC {
void initLLVMToCVRegMapping(MCRegisterInfo *MRI);
}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index 557603c24ba5..cf1a60643efd 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -48,11 +48,13 @@ void AArch64TargetStreamer::emitCurrentConstantPool() {
ConstantPools->emitForCurrentSection(Streamer);
}
+void AArch64TargetStreamer::emitConstantPools() {
+ ConstantPools->emitAll(Streamer);
+}
+
// finish() - write out any non-empty assembler constant pools and
// write out note.gnu.properties if needed.
void AArch64TargetStreamer::finish() {
- ConstantPools->emitAll(Streamer);
-
if (MarkBTIProperty)
emitNoteSection(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI);
}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
index 9b030775094c..86c7baf8f429 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
@@ -23,6 +23,7 @@ public:
~AArch64TargetStreamer() override;
void finish() override;
+ void emitConstantPools() override;
/// Callback used to implement the ldr= pseudo.
/// Add a new entry to the constant pool for the current section and return an
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 62089166f4b7..41f2cead4cf8 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -180,26 +180,18 @@ class sme_mem_ld_ss_base<bit Q, bit V, bits<2> msz, dag outs, dag ins,
let mayLoad = 1;
}
-class sme_mem_ld_ss_inst_BHSD<bits<2> msz, string mnemonic,
- MatrixTileVectorOperand tile_ty, bit is_col,
- Operand imm_ty, RegisterOperand gpr_ty>
+class sme_mem_ld_ss_inst<bit Q, bits<2> msz, string mnemonic,
+ MatrixTileVectorOperand tile_ty, bit is_col,
+ Operand imm_ty, RegisterOperand gpr_ty>
: sme_mem_ld_ss_base<
- 0b0, is_col, msz, (outs tile_ty:$ZAt),
+ Q, is_col, msz, (outs tile_ty:$ZAt),
(ins MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, GPR64sp:$Rn,
gpr_ty:$Rm),
mnemonic, "\t\\{$ZAt[$Rv, $imm]\\}, $Pg/z, [$Rn, $Rm]">;
-class sme_mem_ld_ss_inst_Q<string mnemonic, MatrixTileVectorOperand tile_ty,
- bit is_col>
- : sme_mem_ld_ss_base<
- 0b1, is_col, 0b11, (outs tile_ty:$ZAt),
- (ins MatrixIndexGPR32Op12_15:$Rv, PPR3bAny:$Pg, GPR64sp:$Rn,
- GPR64shifted128:$Rm),
- mnemonic, "\t\\{$ZAt[$Rv]\\}, $Pg/z, [$Rn, $Rm]">;
-
-multiclass sme_mem_ss_aliases_BHSD<string mnemonic, Instruction inst,
- MatrixTileVectorOperand tile_ty, Operand imm_ty,
- RegisterOperand gpr_ty,
+multiclass sme_mem_ss_aliases_base<string mnemonic, Instruction inst,
+ MatrixTileVectorOperand tile_ty,
+ Operand imm_ty, RegisterOperand gpr_ty,
string pg_suffix=""> {
def : InstAlias<mnemonic # "\t$ZAt[$Rv, $imm], $Pg" # pg_suffix # ", [$Rn, $Rm]",
(inst tile_ty:$ZAt, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, GPR64sp:$Rn, gpr_ty:$Rm), 0>;
@@ -210,35 +202,23 @@ multiclass sme_mem_ss_aliases_BHSD<string mnemonic, Instruction inst,
(inst tile_ty:$ZAt, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 0>;
}
-multiclass sme_mem_ss_aliases_Q<string mnemonic, Instruction inst,
- MatrixTileVectorOperand tile_ty,
- string pg_suffix=""> {
- def : InstAlias<mnemonic # "\t$ZAt[$Rv], $Pg" # pg_suffix # ", [$Rn, $Rm]",
- (inst tile_ty:$ZAt, MatrixIndexGPR32Op12_15:$Rv, PPR3bAny:$Pg, GPR64sp:$Rn, GPR64shifted128:$Rm), 0>;
- // Default XZR offset aliases
- def : InstAlias<mnemonic # "\t\\{$ZAt[$Rv]\\}, $Pg" # pg_suffix # ", [$Rn]",
- (inst tile_ty:$ZAt, MatrixIndexGPR32Op12_15:$Rv, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 2>;
- def : InstAlias<mnemonic # "\t$ZAt[$Rv], $Pg" # pg_suffix # ", [$Rn]",
- (inst tile_ty:$ZAt, MatrixIndexGPR32Op12_15:$Rv, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 0>;
-}
-
multiclass sme_mem_ss_aliases<string mnemonic, string inst, bit is_col,
string pg_suffix=""> {
- defm : sme_mem_ss_aliases_BHSD<mnemonic # "b", !cast<Instruction>(inst # _B),
+ defm : sme_mem_ss_aliases_base<mnemonic # "b", !cast<Instruction>(inst # _B),
!if(is_col, TileVectorOpV8, TileVectorOpH8),
- imm0_15, GPR64shifted8, pg_suffix>;
- defm : sme_mem_ss_aliases_BHSD<mnemonic # "h", !cast<Instruction>(inst # _H),
+ sme_elm_idx0_15, GPR64shifted8, pg_suffix>;
+ defm : sme_mem_ss_aliases_base<mnemonic # "h", !cast<Instruction>(inst # _H),
!if(is_col, TileVectorOpV16, TileVectorOpH16),
- imm0_7, GPR64shifted16, pg_suffix>;
- defm : sme_mem_ss_aliases_BHSD<mnemonic # "w", !cast<Instruction>(inst # _S),
+ sme_elm_idx0_7, GPR64shifted16, pg_suffix>;
+ defm : sme_mem_ss_aliases_base<mnemonic # "w", !cast<Instruction>(inst # _S),
!if(is_col, TileVectorOpV32, TileVectorOpH32),
- imm0_3, GPR64shifted32, pg_suffix>;
- defm : sme_mem_ss_aliases_BHSD<mnemonic # "d", !cast<Instruction>(inst # _D),
+ sme_elm_idx0_3, GPR64shifted32, pg_suffix>;
+ defm : sme_mem_ss_aliases_base<mnemonic # "d", !cast<Instruction>(inst # _D),
!if(is_col, TileVectorOpV64, TileVectorOpH64),
- imm0_1, GPR64shifted64, pg_suffix>;
- defm : sme_mem_ss_aliases_Q <mnemonic # "q", !cast<Instruction>(inst # _Q),
+ sme_elm_idx0_1, GPR64shifted64, pg_suffix>;
+ defm : sme_mem_ss_aliases_base<mnemonic # "q", !cast<Instruction>(inst # _Q),
!if(is_col, TileVectorOpV128, TileVectorOpH128),
- pg_suffix>;
+ sme_elm_idx0_0, GPR64shifted128, pg_suffix>;
}
multiclass sme_mem_ld_ss_aliases<string inst, bit is_col> {
@@ -246,44 +226,39 @@ multiclass sme_mem_ld_ss_aliases<string inst, bit is_col> {
}
multiclass sme_mem_ld_v_ss<string mnemonic, bit is_col> {
- def _B : sme_mem_ld_ss_inst_BHSD<0b00, mnemonic # "b",
- !if(is_col, TileVectorOpV8,
- TileVectorOpH8),
- is_col, imm0_15, GPR64shifted8> {
+ def _B : sme_mem_ld_ss_inst<0b0, 0b00, mnemonic # "b",
+ !if(is_col, TileVectorOpV8, TileVectorOpH8),
+ is_col, sme_elm_idx0_15, GPR64shifted8> {
bits<4> imm;
let Inst{3-0} = imm;
}
- def _H : sme_mem_ld_ss_inst_BHSD<0b01, mnemonic # "h",
- !if(is_col, TileVectorOpV16,
- TileVectorOpH16),
- is_col, imm0_7, GPR64shifted16> {
+ def _H : sme_mem_ld_ss_inst<0b0, 0b01, mnemonic # "h",
+ !if(is_col, TileVectorOpV16, TileVectorOpH16),
+ is_col, sme_elm_idx0_7, GPR64shifted16> {
bits<1> ZAt;
bits<3> imm;
let Inst{3} = ZAt;
let Inst{2-0} = imm;
}
- def _S : sme_mem_ld_ss_inst_BHSD<0b10, mnemonic # "w",
- !if(is_col, TileVectorOpV32,
- TileVectorOpH32),
- is_col, imm0_3, GPR64shifted32> {
+ def _S : sme_mem_ld_ss_inst<0b0, 0b10, mnemonic # "w",
+ !if(is_col, TileVectorOpV32, TileVectorOpH32),
+ is_col, sme_elm_idx0_3, GPR64shifted32> {
bits<2> ZAt;
bits<2> imm;
let Inst{3-2} = ZAt;
let Inst{1-0} = imm;
}
- def _D : sme_mem_ld_ss_inst_BHSD<0b11, mnemonic # "d",
- !if(is_col, TileVectorOpV64,
- TileVectorOpH64),
- is_col, imm0_1, GPR64shifted64> {
+ def _D : sme_mem_ld_ss_inst<0b0, 0b11, mnemonic # "d",
+ !if(is_col, TileVectorOpV64, TileVectorOpH64),
+ is_col, sme_elm_idx0_1, GPR64shifted64> {
bits<3> ZAt;
bits<1> imm;
let Inst{3-1} = ZAt;
let Inst{0} = imm;
}
- def _Q : sme_mem_ld_ss_inst_Q<mnemonic # "q",
- !if(is_col, TileVectorOpV128,
- TileVectorOpH128),
- is_col> {
+ def _Q : sme_mem_ld_ss_inst<0b1, 0b11, mnemonic # "q",
+ !if(is_col, TileVectorOpV128, TileVectorOpH128),
+ is_col, sme_elm_idx0_0, GPR64shifted128> {
bits<4> ZAt;
let Inst{3-0} = ZAt;
}
@@ -322,66 +297,53 @@ class sme_mem_st_ss_base<bit Q, bit V, bits<2> msz, dag ins,
let hasSideEffects = 1;
}
-class sme_mem_st_ss_inst_BHSD<bits<2> msz, string mnemonic,
- MatrixTileVectorOperand tile_ty, bit is_col,
- Operand imm_ty, RegisterOperand gpr_ty>
+class sme_mem_st_ss_inst<bit Q, bits<2> msz, string mnemonic,
+ MatrixTileVectorOperand tile_ty, bit is_col,
+ Operand imm_ty, RegisterOperand gpr_ty>
: sme_mem_st_ss_base<
- 0b0, is_col, msz,
+ Q, is_col, msz,
(ins tile_ty:$ZAt, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg,
GPR64sp:$Rn, gpr_ty:$Rm),
mnemonic, "\t\\{$ZAt[$Rv, $imm]\\}, $Pg, [$Rn, $Rm]">;
-class sme_mem_st_ss_inst_Q<string mnemonic, MatrixTileVectorOperand tile_ty,
- bit is_col>
- : sme_mem_st_ss_base<
- 0b1, is_col, 0b11,
- (ins tile_ty:$ZAt, MatrixIndexGPR32Op12_15:$Rv, PPR3bAny:$Pg,
- GPR64sp:$Rn, GPR64shifted128:$Rm),
- mnemonic, "\t\\{$ZAt[$Rv]\\}, $Pg, [$Rn, $Rm]">;
-
multiclass sme_mem_st_ss_aliases<string inst, bit is_col> {
defm NAME : sme_mem_ss_aliases<"st1", inst, is_col>;
}
multiclass sme_mem_st_v_ss<string mnemonic, bit is_col> {
- def _B : sme_mem_st_ss_inst_BHSD<0b00, mnemonic # "b",
- !if(is_col, TileVectorOpV8,
- TileVectorOpH8),
- is_col, imm0_15, GPR64shifted8> {
+ def _B : sme_mem_st_ss_inst<0b0, 0b00, mnemonic # "b",
+ !if(is_col, TileVectorOpV8, TileVectorOpH8),
+ is_col, sme_elm_idx0_15, GPR64shifted8> {
bits<4> imm;
let Inst{3-0} = imm;
}
- def _H : sme_mem_st_ss_inst_BHSD<0b01, mnemonic # "h",
- !if(is_col, TileVectorOpV16,
- TileVectorOpH16),
- is_col, imm0_7, GPR64shifted16> {
+ def _H : sme_mem_st_ss_inst<0b0, 0b01, mnemonic # "h",
+ !if(is_col, TileVectorOpV16, TileVectorOpH16),
+ is_col, sme_elm_idx0_7, GPR64shifted16> {
bits<1> ZAt;
bits<3> imm;
let Inst{3} = ZAt;
let Inst{2-0} = imm;
}
- def _S : sme_mem_st_ss_inst_BHSD<0b10, mnemonic # "w",
- !if(is_col, TileVectorOpV32,
- TileVectorOpH32),
- is_col, imm0_3, GPR64shifted32> {
+ def _S : sme_mem_st_ss_inst<0b0, 0b10, mnemonic # "w",
+ !if(is_col, TileVectorOpV32, TileVectorOpH32),
+ is_col, sme_elm_idx0_3, GPR64shifted32> {
bits<2> ZAt;
bits<2> imm;
let Inst{3-2} = ZAt;
let Inst{1-0} = imm;
}
- def _D : sme_mem_st_ss_inst_BHSD<0b11, mnemonic # "d",
- !if(is_col, TileVectorOpV64,
- TileVectorOpH64),
- is_col, imm0_1, GPR64shifted64> {
+ def _D : sme_mem_st_ss_inst<0b0, 0b11, mnemonic # "d",
+ !if(is_col, TileVectorOpV64, TileVectorOpH64),
+ is_col, sme_elm_idx0_1, GPR64shifted64> {
bits<3> ZAt;
bits<1> imm;
let Inst{3-1} = ZAt;
let Inst{0} = imm;
}
- def _Q : sme_mem_st_ss_inst_Q<mnemonic # "q",
- !if(is_col, TileVectorOpV128,
- TileVectorOpH128),
- is_col> {
+ def _Q : sme_mem_st_ss_inst<0b1, 0b11, mnemonic # "q",
+ !if(is_col, TileVectorOpV128, TileVectorOpH128),
+ is_col, sme_elm_idx0_0, GPR64shifted128> {
bits<4> ZAt;
let Inst{3-0} = ZAt;
}
@@ -423,13 +385,13 @@ multiclass sme_spill_fill<bit isStore, dag outs, dag ins, string opcodestr> {
def : InstAlias<opcodestr # "\t$ZAt[$Rv, $imm4], [$Rn]",
(!cast<Instruction>(NAME) MatrixOp:$ZAt,
- MatrixIndexGPR32Op12_15:$Rv, imm0_15:$imm4, GPR64sp:$Rn, 0), 1>;
+ MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>;
}
multiclass sme_spill<string opcodestr> {
defm NAME : sme_spill_fill<0b1, (outs),
(ins MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv,
- imm0_15:$imm4, GPR64sp:$Rn,
+ sme_elm_idx0_15:$imm4, GPR64sp:$Rn,
imm0_15:$offset),
opcodestr>;
}
@@ -437,7 +399,7 @@ multiclass sme_spill<string opcodestr> {
multiclass sme_fill<string opcodestr> {
defm NAME : sme_spill_fill<0b0, (outs MatrixOp:$ZAt),
(ins MatrixIndexGPR32Op12_15:$Rv,
- imm0_15:$imm4, GPR64sp:$Rn,
+ sme_elm_idx0_15:$imm4, GPR64sp:$Rn,
imm0_15:$offset),
opcodestr>;
}
@@ -463,60 +425,54 @@ class sme_vector_to_tile_base<bit Q, bit V, bits<2> sz, dag outs, dag ins,
let Inst{4} = 0b0;
}
-class sme_vector_to_tile_inst<bits<2> sz, MatrixTileVectorOperand tile_ty,
+class sme_vector_to_tile_inst<bit Q, bits<2> sz, MatrixTileVectorOperand tile_ty,
bit is_col, Operand imm_ty, ZPRRegOp zpr_ty,
string mnemonic>
- : sme_vector_to_tile_base<0b0, is_col, sz, (outs tile_ty:$ZAd),
+ : sme_vector_to_tile_base<Q, is_col, sz, (outs tile_ty:$ZAd),
(ins MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, zpr_ty:$Zn),
mnemonic, "\t$ZAd[$Rv, $imm], $Pg/m, $Zn">;
-class sme_vector_to_tile_inst_Q<MatrixTileVectorOperand tile_ty,
- bit is_col, string mnemonic>
- : sme_vector_to_tile_base<0b1, is_col, 0b11, (outs tile_ty:$ZAd),
- (ins MatrixIndexGPR32Op12_15:$Rv, PPR3bAny:$Pg, ZPR128:$Zn),
- mnemonic, "\t$ZAd[$Rv], $Pg/m, $Zn">;
-
multiclass sme_vector_to_tile_aliases<Instruction inst,
MatrixTileVectorOperand tile_ty,
ZPRRegOp zpr_ty, Operand imm_ty> {
def : InstAlias<"mov\t$ZAd[$Rv, $imm], $Pg/m, $Zn",
- (inst tile_ty:$ZAd, MatrixIndexGPR32Op12_15:$Rv, imm0_15:$imm, PPR3bAny:$Pg, zpr_ty:$Zn), 1>;
+ (inst tile_ty:$ZAd, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, zpr_ty:$Zn), 1>;
}
multiclass sme_vector_v_to_tile<string mnemonic, bit is_col> {
- def _B : sme_vector_to_tile_inst<0b00, !if(is_col, TileVectorOpV8,
- TileVectorOpH8),
- is_col, imm0_15, ZPR8, mnemonic> {
+ def _B : sme_vector_to_tile_inst<0b0, 0b00, !if(is_col, TileVectorOpV8,
+ TileVectorOpH8),
+ is_col, sme_elm_idx0_15, ZPR8, mnemonic> {
bits<4> imm;
let Inst{3-0} = imm;
}
- def _H : sme_vector_to_tile_inst<0b01, !if(is_col, TileVectorOpV16,
- TileVectorOpH16),
- is_col, imm0_7, ZPR16, mnemonic> {
+ def _H : sme_vector_to_tile_inst<0b0, 0b01, !if(is_col, TileVectorOpV16,
+ TileVectorOpH16),
+ is_col, sme_elm_idx0_7, ZPR16, mnemonic> {
bits<1> ZAd;
bits<3> imm;
let Inst{3} = ZAd;
let Inst{2-0} = imm;
}
- def _S : sme_vector_to_tile_inst<0b10, !if(is_col, TileVectorOpV32,
- TileVectorOpH32),
- is_col, imm0_3, ZPR32, mnemonic> {
+ def _S : sme_vector_to_tile_inst<0b0, 0b10, !if(is_col, TileVectorOpV32,
+ TileVectorOpH32),
+ is_col, sme_elm_idx0_3, ZPR32, mnemonic> {
bits<2> ZAd;
bits<2> imm;
let Inst{3-2} = ZAd;
let Inst{1-0} = imm;
}
- def _D : sme_vector_to_tile_inst<0b11, !if(is_col, TileVectorOpV64,
- TileVectorOpH64),
- is_col, imm0_1, ZPR64, mnemonic> {
+ def _D : sme_vector_to_tile_inst<0b0, 0b11, !if(is_col, TileVectorOpV64,
+ TileVectorOpH64),
+ is_col, sme_elm_idx0_1, ZPR64, mnemonic> {
bits<3> ZAd;
bits<1> imm;
let Inst{3-1} = ZAd;
let Inst{0} = imm;
}
- def _Q : sme_vector_to_tile_inst_Q<!if(is_col, TileVectorOpV128,
- TileVectorOpH128),
- is_col, mnemonic> {
+ def _Q : sme_vector_to_tile_inst<0b1, 0b11, !if(is_col, TileVectorOpV128,
+ TileVectorOpH128),
+ is_col, sme_elm_idx0_0, ZPR128, mnemonic> {
bits<4> ZAd;
bits<1> imm;
let Inst{3-0} = ZAd;
@@ -525,26 +481,23 @@ multiclass sme_vector_v_to_tile<string mnemonic, bit is_col> {
defm : sme_vector_to_tile_aliases<!cast<Instruction>(NAME # _B),
!if(is_col, TileVectorOpV8,
TileVectorOpH8),
- ZPR8, imm0_15>;
+ ZPR8, sme_elm_idx0_15>;
defm : sme_vector_to_tile_aliases<!cast<Instruction>(NAME # _H),
!if(is_col, TileVectorOpV16,
TileVectorOpH16),
- ZPR16, imm0_7>;
+ ZPR16, sme_elm_idx0_7>;
defm : sme_vector_to_tile_aliases<!cast<Instruction>(NAME # _S),
!if(is_col, TileVectorOpV32,
TileVectorOpH32),
- ZPR32, imm0_3>;
+ ZPR32, sme_elm_idx0_3>;
defm : sme_vector_to_tile_aliases<!cast<Instruction>(NAME # _D),
!if(is_col, TileVectorOpV64,
TileVectorOpH64),
- ZPR64, imm0_1>;
-
- def : InstAlias<"mov\t$ZAd[$Rv], $Pg/m, $Zn",
- (!cast<Instruction>(NAME # _Q) !if(is_col,
- TileVectorOpV128,
- TileVectorOpH128):$ZAd,
- MatrixIndexGPR32Op12_15:$Rv,
- PPR3bAny:$Pg, ZPR128:$Zn), 1>;
+ ZPR64, sme_elm_idx0_1>;
+ defm : sme_vector_to_tile_aliases<!cast<Instruction>(NAME # _Q),
+ !if(is_col, TileVectorOpV128,
+ TileVectorOpH128),
+ ZPR128, sme_elm_idx0_0>;
}
multiclass sme_vector_to_tile<string mnemonic> {
@@ -569,19 +522,13 @@ class sme_tile_to_vector_base<bit Q, bit V, bits<2> sz, dag outs, dag ins,
let Inst{4-0} = Zd;
}
-class sme_tile_to_vector_inst<bits<2> sz, ZPRRegOp zpr_ty,
+class sme_tile_to_vector_inst<bit Q, bits<2> sz, ZPRRegOp zpr_ty,
MatrixTileVectorOperand tile_ty,
bit is_col, Operand imm_ty, string mnemonic>
- : sme_tile_to_vector_base<0b0, is_col, sz, (outs zpr_ty:$Zd),
+ : sme_tile_to_vector_base<Q, is_col, sz, (outs zpr_ty:$Zd),
(ins PPR3bAny:$Pg, tile_ty:$ZAn, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm),
mnemonic, "\t$Zd, $Pg/m, $ZAn[$Rv, $imm]">;
-class sme_tile_to_vector_inst_Q<MatrixTileVectorOperand tile_ty,
- bit is_col, string mnemonic>
- : sme_tile_to_vector_base<0b1, is_col, 0b11, (outs ZPR128:$Zd),
- (ins PPR3bAny:$Pg, tile_ty:$ZAn, MatrixIndexGPR32Op12_15:$Rv),
- mnemonic, "\t$Zd, $Pg/m, $ZAn[$Rv]">;
-
multiclass sme_tile_to_vector_aliases<Instruction inst, ZPRRegOp zpr_ty,
MatrixTileVectorOperand tile_ty,
Operand imm_ty > {
@@ -590,62 +537,58 @@ multiclass sme_tile_to_vector_aliases<Instruction inst, ZPRRegOp zpr_ty,
}
multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
- def _B : sme_tile_to_vector_inst<0b00, ZPR8, !if(is_col, TileVectorOpV8,
- TileVectorOpH8),
- is_col, imm0_15, mnemonic> {
+ def _B : sme_tile_to_vector_inst<0b0, 0b00, ZPR8, !if(is_col, TileVectorOpV8,
+ TileVectorOpH8),
+ is_col, sme_elm_idx0_15, mnemonic> {
bits<4> imm;
let Inst{8-5} = imm;
}
- def _H : sme_tile_to_vector_inst<0b01, ZPR16, !if(is_col, TileVectorOpV16,
- TileVectorOpH16),
- is_col, imm0_7, mnemonic> {
+ def _H : sme_tile_to_vector_inst<0b0, 0b01, ZPR16, !if(is_col, TileVectorOpV16,
+ TileVectorOpH16),
+ is_col, sme_elm_idx0_7, mnemonic> {
bits<1> ZAn;
bits<3> imm;
let Inst{8} = ZAn;
let Inst{7-5} = imm;
}
- def _S : sme_tile_to_vector_inst<0b10, ZPR32, !if(is_col, TileVectorOpV32,
- TileVectorOpH32),
- is_col, imm0_3, mnemonic> {
+ def _S : sme_tile_to_vector_inst<0b0, 0b10, ZPR32, !if(is_col, TileVectorOpV32,
+ TileVectorOpH32),
+ is_col, sme_elm_idx0_3, mnemonic> {
bits<2> ZAn;
bits<2> imm;
let Inst{8-7} = ZAn;
let Inst{6-5} = imm;
}
- def _D : sme_tile_to_vector_inst<0b11, ZPR64, !if(is_col, TileVectorOpV64,
- TileVectorOpH64),
- is_col, imm0_1, mnemonic> {
+ def _D : sme_tile_to_vector_inst<0b0, 0b11, ZPR64, !if(is_col, TileVectorOpV64,
+ TileVectorOpH64),
+ is_col, sme_elm_idx0_1, mnemonic> {
bits<3> ZAn;
bits<1> imm;
let Inst{8-6} = ZAn;
let Inst{5} = imm;
}
- def _Q : sme_tile_to_vector_inst_Q<!if(is_col, TileVectorOpV128,
- TileVectorOpH128),
- is_col, mnemonic> {
+ def _Q : sme_tile_to_vector_inst<0b1, 0b11, ZPR128, !if(is_col, TileVectorOpV128,
+ TileVectorOpH128),
+ is_col, sme_elm_idx0_0, mnemonic> {
bits<4> ZAn;
let Inst{8-5} = ZAn;
}
defm : sme_tile_to_vector_aliases<!cast<Instruction>(NAME # _B), ZPR8,
!if(is_col, TileVectorOpV8,
- TileVectorOpH8), imm0_15>;
+ TileVectorOpH8), sme_elm_idx0_15>;
defm : sme_tile_to_vector_aliases<!cast<Instruction>(NAME # _H), ZPR16,
!if(is_col, TileVectorOpV16,
- TileVectorOpH16), imm0_7>;
+ TileVectorOpH16), sme_elm_idx0_7>;
defm : sme_tile_to_vector_aliases<!cast<Instruction>(NAME # _S), ZPR32,
!if(is_col, TileVectorOpV32,
- TileVectorOpH32), imm0_3>;
+ TileVectorOpH32), sme_elm_idx0_3>;
defm : sme_tile_to_vector_aliases<!cast<Instruction>(NAME # _D), ZPR64,
!if(is_col, TileVectorOpV64,
- TileVectorOpH64), imm0_1>;
-
- def : InstAlias<"mov\t$Zd, $Pg/m, $ZAn[$Rv]",
- (!cast<Instruction>(NAME # _Q) ZPR128:$Zd, PPR3bAny:$Pg,
- !if(is_col,
- TileVectorOpV128,
- TileVectorOpH128):$ZAn,
- MatrixIndexGPR32Op12_15:$Rv), 1>;
+ TileVectorOpH64), sme_elm_idx0_1>;
+ defm : sme_tile_to_vector_aliases<!cast<Instruction>(NAME # _Q), ZPR128,
+ !if(is_col, TileVectorOpV128,
+ TileVectorOpH128), sme_elm_idx0_0>;
}
multiclass sme_tile_to_vector<string mnemonic> {
@@ -736,57 +679,48 @@ multiclass sve2_clamp<string asm, bit U> {
def _D : sve2_clamp<asm, 0b11, U, ZPR64>;
}
-class sve2_int_perm_dup_p<string asm, PPRRegOp ppr_ty, Operand imm_ty>
- : I<(outs ppr_ty:$Pd), (ins PPRAny:$Pg, ppr_ty:$Pn,
- MatrixIndexGPR32Op12_15:$Rm, imm_ty:$imm),
- asm, "\t$Pd, $Pg/z, $Pn[$Rm, $imm]", "", []>,
+class sve2_int_perm_sel_p<string asm, PPRRegOp ppr_ty, Operand imm_ty>
+ : I<(outs PPRAny:$Pd), (ins PPRAny:$Pn, ppr_ty:$Pm,
+ MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm),
+ asm, "\t$Pd, $Pn, $Pm[$Rv, $imm]", "", []>,
Sched<[]> {
- bits<2> Rm;
- bits<4> Pg;
+ bits<2> Rv;
bits<4> Pn;
+ bits<4> Pm;
bits<4> Pd;
let Inst{31-24} = 0b00100101;
let Inst{21} = 0b1;
- let Inst{17-16} = Rm;
+ let Inst{17-16} = Rv;
let Inst{15-14} = 0b01;
- let Inst{13-10} = Pg;
+ let Inst{13-10} = Pn;
let Inst{9} = 0b0;
- let Inst{8-5} = Pn;
+ let Inst{8-5} = Pm;
let Inst{4} = 0b0;
let Inst{3-0} = Pd;
}
-multiclass sve2_int_perm_dup_p<string asm> {
- def _B : sve2_int_perm_dup_p<asm, PPR8, imm0_15> {
+multiclass sve2_int_perm_sel_p<string asm> {
+ def _B : sve2_int_perm_sel_p<asm, PPR8, sme_elm_idx0_15> {
bits<4> imm;
let Inst{23-22} = imm{3-2};
let Inst{20-19} = imm{1-0};
let Inst{18} = 0b1;
}
- def _H : sve2_int_perm_dup_p<asm, PPR16, imm0_7> {
+ def _H : sve2_int_perm_sel_p<asm, PPR16, sme_elm_idx0_7> {
bits<3> imm;
let Inst{23-22} = imm{2-1};
let Inst{20} = imm{0};
let Inst{19-18} = 0b10;
}
- def _S : sve2_int_perm_dup_p<asm, PPR32, imm0_3> {
+ def _S : sve2_int_perm_sel_p<asm, PPR32, sme_elm_idx0_3> {
bits<2> imm;
let Inst{23-22} = imm{1-0};
let Inst{20-18} = 0b100;
}
- def _D : sve2_int_perm_dup_p<asm, PPR64, imm0_1> {
+ def _D : sve2_int_perm_sel_p<asm, PPR64, sme_elm_idx0_1> {
bits<1> imm;
let Inst{23} = imm;
let Inst{22} = 0b1;
let Inst{20-18} = 0b000;
}
-
- def : InstAlias<"dup\t$Pd, $Pg/z, $Pn[$Rm]",
- (!cast<Instruction>(NAME # _B) PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, MatrixIndexGPR32Op12_15:$Rm, 0), 1>;
- def : InstAlias<"dup\t$Pd, $Pg/z, $Pn[$Rm]",
- (!cast<Instruction>(NAME # _H) PPR16:$Pd, PPRAny:$Pg, PPR16:$Pn, MatrixIndexGPR32Op12_15:$Rm, 0), 1>;
- def : InstAlias<"dup\t$Pd, $Pg/z, $Pn[$Rm]",
- (!cast<Instruction>(NAME # _S) PPR32:$Pd, PPRAny:$Pg, PPR32:$Pn, MatrixIndexGPR32Op12_15:$Rm, 0), 1>;
- def : InstAlias<"dup\t$Pd, $Pg/z, $Pn[$Rm]",
- (!cast<Instruction>(NAME # _D) PPR64:$Pd, PPRAny:$Pg, PPR64:$Pn, MatrixIndexGPR32Op12_15:$Rm, 0), 1>;
}
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 02d3a765a802..010ffa1502de 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -264,16 +264,22 @@ def sve_cnt_mul_imm : ComplexPattern<i32, 1, "SelectCntImm<1, 16, 1, false>">;
def sve_cnt_shl_imm : ComplexPattern<i32, 1, "SelectCntImm<1, 16, 1, true>">;
-def sve_ext_imm_0_1 : ComplexPattern<i32, 1, "SelectEXTImm<1, 8>">;
-def sve_ext_imm_0_3 : ComplexPattern<i32, 1, "SelectEXTImm<3, 4>">;
-def sve_ext_imm_0_7 : ComplexPattern<i32, 1, "SelectEXTImm<7, 2>">;
-def sve_ext_imm_0_15 : ComplexPattern<i32, 1, "SelectEXTImm<15, 1>">;
+def sve_ext_imm_0_31 : ComplexPattern<i32, 1, "SelectEXTImm<31, 8>">;
+def sve_ext_imm_0_63 : ComplexPattern<i32, 1, "SelectEXTImm<63, 4>">;
+def sve_ext_imm_0_127 : ComplexPattern<i32, 1, "SelectEXTImm<127, 2>">;
+def sve_ext_imm_0_255 : ComplexPattern<i32, 1, "SelectEXTImm<255, 1>">;
def int_aarch64_sve_cntp_oneuse : PatFrag<(ops node:$pred, node:$src2),
(int_aarch64_sve_cntp node:$pred, node:$src2), [{
return N->hasOneUse();
}]>;
+def step_vector_oneuse : PatFrag<(ops node:$idx),
+ (step_vector node:$idx), [{
+ return N->hasOneUse();
+}]>;
+
+
//===----------------------------------------------------------------------===//
// SVE PTrue - These are used extensively throughout the pattern matching so
// it's important we define them first.
@@ -321,7 +327,7 @@ multiclass sve_int_ptrue<bits<3> opc, string asm, SDPatternOperator op> {
def SDT_AArch64PTrue : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
def AArch64ptrue : SDNode<"AArch64ISD::PTRUE", SDT_AArch64PTrue>;
-let Predicates = [HasSVE] in {
+let Predicates = [HasSVEorStreamingSVE] in {
defm PTRUE : sve_int_ptrue<0b000, "ptrue", AArch64ptrue>;
defm PTRUES : sve_int_ptrue<0b001, "ptrues", null_frag>;
}
@@ -484,6 +490,21 @@ class SVE_Shift_DupImm_All_Active_Pat<ValueType vt, SDPatternOperator op,
: Pat<(vt (op (pt (SVEAllActive)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))),
(inst $Rn, i32:$imm)>;
+class SVE_2_Op_Fp_Imm_Pat<ValueType vt, SDPatternOperator op,
+ ValueType pt, ValueType it,
+ FPImmLeaf immL, int imm,
+ Instruction inst>
+: Pat<(vt (op (pt PPR_3b:$Pg), (vt ZPR:$Zs1), (vt (AArch64dup (it immL))))),
+ (inst $Pg, $Zs1, imm)>;
+
+class SVE_2_Op_Fp_Imm_Pat_Zero<ValueType vt, SDPatternOperator op,
+ ValueType pt, ValueType it,
+ FPImmLeaf immL, int imm,
+ Instruction inst>
+: Pat<(vt (op pt:$Pg, (vselect pt:$Pg, vt:$Zs1, (SVEDup0)),
+ (vt (AArch64dup (it immL))))),
+ (inst $Pg, $Zs1, imm)>;
+
//
// Pseudo -> Instruction mappings
//
@@ -621,6 +642,8 @@ class sve_int_pfirst_next<bits<2> sz8_64, bits<5> opc, string asm,
let Constraints = "$Pdn = $_Pdn";
let Defs = [NZCV];
+ let isPTestLike = 1;
+ let ElementSize = pprty.ElementSize;
}
multiclass sve_int_pfirst<bits<5> opc, string asm, SDPatternOperator op> {
@@ -912,13 +935,43 @@ class sve_int_pred_pattern_a<bits<3> opc, string asm>
let Constraints = "$Rdn = $_Rdn";
}
-multiclass sve_int_pred_pattern_a<bits<3> opc, string asm> {
- def NAME : sve_int_pred_pattern_a<opc, asm>;
+multiclass sve_int_pred_pattern_a<bits<3> opc, string asm,
+ SDPatternOperator op,
+ SDPatternOperator opcnt> {
+ let Predicates = [HasSVEorStreamingSVE] in {
+ def NAME : sve_int_pred_pattern_a<opc, asm>;
+
+ def : InstAlias<asm # "\t$Rdn, $pattern",
+ (!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, 1), 1>;
+ def : InstAlias<asm # "\t$Rdn",
+ (!cast<Instruction>(NAME) GPR64:$Rdn, 0b11111, 1), 2>;
+ }
- def : InstAlias<asm # "\t$Rdn, $pattern",
- (!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, 1), 1>;
- def : InstAlias<asm # "\t$Rdn",
- (!cast<Instruction>(NAME) GPR64:$Rdn, 0b11111, 1), 2>;
+ let Predicates = [HasSVEorStreamingSVE, UseScalarIncVL] in {
+ def : Pat<(i64 (op GPR64:$Rdn, (opcnt sve_pred_enum:$pattern))),
+ (!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, 1)>;
+
+ def : Pat<(i64 (op GPR64:$Rdn, (mul (opcnt sve_pred_enum:$pattern), (sve_cnt_mul_imm i32:$imm)))),
+ (!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, $imm)>;
+
+ def : Pat<(i64 (op GPR64:$Rdn, (shl (opcnt sve_pred_enum:$pattern), (i64 (sve_cnt_shl_imm i32:$imm))))),
+ (!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, $imm)>;
+
+ def : Pat<(i32 (op GPR32:$Rdn, (i32 (trunc (opcnt (sve_pred_enum:$pattern)))))),
+ (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ GPR32:$Rdn, sub_32), sve_pred_enum:$pattern, 1),
+ sub_32))>;
+
+ def : Pat<(i32 (op GPR32:$Rdn, (mul (i32 (trunc (opcnt (sve_pred_enum:$pattern)))), (sve_cnt_mul_imm i32:$imm)))),
+ (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ GPR32:$Rdn, sub_32), sve_pred_enum:$pattern, $imm),
+ sub_32))>;
+
+ def : Pat<(i32 (op GPR32:$Rdn, (shl (i32 (trunc (opcnt (sve_pred_enum:$pattern)))), (i64 (sve_cnt_shl_imm i32:$imm))))),
+ (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ GPR32:$Rdn, sub_32), sve_pred_enum:$pattern, $imm),
+ sub_32))>;
+ }
}
class sve_int_pred_pattern_b<bits<5> opc, string asm, RegisterOperand dt,
@@ -1270,10 +1323,15 @@ multiclass sve_int_perm_reverse_z<string asm, SDPatternOperator op> {
def : SVE_1_Op_Pat<nxv4i32, op, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Pat<nxv2i64, op, nxv2i64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Pat<nxv2f16, op, nxv2f16, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Pat<nxv4f16, op, nxv4f16, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Pat<nxv8f16, op, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Pat<nxv2f32, op, nxv2f32, !cast<Instruction>(NAME # _D)>;
def : SVE_1_Op_Pat<nxv4f32, op, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Pat<nxv2f64, op, nxv2f64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Pat<nxv2bf16, op, nxv2bf16, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Pat<nxv4bf16, op, nxv4bf16, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Pat<nxv8bf16, op, nxv8bf16, !cast<Instruction>(NAME # _H)>;
}
@@ -1707,10 +1765,19 @@ class sve_fp_2op_i_p_zds<bits<2> sz, bits<3> opc, string asm,
let ElementSize = zprty.ElementSize;
}
-multiclass sve_fp_2op_i_p_zds<bits<3> opc, string asm, Operand imm_ty> {
- def _H : sve_fp_2op_i_p_zds<0b01, opc, asm, ZPR16, imm_ty>;
- def _S : sve_fp_2op_i_p_zds<0b10, opc, asm, ZPR32, imm_ty>;
- def _D : sve_fp_2op_i_p_zds<0b11, opc, asm, ZPR64, imm_ty>;
+multiclass sve_fp_2op_i_p_zds<bits<3> opc, string asm, string Ps, Operand imm_ty, FPImmLeaf A, FPImmLeaf B, SDPatternOperator op> {
+ let DestructiveInstType = DestructiveBinaryImm in {
+ def _H : SVEPseudo2Instr<Ps # _H, 1>, sve_fp_2op_i_p_zds<0b01, opc, asm, ZPR16, imm_ty>;
+ def _S : SVEPseudo2Instr<Ps # _S, 1>, sve_fp_2op_i_p_zds<0b10, opc, asm, ZPR32, imm_ty>;
+ def _D : SVEPseudo2Instr<Ps # _D, 1>, sve_fp_2op_i_p_zds<0b11, opc, asm, ZPR64, imm_ty>;
+ }
+
+ def : SVE_2_Op_Fp_Imm_Pat<nxv8f16, op, nxv8i1, f16, A, 0, !cast<Instruction>(NAME # "_H")>;
+ def : SVE_2_Op_Fp_Imm_Pat<nxv8f16, op, nxv8i1, f16, B, 1, !cast<Instruction>(NAME # "_H")>;
+ def : SVE_2_Op_Fp_Imm_Pat<nxv4f32, op, nxv4i1, f32, A, 0, !cast<Instruction>(NAME # "_S")>;
+ def : SVE_2_Op_Fp_Imm_Pat<nxv4f32, op, nxv4i1, f32, B, 1, !cast<Instruction>(NAME # "_S")>;
+ def : SVE_2_Op_Fp_Imm_Pat<nxv2f64, op, nxv2i1, f64, A, 0, !cast<Instruction>(NAME # "_D")>;
+ def : SVE_2_Op_Fp_Imm_Pat<nxv2f64, op, nxv2i1, f64, B, 1, !cast<Instruction>(NAME # "_D")>;
}
class sve_fp_2op_p_zds<bits<2> sz, bits<4> opc, string asm,
@@ -1775,7 +1842,7 @@ multiclass sve_fp_2op_p_zds_zeroing_hsd<SDPatternOperator op> {
}
class sve_fp_ftmad<bits<2> sz, string asm, ZPRRegOp zprty>
-: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, imm32_0_7:$imm3),
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, timm32_0_7:$imm3),
asm, "\t$Zdn, $_Zdn, $Zm, $imm3",
"",
[]>, Sched<[]> {
@@ -1800,12 +1867,46 @@ multiclass sve_fp_ftmad<string asm, SDPatternOperator op> {
def _S : sve_fp_ftmad<0b10, asm, ZPR32>;
def _D : sve_fp_ftmad<0b11, asm, ZPR64>;
- def : Pat<(nxv8f16 (op (nxv8f16 ZPR16:$Zn), (nxv8f16 ZPR16:$Zm), (i32 imm32_0_7:$imm))),
- (!cast<Instruction>(NAME # _H) ZPR16:$Zn, ZPR16:$Zm, imm32_0_7:$imm)>;
- def : Pat<(nxv4f32 (op (nxv4f32 ZPR32:$Zn), (nxv4f32 ZPR32:$Zm), (i32 imm32_0_7:$imm))),
- (!cast<Instruction>(NAME # _S) ZPR32:$Zn, ZPR32:$Zm, imm32_0_7:$imm)>;
- def : Pat<(nxv2f64 (op (nxv2f64 ZPR64:$Zn), (nxv2f64 ZPR64:$Zm), (i32 imm32_0_7:$imm))),
- (!cast<Instruction>(NAME # _D) ZPR64:$Zn, ZPR64:$Zm, imm32_0_7:$imm)>;
+ def : Pat<(nxv8f16 (op (nxv8f16 ZPR16:$Zn), (nxv8f16 ZPR16:$Zm), (i32 timm32_0_7:$imm))),
+ (!cast<Instruction>(NAME # _H) ZPR16:$Zn, ZPR16:$Zm, timm32_0_7:$imm)>;
+ def : Pat<(nxv4f32 (op (nxv4f32 ZPR32:$Zn), (nxv4f32 ZPR32:$Zm), (i32 timm32_0_7:$imm))),
+ (!cast<Instruction>(NAME # _S) ZPR32:$Zn, ZPR32:$Zm, timm32_0_7:$imm)>;
+ def : Pat<(nxv2f64 (op (nxv2f64 ZPR64:$Zn), (nxv2f64 ZPR64:$Zm), (i32 timm32_0_7:$imm))),
+ (!cast<Instruction>(NAME # _D) ZPR64:$Zn, ZPR64:$Zm, timm32_0_7:$imm)>;
+}
+
+multiclass sve_fp_2op_i_p_zds_hfd<Operand imm_ty, FPImmLeaf A, FPImmLeaf B, SDPatternOperator ir_op = null_frag> {
+ def _UNDEF_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, imm_ty, FalseLanesUndef>;
+ def _UNDEF_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, imm_ty, FalseLanesUndef>;
+ def _UNDEF_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, imm_ty, FalseLanesUndef>;
+
+ def : SVE_2_Op_Fp_Imm_Pat<nxv8f16, ir_op, nxv8i1, f16, A, 0, !cast<Instruction>(NAME # "_UNDEF_H")>;
+ def : SVE_2_Op_Fp_Imm_Pat<nxv8f16, ir_op, nxv8i1, f16, B, 1, !cast<Instruction>(NAME # "_UNDEF_H")>;
+ def : SVE_2_Op_Fp_Imm_Pat<nxv4f16, ir_op, nxv4i1, f16, A, 0, !cast<Instruction>(NAME # "_UNDEF_H")>;
+ def : SVE_2_Op_Fp_Imm_Pat<nxv4f16, ir_op, nxv4i1, f16, B, 1, !cast<Instruction>(NAME # "_UNDEF_H")>;
+ def : SVE_2_Op_Fp_Imm_Pat<nxv2f16, ir_op, nxv2i1, f16, A, 0, !cast<Instruction>(NAME # "_UNDEF_H")>;
+ def : SVE_2_Op_Fp_Imm_Pat<nxv2f16, ir_op, nxv2i1, f16, B, 1, !cast<Instruction>(NAME # "_UNDEF_H")>;
+ def : SVE_2_Op_Fp_Imm_Pat<nxv4f32, ir_op, nxv4i1, f32, A, 0, !cast<Instruction>(NAME # "_UNDEF_S")>;
+ def : SVE_2_Op_Fp_Imm_Pat<nxv4f32, ir_op, nxv4i1, f32, B, 1, !cast<Instruction>(NAME # "_UNDEF_S")>;
+ def : SVE_2_Op_Fp_Imm_Pat<nxv2f32, ir_op, nxv2i1, f32, A, 0, !cast<Instruction>(NAME # "_UNDEF_S")>;
+ def : SVE_2_Op_Fp_Imm_Pat<nxv2f32, ir_op, nxv2i1, f32, B, 1, !cast<Instruction>(NAME # "_UNDEF_S")>;
+ def : SVE_2_Op_Fp_Imm_Pat<nxv2f64, ir_op, nxv2i1, f64, A, 0, !cast<Instruction>(NAME # "_UNDEF_D")>;
+ def : SVE_2_Op_Fp_Imm_Pat<nxv2f64, ir_op, nxv2i1, f64, B, 1, !cast<Instruction>(NAME # "_UNDEF_D")>;
+}
+
+multiclass sve_fp_2op_i_p_zds_zeroing_hfd<Operand imm_ty, FPImmLeaf A, FPImmLeaf B, SDPatternOperator op> {
+ def _ZERO_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, imm_ty, FalseLanesZero>;
+ def _ZERO_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, imm_ty, FalseLanesZero>;
+ def _ZERO_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, imm_ty, FalseLanesZero>;
+
+ let AddedComplexity = 2 in {
+ def : SVE_2_Op_Fp_Imm_Pat_Zero<nxv8f16, op, nxv8i1, f16, A, 0, !cast<Instruction>(NAME # "_ZERO_H")>;
+ def : SVE_2_Op_Fp_Imm_Pat_Zero<nxv8f16, op, nxv8i1, f16, B, 1, !cast<Instruction>(NAME # "_ZERO_H")>;
+ def : SVE_2_Op_Fp_Imm_Pat_Zero<nxv4f32, op, nxv4i1, f32, A, 0, !cast<Instruction>(NAME # "_ZERO_S")>;
+ def : SVE_2_Op_Fp_Imm_Pat_Zero<nxv4f32, op, nxv4i1, f32, B, 1, !cast<Instruction>(NAME # "_ZERO_S")>;
+ def : SVE_2_Op_Fp_Imm_Pat_Zero<nxv2f64, op, nxv2i1, f64, A, 0, !cast<Instruction>(NAME # "_ZERO_D")>;
+ def : SVE_2_Op_Fp_Imm_Pat_Zero<nxv2f64, op, nxv2i1, f64, B, 1, !cast<Instruction>(NAME # "_ZERO_D")>;
+ }
}
//===----------------------------------------------------------------------===//
@@ -1938,7 +2039,7 @@ multiclass sve_fp_3op_p_zds_b<bits<2> opc, string asm, SDPatternOperator op,
def : SVE_4_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve_fp_3op_p_zds_zx<SDPatternOperator op, SDPatternOperator rev_op> {
+multiclass sve_fp_3op_p_zds_zx {
def _UNDEF_H : PredThreeOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
def _UNDEF_S : PredThreeOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
def _UNDEF_D : PredThreeOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
@@ -2433,7 +2534,7 @@ class sve_fp_2op_p_zd<bits<7> opc, string asm, RegisterOperand i_zprtype,
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = DestructiveOther;
+ let DestructiveInstType = DestructiveUnaryPassthru;
let ElementSize = Sz;
}
@@ -2482,9 +2583,12 @@ multiclass sve_fp_2op_p_zdr<bits<7> opc, string asm,
}
multiclass sve_fp_2op_p_zd_HSD<bits<5> opc, string asm, SDPatternOperator op> {
- def _H : sve_fp_2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16, ElementSizeH>;
- def _S : sve_fp_2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32, ElementSizeS>;
- def _D : sve_fp_2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64, ElementSizeD>;
+ def _H : sve_fp_2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16, ElementSizeH>,
+ SVEPseudo2Instr<NAME # _H, 1>;
+ def _S : sve_fp_2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32, ElementSizeS>,
+ SVEPseudo2Instr<NAME # _S, 1>;
+ def _D : sve_fp_2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64, ElementSizeD>,
+ SVEPseudo2Instr<NAME # _D, 1>;
def : SVE_1_Op_Passthru_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Passthru_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>;
@@ -2492,6 +2596,17 @@ multiclass sve_fp_2op_p_zd_HSD<bits<5> opc, string asm, SDPatternOperator op> {
def : SVE_1_Op_Passthru_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Passthru_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Passthru_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
+
+ def _UNDEF_H : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
+ def _UNDEF_S : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
+ def _UNDEF_D : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _UNDEF_H)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _UNDEF_H)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv2f16, op, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _UNDEF_H)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _UNDEF_S)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _UNDEF_S)>;
+ defm : SVE_1_Op_PassthruUndef_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _UNDEF_D)>;
}
multiclass sve2_fp_flogb<string asm, SDPatternOperator op> {
@@ -4986,7 +5101,7 @@ class sve_int_index_ii<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zd;
}
-multiclass sve_int_index_ii<string asm, SDPatternOperator step_vector, SDPatternOperator step_vector_oneuse> {
+multiclass sve_int_index_ii<string asm> {
def _B : sve_int_index_ii<0b00, asm, ZPR8, simm5_8b>;
def _H : sve_int_index_ii<0b01, asm, ZPR16, simm5_16b>;
def _S : sve_int_index_ii<0b10, asm, ZPR32, simm5_32b>;
@@ -5029,7 +5144,7 @@ class sve_int_index_ir<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zd;
}
-multiclass sve_int_index_ir<string asm, SDPatternOperator step_vector, SDPatternOperator step_vector_oneuse, SDPatternOperator mulop, SDPatternOperator muloneuseop> {
+multiclass sve_int_index_ir<string asm, SDPatternOperator mulop, SDPatternOperator muloneuseop> {
def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_8b>;
def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_16b>;
def _S : sve_int_index_ir<0b10, asm, ZPR32, GPR32, simm5_32b>;
@@ -5096,7 +5211,7 @@ class sve_int_index_ri<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zd;
}
-multiclass sve_int_index_ri<string asm, SDPatternOperator step_vector, SDPatternOperator step_vector_oneuse> {
+multiclass sve_int_index_ri<string asm> {
def _B : sve_int_index_ri<0b00, asm, ZPR8, GPR32, simm5_8b>;
def _H : sve_int_index_ri<0b01, asm, ZPR16, GPR32, simm5_16b>;
def _S : sve_int_index_ri<0b10, asm, ZPR32, GPR32, simm5_32b>;
@@ -5130,7 +5245,7 @@ class sve_int_index_rr<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zd;
}
-multiclass sve_int_index_rr<string asm, SDPatternOperator step_vector, SDPatternOperator step_vector_oneuse, SDPatternOperator mulop> {
+multiclass sve_int_index_rr<string asm, SDPatternOperator mulop> {
def _B : sve_int_index_rr<0b00, asm, ZPR8, GPR32>;
def _H : sve_int_index_rr<0b01, asm, ZPR16, GPR32>;
def _S : sve_int_index_rr<0b10, asm, ZPR32, GPR32>;
@@ -8333,3 +8448,4 @@ multiclass sve_int_bin_pred_all_active_bhsd<SDPatternOperator op> {
def : SVE_2_Op_Pred_All_Active_Pt<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
def : SVE_2_Op_Pred_All_Active_Pt<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;
}
+
diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
index 79dcca8f8458..e72dccdc4b78 100644
--- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -1,9 +1,8 @@
//===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -59,6 +58,10 @@ private:
bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
SmallSetVector<IntrinsicInst *, 4> &PTrues);
bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
+ bool optimizePredicateStore(Instruction *I);
+ bool optimizePredicateLoad(Instruction *I);
+
+ bool optimizeInstructions(SmallSetVector<Function *, 4> &Functions);
/// Operates at the function-scope. I.e., optimizations are applied local to
/// the functions themselves.
@@ -276,11 +279,166 @@ bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls(
return Changed;
}
+// This is done in SVEIntrinsicOpts rather than InstCombine so that we introduce
+// scalable stores as late as possible
+bool SVEIntrinsicOpts::optimizePredicateStore(Instruction *I) {
+ auto *F = I->getFunction();
+ auto Attr = F->getFnAttribute(Attribute::VScaleRange);
+ if (!Attr.isValid())
+ return false;
+
+ unsigned MinVScale, MaxVScale;
+ std::tie(MinVScale, MaxVScale) = Attr.getVScaleRangeArgs();
+ // The transform needs to know the exact runtime length of scalable vectors
+ if (MinVScale != MaxVScale || MinVScale == 0)
+ return false;
+
+ auto *PredType =
+ ScalableVectorType::get(Type::getInt1Ty(I->getContext()), 16);
+ auto *FixedPredType =
+ FixedVectorType::get(Type::getInt8Ty(I->getContext()), MinVScale * 2);
+
+ // If we have a store..
+ auto *Store = dyn_cast<StoreInst>(I);
+ if (!Store || !Store->isSimple())
+ return false;
+
+  // ..that is storing a predicate vector's worth of bits..
+ if (Store->getOperand(0)->getType() != FixedPredType)
+ return false;
+
+ // ..where the value stored comes from a vector extract..
+ auto *IntrI = dyn_cast<IntrinsicInst>(Store->getOperand(0));
+ if (!IntrI ||
+ IntrI->getIntrinsicID() != Intrinsic::experimental_vector_extract)
+ return false;
+
+ // ..that is extracting from index 0..
+ if (!cast<ConstantInt>(IntrI->getOperand(1))->isZero())
+ return false;
+
+  // ..where the value being extracted from comes from a bitcast
+ auto *BitCast = dyn_cast<BitCastInst>(IntrI->getOperand(0));
+ if (!BitCast)
+ return false;
+
+ // ..and the bitcast is casting from predicate type
+ if (BitCast->getOperand(0)->getType() != PredType)
+ return false;
+
+ IRBuilder<> Builder(I->getContext());
+ Builder.SetInsertPoint(I);
+
+ auto *PtrBitCast = Builder.CreateBitCast(
+ Store->getPointerOperand(),
+ PredType->getPointerTo(Store->getPointerAddressSpace()));
+ Builder.CreateStore(BitCast->getOperand(0), PtrBitCast);
+
+ Store->eraseFromParent();
+ if (IntrI->getNumUses() == 0)
+ IntrI->eraseFromParent();
+ if (BitCast->getNumUses() == 0)
+ BitCast->eraseFromParent();
+
+ return true;
+}
+
+// This is done in SVEIntrinsicOpts rather than InstCombine so that we introduce
+// scalable loads as late as possible
+bool SVEIntrinsicOpts::optimizePredicateLoad(Instruction *I) {
+ auto *F = I->getFunction();
+ auto Attr = F->getFnAttribute(Attribute::VScaleRange);
+ if (!Attr.isValid())
+ return false;
+
+ unsigned MinVScale, MaxVScale;
+ std::tie(MinVScale, MaxVScale) = Attr.getVScaleRangeArgs();
+ // The transform needs to know the exact runtime length of scalable vectors
+ if (MinVScale != MaxVScale || MinVScale == 0)
+ return false;
+
+ auto *PredType =
+ ScalableVectorType::get(Type::getInt1Ty(I->getContext()), 16);
+ auto *FixedPredType =
+ FixedVectorType::get(Type::getInt8Ty(I->getContext()), MinVScale * 2);
+
+ // If we have a bitcast..
+ auto *BitCast = dyn_cast<BitCastInst>(I);
+ if (!BitCast || BitCast->getType() != PredType)
+ return false;
+
+ // ..whose operand is a vector_insert..
+ auto *IntrI = dyn_cast<IntrinsicInst>(BitCast->getOperand(0));
+ if (!IntrI ||
+ IntrI->getIntrinsicID() != Intrinsic::experimental_vector_insert)
+ return false;
+
+ // ..that is inserting into index zero of an undef vector..
+ if (!isa<UndefValue>(IntrI->getOperand(0)) ||
+ !cast<ConstantInt>(IntrI->getOperand(2))->isZero())
+ return false;
+
+ // ..where the value inserted comes from a load..
+ auto *Load = dyn_cast<LoadInst>(IntrI->getOperand(1));
+ if (!Load || !Load->isSimple())
+ return false;
+
+  // ..that is loading a predicate vector's worth of bits..
+ if (Load->getType() != FixedPredType)
+ return false;
+
+ IRBuilder<> Builder(I->getContext());
+ Builder.SetInsertPoint(Load);
+
+ auto *PtrBitCast = Builder.CreateBitCast(
+ Load->getPointerOperand(),
+ PredType->getPointerTo(Load->getPointerAddressSpace()));
+ auto *LoadPred = Builder.CreateLoad(PredType, PtrBitCast);
+
+ BitCast->replaceAllUsesWith(LoadPred);
+ BitCast->eraseFromParent();
+ if (IntrI->getNumUses() == 0)
+ IntrI->eraseFromParent();
+ if (Load->getNumUses() == 0)
+ Load->eraseFromParent();
+
+ return true;
+}
+
+bool SVEIntrinsicOpts::optimizeInstructions(
+ SmallSetVector<Function *, 4> &Functions) {
+ bool Changed = false;
+
+ for (auto *F : Functions) {
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();
+
+ // Traverse the DT with an rpo walk so we see defs before uses, allowing
+ // simplification to be done incrementally.
+ BasicBlock *Root = DT->getRoot();
+ ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
+ for (auto *BB : RPOT) {
+ for (Instruction &I : make_early_inc_range(*BB)) {
+ switch (I.getOpcode()) {
+ case Instruction::Store:
+ Changed |= optimizePredicateStore(&I);
+ break;
+ case Instruction::BitCast:
+ Changed |= optimizePredicateLoad(&I);
+ break;
+ }
+ }
+ }
+ }
+
+ return Changed;
+}
+
bool SVEIntrinsicOpts::optimizeFunctions(
SmallSetVector<Function *, 4> &Functions) {
bool Changed = false;
Changed |= optimizePTrueIntrinsicCalls(Functions);
+ Changed |= optimizeInstructions(Functions);
return Changed;
}
@@ -297,6 +455,8 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) {
continue;
switch (F.getIntrinsicID()) {
+ case Intrinsic::experimental_vector_extract:
+ case Intrinsic::experimental_vector_insert:
case Intrinsic::aarch64_sve_ptrue:
for (User *U : F.users())
Functions.insert(cast<Instruction>(U)->getFunction());
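Both new rewrites above bail out unless the function carries a vscale_range attribute with equal, non-zero bounds, because only then is the byte size of a <vscale x 16 x i1> predicate a compile-time constant. A numeric sketch of that guard (standalone C++; the two-bytes-per-vscale factor comes from the MinVScale * 2 element count used for FixedPredType above):

// Returns the fixed size in bytes of an SVE predicate register when the
// vscale range pins it down, or 0 when the rewrite must be skipped.
unsigned fixedPredicateBytes(unsigned MinVScale, unsigned MaxVScale) {
  if (MinVScale != MaxVScale || MinVScale == 0)
    return 0;            // runtime vector length unknown -> no rewrite
  return MinVScale * 2;  // 16 x i1 bits per vscale unit = 2 bytes per unit
}

// fixedPredicateBytes(2, 2)  -> 4  (vscale_range(2,2), i.e. 256-bit vectors)
// fixedPredicateBytes(1, 16) -> 0  (range is not exact)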
diff --git a/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
index a6796742117b..52c88fd0218d 100644
--- a/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
+++ b/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/AArch64TargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
Target &llvm::getTheAArch64leTarget() {
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index ce6866154242..caee2acd2606 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -106,6 +106,25 @@ inline static unsigned getXRegFromWReg(unsigned Reg) {
return Reg;
}
+inline static unsigned getXRegFromXRegTuple(unsigned RegTuple) {
+ switch (RegTuple) {
+ case AArch64::X0_X1_X2_X3_X4_X5_X6_X7: return AArch64::X0;
+ case AArch64::X2_X3_X4_X5_X6_X7_X8_X9: return AArch64::X2;
+ case AArch64::X4_X5_X6_X7_X8_X9_X10_X11: return AArch64::X4;
+ case AArch64::X6_X7_X8_X9_X10_X11_X12_X13: return AArch64::X6;
+ case AArch64::X8_X9_X10_X11_X12_X13_X14_X15: return AArch64::X8;
+ case AArch64::X10_X11_X12_X13_X14_X15_X16_X17: return AArch64::X10;
+ case AArch64::X12_X13_X14_X15_X16_X17_X18_X19: return AArch64::X12;
+ case AArch64::X14_X15_X16_X17_X18_X19_X20_X21: return AArch64::X14;
+ case AArch64::X16_X17_X18_X19_X20_X21_X22_X23: return AArch64::X16;
+ case AArch64::X18_X19_X20_X21_X22_X23_X24_X25: return AArch64::X18;
+ case AArch64::X20_X21_X22_X23_X24_X25_X26_X27: return AArch64::X20;
+ case AArch64::X22_X23_X24_X25_X26_X27_X28_FP: return AArch64::X22;
+ }
+ // For anything else, return it unchanged.
+ return RegTuple;
+}
+
static inline unsigned getBRegFromDReg(unsigned Reg) {
switch (Reg) {
case AArch64::D0: return AArch64::B0;
@@ -435,6 +454,60 @@ namespace AArch64SVEPredPattern {
#include "AArch64GenSystemOperands.inc"
}
+/// Return the number of active elements for the VL1 to VL256 predicate
+/// patterns, and zero for all other patterns.
+inline unsigned getNumElementsFromSVEPredPattern(unsigned Pattern) {
+ switch (Pattern) {
+ default:
+ return 0;
+ case AArch64SVEPredPattern::vl1:
+ case AArch64SVEPredPattern::vl2:
+ case AArch64SVEPredPattern::vl3:
+ case AArch64SVEPredPattern::vl4:
+ case AArch64SVEPredPattern::vl5:
+ case AArch64SVEPredPattern::vl6:
+ case AArch64SVEPredPattern::vl7:
+ case AArch64SVEPredPattern::vl8:
+ return Pattern;
+ case AArch64SVEPredPattern::vl16:
+ return 16;
+ case AArch64SVEPredPattern::vl32:
+ return 32;
+ case AArch64SVEPredPattern::vl64:
+ return 64;
+ case AArch64SVEPredPattern::vl128:
+ return 128;
+ case AArch64SVEPredPattern::vl256:
+ return 256;
+ }
+}
+
+/// Return the specific VL predicate pattern for the given number of elements.
+inline unsigned getSVEPredPatternFromNumElements(unsigned MinNumElts) {
+ switch (MinNumElts) {
+ default:
+ llvm_unreachable("unexpected element count for SVE predicate");
+ case 1:
+ return AArch64SVEPredPattern::vl1;
+ case 2:
+ return AArch64SVEPredPattern::vl2;
+ case 4:
+ return AArch64SVEPredPattern::vl4;
+ case 8:
+ return AArch64SVEPredPattern::vl8;
+ case 16:
+ return AArch64SVEPredPattern::vl16;
+ case 32:
+ return AArch64SVEPredPattern::vl32;
+ case 64:
+ return AArch64SVEPredPattern::vl64;
+ case 128:
+ return AArch64SVEPredPattern::vl128;
+ case 256:
+ return AArch64SVEPredPattern::vl256;
+ }
+}
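
A purely illustrative round-trip check for the two helpers added above; the harness is not part of the patch, assumes this header is included, and relies on the vl1 to vl8 pattern values equaling their element counts, as the first switch above assumes:

#include <cassert>

// For every element count that has a dedicated VLn pattern, mapping to the
// pattern and back should return the original count.
static void checkSVEPredPatternRoundTrip() {
  for (unsigned N : {1u, 2u, 4u, 8u, 16u, 32u, 64u, 128u, 256u}) {
    unsigned Pattern = getSVEPredPatternFromNumElements(N);
    assert(getNumElementsFromSVEPredPattern(Pattern) == N &&
           "each VLn pattern should encode exactly N active elements");
  }
}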
+
namespace AArch64ExactFPImm {
struct ExactFPImm {
const char *Name;
@@ -552,6 +625,7 @@ AArch64StringToVectorLayout(StringRef LayoutStr) {
namespace AArch64SysReg {
struct SysReg {
const char *Name;
+ const char *AltName;
unsigned Encoding;
bool Readable;
bool Writeable;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index ca088e63e03c..958e8c9e5bc5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -15,17 +15,7 @@
namespace llvm {
-class FunctionPass;
-class GCNTargetMachine;
-class ImmutablePass;
-class MachineFunctionPass;
-class ModulePass;
-class Pass;
-class Target;
class TargetMachine;
-class TargetOptions;
-class PassRegistry;
-class Module;
// GlobalISel passes
void initializeAMDGPUPreLegalizerCombinerPass(PassRegistry &);
@@ -35,16 +25,6 @@ FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone);
FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone);
void initializeAMDGPURegBankCombinerPass(PassRegistry &);
-// R600 Passes
-FunctionPass *createR600VectorRegMerger();
-FunctionPass *createR600ExpandSpecialInstrsPass();
-FunctionPass *createR600EmitClauseMarkers();
-FunctionPass *createR600ClauseMergePass();
-FunctionPass *createR600Packetizer();
-FunctionPass *createR600ControlFlowFinalizer();
-FunctionPass *createAMDGPUCFGStructurizerPass();
-FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel);
-
// SI Passes
FunctionPass *createGCNDPPCombinePass();
FunctionPass *createSIAnnotateControlFlowPass();
@@ -114,10 +94,23 @@ ModulePass *createAMDGPUFixFunctionBitcastsPass();
void initializeAMDGPUFixFunctionBitcastsPass(PassRegistry &);
extern char &AMDGPUFixFunctionBitcastsID;
+ModulePass *createAMDGPUCtorDtorLoweringPass();
+void initializeAMDGPUCtorDtorLoweringPass(PassRegistry &);
+extern char &AMDGPUCtorDtorLoweringID;
+
FunctionPass *createAMDGPULowerKernelArgumentsPass();
void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
extern char &AMDGPULowerKernelArgumentsID;
+FunctionPass *createAMDGPUPromoteKernelArgumentsPass();
+void initializeAMDGPUPromoteKernelArgumentsPass(PassRegistry &);
+extern char &AMDGPUPromoteKernelArgumentsID;
+
+struct AMDGPUPromoteKernelArgumentsPass
+ : PassInfoMixin<AMDGPUPromoteKernelArgumentsPass> {
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
ModulePass *createAMDGPULowerKernelAttributesPass();
void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
extern char &AMDGPULowerKernelAttributesID;
@@ -172,21 +165,6 @@ extern char &AMDGPURewriteOutArgumentsID;
void initializeGCNDPPCombinePass(PassRegistry &);
extern char &GCNDPPCombineID;
-void initializeR600ClauseMergePassPass(PassRegistry &);
-extern char &R600ClauseMergePassID;
-
-void initializeR600ControlFlowFinalizerPass(PassRegistry &);
-extern char &R600ControlFlowFinalizerID;
-
-void initializeR600ExpandSpecialInstrsPassPass(PassRegistry &);
-extern char &R600ExpandSpecialInstrsPassID;
-
-void initializeR600VectorRegMergerPass(PassRegistry &);
-extern char &R600VectorRegMergerID;
-
-void initializeR600PacketizerPass(PassRegistry &);
-extern char &R600PacketizerID;
-
void initializeSIFoldOperandsPass(PassRegistry &);
extern char &SIFoldOperandsID;
@@ -278,7 +256,6 @@ private:
bool GlobalOpt;
};
-ModulePass *createR600OpenCLImageTypeLoweringPass();
FunctionPass *createAMDGPUAnnotateUniformValues();
ModulePass *createAMDGPUPrintfRuntimeBinding();
@@ -390,9 +367,9 @@ namespace AMDGPUAS {
BUFFER_FAT_POINTER = 7, ///< Address space for 160-bit buffer fat pointers.
- /// Address space for direct addressible parameter memory (CONST0).
+ /// Address space for direct addressable parameter memory (CONST0).
PARAM_D_ADDRESS = 6,
- /// Address space for indirect addressible parameter memory (VTX1).
+ /// Address space for indirect addressable parameter memory (VTX1).
PARAM_I_ADDRESS = 7,
// Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 7991f3d2a6b2..e606f0e8fc3c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -18,7 +18,6 @@ def p4 : PtrValueType<i64, 4>;
def p5 : PtrValueType<i32, 5>;
def p6 : PtrValueType<i32, 6>;
-
class BoolToList<bit Value> {
list<int> ret = !if(Value, [1]<int>, []<int>);
}
@@ -416,7 +415,7 @@ def FeatureDPP : SubtargetFeature<"dpp",
"Support DPP (Data Parallel Primitives) extension"
>;
-// DPP8 allows arbitrary cross-lane swizzling withing groups of 8 lanes.
+// DPP8 allows arbitrary cross-lane swizzling within groups of 8 lanes.
def FeatureDPP8 : SubtargetFeature<"dpp8",
"HasDPP8",
"true",
@@ -1351,7 +1350,7 @@ def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">;
def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
AssemblerPredicate<(all_of FeatureGFX9Insts)>;
-def HasLDSFPAtomics : Predicate<"Subtarget->hasLDSFPAtomics()">,
+def HasLDSFPAtomicAdd : Predicate<"Subtarget->hasLDSFPAtomicAdd()">,
AssemblerPredicate<(all_of FeatureGFX8Insts)>;
def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index 88b88a04a7d1..dd3eb3849eac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -10,6 +10,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUAliasAnalysis.h"
+#include "AMDGPU.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
@@ -37,6 +38,10 @@ ImmutablePass *llvm::createAMDGPUExternalAAWrapperPass() {
return new AMDGPUExternalAAWrapper();
}
+AMDGPUAAWrapperPass::AMDGPUAAWrapperPass() : ImmutablePass(ID) {
+ initializeAMDGPUAAWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
}
@@ -94,7 +99,7 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
getUnderlyingObject(A.Ptr->stripPointerCastsForAliasAnalysis());
if (const LoadInst *LI = dyn_cast<LoadInst>(ObjA)) {
// If a generic pointer is loaded from the constant address space, it
- // could only be a GLOBAL or CONSTANT one as that address space is soley
+ // could only be a GLOBAL or CONSTANT one as that address space is solely
// prepared on the host side, where only GLOBAL or CONSTANT variables are
// visible. Note that this even holds for regular functions.
if (LI->getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
index 44de40d4aa7f..22be014813b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
@@ -12,13 +12,11 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H
-#include "AMDGPU.h"
#include "llvm/Analysis/AliasAnalysis.h"
namespace llvm {
class DataLayout;
-class MDNode;
class MemoryLocation;
/// A simple AA result that uses TBAA metadata to answer queries.
@@ -67,9 +65,7 @@ class AMDGPUAAWrapperPass : public ImmutablePass {
public:
static char ID;
- AMDGPUAAWrapperPass() : ImmutablePass(ID) {
- initializeAMDGPUAAWrapperPassPass(*PassRegistry::getPassRegistry());
- }
+ AMDGPUAAWrapperPass();
AMDGPUAAResult &getResult() { return *Result; }
const AMDGPUAAResult &getResult() const { return *Result; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index 2af9fc955875..2e24e9f929d2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -15,6 +15,7 @@
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -70,7 +71,7 @@ recursivelyVisitUsers(GlobalValue &GV,
// and just let us hit the error when we can't handle this.
//
// Unfortunately, clang adds noinline to all functions at -O0. We have
- // to override this here. until that's fixed.
+ // to override this here until that's fixed.
F->removeFnAttr(Attribute::NoInline);
FuncsToAlwaysInline.insert(F);
@@ -90,9 +91,13 @@ static bool alwaysInlineImpl(Module &M, bool GlobalOpt) {
SmallPtrSet<Function *, 8> FuncsToAlwaysInline;
SmallPtrSet<Function *, 8> FuncsToNoInline;
+ Triple TT(M.getTargetTriple());
for (GlobalAlias &A : M.aliases()) {
if (Function* F = dyn_cast<Function>(A.getAliasee())) {
+ if (TT.getArch() == Triple::amdgcn &&
+ A.getLinkage() != GlobalValue::InternalLinkage)
+ continue;
A.replaceAllUsesWith(F);
AliasesToRemove.push_back(&A);
}
@@ -122,7 +127,7 @@ static bool alwaysInlineImpl(Module &M, bool GlobalOpt) {
unsigned AS = GV.getAddressSpace();
if ((AS == AMDGPUAS::REGION_ADDRESS) ||
(AS == AMDGPUAS::LOCAL_ADDRESS &&
- !AMDGPUTargetMachine::EnableLowerModuleLDS))
+ (!AMDGPUTargetMachine::EnableLowerModuleLDS || !GV.hasInitializer())))
recursivelyVisitUsers(GV, FuncsToAlwaysInline);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index af6dfc07eb50..52791dfd9d93 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -6,8 +6,9 @@
//
//===----------------------------------------------------------------------===//
//
-/// \file This pass adds target attributes to functions which use intrinsics
-/// which will impact calling convention lowering.
+/// \file This pass propagates the uniform-work-group-size attribute from
+/// kernels to leaf functions when possible. It also adds additional attributes
+/// as hints for later ABI lowering optimizations.
//
//===----------------------------------------------------------------------===//
@@ -25,22 +26,11 @@
using namespace llvm;
namespace {
-static constexpr StringLiteral ImplicitAttrNames[] = {
- // X ids unnecessarily propagated to kernels.
- "amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
- "amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
- "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
- "amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
- "amdgpu-queue-ptr", "amdgpu-implicitarg-ptr"};
-
class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
const TargetMachine *TM = nullptr;
- SmallVector<CallGraphNode*, 8> NodeList;
bool addFeatureAttributes(Function &F);
- bool processUniformWorkGroupAttribute();
- bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
public:
static char ID;
@@ -58,12 +48,6 @@ public:
AU.setPreservesAll();
CallGraphSCCPass::getAnalysisUsage(AU);
}
-
- static bool visitConstantExpr(const ConstantExpr *CE);
- static bool visitConstantExprsRecursively(
- const Constant *EntryC,
- SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
- bool HasApertureRegs);
};
} // end anonymous namespace
@@ -75,212 +59,11 @@ char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
"Add AMDGPU function attributes", false, false)
-
-// The queue ptr is only needed when casting to flat, not from it.
-static bool castRequiresQueuePtr(unsigned SrcAS) {
- return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
-}
-
-static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
- return castRequiresQueuePtr(ASC->getSrcAddressSpace());
-}
-
-static bool isDSAddress(const Constant *C) {
- const GlobalValue *GV = dyn_cast<GlobalValue>(C);
- if (!GV)
- return false;
- unsigned AS = GV->getAddressSpace();
- return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
-}
-
-bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
- if (CE->getOpcode() == Instruction::AddrSpaceCast) {
- unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
- return castRequiresQueuePtr(SrcAS);
- }
-
- return false;
-}
-
-bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
- const Constant *EntryC,
- SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
- bool IsFunc, bool HasApertureRegs) {
-
- if (!ConstantExprVisited.insert(EntryC).second)
- return false;
-
- SmallVector<const Constant *, 16> Stack;
- Stack.push_back(EntryC);
-
- while (!Stack.empty()) {
- const Constant *C = Stack.pop_back_val();
-
- // We need to trap on DS globals in non-entry functions.
- if (IsFunc && isDSAddress(C))
- return true;
-
- // Check this constant expression.
- if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
- if (!HasApertureRegs && visitConstantExpr(CE))
- return true;
- }
-
- // Visit all sub-expressions.
- for (const Use &U : C->operands()) {
- const auto *OpC = dyn_cast<Constant>(U);
- if (!OpC)
- continue;
-
- if (!ConstantExprVisited.insert(OpC).second)
- continue;
-
- Stack.push_back(OpC);
- }
- }
-
- return false;
-}
-
-// We do not need to note the x workitem or workgroup id because they are always
-// initialized.
-//
-// TODO: We should not add the attributes if the known compile time workgroup
-// size is 1 for y/z.
-static StringRef intrinsicToAttrName(Intrinsic::ID ID,
- bool &NonKernelOnly,
- bool &IsQueuePtr) {
- switch (ID) {
- case Intrinsic::amdgcn_workitem_id_x:
- NonKernelOnly = true;
- return "amdgpu-work-item-id-x";
- case Intrinsic::amdgcn_workgroup_id_x:
- NonKernelOnly = true;
- return "amdgpu-work-group-id-x";
- case Intrinsic::amdgcn_workitem_id_y:
- case Intrinsic::r600_read_tidig_y:
- return "amdgpu-work-item-id-y";
- case Intrinsic::amdgcn_workitem_id_z:
- case Intrinsic::r600_read_tidig_z:
- return "amdgpu-work-item-id-z";
- case Intrinsic::amdgcn_workgroup_id_y:
- case Intrinsic::r600_read_tgid_y:
- return "amdgpu-work-group-id-y";
- case Intrinsic::amdgcn_workgroup_id_z:
- case Intrinsic::r600_read_tgid_z:
- return "amdgpu-work-group-id-z";
- case Intrinsic::amdgcn_dispatch_ptr:
- return "amdgpu-dispatch-ptr";
- case Intrinsic::amdgcn_dispatch_id:
- return "amdgpu-dispatch-id";
- case Intrinsic::amdgcn_kernarg_segment_ptr:
- return "amdgpu-kernarg-segment-ptr";
- case Intrinsic::amdgcn_implicitarg_ptr:
- return "amdgpu-implicitarg-ptr";
- case Intrinsic::amdgcn_queue_ptr:
- case Intrinsic::amdgcn_is_shared:
- case Intrinsic::amdgcn_is_private:
- // TODO: Does not require queue ptr on gfx9+
- case Intrinsic::trap:
- case Intrinsic::debugtrap:
- IsQueuePtr = true;
- return "amdgpu-queue-ptr";
- default:
- return "";
- }
-}
-
-static bool handleAttr(Function &Parent, const Function &Callee,
- StringRef Name) {
- if (Callee.hasFnAttribute(Name)) {
- Parent.addFnAttr(Name);
- return true;
- }
- return false;
-}
-
-static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
- bool &NeedQueuePtr) {
- if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
- NeedQueuePtr = true;
-
- for (StringRef AttrName : ImplicitAttrNames)
- handleAttr(Parent, Callee, AttrName);
-}
-
-bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
- bool Changed = false;
-
- for (auto *Node : reverse(NodeList)) {
- Function *Caller = Node->getFunction();
-
- for (auto I : *Node) {
- Function *Callee = std::get<1>(I)->getFunction();
- if (Callee)
- Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
- }
- }
-
- return Changed;
-}
-
-bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
- Function &Caller, Function &Callee) {
-
- // Check for externally defined function
- if (!Callee.hasExactDefinition()) {
- Callee.addFnAttr("uniform-work-group-size", "false");
- if (!Caller.hasFnAttribute("uniform-work-group-size"))
- Caller.addFnAttr("uniform-work-group-size", "false");
-
- return true;
- }
- // Check if the Caller has the attribute
- if (Caller.hasFnAttribute("uniform-work-group-size")) {
- // Check if the value of the attribute is true
- if (Caller.getFnAttribute("uniform-work-group-size")
- .getValueAsString().equals("true")) {
- // Propagate the attribute to the Callee, if it does not have it
- if (!Callee.hasFnAttribute("uniform-work-group-size")) {
- Callee.addFnAttr("uniform-work-group-size", "true");
- return true;
- }
- } else {
- Callee.addFnAttr("uniform-work-group-size", "false");
- return true;
- }
- } else {
- // If the attribute is absent, set it as false
- Caller.addFnAttr("uniform-work-group-size", "false");
- Callee.addFnAttr("uniform-work-group-size", "false");
- return true;
- }
- return false;
-}
-
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
- const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
- bool HasApertureRegs = ST.hasApertureRegs();
- SmallPtrSet<const Constant *, 8> ConstantExprVisited;
-
bool HaveStackObjects = false;
bool Changed = false;
- bool NeedQueuePtr = false;
bool HaveCall = false;
- bool HasIndirectCall = false;
bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
- CallingConv::ID CC = F.getCallingConv();
- bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
-
- // If this function hasAddressTaken() = true
- // then add all attributes corresponding to the implicit args.
- if (CallingConvSupportsAllImplicits &&
- F.hasAddressTaken(nullptr, true, true, true)) {
- for (StringRef AttrName : ImplicitAttrNames) {
- F.addFnAttr(AttrName);
- }
- Changed = true;
- }
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
@@ -293,65 +76,23 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
const Function *Callee =
dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
- // Note the occurence of indirect call.
+ // Note the occurrence of an indirect call.
if (!Callee) {
- if (!CB->isInlineAsm()) {
- HasIndirectCall = true;
+ if (!CB->isInlineAsm())
HaveCall = true;
- }
+
continue;
}
Intrinsic::ID IID = Callee->getIntrinsicID();
if (IID == Intrinsic::not_intrinsic) {
HaveCall = true;
- copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
Changed = true;
- } else {
- bool NonKernelOnly = false;
-
- if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
- F.addFnAttr("amdgpu-kernarg-segment-ptr");
- } else {
- StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
- NeedQueuePtr);
- if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
- F.addFnAttr(AttrName);
- Changed = true;
- }
- }
- }
- }
-
- if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
- continue;
-
- if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
- if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
- NeedQueuePtr = true;
- continue;
- }
- }
-
- for (const Use &U : I.operands()) {
- const auto *OpC = dyn_cast<Constant>(U);
- if (!OpC)
- continue;
-
- if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
- HasApertureRegs)) {
- NeedQueuePtr = true;
- break;
}
}
}
}
- if (NeedQueuePtr) {
- F.addFnAttr("amdgpu-queue-ptr");
- Changed = true;
- }
-
// TODO: We could refine this to captured pointers that could possibly be
// accessed by flat instructions. For now this is mostly a poor way of
// estimating whether there are calls before argument lowering.
@@ -365,28 +106,6 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
Changed = true;
}
- // This pass cannot copy attributes from callees to callers
- // if there is an indirect call and in thus such cases,
- // hasAddressTaken() would be false for kernels and functions
- // making an indirect call (if they are themselves not indirectly called).
- // We must tag all such kernels/functions with all implicits attributes
- // for correctness.
- // e.g.
- // 1. Kernel K1 makes an indirect call to function F1.
- // Without detecting an indirect call in K1, this pass will not
- // add all implicit args to K1 (which is incorrect).
- // 2. Kernel K1 makes direct call to F1 which makes indirect call to function
- // F2.
- // Without detecting an indirect call in F1 (whose hasAddressTaken() is
- // false), the pass will not add all implicit args to F1 (which is
- // essential for correctness).
- if (CallingConvSupportsAllImplicits && HasIndirectCall) {
- for (StringRef AttrName : ImplicitAttrNames) {
- F.addFnAttr(AttrName);
- }
- Changed = true;
- }
-
return Changed;
}
@@ -394,14 +113,6 @@ bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
bool Changed = false;
for (CallGraphNode *I : SCC) {
- // Build a list of CallGraphNodes from most number of uses to least
- if (I->getNumReferences())
- NodeList.push_back(I);
- else {
- processUniformWorkGroupAttribute();
- NodeList.clear();
- }
-
Function *F = I->getFunction();
// Ignore functions with graphics calling conventions, these are currently
// not allowed to have kernel arguments.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index cbc4ab212566..bb2e723f4ab0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -32,8 +32,8 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
@@ -678,7 +678,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
GCNSubtarget::MaxWaveScratchSize / STM.getWavefrontSize();
if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
- ProgInfo.ScratchSize, DS_Error);
+ ProgInfo.ScratchSize,
+ MaxScratchPerWorkitem, DS_Error);
MF.getFunction().getContext().diagnose(DiagStackSize);
}
@@ -697,11 +698,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
// This can happen due to a compiler bug or when using inline asm.
LLVMContext &Ctx = MF.getFunction().getContext();
- DiagnosticInfoResourceLimit Diag(MF.getFunction(),
- "addressable scalar registers",
- ProgInfo.NumSGPR, DS_Error,
- DK_ResourceLimit,
- MaxAddressableNumSGPRs);
+ DiagnosticInfoResourceLimit Diag(
+ MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR,
+ MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
Ctx.diagnose(Diag);
ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
}
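
The DiagnosticInfoResourceLimit calls in this patch all move the resource limit ahead of the severity and kind. A hedged sketch of the updated call shape, lifted from the replacement lines in these hunks; the helper name is made up:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

// Emit a resource-limit diagnostic with the argument order used by this
// patch: value first, then the limit, then severity and kind.
static void diagnoseSGPROveruse(const MachineFunction &MF, unsigned NumSGPR,
                                unsigned MaxSGPRs) {
  LLVMContext &Ctx = MF.getFunction().getContext();
  DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
                                   NumSGPR, MaxSGPRs, DS_Error,
                                   DK_ResourceLimit);
  Ctx.diagnose(Diag);
}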
@@ -717,18 +716,72 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
if (isShader(F.getCallingConv())) {
+ bool IsPixelShader =
+ F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
+
+ // Calculate the number of VGPR registers based on the SPI input registers
+ uint32_t InputEna = 0;
+ uint32_t InputAddr = 0;
+ unsigned LastEna = 0;
+
+ if (IsPixelShader) {
+ // Note for IsPixelShader:
+ // By this stage, all enabled inputs are tagged in InputAddr as well.
+ // We use InputAddr to determine whether an input counts against the VGPR
+ // total, and use InputEna only to determine the last relevant input; if
+ // extra arguments are used, we still have to honour InputAddr for any
+ // intermediate non-enabled inputs.
+ InputEna = MFI->getPSInputEnable();
+ InputAddr = MFI->getPSInputAddr();
+
+ // We only need to consider input args up to the last used arg.
+ assert((InputEna || InputAddr) &&
+ "PSInputAddr and PSInputEnable should "
+ "never both be 0 for AMDGPU_PS shaders");
+ // There are some rare circumstances where InputAddr is non-zero and
+ // InputEna can be set to 0. In this case we default to setting LastEna
+ // to 1.
+ LastEna = InputEna ? findLastSet(InputEna) + 1 : 1;
+ }
+
// FIXME: We should be using the number of registers determined during
// calling convention lowering to legalize the types.
const DataLayout &DL = F.getParent()->getDataLayout();
+ unsigned PSArgCount = 0;
+ unsigned IntermediateVGPR = 0;
for (auto &Arg : F.args()) {
unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
- if (Arg.hasAttribute(Attribute::InReg))
+ if (Arg.hasAttribute(Attribute::InReg)) {
WaveDispatchNumSGPR += NumRegs;
- else
- WaveDispatchNumVGPR += NumRegs;
+ } else {
+ // If this is a PS shader and we're processing the PS input args (the first
+ // 16 VGPRs), use the InputEna and InputAddr bits to define how many
+ // VGPRs are actually used.
+ // Any extra VGPR arguments are handled as normal arguments (and
+ // contribute to the VGPR count whether they're used or not).
+ if (IsPixelShader && PSArgCount < 16) {
+ if ((1 << PSArgCount) & InputAddr) {
+ if (PSArgCount < LastEna)
+ WaveDispatchNumVGPR += NumRegs;
+ else
+ IntermediateVGPR += NumRegs;
+ }
+ PSArgCount++;
+ } else {
+ // If there are extra arguments, we have to include the allocation for
+ // the unused (but enabled via InputAddr) input arguments.
+ if (IntermediateVGPR) {
+ WaveDispatchNumVGPR += IntermediateVGPR;
+ IntermediateVGPR = 0;
+ }
+ WaveDispatchNumVGPR += NumRegs;
+ }
+ }
}
ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
- ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
+ ProgInfo.NumArchVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
+ ProgInfo.NumVGPR =
+ Info.getTotalNumVGPRs(STM, Info.NumAGPR, ProgInfo.NumArchVGPR);
}
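
A worked restatement of the pixel-shader VGPR counting above, written as a standalone helper. The function and its ArgVGPRs parameter are illustrative, not part of the patch, and findLastSet is replaced with a compiler builtin to keep the sketch self-contained:

#include <cstdint>
#include <vector>

// Count the VGPRs a PS wave dispatch needs: the first 16 VGPR args are PS
// inputs and count only when allocated in InputAddr; inputs past the last
// enabled one are held back and added only if further VGPR arguments follow.
// ArgVGPRs[i] is the VGPR size of the i-th non-inreg argument.
static unsigned countPSWaveDispatchVGPRs(uint32_t InputEna, uint32_t InputAddr,
                                         const std::vector<unsigned> &ArgVGPRs) {
  unsigned LastEna = InputEna ? (32 - __builtin_clz(InputEna)) : 1;
  unsigned Total = 0, Intermediate = 0, PSArgCount = 0;
  for (unsigned NumRegs : ArgVGPRs) {
    if (PSArgCount < 16) {
      if ((1u << PSArgCount) & InputAddr) {
        if (PSArgCount < LastEna)
          Total += NumRegs;        // at or before the last enabled input
        else
          Intermediate += NumRegs; // allocated but past the last enabled input
      }
      ++PSArgCount;
    } else {
      Total += Intermediate;       // extra args force the held allocations in
      Intermediate = 0;
      Total += NumRegs;
    }
  }
  return Total;
}
// Example: InputAddr = 0b1011, InputEna = 0b0001, four 1-VGPR inputs and no
// extra arguments -> LastEna = 1, so only input 0 counts and the result is 1;
// inputs 1 and 3 would only be added back if arguments beyond the 16 PS
// inputs existed.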
// Adjust number of registers used to meet default/requested minimum/maximum
@@ -745,11 +798,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// This can happen due to a compiler bug or when using inline asm to use
// the registers which are usually reserved for vcc etc.
LLVMContext &Ctx = MF.getFunction().getContext();
- DiagnosticInfoResourceLimit Diag(MF.getFunction(),
- "scalar registers",
- ProgInfo.NumSGPR, DS_Error,
- DK_ResourceLimit,
- MaxAddressableNumSGPRs);
+ DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
+ ProgInfo.NumSGPR, MaxAddressableNumSGPRs,
+ DS_Error, DK_ResourceLimit);
Ctx.diagnose(Diag);
ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
@@ -766,14 +817,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
LLVMContext &Ctx = MF.getFunction().getContext();
DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
- MFI->getNumUserSGPRs(), DS_Error);
+ MFI->getNumUserSGPRs(),
+ STM.getMaxNumUserSGPRs(), DS_Error);
Ctx.diagnose(Diag);
}
if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
LLVMContext &Ctx = MF.getFunction().getContext();
DiagnosticInfoResourceLimit Diag(MF.getFunction(), "local memory",
- MFI->getLDSSize(), DS_Error);
+ MFI->getLDSSize(),
+ STM.getLocalMemorySize(), DS_Error);
Ctx.diagnose(Diag);
}
@@ -1039,6 +1092,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
// kernarg_segment_alignment is specified as log of the alignment.
// The minimum alignment is 16.
+ // FIXME: The metadata treats the minimum as 4?
Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index d3a555bc228f..d5c60aa3be7d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -26,7 +26,6 @@ struct AMDGPUResourceUsageAnalysis;
class AMDGPUTargetStreamer;
class MCCodeEmitter;
class MCOperand;
-class GCNSubtarget;
namespace AMDGPU {
namespace HSAMD {
@@ -55,9 +54,6 @@ private:
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo,
const MachineFunction &MF) const;
- void findNumUsedRegistersSI(const MachineFunction &MF,
- unsigned &NumSGPR,
- unsigned &NumVGPR) const;
/// Emit register usage information so that the GPU driver
/// can correctly setup the GPU state.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 3e9fdcb1618e..1e2cf3890d0a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -541,7 +541,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
if (NeedResult)
ExclScan = buildShiftRight(B, NewV, Identity);
- // Read the value from the last lane, which has accumlated the values of
+ // Read the value from the last lane, which has accumulated the values of
// each active lane in the wavefront. This will be our new value which we
// will provide to the atomic operation.
Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 61b1d22edc33..f0aadab3302f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -22,48 +22,71 @@
using namespace llvm;
-static constexpr StringLiteral ImplicitAttrNames[] = {
- // X ids unnecessarily propagated to kernels.
- "amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
- "amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
- "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
- "amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
- "amdgpu-queue-ptr", "amdgpu-implicitarg-ptr"};
+enum ImplicitArgumentMask {
+ NOT_IMPLICIT_INPUT = 0,
+
+ // SGPRs
+ DISPATCH_PTR = 1 << 0,
+ QUEUE_PTR = 1 << 1,
+ DISPATCH_ID = 1 << 2,
+ IMPLICIT_ARG_PTR = 1 << 3,
+ WORKGROUP_ID_X = 1 << 4,
+ WORKGROUP_ID_Y = 1 << 5,
+ WORKGROUP_ID_Z = 1 << 6,
+
+ // VGPRs:
+ WORKITEM_ID_X = 1 << 7,
+ WORKITEM_ID_Y = 1 << 8,
+ WORKITEM_ID_Z = 1 << 9,
+ ALL_ARGUMENT_MASK = (1 << 10) - 1
+};
+
+static constexpr std::pair<ImplicitArgumentMask,
+ StringLiteral> ImplicitAttrs[] = {
+ {DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
+ {QUEUE_PTR, "amdgpu-no-queue-ptr"},
+ {DISPATCH_ID, "amdgpu-no-dispatch-id"},
+ {IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
+ {WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
+ {WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
+ {WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
+ {WORKITEM_ID_X, "amdgpu-no-workitem-id-x"},
+ {WORKITEM_ID_Y, "amdgpu-no-workitem-id-y"},
+ {WORKITEM_ID_Z, "amdgpu-no-workitem-id-z"}
+};
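
The enum and table above use a negative encoding: each amdgpu-no-* attribute asserts that the corresponding implicit input is not needed, and the attributor below starts from ALL_ARGUMENT_MASK and removes bits as uses are found. A minimal sketch of mapping a surviving mask back to attribute names, mirroring what manifest() does later; the helper name is invented and the surrounding file's headers are assumed:

// Collect the amdgpu-no-* attribute names for every implicit input whose bit
// is still set in Mask, i.e. inputs known to be unused.
static SmallVector<StringRef, 10> attrsForKnownUnusedInputs(uint16_t Mask) {
  SmallVector<StringRef, 10> Names;
  for (const auto &KV : ImplicitAttrs)
    if (Mask & KV.first)
      Names.push_back(KV.second);
  return Names;
}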
// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
-static StringRef intrinsicToAttrName(Intrinsic::ID ID, bool &NonKernelOnly,
- bool &IsQueuePtr) {
+static ImplicitArgumentMask
+intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &IsQueuePtr) {
switch (ID) {
case Intrinsic::amdgcn_workitem_id_x:
NonKernelOnly = true;
- return "amdgpu-work-item-id-x";
+ return WORKITEM_ID_X;
case Intrinsic::amdgcn_workgroup_id_x:
NonKernelOnly = true;
- return "amdgpu-work-group-id-x";
+ return WORKGROUP_ID_X;
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::r600_read_tidig_y:
- return "amdgpu-work-item-id-y";
+ return WORKITEM_ID_Y;
case Intrinsic::amdgcn_workitem_id_z:
case Intrinsic::r600_read_tidig_z:
- return "amdgpu-work-item-id-z";
+ return WORKITEM_ID_Z;
case Intrinsic::amdgcn_workgroup_id_y:
case Intrinsic::r600_read_tgid_y:
- return "amdgpu-work-group-id-y";
+ return WORKGROUP_ID_Y;
case Intrinsic::amdgcn_workgroup_id_z:
case Intrinsic::r600_read_tgid_z:
- return "amdgpu-work-group-id-z";
+ return WORKGROUP_ID_Z;
case Intrinsic::amdgcn_dispatch_ptr:
- return "amdgpu-dispatch-ptr";
+ return DISPATCH_PTR;
case Intrinsic::amdgcn_dispatch_id:
- return "amdgpu-dispatch-id";
- case Intrinsic::amdgcn_kernarg_segment_ptr:
- return "amdgpu-kernarg-segment-ptr";
+ return DISPATCH_ID;
case Intrinsic::amdgcn_implicitarg_ptr:
- return "amdgpu-implicitarg-ptr";
+ return IMPLICIT_ARG_PTR;
case Intrinsic::amdgcn_queue_ptr:
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private:
@@ -71,9 +94,9 @@ static StringRef intrinsicToAttrName(Intrinsic::ID ID, bool &NonKernelOnly,
case Intrinsic::trap:
case Intrinsic::debugtrap:
IsQueuePtr = true;
- return "amdgpu-queue-ptr";
+ return QUEUE_PTR;
default:
- return "";
+ return NOT_IMPLICIT_INPUT;
}
}
@@ -89,6 +112,7 @@ static bool isDSAddress(const Constant *C) {
return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}
+namespace {
class AMDGPUInformationCache : public InformationCache {
public:
AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
@@ -105,6 +129,17 @@ public:
return ST.hasApertureRegs();
}
+ std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ return ST.getFlatWorkGroupSizes(F);
+ }
+
+ std::pair<unsigned, unsigned>
+ getMaximumFlatWorkGroupRange(const Function &F) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
+ }
+
private:
/// Check if the ConstantExpr \p CE requires queue ptr attribute.
static bool visitConstExpr(const ConstantExpr *CE) {
@@ -163,8 +198,11 @@ private:
DenseMap<const Constant *, uint8_t> ConstantStatus;
};
-struct AAAMDAttributes : public StateWrapper<BooleanState, AbstractAttribute> {
- using Base = StateWrapper<BooleanState, AbstractAttribute>;
+struct AAAMDAttributes : public StateWrapper<
+ BitIntegerState<uint16_t, ALL_ARGUMENT_MASK, 0>, AbstractAttribute> {
+ using Base = StateWrapper<BitIntegerState<uint16_t, ALL_ARGUMENT_MASK, 0>,
+ AbstractAttribute>;
+
AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
/// Create an abstract attribute view for the position \p IRP.
@@ -183,24 +221,24 @@ struct AAAMDAttributes : public StateWrapper<BooleanState, AbstractAttribute> {
return (AA->getIdAddr() == &ID);
}
- virtual const DenseSet<StringRef> &getAttributes() const = 0;
-
/// Unique ID (due to the unique address)
static const char ID;
};
const char AAAMDAttributes::ID = 0;
-struct AAAMDWorkGroupSize
+struct AAUniformWorkGroupSize
: public StateWrapper<BooleanState, AbstractAttribute> {
using Base = StateWrapper<BooleanState, AbstractAttribute>;
- AAAMDWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+ AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
/// Create an abstract attribute view for the position \p IRP.
- static AAAMDWorkGroupSize &createForPosition(const IRPosition &IRP,
- Attributor &A);
+ static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
+ Attributor &A);
/// See AbstractAttribute::getName().
- const std::string getName() const override { return "AAAMDWorkGroupSize"; }
+ const std::string getName() const override {
+ return "AAUniformWorkGroupSize";
+ }
/// See AbstractAttribute::getIdAddr().
const char *getIdAddr() const override { return &ID; }
@@ -214,11 +252,11 @@ struct AAAMDWorkGroupSize
/// Unique ID (due to the unique address)
static const char ID;
};
-const char AAAMDWorkGroupSize::ID = 0;
+const char AAUniformWorkGroupSize::ID = 0;
-struct AAAMDWorkGroupSizeFunction : public AAAMDWorkGroupSize {
- AAAMDWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
- : AAAMDWorkGroupSize(IRP, A) {}
+struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
+ AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
+ : AAUniformWorkGroupSize(IRP, A) {}
void initialize(Attributor &A) override {
Function *F = getAssociatedFunction();
@@ -244,10 +282,10 @@ struct AAAMDWorkGroupSizeFunction : public AAAMDWorkGroupSize {
auto CheckCallSite = [&](AbstractCallSite CS) {
Function *Caller = CS.getInstruction()->getFunction();
- LLVM_DEBUG(dbgs() << "[AAAMDWorkGroupSize] Call " << Caller->getName()
+ LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
<< "->" << getAssociatedFunction()->getName() << "\n");
- const auto &CallerInfo = A.getAAFor<AAAMDWorkGroupSize>(
+ const auto &CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
*this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
Change = Change | clampStateAndIndicateChange(this->getState(),
@@ -286,11 +324,13 @@ struct AAAMDWorkGroupSizeFunction : public AAAMDWorkGroupSize {
void trackStatistics() const override {}
};
-AAAMDWorkGroupSize &AAAMDWorkGroupSize::createForPosition(const IRPosition &IRP,
- Attributor &A) {
+AAUniformWorkGroupSize &
+AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
+ Attributor &A) {
if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
- return *new (A.Allocator) AAAMDWorkGroupSizeFunction(IRP, A);
- llvm_unreachable("AAAMDWorkGroupSize is only valid for function position");
+ return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
+ llvm_unreachable(
+ "AAUniformWorkGroupSize is only valid for function position");
}
struct AAAMDAttributesFunction : public AAAMDAttributes {
@@ -299,14 +339,13 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
void initialize(Attributor &A) override {
Function *F = getAssociatedFunction();
- CallingConv::ID CC = F->getCallingConv();
- bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
+ for (auto Attr : ImplicitAttrs) {
+ if (F->hasFnAttribute(Attr.second))
+ addKnownBits(Attr.first);
+ }
- // Don't add attributes to instrinsics
- if (F->isIntrinsic()) {
- indicatePessimisticFixpoint();
+ if (F->isDeclaration())
return;
- }
// Ignore functions with graphics calling conventions, these are currently
// not allowed to have kernel arguments.
@@ -314,94 +353,47 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
indicatePessimisticFixpoint();
return;
}
-
- for (StringRef Attr : ImplicitAttrNames) {
- if (F->hasFnAttribute(Attr))
- Attributes.insert(Attr);
- }
-
- // TODO: We shouldn't need this in the future.
- if (CallingConvSupportsAllImplicits &&
- F->hasAddressTaken(nullptr, true, true, true)) {
- for (StringRef AttrName : ImplicitAttrNames) {
- Attributes.insert(AttrName);
- }
- }
}
ChangeStatus updateImpl(Attributor &A) override {
Function *F = getAssociatedFunction();
- ChangeStatus Change = ChangeStatus::UNCHANGED;
- bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
- CallingConv::ID CC = F->getCallingConv();
- bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
- auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
-
- auto AddAttribute = [&](StringRef AttrName) {
- if (Attributes.insert(AttrName).second)
- Change = ChangeStatus::CHANGED;
- };
+ // The current assumed state used to determine a change.
+ auto OrigAssumed = getAssumed();
// Check for Intrinsics and propagate attributes.
const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
*this, this->getIRPosition(), DepClassTy::REQUIRED);
+ if (AAEdges.hasNonAsmUnknownCallee())
+ return indicatePessimisticFixpoint();
- // We have to assume that we can reach a function with these attributes.
- // We do not consider inline assembly as a unknown callee.
- if (CallingConvSupportsAllImplicits && AAEdges.hasNonAsmUnknownCallee()) {
- for (StringRef AttrName : ImplicitAttrNames) {
- AddAttribute(AttrName);
- }
- }
+ bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
+ auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
bool NeedsQueuePtr = false;
- bool HasCall = false;
+
for (Function *Callee : AAEdges.getOptimisticEdges()) {
Intrinsic::ID IID = Callee->getIntrinsicID();
- if (IID != Intrinsic::not_intrinsic) {
- if (!IsNonEntryFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
- AddAttribute("amdgpu-kernarg-segment-ptr");
- continue;
- }
-
- bool NonKernelOnly = false;
- StringRef AttrName =
- intrinsicToAttrName(IID, NonKernelOnly, NeedsQueuePtr);
-
- if (!AttrName.empty() && (IsNonEntryFunc || !NonKernelOnly))
- AddAttribute(AttrName);
-
+ if (IID == Intrinsic::not_intrinsic) {
+ const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>(
+ *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
+ *this &= AAAMD;
continue;
}
- HasCall = true;
- const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>(
- *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
- const DenseSet<StringRef> &CalleeAttributes = AAAMD.getAttributes();
- // Propagate implicit attributes from called function.
- for (StringRef AttrName : ImplicitAttrNames)
- if (CalleeAttributes.count(AttrName))
- AddAttribute(AttrName);
+ bool NonKernelOnly = false;
+ ImplicitArgumentMask AttrMask =
+ intrinsicToAttrMask(IID, NonKernelOnly, NeedsQueuePtr);
+ if (AttrMask != NOT_IMPLICIT_INPUT) {
+ if ((IsNonEntryFunc || !NonKernelOnly))
+ removeAssumedBits(AttrMask);
+ }
}
- HasCall |= AAEdges.hasUnknownCallee();
- if (!IsNonEntryFunc && HasCall)
- AddAttribute("amdgpu-calls");
-
- // Check the function body.
- auto CheckAlloca = [&](Instruction &I) {
- AddAttribute("amdgpu-stack-objects");
- return false;
- };
-
- bool UsedAssumedInformation = false;
- A.checkForAllInstructions(CheckAlloca, *this, {Instruction::Alloca},
- UsedAssumedInformation);
-
// If we found that we need amdgpu-queue-ptr, nothing else to do.
- if (NeedsQueuePtr || Attributes.count("amdgpu-queue-ptr")) {
- AddAttribute("amdgpu-queue-ptr");
- return Change;
+ if (NeedsQueuePtr) {
+ removeAssumedBits(QUEUE_PTR);
+ return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
+ ChangeStatus::UNCHANGED;
}
auto CheckAddrSpaceCasts = [&](Instruction &I) {
@@ -419,60 +411,68 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
// instructions, try it first.
// amdgpu-queue-ptr is not needed if aperture regs is present.
- if (!HasApertureRegs)
+ if (!HasApertureRegs) {
+ bool UsedAssumedInformation = false;
A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
{Instruction::AddrSpaceCast},
UsedAssumedInformation);
+ }
// If we found that we need amdgpu-queue-ptr, nothing else to do.
if (NeedsQueuePtr) {
- AddAttribute("amdgpu-queue-ptr");
- return Change;
+ removeAssumedBits(QUEUE_PTR);
+ return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
+ ChangeStatus::UNCHANGED;
}
- if (!IsNonEntryFunc && HasApertureRegs)
- return Change;
+ if (!IsNonEntryFunc && HasApertureRegs) {
+ return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
+ ChangeStatus::UNCHANGED;
+ }
for (BasicBlock &BB : *F) {
for (Instruction &I : BB) {
for (const Use &U : I.operands()) {
if (const auto *C = dyn_cast<Constant>(U)) {
if (InfoCache.needsQueuePtr(C, *F)) {
- AddAttribute("amdgpu-queue-ptr");
- return Change;
+ removeAssumedBits(QUEUE_PTR);
+ return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
+ ChangeStatus::UNCHANGED;
}
}
}
}
}
- return Change;
+ return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
+ ChangeStatus::UNCHANGED;
}
ChangeStatus manifest(Attributor &A) override {
SmallVector<Attribute, 8> AttrList;
LLVMContext &Ctx = getAssociatedFunction()->getContext();
- for (StringRef AttrName : Attributes)
- AttrList.push_back(Attribute::get(Ctx, AttrName));
+ for (auto Attr : ImplicitAttrs) {
+ if (isKnown(Attr.first))
+ AttrList.push_back(Attribute::get(Ctx, Attr.second));
+ }
return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
/* ForceReplace */ true);
}
const std::string getAsStr() const override {
- return "AMDInfo[" + std::to_string(Attributes.size()) + "]";
- }
-
- const DenseSet<StringRef> &getAttributes() const override {
- return Attributes;
+ std::string Str;
+ raw_string_ostream OS(Str);
+ OS << "AMDInfo[";
+ for (auto Attr : ImplicitAttrs)
+ OS << ' ' << Attr.second;
+ OS << " ]";
+ return OS.str();
}
/// See AbstractAttribute::trackStatistics()
void trackStatistics() const override {}
-
-private:
- DenseSet<StringRef> Attributes;
};
AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
@@ -482,6 +482,118 @@ AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
llvm_unreachable("AAAMDAttributes is only valid for function position");
}
+/// Propagate amdgpu-flat-work-group-size attribute.
+struct AAAMDFlatWorkGroupSize
+ : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
+ using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
+ AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, 32) {}
+
+ /// See AbstractAttribute::getState(...).
+ IntegerRangeState &getState() override { return *this; }
+ const IntegerRangeState &getState() const override { return *this; }
+
+ void initialize(Attributor &A) override {
+ Function *F = getAssociatedFunction();
+ auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+ unsigned MinGroupSize, MaxGroupSize;
+ std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
+ intersectKnown(
+ ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));
+ }
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ ChangeStatus Change = ChangeStatus::UNCHANGED;
+
+ auto CheckCallSite = [&](AbstractCallSite CS) {
+ Function *Caller = CS.getInstruction()->getFunction();
+ LLVM_DEBUG(dbgs() << "[AAAMDFlatWorkGroupSize] Call " << Caller->getName()
+ << "->" << getAssociatedFunction()->getName() << '\n');
+
+ const auto &CallerInfo = A.getAAFor<AAAMDFlatWorkGroupSize>(
+ *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
+
+ Change |=
+ clampStateAndIndicateChange(this->getState(), CallerInfo.getState());
+
+ return true;
+ };
+
+ bool AllCallSitesKnown = true;
+ if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
+ return indicatePessimisticFixpoint();
+
+ return Change;
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ SmallVector<Attribute, 8> AttrList;
+ Function *F = getAssociatedFunction();
+ LLVMContext &Ctx = F->getContext();
+
+ auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+ unsigned Min, Max;
+ std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);
+
+ // Don't add the attribute if it's the implied default.
+ if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
+ return ChangeStatus::UNCHANGED;
+
+ SmallString<10> Buffer;
+ raw_svector_ostream OS(Buffer);
+ OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
+
+ AttrList.push_back(
+ Attribute::get(Ctx, "amdgpu-flat-work-group-size", OS.str()));
+ return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
+ /* ForceReplace */ true);
+ }
+
+ const std::string getAsStr() const override {
+ std::string Str;
+ raw_string_ostream OS(Str);
+ OS << "AMDFlatWorkGroupSize[";
+ OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
+ OS << ']';
+ return OS.str();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+
+ /// Create an abstract attribute view for the position \p IRP.
+ static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
+ Attributor &A);
+
+ /// See AbstractAttribute::getName()
+ const std::string getName() const override {
+ return "AAAMDFlatWorkGroupSize";
+ }
+
+ /// See AbstractAttribute::getIdAddr()
+ const char *getIdAddr() const override { return &ID; }
+
+ /// This function should return true if the type of the \p AA is
+ /// AAAMDFlatWorkGroupSize
+ static bool classof(const AbstractAttribute *AA) {
+ return (AA->getIdAddr() == &ID);
+ }
+
+ /// Unique ID (due to the unique address)
+ static const char ID;
+};
+
+const char AAAMDFlatWorkGroupSize::ID = 0;
+
+AAAMDFlatWorkGroupSize &
+AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
+ Attributor &A) {
+ if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
+ return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
+ llvm_unreachable(
+ "AAAMDFlatWorkGroupSize is only valid for function position");
+}
+
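
The state above stores the inclusive flat-work-group-size bounds as a half-open ConstantRange, which is why initialize() passes Max + 1 and manifest() prints getUpper() - 1. A small standalone sketch of that encoding, assuming only ConstantRange and APInt; the helper names are invented:

#include <utility>
#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"

using namespace llvm;

// Encode the inclusive bounds [Min, Max] as the half-open range [Min, Max + 1)
// used by IntegerRangeState, and decode them back the way manifest() does.
static ConstantRange encodeFlatWorkGroupSize(unsigned Min, unsigned Max) {
  return ConstantRange(APInt(32, Min), APInt(32, Max + 1));
}

static std::pair<unsigned, unsigned>
decodeFlatWorkGroupSize(const ConstantRange &CR) {
  return {static_cast<unsigned>(CR.getLower().getZExtValue()),
          static_cast<unsigned>(CR.getUpper().getZExtValue() - 1)};
}
// e.g. a default [1, 1024] range round-trips through [1, 1025).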
class AMDGPUAttributor : public ModulePass {
public:
AMDGPUAttributor() : ModulePass(ID) {}
@@ -500,17 +612,28 @@ public:
bool runOnModule(Module &M) override {
SetVector<Function *> Functions;
AnalysisGetter AG;
- for (Function &F : M)
- Functions.insert(&F);
+ for (Function &F : M) {
+ if (!F.isIntrinsic())
+ Functions.insert(&F);
+ }
CallGraphUpdater CGUpdater;
BumpPtrAllocator Allocator;
AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
- Attributor A(Functions, InfoCache, CGUpdater);
+ DenseSet<const char *> Allowed(
+ {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
+ &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID});
+
+ Attributor A(Functions, InfoCache, CGUpdater, &Allowed);
for (Function &F : M) {
- A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
- A.getOrCreateAAFor<AAAMDWorkGroupSize>(IRPosition::function(F));
+ if (!F.isIntrinsic()) {
+ A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
+ A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
+ if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
+ A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
+ }
+ }
}
ChangeStatus Change = A.run();
@@ -521,6 +644,7 @@ public:
TargetMachine *TM;
static char ID;
};
+} // namespace
char AMDGPUAttributor::ID = 0;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index b9faad453aba..43928d7c2a09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -60,7 +60,7 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
}
void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign &VA) override {
+ CCValAssign VA) override {
Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
// If this is a scalar return, insert a readfirstlane just in case the value
@@ -103,7 +103,7 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
}
void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign &VA) override {
+ CCValAssign VA) override {
markPhysRegUsed(PhysReg);
if (VA.getLocVT().getSizeInBits() < 32) {
@@ -203,7 +203,7 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
}
void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign &VA) override {
+ CCValAssign VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
MIRBuilder.buildCopy(PhysReg, ExtReg);
@@ -236,7 +236,7 @@ AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
: CallLowering(&TLI) {
}
-// FIXME: Compatability shim
+// FIXME: Compatibility shim
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
switch (MIOpc) {
case TargetOpcode::G_SEXT:
@@ -355,14 +355,23 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
auto const &ST = MF.getSubtarget<GCNSubtarget>();
- unsigned ReturnOpc =
- IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;
+ unsigned ReturnOpc = 0;
+ if (IsShader)
+ ReturnOpc = AMDGPU::SI_RETURN_TO_EPILOG;
+ else if (CC == CallingConv::AMDGPU_Gfx)
+ ReturnOpc = AMDGPU::S_SETPC_B64_return_gfx;
+ else
+ ReturnOpc = AMDGPU::S_SETPC_B64_return;
auto Ret = B.buildInstrNoInsert(ReturnOpc);
Register ReturnAddrVReg;
if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
Ret.addUse(ReturnAddrVReg);
+ } else if (ReturnOpc == AMDGPU::S_SETPC_B64_return_gfx) {
+ ReturnAddrVReg =
+ MRI.createVirtualRegister(&AMDGPU::Gfx_CCR_SGPR_64RegClass);
+ Ret.addUse(ReturnAddrVReg);
}
if (!FLI.CanLowerReturn)
@@ -370,7 +379,8 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
else if (!lowerReturnVal(B, Val, VRegs, Ret))
return false;
- if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
+ if (ReturnOpc == AMDGPU::S_SETPC_B64_return ||
+ ReturnOpc == AMDGPU::S_SETPC_B64_return_gfx) {
const SIRegisterInfo *TRI = ST.getRegisterInfo();
Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
&AMDGPU::SGPR_64RegClass);
@@ -753,6 +763,11 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const {
MachineFunction &MF = MIRBuilder.getMF();
+ // If there's no call site, this doesn't correspond to a call from the IR and
+ // doesn't need implicit inputs.
+ if (!Info.CB)
+ return true;
+
const AMDGPUFunctionArgInfo *CalleeArgInfo
= &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
@@ -773,17 +788,32 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
};
+ static constexpr StringLiteral ImplicitAttrNames[] = {
+ "amdgpu-no-dispatch-ptr",
+ "amdgpu-no-queue-ptr",
+ "amdgpu-no-implicitarg-ptr",
+ "amdgpu-no-dispatch-id",
+ "amdgpu-no-workgroup-id-x",
+ "amdgpu-no-workgroup-id-y",
+ "amdgpu-no-workgroup-id-z"
+ };
+
MachineRegisterInfo &MRI = MF.getRegInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const AMDGPULegalizerInfo *LI
= static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());
+ unsigned I = 0;
for (auto InputID : InputRegs) {
const ArgDescriptor *OutgoingArg;
const TargetRegisterClass *ArgRC;
LLT ArgTy;
+ // If the callee is known not to use this implicit input, skip copying it.
+ if (Info.CB->hasFnAttr(ImplicitAttrNames[I++]))
+ continue;
+
std::tie(OutgoingArg, ArgRC, ArgTy) =
CalleeArgInfo->getPreloadedValue(InputID);
if (!OutgoingArg)
@@ -843,16 +873,22 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ);
const LLT S32 = LLT::scalar(32);
+ const bool NeedWorkItemIDX = !Info.CB->hasFnAttr("amdgpu-no-workitem-id-x");
+ const bool NeedWorkItemIDY = !Info.CB->hasFnAttr("amdgpu-no-workitem-id-y");
+ const bool NeedWorkItemIDZ = !Info.CB->hasFnAttr("amdgpu-no-workitem-id-z");
+
// If incoming ids are not packed we need to pack them.
// FIXME: Should consider known workgroup size to eliminate known 0 cases.
Register InputReg;
- if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) {
+ if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
+ NeedWorkItemIDX) {
InputReg = MRI.createGenericVirtualRegister(S32);
LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
}
- if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
+ if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
+ NeedWorkItemIDY) {
Register Y = MRI.createGenericVirtualRegister(S32);
LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY),
std::get<2>(WorkitemIDY));
@@ -861,7 +897,8 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
}
- if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
+ if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
+ NeedWorkItemIDZ) {
Register Z = MRI.createGenericVirtualRegister(S32);
LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ),
std::get<2>(WorkitemIDZ));
@@ -870,7 +907,7 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
}
- if (!InputReg) {
+ if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
InputReg = MRI.createGenericVirtualRegister(S32);
// Workitem ids are already packed, any of present incoming arguments will
@@ -883,7 +920,9 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
}
if (OutgoingArg->isRegister()) {
- ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
+ if (InputReg)
+ ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
+
if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
report_fatal_error("failed to allocate implicit input argument");
} else {
@@ -903,7 +942,9 @@ getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
bool IsTailCall) {
- return IsTailCall ? AMDGPU::SI_TCRETURN : AMDGPU::SI_CALL;
+ assert(!(IsIndirect && IsTailCall) && "Indirect calls can't be tail calls, "
+ "because the address can be divergent");
+ return IsTailCall ? AMDGPU::SI_TCRETURN : AMDGPU::G_SI_CALL;
}
// Add operands to call instruction to track the callee.
@@ -1033,6 +1074,11 @@ bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
if (!Info.IsTailCall)
return false;
+ // Indirect calls can't be tail calls, because the address can be divergent.
+ // TODO Check divergence info if the call really is divergent.
+ if (Info.Callee.isReg())
+ return false;
+
MachineFunction &MF = B.getMF();
const Function &CallerF = MF.getFunction();
CallingConv::ID CalleeCC = Info.CallConv;
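
For reference, the packed workitem-ID register that passSpecialInputs assembles corresponds to the standalone sketch below. It is only an illustration: packWorkItemIDs is a hypothetical helper, and the three 10-bit fields (X in bits [9:0], Y in [19:10], Z in [29:20]) are the assumed layout; in the patch the actual offsets come from the incoming ArgDescriptors.

#include <cassert>
#include <cstdint>

// Hypothetical helper mirroring the packed workitem-ID register that
// passSpecialInputs assembles: X in bits [9:0], Y in [19:10], Z in [29:20]
// (assumed layout; the real offsets come from the incoming ArgDescriptors).
static uint32_t packWorkItemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
  assert(X < 1024 && Y < 1024 && Z < 1024 && "each ID is a 10-bit field");
  return X | (Y << 10) | (Z << 20);
}

int main() {
  // A callee marked "amdgpu-no-workitem-id-y" and "-z" only needs X; the
  // unused fields stay zero and no extra loads are emitted for them.
  uint32_t Packed = packWorkItemIDs(/*X=*/5, /*Y=*/0, /*Z=*/0);
  assert((Packed & 0x3ff) == 5 && (Packed >> 10) == 0);
  return 0;
}
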
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 90b52395b76c..1682d43ae671 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -20,11 +20,13 @@ def CC_SI_Gfx : CallingConv<[
// 0-3 are reserved for the stack buffer descriptor
// 30-31 are reserved for the return address
// 32 is reserved for the stack pointer
+ // 33 is reserved for the frame pointer
+ // 34 is reserved for the base pointer
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
- SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29,
+ SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29
]>>>,
CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
@@ -41,17 +43,6 @@ def RetCC_SI_Gfx : CallingConv<[
CCIfType<[i1], CCPromoteToType<i32>>,
CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
- // 0-3 are reserved for the stack buffer descriptor
- // 32 is reserved for the stack pointer
- CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
- SGPR4, SGPR5, SGPR6, SGPR7,
- SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
- SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
- SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
- SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39,
- SGPR40, SGPR41, SGPR42, SGPR43
- ]>>>,
-
CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
@@ -139,14 +130,6 @@ def RetCC_SI_Shader : CallingConv<[
]>>
]>;
-def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs<
- (sequence "VGPR%u", 24, 255)
->;
-
-def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs<
- (sequence "VGPR%u", 32, 255)
->;
-
def CSR_AMDGPU_VGPRs : CalleeSavedRegs<
// The CSRs & scratch-registers are interleaved at a split boundary of 8.
(add (sequence "VGPR%u", 40, 47),
@@ -173,6 +156,14 @@ def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs<
(sequence "SGPR%u", 32, 105)
>;
+def CSR_AMDGPU_SI_Gfx_SGPRs_4_29 : CalleeSavedRegs<
+ (sequence "SGPR%u", 4, 29)
+>;
+
+def CSR_AMDGPU_SI_Gfx_SGPRs_64_105 : CalleeSavedRegs<
+ (sequence "SGPR%u", 64, 105)
+>;
+
// Just to get the regmask, not for calling convention purposes.
def CSR_AMDGPU_AllVGPRs : CalleeSavedRegs<
(sequence "VGPR%u", 0, 255)
@@ -198,6 +189,14 @@ def CSR_AMDGPU_HighRegs_With_AGPRs : CalleeSavedRegs<
(add CSR_AMDGPU_HighRegs, CSR_AMDGPU_AGPRs_32_255)
>;
+def CSR_AMDGPU_SI_Gfx : CalleeSavedRegs<
+ (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SI_Gfx_SGPRs_4_29, CSR_AMDGPU_SI_Gfx_SGPRs_64_105)
+>;
+
+def CSR_AMDGPU_SI_Gfx_With_AGPRs : CalleeSavedRegs<
+ (add CSR_AMDGPU_SI_Gfx, CSR_AMDGPU_AGPRs_32_255)
+>;
+
def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
// Calling convention for leaf functions
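
The new CSR_AMDGPU_SI_Gfx set marks SGPR4-SGPR29 and SGPR64-SGPR105 (plus the interleaved VGPR set) as callee-saved for the Gfx calling convention. The predicate below is a hypothetical illustration of those SGPR ranges only:

#include <cassert>

// Restates the SGPR ranges of CSR_AMDGPU_SI_Gfx (SGPR4-29 and SGPR64-105);
// the VGPR/AGPR components are defined separately above.
static bool isGfxCalleeSavedSGPR(unsigned N) {
  return (N >= 4 && N <= 29) || (N >= 64 && N <= 105);
}

int main() {
  assert(isGfxCalleeSavedSGPR(4) && isGfxCalleeSavedSGPR(105));
  // SGPR30/31 (return address) and SGPR32 (stack pointer) are not in the set.
  assert(!isGfxCalleeSavedSGPR(30) && !isGfxCalleeSavedSGPR(32));
  return 0;
}
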
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 60e79c2c6c2f..a55729586b8d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -148,11 +148,15 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// \returns True.
bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
-
+ /// \returns The minimum number of bits needed to store the value of \p Op as an
+ /// unsigned integer. Truncating to this size and then zero-extending to
+ /// ScalarSize will not change the value.
unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
+
+ /// \returns The minimum number of bits needed to store the value of \p Op as a
+ /// signed integer. Truncating to this size and then sign-extending to
+ /// ScalarSize will not change the value.
unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
- bool isI24(Value *V, unsigned ScalarSize) const;
- bool isU24(Value *V, unsigned ScalarSize) const;
/// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
/// SelectionDAG has an issue where an and asserting the bits are known
@@ -451,17 +455,7 @@ unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
unsigned ScalarSize) const {
// In order for this to be a signed 24-bit value, bit 23, must
// be a sign bit.
- return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC);
-}
-
-bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const {
- return ScalarSize >= 24 && // Types less than 24-bit should be treated
- // as unsigned 24-bit values.
- numBitsSigned(V, ScalarSize) < 24;
-}
-
-bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
- return numBitsUnsigned(V, ScalarSize) <= 24;
+ return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC) + 1;
}
static void extractValues(IRBuilder<> &Builder,
@@ -489,6 +483,34 @@ static Value *insertValues(IRBuilder<> &Builder,
return NewVal;
}
+// Returns 24-bit or 48-bit (as per `NumBits` and `Size`) mul of `LHS` and
+// `RHS`. `NumBits` is the maximum number of bits the result may need (from
+// known-bits analysis) and `Size` is the bit width of the original destination.
+static Value *getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS,
+ unsigned Size, unsigned NumBits, bool IsSigned) {
+ if (Size <= 32 || NumBits <= 32) {
+ Intrinsic::ID ID =
+ IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
+ return Builder.CreateIntrinsic(ID, {}, {LHS, RHS});
+ }
+
+ assert(NumBits <= 48);
+
+ Intrinsic::ID LoID =
+ IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
+ Intrinsic::ID HiID =
+ IsSigned ? Intrinsic::amdgcn_mulhi_i24 : Intrinsic::amdgcn_mulhi_u24;
+
+ Value *Lo = Builder.CreateIntrinsic(LoID, {}, {LHS, RHS});
+ Value *Hi = Builder.CreateIntrinsic(HiID, {}, {LHS, RHS});
+
+ IntegerType *I64Ty = Builder.getInt64Ty();
+ Lo = Builder.CreateZExtOrTrunc(Lo, I64Ty);
+ Hi = Builder.CreateZExtOrTrunc(Hi, I64Ty);
+
+ return Builder.CreateOr(Lo, Builder.CreateShl(Hi, 32));
+}
+
bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
if (I.getOpcode() != Instruction::Mul)
return false;
@@ -507,13 +529,17 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
IRBuilder<> Builder(&I);
Builder.SetCurrentDebugLocation(I.getDebugLoc());
- Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
+ unsigned LHSBits = 0, RHSBits = 0;
+ bool IsSigned = false;
+
+ if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS, Size)) <= 24 &&
+ (RHSBits = numBitsUnsigned(RHS, Size)) <= 24) {
+ IsSigned = false;
+
+ } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS, Size)) <= 24 &&
+ (RHSBits = numBitsSigned(RHS, Size)) <= 24) {
+ IsSigned = true;
- // TODO: Should this try to match mulhi24?
- if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
- IntrID = Intrinsic::amdgcn_mul_u24;
- } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
- IntrID = Intrinsic::amdgcn_mul_i24;
} else
return false;
@@ -523,27 +549,26 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
extractValues(Builder, LHSVals, LHS);
extractValues(Builder, RHSVals, RHS);
-
IntegerType *I32Ty = Builder.getInt32Ty();
- FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
for (int I = 0, E = LHSVals.size(); I != E; ++I) {
Value *LHS, *RHS;
- if (IntrID == Intrinsic::amdgcn_mul_u24) {
- LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
- RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
- } else {
+ if (IsSigned) {
LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
+ } else {
+ LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
+ RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
}
- Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});
+ Value *Result =
+ getMul24(Builder, LHS, RHS, Size, LHSBits + RHSBits, IsSigned);
- if (IntrID == Intrinsic::amdgcn_mul_u24) {
- ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
- LHSVals[I]->getType()));
+ if (IsSigned) {
+ ResultVals.push_back(
+ Builder.CreateSExtOrTrunc(Result, LHSVals[I]->getType()));
} else {
- ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
- LHSVals[I]->getType()));
+ ResultVals.push_back(
+ Builder.CreateZExtOrTrunc(Result, LHSVals[I]->getType()));
}
}
@@ -816,7 +841,7 @@ bool AMDGPUCodeGenPrepare::visitXor(BinaryOperator &I) {
if (!RHS || !IntrinsicCall || RHS->getSExtValue() != -1)
return visitBinaryOperator(I);
- // Check if the Call is an intrinsic intruction to amdgcn_class intrinsic
+ // Check if the Call is an intrinsic call to the amdgcn_class intrinsic which
// has only one use
if (IntrinsicCall->getIntrinsicID() != Intrinsic::amdgcn_class ||
!IntrinsicCall->hasOneUse())
@@ -1314,7 +1339,7 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
ConstantInt *Lower =
mdconst::extract<ConstantInt>(Range->getOperand(0));
- if (Lower->getValue().isNullValue()) {
+ if (Lower->isNullValue()) {
WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
} else {
Metadata *LowAndHigh[] = {
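
A minimal sketch of the arithmetic behind replaceMulWithMul24 and getMul24: numBitsUnsigned/numBitsSigned decide whether both operands fit in 24 bits, and the 48-bit path rebuilds the full product from a lo/hi pair of 24-bit multiplies. The helpers below are plain C++ stand-ins for ValueTracking and the amdgcn mul_u24/mulhi_u24 intrinsics, written only to illustrate the invariants the pass relies on:

#include <cassert>
#include <cstdint>

// Minimum number of bits needed to hold V as an unsigned integer
// (cf. numBitsUnsigned: bit width minus leading zero bits).
static unsigned minUnsignedBits(uint32_t V) {
  unsigned Bits = 0;
  while (V) {
    V >>= 1;
    ++Bits;
  }
  return Bits;
}

// Minimum number of bits needed to hold V as a signed integer
// (cf. the fixed numBitsSigned: bit width minus sign bits, plus one).
static unsigned minSignedBits(int32_t V) {
  uint32_t U = uint32_t(V), Sign = U >> 31;
  unsigned SignBits = 0;
  for (int Bit = 31; Bit >= 0 && ((U >> Bit) & 1) == Sign; --Bit)
    ++SignBits;
  return 32 - SignBits + 1;
}

// Stand-ins for llvm.amdgcn.mul.u24 / llvm.amdgcn.mulhi.u24: multiply the low
// 24 bits of each operand and return the low / high 32 bits of the product.
static uint32_t mul_u24(uint32_t A, uint32_t B) {
  return uint32_t(uint64_t(A & 0xffffff) * uint64_t(B & 0xffffff));
}
static uint32_t mulhi_u24(uint32_t A, uint32_t B) {
  return uint32_t((uint64_t(A & 0xffffff) * uint64_t(B & 0xffffff)) >> 32);
}

int main() {
  assert(minSignedBits(-5) == 4);        // -5 is 0b1011 in 4 signed bits
  assert(minUnsignedBits(0xABCDEF) == 24);

  // The 48-bit path of getMul24: zero-extend the lo/hi halves and recombine.
  uint32_t A = 0xABCDEF, B = 0x123456;   // both operands fit in 24 bits
  uint64_t Lo = mul_u24(A, B), Hi = mulhi_u24(A, B);
  assert((Lo | (Hi << 32)) == uint64_t(A) * uint64_t(B));
  return 0;
}
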
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index c6273adca50f..df2f9a0fa3a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -64,26 +64,36 @@ def remove_fcanonicalize : GICombineRule<
[{ return PostLegalizerHelper.matchRemoveFcanonicalize(*${fcanonicalize}, ${matchinfo}); }]),
(apply [{ Helper.replaceSingleDefInstWithReg(*${fcanonicalize}, ${matchinfo}); }])>;
+def foldable_fneg_matchdata : GIDefMatchData<"MachineInstr *">;
+
+def foldable_fneg : GICombineRule<
+ (defs root:$ffn, foldable_fneg_matchdata:$matchinfo),
+ (match (wip_match_opcode G_FNEG):$ffn,
+ [{ return Helper.matchFoldableFneg(*${ffn}, ${matchinfo}); }]),
+ (apply [{ Helper.applyFoldableFneg(*${ffn}, ${matchinfo}); }])>;
+
// Combines which should only apply on SI/VI
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
- "AMDGPUGenPreLegalizerCombinerHelper", [all_combines, clamp_i64_to_i16]> {
+ "AMDGPUGenPreLegalizerCombinerHelper",
+ [all_combines, clamp_i64_to_i16, foldable_fneg]> {
let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
let StateClass = "AMDGPUPreLegalizerCombinerHelperState";
+ let AdditionalArguments = [];
}
def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
"AMDGPUGenPostLegalizerCombinerHelper",
[all_combines, gfx6gfx7_combines,
- uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize]> {
+ uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg]> {
let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
let AdditionalArguments = [];
}
def AMDGPURegBankCombinerHelper : GICombinerHelper<
- "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3]> {
+ "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain]> {
let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
let StateClass = "AMDGPURegBankCombinerHelperState";
let AdditionalArguments = [];
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
new file mode 100644
index 000000000000..301e6f6d6f42
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
@@ -0,0 +1,382 @@
+//=== lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUCombinerHelper.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
+LLVM_READNONE
+static bool fnegFoldsIntoMI(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::G_FADD:
+ case AMDGPU::G_FSUB:
+ case AMDGPU::G_FMUL:
+ case AMDGPU::G_FMA:
+ case AMDGPU::G_FMAD:
+ case AMDGPU::G_FMINNUM:
+ case AMDGPU::G_FMAXNUM:
+ case AMDGPU::G_FMINNUM_IEEE:
+ case AMDGPU::G_FMAXNUM_IEEE:
+ case AMDGPU::G_FSIN:
+ case AMDGPU::G_FPEXT:
+ case AMDGPU::G_INTRINSIC_TRUNC:
+ case AMDGPU::G_FPTRUNC:
+ case AMDGPU::G_FRINT:
+ case AMDGPU::G_FNEARBYINT:
+ case AMDGPU::G_INTRINSIC_ROUND:
+ case AMDGPU::G_INTRINSIC_ROUNDEVEN:
+ case AMDGPU::G_FCANONICALIZE:
+ case AMDGPU::G_AMDGPU_RCP_IFLAG:
+ case AMDGPU::G_AMDGPU_FMIN_LEGACY:
+ case AMDGPU::G_AMDGPU_FMAX_LEGACY:
+ return true;
+ case AMDGPU::G_INTRINSIC: {
+ unsigned IntrinsicID = MI.getIntrinsicID();
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_sin:
+ case Intrinsic::amdgcn_fmul_legacy:
+ case Intrinsic::amdgcn_fmed3:
+ case Intrinsic::amdgcn_fma_legacy:
+ return true;
+ default:
+ return false;
+ }
+ }
+ default:
+ return false;
+ }
+}
+
+/// \returns true if the operation will definitely need to use a 64-bit
+/// encoding, and thus will use a VOP3 encoding regardless of the source
+/// modifiers.
+LLVM_READONLY
+static bool opMustUseVOP3Encoding(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) {
+ return MI.getNumOperands() >
+ (MI.getOpcode() == AMDGPU::G_INTRINSIC ? 4u : 3u) ||
+ MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
+}
+
+// Most FP instructions support source modifiers.
+LLVM_READONLY
+static bool hasSourceMods(const MachineInstr &MI) {
+ if (!MI.memoperands().empty())
+ return false;
+
+ switch (MI.getOpcode()) {
+ case AMDGPU::COPY:
+ case AMDGPU::G_SELECT:
+ case AMDGPU::G_FDIV:
+ case AMDGPU::G_FREM:
+ case TargetOpcode::INLINEASM:
+ case TargetOpcode::INLINEASM_BR:
+ case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
+ case AMDGPU::G_BITCAST:
+ case AMDGPU::G_ANYEXT:
+ case AMDGPU::G_BUILD_VECTOR:
+ case AMDGPU::G_BUILD_VECTOR_TRUNC:
+ case AMDGPU::G_PHI:
+ return false;
+ case AMDGPU::G_INTRINSIC: {
+ unsigned IntrinsicID = MI.getIntrinsicID();
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_interp_p1:
+ case Intrinsic::amdgcn_interp_p2:
+ case Intrinsic::amdgcn_interp_mov:
+ case Intrinsic::amdgcn_interp_p1_f16:
+ case Intrinsic::amdgcn_interp_p2_f16:
+ case Intrinsic::amdgcn_div_scale:
+ return false;
+ default:
+ return true;
+ }
+ }
+ default:
+ return true;
+ }
+}
+
+static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
+ unsigned CostThreshold = 4) {
+ // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
+ // it is truly free to use a source modifier in all cases. If there are
+ // multiple users and each one will necessitate using VOP3, there will be
+ // a code size increase. Try to avoid increasing code size unless we know it
+ // will save on the instruction count.
+ unsigned NumMayIncreaseSize = 0;
+ Register Dst = MI.getOperand(0).getReg();
+ for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {
+ if (!hasSourceMods(Use))
+ return false;
+
+ if (!opMustUseVOP3Encoding(Use, MRI)) {
+ if (++NumMayIncreaseSize > CostThreshold)
+ return false;
+ }
+ }
+ return true;
+}
+
+static bool mayIgnoreSignedZero(MachineInstr &MI) {
+ const TargetOptions &Options = MI.getMF()->getTarget().Options;
+ return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);
+}
+
+static bool isInv2Pi(const APFloat &APF) {
+ static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
+ static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
+ static const APFloat KF64(APFloat::IEEEdouble(),
+ APInt(64, 0x3fc45f306dc9c882));
+
+ return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||
+ APF.bitwiseIsEqual(KF64);
+}
+
+// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
+// additional cost to negate them.
+static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,
+ MachineRegisterInfo &MRI) {
+ Optional<FPValueAndVReg> FPValReg;
+ if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
+ if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())
+ return true;
+
+ const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();
+ if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))
+ return true;
+ }
+ return false;
+}
+
+static unsigned inverseMinMax(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::G_FMAXNUM:
+ return AMDGPU::G_FMINNUM;
+ case AMDGPU::G_FMINNUM:
+ return AMDGPU::G_FMAXNUM;
+ case AMDGPU::G_FMAXNUM_IEEE:
+ return AMDGPU::G_FMINNUM_IEEE;
+ case AMDGPU::G_FMINNUM_IEEE:
+ return AMDGPU::G_FMAXNUM_IEEE;
+ case AMDGPU::G_AMDGPU_FMAX_LEGACY:
+ return AMDGPU::G_AMDGPU_FMIN_LEGACY;
+ case AMDGPU::G_AMDGPU_FMIN_LEGACY:
+ return AMDGPU::G_AMDGPU_FMAX_LEGACY;
+ default:
+ llvm_unreachable("invalid min/max opcode");
+ }
+}
+
+bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
+ MachineInstr *&MatchInfo) {
+ Register Src = MI.getOperand(1).getReg();
+ MatchInfo = MRI.getVRegDef(Src);
+
+ // If the input has multiple uses and we can either fold the negate down, or
+ // the other uses cannot, give up. This both prevents unprofitable
+ // transformations and infinite loops: we won't repeatedly try to fold around
+ // a negate that has no 'good' form.
+ if (MRI.hasOneNonDBGUse(Src)) {
+ if (allUsesHaveSourceMods(MI, MRI, 0))
+ return false;
+ } else {
+ if (fnegFoldsIntoMI(*MatchInfo) &&
+ (allUsesHaveSourceMods(MI, MRI) ||
+ !allUsesHaveSourceMods(*MatchInfo, MRI)))
+ return false;
+ }
+
+ switch (MatchInfo->getOpcode()) {
+ case AMDGPU::G_FMINNUM:
+ case AMDGPU::G_FMAXNUM:
+ case AMDGPU::G_FMINNUM_IEEE:
+ case AMDGPU::G_FMAXNUM_IEEE:
+ case AMDGPU::G_AMDGPU_FMIN_LEGACY:
+ case AMDGPU::G_AMDGPU_FMAX_LEGACY:
+ // 0 doesn't have a negated inline immediate.
+ return !isConstantCostlierToNegate(*MatchInfo,
+ MatchInfo->getOperand(2).getReg(), MRI);
+ case AMDGPU::G_FADD:
+ case AMDGPU::G_FSUB:
+ case AMDGPU::G_FMA:
+ case AMDGPU::G_FMAD:
+ return mayIgnoreSignedZero(*MatchInfo);
+ case AMDGPU::G_FMUL:
+ case AMDGPU::G_FPEXT:
+ case AMDGPU::G_INTRINSIC_TRUNC:
+ case AMDGPU::G_FPTRUNC:
+ case AMDGPU::G_FRINT:
+ case AMDGPU::G_FNEARBYINT:
+ case AMDGPU::G_INTRINSIC_ROUND:
+ case AMDGPU::G_INTRINSIC_ROUNDEVEN:
+ case AMDGPU::G_FSIN:
+ case AMDGPU::G_FCANONICALIZE:
+ case AMDGPU::G_AMDGPU_RCP_IFLAG:
+ return true;
+ case AMDGPU::G_INTRINSIC: {
+ unsigned IntrinsicID = MatchInfo->getIntrinsicID();
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_sin:
+ case Intrinsic::amdgcn_fmul_legacy:
+ case Intrinsic::amdgcn_fmed3:
+ return true;
+ case Intrinsic::amdgcn_fma_legacy:
+ return mayIgnoreSignedZero(*MatchInfo);
+ default:
+ return false;
+ }
+ }
+ default:
+ return false;
+ }
+}
+
+void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
+ MachineInstr *&MatchInfo) {
+ // Transform:
+ // %A = inst %Op1, ...
+ // %B = fneg %A
+ //
+ // into:
+ //
+ // (if %A has one use, specifically fneg above)
+ // %B = inst (maybe fneg %Op1), ...
+ //
+ // (if %A has multiple uses)
+ // %B = inst (maybe fneg %Op1), ...
+ // %A = fneg %B
+
+ // Replace register in operand with a register holding negated value.
+ auto NegateOperand = [&](MachineOperand &Op) {
+ Register Reg = Op.getReg();
+ if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))
+ Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);
+ replaceRegOpWith(MRI, Op, Reg);
+ };
+
+ // Replace either register in operands with a register holding negated value.
+ auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {
+ Register XReg = X.getReg();
+ Register YReg = Y.getReg();
+ if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))
+ replaceRegOpWith(MRI, X, XReg);
+ else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))
+ replaceRegOpWith(MRI, Y, YReg);
+ else {
+ YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);
+ replaceRegOpWith(MRI, Y, YReg);
+ }
+ };
+
+ Builder.setInstrAndDebugLoc(*MatchInfo);
+
+ // Negate appropriate operands so that resulting value of MatchInfo is
+ // negated.
+ switch (MatchInfo->getOpcode()) {
+ case AMDGPU::G_FADD:
+ case AMDGPU::G_FSUB:
+ NegateOperand(MatchInfo->getOperand(1));
+ NegateOperand(MatchInfo->getOperand(2));
+ break;
+ case AMDGPU::G_FMUL:
+ NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
+ break;
+ case AMDGPU::G_FMINNUM:
+ case AMDGPU::G_FMAXNUM:
+ case AMDGPU::G_FMINNUM_IEEE:
+ case AMDGPU::G_FMAXNUM_IEEE:
+ case AMDGPU::G_AMDGPU_FMIN_LEGACY:
+ case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
+ NegateOperand(MatchInfo->getOperand(1));
+ NegateOperand(MatchInfo->getOperand(2));
+ unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());
+ replaceOpcodeWith(*MatchInfo, Opposite);
+ break;
+ }
+ case AMDGPU::G_FMA:
+ case AMDGPU::G_FMAD:
+ NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
+ NegateOperand(MatchInfo->getOperand(3));
+ break;
+ case AMDGPU::G_FPEXT:
+ case AMDGPU::G_INTRINSIC_TRUNC:
+ case AMDGPU::G_FRINT:
+ case AMDGPU::G_FNEARBYINT:
+ case AMDGPU::G_INTRINSIC_ROUND:
+ case AMDGPU::G_INTRINSIC_ROUNDEVEN:
+ case AMDGPU::G_FSIN:
+ case AMDGPU::G_FCANONICALIZE:
+ case AMDGPU::G_AMDGPU_RCP_IFLAG:
+ case AMDGPU::G_FPTRUNC:
+ NegateOperand(MatchInfo->getOperand(1));
+ break;
+ case AMDGPU::G_INTRINSIC: {
+ unsigned IntrinsicID = MatchInfo->getIntrinsicID();
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_sin:
+ NegateOperand(MatchInfo->getOperand(2));
+ break;
+ case Intrinsic::amdgcn_fmul_legacy:
+ NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
+ break;
+ case Intrinsic::amdgcn_fmed3:
+ NegateOperand(MatchInfo->getOperand(2));
+ NegateOperand(MatchInfo->getOperand(3));
+ NegateOperand(MatchInfo->getOperand(4));
+ break;
+ case Intrinsic::amdgcn_fma_legacy:
+ NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
+ NegateOperand(MatchInfo->getOperand(4));
+ break;
+ default:
+ llvm_unreachable("folding fneg not supported for this intrinsic");
+ }
+ break;
+ }
+ default:
+ llvm_unreachable("folding fneg not supported for this instruction");
+ }
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register MatchInfoDst = MatchInfo->getOperand(0).getReg();
+
+ if (MRI.hasOneNonDBGUse(MatchInfoDst)) {
+ // MatchInfo now has negated value so use that instead of old Dst.
+ replaceRegWith(MRI, Dst, MatchInfoDst);
+ } else {
+ // We want to swap all uses of Dst with uses of MatchInfoDst and vice versa
+ // but replaceRegWith will replace defs as well. It is easier to replace one
+ // def with a new register.
+ LLT Type = MRI.getType(Dst);
+ Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);
+ replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);
+
+ // MatchInfo now has negated value so use that instead of old Dst.
+ replaceRegWith(MRI, Dst, NegatedMatchInfo);
+
+ // Recreate the non-negated value for other uses of the old MatchInfoDst.
+ Builder.setInstrAndDebugLoc(MI);
+ Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
+ }
+
+ MI.eraseFromParent();
+ return;
+}
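
For the min/max cases above, the combine relies on the identity -min(a, b) == max(-a, -b) (and its mirror), which is why inverseMinMax swaps the opcode while both inputs are negated. A tiny sanity check in plain C++, ignoring the NaN and signed-zero subtleties that mayIgnoreSignedZero and the IEEE opcode variants exist to handle:

#include <algorithm>
#include <cassert>

int main() {
  float A = 2.5f, B = -7.0f;
  // fneg(fmin(A, B)) == fmax(fneg(A), fneg(B)), and symmetrically for fmax;
  // the combine rewrites the opcode via inverseMinMax and negates both inputs.
  assert(-std::min(A, B) == std::max(-A, -B));
  assert(-std::max(A, B) == std::min(-A, -B));
  return 0;
}
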
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h
new file mode 100644
index 000000000000..1d4747136bf7
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h
@@ -0,0 +1,26 @@
+//=== lib/Target/AMDGPU/AMDGPUCombinerHelper.h ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This contains common combine transformations that may be used in a combine
+/// pass.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+
+using namespace llvm;
+
+class AMDGPUCombinerHelper : public CombinerHelper {
+public:
+ using CombinerHelper::CombinerHelper;
+
+ bool matchFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo);
+ void applyFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo);
+};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp
new file mode 100644
index 000000000000..04bf623bfa46
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp
@@ -0,0 +1,95 @@
+//===-- AMDGPUCtorDtorLowering.cpp - Handle global ctors and dtors --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This pass creates a unified init and fini kernel with the required metadata.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-lower-ctor-dtor"
+
+namespace {
+class AMDGPUCtorDtorLowering final : public ModulePass {
+ bool runOnModule(Module &M) override;
+
+public:
+ Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
+ StringRef InitOrFiniKernelName = "amdgcn.device.init";
+ if (!IsCtor)
+ InitOrFiniKernelName = "amdgcn.device.fini";
+
+ Function *InitOrFiniKernel = Function::createWithDefaultAttr(
+ FunctionType::get(Type::getVoidTy(M.getContext()), false),
+ GlobalValue::ExternalLinkage, 0, InitOrFiniKernelName, &M);
+ BasicBlock *InitOrFiniKernelBB =
+ BasicBlock::Create(M.getContext(), "", InitOrFiniKernel);
+ ReturnInst::Create(M.getContext(), InitOrFiniKernelBB);
+
+ InitOrFiniKernel->setCallingConv(CallingConv::AMDGPU_KERNEL);
+ if (IsCtor)
+ InitOrFiniKernel->addFnAttr("device-init");
+ else
+ InitOrFiniKernel->addFnAttr("device-fini");
+ return InitOrFiniKernel;
+ }
+
+ bool createInitOrFiniKernel(Module &M, GlobalVariable *GV, bool IsCtor) {
+ if (!GV)
+ return false;
+ ConstantArray *GA = dyn_cast<ConstantArray>(GV->getInitializer());
+ if (!GA || GA->getNumOperands() == 0)
+ return false;
+ Function *InitOrFiniKernel = createInitOrFiniKernelFunction(M, IsCtor);
+ IRBuilder<> IRB(InitOrFiniKernel->getEntryBlock().getTerminator());
+ for (Value *V : GA->operands()) {
+ auto *CS = cast<ConstantStruct>(V);
+ if (Function *F = dyn_cast<Function>(CS->getOperand(1))) {
+ FunctionCallee Ctor =
+ M.getOrInsertFunction(F->getName(), IRB.getVoidTy());
+ IRB.CreateCall(Ctor);
+ }
+ }
+ appendToUsed(M, {InitOrFiniKernel});
+ return true;
+ }
+
+ static char ID;
+ AMDGPUCtorDtorLowering() : ModulePass(ID) {}
+};
+} // End anonymous namespace
+
+char AMDGPUCtorDtorLowering::ID = 0;
+char &llvm::AMDGPUCtorDtorLoweringID = AMDGPUCtorDtorLowering::ID;
+INITIALIZE_PASS(AMDGPUCtorDtorLowering, DEBUG_TYPE,
+ "Lower ctors and dtors for AMDGPU", false, false)
+
+ModulePass *llvm::createAMDGPUCtorDtorLoweringPass() {
+ return new AMDGPUCtorDtorLowering();
+}
+
+bool AMDGPUCtorDtorLowering::runOnModule(Module &M) {
+ bool Modified = false;
+ Modified |=
+ createInitOrFiniKernel(M, M.getGlobalVariable("llvm.global_ctors"),
+ /*IsCtor =*/true);
+ Modified |=
+ createInitOrFiniKernel(M, M.getGlobalVariable("llvm.global_dtors"),
+ /*IsCtor =*/false);
+ return Modified;
+}
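
Conceptually, the pass collapses the module's llvm.global_ctors / llvm.global_dtors arrays into single amdgcn.device.init / amdgcn.device.fini kernels that call each entry in order, ignoring the priority and associated-data fields. A rough host-side analogue with plain function pointers (ctorA/ctorB/deviceInit are illustrative names only):

#include <cstdio>

// Illustrative stand-ins for two constructors listed in llvm.global_ctors.
static void ctorA() { std::puts("ctorA"); }
static void ctorB() { std::puts("ctorB"); }

// The generated amdgcn.device.init kernel calls each constructor in turn;
// the priority and associated-data fields of each entry are ignored, as in
// createInitOrFiniKernel above.
static void (*const GlobalCtors[])() = {ctorA, ctorB};

static void deviceInit() {
  for (void (*Ctor)() : GlobalCtors)
    Ctor();
}

int main() {
  deviceInit();
  return 0;
}
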
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
index db00f8f711a3..3533087bbfd1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -25,7 +25,6 @@ class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
"The size of local memory in bytes"
>;
-def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 521c8f261a00..12cef2774aaf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -159,6 +159,7 @@ def : GINodeEquiv<G_ATOMICRMW_UMAX, atomic_load_umax_glue>;
def : GINodeEquiv<G_ATOMICRMW_FADD, atomic_load_fadd_glue>;
def : GINodeEquiv<G_AMDGPU_FFBH_U32, AMDGPUffbh_u32_impl>;
+def : GINodeEquiv<G_AMDGPU_FFBL_B32, AMDGPUffbl_b32_impl>;
def : GINodeEquiv<G_AMDGPU_FMIN_LEGACY, AMDGPUfmin_legacy>;
def : GINodeEquiv<G_AMDGPU_FMAX_LEGACY, AMDGPUfmax_legacy>;
def : GINodeEquiv<G_AMDGPU_RCP_IFLAG, AMDGPUrcp_iflag>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 8eeda7b67b73..b9c59f4c615a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -113,7 +113,7 @@ MetadataStreamerV2::getAddressSpaceQualifier(
ValueKind MetadataStreamerV2::getValueKind(Type *Ty, StringRef TypeQual,
StringRef BaseTypeName) const {
- if (TypeQual.find("pipe") != StringRef::npos)
+ if (TypeQual.contains("pipe"))
return ValueKind::Pipe;
return StringSwitch<ValueKind>(BaseTypeName)
@@ -201,10 +201,11 @@ MetadataStreamerV2::getHSACodeProps(const MachineFunction &MF,
Align MaxKernArgAlign;
HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F,
MaxKernArgAlign);
+ HSACodeProps.mKernargSegmentAlign =
+ std::max(MaxKernArgAlign, Align(4)).value();
+
HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize;
HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize;
- HSACodeProps.mKernargSegmentAlign =
- std::max(MaxKernArgAlign, Align(4)).value();
HSACodeProps.mWavefrontSize = STM.getWavefrontSize();
HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR;
HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR;
@@ -533,7 +534,7 @@ MetadataStreamerV3::getAddressSpaceQualifier(unsigned AddressSpace) const {
StringRef MetadataStreamerV3::getValueKind(Type *Ty, StringRef TypeQual,
StringRef BaseTypeName) const {
- if (TypeQual.find("pipe") != StringRef::npos)
+ if (TypeQual.contains("pipe"))
return "pipe";
return StringSwitch<StringRef>(BaseTypeName)
@@ -665,6 +666,10 @@ void MetadataStreamerV3::emitKernelAttrs(const Function &Func,
Func.getFnAttribute("runtime-handle").getValueAsString().str(),
/*Copy=*/true);
}
+ if (Func.hasFnAttribute("device-init"))
+ Kern[".kind"] = Kern.getDocument()->getNode("init");
+ else if (Func.hasFnAttribute("device-fini"))
+ Kern[".kind"] = Kern.getDocument()->getNode("fini");
}
void MetadataStreamerV3::emitKernelArgs(const Function &Func,
@@ -794,7 +799,8 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func,
if (!HiddenArgNumBytes)
return;
- auto &DL = Func.getParent()->getDataLayout();
+ const Module *M = Func.getParent();
+ auto &DL = M->getDataLayout();
auto Int64Ty = Type::getInt64Ty(Func.getContext());
if (HiddenArgNumBytes >= 8)
@@ -810,16 +816,16 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func,
auto Int8PtrTy =
Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
- // Emit "printf buffer" argument if printf is used, otherwise emit dummy
- // "none" argument.
+ // Emit "printf buffer" argument if printf is used, emit "hostcall buffer"
+ // if "hostcall" module flag is set, otherwise emit dummy "none" argument.
if (HiddenArgNumBytes >= 32) {
- if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
+ if (M->getNamedMetadata("llvm.printf.fmts"))
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
Args);
- else if (Func.getParent()->getFunction("__ockl_hostcall_internal")) {
+ else if (M->getModuleFlag("amdgpu_hostcall")) {
// The printf runtime binding pass should have ensured that hostcall and
// printf are not used in the same module.
- assert(!Func.getParent()->getNamedMetadata("llvm.printf.fmts"));
+ assert(!M->getNamedMetadata("llvm.printf.fmts"));
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
Args);
} else
@@ -862,6 +868,8 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF,
Kern.getDocument()->getNode(ProgramInfo.LDSSize);
Kern[".private_segment_fixed_size"] =
Kern.getDocument()->getNode(ProgramInfo.ScratchSize);
+
+ // FIXME: The metadata treats the minimum as 16?
Kern[".kernarg_segment_align"] =
Kern.getDocument()->getNode(std::max(Align(4), MaxKernArgAlign).value());
Kern[".wavefront_size"] =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 4824b4cf37c7..af5dae1cd8c0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -15,7 +15,6 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H
-#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/Alignment.h"
@@ -33,6 +32,11 @@ struct SIProgramInfo;
class Type;
namespace AMDGPU {
+
+namespace IsaInfo {
+class AMDGPUTargetID;
+}
+
namespace HSAMD {
class MetadataStreamer {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index a3106ded1e38..cee56ee97294 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -11,8 +11,11 @@
//
//===----------------------------------------------------------------------===//
+#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/R600MCTargetDesc.h"
+#include "R600RegisterInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -32,287 +35,12 @@
using namespace llvm;
-namespace llvm {
-
-class R600InstrInfo;
-
-} // end namespace llvm
-
//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//
namespace {
-static bool isNullConstantOrUndef(SDValue V) {
- if (V.isUndef())
- return true;
-
- ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
- return Const != nullptr && Const->isNullValue();
-}
-
-static bool getConstantValue(SDValue N, uint32_t &Out) {
- // This is only used for packed vectors, where ussing 0 for undef should
- // always be good.
- if (N.isUndef()) {
- Out = 0;
- return true;
- }
-
- if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
- Out = C->getAPIntValue().getSExtValue();
- return true;
- }
-
- if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
- Out = C->getValueAPF().bitcastToAPInt().getSExtValue();
- return true;
- }
-
- return false;
-}
-
-// TODO: Handle undef as zero
-static SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
- bool Negate = false) {
- assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
- uint32_t LHSVal, RHSVal;
- if (getConstantValue(N->getOperand(0), LHSVal) &&
- getConstantValue(N->getOperand(1), RHSVal)) {
- SDLoc SL(N);
- uint32_t K = Negate ?
- (-LHSVal & 0xffff) | (-RHSVal << 16) :
- (LHSVal & 0xffff) | (RHSVal << 16);
- return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
- DAG.getTargetConstant(K, SL, MVT::i32));
- }
-
- return nullptr;
-}
-
-static SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
- return packConstantV2I16(N, DAG, true);
-}
-
-/// AMDGPU specific code to select AMDGPU machine instructions for
-/// SelectionDAG operations.
-class AMDGPUDAGToDAGISel : public SelectionDAGISel {
- // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
- // make the right decision when generating code for different targets.
- const GCNSubtarget *Subtarget;
-
- // Default FP mode for the current function.
- AMDGPU::SIModeRegisterDefaults Mode;
-
- bool EnableLateStructurizeCFG;
-
- // Instructions that will be lowered with a final instruction that zeros the
- // high result bits.
- bool fp16SrcZerosHighBits(unsigned Opc) const;
-
-public:
- explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
- CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
- : SelectionDAGISel(*TM, OptLevel) {
- EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
- }
- ~AMDGPUDAGToDAGISel() override = default;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AMDGPUArgumentUsageInfo>();
- AU.addRequired<LegacyDivergenceAnalysis>();
-#ifdef EXPENSIVE_CHECKS
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
-#endif
- SelectionDAGISel::getAnalysisUsage(AU);
- }
-
- bool matchLoadD16FromBuildVector(SDNode *N) const;
-
- bool runOnMachineFunction(MachineFunction &MF) override;
- void PreprocessISelDAG() override;
- void Select(SDNode *N) override;
- StringRef getPassName() const override;
- void PostprocessISelDAG() override;
-
-protected:
- void SelectBuildVector(SDNode *N, unsigned RegClassID);
-
-private:
- std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
- bool isNoNanSrc(SDValue N) const;
- bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
- bool isNegInlineImmediate(const SDNode *N) const {
- return isInlineImmediate(N, true);
- }
-
- bool isInlineImmediate16(int64_t Imm) const {
- return AMDGPU::isInlinableLiteral16(Imm, Subtarget->hasInv2PiInlineImm());
- }
-
- bool isInlineImmediate32(int64_t Imm) const {
- return AMDGPU::isInlinableLiteral32(Imm, Subtarget->hasInv2PiInlineImm());
- }
-
- bool isInlineImmediate64(int64_t Imm) const {
- return AMDGPU::isInlinableLiteral64(Imm, Subtarget->hasInv2PiInlineImm());
- }
-
- bool isInlineImmediate(const APFloat &Imm) const {
- return Subtarget->getInstrInfo()->isInlineConstant(Imm);
- }
-
- bool isVGPRImm(const SDNode *N) const;
- bool isUniformLoad(const SDNode *N) const;
- bool isUniformBr(const SDNode *N) const;
-
- bool isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
- SDValue &RHS) const;
-
- MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
-
- SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const;
- SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
- SDNode *glueCopyToM0LDSInit(SDNode *N) const;
-
- const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
- virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
- virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
- bool isDSOffsetLegal(SDValue Base, unsigned Offset) const;
- bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
- unsigned Size) const;
- bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
- bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
- SDValue &Offset1) const;
- bool SelectDS128Bit8ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
- SDValue &Offset1) const;
- bool SelectDSReadWrite2(SDValue Ptr, SDValue &Base, SDValue &Offset0,
- SDValue &Offset1, unsigned Size) const;
- bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
- SDValue &SOffset, SDValue &Offset, SDValue &Offen,
- SDValue &Idxen, SDValue &Addr64) const;
- bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
- SDValue &SOffset, SDValue &Offset) const;
- bool SelectMUBUFScratchOffen(SDNode *Parent,
- SDValue Addr, SDValue &RSrc, SDValue &VAddr,
- SDValue &SOffset, SDValue &ImmOffset) const;
- bool SelectMUBUFScratchOffset(SDNode *Parent,
- SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
- SDValue &Offset) const;
-
- bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
- SDValue &Offset) const;
-
- bool SelectFlatOffsetImpl(SDNode *N, SDValue Addr, SDValue &VAddr,
- SDValue &Offset, uint64_t FlatVariant) const;
- bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
- SDValue &Offset) const;
- bool SelectGlobalOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
- SDValue &Offset) const;
- bool SelectScratchOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
- SDValue &Offset) const;
- bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
- SDValue &VOffset, SDValue &Offset) const;
- bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
- SDValue &Offset) const;
-
- bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
- bool &Imm) const;
- SDValue Expand32BitAddress(SDValue Addr) const;
- bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
- bool &Imm) const;
- bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
- bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
- bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
- bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
- bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
- bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
-
- bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods,
- bool AllowAbs = true) const;
- bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3BMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
- bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
- SDValue &Clamp, SDValue &Omod) const;
- bool SelectVOP3BMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
- SDValue &Clamp, SDValue &Omod) const;
- bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
- SDValue &Clamp, SDValue &Omod) const;
-
- bool SelectVOP3OMods(SDValue In, SDValue &Src,
- SDValue &Clamp, SDValue &Omod) const;
-
- bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
-
- bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
-
- bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
- bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
-
- SDValue getHi16Elt(SDValue In) const;
-
- SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;
-
- void SelectADD_SUB_I64(SDNode *N);
- void SelectAddcSubb(SDNode *N);
- void SelectUADDO_USUBO(SDNode *N);
- void SelectDIV_SCALE(SDNode *N);
- void SelectMAD_64_32(SDNode *N);
- void SelectFMA_W_CHAIN(SDNode *N);
- void SelectFMUL_W_CHAIN(SDNode *N);
-
- SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
- uint32_t Offset, uint32_t Width);
- void SelectS_BFEFromShifts(SDNode *N);
- void SelectS_BFE(SDNode *N);
- bool isCBranchSCC(const SDNode *N) const;
- void SelectBRCOND(SDNode *N);
- void SelectFMAD_FMA(SDNode *N);
- void SelectATOMIC_CMP_SWAP(SDNode *N);
- void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
- void SelectDS_GWS(SDNode *N, unsigned IntrID);
- void SelectInterpP1F16(SDNode *N);
- void SelectINTRINSIC_W_CHAIN(SDNode *N);
- void SelectINTRINSIC_WO_CHAIN(SDNode *N);
- void SelectINTRINSIC_VOID(SDNode *N);
-
-protected:
- // Include the pieces autogenerated from the target description.
-#include "AMDGPUGenDAGISel.inc"
-};
-
-class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
- const R600Subtarget *Subtarget;
-
- bool isConstantLoad(const MemSDNode *N, int cbID) const;
- bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
- bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
- SDValue& Offset);
-public:
- explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) :
- AMDGPUDAGToDAGISel(TM, OptLevel) {}
-
- void Select(SDNode *N) override;
-
- bool SelectADDRIndirect(SDValue Addr, SDValue &Base,
- SDValue &Offset) override;
- bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
- SDValue &Offset) override;
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- void PreprocessISelDAG() override {}
-
-protected:
- // Include the pieces autogenerated from the target description.
-#include "R600GenDAGISel.inc"
-};
-
static SDValue stripBitcast(SDValue Val) {
return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}
@@ -351,7 +79,7 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) {
static SDValue stripExtractLoElt(SDValue In) {
if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
- if (Idx->isNullValue() && In.getValueSizeInBits() <= 32)
+ if (Idx->isZero() && In.getValueSizeInBits() <= 32)
return In.getOperand(0);
}
}
@@ -386,11 +114,11 @@ FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
return new AMDGPUDAGToDAGISel(TM, OptLevel);
}
-/// This pass converts a legalized DAG into a R600-specific
-// DAG, ready for instruction scheduling.
-FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
- CodeGenOpt::Level OptLevel) {
- return new R600DAGToDAGISel(TM, OptLevel);
+AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(
+ TargetMachine *TM /*= nullptr*/,
+ CodeGenOpt::Level OptLevel /*= CodeGenOpt::Default*/)
+ : SelectionDAGISel(*TM, OptLevel) {
+ EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
}
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
@@ -468,6 +196,16 @@ bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
}
}
+void AMDGPUDAGToDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AMDGPUArgumentUsageInfo>();
+ AU.addRequired<LegacyDivergenceAnalysis>();
+#ifdef EXPENSIVE_CHECKS
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+#endif
+ SelectionDAGISel::getAnalysisUsage(AU);
+}
+
bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
assert(Subtarget->d16PreservesUnusedBits());
MVT VT = N->getValueType(0).getSimpleVT();
@@ -903,8 +641,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
uint32_t OffsetVal = Offset->getZExtValue();
uint32_t WidthVal = Width->getZExtValue();
- ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
- SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
+ ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
+ WidthVal));
return;
}
case AMDGPUISD::DIV_SCALE: {
@@ -1207,7 +945,14 @@ void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
Ops[8] = N->getOperand(0);
Ops[9] = N->getOperand(4);
- CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32_e64, N->getVTList(), Ops);
+ // If there are no source modifiers, prefer fmac over fma because it can use
+ // the smaller VOP2 encoding.
+ bool UseFMAC = Subtarget->hasDLInsts() &&
+ cast<ConstantSDNode>(Ops[0])->isZero() &&
+ cast<ConstantSDNode>(Ops[2])->isZero() &&
+ cast<ConstantSDNode>(Ops[4])->isZero();
+ unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
+ CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
}
void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
@@ -1707,7 +1452,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
!cast<ConstantSDNode>(Idxen)->getSExtValue() &&
!cast<ConstantSDNode>(Addr64)->getSExtValue()) {
uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
- APInt::getAllOnesValue(32).getZExtValue(); // Size
+ APInt::getAllOnes(32).getZExtValue(); // Size
SDLoc DL(Addr);
const SITargetLowering& Lowering =
@@ -2202,9 +1947,17 @@ bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
return true;
}
-SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
+SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
SDValue Val, uint32_t Offset,
uint32_t Width) {
+ if (Val->isDivergent()) {
+ unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
+ SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
+ SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
+
+ return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
+ }
+ unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
// Transformation function, pack the offset and width of a BFE into
// the format expected by the S_BFE_I32 / S_BFE_U32. In the second
// source, bits [5:0] contain the offset and bits [22:16] the width.
@@ -2229,10 +1982,8 @@ void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
if (0 < BVal && BVal <= CVal && CVal < 32) {
bool Signed = N->getOpcode() == ISD::SRA;
- unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
-
- ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal,
- 32 - CVal));
+ ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
+ 32 - CVal));
return;
}
}
@@ -2255,9 +2006,8 @@ void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
if (isMask_32(MaskVal)) {
uint32_t WidthVal = countPopulation(MaskVal);
-
- ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
- Srl.getOperand(0), ShiftVal, WidthVal));
+ ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
+ WidthVal));
return;
}
}
@@ -2277,9 +2027,8 @@ void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
if (isMask_32(MaskVal)) {
uint32_t WidthVal = countPopulation(MaskVal);
-
- ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
- And.getOperand(0), ShiftVal, WidthVal));
+ ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
+ WidthVal));
return;
}
}
@@ -2306,7 +2055,7 @@ void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
break;
unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
- ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0),
+ ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
Amt->getZExtValue(), Width));
return;
}
@@ -3111,128 +2860,3 @@ void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
CurDAG->RemoveDeadNodes();
} while (IsModified);
}
-
-bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
- Subtarget = &MF.getSubtarget<R600Subtarget>();
- return SelectionDAGISel::runOnMachineFunction(MF);
-}
-
-bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
- if (!N->readMem())
- return false;
- if (CbId == -1)
- return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
- N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
-
- return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
-}
-
-bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
- SDValue& IntPtr) {
- if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
- IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
- true);
- return true;
- }
- return false;
-}
-
-bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
- SDValue& BaseReg, SDValue &Offset) {
- if (!isa<ConstantSDNode>(Addr)) {
- BaseReg = Addr;
- Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
- return true;
- }
- return false;
-}
-
-void R600DAGToDAGISel::Select(SDNode *N) {
- unsigned int Opc = N->getOpcode();
- if (N->isMachineOpcode()) {
- N->setNodeId(-1);
- return; // Already selected.
- }
-
- switch (Opc) {
- default: break;
- case AMDGPUISD::BUILD_VERTICAL_VECTOR:
- case ISD::SCALAR_TO_VECTOR:
- case ISD::BUILD_VECTOR: {
- EVT VT = N->getValueType(0);
- unsigned NumVectorElts = VT.getVectorNumElements();
- unsigned RegClassID;
- // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
- // that adds a 128 bits reg copy when going through TwoAddressInstructions
- // pass. We want to avoid 128 bits copies as much as possible because they
- // can't be bundled by our scheduler.
- switch(NumVectorElts) {
- case 2: RegClassID = R600::R600_Reg64RegClassID; break;
- case 4:
- if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
- RegClassID = R600::R600_Reg128VerticalRegClassID;
- else
- RegClassID = R600::R600_Reg128RegClassID;
- break;
- default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
- }
- SelectBuildVector(N, RegClassID);
- return;
- }
- }
-
- SelectCode(N);
-}
-
-bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
- SDValue &Offset) {
- ConstantSDNode *C;
- SDLoc DL(Addr);
-
- if ((C = dyn_cast<ConstantSDNode>(Addr))) {
- Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
- Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
- } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
- (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
- Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
- Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
- } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
- (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
- Base = Addr.getOperand(0);
- Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
- } else {
- Base = Addr;
- Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
- }
-
- return true;
-}
-
-bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
- SDValue &Offset) {
- ConstantSDNode *IMMOffset;
-
- if (Addr.getOpcode() == ISD::ADD
- && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
- && isInt<16>(IMMOffset->getZExtValue())) {
-
- Base = Addr.getOperand(0);
- Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
- MVT::i32);
- return true;
- // If the pointer address is constant, we can move it to the offset field.
- } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
- && isInt<16>(IMMOffset->getZExtValue())) {
- Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
- SDLoc(CurDAG->getEntryNode()),
- R600::ZERO, MVT::i32);
- Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
- MVT::i32);
- return true;
- }
-
- // Default case, no offset
- Base = Addr;
- Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
- return true;
-}
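
getBFE32 keeps the scalar path's operand packing: S_BFE_U32/S_BFE_I32 take a single second source with the offset in bits [5:0] and the width in bits [22:16], while the new divergent path uses the three-operand V_BFE form. The sketch below (hypothetical helpers, unsigned case only) shows that packing and the reference semantics of the extract:

#include <cassert>
#include <cstdint>

// packBFE builds the second S_BFE_U32 source operand: offset in bits [5:0],
// width in bits [22:16].
static uint32_t packBFE(uint32_t Offset, uint32_t Width) {
  return (Width << 16) | (Offset & 0x3f);
}

// Reference semantics of an unsigned bitfield extract of Width bits at Offset.
static uint32_t bfe_u32(uint32_t Val, uint32_t Offset, uint32_t Width) {
  uint32_t Mask = Width < 32 ? (1u << Width) - 1u : ~0u;
  return (Val >> Offset) & Mask;
}

int main() {
  assert(packBFE(/*Offset=*/8, /*Width=*/4) == ((4u << 16) | 8u));
  assert(bfe_u32(0xABCD1234, /*Offset=*/8, /*Width=*/4) == 0x2);
  return 0;
}
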
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
new file mode 100644
index 000000000000..c1d9673f067e
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -0,0 +1,256 @@
+//===-- AMDGPUISelDAGToDAG.h - A dag to dag inst selector for AMDGPU ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// Defines an instruction selector for the AMDGPU target.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELDAGTODAG_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELDAGTODAG_H
+
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+
+static inline bool isNullConstantOrUndef(SDValue V) {
+ if (V.isUndef())
+ return true;
+
+ ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
+ return Const != nullptr && Const->isZero();
+}
+
+static inline bool getConstantValue(SDValue N, uint32_t &Out) {
+ // This is only used for packed vectors, where using 0 for undef should
+ // always be good.
+ if (N.isUndef()) {
+ Out = 0;
+ return true;
+ }
+
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+ Out = C->getAPIntValue().getSExtValue();
+ return true;
+ }
+
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
+ Out = C->getValueAPF().bitcastToAPInt().getSExtValue();
+ return true;
+ }
+
+ return false;
+}
+
+// TODO: Handle undef as zero
+static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
+ bool Negate = false) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
+ uint32_t LHSVal, RHSVal;
+ if (getConstantValue(N->getOperand(0), LHSVal) &&
+ getConstantValue(N->getOperand(1), RHSVal)) {
+ SDLoc SL(N);
+ uint32_t K = Negate ? (-LHSVal & 0xffff) | (-RHSVal << 16)
+ : (LHSVal & 0xffff) | (RHSVal << 16);
+ return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
+ DAG.getTargetConstant(K, SL, MVT::i32));
+ }
+
+ return nullptr;
+}
+
+static inline SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
+ return packConstantV2I16(N, DAG, true);
+}
+} // namespace
+
+/// AMDGPU specific code to select AMDGPU machine instructions for
+/// SelectionDAG operations.
+class AMDGPUDAGToDAGISel : public SelectionDAGISel {
+ // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
+ // make the right decision when generating code for different targets.
+ const GCNSubtarget *Subtarget;
+
+ // Default FP mode for the current function.
+ AMDGPU::SIModeRegisterDefaults Mode;
+
+ bool EnableLateStructurizeCFG;
+
+ // Instructions that will be lowered with a final instruction that zeros the
+ // high result bits.
+ bool fp16SrcZerosHighBits(unsigned Opc) const;
+
+public:
+ explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
+ CodeGenOpt::Level OptLevel = CodeGenOpt::Default);
+ ~AMDGPUDAGToDAGISel() override = default;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool matchLoadD16FromBuildVector(SDNode *N) const;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void PreprocessISelDAG() override;
+ void Select(SDNode *N) override;
+ StringRef getPassName() const override;
+ void PostprocessISelDAG() override;
+
+protected:
+ void SelectBuildVector(SDNode *N, unsigned RegClassID);
+
+private:
+ std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
+ bool isNoNanSrc(SDValue N) const;
+ bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
+ bool isNegInlineImmediate(const SDNode *N) const {
+ return isInlineImmediate(N, true);
+ }
+
+ bool isInlineImmediate16(int64_t Imm) const {
+ return AMDGPU::isInlinableLiteral16(Imm, Subtarget->hasInv2PiInlineImm());
+ }
+
+ bool isInlineImmediate32(int64_t Imm) const {
+ return AMDGPU::isInlinableLiteral32(Imm, Subtarget->hasInv2PiInlineImm());
+ }
+
+ bool isInlineImmediate64(int64_t Imm) const {
+ return AMDGPU::isInlinableLiteral64(Imm, Subtarget->hasInv2PiInlineImm());
+ }
+
+ bool isInlineImmediate(const APFloat &Imm) const {
+ return Subtarget->getInstrInfo()->isInlineConstant(Imm);
+ }
+
+ bool isVGPRImm(const SDNode *N) const;
+ bool isUniformLoad(const SDNode *N) const;
+ bool isUniformBr(const SDNode *N) const;
+
+ bool isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
+ SDValue &RHS) const;
+
+ MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
+
+ SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const;
+ SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
+ SDNode *glueCopyToM0LDSInit(SDNode *N) const;
+
+ const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
+ virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
+ virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
+ bool isDSOffsetLegal(SDValue Base, unsigned Offset) const;
+ bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
+ unsigned Size) const;
+ bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
+ bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
+ SDValue &Offset1) const;
+ bool SelectDS128Bit8ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
+ SDValue &Offset1) const;
+ bool SelectDSReadWrite2(SDValue Ptr, SDValue &Base, SDValue &Offset0,
+ SDValue &Offset1, unsigned Size) const;
+ bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+ SDValue &SOffset, SDValue &Offset, SDValue &Offen,
+ SDValue &Idxen, SDValue &Addr64) const;
+ bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+ SDValue &SOffset, SDValue &Offset) const;
+ bool SelectMUBUFScratchOffen(SDNode *Parent, SDValue Addr, SDValue &RSrc,
+ SDValue &VAddr, SDValue &SOffset,
+ SDValue &ImmOffset) const;
+ bool SelectMUBUFScratchOffset(SDNode *Parent, SDValue Addr, SDValue &SRsrc,
+ SDValue &Soffset, SDValue &Offset) const;
+
+ bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
+ SDValue &Offset) const;
+
+ bool SelectFlatOffsetImpl(SDNode *N, SDValue Addr, SDValue &VAddr,
+ SDValue &Offset, uint64_t FlatVariant) const;
+ bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
+ SDValue &Offset) const;
+ bool SelectGlobalOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
+ SDValue &Offset) const;
+ bool SelectScratchOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
+ SDValue &Offset) const;
+ bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &VOffset, SDValue &Offset) const;
+ bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &Offset) const;
+
+ bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
+ bool &Imm) const;
+ SDValue Expand32BitAddress(SDValue Addr) const;
+ bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
+ bool &Imm) const;
+ bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+ bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+ bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+ bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
+ bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
+ bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
+
+ bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods,
+ bool AllowAbs = true) const;
+ bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3BMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
+ bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Clamp, SDValue &Omod) const;
+ bool SelectVOP3BMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Clamp, SDValue &Omod) const;
+ bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Clamp, SDValue &Omod) const;
+
+ bool SelectVOP3OMods(SDValue In, SDValue &Src, SDValue &Clamp,
+ SDValue &Omod) const;
+
+ bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+
+ bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+
+ bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
+ unsigned &Mods) const;
+ bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+
+ SDValue getHi16Elt(SDValue In) const;
+
+ SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;
+
+ void SelectADD_SUB_I64(SDNode *N);
+ void SelectAddcSubb(SDNode *N);
+ void SelectUADDO_USUBO(SDNode *N);
+ void SelectDIV_SCALE(SDNode *N);
+ void SelectMAD_64_32(SDNode *N);
+ void SelectFMA_W_CHAIN(SDNode *N);
+ void SelectFMUL_W_CHAIN(SDNode *N);
+ SDNode *getBFE32(bool IsSigned, const SDLoc &DL, SDValue Val, uint32_t Offset,
+ uint32_t Width);
+ void SelectS_BFEFromShifts(SDNode *N);
+ void SelectS_BFE(SDNode *N);
+ bool isCBranchSCC(const SDNode *N) const;
+ void SelectBRCOND(SDNode *N);
+ void SelectFMAD_FMA(SDNode *N);
+ void SelectATOMIC_CMP_SWAP(SDNode *N);
+ void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
+ void SelectDS_GWS(SDNode *N, unsigned IntrID);
+ void SelectInterpP1F16(SDNode *N);
+ void SelectINTRINSIC_W_CHAIN(SDNode *N);
+ void SelectINTRINSIC_WO_CHAIN(SDNode *N);
+ void SelectINTRINSIC_VOID(SDNode *N);
+
+protected:
+ // Include the pieces autogenerated from the target description.
+#include "AMDGPUGenDAGISel.inc"
+};
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUISELDAGTODAG_H
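
For illustration, the constant packing done by packConstantV2I16 above amounts to folding two 16-bit lane values into one 32-bit immediate, optionally negating each lane first. A minimal standalone sketch, plain C++ with no SelectionDAG; the helper name packV2I16 is made up for this example:

#include <cassert>
#include <cstdint>

// Standalone model of the packing done by packConstantV2I16: the two 16-bit
// lanes of a v2i16 build_vector become one 32-bit immediate.
static uint32_t packV2I16(uint32_t LHSVal, uint32_t RHSVal, bool Negate) {
  return Negate ? (-LHSVal & 0xffff) | (-RHSVal << 16)
                : (LHSVal & 0xffff) | (RHSVal << 16);
}

int main() {
  // <1, 2> packs to 0x00020001; the negated form packs <-1, -2>.
  assert(packV2I16(1, 2, /*Negate=*/false) == 0x00020001u);
  assert(packV2I16(1, 2, /*Negate=*/true) == 0xfffeffffu);
  return 0;
}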
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d68488ccb342..523fa2d3724b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -45,17 +45,13 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
}
unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
- KnownBits Known = DAG.computeKnownBits(Op);
- return VT.getSizeInBits() - Known.countMinLeadingZeros();
+ return DAG.computeKnownBits(Op).countMaxActiveBits();
}
unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
-
  // In order for this to be a signed 24-bit value, bit 23 must
// be a sign bit.
- return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
+ return DAG.ComputeMinSignedBits(Op);
}
AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
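
The two helpers above now defer to KnownBits::countMaxActiveBits and SelectionDAG::ComputeMinSignedBits: the minimum unsigned and signed widths that preserve the value. A minimal sketch of those semantics on plain 32-bit integers, assuming GCC/Clang __builtin_clz; the helper names are illustrative only:

#include <cassert>
#include <cstdint>

// Minimum bits so that zero-extending from that width reproduces V.
static unsigned numBitsUnsigned(uint32_t V) {
  return V == 0 ? 0 : 32 - __builtin_clz(V);
}

// Minimum bits so that sign-extending from that width reproduces V; one bit
// beyond the highest magnitude bit is needed for the sign.
static unsigned numBitsSigned(int32_t V) {
  uint32_t U = V < 0 ? ~uint32_t(V) : uint32_t(V);
  return (U == 0 ? 0 : 32 - __builtin_clz(U)) + 1;
}

int main() {
  assert(numBitsUnsigned(255) == 8 && numBitsUnsigned(256) == 9);
  assert(numBitsSigned(127) == 8 && numBitsSigned(-128) == 8);
  // The signed-24-bit check later in this file (numBitsSigned(Op) <= 24)
  // accepts exactly the values a signed 24-bit multiply can represent.
  assert(numBitsSigned(-(1 << 23)) == 24 && numBitsSigned(1 << 23) == 25);
  return 0;
}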
@@ -1042,7 +1038,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
-/// for the orignal function argument so that we can deduce the correct memory
+/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
@@ -1210,10 +1206,8 @@ SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
ArgChains.push_back(Chain);
// Add a chain value for each stack argument corresponding
- for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
- UE = DAG.getEntryNode().getNode()->use_end();
- U != UE; ++U) {
- if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
+ for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
+ if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
if (FI->getIndex() < 0) {
int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
@@ -1334,14 +1328,6 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
}
}
-bool AMDGPUTargetLowering::hasDefinedInitializer(const GlobalValue *GV) {
- const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
- if (!GVar || !GVar->hasInitializer())
- return false;
-
- return !isa<UndefValue>(GVar->getInitializer());
-}
-
SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
SDValue Op,
SelectionDAG &DAG) const {
@@ -1378,16 +1364,11 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
"Do not know what to do with an non-zero offset");
// TODO: We could emit code to handle the initialization somewhere.
- if (!hasDefinedInitializer(GV)) {
- unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
- return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
- }
+ // We ignore the initializer for now and legalize it to allow selection.
+  // The initializer will be diagnosed during assembly emission anyway.
+ unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
+ return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
}
-
- const Function &Fn = DAG.getMachineFunction().getFunction();
- DiagnosticInfoUnsupported BadInit(
- Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
- DAG.getContext()->diagnose(BadInit);
return SDValue();
}
@@ -1856,6 +1837,9 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
}
if (isTypeLegal(MVT::i64)) {
+ // The algorithm here is based on ideas from "Software Integer Division",
+ // Tom Rodeheffer, August 2008.
+
MachineFunction &MF = DAG.getMachineFunction();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -1890,37 +1874,35 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
+ // First round of UNR (Unsigned integer Newton-Raphson).
SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
Zero);
- SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
- One);
-
+ SDValue Mulhi1_Hi =
+ DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, One);
SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
Mulhi1_Lo, Zero1);
SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
Mulhi1_Hi, Add1_Lo.getValue(1));
- SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
SDValue Add1 = DAG.getBitcast(VT,
DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
+ // Second round of UNR.
SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
Zero);
- SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
- One);
-
+ SDValue Mulhi2_Hi =
+ DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, One);
SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
Mulhi2_Lo, Zero1);
- SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
- Mulhi2_Hi, Add1_Lo.getValue(1));
- SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
- Zero, Add2_Lo.getValue(1));
+ SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Hi,
+ Mulhi2_Hi, Add2_Lo.getValue(1));
SDValue Add2 = DAG.getBitcast(VT,
DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
+
SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
@@ -2211,13 +2193,10 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType() == MVT::f64);
const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
- const SDValue One = DAG.getConstant(1, SL, MVT::i32);
-
- SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
// Extract the upper half, since this is where we will find the sign and
// exponent.
- SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
+ SDValue Hi = getHiHalf64(Src, DAG);
SDValue Exp = extractF64Exponent(Hi, SL, DAG);
@@ -2380,72 +2359,50 @@ static bool isCttzOpc(unsigned Opc) {
SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
- bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
- Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
-
- unsigned ISDOpc, NewOpc;
- if (isCtlzOpc(Op.getOpcode())) {
- ISDOpc = ISD::CTLZ_ZERO_UNDEF;
- NewOpc = AMDGPUISD::FFBH_U32;
- } else if (isCttzOpc(Op.getOpcode())) {
- ISDOpc = ISD::CTTZ_ZERO_UNDEF;
- NewOpc = AMDGPUISD::FFBL_B32;
- } else
- llvm_unreachable("Unexpected OPCode!!!");
-
-
- if (ZeroUndef && Src.getValueType() == MVT::i32)
- return DAG.getNode(NewOpc, SL, MVT::i32, Src);
- SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
-
- const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
- const SDValue One = DAG.getConstant(1, SL, MVT::i32);
-
- SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
- SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
-
- EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
- *DAG.getContext(), MVT::i32);
-
- SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
- SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);
-
- SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
- SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);
-
- const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
- SDValue Add, NewOpr;
- if (isCtlzOpc(Op.getOpcode())) {
- Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
- // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
- NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
- } else {
- Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
- // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
- NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
+ assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
+ bool Ctlz = isCtlzOpc(Op.getOpcode());
+ unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
+
+ bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
+ Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
+
+ if (Src.getValueType() == MVT::i32) {
+    // (ctlz src) -> (umin (ffbh src), 32)
+    // (cttz src) -> (umin (ffbl src), 32)
+ // (ctlz_zero_undef src) -> (ffbh src)
+ // (cttz_zero_undef src) -> (ffbl src)
+ SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
+ if (!ZeroUndef) {
+ const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
+ NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
+ }
+ return NewOpr;
}
- if (!ZeroUndef) {
- // Test if the full 64-bit input is zero.
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = split64BitValue(Src, DAG);
- // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
- // which we probably don't want.
- SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
- SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
- SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);
+ SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
+ SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
- // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
- // with the same cycles, otherwise it is slower.
- // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
- // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
+ // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
+ // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
+ // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
+ // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
- const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
+ unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
+ const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
+ if (Ctlz)
+ OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
+ else
+ OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
- // The instruction returns -1 for 0 input, but the defined intrinsic
- // behavior is to return the number of bits.
- NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
- SrcIsZero, Bits32, NewOpr);
+ SDValue NewOpr;
+ NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
+ if (!ZeroUndef) {
+ const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
+ NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
}
return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
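
The rewritten 64-bit expansion above leans on two facts: ffbh/ffbl return -1 for a zero input, and the saturating add keeps a zero half from winning the umin. A minimal host-side sketch of the two umin3 formulas, assuming GCC/Clang builtins; the helper names are illustrative only:

#include <algorithm>
#include <cassert>
#include <cstdint>

// ffbh/ffbl as described above: count leading/trailing zeros, returning
// all-ones for a zero input.
static uint32_t ffbh(uint32_t V) { return V ? __builtin_clz(V) : ~0u; }
static uint32_t ffbl(uint32_t V) { return V ? __builtin_ctz(V) : ~0u; }

// Unsigned saturating add.
static uint32_t uaddsat(uint32_t A, uint32_t B) {
  uint32_t S = A + B;
  return S < A ? ~0u : S;
}

// (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
static uint32_t ctlz64(uint64_t X) {
  uint32_t Lo = uint32_t(X), Hi = uint32_t(X >> 32);
  return std::min({ffbh(Hi), uaddsat(ffbh(Lo), 32), uint32_t(64)});
}

// (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
static uint32_t cttz64(uint64_t X) {
  uint32_t Lo = uint32_t(X), Hi = uint32_t(X >> 32);
  return std::min({uaddsat(ffbl(Hi), 32), ffbl(Lo), uint32_t(64)});
}

int main() {
  assert(ctlz64(0) == 64 && cttz64(0) == 64);
  assert(ctlz64(1) == 63 && cttz64(1) == 0);
  assert(ctlz64(uint64_t(1) << 40) == 23 && cttz64(uint64_t(1) << 40) == 40);
  return 0;
}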
@@ -2453,87 +2410,128 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
bool Signed) const {
- // Unsigned
- // cul2f(ulong u)
- //{
- // uint lz = clz(u);
- // uint e = (u != 0) ? 127U + 63U - lz : 0;
- // u = (u << lz) & 0x7fffffffffffffffUL;
- // ulong t = u & 0xffffffffffUL;
- // uint v = (e << 23) | (uint)(u >> 40);
- // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
- // return as_float(v + r);
- //}
- // Signed
- // cl2f(long l)
- //{
- // long s = l >> 63;
- // float r = cul2f((l + s) ^ s);
- // return s ? -r : r;
- //}
+ // The regular method converting a 64-bit integer to float roughly consists of
+ // 2 steps: normalization and rounding. In fact, after normalization, the
+ // conversion from a 64-bit integer to a float is essentially the same as the
+ // one from a 32-bit integer. The only difference is that it has more
+ // trailing bits to be rounded. To leverage the native 32-bit conversion, a
+ // 64-bit integer could be preprocessed and fit into a 32-bit integer then
+ // converted into the correct float number. The basic steps for the unsigned
+ // conversion are illustrated in the following pseudo code:
+ //
+ // f32 uitofp(i64 u) {
+ // i32 hi, lo = split(u);
+ // // Only count the leading zeros in hi as we have native support of the
+ // // conversion from i32 to f32. If hi is all 0s, the conversion is
+ // // reduced to a 32-bit one automatically.
+ // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
+ // u <<= shamt;
+ // hi, lo = split(u);
+ // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
+ // // convert it as a 32-bit integer and scale the result back.
+ // return uitofp(hi) * 2^(32 - shamt);
+ // }
+ //
+ // The signed one follows the same principle but uses 'ffbh_i32' to count its
+ // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
+  // converted instead, followed by negation based on its sign bit.
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
- SDValue L = Src;
- SDValue S;
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = split64BitValue(Src, DAG);
+ SDValue Sign;
+ SDValue ShAmt;
+ if (Signed && Subtarget->isGCN()) {
+ // We also need to consider the sign bit in Lo if Hi has just sign bits,
+ // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
+ // account. That is, the maximal shift is
+ // - 32 if Lo and Hi have opposite signs;
+ // - 33 if Lo and Hi have the same sign.
+ //
+ // Or, MaxShAmt = 33 + OppositeSign, where
+ //
+ // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
+ // - -1 if Lo and Hi have opposite signs; and
+ // - 0 otherwise.
+ //
+ // All in all, ShAmt is calculated as
+ //
+ // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
+ //
+ // or
+ //
+ // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
+ //
+ // to reduce the critical path.
+ SDValue OppositeSign = DAG.getNode(
+ ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
+ DAG.getConstant(31, SL, MVT::i32));
+ SDValue MaxShAmt =
+ DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
+ OppositeSign);
+ // Count the leading sign bits.
+ ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
+ // Different from unsigned conversion, the shift should be one bit less to
+ // preserve the sign bit.
+ ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
+ DAG.getConstant(1, SL, MVT::i32));
+ ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
+ } else {
+ if (Signed) {
+ // Without 'ffbh_i32', only leading zeros could be counted. Take the
+ // absolute value first.
+ Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
+ DAG.getConstant(63, SL, MVT::i64));
+ SDValue Abs =
+ DAG.getNode(ISD::XOR, SL, MVT::i64,
+ DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
+ std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
+ }
+ // Count the leading zeros.
+ ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
+ // The shift amount for signed integers is [0, 32].
+ }
+ // Normalize the given 64-bit integer.
+ SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
+ // Split it again.
+ std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
+ // Calculate the adjust bit for rounding.
+ // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
+ SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
+ DAG.getConstant(1, SL, MVT::i32), Lo);
+ // Get the 32-bit normalized integer.
+ Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
+ // Convert the normalized 32-bit integer into f32.
+ unsigned Opc =
+ (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
+ SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
+
+  // Finally, we need to scale back the converted floating-point number, as
+  // the original 64-bit integer was converted as a 32-bit one.
+ ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
+ ShAmt);
+ // On GCN, use LDEXP directly.
+ if (Subtarget->isGCN())
+ return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt);
+
+ // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
+ // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
+ // exponent is enough to avoid overflowing into the sign bit.
+ SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
+ DAG.getConstant(23, SL, MVT::i32));
+ SDValue IVal =
+ DAG.getNode(ISD::ADD, SL, MVT::i32,
+ DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
if (Signed) {
- const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
- S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
-
- SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
- L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
+ // Set the sign bit.
+ Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
+ DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
+ DAG.getConstant(31, SL, MVT::i32));
+ IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
}
-
- EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
- *DAG.getContext(), MVT::f32);
-
-
- SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
- SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
- SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
- LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
-
- SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
- SDValue E = DAG.getSelect(SL, MVT::i32,
- DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
- DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
- ZeroI32);
-
- SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
- DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
- DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
-
- SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
- DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
-
- SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
- U, DAG.getConstant(40, SL, MVT::i64));
-
- SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
- DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
- DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl));
-
- SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
- SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
- SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
-
- SDValue One = DAG.getConstant(1, SL, MVT::i32);
-
- SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
-
- SDValue R = DAG.getSelect(SL, MVT::i32,
- RCmp,
- One,
- DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
- R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
- R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
-
- if (!Signed)
- return R;
-
- SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
- return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
+ return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
}
SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
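
The unsigned pseudo code in the comment above can be exercised on the host: normalize, fold the discarded low bits into a sticky bit, convert as a 32-bit integer, then scale back. A minimal sketch assuming round-to-nearest-even and GCC/Clang __builtin_clz, with std::ldexp standing in for LDEXP:

#include <cassert>
#include <cmath>
#include <cstdint>

// Host-side model of the uitofp(i64) scheme sketched in the comment above.
static float uitofp64(uint64_t U) {
  uint32_t Hi = uint32_t(U >> 32);
  uint32_t ShAmt = Hi ? __builtin_clz(Hi) : 32; // clz(hi); 32 if hi is all 0s.
  uint64_t Norm = U << ShAmt;                   // Normalize.
  Hi = uint32_t(Norm >> 32);
  uint32_t Lo = uint32_t(Norm);
  Hi |= (Lo != 0) ? 1 : 0;                       // Sticky bit for rounding.
  return std::ldexp(float(Hi), 32 - int(ShAmt)); // Scale the result back.
}

int main() {
  const uint64_t Tests[] = {0, 1, 0x12345678, 0x8000000000000001ull,
                            0xffffffffffffffffull};
  for (uint64_t U : Tests)
    assert(uitofp64(U) == float(U)); // Matches the native conversion.
  return 0;
}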
@@ -2541,12 +2539,8 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
- SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
-
- SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
- DAG.getConstant(0, SL, MVT::i32));
- SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
- DAG.getConstant(1, SL, MVT::i32));
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = split64BitValue(Src, DAG);
SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
SL, MVT::f64, Hi);
@@ -2878,7 +2872,7 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
// as unsigned 24-bit values.
- AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
+ AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
}
static SDValue simplifyMul24(SDNode *Node24,
@@ -2892,8 +2886,22 @@ static SDValue simplifyMul24(SDNode *Node24,
unsigned NewOpcode = Node24->getOpcode();
if (IsIntrin) {
unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
- NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
- AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
+ switch (IID) {
+ case Intrinsic::amdgcn_mul_i24:
+ NewOpcode = AMDGPUISD::MUL_I24;
+ break;
+ case Intrinsic::amdgcn_mul_u24:
+ NewOpcode = AMDGPUISD::MUL_U24;
+ break;
+ case Intrinsic::amdgcn_mulhi_i24:
+ NewOpcode = AMDGPUISD::MULHI_I24;
+ break;
+ case Intrinsic::amdgcn_mulhi_u24:
+ NewOpcode = AMDGPUISD::MULHI_U24;
+ break;
+ default:
+ llvm_unreachable("Expected 24-bit mul intrinsic");
+ }
}
APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
@@ -3102,6 +3110,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
switch (IID) {
case Intrinsic::amdgcn_mul_i24:
case Intrinsic::amdgcn_mul_u24:
+ case Intrinsic::amdgcn_mulhi_i24:
+ case Intrinsic::amdgcn_mulhi_u24:
return simplifyMul24(N, DCI);
case Intrinsic::amdgcn_fract:
case Intrinsic::amdgcn_rsq:
@@ -3281,11 +3291,9 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
// srl i64:x, C for C >= 32
// =>
// build_pair (srl hi_32(x), C - 32), 0
- SDValue One = DAG.getConstant(1, SL, MVT::i32);
SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
- SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS);
- SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One);
+ SDValue Hi = getHiHalf64(LHS, DAG);
SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
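
The combine above rewrites a 64-bit logical shift right by a constant C >= 32 as a 32-bit shift of the high half paired with a zero high word. A quick standalone check of that identity; the helper name is illustrative only:

#include <cassert>
#include <cstdint>

// srl i64:x, C for C >= 32  ==>  build_pair (srl hi_32(x), C - 32), 0
static uint64_t srl64ViaHiHalf(uint64_t X, unsigned C) {
  assert(C >= 32 && C < 64);
  uint32_t Hi = uint32_t(X >> 32);
  return uint64_t(Hi >> (C - 32)); // The high result word is zero.
}

int main() {
  const uint64_t X = 0x123456789abcdef0ull;
  for (unsigned C = 32; C < 64; ++C)
    assert(srl64ViaHiHalf(X, C) == (X >> C));
  return 0;
}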
@@ -3355,7 +3363,7 @@ SDValue AMDGPUTargetLowering::performTruncateCombine(
KnownBits Known = DAG.computeKnownBits(Amt);
unsigned Size = VT.getScalarSizeInBits();
if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
- (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
+ (Known.countMaxActiveBits() <= Log2_32(Size))) {
EVT MidVT = VT.isVector() ?
EVT::getVectorVT(*DAG.getContext(), MVT::i32,
VT.getVectorNumElements()) : MVT::i32;
@@ -3522,7 +3530,7 @@ SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
static bool isNegativeOne(SDValue Val) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
- return C->isAllOnesValue();
+ return C->isAllOnes();
return false;
}
@@ -3557,7 +3565,7 @@ SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue C
SDValue LHS, SDValue RHS,
DAGCombinerInfo &DCI) const {
ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
- if (!CmpRhs || !CmpRhs->isNullValue())
+ if (!CmpRhs || !CmpRhs->isZero())
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -4341,6 +4349,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(TC_RETURN)
NODE_NAME_CASE(TRAP)
NODE_NAME_CASE(RET_FLAG)
+ NODE_NAME_CASE(RET_GFX_FLAG)
NODE_NAME_CASE(RETURN_TO_EPILOG)
NODE_NAME_CASE(ENDPGM)
NODE_NAME_CASE(DWORDADDR)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index e61021d451f8..03632ac18598 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -35,9 +35,15 @@ private:
SDValue getFFBX_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, unsigned Opc) const;
public:
+  /// \returns The minimum number of bits needed to store the value of \p Op as an
+ /// unsigned integer. Truncating to this size and then zero-extending to the
+ /// original size will not change the value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG);
+
+  /// \returns The minimum number of bits needed to store the value of \p Op as a
+ /// signed integer. Truncating to this size and then sign-extending to the
+ /// original size will not change the value.
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG);
- static bool hasDefinedInitializer(const GlobalValue *GV);
protected:
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
@@ -337,7 +343,7 @@ namespace AMDGPUISD {
enum NodeType : unsigned {
// AMDIL ISD Opcodes
FIRST_NUMBER = ISD::BUILTIN_OP_END,
- UMUL, // 32bit unsigned multiplication
+ UMUL, // 32bit unsigned multiplication
BRANCH_COND,
// End AMDIL ISD Opcodes
@@ -360,6 +366,9 @@ enum NodeType : unsigned {
// Return with values from a non-entry function.
RET_FLAG,
+ // Return with values from a non-entry function (AMDGPU_Gfx CC).
+ RET_GFX_FLAG,
+
DWORDADDR,
FRACT,
@@ -416,10 +425,10 @@ enum NodeType : unsigned {
DOT4,
CARRY,
BORROW,
- BFE_U32, // Extract range of bits with zero extension to 32-bits.
- BFE_I32, // Extract range of bits with sign extension to 32-bits.
- BFI, // (src0 & src1) | (~src0 & src2)
- BFM, // Insert a range of bits into a 32-bit word.
+ BFE_U32, // Extract range of bits with zero extension to 32-bits.
+ BFE_I32, // Extract range of bits with sign extension to 32-bits.
+ BFI, // (src0 & src1) | (~src0 & src2)
+ BFM, // Insert a range of bits into a 32-bit word.
FFBH_U32, // ctlz with -1 if input is zero.
FFBH_I32,
FFBL_B32, // cttz with -1 if input is zero.
@@ -528,7 +537,6 @@ enum NodeType : unsigned {
LAST_AMDGPU_ISD_NUMBER
};
-
} // End namespace AMDGPUISD
} // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 06aa0055e4bb..88b4ec53a2a0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -17,7 +17,6 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
-#include "R600Subtarget.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
@@ -149,7 +148,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
Function *I =
Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);
- SmallVector<Value *, 8> Args(II.arg_operands());
+ SmallVector<Value *, 8> Args(II.args());
unsigned EndIndex =
OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
@@ -440,7 +439,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (!CWidth || !COffset)
break;
- // The case of Width == 0 is handled above, which makes this tranformation
+ // The case of Width == 0 is handled above, which makes this transformation
// safe. If Width == 0, then the ashr and lshr instructions become poison
// value since the shift amount would be equal to the bit size.
assert(Width != 0);
@@ -586,8 +585,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
MDNode *MD = MDNode::get(II.getContext(), MDArgs);
Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
- NewCall->addAttribute(AttributeList::FunctionIndex,
- Attribute::Convergent);
+ NewCall->addFnAttr(Attribute::Convergent);
NewCall->takeName(&II);
return IC.replaceInstUsesWith(II, NewCall);
}
@@ -712,8 +710,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
MDNode *MD = MDNode::get(II.getContext(), MDArgs);
Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
- NewCall->addAttribute(AttributeList::FunctionIndex,
- Attribute::Convergent);
+ NewCall->addFnAttr(Attribute::Convergent);
NewCall->takeName(&II);
return IC.replaceInstUsesWith(II, NewCall);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 8e7a6a7029c6..b1263618c5db 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -76,8 +76,8 @@ struct ImageDimIntrinsicInfo {
};
const ImageDimIntrinsicInfo *getImageDimIntrinsicInfo(unsigned Intr);
-const ImageDimIntrinsicInfo *getImageDimInstrinsicByBaseOpcode(unsigned BaseOpcode,
- unsigned Dim);
+const ImageDimIntrinsicInfo *
+getImageDimIntrinsicByBaseOpcode(unsigned BaseOpcode, unsigned Dim);
} // end AMDGPU namespace
} // End llvm namespace
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 0f9cb712f820..391dc8428539 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -103,9 +103,6 @@ def AMDGPUconstdata_ptr : SDNode<
// This argument to this node is a dword address.
def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
-// Force dependencies for vector trunc stores
-def R600dummy_chain : SDNode<"AMDGPUISD::DUMMY_CHAIN", SDTNone, [SDNPHasChain]>;
-
def AMDGPUcos_impl : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>;
def AMDGPUsin_impl : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>;
// out = a - floor(a)
@@ -282,11 +279,18 @@ def AMDGPUmul_i24_impl : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]
>;
-def AMDGPUmulhi_u24 : SDNode<"AMDGPUISD::MULHI_U24", SDTIntBinOp,
- [SDNPCommutative, SDNPAssociative]
+// mulhi24 yields the high-order 16 bits of the 48-bit result. Here's an example
+// that shows mulhi24 is not associative:
+//
+// Given a = 0x10002, b = c = 0xffffff:
+// mulhi24(mulhi24(a, b), c) = mulhi24(0x100, 0xffffff) = 0
+// Which is not equal to:
+// mulhi24(a, mulhi24(b, c)) = mulhi24(0x10002, 0xffff) = 1
+def AMDGPUmulhi_u24_impl : SDNode<"AMDGPUISD::MULHI_U24", SDTIntBinOp,
+ [SDNPCommutative]
>;
-def AMDGPUmulhi_i24 : SDNode<"AMDGPUISD::MULHI_I24", SDTIntBinOp,
- [SDNPCommutative, SDNPAssociative]
+def AMDGPUmulhi_i24_impl : SDNode<"AMDGPUISD::MULHI_I24", SDTIntBinOp,
+ [SDNPCommutative]
>;
def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp,
@@ -329,11 +333,6 @@ def AMDGPUExportOp : SDTypeProfile<0, 8, [
]>;
-def R600ExportOp : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>;
-
-def R600_EXPORT: SDNode<"AMDGPUISD::R600_EXPORT", R600ExportOp,
- [SDNPHasChain, SDNPSideEffect]>;
-
//===----------------------------------------------------------------------===//
// Flow Control Profile Types
//===----------------------------------------------------------------------===//
@@ -360,6 +359,10 @@ def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPt
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;
+def AMDGPUret_gfx_flag : SDNode<"AMDGPUISD::RET_GFX_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
//===----------------------------------------------------------------------===//
// Intrinsic/Custom node compatibility PatFrags
@@ -443,6 +446,14 @@ def AMDGPUmul_i24 : PatFrags<(ops node:$src0, node:$src1),
[(int_amdgcn_mul_i24 node:$src0, node:$src1),
(AMDGPUmul_i24_impl node:$src0, node:$src1)]>;
+def AMDGPUmulhi_u24 : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_mulhi_u24 node:$src0, node:$src1),
+ (AMDGPUmulhi_u24_impl node:$src0, node:$src1)]>;
+
+def AMDGPUmulhi_i24 : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_mulhi_i24 node:$src0, node:$src1),
+ (AMDGPUmulhi_i24_impl node:$src0, node:$src1)]>;
+
def AMDGPUbfe_i32 : PatFrags<(ops node:$src0, node:$src1, node:$src2),
[(int_amdgcn_sbfe node:$src0, node:$src1, node:$src2),
(AMDGPUbfe_i32_impl node:$src0, node:$src1, node:$src2)]>;
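
The associativity counterexample in the MULHI_U24 comment above can be reproduced directly, modelling the unsigned form as bits [47:32] of the 48-bit product of the low 24 bits of each operand; the helper name is illustrative only:

#include <cassert>
#include <cstdint>

// Unsigned mulhi24: the high-order 16 bits of the 48-bit product of the low
// 24 bits of each operand.
static uint32_t mulhi_u24(uint32_t A, uint32_t B) {
  uint64_t P = uint64_t(A & 0xffffff) * uint64_t(B & 0xffffff);
  return uint32_t(P >> 32);
}

int main() {
  const uint32_t A = 0x10002, B = 0xffffff, C = 0xffffff;
  assert(mulhi_u24(A, B) == 0x100 && mulhi_u24(B, C) == 0xffff);
  // The two association orders from the comment disagree, which is why the
  // nodes above keep only SDNPCommutative.
  assert(mulhi_u24(mulhi_u24(A, B), C) == 0); // mulhi24(0x100, 0xffffff)
  assert(mulhi_u24(A, mulhi_u24(B, C)) == 1); // mulhi24(0x10002, 0xffff)
  return 0;
}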
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 323aaaf70cd4..28cb2fc57ac7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#define DEBUG_TYPE "amdgpu-isel"
@@ -140,7 +141,7 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
= TRI.getConstrainedRegClassForOperand(Src, *MRI);
Optional<ValueAndVReg> ConstVal =
- getConstantVRegValWithLookThrough(SrcReg, *MRI, true, true);
+ getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
if (ConstVal) {
unsigned MovOpc =
STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
@@ -608,11 +609,10 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock *BB = MI.getParent();
- auto ConstSrc1 =
- getConstantVRegValWithLookThrough(Src1, *MRI, true, true, true);
+ auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
if (ConstSrc1) {
auto ConstSrc0 =
- getConstantVRegValWithLookThrough(Src0, *MRI, true, true, true);
+ getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
if (ConstSrc0) {
const int64_t K0 = ConstSrc0->Value.getSExtValue();
const int64_t K1 = ConstSrc1->Value.getSExtValue();
@@ -844,7 +844,7 @@ bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
Optional<ValueAndVReg> ConstSelect =
- getConstantVRegValWithLookThrough(LaneSelect, *MRI, true, true);
+ getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
if (ConstSelect) {
// The selector has to be an inline immediate, so we can use whatever for
// the other operands.
@@ -853,7 +853,7 @@ bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
} else {
Optional<ValueAndVReg> ConstVal =
- getConstantVRegValWithLookThrough(Val, *MRI, true, true);
+ getIConstantVRegValWithLookThrough(Val, *MRI);
// If the value written is an inline immediate, we can get away without a
// copy to m0.
@@ -928,7 +928,7 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
case Intrinsic::amdgcn_if_break: {
MachineBasicBlock *BB = I.getParent();
- // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick
+ // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
// SelectionDAG uses for wave32 vs wave64.
BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
.add(I.getOperand(0))
@@ -1130,7 +1130,7 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
return false;
Optional<ValueAndVReg> Arg =
- getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
+ getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
if (Arg.hasValue()) {
const int64_t Value = Arg.getValue().Value.getSExtValue();
@@ -1242,7 +1242,7 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
}
bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
- // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick
+ // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
// SelectionDAG uses for wave32 vs wave64.
MachineBasicBlock *BB = MI.getParent();
BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
@@ -1826,8 +1826,9 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
.add(I.getOperand(2))
.add(I.getOperand(3));
- bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
- constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
+ bool Ret = false;
+ Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
+ Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
I.eraseFromParent();
return Ret;
}
@@ -2387,7 +2388,7 @@ void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
STI.ldsRequiresM0Init()) {
MachineBasicBlock *BB = I.getParent();
- // If DS instructions require M0 initializtion, insert it before selecting.
+ // If DS instructions require M0 initialization, insert it before selecting.
BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addImm(-1);
}
@@ -2465,6 +2466,27 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
+static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
+ if (Reg.isPhysical())
+ return false;
+
+ MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
+ const unsigned Opcode = MI.getOpcode();
+
+ if (Opcode == AMDGPU::COPY)
+ return isVCmpResult(MI.getOperand(1).getReg(), MRI);
+
+ if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
+ Opcode == AMDGPU::G_XOR)
+ return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
+ isVCmpResult(MI.getOperand(2).getReg(), MRI);
+
+ if (Opcode == TargetOpcode::G_INTRINSIC)
+ return MI.getIntrinsicID() == Intrinsic::amdgcn_class;
+
+ return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
+}
+
bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineOperand &CondOp = I.getOperand(0);
@@ -2488,11 +2510,22 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
BrOpcode = AMDGPU::S_CBRANCH_SCC1;
ConstrainRC = &AMDGPU::SReg_32RegClass;
} else {
- // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
- // We sort of know that a VCC producer based on the register bank, that ands
- // inactive lanes with 0. What if there was a logical operation with vcc
- // producers in different blocks/with different exec masks?
// FIXME: Should scc->vcc copies and with exec?
+
+    // Unless the value of CondReg is the result of a V_CMP* instruction, we
+    // need to insert an and with exec.
+ if (!isVCmpResult(CondReg, *MRI)) {
+ const bool Is64 = STI.isWave64();
+ const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
+ const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
+
+ Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
+ BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
+ .addReg(CondReg)
+ .addReg(Exec);
+ CondReg = TmpReg;
+ }
+
CondPhysReg = TRI.getVCC();
BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
ConstrainRC = TRI.getBoolRC();
@@ -3216,6 +3249,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case AMDGPU::G_SBFX:
case AMDGPU::G_UBFX:
return selectG_SBFX_UBFX(I);
+ case AMDGPU::G_SI_CALL:
+ I.setDesc(TII.get(AMDGPU::SI_CALL));
+ return true;
default:
return selectImpl(I, *CoverageInfo);
}
@@ -3977,8 +4013,8 @@ AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
return {Root, 0};
MachineOperand &RHS = RootI->getOperand(2);
- Optional<ValueAndVReg> MaybeOffset
- = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
+ Optional<ValueAndVReg> MaybeOffset =
+ getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
if (!MaybeOffset)
return {Root, 0};
return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
@@ -4306,8 +4342,8 @@ AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
/// Get an immediate that must be 32-bits, and treated as zero extended.
static Optional<uint64_t> getConstantZext32Val(Register Reg,
const MachineRegisterInfo &MRI) {
- // getConstantVRegVal sexts any values, so see if that matters.
- Optional<int64_t> OffsetVal = getConstantVRegSExtVal(Reg, MRI);
+ // getIConstantVRegVal sexts any values, so see if that matters.
+ Optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
if (!OffsetVal || !isInt<32>(*OffsetVal))
return None;
return Lo_32(*OffsetVal);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index cb05a1cb6369..b70e6883bae2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -14,10 +14,7 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
-#include "llvm/CodeGen/Register.h"
#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
namespace {
#define GET_GLOBALISEL_PREDICATE_BITSET
@@ -135,7 +132,6 @@ private:
bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const;
void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI,
SmallVectorImpl<GEPInfo> &AddrInfo) const;
- bool selectSMRD(MachineInstr &I, ArrayRef<GEPInfo> AddrInfo) const;
void initM0(MachineInstr &I) const;
bool selectG_LOAD_STORE_ATOMICRMW(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 119c4089d6c2..bad9f6265b36 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -237,6 +237,36 @@ def select_oneuse : HasOneUseTernaryOp<select>;
def AMDGPUmul_u24_oneuse : HasOneUseBinOp<AMDGPUmul_u24>;
def AMDGPUmul_i24_oneuse : HasOneUseBinOp<AMDGPUmul_i24>;
+//===----------------------------------------------------------------------===//
+// PatFrags for shifts
+//===----------------------------------------------------------------------===//
+
+// Constrained shift PatFrags.
+foreach width = [16, 32, 64] in {
+defvar mask = !sub(width, 1);
+
+def cshl_#width : PatFrags<(ops node:$src0, node:$src1),
+ [(shl node:$src0, node:$src1), (shl node:$src0, (and node:$src1, mask))]>;
+defvar cshl = !cast<SDPatternOperator>("cshl_"#width);
+def cshl_#width#_oneuse : HasOneUseBinOp<cshl>;
+def clshl_rev_#width : PatFrag <(ops node:$src0, node:$src1),
+ (cshl $src1, $src0)>;
+
+def csrl_#width : PatFrags<(ops node:$src0, node:$src1),
+ [(srl node:$src0, node:$src1), (srl node:$src0, (and node:$src1, mask))]>;
+defvar csrl = !cast<SDPatternOperator>("csrl_"#width);
+def csrl_#width#_oneuse : HasOneUseBinOp<csrl>;
+def clshr_rev_#width : PatFrag <(ops node:$src0, node:$src1),
+ (csrl $src1, $src0)>;
+
+def csra_#width : PatFrags<(ops node:$src0, node:$src1),
+ [(sra node:$src0, node:$src1), (sra node:$src0, (and node:$src1, mask))]>;
+defvar csra = !cast<SDPatternOperator>("csra_"#width);
+def csra_#width#_oneuse : HasOneUseBinOp<csra>;
+def cashr_rev_#width : PatFrag <(ops node:$src0, node:$src1),
+ (csra $src1, $src0)>;
+} // end foreach width
+
def srl_16 : PatFrag<
(ops node:$src0), (srl_oneuse node:$src0, (i32 16))
>;
@@ -422,6 +452,16 @@ def zextloadi16_#as : PatFrag<(ops node:$ptr), (zextload node:$ptr)> {
let MemoryVT = i16;
}
+def atomic_load_8_#as : PatFrag<(ops node:$ptr), (atomic_load_8 node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i8;
+}
+
+def atomic_load_16_#as : PatFrag<(ops node:$ptr), (atomic_load_16 node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i16;
+}
+
def atomic_load_32_#as : PatFrag<(ops node:$ptr), (atomic_load_32 node:$ptr)> {
let IsAtomic = 1;
let MemoryVT = i32;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 4971b010870d..9e86bd0c2b97 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -95,10 +95,8 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
bool Changed = false;
for (auto &BB : F)
- for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
- Instruction *I = &*BI++;
- Changed |= visit(*I);
- }
+ for (Instruction &I : llvm::make_early_inc_range(BB))
+ Changed |= visit(I);
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index c1a9b30a509e..1f898f2ba8b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -59,7 +59,7 @@ static LLT getPow2ScalarType(LLT Ty) {
return LLT::scalar(Pow2Bits);
}
-/// \returs true if this is an odd sized vector which should widen by adding an
+/// \returns true if this is an odd sized vector which should widen by adding an
/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
/// excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
@@ -532,10 +532,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Full set of gfx9 features.
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32, S16, V2S16})
- .clampScalar(0, S16, S32)
+ .minScalar(0, S16)
.clampMaxNumElements(0, S16, 2)
- .scalarize(0)
- .widenScalarToNextPow2(0, 32);
+ .widenScalarToNextMultipleOf(0, 32)
+ .maxScalar(0, S32)
+ .scalarize(0);
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
.legalFor({S32, S16, V2S16}) // Clamp modifier
@@ -547,9 +548,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
} else if (ST.has16BitInsts()) {
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32, S16})
- .clampScalar(0, S16, S32)
- .scalarize(0)
- .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
+ .minScalar(0, S16)
+ .widenScalarToNextMultipleOf(0, 32)
+ .maxScalar(0, S32)
+ .scalarize(0);
// Technically the saturating operations require clamp bit support, but this
// was introduced at the same time as 16-bit operations.
@@ -569,6 +571,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
} else {
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32})
+ .widenScalarToNextMultipleOf(0, 32)
.clampScalar(0, S32, S32)
.scalarize(0);
@@ -603,7 +606,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
.legalFor({S32})
- .maxScalarOrElt(0, S32);
+ .maxScalar(0, S32);
if (ST.hasVOP3PInsts()) {
Mulh
@@ -812,10 +815,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// TODO: Split s1->s64 during regbankselect for VALU.
auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
- .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
- .lowerFor({{S32, S64}})
- .lowerIf(typeIs(1, S1))
- .customFor({{S64, S64}});
+ .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
+ .lowerIf(typeIs(1, S1))
+ .customFor({{S32, S64}, {S64, S64}});
if (ST.has16BitInsts())
IToFP.legalFor({{S16, S16}});
IToFP.clampScalar(1, S32, S64)
@@ -941,7 +943,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(1, S32, S64)
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32)
- .lower();
+ .custom();
// The 64-bit versions produce 32-bit results, but only on the SALU.
getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
@@ -1266,7 +1268,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// May need relegalization for the scalars.
return std::make_pair(0, EltTy);
})
- .lowerIfMemSizeNotPow2()
.minScalar(0, S32)
.narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
.widenScalarToNextPow2(0)
@@ -1318,7 +1319,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
}
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
- if (ST.hasLDSFPAtomics()) {
+ if (ST.hasLDSFPAtomicAdd()) {
Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
if (ST.hasGFX90AInsts())
Atomic.legalFor({{S64, LocalPtr}});
@@ -1628,6 +1629,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(0, S32, S64)
.lower();
+ getActionDefinitionsBuilder({G_ROTR, G_ROTL})
+ .scalarize(0)
+ .lower();
+
// TODO: Only Try to form v2s16 with legal packed instructions.
getActionDefinitionsBuilder(G_FSHR)
.legalFor({{S32, S32}})
@@ -1681,6 +1686,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// TODO: Implement
G_FMINIMUM, G_FMAXIMUM}).lower();
+ getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
+ .lower();
+
getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
@@ -1760,6 +1768,9 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
return legalizeFFloor(MI, MRI, B);
case TargetOpcode::G_BUILD_VECTOR:
return legalizeBuildVector(MI, MRI, B);
+ case TargetOpcode::G_CTLZ:
+ case TargetOpcode::G_CTTZ:
+ return legalizeCTLZ_CTTZ(MI, MRI, B);
default:
return false;
}
@@ -2065,23 +2076,53 @@ bool AMDGPULegalizerInfo::legalizeITOFP(
const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);
- assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
+ assert(MRI.getType(Src) == S64);
auto Unmerge = B.buildUnmerge({S32, S32}, Src);
+ auto ThirtyTwo = B.buildConstant(S32, 32);
- auto CvtHi = Signed ?
- B.buildSITOFP(S64, Unmerge.getReg(1)) :
- B.buildUITOFP(S64, Unmerge.getReg(1));
+ if (MRI.getType(Dst) == S64) {
+ auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
+ : B.buildUITOFP(S64, Unmerge.getReg(1));
- auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
+ auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
+ auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
+ .addUse(CvtHi.getReg(0))
+ .addUse(ThirtyTwo.getReg(0));
- auto ThirtyTwo = B.buildConstant(S32, 32);
- auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
- .addUse(CvtHi.getReg(0))
- .addUse(ThirtyTwo.getReg(0));
+ // TODO: Should this propagate fast-math-flags?
+ B.buildFAdd(Dst, LdExp, CvtLo);
+ MI.eraseFromParent();
+ return true;
+ }
- // TODO: Should this propagate fast-math-flags?
- B.buildFAdd(Dst, LdExp, CvtLo);
+ assert(MRI.getType(Dst) == S32);
+
+ auto One = B.buildConstant(S32, 1);
+
+ MachineInstrBuilder ShAmt;
+ if (Signed) {
+ auto ThirtyOne = B.buildConstant(S32, 31);
+ auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
+ auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
+ auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
+ auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
+ /*HasSideEffects=*/false)
+ .addUse(Unmerge.getReg(1));
+ auto LS2 = B.buildSub(S32, LS, One);
+ ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
+ } else
+ ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
+ auto Norm = B.buildShl(S64, Src, ShAmt);
+ auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
+ auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
+ auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
+ auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
+ auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
+ B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
+ /*HasSideEffects=*/false)
+ .addUse(FVal.getReg(0))
+ .addUse(Scale.getReg(0));
MI.eraseFromParent();
return true;
}
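The S32 destination path added above normalizes the 64-bit source before converting: shift left by the leading-zero count of the high word, fold any nonzero low bits into a sticky bit, convert the high 32 bits, then rescale with ldexp. A minimal scalar sketch of the unsigned case, in plain C++ rather than MIR, with __builtin_clz and std::ldexp standing in for G_CTLZ and llvm.amdgcn.ldexp (illustration only):

#include <cmath>
#include <cstdint>

// Scalar model of the unsigned s64 -> s32 lowering sketched above.
float u64ToF32(uint64_t X) {
  uint32_t Hi = uint32_t(X >> 32);
  unsigned ShAmt = Hi ? unsigned(__builtin_clz(Hi)) : 32; // G_CTLZ of the high half
  uint64_t Norm = X << ShAmt;                   // normalize into the top bits
  uint32_t NormHi = uint32_t(Norm >> 32);
  uint32_t Sticky = uint32_t(Norm) ? 1u : 0u;   // umin(1, lo): sticky round bit
  float FVal = float(NormHi | Sticky);          // 32-bit uitofp
  return std::ldexp(FVal, 32 - int(ShAmt));     // undo the normalization
}

The sticky bit only perturbs bit 0 of a word whose top bit is already set, so it influences rounding without double-rounding the value.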
@@ -2183,9 +2224,9 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
// FIXME: Artifact combiner probably should have replaced the truncated
// constant before this, so we shouldn't need
- // getConstantVRegValWithLookThrough.
+ // getIConstantVRegValWithLookThrough.
Optional<ValueAndVReg> MaybeIdxVal =
- getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+ getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
return true;
const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
@@ -2215,9 +2256,9 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
// FIXME: Artifact combiner probably should have replaced the truncated
// constant before this, so we shouldn't need
- // getConstantVRegValWithLookThrough.
+ // getIConstantVRegValWithLookThrough.
Optional<ValueAndVReg> MaybeIdxVal =
- getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
+ getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
return true;
@@ -2379,43 +2420,36 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
}
// TODO: We could emit code to handle the initialization somewhere.
- if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
- const SITargetLowering *TLI = ST.getTargetLowering();
- if (!TLI->shouldUseLDSConstAddress(GV)) {
- MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
- return true; // Leave in place;
- }
+ // We ignore the initializer for now and legalize it to allow selection.
+ // The initializer will be diagnosed during assembly emission anyway.
+ const SITargetLowering *TLI = ST.getTargetLowering();
+ if (!TLI->shouldUseLDSConstAddress(GV)) {
+ MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
+ return true; // Leave in place;
+ }
- if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
- Type *Ty = GV->getValueType();
- // HIP uses an unsized array `extern __shared__ T s[]` or similar
- // zero-sized type in other languages to declare the dynamic shared
- // memory which size is not known at the compile time. They will be
- // allocated by the runtime and placed directly after the static
- // allocated ones. They all share the same offset.
- if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
- // Adjust alignment for that dynamic shared memory array.
- MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
- LLT S32 = LLT::scalar(32);
- auto Sz =
- B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
- B.buildIntToPtr(DstReg, Sz);
- MI.eraseFromParent();
- return true;
- }
+ if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
+ Type *Ty = GV->getValueType();
+ // HIP uses an unsized array `extern __shared__ T s[]` or similar
+ // zero-sized type in other languages to declare the dynamic shared
+ // memory whose size is not known at compile time. They will be
+ // allocated by the runtime and placed directly after the static
+ // allocated ones. They all share the same offset.
+ if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
+ // Adjust alignment for that dynamic shared memory array.
+ MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
+ LLT S32 = LLT::scalar(32);
+ auto Sz =
+ B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
+ B.buildIntToPtr(DstReg, Sz);
+ MI.eraseFromParent();
+ return true;
}
-
- B.buildConstant(
- DstReg,
- MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
- MI.eraseFromParent();
- return true;
}
- const Function &Fn = MF.getFunction();
- DiagnosticInfoUnsupported BadInit(
- Fn, "unsupported initializer for address space", MI.getDebugLoc());
- Fn.getContext().diagnose(BadInit);
+ B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
+ *cast<GlobalVariable>(GV)));
+ MI.eraseFromParent();
return true;
}
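The dynamic-LDS branch kept above corresponds to source like the following hypothetical HIP kernel (all names invented for illustration): the extern __shared__ array has zero static size, so the pass only records its alignment and lets llvm.amdgcn.groupstaticsize supply the runtime offset past the statically allocated LDS.

// Hypothetical HIP example of the zero-sized dynamic shared array described
// in the comment above; the runtime sizes it from the launch's dynamic-LDS
// byte count and places it after the statically allocated LDS.
extern __shared__ float dynSmem[];

__global__ void scaleViaLDS(float *Out, float S, int N) {
  int I = threadIdx.x;
  if (I < N) {
    dynSmem[I] = Out[I] * S;
    __syncthreads();
    Out[I] = dynSmem[I];
  }
}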
@@ -2446,7 +2480,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
if (Ty.getSizeInBits() == 32) {
- // Truncate if this is a 32-bit constant adrdess.
+ // Truncate if this is a 32-bit constant address.
auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
B.buildExtract(DstReg, Load, 0);
} else
@@ -2745,11 +2779,32 @@ bool AMDGPULegalizerInfo::legalizeBuildVector(
return true;
}
+// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
+// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
+// case with a single min instruction instead of a compare+select.
+bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Src);
+
+ unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
+ ? AMDGPU::G_AMDGPU_FFBH_U32
+ : AMDGPU::G_AMDGPU_FFBL_B32;
+ auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
+ B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
+
+ MI.eraseFromParent();
+ return true;
+}
+
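As the comment above says, lowering straight to the target's ffbh/ffbl nodes lets one unsigned min repair the zero-input case. A scalar sketch of the resulting semantics, assuming the hardware behaviour of returning all-ones for a zero source (plain C++, illustration only):

#include <cstdint>

// Model of ffbh_u32: leading-zero count, all-ones for a zero input.
static uint32_t ffbh_u32(uint32_t X) {
  return X ? uint32_t(__builtin_clz(X)) : 0xffffffffu;
}

// G_CTLZ lowered as above: umin folds the zero case without compare+select.
uint32_t ctlz32(uint32_t X) {
  uint32_t Raw = ffbh_u32(X);
  return Raw < 32u ? Raw : 32u;   // B.buildUMin(Dst, Tmp, bit width)
}

For a nonzero input the min is a no-op; for zero it clamps the all-ones result to the bit width, which is exactly the value G_CTLZ/G_CTTZ require.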
// Check that this is a G_XOR x, -1
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
if (MI.getOpcode() != TargetOpcode::G_XOR)
return false;
- auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
+ auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
return ConstVal && *ConstVal == -1;
}
@@ -2770,7 +2825,7 @@ verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
return nullptr;
// We're deleting the def of this value, so we need to remove it.
- UseMI->eraseFromParent();
+ eraseInstr(*UseMI, MRI);
UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
Negated = true;
@@ -2836,6 +2891,20 @@ bool AMDGPULegalizerInfo::loadInputValue(
LLT ArgTy;
std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
+ if (!Arg) {
+ if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
+ // The intrinsic may appear when we have a 0 sized kernarg segment, in which
+ // case the pointer argument may be missing and we use null.
+ B.buildConstant(DstReg, 0);
+ return true;
+ }
+
+ // It's undefined behavior if a function marked with the amdgpu-no-*
+ // attributes uses the corresponding intrinsic.
+ B.buildUndef(DstReg);
+ return true;
+ }
+
if (!Arg->isRegister() || !Arg->getRegister().isValid())
return false; // TODO: Handle these
return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
@@ -2913,7 +2982,7 @@ void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}
-// Build integer reciprocal sequence arounud V_RCP_IFLAG_F32
+// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
@@ -2982,7 +3051,6 @@ void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
- auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
@@ -2993,9 +3061,7 @@ void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
auto Zero32 = B.buildConstant(S32, 0);
auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
- auto Add2_HiC =
- B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
- auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
+ auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
auto UnmergeNumer = B.buildUnmerge(S32, Numer);
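The simplified carry chain above is ordinary 32-bit add-with-carry: once Add1_Hi already absorbs the carry out of Add1_Lo, a separate no-carry copy of the high half is redundant. A scalar restatement of that pattern (illustrative helper, not an LLVM API):

#include <cstdint>

// 64-bit add assembled from 32-bit halves: the high half consumes the
// low half's carry-out directly (UAddo feeding UAdde).
uint64_t add64ViaHalves(uint32_t ALo, uint32_t AHi, uint32_t BLo, uint32_t BHi) {
  uint32_t Lo = ALo + BLo;
  uint32_t Carry = Lo < ALo ? 1u : 0u;   // carry-out of the low add
  uint32_t Hi = AHi + BHi + Carry;       // wraps mod 2^32, as in the MIR
  return (uint64_t(Hi) << 32) | Lo;
}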
@@ -3701,11 +3767,11 @@ void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
unsigned ImmOffset, Register VIndex,
MachineRegisterInfo &MRI) const {
Optional<ValueAndVReg> MaybeVOffsetVal =
- getConstantVRegValWithLookThrough(VOffset, MRI);
+ getIConstantVRegValWithLookThrough(VOffset, MRI);
Optional<ValueAndVReg> MaybeSOffsetVal =
- getConstantVRegValWithLookThrough(SOffset, MRI);
+ getIConstantVRegValWithLookThrough(SOffset, MRI);
Optional<ValueAndVReg> MaybeVIndexVal =
- getConstantVRegValWithLookThrough(VIndex, MRI);
+ getIConstantVRegValWithLookThrough(VIndex, MRI);
// If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
// update the MMO with that offset. The stride is unknown so we can only do
// this if VIndex is constant 0.
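A small model of the rule stated in that comment, with std::optional standing in for "this operand is a known constant" (names invented; illustration only):

#include <cstdint>
#include <optional>

// Only fold when every component is known and VIndex is a constant zero,
// since the per-element stride is unknown at this point.
std::optional<uint64_t> foldBufferOffset(std::optional<uint64_t> VOff,
                                         std::optional<uint64_t> SOff,
                                         uint64_t ImmOff,
                                         std::optional<uint64_t> VIndex) {
  if (VOff && SOff && VIndex && *VIndex == 0)
    return *VOff + *SOff + ImmOff;
  return std::nullopt;   // leave the MMO offset conservatively unknown
}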
@@ -4246,8 +4312,8 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
-/// the intrinsic's arguments. In cases like a16 addreses, this requires padding
-/// now unnecessary arguments with $noreg.
+/// the intrinsic's arguments. In cases like a16 addresses, this requires
+/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
@@ -4339,8 +4405,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
if (ConstantLod->isZero() || ConstantLod->isNegative()) {
// Set new opcode to _lz variant of _l, and change the intrinsic ID.
const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
- AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
- Intr->Dim);
+ AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
+ Intr->Dim);
// The starting indexes should remain in the same place.
--CorrectedNumVAddrs;
@@ -4518,7 +4584,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
MI.getOperand(0).setReg(NewResultReg);
// In the IR, TFE is supposed to be used with a 2 element struct return
- // type. The intruction really returns these two values in one contiguous
+ // type. The instruction really returns these two values in one contiguous
// register, with one additional dword beyond the loaded data. Rewrite the
// return type to use a single register result.
@@ -4730,7 +4796,7 @@ bool AMDGPULegalizerInfo::legalizeTrapHsa(
bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
- // Is non-HSA path or trap-handler disabled? then, report a warning
+ // Is non-HSA path or trap-handler disabled? Then, report a warning
// accordingly
if (!ST.isTrapHandlerEnabled() ||
ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
@@ -4771,12 +4837,27 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
return false;
}
- bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
- bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
- unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
- : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
- : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
- : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
+ const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
+ const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
+ const unsigned NumVDataDwords = 4;
+ const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
+ const bool UseNSA =
+ ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
+ const unsigned BaseOpcodes[2][2] = {
+ {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
+ {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
+ AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
+ int Opcode;
+ if (UseNSA) {
+ Opcode =
+ AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
+ NumVDataDwords, NumVAddrDwords);
+ } else {
+ Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
+ AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
+ PowerOf2Ceil(NumVAddrDwords));
+ }
+ assert(Opcode != -1);
SmallVector<Register, 12> Ops;
if (Is64) {
@@ -4813,6 +4894,14 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
packLanes(RayInvDir);
}
+ if (!UseNSA) {
+ // Build a single vector containing all the operands so far prepared.
+ LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
+ Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
+ Ops.clear();
+ Ops.push_back(MergedOps);
+ }
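The encoding choice above reduces to a dword-count check; a sketch of that decision with the counts copied from the patch and the NSA limit treated as an opaque subtarget value (illustrative helpers, not LLVM API):

// Restatement of the NSA decision made above, for illustration.
unsigned numVAddrDwords(bool Is64, bool IsA16) {
  return IsA16 ? (Is64 ? 9u : 8u) : (Is64 ? 12u : 11u);
}

bool useNSAEncoding(bool HasNSA, unsigned NSAMaxSize, bool Is64, bool IsA16) {
  // Otherwise fall back to the default encoding: a single merged vector
  // operand, padded to a power-of-two dword count.
  return HasNSA && numVAddrDwords(Is64, IsA16) <= NSAMaxSize;
}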
+
auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
.addDef(DstReg)
.addImm(Opcode);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index d4fefd89b487..7faf0436f995 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -89,6 +89,8 @@ public:
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
const ArgDescriptor *Arg,
@@ -107,8 +109,8 @@ public:
Register Den) const;
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg,
- Register DstRemReg, Register Numer,
- Register Denom) const;
+ Register DstRemReg, Register Num,
+ Register Den) const;
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 1ee6933bd7ff..49cf6db5197f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -54,15 +54,14 @@ private:
bool useNativeFunc(const StringRef F) const;
- // Return a pointer (pointer expr) to the function if function defintion with
+ // Return a pointer (pointer expr) to the function if function definition with
// "FuncName" exists. It may create a new function prototype in pre-link mode.
FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);
// Replace a normal function with its native version.
bool replaceWithNative(CallInst *CI, const FuncInfo &FInfo);
- bool parseFunctionName(const StringRef& FMangledName,
- FuncInfo *FInfo=nullptr /*out*/);
+ bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);
bool TDOFold(CallInst *CI, const FuncInfo &FInfo);
@@ -87,9 +86,9 @@ private:
bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);
// evaluate calls if calls' arguments are constants.
- bool evaluateScalarMathFunc(FuncInfo &FInfo, double& Res0,
+ bool evaluateScalarMathFunc(const FuncInfo &FInfo, double& Res0,
double& Res1, Constant *copr0, Constant *copr1, Constant *copr2);
- bool evaluateCall(CallInst *aCI, FuncInfo &FInfo);
+ bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo);
// exp
bool fold_exp(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
@@ -116,7 +115,8 @@ private:
bool fold_sincos(CallInst * CI, IRBuilder<> &B, AliasAnalysis * AA);
// __read_pipe/__write_pipe
- bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, FuncInfo &FInfo);
+ bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
+ const FuncInfo &FInfo);
// llvm.amdgcn.wavefrontsize
bool fold_wavefrontsize(CallInst *CI, IRBuilder<> &B);
@@ -125,7 +125,7 @@ private:
BasicBlock::iterator getEntryIns(CallInst * UI);
// Insert an Alloc instruction.
AllocaInst* insertAlloca(CallInst * UI, IRBuilder<> &B, const char *prefix);
- // Get a scalar native builtin signle argument FP function
+ // Get a scalar native builtin single argument FP function
FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);
protected:
@@ -466,9 +466,9 @@ FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) {
: AMDGPULibFunc::getFunction(M, fInfo);
}
-bool AMDGPULibCalls::parseFunctionName(const StringRef& FMangledName,
- FuncInfo *FInfo) {
- return AMDGPULibFunc::parse(FMangledName, *FInfo);
+bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
+ FuncInfo &FInfo) {
+ return AMDGPULibFunc::parse(FMangledName, FInfo);
}
bool AMDGPULibCalls::isUnsafeMath(const CallInst *CI) const {
@@ -529,7 +529,7 @@ bool AMDGPULibCalls::useNative(CallInst *aCI) {
Function *Callee = aCI->getCalledFunction();
FuncInfo FInfo;
- if (!parseFunctionName(Callee->getName(), &FInfo) || !FInfo.isMangled() ||
+ if (!parseFunctionName(Callee->getName(), FInfo) || !FInfo.isMangled() ||
FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()) ||
!(AllNative || useNativeFunc(FInfo.getName()))) {
@@ -558,7 +558,7 @@ bool AMDGPULibCalls::useNative(CallInst *aCI) {
// for such cases where N is the size in bytes of the type (N = 1, 2, 4, 8, ...,
// 128). The same for __read_pipe_4, write_pipe_2, and write_pipe_4.
bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
- FuncInfo &FInfo) {
+ const FuncInfo &FInfo) {
auto *Callee = CI->getCalledFunction();
if (!Callee->isDeclaration())
return false;
@@ -567,7 +567,7 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
auto *M = Callee->getParent();
auto &Ctx = M->getContext();
std::string Name = std::string(Callee->getName());
- auto NumArg = CI->getNumArgOperands();
+ auto NumArg = CI->arg_size();
if (NumArg != 4 && NumArg != 6)
return false;
auto *PacketSize = CI->getArgOperand(NumArg - 2);
@@ -584,7 +584,7 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
PtrElemTy = Type::getIntNTy(Ctx, Size * 8);
else
PtrElemTy = FixedVectorType::get(Type::getInt64Ty(Ctx), Size / 8);
- unsigned PtrArgLoc = CI->getNumArgOperands() - 3;
+ unsigned PtrArgLoc = CI->arg_size() - 3;
auto PtrArg = CI->getArgOperand(PtrArgLoc);
unsigned PtrArgAS = PtrArg->getType()->getPointerAddressSpace();
auto *PtrTy = llvm::PointerType::get(PtrElemTy, PtrArgAS);
@@ -644,11 +644,11 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
}
FuncInfo FInfo;
- if (!parseFunctionName(Callee->getName(), &FInfo))
+ if (!parseFunctionName(Callee->getName(), FInfo))
return false;
// Further check the number of arguments to see if they match.
- if (CI->getNumArgOperands() != FInfo.getNumArgs())
+ if (CI->arg_size() != FInfo.getNumArgs())
return false;
if (TDOFold(CI, FInfo))
@@ -660,7 +660,7 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
if (isUnsafeMath(CI) && evaluateCall(CI, FInfo))
return true;
- // Specilized optimizations for each function call
+ // Specialized optimizations for each function call
switch (FInfo.getId()) {
case AMDGPULibFunc::EI_RECIP:
// skip vector function
@@ -1231,7 +1231,7 @@ bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B,
return false;
}
-// Get a scalar native builtin signle argument FP function
+// Get a scalar native builtin single argument FP function
FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
const FuncInfo &FInfo) {
if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()))
@@ -1371,8 +1371,7 @@ bool AMDGPULibCalls::fold_wavefrontsize(CallInst *CI, IRBuilder<> &B) {
StringRef CPU = TM->getTargetCPU();
StringRef Features = TM->getTargetFeatureString();
if ((CPU.empty() || CPU.equals_insensitive("generic")) &&
- (Features.empty() ||
- Features.find_insensitive("wavefrontsize") == StringRef::npos))
+ (Features.empty() || !Features.contains_insensitive("wavefrontsize")))
return false;
Function *F = CI->getParent()->getParent();
@@ -1410,7 +1409,7 @@ AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B,
return Alloc;
}
-bool AMDGPULibCalls::evaluateScalarMathFunc(FuncInfo &FInfo,
+bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,
double& Res0, double& Res1,
Constant *copr0, Constant *copr1,
Constant *copr2) {
@@ -1605,8 +1604,8 @@ bool AMDGPULibCalls::evaluateScalarMathFunc(FuncInfo &FInfo,
return false;
}
-bool AMDGPULibCalls::evaluateCall(CallInst *aCI, FuncInfo &FInfo) {
- int numArgs = (int)aCI->getNumArgOperands();
+bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
+ int numArgs = (int)aCI->arg_size();
if (numArgs > 3)
return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index 32262ea75fd3..aa7c7ff2e388 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -352,7 +352,7 @@ const unsigned UnmangledFuncInfo::TableSize =
static AMDGPULibFunc::Param getRetType(AMDGPULibFunc::EFuncId id,
const AMDGPULibFunc::Param (&Leads)[2]) {
AMDGPULibFunc::Param Res = Leads[0];
- // TBD - This switch may require to be extended for other intriniscs
+ // TBD - This switch may require to be extended for other intrinsics
switch (id) {
case AMDGPULibFunc::EI_SINCOS:
Res.PtrKind = AMDGPULibFunc::BYVALUE;
@@ -455,7 +455,8 @@ AMDGPULibFunc::Param ParamIterator::getNextParam() {
break;
}
- default: llvm_unreachable("Unhandeled param rule");
+ default:
+ llvm_unreachable("Unhandled param rule");
}
}
++Index;
@@ -747,7 +748,8 @@ static const char *getItaniumTypeName(AMDGPULibFunc::EType T) {
case AMDGPULibFunc::IMG3D: return "11ocl_image3d";
case AMDGPULibFunc::SAMPLER: return "11ocl_sampler";
case AMDGPULibFunc::EVENT: return "9ocl_event";
- default: llvm_unreachable("Unhandeled param type");
+ default:
+ llvm_unreachable("Unhandled param type");
}
return nullptr;
}
@@ -761,7 +763,7 @@ namespace {
// substitution candidates from the grammar, but are explicitly excluded:
// 1. <builtin-type> other than vendor extended types ..."
-// For the purpose of functions the following productions make sence for the
+// For the purpose of functions the following productions make sense for the
// substitution:
// <type> ::= <builtin-type>
// ::= <class-enum-type>
@@ -774,11 +776,11 @@ namespace {
// using <class-enum-type> production rule they're not used for substitution
// because clang considers them builtin types.
//
-// DvNN_ type is GCC extension for vectors and is a subject for the substitution.
-
+// DvNN_ type is a GCC extension for vectors and is subject to the
+// substitution.
class ItaniumMangler {
- SmallVector<AMDGPULibFunc::Param, 10> Str; // list of accumulated substituions
+ SmallVector<AMDGPULibFunc::Param, 10> Str; // list of accumulated substitutions
bool UseAddrSpace;
int findSubst(const AMDGPULibFunc::Param& P) const {
@@ -902,7 +904,7 @@ static Type* getIntrinsicParamType(
case AMDGPULibFunc::EVENT:
T = StructType::create(C,"ocl_event")->getPointerTo(); break;
default:
- llvm_unreachable("Unhandeled param type");
+ llvm_unreachable("Unhandled param type");
return nullptr;
}
if (P.VectorSize > 1)
@@ -990,10 +992,8 @@ FunctionCallee AMDGPULibFunc::getOrInsertFunction(Module *M,
} else {
AttributeList Attr;
LLVMContext &Ctx = M->getContext();
- Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex,
- Attribute::ReadOnly);
- Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex,
- Attribute::NoUnwind);
+ Attr = Attr.addFnAttribute(Ctx, Attribute::ReadOnly);
+ Attr = Attr.addFnAttribute(Ctx, Attribute::NoUnwind);
C = M->getOrInsertFunction(FuncName, FuncTy, Attr);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
index 714e74faaf13..b700dd5aa301 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -76,9 +76,8 @@ bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) {
Intrinsic::ID ID = F.getIntrinsicID();
bool Changed = false;
- for (auto I = F.user_begin(), E = F.user_end(); I != E;) {
- Instruction *Inst = cast<Instruction>(*I);
- ++I;
+ for (User *U : llvm::make_early_inc_range(F.users())) {
+ Instruction *Inst = cast<Instruction>(U);
switch (ID) {
case Intrinsic::memcpy: {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 0f157e53c3db..c34c12ab9fec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -82,9 +82,9 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
nullptr, F.getName() + ".kernarg.segment");
- KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
- KernArgSegment->addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
+ KernArgSegment->addRetAttr(Attribute::NonNull);
+ KernArgSegment->addRetAttr(
+ Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
uint64_t ExplicitArgOffset = 0;
@@ -232,8 +232,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
}
}
- KernArgSegment->addAttribute(
- AttributeList::ReturnIndex,
+ KernArgSegment->addRetAttr(
Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 70ecea8dbc3e..12d6d35a6917 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -42,6 +42,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -118,7 +119,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
// The llvm.amdgcn.module.lds instance is implicitly used by all kernels
// that might call a function which accesses a field within it. This is
// presently approximated to 'all kernels' if there are any such functions
- // in the module. This implicit use is reified as an explicit use here so
+ // in the module. This implicit use is redefined as an explicit use here so
// that later passes, specifically PromoteAlloca, account for the required
// memory without any knowledge of this transform.
@@ -162,6 +163,9 @@ public:
bool Changed = processUsedLDS(M);
for (Function &F : M.functions()) {
+ if (F.isDeclaration())
+ continue;
+
// Only lower compute kernels' LDS.
if (!AMDGPU::isKernel(F.getCallingConv()))
continue;
@@ -282,6 +286,21 @@ private:
// so remove the variables from these lists before replaceAllUsesWith
removeFromUsedLists(M, LocalVars);
+ // Create alias.scope and their lists. Each field in the new structure
+ // does not alias with all other fields.
+ SmallVector<MDNode *> AliasScopes;
+ SmallVector<Metadata *> NoAliasList;
+ if (LocalVars.size() > 1) {
+ MDBuilder MDB(Ctx);
+ AliasScopes.reserve(LocalVars.size());
+ MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
+ for (size_t I = 0; I < LocalVars.size(); I++) {
+ MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
+ AliasScopes.push_back(Scope);
+ }
+ NoAliasList.append(&AliasScopes[1], AliasScopes.end());
+ }
+
// Replace uses of ith variable with a constantexpr to the ith field of the
// instance that will be allocated by AMDGPUMachineFunction
Type *I32 = Type::getInt32Ty(Ctx);
@@ -313,7 +332,15 @@ private:
uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I);
Align A = commonAlignment(StructAlign, Off);
- refineUsesAlignment(GEP, A, DL);
+
+ if (I)
+ NoAliasList[I - 1] = AliasScopes[I - 1];
+ MDNode *NoAlias =
+ NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
+ MDNode *AliasScope =
+ AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});
+
+ refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
}
// Mark kernels with asm that reads the address of the allocated structure
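The scope bookkeeping above maintains the invariant that NoAliasList is "every scope except the current field's": it starts as scopes 1..n-1 and, just before field I is processed, slot I-1 is overwritten with scope I-1. A tiny model of the per-field lists, with integers standing in for the MDNode scopes (illustration only):

#include <vector>

// For field I: alias.scope = {Scopes[I]}, noalias = all other scopes.
std::vector<int> noAliasFor(const std::vector<int> &Scopes, size_t I) {
  std::vector<int> NoAlias;
  for (size_t J = 0; J < Scopes.size(); ++J)
    if (J != I)
      NoAlias.push_back(Scopes[J]);
  return NoAlias;
}

So with three fields, accesses through field 0 carry noalias {S1, S2}, field 1 carries {S0, S2}, and so on, which is what lets later passes treat accesses to different members of the packed LDS struct as non-aliasing.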
@@ -323,23 +350,39 @@ private:
if (!F) {
IRBuilder<> Builder(Ctx);
SmallPtrSet<Function *, 32> Kernels;
- for (auto &I : M.functions()) {
- Function *Func = &I;
- if (AMDGPU::isKernelCC(Func) && !Kernels.contains(Func)) {
- markUsedByKernel(Builder, Func, SGV);
- Kernels.insert(Func);
+ for (Function &Func : M.functions()) {
+ if (Func.isDeclaration())
+ continue;
+
+ if (AMDGPU::isKernelCC(&Func) && !Kernels.contains(&Func)) {
+ markUsedByKernel(Builder, &Func, SGV);
+ Kernels.insert(&Func);
}
}
}
return true;
}
- void refineUsesAlignment(Value *Ptr, Align A, const DataLayout &DL,
- unsigned MaxDepth = 5) {
- if (!MaxDepth || A == 1)
+ void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
+ MDNode *AliasScope, MDNode *NoAlias,
+ unsigned MaxDepth = 5) {
+ if (!MaxDepth || (A == 1 && !AliasScope))
return;
for (User *U : Ptr->users()) {
+ if (auto *I = dyn_cast<Instruction>(U)) {
+ if (AliasScope && I->mayReadOrWriteMemory()) {
+ MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
+ AS = (AS ? MDNode::getMostGenericAliasScope(AS, AliasScope)
+ : AliasScope);
+ I->setMetadata(LLVMContext::MD_alias_scope, AS);
+
+ MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);
+ NA = (NA ? MDNode::intersect(NA, NoAlias) : NoAlias);
+ I->setMetadata(LLVMContext::MD_noalias, NA);
+ }
+ }
+
if (auto *LI = dyn_cast<LoadInst>(U)) {
LI->setAlignment(std::max(A, LI->getAlign()));
continue;
@@ -364,17 +407,19 @@ private:
if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
APInt Off(BitWidth, 0);
- if (GEP->getPointerOperand() == Ptr &&
- GEP->accumulateConstantOffset(DL, Off)) {
- Align GA = commonAlignment(A, Off.getLimitedValue());
- refineUsesAlignment(GEP, GA, DL, MaxDepth - 1);
+ if (GEP->getPointerOperand() == Ptr) {
+ Align GA;
+ if (GEP->accumulateConstantOffset(DL, Off))
+ GA = commonAlignment(A, Off.getLimitedValue());
+ refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
+ MaxDepth - 1);
}
continue;
}
if (auto *I = dyn_cast<Instruction>(U)) {
if (I->getOpcode() == Instruction::BitCast ||
I->getOpcode() == Instruction::AddrSpaceCast)
- refineUsesAlignment(I, A, DL, MaxDepth - 1);
+ refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias, MaxDepth - 1);
}
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 3dd27f1996d6..3fad7e192195 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -12,11 +12,11 @@
//===----------------------------------------------------------------------===//
//
+#include "AMDGPUMCInstLower.h"
#include "AMDGPUAsmPrinter.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "R600AsmPrinter.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/Constants.h"
@@ -34,36 +34,6 @@
using namespace llvm;
-namespace {
-
-class AMDGPUMCInstLower {
- MCContext &Ctx;
- const TargetSubtargetInfo &ST;
- const AsmPrinter &AP;
-
-public:
- AMDGPUMCInstLower(MCContext &ctx, const TargetSubtargetInfo &ST,
- const AsmPrinter &AP);
-
- bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
-
- /// Lower a MachineInstr to an MCInst
- void lower(const MachineInstr *MI, MCInst &OutMI) const;
-
-};
-
-class R600MCInstLower : public AMDGPUMCInstLower {
-public:
- R600MCInstLower(MCContext &ctx, const R600Subtarget &ST,
- const AsmPrinter &AP);
-
- /// Lower a MachineInstr to an MCInst
- void lower(const MachineInstr *MI, MCInst &OutMI) const;
-};
-
-
-} // End anonymous namespace
-
#include "AMDGPUGenMCPseudoLowering.inc"
AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx,
@@ -150,7 +120,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
// FIXME: Should be able to handle this with emitPseudoExpansionLowering. We
// need to select it to the subtarget specific version, and there's no way to
// do that with a single pseudo source operation.
- if (Opcode == AMDGPU::S_SETPC_B64_return)
+ if (Opcode == AMDGPU::S_SETPC_B64_return ||
+ Opcode == AMDGPU::S_SETPC_B64_return_gfx)
Opcode = AMDGPU::S_SETPC_B64;
else if (Opcode == AMDGPU::SI_CALL) {
// SI_CALL is just S_SWAPPC_B64 with an additional operand to track the
@@ -194,30 +165,6 @@ bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO,
return MCInstLowering.lowerOperand(MO, MCOp);
}
-static const MCExpr *lowerAddrSpaceCast(const TargetMachine &TM,
- const Constant *CV,
- MCContext &OutContext) {
- // TargetMachine does not support llvm-style cast. Use C++-style cast.
- // This is safe since TM is always of type AMDGPUTargetMachine or its
- // derived class.
- auto &AT = static_cast<const AMDGPUTargetMachine&>(TM);
- auto *CE = dyn_cast<ConstantExpr>(CV);
-
- // Lower null pointers in private and local address space.
- // Clang generates addrspacecast for null pointers in private and local
- // address space, which needs to be lowered.
- if (CE && CE->getOpcode() == Instruction::AddrSpaceCast) {
- auto Op = CE->getOperand(0);
- auto SrcAddr = Op->getType()->getPointerAddressSpace();
- if (Op->isNullValue() && AT.getNullPointerValue(SrcAddr) == 0) {
- auto DstAddr = CE->getType()->getPointerAddressSpace();
- return MCConstantExpr::create(AT.getNullPointerValue(DstAddr),
- OutContext);
- }
- }
- return nullptr;
-}
-
const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
if (const MCExpr *E = lowerAddrSpaceCast(TM, CV, OutContext))
return E;
@@ -267,12 +214,18 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
return;
}
+ if (MI->isMetaInstruction()) {
+ if (isVerbose())
+ OutStreamer->emitRawComment(" meta instruction");
+ return;
+ }
+
MCInst TmpInst;
MCInstLowering.lower(MI, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst);
#ifdef EXPENSIVE_CHECKS
- // Sanity-check getInstSizeInBytes on explicitly specified CPUs (it cannot
+ // Check getInstSizeInBytes on explicitly specified CPUs (it cannot
// work correctly for the generic CPU).
//
// The isPseudo check really shouldn't be here, but unfortunately there are
@@ -325,47 +278,3 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
}
}
}
-
-R600MCInstLower::R600MCInstLower(MCContext &Ctx, const R600Subtarget &ST,
- const AsmPrinter &AP) :
- AMDGPUMCInstLower(Ctx, ST, AP) { }
-
-void R600MCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
- OutMI.setOpcode(MI->getOpcode());
- for (const MachineOperand &MO : MI->explicit_operands()) {
- MCOperand MCOp;
- lowerOperand(MO, MCOp);
- OutMI.addOperand(MCOp);
- }
-}
-
-void R600AsmPrinter::emitInstruction(const MachineInstr *MI) {
- const R600Subtarget &STI = MF->getSubtarget<R600Subtarget>();
- R600MCInstLower MCInstLowering(OutContext, STI, *this);
-
- StringRef Err;
- if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) {
- LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext();
- C.emitError("Illegal instruction detected: " + Err);
- MI->print(errs());
- }
-
- if (MI->isBundle()) {
- const MachineBasicBlock *MBB = MI->getParent();
- MachineBasicBlock::const_instr_iterator I = ++MI->getIterator();
- while (I != MBB->instr_end() && I->isInsideBundle()) {
- emitInstruction(&*I);
- ++I;
- }
- } else {
- MCInst TmpInst;
- MCInstLowering.lower(MI, TmpInst);
- EmitToStreamer(*OutStreamer, TmpInst);
- }
-}
-
-const MCExpr *R600AsmPrinter::lowerConstant(const Constant *CV) {
- if (const MCExpr *E = lowerAddrSpaceCast(TM, CV, OutContext))
- return E;
- return AsmPrinter::lowerConstant(CV);
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
new file mode 100644
index 000000000000..0e43b4fe9461
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
@@ -0,0 +1,69 @@
+//===- AMDGPUMCInstLower.h - Lower AMDGPU MachineInstr to an MCInst -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Header for lowering AMDGPU MachineInstrs to their corresponding MCInst.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
+
+#include "AMDGPUTargetMachine.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Support/Casting.h"
+
+namespace llvm {
+class AsmPrinter;
+class MCContext;
+} // namespace llvm
+
+using namespace llvm;
+
+class AMDGPUMCInstLower {
+ MCContext &Ctx;
+ const TargetSubtargetInfo &ST;
+ const AsmPrinter &AP;
+
+public:
+ AMDGPUMCInstLower(MCContext &ctx, const TargetSubtargetInfo &ST,
+ const AsmPrinter &AP);
+
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
+
+ /// Lower a MachineInstr to an MCInst
+ void lower(const MachineInstr *MI, MCInst &OutMI) const;
+};
+
+namespace {
+static inline const MCExpr *lowerAddrSpaceCast(const TargetMachine &TM,
+ const Constant *CV,
+ MCContext &OutContext) {
+ // TargetMachine does not support llvm-style cast. Use C++-style cast.
+ // This is safe since TM is always of type AMDGPUTargetMachine or its
+ // derived class.
+ auto &AT = static_cast<const AMDGPUTargetMachine &>(TM);
+ auto *CE = dyn_cast<ConstantExpr>(CV);
+
+ // Lower null pointers in private and local address space.
+ // Clang generates addrspacecast for null pointers in private and local
+ // address space, which needs to be lowered.
+ if (CE && CE->getOpcode() == Instruction::AddrSpaceCast) {
+ auto Op = CE->getOperand(0);
+ auto SrcAddr = Op->getType()->getPointerAddressSpace();
+ if (Op->isNullValue() && AT.getNullPointerValue(SrcAddr) == 0) {
+ auto DstAddr = CE->getType()->getPointerAddressSpace();
+ return MCConstantExpr::create(AT.getNullPointerValue(DstAddr),
+ OutContext);
+ }
+ }
+ return nullptr;
+}
+} // namespace
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
index a61f1f7b8182..47faa6c72481 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
@@ -16,18 +16,12 @@
#ifndef LLVM_LIB_TARGET_AMDGPUMIRFORMATTER_H
#define LLVM_LIB_TARGET_AMDGPUMIRFORMATTER_H
-#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MIRFormatter.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstdint>
namespace llvm {
class MachineFunction;
-class MachineInstr;
struct PerFunctionMIParsingState;
-struct SlotMapping;
class AMDGPUMIRFormatter final : public MIRFormatter {
public:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
index 697513b5db7a..5d4b007f11e6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -194,7 +194,7 @@ bool PHILinearize::findSourcesFromMBB(MachineBasicBlock *SourceMBB,
}
void PHILinearize::addDest(unsigned DestReg, const DebugLoc &DL) {
- assert(findPHIInfoElement(DestReg) == nullptr && "Dest already exsists");
+ assert(findPHIInfoElement(DestReg) == nullptr && "Dest already exists");
PHISourcesT EmptySet;
PHIInfoElementT *NewElement = new PHIInfoElementT();
NewElement->DestReg = DestReg;
@@ -606,7 +606,7 @@ MRT::initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo,
DenseMap<MachineRegion *, RegionMRT *> &RegionMap) {
for (auto &MFI : MF) {
MachineBasicBlock *ExitMBB = &MFI;
- if (ExitMBB->succ_size() == 0) {
+ if (ExitMBB->succ_empty()) {
return ExitMBB;
}
}
@@ -748,10 +748,8 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
// If we have a successor with a PHI source coming from this MBB, we have to
// add the register as live out.
- for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
- E = MBB->succ_end();
- SI != E; ++SI) {
- for (auto &II : *(*SI)) {
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ for (auto &II : *Succ) {
if (II.isPHI()) {
MachineInstr &PHI = II;
int numPreds = getPHINumInputs(PHI);
@@ -760,7 +758,7 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
unsigned PHIReg = getPHISourceReg(PHI, i);
LLVM_DEBUG(dbgs()
<< "Add LiveOut (PhiSource " << printMBBReference(*MBB)
- << " -> " << printMBBReference(*(*SI))
+ << " -> " << printMBBReference(*Succ)
<< "): " << printReg(PHIReg, TRI) << "\n");
addLiveOut(PHIReg);
}
@@ -813,7 +811,7 @@ void LinearizedRegion::storeLiveOuts(RegionMRT *Region,
} else {
LinearizedRegion *SubRegion = CI->getRegionMRT()->getLinearizedRegion();
// We should be limited to only store registers that are live out from the
- // lineaized region
+ // linearized region
for (auto MBBI : SubRegion->MBBs) {
storeMBBLiveOuts(MBBI, MRI, TRI, PHIInfo, TopRegion);
}
@@ -896,7 +894,7 @@ void LinearizedRegion::replaceRegister(unsigned Register,
assert(Register != NewRegister && "Cannot replace a reg with itself");
LLVM_DEBUG(
- dbgs() << "Pepareing to replace register (region): "
+ dbgs() << "Preparing to replace register (region): "
<< printReg(Register, MRI->getTargetRegisterInfo()) << " with "
<< printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n");
@@ -1073,7 +1071,6 @@ private:
const SIInstrInfo *TII;
const TargetRegisterInfo *TRI;
MachineRegisterInfo *MRI;
- unsigned BBSelectRegister;
PHILinearize PHIInfo;
DenseMap<MachineBasicBlock *, MachineBasicBlock *> FallthroughMap;
RegionMRT *RMRT;
@@ -1125,8 +1122,6 @@ private:
void transformSimpleIfRegion(RegionMRT *Region);
- void eliminateDeadBranchOperands(MachineBasicBlock::instr_iterator &II);
-
void insertUnconditionalBranch(MachineBasicBlock *MBB,
MachineBasicBlock *Dest,
const DebugLoc &DL = DebugLoc());
@@ -1238,11 +1233,7 @@ bool AMDGPUMachineCFGStructurizer::regionIsSimpleIf(RegionMRT *Region) {
return false;
}
- for (MachineBasicBlock::const_succ_iterator SI = Entry->succ_begin(),
- E = Entry->succ_end();
- SI != E; ++SI) {
- MachineBasicBlock *Current = *SI;
-
+ for (MachineBasicBlock *Current : Entry->successors()) {
if (Current == Succ) {
FoundBypass = true;
} else if ((Current->succ_size() == 1) &&
@@ -1280,10 +1271,7 @@ static void fixRegionTerminator(RegionMRT *Region) {
auto Exit = LRegion->getExit();
SmallPtrSet<MachineBasicBlock *, 2> Successors;
- for (MachineBasicBlock::const_succ_iterator SI = Exit->succ_begin(),
- SE = Exit->succ_end();
- SI != SE; ++SI) {
- MachineBasicBlock *Succ = *SI;
+ for (MachineBasicBlock *Succ : Exit->successors()) {
if (LRegion->contains(Succ)) {
// Do not allow re-assign
assert(InternalSucc == nullptr);
@@ -1404,7 +1392,7 @@ void AMDGPUMachineCFGStructurizer::extractKilledPHIs(MachineBasicBlock *MBB) {
MachineInstr &Instr = *I;
if (Instr.isPHI()) {
unsigned PHIDestReg = getPHIDestReg(Instr);
- LLVM_DEBUG(dbgs() << "Extractking killed phi:\n");
+ LLVM_DEBUG(dbgs() << "Extracting killed phi:\n");
LLVM_DEBUG(Instr.dump());
PHIs.insert(&Instr);
PHIInfo.addDest(PHIDestReg, Instr.getDebugLoc());
@@ -1589,11 +1577,9 @@ void AMDGPUMachineCFGStructurizer::replaceLiveOutRegs(
// Check if register is live out of the basic block
MachineBasicBlock *DefMBB = getDefInstr(Reg)->getParent();
- for (auto UI = MRI->use_begin(Reg), E = MRI->use_end(); UI != E; ++UI) {
- if ((*UI).getParent()->getParent() != DefMBB) {
+ for (const MachineOperand &MO : MRI->use_operands(Reg))
+ if (MO.getParent()->getParent() != DefMBB)
IsDead = false;
- }
- }
LLVM_DEBUG(dbgs() << "Register " << printReg(Reg, TRI) << " is "
<< (IsDead ? "dead" : "alive")
@@ -1686,7 +1672,7 @@ void AMDGPUMachineCFGStructurizer::insertUnconditionalBranch(MachineBasicBlock *
static MachineBasicBlock *getSingleExitNode(MachineFunction &MF) {
MachineBasicBlock *result = nullptr;
for (auto &MFI : MF) {
- if (MFI.succ_size() == 0) {
+ if (MFI.succ_empty()) {
if (result == nullptr) {
result = &MFI;
} else {
@@ -1770,34 +1756,27 @@ static void removeExternalCFGSuccessors(MachineBasicBlock *MBB) {
static void removeExternalCFGEdges(MachineBasicBlock *StartMBB,
MachineBasicBlock *EndMBB) {
- // We have to check against the StartMBB successor becasuse a
+ // We have to check against the StartMBB successor because a
// structurized region with a loop will have the entry block split,
// and the backedge will go to the entry successor.
DenseSet<std::pair<MachineBasicBlock *, MachineBasicBlock *>> Succs;
unsigned SuccSize = StartMBB->succ_size();
if (SuccSize > 0) {
MachineBasicBlock *StartMBBSucc = *(StartMBB->succ_begin());
- for (MachineBasicBlock::succ_iterator PI = EndMBB->succ_begin(),
- E = EndMBB->succ_end();
- PI != E; ++PI) {
+ for (MachineBasicBlock *Succ : EndMBB->successors()) {
// Either we have a back-edge to the entry block, or a back-edge to the
// successor of the entry block since the block may be split.
- if ((*PI) != StartMBB &&
- !((*PI) == StartMBBSucc && StartMBB != EndMBB && SuccSize == 1)) {
+ if (Succ != StartMBB &&
+ !(Succ == StartMBBSucc && StartMBB != EndMBB && SuccSize == 1)) {
Succs.insert(
- std::pair<MachineBasicBlock *, MachineBasicBlock *>(EndMBB, *PI));
+ std::pair<MachineBasicBlock *, MachineBasicBlock *>(EndMBB, Succ));
}
}
}
- for (MachineBasicBlock::pred_iterator PI = StartMBB->pred_begin(),
- E = StartMBB->pred_end();
- PI != E; ++PI) {
- if ((*PI) != EndMBB) {
- Succs.insert(
- std::pair<MachineBasicBlock *, MachineBasicBlock *>(*PI, StartMBB));
- }
- }
+ for (MachineBasicBlock *Pred : StartMBB->predecessors())
+ if (Pred != EndMBB)
+ Succs.insert(std::make_pair(Pred, StartMBB));
for (auto SI : Succs) {
std::pair<MachineBasicBlock *, MachineBasicBlock *> Edge = SI;
@@ -1815,14 +1794,9 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock(
MachineBasicBlock *IfBB = MF->CreateMachineBasicBlock();
if (InheritPreds) {
- for (MachineBasicBlock::pred_iterator PI = CodeBBStart->pred_begin(),
- E = CodeBBStart->pred_end();
- PI != E; ++PI) {
- if ((*PI) != CodeBBEnd) {
- MachineBasicBlock *Pred = (*PI);
+ for (MachineBasicBlock *Pred : CodeBBStart->predecessors())
+ if (Pred != CodeBBEnd)
Pred->addSuccessor(IfBB);
- }
- }
}
removeExternalCFGEdges(CodeBBStart, CodeBBEnd);
@@ -1872,9 +1846,8 @@ void AMDGPUMachineCFGStructurizer::ensureCondIsNotKilled(
return;
Register CondReg = Cond[0].getReg();
- for (auto UI = MRI->use_begin(CondReg), E = MRI->use_end(); UI != E; ++UI) {
- (*UI).setIsKill(false);
- }
+ for (MachineOperand &MO : MRI->use_operands(CondReg))
+ MO.setIsKill(false);
}
void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *CodeBB,
@@ -2018,7 +1991,7 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
LLVM_DEBUG(dbgs() << "LiveOut: " << printReg(LI, TRI));
if (!containsDef(CodeBB, InnerRegion, LI) ||
(!IsSingleBB && (getDefInstr(LI)->getParent() == LRegion->getExit()))) {
- // If the register simly lives through the CodeBB, we don't have
+ // If the register simply lives through the CodeBB, we don't have
// to rewrite anything since the register is not defined in this
// part of the code.
LLVM_DEBUG(dbgs() << "- through");
@@ -2028,14 +2001,14 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
unsigned Reg = LI;
if (/*!PHIInfo.isSource(Reg) &&*/ Reg != InnerRegion->getBBSelectRegOut()) {
// If the register is live out, we do want to create a phi,
- // unless it is from the Exit block, becasuse in that case there
+ // unless it is from the Exit block, because in that case there
// is already a PHI, and no need to create a new one.
// If the register is just a live out def and not part of a phi
// chain, we need to create a PHI node to handle the if region,
// and replace all uses outside of the region with the new dest
// register, unless it is the outgoing BB select register. We have
- // already creaed phi nodes for these.
+ // already created phi nodes for these.
const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
Register PHIDestReg = MRI->createVirtualRegister(RegClass);
Register IfSourceReg = MRI->createVirtualRegister(RegClass);
@@ -2569,11 +2542,9 @@ static void removeOldExitPreds(RegionMRT *Region) {
static bool mbbHasBackEdge(MachineBasicBlock *MBB,
SmallPtrSet<MachineBasicBlock *, 8> &MBBs) {
- for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
- if (MBBs.contains(*SI)) {
+ for (MachineBasicBlock *Succ : MBB->successors())
+ if (MBBs.contains(Succ))
return true;
- }
- }
return false;
}
@@ -2591,11 +2562,9 @@ static bool containsNewBackedge(MRT *Tree,
}
} else {
RegionMRT *Region = Tree->getRegionMRT();
- SetVector<MRT *> *Children = Region->getChildren();
- for (auto CI = Children->rbegin(), CE = Children->rend(); CI != CE; ++CI) {
- if (containsNewBackedge(*CI, MBBs))
+ for (MRT *C : llvm::reverse(*Region->getChildren()))
+ if (containsNewBackedge(C, MBBs))
return true;
- }
}
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
index d27eb68ca74b..5a5a5d213a1a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -13,7 +13,7 @@
//
// In LLVM CodeGen the runtime-handle metadata will be translated to
// RuntimeHandle metadata in code object. Runtime allocates a global buffer
-// for each kernel with RuntimeHandel metadata and saves the kernel address
+// for each kernel with RuntimeHandle metadata and saves the kernel address
// required for the AQL packet into the buffer. __enqueue_kernel function
// in device library knows that the invoke function pointer in the block
// literal is actually runtime handle and loads the kernel address from it
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 728be811afae..fc984d2dda64 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUCombinerHelper.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -34,10 +35,11 @@ protected:
MachineIRBuilder &B;
MachineFunction &MF;
MachineRegisterInfo &MRI;
- CombinerHelper &Helper;
+ AMDGPUCombinerHelper &Helper;
public:
- AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
+ AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B,
+ AMDGPUCombinerHelper &Helper)
: B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
struct FMinFMaxLegacyInfo {
@@ -257,12 +259,12 @@ bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize(
class AMDGPUPostLegalizerCombinerHelperState {
protected:
- CombinerHelper &Helper;
+ AMDGPUCombinerHelper &Helper;
AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;
public:
AMDGPUPostLegalizerCombinerHelperState(
- CombinerHelper &Helper,
+ AMDGPUCombinerHelper &Helper,
AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
: Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};
@@ -300,7 +302,7 @@ public:
bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
MachineInstr &MI,
MachineIRBuilder &B) const {
- CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
+ AMDGPUCombinerHelper Helper(Observer, B, KB, MDT, LInfo);
AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
PostLegalizerHelper);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 13f09ab8f164..c029046ab65f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUCombinerHelper.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -34,10 +35,11 @@ protected:
MachineIRBuilder &B;
MachineFunction &MF;
MachineRegisterInfo &MRI;
- CombinerHelper &Helper;
+ AMDGPUCombinerHelper &Helper;
public:
- AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
+ AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B,
+ AMDGPUCombinerHelper &Helper)
: B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
struct ClampI64ToI16MatchInfo {
@@ -154,12 +156,12 @@ void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
class AMDGPUPreLegalizerCombinerHelperState {
protected:
- CombinerHelper &Helper;
+ AMDGPUCombinerHelper &Helper;
AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;
public:
AMDGPUPreLegalizerCombinerHelperState(
- CombinerHelper &Helper,
+ AMDGPUCombinerHelper &Helper,
AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
: Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
};
@@ -196,17 +198,15 @@ public:
bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
MachineInstr &MI,
MachineIRBuilder &B) const {
- CombinerHelper Helper(Observer, B, KB, MDT);
+ AMDGPUCombinerHelper Helper(Observer, B, KB, MDT);
AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
PreLegalizerHelper);
- if (Generated.tryCombineAll(Observer, MI, B, Helper))
+ if (Generated.tryCombineAll(Observer, MI, B))
return true;
switch (MI.getOpcode()) {
- case TargetOpcode::G_MEMCPY_INLINE:
- return Helper.tryEmitMemcpyInline(MI);
case TargetOpcode::G_CONCAT_VECTORS:
return Helper.tryCombineConcatVectors(MI);
case TargetOpcode::G_SHUFFLE_VECTOR:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index 7b6959b56145..d560d2043f42 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -149,11 +149,11 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
IRBuilder<> Builder(Ctx);
Type *I32Ty = Type::getInt32Ty(Ctx);
unsigned UniqID = 0;
- // NB: This is important for this string size to be divizable by 4
+ // NB: It is important for this string size to be divisible by 4
const char NonLiteralStr[4] = "???";
for (auto CI : Printfs) {
- unsigned NumOps = CI->getNumArgOperands();
+ unsigned NumOps = CI->arg_size();
SmallString<16> OpConvSpecifiers;
Value *Op = CI->getArgOperand(0);
@@ -201,10 +201,10 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
std::string AStreamHolder;
raw_string_ostream Sizes(AStreamHolder);
int Sum = DWORD_ALIGN;
- Sizes << CI->getNumArgOperands() - 1;
+ Sizes << CI->arg_size() - 1;
Sizes << ':';
- for (unsigned ArgCount = 1; ArgCount < CI->getNumArgOperands() &&
- ArgCount <= OpConvSpecifiers.size();
+ for (unsigned ArgCount = 1;
+ ArgCount < CI->arg_size() && ArgCount <= OpConvSpecifiers.size();
ArgCount++) {
Value *Arg = CI->getArgOperand(ArgCount);
Type *ArgType = Arg->getType();
@@ -330,7 +330,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
M.getOrInsertFunction(StringRef("__printf_alloc"), FTy_alloc, Attr);
LLVM_DEBUG(dbgs() << "Printf metadata = " << Sizes.str() << '\n');
- std::string fmtstr = itostr(++UniqID) + ":" + Sizes.str().c_str();
+ std::string fmtstr = itostr(++UniqID) + ":" + Sizes.str();
MDString *fmtStrArray = MDString::get(Ctx, fmtstr);
// Instead of creating global variables, the
@@ -389,8 +389,8 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
Type *Int32Ty = Type::getInt32Ty(Ctx);
Type *Int64Ty = Type::getInt64Ty(Ctx);
- for (unsigned ArgCount = 1; ArgCount < CI->getNumArgOperands() &&
- ArgCount <= OpConvSpecifiers.size();
+ for (unsigned ArgCount = 1;
+ ArgCount < CI->arg_size() && ArgCount <= OpConvSpecifiers.size();
ArgCount++) {
Value *Arg = CI->getArgOperand(ArgCount);
Type *ArgType = Arg->getType();
@@ -524,7 +524,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
LLVM_DEBUG(dbgs() << "inserting store to printf buffer:\n"
<< *StBuff << '\n');
(void)StBuff;
- if (I + 1 == E && ArgCount + 1 == CI->getNumArgOperands())
+ if (I + 1 == E && ArgCount + 1 == CI->arg_size())
break;
BufferIdx = GetElementPtrInst::Create(I8Ty, BufferIdx, BuffOffset,
"PrintBuffNextPtr", Brnch);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 3f1f21a33f7e..3ec5dd7e0eff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -21,6 +21,7 @@
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
#define DEBUG_TYPE "amdgpu-promote-alloca"
@@ -176,6 +177,10 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F) {
if (IsAMDGCN) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+ // A non-entry function has only 32 caller-preserved registers.
+ // Do not promote an alloca if doing so would force spilling.
+ if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+ MaxVGPRs = std::min(MaxVGPRs, 32u);
} else {
MaxVGPRs = 128;
}
@@ -200,7 +205,7 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F) {
std::pair<Value *, Value *>
AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
- const Function &F = *Builder.GetInsertBlock()->getParent();
+ Function &F = *Builder.GetInsertBlock()->getParent();
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
if (!IsAMDHSA) {
@@ -256,11 +261,12 @@ AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
= Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
- DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
- DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ DispatchPtr->addRetAttr(Attribute::NoAlias);
+ DispatchPtr->addRetAttr(Attribute::NonNull);
+ F.removeFnAttr("amdgpu-no-dispatch-ptr");
// Size of the dispatch packet struct.
- DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64);
+ DispatchPtr->addDereferenceableRetAttr(64);
Type *I32Ty = Type::getInt32Ty(Mod->getContext());
Value *CastDispatchPtr = Builder.CreateBitCast(
@@ -268,7 +274,7 @@ AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
// We could do a single 64-bit load here, but it's likely that the basic
// 32-bit and extract sequence is already present, and it is probably easier
- // to CSE this. The loads should be mergable later anyway.
+ // to CSE this. The loads should be mergeable later anyway.
Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
@@ -288,23 +294,27 @@ AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
unsigned N) {
- const AMDGPUSubtarget &ST =
- AMDGPUSubtarget::get(TM, *Builder.GetInsertBlock()->getParent());
+ Function *F = Builder.GetInsertBlock()->getParent();
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
+ StringRef AttrName;
switch (N) {
case 0:
IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
: (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
+ AttrName = "amdgpu-no-workitem-id-x";
break;
case 1:
IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
: (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
+ AttrName = "amdgpu-no-workitem-id-y";
break;
case 2:
IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
: (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
+ AttrName = "amdgpu-no-workitem-id-z";
break;
default:
llvm_unreachable("invalid dimension");
@@ -313,6 +323,7 @@ Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
CallInst *CI = Builder.CreateCall(WorkitemIdFn);
ST.makeLIDRangeMetadata(CI);
+ F->removeFnAttr(AttrName);
return CI;
}
@@ -1065,9 +1076,9 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
MI->getRawSource(), MI->getSourceAlign(),
MI->getLength(), MI->isVolatile());
- for (unsigned I = 1; I != 3; ++I) {
- if (uint64_t Bytes = Intr->getDereferenceableBytes(I)) {
- B->addDereferenceableAttr(I, Bytes);
+ for (unsigned I = 0; I != 2; ++I) {
+ if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
+ B->addDereferenceableParamAttr(I, Bytes);
}
}
@@ -1101,6 +1112,10 @@ bool promoteAllocasToVector(Function &F, TargetMachine &TM) {
if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+ // A non-entry function has only 32 caller-preserved registers.
+ // Do not promote an alloca if doing so would force spilling.
+ if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+ MaxVGPRs = std::min(MaxVGPRs, 32u);
} else {
MaxVGPRs = 128;
}
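Both hunks in this file add the same budget rule, once for the scalar promotion path and once for promoteAllocasToVector. A standalone sketch of that rule (clampVGPRBudget and its parameters are illustrative names, not LLVM API; the 32-register cap comes from the comment in the patch):

#include <algorithm>

// Non-entry functions only have 32 caller-preserved VGPRs, so cap the
// promotion budget there to avoid forcing spills.
static unsigned clampVGPRBudget(unsigned SubtargetMaxVGPRs, bool IsEntryFunction) {
  unsigned MaxVGPRs = SubtargetMaxVGPRs;
  if (!IsEntryFunction)
    MaxVGPRs = std::min(MaxVGPRs, 32u);
  return MaxVGPRs;
}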
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
new file mode 100644
index 000000000000..01d03d17ec47
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
@@ -0,0 +1,195 @@
+//===-- AMDGPUPromoteKernelArguments.cpp ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass recursively promotes generic pointer arguments of a kernel
+/// into the global address space.
+///
+/// The pass walks the kernel's pointer arguments and the loads from them. If a
+/// loaded value is itself a pointer and is not modified in the kernel before
+/// the load, the loaded pointer is promoted to the global address space, and
+/// the process continues recursively from it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
+
+#define DEBUG_TYPE "amdgpu-promote-kernel-arguments"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUPromoteKernelArguments : public FunctionPass {
+ MemorySSA *MSSA;
+
+ Instruction *ArgCastInsertPt;
+
+ SmallVector<Value *> Ptrs;
+
+ void enqueueUsers(Value *Ptr);
+
+ bool promotePointer(Value *Ptr);
+
+public:
+ static char ID;
+
+ AMDGPUPromoteKernelArguments() : FunctionPass(ID) {}
+
+ bool run(Function &F, MemorySSA &MSSA);
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.setPreservesAll();
+ }
+};
+
+} // end anonymous namespace
+
+void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) {
+ SmallVector<User *> PtrUsers(Ptr->users());
+
+ while (!PtrUsers.empty()) {
+ Instruction *U = dyn_cast<Instruction>(PtrUsers.pop_back_val());
+ if (!U)
+ continue;
+
+ switch (U->getOpcode()) {
+ default:
+ break;
+ case Instruction::Load: {
+ LoadInst *LD = cast<LoadInst>(U);
+ PointerType *PT = dyn_cast<PointerType>(LD->getType());
+ if (!PT ||
+ (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS &&
+ PT->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS &&
+ PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) ||
+ LD->getPointerOperand()->stripInBoundsOffsets() != Ptr)
+ break;
+ const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(LD);
+ // TODO: This load can probably be promoted to the constant address space.
+ if (MSSA->isLiveOnEntryDef(MA))
+ Ptrs.push_back(LD);
+ break;
+ }
+ case Instruction::GetElementPtr:
+ case Instruction::AddrSpaceCast:
+ case Instruction::BitCast:
+ if (U->getOperand(0)->stripInBoundsOffsets() == Ptr)
+ PtrUsers.append(U->user_begin(), U->user_end());
+ break;
+ }
+ }
+}
+
+bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) {
+ enqueueUsers(Ptr);
+
+ PointerType *PT = cast<PointerType>(Ptr->getType());
+ if (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+ return false;
+
+ bool IsArg = isa<Argument>(Ptr);
+ IRBuilder<> B(IsArg ? ArgCastInsertPt
+ : &*std::next(cast<Instruction>(Ptr)->getIterator()));
+
+ // Cast the pointer to the global address space and back to flat, and let the
+ // Infer Address Spaces pass do all the necessary rewriting.
+ PointerType *NewPT =
+ PointerType::getWithSamePointeeType(PT, AMDGPUAS::GLOBAL_ADDRESS);
+ Value *Cast =
+ B.CreateAddrSpaceCast(Ptr, NewPT, Twine(Ptr->getName(), ".global"));
+ Value *CastBack =
+ B.CreateAddrSpaceCast(Cast, PT, Twine(Ptr->getName(), ".flat"));
+ Ptr->replaceUsesWithIf(CastBack,
+ [Cast](Use &U) { return U.getUser() != Cast; });
+
+ return true;
+}
+
+// Skip over any leading static allocas when choosing the insertion point.
+static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
+ BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
+ for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
+ AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);
+
+ // If this is a dynamic alloca, the value may depend on the loaded kernargs,
+ // so loads will need to be inserted before it.
+ if (!AI || !AI->isStaticAlloca())
+ break;
+ }
+
+ return InsPt;
+}
+
+bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) {
+ if (skipFunction(F))
+ return false;
+
+ CallingConv::ID CC = F.getCallingConv();
+ if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
+ return false;
+
+ ArgCastInsertPt = &*getInsertPt(*F.begin());
+ this->MSSA = &MSSA;
+
+ for (Argument &Arg : F.args()) {
+ if (Arg.use_empty())
+ continue;
+
+ PointerType *PT = dyn_cast<PointerType>(Arg.getType());
+ if (!PT || (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS &&
+ PT->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS &&
+ PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS))
+ continue;
+
+ Ptrs.push_back(&Arg);
+ }
+
+ bool Changed = false;
+ while (!Ptrs.empty()) {
+ Value *Ptr = Ptrs.pop_back_val();
+ Changed |= promotePointer(Ptr);
+ }
+
+ return Changed;
+}
+
+bool AMDGPUPromoteKernelArguments::runOnFunction(Function &F) {
+ MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ return run(F, MSSA);
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
+ "AMDGPU Promote Kernel Arguments", false, false)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
+ "AMDGPU Promote Kernel Arguments", false, false)
+
+char AMDGPUPromoteKernelArguments::ID = 0;
+
+FunctionPass *llvm::createAMDGPUPromoteKernelArgumentsPass() {
+ return new AMDGPUPromoteKernelArguments();
+}
+
+PreservedAnalyses
+AMDGPUPromoteKernelArgumentsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ if (AMDGPUPromoteKernelArguments().run(F, MSSA)) {
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+ }
+ return PreservedAnalyses::all();
+}
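The core rewrite in promotePointer above is a flat-to-global-to-flat round trip; the condensed sketch below isolates just that step to show why the new cast is excluded from the use rewrite (B and Ptr are assumed to be the IRBuilder and the flat pointer from the pass; everything else matches the code above):

// Cast to global and immediately back to flat. InferAddressSpaces can later
// fold the round trip and rewrite users to the global address space.
PointerType *FlatPT = cast<PointerType>(Ptr->getType());
PointerType *GlobalPT =
    PointerType::getWithSamePointeeType(FlatPT, AMDGPUAS::GLOBAL_ADDRESS);
Value *Cast = B.CreateAddrSpaceCast(Ptr, GlobalPT, Twine(Ptr->getName(), ".global"));
Value *CastBack = B.CreateAddrSpaceCast(Cast, FlatPT, Twine(Ptr->getName(), ".flat"));
// Every existing use of Ptr is redirected to CastBack except the fresh cast
// itself, which must keep reading the original pointer to avoid a cycle.
Ptr->replaceUsesWithIf(CastBack, [Cast](Use &U) { return U.getUser() != Cast; });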
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
index 0e4c26170a8f..dafbeaeaec52 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
@@ -9,7 +9,7 @@
/// \file
/// \brief This pass propagates attributes from kernels to the non-entry
/// functions. Most of the library functions were not compiled for specific ABI,
-/// yet will be correctly compiled if proper attrbutes are propagated from the
+/// yet will be correctly compiled if proper attributes are propagated from the
/// caller.
///
/// The pass analyzes call graph and propagates ABI target features through the
@@ -17,7 +17,7 @@
///
/// It can run in two modes: as a function or module pass. A function pass
/// simply propagates attributes. A module pass clones functions if there are
-/// callers with different ABI. If a function is clonned all call sites will
+/// callers with different ABI. If a function is cloned all call sites will
/// be updated to use a correct clone.
///
/// A function pass is limited in functionality but can run early in the
@@ -55,10 +55,7 @@ static constexpr const FeatureBitset TargetFeatures = {
// Attributes to propagate.
// TODO: Support conservative min/max merging instead of cloning.
-static constexpr const char* AttributeNames[] = {
- "amdgpu-waves-per-eu",
- "amdgpu-flat-work-group-size"
-};
+static constexpr const char *AttributeNames[] = {"amdgpu-waves-per-eu"};
static constexpr unsigned NumAttr =
sizeof(AttributeNames) / sizeof(AttributeNames[0]);
@@ -149,7 +146,7 @@ public:
bool process(Module &M);
};
-// Allows to propagate attributes early, but no clonning is allowed as it must
+// Allows to propagate attributes early, but no cloning is allowed as it must
// be a function pass to run before any optimizations.
// TODO: We shall only need a one instance of module pass, but that needs to be
// in the linker pipeline which is currently not possible.
@@ -168,7 +165,7 @@ public:
bool runOnFunction(Function &F) override;
};
-// Allows to propagate attributes with clonning but does that late in the
+// Allows to propagate attributes with cloning but does that late in the
// pipeline.
class AMDGPUPropagateAttributesLate : public ModulePass {
const TargetMachine *TM;
@@ -212,10 +209,10 @@ AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded,
bool AMDGPUPropagateAttributes::process(Module &M) {
for (auto &F : M.functions())
- if (AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+ if (AMDGPU::isKernel(F.getCallingConv()))
Roots.insert(&F);
- return process();
+ return Roots.empty() ? false : process();
}
bool AMDGPUPropagateAttributes::process(Function &F) {
@@ -228,8 +225,7 @@ bool AMDGPUPropagateAttributes::process() {
SmallSet<Function *, 32> NewRoots;
SmallSet<Function *, 32> Replaced;
- if (Roots.empty())
- return false;
+ assert(!Roots.empty());
Module &M = *(*Roots.begin())->getParent();
do {
@@ -273,7 +269,7 @@ bool AMDGPUPropagateAttributes::process() {
if (!NewF) {
const FnProperties NewProps = CalleeProps.adjustToCaller(CallerProps);
if (!AllowClone) {
- // This may set different features on different iteartions if
+ // This may set different features on different iterations if
// there is a contradiction in callers' attributes. In this case
// we rely on a second pass running on Module, which is allowed
// to clone.
@@ -383,7 +379,7 @@ bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) {
TM = &TPC->getTM<TargetMachine>();
}
- if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+ if (!AMDGPU::isKernel(F.getCallingConv()))
return false;
return AMDGPUPropagateAttributes(TM, false).process(F);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 4e12e5cd8f65..d7dc9ee4117b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -57,9 +57,9 @@ public:
MinMaxMedOpc getMinMaxPair(unsigned Opc);
- template <class m_Cst>
+ template <class m_Cst, typename CstTy>
bool matchMed(MachineInstr &MI, MachineRegisterInfo &MRI, MinMaxMedOpc MMMOpc,
- Register &Val, Register &K0, Register &K1);
+ Register &Val, CstTy &K0, CstTy &K1);
bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
@@ -83,11 +83,11 @@ AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) {
}
}
-template <class m_Cst>
+template <class m_Cst, typename CstTy>
bool AMDGPURegBankCombinerHelper::matchMed(MachineInstr &MI,
MachineRegisterInfo &MRI,
MinMaxMedOpc MMMOpc, Register &Val,
- Register &K0, Register &K1) {
+ CstTy &K0, CstTy &K1) {
// 4 operand commutes of: min(max(Val, K0), K1).
// Find K1 from outer instr: min(max(...), K1) or min(K1, max(...)).
// Find K0 and Val from inner instr: max(K0, Val) or max(Val, K0).
@@ -115,19 +115,18 @@ bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3(
return false;
MinMaxMedOpc OpcodeTriple = getMinMaxPair(MI.getOpcode());
- Register Val, K0, K1;
+ Register Val;
+ Optional<ValueAndVReg> K0, K1;
// Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1.
- if (!matchMed<ICstRegMatch>(MI, MRI, OpcodeTriple, Val, K0, K1))
+ if (!matchMed<GCstAndRegMatch>(MI, MRI, OpcodeTriple, Val, K0, K1))
return false;
- const APInt &K0_Imm = getConstantIntVRegVal(K0, MRI)->getValue();
- const APInt &K1_Imm = getConstantIntVRegVal(K1, MRI)->getValue();
- if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_SMED3 && K0_Imm.sgt(K1_Imm))
+ if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_SMED3 && K0->Value.sgt(K1->Value))
return false;
- if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_UMED3 && K0_Imm.ugt(K1_Imm))
+ if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_UMED3 && K0->Value.ugt(K1->Value))
return false;
- MatchInfo = {OpcodeTriple.Med, Val, K0, K1};
+ MatchInfo = {OpcodeTriple.Med, Val, K0->VReg, K1->VReg};
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 0e4005627e02..ab3ce980c3f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -58,7 +58,7 @@
///
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
-/// VCC), inserting copies from any SGPR operands. This the most trival legal
+/// VCC), inserting copies from any SGPR operands. This is the most trivial legal
/// mapping. Anything beyond the simplest 1:1 instruction selection would be too
/// complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
@@ -118,7 +118,7 @@ public:
Opc == AMDGPU::G_SEXT) {
// LegalizerHelper wants to use the basic legalization artifacts when
// widening etc. We don't handle selection with vcc in artifact sources,
- // so we need to use a sslect instead to handle these properly.
+ // so we need to use a select instead to handle these properly.
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
@@ -282,7 +282,7 @@ AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
// VCC-like use.
if (TRI->isSGPRClass(&RC)) {
// FIXME: This probably came from a copy from a physical register, which
- // should be inferrrable from the copied to-type. We don't have many boolean
+ // should be inferable from the copied to-type. We don't have many boolean
// physical register constraints so just assume a normal SGPR for now.
if (!Ty.isValid())
return AMDGPU::SGPRRegBank;
@@ -734,23 +734,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif
- for (MachineInstr &MI : Range) {
- for (MachineOperand &Def : MI.defs()) {
- if (MRI.use_nodbg_empty(Def.getReg()))
- continue;
-
- LLT ResTy = MRI.getType(Def.getReg());
- const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
- ResultRegs.push_back(Def.getReg());
- Register InitReg = B.buildUndef(ResTy).getReg(0);
- Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
- InitResultRegs.push_back(InitReg);
- PhiRegs.push_back(PhiReg);
- MRI.setRegBank(PhiReg, *DefBank);
- MRI.setRegBank(InitReg, *DefBank);
- }
- }
-
Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
@@ -894,23 +877,26 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
bool Is64 = OpSize % 64 == 0;
- LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
- unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
- : AMDGPU::V_CMP_EQ_U32_e64;
-
- // The compares can be done as 64-bit, but the extract needs to be done
- // in 32-bit pieces.
+ unsigned UnmergeTySize = Is64 ? 64 : 32;
+ unsigned CmpOp =
+ Is64 ? AMDGPU::V_CMP_EQ_U64_e64 : AMDGPU::V_CMP_EQ_U32_e64;
// Insert the unmerge before the loop.
B.setMBB(MBB);
- auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
+ unsigned NumPieces = OpSize / UnmergeTySize;
+ SmallVector<Register, 8> UnmergePieces;
+ if (NumPieces == 1) {
+ UnmergePieces.push_back(OpReg);
+ } else {
+ LLT UnmergeTy = LLT::scalar(UnmergeTySize);
+ MachineInstrBuilder Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
+ for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx)
+ UnmergePieces.push_back(Unmerge.getReg(PieceIdx));
+ }
B.setInstr(*I);
- unsigned NumPieces = Unmerge->getNumOperands() - 1;
- for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
- Register UnmergePiece = Unmerge.getReg(PieceIdx);
-
+ for (Register UnmergePiece : UnmergePieces) {
Register CurrentLaneOpReg;
if (Is64) {
Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
@@ -985,12 +971,14 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
if (OpTy.isVector()) {
auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
Op.setReg(Merge.getReg(0));
- } else {
+ MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
+ } else if (ReadlanePieces.size() > 1) {
auto Merge = B.buildMerge(OpTy, ReadlanePieces);
Op.setReg(Merge.getReg(0));
+ MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
+ } else {
+ Op.setReg(ReadlanePieces[0]);
}
-
- MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
}
// Make sure we don't re-process this register again.
@@ -998,8 +986,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
}
}
- B.setInsertPt(*LoopBB, LoopBB->end());
-
// Update EXEC, save the original EXEC value to VCC.
B.buildInstr(AndSaveExecOpc)
.addDef(NewExec)
@@ -1007,6 +993,8 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
MRI.setSimpleHint(NewExec, CondReg);
+ B.setInsertPt(*LoopBB, LoopBB->end());
+
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
B.buildInstr(XorTermOpc)
.addDef(ExecReg)
@@ -1017,8 +1005,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
// s_cbranch_scc0?
// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
- B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
- .addMBB(LoopBB);
+ B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
// Save the EXEC mask before the loop.
BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
@@ -1336,7 +1323,7 @@ static unsigned setBufferOffsets(MachineIRBuilder &B,
const LLT S32 = LLT::scalar(32);
MachineRegisterInfo *MRI = B.getMRI();
- if (Optional<int64_t> Imm = getConstantVRegSExtVal(CombinedOffset, *MRI)) {
+ if (Optional<int64_t> Imm = getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
uint32_t SOffset, ImmOffset;
if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
Alignment)) {
@@ -1430,7 +1417,7 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
OffsetBank == &AMDGPU::SGPRRegBank)
return true; // Legal mapping
- // FIXME: 96-bit case was widened during legalize. We neeed to narrow it back
+ // FIXME: 96-bit case was widened during legalize. We need to narrow it back
// here but don't have an MMO.
unsigned LoadSize = Ty.getSizeInBits();
@@ -1455,7 +1442,7 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
VOffset, SOffset, ImmOffset, Alignment);
// TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
- // can, but we neeed to track an MMO for that.
+ // can, but we need to track an MMO for that.
const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
const Align MemAlign(4); // FIXME: ABI type alignment?
MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
@@ -1569,7 +1556,7 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
// A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
// if the width is a constant.
- if (auto ConstWidth = getConstantVRegValWithLookThrough(WidthReg, MRI)) {
+ if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
// Use the 32-bit bitfield extract instruction if the width is a constant.
// Depending on the width size, use either the low or high 32-bits.
auto Zero = B.buildConstant(S32, 0);
@@ -1775,97 +1762,6 @@ AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
return {BaseReg, C1};
}
-static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
- int64_t C;
- return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
-}
-
-static unsigned extractCPol(unsigned CachePolicy) {
- return CachePolicy & AMDGPU::CPol::ALL;
-}
-
-static unsigned extractSWZ(unsigned CachePolicy) {
- return (CachePolicy >> 3) & 1;
-}
-
-
-MachineInstr *
-AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
- MachineInstr &MI) const {
- MachineRegisterInfo &MRI = *B.getMRI();
- executeInWaterfallLoop(B, MI, MRI, {2, 4});
-
- // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
-
- Register VData = MI.getOperand(1).getReg();
- LLT Ty = MRI.getType(VData);
-
- int EltSize = Ty.getScalarSizeInBits();
- int Size = Ty.getSizeInBits();
-
- // FIXME: Broken integer truncstore.
- if (EltSize != 32)
- report_fatal_error("unhandled intrinsic store");
-
- // FIXME: Verifier should enforce 1 MMO for these intrinsics.
- const int MemSize = (*MI.memoperands_begin())->getSize();
-
-
- Register RSrc = MI.getOperand(2).getReg();
- Register VOffset = MI.getOperand(3).getReg();
- Register SOffset = MI.getOperand(4).getReg();
- unsigned CachePolicy = MI.getOperand(5).getImm();
-
- unsigned ImmOffset;
- std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
-
- const bool Offen = !isZero(VOffset, MRI);
-
- unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
- switch (8 * MemSize) {
- case 8:
- Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
- AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
- break;
- case 16:
- Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
- AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
- break;
- default:
- Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
- AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
- if (Size > 32)
- Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
- break;
- }
-
-
- // Set the insertion point back to the instruction in case it was moved into a
- // loop.
- B.setInstr(MI);
-
- MachineInstrBuilder MIB = B.buildInstr(Opc)
- .addUse(VData);
-
- if (Offen)
- MIB.addUse(VOffset);
-
- MIB.addUse(RSrc)
- .addUse(SOffset)
- .addImm(ImmOffset)
- .addImm(extractCPol(CachePolicy))
- .addImm(0) // tfe: FIXME: Remove from inst
- .addImm(extractSWZ(CachePolicy))
- .cloneMemRefs(MI);
-
- // FIXME: We need a way to report failure from applyMappingImpl.
- // Insert constrain copies before inserting the loop.
- if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
- report_fatal_error("failed to constrain selected store intrinsic");
-
- return MIB;
-}
-
bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
Register SrcReg) const {
MachineRegisterInfo &MRI = *B.getMRI();
@@ -2153,7 +2049,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// The standard handling only considers the result register bank for
// phis. For VCC, blindly inserting a copy when the phi is lowered will
// produce an invalid copy. We can only copy with some kind of compare to
- // get a vector boolean result. Insert a regitser bank copy that will be
+ // get a vector boolean result. Insert a register bank copy that will be
// correctly lowered to a compare.
MachineIRBuilder B(*MI.getParent()->getParent());
@@ -2491,9 +2387,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
case AMDGPU::G_CTPOP:
- case AMDGPU::G_BITREVERSE:
- case AMDGPU::G_CTLZ_ZERO_UNDEF:
- case AMDGPU::G_CTTZ_ZERO_UNDEF: {
+ case AMDGPU::G_BITREVERSE: {
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank == &AMDGPU::SGPRRegBank)
@@ -2515,6 +2409,48 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
llvm_unreachable("narrowScalar should have succeeded");
return;
}
+ case AMDGPU::G_AMDGPU_FFBH_U32:
+ case AMDGPU::G_AMDGPU_FFBL_B32:
+ case AMDGPU::G_CTLZ_ZERO_UNDEF:
+ case AMDGPU::G_CTTZ_ZERO_UNDEF: {
+ const RegisterBank *DstBank =
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ if (DstBank == &AMDGPU::SGPRRegBank)
+ break;
+
+ Register SrcReg = MI.getOperand(1).getReg();
+ const LLT S32 = LLT::scalar(32);
+ LLT Ty = MRI.getType(SrcReg);
+ if (Ty == S32)
+ break;
+
+ // We can narrow this more efficiently than Helper can by using ffbh/ffbl
+ // which return -1 when the input is zero:
+ // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
+ // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
+ // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
+ // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
+ ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
+ MachineIRBuilder B(MI, ApplyVALU);
+ SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
+ unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
+ ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
+ : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
+ ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
+ : Opc;
+ unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
+ auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
+ auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
+ unsigned AddOpc =
+ Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
+ ? AMDGPU::G_ADD
+ : AMDGPU::G_UADDSAT;
+ Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
+ Register DstReg = MI.getOperand(0).getReg();
+ B.buildUMin(DstReg, X, Y);
+ MI.eraseFromParent();
+ return;
+ }
case AMDGPU::G_SEXT:
case AMDGPU::G_ZEXT:
case AMDGPU::G_ANYEXT: {
@@ -3034,6 +2970,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
}
case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
unsigned N = MI.getNumExplicitOperands() - 2;
+ applyDefaultMapping(OpdMapper);
executeInWaterfallLoop(MI, MRI, { N });
return;
}
@@ -3095,6 +3032,101 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
}
break;
}
+ case AMDGPU::G_SI_CALL: {
+ // Use a set to avoid extra readfirstlanes in the case where multiple
+ // operands are the same register.
+ SmallSet<Register, 4> SGPROperandRegs;
+
+ if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
+ break;
+
+ // Move all copies to physical SGPRs that are used by the call instruction
+ // into the loop block. Search backwards from the call for these copies,
+ // stopping at the ADJCALLSTACKUP.
+ unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
+ unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
+
+ // Move all non-copies before the copies, so that a complete range can be
+ // moved into the waterfall loop.
+ SmallVector<MachineInstr *, 4> NonCopyInstrs;
+ // Count of NonCopyInstrs found until the current LastCopy.
+ unsigned NonCopyInstrsLen = 0;
+ MachineBasicBlock::iterator Start(&MI);
+ MachineBasicBlock::iterator LastCopy = Start;
+ MachineBasicBlock *MBB = MI.getParent();
+ const SIMachineFunctionInfo *Info =
+ MBB->getParent()->getInfo<SIMachineFunctionInfo>();
+ while (Start->getOpcode() != FrameSetupOpcode) {
+ --Start;
+ bool IsCopy = false;
+ if (Start->getOpcode() == AMDGPU::COPY) {
+ auto &Dst = Start->getOperand(0);
+ if (Dst.isReg()) {
+ Register Reg = Dst.getReg();
+ if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
+ IsCopy = true;
+ } else {
+ // Also move the copy from the scratch rsrc descriptor into the loop
+ // to allow it to be optimized away.
+ auto &Src = Start->getOperand(1);
+ if (Src.isReg()) {
+ Reg = Src.getReg();
+ IsCopy = Info->getScratchRSrcReg() == Reg;
+ }
+ }
+ }
+ }
+
+ if (IsCopy) {
+ LastCopy = Start;
+ NonCopyInstrsLen = NonCopyInstrs.size();
+ } else {
+ NonCopyInstrs.push_back(&*Start);
+ }
+ }
+ NonCopyInstrs.resize(NonCopyInstrsLen);
+
+ for (auto *NonCopy : reverse(NonCopyInstrs)) {
+ MBB->splice(LastCopy, MBB, NonCopy->getIterator());
+ }
+ Start = LastCopy;
+
+ // Do the same for copies after the loop
+ NonCopyInstrs.clear();
+ NonCopyInstrsLen = 0;
+ MachineBasicBlock::iterator End(&MI);
+ LastCopy = End;
+ while (End->getOpcode() != FrameDestroyOpcode) {
+ ++End;
+ bool IsCopy = false;
+ if (End->getOpcode() == AMDGPU::COPY) {
+ auto &Src = End->getOperand(1);
+ if (Src.isReg()) {
+ Register Reg = Src.getReg();
+ IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
+ }
+ }
+
+ if (IsCopy) {
+ LastCopy = End;
+ NonCopyInstrsLen = NonCopyInstrs.size();
+ } else {
+ NonCopyInstrs.push_back(&*End);
+ }
+ }
+ NonCopyInstrs.resize(NonCopyInstrsLen);
+
+ End = LastCopy;
+ ++LastCopy;
+ for (auto *NonCopy : reverse(NonCopyInstrs)) {
+ MBB->splice(LastCopy, MBB, NonCopy->getIterator());
+ }
+
+ ++End;
+ MachineIRBuilder B(*Start);
+ executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI);
+ break;
+ }
case AMDGPU::G_LOAD:
case AMDGPU::G_ZEXTLOAD:
case AMDGPU::G_SEXTLOAD: {
@@ -3290,7 +3322,7 @@ AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
}
-/// Return the mapping for a pointer arugment.
+/// Return the mapping for a pointer argument.
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
Register PtrReg) const {
@@ -3620,7 +3652,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_INTRINSIC_TRUNC:
case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
case AMDGPU::G_FSHR: // TODO: Expand for scalar
- case AMDGPU::G_AMDGPU_FFBH_U32:
case AMDGPU::G_AMDGPU_FMIN_LEGACY:
case AMDGPU::G_AMDGPU_FMAX_LEGACY:
case AMDGPU::G_AMDGPU_RCP_IFLAG:
@@ -3726,8 +3757,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
break;
}
+ case AMDGPU::G_AMDGPU_FFBH_U32:
+ case AMDGPU::G_AMDGPU_FFBL_B32:
case AMDGPU::G_CTLZ_ZERO_UNDEF:
- case AMDGPU::G_CTTZ_ZERO_UNDEF:
+ case AMDGPU::G_CTTZ_ZERO_UNDEF: {
+ unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
+ OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
+ break;
+ }
case AMDGPU::G_CTPOP: {
unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
@@ -4033,6 +4072,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mbcnt_hi:
case Intrinsic::amdgcn_mul_u24:
case Intrinsic::amdgcn_mul_i24:
+ case Intrinsic::amdgcn_mulhi_u24:
+ case Intrinsic::amdgcn_mulhi_i24:
case Intrinsic::amdgcn_lerp:
case Intrinsic::amdgcn_sad_u8:
case Intrinsic::amdgcn_msad_u8:
@@ -4254,8 +4295,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned N = MI.getNumExplicitOperands() - 2;
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
- for (unsigned I = 2; I < N; ++I)
- OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ if (N == 3) {
+ // Sequential form: all operands combined into VGPR256/VGPR512
+ unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ if (Size > 256)
+ Size = 512;
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ } else {
+ // NSA form
+ for (unsigned I = 2; I < N; ++I)
+ OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ }
break;
}
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
@@ -4447,6 +4497,23 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
+ case AMDGPU::G_SI_CALL: {
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
+ // Lie and claim everything is legal, even though some need to be
+ // SGPRs. applyMapping will have to deal with it as a waterfall loop.
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+
+ // Allow anything for implicit arguments
+ for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
+ if (MI.getOperand(I).isReg()) {
+ Register Reg = MI.getOperand(I).getReg();
+ auto OpBank = getRegBankID(Reg, MRI);
+ unsigned Size = getSizeInBits(Reg, MRI, *TRI);
+ OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
+ }
+ }
+ break;
+ }
case AMDGPU::G_LOAD:
case AMDGPU::G_ZEXTLOAD:
case AMDGPU::G_SEXTLOAD:
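The 64-bit handling added above leans on the identities in the in-line comment. The scalar C++ check below walks through the ctlz_zero_undef case (ffbh32 mimics the ffbh convention of returning -1, here ~0u, for a zero input; the whole 64-bit value is assumed non-zero, as the *_ZERO_UNDEF opcodes require). It also shows why a plain add is enough for the zero-undef forms: when the low half is zero its term wraps from ~0u + 32 to 31 and can never beat the high half's count, whereas the standalone ffbh/ffbl lowerings need uaddsat so an all-zero input still yields -1.

#include <algorithm>
#include <cstdint>

static unsigned ffbh32(uint32_t V) {   // count leading zeros; ~0u ("-1") if V == 0
  return V ? static_cast<unsigned>(__builtin_clz(V)) : ~0u;
}

static unsigned ctlz64(uint64_t X) {   // X != 0, as for G_CTLZ_ZERO_UNDEF
  unsigned Hi = static_cast<unsigned>(X >> 32);
  unsigned Lo = static_cast<unsigned>(X);
  // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
  return std::min(ffbh32(Hi), ffbh32(Lo) + 32u);
}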
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index 7e051e4a5424..2b9d0923ab49 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -89,9 +89,6 @@ public:
std::pair<Register, unsigned>
splitBufferOffsets(MachineIRBuilder &B, Register Offset) const;
- MachineInstr *selectStoreIntrinsic(MachineIRBuilder &B,
- MachineInstr &MI) const;
-
/// See RegisterBankInfo::applyMapping.
void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
index dabb4d006d99..d55bf3917e9c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
@@ -130,11 +130,9 @@ class ReplaceLDSUseImpl {
std::vector<GlobalVariable *> LDSGlobals = AMDGPU::findVariablesToLower(M);
// Remove LDS which don't qualify for replacement.
- LDSGlobals.erase(std::remove_if(LDSGlobals.begin(), LDSGlobals.end(),
- [&](GlobalVariable *GV) {
- return shouldIgnorePointerReplacement(GV);
- }),
- LDSGlobals.end());
+ llvm::erase_if(LDSGlobals, [&](GlobalVariable *GV) {
+ return shouldIgnorePointerReplacement(GV);
+ });
return LDSGlobals;
}
@@ -142,7 +140,7 @@ class ReplaceLDSUseImpl {
// Returns true if uses of given LDS global within non-kernel functions should
// be keep as it is without pointer replacement.
bool shouldIgnorePointerReplacement(GlobalVariable *GV) {
- // LDS whose size is very small and doesn`t exceed pointer size is not worth
+ // LDS whose size is very small and doesn't exceed pointer size is not worth
// replacing.
if (DL.getTypeAllocSize(GV->getValueType()) <= 2)
return true;
@@ -158,7 +156,7 @@ class ReplaceLDSUseImpl {
// Insert new global LDS pointer which points to LDS.
GlobalVariable *createLDSPointer(GlobalVariable *GV) {
- // LDS pointer which points to LDS is already created? return it.
+ // If an LDS pointer that points to this LDS global was already created, return it.
auto PointerEntry = LDSToPointer.insert(std::make_pair(GV, nullptr));
if (!PointerEntry.second)
return PointerEntry.first->second;
@@ -185,7 +183,7 @@ class ReplaceLDSUseImpl {
// Split entry basic block in such a way that only lane 0 of each wave does
// the LDS pointer initialization, and return newly created basic block.
BasicBlock *activateLaneZero(Function *K) {
- // If the entry basic block of kernel K is already splitted, then return
+ // If the entry basic block of kernel K is already split, then return
// newly created basic block.
auto BasicBlockEntry = KernelToInitBB.insert(std::make_pair(K, nullptr));
if (!BasicBlockEntry.second)
@@ -204,7 +202,7 @@ class ReplaceLDSUseImpl {
BasicBlock *NBB = SplitBlockAndInsertIfThen(Cond, WB, false)->getParent();
- // Mark that the entry basic block of kernel K is splitted.
+ // Mark that the entry basic block of kernel K is split.
KernelToInitBB[K] = NBB;
return NBB;
@@ -235,7 +233,7 @@ class ReplaceLDSUseImpl {
}
// We have created an LDS pointer for LDS, and initialized it to point-to LDS
- // within all relevent kernels. Now replace all the uses of LDS within
+ // within all relevant kernels. Now replace all the uses of LDS within
// non-kernel functions by LDS pointer.
void replaceLDSUseByPointer(GlobalVariable *GV, GlobalVariable *LDSPointer) {
SmallVector<User *, 8> LDSUsers(GV->users());
@@ -268,8 +266,8 @@ class ReplaceLDSUseImpl {
convertConstantExprsToInstructions(I, CE, &UserInsts);
}
- // Go through all the user instrutions, if LDS exist within them as an
- // operand, then replace it by replace instruction.
+ // Go through all the user instructions; if the LDS global appears as an
+ // operand, replace it with the replacement instruction.
for (auto *II : UserInsts) {
auto *ReplaceInst = getReplacementInst(F, GV, LDSPointer);
II->replaceUsesOfWith(GV, ReplaceInst);
@@ -373,7 +371,7 @@ bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) {
return false;
// We have created an LDS pointer for LDS, and initialized it to point-to LDS
- // within all relevent kernels. Now replace all the uses of LDS within
+ // within all relevant kernels. Now replace all the uses of LDS within
// non-kernel functions by LDS pointer.
replaceLDSUseByPointer(GV, LDSPointer);
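The filtering change above is mechanical: llvm::erase_if, declared in llvm/ADT/STLExtras.h, wraps the erase/remove_if idiom the old code spelled out by hand. A small usage sketch on an ordinary std::vector:

#include "llvm/ADT/STLExtras.h"
#include <vector>

static void dropSmallEntries(std::vector<int> &Sizes) {
  // Equivalent to Sizes.erase(std::remove_if(Sizes.begin(), Sizes.end(), Pred),
  //                           Sizes.end());
  llvm::erase_if(Sizes, [](int S) { return S <= 2; });
}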
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index ef46e53b7460..cb511e5e3483 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -29,6 +29,8 @@
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -61,7 +63,8 @@ static const Function *getCalleeFunction(const MachineOperand &Op) {
assert(Op.getImm() == 0);
return nullptr;
}
-
+ if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
+ return cast<Function>(GA->getOperand(0));
return cast<Function>(Op.getGlobal());
}
@@ -83,10 +86,15 @@ int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
}
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
+ const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
+ if (ST.hasGFX90AInsts() && ArgNumAGPR)
+ return alignTo(ArgNumVGPR, 4) + ArgNumAGPR;
+ return std::max(ArgNumVGPR, ArgNumAGPR);
+}
+
+int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
const GCNSubtarget &ST) const {
- if (ST.hasGFX90AInsts() && NumAGPR)
- return alignTo(NumVGPR, 4) + NumAGPR;
- return std::max(NumVGPR, NumAGPR);
+ return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}
bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {
@@ -444,6 +452,25 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
if (!IsIndirect)
I = CallGraphResourceInfo.find(Callee);
+ // FIXME: Call site could have norecurse on it
+ if (!Callee || !Callee->doesNotRecurse()) {
+ Info.HasRecursion = true;
+
+ // TODO: If we happen to know there is no stack usage in the
+ // callgraph, we don't need to assume an infinitely growing stack.
+ if (!MI.isReturn()) {
+ // We don't need to assume an unknown stack size for tail calls.
+
+ // FIXME: This only benefits in the case where the kernel does not
+ // directly call the tail called function. If a kernel directly
+ // calls a tail recursive function, we'll assume maximum stack size
+ // based on the regular call instruction.
+ CalleeFrameSize =
+ std::max(CalleeFrameSize,
+ static_cast<uint64_t>(AssumedStackSizeForExternalCall));
+ }
+ }
+
if (IsIndirect || I == CallGraphResourceInfo.end()) {
CalleeFrameSize =
std::max(CalleeFrameSize,
@@ -468,10 +495,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
Info.HasRecursion |= I->second.HasRecursion;
Info.HasIndirectCall |= I->second.HasIndirectCall;
}
-
- // FIXME: Call site could have norecurse on it
- if (!Callee || !Callee->doesNotRecurse())
- Info.HasRecursion = true;
}
}
}
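A worked instance of the accounting that the new getTotalNumVGPRs overload factors out (totalNumVGPRs below is a standalone stand-in, not the LLVM API; HasCombinedVGPRs corresponds to the ST.hasGFX90AInsts() check in the patch): when AGPRs are used on such subtargets, the VGPR count is rounded up to a multiple of 4 before the AGPRs are added; otherwise the larger of the two counts is reported, as before.

#include <algorithm>
#include <cstdint>

static int32_t totalNumVGPRs(bool HasCombinedVGPRs, int32_t NumAGPR, int32_t NumVGPR) {
  if (HasCombinedVGPRs && NumAGPR) {
    int32_t AlignedVGPRs = (NumVGPR + 3) / 4 * 4; // alignTo(NumVGPR, 4)
    return AlignedVGPRs + NumAGPR;                // e.g. 5 VGPRs + 2 AGPRs -> 8 + 2 = 10
  }
  return std::max(NumVGPR, NumAGPR);
}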
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
index 832e8119e444..b0a2d3bffc62 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -17,7 +17,6 @@
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/IR/ValueMap.h"
namespace llvm {
@@ -44,6 +43,10 @@ public:
bool HasIndirectCall = false;
int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
+ // The total number of VGPRs is really a combination of AGPRs and VGPRs,
+ // depending on the architecture, plus some alignment constraints.
+ int32_t getTotalNumVGPRs(const GCNSubtarget &ST, int32_t NumAGPR,
+ int32_t NumVGPR) const;
int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index e2aafa25142e..45f7c2f369bd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -142,8 +142,8 @@ bool AMDGPURewriteOutArguments::checkArgumentUses(Value &Arg) const {
// Special case handle structs with single members. It is useful to handle
// some casts between structs and non-structs, but we can't bitcast
- // directly between them. directly bitcast between them. Blender uses
- // some casts that look like { <3 x float> }* to <4 x float>*
+ // directly between them. Blender uses some casts that look like
+ // { <3 x float> }* to <4 x float>*
if ((SrcEltTy->isStructTy() && (SrcEltTy->getStructNumElements() != 1)))
return false;
@@ -259,7 +259,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
// Keep retrying if we are able to successfully eliminate an argument. This
// helps with cases with multiple arguments which may alias, such as in a
- // sincos implemntation. If we have 2 stores to arguments, on the first
+ // sincos implementation. If we have 2 stores to arguments, on the first
// attempt the MDA query will succeed for the second store but not the
// first. On the second iteration we've removed that out clobbering argument
// (by effectively moving it into another function) and will find the second
@@ -357,7 +357,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
RetAttrs.addAttribute(Attribute::SExt);
RetAttrs.addAttribute(Attribute::ZExt);
RetAttrs.addAttribute(Attribute::NoAlias);
- NewFunc->removeAttributes(AttributeList::ReturnIndex, RetAttrs);
+ NewFunc->removeRetAttrs(RetAttrs);
// TODO: How to preserve metadata?
// Move the body of the function into the new rewritten function, and replace
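The removeRetAttrs call above, like the addRetAttr / addDereferenceableRetAttr / addDereferenceableParamAttr calls in the AMDGPUPromoteAlloca hunks earlier in this patch, is part of the move away from position-based AttributeList indices. A short sketch of the new-style calls as used in this patch (CI is assumed to be an existing CallInst; the byte counts are illustrative):

// Return-attribute helpers replace addAttribute(AttributeList::ReturnIndex, ...).
CI->addRetAttr(Attribute::NoAlias);
CI->addDereferenceableRetAttr(64);
// The parameter helpers are 0-based, unlike the old attribute-list indices
// where parameter 0 lived at index 1 (hence the 1..3 -> 0..2 loop change above).
CI->addDereferenceableParamAttr(0, 16);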
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 0c5020dccecd..0655b4342ba1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -12,12 +12,12 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
-#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
@@ -38,10 +38,7 @@ using namespace llvm;
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
-#define GET_SUBTARGETINFO_TARGET_DESC
-#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
-#include "R600GenSubtargetInfo.inc"
static cl::opt<bool> DisablePowerSched(
"amdgpu-disable-power-sched",
@@ -64,19 +61,6 @@ static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
GCNSubtarget::~GCNSubtarget() = default;
-R600Subtarget &
-R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
- StringRef GPU, StringRef FS) {
- SmallString<256> FullFS("+promote-alloca,");
- FullFS += FS;
- ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
-
- HasMulU24 = getGeneration() >= EVERGREEN;
- HasMulI24 = hasCaymanISA();
-
- return *this;
-}
-
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS) {
@@ -98,12 +82,12 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
// Disable mutually exclusive bits.
- if (FS.find_insensitive("+wavefrontsize") != StringRef::npos) {
- if (FS.find_insensitive("wavefrontsize16") == StringRef::npos)
+ if (FS.contains_insensitive("+wavefrontsize")) {
+ if (!FS.contains_insensitive("wavefrontsize16"))
FullFS += "-wavefrontsize16,";
- if (FS.find_insensitive("wavefrontsize32") == StringRef::npos)
+ if (!FS.contains_insensitive("wavefrontsize32"))
FullFS += "-wavefrontsize32,";
- if (FS.find_insensitive("wavefrontsize64") == StringRef::npos)
+ if (!FS.contains_insensitive("wavefrontsize64"))
FullFS += "-wavefrontsize64,";
}
@@ -549,13 +533,10 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
}
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
- const Function &F) const {
+ const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
// Default minimum/maximum number of waves per execution unit.
std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
- // Default/requested minimum/maximum flat work group sizes.
- std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
-
// If minimum/maximum flat work group sizes were explicitly requested using
// "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
// number of waves per execution unit to values implied by requested
@@ -563,8 +544,6 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
unsigned MinImpliedByFlatWorkGroupSize =
getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
Default.first = MinImpliedByFlatWorkGroupSize;
- bool RequestedFlatWorkGroupSize =
- F.hasFnAttribute("amdgpu-flat-work-group-size");
// Requested minimum/maximum number of waves per execution unit.
std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
@@ -581,8 +560,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
// Make sure requested values are compatible with values implied by requested
// minimum/maximum flat work group sizes.
- if (RequestedFlatWorkGroupSize &&
- Requested.first < MinImpliedByFlatWorkGroupSize)
+ if (Requested.first < MinImpliedByFlatWorkGroupSize)
return Default;
return Requested;
@@ -710,6 +688,7 @@ unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
if (ImplicitBytes != 0) {
const Align Alignment = getAlignmentForImplicitArgPtr();
TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
+ MaxAlign = std::max(MaxAlign, Alignment);
}
// Being able to dereference past the end is useful for emitting scalar loads.
@@ -721,23 +700,6 @@ AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
: AMDGPUDwarfFlavour::Wave64;
}
-R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const TargetMachine &TM) :
- R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
- AMDGPUSubtarget(TT),
- InstrInfo(*this),
- FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
- FMA(false),
- CaymanISA(false),
- CFALUBug(false),
- HasVertexCache(false),
- R600ALUInst(false),
- FP64(false),
- TexVTXClauseSize(0),
- Gen(R600),
- TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
- InstrItins(getInstrItineraryForCPU(GPU)) { }
-
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const {
// Track register pressure so the scheduler can try to decrease
@@ -805,7 +767,7 @@ GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
if (getGeneration() >= AMDGPUSubtarget::GFX10)
return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
- if (HasFlatScratchInit) {
+ if (HasFlatScratchInit || HasArchitectedFlatScratch) {
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
@@ -1003,6 +965,13 @@ void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
--Lat;
}
Dep.setLatency(Lat);
+ } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
+ // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
+ // implicit operands which come from the MCInstrDesc, which can fool
+ // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
+ // pseudo operands.
+ Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
+ DefI, DefOpIdx, UseI, UseOpIdx));
}
}
@@ -1052,7 +1021,7 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
return true;
}
- // Link as much SALU intructions in chain as possible. Return the size
+ // Link as many SALU instructions in a chain as possible. Return the size
// of the chain. Links up to MaxChain instructions.
unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
SmallPtrSetImpl<SUnit *> &Visited) const {
@@ -1136,6 +1105,11 @@ void GCNSubtarget::getPostRAMutations(
Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}
+std::unique_ptr<ScheduleDAGMutation>
+GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
+ return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
+}
+
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index b160cdf3a97a..88ed4b2b7a24 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -91,7 +91,18 @@ public:
/// be converted to integer, violate subtarget's specifications, or are not
/// compatible with minimum/maximum number of waves limited by flat work group
/// size, register usage, and/or lds usage.
- std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
+ std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const {
+ // Default/requested minimum/maximum flat work group sizes.
+ std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
+ return getWavesPerEU(F, FlatWorkGroupSizes);
+ }
+
+ /// Overload which uses the specified values for the flat work group sizes,
+ /// rather than querying the function itself. \p FlatWorkGroupSizes should
+ /// correspond to the function's value of getFlatWorkGroupSizes().
+ std::pair<unsigned, unsigned>
+ getWavesPerEU(const Function &F,
+ std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
/// Return the amount of LDS that can be used that will not restrict the
/// occupancy lower than WaveCount.
@@ -240,7 +251,7 @@ public:
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
- /// \returns Corresponsing DWARF register number mapping flavour for the
+ /// \returns Corresponding DWARF register number mapping flavour for the
/// \p WavefrontSize.
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;
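A small usage sketch of the new overload declared above (ST and F stand for an AMDGPUSubtarget reference and a Function known to the caller, which are assumptions of the example rather than part of the patch): callers that have already computed the flat work group sizes can pass them in instead of having getWavesPerEU re-read the function attributes.

static std::pair<unsigned, unsigned>
wavesPerEUFor(const AMDGPUSubtarget &ST, const Function &F) {
  // Reuse the already-computed flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
  return ST.getWavesPerEU(F, FlatWorkGroupSizes);
}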
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e4485f87fb79..de11676279f2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -8,7 +8,7 @@
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
-/// information needed to emit code for R600 and SI GPUs.
+/// information needed to emit code for SI+ GPUs.
//
//===----------------------------------------------------------------------===//
@@ -21,7 +21,8 @@
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
-#include "R600MachineScheduler.h"
+#include "R600.h"
+#include "R600TargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
@@ -35,11 +36,13 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
@@ -162,12 +165,6 @@ static VGPRRegisterRegAlloc fastRegAllocVGPR(
"fast", "fast register allocator", createFastVGPRRegisterAllocator);
}
-
-static cl::opt<bool> EnableR600StructurizeCFG(
- "r600-ir-structurize",
- cl::desc("Use StructurizeCFG IR pass"),
- cl::init(true));
-
static cl::opt<bool> EnableSROA(
"amdgpu-sroa",
cl::desc("Run SROA after promote alloca pass"),
@@ -184,12 +181,6 @@ OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
cl::desc("Run pre-RA exec mask optimizations"),
cl::init(true));
-static cl::opt<bool> EnableR600IfConvert(
- "r600-if-convert",
- cl::desc("Use if conversion pass"),
- cl::ReallyHidden,
- cl::init(true));
-
// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
"amdgpu-load-store-vectorizer",
@@ -240,13 +231,6 @@ static cl::opt<bool, true> LateCFGStructurize(
cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
cl::Hidden);
-static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
- "amdgpu-function-calls",
- cl::desc("Enable AMDGPU function call support"),
- cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
- cl::init(true),
- cl::Hidden);
-
static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt(
"amdgpu-fixed-function-abi",
cl::desc("Enable all implicit function arguments"),
@@ -324,6 +308,11 @@ static cl::opt<bool> EnablePreRAOptimizations(
cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
cl::Hidden);
+static cl::opt<bool> EnablePromoteKernelArguments(
+ "amdgpu-enable-promote-kernel-arguments",
+ cl::desc("Enable promotion of flat kernel pointer arguments to global"),
+ cl::Hidden, cl::init(true));
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -349,6 +338,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIOptimizeVGPRLiveRangePass(*PR);
initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUFixFunctionBitcastsPass(*PR);
+ initializeAMDGPUCtorDtorLoweringPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
initializeAMDGPUAttributorPass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
@@ -356,6 +346,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUArgumentUsageInfoPass(*PR);
initializeAMDGPUAtomicOptimizerPass(*PR);
initializeAMDGPULowerKernelArgumentsPass(*PR);
+ initializeAMDGPUPromoteKernelArgumentsPass(*PR);
initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPULowerIntrinsicsPass(*PR);
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
@@ -400,10 +391,6 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
return std::make_unique<AMDGPUTargetObjectFile>();
}
-static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
- return new ScheduleDAGMILive(C, std::make_unique<R600SchedStrategy>());
-}
-
static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
return new SIScheduleDAGMI(C);
}
@@ -441,10 +428,6 @@ createIterativeILPMachineScheduler(MachineSchedContext *C) {
}
static MachineSchedRegistry
-R600SchedRegistry("r600", "Run R600's custom scheduler",
- createR600MachineScheduler);
-
-static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
createSIMachineScheduler);
@@ -542,7 +525,9 @@ StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
if (const Function *F = dyn_cast<Function>(&GV))
- return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());
+ return F->isDeclaration() || F->getName().startswith("__asan_") ||
+ F->getName().startswith("__sanitizer_") ||
+ AMDGPU::isEntryFunctionCC(F->getCallingConv());
GV.removeDeadConstantUsers();
return !GV.use_empty();
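A hedged sketch of how a predicate of this shape is typically consumed: the new-pass-manager InternalizePass accepts a callable that returns true for symbols that must keep external linkage. The pass-manager variable below is assumed, not shown in the patch.

#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/IPO/Internalize.h"
using namespace llvm;

// Sketch only: wire the predicate defined above into the internalizer.
void addInternalization(ModulePassManager &MPM) {
  // Declarations, __asan_*/__sanitizer_* runtime hooks, and kernel entry
  // points stay external; everything else becomes internal and is then a
  // candidate for GlobalDCE.
  MPM.addPass(InternalizePass(mustPreserveGV));
}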
@@ -556,6 +541,8 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
+ bool PromoteKernelArguments =
+ EnablePromoteKernelArguments && getOptLevel() > CodeGenOpt::Less;
if (EnableFunctionCalls) {
delete Builder.Inliner;
@@ -597,7 +584,14 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
Builder.addExtension(
PassManagerBuilder::EP_CGSCCOptimizerLate,
- [EnableOpt](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ [EnableOpt, PromoteKernelArguments](const PassManagerBuilder &,
+ legacy::PassManagerBase &PM) {
+ // Add the promote kernel arguments pass to the opt pipeline right before
+ // the infer address spaces pass, which performs the actual address space
+ // rewriting.
+ if (PromoteKernelArguments)
+ PM.add(createAMDGPUPromoteKernelArgumentsPass());
+
// Add infer address spaces pass to the opt pipeline after inlining
// but before SROA to increase SROA opportunities.
PM.add(createInferAddressSpacesPass());
@@ -674,6 +668,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
return true;
}
+ if (PassName == "amdgpu-promote-kernel-arguments") {
+ PM.addPass(AMDGPUPromoteKernelArgumentsPass());
+ return true;
+ }
return false;
});
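The check added here extends an existing pass-name parsing callback (its registration lies outside this excerpt). Under that assumption, a rough sketch of the registration shape that lets `opt -passes=amdgpu-promote-kernel-arguments` resolve to the new pass:

// Sketch; PB is the PassBuilder handed to registerPassBuilderCallbacks, and
// the lambda signature matches PassBuilder::registerPipelineParsingCallback
// for function pipelines.
PB.registerPipelineParsingCallback(
    [](StringRef PassName, FunctionPassManager &FPM,
       ArrayRef<PassBuilder::PipelineElement>) {
      if (PassName == "amdgpu-promote-kernel-arguments") {
        FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
        return true;
      }
      return false;
    });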
@@ -690,19 +688,18 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
});
PB.registerPipelineStartEPCallback(
- [this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) {
+ [this](ModulePassManager &PM, OptimizationLevel Level) {
FunctionPassManager FPM;
FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
FPM.addPass(AMDGPUUseNativeCallsPass());
- if (EnableLibCallSimplify &&
- Level != PassBuilder::OptimizationLevel::O0)
+ if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
});
PB.registerPipelineEarlySimplificationEPCallback(
- [this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) {
- if (Level == PassBuilder::OptimizationLevel::O0)
+ [this](ModulePassManager &PM, OptimizationLevel Level) {
+ if (Level == OptimizationLevel::O0)
return;
PM.addPass(AMDGPUUnifyMetadataPass());
@@ -720,12 +717,19 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
});
PB.registerCGSCCOptimizerLateEPCallback(
- [this](CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) {
- if (Level == PassBuilder::OptimizationLevel::O0)
+ [this](CGSCCPassManager &PM, OptimizationLevel Level) {
+ if (Level == OptimizationLevel::O0)
return;
FunctionPassManager FPM;
+ // Add the promote kernel arguments pass to the opt pipeline right before
+ // the infer address spaces pass, which performs the actual address space
+ // rewriting.
+ if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
+ EnablePromoteKernelArguments)
+ FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
+
// Add infer address spaces pass to the opt pipeline after inlining
// but before SROA to increase SROA opportunities.
FPM.addPass(InferAddressSpacesPass());
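The guard above intentionally compares speedup components rather than whole levels. A standalone sketch of that check, assuming OptimizationLevel now lives in llvm/Passes/OptimizationLevel.h (the rename from PassBuilder::OptimizationLevel in this diff reflects that move); the helper name is made up.

#include "llvm/Passes/OptimizationLevel.h"

// True when the speedup component exceeds O1's, i.e. the pipeline is not
// O0 or O1; size-optimizing levels such as Os also satisfy this.
static bool runsAboveO1(llvm::OptimizationLevel Level) {
  return Level.getSpeedupLevel() >
         llvm::OptimizationLevel::O1.getSpeedupLevel();
}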
@@ -734,7 +738,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
// anything, and before other cleanup optimizations.
FPM.addPass(AMDGPULowerKernelAttributesPass());
- if (Level != PassBuilder::OptimizationLevel::O0) {
+ if (Level != OptimizationLevel::O0) {
// Promote alloca to vector before SROA and loop unroll. If we
// manage to eliminate allocas before unroll we may choose to unroll
// less.
@@ -745,45 +749,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
});
}
-//===----------------------------------------------------------------------===//
-// R600 Target Machine (R600 -> Cayman)
-//===----------------------------------------------------------------------===//
-
-R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
- StringRef CPU, StringRef FS,
- TargetOptions Options,
- Optional<Reloc::Model> RM,
- Optional<CodeModel::Model> CM,
- CodeGenOpt::Level OL, bool JIT)
- : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
- setRequiresStructuredCFG(true);
-
- // Override the default since calls aren't supported for r600.
- if (EnableFunctionCalls &&
- EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0)
- EnableFunctionCalls = false;
-}
-
-const R600Subtarget *R600TargetMachine::getSubtargetImpl(
- const Function &F) const {
- StringRef GPU = getGPUName(F);
- StringRef FS = getFeatureString(F);
-
- SmallString<128> SubtargetKey(GPU);
- SubtargetKey.append(FS);
-
- auto &I = SubtargetMap[SubtargetKey];
- if (!I) {
- // This needs to be done before we create a new subtarget since any
- // creation will depend on the TM and the code generation flags on the
- // function that reside in TargetOptions.
- resetTargetOptions(F);
- I = std::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
- }
-
- return I.get();
-}
-
int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
@@ -817,9 +782,31 @@ unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
return AMDGPUAS::GLOBAL_ADDRESS;
}
-TargetTransformInfo
-R600TargetMachine::getTargetTransformInfo(const Function &F) {
- return TargetTransformInfo(R600TTIImpl(this, F));
+std::pair<const Value *, unsigned>
+AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
+ if (auto *II = dyn_cast<IntrinsicInst>(V)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::amdgcn_is_shared:
+ return std::make_pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
+ case Intrinsic::amdgcn_is_private:
+ return std::make_pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
+ default:
+ break;
+ }
+ return std::make_pair(nullptr, -1);
+ }
+ // Check the global pointer predication based on
+ // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative and
+ // the order of 'is_shared' and 'is_private' is not significant.
+ Value *Ptr;
+ if (match(
+ const_cast<Value *>(V),
+ m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
+ m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
+ m_Deferred(Ptr))))))
+ return std::make_pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
+
+ return std::make_pair(nullptr, -1);
}
//===----------------------------------------------------------------------===//
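For readers unfamiliar with the PatternMatch combinators used in the hunk above, here is a standalone sketch of the same idiom: recognize (!llvm.amdgcn.is.shared(p) && !llvm.amdgcn.is.private(p)) and extract p. It assumes the usual LLVM headers and is not the exact function from the patch.

#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Returns the pointer being tested, or nullptr if the value is not the
// (!is_shared(p) && !is_private(p)) predicate.
static Value *matchGlobalPredicate(Value *V) {
  Value *Ptr = nullptr;
  // m_c_And tries both operand orders; m_Deferred requires the second
  // intrinsic to see the same pointer already bound by m_Value(Ptr).
  if (match(V, m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(
                           m_Value(Ptr))),
                       m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                           m_Deferred(Ptr))))))
    return Ptr;
  return nullptr;
}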
@@ -834,7 +821,8 @@ GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL, bool JIT)
: AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
-const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
+const TargetSubtargetInfo *
+GCNTargetMachine::getSubtargetImpl(const Function &F) const {
StringRef GPU = getGPUName(F);
StringRef FS = getFeatureString(F);
@@ -864,76 +852,11 @@ GCNTargetMachine::getTargetTransformInfo(const Function &F) {
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//
-namespace {
-
-class AMDGPUPassConfig : public TargetPassConfig {
-public:
- AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {
- // Exceptions and StackMaps are not supported, so these passes will never do
- // anything.
- disablePass(&StackMapLivenessID);
- disablePass(&FuncletLayoutID);
- // Garbage collection is not supported.
- disablePass(&GCLoweringID);
- disablePass(&ShadowStackGCLoweringID);
- }
-
- AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
- return getTM<AMDGPUTargetMachine>();
- }
-
- ScheduleDAGInstrs *
- createMachineScheduler(MachineSchedContext *C) const override {
- ScheduleDAGMILive *DAG = createGenericSchedLive(C);
- DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
- return DAG;
- }
-
- void addEarlyCSEOrGVNPass();
- void addStraightLineScalarOptimizationPasses();
- void addIRPasses() override;
- void addCodeGenPrepare() override;
- bool addPreISel() override;
- bool addInstSelector() override;
- bool addGCPasses() override;
-
- std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
-
- /// Check if a pass is enabled given \p Opt option. The option always
- /// overrides defaults if explicitely used. Otherwise its default will
- /// be used given that a pass shall work at an optimization \p Level
- /// minimum.
- bool isPassEnabled(const cl::opt<bool> &Opt,
- CodeGenOpt::Level Level = CodeGenOpt::Default) const {
- if (Opt.getNumOccurrences())
- return Opt;
- if (TM->getOptLevel() < Level)
- return false;
- return Opt;
- }
-};
-
-std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const {
+std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
return getStandardCSEConfigForOpt(TM->getOptLevel());
}
-class R600PassConfig final : public AMDGPUPassConfig {
-public:
- R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
- : AMDGPUPassConfig(TM, PM) {}
-
- ScheduleDAGInstrs *createMachineScheduler(
- MachineSchedContext *C) const override {
- return createR600MachineScheduler(C);
- }
-
- bool addPreISel() override;
- bool addInstSelector() override;
- void addPreRegAlloc() override;
- void addPreSched2() override;
- void addPreEmitPass() override;
-};
+namespace {
class GCNPassConfig final : public AMDGPUPassConfig {
public:
@@ -943,6 +866,7 @@ public:
// allow calls without EnableAMDGPUFunctionCalls if they are marked
// noinline, so this is always required.
setRequiresCodeGenSCCOrder(true);
+ substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
}
GCNTargetMachine &getGCNTargetMachine() const {
@@ -952,6 +876,15 @@ public:
ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext *C) const override;
+ ScheduleDAGInstrs *
+ createPostMachineScheduler(MachineSchedContext *C) const override {
+ ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
+ const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
+ DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
+ return DAG;
+ }
+
bool addPreISel() override;
void addMachineSSAOptimization() override;
bool addILPOpts() override;
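Two of the hunks above work together: the GCNPassConfig constructor swaps the legacy post-RA list scheduler for the post-RA MachineScheduler, and createPostMachineScheduler then attaches the target's DAG mutations. A condensed, hypothetical pass-config sketch of that pattern (names are illustrative, not the literal GCNPassConfig):

#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
using namespace llvm;

class ExamplePassConfig : public TargetPassConfig {   // hypothetical name
public:
  ExamplePassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {
    // Run the post-RA MachineScheduler instead of the legacy post-RA list
    // scheduler, so the override below actually gets used.
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
    // Mutations adjust the DAG before scheduling; the patch adds load
    // clustering plus the subtarget's MFMA shadow-filling mutation here.
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    return DAG;
  }
};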
@@ -982,6 +915,17 @@ public:
} // end anonymous namespace
+AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {
+ // Exceptions and StackMaps are not supported, so these passes will never do
+ // anything.
+ disablePass(&StackMapLivenessID);
+ disablePass(&FuncletLayoutID);
+ // Garbage collection is not supported.
+ disablePass(&GCLoweringID);
+ disablePass(&ShadowStackGCLoweringID);
+}
+
void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
if (getOptLevel() == CodeGenOpt::Aggressive)
addPass(createGVNPass());
@@ -993,7 +937,7 @@ void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
addPass(createLICMPass());
addPass(createSeparateConstOffsetFromGEPPass());
addPass(createSpeculativeExecutionPass());
- // ReassociateGEPs exposes more opportunites for SLSR. See
+ // ReassociateGEPs exposes more opportunities for SLSR. See
// the example in reassociate-geps-and-slsr.ll.
addPass(createStraightLineStrengthReducePass());
// SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
@@ -1015,6 +959,7 @@ void AMDGPUPassConfig::addIRPasses() {
disablePass(&PatchableFunctionID);
addPass(createAMDGPUPrintfRuntimeBinding());
+ addPass(createAMDGPUCtorDtorLoweringPass());
// This must occur before inlining, as the inliner will not look through
// bitcast calls.
@@ -1100,8 +1045,13 @@ void AMDGPUPassConfig::addIRPasses() {
}
void AMDGPUPassConfig::addCodeGenPrepare() {
- if (TM->getTargetTriple().getArch() == Triple::amdgcn)
+ if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
+ addPass(createAMDGPUAttributorPass());
+
+ // FIXME: This pass adds 2 hacky attributes that can be replaced with an
+ // analysis, and should be removed.
addPass(createAMDGPUAnnotateKernelFeaturesPass());
+ }
if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
EnableLowerKernelArguments)
@@ -1126,8 +1076,7 @@ bool AMDGPUPassConfig::addPreISel() {
}
bool AMDGPUPassConfig::addInstSelector() {
- // Defer the verifier until FinalizeISel.
- addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false);
+ addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
return false;
}
@@ -1136,44 +1085,11 @@ bool AMDGPUPassConfig::addGCPasses() {
return false;
}
-//===----------------------------------------------------------------------===//
-// R600 Pass Setup
-//===----------------------------------------------------------------------===//
-
-bool R600PassConfig::addPreISel() {
- AMDGPUPassConfig::addPreISel();
-
- if (EnableR600StructurizeCFG)
- addPass(createStructurizeCFGPass());
- return false;
-}
-
-bool R600PassConfig::addInstSelector() {
- addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
- return false;
-}
-
-void R600PassConfig::addPreRegAlloc() {
- addPass(createR600VectorRegMerger());
-}
-
-void R600PassConfig::addPreSched2() {
- addPass(createR600EmitClauseMarkers(), false);
- if (EnableR600IfConvert)
- addPass(&IfConverterID, false);
- addPass(createR600ClauseMergePass(), false);
-}
-
-void R600PassConfig::addPreEmitPass() {
- addPass(createAMDGPUCFGStructurizerPass(), false);
- addPass(createR600ExpandSpecialInstrsPass(), false);
- addPass(&FinalizeMachineBundlesID, false);
- addPass(createR600Packetizer(), false);
- addPass(createR600ControlFlowFinalizer(), false);
-}
-
-TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
- return new R600PassConfig(*this, PM);
+llvm::ScheduleDAGInstrs *
+AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
+ ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+ DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ return DAG;
}
//===----------------------------------------------------------------------===//
@@ -1311,7 +1227,7 @@ void GCNPassConfig::addFastRegAlloc() {
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
- insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
+ insertPass(&PHIEliminationID, &SILowerControlFlowID);
insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);
@@ -1341,11 +1257,11 @@ void GCNPassConfig::addOptimizedRegAlloc() {
// the register in LiveVariables, this would trigger a failure in verifier,
// we should fix it and enable the verifier.
if (OptVGPRLiveRange)
- insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID, false);
+ insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
- insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
+ insertPass(&PHIEliminationID, &SILowerControlFlowID);
if (EnableDCEInRA)
insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
@@ -1418,7 +1334,7 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
// Commit allocated register changes. This is mostly necessary because too
// many things rely on the use lists of the physical registers, such as the
// verifier. This is only necessary with allocators which use LiveIntervals,
- // since FastRegAlloc does the replacments itself.
+ // since FastRegAlloc does the replacements itself.
addPass(createVirtRegRewriter(false));
// Equivalent of PEI for SGPRs.
@@ -1440,6 +1356,8 @@ void GCNPassConfig::addPostRegAlloc() {
}
void GCNPassConfig::addPreSched2() {
+ if (TM->getOptLevel() > CodeGenOpt::None)
+ addPass(createSIShrinkInstructionsPass());
addPass(&SIPostRABundlerID);
}
@@ -1447,9 +1365,6 @@ void GCNPassConfig::addPreEmitPass() {
addPass(createSIMemoryLegalizerPass());
addPass(createSIInsertWaitcntsPass());
- if (TM->getOptLevel() > CodeGenOpt::None)
- addPass(createSIShrinkInstructionsPass());
-
addPass(createSIModeRegisterPass());
if (getOptLevel() > CodeGenOpt::None)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 1bfe026d080c..0ff2db2a52d9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -15,11 +15,14 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
#include "GCNSubtarget.h"
-#include "R600Subtarget.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"
+#include <utility>
namespace llvm {
+class ScheduleDAGMILive;
+
//===----------------------------------------------------------------------===//
// AMDGPU Target Machine (R600+)
//===----------------------------------------------------------------------===//
@@ -61,31 +64,9 @@ public:
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
unsigned getAssumedAddrSpace(const Value *V) const override;
-};
-
-//===----------------------------------------------------------------------===//
-// R600 Target Machine (R600 -> Cayman)
-//===----------------------------------------------------------------------===//
-
-class R600TargetMachine final : public AMDGPUTargetMachine {
-private:
- mutable StringMap<std::unique_ptr<R600Subtarget>> SubtargetMap;
-public:
- R600TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, TargetOptions Options,
- Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
- CodeGenOpt::Level OL, bool JIT);
-
- TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
-
- const R600Subtarget *getSubtargetImpl(const Function &) const override;
-
- TargetTransformInfo getTargetTransformInfo(const Function &F) override;
-
- bool isMachineVerifierClean() const override {
- return false;
- }
+ std::pair<const Value *, unsigned>
+ getPredicatedAddrSpace(const Value *V) const override;
};
//===----------------------------------------------------------------------===//
@@ -104,7 +85,7 @@ public:
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- const GCNSubtarget *getSubtargetImpl(const Function &) const override;
+ const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override;
TargetTransformInfo getTargetTransformInfo(const Function &F) override;
@@ -121,6 +102,45 @@ public:
SMRange &SourceRange) const override;
};
+//===----------------------------------------------------------------------===//
+// AMDGPU Pass Setup
+//===----------------------------------------------------------------------===//
+
+class AMDGPUPassConfig : public TargetPassConfig {
+public:
+ AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM);
+
+ AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
+ return getTM<AMDGPUTargetMachine>();
+ }
+
+ ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const override;
+
+ void addEarlyCSEOrGVNPass();
+ void addStraightLineScalarOptimizationPasses();
+ void addIRPasses() override;
+ void addCodeGenPrepare() override;
+ bool addPreISel() override;
+ bool addInstSelector() override;
+ bool addGCPasses() override;
+
+ std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
+
+ /// Check if a pass is enabled given the \p Opt option. An explicitly set
+ /// option always overrides the default. Otherwise the default is honored
+ /// only when the compile runs at optimization level \p Level or higher,
+ /// the minimum at which the pass is expected to do useful work.
+ bool isPassEnabled(const cl::opt<bool> &Opt,
+ CodeGenOpt::Level Level = CodeGenOpt::Default) const {
+ if (Opt.getNumOccurrences())
+ return Opt;
+ if (TM->getOptLevel() < Level)
+ return false;
+ return Opt;
+ }
+};
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
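A usage sketch of isPassEnabled above: a flag the user set explicitly always wins, while an untouched flag's default only takes effect when compiling at or above the pass's minimum level. The flag and the pass factory named below are hypothetical.

#include "llvm/Support/CommandLine.h"

static llvm::cl::opt<bool> EnableExamplePass(
    "amdgpu-example-pass",                        // made-up option name
    llvm::cl::desc("Enable the example pass"), llvm::cl::init(true),
    llvm::cl::Hidden);

// Inside an AMDGPUPassConfig member one would write something like:
//   if (isPassEnabled(EnableExamplePass, CodeGenOpt::Aggressive))
//     addPass(createExamplePass());              // made-up factory
// With no -amdgpu-example-pass on the command line the pass only runs at -O3;
// passing -amdgpu-example-pass=0/1 forces it off or on regardless of level.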
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 63f449f7a726..ecdbdf613a53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -16,10 +16,11 @@
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
@@ -101,7 +102,8 @@ AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
TLI(ST->getTargetLowering()) {}
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP) {
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE) {
const Function &F = *L->getHeader()->getParent();
UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
UP.MaxCount = std::numeric_limits<unsigned>::max();
@@ -503,7 +505,7 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
Info.ReadMem = true;
Info.WriteMem = true;
- Info.IsVolatile = !Volatile->isNullValue();
+ Info.IsVolatile = !Volatile->isZero();
return true;
}
default:
@@ -1224,8 +1226,9 @@ unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
}
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP) {
- CommonTTI.getUnrollingPreferences(L, SE, UP);
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE) {
+ CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
}
void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
@@ -1239,122 +1242,3 @@ int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
: ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
: getQuarterRateInstrCost(CostKind);
}
-
-R600TTIImpl::R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
- : BaseT(TM, F.getParent()->getDataLayout()),
- ST(static_cast<const R600Subtarget *>(TM->getSubtargetImpl(F))),
- TLI(ST->getTargetLowering()), CommonTTI(TM, F) {}
-
-unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
- return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
-}
-
-unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
- return getHardwareNumberOfRegisters(Vec);
-}
-
-TypeSize
-R600TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
- return TypeSize::getFixed(32);
-}
-
-unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
- return 32;
-}
-
-unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
- if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
- AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
- return 128;
- if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- AddrSpace == AMDGPUAS::REGION_ADDRESS)
- return 64;
- if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
- return 32;
-
- if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
- AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
- (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
- AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
- return 128;
- llvm_unreachable("unhandled address space");
-}
-
-bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
- Align Alignment,
- unsigned AddrSpace) const {
- // We allow vectorization of flat stores, even though we may need to decompose
- // them later if they may access private memory. We don't have enough context
- // here, and legalization can handle it.
- return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
-}
-
-bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
- Align Alignment,
- unsigned AddrSpace) const {
- return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
-}
-
-bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
- Align Alignment,
- unsigned AddrSpace) const {
- return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
-}
-
-unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
- // Disable unrolling if the loop is not vectorized.
- // TODO: Enable this again.
- if (VF == 1)
- return 1;
-
- return 8;
-}
-
-InstructionCost R600TTIImpl::getCFInstrCost(unsigned Opcode,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
- if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
- return Opcode == Instruction::PHI ? 0 : 1;
-
- // XXX - For some reason this isn't called for switch.
- switch (Opcode) {
- case Instruction::Br:
- case Instruction::Ret:
- return 10;
- default:
- return BaseT::getCFInstrCost(Opcode, CostKind, I);
- }
-}
-
-InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
- unsigned Index) {
- switch (Opcode) {
- case Instruction::ExtractElement:
- case Instruction::InsertElement: {
- unsigned EltSize
- = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
- if (EltSize < 32) {
- return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
- }
-
- // Extracts are just reads of a subregister, so are free. Inserts are
- // considered free because we don't want to have any cost for scalarizing
- // operations, and we don't have to copy into a different register class.
-
- // Dynamic indexing isn't free and is best avoided.
- return Index == ~0u ? 2 : 0;
- }
- default:
- return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
- }
-}
-
-void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP) {
- CommonTTI.getUnrollingPreferences(L, SE, UP);
-}
-
-void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::PeelingPreferences &PP) {
- CommonTTI.getPeelingPreferences(L, SE, PP);
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 37c0756eb7a8..e901b5c5747d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -18,18 +18,14 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
namespace llvm {
-class AMDGPUTargetLowering;
class AMDGPUTargetMachine;
class GCNSubtarget;
class InstCombiner;
class Loop;
-class R600Subtarget;
class ScalarEvolution;
class SITargetLowering;
class Type;
@@ -53,7 +49,8 @@ public:
explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP);
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE);
void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP);
@@ -82,24 +79,21 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
return TargetTransformInfo::TCC_Basic;
}
- static inline int getHalfRateInstrCost(
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
+ static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
return CostKind == TTI::TCK_CodeSize ? 2
: 2 * TargetTransformInfo::TCC_Basic;
}
// TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
// should be 2 or 4.
- static inline int getQuarterRateInstrCost(
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
+ static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
return CostKind == TTI::TCK_CodeSize ? 2
: 4 * TargetTransformInfo::TCC_Basic;
}
// On some parts, normal fp64 operations are half rate, and others
// quarter. This also applies to some integer operations.
- int get64BitInstrCost(
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
+ int get64BitInstrCost(TTI::TargetCostKind CostKind) const;
public:
explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
@@ -108,7 +102,8 @@ public:
bool useGPUDivergenceAnalysis() const;
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP);
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE);
void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP);
@@ -154,8 +149,7 @@ public:
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
InstructionCost getArithmeticInstrCost(
- unsigned Opcode, Type *Ty,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -184,6 +178,12 @@ public:
bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const;
+
+ bool canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const {
+ return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
+ AS != AMDGPUAS::PRIVATE_ADDRESS;
+ }
+
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const;
@@ -213,51 +213,13 @@ public:
InstructionCost getArithmeticReductionCost(
unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
+ TTI::TargetCostKind CostKind);
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
InstructionCost getMinMaxReductionCost(
VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
-};
-
-class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
- using BaseT = BasicTTIImplBase<R600TTIImpl>;
- using TTI = TargetTransformInfo;
-
- friend BaseT;
-
- const R600Subtarget *ST;
- const AMDGPUTargetLowering *TLI;
- AMDGPUTTIImpl CommonTTI;
-
-public:
- explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
-
- const R600Subtarget *getST() const { return ST; }
- const AMDGPUTargetLowering *getTLI() const { return TLI; }
-
- void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP);
- void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::PeelingPreferences &PP);
- unsigned getHardwareNumberOfRegisters(bool Vec) const;
- unsigned getNumberOfRegisters(bool Vec) const;
- TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
- unsigned getMinVectorRegisterBitWidth() const;
- unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
- bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
- unsigned AddrSpace) const;
- bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
- unsigned AddrSpace) const;
- bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
- unsigned AddrSpace) const;
- unsigned getMaxInterleaveFactor(unsigned VF);
- InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr);
- InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
- unsigned Index);
+ TTI::TargetCostKind CostKind);
};
} // end namespace llvm
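A brief caller-side sketch of the widened hook: both TTI implementations now take an OptimizationRemarkEmitter so unrolling decisions can be reported as remarks. The object setup is assumed; in-tree callers also pre-fill the preferences with target-independent defaults before invoking the hook.

#include "AMDGPUTargetTransformInfo.h"            // assumed include path
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"

void queryUnrollPrefs(llvm::GCNTTIImpl &TTIImpl, llvm::Loop *L,
                      llvm::ScalarEvolution &SE,
                      llvm::OptimizationRemarkEmitter &ORE) {
  llvm::TargetTransformInfo::UnrollingPreferences UP;
  // The emitter rides along so the target can emit applied/missed remarks.
  TTIImpl.getUnrollingPreferences(L, SE, UP, &ORE);
}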
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 4e3d5fdc012d..c6751f98fe6a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -232,7 +232,7 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
Updates.push_back({DominatorTree::Insert, BB, DummyReturnBB});
} else { // Conditional branch.
- SmallVector<BasicBlock *, 2> Successors(succ_begin(BB), succ_end(BB));
+ SmallVector<BasicBlock *, 2> Successors(successors(BB));
// Create a new transition block to hold the conditional branch.
BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
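For reference, a standalone sketch of the range idiom adopted here: llvm::successors(BB) yields an iterator range, so a SmallVector can be built from it directly instead of passing a begin/end pair.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"

llvm::SmallVector<llvm::BasicBlock *, 2>
copySuccessors(llvm::BasicBlock *BB) {
  // Equivalent to SmallVector<...>(succ_begin(BB), succ_end(BB)), just shorter.
  return llvm::SmallVector<llvm::BasicBlock *, 2>(llvm::successors(BB));
}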
diff --git a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index 56befe4ed0d0..1a9255f3240f 100644
--- a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -6,8 +6,8 @@
//
//==-----------------------------------------------------------------------===//
-#include "AMDGPU.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/R600MCTargetDesc.h"
+#include "R600.h"
#include "R600RegisterInfo.h"
#include "R600Subtarget.h"
#include "llvm/ADT/SCCIterator.h"
@@ -127,6 +127,10 @@ public:
bool prepare();
bool runOnMachineFunction(MachineFunction &MF) override {
+ // FIXME: This pass causes verification failures.
+ MF.getProperties().set(
+ MachineFunctionProperties::Property::FailsVerification);
+
TII = MF.getSubtarget<R600Subtarget>().getInstrInfo();
TRI = &TII->getRegisterInfo();
LLVM_DEBUG(MF.dump(););
@@ -245,7 +249,7 @@ protected:
int loopendPatternMatch();
int mergeLoop(MachineLoop *LoopRep);
- /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in
+ /// return true iff src1Blk->succ_empty() && src1Blk and src2Blk are in
/// the same loop with LoopLandInfo without explicitly keeping track of
/// loopContBlks and loopBreakBlks, this is a method to get the information.
bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB,
@@ -571,12 +575,9 @@ bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) {
DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) {
//get DebugLoc from the first MachineBasicBlock instruction with debug info
DebugLoc DL;
- for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end();
- ++It) {
- MachineInstr *instr = &(*It);
- if (instr->getDebugLoc())
- DL = instr->getDebugLoc();
- }
+ for (MachineInstr &MI : *MBB)
+ if (MI.getDebugLoc())
+ DL = MI.getDebugLoc();
return DL;
}
@@ -617,7 +618,7 @@ MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
MachineInstr *MI = getReturnInstr(MBB);
- bool IsReturn = (MBB->succ_size() == 0);
+ bool IsReturn = MBB->succ_empty();
if (MI)
assert(IsReturn);
else if (IsReturn)
@@ -628,9 +629,8 @@ bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB,
MachineBasicBlock *SrcMBB) {
- for (MachineBasicBlock::succ_iterator It = SrcMBB->succ_begin(),
- iterEnd = SrcMBB->succ_end(); It != iterEnd; ++It)
- DstMBB->addSuccessor(*It); // *iter's predecessor is also taken care of
+ for (MachineBasicBlock *Succ : SrcMBB->successors())
+ DstMBB->addSuccessor(Succ); // Succ's predecessor list is updated as well
}
MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) {
@@ -808,7 +808,7 @@ bool AMDGPUCFGStructurizer::run() {
MachineBasicBlock *EntryMBB =
*GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
- if (EntryMBB->succ_size() == 0) {
+ if (EntryMBB->succ_empty()) {
Finish = true;
LLVM_DEBUG(dbgs() << "Reduce to one block\n";);
} else {
@@ -1054,7 +1054,7 @@ int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) {
- if (Src1MBB->succ_size() == 0) {
+ if (Src1MBB->succ_empty()) {
MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB);
if (LoopRep && LoopRep == MLI->getLoopFor(Src2MBB)) {
MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep];
@@ -1319,12 +1319,9 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
insertInstrBefore(I, R600::ENDIF);
// put initReg = 2 to other predecessors of landBlk
- for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(),
- PE = LandBlk->pred_end(); PI != PE; ++PI) {
- MachineBasicBlock *MBB = *PI;
+ for (MachineBasicBlock *MBB : LandBlk->predecessors())
if (MBB != TrueMBB && MBB != FalseMBB)
report_fatal_error("Extra register needed to handle CFG");
- }
}
LLVM_DEBUG(
dbgs() << "result from improveSimpleJumpintoIf: ";
@@ -1393,7 +1390,7 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
MBB->splice(I, FalseMBB, FalseMBB->begin(),
FalseMBB->end());
MBB->removeSuccessor(FalseMBB, true);
- if (LandMBB && FalseMBB->succ_size() != 0)
+ if (LandMBB && !FalseMBB->succ_empty())
FalseMBB->removeSuccessor(LandMBB, true);
retireBlock(FalseMBB);
MLI->removeBlock(FalseMBB);
@@ -1639,8 +1636,7 @@ void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
SrcBlkInfo = new BlockInformation();
SrcBlkInfo->IsRetired = true;
- assert(MBB->succ_size() == 0 && MBB->pred_size() == 0
- && "can't retire block yet");
+ assert(MBB->succ_empty() && MBB->pred_empty() && "can't retire block yet");
}
INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer",
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 00032c7d4ea5..4acd77a9d5d2 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -28,12 +28,12 @@
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/TargetParser.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -1542,7 +1542,7 @@ private:
bool validateOpSel(const MCInst &Inst);
bool validateDPP(const MCInst &Inst, const OperandVector &Operands);
bool validateVccOperand(unsigned Reg) const;
- bool validateVOP3Literal(const MCInst &Inst, const OperandVector &Operands);
+ bool validateVOPLiteral(const MCInst &Inst, const OperandVector &Operands);
bool validateMAIAccWrite(const MCInst &Inst, const OperandVector &Operands);
bool validateAGPRLdSt(const MCInst &Inst) const;
bool validateVGPRAlign(const MCInst &Inst) const;
@@ -1715,6 +1715,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
switch (OperandType) {
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
@@ -1723,6 +1724,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_IMM_V2FP32:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_KIMM32:
return &APFloat::IEEEsingle();
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:
@@ -1732,6 +1734,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
return &APFloat::IEEEdouble();
case AMDGPU::OPERAND_REG_IMM_INT16:
case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
@@ -1742,6 +1745,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_KIMM16:
return &APFloat::IEEEhalf();
default:
llvm_unreachable("unsupported fp type");
@@ -2017,12 +2021,14 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
case AMDGPU::OPERAND_REG_IMM_INT16:
case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
@@ -2036,7 +2042,9 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
case AMDGPU::OPERAND_REG_IMM_V2FP32:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
- case AMDGPU::OPERAND_REG_IMM_V2INT32: {
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_KIMM32:
+ case AMDGPU::OPERAND_KIMM16: {
bool lost;
APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
// Convert literal to single precision
@@ -2062,6 +2070,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
switch (OpTy) {
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
@@ -2101,6 +2110,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_IMM_INT16:
case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
@@ -2128,6 +2138,14 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
Inst.addOperand(MCOperand::createImm(Val));
return;
}
+ case AMDGPU::OPERAND_KIMM32:
+ Inst.addOperand(MCOperand::createImm(Literal.getLoBits(32).getZExtValue()));
+ setImmKindNone();
+ return;
+ case AMDGPU::OPERAND_KIMM16:
+ Inst.addOperand(MCOperand::createImm(Literal.getLoBits(16).getZExtValue()));
+ setImmKindNone();
+ return;
default:
llvm_unreachable("invalid operand size");
}
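A small sketch of the APInt trimming used for the new KIMM operands: only the low 32 or 16 bits of the parsed literal are emitted as a plain immediate. The helper name is made up.

#include "llvm/ADT/APInt.h"
#include <cstdint>

// Keep only the low Width bits of a parsed literal and return them as a host
// integer; getLoBits() zeroes the high bits without changing the bit width.
uint64_t literalLowBits(const llvm::APInt &Literal, unsigned Width) {
  return Literal.getLoBits(Width).getZExtValue();
}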
@@ -3250,7 +3268,8 @@ AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst,
SIInstrFlags::SDWA)) {
// Check special imm operands (used by madmk, etc)
if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) {
- ++ConstantBusUseCount;
+ ++NumLiterals;
+ LiteralSize = 4;
}
SmallDenseSet<unsigned> SGPRsUsed;
@@ -3290,7 +3309,7 @@ AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst,
// An instruction may use only one literal.
// This has been validated on the previous step.
- // See validateVOP3Literal.
+ // See validateVOPLiteral.
// This literal may be used as more than one operand.
// If all these operands are of the same size,
// this literal counts as one scalar value.
@@ -3981,26 +4000,29 @@ bool AMDGPUAsmParser::validateVccOperand(unsigned Reg) const {
(FB[AMDGPU::FeatureWavefrontSize32] && Reg == AMDGPU::VCC_LO);
}
-// VOP3 literal is only allowed in GFX10+ and only one can be used
-bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst,
- const OperandVector &Operands) {
+// Only one unique literal may be used. VOP3 literals are only allowed in GFX10+
+bool AMDGPUAsmParser::validateVOPLiteral(const MCInst &Inst,
+ const OperandVector &Operands) {
unsigned Opcode = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opcode);
- if (!(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)))
+ const int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
+ if (!(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)) &&
+ ImmIdx == -1)
return true;
const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
- const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
+ const int OpIndices[] = {Src0Idx, Src1Idx, Src2Idx, ImmIdx};
unsigned NumExprs = 0;
unsigned NumLiterals = 0;
uint32_t LiteralValue;
for (int OpIdx : OpIndices) {
- if (OpIdx == -1) break;
+ if (OpIdx == -1)
+ continue;
const MCOperand &MO = Inst.getOperand(OpIdx);
if (!MO.isImm() && !MO.isExpr())
@@ -4030,7 +4052,7 @@ bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst,
if (!NumLiterals)
return true;
- if (!getFeatureBits()[AMDGPU::FeatureVOP3Literal]) {
+ if (ImmIdx == -1 && !getFeatureBits()[AMDGPU::FeatureVOP3Literal]) {
Error(getLitLoc(Operands), "literal operands are not supported");
return false;
}
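The operand scan above switched from break to continue because the imm operand index was appended to OpIndices and may follow absent (-1) source operands; stopping at the first -1 would skip it. A minimal, abstracted sketch of that control-flow difference (the literal check is a stand-in for the real MCInst inspection):

#include <array>
#include <functional>

unsigned countLiterals(const std::array<int, 4> &OpIndices,
                       const std::function<bool(int)> &IsLiteralAt) {
  unsigned NumLiterals = 0;
  for (int OpIdx : OpIndices) {
    if (OpIdx == -1)
      continue;               // skip missing operands but keep scanning
    if (IsLiteralAt(OpIdx))
      ++NumLiterals;
  }
  return NumLiterals;
}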
@@ -4202,7 +4224,7 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
"only one literal operand is allowed");
return false;
}
- if (!validateVOP3Literal(Inst, Operands)) {
+ if (!validateVOPLiteral(Inst, Operands)) {
return false;
}
if (!validateConstantBusLimitations(Inst, Operands)) {
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 5f43aa8388ee..d3644db7cf8b 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -55,10 +55,6 @@ class MTBUFGetBaseOpcode<string Op> {
!subst("FORMAT_XYZW", "FORMAT_X", Op)));
}
-class getMTBUFElements<string Op> {
- int ret = 1;
-}
-
class MTBUF_Pseudo <string opName, dag outs, dag ins,
string asmOps, list<dag> pattern=[]> :
@@ -223,8 +219,7 @@ class MTBUF_Load_Pseudo <string opName,
}
multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
- int elems, ValueType load_vt = i32,
- SDPatternOperator ld = null_frag> {
+ int elems> {
def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>,
MTBUFAddr64Table<0, NAME>;
@@ -265,8 +260,7 @@ class MTBUF_Store_Pseudo <string opName,
}
multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
- int elems, ValueType store_vt = i32,
- SDPatternOperator st = null_frag> {
+ int elems> {
def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>,
MTBUFAddr64Table<0, NAME>;
@@ -541,7 +535,6 @@ multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPa
// opcode because it needs an N+1 register class dest register.
multiclass MUBUF_Pseudo_Loads<string opName,
ValueType load_vt = i32,
- SDPatternOperator ld = null_frag,
bit TiedDest = 0,
bit isLds = 0> {
@@ -565,11 +558,9 @@ multiclass MUBUF_Pseudo_Loads<string opName,
}
}
-multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32,
- SDPatternOperator ld_nolds = null_frag,
- SDPatternOperator ld_lds = null_frag> {
- defm NAME : MUBUF_Pseudo_Loads<opName, load_vt, ld_nolds>;
- defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, ld_lds, 0, 1>;
+multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32> {
+ defm NAME : MUBUF_Pseudo_Loads<opName, load_vt>;
+ defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, 0, 1>;
}
class MUBUF_Store_Pseudo <string opName,
@@ -742,7 +733,6 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
RegisterClass vdataClass,
ValueType vdataType,
- SDPatternOperator atomic,
bit isFP = isFloatType<vdataType>.ret> {
let FPAtomic = isFP in
def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
@@ -796,7 +786,7 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
RegisterClass vdataClass,
ValueType vdataType,
SDPatternOperator atomic> :
- MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType, atomic>,
+ MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType>,
MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType, atomic>;
@@ -924,13 +914,13 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>;
// in at least GFX8+ chips. See Bug 37653.
let SubtargetPredicate = isGFX8GFX9 in {
defm BUFFER_LOAD_DWORDX2_LDS : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx2", v2i32, null_frag, 0, 1
+ "buffer_load_dwordx2", v2i32, 0, 1
>;
defm BUFFER_LOAD_DWORDX3_LDS : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx3", v3i32, null_frag, 0, 1
+ "buffer_load_dwordx3", v3i32, 0, 1
>;
defm BUFFER_LOAD_DWORDX4_LDS : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx4", v4i32, null_frag, 0, 1
+ "buffer_load_dwordx4", v4i32, 0, 1
>;
}
@@ -1076,27 +1066,27 @@ defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics <
let SubtargetPredicate = HasD16LoadStore in {
defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads <
- "buffer_load_ubyte_d16", i32, null_frag, 1
+ "buffer_load_ubyte_d16", i32, 1
>;
defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Pseudo_Loads <
- "buffer_load_ubyte_d16_hi", i32, null_frag, 1
+ "buffer_load_ubyte_d16_hi", i32, 1
>;
defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads <
- "buffer_load_sbyte_d16", i32, null_frag, 1
+ "buffer_load_sbyte_d16", i32, 1
>;
defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Pseudo_Loads <
- "buffer_load_sbyte_d16_hi", i32, null_frag, 1
+ "buffer_load_sbyte_d16_hi", i32, 1
>;
defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads <
- "buffer_load_short_d16", i32, null_frag, 1
+ "buffer_load_short_d16", i32, 1
>;
defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Pseudo_Loads <
- "buffer_load_short_d16_hi", i32, null_frag, 1
+ "buffer_load_short_d16_hi", i32, 1
>;
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores <
@@ -1121,10 +1111,10 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
let SubtargetPredicate = HasAtomicFaddInsts in {
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN <
- "buffer_atomic_add_f32", VGPR_32, f32, atomic_load_fadd_global_noret_32
+ "buffer_atomic_add_f32", VGPR_32, f32
>;
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
- "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_noret_32
+ "buffer_atomic_pk_add_f16", VGPR_32, v2f16
>;
let OtherPredicates = [isGFX90APlus] in {
@@ -1438,6 +1428,13 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i64, "BUFFER_ATOMIC_XOR_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i64, "BUFFER_ATOMIC_INC_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i64, "BUFFER_ATOMIC_DEC_X2">;
+let SubtargetPredicate = isGFX6GFX7GFX10 in {
+ defm : BufferAtomicPatterns<SIbuffer_atomic_fmin, f32, "BUFFER_ATOMIC_FMIN">;
+ defm : BufferAtomicPatterns<SIbuffer_atomic_fmax, f32, "BUFFER_ATOMIC_FMAX">;
+ defm : BufferAtomicPatterns<SIbuffer_atomic_fmin, f64, "BUFFER_ATOMIC_FMIN_X2">;
+ defm : BufferAtomicPatterns<SIbuffer_atomic_fmax, f64, "BUFFER_ATOMIC_FMAX_X2">;
+}
+
class NoUseBufferAtomic<SDPatternOperator Op, ValueType vt> : PatFrag <
(ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5, node:$src6, node:$src7),
(vt (Op $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7)),
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index ad9528ece7d0..104b5160b985 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -428,11 +428,10 @@ defm DS_AND_B32 : DS_1A1D_NORET_mc<"ds_and_b32">;
defm DS_OR_B32 : DS_1A1D_NORET_mc<"ds_or_b32">;
defm DS_XOR_B32 : DS_1A1D_NORET_mc<"ds_xor_b32">;
-let SubtargetPredicate = HasLDSFPAtomics in {
+let SubtargetPredicate = HasLDSFPAtomicAdd in {
defm DS_ADD_F32 : DS_1A1D_NORET_mc<"ds_add_f32">;
}
-// FIXME: Are these really present pre-gfx8?
defm DS_MIN_F32 : DS_1A1D_NORET_mc<"ds_min_f32">;
defm DS_MAX_F32 : DS_1A1D_NORET_mc<"ds_max_f32">;
@@ -493,7 +492,7 @@ defm DS_MAX_F64 : DS_1A1D_NORET_mc<"ds_max_f64", VReg_64>;
defm DS_ADD_RTN_U32 : DS_1A1D_RET_mc<"ds_add_rtn_u32", VGPR_32, "ds_add_u32">;
-let SubtargetPredicate = HasLDSFPAtomics in {
+let SubtargetPredicate = HasLDSFPAtomicAdd in {
defm DS_ADD_RTN_F32 : DS_1A1D_RET_mc<"ds_add_rtn_f32", VGPR_32, "ds_add_f32">;
}
defm DS_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">;
@@ -665,7 +664,7 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
} // let SubtargetPredicate = isGFX8Plus
-let SubtargetPredicate = HasLDSFPAtomics, OtherPredicates = [HasDsSrc2Insts] in {
+let SubtargetPredicate = HasLDSFPAtomicAdd, OtherPredicates = [HasDsSrc2Insts] in {
def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
}
@@ -715,6 +714,10 @@ foreach vt = Reg32Types.types in {
defm : DSReadPat_mc <DS_READ_B32, vt, "load_local">;
}
+defm : DSReadPat_mc <DS_READ_U8, i16, "atomic_load_8_local">;
+defm : DSReadPat_mc <DS_READ_U8, i32, "atomic_load_8_local">;
+defm : DSReadPat_mc <DS_READ_U16, i16, "atomic_load_16_local">;
+defm : DSReadPat_mc <DS_READ_U16, i32, "atomic_load_16_local">;
defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">;
defm : DSReadPat_mc <DS_READ_B64, i64, "atomic_load_64_local">;
@@ -775,6 +778,10 @@ foreach vt = Reg32Types.types in {
defm : DSWritePat_mc <DS_WRITE_B32, vt, "store_local">;
}
+defm : DSAtomicWritePat_mc <DS_WRITE_B8, i16, "atomic_store_local_8">;
+defm : DSAtomicWritePat_mc <DS_WRITE_B8, i32, "atomic_store_local_8">;
+defm : DSAtomicWritePat_mc <DS_WRITE_B16, i16, "atomic_store_local_16">;
+defm : DSAtomicWritePat_mc <DS_WRITE_B16, i32, "atomic_store_local_16">;
defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local_32">;
defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_local_64">;
@@ -933,11 +940,11 @@ defm : DSAtomicRetPat_mc<DS_MIN_RTN_I32, i32, "atomic_load_min">;
defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max">;
defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin">;
defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax">;
-defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap">;
-
-let SubtargetPredicate = HasLDSFPAtomics in {
defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin">;
defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax">;
+defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap">;
+
+let SubtargetPredicate = HasLDSFPAtomicAdd in {
defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd">;
}
@@ -954,6 +961,8 @@ defm : DSAtomicRetPat_mc<DS_MIN_RTN_I64, i64, "atomic_load_min">;
defm : DSAtomicRetPat_mc<DS_MAX_RTN_I64, i64, "atomic_load_max">;
defm : DSAtomicRetPat_mc<DS_MIN_RTN_U64, i64, "atomic_load_umin">;
defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_F64, f64, "atomic_load_fmin">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_F64, f64, "atomic_load_fmax">;
defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap">;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index fe62b8590fa0..e2186d4d533e 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -25,8 +25,9 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -264,6 +265,34 @@ static DecodeStatus decodeOperand_VReg_1024(MCInst &Inst,
return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm));
}
+static DecodeStatus decodeOperand_f32kimm(MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+ return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm));
+}
+
+static DecodeStatus decodeOperand_f16kimm(MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+ return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm));
+}
+
+static DecodeStatus decodeOperand_VS_16_Deferred(MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+ return addOperand(
+ Inst, DAsm->decodeSrcOp(llvm::AMDGPUDisassembler::OPW16, Imm, true));
+}
+
+static DecodeStatus decodeOperand_VS_32_Deferred(MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+ return addOperand(
+ Inst, DAsm->decodeSrcOp(llvm::AMDGPUDisassembler::OPW32, Imm, true));
+}
+
static bool IsAGPROperand(const MCInst &Inst, int OpIdx,
const MCRegisterInfo *MRI) {
if (OpIdx < 0)
@@ -626,6 +655,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
}
+ int ImmLitIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::imm);
+ if (Res && ImmLitIdx != -1)
+ Res = convertFMAanyK(MI, ImmLitIdx);
+
// if the opcode was not recognized we'll assume a Size of 4 bytes
// (unless there are fewer bytes left)
Size = Res ? (MaxInstBytesNum - Bytes.size())
@@ -693,22 +727,21 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
int D16Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::d16);
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
+
assert(VDataIdx != -1);
- if (DMaskIdx == -1 || TFEIdx == -1) {// intersect_ray
+ if (BaseOpcode->BVH) {
+ // Add A16 operand for intersect_ray instructions
if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::a16) > -1) {
- assert(MI.getOpcode() == AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa ||
- MI.getOpcode() == AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa ||
- MI.getOpcode() == AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa ||
- MI.getOpcode() == AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa);
addOperand(MI, MCOperand::createImm(1));
}
return MCDisassembler::Success;
}
- const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
bool IsAtomic = (VDstIdx != -1);
bool IsGather4 = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::Gather4;
-
bool IsNSA = false;
unsigned AddrSize = Info->VAddrDwords;
@@ -717,8 +750,6 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dim);
int A16Idx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::a16);
- const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
- AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
const AMDGPU::MIMGDimInfo *Dim =
AMDGPU::getMIMGDimInfoByEncoding(MI.getOperand(DimIdx).getImm());
const bool IsA16 = (A16Idx != -1 && MI.getOperand(A16Idx).getImm());
@@ -813,6 +844,24 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
return MCDisassembler::Success;
}
+DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI,
+ int ImmLitIdx) const {
+ assert(HasLiteral && "Should have decoded a literal");
+ const MCInstrDesc &Desc = MCII->get(MI.getOpcode());
+ unsigned DescNumOps = Desc.getNumOperands();
+ assert(DescNumOps == MI.getNumOperands());
+ for (unsigned I = 0; I < DescNumOps; ++I) {
+ auto &Op = MI.getOperand(I);
+ auto OpType = Desc.OpInfo[I].OperandType;
+ bool IsDeferredOp = (OpType == AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED ||
+ OpType == AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED);
+ if (Op.isImm() && Op.getImm() == AMDGPU::EncValues::LITERAL_CONST &&
+ IsDeferredOp)
+ Op.setImm(Literal);
+ }
+ return MCDisassembler::Success;
+}
+
const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
return getContext().getRegisterInfo()->
getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]);
@@ -1022,6 +1071,18 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const {
return decodeDstOp(OPW512, Val);
}
+// Decode Literals for insts which always have a literal in the encoding
+MCOperand
+AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const {
+ if (HasLiteral) {
+ if (Literal != Val)
+ return errOperand(Val, "More than one unique literal is illegal");
+ }
+ HasLiteral = true;
+ Literal = Val;
+ return MCOperand::createImm(Literal);
+}
+
MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
// For now all literal constants are supposed to be unsigned integer
// ToDo: deal with signed/unsigned 64-bit integer constants
@@ -1235,7 +1296,8 @@ int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
return (TTmpMin <= Val && Val <= TTmpMax)? Val - TTmpMin : -1;
}
-MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) const {
+MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
+ bool MandatoryLiteral) const {
using namespace AMDGPU::EncValues;
assert(Val < 1024); // enum10
@@ -1264,8 +1326,13 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c
if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX)
return decodeFPImmed(Width, Val);
- if (Val == LITERAL_CONST)
- return decodeLiteralConstant();
+ if (Val == LITERAL_CONST) {
+ if (MandatoryLiteral)
+ // Keep a sentinel value for deferred setting
+ return MCOperand::createImm(LITERAL_CONST);
+ else
+ return decodeLiteralConstant();
+ }
switch (Width) {
case OPW32:
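Taken together, the decoder changes above form a two-phase scheme for instructions that carry a mandatory literal: decodeSrcOp() leaves the LITERAL_CONST value in place as a sentinel for deferred operands, decodeMandatoryLiteralConstant() records the single literal the encoding may carry, and convertFMAanyK() patches the sentinels afterwards. A minimal stand-alone sketch of that flow, in plain C++ with invented names (not the LLVM API):

  #include <cassert>
  #include <cstdint>
  #include <vector>

  constexpr uint64_t kLiteralConstSentinel = 255; // assumed marker value

  struct DecodedInst {
    std::vector<int64_t> Operands;
    bool HasLiteral = false;
    uint64_t Literal = 0;

    // Phase 1a: a deferred source whose encoded field is the literal marker
    // keeps the sentinel instead of consuming the literal itself.
    void addDeferredSrc() {
      Operands.push_back(static_cast<int64_t>(kLiteralConstSentinel));
    }

    // Phase 1b: the kimm operand records the one literal; the real decoder
    // reports an error instead of asserting when a second value shows up.
    void addMandatoryLiteral(uint64_t Val) {
      assert(!HasLiteral || Literal == Val);
      HasLiteral = true;
      Literal = Val;
      Operands.push_back(static_cast<int64_t>(Val));
    }

    // Phase 2: the convertFMAanyK-style fix-up replaces remaining sentinels
    // with the recorded literal once the whole instruction is decoded.
    void resolveDeferred() {
      assert(HasLiteral && "should have decoded a literal");
      for (int64_t &Op : Operands)
        if (Op == static_cast<int64_t>(kLiteralConstSentinel))
          Op = static_cast<int64_t>(Literal);
    }
  };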
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index dc879ec5ad88..eea6074d5281 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -87,6 +87,7 @@ public:
DecodeStatus decodeCOMPUTE_PGM_RSRC2(uint32_t FourByteBuffer,
raw_string_ostream &KdStream) const;
+ DecodeStatus convertFMAanyK(MCInst &MI, int ImmLitIdx) const;
DecodeStatus convertSDWAInst(MCInst &MI) const;
DecodeStatus convertDPP8Inst(MCInst &MI) const;
DecodeStatus convertMIMGInst(MCInst &MI) const;
@@ -150,9 +151,11 @@ public:
static MCOperand decodeIntImmed(unsigned Imm);
static MCOperand decodeFPImmed(OpWidthTy Width, unsigned Imm);
+ MCOperand decodeMandatoryLiteralConstant(unsigned Imm) const;
MCOperand decodeLiteralConstant() const;
- MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const;
+ MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val,
+ bool MandatoryLiteral = false) const;
MCOperand decodeDstOp(const OpWidthTy Width, unsigned Val) const;
MCOperand decodeSpecialReg32(unsigned Val) const;
MCOperand decodeSpecialReg64(unsigned Val) const;
diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 596c3d7baea0..12224cb3f797 100644
--- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -303,16 +303,16 @@ def : EGPat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
let SubtargetPredicate = isEGorCayman in {
-multiclass AtomicPat<Instruction inst_ret, Instruction inst_noret,
- SDPatternOperator node_ret, SDPatternOperator node_noret> {
+multiclass AtomicPat<Instruction inst_noret,
+ SDPatternOperator node_noret> {
// FIXME: Add _RTN version. We need per WI scratch location to store the old value
// EXTRACT_SUBREG here is dummy, we know the node has no uses
def : EGOrCaymanPat<(i32 (node_noret i32:$ptr, i32:$data)),
(EXTRACT_SUBREG (inst_noret
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), $data, sub0), $ptr), sub1)>;
}
-multiclass AtomicIncDecPat<Instruction inst_ret, Instruction inst_noret,
- SDPatternOperator node_ret, SDPatternOperator node_noret, int C> {
+multiclass AtomicIncDecPat<Instruction inst_noret,
+ SDPatternOperator node_noret, int C> {
// FIXME: Add _RTN version. We need per WI scratch location to store the old value
// EXTRACT_SUBREG here is dummy, we know the node has no uses
def : EGOrCaymanPat<(i32 (node_noret i32:$ptr, C)),
@@ -330,47 +330,33 @@ def : EGOrCaymanPat<(i32 (atomic_cmp_swap_global_noret i32:$ptr, i32:$cmp, i32:$
$data, sub0),
$ptr), sub1)>;
-defm AtomicSwapPat : AtomicPat <RAT_ATOMIC_XCHG_INT_RTN,
- RAT_ATOMIC_XCHG_INT_NORET,
- atomic_swap_global_ret_32,
+defm AtomicSwapPat : AtomicPat <RAT_ATOMIC_XCHG_INT_NORET,
atomic_swap_global_noret_32>;
-defm AtomicAddPat : AtomicPat <RAT_ATOMIC_ADD_RTN, RAT_ATOMIC_ADD_NORET,
- atomic_load_add_global_ret_32, atomic_load_add_global_noret_32>;
-defm AtomicSubPat : AtomicPat <RAT_ATOMIC_SUB_RTN, RAT_ATOMIC_SUB_NORET,
- atomic_load_sub_global_ret_32, atomic_load_sub_global_noret_32>;
-defm AtomicMinPat : AtomicPat <RAT_ATOMIC_MIN_INT_RTN,
- RAT_ATOMIC_MIN_INT_NORET,
- atomic_load_min_global_ret_32, atomic_load_min_global_noret_32>;
-defm AtomicUMinPat : AtomicPat <RAT_ATOMIC_MIN_UINT_RTN,
- RAT_ATOMIC_MIN_UINT_NORET,
- atomic_load_umin_global_ret_32, atomic_load_umin_global_noret_32>;
-defm AtomicMaxPat : AtomicPat <RAT_ATOMIC_MAX_INT_RTN,
- RAT_ATOMIC_MAX_INT_NORET,
- atomic_load_max_global_ret_32, atomic_load_max_global_noret_32>;
-defm AtomicUMaxPat : AtomicPat <RAT_ATOMIC_MAX_UINT_RTN,
- RAT_ATOMIC_MAX_UINT_NORET,
- atomic_load_umax_global_ret_32, atomic_load_umax_global_noret_32>;
-defm AtomicAndPat : AtomicPat <RAT_ATOMIC_AND_RTN, RAT_ATOMIC_AND_NORET,
- atomic_load_and_global_ret_32, atomic_load_and_global_noret_32>;
-defm AtomicOrPat : AtomicPat <RAT_ATOMIC_OR_RTN, RAT_ATOMIC_OR_NORET,
- atomic_load_or_global_ret_32, atomic_load_or_global_noret_32>;
-defm AtomicXorPat : AtomicPat <RAT_ATOMIC_XOR_RTN, RAT_ATOMIC_XOR_NORET,
- atomic_load_xor_global_ret_32, atomic_load_xor_global_noret_32>;
-defm AtomicIncAddPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_RTN,
- RAT_ATOMIC_INC_UINT_NORET,
- atomic_load_add_global_ret_32,
+defm AtomicAddPat : AtomicPat <RAT_ATOMIC_ADD_NORET,
+ atomic_load_add_global_noret_32>;
+defm AtomicSubPat : AtomicPat <RAT_ATOMIC_SUB_NORET,
+ atomic_load_sub_global_noret_32>;
+defm AtomicMinPat : AtomicPat <RAT_ATOMIC_MIN_INT_NORET,
+ atomic_load_min_global_noret_32>;
+defm AtomicUMinPat : AtomicPat <RAT_ATOMIC_MIN_UINT_NORET,
+ atomic_load_umin_global_noret_32>;
+defm AtomicMaxPat : AtomicPat <RAT_ATOMIC_MAX_INT_NORET,
+ atomic_load_max_global_noret_32>;
+defm AtomicUMaxPat : AtomicPat <RAT_ATOMIC_MAX_UINT_NORET,
+ atomic_load_umax_global_noret_32>;
+defm AtomicAndPat : AtomicPat <RAT_ATOMIC_AND_NORET,
+ atomic_load_and_global_noret_32>;
+defm AtomicOrPat : AtomicPat <RAT_ATOMIC_OR_NORET,
+ atomic_load_or_global_noret_32>;
+defm AtomicXorPat : AtomicPat <RAT_ATOMIC_XOR_NORET,
+ atomic_load_xor_global_noret_32>;
+defm AtomicIncAddPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_NORET,
atomic_load_add_global_noret_32, 1>;
-defm AtomicIncSubPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_RTN,
- RAT_ATOMIC_INC_UINT_NORET,
- atomic_load_sub_global_ret_32,
+defm AtomicIncSubPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_NORET,
atomic_load_sub_global_noret_32, -1>;
-defm AtomicDecAddPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_RTN,
- RAT_ATOMIC_DEC_UINT_NORET,
- atomic_load_add_global_ret_32,
+defm AtomicDecAddPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_NORET,
atomic_load_add_global_noret_32, -1>;
-defm AtomicDecSubPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_RTN,
- RAT_ATOMIC_DEC_UINT_NORET,
- atomic_load_sub_global_ret_32,
+defm AtomicDecSubPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_NORET,
atomic_load_sub_global_noret_32, 1>;
// Should be predicated on FeatureFP64
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 90f26e514f54..bb0aa648ff90 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -203,7 +203,7 @@ multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit Ha
}
class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass,
- bit HasTiedOutput = 0, bit HasSignedOffset = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
+ bit HasTiedOutput = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
(outs regClass:$vdst),
!con(!if(EnableSaddr, (ins SReg_64:$saddr), (ins)),
@@ -224,10 +224,10 @@ class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass,
}
multiclass FLAT_Global_Load_AddTid_Pseudo<string opName, RegisterClass regClass,
- bit HasTiedOutput = 0, bit HasSignedOffset = 0> {
- def "" : FLAT_Global_Load_AddTid_Pseudo<opName, regClass, HasTiedOutput, HasSignedOffset>,
+ bit HasTiedOutput = 0> {
+ def "" : FLAT_Global_Load_AddTid_Pseudo<opName, regClass, HasTiedOutput>,
GlobalSaddrTable<0, opName>;
- def _SADDR : FLAT_Global_Load_AddTid_Pseudo<opName, regClass, HasTiedOutput, HasSignedOffset, 1>,
+ def _SADDR : FLAT_Global_Load_AddTid_Pseudo<opName, regClass, HasTiedOutput, 1>,
GlobalSaddrTable<1, opName>;
}
@@ -241,7 +241,7 @@ multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
}
class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass,
- bit HasSignedOffset = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
+ bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
(outs),
!con(!if(EnableSaddr, (ins vdataClass:$vdata, SReg_64:$saddr), (ins vdataClass:$vdata)),
@@ -258,11 +258,10 @@ class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass,
let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
}
-multiclass FLAT_Global_Store_AddTid_Pseudo<string opName, RegisterClass regClass,
- bit HasSignedOffset = 0> {
- def "" : FLAT_Global_Store_AddTid_Pseudo<opName, regClass, HasSignedOffset>,
+multiclass FLAT_Global_Store_AddTid_Pseudo<string opName, RegisterClass regClass> {
+ def "" : FLAT_Global_Store_AddTid_Pseudo<opName, regClass>,
GlobalSaddrTable<0, opName>;
- def _SADDR : FLAT_Global_Store_AddTid_Pseudo<opName, regClass, HasSignedOffset, 1>,
+ def _SADDR : FLAT_Global_Store_AddTid_Pseudo<opName, regClass, 1>,
GlobalSaddrTable<1, opName>;
}
@@ -353,8 +352,6 @@ class FLAT_AtomicNoRet_Pseudo<string opName, dag outs, dag ins,
let mayStore = 1;
let has_glc = 0;
let glcValue = 0;
- let has_dlc = 0;
- let dlcValue = 0;
let has_vdst = 0;
let has_sccb = 1;
let sccbValue = 0;
@@ -368,7 +365,6 @@ class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins,
let hasPostISelHook = 1;
let has_vdst = 1;
let glcValue = 1;
- let dlcValue = 0;
let sccbValue = 0;
let IsAtomicNoRet = 0;
let IsAtomicRet = 1;
@@ -412,7 +408,6 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
string opName,
RegisterClass vdst_rc,
ValueType vt,
- SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
bit isFP = isFloatType<data_vt>.ret,
@@ -483,11 +478,10 @@ multiclass FLAT_Global_Atomic_Pseudo<
RegisterClass vdst_rc,
ValueType vt,
SDPatternOperator atomic_rtn = null_frag,
- SDPatternOperator atomic_no_rtn = null_frag,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc> {
let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
- defm "" : FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, atomic_no_rtn, data_vt, data_rc>;
+ defm "" : FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc>;
defm "" : FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, atomic_rtn, data_vt, data_rc>;
}
}
@@ -668,12 +662,11 @@ defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d
let is_flat_global = 1 in {
defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap",
- VGPR_32, i32, AMDGPUatomic_cmp_swap_global_32, null_frag,
+ VGPR_32, i32, AMDGPUatomic_cmp_swap_global_32,
v2i32, VReg_64>;
defm GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap_x2",
VReg_64, i64, AMDGPUatomic_cmp_swap_global_64,
- null_frag,
v2i64, VReg_128>;
defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_swap",
@@ -786,17 +779,17 @@ defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_shor
let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
defm GLOBAL_ATOMIC_FCMPSWAP :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, null_frag, v2f32, VReg_64>;
defm GLOBAL_ATOMIC_FMIN :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32, int_amdgcn_global_atomic_fmin>;
defm GLOBAL_ATOMIC_FMAX :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32, int_amdgcn_global_atomic_fmax>;
defm GLOBAL_ATOMIC_FCMPSWAP_X2 :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, null_frag, v2f64, VReg_128>;
defm GLOBAL_ATOMIC_FMIN_X2 :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
defm GLOBAL_ATOMIC_FMAX_X2 :
- FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>;
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
} // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1
let is_flat_global = 1 in {
@@ -1237,6 +1230,13 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", atomic_swap_global_64, i64
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", AMDGPUatomic_cmp_swap_global_64, i64, v2i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", atomic_load_xor_global_64, i64>;
+let OtherPredicates = [isGFX10Plus] in {
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", atomic_load_fmin_global_32, f32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", atomic_load_fmax_global_32, f32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", atomic_load_fmin_global_64, f64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", atomic_load_fmax_global_64, f64>;
+}
+
let OtherPredicates = [HasAtomicFaddInsts] in {
defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_ADD_F32, atomic_load_fadd_global_noret_32, f32>;
defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_PK_ADD_F16, atomic_load_fadd_v2f16_global_noret_32, v2f16>;
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 2bf365168048..a8c85ec4e5ea 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -612,8 +612,7 @@ bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
for (auto &MBB : MF) {
- for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {
- auto &MI = *I++;
+ for (MachineInstr &MI : llvm::make_early_inc_range(llvm::reverse(MBB))) {
if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
Changed = true;
++NumDPPMovsCombined;
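The new loop header relies on the early-increment idiom: the iterator is advanced before the body runs, so erasing the instruction that was just combined cannot invalidate the walk. llvm::make_early_inc_range packages exactly the copy-then-advance step of the old *I++ loop, and llvm::reverse only flips the direction. A stand-alone illustration in plain C++ (std::list stands in for the machine-instruction list):

  #include <iostream>
  #include <list>

  int main() {
    std::list<int> Insts = {1, 2, 3, 4, 5};
    for (auto It = Insts.begin(), End = Insts.end(); It != End;) {
      auto Cur = It++;         // step first; erasing *Cur cannot touch It
      if (*Cur % 2 == 0)       // "combine" (here: delete) some elements
        Insts.erase(Cur);
    }
    for (int V : Insts)
      std::cout << V << ' ';   // prints: 1 3 5
  }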
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index bc2fb1e9770c..ff5d0b0af6a4 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -349,20 +349,16 @@ void GCNHazardRecognizer::AdvanceCycle() {
return;
}
- // Do not track non-instructions which do not affect the wait states.
- // If included, these instructions can lead to buffer overflow such that
- // detectable hazards are missed.
- if (CurrCycleInstr->isMetaInstruction()) {
- CurrCycleInstr = nullptr;
- return;
- }
-
if (CurrCycleInstr->isBundle()) {
processBundle();
return;
}
unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
+ if (!NumWaitStates) {
+ CurrCycleInstr = nullptr;
+ return;
+ }
// Keep track of emitted instructions
EmittedInstrs.push_front(CurrCycleInstr);
@@ -409,7 +405,7 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
if (IsHazard(*I))
return WaitStates;
- if (I->isInlineAsm() || I->isMetaInstruction())
+ if (I->isInlineAsm())
continue;
WaitStates += SIInstrInfo::getNumWaitStates(*I);
@@ -1549,7 +1545,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
}
int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
- // On gfx90a+ releveant hazards are checked in checkMAIVALUHazards()
+ // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
return 0;
diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index f3f9eb53355f..86924667084d 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -188,7 +188,7 @@ public:
printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2));
Sch.BaseClass::schedule();
- // Unfortunatelly placeDebugValues incorrectly modifies RegionEnd, restore
+ // Unfortunately placeDebugValues incorrectly modifies RegionEnd, restore
Sch.RegionEnd = Rgn.End;
//assert(Rgn.End == Sch.RegionEnd);
Rgn.Begin = Sch.RegionBegin;
@@ -280,7 +280,7 @@ GCNIterativeScheduler::getSchedulePressure(const Region &R,
return RPTracker.moveMaxPressure();
}
-void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overriden
+void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overridden
MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End,
unsigned NumRegionInstrs) {
@@ -293,7 +293,7 @@ void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overriden
}
}
-void GCNIterativeScheduler::schedule() { // overriden
+void GCNIterativeScheduler::schedule() { // overridden
// do nothing
LLVM_DEBUG(printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS);
if (!Regions.empty() && Regions.back()->Begin == RegionBegin) {
@@ -304,7 +304,7 @@ void GCNIterativeScheduler::schedule() { // overriden
<< '\n';);
}
-void GCNIterativeScheduler::finalizeSchedule() { // overriden
+void GCNIterativeScheduler::finalizeSchedule() { // overridden
if (Regions.empty())
return;
switch (Strategy) {
@@ -391,8 +391,8 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
// and already interleaved with debug values
if (!std::is_same<decltype(*Schedule.begin()), MachineInstr*>::value) {
placeDebugValues();
- // Unfortunatelly placeDebugValues incorrectly modifies RegionEnd, restore
- //assert(R.End == RegionEnd);
+ // Unfortunately placeDebugValues incorrectly modifies RegionEnd, restore
+ // assert(R.End == RegionEnd);
RegionEnd = R.End;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
index 443472a3b99a..e82d7362a342 100644
--- a/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// This file defines and imlements the class GCNMinRegScheduler, which
+/// This file defines and implements the class GCNMinRegScheduler, which
/// implements an experimental, simple scheduler whose main goal is to learn
/// ways about consuming less possible registers for a region.
///
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index a51399d7da5f..a906a4207758 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// This pass combines split register tuple initialization into a single psuedo:
+/// This pass combines split register tuple initialization into a single pseudo:
///
/// undef %0.sub1:sreg_64 = S_MOV_B32 1
/// %0.sub0:sreg_64 = S_MOV_B32 2
@@ -40,6 +40,7 @@ namespace {
class GCNPreRAOptimizations : public MachineFunctionPass {
private:
const SIInstrInfo *TII;
+ const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
LiveIntervals *LIS;
@@ -85,32 +86,107 @@ bool GCNPreRAOptimizations::processReg(Register Reg) {
MachineInstr *Def0 = nullptr;
MachineInstr *Def1 = nullptr;
uint64_t Init = 0;
+ bool Changed = false;
+ SmallSet<Register, 32> ModifiedRegs;
+ bool IsAGPRDst = TRI->isAGPRClass(MRI->getRegClass(Reg));
for (MachineInstr &I : MRI->def_instructions(Reg)) {
- if (I.getOpcode() != AMDGPU::S_MOV_B32 || I.getOperand(0).getReg() != Reg ||
- !I.getOperand(1).isImm() || I.getNumOperands() != 2)
- return false;
-
- switch (I.getOperand(0).getSubReg()) {
+ switch (I.getOpcode()) {
default:
return false;
- case AMDGPU::sub0:
- if (Def0)
+ case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
+ break;
+ case AMDGPU::COPY: {
+ // Some subtargets cannot do an AGPR to AGPR copy directly, and need an
+ // intermediate temporary VGPR register. Try to find the defining
+ // accvgpr_write to avoid temporary registers.
+
+ if (!IsAGPRDst)
return false;
- Def0 = &I;
- Init |= I.getOperand(1).getImm() & 0xffffffff;
+
+ Register SrcReg = I.getOperand(1).getReg();
+
+ if (!SrcReg.isVirtual())
+ break;
+
+ // Check if source of copy is from another AGPR.
+ bool IsAGPRSrc = TRI->isAGPRClass(MRI->getRegClass(SrcReg));
+ if (!IsAGPRSrc)
+ break;
+
+ // def_instructions() does not look at subregs so it may give us a
+ // different instruction that defines the same vreg but different subreg
+ // so we have to manually check subreg.
+ Register SrcSubReg = I.getOperand(1).getSubReg();
+ for (auto &Def : MRI->def_instructions(SrcReg)) {
+ if (SrcSubReg != Def.getOperand(0).getSubReg())
+ continue;
+
+ if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
+ MachineOperand DefSrcMO = Def.getOperand(1);
+
+ // Immediates are not an issue and can be propagated in
+ // postrapseudos pass. Only handle cases where defining
+ // accvgpr_write source is a vreg.
+ if (DefSrcMO.isReg() && DefSrcMO.getReg().isVirtual()) {
+ // Propagate source reg of accvgpr write to this copy instruction
+ I.getOperand(1).setReg(DefSrcMO.getReg());
+ I.getOperand(1).setSubReg(DefSrcMO.getSubReg());
+
+ // Reg uses were changed, collect unique set of registers to update
+ // live intervals at the end.
+ ModifiedRegs.insert(DefSrcMO.getReg());
+ ModifiedRegs.insert(SrcReg);
+
+ Changed = true;
+ }
+
+ // Found the defining accvgpr_write, stop looking any further.
+ break;
+ }
+ }
break;
- case AMDGPU::sub1:
- if (Def1)
+ }
+ case AMDGPU::S_MOV_B32:
+ if (I.getOperand(0).getReg() != Reg || !I.getOperand(1).isImm() ||
+ I.getNumOperands() != 2)
return false;
- Def1 = &I;
- Init |= static_cast<uint64_t>(I.getOperand(1).getImm()) << 32;
+
+ switch (I.getOperand(0).getSubReg()) {
+ default:
+ return false;
+ case AMDGPU::sub0:
+ if (Def0)
+ return false;
+ Def0 = &I;
+ Init |= I.getOperand(1).getImm() & 0xffffffff;
+ break;
+ case AMDGPU::sub1:
+ if (Def1)
+ return false;
+ Def1 = &I;
+ Init |= static_cast<uint64_t>(I.getOperand(1).getImm()) << 32;
+ break;
+ }
break;
}
}
+ // For AGPR reg, check if live intervals need to be updated.
+ if (IsAGPRDst) {
+ if (Changed) {
+ for (Register RegToUpdate : ModifiedRegs) {
+ LIS->removeInterval(RegToUpdate);
+ LIS->createAndComputeVirtRegInterval(RegToUpdate);
+ }
+ }
+
+ return Changed;
+ }
+
+ // For SGPR reg, check if we can combine instructions.
if (!Def0 || !Def1 || Def0->getParent() != Def1->getParent())
- return false;
+ return Changed;
LLVM_DEBUG(dbgs() << "Combining:\n " << *Def0 << " " << *Def1
<< " =>\n");
@@ -144,7 +220,7 @@ bool GCNPreRAOptimizations::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ TRI = ST.getRegisterInfo();
bool Changed = false;
@@ -153,8 +229,10 @@ bool GCNPreRAOptimizations::runOnMachineFunction(MachineFunction &MF) {
if (!LIS->hasInterval(Reg))
continue;
const TargetRegisterClass *RC = MRI->getRegClass(Reg);
- if (RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC))
+ if ((RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC)) &&
+ (ST.hasGFX90AInsts() || !TRI->isAGPRClass(RC)))
continue;
+
Changed |= processReg(Reg);
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 0212b8e17641..75855a7a4f9c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -26,32 +26,36 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);
- const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
-
MF = &DAG->MF;
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
// FIXME: This is also necessary, because some passes that run after
// scheduling and before regalloc increase register pressure.
- const int ErrorMargin = 3;
-
- SGPRExcessLimit = Context->RegClassInfo
- ->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass) - ErrorMargin;
- VGPRExcessLimit = Context->RegClassInfo
- ->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass) - ErrorMargin;
- if (TargetOccupancy) {
- SGPRCriticalLimit = ST.getMaxNumSGPRs(TargetOccupancy, true);
- VGPRCriticalLimit = ST.getMaxNumVGPRs(TargetOccupancy);
- } else {
- SGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF,
- AMDGPU::RegisterPressureSets::SReg_32);
- VGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF,
- AMDGPU::RegisterPressureSets::VGPR_32);
- }
-
- SGPRCriticalLimit -= ErrorMargin;
- VGPRCriticalLimit -= ErrorMargin;
+ const unsigned ErrorMargin = 3;
+
+ SGPRExcessLimit =
+ Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
+ VGPRExcessLimit =
+ Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
+
+ SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
+ // Set the initial TargetOccupancy to the maximum occupancy that we can
+ // achieve for this function. This effectively sets a lower bound on the
+ // 'Critical' register limits in the scheduler.
+ TargetOccupancy = MFI.getOccupancy();
+ SGPRCriticalLimit =
+ std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
+ VGPRCriticalLimit =
+ std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
+
+ // Subtract error margin from register limits and avoid overflow.
+ SGPRCriticalLimit =
+ std::min(SGPRCriticalLimit - ErrorMargin, SGPRCriticalLimit);
+ VGPRCriticalLimit =
+ std::min(VGPRCriticalLimit - ErrorMargin, VGPRCriticalLimit);
+ SGPRExcessLimit = std::min(SGPRExcessLimit - ErrorMargin, SGPRExcessLimit);
+ VGPRExcessLimit = std::min(VGPRExcessLimit - ErrorMargin, VGPRExcessLimit);
}
void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
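The std::min(X - ErrorMargin, X) pattern above guards against unsigned underflow: when X is smaller than the margin, the subtraction wraps to a huge value and std::min falls back to X itself. A tiny stand-alone demonstration with made-up values:

  #include <algorithm>
  #include <cstdio>

  int main() {
    const unsigned ErrorMargin = 3;
    unsigned Limit = 2;                             // smaller than the margin
    unsigned Guarded = std::min(Limit - ErrorMargin, Limit);
    // 2u - 3u wraps to 4294967295u, so std::min keeps the original limit.
    std::printf("%u\n", Guarded);                   // prints 2
  }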
@@ -117,7 +121,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// Register pressure is considered 'CRITICAL' if it is approaching a value
// that would reduce the wave occupancy for the execution unit. When
- // register pressure is 'CRITICAL', increading SGPR and VGPR pressure both
+ // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
// has the same cost, so we don't need to prefer one over the other.
int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
@@ -361,14 +365,18 @@ void GCNScheduleDAGMILive::schedule() {
LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
return;
}
- unsigned Occ = MFI.getOccupancy();
- unsigned WavesAfter = std::min(Occ, PressureAfter.getOccupancy(ST));
- unsigned WavesBefore = std::min(Occ, PressureBefore.getOccupancy(ST));
+
+ unsigned WavesAfter =
+ std::min(S.TargetOccupancy, PressureAfter.getOccupancy(ST));
+ unsigned WavesBefore =
+ std::min(S.TargetOccupancy, PressureBefore.getOccupancy(ST));
LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
<< ", after " << WavesAfter << ".\n");
- // We could not keep current target occupancy because of the just scheduled
- // region. Record new occupancy for next scheduling cycle.
+ // We may not be able to keep the current target occupancy because of the just
+ // scheduled region. We might still be able to revert scheduling if the
+ // occupancy before was higher, or if the current schedule has register
+ // pressure higher than the excess limits which could lead to more spilling.
unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
// Allow memory bound functions to drop to 4 waves if not limited by an
// attribute.
@@ -378,6 +386,7 @@ void GCNScheduleDAGMILive::schedule() {
<< MFI.getMinAllowedOccupancy() << " waves\n");
NewOccupancy = WavesAfter;
}
+
if (NewOccupancy < MinOccupancy) {
MinOccupancy = NewOccupancy;
MFI.limitOccupancy(MinOccupancy);
@@ -394,6 +403,11 @@ void GCNScheduleDAGMILive::schedule() {
RegionsWithHighRP[RegionIdx] = true;
}
+ // If this condition is true, then either the occupancy before and after
+ // scheduling is the same, or we are allowing the occupancy to drop because
+ // the function is memory bound. Even if we are OK with the current occupancy,
+ // we still need to verify that we will not introduce any extra chance of
+ // spilling.
if (WavesAfter >= MinOccupancy) {
if (Stage == UnclusteredReschedule &&
!PressureAfter.less(ST, PressureBefore)) {
@@ -540,7 +554,6 @@ GCNScheduleDAGMILive::getBBLiveInMap() const {
}
void GCNScheduleDAGMILive::finalizeSchedule() {
- GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
LiveIns.resize(Regions.size());
@@ -586,8 +599,6 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
dbgs()
<< "Retrying function scheduling with lowest recorded occupancy "
<< MinOccupancy << ".\n");
-
- S.setTargetOccupancy(MinOccupancy);
}
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 15eba3f5eac0..53d6ff0aa731 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -54,7 +54,7 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
// before a region scheduling to know if the region had such clusters.
bool HasClusteredNodes;
- // schedule() have seen a an excess register pressure and had to track
+ // schedule() has seen excess register pressure and had to track
// register pressure for actual scheduling heuristics.
bool HasExcessPressure;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index bd0c40081c01..d8bc0b2df2bd 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -21,13 +21,6 @@
#include "SIInstrInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
-namespace llvm {
-
-class MCInst;
-class MCInstrInfo;
-
-} // namespace llvm
-
#define GET_SUBTARGETINFO_HEADER
#include "AMDGPUGenSubtargetInfo.inc"
@@ -104,7 +97,6 @@ protected:
bool FP64;
bool FMA;
bool MIMG_R128;
- bool IsGCN;
bool CIInsts;
bool GFX8Insts;
bool GFX9Insts;
@@ -172,13 +164,8 @@ protected:
bool HasArchitectedFlatScratch;
bool AddNoCarryInsts;
bool HasUnpackedD16VMem;
- bool R600ALUInst;
- bool CaymanISA;
- bool CFALUBug;
bool LDSMisalignedBug;
bool HasMFMAInlineLiteralBug;
- bool HasVertexCache;
- short TexVTXClauseSize;
bool UnalignedBufferAccess;
bool UnalignedDSAccess;
bool HasPackedTID;
@@ -272,7 +259,7 @@ public:
return (Generation)Gen;
}
- /// Return the number of high bits known to be zero fror a frame index.
+ /// Return the number of high bits known to be zero for a frame index.
unsigned getKnownHighZeroBitsForFrameIndex() const {
return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
}
@@ -612,7 +599,7 @@ public:
}
/// Return if most LDS instructions have an m0 use that require m0 to be
- /// iniitalized.
+ /// initialized.
bool ldsRequiresM0Init() const {
return getGeneration() < GFX9;
}
@@ -753,7 +740,7 @@ public:
}
// Scratch is allocated in 256 dword per wave blocks for the entire
- // wavefront. When viewed from the perspecive of an arbitrary workitem, this
+ // wavefront. When viewed from the perspective of an arbitrary workitem, this
// is 4-byte aligned.
//
// Only 4-byte alignment is really needed to access anything. Transformations
@@ -818,9 +805,7 @@ public:
return HasScalarAtomics;
}
- bool hasLDSFPAtomics() const {
- return GFX8Insts;
- }
+ bool hasLDSFPAtomicAdd() const { return GFX8Insts; }
/// \returns true if the subtarget has the v_permlanex16_b32 instruction.
bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
@@ -1139,6 +1124,9 @@ public:
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
const override;
+ std::unique_ptr<ScheduleDAGMutation>
+ createFillMFMAShadowMutation(const TargetInstrInfo *TII) const;
+
bool isWave32() const {
return getWavefrontSize() == 32;
}
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
new file mode 100644
index 000000000000..f3f664f7972a
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
@@ -0,0 +1,361 @@
+//===------------------ AMDGPUCustomBehaviour.cpp ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements methods from the AMDGPUCustomBehaviour class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUCustomBehaviour.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/WithColor.h"
+
+namespace llvm {
+namespace mca {
+
+void AMDGPUInstrPostProcess::postProcessInstruction(
+ std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
+ switch (MCI.getOpcode()) {
+ case AMDGPU::S_WAITCNT:
+ case AMDGPU::S_WAITCNT_EXPCNT:
+ case AMDGPU::S_WAITCNT_LGKMCNT:
+ case AMDGPU::S_WAITCNT_VMCNT:
+ case AMDGPU::S_WAITCNT_VSCNT:
+ case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
+ case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
+ case AMDGPU::S_WAITCNT_VMCNT_gfx10:
+ case AMDGPU::S_WAITCNT_VSCNT_gfx10:
+ case AMDGPU::S_WAITCNT_gfx10:
+ case AMDGPU::S_WAITCNT_gfx6_gfx7:
+ case AMDGPU::S_WAITCNT_vi:
+ return processWaitCnt(Inst, MCI);
+ }
+}
+
+// s_waitcnt instructions encode important information as immediate operands
+// which are lost during the MCInst -> mca::Instruction lowering.
+void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
+ const MCInst &MCI) {
+ for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
+ MCAOperand Op;
+ const MCOperand &MCOp = MCI.getOperand(Idx);
+ if (MCOp.isReg()) {
+ Op = MCAOperand::createReg(MCOp.getReg());
+ } else if (MCOp.isImm()) {
+ Op = MCAOperand::createImm(MCOp.getImm());
+ }
+ Op.setIndex(Idx);
+ Inst->addOperand(Op);
+ }
+}
+
+AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
+ const mca::SourceMgr &SrcMgr,
+ const MCInstrInfo &MCII)
+ : CustomBehaviour(STI, SrcMgr, MCII) {
+ generateWaitCntInfo();
+}
+
+unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
+ const InstRef &IR) {
+ const Instruction &Inst = *IR.getInstruction();
+ unsigned Opcode = Inst.getOpcode();
+
+ // llvm-mca is generally run on fully compiled assembly so we wouldn't see any
+ // pseudo instructions here. However, there are plans for the future to make
+ // it possible to use mca within backend passes. As such, I have left the
+ // pseudo version of s_waitcnt within this switch statement.
+ switch (Opcode) {
+ default:
+ return 0;
+ case AMDGPU::S_WAITCNT: // This instruction
+ case AMDGPU::S_WAITCNT_EXPCNT:
+ case AMDGPU::S_WAITCNT_LGKMCNT:
+ case AMDGPU::S_WAITCNT_VMCNT:
+ case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo.
+ case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
+ case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
+ case AMDGPU::S_WAITCNT_VMCNT_gfx10:
+ case AMDGPU::S_WAITCNT_VSCNT_gfx10:
+ case AMDGPU::S_WAITCNT_gfx10:
+ case AMDGPU::S_WAITCNT_gfx6_gfx7:
+ case AMDGPU::S_WAITCNT_vi:
+ // s_endpgm also behaves as if there is an implicit
+ // s_waitcnt 0, but I'm not sure if it would be appropriate
+ // to model this in llvm-mca based on how the iterations work
+ // while simulating the pipeline over and over.
+ return handleWaitCnt(IssuedInst, IR);
+ }
+
+ return 0;
+}
+
+unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
+ const InstRef &IR) {
+ // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
+ // I do not know how that instruction works so I did not attempt to model it.
+ // set the max values to begin
+ unsigned Vmcnt = 63;
+ unsigned Expcnt = 7;
+ unsigned Lgkmcnt = 31;
+ unsigned Vscnt = 63;
+ unsigned CurrVmcnt = 0;
+ unsigned CurrExpcnt = 0;
+ unsigned CurrLgkmcnt = 0;
+ unsigned CurrVscnt = 0;
+ unsigned CyclesToWaitVm = ~0U;
+ unsigned CyclesToWaitExp = ~0U;
+ unsigned CyclesToWaitLgkm = ~0U;
+ unsigned CyclesToWaitVs = ~0U;
+
+ computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);
+
+ // We will now look at each of the currently executing instructions
+ // to find out if this wait instruction still needs to wait.
+ for (auto I = IssuedInst.begin(), E = IssuedInst.end(); I != E; I++) {
+ const InstRef &PrevIR = *I;
+ const Instruction &PrevInst = *PrevIR.getInstruction();
+ const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
+ const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
+ const int CyclesLeft = PrevInst.getCyclesLeft();
+ assert(CyclesLeft != UNKNOWN_CYCLES &&
+ "We should know how many cycles are left for this instruction");
+ if (PrevInstWaitInfo.VmCnt) {
+ CurrVmcnt++;
+ if ((unsigned)CyclesLeft < CyclesToWaitVm)
+ CyclesToWaitVm = CyclesLeft;
+ }
+ if (PrevInstWaitInfo.ExpCnt) {
+ CurrExpcnt++;
+ if ((unsigned)CyclesLeft < CyclesToWaitExp)
+ CyclesToWaitExp = CyclesLeft;
+ }
+ if (PrevInstWaitInfo.LgkmCnt) {
+ CurrLgkmcnt++;
+ if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
+ CyclesToWaitLgkm = CyclesLeft;
+ }
+ if (PrevInstWaitInfo.VsCnt) {
+ CurrVscnt++;
+ if ((unsigned)CyclesLeft < CyclesToWaitVs)
+ CyclesToWaitVs = CyclesLeft;
+ }
+ }
+
+ unsigned CyclesToWait = ~0U;
+ if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
+ CyclesToWait = CyclesToWaitVm;
+ if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
+ CyclesToWait = CyclesToWaitExp;
+ if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
+ CyclesToWait = CyclesToWaitLgkm;
+ if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
+ CyclesToWait = CyclesToWaitVs;
+
+ // We may underestimate how many cycles we need to wait, but this
+ // isn't a big deal. Our return value is just how many cycles until
+ // this function gets run again. So as long as we don't overestimate
+ // the wait time, we'll still end up stalling at this instruction
+ // for the correct number of cycles.
+
+ if (CyclesToWait == ~0U)
+ return 0;
+ return CyclesToWait;
+}
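For concreteness, a worked example of the stall computation above, using invented cycle counts:

  // Suppose the dispatched instruction decodes to vmcnt(0) while two VMEM
  // loads are still in flight with 4 and 9 cycles left:
  //   Vmcnt (allowed)  = 0
  //   CurrVmcnt        = 2             // both loads count against vmcnt
  //   CyclesToWaitVm   = min(4, 9) = 4
  // CurrVmcnt > Vmcnt, so handleWaitCnt() returns 4; mca re-checks the hazard
  // four cycles later, when the first load has retired and the remaining load
  // drives the next stall.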
+
+void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
+ unsigned &Expcnt, unsigned &Lgkmcnt,
+ unsigned &Vscnt) {
+ AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
+ const Instruction &Inst = *IR.getInstruction();
+ unsigned Opcode = Inst.getOpcode();
+
+ switch (Opcode) {
+ case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
+ case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
+ case AMDGPU::S_WAITCNT_VMCNT_gfx10:
+ case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
+ // Should probably be checking for nullptr
+ // here, but I'm not sure how I should handle the case
+ // where we see a nullptr.
+ const MCAOperand *OpReg = Inst.getOperand(0);
+ const MCAOperand *OpImm = Inst.getOperand(1);
+ assert(OpReg && OpReg->isReg() && "First operand should be a register.");
+ assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
+ if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
+ // Instruction is using a real register.
+ // Since we can't know what value this register will have,
+ // we can't compute what the value of this wait should be.
+ WithColor::warning() << "The register component of "
+ << MCII.getName(Opcode) << " will be completely "
+ << "ignored. So the wait may not be accurate.\n";
+ }
+ switch (Opcode) {
+ // Redundant switch so I don't have to repeat the code above
+ // for each case. There are more clever ways to avoid this
+ // extra switch and anyone can feel free to implement one of them.
+ case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
+ Expcnt = OpImm->getImm();
+ break;
+ case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
+ Lgkmcnt = OpImm->getImm();
+ break;
+ case AMDGPU::S_WAITCNT_VMCNT_gfx10:
+ Vmcnt = OpImm->getImm();
+ break;
+ case AMDGPU::S_WAITCNT_VSCNT_gfx10:
+ Vscnt = OpImm->getImm();
+ break;
+ }
+ return;
+ }
+ case AMDGPU::S_WAITCNT_gfx10:
+ case AMDGPU::S_WAITCNT_gfx6_gfx7:
+ case AMDGPU::S_WAITCNT_vi:
+ unsigned WaitCnt = Inst.getOperand(0)->getImm();
+ AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
+ return;
+ }
+}
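A short note on the split-counter forms handled in the first switch case above (assembly spelling quoted from memory, so treat it as an assumption):

  // s_waitcnt_vscnt null, 0x0  -> the register operand is SGPR_NULL, so only
  //                               the immediate matters and Vscnt becomes 0.
  // s_waitcnt_vscnt s0, 0x0    -> the value in s0 is unknown to the simulator;
  //                               the code above warns and uses only the
  //                               immediate.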
+
+void AMDGPUCustomBehaviour::generateWaitCntInfo() {
+ // The core logic from this function is taken from
+ // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the instructions
+ // that are being looked at are in the MachineInstr format, whereas we have
+ // access to the MCInst format. The side effects of this are that we can't use
+ // the mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst)
+ // functions. Therefore, we conservatively assume that these functions will
+ // return true. This may cause a few instructions to be incorrectly tagged
+ // with an extra CNT. However, these are instructions that do interact with at
+ // least one CNT so giving them an extra CNT shouldn't cause issues in most
+ // scenarios.
+ AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
+ InstrWaitCntInfo.resize(SrcMgr.size());
+
+ int Index = 0;
+ for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) {
+ const std::unique_ptr<Instruction> &Inst = *I;
+ unsigned Opcode = Inst->getOpcode();
+ const MCInstrDesc &MCID = MCII.get(Opcode);
+ if ((MCID.TSFlags & SIInstrFlags::DS) &&
+ (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
+ InstrWaitCntInfo[Index].LgkmCnt = true;
+ if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
+ InstrWaitCntInfo[Index].ExpCnt = true;
+ } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
+ // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
+ // and mayAccessLDSThroughFlat(Inst) would both return true for this
+ // instruction. We have to do this because those functions use
+ // information about the memory operands that we don't have access to.
+ InstrWaitCntInfo[Index].LgkmCnt = true;
+ if (!STI.hasFeature(AMDGPU::FeatureVscnt))
+ InstrWaitCntInfo[Index].VmCnt = true;
+ else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
+ InstrWaitCntInfo[Index].VmCnt = true;
+ else
+ InstrWaitCntInfo[Index].VsCnt = true;
+ } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
+ if (!STI.hasFeature(AMDGPU::FeatureVscnt))
+ InstrWaitCntInfo[Index].VmCnt = true;
+ else if ((MCID.mayLoad() &&
+ !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
+ ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
+ !MCID.mayStore()))
+ InstrWaitCntInfo[Index].VmCnt = true;
+ else if (MCID.mayStore())
+ InstrWaitCntInfo[Index].VsCnt = true;
+
+ // (IV.Major < 7) is meant to represent
+ // GCNTarget.vmemWriteNeedsExpWaitcnt()
+ // which is defined as
+ // { return getGeneration() < SEA_ISLANDS; }
+ if (IV.Major < 7 &&
+ (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
+ InstrWaitCntInfo[Index].ExpCnt = true;
+ } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
+ InstrWaitCntInfo[Index].LgkmCnt = true;
+ } else if (MCID.TSFlags & SIInstrFlags::EXP) {
+ InstrWaitCntInfo[Index].ExpCnt = true;
+ } else {
+ switch (Opcode) {
+ case AMDGPU::S_SENDMSG:
+ case AMDGPU::S_SENDMSGHALT:
+ case AMDGPU::S_MEMTIME:
+ case AMDGPU::S_MEMREALTIME:
+ InstrWaitCntInfo[Index].LgkmCnt = true;
+ break;
+ }
+ }
+ }
+}
+
+// taken from SIInstrInfo::isVMEM()
+bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
+ return MCID.TSFlags & SIInstrFlags::MUBUF ||
+ MCID.TSFlags & SIInstrFlags::MTBUF ||
+ MCID.TSFlags & SIInstrFlags::MIMG;
+}
+
+// taken from SIInstrInfo::hasModifiersSet()
+bool AMDGPUCustomBehaviour::hasModifiersSet(
+ const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
+ int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
+ if (Idx == -1)
+ return false;
+
+ const MCAOperand *Op = Inst->getOperand(Idx);
+ if (Op == nullptr || !Op->isImm() || !Op->getImm())
+ return false;
+
+ return true;
+}
+
+// taken from SIInstrInfo::isAlwaysGDS()
+bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
+ return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_GWS_INIT ||
+ Opcode == AMDGPU::DS_GWS_SEMA_V || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
+ Opcode == AMDGPU::DS_GWS_SEMA_P ||
+ Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
+ Opcode == AMDGPU::DS_GWS_BARRIER;
+}
+
+} // namespace mca
+} // namespace llvm
+
+using namespace llvm;
+using namespace mca;
+
+static CustomBehaviour *
+createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
+ const mca::SourceMgr &SrcMgr,
+ const MCInstrInfo &MCII) {
+ return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
+}
+
+static InstrPostProcess *
+createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
+ const MCInstrInfo &MCII) {
+ return new AMDGPUInstrPostProcess(STI, MCII);
+}
+
+/// Extern function to initialize the targets for the AMDGPU backend
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
+ TargetRegistry::RegisterCustomBehaviour(getTheAMDGPUTarget(),
+ createAMDGPUCustomBehaviour);
+ TargetRegistry::RegisterInstrPostProcess(getTheAMDGPUTarget(),
+ createAMDGPUInstrPostProcess);
+
+ TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
+ createAMDGPUCustomBehaviour);
+ TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
+ createAMDGPUInstrPostProcess);
+}
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
new file mode 100644
index 000000000000..56650515bd0a
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
@@ -0,0 +1,103 @@
+//===------------------- AMDGPUCustomBehaviour.h ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the AMDGPUCustomBehaviour class which inherits from
+/// CustomBehaviour. This class is used by the tool llvm-mca to enforce
+/// target specific behaviour that is not expressed well enough in the
+/// scheduling model for mca to enforce it automatically.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCA_AMDGPUCUSTOMBEHAVIOUR_H
+#define LLVM_LIB_TARGET_AMDGPU_MCA_AMDGPUCUSTOMBEHAVIOUR_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MCA/CustomBehaviour.h"
+#include "llvm/Support/TargetParser.h"
+
+namespace llvm {
+namespace mca {
+
+class AMDGPUInstrPostProcess : public InstrPostProcess {
+ void processWaitCnt(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
+
+public:
+ AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
+ : InstrPostProcess(STI, MCII) {}
+
+ ~AMDGPUInstrPostProcess() {}
+
+ void postProcessInstruction(std::unique_ptr<Instruction> &Inst,
+ const MCInst &MCI) override;
+};
+
+struct WaitCntInfo {
+ bool VmCnt = false;
+ bool ExpCnt = false;
+ bool LgkmCnt = false;
+ bool VsCnt = false;
+};
+
+class AMDGPUCustomBehaviour : public CustomBehaviour {
+ /// Whenever MCA would like to dispatch an s_waitcnt instruction,
+ /// we must check all the instruction that are still executing to see if
+ /// they modify the same CNT as we need to wait for. This vector
+ /// gets built in the constructor and contains 1 WaitCntInfo struct
+ /// for each instruction within the SrcManager. Each element
+ /// tells us which CNTs that instruction may interact with.
+ /// We conservatively assume some instructions interact with more
+ /// CNTs than they do in reality, so we will occasionally wait
+ /// longer than necessary, but we shouldn't ever wait for shorter.
+ std::vector<WaitCntInfo> InstrWaitCntInfo;
+
+ /// This method gets called from the constructor and is
+ /// where we set up the InstrWaitCntInfo vector.
+ /// The core logic for determining which CNTs an instruction
+ /// interacts with is taken from SIInsertWaitcnts::updateEventWaitcntAfter().
+ /// Unfortunately, some of the logic from that function is not available to us
+ /// in this scope so we conservatively end up assuming that some
+ /// instructions interact with more CNTs than they do in reality.
+ void generateWaitCntInfo();
+ /// Helper function used in generateWaitCntInfo()
+ bool hasModifiersSet(const std::unique_ptr<Instruction> &Inst,
+ unsigned OpName) const;
+ /// Helper function used in generateWaitCntInfo()
+ bool isAlwaysGDS(uint16_t Opcode) const;
+ /// Helper function used in generateWaitCntInfo()
+ bool isVMEM(const MCInstrDesc &MCID);
+ /// This method gets called from checkCustomHazard when mca is attempting to
+ /// dispatch an s_waitcnt instruction (or one of its variants). The method
+ /// looks at each of the instructions that are still executing in the pipeline
+ /// to determine if the waitcnt should force a wait.
+ unsigned handleWaitCnt(ArrayRef<InstRef> IssuedInst, const InstRef &IR);
+ /// Based on the type of s_waitcnt instruction we are looking at, and what its
+ /// operands are, this method will set the values for each of the cnt
+ /// references provided as arguments.
+ void computeWaitCnt(const InstRef &IR, unsigned &Vmcnt, unsigned &Expcnt,
+ unsigned &Lgkmcnt, unsigned &Vscnt);
+
+public:
+ AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
+ const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII);
+
+ ~AMDGPUCustomBehaviour() {}
+ /// This method determines whether an instruction
+ /// should be allowed to be dispatched. The return value is
+ /// the number of cycles until the instruction can be dispatched.
+ /// It is called after MCA has already checked for
+ /// register and hardware dependencies, so it should only
+ /// implement custom behaviour and dependencies that are not picked up
+ /// by MCA naturally.
+ unsigned checkCustomHazard(ArrayRef<InstRef> IssuedInst,
+ const InstRef &IR) override;
+};
+} // namespace mca
+} // namespace llvm
+
+#endif
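A minimal sketch of the CustomBehaviour contract described by the comments above, assuming the llvm::mca interface this patch builds on; the NopCustomBehaviour class and its trivial body are illustrative only and not part of the change:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/MCA/CustomBehaviour.h"

namespace {
// checkCustomHazard returns the number of stall cycles to impose before
// dispatching IR; returning 0 means the instruction may dispatch immediately.
class NopCustomBehaviour : public llvm::mca::CustomBehaviour {
public:
  NopCustomBehaviour(const llvm::MCSubtargetInfo &STI,
                     const llvm::mca::SourceMgr &SrcMgr,
                     const llvm::MCInstrInfo &MCII)
      : CustomBehaviour(STI, SrcMgr, MCII) {}

  unsigned checkCustomHazard(llvm::ArrayRef<llvm::mca::InstRef> IssuedInst,
                             const llvm::mca::InstRef &IR) override {
    return 0; // No target-specific hazards to model.
  }
};
} // end anonymous namespace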
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index dd0db6c7b655..50318a59225d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -15,8 +15,8 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/EndianStream.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -44,7 +44,8 @@ public:
const MCSubtargetInfo &STI) const override;
unsigned getMinimumNopSize() const override;
- bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
};
@@ -169,7 +170,8 @@ unsigned AMDGPUAsmBackend::getMinimumNopSize() const {
return 4;
}
-bool AMDGPUAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+bool AMDGPUAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const {
// If the count is not 4-byte aligned, we must be writing data into the text
// section (otherwise we have unaligned instructions, and thus have far
// bigger problems), so just write zeros instead.
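A minimal sketch of the zero-pad-then-NOP scheme the comment above describes, not the actual AMDGPU implementation; the free-standing helper and the Encoded_S_NOP_0 value are assumptions for illustration, and the new MCSubtargetInfo parameter is omitted:

#include <cstdint>
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/raw_ostream.h"

// Pad the unaligned tail with zeros, then emit one 4-byte NOP encoding for
// each remaining 4-byte chunk.
static bool writeNopDataSketch(llvm::raw_ostream &OS, uint64_t Count) {
  const uint32_t Encoded_S_NOP_0 = 0xbf800000; // assumed s_nop 0 encoding
  OS.write_zeros(static_cast<unsigned>(Count % 4));
  for (uint64_t I = 0; I != Count / 4; ++I)
    llvm::support::endian::write<uint32_t>(OS, Encoded_S_NOP_0,
                                           llvm::support::endianness::little);
  return true;
}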
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
index b56f75132135..e09e2dca1b47 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
@@ -19,10 +19,9 @@ namespace llvm {
class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
-class MCSubtargetInfo;
class MCELFStreamer;
-class Triple;
class MCObjectWriter;
+class Triple;
MCELFStreamer *createAMDGPUELFStreamer(const Triple &T, MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 9ba0ffbced3d..b68b4b12e750 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -605,6 +605,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
switch (OpTy) {
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
@@ -631,6 +632,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
printImmediate16(Op.getImm(), STI, O);
break;
case AMDGPU::OPERAND_REG_IMM_V2INT16:
@@ -1451,208 +1453,3 @@ void AMDGPUInstPrinter::printEndpgm(const MCInst *MI, unsigned OpNo,
}
#include "AMDGPUGenAsmWriter.inc"
-
-void R600InstPrinter::printInst(const MCInst *MI, uint64_t Address,
- StringRef Annot, const MCSubtargetInfo &STI,
- raw_ostream &O) {
- O.flush();
- printInstruction(MI, Address, O);
- printAnnotation(O, Annot);
-}
-
-void R600InstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '|');
-}
-
-void R600InstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- int BankSwizzle = MI->getOperand(OpNo).getImm();
- switch (BankSwizzle) {
- case 1:
- O << "BS:VEC_021/SCL_122";
- break;
- case 2:
- O << "BS:VEC_120/SCL_212";
- break;
- case 3:
- O << "BS:VEC_102/SCL_221";
- break;
- case 4:
- O << "BS:VEC_201";
- break;
- case 5:
- O << "BS:VEC_210";
- break;
- default:
- break;
- }
-}
-
-void R600InstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "_SAT");
-}
-
-void R600InstPrinter::printCT(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- unsigned CT = MI->getOperand(OpNo).getImm();
- switch (CT) {
- case 0:
- O << 'U';
- break;
- case 1:
- O << 'N';
- break;
- default:
- break;
- }
-}
-
-void R600InstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- int KCacheMode = MI->getOperand(OpNo).getImm();
- if (KCacheMode > 0) {
- int KCacheBank = MI->getOperand(OpNo - 2).getImm();
- O << "CB" << KCacheBank << ':';
- int KCacheAddr = MI->getOperand(OpNo + 2).getImm();
- int LineSize = (KCacheMode == 1) ? 16 : 32;
- O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize;
- }
-}
-
-void R600InstPrinter::printLast(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "*", " ");
-}
-
-void R600InstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- const MCOperand &Op = MI->getOperand(OpNo);
- assert(Op.isImm() || Op.isExpr());
- if (Op.isImm()) {
- int64_t Imm = Op.getImm();
- O << Imm << '(' << BitsToFloat(Imm) << ')';
- }
- if (Op.isExpr()) {
- Op.getExpr()->print(O << '@', &MAI);
- }
-}
-
-void R600InstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '-');
-}
-
-void R600InstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- switch (MI->getOperand(OpNo).getImm()) {
- default: break;
- case 1:
- O << " * 2.0";
- break;
- case 2:
- O << " * 4.0";
- break;
- case 3:
- O << " / 2.0";
- break;
- }
-}
-
-void R600InstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- printOperand(MI, OpNo, O);
- O << ", ";
- printOperand(MI, OpNo + 1, O);
-}
-
-void R600InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- if (OpNo >= MI->getNumOperands()) {
- O << "/*Missing OP" << OpNo << "*/";
- return;
- }
-
- const MCOperand &Op = MI->getOperand(OpNo);
- if (Op.isReg()) {
- switch (Op.getReg()) {
- // This is the default predicate state, so we don't need to print it.
- case R600::PRED_SEL_OFF:
- break;
-
- default:
- O << getRegisterName(Op.getReg());
- break;
- }
- } else if (Op.isImm()) {
- O << Op.getImm();
- } else if (Op.isDFPImm()) {
- // We special case 0.0 because otherwise it will be printed as an integer.
- if (Op.getDFPImm() == 0.0)
- O << "0.0";
- else {
- O << bit_cast<double>(Op.getDFPImm());
- }
- } else if (Op.isExpr()) {
- const MCExpr *Exp = Op.getExpr();
- Exp->print(O, &MAI);
- } else {
- O << "/*INV_OP*/";
- }
-}
-
-void R600InstPrinter::printRel(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '+');
-}
-
-void R600InstPrinter::printRSel(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- unsigned Sel = MI->getOperand(OpNo).getImm();
- switch (Sel) {
- case 0:
- O << 'X';
- break;
- case 1:
- O << 'Y';
- break;
- case 2:
- O << 'Z';
- break;
- case 3:
- O << 'W';
- break;
- case 4:
- O << '0';
- break;
- case 5:
- O << '1';
- break;
- case 7:
- O << '_';
- break;
- default:
- break;
- }
-}
-
-void R600InstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "ExecMask,");
-}
-
-void R600InstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "Pred,");
-}
-
-void R600InstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- const MCOperand &Op = MI->getOperand(OpNo);
- if (Op.getImm() == 0) {
- O << " (MASKED)";
- }
-}
-
-#include "R600GenAsmWriter.inc"
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index 3cb4fcb28cb0..71db0beba0b6 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -240,36 +240,6 @@ protected:
raw_ostream &O);
};
-class R600InstPrinter : public MCInstPrinter {
-public:
- R600InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
- const MCRegisterInfo &MRI)
- : MCInstPrinter(MAI, MII, MRI) {}
-
- void printInst(const MCInst *MI, uint64_t Address, StringRef Annot,
- const MCSubtargetInfo &STI, raw_ostream &O) override;
- std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
- void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
- static const char *getRegisterName(unsigned RegNo);
-
- void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printUpdateExecMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-};
-
} // End namespace llvm
#endif
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index f3d945cc0764..93bec8aaadfd 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// CodeEmitter interface for R600 and SI codegen.
+/// CodeEmitter interface for SI codegen.
//
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index 1a7ca7e1a330..53c724f2211a 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// CodeEmitter interface for R600 and SI codegen.
+/// CodeEmitter interface for SI codegen.
//
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 34b2cd1fc1e4..1f917cd91b47 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -16,7 +16,8 @@
#include "AMDGPUInstPrinter.h"
#include "AMDGPUMCAsmInfo.h"
#include "AMDGPUTargetStreamer.h"
-#include "SIDefines.h"
+#include "R600InstPrinter.h"
+#include "R600MCTargetDesc.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
@@ -26,10 +27,9 @@
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCRegister.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index 71b44a509108..e5cce6045c8c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -18,6 +18,7 @@
#include <memory>
namespace llvm {
+class Target;
class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
@@ -26,20 +27,11 @@ class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
-class StringRef;
-class Target;
-class Triple;
-class raw_pwrite_stream;
enum AMDGPUDwarfFlavour : unsigned { Wave64 = 0, Wave32 = 1 };
MCRegisterInfo *createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour);
-MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
- MCContext &Ctx);
-MCInstrInfo *createR600MCInstrInfo();
-
MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
@@ -57,23 +49,12 @@ createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
#define GET_REGINFO_ENUM
#include "AMDGPUGenRegisterInfo.inc"
-#define GET_REGINFO_ENUM
-#include "R600GenRegisterInfo.inc"
-
#define GET_INSTRINFO_ENUM
#define GET_INSTRINFO_OPERAND_ENUM
#define GET_INSTRINFO_SCHED_ENUM
#include "AMDGPUGenInstrInfo.inc"
-#define GET_INSTRINFO_ENUM
-#define GET_INSTRINFO_OPERAND_ENUM
-#define GET_INSTRINFO_SCHED_ENUM
-#include "R600GenInstrInfo.inc"
-
#define GET_SUBTARGETINFO_ENUM
#include "AMDGPUGenSubtargetInfo.inc"
-#define GET_SUBTARGETINFO_ENUM
-#include "R600GenSubtargetInfo.inc"
-
#endif
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index cef34a5e5a59..a857fd00a855 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -17,13 +17,8 @@ struct amd_kernel_code_t;
namespace llvm {
-class DataLayout;
-class Function;
class MCELFStreamer;
class MCSymbol;
-class MDNode;
-class Module;
-class Type;
class formatted_raw_ostream;
namespace AMDGPU {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp
new file mode 100644
index 000000000000..f77ed1faf029
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp
@@ -0,0 +1,224 @@
+//===-- R600InstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// \file
+//===----------------------------------------------------------------------===//
+
+#include "R600InstPrinter.h"
+#include "AMDGPUInstPrinter.h"
+#include "R600MCTargetDesc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+void R600InstPrinter::printInst(const MCInst *MI, uint64_t Address,
+ StringRef Annot, const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ O.flush();
+ printInstruction(MI, Address, O);
+ printAnnotation(O, Annot);
+}
+
+void R600InstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '|');
+}
+
+void R600InstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ int BankSwizzle = MI->getOperand(OpNo).getImm();
+ switch (BankSwizzle) {
+ case 1:
+ O << "BS:VEC_021/SCL_122";
+ break;
+ case 2:
+ O << "BS:VEC_120/SCL_212";
+ break;
+ case 3:
+ O << "BS:VEC_102/SCL_221";
+ break;
+ case 4:
+ O << "BS:VEC_201";
+ break;
+ case 5:
+ O << "BS:VEC_210";
+ break;
+ default:
+ break;
+ }
+}
+
+void R600InstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "_SAT");
+}
+
+void R600InstPrinter::printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ unsigned CT = MI->getOperand(OpNo).getImm();
+ switch (CT) {
+ case 0:
+ O << 'U';
+ break;
+ case 1:
+ O << 'N';
+ break;
+ default:
+ break;
+ }
+}
+
+void R600InstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ int KCacheMode = MI->getOperand(OpNo).getImm();
+ if (KCacheMode > 0) {
+ int KCacheBank = MI->getOperand(OpNo - 2).getImm();
+ O << "CB" << KCacheBank << ':';
+ int KCacheAddr = MI->getOperand(OpNo + 2).getImm();
+ int LineSize = (KCacheMode == 1) ? 16 : 32;
+ O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize;
+ }
+}
+
+void R600InstPrinter::printLast(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "*", " ");
+}
+
+void R600InstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ assert(Op.isImm() || Op.isExpr());
+ if (Op.isImm()) {
+ int64_t Imm = Op.getImm();
+ O << Imm << '(' << BitsToFloat(Imm) << ')';
+ }
+ if (Op.isExpr()) {
+ Op.getExpr()->print(O << '@', &MAI);
+ }
+}
+
+void R600InstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '-');
+}
+
+void R600InstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ switch (MI->getOperand(OpNo).getImm()) {
+ default:
+ break;
+ case 1:
+ O << " * 2.0";
+ break;
+ case 2:
+ O << " * 4.0";
+ break;
+ case 3:
+ O << " / 2.0";
+ break;
+ }
+}
+
+void R600InstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printOperand(MI, OpNo, O);
+ O << ", ";
+ printOperand(MI, OpNo + 1, O);
+}
+
+void R600InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ if (OpNo >= MI->getNumOperands()) {
+ O << "/*Missing OP" << OpNo << "*/";
+ return;
+ }
+
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ switch (Op.getReg()) {
+ // This is the default predicate state, so we don't need to print it.
+ case R600::PRED_SEL_OFF:
+ break;
+
+ default:
+ O << getRegisterName(Op.getReg());
+ break;
+ }
+ } else if (Op.isImm()) {
+ O << Op.getImm();
+ } else if (Op.isDFPImm()) {
+ // We special case 0.0 because otherwise it will be printed as an integer.
+ if (Op.getDFPImm() == 0.0)
+ O << "0.0";
+ else {
+ O << bit_cast<double>(Op.getDFPImm());
+ }
+ } else if (Op.isExpr()) {
+ const MCExpr *Exp = Op.getExpr();
+ Exp->print(O, &MAI);
+ } else {
+ O << "/*INV_OP*/";
+ }
+}
+
+void R600InstPrinter::printRel(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '+');
+}
+
+void R600InstPrinter::printRSel(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned Sel = MI->getOperand(OpNo).getImm();
+ switch (Sel) {
+ case 0:
+ O << 'X';
+ break;
+ case 1:
+ O << 'Y';
+ break;
+ case 2:
+ O << 'Z';
+ break;
+ case 3:
+ O << 'W';
+ break;
+ case 4:
+ O << '0';
+ break;
+ case 5:
+ O << '1';
+ break;
+ case 7:
+ O << '_';
+ break;
+ default:
+ break;
+ }
+}
+
+void R600InstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "ExecMask,");
+}
+
+void R600InstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "Pred,");
+}
+
+void R600InstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.getImm() == 0) {
+ O << " (MASKED)";
+ }
+}
+
+#include "R600GenAsmWriter.inc"
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.h
new file mode 100644
index 000000000000..6c88ffd1514b
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.h
@@ -0,0 +1,48 @@
+//===-- R600InstPrinter.h - AMDGPU MC Inst -> ASM interface -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_R600INSTPRINTER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_R600INSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class R600InstPrinter : public MCInstPrinter {
+public:
+ R600InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI) {}
+
+ void printInst(const MCInst *MI, uint64_t Address, StringRef Annot,
+ const MCSubtargetInfo &STI, raw_ostream &O) override;
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
+ void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+
+ void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printUpdateExecMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index bbca8cbb742c..6fe192e95e72 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -13,7 +13,7 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600Defines.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
index a4809af29daa..269209a12175 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
@@ -11,7 +11,7 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPUMCTargetDesc.h"
+#include "R600MCTargetDesc.h"
#include "llvm/MC/MCInstrInfo.h"
using namespace llvm;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h
new file mode 100644
index 000000000000..fc52cb33824f
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h
@@ -0,0 +1,44 @@
+//===-- R600MCTargetDesc.h - R600 Target Descriptions -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Provides R600 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_R600MCTARGETDESC_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_R600MCTARGETDESC_H
+
+#include <cstdint>
+
+namespace llvm {
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCRegisterInfo;
+
+MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx);
+MCInstrInfo *createR600MCInstrInfo();
+
+} // namespace llvm
+
+#define GET_REGINFO_ENUM
+#include "R600GenRegisterInfo.inc"
+
+#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_OPERAND_ENUM
+#define GET_INSTRINFO_SCHED_ENUM
+#include "R600GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "R600GenSubtargetInfo.inc"
+
+#endif
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index dbce4b2e872c..77f219aaa3ab 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -233,6 +233,7 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
switch (OpInfo.OperandType) {
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
@@ -255,6 +256,7 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI);
case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
// FIXME Is this correct? What do inline immediates do on SI for f16 src
@@ -277,6 +279,9 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
uint32_t Encoding = getLit16Encoding(Lo16, STI);
return Encoding;
}
+ case AMDGPU::OPERAND_KIMM32:
+ case AMDGPU::OPERAND_KIMM16:
+ return MO.getImm();
default:
llvm_unreachable("invalid operand size");
}
@@ -341,7 +346,13 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
(bytes > 4 && !STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]))
return;
- // Check for additional literals in SRC0/1/2 (Op 1/2/3)
+ // Do not emit literals from SISrc operands for instructions with mandatory literals.
+ int ImmLitIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::imm);
+ if (ImmLitIdx != -1)
+ return;
+
+ // Check for additional literals
for (unsigned i = 0, e = Desc.getNumOperands(); i < e; ++i) {
// Check if this operand should be encoded as [SV]Src
@@ -536,8 +547,7 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
if (AMDGPU::isSISrcOperand(Desc, OpNo)) {
uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI);
- if (Enc != ~0U &&
- (Enc != 255 || Desc.getSize() == 4 || Desc.getSize() == 8))
+ if (Enc != ~0U)
return Enc;
} else if (MO.isImm())
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index bacb790aac62..6dd886367302 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -43,6 +43,7 @@ class MIMGBaseOpcode : PredicateControl {
bit HasD16 = 0;
bit IsAtomicRet = 0;
bit MSAA = 0;
+ bit BVH = 0;
}
def MIMGBaseOpcode : GenericEnum {
@@ -54,7 +55,7 @@ def MIMGBaseOpcodesTable : GenericTable {
let CppTypeName = "MIMGBaseOpcodeInfo";
let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
"Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates",
- "LodOrClampOrMip", "HasD16", "MSAA"];
+ "LodOrClampOrMip", "HasD16", "MSAA", "BVH"];
string TypeOf_BaseOpcode = "MIMGBaseOpcode";
let PrimaryKey = ["BaseOpcode"];
@@ -872,6 +873,14 @@ multiclass MIMG_Gather <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
multiclass MIMG_Gather_WQM <mimgopc op, AMDGPUSampleVariant sample>
: MIMG_Gather<op, sample, 1>;
+class MIMG_IntersectRay_Helper<bit Is64, bit A16> {
+ int num_addrs = !if(Is64, !if(A16, 9, 12), !if(A16, 8, 11));
+ // TODO: MIMGAddrSize will choose VReg_512, which is a 16-register tuple,
+ // when we only need 9, 11 or 12 depending on the A16 field and ptr size.
+ RegisterClass RegClass = MIMGAddrSize<num_addrs, 0>.RegClass;
+ int VAddrDwords = !srl(RegClass.Size, 5);
+}
+
class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC, bit A16>
: MIMG_gfx10<op.BASE, (outs VReg_128:$vdata), "AMDGPU"> {
@@ -890,8 +899,11 @@ class MIMG_IntersectRay_nsa_gfx10<mimgopc op, string opcode, int num_addrs, bit
let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc"#!if(A16, "$a16", "");
}
-multiclass MIMG_IntersectRay<mimgopc op, string opcode, int num_addrs, bit A16> {
- def "" : MIMGBaseOpcode;
+multiclass MIMG_IntersectRay<mimgopc op, string opcode, bit Is64, bit A16> {
+ defvar info = MIMG_IntersectRay_Helper<Is64, A16>;
+ def "" : MIMGBaseOpcode {
+ let BVH = 1;
+ }
let SubtargetPredicate = HasGFX10_AEncoding,
AssemblerPredicate = HasGFX10_AEncoding,
AsmMatchConverter = !if(A16, "cvtIntersectRay", ""),
@@ -908,13 +920,11 @@ multiclass MIMG_IntersectRay<mimgopc op, string opcode, int num_addrs, bit A16>
d16 = 0,
BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
VDataDwords = 4 in {
- // TODO: MIMGAddrSize will choose VReg_512 which is a 16 register tuple,
- // when we only need 9, 11 or 12 depending on A16 field and ptr size.
- def "_sa" : MIMG_IntersectRay_gfx10<op, opcode, MIMGAddrSize<num_addrs, 0>.RegClass, A16> {
- let VAddrDwords = !srl(MIMGAddrSize<num_addrs, 0>.RegClass.Size, 5);
+ def _sa_gfx10 : MIMG_IntersectRay_gfx10<op, opcode, info.RegClass, A16> {
+ let VAddrDwords = info.VAddrDwords;
}
- def _nsa : MIMG_IntersectRay_nsa_gfx10<op, opcode, num_addrs, A16> {
- let VAddrDwords = num_addrs;
+ def _nsa_gfx10 : MIMG_IntersectRay_nsa_gfx10<op, opcode, info.num_addrs, A16> {
+ let VAddrDwords = info.num_addrs;
}
}
}
@@ -949,7 +959,7 @@ defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimgopc<0x19>, "image_atomic_or">
defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimgopc<0x1a>, "image_atomic_xor">;
defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimgopc<0x1b>, "image_atomic_inc">;
defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimgopc<0x1c>, "image_atomic_dec">;
-defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic <mimgopc<0x1d, MIMG.NOP>, "image_atomic_fcmpswap", 0, 1>;
+defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic <mimgopc<0x1d, MIMG.NOP>, "image_atomic_fcmpswap", 1, 1>;
defm IMAGE_ATOMIC_FMIN : MIMG_Atomic <mimgopc<0x1e, MIMG.NOP>, "image_atomic_fmin", 0, 1>;
defm IMAGE_ATOMIC_FMAX : MIMG_Atomic <mimgopc<0x1f, MIMG.NOP>, "image_atomic_fmax", 0, 1>;
@@ -1045,10 +1055,10 @@ defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<0xef>, AMDGPUSample_c_cd
let SubtargetPredicate = HasGFX10_AEncoding in
defm IMAGE_MSAA_LOAD_X : MIMG_NoSampler <mimgopc<0x80>, "image_msaa_load", 1, 0, 0, 1>;
-defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0xe6>, "image_bvh_intersect_ray", 11, 0>;
-defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0xe6>, "image_bvh_intersect_ray", 8, 1>;
-defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0xe7>, "image_bvh64_intersect_ray", 12, 0>;
-defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0xe7>, "image_bvh64_intersect_ray", 9, 1>;
+defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0xe6>, "image_bvh_intersect_ray", 0, 0>;
+defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0xe6>, "image_bvh_intersect_ray", 0, 1>;
+defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0xe7>, "image_bvh64_intersect_ray", 1, 0>;
+defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0xe7>, "image_bvh64_intersect_ray", 1, 1>;
/********** ========================================= **********/
/********** Table of dimension-aware image intrinsics **********/
@@ -1098,7 +1108,7 @@ def ImageDimIntrinsicTable : GenericTable {
let PrimaryKeyEarlyOut = 1;
}
-def getImageDimInstrinsicByBaseOpcode : SearchIndex {
+def getImageDimIntrinsicByBaseOpcode : SearchIndex {
let Table = ImageDimIntrinsicTable;
let Key = ["BaseOpcode", "Dim"];
}
diff --git a/llvm/lib/Target/AMDGPU/R600.h b/llvm/lib/Target/AMDGPU/R600.h
new file mode 100644
index 000000000000..2b483ae63da9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/R600.h
@@ -0,0 +1,50 @@
+//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600_H
+#define LLVM_LIB_TARGET_AMDGPU_R600_H
+
+#include "llvm/Support/CodeGen.h"
+
+namespace llvm {
+
+class FunctionPass;
+class TargetMachine;
+class ModulePass;
+class PassRegistry;
+
+// R600 Passes
+FunctionPass *createR600VectorRegMerger();
+FunctionPass *createR600ExpandSpecialInstrsPass();
+FunctionPass *createR600EmitClauseMarkers();
+FunctionPass *createR600ClauseMergePass();
+FunctionPass *createR600Packetizer();
+FunctionPass *createR600ControlFlowFinalizer();
+FunctionPass *createAMDGPUCFGStructurizerPass();
+FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel);
+ModulePass *createR600OpenCLImageTypeLoweringPass();
+
+void initializeR600ClauseMergePassPass(PassRegistry &);
+extern char &R600ClauseMergePassID;
+
+void initializeR600ControlFlowFinalizerPass(PassRegistry &);
+extern char &R600ControlFlowFinalizerID;
+
+void initializeR600ExpandSpecialInstrsPassPass(PassRegistry &);
+extern char &R600ExpandSpecialInstrsPassID;
+
+void initializeR600VectorRegMergerPass(PassRegistry &);
+extern char &R600VectorRegMergerID;
+
+void initializeR600PacketizerPass(PassRegistry &);
+extern char &R600PacketizerID;
+
+} // End namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/AMDGPU/R600.td b/llvm/lib/Target/AMDGPU/R600.td
index 1d11da969474..45bc955d4f4c 100644
--- a/llvm/lib/Target/AMDGPU/R600.td
+++ b/llvm/lib/Target/AMDGPU/R600.td
@@ -34,6 +34,7 @@ def ALU_NULL : FuncUnit;
include "AMDGPUFeatures.td"
include "R600Schedule.td"
include "R600Processors.td"
+include "R600InstrInfo.td"
include "AMDGPUInstrInfo.td"
include "AMDGPUInstructions.td"
include "R600Instructions.td"
diff --git a/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp b/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp
index a96fc7ef234e..c19e3c41485e 100644
--- a/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp
@@ -15,7 +15,7 @@
//===----------------------------------------------------------------------===//
#include "R600AsmPrinter.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
#include "R600Subtarget.h"
@@ -129,4 +129,3 @@ bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
return false;
}
-
diff --git a/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
index a19d00b62502..1d93165f9eec 100644
--- a/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -12,8 +12,8 @@
/// It needs to be called after IfCvt for best results.
//===----------------------------------------------------------------------===//
-#include "AMDGPU.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/R600MCTargetDesc.h"
+#include "R600.h"
#include "R600Subtarget.h"
using namespace llvm;
diff --git a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index ca1e61393e9a..29c37c706138 100644
--- a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPU.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/R600MCTargetDesc.h"
+#include "R600.h"
#include "R600MachineFunctionInfo.h"
#include "R600Subtarget.h"
#include <set>
diff --git a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
index 664e134889e9..d5eaa33ef964 100644
--- a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
+++ b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -13,8 +13,8 @@
/// initiated by CF_ALU instructions.
//===----------------------------------------------------------------------===//
-#include "AMDGPU.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/R600MCTargetDesc.h"
+#include "R600.h"
#include "R600Defines.h"
#include "R600Subtarget.h"
diff --git a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index 81dc91ab922f..838a497b4df1 100644
--- a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -13,8 +13,8 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPU.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/R600MCTargetDesc.h"
+#include "R600.h"
#include "R600Defines.h"
#include "R600Subtarget.h"
diff --git a/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp
new file mode 100644
index 000000000000..9f842e91c0f3
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp
@@ -0,0 +1,184 @@
+//===-- R600ISelDAGToDAG.cpp - A dag to dag inst selector for R600 --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// Defines an instruction selector for the R600 subtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUISelDAGToDAG.h"
+#include "MCTargetDesc/R600MCTargetDesc.h"
+#include "R600.h"
+#include "R600Subtarget.h"
+#include "llvm/Analysis/ValueTracking.h"
+
+class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
+ const R600Subtarget *Subtarget;
+
+ bool isConstantLoad(const MemSDNode *N, int cbID) const;
+ bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue &IntPtr);
+ bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
+ SDValue &Offset);
+
+public:
+ explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel)
+ : AMDGPUDAGToDAGISel(TM, OptLevel) {}
+
+ void Select(SDNode *N) override;
+
+ bool SelectADDRIndirect(SDValue Addr, SDValue &Base,
+ SDValue &Offset) override;
+ bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
+ SDValue &Offset) override;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void PreprocessISelDAG() override {}
+
+protected:
+ // Include the pieces autogenerated from the target description.
+#include "R600GenDAGISel.inc"
+};
+
+bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &MF.getSubtarget<R600Subtarget>();
+ return SelectionDAGISel::runOnMachineFunction(MF);
+}
+
+bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
+ if (!N->readMem())
+ return false;
+ if (CbId == -1)
+ return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+ N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
+
+ return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
+}
+
+bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
+ SDValue &IntPtr) {
+ if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
+ IntPtr =
+ CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr), true);
+ return true;
+ }
+ return false;
+}
+
+bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
+ SDValue &BaseReg,
+ SDValue &Offset) {
+ if (!isa<ConstantSDNode>(Addr)) {
+ BaseReg = Addr;
+ Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
+ return true;
+ }
+ return false;
+}
+
+void R600DAGToDAGISel::Select(SDNode *N) {
+ unsigned int Opc = N->getOpcode();
+ if (N->isMachineOpcode()) {
+ N->setNodeId(-1);
+ return; // Already selected.
+ }
+
+ switch (Opc) {
+ default:
+ break;
+ case AMDGPUISD::BUILD_VERTICAL_VECTOR:
+ case ISD::SCALAR_TO_VECTOR:
+ case ISD::BUILD_VECTOR: {
+ EVT VT = N->getValueType(0);
+ unsigned NumVectorElts = VT.getVectorNumElements();
+ unsigned RegClassID;
+ // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
+ // that adds a 128 bits reg copy when going through TwoAddressInstructions
+ // pass. We want to avoid 128 bits copies as much as possible because they
+ // can't be bundled by our scheduler.
+ switch (NumVectorElts) {
+ case 2:
+ RegClassID = R600::R600_Reg64RegClassID;
+ break;
+ case 4:
+ if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
+ RegClassID = R600::R600_Reg128VerticalRegClassID;
+ else
+ RegClassID = R600::R600_Reg128RegClassID;
+ break;
+ default:
+ llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+ }
+ SelectBuildVector(N, RegClassID);
+ return;
+ }
+ }
+
+ SelectCode(N);
+}
+
+bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
+ ConstantSDNode *C;
+ SDLoc DL(Addr);
+
+ if ((C = dyn_cast<ConstantSDNode>(Addr))) {
+ Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
+ Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+ } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
+ (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
+ Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
+ Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+ } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
+ (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
+ Base = Addr.getOperand(0);
+ Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+ } else {
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ }
+
+ return true;
+}
+
+bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
+ ConstantSDNode *IMMOffset;
+
+ if (Addr.getOpcode() == ISD::ADD &&
+ (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) &&
+ isInt<16>(IMMOffset->getZExtValue())) {
+
+ Base = Addr.getOperand(0);
+ Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
+ MVT::i32);
+ return true;
+ // If the pointer address is constant, we can move it to the offset field.
+ } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr)) &&
+ isInt<16>(IMMOffset->getZExtValue())) {
+ Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+ SDLoc(CurDAG->getEntryNode()), R600::ZERO,
+ MVT::i32);
+ Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
+ MVT::i32);
+ return true;
+ }
+
+ // Default case, no offset
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+ return true;
+}
+
+/// This pass converts a legalized DAG into an R600-specific
+/// DAG, ready for instruction scheduling.
+FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
+ CodeGenOpt::Level OptLevel) {
+ return new R600DAGToDAGISel(TM, OptLevel);
+}
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 002ef1801448..0215eb9f9fea 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -13,11 +13,12 @@
#include "R600ISelLowering.h"
#include "AMDGPU.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600Subtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
@@ -335,7 +336,9 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
*BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_LITERAL_X);
int Idx = TII->getOperandIdx(*MIB, R600::OpName::literal);
//TODO: Ugh this is rather ugly
- MIB->getOperand(Idx) = MI.getOperand(1);
+ const MachineOperand &MO = MI.getOperand(1);
+ MIB->getOperand(Idx).ChangeToGA(MO.getGlobal(), MO.getOffset(),
+ MO.getTargetFlags());
break;
}
@@ -827,7 +830,7 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
bool R600TargetLowering::isZero(SDValue Op) const {
if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
- return Cst->isNullValue();
+ return Cst->isZero();
} else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
return CstFP->isZero();
} else {
@@ -923,7 +926,7 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
std::swap(LHS, RHS);
CC = DAG.getCondCode(CCSwapped);
} else {
- // Try inverting the conditon and then swapping the operands
+ // Try inverting the condition and then swapping the operands
ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT);
CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
@@ -1564,7 +1567,7 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
}
bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
- const SelectionDAG &DAG) const {
+ const MachineFunction &MF) const {
// Local and Private addresses do not handle vectors. Limit to i32
if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS)) {
return (MemVT.getSizeInBits() <= 32);
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
index 920cf3cd97ef..f9a9a6127322 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -15,6 +15,7 @@
#define LLVM_LIB_TARGET_AMDGPU_R600ISELLOWERING_H
#include "AMDGPUISelLowering.h"
+#include "llvm/CodeGen/MachineFunction.h"
namespace llvm {
@@ -47,7 +48,7 @@ public:
EVT VT) const override;
bool canMergeStoresTo(unsigned AS, EVT MemVT,
- const SelectionDAG &DAG) const override;
+ const MachineFunction &MF) const override;
bool allowsMisalignedMemoryAccesses(
EVT VT, unsigned AS, Align Alignment,
diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
index 7a623f3e304e..a7ebf72315cb 100644
--- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -13,7 +13,8 @@
#include "R600InstrInfo.h"
#include "AMDGPU.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/R600MCTargetDesc.h"
+#include "R600.h"
#include "R600Defines.h"
#include "R600Subtarget.h"
#include "llvm/ADT/SmallSet.h"
diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/llvm/lib/Target/AMDGPU/R600InstrInfo.h
index 1e249c6348f1..fc567f1a1fca 100644
--- a/llvm/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.h
@@ -175,7 +175,7 @@ public:
int *BytesAdded = nullptr) const override;
unsigned removeBranch(MachineBasicBlock &MBB,
- int *BytesRemvoed = nullptr) const override;
+ int *BytesRemoved = nullptr) const override;
bool isPredicated(const MachineInstr &MI) const override;
@@ -211,7 +211,7 @@ public:
bool expandPostRAPseudo(MachineInstr &MI) const override;
- /// Reserve the registers that may be accesed using indirect addressing.
+ /// Reserve the registers that may be accessed using indirect addressing.
void reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF,
const R600RegisterInfo &TRI) const;
@@ -220,7 +220,7 @@ public:
/// \p Channel
///
/// We model indirect addressing using a virtual address space that can be
- /// accesed with loads and stores. The "Indirect Address" is the memory
+ /// accessed with loads and stores. The "Indirect Address" is the memory
/// address in this virtual address space that maps to the given \p RegIndex
/// and \p Channel.
unsigned calculateIndirectAddress(unsigned RegIndex, unsigned Channel) const;
diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.td b/llvm/lib/Target/AMDGPU/R600InstrInfo.td
new file mode 100644
index 000000000000..92320748c497
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.td
@@ -0,0 +1,23 @@
+//===-- R600InstrInfo.td - R600 DAG nodes ------------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains DAG node definitions for the R600 target.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// R600 DAG Nodes
+//
+
+// Force dependencies for vector trunc stores
+def R600dummy_chain : SDNode<"AMDGPUISD::DUMMY_CHAIN", SDTNone, [SDNPHasChain]>;
+
+def R600ExportOp : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>;
+
+def R600_EXPORT: SDNode<"AMDGPUISD::R600_EXPORT", R600ExportOp,
+ [SDNPHasChain, SDNPSideEffect]>;
diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td
index 055e2de59ea1..4487864888b6 100644
--- a/llvm/lib/Target/AMDGPU/R600Instructions.td
+++ b/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -74,8 +74,6 @@ def FRAMEri : Operand<iPTR> {
let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index);
}
-def ADDRParam : ComplexPattern<i32, 2, "SelectADDRParam", [], []>;
-def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>;
def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>;
def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>;
def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>;
@@ -212,16 +210,6 @@ class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
let Inst{63-32} = Word1;
}
-class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
- InstrItinClass itin = VecALU> :
- InstR600 <(outs R600_Reg32:$dst),
- ins,
- asm,
- pattern,
- itin>;
-
-
-
} // End mayLoad = 1, mayStore = 0, hasSideEffects = 0
class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask,
@@ -815,7 +803,7 @@ def DUMMY_CHAIN : R600WrapperInst <
let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in {
-class MOV_IMM <ValueType vt, Operand immType> : R600WrapperInst <
+class MOV_IMM <Operand immType> : R600WrapperInst <
(outs R600_Reg32:$dst),
(ins immType:$imm),
"",
@@ -826,20 +814,20 @@ class MOV_IMM <ValueType vt, Operand immType> : R600WrapperInst <
} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1
-def MOV_IMM_I32 : MOV_IMM<i32, i32imm>;
+def MOV_IMM_I32 : MOV_IMM<i32imm>;
def : R600Pat <
(imm:$val),
(MOV_IMM_I32 imm:$val)
>;
-def MOV_IMM_GLOBAL_ADDR : MOV_IMM<iPTR, i32imm>;
+def MOV_IMM_GLOBAL_ADDR : MOV_IMM<i32imm>;
def : R600Pat <
(AMDGPUconstdata_ptr tglobaladdr:$addr),
(MOV_IMM_GLOBAL_ADDR tglobaladdr:$addr)
>;
-def MOV_IMM_F32 : MOV_IMM<f32, f32imm>;
+def MOV_IMM_F32 : MOV_IMM<f32imm>;
def : R600Pat <
(fpimm:$val),
(MOV_IMM_F32 fpimm:$val)
@@ -1358,7 +1346,7 @@ let Predicates = [isR600] in {
//===----------------------------------------------------------------------===//
-// Regist loads and stores - for indirect addressing
+// Register loads and stores - for indirect addressing
//===----------------------------------------------------------------------===//
let Namespace = "R600" in {
diff --git a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp
new file mode 100644
index 000000000000..8f7807a2b472
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp
@@ -0,0 +1,73 @@
+//===- R600MCInstLower.cpp - Lower R600 MachineInstr to an MCInst ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Code to lower R600 MachineInstrs to their corresponding MCInst.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPUMCInstLower.h"
+#include "R600AsmPrinter.h"
+#include "R600Subtarget.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+
+class R600MCInstLower : public AMDGPUMCInstLower {
+public:
+ R600MCInstLower(MCContext &ctx, const R600Subtarget &ST,
+ const AsmPrinter &AP);
+
+ /// Lower a MachineInstr to an MCInst
+ void lower(const MachineInstr *MI, MCInst &OutMI) const;
+};
+
+R600MCInstLower::R600MCInstLower(MCContext &Ctx, const R600Subtarget &ST,
+ const AsmPrinter &AP)
+ : AMDGPUMCInstLower(Ctx, ST, AP) {}
+
+void R600MCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+ OutMI.setOpcode(MI->getOpcode());
+ for (const MachineOperand &MO : MI->explicit_operands()) {
+ MCOperand MCOp;
+ lowerOperand(MO, MCOp);
+ OutMI.addOperand(MCOp);
+ }
+}
+
+void R600AsmPrinter::emitInstruction(const MachineInstr *MI) {
+ const R600Subtarget &STI = MF->getSubtarget<R600Subtarget>();
+ R600MCInstLower MCInstLowering(OutContext, STI, *this);
+
+ StringRef Err;
+ if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) {
+ LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext();
+ C.emitError("Illegal instruction detected: " + Err);
+ MI->print(errs());
+ }
+
+ if (MI->isBundle()) {
+ const MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock::const_instr_iterator I = ++MI->getIterator();
+ while (I != MBB->instr_end() && I->isInsideBundle()) {
+ emitInstruction(&*I);
+ ++I;
+ }
+ } else {
+ MCInst TmpInst;
+ MCInstLowering.lower(MI, TmpInst);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ }
+}
+
+const MCExpr *R600AsmPrinter::lowerConstant(const Constant *CV) {
+ if (const MCExpr *E = lowerAddrSpaceCast(TM, CV, OutContext))
+ return E;
+ return AsmPrinter::lowerConstant(CV);
+}
diff --git a/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp b/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
index f85a68706287..36acfafa72aa 100644
--- a/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -12,7 +12,7 @@
//===----------------------------------------------------------------------===//
#include "R600MachineScheduler.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600Subtarget.h"
using namespace llvm;
@@ -29,7 +29,7 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
MRI = &DAG->MRI;
CurInstKind = IDOther;
CurEmitted = 0;
- OccupedSlotsMask = 31;
+ OccupiedSlotsMask = 31;
InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
InstKindLimit[IDOther] = 32;
InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
@@ -138,7 +138,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (NextInstKind != CurInstKind) {
LLVM_DEBUG(dbgs() << "Instruction Type Switch\n");
if (NextInstKind != IDAlu)
- OccupedSlotsMask |= 31;
+ OccupiedSlotsMask |= 31;
CurEmitted = 0;
CurInstKind = NextInstKind;
}
@@ -339,10 +339,10 @@ void R600SchedStrategy::LoadAlu() {
void R600SchedStrategy::PrepareNextSlot() {
LLVM_DEBUG(dbgs() << "New Slot\n");
- assert (OccupedSlotsMask && "Slot wasn't filled");
- OccupedSlotsMask = 0;
-// if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
-// OccupedSlotsMask |= 16;
+ assert(OccupiedSlotsMask && "Slot wasn't filled");
+ OccupiedSlotsMask = 0;
+ // if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
+ // OccupiedSlotsMask |= 16;
InstructionsGroupCandidate.clear();
LoadAlu();
}
@@ -400,41 +400,41 @@ unsigned R600SchedStrategy::AvailablesAluCount() const {
SUnit* R600SchedStrategy::pickAlu() {
while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
- if (!OccupedSlotsMask) {
+ if (!OccupiedSlotsMask) {
// Bottom up scheduling : predX must comes first
if (!AvailableAlus[AluPredX].empty()) {
- OccupedSlotsMask |= 31;
+ OccupiedSlotsMask |= 31;
return PopInst(AvailableAlus[AluPredX], false);
}
// Flush physical reg copies (RA will discard them)
if (!AvailableAlus[AluDiscarded].empty()) {
- OccupedSlotsMask |= 31;
+ OccupiedSlotsMask |= 31;
return PopInst(AvailableAlus[AluDiscarded], false);
}
// If there is a T_XYZW alu available, use it
if (!AvailableAlus[AluT_XYZW].empty()) {
- OccupedSlotsMask |= 15;
+ OccupiedSlotsMask |= 15;
return PopInst(AvailableAlus[AluT_XYZW], false);
}
}
- bool TransSlotOccuped = OccupedSlotsMask & 16;
- if (!TransSlotOccuped && VLIW5) {
+ bool TransSlotOccupied = OccupiedSlotsMask & 16;
+ if (!TransSlotOccupied && VLIW5) {
if (!AvailableAlus[AluTrans].empty()) {
- OccupedSlotsMask |= 16;
+ OccupiedSlotsMask |= 16;
return PopInst(AvailableAlus[AluTrans], false);
}
SUnit *SU = AttemptFillSlot(3, true);
if (SU) {
- OccupedSlotsMask |= 16;
+ OccupiedSlotsMask |= 16;
return SU;
}
}
for (int Chan = 3; Chan > -1; --Chan) {
- bool isOccupied = OccupedSlotsMask & (1 << Chan);
+ bool isOccupied = OccupiedSlotsMask & (1 << Chan);
if (!isOccupied) {
SUnit *SU = AttemptFillSlot(Chan, false);
if (SU) {
- OccupedSlotsMask |= (1 << Chan);
+ OccupiedSlotsMask |= (1 << Chan);
InstructionsGroupCandidate.push_back(SU->getInstr());
return SU;
}
diff --git a/llvm/lib/Target/AMDGPU/R600MachineScheduler.h b/llvm/lib/Target/AMDGPU/R600MachineScheduler.h
index abcc37f8400d..f3fd71d470ba 100644
--- a/llvm/lib/Target/AMDGPU/R600MachineScheduler.h
+++ b/llvm/lib/Target/AMDGPU/R600MachineScheduler.h
@@ -63,7 +63,7 @@ class R600SchedStrategy final : public MachineSchedStrategy {
int InstKindLimit[IDLast];
- int OccupedSlotsMask;
+ int OccupiedSlotsMask;
public:
R600SchedStrategy() = default;
diff --git a/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
index 8f1a069c232d..ac6a3581e255 100644
--- a/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
+++ b/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
@@ -24,7 +24,7 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPU.h"
+#include "R600.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
@@ -86,7 +86,7 @@ GetFunctionFromMDNode(MDNode *Node) {
if (!F)
return nullptr;
- // Sanity checks.
+ // Validation checks.
size_t ExpectNumArgNodeOps = F->arg_size() + 1;
for (size_t i = 0; i < NumKernelArgMDNodes; ++i) {
MDNode *ArgNode = dyn_cast_or_null<MDNode>(Node->getOperand(i + 1));
diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index 8f19a3e478e8..1a723279dc9f 100644
--- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -26,8 +26,8 @@
/// to reduce MOV count.
//===----------------------------------------------------------------------===//
-#include "AMDGPU.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/R600MCTargetDesc.h"
+#include "R600.h"
#include "R600Defines.h"
#include "R600Subtarget.h"
#include "llvm/CodeGen/MachineDominators.h"
diff --git a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
index eaac938b098a..e858bba2983c 100644
--- a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -13,8 +13,8 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPU.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/R600MCTargetDesc.h"
+#include "R600.h"
#include "R600Subtarget.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/MachineDominators.h"
diff --git a/llvm/lib/Target/AMDGPU/R600Processors.td b/llvm/lib/Target/AMDGPU/R600Processors.td
index fff884e4848e..8cf8edd1254f 100644
--- a/llvm/lib/Target/AMDGPU/R600Processors.td
+++ b/llvm/lib/Target/AMDGPU/R600Processors.td
@@ -45,11 +45,11 @@ class R600SubtargetFeatureGeneration <string Value, string FeatureName,
SubtargetFeatureGeneration <Value, FeatureName, "R600Subtarget", Implies>;
def FeatureR600 : R600SubtargetFeatureGeneration<"R600", "r600",
- [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]
+ [FeatureR600ALUInst, FeatureFetchLimit8]
>;
def FeatureR700 : R600SubtargetFeatureGeneration<"R700", "r700",
- [FeatureFetchLimit16, FeatureLocalMemorySize0]
+ [FeatureFetchLimit16]
>;
def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN", "evergreen",
diff --git a/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp b/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
index e4f7d89bf4c9..99a1a8e9871a 100644
--- a/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -12,7 +12,7 @@
//===----------------------------------------------------------------------===//
#include "R600RegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600Defines.h"
#include "R600Subtarget.h"
diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.cpp b/llvm/lib/Target/AMDGPU/R600Subtarget.cpp
new file mode 100644
index 000000000000..20c1ce7266dd
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/R600Subtarget.cpp
@@ -0,0 +1,46 @@
+//===-- R600Subtarget.cpp - R600 Subtarget Information --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Implements the R600 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600Subtarget.h"
+#include "MCTargetDesc/R600MCTargetDesc.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "r600-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "R600GenSubtargetInfo.inc"
+
+R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
+ const TargetMachine &TM)
+ : R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS), AMDGPUSubtarget(TT),
+ InstrInfo(*this),
+ FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
+ FMA(false), CaymanISA(false), CFALUBug(false), HasVertexCache(false),
+ R600ALUInst(false), FP64(false), TexVTXClauseSize(0), Gen(R600),
+ TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
+ InstrItins(getInstrItineraryForCPU(GPU)) {}
+
+R600Subtarget &R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
+ StringRef GPU,
+ StringRef FS) {
+ SmallString<256> FullFS("+promote-alloca,");
+ FullFS += FS;
+ ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
+
+ HasMulU24 = getGeneration() >= EVERGREEN;
+ HasMulI24 = hasCaymanISA();
+
+ return *this;
+}
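For illustration only, not part of the patch: initializeSubtargetDependencies above prepends a target-default feature to the user feature string before parsing, so later user-supplied entries can still take precedence. A minimal sketch of that composition step (composeR600Features is a hypothetical name):

#include <string>

// Compose the full feature string the way the constructor path above does:
// the target default goes first, user features are appended after it.
static std::string composeR600Features(const std::string &UserFS) {
  std::string FullFS = "+promote-alloca,";
  FullFS += UserFS;
  return FullFS;
}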
diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.h b/llvm/lib/Target/AMDGPU/R600Subtarget.h
index 07238da18c67..94403b88f21a 100644
--- a/llvm/lib/Target/AMDGPU/R600Subtarget.h
+++ b/llvm/lib/Target/AMDGPU/R600Subtarget.h
@@ -23,7 +23,6 @@
namespace llvm {
-class MCInst;
class MCInstrInfo;
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
new file mode 100644
index 000000000000..39dad45425fc
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
@@ -0,0 +1,143 @@
+//===-- R600TargetMachine.cpp - TargetMachine for hw codegen targets-------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// The AMDGPU-R600 target machine contains all of the hardware specific
+/// information needed to emit code for R600 GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600TargetMachine.h"
+#include "AMDGPUTargetMachine.h"
+#include "R600.h"
+#include "R600MachineScheduler.h"
+#include "R600TargetTransformInfo.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ EnableR600StructurizeCFG("r600-ir-structurize",
+ cl::desc("Use StructurizeCFG IR pass"),
+ cl::init(true));
+
+static cl::opt<bool> EnableR600IfConvert("r600-if-convert",
+ cl::desc("Use if conversion pass"),
+ cl::ReallyHidden, cl::init(true));
+
+static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
+ "amdgpu-function-calls", cl::desc("Enable AMDGPU function call support"),
+ cl::location(AMDGPUTargetMachine::EnableFunctionCalls), cl::init(true),
+ cl::Hidden);
+
+static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
+ return new ScheduleDAGMILive(C, std::make_unique<R600SchedStrategy>());
+}
+
+static MachineSchedRegistry R600SchedRegistry("r600",
+ "Run R600's custom scheduler",
+ createR600MachineScheduler);
+
+//===----------------------------------------------------------------------===//
+// R600 Target Machine (R600 -> Cayman)
+//===----------------------------------------------------------------------===//
+
+R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ TargetOptions Options,
+ Optional<Reloc::Model> RM,
+ Optional<CodeModel::Model> CM,
+ CodeGenOpt::Level OL, bool JIT)
+ : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
+ setRequiresStructuredCFG(true);
+
+ // Override the default since calls aren't supported for r600.
+ if (EnableFunctionCalls &&
+ EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0)
+ EnableFunctionCalls = false;
+}
+
+const TargetSubtargetInfo *
+R600TargetMachine::getSubtargetImpl(const Function &F) const {
+ StringRef GPU = getGPUName(F);
+ StringRef FS = getFeatureString(F);
+
+ SmallString<128> SubtargetKey(GPU);
+ SubtargetKey.append(FS);
+
+ auto &I = SubtargetMap[SubtargetKey];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = std::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
+ }
+
+ return I.get();
+}
+
+TargetTransformInfo
+R600TargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(R600TTIImpl(this, F));
+}
+
+class R600PassConfig final : public AMDGPUPassConfig {
+public:
+ R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
+ : AMDGPUPassConfig(TM, PM) {}
+
+ ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const override {
+ return createR600MachineScheduler(C);
+ }
+
+ bool addPreISel() override;
+ bool addInstSelector() override;
+ void addPreRegAlloc() override;
+ void addPreSched2() override;
+ void addPreEmitPass() override;
+};
+
+//===----------------------------------------------------------------------===//
+// R600 Pass Setup
+//===----------------------------------------------------------------------===//
+
+bool R600PassConfig::addPreISel() {
+ AMDGPUPassConfig::addPreISel();
+
+ if (EnableR600StructurizeCFG)
+ addPass(createStructurizeCFGPass());
+ return false;
+}
+
+bool R600PassConfig::addInstSelector() {
+ addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
+ return false;
+}
+
+void R600PassConfig::addPreRegAlloc() { addPass(createR600VectorRegMerger()); }
+
+void R600PassConfig::addPreSched2() {
+ addPass(createR600EmitClauseMarkers());
+ if (EnableR600IfConvert)
+ addPass(&IfConverterID);
+ addPass(createR600ClauseMergePass());
+}
+
+void R600PassConfig::addPreEmitPass() {
+ addPass(createAMDGPUCFGStructurizerPass());
+ addPass(createR600ExpandSpecialInstrsPass());
+ addPass(&FinalizeMachineBundlesID);
+ addPass(createR600Packetizer());
+ addPass(createR600ControlFlowFinalizer());
+}
+
+TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new R600PassConfig(*this, PM);
+}
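For illustration only, not part of the patch: getSubtargetImpl above caches one R600Subtarget per (GPU, feature string) key so functions with identical target attributes share a subtarget. A self-contained sketch of that caching pattern (ToySubtarget and ToySubtargetCache are hypothetical names; a plain std::map stands in for the StringMap used above):

#include <map>
#include <memory>
#include <string>

struct ToySubtarget {
  std::string GPU, Features;
};

class ToySubtargetCache {
  // Keyed by GPU + feature string, mirroring the SubtargetKey built above.
  std::map<std::string, std::unique_ptr<ToySubtarget>> Map;

public:
  const ToySubtarget *get(const std::string &GPU, const std::string &FS) {
    auto &Slot = Map[GPU + FS];
    if (!Slot) // created lazily on the first request for this key
      Slot = std::make_unique<ToySubtarget>(ToySubtarget{GPU, FS});
    return Slot.get();
  }
};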
diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.h b/llvm/lib/Target/AMDGPU/R600TargetMachine.h
new file mode 100644
index 000000000000..0ccbca3c68b1
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.h
@@ -0,0 +1,48 @@
+//===-- R600TargetMachine.h - AMDGPU TargetMachine Interface ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// The AMDGPU TargetMachine interface definition for hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600TARGETMACHINE_H
+#define LLVM_LIB_TARGET_AMDGPU_R600TARGETMACHINE_H
+
+#include "AMDGPUTargetMachine.h"
+#include "R600Subtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+// R600 Target Machine (R600 -> Cayman)
+//===----------------------------------------------------------------------===//
+
+class R600TargetMachine final : public AMDGPUTargetMachine {
+private:
+ mutable StringMap<std::unique_ptr<R600Subtarget>> SubtargetMap;
+
+public:
+ R600TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, TargetOptions Options,
+ Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+ CodeGenOpt::Level OL, bool JIT);
+
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override;
+
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+
+ bool isMachineVerifierClean() const override { return false; }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_R600TARGETMACHINE_H
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
new file mode 100644
index 000000000000..365c005b2503
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
@@ -0,0 +1,142 @@
+//===- R600TargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// This file implements a TargetTransformInfo analysis pass specific to the
+// R600 target machine. It uses the target's detailed information to provide
+// more precise answers to certain TTI queries, while letting the target
+// independent and default TTI implementations handle the rest.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600TargetTransformInfo.h"
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "R600Subtarget.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "R600tti"
+
+R600TTIImpl::R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ ST(static_cast<const R600Subtarget *>(TM->getSubtargetImpl(F))),
+ TLI(ST->getTargetLowering()), CommonTTI(TM, F) {}
+
+unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
+ return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
+}
+
+unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
+ return getHardwareNumberOfRegisters(Vec);
+}
+
+TypeSize
+R600TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
+ return TypeSize::getFixed(32);
+}
+
+unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const { return 32; }
+
+unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
+ if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
+ return 128;
+ if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::REGION_ADDRESS)
+ return 64;
+ if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
+ return 32;
+
+ if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
+ AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
+ (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
+ AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
+ return 128;
+ llvm_unreachable("unhandled address space");
+}
+
+bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+ Align Alignment,
+ unsigned AddrSpace) const {
+ // We allow vectorization of flat stores, even though we may need to decompose
+ // them later if they may access private memory. We don't have enough context
+ // here, and legalization can handle it.
+ return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
+}
+
+bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+ Align Alignment,
+ unsigned AddrSpace) const {
+ return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+}
+
+bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+ Align Alignment,
+ unsigned AddrSpace) const {
+ return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+}
+
+unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
+ // Disable unrolling if the loop is not vectorized.
+ // TODO: Enable this again.
+ if (VF == 1)
+ return 1;
+
+ return 8;
+}
+
+InstructionCost R600TTIImpl::getCFInstrCost(unsigned Opcode,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
+ if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
+ return Opcode == Instruction::PHI ? 0 : 1;
+
+ // XXX - For some reason this isn't called for switch.
+ switch (Opcode) {
+ case Instruction::Br:
+ case Instruction::Ret:
+ return 10;
+ default:
+ return BaseT::getCFInstrCost(Opcode, CostKind, I);
+ }
+}
+
+InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+ unsigned Index) {
+ switch (Opcode) {
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement: {
+ unsigned EltSize =
+ DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
+ if (EltSize < 32) {
+ return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+ }
+
+ // Extracts are just reads of a subregister, so are free. Inserts are
+ // considered free because we don't want to have any cost for scalarizing
+ // operations, and we don't have to copy into a different register class.
+
+ // Dynamic indexing isn't free and is best avoided.
+ return Index == ~0u ? 2 : 0;
+ }
+ default:
+ return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+ }
+}
+
+void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE) {
+ CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
+}
+
+void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP) {
+ CommonTTI.getPeelingPreferences(L, SE, PP);
+}
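For illustration only, not part of the patch: the getVectorInstrCost logic above treats insert/extract of elements at least 32 bits wide as free sub-register accesses, unless the index is dynamic. A standalone restatement of that rule (the helper name is hypothetical):

// Cost assigned above for insert/extract of a >= 32-bit element; smaller
// elements fall back to the generic scalarization cost and are not modeled.
static unsigned r600WideElementInsertExtractCost(unsigned Index) {
  const unsigned DynamicIndex = ~0u; // "unknown index" sentinel, as above
  return Index == DynamicIndex ? 2 : 0;
}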
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
new file mode 100644
index 000000000000..544292bc4fd9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
@@ -0,0 +1,69 @@
+//===- R600TargetTransformInfo.h - R600 specific TTI --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file provides a TargetTransformInfo::Concept conforming object for the
+/// R600 target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600TARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_R600TARGETTRANSFORMINFO_H
+
+#include "AMDGPUTargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+
+namespace llvm {
+
+class R600Subtarget;
+class AMDGPUTargetLowering;
+
+class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
+ using BaseT = BasicTTIImplBase<R600TTIImpl>;
+ using TTI = TargetTransformInfo;
+
+ friend BaseT;
+
+ const R600Subtarget *ST;
+ const AMDGPUTargetLowering *TLI;
+ AMDGPUTTIImpl CommonTTI;
+
+public:
+ explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
+
+ const R600Subtarget *getST() const { return ST; }
+ const AMDGPUTargetLowering *getTLI() const { return TLI; }
+
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE);
+ void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP);
+ unsigned getHardwareNumberOfRegisters(bool Vec) const;
+ unsigned getNumberOfRegisters(bool Vec) const;
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
+ unsigned getMinVectorRegisterBitWidth() const;
+ unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
+ bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
+ unsigned AddrSpace) const;
+ bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
+ unsigned AddrSpace) const;
+ bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
+ unsigned AddrSpace) const;
+ unsigned getMaxInterleaveFactor(unsigned VF);
+ InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
+ unsigned Index);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_R600TARGETTRANSFORMINFO_H
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index d3c0d792804d..777744f08cde 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -7,13 +7,20 @@
/// \file
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCInstrDesc.h"
-
#ifndef LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
#define LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
+#include "llvm/MC/MCInstrDesc.h"
+
namespace llvm {
+// This needs to be kept in sync with the field bits in SIRegisterClass.
+enum SIRCFlags : uint8_t {
+ // For vector registers.
+ HasVGPR = 1 << 0,
+ HasAGPR = 1 << 1
+}; // enum SIRCFlags
+
namespace SIInstrFlags {
// This needs to be kept in sync with the field bits in InstSI.
enum : uint64_t {
@@ -132,64 +139,67 @@ enum ClassFlags : unsigned {
}
namespace AMDGPU {
- enum OperandType : unsigned {
- /// Operands with register or 32-bit immediate
- OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET,
- OPERAND_REG_IMM_INT64,
- OPERAND_REG_IMM_INT16,
- OPERAND_REG_IMM_FP32,
- OPERAND_REG_IMM_FP64,
- OPERAND_REG_IMM_FP16,
- OPERAND_REG_IMM_V2FP16,
- OPERAND_REG_IMM_V2INT16,
- OPERAND_REG_IMM_V2INT32,
- OPERAND_REG_IMM_V2FP32,
-
- /// Operands with register or inline constant
- OPERAND_REG_INLINE_C_INT16,
- OPERAND_REG_INLINE_C_INT32,
- OPERAND_REG_INLINE_C_INT64,
- OPERAND_REG_INLINE_C_FP16,
- OPERAND_REG_INLINE_C_FP32,
- OPERAND_REG_INLINE_C_FP64,
- OPERAND_REG_INLINE_C_V2INT16,
- OPERAND_REG_INLINE_C_V2FP16,
- OPERAND_REG_INLINE_C_V2INT32,
- OPERAND_REG_INLINE_C_V2FP32,
-
- /// Operands with an AccVGPR register or inline constant
- OPERAND_REG_INLINE_AC_INT16,
- OPERAND_REG_INLINE_AC_INT32,
- OPERAND_REG_INLINE_AC_FP16,
- OPERAND_REG_INLINE_AC_FP32,
- OPERAND_REG_INLINE_AC_FP64,
- OPERAND_REG_INLINE_AC_V2INT16,
- OPERAND_REG_INLINE_AC_V2FP16,
- OPERAND_REG_INLINE_AC_V2INT32,
- OPERAND_REG_INLINE_AC_V2FP32,
-
- OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
- OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2FP32,
-
- OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16,
- OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_V2FP32,
-
- OPERAND_REG_INLINE_AC_FIRST = OPERAND_REG_INLINE_AC_INT16,
- OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_V2FP32,
-
- OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
- OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
-
- // Operand for source modifiers for VOP instructions
- OPERAND_INPUT_MODS,
-
- // Operand for SDWA instructions
- OPERAND_SDWA_VOPC_DST,
-
- /// Operand with 32-bit immediate that uses the constant bus.
- OPERAND_KIMM32,
- OPERAND_KIMM16
- };
+enum OperandType : unsigned {
+ /// Operands with register or 32-bit immediate
+ OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET,
+ OPERAND_REG_IMM_INT64,
+ OPERAND_REG_IMM_INT16,
+ OPERAND_REG_IMM_FP32,
+ OPERAND_REG_IMM_FP64,
+ OPERAND_REG_IMM_FP16,
+ OPERAND_REG_IMM_FP16_DEFERRED,
+ OPERAND_REG_IMM_FP32_DEFERRED,
+ OPERAND_REG_IMM_V2FP16,
+ OPERAND_REG_IMM_V2INT16,
+ OPERAND_REG_IMM_V2INT32,
+ OPERAND_REG_IMM_V2FP32,
+
+ /// Operands with register or inline constant
+ OPERAND_REG_INLINE_C_INT16,
+ OPERAND_REG_INLINE_C_INT32,
+ OPERAND_REG_INLINE_C_INT64,
+ OPERAND_REG_INLINE_C_FP16,
+ OPERAND_REG_INLINE_C_FP32,
+ OPERAND_REG_INLINE_C_FP64,
+ OPERAND_REG_INLINE_C_V2INT16,
+ OPERAND_REG_INLINE_C_V2FP16,
+ OPERAND_REG_INLINE_C_V2INT32,
+ OPERAND_REG_INLINE_C_V2FP32,
+
+ /// Operand with 32-bit immediate that uses the constant bus.
+ OPERAND_KIMM32,
+ OPERAND_KIMM16,
+
+ /// Operands with an AccVGPR register or inline constant
+ OPERAND_REG_INLINE_AC_INT16,
+ OPERAND_REG_INLINE_AC_INT32,
+ OPERAND_REG_INLINE_AC_FP16,
+ OPERAND_REG_INLINE_AC_FP32,
+ OPERAND_REG_INLINE_AC_FP64,
+ OPERAND_REG_INLINE_AC_V2INT16,
+ OPERAND_REG_INLINE_AC_V2FP16,
+ OPERAND_REG_INLINE_AC_V2INT32,
+ OPERAND_REG_INLINE_AC_V2FP32,
+
+ OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
+ OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2FP32,
+
+ OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16,
+ OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_V2FP32,
+
+ OPERAND_REG_INLINE_AC_FIRST = OPERAND_REG_INLINE_AC_INT16,
+ OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_V2FP32,
+
+ OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
+ OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
+
+ // Operand for source modifiers for VOP instructions
+ OPERAND_INPUT_MODS,
+
+ // Operand for SDWA instructions
+ OPERAND_SDWA_VOPC_DST
+
+};
}
// Input operand modifiers bit-masks
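For illustration only, not part of the patch: the new SIRCFlags bits and the *_FIRST/*_LAST sentinels kept inside OperandType are both meant to be consumed as cheap bit and range tests. A self-contained sketch of the two idioms (the helper names are hypothetical, and the flag values are mirrored locally so the sketch compiles on its own):

#include <cstdint>

// Mirrors SIRCFlags from the patch; kept local so this sketch is standalone.
enum ToySIRCFlags : uint8_t { ToyHasVGPR = 1 << 0, ToyHasAGPR = 1 << 1 };

static bool classHasVGPRs(uint8_t RCFlags) { return RCFlags & ToyHasVGPR; }
static bool classHasAGPRs(uint8_t RCFlags) { return RCFlags & ToyHasAGPR; }

// Operand kinds are classified by checking against the *_FIRST/*_LAST
// sentinels, which is why related operand types stay contiguous in the enum.
static bool isInRange(unsigned OpType, unsigned First, unsigned Last) {
  return OpType >= First && OpType <= Last;
}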
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index d5c56bf2a321..cf93a63f26a0 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -585,10 +585,43 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::SOFT_WQM:
case AMDGPU::STRICT_WWM: {
Register DstReg = MI.getOperand(0).getReg();
-
const TargetRegisterClass *SrcRC, *DstRC;
std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
+ if (MI.isCopy()) {
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (SrcReg == AMDGPU::SCC) {
+ Register SCCCopy = MRI->createVirtualRegister(
+ TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));
+ I = BuildMI(*MI.getParent(),
+ std::next(MachineBasicBlock::iterator(MI)),
+ MI.getDebugLoc(),
+ TII->get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
+ : AMDGPU::S_CSELECT_B64),
+ SCCCopy)
+ .addImm(-1)
+ .addImm(0);
+ I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
+ TII->get(AMDGPU::COPY), DstReg)
+ .addReg(SCCCopy);
+ MI.eraseFromParent();
+ continue;
+ } else if (DstReg == AMDGPU::SCC) {
+ unsigned Opcode =
+ ST.isWave64() ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
+ Register Exec = ST.isWave64() ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
+ Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC());
+ I = BuildMI(*MI.getParent(),
+ std::next(MachineBasicBlock::iterator(MI)),
+ MI.getDebugLoc(), TII->get(Opcode))
+ .addReg(Tmp, getDefRegState(true))
+ .addReg(SrcReg)
+ .addReg(Exec);
+ MI.eraseFromParent();
+ continue;
+ }
+ }
+
if (!DstReg.isVirtual()) {
// If the destination register is a physical register there isn't
// really much we can do to fix this.
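For illustration only, not part of the patch: the new SIFixSGPRCopies code handles copies that touch SCC in two directions and picks the opcode by wavefront size. A standalone decision-table sketch of that rewrite, using string opcode names rather than the real AMDGPU opcode enums:

#include <string>

struct SCCCopyRewrite {
  std::string NewOpcode;
  bool SCCDefinedAsSideEffect;
};

// Copy *from* SCC: materialize the bit as an all-ones/zero lane mask with
// S_CSELECT. Copy *to* SCC: AND the source with EXEC, which updates SCC as a
// side effect of the scalar AND.
static SCCCopyRewrite rewriteSCCCopy(bool CopyFromSCC, bool IsWave32) {
  if (CopyFromSCC)
    return {IsWave32 ? "S_CSELECT_B32" : "S_CSELECT_B64", false};
  return {IsWave32 ? "S_AND_B32" : "S_AND_B64", true};
}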
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index ad910522ba90..a3a0e9c9b9ac 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -228,7 +228,7 @@ static bool updateOperand(FoldCandidate &Fold,
MachineOperand &Mod = MI->getOperand(ModIdx);
unsigned Val = Mod.getImm();
if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) {
- // Only apply the following transformation if that operand requries
+ // Only apply the following transformation if that operand requires
// a packed immediate.
switch (TII.get(Opcode).OpInfo[OpNo].OperandType) {
case AMDGPU::OPERAND_REG_IMM_V2FP16:
@@ -452,7 +452,7 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
const SIRegisterInfo &SRI = TII->getRegisterInfo();
// Fine if the operand can be encoded as an inline constant
- if (OpToFold->isImm()) {
+ if (TII->isLiteralConstantLike(*OpToFold, OpInfo)) {
if (!SRI.opCanUseInlineConstant(OpInfo.OperandType) ||
!TII->isInlineConstant(*OpToFold, OpInfo)) {
// Otherwise check for another constant
@@ -646,7 +646,7 @@ void SIFoldOperands::foldOperand(
return;
if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
- // Sanity check that this is a stack access.
+ // Verify that this is a stack access.
// FIXME: Should probably use stack pseudos before frame lowering.
if (TII->isMUBUF(*UseMI)) {
@@ -688,7 +688,7 @@ void SIFoldOperands::foldOperand(
// Don't fold into a copy to a physical register with the same class. Doing
// so would interfere with the register coalescer's logic which would avoid
- // redundant initalizations.
+ // redundant initializations.
if (DestReg.isPhysical() && SrcRC->contains(DestReg))
return;
@@ -902,7 +902,7 @@ void SIFoldOperands::foldOperand(
tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
// FIXME: We could try to change the instruction from 64-bit to 32-bit
- // to enable more folding opportunites. The shrink operands pass
+ // to enable more folding opportunities. The shrink operands pass
// already does this.
return;
}
@@ -1388,6 +1388,13 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
DefClamp->setImm(1);
MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
MI.eraseFromParent();
+
+ // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
+ // instruction, so we might as well convert it to the more flexible VOP3-only
+ // mad/fma form.
+ if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
+ Def->eraseFromParent();
+
return true;
}
@@ -1526,6 +1533,13 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
DefOMod->setImm(OMod);
MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
MI.eraseFromParent();
+
+ // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
+ // instruction, so we might as well convert it to the more flexible VOP3-only
+ // mad/fma form.
+ if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
+ Def->eraseFromParent();
+
return true;
}
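For illustration only, not part of the patch: the two hunks above share one rationale, namely that once a folded clamp or omod forces output modifiers on a VOP2 mac/fmac, the VOP3 encoding is required anyway, so the def might as well become the three-address mad/fma form. A minimal sketch of that decision (the helper name and parameters are hypothetical):

// Upgrade a two-address mac/fmac to three-address mad/fma when output
// modifiers are in use: the VOP3 encoding is forced either way, and the
// three-address form is the more flexible one.
static bool shouldUpgradeToThreeAddress(bool UsesOutputModifiers,
                                        bool HasThreeAddressForm) {
  return UsesOutputModifiers && HasThreeAddressForm;
}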
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index c9883d38e08c..882b9a203755 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -125,8 +125,8 @@ static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
const SIMachineFunctionInfo &FuncInfo,
LivePhysRegs &LiveRegs, MachineFunction &MF,
MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, Register SpillReg,
- int FI) {
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ Register SpillReg, int FI) {
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
: AMDGPU::BUFFER_STORE_DWORD_OFFSET;
@@ -136,7 +136,7 @@ static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
FrameInfo.getObjectAlign(FI));
LiveRegs.addReg(SpillReg);
- TRI.buildSpillLoadStore(MBB, I, Opc, FI, SpillReg, true,
+ TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, true,
FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr,
&LiveRegs);
LiveRegs.removeReg(SpillReg);
@@ -147,8 +147,8 @@ static void buildEpilogRestore(const GCNSubtarget &ST,
const SIMachineFunctionInfo &FuncInfo,
LivePhysRegs &LiveRegs, MachineFunction &MF,
MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, Register SpillReg,
- int FI) {
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, Register SpillReg, int FI) {
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
: AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
@@ -157,7 +157,7 @@ static void buildEpilogRestore(const GCNSubtarget &ST,
MachineMemOperand *MMO = MF.getMachineMemOperand(
PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
FrameInfo.getObjectAlign(FI));
- TRI.buildSpillLoadStore(MBB, I, Opc, FI, SpillReg, false,
+ TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false,
FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr,
&LiveRegs);
}
@@ -258,9 +258,10 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
// Mask the offset in [47:0] of the descriptor
const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
- BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
+ auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
.addReg(FlatScrInitHi)
.addImm(0xffff);
+ And->getOperand(3).setIsDead(); // Mark SCC as dead.
} else {
Register FlatScratchInitReg =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
@@ -280,9 +281,12 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
.addReg(FlatScrInitLo)
.addReg(ScratchWaveOffsetReg);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
+ auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
+ FlatScrInitHi)
.addReg(FlatScrInitHi)
.addImm(0);
+ Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
+
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
addReg(FlatScrInitLo).
addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
@@ -298,9 +302,11 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
.addReg(FlatScrInitLo)
.addReg(ScratchWaveOffsetReg);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
+ auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
+ AMDGPU::FLAT_SCR_HI)
.addReg(FlatScrInitHi)
.addImm(0);
+ Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
return;
}
@@ -318,9 +324,11 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
.addReg(ScratchWaveOffsetReg);
// Convert offset to 256-byte units.
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
+ auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
+ AMDGPU::FLAT_SCR_HI)
.addReg(FlatScrInitLo, RegState::Kill)
.addImm(8);
+ LShr->getOperand(3).setIsDead(true); // Mark SCC as dead.
}
// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
@@ -419,9 +427,6 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
- // FIXME: Hack to not crash in situations which emitted an error.
- if (!PreloadedScratchWaveOffsetReg)
- return;
// We need to do the replacement of the private segment buffer register even
// if there are no stack objects. There could be stores to undef or a
@@ -467,7 +472,8 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
// chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch
// wave offset to a free SGPR.
Register ScratchWaveOffsetReg;
- if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
+ if (PreloadedScratchWaveOffsetReg &&
+ TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
AllSGPRs = AllSGPRs.slice(
@@ -485,7 +491,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
} else {
ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
}
- assert(ScratchWaveOffsetReg);
+ assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);
if (requiresStackPointerReference(MF)) {
Register SPReg = MFI->getStackPtrOffsetReg();
@@ -506,7 +512,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
(!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
- !ST.flatScratchIsArchitected()) {
+ PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
}
@@ -660,10 +666,11 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
.addReg(ScratchRsrcSub0)
.addReg(ScratchWaveOffsetReg)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
+ auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
.addReg(ScratchRsrcSub1)
.addImm(0)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
}
bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
@@ -720,7 +727,9 @@ static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
const unsigned OrSaveExec =
ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
- BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1);
+ auto SaveExec = BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy)
+ .addImm(-1);
+ SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
return ScratchExecCopy;
}
@@ -776,7 +785,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI,
/*IsProlog*/ true);
- buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, Reg.VGPR,
+ buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, Reg.VGPR,
*Reg.FI);
}
@@ -791,7 +800,8 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
ScratchExecCopy =
buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true);
- buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, VGPR, *FI);
+ buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, VGPR,
+ *FI);
}
if (ScratchExecCopy) {
@@ -817,7 +827,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
.addReg(FramePtrReg);
- buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR,
+ buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, TmpVGPR,
FramePtrFI);
}
@@ -835,7 +845,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
.addReg(BasePtrReg);
- buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR,
+ buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, TmpVGPR,
BasePtrFI);
}
@@ -927,10 +937,11 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.addReg(StackPtrReg)
.addImm((Alignment - 1) * getScratchScaleFactor(ST))
.setMIFlag(MachineInstr::FrameSetup);
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
+ auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
.addReg(FramePtrReg, RegState::Kill)
.addImm(-Alignment * getScratchScaleFactor(ST))
.setMIFlag(MachineInstr::FrameSetup);
+ And->getOperand(3).setIsDead(); // Mark SCC as dead.
FuncInfo->setIsStackRealigned(true);
} else if ((HasFP = hasFP(MF))) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
@@ -949,18 +960,22 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
}
if (HasFP && RoundedSize != 0) {
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
+ auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
.addReg(StackPtrReg)
.addImm(RoundedSize * getScratchScaleFactor(ST))
.setMIFlag(MachineInstr::FrameSetup);
+ Add->getOperand(3).setIsDead(); // Mark SCC as dead.
}
assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy ||
FuncInfo->FramePointerSaveIndex)) &&
"Needed to save FP but didn't save it anywhere");
+ // If we allow spilling to AGPRs we may have saved FP but then spilled
+ // everything into AGPRs instead of the stack.
assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy &&
- !FuncInfo->FramePointerSaveIndex)) &&
+ !FuncInfo->FramePointerSaveIndex) ||
+ EnableSpillVGPRToAGPR) &&
"Saved FP but didn't need it");
assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy ||
@@ -1000,10 +1015,11 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex;
if (RoundedSize != 0 && hasFP(MF)) {
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
+ auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
.addReg(StackPtrReg)
.addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
.setMIFlag(MachineInstr::FrameDestroy);
+ Add->getOperand(3).setIsDead(); // Mark SCC as dead.
}
if (FuncInfo->SGPRForFPSaveRestoreCopy) {
@@ -1028,8 +1044,8 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
if (!TmpVGPR)
report_fatal_error("failed to find free scratch register");
- buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR,
- FramePtrFI);
+ buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
+ TmpVGPR, FramePtrFI);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
.addReg(TmpVGPR, RegState::Kill);
} else {
@@ -1054,8 +1070,8 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
if (!TmpVGPR)
report_fatal_error("failed to find free scratch register");
- buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR,
- BasePtrFI);
+ buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
+ TmpVGPR, BasePtrFI);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
.addReg(TmpVGPR, RegState::Kill);
} else {
@@ -1080,8 +1096,8 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
ScratchExecCopy =
buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);
- buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, Reg.VGPR,
- *Reg.FI);
+ buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
+ Reg.VGPR, *Reg.FI);
}
for (const auto &Reg : FuncInfo->WWMReservedRegs) {
@@ -1094,7 +1110,8 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
ScratchExecCopy =
buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);
- buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, VGPR, *FI);
+ buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, VGPR,
+ *FI);
}
if (ScratchExecCopy) {
@@ -1154,11 +1171,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
bool SeenDbgInstr = false;
for (MachineBasicBlock &MBB : MF) {
- MachineBasicBlock::iterator Next;
- for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
- MachineInstr &MI = *I;
- Next = std::next(I);
-
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
if (MI.isDebugInstr())
SeenDbgInstr = true;
@@ -1199,7 +1212,6 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
SpillFIs[MI.getOperand(0).getIndex()]) {
MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
- MI.getOperand(0).setIsDebug();
}
}
}
@@ -1301,10 +1313,13 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
// If clearing VGPRs changed the mask, we will have some CSR VGPR spills.
const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs;
- // We have to anticipate introducing CSR VGPR spills if we don't have any
- // stack objects already, since we require an FP if there is a call and stack.
+ // We have to anticipate introducing CSR VGPR spills, or a spill of the
+ // caller-saved VGPR reserved for SGPR spills (we now always create a stack
+ // entry for it), if we don't have any stack objects already, since we
+ // require an FP if there is a call and stack.
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
- const bool WillHaveFP = FrameInfo.hasCalls() && HaveAnyCSRVGPR;
+ const bool WillHaveFP =
+ FrameInfo.hasCalls() && (HaveAnyCSRVGPR || MFI->VGPRReservedForSGPRSpill);
// FP will be specially managed like SP.
if (WillHaveFP || hasFP(MF))
@@ -1373,9 +1388,10 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
Amount *= getScratchScaleFactor(ST);
if (IsDestroy)
Amount = -Amount;
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
+ auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
.addReg(SPReg)
.addImm(Amount);
+ Add->getOperand(3).setIsDead(); // Mark SCC as dead.
} else if (CalleePopAmount != 0) {
llvm_unreachable("is this used?");
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d98acfc6c532..519c5b936536 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -19,10 +19,12 @@
#include "SIRegisterInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -465,11 +467,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (!Subtarget->hasBCNT(64))
setOperationAction(ISD::CTPOP, MVT::i64, Expand);
- if (Subtarget->hasFFBH())
+ if (Subtarget->hasFFBH()) {
+ setOperationAction(ISD::CTLZ, MVT::i32, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
+ }
- if (Subtarget->hasFFBL())
+ if (Subtarget->hasFFBL()) {
+ setOperationAction(ISD::CTTZ, MVT::i32, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
+ }
// We only really have 32-bit BFE instructions (and 16-bit on VI).
//
@@ -1061,7 +1067,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
(Intrinsic::ID)IntrID);
- if (Attr.hasFnAttribute(Attribute::ReadNone))
+ if (Attr.hasFnAttr(Attribute::ReadNone))
return false;
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -1076,7 +1082,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
}
Info.flags = MachineMemOperand::MODereferenceable;
- if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
+ if (Attr.hasFnAttr(Attribute::ReadOnly)) {
unsigned DMaskLanes = 4;
if (RsrcIntr->IsImage) {
@@ -1100,7 +1106,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
// FIXME: What does alignment mean for an image?
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.flags |= MachineMemOperand::MOLoad;
- } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
+ } else if (Attr.hasFnAttr(Attribute::WriteOnly)) {
Info.opc = ISD::INTRINSIC_VOID;
Type *DataTy = CI.getArgOperand(0)->getType();
@@ -1423,7 +1429,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
}
bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
- const SelectionDAG &DAG) const {
+ const MachineFunction &MF) const {
if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
return (MemVT.getSizeInBits() <= 4 * 32);
} else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
@@ -1657,12 +1663,17 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
const ArgDescriptor *InputPtrReg;
const TargetRegisterClass *RC;
LLT ArgTy;
+ MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
std::tie(InputPtrReg, RC, ArgTy) =
Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
+ // We may not have the kernarg segment argument if we have no kernel
+ // arguments.
+ if (!InputPtrReg)
+ return DAG.getConstant(0, SL, PtrVT);
+
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
- MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
@@ -1808,6 +1819,19 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
LLT Ty;
std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
+ if (!Reg) {
+ if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
+ // It's possible for a kernarg intrinsic call to appear in a kernel with
+ // no allocated segment, in which case we do not add the user sgpr
+ // argument, so just return null.
+ return DAG.getConstant(0, SDLoc(), VT);
+ }
+
+ // It's undefined behavior if a function marked with the amdgpu-no-*
+ // attributes uses the corresponding intrinsic.
+ return DAG.getUNDEF(VT);
+ }
+
return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
}
@@ -2023,31 +2047,33 @@ void SITargetLowering::allocateSpecialInputSGPRs(
SIMachineFunctionInfo &Info) const {
auto &ArgInfo = Info.getArgInfo();
- // TODO: Unify handling with private memory pointers.
+ // We need to allocate these in place regardless of their use.
+ const bool IsFixed = AMDGPUTargetMachine::EnableFixedFunctionABI;
- if (Info.hasDispatchPtr())
+ // TODO: Unify handling with private memory pointers.
+ if (IsFixed || Info.hasDispatchPtr())
allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
- if (Info.hasQueuePtr())
+ if (IsFixed || Info.hasQueuePtr())
allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
// Implicit arg ptr takes the place of the kernarg segment pointer. This is a
// constant offset from the kernarg segment.
- if (Info.hasImplicitArgPtr())
+ if (IsFixed || Info.hasImplicitArgPtr())
allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
- if (Info.hasDispatchID())
+ if (IsFixed || Info.hasDispatchID())
allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
// flat_scratch_init is not applicable for non-kernel functions.
- if (Info.hasWorkGroupIDX())
+ if (IsFixed || Info.hasWorkGroupIDX())
allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
- if (Info.hasWorkGroupIDY())
+ if (IsFixed || Info.hasWorkGroupIDY())
allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
- if (Info.hasWorkGroupIDZ())
+ if (IsFixed || Info.hasWorkGroupIDZ())
allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
}
@@ -2590,9 +2616,12 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SDValue ReturnAddrReg = CreateLiveInRegister(
DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
- SDValue ReturnAddrVirtualReg = DAG.getRegister(
- MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
- MVT::i64);
+ SDValue ReturnAddrVirtualReg =
+ DAG.getRegister(MF.getRegInfo().createVirtualRegister(
+ CallConv != CallingConv::AMDGPU_Gfx
+ ? &AMDGPU::CCR_SGPR_64RegClass
+ : &AMDGPU::Gfx_CCR_SGPR_64RegClass),
+ MVT::i64);
Chain =
DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
Flag = Chain.getValue(1);
@@ -2655,8 +2684,15 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
RetOps.push_back(Flag);
unsigned Opc = AMDGPUISD::ENDPGM;
- if (!IsWaveEnd)
- Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
+ if (!IsWaveEnd) {
+ if (IsShader)
+ Opc = AMDGPUISD::RETURN_TO_EPILOG;
+ else if (CallConv == CallingConv::AMDGPU_Gfx)
+ Opc = AMDGPUISD::RET_GFX_FLAG;
+ else
+ Opc = AMDGPUISD::RET_FLAG;
+ }
+
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
@@ -2747,21 +2783,28 @@ void SITargetLowering::passSpecialInputs(
// TODO: Unify with private memory register handling. This is complicated by
// the fact that at least in kernels, the input argument is not necessarily
// in the same location as the input.
- AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
- AMDGPUFunctionArgInfo::DISPATCH_PTR,
- AMDGPUFunctionArgInfo::QUEUE_PTR,
- AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
- AMDGPUFunctionArgInfo::DISPATCH_ID,
- AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
- AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
- AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
+ static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
+ StringLiteral> ImplicitAttrs[] = {
+ {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
+ {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr"},
+ {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
+ {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
+ {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
+ {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
+ {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"}
};
- for (auto InputID : InputRegs) {
+ for (auto Attr : ImplicitAttrs) {
const ArgDescriptor *OutgoingArg;
const TargetRegisterClass *ArgRC;
LLT ArgTy;
+ AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
+
+ // If the callee does not use the attribute value, skip copying the value.
+ if (CLI.CB->hasFnAttr(Attr.second))
+ continue;
+
std::tie(OutgoingArg, ArgRC, ArgTy) =
CalleeArgInfo->getPreloadedValue(InputID);
if (!OutgoingArg)
@@ -2780,11 +2823,14 @@ void SITargetLowering::passSpecialInputs(
if (IncomingArg) {
InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
- } else {
+ } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
// The implicit arg ptr is special because it doesn't have a corresponding
// input for kernels, and is computed from the kernarg segment pointer.
- assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
InputReg = getImplicitArgPtr(DAG, DL);
+ } else {
+ // We may have proven the input wasn't needed, even though the ABI still
+ // requires it. We just need to allocate the register appropriately.
+ InputReg = DAG.getUNDEF(ArgVT);
}
if (OutgoingArg->isRegister()) {
@@ -2827,11 +2873,17 @@ void SITargetLowering::passSpecialInputs(
SDValue InputReg;
SDLoc SL;
+ const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
+ const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
+ const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
+
// If incoming ids are not packed we need to pack them.
- if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX)
+ if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
+ NeedWorkItemIDX)
InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
- if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
+ if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
+ NeedWorkItemIDY) {
SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
DAG.getShiftAmountConstant(10, MVT::i32, SL));
@@ -2839,7 +2891,8 @@ void SITargetLowering::passSpecialInputs(
DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
}
- if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
+ if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
+ NeedWorkItemIDZ) {
SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
DAG.getShiftAmountConstant(20, MVT::i32, SL));
@@ -2847,7 +2900,7 @@ void SITargetLowering::passSpecialInputs(
DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
}
- if (!InputReg.getNode()) {
+ if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
// Workitem ids are already packed, any of present incoming arguments
// will carry all required fields.
ArgDescriptor IncomingArg = ArgDescriptor::createArg(
@@ -2858,13 +2911,17 @@ void SITargetLowering::passSpecialInputs(
}
if (OutgoingArg->isRegister()) {
- RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+ if (InputReg)
+ RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+
CCInfo.AllocateReg(OutgoingArg->getRegister());
} else {
unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
- SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
- SpecialArgOffset);
- MemOpChains.push_back(ArgStore);
+ if (InputReg) {
+ SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
+ SpecialArgOffset);
+ MemOpChains.push_back(ArgStore);
+ }
}
}
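For illustration only, not part of the patch: the workitem-id handling in the hunks above packs X, Y and Z into a single 32-bit value, with Y shifted up by 10 bits and Z by 20. A standalone sketch of that layout (the helper name is hypothetical, and the masks are added here only to keep the sketch self-contained):

#include <cstdint>

// X lives in bits [9:0], Y in [19:10], Z in [29:20], matching the shift
// amounts used by passSpecialInputs above.
static uint32_t packWorkItemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
  return (X & 0x3ffu) | ((Y & 0x3ffu) << 10) | ((Z & 0x3ffu) << 20);
}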
@@ -4091,7 +4148,10 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
}
const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
- if (TRI->getRegSizeInBits(*Src2RC) == 64) {
+ unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
+ assert(WaveSize == 64 || WaveSize == 32);
+
+ if (WaveSize == 64) {
if (ST.hasScalarCompareEq64()) {
BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
.addReg(Src2.getReg())
@@ -4121,8 +4181,13 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
- BuildMI(*BB, MII, DL, TII->get(AMDGPU::COPY), CarryDest.getReg())
- .addReg(AMDGPU::SCC);
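+ // Materialize the carry-out as a full wave mask: S_CSELECT_B(32|64) reads
+ // SCC implicitly and selects between -1 and 0.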
+ unsigned SelOpc =
+ (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
+
+ BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
+ .addImm(-1)
+ .addImm(0);
+
MI.eraseFromParent();
return BB;
}
@@ -4261,6 +4326,13 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
}
+ case AMDGPU::V_ADDC_U32_e32:
+ case AMDGPU::V_SUBB_U32_e32:
+ case AMDGPU::V_SUBBREV_U32_e32:
+ // These instructions have an implicit use of vcc which counts towards the
+ // constant bus limit.
+ TII->legalizeOperands(MI);
+ return BB;
case AMDGPU::DS_GWS_INIT:
case AMDGPU::DS_GWS_SEMA_BR:
case AMDGPU::DS_GWS_BARRIER:
@@ -4818,7 +4890,7 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
}
if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
// (ballot 0) -> 0
- if (Arg->isNullValue())
+ if (Arg->isZero())
return DAG.getConstant(0, SL, VT);
// (ballot 1) -> EXEC/EXEC_LO
@@ -5266,9 +5338,18 @@ SDValue SITargetLowering::lowerTrapHsaQueuePtr(
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Register UserSGPR = Info->getQueuePtrUserSGPR();
- assert(UserSGPR != AMDGPU::NoRegister);
- SDValue QueuePtr = CreateLiveInRegister(
- DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+
+ SDValue QueuePtr;
+ if (UserSGPR == AMDGPU::NoRegister) {
+ // We probably are in a function incorrectly marked with
+ // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the trap,
+ // so just use a null pointer.
+ QueuePtr = DAG.getConstant(0, SL, MVT::i64);
+ } else {
+ QueuePtr = CreateLiveInRegister(
+ DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+ }
+
SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
QueuePtr, SDValue());
@@ -5345,7 +5426,11 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Register UserSGPR = Info->getQueuePtrUserSGPR();
- assert(UserSGPR != AMDGPU::NoRegister);
+ if (UserSGPR == AMDGPU::NoRegister) {
+ // We probably are in a function incorrectly marked with
+ // amdgpu-no-queue-ptr. This is undefined.
+ return DAG.getUNDEF(MVT::i32);
+ }
SDValue QueuePtr = CreateLiveInRegister(
DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
@@ -5936,6 +6021,9 @@ static SDValue constructRetValue(SelectionDAG &DAG,
EVT LegalReqRetVT = ReqRetVT;
if (!ReqRetVT.isVector()) {
+ if (!Data.getValueType().isInteger())
+ Data = DAG.getNode(ISD::BITCAST, DL,
+ Data.getValueType().changeTypeToInteger(), Data);
Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
} else {
// We need to widen the return vector to a legal type
@@ -6124,7 +6212,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
if (MIPMappingInfo) {
if (auto *ConstantLod = dyn_cast<ConstantSDNode>(
Op.getOperand(ArgOffset + Intr->MipIndex))) {
- if (ConstantLod->isNullValue()) {
+ if (ConstantLod->isZero()) {
IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
VAddrEnd--; // remove 'mip'
}
@@ -6659,7 +6747,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// intrinsic has the numerator as the first operand to match a normal
// division operation.
- SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
+ SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
Denominator, Numerator);
@@ -6793,7 +6881,7 @@ static void updateBufferMMO(MachineMemOperand *MMO, SDValue VOffset,
}
if (VIndex && (!isa<ConstantSDNode>(VIndex) ||
- !cast<ConstantSDNode>(VIndex)->isNullValue())) {
+ !cast<ConstantSDNode>(VIndex)->isZero())) {
// The strided index component of the address is not known to be zero, so we
// cannot represent it in the MMO. Give up.
MMO->setValue((Value *)nullptr);
@@ -7341,7 +7429,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op->getVTList(), Ops, VT, M->getMemOperand());
}
case Intrinsic::amdgcn_image_bvh_intersect_ray: {
- SDLoc DL(Op);
MemSDNode *M = cast<MemSDNode>(Op);
SDValue NodePtr = M->getOperand(2);
SDValue RayExtent = M->getOperand(3);
@@ -7360,12 +7447,27 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return SDValue();
}
- bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
- bool Is64 = NodePtr.getValueType() == MVT::i64;
- unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
- : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
- : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
- : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
+ const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
+ const bool Is64 = NodePtr.getValueType() == MVT::i64;
+ const unsigned NumVDataDwords = 4;
+ const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
+ const bool UseNSA = Subtarget->hasNSAEncoding() &&
+ NumVAddrDwords <= Subtarget->getNSAMaxSize();
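+ // Without NSA (or when too many address dwords are required) the address
+ // operands are packed below into a single 8- or 16-dword vector operand.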
+ const unsigned BaseOpcodes[2][2] = {
+ {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
+ {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
+ AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
+ int Opcode;
+ if (UseNSA) {
+ Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
+ AMDGPU::MIMGEncGfx10NSA, NumVDataDwords,
+ NumVAddrDwords);
+ } else {
+ Opcode = AMDGPU::getMIMGOpcode(
+ BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
+ PowerOf2Ceil(NumVAddrDwords));
+ }
+ assert(Opcode != -1);
SmallVector<SDValue, 16> Ops;
@@ -7405,6 +7507,20 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
packLanes(RayOrigin, true);
packLanes(RayDir, true);
packLanes(RayInvDir, false);
+
+ if (!UseNSA) {
+ // Build a single vector containing all the operands prepared so far.
+ if (NumVAddrDwords > 8) {
+ SDValue Undef = DAG.getUNDEF(MVT::i32);
+ Ops.append(16 - Ops.size(), Undef);
+ }
+ assert(Ops.size() == 8 || Ops.size() == 16);
+ SDValue MergedOps = DAG.getBuildVector(
+ Ops.size() == 16 ? MVT::v16i32 : MVT::v8i32, DL, Ops);
+ Ops.clear();
+ Ops.push_back(MergedOps);
+ }
+
Ops.push_back(TDescr);
if (IsA16)
Ops.push_back(DAG.getTargetConstant(1, DL, MVT::i1));
@@ -7610,7 +7726,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(0) // Chain
};
- unsigned Opc = Done->isNullValue() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
+ unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
}
case Intrinsic::amdgcn_s_barrier: {
@@ -8241,6 +8357,16 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue Cond = Op.getOperand(0);
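+ // For a uniform select whose condition has a single use, keep the 64-bit
+ // select whole rather than splitting it into the two 32-bit selects built
+ // below.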
+ if (Subtarget->hasScalarCompareEq64() && Op->getOperand(0)->hasOneUse() &&
+ !Op->isDivergent()) {
+ if (VT == MVT::i64)
+ return Op;
+ SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(1));
+ SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(2));
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getSelect(DL, MVT::i64, Cond, LHS, RHS));
+ }
+
SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
SDValue One = DAG.getConstant(1, DL, MVT::i32);
@@ -9358,7 +9484,8 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (CRHS) {
if (SDValue Split
- = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
+ = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
+ N->getOperand(0), CRHS))
return Split;
}
@@ -9445,7 +9572,7 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
// fp_class x, 0 -> false
if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
- if (CMask->isNullValue())
+ if (CMask->isZero())
return DAG.getConstant(0, SDLoc(N), MVT::i1);
}
@@ -10348,7 +10475,7 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
}
- if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
+ if (numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32) {
MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
@@ -10434,7 +10561,7 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
if (LHS.getOpcode() == ISD::SUBCARRY) {
// sub (subcarry x, 0, cc), y => subcarry x, y, cc
auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
- if (!C || !C->isNullValue())
+ if (!C || !C->isZero())
return SDValue();
SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
@@ -10657,20 +10784,20 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
// setcc (sext from i1 cc), -1, eq|sle|uge) => cc
// setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
// setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
- if ((CRHS->isAllOnesValue() &&
+ if ((CRHS->isAllOnes() &&
(CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
- (CRHS->isNullValue() &&
+ (CRHS->isZero() &&
(CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
DAG.getConstant(-1, SL, MVT::i1));
- if ((CRHS->isAllOnesValue() &&
+ if ((CRHS->isAllOnes() &&
(CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
- (CRHS->isNullValue() &&
+ (CRHS->isZero() &&
(CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
return LHS.getOperand(0);
}
- uint64_t CRHSVal = CRHS->getZExtValue();
+ const APInt &CRHSVal = CRHS->getAPIntValue();
if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
LHS.getOpcode() == ISD::SELECT &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
@@ -10682,8 +10809,8 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
// setcc (select cc, CT, CF), CF, ne => cc
// setcc (select cc, CT, CF), CT, ne => xor cc, -1
// setcc (select cc, CT, CF), CT, eq => cc
- uint64_t CT = LHS.getConstantOperandVal(1);
- uint64_t CF = LHS.getConstantOperandVal(2);
+ const APInt &CT = LHS.getConstantOperandAPInt(1);
+ const APInt &CF = LHS.getConstantOperandAPInt(2);
if ((CF == CRHSVal && CC == ISD::SETEQ) ||
(CT == CRHSVal && CC == ISD::SETNE))
@@ -10747,7 +10874,7 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
// cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
// cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
- Shift = DAG.getZExtOrTrunc(Shift.getOperand(0),
+ SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
SDLoc(Shift.getOperand(0)), MVT::i32);
unsigned ShiftOffset = 8 * Offset;
@@ -10758,7 +10885,7 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
- MVT::f32, Shift);
+ MVT::f32, Shifted);
}
}
}
@@ -12086,6 +12213,25 @@ static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
+
+ auto ReportUnsafeHWInst = [&](TargetLowering::AtomicExpansionKind Kind) {
+ OptimizationRemarkEmitter ORE(RMW->getFunction());
+ LLVMContext &Ctx = RMW->getFunction()->getContext();
+ SmallVector<StringRef> SSNs;
+ Ctx.getSyncScopeNames(SSNs);
+ auto MemScope = SSNs[RMW->getSyncScopeID()].empty()
+ ? "system"
+ : SSNs[RMW->getSyncScopeID()];
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
+ << "Hardware instruction generated for atomic "
+ << RMW->getOperationName(RMW->getOperation())
+ << " operation at memory scope " << MemScope
+ << " due to an unsafe request.";
+ });
+ return Kind;
+ };
+
switch (RMW->getOperation()) {
case AtomicRMWInst::FAdd: {
Type *Ty = RMW->getType();
@@ -12120,28 +12266,30 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"))
return AtomicExpansionKind::CmpXChg;
- return AtomicExpansionKind::None;
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
}
if (AS == AMDGPUAS::FLAT_ADDRESS)
return AtomicExpansionKind::CmpXChg;
- return RMW->use_empty() ? AtomicExpansionKind::None
+ return RMW->use_empty() ? ReportUnsafeHWInst(AtomicExpansionKind::None)
: AtomicExpansionKind::CmpXChg;
}
// DS FP atomics do respect the denormal mode, but the rounding mode is fixed
// to round-to-nearest-even.
// The only exception is DS_ADD_F64 which never flushes regardless of mode.
- if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) {
+ if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) {
if (!Ty->isDoubleTy())
return AtomicExpansionKind::None;
- return (fpModeMatchesGlobalFPAtomicMode(RMW) ||
- RMW->getFunction()
- ->getFnAttribute("amdgpu-unsafe-fp-atomics")
- .getValueAsString() == "true")
- ? AtomicExpansionKind::None
+ if (fpModeMatchesGlobalFPAtomicMode(RMW))
+ return AtomicExpansionKind::None;
+
+ return RMW->getFunction()
+ ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+ .getValueAsString() == "true"
+ ? ReportUnsafeHWInst(AtomicExpansionKind::None)
: AtomicExpansionKind::CmpXChg;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index f3d34267a81d..1e48c96ad3c8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -16,6 +16,7 @@
#include "AMDGPUISelLowering.h"
#include "AMDGPUArgumentUsageInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
namespace llvm {
@@ -267,7 +268,7 @@ public:
Instruction *I = nullptr) const override;
bool canMergeStoresTo(unsigned AS, EVT MemVT,
- const SelectionDAG &DAG) const override;
+ const MachineFunction &MF) const override;
bool allowsMisalignedMemoryAccessesImpl(
unsigned Size, unsigned AddrSpace, Align Alignment,
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index 7ba20eb6027b..125f006a1d1d 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -58,6 +58,8 @@ enum HardClauseType {
// Internal instructions, which are allowed in the middle of a hard clause,
// except for s_waitcnt.
HARDCLAUSE_INTERNAL,
+ // Meta instructions that do not result in any real ISA instructions, like KILL.
+ HARDCLAUSE_IGNORE,
// Instructions that are not allowed in a hard clause: SALU, export, branch,
// message, GDS, s_waitcnt and anything else not mentioned above.
HARDCLAUSE_ILLEGAL,
@@ -100,6 +102,8 @@ public:
// It's safe to treat the rest as illegal.
if (MI.getOpcode() == AMDGPU::S_NOP)
return HARDCLAUSE_INTERNAL;
+ if (MI.isMetaInstruction())
+ return HARDCLAUSE_IGNORE;
return HARDCLAUSE_ILLEGAL;
}
@@ -112,25 +116,25 @@ public:
// The last non-internal instruction in the clause.
MachineInstr *Last = nullptr;
// The length of the clause including any internal instructions in the
- // middle or after the end of the clause.
+ // middle (but not at the end) of the clause.
unsigned Length = 0;
+ // Internal instructions at the end of a clause should not be included in
+ // the clause. Count them in TrailingInternalLength until a new memory
+ // instruction is added.
+ unsigned TrailingInternalLength = 0;
// The base operands of *Last.
SmallVector<const MachineOperand *, 4> BaseOps;
};
bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) {
- // Get the size of the clause excluding any internal instructions at the
- // end.
- unsigned Size =
- std::distance(CI.First->getIterator(), CI.Last->getIterator()) + 1;
- if (Size < 2)
+ if (CI.First == CI.Last)
return false;
- assert(Size <= 64 && "Hard clause is too long!");
+ assert(CI.Length <= 64 && "Hard clause is too long!");
auto &MBB = *CI.First->getParent();
auto ClauseMI =
BuildMI(MBB, *CI.First, DebugLoc(), SII->get(AMDGPU::S_CLAUSE))
- .addImm(Size - 1);
+ .addImm(CI.Length - 1);
finalizeBundle(MBB, ClauseMI->getIterator(),
std::next(CI.Last->getIterator()));
return true;
@@ -168,6 +172,7 @@ public:
if (CI.Length == 64 ||
(CI.Length && Type != HARDCLAUSE_INTERNAL &&
+ Type != HARDCLAUSE_IGNORE &&
(Type != CI.Type ||
// Note that we lie to shouldClusterMemOps about the size of the
// cluster. When shouldClusterMemOps is called from the machine
@@ -182,14 +187,20 @@ public:
if (CI.Length) {
// Extend the current clause.
- ++CI.Length;
- if (Type != HARDCLAUSE_INTERNAL) {
- CI.Last = &MI;
- CI.BaseOps = std::move(BaseOps);
+ if (Type != HARDCLAUSE_IGNORE) {
+ if (Type == HARDCLAUSE_INTERNAL) {
+ ++CI.TrailingInternalLength;
+ } else {
+ ++CI.Length;
+ CI.Length += CI.TrailingInternalLength;
+ CI.TrailingInternalLength = 0;
+ CI.Last = &MI;
+ CI.BaseOps = std::move(BaseOps);
+ }
}
} else if (Type <= LAST_REAL_HARDCLAUSE_TYPE) {
// Start a new clause.
- CI = ClauseInfo{Type, &MI, &MI, 1, std::move(BaseOps)};
+ CI = ClauseInfo{Type, &MI, &MI, 1, 0, std::move(BaseOps)};
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 7d6f79922d2e..f4e5771d2a2a 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -73,7 +73,7 @@ public:
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
-// s_waitcnt instruction needs to be emited.
+// s_waitcnt instruction needs to be emitted.
#define CNT_MASK(t) (1u << (t))
@@ -963,6 +963,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// with knowledge of the called routines.
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
+ MI.getOpcode() == AMDGPU::S_SETPC_B64_return_gfx ||
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
}
@@ -1686,17 +1687,13 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
bool HaveScalarStores = false;
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
- ++BI) {
- MachineBasicBlock &MBB = *BI;
-
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
- ++I) {
- if (!HaveScalarStores && TII->isScalarStore(*I))
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (!HaveScalarStores && TII->isScalarStore(MI))
HaveScalarStores = true;
- if (I->getOpcode() == AMDGPU::S_ENDPGM ||
- I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
+ if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
+ MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
EndPgmBlocks.push_back(&MBB);
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 7ab0f7a100c5..4a928123b68f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -19,8 +19,10 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/DiagnosticInfo.h"
@@ -108,7 +110,7 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
AAResults *AA) const {
- if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI)) {
+ if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) {
// Normally VALU use of exec would block the rematerialization, but that
// is OK in this case to have an implicit exec read as all VALU do.
// We really want all of the generic logic for this except for this.
@@ -116,6 +118,10 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
// Another potential implicit use is mode register. The core logic of
// the RA will not attempt rematerialization if mode is set anywhere
// in the function, otherwise it is safe since mode is not changed.
+
+ // This differs from the generic method, which does not allow
+ // rematerialization if there are virtual register uses. We do allow it,
+ // which is why this method handles SOP instructions as well.
return !MI.hasImplicitDef() &&
MI.getNumImplicitOperands() == MI.getDesc().getNumImplicitUses() &&
!MI.mayRaiseFPException();
@@ -1637,10 +1643,20 @@ void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
switch (MI.getOpcode()) {
- default: return 1; // FIXME: Do wait states equal cycles?
+ default:
+ if (MI.isMetaInstruction())
+ return 0;
+ return 1; // FIXME: Do wait states equal cycles?
case AMDGPU::S_NOP:
return MI.getOperand(0).getImm() + 1;
+
+ // FIXME: Any other pseudo instruction?
+ // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
+ // hazard, even if one exists, won't really be visible. Should we handle it?
+ case AMDGPU::SI_MASKED_UNREACHABLE:
+ case AMDGPU::WAVE_BARRIER:
+ return 0;
}
}
@@ -1889,7 +1905,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
.addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
SetOn->getOperand(3).setIsUndef();
- const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect);
+ const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
MachineInstrBuilder MIB =
BuildMI(MBB, MI, DL, OpDesc)
.addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
@@ -1929,11 +1945,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
.addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
SetOn->getOperand(3).setIsUndef();
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32))
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
.addDef(Dst)
.addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
- .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0))
- .addReg(AMDGPU::M0, RegState::Implicit);
+ .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
@@ -2208,15 +2223,17 @@ MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
return MI.getOperand(0).getMBB();
}
-unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
- MachineBasicBlock &DestBB,
- const DebugLoc &DL,
- int64_t BrOffset,
- RegScavenger *RS) const {
+void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock &DestBB,
+ MachineBasicBlock &RestoreBB,
+ const DebugLoc &DL, int64_t BrOffset,
+ RegScavenger *RS) const {
assert(RS && "RegScavenger required for long branching");
assert(MBB.empty() &&
"new block should be inserted for expanding unconditional branch");
assert(MBB.pred_size() == 1);
+ assert(RestoreBB.empty() &&
+ "restore block should be inserted for restoring clobbered registers");
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -2253,14 +2270,6 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
.addReg(PCReg);
- auto ComputeBlockSize = [](const TargetInstrInfo *TII,
- const MachineBasicBlock &MBB) {
- unsigned Size = 0;
- for (const MachineInstr &MI : MBB)
- Size += TII->getInstSizeInBytes(MI);
- return Size;
- };
-
// FIXME: If spilling is necessary, this will fail because this scavenger has
// no emergency stack slots. It is non-trivial to spill in this situation,
// because the restore code needs to be specially placed after the
@@ -2299,22 +2308,34 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
RS->enterBasicBlockEnd(MBB);
Register Scav = RS->scavengeRegisterBackwards(
- AMDGPU::SReg_64RegClass,
- MachineBasicBlock::iterator(GetPC), false, 0);
- MRI.replaceRegWith(PCReg, Scav);
- MRI.clearVirtRegs();
- RS->setRegUsed(Scav);
+ AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
+ /* RestoreAfter */ false, 0, /* AllowSpill */ false);
+ if (Scav) {
+ RS->setRegUsed(Scav);
+ MRI.replaceRegWith(PCReg, Scav);
+ MRI.clearVirtRegs();
+ } else {
+ // Spilling an SGPR requires a VGPR, so we reuse the temporary VGPR's spill
+ // slot for the SGPR spill.
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
+ MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
+ MRI.clearVirtRegs();
+ }
+ MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
// Now, the distance could be defined.
auto *Offset = MCBinaryExpr::createSub(
- MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
+ MCSymbolRefExpr::create(DestLabel, MCCtx),
MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
// Add offset assignments.
auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
auto *ShAmt = MCConstantExpr::create(32, MCCtx);
OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
- return ComputeBlockSize(this, MBB);
+
+ return;
}
unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
@@ -2443,16 +2464,15 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved) const {
- MachineBasicBlock::iterator I = MBB.getFirstTerminator();
-
unsigned Count = 0;
unsigned RemovedSize = 0;
- while (I != MBB.end()) {
- MachineBasicBlock::iterator Next = std::next(I);
- RemovedSize += getInstSizeInBytes(*I);
- I->eraseFromParent();
- ++Count;
- I = Next;
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
+ // Skip over artificial terminators when removing instructions.
+ if (MI.isBranch() || MI.isReturn()) {
+ RemovedSize += getInstSizeInBytes(MI);
+ MI.eraseFromParent();
+ ++Count;
+ }
}
if (BytesRemoved)
@@ -2691,18 +2711,11 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
}
}
-bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
+bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
switch (MI.getOpcode()) {
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
- case AMDGPU::V_MOV_B64_PSEUDO: {
- // If there are additional implicit register operands, this may be used for
- // register indexing so the source register operand isn't simply copied.
- unsigned NumOps = MI.getDesc().getNumOperands() +
- MI.getDesc().getNumImplicitUses();
-
- return MI.getNumOperands() == NumOps;
- }
+ case AMDGPU::V_MOV_B64_PSEUDO:
case AMDGPU::S_MOV_B32:
case AMDGPU::S_MOV_B64:
case AMDGPU::COPY:
@@ -3069,16 +3082,24 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
return false;
}
-static int64_t getFoldableImm(const MachineOperand* MO) {
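+// If Reg has a unique virtual register def that is a foldable move of an
+// immediate, return true and set Imm to that immediate.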
+static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
+ int64_t &Imm) {
+ if (Reg.isPhysical())
+ return false;
+ auto *Def = MRI.getUniqueVRegDef(Reg);
+ if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
+ Imm = Def->getOperand(1).getImm();
+ return true;
+ }
+ return false;
+}
+
+static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm) {
if (!MO->isReg())
return false;
const MachineFunction *MF = MO->getParent()->getParent()->getParent();
const MachineRegisterInfo &MRI = MF->getRegInfo();
- auto Def = MRI.getUniqueVRegDef(MO->getReg());
- if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
- Def->getOperand(1).isImm())
- return Def->getOperand(1).getImm();
- return AMDGPU::NoRegister;
+ return getFoldableImm(MO->getReg(), MRI, Imm);
}
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
@@ -3093,9 +3114,9 @@ static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
}
}
-MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
- MachineInstr &MI,
- LiveVariables *LV) const {
+MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
+ LiveVariables *LV,
+ LiveIntervals *LIS) const {
unsigned Opc = MI.getOpcode();
bool IsF16 = false;
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
@@ -3145,50 +3166,58 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
MachineInstrBuilder MIB;
+ MachineBasicBlock &MBB = *MI.getParent();
if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 &&
// If we have an SGPR input, we will violate the constant bus restriction.
(ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
- !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
- if (auto Imm = getFoldableImm(Src2)) {
+ !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
+ int64_t Imm;
+ if (getFoldableImm(Src2, Imm)) {
unsigned NewOpc =
IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
: (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
if (pseudoToMCOpcode(NewOpc) != -1) {
- MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
+ MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
.add(*Src0)
.add(*Src1)
.addImm(Imm);
updateLiveVariables(LV, MI, *MIB);
+ if (LIS)
+ LIS->ReplaceMachineInstrInMaps(MI, *MIB);
return MIB;
}
}
unsigned NewOpc = IsFMA
? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
: (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
- if (auto Imm = getFoldableImm(Src1)) {
+ if (getFoldableImm(Src1, Imm)) {
if (pseudoToMCOpcode(NewOpc) != -1) {
- MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
+ MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
.add(*Src0)
.addImm(Imm)
.add(*Src2);
updateLiveVariables(LV, MI, *MIB);
+ if (LIS)
+ LIS->ReplaceMachineInstrInMaps(MI, *MIB);
return MIB;
}
}
- if (auto Imm = getFoldableImm(Src0)) {
+ if (getFoldableImm(Src0, Imm)) {
if (pseudoToMCOpcode(NewOpc) != -1 &&
isOperandLegal(
MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
Src1)) {
- MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
+ MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
.add(*Src1)
.addImm(Imm)
.add(*Src2);
updateLiveVariables(LV, MI, *MIB);
+ if (LIS)
+ LIS->ReplaceMachineInstrInMaps(MI, *MIB);
return MIB;
}
}
@@ -3201,7 +3230,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
if (pseudoToMCOpcode(NewOpc) == -1)
return nullptr;
- MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
+ MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
.addImm(Src0Mods ? Src0Mods->getImm() : 0)
.add(*Src0)
@@ -3212,6 +3241,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
.addImm(Clamp ? Clamp->getImm() : 0)
.addImm(Omod ? Omod->getImm() : 0);
updateLiveVariables(LV, MI, *MIB);
+ if (LIS)
+ LIS->ReplaceMachineInstrInMaps(MI, *MIB);
return MIB;
}
@@ -3382,6 +3413,7 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
switch (OperandType) {
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_IMM_V2FP32:
@@ -3420,6 +3452,7 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
// This suffers the same problem as the scalar 16-bit cases.
return AMDGPU::isInlinableIntLiteralV216(Imm);
case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
@@ -3440,6 +3473,9 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
uint32_t Trunc = static_cast<uint32_t>(Imm);
return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
}
+ case AMDGPU::OPERAND_KIMM32:
+ case AMDGPU::OPERAND_KIMM16:
+ return false;
default:
llvm_unreachable("invalid bitwidth");
}
@@ -3566,11 +3602,13 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
// Additional verification is needed for sdst/src2.
return true;
}
- case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_F16_e64:
- case AMDGPU::V_FMAC_F32_e64:
+ case AMDGPU::V_MAC_F32_e64:
+ case AMDGPU::V_MAC_LEGACY_F32_e64:
case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_FMAC_F32_e64:
case AMDGPU::V_FMAC_F64_e64:
+ case AMDGPU::V_FMAC_LEGACY_F32_e64:
if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
return false;
@@ -3813,6 +3851,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
break;
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
break;
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
@@ -4472,20 +4511,20 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
- case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
- case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
- case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
- case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
- case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
- case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
- case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
- case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
- case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
- case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
- case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
- case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
- case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
- case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
+ case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
+ case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
+ case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
+ case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
+ case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
+ case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
+ case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
+ case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
+ case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
+ case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
+ case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
+ case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
+ case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
+ case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
@@ -4963,13 +5002,13 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
continue;
}
- if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) &&
+ if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
!isOperandLegal(MI, Idx, &MO)) {
legalizeOpWithMove(MI, Idx);
continue;
}
- if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
+ if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
continue; // VGPRs are legal
// We can use one SGPR in each VOP3 instruction prior to GFX10
@@ -5165,8 +5204,7 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
return;
Register DstReg = MRI.createVirtualRegister(DstRC);
- MachineInstr *Copy =
- BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
+ auto Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
Op.setReg(DstReg);
Op.setSubReg(0);
@@ -5188,7 +5226,7 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
}
if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
!ImpDef)
- Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+ Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
}
// Emit the actual waterfall loop, executing the wrapped instruction for each
@@ -5897,18 +5935,18 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
continue;
case AMDGPU::S_CBRANCH_SCC0:
- case AMDGPU::S_CBRANCH_SCC1:
- // Clear unused bits of vcc
- if (ST.isWave32())
- BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32),
- AMDGPU::VCC_LO)
- .addReg(AMDGPU::EXEC_LO)
- .addReg(AMDGPU::VCC_LO);
- else
- BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
- AMDGPU::VCC)
- .addReg(AMDGPU::EXEC)
- .addReg(AMDGPU::VCC);
+ case AMDGPU::S_CBRANCH_SCC1: {
+ // Clear unused bits of vcc
+ Register CondReg = Inst.getOperand(1).getReg();
+ bool IsSCC = CondReg == AMDGPU::SCC;
+ Register VCC = RI.getVCC();
+ Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
+ .addReg(EXEC)
+ .addReg(IsSCC ? VCC : CondReg);
+ Inst.RemoveOperand(1);
+ }
break;
case AMDGPU::S_BFE_U64:
@@ -6016,12 +6054,43 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
continue;
case AMDGPU::S_CSELECT_B32:
+ lowerSelect32(Worklist, Inst, MDT);
+ Inst.eraseFromParent();
+ continue;
case AMDGPU::S_CSELECT_B64:
- lowerSelect(Worklist, Inst, MDT);
+ splitSelect64(Worklist, Inst, MDT);
Inst.eraseFromParent();
continue;
+ case AMDGPU::S_CMP_EQ_I32:
+ case AMDGPU::S_CMP_LG_I32:
+ case AMDGPU::S_CMP_GT_I32:
+ case AMDGPU::S_CMP_GE_I32:
+ case AMDGPU::S_CMP_LT_I32:
+ case AMDGPU::S_CMP_LE_I32:
+ case AMDGPU::S_CMP_EQ_U32:
+ case AMDGPU::S_CMP_LG_U32:
+ case AMDGPU::S_CMP_GT_U32:
+ case AMDGPU::S_CMP_GE_U32:
+ case AMDGPU::S_CMP_LT_U32:
+ case AMDGPU::S_CMP_LE_U32:
+ case AMDGPU::S_CMP_EQ_U64:
+ case AMDGPU::S_CMP_LG_U64: {
+ const MCInstrDesc &NewDesc = get(NewOpcode);
+ Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
+ MachineInstr *NewInstr =
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg)
+ .add(Inst.getOperand(0))
+ .add(Inst.getOperand(1));
+ legalizeOperands(*NewInstr, MDT);
+ int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC);
+ MachineOperand SCCOp = Inst.getOperand(SCCIdx);
+ addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
+ Inst.eraseFromParent();
+ }
+ continue;
}
+
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
// We cannot move this instruction to the VALU, so we should try to
// legalize its operands instead.
@@ -6167,8 +6236,8 @@ SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
return std::make_pair(false, nullptr);
}
-void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
- MachineDominatorTree *MDT) const {
+void SIInstrInfo::lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -6181,47 +6250,51 @@ void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
MachineOperand &Cond = Inst.getOperand(3);
Register SCCSource = Cond.getReg();
- // Find SCC def, and if that is a copy (SCC = COPY reg) then use reg instead.
- if (!Cond.isUndef()) {
- for (MachineInstr &CandI :
- make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
- Inst.getParent()->rend())) {
- if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
- -1) {
- if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
- SCCSource = CandI.getOperand(1).getReg();
- }
- break;
- }
- }
- }
+ bool IsSCC = (SCCSource == AMDGPU::SCC);
// If this is a trivial select where the condition is effectively not SCC
// (SCCSource is a source of copy to SCC), then the select is semantically
// equivalent to copying SCCSource. Hence, there is no need to create
// V_CNDMASK, we can just use that and bail out.
- if ((SCCSource != AMDGPU::SCC) && Src0.isImm() && (Src0.getImm() == -1) &&
- Src1.isImm() && (Src1.getImm() == 0)) {
+ if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
+ (Src1.getImm() == 0)) {
MRI.replaceRegWith(Dest.getReg(), SCCSource);
return;
}
- const TargetRegisterClass *TC = ST.getWavefrontSize() == 64
- ? &AMDGPU::SReg_64_XEXECRegClass
- : &AMDGPU::SReg_32_XM0_XEXECRegClass;
+ const TargetRegisterClass *TC =
+ RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+
Register CopySCC = MRI.createVirtualRegister(TC);
- if (SCCSource == AMDGPU::SCC) {
- // Insert a trivial select instead of creating a copy, because a copy from
- // SCC would semantically mean just copying a single bit, but we may need
- // the result to be a vector condition mask that needs preserving.
- unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
- : AMDGPU::S_CSELECT_B32;
- auto NewSelect =
- BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
- NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
- } else {
- BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC).addReg(SCCSource);
+ if (IsSCC) {
+ // Now look for the closest SCC def; if it is a copy (SCC = COPY reg), take
+ // the condition from that copy's source register instead of SCCSource.
+ bool CopyFound = false;
+ for (MachineInstr &CandI :
+ make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
+ Inst.getParent()->rend())) {
+ if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
+ -1) {
+ if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
+ BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC)
+ .addReg(CandI.getOperand(1).getReg());
+ CopyFound = true;
+ }
+ break;
+ }
+ }
+ if (!CopyFound) {
+ // SCC def is not a copy
+ // Insert a trivial select instead of creating a copy, because a copy from
+ // SCC would semantically mean just copying a single bit, but we may need
+ // the result to be a vector condition mask that needs preserving.
+ unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
+ : AMDGPU::S_CSELECT_B32;
+ auto NewSelect =
+ BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
+ NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
+ }
}
Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -6232,13 +6305,102 @@ void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
.add(Src1) // False
.addImm(0)
.add(Src0) // True
- .addReg(CopySCC);
+ .addReg(IsSCC ? CopySCC : SCCSource);
MRI.replaceRegWith(Dest.getReg(), ResultReg);
legalizeOperands(*UpdatedInst, MDT);
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
+void SIInstrInfo::splitSelect64(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT) const {
+ // Split S_CSELECT_B64 into a pair of S_CSELECT_B32 and lower them
+ // further.
+ const DebugLoc &DL = Inst.getDebugLoc();
+ MachineBasicBlock::iterator MII = Inst;
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ // Get the original operands.
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ MachineOperand &Cond = Inst.getOperand(3);
+
+ Register SCCSource = Cond.getReg();
+ bool IsSCC = (SCCSource == AMDGPU::SCC);
+
+ // If this is a trivial select where the condition is effectively not SCC
+ // (SCCSource is a source of copy to SCC), then the select is semantically
+ // equivalent to copying SCCSource. Hence, there is no need to create
+ // V_CNDMASK, we can just use that and bail out.
+ if (!IsSCC && (Src0.isImm() && Src0.getImm() == -1) &&
+ (Src1.isImm() && Src1.getImm() == 0)) {
+ MRI.replaceRegWith(Dest.getReg(), SCCSource);
+ return;
+ }
+
+ // Prepare the split destination.
+ Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ // Split the source operands.
+ const TargetRegisterClass *Src0RC = nullptr;
+ const TargetRegisterClass *Src0SubRC = nullptr;
+ if (Src0.isReg()) {
+ Src0RC = MRI.getRegClass(Src0.getReg());
+ Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
+ }
+ const TargetRegisterClass *Src1RC = nullptr;
+ const TargetRegisterClass *Src1SubRC = nullptr;
+ if (Src1.isReg()) {
+ Src1RC = MRI.getRegClass(Src1.getReg());
+ Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
+ }
+ // Split lo.
+ MachineOperand SrcReg0Sub0 =
+ buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+ MachineOperand SrcReg1Sub0 =
+ buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+ // Split hi.
+ MachineOperand SrcReg0Sub1 =
+ buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+ MachineOperand SrcReg1Sub1 =
+ buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
+ // Select the lo part.
+ MachineInstr *LoHalf =
+ BuildMI(MBB, MII, DL, get(AMDGPU::S_CSELECT_B32), DestSub0)
+ .add(SrcReg0Sub0)
+ .add(SrcReg1Sub0);
+ // Replace the condition operand with the original one.
+ LoHalf->getOperand(3).setReg(SCCSource);
+ Worklist.insert(LoHalf);
+ // Select the hi part.
+ MachineInstr *HiHalf =
+ BuildMI(MBB, MII, DL, get(AMDGPU::S_CSELECT_B32), DestSub1)
+ .add(SrcReg0Sub1)
+ .add(SrcReg1Sub1);
+ // Replace the condition operand with the original one.
+ HiHalf->getOperand(3).setReg(SCCSource);
+ Worklist.insert(HiHalf);
+ // Merge them back to the original 64-bit one.
+ BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+ // Try to legalize the operands in case we need to swap the order to keep
+ // it valid.
+ legalizeOperands(*LoHalf, MDT);
+ legalizeOperands(*HiHalf, MDT);
+
+ // Move all users of this moved value.
+ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -6823,8 +6985,8 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
MachineInstr &SCCDefInst,
- SetVectorType &Worklist) const {
- bool SCCUsedImplicitly = false;
+ SetVectorType &Worklist,
+ Register NewCond) const {
// Ensure that def inst defines SCC, which is still live.
assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
@@ -6836,33 +6998,18 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
SCCDefInst.getParent()->end())) {
// Check if SCC is used first.
- if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) {
+ int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI);
+ if (SCCIdx != -1) {
if (MI.isCopy()) {
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
Register DestReg = MI.getOperand(0).getReg();
- for (auto &User : MRI.use_nodbg_instructions(DestReg)) {
- if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) ||
- (User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) {
- User.getOperand(4).setReg(RI.getVCC());
- Worklist.insert(&User);
- } else if (User.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) {
- User.getOperand(5).setReg(RI.getVCC());
- // No need to add to Worklist.
- }
- }
+ MRI.replaceRegWith(DestReg, NewCond);
CopyToDelete.push_back(&MI);
} else {
- if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
- MI.getOpcode() == AMDGPU::S_CSELECT_B64) {
- // This is an implicit use of SCC and it is really expected by
- // the SCC users to handle.
- // We cannot preserve the edge to the user so add the explicit
- // copy: SCC = COPY VCC.
- // The copy will be cleaned up during the processing of the user
- // in lowerSelect.
- SCCUsedImplicitly = true;
- }
+
+ if (NewCond.isValid())
+ MI.getOperand(SCCIdx).setReg(NewCond);
Worklist.insert(&MI);
}
@@ -6873,12 +7020,6 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
}
for (auto &Copy : CopyToDelete)
Copy->eraseFromParent();
-
- if (SCCUsedImplicitly) {
- BuildMI(*SCCDefInst.getParent(), std::next(SCCDefInst.getIterator()),
- SCCDefInst.getDebugLoc(), get(AMDGPU::COPY), AMDGPU::SCC)
- .addReg(RI.getVCC());
- }
}
// Instructions that use SCC may be converted to VALU instructions. When that
@@ -7171,31 +7312,19 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return Size;
}
- // 4-byte instructions may have a 32-bit literal encoded after them. Check
- // operands that coud ever be literals.
+ // Instructions may have a 32-bit literal encoded after them. Check
+ // operands that could ever be literals.
if (isVALU(MI) || isSALU(MI)) {
- int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
- if (Src0Idx == -1)
- return DescSize; // No operands.
-
- if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
- return isVOP3(MI) ? 12 : (DescSize + 4);
-
- int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
- if (Src1Idx == -1)
+ if (isDPP(MI))
return DescSize;
-
- if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
- return isVOP3(MI) ? 12 : (DescSize + 4);
-
- int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
- if (Src2Idx == -1)
- return DescSize;
-
- if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx]))
- return isVOP3(MI) ? 12 : (DescSize + 4);
-
- return DescSize;
+ bool HasLiteral = false;
+ for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
+ if (isLiteralConstant(MI, I)) {
+ HasLiteral = true;
+ break;
+ }
+ }
+ return HasLiteral ? DescSize + 4 : DescSize;
}
// Check whether we have extra NSA words.
@@ -7283,19 +7412,16 @@ void SIInstrInfo::convertNonUniformLoopRegion(
Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
MachineInstrBuilder HeaderPHIBuilder =
BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
- for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
- E = LoopEntry->pred_end();
- PI != E; ++PI) {
- if (*PI == LoopEnd) {
+ for (MachineBasicBlock *PMBB : LoopEntry->predecessors()) {
+ if (PMBB == LoopEnd) {
HeaderPHIBuilder.addReg(BackEdgeReg);
} else {
- MachineBasicBlock *PMBB = *PI;
Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
ZeroReg, 0);
HeaderPHIBuilder.addReg(ZeroReg);
}
- HeaderPHIBuilder.addMBB(*PI);
+ HeaderPHIBuilder.addMBB(PMBB);
}
MachineInstr *HeaderPhi = HeaderPHIBuilder;
MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
@@ -7340,6 +7466,20 @@ SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const
return new GCNHazardRecognizer(MF);
}
+// Called during:
+// - pre-RA scheduling and post-RA scheduling
+ScheduleHazardRecognizer *
+SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
+ const ScheduleDAGMI *DAG) const {
+ // Borrowed from Arm Target
+ // We would like to restrict this hazard recognizer to only
+ // post-RA scheduling; we can tell that we're post-RA because we don't
+ // track VRegLiveness.
+ if (!DAG->hasVRegLiveness())
+ return new GCNHazardRecognizer(DAG->MF);
+ return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
+}
+
std::pair<unsigned, unsigned>
SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
@@ -7919,3 +8059,209 @@ unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
return 0;
}
}
+
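+// Decompose an S_CMP/S_CMPK into its source register(s) and compared
+// immediate so that optimizeCompareInstr can try to fold the compare away.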
+bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int64_t &CmpMask,
+ int64_t &CmpValue) const {
+ if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
+ return false;
+
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case AMDGPU::S_CMP_EQ_U32:
+ case AMDGPU::S_CMP_EQ_I32:
+ case AMDGPU::S_CMP_LG_U32:
+ case AMDGPU::S_CMP_LG_I32:
+ case AMDGPU::S_CMP_LT_U32:
+ case AMDGPU::S_CMP_LT_I32:
+ case AMDGPU::S_CMP_GT_U32:
+ case AMDGPU::S_CMP_GT_I32:
+ case AMDGPU::S_CMP_LE_U32:
+ case AMDGPU::S_CMP_LE_I32:
+ case AMDGPU::S_CMP_GE_U32:
+ case AMDGPU::S_CMP_GE_I32:
+ case AMDGPU::S_CMP_EQ_U64:
+ case AMDGPU::S_CMP_LG_U64:
+ SrcReg = MI.getOperand(0).getReg();
+ if (MI.getOperand(1).isReg()) {
+ if (MI.getOperand(1).getSubReg())
+ return false;
+ SrcReg2 = MI.getOperand(1).getReg();
+ CmpValue = 0;
+ } else if (MI.getOperand(1).isImm()) {
+ SrcReg2 = Register();
+ CmpValue = MI.getOperand(1).getImm();
+ } else {
+ return false;
+ }
+ CmpMask = ~0;
+ return true;
+ case AMDGPU::S_CMPK_EQ_U32:
+ case AMDGPU::S_CMPK_EQ_I32:
+ case AMDGPU::S_CMPK_LG_U32:
+ case AMDGPU::S_CMPK_LG_I32:
+ case AMDGPU::S_CMPK_LT_U32:
+ case AMDGPU::S_CMPK_LT_I32:
+ case AMDGPU::S_CMPK_GT_U32:
+ case AMDGPU::S_CMPK_GT_I32:
+ case AMDGPU::S_CMPK_LE_U32:
+ case AMDGPU::S_CMPK_LE_I32:
+ case AMDGPU::S_CMPK_GE_U32:
+ case AMDGPU::S_CMPK_GE_I32:
+ SrcReg = MI.getOperand(0).getReg();
+ SrcReg2 = Register();
+ CmpValue = MI.getOperand(1).getImm();
+ CmpMask = ~0;
+ return true;
+ }
+
+ return false;
+}
+
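+// Fold an S_CMP of an S_AND with a power-of-two mask into the S_AND itself,
+// or into an S_BITCMP0/S_BITCMP1 when the AND result has no other uses.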
+bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
+ Register SrcReg2, int64_t CmpMask,
+ int64_t CmpValue,
+ const MachineRegisterInfo *MRI) const {
+ if (!SrcReg || SrcReg.isPhysical())
+ return false;
+
+ if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
+ return false;
+
+ const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
+ this](int64_t ExpectedValue, unsigned SrcSize,
+ bool IsReversable, bool IsSigned) -> bool {
+ // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
+ // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
+ // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
+ // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
+ // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
+ // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
+ // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
+ // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
+ // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
+ // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
+ //
+ // Signed ge/gt are not used for the sign bit.
+ //
+ // If result of the AND is unused except in the compare:
+ // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
+ //
+ // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
+ // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
+ // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
+ // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
+ // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
+ // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
+
+ MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
+ if (!Def || Def->getParent() != CmpInstr.getParent())
+ return false;
+
+ if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
+ Def->getOpcode() != AMDGPU::S_AND_B64)
+ return false;
+
+ int64_t Mask;
+ const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
+ if (MO->isImm())
+ Mask = MO->getImm();
+ else if (!getFoldableImm(MO, Mask))
+ return false;
+ Mask &= maxUIntN(SrcSize);
+ return isPowerOf2_64(Mask);
+ };
+
+ MachineOperand *SrcOp = &Def->getOperand(1);
+ if (isMask(SrcOp))
+ SrcOp = &Def->getOperand(2);
+ else if (isMask(&Def->getOperand(2)))
+ SrcOp = &Def->getOperand(1);
+ else
+ return false;
+
+ unsigned BitNo = countTrailingZeros((uint64_t)Mask);
+ if (IsSigned && BitNo == SrcSize - 1)
+ return false;
+
+ ExpectedValue <<= BitNo;
+
+ bool IsReversedCC = false;
+ if (CmpValue != ExpectedValue) {
+ if (!IsReversable)
+ return false;
+ IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
+ if (!IsReversedCC)
+ return false;
+ }
+
+ Register DefReg = Def->getOperand(0).getReg();
+ if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
+ return false;
+
+ for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
+ I != E; ++I) {
+ if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
+ I->killsRegister(AMDGPU::SCC, &RI))
+ return false;
+ }
+
+ MachineOperand *SccDef = Def->findRegisterDefOperand(AMDGPU::SCC);
+ SccDef->setIsDead(false);
+ CmpInstr.eraseFromParent();
+
+ if (!MRI->use_nodbg_empty(DefReg)) {
+ assert(!IsReversedCC);
+ return true;
+ }
+
+ // Replace AND with unused result with a S_BITCMP.
+ MachineBasicBlock *MBB = Def->getParent();
+
+ unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
+ : AMDGPU::S_BITCMP1_B32
+ : IsReversedCC ? AMDGPU::S_BITCMP0_B64
+ : AMDGPU::S_BITCMP1_B64;
+
+ BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
+ .add(*SrcOp)
+ .addImm(BitNo);
+ Def->eraseFromParent();
+
+ return true;
+ };
+
+ switch (CmpInstr.getOpcode()) {
+ default:
+ break;
+ case AMDGPU::S_CMP_EQ_U32:
+ case AMDGPU::S_CMP_EQ_I32:
+ case AMDGPU::S_CMPK_EQ_U32:
+ case AMDGPU::S_CMPK_EQ_I32:
+ return optimizeCmpAnd(1, 32, true, false);
+ case AMDGPU::S_CMP_GE_U32:
+ case AMDGPU::S_CMPK_GE_U32:
+ return optimizeCmpAnd(1, 32, false, false);
+ case AMDGPU::S_CMP_GE_I32:
+ case AMDGPU::S_CMPK_GE_I32:
+ return optimizeCmpAnd(1, 32, false, true);
+ case AMDGPU::S_CMP_EQ_U64:
+ return optimizeCmpAnd(1, 64, true, false);
+ case AMDGPU::S_CMP_LG_U32:
+ case AMDGPU::S_CMP_LG_I32:
+ case AMDGPU::S_CMPK_LG_U32:
+ case AMDGPU::S_CMPK_LG_I32:
+ return optimizeCmpAnd(0, 32, true, false);
+ case AMDGPU::S_CMP_GT_U32:
+ case AMDGPU::S_CMPK_GT_U32:
+ return optimizeCmpAnd(0, 32, false, false);
+ case AMDGPU::S_CMP_GT_I32:
+ case AMDGPU::S_CMPK_GT_I32:
+ return optimizeCmpAnd(0, 32, false, true);
+ case AMDGPU::S_CMP_LG_U64:
+ return optimizeCmpAnd(0, 64, true, false);
+ }
+
+ return false;
+}
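
To make the folding rules in the comment block above concrete, here is a small standalone C++20 sketch (not LLVM code; the function name and the std::optional result are illustrative) of the arithmetic optimizeCmpAnd performs: the AND mask must be a single bit within the compared width, signed ge/gt may not test the sign bit, and a compare against the "wrong" constant is only accepted when it equals the mask-flipped expected value, in which case the condition is treated as reversed.

#include <bit>
#include <cstdint>
#include <optional>

struct FoldInfo {
  unsigned BitNo;  // bit tested by the resulting s_bitcmp
  bool ReversedCC; // compare sense must be inverted (s_bitcmp0 vs s_bitcmp1)
};

// ExpectedValue is 1 for the EQ/GE-style compares and 0 for the LG/GT-style
// ones, mirroring the switch at the end of optimizeCompareInstr.
std::optional<FoldInfo> canFoldCmpAnd(uint64_t Mask, uint64_t CmpValue,
                                      uint64_t ExpectedValue, unsigned SrcSize,
                                      bool IsReversible, bool IsSigned) {
  Mask &= SrcSize == 64 ? ~0ull : (1ull << SrcSize) - 1;
  if (!std::has_single_bit(Mask))       // mask must be exactly 1 << n
    return std::nullopt;
  unsigned BitNo = std::countr_zero(Mask);
  if (IsSigned && BitNo == SrcSize - 1) // signed ge/gt never test the sign bit
    return std::nullopt;
  ExpectedValue <<= BitNo;
  bool ReversedCC = false;
  if (CmpValue != ExpectedValue) {
    if (!IsReversible || CmpValue != (ExpectedValue ^ Mask))
      return std::nullopt;
    ReversedCC = true;
  }
  return FoldInfo{BitNo, ReversedCC};
}

For instance, canFoldCmpAnd(0x10, 0, /*ExpectedValue=*/1, 32, true, false) reports bit 4 with a reversed condition, matching the "s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n" row in the table above.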
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index fc5e5be03541..dd9ea2b53ca2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -78,8 +78,11 @@ private:
moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT = nullptr) const;
- void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
- MachineDominatorTree *MDT = nullptr) const;
+ void lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT = nullptr) const;
+
+ void splitSelect64(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT = nullptr) const;
void lowerScalarAbs(SetVectorType &Worklist,
MachineInstr &Inst) const;
@@ -122,7 +125,8 @@ private:
void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
MachineInstr &SCCDefInst,
- SetVectorType &Worklist) const;
+ SetVectorType &Worklist,
+ Register NewCond = Register()) const;
void addSCCDefsToVALUWorklist(MachineOperand &Op,
SetVectorType &Worklist) const;
@@ -271,11 +275,10 @@ public:
MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override;
- unsigned insertIndirectBranch(MachineBasicBlock &MBB,
- MachineBasicBlock &NewDestBB,
- const DebugLoc &DL,
- int64_t BrOffset,
- RegScavenger *RS = nullptr) const override;
+ void insertIndirectBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock &NewDestBB,
+ MachineBasicBlock &RestoreBB, const DebugLoc &DL,
+ int64_t BrOffset, RegScavenger *RS) const override;
bool analyzeBranchImpl(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
@@ -315,6 +318,14 @@ public:
Register DstReg, ArrayRef<MachineOperand> Cond,
Register TrueReg, Register FalseReg) const;
+ bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int64_t &CmpMask,
+ int64_t &CmpValue) const override;
+
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
+ Register SrcReg2, int64_t CmpMask, int64_t CmpValue,
+ const MachineRegisterInfo *MRI) const override;
+
unsigned getAddressSpaceForPseudoSourceKind(
unsigned Kind) const override;
@@ -322,16 +333,15 @@ public:
areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
const MachineInstr &MIb) const override;
- bool isFoldableCopy(const MachineInstr &MI) const;
+ static bool isFoldableCopy(const MachineInstr &MI);
bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
MachineRegisterInfo *MRI) const final;
unsigned getMachineCSELookAheadLimit() const override { return 500; }
- MachineInstr *convertToThreeAddress(MachineFunction::iterator &MBB,
- MachineInstr &MI,
- LiveVariables *LV) const override;
+ MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
+ LiveIntervals *LIS) const override;
bool isSchedulingBoundary(const MachineInstr &MI,
const MachineBasicBlock *MBB,
@@ -1036,6 +1046,10 @@ public:
ScheduleHazardRecognizer *
CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override;
+ ScheduleHazardRecognizer *
+ CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
+ const ScheduleDAGMI *DAG) const override;
+
bool isBasicBlockPrologue(const MachineInstr &MI) const override;
MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB,
@@ -1119,6 +1133,8 @@ public:
}
static unsigned getDSShaderTypeValue(const MachineFunction &MF);
+
+ const TargetSchedModel &getSchedModel() const { return SchedModel; }
};
/// \brief Returns true if a reg:subreg pair P has a TRC class
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 25b647d34ec1..8c24268e379e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -333,6 +333,18 @@ def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr)> {
let IsNonExtLoad = 1;
}
+def atomic_load_8_glue : PatFrag<(ops node:$ptr),
+ (AMDGPUatomic_ld_glue node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i8;
+}
+
+def atomic_load_16_glue : PatFrag<(ops node:$ptr),
+ (AMDGPUatomic_ld_glue node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i16;
+}
+
def atomic_load_32_glue : PatFrag<(ops node:$ptr),
(AMDGPUatomic_ld_glue node:$ptr)> {
let IsAtomic = 1;
@@ -423,6 +435,14 @@ def load_align16_local_m0 : PatFrag<(ops node:$ptr),
} // End IsLoad = 1
let IsAtomic = 1, AddressSpaces = LoadAddress_local.AddrSpaces in {
+def atomic_load_8_local_m0 : PatFrag<(ops node:$ptr),
+ (atomic_load_8_glue node:$ptr)> {
+ let MemoryVT = i8;
+}
+def atomic_load_16_local_m0 : PatFrag<(ops node:$ptr),
+ (atomic_load_16_glue node:$ptr)> {
+ let MemoryVT = i16;
+}
def atomic_load_32_local_m0 : PatFrag<(ops node:$ptr),
(atomic_load_32_glue node:$ptr)> {
let MemoryVT = i32;
@@ -509,6 +529,18 @@ def store_align16_local_m0 : PatFrag <(ops node:$value, node:$ptr),
let AddressSpaces = StoreAddress_local.AddrSpaces in {
+def atomic_store_local_8_m0 : PatFrag <
+ (ops node:$value, node:$ptr),
+ (AMDGPUatomic_st_glue node:$value, node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i8;
+}
+def atomic_store_local_16_m0 : PatFrag <
+ (ops node:$value, node:$ptr),
+ (AMDGPUatomic_st_glue node:$value, node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i16;
+}
def atomic_store_local_32_m0 : PatFrag <
(ops node:$value, node:$ptr),
(AMDGPUatomic_st_glue node:$value, node:$ptr)> {
@@ -527,15 +559,7 @@ def atomic_store_local_64_m0 : PatFrag <
def si_setcc_uniform : PatFrag <
(ops node:$lhs, node:$rhs, node:$cond),
(setcc node:$lhs, node:$rhs, node:$cond), [{
- for (SDNode *Use : N->uses()) {
- if (Use->isMachineOpcode() || Use->getOpcode() != ISD::CopyToReg)
- return false;
-
- unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
- if (Reg != AMDGPU::SCC)
- return false;
- }
- return true;
+ return !N->isDivergent();
}]>;
//===----------------------------------------------------------------------===//
@@ -1181,6 +1205,7 @@ class kimmOperand<ValueType vt> : Operand<vt> {
let OperandType = "OPERAND_KIMM"#vt.Size;
let PrintMethod = "printU"#vt.Size#"ImmOperand";
let ParserMatchClass = !cast<AsmOperandClass>("KImmFP"#vt.Size#"MatchClass");
+ let DecoderMethod = "decodeOperand_f"#vt.Size#"kimm";
}
// 32-bit VALU immediate operand that uses the constant bus.
@@ -1864,8 +1889,8 @@ class getAsm64 <bit HasDst, int NumSrcArgs, bit HasIntClamp, bit HasModifiers,
// Returns the assembly string for the inputs and outputs of a VOP3P
// instruction.
-class getAsmVOP3P <bit HasDst, int NumSrcArgs, bit HasModifiers,
- bit HasClamp, ValueType DstVT = i32> {
+class getAsmVOP3P <int NumSrcArgs, bit HasModifiers,
+ bit HasClamp> {
string dst = "$vdst";
string src0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
string src1 = !if(!eq(NumSrcArgs, 1), "",
@@ -1883,7 +1908,6 @@ class getAsmVOP3P <bit HasDst, int NumSrcArgs, bit HasModifiers,
class getAsmVOP3OpSel <int NumSrcArgs,
bit HasClamp,
- bit HasOMod,
bit Src0HasMods,
bit Src1HasMods,
bit Src2HasMods> {
@@ -2026,8 +2050,7 @@ class getHasSDWA <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
);
}
-class getHasDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
- ValueType Src1VT = i32> {
+class getHasDPP <int NumSrcArgs> {
bit ret = !if(!eq(NumSrcArgs, 3),
0, // NumSrcArgs == 3 - No DPP for VOP3
1);
@@ -2035,14 +2058,14 @@ class getHasDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
class getHasExt64BitDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
ValueType Src1VT = i32> {
- bit ret = !and(getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret,
+ bit ret = !and(getHasDPP<NumSrcArgs>.ret,
getHas64BitOps<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
}
// Function that checks if instruction supports DPP and SDWA
class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
ValueType Src1VT = i32> {
- bit ret = !or(getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret,
+ bit ret = !or(getHasDPP<NumSrcArgs>.ret,
getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
}
@@ -2146,7 +2169,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
field bit HasSrc2Mods = !if(HasModifiers, !or(HasSrc2FloatMods, HasSrc2IntMods), 0);
field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
- field bit HasExtDPP = getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
+ field bit HasExtDPP = getHasDPP<NumSrcArgs>.ret;
field bit HasExt64BitDPP = getHasExt64BitDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExtSDWA = getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExtSDWA9 = HasExtSDWA;
@@ -2197,9 +2220,9 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret;
field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasIntClamp, HasModifiers, HasOMod, DstVT>.ret;
- field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasModifiers, HasClamp, DstVT>.ret;
+ field string AsmVOP3P = getAsmVOP3P<NumSrcArgs, HasModifiers, HasClamp>.ret;
field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs,
- HasClamp, HasOMod,
+ HasClamp,
HasSrc0FloatMods,
HasSrc1FloatMods,
HasSrc2FloatMods>.ret;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index fbf4634bfc94..d5f9cb8ba493 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1017,22 +1017,33 @@ def : GCNPat <
/********** Extraction, Insertion, Building and Casting **********/
/********** ============================================ **********/
-foreach Index = 0-2 in {
- def Extract_Element_v2i32_#Index : Extract_Element <
- i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
+// Special case for 2 element vectors. REG_SEQUENCE produces better code
+// than an INSERT_SUBREG.
+multiclass Insert_Element_V2<RegisterClass RC, ValueType elem_type, ValueType vec_type> {
+ def : GCNPat <
+ (insertelt vec_type:$vec, elem_type:$elem, 0),
+ (REG_SEQUENCE RC, $elem, sub0, (elem_type (EXTRACT_SUBREG $vec, sub1)), sub1)
>;
- def Insert_Element_v2i32_#Index : Insert_Element <
+
+ def : GCNPat <
+ (insertelt vec_type:$vec, elem_type:$elem, 1),
+ (REG_SEQUENCE RC, (elem_type (EXTRACT_SUBREG $vec, sub0)), sub0, $elem, sub1)
+ >;
+}
+
+foreach Index = 0-1 in {
+ def Extract_Element_v2i32_#Index : Extract_Element <
i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Extract_Element_v2f32_#Index : Extract_Element <
f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
>;
- def Insert_Element_v2f32_#Index : Insert_Element <
- f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
- >;
}
+defm : Insert_Element_V2 <SReg_64, i32, v2i32>;
+defm : Insert_Element_V2 <SReg_64, f32, v2f32>;
+
foreach Index = 0-2 in {
def Extract_Element_v3i32_#Index : Extract_Element <
i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -1860,40 +1871,92 @@ def : GCNPat <
// Conversion Patterns
//===----------------------------------------------------------------------===//
-def : GCNPat<(i32 (sext_inreg i32:$src, i1)),
+class UniformSextInreg<ValueType VT> : PatFrag<
+ (ops node:$src),
+ (sext_inreg $src, VT),
+ [{ return !N->isDivergent(); }]>;
+
+def : GCNPat<(i32 (UniformSextInreg<i1> i32:$src)),
(S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16
// Handle sext_inreg in i64
def : GCNPat <
- (i64 (sext_inreg i64:$src, i1)),
+ (i64 (UniformSextInreg<i1> i64:$src)),
(S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16
>;
def : GCNPat <
- (i16 (sext_inreg i16:$src, i1)),
+ (i16 (UniformSextInreg<i1> i16:$src)),
(S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16
>;
def : GCNPat <
- (i16 (sext_inreg i16:$src, i8)),
+ (i16 (UniformSextInreg<i8> i16:$src)),
(S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
>;
def : GCNPat <
- (i64 (sext_inreg i64:$src, i8)),
+ (i64 (UniformSextInreg<i8> i64:$src)),
(S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16
>;
def : GCNPat <
- (i64 (sext_inreg i64:$src, i16)),
+ (i64 (UniformSextInreg<i16> i64:$src)),
(S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16
>;
def : GCNPat <
- (i64 (sext_inreg i64:$src, i32)),
+ (i64 (UniformSextInreg<i32> i64:$src)),
(S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
>;
+
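As a side note on the magic constants in the S_BFE patterns above (0x10000, 0x80000, ...): the scalar bitfield-extract immediate packs the bit offset in the low bits and the field width starting at bit 16, which is exactly what the "0 | N << 16" comments spell out. A tiny hypothetical helper (not part of LLVM; the hardware field masks are omitted for brevity) makes the encoding checkable:

#include <cstdint>

constexpr uint32_t sBfeImm(uint32_t Offset, uint32_t Width) {
  return Offset | (Width << 16); // offset | width << 16
}

static_assert(sBfeImm(0, 1) == 0x10000, "sext_inreg ..., i1");
static_assert(sBfeImm(0, 8) == 0x80000, "sext_inreg ..., i8");
static_assert(sBfeImm(0, 16) == 0x100000, "sext_inreg ..., i16");
static_assert(sBfeImm(0, 32) == 0x200000, "sext_inreg ..., i32");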
+class DivergentSextInreg<ValueType VT> : PatFrag<
+ (ops node:$src),
+ (sext_inreg $src, VT),
+ [{ return N->isDivergent(); }]>;
+
+def : GCNPat<(i32 (DivergentSextInreg<i1> i32:$src)),
+ (V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>;
+
+def : GCNPat <
+ (i16 (DivergentSextInreg<i1> i16:$src)),
+ (V_BFE_I32_e64 $src, (i32 0), (i32 1)) // 0 | 1 << 16
+>;
+
+def : GCNPat <
+ (i16 (DivergentSextInreg<i8> i16:$src)),
+ (V_BFE_I32_e64 $src, (i32 0), (i32 8)) // 0 | 8 << 16
+>;
+
+def : GCNPat <
+ (i64 (DivergentSextInreg<i1> i64:$src)),
+ (REG_SEQUENCE VReg_64,
+ (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1)), sub0,
+ (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1))), sub1)
+>;
+
+def : GCNPat <
+ (i64 (DivergentSextInreg<i8> i64:$src)),
+ (REG_SEQUENCE VReg_64,
+ (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8)/* 0 | 8 << 16 */), sub0,
+ (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8))), sub1)
+>;
+
+def : GCNPat <
+ (i64 (DivergentSextInreg<i16> i64:$src)),
+ (REG_SEQUENCE VReg_64,
+ (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16)/* 0 | 16 << 16 */), sub0,
+ (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16))), sub1)
+>;
+
+def : GCNPat <
+ (i64 (DivergentSextInreg<i32> i64:$src)),
+ (REG_SEQUENCE VReg_64,
+ (i32 (EXTRACT_SUBREG i64:$src, sub0)), sub0,
+ (V_ASHRREV_I32_e32 (i32 31), (i32 (EXTRACT_SUBREG i64:$src, sub0))), sub1)
+>;
+
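The 64-bit DivergentSextInreg expansions above are easier to read with a scalar model in mind: sign-extend within the low 32-bit half (V_BFE_I32), then produce the high half by arithmetically shifting that result right by 31. A minimal standalone C++20 sketch of that decomposition (not LLVM code; Bits is assumed to be 1, 8, 16, or 32):

#include <cstdint>

int64_t sextInreg64(uint64_t Src, unsigned Bits) {
  uint32_t Lo = static_cast<uint32_t>(Src);      // EXTRACT_SUBREG $src, sub0
  int32_t LoSext = static_cast<int32_t>(Lo << (32 - Bits)) >>
                   (32 - Bits);                  // V_BFE_I32 lo, 0, Bits
  int32_t Hi = LoSext >> 31;                     // V_ASHRREV_I32 31, <bfe result>
  return (static_cast<uint64_t>(static_cast<uint32_t>(Hi)) << 32) |
         static_cast<uint32_t>(LoSext);          // REG_SEQUENCE ..., sub0, ..., sub1
}

For the i32 case the shift amount is zero, which is why the last pattern skips the BFE entirely and feeds the low subregister straight into the REG_SEQUENCE.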
def : GCNPat <
(i64 (zext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
@@ -2097,6 +2160,22 @@ def : GCNPat <
>;
def : GCNPat <
+ (i1 (UniformUnaryFrag<trunc> i32:$a)),
+ (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1))
+>;
+
+def : GCNPat <
+ (i1 (UniformUnaryFrag<trunc> i16:$a)),
+ (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1))
+>;
+
+def : GCNPat <
+ (i1 (UniformUnaryFrag<trunc> i64:$a)),
+ (S_CMP_EQ_U32 (S_AND_B32 (i32 1),
+ (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
+>;
+
+def : GCNPat <
(i1 (trunc i32:$a)),
(V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
>;
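
The uniform trunc-to-i1 patterns added above encode the usual "keep only bit 0" rule: the scalar pipeline ANDs with 1 and compares the result against 1, leaving the boolean in SCC. A one-line scalar model (plain C++, not LLVM):

#include <cstdint>

bool truncToI1(uint64_t A) { return (A & 1u) == 1u; } // S_AND_B32 1, a; S_CMP_EQ_U32 ..., 1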
@@ -2278,31 +2357,37 @@ let SubtargetPredicate = NotHasMinMaxDenormModes in {
let OtherPredicates = [HasDLInsts] in {
+// Don't allow source modifiers. If there are any source modifiers then it's
+// better to select fma instead of fmac.
def : GCNPat <
- (fma (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)),
- (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
+ (fma (f32 (VOP3NoMods f32:$src0)),
+ (f32 (VOP3NoMods f32:$src1)),
(f32 (VOP3NoMods f32:$src2))),
- (V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+ (V_FMAC_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
} // End OtherPredicates = [HasDLInsts]
let SubtargetPredicate = isGFX10Plus in
+// Don't allow source modifiers. If there are any source modifiers then it's
+// better to select fma instead of fmac.
def : GCNPat <
- (fma (f16 (VOP3Mods f32:$src0, i32:$src0_modifiers)),
- (f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
+ (fma (f16 (VOP3NoMods f32:$src0)),
+ (f16 (VOP3NoMods f32:$src1)),
(f16 (VOP3NoMods f32:$src2))),
- (V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+ (V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
let SubtargetPredicate = isGFX90APlus in
+// Don't allow source modifiers. If there are any source modifiers then it's
+// better to select fma instead of fmac.
def : GCNPat <
- (fma (f64 (VOP3Mods0 f64:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
- (f64 (VOP3Mods f64:$src1, i32:$src1_modifiers)),
+ (fma (f64 (VOP3NoMods f64:$src0)),
+ (f64 (VOP3NoMods f64:$src1)),
(f64 (VOP3NoMods f64:$src2))),
- (V_FMAC_F64_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
- SRCMODS.NONE, $src2, $clamp, $omod)
+ (V_FMAC_F64_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
+ SRCMODS.NONE, $src2)
>;
// COPY is workaround tablegen bug from multiple outputs
@@ -2656,12 +2741,20 @@ class AMDGPUGenericInstruction : GenericInstruction {
let Namespace = "AMDGPU";
}
+// Returns -1 if the input is zero.
def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type1:$src);
let hasSideEffects = 0;
}
+// Returns -1 if the input is zero.
+def G_AMDGPU_FFBL_B32 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$src);
+ let hasSideEffects = 0;
+}
+
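For readers unfamiliar with the FFBH/FFBL naming, the intent of the two generic opcodes above appears to be "find first bit from the high end" and "find first bit from the low end", both returning -1 when the input is zero. Treat the exact bit-counting convention as an assumption here; the sketch below (plain C++20, not LLVM) only illustrates that reading:

#include <bit>
#include <cstdint>

int ffbh_u32(uint32_t V) { return V ? std::countl_zero(V) : -1; } // first set bit from the MSB side
int ffbl_b32(uint32_t V) { return V ? std::countr_zero(V) : -1; } // first set bit from the LSB side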
def G_AMDGPU_RCP_IFLAG : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type1:$src);
@@ -2854,3 +2947,16 @@ def G_AMDGPU_INTRIN_BVH_INTERSECT_RAY : AMDGPUGenericInstruction {
let mayLoad = 1;
let mayStore = 0;
}
+
+// Generic instruction for SI_CALL, so we can select the register bank and insert a waterfall loop
+// if necessary.
+def G_SI_CALL : AMDGPUGenericInstruction {
+ let OutOperandList = (outs SReg_64:$dst);
+ let InOperandList = (ins type0:$src0, unknown:$callee);
+ let Size = 4;
+ let isCall = 1;
+ let UseNamedOperandTable = 1;
+ let SchedRW = [WriteBranch];
+ // TODO: Should really base this on the call target
+ let isConvergent = 1;
+}
diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
index d560b477b8ba..4fa8ec711134 100644
--- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
@@ -140,11 +140,7 @@ bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) {
bool MadeChange = false;
for (MachineBasicBlock &MBB : MF) {
- MachineBasicBlock::iterator I, Next;
- for (I = MBB.begin(); I != MBB.end(); I = Next) {
- Next = std::next(I);
- MachineInstr &MI = *I;
-
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
switch (MI.getOpcode()) {
case AMDGPU::S_BRANCH:
// Optimize out branches to the next block.
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 493c1ad87f93..34cbb49dcd16 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -146,7 +146,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
if (!AddrOp->isReg())
return false;
- // TODO: We should be able to merge physical reg addreses.
+ // TODO: We should be able to merge physical reg addresses.
if (AddrOp->getReg().isPhysical())
return false;
@@ -303,6 +303,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 2;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
return 4;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ return 8;
case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH;
case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH;
@@ -343,6 +345,9 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
return UNKNOWN;
+ // Ignore BVH instructions
+ if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
+ return UNKNOWN;
// TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
TII.isGather4(Opc))
@@ -369,6 +374,7 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return S_BUFFER_LOAD_IMM;
case AMDGPU::DS_READ_B32:
case AMDGPU::DS_READ_B32_gfx9:
@@ -380,15 +386,6 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::DS_WRITE_B64:
case AMDGPU::DS_WRITE_B64_gfx9:
return DS_WRITE;
- case AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa:
- case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa:
- case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa:
- case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa:
- case AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa:
- case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa:
- case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa:
- case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa:
- return UNKNOWN;
}
}
@@ -419,6 +416,7 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
}
}
@@ -469,6 +467,7 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
Result.SBase = true;
return Result;
case AMDGPU::DS_READ_B32:
@@ -653,7 +652,7 @@ static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
}
// This function assumes that \p A and \p B have are identical except for
-// size and offset, and they referecne adjacent memory.
+// size and offset, and they reference adjacent memory.
static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
const MachineMemOperand *A,
const MachineMemOperand *B) {
@@ -863,6 +862,7 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
return false;
case 2:
case 4:
+ case 8:
return true;
}
}
@@ -1529,45 +1529,62 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
case 4:
return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
+ case 8:
+ return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
}
case MIMG:
- assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width));
+ assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
+ "No overlaps");
return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
}
}
std::pair<unsigned, unsigned>
-SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) {
+SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
+ const CombineInfo &Paired) {
- if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4)
- return std::make_pair(0, 0);
+ assert(CI.Width != 0 && Paired.Width != 0 && "Width cannot be zero");
bool ReverseOrder;
if (CI.InstClass == MIMG) {
- assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
- "No overlaps");
+ assert(
+ (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
+ "No overlaps");
ReverseOrder = CI.DMask > Paired.DMask;
} else
ReverseOrder = CI.Offset > Paired.Offset;
- static const unsigned Idxs[4][4] = {
- {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
- {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
- {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
- {AMDGPU::sub3, 0, 0, 0},
- };
unsigned Idx0;
unsigned Idx1;
- assert(CI.Width >= 1 && CI.Width <= 3);
- assert(Paired.Width >= 1 && Paired.Width <= 3);
+ if (CI.Width + Paired.Width > 4) {
+ assert(CI.Width == 4 && Paired.Width == 4);
- if (ReverseOrder) {
- Idx1 = Idxs[0][Paired.Width - 1];
- Idx0 = Idxs[Paired.Width][CI.Width - 1];
+ if (ReverseOrder) {
+ Idx1 = AMDGPU::sub0_sub1_sub2_sub3;
+ Idx0 = AMDGPU::sub4_sub5_sub6_sub7;
+ } else {
+ Idx0 = AMDGPU::sub0_sub1_sub2_sub3;
+ Idx1 = AMDGPU::sub4_sub5_sub6_sub7;
+ }
} else {
- Idx0 = Idxs[0][CI.Width - 1];
- Idx1 = Idxs[CI.Width][Paired.Width - 1];
+ static const unsigned Idxs[4][4] = {
+ {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
+ {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
+ {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
+ {AMDGPU::sub3, 0, 0, 0},
+ };
+
+ assert(CI.Width >= 1 && CI.Width <= 3);
+ assert(Paired.Width >= 1 && Paired.Width <= 3);
+
+ if (ReverseOrder) {
+ Idx1 = Idxs[0][Paired.Width - 1];
+ Idx0 = Idxs[Paired.Width][CI.Width - 1];
+ } else {
+ Idx0 = Idxs[0][CI.Width - 1];
+ Idx1 = Idxs[CI.Width][Paired.Width - 1];
+ }
}
return std::make_pair(Idx0, Idx1);
@@ -2048,7 +2065,7 @@ SILoadStoreOptimizer::collectMergeableInsts(
// adjacent to each other in the list, which will make it easier to find
// matches.
MergeList.sort(
- [] (const CombineInfo &A, CombineInfo &B) {
+ [] (const CombineInfo &A, const CombineInfo &B) {
return A.Offset < B.Offset;
});
++I;
@@ -2140,7 +2157,7 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
MachineBasicBlock::iterator NewMI =
mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
- OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
+ OptimizeListAgain |= (CI.Width + Paired.Width) < 8;
break;
}
case BUFFER_LOAD: {
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 0f2836e1e7fb..3168bcd53eda 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -13,7 +13,7 @@
/// All control flow is handled using predicated instructions and
/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
-/// by writting to the 64-bit EXEC register (each bit corresponds to a
+/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU). Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each
/// Vector ALU) and then the ScalarALU will AND the VCC register with the
@@ -38,7 +38,8 @@
/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 // Do the IF block of the branch
///
/// label0:
-/// %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0 // Restore the exec mask for the Then block
+/// %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0 // Restore the exec mask for the Then
+/// // block
/// %exec = S_XOR_B64 %sgpr0, %exec // Update the exec mask
/// S_BRANCH_EXECZ label1 // Use our branch optimization
/// // instruction again.
@@ -52,6 +53,8 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
using namespace llvm;
@@ -69,6 +72,8 @@ private:
const SIRegisterInfo *TRI = nullptr;
const SIInstrInfo *TII = nullptr;
LiveIntervals *LIS = nullptr;
+ LiveVariables *LV = nullptr;
+ MachineDominatorTree *MDT = nullptr;
MachineRegisterInfo *MRI = nullptr;
SetVector<MachineInstr*> LoweredEndCf;
DenseSet<Register> LoweredIf;
@@ -141,6 +146,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
// Should preserve the same set that TwoAddressInstructions does.
+ AU.addPreserved<MachineDominatorTree>();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
AU.addPreservedID(LiveVariablesID);
@@ -234,6 +240,8 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp)
.addReg(CopyReg)
.add(Cond);
+ if (LV)
+ LV->replaceKillInstruction(Cond.getReg(), MI, *And);
setImpSCCDefDead(*And, true);
@@ -251,6 +259,8 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
MachineInstr *SetExec =
BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec)
.addReg(Tmp, RegState::Kill);
+ if (LV)
+ LV->getVarInfo(Tmp).Kills.push_back(SetExec);
// Skip ahead to the unconditional branch in case there are other terminators
// present.
@@ -304,6 +314,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
MachineInstr *OrSaveExec =
BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
.add(MI.getOperand(1)); // Saved EXEC
+ if (LV)
+ LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *OrSaveExec);
MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
@@ -377,15 +389,22 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndReg)
.addReg(Exec)
.add(MI.getOperand(1));
+ if (LV)
+ LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *And);
Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
.addReg(AndReg)
.add(MI.getOperand(2));
if (LIS)
LIS->createAndComputeVirtRegInterval(AndReg);
- } else
+ } else {
Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
.add(MI.getOperand(1))
.add(MI.getOperand(2));
+ if (LV)
+ LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *Or);
+ }
+ if (LV)
+ LV->replaceKillInstruction(MI.getOperand(2).getReg(), MI, *Or);
if (LIS) {
if (And)
@@ -471,6 +490,14 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock *SplitBB = &MBB;
if (NeedBlockSplit) {
SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS);
+ if (MDT && SplitBB != &MBB) {
+ MachineDomTreeNode *MBBNode = (*MDT)[&MBB];
+ SmallVector<MachineDomTreeNode *> Children(MBBNode->begin(),
+ MBBNode->end());
+ MachineDomTreeNode *SplitBBNode = MDT->addNewBlock(SplitBB, &MBB);
+ for (MachineDomTreeNode *Child : Children)
+ MDT->changeImmediateDominator(Child, SplitBBNode);
+ }
Opcode = OrTermrOpc;
InsPt = MI;
}
@@ -479,6 +506,8 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec)
.addReg(Exec)
.add(MI.getOperand(0));
+ if (LV)
+ LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *NewMI);
LoweredEndCf.insert(NewMI);
@@ -570,7 +599,12 @@ void SILowerControlFlow::optimizeEndCf() {
LLVM_DEBUG(dbgs() << "Skip redundant "; MI->dump());
if (LIS)
LIS->RemoveMachineInstrFromMaps(*MI);
+ Register Reg;
+ if (LV)
+ Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
MI->eraseFromParent();
+ if (LV)
+ LV->recomputeForSingleDefVirtReg(Reg);
removeMBBifRedundant(MBB);
}
}
@@ -686,6 +720,8 @@ void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
.addReg(InputReg)
.addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
+ if (LV)
+ LV->recomputeForSingleDefVirtReg(InputReg);
auto BfmMI =
BuildMI(*MBB, FirstMI, DL,
TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
@@ -694,6 +730,8 @@ void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
.addReg(CountReg, RegState::Kill)
.addImm(WavefrontSize);
+ if (LV)
+ LV->getVarInfo(CountReg).Kills.push_back(CmpMI);
auto CmovMI =
BuildMI(*MBB, FirstMI, DL,
TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
@@ -719,23 +757,6 @@ void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
}
bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
- auto GetFallThroughSucc = [=](MachineBasicBlock *B) -> MachineBasicBlock * {
- auto *S = B->getNextNode();
- if (!S)
- return nullptr;
- if (B->isSuccessor(S)) {
- // The only fallthrough candidate
- MachineBasicBlock::iterator I(B->getFirstInstrTerminator());
- MachineBasicBlock::iterator E = B->end();
- for (; I != E; I++) {
- if (I->isBranch() && TII->getBranchDestBlock(*I) == S)
- // We have unoptimized branch to layout successor
- return nullptr;
- }
- }
- return S;
- };
-
for (auto &I : MBB.instrs()) {
if (!I.isDebugInstr() && !I.isUnconditionalBranch())
return false;
@@ -748,7 +769,7 @@ bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
while (!MBB.predecessors().empty()) {
MachineBasicBlock *P = *MBB.pred_begin();
- if (GetFallThroughSucc(P) == &MBB)
+ if (P->getFallThrough() == &MBB)
FallThrough = P;
P->ReplaceUsesOfBlockWith(&MBB, Succ);
}
@@ -757,10 +778,19 @@ bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
for (auto &I : MBB.instrs())
LIS->RemoveMachineInstrFromMaps(I);
}
+ if (MDT) {
+ // If Succ, the single successor of MBB, is dominated by MBB, update MDT by
+ // changing Succ's immediate dominator to MBB's immediate dominator;
+ // otherwise, MBB must be a leaf node in MDT and can be erased directly.
+ if (MDT->dominates(&MBB, Succ))
+ MDT->changeImmediateDominator(MDT->getNode(Succ),
+ MDT->getNode(&MBB)->getIDom());
+ MDT->eraseNode(&MBB);
+ }
MBB.clear();
MBB.eraseFromParent();
if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) {
- if (!GetFallThroughSucc(Succ)) {
+ if (!Succ->canFallThrough()) {
MachineFunction *MF = FallThrough->getParent();
MachineFunction::iterator FallThroughPos(FallThrough);
MF->splice(std::next(FallThroughPos), Succ);
@@ -780,6 +810,9 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
// This doesn't actually need LiveIntervals, but we can preserve them.
LIS = getAnalysisIfAvailable<LiveIntervals>();
+ // This doesn't actually need LiveVariables, but we can preserve them.
+ LV = getAnalysisIfAvailable<LiveVariables>();
+ MDT = getAnalysisIfAvailable<MachineDominatorTree>();
MRI = &MF.getRegInfo();
BoolRC = TRI->getBoolRC();
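
The two MachineDominatorTree updates added in this file (re-parenting the children of the split block in emitEndCf, and re-homing the successor's idom before erasing a block in removeMBBifRedundant) follow a simple rule that a toy model may make clearer. This is a deliberately simplified sketch with a string-keyed map, not LLVM's MachineDominatorTree API:

#include <map>
#include <string>

struct ToyDomTree {
  std::map<std::string, std::string> IDom; // block -> immediate dominator

  // emitEndCf-style update: MBB was split and the tail becomes SplitBB.
  void splitBlock(const std::string &MBB, const std::string &SplitBB) {
    for (auto &Entry : IDom)
      if (Entry.second == MBB)
        Entry.second = SplitBB; // former children of MBB now hang off SplitBB
    IDom[SplitBB] = MBB;        // SplitBB itself is immediately dominated by MBB
  }

  // removeMBBifRedundant-style update: MBB is erased, Succ is its only successor.
  void eraseBlock(const std::string &MBB, const std::string &Succ) {
    auto It = IDom.find(Succ);
    if (It != IDom.end() && It->second == MBB)
      It->second = IDom[MBB];   // Succ's idom becomes MBB's idom
    IDom.erase(MBB);            // otherwise MBB was a dom-tree leaf and just goes away
  }
};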
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 38b9d85b653b..55196fe334e6 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -35,7 +35,6 @@ class SILowerSGPRSpills : public MachineFunctionPass {
private:
const SIRegisterInfo *TRI = nullptr;
const SIInstrInfo *TII = nullptr;
- VirtRegMap *VRM = nullptr;
LiveIntervals *LIS = nullptr;
// Save and Restore blocks of the current function. Typically there is a
@@ -289,7 +288,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
- VRM = getAnalysisIfAvailable<VirtRegMap>();
LIS = getAnalysisIfAvailable<LiveIntervals>();
assert(SaveBlocks.empty() && RestoreBlocks.empty());
@@ -334,11 +332,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
for (MachineBasicBlock &MBB : MF) {
- MachineBasicBlock::iterator Next;
- for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
- MachineInstr &MI = *I;
- Next = std::next(I);
-
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
if (!TII->isSGPRSpill(MI))
continue;
@@ -369,11 +363,17 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
SpillFIs[MI.getOperand(0).getIndex()]) {
MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
- MI.getOperand(0).setIsDebug();
}
}
}
+ // Frame indices that are dead by now should be removed from the function
+ // frame. Otherwise, later pass(es) such as "stack slot coloring" may re-map
+ // freed frame index ids, which in turn would break the bookkeeping of
+ // "frame index to VGPR lane".
+ FuncInfo->removeDeadFrameIndices(MFI);
+
MadeChange = true;
} else if (FuncInfo->VGPRReservedForSGPRSpill) {
FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF);
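
Both this file and SILateBranchLowering above replace the manual "remember std::next(I) before touching I" loops with llvm::make_early_inc_range. A minimal standalone analogue of that idiom in plain C++ (no LLVM dependency; eraseIf and the std::list are only for illustration) shows why erasing the current element stays safe:

#include <iostream>
#include <list>

// Early-increment loop: advance the iterator before visiting the element, so
// the body may erase the element it is currently looking at.
template <typename Container, typename Pred>
void eraseIf(Container &C, Pred P) {
  for (auto It = C.begin(), End = C.end(); It != End;) {
    auto Cur = It++; // step past Cur first, like make_early_inc_range does
    if (P(*Cur))
      C.erase(Cur);  // safe: It already points at the next element
  }
}

int main() {
  std::list<int> L{1, 2, 3, 4, 5, 6};
  eraseIf(L, [](int V) { return V % 2 == 0; });
  for (int V : L)
    std::cout << V << ' '; // prints: 1 3 5
}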
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 85cfe36df16a..c4007f56f350 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -67,9 +67,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
const bool UseFixedABI = AMDGPUTargetMachine::EnableFixedFunctionABI &&
CC != CallingConv::AMDGPU_Gfx &&
(!isEntryFunction() || HasCalls);
+ const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
+ CC == CallingConv::SPIR_KERNEL;
- if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
- if (!F.arg_empty())
+ if (IsKernel) {
+ if (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0)
KernargSegmentPtr = true;
WorkGroupIDX = true;
WorkItemIDX = true;
@@ -94,45 +96,76 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
ArgDescriptor::createRegister(ScratchRSrcReg);
}
- if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
+ if (!F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
ImplicitArgPtr = true;
} else {
- if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) {
- KernargSegmentPtr = true;
- MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
- MaxKernArgAlign);
- }
+ ImplicitArgPtr = false;
+ MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
+ MaxKernArgAlign);
}
+ bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
+ if (isAmdHsaOrMesa && !ST.enableFlatScratch())
+ PrivateSegmentBuffer = true;
+ else if (ST.isMesaGfxShader(F))
+ ImplicitBufferPtr = true;
+
if (UseFixedABI) {
+ DispatchPtr = true;
+ QueuePtr = true;
+ ImplicitArgPtr = true;
WorkGroupIDX = true;
WorkGroupIDY = true;
WorkGroupIDZ = true;
WorkItemIDX = true;
WorkItemIDY = true;
WorkItemIDZ = true;
- ImplicitArgPtr = true;
- } else {
- if (F.hasFnAttribute("amdgpu-work-group-id-x"))
+
+ // FIXME: We don't need this?
+ DispatchID = true;
+ } else if (!AMDGPU::isGraphics(CC)) {
+ if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x"))
WorkGroupIDX = true;
- if (F.hasFnAttribute("amdgpu-work-group-id-y"))
+ if (!F.hasFnAttribute("amdgpu-no-workgroup-id-y"))
WorkGroupIDY = true;
- if (F.hasFnAttribute("amdgpu-work-group-id-z"))
+ if (!F.hasFnAttribute("amdgpu-no-workgroup-id-z"))
WorkGroupIDZ = true;
- if (F.hasFnAttribute("amdgpu-work-item-id-x"))
+ if (IsKernel || !F.hasFnAttribute("amdgpu-no-workitem-id-x"))
WorkItemIDX = true;
- if (F.hasFnAttribute("amdgpu-work-item-id-y"))
+ if (!F.hasFnAttribute("amdgpu-no-workitem-id-y"))
WorkItemIDY = true;
- if (F.hasFnAttribute("amdgpu-work-item-id-z"))
+ if (!F.hasFnAttribute("amdgpu-no-workitem-id-z"))
WorkItemIDZ = true;
+
+ if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
+ DispatchPtr = true;
+
+ if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
+ QueuePtr = true;
+
+ if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
+ DispatchID = true;
}
+ // FIXME: This attribute is a hack; we just need an analysis on the function
+ // to look for allocas.
bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
+
+ // TODO: This could be refined a lot. The attribute is a poor way of
+ // detecting calls or stack objects that may require it before argument
+ // lowering.
+ if (ST.hasFlatAddressSpace() && isEntryFunction() &&
+ (isAmdHsaOrMesa || ST.enableFlatScratch()) &&
+ (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
+ !ST.flatScratchIsArchitected()) {
+ FlatScratchInit = true;
+ }
+
if (isEntryFunction()) {
// X, XY, and XYZ are the only supported combinations, so make sure Y is
// enabled if Z is.
@@ -150,44 +183,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
}
}
- bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
- if (isAmdHsaOrMesa) {
- if (!ST.enableFlatScratch())
- PrivateSegmentBuffer = true;
-
- if (UseFixedABI) {
- DispatchPtr = true;
- QueuePtr = true;
-
- // FIXME: We don't need this?
- DispatchID = true;
- } else {
- if (F.hasFnAttribute("amdgpu-dispatch-ptr"))
- DispatchPtr = true;
-
- if (F.hasFnAttribute("amdgpu-queue-ptr"))
- QueuePtr = true;
-
- if (F.hasFnAttribute("amdgpu-dispatch-id"))
- DispatchID = true;
- }
- } else if (ST.isMesaGfxShader(F)) {
- ImplicitBufferPtr = true;
- }
-
- if (UseFixedABI || F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
- KernargSegmentPtr = true;
-
- // TODO: This could be refined a lot. The attribute is a poor way of
- // detecting calls or stack objects that may require it before argument
- // lowering.
- if (ST.hasFlatAddressSpace() && isEntryFunction() &&
- (isAmdHsaOrMesa || ST.enableFlatScratch()) &&
- (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
- !ST.flatScratchIsArchitected()) {
- FlatScratchInit = true;
- }
-
Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
StringRef S = A.getValueAsString();
if (!S.empty())
@@ -426,7 +421,7 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
OtherUsedRegs.set(Reg);
SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin();
- for (unsigned I = 0; I < NumLanes; ++I) {
+ for (int I = NumLanes - 1; I >= 0; --I) {
NextSpillReg = std::find_if(
NextSpillReg, Regs.end(), [&MRI, &OtherUsedRegs](MCPhysReg Reg) {
return MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) &&
@@ -447,10 +442,16 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
}
void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) {
- // The FP & BP spills haven't been inserted yet, so keep them around.
- for (auto &R : SGPRToVGPRSpills) {
- if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex)
+ // Remove dead frame indices from the function frame, but keep FP & BP since
+ // their spills haven't been inserted yet. Also make sure to remove the frame
+ // indices from the `SGPRToVGPRSpills` data structure; otherwise, any
+ // re-mapping of freed frame indices by later pass(es) like "stack slot
+ // coloring" could lead to unexpected side effects and bugs.
+ for (auto &R : make_early_inc_range(SGPRToVGPRSpills)) {
+ if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex) {
MFI.RemoveStackObject(R.first);
+ SGPRToVGPRSpills.erase(R.first);
+ }
}
// All other SGPRs must be allocated on the default stack, so reset the stack
@@ -650,3 +651,38 @@ bool SIMachineFunctionInfo::removeVGPRForSGPRSpill(Register ReservedVGPR,
}
return false;
}
+
+bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const {
+ if (UsesAGPRs)
+ return *UsesAGPRs;
+
+ if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) ||
+ MF.getFrameInfo().hasCalls()) {
+ UsesAGPRs = true;
+ return true;
+ }
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ const Register Reg = Register::index2VirtReg(I);
+ const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
+ if (RC && SIRegisterInfo::isAGPRClass(RC)) {
+ UsesAGPRs = true;
+ return true;
+ } else if (!RC && !MRI.use_empty(Reg) && MRI.getType(Reg).isValid()) {
+ // Defer caching UsesAGPRs; the function might not have been regbank
+ // selected yet.
+ return true;
+ }
+ }
+
+ for (MCRegister Reg : AMDGPU::AGPR_32RegClass) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ UsesAGPRs = true;
+ return true;
+ }
+ }
+
+ UsesAGPRs = false;
+ return false;
+}
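
usesAGPRs() above combines lazy evaluation with a mutable Optional cache, and deliberately skips caching in the one case where the answer may still change (a generic virtual register before register-bank selection). A hedged, standalone sketch of the same memoization shape in plain C++ (std::optional instead of llvm::Optional; the two bool parameters stand in for the real queries):

#include <optional>

class FunctionInfoLike {
  mutable std::optional<bool> UsesAGPRs; // unset until the answer is final

public:
  bool usesAGPRs(bool DefinitelyNeedsAGPRs, bool MightStillChange) const {
    if (UsesAGPRs)
      return *UsesAGPRs;   // cached answer
    if (DefinitelyNeedsAGPRs) {
      UsesAGPRs = true;    // safe to cache: this cannot change later
      return true;
    }
    if (MightStillChange)
      return true;         // defer caching, mirroring the comment above
    UsesAGPRs = false;
    return false;
  }
};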
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index fb6d4f8841ab..c305bc20e40d 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -26,9 +26,9 @@ namespace llvm {
class MachineFrameInfo;
class MachineFunction;
-class TargetRegisterClass;
class SIMachineFunctionInfo;
class SIRegisterInfo;
+class TargetRegisterClass;
class AMDGPUPseudoSourceValue : public PseudoSourceValue {
public:
@@ -433,6 +433,8 @@ private:
// Current recorded maximum possible occupancy.
unsigned Occupancy;
+ mutable Optional<bool> UsesAGPRs;
+
MCPhysReg getNextUserSGPR() const;
MCPhysReg getNextSystemSGPR() const;
@@ -946,6 +948,9 @@ public:
Occupancy = Limit;
limitOccupancy(MF);
}
+
+ // \returns true if a function needs or may need AGPRs.
+ bool usesAGPRs(const MachineFunction &MF) const;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
index 278dd05b049c..5590d84cc3ab 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -403,7 +403,7 @@ void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
}
// TODO: compute InternalAdditionnalPressure.
- InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
+ InternalAdditionalPressure.resize(TopPressure.MaxSetPressure.size());
// Check everything is right.
#ifndef NDEBUG
diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
index a2f5a1453d6a..ac34a748edbc 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -25,6 +25,8 @@ namespace llvm {
class SIInstrInfo;
class SIRegisterInfo;
+class SIScheduleDAGMI;
+class SIScheduleBlockCreator;
enum SIScheduleCandReason {
NoCand,
@@ -48,9 +50,6 @@ struct SISchedulerCandidate {
void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
};
-class SIScheduleDAGMI;
-class SIScheduleBlockCreator;
-
enum SIScheduleBlockLinkKind {
NoData,
Data
@@ -73,7 +72,7 @@ class SIScheduleBlock {
// store the live virtual and real registers.
// We do care only of SGPR32 and VGPR32 and do track only virtual registers.
// Pressure of additional registers required inside the block.
- std::vector<unsigned> InternalAdditionnalPressure;
+ std::vector<unsigned> InternalAdditionalPressure;
// Pressure of input and output registers
std::vector<unsigned> LiveInPressure;
std::vector<unsigned> LiveOutPressure;
@@ -154,8 +153,8 @@ public:
// Needs the block to be scheduled inside
// TODO: find a way to compute it.
- std::vector<unsigned> &getInternalAdditionnalRegUsage() {
- return InternalAdditionnalPressure;
+ std::vector<unsigned> &getInternalAdditionalRegUsage() {
+ return InternalAdditionalPressure;
}
std::set<unsigned> &getInRegs() { return LiveInRegs; }
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 71be73c2f0e4..29f072ca1e6c 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -126,8 +126,7 @@ private:
(OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
SIAtomicAddrSpace::NONE &&
(InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
- SIAtomicAddrSpace::NONE &&
- !isStrongerThan(FailureOrdering, Ordering));
+ SIAtomicAddrSpace::NONE);
// There is also no cross address space ordering if the ordering
// address space is the same as the instruction address space and
@@ -369,7 +368,7 @@ protected:
public:
- SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};
+ SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -410,7 +409,7 @@ public:
class SIGfx7CacheControl : public SIGfx6CacheControl {
public:
- SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};
+ SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -422,7 +421,7 @@ public:
class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:
- SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
+ SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -471,7 +470,7 @@ protected:
public:
- SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
+ SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -651,14 +650,11 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
}
SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
- Ordering = isStrongerThan(Ordering, OpOrdering)
- ? Ordering
- : MMO->getSuccessOrdering();
+ Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
FailureOrdering =
- isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
- FailureOrdering : MMO->getFailureOrdering();
+ getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
}
}
@@ -859,7 +855,7 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
// instructions. The latter are always marked as volatile so cannot sensibly
// handle it as do not want to pessimize all atomics. Also they do not support
// the nontemporal attribute.
- assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+ assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
bool Changed = false;
@@ -1035,8 +1031,8 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
Position Pos) const {
- return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
- IsCrossAddrSpaceOrdering, Pos);
+ return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+ IsCrossAddrSpaceOrdering, Pos);
}
bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
@@ -1108,7 +1104,8 @@ bool SIGfx90ACacheControl::enableLoadCacheBypass(
// different CUs. Therefore need to bypass the L1 which is per CU.
// Otherwise in non-threadgroup split mode all waves of a work-group are
// on the same CU, and so the L1 does not need to be bypassed.
- if (ST.isTgSplitEnabled()) Changed |= enableGLCBit(MI);
+ if (ST.isTgSplitEnabled())
+ Changed |= enableGLCBit(MI);
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
@@ -1204,14 +1201,13 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
// instructions. The latter are always marked as volatile so cannot sensibly
// handle it as do not want to pessimize all atomics. Also they do not support
// the nontemporal attribute.
- assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+ assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
bool Changed = false;
if (IsVolatile) {
- if (Op == SIMemOp::LOAD) {
+ if (Op == SIMemOp::LOAD)
Changed |= enableGLCBit(MI);
- }
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
@@ -1398,7 +1394,8 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
// the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
// CU mode all waves of a work-group are on the same CU, and so the L0
// does not need to be bypassed.
- if (!ST.isCuModeEnabled()) Changed |= enableGLCBit(MI);
+ if (!ST.isCuModeEnabled())
+ Changed |= enableGLCBit(MI);
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
@@ -1432,12 +1429,11 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
// instructions. The latter are always marked as volatile so cannot sensibly
// handle it as do not want to pessimize all atomics. Also they do not support
// the nontemporal attribute.
- assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+ assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
bool Changed = false;
if (IsVolatile) {
-
if (Op == SIMemOp::LOAD) {
Changed |= enableGLCBit(MI);
Changed |= enableDLCBit(MI);
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
index 3d659eca47db..69eab762f05c 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -225,7 +225,7 @@ void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB,
// RequirePending is used to indicate whether we are collecting the initial
// requirements for the block, and need to defer the first InsertionPoint to
// Phase 3. It is set to false once we have set FirstInsertionPoint, or when
- // we discover an explict setreg that means this block doesn't have any
+ // we discover an explicit setreg that means this block doesn't have any
// initial requirements.
bool RequirePending = true;
Status IPChange;
@@ -373,12 +373,8 @@ void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB,
BlockInfo[ThisBlock]->Exit = TmpStatus;
// Add the successors to the work list so we can propagate the changed exit
// status.
- for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(),
- E = MBB.succ_end();
- S != E; S = std::next(S)) {
- MachineBasicBlock &B = *(*S);
- Phase2List.push(&B);
- }
+ for (MachineBasicBlock *Succ : MBB.successors())
+ Phase2List.push(Succ);
}
BlockInfo[ThisBlock]->ExitSet = ExitSet;
if (RevisitRequired)
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
index 307c9eba9d3b..6bf6c45d8cf6 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -11,7 +11,7 @@
/// structures and waterfall loops.
///
/// When we do structurization, we usually transform an if-else into two
-/// sucessive if-then (with a flow block to do predicate inversion). Consider a
+/// successive if-then (with a flow block to do predicate inversion). Consider a
/// simple case after structurization: A divergent value %a was defined before
/// if-else and used in both THEN (use in THEN is optional) and ELSE part:
/// bb.if:
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 7d7a753bb333..6a698348d389 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -365,7 +365,7 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
if (Dst &&
DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
- // This will work if the tied src is acessing WORD_0, and the dst is
+ // This will work if the tied src is accessing WORD_0, and the dst is
// writing WORD_1. Modifiers don't matter because all the bits that
// would be impacted are being overwritten by the dst.
// Any other case will not work.
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index e05aafe5e291..13a6a718f4f2 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -8,7 +8,7 @@
//
/// \file
/// This pass creates bundles of memory instructions to protect adjacent loads
-/// and stores from beeing rescheduled apart from each other post-RA.
+/// and stores from being rescheduled apart from each other post-RA.
///
//===----------------------------------------------------------------------===//
@@ -90,6 +90,9 @@ bool SIPostRABundler::isDependentLoad(const MachineInstr &MI) const {
void SIPostRABundler::collectUsedRegUnits(const MachineInstr &MI,
BitVector &UsedRegUnits) const {
+ if (MI.isDebugInstr())
+ return;
+
for (const MachineOperand &Op : MI.operands()) {
if (!Op.isReg() || !Op.readsReg())
continue;
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index dce0f4b0df5f..d1b8e217471e 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -174,7 +174,7 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
MI.setDesc(TII->get(AMDGPU::S_BRANCH));
} else if (IsVCCZ && MaskValue == 0) {
// Will always branch
- // Remove all succesors shadowed by new unconditional branch
+ // Remove all successors shadowed by new unconditional branch
MachineBasicBlock *Parent = MI.getParent();
SmallVector<MachineInstr *, 4> ToRemove;
bool Found = false;
@@ -257,10 +257,8 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
})) {
// The only exception allowed here is another indirect vector move
// with the same mode.
- if (!IdxOn ||
- !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
- I->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
- I->getOpcode() == AMDGPU::V_MOV_B32_indirect))
+ if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
+ I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
return false;
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index bba5bf7fdbc3..bfbe84f696f8 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -97,7 +97,7 @@ struct SGPRSpillBuilder {
unsigned EltSize = 4;
RegScavenger *RS;
- MachineBasicBlock &MBB;
+ MachineBasicBlock *MBB;
MachineFunction &MF;
SIMachineFunctionInfo &MFI;
const SIInstrInfo &TII;
@@ -110,9 +110,14 @@ struct SGPRSpillBuilder {
SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
bool IsWave32, MachineBasicBlock::iterator MI, int Index,
RegScavenger *RS)
- : SuperReg(MI->getOperand(0).getReg()), MI(MI),
- IsKill(MI->getOperand(0).isKill()), DL(MI->getDebugLoc()), Index(Index),
- RS(RS), MBB(*MI->getParent()), MF(*MBB.getParent()),
+ : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
+ MI->getOperand(0).isKill(), Index, RS) {}
+
+ SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
+ bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
+ bool IsKill, int Index, RegScavenger *RS)
+ : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
+ Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
IsWave32(IsWave32) {
const TargetRegisterClass *RC = TRI.getPhysRegClass(SuperReg);
@@ -189,8 +194,9 @@ struct SGPRSpillBuilder {
if (SavedExecReg) {
RS->setRegUsed(SavedExecReg);
// Set exec to needed lanes
- BuildMI(MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
- auto I = BuildMI(MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
+ BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
+ auto I =
+ BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
if (!TmpVGPRLive)
I.addReg(TmpVGPR, RegState::ImplicitDefine);
// Spill needed lanes
@@ -201,7 +207,7 @@ struct SGPRSpillBuilder {
TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
/*IsKill*/ false);
// Spill inactive lanes
- auto I = BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+ auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
if (!TmpVGPRLive)
I.addReg(TmpVGPR, RegState::ImplicitDefine);
TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
@@ -224,7 +230,7 @@ struct SGPRSpillBuilder {
TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
/*IsKill*/ false);
// Restore exec
- auto I = BuildMI(MBB, MI, DL, TII.get(MovOpc), ExecReg)
+ auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
.addReg(SavedExecReg, RegState::Kill);
// Add an implicit use of the load so it is not dead.
// FIXME This inserts an unnecessary waitcnt
@@ -235,7 +241,7 @@ struct SGPRSpillBuilder {
// Restore inactive lanes
TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
/*IsKill*/ false);
- auto I = BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+ auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
if (!TmpVGPRLive) {
I.addReg(TmpVGPR, RegState::ImplicitKill);
}
@@ -261,11 +267,17 @@ struct SGPRSpillBuilder {
TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
/*IsKill*/ false);
// Spill inactive lanes
- BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+ BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
- BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+ BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
}
}
+
+ void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) {
+ assert(MBB->getParent() == &MF);
+ MI = NewMI;
+ MBB = NewMBB;
+ }
};
} // namespace llvm
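MBB becomes a pointer rather than a reference because the builder can now be retargeted after construction: the new setMI() (used by spillEmergencySGPR() further down) moves the insertion point into a different block, and a reference member cannot be reseated. A minimal standalone sketch of that distinction, using made-up Block/Builder types rather than the MachineBasicBlock API:

#include <cassert>
#include <vector>

struct Block { std::vector<int> Insts; };

// Holding Block* instead of Block& is what lets setBlock() retarget later
// insertions, mirroring the SGPRSpillBuilder::setMI() change in spirit.
struct Builder {
  Block *MBB;                      // a Block & here could never be reseated
  explicit Builder(Block &B) : MBB(&B) {}
  void setBlock(Block &NewB) { MBB = &NewB; }
  void emit(int Inst) { MBB->Insts.push_back(Inst); }
};

int main() {
  Block Entry, Restore;
  Builder B(Entry);
  B.emit(1);           // lands in Entry
  B.setBlock(Restore); // analogous to SB.setMI(&RestoreMBB, RestoreMBB.end())
  B.emit(2);           // lands in Restore
  assert(Entry.Insts.size() == 1 && Restore.Insts.size() == 1);
}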
@@ -348,10 +360,13 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::Cold:
- case CallingConv::AMDGPU_Gfx:
return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts()
? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList
: CSR_AMDGPU_HighRegs_SaveList;
+ case CallingConv::AMDGPU_Gfx:
+ return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts()
+ ? CSR_AMDGPU_SI_Gfx_With_AGPRs_SaveList
+ : CSR_AMDGPU_SI_Gfx_SaveList;
default: {
// Dummy to not crash RegisterClassInfo.
static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
@@ -371,10 +386,13 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::Cold:
- case CallingConv::AMDGPU_Gfx:
return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()
? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask
: CSR_AMDGPU_HighRegs_RegMask;
+ case CallingConv::AMDGPU_Gfx:
+ return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()
+ ? CSR_AMDGPU_SI_Gfx_With_AGPRs_RegMask
+ : CSR_AMDGPU_SI_Gfx_RegMask;
default:
return nullptr;
}
@@ -501,18 +519,36 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, Reg);
}
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
- // TODO: In an entry function without calls and AGPRs used it is possible
- // to use the whole register budget for VGPRs. Even more it shall
- // be possible to estimate maximum AGPR/VGPR pressure and split
- // register file accordingly.
- if (ST.hasGFX90AInsts())
- MaxNumVGPRs /= 2;
+ unsigned MaxNumAGPRs = MaxNumVGPRs;
unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+
+ if (ST.hasGFX90AInsts()) {
+    // In an entry function that makes no calls and uses no AGPRs, it is
+    // possible to use the whole register budget for VGPRs.
+
+ // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and
+ // split register file accordingly.
+ if (MFI->usesAGPRs(MF)) {
+ MaxNumVGPRs /= 2;
+ MaxNumAGPRs = MaxNumVGPRs;
+ } else {
+ if (MaxNumVGPRs > TotalNumVGPRs) {
+ MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
+ MaxNumVGPRs = TotalNumVGPRs;
+ } else
+ MaxNumAGPRs = 0;
+ }
+ }
+
for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
reserveRegisterTuples(Reserved, Reg);
- Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
+ }
+
+ for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) {
+ unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
reserveRegisterTuples(Reserved, Reg);
}
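The rewritten reservation code splits the gfx90a register budget between VGPRs and AGPRs instead of always halving it. A standalone model of just that arithmetic, with made-up numbers (the real limits come from the subtarget, and the hasGFX90AInsts guard is omitted):

#include <cstdio>

// Mirrors the branch structure added above. Values are illustrative only.
static void splitBudget(unsigned MaxNumVGPRs, unsigned TotalNumVGPRs,
                        bool UsesAGPRs) {
  unsigned MaxNumAGPRs = MaxNumVGPRs;
  if (UsesAGPRs) {
    MaxNumVGPRs /= 2;                          // split the budget evenly
    MaxNumAGPRs = MaxNumVGPRs;
  } else if (MaxNumVGPRs > TotalNumVGPRs) {
    MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs; // leftover budget goes to AGPRs
    MaxNumVGPRs = TotalNumVGPRs;
  } else {
    MaxNumAGPRs = 0;                           // whole budget stays in VGPRs
  }
  std::printf("VGPRs=%u AGPRs=%u\n", MaxNumVGPRs, MaxNumAGPRs);
}

int main() {
  splitBudget(384, 256, /*UsesAGPRs=*/true);  // VGPRs=192 AGPRs=192
  splitBudget(384, 256, /*UsesAGPRs=*/false); // VGPRs=256 AGPRs=128
  splitBudget(128, 256, /*UsesAGPRs=*/false); // VGPRs=128 AGPRs=0
}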
@@ -536,8 +572,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}
}
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
Register ScratchRSrcReg = MFI->getScratchRSrcReg();
if (ScratchRSrcReg != AMDGPU::NoRegister) {
// Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
@@ -801,6 +835,14 @@ const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
return &AMDGPU::VGPR_32RegClass;
}
+const TargetRegisterClass *
+SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+ if (isAGPRClass(RC) && !ST.hasGFX90AInsts())
+ return getEquivalentVGPRClass(RC);
+
+ return RC;
+}
+
static unsigned getNumSubRegsForSpillOp(unsigned Op) {
switch (Op) {
@@ -1037,7 +1079,7 @@ static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
}
void SIRegisterInfo::buildSpillLoadStore(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
RegScavenger *RS, LivePhysRegs *LiveRegs) const {
@@ -1049,7 +1091,6 @@ void SIRegisterInfo::buildSpillLoadStore(
const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
- const DebugLoc &DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
bool IsStore = Desc->mayStore();
bool IsFlat = TII->isFLATScratch(LoadStoreOp);
@@ -1177,9 +1218,19 @@ void SIRegisterInfo::buildSpillLoadStore(
bool NeedSuperRegDef = e > 1 && IsStore && i == 0;
bool NeedSuperRegImpOperand = e > 1;
- unsigned Lane = RegOffset / 4;
- unsigned LaneE = (RegOffset + EltSize) / 4;
- for ( ; Lane != LaneE; ++Lane) {
+    // Remaining element size to spill into memory after some parts of it
+    // have been spilled into either AGPRs or VGPRs.
+ unsigned RemEltSize = EltSize;
+
+    // AGPRs to spill VGPRs and vice versa are allocated in reverse order,
+    // starting from the last lane. If a register cannot be completely
+    // spilled into another register, this ensures its alignment does not
+    // change. For targets with a VGPR alignment requirement this matters
+    // when flat scratch is used, as we might otherwise emit a scratch_load
+    // or scratch_store of an unaligned register.
+ for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
+ LaneE = RegOffset / 4;
+ Lane >= LaneE; --Lane) {
bool IsSubReg = e > 1 || EltSize > 4;
Register Sub = IsSubReg
? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
@@ -1187,33 +1238,29 @@ void SIRegisterInfo::buildSpillLoadStore(
auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
if (!MIB.getInstr())
break;
- if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == 0)) {
+ if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && !i)) {
MIB.addReg(ValueReg, RegState::ImplicitDefine);
NeedSuperRegDef = false;
}
if (IsSubReg || NeedSuperRegImpOperand) {
NeedSuperRegImpOperand = true;
unsigned State = SrcDstRegState;
- if (Lane + 1 != LaneE)
+ if (Lane != LaneE)
State &= ~RegState::Kill;
MIB.addReg(ValueReg, RegState::Implicit | State);
}
+ RemEltSize -= 4;
}
- if (Lane == LaneE) // Fully spilled into AGPRs.
+ if (!RemEltSize) // Fully spilled into AGPRs.
continue;
- // Offset in bytes from the beginning of the ValueReg to its portion we
- // still need to spill. It may differ from RegOffset if a portion of
- // current SubReg has been already spilled into AGPRs by the loop above.
- unsigned RemRegOffset = Lane * 4;
- unsigned RemEltSize = EltSize - (RemRegOffset - RegOffset);
if (RemEltSize != EltSize) { // Partially spilled to AGPRs
assert(IsFlat && EltSize > 4);
unsigned NumRegs = RemEltSize / 4;
SubReg = Register(getSubReg(ValueReg,
- getSubRegFromChannel(RemRegOffset / 4, NumRegs)));
+ getSubRegFromChannel(RegOffset / 4, NumRegs)));
unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
Desc = &TII->get(Opc);
}
@@ -1240,10 +1287,10 @@ void SIRegisterInfo::buildSpillLoadStore(
SubReg = TmpReg;
}
- MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RemRegOffset);
+ MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
MachineMemOperand *NewMMO =
MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
- commonAlignment(Alignment, RemRegOffset));
+ commonAlignment(Alignment, RegOffset));
auto MIB =
BuildMI(MBB, MI, DL, *Desc)
@@ -1257,7 +1304,7 @@ void SIRegisterInfo::buildSpillLoadStore(
} else {
MIB.addReg(SOffset, SOffsetRegState);
}
- MIB.addImm(Offset + RemRegOffset)
+ MIB.addImm(Offset + RegOffset)
.addImm(0); // cpol
if (!IsFlat)
MIB.addImm(0) // tfe
@@ -1307,13 +1354,13 @@ void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
if (IsLoad) {
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
: AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
- buildSpillLoadStore(SB.MBB, SB.MI, Opc, Index, SB.TmpVGPR, false, FrameReg,
- Offset * SB.EltSize, MMO, SB.RS);
+ buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
+ FrameReg, Offset * SB.EltSize, MMO, SB.RS);
} else {
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
: AMDGPU::BUFFER_STORE_DWORD_OFFSET;
- buildSpillLoadStore(SB.MBB, SB.MI, Opc, Index, SB.TmpVGPR, IsKill, FrameReg,
- Offset * SB.EltSize, MMO, SB.RS);
+ buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
+ FrameReg, Offset * SB.EltSize, MMO, SB.RS);
// This only ever adds one VGPR spill
SB.MFI.addToSpilledVGPRs(1);
}
@@ -1336,6 +1383,10 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
SB.SuperReg != SB.MFI.getFrameOffsetReg()));
if (SpillToVGPR) {
+
+ assert(SB.NumSubRegs == VGPRSpills.size() &&
+ "Num of VGPR lanes should be equal to num of SGPRs spilled");
+
for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
Register SubReg =
SB.NumSubRegs == 1
@@ -1347,8 +1398,8 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
// Mark the "old value of vgpr" input undef only if this is the first sgpr
// spill to this specific vgpr in the first basic block.
- auto MIB = BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
- Spill.VGPR)
+ auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
+ SB.TII.get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
.addReg(SubReg, getKillRegState(UseKill))
.addImm(Spill.Lane)
.addReg(Spill.VGPR);
@@ -1394,7 +1445,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
: Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
MachineInstrBuilder WriteLane =
- BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
+ BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
SB.TmpVGPR)
.addReg(SubReg, SubKillState)
.addImm(i % PVD.PerVGPR)
@@ -1456,10 +1507,10 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
: Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
- auto MIB =
- BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
- .addReg(Spill.VGPR)
- .addImm(Spill.Lane);
+ auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
+ SubReg)
+ .addReg(Spill.VGPR)
+ .addImm(Spill.Lane);
if (SB.NumSubRegs > 1 && i == 0)
MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
if (LIS) {
@@ -1490,7 +1541,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
: Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
bool LastSubReg = (i + 1 == e);
- auto MIB = BuildMI(SB.MBB, MI, SB.DL,
+ auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
.addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
.addImm(i);
@@ -1516,6 +1567,75 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
return true;
}
+bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
+ MachineBasicBlock &RestoreMBB,
+ Register SGPR, RegScavenger *RS) const {
+ SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
+ RS);
+ SB.prepare();
+ // Generate the spill of SGPR to SB.TmpVGPR.
+ unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
+ auto PVD = SB.getPerVGPRData();
+ for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
+ unsigned TmpVGPRFlags = RegState::Undef;
+ // Write sub registers into the VGPR
+ for (unsigned i = Offset * PVD.PerVGPR,
+ e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
+ i < e; ++i) {
+ Register SubReg =
+ SB.NumSubRegs == 1
+ ? SB.SuperReg
+ : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
+
+ MachineInstrBuilder WriteLane =
+ BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
+ SB.TmpVGPR)
+ .addReg(SubReg, SubKillState)
+ .addImm(i % PVD.PerVGPR)
+ .addReg(SB.TmpVGPR, TmpVGPRFlags);
+ TmpVGPRFlags = 0;
+ // There could be undef components of a spilled super register.
+ // TODO: Can we detect this and skip the spill?
+ if (SB.NumSubRegs > 1) {
+ // The last implicit use of the SB.SuperReg carries the "Kill" flag.
+ unsigned SuperKillState = 0;
+ if (i + 1 == SB.NumSubRegs)
+ SuperKillState |= getKillRegState(SB.IsKill);
+ WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
+ }
+ }
+ // Don't need to write VGPR out.
+ }
+
+ // Restore clobbered registers in the specified restore block.
+ MI = RestoreMBB.end();
+ SB.setMI(&RestoreMBB, MI);
+ // Generate the restore of SGPR from SB.TmpVGPR.
+ for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
+ // Don't need to load VGPR in.
+ // Unpack lanes
+ for (unsigned i = Offset * PVD.PerVGPR,
+ e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
+ i < e; ++i) {
+ Register SubReg =
+ SB.NumSubRegs == 1
+ ? SB.SuperReg
+ : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
+ bool LastSubReg = (i + 1 == e);
+ auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
+ SubReg)
+ .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
+ .addImm(i);
+ if (SB.NumSubRegs > 1 && i == 0)
+ MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
+ }
+ }
+ SB.restore();
+
+ SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
+ return false;
+}
+
/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
/// a VGPR and the stack slot can be safely eliminated when all other users are
/// handled.
@@ -1632,7 +1752,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
: AMDGPU::BUFFER_STORE_DWORD_OFFSET;
auto *MBB = MI->getParent();
buildSpillLoadStore(
- *MBB, MI, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
+ *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
*MI->memoperands_begin(), RS);
MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
@@ -1668,7 +1788,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
: AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
auto *MBB = MI->getParent();
buildSpillLoadStore(
- *MBB, MI, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
+ *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
*MI->memoperands_begin(), RS);
MI->eraseFromParent();
@@ -2152,34 +2272,6 @@ bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
return isSGPRClass(RC);
}
-// TODO: It might be helpful to have some target specific flags in
-// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
-bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
- unsigned Size = getRegSizeInBits(*RC);
- if (Size == 16) {
- return getCommonSubClass(&AMDGPU::VGPR_LO16RegClass, RC) != nullptr ||
- getCommonSubClass(&AMDGPU::VGPR_HI16RegClass, RC) != nullptr;
- }
- const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
- if (!VRC) {
- assert(Size < 32 && "Invalid register class size");
- return false;
- }
- return getCommonSubClass(VRC, RC) != nullptr;
-}
-
-bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
- unsigned Size = getRegSizeInBits(*RC);
- if (Size < 16)
- return false;
- const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
- if (!ARC) {
- assert(getVGPRClassForBitWidth(Size) && "Invalid register class size");
- return false;
- }
- return getCommonSubClass(ARC, RC) != nullptr;
-}
-
const TargetRegisterClass *
SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
unsigned Size = getRegSizeInBits(*SRC);
@@ -2321,7 +2413,7 @@ bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
Register Reg) const {
const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
// Registers without classes are unaddressable, SGPR-like registers.
- return RC && hasVGPRs(RC);
+ return RC && isVGPRClass(RC);
}
bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
@@ -2329,7 +2421,7 @@ bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
// Registers without classes are unaddressable, SGPR-like registers.
- return RC && hasAGPRs(RC);
+ return RC && isAGPRClass(RC);
}
bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
@@ -2427,8 +2519,10 @@ SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);
- const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass*>();
- return getAllocatableClass(RC);
+ if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>())
+ return getAllocatableClass(RC);
+
+ return nullptr;
}
MCRegister SIRegisterInfo::getVCC() const {
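The getConstrainedRegClassForOperand hunk above swaps PointerUnion::get (which asserts if the union holds the other alternative) for dyn_cast plus a nullptr fallback. A self-contained sketch of that PointerUnion idiom, using placeholder Bank/RegClass types rather than RegisterBank/TargetRegisterClass:

#include "llvm/ADT/PointerUnion.h"
#include <cassert>

// Placeholder types; alignas keeps low pointer bits free for the union's tag.
struct alignas(8) Bank {};
struct alignas(8) RegClass {};

// dyn_cast<T>() returns nullptr when the union holds the other type, whereas
// get<T>() would assert; that is the behavioural difference introduced above.
static const RegClass *
asRegClass(llvm::PointerUnion<const Bank *, const RegClass *> U) {
  if (const auto *RC = U.dyn_cast<const RegClass *>())
    return RC;
  return nullptr;
}

int main() {
  Bank B;
  RegClass RC;
  llvm::PointerUnion<const Bank *, const RegClass *> FromRC(&RC), FromBank(&B);
  assert(asRegClass(FromRC) == &RC);
  assert(asRegClass(FromBank) == nullptr); // no assertion, just a null result
}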
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 2a92051e5fb2..8d90ddb1cf4c 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -17,6 +17,8 @@
#define GET_REGINFO_HEADER
#include "AMDGPUGenRegisterInfo.inc"
+#include "SIDefines.h"
+
namespace llvm {
class GCNSubtarget;
@@ -24,7 +26,6 @@ class LiveIntervals;
class LivePhysRegs;
class RegisterBank;
struct SGPRSpillBuilder;
-class SIMachineFunctionInfo;
class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
private:
@@ -108,6 +109,13 @@ public:
const TargetRegisterClass *getPointerRegClass(
const MachineFunction &MF, unsigned Kind = 0) const override;
+ /// Returns a legal register class to copy a register in the specified class
+ /// to or from. If it is possible to copy the register directly without using
+ /// a cross register class copy, return the specified RC. Returns NULL if it
+ /// is not possible to copy between two registers of the specified class.
+ const TargetRegisterClass *
+ getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+
void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset,
bool IsLoad, bool IsKill = true) const;
@@ -122,6 +130,10 @@ public:
LiveIntervals *LIS = nullptr,
bool OnlyToVGPR = false) const;
+ bool spillEmergencySGPR(MachineBasicBlock::iterator MI,
+ MachineBasicBlock &RestoreMBB, Register SGPR,
+ RegScavenger *RS) const;
+
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS) const override;
@@ -151,7 +163,7 @@ public:
const TargetRegisterClass *getPhysRegClass(MCRegister Reg) const;
/// \returns true if this class contains only SGPR registers
- bool isSGPRClass(const TargetRegisterClass *RC) const {
+ static bool isSGPRClass(const TargetRegisterClass *RC) {
return !hasVGPRs(RC) && !hasAGPRs(RC);
}
@@ -162,19 +174,28 @@ public:
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const;
+ /// \returns true if this class contains only VGPR registers
+ static bool isVGPRClass(const TargetRegisterClass *RC) {
+ return hasVGPRs(RC) && !hasAGPRs(RC);
+ }
+
/// \returns true if this class contains only AGPR registers
- bool isAGPRClass(const TargetRegisterClass *RC) const {
+ static bool isAGPRClass(const TargetRegisterClass *RC) {
return hasAGPRs(RC) && !hasVGPRs(RC);
}
/// \returns true if this class contains VGPR registers.
- bool hasVGPRs(const TargetRegisterClass *RC) const;
+ static bool hasVGPRs(const TargetRegisterClass *RC) {
+ return RC->TSFlags & SIRCFlags::HasVGPR;
+ }
/// \returns true if this class contains AGPR registers.
- bool hasAGPRs(const TargetRegisterClass *RC) const;
+ static bool hasAGPRs(const TargetRegisterClass *RC) {
+ return RC->TSFlags & SIRCFlags::HasAGPR;
+ }
/// \returns true if this class contains any vector registers.
- bool hasVectorRegisters(const TargetRegisterClass *RC) const {
+ static bool hasVectorRegisters(const TargetRegisterClass *RC) {
return hasVGPRs(RC) || hasAGPRs(RC);
}
@@ -350,10 +371,11 @@ public:
// For creating spill instructions during frame lowering, where no scavenger
// is available, LiveRegs can be used.
void buildSpillLoadStore(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, unsigned LoadStoreOp,
- int Index, Register ValueReg, bool ValueIsKill,
- MCRegister ScratchOffsetReg, int64_t InstrOffset,
- MachineMemOperand *MMO, RegScavenger *RS,
+ MachineBasicBlock::iterator MI, const DebugLoc &DL,
+ unsigned LoadStoreOp, int Index, Register ValueReg,
+ bool ValueIsKill, MCRegister ScratchOffsetReg,
+ int64_t InstrOffset, MachineMemOperand *MMO,
+ RegScavenger *RS,
LivePhysRegs *LiveRegs = nullptr) const;
};
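The static hasVGPRs/hasAGPRs helpers above now read per-class flag bits out of TargetRegisterClass::TSFlags; the bits are assigned in the SIRegisterInfo.td change that follows (TSFlags{0} = HasVGPR, TSFlags{1} = HasAGPR). A small standalone sketch of the layout those helpers assume; the real SIRCFlags enum lives in SIDefines.h, which is not part of this diff, so the values here are an assumption:

#include <cassert>
#include <cstdint>

// Assumed bit positions, matching "let TSFlags{0} = HasVGPR" and
// "let TSFlags{1} = HasAGPR" below; SIDefines.h is the authoritative source.
namespace SIRCFlags {
enum : uint64_t { HasVGPR = 1u << 0, HasAGPR = 1u << 1 };
} // namespace SIRCFlags

// Stand-in for TargetRegisterClass, which carries a TSFlags field.
struct FakeRegClass { uint64_t TSFlags; };

static bool hasVGPRs(const FakeRegClass &RC) { return RC.TSFlags & SIRCFlags::HasVGPR; }
static bool hasAGPRs(const FakeRegClass &RC) { return RC.TSFlags & SIRCFlags::HasAGPR; }
static bool isVGPRClass(const FakeRegClass &RC) { return hasVGPRs(RC) && !hasAGPRs(RC); }

int main() {
  FakeRegClass VGPR{SIRCFlags::HasVGPR};                    // e.g. VGPR_32
  FakeRegClass AV{SIRCFlags::HasVGPR | SIRCFlags::HasAGPR}; // e.g. AV_32
  assert(isVGPRClass(VGPR) && !isVGPRClass(AV) && hasAGPRs(AV));
}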
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 6e3c4e8775f3..cf1d90484228 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -126,8 +126,16 @@ class SIReg <string n, bits<16> regIdx = 0> :
let HWEncoding = regIdx;
}
-class SIRegWithSubRegs <string n, list<Register> subregs, bits<16> regIdx> :
- RegisterWithSubRegs<n, subregs> {
+// For register classes that use TSFlags.
+class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
+ : RegisterClass <n, rTypes, Align, rList> {
+ // For vector register classes.
+ field bit HasVGPR = 0;
+ field bit HasAGPR = 0;
+
+ // These need to be kept in sync with the enum SIRCFlags.
+ let TSFlags{0} = HasVGPR;
+ let TSFlags{1} = HasAGPR;
}
multiclass SIRegLoHi16 <string n, bits<16> regIdx, bit ArtificialHigh = 1,
@@ -490,14 +498,15 @@ class RegisterTypes<list<ValueType> reg_types> {
def Reg16Types : RegisterTypes<[i16, f16]>;
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;
-def VGPR_LO16 : RegisterClass<"AMDGPU", Reg16Types.types, 16,
+let HasVGPR = 1 in {
+def VGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
(add (sequence "VGPR%u_LO16", 0, 255))> {
let AllocationPriority = 1;
let Size = 16;
let GeneratePressureSet = 0;
}
-def VGPR_HI16 : RegisterClass<"AMDGPU", Reg16Types.types, 16,
+def VGPR_HI16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
(add (sequence "VGPR%u_HI16", 0, 255))> {
let AllocationPriority = 1;
let Size = 16;
@@ -506,12 +515,13 @@ def VGPR_HI16 : RegisterClass<"AMDGPU", Reg16Types.types, 16,
// VGPR 32-bit registers
// i16/f16 only on VI+
-def VGPR_32 : RegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
+def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 255))> {
let AllocationPriority = 1;
let Size = 32;
let Weight = 1;
}
+} // End HasVGPR = 1
// VGPR 64-bit registers
def VGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, VGPR_32, 255, 1, 2, "v">;
@@ -540,7 +550,8 @@ def VGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, VGPR_32, 255, 1, 16, "v">;
// VGPR 1024-bit registers
def VGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, VGPR_32, 255, 1, 32, "v">;
-def AGPR_LO16 : RegisterClass<"AMDGPU", Reg16Types.types, 16,
+let HasAGPR = 1 in {
+def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
(add (sequence "AGPR%u_LO16", 0, 255))> {
let isAllocatable = 0;
let Size = 16;
@@ -548,12 +559,13 @@ def AGPR_LO16 : RegisterClass<"AMDGPU", Reg16Types.types, 16,
}
// AccVGPR 32-bit registers
-def AGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add (sequence "AGPR%u", 0, 255))> {
let AllocationPriority = 1;
let Size = 32;
let Weight = 1;
}
+} // End HasAGPR = 1
// AGPR 64-bit registers
def AGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, AGPR_32, 255, 1, 2, "a">;
@@ -679,6 +691,14 @@ def CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
let AllocationPriority = SGPR_64.AllocationPriority;
}
+// Call clobbered 64-bit SGPRs for AMDGPU_Gfx CC
+def Gfx_CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
+ (add (trunc (shl SGPR_64, 15), 1), // s[30:31]
+ (trunc (shl SGPR_64, 18), 14))> { // s[36:37]-s[s62:63]
+ let CopyCost = SGPR_64.CopyCost;
+ let AllocationPriority = SGPR_64.AllocationPriority;
+}
+
def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
(add TTMP_64Regs)> {
let isAllocatable = 0;
@@ -748,14 +768,15 @@ defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64], SGPR_256Regs, TTMP_256R
defm "" : SRegClass<16, 20, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>;
defm "" : SRegClass<32, 21, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
-def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add VGPR_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
+ let HasVGPR = 1;
}
// Register class for all vector registers (VGPRs + Interpolation Registers)
class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
- RegisterClass<"AMDGPU", regTypes, 32, regList> {
+ SIRegisterClass<"AMDGPU", regTypes, 32, regList> {
let Size = !mul(numRegs, 32);
// Requires n v_mov_b32 to copy
@@ -767,11 +788,13 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
// Define a register tuple class, along with one requiring an even
// aligned base register.
multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
- // Define the regular class.
- def "" : VRegClassBase<numRegs, regTypes, regList>;
+ let HasVGPR = 1 in {
+ // Define the regular class.
+ def "" : VRegClassBase<numRegs, regTypes, regList>;
- // Define 2-aligned variant
- def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)>;
+ // Define 2-aligned variant
+ def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)>;
+ }
}
defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
@@ -787,7 +810,7 @@ defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
- let CopyCost = !add(numRegs, numRegs, 1) in {
+ let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, regList>;
@@ -811,7 +834,7 @@ defm AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)
// This is not a real register. This is just to have a register to add
// to VReg_1 that does not alias any real register that would
-// introduce inferred register classess.
+// introduce inferred register classes.
def ARTIFICIAL_VGPR : SIReg <"invalid vgpr", 0> {
let isArtificial = 1;
}
@@ -823,44 +846,53 @@ let GeneratePressureSet = 0 in {
// on an empty register set, but also sorts register classes based on
// the number of registers in them. Add only one register so this is
// sorted to the end and not preferred over VGPR_32.
-def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add ARTIFICIAL_VGPR)> {
+def VReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, (add ARTIFICIAL_VGPR)> {
let Size = 1;
+ let HasVGPR = 1;
}
-def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
+ let HasVGPR = 1;
}
-def VS_64 : RegisterClass<"AMDGPU", [i64, f64, v2f32], 32, (add VReg_64, SReg_64)> {
+def VS_64 : SIRegisterClass<"AMDGPU", [i64, f64, v2f32], 32, (add VReg_64, SReg_64)> {
let isAllocatable = 0;
+ let HasVGPR = 1;
}
-def AV_32 : RegisterClass<"AMDGPU", VGPR_32.RegTypes, 32,
+def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32,
(add AGPR_32, VGPR_32)> {
let isAllocatable = 0;
+ let HasVGPR = 1;
+ let HasAGPR = 1;
}
-def AV_64 : RegisterClass<"AMDGPU", VReg_64.RegTypes, 32,
+def AV_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32,
(add AReg_64, VReg_64)> {
let isAllocatable = 0;
+ let HasVGPR = 1;
+ let HasAGPR = 1;
}
} // End GeneratePressureSet = 0
-def AV_96 : RegisterClass<"AMDGPU", VReg_96.RegTypes, 32,
+let HasVGPR = 1, HasAGPR = 1 in {
+def AV_96 : SIRegisterClass<"AMDGPU", VReg_96.RegTypes, 32,
(add AReg_96, VReg_96)> {
let isAllocatable = 0;
}
-def AV_128 : RegisterClass<"AMDGPU", VReg_128.RegTypes, 32,
+def AV_128 : SIRegisterClass<"AMDGPU", VReg_128.RegTypes, 32,
(add AReg_128, VReg_128)> {
let isAllocatable = 0;
}
-def AV_160 : RegisterClass<"AMDGPU", VReg_160.RegTypes, 32,
+def AV_160 : SIRegisterClass<"AMDGPU", VReg_160.RegTypes, 32,
(add AReg_160, VReg_160)> {
let isAllocatable = 0;
}
+} // End HasVGPR = 1, HasAGPR = 1
//===----------------------------------------------------------------------===//
// Register operands
@@ -996,6 +1028,30 @@ def VSrc_128 : RegisterOperand<VReg_128> {
}
//===----------------------------------------------------------------------===//
+// VSrc_*_Deferred Operands with an SGPR, VGPR or a 32-bit immediate for use
+// with FMAMK/FMAAK
+//===----------------------------------------------------------------------===//
+
+multiclass SIRegOperand32_Deferred <string rc, string MatchName, string opType,
+ string rc_suffix = "_32"> {
+ let OperandNamespace = "AMDGPU" in {
+ def _f16_Deferred : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
+ let OperandType = opType#"_FP16_DEFERRED";
+ let ParserMatchClass = RegImmMatcher<MatchName#"F16">;
+ let DecoderMethod = "decodeOperand_" # rc # "_16_Deferred";
+ }
+
+ def _f32_Deferred : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
+ let OperandType = opType#"_FP32_DEFERRED";
+ let ParserMatchClass = RegImmMatcher<MatchName#"F32">;
+ let DecoderMethod = "decodeOperand_" # rc # "_32_Deferred";
+ }
+ }
+}
+
+defm VSrc : SIRegOperand32_Deferred<"VS", "VSrc", "OPERAND_REG_IMM">;
+
+//===----------------------------------------------------------------------===//
// VRegSrc_* Operands with a VGPR
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index b24c061af7ab..0792b303b830 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -137,6 +137,7 @@ def MIReadVGPR : SchedReadVariant<[
// The latency values are 1 / (operations / cycle) / 4.
multiclass SICommonWriteRes {
+ let RetireOOO = 1 in { // llvm-mca specific flag
def : HWWriteRes<WriteBranch, [HWBranch], 8>;
def : HWWriteRes<WriteExport, [HWExport], 4>;
def : HWWriteRes<WriteLDS, [HWLGKM], 5>; // Can be between 2 and 64
@@ -159,6 +160,7 @@ multiclass SICommonWriteRes {
def : HWWriteRes<Write8PassMAI, [HWXDL], 8>;
let ResourceCycles = [16] in
def : HWWriteRes<Write16PassMAI, [HWXDL], 16>;
+ } // End RetireOOO = 1
def : ReadAdvance<MIVGPRRead, -2>;
@@ -182,6 +184,7 @@ let SchedModel = SIFullSpeedModel in {
defm : SICommonWriteRes;
+let RetireOOO = 1 in { // llvm-mca specific flag
def : HWVALUWriteRes<Write64Bit, 2>;
def : HWVALUWriteRes<WriteIntMul, 4>;
def : HWVALUWriteRes<WriteFloatFMA, 1>;
@@ -189,6 +192,7 @@ def : HWVALUWriteRes<WriteDouble, 4>;
def : HWVALUWriteRes<WriteDoubleAdd, 2>;
def : HWVALUWriteRes<WriteDoubleCvt, 4>;
def : HWVALUWriteRes<WriteTrans64, 4>;
+} // End RetireOOO = 1
def : InstRW<[WriteCopy], (instrs COPY)>;
@@ -198,6 +202,7 @@ let SchedModel = SIQuarterSpeedModel in {
defm : SICommonWriteRes;
+let RetireOOO = 1 in { // llvm-mca specific flag
def : HWVALUWriteRes<Write64Bit, 2>;
def : HWVALUWriteRes<WriteIntMul, 4>;
def : HWVALUWriteRes<WriteFloatFMA, 16>;
@@ -205,6 +210,7 @@ def : HWVALUWriteRes<WriteDouble, 16>;
def : HWVALUWriteRes<WriteDoubleAdd, 8>;
def : HWVALUWriteRes<WriteDoubleCvt, 4>;
def : HWVALUWriteRes<WriteTrans64, 16>;
+} // End RetireOOO = 1
def : InstRW<[WriteCopy], (instrs COPY)>;
def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
@@ -218,6 +224,7 @@ let SchedModel = SIDPFullSpeedModel in {
defm : SICommonWriteRes;
+let RetireOOO = 1 in { // llvm-mca specific flag
def : HWVALUWriteRes<WriteFloatFMA, 1>;
def : HWVALUWriteRes<WriteDouble, 1>;
def : HWVALUWriteRes<WriteDoubleAdd, 1>;
@@ -225,6 +232,7 @@ def : HWVALUWriteRes<WriteDoubleCvt, 1>;
def : HWVALUWriteRes<WriteTrans64, 4>;
def : HWVALUWriteRes<WriteIntMul, 1>;
def : HWVALUWriteRes<Write64Bit, 1>;
+} // End RetireOOO = 1
def : InstRW<[WriteCopy], (instrs COPY)>;
def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
@@ -240,6 +248,7 @@ let SchedModel = GFX10SpeedModel in {
// The latency values are 1 / (operations / cycle).
// Add 1 stall cycle for VGPR read.
+let RetireOOO = 1 in { // llvm-mca specific flag
def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>;
def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>;
@@ -259,6 +268,7 @@ def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>;
def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>;
def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
+} // End RetireOOO = 1
def : InstRW<[WriteCopy], (instrs COPY)>;
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 45dd57ea1be4..3a372d4519fb 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -188,7 +188,7 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
return;
// eq/ne is special because the imm16 can be treated as signed or unsigned,
- // and initially selectd to the unsigned versions.
+ // and initially selected to the unsigned versions.
if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
bool HasUImm;
if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
@@ -810,6 +810,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// Copy extra operands not present in the instruction definition.
copyExtraImplicitOps(*Inst32, MF, MI);
+ // Copy deadness from the old explicit vcc def to the new implicit def.
+ if (SDst && SDst->isDead())
+ Inst32->findRegisterDefOperand(VCCReg)->setIsDead();
+
MI.eraseFromParent();
foldImmediates(*Inst32, TII, MRI);
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 38548eaf9478..6f63f686635a 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -1029,11 +1029,8 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
SmallVector<MachineInstr *, 4> SplitPoints;
char State = BI.InitialState;
- auto II = MBB.getFirstNonPHI(), IE = MBB.end();
- while (II != IE) {
- auto Next = std::next(II);
- MachineInstr &MI = *II;
-
+ for (MachineInstr &MI : llvm::make_early_inc_range(
+ llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
if (StateTransition.count(&MI))
State = StateTransition[&MI];
@@ -1051,8 +1048,6 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
}
if (SplitPoint)
SplitPoints.push_back(SplitPoint);
-
- II = Next;
}
// Perform splitting after instruction scan to simplify iteration.
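The loop rewrite above uses llvm::make_early_inc_range, which advances the iterator before handing the current element to the loop body so the body may erase that element. The same early-increment idiom shown standalone on a std::list, with no LLVM types involved:

#include <iostream>
#include <iterator>
#include <list>

int main() {
  std::list<int> Worklist = {1, 2, 3, 4, 5};
  for (auto It = Worklist.begin(), End = Worklist.end(); It != End;) {
    auto Next = std::next(It); // advance first, as make_early_inc_range does
    if (*It % 2 == 0)
      Worklist.erase(It);      // safe: erasing It does not invalidate Next
    It = Next;
  }
  for (int V : Worklist)
    std::cout << V << ' ';     // prints: 1 3 5
  std::cout << '\n';
}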
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index e9697017aac0..61ecc13620a1 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -265,6 +265,7 @@ def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">;
let isReturn = 1 in {
// Define variant marked as return rather than branch.
def S_SETPC_B64_return : SOP1_1<"", CCR_SGPR_64, [(AMDGPUret_flag i64:$src0)]>;
+def S_SETPC_B64_return_gfx : SOP1_1<"", Gfx_CCR_SGPR_64, [(AMDGPUret_gfx_flag i64:$src0)]>;
}
} // End isTerminator = 1, isBarrier = 1
@@ -517,9 +518,10 @@ let Uses = [SCC] in {
def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32",
[(set i32:$sdst, (SelectPat<select> i32:$src0, i32:$src1))]
>;
+ def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64",
+ [(set i64:$sdst, (SelectPat<select> i64:$src0, i64:$src1))]
+ >;
}
-
- def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">;
} // End Uses = [SCC]
let Defs = [SCC] in {
@@ -557,19 +559,19 @@ def S_XNOR_B64 : SOP2_64 <"s_xnor_b64",
>;
def S_NAND_B32 : SOP2_32 <"s_nand_b32",
- [(set i32:$sdst, (not (and_oneuse i32:$src0, i32:$src1)))]
+ [(set i32:$sdst, (UniformUnaryFrag<not> (and_oneuse i32:$src0, i32:$src1)))]
>;
def S_NAND_B64 : SOP2_64 <"s_nand_b64",
- [(set i64:$sdst, (not (and_oneuse i64:$src0, i64:$src1)))]
+ [(set i64:$sdst, (UniformUnaryFrag<not> (and_oneuse i64:$src0, i64:$src1)))]
>;
def S_NOR_B32 : SOP2_32 <"s_nor_b32",
- [(set i32:$sdst, (not (or_oneuse i32:$src0, i32:$src1)))]
+ [(set i32:$sdst, (UniformUnaryFrag<not> (or_oneuse i32:$src0, i32:$src1)))]
>;
def S_NOR_B64 : SOP2_64 <"s_nor_b64",
- [(set i64:$sdst, (not (or_oneuse i64:$src0, i64:$src1)))]
+ [(set i64:$sdst, (UniformUnaryFrag<not> (or_oneuse i64:$src0, i64:$src1)))]
>;
} // End isCommutable = 1
@@ -597,22 +599,22 @@ let AddedComplexity = 1 in {
let Defs = [SCC] in {
// TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3
def S_LSHL_B32 : SOP2_32 <"s_lshl_b32",
- [(set SReg_32:$sdst, (UniformBinFrag<shl> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
+ [(set SReg_32:$sdst, (UniformBinFrag<cshl_32> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64",
- [(set SReg_64:$sdst, (UniformBinFrag<shl> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
+ [(set SReg_64:$sdst, (UniformBinFrag<cshl_64> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_LSHR_B32 : SOP2_32 <"s_lshr_b32",
- [(set SReg_32:$sdst, (UniformBinFrag<srl> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
+ [(set SReg_32:$sdst, (UniformBinFrag<csrl_32> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64",
- [(set SReg_64:$sdst, (UniformBinFrag<srl> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
+ [(set SReg_64:$sdst, (UniformBinFrag<csrl_64> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_ASHR_I32 : SOP2_32 <"s_ashr_i32",
- [(set SReg_32:$sdst, (UniformBinFrag<sra> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
+ [(set SReg_32:$sdst, (UniformBinFrag<csra_32> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
- [(set SReg_64:$sdst, (UniformBinFrag<sra> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
+ [(set SReg_64:$sdst, (UniformBinFrag<csra_64> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
>;
} // End Defs = [SCC]
@@ -621,9 +623,8 @@ def S_BFM_B32 : SOP2_32 <"s_bfm_b32",
[(set i32:$sdst, (UniformBinFrag<AMDGPUbfm> i32:$src0, i32:$src1))]>;
def S_BFM_B64 : SOP2_64_32_32 <"s_bfm_b64">;
-// TODO: S_MUL_I32 require V_MUL_LO_I32 from VOP3 change
def S_MUL_I32 : SOP2_32 <"s_mul_i32",
- [(set i32:$sdst, (mul i32:$src0, i32:$src1))]> {
+ [(set i32:$sdst, (UniformBinFrag<mul> i32:$src0, i32:$src1))]> {
let isCommutable = 1;
}
} // End isReMaterializable = 1
@@ -713,7 +714,7 @@ class SOPK_Pseudo <string opName, dag outs, dag ins,
bits<1> has_sdst = 1;
}
-class SOPK_Real<bits<5> op, SOPK_Pseudo ps> :
+class SOPK_Real<SOPK_Pseudo ps> :
InstSI <ps.OutOperandList, ps.InOperandList,
ps.Mnemonic # " " # ps.AsmOperands, []> {
let SALU = 1;
@@ -739,7 +740,7 @@ class SOPK_Real<bits<5> op, SOPK_Pseudo ps> :
}
class SOPK_Real32<bits<5> op, SOPK_Pseudo ps> :
- SOPK_Real <op, ps>,
+ SOPK_Real <ps>,
Enc32 {
let Inst{15-0} = simm16;
let Inst{22-16} = !if(ps.has_sdst, sdst, ?);
@@ -748,7 +749,7 @@ class SOPK_Real32<bits<5> op, SOPK_Pseudo ps> :
}
class SOPK_Real64<bits<5> op, SOPK_Pseudo ps> :
- SOPK_Real<op, ps>,
+ SOPK_Real<ps>,
Enc64 {
let Inst{15-0} = simm16;
let Inst{22-16} = !if(ps.has_sdst, sdst, ?);
@@ -1107,7 +1108,7 @@ class SOPPRelaxTable <bit isRelaxed, string keyName, string gfxip> {
}
//spaces inserted in realname on instantiation of this record to allow s_endpgm to omit whitespace
-class SOPP_Real<bits<7> op, SOPP_Pseudo ps, string real_name = ps.Mnemonic> :
+class SOPP_Real<SOPP_Pseudo ps, string real_name = ps.Mnemonic> :
InstSI <ps.OutOperandList, ps.InOperandList,
real_name # ps.AsmOperands, []> {
let SALU = 1;
@@ -1127,14 +1128,14 @@ class SOPP_Real<bits<7> op, SOPP_Pseudo ps, string real_name = ps.Mnemonic> :
bits <16> simm16;
}
-class SOPP_Real_32 <bits<7> op, SOPP_Pseudo ps, string real_name = ps.Mnemonic> : SOPP_Real<op, ps, real_name>,
+class SOPP_Real_32 <bits<7> op, SOPP_Pseudo ps, string real_name = ps.Mnemonic> : SOPP_Real<ps, real_name>,
Enc32 {
let Inst{15-0} = !if(ps.fixed_imm, ps.simm16, simm16);
let Inst{22-16} = op;
let Inst{31-23} = 0x17f;
}
-class SOPP_Real_64 <bits<7> op, SOPP_Pseudo ps, string real_name = ps.Mnemonic> : SOPP_Real<op, ps, real_name>,
+class SOPP_Real_64 <bits<7> op, SOPP_Pseudo ps, string real_name = ps.Mnemonic> : SOPP_Real<ps, real_name>,
Enc64 {
// encoding
let Inst{15-0} = !if(ps.fixed_imm, ps.simm16, simm16);
diff --git a/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
index 9ec437760c0a..7573af597056 100644
--- a/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
@@ -11,7 +11,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/AMDGPUTargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 29bbf50cbfdc..9da7b9f5145d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -450,16 +450,16 @@ std::string AMDGPUTargetID::toString() const {
} else if (Processor == "gfx801") {
if (!isXnackOnOrAny())
report_fatal_error(
- "AMD GPU code object V2 does not support processor " + Processor +
- " without XNACK");
+ "AMD GPU code object V2 does not support processor " +
+ Twine(Processor) + " without XNACK");
} else if (Processor == "gfx802") {
} else if (Processor == "gfx803") {
} else if (Processor == "gfx805") {
} else if (Processor == "gfx810") {
if (!isXnackOnOrAny())
report_fatal_error(
- "AMD GPU code object V2 does not support processor " + Processor +
- " without XNACK");
+ "AMD GPU code object V2 does not support processor " +
+ Twine(Processor) + " without XNACK");
} else if (Processor == "gfx900") {
if (isXnackOnOrAny())
Processor = "gfx901";
@@ -475,11 +475,12 @@ std::string AMDGPUTargetID::toString() const {
} else if (Processor == "gfx90c") {
if (isXnackOnOrAny())
report_fatal_error(
- "AMD GPU code object V2 does not support processor " + Processor +
- " with XNACK being ON or ANY");
+ "AMD GPU code object V2 does not support processor " +
+ Twine(Processor) + " with XNACK being ON or ANY");
} else {
report_fatal_error(
- "AMD GPU code object V2 does not support processor " + Processor);
+ "AMD GPU code object V2 does not support processor " +
+ Twine(Processor));
}
break;
case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
@@ -671,7 +672,8 @@ unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed,
if (XNACKUsed)
ExtraSGPRs = 4;
- if (FlatScrUsed)
+ if (FlatScrUsed ||
+ STI->getFeatureBits().test(AMDGPU::FeatureArchitectedFlatScratch))
ExtraSGPRs = 6;
}
@@ -1572,8 +1574,10 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
unsigned OpType = Desc.OpInfo[OpNo].OperandType;
switch (OpType) {
case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
@@ -1825,8 +1829,8 @@ bool isArgPassedInSGPR(const Argument *A) {
case CallingConv::AMDGPU_Gfx:
// For non-compute shaders, SGPR inputs are marked with either inreg or byval.
// Everything else is in VGPRs.
- return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
- F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
+ return F->getAttributes().hasParamAttr(A->getArgNo(), Attribute::InReg) ||
+ F->getAttributes().hasParamAttr(A->getArgNo(), Attribute::ByVal);
default:
// TODO: Should calls support inreg for SGPR inputs?
return false;
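The report_fatal_error changes in AMDGPUBaseInfo.cpp above wrap Processor in a Twine so the whole message folds into one lazily concatenated Twine expression, matching the report_fatal_error(const Twine &) overload. A minimal sketch of that pattern, assuming a std::string value in place of the real Processor field:

#include "llvm/ADT/Twine.h"
#include "llvm/Support/ErrorHandling.h"
#include <string>

// Sketch only: Proc stands in for the Processor value used above.
static void rejectProcessor(const std::string &Proc) {
  // "literal" + Twine(Proc) + "literal" builds a single Twine, which is the
  // argument type report_fatal_error(const Twine &) expects.
  llvm::report_fatal_error("AMD GPU code object V2 does not support processor " +
                           llvm::Twine(Proc) + " without XNACK");
}

int main() {
  rejectProcessor("gfx801"); // aborts with the formatted message
}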
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 72c872dec5ba..061c74c0ace6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -292,9 +292,13 @@ struct MIMGBaseOpcodeInfo {
bool LodOrClampOrMip;
bool HasD16;
bool MSAA;
+ bool BVH;
};
LLVM_READONLY
+const MIMGBaseOpcodeInfo *getMIMGBaseOpcode(unsigned Opc);
+
+LLVM_READONLY
const MIMGBaseOpcodeInfo *getMIMGBaseOpcodeInfo(unsigned BaseOpcode);
struct MIMGDimInfo {
@@ -767,7 +771,7 @@ bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo);
/// Is this floating-point operand?
bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo);
-/// Does this opearnd support only inlinable literals?
+/// Does this operand support only inlinable literals?
bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo);
/// Get the size in bits of a register from the register class \p RC.
@@ -785,6 +789,7 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
switch (OpInfo.OperandType) {
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
@@ -793,6 +798,8 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
case AMDGPU::OPERAND_REG_IMM_V2FP32:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+ case AMDGPU::OPERAND_KIMM32:
+ case AMDGPU::OPERAND_KIMM16: // mandatory literal is always size 4
return 4;
case AMDGPU::OPERAND_REG_IMM_INT64:
@@ -804,6 +811,7 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
case AMDGPU::OPERAND_REG_IMM_INT16:
case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
index da8fcf3900bb..2e4d83fbbc39 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPULDSUtils.h"
+#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SetVector.h"
@@ -68,6 +69,11 @@ class CollectReachableCallees {
if (!VisitedCGNodes.insert(CGN).second)
continue;
+      // Ignore call graph nodes that have no associated function, or whose
+      // associated function is not a definition.
+ if (!CGN->getFunction() || CGN->getFunction()->isDeclaration())
+ continue;
+
for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) {
auto *RCB = cast<CallBase>(GI->first.getValue());
auto *RCGN = GI->second;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
index ffcafb9b76ce..d1c9229bc336 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -13,7 +13,6 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
-#include "AMDGPU.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Constants.h"
@@ -49,7 +48,7 @@ Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
/// as a use within some instruction (either from kernel or from non-kernel).
bool hasUserInstruction(const GlobalValue *GV);
-/// \returns true if an LDS global requres lowering to a module LDS structure
+/// \returns true if an LDS global requires lowering to a module LDS structure
/// if \p F is not given. If \p F is given it must be a kernel and function
/// \returns true if an LDS global is directly used from that kernel and it
/// is safe to replace its uses with a kernel LDS structure member.
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 35d5fe13ad30..48548d8b6722 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -860,16 +860,25 @@ defm V_CVT_NORM_U16_F16 : VOP1_Real_vi<0x4e>;
defm V_ACCVGPR_MOV_B32 : VOP1Only_Real_vi<0x52>;
+let VOP1 = 1, SubtargetPredicate = isGFX8GFX9, Uses = [EXEC, M0] in {
+
// Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
// indexing mode. vdst can't be treated as a def for codegen purposes,
// and an implicit use and def of the super register should be added.
-def V_MOV_B32_indirect : VPseudoInstSI<(outs),
+def V_MOV_B32_indirect_write : VPseudoInstSI<(outs),
(ins getVALUDstForVT<i32>.ret:$vdst, getVOPSrc0ForVT<i32>.ret:$src0)>,
PseudoInstExpansion<(V_MOV_B32_e32_vi getVALUDstForVT<i32>.ret:$vdst,
- getVOPSrc0ForVT<i32>.ret:$src0)> {
- let VOP1 = 1;
- let SubtargetPredicate = isGFX8GFX9;
-}
+ getVOPSrc0ForVT<i32>.ret:$src0)>;
+
+// Copy of v_mov_b32 for use with VGPR indexing mode. An implicit use of the
+// super register should be added.
+def V_MOV_B32_indirect_read : VPseudoInstSI<
+ (outs getVALUDstForVT<i32>.ret:$vdst),
+ (ins getVOPSrc0ForVT<i32>.ret:$src0)>,
+ PseudoInstExpansion<(V_MOV_B32_e32_vi getVALUDstForVT<i32>.ret:$vdst,
+ getVOPSrc0ForVT<i32>.ret:$src0)>;
+
+} // End VOP1 = 1, SubtargetPredicate = isGFX8GFX9, Uses = [M0]
let OtherPredicates = [isGFX8Plus] in {
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 7860b7e7f8a6..8d232ffe4114 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -154,8 +154,6 @@ multiclass VOP2Inst_e64<string opName,
multiclass VOP2Inst_sdwa<string opName,
VOPProfile P,
- SDPatternOperator node = null_frag,
- string revOp = opName,
bit GFX9Renamed = 0> {
let renamedInGFX9 = GFX9Renamed in {
foreach _ = BoolToList<P.HasExtSDWA>.ret in
@@ -170,7 +168,7 @@ multiclass VOP2Inst<string opName,
bit GFX9Renamed = 0> :
VOP2Inst_e32<opName, P, node, revOp, GFX9Renamed>,
VOP2Inst_e64<opName, P, node, revOp, GFX9Renamed>,
- VOP2Inst_sdwa<opName, P, node, revOp, GFX9Renamed> {
+ VOP2Inst_sdwa<opName, P, GFX9Renamed> {
let renamedInGFX9 = GFX9Renamed in {
foreach _ = BoolToList<P.HasExtDPP>.ret in
def _dpp : VOP2_DPP_Pseudo <opName, P>;
@@ -188,7 +186,7 @@ multiclass VOP2bInst <string opName,
let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in {
def _e32 : VOP2_Pseudo <opName, P, VOPPatOrNull<node,P>.ret>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)> {
- let usesCustomInserter = !eq(P.NumSrcArgs, 2);
+ let usesCustomInserter = true;
}
foreach _ = BoolToList<P.HasExtSDWA>.ret in
@@ -272,12 +270,11 @@ multiclass VOP2eInstAliases<VOP2_Pseudo ps, VOP2_Real inst> {
class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
field dag Ins32 = !if(!eq(vt.Size, 32),
- (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm),
- (ins VCSrc_f16:$src0, VGPR_32:$src1, ImmOpType:$imm));
+ (ins VSrc_f32_Deferred:$src0, VGPR_32:$src1, ImmOpType:$imm),
+ (ins VSrc_f16_Deferred:$src0, VGPR_32:$src1, ImmOpType:$imm));
+ field string Asm32 = "$vdst, $src0, $src1, $imm";
field bit HasExt = 0;
let IsSingle = 1;
-
- field string Asm32 = "$vdst, $src0, $src1, $imm";
}
def VOP_MADAK_F16 : VOP_MADAK <f16>;
@@ -285,11 +282,10 @@ def VOP_MADAK_F32 : VOP_MADAK <f32>;
class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
- field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1);
+ field dag Ins32 = (ins VSrc_f32_Deferred:$src0, ImmOpType:$imm, VGPR_32:$src1);
+ field string Asm32 = "$vdst, $src0, $imm, $src1";
field bit HasExt = 0;
let IsSingle = 1;
-
- field string Asm32 = "$vdst, $src0, $imm, $src1";
}
def VOP_MADMK_F16 : VOP_MADMK <f16>;
@@ -496,18 +492,18 @@ defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub
defm V_MUL_LEGACY_F32 : VOP2Inst <"v_mul_legacy_f32", VOP_F32_F32_F32, AMDGPUfmul_legacy>;
defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, any_fmul>;
defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32_ARITH, AMDGPUmul_i24>;
-defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_i24>;
+defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_I32_I32_I32, AMDGPUmulhi_i24>;
defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32_ARITH, AMDGPUmul_u24>;
-defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_u24>;
+defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_I32_I32_I32, AMDGPUmulhi_u24>;
defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum_like>;
defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum_like>;
defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smin>;
defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>;
defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>;
defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umax>;
-defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, lshr_rev, "v_lshr_b32">;
-defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, ashr_rev, "v_ashr_i32">;
-defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, lshl_rev, "v_lshl_b32">;
+defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, clshr_rev_32, "v_lshr_b32">;
+defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, cashr_rev_32, "v_ashr_i32">;
+defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, clshl_rev_32, "v_lshl_b32">;
defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>;
defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>;
defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
@@ -582,9 +578,9 @@ defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfma
let isCommutable = 1 in {
let SubtargetPredicate = isGFX6GFX7 in {
-defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, srl>;
-defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>;
-defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>;
+defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, csrl_32>;
+defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, csra_32>;
+defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, cshl_32>;
} // End SubtargetPredicate = isGFX6GFX7
} // End isCommutable = 1
} // End isReMaterializable = 1
@@ -609,9 +605,9 @@ class DivergentClampingBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
)
>;
-def : DivergentBinOp<srl, V_LSHRREV_B32_e64>;
-def : DivergentBinOp<sra, V_ASHRREV_I32_e64>;
-def : DivergentBinOp<shl, V_LSHLREV_B32_e64>;
+def : DivergentBinOp<csrl_32, V_LSHRREV_B32_e64>;
+def : DivergentBinOp<csra_32, V_ASHRREV_I32_e64>;
+def : DivergentBinOp<cshl_32, V_LSHLREV_B32_e64>;
let SubtargetPredicate = HasAddNoCarryInsts in {
def : DivergentClampingBinOp<add, V_ADD_U32_e64>;
@@ -652,9 +648,9 @@ def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
} // End FPDPRounding = 1
-defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16, lshl_rev>;
-defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16, lshr_rev>;
-defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16, ashr_rev>;
+defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16, clshl_rev_16>;
+defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16, clshr_rev_16>;
+defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16, cashr_rev_16>;
let isCommutable = 1 in {
let FPDPRounding = 1 in {
@@ -856,9 +852,9 @@ defm : Arithmetic_i16_0Hi_Pats<smin, V_MIN_I16_e64>;
defm : Arithmetic_i16_0Hi_Pats<smax, V_MAX_I16_e64>;
defm : Arithmetic_i16_0Hi_Pats<umin, V_MIN_U16_e64>;
defm : Arithmetic_i16_0Hi_Pats<umax, V_MAX_U16_e64>;
-defm : Arithmetic_i16_0Hi_Pats<lshl_rev, V_LSHLREV_B16_e64>;
-defm : Arithmetic_i16_0Hi_Pats<lshr_rev, V_LSHRREV_B16_e64>;
-defm : Arithmetic_i16_0Hi_Pats<ashr_rev, V_ASHRREV_I16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<clshl_rev_16, V_LSHLREV_B16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<clshr_rev_16, V_LSHRREV_B16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<cashr_rev_16, V_ASHRREV_I16_e64>;
} // End Predicates = [Has16BitInsts, isGFX7GFX8GFX9]
def : ZExt_i16_i1_Pat<zext>;
@@ -927,7 +923,7 @@ class VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX10>;
class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps,
- string opName = ps.OpName, VOPProfile p = ps.Pfl> :
+ VOPProfile p = ps.Pfl> :
VOP_DPP8<ps.OpName, p> {
let hasSideEffects = ps.hasSideEffects;
let Defs = ps.Defs;
@@ -1123,14 +1119,14 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
multiclass VOP2be_Real_dpp8_gfx10<bits<6> op, string opName, string asmName> {
foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
def _dpp8_gfx10 :
- VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
let AsmString = asmName # !subst(", vcc", "", AsmDPP8);
let DecoderNamespace = "DPP8";
}
foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
def _dpp8_w32_gfx10 :
- VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8);
let isAsmParserOnly = 1;
@@ -1138,7 +1134,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
}
foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
def _dpp8_w64_gfx10 :
- VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
let AsmString = asmName # AsmDPP8;
let isAsmParserOnly = 1;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index ee3b87f487d0..494e3aeb6d55 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -120,11 +120,11 @@ class getVOP3MAIPat<VOPProfile P, SDPatternOperator node> {
}
// Consistently gives instructions a _e64 suffix.
-multiclass VOP3Inst_Pseudo_Wrapper<string opName, VOPProfile P, list<dag> pattern = [], bit VOP3Only = 0> {
- def _e64 : VOP3_Pseudo<opName, P, pattern, VOP3Only>;
+multiclass VOP3Inst_Pseudo_Wrapper<string opName, VOPProfile P, list<dag> pattern = []> {
+ def _e64 : VOP3_Pseudo<opName, P, pattern>;
}
-class VOP3InstBase<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit VOP3Only = 0> :
+class VOP3InstBase<string OpName, VOPProfile P, SDPatternOperator node = null_frag> :
VOP3_Pseudo<OpName, P,
!if(P.HasOpSel,
!if(P.HasModifiers,
@@ -137,7 +137,7 @@ class VOP3InstBase<string OpName, VOPProfile P, SDPatternOperator node = null_fr
!if (P.IsMAI,
getVOP3MAIPat<P, node>.ret,
getVOP3Pat<P, node>.ret)))),
- VOP3Only, 0, P.HasOpSel> {
+ 0, P.HasOpSel> {
let IntClamp = P.HasIntClamp;
let AsmMatchConverter =
@@ -148,8 +148,8 @@ class VOP3InstBase<string OpName, VOPProfile P, SDPatternOperator node = null_fr
""));
}
-multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit VOP3Only = 0> {
- def _e64 : VOP3InstBase<OpName, P, node, VOP3Only>;
+multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> {
+ def _e64 : VOP3InstBase<OpName, P, node>;
}
// Special case for v_div_fmas_{f32|f64}, since it seems to be the
@@ -296,15 +296,15 @@ defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_a
let SchedRW = [WriteDoubleAdd] in {
let FPDPRounding = 1 in {
defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, any_fma>;
-defm V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fadd, 1>;
-defm V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
+defm V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fadd>;
+defm V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul>;
} // End FPDPRounding = 1
-defm V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like, 1>;
-defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>;
+defm V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like>;
+defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like>;
} // End SchedRW = [WriteDoubleAdd]
let SchedRW = [WriteIntMul] in {
-defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, mul>;
+defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, DivergentBinFrag<mul>>;
defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>;
defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
@@ -371,18 +371,18 @@ defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32
let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
defm V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>;
- defm V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>;
+ defm V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp>;
} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
} // End isReMaterializable = 1
let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it does.
let SchedRW = [WriteFloatFMA, WriteSALU] in
- defm V_DIV_SCALE_F32 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> ;
+ defm V_DIV_SCALE_F32 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32> ;
// Double precision division pre-scale.
let SchedRW = [WriteDouble, WriteSALU], FPDPRounding = 1 in
- defm V_DIV_SCALE_F64 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1>;
+ defm V_DIV_SCALE_F64 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64>;
} // End mayRaiseFPException = 0
let isReMaterializable = 1 in
@@ -400,15 +400,15 @@ defm V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I
let SchedRW = [Write64Bit] in {
let SubtargetPredicate = isGFX6GFX7 in {
- defm V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, shl>;
- defm V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, srl>;
- defm V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, sra>;
+ defm V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, cshl_64>;
+ defm V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, csrl_64>;
+ defm V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, csra_64>;
} // End SubtargetPredicate = isGFX6GFX7
let SubtargetPredicate = isGFX8Plus in {
- defm V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshl_rev>;
- defm V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshr_rev>;
- defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, ashr_rev>;
+ defm V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshl_rev_64>;
+ defm V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshr_rev_64>;
+ defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, cashr_rev_64>;
} // End SubtargetPredicate = isGFX8Plus
} // End SchedRW = [Write64Bit]
} // End isReMaterializable = 1
@@ -528,7 +528,7 @@ def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in {
multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
- Instruction inst, SDPatternOperator op3> {
+ Instruction inst> {
def : GCNPat <
(op2 (op1 i16:$src0, i16:$src1), i16:$src2),
(inst i16:$src0, i16:$src1, i16:$src2, (i1 0))
@@ -536,15 +536,15 @@ def : GCNPat <
}
-defm: Ternary_i16_Pats<mul, add, V_MAD_U16_e64, zext>;
-defm: Ternary_i16_Pats<mul, add, V_MAD_I16_e64, sext>;
+defm: Ternary_i16_Pats<mul, add, V_MAD_U16_e64>;
+defm: Ternary_i16_Pats<mul, add, V_MAD_I16_e64>;
} // End Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9]
let Predicates = [Has16BitInsts, isGFX10Plus] in {
multiclass Ternary_i16_Pats_gfx9<SDPatternOperator op1, SDPatternOperator op2,
- Instruction inst, SDPatternOperator op3> {
+ Instruction inst> {
def : GCNPat <
(op2 (op1 i16:$src0, i16:$src1), i16:$src2),
(inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
@@ -552,8 +552,8 @@ def : GCNPat <
}
-defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64, zext>;
-defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_I16_gfx9_e64, sext>;
+defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64>;
+defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_I16_gfx9_e64>;
} // End Predicates = [Has16BitInsts, isGFX10Plus]
@@ -656,10 +656,10 @@ class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instructio
(inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
>;
-def : ThreeOp_i32_Pats<shl, add, V_LSHL_ADD_U32_e64>;
-def : ThreeOp_i32_Pats<add, shl, V_ADD_LSHL_U32_e64>;
+def : ThreeOp_i32_Pats<cshl_32, add, V_LSHL_ADD_U32_e64>;
+def : ThreeOp_i32_Pats<add, cshl_32, V_ADD_LSHL_U32_e64>;
def : ThreeOp_i32_Pats<add, add, V_ADD3_U32_e64>;
-def : ThreeOp_i32_Pats<shl, or, V_LSHL_OR_B32_e64>;
+def : ThreeOp_i32_Pats<cshl_32, or, V_LSHL_OR_B32_e64>;
def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
@@ -667,6 +667,14 @@ def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;
def : VOPBinOpClampPat<ssubsat, V_SUB_I32_e64, i32>;
+def : GCNPat<(getDivergentFrag<or>.ret (or_oneuse i64:$src0, i64:$src1), i64:$src2),
+ (REG_SEQUENCE VReg_64,
+ (V_OR3_B32_e64 (i32 (EXTRACT_SUBREG $src0, sub0)),
+ (i32 (EXTRACT_SUBREG $src1, sub0)),
+ (i32 (EXTRACT_SUBREG $src2, sub0))), sub0,
+ (V_OR3_B32_e64 (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (i32 (EXTRACT_SUBREG $src1, sub1)),
+ (i32 (EXTRACT_SUBREG $src2, sub1))), sub1)>;
// FIXME: Probably should hardcode clamp bit in pseudo and avoid this.
class OpSelBinOpClampPat<SDPatternOperator node,
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 48f5eb1dc272..32222b3eb93c 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -50,8 +50,7 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
// Non-packed instructions that use the VOP3P encoding.
// VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed.
-multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P,
- SDPatternOperator node = null_frag> {
+multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
def NAME : VOP3P_Pseudo<OpName, P> {
let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", "");
let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", "");
@@ -83,9 +82,9 @@ defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16
defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
-defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>;
-defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
-defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
+defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, clshl_rev_16>;
+defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, cashr_rev_16>;
+defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, clshr_rev_16>;
let SubtargetPredicate = HasVOP3PInsts in {
@@ -113,7 +112,6 @@ def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>;
} // End SubtargetPredicate = HasVOP3PInsts
multiclass MadFmaMixPats<SDPatternOperator fma_like,
- Instruction mix_inst,
Instruction mixlo_inst,
Instruction mixhi_inst> {
def : GCNPat <
@@ -192,7 +190,7 @@ defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile<VOP_F
} // End FPDPRounding = 1
}
-defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
+defm : MadFmaMixPats<fmad, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
} // End SubtargetPredicate = HasMadMixInsts
@@ -211,7 +209,7 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F
} // End FPDPRounding = 1
}
-defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
+defm : MadFmaMixPats<fma, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
}
// Defines patterns that extract signed 4bit from each Idx[0].
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 5f6f664ea3e7..a3eccf13cd71 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -57,8 +57,7 @@ class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins,
}
class VOP3Common <dag outs, dag ins, string asm = "",
- list<dag> pattern = [], bit HasMods = 0,
- bit VOP3Only = 0> :
+ list<dag> pattern = [], bit HasMods = 0> :
VOPAnyCommon <outs, ins, asm, pattern> {
// Using complex patterns gives VOP3 patterns a very high complexity rating,
@@ -83,7 +82,7 @@ class VOP3Common <dag outs, dag ins, string asm = "",
}
class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
- bit VOP3Only = 0, bit isVOP3P = 0, bit isVop3OpSel = 0> :
+ bit isVOP3P = 0, bit isVop3OpSel = 0> :
VOP_Pseudo <opName, "_e64", P, P.Outs64,
!if(isVop3OpSel,
P.InsVOP3OpSel,
@@ -136,7 +135,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
}
class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
- VOP3_Pseudo<opName, P, pattern, 1, 1> {
+ VOP3_Pseudo<opName, P, pattern, 1> {
let VOP3P = 1;
}
@@ -760,10 +759,11 @@ class getNumNodeArgs<SDPatternOperator Op> {
int ret = TP.NumOperands;
}
-
class getDivergentFrag<SDPatternOperator Op> {
+ assert !or(!isa<SDNode>(Op), !isa<PatFrags>(Op)), "Expected SDNode or PatFrags";
- int NumSrcArgs = getNumNodeArgs<Op>.ret;
+ int NumSrcArgs = !if(!isa<SDNode>(Op), getNumNodeArgs<Op>.ret,
+ !size(!cast<PatFrags>(Op).Operands));
PatFrag ret = PatFrag <
!if(!eq(NumSrcArgs, 1),
(ops node:$src0),
diff --git a/llvm/lib/Target/ARC/ARCAsmPrinter.cpp b/llvm/lib/Target/ARC/ARCAsmPrinter.cpp
index 025b920ff7b4..0390c01eecb1 100644
--- a/llvm/lib/Target/ARC/ARCAsmPrinter.cpp
+++ b/llvm/lib/Target/ARC/ARCAsmPrinter.cpp
@@ -22,7 +22,7 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/llvm/lib/Target/ARC/ARCExpandPseudos.cpp b/llvm/lib/Target/ARC/ARCExpandPseudos.cpp
index a1646d17605f..84bb6cac2876 100644
--- a/llvm/lib/Target/ARC/ARCExpandPseudos.cpp
+++ b/llvm/lib/Target/ARC/ARCExpandPseudos.cpp
@@ -13,6 +13,7 @@
#include "ARCInstrInfo.h"
#include "ARCRegisterInfo.h"
#include "ARCSubtarget.h"
+#include "MCTargetDesc/ARCInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -34,7 +35,9 @@ public:
StringRef getPassName() const override { return "ARC Expand Pseudos"; }
private:
- void ExpandStore(MachineFunction &, MachineBasicBlock::iterator);
+ void expandStore(MachineFunction &, MachineBasicBlock::iterator);
+ void expandCTLZ(MachineFunction &, MachineBasicBlock::iterator);
+ void expandCTTZ(MachineFunction &, MachineBasicBlock::iterator);
const ARCInstrInfo *TII;
};
@@ -56,11 +59,11 @@ static unsigned getMappedOp(unsigned PseudoOp) {
}
}
-void ARCExpandPseudos::ExpandStore(MachineFunction &MF,
+void ARCExpandPseudos::expandStore(MachineFunction &MF,
MachineBasicBlock::iterator SII) {
MachineInstr &SI = *SII;
- unsigned AddrReg = MF.getRegInfo().createVirtualRegister(&ARC::GPR32RegClass);
- unsigned AddOpc =
+ Register AddrReg = MF.getRegInfo().createVirtualRegister(&ARC::GPR32RegClass);
+ Register AddOpc =
isUInt<6>(SI.getOperand(2).getImm()) ? ARC::ADD_rru6 : ARC::ADD_rrlimm;
BuildMI(*SI.getParent(), SI, SI.getDebugLoc(), TII->get(AddOpc), AddrReg)
.addReg(SI.getOperand(1).getReg())
@@ -73,10 +76,62 @@ void ARCExpandPseudos::ExpandStore(MachineFunction &MF,
SI.eraseFromParent();
}
+void ARCExpandPseudos::expandCTLZ(MachineFunction &MF,
+ MachineBasicBlock::iterator MII) {
+ // Expand:
+ // %R2<def> = CTLZ %R0, %STATUS<imp-def>
+ // To:
+ // %R2<def> = FLS_f_rr %R0, %STATUS<imp-def>
+ // %R2<def,tied1> = MOV_cc_ru6 %R2<tied0>, 32, pred:1, %STATUS<imp-use>
+ // %R2<def,tied1> = RSUB_cc_rru6 %R2<tied0>, 31, pred:2, %STATUS<imp-use>
+ MachineInstr &MI = *MII;
+ const MachineOperand &Dest = MI.getOperand(0);
+ const MachineOperand &Src = MI.getOperand(1);
+ Register Ra = MF.getRegInfo().createVirtualRegister(&ARC::GPR32RegClass);
+ Register Rb = MF.getRegInfo().createVirtualRegister(&ARC::GPR32RegClass);
+
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(ARC::FLS_f_rr), Ra)
+ .add(Src);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(ARC::MOV_cc_ru6), Rb)
+ .addImm(32)
+ .addImm(ARCCC::EQ)
+ .addReg(Ra);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(ARC::RSUB_cc_rru6))
+ .add(Dest)
+ .addImm(31)
+ .addImm(ARCCC::NE)
+ .addReg(Rb);
+
+ MI.eraseFromParent();
+}
+
+void ARCExpandPseudos::expandCTTZ(MachineFunction &MF,
+ MachineBasicBlock::iterator MII) {
+ // Expand:
+ // %R0<def> = CTTZ %R0<kill>, %STATUS<imp-def>
+ // To:
+ // %R0<def> = FFS_f_rr %R0<kill>, %STATUS<imp-def>
+ // %R0<def,tied1> = MOV_cc_ru6 %R0<tied0>, 32, pred:1, %STATUS<imp-use>
+ MachineInstr &MI = *MII;
+ const MachineOperand &Dest = MI.getOperand(0);
+ const MachineOperand &Src = MI.getOperand(1);
+ Register R = MF.getRegInfo().createVirtualRegister(&ARC::GPR32RegClass);
+
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(ARC::FFS_f_rr), R)
+ .add(Src);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(ARC::MOV_cc_ru6))
+ .add(Dest)
+ .addImm(32)
+ .addImm(ARCCC::EQ)
+ .addReg(R);
+
+ MI.eraseFromParent();
+}
+
bool ARCExpandPseudos::runOnMachineFunction(MachineFunction &MF) {
const ARCSubtarget *STI = &MF.getSubtarget<ARCSubtarget>();
TII = STI->getInstrInfo();
- bool ExpandedStore = false;
+ bool Expanded = false;
for (auto &MBB : MF) {
MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
while (MBBI != E) {
@@ -85,8 +140,16 @@ bool ARCExpandPseudos::runOnMachineFunction(MachineFunction &MF) {
case ARC::ST_FAR:
case ARC::STH_FAR:
case ARC::STB_FAR:
- ExpandStore(MF, MBBI);
- ExpandedStore = true;
+ expandStore(MF, MBBI);
+ Expanded = true;
+ break;
+ case ARC::CTLZ:
+ expandCTLZ(MF, MBBI);
+ Expanded = true;
+ break;
+ case ARC::CTTZ:
+ expandCTTZ(MF, MBBI);
+ Expanded = true;
break;
default:
break;
@@ -94,7 +157,7 @@ bool ARCExpandPseudos::runOnMachineFunction(MachineFunction &MF) {
MBBI = NMBBI;
}
}
- return ExpandedStore;
+ return Expanded;
}
FunctionPass *llvm::createARCExpandPseudosPass() {
diff --git a/llvm/lib/Target/ARC/ARCISelLowering.cpp b/llvm/lib/Target/ARC/ARCISelLowering.cpp
index ca33f5297471..7fd08f70ea3b 100644
--- a/llvm/lib/Target/ARC/ARCISelLowering.cpp
+++ b/llvm/lib/Target/ARC/ARCISelLowering.cpp
@@ -68,6 +68,31 @@ static ARCCC::CondCode ISDCCtoARCCC(ISD::CondCode isdCC) {
}
}
+void ARCTargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ LLVM_DEBUG(dbgs() << "[ARC-ISEL] ReplaceNodeResults ");
+ LLVM_DEBUG(N->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "; use_count=" << N->use_size() << "\n");
+
+ switch (N->getOpcode()) {
+ case ISD::READCYCLECOUNTER:
+ if (N->getValueType(0) == MVT::i64) {
+ // We read TIMER0 and zero-extend it to 64 bits, as the intrinsic
+ // requires.
+ SDValue V =
+ DAG.getNode(ISD::READCYCLECOUNTER, SDLoc(N),
+ DAG.getVTList(MVT::i32, MVT::Other), N->getOperand(0));
+ SDValue Op = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i64, V);
+ Results.push_back(Op);
+ Results.push_back(V.getValue(1));
+ }
+ break;
+ default:
+ break;
+ }
+}
+
ARCTargetLowering::ARCTargetLowering(const TargetMachine &TM,
const ARCSubtarget &Subtarget)
: TargetLowering(TM), Subtarget(Subtarget) {
@@ -96,6 +121,11 @@ ARCTargetLowering::ARCTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SMAX, MVT::i32, Legal);
setOperationAction(ISD::SMIN, MVT::i32, Legal);
+ setOperationAction(ISD::ADDC, MVT::i32, Legal);
+ setOperationAction(ISD::ADDE, MVT::i32, Legal);
+ setOperationAction(ISD::SUBC, MVT::i32, Legal);
+ setOperationAction(ISD::SUBE, MVT::i32, Legal);
+
// Need barrel shifter.
setOperationAction(ISD::SHL, MVT::i32, Legal);
setOperationAction(ISD::SRA, MVT::i32, Legal);
@@ -135,6 +165,15 @@ ARCTargetLowering::ARCTargetLowering(const TargetMachine &TM,
// Sign extend inreg
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
+
+ // TODO: Predicate these with `options.hasBitScan() ? Legal : Expand`
+ // when the HasBitScan predicate is available.
+ setOperationAction(ISD::CTLZ, MVT::i32, Legal);
+ setOperationAction(ISD::CTTZ, MVT::i32, Legal);
+
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i32, Legal);
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64,
+ isTypeLegal(MVT::i64) ? Legal : Custom);
}
const char *ARCTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -495,7 +534,7 @@ SDValue ARCTargetLowering::LowerCallArguments(
CFRegNode.push_back(ArgIn.getValue(ArgIn->getNumValues() - 1));
}
} else {
- // sanity check
+ // Only arguments passed on the stack should make it here.
assert(VA.isMemLoc());
// Load the argument to a virtual register
unsigned ObjSize = VA.getLocVT().getStoreSize();
@@ -761,6 +800,13 @@ SDValue ARCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerJumpTable(Op, DAG);
case ISD::VASTART:
return LowerVASTART(Op, DAG);
+ case ISD::READCYCLECOUNTER:
+ // As of LLVM 3.8, the lowering code insists that we customize it even
+ // though we've declared the i32 version as legal. This is because it only
+ // thinks i64 is the truly supported version. We've already converted the
+ // i64 version to a widened i32.
+ assert(Op.getSimpleValueType() == MVT::i32);
+ return Op;
default:
llvm_unreachable("unimplemented operand");
}
diff --git a/llvm/lib/Target/ARC/ARCISelLowering.h b/llvm/lib/Target/ARC/ARCISelLowering.h
index 4b72bfdaee9c..e070ed8752cc 100644
--- a/llvm/lib/Target/ARC/ARCISelLowering.h
+++ b/llvm/lib/Target/ARC/ARCISelLowering.h
@@ -77,6 +77,9 @@ public:
private:
const ARCSubtarget &Subtarget;
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
// Lower Operand helpers
SDValue LowerCallArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
diff --git a/llvm/lib/Target/ARC/ARCInstrFormats.td b/llvm/lib/Target/ARC/ARCInstrFormats.td
index 5f539c92c745..2a109cc0f764 100644
--- a/llvm/lib/Target/ARC/ARCInstrFormats.td
+++ b/llvm/lib/Target/ARC/ARCInstrFormats.td
@@ -261,32 +261,6 @@ class F32_SOP_RR<bits<5> major, bits<6> subop, bit F, dag outs, dag ins,
let Inst{5-0} = subop;
}
-// Single Operand Immediate Instructions.
-// 1-register, unsigned 6-bit immediate Single Operand instruction with
-// condition code.
-// |26|25|24|23|22|21|20|19|18|17|16|15|14|13|12|11|10|9|8|7|6|5|4|3|2|1|0|
-// |B[2-0] | 1| 1| subop| F|B[5-3] |U6 |1|cc |
-class F32_SOP_CC_RU6<bits<5> major, bits<6> subop, bit F, dag outs, dag ins,
- string asmstr, list<dag> pattern> :
- InstARC<4, outs, ins, asmstr, pattern> {
-
- bits<5> cc;
- bits<6> U6;
- bits<6> B;
-
- let Inst{31-27} = major;
- let Inst{26-24} = B{2-0};
- let Inst{23-22} = 0b11;
- let Inst{21-16} = subop;
- let Inst{15} = F;
- let Inst{14-12} = B{5-3};
- let Inst{11-6} = U6;
- let Inst{5} = 1;
- let Inst{4-0} = cc;
-
- let DecoderMethod = "DecodeCCRU6Instruction";
-}
-
// Dual Operand Instructions. Inst[21-16] specifies the specific operation
// for this format.
@@ -353,6 +327,31 @@ class F32_DOP_RU6<bits<5> major, bits<6> subop, bit F, dag outs, dag ins,
let Inst{5-0} = A;
}
+// 1-register, unsigned 6-bit immediate Dual Operand instruction with
+// condition code.
+// |26|25|24|23|22|21|20|19|18|17|16|15|14|13|12|11|10|9|8|7|6|5|4|3|2|1|0|
+// |B[2-0] | 1| 1| subop| F|B[5-3] |U6 |1|cc |
+class F32_DOP_CC_RU6<bits<5> major, bits<6> subop, bit F, dag outs, dag ins,
+ string asmstr, list<dag> pattern> :
+ InstARC<4, outs, ins, asmstr, pattern> {
+
+ bits<5> cc;
+ bits<6> U6;
+ bits<6> B;
+
+ let Inst{31-27} = major;
+ let Inst{26-24} = B{2-0};
+ let Inst{23-22} = 0b11;
+ let Inst{21-16} = subop;
+ let Inst{15} = F;
+ let Inst{14-12} = B{5-3};
+ let Inst{11-6} = U6;
+ let Inst{5} = 1;
+ let Inst{4-0} = cc;
+
+ let DecoderMethod = "DecodeCCRU6Instruction";
+}
+
// 2-register, unsigned 6-bit immediate Dual Operand instruction with
// condition code. This instruction uses B as the first 2 operands
// (i.e, add.cc B, B, u6).
@@ -364,7 +363,6 @@ class F32_DOP_CC_RRU6<bits<5> major, bits<6> subop, bit F, dag outs, dag ins,
bits<5> cc;
bits<6> U6;
bits<6> B;
- bits<6> A;
let Inst{31-27} = major;
let Inst{26-24} = B{2-0};
@@ -397,6 +395,50 @@ class F32_DOP_RS12<bits<5> major, bits<6> subop, bit F, dag outs, dag ins,
let Inst{5-0} = S12{11-6};
}
+// 1-register, signed 12-bit immediate Dual Operand instruction.
+// This instruction uses B as the first operand (e.g., lr B, [%count0]).
+// |26|25|24|23|22|21|20|19|18|17|16|15|14|13|12|11|10|9|8|7|6|5|4|3|2|1|0|
+// |B[2-0] | 1| 0| subop| F|B[5-3] |S12[5-0] |S12[11-6] |
+class F32_SOP_RS12<bits<5> major, bits<6> subop, bit F, dag outs, dag ins,
+ string asmstr, list<dag> pattern> :
+ InstARC<4, outs, ins, asmstr, pattern> {
+ bits<6> B;
+ bits<12> S12;
+
+ let Inst{31-27} = major;
+ let Inst{26-24} = B{2-0};
+ let Inst{23-22} = 0b10;
+ let Inst{21-16} = subop;
+ let Inst{15} = F;
+ let Inst{14-12} = B{5-3};
+ let Inst{11-6} = S12{5-0};
+ let Inst{5-0} = S12{11-6};
+
+ let DecoderMethod = "DecodeSOPwithRS12";
+}
+
+// 1-register, unsigned 6-bit immediate Dual Operand instruction.
+// This instruction uses B as the first operand.
+// |26|25|24|23|22|21|20|19|18|17|16|15|14|13|12|11|10|9|8|7|6|5|4|3|2|1|0|
+// |B[2-0] | 0| 1| subop| F|B[5-3] |U6 |0|0|0|0|0|0|
+class F32_SOP_RU6<bits<5> major, bits<6> subop, bit F, dag outs, dag ins,
+ string asmstr, list<dag> pattern> :
+ InstARC<4, outs, ins, asmstr, pattern> {
+ bits<6> B;
+ bits<6> U6;
+
+ let Inst{31-27} = major;
+ let Inst{26-24} = B{2-0};
+ let Inst{23-22} = 0b01;
+ let Inst{21-16} = subop;
+ let Inst{15} = F;
+ let Inst{14-12} = B{5-3};
+ let Inst{11-6} = U6;
+ let Inst{5-0} = 0;
+
+ let DecoderMethod = "DecodeSOPwithRU6";
+}
+
// 2-register, 32-bit immediate (LImm) Dual Operand instruction.
// This instruction has the 32-bit immediate in bits 32-63, and
// 62 in the C register operand slot, but is otherwise F32_DOP_RR.
diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.cpp b/llvm/lib/Target/ARC/ARCInstrInfo.cpp
index 527f239c2643..6e8190ee7209 100644
--- a/llvm/lib/Target/ARC/ARCInstrInfo.cpp
+++ b/llvm/lib/Target/ARC/ARCInstrInfo.cpp
@@ -18,8 +18,8 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -43,8 +43,8 @@ enum TSFlagsConstants {
// Pin the vtable to this file.
void ARCInstrInfo::anchor() {}
-ARCInstrInfo::ARCInstrInfo()
- : ARCGenInstrInfo(ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), RI() {}
+ARCInstrInfo::ARCInstrInfo(const ARCSubtarget &ST)
+ : ARCGenInstrInfo(ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), RI(ST) {}
static bool isZeroImm(const MachineOperand &Op) {
return Op.isImm() && Op.getImm() == 0;
@@ -99,7 +99,7 @@ unsigned ARCInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
}
/// Return the inverse of passed condition, i.e. turning COND_E to COND_NE.
-static ARCCC::CondCode GetOppositeBranchCondition(ARCCC::CondCode CC) {
+static ARCCC::CondCode getOppositeBranchCondition(ARCCC::CondCode CC) {
switch (CC) {
default:
llvm_unreachable("Illegal condition code!");
@@ -280,23 +280,23 @@ unsigned ARCInstrInfo::removeBranch(MachineBasicBlock &MBB,
void ARCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- const DebugLoc &dl, MCRegister DestReg,
+ const DebugLoc &DL, MCRegister DestReg,
MCRegister SrcReg, bool KillSrc) const {
assert(ARC::GPR32RegClass.contains(SrcReg) &&
"Only GPR32 src copy supported.");
assert(ARC::GPR32RegClass.contains(DestReg) &&
"Only GPR32 dest copy supported.");
- BuildMI(MBB, I, dl, get(ARC::MOV_rr), DestReg)
+ BuildMI(MBB, I, DL, get(ARC::MOV_rr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
}
void ARCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- Register SrcReg, bool isKill,
+ Register SrcReg, bool IsKill,
int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- DebugLoc dl = MBB.findDebugLoc(I);
+ DebugLoc DL = MBB.findDebugLoc(I);
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -312,8 +312,8 @@ void ARCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
"Only support GPR32 stores to stack now.");
LLVM_DEBUG(dbgs() << "Created store reg=" << printReg(SrcReg, TRI)
<< " to FrameIndex=" << FrameIndex << "\n");
- BuildMI(MBB, I, dl, get(ARC::ST_rs9))
- .addReg(SrcReg, getKillRegState(isKill))
+ BuildMI(MBB, I, DL, get(ARC::ST_rs9))
+ .addReg(SrcReg, getKillRegState(IsKill))
.addFrameIndex(FrameIndex)
.addImm(0)
.addMemOperand(MMO);
@@ -324,7 +324,7 @@ void ARCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- DebugLoc dl = MBB.findDebugLoc(I);
+ DebugLoc DL = MBB.findDebugLoc(I);
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineMemOperand *MMO = MF.getMachineMemOperand(
@@ -339,7 +339,7 @@ void ARCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
"Only support GPR32 stores to stack now.");
LLVM_DEBUG(dbgs() << "Created load reg=" << printReg(DestReg, TRI)
<< " from FrameIndex=" << FrameIndex << "\n");
- BuildMI(MBB, I, dl, get(ARC::LD_rs9))
+ BuildMI(MBB, I, DL, get(ARC::LD_rs9))
.addReg(DestReg, RegState::Define)
.addFrameIndex(FrameIndex)
.addImm(0)
@@ -350,7 +350,7 @@ void ARCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
bool ARCInstrInfo::reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const {
assert((Cond.size() == 3) && "Invalid ARC branch condition!");
- Cond[2].setImm(GetOppositeBranchCondition((ARCCC::CondCode)Cond[2].getImm()));
+ Cond[2].setImm(getOppositeBranchCondition((ARCCC::CondCode)Cond[2].getImm()));
return false;
}
@@ -358,9 +358,9 @@ MachineBasicBlock::iterator
ARCInstrInfo::loadImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, unsigned Reg,
uint64_t Value) const {
- DebugLoc dl = MBB.findDebugLoc(MI);
+ DebugLoc DL = MBB.findDebugLoc(MI);
if (isInt<12>(Value)) {
- return BuildMI(MBB, MI, dl, get(ARC::MOV_rs12), Reg)
+ return BuildMI(MBB, MI, DL, get(ARC::MOV_rs12), Reg)
.addImm(Value)
.getInstr();
}
@@ -371,7 +371,7 @@ unsigned ARCInstrInfo::insertBranch(MachineBasicBlock &MBB,
MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
ArrayRef<MachineOperand> Cond,
- const DebugLoc &dl, int *BytesAdded) const {
+ const DebugLoc &DL, int *BytesAdded) const {
assert(!BytesAdded && "Code size not handled.");
// Shouldn't be a fall through.
@@ -380,11 +380,11 @@ unsigned ARCInstrInfo::insertBranch(MachineBasicBlock &MBB,
"ARC branch conditions have two components!");
if (Cond.empty()) {
- BuildMI(&MBB, dl, get(ARC::BR)).addMBB(TBB);
+ BuildMI(&MBB, DL, get(ARC::BR)).addMBB(TBB);
return 1;
}
int BccOpc = Cond[1].isImm() ? ARC::BRcc_ru6_p : ARC::BRcc_rr_p;
- MachineInstrBuilder MIB = BuildMI(&MBB, dl, get(BccOpc));
+ MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(BccOpc));
MIB.addMBB(TBB);
for (unsigned i = 0; i < 3; i++) {
MIB.add(Cond[i]);
@@ -396,7 +396,7 @@ unsigned ARCInstrInfo::insertBranch(MachineBasicBlock &MBB,
}
// Two-way conditional branch.
- BuildMI(&MBB, dl, get(ARC::BR)).addMBB(FBB);
+ BuildMI(&MBB, DL, get(ARC::BR)).addMBB(FBB);
return 2;
}
diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.h b/llvm/lib/Target/ARC/ARCInstrInfo.h
index 4f6122daf91f..ebc02a93b124 100644
--- a/llvm/lib/Target/ARC/ARCInstrInfo.h
+++ b/llvm/lib/Target/ARC/ARCInstrInfo.h
@@ -28,7 +28,7 @@ class ARCInstrInfo : public ARCGenInstrInfo {
virtual void anchor();
public:
- ARCInstrInfo();
+ ARCInstrInfo(const ARCSubtarget &);
const ARCRegisterInfo &getRegisterInfo() const { return RI; }
@@ -57,19 +57,19 @@ public:
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- const DebugLoc &dl,
+ const DebugLoc &,
int *BytesAdded = nullptr) const override;
unsigned removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved = nullptr) const override;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- const DebugLoc &dl, MCRegister DestReg, MCRegister SrcReg,
+ const DebugLoc &, MCRegister DestReg, MCRegister SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, Register SrcReg,
- bool isKill, int FrameIndex,
+ bool IsKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.td b/llvm/lib/Target/ARC/ARCInstrInfo.td
index ea3e41621323..4a0bc5cf7421 100644
--- a/llvm/lib/Target/ARC/ARCInstrInfo.td
+++ b/llvm/lib/Target/ARC/ARCInstrInfo.td
@@ -45,7 +45,6 @@ def SDT_ARCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
def SDT_ARCCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>;
-
// Global Address.
def ARCGAWrapper : SDNode<"ARCISD::GAWRAPPER", SDT_ARCmov, []>;
@@ -80,6 +79,12 @@ def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARCCallSeqEnd,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
//===----------------------------------------------------------------------===//
+// Instruction predicates
+//===----------------------------------------------------------------------===//
+
+def HasNorm : Predicate<"Subtarget->hasNorm()">;
+
+//===----------------------------------------------------------------------===//
// Instruction Pattern Stuff
//===----------------------------------------------------------------------===//
@@ -128,6 +133,19 @@ def STB_FAR : PseudoInstARC<(outs), (ins GPR32:$dst, MEMrlimm:$addr),
"STB_FAR $dst, $addr",
[(truncstorei8 GPR32:$dst, AddrModeFar:$addr)]>;
+// TODO: Add `Requires<[HasBitScan]>` predicate to these when available.
+let Defs = [STATUS32] in {
+ def CTLZ : PseudoInstARC<(outs GPR32:$A),
+ (ins GPR32:$B),
+ "error.fls $A, $B",
+ [(set GPR32:$A, (ctlz i32:$B))]>;
+
+ def CTTZ : PseudoInstARC<(outs GPR32:$A),
+ (ins GPR32:$B),
+ "error.ffs $A, $B",
+ [(set GPR32:$A, (cttz i32:$B))]>;
+}
+
//===----------------------------------------------------------------------===//
// Instruction Generation multiclasses.
// Generate many variants of a single instruction with a single defining
@@ -252,6 +270,19 @@ multiclass MultiPat<SDPatternOperator InFrag,
def _rrlimm : Pat<(InFrag i32:$B, imm32:$LImm), (RRLImm i32:$B, imm32:$LImm)>;
}
+// NOTE: This could be specialized later with a custom `PrintMethod` for
+// displaying the aux register name. E.g. `[%count0]` instead of [33].
+def AuxReg : Operand<i32>;
+
+def LR_rs12 : F32_SOP_RS12<0b00100, 0b101010, 0,
+ (outs GPR32:$B), (ins AuxReg:$C),
+ "lr\t$B, [$C]", []>;
+def LR_ru6 : F32_SOP_RU6<0b00100, 0b101010, 0,
+ (outs GPR32:$B), (ins AuxReg:$C),
+ "lr\t$B, [$C]", []>;
+
+def: Pat<(i32 readcyclecounter), (LR_rs12 0x21) >; // read timer
+
// ---------------------------------------------------------------------------
// Instruction definitions and patterns for 3 operand binary instructions.
// ---------------------------------------------------------------------------
@@ -276,6 +307,10 @@ defm MPY : ArcBinaryGEN4Inst<0b011010, "mpy",1>;
defm MPYM : ArcBinaryGEN4Inst<0b011011, "mpym",1>;
defm MPYMU : ArcBinaryGEN4Inst<0b011100, "mpymu",1>;
defm SETEQ : ArcBinaryGEN4Inst<0b111000, "seteq",1>;
+let Uses=[STATUS32], isAsCheapAsAMove=0, isReMaterializable=0 in {
+ defm ADC : ArcBinaryGEN4Inst<0b000001, "adc",1>;
+ defm SBC : ArcBinaryGEN4Inst<0b000011, "sbc">;
+}
// Patterns for 3 operand binary instructions.
defm : MultiPat<add, ADD_rrr, ADD_rru6, ADD_rrlimm>;
@@ -293,6 +328,11 @@ defm : MultiPat<mul, MPY_rrr, MPY_rru6, MPY_rrlimm>;
defm : MultiPat<mulhs, MPYM_rrr, MPYM_rru6, MPYM_rrlimm>;
defm : MultiPat<mulhu, MPYMU_rrr, MPYMU_rru6, MPYMU_rrlimm>;
+defm : MultiPat<addc, ADD_f_rrr, ADD_f_rru6, ADD_f_rrlimm>;
+defm : MultiPat<adde, ADC_f_rrr, ADC_f_rru6, ADC_f_rrlimm>;
+defm : MultiPat<subc, SUB_f_rrr, SUB_f_rru6, SUB_f_rrlimm>;
+defm : MultiPat<sube, SBC_f_rrr, SBC_f_rru6, SBC_f_rrlimm>;
+
// ---------------------------------------------------------------------------
// Unary Instruction definitions.
// ---------------------------------------------------------------------------
@@ -301,8 +341,14 @@ defm SEXB : ArcUnaryGEN4Inst<0b000101, "sexb">;
defm SEXH : ArcUnaryGEN4Inst<0b000110, "sexh">;
// Extension unary instruction definitions.
+defm FFS : ArcUnaryEXT5Inst<0b010010, "ffs">;
defm FLS : ArcUnaryEXT5Inst<0b010011, "fls">;
+let Predicates=[HasNorm] in {
+ defm NORM : ArcUnaryEXT5Inst<0b000001,"norm">;
+ defm NORMH : ArcUnaryEXT5Inst<0b001000,"normh">;
+}
+
// General Unary Instruction fragments.
def : Pat<(sext_inreg i32:$a, i8), (SEXB_rr i32:$a)>;
def : Pat<(sext_inreg i32:$a, i16), (SEXH_rr i32:$a)>;
@@ -337,24 +383,30 @@ def MOV_ru6 : F32_DOP_RU6<0b00100, 0b001010, 0,
(outs GPR32:$B), (ins immU6:$U6),
"mov\t$B, $U6", []>;
+def MOV_f_ru6 : F32_DOP_RU6<0b00100, 0b001010, 1,
+ (outs GPR32:$B), (ins u6:$U6),
+ "mov.f\t$B, $U6", []> {
+ let isAsCheapAsAMove=1;
+ let Defs = [STATUS32];
+}
+
def cmov : PatFrag<(ops node:$op1, node:$op2, node:$cc),
(ARCcmov $op1, $op2, $cc)>;
-let Uses = [STATUS32] in {
- def MOVcc : F32_DOP_CC_RR<0b00100, 0b001010, 0,
- (outs GPR32:$B),
- (ins GPR32:$C, GPR32:$fval, cmovpred:$cc),
- !strconcat("mov.", "$cc\t$B, $C"),
- [(set GPR32:$B, (cmov i32:$C, i32:$fval, cmovpred:$cc))]> {
- let Constraints = "$B = $fval";
- }
-
- def MOVcc_ru6 : F32_SOP_CC_RU6<0b00100, 0b001010, 0,
- (outs GPR32:$b), (ins u6:$c, CCOp:$cc, GPR32:$b2),
- "mov.$cc\t$b, $c", []> {
- let isAsCheapAsAMove=0;
- let isPredicable=1;
- let isReMaterializable=0;
- let Constraints="$b2 = $b";
+let Uses = [STATUS32], isAsCheapAsAMove = 1, isPredicable=1,
+ isReMaterializable = 0, Constraints = "$B = $B2" in {
+ def MOV_cc : F32_DOP_CC_RR<0b00100, 0b001010, 0,
+ (outs GPR32:$B), (ins GPR32:$C, GPR32:$B2, cmovpred:$cc),
+ "mov.$cc\t$B, $C",
+ [(set GPR32:$B, (cmov i32:$C, i32:$B2, cmovpred:$cc))]>;
+
+ def MOV_cc_ru6 : F32_DOP_CC_RU6<0b00100, 0b001010, 0,
+ (outs GPR32:$B), (ins u6:$C, CCOp:$cc, GPR32:$B2),
+ "mov.$cc\t$B, $C", []>;
+
+ def MOV_cc_f_ru6 : F32_DOP_CC_RU6<0b00100, 0b001010, 1,
+ (outs GPR32:$B), (ins u6:$C, CCOp:$cc, GPR32:$B2),
+ "mov.$cc.f\t$B, $C", []> {
+ let Defs = [STATUS32];
}
}
diff --git a/llvm/lib/Target/ARC/ARCOptAddrMode.cpp b/llvm/lib/Target/ARC/ARCOptAddrMode.cpp
index 232a7be2a9f5..c956f00b628d 100644
--- a/llvm/lib/Target/ARC/ARCOptAddrMode.cpp
+++ b/llvm/lib/Target/ARC/ARCOptAddrMode.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -33,6 +34,16 @@ using namespace llvm;
#define DEBUG_TYPE "arc-addr-mode"
namespace llvm {
+
+static cl::opt<unsigned> ArcKillAddrMode("arc-kill-addr-mode", cl::init(0),
+ cl::ReallyHidden, cl::ZeroOrMore);
+
+#define DUMP_BEFORE() ((ArcKillAddrMode & 0x0001) != 0)
+#define DUMP_AFTER() ((ArcKillAddrMode & 0x0002) != 0)
+#define VIEW_BEFORE() ((ArcKillAddrMode & 0x0004) != 0)
+#define VIEW_AFTER() ((ArcKillAddrMode & 0x0008) != 0)
+#define KILL_PASS() ((ArcKillAddrMode & 0x0010) != 0)
+
FunctionPass *createARCOptAddrMode();
void initializeARCOptAddrModePass(PassRegistry &);
} // end namespace llvm
@@ -73,9 +84,9 @@ private:
// instruction \p To
bool canHoistLoadStoreTo(MachineInstr *Ldst, MachineInstr *To);
- // Returns true if load/store instruction \p Ldst can be sunk down
- // to instruction \p To
- bool canSinkLoadStoreTo(MachineInstr *Ldst, MachineInstr *To);
+ // // Returns true if load/store instruction \p Ldst can be sunk down
+ // // to instruction \p To
+ // bool canSinkLoadStoreTo(MachineInstr *Ldst, MachineInstr *To);
// Check if instructions \p Ldst and \p Add can be moved to become adjacent
// If they can return instruction which need not to move.
@@ -413,30 +424,30 @@ bool ARCOptAddrMode::canHoistLoadStoreTo(MachineInstr *Ldst, MachineInstr *To) {
return true;
}
-bool ARCOptAddrMode::canSinkLoadStoreTo(MachineInstr *Ldst, MachineInstr *To) {
- // Can only sink load/store within same BB
- if (Ldst->getParent() != To->getParent())
- return false;
- MachineBasicBlock::const_iterator MI(Ldst), ME(To),
- End(Ldst->getParent()->end());
-
- bool IsStore = Ldst->mayStore();
- bool IsLoad = Ldst->mayLoad();
-
- Register ValReg = IsLoad ? Ldst->getOperand(0).getReg() : Register();
- for (; MI != ME && MI != End; ++MI) {
- if (MI->isDebugValue())
- continue;
- if (MI->mayStore() || MI->isCall() || MI->isInlineAsm() ||
- MI->hasUnmodeledSideEffects())
- return false;
- if (IsStore && MI->mayLoad())
- return false;
- if (ValReg && MI->readsVirtualRegister(ValReg))
- return false;
- }
- return true;
-}
+// bool ARCOptAddrMode::canSinkLoadStoreTo(MachineInstr *Ldst, MachineInstr *To) {
+// // Can only sink load/store within same BB
+// if (Ldst->getParent() != To->getParent())
+// return false;
+// MachineBasicBlock::const_iterator MI(Ldst), ME(To),
+// End(Ldst->getParent()->end());
+
+// bool IsStore = Ldst->mayStore();
+// bool IsLoad = Ldst->mayLoad();
+
+// Register ValReg = IsLoad ? Ldst->getOperand(0).getReg() : Register();
+// for (; MI != ME && MI != End; ++MI) {
+// if (MI->isDebugValue())
+// continue;
+// if (MI->mayStore() || MI->isCall() || MI->isInlineAsm() ||
+// MI->hasUnmodeledSideEffects())
+// return false;
+// if (IsStore && MI->mayLoad())
+// return false;
+// if (ValReg && MI->readsVirtualRegister(ValReg))
+// return false;
+// }
+// return true;
+// }
void ARCOptAddrMode::changeToAddrMode(MachineInstr &Ldst, unsigned NewOpcode,
unsigned NewBase,
@@ -485,9 +496,16 @@ bool ARCOptAddrMode::processBasicBlock(MachineBasicBlock &MBB) {
}
bool ARCOptAddrMode::runOnMachineFunction(MachineFunction &MF) {
- if (skipFunction(MF.getFunction()))
+ if (skipFunction(MF.getFunction()) || KILL_PASS())
return false;
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ if (DUMP_BEFORE())
+ MF.dump();
+#endif
+ if (VIEW_BEFORE())
+ MF.viewCFG();
+
AST = &MF.getSubtarget<ARCSubtarget>();
AII = AST->getInstrInfo();
MRI = &MF.getRegInfo();
@@ -496,6 +514,13 @@ bool ARCOptAddrMode::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
for (auto &MBB : MF)
Changed |= processBasicBlock(MBB);
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ if (DUMP_AFTER())
+ MF.dump();
+#endif
+ if (VIEW_AFTER())
+ MF.viewCFG();
return Changed;
}
diff --git a/llvm/lib/Target/ARC/ARCRegisterInfo.cpp b/llvm/lib/Target/ARC/ARCRegisterInfo.cpp
index fb84dd9b266a..91ddd7fe36e1 100644
--- a/llvm/lib/Target/ARC/ARCRegisterInfo.cpp
+++ b/llvm/lib/Target/ARC/ARCRegisterInfo.cpp
@@ -35,19 +35,19 @@ using namespace llvm;
#define GET_REGINFO_TARGET_DESC
#include "ARCGenRegisterInfo.inc"
-static void ReplaceFrameIndex(MachineBasicBlock::iterator II,
+static void replaceFrameIndex(MachineBasicBlock::iterator II,
const ARCInstrInfo &TII, unsigned Reg,
unsigned FrameReg, int Offset, int StackSize,
int ObjSize, RegScavenger *RS, int SPAdj) {
assert(RS && "Need register scavenger.");
MachineInstr &MI = *II;
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc dl = MI.getDebugLoc();
+ DebugLoc DL = MI.getDebugLoc();
unsigned BaseReg = FrameReg;
unsigned KillState = 0;
if (MI.getOpcode() == ARC::LD_rs9 && (Offset >= 256 || Offset < -256)) {
// Loads can always be reached with LD_rlimm.
- BuildMI(MBB, II, dl, TII.get(ARC::LD_rlimm), Reg)
+ BuildMI(MBB, II, DL, TII.get(ARC::LD_rlimm), Reg)
.addReg(BaseReg)
.addImm(Offset)
.addMemOperand(*MI.memoperands_begin());
@@ -72,7 +72,7 @@ static void ReplaceFrameIndex(MachineBasicBlock::iterator II,
RS->setRegUsed(BaseReg);
}
unsigned AddOpc = isUInt<6>(Offset) ? ARC::ADD_rru6 : ARC::ADD_rrlimm;
- BuildMI(MBB, II, dl, TII.get(AddOpc))
+ BuildMI(MBB, II, DL, TII.get(AddOpc))
.addReg(BaseReg, RegState::Define)
.addReg(FrameReg)
.addImm(Offset);
@@ -90,7 +90,7 @@ static void ReplaceFrameIndex(MachineBasicBlock::iterator II,
case ARC::LDB_rs9:
case ARC::LDB_X_rs9:
LLVM_DEBUG(dbgs() << "Building LDFI\n");
- BuildMI(MBB, II, dl, TII.get(MI.getOpcode()), Reg)
+ BuildMI(MBB, II, DL, TII.get(MI.getOpcode()), Reg)
.addReg(BaseReg, KillState)
.addImm(Offset)
.addMemOperand(*MI.memoperands_begin());
@@ -103,7 +103,7 @@ static void ReplaceFrameIndex(MachineBasicBlock::iterator II,
LLVM_FALLTHROUGH;
case ARC::STB_rs9:
LLVM_DEBUG(dbgs() << "Building STFI\n");
- BuildMI(MBB, II, dl, TII.get(MI.getOpcode()))
+ BuildMI(MBB, II, DL, TII.get(MI.getOpcode()))
.addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
.addReg(BaseReg, KillState)
.addImm(Offset)
@@ -111,7 +111,7 @@ static void ReplaceFrameIndex(MachineBasicBlock::iterator II,
break;
case ARC::GETFI:
LLVM_DEBUG(dbgs() << "Building GETFI\n");
- BuildMI(MBB, II, dl,
+ BuildMI(MBB, II, DL,
TII.get(isUInt<6>(Offset) ? ARC::ADD_rru6 : ARC::ADD_rrlimm))
.addReg(Reg, RegState::Define)
.addReg(FrameReg)
@@ -125,7 +125,8 @@ static void ReplaceFrameIndex(MachineBasicBlock::iterator II,
MBB.erase(II);
}
-ARCRegisterInfo::ARCRegisterInfo() : ARCGenRegisterInfo(ARC::BLINK) {}
+ARCRegisterInfo::ARCRegisterInfo(const ARCSubtarget &ST)
+ : ARCGenRegisterInfo(ARC::BLINK), ST(ST) {}
bool ARCRegisterInfo::needsFrameMoves(const MachineFunction &MF) {
return MF.needsFrameMoves();
@@ -145,6 +146,7 @@ BitVector ARCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(ARC::R25);
Reserved.set(ARC::BLINK);
Reserved.set(ARC::FP);
+
return Reserved;
}
@@ -214,7 +216,7 @@ void ARCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
"FP Offset not in bounds.");
}
}
- ReplaceFrameIndex(II, TII, Reg, getFrameRegister(MF), Offset, StackSize,
+ replaceFrameIndex(II, TII, Reg, getFrameRegister(MF), Offset, StackSize,
ObjSize, RS, SPAdj);
}
diff --git a/llvm/lib/Target/ARC/ARCRegisterInfo.h b/llvm/lib/Target/ARC/ARCRegisterInfo.h
index f8bca11fdbc8..b1ae6b69398f 100644
--- a/llvm/lib/Target/ARC/ARCRegisterInfo.h
+++ b/llvm/lib/Target/ARC/ARCRegisterInfo.h
@@ -21,10 +21,13 @@
namespace llvm {
class TargetInstrInfo;
+class ARCSubtarget;
struct ARCRegisterInfo : public ARCGenRegisterInfo {
+ const ARCSubtarget &ST;
+
public:
- ARCRegisterInfo();
+ ARCRegisterInfo(const ARCSubtarget &);
/// Code Generation virtual methods...
diff --git a/llvm/lib/Target/ARC/ARCRegisterInfo.td b/llvm/lib/Target/ARC/ARCRegisterInfo.td
index 5f2bc7974dde..4b686e4bda64 100644
--- a/llvm/lib/Target/ARC/ARCRegisterInfo.td
+++ b/llvm/lib/Target/ARC/ARCRegisterInfo.td
@@ -21,56 +21,56 @@ class Core<int num, string n, list<string>altNames=[]> : ARCReg<n, altNames> {
let HWEncoding = num;
}
-class Status<string n> : ARCReg<n, []> {
+// Auxiliary register
+class Aux<int num, string n, list<string> altNames=[]> : ARCReg<n, altNames> {
+ let HWEncoding = num;
}
// Integer registers
-def R0 : Core< 0, "%r0">, DwarfRegNum<[0]>;
-def R1 : Core< 1, "%r1">, DwarfRegNum<[1]>;
-def R2 : Core< 2, "%r2">, DwarfRegNum<[2]>;
-def R3 : Core< 3, "%r3">, DwarfRegNum<[3]>;
+foreach i = 0 - 3 in
+ def R#i : Core<i, "%r"#i>, DwarfRegNum<[i]>;
+
let CostPerUse=[1] in {
-def R4 : Core< 4, "%r4">, DwarfRegNum<[4]>;
-def R5 : Core< 5, "%r5">, DwarfRegNum<[5]>;
-def R6 : Core< 6, "%r6">, DwarfRegNum<[6]>;
-def R7 : Core< 7, "%r7">, DwarfRegNum<[7]>;
-def R8 : Core< 8, "%r8">, DwarfRegNum<[8]>;
-def R9 : Core< 9, "%r9">, DwarfRegNum<[9]>;
-def R10 : Core<10, "%r10">, DwarfRegNum<[10]>;
-def R11 : Core<11, "%r11">, DwarfRegNum<[11]>;
+ foreach i = 4 - 11 in
+ def R#i : Core<i, "%r"#i>, DwarfRegNum<[i]>;
}
-def R12 : Core<12, "%r12">, DwarfRegNum<[12]>;
-def R13 : Core<13, "%r13">, DwarfRegNum<[13]>;
-def R14 : Core<14, "%r14">, DwarfRegNum<[14]>;
-def R15 : Core<15, "%r15">, DwarfRegNum<[15]>;
+
+foreach i = 12 - 15 in
+ def R#i : Core<i, "%r"#i>, DwarfRegNum<[i]>;
let CostPerUse=[1] in {
-def R16 : Core<16, "%r16">, DwarfRegNum<[16]>;
-def R17 : Core<17, "%r17">, DwarfRegNum<[17]>;
-def R18 : Core<18, "%r18">, DwarfRegNum<[18]>;
-def R19 : Core<19, "%r19">, DwarfRegNum<[19]>;
-def R20 : Core<20, "%r20">, DwarfRegNum<[20]>;
-def R21 : Core<21, "%r21">, DwarfRegNum<[21]>;
-def R22 : Core<22, "%r22">, DwarfRegNum<[22]>;
-def R23 : Core<23, "%r23">, DwarfRegNum<[23]>;
-def R24 : Core<24, "%r24">, DwarfRegNum<[24]>;
-def R25 : Core<25, "%r25">, DwarfRegNum<[25]>;
-def GP : Core<26, "%gp",["%r26"]>, DwarfRegNum<[26]>;
-def FP : Core<27, "%fp", ["%r27"]>, DwarfRegNum<[27]>;
-def SP : Core<28, "%sp", ["%r28"]>, DwarfRegNum<[28]>;
-def ILINK : Core<29, "%ilink">, DwarfRegNum<[29]>;
-def R30 : Core<30, "%r30">, DwarfRegNum<[30]>;
-def BLINK: Core<31, "%blink">, DwarfRegNum<[31]>;
-
-def STATUS32 : Status<"status32">, DwarfRegNum<[32]>;
+
+ foreach i = 16 - 25 in
+ def R#i : Core<i, "%r"#i>, DwarfRegNum<[i]>;
+
+ def GP : Core<26, "%gp",["%r26"]>, DwarfRegNum<[26]>;
+ def FP : Core<27, "%fp", ["%r27"]>, DwarfRegNum<[27]>;
+ def SP : Core<28, "%sp", ["%r28"]>, DwarfRegNum<[28]>;
+ def ILINK : Core<29, "%ilink">, DwarfRegNum<[29]>;
+ def R30 : Core<30, "%r30">, DwarfRegNum<[30]>;
+ def BLINK : Core<31, "%blink">, DwarfRegNum<[31]>;
+
+ // Define extended core registers R32..R63
+ foreach i = 32 - 63 in
+ def R#i : Core<i, "%r"#i>, DwarfRegNum<[i]>;
+}
+
+// Auxiliary registers
+let CostPerUse=[1] in {
+ def STATUS32 : Aux<10, "status32">; // No DwarfRegNum defined in the ARC ABI
}
-// Register classes.
-//
def GPR32: RegisterClass<"ARC", [i32], 32,
- (add R0, R1, R2, R3,
- R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19,
- R20, R21, R22, R23, R24, R25, GP, FP, SP, ILINK, R30, BLINK)>;
+ (add (sequence "R%u", 0, 25), GP, FP, SP, ILINK, R30, BLINK, (sequence "R%u", 32, 63))> {
+ let AltOrders=[(add (sequence "R%u", 0, 25), GP, FP, SP, ILINK, R30, BLINK)];
+ let AltOrderSelect = [{
+ // The value returned from this C++ code block selects the allocation order:
+ // 0 is the default order (all GPR32 regs),
+ // 1 is AltOrders[0],
+ // 2 is AltOrders[1], and so on.
+ return 1;
+ }];
+}
def SREG : RegisterClass<"ARC", [i32], 1, (add STATUS32)>;
diff --git a/llvm/lib/Target/ARC/ARCSubtarget.cpp b/llvm/lib/Target/ARC/ARCSubtarget.cpp
index 409dd2a98ab4..641c56b06870 100644
--- a/llvm/lib/Target/ARC/ARCSubtarget.cpp
+++ b/llvm/lib/Target/ARC/ARCSubtarget.cpp
@@ -12,7 +12,7 @@
#include "ARCSubtarget.h"
#include "ARC.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
@@ -26,5 +26,5 @@ void ARCSubtarget::anchor() {}
ARCSubtarget::ARCSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM)
- : ARCGenSubtargetInfo(TT, CPU, /*TuneCPU=*/CPU, FS), FrameLowering(*this),
- TLInfo(TM, *this) {}
+ : ARCGenSubtargetInfo(TT, CPU, /*TuneCPU=*/CPU, FS), InstrInfo(*this),
+ FrameLowering(*this), TLInfo(TM, *this) {}
diff --git a/llvm/lib/Target/ARC/ARCSubtarget.h b/llvm/lib/Target/ARC/ARCSubtarget.h
index 6a4856221b8f..f3429677deeb 100644
--- a/llvm/lib/Target/ARC/ARCSubtarget.h
+++ b/llvm/lib/Target/ARC/ARCSubtarget.h
@@ -29,14 +29,15 @@ class StringRef;
class TargetMachine;
class ARCSubtarget : public ARCGenSubtargetInfo {
- bool Xnorm = false;
-
virtual void anchor();
ARCInstrInfo InstrInfo;
ARCFrameLowering FrameLowering;
ARCTargetLowering TLInfo;
SelectionDAGTargetInfo TSInfo;
+ // ARC processor extensions
+ bool Xnorm = false;
+
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
diff --git a/llvm/lib/Target/ARC/ARCTargetMachine.cpp b/llvm/lib/Target/ARC/ARCTargetMachine.cpp
index b8c8949e18dd..52f74b729ff7 100644
--- a/llvm/lib/Target/ARC/ARCTargetMachine.cpp
+++ b/llvm/lib/Target/ARC/ARCTargetMachine.cpp
@@ -16,7 +16,7 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp b/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp
index b7033d0972b9..bb5336931932 100644
--- a/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp
+++ b/llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp
@@ -21,7 +21,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
@@ -107,6 +107,12 @@ static DecodeStatus DecodeStLImmInstruction(MCInst &, uint64_t, uint64_t,
static DecodeStatus DecodeLdRLImmInstruction(MCInst &, uint64_t, uint64_t,
const void *);
+static DecodeStatus DecodeSOPwithRS12(MCInst &, uint64_t, uint64_t,
+ const void *);
+
+static DecodeStatus DecodeSOPwithRU6(MCInst &, uint64_t, uint64_t,
+ const void *);
+
static DecodeStatus DecodeCCRU6Instruction(MCInst &, uint64_t, uint64_t,
const void *);
@@ -304,13 +310,36 @@ static DecodeStatus DecodeCCRU6Instruction(MCInst &Inst, uint64_t Insn,
DstB = decodeBField(Insn);
DecodeGPR32RegisterClass(Inst, DstB, Address, Decoder);
using Field = decltype(Insn);
- Field U6Field = fieldFromInstruction(Insn, 6, 11);
+ Field U6Field = fieldFromInstruction(Insn, 6, 6);
Inst.addOperand(MCOperand::createImm(U6Field));
Field CCField = fieldFromInstruction(Insn, 0, 4);
Inst.addOperand(MCOperand::createImm(CCField));
return MCDisassembler::Success;
}
+static DecodeStatus DecodeSOPwithRU6(MCInst &Inst, uint64_t Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned DstB = decodeBField(Insn);
+ DecodeGPR32RegisterClass(Inst, DstB, Address, Decoder);
+ using Field = decltype(Insn);
+ Field U6 = fieldFromInstruction(Insn, 6, 6);
+ Inst.addOperand(MCOperand::createImm(U6));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSOPwithRS12(MCInst &Inst, uint64_t Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned DstB = decodeBField(Insn);
+ DecodeGPR32RegisterClass(Inst, DstB, Address, Decoder);
+ using Field = decltype(Insn);
+ Field Lower = fieldFromInstruction(Insn, 6, 6);
+ Field Upper = fieldFromInstruction(Insn, 0, 5);
+ Field Sign = fieldFromInstruction(Insn, 5, 1) ? -1 : 1;
+ Field Result = Sign * ((Upper << 6) + Lower);
+ Inst.addOperand(MCOperand::createImm(Result));
+ return MCDisassembler::Success;
+}
+
DecodeStatus ARCDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes,
uint64_t Address,
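For reference, the s12 immediate assembled by DecodeSOPwithRS12 above is split across two fields: the low six bits sit at bit 6, the remaining magnitude bits at bit 0, and bit 5 selects the sign. A minimal standalone C++ sketch of that arithmetic, using a local stand-in for fieldFromInstruction and a made-up encoding (not a real ARC opcode):

    #include <cstdint>
    #include <cstdio>

    // Stand-in for the decoder's field extraction: Size bits starting at Start.
    static uint64_t field(uint64_t Insn, unsigned Start, unsigned Size) {
      return (Insn >> Start) & ((1ULL << Size) - 1);
    }

    int main() {
      uint64_t Insn = (0x2Bull << 6) | 0x21;       // hypothetical encoding
      int64_t Lower = field(Insn, 6, 6);           // low 6 bits of the immediate
      int64_t Upper = field(Insn, 0, 5);           // upper magnitude bits
      int64_t Sign = field(Insn, 5, 1) ? -1 : 1;   // bit 5 selects the sign
      int64_t Result = Sign * ((Upper << 6) + Lower);
      std::printf("s12 immediate = %lld\n", (long long)Result); // prints -107
      return 0;
    }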
diff --git a/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp b/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp
index 358ee6002f80..d4f74fa77fc4 100644
--- a/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp
+++ b/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp
@@ -19,9 +19,9 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/ARC/TargetInfo/ARCTargetInfo.cpp b/llvm/lib/Target/ARC/TargetInfo/ARCTargetInfo.cpp
index d4a74e1c4174..91d56bb6b86d 100644
--- a/llvm/lib/Target/ARC/TargetInfo/ARCTargetInfo.cpp
+++ b/llvm/lib/Target/ARC/TargetInfo/ARCTargetInfo.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/ARCTargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/ARM/A15SDOptimizer.cpp b/llvm/lib/Target/ARM/A15SDOptimizer.cpp
index bb81233cf803..f4d0f4a6d6b0 100644
--- a/llvm/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/llvm/lib/Target/ARM/A15SDOptimizer.cpp
@@ -182,8 +182,7 @@ void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) {
Front.push_back(MI);
while (Front.size() != 0) {
- MI = Front.back();
- Front.pop_back();
+ MI = Front.pop_back_val();
// MI is already known to be dead. We need to see
// if other instructions can also be removed.
@@ -621,9 +620,8 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
// Collect all the uses of this MI's DPR def for updating later.
SmallVector<MachineOperand*, 8> Uses;
Register DPRDefReg = MI->getOperand(0).getReg();
- for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg),
- E = MRI->use_end(); I != E; ++I)
- Uses.push_back(&*I);
+ for (MachineOperand &MO : MRI->use_operands(DPRDefReg))
+ Uses.push_back(&MO);
// We can optimize this.
unsigned NewReg = optimizeSDPattern(MI);
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index 5c1bed14c941..8cbd80f1bf65 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -437,6 +437,11 @@ def FeatureLOB : SubtargetFeature<"lob", "HasLOB", "true",
"Enable Low Overhead Branch "
"extensions">;
+def FeatureFixCMSE_CVE_2021_35465 : SubtargetFeature<"fix-cmse-cve-2021-35465",
+ "FixCMSE_CVE_2021_35465", "true",
+ "Mitigate against the cve-2021-35465 "
+ "security vulnurability">;
+
//===----------------------------------------------------------------------===//
// ARM architecture class
//
@@ -539,6 +544,18 @@ def HasV8_7aOps : SubtargetFeature<"v8.7a", "HasV8_7aOps", "true",
"Support ARM v8.7a instructions",
[HasV8_6aOps]>;
+def HasV9_0aOps : SubtargetFeature<"v9a", "HasV9_0aOps", "true",
+ "Support ARM v9a instructions",
+ [HasV8_5aOps]>;
+
+def HasV9_1aOps : SubtargetFeature<"v9.1a", "HasV9_1aOps", "true",
+ "Support ARM v9.1a instructions",
+ [HasV8_6aOps, HasV9_0aOps]>;
+
+def HasV9_2aOps : SubtargetFeature<"v9.2a", "HasV9_2aOps", "true",
+ "Support ARM v9.2a instructions",
+ [HasV8_7aOps, HasV9_1aOps]>;
+
def HasV8_1MMainlineOps : SubtargetFeature<
"v8.1m.main", "HasV8_1MMainlineOps", "true",
"Support ARM v8-1M Mainline instructions",
@@ -619,6 +636,8 @@ def ProcA78 : SubtargetFeature<"cortex-a78", "ARMProcFamily", "CortexA78",
"Cortex-A78 ARM processors", []>;
def ProcA78C : SubtargetFeature<"a78c", "ARMProcFamily", "CortexA78C",
"Cortex-A78C ARM processors", []>;
+def ProcA710 : SubtargetFeature<"cortex-a710", "ARMProcFamily",
+ "CortexA710", "Cortex-A710 ARM processors", []>;
def ProcX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
"Cortex-X1 ARM processors", []>;
@@ -867,6 +886,43 @@ def ARMv87a : Architecture<"armv8.7-a", "ARMv87a", [HasV8_7aOps,
FeatureRAS,
FeatureDotProd]>;
+def ARMv9a : Architecture<"armv9-a", "ARMv9a", [HasV9_0aOps,
+ FeatureAClass,
+ FeatureDB,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureCRC,
+ FeatureRAS,
+ FeatureDotProd]>;
+def ARMv91a : Architecture<"armv9.1-a", "ARMv91a", [HasV9_1aOps,
+ FeatureAClass,
+ FeatureDB,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureCRC,
+ FeatureRAS,
+ FeatureDotProd]>;
+def ARMv92a : Architecture<"armv9.2-a", "ARMv92a", [HasV9_2aOps,
+ FeatureAClass,
+ FeatureDB,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureCRC,
+ FeatureRAS,
+ FeatureDotProd]>;
+
def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops,
FeatureRClass,
FeatureDB,
@@ -1213,7 +1269,8 @@ def : ProcessorModel<"cortex-m33", CortexM4Model, [ARMv8mMainline,
FeatureHasSlowFPVMLx,
FeatureHasSlowFPVFMx,
FeatureUseMISched,
- FeatureHasNoBranchPredictor]>;
+ FeatureHasNoBranchPredictor,
+ FeatureFixCMSE_CVE_2021_35465]>;
def : ProcessorModel<"cortex-m35p", CortexM4Model, [ARMv8mMainline,
FeatureDSP,
@@ -1222,7 +1279,8 @@ def : ProcessorModel<"cortex-m35p", CortexM4Model, [ARMv8mMainline,
FeatureHasSlowFPVMLx,
FeatureHasSlowFPVFMx,
FeatureUseMISched,
- FeatureHasNoBranchPredictor]>;
+ FeatureHasNoBranchPredictor,
+ FeatureFixCMSE_CVE_2021_35465]>;
def : ProcessorModel<"cortex-m55", CortexM4Model, [ARMv81mMainline,
FeatureDSP,
@@ -1231,7 +1289,8 @@ def : ProcessorModel<"cortex-m55", CortexM4Model, [ARMv81mMainline,
FeatureHasNoBranchPredictor,
FeaturePrefLoopAlign32,
FeatureHasSlowFPVMLx,
- HasMVEFloatOps]>;
+ HasMVEFloatOps,
+ FeatureFixCMSE_CVE_2021_35465]>;
def : ProcNoItin<"cortex-a32", [ARMv8a,
FeatureHWDivThumb,
@@ -1323,6 +1382,14 @@ def : ProcNoItin<"cortex-a78c", [ARMv82a, ProcA78C,
FeatureDotProd,
FeatureFullFP16]>;
+def : ProcNoItin<"cortex-a710", [ARMv9a, ProcA710,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureFP16FML,
+ FeatureBF16,
+ FeatureMatMulInt8,
+ FeatureSB]>;
+
def : ProcNoItin<"cortex-x1", [ARMv82a, ProcX1,
FeatureHWDivThumb,
FeatureHWDivARM,
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index ba594b7f0935..9901b86b0e87 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -41,11 +41,11 @@
#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ARMBuildAttributes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetParser.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -1291,9 +1291,6 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
- const MachineFunction &MF = *MI->getParent()->getParent();
- const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
-
// If we just ended a constant pool, mark it as such.
if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) {
OutStreamer->emitDataRegion(MCDR_DataRegionEnd);
@@ -1742,7 +1739,7 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
// FIXME: Ideally we could vary the LDRB index based on the padding
// between the sequence and jump table, however that relies on MCExprs
// for load indexes which are currently not supported.
- OutStreamer->emitCodeAlignment(4);
+ OutStreamer->emitCodeAlignment(4, &getSubtargetInfo());
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr)
.addReg(Idx)
.addReg(Idx)
@@ -2035,6 +2032,9 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
.addImm(ARMCC::AL)
.addReg(0));
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+
if (STI.isTargetDarwin() || STI.isTargetWindows()) {
// These platforms always use the same frame register
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12)
@@ -2080,6 +2080,9 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
Register SrcReg = MI->getOperand(0).getReg();
Register ScratchReg = MI->getOperand(1).getReg();
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi)
.addReg(ScratchReg)
.addReg(SrcReg)
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 9b058ff7dbcb..2d981be4cfc1 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -173,8 +173,9 @@ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
return MHR;
}
-MachineInstr *ARMBaseInstrInfo::convertToThreeAddress(
- MachineFunction::iterator &MFI, MachineInstr &MI, LiveVariables *LV) const {
+MachineInstr *
+ARMBaseInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
+ LiveIntervals *LIS) const {
// FIXME: Thumb2 support.
if (!EnableARM3Addr)
@@ -336,9 +337,9 @@ MachineInstr *ARMBaseInstrInfo::convertToThreeAddress(
}
}
- MachineBasicBlock::iterator MBBI = MI.getIterator();
- MFI->insert(MBBI, NewMIs[1]);
- MFI->insert(MBBI, NewMIs[0]);
+ MachineBasicBlock &MBB = *MI.getParent();
+ MBB.insert(MI, NewMIs[1]);
+ MBB.insert(MI, NewMIs[0]);
return NewMIs[0];
}
@@ -867,6 +868,7 @@ void ARMBaseInstrInfo::copyToCPSR(MachineBasicBlock &MBB,
void llvm::addUnpredicatedMveVpredNOp(MachineInstrBuilder &MIB) {
MIB.addImm(ARMVCC::None);
MIB.addReg(0);
+ MIB.addReg(0); // tp_reg
}
void llvm::addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB,
@@ -878,6 +880,7 @@ void llvm::addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB,
void llvm::addPredicatedMveVpredNOp(MachineInstrBuilder &MIB, unsigned Cond) {
MIB.addImm(Cond);
MIB.addReg(ARM::VPR, RegState::Implicit);
+ MIB.addReg(0); // tp_reg
}
void llvm::addPredicatedMveVpredROp(MachineInstrBuilder &MIB,
@@ -914,7 +917,7 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
else if (ARM::DPRRegClass.contains(DestReg, SrcReg) && Subtarget.hasFP64())
Opc = ARM::VMOVD;
else if (ARM::QPRRegClass.contains(DestReg, SrcReg))
- Opc = Subtarget.hasNEON() ? ARM::VORRq : ARM::MVE_VORR;
+ Opc = Subtarget.hasNEON() ? ARM::VORRq : ARM::MQPRCopy;
if (Opc) {
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg);
@@ -923,7 +926,7 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MIB.addReg(SrcReg, getKillRegState(KillSrc));
if (Opc == ARM::MVE_VORR)
addUnpredicatedMveVpredROp(MIB, DestReg);
- else
+ else if (Opc != ARM::MQPRCopy)
MIB.add(predOps(ARMCC::AL));
return;
}
@@ -1241,7 +1244,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
llvm_unreachable("Unknown reg class!");
break;
case 32:
- if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
+ if (ARM::QQPRRegClass.hasSubClassEq(RC) ||
+ ARM::MQQPRRegClass.hasSubClassEq(RC) ||
+ ARM::DQuadRegClass.hasSubClassEq(RC)) {
if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) &&
Subtarget.hasNEON()) {
// FIXME: It's possible to only store part of the QQ register if the
@@ -1252,6 +1257,11 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
.addReg(SrcReg, getKillRegState(isKill))
.addMemOperand(MMO)
.add(predOps(ARMCC::AL));
+ } else if (Subtarget.hasMVEIntegerOps()) {
+ BuildMI(MBB, I, DebugLoc(), get(ARM::MQQPRStore))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addMemOperand(MMO);
} else {
MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(),
get(ARM::VSTMDIA))
@@ -1267,7 +1277,13 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
llvm_unreachable("Unknown reg class!");
break;
case 64:
- if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
+ if (ARM::MQQQQPRRegClass.hasSubClassEq(RC) &&
+ Subtarget.hasMVEIntegerOps()) {
+ BuildMI(MBB, I, DebugLoc(), get(ARM::MQQQQPRStore))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addMemOperand(MMO);
+ } else if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::VSTMDIA))
.addFrameIndex(FI)
.add(predOps(ARMCC::AL))
@@ -1328,6 +1344,13 @@ unsigned ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
return MI.getOperand(0).getReg();
}
break;
+ case ARM::MQQPRStore:
+ case ARM::MQQQQPRStore:
+ if (MI.getOperand(1).isFI()) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
+ }
+ break;
}
return 0;
@@ -1473,31 +1496,42 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
llvm_unreachable("Unknown reg class!");
break;
case 32:
- if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
- if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) &&
- Subtarget.hasNEON()) {
- BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
- .addFrameIndex(FI)
- .addImm(16)
- .addMemOperand(MMO)
- .add(predOps(ARMCC::AL));
- } else {
- MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
- .addFrameIndex(FI)
- .add(predOps(ARMCC::AL))
- .addMemOperand(MMO);
- MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
- MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
- MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
- MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI);
- if (Register::isPhysicalRegister(DestReg))
- MIB.addReg(DestReg, RegState::ImplicitDefine);
- }
- } else
- llvm_unreachable("Unknown reg class!");
- break;
+ if (ARM::QQPRRegClass.hasSubClassEq(RC) ||
+ ARM::MQQPRRegClass.hasSubClassEq(RC) ||
+ ARM::DQuadRegClass.hasSubClassEq(RC)) {
+ if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) &&
+ Subtarget.hasNEON()) {
+ BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
+ .addFrameIndex(FI)
+ .addImm(16)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
+ } else if (Subtarget.hasMVEIntegerOps()) {
+ BuildMI(MBB, I, DL, get(ARM::MQQPRLoad), DestReg)
+ .addFrameIndex(FI)
+ .addMemOperand(MMO);
+ } else {
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+ .addFrameIndex(FI)
+ .add(predOps(ARMCC::AL))
+ .addMemOperand(MMO);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI);
+ if (Register::isPhysicalRegister(DestReg))
+ MIB.addReg(DestReg, RegState::ImplicitDefine);
+ }
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
case 64:
- if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
+ if (ARM::MQQQQPRRegClass.hasSubClassEq(RC) &&
+ Subtarget.hasMVEIntegerOps()) {
+ BuildMI(MBB, I, DL, get(ARM::MQQQQPRLoad), DestReg)
+ .addFrameIndex(FI)
+ .addMemOperand(MMO);
+ } else if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
.addFrameIndex(FI)
.add(predOps(ARMCC::AL))
@@ -1566,6 +1600,13 @@ unsigned ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
return MI.getOperand(0).getReg();
}
break;
+ case ARM::MQQPRLoad:
+ case ARM::MQQQQPRLoad:
+ if (MI.getOperand(1).isFI()) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
+ }
+ break;
}
return 0;
@@ -1642,8 +1683,6 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
if (MI.getOpcode() == TargetOpcode::LOAD_STACK_GUARD) {
- assert(getSubtarget().getTargetTriple().isOSBinFormatMachO() &&
- "LOAD_STACK_GUARD currently supported only for MachO.");
expandLoadStackGuard(MI);
MI.getParent()->erase(MI);
return true;
@@ -2331,9 +2370,13 @@ ARMBaseInstrInfo::optimizeSelect(MachineInstr &MI,
// Find new register class to use.
MachineOperand FalseReg = MI.getOperand(Invert ? 2 : 1);
+ MachineOperand TrueReg = MI.getOperand(Invert ? 1 : 2);
Register DestReg = MI.getOperand(0).getReg();
- const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
- if (!MRI.constrainRegClass(DestReg, PreviousClass))
+ const TargetRegisterClass *FalseClass = MRI.getRegClass(FalseReg.getReg());
+ const TargetRegisterClass *TrueClass = MRI.getRegClass(TrueReg.getReg());
+ if (!MRI.constrainRegClass(DestReg, FalseClass))
+ return nullptr;
+ if (!MRI.constrainRegClass(DestReg, TrueClass))
return nullptr;
// Create a new predicated version of DefMI.
@@ -2760,8 +2803,8 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
bool ARMBaseInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
- Register &SrcReg2, int &CmpMask,
- int &CmpValue) const {
+ Register &SrcReg2, int64_t &CmpMask,
+ int64_t &CmpValue) const {
switch (MI.getOpcode()) {
default: break;
case ARM::CMPri:
@@ -2832,7 +2875,8 @@ inline static ARMCC::CondCodes getCmpToAddCondition(ARMCC::CondCodes CC) {
/// This function can be extended later on.
inline static bool isRedundantFlagInstr(const MachineInstr *CmpI,
Register SrcReg, Register SrcReg2,
- int ImmValue, const MachineInstr *OI,
+ int64_t ImmValue,
+ const MachineInstr *OI,
bool &IsThumb1) {
if ((CmpI->getOpcode() == ARM::CMPrr || CmpI->getOpcode() == ARM::t2CMPrr) &&
(OI->getOpcode() == ARM::SUBrr || OI->getOpcode() == ARM::t2SUBrr) &&
@@ -2967,8 +3011,8 @@ static bool isOptimizeCompareCandidate(MachineInstr *MI, bool &IsThumb1) {
/// operands are swapped: SUBrr(r1,r2) and CMPrr(r2,r1), by updating the
/// condition code of instructions which use the flags.
bool ARMBaseInstrInfo::optimizeCompareInstr(
- MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask,
- int CmpValue, const MachineRegisterInfo *MRI) const {
+ MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
+ int64_t CmpValue, const MachineRegisterInfo *MRI) const {
// Get the unique definition of SrcReg.
MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
if (!MI) return false;
@@ -3220,9 +3264,8 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
// live-out. If it is live-out, do not optimize.
if (!isSafe) {
MachineBasicBlock *MBB = CmpInstr.getParent();
- for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
- SE = MBB->succ_end(); SI != SE; ++SI)
- if ((*SI)->isLiveIn(ARM::CPSR))
+ for (MachineBasicBlock *Succ : MBB->successors())
+ if (Succ->isLiveIn(ARM::CPSR))
return false;
}
@@ -3255,7 +3298,7 @@ bool ARMBaseInstrInfo::shouldSink(const MachineInstr &MI) const {
MachineBasicBlock::const_iterator Next = &MI;
++Next;
Register SrcReg, SrcReg2;
- int CmpMask, CmpValue;
+ int64_t CmpMask, CmpValue;
bool IsThumb1;
if (Next != MI.getParent()->end() &&
analyzeCompare(*Next, SrcReg, SrcReg2, CmpMask, CmpValue) &&
@@ -4839,8 +4882,6 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI,
return true;
}
-// LoadStackGuard has so far only been implemented for MachO. Different code
-// sequence is needed for other targets.
void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
unsigned LoadImmOpc,
unsigned LoadOpc) const {
@@ -4850,27 +4891,70 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
Register Reg = MI->getOperand(0).getReg();
- const GlobalValue *GV =
- cast<GlobalValue>((*MI->memoperands_begin())->getValue());
MachineInstrBuilder MIB;
+ unsigned int Offset = 0;
+
+ if (LoadImmOpc == ARM::MRC || LoadImmOpc == ARM::t2MRC) {
+ assert(Subtarget.isReadTPHard() &&
+ "TLS stack protector requires hardware TLS register");
+
+ BuildMI(MBB, MI, DL, get(LoadImmOpc), Reg)
+ .addImm(15)
+ .addImm(0)
+ .addImm(13)
+ .addImm(0)
+ .addImm(3)
+ .add(predOps(ARMCC::AL));
- BuildMI(MBB, MI, DL, get(LoadImmOpc), Reg)
- .addGlobalAddress(GV, 0, ARMII::MO_NONLAZY);
+ Module &M = *MBB.getParent()->getFunction().getParent();
+ Offset = M.getStackProtectorGuardOffset();
+ if (Offset & ~0xfffU) {
+ // The offset won't fit in the LDR's 12-bit immediate field, so emit an
+ // extra ADD to cover the delta. This gives us a guaranteed 8 additional
+ // bits, resulting in a range of 0 to +1 MiB for the guard offset.
+ unsigned AddOpc = (LoadImmOpc == ARM::MRC) ? ARM::ADDri : ARM::t2ADDri;
+ BuildMI(MBB, MI, DL, get(AddOpc), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(Offset & ~0xfffU)
+ .add(predOps(ARMCC::AL))
+ .addReg(0);
+ Offset &= 0xfffU;
+ }
+ } else {
+ const GlobalValue *GV =
+ cast<GlobalValue>((*MI->memoperands_begin())->getValue());
+ bool IsIndirect = Subtarget.isGVIndirectSymbol(GV);
+
+ unsigned TargetFlags = ARMII::MO_NO_FLAG;
+ if (Subtarget.isTargetMachO()) {
+ TargetFlags |= ARMII::MO_NONLAZY;
+ } else if (Subtarget.isTargetCOFF()) {
+ if (GV->hasDLLImportStorageClass())
+ TargetFlags |= ARMII::MO_DLLIMPORT;
+ else if (IsIndirect)
+ TargetFlags |= ARMII::MO_COFFSTUB;
+ } else if (Subtarget.isGVInGOT(GV)) {
+ TargetFlags |= ARMII::MO_GOT;
+ }
- if (Subtarget.isGVIndirectSymbol(GV)) {
- MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg);
- MIB.addReg(Reg, RegState::Kill).addImm(0);
- auto Flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant;
- MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
- MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, Align(4));
- MIB.addMemOperand(MMO).add(predOps(ARMCC::AL));
+ BuildMI(MBB, MI, DL, get(LoadImmOpc), Reg)
+ .addGlobalAddress(GV, 0, TargetFlags);
+
+ if (IsIndirect) {
+ MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg);
+ MIB.addReg(Reg, RegState::Kill).addImm(0);
+ auto Flags = MachineMemOperand::MOLoad |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant;
+ MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, Align(4));
+ MIB.addMemOperand(MMO).add(predOps(ARMCC::AL));
+ }
}
MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg);
MIB.addReg(Reg, RegState::Kill)
- .addImm(0)
+ .addImm(Offset)
.cloneMemRefs(*MI)
.add(predOps(ARMCC::AL));
}
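The comment above about the 12-bit LDR field is easiest to see with concrete numbers: a guard offset that does not fit 12 bits is split into a high part folded into an extra ADD and a low 12-bit part carried by the load itself, covering offsets up to about 1 MiB. A minimal standalone sketch of that split, with a hypothetical offset value:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t Offset = 0x10504;     // hypothetical stack-protector guard offset
      uint32_t AddImm = 0, LdrImm = Offset;
      if (Offset & ~0xfffU) {        // too large for the LDR's 12-bit immediate
        AddImm = Offset & ~0xfffU;   // handled by the extra ADDri/t2ADDri
        LdrImm = Offset & 0xfffU;    // remainder fits the LDR immediate
      }
      assert(AddImm + LdrImm == Offset);
      std::printf("ADD #0x%x, LDR #0x%x\n", (unsigned)AddImm, (unsigned)LdrImm);
      return 0;
    }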
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index 0ebba0d9fdd5..db9320962e81 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -120,9 +120,8 @@ public:
// if there is not such an opcode.
virtual unsigned getUnindexedOpcode(unsigned Opc) const = 0;
- MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineInstr &MI,
- LiveVariables *LV) const override;
+ MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
+ LiveIntervals *LIS) const override;
virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0;
const ARMSubtarget &getSubtarget() const { return Subtarget; }
@@ -289,15 +288,15 @@ public:
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
- Register &SrcReg2, int &CmpMask,
- int &CmpValue) const override;
+ Register &SrcReg2, int64_t &CmpMask,
+ int64_t &CmpValue) const override;
/// optimizeCompareInstr - Convert the instruction to set the zero flag so
/// that we can remove a "comparison with zero"; Remove a redundant CMP
/// instruction if the flags can be updated in the same way by an earlier
/// instruction such as SUB.
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
- Register SrcReg2, int CmpMask, int CmpValue,
+ Register SrcReg2, int64_t CmpMask, int64_t CmpValue,
const MachineRegisterInfo *MRI) const override;
bool analyzeSelect(const MachineInstr &MI,
diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 4883e5693f87..b53efe58e8de 100644
--- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -263,6 +263,13 @@ ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
case ARM::QQQQPRRegClassID:
if (MF.getSubtarget<ARMSubtarget>().hasNEON())
return Super;
+ break;
+ case ARM::MQPRRegClassID:
+ case ARM::MQQPRRegClassID:
+ case ARM::MQQQQPRRegClassID:
+ if (MF.getSubtarget<ARMSubtarget>().hasMVEIntegerOps())
+ return Super;
+ break;
}
Super = *I++;
} while (Super);
@@ -928,4 +935,4 @@ bool ARMBaseRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg,
SrcRC, SrcSubReg);
-}
\ No newline at end of file
+}
diff --git a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
index 5ea47f529b23..ddbd6702e528 100644
--- a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
+++ b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
@@ -31,6 +31,8 @@ private:
const ARMBaseInstrInfo *TII;
std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
MachineLoopInfo *MLI = nullptr;
+ // A list of WLS instructions that need to be reverted to DLS.
+ SmallVector<MachineInstr *> RevertedWhileLoops;
public:
static char ID;
@@ -41,9 +43,9 @@ public:
bool blockIsBefore(MachineBasicBlock *BB, MachineBasicBlock *Other);
bool fixBackwardsWLS(MachineLoop *ML);
bool processPostOrderLoops(MachineLoop *ML);
+ bool revertWhileToDoLoop(MachineInstr *WLS);
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
AU.addRequired<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -82,6 +84,66 @@ static MachineInstr *findWLS(MachineLoop *ML) {
return nullptr;
}
+// Revert a WhileLoopStart to an equivalent DoLoopStart and branch. Note that
+// because of the branches this requires an extra block to be created.
+bool ARMBlockPlacement::revertWhileToDoLoop(MachineInstr *WLS) {
+ // lr = t2WhileLoopStartTP r0, r1, TgtBB
+ // t2Br Ph
+ // ->
+ // cmp r0, 0
+ // brcc TgtBB
+ // block2:
+ // LR = t2DoLoopStartTP r0, r1
+ // t2Br Ph
+ MachineBasicBlock *Preheader = WLS->getParent();
+ assert(WLS != &Preheader->back());
+ assert(WLS->getNextNode() == &Preheader->back());
+ MachineInstr *Br = &Preheader->back();
+ assert(Br->getOpcode() == ARM::t2B);
+ assert(Br->getOperand(1).getImm() == 14);
+
+ // Clear the kill flags, as the cmp/bcc will no longer kill any operands.
+ WLS->getOperand(1).setIsKill(false);
+ if (WLS->getOpcode() == ARM::t2WhileLoopStartTP)
+ WLS->getOperand(2).setIsKill(false);
+
+ // Create the new block
+ MachineBasicBlock *NewBlock = Preheader->getParent()->CreateMachineBasicBlock(
+ Preheader->getBasicBlock());
+ Preheader->getParent()->insert(++Preheader->getIterator(), NewBlock);
+ // Move the Br to it
+ Br->removeFromParent();
+ NewBlock->insert(NewBlock->end(), Br);
+ // And setup the successors correctly.
+ Preheader->replaceSuccessor(Br->getOperand(0).getMBB(), NewBlock);
+ NewBlock->addSuccessor(Br->getOperand(0).getMBB());
+
+ // Create a new DLS to replace the WLS
+ MachineInstrBuilder MIB =
+ BuildMI(*NewBlock, Br, WLS->getDebugLoc(),
+ TII->get(WLS->getOpcode() == ARM::t2WhileLoopStartTP
+ ? ARM::t2DoLoopStartTP
+ : ARM::t2DoLoopStart));
+ MIB.add(WLS->getOperand(0));
+ MIB.add(WLS->getOperand(1));
+ if (WLS->getOpcode() == ARM::t2WhileLoopStartTP)
+ MIB.add(WLS->getOperand(2));
+
+ LLVM_DEBUG(dbgs() << DEBUG_PREFIX
+ << "Reverting While Loop to Do Loop: " << *WLS << "\n");
+
+ RevertWhileLoopStartLR(WLS, TII, ARM::t2Bcc, true);
+
+ LivePhysRegs LiveRegs;
+ computeAndAddLiveIns(LiveRegs, *NewBlock);
+
+ Preheader->getParent()->RenumberBlocks();
+ BBUtils->computeAllBlockSizes();
+ BBUtils->adjustBBOffsetsAfter(Preheader);
+
+ return true;
+}
+
/// Checks if loop has a backwards branching WLS, and if possible, fixes it.
/// This requires checking the predecessor (i.e. the preheader or its predecessor)
/// for a WLS and if its loopExit/target is before it.
@@ -125,11 +187,10 @@ bool ARMBlockPlacement::fixBackwardsWLS(MachineLoop *ML) {
// TODO: Analyse the blocks to make a decision if it would be worth
// moving Preheader even if we'd introduce a backwards WLS
if (WLSTarget == Predecessor) {
- LLVM_DEBUG(
- dbgs() << DEBUG_PREFIX
- << "Can't move Predecessor"
- "block as it would convert a WLS from forward to a "
- "backwards branching WLS\n");
+ LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Can't move Predecessor block as "
+ << "it would convert a WLS from forward to a "
+ << "backwards branching WLS\n");
+ RevertedWhileLoops.push_back(WlsInstr);
return false;
}
}
@@ -162,11 +223,16 @@ bool ARMBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
BBUtils->computeAllBlockSizes();
BBUtils->adjustBBOffsetsAfter(&MF.front());
bool Changed = false;
+ RevertedWhileLoops.clear();
// Find loops with a backwards branching WLS and fix if possible.
for (auto *ML : *MLI)
Changed |= processPostOrderLoops(ML);
+ // Revert any While loops still out of range to DLS loops.
+ for (auto *WlsInstr : RevertedWhileLoops)
+ Changed |= revertWhileToDoLoop(WlsInstr);
+
return Changed;
}
@@ -199,18 +265,22 @@ void ARMBlockPlacement::moveBasicBlock(MachineBasicBlock *BB,
assert(From->isSuccessor(To) &&
"'To' is expected to be a successor of 'From'");
MachineInstr &Terminator = *(--From->terminators().end());
- if (!Terminator.isUnconditionalBranch()) {
- // The BB doesn't have an unconditional branch so it relied on
- // fall-through. Fix by adding an unconditional branch to the moved BB.
- MachineInstrBuilder MIB =
- BuildMI(From, Terminator.getDebugLoc(), TII->get(ARM::t2B));
- MIB.addMBB(To);
- MIB.addImm(ARMCC::CondCodes::AL);
- MIB.addReg(ARM::NoRegister);
- LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Adding unconditional branch from "
- << From->getName() << " to " << To->getName() << ": "
- << *MIB.getInstr());
- }
+ if (!TII->isPredicated(Terminator) &&
+ (isUncondBranchOpcode(Terminator.getOpcode()) ||
+ isIndirectBranchOpcode(Terminator.getOpcode()) ||
+ isJumpTableBranchOpcode(Terminator.getOpcode()) ||
+ Terminator.isReturn()))
+ return;
+ // The BB doesn't have an unconditional branch so it relied on
+ // fall-through. Fix by adding an unconditional branch to the moved BB.
+ MachineInstrBuilder MIB =
+ BuildMI(From, Terminator.getDebugLoc(), TII->get(ARM::t2B));
+ MIB.addMBB(To);
+ MIB.addImm(ARMCC::CondCodes::AL);
+ MIB.addReg(ARM::NoRegister);
+ LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Adding unconditional branch from "
+ << From->getName() << " to " << To->getName() << ": "
+ << *MIB.getInstr());
};
// Fix fall-through to the moved BB from the one that used to be before it.
@@ -225,5 +295,5 @@ void ARMBlockPlacement::moveBasicBlock(MachineBasicBlock *BB,
F->RenumberBlocks();
BBUtils->computeAllBlockSizes();
- BBUtils->adjustBBOffsetsAfter(&F->front());
+ BBUtils->adjustBBOffsetsAfter(BB);
}
diff --git a/llvm/lib/Target/ARM/ARMCallLowering.cpp b/llvm/lib/Target/ARM/ARMCallLowering.cpp
index aff7ec8d2ed6..81ec4d09a408 100644
--- a/llvm/lib/Target/ARM/ARMCallLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMCallLowering.cpp
@@ -45,6 +45,7 @@
#include <algorithm>
#include <cassert>
#include <cstdint>
+#include <functional>
#include <utility>
using namespace llvm;
@@ -109,7 +110,7 @@ struct ARMOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
}
void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign &VA) override {
+ CCValAssign VA) override {
assert(VA.isRegLoc() && "Value shouldn't be assigned to reg");
assert(VA.getLocReg() == PhysReg && "Assigning to the wrong reg?");
@@ -130,7 +131,8 @@ struct ARMOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
}
unsigned assignCustomValue(CallLowering::ArgInfo &Arg,
- ArrayRef<CCValAssign> VAs) override {
+ ArrayRef<CCValAssign> VAs,
+ std::function<void()> *Thunk) override {
assert(Arg.Regs.size() == 1 && "Can't handle multiple regs yet");
CCValAssign VA = VAs[0];
@@ -158,9 +160,15 @@ struct ARMOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
if (!IsLittle)
std::swap(NewRegs[0], NewRegs[1]);
+ if (Thunk) {
+ *Thunk = [=]() {
+ assignValueToReg(NewRegs[0], VA.getLocReg(), VA);
+ assignValueToReg(NewRegs[1], NextVA.getLocReg(), NextVA);
+ };
+ return 1;
+ }
assignValueToReg(NewRegs[0], VA.getLocReg(), VA);
assignValueToReg(NewRegs[1], NextVA.getLocReg(), NextVA);
-
return 1;
}
@@ -273,7 +281,7 @@ struct ARMIncomingValueHandler : public CallLowering::IncomingValueHandler {
}
void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign &VA) override {
+ CCValAssign VA) override {
assert(VA.isRegLoc() && "Value shouldn't be assigned to reg");
assert(VA.getLocReg() == PhysReg && "Assigning to the wrong reg?");
@@ -298,7 +306,8 @@ struct ARMIncomingValueHandler : public CallLowering::IncomingValueHandler {
}
unsigned assignCustomValue(ARMCallLowering::ArgInfo &Arg,
- ArrayRef<CCValAssign> VAs) override {
+ ArrayRef<CCValAssign> VAs,
+ std::function<void()> *Thunk) override {
assert(Arg.Regs.size() == 1 && "Can't handle multiple regs yet");
CCValAssign VA = VAs[0];
diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
index e15826fa6159..121558276c3e 100644
--- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -18,6 +18,7 @@
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MVETailPredUtils.h"
#include "Thumb2InstrInfo.h"
#include "Utils/ARMBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
@@ -340,12 +341,12 @@ LLVM_DUMP_METHOD void ARMConstantIslands::dumpBBs() {
// Align blocks where the previous block does not fall through. This may add
// extra NOP's but they will not be executed. It uses the PrefLoopAlignment as a
// measure of how much to align, and only runs at CodeGenOpt::Aggressive.
-static bool AlignBlocks(MachineFunction *MF) {
+static bool AlignBlocks(MachineFunction *MF, const ARMSubtarget *STI) {
if (MF->getTarget().getOptLevel() != CodeGenOpt::Aggressive ||
MF->getFunction().hasOptSize())
return false;
- auto *TLI = MF->getSubtarget().getTargetLowering();
+ auto *TLI = STI->getTargetLowering();
const Align Alignment = TLI->getPrefLoopAlignment();
if (Alignment < 4)
return false;
@@ -357,7 +358,25 @@ static bool AlignBlocks(MachineFunction *MF) {
Changed = true;
MBB.setAlignment(Alignment);
}
+
PrevCanFallthough = MBB.canFallThrough();
+
+ // For LOB's, the ARMLowOverheadLoops pass may remove the unconditional
+ // branch later in the pipeline.
+ if (STI->hasLOB()) {
+ for (const auto &MI : reverse(MBB.terminators())) {
+ if (MI.getOpcode() == ARM::t2B &&
+ MI.getOperand(0).getMBB() == MBB.getNextNode())
+ continue;
+ if (isLoopStart(MI) || MI.getOpcode() == ARM::t2LoopEnd ||
+ MI.getOpcode() == ARM::t2LoopEndDec) {
+ PrevCanFallthough = true;
+ break;
+ }
+ // Any other terminator - nothing to do
+ break;
+ }
+ }
}
return Changed;
@@ -406,7 +425,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
}
// Align any non-fallthrough blocks
- MadeChange |= AlignBlocks(MF);
+ MadeChange |= AlignBlocks(MF, STI);
// Perform the initial placement of the constant pool entries. To start with,
// we put them all at the end of the function.
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 2167ad5d7467..a8f09969e948 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -69,6 +69,7 @@ namespace {
void ExpandLaneOp(MachineBasicBlock::iterator &MBBI);
void ExpandVTBL(MachineBasicBlock::iterator &MBBI,
unsigned Opc, bool IsExt);
+ void ExpandMQQPRLoadStore(MachineBasicBlock::iterator &MBBI);
void ExpandMOV32BitImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI);
void CMSEClearGPRegs(MachineBasicBlock &MBB,
@@ -887,6 +888,43 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
LLVM_DEBUG(dbgs() << "To: "; MIB.getInstr()->dump(););
}
+void ARMExpandPseudo::ExpandMQQPRLoadStore(MachineBasicBlock::iterator &MBBI) {
+ MachineInstr &MI = *MBBI;
+ MachineBasicBlock &MBB = *MI.getParent();
+ unsigned NewOpc =
+ MI.getOpcode() == ARM::MQQPRStore || MI.getOpcode() == ARM::MQQQQPRStore
+ ? ARM::VSTMDIA
+ : ARM::VLDMDIA;
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc));
+
+ unsigned Flags = getKillRegState(MI.getOperand(0).isKill()) |
+ getDefRegState(MI.getOperand(0).isDef());
+ Register SrcReg = MI.getOperand(0).getReg();
+
+ // Copy the destination register.
+ MIB.add(MI.getOperand(1));
+ MIB.add(predOps(ARMCC::AL));
+ MIB.addReg(TRI->getSubReg(SrcReg, ARM::dsub_0), Flags);
+ MIB.addReg(TRI->getSubReg(SrcReg, ARM::dsub_1), Flags);
+ MIB.addReg(TRI->getSubReg(SrcReg, ARM::dsub_2), Flags);
+ MIB.addReg(TRI->getSubReg(SrcReg, ARM::dsub_3), Flags);
+ if (MI.getOpcode() == ARM::MQQQQPRStore ||
+ MI.getOpcode() == ARM::MQQQQPRLoad) {
+ MIB.addReg(TRI->getSubReg(SrcReg, ARM::dsub_4), Flags);
+ MIB.addReg(TRI->getSubReg(SrcReg, ARM::dsub_5), Flags);
+ MIB.addReg(TRI->getSubReg(SrcReg, ARM::dsub_6), Flags);
+ MIB.addReg(TRI->getSubReg(SrcReg, ARM::dsub_7), Flags);
+ }
+
+ if (NewOpc == ARM::VSTMDIA)
+ MIB.addReg(SrcReg, RegState::Implicit);
+
+ TransferImpOps(MI, MIB, MIB);
+ MIB.cloneMemRefs(MI);
+ MI.eraseFromParent();
+}
+
static bool IsAnAddressOperand(const MachineOperand &MO) {
// This check is overly conservative. Unless we are certain that the machine
// operand is not a symbol reference, we return that it is a symbol reference.
@@ -1295,7 +1333,7 @@ void ARMExpandPseudo::CMSESaveClearFPRegs(
const LivePhysRegs &LiveRegs, SmallVectorImpl<unsigned> &ScratchRegs) {
if (STI->hasV8_1MMainlineOps())
CMSESaveClearFPRegsV81(MBB, MBBI, DL, LiveRegs);
- else
+ else if (STI->hasV8MMainlineOps())
CMSESaveClearFPRegsV8(MBB, MBBI, DL, LiveRegs, ScratchRegs);
}
@@ -1303,8 +1341,6 @@ void ARMExpandPseudo::CMSESaveClearFPRegs(
void ARMExpandPseudo::CMSESaveClearFPRegsV8(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL,
const LivePhysRegs &LiveRegs, SmallVectorImpl<unsigned> &ScratchRegs) {
- if (!STI->hasFPRegs())
- return;
// Store an available register for FPSCR clearing
assert(!ScratchRegs.empty());
@@ -1358,7 +1394,11 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV8(
bool passesFPReg = (!NonclearedFPRegs.empty() || !ClearedFPRegs.empty());
- // Lazy store all fp registers to the stack
+ if (passesFPReg)
+ assert(STI->hasFPRegs() && "Subtarget needs fpregs");
+
+ // Lazy store all fp registers to the stack.
+ // This executes as NOP in the absence of floating-point support.
MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
.addReg(ARM::SP)
.add(predOps(ARMCC::AL));
@@ -1486,15 +1526,18 @@ void ARMExpandPseudo::CMSERestoreFPRegs(
SmallVectorImpl<unsigned> &AvailableRegs) {
if (STI->hasV8_1MMainlineOps())
CMSERestoreFPRegsV81(MBB, MBBI, DL, AvailableRegs);
- else
+ else if (STI->hasV8MMainlineOps())
CMSERestoreFPRegsV8(MBB, MBBI, DL, AvailableRegs);
}
void ARMExpandPseudo::CMSERestoreFPRegsV8(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL,
SmallVectorImpl<unsigned> &AvailableRegs) {
- if (!STI->hasFPRegs())
- return;
+
+ // Keep a scratch register for the mitigation sequence.
+ unsigned ScratchReg = ARM::NoRegister;
+ if (STI->fixCMSE_CVE_2021_35465())
+ ScratchReg = AvailableRegs.pop_back_val();
// Use AvailableRegs to store the fp regs
std::vector<std::tuple<unsigned, unsigned, unsigned>> ClearedFPRegs;
@@ -1536,24 +1579,64 @@ void ARMExpandPseudo::CMSERestoreFPRegsV8(
}
}
+ bool returnsFPReg = (!NonclearedFPRegs.empty() || !ClearedFPRegs.empty());
+
+ if (returnsFPReg)
+ assert(STI->hasFPRegs() && "Subtarget needs fpregs");
+
// Push FP regs that cannot be restored via normal registers on the stack
for (unsigned Reg : NonclearedFPRegs) {
if (ARM::DPR_VFP2RegClass.contains(Reg))
- BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTRD), Reg)
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTRD))
+ .addReg(Reg)
.addReg(ARM::SP)
.addImm((Reg - ARM::D0) * 2)
.add(predOps(ARMCC::AL));
else if (ARM::SPRRegClass.contains(Reg))
- BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTRS), Reg)
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTRS))
+ .addReg(Reg)
.addReg(ARM::SP)
.addImm(Reg - ARM::S0)
.add(predOps(ARMCC::AL));
}
- // Lazy load fp regs from stack
- BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
- .addReg(ARM::SP)
- .add(predOps(ARMCC::AL));
+ // Lazy load fp regs from stack.
+ // This executes as NOP in the absence of floating-point support.
+ MachineInstrBuilder VLLDM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
+
+ if (STI->fixCMSE_CVE_2021_35465()) {
+ auto Bundler = MIBundleBuilder(MBB, VLLDM);
+ // Read the CONTROL register.
+ Bundler.append(BuildMI(*MBB.getParent(), DL, TII->get(ARM::t2MRS_M))
+ .addReg(ScratchReg, RegState::Define)
+ .addImm(20)
+ .add(predOps(ARMCC::AL)));
+ // Check bit 3 (SFPA).
+ Bundler.append(BuildMI(*MBB.getParent(), DL, TII->get(ARM::t2TSTri))
+ .addReg(ScratchReg)
+ .addImm(8)
+ .add(predOps(ARMCC::AL)));
+ // Emit the IT block.
+ Bundler.append(BuildMI(*MBB.getParent(), DL, TII->get(ARM::t2IT))
+ .addImm(ARMCC::NE)
+ .addImm(8));
+ // If SFPA is clear jump over to VLLDM, otherwise execute an instruction
+ // which has no functional effect apart from causing context creation:
+ // vmovne s0, s0. In the absence of FPU we emit .inst.w 0xeeb00a40,
+ // which is defined as NOP if not executed.
+ if (STI->hasFPRegs())
+ Bundler.append(BuildMI(*MBB.getParent(), DL, TII->get(ARM::VMOVS))
+ .addReg(ARM::S0, RegState::Define)
+ .addReg(ARM::S0, RegState::Undef)
+ .add(predOps(ARMCC::NE)));
+ else
+ Bundler.append(BuildMI(*MBB.getParent(), DL, TII->get(ARM::INLINEASM))
+ .addExternalSymbol(".inst.w 0xeeb00a40")
+ .addImm(InlineAsm::Extra_HasSideEffects));
+ finalizeBundle(MBB, Bundler.begin(), Bundler.end());
+ }
// Restore all FP registers via normal registers
for (const auto &Regs : ClearedFPRegs) {
@@ -1594,6 +1677,12 @@ void ARMExpandPseudo::CMSERestoreFPRegsV81(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL,
SmallVectorImpl<unsigned> &AvailableRegs) {
if (!definesOrUsesFPReg(*MBBI)) {
+ if (STI->fixCMSE_CVE_2021_35465()) {
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VSCCLRMS))
+ .add(predOps(ARMCC::AL))
+ .addReg(ARM::VPR, RegState::Define);
+ }
+
// Load FP registers from stack.
BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
.addReg(ARM::SP)
@@ -1647,7 +1736,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
"CMP_SWAP not expected to be custom expanded for Thumb1");
assert((UxtOp == 0 || UxtOp == ARM::tUXTB || UxtOp == ARM::tUXTH) &&
"ARMv8-M.baseline does not have t2UXTB/t2UXTH");
- assert(ARM::tGPRRegClass.contains(DesiredReg) &&
+ assert((UxtOp == 0 || ARM::tGPRRegClass.contains(DesiredReg)) &&
"DesiredReg used for UXT op must be tGPR");
}
@@ -2916,6 +3005,13 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true;
case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true); return true;
+ case ARM::MQQPRLoad:
+ case ARM::MQQPRStore:
+ case ARM::MQQQQPRLoad:
+ case ARM::MQQQQPRStore:
+ ExpandMQQPRLoadStore(MBBI);
+ return true;
+
case ARM::tCMP_SWAP_8:
assert(STI->isThumb());
return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXB, ARM::t2STREXB, ARM::tUXTB,
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 9c7055deaaf8..2b83a292db76 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -79,6 +79,10 @@ public:
void Select(SDNode *N) override;
+ /// Return true as some complex patterns, like those that call
+ /// canExtractShiftFromMul can modify the DAG inplace.
+ bool ComplexPatternFuncMutatesDAG() const override { return true; }
+
bool hasNoVMLxHazardUse(SDNode *N) const;
bool isShifterOpProfitable(const SDValue &Shift,
ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt);
@@ -406,11 +410,9 @@ void ARMDAGToDAGISel::PreprocessISelDAG() {
return;
bool isThumb2 = Subtarget->isThumb();
- for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
- E = CurDAG->allnodes_end(); I != E; ) {
- SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
-
- if (N->getOpcode() != ISD::ADD)
+ // We use make_early_inc_range to avoid invalidation issues.
+ for (SDNode &N : llvm::make_early_inc_range(CurDAG->allnodes())) {
+ if (N.getOpcode() != ISD::ADD)
continue;
// Look for (add X1, (and (srl X2, c1), c2)) where c2 is constant with
@@ -422,8 +424,8 @@ void ARMDAGToDAGISel::PreprocessISelDAG() {
// operand of 'add' and the 'and' and 'srl' would become a bits extraction
// node (UBFX).
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
unsigned And_imm = 0;
if (!isOpcWithIntImmediate(N1.getNode(), ISD::AND, And_imm)) {
if (isOpcWithIntImmediate(N0.getNode(), ISD::AND, And_imm))
@@ -480,7 +482,7 @@ void ARMDAGToDAGISel::PreprocessISelDAG() {
CurDAG->getConstant(And_imm, SDLoc(Srl), MVT::i32));
N1 = CurDAG->getNode(ISD::SHL, SDLoc(N1), MVT::i32,
N1, CurDAG->getConstant(TZ, SDLoc(Srl), MVT::i32));
- CurDAG->UpdateNodeOperands(N, N0, N1);
+ CurDAG->UpdateNodeOperands(&N, N0, N1);
}
}
@@ -1121,7 +1123,7 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeRRSext(SDValue N, SDValue &Base,
SDValue &Offset) {
if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N)) {
ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N);
- if (!NC || !NC->isNullValue())
+ if (!NC || !NC->isZero())
return false;
Base = Offset = N;
@@ -1818,8 +1820,11 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) {
else
return false;
- SDValue Ops[] = {Base, NewOffset,
- CurDAG->getTargetConstant(Pred, SDLoc(N), MVT::i32), PredReg,
+ SDValue Ops[] = {Base,
+ NewOffset,
+ CurDAG->getTargetConstant(Pred, SDLoc(N), MVT::i32),
+ PredReg,
+ CurDAG->getRegister(0, MVT::i32), // tp_reg
Chain};
SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
N->getValueType(0), MVT::Other, Ops);
@@ -2525,6 +2530,7 @@ void ARMDAGToDAGISel::AddMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc,
SDValue PredicateMask) {
Ops.push_back(CurDAG->getTargetConstant(ARMVCC::Then, Loc, MVT::i32));
Ops.push_back(PredicateMask);
+ Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // tp_reg
}
template <typename SDValueVector>
@@ -2533,6 +2539,7 @@ void ARMDAGToDAGISel::AddMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc,
SDValue Inactive) {
Ops.push_back(CurDAG->getTargetConstant(ARMVCC::Then, Loc, MVT::i32));
Ops.push_back(PredicateMask);
+ Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // tp_reg
Ops.push_back(Inactive);
}
@@ -2540,6 +2547,7 @@ template <typename SDValueVector>
void ARMDAGToDAGISel::AddEmptyMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc) {
Ops.push_back(CurDAG->getTargetConstant(ARMVCC::None, Loc, MVT::i32));
Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+ Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // tp_reg
}
template <typename SDValueVector>
@@ -2547,6 +2555,7 @@ void ARMDAGToDAGISel::AddEmptyMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc,
EVT InactiveTy) {
Ops.push_back(CurDAG->getTargetConstant(ARMVCC::None, Loc, MVT::i32));
Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+ Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // tp_reg
Ops.push_back(SDValue(
CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, Loc, InactiveTy), 0));
}
@@ -3545,7 +3554,7 @@ void ARMDAGToDAGISel::SelectCMPZ(SDNode *N, bool &SwitchEQNEToPLMI) {
return;
SDValue Zero = N->getOperand(1);
- if (!isa<ConstantSDNode>(Zero) || !cast<ConstantSDNode>(Zero)->isNullValue() ||
+ if (!isa<ConstantSDNode>(Zero) || !cast<ConstantSDNode>(Zero)->isZero() ||
And->getOpcode() != ISD::AND)
return;
SDValue X = And.getOperand(0);
@@ -5495,8 +5504,8 @@ static int getARClassRegisterMask(StringRef Reg, StringRef Flags) {
// using the supplied metadata string to select the instruction node to use
// and the registers/masks to construct as operands for the node.
bool ARMDAGToDAGISel::tryReadRegister(SDNode *N){
- const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
- const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
+ const auto *MD = cast<MDNodeSDNode>(N->getOperand(1));
+ const auto *RegString = cast<MDString>(MD->getMD()->getOperand(0));
bool IsThumb2 = Subtarget->isThumb2();
SDLoc DL(N);
@@ -5610,8 +5619,8 @@ bool ARMDAGToDAGISel::tryReadRegister(SDNode *N){
// using the supplied metadata string to select the instruction node to use
// and the registers/masks to use in the nodes
bool ARMDAGToDAGISel::tryWriteRegister(SDNode *N){
- const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
- const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
+ const auto *MD = cast<MDNodeSDNode>(N->getOperand(1));
+ const auto *RegString = cast<MDString>(MD->getMD()->getOperand(0));
bool IsThumb2 = Subtarget->isThumb2();
SDLoc DL(N);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 900113244e41..e7e10ce07a44 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -55,6 +55,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -300,6 +301,9 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::UINT_TO_FP, VT, Expand);
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+ } else {
+ setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
}
// Pre and Post inc are supported on loads and stores
@@ -544,6 +548,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
setLibcallName(RTLIB::MUL_I128, nullptr);
+ setLibcallName(RTLIB::MULO_I64, nullptr);
+ setLibcallName(RTLIB::MULO_I128, nullptr);
// RTLIB
if (Subtarget->isAAPCS_ABI() &&
@@ -741,6 +747,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
Subtarget->hasFPRegs()) {
addRegisterClass(MVT::f32, &ARM::SPRRegClass);
addRegisterClass(MVT::f64, &ARM::DPRRegClass);
+
+ setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
+
if (!Subtarget->hasVFP2Base())
setAllExpand(MVT::f32);
if (!Subtarget->hasFP64())
@@ -981,6 +993,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
setTargetDAGCombine(ISD::BUILD_VECTOR);
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+ setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
@@ -1851,12 +1864,18 @@ ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
// v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
// load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
// MVE Q registers.
- if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
+ if (Subtarget->hasNEON()) {
if (VT == MVT::v4i64)
return &ARM::QQPRRegClass;
if (VT == MVT::v8i64)
return &ARM::QQQQPRRegClass;
}
+ if (Subtarget->hasMVEIntegerOps()) {
+ if (VT == MVT::v4i64)
+ return &ARM::MQQPRRegClass;
+ if (VT == MVT::v8i64)
+ return &ARM::MQQQQPRRegClass;
+ }
return TargetLowering::getRegClassFor(VT);
}
@@ -2287,7 +2306,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool PreferIndirect = false;
// Determine whether this is a non-secure function call.
- if (CLI.CB && CLI.CB->getAttributes().hasFnAttribute("cmse_nonsecure_call"))
+ if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
isCmseNSCall = true;
// Disable tail calls if they're not supported.
@@ -3259,26 +3278,24 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
SDNode *VMov = Copy;
// f64 returned in a pair of GPRs.
SmallPtrSet<SDNode*, 2> Copies;
- for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
- UI != UE; ++UI) {
- if (UI->getOpcode() != ISD::CopyToReg)
+ for (SDNode *U : VMov->uses()) {
+ if (U->getOpcode() != ISD::CopyToReg)
return false;
- Copies.insert(*UI);
+ Copies.insert(U);
}
if (Copies.size() > 2)
return false;
- for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
- UI != UE; ++UI) {
- SDValue UseChain = UI->getOperand(0);
+ for (SDNode *U : VMov->uses()) {
+ SDValue UseChain = U->getOperand(0);
if (Copies.count(UseChain.getNode()))
// Second CopyToReg
- Copy = *UI;
+ Copy = U;
else {
// We are at the top of this chain.
// If the copy has a glue operand, we conservatively assume it
// isn't safe to perform a tail call.
- if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
+ if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
return false;
// First CopyToReg
TCChain = UseChain;
@@ -3301,10 +3318,9 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
}
bool HasRet = false;
- for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
- UI != UE; ++UI) {
- if (UI->getOpcode() != ARMISD::RET_FLAG &&
- UI->getOpcode() != ARMISD::INTRET_FLAG)
+ for (const SDNode *U : Copy->uses()) {
+ if (U->getOpcode() != ARMISD::RET_FLAG &&
+ U->getOpcode() != ARMISD::INTRET_FLAG)
return false;
HasRet = true;
}
@@ -3782,7 +3798,7 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- if (!(GV = GA->getBaseObject()))
+ if (!(GV = GA->getAliaseeObject()))
return false;
if (const auto *V = dyn_cast<GlobalVariable>(GV))
return V->isConstant();
@@ -4517,7 +4533,7 @@ SDValue ARMTargetLowering::LowerFormalArguments(
InVals.push_back(ArgValue);
} else { // VA.isRegLoc()
- // sanity check
+ // Only arguments passed on the stack should make it here.
assert(VA.isMemLoc());
assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
@@ -5811,6 +5827,43 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
return Op;
}
+static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ EVT VT = Op.getValueType();
+ EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ EVT FromVT = Op.getOperand(0).getValueType();
+
+ if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
+ return Op;
+ if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
+ Subtarget->hasFP64())
+ return Op;
+ if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
+ Subtarget->hasFullFP16())
+ return Op;
+ if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
+ Subtarget->hasMVEFloatOps())
+ return Op;
+ if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
+ Subtarget->hasMVEFloatOps())
+ return Op;
+
+ if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
+ return SDValue();
+
+ SDLoc DL(Op);
+ bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
+ unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
+ SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
+ DAG.getValueType(VT.getScalarType()));
+ SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
+ DAG.getConstant((1 << BW) - 1, DL, VT));
+ if (IsSigned)
+ Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
+ DAG.getConstant(-(1 << BW), DL, VT));
+ return Max;
+}
+
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
SDLoc dl(Op);
@@ -7660,7 +7713,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
unsigned SplatBitSize;
bool HasAnyUndefs;
if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
- if (SplatUndef.isAllOnesValue())
+ if (SplatUndef.isAllOnes())
return DAG.getUNDEF(VT);
if ((ST->hasNEON() && SplatBitSize <= 64) ||
@@ -8052,7 +8105,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
Src.WindowBase *= Src.WindowScale;
}
- // Final sanity check before we try to actually produce a shuffle.
+ // Final check before we try to actually produce a shuffle.
LLVM_DEBUG(for (auto Src
: Sources)
assert(Src.ShuffleVec.getValueType() == ShuffleVT););
@@ -8175,7 +8228,7 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
isVTBLMask(M, VT) ||
isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
return true;
- else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
+ else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
isReverseMask(M, VT))
return true;
else if (Subtarget->hasMVEIntegerOps() &&
@@ -8268,21 +8321,23 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
}
-static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
- SelectionDAG &DAG) {
+static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
- SDValue OpLHS = Op.getOperand(0);
- EVT VT = OpLHS.getValueType();
+ EVT VT = Op.getValueType();
- assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
+ assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
"Expect an v8i16/v16i8 type");
- OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
- // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now,
+ SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
+ // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
// extract the first 8 bytes into the top double word and the last 8 bytes
- // into the bottom double word. The v8i16 case is similar.
- unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
- return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
- DAG.getConstant(ExtractNum, DL, MVT::i32));
+ // into the bottom double word, through a new vector shuffle that will be
+ // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
+ std::vector<int> NewMask;
+ for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
+ NewMask.push_back(VT.getVectorNumElements() / 2 + i);
+ for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
+ NewMask.push_back(i);
+ return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
}
static EVT getVectorTyFromPredicateVector(EVT VT) {
@@ -8704,8 +8759,9 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
}
- if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
- return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
+ if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
+ isReverseMask(ShuffleMask, VT))
+ return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
if (ST->hasNEON() && VT == MVT::v8i8)
if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
@@ -8822,54 +8878,68 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) {
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
SDLoc dl(Op);
- EVT VT = Op.getValueType();
- EVT Op1VT = V1.getValueType();
- EVT Op2VT = V2.getValueType();
- unsigned NumElts = VT.getVectorNumElements();
-
- assert(Op1VT == Op2VT && "Operand types don't match!");
- assert(VT.getScalarSizeInBits() == 1 &&
+ assert(Op.getValueType().getScalarSizeInBits() == 1 &&
+ "Unexpected custom CONCAT_VECTORS lowering");
+ assert(isPowerOf2_32(Op.getNumOperands()) &&
"Unexpected custom CONCAT_VECTORS lowering");
assert(ST->hasMVEIntegerOps() &&
"CONCAT_VECTORS lowering only supported for MVE");
- SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
- SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
-
- // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
- // promoted to v8i16, etc.
-
- MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
-
- // Extract the vector elements from Op1 and Op2 one by one and truncate them
- // to be the right size for the destination. For example, if Op1 is v4i1 then
- // the promoted vector is v4i32. The result of concatentation gives a v8i1,
- // which when promoted is v8i16. That means each i32 element from Op1 needs
- // truncating to i16 and inserting in the result.
- EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
- SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
- auto ExractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
- EVT NewVT = NewV.getValueType();
- EVT ConcatVT = ConVec.getValueType();
- for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
- SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
- DAG.getIntPtrConstant(i, dl));
- ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
- DAG.getConstant(j, dl, MVT::i32));
- }
- return ConVec;
+ auto ConcatPair = [&](SDValue V1, SDValue V2) {
+ EVT Op1VT = V1.getValueType();
+ EVT Op2VT = V2.getValueType();
+ assert(Op1VT == Op2VT && "Operand types don't match!");
+ EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
+
+ SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
+ SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
+
+ // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
+ // promoted to v8i16, etc.
+ MVT ElType =
+ getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
+ unsigned NumElts = 2 * Op1VT.getVectorNumElements();
+
+ // Extract the vector elements from Op1 and Op2 one by one and truncate them
+ // to be the right size for the destination. For example, if Op1 is v4i1
+    // then the promoted vector is v4i32. The result of concatenation gives a
+ // v8i1, which when promoted is v8i16. That means each i32 element from Op1
+ // needs truncating to i16 and inserting in the result.
+ EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
+ SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
+ auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
+ EVT NewVT = NewV.getValueType();
+ EVT ConcatVT = ConVec.getValueType();
+ for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
+ DAG.getIntPtrConstant(i, dl));
+ ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
+ DAG.getConstant(j, dl, MVT::i32));
+ }
+ return ConVec;
+ };
+ unsigned j = 0;
+ ConVec = ExtractInto(NewV1, ConVec, j);
+ ConVec = ExtractInto(NewV2, ConVec, j);
+
+ // Now return the result of comparing the subvector with zero,
+ // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
+ return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
+ DAG.getConstant(ARMCC::NE, dl, MVT::i32));
};
- unsigned j = 0;
- ConVec = ExractInto(NewV1, ConVec, j);
- ConVec = ExractInto(NewV2, ConVec, j);
- // Now return the result of comparing the subvector with zero,
- // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
- return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
- DAG.getConstant(ARMCC::NE, dl, MVT::i32));
+ // Concat each pair of subvectors and pack into the lower half of the array.
+ SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
+ while (ConcatOps.size() > 1) {
+ for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
+ SDValue V1 = ConcatOps[I];
+ SDValue V2 = ConcatOps[I + 1];
+ ConcatOps[I / 2] = ConcatPair(V1, V2);
+ }
+ ConcatOps.resize(ConcatOps.size() / 2);
+ }
+ return ConcatOps[0];
}
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
@@ -9069,7 +9139,7 @@ static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
return true;
} else {
- if (Hi0->isNullValue() && Hi1->isNullValue())
+ if (Hi0->isZero() && Hi1->isZero())
return true;
}
return false;
@@ -10140,6 +10210,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::STRICT_FP_TO_UINT:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
@@ -10326,6 +10398,10 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::ZERO_EXTEND:
Res = LowerVectorExtend(N, DAG, Subtarget);
break;
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
+ break;
}
if (Res.getNode())
Results.push_back(Res);
@@ -10877,10 +10953,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
static
MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
- for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
- E = MBB->succ_end(); I != E; ++I)
- if (*I != Succ)
- return *I;
+ for (MachineBasicBlock *S : MBB->successors())
+ if (S != Succ)
+ return S;
llvm_unreachable("Expecting a BB with two successors!");
}
@@ -11378,13 +11453,9 @@ static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
// If we hit the end of the block, check whether CPSR is live into a
// successor.
if (miI == BB->end()) {
- for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
- sEnd = BB->succ_end();
- sItr != sEnd; ++sItr) {
- MachineBasicBlock* succ = *sItr;
- if (succ->isLiveIn(ARM::CPSR))
+ for (MachineBasicBlock *Succ : BB->successors())
+ if (Succ->isLiveIn(ARM::CPSR))
return false;
- }
}
// We found a def, or hit the end of the basic block and CPSR wasn't live
@@ -11487,6 +11558,7 @@ static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
.addUse(PredCounterPhiReg)
.addImm(ARMVCC::None)
+ .addReg(0)
.addReg(0);
BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
@@ -11505,7 +11577,8 @@ static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
.addReg(SrcPhiReg)
.addImm(16)
.addImm(ARMVCC::Then)
- .addUse(VccrReg);
+ .addUse(VccrReg)
+ .addReg(0);
} else
SrcValueReg = OpSrcReg;
@@ -11515,7 +11588,8 @@ static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
.addReg(DestPhiReg)
.addImm(16)
.addImm(ARMVCC::Then)
- .addUse(VccrReg);
+ .addUse(VccrReg)
+ .addReg(0);
// Add the pseudoInstrs for decrementing the loop counter and marking the
// end:t2DoLoopDec and t2DoLoopEnd
@@ -12103,8 +12177,7 @@ static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
// When looking for a 0 constant, N can be zext or sext.
OtherOp = DAG.getConstant(1, dl, VT);
else
- OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
- VT);
+ OtherOp = DAG.getAllOnesConstant(dl, VT);
return true;
}
}
@@ -12696,7 +12769,7 @@ static SDValue PerformAddcSubcCombine(SDNode *N,
const ARMSubtarget *Subtarget) {
SelectionDAG &DAG(DCI.DAG);
- if (N->getOpcode() == ARMISD::SUBC) {
+ if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
// (SUBC (ADDE 0, 0, C), 1) -> C
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
@@ -12868,6 +12941,9 @@ static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
SDValue Shft;
ConstantSDNode *Clamp;
+ if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
+ return SDValue();
+
if (N->getOpcode() == ISD::SMIN) {
Shft = N->getOperand(0);
Clamp = isConstOrConstSplat(N->getOperand(1));
@@ -13008,19 +13084,15 @@ static SDValue PerformVSELECTCombine(SDNode *N,
}
static SDValue PerformABSCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- const ARMSubtarget *Subtarget) {
- SDValue res;
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0)))
return SDValue();
- if (!TLI.expandABS(N, res, DAG))
- return SDValue();
-
- return res;
+ return TLI.expandABS(N, DAG);
}
/// PerformADDECombine - Target-specific dag combine transform from
@@ -13064,13 +13136,166 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
return SDValue();
}
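+// Try to reassociate ADDs involving vector reductions (vecreduce_add, VADDV,
+// VMLAV) so that each reduction is added to a scalar operand, which maps well
+// onto the accumulating VADDVA/VMLAVA forms, and so that reductions of
+// adjacent loads are combined in ascending load-offset order.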
+static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc dl(N);
+
+ auto IsVecReduce = [](SDValue Op) {
+ switch (Op.getOpcode()) {
+ case ISD::VECREDUCE_ADD:
+ case ARMISD::VADDVs:
+ case ARMISD::VADDVu:
+ case ARMISD::VMLAVs:
+ case ARMISD::VMLAVu:
+ return true;
+ }
+ return false;
+ };
+
+ auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
+ // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
+ // add(add(X, vecreduce(Y)), vecreduce(Z))
+ // to make better use of vaddva style instructions.
+ if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
+ IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
+ !isa<ConstantSDNode>(N0)) {
+ SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
+ return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
+ }
+ // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
+ // add(add(add(A, C), reduce(B)), reduce(D))
+ if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
+ N1.getOpcode() == ISD::ADD) {
+ unsigned N0RedOp = 0;
+ if (!IsVecReduce(N0.getOperand(N0RedOp))) {
+ N0RedOp = 1;
+ if (!IsVecReduce(N0.getOperand(N0RedOp)))
+ return SDValue();
+ }
+
+ unsigned N1RedOp = 0;
+ if (!IsVecReduce(N1.getOperand(N1RedOp)))
+ N1RedOp = 1;
+ if (!IsVecReduce(N1.getOperand(N1RedOp)))
+ return SDValue();
+
+ SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
+ N1.getOperand(1 - N1RedOp));
+ SDValue Add1 =
+ DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
+ return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
+ }
+ return SDValue();
+ };
+ if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
+ return R;
+ if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
+ return R;
+
+ // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
+ // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
+ // by ascending load offsets. This can help cores prefetch if the order of
+ // loads is more predictable.
+ auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
+ // Check if two reductions are known to load data where one is before/after
+ // another. Return negative if N0 loads data before N1, positive if N1 is
+    // before N0, and 0 if nothing is known.
+ auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
+ // Look through to the first operand of a MUL, for the VMLA case.
+ // Currently only looks at the first operand, in the hope they are equal.
+ if (N0.getOpcode() == ISD::MUL)
+ N0 = N0.getOperand(0);
+ if (N1.getOpcode() == ISD::MUL)
+ N1 = N1.getOperand(0);
+
+      // Check whether the two operands are loads to the same object and, if
+      // so, how their known offsets compare.
+ LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
+ LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
+ if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
+ !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
+ Load1->isIndexed())
+ return 0;
+
+ auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
+ auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
+
+ if (!BaseLocDecomp0.getBase() ||
+ BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
+ !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
+ return 0;
+ if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
+ return -1;
+ if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
+ return 1;
+ return 0;
+ };
+
+ SDValue X;
+ if (N0.getOpcode() == ISD::ADD) {
+ if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
+ int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
+ N0.getOperand(1).getOperand(0));
+ if (IsBefore < 0) {
+ X = N0.getOperand(0);
+ N0 = N0.getOperand(1);
+ } else if (IsBefore > 0) {
+ X = N0.getOperand(1);
+ N0 = N0.getOperand(0);
+ } else
+ return SDValue();
+ } else if (IsVecReduce(N0.getOperand(0))) {
+ X = N0.getOperand(1);
+ N0 = N0.getOperand(0);
+ } else if (IsVecReduce(N0.getOperand(1))) {
+ X = N0.getOperand(0);
+ N0 = N0.getOperand(1);
+ } else
+ return SDValue();
+ } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
+ IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
+      // Note this is backwards from what you would expect. We create
+      // add(reduce(load + 16), reduce(load + 0)) so that the
+      // add(reduce(load+16), X) is combined into VADDVA(X, load+16), leaving
+      // the X as VADDV(load + 0).
+ return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
+ } else
+ return SDValue();
+
+ if (!IsVecReduce(N0) || !IsVecReduce(N1))
+ return SDValue();
+
+ if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
+ return SDValue();
+
+ // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
+ SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
+ return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
+ };
+ if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
+ return R;
+ if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
+ return R;
+ return SDValue();
+}
+
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) {
- if (!Subtarget->hasMVEIntegerOps() || N->getValueType(0) != MVT::i64)
+ if (!Subtarget->hasMVEIntegerOps())
return SDValue();
+ if (SDValue R = TryDistrubutionADDVecReduce(N, DAG))
+ return R;
+
+ EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ SDLoc dl(N);
+
+ if (VT != MVT::i64)
+ return SDValue();
// We are looking for a i64 add of a VADDLVx. Due to these being i64's, this
// will look like:
@@ -13090,7 +13315,6 @@ static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
return SDValue();
- SDLoc dl(N);
if (VecRed->getOpcode() == OpcodeA) {
// add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
@@ -14732,6 +14956,7 @@ PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
SDValue Op0 = Ext.getOperand(0);
EVT VecVT = Op0.getValueType();
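+  // Remember which result of Op0 this extract reads, so that it is only
+  // paired with an extract of lane+1 from the same result value below.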
+ unsigned ResNo = Op0.getResNo();
unsigned Lane = Ext.getConstantOperandVal(1);
if (VecVT.getVectorNumElements() != 4)
return SDValue();
@@ -14740,7 +14965,8 @@ PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
auto OtherIt = find_if(Op0->uses(), [&](SDNode *V) {
return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(V->getOperand(1)) &&
- V->getConstantOperandVal(1) == Lane + 1;
+ V->getConstantOperandVal(1) == Lane + 1 &&
+ V->getOperand(0).getResNo() == ResNo;
});
if (OtherIt == Op0->uses().end())
return SDValue();
@@ -14884,6 +15110,47 @@ static SDValue FlattenVectorShuffle(ShuffleVectorSDNode *N, SelectionDAG &DAG) {
Op0->getOperand(0), Op1->getOperand(0));
}
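+// Fold a legal fixed-width INSERT_SUBVECTOR of a half-width subvector at an
+// aligned (low or high) position into a CONCAT_VECTORS of the subvector and
+// the untouched half of the original vector.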
+static SDValue
+PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue Vec = N->getOperand(0);
+ SDValue SubVec = N->getOperand(1);
+ uint64_t IdxVal = N->getConstantOperandVal(2);
+ EVT VecVT = Vec.getValueType();
+ EVT SubVT = SubVec.getValueType();
+
+ // Only do this for legal fixed vector types.
+ if (!VecVT.isFixedLengthVector() ||
+ !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
+ !DCI.DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
+ return SDValue();
+
+ // Ignore widening patterns.
+ if (IdxVal == 0 && Vec.isUndef())
+ return SDValue();
+
+ // Subvector must be half the width and an "aligned" insertion.
+ unsigned NumSubElts = SubVT.getVectorNumElements();
+ if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
+ (IdxVal != 0 && IdxVal != NumSubElts))
+ return SDValue();
+
+ // Fold insert_subvector -> concat_vectors
+ // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
+ // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
+ SDLoc DL(N);
+ SDValue Lo, Hi;
+ if (IdxVal == 0) {
+ Lo = SubVec;
+ Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
+ DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
+ } else {
+ Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
+ DCI.DAG.getVectorIdxConstant(0, DL));
+ Hi = SubVec;
+ }
+ return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
+}
+
// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N,
SelectionDAG &DAG) {
@@ -14965,6 +15232,390 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
DAG.getUNDEF(VT), NewMask);
}
+/// Load/store instruction that can be merged with a base address
+/// update
+struct BaseUpdateTarget {
+ SDNode *N;
+ bool isIntrinsic;
+ bool isStore;
+ unsigned AddrOpIdx;
+};
+
+struct BaseUpdateUser {
+ /// Instruction that updates a pointer
+ SDNode *N;
+ /// Pointer increment operand
+ SDValue Inc;
+ /// Pointer increment value if it is a constant, or 0 otherwise
+ unsigned ConstInc;
+};
+
+static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
+ struct BaseUpdateUser &User,
+ bool SimpleConstIncOnly,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG = DCI.DAG;
+ SDNode *N = Target.N;
+ MemSDNode *MemN = cast<MemSDNode>(N);
+ SDLoc dl(N);
+
+ // Find the new opcode for the updating load/store.
+ bool isLoadOp = true;
+ bool isLaneOp = false;
+ // Workaround for vst1x and vld1x intrinsics which do not have alignment
+ // as an operand.
+ bool hasAlignment = true;
+ unsigned NewOpc = 0;
+ unsigned NumVecs = 0;
+ if (Target.isIntrinsic) {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default:
+ llvm_unreachable("unexpected intrinsic for Neon base update");
+ case Intrinsic::arm_neon_vld1:
+ NewOpc = ARMISD::VLD1_UPD;
+ NumVecs = 1;
+ break;
+ case Intrinsic::arm_neon_vld2:
+ NewOpc = ARMISD::VLD2_UPD;
+ NumVecs = 2;
+ break;
+ case Intrinsic::arm_neon_vld3:
+ NewOpc = ARMISD::VLD3_UPD;
+ NumVecs = 3;
+ break;
+ case Intrinsic::arm_neon_vld4:
+ NewOpc = ARMISD::VLD4_UPD;
+ NumVecs = 4;
+ break;
+ case Intrinsic::arm_neon_vld1x2:
+ NewOpc = ARMISD::VLD1x2_UPD;
+ NumVecs = 2;
+ hasAlignment = false;
+ break;
+ case Intrinsic::arm_neon_vld1x3:
+ NewOpc = ARMISD::VLD1x3_UPD;
+ NumVecs = 3;
+ hasAlignment = false;
+ break;
+ case Intrinsic::arm_neon_vld1x4:
+ NewOpc = ARMISD::VLD1x4_UPD;
+ NumVecs = 4;
+ hasAlignment = false;
+ break;
+ case Intrinsic::arm_neon_vld2dup:
+ NewOpc = ARMISD::VLD2DUP_UPD;
+ NumVecs = 2;
+ break;
+ case Intrinsic::arm_neon_vld3dup:
+ NewOpc = ARMISD::VLD3DUP_UPD;
+ NumVecs = 3;
+ break;
+ case Intrinsic::arm_neon_vld4dup:
+ NewOpc = ARMISD::VLD4DUP_UPD;
+ NumVecs = 4;
+ break;
+ case Intrinsic::arm_neon_vld2lane:
+ NewOpc = ARMISD::VLD2LN_UPD;
+ NumVecs = 2;
+ isLaneOp = true;
+ break;
+ case Intrinsic::arm_neon_vld3lane:
+ NewOpc = ARMISD::VLD3LN_UPD;
+ NumVecs = 3;
+ isLaneOp = true;
+ break;
+ case Intrinsic::arm_neon_vld4lane:
+ NewOpc = ARMISD::VLD4LN_UPD;
+ NumVecs = 4;
+ isLaneOp = true;
+ break;
+ case Intrinsic::arm_neon_vst1:
+ NewOpc = ARMISD::VST1_UPD;
+ NumVecs = 1;
+ isLoadOp = false;
+ break;
+ case Intrinsic::arm_neon_vst2:
+ NewOpc = ARMISD::VST2_UPD;
+ NumVecs = 2;
+ isLoadOp = false;
+ break;
+ case Intrinsic::arm_neon_vst3:
+ NewOpc = ARMISD::VST3_UPD;
+ NumVecs = 3;
+ isLoadOp = false;
+ break;
+ case Intrinsic::arm_neon_vst4:
+ NewOpc = ARMISD::VST4_UPD;
+ NumVecs = 4;
+ isLoadOp = false;
+ break;
+ case Intrinsic::arm_neon_vst2lane:
+ NewOpc = ARMISD::VST2LN_UPD;
+ NumVecs = 2;
+ isLoadOp = false;
+ isLaneOp = true;
+ break;
+ case Intrinsic::arm_neon_vst3lane:
+ NewOpc = ARMISD::VST3LN_UPD;
+ NumVecs = 3;
+ isLoadOp = false;
+ isLaneOp = true;
+ break;
+ case Intrinsic::arm_neon_vst4lane:
+ NewOpc = ARMISD::VST4LN_UPD;
+ NumVecs = 4;
+ isLoadOp = false;
+ isLaneOp = true;
+ break;
+ case Intrinsic::arm_neon_vst1x2:
+ NewOpc = ARMISD::VST1x2_UPD;
+ NumVecs = 2;
+ isLoadOp = false;
+ hasAlignment = false;
+ break;
+ case Intrinsic::arm_neon_vst1x3:
+ NewOpc = ARMISD::VST1x3_UPD;
+ NumVecs = 3;
+ isLoadOp = false;
+ hasAlignment = false;
+ break;
+ case Intrinsic::arm_neon_vst1x4:
+ NewOpc = ARMISD::VST1x4_UPD;
+ NumVecs = 4;
+ isLoadOp = false;
+ hasAlignment = false;
+ break;
+ }
+ } else {
+ isLaneOp = true;
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("unexpected opcode for Neon base update");
+ case ARMISD::VLD1DUP:
+ NewOpc = ARMISD::VLD1DUP_UPD;
+ NumVecs = 1;
+ break;
+ case ARMISD::VLD2DUP:
+ NewOpc = ARMISD::VLD2DUP_UPD;
+ NumVecs = 2;
+ break;
+ case ARMISD::VLD3DUP:
+ NewOpc = ARMISD::VLD3DUP_UPD;
+ NumVecs = 3;
+ break;
+ case ARMISD::VLD4DUP:
+ NewOpc = ARMISD::VLD4DUP_UPD;
+ NumVecs = 4;
+ break;
+ case ISD::LOAD:
+ NewOpc = ARMISD::VLD1_UPD;
+ NumVecs = 1;
+ isLaneOp = false;
+ break;
+ case ISD::STORE:
+ NewOpc = ARMISD::VST1_UPD;
+ NumVecs = 1;
+ isLaneOp = false;
+ isLoadOp = false;
+ break;
+ }
+ }
+
+ // Find the size of memory referenced by the load/store.
+ EVT VecTy;
+ if (isLoadOp) {
+ VecTy = N->getValueType(0);
+ } else if (Target.isIntrinsic) {
+ VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
+ } else {
+ assert(Target.isStore &&
+ "Node has to be a load, a store, or an intrinsic!");
+ VecTy = N->getOperand(1).getValueType();
+ }
+
+ bool isVLDDUPOp =
+ NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
+ NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
+
+ unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+ if (isLaneOp || isVLDDUPOp)
+ NumBytes /= VecTy.getVectorNumElements();
+
+ if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
+ // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
+ // separate instructions that make it harder to use a non-constant update.
+ return false;
+ }
+
+ if (SimpleConstIncOnly && User.ConstInc != NumBytes)
+ return false;
+
+ // OK, we found an ADD we can fold into the base update.
+ // Now, create a _UPD node, taking care of not breaking alignment.
+
+ EVT AlignedVecTy = VecTy;
+ unsigned Alignment = MemN->getAlignment();
+
+ // If this is a less-than-standard-aligned load/store, change the type to
+ // match the standard alignment.
+ // The alignment is overlooked when selecting _UPD variants; and it's
+ // easier to introduce bitcasts here than fix that.
+ // There are 3 ways to get to this base-update combine:
+ // - intrinsics: they are assumed to be properly aligned (to the standard
+ // alignment of the memory type), so we don't need to do anything.
+ // - ARMISD::VLDx nodes: they are only generated from the aforementioned
+ // intrinsics, so, likewise, there's nothing to do.
+ // - generic load/store instructions: the alignment is specified as an
+ // explicit operand, rather than implicitly as the standard alignment
+  //     of the memory type (like the intrinsics). We need to change the
+ // memory type to match the explicit alignment. That way, we don't
+ // generate non-standard-aligned ARMISD::VLDx nodes.
+ if (isa<LSBaseSDNode>(N)) {
+ if (Alignment == 0)
+ Alignment = 1;
+ if (Alignment < VecTy.getScalarSizeInBits() / 8) {
+ MVT EltTy = MVT::getIntegerVT(Alignment * 8);
+ assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
+ assert(!isLaneOp && "Unexpected generic load/store lane.");
+ unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
+ AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
+ }
+ // Don't set an explicit alignment on regular load/stores that we want
+ // to transform to VLD/VST 1_UPD nodes.
+ // This matches the behavior of regular load/stores, which only get an
+ // explicit alignment if the MMO alignment is larger than the standard
+ // alignment of the memory type.
+ // Intrinsics, however, always get an explicit alignment, set to the
+ // alignment of the MMO.
+ Alignment = 1;
+ }
+
+ // Create the new updating load/store node.
+ // First, create an SDVTList for the new updating node's results.
+ EVT Tys[6];
+ unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
+ unsigned n;
+ for (n = 0; n < NumResultVecs; ++n)
+ Tys[n] = AlignedVecTy;
+ Tys[n++] = MVT::i32;
+ Tys[n] = MVT::Other;
+ SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
+
+ // Then, gather the new node's operands.
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(N->getOperand(0)); // incoming chain
+ Ops.push_back(N->getOperand(Target.AddrOpIdx));
+ Ops.push_back(User.Inc);
+
+ if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
+ // Try to match the intrinsic's signature
+ Ops.push_back(StN->getValue());
+ } else {
+ // Loads (and of course intrinsics) match the intrinsics' signature,
+ // so just add all but the alignment operand.
+ unsigned LastOperand =
+ hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
+ for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
+ Ops.push_back(N->getOperand(i));
+ }
+
+ // For all node types, the alignment operand is always the last one.
+ Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
+
+ // If this is a non-standard-aligned STORE, the penultimate operand is the
+ // stored value. Bitcast it to the aligned type.
+ if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
+ SDValue &StVal = Ops[Ops.size() - 2];
+ StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
+ }
+
+ EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
+ SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
+ MemN->getMemOperand());
+
+ // Update the uses.
+ SmallVector<SDValue, 5> NewResults;
+ for (unsigned i = 0; i < NumResultVecs; ++i)
+ NewResults.push_back(SDValue(UpdN.getNode(), i));
+
+  // If this is a non-standard-aligned LOAD, the first result is the loaded
+ // value. Bitcast it to the expected result type.
+ if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
+ SDValue &LdVal = NewResults[0];
+ LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
+ }
+
+ NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
+ DCI.CombineTo(N, NewResults);
+ DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
+
+ return true;
+}
+
+// If (opcode ptr inc) is an ADD-like instruction, return the
+// increment value. Otherwise return 0.
+static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
+ SDValue Inc, const SelectionDAG &DAG) {
+ ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
+ if (!CInc)
+ return 0;
+
+ switch (Opcode) {
+ case ARMISD::VLD1_UPD:
+ case ISD::ADD:
+ return CInc->getZExtValue();
+ case ISD::OR: {
+ if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
+ // (OR ptr inc) is the same as (ADD ptr inc)
+ return CInc->getZExtValue();
+ }
+ return 0;
+ }
+ default:
+ return 0;
+ }
+}
+
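+// If N itself computes a pointer as a constant increment of some base value
+// (an ADD or OR with a constant operand, or a VLD1_UPD post-increment),
+// return true and set *Ptr to the base operand and *CInc to the increment.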
+static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
+ switch (N->getOpcode()) {
+ case ISD::ADD:
+ case ISD::OR: {
+ if (isa<ConstantSDNode>(N->getOperand(1))) {
+ *Ptr = N->getOperand(0);
+ *CInc = N->getOperand(1);
+ return true;
+ }
+ return false;
+ }
+ case ARMISD::VLD1_UPD: {
+ if (isa<ConstantSDNode>(N->getOperand(2))) {
+ *Ptr = N->getOperand(1);
+ *CInc = N->getOperand(2);
+ return true;
+ }
+ return false;
+ }
+ default:
+ return false;
+ }
+}
+
+static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
+ // Check that the add is independent of the load/store.
+ // Otherwise, folding it would create a cycle. Search through Addr
+ // as well, since the User may not be a direct user of Addr and
+ // only share a base pointer.
+ SmallPtrSet<const SDNode *, 32> Visited;
+ SmallVector<const SDNode *, 16> Worklist;
+ Worklist.push_back(N);
+ Worklist.push_back(User);
+ if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
+ SDNode::hasPredecessorHelper(User, Visited, Worklist))
+ return false;
+ return true;
+}
+
/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
/// NEON load/store intrinsics, and generic vector load/stores, to merge
/// base address updates.
@@ -14972,237 +15623,89 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
/// The caller is assumed to have checked legality.
static SDValue CombineBaseUpdate(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
- SelectionDAG &DAG = DCI.DAG;
const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
const bool isStore = N->getOpcode() == ISD::STORE;
const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
+ BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
+
SDValue Addr = N->getOperand(AddrOpIdx);
- MemSDNode *MemN = cast<MemSDNode>(N);
- SDLoc dl(N);
+
+ SmallVector<BaseUpdateUser, 8> BaseUpdates;
// Search for a use of the address operand that is an increment.
for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
SDNode *User = *UI;
- if (User->getOpcode() != ISD::ADD ||
- UI.getUse().getResNo() != Addr.getResNo())
- continue;
-
- // Check that the add is independent of the load/store. Otherwise, folding
- // it would create a cycle. We can avoid searching through Addr as it's a
- // predecessor to both.
- SmallPtrSet<const SDNode *, 32> Visited;
- SmallVector<const SDNode *, 16> Worklist;
- Visited.insert(Addr.getNode());
- Worklist.push_back(N);
- Worklist.push_back(User);
- if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
- SDNode::hasPredecessorHelper(User, Visited, Worklist))
+ if (UI.getUse().getResNo() != Addr.getResNo() ||
+ User->getNumOperands() != 2)
continue;
- // Find the new opcode for the updating load/store.
- bool isLoadOp = true;
- bool isLaneOp = false;
- // Workaround for vst1x and vld1x intrinsics which do not have alignment
- // as an operand.
- bool hasAlignment = true;
- unsigned NewOpc = 0;
- unsigned NumVecs = 0;
- if (isIntrinsic) {
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
- switch (IntNo) {
- default: llvm_unreachable("unexpected intrinsic for Neon base update");
- case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD;
- NumVecs = 1; break;
- case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD;
- NumVecs = 2; break;
- case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD;
- NumVecs = 3; break;
- case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
- NumVecs = 4; break;
- case Intrinsic::arm_neon_vld1x2: NewOpc = ARMISD::VLD1x2_UPD;
- NumVecs = 2; hasAlignment = false; break;
- case Intrinsic::arm_neon_vld1x3: NewOpc = ARMISD::VLD1x3_UPD;
- NumVecs = 3; hasAlignment = false; break;
- case Intrinsic::arm_neon_vld1x4: NewOpc = ARMISD::VLD1x4_UPD;
- NumVecs = 4; hasAlignment = false; break;
- case Intrinsic::arm_neon_vld2dup: NewOpc = ARMISD::VLD2DUP_UPD;
- NumVecs = 2; break;
- case Intrinsic::arm_neon_vld3dup: NewOpc = ARMISD::VLD3DUP_UPD;
- NumVecs = 3; break;
- case Intrinsic::arm_neon_vld4dup: NewOpc = ARMISD::VLD4DUP_UPD;
- NumVecs = 4; break;
- case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
- NumVecs = 2; isLaneOp = true; break;
- case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
- NumVecs = 3; isLaneOp = true; break;
- case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
- NumVecs = 4; isLaneOp = true; break;
- case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD;
- NumVecs = 1; isLoadOp = false; break;
- case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD;
- NumVecs = 2; isLoadOp = false; break;
- case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD;
- NumVecs = 3; isLoadOp = false; break;
- case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD;
- NumVecs = 4; isLoadOp = false; break;
- case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
- NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
- case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
- NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
- case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
- NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
- case Intrinsic::arm_neon_vst1x2: NewOpc = ARMISD::VST1x2_UPD;
- NumVecs = 2; isLoadOp = false; hasAlignment = false; break;
- case Intrinsic::arm_neon_vst1x3: NewOpc = ARMISD::VST1x3_UPD;
- NumVecs = 3; isLoadOp = false; hasAlignment = false; break;
- case Intrinsic::arm_neon_vst1x4: NewOpc = ARMISD::VST1x4_UPD;
- NumVecs = 4; isLoadOp = false; hasAlignment = false; break;
- }
- } else {
- isLaneOp = true;
- switch (N->getOpcode()) {
- default: llvm_unreachable("unexpected opcode for Neon base update");
- case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
- case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
- case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
- case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
- case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD;
- NumVecs = 1; isLaneOp = false; break;
- case ISD::STORE: NewOpc = ARMISD::VST1_UPD;
- NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
- }
- }
-
- // Find the size of memory referenced by the load/store.
- EVT VecTy;
- if (isLoadOp) {
- VecTy = N->getValueType(0);
- } else if (isIntrinsic) {
- VecTy = N->getOperand(AddrOpIdx+1).getValueType();
- } else {
- assert(isStore && "Node has to be a load, a store, or an intrinsic!");
- VecTy = N->getOperand(1).getValueType();
- }
-
- bool isVLDDUPOp =
- NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
- NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
-
- unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
- if (isLaneOp || isVLDDUPOp)
- NumBytes /= VecTy.getVectorNumElements();
-
- // If the increment is a constant, it must match the memory ref size.
- SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
- ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
- if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
- // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
- // separate instructions that make it harder to use a non-constant update.
- continue;
- }
+ SDValue Inc = User->getOperand(UI.getOperandNo() == 1 ? 0 : 1);
+ unsigned ConstInc =
+ getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
- // OK, we found an ADD we can fold into the base update.
- // Now, create a _UPD node, taking care of not breaking alignment.
-
- EVT AlignedVecTy = VecTy;
- unsigned Alignment = MemN->getAlignment();
-
- // If this is a less-than-standard-aligned load/store, change the type to
- // match the standard alignment.
- // The alignment is overlooked when selecting _UPD variants; and it's
- // easier to introduce bitcasts here than fix that.
- // There are 3 ways to get to this base-update combine:
- // - intrinsics: they are assumed to be properly aligned (to the standard
- // alignment of the memory type), so we don't need to do anything.
- // - ARMISD::VLDx nodes: they are only generated from the aforementioned
- // intrinsics, so, likewise, there's nothing to do.
- // - generic load/store instructions: the alignment is specified as an
- // explicit operand, rather than implicitly as the standard alignment
- // of the memory type (like the intrisics). We need to change the
- // memory type to match the explicit alignment. That way, we don't
- // generate non-standard-aligned ARMISD::VLDx nodes.
- if (isa<LSBaseSDNode>(N)) {
- if (Alignment == 0)
- Alignment = 1;
- if (Alignment < VecTy.getScalarSizeInBits() / 8) {
- MVT EltTy = MVT::getIntegerVT(Alignment * 8);
- assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
- assert(!isLaneOp && "Unexpected generic load/store lane.");
- unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
- AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
- }
- // Don't set an explicit alignment on regular load/stores that we want
- // to transform to VLD/VST 1_UPD nodes.
- // This matches the behavior of regular load/stores, which only get an
- // explicit alignment if the MMO alignment is larger than the standard
- // alignment of the memory type.
- // Intrinsics, however, always get an explicit alignment, set to the
- // alignment of the MMO.
- Alignment = 1;
- }
+ if (ConstInc || User->getOpcode() == ISD::ADD)
+ BaseUpdates.push_back({User, Inc, ConstInc});
+ }
- // Create the new updating load/store node.
- // First, create an SDVTList for the new updating node's results.
- EVT Tys[6];
- unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
- unsigned n;
- for (n = 0; n < NumResultVecs; ++n)
- Tys[n] = AlignedVecTy;
- Tys[n++] = MVT::i32;
- Tys[n] = MVT::Other;
- SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
+ // If the address is a constant pointer increment itself, find
+ // another constant increment that has the same base operand
+ SDValue Base;
+ SDValue CInc;
+ if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
+ unsigned Offset =
+ getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
+ for (SDNode::use_iterator UI = Base->use_begin(), UE = Base->use_end();
+ UI != UE; ++UI) {
- // Then, gather the new node's operands.
- SmallVector<SDValue, 8> Ops;
- Ops.push_back(N->getOperand(0)); // incoming chain
- Ops.push_back(N->getOperand(AddrOpIdx));
- Ops.push_back(Inc);
+ SDNode *User = *UI;
+ if (UI.getUse().getResNo() != Base.getResNo() || User == Addr.getNode() ||
+ User->getNumOperands() != 2)
+ continue;
- if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
- // Try to match the intrinsic's signature
- Ops.push_back(StN->getValue());
- } else {
- // Loads (and of course intrinsics) match the intrinsics' signature,
- // so just add all but the alignment operand.
- unsigned LastOperand =
- hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
- for (unsigned i = AddrOpIdx + 1; i < LastOperand; ++i)
- Ops.push_back(N->getOperand(i));
- }
+ SDValue UserInc = User->getOperand(UI.getOperandNo() == 0 ? 1 : 0);
+ unsigned UserOffset =
+ getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
- // For all node types, the alignment operand is always the last one.
- Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
+ if (!UserOffset || UserOffset <= Offset)
+ continue;
- // If this is a non-standard-aligned STORE, the penultimate operand is the
- // stored value. Bitcast it to the aligned type.
- if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
- SDValue &StVal = Ops[Ops.size()-2];
- StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
+ unsigned NewConstInc = UserOffset - Offset;
+ SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
+ BaseUpdates.push_back({User, NewInc, NewConstInc});
}
+ }
- EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
- SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
- MemN->getMemOperand());
-
- // Update the uses.
- SmallVector<SDValue, 5> NewResults;
- for (unsigned i = 0; i < NumResultVecs; ++i)
- NewResults.push_back(SDValue(UpdN.getNode(), i));
-
- // If this is an non-standard-aligned LOAD, the first result is the loaded
- // value. Bitcast it to the expected result type.
- if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
- SDValue &LdVal = NewResults[0];
- LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
+ // Try to fold the load/store with an update that matches memory
+ // access size. This should work well for sequential loads.
+ //
+ // Filter out invalid updates as well.
+ unsigned NumValidUpd = BaseUpdates.size();
+ for (unsigned I = 0; I < NumValidUpd;) {
+ BaseUpdateUser &User = BaseUpdates[I];
+ if (!isValidBaseUpdate(N, User.N)) {
+ --NumValidUpd;
+ std::swap(BaseUpdates[I], BaseUpdates[NumValidUpd]);
+ continue;
}
- NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
- DCI.CombineTo(N, NewResults);
- DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
-
- break;
+ if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
+ return SDValue();
+ ++I;
+ }
+ BaseUpdates.resize(NumValidUpd);
+
+ // Try to fold with other users. Non-constant updates are considered
+ // first, and constant updates are sorted to not break a sequence of
+ // strided accesses (if there is any).
+ std::sort(BaseUpdates.begin(), BaseUpdates.end(),
+ [](BaseUpdateUser &LHS, BaseUpdateUser &RHS) {
+ return LHS.ConstInc < RHS.ConstInc;
+ });
+ for (BaseUpdateUser &User : BaseUpdates) {
+ if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
+ return SDValue();
}
return SDValue();
}
@@ -15502,11 +16005,12 @@ static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG,
}
static SDValue PerformLOADCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
EVT VT = N->getValueType(0);
// If this is a legal vector load, try to combine it into a VLD1_UPD.
- if (ISD::isNormalLoad(N) && VT.isVector() &&
+ if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
return CombineBaseUpdate(N, DCI);
@@ -15976,6 +16480,15 @@ static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
SDValue N0 = N->getOperand(0);
SDLoc dl(N);
+ // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
+ if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
+ (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
+ N0.getValueType() == MVT::v16i8)) {
+ SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
+ SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
+ return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
+ }
+
// We are looking for something that will have illegal types if left alone,
// but that we can convert to a single instruction under MVE. For example
// vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
@@ -16124,38 +16637,8 @@ static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
SDValue(Node.getNode(), 1));
};
- if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
- return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
- if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
- return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
- if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
- return Create64bitNode(ARMISD::VADDLVs, {A});
- if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
- return Create64bitNode(ARMISD::VADDLVu, {A});
- if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
- return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
- DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
- if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
- return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
- DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
-
- SDValue Mask;
- if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
- return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
- if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
- return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
- if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
- return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
- if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
- return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
- if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
- return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
- DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
- if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
- return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
- DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
-
SDValue A, B;
+ SDValue Mask;
if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
@@ -16192,6 +16675,36 @@ static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
+ if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
+ return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
+ if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
+ return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
+ if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
+ return Create64bitNode(ARMISD::VADDLVs, {A});
+ if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
+ return Create64bitNode(ARMISD::VADDLVu, {A});
+ if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
+ return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
+ DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
+ if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
+ return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
+ DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
+
+ if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
+ return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
+ if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
+ return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
+ if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
+ return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
+ if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
+ return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
+ if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
+ return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
+ DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
+ if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
+ return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
+ DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
+
// Some complications. We can get a case where the two inputs of the mul are
// the same, then the output sext will have been helpfully converted to a
// zext. Turn it back.
@@ -16978,7 +17491,7 @@ static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
if (!Const)
return SDValue();
- if (Const->isNullValue())
+ if (Const->isZero())
Imm = 0;
else if (Const->isOne())
Imm = 1;
@@ -17030,7 +17543,7 @@ static SDValue PerformHWLoopCombine(SDNode *N,
Cond = N->getOperand(2);
Dest = N->getOperand(4);
if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
- if (!Const->isOne() && !Const->isNullValue())
+ if (!Const->isOne() && !Const->isZero())
return SDValue();
Imm = Const->getZExtValue();
} else
@@ -17685,6 +18198,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::EXTRACT_VECTOR_ELT:
return PerformExtractEltCombine(N, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return PerformSignExtendInregCombine(N, DCI.DAG);
+ case ISD::INSERT_SUBVECTOR: return PerformInsertSubvectorCombine(N, DCI);
case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
@@ -17710,9 +18224,12 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SMAX:
case ISD::UMAX:
return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
- case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
- case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
- case ISD::LOAD: return PerformLOADCombine(N, DCI);
+ case ARMISD::CMOV:
+ return PerformCMOVCombine(N, DCI.DAG);
+ case ARMISD::BRCOND:
+ return PerformBRCONDCombine(N, DCI.DAG);
+ case ISD::LOAD:
+ return PerformLOADCombine(N, DCI, Subtarget);
case ARMISD::VLD1DUP:
case ARMISD::VLD2DUP:
case ARMISD::VLD3DUP:
@@ -17929,7 +18446,7 @@ EVT ARMTargetLowering::getOptimalMemOpType(
const MemOp &Op, const AttributeList &FuncAttributes) const {
// See if we can use NEON instructions for this...
if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
- !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
+ !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
bool Fast;
if (Op.size() >= 16 &&
(Op.isAligned(Align(16)) ||
@@ -18086,18 +18603,27 @@ bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
switch (II->getIntrinsicID()) {
case Intrinsic::fma:
return !IsFMS(I);
+ case Intrinsic::sadd_sat:
+ case Intrinsic::uadd_sat:
case Intrinsic::arm_mve_add_predicated:
case Intrinsic::arm_mve_mul_predicated:
case Intrinsic::arm_mve_qadd_predicated:
+ case Intrinsic::arm_mve_vhadd:
case Intrinsic::arm_mve_hadd_predicated:
+ case Intrinsic::arm_mve_vqdmull:
case Intrinsic::arm_mve_vqdmull_predicated:
+ case Intrinsic::arm_mve_vqdmulh:
case Intrinsic::arm_mve_qdmulh_predicated:
+ case Intrinsic::arm_mve_vqrdmulh:
case Intrinsic::arm_mve_qrdmulh_predicated:
case Intrinsic::arm_mve_fma_predicated:
return true;
+ case Intrinsic::ssub_sat:
+ case Intrinsic::usub_sat:
case Intrinsic::arm_mve_sub_predicated:
case Intrinsic::arm_mve_qsub_predicated:
case Intrinsic::arm_mve_hsub_predicated:
+ case Intrinsic::arm_mve_vhsub:
return Operand == 1;
default:
return false;
@@ -18508,6 +19034,31 @@ bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
return AbsImm >= 0 && AbsImm <= 255;
}
+// Return false to prevent folding
+// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
+// if the folding leads to worse code.
+bool ARMTargetLowering::isMulAddWithConstProfitable(
+ const SDValue &AddNode, const SDValue &ConstNode) const {
+ // Let the DAGCombiner decide for vector types and large types.
+ const EVT VT = AddNode.getValueType();
+ if (VT.isVector() || VT.getScalarSizeInBits() > 32)
+ return true;
+
+  // It is worse if c0 is a legal add immediate while c1*c0 is not,
+  // and c1*c0 has to be materialized with at least two instructions.
+ const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
+ const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
+ const int64_t C0 = C0Node->getSExtValue();
+ APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
+ if (!isLegalAddImmediate(C0) || isLegalAddImmediate(CA.getSExtValue()))
+ return true;
+ if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
+ return false;
+
+ // Default to true and let the DAGCombiner decide.
+ return true;
+}
+
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
bool isSEXTLoad, SDValue &Base,
SDValue &Offset, bool &isInc,
@@ -19015,8 +19566,8 @@ bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
isa<ConstantSDNode>(Op->getOperand(2))) {
unsigned ShAmt = Op->getConstantOperandVal(2);
- if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(
- APInt::getAllOnesValue(32) << (32 - ShAmt)))
+ if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
+ << (32 - ShAmt)))
return TLO.CombineTo(
Op, TLO.DAG.getNode(
ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
@@ -19760,7 +20311,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
+ Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
// volatile loads with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOLoad;
@@ -19774,7 +20325,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
- Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
+ Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
Info.offset = 0;
Info.align.reset();
// volatile loads with NEON intrinsics not supported
@@ -19792,7 +20343,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
// Conservatively set memVT to the entire set of vectors stored.
auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
unsigned NumElts = 0;
- for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+ for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
Type *ArgTy = I.getArgOperand(ArgI)->getType();
if (!ArgTy->isVectorTy())
break;
@@ -19801,7 +20352,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
+ Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
// volatile stores with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOStore;
@@ -19814,7 +20365,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
// Conservatively set memVT to the entire set of vectors stored.
auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
unsigned NumElts = 0;
- for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+ for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
Type *ArgTy = I.getArgOperand(ArgI)->getType();
if (!ArgTy->isVectorTy())
break;
@@ -20128,10 +20679,7 @@ bool ARMTargetLowering::shouldInsertFencesForAtomic(
return InsertFencesForAtomic;
}
-// This has so far only been implemented for MachO.
-bool ARMTargetLowering::useLoadStackGuardNode() const {
- return Subtarget->isTargetMachO();
-}
+bool ARMTargetLowering::useLoadStackGuardNode() const { return true; }
void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
@@ -20146,7 +20694,7 @@ void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
"__security_check_cookie", Type::getVoidTy(M.getContext()),
Type::getInt8PtrTy(M.getContext()));
if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
- F->addAttribute(1, Attribute::AttrKind::InReg);
+ F->addParamAttr(0, Attribute::AttrKind::InReg);
}
Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
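The index change in the last hunk reflects the attribute API: the legacy addAttribute(1, ...) addressed the first parameter as index 1, while addParamAttr takes a zero-based parameter index. A small standalone sketch of the new call, with an assumed setup that is not taken from the patch:

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"

    int main() {
      llvm::LLVMContext Ctx;
      llvm::Module M("m", Ctx);
      auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(Ctx),
                                         {llvm::Type::getInt8PtrTy(Ctx)},
                                         /*isVarArg=*/false);
      auto *F = llvm::Function::Create(FT, llvm::Function::ExternalLinkage,
                                       "__security_check_cookie", M);
      // Zero-based parameter index: tag the first (and only) parameter inreg.
      F->addParamAttr(0, llvm::Attribute::InReg);
      return F->hasParamAttribute(0, llvm::Attribute::InReg) ? 0 : 1;
    }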
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 844b7d4f1707..0fddd58e178e 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -680,7 +680,7 @@ class VectorType;
unsigned &Cost) const override;
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
- const SelectionDAG &DAG) const override {
+ const MachineFunction &MF) const override {
// Do not merge to larger than i32.
return (MemVT.getSizeInBits() <= 32);
}
@@ -712,6 +712,9 @@ class VectorType;
Align Alignment,
const DataLayout &DL) const;
+ bool isMulAddWithConstProfitable(const SDValue &AddNode,
+ const SDValue &ConstNode) const override;
+
bool alignLoopsWithOptSize() const override;
/// Returns the number of interleaved accesses that will be generated when
diff --git a/llvm/lib/Target/ARM/ARMInstrCDE.td b/llvm/lib/Target/ARM/ARMInstrCDE.td
index 0e97668e2e01..54e27a6be558 100644
--- a/llvm/lib/Target/ARM/ARMInstrCDE.td
+++ b/llvm/lib/Target/ARM/ARMInstrCDE.td
@@ -612,14 +612,14 @@ multiclass VCXPredicatedPat_m<MVEVectorVTInfo VTI> {
(VTI.Vec MQPR:$inactive), timm:$imm,
(VTI.Pred VCCR:$pred))),
(VTI.Vec (CDE_VCX1_vec p_imm:$coproc, imm_12b:$imm, ARMVCCThen,
- (VTI.Pred VCCR:$pred),
+ (VTI.Pred VCCR:$pred), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
def : Pat<(VTI.Vec (int_arm_cde_vcx1qa_predicated timm:$coproc,
(VTI.Vec MQPR:$acc), timm:$imm,
(VTI.Pred VCCR:$pred))),
(VTI.Vec (CDE_VCX1A_vec p_imm:$coproc, (VTI.Vec MQPR:$acc),
imm_12b:$imm, ARMVCCThen,
- (VTI.Pred VCCR:$pred)))>;
+ (VTI.Pred VCCR:$pred), zero_reg))>;
def : Pat<(VTI.Vec (int_arm_cde_vcx2q_predicated timm:$coproc,
(VTI.Vec MQPR:$inactive),
@@ -627,7 +627,7 @@ multiclass VCXPredicatedPat_m<MVEVectorVTInfo VTI> {
(VTI.Pred VCCR:$pred))),
(VTI.Vec (CDE_VCX2_vec p_imm:$coproc, (v16i8 MQPR:$n),
imm_7b:$imm, ARMVCCThen,
- (VTI.Pred VCCR:$pred),
+ (VTI.Pred VCCR:$pred), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
def : Pat<(VTI.Vec (int_arm_cde_vcx2qa_predicated timm:$coproc,
(VTI.Vec MQPR:$acc),
@@ -635,7 +635,7 @@ multiclass VCXPredicatedPat_m<MVEVectorVTInfo VTI> {
(VTI.Pred VCCR:$pred))),
(VTI.Vec (CDE_VCX2A_vec p_imm:$coproc, (VTI.Vec MQPR:$acc),
(v16i8 MQPR:$n), timm:$imm, ARMVCCThen,
- (VTI.Pred VCCR:$pred)))>;
+ (VTI.Pred VCCR:$pred), zero_reg))>;
def : Pat<(VTI.Vec (int_arm_cde_vcx3q_predicated timm:$coproc,
(VTI.Vec MQPR:$inactive),
@@ -645,7 +645,7 @@ multiclass VCXPredicatedPat_m<MVEVectorVTInfo VTI> {
(VTI.Vec (CDE_VCX3_vec p_imm:$coproc, (v16i8 MQPR:$n),
(v16i8 MQPR:$m),
imm_4b:$imm, ARMVCCThen,
- (VTI.Pred VCCR:$pred),
+ (VTI.Pred VCCR:$pred), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
def : Pat<(VTI.Vec (int_arm_cde_vcx3qa_predicated timm:$coproc,
(VTI.Vec MQPR:$acc),
@@ -654,7 +654,7 @@ multiclass VCXPredicatedPat_m<MVEVectorVTInfo VTI> {
(VTI.Vec (CDE_VCX3A_vec p_imm:$coproc, (VTI.Vec MQPR:$acc),
(v16i8 MQPR:$n), (v16i8 MQPR:$m),
imm_4b:$imm, ARMVCCThen,
- (VTI.Pred VCCR:$pred)))>;
+ (VTI.Pred VCCR:$pred), zero_reg))>;
}
let Predicates = [HasCDE, HasMVEInt] in
diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td
index 85da7c5a535e..de351372abf2 100644
--- a/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -249,10 +249,10 @@ def VPTPredROperand : AsmOperandClass {
// Base class for both kinds of vpred.
class vpred_ops<dag extra_op, dag extra_mi> : OperandWithDefaultOps<OtherVT,
- !con((ops (i32 0), (i32 zero_reg)), extra_op)> {
+ !con((ops (i32 0), (i32 zero_reg), (i32 zero_reg)), extra_op)> {
let PrintMethod = "printVPTPredicateOperand";
let OperandNamespace = "ARM";
- let MIOperandInfo = !con((ops i32imm:$cond, VCCR:$cond_reg), extra_mi);
+ let MIOperandInfo = !con((ops i32imm:$cond, VCCR:$cond_reg, GPRlr:$tp_reg), extra_mi);
// For convenience, we provide a string value that can be appended
// to the constraints string. It's empty for vpred_n, and for
@@ -408,6 +408,7 @@ class InstTemplate<AddrMode am, int sz, IndexMode im,
// in an IT block).
bit thumbArithFlagSetting = 0;
+ bits<2> VecSize = 0;
bit validForTailPredication = 0;
bit retainsPreviousHalfElement = 0;
bit horizontalReduction = 0;
@@ -428,6 +429,7 @@ class InstTemplate<AddrMode am, int sz, IndexMode im,
let TSFlags{21} = retainsPreviousHalfElement;
let TSFlags{22} = horizontalReduction;
let TSFlags{23} = doubleWidthResult;
+ let TSFlags{25-24} = VecSize;
let Constraints = cstr;
let Itinerary = itin;
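The new two-bit VecSize field is packed into TSFlags{25-24}, alongside the existing flag bits. A C++-side sketch of reading such a field back out of TSFlags; the bit positions come from the patch, but the accessor itself is a hypothetical illustration:

    #include <cassert>
    #include <cstdint>

    // Hypothetical accessor for a 2-bit VecSize field stored at TSFlags{25-24}.
    static unsigned getVecSize(uint64_t TSFlags) { return (TSFlags >> 24) & 0x3; }

    int main() {
      uint64_t Flags = 0;
      Flags |= uint64_t(0b10) << 24; // e.g. a 32-bit element-size encoding
      assert(getVecSize(Flags) == 0b10);
      return 0;
    }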
@@ -1385,8 +1387,8 @@ class ThumbXI<dag oops, dag iops, AddrMode am, int sz,
}
class T2I<dag oops, dag iops, InstrItinClass itin,
- string opc, string asm, list<dag> pattern>
- : Thumb2I<oops, iops, AddrModeNone, 4, itin, opc, asm, "", pattern>;
+ string opc, string asm, list<dag> pattern, AddrMode am = AddrModeNone>
+ : Thumb2I<oops, iops, am, 4, itin, opc, asm, "", pattern>;
class T2Ii12<dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: Thumb2I<oops, iops, AddrModeT2_i12, 4, itin, opc, asm, "",pattern>;
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/llvm/lib/Target/ARM/ARMInstrInfo.cpp
index 3c6c6960b80f..5dee5e04af81 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.cpp
@@ -95,8 +95,17 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const {
MachineFunction &MF = *MI->getParent()->getParent();
const ARMSubtarget &Subtarget = MF.getSubtarget<ARMSubtarget>();
const TargetMachine &TM = MF.getTarget();
+ Module &M = *MF.getFunction().getParent();
- if (!Subtarget.useMovt()) {
+ if (M.getStackProtectorGuard() == "tls") {
+ expandLoadStackGuardBase(MI, ARM::MRC, ARM::LDRi12);
+ return;
+ }
+
+ const GlobalValue *GV =
+ cast<GlobalValue>((*MI->memoperands_begin())->getValue());
+
+ if (!Subtarget.useMovt() || Subtarget.isGVInGOT(GV)) {
if (TM.isPositionIndependent())
expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_pcrel, ARM::LDRi12);
else
@@ -109,9 +118,6 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const {
return;
}
- const GlobalValue *GV =
- cast<GlobalValue>((*MI->memoperands_begin())->getValue());
-
if (!Subtarget.isGVIndirectSymbol(GV)) {
expandLoadStackGuardBase(MI, ARM::MOV_ga_pcrel, ARM::LDRi12);
return;
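The new early branch keys off the module-level stack-protector-guard setting. A minimal sketch of how a front end or tool might request the TLS-based guard this branch handles; the setup below is an assumed illustration, not part of this patch:

    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"

    int main() {
      llvm::LLVMContext Ctx;
      llvm::Module M("m", Ctx);
      // Ask for the thread-pointer-relative guard; expandLoadStackGuard then
      // emits an MRC read of the thread register followed by an LDR.
      M.setStackProtectorGuard("tls");
      M.setStackProtectorGuardOffset(0);
      return M.getStackProtectorGuard() == "tls" ? 0 : 1;
    }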
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 7466cecb9b33..7d0bc756e882 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -1252,7 +1252,7 @@ def addrmode5_pre : AddrMode5 {
// addrmode5fp16 := reg +/- imm8*2
//
def AddrMode5FP16AsmOperand : AsmOperandClass { let Name = "AddrMode5FP16"; }
-class AddrMode5FP16 : Operand<i32>,
+class AddrMode5FP16 : MemOperand,
ComplexPattern<i32, 2, "SelectAddrMode5FP16", []> {
let EncoderMethod = "getAddrMode5FP16OpValue";
let DecoderMethod = "DecodeAddrMode5FP16Operand";
@@ -1589,7 +1589,7 @@ multiclass AsI1_bin_irs<bits<4> opcod, string opc,
let TwoOperandAliasConstraint = "$Rn = $Rd" in
multiclass AsI1_rbin_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- SDNode opnode, bit Commutable = 0> {
+ SDNode opnode> {
// The register-immediate version is re-materializable. This is useful
// in particular for taking the address of a local.
let isReMaterializable = 1 in {
@@ -1693,9 +1693,8 @@ multiclass AsI1_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
/// AsI1_rbin_s_is - Same as AsI1_bin_s_irs, except selection DAG
/// operands are reversed.
let hasPostISelHook = 1, Defs = [CPSR] in {
-multiclass AsI1_rbin_s_is<InstrItinClass iii, InstrItinClass iir,
- InstrItinClass iis, SDNode opnode,
- bit Commutable = 0> {
+multiclass AsI1_rbin_s_is<InstrItinClass iii,
+ InstrItinClass iis, SDNode opnode> {
def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p),
4, iii,
[(set GPR:$Rd, CPSR, (opnode mod_imm:$imm, GPR:$Rn))]>,
@@ -3853,7 +3852,7 @@ defm RSB : AsI1_rbin_irs<0b0011, "rsb",
// FIXME: Eliminate them if we can write def : Pat patterns which defines
// CPSR and the implicit def of CPSR is not needed.
-defm RSBS : AsI1_rbin_s_is<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMsubc>;
+defm RSBS : AsI1_rbin_s_is<IIC_iALUi, IIC_iALUsr, ARMsubc>;
defm RSC : AI1_rsc_irs<0b0111, "rsc", ARMsube>;
@@ -5391,14 +5390,16 @@ def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
}
class ACI<dag oops, dag iops, string opc, string asm,
- list<dag> pattern, IndexMode im = IndexModeNone>
- : I<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
+ list<dag> pattern, IndexMode im = IndexModeNone,
+ AddrMode am = AddrModeNone>
+ : I<oops, iops, am, 4, im, BrFrm, NoItinerary,
opc, asm, "", pattern> {
let Inst{27-25} = 0b110;
}
class ACInoP<dag oops, dag iops, string opc, string asm,
- list<dag> pattern, IndexMode im = IndexModeNone>
- : InoP<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
+ list<dag> pattern, IndexMode im = IndexModeNone,
+ AddrMode am = AddrModeNone>
+ : InoP<oops, iops, am, 4, im, BrFrm, NoItinerary,
opc, asm, "", pattern> {
let Inst{31-28} = 0b1111;
let Inst{27-25} = 0b110;
@@ -5407,7 +5408,8 @@ class ACInoP<dag oops, dag iops, string opc, string asm,
let DecoderNamespace = "CoProc" in {
multiclass LdStCop<bit load, bit Dbit, string asm, list<dag> pattern> {
def _OFFSET : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
- asm, "\t$cop, $CRd, $addr", pattern> {
+ asm, "\t$cop, $CRd, $addr", pattern, IndexModeNone,
+ AddrMode5> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
@@ -5478,7 +5480,8 @@ multiclass LdStCop<bit load, bit Dbit, string asm, list<dag> pattern> {
}
multiclass LdSt2Cop<bit load, bit Dbit, string asm, list<dag> pattern> {
def _OFFSET : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
- asm, "\t$cop, $CRd, $addr", pattern> {
+ asm, "\t$cop, $CRd, $addr", pattern, IndexModeNone,
+ AddrMode5> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 372893814092..697730037277 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -97,7 +97,7 @@ def VecList2QAsmOperand : AsmOperandClass {
"q-registers in range [q0,q7]";
}
-def VecList2Q : RegisterOperand<QQPR, "printMVEVectorListTwoQ"> {
+def VecList2Q : RegisterOperand<MQQPR, "printMVEVectorListTwoQ"> {
let ParserMatchClass = VecList2QAsmOperand;
let PrintMethod = "printMVEVectorList<2>";
}
@@ -110,7 +110,7 @@ def VecList4QAsmOperand : AsmOperandClass {
"q-registers in range [q0,q7]";
}
-def VecList4Q : RegisterOperand<QQQQPR, "printMVEVectorListFourQ"> {
+def VecList4Q : RegisterOperand<MQQQQPR, "printMVEVectorListFourQ"> {
let ParserMatchClass = VecList4QAsmOperand;
let PrintMethod = "printMVEVectorList<4>";
}
@@ -332,7 +332,7 @@ multiclass MVE_TwoOpPattern<MVEVectorVTInfo VTI, SDPatternOperator Op, Intrinsic
(VTI.Vec MQPR:$Qn))),
(VTI.Vec MQPR:$inactive))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
// Optionally with the select folded through the op
@@ -341,7 +341,7 @@ multiclass MVE_TwoOpPattern<MVEVectorVTInfo VTI, SDPatternOperator Op, Intrinsic
(VTI.Vec MQPR:$Qn),
(VTI.Vec IdentityVec))))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
(VTI.Vec MQPR:$Qm)))>;
}
@@ -350,7 +350,7 @@ multiclass MVE_TwoOpPattern<MVEVectorVTInfo VTI, SDPatternOperator Op, Intrinsic
PredOperands,
(? (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive)))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
}
@@ -368,7 +368,7 @@ multiclass MVE_TwoOpPatternDup<MVEVectorVTInfo VTI, SDPatternOperator Op, Intrin
(VTI.Vec (ARMvdup rGPR:$Rn)))),
(VTI.Vec MQPR:$inactive))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn,
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
// Optionally with the select folded through the op
@@ -377,7 +377,7 @@ multiclass MVE_TwoOpPatternDup<MVEVectorVTInfo VTI, SDPatternOperator Op, Intrin
(ARMvdup rGPR:$Rn),
(VTI.Vec IdentityVec))))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn,
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
(VTI.Vec MQPR:$Qm)))>;
}
@@ -386,19 +386,20 @@ multiclass MVE_TwoOpPatternDup<MVEVectorVTInfo VTI, SDPatternOperator Op, Intrin
PredOperands,
(? (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive)))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn,
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
}
// --------- Start of base classes for the instructions themselves
class MVE_MI<dag oops, dag iops, InstrItinClass itin, string asm,
- string ops, string cstr, list<dag> pattern>
+ string ops, string cstr, bits<2> vecsize, list<dag> pattern>
: Thumb2XI<oops, iops, AddrModeNone, 4, itin, !strconcat(asm, "\t", ops), cstr,
pattern>,
Requires<[HasMVEInt]> {
let D = MVEDomain;
let DecoderNamespace = "MVE";
+ let VecSize = vecsize;
}
// MVE_p is used for most predicated instructions, to add the cluster
@@ -406,22 +407,22 @@ class MVE_MI<dag oops, dag iops, InstrItinClass itin, string asm,
// the input predicate register.
class MVE_p<dag oops, dag iops, InstrItinClass itin, string iname,
string suffix, string ops, vpred_ops vpred, string cstr,
- list<dag> pattern=[]>
+ bits<2> vecsize, list<dag> pattern=[]>
: MVE_MI<oops, !con(iops, (ins vpred:$vp)), itin,
// If the instruction has a suffix, like vadd.f32, then the
// VPT predication suffix goes before the dot, so the full
// name has to be "vadd${vp}.f32".
!strconcat(iname, "${vp}",
!if(!eq(suffix, ""), "", !strconcat(".", suffix))),
- ops, !strconcat(cstr, vpred.vpred_constraint), pattern> {
+ ops, !strconcat(cstr, vpred.vpred_constraint), vecsize, pattern> {
let Inst{31-29} = 0b111;
let Inst{27-26} = 0b11;
}
class MVE_f<dag oops, dag iops, InstrItinClass itin, string iname,
string suffix, string ops, vpred_ops vpred, string cstr,
- list<dag> pattern=[]>
- : MVE_p<oops, iops, itin, iname, suffix, ops, vpred, cstr, pattern> {
+ bits<2> vecsize, list<dag> pattern=[]>
+ : MVE_p<oops, iops, itin, iname, suffix, ops, vpred, cstr, vecsize, pattern> {
let Predicates = [HasMVEFloat];
}
@@ -599,11 +600,11 @@ def MVE_URSHRL : MVE_ScalarShiftDRegImm<"urshrl", 0b01, 0b1>;
class MVE_rDest<dag oops, dag iops, InstrItinClass itin,
string iname, string suffix,
- string ops, string cstr, list<dag> pattern=[]>
+ string ops, string cstr, bits<2> vecsize, list<dag> pattern=[]>
// Always use vpred_n and not vpred_r: with the output register being
// a GPR and not a vector register, there can't be any question of
// what to put in its inactive lanes.
- : MVE_p<oops, iops, itin, iname, suffix, ops, vpred_n, cstr, pattern> {
+ : MVE_p<oops, iops, itin, iname, suffix, ops, vpred_n, cstr, vecsize, pattern> {
let Inst{25-23} = 0b101;
let Inst{11-9} = 0b111;
@@ -613,7 +614,7 @@ class MVE_rDest<dag oops, dag iops, InstrItinClass itin,
class MVE_VABAV<string suffix, bit U, bits<2> size>
: MVE_rDest<(outs rGPR:$Rda), (ins rGPR:$Rda_src, MQPR:$Qn, MQPR:$Qm),
NoItinerary, "vabav", suffix, "$Rda, $Qn, $Qm", "$Rda = $Rda_src",
- []> {
+ size, []> {
bits<4> Qm;
bits<4> Qn;
bits<4> Rda;
@@ -652,7 +653,7 @@ multiclass MVE_VABAV_m<MVEVectorVTInfo VTI> {
(VTI.Pred VCCR:$mask))),
(i32 (Inst (i32 rGPR:$Rda_src),
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
- ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg))>;
}
}
@@ -666,7 +667,7 @@ defm MVE_VABAVu32 : MVE_VABAV_m<MVE_v4u32>;
class MVE_VADDV<string iname, string suffix, dag iops, string cstr,
bit A, bit U, bits<2> size, list<dag> pattern=[]>
: MVE_rDest<(outs tGPREven:$Rda), iops, NoItinerary,
- iname, suffix, "$Rda, $Qm", cstr, pattern> {
+ iname, suffix, "$Rda, $Qm", cstr, size, pattern> {
bits<3> Qm;
bits<4> Rda;
@@ -710,11 +711,11 @@ multiclass MVE_VADDV_A<MVEVectorVTInfo VTI> {
def : Pat<(i32 (vecreduce_add (VTI.Vec (vselect (VTI.Pred VCCR:$pred),
(VTI.Vec MQPR:$vec),
(VTI.Vec ARMimmAllZerosV))))),
- (i32 (InstN $vec, ARMVCCThen, $pred))>;
+ (i32 (InstN $vec, ARMVCCThen, $pred, zero_reg))>;
def : Pat<(i32 (ARMVADDVu (VTI.Vec MQPR:$vec))),
(i32 (InstN $vec))>;
def : Pat<(i32 (ARMVADDVpu (VTI.Vec MQPR:$vec), (VTI.Pred VCCR:$pred))),
- (i32 (InstN $vec, ARMVCCThen, $pred))>;
+ (i32 (InstN $vec, ARMVCCThen, $pred, zero_reg))>;
def : Pat<(i32 (add (i32 (vecreduce_add (VTI.Vec MQPR:$vec))),
(i32 tGPREven:$acc))),
(i32 (InstA $acc, $vec))>;
@@ -722,13 +723,13 @@ multiclass MVE_VADDV_A<MVEVectorVTInfo VTI> {
(VTI.Vec MQPR:$vec),
(VTI.Vec ARMimmAllZerosV))))),
(i32 tGPREven:$acc))),
- (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>;
+ (i32 (InstA $acc, $vec, ARMVCCThen, $pred, zero_reg))>;
def : Pat<(i32 (add (i32 (ARMVADDVu (VTI.Vec MQPR:$vec))),
(i32 tGPREven:$acc))),
(i32 (InstA $acc, $vec))>;
def : Pat<(i32 (add (i32 (ARMVADDVpu (VTI.Vec MQPR:$vec), (VTI.Pred VCCR:$pred))),
(i32 tGPREven:$acc))),
- (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>;
+ (i32 (InstA $acc, $vec, ARMVCCThen, $pred, zero_reg))>;
} else {
def : Pat<(i32 (ARMVADDVs (VTI.Vec MQPR:$vec))),
(i32 (InstN $vec))>;
@@ -736,21 +737,21 @@ multiclass MVE_VADDV_A<MVEVectorVTInfo VTI> {
(i32 tGPREven:$acc))),
(i32 (InstA $acc, $vec))>;
def : Pat<(i32 (ARMVADDVps (VTI.Vec MQPR:$vec), (VTI.Pred VCCR:$pred))),
- (i32 (InstN $vec, ARMVCCThen, $pred))>;
+ (i32 (InstN $vec, ARMVCCThen, $pred, zero_reg))>;
def : Pat<(i32 (add (i32 (ARMVADDVps (VTI.Vec MQPR:$vec), (VTI.Pred VCCR:$pred))),
(i32 tGPREven:$acc))),
- (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>;
+ (i32 (InstA $acc, $vec, ARMVCCThen, $pred, zero_reg))>;
}
def : Pat<(i32 (int_arm_mve_addv_predicated (VTI.Vec MQPR:$vec),
(i32 VTI.Unsigned),
(VTI.Pred VCCR:$pred))),
- (i32 (InstN $vec, ARMVCCThen, $pred))>;
+ (i32 (InstN $vec, ARMVCCThen, $pred, zero_reg))>;
def : Pat<(i32 (add (int_arm_mve_addv_predicated (VTI.Vec MQPR:$vec),
(i32 VTI.Unsigned),
(VTI.Pred VCCR:$pred)),
(i32 tGPREven:$acc))),
- (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>;
+ (i32 (InstA $acc, $vec, ARMVCCThen, $pred, zero_reg))>;
}
}
@@ -764,7 +765,7 @@ defm MVE_VADDVu32 : MVE_VADDV_A<MVE_v4u32>;
class MVE_VADDLV<string iname, string suffix, dag iops, string cstr,
bit A, bit U, list<dag> pattern=[]>
: MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname,
- suffix, "$RdaLo, $RdaHi, $Qm", cstr, pattern> {
+ suffix, "$RdaLo, $RdaHi, $Qm", cstr, 0b10, pattern> {
bits<3> Qm;
bits<4> RdaLo;
bits<4> RdaHi;
@@ -821,11 +822,11 @@ multiclass MVE_VADDLV_A<MVEVectorVTInfo VTI> {
def : Pat<(ARMVADDLVA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec)),
(InstA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec))>;
def : Pat<(ARMVADDLVp (v4i32 MQPR:$vec), (VTI.Pred VCCR:$pred)),
- (InstN (v4i32 MQPR:$vec), ARMVCCThen, (VTI.Pred VCCR:$pred))>;
+ (InstN (v4i32 MQPR:$vec), ARMVCCThen, (VTI.Pred VCCR:$pred), zero_reg)>;
def : Pat<(ARMVADDLVAp tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec),
(VTI.Pred VCCR:$pred)),
(InstA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec),
- ARMVCCThen, (VTI.Pred VCCR:$pred))>;
+ ARMVCCThen, (VTI.Pred VCCR:$pred), zero_reg)>;
}
}
@@ -836,7 +837,7 @@ class MVE_VMINMAXNMV<string iname, string suffix, bit sz,
bit bit_17, bit bit_7, list<dag> pattern=[]>
: MVE_rDest<(outs rGPR:$RdaDest), (ins rGPR:$RdaSrc, MQPR:$Qm),
NoItinerary, iname, suffix, "$RdaSrc, $Qm",
- "$RdaDest = $RdaSrc", pattern> {
+ "$RdaDest = $RdaSrc", !if(sz, 0b01, 0b10), pattern> {
bits<3> Qm;
bits<4> RdaDest;
@@ -876,7 +877,7 @@ multiclass MVE_VMINMAXNMV_p<string iname, bit notAbs, bit isMin,
(VTI.Pred VCCR:$pred))),
(COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS ScalarReg:$prev, rGPR),
(VTI.Vec MQPR:$vec),
- ARMVCCThen, (VTI.Pred VCCR:$pred)),
+ ARMVCCThen, (VTI.Pred VCCR:$pred), zero_reg),
ScalarReg)>;
}
}
@@ -897,7 +898,7 @@ defm MVE_VMAXNMAV: MVE_VMINMAXNMV_fty<"vmaxnmav", 0, 0, "int_arm_mve_maxnmav">;
class MVE_VMINMAXV<string iname, string suffix, bit U, bits<2> size,
bit bit_17, bit bit_7, list<dag> pattern=[]>
: MVE_rDest<(outs rGPR:$RdaDest), (ins rGPR:$RdaSrc, MQPR:$Qm), NoItinerary,
- iname, suffix, "$RdaSrc, $Qm", "$RdaDest = $RdaSrc", pattern> {
+ iname, suffix, "$RdaSrc, $Qm", "$RdaDest = $RdaSrc", size, pattern> {
bits<3> Qm;
bits<4> RdaDest;
@@ -931,7 +932,7 @@ multiclass MVE_VMINMAXV_p<string iname, bit notAbs, bit isMin,
(i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>;
def : Pat<(i32 !con(args, (pred_intr (VTI.Pred VCCR:$pred)))),
(i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec),
- ARMVCCThen, (VTI.Pred VCCR:$pred)))>;
+ ARMVCCThen, (VTI.Pred VCCR:$pred), zero_reg))>;
}
}
@@ -1020,9 +1021,10 @@ defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 1, "int_arm_mve_minav">;
defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0, "int_arm_mve_maxav">;
class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr,
- bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0>
+ bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0,
+ bits<2> vecsize>
: MVE_rDest<(outs tGPREven:$RdaDest), iops, NoItinerary, iname, suffix,
- "$RdaDest, $Qn, $Qm", cstr, []> {
+ "$RdaDest, $Qn, $Qm", cstr, vecsize, []> {
bits<4> RdaDest;
bits<3> Qm;
bits<3> Qn;
@@ -1050,11 +1052,11 @@ multiclass MVE_VMLAMLSDAV_A<string iname, string x, MVEVectorVTInfo VTI,
bit sz, bit bit_28, bit X, bit bit_8, bit bit_0> {
def ""#x#VTI.Suffix : MVE_VMLAMLSDAV<iname # x, VTI.Suffix,
(ins MQPR:$Qn, MQPR:$Qm), "",
- sz, bit_28, 0b0, X, bit_8, bit_0>;
+ sz, bit_28, 0b0, X, bit_8, bit_0, VTI.Size>;
def "a"#x#VTI.Suffix : MVE_VMLAMLSDAV<iname # "a" # x, VTI.Suffix,
(ins tGPREven:$RdaSrc, MQPR:$Qn, MQPR:$Qm),
"$RdaDest = $RdaSrc",
- sz, bit_28, 0b1, X, bit_8, bit_0>;
+ sz, bit_28, 0b1, X, bit_8, bit_0, VTI.Size>;
let Predicates = [HasMVEInt] in {
def : Pat<(i32 (int_arm_mve_vmldava
(i32 VTI.Unsigned),
@@ -1074,7 +1076,7 @@ multiclass MVE_VMLAMLSDAV_A<string iname, string x, MVEVectorVTInfo VTI,
(VTI.Pred VCCR:$mask))),
(i32 (!cast<Instruction>(NAME # x # VTI.Suffix)
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
- ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg))>;
def : Pat<(i32 (int_arm_mve_vmldava
(i32 VTI.Unsigned),
@@ -1096,7 +1098,7 @@ multiclass MVE_VMLAMLSDAV_A<string iname, string x, MVEVectorVTInfo VTI,
(i32 (!cast<Instruction>(NAME # "a" # x # VTI.Suffix)
(i32 tGPREven:$RdaSrc),
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
- ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg))>;
}
}
@@ -1200,47 +1202,47 @@ let Predicates = [HasMVEInt] in {
def : Pat<(i32 (vecreduce_add (vselect (v4i1 VCCR:$pred),
(mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)),
(v4i32 ARMimmAllZerosV)))),
- (i32 (MVE_VMLADAVu32 $src1, $src2, ARMVCCThen, $pred))>;
+ (i32 (MVE_VMLADAVu32 $src1, $src2, ARMVCCThen, $pred, zero_reg))>;
def : Pat<(i32 (vecreduce_add (vselect (v8i1 VCCR:$pred),
(mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)),
(v8i16 ARMimmAllZerosV)))),
- (i32 (MVE_VMLADAVu16 $src1, $src2, ARMVCCThen, $pred))>;
+ (i32 (MVE_VMLADAVu16 $src1, $src2, ARMVCCThen, $pred, zero_reg))>;
def : Pat<(i32 (ARMVMLAVps (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred))),
- (i32 (MVE_VMLADAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred))>;
+ (i32 (MVE_VMLADAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred, zero_reg))>;
def : Pat<(i32 (ARMVMLAVpu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred))),
- (i32 (MVE_VMLADAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred))>;
+ (i32 (MVE_VMLADAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred, zero_reg))>;
def : Pat<(i32 (vecreduce_add (vselect (v16i1 VCCR:$pred),
(mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)),
(v16i8 ARMimmAllZerosV)))),
- (i32 (MVE_VMLADAVu8 $src1, $src2, ARMVCCThen, $pred))>;
+ (i32 (MVE_VMLADAVu8 $src1, $src2, ARMVCCThen, $pred, zero_reg))>;
def : Pat<(i32 (ARMVMLAVps (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), (v16i1 VCCR:$pred))),
- (i32 (MVE_VMLADAVs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred))>;
+ (i32 (MVE_VMLADAVs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred, zero_reg))>;
def : Pat<(i32 (ARMVMLAVpu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), (v16i1 VCCR:$pred))),
- (i32 (MVE_VMLADAVu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred))>;
+ (i32 (MVE_VMLADAVu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred, zero_reg))>;
def : Pat<(i32 (add (i32 (vecreduce_add (vselect (v4i1 VCCR:$pred),
(mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)),
(v4i32 ARMimmAllZerosV)))),
(i32 tGPREven:$src3))),
- (i32 (MVE_VMLADAVau32 $src3, $src1, $src2, ARMVCCThen, $pred))>;
+ (i32 (MVE_VMLADAVau32 $src3, $src1, $src2, ARMVCCThen, $pred, zero_reg))>;
def : Pat<(i32 (add (i32 (vecreduce_add (vselect (v8i1 VCCR:$pred),
(mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)),
(v8i16 ARMimmAllZerosV)))),
(i32 tGPREven:$src3))),
- (i32 (MVE_VMLADAVau16 $src3, $src1, $src2, ARMVCCThen, $pred))>;
+ (i32 (MVE_VMLADAVau16 $src3, $src1, $src2, ARMVCCThen, $pred, zero_reg))>;
def : Pat<(i32 (add (ARMVMLAVps (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)), tGPREven:$Rd)),
- (i32 (MVE_VMLADAVas16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred))>;
+ (i32 (MVE_VMLADAVas16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred, zero_reg))>;
def : Pat<(i32 (add (ARMVMLAVpu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)), tGPREven:$Rd)),
- (i32 (MVE_VMLADAVau16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred))>;
+ (i32 (MVE_VMLADAVau16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred, zero_reg))>;
def : Pat<(i32 (add (i32 (vecreduce_add (vselect (v16i1 VCCR:$pred),
(mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)),
(v16i8 ARMimmAllZerosV)))),
(i32 tGPREven:$src3))),
- (i32 (MVE_VMLADAVau8 $src3, $src1, $src2, ARMVCCThen, $pred))>;
+ (i32 (MVE_VMLADAVau8 $src3, $src1, $src2, ARMVCCThen, $pred, zero_reg))>;
def : Pat<(i32 (add (ARMVMLAVps (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), (v16i1 VCCR:$pred)), tGPREven:$Rd)),
- (i32 (MVE_VMLADAVas8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred))>;
+ (i32 (MVE_VMLADAVas8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred, zero_reg))>;
def : Pat<(i32 (add (ARMVMLAVpu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), (v16i1 VCCR:$pred)), tGPREven:$Rd)),
- (i32 (MVE_VMLADAVau8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred))>;
+ (i32 (MVE_VMLADAVau8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred, zero_reg))>;
}
// vmlav aliases vmladav
@@ -1255,9 +1257,9 @@ foreach acc = ["", "a"] in {
// Base class for VMLALDAV and VMLSLDAV, VRMLALDAVH, VRMLSLDAVH
class MVE_VMLALDAVBase<string iname, string suffix, dag iops, string cstr,
bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0,
- list<dag> pattern=[]>
+ bits<2> vecsize, list<dag> pattern=[]>
: MVE_rDest<(outs tGPREven:$RdaLoDest, tGPROdd:$RdaHiDest), iops, NoItinerary,
- iname, suffix, "$RdaLoDest, $RdaHiDest, $Qn, $Qm", cstr, pattern> {
+ iname, suffix, "$RdaLoDest, $RdaHiDest, $Qn, $Qm", cstr, vecsize, pattern> {
bits<4> RdaLoDest;
bits<4> RdaHiDest;
bits<3> Qm;
@@ -1285,35 +1287,35 @@ class MVE_VMLALDAVBase<string iname, string suffix, dag iops, string cstr,
}
multiclass MVE_VMLALDAVBase_A<string iname, string x, string suffix,
- bit sz, bit bit_28, bit X, bit bit_8, bit bit_0,
- list<dag> pattern=[]> {
+ bit sz, bit bit_28, bit X, bit bit_8, bit bit_0,
+ bits<2> vecsize, list<dag> pattern=[]> {
def ""#x#suffix : MVE_VMLALDAVBase<
iname # x, suffix, (ins MQPR:$Qn, MQPR:$Qm), "",
- sz, bit_28, 0b0, X, bit_8, bit_0, pattern>;
+ sz, bit_28, 0b0, X, bit_8, bit_0, vecsize, pattern>;
def "a"#x#suffix : MVE_VMLALDAVBase<
iname # "a" # x, suffix,
(ins tGPREven:$RdaLoSrc, tGPROdd:$RdaHiSrc, MQPR:$Qn, MQPR:$Qm),
"$RdaLoDest = $RdaLoSrc,$RdaHiDest = $RdaHiSrc",
- sz, bit_28, 0b1, X, bit_8, bit_0, pattern>;
+ sz, bit_28, 0b1, X, bit_8, bit_0, vecsize, pattern>;
}
multiclass MVE_VMLALDAVBase_AX<string iname, string suffix, bit sz, bit bit_28,
- bit bit_8, bit bit_0, list<dag> pattern=[]> {
+ bit bit_8, bit bit_0, bits<2> vecsize, list<dag> pattern=[]> {
defm "" : MVE_VMLALDAVBase_A<iname, "", suffix, sz,
- bit_28, 0b0, bit_8, bit_0, pattern>;
+ bit_28, 0b0, bit_8, bit_0, vecsize, pattern>;
defm "" : MVE_VMLALDAVBase_A<iname, "x", suffix, sz,
- bit_28, 0b1, bit_8, bit_0, pattern>;
+ bit_28, 0b1, bit_8, bit_0, vecsize, pattern>;
}
-multiclass MVE_VRMLALDAVH_multi<string suffix, list<dag> pattern=[]> {
- defm "" : MVE_VMLALDAVBase_AX<"vrmlaldavh", "s"#suffix,
- 0b0, 0b0, 0b1, 0b0, pattern>;
- defm "" : MVE_VMLALDAVBase_A<"vrmlaldavh", "", "u"#suffix,
- 0b0, 0b1, 0b0, 0b1, 0b0, pattern>;
+multiclass MVE_VRMLALDAVH_multi<MVEVectorVTInfo VTI, list<dag> pattern=[]> {
+ defm "" : MVE_VMLALDAVBase_AX<"vrmlaldavh", "s"#VTI.BitsSuffix,
+ 0b0, 0b0, 0b1, 0b0, VTI.Size, pattern>;
+ defm "" : MVE_VMLALDAVBase_A<"vrmlaldavh", "", "u"#VTI.BitsSuffix,
+ 0b0, 0b1, 0b0, 0b1, 0b0, VTI.Size, pattern>;
}
-defm MVE_VRMLALDAVH : MVE_VRMLALDAVH_multi<"32">;
+defm MVE_VRMLALDAVH : MVE_VRMLALDAVH_multi<MVE_v4i32>;
// vrmlalvh aliases for vrmlaldavh
def : MVEInstAlias<"vrmlalvh${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm",
@@ -1333,14 +1335,15 @@ def : MVEInstAlias<"vrmlalvha${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm",
tGPREven:$RdaLo, tGPROdd:$RdaHi,
MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
-multiclass MVE_VMLALDAV_multi<string suffix, bit sz, list<dag> pattern=[]> {
- defm "" : MVE_VMLALDAVBase_AX<"vmlaldav", "s"#suffix, sz, 0b0, 0b0, 0b0, pattern>;
- defm "" : MVE_VMLALDAVBase_A<"vmlaldav", "", "u"#suffix,
- sz, 0b1, 0b0, 0b0, 0b0, pattern>;
+multiclass MVE_VMLALDAV_multi<MVEVectorVTInfo VTI, list<dag> pattern=[]> {
+ defm "" : MVE_VMLALDAVBase_AX<"vmlaldav", "s"#VTI.BitsSuffix,
+ VTI.Size{1}, 0b0, 0b0, 0b0, VTI.Size, pattern>;
+ defm "" : MVE_VMLALDAVBase_A<"vmlaldav", "", "u"#VTI.BitsSuffix,
+ VTI.Size{1}, 0b1, 0b0, 0b0, 0b0, VTI.Size, pattern>;
}
-defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"16", 0b0>;
-defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"32", 0b1>;
+defm MVE_VMLALDAV : MVE_VMLALDAV_multi<MVE_v8i16>;
+defm MVE_VMLALDAV : MVE_VMLALDAV_multi<MVE_v4i32>;
let Predicates = [HasMVEInt] in {
def : Pat<(ARMVMLALVs (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)),
@@ -1363,22 +1366,22 @@ let Predicates = [HasMVEInt] in {
// Predicated
def : Pat<(ARMVMLALVps (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), (v4i1 VCCR:$pred)),
- (MVE_VMLALDAVs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred)>;
+ (MVE_VMLALDAVs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred, zero_reg)>;
def : Pat<(ARMVMLALVpu (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), (v4i1 VCCR:$pred)),
- (MVE_VMLALDAVu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred)>;
+ (MVE_VMLALDAVu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred, zero_reg)>;
def : Pat<(ARMVMLALVps (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)),
- (MVE_VMLALDAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred)>;
+ (MVE_VMLALDAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred, zero_reg)>;
def : Pat<(ARMVMLALVpu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)),
- (MVE_VMLALDAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred)>;
+ (MVE_VMLALDAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred, zero_reg)>;
def : Pat<(ARMVMLALVAps tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), (v4i1 VCCR:$pred)),
- (MVE_VMLALDAVas32 tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred)>;
+ (MVE_VMLALDAVas32 tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred, zero_reg)>;
def : Pat<(ARMVMLALVApu tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), (v4i1 VCCR:$pred)),
- (MVE_VMLALDAVau32 tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred)>;
+ (MVE_VMLALDAVau32 tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred, zero_reg)>;
def : Pat<(ARMVMLALVAps tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)),
- (MVE_VMLALDAVas16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred)>;
+ (MVE_VMLALDAVas16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred, zero_reg)>;
def : Pat<(ARMVMLALVApu tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)),
- (MVE_VMLALDAVau16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred)>;
+ (MVE_VMLALDAVau16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred, zero_reg)>;
}
// vmlalv aliases vmlaldav
@@ -1393,22 +1396,22 @@ foreach acc = ["", "a"] in {
}
multiclass MVE_VMLSLDAV_multi<string iname, string suffix, bit sz,
- bit bit_28, list<dag> pattern=[]> {
- defm "" : MVE_VMLALDAVBase_AX<iname, suffix, sz, bit_28, 0b0, 0b1, pattern>;
+ bit bit_28, bits<2> vecsize, list<dag> pattern=[]> {
+ defm "" : MVE_VMLALDAVBase_AX<iname, suffix, sz, bit_28, 0b0, 0b1, vecsize, pattern>;
}
-defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s16", 0b0, 0b0>;
-defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s32", 0b1, 0b0>;
-defm MVE_VRMLSLDAVH : MVE_VMLSLDAV_multi<"vrmlsldavh", "s32", 0b0, 0b1>;
+defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s16", 0b0, 0b0, 0b01>;
+defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s32", 0b1, 0b0, 0b10>;
+defm MVE_VRMLSLDAVH : MVE_VMLSLDAV_multi<"vrmlsldavh", "s32", 0b0, 0b1, 0b10>;
// end of mve_rDest instructions
// start of mve_comp instructions
class MVE_comp<InstrItinClass itin, string iname, string suffix,
- string cstr, list<dag> pattern=[]>
+ string cstr, bits<2> vecsize, list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), itin, iname, suffix,
- "$Qd, $Qn, $Qm", vpred_r, cstr, pattern> {
+ "$Qd, $Qn, $Qm", vpred_r, cstr, vecsize, pattern> {
bits<4> Qd;
bits<4> Qn;
bits<4> Qm;
@@ -1425,25 +1428,26 @@ class MVE_comp<InstrItinClass itin, string iname, string suffix,
let Inst{0} = 0b0;
}
-class MVE_VMINMAXNM<string iname, string suffix, bit sz, bit bit_21,
+class MVE_VMINMAXNM<string iname, string suffix, bits<2> sz, bit bit_21,
list<dag> pattern=[]>
- : MVE_comp<NoItinerary, iname, suffix, "", pattern> {
+ : MVE_comp<NoItinerary, iname, suffix, "", sz, pattern> {
let Inst{28} = 0b1;
let Inst{25-24} = 0b11;
let Inst{23} = 0b0;
let Inst{21} = bit_21;
- let Inst{20} = sz;
+ let Inst{20} = sz{0};
let Inst{11} = 0b1;
let Inst{8} = 0b1;
let Inst{6} = 0b1;
let Inst{4} = 0b1;
let Predicates = [HasMVEFloat];
+ let validForTailPredication = 1;
}
multiclass MVE_VMINMAXNM_m<string iname, bit bit_4, MVEVectorVTInfo VTI, SDNode Op, Intrinsic PredInt> {
- def "" : MVE_VMINMAXNM<iname, VTI.Suffix, VTI.Size{0}, bit_4>;
+ def "" : MVE_VMINMAXNM<iname, VTI.Suffix, VTI.Size, bit_4>;
let Predicates = [HasMVEFloat] in {
defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? (i32 0)), !cast<Instruction>(NAME)>;
@@ -1458,7 +1462,7 @@ defm MVE_VMINNMf16 : MVE_VMINMAXNM_m<"vminnm", 0b1, MVE_v8f16, fminnum, int_arm_
class MVE_VMINMAX<string iname, string suffix, bit U, bits<2> size,
bit bit_4, list<dag> pattern=[]>
- : MVE_comp<NoItinerary, iname, suffix, "", pattern> {
+ : MVE_comp<NoItinerary, iname, suffix, "", size, pattern> {
let Inst{28} = U;
let Inst{25-24} = 0b11;
@@ -1504,8 +1508,8 @@ defm MVE_VMAXu32 : MVE_VMAX<MVE_v4u32>;
// start of mve_bit instructions
class MVE_bit_arith<dag oops, dag iops, string iname, string suffix,
- string ops, string cstr, list<dag> pattern=[]>
- : MVE_p<oops, iops, NoItinerary, iname, suffix, ops, vpred_r, cstr, pattern> {
+ string ops, string cstr, bits<2> vecsize, list<dag> pattern=[]>
+ : MVE_p<oops, iops, NoItinerary, iname, suffix, ops, vpred_r, cstr, vecsize, pattern> {
bits<4> Qd;
bits<4> Qm;
@@ -1516,7 +1520,7 @@ class MVE_bit_arith<dag oops, dag iops, string iname, string suffix,
}
def MVE_VBIC : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm),
- "vbic", "", "$Qd, $Qn, $Qm", ""> {
+ "vbic", "", "$Qd, $Qn, $Qm", "", 0b00> {
bits<4> Qn;
let Inst{28} = 0b0;
@@ -1532,9 +1536,10 @@ def MVE_VBIC : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm),
let validForTailPredication = 1;
}
-class MVE_VREV<string iname, string suffix, bits<2> size, bits<2> bit_8_7, string cstr="">
+class MVE_VREV<string iname, string suffix, bits<2> size, bits<2> bit_8_7,
+ bits<2> vecsize, string cstr="">
: MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), iname,
- suffix, "$Qd, $Qm", cstr> {
+ suffix, "$Qd, $Qm", cstr, vecsize> {
let Inst{28} = 0b1;
let Inst{25-23} = 0b111;
@@ -1548,14 +1553,14 @@ class MVE_VREV<string iname, string suffix, bits<2> size, bits<2> bit_8_7, strin
let Inst{0} = 0b0;
}
-def MVE_VREV64_8 : MVE_VREV<"vrev64", "8", 0b00, 0b00, "@earlyclobber $Qd">;
-def MVE_VREV64_16 : MVE_VREV<"vrev64", "16", 0b01, 0b00, "@earlyclobber $Qd">;
-def MVE_VREV64_32 : MVE_VREV<"vrev64", "32", 0b10, 0b00, "@earlyclobber $Qd">;
+def MVE_VREV64_8 : MVE_VREV<"vrev64", "8", 0b00, 0b00, 0b11, "@earlyclobber $Qd">;
+def MVE_VREV64_16 : MVE_VREV<"vrev64", "16", 0b01, 0b00, 0b11, "@earlyclobber $Qd">;
+def MVE_VREV64_32 : MVE_VREV<"vrev64", "32", 0b10, 0b00, 0b11, "@earlyclobber $Qd">;
-def MVE_VREV32_8 : MVE_VREV<"vrev32", "8", 0b00, 0b01>;
-def MVE_VREV32_16 : MVE_VREV<"vrev32", "16", 0b01, 0b01>;
+def MVE_VREV32_8 : MVE_VREV<"vrev32", "8", 0b00, 0b01, 0b10>;
+def MVE_VREV32_16 : MVE_VREV<"vrev32", "16", 0b01, 0b01, 0b10>;
-def MVE_VREV16_8 : MVE_VREV<"vrev16", "8", 0b00, 0b10>;
+def MVE_VREV16_8 : MVE_VREV<"vrev16", "8", 0b00, 0b10, 0b01>;
let Predicates = [HasMVEInt] in {
def : Pat<(v8i16 (bswap (v8i16 MQPR:$src))),
@@ -1574,7 +1579,7 @@ multiclass MVE_VREV_basic_patterns<int revbits, list<MVEVectorVTInfo> VTIs,
def : Pat<(VTI.Vec (int_arm_mve_vrev_predicated (VTI.Vec MQPR:$src),
revbits, (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive))),
(VTI.Vec (Inst (VTI.Vec MQPR:$src), ARMVCCThen,
- (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>;
+ (VTI.Pred VCCR:$pred), zero_reg, (VTI.Vec MQPR:$inactive)))>;
}
}
@@ -1590,7 +1595,7 @@ let Predicates = [HasMVEInt] in {
}
def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm),
- "vmvn", "", "$Qd, $Qm", ""> {
+ "vmvn", "", "$Qd, $Qm", "", 0b00> {
let Inst{28} = 0b1;
let Inst{25-23} = 0b111;
let Inst{21-16} = 0b110000;
@@ -1607,13 +1612,13 @@ let Predicates = [HasMVEInt] in {
def : Pat<(VTI.Vec (int_arm_mve_mvn_predicated (VTI.Vec MQPR:$val1),
(VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive))),
(VTI.Vec (MVE_VMVN (VTI.Vec MQPR:$val1), ARMVCCThen,
- (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>;
+ (VTI.Pred VCCR:$pred), zero_reg, (VTI.Vec MQPR:$inactive)))>;
}
}
class MVE_bit_ops<string iname, bits<2> bit_21_20, bit bit_28>
: MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm),
- iname, "", "$Qd, $Qn, $Qm", ""> {
+ iname, "", "$Qd, $Qn, $Qm", "", 0b00> {
bits<4> Qn;
let Inst{28} = bit_28;
@@ -1684,9 +1689,9 @@ let Predicates = [HasMVEInt] in {
int_arm_mve_orn_predicated, (? ), MVE_VORN>;
}
-class MVE_bit_cmode<string iname, string suffix, bit halfword, dag inOps>
+class MVE_bit_cmode<string iname, string suffix, bit halfword, dag inOps, bits<2> vecsize>
: MVE_p<(outs MQPR:$Qd), inOps, NoItinerary,
- iname, suffix, "$Qd, $imm", vpred_n, "$Qd = $Qd_src"> {
+ iname, suffix, "$Qd, $imm", vpred_n, "$Qd = $Qd_src", vecsize> {
bits<12> imm;
bits<4> Qd;
@@ -1709,7 +1714,7 @@ class MVE_bit_cmode<string iname, string suffix, bit halfword, dag inOps>
multiclass MVE_bit_cmode_p<string iname, bit opcode,
MVEVectorVTInfo VTI, Operand imm_type, SDNode op> {
def "" : MVE_bit_cmode<iname, VTI.Suffix, VTI.Size{0},
- (ins MQPR:$Qd_src, imm_type:$imm)> {
+ (ins MQPR:$Qd_src, imm_type:$imm), VTI.Size> {
let Inst{5} = opcode;
let validForTailPredication = 1;
}
@@ -1723,7 +1728,7 @@ multiclass MVE_bit_cmode_p<string iname, bit opcode,
def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred),
UnpredPat, (VTI.Vec MQPR:$src))),
(VTI.Vec (Inst (VTI.Vec MQPR:$src), imm_type:$simm,
- ARMVCCThen, (VTI.Pred VCCR:$pred)))>;
+ ARMVCCThen, (VTI.Pred VCCR:$pred), zero_reg))>;
}
}
@@ -1801,6 +1806,7 @@ class MVE_VMOV_lane_32<MVE_VMOV_lane_direction dir>
let Inst{16} = Idx{1};
let Inst{21} = Idx{0};
+ let VecSize = 0b10;
let Predicates = [HasFPRegsV8_1M];
}
@@ -1812,6 +1818,8 @@ class MVE_VMOV_lane_16<string suffix, bit U, MVE_VMOV_lane_direction dir>
let Inst{16} = Idx{2};
let Inst{21} = Idx{1};
let Inst{6} = Idx{0};
+
+ let VecSize = 0b01;
}
class MVE_VMOV_lane_8<string suffix, bit U, MVE_VMOV_lane_direction dir>
@@ -1822,6 +1830,8 @@ class MVE_VMOV_lane_8<string suffix, bit U, MVE_VMOV_lane_direction dir>
let Inst{21} = Idx{2};
let Inst{6} = Idx{1};
let Inst{5} = Idx{0};
+
+ let VecSize = 0b00;
}
def MVE_VMOV_from_lane_32 : MVE_VMOV_lane_32< MVE_VMOV_from_lane>;
@@ -1932,7 +1942,7 @@ let Predicates = [HasMVEInt] in {
class MVE_int<string iname, string suffix, bits<2> size, list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary,
- iname, suffix, "$Qd, $Qn, $Qm", vpred_r, "", pattern> {
+ iname, suffix, "$Qd, $Qn, $Qm", vpred_r, "", size, pattern> {
bits<4> Qd;
bits<4> Qn;
bits<4> Qm;
@@ -2205,7 +2215,7 @@ multiclass MVE_VRHADD_m<MVEVectorVTInfo VTI,
(i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
}
}
@@ -2293,7 +2303,7 @@ multiclass MVE_VHADD_m<MVEVectorVTInfo VTI,
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned),
(VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
}
}
@@ -2334,7 +2344,7 @@ multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI,
(i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
}
}
@@ -2350,9 +2360,9 @@ defm MVE_VHSUBu8 : MVE_VHSUB<MVE_v16u8, subnuw, ARMvshruImm>;
defm MVE_VHSUBu16 : MVE_VHSUB<MVE_v8u16, subnuw, ARMvshruImm>;
defm MVE_VHSUBu32 : MVE_VHSUB<MVE_v4u32, subnuw, ARMvshruImm>;
-class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]>
+class MVE_VDUP<string suffix, bit B, bit E, bits<2> vecsize, list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary,
- "vdup", suffix, "$Qd, $Rt", vpred_r, "", pattern> {
+ "vdup", suffix, "$Qd, $Rt", vpred_r, "", vecsize, pattern> {
bits<4> Qd;
bits<4> Rt;
@@ -2371,9 +2381,9 @@ class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]>
let validForTailPredication = 1;
}
-def MVE_VDUP32 : MVE_VDUP<"32", 0b0, 0b0>;
-def MVE_VDUP16 : MVE_VDUP<"16", 0b0, 0b1>;
-def MVE_VDUP8 : MVE_VDUP<"8", 0b1, 0b0>;
+def MVE_VDUP32 : MVE_VDUP<"32", 0b0, 0b0, 0b10>;
+def MVE_VDUP16 : MVE_VDUP<"16", 0b0, 0b1, 0b01>;
+def MVE_VDUP8 : MVE_VDUP<"8", 0b1, 0b0, 0b00>;
let Predicates = [HasMVEInt] in {
def : Pat<(v16i8 (ARMvdup (i32 rGPR:$elem))),
@@ -2392,27 +2402,27 @@ let Predicates = [HasMVEInt] in {
def : Pat<(v16i8 (vselect (v16i1 VCCR:$pred),
(v16i8 (ARMvdup (i32 rGPR:$elem))),
(v16i8 MQPR:$inactive))),
- (MVE_VDUP8 rGPR:$elem, ARMVCCThen, (v16i1 VCCR:$pred),
+ (MVE_VDUP8 rGPR:$elem, ARMVCCThen, (v16i1 VCCR:$pred), zero_reg,
(v16i8 MQPR:$inactive))>;
def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred),
(v8i16 (ARMvdup (i32 rGPR:$elem))),
(v8i16 MQPR:$inactive))),
- (MVE_VDUP16 rGPR:$elem, ARMVCCThen, (v8i1 VCCR:$pred),
+ (MVE_VDUP16 rGPR:$elem, ARMVCCThen, (v8i1 VCCR:$pred), zero_reg,
(v8i16 MQPR:$inactive))>;
def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred),
(v4i32 (ARMvdup (i32 rGPR:$elem))),
(v4i32 MQPR:$inactive))),
- (MVE_VDUP32 rGPR:$elem, ARMVCCThen, (v4i1 VCCR:$pred),
+ (MVE_VDUP32 rGPR:$elem, ARMVCCThen, (v4i1 VCCR:$pred), zero_reg,
(v4i32 MQPR:$inactive))>;
def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred),
(v4f32 (ARMvdup (i32 rGPR:$elem))),
(v4f32 MQPR:$inactive))),
- (MVE_VDUP32 rGPR:$elem, ARMVCCThen, (v4i1 VCCR:$pred),
+ (MVE_VDUP32 rGPR:$elem, ARMVCCThen, (v4i1 VCCR:$pred), zero_reg,
(v4f32 MQPR:$inactive))>;
def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred),
(v8f16 (ARMvdup (i32 rGPR:$elem))),
(v8f16 MQPR:$inactive))),
- (MVE_VDUP16 rGPR:$elem, ARMVCCThen, (v8i1 VCCR:$pred),
+ (MVE_VDUP16 rGPR:$elem, ARMVCCThen, (v8i1 VCCR:$pred), zero_reg,
(v8f16 MQPR:$inactive))>;
}
@@ -2420,7 +2430,7 @@ let Predicates = [HasMVEInt] in {
class MVEIntSingleSrc<string iname, string suffix, bits<2> size,
list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qm), NoItinerary,
- iname, suffix, "$Qd, $Qm", vpred_r, "", pattern> {
+ iname, suffix, "$Qd, $Qm", vpred_r, "", size, pattern> {
bits<4> Qd;
bits<4> Qm;
@@ -2460,7 +2470,7 @@ multiclass MVE_VCLSCLZ_p<string opname, bit opcode, MVEVectorVTInfo VTI,
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$val), (VTI.Pred VCCR:$pred),
(VTI.Vec MQPR:$inactive))),
(VTI.Vec (Inst (VTI.Vec MQPR:$val), ARMVCCThen,
- (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>;
+ (VTI.Pred VCCR:$pred), zero_reg, (VTI.Vec MQPR:$inactive)))>;
}
}
@@ -2506,7 +2516,7 @@ multiclass MVE_VABSNEG_int_m<string iname, bit negate, bit saturate,
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v), (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive))),
- (VTI.Vec (Inst $v, ARMVCCThen, $mask, $inactive))>;
+ (VTI.Vec (Inst $v, ARMVCCThen, $mask, zero_reg, $inactive))>;
}
}
@@ -2565,9 +2575,9 @@ defm MVE_VQABSNEG_Ps32 : vqabsneg_pattern<MVE_v4i32,
MVE_VQABSs32, MVE_VQNEGs32>;
class MVE_mod_imm<string iname, string suffix, bits<4> cmode, bit op,
- dag iops, list<dag> pattern=[]>
+ dag iops, bits<2> vecsize, list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), iops, NoItinerary, iname, suffix, "$Qd, $imm",
- vpred_r, "", pattern> {
+ vpred_r, "", vecsize, pattern> {
bits<13> imm;
bits<4> Qd;
@@ -2590,21 +2600,21 @@ class MVE_mod_imm<string iname, string suffix, bits<4> cmode, bit op,
let isReMaterializable = 1 in {
let isAsCheapAsAMove = 1 in {
-def MVE_VMOVimmi8 : MVE_mod_imm<"vmov", "i8", {1,1,1,0}, 0b0, (ins nImmSplatI8:$imm)>;
-def MVE_VMOVimmi16 : MVE_mod_imm<"vmov", "i16", {1,0,?,0}, 0b0, (ins nImmSplatI16:$imm)> {
+def MVE_VMOVimmi8 : MVE_mod_imm<"vmov", "i8", {1,1,1,0}, 0b0, (ins nImmSplatI8:$imm), 0b00>;
+def MVE_VMOVimmi16 : MVE_mod_imm<"vmov", "i16", {1,0,?,0}, 0b0, (ins nImmSplatI16:$imm), 0b01> {
let Inst{9} = imm{9};
}
-def MVE_VMOVimmi32 : MVE_mod_imm<"vmov", "i32", {?,?,?,?}, 0b0, (ins nImmVMOVI32:$imm)> {
+def MVE_VMOVimmi32 : MVE_mod_imm<"vmov", "i32", {?,?,?,?}, 0b0, (ins nImmVMOVI32:$imm), 0b10> {
let Inst{11-8} = imm{11-8};
}
-def MVE_VMOVimmi64 : MVE_mod_imm<"vmov", "i64", {1,1,1,0}, 0b1, (ins nImmSplatI64:$imm)>;
-def MVE_VMOVimmf32 : MVE_mod_imm<"vmov", "f32", {1,1,1,1}, 0b0, (ins nImmVMOVF32:$imm)>;
+def MVE_VMOVimmi64 : MVE_mod_imm<"vmov", "i64", {1,1,1,0}, 0b1, (ins nImmSplatI64:$imm), 0b11>;
+def MVE_VMOVimmf32 : MVE_mod_imm<"vmov", "f32", {1,1,1,1}, 0b0, (ins nImmVMOVF32:$imm), 0b10>;
} // let isAsCheapAsAMove = 1
-def MVE_VMVNimmi16 : MVE_mod_imm<"vmvn", "i16", {1,0,?,0}, 0b1, (ins nImmSplatI16:$imm)> {
+def MVE_VMVNimmi16 : MVE_mod_imm<"vmvn", "i16", {1,0,?,0}, 0b1, (ins nImmSplatI16:$imm), 0b01> {
let Inst{9} = imm{9};
}
-def MVE_VMVNimmi32 : MVE_mod_imm<"vmvn", "i32", {?,?,?,?}, 0b1, (ins nImmVMOVI32:$imm)> {
+def MVE_VMVNimmi32 : MVE_mod_imm<"vmvn", "i32", {?,?,?,?}, 0b1, (ins nImmVMOVI32:$imm), 0b10> {
let Inst{11-8} = imm{11-8};
}
} // let isReMaterializable = 1
@@ -2630,18 +2640,18 @@ let Predicates = [HasMVEInt] in {
def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), (ARMvmvnImm timm:$simm),
MQPR:$inactive)),
(v8i16 (MVE_VMVNimmi16 nImmSplatI16:$simm,
- ARMVCCThen, VCCR:$pred, MQPR:$inactive))>;
+ ARMVCCThen, VCCR:$pred, zero_reg, MQPR:$inactive))>;
def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (ARMvmvnImm timm:$simm),
MQPR:$inactive)),
(v4i32 (MVE_VMVNimmi32 nImmSplatI32:$simm,
- ARMVCCThen, VCCR:$pred, MQPR:$inactive))>;
+ ARMVCCThen, VCCR:$pred, zero_reg, MQPR:$inactive))>;
}
class MVE_VMINMAXA<string iname, string suffix, bits<2> size,
bit bit_12, list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
NoItinerary, iname, suffix, "$Qd, $Qm", vpred_n, "$Qd = $Qd_src",
- pattern> {
+ size, pattern> {
bits<4> Qd;
bits<4> Qm;
@@ -2675,7 +2685,7 @@ multiclass MVE_VMINMAXA_m<string iname, MVEVectorVTInfo VTI,
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm),
(VTI.Pred VCCR:$mask))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm),
- ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg))>;
}
}
@@ -2700,7 +2710,7 @@ defm MVE_VMAXAs32 : MVE_VMAXA<MVE_v4s32>;
def MVE_VSHLC : MVE_p<(outs rGPR:$RdmDest, MQPR:$Qd),
(ins MQPR:$QdSrc, rGPR:$RdmSrc, long_shift:$imm),
NoItinerary, "vshlc", "", "$QdSrc, $RdmSrc, $imm",
- vpred_n, "$RdmDest = $RdmSrc,$Qd = $QdSrc"> {
+ vpred_n, "$RdmDest = $RdmSrc,$Qd = $QdSrc", 0b10> {
bits<5> imm;
bits<4> Qd;
bits<4> RdmDest;
@@ -2717,8 +2727,8 @@ def MVE_VSHLC : MVE_p<(outs rGPR:$RdmDest, MQPR:$Qd),
class MVE_shift_imm<dag oops, dag iops, string iname, string suffix,
string ops, vpred_ops vpred, string cstr,
- list<dag> pattern=[]>
- : MVE_p<oops, iops, NoItinerary, iname, suffix, ops, vpred, cstr, pattern> {
+ bits<2> vecsize, list<dag> pattern=[]>
+ : MVE_p<oops, iops, NoItinerary, iname, suffix, ops, vpred, cstr, vecsize, pattern> {
bits<4> Qd;
bits<4> Qm;
@@ -2732,7 +2742,7 @@ class MVE_VMOVL<string iname, string suffix, bits<2> sz, bit U, bit top,
list<dag> pattern=[]>
: MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm),
iname, suffix, "$Qd, $Qm", vpred_r, "",
- pattern> {
+ sz, pattern> {
let Inst{28} = U;
let Inst{25-23} = 0b101;
let Inst{21} = 0b1;
@@ -2756,7 +2766,7 @@ multiclass MVE_VMOVL_m<bit top, string chr, MVEVectorVTInfo OutVTI,
(OutVTI.Pred VCCR:$pred),
(OutVTI.Vec MQPR:$inactive))),
(OutVTI.Vec (Inst (InVTI.Vec MQPR:$src), ARMVCCThen,
- (OutVTI.Pred VCCR:$pred),
+ (OutVTI.Pred VCCR:$pred), zero_reg,
(OutVTI.Vec MQPR:$inactive)))>;
}
@@ -2798,9 +2808,9 @@ let Predicates = [HasMVEInt] in {
class MVE_VSHLL_imm<string iname, string suffix, bit U, bit th,
- Operand immtype, list<dag> pattern=[]>
+ Operand immtype, bits<2> vecsize, list<dag> pattern=[]>
: MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm, immtype:$imm),
- iname, suffix, "$Qd, $Qm, $imm", vpred_r, "", pattern> {
+ iname, suffix, "$Qd, $Qm, $imm", vpred_r, "", vecsize, pattern> {
let Inst{28} = U;
let Inst{25-23} = 0b101;
let Inst{21} = 0b1;
@@ -2821,7 +2831,7 @@ class MVE_VSHLL_imm<string iname, string suffix, bit U, bit th,
class MVE_VSHLL_imm8<string iname, string suffix,
bit U, bit th, list<dag> pattern=[]>
- : MVE_VSHLL_imm<iname, suffix, U, th, mve_shift_imm1_7, pattern> {
+ : MVE_VSHLL_imm<iname, suffix, U, th, mve_shift_imm1_7, 0b01, pattern> {
bits<3> imm;
let Inst{20-19} = 0b01;
let Inst{18-16} = imm;
@@ -2829,7 +2839,7 @@ class MVE_VSHLL_imm8<string iname, string suffix,
class MVE_VSHLL_imm16<string iname, string suffix,
bit U, bit th, list<dag> pattern=[]>
- : MVE_VSHLL_imm<iname, suffix, U, th, mve_shift_imm1_15, pattern> {
+ : MVE_VSHLL_imm<iname, suffix, U, th, mve_shift_imm1_15, 0b10, pattern> {
bits<4> imm;
let Inst{20} = 0b1;
let Inst{19-16} = imm;
@@ -2847,7 +2857,7 @@ def MVE_VSHLL_immu16th : MVE_VSHLL_imm16<"vshllt", "u16", 0b1, 0b1>;
class MVE_VSHLL_by_lane_width<string iname, string suffix, bits<2> size,
bit U, string ops, list<dag> pattern=[]>
: MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm),
- iname, suffix, ops, vpred_r, "", pattern> {
+ iname, suffix, ops, vpred_r, "", !if(size, 0b10, 0b01), pattern> {
let Inst{28} = U;
let Inst{25-23} = 0b100;
let Inst{21-20} = 0b11;
@@ -2894,14 +2904,14 @@ multiclass MVE_VSHLL_patterns<MVEVectorVTInfo VTI, int top> {
(VTI.DblPred VCCR:$mask),
(VTI.DblVec MQPR:$inactive))),
(VTI.DblVec (inst_imm (VTI.Vec MQPR:$src), imm:$imm,
- ARMVCCThen, (VTI.DblPred VCCR:$mask),
+ ARMVCCThen, (VTI.DblPred VCCR:$mask), zero_reg,
(VTI.DblVec MQPR:$inactive)))>;
def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$src), (i32 VTI.LaneBits),
(i32 VTI.Unsigned), (i32 top),
(VTI.DblPred VCCR:$mask),
(VTI.DblVec MQPR:$inactive))),
(VTI.DblVec (inst_lw (VTI.Vec MQPR:$src), ARMVCCThen,
- (VTI.DblPred VCCR:$mask),
+ (VTI.DblPred VCCR:$mask), zero_reg,
(VTI.DblVec MQPR:$inactive)))>;
}
@@ -2909,15 +2919,15 @@ foreach VTI = [MVE_v16s8, MVE_v8s16, MVE_v16u8, MVE_v8u16] in
foreach top = [0, 1] in
defm : MVE_VSHLL_patterns<VTI, top>;
-class MVE_shift_imm_partial<Operand imm, string iname, string suffix>
+class MVE_shift_imm_partial<Operand imm, string iname, string suffix, bits<2> vecsize>
: MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$QdSrc, MQPR:$Qm, imm:$imm),
- iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc"> {
+ iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc", vecsize> {
Operand immediateType = imm;
}
class MVE_VxSHRN<string iname, string suffix, bit bit_12, bit bit_28,
- Operand imm, list<dag> pattern=[]>
- : MVE_shift_imm_partial<imm, iname, suffix> {
+ Operand imm, bits<2> vecsize>
+ : MVE_shift_imm_partial<imm, iname, suffix, vecsize> {
bits<5> imm;
let Inst{28} = bit_28;
@@ -2932,35 +2942,35 @@ class MVE_VxSHRN<string iname, string suffix, bit bit_12, bit bit_28,
let retainsPreviousHalfElement = 1;
}
-def MVE_VRSHRNi16bh : MVE_VxSHRN<"vrshrnb", "i16", 0b0, 0b1, shr_imm8> {
+def MVE_VRSHRNi16bh : MVE_VxSHRN<"vrshrnb", "i16", 0b0, 0b1, shr_imm8, 0b01> {
let Inst{20-19} = 0b01;
}
-def MVE_VRSHRNi16th : MVE_VxSHRN<"vrshrnt", "i16", 0b1, 0b1, shr_imm8> {
+def MVE_VRSHRNi16th : MVE_VxSHRN<"vrshrnt", "i16", 0b1, 0b1, shr_imm8, 0b01> {
let Inst{20-19} = 0b01;
}
-def MVE_VRSHRNi32bh : MVE_VxSHRN<"vrshrnb", "i32", 0b0, 0b1, shr_imm16> {
+def MVE_VRSHRNi32bh : MVE_VxSHRN<"vrshrnb", "i32", 0b0, 0b1, shr_imm16, 0b10> {
let Inst{20} = 0b1;
}
-def MVE_VRSHRNi32th : MVE_VxSHRN<"vrshrnt", "i32", 0b1, 0b1, shr_imm16> {
+def MVE_VRSHRNi32th : MVE_VxSHRN<"vrshrnt", "i32", 0b1, 0b1, shr_imm16, 0b10> {
let Inst{20} = 0b1;
}
-def MVE_VSHRNi16bh : MVE_VxSHRN<"vshrnb", "i16", 0b0, 0b0, shr_imm8> {
+def MVE_VSHRNi16bh : MVE_VxSHRN<"vshrnb", "i16", 0b0, 0b0, shr_imm8, 0b01> {
let Inst{20-19} = 0b01;
}
-def MVE_VSHRNi16th : MVE_VxSHRN<"vshrnt", "i16", 0b1, 0b0, shr_imm8> {
+def MVE_VSHRNi16th : MVE_VxSHRN<"vshrnt", "i16", 0b1, 0b0, shr_imm8, 0b01> {
let Inst{20-19} = 0b01;
}
-def MVE_VSHRNi32bh : MVE_VxSHRN<"vshrnb", "i32", 0b0, 0b0, shr_imm16> {
+def MVE_VSHRNi32bh : MVE_VxSHRN<"vshrnb", "i32", 0b0, 0b0, shr_imm16, 0b10> {
let Inst{20} = 0b1;
}
-def MVE_VSHRNi32th : MVE_VxSHRN<"vshrnt", "i32", 0b1, 0b0, shr_imm16> {
+def MVE_VSHRNi32th : MVE_VxSHRN<"vshrnt", "i32", 0b1, 0b0, shr_imm16, 0b10> {
let Inst{20} = 0b1;
}
class MVE_VxQRSHRUN<string iname, string suffix, bit bit_28, bit bit_12,
- Operand imm, list<dag> pattern=[]>
- : MVE_shift_imm_partial<imm, iname, suffix> {
+ Operand imm, bits<2> vecsize>
+ : MVE_shift_imm_partial<imm, iname, suffix, vecsize> {
bits<5> imm;
let Inst{28} = bit_28;
@@ -2976,42 +2986,42 @@ class MVE_VxQRSHRUN<string iname, string suffix, bit bit_28, bit bit_12,
}
def MVE_VQRSHRUNs16bh : MVE_VxQRSHRUN<
- "vqrshrunb", "s16", 0b1, 0b0, shr_imm8> {
+ "vqrshrunb", "s16", 0b1, 0b0, shr_imm8, 0b01> {
let Inst{20-19} = 0b01;
}
def MVE_VQRSHRUNs16th : MVE_VxQRSHRUN<
- "vqrshrunt", "s16", 0b1, 0b1, shr_imm8> {
+ "vqrshrunt", "s16", 0b1, 0b1, shr_imm8, 0b01> {
let Inst{20-19} = 0b01;
}
def MVE_VQRSHRUNs32bh : MVE_VxQRSHRUN<
- "vqrshrunb", "s32", 0b1, 0b0, shr_imm16> {
+ "vqrshrunb", "s32", 0b1, 0b0, shr_imm16, 0b10> {
let Inst{20} = 0b1;
}
def MVE_VQRSHRUNs32th : MVE_VxQRSHRUN<
- "vqrshrunt", "s32", 0b1, 0b1, shr_imm16> {
+ "vqrshrunt", "s32", 0b1, 0b1, shr_imm16, 0b10> {
let Inst{20} = 0b1;
}
def MVE_VQSHRUNs16bh : MVE_VxQRSHRUN<
- "vqshrunb", "s16", 0b0, 0b0, shr_imm8> {
+ "vqshrunb", "s16", 0b0, 0b0, shr_imm8, 0b01> {
let Inst{20-19} = 0b01;
}
def MVE_VQSHRUNs16th : MVE_VxQRSHRUN<
- "vqshrunt", "s16", 0b0, 0b1, shr_imm8> {
+ "vqshrunt", "s16", 0b0, 0b1, shr_imm8, 0b01> {
let Inst{20-19} = 0b01;
}
def MVE_VQSHRUNs32bh : MVE_VxQRSHRUN<
- "vqshrunb", "s32", 0b0, 0b0, shr_imm16> {
+ "vqshrunb", "s32", 0b0, 0b0, shr_imm16, 0b10> {
let Inst{20} = 0b1;
}
def MVE_VQSHRUNs32th : MVE_VxQRSHRUN<
- "vqshrunt", "s32", 0b0, 0b1, shr_imm16> {
+ "vqshrunt", "s32", 0b0, 0b1, shr_imm16, 0b10> {
let Inst{20} = 0b1;
}
class MVE_VxQRSHRN<string iname, string suffix, bit bit_0, bit bit_12,
- Operand imm, list<dag> pattern=[]>
- : MVE_shift_imm_partial<imm, iname, suffix> {
+ Operand imm, bits<2> vecsize>
+ : MVE_shift_imm_partial<imm, iname, suffix, vecsize> {
bits<5> imm;
let Inst{25-23} = 0b101;
@@ -3026,19 +3036,19 @@ class MVE_VxQRSHRN<string iname, string suffix, bit bit_0, bit bit_12,
}
multiclass MVE_VxQRSHRN_types<string iname, bit bit_0, bit bit_12> {
- def s16 : MVE_VxQRSHRN<iname, "s16", bit_0, bit_12, shr_imm8> {
+ def s16 : MVE_VxQRSHRN<iname, "s16", bit_0, bit_12, shr_imm8, 0b01> {
let Inst{28} = 0b0;
let Inst{20-19} = 0b01;
}
- def u16 : MVE_VxQRSHRN<iname, "u16", bit_0, bit_12, shr_imm8> {
+ def u16 : MVE_VxQRSHRN<iname, "u16", bit_0, bit_12, shr_imm8, 0b01> {
let Inst{28} = 0b1;
let Inst{20-19} = 0b01;
}
- def s32 : MVE_VxQRSHRN<iname, "s32", bit_0, bit_12, shr_imm16> {
+ def s32 : MVE_VxQRSHRN<iname, "s32", bit_0, bit_12, shr_imm16, 0b10> {
let Inst{28} = 0b0;
let Inst{20} = 0b1;
}
- def u32 : MVE_VxQRSHRN<iname, "u32", bit_0, bit_12, shr_imm16> {
+ def u32 : MVE_VxQRSHRN<iname, "u32", bit_0, bit_12, shr_imm16, 0b10> {
let Inst{28} = 0b1;
let Inst{20} = 0b1;
}
@@ -3062,7 +3072,7 @@ multiclass MVE_VSHRN_patterns<MVE_shift_imm_partial inst,
(OutVTI.Vec outparams)>;
def : Pat<(OutVTI.Vec !con(inparams, (int_arm_mve_vshrn_predicated
(InVTI.Pred VCCR:$pred)))),
- (OutVTI.Vec !con(outparams, (? ARMVCCThen, VCCR:$pred)))>;
+ (OutVTI.Vec !con(outparams, (? ARMVCCThen, VCCR:$pred, zero_reg)))>;
}
defm : MVE_VSHRN_patterns<MVE_VSHRNi16bh, MVE_v16s8, MVE_v8s16, 0,0,0>;
@@ -3113,7 +3123,7 @@ defm : MVE_VSHRN_patterns<MVE_VQRSHRUNs32th, MVE_v8u16, MVE_v4s32, 1,1,1>;
class MVE_shift_by_vec<string iname, string suffix, bit U,
bits<2> size, bit bit_4, bit bit_8>
: MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qm, MQPR:$Qn), NoItinerary,
- iname, suffix, "$Qd, $Qm, $Qn", vpred_r, "", []> {
+ iname, suffix, "$Qd, $Qm, $Qn", vpred_r, "", size, []> {
// Shift instructions which take a vector of shift counts
bits<4> Qd;
bits<4> Qm;
@@ -3152,7 +3162,7 @@ multiclass MVE_shift_by_vec_p<string iname, MVEVectorVTInfo VTI, bit q, bit r> {
(i32 q), (i32 r), (i32 VTI.Unsigned),
(VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
(VTI.Vec (Inst (VTI.Vec MQPR:$in), (VTI.Vec MQPR:$sh),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
}
@@ -3188,8 +3198,8 @@ let Predicates = [HasMVEInt] in {
class MVE_shift_with_imm<string iname, string suffix, dag oops, dag iops,
string ops, vpred_ops vpred, string cstr,
- list<dag> pattern=[]>
- : MVE_p<oops, iops, NoItinerary, iname, suffix, ops, vpred, cstr, pattern> {
+ bits<2> vecsize, list<dag> pattern=[]>
+ : MVE_p<oops, iops, NoItinerary, iname, suffix, ops, vpred, cstr, vecsize, pattern> {
bits<4> Qd;
bits<4> Qm;
@@ -3212,10 +3222,10 @@ class MVE_shift_with_imm<string iname, string suffix, dag oops, dag iops,
dag unsignedFlag = (?);
}
-class MVE_VSxI_imm<string iname, string suffix, bit bit_8, Operand immType>
+class MVE_VSxI_imm<string iname, string suffix, bit bit_8, Operand immType, bits<2> vecsize>
: MVE_shift_with_imm<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qd_src, MQPR:$Qm, immType:$imm),
- "$Qd, $Qm, $imm", vpred_n, "$Qd = $Qd_src"> {
+ "$Qd, $Qm, $imm", vpred_n, "$Qd = $Qd_src", vecsize> {
bits<6> imm;
let Inst{28} = 0b1;
let Inst{25-24} = 0b11;
@@ -3227,27 +3237,27 @@ class MVE_VSxI_imm<string iname, string suffix, bit bit_8, Operand immType>
Operand immediateType = immType;
}
-def MVE_VSRIimm8 : MVE_VSxI_imm<"vsri", "8", 0b0, shr_imm8> {
+def MVE_VSRIimm8 : MVE_VSxI_imm<"vsri", "8", 0b0, shr_imm8, 0b00> {
let Inst{21-19} = 0b001;
}
-def MVE_VSRIimm16 : MVE_VSxI_imm<"vsri", "16", 0b0, shr_imm16> {
+def MVE_VSRIimm16 : MVE_VSxI_imm<"vsri", "16", 0b0, shr_imm16, 0b01> {
let Inst{21-20} = 0b01;
}
-def MVE_VSRIimm32 : MVE_VSxI_imm<"vsri", "32", 0b0, shr_imm32> {
+def MVE_VSRIimm32 : MVE_VSxI_imm<"vsri", "32", 0b0, shr_imm32, 0b10> {
let Inst{21} = 0b1;
}
-def MVE_VSLIimm8 : MVE_VSxI_imm<"vsli", "8", 0b1, imm0_7> {
+def MVE_VSLIimm8 : MVE_VSxI_imm<"vsli", "8", 0b1, imm0_7, 0b00> {
let Inst{21-19} = 0b001;
}
-def MVE_VSLIimm16 : MVE_VSxI_imm<"vsli", "16", 0b1, imm0_15> {
+def MVE_VSLIimm16 : MVE_VSxI_imm<"vsli", "16", 0b1, imm0_15, 0b01> {
let Inst{21-20} = 0b01;
}
-def MVE_VSLIimm32 : MVE_VSxI_imm<"vsli", "32", 0b1,imm0_31> {
+def MVE_VSLIimm32 : MVE_VSxI_imm<"vsli", "32", 0b1,imm0_31, 0b10> {
let Inst{21} = 0b1;
}
@@ -3263,7 +3273,7 @@ multiclass MVE_VSxI_patterns<MVE_VSxI_imm inst, string name,
def : Pat<(VTI.Vec !setdagop(inparams, unpred_int)),
(VTI.Vec outparams)>;
def : Pat<(VTI.Vec !con(inparams, (pred_int (VTI.Pred VCCR:$pred)))),
- (VTI.Vec !con(outparams, (? ARMVCCThen, VCCR:$pred)))>;
+ (VTI.Vec !con(outparams, (? ARMVCCThen, VCCR:$pred, zero_reg)))>;
}
defm : MVE_VSxI_patterns<MVE_VSLIimm8, "vsli", MVE_v16i8>;
@@ -3276,7 +3286,7 @@ defm : MVE_VSxI_patterns<MVE_VSRIimm32, "vsri", MVE_v4i32>;
class MVE_VQSHL_imm<MVEVectorVTInfo VTI_, Operand immType>
: MVE_shift_with_imm<"vqshl", VTI_.Suffix, (outs MQPR:$Qd),
(ins MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm",
- vpred_r, ""> {
+ vpred_r, "", VTI_.Size> {
bits<6> imm;
let Inst{28} = VTI_.Unsigned;
@@ -3316,7 +3326,7 @@ let unpred_int = int_arm_mve_vqshl_imm,
class MVE_VQSHLU_imm<MVEVectorVTInfo VTI_, Operand immType>
: MVE_shift_with_imm<"vqshlu", VTI_.Suffix, (outs MQPR:$Qd),
(ins MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm",
- vpred_r, ""> {
+ vpred_r, "", VTI_.Size> {
bits<6> imm;
let Inst{28} = 0b1;
@@ -3346,7 +3356,7 @@ let unpred_int = int_arm_mve_vqshlu_imm,
class MVE_VRSHR_imm<MVEVectorVTInfo VTI_, Operand immType>
: MVE_shift_with_imm<"vrshr", VTI_.Suffix, (outs MQPR:$Qd),
(ins MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm",
- vpred_r, ""> {
+ vpred_r, "", VTI_.Size> {
bits<6> imm;
let Inst{28} = VTI_.Unsigned;
@@ -3400,7 +3410,7 @@ multiclass MVE_shift_imm_patterns<MVE_shift_with_imm inst> {
(inst.VTI.Vec MQPR:$inactive)))),
(inst.VTI.Vec (inst (inst.VTI.Vec MQPR:$src),
inst.immediateType:$imm,
- ARMVCCThen, (inst.VTI.Pred VCCR:$mask),
+ ARMVCCThen, (inst.VTI.Pred VCCR:$mask), zero_reg,
(inst.VTI.Vec MQPR:$inactive)))>;
}
@@ -3420,10 +3430,10 @@ defm : MVE_shift_imm_patterns<MVE_VRSHR_immu16>;
defm : MVE_shift_imm_patterns<MVE_VRSHR_imms32>;
defm : MVE_shift_imm_patterns<MVE_VRSHR_immu32>;
-class MVE_VSHR_imm<string suffix, dag imm>
+class MVE_VSHR_imm<string suffix, dag imm, bits<2> vecsize>
: MVE_shift_with_imm<"vshr", suffix, (outs MQPR:$Qd),
!con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
- vpred_r, ""> {
+ vpred_r, "", vecsize> {
bits<6> imm;
let Inst{25-24} = 0b11;
@@ -3431,40 +3441,40 @@ class MVE_VSHR_imm<string suffix, dag imm>
let Inst{10-8} = 0b000;
}
-def MVE_VSHR_imms8 : MVE_VSHR_imm<"s8", (ins shr_imm8:$imm)> {
+def MVE_VSHR_imms8 : MVE_VSHR_imm<"s8", (ins shr_imm8:$imm), 0b00> {
let Inst{28} = 0b0;
let Inst{21-19} = 0b001;
}
-def MVE_VSHR_immu8 : MVE_VSHR_imm<"u8", (ins shr_imm8:$imm)> {
+def MVE_VSHR_immu8 : MVE_VSHR_imm<"u8", (ins shr_imm8:$imm), 0b00> {
let Inst{28} = 0b1;
let Inst{21-19} = 0b001;
}
-def MVE_VSHR_imms16 : MVE_VSHR_imm<"s16", (ins shr_imm16:$imm)> {
+def MVE_VSHR_imms16 : MVE_VSHR_imm<"s16", (ins shr_imm16:$imm), 0b01> {
let Inst{28} = 0b0;
let Inst{21-20} = 0b01;
}
-def MVE_VSHR_immu16 : MVE_VSHR_imm<"u16", (ins shr_imm16:$imm)> {
+def MVE_VSHR_immu16 : MVE_VSHR_imm<"u16", (ins shr_imm16:$imm), 0b01> {
let Inst{28} = 0b1;
let Inst{21-20} = 0b01;
}
-def MVE_VSHR_imms32 : MVE_VSHR_imm<"s32", (ins shr_imm32:$imm)> {
+def MVE_VSHR_imms32 : MVE_VSHR_imm<"s32", (ins shr_imm32:$imm), 0b10> {
let Inst{28} = 0b0;
let Inst{21} = 0b1;
}
-def MVE_VSHR_immu32 : MVE_VSHR_imm<"u32", (ins shr_imm32:$imm)> {
+def MVE_VSHR_immu32 : MVE_VSHR_imm<"u32", (ins shr_imm32:$imm), 0b10> {
let Inst{28} = 0b1;
let Inst{21} = 0b1;
}
-class MVE_VSHL_imm<string suffix, dag imm>
+class MVE_VSHL_imm<string suffix, dag imm, bits<2> vecsize>
: MVE_shift_with_imm<"vshl", suffix, (outs MQPR:$Qd),
!con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
- vpred_r, ""> {
+ vpred_r, "", vecsize> {
bits<6> imm;
let Inst{28} = 0b0;
@@ -3473,15 +3483,15 @@ class MVE_VSHL_imm<string suffix, dag imm>
let Inst{10-8} = 0b101;
}
-def MVE_VSHL_immi8 : MVE_VSHL_imm<"i8", (ins imm0_7:$imm)> {
+def MVE_VSHL_immi8 : MVE_VSHL_imm<"i8", (ins imm0_7:$imm), 0b00> {
let Inst{21-19} = 0b001;
}
-def MVE_VSHL_immi16 : MVE_VSHL_imm<"i16", (ins imm0_15:$imm)> {
+def MVE_VSHL_immi16 : MVE_VSHL_imm<"i16", (ins imm0_15:$imm), 0b01> {
let Inst{21-20} = 0b01;
}
-def MVE_VSHL_immi32 : MVE_VSHL_imm<"i32", (ins imm0_31:$imm)> {
+def MVE_VSHL_immi32 : MVE_VSHL_imm<"i32", (ins imm0_31:$imm), 0b10> {
let Inst{21} = 0b1;
}
@@ -3497,7 +3507,7 @@ multiclass MVE_immediate_shift_patterns_inner<
(pred_int (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))),
(VTI.Vec (inst (VTI.Vec MQPR:$src), imm_operand_type:$imm,
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
}
@@ -3525,8 +3535,8 @@ let Predicates = [HasMVEInt] in {
// start of MVE Floating Point instructions
class MVE_float<string iname, string suffix, dag oops, dag iops, string ops,
- vpred_ops vpred, string cstr, list<dag> pattern=[]>
- : MVE_f<oops, iops, NoItinerary, iname, suffix, ops, vpred, cstr, pattern> {
+ vpred_ops vpred, string cstr, bits<2> vecsize, list<dag> pattern=[]>
+ : MVE_f<oops, iops, NoItinerary, iname, suffix, ops, vpred, cstr, vecsize, pattern> {
bits<4> Qm;
let Inst{12} = 0b0;
@@ -3539,7 +3549,7 @@ class MVE_float<string iname, string suffix, dag oops, dag iops, string ops,
class MVE_VRINT<string rmode, bits<3> op, string suffix, bits<2> size,
list<dag> pattern=[]>
: MVE_float<!strconcat("vrint", rmode), suffix, (outs MQPR:$Qd),
- (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> {
+ (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", size, pattern> {
bits<4> Qd;
let Inst{28} = 0b1;
@@ -3568,7 +3578,7 @@ multiclass MVE_VRINT_m<MVEVectorVTInfo VTI, string suffix, bits<3> opcode,
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$val), (VTI.Pred VCCR:$pred),
(VTI.Vec MQPR:$inactive))),
(VTI.Vec (Inst (VTI.Vec MQPR:$val), ARMVCCThen,
- (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>;
+ (VTI.Pred VCCR:$pred), zero_reg, (VTI.Vec MQPR:$inactive)))>;
}
}
@@ -3586,16 +3596,16 @@ defm MVE_VRINTf32 : MVE_VRINT_ops<MVE_v4f32>;
class MVEFloatArithNeon<string iname, string suffix, bit size,
dag oops, dag iops, string ops,
- vpred_ops vpred, string cstr, list<dag> pattern=[]>
- : MVE_float<iname, suffix, oops, iops, ops, vpred, cstr, pattern> {
+ vpred_ops vpred, string cstr, bits<2> vecsize, list<dag> pattern=[]>
+ : MVE_float<iname, suffix, oops, iops, ops, vpred, cstr, vecsize, pattern> {
let Inst{20} = size;
let Inst{16} = 0b0;
}
-class MVE_VMUL_fp<string iname, string suffix, bit size, list<dag> pattern=[]>
- : MVEFloatArithNeon<iname, suffix, size, (outs MQPR:$Qd),
+class MVE_VMUL_fp<string iname, string suffix, bits<2> size, list<dag> pattern=[]>
+ : MVEFloatArithNeon<iname, suffix, size{0}, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm", vpred_r, "",
- pattern> {
+ size, pattern> {
bits<4> Qd;
bits<4> Qn;
@@ -3611,9 +3621,9 @@ class MVE_VMUL_fp<string iname, string suffix, bit size, list<dag> pattern=[]>
let validForTailPredication = 1;
}
-multiclass MVE_VMULT_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
+multiclass MVE_VMULT_fp_m<string iname, MVEVectorVTInfo VTI,
SDNode Op, Intrinsic PredInt> {
- def "" : MVE_VMUL_fp<iname, VTI.Suffix, VTI.Size{0}>;
+ def "" : MVE_VMUL_fp<iname, VTI.Suffix, VTI.Size>;
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEFloat] in {
@@ -3622,15 +3632,15 @@ multiclass MVE_VMULT_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
}
multiclass MVE_VMUL_fp_m<MVEVectorVTInfo VTI>
- : MVE_VMULT_fp_m<"vmul", 0, VTI, fmul, int_arm_mve_mul_predicated>;
+ : MVE_VMULT_fp_m<"vmul", VTI, fmul, int_arm_mve_mul_predicated>;
defm MVE_VMULf32 : MVE_VMUL_fp_m<MVE_v4f32>;
defm MVE_VMULf16 : MVE_VMUL_fp_m<MVE_v8f16>;
-class MVE_VCMLA<string suffix, bit size>
- : MVEFloatArithNeon<"vcmla", suffix, size, (outs MQPR:$Qd),
+class MVE_VCMLA<string suffix, bits<2> size>
+ : MVEFloatArithNeon<"vcmla", suffix, size{1}, (outs MQPR:$Qd),
(ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot),
- "$Qd, $Qn, $Qm, $rot", vpred_n, "$Qd = $Qd_src", []> {
+ "$Qd, $Qn, $Qm, $rot", vpred_n, "$Qd = $Qd_src", size, []> {
bits<4> Qd;
bits<4> Qn;
bits<2> rot;
@@ -3647,8 +3657,8 @@ class MVE_VCMLA<string suffix, bit size>
let Inst{4} = 0b0;
}
-multiclass MVE_VCMLA_m<MVEVectorVTInfo VTI, bit size> {
- def "" : MVE_VCMLA<VTI.Suffix, size>;
+multiclass MVE_VCMLA_m<MVEVectorVTInfo VTI> {
+ def "" : MVE_VCMLA<VTI.Suffix, VTI.Size>;
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEFloat] in {
@@ -3665,21 +3675,21 @@ multiclass MVE_VCMLA_m<MVEVectorVTInfo VTI, bit size> {
(VTI.Pred VCCR:$mask))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), (VTI.Vec MQPR:$Qn),
(VTI.Vec MQPR:$Qm), imm:$rot,
- ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg))>;
}
}
-defm MVE_VCMLAf16 : MVE_VCMLA_m<MVE_v8f16, 0b0>;
-defm MVE_VCMLAf32 : MVE_VCMLA_m<MVE_v4f32, 0b1>;
+defm MVE_VCMLAf16 : MVE_VCMLA_m<MVE_v8f16>;
+defm MVE_VCMLAf32 : MVE_VCMLA_m<MVE_v4f32>;
-class MVE_VADDSUBFMA_fp<string iname, string suffix, bit size, bit bit_4,
+class MVE_VADDSUBFMA_fp<string iname, string suffix, bits<2> size, bit bit_4,
bit bit_8, bit bit_21, dag iops=(ins),
vpred_ops vpred=vpred_r, string cstr="",
list<dag> pattern=[]>
- : MVEFloatArithNeon<iname, suffix, size, (outs MQPR:$Qd),
+ : MVEFloatArithNeon<iname, suffix, size{0}, (outs MQPR:$Qd),
!con(iops, (ins MQPR:$Qn, MQPR:$Qm)), "$Qd, $Qn, $Qm",
- vpred, cstr, pattern> {
+ vpred, cstr, size, pattern> {
bits<4> Qd;
bits<4> Qn;
@@ -3697,7 +3707,7 @@ class MVE_VADDSUBFMA_fp<string iname, string suffix, bit size, bit bit_4,
}
multiclass MVE_VFMA_fp_multi<string iname, bit fms, MVEVectorVTInfo VTI> {
- def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0b1, 0b0, fms,
+ def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size, 0b1, 0b0, fms,
(ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
defvar Inst = !cast<Instruction>(NAME);
defvar pred_int = int_arm_mve_fma_predicated;
@@ -3713,20 +3723,20 @@ multiclass MVE_VFMA_fp_multi<string iname, bit fms, MVEVectorVTInfo VTI> {
def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred),
(VTI.Vec (fma (fneg m1), m2, add)),
add)),
- (Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
+ (Inst $add, $m1, $m2, ARMVCCThen, $pred, zero_reg)>;
def : Pat<(VTI.Vec (pred_int (fneg m1), m2, add, pred)),
- (Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
+ (Inst $add, $m1, $m2, ARMVCCThen, $pred, zero_reg)>;
def : Pat<(VTI.Vec (pred_int m1, (fneg m2), add, pred)),
- (Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
+ (Inst $add, $m1, $m2, ARMVCCThen, $pred, zero_reg)>;
} else {
def : Pat<(VTI.Vec (fma m1, m2, add)),
(Inst $add, $m1, $m2)>;
def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred),
(VTI.Vec (fma m1, m2, add)),
add)),
- (Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
+ (Inst $add, $m1, $m2, ARMVCCThen, $pred, zero_reg)>;
def : Pat<(VTI.Vec (pred_int m1, m2, add, pred)),
- (Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
+ (Inst $add, $m1, $m2, ARMVCCThen, $pred, zero_reg)>;
}
}
}
@@ -3738,7 +3748,7 @@ defm MVE_VFMSf16 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v8f16>;
multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
SDNode Op, Intrinsic PredInt> {
- def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0, 1, bit_21> {
+ def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size, 0, 1, bit_21> {
let validForTailPredication = 1;
}
defvar Inst = !cast<Instruction>(NAME);
@@ -3759,10 +3769,10 @@ defm MVE_VADDf16 : MVE_VADD_fp_m<MVE_v8f16>;
defm MVE_VSUBf32 : MVE_VSUB_fp_m<MVE_v4f32>;
defm MVE_VSUBf16 : MVE_VSUB_fp_m<MVE_v8f16>;
-class MVE_VCADD<string suffix, bit size, string cstr="">
- : MVEFloatArithNeon<"vcadd", suffix, size, (outs MQPR:$Qd),
+class MVE_VCADD<string suffix, bits<2> size, string cstr="">
+ : MVEFloatArithNeon<"vcadd", suffix, size{1}, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot),
- "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, []> {
+ "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, size, []> {
bits<4> Qd;
bits<4> Qn;
bit rot;
@@ -3780,8 +3790,8 @@ class MVE_VCADD<string suffix, bit size, string cstr="">
let Inst{4} = 0b0;
}
-multiclass MVE_VCADD_m<MVEVectorVTInfo VTI, bit size, string cstr=""> {
- def "" : MVE_VCADD<VTI.Suffix, size, cstr>;
+multiclass MVE_VCADD_m<MVEVectorVTInfo VTI, string cstr=""> {
+ def "" : MVE_VCADD<VTI.Suffix, VTI.Size, cstr>;
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEFloat] in {
@@ -3795,18 +3805,18 @@ multiclass MVE_VCADD_m<MVEVectorVTInfo VTI, bit size, string cstr=""> {
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
(VTI.Pred VCCR:$mask))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
- imm:$rot, ARMVCCThen, (VTI.Pred VCCR:$mask),
+ imm:$rot, ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
}
}
-defm MVE_VCADDf16 : MVE_VCADD_m<MVE_v8f16, 0b0>;
-defm MVE_VCADDf32 : MVE_VCADD_m<MVE_v4f32, 0b1, "@earlyclobber $Qd">;
+defm MVE_VCADDf16 : MVE_VCADD_m<MVE_v8f16>;
+defm MVE_VCADDf32 : MVE_VCADD_m<MVE_v4f32, "@earlyclobber $Qd">;
-class MVE_VABD_fp<string suffix, bit size>
+class MVE_VABD_fp<string suffix, bits<2> size>
: MVE_float<"vabd", suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm),
- "$Qd, $Qn, $Qm", vpred_r, ""> {
+ "$Qd, $Qn, $Qm", vpred_r, "", size> {
bits<4> Qd;
bits<4> Qn;
@@ -3814,7 +3824,7 @@ class MVE_VABD_fp<string suffix, bit size>
let Inst{25-23} = 0b110;
let Inst{22} = Qd{3};
let Inst{21} = 0b1;
- let Inst{20} = size;
+ let Inst{20} = size{0};
let Inst{19-17} = Qn{2-0};
let Inst{16} = 0b0;
let Inst{15-13} = Qd{2-0};
@@ -3826,7 +3836,7 @@ class MVE_VABD_fp<string suffix, bit size>
multiclass MVE_VABDT_fp_m<MVEVectorVTInfo VTI,
Intrinsic unpred_int, Intrinsic pred_int> {
- def "" : MVE_VABD_fp<VTI.Suffix, VTI.Size{0}>;
+ def "" : MVE_VABD_fp<VTI.Suffix, VTI.Size>;
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEFloat] in {
@@ -3837,7 +3847,7 @@ multiclass MVE_VABDT_fp_m<MVEVectorVTInfo VTI,
(i32 0), (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
}
}
@@ -3846,7 +3856,7 @@ multiclass MVE_VABD_fp_m<MVEVectorVTInfo VTI>
: MVE_VABDT_fp_m<VTI, int_arm_mve_vabd, int_arm_mve_abd_predicated>;
defm MVE_VABDf32 : MVE_VABD_fp_m<MVE_v4f32>;
-defm MVE_VABDf16 : MVE_VABD_fp_m<MVE_v8f16>;
+defm MVE_VABDf16 : MVE_VABD_fp_m<MVE_v8f16>;
let Predicates = [HasMVEFloat] in {
def : Pat<(v8f16 (fabs (fsub (v8f16 MQPR:$Qm), (v8f16 MQPR:$Qn)))),
@@ -3859,7 +3869,7 @@ class MVE_VCVT_fix<string suffix, bit fsi, bit U, bit op,
Operand imm_operand_type>
: MVE_float<"vcvt", suffix,
(outs MQPR:$Qd), (ins MQPR:$Qm, imm_operand_type:$imm6),
- "$Qd, $Qm, $imm6", vpred_r, "", []> {
+ "$Qd, $Qm, $imm6", vpred_r, "", !if(fsi, 0b10, 0b01), []> {
bits<4> Qd;
bits<6> imm6;
@@ -3913,7 +3923,7 @@ multiclass MVE_VCVT_fix_patterns<Instruction Inst, bit U, MVEVectorVTInfo DestVT
imm:$scale,
(DestVTI.Pred VCCR:$mask))),
(DestVTI.Vec (Inst (SrcVTI.Vec MQPR:$Qm), imm:$scale,
- ARMVCCThen, (DestVTI.Pred VCCR:$mask),
+ ARMVCCThen, (DestVTI.Pred VCCR:$mask), zero_reg,
(DestVTI.Vec MQPR:$inactive)))>;
}
}
@@ -3942,7 +3952,7 @@ defm MVE_VCVTu32f32_fix : MVE_VCVT_fix_f32_m<0b1, 0b1, MVE_v4u32, MVE_v4f32>;
class MVE_VCVT_fp_int_anpm<string suffix, bits<2> size, bit op, string anpm,
bits<2> rm, list<dag> pattern=[]>
: MVE_float<!strconcat("vcvt", anpm), suffix, (outs MQPR:$Qd),
- (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> {
+ (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", size, pattern> {
bits<4> Qd;
let Inst{28} = 0b1;
@@ -3976,7 +3986,7 @@ multiclass MVE_VCVT_fp_int_anpm_inner<MVEVectorVTInfo Int, MVEVectorVTInfo Flt,
def : Pat<(Int.Vec (PredIntr (i32 Int.Unsigned), (Int.Vec MQPR:$inactive),
(Flt.Vec MQPR:$in), (Flt.Pred VCCR:$pred))),
(Int.Vec (Inst (Flt.Vec MQPR:$in), ARMVCCThen,
- (Flt.Pred VCCR:$pred), (Int.Vec MQPR:$inactive)))>;
+ (Flt.Pred VCCR:$pred), zero_reg, (Int.Vec MQPR:$inactive)))>;
}
}
@@ -3999,7 +4009,7 @@ defm MVE_VCVTu32f32 : MVE_VCVT_fp_int_anpm_outer<MVE_v4u32, MVE_v4f32>;
class MVE_VCVT_fp_int<string suffix, bits<2> size, bit toint, bit unsigned,
list<dag> pattern=[]>
: MVE_float<"vcvt", suffix, (outs MQPR:$Qd),
- (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> {
+ (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", size, pattern> {
bits<4> Qd;
let Inst{28} = 0b1;
@@ -4032,7 +4042,7 @@ multiclass MVE_VCVT_fp_int_m<MVEVectorVTInfo Dest, MVEVectorVTInfo Src,
(Src.Vec MQPR:$src), (i32 Unsigned),
(Src.Pred VCCR:$mask), (Dest.Vec MQPR:$inactive))),
(Dest.Vec (Inst (Src.Vec MQPR:$src), ARMVCCThen,
- (Src.Pred VCCR:$mask),
+ (Src.Pred VCCR:$mask), zero_reg,
(Dest.Vec MQPR:$inactive)))>;
}
}
@@ -4048,10 +4058,21 @@ defm MVE_VCVTf16u16n : MVE_VCVT_fp_int_m<MVE_v8f16, MVE_v8u16, uint_to_fp>;
defm MVE_VCVTf32s32n : MVE_VCVT_fp_int_m<MVE_v4f32, MVE_v4s32, sint_to_fp>;
defm MVE_VCVTf32u32n : MVE_VCVT_fp_int_m<MVE_v4f32, MVE_v4u32, uint_to_fp>;
+let Predicates = [HasMVEFloat] in {
+ def : Pat<(v4i32 (fp_to_sint_sat v4f32:$src, i32)),
+ (MVE_VCVTs32f32z v4f32:$src)>;
+ def : Pat<(v4i32 (fp_to_uint_sat v4f32:$src, i32)),
+ (MVE_VCVTu32f32z v4f32:$src)>;
+ def : Pat<(v8i16 (fp_to_sint_sat v8f16:$src, i16)),
+ (MVE_VCVTs16f16z v8f16:$src)>;
+ def : Pat<(v8i16 (fp_to_uint_sat v8f16:$src, i16)),
+ (MVE_VCVTu16f16z v8f16:$src)>;
+}
+
class MVE_VABSNEG_fp<string iname, string suffix, bits<2> size, bit negate,
list<dag> pattern=[]>
: MVE_float<iname, suffix, (outs MQPR:$Qd),
- (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> {
+ (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", size, pattern> {
bits<4> Qd;
let Inst{28} = 0b1;
@@ -4077,7 +4098,7 @@ multiclass MVE_VABSNEG_fp_m<string iname, SDNode unpred_op, Intrinsic pred_int,
(VTI.Vec (Inst $v))>;
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v), (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive))),
- (VTI.Vec (Inst $v, ARMVCCThen, $mask, $inactive))>;
+ (VTI.Vec (Inst $v, ARMVCCThen, $mask, zero_reg, $inactive))>;
}
}
@@ -4090,15 +4111,15 @@ defm MVE_VNEGf16 : MVE_VABSNEG_fp_m<"vneg", fneg, int_arm_mve_neg_predicated,
defm MVE_VNEGf32 : MVE_VABSNEG_fp_m<"vneg", fneg, int_arm_mve_neg_predicated,
MVE_v4f32, 1>;
-class MVE_VMAXMINNMA<string iname, string suffix, bit size, bit bit_12,
+class MVE_VMAXMINNMA<string iname, string suffix, bits<2> size, bit bit_12,
list<dag> pattern=[]>
: MVE_f<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
NoItinerary, iname, suffix, "$Qd, $Qm", vpred_n, "$Qd = $Qd_src",
- pattern> {
+ size, pattern> {
bits<4> Qd;
bits<4> Qm;
- let Inst{28} = size;
+ let Inst{28} = size{0};
let Inst{25-23} = 0b100;
let Inst{22} = Qd{3};
let Inst{21-16} = 0b111111;
@@ -4111,12 +4132,13 @@ class MVE_VMAXMINNMA<string iname, string suffix, bit size, bit bit_12,
let Inst{0} = 0b1;
let isCommutable = 1;
+ let validForTailPredication = 1;
}
multiclass MVE_VMAXMINNMA_m<string iname, MVEVectorVTInfo VTI,
SDNode unpred_op, Intrinsic pred_int,
bit bit_12> {
- def "" : MVE_VMAXMINNMA<iname, VTI.Suffix, VTI.Size{0}, bit_12>;
+ def "" : MVE_VMAXMINNMA<iname, VTI.Suffix, VTI.Size, bit_12>;
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEInt] in {
@@ -4129,7 +4151,7 @@ multiclass MVE_VMAXMINNMA_m<string iname, MVEVectorVTInfo VTI,
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm),
(VTI.Pred VCCR:$mask))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm),
- ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg))>;
}
}
@@ -4150,9 +4172,9 @@ defm MVE_VMINNMAf16 : MVE_VMINNMA<MVE_v8f16, 0b1>;
// start of MVE compares
class MVE_VCMPqq<string suffix, bit bit_28, bits<2> bits_21_20,
- VCMPPredicateOperand predtype, list<dag> pattern=[]>
+ VCMPPredicateOperand predtype, bits<2> vecsize, list<dag> pattern=[]>
: MVE_p<(outs VCCR:$P0), (ins MQPR:$Qn, MQPR:$Qm, predtype:$fc),
- NoItinerary, "vcmp", suffix, "$fc, $Qn, $Qm", vpred_n, "", pattern> {
+ NoItinerary, "vcmp", suffix, "$fc, $Qn, $Qm", vpred_n, "", vecsize, pattern> {
// Base class for comparing two vector registers
bits<3> fc;
bits<4> Qn;
@@ -4187,24 +4209,24 @@ class MVE_VCMPqq<string suffix, bit bit_28, bits<2> bits_21_20,
}
class MVE_VCMPqqf<string suffix, bit size>
- : MVE_VCMPqq<suffix, size, 0b11, pred_basic_fp> {
+ : MVE_VCMPqq<suffix, size, 0b11, pred_basic_fp, !if(size, 0b01, 0b10)> {
let Predicates = [HasMVEFloat];
}
class MVE_VCMPqqi<string suffix, bits<2> size>
- : MVE_VCMPqq<suffix, 0b1, size, pred_basic_i> {
+ : MVE_VCMPqq<suffix, 0b1, size, pred_basic_i, size> {
let Inst{12} = 0b0;
let Inst{0} = 0b0;
}
class MVE_VCMPqqu<string suffix, bits<2> size>
- : MVE_VCMPqq<suffix, 0b1, size, pred_basic_u> {
+ : MVE_VCMPqq<suffix, 0b1, size, pred_basic_u, size> {
let Inst{12} = 0b0;
let Inst{0} = 0b1;
}
class MVE_VCMPqqs<string suffix, bits<2> size>
- : MVE_VCMPqq<suffix, 0b1, size, pred_basic_s> {
+ : MVE_VCMPqq<suffix, 0b1, size, pred_basic_s, size> {
let Inst{12} = 0b1;
}
@@ -4224,9 +4246,9 @@ def MVE_VCMPs16 : MVE_VCMPqqs<"s16", 0b01>;
def MVE_VCMPs32 : MVE_VCMPqqs<"s32", 0b10>;
class MVE_VCMPqr<string suffix, bit bit_28, bits<2> bits_21_20,
- VCMPPredicateOperand predtype, list<dag> pattern=[]>
+ VCMPPredicateOperand predtype, bits<2> vecsize, list<dag> pattern=[]>
: MVE_p<(outs VCCR:$P0), (ins MQPR:$Qn, GPRwithZR:$Rm, predtype:$fc),
- NoItinerary, "vcmp", suffix, "$fc, $Qn, $Rm", vpred_n, "", pattern> {
+ NoItinerary, "vcmp", suffix, "$fc, $Qn, $Rm", vpred_n, "", vecsize, pattern> {
// Base class for comparing a vector register with a scalar
bits<3> fc;
bits<4> Qn;
@@ -4252,24 +4274,24 @@ class MVE_VCMPqr<string suffix, bit bit_28, bits<2> bits_21_20,
}
class MVE_VCMPqrf<string suffix, bit size>
- : MVE_VCMPqr<suffix, size, 0b11, pred_basic_fp> {
+ : MVE_VCMPqr<suffix, size, 0b11, pred_basic_fp, !if(size, 0b01, 0b10)> {
let Predicates = [HasMVEFloat];
}
class MVE_VCMPqri<string suffix, bits<2> size>
- : MVE_VCMPqr<suffix, 0b1, size, pred_basic_i> {
+ : MVE_VCMPqr<suffix, 0b1, size, pred_basic_i, size> {
let Inst{12} = 0b0;
let Inst{5} = 0b0;
}
class MVE_VCMPqru<string suffix, bits<2> size>
- : MVE_VCMPqr<suffix, 0b1, size, pred_basic_u> {
+ : MVE_VCMPqr<suffix, 0b1, size, pred_basic_u, size> {
let Inst{12} = 0b0;
let Inst{5} = 0b1;
}
class MVE_VCMPqrs<string suffix, bits<2> size>
- : MVE_VCMPqr<suffix, 0b1, size, pred_basic_s> {
+ : MVE_VCMPqr<suffix, 0b1, size, pred_basic_s, size> {
let Inst{12} = 0b1;
}
@@ -4297,11 +4319,11 @@ multiclass unpred_vcmp_z<string suffix, PatLeaf fc> {
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc))>;
def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmpz (v16i8 MQPR:$v1), fc)))),
- (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1, zero_reg))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8i16 MQPR:$v1), fc)))),
- (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1, zero_reg))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4i32 MQPR:$v1), fc)))),
- (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1, zero_reg))>;
}
multiclass unpred_vcmp_r<string suffix, PatLeaf fc> {
@@ -4320,18 +4342,18 @@ multiclass unpred_vcmp_r<string suffix, PatLeaf fc> {
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 rGPR:$v2), fc))>;
def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc)))),
- (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1, zero_reg))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc)))),
- (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1, zero_reg))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)))),
- (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1, zero_reg))>;
def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup rGPR:$v2)), fc)))),
- (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1, zero_reg))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup rGPR:$v2)), fc)))),
- (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1, zero_reg))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup rGPR:$v2)), fc)))),
- (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1, zero_reg))>;
}
multiclass unpred_vcmpf_z<PatLeaf fc> {
@@ -4341,9 +4363,9 @@ multiclass unpred_vcmpf_z<PatLeaf fc> {
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8f16 MQPR:$v1), fc)))),
- (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1, zero_reg))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4f32 MQPR:$v1), fc)))),
- (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1, zero_reg))>;
}
multiclass unpred_vcmpf_r<PatLeaf fc> {
@@ -4358,14 +4380,14 @@ multiclass unpred_vcmpf_r<PatLeaf fc> {
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 rGPR:$v2), fc))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)))),
- (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1, zero_reg))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)))),
- (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1, zero_reg))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup rGPR:$v2)), fc)))),
- (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1, zero_reg))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup rGPR:$v2)), fc)))),
- (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1, zero_reg))>;
}
let Predicates = [HasMVEInt] in {
@@ -4477,9 +4499,9 @@ let Predicates = [HasMVEInt] in {
class MVE_qDest_qSrc<string iname, string suffix, dag oops, dag iops,
string ops, vpred_ops vpred, string cstr,
- list<dag> pattern=[]>
+ bits<2> vecsize, list<dag> pattern=[]>
: MVE_p<oops, iops, NoItinerary, iname, suffix,
- ops, vpred, cstr, pattern> {
+ ops, vpred, cstr, vecsize, pattern> {
bits<4> Qd;
bits<4> Qm;
@@ -4494,10 +4516,11 @@ class MVE_qDest_qSrc<string iname, string suffix, dag oops, dag iops,
}
class MVE_VQxDMLxDH<string iname, bit exch, bit round, bit subtract,
- string suffix, bits<2> size, string cstr="", list<dag> pattern=[]>
+ string suffix, bits<2> size, string cstr="",
+ list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
- vpred_n, "$Qd = $Qd_src"#cstr, pattern> {
+ vpred_n, "$Qd = $Qd_src"#cstr, size, pattern> {
bits<4> Qn;
let Inst{28} = subtract;
@@ -4528,7 +4551,7 @@ multiclass MVE_VQxDMLxDH_p<string iname, bit exch, bit round, bit subtract,
(? (VTI.Pred VCCR:$pred)))),
(VTI.Vec (Inst (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b),
(VTI.Vec MQPR:$c),
- ARMVCCThen, (VTI.Pred VCCR:$pred)))>;
+ ARMVCCThen, (VTI.Pred VCCR:$pred), zero_reg))>;
}
multiclass MVE_VQxDMLxDH_multi<string iname, bit exch,
@@ -4547,14 +4570,15 @@ defm MVE_VQDMLSDHX : MVE_VQxDMLxDH_multi<"vqdmlsdhx", 0b1, 0b0, 0b1>;
defm MVE_VQRDMLSDH : MVE_VQxDMLxDH_multi<"vqrdmlsdh", 0b0, 0b1, 0b1>;
defm MVE_VQRDMLSDHX : MVE_VQxDMLxDH_multi<"vqrdmlsdhx", 0b1, 0b1, 0b1>;
-class MVE_VCMUL<string iname, string suffix, bit size, string cstr="">
+class MVE_VCMUL<string iname, string suffix, bits<2> size, string cstr="">
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot),
- "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, []> {
+ "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, size,
+ []> {
bits<4> Qn;
bits<2> rot;
- let Inst{28} = size;
+ let Inst{28} = size{1};
let Inst{21-20} = 0b11;
let Inst{19-17} = Qn{2-0};
let Inst{16} = 0b0;
@@ -4567,8 +4591,8 @@ class MVE_VCMUL<string iname, string suffix, bit size, string cstr="">
}
multiclass MVE_VCMUL_m<string iname, MVEVectorVTInfo VTI,
- bit size, string cstr=""> {
- def "" : MVE_VCMUL<iname, VTI.Suffix, size, cstr>;
+ string cstr=""> {
+ def "" : MVE_VCMUL<iname, VTI.Suffix, VTI.Size, cstr>;
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEFloat] in {
@@ -4582,20 +4606,20 @@ multiclass MVE_VCMUL_m<string iname, MVEVectorVTInfo VTI,
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
(VTI.Pred VCCR:$mask))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
- imm:$rot, ARMVCCThen, (VTI.Pred VCCR:$mask),
+ imm:$rot, ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
}
}
-defm MVE_VCMULf16 : MVE_VCMUL_m<"vcmul", MVE_v8f16, 0b0>;
-defm MVE_VCMULf32 : MVE_VCMUL_m<"vcmul", MVE_v4f32, 0b1, "@earlyclobber $Qd">;
+defm MVE_VCMULf16 : MVE_VCMUL_m<"vcmul", MVE_v8f16>;
+defm MVE_VCMULf32 : MVE_VCMUL_m<"vcmul", MVE_v4f32, "@earlyclobber $Qd">;
class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20,
- bit T, string cstr, list<dag> pattern=[]>
+ bit T, string cstr, bits<2> vecsize, list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
- vpred_r, cstr, pattern> {
+ vpred_r, cstr, vecsize, pattern> {
bits<4> Qd;
bits<4> Qn;
bits<4> Qm;
@@ -4614,9 +4638,9 @@ class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20,
multiclass MVE_VMULL_m<MVEVectorVTInfo VTI,
SDPatternOperator unpred_op, Intrinsic pred_int,
- bit Top, string cstr=""> {
+ bit Top, bits<2> vecsize, string cstr=""> {
def "" : MVE_VMULL<"vmull" # !if(Top, "t", "b"), VTI.Suffix, VTI.Unsigned,
- VTI.Size, Top, cstr>;
+ VTI.Size, Top, cstr, vecsize>;
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEInt] in {
@@ -4634,7 +4658,7 @@ multiclass MVE_VMULL_m<MVEVectorVTInfo VTI,
uflag, (? (i32 Top), (VTI.DblPred VCCR:$mask),
(VTI.DblVec MQPR:$inactive)))),
(VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.DblPred VCCR:$mask),
+ ARMVCCThen, (VTI.DblPred VCCR:$mask), zero_reg,
(VTI.DblVec MQPR:$inactive)))>;
}
}
@@ -4643,43 +4667,43 @@ multiclass MVE_VMULL_m<MVEVectorVTInfo VTI,
// the unsigned bit switches to encoding the size.
defm MVE_VMULLBs8 : MVE_VMULL_m<MVE_v16s8, int_arm_mve_vmull,
- int_arm_mve_mull_int_predicated, 0b0>;
+ int_arm_mve_mull_int_predicated, 0b0, 0b01>;
defm MVE_VMULLTs8 : MVE_VMULL_m<MVE_v16s8, int_arm_mve_vmull,
- int_arm_mve_mull_int_predicated, 0b1>;
+ int_arm_mve_mull_int_predicated, 0b1, 0b01>;
defm MVE_VMULLBs16 : MVE_VMULL_m<MVE_v8s16, int_arm_mve_vmull,
- int_arm_mve_mull_int_predicated, 0b0>;
+ int_arm_mve_mull_int_predicated, 0b0, 0b10>;
defm MVE_VMULLTs16 : MVE_VMULL_m<MVE_v8s16, int_arm_mve_vmull,
- int_arm_mve_mull_int_predicated, 0b1>;
+ int_arm_mve_mull_int_predicated, 0b1, 0b10>;
defm MVE_VMULLBs32 : MVE_VMULL_m<MVE_v4s32, int_arm_mve_vmull,
- int_arm_mve_mull_int_predicated, 0b0,
+ int_arm_mve_mull_int_predicated, 0b0, 0b11,
"@earlyclobber $Qd">;
defm MVE_VMULLTs32 : MVE_VMULL_m<MVE_v4s32, int_arm_mve_vmull,
- int_arm_mve_mull_int_predicated, 0b1,
+ int_arm_mve_mull_int_predicated, 0b1, 0b11,
"@earlyclobber $Qd">;
defm MVE_VMULLBu8 : MVE_VMULL_m<MVE_v16u8, int_arm_mve_vmull,
- int_arm_mve_mull_int_predicated, 0b0>;
+ int_arm_mve_mull_int_predicated, 0b0, 0b01>;
defm MVE_VMULLTu8 : MVE_VMULL_m<MVE_v16u8, int_arm_mve_vmull,
- int_arm_mve_mull_int_predicated, 0b1>;
+ int_arm_mve_mull_int_predicated, 0b1, 0b01>;
defm MVE_VMULLBu16 : MVE_VMULL_m<MVE_v8u16, int_arm_mve_vmull,
- int_arm_mve_mull_int_predicated, 0b0>;
+ int_arm_mve_mull_int_predicated, 0b0, 0b10>;
defm MVE_VMULLTu16 : MVE_VMULL_m<MVE_v8u16, int_arm_mve_vmull,
- int_arm_mve_mull_int_predicated, 0b1>;
+ int_arm_mve_mull_int_predicated, 0b1, 0b10>;
defm MVE_VMULLBu32 : MVE_VMULL_m<MVE_v4u32, int_arm_mve_vmull,
- int_arm_mve_mull_int_predicated, 0b0,
+ int_arm_mve_mull_int_predicated, 0b0, 0b11,
"@earlyclobber $Qd">;
defm MVE_VMULLTu32 : MVE_VMULL_m<MVE_v4u32, int_arm_mve_vmull,
- int_arm_mve_mull_int_predicated, 0b1,
+ int_arm_mve_mull_int_predicated, 0b1, 0b11,
"@earlyclobber $Qd">;
defm MVE_VMULLBp8 : MVE_VMULL_m<MVE_v16p8, int_arm_mve_vmull_poly,
- int_arm_mve_mull_poly_predicated, 0b0>;
+ int_arm_mve_mull_poly_predicated, 0b0, 0b01>;
defm MVE_VMULLTp8 : MVE_VMULL_m<MVE_v16p8, int_arm_mve_vmull_poly,
- int_arm_mve_mull_poly_predicated, 0b1>;
+ int_arm_mve_mull_poly_predicated, 0b1, 0b01>;
defm MVE_VMULLBp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly,
- int_arm_mve_mull_poly_predicated, 0b0>;
+ int_arm_mve_mull_poly_predicated, 0b0, 0b10>;
defm MVE_VMULLTp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly,
- int_arm_mve_mull_poly_predicated, 0b1>;
+ int_arm_mve_mull_poly_predicated, 0b1, 0b10>;
let Predicates = [HasMVEInt] in {
def : Pat<(v2i64 (ARMvmulls (v4i32 MQPR:$src1), (v4i32 MQPR:$src2))),
@@ -4729,7 +4753,7 @@ class MVE_VxMULH<string iname, string suffix, bit U, bits<2> size, bit round,
list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
- vpred_r, "", pattern> {
+ vpred_r, "", size, pattern> {
bits<4> Qn;
let Inst{28} = U;
@@ -4759,7 +4783,7 @@ multiclass MVE_VxMULH_m<string iname, MVEVectorVTInfo VTI, SDNode unpred_op,
(i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
}
@@ -4794,7 +4818,7 @@ class MVE_VxMOVxN<string iname, string suffix, bit bit_28, bit bit_17,
bits<2> size, bit T, list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qd_src, MQPR:$Qm), "$Qd, $Qm",
- vpred_n, "$Qd = $Qd_src", pattern> {
+ vpred_n, "$Qd = $Qd_src", !if(size, 0b10, 0b01), pattern> {
let Inst{28} = bit_28;
let Inst{21-20} = 0b11;
@@ -4854,7 +4878,7 @@ multiclass MVE_VMOVN_p<Instruction Inst, bit top,
(InVTI.Pred VCCR:$pred))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src),
(InVTI.Vec MQPR:$Qm),
- ARMVCCThen, (InVTI.Pred VCCR:$pred)))>;
+ ARMVCCThen, (InVTI.Pred VCCR:$pred), zero_reg))>;
}
defm : MVE_VMOVN_p<MVE_VMOVNi32bh, 0, MVE_v8i16, MVE_v4i32>;
@@ -4876,7 +4900,7 @@ multiclass MVE_VQMOVN_p<Instruction Inst, bit outU, bit inU, bit top,
(InVTI.Pred VCCR:$pred))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src),
(InVTI.Vec MQPR:$Qm),
- ARMVCCThen, (InVTI.Pred VCCR:$pred)))>;
+ ARMVCCThen, (InVTI.Pred VCCR:$pred), zero_reg))>;
}
defm : MVE_VQMOVN_p<MVE_VQMOVNs32bh, 0, 0, 0, MVE_v8i16, MVE_v4i32>;
@@ -4939,7 +4963,7 @@ class MVE_VCVT_ff<string iname, string suffix, bit op, bit T,
dag iops_extra, vpred_ops vpred, string cstr>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
!con(iops_extra, (ins MQPR:$Qm)), "$Qd, $Qm",
- vpred, cstr, []> {
+ vpred, cstr, 0b10, []> {
let Inst{28} = op;
let Inst{21-16} = 0b111111;
let Inst{12} = T;
@@ -4968,7 +4992,7 @@ multiclass MVE_VCVT_f2h_m<string iname, int half> {
(v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half),
(v4i1 VCCR:$mask))),
(v8f16 (Inst (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm),
- ARMVCCThen, (v4i1 VCCR:$mask)))>;
+ ARMVCCThen, (v4i1 VCCR:$mask), zero_reg))>;
def : Pat<(v8f16 (MVEvcvtn (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half))),
(v8f16 (Inst (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm)))>;
@@ -4986,7 +5010,7 @@ multiclass MVE_VCVT_h2f_m<string iname, int half> {
(v4f32 MQPR:$inactive), (v8f16 MQPR:$Qm), (i32 half),
(v4i1 VCCR:$mask))),
(v4f32 (Inst (v8f16 MQPR:$Qm), ARMVCCThen,
- (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive)))>;
+ (v4i1 VCCR:$mask), zero_reg, (v4f32 MQPR:$inactive)))>;
def : Pat<(v4f32 (MVEvcvtl (v8f16 MQPR:$Qm), (i32 half))),
(v4f32 (Inst (v8f16 MQPR:$Qm)))>;
@@ -5002,7 +5026,7 @@ class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve,
string cstr="">
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot),
- "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, []> {
+ "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, size, []> {
bits<4> Qn;
bit rot;
@@ -5032,7 +5056,7 @@ multiclass MVE_VxCADD_m<string iname, MVEVectorVTInfo VTI,
(VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
(VTI.Pred VCCR:$mask))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
- imm:$rot, ARMVCCThen, (VTI.Pred VCCR:$mask),
+ imm:$rot, ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
}
@@ -5050,7 +5074,7 @@ class MVE_VADCSBC<string iname, bit I, bit subtract,
dag carryin, list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, "i32", (outs MQPR:$Qd, cl_FPSCR_NZCV:$carryout),
!con((ins MQPR:$Qn, MQPR:$Qm), carryin),
- "$Qd, $Qn, $Qm", vpred_r, "", pattern> {
+ "$Qd, $Qn, $Qm", vpred_r, "", 0b10, pattern> {
bits<4> Qn;
let Inst{28} = subtract;
@@ -5077,7 +5101,7 @@ class MVE_VQDMULL<string iname, string suffix, bit size, bit T,
string cstr="", list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
- vpred_r, cstr, pattern> {
+ vpred_r, cstr, !if(size, 0b10, 0b01), pattern> {
bits<4> Qn;
let Inst{28} = size;
@@ -5108,7 +5132,7 @@ multiclass MVE_VQDMULL_m<string iname, MVEVectorVTInfo VTI, bit size, bit T,
(i32 T), (VTI.DblPred VCCR:$mask),
(VTI.DblVec MQPR:$inactive))),
(VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.DblPred VCCR:$mask),
+ ARMVCCThen, (VTI.DblPred VCCR:$mask), zero_reg,
(VTI.DblVec MQPR:$inactive)))>;
}
}
@@ -5125,10 +5149,9 @@ defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<MVE_v4s32, 0b1, "@earlyclobber $Qd">;
// start of mve_qDest_rSrc
-class MVE_qr_base<dag oops, dag iops, InstrItinClass itin, string iname,
- string suffix, string ops, vpred_ops vpred, string cstr,
- list<dag> pattern=[]>
- : MVE_p<oops, iops, NoItinerary, iname, suffix, ops, vpred, cstr, pattern> {
+class MVE_qr_base<dag oops, dag iops, string iname, string suffix, string ops,
+ vpred_ops vpred, string cstr, bits<2> vecsize, list<dag> pattern=[]>
+ : MVE_p<oops, iops, NoItinerary, iname, suffix, ops, vpred, cstr, vecsize, pattern> {
bits<4> Qd;
bits<4> Qn;
bits<4> Rm;
@@ -5144,19 +5167,19 @@ class MVE_qr_base<dag oops, dag iops, InstrItinClass itin, string iname,
let Inst{3-0} = Rm{3-0};
}
-class MVE_qDest_rSrc<string iname, string suffix, string cstr="", list<dag> pattern=[]>
+class MVE_qDest_rSrc<string iname, string suffix, string cstr="", bits<2> vecsize, list<dag> pattern=[]>
: MVE_qr_base<(outs MQPR:$Qd), (ins MQPR:$Qn, rGPR:$Rm),
- NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_r, cstr,
- pattern>;
+ iname, suffix, "$Qd, $Qn, $Rm", vpred_r, cstr,
+ vecsize, pattern>;
-class MVE_qDestSrc_rSrc<string iname, string suffix, list<dag> pattern=[]>
+class MVE_qDestSrc_rSrc<string iname, string suffix, bits<2> vecsize, list<dag> pattern=[]>
: MVE_qr_base<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qn, rGPR:$Rm),
- NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_n, "$Qd = $Qd_src",
- pattern>;
+ iname, suffix, "$Qd, $Qn, $Rm", vpred_n, "$Qd = $Qd_src",
+ vecsize, pattern>;
-class MVE_qDest_single_rSrc<string iname, string suffix, list<dag> pattern=[]>
+class MVE_qDest_single_rSrc<string iname, string suffix, bits<2> vecsize, list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qd_src, rGPR:$Rm), NoItinerary, iname,
- suffix, "$Qd, $Rm", vpred_n, "$Qd = $Qd_src", pattern> {
+ suffix, "$Qd, $Rm", vpred_n, "$Qd = $Qd_src", vecsize, pattern> {
bits<4> Qd;
bits<4> Rm;
@@ -5187,14 +5210,14 @@ multiclass MVE_vec_scalar_int_pat_m<Instruction inst, MVEVectorVTInfo VTI,
(pred_op (VTI.Pred VCCR:$mask),
(VTI.Vec MQPR:$inactive)))),
(VTI.Vec (inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
}
}
class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size,
bit bit_5, bit bit_12, bit bit_16, bit bit_28>
- : MVE_qDest_rSrc<iname, suffix, ""> {
+ : MVE_qDest_rSrc<iname, suffix, "", size> {
let Inst{28} = bit_28;
let Inst{21-20} = size;
@@ -5262,7 +5285,7 @@ defm MVE_VQSUB_qr_u32 : MVE_VQSUB_qr_m<MVE_v4u32, usubsat>;
class MVE_VQDMULL_qr<string iname, string suffix, bit size,
bit T, string cstr="", list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, cstr, pattern> {
+ : MVE_qDest_rSrc<iname, suffix, cstr, !if(size, 0b10, 0b01), pattern> {
let Inst{28} = size;
let Inst{21-20} = 0b11;
@@ -5293,7 +5316,7 @@ multiclass MVE_VQDMULL_qr_m<string iname, MVEVectorVTInfo VTI, bit size,
(VTI.DblPred VCCR:$mask),
(VTI.DblVec MQPR:$inactive))),
(VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val),
- ARMVCCThen, (VTI.DblPred VCCR:$mask),
+ ARMVCCThen, (VTI.DblPred VCCR:$mask), zero_reg,
(VTI.DblVec MQPR:$inactive)))>;
}
}
@@ -5307,12 +5330,12 @@ defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<MVE_v8s16, 0b0>;
defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<MVE_v4s32, 0b1, "@earlyclobber $Qd">;
class MVE_VxADDSUB_qr<string iname, string suffix,
- bit bit_28, bits<2> bits_21_20, bit subtract,
- list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, "", pattern> {
+ bit bit_28, bits<2> size, bit subtract,
+ bits<2> vecsize, list<dag> pattern=[]>
+ : MVE_qDest_rSrc<iname, suffix, "", vecsize, pattern> {
let Inst{28} = bit_28;
- let Inst{21-20} = bits_21_20;
+ let Inst{21-20} = size;
let Inst{16} = 0b0;
let Inst{12} = subtract;
let Inst{8} = 0b1;
@@ -5322,7 +5345,7 @@ class MVE_VxADDSUB_qr<string iname, string suffix,
multiclass MVE_VHADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract,
Intrinsic unpred_int, Intrinsic pred_int> {
- def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, subtract>;
+ def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, subtract, VTI.Size>;
defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME),
VTI, unpred_int, pred_int, 1, 1>;
}
@@ -5351,7 +5374,7 @@ defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m<MVE_v4u32>;
multiclass MVE_VADDSUB_qr_f<string iname, MVEVectorVTInfo VTI, bit subtract,
SDNode Op, Intrinsic PredInt> {
- def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, subtract>;
+ def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, subtract, VTI.Size>;
defm : MVE_TwoOpPatternDup<VTI, Op, PredInt, (? ),
!cast<Instruction>(NAME)>;
}
@@ -5370,7 +5393,7 @@ let Predicates = [HasMVEFloat] in {
class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size,
bit bit_7, bit bit_17, list<dag> pattern=[]>
- : MVE_qDest_single_rSrc<iname, suffix, pattern> {
+ : MVE_qDest_single_rSrc<iname, suffix, size, pattern> {
let Inst{28} = U;
let Inst{25-23} = 0b100;
@@ -5398,7 +5421,7 @@ multiclass MVE_VxSHL_qr_p<string iname, MVEVectorVTInfo VTI, bit q, bit r> {
(i32 q), (i32 r), (i32 VTI.Unsigned),
(VTI.Pred VCCR:$mask))),
(VTI.Vec (Inst (VTI.Vec MQPR:$in), (i32 rGPR:$sh),
- ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg))>;
}
multiclass MVE_VxSHL_qr_types<string iname, bit bit_7, bit bit_17> {
@@ -5432,7 +5455,7 @@ let Predicates = [HasMVEInt] in {
}
class MVE_VBRSR<string iname, string suffix, bits<2> size, list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, "", pattern> {
+ : MVE_qDest_rSrc<iname, suffix, "", size, pattern> {
let Inst{28} = 0b1;
let Inst{21-20} = size;
@@ -5457,7 +5480,7 @@ multiclass MVE_VBRSR_pat_m<MVEVectorVTInfo VTI, Instruction Inst> {
(VTI.Vec MQPR:$Qn), (i32 rGPR:$Rm),
(VTI.Pred VCCR:$mask))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (i32 rGPR:$Rm),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.Pred VCCR:$mask), zero_reg,
(VTI.Vec MQPR:$inactive)))>;
}
@@ -5482,7 +5505,7 @@ let Predicates = [HasMVEFloat] in {
}
class MVE_VMUL_qr_int<string iname, string suffix, bits<2> size>
- : MVE_qDest_rSrc<iname, suffix, ""> {
+ : MVE_qDest_rSrc<iname, suffix, "", size> {
let Inst{28} = 0b0;
let Inst{21-20} = size;
@@ -5506,11 +5529,11 @@ defm MVE_VMUL_qr_i16 : MVE_VMUL_qr_int_m<MVE_v8i16>;
defm MVE_VMUL_qr_i32 : MVE_VMUL_qr_int_m<MVE_v4i32>;
class MVE_VxxMUL_qr<string iname, string suffix,
- bit bit_28, bits<2> bits_21_20, list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, "", pattern> {
+ bit bit_28, bits<2> size, bits<2> vecsize, list<dag> pattern=[]>
+ : MVE_qDest_rSrc<iname, suffix, "", vecsize, pattern> {
let Inst{28} = bit_28;
- let Inst{21-20} = bits_21_20;
+ let Inst{21-20} = size;
let Inst{16} = 0b1;
let Inst{12} = 0b0;
let Inst{8} = 0b0;
@@ -5520,7 +5543,7 @@ class MVE_VxxMUL_qr<string iname, string suffix,
multiclass MVE_VxxMUL_qr_m<string iname, MVEVectorVTInfo VTI, bit bit_28,
PatFrag Op, Intrinsic int_unpred, Intrinsic int_pred> {
- def "" : MVE_VxxMUL_qr<iname, VTI.Suffix, bit_28, VTI.Size>;
+ def "" : MVE_VxxMUL_qr<iname, VTI.Suffix, bit_28, VTI.Size, VTI.Size>;
let Predicates = [HasMVEInt] in {
defm : MVE_TwoOpPatternDup<VTI, Op, int_pred, (? ), !cast<Instruction>(NAME)>;
@@ -5546,7 +5569,7 @@ defm MVE_VQRDMULH_qr_s32 : MVE_VQRDMULH_qr_m<MVE_v4s32>;
multiclass MVE_VxxMUL_qr_f_m<MVEVectorVTInfo VTI> {
let validForTailPredication = 1 in
- def "" : MVE_VxxMUL_qr<"vmul", VTI.Suffix, VTI.Size{0}, 0b11>;
+ def "" : MVE_VxxMUL_qr<"vmul", VTI.Suffix, VTI.Size{0}, 0b11, VTI.Size>;
defm : MVE_TwoOpPatternDup<VTI, fmul, int_arm_mve_mul_predicated, (? ),
!cast<Instruction>(NAME)>;
}
@@ -5558,8 +5581,8 @@ let Predicates = [HasMVEFloat] in {
class MVE_VFMAMLA_qr<string iname, string suffix,
bit bit_28, bits<2> bits_21_20, bit S,
- list<dag> pattern=[]>
- : MVE_qDestSrc_rSrc<iname, suffix, pattern> {
+ bits<2> vecsize, list<dag> pattern=[]>
+ : MVE_qDestSrc_rSrc<iname, suffix, vecsize, pattern> {
let Inst{28} = bit_28;
let Inst{21-20} = bits_21_20;
@@ -5574,7 +5597,7 @@ class MVE_VFMAMLA_qr<string iname, string suffix,
multiclass MVE_VMLA_qr_multi<string iname, MVEVectorVTInfo VTI,
bit scalar_addend> {
def "": MVE_VFMAMLA_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size,
- scalar_addend>;
+ scalar_addend, VTI.Size>;
defvar Inst = !cast<Instruction>(NAME);
defvar pred_int = !cast<Intrinsic>("int_arm_mve_" # iname # "_n_predicated");
defvar v1 = (VTI.Vec MQPR:$v1);
@@ -5596,7 +5619,7 @@ multiclass MVE_VMLA_qr_multi<string iname, MVEVectorVTInfo VTI,
}
def : Pat<(VTI.Vec (pred_int v1, v2, s, pred)),
- (VTI.Vec (Inst v1, v2, s, ARMVCCThen, pred))>;
+ (VTI.Vec (Inst v1, v2, s, ARMVCCThen, pred, zero_reg))>;
}
}
@@ -5616,7 +5639,7 @@ defm MVE_VMLAS_qr_u32 : MVE_VMLA_qr_multi<"vmlas", MVE_v4u32, 0b1>;
multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI,
bit scalar_addend> {
- def "": MVE_VFMAMLA_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, scalar_addend>;
+ def "": MVE_VFMAMLA_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, scalar_addend, VTI.Size>;
defvar Inst = !cast<Instruction>(NAME);
defvar pred_int = int_arm_mve_fma_predicated;
defvar v1 = (VTI.Vec MQPR:$v1);
@@ -5632,9 +5655,9 @@ multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI,
def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred),
(VTI.Vec (fma v1, v2, vs)),
v1)),
- (VTI.Vec (Inst v1, v2, is, ARMVCCThen, $pred))>;
+ (VTI.Vec (Inst v1, v2, is, ARMVCCThen, $pred, zero_reg))>;
def : Pat<(VTI.Vec (pred_int v1, v2, vs, pred)),
- (VTI.Vec (Inst v1, v2, is, ARMVCCThen, pred))>;
+ (VTI.Vec (Inst v1, v2, is, ARMVCCThen, pred, zero_reg))>;
} else {
def : Pat<(VTI.Vec (fma v1, vs, v2)),
(VTI.Vec (Inst v2, v1, is))>;
@@ -5643,15 +5666,15 @@ multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI,
def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred),
(VTI.Vec (fma vs, v2, v1)),
v1)),
- (VTI.Vec (Inst v1, v2, is, ARMVCCThen, $pred))>;
+ (VTI.Vec (Inst v1, v2, is, ARMVCCThen, $pred, zero_reg))>;
def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred),
(VTI.Vec (fma v2, vs, v1)),
v1)),
- (VTI.Vec (Inst v1, v2, is, ARMVCCThen, $pred))>;
+ (VTI.Vec (Inst v1, v2, is, ARMVCCThen, $pred, zero_reg))>;
def : Pat<(VTI.Vec (pred_int v1, vs, v2, pred)),
- (VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred))>;
+ (VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred, zero_reg))>;
def : Pat<(VTI.Vec (pred_int vs, v1, v2, pred)),
- (VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred))>;
+ (VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred, zero_reg))>;
}
}
}
@@ -5665,7 +5688,7 @@ let Predicates = [HasMVEFloat] in {
class MVE_VQDMLAH_qr<string iname, string suffix, bit U, bits<2> size,
bit bit_5, bit bit_12, list<dag> pattern=[]>
- : MVE_qDestSrc_rSrc<iname, suffix, pattern> {
+ : MVE_qDestSrc_rSrc<iname, suffix, size, pattern> {
let Inst{28} = U;
let Inst{21-20} = size;
@@ -5691,7 +5714,7 @@ multiclass MVE_VQDMLAH_qr_multi<string iname, MVEVectorVTInfo VTI,
(i32 rGPR:$s), (VTI.Pred VCCR:$pred))),
(VTI.Vec (Inst (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2),
(i32 rGPR:$s), ARMVCCThen,
- (VTI.Pred VCCR:$pred)))>;
+ (VTI.Pred VCCR:$pred), zero_reg))>;
}
}
@@ -5710,7 +5733,7 @@ class MVE_VxDUP<string iname, string suffix, bits<2> size, bit bit_12,
ValueType VT, SDPatternOperator vxdup>
: MVE_p<(outs MQPR:$Qd, tGPREven:$Rn),
(ins tGPREven:$Rn_src, MVE_VIDUP_imm:$imm), NoItinerary,
- iname, suffix, "$Qd, $Rn, $imm", vpred_r, "$Rn = $Rn_src",
+ iname, suffix, "$Qd, $Rn, $imm", vpred_r, "$Rn = $Rn_src", size,
[(set (VT MQPR:$Qd), (i32 tGPREven:$Rn),
(vxdup (i32 tGPREven:$Rn_src), (i32 imm:$imm)))]> {
bits<4> Qd;
@@ -5745,7 +5768,7 @@ class MVE_VxWDUP<string iname, string suffix, bits<2> size, bit bit_12,
list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd, tGPREven:$Rn),
(ins tGPREven:$Rn_src, tGPROdd:$Rm, MVE_VIDUP_imm:$imm), NoItinerary,
- iname, suffix, "$Qd, $Rn, $Rm, $imm", vpred_r, "$Rn = $Rn_src",
+ iname, suffix, "$Qd, $Rn, $Rm, $imm", vpred_r, "$Rn = $Rn_src", size,
pattern> {
bits<4> Qd;
bits<4> Rm;
@@ -5780,7 +5803,7 @@ def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>;
let isReMaterializable = 1 in
class MVE_VCTPInst<string suffix, bits<2> size, list<dag> pattern=[]>
: MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix,
- "$Rn", vpred_n, "", pattern> {
+ "$Rn", vpred_n, "", size, pattern> {
bits<4> Rn;
let Inst{28-27} = 0b10;
@@ -5804,7 +5827,7 @@ multiclass MVE_VCTP<MVEVectorVTInfo VTI, Intrinsic intr> {
def : Pat<(intr rGPR:$Rn),
(VTI.Pred (Inst rGPR:$Rn))>;
def : Pat<(and (intr rGPR:$Rn), (VTI.Pred VCCR:$mask)),
- (VTI.Pred (Inst rGPR:$Rn, ARMVCCThen, VCCR:$mask))>;
+ (VTI.Pred (Inst rGPR:$Rn, ARMVCCThen, VCCR:$mask, zero_reg))>;
}
}
@@ -5837,6 +5860,7 @@ class MVE_VMOV_64bit<dag oops, dag iops, bit to_qreg, string ops, string cstr>
let Inst{4} = idx2;
let Inst{3-0} = Rt{3-0};
+ let VecSize = 0b10;
let hasSideEffects = 0;
}
@@ -5925,7 +5949,7 @@ class MVE_vldst24_base<bit writeback, bit fourregs, bits<2> stage, bits<2> size,
bit load, dag Oops, dag loadIops, dag wbIops,
string iname, string ops,
string cstr, list<dag> pattern=[]>
- : MVE_MI<Oops, !con(loadIops, wbIops), NoItinerary, iname, ops, cstr, pattern> {
+ : MVE_MI<Oops, !con(loadIops, wbIops), NoItinerary, iname, ops, cstr, size, pattern> {
bits<4> VQd;
bits<4> Rn;
@@ -6037,13 +6061,13 @@ multiclass MVE_vst24_patterns<int lanesize, ValueType VT> {
def : Pat<(int_arm_mve_vst2q i32:$addr,
(VT MQPR:$v0), (VT MQPR:$v1), (i32 stage)),
(!cast<Instruction>("MVE_VST2"#stage#"_"#lanesize)
- (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
+ (REG_SEQUENCE MQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
t2_addr_offset_none:$addr)>;
foreach stage = [0,1] in
def : Pat<(i32 (MVEVST2UPD i32:$addr, (i32 32),
(VT MQPR:$v0), (VT MQPR:$v1), (i32 stage))),
(i32 (!cast<Instruction>("MVE_VST2"#stage#"_"#lanesize#_wb)
- (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
+ (REG_SEQUENCE MQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
t2_addr_offset_none:$addr))>;
foreach stage = [0,1,2,3] in
@@ -6051,16 +6075,16 @@ multiclass MVE_vst24_patterns<int lanesize, ValueType VT> {
(VT MQPR:$v0), (VT MQPR:$v1),
(VT MQPR:$v2), (VT MQPR:$v3), (i32 stage)),
(!cast<Instruction>("MVE_VST4"#stage#"_"#lanesize)
- (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
- VT:$v2, qsub_2, VT:$v3, qsub_3),
+ (REG_SEQUENCE MQQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
+ VT:$v2, qsub_2, VT:$v3, qsub_3),
t2_addr_offset_none:$addr)>;
foreach stage = [0,1,2,3] in
def : Pat<(i32 (MVEVST4UPD i32:$addr, (i32 64),
(VT MQPR:$v0), (VT MQPR:$v1),
(VT MQPR:$v2), (VT MQPR:$v3), (i32 stage))),
(i32 (!cast<Instruction>("MVE_VST4"#stage#"_"#lanesize#_wb)
- (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
- VT:$v2, qsub_2, VT:$v3, qsub_3),
+ (REG_SEQUENCE MQQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
+ VT:$v2, qsub_2, VT:$v3, qsub_3),
t2_addr_offset_none:$addr))>;
}
defm : MVE_vst24_patterns<8, v16i8>;
@@ -6123,8 +6147,8 @@ def MVE_memD: MVE_memsz<0b11, 3, ?, "d", ["", "u", "s", "f"]>;
// input values.
class MVE_VLDRSTR_base<MVE_ldst_direction dir, bit U, bit P, bit W, bit opc,
dag oops, dag iops, string asm, string suffix,
- string ops, string cstr, list<dag> pattern=[]>
- : MVE_p<oops, iops, NoItinerary, asm, suffix, ops, vpred_n, cstr, pattern> {
+ string ops, string cstr, bits<2> vecsize, list<dag> pattern=[]>
+ : MVE_p<oops, iops, NoItinerary, asm, suffix, ops, vpred_n, cstr, vecsize, pattern> {
bits<3> Qd;
let Inst{28} = U;
@@ -6160,12 +6184,14 @@ class MVE_VLDRSTR_base<MVE_ldst_direction dir, bit U, bit P, bit W, bit opc,
class MVE_VLDRSTR_cs<MVE_ldst_direction dir, MVE_memsz memsz, bit P, bit W,
dag oops, dag iops, string asm, string suffix,
IndexMode im, string ops, string cstr>
- : MVE_VLDRSTR_base<dir, 0, P, W, 1, oops, iops, asm, suffix, ops, cstr> {
+ : MVE_VLDRSTR_base<dir, 0, P, W, 1, oops, iops, asm, suffix, ops, cstr, memsz.encoding> {
bits<12> addr;
let Inst{23} = addr{7};
let Inst{19-16} = addr{11-8};
let Inst{8-7} = memsz.encoding;
let Inst{6-0} = addr{6-0};
+
+ let IM = im;
}
// Contiguous, widening/narrowing
@@ -6173,7 +6199,7 @@ class MVE_VLDRSTR_cw<MVE_ldst_direction dir, MVE_memsz memsz, bit U,
bit P, bit W, bits<2> size, dag oops, dag iops,
string asm, string suffix, IndexMode im,
string ops, string cstr>
- : MVE_VLDRSTR_base<dir, U, P, W, 0, oops, iops, asm, suffix, ops, cstr> {
+ : MVE_VLDRSTR_base<dir, U, P, W, 0, oops, iops, asm, suffix, ops, cstr, size> {
bits<11> addr;
let Inst{23} = addr{7};
let Inst{19} = memsz.encoding{0}; // enough to tell 16- from 32-bit
@@ -6290,7 +6316,7 @@ class MVE_VLDRSTR_rq<MVE_ldst_direction dir, MVE_memsz memsz, bit U,
bits<2> size, bit os, string asm, string suffix, int shift>
: MVE_VLDRSTR_base<dir, U, 0b0, 0b0, 0, dir.Oops,
!con(dir.Iops, (ins mve_addr_rq_shift<shift>:$addr)),
- asm, suffix, "$Qd, $addr", dir.cstr> {
+ asm, suffix, "$Qd, $addr", dir.cstr, size> {
bits<7> addr;
let Inst{23} = 0b1;
let Inst{19-16} = addr{6-3};
@@ -6336,9 +6362,9 @@ multiclass MVE_VLDR_rq_w<MVE_memsz memsz, list<MVEVectorVTInfo> VTIs> {
def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, memsz.shift, UnsignedFlag)),
(VTI.Vec (Inst GPR:$base, MQPR:$offsets))>;
def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, 0, UnsignedFlag, (VTI.Pred VCCR:$pred))),
- (VTI.Vec (InstU GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred))>;
+ (VTI.Vec (InstU GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred, zero_reg))>;
def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, memsz.shift, UnsignedFlag, (VTI.Pred VCCR:$pred))),
- (VTI.Vec (Inst GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred))>;
+ (VTI.Vec (Inst GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred, zero_reg))>;
}
}
multiclass MVE_VLDR_rq_b<list<MVEVectorVTInfo> VTIs> {
@@ -6350,7 +6376,7 @@ multiclass MVE_VLDR_rq_b<list<MVEVectorVTInfo> VTIs> {
def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), 8, 0, VTI.Unsigned)),
(VTI.Vec (Inst GPR:$base, MQPR:$offsets))>;
def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), 8, 0, VTI.Unsigned, (VTI.Pred VCCR:$pred))),
- (VTI.Vec (Inst GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred))>;
+ (VTI.Vec (Inst GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred, zero_reg))>;
}
}
multiclass MVE_VSTR_rq_w<MVE_memsz memsz, list<MVEVectorVTInfo> VTIs> {
@@ -6365,9 +6391,9 @@ multiclass MVE_VSTR_rq_w<MVE_memsz memsz, list<MVEVectorVTInfo> VTIs> {
def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, memsz.shift),
(Inst MQPR:$data, GPR:$base, MQPR:$offsets)>;
def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, 0, (VTI.Pred VCCR:$pred)),
- (InstU MQPR:$data, GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred)>;
+ (InstU MQPR:$data, GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred, zero_reg)>;
def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, memsz.shift, (VTI.Pred VCCR:$pred)),
- (Inst MQPR:$data, GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred)>;
+ (Inst MQPR:$data, GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred, zero_reg)>;
}
}
multiclass MVE_VSTR_rq_b<list<MVEVectorVTInfo> VTIs> {
@@ -6379,7 +6405,7 @@ multiclass MVE_VSTR_rq_b<list<MVEVectorVTInfo> VTIs> {
def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), 8, 0),
(Inst MQPR:$data, GPR:$base, MQPR:$offsets)>;
def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), 8, 0, (VTI.Pred VCCR:$pred)),
- (Inst MQPR:$data, GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred)>;
+ (Inst MQPR:$data, GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred, zero_reg)>;
}
}
@@ -6423,7 +6449,7 @@ class MVE_VLDRSTR_qi<MVE_ldst_direction dir, MVE_memsz memsz, bit W, dag wbops,
string asm, string wbAsm, string suffix, string cstr = "">
: MVE_VLDRSTR_base<dir, 1, 1, W, 1, !con(wbops, dir.Oops),
!con(dir.Iops, (ins mve_addr_q_shift<memsz.shift>:$addr)),
- asm, suffix, "$Qd, $addr" # wbAsm, cstr # dir.cstr> {
+ asm, suffix, "$Qd, $addr" # wbAsm, cstr # dir.cstr, memsz.encoding> {
bits<11> addr;
let Inst{23} = addr{7};
let Inst{19-17} = addr{10-8};
@@ -6460,7 +6486,7 @@ multiclass MVE_VLDR_qi<MVE_memsz memsz, MVEVectorVTInfo AVTI,
def : Pat<(DVTI.Vec (int_arm_mve_vldr_gather_base_predicated
(AVTI.Vec MQPR:$addr), (i32 imm:$offset), (AVTI.Pred VCCR:$pred))),
(DVTI.Vec (Inst (AVTI.Vec MQPR:$addr), (i32 imm:$offset),
- ARMVCCThen, VCCR:$pred))>;
+ ARMVCCThen, VCCR:$pred, zero_reg))>;
}
}
multiclass MVE_VSTR_qi<MVE_memsz memsz, MVEVectorVTInfo AVTI,
@@ -6478,7 +6504,7 @@ multiclass MVE_VSTR_qi<MVE_memsz memsz, MVEVectorVTInfo AVTI,
def : Pat<(int_arm_mve_vstr_scatter_base_predicated
(AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data), (AVTI.Pred VCCR:$pred)),
(Inst (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr),
- (i32 imm:$offset), ARMVCCThen, VCCR:$pred)>;
+ (i32 imm:$offset), ARMVCCThen, VCCR:$pred, zero_reg)>;
def : Pat<(AVTI.Vec (int_arm_mve_vstr_scatter_base_wb
(AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data))),
(AVTI.Vec (InstPre (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr),
@@ -6486,7 +6512,7 @@ multiclass MVE_VSTR_qi<MVE_memsz memsz, MVEVectorVTInfo AVTI,
def : Pat<(AVTI.Vec (int_arm_mve_vstr_scatter_base_wb_predicated
(AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data), (AVTI.Pred VCCR:$pred))),
(AVTI.Vec (InstPre (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr),
- (i32 imm:$offset), ARMVCCThen, VCCR:$pred))>;
+ (i32 imm:$offset), ARMVCCThen, VCCR:$pred, zero_reg))>;
}
}
@@ -6532,7 +6558,7 @@ foreach suffix = memsz.suffixes in {
// end of MVE predicable load/store
class MVE_VPT<string suffix, bits<2> size, dag iops, string asm, list<dag> pattern=[]>
- : MVE_MI<(outs ), iops, NoItinerary, !strconcat("vpt", "${Mk}", ".", suffix), asm, "", pattern> {
+ : MVE_MI<(outs ), iops, NoItinerary, !strconcat("vpt", "${Mk}", ".", suffix), asm, "", size, pattern> {
bits<3> fc;
bits<4> Mk;
bits<3> Qn;
@@ -6642,7 +6668,7 @@ def MVE_VPTv16s8r : MVE_VPTt2s<"s8", 0b00>;
class MVE_VPTf<string suffix, bit size, dag iops, string asm, list<dag> pattern=[]>
: MVE_MI<(outs ), iops, NoItinerary, !strconcat("vpt", "${Mk}", ".", suffix), asm,
- "", pattern> {
+ "", !if(size, 0b01, 0b10), pattern> {
bits<3> fc;
bits<4> Mk;
bits<3> Qn;
@@ -6695,7 +6721,7 @@ def MVE_VPTv4f32r : MVE_VPTft2<"f32", 0b0>;
def MVE_VPTv8f16r : MVE_VPTft2<"f16", 0b1>;
def MVE_VPST : MVE_MI<(outs ), (ins vpt_mask:$Mk), NoItinerary,
- !strconcat("vpst", "${Mk}"), "", "", []> {
+ !strconcat("vpst", "${Mk}"), "", "", 0b00, []> {
bits<4> Mk;
let Inst{31-23} = 0b111111100;
@@ -6712,7 +6738,7 @@ def MVE_VPST : MVE_MI<(outs ), (ins vpt_mask:$Mk), NoItinerary,
}
def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary,
- "vpsel", "", "$Qd, $Qn, $Qm", vpred_n, "", []> {
+ "vpsel", "", "$Qd, $Qn, $Qm", vpred_n, "", 0b00, []> {
bits<4> Qn;
bits<4> Qd;
bits<4> Qm;
@@ -6741,71 +6767,71 @@ def : MVEInstAlias<"vpsel${vp}." # suffix # "\t$Qd, $Qn, $Qm",
let Predicates = [HasMVEInt] in {
def : Pat<(v16i8 (vselect (v16i1 VCCR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))),
- (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>;
+ (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>;
def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))),
- (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>;
+ (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>;
def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))),
- (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>;
+ (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>;
def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))),
- (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>;
+ (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>;
def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))),
- (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>;
+ (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>;
def : Pat<(v16i8 (vselect (v16i8 MQPR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))),
(v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone,
- (MVE_VCMPi8 (v16i8 MQPR:$pred), (MVE_VMOVimmi8 0), ARMCCne)))>;
+ (MVE_VCMPi8 (v16i8 MQPR:$pred), (MVE_VMOVimmi8 0), ARMCCne), zero_reg))>;
def : Pat<(v8i16 (vselect (v8i16 MQPR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))),
(v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone,
- (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), ARMCCne)))>;
+ (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), ARMCCne), zero_reg))>;
def : Pat<(v4i32 (vselect (v4i32 MQPR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))),
(v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone,
- (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), ARMCCne)))>;
+ (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), ARMCCne), zero_reg))>;
def : Pat<(v8f16 (vselect (v8i16 MQPR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))),
(v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone,
- (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), ARMCCne)))>;
+ (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), ARMCCne), zero_reg))>;
def : Pat<(v4f32 (vselect (v4i32 MQPR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))),
(v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone,
- (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), ARMCCne)))>;
+ (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), ARMCCne), zero_reg))>;
// Pred <-> Int
def : Pat<(v16i8 (zext (v16i1 VCCR:$pred))),
- (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred))>;
+ (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred, zero_reg))>;
def : Pat<(v8i16 (zext (v8i1 VCCR:$pred))),
- (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred))>;
+ (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred, zero_reg))>;
def : Pat<(v4i32 (zext (v4i1 VCCR:$pred))),
- (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred))>;
+ (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>;
def : Pat<(v16i8 (sext (v16i1 VCCR:$pred))),
- (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred))>;
+ (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred, zero_reg))>;
def : Pat<(v8i16 (sext (v8i1 VCCR:$pred))),
- (v8i16 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred))>;
+ (v8i16 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred, zero_reg))>;
def : Pat<(v4i32 (sext (v4i1 VCCR:$pred))),
- (v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred))>;
+ (v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>;
def : Pat<(v16i8 (anyext (v16i1 VCCR:$pred))),
- (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred))>;
+ (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred, zero_reg))>;
def : Pat<(v8i16 (anyext (v8i1 VCCR:$pred))),
- (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred))>;
+ (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred, zero_reg))>;
def : Pat<(v4i32 (anyext (v4i1 VCCR:$pred))),
- (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred))>;
+ (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>;
}
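A note on the constants in the Pred <-> Int patterns above: the sext patterns all reuse (MVE_VMOVimmi8 255) for the "true" operand because a vector whose every byte is 0xff reads as all-ones (-1) at any lane width, while zext/anyext splat a 1 at the matching width. A purely illustrative scalar model (not part of the patch) of the per-lane select these VPSEL patterns implement:

    #include <cstdint>

    // Each destination lane is a select on its predicate bit.
    int32_t zextLane(bool P) { return P ? 1 : 0; }   // VPSEL(vmov #1,       vmov #0)
    int32_t sextLane(bool P) { return P ? -1 : 0; }  // VPSEL(vmov.i8 #0xff, vmov #0)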
let Predicates = [HasMVEFloat] in {
// Pred <-> Float
// 112 is 1.0 in float
def : Pat<(v4f32 (uint_to_fp (v4i1 VCCR:$pred))),
- (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 112)), (v4f32 (MVE_VMOVimmi32 0)), ARMVCCNone, VCCR:$pred))>;
+ (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 112)), (v4f32 (MVE_VMOVimmi32 0)), ARMVCCNone, VCCR:$pred, zero_reg))>;
// 2620 is 1.0 in half
def : Pat<(v8f16 (uint_to_fp (v8i1 VCCR:$pred))),
- (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2620)), (v8f16 (MVE_VMOVimmi16 0)), ARMVCCNone, VCCR:$pred))>;
+ (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2620)), (v8f16 (MVE_VMOVimmi16 0)), ARMVCCNone, VCCR:$pred, zero_reg))>;
// 240 is -1.0 in float
def : Pat<(v4f32 (sint_to_fp (v4i1 VCCR:$pred))),
- (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 240)), (v4f32 (MVE_VMOVimmi32 0)), ARMVCCNone, VCCR:$pred))>;
+ (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 240)), (v4f32 (MVE_VMOVimmi32 0)), ARMVCCNone, VCCR:$pred, zero_reg))>;
// 2748 is -1.0 in half
def : Pat<(v8f16 (sint_to_fp (v8i1 VCCR:$pred))),
- (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2748)), (v8f16 (MVE_VMOVimmi16 0)), ARMVCCNone, VCCR:$pred))>;
+ (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2748)), (v8f16 (MVE_VMOVimmi16 0)), ARMVCCNone, VCCR:$pred, zero_reg))>;
def : Pat<(v4i1 (fp_to_uint (v4f32 MQPR:$v1))),
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, ARMCCne))>;
@@ -6818,7 +6844,7 @@ let Predicates = [HasMVEFloat] in {
}
def MVE_VPNOT : MVE_p<(outs VCCR:$P0), (ins VCCR:$P0_in), NoItinerary,
- "vpnot", "", "", vpred_n, "", []> {
+ "vpnot", "", "", vpred_n, "", 0b00, []> {
let Inst{31-0} = 0b11111110001100010000111101001101;
let Unpredictable{19-17} = 0b111;
let Unpredictable{12} = 0b1;
@@ -6930,6 +6956,37 @@ def MVE_LCTP : MVE_loltp_end<(outs), (ins pred:$p), "lctp${p}", ""> {
}
+// Pseudo instructions for lowering MQQPR and MQQQQPR stack spills and reloads.
+// They are equivalent to VLDMDIA/VSTMDIA with a single reg, as opposed to multiple
+// dreg subregs.
+
+let Predicates = [HasMVEInt], AM = AddrMode4 in {
+let mayStore = 1, hasSideEffects = 0 in {
+ def MQQPRStore : t2PseudoInst<(outs), (ins MQQPR:$val, GPRnopc:$ptr),
+ 4, NoItinerary, []>;
+ def MQQQQPRStore : t2PseudoInst<(outs), (ins MQQQQPR:$val, GPRnopc:$ptr),
+ 4, NoItinerary, []>;
+}
+let mayLoad = 1, hasSideEffects = 0 in {
+ def MQQPRLoad : t2PseudoInst<(outs MQQPR:$val), (ins GPRnopc:$ptr),
+ 4, NoItinerary, []>;
+ def MQQQQPRLoad : t2PseudoInst<(outs MQQQQPR:$val), (ins GPRnopc:$ptr),
+ 4, NoItinerary, []>;
+}
+}
+
+// Pseudo for lowering MVE Q register COPYs. These will usually get converted
+// to an "MVE_VORR dst, src, src", but may behave differently in tail-predicated
+// loops to ensure the whole register is copied, not a subset from a
+// tail-predicated MVE_VORR. In the event we cannot prove an MVE_VORR is valid,
+// it will become a pair of VMOVD instructions, one for each half of the Q register.
+let Predicates = [HasMVEInt], hasSideEffects = 0, isMoveReg = 1,
+ D = MVEDomain in {
+ def MQPRCopy : t2PseudoInst<(outs MQPR:$dst), (ins MQPR:$src),
+ 8, NoItinerary, []>;
+}
+
+
//===----------------------------------------------------------------------===//
// Patterns
//===----------------------------------------------------------------------===//
@@ -7142,7 +7199,7 @@ class MVE_vector_store_typed<ValueType Ty, Instruction RegImmInst,
class MVE_vector_maskedstore_typed<ValueType Ty, Instruction RegImmInst,
PatFrag StoreKind, int shift>
: Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, VCCR:$pred),
- (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, ARMVCCThen, VCCR:$pred)>;
+ (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, ARMVCCThen, VCCR:$pred, zero_reg)>;
multiclass MVE_vector_store<Instruction RegImmInst, PatFrag StoreKind,
int shift> {
@@ -7163,7 +7220,7 @@ class MVE_vector_load_typed<ValueType Ty, Instruction RegImmInst,
class MVE_vector_maskedload_typed<ValueType Ty, Instruction RegImmInst,
PatFrag LoadKind, int shift>
: Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty (ARMvmovImm (i32 0))))),
- (Ty (RegImmInst t2addrmode_imm7<shift>:$addr, ARMVCCThen, VCCR:$pred))>;
+ (Ty (RegImmInst t2addrmode_imm7<shift>:$addr, ARMVCCThen, VCCR:$pred, zero_reg))>;
multiclass MVE_vector_load<Instruction RegImmInst, PatFrag LoadKind,
int shift> {
@@ -7184,7 +7241,7 @@ class MVE_vector_offset_store_typed<ValueType Ty, Instruction Opcode,
class MVE_vector_offset_maskedstore_typed<ValueType Ty, Instruction Opcode,
PatFrag StoreKind, int shift>
: Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<shift>:$addr, VCCR:$pred),
- (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<shift>:$addr, ARMVCCThen, VCCR:$pred)>;
+ (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<shift>:$addr, ARMVCCThen, VCCR:$pred, zero_reg)>;
multiclass MVE_vector_offset_store<Instruction RegImmInst, PatFrag StoreKind,
int shift> {
@@ -7314,11 +7371,11 @@ multiclass MVEExtLoadStore<Instruction LoadSInst, Instruction LoadUInst, string
// Masked trunc stores
def : Pat<(!cast<PatFrag>("aligned_truncmaskedst"#Amble) (VT MQPR:$val), taddrmode_imm7<Shift>:$addr, VCCR:$pred),
- (!cast<Instruction>(StoreInst) MQPR:$val, taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred)>;
+ (!cast<Instruction>(StoreInst) MQPR:$val, taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred, zero_reg)>;
def : Pat<(!cast<PatFrag>("aligned_post_truncmaskedst"#Amble) (VT MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<Shift>:$addr, VCCR:$pred),
- (!cast<Instruction>(StoreInst#"_post") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<Shift>:$addr, ARMVCCThen, VCCR:$pred)>;
+ (!cast<Instruction>(StoreInst#"_post") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<Shift>:$addr, ARMVCCThen, VCCR:$pred, zero_reg)>;
def : Pat<(!cast<PatFrag>("aligned_pre_truncmaskedst"#Amble) (VT MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<Shift>:$addr, VCCR:$pred),
- (!cast<Instruction>(StoreInst#"_pre") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<Shift>:$addr, ARMVCCThen, VCCR:$pred)>;
+ (!cast<Instruction>(StoreInst#"_pre") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<Shift>:$addr, ARMVCCThen, VCCR:$pred, zero_reg)>;
// Ext loads
def : Pat<(VT (!cast<PatFrag>("aligned_extload"#Amble) taddrmode_imm7<Shift>:$addr)),
@@ -7330,11 +7387,11 @@ multiclass MVEExtLoadStore<Instruction LoadSInst, Instruction LoadUInst, string
// Masked ext loads
def : Pat<(VT (!cast<PatFrag>("aligned_extmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT (ARMvmovImm (i32 0))))),
- (VT (LoadUInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>;
+ (VT (LoadUInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred, zero_reg))>;
def : Pat<(VT (!cast<PatFrag>("aligned_sextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT (ARMvmovImm (i32 0))))),
- (VT (LoadSInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>;
+ (VT (LoadSInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred, zero_reg))>;
def : Pat<(VT (!cast<PatFrag>("aligned_zextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT (ARMvmovImm (i32 0))))),
- (VT (LoadUInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>;
+ (VT (LoadUInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred, zero_reg))>;
}
let Predicates = [HasMVEInt] in {
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index 3ca6704c17b9..aaf3280ea150 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -2735,8 +2735,11 @@ class N3VDIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
string Dt, ValueType ResTy, ValueType OpTy,
SDPatternOperator IntOp, bit Commutable>
: N3Vnp<op27_23, op21_20, op11_8, op6, op4,
- (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt,
- [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
+ (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), f, itin, OpcodeStr, Dt,
+ [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> {
+ let isCommutable = Commutable;
+}
+
class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp>
@@ -2789,19 +2792,22 @@ class N3VQIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
SDPatternOperator IntOp, bit Commutable>
: N3Vnp<op27_23, op21_20, op11_8, op6, op4,
(outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin, OpcodeStr, Dt,
- [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>;
+ [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> {
+ let isCommutable = Commutable;
+}
// Same as N3VQIntnp but with Vd as a src register.
class N3VQInt3np<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
bit op4, Format f, InstrItinClass itin, string OpcodeStr,
string Dt, ValueType ResTy, ValueType OpTy,
- SDPatternOperator IntOp, bit Commutable>
+ SDPatternOperator IntOp>
: N3Vnp<op27_23, op21_20, op11_8, op6, op4,
(outs QPR:$Vd), (ins QPR:$src, QPR:$Vn, QPR:$Vm),
f, itin, OpcodeStr, Dt,
[(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src), (OpTy QPR:$Vn),
(OpTy QPR:$Vm))))]> {
let Constraints = "$src = $Vd";
+ let isCommutable = 0;
}
class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
@@ -3118,7 +3124,10 @@ class N3VLIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
SDPatternOperator IntOp, bit Commutable>
: N3Vnp<op27_23, op21_20, op11_8, op6, op4,
(outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt,
- [(set QPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
+ [(set QPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> {
+ let isCommutable = Commutable;
+}
+
class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
string OpcodeStr, string Dt,
@@ -4041,7 +4050,7 @@ multiclass N2VShL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
}
multiclass N2VShR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- string baseOpc, SDNode OpNode> {
+ SDNode OpNode> {
// 64-bit vector types.
def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm8,
OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> {
@@ -4987,7 +4996,7 @@ class BaseN3VCP8ComplexTiedLane64<bit op4, bit s, bit q, InstrItinClass itin,
}
multiclass N3VCP8ComplexTied<bit op21, bit op4,
- string OpcodeStr, SDPatternOperator Op> {
+ string OpcodeStr> {
let Predicates = [HasNEON,HasV8_3a,HasFullFP16] in {
def v4f16 : BaseN3VCP8ComplexTied<op21, op4, 0, 0, IIC_VMACD, (outs DPR:$Vd),
(ins DPR:$src1, DPR:$Vn, DPR:$Vm, complexrotateop:$rot),
@@ -5007,7 +5016,7 @@ multiclass N3VCP8ComplexTied<bit op21, bit op4,
}
multiclass N3VCP8ComplexOdd<bit op23, bit op21, bit op4,
- string OpcodeStr, SDPatternOperator Op> {
+ string OpcodeStr> {
let Predicates = [HasNEON,HasV8_3a,HasFullFP16] in {
def v4f16 : BaseN3VCP8ComplexOdd<op23, op21, op4, 0, 0, IIC_VMACD,
(outs DPR:$Vd),
@@ -5032,8 +5041,7 @@ multiclass N3VCP8ComplexOdd<bit op23, bit op21, bit op4,
// These instructions index by pairs of lanes, so the VectorIndexes are twice
// as wide as the data types.
-multiclass N3VCP8ComplexTiedLane<bit op4, string OpcodeStr,
- SDPatternOperator Op> {
+multiclass N3VCP8ComplexTiedLane<bit op4, string OpcodeStr> {
let Predicates = [HasNEON,HasV8_3a,HasFullFP16] in {
def v4f16_indexed : BaseN3VCP8ComplexTiedLane32<op4, 0, 0, IIC_VMACD,
(outs DPR:$Vd),
@@ -5060,9 +5068,9 @@ multiclass N3VCP8ComplexTiedLane<bit op4, string OpcodeStr,
}
}
-defm VCMLA : N3VCP8ComplexTied<1, 0, "vcmla", null_frag>;
-defm VCADD : N3VCP8ComplexOdd<1, 0, 0, "vcadd", null_frag>;
-defm VCMLA : N3VCP8ComplexTiedLane<0, "vcmla", null_frag>;
+defm VCMLA : N3VCP8ComplexTied<1, 0, "vcmla">;
+defm VCADD : N3VCP8ComplexOdd<1, 0, 0, "vcadd">;
+defm VCMLA : N3VCP8ComplexTiedLane<0, "vcmla">;
let Predicates = [HasNEON,HasV8_3a,HasFullFP16] in {
def : Pat<(v4f16 (int_arm_neon_vcadd_rot90 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm))),
@@ -5991,9 +5999,9 @@ def : Pat<(v2i64 (ARMvshlu (v2i64 QPR:$Dn), (v2i64 QPR:$Dm))),
defm VSHLi : N2VShL_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", ARMvshlImm>;
// VSHR : Vector Shift Right (Immediate)
-defm VSHRs : N2VShR_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", "VSHRs",
+defm VSHRs : N2VShR_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s",
ARMvshrsImm>;
-defm VSHRu : N2VShR_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", "VSHRu",
+defm VSHRu : N2VShR_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u",
ARMvshruImm>;
// VSHLL : Vector Shift Left Long
@@ -6061,9 +6069,9 @@ defm VRSHLu : N3VInt_QHSDSh<1, 0, 0b0101, 0, N3RegVShFrm,
IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
"vrshl", "u", int_arm_neon_vrshiftu>;
// VRSHR : Vector Rounding Shift Right
-defm VRSHRs : N2VShR_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", "VRSHRs",
+defm VRSHRs : N2VShR_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s",
NEONvrshrsImm>;
-defm VRSHRu : N2VShR_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", "VRSHRu",
+defm VRSHRu : N2VShR_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u",
NEONvrshruImm>;
// VRSHRN : Vector Rounding Shift Right and Narrow
@@ -6438,6 +6446,18 @@ def : Pat<(ARMvgetlaneu (v8i16 QPR:$src), imm:$lane),
(VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane))>;
+def : Pat<(ARMvgetlaneu (v8f16 QPR:$src), imm:$lane),
+ (VGETLNu16 (v4f16 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane))>;
+def : Pat<(ARMvgetlaneu (v4f16 DPR:$src), imm:$lane),
+ (VGETLNu16 (v4f16 DPR:$src), imm:$lane)>;
+def : Pat<(ARMvgetlaneu (v8bf16 QPR:$src), imm:$lane),
+ (VGETLNu16 (v4bf16 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane))>;
+def : Pat<(ARMvgetlaneu (v4bf16 DPR:$src), imm:$lane),
+ (VGETLNu16 (v4bf16 DPR:$src), imm:$lane)>;
}
def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
(VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src,
@@ -7074,7 +7094,7 @@ class VEXTd<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
class VEXTq<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
: N3V<0,1,0b11,{?,?,?,?},1,0, (outs QPR:$Vd),
- (ins QPR:$Vn, QPR:$Vm, imm0_15:$index), NVExtFrm,
+ (ins QPR:$Vn, QPR:$Vm, immTy:$index), NVExtFrm,
IIC_VEXTQ, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "",
[(set QPR:$Vd, (Ty (NEONvext (Ty QPR:$Vn),
(Ty QPR:$Vm), imm:$index)))]> {
@@ -7337,7 +7357,7 @@ let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
!strconcat("sha", op), "32", v4i32, v4i32, Int>;
class N3SHA3Op<string op, bits<5> op27_23, bits<2> op21_20, SDPatternOperator Int>
: N3VQInt3np<op27_23, op21_20, 0b1100, 1, 0, N3RegFrm, NoItinerary,
- !strconcat("sha", op), "32", v4i32, v4i32, Int, 0>;
+ !strconcat("sha", op), "32", v4i32, v4i32, Int>;
}
let Predicates = [HasV8, HasAES] in {
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb.td b/llvm/lib/Target/ARM/ARMInstrThumb.td
index ef07b2839bc9..bf717a4056e9 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -168,6 +168,7 @@ def thumb_cb_target : Operand<OtherVT> {
let EncoderMethod = "getThumbCBTargetOpValue";
let DecoderMethod = "DecodeThumbCmpBROperand";
}
+} // OperandType = "OPERAND_PCREL"
// t_addrmode_pc := <label> => pc + imm8 * 4
//
@@ -177,7 +178,6 @@ def t_addrmode_pc : MemOperand {
let PrintMethod = "printThumbLdrLabelOperand";
let ParserMatchClass = ThumbMemPC;
}
-}
// t_addrmode_rr := reg + reg
//
@@ -1520,6 +1520,7 @@ def tTBH_JT : tPseudoInst<(outs),
let isCall = 1, Defs = [R0, R12, LR, CPSR], Uses = [SP] in
def tTPsoft : tPseudoInst<(outs), (ins), 4, IIC_Br,
[(set R0, ARMthread_pointer)]>,
+ Requires<[IsThumb, IsReadTPSoft]>,
Sched<[WriteBr]>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
index e7eed2a0bbb1..783db9dde17f 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -200,7 +200,7 @@ def t2addrmode_imm12 : MemOperand,
}
// t2ldrlabel := imm12
-def t2ldrlabel : Operand<i32> {
+def t2ldrlabel : MemOperand {
let EncoderMethod = "getAddrModeImm12OpValue";
let PrintMethod = "printThumbLdrLabelOperand";
}
@@ -1927,7 +1927,7 @@ def : InstAlias<"pli${p}.w\t$addr",
// pci variant is very similar to i12, but supports negative offsets
// from the PC. Only PLD and PLI have pci variants (not PLDW)
-class T2Iplpci<bits<1> inst, string opc> : T2Iso<(outs), (ins t2ldrlabel:$addr),
+class T2Iplpci<bits<1> inst, string opc> : T2Ipc<(outs), (ins t2ldrlabel:$addr),
IIC_Preload, opc, "\t$addr",
[(ARMPreload (ARMWrapper tconstpool:$addr),
(i32 0), (i32 inst))]>, Sched<[WritePreLd]> {
@@ -4274,8 +4274,9 @@ def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src),
//===----------------------------------------------------------------------===//
// Coprocessor load/store -- for disassembly only
//
-class T2CI<bits<4> op31_28, dag oops, dag iops, string opc, string asm, list<dag> pattern>
- : T2I<oops, iops, NoItinerary, opc, asm, pattern> {
+class T2CI<bits<4> op31_28, dag oops, dag iops, string opc, string asm,
+ list<dag> pattern, AddrMode am = AddrModeNone>
+ : T2I<oops, iops, NoItinerary, opc, asm, pattern, am> {
let Inst{31-28} = op31_28;
let Inst{27-25} = 0b110;
}
@@ -4283,7 +4284,7 @@ class T2CI<bits<4> op31_28, dag oops, dag iops, string opc, string asm, list<dag
multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm, list<dag> pattern> {
def _OFFSET : T2CI<op31_28,
(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
- asm, "\t$cop, $CRd, $addr", pattern> {
+ asm, "\t$cop, $CRd, $addr", pattern, AddrMode5> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
@@ -4670,6 +4671,9 @@ def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1,
}
+// Reading thread pointer from coprocessor register
+def : T2Pat<(ARMthread_pointer), (t2MRC 15, 0, 13, 0, 3)>,
+ Requires<[IsThumb2, IsReadTPHard]>;
//===----------------------------------------------------------------------===//
// ARMv8.1 Privilege Access Never extension
diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td
index bcd6433a579b..9d1bfa414dff 100644
--- a/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -1600,9 +1600,13 @@ def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
let Predicates=[HasVFP2, HasDPVFP] in {
def : VFPPat<(i32 (fp_to_sint (f64 DPR:$a))),
(COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>;
+ def : VFPPat<(i32 (fp_to_sint_sat (f64 DPR:$a), i32)),
+ (COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>;
def : VFPPat<(alignedstore32 (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr),
(VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>;
+ def : VFPPat<(alignedstore32 (i32 (fp_to_sint_sat (f64 DPR:$a), i32)), addrmode5:$ptr),
+ (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>;
}
def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
@@ -1619,10 +1623,15 @@ def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
def : VFPNoNEONPat<(i32 (fp_to_sint SPR:$a)),
(COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>;
+def : VFPPat<(i32 (fp_to_sint_sat SPR:$a, i32)),
+ (COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>;
def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))),
addrmode5:$ptr),
(VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>;
+def : VFPPat<(alignedstore32 (i32 (fp_to_sint_sat (f32 SPR:$a), i32)),
+ addrmode5:$ptr),
+ (VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>;
def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
(outs SPR:$Sd), (ins HPR:$Sm),
@@ -1635,6 +1644,8 @@ def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
def : VFPNoNEONPat<(i32 (fp_to_sint (f16 HPR:$a))),
(COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>;
+def : VFPPat<(i32 (fp_to_sint_sat (f16 HPR:$a), i32)),
+ (COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>;
def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
@@ -1647,9 +1658,13 @@ def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
let Predicates=[HasVFP2, HasDPVFP] in {
def : VFPPat<(i32 (fp_to_uint (f64 DPR:$a))),
(COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>;
+ def : VFPPat<(i32 (fp_to_uint_sat (f64 DPR:$a), i32)),
+ (COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>;
def : VFPPat<(alignedstore32 (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr),
(VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>;
+ def : VFPPat<(alignedstore32 (i32 (fp_to_uint_sat (f64 DPR:$a), i32)), addrmode5:$ptr),
+ (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>;
}
def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
@@ -1666,10 +1681,15 @@ def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
def : VFPNoNEONPat<(i32 (fp_to_uint SPR:$a)),
(COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>;
+def : VFPPat<(i32 (fp_to_uint_sat SPR:$a, i32)),
+ (COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>;
def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))),
addrmode5:$ptr),
(VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>;
+def : VFPPat<(alignedstore32 (i32 (fp_to_uint_sat (f32 SPR:$a), i32)),
+ addrmode5:$ptr),
+ (VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>;
def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
(outs SPR:$Sd), (ins HPR:$Sm),
@@ -1682,6 +1702,8 @@ def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
def : VFPNoNEONPat<(i32 (fp_to_uint (f16 HPR:$a))),
(COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>;
+def : VFPPat<(i32 (fp_to_uint_sat (f16 HPR:$a), i32)),
+ (COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>;
// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
let Uses = [FPSCR] in {
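The new fp_to_sint_sat / fp_to_uint_sat patterns in this file map straight onto the existing VTOSIZ*/VTOUIZ* instructions, presumably because VCVT with round-toward-zero already saturates out-of-range values and converts NaN to 0, matching the semantics of the saturating nodes. A rough scalar model of the signed i32 case, only as a sketch (the helper name is illustrative, not from the patch):

    #include <cmath>
    #include <cstdint>
    #include <limits>

    // Scalar model of i32 fp_to_sint_sat: truncate toward zero, clamp to the
    // i32 range, and turn NaN into 0.
    int32_t fpToSIntSat32(double X) {
      if (std::isnan(X))
        return 0;
      if (X <= static_cast<double>(std::numeric_limits<int32_t>::min()))
        return std::numeric_limits<int32_t>::min();
      if (X >= static_cast<double>(std::numeric_limits<int32_t>::max()))
        return std::numeric_limits<int32_t>::max();
      return static_cast<int32_t>(std::trunc(X));
    }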
diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index fd06bfdf352c..6e259b1baf97 100644
--- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -564,7 +564,7 @@ void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
}
// End of block was reached.
- if (MBB.succ_size() > 0) {
+ if (!MBB.succ_empty()) {
// FIXME: Because of a bug, live registers are sometimes missing from
// the successor blocks' live-in sets. This means we can't trust that
// information and *always* have to reset at the end of a block.
@@ -587,7 +587,7 @@ unsigned ARMLoadStoreOpt::findFreeReg(const TargetRegisterClass &RegClass) {
}
for (unsigned Reg : RegClassInfo.getOrder(&RegClass))
- if (!LiveRegs.contains(Reg))
+ if (LiveRegs.available(MF->getRegInfo(), Reg))
return Reg;
return 0;
}
@@ -2476,8 +2476,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
}
} else {
for (unsigned i = 0; i != NumMove; ++i) {
- MachineInstr *Op = Ops.back();
- Ops.pop_back();
+ MachineInstr *Op = Ops.pop_back_val();
MBB->splice(InsertPos, MBB, Op);
}
}
@@ -2811,6 +2810,7 @@ static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset,
.addImm(Offset)
.add(MI->getOperand(3))
.add(MI->getOperand(4))
+ .add(MI->getOperand(5))
.cloneMemRefs(*MI);
case ARMII::AddrModeT2_i8:
if (MI->mayLoad()) {
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index ea41442857f3..3874db5792d6 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -97,7 +97,15 @@ static bool isDomainMVE(MachineInstr *MI) {
return Domain == ARMII::DomainMVE;
}
+static int getVecSize(const MachineInstr &MI) {
+ const MCInstrDesc &MCID = MI.getDesc();
+ uint64_t Flags = MCID.TSFlags;
+ return (Flags & ARMII::VecSize) >> ARMII::VecSizeShift;
+}
+
static bool shouldInspect(MachineInstr &MI) {
+ if (MI.isDebugInstr())
+ return false;
return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
}
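getVecSize() above reads the new two-bit VecSize field out of TSFlags, which is used further down to compare a double-width result instruction's size against the VCTP's. Judging by the TableGen changes earlier in the patch (the !if(size, 0b01, 0b10) in the f16/f32 VPT classes and the memsz encodings), 0b00/0b01/0b10 correspond to 8-, 16- and 32-bit lanes. A small standalone sketch of that encoding, with an illustrative helper name (not part of the patch):

    #include <cstdint>

    enum class MVEVecSize : uint8_t { B8 = 0b00, H16 = 0b01, W32 = 0b10 };

    // Lane width in bits for a given VecSize field value.
    constexpr unsigned laneBits(MVEVecSize VS) {
      return 8u << static_cast<unsigned>(VS); // 0b00 -> 8, 0b01 -> 16, 0b10 -> 32
    }

    static_assert(laneBits(MVEVecSize::H16) == 16, "e.g. a .u16/.f16 instruction");
    static_assert(laneBits(MVEVecSize::W32) == 32, "e.g. a .u32/.f32 instruction");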
@@ -368,9 +376,11 @@ namespace {
MachineInstr *Dec = nullptr;
MachineInstr *End = nullptr;
MachineOperand TPNumElements;
- SmallVector<MachineInstr*, 4> VCTPs;
- SmallPtrSet<MachineInstr*, 4> ToRemove;
- SmallPtrSet<MachineInstr*, 4> BlockMasksToRecompute;
+ SmallVector<MachineInstr *, 4> VCTPs;
+ SmallPtrSet<MachineInstr *, 4> ToRemove;
+ SmallPtrSet<MachineInstr *, 4> BlockMasksToRecompute;
+ SmallPtrSet<MachineInstr *, 4> DoubleWidthResultInstrs;
+ SmallPtrSet<MachineInstr *, 4> VMOVCopies;
bool Revert = false;
bool CannotTailPredicate = false;
@@ -730,6 +740,20 @@ bool LowOverheadLoop::ValidateTailPredicate() {
return false;
}
+ // Any DoubleWidthResultInstrs found whilst scanning the instructions need to
+ // compute an output size that is no larger than the size the VCTP mask
+ // operates on. The VecSize of a DoubleWidthResult is the larger vector size -
+ // the size it extends into - so any instruction whose VecSize is <= the
+ // VCTP's VecSize is valid.
+ unsigned VCTPVecSize = getVecSize(*VCTP);
+ for (MachineInstr *MI : DoubleWidthResultInstrs) {
+ unsigned InstrVecSize = getVecSize(*MI);
+ if (InstrVecSize > VCTPVecSize) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Double width result larger than VCTP "
+ << "VecSize:\n" << *MI);
+ return false;
+ }
+ }
+
// Check that the value change of the element count is what we expect and
// that the predication will be equivalent. For this we need:
// NumElements = NumElements - VectorWidth. The sub will be a sub immediate
@@ -880,6 +904,10 @@ static bool producesFalseLanesZero(MachineInstr &MI,
continue;
if (!isRegInClass(MO, QPRs) && AllowScalars)
continue;
+ // Skip the lr predicate reg
+ int PIdx = llvm::findFirstVPTPredOperandIdx(MI);
+ if (PIdx != -1 && (int)MI.getOperandNo(&MO) == PIdx + 2)
+ continue;
// Check that this instruction will produce zeros in its false lanes:
// - If it only consumes false lanes zero or constant 0 (vmov #0)
@@ -927,6 +955,8 @@ bool LowOverheadLoop::ValidateLiveOuts() {
SmallPtrSet<MachineInstr *, 4> Predicated;
MachineBasicBlock *Header = ML.getHeader();
+ LLVM_DEBUG(dbgs() << "ARM Loops: Validating Live outs\n");
+
for (auto &MI : *Header) {
if (!shouldInspect(MI))
continue;
@@ -944,12 +974,25 @@ bool LowOverheadLoop::ValidateLiveOuts() {
FalseLanesZero.insert(&MI);
else if (MI.getNumDefs() == 0)
continue;
- else if (!isPredicated && retainsOrReduces)
+ else if (!isPredicated && retainsOrReduces) {
+ LLVM_DEBUG(dbgs() << " Unpredicated instruction that retainsOrReduces: " << MI);
return false;
- else if (!isPredicated)
+ } else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy)
FalseLanesUnknown.insert(&MI);
}
+ LLVM_DEBUG({
+ dbgs() << " Predicated:\n";
+ for (auto *I : Predicated)
+ dbgs() << " " << *I;
+ dbgs() << " FalseLanesZero:\n";
+ for (auto *I : FalseLanesZero)
+ dbgs() << " " << *I;
+ dbgs() << " FalseLanesUnknown:\n";
+ for (auto *I : FalseLanesUnknown)
+ dbgs() << " " << *I;
+ });
+
auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO,
SmallPtrSetImpl<MachineInstr *> &Predicated) {
SmallPtrSet<MachineInstr *, 2> Uses;
@@ -973,7 +1016,7 @@ bool LowOverheadLoop::ValidateLiveOuts() {
if (!isRegInClass(MO, QPRs) || !MO.isDef())
continue;
if (!HasPredicatedUsers(MI, MO, Predicated)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Found an unknown def of : "
+ LLVM_DEBUG(dbgs() << " Found an unknown def of : "
<< TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
NonPredicated.insert(MI);
break;
@@ -993,8 +1036,10 @@ bool LowOverheadLoop::ValidateLiveOuts() {
for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) {
// TODO: Instead of blocking predication, we could move the vctp to the exit
// block and calculate its operand there or in the preheader.
- if (RegMask.PhysReg == ARM::VPR)
+ if (RegMask.PhysReg == ARM::VPR) {
+ LLVM_DEBUG(dbgs() << " VPR is live in to the exit block.");
return false;
+ }
// Check Q-regs that are live in the exit blocks. We don't collect scalars
// because they won't be affected by lane predication.
if (QPRs->contains(RegMask.PhysReg))
@@ -1007,10 +1052,20 @@ bool LowOverheadLoop::ValidateLiveOuts() {
// any VPT predicated instruction is predicated upon VCTP. Any live-out
// instruction needs to be predicated, so check this here. The instructions
// in NonPredicated have been found to be a reduction that we can ensure its
- // legality.
- for (auto *MI : LiveOutMIs) {
- if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Unable to handle live out: " << *MI);
+ // legality. Any MQPRCopy found will need to validate its input as if it was
+ // live out.
+ SmallVector<MachineInstr *> Worklist(LiveOutMIs.begin(), LiveOutMIs.end());
+ while (!Worklist.empty()) {
+ MachineInstr *MI = Worklist.pop_back_val();
+ if (MI->getOpcode() == ARM::MQPRCopy) {
+ VMOVCopies.insert(MI);
+ MachineInstr *CopySrc =
+ RDA.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg());
+ if (CopySrc)
+ Worklist.push_back(CopySrc);
+ } else if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) {
+ LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI);
+ VMOVCopies.clear();
return false;
}
}
@@ -1121,7 +1176,7 @@ static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) {
return false;
int FI = GetFrameIndex(MI->memoperands().front());
- MachineFrameInfo FrameInfo = MI->getParent()->getParent()->getFrameInfo();
+ auto &FrameInfo = MI->getParent()->getParent()->getFrameInfo();
if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(FI))
return false;
@@ -1211,8 +1266,15 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) {
bool RequiresExplicitPredication =
(MCID.TSFlags & ARMII::ValidForTailPredication) == 0;
if (isDomainMVE(MI) && RequiresExplicitPredication) {
- LLVM_DEBUG(if (!IsUse)
- dbgs() << "ARM Loops: Can't tail predicate: " << *MI);
+ if (MI->getOpcode() == ARM::MQPRCopy)
+ return true;
+ if (!IsUse && producesDoubleWidthResult(*MI)) {
+ DoubleWidthResultInstrs.insert(MI);
+ return true;
+ }
+
+ LLVM_DEBUG(if (!IsUse) dbgs()
+ << "ARM Loops: Can't tail predicate: " << *MI);
return IsUse;
}
@@ -1689,6 +1751,31 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
}
};
+ // And VMOVCopies need to become 2xVMOVD for tail predication to be valid.
+ // Any other MQPRCopy can be converted to MVE_VORR later on.
+ auto ExpandVMOVCopies = [this](SmallPtrSet<MachineInstr *, 4> &VMOVCopies) {
+ for (auto *MI : VMOVCopies) {
+ LLVM_DEBUG(dbgs() << "Converting copy to VMOVD: " << *MI);
+ assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
+ MachineBasicBlock *MBB = MI->getParent();
+ Register Dst = MI->getOperand(0).getReg();
+ Register Src = MI->getOperand(1).getReg();
+ auto MIB1 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
+ ARM::D0 + (Dst - ARM::Q0) * 2)
+ .addReg(ARM::D0 + (Src - ARM::Q0) * 2)
+ .add(predOps(ARMCC::AL));
+ (void)MIB1;
+ LLVM_DEBUG(dbgs() << " into " << *MIB1);
+ auto MIB2 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
+ ARM::D0 + (Dst - ARM::Q0) * 2 + 1)
+ .addReg(ARM::D0 + (Src - ARM::Q0) * 2 + 1)
+ .add(predOps(ARMCC::AL));
+ LLVM_DEBUG(dbgs() << " and " << *MIB2);
+ (void)MIB2;
+ MI->eraseFromParent();
+ }
+ };
+
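The ARM::D0 + (Reg - ARM::Q0) * 2 arithmetic in ExpandVMOVCopies leans on the fact that each MVE Q register aliases a consecutive D-register pair (Qn = D2n:D2n+1), which is why a whole-Q copy expands to exactly two VMOVDs. A throwaway illustration of that mapping (not from the patch):

    struct DPair { unsigned Lo, Hi; };

    // Qn overlaps D(2n) and D(2n+1).
    constexpr DPair dRegsForQ(unsigned QIndex) {
      return {2 * QIndex, 2 * QIndex + 1};
    }

    static_assert(dRegsForQ(3).Lo == 6 && dRegsForQ(3).Hi == 7, "Q3 -> D6/D7");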
if (LoLoop.Revert) {
if (isWhileLoopStart(*LoLoop.Start))
RevertWhile(LoLoop.Start);
@@ -1699,6 +1786,7 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
else
RevertLoopEnd(LoLoop.End, RevertLoopDec(LoLoop.Dec));
} else {
+ ExpandVMOVCopies(LoLoop.VMOVCopies);
LoLoop.Start = ExpandLoopStart(LoLoop);
if (LoLoop.Start)
RemoveDeadBranch(LoLoop.Start);
@@ -1743,6 +1831,7 @@ bool ARMLowOverheadLoops::RevertNonLoops() {
SmallVector<MachineInstr*, 4> Decs;
SmallVector<MachineInstr*, 4> Ends;
SmallVector<MachineInstr *, 4> EndDecs;
+ SmallVector<MachineInstr *, 4> MQPRCopies;
for (auto &I : MBB) {
if (isLoopStart(I))
@@ -1753,9 +1842,12 @@ bool ARMLowOverheadLoops::RevertNonLoops() {
Ends.push_back(&I);
else if (I.getOpcode() == ARM::t2LoopEndDec)
EndDecs.push_back(&I);
+ else if (I.getOpcode() == ARM::MQPRCopy)
+ MQPRCopies.push_back(&I);
}
- if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty())
+ if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty() &&
+ MQPRCopies.empty())
continue;
Changed = true;
@@ -1773,6 +1865,17 @@ bool ARMLowOverheadLoops::RevertNonLoops() {
RevertLoopEnd(End);
for (auto *End : EndDecs)
RevertLoopEndDec(End);
+ for (auto *MI : MQPRCopies) {
+ LLVM_DEBUG(dbgs() << "Converting copy to VORR: " << *MI);
+ assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
+ MachineBasicBlock *MBB = MI->getParent();
+ auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::MVE_VORR),
+ MI->getOperand(0).getReg())
+ .add(MI->getOperand(1))
+ .add(MI->getOperand(1));
+ addUnpredicatedMveVpredROp(MIB, MI->getOperand(0).getReg());
+ MI->eraseFromParent();
+ }
}
return Changed;
}
diff --git a/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/llvm/lib/Target/ARM/ARMMCInstLower.cpp
index e4b022968431..2030fab6217d 100644
--- a/llvm/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/llvm/lib/Target/ARM/ARMMCInstLower.cpp
@@ -194,7 +194,7 @@ void ARMAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
// BLX ip
// POP{ r0, lr }
//
- OutStreamer->emitCodeAlignment(4);
+ OutStreamer->emitCodeAlignment(4, &getSubtargetInfo());
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
OutStreamer->emitLabel(CurSled);
auto Target = OutContext.createTempSymbol();
diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td
index b37988232127..9752b3166b45 100644
--- a/llvm/lib/Target/ARM/ARMRegisterInfo.td
+++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -66,6 +66,8 @@ def ssub_10 : ComposedSubRegIndex<dsub_5, ssub_0>;
def ssub_11 : ComposedSubRegIndex<dsub_5, ssub_1>;
def ssub_12 : ComposedSubRegIndex<dsub_6, ssub_0>;
def ssub_13 : ComposedSubRegIndex<dsub_6, ssub_1>;
+def ssub_14 : ComposedSubRegIndex<dsub_7, ssub_0>;
+def ssub_15 : ComposedSubRegIndex<dsub_7, ssub_1>;
def gsub_0 : SubRegIndex<32>;
def gsub_1 : SubRegIndex<32, 32>;
@@ -555,6 +557,9 @@ def QQPR : RegisterClass<"ARM", [v4i64], 256, (add Tuples2Q)> {
let AltOrderSelect = [{ return 1; }];
}
+// Same as QQPR but for MVE, containing the 7 register pairs made up from Q0-Q7.
+def MQQPR : RegisterClass<"ARM", [v4i64], 256, (trunc QQPR, 7)>;
+
// Tuples of 4 D regs that isn't also a pair of Q regs.
def TuplesOE4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3],
[(decimate (shl DPR, 1), 2),
@@ -578,6 +583,9 @@ def QQQQPR : RegisterClass<"ARM", [v8i64], 256, (add Tuples2QQ)> {
let AltOrderSelect = [{ return 1; }];
}
+// Same as QQQQPR but for MVE, containing the 5 register quads made up from Q0-Q7.
+def MQQQQPR : RegisterClass<"ARM", [v8i64], 256, (trunc QQQQPR, 5)>;
+
// Pseudo-registers representing 2-spaced consecutive D registers.
def Tuples2DSpc : RegisterTuples<[dsub_0, dsub_2],
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index 90f1b693fec6..36c4bbaafcbf 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -295,6 +295,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
case CortexA77:
case CortexA78:
case CortexA78C:
+ case CortexA710:
case CortexR4:
case CortexR4F:
case CortexR5:
@@ -389,7 +390,13 @@ bool ARMSubtarget::enableMachineScheduler() const {
return useMachineScheduler();
}
-bool ARMSubtarget::enableSubRegLiveness() const { return EnableSubRegLiveness; }
+bool ARMSubtarget::enableSubRegLiveness() const {
+ if (EnableSubRegLiveness.getNumOccurrences())
+ return EnableSubRegLiveness;
+ // Enable SubRegLiveness for MVE to better optimize s subregs for mqpr regs
+ // and q subregs for qqqqpr regs.
+ return hasMVEIntegerOps();
+}
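The rewritten enableSubRegLiveness() follows a common LLVM idiom: if the EnableSubRegLiveness cl::opt was passed explicitly on the command line it wins, and only otherwise does the subtarget-based default (here, hasMVEIntegerOps()) apply. A generic sketch of that idiom with made-up option and function names (the real flag's spelling and default are declared elsewhere in ARMSubtarget.cpp):

    #include "llvm/Support/CommandLine.h"

    // Illustrative option, not the real EnableSubRegLiveness declaration.
    static llvm::cl::opt<bool> EnableFeature("enable-some-feature",
                                             llvm::cl::init(false));

    bool shouldEnable(bool TargetDefault) {
      if (EnableFeature.getNumOccurrences() > 0) // user set the flag explicitly
        return EnableFeature;
      return TargetDefault;                      // otherwise use the target default
    }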
// This overrides the PostRAScheduler bit in the SchedModel for any CPU.
bool ARMSubtarget::enablePostRAScheduler() const {
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index a8a9ae66b4ab..5e1217b6a468 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -65,6 +65,7 @@ protected:
CortexA77,
CortexA78,
CortexA78C,
+ CortexA710,
CortexA8,
CortexA9,
CortexM3,
@@ -124,6 +125,9 @@ protected:
ARMv8mMainline,
ARMv8r,
ARMv81mMainline,
+ ARMv9a,
+ ARMv91a,
+ ARMv92a,
};
public:
@@ -170,6 +174,9 @@ protected:
bool HasV8_5aOps = false;
bool HasV8_6aOps = false;
bool HasV8_7aOps = false;
+ bool HasV9_0aOps = false;
+ bool HasV9_1aOps = false;
+ bool HasV9_2aOps = false;
bool HasV8MBaselineOps = false;
bool HasV8MMainlineOps = false;
bool HasV8_1MMainlineOps = false;
@@ -468,6 +475,9 @@ protected:
/// cannot be encoded. For example, ADD r0, r1, #FFFFFFFF -> SUB r0, r1, #1.
bool NegativeImmediates = true;
+ /// Mitigate against the CVE-2021-35465 security vulnerability.
+ bool FixCMSE_CVE_2021_35465 = false;
+
/// Harden against Straight Line Speculation for Returns and Indirect
/// Branches.
bool HardenSlsRetBr = false;
@@ -618,6 +628,9 @@ public:
bool hasV8_5aOps() const { return HasV8_5aOps; }
bool hasV8_6aOps() const { return HasV8_6aOps; }
bool hasV8_7aOps() const { return HasV8_7aOps; }
+ bool hasV9_0aOps() const { return HasV9_0aOps; }
+ bool hasV9_1aOps() const { return HasV9_1aOps; }
+ bool hasV9_2aOps() const { return HasV9_2aOps; }
bool hasV8MBaselineOps() const { return HasV8MBaselineOps; }
bool hasV8MMainlineOps() const { return HasV8MMainlineOps; }
bool hasV8_1MMainlineOps() const { return HasV8_1MMainlineOps; }
@@ -780,14 +793,7 @@ public:
// ARM Targets that support EHABI exception handling standard
// Darwin uses SjLj. Other targets might need more checks.
bool isTargetEHABICompatible() const {
- return (TargetTriple.getEnvironment() == Triple::EABI ||
- TargetTriple.getEnvironment() == Triple::GNUEABI ||
- TargetTriple.getEnvironment() == Triple::MuslEABI ||
- TargetTriple.getEnvironment() == Triple::EABIHF ||
- TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
- TargetTriple.getEnvironment() == Triple::MuslEABIHF ||
- isTargetAndroid()) &&
- !isTargetDarwin() && !isTargetWindows();
+ return TargetTriple.isTargetEHABICompatible();
}
bool isTargetHardFloat() const;
@@ -934,6 +940,8 @@ public:
unsigned PhysReg) const override;
unsigned getGPRAllocationOrder(const MachineFunction &MF) const;
+ bool fixCMSE_CVE_2021_35465() const { return FixCMSE_CVE_2021_35465; }
+
bool hardenSlsRetBr() const { return HardenSlsRetBr; }
bool hardenSlsBlr() const { return HardenSlsBlr; }
bool hardenSlsNoComdat() const { return HardenSlsNoComdat; }
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index ae7ea7c2f415..833c7effd31c 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -38,12 +38,12 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetParser.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/CFGuard.h"
@@ -541,7 +541,6 @@ void ARMPassConfig::addPreSched2() {
return !MF.getSubtarget<ARMSubtarget>().isThumb1Only();
}));
}
- addPass(createMVEVPTBlockPass());
addPass(createThumb2ITBlockPass());
// Add both scheduling passes to give the subtarget an opportunity to pick
@@ -551,6 +550,7 @@ void ARMPassConfig::addPreSched2() {
addPass(&PostRASchedulerID);
}
+ addPass(createMVEVPTBlockPass());
addPass(createARMIndirectThunks());
addPass(createARMSLSHardeningPass());
}
diff --git a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
index b03bff92f373..8c5438f7093b 100644
--- a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -54,6 +54,16 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
}
}
+const MCRegister ARMElfTargetObjectFile::getStaticBase() const {
+ return ARM::R9;
+}
+
+const MCExpr *ARMElfTargetObjectFile::
+getIndirectSymViaRWPI(const MCSymbol *Sym) const {
+ return MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_ARM_SBREL,
+ getContext());
+}
+
const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference(
const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
MachineModuleInfo *MMI, MCStreamer &Streamer) const {
diff --git a/llvm/lib/Target/ARM/ARMTargetObjectFile.h b/llvm/lib/Target/ARM/ARMTargetObjectFile.h
index 7b15dcc61f56..8b13198fe144 100644
--- a/llvm/lib/Target/ARM/ARMTargetObjectFile.h
+++ b/llvm/lib/Target/ARM/ARMTargetObjectFile.h
@@ -11,6 +11,7 @@
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCRegister.h"
namespace llvm {
@@ -23,6 +24,10 @@ public:
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+ const MCRegister getStaticBase() const override;
+
+ const MCExpr *getIndirectSymViaRWPI(const MCSymbol *Sym) const override;
+
const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
unsigned Encoding,
const TargetMachine &TM,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index cf7456e9e4f5..88de84a4fd78 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -149,7 +149,7 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
Align MemAlign =
getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
&IC.getAssumptionCache(), &IC.getDominatorTree());
- unsigned AlignArg = II.getNumArgOperands() - 1;
+ unsigned AlignArg = II.arg_size() - 1;
Value *AlignArgOp = II.getArgOperand(AlignArg);
MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
if (Align && *Align < MemAlign) {
@@ -175,7 +175,7 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
PatternMatch::m_Constant(XorMask))) &&
II.getType() == ArgArg->getType()) {
if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
- if (CI->getValue().trunc(16).isAllOnesValue()) {
+ if (CI->getValue().trunc(16).isAllOnes()) {
auto TrueVector = IC.Builder.CreateVectorSplat(
cast<FixedVectorType>(II.getType())->getNumElements(),
IC.Builder.getTrue());
@@ -248,6 +248,48 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return None;
}
+Optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
+ InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
+ APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
+ std::function<void(Instruction *, unsigned, APInt, APInt &)>
+ SimplifyAndSetOp) const {
+
+ // Compute the demanded elements for a narrowing MVE intrinsic. TopOpc is
+ // the index of the operand that selects a Top/Bottom instruction, which can
+ // change between intrinsics.
+ auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
+ unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
+ unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
+
+ // Only the odd or even lanes of operand 0 will be demanded, depending
+ // on whether this is a top or bottom instruction.
+ APInt DemandedElts =
+ APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
+ : APInt::getHighBitsSet(2, 1));
+ SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
+ // The other lanes will be defined from the inserted elements.
+ UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1)
+ : APInt::getHighBitsSet(2, 1));
+ return None;
+ };
+
+ switch (II.getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::arm_mve_vcvt_narrow:
+ SimplifyNarrowInstrTopBottom(2);
+ break;
+ case Intrinsic::arm_mve_vqmovn:
+ SimplifyNarrowInstrTopBottom(4);
+ break;
+ case Intrinsic::arm_mve_vshrn:
+ SimplifyNarrowInstrTopBottom(7);
+ break;
+ }
+
+ return None;
+}
+
InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
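As a standalone illustration of the lane mask built by SimplifyNarrowInstrTopBottom above (an 8-element vector is assumed purely for the example; this is not part of the patch):

#include "llvm/ADT/APInt.h"
#include <cstdio>

int main() {
  unsigned NumElts = 8; // hypothetical element count
  for (unsigned IsTop : {1u, 0u}) {
    // Mirrors the expression in the patch: a 2-bit pattern splatted across
    // the lanes, selecting either the even or the odd elements.
    llvm::APInt Demanded = llvm::APInt::getSplat(
        NumElts, IsTop ? llvm::APInt::getLowBitsSet(2, 1)
                       : llvm::APInt::getHighBitsSet(2, 1));
    std::printf("IsTop=%u -> demanded lanes 0x%02llx\n", IsTop,
                (unsigned long long)Demanded.getZExtValue());
  }
  return 0;
}

Running this prints 0x55 for top instructions and 0xaa for bottom ones, i.e. every other lane of operand 0 is demanded.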
@@ -300,7 +342,7 @@ static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
if (InstSPF == SPF_SMAX &&
PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
- C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {
+ C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
auto isSSatMin = [&](Value *MinInst) {
if (isa<SelectInst>(MinInst)) {
@@ -368,7 +410,7 @@ InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
}
// xor a, -1 can always be folded to MVN
- if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
+ if (Opcode == Instruction::Xor && Imm.isAllOnes())
return 0;
// Ensures negative constant of min(max()) or max(min()) patterns that
@@ -381,6 +423,14 @@ InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
return 0;
}
+ // We can convert <= -1 to < 0, which is generally quite cheap.
+ if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnesValue()) {
+ ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
+ if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
+ return std::min(getIntImmCost(Imm, Ty, CostKind),
+ getIntImmCost(Imm + 1, Ty, CostKind));
+ }
+
return getIntImmCost(Imm, Ty, CostKind);
}
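The new ICmp immediate-cost case relies on the signed equivalence between a compare against -1 and a compare against 0, so the cheaper of the two immediates can be costed. A throwaway check of that equivalence (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  for (int64_t X = -4; X <= 4; ++X) {
    assert((X <= -1) == (X < 0)); // SLE -1  ==  SLT 0
    assert((X > -1) == (X >= 0)); // SGT -1  ==  SGE 0
  }
  return 0;
}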
@@ -1623,13 +1673,24 @@ ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
TTI::TargetCostKind CostKind) {
EVT ValVT = TLI->getValueType(DL, ValTy);
EVT ResVT = TLI->getValueType(DL, ResTy);
+
if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
std::pair<InstructionCost, MVT> LT =
TLI->getTypeLegalizationCost(DL, ValTy);
- if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) ||
- (LT.second == MVT::v8i16 &&
- ResVT.getSizeInBits() <= (IsMLA ? 64 : 32)) ||
- (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64))
+
+ // The legal cases are:
+ // VADDV u/s 8/16/32
+ // VMLAV u/s 8/16/32
+ // VADDLV u/s 32
+ // VMLALV u/s 16/32
+ // Codegen currently cannot always handle larger-than-legal vectors very
+ // well, especially for predicated reductions where the mask needs to be
+ // split, so restrict to 128-bit or smaller input types.
+ unsigned RevVTSize = ResVT.getSizeInBits();
+ if (ValVT.getSizeInBits() <= 128 &&
+ ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
+ (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64u : 32u)) ||
+ (LT.second == MVT::v4i32 && RevVTSize <= 64)))
return ST->getMVEVectorCostFactor(CostKind) * LT.first;
}
@@ -1949,6 +2010,20 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
// we simply count the icmps, i.e. there should only be 1 for the backedge.
if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
return false;
+ // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
+ // not currently canonical, but soon will be. Code without them uses icmp, and
+ // so is not tail predicated as per the condition above. In order to get the
+ // same performance we treat min and max the same as an icmp for tailpred
+ // purposes for the moment (we often rely on non-tailpred and higher VFs to
+ // pick more optimal instructions like VQDMULH. They need to be recognized
+ // directly by the vectorizer).
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if ((II->getIntrinsicID() == Intrinsic::smin ||
+ II->getIntrinsicID() == Intrinsic::smax ||
+ II->getIntrinsicID() == Intrinsic::umin ||
+ II->getIntrinsicID() == Intrinsic::umax) &&
+ ++ICmpCount > 1)
+ return false;
if (isa<FCmpInst>(&I))
return false;
@@ -2035,8 +2110,9 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
return false;
}
if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
- Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
- int64_t NextStride = getPtrStride(PSE, Ptr, L);
+ Value *Ptr = getLoadStorePointerOperand(&I);
+ Type *AccessTy = getLoadStoreType(&I);
+ int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L);
if (NextStride == 1) {
// TODO: for now only allow consecutive strides of 1. We could support
// other strides as long as it is uniform, but let's keep it simple
@@ -2055,8 +2131,7 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
// least if they are loop invariant.
// TODO: Loop variant strides should in theory work, too, but
// this requires further testing.
- const SCEV *PtrScev =
- replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr);
+ const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
if (PSE.getSE()->isLoopInvariant(Step, L))
@@ -2135,14 +2210,15 @@ bool ARMTTIImpl::emitGetActiveLaneMask() const {
return true;
}
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP) {
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE) {
// Enable Upper bound unrolling universally, not dependent upon the conditions
// below.
UP.UpperBound = true;
// Only currently enable these preferences for M-Class cores.
if (!ST->isMClass())
- return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);
+ return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
// Disable loop unrolling for Oz and Os.
UP.OptSizeThreshold = 0;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 889940534ce5..a56886d4fc11 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -120,6 +120,11 @@ public:
Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) const;
+ Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
+ InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
+ APInt &UndefElts2, APInt &UndefElts3,
+ std::function<void(Instruction *, unsigned, APInt, APInt &)>
+ SimplifyAndSetOp) const;
/// \name Scalar TTI Implementations
/// @{
@@ -226,8 +231,7 @@ public:
const SCEV *Ptr);
InstructionCost getArithmeticInstrCost(
- unsigned Opcode, Type *Ty,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Op2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -246,8 +250,7 @@ public:
InstructionCost getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
- Align Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
@@ -279,7 +282,8 @@ public:
DominatorTree *DT,
const LoopAccessInfo *LAI);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP);
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE);
bool emitGetActiveLaneMask() const;
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index e410fe0aeff2..64d2e1bfa9b2 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -6,15 +6,15 @@
//
//===----------------------------------------------------------------------===//
-#include "ARMFeatures.h"
#include "ARMBaseInstrInfo.h"
-#include "Utils/ARMBaseInfo.h"
+#include "ARMFeatures.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "MCTargetDesc/ARMInstPrinter.h"
#include "MCTargetDesc/ARMMCExpr.h"
#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "TargetInfo/ARMTargetInfo.h"
+#include "Utils/ARMBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/None.h"
@@ -22,8 +22,8 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
@@ -44,6 +44,7 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ARMBuildAttributes.h"
#include "llvm/Support/ARMEHABI.h"
#include "llvm/Support/Casting.h"
@@ -53,7 +54,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/TargetParser.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
@@ -2478,14 +2478,15 @@ public:
}
void addVPTPredNOperands(MCInst &Inst, unsigned N) const {
- assert(N == 2 && "Invalid number of operands!");
+ assert(N == 3 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createImm(unsigned(getVPTPred())));
unsigned RegNum = getVPTPred() == ARMVCC::None ? 0: ARM::P0;
Inst.addOperand(MCOperand::createReg(RegNum));
+ Inst.addOperand(MCOperand::createReg(0));
}
void addVPTPredROperands(MCInst &Inst, unsigned N) const {
- assert(N == 3 && "Invalid number of operands!");
+ assert(N == 4 && "Invalid number of operands!");
addVPTPredNOperands(Inst, N-1);
unsigned RegNum;
if (getVPTPred() == ARMVCC::None) {
@@ -3343,16 +3344,16 @@ public:
// regs) or q0-q4 (for 4)
//
// The MVE instructions taking a register range of this kind will
- // need an operand in the QQPR or QQQQPR class, representing the
+ // need an operand in the MQQPR or MQQQQPR class, representing the
// entire range as a unit. So we must translate into that class,
// by finding the index of the base register in the MQPR reg
// class, and returning the super-register at the corresponding
// index in the target class.
const MCRegisterClass *RC_in = &ARMMCRegisterClasses[ARM::MQPRRegClassID];
- const MCRegisterClass *RC_out = (VectorList.Count == 2) ?
- &ARMMCRegisterClasses[ARM::QQPRRegClassID] :
- &ARMMCRegisterClasses[ARM::QQQQPRRegClassID];
+ const MCRegisterClass *RC_out =
+ (VectorList.Count == 2) ? &ARMMCRegisterClasses[ARM::MQQPRRegClassID]
+ : &ARMMCRegisterClasses[ARM::MQQQQPRRegClassID];
unsigned I, E = RC_out->getNumRegs();
for (I = 0; I < E; I++)
@@ -10960,7 +10961,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// Only after the instruction is fully processed, we can validate it
if (wasInITBlock && hasV8Ops() && isThumb() &&
- !isV8EligibleForIT(&Inst)) {
+ !isV8EligibleForIT(&Inst) && !getTargetOptions().MCNoDeprecatedWarn) {
Warning(IDLoc, "deprecated instruction in IT block");
}
}
@@ -11777,13 +11778,13 @@ bool ARMAsmParser::parseDirectiveEven(SMLoc L) {
return true;
if (!Section) {
- getStreamer().InitSections(false);
+ getStreamer().initSections(false, getSTI());
Section = getStreamer().getCurrentSectionOnly();
}
assert(Section && "must have section to emit alignment");
if (Section->UseCodeAlign())
- getStreamer().emitCodeAlignment(2);
+ getStreamer().emitCodeAlignment(2, &getSTI());
else
getStreamer().emitValueToAlignment(2);
@@ -11985,7 +11986,7 @@ bool ARMAsmParser::parseDirectiveAlign(SMLoc L) {
const MCSection *Section = getStreamer().getCurrentSectionOnly();
assert(Section && "must have section to emit alignment");
if (Section->UseCodeAlign())
- getStreamer().emitCodeAlignment(4, 0);
+ getStreamer().emitCodeAlignment(4, &getSTI(), 0);
else
getStreamer().emitValueToAlignment(4, 0, 1, 0);
return false;
diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 51fd45034534..9caef9f09ea9 100644
--- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -19,10 +19,10 @@
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
@@ -227,10 +227,12 @@ static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeMQPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeMQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst,
@@ -852,12 +854,15 @@ ARMDisassembler::AddThumbPredicate(MCInst &MI) const {
VCCI = MI.insert(VCCI, MCOperand::createImm(VCC));
++VCCI;
if (VCC == ARMVCC::None)
- MI.insert(VCCI, MCOperand::createReg(0));
+ VCCI = MI.insert(VCCI, MCOperand::createReg(0));
else
- MI.insert(VCCI, MCOperand::createReg(ARM::P0));
+ VCCI = MI.insert(VCCI, MCOperand::createReg(ARM::P0));
+ ++VCCI;
+ VCCI = MI.insert(VCCI, MCOperand::createReg(0));
+ ++VCCI;
if (OpInfo[VCCPos].OperandType == ARM::OPERAND_VPRED_R) {
int TiedOp = ARMInsts[MI.getOpcode()].getOperandConstraint(
- VCCPos + 2, MCOI::TIED_TO);
+ VCCPos + 3, MCOI::TIED_TO);
assert(TiedOp >= 0 &&
"Inactive register in vpred_r is not tied to an output!");
// Copy the operand to ensure it's not invalidated when MI grows.
@@ -6154,9 +6159,9 @@ static const uint16_t QQPRDecoderTable[] = {
ARM::Q4_Q5, ARM::Q5_Q6, ARM::Q6_Q7
};
-static DecodeStatus DecodeQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const void *Decoder) {
+static DecodeStatus DecodeMQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
if (RegNo > 6)
return MCDisassembler::Fail;
@@ -6170,9 +6175,9 @@ static const uint16_t QQQQPRDecoderTable[] = {
ARM::Q3_Q4_Q5_Q6, ARM::Q4_Q5_Q6_Q7
};
-static DecodeStatus DecodeQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const void *Decoder) {
+static DecodeStatus DecodeMQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
if (RegNo > 4)
return MCDisassembler::Fail;
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 9f7327f792c7..851acea94022 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -48,9 +48,10 @@ public:
} // end anonymous namespace
Optional<MCFixupKind> ARMAsmBackend::getFixupKind(StringRef Name) const {
- if (!STI.getTargetTriple().isOSBinFormatELF())
- return None;
+ return None;
+}
+Optional<MCFixupKind> ARMAsmBackendELF::getFixupKind(StringRef Name) const {
unsigned Type = llvm::StringSwitch<unsigned>(Name)
#define ELF_RELOC(X, Y) .Case(#X, Y)
#include "llvm/BinaryFormat/ELFRelocs/ARM.def"
@@ -330,7 +331,7 @@ void ARMAsmBackend::relaxInstruction(MCInst &Inst,
const MCSubtargetInfo &STI) const {
unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode(), STI);
- // Sanity check w/ diagnostic if we get here w/ a bogus instruction.
+ // Return a diagnostic if we get here w/ a bogus instruction.
if (RelaxedOp == Inst.getOpcode()) {
SmallString<256> Tmp;
raw_svector_ostream OS(Tmp);
@@ -357,14 +358,15 @@ void ARMAsmBackend::relaxInstruction(MCInst &Inst,
Inst.setOpcode(RelaxedOp);
}
-bool ARMAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+bool ARMAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const {
const uint16_t Thumb1_16bitNopEncoding = 0x46c0; // using MOV r8,r8
const uint16_t Thumb2_16bitNopEncoding = 0xbf00; // NOP
const uint32_t ARMv4_NopEncoding = 0xe1a00000; // using MOV r0,r0
const uint32_t ARMv6T2_NopEncoding = 0xe320f000; // NOP
if (isThumb()) {
const uint16_t nopEncoding =
- hasNOP() ? Thumb2_16bitNopEncoding : Thumb1_16bitNopEncoding;
+ hasNOP(STI) ? Thumb2_16bitNopEncoding : Thumb1_16bitNopEncoding;
uint64_t NumNops = Count / 2;
for (uint64_t i = 0; i != NumNops; ++i)
support::endian::write(OS, nopEncoding, Endian);
@@ -374,7 +376,7 @@ bool ARMAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
}
// ARM mode
const uint32_t nopEncoding =
- hasNOP() ? ARMv6T2_NopEncoding : ARMv4_NopEncoding;
+ hasNOP(STI) ? ARMv6T2_NopEncoding : ARMv4_NopEncoding;
uint64_t NumNops = Count / 4;
for (uint64_t i = 0; i != NumNops; ++i)
support::endian::write(OS, nopEncoding, Endian);
@@ -1300,11 +1302,12 @@ static MCAsmBackend *createARMAsmBackend(const Target &T,
return new ARMAsmBackendDarwin(T, STI, MRI);
case Triple::COFF:
assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported");
- return new ARMAsmBackendWinCOFF(T, STI);
+ return new ARMAsmBackendWinCOFF(T, STI.getTargetTriple().isThumb());
case Triple::ELF:
assert(TheTriple.isOSBinFormatELF() && "using ELF for non-ELF target");
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
- return new ARMAsmBackendELF(T, STI, OSABI, Endian);
+ return new ARMAsmBackendELF(T, STI.getTargetTriple().isThumb(), OSABI,
+ Endian);
}
}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
index 38c7b30769b3..9b0c8c084161 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -13,29 +13,23 @@
#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
namespace llvm {
class ARMAsmBackend : public MCAsmBackend {
- // The STI from the target triple the MCAsmBackend was instantiated with
- // note that MCFragments may have a different local STI that should be
- // used in preference.
- const MCSubtargetInfo &STI;
bool isThumbMode; // Currently emitting Thumb code.
public:
- ARMAsmBackend(const Target &T, const MCSubtargetInfo &STI,
- support::endianness Endian)
- : MCAsmBackend(Endian), STI(STI),
- isThumbMode(STI.getTargetTriple().isThumb()) {}
+ ARMAsmBackend(const Target &T, bool isThumb, support::endianness Endian)
+ : MCAsmBackend(Endian), isThumbMode(isThumb) {}
unsigned getNumFixupKinds() const override {
return ARM::NumTargetFixupKinds;
}
- // FIXME: this should be calculated per fragment as the STI may be
- // different.
- bool hasNOP() const { return STI.getFeatureBits()[ARM::HasV6T2Ops]; }
+ bool hasNOP(const MCSubtargetInfo *STI) const {
+ return STI->getFeatureBits()[ARM::HasV6T2Ops];
+ }
Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
@@ -69,7 +63,8 @@ public:
void relaxInstruction(MCInst &Inst,
const MCSubtargetInfo &STI) const override;
- bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const override;
void handleAssemblerFlag(MCAssemblerFlag Flag) override;
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
index e27bb134670f..85013b5f099a 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
@@ -21,8 +21,8 @@ public:
const MachO::CPUSubTypeARM Subtype;
ARMAsmBackendDarwin(const Target &T, const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI)
- : ARMAsmBackend(T, STI, support::little), MRI(MRI),
- TT(STI.getTargetTriple()),
+ : ARMAsmBackend(T, STI.getTargetTriple().isThumb(), support::little),
+ MRI(MRI), TT(STI.getTargetTriple()),
Subtype((MachO::CPUSubTypeARM)cantFail(
MachO::getCPUSubType(STI.getTargetTriple()))) {}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
index 5d735114d441..2431c4865b64 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
@@ -19,14 +19,16 @@ namespace {
class ARMAsmBackendELF : public ARMAsmBackend {
public:
uint8_t OSABI;
- ARMAsmBackendELF(const Target &T, const MCSubtargetInfo &STI, uint8_t OSABI,
+ ARMAsmBackendELF(const Target &T, bool isThumb, uint8_t OSABI,
support::endianness Endian)
- : ARMAsmBackend(T, STI, Endian), OSABI(OSABI) {}
+ : ARMAsmBackend(T, isThumb, Endian), OSABI(OSABI) {}
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
return createARMELFObjectWriter(OSABI);
}
+
+ Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
};
}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
index 62eb1d73a2ce..6e447df9e4cb 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
@@ -16,8 +16,8 @@ using namespace llvm;
namespace {
class ARMAsmBackendWinCOFF : public ARMAsmBackend {
public:
- ARMAsmBackendWinCOFF(const Target &T, const MCSubtargetInfo &STI)
- : ARMAsmBackend(T, STI, support::little) {}
+ ARMAsmBackendWinCOFF(const Target &T, bool isThumb)
+ : ARMAsmBackend(T, isThumb, support::little) {}
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
return createARMWinCOFFObjectWriter();
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index ecd96114e8a4..43f7575df6db 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -408,6 +408,14 @@ namespace ARMII {
// its input, typically reading from the top/bottom halves of the input(s).
DoubleWidthResult = 1 << 23,
+ // The vector element size for MVE instructions. 00 = i8, 01 = i16, 10 = i32
+ // and 11 = i64. This is the largest type if multiple are present, so an
+ // MVE_VMOVLs8bh is size 01=i16, as it extends from an i8 to an i16. There are
+ // some caveats, so it cannot be used blindly; exchanging VMLADAVAs and
+ // complex instructions, for example, may use different input lanes.
+ VecSizeShift = 24,
+ VecSize = 3 << VecSizeShift,
+
//===------------------------------------------------------------------===//
// Code domain.
DomainShift = 15,
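A hypothetical accessor, not part of the patch, showing how the new VecSize field would typically be decoded from an instruction's TSFlags (assuming the ARMII definitions added above):

// Assumes the ARMII::VecSize / VecSizeShift definitions from this hunk.
static unsigned getMVEVecElementBits(uint64_t TSFlags) {
  unsigned Code = (TSFlags & ARMII::VecSize) >> ARMII::VecSizeShift;
  return 8u << Code; // 00 -> 8 (i8), 01 -> 16 (i16), 10 -> 32 (i32), 11 -> 64 (i64)
}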
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 12076b8c49c1..896b104e8d97 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -785,6 +785,9 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
case ARM::ArchKind::ARMV8_4A:
case ARM::ArchKind::ARMV8_5A:
case ARM::ArchKind::ARMV8_6A:
+ case ARM::ArchKind::ARMV9A:
+ case ARM::ArchKind::ARMV9_1A:
+ case ARM::ArchKind::ARMV9_2A:
S.setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
S.setAttributeItem(ARM_ISA_use, Allowed, false);
S.setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
@@ -1056,7 +1059,7 @@ inline void ARMELFStreamer::SwitchToEHSection(StringRef Prefix,
// Switch to .ARM.extab or .ARM.exidx section
SwitchSection(EHSection);
- emitCodeAlignment(4);
+ emitValueToAlignment(4, 0, 1, 0);
}
inline void ARMELFStreamer::SwitchToExTabSection(const MCSymbol &FnStart) {
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index 40e8e244e312..77c0e3522911 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -91,7 +91,7 @@ ARMCOFFMCAsmInfoMicrosoft::ARMCOFFMCAsmInfoMicrosoft() {
ExceptionsType = ExceptionHandling::WinEH;
PrivateGlobalPrefix = "$M";
PrivateLabelPrefix = "$M";
- CommentString = ";";
+ CommentString = "@";
// Conditional Thumb 4-byte instructions can have an implicit IT.
MaxInstLength = 6;
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index ced48ccc9883..5ecacdab390f 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -1138,6 +1138,7 @@ getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
// representation for the complex operand in the .td file. This isn't just
// style, unfortunately. As-is, we can't represent the distinct encoding
// for #-0.
+ assert(((Imm8 & 0x3) == 0) && "Not a valid immediate!");
uint32_t Binary = (Imm8 >> 2) & 0xff;
// Immediate is always encoded as positive. The 'U' bit controls add vs sub.
if (isAdd)
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 87cce08b1ce4..05e5a473a3c6 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -27,9 +27,9 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetParser.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -441,8 +441,201 @@ public:
}
return false;
}
+
+ Optional<uint64_t> evaluateMemoryOperandAddress(const MCInst &Inst,
+ const MCSubtargetInfo *STI,
+ uint64_t Addr,
+ uint64_t Size) const override;
};
+} // namespace
+
+static Optional<uint64_t>
+// NOLINTNEXTLINE(readability-identifier-naming)
+evaluateMemOpAddrForAddrMode_i12(const MCInst &Inst, const MCInstrDesc &Desc,
+ unsigned MemOpIndex, uint64_t Addr) {
+ if (MemOpIndex + 1 >= Desc.getNumOperands())
+ return None;
+
+ const MCOperand &MO1 = Inst.getOperand(MemOpIndex);
+ const MCOperand &MO2 = Inst.getOperand(MemOpIndex + 1);
+ if (!MO1.isReg() || MO1.getReg() != ARM::PC || !MO2.isImm())
+ return None;
+
+ int32_t OffImm = (int32_t)MO2.getImm();
+ // Special value for #-0. All others are normal.
+ if (OffImm == INT32_MIN)
+ OffImm = 0;
+ return Addr + OffImm;
+}
+
+static Optional<uint64_t> evaluateMemOpAddrForAddrMode3(const MCInst &Inst,
+ const MCInstrDesc &Desc,
+ unsigned MemOpIndex,
+ uint64_t Addr) {
+ if (MemOpIndex + 2 >= Desc.getNumOperands())
+ return None;
+
+ const MCOperand &MO1 = Inst.getOperand(MemOpIndex);
+ const MCOperand &MO2 = Inst.getOperand(MemOpIndex + 1);
+ const MCOperand &MO3 = Inst.getOperand(MemOpIndex + 2);
+ if (!MO1.isReg() || MO1.getReg() != ARM::PC || MO2.getReg() || !MO3.isImm())
+ return None;
+
+ unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm());
+ ARM_AM::AddrOpc Op = ARM_AM::getAM3Op(MO3.getImm());
+
+ if (Op == ARM_AM::sub)
+ return Addr - ImmOffs;
+ return Addr + ImmOffs;
+}
+
+static Optional<uint64_t> evaluateMemOpAddrForAddrMode5(const MCInst &Inst,
+ const MCInstrDesc &Desc,
+ unsigned MemOpIndex,
+ uint64_t Addr) {
+ if (MemOpIndex + 1 >= Desc.getNumOperands())
+ return None;
+
+ const MCOperand &MO1 = Inst.getOperand(MemOpIndex);
+ const MCOperand &MO2 = Inst.getOperand(MemOpIndex + 1);
+ if (!MO1.isReg() || MO1.getReg() != ARM::PC || !MO2.isImm())
+ return None;
+
+ unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm());
+ ARM_AM::AddrOpc Op = ARM_AM::getAM5Op(MO2.getImm());
+
+ if (Op == ARM_AM::sub)
+ return Addr - ImmOffs * 4;
+ return Addr + ImmOffs * 4;
+}
+
+static Optional<uint64_t>
+evaluateMemOpAddrForAddrMode5FP16(const MCInst &Inst, const MCInstrDesc &Desc,
+ unsigned MemOpIndex, uint64_t Addr) {
+ if (MemOpIndex + 1 >= Desc.getNumOperands())
+ return None;
+
+ const MCOperand &MO1 = Inst.getOperand(MemOpIndex);
+ const MCOperand &MO2 = Inst.getOperand(MemOpIndex + 1);
+ if (!MO1.isReg() || MO1.getReg() != ARM::PC || !MO2.isImm())
+ return None;
+
+ unsigned ImmOffs = ARM_AM::getAM5FP16Offset(MO2.getImm());
+ ARM_AM::AddrOpc Op = ARM_AM::getAM5FP16Op(MO2.getImm());
+
+ if (Op == ARM_AM::sub)
+ return Addr - ImmOffs * 2;
+ return Addr + ImmOffs * 2;
+}
+
+static Optional<uint64_t>
+// NOLINTNEXTLINE(readability-identifier-naming)
+evaluateMemOpAddrForAddrModeT2_i8s4(const MCInst &Inst, const MCInstrDesc &Desc,
+ unsigned MemOpIndex, uint64_t Addr) {
+ if (MemOpIndex + 1 >= Desc.getNumOperands())
+ return None;
+
+ const MCOperand &MO1 = Inst.getOperand(MemOpIndex);
+ const MCOperand &MO2 = Inst.getOperand(MemOpIndex + 1);
+ if (!MO1.isReg() || MO1.getReg() != ARM::PC || !MO2.isImm())
+ return None;
+
+ int32_t OffImm = (int32_t)MO2.getImm();
+ assert(((OffImm & 0x3) == 0) && "Not a valid immediate!");
+
+ // Special value for #-0. All others are normal.
+ if (OffImm == INT32_MIN)
+ OffImm = 0;
+ return Addr + OffImm;
+}
+
+static Optional<uint64_t>
+// NOLINTNEXTLINE(readability-identifier-naming)
+evaluateMemOpAddrForAddrModeT2_pc(const MCInst &Inst, const MCInstrDesc &Desc,
+ unsigned MemOpIndex, uint64_t Addr) {
+ const MCOperand &MO1 = Inst.getOperand(MemOpIndex);
+ if (!MO1.isImm())
+ return None;
+
+ int32_t OffImm = (int32_t)MO1.getImm();
+
+ // Special value for #-0. All others are normal.
+ if (OffImm == INT32_MIN)
+ OffImm = 0;
+ return Addr + OffImm;
+}
+
+static Optional<uint64_t>
+// NOLINTNEXTLINE(readability-identifier-naming)
+evaluateMemOpAddrForAddrModeT1_s(const MCInst &Inst, const MCInstrDesc &Desc,
+ unsigned MemOpIndex, uint64_t Addr) {
+ return evaluateMemOpAddrForAddrModeT2_pc(Inst, Desc, MemOpIndex, Addr);
+}
+
+Optional<uint64_t> ARMMCInstrAnalysis::evaluateMemoryOperandAddress(
+ const MCInst &Inst, const MCSubtargetInfo *STI, uint64_t Addr,
+ uint64_t Size) const {
+ const MCInstrDesc &Desc = Info->get(Inst.getOpcode());
+
+ // Only load instructions can have PC-relative memory addressing.
+ if (!Desc.mayLoad())
+ return None;
+
+ // PC-relative addressing does not update the base register.
+ uint64_t TSFlags = Desc.TSFlags;
+ unsigned IndexMode =
+ (TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift;
+ if (IndexMode != ARMII::IndexModeNone)
+ return None;
+
+ // Find the memory addressing operand in the instruction.
+ unsigned OpIndex = Desc.NumDefs;
+ while (OpIndex < Desc.getNumOperands() &&
+ Desc.OpInfo[OpIndex].OperandType != MCOI::OPERAND_MEMORY)
+ ++OpIndex;
+ if (OpIndex == Desc.getNumOperands())
+ return None;
+
+ // Base address for PC-relative addressing is always 32-bit aligned.
+ Addr &= ~0x3;
+
+ // For ARM instructions the PC offset is 8 bytes, for Thumb instructions it
+ // is 4 bytes.
+ switch (Desc.TSFlags & ARMII::FormMask) {
+ default:
+ Addr += 8;
+ break;
+ case ARMII::ThumbFrm:
+ Addr += 4;
+ break;
+ // VLDR* instructions share the same opcode (and thus the same form) for Arm
+ // and Thumb. Use a slightly longer route through STI in that case.
+ case ARMII::VFPLdStFrm:
+ Addr += STI->getFeatureBits()[ARM::ModeThumb] ? 4 : 8;
+ break;
+ }
+
+ // Evaluate the address depending on the addressing mode.
+ unsigned AddrMode = (TSFlags & ARMII::AddrModeMask);
+ switch (AddrMode) {
+ default:
+ return None;
+ case ARMII::AddrMode_i12:
+ return evaluateMemOpAddrForAddrMode_i12(Inst, Desc, OpIndex, Addr);
+ case ARMII::AddrMode3:
+ return evaluateMemOpAddrForAddrMode3(Inst, Desc, OpIndex, Addr);
+ case ARMII::AddrMode5:
+ return evaluateMemOpAddrForAddrMode5(Inst, Desc, OpIndex, Addr);
+ case ARMII::AddrMode5FP16:
+ return evaluateMemOpAddrForAddrMode5FP16(Inst, Desc, OpIndex, Addr);
+ case ARMII::AddrModeT2_i8s4:
+ return evaluateMemOpAddrForAddrModeT2_i8s4(Inst, Desc, OpIndex, Addr);
+ case ARMII::AddrModeT2_pc:
+ return evaluateMemOpAddrForAddrModeT2_pc(Inst, Desc, OpIndex, Addr);
+ case ARMII::AddrModeT1_s:
+ return evaluateMemOpAddrForAddrModeT1_s(Inst, Desc, OpIndex, Addr);
+ }
}
static MCInstrAnalysis *createARMMCInstrAnalysis(const MCInstrInfo *Info) {
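A self-contained sketch of the AddrMode_i12 path above, with a made-up instruction address and offset, showing the 32-bit alignment of the base and the 8-byte ARM-mode PC bias:

#include <cstdint>
#include <cstdio>

static uint64_t armPCRelAddress(uint64_t InstAddr, int32_t OffImm) {
  uint64_t Base = (InstAddr & ~UINT64_C(3)) + 8; // ARM mode: PC reads as addr + 8
  if (OffImm == INT32_MIN)                       // special encoding for #-0
    OffImm = 0;
  return Base + OffImm;
}

int main() {
  // ldr r0, [pc, #16] at 0x8000 loads from (0x8000 & ~3) + 8 + 16 = 0x8018.
  std::printf("0x%llx\n", (unsigned long long)armPCRelAddress(0x8000, 16));
  return 0;
}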
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
index 1fee354cad93..3e4c97630af6 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -43,7 +43,9 @@ void ARMTargetStreamer::emitCurrentConstantPool() {
}
// finish() - write out any non-empty assembler constant pools.
-void ARMTargetStreamer::finish() { ConstantPools->emitAll(Streamer); }
+void ARMTargetStreamer::emitConstantPools() {
+ ConstantPools->emitAll(Streamer);
+}
// reset() - Reset any state
void ARMTargetStreamer::reset() {}
diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index 4981b8051657..cfd275bc0621 100644
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -76,6 +76,7 @@ public:
private:
LoopInfo *LI = nullptr;
+ const DataLayout *DL;
// Check this is a valid gather with correct alignment
bool isLegalTypeAndAlignment(unsigned NumElements, unsigned ElemSize,
@@ -149,10 +150,10 @@ private:
bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI);
// Pushes the given add out of the loop
void pushOutAdd(PHINode *&Phi, Value *OffsSecondOperand, unsigned StartIndex);
- // Pushes the given mul out of the loop
- void pushOutMul(PHINode *&Phi, Value *IncrementPerRound,
- Value *OffsSecondOperand, unsigned LoopIncrement,
- IRBuilder<> &Builder);
+ // Pushes the given mul or shl out of the loop
+ void pushOutMulShl(unsigned Opc, PHINode *&Phi, Value *IncrementPerRound,
+ Value *OffsSecondOperand, unsigned LoopIncrement,
+ IRBuilder<> &Builder);
};
} // end anonymous namespace
@@ -335,14 +336,15 @@ int MVEGatherScatterLowering::computeScale(unsigned GEPElemSize,
Optional<int64_t> MVEGatherScatterLowering::getIfConst(const Value *V) {
const Constant *C = dyn_cast<Constant>(V);
- if (C != nullptr)
+ if (C && C->getSplatValue())
return Optional<int64_t>{C->getUniqueInteger().getSExtValue()};
if (!isa<Instruction>(V))
return Optional<int64_t>{};
const Instruction *I = cast<Instruction>(V);
- if (I->getOpcode() == Instruction::Add ||
- I->getOpcode() == Instruction::Mul) {
+ if (I->getOpcode() == Instruction::Add || I->getOpcode() == Instruction::Or ||
+ I->getOpcode() == Instruction::Mul ||
+ I->getOpcode() == Instruction::Shl) {
Optional<int64_t> Op0 = getIfConst(I->getOperand(0));
Optional<int64_t> Op1 = getIfConst(I->getOperand(1));
if (!Op0 || !Op1)
@@ -351,18 +353,30 @@ Optional<int64_t> MVEGatherScatterLowering::getIfConst(const Value *V) {
return Optional<int64_t>{Op0.getValue() + Op1.getValue()};
if (I->getOpcode() == Instruction::Mul)
return Optional<int64_t>{Op0.getValue() * Op1.getValue()};
+ if (I->getOpcode() == Instruction::Shl)
+ return Optional<int64_t>{Op0.getValue() << Op1.getValue()};
+ if (I->getOpcode() == Instruction::Or)
+ return Optional<int64_t>{Op0.getValue() | Op1.getValue()};
}
return Optional<int64_t>{};
}
+// Return true if I is an Or instruction that is equivalent to an add, due to
+// the operands having no common bits set.
+static bool isAddLikeOr(Instruction *I, const DataLayout &DL) {
+ return I->getOpcode() == Instruction::Or &&
+ haveNoCommonBitsSet(I->getOperand(0), I->getOperand(1), DL);
+}
+
std::pair<Value *, int64_t>
MVEGatherScatterLowering::getVarAndConst(Value *Inst, int TypeScale) {
std::pair<Value *, int64_t> ReturnFalse =
std::pair<Value *, int64_t>(nullptr, 0);
- // At this point, the instruction we're looking at must be an add or we
- // bail out
+ // At this point, the instruction we're looking at must be an add or an
+ // add-like-or.
Instruction *Add = dyn_cast<Instruction>(Inst);
- if (Add == nullptr || Add->getOpcode() != Instruction::Add)
+ if (Add == nullptr ||
+ (Add->getOpcode() != Instruction::Add && !isAddLikeOr(Add, *DL)))
return ReturnFalse;
Value *Summand;
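The "add-like or" condition introduced above can be demonstrated with a standalone check (illustrative only; the pass itself uses haveNoCommonBitsSet on IR values rather than concrete integers):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X < 256; ++X) {
    uint32_t Hi = X << 4; // low 4 bits known to be zero
    uint32_t Lo = 3;      // only the low 2 bits set
    assert((Hi & Lo) == 0);         // operands share no set bits...
    assert((Hi | Lo) == (Hi + Lo)); // ...so OR behaves exactly like ADD
  }
  return 0;
}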
@@ -737,10 +751,9 @@ Instruction *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
// The gep was in charge of making sure the offsets are scaled correctly
// - calculate that factor so it can be applied by hand
- DataLayout DT = I->getParent()->getParent()->getParent()->getDataLayout();
int TypeScale =
- computeScale(DT.getTypeSizeInBits(GEP->getOperand(0)->getType()),
- DT.getTypeSizeInBits(GEP->getType()) /
+ computeScale(DL->getTypeSizeInBits(GEP->getOperand(0)->getType()),
+ DL->getTypeSizeInBits(GEP->getType()) /
cast<FixedVectorType>(GEP->getType())->getNumElements());
if (TypeScale == -1)
return nullptr;
@@ -888,11 +901,11 @@ void MVEGatherScatterLowering::pushOutAdd(PHINode *&Phi,
Phi->removeIncomingValue(StartIndex);
}
-void MVEGatherScatterLowering::pushOutMul(PHINode *&Phi,
- Value *IncrementPerRound,
- Value *OffsSecondOperand,
- unsigned LoopIncrement,
- IRBuilder<> &Builder) {
+void MVEGatherScatterLowering::pushOutMulShl(unsigned Opcode, PHINode *&Phi,
+ Value *IncrementPerRound,
+ Value *OffsSecondOperand,
+ unsigned LoopIncrement,
+ IRBuilder<> &Builder) {
LLVM_DEBUG(dbgs() << "masked gathers/scatters: optimising mul instruction\n");
// Create a new scalar add outside of the loop and transform it to a splat
@@ -901,12 +914,13 @@ void MVEGatherScatterLowering::pushOutMul(PHINode *&Phi,
Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1)->back());
// Create a new index
- Value *StartIndex = BinaryOperator::Create(
- Instruction::Mul, Phi->getIncomingValue(LoopIncrement == 1 ? 0 : 1),
- OffsSecondOperand, "PushedOutMul", InsertionPoint);
+ Value *StartIndex =
+ BinaryOperator::Create((Instruction::BinaryOps)Opcode,
+ Phi->getIncomingValue(LoopIncrement == 1 ? 0 : 1),
+ OffsSecondOperand, "PushedOutMul", InsertionPoint);
Instruction *Product =
- BinaryOperator::Create(Instruction::Mul, IncrementPerRound,
+ BinaryOperator::Create((Instruction::BinaryOps)Opcode, IncrementPerRound,
OffsSecondOperand, "Product", InsertionPoint);
// Increment NewIndex by Product instead of the multiplication
Instruction *NewIncrement = BinaryOperator::Create(
@@ -923,7 +937,7 @@ void MVEGatherScatterLowering::pushOutMul(PHINode *&Phi,
// Check whether all usages of this instruction are as offsets of
// gathers/scatters or simple arithmetics only used by gathers/scatters
-static bool hasAllGatScatUsers(Instruction *I) {
+static bool hasAllGatScatUsers(Instruction *I, const DataLayout &DL) {
if (I->hasNUses(0)) {
return false;
}
@@ -936,8 +950,10 @@ static bool hasAllGatScatUsers(Instruction *I) {
return Gatscat;
} else {
unsigned OpCode = cast<Instruction>(U)->getOpcode();
- if ((OpCode == Instruction::Add || OpCode == Instruction::Mul) &&
- hasAllGatScatUsers(cast<Instruction>(U))) {
+ if ((OpCode == Instruction::Add || OpCode == Instruction::Mul ||
+ OpCode == Instruction::Shl ||
+ isAddLikeOr(cast<Instruction>(U), DL)) &&
+ hasAllGatScatUsers(cast<Instruction>(U), DL)) {
continue;
}
return false;
@@ -955,14 +971,15 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB,
if (!isa<Instruction>(Offsets))
return false;
Instruction *Offs = cast<Instruction>(Offsets);
- if (Offs->getOpcode() != Instruction::Add &&
- Offs->getOpcode() != Instruction::Mul)
+ if (Offs->getOpcode() != Instruction::Add && !isAddLikeOr(Offs, *DL) &&
+ Offs->getOpcode() != Instruction::Mul &&
+ Offs->getOpcode() != Instruction::Shl)
return false;
Loop *L = LI->getLoopFor(BB);
if (L == nullptr)
return false;
if (!Offs->hasOneUse()) {
- if (!hasAllGatScatUsers(Offs))
+ if (!hasAllGatScatUsers(Offs, *DL))
return false;
}
@@ -1060,11 +1077,13 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB,
switch (Offs->getOpcode()) {
case Instruction::Add:
+ case Instruction::Or:
pushOutAdd(NewPhi, OffsSecondOperand, IncrementingBlock == 1 ? 0 : 1);
break;
case Instruction::Mul:
- pushOutMul(NewPhi, IncrementPerRound, OffsSecondOperand, IncrementingBlock,
- Builder);
+ case Instruction::Shl:
+ pushOutMulShl(Offs->getOpcode(), NewPhi, IncrementPerRound,
+ OffsSecondOperand, IncrementingBlock, Builder);
break;
default:
return false;
@@ -1182,8 +1201,7 @@ bool MVEGatherScatterLowering::optimiseAddress(Value *Address, BasicBlock *BB,
if (!GEP)
return false;
bool Changed = false;
- if (GEP->hasOneUse() &&
- dyn_cast<GetElementPtrInst>(GEP->getPointerOperand())) {
+ if (GEP->hasOneUse() && isa<GetElementPtrInst>(GEP->getPointerOperand())) {
IRBuilder<> Builder(GEP->getContext());
Builder.SetInsertPoint(GEP);
Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
@@ -1214,6 +1232,7 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) {
if (!ST->hasMVEIntegerOps())
return false;
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DL = &F.getParent()->getDataLayout();
SmallVector<IntrinsicInst *, 4> Gathers;
SmallVector<IntrinsicInst *, 4> Scatters;
diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
index 6fa5402096a6..dc58b5427425 100644
--- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
+++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
@@ -40,6 +40,11 @@ MergeEndDec("arm-enable-merge-loopenddec", cl::Hidden,
cl::desc("Enable merging Loop End and Dec instructions."),
cl::init(true));
+static cl::opt<bool>
+SetLRPredicate("arm-set-lr-predicate", cl::Hidden,
+ cl::desc("Enable setting lr as a predicate in tail predication regions."),
+ cl::init(true));
+
namespace {
class MVETPAndVPTOptimisations : public MachineFunctionPass {
public:
@@ -434,10 +439,14 @@ bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
return false;
SmallVector<MachineInstr *, 4> VCTPs;
- for (MachineBasicBlock *BB : ML->blocks())
+ SmallVector<MachineInstr *, 4> MVEInstrs;
+ for (MachineBasicBlock *BB : ML->blocks()) {
for (MachineInstr &MI : *BB)
if (isVCTP(&MI))
VCTPs.push_back(&MI);
+ else if (findFirstVPTPredOperandIdx(MI) != -1)
+ MVEInstrs.push_back(&MI);
+ }
if (VCTPs.empty()) {
LLVM_DEBUG(dbgs() << " no VCTPs\n");
@@ -510,6 +519,16 @@ bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
MRI->constrainRegClass(CountReg, &ARM::rGPRRegClass);
LoopStart->eraseFromParent();
+ if (SetLRPredicate) {
+ // Each instruction in the loop needs to use the LR register taken from the
+ // Phi as its predicate.
+ Register LR = LoopPhi->getOperand(0).getReg();
+ for (MachineInstr *MI : MVEInstrs) {
+ int Idx = findFirstVPTPredOperandIdx(*MI);
+ MI->getOperand(Idx + 2).setReg(LR);
+ }
+ }
+
return true;
}
@@ -991,6 +1010,7 @@ bool MVETPAndVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) {
.add(MI.getOperand(1))
.addImm(ARMVCC::Then)
.add(MI.getOperand(4))
+ .add(MI.getOperand(5))
.add(MI.getOperand(2));
// Silence unused variable warning in release builds.
(void)MIBuilder;
diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index cf9e2484bab5..6a5bc9284266 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -293,14 +293,18 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
// Check for equality of TC and Ceil by calculating SCEV expression
// TC - Ceil and test it for zero.
//
- bool Zero = SE->getMinusSCEV(
- SE->getBackedgeTakenCount(L),
- SE->getUDivExpr(SE->getAddExpr(SE->getMulExpr(Ceil, VW),
- SE->getNegativeSCEV(VW)),
- VW))
- ->isZero();
-
- if (!Zero) {
+ const SCEV *Sub =
+ SE->getMinusSCEV(SE->getBackedgeTakenCount(L),
+ SE->getUDivExpr(SE->getAddExpr(SE->getMulExpr(Ceil, VW),
+ SE->getNegativeSCEV(VW)),
+ VW));
+
+ // Use context-sensitive facts about the path to the loop to refine. This
+ // matters because the backedge-taken count can incorporate context-sensitive
+ // reasoning, while our RHS just above does not.
+ Sub = SE->applyLoopGuards(Sub, L);
+
+ if (!Sub->isZero()) {
LLVM_DEBUG(dbgs() << "ARM TP: possible overflow in sub expression.\n");
return false;
}
diff --git a/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
index a7f7d75e356e..4d514f3ca444 100644
--- a/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
+++ b/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/ARMTargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
Target &llvm::getTheARMLETarget() {
diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
index ccd272a8617d..e4e95f63f0a6 100644
--- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -582,10 +582,10 @@ bool Thumb1FrameLowering::needPopSpecialFixUp(const MachineFunction &MF) const {
static void findTemporariesForLR(const BitVector &GPRsNoLRSP,
const BitVector &PopFriendly,
const LivePhysRegs &UsedRegs, unsigned &PopReg,
- unsigned &TmpReg) {
+ unsigned &TmpReg, MachineRegisterInfo &MRI) {
PopReg = TmpReg = 0;
for (auto Reg : GPRsNoLRSP.set_bits()) {
- if (!UsedRegs.contains(Reg)) {
+ if (UsedRegs.available(MRI, Reg)) {
// Remember the first pop-friendly register and exit.
if (PopFriendly.test(Reg)) {
PopReg = Reg;
@@ -693,7 +693,8 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
GPRsNoLRSP.reset(ARM::LR);
GPRsNoLRSP.reset(ARM::SP);
GPRsNoLRSP.reset(ARM::PC);
- findTemporariesForLR(GPRsNoLRSP, PopFriendly, UsedRegs, PopReg, TemporaryReg);
+ findTemporariesForLR(GPRsNoLRSP, PopFriendly, UsedRegs, PopReg, TemporaryReg,
+ MF.getRegInfo());
// If we couldn't find a pop-friendly register, try restoring LR before
// popping the other callee-saved registers, so we could use one of them as a
@@ -704,7 +705,8 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
PrevMBBI--;
if (PrevMBBI->getOpcode() == ARM::tPOP) {
UsedRegs.stepBackward(*PrevMBBI);
- findTemporariesForLR(GPRsNoLRSP, PopFriendly, UsedRegs, PopReg, TemporaryReg);
+ findTemporariesForLR(GPRsNoLRSP, PopFriendly, UsedRegs, PopReg,
+ TemporaryReg, MF.getRegInfo());
if (PopReg) {
MBBI = PrevMBBI;
UseLDRSP = true;
diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
index cf5eb4b4c0f1..4b18f5e20d40 100644
--- a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -135,6 +135,10 @@ void Thumb1InstrInfo::expandLoadStackGuard(
MachineBasicBlock::iterator MI) const {
MachineFunction &MF = *MI->getParent()->getParent();
const TargetMachine &TM = MF.getTarget();
+
+ assert(MF.getFunction().getParent()->getStackProtectorGuard() != "tls" &&
+ "TLS stack protector not supported for Thumb1 targets");
+
if (TM.isPositionIndependent())
expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_pcrel, ARM::tLDRi);
else
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index 5204e3b03e9e..bdb167a08e61 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -250,7 +250,19 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
void Thumb2InstrInfo::expandLoadStackGuard(
MachineBasicBlock::iterator MI) const {
MachineFunction &MF = *MI->getParent()->getParent();
- if (MF.getTarget().isPositionIndependent())
+ Module &M = *MF.getFunction().getParent();
+
+ if (M.getStackProtectorGuard() == "tls") {
+ expandLoadStackGuardBase(MI, ARM::t2MRC, ARM::t2LDRi12);
+ return;
+ }
+
+ const GlobalValue *GV =
+ cast<GlobalValue>((*MI->memoperands_begin())->getValue());
+
+ if (MF.getSubtarget<ARMSubtarget>().isGVInGOT(GV))
+ expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_pcrel, ARM::t2LDRi12);
+ else if (MF.getTarget().isPositionIndependent())
expandLoadStackGuardBase(MI, ARM::t2MOV_ga_pcrel, ARM::t2LDRi12);
else
expandLoadStackGuardBase(MI, ARM::t2MOVi32imm, ARM::t2LDRi12);
@@ -792,8 +804,12 @@ void llvm::recomputeVPTBlockMask(MachineInstr &Instr) {
MachineBasicBlock::iterator Iter = ++Instr.getIterator(),
End = Instr.getParent()->end();
+ while (Iter != End && Iter->isDebugInstr())
+ ++Iter;
+
// Verify that the instruction after the VPT/VPST is predicated (it should
// be), and skip it.
+ assert(Iter != End && "Expected some instructions in any VPT block");
assert(
getVPTInstrPredicate(*Iter) == ARMVCC::Then &&
"VPT/VPST should be followed by an instruction with a 'then' predicate!");
@@ -802,6 +818,10 @@ void llvm::recomputeVPTBlockMask(MachineInstr &Instr) {
// Iterate over the predicated instructions, updating the BlockMask as we go.
ARM::PredBlockMask BlockMask = ARM::PredBlockMask::T;
while (Iter != End) {
+ if (Iter->isDebugInstr()) {
+ ++Iter;
+ continue;
+ }
ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*Iter);
if (Pred == ARMVCC::None)
break;
diff --git a/llvm/lib/Target/AVR/AVR.h b/llvm/lib/Target/AVR/AVR.h
index 7332307c07a3..143c339c0664 100644
--- a/llvm/lib/Target/AVR/AVR.h
+++ b/llvm/lib/Target/AVR/AVR.h
@@ -32,8 +32,8 @@ FunctionPass *createAVRDynAllocaSRPass();
FunctionPass *createAVRBranchSelectionPass();
void initializeAVRShiftExpandPass(PassRegistry &);
-void initializeAVRExpandPseudoPass(PassRegistry&);
-void initializeAVRRelaxMemPass(PassRegistry&);
+void initializeAVRExpandPseudoPass(PassRegistry &);
+void initializeAVRRelaxMemPass(PassRegistry &);
/// Contains the AVR backend.
namespace AVR {
diff --git a/llvm/lib/Target/AVR/AVR.td b/llvm/lib/Target/AVR/AVR.td
index 53768f99df3b..22ffc4a368ad 100644
--- a/llvm/lib/Target/AVR/AVR.td
+++ b/llvm/lib/Target/AVR/AVR.td
@@ -45,8 +45,8 @@ include "AVRCallingConv.td"
//===---------------------------------------------------------------------===//
def AVRAsmWriter : AsmWriter {
- string AsmWriterClassName = "InstPrinter";
- bit isMCAsmWriter = 1;
+ string AsmWriterClassName = "InstPrinter";
+ bit isMCAsmWriter = 1;
}
//===---------------------------------------------------------------------===//
@@ -71,10 +71,9 @@ def AVRAsmParserVariant : AsmParserVariant {
//===---------------------------------------------------------------------===//
def AVR : Target {
- let InstructionSet = AVRInstrInfo;
- let AssemblyWriters = [AVRAsmWriter];
+ let InstructionSet = AVRInstrInfo;
+ let AssemblyWriters = [AVRAsmWriter];
- let AssemblyParsers = [AVRAsmParser];
+ let AssemblyParsers = [AVRAsmParser];
let AssemblyParserVariants = [AVRAsmParserVariant];
}
-
diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
index e8a13c712210..259ab1bc7aec 100644
--- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -24,11 +24,12 @@
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "avr-asm-printer"
@@ -38,9 +39,8 @@ namespace llvm {
/// An AVR assembly code printer.
class AVRAsmPrinter : public AsmPrinter {
public:
- AVRAsmPrinter(TargetMachine &TM,
- std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer)), MRI(*TM.getMCRegisterInfo()) { }
+ AVRAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)), MRI(*TM.getMCRegisterInfo()) {}
StringRef getPassName() const override { return "AVR Assembly Printer"; }
@@ -56,8 +56,13 @@ public:
const MCExpr *lowerConstant(const Constant *CV) override;
+ void emitXXStructor(const DataLayout &DL, const Constant *CV) override;
+
+ bool doFinalization(Module &M) override;
+
private:
const MCRegisterInfo &MRI;
+ bool EmittedStructorSymbolAttrs = false;
};
void AVRAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
@@ -139,9 +144,8 @@ bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
bool AVRAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
unsigned OpNum, const char *ExtraCode,
raw_ostream &O) {
- if (ExtraCode && ExtraCode[0]) {
- llvm_unreachable("This branch is not implemented yet");
- }
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier
const MachineOperand &MO = MI->getOperand(OpNum);
(void)MO;
@@ -193,9 +197,47 @@ const MCExpr *AVRAsmPrinter::lowerConstant(const Constant *CV) {
return AsmPrinter::lowerConstant(CV);
}
+void AVRAsmPrinter::emitXXStructor(const DataLayout &DL, const Constant *CV) {
+ if (!EmittedStructorSymbolAttrs) {
+ OutStreamer->emitRawComment(
+ " Emitting these undefined symbol references causes us to link the"
+ " libgcc code that runs our constructors/destructors");
+ OutStreamer->emitRawComment(" This matches GCC's behavior");
+
+ MCSymbol *CtorsSym = OutContext.getOrCreateSymbol("__do_global_ctors");
+ OutStreamer->emitSymbolAttribute(CtorsSym, MCSA_Global);
+
+ MCSymbol *DtorsSym = OutContext.getOrCreateSymbol("__do_global_dtors");
+ OutStreamer->emitSymbolAttribute(DtorsSym, MCSA_Global);
+
+ EmittedStructorSymbolAttrs = true;
+ }
+
+ AsmPrinter::emitXXStructor(DL, CV);
+}
+
+bool AVRAsmPrinter::doFinalization(Module &M) {
+ MCSymbol *DoCopyData = OutContext.getOrCreateSymbol("__do_copy_data");
+ MCSymbol *DoClearBss = OutContext.getOrCreateSymbol("__do_clear_bss");
+
+ // FIXME: We can disable __do_copy_data if there are no static RAM variables.
+
+ OutStreamer->emitRawComment(
+ " Declaring this symbol tells the CRT that it should");
+ OutStreamer->emitRawComment(
+ "copy all variables from program memory to RAM on startup");
+ OutStreamer->emitSymbolAttribute(DoCopyData, MCSA_Global);
+
+ OutStreamer->emitRawComment(
+ " Declaring this symbol tells the CRT that it should");
+ OutStreamer->emitRawComment("clear the zeroed data section on startup");
+ OutStreamer->emitSymbolAttribute(DoClearBss, MCSA_Global);
+
+ return AsmPrinter::doFinalization(M);
+}
+
} // end of namespace llvm
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRAsmPrinter() {
llvm::RegisterAsmPrinter<llvm::AVRAsmPrinter> X(llvm::getTheAVRTarget());
}
-
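
Editor's note: the two overrides added above (emitXXStructor and doFinalization) reduce to the one idea spelled out in their comments: emit a few `.global` references to undefined CRT symbols exactly once, so the linker pulls in the libgcc/CRT routines that run the structor tables, copy `.data` to RAM, and clear `.bss`. Below is a minimal sketch of that one-shot pattern in plain C++ — not the LLVM AsmPrinter/MCStreamer API; the symbol names come from the patch, while the `.word` entry and the driver in main are illustrative assumptions.

#include <iostream>
#include <string>
#include <vector>

// One-shot emission guard mirroring EmittedStructorSymbolAttrs in the patch.
class StructorPrinter {
  std::ostream &OS;
  bool EmittedStructorSymbolAttrs = false;

public:
  explicit StructorPrinter(std::ostream &OS) : OS(OS) {}

  // Called once per static constructor/destructor entry.
  void emitXXStructor(const std::string &Fn) {
    if (!EmittedStructorSymbolAttrs) {
      // Referencing these undefined symbols drags the libgcc code that walks
      // the ctor/dtor tables into the link, matching GCC's behavior.
      OS << ".global __do_global_ctors\n";
      OS << ".global __do_global_dtors\n";
      EmittedStructorSymbolAttrs = true;
    }
    OS << ".word " << Fn << "\n"; // the structor table entry itself (illustrative)
  }

  // Called once at the end of the module.
  void doFinalization() {
    OS << ".global __do_copy_data\n"; // CRT: copy .data from flash to RAM
    OS << ".global __do_clear_bss\n"; // CRT: zero the .bss section
  }
};

int main() {
  StructorPrinter P(std::cout);
  std::vector<std::string> Ctors = {"ctor_a", "ctor_b"};
  for (const std::string &C : Ctors)
    P.emitXXStructor(C);
  P.doFinalization();
}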
diff --git a/llvm/lib/Target/AVR/AVRCallingConv.td b/llvm/lib/Target/AVR/AVRCallingConv.td
index 65545e531a88..87874c5c50b2 100644
--- a/llvm/lib/Target/AVR/AVRCallingConv.td
+++ b/llvm/lib/Target/AVR/AVRCallingConv.td
@@ -14,9 +14,8 @@
//===----------------------------------------------------------------------===//
// Special return value calling convention for runtime functions.
-def RetCC_AVR_BUILTIN : CallingConv
-<[
- CCIfType<[i8], CCAssignToReg<[R24,R25]>>,
+def RetCC_AVR_BUILTIN : CallingConv<[
+ CCIfType<[i8], CCAssignToReg<[R24, R25]>>,
CCIfType<[i16], CCAssignToReg<[R23R22, R25R24]>>
]>;
@@ -27,8 +26,7 @@ def RetCC_AVR_BUILTIN : CallingConv
// The calling conventions are implemented in custom C++ code
// Calling convention for variadic functions.
-def ArgCC_AVR_Vararg : CallingConv
-<[
+def ArgCC_AVR_Vararg : CallingConv<[
// i16 are always passed through the stack with an alignment of 1.
CCAssignToStack<2, 1>
]>;
@@ -38,4 +36,4 @@ def ArgCC_AVR_Vararg : CallingConv
//===----------------------------------------------------------------------===//
def CSR_Normal : CalleeSavedRegs<(add R29, R28, (sequence "R%u", 17, 2))>;
-def CSR_Interrupts : CalleeSavedRegs<(add (sequence "R%u", 31, 0))>;
+def CSR_Interrupts : CalleeSavedRegs<(add(sequence "R%u", 31, 0))>;
diff --git a/llvm/lib/Target/AVR/AVRDevices.td b/llvm/lib/Target/AVR/AVRDevices.td
index 9507aa40c3d8..7ad0fe904a81 100644
--- a/llvm/lib/Target/AVR/AVRDevices.td
+++ b/llvm/lib/Target/AVR/AVRDevices.td
@@ -7,19 +7,18 @@
// In reality, avr1 (no SRAM) has one variant each of `LD` and `ST`.
// avr2 (with SRAM) adds the rest of the variants.
-
// A feature set aggregates features, grouping them. We don't want to create a
// new member in AVRSubtarget (to store a value) for each set because we do not
// care if the set is supported, only the subfeatures inside the set. We fix
// this by simply setting the same dummy member for all feature sets, which is
// then ignored.
class FeatureSet<string name, string desc, list<SubtargetFeature> i>
- : SubtargetFeature<name, "m_FeatureSetDummy", "true", desc, i>;
+ : SubtargetFeature<name, "m_FeatureSetDummy", "true", desc, i>;
// A family of microcontrollers, defining a set of supported features.
class Family<string name, list<SubtargetFeature> i>
- : FeatureSet<name, !strconcat("The device is a part of the ",
- name, " family"), i>;
+ : FeatureSet<
+ name, !strconcat("The device is a part of the ", name, " family"), i>;
// The device has SRAM, and supports the bare minimum of
// SRAM-relevant instructions.
@@ -32,122 +31,122 @@ class Family<string name, list<SubtargetFeature> i>
// `LDS Rd, K`
// `STS k, Rr`
// `PUSH`/`POP`
-def FeatureSRAM : SubtargetFeature<"sram", "m_hasSRAM", "true",
- "The device has random access memory">;
+def FeatureSRAM : SubtargetFeature<"sram", "m_hasSRAM", "true",
+ "The device has random access memory">;
// The device supports the `JMP k` and `CALL k` instructions.
-def FeatureJMPCALL : SubtargetFeature<"jmpcall", "m_hasJMPCALL", "true",
- "The device supports the `JMP` and "
- "`CALL` instructions">;
-
+def FeatureJMPCALL : SubtargetFeature<"jmpcall", "m_hasJMPCALL", "true",
+ "The device supports the `JMP` and "
+ "`CALL` instructions">;
// The device supports the indirect branches `IJMP` and `ICALL`.
-def FeatureIJMPCALL : SubtargetFeature<"ijmpcall", "m_hasIJMPCALL",
- "true",
- "The device supports `IJMP`/`ICALL`"
- "instructions">;
+def FeatureIJMPCALL : SubtargetFeature<"ijmpcall", "m_hasIJMPCALL", "true",
+ "The device supports `IJMP`/`ICALL`"
+ "instructions">;
// The device supports the extended indirect branches `EIJMP` and `EICALL`.
-def FeatureEIJMPCALL : SubtargetFeature<"eijmpcall", "m_hasEIJMPCALL",
- "true", "The device supports the "
- "`EIJMP`/`EICALL` instructions">;
+def FeatureEIJMPCALL : SubtargetFeature<"eijmpcall", "m_hasEIJMPCALL", "true",
+ "The device supports the "
+ "`EIJMP`/`EICALL` instructions">;
// The device supports `ADDI Rd, K`, `SUBI Rd, K`.
-def FeatureADDSUBIW : SubtargetFeature<"addsubiw", "m_hasADDSUBIW",
- "true", "Enable 16-bit register-immediate "
- "addition and subtraction instructions">;
+def FeatureADDSUBIW : SubtargetFeature<"addsubiw", "m_hasADDSUBIW", "true",
+ "Enable 16-bit register-immediate "
+ "addition and subtraction instructions">;
// The device has an 8-bit stack pointer (SP) register.
-def FeatureSmallStack : SubtargetFeature<"smallstack", "m_hasSmallStack",
- "true", "The device has an 8-bit "
- "stack pointer">;
+def FeatureSmallStack
+ : SubtargetFeature<"smallstack", "m_hasSmallStack", "true",
+ "The device has an 8-bit "
+ "stack pointer">;
// The device supports the 16-bit GPR pair MOVW instruction.
-def FeatureMOVW : SubtargetFeature<"movw", "m_hasMOVW", "true",
- "The device supports the 16-bit MOVW "
- "instruction">;
+def FeatureMOVW : SubtargetFeature<"movw", "m_hasMOVW", "true",
+ "The device supports the 16-bit MOVW "
+ "instruction">;
// The device supports the `LPM` instruction, with implied destination being r0.
-def FeatureLPM : SubtargetFeature<"lpm", "m_hasLPM", "true",
+def FeatureLPM : SubtargetFeature<"lpm", "m_hasLPM", "true",
"The device supports the `LPM` instruction">;
// The device supports the `LPM Rd, Z[+]` instruction.
-def FeatureLPMX : SubtargetFeature<"lpmx", "m_hasLPMX", "true",
- "The device supports the `LPM Rd, Z[+]` "
- "instruction">;
+def FeatureLPMX : SubtargetFeature<"lpmx", "m_hasLPMX", "true",
+ "The device supports the `LPM Rd, Z[+]` "
+ "instruction">;
// The device supports the `ELPM` instruction.
-def FeatureELPM : SubtargetFeature<"elpm", "m_hasELPM", "true",
- "The device supports the ELPM instruction">;
+def FeatureELPM : SubtargetFeature<"elpm", "m_hasELPM", "true",
+ "The device supports the ELPM instruction">;
// The device supports the `ELPM Rd, Z[+]` instructions.
-def FeatureELPMX : SubtargetFeature<"elpmx", "m_hasELPMX", "true",
- "The device supports the `ELPM Rd, Z[+]` "
- "instructions">;
+def FeatureELPMX : SubtargetFeature<"elpmx", "m_hasELPMX", "true",
+ "The device supports the `ELPM Rd, Z[+]` "
+ "instructions">;
// The device supports the `SPM` instruction.
-def FeatureSPM : SubtargetFeature<"spm", "m_hasSPM", "true",
+def FeatureSPM : SubtargetFeature<"spm", "m_hasSPM", "true",
"The device supports the `SPM` instruction">;
// The device supports the `SPM Z+` instruction.
-def FeatureSPMX : SubtargetFeature<"spmx", "m_hasSPMX", "true",
- "The device supports the `SPM Z+` "
- "instruction">;
+def FeatureSPMX : SubtargetFeature<"spmx", "m_hasSPMX", "true",
+ "The device supports the `SPM Z+` "
+ "instruction">;
// The device supports the `DES k` instruction.
-def FeatureDES : SubtargetFeature<"des", "m_hasDES", "true",
+def FeatureDES : SubtargetFeature<"des", "m_hasDES", "true",
"The device supports the `DES k` encryption "
"instruction">;
// The device supports the Read-Write-Modify instructions
// XCH, LAS, LAC, and LAT.
-def FeatureRMW : SubtargetFeature<"rmw", "m_supportsRMW", "true",
+def FeatureRMW : SubtargetFeature<"rmw", "m_supportsRMW", "true",
"The device supports the read-write-modify "
"instructions: XCH, LAS, LAC, LAT">;
// The device supports the `[F]MUL[S][U]` family of instructions.
-def FeatureMultiplication : SubtargetFeature<"mul", "m_supportsMultiplication",
- "true", "The device supports the "
- "multiplication instructions">;
+def FeatureMultiplication
+ : SubtargetFeature<"mul", "m_supportsMultiplication", "true",
+ "The device supports the "
+ "multiplication instructions">;
// The device supports the `BREAK` instruction.
-def FeatureBREAK : SubtargetFeature<"break", "m_hasBREAK", "true",
- "The device supports the `BREAK` debugging "
- "instruction">;
+def FeatureBREAK : SubtargetFeature<"break", "m_hasBREAK", "true",
+ "The device supports the `BREAK` debugging "
+ "instruction">;
// The device has instruction encodings specific to the Tiny core.
-def FeatureTinyEncoding : SubtargetFeature<"tinyencoding",
- "m_hasTinyEncoding", "true",
- "The device has Tiny core specific "
- "instruction encodings">;
+def FeatureTinyEncoding
+ : SubtargetFeature<"tinyencoding", "m_hasTinyEncoding", "true",
+ "The device has Tiny core specific "
+ "instruction encodings">;
// The device has CPU registers mapped in data address space
-def FeatureMMR : SubtargetFeature<"memmappedregs", "m_hasMemMappedGPR",
- "true", "The device has CPU registers "
+def FeatureMMR : SubtargetFeature<"memmappedregs", "m_hasMemMappedGPR", "true",
+ "The device has CPU registers "
"mapped in data address space">;
-class ELFArch<string name> : SubtargetFeature<"", "ELFArch",
- !strconcat("ELF::",name), "">;
+class ELFArch<string name>
+ : SubtargetFeature<"", "ELFArch", !strconcat("ELF::", name), "">;
// ELF e_flags architecture values
-def ELFArchAVR1 : ELFArch<"EF_AVR_ARCH_AVR1">;
-def ELFArchAVR2 : ELFArch<"EF_AVR_ARCH_AVR2">;
-def ELFArchAVR25 : ELFArch<"EF_AVR_ARCH_AVR25">;
-def ELFArchAVR3 : ELFArch<"EF_AVR_ARCH_AVR3">;
-def ELFArchAVR31 : ELFArch<"EF_AVR_ARCH_AVR31">;
-def ELFArchAVR35 : ELFArch<"EF_AVR_ARCH_AVR35">;
-def ELFArchAVR4 : ELFArch<"EF_AVR_ARCH_AVR4">;
-def ELFArchAVR5 : ELFArch<"EF_AVR_ARCH_AVR5">;
-def ELFArchAVR51 : ELFArch<"EF_AVR_ARCH_AVR51">;
-def ELFArchAVR6 : ELFArch<"EF_AVR_ARCH_AVR6">;
-def ELFArchTiny : ELFArch<"EF_AVR_ARCH_AVRTINY">;
-def ELFArchXMEGA1 : ELFArch<"EF_AVR_ARCH_XMEGA1">;
-def ELFArchXMEGA2 : ELFArch<"EF_AVR_ARCH_XMEGA2">;
-def ELFArchXMEGA3 : ELFArch<"EF_AVR_ARCH_XMEGA3">;
-def ELFArchXMEGA4 : ELFArch<"EF_AVR_ARCH_XMEGA4">;
-def ELFArchXMEGA5 : ELFArch<"EF_AVR_ARCH_XMEGA5">;
-def ELFArchXMEGA6 : ELFArch<"EF_AVR_ARCH_XMEGA6">;
-def ELFArchXMEGA7 : ELFArch<"EF_AVR_ARCH_XMEGA7">;
+def ELFArchAVR1 : ELFArch<"EF_AVR_ARCH_AVR1">;
+def ELFArchAVR2 : ELFArch<"EF_AVR_ARCH_AVR2">;
+def ELFArchAVR25 : ELFArch<"EF_AVR_ARCH_AVR25">;
+def ELFArchAVR3 : ELFArch<"EF_AVR_ARCH_AVR3">;
+def ELFArchAVR31 : ELFArch<"EF_AVR_ARCH_AVR31">;
+def ELFArchAVR35 : ELFArch<"EF_AVR_ARCH_AVR35">;
+def ELFArchAVR4 : ELFArch<"EF_AVR_ARCH_AVR4">;
+def ELFArchAVR5 : ELFArch<"EF_AVR_ARCH_AVR5">;
+def ELFArchAVR51 : ELFArch<"EF_AVR_ARCH_AVR51">;
+def ELFArchAVR6 : ELFArch<"EF_AVR_ARCH_AVR6">;
+def ELFArchTiny : ELFArch<"EF_AVR_ARCH_AVRTINY">;
+def ELFArchXMEGA1 : ELFArch<"EF_AVR_ARCH_XMEGA1">;
+def ELFArchXMEGA2 : ELFArch<"EF_AVR_ARCH_XMEGA2">;
+def ELFArchXMEGA3 : ELFArch<"EF_AVR_ARCH_XMEGA3">;
+def ELFArchXMEGA4 : ELFArch<"EF_AVR_ARCH_XMEGA4">;
+def ELFArchXMEGA5 : ELFArch<"EF_AVR_ARCH_XMEGA5">;
+def ELFArchXMEGA6 : ELFArch<"EF_AVR_ARCH_XMEGA6">;
+def ELFArchXMEGA7 : ELFArch<"EF_AVR_ARCH_XMEGA7">;
//===---------------------------------------------------------------------===//
// AVR Families
@@ -155,68 +154,64 @@ def ELFArchXMEGA7 : ELFArch<"EF_AVR_ARCH_XMEGA7">;
// The device has at least the bare minimum that **every** single AVR
// device should have.
-def FamilyAVR0 : Family<"avr0", []>;
+def FamilyAVR0 : Family<"avr0", []>;
-def FamilyAVR1 : Family<"avr1", [FamilyAVR0, FeatureLPM, FeatureMMR]>;
+def FamilyAVR1 : Family<"avr1", [FamilyAVR0, FeatureLPM, FeatureMMR]>;
-def FamilyAVR2 : Family<"avr2",
- [FamilyAVR1, FeatureIJMPCALL, FeatureADDSUBIW,
- FeatureSRAM]>;
+def FamilyAVR2
+ : Family<"avr2",
+ [FamilyAVR1, FeatureIJMPCALL, FeatureADDSUBIW, FeatureSRAM]>;
-def FamilyAVR25 : Family<"avr25",
- [FamilyAVR2, FeatureMOVW, FeatureLPMX,
- FeatureSPM, FeatureBREAK]>;
+def FamilyAVR25
+ : Family<"avr25",
+ [FamilyAVR2, FeatureMOVW, FeatureLPMX, FeatureSPM, FeatureBREAK]>;
-def FamilyAVR3 : Family<"avr3",
- [FamilyAVR2, FeatureJMPCALL]>;
+def FamilyAVR3 : Family<"avr3", [FamilyAVR2, FeatureJMPCALL]>;
-def FamilyAVR31 : Family<"avr31",
- [FamilyAVR3, FeatureELPM]>;
+def FamilyAVR31 : Family<"avr31", [FamilyAVR3, FeatureELPM]>;
-def FamilyAVR35 : Family<"avr35",
- [FamilyAVR3, FeatureMOVW, FeatureLPMX,
- FeatureSPM, FeatureBREAK]>;
+def FamilyAVR35
+ : Family<"avr35",
+ [FamilyAVR3, FeatureMOVW, FeatureLPMX, FeatureSPM, FeatureBREAK]>;
-def FamilyAVR4 : Family<"avr4",
- [FamilyAVR2, FeatureMultiplication,
- FeatureMOVW, FeatureLPMX, FeatureSPM,
- FeatureBREAK]>;
+def FamilyAVR4 : Family<"avr4", [
+ FamilyAVR2, FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM,
+ FeatureBREAK
+]>;
-def FamilyAVR5 : Family<"avr5",
- [FamilyAVR3, FeatureMultiplication,
- FeatureMOVW, FeatureLPMX, FeatureSPM,
- FeatureBREAK]>;
+def FamilyAVR5 : Family<"avr5", [
+ FamilyAVR3, FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM,
+ FeatureBREAK
+]>;
-def FamilyAVR51 : Family<"avr51",
- [FamilyAVR5, FeatureELPM, FeatureELPMX]>;
+def FamilyAVR51 : Family<"avr51", [FamilyAVR5, FeatureELPM, FeatureELPMX]>;
-def FamilyAVR6 : Family<"avr6",
- [FamilyAVR51]>;
+def FamilyAVR6 : Family<"avr6", [FamilyAVR51]>;
-def FamilyTiny : Family<"avrtiny",
- [FamilyAVR0, FeatureBREAK, FeatureSRAM,
- FeatureTinyEncoding]>;
+def FamilyTiny
+ : Family<"avrtiny",
+ [FamilyAVR0, FeatureBREAK, FeatureSRAM, FeatureTinyEncoding]>;
-def FamilyXMEGA : Family<"xmega",
- [FamilyAVR0, FeatureLPM, FeatureIJMPCALL, FeatureADDSUBIW,
- FeatureSRAM, FeatureJMPCALL, FeatureMultiplication,
- FeatureMOVW, FeatureLPMX, FeatureSPM,
- FeatureBREAK, FeatureEIJMPCALL, FeatureSPMX,
- FeatureDES, FeatureELPM, FeatureELPMX]>;
+def FamilyXMEGA : Family<"xmega", [
+ FamilyAVR0, FeatureLPM, FeatureIJMPCALL, FeatureADDSUBIW, FeatureSRAM,
+ FeatureJMPCALL, FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM,
+ FeatureBREAK, FeatureEIJMPCALL, FeatureSPMX, FeatureDES, FeatureELPM,
+ FeatureELPMX
+]>;
-def FamilyXMEGAU : Family<"xmegau",
- [FamilyXMEGA, FeatureRMW]>;
+def FamilyXMEGAU : Family<"xmegau", [FamilyXMEGA, FeatureRMW]>;
-def FeatureSetSpecial : FeatureSet<"special",
- "Enable use of the entire instruction "
- "set - used for debugging",
- [FeatureSRAM, FeatureJMPCALL,
- FeatureIJMPCALL, FeatureEIJMPCALL,
- FeatureADDSUBIW, FeatureMOVW,
- FeatureLPM, FeatureLPMX, FeatureELPM,
- FeatureELPMX, FeatureSPM, FeatureSPMX,
- FeatureDES, FeatureRMW,
- FeatureMultiplication, FeatureBREAK, FeatureMMR]>;
+def FeatureSetSpecial
+ : FeatureSet<"special",
+ "Enable use of the entire instruction "
+ "set - used for debugging",
+ [
+ FeatureSRAM, FeatureJMPCALL, FeatureIJMPCALL,
+ FeatureEIJMPCALL, FeatureADDSUBIW, FeatureMOVW, FeatureLPM,
+ FeatureLPMX, FeatureELPM, FeatureELPMX, FeatureSPM,
+ FeatureSPMX, FeatureDES, FeatureRMW, FeatureMultiplication,
+ FeatureBREAK, FeatureMMR
+ ]>;
//===---------------------------------------------------------------------===//
// AVR microcontrollers supported.
@@ -224,284 +219,307 @@ def FeatureSetSpecial : FeatureSet<"special",
class Device<string Name, Family Fam, ELFArch Arch,
list<SubtargetFeature> ExtraFeatures = []>
- : Processor<Name, NoItineraries, !listconcat([Fam,Arch],ExtraFeatures)>;
+ : Processor<Name, NoItineraries, !listconcat([Fam, Arch], ExtraFeatures)>;
// Generic MCUs
// Note that several versions of GCC have strange ELF architecture
// settings for backwards compatibility - see `gas/config/tc-avr.c`
// in AVR binutils. We do not replicate this.
-def : Device<"avr1", FamilyAVR1, ELFArchAVR1>;
-def : Device<"avr2", FamilyAVR2, ELFArchAVR2>;
-def : Device<"avr25", FamilyAVR25, ELFArchAVR25>;
-def : Device<"avr3", FamilyAVR3, ELFArchAVR3>;
-def : Device<"avr31", FamilyAVR31, ELFArchAVR31>;
-def : Device<"avr35", FamilyAVR35, ELFArchAVR35>;
-def : Device<"avr4", FamilyAVR4, ELFArchAVR4>;
-def : Device<"avr5", FamilyAVR5, ELFArchAVR5>;
-def : Device<"avr51", FamilyAVR51, ELFArchAVR51>;
-def : Device<"avr6", FamilyAVR6, ELFArchAVR6>;
-def : Device<"avrxmega1", FamilyXMEGA, ELFArchXMEGA1>;
-def : Device<"avrxmega2", FamilyXMEGA, ELFArchXMEGA2>;
-def : Device<"avrxmega3", FamilyXMEGA, ELFArchXMEGA3>;
-def : Device<"avrxmega4", FamilyXMEGA, ELFArchXMEGA4>;
-def : Device<"avrxmega5", FamilyXMEGA, ELFArchXMEGA5>;
-def : Device<"avrxmega6", FamilyXMEGA, ELFArchXMEGA6>;
-def : Device<"avrxmega7", FamilyXMEGA, ELFArchXMEGA7>;
-def : Device<"avrtiny", FamilyTiny, ELFArchTiny>;
+def : Device<"avr1", FamilyAVR1, ELFArchAVR1>;
+def : Device<"avr2", FamilyAVR2, ELFArchAVR2>;
+def : Device<"avr25", FamilyAVR25, ELFArchAVR25>;
+def : Device<"avr3", FamilyAVR3, ELFArchAVR3>;
+def : Device<"avr31", FamilyAVR31, ELFArchAVR31>;
+def : Device<"avr35", FamilyAVR35, ELFArchAVR35>;
+def : Device<"avr4", FamilyAVR4, ELFArchAVR4>;
+def : Device<"avr5", FamilyAVR5, ELFArchAVR5>;
+def : Device<"avr51", FamilyAVR51, ELFArchAVR51>;
+def : Device<"avr6", FamilyAVR6, ELFArchAVR6>;
+def : Device<"avrxmega1", FamilyXMEGA, ELFArchXMEGA1>;
+def : Device<"avrxmega2", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"avrxmega3", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"avrxmega4", FamilyXMEGA, ELFArchXMEGA4>;
+def : Device<"avrxmega5", FamilyXMEGA, ELFArchXMEGA5>;
+def : Device<"avrxmega6", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"avrxmega7", FamilyXMEGA, ELFArchXMEGA7>;
+def : Device<"avrtiny", FamilyTiny, ELFArchTiny>;
// Specific MCUs
-def : Device<"at90s1200", FamilyAVR0, ELFArchAVR1>;
-def : Device<"attiny11", FamilyAVR1, ELFArchAVR1>;
-def : Device<"attiny12", FamilyAVR1, ELFArchAVR1>;
-def : Device<"attiny15", FamilyAVR1, ELFArchAVR1>;
-def : Device<"attiny28", FamilyAVR1, ELFArchAVR1>;
-def : Device<"at90s2313", FamilyAVR2, ELFArchAVR2>;
-def : Device<"at90s2323", FamilyAVR2, ELFArchAVR2>;
-def : Device<"at90s2333", FamilyAVR2, ELFArchAVR2>;
-def : Device<"at90s2343", FamilyAVR2, ELFArchAVR2>;
-def : Device<"attiny22", FamilyAVR2, ELFArchAVR2>;
-def : Device<"attiny26", FamilyAVR2, ELFArchAVR2, [FeatureLPMX]>;
-def : Device<"at86rf401", FamilyAVR2, ELFArchAVR25,
- [FeatureMOVW, FeatureLPMX]>;
-def : Device<"at90s4414", FamilyAVR2, ELFArchAVR2>;
-def : Device<"at90s4433", FamilyAVR2, ELFArchAVR2>;
-def : Device<"at90s4434", FamilyAVR2, ELFArchAVR2>;
-def : Device<"at90s8515", FamilyAVR2, ELFArchAVR2>;
-def : Device<"at90c8534", FamilyAVR2, ELFArchAVR2>;
-def : Device<"at90s8535", FamilyAVR2, ELFArchAVR2>;
-def : Device<"ata5272", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny13", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny13a", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny2313", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny2313a", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny24", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny24a", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny4313", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny44", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny44a", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny84", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny84a", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny25", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny45", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny85", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny261", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny261a", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny441", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny461", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny461a", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny841", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny861", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny861a", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny87", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny43u", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny48", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny88", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny828", FamilyAVR25, ELFArchAVR25>;
-def : Device<"at43usb355", FamilyAVR3, ELFArchAVR3>;
-def : Device<"at76c711", FamilyAVR3, ELFArchAVR3>;
-def : Device<"atmega103", FamilyAVR31, ELFArchAVR31>;
-def : Device<"at43usb320", FamilyAVR31, ELFArchAVR31>;
-def : Device<"attiny167", FamilyAVR35, ELFArchAVR35>;
-def : Device<"at90usb82", FamilyAVR35, ELFArchAVR35>;
-def : Device<"at90usb162", FamilyAVR35, ELFArchAVR35>;
-def : Device<"ata5505", FamilyAVR35, ELFArchAVR35>;
-def : Device<"atmega8u2", FamilyAVR35, ELFArchAVR35>;
-def : Device<"atmega16u2", FamilyAVR35, ELFArchAVR35>;
-def : Device<"atmega32u2", FamilyAVR35, ELFArchAVR35>;
-def : Device<"attiny1634", FamilyAVR35, ELFArchAVR35>;
-def : Device<"atmega8", FamilyAVR2, ELFArchAVR4,
+def : Device<"at90s1200", FamilyAVR0, ELFArchAVR1>;
+def : Device<"attiny11", FamilyAVR1, ELFArchAVR1>;
+def : Device<"attiny12", FamilyAVR1, ELFArchAVR1>;
+def : Device<"attiny15", FamilyAVR1, ELFArchAVR1>;
+def : Device<"attiny28", FamilyAVR1, ELFArchAVR1>;
+def : Device<"at90s2313", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s2323", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s2333", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s2343", FamilyAVR2, ELFArchAVR2>;
+def : Device<"attiny22", FamilyAVR2, ELFArchAVR2>;
+def : Device<"attiny26", FamilyAVR2, ELFArchAVR2, [FeatureLPMX]>;
+def : Device<"at86rf401", FamilyAVR2, ELFArchAVR25, [FeatureMOVW, FeatureLPMX]>;
+def : Device<"at90s4414", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s4433", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s4434", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s8515", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90c8534", FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s8535", FamilyAVR2, ELFArchAVR2>;
+def : Device<"ata5272", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny13", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny13a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny2313", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny2313a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny24", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny24a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny4313", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny44", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny44a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny84", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny84a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny25", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny45", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny85", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny261", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny261a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny441", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny461", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny461a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny841", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny861", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny861a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny87", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny43u", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny48", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny88", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny828", FamilyAVR25, ELFArchAVR25>;
+def : Device<"at43usb355", FamilyAVR3, ELFArchAVR3>;
+def : Device<"at76c711", FamilyAVR3, ELFArchAVR3>;
+def : Device<"atmega103", FamilyAVR31, ELFArchAVR31>;
+def : Device<"at43usb320", FamilyAVR31, ELFArchAVR31>;
+def : Device<"attiny167", FamilyAVR35, ELFArchAVR35>;
+def : Device<"at90usb82", FamilyAVR35, ELFArchAVR35>;
+def : Device<"at90usb162", FamilyAVR35, ELFArchAVR35>;
+def : Device<"ata5505", FamilyAVR35, ELFArchAVR35>;
+def : Device<"atmega8u2", FamilyAVR35, ELFArchAVR35>;
+def : Device<"atmega16u2", FamilyAVR35, ELFArchAVR35>;
+def : Device<"atmega32u2", FamilyAVR35, ELFArchAVR35>;
+def : Device<"attiny1634", FamilyAVR35, ELFArchAVR35>;
+def : Device<"atmega8", FamilyAVR2, ELFArchAVR4,
[FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
-def : Device<"ata6289", FamilyAVR4, ELFArchAVR4>;
-def : Device<"atmega8a", FamilyAVR2, ELFArchAVR4,
+def : Device<"ata6289", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega8a", FamilyAVR2, ELFArchAVR4,
[FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
-def : Device<"ata6285", FamilyAVR4, ELFArchAVR4>;
-def : Device<"ata6286", FamilyAVR4, ELFArchAVR4>;
-def : Device<"atmega48", FamilyAVR4, ELFArchAVR4>;
-def : Device<"atmega48a", FamilyAVR4, ELFArchAVR4>;
-def : Device<"atmega48pa", FamilyAVR4, ELFArchAVR4>;
-def : Device<"atmega48pb", FamilyAVR4, ELFArchAVR4>;
-def : Device<"atmega48p", FamilyAVR4, ELFArchAVR4>;
-def : Device<"atmega88", FamilyAVR4, ELFArchAVR4>;
-def : Device<"atmega88a", FamilyAVR4, ELFArchAVR4>;
-def : Device<"atmega88p", FamilyAVR4, ELFArchAVR4>;
-def : Device<"atmega88pa", FamilyAVR4, ELFArchAVR4>;
-def : Device<"atmega88pb", FamilyAVR4, ELFArchAVR4>;
-def : Device<"atmega8515", FamilyAVR2, ELFArchAVR4,
+def : Device<"ata6285", FamilyAVR4, ELFArchAVR4>;
+def : Device<"ata6286", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega48", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega48a", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega48pa", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega48pb", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega48p", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega88", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega88a", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega88p", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega88pa", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega88pb", FamilyAVR4, ELFArchAVR4>;
+def : Device<"atmega8515", FamilyAVR2, ELFArchAVR4,
[FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
-def : Device<"atmega8535", FamilyAVR2, ELFArchAVR4,
+def : Device<"atmega8535", FamilyAVR2, ELFArchAVR4,
[FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
-def : Device<"atmega8hva", FamilyAVR4, ELFArchAVR4>;
-def : Device<"at90pwm1", FamilyAVR4, ELFArchAVR4>;
-def : Device<"at90pwm2", FamilyAVR4, ELFArchAVR4>;
-def : Device<"at90pwm2b", FamilyAVR4, ELFArchAVR4>;
-def : Device<"at90pwm3", FamilyAVR4, ELFArchAVR4>;
-def : Device<"at90pwm3b", FamilyAVR4, ELFArchAVR4>;
-def : Device<"at90pwm81", FamilyAVR4, ELFArchAVR4>;
-def : Device<"ata5790", FamilyAVR5, ELFArchAVR5>;
-def : Device<"ata5795", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega16", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega16a", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega161", FamilyAVR3, ELFArchAVR5,
+def : Device<"atmega8hva", FamilyAVR4, ELFArchAVR4>;
+def : Device<"at90pwm1", FamilyAVR4, ELFArchAVR4>;
+def : Device<"at90pwm2", FamilyAVR4, ELFArchAVR4>;
+def : Device<"at90pwm2b", FamilyAVR4, ELFArchAVR4>;
+def : Device<"at90pwm3", FamilyAVR4, ELFArchAVR4>;
+def : Device<"at90pwm3b", FamilyAVR4, ELFArchAVR4>;
+def : Device<"at90pwm81", FamilyAVR4, ELFArchAVR4>;
+def : Device<"ata5790", FamilyAVR5, ELFArchAVR5>;
+def : Device<"ata5795", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega161", FamilyAVR3, ELFArchAVR5,
[FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
-def : Device<"atmega162", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega163", FamilyAVR3, ELFArchAVR5,
+def : Device<"atmega162", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega163", FamilyAVR3, ELFArchAVR5,
[FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
-def : Device<"atmega164a", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega164p", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega164pa", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega165", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega165a", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega165p", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega165pa", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega168", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega168a", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega168p", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega168pa", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega168pb", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega169", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega169a", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega169p", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega169pa", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega32", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega32a", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega323", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega324a", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega324p", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega324pa", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega324pb", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega325", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega325a", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega325p", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega325pa", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega3250", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega3250a", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega3250p", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega3250pa", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega328", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega328p", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega328pb", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega329", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega329a", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega329p", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega329pa", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega3290", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega3290a", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega3290p", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega3290pa", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega406", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega64", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega64a", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega640", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega644", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega644a", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega644p", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega644pa", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega645", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega645a", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega645p", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega649", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega649a", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega649p", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega6450", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega6450a", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega6450p", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega6490", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega6490a", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega6490p", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega64rfr2", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega644rfr2", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega16hva", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega16hva2", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega16hvb", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega16hvbrevb", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega32hvb", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega32hvbrevb", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega64hve", FamilyAVR5, ELFArchAVR5>;
-def : Device<"at90can32", FamilyAVR5, ELFArchAVR5>;
-def : Device<"at90can64", FamilyAVR5, ELFArchAVR5>;
-def : Device<"at90pwm161", FamilyAVR5, ELFArchAVR5>;
-def : Device<"at90pwm216", FamilyAVR5, ELFArchAVR5>;
-def : Device<"at90pwm316", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega32c1", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega64c1", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega16m1", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega32m1", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega64m1", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega16u4", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega32u4", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega32u6", FamilyAVR5, ELFArchAVR5>;
-def : Device<"at90usb646", FamilyAVR5, ELFArchAVR5>;
-def : Device<"at90usb647", FamilyAVR5, ELFArchAVR5>;
-def : Device<"at90scr100", FamilyAVR5, ELFArchAVR5>;
-def : Device<"at94k", FamilyAVR3, ELFArchAVR5,
+def : Device<"atmega164a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega164p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega164pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega165", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega165a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega165p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega165pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega168", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega168a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega168p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega168pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega168pb", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega169", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega169a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega169p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega169pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega323", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega324a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega324p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega324pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega324pb", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega325", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega325a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega325p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega325pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3250", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3250a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3250p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3250pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega328", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega328p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega328pb", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega329", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega329a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega329p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega329pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3290", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3290a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3290p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega3290pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega406", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega64", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega64a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega640", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega644", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega644a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega644p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega644pa", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega645", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega645a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega645p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega649", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega649a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega649p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega6450", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega6450a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega6450p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega6490", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega6490a", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega6490p", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega64rfr2", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega644rfr2", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16hva", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16hva2", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16hvb", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16hvbrevb", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32hvb", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32hvbrevb", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega64hve", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90can32", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90can64", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90pwm161", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90pwm216", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90pwm316", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32c1", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega64c1", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16m1", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32m1", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega64m1", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega16u4", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32u4", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega32u6", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90usb646", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90usb647", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at90scr100", FamilyAVR5, ELFArchAVR5>;
+def : Device<"at94k", FamilyAVR3, ELFArchAVR5,
[FeatureMultiplication, FeatureMOVW, FeatureLPMX]>;
-def : Device<"m3000", FamilyAVR5, ELFArchAVR5>;
-def : Device<"atmega128", FamilyAVR51, ELFArchAVR51>;
-def : Device<"atmega128a", FamilyAVR51, ELFArchAVR51>;
-def : Device<"atmega1280", FamilyAVR51, ELFArchAVR51>;
-def : Device<"atmega1281", FamilyAVR51, ELFArchAVR51>;
-def : Device<"atmega1284", FamilyAVR51, ELFArchAVR51>;
-def : Device<"atmega1284p", FamilyAVR51, ELFArchAVR51>;
-def : Device<"atmega128rfa1", FamilyAVR51, ELFArchAVR51>;
-def : Device<"atmega128rfr2", FamilyAVR51, ELFArchAVR51>;
-def : Device<"atmega1284rfr2", FamilyAVR51, ELFArchAVR51>;
-def : Device<"at90can128", FamilyAVR51, ELFArchAVR51>;
-def : Device<"at90usb1286", FamilyAVR51, ELFArchAVR51>;
-def : Device<"at90usb1287", FamilyAVR51, ELFArchAVR51>;
-def : Device<"atmega2560", FamilyAVR6, ELFArchAVR6>;
-def : Device<"atmega2561", FamilyAVR6, ELFArchAVR6>;
-def : Device<"atmega256rfr2", FamilyAVR6, ELFArchAVR6>;
-def : Device<"atmega2564rfr2", FamilyAVR6, ELFArchAVR6>;
-def : Device<"atxmega16a4", FamilyXMEGA, ELFArchXMEGA2>;
-def : Device<"atxmega16a4u", FamilyXMEGAU, ELFArchXMEGA2>;
-def : Device<"atxmega16c4", FamilyXMEGAU, ELFArchXMEGA2>;
-def : Device<"atxmega16d4", FamilyXMEGA, ELFArchXMEGA2>;
-def : Device<"atxmega32a4", FamilyXMEGA, ELFArchXMEGA2>;
-def : Device<"atxmega32a4u", FamilyXMEGAU, ELFArchXMEGA2>;
-def : Device<"atxmega32c4", FamilyXMEGAU, ELFArchXMEGA2>;
-def : Device<"atxmega32d4", FamilyXMEGA, ELFArchXMEGA2>;
-def : Device<"atxmega32e5", FamilyXMEGAU, ELFArchXMEGA2>;
-def : Device<"atxmega16e5", FamilyXMEGAU, ELFArchXMEGA2>;
-def : Device<"atxmega8e5", FamilyXMEGAU, ELFArchXMEGA2>;
-def : Device<"atxmega32x1", FamilyXMEGA, ELFArchXMEGA2>;
-def : Device<"atxmega64a3", FamilyXMEGA, ELFArchXMEGA4>;
-def : Device<"atxmega64a3u", FamilyXMEGAU, ELFArchXMEGA4>;
-def : Device<"atxmega64a4u", FamilyXMEGAU, ELFArchXMEGA4>;
-def : Device<"atxmega64b1", FamilyXMEGAU, ELFArchXMEGA4>;
-def : Device<"atxmega64b3", FamilyXMEGAU, ELFArchXMEGA4>;
-def : Device<"atxmega64c3", FamilyXMEGAU, ELFArchXMEGA4>;
-def : Device<"atxmega64d3", FamilyXMEGA, ELFArchXMEGA4>;
-def : Device<"atxmega64d4", FamilyXMEGA, ELFArchXMEGA4>;
-def : Device<"atxmega64a1", FamilyXMEGA, ELFArchXMEGA5>;
-def : Device<"atxmega64a1u", FamilyXMEGAU, ELFArchXMEGA5>;
-def : Device<"atxmega128a3", FamilyXMEGA, ELFArchXMEGA6>;
-def : Device<"atxmega128a3u", FamilyXMEGAU, ELFArchXMEGA6>;
-def : Device<"atxmega128b1", FamilyXMEGAU, ELFArchXMEGA6>;
-def : Device<"atxmega128b3", FamilyXMEGAU, ELFArchXMEGA6>;
-def : Device<"atxmega128c3", FamilyXMEGAU, ELFArchXMEGA6>;
-def : Device<"atxmega128d3", FamilyXMEGA, ELFArchXMEGA6>;
-def : Device<"atxmega128d4", FamilyXMEGA, ELFArchXMEGA6>;
-def : Device<"atxmega192a3", FamilyXMEGA, ELFArchXMEGA6>;
-def : Device<"atxmega192a3u", FamilyXMEGAU, ELFArchXMEGA6>;
-def : Device<"atxmega192c3", FamilyXMEGAU, ELFArchXMEGA6>;
-def : Device<"atxmega192d3", FamilyXMEGA, ELFArchXMEGA6>;
-def : Device<"atxmega256a3", FamilyXMEGA, ELFArchXMEGA6>;
-def : Device<"atxmega256a3u", FamilyXMEGAU, ELFArchXMEGA6>;
-def : Device<"atxmega256a3b", FamilyXMEGA, ELFArchXMEGA6>;
-def : Device<"atxmega256a3bu", FamilyXMEGAU, ELFArchXMEGA6>;
-def : Device<"atxmega256c3", FamilyXMEGAU, ELFArchXMEGA6>;
-def : Device<"atxmega256d3", FamilyXMEGA, ELFArchXMEGA6>;
-def : Device<"atxmega384c3", FamilyXMEGAU, ELFArchXMEGA6>;
-def : Device<"atxmega384d3", FamilyXMEGA, ELFArchXMEGA6>;
-def : Device<"atxmega128a1", FamilyXMEGA, ELFArchXMEGA7>;
-def : Device<"atxmega128a1u", FamilyXMEGAU, ELFArchXMEGA7>;
-def : Device<"atxmega128a4u", FamilyXMEGAU, ELFArchXMEGA7>;
-def : Device<"attiny4", FamilyTiny, ELFArchTiny>;
-def : Device<"attiny5", FamilyTiny, ELFArchTiny>;
-def : Device<"attiny9", FamilyTiny, ELFArchTiny>;
-def : Device<"attiny10", FamilyTiny, ELFArchTiny>;
-def : Device<"attiny20", FamilyTiny, ELFArchTiny>;
-def : Device<"attiny40", FamilyTiny, ELFArchTiny>;
-def : Device<"attiny102", FamilyTiny, ELFArchTiny>;
-def : Device<"attiny104", FamilyTiny, ELFArchTiny>;
-
+def : Device<"m3000", FamilyAVR5, ELFArchAVR5>;
+def : Device<"atmega128", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega128a", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1280", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1281", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1284", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1284p", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega128rfa1", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega128rfr2", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1284rfr2", FamilyAVR51, ELFArchAVR51>;
+def : Device<"at90can128", FamilyAVR51, ELFArchAVR51>;
+def : Device<"at90usb1286", FamilyAVR51, ELFArchAVR51>;
+def : Device<"at90usb1287", FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega2560", FamilyAVR6, ELFArchAVR6>;
+def : Device<"atmega2561", FamilyAVR6, ELFArchAVR6>;
+def : Device<"atmega256rfr2", FamilyAVR6, ELFArchAVR6>;
+def : Device<"atmega2564rfr2", FamilyAVR6, ELFArchAVR6>;
+def : Device<"atxmega16a4", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega16a4u", FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega16c4", FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega16d4", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega32a4", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega32a4u", FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega32c4", FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega32d4", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega32e5", FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega16e5", FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega8e5", FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega32x1", FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega64a3", FamilyXMEGA, ELFArchXMEGA4>;
+def : Device<"atxmega64a3u", FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64a4u", FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64b1", FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64b3", FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64c3", FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64d3", FamilyXMEGA, ELFArchXMEGA4>;
+def : Device<"atxmega64d4", FamilyXMEGA, ELFArchXMEGA4>;
+def : Device<"atxmega64a1", FamilyXMEGA, ELFArchXMEGA5>;
+def : Device<"atxmega64a1u", FamilyXMEGAU, ELFArchXMEGA5>;
+def : Device<"atxmega128a3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega128a3u", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega128b1", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega128b3", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega128c3", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega128d3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega128d4", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega192a3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega192a3u", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega192c3", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega192d3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega256a3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega256a3u", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega256a3b", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega256a3bu", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega256c3", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega256d3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega384c3", FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega384d3", FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega128a1", FamilyXMEGA, ELFArchXMEGA7>;
+def : Device<"atxmega128a1u", FamilyXMEGAU, ELFArchXMEGA7>;
+def : Device<"atxmega128a4u", FamilyXMEGAU, ELFArchXMEGA7>;
+def : Device<"attiny4", FamilyTiny, ELFArchTiny>;
+def : Device<"attiny5", FamilyTiny, ELFArchTiny>;
+def : Device<"attiny9", FamilyTiny, ELFArchTiny>;
+def : Device<"attiny10", FamilyTiny, ELFArchTiny>;
+def : Device<"attiny20", FamilyTiny, ELFArchTiny>;
+def : Device<"attiny40", FamilyTiny, ELFArchTiny>;
+def : Device<"attiny102", FamilyTiny, ELFArchTiny>;
+def : Device<"attiny104", FamilyTiny, ELFArchTiny>;
+def : Device<"attiny202", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny402", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny204", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny404", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny804", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny1604", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny406", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny806", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny1606", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny807", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny1607", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny212", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny412", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny214", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny414", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny814", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny1614", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny416", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny816", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny1616", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny3216", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny417", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny817", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny1617", FamilyXMEGA, ELFArchXMEGA3>;
+def : Device<"attiny3217", FamilyXMEGA, ELFArchXMEGA3>;
diff --git a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
index f9f91f50c9d5..cb85d73772c5 100644
--- a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
@@ -70,25 +70,24 @@ private:
return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(Opcode), DstReg);
}
- MachineRegisterInfo &getRegInfo(Block &MBB) { return MBB.getParent()->getRegInfo(); }
+ MachineRegisterInfo &getRegInfo(Block &MBB) {
+ return MBB.getParent()->getRegInfo();
+ }
bool expandArith(unsigned OpLo, unsigned OpHi, Block &MBB, BlockIt MBBI);
bool expandLogic(unsigned Op, Block &MBB, BlockIt MBBI);
bool expandLogicImm(unsigned Op, Block &MBB, BlockIt MBBI);
bool isLogicImmOpRedundant(unsigned Op, unsigned ImmVal) const;
- template<typename Func>
- bool expandAtomic(Block &MBB, BlockIt MBBI, Func f);
+ template <typename Func> bool expandAtomic(Block &MBB, BlockIt MBBI, Func f);
- template<typename Func>
+ template <typename Func>
bool expandAtomicBinaryOp(unsigned Opcode, Block &MBB, BlockIt MBBI, Func f);
bool expandAtomicBinaryOp(unsigned Opcode, Block &MBB, BlockIt MBBI);
- bool expandAtomicArithmeticOp(unsigned MemOpcode,
- unsigned ArithOpcode,
- Block &MBB,
- BlockIt MBBI);
+ bool expandAtomicArithmeticOp(unsigned MemOpcode, unsigned ArithOpcode,
+ Block &MBB, BlockIt MBBI);
/// Specific shift implementation.
bool expandLSLB7Rd(Block &MBB, BlockIt MBBI);
@@ -150,8 +149,8 @@ bool AVRExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
return Modified;
}
-bool AVRExpandPseudo::
-expandArith(unsigned OpLo, unsigned OpHi, Block &MBB, BlockIt MBBI) {
+bool AVRExpandPseudo::expandArith(unsigned OpLo, unsigned OpHi, Block &MBB,
+ BlockIt MBBI) {
MachineInstr &MI = *MBBI;
Register SrcLoReg, SrcHiReg, DstLoReg, DstHiReg;
Register DstReg = MI.getOperand(0).getReg();
@@ -164,14 +163,15 @@ expandArith(unsigned OpLo, unsigned OpHi, Block &MBB, BlockIt MBBI) {
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
buildMI(MBB, MBBI, OpLo)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstLoReg, getKillRegState(DstIsKill))
- .addReg(SrcLoReg, getKillRegState(SrcIsKill));
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(DstIsKill))
+ .addReg(SrcLoReg, getKillRegState(SrcIsKill));
- auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstHiReg, getKillRegState(DstIsKill))
- .addReg(SrcHiReg, getKillRegState(SrcIsKill));
+ auto MIBHI =
+ buildMI(MBB, MBBI, OpHi)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill))
+ .addReg(SrcHiReg, getKillRegState(SrcIsKill));
if (ImpIsDead)
MIBHI->getOperand(3).setIsDead();
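
Editor's note: as a reminder of what the reformatted expandArith above actually emits, each 16-bit arithmetic pseudo becomes an 8-bit OpLo on the low registers followed by an OpHi on the high registers that consumes the carry left in SREG (ADD/ADC, SUB/SBC, and so on). Below is a small stand-alone C++ sketch of those semantics; the register values and printf driver are illustrative, not taken from the backend.

#include <cstdint>
#include <cstdio>

struct RegPair { uint8_t Lo, Hi; }; // e.g. R25:R24, with R24 the low byte

static void addw(RegPair &Dst, RegPair Src) {
  unsigned LoSum = unsigned(Dst.Lo) + Src.Lo;            // OpLo: ADD DstLo, SrcLo
  bool Carry = LoSum > 0xff;                             // carry lands in SREG
  Dst.Lo = uint8_t(LoSum);
  Dst.Hi = uint8_t(Dst.Hi + Src.Hi + (Carry ? 1 : 0));   // OpHi: ADC DstHi, SrcHi
}

int main() {
  RegPair A{0xff, 0x12}, B{0x01, 0x00}; // 0x12ff + 0x0001
  addw(A, B);
  std::printf("%02x%02x\n", A.Hi, A.Lo); // prints 1300
}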
@@ -183,8 +183,7 @@ expandArith(unsigned OpLo, unsigned OpHi, Block &MBB, BlockIt MBBI) {
return true;
}
-bool AVRExpandPseudo::
-expandLogic(unsigned Op, Block &MBB, BlockIt MBBI) {
+bool AVRExpandPseudo::expandLogic(unsigned Op, Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
Register SrcLoReg, SrcHiReg, DstLoReg, DstHiReg;
Register DstReg = MI.getOperand(0).getReg();
@@ -196,18 +195,20 @@ expandLogic(unsigned Op, Block &MBB, BlockIt MBBI) {
TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
- auto MIBLO = buildMI(MBB, MBBI, Op)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstLoReg, getKillRegState(DstIsKill))
- .addReg(SrcLoReg, getKillRegState(SrcIsKill));
+ auto MIBLO =
+ buildMI(MBB, MBBI, Op)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(DstIsKill))
+ .addReg(SrcLoReg, getKillRegState(SrcIsKill));
// SREG is always implicitly dead
MIBLO->getOperand(3).setIsDead();
- auto MIBHI = buildMI(MBB, MBBI, Op)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstHiReg, getKillRegState(DstIsKill))
- .addReg(SrcHiReg, getKillRegState(SrcIsKill));
+ auto MIBHI =
+ buildMI(MBB, MBBI, Op)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill))
+ .addReg(SrcHiReg, getKillRegState(SrcIsKill));
if (ImpIsDead)
MIBHI->getOperand(3).setIsDead();
@@ -216,8 +217,8 @@ expandLogic(unsigned Op, Block &MBB, BlockIt MBBI) {
return true;
}
-bool AVRExpandPseudo::
- isLogicImmOpRedundant(unsigned Op, unsigned ImmVal) const {
+bool AVRExpandPseudo::isLogicImmOpRedundant(unsigned Op,
+ unsigned ImmVal) const {
// ANDI Rd, 0xff is redundant.
if (Op == AVR::ANDIRdK && ImmVal == 0xff)
@@ -230,8 +231,7 @@ bool AVRExpandPseudo::
return false;
}
-bool AVRExpandPseudo::
-expandLogicImm(unsigned Op, Block &MBB, BlockIt MBBI) {
+bool AVRExpandPseudo::expandLogicImm(unsigned Op, Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
Register DstLoReg, DstHiReg;
Register DstReg = MI.getOperand(0).getReg();
@@ -244,20 +244,22 @@ expandLogicImm(unsigned Op, Block &MBB, BlockIt MBBI) {
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
if (!isLogicImmOpRedundant(Op, Lo8)) {
- auto MIBLO = buildMI(MBB, MBBI, Op)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstLoReg, getKillRegState(SrcIsKill))
- .addImm(Lo8);
+ auto MIBLO =
+ buildMI(MBB, MBBI, Op)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(SrcIsKill))
+ .addImm(Lo8);
// SREG is always implicitly dead
MIBLO->getOperand(3).setIsDead();
}
if (!isLogicImmOpRedundant(Op, Hi8)) {
- auto MIBHI = buildMI(MBB, MBBI, Op)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstHiReg, getKillRegState(SrcIsKill))
- .addImm(Hi8);
+ auto MIBHI =
+ buildMI(MBB, MBBI, Op)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(SrcIsKill))
+ .addImm(Hi8);
if (ImpIsDead)
MIBHI->getOperand(3).setIsDead();
@@ -292,13 +294,15 @@ bool AVRExpandPseudo::expand<AVR::SUBIWRdK>(Block &MBB, BlockIt MBBI) {
bool ImpIsDead = MI.getOperand(3).isDead();
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
- auto MIBLO = buildMI(MBB, MBBI, AVR::SUBIRdK)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstLoReg, getKillRegState(SrcIsKill));
+ auto MIBLO =
+ buildMI(MBB, MBBI, AVR::SUBIRdK)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(SrcIsKill));
- auto MIBHI = buildMI(MBB, MBBI, AVR::SBCIRdK)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstHiReg, getKillRegState(SrcIsKill));
+ auto MIBHI =
+ buildMI(MBB, MBBI, AVR::SBCIRdK)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(SrcIsKill));
switch (MI.getOperand(2).getType()) {
case MachineOperand::MO_GlobalAddress: {
@@ -349,18 +353,20 @@ bool AVRExpandPseudo::expand<AVR::SBCIWRdK>(Block &MBB, BlockIt MBBI) {
unsigned OpHi = AVR::SBCIRdK;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
- auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstLoReg, getKillRegState(SrcIsKill))
- .addImm(Lo8);
+ auto MIBLO =
+ buildMI(MBB, MBBI, OpLo)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(SrcIsKill))
+ .addImm(Lo8);
// SREG is always implicitly killed
MIBLO->getOperand(4).setIsKill();
- auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstHiReg, getKillRegState(SrcIsKill))
- .addImm(Hi8);
+ auto MIBHI =
+ buildMI(MBB, MBBI, OpHi)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(SrcIsKill))
+ .addImm(Hi8);
if (ImpIsDead)
MIBHI->getOperand(3).setIsDead();
@@ -409,16 +415,18 @@ bool AVRExpandPseudo::expand<AVR::COMWRd>(Block &MBB, BlockIt MBBI) {
unsigned OpHi = AVR::COMRd;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
- auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstLoReg, getKillRegState(DstIsKill));
+ auto MIBLO =
+ buildMI(MBB, MBBI, OpLo)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(DstIsKill));
// SREG is always implicitly dead
MIBLO->getOperand(2).setIsDead();
- auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstHiReg, getKillRegState(DstIsKill));
+ auto MIBHI =
+ buildMI(MBB, MBBI, OpHi)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill));
if (ImpIsDead)
MIBHI->getOperand(2).setIsDead();
@@ -481,12 +489,12 @@ bool AVRExpandPseudo::expand<AVR::CPWRdRr>(Block &MBB, BlockIt MBBI) {
// Low part
buildMI(MBB, MBBI, OpLo)
- .addReg(DstLoReg, getKillRegState(DstIsKill))
- .addReg(SrcLoReg, getKillRegState(SrcIsKill));
+ .addReg(DstLoReg, getKillRegState(DstIsKill))
+ .addReg(SrcLoReg, getKillRegState(SrcIsKill));
auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstHiReg, getKillRegState(DstIsKill))
- .addReg(SrcHiReg, getKillRegState(SrcIsKill));
+ .addReg(DstHiReg, getKillRegState(DstIsKill))
+ .addReg(SrcHiReg, getKillRegState(SrcIsKill));
if (ImpIsDead)
MIBHI->getOperand(2).setIsDead();
@@ -513,15 +521,15 @@ bool AVRExpandPseudo::expand<AVR::CPCWRdRr>(Block &MBB, BlockIt MBBI) {
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(DstLoReg, getKillRegState(DstIsKill))
- .addReg(SrcLoReg, getKillRegState(SrcIsKill));
+ .addReg(DstLoReg, getKillRegState(DstIsKill))
+ .addReg(SrcLoReg, getKillRegState(SrcIsKill));
// SREG is always implicitly killed
MIBLO->getOperand(3).setIsKill();
auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstHiReg, getKillRegState(DstIsKill))
- .addReg(SrcHiReg, getKillRegState(SrcIsKill));
+ .addReg(DstHiReg, getKillRegState(DstIsKill))
+ .addReg(SrcHiReg, getKillRegState(SrcIsKill));
if (ImpIsDead)
MIBHI->getOperand(2).setIsDead();
@@ -543,11 +551,13 @@ bool AVRExpandPseudo::expand<AVR::LDIWRdK>(Block &MBB, BlockIt MBBI) {
unsigned OpHi = AVR::LDIRdK;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
- auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead));
+ auto MIBLO =
+ buildMI(MBB, MBBI, OpLo)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead));
- auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead));
+ auto MIBHI =
+ buildMI(MBB, MBBI, OpHi)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead));
switch (MI.getOperand(1).getType()) {
case MachineOperand::MO_GlobalAddress: {
@@ -592,11 +602,13 @@ bool AVRExpandPseudo::expand<AVR::LDSWRdK>(Block &MBB, BlockIt MBBI) {
unsigned OpHi = AVR::LDSRdK;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
- auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead));
+ auto MIBLO =
+ buildMI(MBB, MBBI, OpLo)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead));
- auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead));
+ auto MIBHI =
+ buildMI(MBB, MBBI, OpHi)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead));
switch (MI.getOperand(1).getType()) {
case MachineOperand::MO_GlobalAddress: {
@@ -656,9 +668,9 @@ bool AVRExpandPseudo::expand<AVR::LDWRdPtr>(Block &MBB, BlockIt MBBI) {
// Load high byte.
auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(CurDstHiReg, RegState::Define)
- .addReg(SrcReg, getKillRegState(SrcIsKill))
- .addImm(1);
+ .addReg(CurDstHiReg, RegState::Define)
+ .addReg(SrcReg, getKillRegState(SrcIsKill))
+ .addImm(1);
if (TmpReg) {
// Move the high byte into the final destination.
@@ -689,15 +701,17 @@ bool AVRExpandPseudo::expand<AVR::LDWRdPtrPi>(Block &MBB, BlockIt MBBI) {
assert(DstReg != SrcReg && "SrcReg and DstReg cannot be the same");
- auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(SrcReg, RegState::Define)
- .addReg(SrcReg, RegState::Kill);
+ auto MIBLO =
+ buildMI(MBB, MBBI, OpLo)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(SrcReg, RegState::Define)
+ .addReg(SrcReg, RegState::Kill);
- auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(SrcReg, RegState::Define | getDeadRegState(SrcIsDead))
- .addReg(SrcReg, RegState::Kill);
+ auto MIBHI =
+ buildMI(MBB, MBBI, OpHi)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(SrcReg, RegState::Define | getDeadRegState(SrcIsDead))
+ .addReg(SrcReg, RegState::Kill);
MIBLO.setMemRefs(MI.memoperands());
MIBHI.setMemRefs(MI.memoperands());
@@ -720,15 +734,17 @@ bool AVRExpandPseudo::expand<AVR::LDWRdPtrPd>(Block &MBB, BlockIt MBBI) {
assert(DstReg != SrcReg && "SrcReg and DstReg cannot be the same");
- auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(SrcReg, RegState::Define)
- .addReg(SrcReg, RegState::Kill);
+ auto MIBHI =
+ buildMI(MBB, MBBI, OpHi)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(SrcReg, RegState::Define)
+ .addReg(SrcReg, RegState::Kill);
- auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(SrcReg, RegState::Define | getDeadRegState(SrcIsDead))
- .addReg(SrcReg, RegState::Kill);
+ auto MIBLO =
+ buildMI(MBB, MBBI, OpLo)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(SrcReg, RegState::Define | getDeadRegState(SrcIsDead))
+ .addReg(SrcReg, RegState::Kill);
MIBLO.setMemRefs(MI.memoperands());
MIBHI.setMemRefs(MI.memoperands());
@@ -750,8 +766,8 @@ bool AVRExpandPseudo::expand<AVR::LDDWRdPtrQ>(Block &MBB, BlockIt MBBI) {
unsigned OpHi = AVR::LDDRdPtrQ;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
- // Since we add 1 to the Imm value for the high byte below, and 63 is the highest Imm value
- // allowed for the instruction, 62 is the limit here.
+ // Since we add 1 to the Imm value for the high byte below, and 63 is the
+ // highest Imm value allowed for the instruction, 62 is the limit here.
assert(Imm <= 62 && "Offset is out of range");
// Use a temporary register if src and dst registers are the same.
@@ -763,9 +779,9 @@ bool AVRExpandPseudo::expand<AVR::LDDWRdPtrQ>(Block &MBB, BlockIt MBBI) {
// Load low byte.
auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(CurDstLoReg, RegState::Define)
- .addReg(SrcReg)
- .addImm(Imm);
+ .addReg(CurDstLoReg, RegState::Define)
+ .addReg(SrcReg)
+ .addImm(Imm);
// Push low byte onto stack if necessary.
if (TmpReg)
@@ -773,9 +789,9 @@ bool AVRExpandPseudo::expand<AVR::LDDWRdPtrQ>(Block &MBB, BlockIt MBBI) {
// Load high byte.
auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(CurDstHiReg, RegState::Define)
- .addReg(SrcReg, getKillRegState(SrcIsKill))
- .addImm(Imm + 1);
+ .addReg(CurDstHiReg, RegState::Define)
+ .addReg(SrcReg, getKillRegState(SrcIsKill))
+ .addImm(Imm + 1);
if (TmpReg) {
// Move the high byte into the final destination.
@@ -813,8 +829,8 @@ bool AVRExpandPseudo::expand<AVR::LPMWRdZ>(Block &MBB, BlockIt MBBI) {
// Load low byte.
auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(CurDstLoReg, RegState::Define)
- .addReg(SrcReg);
+ .addReg(CurDstLoReg, RegState::Define)
+ .addReg(SrcReg);
// Push low byte onto stack if necessary.
if (TmpReg)
@@ -822,8 +838,8 @@ bool AVRExpandPseudo::expand<AVR::LPMWRdZ>(Block &MBB, BlockIt MBBI) {
// Load high byte.
auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(CurDstHiReg, RegState::Define)
- .addReg(SrcReg, getKillRegState(SrcIsKill));
+ .addReg(CurDstHiReg, RegState::Define)
+ .addReg(SrcReg, getKillRegState(SrcIsKill));
if (TmpReg) {
// Move the high byte into the final destination.
@@ -845,15 +861,15 @@ bool AVRExpandPseudo::expand<AVR::LPMWRdZPi>(Block &MBB, BlockIt MBBI) {
llvm_unreachable("wide LPMPi is unimplemented");
}
-template<typename Func>
+template <typename Func>
bool AVRExpandPseudo::expandAtomic(Block &MBB, BlockIt MBBI, Func f) {
// Remove the pseudo instruction.
MachineInstr &MI = *MBBI;
// Store the SREG.
buildMI(MBB, MBBI, AVR::INRdA)
- .addReg(SCRATCH_REGISTER, RegState::Define)
- .addImm(SREG_ADDR);
+ .addReg(SCRATCH_REGISTER, RegState::Define)
+ .addImm(SREG_ADDR);
// Disable exceptions.
buildMI(MBB, MBBI, AVR::BCLRs).addImm(7); // CLI
@@ -861,58 +877,52 @@ bool AVRExpandPseudo::expandAtomic(Block &MBB, BlockIt MBBI, Func f) {
f(MI);
// Restore the status reg.
- buildMI(MBB, MBBI, AVR::OUTARr)
- .addImm(SREG_ADDR)
- .addReg(SCRATCH_REGISTER);
+ buildMI(MBB, MBBI, AVR::OUTARr).addImm(SREG_ADDR).addReg(SCRATCH_REGISTER);
MI.eraseFromParent();
return true;
}
-template<typename Func>
-bool AVRExpandPseudo::expandAtomicBinaryOp(unsigned Opcode,
- Block &MBB,
- BlockIt MBBI,
- Func f) {
+template <typename Func>
+bool AVRExpandPseudo::expandAtomicBinaryOp(unsigned Opcode, Block &MBB,
+ BlockIt MBBI, Func f) {
return expandAtomic(MBB, MBBI, [&](MachineInstr &MI) {
- auto Op1 = MI.getOperand(0);
- auto Op2 = MI.getOperand(1);
+ auto Op1 = MI.getOperand(0);
+ auto Op2 = MI.getOperand(1);
- MachineInstr &NewInst =
- *buildMI(MBB, MBBI, Opcode).add(Op1).add(Op2).getInstr();
- f(NewInst);
+ MachineInstr &NewInst =
+ *buildMI(MBB, MBBI, Opcode).add(Op1).add(Op2).getInstr();
+ f(NewInst);
});
}
-bool AVRExpandPseudo::expandAtomicBinaryOp(unsigned Opcode,
- Block &MBB,
+bool AVRExpandPseudo::expandAtomicBinaryOp(unsigned Opcode, Block &MBB,
BlockIt MBBI) {
return expandAtomicBinaryOp(Opcode, MBB, MBBI, [](MachineInstr &MI) {});
}
bool AVRExpandPseudo::expandAtomicArithmeticOp(unsigned Width,
- unsigned ArithOpcode,
- Block &MBB,
+ unsigned ArithOpcode, Block &MBB,
BlockIt MBBI) {
return expandAtomic(MBB, MBBI, [&](MachineInstr &MI) {
- auto DstReg = MI.getOperand(0).getReg();
- auto PtrOp = MI.getOperand(1);
- auto SrcReg = MI.getOperand(2).getReg();
+ auto DstReg = MI.getOperand(0).getReg();
+ auto PtrOp = MI.getOperand(1);
+ auto SrcReg = MI.getOperand(2).getReg();
- unsigned LoadOpcode = (Width == 8) ? AVR::LDRdPtr : AVR::LDWRdPtr;
- unsigned StoreOpcode = (Width == 8) ? AVR::STPtrRr : AVR::STWPtrRr;
+ unsigned LoadOpcode = (Width == 8) ? AVR::LDRdPtr : AVR::LDWRdPtr;
+ unsigned StoreOpcode = (Width == 8) ? AVR::STPtrRr : AVR::STWPtrRr;
- // FIXME: this returns the new value (after the operation), not the old
- // value as the atomicrmw instruction is supposed to do!
+ // FIXME: this returns the new value (after the operation), not the old
+ // value as the atomicrmw instruction is supposed to do!
- // Create the load
- buildMI(MBB, MBBI, LoadOpcode, DstReg).addReg(PtrOp.getReg());
+ // Create the load
+ buildMI(MBB, MBBI, LoadOpcode, DstReg).addReg(PtrOp.getReg());
- // Create the arithmetic op
- buildMI(MBB, MBBI, ArithOpcode, DstReg).addReg(DstReg).addReg(SrcReg);
+ // Create the arithmetic op
+ buildMI(MBB, MBBI, ArithOpcode, DstReg).addReg(DstReg).addReg(SrcReg);
- // Create the store
- buildMI(MBB, MBBI, StoreOpcode).add(PtrOp).addReg(DstReg);
+ // Create the store
+ buildMI(MBB, MBBI, StoreOpcode).add(PtrOp).addReg(DstReg);
});
}
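
The FIXME in expandAtomicArithmeticOp above notes that this expansion hands back the value *after* the arithmetic, whereas LLVM's atomicrmw is defined to produce the value the memory location held *before* the operation. A minimal standalone C++ sketch of that difference (single-threaded model; the function names are illustrative only and not part of the patch):

#include <cstdint>
#include <cstdio>

// Single-threaded model of the semantics gap flagged by the FIXME: atomicrmw
// should yield the old memory value, but the load/modify/store sequence above
// ends up handing back the new one.
static uint8_t Memory = 5;

static uint8_t rmwAddReturningOld(uint8_t V) {
  uint8_t Old = Memory; // value atomicrmw is supposed to produce
  Memory = Old + V;
  return Old;
}

static uint8_t rmwAddAsExpanded(uint8_t V) {
  uint8_t Dst = Memory; // load into the destination register
  Dst = Dst + V;        // arithmetic op overwrites the loaded value
  Memory = Dst;         // store back
  return Dst;           // new value, not the old one
}

int main() {
  std::printf("old-value semantics: %u\n", (unsigned)rmwAddReturningOld(3)); // 5
  Memory = 5;
  std::printf("as expanded:         %u\n", (unsigned)rmwAddAsExpanded(3));   // 8
  return 0;
}
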
@@ -924,8 +934,7 @@ Register AVRExpandPseudo::scavengeGPR8(MachineInstr &MI) {
RS.forward(MI);
BitVector Candidates =
- TRI->getAllocatableSet
- (*MBB.getParent(), &AVR::GPR8RegClass);
+ TRI->getAllocatableSet(*MBB.getParent(), &AVR::GPR8RegClass);
// Exclude all the registers being used by the instruction.
for (MachineOperand &MO : MI.operands()) {
@@ -942,77 +951,77 @@ Register AVRExpandPseudo::scavengeGPR8(MachineInstr &MI) {
return Reg;
}
-template<>
+template <>
bool AVRExpandPseudo::expand<AVR::AtomicLoad8>(Block &MBB, BlockIt MBBI) {
return expandAtomicBinaryOp(AVR::LDRdPtr, MBB, MBBI);
}
-template<>
+template <>
bool AVRExpandPseudo::expand<AVR::AtomicLoad16>(Block &MBB, BlockIt MBBI) {
return expandAtomicBinaryOp(AVR::LDWRdPtr, MBB, MBBI);
}
-template<>
+template <>
bool AVRExpandPseudo::expand<AVR::AtomicStore8>(Block &MBB, BlockIt MBBI) {
return expandAtomicBinaryOp(AVR::STPtrRr, MBB, MBBI);
}
-template<>
+template <>
bool AVRExpandPseudo::expand<AVR::AtomicStore16>(Block &MBB, BlockIt MBBI) {
return expandAtomicBinaryOp(AVR::STWPtrRr, MBB, MBBI);
}
-template<>
+template <>
bool AVRExpandPseudo::expand<AVR::AtomicLoadAdd8>(Block &MBB, BlockIt MBBI) {
return expandAtomicArithmeticOp(8, AVR::ADDRdRr, MBB, MBBI);
}
-template<>
+template <>
bool AVRExpandPseudo::expand<AVR::AtomicLoadAdd16>(Block &MBB, BlockIt MBBI) {
return expandAtomicArithmeticOp(16, AVR::ADDWRdRr, MBB, MBBI);
}
-template<>
+template <>
bool AVRExpandPseudo::expand<AVR::AtomicLoadSub8>(Block &MBB, BlockIt MBBI) {
return expandAtomicArithmeticOp(8, AVR::SUBRdRr, MBB, MBBI);
}
-template<>
+template <>
bool AVRExpandPseudo::expand<AVR::AtomicLoadSub16>(Block &MBB, BlockIt MBBI) {
return expandAtomicArithmeticOp(16, AVR::SUBWRdRr, MBB, MBBI);
}
-template<>
+template <>
bool AVRExpandPseudo::expand<AVR::AtomicLoadAnd8>(Block &MBB, BlockIt MBBI) {
return expandAtomicArithmeticOp(8, AVR::ANDRdRr, MBB, MBBI);
}
-template<>
+template <>
bool AVRExpandPseudo::expand<AVR::AtomicLoadAnd16>(Block &MBB, BlockIt MBBI) {
return expandAtomicArithmeticOp(16, AVR::ANDWRdRr, MBB, MBBI);
}
-template<>
+template <>
bool AVRExpandPseudo::expand<AVR::AtomicLoadOr8>(Block &MBB, BlockIt MBBI) {
return expandAtomicArithmeticOp(8, AVR::ORRdRr, MBB, MBBI);
}
-template<>
+template <>
bool AVRExpandPseudo::expand<AVR::AtomicLoadOr16>(Block &MBB, BlockIt MBBI) {
return expandAtomicArithmeticOp(16, AVR::ORWRdRr, MBB, MBBI);
}
-template<>
+template <>
bool AVRExpandPseudo::expand<AVR::AtomicLoadXor8>(Block &MBB, BlockIt MBBI) {
return expandAtomicArithmeticOp(8, AVR::EORRdRr, MBB, MBBI);
}
-template<>
+template <>
bool AVRExpandPseudo::expand<AVR::AtomicLoadXor16>(Block &MBB, BlockIt MBBI) {
return expandAtomicArithmeticOp(16, AVR::EORWRdRr, MBB, MBBI);
}
-template<>
+template <>
bool AVRExpandPseudo::expand<AVR::AtomicFence>(Block &MBB, BlockIt MBBI) {
// On AVR, there is only one core and so atomic fences do nothing.
MBBI->eraseFromParent();
@@ -1077,15 +1086,15 @@ bool AVRExpandPseudo::expand<AVR::STWPtrRr>(Block &MBB, BlockIt MBBI) {
unsigned OpHi = AVR::STDPtrQRr;
TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
- //:TODO: need to reverse this order like inw and stsw?
+ //: TODO: need to reverse this order like inw and stsw?
auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(DstReg, getUndefRegState(DstIsUndef))
- .addReg(SrcLoReg, getKillRegState(SrcIsKill));
+ .addReg(DstReg, getUndefRegState(DstIsUndef))
+ .addReg(SrcLoReg, getKillRegState(SrcIsKill));
auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstReg, getUndefRegState(DstIsUndef))
- .addImm(1)
- .addReg(SrcHiReg, getKillRegState(SrcIsKill));
+ .addReg(DstReg, getUndefRegState(DstIsUndef))
+ .addImm(1)
+ .addReg(SrcHiReg, getKillRegState(SrcIsKill));
MIBLO.setMemRefs(MI.memoperands());
MIBHI.setMemRefs(MI.memoperands());
@@ -1110,16 +1119,17 @@ bool AVRExpandPseudo::expand<AVR::STWPtrPiRr>(Block &MBB, BlockIt MBBI) {
assert(DstReg != SrcReg && "SrcReg and DstReg cannot be the same");
auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(DstReg, RegState::Define)
- .addReg(DstReg, RegState::Kill)
- .addReg(SrcLoReg, getKillRegState(SrcIsKill))
- .addImm(Imm);
+ .addReg(DstReg, RegState::Define)
+ .addReg(DstReg, RegState::Kill)
+ .addReg(SrcLoReg, getKillRegState(SrcIsKill))
+ .addImm(Imm);
- auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstReg, RegState::Kill)
- .addReg(SrcHiReg, getKillRegState(SrcIsKill))
- .addImm(Imm);
+ auto MIBHI =
+ buildMI(MBB, MBBI, OpHi)
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg, RegState::Kill)
+ .addReg(SrcHiReg, getKillRegState(SrcIsKill))
+ .addImm(Imm);
MIBLO.setMemRefs(MI.memoperands());
MIBHI.setMemRefs(MI.memoperands());
@@ -1144,16 +1154,17 @@ bool AVRExpandPseudo::expand<AVR::STWPtrPdRr>(Block &MBB, BlockIt MBBI) {
assert(DstReg != SrcReg && "SrcReg and DstReg cannot be the same");
auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstReg, RegState::Define)
- .addReg(DstReg, RegState::Kill)
- .addReg(SrcHiReg, getKillRegState(SrcIsKill))
- .addImm(Imm);
+ .addReg(DstReg, RegState::Define)
+ .addReg(DstReg, RegState::Kill)
+ .addReg(SrcHiReg, getKillRegState(SrcIsKill))
+ .addImm(Imm);
- auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstReg, RegState::Kill)
- .addReg(SrcLoReg, getKillRegState(SrcIsKill))
- .addImm(Imm);
+ auto MIBLO =
+ buildMI(MBB, MBBI, OpLo)
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg, RegState::Kill)
+ .addReg(SrcLoReg, getKillRegState(SrcIsKill))
+ .addImm(Imm);
MIBLO.setMemRefs(MI.memoperands());
MIBHI.setMemRefs(MI.memoperands());
@@ -1175,19 +1186,19 @@ bool AVRExpandPseudo::expand<AVR::STDWPtrQRr>(Block &MBB, BlockIt MBBI) {
unsigned OpHi = AVR::STDPtrQRr;
TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
- // Since we add 1 to the Imm value for the high byte below, and 63 is the highest Imm value
- // allowed for the instruction, 62 is the limit here.
+ // Since we add 1 to the Imm value for the high byte below, and 63 is the
+ // highest Imm value allowed for the instruction, 62 is the limit here.
assert(Imm <= 62 && "Offset is out of range");
auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(DstReg)
- .addImm(Imm)
- .addReg(SrcLoReg, getKillRegState(SrcIsKill));
+ .addReg(DstReg)
+ .addImm(Imm)
+ .addReg(SrcLoReg, getKillRegState(SrcIsKill));
auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstReg, getKillRegState(DstIsKill))
- .addImm(Imm + 1)
- .addReg(SrcHiReg, getKillRegState(SrcIsKill));
+ .addReg(DstReg, getKillRegState(DstIsKill))
+ .addImm(Imm + 1)
+ .addReg(SrcHiReg, getKillRegState(SrcIsKill));
MIBLO.setMemRefs(MI.memoperands());
MIBHI.setMemRefs(MI.memoperands());
@@ -1207,17 +1218,19 @@ bool AVRExpandPseudo::expand<AVR::INWRdA>(Block &MBB, BlockIt MBBI) {
unsigned OpHi = AVR::INRdA;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
- // Since we add 1 to the Imm value for the high byte below, and 63 is the highest Imm value
- // allowed for the instruction, 62 is the limit here.
+ // Since we add 1 to the Imm value for the high byte below, and 63 is the
+ // highest Imm value allowed for the instruction, 62 is the limit here.
assert(Imm <= 62 && "Address is out of range");
- auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
- .addImm(Imm);
+ auto MIBLO =
+ buildMI(MBB, MBBI, OpLo)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addImm(Imm);
- auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
- .addImm(Imm + 1);
+ auto MIBHI =
+ buildMI(MBB, MBBI, OpHi)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addImm(Imm + 1);
MIBLO.setMemRefs(MI.memoperands());
MIBHI.setMemRefs(MI.memoperands());
@@ -1237,18 +1250,18 @@ bool AVRExpandPseudo::expand<AVR::OUTWARr>(Block &MBB, BlockIt MBBI) {
unsigned OpHi = AVR::OUTARr;
TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
- // Since we add 1 to the Imm value for the high byte below, and 63 is the highest Imm value
- // allowed for the instruction, 62 is the limit here.
+ // Since we add 1 to the Imm value for the high byte below, and 63 is the
+ // highest Imm value allowed for the instruction, 62 is the limit here.
assert(Imm <= 62 && "Address is out of range");
// 16 bit I/O writes need the high byte first
auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addImm(Imm + 1)
- .addReg(SrcHiReg, getKillRegState(SrcIsKill));
+ .addImm(Imm + 1)
+ .addReg(SrcHiReg, getKillRegState(SrcIsKill));
auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addImm(Imm)
- .addReg(SrcLoReg, getKillRegState(SrcIsKill));
+ .addImm(Imm)
+ .addReg(SrcLoReg, getKillRegState(SrcIsKill));
MIBLO.setMemRefs(MI.memoperands());
MIBHI.setMemRefs(MI.memoperands());
@@ -1270,13 +1283,13 @@ bool AVRExpandPseudo::expand<AVR::PUSHWRr>(Block &MBB, BlockIt MBBI) {
// Low part
buildMI(MBB, MBBI, OpLo)
- .addReg(SrcLoReg, getKillRegState(SrcIsKill))
- .setMIFlags(Flags);
+ .addReg(SrcLoReg, getKillRegState(SrcIsKill))
+ .setMIFlags(Flags);
// High part
buildMI(MBB, MBBI, OpHi)
- .addReg(SrcHiReg, getKillRegState(SrcIsKill))
- .setMIFlags(Flags);
+ .addReg(SrcHiReg, getKillRegState(SrcIsKill))
+ .setMIFlags(Flags);
MI.eraseFromParent();
return true;
@@ -1319,15 +1332,15 @@ bool AVRExpandPseudo::expand<AVR::ROLBRd>(Block &MBB, BlockIt MBBI) {
// Shift part
buildMI(MBB, MBBI, OpShift)
- .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstReg)
- .addReg(DstReg);
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg)
+ .addReg(DstReg);
// Add the carry bit
auto MIB = buildMI(MBB, MBBI, OpCarry)
- .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstReg)
- .addReg(ZERO_REGISTER);
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg)
+ .addReg(ZERO_REGISTER);
// SREG is always implicitly killed
MIB->getOperand(2).setIsKill();
@@ -1378,14 +1391,15 @@ bool AVRExpandPseudo::expand<AVR::LSLWRd>(Block &MBB, BlockIt MBBI) {
// Low part
buildMI(MBB, MBBI, OpLo)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstLoReg)
- .addReg(DstLoReg, getKillRegState(DstIsKill));
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg)
+ .addReg(DstLoReg, getKillRegState(DstIsKill));
- auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstHiReg)
- .addReg(DstHiReg, getKillRegState(DstIsKill));
+ auto MIBHI =
+ buildMI(MBB, MBBI, OpHi)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg)
+ .addReg(DstHiReg, getKillRegState(DstIsKill));
if (ImpIsDead)
MIBHI->getOperand(3).setIsDead();
@@ -1554,12 +1568,13 @@ bool AVRExpandPseudo::expand<AVR::LSRWRd>(Block &MBB, BlockIt MBBI) {
// High part
buildMI(MBB, MBBI, OpHi)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstHiReg, getKillRegState(DstIsKill));
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill));
- auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstLoReg, getKillRegState(DstIsKill));
+ auto MIBLO =
+ buildMI(MBB, MBBI, OpLo)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(DstIsKill));
if (ImpIsDead)
MIBLO->getOperand(2).setIsDead();
@@ -1740,12 +1755,13 @@ bool AVRExpandPseudo::expand<AVR::ASRWRd>(Block &MBB, BlockIt MBBI) {
// High part
buildMI(MBB, MBBI, OpHi)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstHiReg, getKillRegState(DstIsKill));
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill));
- auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstLoReg, getKillRegState(DstIsKill));
+ auto MIBLO =
+ buildMI(MBB, MBBI, OpLo)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(DstIsKill));
if (ImpIsDead)
MIBLO->getOperand(2).setIsDead();
@@ -1817,7 +1833,8 @@ bool AVRExpandPseudo::expandLSLB7Rd(Block &MBB, BlockIt MBBI) {
buildMI(MBB, MBBI, AVR::RORRd)
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
.addReg(DstReg, getKillRegState(DstIsKill))
- ->getOperand(3).setIsUndef(true);
+ ->getOperand(3)
+ .setIsUndef(true);
buildMI(MBB, MBBI, AVR::EORRdRr)
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
@@ -1867,7 +1884,8 @@ bool AVRExpandPseudo::expandLSRB7Rd(Block &MBB, BlockIt MBBI) {
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
.addReg(DstReg, getKillRegState(DstIsKill))
.addReg(DstReg, getKillRegState(DstIsKill))
- ->getOperand(4).setIsUndef(true);
+ ->getOperand(4)
+ .setIsUndef(true);
buildMI(MBB, MBBI, AVR::EORRdRr)
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
@@ -1918,10 +1936,11 @@ bool AVRExpandPseudo::expandASRB7Rd(Block &MBB, BlockIt MBBI) {
.addReg(DstReg, getKillRegState(DstIsKill))
.addReg(DstReg, getKillRegState(DstIsKill));
- auto MIRRC = buildMI(MBB, MBBI, AVR::SBCRdRr)
- .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstReg, getKillRegState(DstIsKill))
- .addReg(DstReg, getKillRegState(DstIsKill));
+ auto MIRRC =
+ buildMI(MBB, MBBI, AVR::SBCRdRr)
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg, getKillRegState(DstIsKill))
+ .addReg(DstReg, getKillRegState(DstIsKill));
if (ImpIsDead)
MIRRC->getOperand(3).setIsDead();
@@ -1970,9 +1989,10 @@ template <> bool AVRExpandPseudo::expand<AVR::SEXT>(Block &MBB, BlockIt MBBI) {
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
if (SrcReg != DstLoReg) {
- auto MOV = buildMI(MBB, MBBI, AVR::MOVRdRr)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(SrcReg);
+ auto MOV =
+ buildMI(MBB, MBBI, AVR::MOVRdRr)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(SrcReg);
if (SrcReg == DstHiReg) {
MOV->getOperand(1).setIsKill();
@@ -1981,19 +2001,20 @@ template <> bool AVRExpandPseudo::expand<AVR::SEXT>(Block &MBB, BlockIt MBBI) {
if (SrcReg != DstHiReg) {
buildMI(MBB, MBBI, AVR::MOVRdRr)
- .addReg(DstHiReg, RegState::Define)
- .addReg(SrcReg, getKillRegState(SrcIsKill));
+ .addReg(DstHiReg, RegState::Define)
+ .addReg(SrcReg, getKillRegState(SrcIsKill));
}
buildMI(MBB, MBBI, AVR::ADDRdRr) // LSL Rd <==> ADD Rd, Rr
- .addReg(DstHiReg, RegState::Define)
- .addReg(DstHiReg)
- .addReg(DstHiReg, RegState::Kill);
+ .addReg(DstHiReg, RegState::Define)
+ .addReg(DstHiReg)
+ .addReg(DstHiReg, RegState::Kill);
- auto SBC = buildMI(MBB, MBBI, AVR::SBCRdRr)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstHiReg, RegState::Kill)
- .addReg(DstHiReg, RegState::Kill);
+ auto SBC =
+ buildMI(MBB, MBBI, AVR::SBCRdRr)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, RegState::Kill)
+ .addReg(DstHiReg, RegState::Kill);
if (ImpIsDead)
SBC->getOperand(3).setIsDead();
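
As I read the SEXT expansion above, the ADD Rd,Rd (an LSL in disguise, per the comment) pushes the sign bit of the copied source byte into the carry flag, and the following SBC Rd,Rd,Rd turns that carry into 0xFF or 0x00 for the high byte. A standalone sketch under that reading (plain C++; the function name is hypothetical and not taken from the patch):

#include <cstdint>
#include <cstdio>

// Model of the SEXT trick: LSL (ADD Rd,Rd) moves the source's sign bit into
// carry; SBC Rd,Rd,Rd then computes 0 - carry, giving 0xFF for negative
// inputs and 0x00 otherwise, which becomes the high byte.
static uint16_t sext8to16(uint8_t Src) {
  uint8_t Carry = (Src & 0x80) ? 1 : 0;          // carry after the LSL
  uint8_t Hi = static_cast<uint8_t>(0u - Carry); // SBC Hi,Hi,Hi
  return static_cast<uint16_t>((Hi << 8) | Src);
}

int main() {
  std::printf("0x%04x 0x%04x\n", (unsigned)sext8to16(0x7f),
              (unsigned)sext8to16(0x80)); // prints 0x007f 0xff80
  return 0;
}
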
@@ -2025,14 +2046,15 @@ template <> bool AVRExpandPseudo::expand<AVR::ZEXT>(Block &MBB, BlockIt MBBI) {
if (SrcReg != DstLoReg) {
buildMI(MBB, MBBI, AVR::MOVRdRr)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(SrcReg, getKillRegState(SrcIsKill));
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(SrcReg, getKillRegState(SrcIsKill));
}
- auto EOR = buildMI(MBB, MBBI, AVR::EORRdRr)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstHiReg, RegState::Kill | RegState::Undef)
- .addReg(DstHiReg, RegState::Kill | RegState::Undef);
+ auto EOR =
+ buildMI(MBB, MBBI, AVR::EORRdRr)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, RegState::Kill | RegState::Undef)
+ .addReg(DstHiReg, RegState::Kill | RegState::Undef);
if (ImpIsDead)
EOR->getOperand(3).setIsDead();
@@ -2054,15 +2076,15 @@ bool AVRExpandPseudo::expand<AVR::SPREAD>(Block &MBB, BlockIt MBBI) {
// Low part
buildMI(MBB, MBBI, OpLo)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
- .addImm(0x3d)
- .setMIFlags(Flags);
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addImm(0x3d)
+ .setMIFlags(Flags);
// High part
buildMI(MBB, MBBI, OpHi)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
- .addImm(0x3e)
- .setMIFlags(Flags);
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addImm(0x3e)
+ .setMIFlags(Flags);
MI.eraseFromParent();
return true;
@@ -2078,26 +2100,26 @@ bool AVRExpandPseudo::expand<AVR::SPWRITE>(Block &MBB, BlockIt MBBI) {
TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
buildMI(MBB, MBBI, AVR::INRdA)
- .addReg(AVR::R0, RegState::Define)
- .addImm(SREG_ADDR)
- .setMIFlags(Flags);
+ .addReg(AVR::R0, RegState::Define)
+ .addImm(SREG_ADDR)
+ .setMIFlags(Flags);
buildMI(MBB, MBBI, AVR::BCLRs).addImm(0x07).setMIFlags(Flags);
buildMI(MBB, MBBI, AVR::OUTARr)
- .addImm(0x3e)
- .addReg(SrcHiReg, getKillRegState(SrcIsKill))
- .setMIFlags(Flags);
+ .addImm(0x3e)
+ .addReg(SrcHiReg, getKillRegState(SrcIsKill))
+ .setMIFlags(Flags);
buildMI(MBB, MBBI, AVR::OUTARr)
- .addImm(SREG_ADDR)
- .addReg(AVR::R0, RegState::Kill)
- .setMIFlags(Flags);
+ .addImm(SREG_ADDR)
+ .addReg(AVR::R0, RegState::Kill)
+ .setMIFlags(Flags);
buildMI(MBB, MBBI, AVR::OUTARr)
- .addImm(0x3d)
- .addReg(SrcLoReg, getKillRegState(SrcIsKill))
- .setMIFlags(Flags);
+ .addImm(0x3d)
+ .addReg(SrcLoReg, getKillRegState(SrcIsKill))
+ .setMIFlags(Flags);
MI.eraseFromParent();
return true;
@@ -2107,8 +2129,8 @@ bool AVRExpandPseudo::expandMI(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
int Opcode = MBBI->getOpcode();
-#define EXPAND(Op) \
- case Op: \
+#define EXPAND(Op) \
+ case Op: \
return expand<Op>(MBB, MI)
switch (Opcode) {
@@ -2132,7 +2154,7 @@ bool AVRExpandPseudo::expandMI(Block &MBB, BlockIt MBBI) {
EXPAND(AVR::LDWRdPtr);
EXPAND(AVR::LDWRdPtrPi);
EXPAND(AVR::LDWRdPtrPd);
- case AVR::LDDWRdYQ: //:FIXME: remove this once PR13375 gets fixed
+ case AVR::LDDWRdYQ: //: FIXME: remove this once PR13375 gets fixed
EXPAND(AVR::LDDWRdPtrQ);
EXPAND(AVR::LPMWRdZ);
EXPAND(AVR::LPMWRdZPi);
@@ -2184,8 +2206,8 @@ bool AVRExpandPseudo::expandMI(Block &MBB, BlockIt MBBI) {
} // end of anonymous namespace
-INITIALIZE_PASS(AVRExpandPseudo, "avr-expand-pseudo",
- AVR_EXPAND_PSEUDO_NAME, false, false)
+INITIALIZE_PASS(AVRExpandPseudo, "avr-expand-pseudo", AVR_EXPAND_PSEUDO_NAME,
+ false, false)
namespace llvm {
FunctionPass *createAVRExpandPseudoPass() { return new AVRExpandPseudo(); }
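
A pattern worth calling out across the expansions in this file: each 16-bit pseudo is lowered to a low-byte and a high-byte 8-bit instruction, immediates are split into Lo8/Hi8, and the displacement forms cap the base offset at 62 because the high byte is addressed at Imm + 1 and the encoding only reaches 63. A minimal standalone sketch of that arithmetic (plain C++, not LLVM code; the values are chosen arbitrarily):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Lo/Hi split used by the 16-bit pseudo expansions: one 8-bit op handles the
// low byte, a second handles the high byte, and a displacement D is only
// legal when D + 1 still fits the 6-bit field (<= 63).
int main() {
  uint16_t Value = 0x1234;
  uint8_t Lo8 = Value & 0xff;        // operand of the OpLo instruction
  uint8_t Hi8 = (Value >> 8) & 0xff; // operand of the OpHi instruction

  unsigned Imm = 62; // base displacement used for the low byte
  assert(Imm <= 62 && "high byte is addressed at Imm + 1, which must be <= 63");

  std::printf("lo=0x%02x hi=0x%02x hi-offset=%u\n", (unsigned)Lo8,
              (unsigned)Hi8, Imm + 1);
  return 0;
}
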
diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/llvm/lib/Target/AVR/AVRFrameLowering.cpp
index 89ed30e8bcdb..672611ea2234 100644
--- a/llvm/lib/Target/AVR/AVRFrameLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRFrameLowering.cpp
@@ -111,9 +111,8 @@ void AVRFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
// Mark the FramePtr as live-in in every block except the entry.
- for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
- I != E; ++I) {
- I->addLiveIn(AVR::R29R28);
+ for (MachineBasicBlock &MBBJ : llvm::drop_begin(MF)) {
+ MBBJ.addLiveIn(AVR::R29R28);
}
if (!FrameSize) {
@@ -304,16 +303,16 @@ static void fixStackStores(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const TargetInstrInfo &TII, Register FP) {
// Iterate through the BB until we hit a call instruction or we reach the end.
- for (auto I = MI, E = MBB.end(); I != E && !I->isCall();) {
- MachineBasicBlock::iterator NextMI = std::next(I);
- MachineInstr &MI = *I;
- unsigned Opcode = I->getOpcode();
+ for (MachineInstr &MI :
+ llvm::make_early_inc_range(llvm::make_range(MI, MBB.end()))) {
+ if (MI.isCall())
+ break;
+
+ unsigned Opcode = MI.getOpcode();
// Only care of pseudo store instructions where SP is the base pointer.
- if (Opcode != AVR::STDSPQRr && Opcode != AVR::STDWSPQRr) {
- I = NextMI;
+ if (Opcode != AVR::STDSPQRr && Opcode != AVR::STDWSPQRr)
continue;
- }
assert(MI.getOperand(0).getReg() == AVR::SP &&
"Invalid register, should be SP!");
@@ -325,8 +324,6 @@ static void fixStackStores(MachineBasicBlock &MBB,
MI.setDesc(TII.get(STOpc));
MI.getOperand(0).setReg(FP);
-
- I = NextMI;
}
}
@@ -361,13 +358,13 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr(
// values, etc) is tricky and thus left to be optimized in the future.
BuildMI(MBB, MI, DL, TII.get(AVR::SPREAD), AVR::R31R30).addReg(AVR::SP);
- MachineInstr *New = BuildMI(MBB, MI, DL, TII.get(AVR::SUBIWRdK), AVR::R31R30)
- .addReg(AVR::R31R30, RegState::Kill)
- .addImm(Amount);
+ MachineInstr *New =
+ BuildMI(MBB, MI, DL, TII.get(AVR::SUBIWRdK), AVR::R31R30)
+ .addReg(AVR::R31R30, RegState::Kill)
+ .addImm(Amount);
New->getOperand(3).setIsDead();
- BuildMI(MBB, MI, DL, TII.get(AVR::SPWRITE), AVR::SP)
- .addReg(AVR::R31R30);
+ BuildMI(MBB, MI, DL, TII.get(AVR::SPWRITE), AVR::SP).addReg(AVR::R31R30);
// Make sure the remaining stack stores are converted to real store
// instructions.
@@ -536,4 +533,3 @@ char AVRDynAllocaSR::ID = 0;
FunctionPass *createAVRDynAllocaSRPass() { return new AVRDynAllocaSR(); }
} // end of namespace llvm
-
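
The fixStackStores rewrite above swaps manual NextMI bookkeeping for llvm::make_early_inc_range, which captures the successor iterator before yielding each element so the loop body may rewrite or drop the current one. A standalone analogue of that idiom with std::list (it assumes nothing about the LLVM types; container and values are illustrative only):

#include <cstdio>
#include <iterator>
#include <list>

// Early-increment iteration: capture the successor before the body runs, so
// the current element can be erased or rewritten without breaking traversal.
int main() {
  std::list<int> Ops = {1, 2, 3, 4, 5};

  for (auto I = Ops.begin(), E = Ops.end(); I != E;) {
    auto Next = std::next(I); // what make_early_inc_range does up front
    if (*I % 2 == 0)
      Ops.erase(I); // safe: Next was taken before the erase
    I = Next;
  }

  for (int V : Ops)
    std::printf("%d ", V); // prints: 1 3 5
  std::printf("\n");
  return 0;
}
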
diff --git a/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp b/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
index df382d553753..7ec2629ab45d 100644
--- a/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
@@ -156,9 +156,9 @@ bool AVRDAGToDAGISel::selectIndexedLoad(SDNode *N) {
return false;
}
- SDNode *ResNode = CurDAG->getMachineNode(Opcode, SDLoc(N), VT,
- PtrVT, MVT::Other,
- LD->getBasePtr(), LD->getChain());
+ SDNode *ResNode =
+ CurDAG->getMachineNode(Opcode, SDLoc(N), VT, PtrVT, MVT::Other,
+ LD->getBasePtr(), LD->getChain());
ReplaceUses(N, ResNode);
CurDAG->RemoveDeadNode(N);
@@ -199,12 +199,11 @@ unsigned AVRDAGToDAGISel::selectIndexedProgMemLoad(const LoadSDNode *LD,
return Opcode;
}
-bool AVRDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
- unsigned ConstraintCode,
- std::vector<SDValue> &OutOps) {
+bool AVRDAGToDAGISel::SelectInlineAsmMemoryOperand(
+ const SDValue &Op, unsigned ConstraintCode, std::vector<SDValue> &OutOps) {
assert((ConstraintCode == InlineAsm::Constraint_m ||
- ConstraintCode == InlineAsm::Constraint_Q) &&
- "Unexpected asm memory constraint");
+ ConstraintCode == InlineAsm::Constraint_Q) &&
+ "Unexpected asm memory constraint");
MachineRegisterInfo &RI = MF->getRegInfo();
const AVRSubtarget &STI = MF->getSubtarget<AVRSubtarget>();
@@ -276,7 +275,8 @@ bool AVRDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
}
if (ImmNode->getValueType(0) != MVT::i8) {
- Disp = CurDAG->getTargetConstant(ImmNode->getAPIntValue().getZExtValue(), dl, MVT::i8);
+ Disp = CurDAG->getTargetConstant(
+ ImmNode->getAPIntValue().getZExtValue(), dl, MVT::i8);
} else {
Disp = ImmOp;
}
@@ -309,11 +309,10 @@ template <> bool AVRDAGToDAGISel::select<ISD::FrameIndex>(SDNode *N) {
// effective address of the final stack slot.
int FI = cast<FrameIndexSDNode>(N)->getIndex();
SDValue TFI =
- CurDAG->getTargetFrameIndex(FI, getTargetLowering()->getPointerTy(DL));
+ CurDAG->getTargetFrameIndex(FI, getTargetLowering()->getPointerTy(DL));
- CurDAG->SelectNodeTo(N, AVR::FRMIDX,
- getTargetLowering()->getPointerTy(DL), TFI,
- CurDAG->getTargetConstant(0, SDLoc(N), MVT::i16));
+ CurDAG->SelectNodeTo(N, AVR::FRMIDX, getTargetLowering()->getPointerTy(DL),
+ TFI, CurDAG->getTargetConstant(0, SDLoc(N), MVT::i16));
return true;
}
@@ -380,8 +379,8 @@ template <> bool AVRDAGToDAGISel::select<ISD::LOAD>(SDNode *N) {
// Check if the opcode can be converted into an indexed load.
if (unsigned LPMOpc = selectIndexedProgMemLoad(LD, VT)) {
// It is legal to fold the load into an indexed load.
- ResNode = CurDAG->getMachineNode(LPMOpc, DL, VT, MVT::i16, MVT::Other, Ptr,
- RegZ);
+ ResNode =
+ CurDAG->getMachineNode(LPMOpc, DL, VT, MVT::i16, MVT::Other, Ptr, RegZ);
ReplaceUses(SDValue(N, 1), SDValue(ResNode, 1));
} else {
// Selecting an indexed load is not legal, fallback to a normal load.
@@ -391,8 +390,8 @@ template <> bool AVRDAGToDAGISel::select<ISD::LOAD>(SDNode *N) {
Ptr, RegZ);
break;
case MVT::i16:
- ResNode = CurDAG->getMachineNode(AVR::LPMWRdZ, DL, MVT::i16,
- MVT::Other, Ptr, RegZ);
+ ResNode = CurDAG->getMachineNode(AVR::LPMWRdZ, DL, MVT::i16, MVT::Other,
+ Ptr, RegZ);
ReplaceUses(SDValue(N, 1), SDValue(ResNode, 1));
break;
default:
@@ -441,7 +440,7 @@ template <> bool AVRDAGToDAGISel::select<AVRISD::CALL>(SDNode *N) {
Ops.push_back(Chain.getValue(1));
SDNode *ResNode =
- CurDAG->getMachineNode(AVR::ICALL, DL, MVT::Other, MVT::Glue, Ops);
+ CurDAG->getMachineNode(AVR::ICALL, DL, MVT::Other, MVT::Glue, Ops);
ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0));
ReplaceUses(SDValue(N, 1), SDValue(ResNode, 1));
@@ -532,16 +531,23 @@ bool AVRDAGToDAGISel::trySelect(SDNode *N) {
switch (Opcode) {
// Nodes we fully handle.
- case ISD::FrameIndex: return select<ISD::FrameIndex>(N);
- case ISD::BRIND: return select<ISD::BRIND>(N);
+ case ISD::FrameIndex:
+ return select<ISD::FrameIndex>(N);
+ case ISD::BRIND:
+ return select<ISD::BRIND>(N);
case ISD::UMUL_LOHI:
- case ISD::SMUL_LOHI: return selectMultiplication(N);
+ case ISD::SMUL_LOHI:
+ return selectMultiplication(N);
// Nodes we handle partially. Other cases are autogenerated
- case ISD::STORE: return select<ISD::STORE>(N);
- case ISD::LOAD: return select<ISD::LOAD>(N);
- case AVRISD::CALL: return select<AVRISD::CALL>(N);
- default: return false;
+ case ISD::STORE:
+ return select<ISD::STORE>(N);
+ case ISD::LOAD:
+ return select<ISD::LOAD>(N);
+ case AVRISD::CALL:
+ return select<AVRISD::CALL>(N);
+ default:
+ return false;
}
}
@@ -551,4 +557,3 @@ FunctionPass *createAVRISelDag(AVRTargetMachine &TM,
}
} // end of namespace llvm
-
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index 58a7aed91cdf..a6f2afb87102 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -13,8 +13,8 @@
#include "AVRISelLowering.h"
-#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -232,8 +232,8 @@ AVRTargetLowering::AVRTargetLowering(const AVRTargetMachine &TM,
}
const char *AVRTargetLowering::getTargetNodeName(unsigned Opcode) const {
-#define NODE(name) \
- case AVRISD::name: \
+#define NODE(name) \
+ case AVRISD::name: \
return #name
switch (Opcode) {
@@ -269,7 +269,7 @@ EVT AVRTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
}
SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
- //:TODO: this function has to be completely rewritten to produce optimal
+ //: TODO: this function has to be completely rewritten to produce optimal
// code, for now it's producing very long but correct code.
unsigned Opc8;
const SDNode *N = Op.getNode();
@@ -527,7 +527,8 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS,
assert((LHS.getSimpleValueType() == RHS.getSimpleValueType()) &&
"LHS and RHS have different types");
assert(((LHS.getSimpleValueType() == MVT::i16) ||
- (LHS.getSimpleValueType() == MVT::i8)) && "invalid comparison type");
+ (LHS.getSimpleValueType() == MVT::i8)) &&
+ "invalid comparison type");
SDValue Cmp;
@@ -856,7 +857,8 @@ void AVRTargetLowering::ReplaceNodeResults(SDNode *N,
/// by AM is legal for this target, for a load/store of the specified type.
bool AVRTargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
- unsigned AS, Instruction *I) const {
+ unsigned AS,
+ Instruction *I) const {
int64_t Offs = AM.BaseOffs;
// Allow absolute addresses.
@@ -1003,14 +1005,13 @@ static const MCPhysReg RegList8[] = {
AVR::R19, AVR::R18, AVR::R17, AVR::R16, AVR::R15, AVR::R14,
AVR::R13, AVR::R12, AVR::R11, AVR::R10, AVR::R9, AVR::R8};
static const MCPhysReg RegList16[] = {
- AVR::R26R25, AVR::R25R24, AVR::R24R23, AVR::R23R22,
- AVR::R22R21, AVR::R21R20, AVR::R20R19, AVR::R19R18,
- AVR::R18R17, AVR::R17R16, AVR::R16R15, AVR::R15R14,
- AVR::R14R13, AVR::R13R12, AVR::R12R11, AVR::R11R10,
- AVR::R10R9, AVR::R9R8};
+ AVR::R26R25, AVR::R25R24, AVR::R24R23, AVR::R23R22, AVR::R22R21,
+ AVR::R21R20, AVR::R20R19, AVR::R19R18, AVR::R18R17, AVR::R17R16,
+ AVR::R16R15, AVR::R15R14, AVR::R14R13, AVR::R13R12, AVR::R12R11,
+ AVR::R11R10, AVR::R10R9, AVR::R9R8};
static_assert(array_lengthof(RegList8) == array_lengthof(RegList16),
- "8-bit and 16-bit register arrays must be of equal length");
+ "8-bit and 16-bit register arrays must be of equal length");
/// Analyze incoming and outgoing function arguments. We need custom C++ code
/// to handle special constraints in the ABI.
@@ -1084,10 +1085,11 @@ analyzeArguments(TargetLowering::CallLoweringInfo *CLI, const Function *F,
/// Count the total number of bytes needed to pass or return these arguments.
template <typename ArgT>
-static unsigned getTotalArgumentsSizeInBytes(const SmallVectorImpl<ArgT> &Args) {
+static unsigned
+getTotalArgumentsSizeInBytes(const SmallVectorImpl<ArgT> &Args) {
unsigned TotalBytes = 0;
- for (const ArgT& Arg : Args) {
+ for (const ArgT &Arg : Args) {
TotalBytes += Arg.VT.getStoreSize();
}
return TotalBytes;
@@ -1102,7 +1104,8 @@ static void analyzeReturnValues(const SmallVectorImpl<ArgT> &Args,
unsigned NumArgs = Args.size();
unsigned TotalBytes = getTotalArgumentsSizeInBytes(Args);
// CanLowerReturn() guarantees this assertion.
- assert(TotalBytes <= 8 && "return values greater than 8 bytes cannot be lowered");
+ assert(TotalBytes <= 8 &&
+ "return values greater than 8 bytes cannot be lowered");
// GCC-ABI says that the size is rounded up to the next even number,
// but actually once it is more than 4 it will always round up to 8.
@@ -1197,7 +1200,7 @@ SDValue AVRTargetLowering::LowerFormalArguments(
InVals.push_back(ArgValue);
} else {
- // Sanity check.
+ // Only arguments passed on the stack should make it here.
assert(VA.isMemLoc());
EVT LocVT = VA.getLocVT();
@@ -1406,8 +1409,8 @@ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
///
SDValue AVRTargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
@@ -1488,17 +1491,14 @@ AVRTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Don't emit the ret/reti instruction when the naked attribute is present in
// the function being compiled.
- if (MF.getFunction().getAttributes().hasAttribute(
- AttributeList::FunctionIndex, Attribute::Naked)) {
+ if (MF.getFunction().getAttributes().hasFnAttr(Attribute::Naked)) {
return Chain;
}
const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
unsigned RetOpc =
- AFI->isInterruptOrSignalHandler()
- ? AVRISD::RETI_FLAG
- : AVRISD::RET_FLAG;
+ AFI->isInterruptOrSignalHandler() ? AVRISD::RETI_FLAG : AVRISD::RET_FLAG;
RetOps[0] = Chain; // Update chain.
@@ -1572,8 +1572,10 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction::iterator I;
- for (I = BB->getIterator(); I != F->end() && &(*I) != BB; ++I);
- if (I != F->end()) ++I;
+ for (I = BB->getIterator(); I != F->end() && &(*I) != BB; ++I)
+ ;
+ if (I != F->end())
+ ++I;
// Create loop block.
MachineBasicBlock *LoopBB = F->CreateMachineBasicBlock(LLVM_BB);
@@ -1636,8 +1638,7 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
.addReg(ShiftReg2)
.addMBB(LoopBB);
- BuildMI(CheckBB, dl, TII.get(AVR::DECRd), ShiftAmtReg2)
- .addReg(ShiftAmtReg);
+ BuildMI(CheckBB, dl, TII.get(AVR::DECRd), ShiftAmtReg2).addReg(ShiftAmtReg);
BuildMI(CheckBB, dl, TII.get(AVR::BRPLk)).addMBB(LoopBB);
MI.eraseFromParent(); // The pseudo instruction is gone now.
@@ -1725,8 +1726,10 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *falseMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator I;
- for (I = MF->begin(); I != MF->end() && &(*I) != MBB; ++I);
- if (I != MF->end()) ++I;
+ for (I = MF->begin(); I != MF->end() && &(*I) != MBB; ++I)
+ ;
+ if (I != MF->end())
+ ++I;
MF->insert(I, trueMBB);
MF->insert(I, falseMBB);
@@ -1748,11 +1751,12 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
falseMBB->addSuccessor(trueMBB);
// Set up the Phi node to determine where we came from
- BuildMI(*trueMBB, trueMBB->begin(), dl, TII.get(AVR::PHI), MI.getOperand(0).getReg())
- .addReg(MI.getOperand(1).getReg())
- .addMBB(MBB)
- .addReg(MI.getOperand(2).getReg())
- .addMBB(falseMBB) ;
+ BuildMI(*trueMBB, trueMBB->begin(), dl, TII.get(AVR::PHI),
+ MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(MBB)
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(falseMBB);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return trueMBB;
@@ -1779,9 +1783,12 @@ AVRTargetLowering::getConstraintType(StringRef Constraint) const {
case 'w': // Special upper register pairs
return C_RegisterClass;
case 't': // Temporary register
- case 'x': case 'X': // Pointer register pair X
- case 'y': case 'Y': // Pointer register pair Y
- case 'z': case 'Z': // Pointer register pair Z
+ case 'x':
+ case 'X': // Pointer register pair X
+ case 'y':
+ case 'Y': // Pointer register pair Y
+ case 'z':
+ case 'Z': // Pointer register pair Z
return C_Register;
case 'Q': // A memory address based on Y or Z pointer with displacement.
return C_Memory;
@@ -1842,9 +1849,12 @@ AVRTargetLowering::getSingleConstraintMatchWeight(
case 'q':
case 't':
case 'w':
- case 'x': case 'X':
- case 'y': case 'Y':
- case 'z': case 'Z':
+ case 'x':
+ case 'X':
+ case 'y':
+ case 'Y':
+ case 'z':
+ case 'Z':
weight = CW_SpecificReg;
break;
case 'G':
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.h b/llvm/lib/Target/AVR/AVRISelLowering.h
index 8130cf045fa8..3ae036b66bcb 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.h
+++ b/llvm/lib/Target/AVR/AVRISelLowering.h
@@ -133,11 +133,11 @@ public:
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
- Register getRegisterByName(const char* RegName, LLT VT,
+ Register getRegisterByName(const char *RegName, LLT VT,
const MachineFunction &MF) const override;
- bool shouldSplitFunctionArgumentsAsLittleEndian(const DataLayout &DL)
- const override {
+ bool shouldSplitFunctionArgumentsAsLittleEndian(
+ const DataLayout &DL) const override {
return false;
}
@@ -179,7 +179,6 @@ private:
SmallVectorImpl<SDValue> &InVals) const;
protected:
-
const AVRSubtarget &Subtarget;
private:
diff --git a/llvm/lib/Target/AVR/AVRInstrFormats.td b/llvm/lib/Target/AVR/AVRInstrFormats.td
index 6eb49076efb0..2bcbcdfbf925 100644
--- a/llvm/lib/Target/AVR/AVRInstrFormats.td
+++ b/llvm/lib/Target/AVR/AVRInstrFormats.td
@@ -11,8 +11,8 @@
//===----------------------------------------------------------------------===//
// A generic AVR instruction.
-class AVRInst<dag outs, dag ins, string asmstr, list<dag> pattern> : Instruction
-{
+class AVRInst<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Instruction {
let Namespace = "AVR";
dag OutOperandList = outs;
@@ -25,8 +25,7 @@ class AVRInst<dag outs, dag ins, string asmstr, list<dag> pattern> : Instruction
/// A 16-bit AVR instruction.
class AVRInst16<dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst<outs, ins, asmstr, pattern>
-{
+ : AVRInst<outs, ins, asmstr, pattern> {
field bits<16> Inst;
let Size = 2;
@@ -34,8 +33,7 @@ class AVRInst16<dag outs, dag ins, string asmstr, list<dag> pattern>
/// a 32-bit AVR instruction.
class AVRInst32<dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst<outs, ins, asmstr, pattern>
-{
+ : AVRInst<outs, ins, asmstr, pattern> {
field bits<32> Inst;
let Size = 4;
@@ -50,8 +48,7 @@ class AVRInst32<dag outs, dag ins, string asmstr, list<dag> pattern>
// is defined as a pseudo instruction. In AVRExpandPseudoInsts.cpp,
// the instruction is then replaced by two add instructions - one for each byte.
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
+ : AVRInst16<outs, ins, asmstr, pattern> {
let Pattern = pattern;
let isPseudo = 1;
@@ -67,28 +64,26 @@ class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
// (Accepts all registers)
//===----------------------------------------------------------------------===//
class FRdRr<bits<4> opcode, bits<2> f, dag outs, dag ins, string asmstr,
- list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern>
-{
+ list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern> {
bits<5> rd;
bits<5> rr;
- let Inst{15-12} = opcode;
- let Inst{11-10} = f;
+ let Inst{15 - 12} = opcode;
+ let Inst{11 - 10} = f;
let Inst{9} = rr{4};
- let Inst{8-4} = rd;
- let Inst{3-0} = rr{3-0};
+ let Inst{8 - 4} = rd;
+ let Inst{3 - 0} = rr{3 - 0};
}
class FTST<bits<4> opcode, bits<2> f, dag outs, dag ins, string asmstr,
- list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern>
-{
+ list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern> {
bits<5> rd;
- let Inst{15-12} = opcode;
- let Inst{11-10} = f;
+ let Inst{15 - 12} = opcode;
+ let Inst{11 - 10} = f;
let Inst{9} = rd{4};
- let Inst{8-4} = rd;
- let Inst{3-0} = rd{3-0};
+ let Inst{8 - 4} = rd;
+ let Inst{3 - 0} = rd{3 - 0};
}
//===----------------------------------------------------------------------===//
@@ -96,19 +91,18 @@ class FTST<bits<4> opcode, bits<2> f, dag outs, dag ins, string asmstr,
// <|1001|001r|rrrr|0ttt>
//===----------------------------------------------------------------------===//
class FZRd<bits<3> t, dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
+ : AVRInst16<outs, ins, asmstr, pattern> {
bits<5> rd;
- let Inst{15-12} = 0b1001;
+ let Inst{15 - 12} = 0b1001;
- let Inst{11-9} = 0b001;
+ let Inst{11 - 9} = 0b001;
let Inst{8} = rd{4};
- let Inst{7-4} = rd{3-0};
+ let Inst{7 - 4} = rd{3 - 0};
let Inst{3} = 0;
- let Inst{2-0} = t;
+ let Inst{2 - 0} = t;
}
//===----------------------------------------------------------------------===//
@@ -119,15 +113,14 @@ class FZRd<bits<3> t, dag outs, dag ins, string asmstr, list<dag> pattern>
// (Only accepts r16-r31)
//===----------------------------------------------------------------------===//
class FRdK<bits<4> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
+ : AVRInst16<outs, ins, asmstr, pattern> {
bits<4> rd;
bits<8> k;
- let Inst{15-12} = opcode;
- let Inst{11-8} = k{7-4};
- let Inst{7-4} = rd{3-0};
- let Inst{3-0} = k{3-0};
+ let Inst{15 - 12} = opcode;
+ let Inst{11 - 8} = k{7 - 4};
+ let Inst{7 - 4} = rd{3 - 0};
+ let Inst{3 - 0} = k{3 - 0};
let isAsCheapAsAMove = 1;
}
@@ -140,14 +133,13 @@ class FRdK<bits<4> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
// (Accepts all registers)
//===----------------------------------------------------------------------===//
class FRd<bits<4> opcode, bits<7> f, dag outs, dag ins, string asmstr,
- list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern>
-{
+ list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern> {
bits<5> d;
- let Inst{15-12} = opcode;
- let Inst{11-9} = f{6-4};
- let Inst{8-4} = d;
- let Inst{3-0} = f{3-0};
+ let Inst{15 - 12} = opcode;
+ let Inst{11 - 9} = f{6 - 4};
+ let Inst{8 - 4} = d;
+ let Inst{3 - 0} = f{3 - 0};
let DecoderMethod = "decodeFRd";
}
@@ -160,23 +152,22 @@ class FRd<bits<4> opcode, bits<7> f, dag outs, dag ins, string asmstr,
// p = pointer register (1 bit) [1 for Y, 0 for Z]
//===----------------------------------------------------------------------===//
class FSTDLDD<bit type, dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
+ : AVRInst16<outs, ins, asmstr, pattern> {
bits<7> memri;
bits<5> reg; // the GP register
- let Inst{15-14} = 0b10;
+ let Inst{15 - 14} = 0b10;
let Inst{13} = memri{5};
let Inst{12} = 0;
- let Inst{11-10} = memri{4-3};
+ let Inst{11 - 10} = memri{4 - 3};
let Inst{9} = type;
let Inst{8} = reg{4};
- let Inst{7-4} = reg{3-0};
+ let Inst{7 - 4} = reg{3 - 0};
let Inst{3} = memri{6};
- let Inst{2-0} = memri{2-0};
+ let Inst{2 - 0} = memri{2 - 0};
}
//===---------------------------------------------------------------------===//
@@ -190,26 +181,24 @@ class FSTDLDD<bit type, dag outs, dag ins, string asmstr, list<dag> pattern>
// Note that the bit labelled 'i' above does not follow a simple pattern,
// so there exists a post encoder method to set it manually.
//===---------------------------------------------------------------------===//
-class FSTLD<bit type, bits<2> mode, dag outs, dag ins,
- string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
+class FSTLD<bit type, bits<2> mode, dag outs, dag ins, string asmstr,
+ list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern> {
bits<2> ptrreg;
bits<5> reg;
- let Inst{15-13} = 0b100;
+ let Inst{15 - 13} = 0b100;
// This bit varies depending on the arguments and the mode.
// We have a post encoder method to set this bit manually.
let Inst{12} = 0;
- let Inst{11-10} = 0b00;
+ let Inst{11 - 10} = 0b00;
let Inst{9} = type;
let Inst{8} = reg{4};
- let Inst{7-4} = reg{3-0};
+ let Inst{7 - 4} = reg{3 - 0};
- let Inst{3-2} = ptrreg{1-0};
- let Inst{1-0} = mode{1-0};
+ let Inst{3 - 2} = ptrreg{1 - 0};
+ let Inst{1 - 0} = mode{1 - 0};
let PostEncoderMethod = "loadStorePostEncoder";
}
@@ -223,22 +212,21 @@ class FSTLD<bit type, bits<2> mode, dag outs, dag ins,
// p = is postincrement
//===---------------------------------------------------------------------===//
class FLPMX<bit e, bit p, dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
- bits<5> reg;
+ : AVRInst16<outs, ins, asmstr, pattern> {
+ bits<5> reg;
- let Inst{15-12} = 0b1001;
+ let Inst{15 - 12} = 0b1001;
- let Inst{11-9} = 0b000;
- let Inst{8} = reg{4};
+ let Inst{11 - 9} = 0b000;
+ let Inst{8} = reg{4};
- let Inst{7-4} = reg{3-0};
+ let Inst{7 - 4} = reg{3 - 0};
- let Inst{3-2} = 0b01;
- let Inst{1} = e;
- let Inst{0} = p;
+ let Inst{3 - 2} = 0b01;
+ let Inst{1} = e;
+ let Inst{0} = p;
- let DecoderMethod = "decodeFLPMX";
+ let DecoderMethod = "decodeFLPMX";
}
//===----------------------------------------------------------------------===//
@@ -248,14 +236,13 @@ class FLPMX<bit e, bit p, dag outs, dag ins, string asmstr, list<dag> pattern>
// (Only accepts even registers)
//===----------------------------------------------------------------------===//
class FMOVWRdRr<dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
+ : AVRInst16<outs, ins, asmstr, pattern> {
bits<5> d;
bits<5> r;
- let Inst{15-8} = 0b00000001;
- let Inst{7-4} = d{4-1};
- let Inst{3-0} = r{4-1};
+ let Inst{15 - 8} = 0b00000001;
+ let Inst{7 - 4} = d{4 - 1};
+ let Inst{3 - 0} = r{4 - 1};
let DecoderMethod = "decodeFMOVWRdRr";
}
@@ -267,15 +254,14 @@ class FMOVWRdRr<dag outs, dag ins, string asmstr, list<dag> pattern>
// (Only accepts r16-r31)
//===----------------------------------------------------------------------===//
class FMUL2RdRr<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
- bits<5> rd; // accept 5 bits but only encode the lower 4
- bits<5> rr; // accept 5 bits but only encode the lower 4
+ : AVRInst16<outs, ins, asmstr, pattern> {
+ bits<5> rd; // accept 5 bits but only encode the lower 4
+ bits<5> rr; // accept 5 bits but only encode the lower 4
- let Inst{15-9} = 0b0000001;
+ let Inst{15 - 9} = 0b0000001;
let Inst{8} = f;
- let Inst{7-4} = rd{3-0};
- let Inst{3-0} = rr{3-0};
+ let Inst{7 - 4} = rd{3 - 0};
+ let Inst{3 - 0} = rr{3 - 0};
let DecoderMethod = "decodeFMUL2RdRr";
}
@@ -291,21 +277,19 @@ class FMUL2RdRr<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
// ddd = destination register
// rrr = source register
class FFMULRdRr<bits<2> f, dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
+ : AVRInst16<outs, ins, asmstr, pattern> {
bits<3> rd;
bits<3> rr;
- let Inst{15-8} = 0b00000011;
+ let Inst{15 - 8} = 0b00000011;
let Inst{7} = f{1};
- let Inst{6-4} = rd;
+ let Inst{6 - 4} = rd;
let Inst{3} = f{0};
- let Inst{2-0} = rr;
+ let Inst{2 - 0} = rr;
let DecoderMethod = "decodeFFMULRdRr";
}
-
//===----------------------------------------------------------------------===//
// Arithmetic word instructions (ADIW / SBIW): <|1001|011f|kkdd|kkkk|>
// f = secondary opcode = 1 bit
@@ -314,16 +298,15 @@ class FFMULRdRr<bits<2> f, dag outs, dag ins, string asmstr, list<dag> pattern>
// (Only accepts r25:24 r27:26 r29:28 r31:30)
//===----------------------------------------------------------------------===//
class FWRdK<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
- bits<5> dst; // accept 5 bits but only encode bits 1 and 2
+ : AVRInst16<outs, ins, asmstr, pattern> {
+ bits<5> dst; // accept 5 bits but only encode bits 1 and 2
bits<6> k;
- let Inst{15-9} = 0b1001011;
+ let Inst{15 - 9} = 0b1001011;
let Inst{8} = f;
- let Inst{7-6} = k{5-4};
- let Inst{5-4} = dst{2-1};
- let Inst{3-0} = k{3-0};
+ let Inst{7 - 6} = k{5 - 4};
+ let Inst{5 - 4} = dst{2 - 1};
+ let Inst{3 - 0} = k{3 - 0};
let DecoderMethod = "decodeFWRdK";
}
@@ -335,15 +318,14 @@ class FWRdK<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
// (Accepts all registers)
//===----------------------------------------------------------------------===//
class FIORdA<dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
+ : AVRInst16<outs, ins, asmstr, pattern> {
bits<5> d;
bits<6> A;
- let Inst{15-11} = 0b10110;
- let Inst{10-9} = A{5-4};
- let Inst{8-4} = d;
- let Inst{3-0} = A{3-0};
+ let Inst{15 - 11} = 0b10110;
+ let Inst{10 - 9} = A{5 - 4};
+ let Inst{8 - 4} = d;
+ let Inst{3 - 0} = A{3 - 0};
let DecoderMethod = "decodeFIORdA";
}
@@ -355,15 +337,14 @@ class FIORdA<dag outs, dag ins, string asmstr, list<dag> pattern>
// (Accepts all registers)
//===----------------------------------------------------------------------===//
class FIOARr<dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
+ : AVRInst16<outs, ins, asmstr, pattern> {
bits<6> A;
bits<5> r;
- let Inst{15-11} = 0b10111;
- let Inst{10-9} = A{5-4};
- let Inst{8-4} = r;
- let Inst{3-0} = A{3-0};
+ let Inst{15 - 11} = 0b10111;
+ let Inst{10 - 9} = A{5 - 4};
+ let Inst{8 - 4} = r;
+ let Inst{3 - 0} = A{3 - 0};
let DecoderMethod = "decodeFIOARr";
}
@@ -376,20 +357,19 @@ class FIOARr<dag outs, dag ins, string asmstr, list<dag> pattern>
// b = bit number
//===----------------------------------------------------------------------===//
class FIOBIT<bits<2> t, dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
+ : AVRInst16<outs, ins, asmstr, pattern> {
bits<5> A;
bits<3> b;
- let Inst{15-12} = 0b1001;
+ let Inst{15 - 12} = 0b1001;
- let Inst{11-10} = 0b10;
- let Inst{9-8} = t;
+ let Inst{11 - 10} = 0b10;
+ let Inst{9 - 8} = t;
- let Inst{7-4} = A{4-1};
+ let Inst{7 - 4} = A{4 - 1};
let Inst{3} = A{0};
- let Inst{2-0} = b{2-0};
+ let Inst{2 - 0} = b{2 - 0};
let DecoderMethod = "decodeFIOBIT";
}
@@ -402,21 +382,20 @@ class FIOBIT<bits<2> t, dag outs, dag ins, string asmstr, list<dag> pattern>
// b = bit
//===----------------------------------------------------------------------===//
class FRdB<bits<2> t, dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
+ : AVRInst16<outs, ins, asmstr, pattern> {
bits<5> rd;
bits<3> b;
- let Inst{15-12} = 0b1111;
+ let Inst{15 - 12} = 0b1111;
let Inst{11} = 0b1;
- let Inst{10-9} = t;
+ let Inst{10 - 9} = t;
let Inst{8} = rd{4};
- let Inst{7-4} = rd{3-0};
+ let Inst{7 - 4} = rd{3 - 0};
let Inst{3} = 0;
- let Inst{2-0} = b;
+ let Inst{2 - 0} = b;
}
// Special encoding for the `DES K` instruction.
@@ -425,17 +404,16 @@ class FRdB<bits<2> t, dag outs, dag ins, string asmstr, list<dag> pattern>
//
// KKKK = 4 bit immediate
class FDES<dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
+ : AVRInst16<outs, ins, asmstr, pattern> {
bits<4> k;
- let Inst{15-12} = 0b1001;
+ let Inst{15 - 12} = 0b1001;
- let Inst{11-8} = 0b0100;
+ let Inst{11 - 8} = 0b0100;
- let Inst{7-4} = k;
+ let Inst{7 - 4} = k;
- let Inst{3-0} = 0b1011;
+ let Inst{3 - 0} = 0b1011;
}
//===----------------------------------------------------------------------===//
@@ -444,15 +422,14 @@ class FDES<dag outs, dag ins, string asmstr, list<dag> pattern>
// k = constant address = 7 bits
// s = bit in status register = 3 bits
//===----------------------------------------------------------------------===//
-class FBRsk<bit f, bits<3> s, dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
+class FBRsk<bit f, bits<3> s, dag outs, dag ins, string asmstr,
+ list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern> {
bits<7> k;
- let Inst{15-11} = 0b11110;
+ let Inst{15 - 11} = 0b11110;
let Inst{10} = f;
- let Inst{9-3} = k;
- let Inst{2-0} = s;
+ let Inst{9 - 3} = k;
+ let Inst{2 - 0} = s;
}
//===----------------------------------------------------------------------===//
@@ -460,14 +437,12 @@ class FBRsk<bit f, bits<3> s, dag outs, dag ins, string asmstr, list<dag> patter
//===----------------------------------------------------------------------===//
class F16<bits<16> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
+ : AVRInst16<outs, ins, asmstr, pattern> {
let Inst = opcode;
}
class F32<bits<32> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst32<outs, ins, asmstr, pattern>
-{
+ : AVRInst32<outs, ins, asmstr, pattern> {
let Inst = opcode;
}
@@ -477,13 +452,12 @@ class F32<bits<32> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
// k = constant address = 12 bits
//===----------------------------------------------------------------------===//
class FBRk<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
+ : AVRInst16<outs, ins, asmstr, pattern> {
bits<12> k;
- let Inst{15-13} = 0b110;
+ let Inst{15 - 13} = 0b110;
let Inst{12} = f;
- let Inst{11-0} = k;
+ let Inst{11 - 0} = k;
}
//===----------------------------------------------------------------------===//
@@ -492,14 +466,13 @@ class FBRk<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
// k = constant address = 22 bits
//===----------------------------------------------------------------------===//
class F32BRk<bits<3> f, dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst32<outs, ins, asmstr, pattern>
-{
+ : AVRInst32<outs, ins, asmstr, pattern> {
bits<22> k;
- let Inst{31-25} = 0b1001010;
- let Inst{24-20} = k{21-17};
- let Inst{19-17} = f;
- let Inst{16-0} = k{16-0};
+ let Inst{31 - 25} = 0b1001010;
+ let Inst{24 - 20} = k{21 - 17};
+ let Inst{19 - 17} = f;
+ let Inst{16 - 0} = k{16 - 0};
}
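
The F32BRk class above splits its 22-bit target around the 3-bit secondary opcode, which is easy to miss in the field assignments. A minimal sketch of that split (helper name assumed, not backend code):

  #include <cassert>
  #include <cstdint>

  // Sketch only: shows how F32BRk scatters the 22-bit JMP/CALL target
  // k around the 3-bit secondary opcode f.
  uint32_t encodeF32BRk(unsigned f, uint32_t k) {
    assert(f < 8 && k < (1u << 22));
    uint32_t inst = 0;
    inst |= 0b1001010u << 25;              // Inst{31-25}
    inst |= ((k >> 17) & 0b11111u) << 20;  // Inst{24-20} = k{21-17}
    inst |= (f & 0b111u) << 17;            // Inst{19-17} = f
    inst |= k & 0x1FFFFu;                  // Inst{16-0}  = k{16-0}
    return inst;                           // e.g. f=0b110, k=0 -> 0x940C0000 (jmp 0)
  }
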
//===----------------------------------------------------------------------===//
@@ -510,38 +483,36 @@ class F32BRk<bits<3> f, dag outs, dag ins, string asmstr, list<dag> pattern>
// (Accepts all registers)
//===----------------------------------------------------------------------===//
class F32DM<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst32<outs, ins, asmstr, pattern>
-{
+ : AVRInst32<outs, ins, asmstr, pattern> {
bits<5> rd;
bits<16> k;
- let Inst{31-28} = 0b1001;
+ let Inst{31 - 28} = 0b1001;
- let Inst{27-26} = 0b00;
+ let Inst{27 - 26} = 0b00;
let Inst{25} = f;
let Inst{24} = rd{4};
- let Inst{23-20} = rd{3-0};
+ let Inst{23 - 20} = rd{3 - 0};
- let Inst{19-16} = 0b0000;
+ let Inst{19 - 16} = 0b0000;
- let Inst{15-0} = k;
+ let Inst{15 - 0} = k;
}
// <|1001|0100|bfff|1000>
class FS<bit b, dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
+ : AVRInst16<outs, ins, asmstr, pattern> {
bits<3> s;
- let Inst{15-12} = 0b1001;
+ let Inst{15 - 12} = 0b1001;
- let Inst{11-8} = 0b0100;
+ let Inst{11 - 8} = 0b0100;
let Inst{7} = b;
- let Inst{6-4} = s;
+ let Inst{6 - 4} = s;
- let Inst{3-0} = 0b1000;
+ let Inst{3 - 0} = 0b1000;
}
// Set/clr bit in status flag instructions/
@@ -549,48 +520,42 @@ class FS<bit b, dag outs, dag ins, string asmstr, list<dag> pattern>
// ---------------------
// <|1111|0fkk|kkkk|ksss>
class FSK<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
- : AVRInst16<outs, ins, asmstr, pattern>
-{
+ : AVRInst16<outs, ins, asmstr, pattern> {
bits<7> k;
bits<3> s;
- let Inst{15-12} = 0b1111;
+ let Inst{15 - 12} = 0b1111;
let Inst{11} = 0;
let Inst{10} = f;
- let Inst{9-8} = k{6-5};
+ let Inst{9 - 8} = k{6 - 5};
- let Inst{7-4} = k{4-1};
+ let Inst{7 - 4} = k{4 - 1};
let Inst{3} = k{0};
- let Inst{2-0} = s;
+ let Inst{2 - 0} = s;
}
class ExtensionPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
- : Pseudo<outs, ins, asmstr, pattern>
-{
+ : Pseudo<outs, ins, asmstr, pattern> {
let Defs = [SREG];
}
class StorePseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
- : Pseudo<outs, ins, asmstr, pattern>
-{
+ : Pseudo<outs, ins, asmstr, pattern> {
let Defs = [SP];
}
class SelectPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
- : Pseudo<outs, ins, asmstr, pattern>
-{
+ : Pseudo<outs, ins, asmstr, pattern> {
let usesCustomInserter = 1;
let Uses = [SREG];
}
class ShiftPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
- : Pseudo<outs, ins, asmstr, pattern>
-{
+ : Pseudo<outs, ins, asmstr, pattern> {
let usesCustomInserter = 1;
let Defs = [SREG];
}
-
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp
index 06f07696bde3..798d08393eae 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp
@@ -20,9 +20,9 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include "AVR.h"
#include "AVRMachineFunctionInfo.h"
@@ -55,13 +55,13 @@ void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Register DestLo, DestHi, SrcLo, SrcHi;
TRI.splitReg(DestReg, DestLo, DestHi);
- TRI.splitReg(SrcReg, SrcLo, SrcHi);
+ TRI.splitReg(SrcReg, SrcLo, SrcHi);
// Copy each individual register with the `MOV` instruction.
BuildMI(MBB, MI, DL, get(AVR::MOVRdRr), DestLo)
- .addReg(SrcLo, getKillRegState(KillSrc));
+ .addReg(SrcLo, getKillRegState(KillSrc));
BuildMI(MBB, MI, DL, get(AVR::MOVRdRr), DestHi)
- .addReg(SrcHi, getKillRegState(KillSrc));
+ .addReg(SrcHi, getKillRegState(KillSrc));
}
} else {
if (AVR::GPR8RegClass.contains(DestReg, SrcReg)) {
@@ -83,7 +83,7 @@ unsigned AVRInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
switch (MI.getOpcode()) {
case AVR::LDDRdPtrQ:
- case AVR::LDDWRdYQ: { //:FIXME: remove this once PR13375 gets fixed
+ case AVR::LDDWRdYQ: { //: FIXME: remove this once PR13375 gets fixed
if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
MI.getOperand(2).getImm() == 0) {
FrameIndex = MI.getOperand(1).getIndex();
@@ -179,7 +179,7 @@ void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
Opcode = AVR::LDDRdPtrQ;
} else if (TRI->isTypeLegalForClass(*RC, MVT::i16)) {
// Opcode = AVR::LDDWRdPtrQ;
- //:FIXME: remove this once PR13375 gets fixed
+ //: FIXME: remove this once PR13375 gets fixed
Opcode = AVR::LDDWRdYQ;
} else {
llvm_unreachable("Cannot load this register from a stack slot!");
@@ -289,7 +289,7 @@ bool AVRInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
}
// Handle unconditional branches.
- //:TODO: add here jmp
+ //: TODO: add here jmp
if (I->getOpcode() == AVR::RJMPk) {
UnCondBrIter = I;
@@ -399,9 +399,9 @@ unsigned AVRInstrInfo::insertBranch(MachineBasicBlock &MBB,
MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
ArrayRef<MachineOperand> Cond,
- const DebugLoc &DL,
- int *BytesAdded) const {
- if (BytesAdded) *BytesAdded = 0;
+ const DebugLoc &DL, int *BytesAdded) const {
+ if (BytesAdded)
+ *BytesAdded = 0;
// Shouldn't be a fall through.
assert(TBB && "insertBranch must not be told to insert a fallthrough");
@@ -421,13 +421,15 @@ unsigned AVRInstrInfo::insertBranch(MachineBasicBlock &MBB,
AVRCC::CondCodes CC = (AVRCC::CondCodes)Cond[0].getImm();
auto &CondMI = *BuildMI(&MBB, DL, getBrCond(CC)).addMBB(TBB);
- if (BytesAdded) *BytesAdded += getInstSizeInBytes(CondMI);
+ if (BytesAdded)
+ *BytesAdded += getInstSizeInBytes(CondMI);
++Count;
if (FBB) {
// Two-way Conditional branch. Insert the second branch.
auto &MI = *BuildMI(&MBB, DL, get(AVR::RJMPk)).addMBB(FBB);
- if (BytesAdded) *BytesAdded += getInstSizeInBytes(MI);
+ if (BytesAdded)
+ *BytesAdded += getInstSizeInBytes(MI);
++Count;
}
@@ -436,7 +438,8 @@ unsigned AVRInstrInfo::insertBranch(MachineBasicBlock &MBB,
unsigned AVRInstrInfo::removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved) const {
- if (BytesRemoved) *BytesRemoved = 0;
+ if (BytesRemoved)
+ *BytesRemoved = 0;
MachineBasicBlock::iterator I = MBB.end();
unsigned Count = 0;
@@ -446,7 +449,7 @@ unsigned AVRInstrInfo::removeBranch(MachineBasicBlock &MBB,
if (I->isDebugInstr()) {
continue;
}
- //:TODO: add here the missing jmp instructions once they are implemented
+ //: TODO: add here the missing jmp instructions once they are implemented
// like jmp, {e}ijmp, and other cond branches, ...
if (I->getOpcode() != AVR::RJMPk &&
getCondFromBranchOpc(I->getOpcode()) == AVRCC::COND_INVALID) {
@@ -454,7 +457,8 @@ unsigned AVRInstrInfo::removeBranch(MachineBasicBlock &MBB,
}
// Remove the branch.
- if (BytesRemoved) *BytesRemoved += getInstSizeInBytes(*I);
+ if (BytesRemoved)
+ *BytesRemoved += getInstSizeInBytes(*I);
I->eraseFromParent();
I = MBB.end();
++Count;
@@ -490,7 +494,8 @@ unsigned AVRInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
case TargetOpcode::INLINEASM:
case TargetOpcode::INLINEASM_BR: {
const MachineFunction &MF = *MI.getParent()->getParent();
- const AVRTargetMachine &TM = static_cast<const AVRTargetMachine&>(MF.getTarget());
+ const AVRTargetMachine &TM =
+ static_cast<const AVRTargetMachine &>(MF.getTarget());
const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
const TargetInstrInfo &TII = *STI.getInstrInfo();
@@ -555,20 +560,19 @@ bool AVRInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
}
}
-unsigned AVRInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
- MachineBasicBlock &NewDestBB,
- const DebugLoc &DL,
- int64_t BrOffset,
- RegScavenger *RS) const {
- // This method inserts a *direct* branch (JMP), despite its name.
- // LLVM calls this method to fixup unconditional branches; it never calls
- // insertBranch or some hypothetical "insertDirectBranch".
- // See lib/CodeGen/RegisterRelaxation.cpp for details.
- // We end up here when a jump is too long for a RJMP instruction.
- auto &MI = *BuildMI(&MBB, DL, get(AVR::JMPk)).addMBB(&NewDestBB);
-
- return getInstSizeInBytes(MI);
+void AVRInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock &NewDestBB,
+ MachineBasicBlock &RestoreBB,
+ const DebugLoc &DL, int64_t BrOffset,
+ RegScavenger *RS) const {
+ // This method inserts a *direct* branch (JMP), despite its name.
+ // LLVM calls this method to fixup unconditional branches; it never calls
+ // insertBranch or some hypothetical "insertDirectBranch".
+ // See lib/CodeGen/RegisterRelaxation.cpp for details.
+ // We end up here when a jump is too long for a RJMP instruction.
+ BuildMI(&MBB, DL, get(AVR::JMPk)).addMBB(&NewDestBB);
+
+ return;
}
} // end of namespace llvm
-
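
The comment in insertIndirectBranch above covers the case where an unconditional branch no longer fits the RJMP encoding. As a rough, self-contained illustration (the helper name and byte-based interface are assumptions, not the backend's API): RJMP carries a signed 12-bit offset counted in 16-bit words, so the JMP fallback is needed once the displacement leaves roughly a +/-4 KiB window.

  #include <cstdint>

  // Sketch only: RJMP encodes a signed 12-bit word offset, so a
  // displacement outside [-2048, +2047] words forces the direct JMP
  // emitted by insertIndirectBranch.
  bool fitsInRJMP(int64_t BrOffsetBytes) {
    int64_t Words = BrOffsetBytes / 2;  // AVR program memory is word-addressed
    return Words >= -2048 && Words <= 2047;
  }
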
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.h b/llvm/lib/Target/AVR/AVRInstrInfo.h
index 11f45865de54..6d0596642fa1 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.h
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.h
@@ -107,11 +107,11 @@ public:
bool isBranchOffsetInRange(unsigned BranchOpc,
int64_t BrOffset) const override;
- unsigned insertIndirectBranch(MachineBasicBlock &MBB,
- MachineBasicBlock &NewDestBB,
- const DebugLoc &DL,
- int64_t BrOffset,
- RegScavenger *RS) const override;
+ void insertIndirectBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock &NewDestBB,
+ MachineBasicBlock &RestoreBB, const DebugLoc &DL,
+ int64_t BrOffset, RegScavenger *RS) const override;
+
private:
const AVRRegisterInfo RI;
};
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td
index c7c9656d3bfb..c7f423292da0 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.td
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -20,12 +20,13 @@ def SDT_AVRCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
def SDT_AVRCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
def SDT_AVRCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
def SDT_AVRWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
-def SDT_AVRBrcond : SDTypeProfile<0, 2,
- [SDTCisVT<0, OtherVT>, SDTCisVT<1, i8>]>;
+def SDT_AVRBrcond
+ : SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i8>]>;
def SDT_AVRCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
def SDT_AVRTst : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
-def SDT_AVRSelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
- SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+def SDT_AVRSelectCC
+ : SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
//===----------------------------------------------------------------------===//
// AVR Specific Node Definitions
@@ -46,12 +47,12 @@ def AVRcall : SDNode<"AVRISD::CALL", SDT_AVRCall,
def AVRWrapper : SDNode<"AVRISD::WRAPPER", SDT_AVRWrapper>;
-def AVRbrcond : SDNode<"AVRISD::BRCOND", SDT_AVRBrcond,
- [SDNPHasChain, SDNPInGlue]>;
+def AVRbrcond
+ : SDNode<"AVRISD::BRCOND", SDT_AVRBrcond, [SDNPHasChain, SDNPInGlue]>;
def AVRcmp : SDNode<"AVRISD::CMP", SDT_AVRCmp, [SDNPOutGlue]>;
def AVRcmpc : SDNode<"AVRISD::CMPC", SDT_AVRCmp, [SDNPInGlue, SDNPOutGlue]>;
def AVRtst : SDNode<"AVRISD::TST", SDT_AVRTst, [SDNPOutGlue]>;
-def AVRselectcc: SDNode<"AVRISD::SELECT_CC", SDT_AVRSelectCC, [SDNPInGlue]>;
+def AVRselectcc : SDNode<"AVRISD::SELECT_CC", SDT_AVRSelectCC, [SDNPInGlue]>;
// Shift nodes.
def AVRlsl : SDNode<"AVRISD::LSL", SDTIntUnaryOp>;
@@ -80,29 +81,31 @@ def AVRSwap : SDNode<"AVRISD::SWAP", SDTIntUnaryOp>;
// AVR Operands, Complex Patterns and Transformations Definitions.
//===----------------------------------------------------------------------===//
-def imm8_neg_XFORM : SDNodeXForm<imm,
-[{
- return CurDAG->getTargetConstant(-N->getAPIntValue(), SDLoc(N), MVT::i8);
-}]>;
+def imm8_neg_XFORM : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(
+ -N->getAPIntValue(), SDLoc(N), MVT::i8);
+ }]>;
-def imm16_neg_XFORM : SDNodeXForm<imm,
-[{
- return CurDAG->getTargetConstant(-N->getAPIntValue(), SDLoc(N), MVT::i16);
-}]>;
+def imm16_neg_XFORM
+ : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(-N->getAPIntValue(),
+ SDLoc(N), MVT::i16);
+ }]>;
-def imm0_63_neg : PatLeaf<(imm),
-[{
- int64_t val = -N->getSExtValue();
- return val >= 0 && val < 64;
-}], imm16_neg_XFORM>;
+def imm0_63_neg : PatLeaf<(imm), [{
+ int64_t val = -N->getSExtValue();
+ return val >= 0 && val < 64;
+ }],
+ imm16_neg_XFORM>;
def uimm6 : PatLeaf<(imm), [{ return isUInt<6>(N->getZExtValue()); }]>;
// imm_com8_XFORM - Return the complement of a imm_com8 value
-def imm_com8_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(~((uint8_t)N->getZExtValue()), SDLoc(N),
- MVT::i8);
-}]>;
+def imm_com8_XFORM
+ : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(
+ ~((uint8_t) N->getZExtValue()), SDLoc(N), MVT::i8);
+ }]>;
// imm_com8 - Match an immediate that is a complement
// of a 8-bit immediate.
@@ -110,59 +113,55 @@ def imm_com8_XFORM : SDNodeXForm<imm, [{
// only used on aliases (Pat<> and InstAlias<>). The actual encoding
// is handled by the destination instructions, which use imm_com8.
def imm_com8_asmoperand : AsmOperandClass { let Name = "ImmCom8"; }
-def imm_com8 : Operand<i8> {
- let ParserMatchClass = imm_com8_asmoperand;
-}
-
-def ioaddr_XFORM : SDNodeXForm<imm,
-[{
- uint8_t offset = Subtarget->getIORegisterOffset();
- return CurDAG->getTargetConstant(uint8_t(N->getZExtValue()) - offset,
- SDLoc(N), MVT::i8);
-}]>;
-
-def iobitpos8_XFORM : SDNodeXForm<imm,
-[{
- return CurDAG->getTargetConstant(Log2_32(uint8_t(N->getZExtValue())),
- SDLoc(N), MVT::i8);
-}]>;
-
-def iobitposn8_XFORM : SDNodeXForm<imm,
-[{
- return CurDAG->getTargetConstant(Log2_32(uint8_t(~N->getZExtValue())),
- SDLoc(N), MVT::i8);
-}]>;
-
-def ioaddr8 : PatLeaf<(imm),
-[{
- uint8_t offset = Subtarget->getIORegisterOffset();
- uint64_t val = N->getZExtValue() - offset;
- return val < 0x40;
-}], ioaddr_XFORM>;
-
-def lowioaddr8 : PatLeaf<(imm),
-[{
- uint8_t offset = Subtarget->getIORegisterOffset();
- uint64_t val = N->getZExtValue() - offset;
- return val < 0x20;
-}], ioaddr_XFORM>;
-
-def ioaddr16 : PatLeaf<(imm),
-[{
- uint8_t offset = Subtarget->getIORegisterOffset();
- uint64_t val = N->getZExtValue() - offset;
- return val < 0x3f;
-}], ioaddr_XFORM>;
-
-def iobitpos8 : PatLeaf<(imm),
-[{
- return isPowerOf2_32(uint8_t(N->getZExtValue()));
-}], iobitpos8_XFORM>;
-
-def iobitposn8 : PatLeaf<(imm),
-[{
- return isPowerOf2_32(uint8_t(~N->getZExtValue()));
-}], iobitposn8_XFORM>;
+def imm_com8 : Operand<i8> { let ParserMatchClass = imm_com8_asmoperand; }
+
+def ioaddr_XFORM
+ : SDNodeXForm<imm, [{
+ uint8_t offset = Subtarget->getIORegisterOffset();
+ return CurDAG->getTargetConstant(
+ uint8_t(N->getZExtValue()) - offset, SDLoc(N), MVT::i8);
+ }]>;
+
+def iobitpos8_XFORM
+ : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(
+ Log2_32(uint8_t(N->getZExtValue())), SDLoc(N), MVT::i8);
+ }]>;
+
+def iobitposn8_XFORM : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(
+ Log2_32(uint8_t(~N->getZExtValue())),
+ SDLoc(N), MVT::i8);
+ }]>;
+
+def ioaddr8 : PatLeaf<(imm), [{
+ uint8_t offset = Subtarget->getIORegisterOffset();
+ uint64_t val = N->getZExtValue() - offset;
+ return val < 0x40;
+ }],
+ ioaddr_XFORM>;
+
+def lowioaddr8 : PatLeaf<(imm), [{
+ uint8_t offset = Subtarget->getIORegisterOffset();
+ uint64_t val = N->getZExtValue() - offset;
+ return val < 0x20;
+ }],
+ ioaddr_XFORM>;
+
+def ioaddr16 : PatLeaf<(imm), [{
+ uint8_t offset = Subtarget->getIORegisterOffset();
+ uint64_t val = N->getZExtValue() - offset;
+ return val < 0x3f;
+ }],
+ ioaddr_XFORM>;
+
+def iobitpos8
+ : PatLeaf<(imm), [{ return isPowerOf2_32(uint8_t(N->getZExtValue())); }],
+ iobitpos8_XFORM>;
+
+def iobitposn8
+ : PatLeaf<(imm), [{ return isPowerOf2_32(uint8_t(~N->getZExtValue())); }],
+ iobitposn8_XFORM>;
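
The ioaddr and iobitpos transforms above all perform small arithmetic on the matched immediate. A compact standalone illustration follows; the 0x20 offset is an assumption for the example (the usual classic-AVR I/O-to-data-space mapping), whereas the real patterns ask Subtarget->getIORegisterOffset().

  #include <cstdint>

  // Sketch only: mirrors what ioaddr_XFORM and iobitpos8_XFORM compute.
  constexpr uint8_t kIOOffset = 0x20;           // assumption for the example

  constexpr uint8_t ioPort(uint8_t dataSpaceAddr) {
    return uint8_t(dataSpaceAddr - kIOOffset);  // e.g. 0x25 -> port 0x05
  }

  constexpr uint8_t bitIndex(uint8_t mask) {    // Log2_32 on a power-of-two mask
    uint8_t b = 0;
    while ((mask >>= 1) != 0)
      ++b;
    return b;                                   // e.g. 0x80 -> bit 7
  }
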
def MemriAsmOperand : AsmOperandClass {
let Name = "Memri";
@@ -170,8 +169,7 @@ def MemriAsmOperand : AsmOperandClass {
}
/// Address operand for `reg+imm` used by STD and LDD.
-def memri : Operand<iPTR>
-{
+def memri : Operand<iPTR> {
let MIOperandInfo = (ops PTRDISPREGS, i16imm);
let PrintMethod = "printMemri";
@@ -181,60 +179,47 @@ def memri : Operand<iPTR>
}
// Address operand for `SP+imm` used by STD{W}SPQRr
-def memspi : Operand<iPTR>
-{
- let MIOperandInfo = (ops GPRSP, i16imm);
-}
+def memspi : Operand<iPTR> { let MIOperandInfo = (ops GPRSP, i16imm); }
-def relbrtarget_7 : Operand<OtherVT>
-{
- let PrintMethod = "printPCRelImm";
- let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_7_pcrel>";
+def relbrtarget_7 : Operand<OtherVT> {
+ let PrintMethod = "printPCRelImm";
+ let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_7_pcrel>";
}
-def brtarget_13 : Operand<OtherVT>
-{
- let PrintMethod = "printPCRelImm";
- let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_13_pcrel>";
+def brtarget_13 : Operand<OtherVT> {
+ let PrintMethod = "printPCRelImm";
+ let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_13_pcrel>";
}
// The target of a 22 or 16-bit call/jmp instruction.
-def call_target : Operand<iPTR>
-{
- let EncoderMethod = "encodeCallTarget";
- let DecoderMethod = "decodeCallTarget";
+def call_target : Operand<iPTR> {
+ let EncoderMethod = "encodeCallTarget";
+ let DecoderMethod = "decodeCallTarget";
}
// A 16-bit address (which can lead to an R_AVR_16 relocation).
-def imm16 : Operand<i16>
-{
- let EncoderMethod = "encodeImm<AVR::fixup_16, 2>";
-}
+def imm16 : Operand<i16> { let EncoderMethod = "encodeImm<AVR::fixup_16, 2>"; }
/// A 6-bit immediate used in the ADIW/SBIW instructions.
-def imm_arith6 : Operand<i16>
-{
- let EncoderMethod = "encodeImm<AVR::fixup_6_adiw, 0>";
+def imm_arith6 : Operand<i16> {
+ let EncoderMethod = "encodeImm<AVR::fixup_6_adiw, 0>";
}
/// An 8-bit immediate inside an instruction with the same format
/// as the `LDI` instruction (the `FRdK` format).
-def imm_ldi8 : Operand<i8>
-{
- let EncoderMethod = "encodeImm<AVR::fixup_ldi, 0>";
+def imm_ldi8 : Operand<i8> {
+ let EncoderMethod = "encodeImm<AVR::fixup_ldi, 0>";
}
/// A 5-bit port number used in SBIC and friends (the `FIOBIT` format).
-def imm_port5 : Operand<i8>
-{
- let EncoderMethod = "encodeImm<AVR::fixup_port5, 0>";
+def imm_port5 : Operand<i8> {
+ let EncoderMethod = "encodeImm<AVR::fixup_port5, 0>";
}
/// A 6-bit port number used in the `IN` instruction and friends (the
/// `FIORdA` format.
-def imm_port6 : Operand<i8>
-{
- let EncoderMethod = "encodeImm<AVR::fixup_port6, 0>";
+def imm_port6 : Operand<i8> {
+ let EncoderMethod = "encodeImm<AVR::fixup_port6, 0>";
}
// Addressing mode pattern reg+imm6
@@ -243,91 +228,85 @@ def addr : ComplexPattern<iPTR, 2, "SelectAddr", [], [SDNPWantRoot]>;
// AsmOperand class for a pointer register.
// Used with the LD/ST family of instructions.
// See FSTLD in AVRInstrFormats.td
-def PtrRegAsmOperand : AsmOperandClass
-{
- let Name = "Reg";
-}
+def PtrRegAsmOperand : AsmOperandClass { let Name = "Reg"; }
// A special operand type for the LD/ST instructions.
// It converts the pointer register number into a two-bit field used in the
// instruction.
-def LDSTPtrReg : Operand<i16>
-{
- let MIOperandInfo = (ops PTRREGS);
- let EncoderMethod = "encodeLDSTPtrReg";
+def LDSTPtrReg : Operand<i16> {
+ let MIOperandInfo = (ops PTRREGS);
+ let EncoderMethod = "encodeLDSTPtrReg";
- let ParserMatchClass = PtrRegAsmOperand;
+ let ParserMatchClass = PtrRegAsmOperand;
}
// A special operand type for the LDD/STD instructions.
// It behaves identically to the LD/ST version, except restricts
// the pointer registers to Y and Z.
-def LDDSTDPtrReg : Operand<i16>
-{
- let MIOperandInfo = (ops PTRDISPREGS);
- let EncoderMethod = "encodeLDSTPtrReg";
+def LDDSTDPtrReg : Operand<i16> {
+ let MIOperandInfo = (ops PTRDISPREGS);
+ let EncoderMethod = "encodeLDSTPtrReg";
- let ParserMatchClass = PtrRegAsmOperand;
+ let ParserMatchClass = PtrRegAsmOperand;
}
//===----------------------------------------------------------------------===//
// AVR predicates for subtarget features
//===----------------------------------------------------------------------===//
-def HasSRAM : Predicate<"Subtarget->hasSRAM()">,
- AssemblerPredicate<(all_of FeatureSRAM)>;
+def HasSRAM : Predicate<"Subtarget->hasSRAM()">,
+ AssemblerPredicate<(all_of FeatureSRAM)>;
-def HasJMPCALL : Predicate<"Subtarget->hasJMPCALL()">,
- AssemblerPredicate<(all_of FeatureJMPCALL)>;
+def HasJMPCALL : Predicate<"Subtarget->hasJMPCALL()">,
+ AssemblerPredicate<(all_of FeatureJMPCALL)>;
-def HasIJMPCALL : Predicate<"Subtarget->hasIJMPCALL()">,
- AssemblerPredicate<(all_of FeatureIJMPCALL)>;
+def HasIJMPCALL : Predicate<"Subtarget->hasIJMPCALL()">,
+ AssemblerPredicate<(all_of FeatureIJMPCALL)>;
-def HasEIJMPCALL : Predicate<"Subtarget->hasEIJMPCALL()">,
- AssemblerPredicate<(all_of FeatureEIJMPCALL)>;
+def HasEIJMPCALL : Predicate<"Subtarget->hasEIJMPCALL()">,
+ AssemblerPredicate<(all_of FeatureEIJMPCALL)>;
-def HasADDSUBIW : Predicate<"Subtarget->hasADDSUBIW()">,
- AssemblerPredicate<(all_of FeatureADDSUBIW)>;
+def HasADDSUBIW : Predicate<"Subtarget->hasADDSUBIW()">,
+ AssemblerPredicate<(all_of FeatureADDSUBIW)>;
-def HasSmallStack : Predicate<"Subtarget->HasSmallStack()">,
- AssemblerPredicate<(all_of FeatureSmallStack)>;
+def HasSmallStack : Predicate<"Subtarget->HasSmallStack()">,
+ AssemblerPredicate<(all_of FeatureSmallStack)>;
-def HasMOVW : Predicate<"Subtarget->hasMOVW()">,
- AssemblerPredicate<(all_of FeatureMOVW)>;
+def HasMOVW : Predicate<"Subtarget->hasMOVW()">,
+ AssemblerPredicate<(all_of FeatureMOVW)>;
-def HasLPM : Predicate<"Subtarget->hasLPM()">,
- AssemblerPredicate<(all_of FeatureLPM)>;
+def HasLPM : Predicate<"Subtarget->hasLPM()">,
+ AssemblerPredicate<(all_of FeatureLPM)>;
-def HasLPMX : Predicate<"Subtarget->hasLPMX()">,
- AssemblerPredicate<(all_of FeatureLPMX)>;
+def HasLPMX : Predicate<"Subtarget->hasLPMX()">,
+ AssemblerPredicate<(all_of FeatureLPMX)>;
-def HasELPM : Predicate<"Subtarget->hasELPM()">,
- AssemblerPredicate<(all_of FeatureELPM)>;
+def HasELPM : Predicate<"Subtarget->hasELPM()">,
+ AssemblerPredicate<(all_of FeatureELPM)>;
-def HasELPMX : Predicate<"Subtarget->hasELPMX()">,
- AssemblerPredicate<(all_of FeatureELPMX)>;
+def HasELPMX : Predicate<"Subtarget->hasELPMX()">,
+ AssemblerPredicate<(all_of FeatureELPMX)>;
-def HasSPM : Predicate<"Subtarget->hasSPM()">,
- AssemblerPredicate<(all_of FeatureSPM)>;
+def HasSPM : Predicate<"Subtarget->hasSPM()">,
+ AssemblerPredicate<(all_of FeatureSPM)>;
-def HasSPMX : Predicate<"Subtarget->hasSPMX()">,
- AssemblerPredicate<(all_of FeatureSPMX)>;
+def HasSPMX : Predicate<"Subtarget->hasSPMX()">,
+ AssemblerPredicate<(all_of FeatureSPMX)>;
-def HasDES : Predicate<"Subtarget->hasDES()">,
- AssemblerPredicate<(all_of FeatureDES)>;
+def HasDES : Predicate<"Subtarget->hasDES()">,
+ AssemblerPredicate<(all_of FeatureDES)>;
-def SupportsRMW : Predicate<"Subtarget->supportsRMW()">,
- AssemblerPredicate<(all_of FeatureRMW)>;
+def SupportsRMW : Predicate<"Subtarget->supportsRMW()">,
+ AssemblerPredicate<(all_of FeatureRMW)>;
def SupportsMultiplication : Predicate<"Subtarget->supportsMultiplication()">,
- AssemblerPredicate<(all_of FeatureMultiplication)>;
+ AssemblerPredicate<(all_of FeatureMultiplication)>;
-def HasBREAK : Predicate<"Subtarget->hasBREAK()">,
- AssemblerPredicate<(all_of FeatureBREAK)>;
+def HasBREAK : Predicate<"Subtarget->hasBREAK()">,
+ AssemblerPredicate<(all_of FeatureBREAK)>;
def HasTinyEncoding : Predicate<"Subtarget->hasTinyEncoding()">,
- AssemblerPredicate<(all_of FeatureTinyEncoding)>;
-
+ AssemblerPredicate<(all_of FeatureTinyEncoding)>;
// AVR specific condition code. These correspond to AVR_*_COND in
// AVRInstrInfo.td. They must be kept in synch.
@@ -340,7 +319,6 @@ def AVR_COND_LO : PatLeaf<(i8 5)>;
def AVR_COND_MI : PatLeaf<(i8 6)>;
def AVR_COND_PL : PatLeaf<(i8 7)>;
-
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// AVR Instruction list
@@ -352,43 +330,49 @@ def AVR_COND_PL : PatLeaf<(i8 7)>;
// pointer before prolog-epilog rewriting occurs.
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber SREG.
-let Defs = [SP, SREG],
-Uses = [SP] in
-{
+let Defs = [SP, SREG], Uses = [SP] in {
def ADJCALLSTACKDOWN : Pseudo<(outs),
- (ins i16imm:$amt, i16imm:$amt2),
- "#ADJCALLSTACKDOWN",
- [(AVRcallseq_start timm:$amt, timm:$amt2)]>;
+ (ins i16imm
+ : $amt, i16imm
+ : $amt2),
+ "#ADJCALLSTACKDOWN", [(AVRcallseq_start timm
+ : $amt, timm
+ : $amt2)]>;
// R31R30 is used to update SP. It is normally free because it is a
// call-clobbered register but it is necessary to set it as a def as the
// register allocator might use it in rare cases (for rematerialization, it
// seems). hasSideEffects needs to be set to true so this instruction isn't
// considered dead.
- let Defs = [R31R30],
- hasSideEffects=1 in
- def ADJCALLSTACKUP : Pseudo<(outs),
- (ins i16imm:$amt1, i16imm:$amt2),
- "#ADJCALLSTACKUP",
- [(AVRcallseq_end timm:$amt1, timm:$amt2)]>;
+ let Defs = [R31R30], hasSideEffects = 1 in def ADJCALLSTACKUP
+ : Pseudo<(outs),
+ (ins i16imm
+ : $amt1, i16imm
+ : $amt2),
+ "#ADJCALLSTACKUP", [(AVRcallseq_end timm
+ : $amt1, timm
+ : $amt2)]>;
}
//===----------------------------------------------------------------------===//
// Addition
//===----------------------------------------------------------------------===//
-let isCommutable = 1,
-Constraints = "$src = $rd",
-Defs = [SREG] in
-{
+let isCommutable = 1, Constraints = "$src = $rd", Defs = [SREG] in {
// ADD Rd, Rr
// Adds two 8-bit registers.
- def ADDRdRr : FRdRr<0b0000,
- 0b11,
- (outs GPR8:$rd),
- (ins GPR8:$src, GPR8:$rr),
- "add\t$rd, $rr",
- [(set i8:$rd, (add i8:$src, i8:$rr)),
- (implicit SREG)]>;
+ def ADDRdRr
+ : FRdRr<0b0000, 0b11,
+ (outs GPR8
+ : $rd),
+ (ins GPR8
+ : $src, GPR8
+ : $rr),
+ "add\t$rd, $rr",
+ [(set i8
+ : $rd, (add i8
+ : $src, i8
+ : $rr)),
+ (implicit SREG)]>;
// ADDW Rd+1:Rd, Rr+1:Rr
// Pseudo instruction to add four 8-bit registers as two 16-bit values.
@@ -396,22 +380,34 @@ Defs = [SREG] in
// Expands to:
// add Rd, Rr
// adc Rd+1, Rr+1
- def ADDWRdRr : Pseudo<(outs DREGS:$rd),
- (ins DREGS:$src, DREGS:$rr),
- "addw\t$rd, $rr",
- [(set i16:$rd, (add i16:$src, i16:$rr)),
- (implicit SREG)]>;
+ def ADDWRdRr
+ : Pseudo<(outs DREGS
+ : $rd),
+ (ins DREGS
+ : $src, DREGS
+ : $rr),
+ "addw\t$rd, $rr",
+ [(set i16
+ : $rd, (add i16
+ : $src, i16
+ : $rr)),
+ (implicit SREG)]>;
// ADC Rd, Rr
// Adds two 8-bit registers with carry.
- let Uses = [SREG] in
- def ADCRdRr : FRdRr<0b0001,
- 0b11,
- (outs GPR8:$rd),
- (ins GPR8:$src, GPR8:$rr),
- "adc\t$rd, $rr",
- [(set i8:$rd, (adde i8:$src, i8:$rr)),
- (implicit SREG)]>;
+ let Uses = [SREG] in def ADCRdRr
+ : FRdRr<0b0001, 0b11,
+ (outs GPR8
+ : $rd),
+ (ins GPR8
+ : $src, GPR8
+ : $rr),
+ "adc\t$rd, $rr",
+ [(set i8
+ : $rd, (adde i8
+ : $src, i8
+ : $rr)),
+ (implicit SREG)]>;
// ADCW Rd+1:Rd, Rr+1:Rr
// Pseudo instruction to add four 8-bit registers as two 16-bit values with
@@ -420,39 +416,56 @@ Defs = [SREG] in
// Expands to:
// adc Rd, Rr
// adc Rd+1, Rr+1
- let Uses = [SREG] in
- def ADCWRdRr : Pseudo<(outs DREGS:$rd),
- (ins DREGS:$src, DREGS:$rr),
- "adcw\t$rd, $rr",
- [(set i16:$rd, (adde i16:$src, i16:$rr)),
- (implicit SREG)]>;
+ let Uses = [SREG] in def ADCWRdRr : Pseudo<(outs DREGS
+ : $rd),
+ (ins DREGS
+ : $src, DREGS
+ : $rr),
+ "adcw\t$rd, $rr", [
+ (set i16
+ : $rd, (adde i16
+ : $src, i16
+ : $rr)),
+ (implicit SREG)
+ ]>;
// AIDW Rd, k
// Adds an immediate 6-bit value K to Rd, placing the result in Rd.
- def ADIWRdK : FWRdK<0b0,
- (outs IWREGS:$rd),
- (ins IWREGS:$src, imm_arith6:$k),
- "adiw\t$rd, $k",
- [(set i16:$rd, (add i16:$src, uimm6:$k)),
- (implicit SREG)]>,
- Requires<[HasADDSUBIW]>;
+ def ADIWRdK
+ : FWRdK<0b0,
+ (outs IWREGS
+ : $rd),
+ (ins IWREGS
+ : $src, imm_arith6
+ : $k),
+ "adiw\t$rd, $k",
+ [(set i16
+ : $rd, (add i16
+ : $src, uimm6
+ : $k)),
+ (implicit SREG)]>,
+ Requires<[HasADDSUBIW]>;
}
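
The ADDW/ADCW pseudos above lean on the carry chain of the 8-bit adder. A minimal illustration of why the "add Rd, Rr ; adc Rd+1, Rr+1" expansion computes a 16-bit sum (plain C++, nothing AVR-specific; the function name is made up for the example):

  #include <cstdint>

  // Sketch only: models the two-instruction expansion of the ADDW pseudo,
  // operating directly on the two 8-bit halves.
  uint16_t addw(uint8_t dLo, uint8_t dHi, uint8_t rLo, uint8_t rHi) {
    unsigned lo = unsigned(dLo) + rLo;          // add  Rd,   Rr
    unsigned carry = lo >> 8;                   // carry flag after the add
    unsigned hi = unsigned(dHi) + rHi + carry;  // adc  Rd+1, Rr+1
    return uint16_t(((hi & 0xFF) << 8) | (lo & 0xFF));
  }
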
//===----------------------------------------------------------------------===//
// Subtraction
//===----------------------------------------------------------------------===//
-let Constraints = "$src = $rd",
-Defs = [SREG] in
-{
+let Constraints = "$src = $rd", Defs = [SREG] in {
// SUB Rd, Rr
// Subtracts the 8-bit value of Rr from Rd and places the value in Rd.
- def SUBRdRr : FRdRr<0b0001,
- 0b10,
- (outs GPR8:$rd),
- (ins GPR8:$src, GPR8:$rr),
- "sub\t$rd, $rr",
- [(set i8:$rd, (sub i8:$src, i8:$rr)),
- (implicit SREG)]>;
+ def SUBRdRr
+ : FRdRr<0b0001, 0b10,
+ (outs GPR8
+ : $rd),
+ (ins GPR8
+ : $src, GPR8
+ : $rr),
+ "sub\t$rd, $rr",
+ [(set i8
+ : $rd, (sub i8
+ : $src, i8
+ : $rr)),
+ (implicit SREG)]>;
// SUBW Rd+1:Rd, Rr+1:Rr
// Subtracts two 16-bit values and places the result into Rd.
@@ -460,295 +473,429 @@ Defs = [SREG] in
// Expands to:
// sub Rd, Rr
// sbc Rd+1, Rr+1
- def SUBWRdRr : Pseudo<(outs DREGS:$rd),
- (ins DREGS:$src, DREGS:$rr),
- "subw\t$rd, $rr",
- [(set i16:$rd, (sub i16:$src, i16:$rr)),
- (implicit SREG)]>;
-
- def SUBIRdK : FRdK<0b0101,
- (outs LD8:$rd),
- (ins LD8:$src, imm_ldi8:$k),
- "subi\t$rd, $k",
- [(set i8:$rd, (sub i8:$src, imm:$k)),
- (implicit SREG)]>;
+ def SUBWRdRr
+ : Pseudo<(outs DREGS
+ : $rd),
+ (ins DREGS
+ : $src, DREGS
+ : $rr),
+ "subw\t$rd, $rr",
+ [(set i16
+ : $rd, (sub i16
+ : $src, i16
+ : $rr)),
+ (implicit SREG)]>;
+
+ def SUBIRdK
+ : FRdK<0b0101,
+ (outs LD8
+ : $rd),
+ (ins LD8
+ : $src, imm_ldi8
+ : $k),
+ "subi\t$rd, $k",
+ [(set i8
+ : $rd, (sub i8
+ : $src, imm
+ : $k)),
+ (implicit SREG)]>;
// SUBIW Rd+1:Rd, K+1:K
//
// Expands to:
// subi Rd, K
// sbci Rd+1, K+1
- def SUBIWRdK : Pseudo<(outs DLDREGS:$rd),
- (ins DLDREGS:$src, i16imm:$rr),
- "subiw\t$rd, $rr",
- [(set i16:$rd, (sub i16:$src, imm:$rr)),
- (implicit SREG)]>;
-
- def SBIWRdK : FWRdK<0b1,
- (outs IWREGS:$rd),
- (ins IWREGS:$src, imm_arith6:$k),
- "sbiw\t$rd, $k",
- [(set i16:$rd, (sub i16:$src, uimm6:$k)),
- (implicit SREG)]>,
- Requires<[HasADDSUBIW]>;
+ def SUBIWRdK
+ : Pseudo<(outs DLDREGS
+ : $rd),
+ (ins DLDREGS
+ : $src, i16imm
+ : $rr),
+ "subiw\t$rd, $rr",
+ [(set i16
+ : $rd, (sub i16
+ : $src, imm
+ : $rr)),
+ (implicit SREG)]>;
+
+ def SBIWRdK
+ : FWRdK<0b1,
+ (outs IWREGS
+ : $rd),
+ (ins IWREGS
+ : $src, imm_arith6
+ : $k),
+ "sbiw\t$rd, $k",
+ [(set i16
+ : $rd, (sub i16
+ : $src, uimm6
+ : $k)),
+ (implicit SREG)]>,
+ Requires<[HasADDSUBIW]>;
// Subtract with carry operations which must read the carry flag in SREG.
- let Uses = [SREG] in
- {
- def SBCRdRr : FRdRr<0b0000,
- 0b10,
- (outs GPR8:$rd),
- (ins GPR8:$src, GPR8:$rr),
- "sbc\t$rd, $rr",
- [(set i8:$rd, (sube i8:$src, i8:$rr)),
- (implicit SREG)]>;
+ let Uses = [SREG] in {
+ def SBCRdRr
+ : FRdRr<0b0000, 0b10,
+ (outs GPR8
+ : $rd),
+ (ins GPR8
+ : $src, GPR8
+ : $rr),
+ "sbc\t$rd, $rr",
+ [(set i8
+ : $rd, (sube i8
+ : $src, i8
+ : $rr)),
+ (implicit SREG)]>;
// SBCW Rd+1:Rd, Rr+1:Rr
//
// Expands to:
// sbc Rd, Rr
// sbc Rd+1, Rr+1
- def SBCWRdRr : Pseudo<(outs DREGS:$rd),
- (ins DREGS:$src, DREGS:$rr),
- "sbcw\t$rd, $rr",
- [(set i16:$rd, (sube i16:$src, i16:$rr)),
- (implicit SREG)]>;
-
- def SBCIRdK : FRdK<0b0100,
- (outs LD8:$rd),
- (ins LD8:$src, imm_ldi8:$k),
- "sbci\t$rd, $k",
- [(set i8:$rd, (sube i8:$src, imm:$k)),
- (implicit SREG)]>;
+ def SBCWRdRr : Pseudo<(outs DREGS
+ : $rd),
+ (ins DREGS
+ : $src, DREGS
+ : $rr),
+ "sbcw\t$rd, $rr", [
+ (set i16
+ : $rd, (sube i16
+ : $src, i16
+ : $rr)),
+ (implicit SREG)
+ ]>;
+
+ def SBCIRdK
+ : FRdK<0b0100,
+ (outs LD8
+ : $rd),
+ (ins LD8
+ : $src, imm_ldi8
+ : $k),
+ "sbci\t$rd, $k",
+ [(set i8
+ : $rd, (sube i8
+ : $src, imm
+ : $k)),
+ (implicit SREG)]>;
// SBCIW Rd+1:Rd, K+1:K
// sbci Rd, K
// sbci Rd+1, K+1
- def SBCIWRdK : Pseudo<(outs DLDREGS:$rd),
- (ins DLDREGS:$src, i16imm:$rr),
- "sbciw\t$rd, $rr",
- [(set i16:$rd, (sube i16:$src, imm:$rr)),
- (implicit SREG)]>;
+ def SBCIWRdK : Pseudo<(outs DLDREGS
+ : $rd),
+ (ins DLDREGS
+ : $src, i16imm
+ : $rr),
+ "sbciw\t$rd, $rr", [
+ (set i16
+ : $rd, (sube i16
+ : $src, imm
+ : $rr)),
+ (implicit SREG)
+ ]>;
}
}
//===----------------------------------------------------------------------===//
// Increment and Decrement
//===----------------------------------------------------------------------===//
-let Constraints = "$src = $rd",
-Defs = [SREG] in
-{
- def INCRd : FRd<0b1001,
- 0b0100011,
- (outs GPR8:$rd),
- (ins GPR8:$src),
- "inc\t$rd",
- [(set i8:$rd, (add i8:$src, 1)), (implicit SREG)]>;
-
- def DECRd : FRd<0b1001,
- 0b0101010,
- (outs GPR8:$rd),
- (ins GPR8:$src),
- "dec\t$rd",
- [(set i8:$rd, (add i8:$src, -1)), (implicit SREG)]>;
+let Constraints = "$src = $rd", Defs = [SREG] in {
+ def INCRd
+ : FRd<0b1001, 0b0100011,
+ (outs GPR8
+ : $rd),
+ (ins GPR8
+ : $src),
+ "inc\t$rd", [(set i8
+ : $rd, (add i8
+ : $src, 1)),
+ (implicit SREG)]>;
+
+ def DECRd
+ : FRd<0b1001, 0b0101010,
+ (outs GPR8
+ : $rd),
+ (ins GPR8
+ : $src),
+ "dec\t$rd", [(set i8
+ : $rd, (add i8
+ : $src, -1)),
+ (implicit SREG)]>;
}
//===----------------------------------------------------------------------===//
// Multiplication
//===----------------------------------------------------------------------===//
-let isCommutable = 1,
-Defs = [R1, R0, SREG] in
-{
+let isCommutable = 1, Defs = [R1, R0, SREG] in {
// MUL Rd, Rr
// Multiplies Rd by Rr and places the result into R1:R0.
let usesCustomInserter = 1 in {
- def MULRdRr : FRdRr<0b1001, 0b11,
- (outs),
- (ins GPR8:$lhs, GPR8:$rhs),
+ def MULRdRr : FRdRr<0b1001, 0b11, (outs),
+ (ins GPR8
+ : $lhs, GPR8
+ : $rhs),
"mul\t$lhs, $rhs",
[/*(set R1, R0, (smullohi i8:$lhs, i8:$rhs))*/]>,
- Requires<[SupportsMultiplication]>;
+ Requires<[SupportsMultiplication]>;
- def MULSRdRr : FMUL2RdRr<0,
- (outs),
- (ins LD8:$lhs, LD8:$rhs),
- "muls\t$lhs, $rhs",
- []>,
+ def MULSRdRr : FMUL2RdRr<0, (outs),
+ (ins LD8
+ : $lhs, LD8
+ : $rhs),
+ "muls\t$lhs, $rhs", []>,
Requires<[SupportsMultiplication]>;
}
- def MULSURdRr : FMUL2RdRr<1,
- (outs),
- (ins LD8lo:$lhs, LD8lo:$rhs),
- "mulsu\t$lhs, $rhs",
- []>,
+ def MULSURdRr : FMUL2RdRr<1, (outs),
+ (ins LD8lo
+ : $lhs, LD8lo
+ : $rhs),
+ "mulsu\t$lhs, $rhs", []>,
Requires<[SupportsMultiplication]>;
- def FMUL : FFMULRdRr<0b01,
- (outs),
- (ins LD8lo:$lhs, LD8lo:$rhs),
- "fmul\t$lhs, $rhs",
- []>,
+ def FMUL : FFMULRdRr<0b01, (outs),
+ (ins LD8lo
+ : $lhs, LD8lo
+ : $rhs),
+ "fmul\t$lhs, $rhs", []>,
Requires<[SupportsMultiplication]>;
- def FMULS : FFMULRdRr<0b10,
- (outs),
- (ins LD8lo:$lhs, LD8lo:$rhs),
- "fmuls\t$lhs, $rhs",
- []>,
+ def FMULS : FFMULRdRr<0b10, (outs),
+ (ins LD8lo
+ : $lhs, LD8lo
+ : $rhs),
+ "fmuls\t$lhs, $rhs", []>,
Requires<[SupportsMultiplication]>;
- def FMULSU : FFMULRdRr<0b11,
- (outs),
- (ins LD8lo:$lhs, LD8lo:$rhs),
- "fmulsu\t$lhs, $rhs",
- []>,
+ def FMULSU : FFMULRdRr<0b11, (outs),
+ (ins LD8lo
+ : $lhs, LD8lo
+ : $rhs),
+ "fmulsu\t$lhs, $rhs", []>,
Requires<[SupportsMultiplication]>;
}
-let Defs = [R15, R14, R13, R12, R11, R10, R9,
- R8, R7, R6, R5, R4, R3, R2, R1, R0] in
-def DESK : FDES<(outs),
- (ins i8imm:$k),
- "des\t$k",
- []>,
- Requires<[HasDES]>;
+let Defs =
+ [R15, R14, R13, R12, R11, R10, R9, R8, R7, R6, R5, R4, R3, R2, R1,
+ R0] in def DESK : FDES<(outs),
+ (ins i8imm
+ : $k),
+ "des\t$k", []>,
+ Requires<[HasDES]>;
//===----------------------------------------------------------------------===//
// Logic
//===----------------------------------------------------------------------===//
-let Constraints = "$src = $rd",
-Defs = [SREG] in
-{
+let Constraints = "$src = $rd", Defs = [SREG] in {
// Register-Register logic instructions (which have the
// property of commutativity).
- let isCommutable = 1 in
- {
- def ANDRdRr : FRdRr<0b0010,
- 0b00,
- (outs GPR8:$rd),
- (ins GPR8:$src, GPR8:$rr),
- "and\t$rd, $rr",
- [(set i8:$rd, (and i8:$src, i8:$rr)),
- (implicit SREG)]>;
+ let isCommutable = 1 in {
+ def ANDRdRr
+ : FRdRr<0b0010, 0b00,
+ (outs GPR8
+ : $rd),
+ (ins GPR8
+ : $src, GPR8
+ : $rr),
+ "and\t$rd, $rr",
+ [(set i8
+ : $rd, (and i8
+ : $src, i8
+ : $rr)),
+ (implicit SREG)]>;
// ANDW Rd+1:Rd, Rr+1:Rr
//
// Expands to:
// and Rd, Rr
// and Rd+1, Rr+1
- def ANDWRdRr : Pseudo<(outs DREGS:$rd),
- (ins DREGS:$src, DREGS:$rr),
- "andw\t$rd, $rr",
- [(set i16:$rd, (and i16:$src, i16:$rr)),
- (implicit SREG)]>;
-
- def ORRdRr : FRdRr<0b0010,
- 0b10,
- (outs GPR8:$rd),
- (ins GPR8:$src, GPR8:$rr),
- "or\t$rd, $rr",
- [(set i8:$rd, (or i8:$src, i8:$rr)),
- (implicit SREG)]>;
+ def ANDWRdRr : Pseudo<(outs DREGS
+ : $rd),
+ (ins DREGS
+ : $src, DREGS
+ : $rr),
+ "andw\t$rd, $rr", [
+ (set i16
+ : $rd, (and i16
+ : $src, i16
+ : $rr)),
+ (implicit SREG)
+ ]>;
+
+ def ORRdRr
+ : FRdRr<0b0010, 0b10,
+ (outs GPR8
+ : $rd),
+ (ins GPR8
+ : $src, GPR8
+ : $rr),
+ "or\t$rd, $rr",
+ [(set i8
+ : $rd, (or i8
+ : $src, i8
+ : $rr)),
+ (implicit SREG)]>;
// ORW Rd+1:Rd, Rr+1:Rr
//
// Expands to:
// or Rd, Rr
// or Rd+1, Rr+1
- def ORWRdRr : Pseudo<(outs DREGS:$rd),
- (ins DREGS:$src, DREGS:$rr),
- "orw\t$rd, $rr",
- [(set i16:$rd, (or i16:$src, i16:$rr)),
- (implicit SREG)]>;
-
- def EORRdRr : FRdRr<0b0010,
- 0b01,
- (outs GPR8:$rd),
- (ins GPR8:$src, GPR8:$rr),
- "eor\t$rd, $rr",
- [(set i8:$rd, (xor i8:$src, i8:$rr)),
- (implicit SREG)]>;
+ def ORWRdRr : Pseudo<(outs DREGS
+ : $rd),
+ (ins DREGS
+ : $src, DREGS
+ : $rr),
+ "orw\t$rd, $rr", [
+ (set i16
+ : $rd, (or i16
+ : $src, i16
+ : $rr)),
+ (implicit SREG)
+ ]>;
+
+ def EORRdRr
+ : FRdRr<0b0010, 0b01,
+ (outs GPR8
+ : $rd),
+ (ins GPR8
+ : $src, GPR8
+ : $rr),
+ "eor\t$rd, $rr",
+ [(set i8
+ : $rd, (xor i8
+ : $src, i8
+ : $rr)),
+ (implicit SREG)]>;
// EORW Rd+1:Rd, Rr+1:Rr
//
// Expands to:
// eor Rd, Rr
// eor Rd+1, Rr+1
- def EORWRdRr : Pseudo<(outs DREGS:$rd),
- (ins DREGS:$src, DREGS:$rr),
- "eorw\t$rd, $rr",
- [(set i16:$rd, (xor i16:$src, i16:$rr)),
- (implicit SREG)]>;
+ def EORWRdRr : Pseudo<(outs DREGS
+ : $rd),
+ (ins DREGS
+ : $src, DREGS
+ : $rr),
+ "eorw\t$rd, $rr", [
+ (set i16
+ : $rd, (xor i16
+ : $src, i16
+ : $rr)),
+ (implicit SREG)
+ ]>;
}
- def ANDIRdK : FRdK<0b0111,
- (outs LD8:$rd),
- (ins LD8:$src, imm_ldi8:$k),
- "andi\t$rd, $k",
- [(set i8:$rd, (and i8:$src, imm:$k)),
- (implicit SREG)]>;
+ def ANDIRdK
+ : FRdK<0b0111,
+ (outs LD8
+ : $rd),
+ (ins LD8
+ : $src, imm_ldi8
+ : $k),
+ "andi\t$rd, $k",
+ [(set i8
+ : $rd, (and i8
+ : $src, imm
+ : $k)),
+ (implicit SREG)]>;
// ANDI Rd+1:Rd, K+1:K
//
// Expands to:
// andi Rd, K
// andi Rd+1, K+1
- def ANDIWRdK : Pseudo<(outs DLDREGS:$rd),
- (ins DLDREGS:$src, i16imm:$k),
- "andiw\t$rd, $k",
- [(set i16:$rd, (and i16:$src, imm:$k)),
- (implicit SREG)]>;
-
- def ORIRdK : FRdK<0b0110,
- (outs LD8:$rd),
- (ins LD8:$src, imm_ldi8:$k),
- "ori\t$rd, $k",
- [(set i8:$rd, (or i8:$src, imm:$k)),
- (implicit SREG)]>;
+ def ANDIWRdK
+ : Pseudo<(outs DLDREGS
+ : $rd),
+ (ins DLDREGS
+ : $src, i16imm
+ : $k),
+ "andiw\t$rd, $k",
+ [(set i16
+ : $rd, (and i16
+ : $src, imm
+ : $k)),
+ (implicit SREG)]>;
+
+ def ORIRdK
+ : FRdK<0b0110,
+ (outs LD8
+ : $rd),
+ (ins LD8
+ : $src, imm_ldi8
+ : $k),
+ "ori\t$rd, $k",
+ [(set i8
+ : $rd, (or i8
+ : $src, imm
+ : $k)),
+ (implicit SREG)]>;
// ORIW Rd+1:Rd, K+1,K
//
// Expands to:
// ori Rd, K
// ori Rd+1, K+1
- def ORIWRdK : Pseudo<(outs DLDREGS:$rd),
- (ins DLDREGS:$src, i16imm:$rr),
- "oriw\t$rd, $rr",
- [(set i16:$rd, (or i16:$src, imm:$rr)),
- (implicit SREG)]>;
+ def ORIWRdK
+ : Pseudo<(outs DLDREGS
+ : $rd),
+ (ins DLDREGS
+ : $src, i16imm
+ : $rr),
+ "oriw\t$rd, $rr",
+ [(set i16
+ : $rd, (or i16
+ : $src, imm
+ : $rr)),
+ (implicit SREG)]>;
}
//===----------------------------------------------------------------------===//
// One's/Two's Complement
//===----------------------------------------------------------------------===//
-let Constraints = "$src = $rd",
-Defs = [SREG] in
-{
- def COMRd : FRd<0b1001,
- 0b0100000,
- (outs GPR8:$rd),
- (ins GPR8:$src),
- "com\t$rd",
- [(set i8:$rd, (not i8:$src)), (implicit SREG)]>;
+let Constraints = "$src = $rd", Defs = [SREG] in {
+ def COMRd
+ : FRd<0b1001, 0b0100000,
+ (outs GPR8
+ : $rd),
+ (ins GPR8
+ : $src),
+ "com\t$rd", [(set i8
+ : $rd, (not i8
+ : $src)),
+ (implicit SREG)]>;
// COMW Rd+1:Rd
//
// Expands to:
// com Rd
// com Rd+1
- def COMWRd : Pseudo<(outs DREGS:$rd),
- (ins DREGS:$src),
+ def COMWRd : Pseudo<(outs DREGS
+ : $rd),
+ (ins DREGS
+ : $src),
"comw\t$rd",
- [(set i16:$rd, (not i16:$src)), (implicit SREG)]>;
+ [(set i16
+ : $rd, (not i16
+ : $src)),
+ (implicit SREG)]>;
- def NEGRd : FRd<0b1001,
- 0b0100001,
- (outs GPR8:$rd),
- (ins GPR8:$src),
- "neg\t$rd",
- [(set i8:$rd, (ineg i8:$src)), (implicit SREG)]>;
+ def NEGRd
+ : FRd<0b1001, 0b0100001,
+ (outs GPR8
+ : $rd),
+ (ins GPR8
+ : $src),
+ "neg\t$rd", [(set i8
+ : $rd, (ineg i8
+ : $src)),
+ (implicit SREG)]>;
// NEGW Rd+1:Rd
//
@@ -756,155 +903,126 @@ Defs = [SREG] in
// neg Rd+1
// neg Rd
// sbc Rd+1, r1
- def NEGWRd : Pseudo<(outs DREGS:$rd),
- (ins DREGS:$src),
+ def NEGWRd : Pseudo<(outs DREGS
+ : $rd),
+ (ins DREGS
+ : $src),
"negw\t$rd",
- [(set i16:$rd, (ineg i16:$src)), (implicit SREG)]>;
+ [(set i16
+ : $rd, (ineg i16
+ : $src)),
+ (implicit SREG)]>;
}
// TST Rd
// Test for zero of minus.
// This operation is identical to a `Rd AND Rd`.
-def : InstAlias<"tst\t$rd", (ANDRdRr GPR8:$rd, GPR8:$rd)>;
+def : InstAlias<"tst\t$rd", (ANDRdRr GPR8 : $rd, GPR8 : $rd)>;
// SBR Rd, K
//
// Mnemonic alias to 'ORI Rd, K'. Same bit pattern, same operands,
// same everything.
def : InstAlias<"sbr\t$rd, $k",
- (ORIRdK LD8:$rd, imm_ldi8:$k),
+ (ORIRdK LD8
+ : $rd, imm_ldi8
+ : $k),
/* Disable display, so we don't override ORI */ 0>;
//===----------------------------------------------------------------------===//
// Jump instructions
//===----------------------------------------------------------------------===//
-let isBarrier = 1,
-isBranch = 1,
-isTerminator = 1 in
-{
- def RJMPk : FBRk<0,
- (outs),
- (ins brtarget_13:$target),
- "rjmp\t$target",
- [(br bb:$target)]>;
+let isBarrier = 1, isBranch = 1, isTerminator = 1 in {
+ def RJMPk : FBRk<0, (outs),
+ (ins brtarget_13
+ : $target),
+ "rjmp\t$target", [(br bb
+ : $target)]>;
let isIndirectBranch = 1,
- Uses = [R31R30] in
- def IJMP : F16<0b1001010000001001,
- (outs),
- (ins),
- "ijmp",
- []>,
- Requires<[HasIJMPCALL]>;
+ Uses = [R31R30] in def IJMP
+ : F16<0b1001010000001001, (outs), (ins), "ijmp", []>,
+ Requires<[HasIJMPCALL]>;
let isIndirectBranch = 1,
- Uses = [R31R30] in
- def EIJMP : F16<0b1001010000011001,
- (outs),
- (ins),
- "eijmp",
- []>,
- Requires<[HasEIJMPCALL]>;
-
- def JMPk : F32BRk<0b110,
- (outs),
- (ins call_target:$k),
- "jmp\t$k",
- []>,
+ Uses = [R31R30] in def EIJMP
+ : F16<0b1001010000011001, (outs), (ins), "eijmp", []>,
+ Requires<[HasEIJMPCALL]>;
+
+ def JMPk : F32BRk<0b110, (outs),
+ (ins call_target
+ : $k),
+ "jmp\t$k", []>,
Requires<[HasJMPCALL]>;
}
//===----------------------------------------------------------------------===//
// Call instructions
//===----------------------------------------------------------------------===//
-let isCall = 1 in
-{
+let isCall = 1 in {
// SP is marked as a use to prevent stack-pointer assignments that appear
// immediately before calls from potentially appearing dead.
- let Uses = [SP] in
- def RCALLk : FBRk<1,
- (outs),
- (ins brtarget_13:$target),
- "rcall\t$target",
- []>;
+ let Uses = [SP] in def RCALLk : FBRk<1, (outs),
+ (ins brtarget_13
+ : $target),
+ "rcall\t$target", []>;
// SP is marked as a use to prevent stack-pointer assignments that appear
// immediately before calls from potentially appearing dead.
- let Uses = [SP, R31R30] in
- def ICALL : F16<0b1001010100001001,
- (outs),
- (ins variable_ops),
- "icall",
- []>,
- Requires<[HasIJMPCALL]>;
+ let Uses = [SP, R31R30] in def ICALL
+ : F16<0b1001010100001001, (outs), (ins variable_ops), "icall", []>,
+ Requires<[HasIJMPCALL]>;
// SP is marked as a use to prevent stack-pointer assignments that appear
// immediately before calls from potentially appearing dead.
- let Uses = [SP, R31R30] in
- def EICALL : F16<0b1001010100011001,
- (outs),
- (ins variable_ops),
- "eicall",
- []>,
- Requires<[HasEIJMPCALL]>;
+ let Uses = [SP, R31R30] in def EICALL
+ : F16<0b1001010100011001, (outs), (ins variable_ops), "eicall", []>,
+ Requires<[HasEIJMPCALL]>;
// SP is marked as a use to prevent stack-pointer assignments that appear
// immediately before calls from potentially appearing dead.
//
- //:TODO: the imm field can be either 16 or 22 bits in devices with more
+ //: TODO: the imm field can be either 16 or 22 bits in devices with more
// than 64k of ROM, fix it once we support the largest devices.
- let Uses = [SP] in
- def CALLk : F32BRk<0b111,
- (outs),
- (ins call_target:$k),
- "call\t$k",
- [(AVRcall imm:$k)]>,
- Requires<[HasJMPCALL]>;
+ let Uses = [SP] in def CALLk : F32BRk<0b111, (outs),
+ (ins call_target
+ : $k),
+ "call\t$k", [(AVRcall imm
+ : $k)]>,
+ Requires<[HasJMPCALL]>;
}
//===----------------------------------------------------------------------===//
// Return instructions.
//===----------------------------------------------------------------------===//
-let isTerminator = 1,
-isReturn = 1,
-isBarrier = 1 in
-{
- def RET : F16<0b1001010100001000,
- (outs),
- (ins),
- "ret",
- [(AVRretflag)]>;
-
- def RETI : F16<0b1001010100011000,
- (outs),
- (ins),
- "reti",
- [(AVRretiflag)]>;
+let isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+ def RET : F16<0b1001010100001000, (outs), (ins), "ret", [(AVRretflag)]>;
+
+ def RETI : F16<0b1001010100011000, (outs), (ins), "reti", [(AVRretiflag)]>;
}
//===----------------------------------------------------------------------===//
// Compare operations.
//===----------------------------------------------------------------------===//
-let Defs = [SREG] in
-{
+let Defs = [SREG] in {
// CPSE Rd, Rr
// Compare Rd and Rr, skipping the next instruction if they are equal.
- let isBarrier = 1,
- isBranch = 1,
- isTerminator = 1 in
- def CPSE : FRdRr<0b0001,
- 0b00,
- (outs),
- (ins GPR8:$rd, GPR8:$rr),
- "cpse\t$rd, $rr",
- []>;
-
- def CPRdRr : FRdRr<0b0001,
- 0b01,
- (outs),
- (ins GPR8:$rd, GPR8:$rr),
- "cp\t$rd, $rr",
- [(AVRcmp i8:$rd, i8:$rr), (implicit SREG)]>;
+ let isBarrier = 1, isBranch = 1,
+ isTerminator = 1 in def CPSE : FRdRr<0b0001, 0b00, (outs),
+ (ins GPR8
+ : $rd, GPR8
+ : $rr),
+ "cpse\t$rd, $rr", []>;
+
+ def CPRdRr
+ : FRdRr<0b0001, 0b01, (outs),
+ (ins GPR8
+ : $rd, GPR8
+ : $rr),
+ "cp\t$rd, $rr", [(AVRcmp i8
+ : $rd, i8
+ : $rr),
+ (implicit SREG)]>;
// CPW Rd+1:Rd, Rr+1:Rr
//
@@ -912,251 +1030,256 @@ let Defs = [SREG] in
// cp Rd, Rr
// cpc Rd+1, Rr+1
def CPWRdRr : Pseudo<(outs),
- (ins DREGS:$src, DREGS:$src2),
+ (ins DREGS
+ : $src, DREGS
+ : $src2),
"cpw\t$src, $src2",
- [(AVRcmp i16:$src, i16:$src2), (implicit SREG)]>;
+ [(AVRcmp i16
+ : $src, i16
+ : $src2),
+ (implicit SREG)]>;
- let Uses = [SREG] in
- def CPCRdRr : FRdRr<0b0000,
- 0b01,
- (outs),
- (ins GPR8:$rd, GPR8:$rr),
- "cpc\t$rd, $rr",
- [(AVRcmpc i8:$rd, i8:$rr), (implicit SREG)]>;
+ let Uses = [SREG] in def CPCRdRr
+ : FRdRr<0b0000, 0b01, (outs),
+ (ins GPR8
+ : $rd, GPR8
+ : $rr),
+ "cpc\t$rd, $rr", [(AVRcmpc i8
+ : $rd, i8
+ : $rr),
+ (implicit SREG)]>;
// CPCW Rd+1:Rd. Rr+1:Rr
//
// Expands to:
// cpc Rd, Rr
// cpc Rd+1, Rr+1
- let Uses = [SREG] in
- def CPCWRdRr : Pseudo<(outs),
- (ins DREGS:$src, DREGS:$src2),
- "cpcw\t$src, $src2",
- [(AVRcmpc i16:$src, i16:$src2), (implicit SREG)]>;
+ let Uses = [SREG] in def CPCWRdRr
+ : Pseudo<(outs),
+ (ins DREGS
+ : $src, DREGS
+ : $src2),
+ "cpcw\t$src, $src2",
+ [(AVRcmpc i16
+ : $src, i16
+ : $src2),
+ (implicit SREG)]>;
// CPI Rd, K
// Compares a register with an 8 bit immediate.
- def CPIRdK : FRdK<0b0011,
- (outs),
- (ins LD8:$rd, imm_ldi8:$k),
- "cpi\t$rd, $k",
- [(AVRcmp i8:$rd, imm:$k), (implicit SREG)]>;
+ def CPIRdK
+ : FRdK<0b0011, (outs),
+ (ins LD8
+ : $rd, imm_ldi8
+ : $k),
+ "cpi\t$rd, $k", [(AVRcmp i8
+ : $rd, imm
+ : $k),
+ (implicit SREG)]>;
}
//===----------------------------------------------------------------------===//
// Register conditional skipping/branching operations.
//===----------------------------------------------------------------------===//
-let isBranch = 1,
-isTerminator = 1 in
-{
+let isBranch = 1, isTerminator = 1 in {
// Conditional skipping on GPR register bits, and
// conditional skipping on IO register bits.
- let isBarrier = 1 in
- {
- def SBRCRrB : FRdB<0b10,
- (outs),
- (ins GPR8:$rr, i8imm:$b),
- "sbrc\t$rr, $b",
- []>;
-
- def SBRSRrB : FRdB<0b11,
- (outs),
- (ins GPR8:$rr, i8imm:$b),
- "sbrs\t$rr, $b",
- []>;
-
- def SBICAb : FIOBIT<0b01,
- (outs),
- (ins imm_port5:$a, i8imm:$b),
- "sbic\t$a, $b",
- []>;
-
- def SBISAb : FIOBIT<0b11,
- (outs),
- (ins imm_port5:$a, i8imm:$b),
- "sbis\t$a, $b",
- []>;
+ let isBarrier = 1 in {
+ def SBRCRrB : FRdB<0b10, (outs),
+ (ins GPR8
+ : $rr, i8imm
+ : $b),
+ "sbrc\t$rr, $b", []>;
+
+ def SBRSRrB : FRdB<0b11, (outs),
+ (ins GPR8
+ : $rr, i8imm
+ : $b),
+ "sbrs\t$rr, $b", []>;
+
+ def SBICAb : FIOBIT<0b01, (outs),
+ (ins imm_port5
+ : $a, i8imm
+ : $b),
+ "sbic\t$a, $b", []>;
+
+ def SBISAb : FIOBIT<0b11, (outs),
+ (ins imm_port5
+ : $a, i8imm
+ : $b),
+ "sbis\t$a, $b", []>;
}
// Relative branches on status flag bits.
- let Uses = [SREG] in
- {
+ let Uses = [SREG] in {
// BRBS s, k
// Branch if `s` flag in status register is set.
- def BRBSsk : FSK<0,
- (outs),
- (ins i8imm:$s, relbrtarget_7:$k),
- "brbs\t$s, $k",
- []>;
+ def BRBSsk : FSK<0, (outs),
+ (ins i8imm
+ : $s, relbrtarget_7
+ : $k),
+ "brbs\t$s, $k", []>;
// BRBC s, k
// Branch if `s` flag in status register is clear.
- def BRBCsk : FSK<1,
- (outs),
- (ins i8imm:$s, relbrtarget_7:$k),
- "brbc\t$s, $k",
- []>;
+ def BRBCsk : FSK<1, (outs),
+ (ins i8imm
+ : $s, relbrtarget_7
+ : $k),
+ "brbc\t$s, $k", []>;
}
}
-
// BRCS k
// Branch if carry flag is set
-def : InstAlias<"brcs\t$k", (BRBSsk 0, relbrtarget_7:$k)>;
+def : InstAlias<"brcs\t$k", (BRBSsk 0, relbrtarget_7 : $k)>;
// BRCC k
// Branch if carry flag is clear
-def : InstAlias<"brcc\t$k", (BRBCsk 0, relbrtarget_7:$k)>;
+def : InstAlias<"brcc\t$k", (BRBCsk 0, relbrtarget_7 : $k)>;
// BRHS k
// Branch if half carry flag is set
-def : InstAlias<"brhs\t$k", (BRBSsk 5, relbrtarget_7:$k)>;
+def : InstAlias<"brhs\t$k", (BRBSsk 5, relbrtarget_7 : $k)>;
// BRHC k
// Branch if half carry flag is clear
-def : InstAlias<"brhc\t$k", (BRBCsk 5, relbrtarget_7:$k)>;
+def : InstAlias<"brhc\t$k", (BRBCsk 5, relbrtarget_7 : $k)>;
// BRTS k
// Branch if the T flag is set
-def : InstAlias<"brts\t$k", (BRBSsk 6, relbrtarget_7:$k)>;
+def : InstAlias<"brts\t$k", (BRBSsk 6, relbrtarget_7 : $k)>;
// BRTC k
// Branch if the T flag is clear
-def : InstAlias<"brtc\t$k", (BRBCsk 6, relbrtarget_7:$k)>;
+def : InstAlias<"brtc\t$k", (BRBCsk 6, relbrtarget_7 : $k)>;
// BRVS k
// Branch if the overflow flag is set
-def : InstAlias<"brvs\t$k", (BRBSsk 3, relbrtarget_7:$k)>;
+def : InstAlias<"brvs\t$k", (BRBSsk 3, relbrtarget_7 : $k)>;
// BRVC k
// Branch if the overflow flag is clear
-def : InstAlias<"brvc\t$k", (BRBCsk 3, relbrtarget_7:$k)>;
+def : InstAlias<"brvc\t$k", (BRBCsk 3, relbrtarget_7 : $k)>;
// BRIE k
// Branch if the global interrupt flag is enabled
-def : InstAlias<"brie\t$k", (BRBSsk 7, relbrtarget_7:$k)>;
+def : InstAlias<"brie\t$k", (BRBSsk 7, relbrtarget_7 : $k)>;
// BRID k
// Branch if the global interrupt flag is disabled
-def : InstAlias<"brid\t$k", (BRBCsk 7, relbrtarget_7:$k)>;
+def : InstAlias<"brid\t$k", (BRBCsk 7, relbrtarget_7 : $k)>;
//===----------------------------------------------------------------------===//
// PC-relative conditional branches
//===----------------------------------------------------------------------===//
// Based on status register. We cannot simplify these into instruction aliases
// because we also need to be able to specify a pattern to match for ISel.
-let isBranch = 1,
-isTerminator = 1,
-Uses = [SREG] in
-{
- def BREQk : FBRsk<0,
- 0b001,
- (outs),
- (ins relbrtarget_7:$target),
- "breq\t$target",
- [(AVRbrcond bb:$target, AVR_COND_EQ)]>;
-
- def BRNEk : FBRsk<1,
- 0b001,
- (outs),
- (ins relbrtarget_7:$target),
- "brne\t$target",
- [(AVRbrcond bb:$target, AVR_COND_NE)]>;
-
-
- def BRSHk : FBRsk<1,
- 0b000,
- (outs),
- (ins relbrtarget_7:$target),
- "brsh\t$target",
- [(AVRbrcond bb:$target, AVR_COND_SH)]>;
-
- def BRLOk : FBRsk<0,
- 0b000,
- (outs),
- (ins relbrtarget_7:$target),
- "brlo\t$target",
- [(AVRbrcond bb:$target, AVR_COND_LO)]>;
-
- def BRMIk : FBRsk<0,
- 0b010,
- (outs),
- (ins relbrtarget_7:$target),
- "brmi\t$target",
- [(AVRbrcond bb:$target, AVR_COND_MI)]>;
-
- def BRPLk : FBRsk<1,
- 0b010,
- (outs),
- (ins relbrtarget_7:$target),
- "brpl\t$target",
- [(AVRbrcond bb:$target, AVR_COND_PL)]>;
-
- def BRGEk : FBRsk<1,
- 0b100,
- (outs),
- (ins relbrtarget_7:$target),
- "brge\t$target",
- [(AVRbrcond bb:$target, AVR_COND_GE)]>;
-
- def BRLTk : FBRsk<0,
- 0b100,
- (outs),
- (ins relbrtarget_7:$target),
- "brlt\t$target",
- [(AVRbrcond bb:$target, AVR_COND_LT)]>;
+let isBranch = 1, isTerminator = 1, Uses = [SREG] in {
+ def BREQk : FBRsk<0, 0b001, (outs),
+ (ins relbrtarget_7
+ : $target),
+ "breq\t$target", [(AVRbrcond bb
+ : $target, AVR_COND_EQ)]>;
+
+ def BRNEk : FBRsk<1, 0b001, (outs),
+ (ins relbrtarget_7
+ : $target),
+ "brne\t$target", [(AVRbrcond bb
+ : $target, AVR_COND_NE)]>;
+
+ def BRSHk : FBRsk<1, 0b000, (outs),
+ (ins relbrtarget_7
+ : $target),
+ "brsh\t$target", [(AVRbrcond bb
+ : $target, AVR_COND_SH)]>;
+
+ def BRLOk : FBRsk<0, 0b000, (outs),
+ (ins relbrtarget_7
+ : $target),
+ "brlo\t$target", [(AVRbrcond bb
+ : $target, AVR_COND_LO)]>;
+
+ def BRMIk : FBRsk<0, 0b010, (outs),
+ (ins relbrtarget_7
+ : $target),
+ "brmi\t$target", [(AVRbrcond bb
+ : $target, AVR_COND_MI)]>;
+
+ def BRPLk : FBRsk<1, 0b010, (outs),
+ (ins relbrtarget_7
+ : $target),
+ "brpl\t$target", [(AVRbrcond bb
+ : $target, AVR_COND_PL)]>;
+
+ def BRGEk : FBRsk<1, 0b100, (outs),
+ (ins relbrtarget_7
+ : $target),
+ "brge\t$target", [(AVRbrcond bb
+ : $target, AVR_COND_GE)]>;
+
+ def BRLTk : FBRsk<0, 0b100, (outs),
+ (ins relbrtarget_7
+ : $target),
+ "brlt\t$target", [(AVRbrcond bb
+ : $target, AVR_COND_LT)]>;
}
//===----------------------------------------------------------------------===//
// Data transfer instructions
//===----------------------------------------------------------------------===//
// 8 and 16-bit register move instructions.
-let hasSideEffects = 0 in
-{
- def MOVRdRr : FRdRr<0b0010,
- 0b11,
- (outs GPR8:$rd),
- (ins GPR8:$rr),
- "mov\t$rd, $rr",
- []>;
-
- def MOVWRdRr : FMOVWRdRr<(outs DREGS:$dst),
- (ins DREGS:$src),
- "movw\t$dst, $src",
- []>,
+let hasSideEffects = 0 in {
+ def MOVRdRr : FRdRr<0b0010, 0b11,
+ (outs GPR8
+ : $rd),
+ (ins GPR8
+ : $rr),
+ "mov\t$rd, $rr", []>;
+
+ def MOVWRdRr : FMOVWRdRr<(outs DREGS
+ : $dst),
+ (ins DREGS
+ : $src),
+ "movw\t$dst, $src", []>,
Requires<[HasMOVW]>;
}
// Load immediate values into registers.
-let isReMaterializable = 1 in
-{
+let isReMaterializable = 1 in {
def LDIRdK : FRdK<0b1110,
- (outs LD8:$rd),
- (ins imm_ldi8:$k),
- "ldi\t$rd, $k",
- [(set i8:$rd, imm:$k)]>;
+ (outs LD8
+ : $rd),
+ (ins imm_ldi8
+ : $k),
+ "ldi\t$rd, $k", [(set i8
+ : $rd, imm
+ : $k)]>;
// LDIW Rd+1:Rd, K+1:K
//
// Expands to:
// ldi Rd, K
// ldi Rd+1, K+1
- def LDIWRdK : Pseudo<(outs DLDREGS:$dst),
- (ins i16imm:$src),
- "ldiw\t$dst, $src",
- [(set i16:$dst, imm:$src)]>;
+ def LDIWRdK : Pseudo<(outs DLDREGS
+ : $dst),
+ (ins i16imm
+ : $src),
+ "ldiw\t$dst, $src", [(set i16
+ : $dst, imm
+ : $src)]>;
}
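
The "Expands to:" comments above describe the pseudo lowering only informally; in the
backend the 16-bit pseudo is split on the sub_lo/sub_hi halves of the register pair.
A minimal sketch of such an expansion, loosely modeled on the AVR pseudo-expansion
pass (not part of this diff); the helper name is made up and only the plain-immediate
form of LDIWRdK is handled:

#include "AVRInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

using namespace llvm;

// Expand "ldiw Rd, K" into "ldi RdLo, K & 0xff" and "ldi RdHi, K >> 8".
static void expandLDIWRdK(const AVRInstrInfo &TII, const TargetRegisterInfo &TRI,
                          MachineBasicBlock &MBB, MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  int64_t Imm = MI.getOperand(1).getImm(); // global-address operands not handled here
  Register DstLo = TRI.getSubReg(Dst, AVR::sub_lo);
  Register DstHi = TRI.getSubReg(Dst, AVR::sub_hi);
  DebugLoc DL = MI.getDebugLoc();

  BuildMI(MBB, MI, DL, TII.get(AVR::LDIRdK), DstLo).addImm(Imm & 0xff);
  BuildMI(MBB, MI, DL, TII.get(AVR::LDIRdK), DstHi).addImm((Imm >> 8) & 0xff);
  MI.eraseFromParent();
}
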
// Load from data space into register.
-let canFoldAsLoad = 1,
-isReMaterializable = 1 in
-{
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
def LDSRdK : F32DM<0b0,
- (outs GPR8:$rd),
- (ins imm16:$k),
- "lds\t$rd, $k",
- [(set i8:$rd, (load imm:$k))]>,
+ (outs GPR8
+ : $rd),
+ (ins imm16
+ : $k),
+ "lds\t$rd, $k", [(set i8
+ : $rd, (load imm
+ : $k))]>,
Requires<[HasSRAM]>;
// LDSW Rd+1:Rd, K+1:K
@@ -1164,23 +1287,26 @@ isReMaterializable = 1 in
// Expands to:
// lds Rd, (K+1:K)
// lds Rd+1 (K+1:K) + 1
- def LDSWRdK : Pseudo<(outs DREGS:$dst),
- (ins i16imm:$src),
- "ldsw\t$dst, $src",
- [(set i16:$dst, (load imm:$src))]>,
+ def LDSWRdK : Pseudo<(outs DREGS
+ : $dst),
+ (ins i16imm
+ : $src),
+ "ldsw\t$dst, $src", [(set i16
+ : $dst, (load imm
+ : $src))]>,
Requires<[HasSRAM]>;
}
// Indirect loads.
-let canFoldAsLoad = 1,
-isReMaterializable = 1 in
-{
- def LDRdPtr : FSTLD<0,
- 0b00,
- (outs GPR8:$reg),
- (ins LDSTPtrReg:$ptrreg),
- "ld\t$reg, $ptrreg",
- [(set GPR8:$reg, (load i16:$ptrreg))]>,
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+ def LDRdPtr : FSTLD<0, 0b00,
+ (outs GPR8
+ : $reg),
+ (ins LDSTPtrReg
+ : $ptrreg),
+ "ld\t$reg, $ptrreg", [(set GPR8
+ : $reg, (load i16
+ : $ptrreg))]>,
Requires<[HasSRAM]>;
// LDW Rd+1:Rd, P
@@ -1188,43 +1314,48 @@ isReMaterializable = 1 in
// Expands to:
// ld Rd, P
// ldd Rd+1, P+1
- let Constraints = "@earlyclobber $reg" in
- def LDWRdPtr : Pseudo<(outs DREGS:$reg),
- (ins PTRDISPREGS:$ptrreg),
- "ldw\t$reg, $ptrreg",
- [(set i16:$reg, (load i16:$ptrreg))]>,
- Requires<[HasSRAM]>;
+ let Constraints = "@earlyclobber $reg" in def LDWRdPtr
+ : Pseudo<(outs DREGS
+ : $reg),
+ (ins PTRDISPREGS
+ : $ptrreg),
+ "ldw\t$reg, $ptrreg", [(set i16
+ : $reg, (load i16
+ : $ptrreg))]>,
+ Requires<[HasSRAM]>;
}
// Indirect loads (with postincrement or predecrement).
-let mayLoad = 1,
-hasSideEffects = 0,
-Constraints = "$ptrreg = $base_wb,@earlyclobber $reg" in
-{
- def LDRdPtrPi : FSTLD<0,
- 0b01,
- (outs GPR8:$reg, PTRREGS:$base_wb),
- (ins LDSTPtrReg:$ptrreg),
- "ld\t$reg, $ptrreg+",
- []>,
+let mayLoad = 1, hasSideEffects = 0,
+ Constraints = "$ptrreg = $base_wb,@earlyclobber $reg" in {
+ def LDRdPtrPi : FSTLD<0, 0b01,
+ (outs GPR8
+ : $reg, PTRREGS
+ : $base_wb),
+ (ins LDSTPtrReg
+ : $ptrreg),
+ "ld\t$reg, $ptrreg+", []>,
Requires<[HasSRAM]>;
// LDW Rd+1:Rd, P+
// Expands to:
// ld Rd, P+
// ld Rd+1, P+
- def LDWRdPtrPi : Pseudo<(outs DREGS:$reg, PTRREGS:$base_wb),
- (ins PTRREGS:$ptrreg),
- "ldw\t$reg, $ptrreg+",
- []>,
+ def LDWRdPtrPi : Pseudo<(outs DREGS
+ : $reg, PTRREGS
+ : $base_wb),
+ (ins PTRREGS
+ : $ptrreg),
+ "ldw\t$reg, $ptrreg+", []>,
Requires<[HasSRAM]>;
- def LDRdPtrPd : FSTLD<0,
- 0b10,
- (outs GPR8:$reg, PTRREGS:$base_wb),
- (ins LDSTPtrReg:$ptrreg),
- "ld\t$reg, -$ptrreg",
- []>,
+ def LDRdPtrPd : FSTLD<0, 0b10,
+ (outs GPR8
+ : $reg, PTRREGS
+ : $base_wb),
+ (ins LDSTPtrReg
+ : $ptrreg),
+ "ld\t$reg, -$ptrreg", []>,
Requires<[HasSRAM]>;
// LDW Rd+1:Rd, -P
@@ -1232,36 +1363,42 @@ Constraints = "$ptrreg = $base_wb,@earlyclobber $reg" in
// Expands to:
// ld Rd+1, -P
// ld Rd, -P
- def LDWRdPtrPd : Pseudo<(outs DREGS:$reg, PTRREGS:$base_wb),
- (ins PTRREGS:$ptrreg),
- "ldw\t$reg, -$ptrreg",
- []>,
+ def LDWRdPtrPd : Pseudo<(outs DREGS
+ : $reg, PTRREGS
+ : $base_wb),
+ (ins PTRREGS
+ : $ptrreg),
+ "ldw\t$reg, -$ptrreg", []>,
Requires<[HasSRAM]>;
}
// Load indirect with displacement operations.
-let canFoldAsLoad = 1,
-isReMaterializable = 1 in
-{
- let Constraints = "@earlyclobber $reg" in
- def LDDRdPtrQ : FSTDLDD<0,
- (outs GPR8:$reg),
- (ins memri:$memri),
- "ldd\t$reg, $memri",
- [(set i8:$reg, (load addr:$memri))]>,
- Requires<[HasSRAM]>;
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+ let Constraints = "@earlyclobber $reg" in def LDDRdPtrQ
+ : FSTDLDD<0,
+ (outs GPR8
+ : $reg),
+ (ins memri
+ : $memri),
+ "ldd\t$reg, $memri", [(set i8
+ : $reg, (load addr
+ : $memri))]>,
+ Requires<[HasSRAM]>;
// LDDW Rd+1:Rd, P+q
//
// Expands to:
// ldd Rd, P+q
// ldd Rd+1, P+q+1
- let Constraints = "@earlyclobber $dst" in
- def LDDWRdPtrQ : Pseudo<(outs DREGS_WITHOUT_YZ_WORKAROUND:$dst),
- (ins memri:$memri),
- "lddw\t$dst, $memri",
- [(set i16:$dst, (load addr:$memri))]>,
- Requires<[HasSRAM]>;
+ let Constraints = "@earlyclobber $dst" in def LDDWRdPtrQ
+ : Pseudo<(outs DREGS_WITHOUT_YZ_WORKAROUND
+ : $dst),
+ (ins memri
+ : $memri),
+ "lddw\t$dst, $memri", [(set i16
+ : $dst, (load addr
+ : $memri))]>,
+ Requires<[HasSRAM]>;
// An identical pseudo instruction to LDDWRdPtrQ, except restricted to the Y
// register and without the @earlyclobber flag.
@@ -1270,7 +1407,8 @@ isReMaterializable = 1 in
// being able to handle the expansion of a COPY into a machine instruction
// that has an earlyclobber flag. This is because the register allocator will
// try to expand a copy from a register slot into an earlyclobber instruction.
- // Instructions that are earlyclobber need to be in a dedicated earlyclobber slot.
+ // Instructions that are earlyclobber need to be in a dedicated earlyclobber
+ // slot.
//
// This pseudo instruction can be used pre-AVR pseudo expansion in order to
// get a frame index load without directly using earlyclobber instructions.
@@ -1279,30 +1417,44 @@ isReMaterializable = 1 in
//
// This instruction may be removed once PR13375 is fixed.
let mayLoad = 1,
- hasSideEffects = 0 in
- def LDDWRdYQ : Pseudo<(outs DREGS:$dst),
- (ins memri:$memri),
- "lddw\t$dst, $memri",
- []>,
- Requires<[HasSRAM]>;
+ hasSideEffects = 0 in def LDDWRdYQ : Pseudo<(outs DREGS
+ : $dst),
+ (ins memri
+ : $memri),
+ "lddw\t$dst, $memri", []>,
+ Requires<[HasSRAM]>;
}
-class AtomicLoad<PatFrag Op, RegisterClass DRC,
- RegisterClass PTRRC> :
- Pseudo<(outs DRC:$rd), (ins PTRRC:$rr), "atomic_op",
- [(set DRC:$rd, (Op i16:$rr))]>;
-
-class AtomicStore<PatFrag Op, RegisterClass DRC,
- RegisterClass PTRRC> :
- Pseudo<(outs), (ins PTRRC:$rd, DRC:$rr), "atomic_op",
- [(Op i16:$rd, DRC:$rr)]>;
-
-let Constraints = "@earlyclobber $rd" in
-class AtomicLoadOp<PatFrag Op, RegisterClass DRC,
- RegisterClass PTRRC> :
- Pseudo<(outs DRC:$rd), (ins PTRRC:$rr, DRC:$operand),
- "atomic_op",
- [(set DRC:$rd, (Op i16:$rr, DRC:$operand))]>;
+class AtomicLoad<PatFrag Op, RegisterClass DRC, RegisterClass PTRRC>
+ : Pseudo<(outs DRC
+ : $rd),
+ (ins PTRRC
+ : $rr),
+ "atomic_op", [(set DRC
+ : $rd, (Op i16
+ : $rr))]>;
+
+class AtomicStore<PatFrag Op, RegisterClass DRC, RegisterClass PTRRC>
+ : Pseudo<(outs),
+ (ins PTRRC
+ : $rd, DRC
+ : $rr),
+ "atomic_op", [(Op i16
+ : $rd, DRC
+ : $rr)]>;
+
+let Constraints =
+ "@earlyclobber $rd" in class AtomicLoadOp<PatFrag Op, RegisterClass DRC,
+ RegisterClass PTRRC>
+ : Pseudo<(outs DRC
+ : $rd),
+ (ins PTRRC
+ : $rr, DRC
+ : $operand),
+ "atomic_op", [(set DRC
+ : $rd, (Op i16
+ : $rr, DRC
+ : $operand))]>;
// FIXME: I think 16-bit atomic binary ops need to mark
// r0 as clobbered.
@@ -1318,34 +1470,36 @@ class AtomicLoadOp<PatFrag Op, RegisterClass DRC,
// 16-bit operations use 16-bit load/store postincrement instructions,
// which require PTRDISPREGS.
-def AtomicLoad8 : AtomicLoad<atomic_load_8, GPR8, PTRREGS>;
-def AtomicLoad16 : AtomicLoad<atomic_load_16, DREGS, PTRDISPREGS>;
+def AtomicLoad8 : AtomicLoad<atomic_load_8, GPR8, PTRREGS>;
+def AtomicLoad16 : AtomicLoad<atomic_load_16, DREGS, PTRDISPREGS>;
-def AtomicStore8 : AtomicStore<atomic_store_8, GPR8, PTRREGS>;
+def AtomicStore8 : AtomicStore<atomic_store_8, GPR8, PTRREGS>;
def AtomicStore16 : AtomicStore<atomic_store_16, DREGS, PTRDISPREGS>;
class AtomicLoadOp8<PatFrag Op> : AtomicLoadOp<Op, GPR8, PTRREGS>;
class AtomicLoadOp16<PatFrag Op> : AtomicLoadOp<Op, DREGS, PTRDISPREGS>;
-def AtomicLoadAdd8 : AtomicLoadOp8<atomic_load_add_8>;
+def AtomicLoadAdd8 : AtomicLoadOp8<atomic_load_add_8>;
def AtomicLoadAdd16 : AtomicLoadOp16<atomic_load_add_16>;
-def AtomicLoadSub8 : AtomicLoadOp8<atomic_load_sub_8>;
+def AtomicLoadSub8 : AtomicLoadOp8<atomic_load_sub_8>;
def AtomicLoadSub16 : AtomicLoadOp16<atomic_load_sub_16>;
-def AtomicLoadAnd8 : AtomicLoadOp8<atomic_load_and_8>;
+def AtomicLoadAnd8 : AtomicLoadOp8<atomic_load_and_8>;
def AtomicLoadAnd16 : AtomicLoadOp16<atomic_load_and_16>;
-def AtomicLoadOr8 : AtomicLoadOp8<atomic_load_or_8>;
-def AtomicLoadOr16 : AtomicLoadOp16<atomic_load_or_16>;
-def AtomicLoadXor8 : AtomicLoadOp8<atomic_load_xor_8>;
+def AtomicLoadOr8 : AtomicLoadOp8<atomic_load_or_8>;
+def AtomicLoadOr16 : AtomicLoadOp16<atomic_load_or_16>;
+def AtomicLoadXor8 : AtomicLoadOp8<atomic_load_xor_8>;
def AtomicLoadXor16 : AtomicLoadOp16<atomic_load_xor_16>;
-def AtomicFence : Pseudo<(outs), (ins), "atomic_fence",
- [(atomic_fence timm, timm)]>;
+def AtomicFence
+ : Pseudo<(outs), (ins), "atomic_fence", [(atomic_fence timm, timm)]>;
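
AVR has no native atomic instructions, so pseudos like AtomicLoadAdd8 end up as an
ordinary load/op/store bracketed by an interrupt disable that saves and restores SREG.
The sketch below shows only that bracketing shape; the SREG I/O address (0x3f on
classic AVR parts), the use of r0 as scratch and the helper name are assumptions, and
the fact that atomicrmw must yield the value from before the operation is glossed over.

#include "AVRInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

using namespace llvm;

// Rough shape of lowering "AtomicLoadAdd8 $rd, $rr, $operand":
//   in   r0, 0x3f      ; save SREG
//   cli                ; disable interrupts
//   ld   $rd, $rr      ; plain load
//   add  $rd, $operand
//   st   $rr, $rd      ; plain store
//   out  0x3f, r0      ; restore SREG (and the previous interrupt state)
static void expandAtomicLoadAdd8(const AVRInstrInfo &TII, MachineBasicBlock &MBB,
                                 MachineInstr &MI) {
  const unsigned SREGAddr = 0x3f; // assumed classic-AVR SREG I/O address
  DebugLoc DL = MI.getDebugLoc();
  Register Dst = MI.getOperand(0).getReg();
  Register Ptr = MI.getOperand(1).getReg();
  Register Val = MI.getOperand(2).getReg();

  BuildMI(MBB, MI, DL, TII.get(AVR::INRdA), AVR::R0).addImm(SREGAddr);
  BuildMI(MBB, MI, DL, TII.get(AVR::BCLRs)).addImm(7); // cli
  BuildMI(MBB, MI, DL, TII.get(AVR::LDRdPtr), Dst).addReg(Ptr);
  BuildMI(MBB, MI, DL, TII.get(AVR::ADDRdRr), Dst).addReg(Dst).addReg(Val);
  BuildMI(MBB, MI, DL, TII.get(AVR::STPtrRr)).addReg(Ptr).addReg(Dst);
  BuildMI(MBB, MI, DL, TII.get(AVR::OUTARr)).addImm(SREGAddr).addReg(AVR::R0);
  MI.eraseFromParent();
}
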
// Indirect store from register to data space.
-def STSKRr : F32DM<0b1,
- (outs),
- (ins imm16:$k, GPR8:$rd),
- "sts\t$k, $rd",
- [(store i8:$rd, imm:$k)]>,
+def STSKRr : F32DM<0b1, (outs),
+ (ins imm16
+ : $k, GPR8
+ : $rd),
+ "sts\t$k, $rd", [(store i8
+ : $rd, imm
+ : $k)]>,
Requires<[HasSRAM]>;
// STSW K+1:K, Rr+1:Rr
@@ -1354,20 +1508,24 @@ def STSKRr : F32DM<0b1,
// sts Rr+1, (K+1:K) + 1
// sts Rr, (K+1:K)
def STSWKRr : Pseudo<(outs),
- (ins i16imm:$dst, DREGS:$src),
- "stsw\t$dst, $src",
- [(store i16:$src, imm:$dst)]>,
+ (ins i16imm
+ : $dst, DREGS
+ : $src),
+ "stsw\t$dst, $src", [(store i16
+ : $src, imm
+ : $dst)]>,
Requires<[HasSRAM]>;
// Indirect stores.
// ST P, Rr
// Stores the value of Rr into the location addressed by pointer P.
-def STPtrRr : FSTLD<1,
- 0b00,
- (outs),
- (ins LDSTPtrReg:$ptrreg, GPR8:$reg),
- "st\t$ptrreg, $reg",
- [(store GPR8:$reg, i16:$ptrreg)]>,
+def STPtrRr : FSTLD<1, 0b00, (outs),
+ (ins LDSTPtrReg
+ : $ptrreg, GPR8
+ : $reg),
+ "st\t$ptrreg, $reg", [(store GPR8
+ : $reg, i16
+ : $ptrreg)]>,
Requires<[HasSRAM]>;
// STW P, Rr+1:Rr
@@ -1377,25 +1535,32 @@ def STPtrRr : FSTLD<1,
// st P, Rr
// std P+1, Rr+1
def STWPtrRr : Pseudo<(outs),
- (ins PTRDISPREGS:$ptrreg, DREGS:$reg),
- "stw\t$ptrreg, $reg",
- [(store i16:$reg, i16:$ptrreg)]>,
+ (ins PTRDISPREGS
+ : $ptrreg, DREGS
+ : $reg),
+ "stw\t$ptrreg, $reg", [(store i16
+ : $reg, i16
+ : $ptrreg)]>,
Requires<[HasSRAM]>;
// Indirect stores (with postincrement or predecrement).
-let Constraints = "$ptrreg = $base_wb,@earlyclobber $base_wb" in
-{
+let Constraints = "$ptrreg = $base_wb,@earlyclobber $base_wb" in {
// ST P+, Rr
// Stores the value of Rr into the location addressed by pointer P.
// Post increments P.
- def STPtrPiRr : FSTLD<1,
- 0b01,
- (outs LDSTPtrReg:$base_wb),
- (ins LDSTPtrReg:$ptrreg, GPR8:$reg, i8imm:$offs),
- "st\t$ptrreg+, $reg",
- [(set i16:$base_wb,
- (post_store GPR8:$reg, i16:$ptrreg, imm:$offs))]>,
+ def STPtrPiRr : FSTLD<1, 0b01,
+ (outs LDSTPtrReg
+ : $base_wb),
+ (ins LDSTPtrReg
+ : $ptrreg, GPR8
+ : $reg, i8imm
+ : $offs),
+ "st\t$ptrreg+, $reg", [(set i16
+ : $base_wb, (post_store GPR8
+ : $reg, i16
+ : $ptrreg, imm
+ : $offs))]>,
Requires<[HasSRAM]>;
// STW P+, Rr+1:Rr
@@ -1405,23 +1570,34 @@ let Constraints = "$ptrreg = $base_wb,@earlyclobber $base_wb" in
// Expands to:
// st P+, Rr
// st P+, Rr+1
- def STWPtrPiRr : Pseudo<(outs PTRREGS:$base_wb),
- (ins PTRREGS:$ptrreg, DREGS:$trh, i8imm:$offs),
- "stw\t$ptrreg+, $trh",
- [(set PTRREGS:$base_wb,
- (post_store DREGS:$trh, PTRREGS:$ptrreg, imm:$offs))]>,
+ def STWPtrPiRr : Pseudo<(outs PTRREGS
+ : $base_wb),
+ (ins PTRREGS
+ : $ptrreg, DREGS
+ : $trh, i8imm
+ : $offs),
+ "stw\t$ptrreg+, $trh", [(set PTRREGS
+ : $base_wb, (post_store DREGS
+ : $trh, PTRREGS
+ : $ptrreg, imm
+ : $offs))]>,
Requires<[HasSRAM]>;
// ST -P, Rr
// Stores the value of Rr into the location addressed by pointer P.
// Pre decrements P.
- def STPtrPdRr : FSTLD<1,
- 0b10,
- (outs LDSTPtrReg:$base_wb),
- (ins LDSTPtrReg:$ptrreg, GPR8:$reg, i8imm:$offs),
- "st\t-$ptrreg, $reg",
- [(set i16:$base_wb,
- (pre_store GPR8:$reg, i16:$ptrreg, imm:$offs))]>,
+ def STPtrPdRr : FSTLD<1, 0b10,
+ (outs LDSTPtrReg
+ : $base_wb),
+ (ins LDSTPtrReg
+ : $ptrreg, GPR8
+ : $reg, i8imm
+ : $offs),
+ "st\t-$ptrreg, $reg", [(set i16
+ : $base_wb, (pre_store GPR8
+ : $reg, i16
+ : $ptrreg, imm
+ : $offs))]>,
Requires<[HasSRAM]>;
// STW -P, Rr+1:Rr
@@ -1431,11 +1607,17 @@ let Constraints = "$ptrreg = $base_wb,@earlyclobber $base_wb" in
// Expands to:
// st -P, Rr+1
// st -P, Rr
- def STWPtrPdRr : Pseudo<(outs PTRREGS:$base_wb),
- (ins PTRREGS:$ptrreg, DREGS:$reg, i8imm:$offs),
- "stw\t-$ptrreg, $reg",
- [(set PTRREGS:$base_wb,
- (pre_store i16:$reg, i16:$ptrreg, imm:$offs))]>,
+ def STWPtrPdRr : Pseudo<(outs PTRREGS
+ : $base_wb),
+ (ins PTRREGS
+ : $ptrreg, DREGS
+ : $reg, i8imm
+ : $offs),
+ "stw\t-$ptrreg, $reg", [(set PTRREGS
+ : $base_wb, (pre_store i16
+ : $reg, i16
+ : $ptrreg, imm
+ : $offs))]>,
Requires<[HasSRAM]>;
}
@@ -1443,11 +1625,13 @@ let Constraints = "$ptrreg = $base_wb,@earlyclobber $base_wb" in
// STD P+q, Rr
// Stores the value of Rr into the location addressed by pointer P with a
// displacement of q. Does not modify P.
-def STDPtrQRr : FSTDLDD<1,
- (outs),
- (ins memri:$memri, GPR8:$reg),
- "std\t$memri, $reg",
- [(store i8:$reg, addr:$memri)]>,
+def STDPtrQRr : FSTDLDD<1, (outs),
+ (ins memri
+ : $memri, GPR8
+ : $reg),
+ "std\t$memri, $reg", [(store i8
+ : $reg, addr
+ : $memri)]>,
Requires<[HasSRAM]>;
// STDW P+q, Rr+1:Rr
@@ -1458,206 +1642,192 @@ def STDPtrQRr : FSTDLDD<1,
// std P+q, Rr
// std P+q+1, Rr+1
def STDWPtrQRr : Pseudo<(outs),
- (ins memri:$memri, DREGS:$src),
- "stdw\t$memri, $src",
- [(store i16:$src, addr:$memri)]>,
+ (ins memri
+ : $memri, DREGS
+ : $src),
+ "stdw\t$memri, $src", [(store i16
+ : $src, addr
+ : $memri)]>,
Requires<[HasSRAM]>;
-
// Load program memory operations.
-let canFoldAsLoad = 1,
-isReMaterializable = 1,
-mayLoad = 1,
-hasSideEffects = 0 in
-{
+let canFoldAsLoad = 1, isReMaterializable = 1, mayLoad = 1,
+ hasSideEffects = 0 in {
let Defs = [R0],
- Uses = [R31R30] in
- def LPM : F16<0b1001010111001000,
- (outs),
- (ins),
- "lpm",
- []>,
- Requires<[HasLPM]>;
-
- def LPMRdZ : FLPMX<0,
- 0,
- (outs GPR8:$dst),
- (ins ZREG:$z),
- "lpm\t$dst, $z",
- []>,
+ Uses = [R31R30] in def LPM
+ : F16<0b1001010111001000, (outs), (ins), "lpm", []>,
+ Requires<[HasLPM]>;
+
+ def LPMRdZ : FLPMX<0, 0,
+ (outs GPR8
+ : $dst),
+ (ins ZREG
+ : $z),
+ "lpm\t$dst, $z", []>,
Requires<[HasLPMX]>;
// Load program memory, while postincrementing the Z register.
- let Defs = [R31R30] in
- {
- def LPMRdZPi : FLPMX<0,
- 1,
- (outs GPR8:$dst),
- (ins ZREG:$z),
- "lpm\t$dst, $z+",
- []>,
+ let Defs = [R31R30] in {
+ def LPMRdZPi : FLPMX<0, 1,
+ (outs GPR8
+ : $dst),
+ (ins ZREG
+ : $z),
+ "lpm\t$dst, $z+", []>,
Requires<[HasLPMX]>;
- def LPMWRdZ : Pseudo<(outs DREGS:$dst),
- (ins ZREG:$z),
- "lpmw\t$dst, $z",
- []>,
+ def LPMWRdZ : Pseudo<(outs DREGS
+ : $dst),
+ (ins ZREG
+ : $z),
+ "lpmw\t$dst, $z", []>,
Requires<[HasLPMX]>;
- def LPMWRdZPi : Pseudo<(outs DREGS:$dst),
- (ins ZREG:$z),
- "lpmw\t$dst, $z+",
- []>,
+ def LPMWRdZPi : Pseudo<(outs DREGS
+ : $dst),
+ (ins ZREG
+ : $z),
+ "lpmw\t$dst, $z+", []>,
Requires<[HasLPMX]>;
}
}
// Extended load program memory operations.
-let mayLoad = 1,
-hasSideEffects = 0 in
-{
+let mayLoad = 1, hasSideEffects = 0 in {
let Defs = [R0],
- Uses = [R31R30] in
- def ELPM : F16<0b1001010111011000,
- (outs),
- (ins),
- "elpm",
- []>,
- Requires<[HasELPM]>;
-
- def ELPMRdZ : FLPMX<1,
- 0,
- (outs GPR8:$dst),
- (ins ZREG:$z),
- "elpm\t$dst, $z",
- []>,
+ Uses = [R31R30] in def ELPM
+ : F16<0b1001010111011000, (outs), (ins), "elpm", []>,
+ Requires<[HasELPM]>;
+
+ def ELPMRdZ : FLPMX<1, 0,
+ (outs GPR8
+ : $dst),
+ (ins ZREG
+ : $z),
+ "elpm\t$dst, $z", []>,
Requires<[HasELPMX]>;
- let Defs = [R31R30] in
- def ELPMRdZPi : FLPMX<1,
- 1,
- (outs GPR8:$dst),
- (ins ZREG: $z),
- "elpm\t$dst, $z+",
- []>,
- Requires<[HasELPMX]>;
+ let Defs = [R31R30] in def ELPMRdZPi : FLPMX<1, 1,
+ (outs GPR8
+ : $dst),
+ (ins ZREG
+ : $z),
+ "elpm\t$dst, $z+", []>,
+ Requires<[HasELPMX]>;
}
// Store program memory operations.
-let Uses = [R1, R0] in
-{
- let Uses = [R31R30, R1, R0] in
- def SPM : F16<0b1001010111101000,
- (outs),
- (ins),
- "spm",
- []>,
- Requires<[HasSPM]>;
-
- let Defs = [R31R30] in
- def SPMZPi : F16<0b1001010111111000,
- (outs),
- (ins ZREG:$z),
- "spm $z+",
- []>,
- Requires<[HasSPMX]>;
+let Uses = [R1, R0] in {
+ let Uses = [R31R30, R1, R0] in def SPM
+ : F16<0b1001010111101000, (outs), (ins), "spm", []>,
+ Requires<[HasSPM]>;
+
+ let Defs = [R31R30] in def SPMZPi : F16<0b1001010111111000, (outs),
+ (ins ZREG
+ : $z),
+ "spm $z+", []>,
+ Requires<[HasSPMX]>;
}
// Read data from IO location operations.
-let canFoldAsLoad = 1,
-isReMaterializable = 1 in
-{
- def INRdA : FIORdA<(outs GPR8:$dst),
- (ins imm_port6:$src),
- "in\t$dst, $src",
- [(set i8:$dst, (load ioaddr8:$src))]>;
-
- def INWRdA : Pseudo<(outs DREGS:$dst),
- (ins imm_port6:$src),
- "inw\t$dst, $src",
- [(set i16:$dst, (load ioaddr16:$src))]>;
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+ def INRdA : FIORdA<(outs GPR8
+ : $dst),
+ (ins imm_port6
+ : $src),
+ "in\t$dst, $src", [(set i8
+ : $dst, (load ioaddr8
+ : $src))]>;
+
+ def INWRdA : Pseudo<(outs DREGS
+ : $dst),
+ (ins imm_port6
+ : $src),
+ "inw\t$dst, $src", [(set i16
+ : $dst, (load ioaddr16
+ : $src))]>;
}
// Write data to IO location operations.
def OUTARr : FIOARr<(outs),
- (ins imm_port6:$dst, GPR8:$src),
- "out\t$dst, $src",
- [(store i8:$src, ioaddr8:$dst)]>;
+ (ins imm_port6
+ : $dst, GPR8
+ : $src),
+ "out\t$dst, $src", [(store i8
+ : $src, ioaddr8
+ : $dst)]>;
def OUTWARr : Pseudo<(outs),
- (ins imm_port6:$dst, DREGS:$src),
- "outw\t$dst, $src",
- [(store i16:$src, ioaddr16:$dst)]>;
+ (ins imm_port6
+ : $dst, DREGS
+ : $src),
+ "outw\t$dst, $src", [(store i16
+ : $src, ioaddr16
+ : $dst)]>;
// Stack push/pop operations.
-let Defs = [SP],
-Uses = [SP],
-hasSideEffects = 0 in
-{
+let Defs = [SP], Uses = [SP], hasSideEffects = 0 in {
// Stack push operations.
- let mayStore = 1 in
- {
- def PUSHRr : FRd<0b1001,
- 0b0011111,
- (outs),
- (ins GPR8:$reg),
- "push\t$reg",
- []>,
+ let mayStore = 1 in {
+ def PUSHRr : FRd<0b1001, 0b0011111, (outs),
+ (ins GPR8
+ : $reg),
+ "push\t$reg", []>,
Requires<[HasSRAM]>;
def PUSHWRr : Pseudo<(outs),
- (ins DREGS:$reg),
- "pushw\t$reg",
- []>,
+ (ins DREGS
+ : $reg),
+ "pushw\t$reg", []>,
Requires<[HasSRAM]>;
}
// Stack pop operations.
- let mayLoad = 1 in
- {
- def POPRd : FRd<0b1001,
- 0b0001111,
- (outs GPR8:$reg),
- (ins),
- "pop\t$reg",
- []>,
+ let mayLoad = 1 in {
+ def POPRd : FRd<0b1001, 0b0001111,
+ (outs GPR8
+ : $reg),
+ (ins), "pop\t$reg", []>,
Requires<[HasSRAM]>;
- def POPWRd : Pseudo<(outs DREGS:$reg),
- (ins),
- "popw\t$reg",
- []>,
+ def POPWRd : Pseudo<(outs DREGS
+ : $reg),
+ (ins), "popw\t$reg", []>,
Requires<[HasSRAM]>;
}
}
// Read-Write-Modify (RMW) instructions.
def XCHZRd : FZRd<0b100,
- (outs GPR8:$rd),
- (ins ZREG:$z),
- "xch\t$z, $rd",
- []>,
+ (outs GPR8
+ : $rd),
+ (ins ZREG
+ : $z),
+ "xch\t$z, $rd", []>,
Requires<[SupportsRMW]>;
def LASZRd : FZRd<0b101,
- (outs GPR8:$rd),
- (ins ZREG:$z),
- "las\t$z, $rd",
- []>,
+ (outs GPR8
+ : $rd),
+ (ins ZREG
+ : $z),
+ "las\t$z, $rd", []>,
Requires<[SupportsRMW]>;
def LACZRd : FZRd<0b110,
- (outs GPR8:$rd),
- (ins ZREG:$z),
- "lac\t$z, $rd",
- []>,
+ (outs GPR8
+ : $rd),
+ (ins ZREG
+ : $z),
+ "lac\t$z, $rd", []>,
Requires<[SupportsRMW]>;
def LATZRd : FZRd<0b111,
- (outs GPR8:$rd),
- (ins ZREG:$z),
- "lat\t$z, $rd",
- []>,
+ (outs GPR8
+ : $rd),
+ (ins ZREG
+ : $z),
+ "lat\t$z, $rd", []>,
Requires<[SupportsRMW]>;
//===----------------------------------------------------------------------===//
@@ -1665,187 +1835,277 @@ def LATZRd : FZRd<0b111,
//===----------------------------------------------------------------------===//
// Bit shift/rotate operations.
-let Constraints = "$src = $rd",
-Defs = [SREG] in
-{
+let Constraints = "$src = $rd", Defs = [SREG] in {
// 8-bit LSL is an alias of ADD Rd, Rd
- def LSLWRd : Pseudo<(outs DREGS:$rd),
- (ins DREGS:$src),
+ def LSLWRd : Pseudo<(outs DREGS
+ : $rd),
+ (ins DREGS
+ : $src),
"lslw\t$rd",
- [(set i16:$rd, (AVRlsl i16:$src)), (implicit SREG)]>;
-
- def LSLWNRd : Pseudo<(outs DLDREGS:$rd),
- (ins DREGS:$src, imm16:$bits),
- "lslwn\t$rd, $bits",
- [(set i16:$rd, (AVRlslwn i16:$src, imm:$bits)),
- (implicit SREG)]>;
-
- def LSLBNRd : Pseudo<(outs LD8:$rd),
- (ins GPR8:$src, imm_ldi8:$bits),
- "lslbn\t$rd, $bits",
- [(set i8:$rd, (AVRlslbn i8:$src, imm:$bits)),
- (implicit SREG)]>;
+ [(set i16
+ : $rd, (AVRlsl i16
+ : $src)),
+ (implicit SREG)]>;
- def LSRRd : FRd<0b1001,
- 0b0100110,
- (outs GPR8:$rd),
- (ins GPR8:$src),
- "lsr\t$rd",
- [(set i8:$rd, (AVRlsr i8:$src)), (implicit SREG)]>;
+ def LSLWNRd : Pseudo<(outs DLDREGS
+ : $rd),
+ (ins DREGS
+ : $src, imm16
+ : $bits),
+ "lslwn\t$rd, $bits", [
+ (set i16
+ : $rd, (AVRlslwn i16
+ : $src, imm
+ : $bits)),
+ (implicit SREG)
+ ]>;
+
+ def LSLBNRd : Pseudo<(outs LD8
+ : $rd),
+ (ins GPR8
+ : $src, imm_ldi8
+ : $bits),
+ "lslbn\t$rd, $bits", [
+ (set i8
+ : $rd, (AVRlslbn i8
+ : $src, imm
+ : $bits)),
+ (implicit SREG)
+ ]>;
+
+ def LSRRd
+ : FRd<0b1001, 0b0100110,
+ (outs GPR8
+ : $rd),
+ (ins GPR8
+ : $src),
+ "lsr\t$rd", [(set i8
+ : $rd, (AVRlsr i8
+ : $src)),
+ (implicit SREG)]>;
- def LSRWRd : Pseudo<(outs DREGS:$rd),
- (ins DREGS:$src),
+ def LSRWRd : Pseudo<(outs DREGS
+ : $rd),
+ (ins DREGS
+ : $src),
"lsrw\t$rd",
- [(set i16:$rd, (AVRlsr i16:$src)), (implicit SREG)]>;
-
- def LSRWNRd : Pseudo<(outs DLDREGS:$rd),
- (ins DREGS:$src, imm16:$bits),
- "lsrwn\t$rd, $bits",
- [(set i16:$rd, (AVRlsrwn i16:$src, imm:$bits)),
- (implicit SREG)]>;
-
- def LSRBNRd : Pseudo<(outs LD8:$rd),
- (ins GPR8:$src, imm_ldi8:$bits),
- "lsrbn\t$rd, $bits",
- [(set i8:$rd, (AVRlsrbn i8:$src, imm:$bits)),
- (implicit SREG)]>;
-
- def ASRRd : FRd<0b1001,
- 0b0100101,
- (outs GPR8:$rd),
- (ins GPR8:$src),
- "asr\t$rd",
- [(set i8:$rd, (AVRasr i8:$src)), (implicit SREG)]>;
-
- def ASRWNRd : Pseudo<(outs DLDREGS:$rd),
- (ins DREGS:$src, imm16:$bits),
- "asrwn\t$rd, $bits",
- [(set i16:$rd, (AVRasrwn i16:$src, imm:$bits)),
- (implicit SREG)]>;
+ [(set i16
+ : $rd, (AVRlsr i16
+ : $src)),
+ (implicit SREG)]>;
- def ASRBNRd : Pseudo<(outs LD8:$rd),
- (ins GPR8:$src, imm_ldi8:$bits),
- "asrbn\t$rd, $bits",
- [(set i8:$rd, (AVRasrbn i8:$src, imm:$bits)),
- (implicit SREG)]>;
+ def LSRWNRd : Pseudo<(outs DLDREGS
+ : $rd),
+ (ins DREGS
+ : $src, imm16
+ : $bits),
+ "lsrwn\t$rd, $bits", [
+ (set i16
+ : $rd, (AVRlsrwn i16
+ : $src, imm
+ : $bits)),
+ (implicit SREG)
+ ]>;
+
+ def LSRBNRd : Pseudo<(outs LD8
+ : $rd),
+ (ins GPR8
+ : $src, imm_ldi8
+ : $bits),
+ "lsrbn\t$rd, $bits", [
+ (set i8
+ : $rd, (AVRlsrbn i8
+ : $src, imm
+ : $bits)),
+ (implicit SREG)
+ ]>;
+
+ def ASRRd
+ : FRd<0b1001, 0b0100101,
+ (outs GPR8
+ : $rd),
+ (ins GPR8
+ : $src),
+ "asr\t$rd", [(set i8
+ : $rd, (AVRasr i8
+ : $src)),
+ (implicit SREG)]>;
- def ASRWRd : Pseudo<(outs DREGS:$rd),
- (ins DREGS:$src),
+ def ASRWNRd : Pseudo<(outs DLDREGS
+ : $rd),
+ (ins DREGS
+ : $src, imm16
+ : $bits),
+ "asrwn\t$rd, $bits", [
+ (set i16
+ : $rd, (AVRasrwn i16
+ : $src, imm
+ : $bits)),
+ (implicit SREG)
+ ]>;
+
+ def ASRBNRd : Pseudo<(outs LD8
+ : $rd),
+ (ins GPR8
+ : $src, imm_ldi8
+ : $bits),
+ "asrbn\t$rd, $bits", [
+ (set i8
+ : $rd, (AVRasrbn i8
+ : $src, imm
+ : $bits)),
+ (implicit SREG)
+ ]>;
+
+ def ASRWRd : Pseudo<(outs DREGS
+ : $rd),
+ (ins DREGS
+ : $src),
"asrw\t$rd",
- [(set i16:$rd, (AVRasr i16:$src)), (implicit SREG)]>;
+ [(set i16
+ : $rd, (AVRasr i16
+ : $src)),
+ (implicit SREG)]>;
- def ROLBRd : Pseudo<(outs GPR8:$rd),
- (ins GPR8:$src),
+ def ROLBRd : Pseudo<(outs GPR8
+ : $rd),
+ (ins GPR8
+ : $src),
"rolb\t$rd",
- [(set i8:$rd, (AVRrol i8:$src)), (implicit SREG)]>;
+ [(set i8
+ : $rd, (AVRrol i8
+ : $src)),
+ (implicit SREG)]>;
- def RORBRd : Pseudo<(outs GPR8:$rd),
- (ins GPR8:$src),
+ def RORBRd : Pseudo<(outs GPR8
+ : $rd),
+ (ins GPR8
+ : $src),
"rorb\t$rd",
- [(set i8:$rd, (AVRror i8:$src)), (implicit SREG)]>;
+ [(set i8
+ : $rd, (AVRror i8
+ : $src)),
+ (implicit SREG)]>;
// Bit rotate operations.
- let Uses = [SREG] in
- {
-
- def ROLWRd : Pseudo<(outs DREGS:$rd),
- (ins DREGS:$src),
- "rolw\t$rd",
- [(set i16:$rd, (AVRrol i16:$src)), (implicit SREG)]>;
-
- def RORRd : FRd<0b1001,
- 0b0100111,
- (outs GPR8:$rd),
- (ins GPR8:$src),
- "ror\t$rd",
- []>;
-
- def RORWRd : Pseudo<(outs DREGS:$rd),
- (ins DREGS:$src),
- "rorw\t$rd",
- [(set i16:$rd, (AVRror i16:$src)), (implicit SREG)]>;
+ let Uses = [SREG] in {
+
+ def ROLWRd
+ : Pseudo<(outs DREGS
+ : $rd),
+ (ins DREGS
+ : $src),
+ "rolw\t$rd",
+ [(set i16
+ : $rd, (AVRrol i16
+ : $src)),
+ (implicit SREG)]>;
+
+ def RORRd : FRd<0b1001, 0b0100111,
+ (outs GPR8
+ : $rd),
+ (ins GPR8
+ : $src),
+ "ror\t$rd", []>;
+
+ def RORWRd
+ : Pseudo<(outs DREGS
+ : $rd),
+ (ins DREGS
+ : $src),
+ "rorw\t$rd",
+ [(set i16
+ : $rd, (AVRror i16
+ : $src)),
+ (implicit SREG)]>;
}
}
// SWAP Rd
// Swaps the high and low nibbles in a register.
-let Constraints = "$src = $rd" in
-def SWAPRd : FRd<0b1001,
- 0b0100010,
- (outs GPR8:$rd),
- (ins GPR8:$src),
- "swap\t$rd",
- [(set i8:$rd, (AVRSwap i8:$src))]>;
+let Constraints =
+ "$src = $rd" in def SWAPRd : FRd<0b1001, 0b0100010,
+ (outs GPR8
+ : $rd),
+ (ins GPR8
+ : $src),
+ "swap\t$rd", [(set i8
+ : $rd, (AVRSwap i8
+ : $src))]>;
// IO register bit set/clear operations.
-//:TODO: add patterns when popcount(imm)==2 to be expanded with 2 sbi/cbi
+//: TODO: add patterns when popcount(imm)==2 to be expanded with 2 sbi/cbi
// instead of in+ori+out which requires one more instr.
-def SBIAb : FIOBIT<0b10,
- (outs),
- (ins imm_port5:$addr, i8imm:$bit),
- "sbi\t$addr, $bit",
- [(store (or (i8 (load lowioaddr8:$addr)), iobitpos8:$bit),
- lowioaddr8:$addr)]>;
-
-def CBIAb : FIOBIT<0b00,
- (outs),
- (ins imm_port5:$addr, i8imm:$bit),
- "cbi\t$addr, $bit",
- [(store (and (i8 (load lowioaddr8:$addr)), iobitposn8:$bit),
- lowioaddr8:$addr)]>;
+def SBIAb : FIOBIT<0b10, (outs),
+ (ins imm_port5
+ : $addr, i8imm
+ : $bit),
+ "sbi\t$addr, $bit", [(store(or(i8(load lowioaddr8
+ : $addr)),
+ iobitpos8
+ : $bit),
+ lowioaddr8
+ : $addr)]>;
+
+def CBIAb : FIOBIT<0b00, (outs),
+ (ins imm_port5
+ : $addr, i8imm
+ : $bit),
+ "cbi\t$addr, $bit", [(store(and(i8(load lowioaddr8
+ : $addr)),
+ iobitposn8
+ : $bit),
+ lowioaddr8
+ : $addr)]>;
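
The TODO above is purely about instruction count: one sbi/cbi per bit beats the
generic three-instruction in/ori/out (or in/andi/out) sequence whenever at most two
bits of a low I/O register change. A minimal sketch of that selection criterion, with
a hypothetical helper name:

#include "llvm/Support/MathExtras.h"
#include <cstdint>

// True when OR-ing `Mask` into a low I/O register is cheaper done as one
// "sbi addr, bit" per set bit than as the three-instruction "in; ori; out".
static bool preferSbiSequence(uint8_t Mask) {
  return llvm::countPopulation(Mask) <= 2;
}

// e.g. Mask = 0b00000101: two sbi's (bits 0 and 2), one instruction fewer
// than in+ori+out.
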
// Status register bit load/store operations.
-let Defs = [SREG] in
-def BST : FRdB<0b01,
- (outs),
- (ins GPR8:$rd, i8imm:$b),
- "bst\t$rd, $b",
- []>;
+let Defs = [SREG] in def BST : FRdB<0b01, (outs),
+ (ins GPR8
+ : $rd, i8imm
+ : $b),
+ "bst\t$rd, $b", []>;
let Constraints = "$src = $rd",
-Uses = [SREG] in
-def BLD : FRdB<0b00,
- (outs GPR8:$rd),
- (ins GPR8:$src, i8imm:$b),
- "bld\t$rd, $b",
- []>;
+ Uses = [SREG] in def BLD : FRdB<0b00,
+ (outs GPR8
+ : $rd),
+ (ins GPR8
+ : $src, i8imm
+ : $b),
+ "bld\t$rd, $b", []>;
-def CBR : InstAlias<"cbr\t$rd, $k", (ANDIRdK LD8:$rd, imm_com8:$k), 0>;
+def CBR : InstAlias<"cbr\t$rd, $k", (ANDIRdK LD8 : $rd, imm_com8 : $k), 0>;
// CLR Rd
// Alias for EOR Rd, Rd
// -------------
// Clears all bits in a register.
-def CLR : InstAlias<"clr\t$rd", (EORRdRr GPR8:$rd, GPR8:$rd)>;
+def CLR : InstAlias<"clr\t$rd", (EORRdRr GPR8 : $rd, GPR8 : $rd)>;
// LSL Rd
// Alias for ADD Rd, Rd
// --------------
// Logical shift left one bit.
-def LSL : InstAlias<"lsl\t$rd", (ADDRdRr GPR8:$rd, GPR8:$rd)>;
+def LSL : InstAlias<"lsl\t$rd", (ADDRdRr GPR8 : $rd, GPR8 : $rd)>;
-def ROL : InstAlias<"rol\t$rd", (ADCRdRr GPR8:$rd, GPR8:$rd)>;
+def ROL : InstAlias<"rol\t$rd", (ADCRdRr GPR8 : $rd, GPR8 : $rd)>;
// SER Rd
// Alias for LDI Rd, 0xff
// ---------
// Sets all bits in a register.
-def : InstAlias<"ser\t$rd", (LDIRdK LD8:$rd, 0xff), 0>;
-
-let Defs = [SREG] in
-def BSETs : FS<0,
- (outs),
- (ins i8imm:$s),
- "bset\t$s",
- []>;
-
-let Defs = [SREG] in
-def BCLRs : FS<1,
- (outs),
- (ins i8imm:$s),
- "bclr\t$s",
- []>;
+def : InstAlias<"ser\t$rd", (LDIRdK LD8 : $rd, 0xff), 0>;
+
+let Defs = [SREG] in def BSETs : FS<0, (outs),
+ (ins i8imm
+ : $s),
+ "bset\t$s", []>;
+
+let Defs = [SREG] in def BCLRs : FS<1, (outs),
+ (ins i8imm
+ : $s),
+ "bclr\t$s", []>;
// Set/clear aliases for the carry (C) status flag (bit 0).
def : InstAlias<"sec", (BSETs 0)>;
@@ -1887,284 +2147,353 @@ def : InstAlias<"cli", (BCLRs 7)>;
// Breakpoint instruction
// ---------
// <|1001|0101|1001|1000>
-def BREAK : F16<0b1001010110011000,
- (outs),
- (ins),
- "break",
- []>,
+def BREAK : F16<0b1001010110011000, (outs), (ins), "break", []>,
Requires<[HasBREAK]>;
// NOP
// No-operation instruction
// ---------
// <|0000|0000|0000|0000>
-def NOP : F16<0b0000000000000000,
- (outs),
- (ins),
- "nop",
- []>;
+def NOP : F16<0b0000000000000000, (outs), (ins), "nop", []>;
// SLEEP
// Sleep instruction
// ---------
// <|1001|0101|1000|1000>
-def SLEEP : F16<0b1001010110001000,
- (outs),
- (ins),
- "sleep",
- []>;
+def SLEEP : F16<0b1001010110001000, (outs), (ins), "sleep", []>;
// WDR
// Watchdog reset
// ---------
// <|1001|0101|1010|1000>
-def WDR : F16<0b1001010110101000,
- (outs),
- (ins),
- "wdr",
- []>;
+def WDR : F16<0b1001010110101000, (outs), (ins), "wdr", []>;
//===----------------------------------------------------------------------===//
// Pseudo instructions for later expansion
//===----------------------------------------------------------------------===//
-//:TODO: Optimize this for wider types AND optimize the following code
+//: TODO: Optimize this for wider types AND optimize the following code
// compile int foo(char a, char b, char c, char d) {return d+b;}
// looks like a missed sext_inreg opportunity.
-def SEXT : ExtensionPseudo<
- (outs DREGS:$dst),
- (ins GPR8:$src),
- "sext\t$dst, $src",
- [(set i16:$dst, (sext i8:$src)), (implicit SREG)]
->;
-
-def ZEXT : ExtensionPseudo<
- (outs DREGS:$dst),
- (ins GPR8:$src),
- "zext\t$dst, $src",
- [(set i16:$dst, (zext i8:$src)), (implicit SREG)]
->;
+def SEXT
+ : ExtensionPseudo<(outs DREGS
+ : $dst),
+ (ins GPR8
+ : $src),
+ "sext\t$dst, $src",
+ [(set i16
+ : $dst, (sext i8
+ : $src)),
+ (implicit SREG)]>;
+
+def ZEXT
+ : ExtensionPseudo<(outs DREGS
+ : $dst),
+ (ins GPR8
+ : $src),
+ "zext\t$dst, $src",
+ [(set i16
+ : $dst, (zext i8
+ : $src)),
+ (implicit SREG)]>;
// This pseudo gets expanded into a movw+adiw thus it clobbers SREG.
let Defs = [SREG],
- hasSideEffects = 0 in
-def FRMIDX : Pseudo<(outs DLDREGS:$dst),
- (ins DLDREGS:$src, i16imm:$src2),
- "frmidx\t$dst, $src, $src2",
- []>;
+ hasSideEffects = 0 in def FRMIDX : Pseudo<(outs DLDREGS
+ : $dst),
+ (ins DLDREGS
+ : $src, i16imm
+ : $src2),
+ "frmidx\t$dst, $src, $src2", []>;
// This pseudo is either converted to a regular store or a push which clobbers
// SP.
-def STDSPQRr : StorePseudo<
- (outs),
- (ins memspi:$dst, GPR8:$src),
- "stdstk\t$dst, $src",
- [(store i8:$src, addr:$dst)]
->;
+def STDSPQRr : StorePseudo<(outs),
+ (ins memspi
+ : $dst, GPR8
+ : $src),
+ "stdstk\t$dst, $src", [(store i8
+ : $src, addr
+ : $dst)]>;
// This pseudo is either converted to a regular store or a push which clobbers
// SP.
-def STDWSPQRr : StorePseudo<
- (outs),
- (ins memspi:$dst, DREGS:$src),
- "stdwstk\t$dst, $src",
- [(store i16:$src, addr:$dst)]
->;
+def STDWSPQRr : StorePseudo<(outs),
+ (ins memspi
+ : $dst, DREGS
+ : $src),
+ "stdwstk\t$dst, $src", [(store i16
+ : $src, addr
+ : $dst)]>;
// SP read/write pseudos.
-let hasSideEffects = 0 in
-{
- let Uses = [SP] in
- def SPREAD : Pseudo<
- (outs DREGS:$dst),
- (ins GPRSP:$src),
- "spread\t$dst, $src",
- []
- >;
-
- let Defs = [SP] in
- def SPWRITE : Pseudo<
- (outs GPRSP:$dst),
- (ins DREGS:$src),
- "spwrite\t$dst, $src",
- []>;
+let hasSideEffects = 0 in {
+ let Uses = [SP] in def SPREAD : Pseudo<(outs DREGS
+ : $dst),
+ (ins GPRSP
+ : $src),
+ "spread\t$dst, $src", []>;
+
+ let Defs = [SP] in def SPWRITE : Pseudo<(outs GPRSP
+ : $dst),
+ (ins DREGS
+ : $src),
+ "spwrite\t$dst, $src", []>;
}
-def Select8 : SelectPseudo<
- (outs GPR8:$dst),
- (ins GPR8:$src, GPR8:$src2, i8imm:$cc),
- "# Select8 PSEUDO",
- [(set i8:$dst, (AVRselectcc i8:$src, i8:$src2, imm:$cc))]
->;
-
-def Select16 : SelectPseudo<
- (outs DREGS:$dst),
- (ins DREGS:$src, DREGS:$src2, i8imm:$cc),
- "# Select16 PSEUDO",
- [(set i16:$dst, (AVRselectcc i16:$src, i16:$src2, imm:$cc))]
->;
-
-def Lsl8 : ShiftPseudo<
- (outs GPR8:$dst),
- (ins GPR8:$src, GPR8:$cnt),
- "# Lsl8 PSEUDO",
- [(set i8:$dst, (AVRlslLoop i8:$src, i8:$cnt))]
->;
-
-def Lsl16 : ShiftPseudo<
- (outs DREGS:$dst),
- (ins DREGS:$src, GPR8:$cnt),
- "# Lsl16 PSEUDO",
- [(set i16:$dst, (AVRlslLoop i16:$src, i8:$cnt))]
->;
-
-def Lsr8 : ShiftPseudo<
- (outs GPR8:$dst),
- (ins GPR8:$src, GPR8:$cnt),
- "# Lsr8 PSEUDO",
- [(set i8:$dst, (AVRlsrLoop i8:$src, i8:$cnt))]
->;
-
-def Lsr16 : ShiftPseudo<
- (outs DREGS:$dst),
- (ins DREGS:$src, GPR8:$cnt),
- "# Lsr16 PSEUDO",
- [(set i16:$dst, (AVRlsrLoop i16:$src, i8:$cnt))]
->;
-
-def Rol8 : ShiftPseudo<
- (outs GPR8:$dst),
- (ins GPR8:$src, GPR8:$cnt),
- "# Rol8 PSEUDO",
- [(set i8:$dst, (AVRrolLoop i8:$src, i8:$cnt))]
->;
-
-def Rol16 : ShiftPseudo<
- (outs DREGS:$dst),
- (ins DREGS:$src, GPR8:$cnt),
- "# Rol16 PSEUDO",
- [(set i16:$dst, (AVRrolLoop i16:$src, i8:$cnt))]
->;
-
-def Ror8 : ShiftPseudo<
- (outs GPR8:$dst),
- (ins GPR8:$src, GPR8:$cnt),
- "# Ror8 PSEUDO",
- [(set i8:$dst, (AVRrorLoop i8:$src, i8:$cnt))]
->;
-
-def Ror16 : ShiftPseudo<
- (outs DREGS:$dst),
- (ins DREGS:$src, GPR8:$cnt),
- "# Ror16 PSEUDO",
- [(set i16:$dst, (AVRrorLoop i16:$src, i8:$cnt))]
->;
-
-def Asr8 : ShiftPseudo<
- (outs GPR8:$dst),
- (ins GPR8:$src, GPR8:$cnt),
- "# Asr8 PSEUDO",
- [(set i8:$dst, (AVRasrLoop i8:$src, i8:$cnt))]
->;
-
-def Asr16 : ShiftPseudo<
- (outs DREGS:$dst),
- (ins DREGS:$src, GPR8:$cnt),
- "# Asr16 PSEUDO",
- [(set i16:$dst, (AVRasrLoop i16:$src, i8:$cnt))]
->;
-
+def Select8 : SelectPseudo<(outs GPR8
+ : $dst),
+ (ins GPR8
+ : $src, GPR8
+ : $src2, i8imm
+ : $cc),
+ "# Select8 PSEUDO", [(set i8
+ : $dst, (AVRselectcc i8
+ : $src, i8
+ : $src2, imm
+ : $cc))]>;
+
+def Select16 : SelectPseudo<(outs DREGS
+ : $dst),
+ (ins DREGS
+ : $src, DREGS
+ : $src2, i8imm
+ : $cc),
+ "# Select16 PSEUDO", [(set i16
+ : $dst, (AVRselectcc i16
+ : $src, i16
+ : $src2, imm
+ : $cc))]>;
+
+def Lsl8 : ShiftPseudo<(outs GPR8
+ : $dst),
+ (ins GPR8
+ : $src, GPR8
+ : $cnt),
+ "# Lsl8 PSEUDO", [(set i8
+ : $dst, (AVRlslLoop i8
+ : $src, i8
+ : $cnt))]>;
+
+def Lsl16 : ShiftPseudo<(outs DREGS
+ : $dst),
+ (ins DREGS
+ : $src, GPR8
+ : $cnt),
+ "# Lsl16 PSEUDO", [(set i16
+ : $dst, (AVRlslLoop i16
+ : $src, i8
+ : $cnt))]>;
+
+def Lsr8 : ShiftPseudo<(outs GPR8
+ : $dst),
+ (ins GPR8
+ : $src, GPR8
+ : $cnt),
+ "# Lsr8 PSEUDO", [(set i8
+ : $dst, (AVRlsrLoop i8
+ : $src, i8
+ : $cnt))]>;
+
+def Lsr16 : ShiftPseudo<(outs DREGS
+ : $dst),
+ (ins DREGS
+ : $src, GPR8
+ : $cnt),
+ "# Lsr16 PSEUDO", [(set i16
+ : $dst, (AVRlsrLoop i16
+ : $src, i8
+ : $cnt))]>;
+
+def Rol8 : ShiftPseudo<(outs GPR8
+ : $dst),
+ (ins GPR8
+ : $src, GPR8
+ : $cnt),
+ "# Rol8 PSEUDO", [(set i8
+ : $dst, (AVRrolLoop i8
+ : $src, i8
+ : $cnt))]>;
+
+def Rol16 : ShiftPseudo<(outs DREGS
+ : $dst),
+ (ins DREGS
+ : $src, GPR8
+ : $cnt),
+ "# Rol16 PSEUDO", [(set i16
+ : $dst, (AVRrolLoop i16
+ : $src, i8
+ : $cnt))]>;
+
+def Ror8 : ShiftPseudo<(outs GPR8
+ : $dst),
+ (ins GPR8
+ : $src, GPR8
+ : $cnt),
+ "# Ror8 PSEUDO", [(set i8
+ : $dst, (AVRrorLoop i8
+ : $src, i8
+ : $cnt))]>;
+
+def Ror16 : ShiftPseudo<(outs DREGS
+ : $dst),
+ (ins DREGS
+ : $src, GPR8
+ : $cnt),
+ "# Ror16 PSEUDO", [(set i16
+ : $dst, (AVRrorLoop i16
+ : $src, i8
+ : $cnt))]>;
+
+def Asr8 : ShiftPseudo<(outs GPR8
+ : $dst),
+ (ins GPR8
+ : $src, GPR8
+ : $cnt),
+ "# Asr8 PSEUDO", [(set i8
+ : $dst, (AVRasrLoop i8
+ : $src, i8
+ : $cnt))]>;
+
+def Asr16 : ShiftPseudo<(outs DREGS
+ : $dst),
+ (ins DREGS
+ : $src, GPR8
+ : $cnt),
+ "# Asr16 PSEUDO", [(set i16
+ : $dst, (AVRasrLoop i16
+ : $src, i8
+ : $cnt))]>;
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//===----------------------------------------------------------------------===//
-//:TODO: look in x86InstrCompiler.td for odd encoding trick related to
+//: TODO: look in x86InstrCompiler.td for odd encoding trick related to
// add x, 128 -> sub x, -128. Clang is emitting an eor for this (ldi+eor)
// the add instruction always writes the carry flag
-def : Pat<(addc i8:$src, i8:$src2),
- (ADDRdRr i8:$src, i8:$src2)>;
-def : Pat<(addc DREGS:$src, DREGS:$src2),
- (ADDWRdRr DREGS:$src, DREGS:$src2)>;
+def : Pat<(addc i8 : $src, i8 : $src2), (ADDRdRr i8 : $src, i8 : $src2)>;
+def : Pat<(addc DREGS
+ : $src, DREGS
+ : $src2),
+ (ADDWRdRr DREGS
+ : $src, DREGS
+ : $src2)>;
// all sub instruction variants always write the carry flag
-def : Pat<(subc i8:$src, i8:$src2),
- (SUBRdRr i8:$src, i8:$src2)>;
-def : Pat<(subc i16:$src, i16:$src2),
- (SUBWRdRr i16:$src, i16:$src2)>;
-def : Pat<(subc i8:$src, imm:$src2),
- (SUBIRdK i8:$src, imm:$src2)>;
-def : Pat<(subc i16:$src, imm:$src2),
- (SUBIWRdK i16:$src, imm:$src2)>;
+def : Pat<(subc i8 : $src, i8 : $src2), (SUBRdRr i8 : $src, i8 : $src2)>;
+def : Pat<(subc i16 : $src, i16 : $src2), (SUBWRdRr i16 : $src, i16 : $src2)>;
+def : Pat<(subc i8 : $src, imm : $src2), (SUBIRdK i8 : $src, imm : $src2)>;
+def : Pat<(subc i16 : $src, imm : $src2), (SUBIWRdK i16 : $src, imm : $src2)>;
// These patterns convert add (x, -imm) to sub (x, imm) since we don't have
// any add with imm instructions. Also take care of the adiw/sbiw instructions.
-def : Pat<(add i16:$src1, imm0_63_neg:$src2),
- (SBIWRdK i16:$src1, (imm0_63_neg:$src2))>;
-def : Pat<(add i16:$src1, imm:$src2),
- (SUBIWRdK i16:$src1, (imm16_neg_XFORM imm:$src2))>;
-def : Pat<(addc i16:$src1, imm:$src2),
- (SUBIWRdK i16:$src1, (imm16_neg_XFORM imm:$src2))>;
-
-def : Pat<(add i8:$src1, imm:$src2),
- (SUBIRdK i8:$src1, (imm8_neg_XFORM imm:$src2))>;
-def : Pat<(addc i8:$src1, imm:$src2),
- (SUBIRdK i8:$src1, (imm8_neg_XFORM imm:$src2))>;
-def : Pat<(adde i8:$src1, imm:$src2),
- (SBCIRdK i8:$src1, (imm8_neg_XFORM imm:$src2))>;
+def : Pat<(add i16
+ : $src1, imm0_63_neg
+ : $src2),
+ (SBIWRdK i16
+ : $src1, (imm0_63_neg
+ : $src2))>;
+def : Pat<(add i16
+ : $src1, imm
+ : $src2),
+ (SUBIWRdK i16
+ : $src1, (imm16_neg_XFORM imm
+ : $src2))>;
+def : Pat<(addc i16
+ : $src1, imm
+ : $src2),
+ (SUBIWRdK i16
+ : $src1, (imm16_neg_XFORM imm
+ : $src2))>;
+
+def : Pat<(add i8
+ : $src1, imm
+ : $src2),
+ (SUBIRdK i8
+ : $src1, (imm8_neg_XFORM imm
+ : $src2))>;
+def : Pat<(addc i8
+ : $src1, imm
+ : $src2),
+ (SUBIRdK i8
+ : $src1, (imm8_neg_XFORM imm
+ : $src2))>;
+def : Pat<(adde i8
+ : $src1, imm
+ : $src2),
+ (SBCIRdK i8
+ : $src1, (imm8_neg_XFORM imm
+ : $src2))>;
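
Since plain add-with-immediate does not exist on AVR (adiw only covers a small
range), the patterns above negate the immediate and emit SUBIRdK / SBCIRdK /
SUBIWRdK instead. A small worked example of the modular negation that the
imm8_neg_XFORM / imm16_neg_XFORM transforms perform (their definitions live
elsewhere in the backend and are not shown in this hunk):

#include <cstdint>
#include <cstdio>

// Negate an immediate modulo 2^8 / 2^16, as the XFORMs referenced above do.
static uint8_t negImm8(uint8_t K) { return static_cast<uint8_t>(0u - K); }
static uint16_t negImm16(uint16_t K) { return static_cast<uint16_t>(0u - K); }

int main() {
  // add r, 3  ==> SUBIRdK r, 0xfd
  std::printf("%#x\n", static_cast<unsigned>(negImm8(3)));      // 0xfd
  // add rp, 0x120  ==> SUBIWRdK rp, 0xfee0
  std::printf("%#x\n", static_cast<unsigned>(negImm16(0x120))); // 0xfee0
  return 0;
}
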
// Calls.
-def : Pat<(AVRcall (i16 tglobaladdr:$dst)),
- (CALLk tglobaladdr:$dst)>;
-def : Pat<(AVRcall (i16 texternalsym:$dst)),
- (CALLk texternalsym:$dst)>;
+def : Pat<(AVRcall(i16 tglobaladdr : $dst)), (CALLk tglobaladdr : $dst)>;
+def : Pat<(AVRcall(i16 texternalsym : $dst)), (CALLk texternalsym : $dst)>;
// `anyext`
-def : Pat<(i16 (anyext i8:$src)),
- (INSERT_SUBREG (i16 (IMPLICIT_DEF)), i8:$src, sub_lo)>;
+def : Pat<(i16(anyext i8
+ : $src)),
+ (INSERT_SUBREG(i16(IMPLICIT_DEF)), i8
+ : $src, sub_lo)>;
// `trunc`
-def : Pat<(i8 (trunc i16:$src)),
- (EXTRACT_SUBREG i16:$src, sub_lo)>;
+def : Pat<(i8(trunc i16 : $src)), (EXTRACT_SUBREG i16 : $src, sub_lo)>;
// sext_inreg
-def : Pat<(sext_inreg i16:$src, i8),
- (SEXT (i8 (EXTRACT_SUBREG i16:$src, sub_lo)))>;
+def : Pat<(sext_inreg i16
+ : $src, i8),
+ (SEXT(i8(EXTRACT_SUBREG i16
+ : $src, sub_lo)))>;
// GlobalAddress
-def : Pat<(i16 (AVRWrapper tglobaladdr:$dst)),
- (LDIWRdK tglobaladdr:$dst)>;
-def : Pat<(add i16:$src, (AVRWrapper tglobaladdr:$src2)),
- (SUBIWRdK i16:$src, tglobaladdr:$src2)>;
-def : Pat<(i8 (load (AVRWrapper tglobaladdr:$dst))),
- (LDSRdK tglobaladdr:$dst)>;
-def : Pat<(i16 (load (AVRWrapper tglobaladdr:$dst))),
- (LDSWRdK tglobaladdr:$dst)>;
-def : Pat<(store i8:$src, (i16 (AVRWrapper tglobaladdr:$dst))),
- (STSKRr tglobaladdr:$dst, i8:$src)>;
-def : Pat<(store i16:$src, (i16 (AVRWrapper tglobaladdr:$dst))),
- (STSWKRr tglobaladdr:$dst, i16:$src)>;
+def : Pat<(i16(AVRWrapper tglobaladdr : $dst)), (LDIWRdK tglobaladdr : $dst)>;
+def : Pat<(add i16
+ : $src, (AVRWrapper tglobaladdr
+ : $src2)),
+ (SUBIWRdK i16
+ : $src, tglobaladdr
+ : $src2)>;
+def : Pat<(i8(load(AVRWrapper tglobaladdr
+ : $dst))),
+ (LDSRdK tglobaladdr
+ : $dst)>;
+def : Pat<(i16(load(AVRWrapper tglobaladdr
+ : $dst))),
+ (LDSWRdK tglobaladdr
+ : $dst)>;
+def : Pat<(store i8
+ : $src, (i16(AVRWrapper tglobaladdr
+ : $dst))),
+ (STSKRr tglobaladdr
+ : $dst, i8
+ : $src)>;
+def : Pat<(store i16
+ : $src, (i16(AVRWrapper tglobaladdr
+ : $dst))),
+ (STSWKRr tglobaladdr
+ : $dst, i16
+ : $src)>;
// BlockAddress
-def : Pat<(i16 (AVRWrapper tblockaddress:$dst)),
- (LDIWRdK tblockaddress:$dst)>;
+def : Pat<(i16(AVRWrapper tblockaddress
+ : $dst)),
+ (LDIWRdK tblockaddress
+ : $dst)>;
-def : Pat<(i8 (trunc (AVRlsrwn DLDREGS:$src, (i16 8)))),
- (EXTRACT_SUBREG DREGS:$src, sub_hi)>;
+def : Pat<(i8(trunc(AVRlsrwn DLDREGS
+ : $src, (i16 8)))),
+ (EXTRACT_SUBREG DREGS
+ : $src, sub_hi)>;
// :FIXME: DAGCombiner produces an shl node after legalization from this sequence:
// BR_JT -> (mul x, 2) -> (shl x, 1)
-def : Pat<(shl i16:$src1, (i8 1)),
- (LSLWRd i16:$src1)>;
+def : Pat<(shl i16 : $src1, (i8 1)), (LSLWRd i16 : $src1)>;
// Lowering of 'tst' node to 'TST' instruction.
// TST is an alias of AND Rd, Rd.
-def : Pat<(AVRtst i8:$rd),
- (ANDRdRr GPR8:$rd, GPR8:$rd)>;
+def : Pat<(AVRtst i8 : $rd), (ANDRdRr GPR8 : $rd, GPR8 : $rd)>;
// Lowering of 'lsl' node to 'LSL' instruction.
// LSL is an alias of 'ADD Rd, Rd'
-def : Pat<(AVRlsl i8:$rd),
- (ADDRdRr GPR8:$rd, GPR8:$rd)>;
-
+def : Pat<(AVRlsl i8 : $rd), (ADDRdRr GPR8 : $rd, GPR8 : $rd)>;
diff --git a/llvm/lib/Target/AVR/AVRMCInstLower.cpp b/llvm/lib/Target/AVR/AVRMCInstLower.cpp
index 49a318762b63..2b8711656139 100644
--- a/llvm/lib/Target/AVR/AVRMCInstLower.cpp
+++ b/llvm/lib/Target/AVR/AVRMCInstLower.cpp
@@ -29,7 +29,9 @@ MCOperand AVRMCInstLower::lowerSymbolOperand(const MachineOperand &MO,
const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
bool IsNegated = false;
- if (TF & AVRII::MO_NEG) { IsNegated = true; }
+ if (TF & AVRII::MO_NEG) {
+ IsNegated = true;
+ }
if (!MO.isJTI() && MO.getOffset()) {
Expr = MCBinaryExpr::createAdd(
@@ -59,7 +61,8 @@ MCOperand AVRMCInstLower::lowerSymbolOperand(const MachineOperand &MO,
return MCOperand::createExpr(Expr);
}
-void AVRMCInstLower::lowerInstruction(const MachineInstr &MI, MCInst &OutMI) const {
+void AVRMCInstLower::lowerInstruction(const MachineInstr &MI,
+ MCInst &OutMI) const {
OutMI.setOpcode(MI.getOpcode());
for (MachineOperand const &MO : MI.operands()) {
@@ -108,4 +111,3 @@ void AVRMCInstLower::lowerInstruction(const MachineInstr &MI, MCInst &OutMI) con
}
} // end of namespace llvm
-
diff --git a/llvm/lib/Target/AVR/AVRMCInstLower.h b/llvm/lib/Target/AVR/AVRMCInstLower.h
index 5e0f42ac16a7..7ad6d472ad87 100644
--- a/llvm/lib/Target/AVR/AVRMCInstLower.h
+++ b/llvm/lib/Target/AVR/AVRMCInstLower.h
@@ -39,4 +39,3 @@ private:
} // end namespace llvm
#endif // LLVM_AVR_MCINST_LOWER_H
-
diff --git a/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h b/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h
index 5432fac122ef..8b1c247eb6a7 100644
--- a/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h
+++ b/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h
@@ -55,8 +55,10 @@ public:
CalleeSavedFrameSize(0), VarArgsFrameIndex(0) {
unsigned CallConv = MF.getFunction().getCallingConv();
- this->IsInterruptHandler = CallConv == CallingConv::AVR_INTR || MF.getFunction().hasFnAttribute("interrupt");
- this->IsSignalHandler = CallConv == CallingConv::AVR_SIGNAL || MF.getFunction().hasFnAttribute("signal");
+ this->IsInterruptHandler = CallConv == CallingConv::AVR_INTR ||
+ MF.getFunction().hasFnAttribute("interrupt");
+ this->IsSignalHandler = CallConv == CallingConv::AVR_SIGNAL ||
+ MF.getFunction().hasFnAttribute("signal");
}
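
The two flags are derived either from the AVR-specific calling conventions or from the
"interrupt"/"signal" function attributes. A minimal source-level sketch of what
typically produces those attributes, assuming the usual avr-gcc-style attributes that
clang lowers to the IR attributes checked here (the handler names are made up):

// Interrupt service routines as they would appear in user code.
__attribute__((interrupt)) void timer0_ovf_isr(void) {
  // "interrupt": the prologue re-enables interrupts (sei) on entry.
}

__attribute__((signal)) void adc_ready_isr(void) {
  // "signal": interrupts stay disabled while the handler runs.
}
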
bool getHasSpills() const { return HasSpills; }
@@ -69,7 +71,9 @@ public:
void setHasStackArgs(bool B) { HasStackArgs = B; }
/// Checks if the function is some form of interrupt service routine.
- bool isInterruptOrSignalHandler() const { return isInterruptHandler() || isSignalHandler(); }
+ bool isInterruptOrSignalHandler() const {
+ return isInterruptHandler() || isSignalHandler();
+ }
bool isInterruptHandler() const { return IsInterruptHandler; }
bool isSignalHandler() const { return IsSignalHandler; }
@@ -81,6 +85,6 @@ public:
void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; }
};
-} // end llvm namespace
+} // namespace llvm
#endif // LLVM_AVR_MACHINE_FUNCTION_INFO_H
diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.cpp b/llvm/lib/Target/AVR/AVRRegisterInfo.cpp
index 2a4905ce2461..1886debaf492 100644
--- a/llvm/lib/Target/AVR/AVRRegisterInfo.cpp
+++ b/llvm/lib/Target/AVR/AVRRegisterInfo.cpp
@@ -17,8 +17,8 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Function.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/IR/Function.h"
#include "AVR.h"
#include "AVRInstrInfo.h"
@@ -37,9 +37,8 @@ const uint16_t *
AVRRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const AVRMachineFunctionInfo *AFI = MF->getInfo<AVRMachineFunctionInfo>();
- return AFI->isInterruptOrSignalHandler()
- ? CSR_Interrupts_SaveList
- : CSR_Normal_SaveList;
+ return AFI->isInterruptOrSignalHandler() ? CSR_Interrupts_SaveList
+ : CSR_Normal_SaveList;
}
const uint32_t *
@@ -47,9 +46,8 @@ AVRRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
- return AFI->isInterruptOrSignalHandler()
- ? CSR_Interrupts_RegMask
- : CSR_Normal_RegMask;
+ return AFI->isInterruptOrSignalHandler() ? CSR_Interrupts_RegMask
+ : CSR_Normal_RegMask;
}
BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
@@ -207,7 +205,8 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// If the offset is too big we have to adjust and restore the frame pointer
// to materialize a valid load/store with displacement.
- //:TODO: consider using only one adiw/sbiw chain for more than one frame index
+ //: TODO: consider using only one adiw/sbiw chain for more than one frame
+ //: index
if (Offset > 62) {
unsigned AddOpc = AVR::ADIWRdK, SubOpc = AVR::SBIWRdK;
int AddOffset = Offset - 63 + 1;
@@ -276,18 +275,16 @@ void AVRRegisterInfo::splitReg(Register Reg, Register &LoReg,
HiReg = getSubReg(Reg, AVR::sub_hi);
}
-bool AVRRegisterInfo::shouldCoalesce(MachineInstr *MI,
- const TargetRegisterClass *SrcRC,
- unsigned SubReg,
- const TargetRegisterClass *DstRC,
- unsigned DstSubReg,
- const TargetRegisterClass *NewRC,
- LiveIntervals &LIS) const {
- if(this->getRegClass(AVR::PTRDISPREGSRegClassID)->hasSubClassEq(NewRC)) {
+bool AVRRegisterInfo::shouldCoalesce(
+ MachineInstr *MI, const TargetRegisterClass *SrcRC, unsigned SubReg,
+ const TargetRegisterClass *DstRC, unsigned DstSubReg,
+ const TargetRegisterClass *NewRC, LiveIntervals &LIS) const {
+ if (this->getRegClass(AVR::PTRDISPREGSRegClassID)->hasSubClassEq(NewRC)) {
return false;
}
- return TargetRegisterInfo::shouldCoalesce(MI, SrcRC, SubReg, DstRC, DstSubReg, NewRC, LIS);
+ return TargetRegisterInfo::shouldCoalesce(MI, SrcRC, SubReg, DstRC, DstSubReg,
+ NewRC, LIS);
}
} // end of namespace llvm
diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.h b/llvm/lib/Target/AVR/AVRRegisterInfo.h
index 23439f2fe195..fa27d9283209 100644
--- a/llvm/lib/Target/AVR/AVRRegisterInfo.h
+++ b/llvm/lib/Target/AVR/AVRRegisterInfo.h
@@ -51,12 +51,9 @@ public:
/// \param Reg A 16-bit register to split.
void splitReg(Register Reg, Register &LoReg, Register &HiReg) const;
- bool shouldCoalesce(MachineInstr *MI,
- const TargetRegisterClass *SrcRC,
- unsigned SubReg,
- const TargetRegisterClass *DstRC,
- unsigned DstSubReg,
- const TargetRegisterClass *NewRC,
+ bool shouldCoalesce(MachineInstr *MI, const TargetRegisterClass *SrcRC,
+ unsigned SubReg, const TargetRegisterClass *DstRC,
+ unsigned DstSubReg, const TargetRegisterClass *NewRC,
LiveIntervals &LIS) const override;
};
diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.td b/llvm/lib/Target/AVR/AVRRegisterInfo.td
index 1948fcbaf75a..bb4e86ca0536 100644
--- a/llvm/lib/Target/AVR/AVRRegisterInfo.td
+++ b/llvm/lib/Target/AVR/AVRRegisterInfo.td
@@ -11,12 +11,8 @@
//===----------------------------------------------------------------------===//
// 8-bit General purpose register definition.
-class AVRReg<bits<16> num,
- string name,
- list<Register> subregs = [],
- list<string> altNames = []>
- : RegisterWithSubRegs<name, subregs>
-{
+class AVRReg<bits<16> num, string name, list<Register> subregs = [],
+ list<string> altNames = []> : RegisterWithSubRegs<name, subregs> {
field bits<16> Num = num;
let HWEncoding = num;
@@ -26,31 +22,27 @@ class AVRReg<bits<16> num,
}
// Subregister indices.
-let Namespace = "AVR" in
-{
+let Namespace = "AVR" in {
def sub_lo : SubRegIndex<8>;
def sub_hi : SubRegIndex<8, 8>;
}
-let Namespace = "AVR" in {
- def ptr : RegAltNameIndex;
-}
-
+let Namespace = "AVR" in { def ptr : RegAltNameIndex; }
//===----------------------------------------------------------------------===//
// 8-bit general purpose registers
//===----------------------------------------------------------------------===//
-def R0 : AVRReg<0, "r0">, DwarfRegNum<[0]>;
-def R1 : AVRReg<1, "r1">, DwarfRegNum<[1]>;
-def R2 : AVRReg<2, "r2">, DwarfRegNum<[2]>;
-def R3 : AVRReg<3, "r3">, DwarfRegNum<[3]>;
-def R4 : AVRReg<4, "r4">, DwarfRegNum<[4]>;
-def R5 : AVRReg<5, "r5">, DwarfRegNum<[5]>;
-def R6 : AVRReg<6, "r6">, DwarfRegNum<[6]>;
-def R7 : AVRReg<7, "r7">, DwarfRegNum<[7]>;
-def R8 : AVRReg<8, "r8">, DwarfRegNum<[8]>;
-def R9 : AVRReg<9, "r9">, DwarfRegNum<[9]>;
+def R0 : AVRReg<0, "r0">, DwarfRegNum<[0]>;
+def R1 : AVRReg<1, "r1">, DwarfRegNum<[1]>;
+def R2 : AVRReg<2, "r2">, DwarfRegNum<[2]>;
+def R3 : AVRReg<3, "r3">, DwarfRegNum<[3]>;
+def R4 : AVRReg<4, "r4">, DwarfRegNum<[4]>;
+def R5 : AVRReg<5, "r5">, DwarfRegNum<[5]>;
+def R6 : AVRReg<6, "r6">, DwarfRegNum<[6]>;
+def R7 : AVRReg<7, "r7">, DwarfRegNum<[7]>;
+def R8 : AVRReg<8, "r8">, DwarfRegNum<[8]>;
+def R9 : AVRReg<9, "r9">, DwarfRegNum<[9]>;
def R10 : AVRReg<10, "r10">, DwarfRegNum<[10]>;
def R11 : AVRReg<11, "r11">, DwarfRegNum<[11]>;
def R12 : AVRReg<12, "r12">, DwarfRegNum<[12]>;
@@ -76,19 +68,17 @@ def R31 : AVRReg<31, "r31", [], ["zh"]>, DwarfRegNum<[31]>;
def SPL : AVRReg<32, "SPL">, DwarfRegNum<[32]>;
def SPH : AVRReg<33, "SPH">, DwarfRegNum<[33]>;
-let SubRegIndices = [sub_lo, sub_hi],
-CoveredBySubRegs = 1 in
-{
+let SubRegIndices = [sub_lo, sub_hi], CoveredBySubRegs = 1 in {
// 16 bit GPR pairs.
- def SP : AVRReg<32, "SP", [SPL, SPH]>, DwarfRegNum<[32]>;
+ def SP : AVRReg<32, "SP", [SPL, SPH]>, DwarfRegNum<[32]>;
// The pointer registers (X,Y,Z) are a special case because they
// are printed as a `high:low` pair when a DREG is expected,
// but printed using `X`, `Y`, `Z` when a pointer register is expected.
let RegAltNameIndices = [ptr] in {
- def R31R30 : AVRReg<30, "r31:r30", [R30, R31], ["Z"]>, DwarfRegNum<[30]>;
- def R29R28 : AVRReg<28, "r29:r28", [R28, R29], ["Y"]>, DwarfRegNum<[28]>;
- def R27R26 : AVRReg<26, "r27:r26", [R26, R27], ["X"]>, DwarfRegNum<[26]>;
+ def R31R30 : AVRReg<30, "r31:r30", [R30, R31], ["Z"]>, DwarfRegNum<[30]>;
+ def R29R28 : AVRReg<28, "r29:r28", [R28, R29], ["Y"]>, DwarfRegNum<[28]>;
+ def R27R26 : AVRReg<26, "r27:r26", [R26, R27], ["X"]>, DwarfRegNum<[26]>;
}
def R25R24 : AVRReg<24, "r25:r24", [R24, R25]>, DwarfRegNum<[24]>;
def R23R22 : AVRReg<22, "r23:r22", [R22, R23]>, DwarfRegNum<[22]>;
@@ -98,11 +88,11 @@ CoveredBySubRegs = 1 in
def R15R14 : AVRReg<14, "r15:r14", [R14, R15]>, DwarfRegNum<[14]>;
def R13R12 : AVRReg<12, "r13:r12", [R12, R13]>, DwarfRegNum<[12]>;
def R11R10 : AVRReg<10, "r11:r10", [R10, R11]>, DwarfRegNum<[10]>;
- def R9R8 : AVRReg<8, "r9:r8", [R8, R9]>, DwarfRegNum<[8]>;
- def R7R6 : AVRReg<6, "r7:r6", [R6, R7]>, DwarfRegNum<[6]>;
- def R5R4 : AVRReg<4, "r5:r4", [R4, R5]>, DwarfRegNum<[4]>;
- def R3R2 : AVRReg<2, "r3:r2", [R2, R3]>, DwarfRegNum<[2]>;
- def R1R0 : AVRReg<0, "r1:r0", [R0, R1]>, DwarfRegNum<[0]>;
+ def R9R8 : AVRReg<8, "r9:r8", [R8, R9]>, DwarfRegNum<[8]>;
+ def R7R6 : AVRReg<6, "r7:r6", [R6, R7]>, DwarfRegNum<[6]>;
+ def R5R4 : AVRReg<4, "r5:r4", [R4, R5]>, DwarfRegNum<[4]>;
+ def R3R2 : AVRReg<2, "r3:r2", [R2, R3]>, DwarfRegNum<[2]>;
+ def R1R0 : AVRReg<0, "r1:r0", [R0, R1]>, DwarfRegNum<[0]>;
// Pseudo registers for unaligned i16
def R26R25 : AVRReg<25, "r26:r25", [R25, R26]>, DwarfRegNum<[25]>;
@@ -113,7 +103,7 @@ CoveredBySubRegs = 1 in
def R16R15 : AVRReg<15, "r16:r15", [R15, R16]>, DwarfRegNum<[15]>;
def R14R13 : AVRReg<13, "r14:r13", [R13, R14]>, DwarfRegNum<[13]>;
def R12R11 : AVRReg<11, "r12:r11", [R11, R12]>, DwarfRegNum<[11]>;
- def R10R9 : AVRReg<9, "r10:r9", [R9, R10]>, DwarfRegNum<[9]>;
+ def R10R9 : AVRReg<9, "r10:r9", [R9, R10]>, DwarfRegNum<[9]>;
}
//===----------------------------------------------------------------------===//
@@ -122,81 +112,71 @@ CoveredBySubRegs = 1 in
// Main 8-bit register class.
def GPR8 : RegisterClass<"AVR", [i8], 8,
- (
- // Return value and argument registers.
- add R24, R25, R18, R19, R20, R21, R22, R23,
- // Scratch registers.
- R30, R31, R26, R27,
- // Callee saved registers.
- R28, R29, R17, R16, R15, R14, R13, R12, R11, R10,
- R9, R8, R7, R6, R5, R4, R3, R2, R0, R1
- )>;
+ (
+ // Return value and argument registers.
+ add R24, R25, R18, R19, R20, R21, R22, R23,
+ // Scratch registers.
+ R30, R31, R26, R27,
+ // Callee saved registers.
+ R28, R29, R17, R16, R15, R14, R13, R12, R11, R10,
+ R9, R8, R7, R6, R5, R4, R3, R2, R0, R1)>;
// Simple lower registers r0..r15
def GPR8lo : RegisterClass<"AVR", [i8], 8,
- (
- add R15, R14, R13, R12, R11, R10, R9, R8, R7, R6, R5, R4, R3, R2, R0, R1
- )>;
+ (add R15, R14, R13, R12, R11, R10, R9, R8, R7, R6,
+ R5, R4, R3, R2, R0, R1)>;
// 8-bit register class for instructions which take immediates.
def LD8 : RegisterClass<"AVR", [i8], 8,
- (
- // Return value and arguments.
- add R24, R25, R18, R19, R20, R21, R22, R23,
- // Scratch registers.
- R30, R31, R26, R27,
- // Callee saved registers.
- R28, R29, R17, R16
- )>;
+ (
+ // Return value and arguments.
+ add R24, R25, R18, R19, R20, R21, R22, R23,
+ // Scratch registers.
+ R30, R31, R26, R27,
+ // Callee saved registers.
+ R28, R29, R17, R16)>;
// Simple lower registers r16..r23
def LD8lo : RegisterClass<"AVR", [i8], 8,
- (
- add R23, R22, R21, R20, R19, R18, R17, R16
- )>;
+ (add R23, R22, R21, R20, R19, R18, R17, R16)>;
// Main 16-bit pair register class.
def DREGS : RegisterClass<"AVR", [i16], 8,
- (
- // Return value and arguments.
- add R25R24, R19R18, R21R20, R23R22,
- // Scratch registers.
- R31R30, R27R26,
- // Callee saved registers.
- R29R28, R17R16, R15R14, R13R12, R11R10,
- R9R8, R7R6, R5R4, R3R2, R1R0,
- // Pseudo regs for unaligned 16-bits
- R26R25, R24R23, R22R21,
- R20R19, R18R17, R16R15,
- R14R13, R12R11, R10R9
- )>;
+ (
+ // Return value and arguments.
+ add R25R24, R19R18, R21R20, R23R22,
+ // Scratch registers.
+ R31R30, R27R26,
+ // Callee saved registers.
+ R29R28, R17R16, R15R14, R13R12, R11R10, R9R8,
+ R7R6, R5R4, R3R2, R1R0,
+ // Pseudo regs for unaligned 16-bits
+ R26R25, R24R23, R22R21, R20R19, R18R17, R16R15,
+ R14R13, R12R11, R10R9)>;
// Lower 16-bit pair registers in R0..R15, only used in inline assembly.
-def DREGSlo : RegisterClass<"AVR", [i16], 8,
- (
- add R15R14, R13R12, R11R10, R9R8, R7R6, R5R4, R3R2, R1R0
- )>;
+def DREGSlo
+ : RegisterClass<"AVR", [i16], 8,
+ (add R15R14, R13R12, R11R10, R9R8, R7R6, R5R4, R3R2, R1R0)>;
// Lower 16-bit pair registers in r16..r23, only used in inline assembly.
def DREGSLD8lo : RegisterClass<"AVR", [i16], 8,
- (
- // Return value and arguments.
- add R19R18, R21R20, R23R22,
- // Callee saved registers.
- R17R16
- )>;
+ (
+ // Return value and arguments.
+ add R19R18, R21R20, R23R22,
+ // Callee saved registers.
+ R17R16)>;
// 16-bit pair register class for movw
def DREGSMOVW : RegisterClass<"AVR", [i16], 8,
- (
- // Return value and arguments.
- add R25R24, R19R18, R21R20, R23R22,
- // Scratch registers.
- R31R30, R27R26,
- // Callee saved registers.
- R29R28, R17R16, R15R14, R13R12, R11R10,
- R9R8, R7R6, R5R4, R3R2, R1R0
- )>;
+ (
+ // Return value and arguments.
+ add R25R24, R19R18, R21R20, R23R22,
+ // Scratch registers.
+ R31R30, R27R26,
+ // Callee saved registers.
+ R29R28, R17R16, R15R14, R13R12, R11R10, R9R8,
+ R7R6, R5R4, R3R2, R1R0)>;
// The 16-bit DREGS register class, excluding the Z pointer register.
//
@@ -207,66 +187,59 @@ def DREGSMOVW : RegisterClass<"AVR", [i16], 8,
// cannot use Z; it's simply a workaround for a regalloc bug.
//
// More information can be found in PR39553.
-def DREGS_WITHOUT_YZ_WORKAROUND : RegisterClass<"AVR", [i16], 8,
- (
- // Return value and arguments.
- add R25R24, R19R18, R21R20, R23R22,
- // Scratch registers.
- R27R26,
- // Callee saved registers.
- R17R16, R15R14, R13R12, R11R10,
- R9R8, R7R6, R5R4, R3R2, R1R0
- )>;
+def DREGS_WITHOUT_YZ_WORKAROUND
+ : RegisterClass<"AVR", [i16], 8,
+ (
+ // Return value and arguments.
+ add R25R24, R19R18, R21R20, R23R22,
+ // Scratch registers.
+ R27R26,
+ // Callee saved registers.
+ R17R16, R15R14, R13R12, R11R10, R9R8, R7R6, R5R4, R3R2,
+ R1R0)>;
// 16-bit register class for immediate instructions.
def DLDREGS : RegisterClass<"AVR", [i16], 8,
- (
- // Return value and arguments.
- add R25R24, R19R18, R21R20, R23R22,
- // Scratch registers.
- R31R30, R27R26,
- // Callee saved registers.
- R29R28, R17R16
- )>;
+ (
+ // Return value and arguments.
+ add R25R24, R19R18, R21R20, R23R22,
+ // Scratch registers.
+ R31R30, R27R26,
+ // Callee saved registers.
+ R29R28, R17R16)>;
// 16-bit register class for the adiw/sbiw instructions.
def IWREGS : RegisterClass<"AVR", [i16], 8,
- (
- // Return value and arguments.
- add R25R24,
- // Scratch registers.
- R31R30, R27R26,
- // Callee saved registers.
- R29R28
- )>;
+ (
+ // Return value and arguments.
+ add R25R24,
+ // Scratch registers.
+ R31R30, R27R26,
+ // Callee saved registers.
+ R29R28)>;
// 16-bit register class for the ld and st instructions.
// AKA X, Y, and Z
def PTRREGS : RegisterClass<"AVR", [i16], 8,
- (
- add R27R26, // X
- R29R28, // Y
- R31R30 // Z
- ), ptr>;
+ (add R27R26, // X
+ R29R28, // Y
+ R31R30 // Z
+ ),
+ ptr>;
// 16-bit register class for the ldd and std instructions.
// AKA Y and Z.
-def PTRDISPREGS : RegisterClass<"AVR", [i16], 8,
- (
- add R31R30, R29R28
- ), ptr>;
+def PTRDISPREGS : RegisterClass<"AVR", [i16], 8, (add R31R30, R29R28), ptr>;
// We have a bunch of instructions with an explicit Z register argument. We
// model this using a register class containing only the Z register.
def ZREG : RegisterClass<"AVR", [i16], 8, (add R31R30)>;
// Register class used for the stack read pseudo instruction.
-def GPRSP: RegisterClass<"AVR", [i16], 8, (add SP)>;
+def GPRSP : RegisterClass<"AVR", [i16], 8, (add SP)>;
// Status register.
def SREG : AVRReg<14, "FLAGS">, DwarfRegNum<[88]>;
-def CCR : RegisterClass<"AVR", [i8], 8, (add SREG)>
-{
- let CopyCost = -1; // Don't allow copying of status registers
+def CCR : RegisterClass<"AVR", [i8], 8, (add SREG)> {
+ let CopyCost = -1; // Don't allow copying of status registers
}
-
diff --git a/llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp b/llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp
index 7d2d19de7578..76f29eb9f369 100644
--- a/llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp
+++ b/llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp
@@ -84,8 +84,7 @@ bool AVRRelaxMem::runOnBasicBlock(Block &MBB) {
return Modified;
}
-template <>
-bool AVRRelaxMem::relax<AVR::STDWPtrQRr>(Block &MBB, BlockIt MBBI) {
+template <> bool AVRRelaxMem::relax<AVR::STDWPtrQRr>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
MachineOperand &Ptr = MI.getOperand(0);
@@ -96,24 +95,23 @@ bool AVRRelaxMem::relax<AVR::STDWPtrQRr>(Block &MBB, BlockIt MBBI) {
if (Imm > 63) {
// Push the previous state of the pointer register.
// This instruction must preserve the value.
- buildMI(MBB, MBBI, AVR::PUSHWRr)
- .addReg(Ptr.getReg());
+ buildMI(MBB, MBBI, AVR::PUSHWRr).addReg(Ptr.getReg());
// Add the immediate to the pointer register.
buildMI(MBB, MBBI, AVR::SBCIWRdK)
- .addReg(Ptr.getReg(), RegState::Define)
- .addReg(Ptr.getReg())
- .addImm(-Imm);
+ .addReg(Ptr.getReg(), RegState::Define)
+ .addReg(Ptr.getReg())
+ .addImm(-Imm);
// Store the value in the source register to the address
// pointed to by the pointer register.
buildMI(MBB, MBBI, AVR::STWPtrRr)
- .addReg(Ptr.getReg())
- .addReg(Src.getReg(), getKillRegState(Src.isKill()));
+ .addReg(Ptr.getReg())
+ .addReg(Src.getReg(), getKillRegState(Src.isKill()));
// Pop the original state of the pointer register.
buildMI(MBB, MBBI, AVR::POPWRd)
- .addDef(Ptr.getReg(), getKillRegState(Ptr.isKill()));
+ .addDef(Ptr.getReg(), getKillRegState(Ptr.isKill()));
MI.removeFromParent();
}
@@ -125,21 +123,19 @@ bool AVRRelaxMem::runOnInstruction(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
int Opcode = MBBI->getOpcode();
-#define RELAX(Op) \
- case Op: \
+#define RELAX(Op) \
+ case Op: \
return relax<Op>(MBB, MI)
- switch (Opcode) {
- RELAX(AVR::STDWPtrQRr);
- }
+ switch (Opcode) { RELAX(AVR::STDWPtrQRr); }
#undef RELAX
return false;
}
} // end of anonymous namespace
-INITIALIZE_PASS(AVRRelaxMem, "avr-relax-mem",
- AVR_RELAX_MEM_OPS_NAME, false, false)
+INITIALIZE_PASS(AVRRelaxMem, "avr-relax-mem", AVR_RELAX_MEM_OPS_NAME, false,
+ false)
namespace llvm {
diff --git a/llvm/lib/Target/AVR/AVRSubtarget.cpp b/llvm/lib/Target/AVR/AVRSubtarget.cpp
index 601865120491..990e1c57e63f 100644
--- a/llvm/lib/Target/AVR/AVRSubtarget.cpp
+++ b/llvm/lib/Target/AVR/AVRSubtarget.cpp
@@ -13,7 +13,7 @@
#include "AVRSubtarget.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
#include "AVR.h"
#include "AVRTargetMachine.h"
diff --git a/llvm/lib/Target/AVR/AVRSubtarget.h b/llvm/lib/Target/AVR/AVRSubtarget.h
index 7d49e43a83f5..90b9cd4da7c1 100644
--- a/llvm/lib/Target/AVR/AVRSubtarget.h
+++ b/llvm/lib/Target/AVR/AVRSubtarget.h
@@ -39,10 +39,18 @@ public:
const AVRTargetMachine &TM);
const AVRInstrInfo *getInstrInfo() const override { return &InstrInfo; }
- const TargetFrameLowering *getFrameLowering() const override { return &FrameLowering; }
- const AVRTargetLowering *getTargetLowering() const override { return &TLInfo; }
- const AVRSelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; }
- const AVRRegisterInfo *getRegisterInfo() const override { return &InstrInfo.getRegisterInfo(); }
+ const TargetFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+ const AVRTargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+ const AVRSelectionDAGInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+ const AVRRegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
/// Parses a subtarget feature string, setting appropriate options.
/// \note Definition of function is auto generated by `tblgen`.
@@ -84,7 +92,6 @@ public:
}
private:
-
/// The ELF e_flags architecture.
unsigned ELFArch;
diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/llvm/lib/Target/AVR/AVRTargetMachine.cpp
index 5be4260ce035..65740f7c2306 100644
--- a/llvm/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/llvm/lib/Target/AVR/AVRTargetMachine.cpp
@@ -16,7 +16,7 @@
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
#include "AVR.h"
#include "AVRTargetObjectFile.h"
@@ -25,7 +25,8 @@
namespace llvm {
-static const char *AVRDataLayout = "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8";
+static const char *AVRDataLayout =
+ "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8";
/// Processes a CPU name.
static StringRef getCPU(StringRef CPU) {
diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.h b/llvm/lib/Target/AVR/AVRTargetMachine.h
index f9015c8741ea..54669eda060c 100644
--- a/llvm/lib/Target/AVR/AVRTargetMachine.h
+++ b/llvm/lib/Target/AVR/AVRTargetMachine.h
@@ -29,8 +29,7 @@ class AVRTargetMachine : public LLVMTargetMachine {
public:
AVRTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Optional<Reloc::Model> RM,
- Optional<CodeModel::Model> CM,
+ Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
CodeGenOpt::Level OL, bool JIT);
const AVRSubtarget *getSubtargetImpl() const;
@@ -42,10 +41,6 @@ public:
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- bool isMachineVerifierClean() const override {
- return false;
- }
-
private:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
AVRSubtarget SubTarget;
diff --git a/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp b/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp
index 14206cdb8276..c7715ca1f51b 100644
--- a/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp
+++ b/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp
@@ -24,10 +24,8 @@ void AVRTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) {
Ctx.getELFSection(".progmem.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
}
-MCSection *
-AVRTargetObjectFile::SelectSectionForGlobal(const GlobalObject *GO,
- SectionKind Kind,
- const TargetMachine &TM) const {
+MCSection *AVRTargetObjectFile::SelectSectionForGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
// Global values in flash memory are placed in the progmem.data section
// unless they already have a user assigned section.
if (AVR::isProgramMemoryAddress(GO) && !GO->hasSection() && Kind.isReadOnly())
@@ -37,4 +35,3 @@ AVRTargetObjectFile::SelectSectionForGlobal(const GlobalObject *GO,
return Base::SelectSectionForGlobal(GO, Kind, TM);
}
} // end of namespace llvm
-
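A hypothetical illustration (not part of the patch) of the kind of global the SelectSectionForGlobal hook above routes into .progmem.data. It assumes Clang's generic address_space attribute, with AVR program memory mapped to address space 1; the names are illustrative only.

    // Hypothetical example: a read-only lookup table living in AVR program
    // memory (address space 1). Because it is read-only, sits in program
    // memory, and has no user-assigned section, the hook above places it
    // in .progmem.data.
    __attribute__((address_space(1))) const int lut[4] = {1, 2, 4, 8};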
diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
index 19f769270569..95ecd28200ba 100644
--- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
+++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -25,9 +25,9 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetRegistry.h"
#include <sstream>
@@ -170,9 +170,11 @@ public:
}
bool isImmCom8() const {
- if (!isImm()) return false;
+ if (!isImm())
+ return false;
const auto *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
+ if (!CE)
+ return false;
int64_t Value = CE->getValue();
return isUInt<8>(Value);
}
@@ -322,11 +324,16 @@ bool AVRAsmParser::MatchAndEmitInstruction(SMLoc Loc, unsigned &Opcode,
MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
switch (MatchResult) {
- case Match_Success: return emit(Inst, Loc, Out);
- case Match_MissingFeature: return missingFeature(Loc, ErrorInfo);
- case Match_InvalidOperand: return invalidOperand(Loc, Operands, ErrorInfo);
- case Match_MnemonicFail: return Error(Loc, "invalid instruction");
- default: return true;
+ case Match_Success:
+ return emit(Inst, Loc, Out);
+ case Match_MissingFeature:
+ return missingFeature(Loc, ErrorInfo);
+ case Match_InvalidOperand:
+ return invalidOperand(Loc, Operands, ErrorInfo);
+ case Match_MnemonicFail:
+ return Error(Loc, "invalid instruction");
+ default:
+ return true;
}
}
@@ -440,8 +447,7 @@ bool AVRAsmParser::tryParseRelocExpression(OperandVector &Operands) {
tokens[1].getKind() == AsmToken::Minus)) {
AsmToken::TokenKind CurTok = Parser.getLexer().getKind();
- if (CurTok == AsmToken::Minus ||
- tokens[1].getKind() == AsmToken::Minus) {
+ if (CurTok == AsmToken::Minus || tokens[1].getKind() == AsmToken::Minus) {
isNegated = true;
} else {
assert(CurTok == AsmToken::Plus);
@@ -461,7 +467,7 @@ bool AVRAsmParser::tryParseRelocExpression(OperandVector &Operands) {
return true;
}
StringRef ModifierName = Parser.getTok().getString();
- ModifierKind = AVRMCExpr::getKindByName(ModifierName.str().c_str());
+ ModifierKind = AVRMCExpr::getKindByName(ModifierName);
if (ModifierKind != AVRMCExpr::VK_AVR_None) {
Parser.Lex();
@@ -469,7 +475,7 @@ bool AVRAsmParser::tryParseRelocExpression(OperandVector &Operands) {
if (Parser.getTok().getString() == GENERATE_STUBS &&
Parser.getTok().getKind() == AsmToken::Identifier) {
std::string GSModName = ModifierName.str() + "_" + GENERATE_STUBS;
- ModifierKind = AVRMCExpr::getKindByName(GSModName.c_str());
+ ModifierKind = AVRMCExpr::getKindByName(GSModName);
if (ModifierKind != AVRMCExpr::VK_AVR_None)
Parser.Lex(); // Eat gs modifier name
}
@@ -498,8 +504,8 @@ bool AVRAsmParser::tryParseRelocExpression(OperandVector &Operands) {
assert(Parser.getTok().getKind() == AsmToken::RParen);
Parser.Lex(); // Eat closing parenthesis
- MCExpr const *Expression = AVRMCExpr::create(ModifierKind, InnerExpression,
- isNegated, getContext());
+ MCExpr const *Expression =
+ AVRMCExpr::create(ModifierKind, InnerExpression, isNegated, getContext());
SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
Operands.push_back(AVROperand::CreateImm(Expression, S, E));
@@ -552,8 +558,7 @@ bool AVRAsmParser::parseOperand(OperandVector &Operands) {
return true;
}
-OperandMatchResultTy
-AVRAsmParser::parseMemriOperand(OperandVector &Operands) {
+OperandMatchResultTy AVRAsmParser::parseMemriOperand(OperandVector &Operands) {
LLVM_DEBUG(dbgs() << "parseMemriOperand()\n");
SMLoc E, S;
@@ -620,7 +625,8 @@ bool AVRAsmParser::ParseInstruction(ParseInstructionInfo &Info,
bool first = true;
while (getLexer().isNot(AsmToken::EndOfStatement)) {
- if (!first) eatComma();
+ if (!first)
+ eatComma();
first = false;
@@ -670,7 +676,7 @@ bool AVRAsmParser::parseLiteralValues(unsigned SizeInBytes, SMLoc L) {
Tokens[1].getKind() == AsmToken::Identifier) {
MCSymbol *Symbol = getContext().getOrCreateSymbol(".text");
AVRStreamer.emitValueForModiferKind(Symbol, SizeInBytes, L,
- AVRMCExpr::VK_AVR_None);
+ AVRMCExpr::VK_AVR_None);
return false;
}
@@ -678,7 +684,7 @@ bool AVRAsmParser::parseLiteralValues(unsigned SizeInBytes, SMLoc L) {
Parser.getLexer().peekTok().getKind() == AsmToken::LParen) {
StringRef ModifierName = Parser.getTok().getString();
AVRMCExpr::VariantKind ModifierKind =
- AVRMCExpr::getKindByName(ModifierName.str().c_str());
+ AVRMCExpr::getKindByName(ModifierName);
if (ModifierKind != AVRMCExpr::VK_AVR_None) {
Parser.Lex();
Parser.Lex(); // Eat the modifier and parenthesis
@@ -722,7 +728,7 @@ unsigned AVRAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
int64_t RegNum = Const->getValue();
std::ostringstream RegName;
RegName << "r" << RegNum;
- RegNum = MatchRegisterName(RegName.str().c_str());
+ RegNum = MatchRegisterName(RegName.str());
if (RegNum != AVR::NoRegister) {
Op.makeReg(RegNum);
if (validateOperandClass(Op, Expected) == Match_Success) {
diff --git a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
index 8e7251a74dfd..9dcd370b9f1e 100644
--- a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
+++ b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
@@ -21,7 +21,7 @@
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
@@ -42,7 +42,7 @@ public:
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &CStream) const override;
};
-}
+} // namespace
static MCDisassembler *createAVRDisassembler(const Target &T,
const MCSubtargetInfo &STI,
@@ -50,7 +50,6 @@ static MCDisassembler *createAVRDisassembler(const Target &T,
return new AVRDisassembler(STI, Ctx);
}
-
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRDisassembler() {
// Register the disassembler.
TargetRegistry::RegisterMCDisassembler(getTheAVRTarget(),
@@ -58,18 +57,16 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRDisassembler() {
}
static const uint16_t GPRDecoderTable[] = {
- AVR::R0, AVR::R1, AVR::R2, AVR::R3,
- AVR::R4, AVR::R5, AVR::R6, AVR::R7,
- AVR::R8, AVR::R9, AVR::R10, AVR::R11,
- AVR::R12, AVR::R13, AVR::R14, AVR::R15,
- AVR::R16, AVR::R17, AVR::R18, AVR::R19,
- AVR::R20, AVR::R21, AVR::R22, AVR::R23,
- AVR::R24, AVR::R25, AVR::R26, AVR::R27,
- AVR::R28, AVR::R29, AVR::R30, AVR::R31,
+ AVR::R0, AVR::R1, AVR::R2, AVR::R3, AVR::R4, AVR::R5, AVR::R6,
+ AVR::R7, AVR::R8, AVR::R9, AVR::R10, AVR::R11, AVR::R12, AVR::R13,
+ AVR::R14, AVR::R15, AVR::R16, AVR::R17, AVR::R18, AVR::R19, AVR::R20,
+ AVR::R21, AVR::R22, AVR::R23, AVR::R24, AVR::R25, AVR::R26, AVR::R27,
+ AVR::R28, AVR::R29, AVR::R30, AVR::R31,
};
static DecodeStatus DecodeGPR8RegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder) {
+ uint64_t Address,
+ const void *Decoder) {
if (RegNo > 31)
return MCDisassembler::Fail;
@@ -79,39 +76,41 @@ static DecodeStatus DecodeGPR8RegisterClass(MCInst &Inst, unsigned RegNo,
}
static DecodeStatus DecodeLD8RegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder) {
+ uint64_t Address,
+ const void *Decoder) {
if (RegNo > 15)
return MCDisassembler::Fail;
- unsigned Register = GPRDecoderTable[RegNo+16];
+ unsigned Register = GPRDecoderTable[RegNo + 16];
Inst.addOperand(MCOperand::createReg(Register));
return MCDisassembler::Success;
}
static DecodeStatus DecodePTRREGSRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder) {
+ uint64_t Address,
+ const void *Decoder) {
// Note: this function must be defined but does not seem to be called.
assert(false && "unimplemented: PTRREGS register class");
return MCDisassembler::Success;
}
-static DecodeStatus decodeFIOARr(MCInst &Inst, unsigned Insn,
- uint64_t Address, const void *Decoder);
+static DecodeStatus decodeFIOARr(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder);
-static DecodeStatus decodeFIORdA(MCInst &Inst, unsigned Insn,
- uint64_t Address, const void *Decoder);
+static DecodeStatus decodeFIORdA(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder);
-static DecodeStatus decodeFIOBIT(MCInst &Inst, unsigned Insn,
- uint64_t Address, const void *Decoder);
+static DecodeStatus decodeFIOBIT(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder);
static DecodeStatus decodeCallTarget(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
-static DecodeStatus decodeFRd(MCInst &Inst, unsigned Insn,
- uint64_t Address, const void *Decoder);
+static DecodeStatus decodeFRd(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder);
-static DecodeStatus decodeFLPMX(MCInst &Inst, unsigned Insn,
- uint64_t Address, const void *Decoder);
+static DecodeStatus decodeFLPMX(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder);
static DecodeStatus decodeFFMULRdRr(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
@@ -119,40 +118,42 @@ static DecodeStatus decodeFFMULRdRr(MCInst &Inst, unsigned Insn,
static DecodeStatus decodeFMOVWRdRr(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
-static DecodeStatus decodeFWRdK(MCInst &Inst, unsigned Insn,
- uint64_t Address, const void *Decoder);
+static DecodeStatus decodeFWRdK(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder);
static DecodeStatus decodeFMUL2RdRr(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
#include "AVRGenDisassemblerTables.inc"
-static DecodeStatus decodeFIOARr(MCInst &Inst, unsigned Insn,
- uint64_t Address, const void *Decoder) {
+static DecodeStatus decodeFIOARr(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder) {
unsigned addr = 0;
addr |= fieldFromInstruction(Insn, 0, 4);
addr |= fieldFromInstruction(Insn, 9, 2) << 4;
unsigned reg = fieldFromInstruction(Insn, 4, 5);
Inst.addOperand(MCOperand::createImm(addr));
- if (DecodeGPR8RegisterClass(Inst, reg, Address, Decoder) == MCDisassembler::Fail)
+ if (DecodeGPR8RegisterClass(Inst, reg, Address, Decoder) ==
+ MCDisassembler::Fail)
return MCDisassembler::Fail;
return MCDisassembler::Success;
}
-static DecodeStatus decodeFIORdA(MCInst &Inst, unsigned Insn,
- uint64_t Address, const void *Decoder) {
+static DecodeStatus decodeFIORdA(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder) {
unsigned addr = 0;
addr |= fieldFromInstruction(Insn, 0, 4);
addr |= fieldFromInstruction(Insn, 9, 2) << 4;
unsigned reg = fieldFromInstruction(Insn, 4, 5);
- if (DecodeGPR8RegisterClass(Inst, reg, Address, Decoder) == MCDisassembler::Fail)
+ if (DecodeGPR8RegisterClass(Inst, reg, Address, Decoder) ==
+ MCDisassembler::Fail)
return MCDisassembler::Fail;
Inst.addOperand(MCOperand::createImm(addr));
return MCDisassembler::Success;
}
-static DecodeStatus decodeFIOBIT(MCInst &Inst, unsigned Insn,
- uint64_t Address, const void *Decoder) {
+static DecodeStatus decodeFIOBIT(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder) {
unsigned addr = fieldFromInstruction(Insn, 3, 5);
unsigned b = fieldFromInstruction(Insn, 0, 3);
Inst.addOperand(MCOperand::createImm(addr));
@@ -168,16 +169,17 @@ static DecodeStatus decodeCallTarget(MCInst &Inst, unsigned Field,
return MCDisassembler::Success;
}
-static DecodeStatus decodeFRd(MCInst &Inst, unsigned Insn,
- uint64_t Address, const void *Decoder) {
+static DecodeStatus decodeFRd(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder) {
unsigned d = fieldFromInstruction(Insn, 4, 5);
- if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) == MCDisassembler::Fail)
+ if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) ==
+ MCDisassembler::Fail)
return MCDisassembler::Fail;
return MCDisassembler::Success;
}
-static DecodeStatus decodeFLPMX(MCInst &Inst, unsigned Insn,
- uint64_t Address, const void *Decoder) {
+static DecodeStatus decodeFLPMX(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder) {
if (decodeFRd(Inst, Insn, Address, Decoder) == MCDisassembler::Fail)
return MCDisassembler::Fail;
Inst.addOperand(MCOperand::createReg(AVR::R31R30));
@@ -188,9 +190,11 @@ static DecodeStatus decodeFFMULRdRr(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
unsigned d = fieldFromInstruction(Insn, 4, 3) + 16;
unsigned r = fieldFromInstruction(Insn, 0, 3) + 16;
- if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) == MCDisassembler::Fail)
+ if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) ==
+ MCDisassembler::Fail)
return MCDisassembler::Fail;
- if (DecodeGPR8RegisterClass(Inst, r, Address, Decoder) == MCDisassembler::Fail)
+ if (DecodeGPR8RegisterClass(Inst, r, Address, Decoder) ==
+ MCDisassembler::Fail)
return MCDisassembler::Fail;
return MCDisassembler::Success;
}
@@ -199,22 +203,26 @@ static DecodeStatus decodeFMOVWRdRr(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
unsigned r = fieldFromInstruction(Insn, 4, 4) * 2;
unsigned d = fieldFromInstruction(Insn, 0, 4) * 2;
- if (DecodeGPR8RegisterClass(Inst, r, Address, Decoder) == MCDisassembler::Fail)
+ if (DecodeGPR8RegisterClass(Inst, r, Address, Decoder) ==
+ MCDisassembler::Fail)
return MCDisassembler::Fail;
- if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) == MCDisassembler::Fail)
+ if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) ==
+ MCDisassembler::Fail)
return MCDisassembler::Fail;
return MCDisassembler::Success;
}
-static DecodeStatus decodeFWRdK(MCInst &Inst, unsigned Insn,
- uint64_t Address, const void *Decoder) {
+static DecodeStatus decodeFWRdK(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder) {
unsigned d = fieldFromInstruction(Insn, 4, 2) * 2 + 24; // starts at r24:r25
unsigned k = 0;
k |= fieldFromInstruction(Insn, 0, 4);
k |= fieldFromInstruction(Insn, 6, 2) << 4;
- if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) == MCDisassembler::Fail)
+ if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) ==
+ MCDisassembler::Fail)
return MCDisassembler::Fail;
- if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) == MCDisassembler::Fail)
+ if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) ==
+ MCDisassembler::Fail)
return MCDisassembler::Fail;
Inst.addOperand(MCOperand::createImm(k));
return MCDisassembler::Success;
@@ -224,9 +232,11 @@ static DecodeStatus decodeFMUL2RdRr(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
unsigned rd = fieldFromInstruction(Insn, 4, 4) + 16;
unsigned rr = fieldFromInstruction(Insn, 0, 4) + 16;
- if (DecodeGPR8RegisterClass(Inst, rd, Address, Decoder) == MCDisassembler::Fail)
+ if (DecodeGPR8RegisterClass(Inst, rd, Address, Decoder) ==
+ MCDisassembler::Fail)
return MCDisassembler::Fail;
- if (DecodeGPR8RegisterClass(Inst, rr, Address, Decoder) == MCDisassembler::Fail)
+ if (DecodeGPR8RegisterClass(Inst, rr, Address, Decoder) ==
+ MCDisassembler::Fail)
return MCDisassembler::Fail;
return MCDisassembler::Success;
}
@@ -253,7 +263,8 @@ static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
}
Size = 4;
- Insn = (Bytes[0] << 16) | (Bytes[1] << 24) | (Bytes[2] << 0) | (Bytes[3] << 8);
+ Insn =
+ (Bytes[0] << 16) | (Bytes[1] << 24) | (Bytes[2] << 0) | (Bytes[3] << 8);
return MCDisassembler::Success;
}
@@ -261,9 +272,12 @@ static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
static const uint8_t *getDecoderTable(uint64_t Size) {
switch (Size) {
- case 2: return DecoderTable16;
- case 4: return DecoderTable32;
- default: llvm_unreachable("instructions must be 16 or 32-bits");
+ case 2:
+ return DecoderTable16;
+ case 4:
+ return DecoderTable32;
+ default:
+ llvm_unreachable("instructions must be 16 or 32-bits");
}
}
@@ -279,11 +293,12 @@ DecodeStatus AVRDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
{
Result = readInstruction16(Bytes, Address, Size, Insn);
- if (Result == MCDisassembler::Fail) return MCDisassembler::Fail;
+ if (Result == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
// Try to auto-decode a 16-bit instruction.
- Result = decodeInstruction(getDecoderTable(Size), Instr,
- Insn, Address, this, STI);
+ Result = decodeInstruction(getDecoderTable(Size), Instr, Insn, Address,
+ this, STI);
if (Result != MCDisassembler::Fail)
return Result;
@@ -293,10 +308,11 @@ DecodeStatus AVRDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
{
Result = readInstruction32(Bytes, Address, Size, Insn);
- if (Result == MCDisassembler::Fail) return MCDisassembler::Fail;
+ if (Result == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
- Result = decodeInstruction(getDecoderTable(Size), Instr, Insn,
- Address, this, STI);
+ Result = decodeInstruction(getDecoderTable(Size), Instr, Insn, Address,
+ this, STI);
if (Result != MCDisassembler::Fail) {
return Result;
@@ -308,4 +324,3 @@ DecodeStatus AVRDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
typedef DecodeStatus (*DecodeFunc)(MCInst &MI, unsigned insn, uint64_t Address,
const void *Decoder);
-
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index 49840672bf9a..a3a4d63932c0 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -44,7 +44,7 @@ static void signed_width(unsigned Width, uint64_t Value,
int64_t Max = maxIntN(Width);
Diagnostic += " (expected an integer in the range " + std::to_string(Min) +
- " to " + std::to_string(Max) + ")";
+ " to " + std::to_string(Max) + ")";
if (Ctx) {
Ctx->reportFatalError(Fixup.getLoc(), Diagnostic);
@@ -62,8 +62,8 @@ static void unsigned_width(unsigned Width, uint64_t Value,
int64_t Max = maxUIntN(Width);
- Diagnostic += " (expected an integer in the range 0 to " +
- std::to_string(Max) + ")";
+ Diagnostic +=
+ " (expected an integer in the range 0 to " + std::to_string(Max) + ")";
if (Ctx) {
Ctx->reportFatalError(Fixup.getLoc(), Diagnostic);
@@ -233,15 +233,14 @@ static void ms8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
ldi::fixup(Size, Fixup, Value, Ctx);
}
-} // end of ldi namespace
-} // end of adjust namespace
+} // namespace ldi
+} // namespace adjust
namespace llvm {
// Prepare value for the target space for it
void AVRAsmBackend::adjustFixupValue(const MCFixup &Fixup,
- const MCValue &Target,
- uint64_t &Value,
+ const MCValue &Target, uint64_t &Value,
MCContext *Ctx) const {
// The size of the fixup in bits.
uint64_t Size = AVRAsmBackend::getFixupKindInfo(Fixup.getKind()).TargetSize;
@@ -280,7 +279,8 @@ void AVRAsmBackend::adjustFixupValue(const MCFixup &Fixup,
break;
case AVR::fixup_hh8_ldi:
case AVR::fixup_hh8_ldi_pm:
- if (Kind == AVR::fixup_hh8_ldi_pm) adjust::pm(Value);
+ if (Kind == AVR::fixup_hh8_ldi_pm)
+ adjust::pm(Value);
adjust::ldi::hh8(Size, Fixup, Value, Ctx);
break;
@@ -290,21 +290,24 @@ void AVRAsmBackend::adjustFixupValue(const MCFixup &Fixup,
case AVR::fixup_lo8_ldi_neg:
case AVR::fixup_lo8_ldi_pm_neg:
- if (Kind == AVR::fixup_lo8_ldi_pm_neg) adjust::pm(Value);
+ if (Kind == AVR::fixup_lo8_ldi_pm_neg)
+ adjust::pm(Value);
adjust::ldi::neg(Value);
adjust::ldi::lo8(Size, Fixup, Value, Ctx);
break;
case AVR::fixup_hi8_ldi_neg:
case AVR::fixup_hi8_ldi_pm_neg:
- if (Kind == AVR::fixup_hi8_ldi_pm_neg) adjust::pm(Value);
+ if (Kind == AVR::fixup_hi8_ldi_pm_neg)
+ adjust::pm(Value);
adjust::ldi::neg(Value);
adjust::ldi::hi8(Size, Fixup, Value, Ctx);
break;
case AVR::fixup_hh8_ldi_neg:
case AVR::fixup_hh8_ldi_pm_neg:
- if (Kind == AVR::fixup_hh8_ldi_pm_neg) adjust::pm(Value);
+ if (Kind == AVR::fixup_hh8_ldi_pm_neg)
+ adjust::pm(Value);
adjust::ldi::neg(Value);
adjust::ldi::hh8(Size, Fixup, Value, Ctx);
@@ -455,7 +458,8 @@ MCFixupKindInfo const &AVRAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
return Infos[Kind - FirstTargetFixupKind];
}
-bool AVRAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+bool AVRAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const {
// If the count is not 2-byte aligned, we must be writing data into the text
// section (otherwise we have unaligned instructions, and thus have far
// bigger problems), so just write zeros instead.
@@ -468,8 +472,9 @@ bool AVRAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
bool AVRAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
const MCFixup &Fixup,
const MCValue &Target) {
- switch ((unsigned) Fixup.getKind()) {
- default: return false;
+ switch ((unsigned)Fixup.getKind()) {
+ default:
+ return false;
// Fixups which should always be recorded as relocations.
case AVR::fixup_7_pcrel:
case AVR::fixup_13_pcrel:
@@ -485,4 +490,3 @@ MCAsmBackend *createAVRAsmBackend(const Target &T, const MCSubtargetInfo &STI,
}
} // end of namespace llvm
-
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
index 46dc914adf78..ea7fc30ab9d0 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
@@ -55,7 +55,8 @@ public:
return false;
}
- bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const override;
bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target) override;
@@ -67,4 +68,3 @@ private:
} // end namespace llvm
#endif // LLVM_AVR_ASM_BACKEND_H
-
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
index bedf68db08ca..b90e103794da 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
@@ -27,21 +27,18 @@ public:
virtual ~AVRELFObjectWriter() {}
- unsigned getRelocType(MCContext &Ctx,
- const MCValue &Target,
- const MCFixup &Fixup,
- bool IsPCRel) const override;
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
};
AVRELFObjectWriter::AVRELFObjectWriter(uint8_t OSABI)
: MCELFObjectTargetWriter(false, OSABI, ELF::EM_AVR, true) {}
-unsigned AVRELFObjectWriter::getRelocType(MCContext &Ctx,
- const MCValue &Target,
+unsigned AVRELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
- switch ((unsigned) Fixup.getKind()) {
+ switch ((unsigned)Fixup.getKind()) {
case FK_Data_1:
switch (Modifier) {
default:
@@ -158,4 +155,3 @@ std::unique_ptr<MCObjectTargetWriter> createAVRELFObjectWriter(uint8_t OSABI) {
}
} // end of namespace llvm
-
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
index 6d126ed622aa..85933d6b9bb9 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
@@ -53,8 +53,7 @@ static unsigned getEFlagsForFeatureSet(const FeatureBitset &Features) {
return EFlags;
}
-AVRELFStreamer::AVRELFStreamer(MCStreamer &S,
- const MCSubtargetInfo &STI)
+AVRELFStreamer::AVRELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI)
: AVRTargetStreamer(S) {
MCAssembler &MCA = getStreamer().getAssembler();
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h
index a0dd1dc8ac3e..1f7a926edb5c 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h
@@ -141,7 +141,7 @@ namespace fixups {
template <typename T> inline void adjustBranchTarget(T &val) { val >>= 1; }
} // end of namespace fixups
-}
-} // end of namespace llvm::AVR
+} // namespace AVR
+} // namespace llvm
#endif // LLVM_AVR_FIXUP_KINDS_H
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
index 42fac5e2e000..d68e73ce0bb1 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
@@ -172,7 +172,8 @@ void AVRInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
void AVRInstPrinter::printMemri(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- assert(MI->getOperand(OpNo).isReg() && "Expected a register for the first operand");
+ assert(MI->getOperand(OpNo).isReg() &&
+ "Expected a register for the first operand");
const MCOperand &OffsetOp = MI->getOperand(OpNo + 1);
@@ -195,4 +196,3 @@ void AVRInstPrinter::printMemri(const MCInst *MI, unsigned OpNo,
}
} // end of namespace llvm
-
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h
index 8976ef28f3dc..11f55f6d253b 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h
@@ -56,4 +56,3 @@ private:
} // end namespace llvm
#endif // LLVM_AVR_INST_PRINTER_H
-
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
index 50872d6d7a92..9754ff7f1146 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
@@ -75,7 +75,7 @@ AVRMCCodeEmitter::loadStorePostEncoder(const MCInst &MI, unsigned EncodedValue,
// check whether either of the registers are the X pointer register.
bool IsRegX = MI.getOperand(0).getReg() == AVR::R27R26 ||
- MI.getOperand(1).getReg() == AVR::R27R26;
+ MI.getOperand(1).getReg() == AVR::R27R26;
bool IsPredec = Opcode == AVR::LDRdPtrPd || Opcode == AVR::STPtrPdRr;
bool IsPostinc = Opcode == AVR::LDRdPtrPi || Opcode == AVR::STPtrPiRr;
@@ -96,8 +96,8 @@ AVRMCCodeEmitter::encodeRelCondBrTarget(const MCInst &MI, unsigned OpNo,
const MCOperand &MO = MI.getOperand(OpNo);
if (MO.isExpr()) {
- Fixups.push_back(MCFixup::create(0, MO.getExpr(),
- MCFixupKind(Fixup), MI.getLoc()));
+ Fixups.push_back(
+ MCFixup::create(0, MO.getExpr(), MCFixupKind(Fixup), MI.getLoc()));
return 0;
}
@@ -119,9 +119,12 @@ unsigned AVRMCCodeEmitter::encodeLDSTPtrReg(const MCInst &MI, unsigned OpNo,
assert(MO.isReg());
switch (MO.getReg()) {
- case AVR::R27R26: return 0x03; // X: 0b11
- case AVR::R29R28: return 0x02; // Y: 0b10
- case AVR::R31R30: return 0x00; // Z: 0b00
+ case AVR::R27R26:
+ return 0x03; // X: 0b11
+ case AVR::R29R28:
+ return 0x02; // Y: 0b10
+ case AVR::R31R30:
+ return 0x00; // Z: 0b00
default:
llvm_unreachable("invalid pointer register");
}
@@ -159,7 +162,7 @@ unsigned AVRMCCodeEmitter::encodeMemri(const MCInst &MI, unsigned OpNo,
} else if (OffsetOp.isExpr()) {
OffsetBits = 0;
Fixups.push_back(MCFixup::create(0, OffsetOp.getExpr(),
- MCFixupKind(AVR::fixup_6), MI.getLoc()));
+ MCFixupKind(AVR::fixup_6), MI.getLoc()));
} else {
llvm_unreachable("invalid value for offset");
}
@@ -193,7 +196,8 @@ unsigned AVRMCCodeEmitter::encodeImm(const MCInst &MI, unsigned OpNo,
}
MCFixupKind FixupKind = static_cast<MCFixupKind>(Fixup);
- Fixups.push_back(MCFixup::create(Offset, MO.getExpr(), FixupKind, MI.getLoc()));
+ Fixups.push_back(
+ MCFixup::create(Offset, MO.getExpr(), FixupKind, MI.getLoc()));
return 0;
}
@@ -251,8 +255,10 @@ unsigned AVRMCCodeEmitter::getMachineOpValue(const MCInst &MI,
const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
- if (MO.isReg()) return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
- if (MO.isImm()) return static_cast<unsigned>(MO.getImm());
+ if (MO.isReg())
+ return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+ if (MO.isImm())
+ return static_cast<unsigned>(MO.getImm());
if (MO.isDFPImm())
return static_cast<unsigned>(bit_cast<double>(MO.getDFPImm()));
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
index 2e24d885c155..1bfa79f26b27 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
@@ -109,7 +109,6 @@ private:
MCContext &Ctx;
};
-} // end namespace of llvm.
+} // namespace llvm
#endif // LLVM_AVR_CODE_EMITTER_H
-
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
index 0743344bc1ed..5f2a5a82e41d 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
@@ -13,8 +13,8 @@
#include "MCTargetDesc/AVRMCELFStreamer.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSymbol.h"
#define DEBUG_TYPE "avrmcelfstreamer"
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
index a4f8787e5667..7e735ffa6cec 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
@@ -19,16 +19,15 @@ namespace llvm {
namespace {
const struct ModifierEntry {
- const char * const Spelling;
+ const char *const Spelling;
AVRMCExpr::VariantKind VariantKind;
} ModifierNames[] = {
{"lo8", AVRMCExpr::VK_AVR_LO8}, {"hi8", AVRMCExpr::VK_AVR_HI8},
{"hh8", AVRMCExpr::VK_AVR_HH8}, // synonym with hlo8
{"hlo8", AVRMCExpr::VK_AVR_HH8}, {"hhi8", AVRMCExpr::VK_AVR_HHI8},
- {"pm", AVRMCExpr::VK_AVR_PM},
- {"pm_lo8", AVRMCExpr::VK_AVR_PM_LO8}, {"pm_hi8", AVRMCExpr::VK_AVR_PM_HI8},
- {"pm_hh8", AVRMCExpr::VK_AVR_PM_HH8},
+ {"pm", AVRMCExpr::VK_AVR_PM}, {"pm_lo8", AVRMCExpr::VK_AVR_PM_LO8},
+ {"pm_hi8", AVRMCExpr::VK_AVR_PM_HI8}, {"pm_hh8", AVRMCExpr::VK_AVR_PM_HH8},
{"lo8_gs", AVRMCExpr::VK_AVR_LO8_GS}, {"hi8_gs", AVRMCExpr::VK_AVR_HI8_GS},
{"gs", AVRMCExpr::VK_AVR_GS},
@@ -81,7 +80,8 @@ bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result,
if (Value.isAbsolute()) {
Result = MCValue::get(evaluateAsInt64(Value.getConstant()));
} else {
- if (!Layout) return false;
+ if (!Layout)
+ return false;
MCContext &Context = Layout->getAssembler().getContext();
const MCSymbolRefExpr *Sym = Value.getSymA();
@@ -219,4 +219,3 @@ AVRMCExpr::VariantKind AVRMCExpr::getKindByName(StringRef Name) {
}
} // end of namespace llvm
-
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
index e35385ebd90a..68589763f29a 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
@@ -34,7 +34,7 @@ public:
VK_AVR_LO8_GS, ///< Corresponds to `lo8(gs())`.
VK_AVR_HI8_GS, ///< Corresponds to `hi8(gs())`.
- VK_AVR_GS, ///< Corresponds to `gs()`.
+ VK_AVR_GS, ///< Corresponds to `gs()`.
};
public:
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
index 95f4465924cc..cdfe4a21105d 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
@@ -10,21 +10,21 @@
//
//===----------------------------------------------------------------------===//
+#include "AVRMCTargetDesc.h"
#include "AVRELFStreamer.h"
#include "AVRInstPrinter.h"
#include "AVRMCAsmInfo.h"
#include "AVRMCELFStreamer.h"
-#include "AVRMCTargetDesc.h"
#include "AVRTargetStreamer.h"
#include "TargetInfo/AVRTargetInfo.h"
#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
#include "AVRGenInstrInfo.inc"
@@ -108,7 +108,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTargetMC() {
createAVRMCInstPrinter);
// Register the MC Code Emitter
- TargetRegistry::RegisterMCCodeEmitter(getTheAVRTarget(), createAVRMCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(getTheAVRTarget(),
+ createAVRMCCodeEmitter);
// Register the obj streamer
TargetRegistry::RegisterELFStreamer(getTheAVRTarget(), createMCStreamer);
@@ -124,4 +125,3 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTargetMC() {
// Register the asm backend (as little endian).
TargetRegistry::RegisterMCAsmBackend(getTheAVRTarget(), createAVRAsmBackend);
}
-
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp
index eccd343d79ab..56e0e7810466 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp
@@ -21,23 +21,4 @@ AVRTargetStreamer::AVRTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
AVRTargetAsmStreamer::AVRTargetAsmStreamer(MCStreamer &S)
: AVRTargetStreamer(S) {}
-void AVRTargetStreamer::finish() {
- MCStreamer &OS = getStreamer();
- MCContext &Context = OS.getContext();
-
- MCSymbol *DoCopyData = Context.getOrCreateSymbol("__do_copy_data");
- MCSymbol *DoClearBss = Context.getOrCreateSymbol("__do_clear_bss");
-
- // FIXME: We can disable __do_copy_data if there are no static RAM variables.
-
- OS.emitRawComment(" Declaring this symbol tells the CRT that it should");
- OS.emitRawComment("copy all variables from program memory to RAM on startup");
- OS.emitSymbolAttribute(DoCopyData, MCSA_Global);
-
- OS.emitRawComment(" Declaring this symbol tells the CRT that it should");
- OS.emitRawComment("clear the zeroed data section on startup");
- OS.emitSymbolAttribute(DoClearBss, MCSA_Global);
-}
-
} // end namespace llvm
-
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h
index 5c4d1a22f6c6..b8b1454a2b8d 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h
@@ -18,8 +18,6 @@ class MCStreamer;
class AVRTargetStreamer : public MCTargetStreamer {
public:
explicit AVRTargetStreamer(MCStreamer &S);
-
- void finish() override;
};
/// A target streamer for textual AVR assembly code.
diff --git a/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp b/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
index 69b509b33e88..dd61add1526c 100644
--- a/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
+++ b/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
@@ -7,16 +7,15 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/AVRTargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
namespace llvm {
Target &getTheAVRTarget() {
static Target TheAVRTarget;
return TheAVRTarget;
}
-}
+} // namespace llvm
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTargetInfo() {
llvm::RegisterTarget<llvm::Triple::avr> X(llvm::getTheAVRTarget(), "avr",
"Atmel AVR Microcontroller", "AVR");
}
-
diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index 57488bc28f98..50298bf5e943 100644
--- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -19,8 +19,8 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/BPF/BPF.h b/llvm/lib/Target/BPF/BPF.h
index a98a3e08d5de..89990f7e15c2 100644
--- a/llvm/lib/Target/BPF/BPF.h
+++ b/llvm/lib/Target/BPF/BPF.h
@@ -21,6 +21,7 @@ ModulePass *createBPFCheckAndAdjustIR();
FunctionPass *createBPFAbstractMemberAccess(BPFTargetMachine *TM);
FunctionPass *createBPFPreserveDIType();
+FunctionPass *createBPFIRPeephole();
FunctionPass *createBPFISelDag(BPFTargetMachine &TM);
FunctionPass *createBPFMISimplifyPatchablePass();
FunctionPass *createBPFMIPeepholePass();
@@ -33,6 +34,7 @@ void initializeBPFCheckAndAdjustIRPass(PassRegistry&);
void initializeBPFAbstractMemberAccessLegacyPassPass(PassRegistry &);
void initializeBPFPreserveDITypePass(PassRegistry&);
+void initializeBPFIRPeepholePass(PassRegistry&);
void initializeBPFMISimplifyPatchablePass(PassRegistry&);
void initializeBPFMIPeepholePass(PassRegistry&);
void initializeBPFMIPeepholeTruncElimPass(PassRegistry&);
@@ -57,6 +59,13 @@ public:
static bool isRequired() { return true; }
};
+class BPFIRPeepholePass : public PassInfoMixin<BPFIRPeepholePass> {
+public:
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+ static bool isRequired() { return true; }
+};
+
class BPFAdjustOptPass : public PassInfoMixin<BPFAdjustOptPass> {
public:
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Target/BPF/BPFAdjustOpt.cpp b/llvm/lib/Target/BPF/BPFAdjustOpt.cpp
index 7088d55e1a71..69d0bca0bd77 100644
--- a/llvm/lib/Target/BPF/BPFAdjustOpt.cpp
+++ b/llvm/lib/Target/BPF/BPFAdjustOpt.cpp
@@ -15,6 +15,7 @@
#include "BPFTargetMachine.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsBPF.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
@@ -66,6 +67,7 @@ private:
Module *M;
SmallVector<PassThroughInfo, 16> PassThroughs;
+ bool adjustICmpToBuiltin();
void adjustBasicBlock(BasicBlock &BB);
bool serializeICMPCrossBB(BasicBlock &BB);
void adjustInst(Instruction &I);
@@ -85,14 +87,72 @@ ModulePass *llvm::createBPFAdjustOpt() { return new BPFAdjustOpt(); }
bool BPFAdjustOpt::runOnModule(Module &M) { return BPFAdjustOptImpl(&M).run(); }
bool BPFAdjustOptImpl::run() {
+ bool Changed = adjustICmpToBuiltin();
+
for (Function &F : *M)
for (auto &BB : F) {
adjustBasicBlock(BB);
for (auto &I : BB)
adjustInst(I);
}
+ return insertPassThrough() || Changed;
+}
+
+// Commit acabad9ff6bf ("[InstCombine] try to canonicalize icmp with
+// trunc op into mask and cmp") added a transformation that converts
+// "(conv)a < power_2_const" to "a & <const>" in certain cases. The BPF
+// kernel verifier has to handle the resulting code conservatively, which
+// may cause it to reject otherwise legitimate programs.
+// Here, we change the affected icmp code to a builtin, which will be
+// restored to the original icmp code later, to prevent that InstCombine
+// transformation.
+bool BPFAdjustOptImpl::adjustICmpToBuiltin() {
+ bool Changed = false;
+ ICmpInst *ToBeDeleted = nullptr;
+ for (Function &F : *M)
+ for (auto &BB : F)
+ for (auto &I : BB) {
+ if (ToBeDeleted) {
+ ToBeDeleted->eraseFromParent();
+ ToBeDeleted = nullptr;
+ }
+
+ auto *Icmp = dyn_cast<ICmpInst>(&I);
+ if (!Icmp)
+ continue;
+
+ Value *Op0 = Icmp->getOperand(0);
+ if (!isa<TruncInst>(Op0))
+ continue;
+
+ auto ConstOp1 = dyn_cast<ConstantInt>(Icmp->getOperand(1));
+ if (!ConstOp1)
+ continue;
+
+ auto ConstOp1Val = ConstOp1->getValue().getZExtValue();
+ auto Op = Icmp->getPredicate();
+ if (Op == ICmpInst::ICMP_ULT || Op == ICmpInst::ICMP_UGE) {
+ if ((ConstOp1Val - 1) & ConstOp1Val)
+ continue;
+ } else if (Op == ICmpInst::ICMP_ULE || Op == ICmpInst::ICMP_UGT) {
+ if (ConstOp1Val & (ConstOp1Val + 1))
+ continue;
+ } else {
+ continue;
+ }
+
+ Constant *Opcode =
+ ConstantInt::get(Type::getInt32Ty(BB.getContext()), Op);
+ Function *Fn = Intrinsic::getDeclaration(
+ M, Intrinsic::bpf_compare, {Op0->getType(), ConstOp1->getType()});
+ auto *NewInst = CallInst::Create(Fn, {Opcode, Op0, ConstOp1});
+ BB.getInstList().insert(I.getIterator(), NewInst);
+ Icmp->replaceAllUsesWith(NewInst);
+ Changed = true;
+ ToBeDeleted = Icmp;
+ }
- return insertPassThrough();
+ return Changed;
}
bool BPFAdjustOptImpl::insertPassThrough() {
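For context, a minimal C++ sketch (hypothetical, not part of the patch; the names lookup/table are illustrative) of the source pattern the adjustICmpToBuiltin() comment refers to: a compare of a truncated value against a power-of-two constant, which InstCombine may canonicalize into a mask-and-compare form the BPF verifier tracks less precisely.

    // Hypothetical illustration only. The frontend emits a truncating
    // conversion followed by a compare against 16; after InstCombine this
    // typically ends up as roughly "(x & 0xF0) == 0", which the BPF
    // verifier handles conservatively. The pass above hides the icmp
    // behind llvm.bpf.compare until BPFCheckAndAdjustIR restores it once
    // the mid-level IR optimizations have run.
    extern int table[16];

    int lookup(unsigned int x) {
      unsigned char idx = static_cast<unsigned char>(x); // "(conv)a"
      if (idx < 16)                                       // "< power_2_const"
        return table[idx];
      return -1;
    }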
diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
index 37950e105bdc..d6145f53c170 100644
--- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
+++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -27,7 +27,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
index 5239218ad003..cf1bc3f7c5bc 100644
--- a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
+++ b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
@@ -46,6 +46,7 @@ private:
void checkIR(Module &M);
bool adjustIR(Module &M);
bool removePassThroughBuiltin(Module &M);
+ bool removeCompareBuiltin(Module &M);
};
} // End anonymous namespace
@@ -120,8 +121,50 @@ bool BPFCheckAndAdjustIR::removePassThroughBuiltin(Module &M) {
return Changed;
}
+bool BPFCheckAndAdjustIR::removeCompareBuiltin(Module &M) {
+  // Remove __builtin_bpf_compare() calls, which are used to prevent
+  // certain IR optimizations. Now that the major IR optimizations are
+  // done, remove them.
+ bool Changed = false;
+ CallInst *ToBeDeleted = nullptr;
+ for (Function &F : M)
+ for (auto &BB : F)
+ for (auto &I : BB) {
+ if (ToBeDeleted) {
+ ToBeDeleted->eraseFromParent();
+ ToBeDeleted = nullptr;
+ }
+
+ auto *Call = dyn_cast<CallInst>(&I);
+ if (!Call)
+ continue;
+ auto *GV = dyn_cast<GlobalValue>(Call->getCalledOperand());
+ if (!GV)
+ continue;
+ if (!GV->getName().startswith("llvm.bpf.compare"))
+ continue;
+
+ Changed = true;
+ Value *Arg0 = Call->getArgOperand(0);
+ Value *Arg1 = Call->getArgOperand(1);
+ Value *Arg2 = Call->getArgOperand(2);
+
+ auto OpVal = cast<ConstantInt>(Arg0)->getValue().getZExtValue();
+ CmpInst::Predicate Opcode = (CmpInst::Predicate)OpVal;
+
+ auto *ICmp = new ICmpInst(Opcode, Arg1, Arg2);
+ BB.getInstList().insert(Call->getIterator(), ICmp);
+
+ Call->replaceAllUsesWith(ICmp);
+ ToBeDeleted = Call;
+ }
+ return Changed;
+}
+
bool BPFCheckAndAdjustIR::adjustIR(Module &M) {
- return removePassThroughBuiltin(M);
+ bool Changed = removePassThroughBuiltin(M);
+ Changed = removeCompareBuiltin(M) || Changed;
+ return Changed;
}
bool BPFCheckAndAdjustIR::runOnModule(Module &M) {
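A side note on the iteration style: both adjustICmpToBuiltin() and removeCompareBuiltin() defer erasure through a ToBeDeleted pointer so the instruction iterator is never invalidated. Below is a minimal sketch (not from the patch; eraseMarkerCalls and Prefix are made-up names) of the equivalent idiom using llvm::make_early_inc_range, the same helper the BPFMISimplifyPatchable change later in this diff switches to. It assumes the marked calls have already had their uses replaced.

    // Sketch only: erase marker calls while iterating. make_early_inc_range
    // advances the iterator before the loop body runs, so erasing the
    // current instruction is safe without a deferred ToBeDeleted pointer.
    #include "llvm/ADT/STLExtras.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instructions.h"

    static bool eraseMarkerCalls(llvm::Function &F, llvm::StringRef Prefix) {
      bool Changed = false;
      for (llvm::BasicBlock &BB : F)
        for (llvm::Instruction &I : llvm::make_early_inc_range(BB)) {
          auto *Call = llvm::dyn_cast<llvm::CallInst>(&I);
          if (!Call)
            continue;
          auto *GV = llvm::dyn_cast<llvm::GlobalValue>(Call->getCalledOperand());
          if (!GV || !GV->getName().startswith(Prefix))
            continue;
          // Assumes all uses were replaced beforehand (e.g. with an ICmpInst).
          Call->eraseFromParent();
          Changed = true;
        }
      return Changed;
    }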
diff --git a/llvm/lib/Target/BPF/BPFIRPeephole.cpp b/llvm/lib/Target/BPF/BPFIRPeephole.cpp
new file mode 100644
index 000000000000..d6a70012dca0
--- /dev/null
+++ b/llvm/lib/Target/BPF/BPFIRPeephole.cpp
@@ -0,0 +1,118 @@
+//===------------ BPFIRPeephole.cpp - IR Peephole Transformation ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// IR level peephole optimization, specifically removing @llvm.stacksave() and
+// @llvm.stackrestore().
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+
+#define DEBUG_TYPE "bpf-ir-peephole"
+
+using namespace llvm;
+
+namespace {
+
+static bool BPFIRPeepholeImpl(Function &F) {
+ LLVM_DEBUG(dbgs() << "******** BPF IR Peephole ********\n");
+
+ bool Changed = false;
+ Instruction *ToErase = nullptr;
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ // The following code pattern is handled:
+ // %3 = call i8* @llvm.stacksave()
+ // store i8* %3, i8** %saved_stack, align 8
+ // ...
+ // %4 = load i8*, i8** %saved_stack, align 8
+ // call void @llvm.stackrestore(i8* %4)
+ // ...
+      // The goal is to remove the above four instructions, so we won't
+      // have instructions referencing r11 (the stack pointer) if there
+      // is ultimately no variable-length stack allocation.
+      // InstCombine also tries to remove the above instructions when it
+      // can prove this is safe (constant alloca etc.), but depending on
+      // the code pattern, it may still miss some.
+      //
+      // By unconditionally removing these instructions, we are fine when
+      // the alloca is constant. Otherwise, SelectionDAG will complain,
+      // since BPF does not support dynamic stack allocation yet.
+ if (ToErase) {
+ ToErase->eraseFromParent();
+ ToErase = nullptr;
+ }
+
+ if (auto *Call = dyn_cast<CallInst>(&I)) {
+ if (auto *GV = dyn_cast<GlobalValue>(Call->getCalledOperand())) {
+ if (!GV->getName().equals("llvm.stacksave"))
+ continue;
+ if (!Call->hasOneUser())
+ continue;
+ auto *Inst = cast<Instruction>(*Call->user_begin());
+ LLVM_DEBUG(dbgs() << "Remove:"; I.dump());
+ LLVM_DEBUG(dbgs() << "Remove:"; Inst->dump(); dbgs() << '\n');
+ Changed = true;
+ Inst->eraseFromParent();
+ ToErase = &I;
+ }
+ continue;
+ }
+
+ if (auto *LD = dyn_cast<LoadInst>(&I)) {
+ if (!LD->hasOneUser())
+ continue;
+ auto *Call = dyn_cast<CallInst>(*LD->user_begin());
+ if (!Call)
+ continue;
+ auto *GV = dyn_cast<GlobalValue>(Call->getCalledOperand());
+ if (!GV)
+ continue;
+ if (!GV->getName().equals("llvm.stackrestore"))
+ continue;
+ LLVM_DEBUG(dbgs() << "Remove:"; I.dump());
+ LLVM_DEBUG(dbgs() << "Remove:"; Call->dump(); dbgs() << '\n');
+ Changed = true;
+ Call->eraseFromParent();
+ ToErase = &I;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+class BPFIRPeephole final : public FunctionPass {
+ bool runOnFunction(Function &F) override;
+
+public:
+ static char ID;
+ BPFIRPeephole() : FunctionPass(ID) {}
+};
+} // End anonymous namespace
+
+char BPFIRPeephole::ID = 0;
+INITIALIZE_PASS(BPFIRPeephole, DEBUG_TYPE, "BPF IR Peephole", false, false)
+
+FunctionPass *llvm::createBPFIRPeephole() { return new BPFIRPeephole(); }
+
+bool BPFIRPeephole::runOnFunction(Function &F) { return BPFIRPeepholeImpl(F); }
+
+PreservedAnalyses BPFIRPeepholePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ return BPFIRPeepholeImpl(F) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
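A hypothetical source-level example (not from the patch; scoped_buffer and consume are made-up names) of the construct that typically produces the @llvm.stacksave()/@llvm.stackrestore() pair the new pass strips: a variable-length array in a nested scope, which Clang brackets with a stack save/restore.

    // Hypothetical example only. The inner scope containing the VLA is
    // bracketed by llvm.stacksave/llvm.stackrestore in the emitted IR;
    // BPFIRPeephole removes the leftover save/restore so no r11-based
    // instructions remain when the allocation ends up being static.
    extern void consume(int *p, int n);

    void scoped_buffer(int n) {
      {
        int buf[n]; // variable-length array (Clang/GCC extension in C++)
        consume(buf, n);
      }
    }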
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp
index c543dfcfca95..90723ac04f64 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -822,7 +822,7 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
BuildMI(BB, DL, TII.get(NewCC)).addReg(LHS).addReg(RHS).addMBB(Copy1MBB);
} else {
int64_t imm32 = MI.getOperand(2).getImm();
- // sanity check before we build J*_ri instruction.
+ // Check that the immediate fits in 32 bits before building the J*_ri instruction.
assert (isInt<32>(imm32));
BuildMI(BB, DL, TII.get(NewCC))
.addReg(LHS).addImm(imm32).addMBB(Copy1MBB);
@@ -859,3 +859,25 @@ MVT BPFTargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
EVT VT) const {
return (getHasAlu32() && VT == MVT::i32) ? MVT::i32 : MVT::i64;
}
+
+bool BPFTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS,
+ Instruction *I) const {
+ // No global is ever allowed as a base.
+ if (AM.BaseGV)
+ return false;
+
+ switch (AM.Scale) {
+ case 0: // "r+i" or just "i", depending on HasBaseReg.
+ break;
+ case 1:
+ if (!AM.HasBaseReg) // allow "r+i".
+ break;
+ return false; // disallow "r+r" or "r+r+i".
+ default:
+ return false;
+ }
+
+ return true;
+}
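
The new hook restricts BPF to a single base register plus an immediate. A hypothetical illustration (not part of the patch; TLI, DL, Ty and GV stand for objects taken from the surrounding compilation, TLI being the BPFTargetLowering instance):

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Type.h"

using namespace llvm;

static void showBPFAddressingModes(const TargetLowering &TLI,
                                   const DataLayout &DL, Type *Ty,
                                   GlobalValue *GV) {
  TargetLowering::AddrMode AM;

  AM.HasBaseReg = true;
  AM.BaseOffs = 8;
  // "reg + imm", e.g. *(u64 *)(r1 + 8): accepted (Scale == 0 case).
  bool RegImm = TLI.isLegalAddressingMode(DL, AM, Ty, /*AddrSpace=*/0);

  AM.Scale = 1;
  // "reg + reg" (a second, scaled register on top of the base): rejected,
  // BPF loads/stores take exactly one base register plus an immediate.
  bool RegReg = TLI.isLegalAddressingMode(DL, AM, Ty, 0);

  AM.Scale = 0;
  AM.BaseGV = GV;
  // Any mode that uses a global as the base is rejected outright.
  bool GlobalBase = TLI.isLegalAddressingMode(DL, AM, Ty, 0);

  (void)RegImm; (void)RegReg; (void)GlobalBase; // true, false, false
}
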
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.h b/llvm/lib/Target/BPF/BPFISelLowering.h
index d5007425a7f8..dcc53019db75 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.h
+++ b/llvm/lib/Target/BPF/BPFISelLowering.h
@@ -130,6 +130,10 @@ private:
return false;
}
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+ Type *Ty, unsigned AS,
+ Instruction *I = nullptr) const override;
+
// isTruncateFree - Return true if it's free to truncate a value of
// type Ty1 to type Ty2. e.g. On BPF at alu32 mode, it's free to truncate
// a i64 value in register R1 to i32 by referencing its sub-register W1.
diff --git a/llvm/lib/Target/BPF/BPFMIChecking.cpp b/llvm/lib/Target/BPF/BPFMIChecking.cpp
index 4e24e3d911b8..eb8c48ac49de 100644
--- a/llvm/lib/Target/BPF/BPFMIChecking.cpp
+++ b/llvm/lib/Target/BPF/BPFMIChecking.cpp
@@ -164,7 +164,7 @@ bool BPFMIPreEmitChecking::processAtomicInsts(void) {
DebugLoc Empty;
const DebugLoc &DL = MI.getDebugLoc();
if (DL != Empty)
- report_fatal_error("line " + std::to_string(DL.getLine()) +
+ report_fatal_error(Twine("line ") + std::to_string(DL.getLine()) +
": Invalid usage of the XADD return value", false);
else
report_fatal_error("Invalid usage of the XADD return value", false);
diff --git a/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp b/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp
index ae1f5ea21c12..7e829ea43e89 100644
--- a/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp
+++ b/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp
@@ -97,15 +97,13 @@ void BPFMISimplifyPatchable::checkADDrr(MachineRegisterInfo *MRI,
// Go through all uses of %1 as in %1 = ADD_rr %2, %3
const MachineOperand Op0 = Inst->getOperand(0);
- auto Begin = MRI->use_begin(Op0.getReg()), End = MRI->use_end();
- decltype(End) NextI;
- for (auto I = Begin; I != End; I = NextI) {
- NextI = std::next(I);
+ for (MachineOperand &MO :
+ llvm::make_early_inc_range(MRI->use_operands(Op0.getReg()))) {
// The candidate needs to have a unique definition.
- if (!MRI->getUniqueVRegDef(I->getReg()))
+ if (!MRI->getUniqueVRegDef(MO.getReg()))
continue;
- MachineInstr *DefInst = I->getParent();
+ MachineInstr *DefInst = MO.getParent();
unsigned Opcode = DefInst->getOpcode();
unsigned COREOp;
if (Opcode == BPF::LDB || Opcode == BPF::LDH || Opcode == BPF::LDW ||
@@ -131,7 +129,7 @@ void BPFMISimplifyPatchable::checkADDrr(MachineRegisterInfo *MRI,
Opcode == BPF::STD || Opcode == BPF::STB32 || Opcode == BPF::STH32 ||
Opcode == BPF::STW32) {
const MachineOperand &Opnd = DefInst->getOperand(0);
- if (Opnd.isReg() && Opnd.getReg() == I->getReg())
+ if (Opnd.isReg() && Opnd.getReg() == MO.getReg())
continue;
}
diff --git a/llvm/lib/Target/BPF/BPFRegisterInfo.td b/llvm/lib/Target/BPF/BPFRegisterInfo.td
index 88dec063be70..abeef5dc8aad 100644
--- a/llvm/lib/Target/BPF/BPFRegisterInfo.td
+++ b/llvm/lib/Target/BPF/BPFRegisterInfo.td
@@ -36,7 +36,7 @@ foreach I = 0-11 in {
}
// Register classes.
-def GPR32 : RegisterClass<"BPF", [i32], 32, (add
+def GPR32 : RegisterClass<"BPF", [i32], 64, (add
(sequence "W%u", 1, 9),
W0, // Return value
W11, // Stack Ptr
diff --git a/llvm/lib/Target/BPF/BPFSubtarget.cpp b/llvm/lib/Target/BPF/BPFSubtarget.cpp
index fac02e6476b7..77e3cd393f87 100644
--- a/llvm/lib/Target/BPF/BPFSubtarget.cpp
+++ b/llvm/lib/Target/BPF/BPFSubtarget.cpp
@@ -12,8 +12,8 @@
#include "BPFSubtarget.h"
#include "BPF.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Host.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
index 5b0431293dc2..2fb76ab5c440 100644
--- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
@@ -20,9 +20,9 @@
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
@@ -43,6 +43,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTarget() {
PassRegistry &PR = *PassRegistry::getPassRegistry();
initializeBPFAbstractMemberAccessLegacyPassPass(PR);
initializeBPFPreserveDITypePass(PR);
+ initializeBPFIRPeepholePass(PR);
initializeBPFAdjustOptPass(PR);
initializeBPFCheckAndAdjustIRPass(PR);
initializeBPFMIPeepholePass(PR);
@@ -107,6 +108,7 @@ void BPFTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
[&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
PM.add(createBPFAbstractMemberAccess(this));
PM.add(createBPFPreserveDIType());
+ PM.add(createBPFIRPeephole());
});
Builder.addExtension(
@@ -124,18 +126,19 @@ void BPFTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
void BPFTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PB.registerPipelineStartEPCallback(
- [=](ModulePassManager &MPM, PassBuilder::OptimizationLevel) {
+ [=](ModulePassManager &MPM, OptimizationLevel) {
FunctionPassManager FPM;
FPM.addPass(BPFAbstractMemberAccessPass(this));
FPM.addPass(BPFPreserveDITypePass());
+ FPM.addPass(BPFIRPeepholePass());
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
});
PB.registerPeepholeEPCallback([=](FunctionPassManager &FPM,
- PassBuilder::OptimizationLevel Level) {
+ OptimizationLevel Level) {
FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().hoistCommonInsts(true)));
});
PB.registerPipelineEarlySimplificationEPCallback(
- [=](ModulePassManager &MPM, PassBuilder::OptimizationLevel) {
+ [=](ModulePassManager &MPM, OptimizationLevel) {
MPM.addPass(BPFAdjustOptPass());
});
}
diff --git a/llvm/lib/Target/BPF/BPFTargetTransformInfo.h b/llvm/lib/Target/BPF/BPFTargetTransformInfo.h
index 3bc5556a62f4..6b86bf6e6cc1 100644
--- a/llvm/lib/Target/BPF/BPFTargetTransformInfo.h
+++ b/llvm/lib/Target/BPF/BPFTargetTransformInfo.h
@@ -54,6 +54,23 @@ public:
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
I);
}
+
+ InstructionCost getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
+ const Instruction *CxtI = nullptr) {
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ if (ISD == ISD::ADD && CostKind == TTI::TCK_RecipThroughput)
+ return SCEVCheapExpansionBudget.getValue() + 1;
+
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info, Opd1PropInfo,
+ Opd2PropInfo);
+ }
};
} // end namespace llvm
diff --git a/llvm/lib/Target/BPF/BTF.def b/llvm/lib/Target/BPF/BTF.def
index 66cf2c90ead4..0ae4194bc512 100644
--- a/llvm/lib/Target/BPF/BTF.def
+++ b/llvm/lib/Target/BPF/BTF.def
@@ -31,5 +31,7 @@ HANDLE_BTF_KIND(13, FUNC_PROTO)
HANDLE_BTF_KIND(14, VAR)
HANDLE_BTF_KIND(15, DATASEC)
HANDLE_BTF_KIND(16, FLOAT)
+HANDLE_BTF_KIND(17, DECL_TAG)
+HANDLE_BTF_KIND(18, TYPE_TAG)
#undef HANDLE_BTF_KIND
diff --git a/llvm/lib/Target/BPF/BTF.h b/llvm/lib/Target/BPF/BTF.h
index ad3dcc14c38a..e54b97cd49a9 100644
--- a/llvm/lib/Target/BPF/BTF.h
+++ b/llvm/lib/Target/BPF/BTF.h
@@ -113,7 +113,7 @@ struct CommonType {
/// "Size" tells the size of the type it is describing.
///
/// "Type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT,
- /// FUNC, FUNC_PROTO and VAR.
+ /// FUNC, FUNC_PROTO, VAR, DECL_TAG and TYPE_TAG.
/// "Type" is a type_id referring to another type.
union {
uint32_t Size;
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index c1f8ea99b959..0c510686a13b 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -43,7 +43,7 @@ void BTFTypeBase::emitType(MCStreamer &OS) {
BTFTypeDerived::BTFTypeDerived(const DIDerivedType *DTy, unsigned Tag,
bool NeedsFixup)
- : DTy(DTy), NeedsFixup(NeedsFixup) {
+ : DTy(DTy), NeedsFixup(NeedsFixup), Name(DTy->getName()) {
switch (Tag) {
case dwarf::DW_TAG_pointer_type:
Kind = BTF::BTF_KIND_PTR;
@@ -66,14 +66,23 @@ BTFTypeDerived::BTFTypeDerived(const DIDerivedType *DTy, unsigned Tag,
BTFType.Info = Kind << 24;
}
+/// Used by DW_TAG_pointer_type only.
+BTFTypeDerived::BTFTypeDerived(unsigned NextTypeId, unsigned Tag,
+ StringRef Name)
+ : DTy(nullptr), NeedsFixup(false), Name(Name) {
+ Kind = BTF::BTF_KIND_PTR;
+ BTFType.Info = Kind << 24;
+ BTFType.Type = NextTypeId;
+}
+
void BTFTypeDerived::completeType(BTFDebug &BDebug) {
if (IsCompleted)
return;
IsCompleted = true;
- BTFType.NameOff = BDebug.addString(DTy->getName());
+ BTFType.NameOff = BDebug.addString(Name);
- if (NeedsFixup)
+ if (NeedsFixup || !DTy)
return;
// The base type for PTR/CONST/VOLATILE could be void.
@@ -386,6 +395,55 @@ void BTFTypeFloat::completeType(BTFDebug &BDebug) {
BTFType.NameOff = BDebug.addString(Name);
}
+BTFTypeDeclTag::BTFTypeDeclTag(uint32_t BaseTypeId, int ComponentIdx,
+ StringRef Tag)
+ : Tag(Tag) {
+ Kind = BTF::BTF_KIND_DECL_TAG;
+ BTFType.Info = Kind << 24;
+ BTFType.Type = BaseTypeId;
+ Info = ComponentIdx;
+}
+
+void BTFTypeDeclTag::completeType(BTFDebug &BDebug) {
+ if (IsCompleted)
+ return;
+ IsCompleted = true;
+
+ BTFType.NameOff = BDebug.addString(Tag);
+}
+
+void BTFTypeDeclTag::emitType(MCStreamer &OS) {
+ BTFTypeBase::emitType(OS);
+ OS.emitInt32(Info);
+}
+
+BTFTypeTypeTag::BTFTypeTypeTag(uint32_t NextTypeId, StringRef Tag)
+ : DTy(nullptr), Tag(Tag) {
+ Kind = BTF::BTF_KIND_TYPE_TAG;
+ BTFType.Info = Kind << 24;
+ BTFType.Type = NextTypeId;
+}
+
+BTFTypeTypeTag::BTFTypeTypeTag(const DIDerivedType *DTy, StringRef Tag)
+ : DTy(DTy), Tag(Tag) {
+ Kind = BTF::BTF_KIND_TYPE_TAG;
+ BTFType.Info = Kind << 24;
+}
+
+void BTFTypeTypeTag::completeType(BTFDebug &BDebug) {
+ if (IsCompleted)
+ return;
+ IsCompleted = true;
+ BTFType.NameOff = BDebug.addString(Tag);
+ if (DTy) {
+ const DIType *ResolvedType = DTy->getBaseType();
+ if (!ResolvedType)
+ BTFType.Type = 0;
+ else
+ BTFType.Type = BDebug.getTypeId(ResolvedType);
+ }
+}
+
uint32_t BTFStringTable::addString(StringRef S) {
// Check whether the string already exists.
for (auto &OffsetM : OffsetToIdMap) {
@@ -475,6 +533,25 @@ void BTFDebug::visitSubroutineType(
}
}
+void BTFDebug::processDeclAnnotations(DINodeArray Annotations,
+ uint32_t BaseTypeId,
+ int ComponentIdx) {
+ if (!Annotations)
+ return;
+
+ for (const Metadata *Annotation : Annotations->operands()) {
+ const MDNode *MD = cast<MDNode>(Annotation);
+ const MDString *Name = cast<MDString>(MD->getOperand(0));
+ if (!Name->getString().equals("btf_decl_tag"))
+ continue;
+
+ const MDString *Value = cast<MDString>(MD->getOperand(1));
+ auto TypeEntry = std::make_unique<BTFTypeDeclTag>(BaseTypeId, ComponentIdx,
+ Value->getString());
+ addType(std::move(TypeEntry));
+ }
+}
+
/// Handle structure/union types.
void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct,
uint32_t &TypeId) {
@@ -498,9 +575,17 @@ void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct,
StructTypes.push_back(TypeEntry.get());
TypeId = addType(std::move(TypeEntry), CTy);
+ // Check struct/union annotations
+ processDeclAnnotations(CTy->getAnnotations(), TypeId, -1);
+
// Visit all struct members.
- for (const auto *Element : Elements)
- visitTypeEntry(cast<DIDerivedType>(Element));
+ int FieldNo = 0;
+ for (const auto *Element : Elements) {
+ const auto Elem = cast<DIDerivedType>(Element);
+ visitTypeEntry(Elem);
+ processDeclAnnotations(Elem->getAnnotations(), TypeId, FieldNo);
+ FieldNo++;
+ }
}
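
processDeclAnnotations() above walks annotation nodes of a fixed shape. A minimal sketch of that shape, assuming the front end attaches the annotation (e.g. clang's __attribute__((btf_decl_tag("tag1")))) as an MDNode whose operand 0 is the string "btf_decl_tag" and whose operand 1 is the tag value; makeDeclTag is illustrative only.

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

static MDNode *makeDeclTag(LLVMContext &Ctx, StringRef Value) {
  Metadata *Ops[] = {MDString::get(Ctx, "btf_decl_tag"),
                     MDString::get(Ctx, Value)};
  return MDNode::get(Ctx, Ops); // e.g. !{!"btf_decl_tag", !"tag1"}
}
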
void BTFDebug::visitArrayType(const DICompositeType *CTy, uint32_t &TypeId) {
@@ -609,11 +694,49 @@ void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId,
}
}
- if (Tag == dwarf::DW_TAG_pointer_type || Tag == dwarf::DW_TAG_typedef ||
- Tag == dwarf::DW_TAG_const_type || Tag == dwarf::DW_TAG_volatile_type ||
- Tag == dwarf::DW_TAG_restrict_type) {
+ if (Tag == dwarf::DW_TAG_pointer_type) {
+ SmallVector<const MDString *, 4> MDStrs;
+ DINodeArray Annots = DTy->getAnnotations();
+ if (Annots) {
+ // For a type such as "int __tag1 __tag2 *p", MDStrs will contain
+ // [__tag1, __tag2].
+ for (const Metadata *Annotations : Annots->operands()) {
+ const MDNode *MD = cast<MDNode>(Annotations);
+ const MDString *Name = cast<MDString>(MD->getOperand(0));
+ if (!Name->getString().equals("btf_type_tag"))
+ continue;
+ MDStrs.push_back(cast<MDString>(MD->getOperand(1)));
+ }
+ }
+
+ if (MDStrs.size() > 0) {
+ // With MDStrs [__tag1, __tag2], the output type chain looks like
+ // PTR -> __tag2 -> __tag1 -> BaseType
+ // Below we construct the BTF types in the order __tag1, __tag2 and
+ // finally PTR.
+ auto TypeEntry =
+ std::make_unique<BTFTypeTypeTag>(DTy, MDStrs[0]->getString());
+ unsigned TmpTypeId = addType(std::move(TypeEntry));
+ for (unsigned I = 1; I < MDStrs.size(); I++) {
+ const MDString *Value = MDStrs[I];
+ TypeEntry =
+ std::make_unique<BTFTypeTypeTag>(TmpTypeId, Value->getString());
+ TmpTypeId = addType(std::move(TypeEntry));
+ }
+ auto TypeDEntry =
+ std::make_unique<BTFTypeDerived>(TmpTypeId, Tag, DTy->getName());
+ TypeId = addType(std::move(TypeDEntry), DTy);
+ } else {
+ auto TypeEntry = std::make_unique<BTFTypeDerived>(DTy, Tag, false);
+ TypeId = addType(std::move(TypeEntry), DTy);
+ }
+ } else if (Tag == dwarf::DW_TAG_typedef || Tag == dwarf::DW_TAG_const_type ||
+ Tag == dwarf::DW_TAG_volatile_type ||
+ Tag == dwarf::DW_TAG_restrict_type) {
auto TypeEntry = std::make_unique<BTFTypeDerived>(DTy, Tag, false);
TypeId = addType(std::move(TypeEntry), DTy);
+ if (Tag == dwarf::DW_TAG_typedef)
+ processDeclAnnotations(DTy->getAnnotations(), TypeId, -1);
} else if (Tag != dwarf::DW_TAG_member) {
return;
}
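
To make the ordering concrete, a small standalone model of the chain construction (editorial sketch; FakeEntry and buildChain are made-up names, and it assumes the pointee type already has id BaseId, whereas the real code resolves that id later in completeType):

#include <cstdio>
#include <string>
#include <vector>

struct FakeEntry {
  unsigned Id;        // BTF type id, assigned in emission order
  std::string Kind;   // "TYPE_TAG" or "PTR"
  std::string Name;   // tag string; empty for PTR
  unsigned PointsTo;  // the "Type" field: the id this entry refers to
};

// For "int __tag1 __tag2 *p" the walk emits __tag1, then __tag2, then PTR;
// each new entry points at the previous one, so the final chain reads
// PTR -> __tag2 -> __tag1 -> int.
static std::vector<FakeEntry> buildChain(const std::vector<std::string> &Tags,
                                         unsigned BaseId, unsigned NextId) {
  std::vector<FakeEntry> Out;
  unsigned Prev = BaseId;
  for (const std::string &Tag : Tags) {
    Out.push_back({NextId, "TYPE_TAG", Tag, Prev});
    Prev = NextId++;
  }
  Out.push_back({NextId, "PTR", "", Prev});
  return Out;
}

int main() {
  for (const FakeEntry &E : buildChain({"__tag1", "__tag2"}, /*BaseId=*/1, 2))
    std::printf("[%u] %s %s -> %u\n", E.Id, E.Kind.c_str(), E.Name.c_str(),
                E.PointsTo);
  // Prints:
  // [2] TYPE_TAG __tag1 -> 1
  // [3] TYPE_TAG __tag2 -> 2
  // [4] PTR  -> 3
  return 0;
}
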
@@ -783,7 +906,9 @@ void BTFDebug::emitBTFSection() {
return;
MCContext &Ctx = OS.getContext();
- OS.SwitchSection(Ctx.getELFSection(".BTF", ELF::SHT_PROGBITS, 0));
+ MCSectionELF *Sec = Ctx.getELFSection(".BTF", ELF::SHT_PROGBITS, 0);
+ Sec->setAlignment(Align(4));
+ OS.SwitchSection(Sec);
// Emit header.
emitCommonHeader();
@@ -821,7 +946,9 @@ void BTFDebug::emitBTFExtSection() {
return;
MCContext &Ctx = OS.getContext();
- OS.SwitchSection(Ctx.getELFSection(".BTF.ext", ELF::SHT_PROGBITS, 0));
+ MCSectionELF *Sec = Ctx.getELFSection(".BTF.ext", ELF::SHT_PROGBITS, 0);
+ Sec->setAlignment(Align(4));
+ OS.SwitchSection(Sec);
// Emit header.
emitCommonHeader();
@@ -964,6 +1091,17 @@ void BTFDebug::beginFunctionImpl(const MachineFunction *MF) {
std::make_unique<BTFTypeFunc>(SP->getName(), ProtoTypeId, Scope);
uint32_t FuncTypeId = addType(std::move(FuncTypeEntry));
+ // Process argument annotations.
+ for (const DINode *DN : SP->getRetainedNodes()) {
+ if (const auto *DV = dyn_cast<DILocalVariable>(DN)) {
+ uint32_t Arg = DV->getArg();
+ if (Arg)
+ processDeclAnnotations(DV->getAnnotations(), FuncTypeId, Arg - 1);
+ }
+ }
+
+ processDeclAnnotations(SP->getAnnotations(), FuncTypeId, -1);
+
for (const auto &TypeEntry : TypeEntries)
TypeEntry->completeType(*this);
@@ -1176,11 +1314,13 @@ void BTFDebug::processGlobals(bool ProcessingMapDef) {
continue;
uint32_t GVTypeId = 0;
+ DIGlobalVariable *DIGlobal = nullptr;
for (auto *GVE : GVs) {
+ DIGlobal = GVE->getVariable();
if (SecName.startswith(".maps"))
- visitMapDefType(GVE->getVariable()->getType(), GVTypeId);
+ visitMapDefType(DIGlobal->getType(), GVTypeId);
else
- visitTypeEntry(GVE->getVariable()->getType(), GVTypeId, false, false);
+ visitTypeEntry(DIGlobal->getType(), GVTypeId, false, false);
break;
}
@@ -1212,6 +1352,8 @@ void BTFDebug::processGlobals(bool ProcessingMapDef) {
std::make_unique<BTFKindVar>(Global.getName(), GVTypeId, GVarInfo);
uint32_t VarId = addType(std::move(VarEntry));
+ processDeclAnnotations(DIGlobal->getAnnotations(), VarId, -1);
+
// An empty SecName means an extern variable without section attribute.
if (SecName.empty())
continue;
@@ -1306,6 +1448,9 @@ void BTFDebug::processFuncPrototypes(const Function *F) {
auto FuncTypeEntry =
std::make_unique<BTFTypeFunc>(SP->getName(), ProtoTypeId, Scope);
uint32_t FuncId = addType(std::move(FuncTypeEntry));
+
+ processDeclAnnotations(SP->getAnnotations(), FuncId, -1);
+
if (F->hasSection()) {
StringRef SecName = F->getSection();
diff --git a/llvm/lib/Target/BPF/BTFDebug.h b/llvm/lib/Target/BPF/BTFDebug.h
index 2fdcf8500b7f..7c30675c553c 100644
--- a/llvm/lib/Target/BPF/BTFDebug.h
+++ b/llvm/lib/Target/BPF/BTFDebug.h
@@ -64,9 +64,11 @@ public:
class BTFTypeDerived : public BTFTypeBase {
const DIDerivedType *DTy;
bool NeedsFixup;
+ StringRef Name;
public:
BTFTypeDerived(const DIDerivedType *Ty, unsigned Tag, bool NeedsFixup);
+ BTFTypeDerived(unsigned NextTypeId, unsigned Tag, StringRef Name);
void completeType(BTFDebug &BDebug) override;
void emitType(MCStreamer &OS) override;
void setPointeeType(uint32_t PointeeType);
@@ -204,6 +206,28 @@ public:
void completeType(BTFDebug &BDebug) override;
};
+/// Handle decl tags.
+class BTFTypeDeclTag : public BTFTypeBase {
+ uint32_t Info;
+ StringRef Tag;
+
+public:
+ BTFTypeDeclTag(uint32_t BaseTypeId, int ComponentId, StringRef Tag);
+ uint32_t getSize() override { return BTFTypeBase::getSize() + 4; }
+ void completeType(BTFDebug &BDebug) override;
+ void emitType(MCStreamer &OS) override;
+};
+
+class BTFTypeTypeTag : public BTFTypeBase {
+ const DIDerivedType *DTy;
+ StringRef Tag;
+
+public:
+ BTFTypeTypeTag(uint32_t NextTypeId, StringRef Tag);
+ BTFTypeTypeTag(const DIDerivedType *DTy, StringRef Tag);
+ void completeType(BTFDebug &BDebug) override;
+};
+
/// String table.
class BTFStringTable {
/// String table size in bytes.
@@ -313,6 +337,10 @@ class BTFDebug : public DebugHandlerBase {
/// Generate types for function prototypes.
void processFuncPrototypes(const Function *);
+ /// Generate types for decl annotations.
+ void processDeclAnnotations(DINodeArray Annotations, uint32_t BaseTypeId,
+ int ComponentId);
+
/// Generate one field relocation record.
void generatePatchImmReloc(const MCSymbol *ORSym, uint32_t RootId,
const GlobalVariable *, bool IsAma);
diff --git a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
index 3a1492743bf4..3f643d47f934 100644
--- a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
+++ b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -18,8 +18,8 @@
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetRegistry.h"
#include <cstdint>
using namespace llvm;
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index 6687dbe25364..bacd00360f82 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -43,12 +43,14 @@ public:
unsigned getNumFixupKinds() const override { return 1; }
- bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const override;
};
} // end anonymous namespace
-bool BPFAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+bool BPFAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const {
if ((Count % 8) != 0)
return false;
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
index 8fb7d7e89f09..5a1e251cd29c 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -18,8 +18,8 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Host.h"
-#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
#include "BPFGenInstrInfo.inc"
diff --git a/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp b/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
index 49eb9ad62c56..d7cdcae916aa 100644
--- a/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
+++ b/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/BPFTargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp b/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp
index f2a381190fe7..ebc04b40d428 100644
--- a/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp
+++ b/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp
@@ -1,12 +1,12 @@
-//===-- CSKYAsmParser.cpp - Parse CSKY assembly to MCInst instructions --===//
+//===---- CSKYAsmParser.cpp - Parse CSKY assembly to MCInst instructions --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/CSKYInstPrinter.h"
#include "MCTargetDesc/CSKYMCExpr.h"
#include "MCTargetDesc/CSKYMCTargetDesc.h"
#include "TargetInfo/CSKYTargetInfo.h"
@@ -20,10 +20,14 @@
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "csky-asm-parser"
using namespace llvm;
@@ -32,6 +36,8 @@ struct CSKYOperand;
class CSKYAsmParser : public MCTargetAsmParser {
+ const MCRegisterInfo *MRI;
+
bool generateImmOutOfRangeError(OperandVector &Operands, uint64_t ErrorInfo,
int64_t Lower, int64_t Upper, Twine Msg);
@@ -52,6 +58,9 @@ class CSKYAsmParser : public MCTargetAsmParser {
OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) override;
+ bool processInstruction(MCInst &Inst, SMLoc IDLoc, OperandVector &Operands,
+ MCStreamer &Out);
+
// Auto-generated instruction matching functions
#define GET_ASSEMBLER_HEADER
#include "CSKYGenAsmMatcher.inc"
@@ -61,12 +70,18 @@ class CSKYAsmParser : public MCTargetAsmParser {
OperandMatchResultTy parseBaseRegImm(OperandVector &Operands);
OperandMatchResultTy parseCSKYSymbol(OperandVector &Operands);
OperandMatchResultTy parseConstpoolSymbol(OperandVector &Operands);
+ OperandMatchResultTy parseDataSymbol(OperandVector &Operands);
+ OperandMatchResultTy parsePSRFlag(OperandVector &Operands);
+ OperandMatchResultTy parseRegSeq(OperandVector &Operands);
+ OperandMatchResultTy parseRegList(OperandVector &Operands);
bool parseOperand(OperandVector &Operands, StringRef Mnemonic);
public:
enum CSKYMatchResultTy {
Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY,
+ Match_RequiresSameSrcAndDst,
+ Match_InvalidRegOutOfRange,
#define GET_OPERAND_DIAGNOSTIC_TYPES
#include "CSKYGenAsmMatcher.inc"
#undef GET_OPERAND_DIAGNOSTIC_TYPES
@@ -81,10 +96,14 @@ public:
/// Instances of this class represent a parsed machine instruction.
struct CSKYOperand : public MCParsedAsmOperand {
+
enum KindTy {
Token,
Register,
Immediate,
+ RegisterSeq,
+ CPOP,
+ RegisterList
} Kind;
struct RegOp {
@@ -95,11 +114,34 @@ struct CSKYOperand : public MCParsedAsmOperand {
const MCExpr *Val;
};
+ struct ConstpoolOp {
+ const MCExpr *Val;
+ };
+
+ struct RegSeqOp {
+ unsigned RegNumFrom;
+ unsigned RegNumTo;
+ };
+
+ struct RegListOp {
+ unsigned List1From = 0;
+ unsigned List1To = 0;
+ unsigned List2From = 0;
+ unsigned List2To = 0;
+ unsigned List3From = 0;
+ unsigned List3To = 0;
+ unsigned List4From = 0;
+ unsigned List4To = 0;
+ };
+
SMLoc StartLoc, EndLoc;
union {
StringRef Tok;
RegOp Reg;
ImmOp Imm;
+ ConstpoolOp CPool;
+ RegSeqOp RegSeq;
+ RegListOp RegList;
};
CSKYOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
@@ -113,18 +155,31 @@ public:
case Register:
Reg = o.Reg;
break;
+ case RegisterSeq:
+ RegSeq = o.RegSeq;
+ break;
+ case CPOP:
+ CPool = o.CPool;
+ break;
case Immediate:
Imm = o.Imm;
break;
case Token:
Tok = o.Tok;
break;
+ case RegisterList:
+ RegList = o.RegList;
+ break;
}
}
bool isToken() const override { return Kind == Token; }
bool isReg() const override { return Kind == Register; }
bool isImm() const override { return Kind == Immediate; }
+ bool isRegisterSeq() const { return Kind == RegisterSeq; }
+ bool isRegisterList() const { return Kind == RegisterList; }
+ bool isConstPoolOp() const { return Kind == CPOP; }
+
bool isMem() const override { return false; }
static bool evaluateConstantImm(const MCExpr *Expr, int64_t &Imm) {
@@ -163,29 +218,132 @@ public:
return IsConstantImm && isShiftedInt<num, shift>(Imm);
}
+ bool isUImm1() const { return isUImm<1>(); }
bool isUImm2() const { return isUImm<2>(); }
+ bool isUImm3() const { return isUImm<3>(); }
+ bool isUImm4() const { return isUImm<4>(); }
bool isUImm5() const { return isUImm<5>(); }
+ bool isUImm6() const { return isUImm<6>(); }
+ bool isUImm7() const { return isUImm<7>(); }
+ bool isUImm8() const { return isUImm<8>(); }
bool isUImm12() const { return isUImm<12>(); }
bool isUImm16() const { return isUImm<16>(); }
-
+ bool isUImm20() const { return isUImm<20>(); }
+ bool isUImm24() const { return isUImm<24>(); }
+
+ bool isOImm3() const { return isOImm<3>(); }
+ bool isOImm4() const { return isOImm<4>(); }
+ bool isOImm5() const { return isOImm<5>(); }
+ bool isOImm6() const { return isOImm<6>(); }
+ bool isOImm8() const { return isOImm<8>(); }
bool isOImm12() const { return isOImm<12>(); }
bool isOImm16() const { return isOImm<16>(); }
+ bool isSImm8() const { return isSImm<8>(); }
+
+ bool isUImm5Shift1() { return isUImm<5, 1>(); }
+ bool isUImm5Shift2() { return isUImm<5, 2>(); }
+ bool isUImm7Shift1() { return isUImm<7, 1>(); }
+ bool isUImm7Shift2() { return isUImm<7, 2>(); }
+ bool isUImm7Shift3() { return isUImm<7, 3>(); }
+ bool isUImm8Shift2() { return isUImm<8, 2>(); }
+ bool isUImm8Shift3() { return isUImm<8, 3>(); }
+ bool isUImm8Shift8() { return isUImm<8, 8>(); }
+ bool isUImm8Shift16() { return isUImm<8, 16>(); }
+ bool isUImm8Shift24() { return isUImm<8, 24>(); }
bool isUImm12Shift1() { return isUImm<12, 1>(); }
bool isUImm12Shift2() { return isUImm<12, 2>(); }
+ bool isUImm16Shift8() { return isUImm<16, 8>(); }
+ bool isUImm16Shift16() { return isUImm<16, 16>(); }
+ bool isUImm24Shift8() { return isUImm<24, 8>(); }
bool isSImm16Shift1() { return isSImm<16, 1>(); }
- bool isCSKYSymbol() const {
+ bool isCSKYSymbol() const { return isImm(); }
+
+ bool isConstpool() const { return isConstPoolOp(); }
+ bool isDataSymbol() const { return isConstPoolOp(); }
+
+ bool isSPOperand() const {
+ if (!isReg())
+ return false;
+ return getReg() == CSKY::R14;
+ }
+
+ bool isPSRFlag() const {
int64_t Imm;
- // Must be of 'immediate' type but not a constant.
- return isImm() && !evaluateConstantImm(getImm(), Imm);
+ // Must be of 'immediate' type and a constant.
+ if (!isImm() || !evaluateConstantImm(getImm(), Imm))
+ return false;
+
+ return isUInt<5>(Imm);
+ }
+
+ template <unsigned MIN, unsigned MAX> bool isRegSeqTemplate() const {
+ if (!isRegisterSeq())
+ return false;
+
+ std::pair<unsigned, unsigned> regSeq = getRegSeq();
+
+ return MIN <= regSeq.first && regSeq.first <= regSeq.second &&
+ regSeq.second <= MAX;
+ }
+
+ bool isRegSeq() const { return isRegSeqTemplate<CSKY::R0, CSKY::R31>(); }
+
+ static bool isLegalRegList(unsigned from, unsigned to) {
+ if (from == 0 && to == 0)
+ return true;
+
+ if (from == to) {
+ if (from != CSKY::R4 && from != CSKY::R15 && from != CSKY::R16 &&
+ from != CSKY::R28)
+ return false;
+
+ return true;
+ } else {
+ if (from != CSKY::R4 && from != CSKY::R16)
+ return false;
+
+ if (from == CSKY::R4 && to > CSKY::R4 && to < CSKY::R12)
+ return true;
+ else if (from == CSKY::R16 && to > CSKY::R16 && to < CSKY::R18)
+ return true;
+ else
+ return false;
+ }
+ }
+
+ bool isRegList() const {
+ if (!isRegisterList())
+ return false;
+
+ auto regList = getRegList();
+
+ if (!isLegalRegList(regList.List1From, regList.List1To))
+ return false;
+ if (!isLegalRegList(regList.List2From, regList.List2To))
+ return false;
+ if (!isLegalRegList(regList.List3From, regList.List3To))
+ return false;
+ if (!isLegalRegList(regList.List4From, regList.List4To))
+ return false;
+
+ return true;
}
- bool isConstpoolSymbol() const {
+ bool isExtImm6() {
+ if (!isImm())
+ return false;
+
int64_t Imm;
- // Must be of 'immediate' type but not a constant.
- return isImm() && !evaluateConstantImm(getImm(), Imm);
+ bool IsConstantImm = evaluateConstantImm(getImm(), Imm);
+ if (!IsConstantImm)
+ return false;
+
+ int uimm4 = Imm & 0xf;
+
+ return isShiftedUInt<6, 0>(Imm) && uimm4 >= 0 && uimm4 <= 14;
}
/// Gets location of the first token of this operand.
@@ -198,23 +356,64 @@ public:
return Reg.RegNum;
}
+ std::pair<unsigned, unsigned> getRegSeq() const {
+ assert(Kind == RegisterSeq && "Invalid type access!");
+ return std::pair<unsigned, unsigned>(RegSeq.RegNumFrom, RegSeq.RegNumTo);
+ }
+
+ RegListOp getRegList() const {
+ assert(Kind == RegisterList && "Invalid type access!");
+ return RegList;
+ }
+
const MCExpr *getImm() const {
assert(Kind == Immediate && "Invalid type access!");
return Imm.Val;
}
+ const MCExpr *getConstpoolOp() const {
+ assert(Kind == CPOP && "Invalid type access!");
+ return CPool.Val;
+ }
+
StringRef getToken() const {
assert(Kind == Token && "Invalid type access!");
return Tok;
}
void print(raw_ostream &OS) const override {
+ auto RegName = [](unsigned Reg) {
+ if (Reg)
+ return CSKYInstPrinter::getRegisterName(Reg);
+ else
+ return "noreg";
+ };
+
switch (Kind) {
+ case CPOP:
+ OS << *getConstpoolOp();
+ break;
case Immediate:
OS << *getImm();
break;
- case Register:
- OS << "<register x" << getReg() << ">";
+ case KindTy::Register:
+ OS << "<register " << RegName(getReg()) << ">";
+ break;
+ case RegisterSeq:
+ OS << "<register-seq ";
+ OS << RegName(getRegSeq().first) << "-" << RegName(getRegSeq().second)
+ << ">";
+ break;
+ case RegisterList:
+ OS << "<register-list ";
+ OS << RegName(getRegList().List1From) << "-"
+ << RegName(getRegList().List1To) << ",";
+ OS << RegName(getRegList().List2From) << "-"
+ << RegName(getRegList().List2To) << ",";
+ OS << RegName(getRegList().List3From) << "-"
+ << RegName(getRegList().List3To) << ",";
+ OS << RegName(getRegList().List4From) << "-"
+ << RegName(getRegList().List4To);
break;
case Token:
OS << "'" << getToken() << "'";
@@ -239,6 +438,51 @@ public:
return Op;
}
+ static std::unique_ptr<CSKYOperand> createRegSeq(unsigned RegNoFrom,
+ unsigned RegNoTo, SMLoc S) {
+ auto Op = std::make_unique<CSKYOperand>(RegisterSeq);
+ Op->RegSeq.RegNumFrom = RegNoFrom;
+ Op->RegSeq.RegNumTo = RegNoTo;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
+ static std::unique_ptr<CSKYOperand>
+ createRegList(SmallVector<unsigned, 4> reglist, SMLoc S) {
+ auto Op = std::make_unique<CSKYOperand>(RegisterList);
+ Op->RegList.List1From = 0;
+ Op->RegList.List1To = 0;
+ Op->RegList.List2From = 0;
+ Op->RegList.List2To = 0;
+ Op->RegList.List3From = 0;
+ Op->RegList.List3To = 0;
+ Op->RegList.List4From = 0;
+ Op->RegList.List4To = 0;
+
+ for (unsigned i = 0; i < reglist.size(); i += 2) {
+ if (Op->RegList.List1From == 0) {
+ Op->RegList.List1From = reglist[i];
+ Op->RegList.List1To = reglist[i + 1];
+ } else if (Op->RegList.List2From == 0) {
+ Op->RegList.List2From = reglist[i];
+ Op->RegList.List2To = reglist[i + 1];
+ } else if (Op->RegList.List3From == 0) {
+ Op->RegList.List3From = reglist[i];
+ Op->RegList.List3To = reglist[i + 1];
+ } else if (Op->RegList.List4From == 0) {
+ Op->RegList.List4From = reglist[i];
+ Op->RegList.List4To = reglist[i + 1];
+ } else {
+ assert(0);
+ }
+ }
+
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
static std::unique_ptr<CSKYOperand> createImm(const MCExpr *Val, SMLoc S,
SMLoc E) {
auto Op = std::make_unique<CSKYOperand>(Immediate);
@@ -248,6 +492,15 @@ public:
return Op;
}
+ static std::unique_ptr<CSKYOperand> createConstpoolOp(const MCExpr *Val,
+ SMLoc S, SMLoc E) {
+ auto Op = std::make_unique<CSKYOperand>(CPOP);
+ Op->CPool.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
void addExpr(MCInst &Inst, const MCExpr *Expr) const {
assert(Expr && "Expr shouldn't be null!");
if (auto *CE = dyn_cast<MCConstantExpr>(Expr))
@@ -266,6 +519,70 @@ public:
assert(N == 1 && "Invalid number of operands!");
addExpr(Inst, getImm());
}
+
+ void addConstpoolOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createExpr(getConstpoolOp()));
+ }
+
+ void addRegSeqOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ auto regSeq = getRegSeq();
+
+ Inst.addOperand(MCOperand::createReg(regSeq.first));
+ Inst.addOperand(MCOperand::createReg(regSeq.second));
+ }
+
+ static unsigned getListValue(unsigned ListFrom, unsigned ListTo) {
+ if (ListFrom == ListTo && ListFrom == CSKY::R15)
+ return (1 << 4);
+ else if (ListFrom == ListTo && ListFrom == CSKY::R28)
+ return (1 << 8);
+ else if (ListFrom == CSKY::R4)
+ return ListTo - ListFrom + 1;
+ else if (ListFrom == CSKY::R16)
+ return ((ListTo - ListFrom + 1) << 5);
+ else
+ return 0;
+ }
+
+ void addRegListOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ auto regList = getRegList();
+
+ unsigned V = 0;
+
+ unsigned T = getListValue(regList.List1From, regList.List1To);
+ if (T != 0)
+ V = V | T;
+
+ T = getListValue(regList.List2From, regList.List2To);
+ if (T != 0)
+ V = V | T;
+
+ T = getListValue(regList.List3From, regList.List3To);
+ if (T != 0)
+ V = V | T;
+
+ T = getListValue(regList.List4From, regList.List4To);
+ if (T != 0)
+ V = V | T;
+
+ Inst.addOperand(MCOperand::createImm(V));
+ }
+
+ bool isValidForTie(const CSKYOperand &Other) const {
+ if (Kind != Other.Kind)
+ return false;
+
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unexpected kind");
+ return false;
+ case Register:
+ return Reg.RegNum == Other.Reg.RegNum;
+ }
+ }
};
} // end anonymous namespace.
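
For reference, a standalone sketch of the immediate that addRegListOperands builds for a register-list operand such as "r4-r11, r15, r16-r17, r28" (register numbers are plain integers here instead of the CSKY::R* enum values; listValue mirrors getListValue above):

#include <cassert>
#include <cstdio>

static unsigned listValue(unsigned From, unsigned To) {
  if (From == To && From == 15) return 1u << 4;   // r15 alone: bit 4
  if (From == To && From == 28) return 1u << 8;   // r28 alone: bit 8
  if (From == 4)  return To - From + 1;           // count of r4..To (1..8)
  if (From == 16) return (To - From + 1) << 5;    // count of r16..To, << 5
  return 0;
}

int main() {
  // "r4-r11, r15, r16-r17, r28"
  unsigned Imm = listValue(4, 11) | listValue(15, 15) |
                 listValue(16, 17) | listValue(28, 28);
  assert(Imm == 0x158);                           // 8 | 0x10 | 0x40 | 0x100
  std::printf("list immediate = 0x%x\n", Imm);
  return 0;
}
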
@@ -299,9 +616,7 @@ bool CSKYAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
default:
break;
case Match_Success:
- Inst.setLoc(IDLoc);
- Out.emitInstruction(Inst, getSTI());
- return false;
+ return processInstruction(Inst, IDLoc, Operands, Out);
case Match_MissingFeature: {
assert(MissingFeatures.any() && "Unknown missing features!");
ListSeparator LS;
@@ -347,26 +662,79 @@ bool CSKYAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
switch (Result) {
default:
break;
+ case Match_InvalidSImm8:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 7),
+ (1 << 7) - 1);
+ case Match_InvalidOImm3:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 3));
+ case Match_InvalidOImm4:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 4));
+ case Match_InvalidOImm5:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 5));
+ case Match_InvalidOImm6:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 6));
+ case Match_InvalidOImm8:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 8));
case Match_InvalidOImm12:
return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 12));
case Match_InvalidOImm16:
return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 16));
+ case Match_InvalidUImm1:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 1) - 1);
case Match_InvalidUImm2:
return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 2) - 1);
+ case Match_InvalidUImm3:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 3) - 1);
+ case Match_InvalidUImm4:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 4) - 1);
case Match_InvalidUImm5:
return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 5) - 1);
+ case Match_InvalidUImm6:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 6) - 1);
+ case Match_InvalidUImm7:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 7) - 1);
+ case Match_InvalidUImm8:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 8) - 1);
case Match_InvalidUImm12:
return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 12) - 1);
+ case Match_InvalidUImm16:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 16) - 1);
+ case Match_InvalidUImm5Shift1:
+ return generateImmOutOfRangeError(
+ Operands, ErrorInfo, 0, (1 << 5) - 2,
+ "immediate must be a multiple of 2 bytes in the range");
case Match_InvalidUImm12Shift1:
return generateImmOutOfRangeError(
Operands, ErrorInfo, 0, (1 << 12) - 2,
"immediate must be a multiple of 2 bytes in the range");
+ case Match_InvalidUImm5Shift2:
+ return generateImmOutOfRangeError(
+ Operands, ErrorInfo, 0, (1 << 5) - 4,
+ "immediate must be a multiple of 4 bytes in the range");
+ case Match_InvalidUImm7Shift1:
+ return generateImmOutOfRangeError(
+ Operands, ErrorInfo, 0, (1 << 7) - 2,
+ "immediate must be a multiple of 2 bytes in the range");
+ case Match_InvalidUImm7Shift2:
+ return generateImmOutOfRangeError(
+ Operands, ErrorInfo, 0, (1 << 7) - 4,
+ "immediate must be a multiple of 4 bytes in the range");
+ case Match_InvalidUImm8Shift2:
+ return generateImmOutOfRangeError(
+ Operands, ErrorInfo, 0, (1 << 8) - 4,
+ "immediate must be a multiple of 4 bytes in the range");
+ case Match_InvalidUImm8Shift3:
+ return generateImmOutOfRangeError(
+ Operands, ErrorInfo, 0, (1 << 8) - 8,
+ "immediate must be a multiple of 8 bytes in the range");
+ case Match_InvalidUImm8Shift8:
+ return generateImmOutOfRangeError(
+ Operands, ErrorInfo, 0, (1 << 8) - 256,
+ "immediate must be a multiple of 256 bytes in the range");
case Match_InvalidUImm12Shift2:
return generateImmOutOfRangeError(
Operands, ErrorInfo, 0, (1 << 12) - 4,
"immediate must be a multiple of 4 bytes in the range");
- case Match_InvalidUImm16:
- return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 16) - 1);
case Match_InvalidCSKYSymbol: {
SMLoc ErrorLoc = ((CSKYOperand &)*Operands[ErrorInfo]).getStartLoc();
return Error(ErrorLoc, "operand must be a symbol name");
@@ -375,15 +743,68 @@ bool CSKYAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SMLoc ErrorLoc = ((CSKYOperand &)*Operands[ErrorInfo]).getStartLoc();
return Error(ErrorLoc, "operand must be a constpool symbol name");
}
+ case Match_InvalidPSRFlag: {
+ SMLoc ErrorLoc = ((CSKYOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc, "psrset operand is not valid");
+ }
+ case Match_InvalidRegSeq: {
+ SMLoc ErrorLoc = ((CSKYOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc, "Register sequence is not valid");
}
-
+ case Match_InvalidRegOutOfRange: {
+ SMLoc ErrorLoc = ((CSKYOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc, "register is out of range");
+ }
+ case Match_InvalidSPOperand: {
+ SMLoc ErrorLoc = ((CSKYOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc, "operand must be sp register");
+ }
+ case Match_RequiresSameSrcAndDst: {
+ SMLoc ErrorLoc = ((CSKYOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc, "src and dst operand must be same");
+ }
+ case Match_InvalidRegList: {
+ SMLoc ErrorLoc = ((CSKYOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc, "invalid register list");
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Result = " << Result);
llvm_unreachable("Unknown match type detected!");
}
+bool CSKYAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
+ OperandVector &Operands,
+ MCStreamer &Out) {
+
+ if (Inst.getOpcode() == CSKY::LDQ32 || Inst.getOpcode() == CSKY::STQ32) {
+ if (Inst.getOperand(1).getReg() != CSKY::R4 ||
+ Inst.getOperand(2).getReg() != CSKY::R7) {
+ return Error(IDLoc, "Register sequence is not valid. 'r4-r7' expected");
+ }
+ Inst.setOpcode(Inst.getOpcode() == CSKY::LDQ32 ? CSKY::LDM32 : CSKY::STM32);
+ Out.emitInstruction(Inst, getSTI());
+ return false;
+ } else if (Inst.getOpcode() == CSKY::SEXT32 ||
+ Inst.getOpcode() == CSKY::ZEXT32) {
+ if (Inst.getOperand(2).getImm() < Inst.getOperand(3).getImm())
+ return Error(IDLoc, "msb must be greater or equal to lsb");
+ } else if (Inst.getOpcode() == CSKY::INS32) {
+ if (Inst.getOperand(3).getImm() < Inst.getOperand(4).getImm())
+ return Error(IDLoc, "msb must be greater or equal to lsb");
+ } else if (Inst.getOpcode() == CSKY::IDLY32) {
+ if (Inst.getOperand(0).getImm() > 32 || Inst.getOperand(0).getImm() < 0)
+ return Error(IDLoc, "n must be in range [0,32]");
+ }
+
+ Out.emitInstruction(Inst, getSTI());
+ return false;
+}
+
// Attempts to match Name as a register (either using the default name or
// alternative ABI names), setting RegNo to the matching register. Upon
// failure, returns true and sets RegNo to 0.
-static bool matchRegisterNameHelper(MCRegister &RegNo, StringRef Name) {
+static bool matchRegisterNameHelper(const MCSubtargetInfo &STI,
+ MCRegister &RegNo, StringRef Name) {
RegNo = MatchRegisterName(Name);
if (RegNo == CSKY::NoRegister)
@@ -399,12 +820,12 @@ bool CSKYAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
EndLoc = Tok.getEndLoc();
StringRef Name = getLexer().getTok().getIdentifier();
- if (!matchRegisterNameHelper((MCRegister &)RegNo, Name)) {
+ if (!matchRegisterNameHelper(getSTI(), (MCRegister &)RegNo, Name)) {
getParser().Lex(); // Eat identifier token.
return false;
}
- return Error(StartLoc, "invalid register name");
+ return MatchOperand_NoMatch;
}
OperandMatchResultTy CSKYAsmParser::parseRegister(OperandVector &Operands) {
@@ -418,7 +839,7 @@ OperandMatchResultTy CSKYAsmParser::parseRegister(OperandVector &Operands) {
StringRef Name = getLexer().getTok().getIdentifier();
MCRegister RegNo;
- if (matchRegisterNameHelper((MCRegister &)RegNo, Name))
+ if (matchRegisterNameHelper(getSTI(), (MCRegister &)RegNo, Name))
return MatchOperand_NoMatch;
getLexer().Lex();
@@ -439,7 +860,13 @@ OperandMatchResultTy CSKYAsmParser::parseBaseRegImm(OperandVector &Operands) {
if (parseRegister(Operands) != MatchOperand_Success) {
getLexer().UnLex(Tok);
Operands.pop_back();
- return MatchOperand_ParseFail;
+ return MatchOperand_NoMatch;
+ }
+
+ if (getLexer().is(AsmToken::RParen)) {
+ Operands.push_back(CSKYOperand::createToken(")", getLoc()));
+ getParser().Lex(); // Eat ')'
+ return MatchOperand_Success;
}
if (getLexer().isNot(AsmToken::Comma)) {
@@ -495,8 +922,10 @@ OperandMatchResultTy CSKYAsmParser::parseImmediate(OperandVector &Operands) {
const MCExpr *IdVal;
SMLoc S = getLoc();
- if (getParser().parseExpression(IdVal))
+ if (getParser().parseExpression(IdVal)) {
+ Error(getLoc(), "unknown expression");
return MatchOperand_ParseFail;
+ }
SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
Operands.push_back(CSKYOperand::createImm(IdVal, S, E));
@@ -517,17 +946,26 @@ bool CSKYAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
return true;
// Attempt to parse token as register
- if (parseRegister(Operands) == MatchOperand_Success)
+ auto Res = parseRegister(Operands);
+ if (Res == MatchOperand_Success)
return false;
+ else if (Res == MatchOperand_ParseFail)
+ return true;
// Attempt to parse token as (register, imm)
- if (getLexer().is(AsmToken::LParen))
- if (parseBaseRegImm(Operands) == MatchOperand_Success)
+ if (getLexer().is(AsmToken::LParen)) {
+ Res = parseBaseRegImm(Operands);
+ if (Res == MatchOperand_Success)
return false;
+ else if (Res == MatchOperand_ParseFail)
+ return true;
+ }
- // Attempt to parse token as a imm.
- if (parseImmediate(Operands) == MatchOperand_Success)
+ Res = parseImmediate(Operands);
+ if (Res == MatchOperand_Success)
return false;
+ else if (Res == MatchOperand_ParseFail)
+ return true;
// Finally we have exhausted all options and must declare defeat.
Error(getLoc(), "unknown operand");
@@ -537,16 +975,20 @@ bool CSKYAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
OperandMatchResultTy CSKYAsmParser::parseCSKYSymbol(OperandVector &Operands) {
SMLoc S = getLoc();
SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
+ const MCExpr *Res;
if (getLexer().getKind() != AsmToken::Identifier)
return MatchOperand_NoMatch;
StringRef Identifier;
- if (getParser().parseIdentifier(Identifier))
+ AsmToken Tok = getLexer().getTok();
+
+ if (getParser().parseIdentifier(Identifier)) {
+ Error(getLoc(), "unknown identifier");
return MatchOperand_ParseFail;
+ }
CSKYMCExpr::VariantKind Kind = CSKYMCExpr::VK_CSKY_None;
-
if (Identifier.consume_back("@GOT"))
Kind = CSKYMCExpr::VK_CSKY_GOT;
else if (Identifier.consume_back("@GOTOFF"))
@@ -555,44 +997,377 @@ OperandMatchResultTy CSKYAsmParser::parseCSKYSymbol(OperandVector &Operands) {
Kind = CSKYMCExpr::VK_CSKY_PLT;
else if (Identifier.consume_back("@GOTPC"))
Kind = CSKYMCExpr::VK_CSKY_GOTPC;
+ else if (Identifier.consume_back("@TLSGD32"))
+ Kind = CSKYMCExpr::VK_CSKY_TLSGD;
+ else if (Identifier.consume_back("@GOTTPOFF"))
+ Kind = CSKYMCExpr::VK_CSKY_TLSIE;
+ else if (Identifier.consume_back("@TPOFF"))
+ Kind = CSKYMCExpr::VK_CSKY_TLSLE;
+ else if (Identifier.consume_back("@TLSLDM32"))
+ Kind = CSKYMCExpr::VK_CSKY_TLSLDM;
+ else if (Identifier.consume_back("@TLSLDO32"))
+ Kind = CSKYMCExpr::VK_CSKY_TLSLDO;
+
+ MCSymbol *Sym = getContext().getInlineAsmLabel(Identifier);
+
+ if (!Sym)
+ Sym = getContext().getOrCreateSymbol(Identifier);
+
+ if (Sym->isVariable()) {
+ const MCExpr *V = Sym->getVariableValue(/*SetUsed=*/false);
+ if (!isa<MCSymbolRefExpr>(V)) {
+ getLexer().UnLex(Tok); // Put back if it's not a bare symbol.
+ Error(getLoc(), "unknown symbol");
+ return MatchOperand_ParseFail;
+ }
+ Res = V;
+ } else
+ Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
- MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
- const MCExpr *Res =
- MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+ MCBinaryExpr::Opcode Opcode;
+ switch (getLexer().getKind()) {
+ default:
+ if (Kind != CSKYMCExpr::VK_CSKY_None)
+ Res = CSKYMCExpr::create(Res, Kind, getContext());
- if (Kind != CSKYMCExpr::VK_CSKY_None)
- Res = CSKYMCExpr::create(Res, Kind, getContext());
+ Operands.push_back(CSKYOperand::createImm(Res, S, E));
+ return MatchOperand_Success;
+ case AsmToken::Plus:
+ Opcode = MCBinaryExpr::Add;
+ break;
+ case AsmToken::Minus:
+ Opcode = MCBinaryExpr::Sub;
+ break;
+ }
+ getLexer().Lex(); // eat + or -
+
+ const MCExpr *Expr;
+ if (getParser().parseExpression(Expr)) {
+ Error(getLoc(), "unknown expression");
+ return MatchOperand_ParseFail;
+ }
+ Res = MCBinaryExpr::create(Opcode, Res, Expr, getContext());
Operands.push_back(CSKYOperand::createImm(Res, S, E));
return MatchOperand_Success;
}
+OperandMatchResultTy CSKYAsmParser::parseDataSymbol(OperandVector &Operands) {
+ SMLoc S = getLoc();
+ SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
+ const MCExpr *Res;
+
+ if (getLexer().getKind() != AsmToken::LBrac)
+ return MatchOperand_NoMatch;
+
+ getLexer().Lex(); // Eat '['.
+
+ if (getLexer().getKind() != AsmToken::Identifier) {
+ const MCExpr *Expr;
+ if (getParser().parseExpression(Expr)) {
+ Error(getLoc(), "unknown expression");
+ return MatchOperand_ParseFail;
+ }
+
+ if (getLexer().getKind() != AsmToken::RBrac) {
+ Error(getLoc(), "expected ]");
+ return MatchOperand_ParseFail;
+ }
+
+ getLexer().Lex(); // Eat ']'.
+
+ Operands.push_back(CSKYOperand::createConstpoolOp(Expr, S, E));
+ return MatchOperand_Success;
+ }
+
+ AsmToken Tok = getLexer().getTok();
+ StringRef Identifier;
+
+ if (getParser().parseIdentifier(Identifier)) {
+ Error(getLoc(), "unknown identifier " + Identifier);
+ return MatchOperand_ParseFail;
+ }
+
+ CSKYMCExpr::VariantKind Kind = CSKYMCExpr::VK_CSKY_None;
+ if (Identifier.consume_back("@GOT"))
+ Kind = CSKYMCExpr::VK_CSKY_GOT_IMM18_BY4;
+ else if (Identifier.consume_back("@PLT"))
+ Kind = CSKYMCExpr::VK_CSKY_PLT_IMM18_BY4;
+
+ MCSymbol *Sym = getContext().getInlineAsmLabel(Identifier);
+
+ if (!Sym)
+ Sym = getContext().getOrCreateSymbol(Identifier);
+
+ if (Sym->isVariable()) {
+ const MCExpr *V = Sym->getVariableValue(/*SetUsed=*/false);
+ if (!isa<MCSymbolRefExpr>(V)) {
+ getLexer().UnLex(Tok); // Put back if it's not a bare symbol.
+ Error(getLoc(), "unknown symbol");
+ return MatchOperand_ParseFail;
+ }
+ Res = V;
+ } else {
+ Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+ }
+
+ MCBinaryExpr::Opcode Opcode;
+ switch (getLexer().getKind()) {
+ default:
+ Error(getLoc(), "unknown symbol");
+ return MatchOperand_ParseFail;
+ case AsmToken::RBrac:
+
+ getLexer().Lex(); // Eat ']'.
+
+ if (Kind != CSKYMCExpr::VK_CSKY_None)
+ Res = CSKYMCExpr::create(Res, Kind, getContext());
+
+ Operands.push_back(CSKYOperand::createConstpoolOp(Res, S, E));
+ return MatchOperand_Success;
+ case AsmToken::Plus:
+ Opcode = MCBinaryExpr::Add;
+ break;
+ case AsmToken::Minus:
+ Opcode = MCBinaryExpr::Sub;
+ break;
+ }
+
+ getLexer().Lex(); // eat + or -
+
+ const MCExpr *Expr;
+ if (getParser().parseExpression(Expr)) {
+ Error(getLoc(), "unknown expression");
+ return MatchOperand_ParseFail;
+ }
+
+ if (getLexer().getKind() != AsmToken::RBrac) {
+ Error(getLoc(), "expected ']'");
+ return MatchOperand_ParseFail;
+ }
+
+ getLexer().Lex(); // Eat ']'.
+
+ Res = MCBinaryExpr::create(Opcode, Res, Expr, getContext());
+ Operands.push_back(CSKYOperand::createConstpoolOp(Res, S, E));
+ return MatchOperand_Success;
+}
+
OperandMatchResultTy
CSKYAsmParser::parseConstpoolSymbol(OperandVector &Operands) {
SMLoc S = getLoc();
SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
+ const MCExpr *Res;
if (getLexer().getKind() != AsmToken::LBrac)
return MatchOperand_NoMatch;
getLexer().Lex(); // Eat '['.
- if (getLexer().getKind() != AsmToken::Identifier)
- return MatchOperand_NoMatch;
+ if (getLexer().getKind() != AsmToken::Identifier) {
+ const MCExpr *Expr;
+ if (getParser().parseExpression(Expr)) {
+ Error(getLoc(), "unknown expression");
+ return MatchOperand_ParseFail;
+ }
+
+ if (getLexer().getKind() != AsmToken::RBrac) {
+ Error(getLoc(), "expected ']'");
+ return MatchOperand_ParseFail;
+ }
+
+ getLexer().Lex(); // Eat ']'.
+
+ Operands.push_back(CSKYOperand::createConstpoolOp(Expr, S, E));
+ return MatchOperand_Success;
+ }
+ AsmToken Tok = getLexer().getTok();
StringRef Identifier;
- if (getParser().parseIdentifier(Identifier))
+
+ if (getParser().parseIdentifier(Identifier)) {
+ Error(getLoc(), "unknown identifier");
return MatchOperand_ParseFail;
+ }
- if (getLexer().getKind() != AsmToken::RBrac)
- return MatchOperand_NoMatch;
+ MCSymbol *Sym = getContext().getInlineAsmLabel(Identifier);
+
+ if (!Sym)
+ Sym = getContext().getOrCreateSymbol(Identifier);
+
+ if (Sym->isVariable()) {
+ const MCExpr *V = Sym->getVariableValue(/*SetUsed=*/false);
+ if (!isa<MCSymbolRefExpr>(V)) {
+ getLexer().UnLex(Tok); // Put back if it's not a bare symbol.
+ Error(getLoc(), "unknown symbol");
+ return MatchOperand_ParseFail;
+ }
+ Res = V;
+ } else {
+ Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+ }
+
+ MCBinaryExpr::Opcode Opcode;
+ switch (getLexer().getKind()) {
+ default:
+ Error(getLoc(), "unknown symbol");
+ return MatchOperand_ParseFail;
+ case AsmToken::RBrac:
+
+ getLexer().Lex(); // Eat ']'.
+
+ Operands.push_back(CSKYOperand::createConstpoolOp(Res, S, E));
+ return MatchOperand_Success;
+ case AsmToken::Plus:
+ Opcode = MCBinaryExpr::Add;
+ break;
+ case AsmToken::Minus:
+ Opcode = MCBinaryExpr::Sub;
+ break;
+ }
+
+ getLexer().Lex(); // eat + or -
+
+ const MCExpr *Expr;
+ if (getParser().parseExpression(Expr)) {
+ Error(getLoc(), "unknown expression");
+ return MatchOperand_ParseFail;
+ }
+
+ if (getLexer().getKind() != AsmToken::RBrac) {
+ Error(getLoc(), "expected ']'");
+ return MatchOperand_ParseFail;
+ }
getLexer().Lex(); // Eat ']'.
- MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
- const MCExpr *Res =
- MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
- Operands.push_back(CSKYOperand::createImm(Res, S, E));
+ Res = MCBinaryExpr::create(Opcode, Res, Expr, getContext());
+ Operands.push_back(CSKYOperand::createConstpoolOp(Res, S, E));
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy CSKYAsmParser::parsePSRFlag(OperandVector &Operands) {
+ SMLoc S = getLoc();
+ SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
+
+ unsigned Flag = 0;
+
+ while (getLexer().isNot(AsmToken::EndOfStatement)) {
+ StringRef Identifier;
+ if (getParser().parseIdentifier(Identifier)) {
+ Error(getLoc(), "unknown identifier " + Identifier);
+ return MatchOperand_ParseFail;
+ }
+
+ if (Identifier == "sie")
+ Flag = (1 << 4) | Flag;
+ else if (Identifier == "ee")
+ Flag = (1 << 3) | Flag;
+ else if (Identifier == "ie")
+ Flag = (1 << 2) | Flag;
+ else if (Identifier == "fe")
+ Flag = (1 << 1) | Flag;
+ else if (Identifier == "af")
+ Flag = (1 << 0) | Flag;
+ else {
+ Error(getLoc(), "expected " + Identifier);
+ return MatchOperand_ParseFail;
+ }
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ if (getLexer().is(AsmToken::Comma)) {
+ getLexer().Lex(); // eat ','
+ } else {
+ Error(getLoc(), "expected ,");
+ return MatchOperand_ParseFail;
+ }
+ }
+
+ Operands.push_back(
+ CSKYOperand::createImm(MCConstantExpr::create(Flag, getContext()), S, E));
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy CSKYAsmParser::parseRegSeq(OperandVector &Operands) {
+ SMLoc S = getLoc();
+
+ if (parseRegister(Operands) != MatchOperand_Success)
+ return MatchOperand_NoMatch;
+
+ auto Ry = Operands.back()->getReg();
+ Operands.pop_back();
+
+ if (getLexer().isNot(AsmToken::Minus)) {
+ Error(getLoc(), "expected '-'");
+ return MatchOperand_ParseFail;
+ }
+
+ getLexer().Lex(); // eat '-'
+
+ if (parseRegister(Operands) != MatchOperand_Success) {
+ Error(getLoc(), "invalid register");
+ return MatchOperand_ParseFail;
+ }
+
+ auto Rz = Operands.back()->getReg();
+ Operands.pop_back();
+
+ Operands.push_back(CSKYOperand::createRegSeq(Ry, Rz, S));
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy CSKYAsmParser::parseRegList(OperandVector &Operands) {
+ SMLoc S = getLoc();
+
+ SmallVector<unsigned, 4> reglist;
+
+ while (true) {
+
+ if (parseRegister(Operands) != MatchOperand_Success) {
+ Error(getLoc(), "invalid register");
+ return MatchOperand_ParseFail;
+ }
+
+ auto Ry = Operands.back()->getReg();
+ Operands.pop_back();
+
+ if (getLexer().is(AsmToken::Minus)) {
+ getLexer().Lex(); // eat '-'
+
+ if (parseRegister(Operands) != MatchOperand_Success) {
+ Error(getLoc(), "invalid register");
+ return MatchOperand_ParseFail;
+ }
+
+ auto Rz = Operands.back()->getReg();
+ Operands.pop_back();
+
+ reglist.push_back(Ry);
+ reglist.push_back(Rz);
+
+ if (getLexer().is(AsmToken::Comma))
+ getLexer().Lex(); // eat ','
+ else if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ } else if (getLexer().is(AsmToken::Comma)) {
+ reglist.push_back(Ry);
+ reglist.push_back(Ry);
+
+ getLexer().Lex(); // eat ','
+ } else if (getLexer().is(AsmToken::EndOfStatement)) {
+ reglist.push_back(Ry);
+ reglist.push_back(Ry);
+ break;
+ } else {
+ Error(getLoc(), "invalid register list");
+ return MatchOperand_ParseFail;
+ }
+ }
+
+ Operands.push_back(CSKYOperand::createRegList(reglist, S));
return MatchOperand_Success;
}
@@ -638,7 +1413,7 @@ OperandMatchResultTy CSKYAsmParser::tryParseRegister(unsigned &RegNo,
StringRef Name = getLexer().getTok().getIdentifier();
- if (matchRegisterNameHelper((MCRegister &)RegNo, Name))
+ if (matchRegisterNameHelper(getSTI(), (MCRegister &)RegNo, Name))
return MatchOperand_NoMatch;
getParser().Lex(); // Eat identifier token.
diff --git a/llvm/include/llvm/ExecutionEngine/OrcV1Deprecation.h b/llvm/lib/Target/CSKY/CSKY.h
index 7ed254b3ee04..357b1e96e606 100644
--- a/llvm/include/llvm/ExecutionEngine/OrcV1Deprecation.h
+++ b/llvm/lib/Target/CSKY/CSKY.h
@@ -1,4 +1,4 @@
-//===------ OrcV1Deprecation.h - Memory manager for MC-JIT ------*- C++ -*-===//
+//===-- CSKY.h - Top-level interface for CSKY--------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,17 +6,22 @@
//
//===----------------------------------------------------------------------===//
//
-// Tag for suppressing ORCv1 deprecation warnings.
+// This file contains the entry points for global functions defined in the LLVM
+// CSKY back-end.
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_EXECUTIONENGINE_ORCV1DEPRECATION_H
-#define LLVM_EXECUTIONENGINE_ORCV1DEPRECATION_H
+#ifndef LLVM_LIB_TARGET_CSKY_CSKY_H
+#define LLVM_LIB_TARGET_CSKY_CSKY_H
+
+#include "llvm/Target/TargetMachine.h"
namespace llvm {
+class CSKYTargetMachine;
+class FunctionPass;
-enum ORCv1DeprecationAcknowledgement { AcknowledgeORCv1Deprecation };
+FunctionPass *createCSKYISelDag(CSKYTargetMachine &TM);
} // namespace llvm
-#endif // LLVM_EXECUTIONENGINE_ORCV1DEPRECATION_H
+#endif // LLVM_LIB_TARGET_CSKY_CSKY_H
diff --git a/llvm/lib/Target/CSKY/CSKY.td b/llvm/lib/Target/CSKY/CSKY.td
index 854a8b5f22a2..e26781ca6aa1 100644
--- a/llvm/lib/Target/CSKY/CSKY.td
+++ b/llvm/lib/Target/CSKY/CSKY.td
@@ -9,10 +9,97 @@
include "llvm/Target/Target.td"
//===----------------------------------------------------------------------===//
+// CSKY subtarget features and instruction predicates.
+//===----------------------------------------------------------------------===//
+
+def FeatureBTST16 : SubtargetFeature<"btst16", "HasBTST16", "true",
+ "Use the 16-bit btsti instruction">;
+def HasBTST16 : Predicate<"Subtarget->hasBTST16()">,
+ AssemblerPredicate<(all_of FeatureBTST16),
+ "Use the 16-bit btsti instruction">;
+
+// Atomic Support
+def FeatureExtendLrw : SubtargetFeature<"elrw", "HasExtendLrw", "true",
+ "Use the extend LRW instruction">;
+def HasExtendLrw : Predicate<"Subtarget->hasExtendLrw()">,
+ AssemblerPredicate<(all_of FeatureExtendLrw),
+ "Use the extend LRW instruction">;
+
+def FeatureJAVA
+ : SubtargetFeature<"java", "HasJAVA", "true", "Enable java instructions">;
+def HasJAVA : Predicate<"Subtarget->hasJAVA()">,
+ AssemblerPredicate<(all_of FeatureJAVA),
+ "Enable java instructions">;
+
+def FeatureDoloop : SubtargetFeature<"doloop", "HasDoloop", "true",
+ "Enable doloop instructions">;
+def HasDoloop : Predicate<"Subtarget->hasDoloop()">,
+ AssemblerPredicate<(all_of FeatureDoloop),
+ "Enable doloop instructions">;
+
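+// Most of the ISA-level features below chain through their implied features,
+// e.g. e2 implies e1 and 2e3 implies e2.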
+def HasE1
+ : SubtargetFeature<"e1", "HasE1", "true", "Support CSKY e1 instructions",
+ [FeatureExtendLrw]>;
+def iHasE1 : Predicate<"Subtarget->hasE1()">,
+ AssemblerPredicate<(all_of HasE1),
+ "Support CSKY e1 instructions">;
+
+def HasE2
+ : SubtargetFeature<"e2", "HasE2", "true", "Support CSKY e2 instructions",
+ [HasE1]>;
+def iHasE2 : Predicate<"Subtarget->hasE2()">,
+ AssemblerPredicate<(all_of HasE2),
+ "Support CSKY e2 instructions">;
+
+def Has2E3 : SubtargetFeature<"2e3", "Has2E3", "true",
+ "Support CSKY 2e3 instructions", [HasE2]>;
+def iHas2E3 : Predicate<"Subtarget->has2E3()">,
+ AssemblerPredicate<(all_of Has2E3),
+ "Support CSKY 2e3 instructions">;
+
+def Has3E3r1 : SubtargetFeature<"3e3r1", "Has3E3r1", "true",
+ "Support CSKY 3e3r1 instructions">;
+def iHas3E3r1 : Predicate<"Subtarget->has3E3r1()">,
+ AssemblerPredicate<(all_of Has3E3r1),
+ "Support CSKY 3e3r1 instructions">;
+
+def Has3r2E3r3
+ : SubtargetFeature<"3e3r3", "Has3r2E3r3", "true",
+ "Support CSKY 3e3r3 instructions", [FeatureDoloop]>;
+def iHas3r2E3r3 : Predicate<"Subtarget->has3r2E3r3()">,
+ AssemblerPredicate<(all_of Has3r2E3r3),
+ "Support CSKY 3e3r3 instructions">;
+
+def Has3E7 : SubtargetFeature<"3e7", "Has3E7", "true",
+ "Support CSKY 3e7 instructions", [Has2E3]>;
+def iHas3E7 : Predicate<"Subtarget->has3E7()">,
+ AssemblerPredicate<(all_of Has3E7),
+ "Support CSKY 3e7 instructions">;
+
+def HasMP1E2 : SubtargetFeature<"mp1e2", "HasMP1E2", "true",
+ "Support CSKY mp1e2 instructions", [Has3E7]>;
+def iHasMP1E2 : Predicate<"Subtarget->hasMP1E2()">,
+ AssemblerPredicate<(all_of HasMP1E2),
+ "Support CSKY mp1e2 instructions">;
+
+def Has7E10 : SubtargetFeature<"7e10", "Has7E10", "true",
+ "Support CSKY 7e10 instructions", [Has3E7]>;
+def iHas7E10 : Predicate<"Subtarget->has7E10()">,
+ AssemblerPredicate<(all_of Has7E10),
+ "Support CSKY 7e10 instructions">;
+
+def Has10E60 : SubtargetFeature<"10e60", "Has10E60", "true",
+ "Support CSKY 10e60 instructions", [Has7E10]>;
+def iHas10E60 : Predicate<"Subtarget->has10E60()">,
+ AssemblerPredicate<(all_of Has10E60),
+ "Support CSKY 10e60 instructions">;
+
+//===----------------------------------------------------------------------===//
// Registers, calling conventions, instruction descriptions.
//===----------------------------------------------------------------------===//
include "CSKYRegisterInfo.td"
+include "CSKYCallingConv.td"
include "CSKYInstrInfo.td"
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp
new file mode 100644
index 000000000000..1c38c5d1fde6
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp
@@ -0,0 +1,58 @@
+//===-- CSKYAsmPrinter.cpp - CSKY LLVM assembly writer --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the CSKY assembly language.
+//
+//===----------------------------------------------------------------------===//
+#include "CSKYAsmPrinter.h"
+#include "CSKY.h"
+#include "CSKYTargetMachine.h"
+#include "MCTargetDesc/CSKYInstPrinter.h"
+#include "MCTargetDesc/CSKYMCExpr.h"
+#include "TargetInfo/CSKYTargetInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "csky-asm-printer"
+
+CSKYAsmPrinter::CSKYAsmPrinter(llvm::TargetMachine &TM,
+ std::unique_ptr<llvm::MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this) {}
+
+bool CSKYAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &MF.getSubtarget<CSKYSubtarget>();
+ return AsmPrinter::runOnMachineFunction(MF);
+}
+
+// Simple pseudo-instructions have their lowering (with expansion to real
+// instructions) auto-generated.
+#include "CSKYGenMCPseudoLowering.inc"
+
+void CSKYAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ // Do any auto-generated pseudo lowerings.
+ if (emitPseudoExpansionLowering(*OutStreamer, MI))
+ return;
+
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+ EmitToStreamer(*OutStreamer, TmpInst);
+}
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeCSKYAsmPrinter() {
+ RegisterAsmPrinter<CSKYAsmPrinter> X(getTheCSKYTarget());
+}
diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.h b/llvm/lib/Target/CSKY/CSKYAsmPrinter.h
new file mode 100644
index 000000000000..f0f5d8657c04
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.h
@@ -0,0 +1,40 @@
+//===-- CSKYAsmPrinter.h - CSKY implementation of AsmPrinter ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_CSKYASMPRINTER_H
+#define LLVM_LIB_TARGET_CSKY_CSKYASMPRINTER_H
+
+#include "CSKYMCInstLower.h"
+#include "CSKYSubtarget.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCDirectives.h"
+
+namespace llvm {
+class LLVM_LIBRARY_VISIBILITY CSKYAsmPrinter : public AsmPrinter {
+ CSKYMCInstLower MCInstLowering;
+
+ const CSKYSubtarget *Subtarget;
+
+public:
+ explicit CSKYAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer);
+
+ StringRef getPassName() const override { return "CSKY Assembly Printer"; }
+
+ /// tblgen'erated driver function for lowering simple MI->MC
+ /// pseudo instructions.
+ bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+ const MachineInstr *MI);
+
+ void emitInstruction(const MachineInstr *MI) override;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_CSKY_CSKYASMPRINTER_H
diff --git a/llvm/lib/Target/CSKY/CSKYCallingConv.h b/llvm/lib/Target/CSKY/CSKYCallingConv.h
new file mode 100644
index 000000000000..f1048f86264b
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYCallingConv.h
@@ -0,0 +1,63 @@
+//=== CSKYCallingConv.h - CSKY Custom Calling Convention Routines -*-C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the CSKY Calling Convention that
+// aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_CSKYCALLINGCONV_H
+#define LLVM_LIB_TARGET_CSKY_CSKYCALLINGCONV_H
+
+#include "CSKY.h"
+#include "CSKYSubtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+static bool CC_CSKY_ABIV2_SOFT_64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+
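+  // Under the soft-float ABI an f64 is passed in a pair of GPRs when possible:
+  // allocate the first half here; the second half takes the next GPR or a
+  // 4-byte stack slot. If no GPR is free, the whole value goes on the stack.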
+ static const MCPhysReg ArgGPRs[] = {CSKY::R0, CSKY::R1, CSKY::R2, CSKY::R3};
+ Register Reg = State.AllocateReg(ArgGPRs);
+ LocVT = MVT::i32;
+ if (!Reg) {
+ unsigned StackOffset = State.AllocateStack(8, Align(4));
+ State.addLoc(
+ CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
+ return true;
+ }
+ if (!State.AllocateReg(ArgGPRs))
+ State.AllocateStack(4, Align(4));
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+}
+
+static bool Ret_CSKY_ABIV2_SOFT_64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+
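+  // An f64 is returned in the R0/R1 pair; give up (return false) if either
+  // register is unavailable.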
+ static const MCPhysReg ArgGPRs[] = {CSKY::R0, CSKY::R1};
+ Register Reg = State.AllocateReg(ArgGPRs);
+ LocVT = MVT::i32;
+ if (!Reg)
+ return false;
+
+ if (!State.AllocateReg(ArgGPRs))
+ return false;
+
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+}
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/CSKY/CSKYCallingConv.td b/llvm/lib/Target/CSKY/CSKYCallingConv.td
new file mode 100644
index 000000000000..87e2e6b9dc31
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYCallingConv.td
@@ -0,0 +1,82 @@
+//===-- CSKYCallingConv.td - Calling Conventions CSKY ----*- tablegen -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the CSKY architecture.
+//
+//===----------------------------------------------------------------------===//
+
+def CSR_I32 : CalleeSavedRegs<(add R8, R15, (sequence "R%u", 4, 7),
+ (sequence "R%u", 9, 11), (sequence "R%u", 16, 17), R28)>;
+def CSR_GPR_FPR32 : CalleeSavedRegs<(add CSR_I32, (sequence "F%u_32", 8, 15))>;
+def CSR_GPR_FPR64 : CalleeSavedRegs<(add CSR_I32,
+ (sequence "F%u_64", 8, 15))>;
+
+// Interrupt handler needs to save/restore all registers that are used,
+// both Caller and Callee saved registers.
+def CSR_GPR_ISR : CalleeSavedRegs<(add R8, R15,
+ (sequence "R%u", 0, 3),
+ (sequence "R%u", 4, 7),
+ (sequence "R%u", 9, 13),
+ (sequence "R%u", 16, 31))>;
+
+def CSR_GPR_FPR32_ISR: CalleeSavedRegs<(add CSR_GPR_ISR,
+ (sequence "F%u_32", 0, 15))>;
+def CSR_GPR_FPR64_ISR: CalleeSavedRegs<(add CSR_GPR_ISR,
+ (sequence "F%u_64", 0, 15))>;
+
+def CSR_GPR_FPR32v3_ISR: CalleeSavedRegs<(add CSR_GPR_FPR32_ISR,
+ (sequence "F%u_32", 16, 31))>;
+def CSR_GPR_FPR64v3_ISR: CalleeSavedRegs<(add CSR_GPR_FPR64_ISR,
+ (sequence "F%u_64", 16, 31))>;
+
+// Needed for implementation of CSKYRegisterInfo::getNoPreservedMask()
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+def CC_CSKY_ABIV2_SOFT : CallingConv<[
+ // DSP types
+ CCIfType<[v2i16, v4i8], CCAssignToReg<[R0, R1, R2, R3]>>,
+ CCIfType<[v2i16, v4i8], CCAssignToStack<4, 4>>,
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+ CCIfType<[f32], CCAssignToReg<[R0, R1, R2, R3]>>,
+ CCIfType<[f32], CCAssignToStack<4, 4>>,
+ CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+ CCIfType<[i32], CCAssignToStack<4, 4>>,
+ CCIfType<[f64], CCCustom<"CC_CSKY_ABIV2_SOFT_64">>,
+ CCIfType<[f64], CCAssignToStack<8, 4>>
+]>;
+
+def RetCC_CSKY_ABIV2_SOFT : CallingConv<[
+ // DSP types
+ CCIfType<[v2i16, v4i8], CCAssignToReg<[R0, R1]>>,
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+ CCIfType<[f32], CCBitConvertToType<i32>>,
+ CCIfType<[i32], CCAssignToReg<[R0, R1]>>,
+ CCIfType<[f64], CCCustom<"Ret_CSKY_ABIV2_SOFT_64">>
+]>;
+
+def CC_CSKY_ABIV2_FP : CallingConv<[
+ // DSP types
+ CCIfType<[v2i16, v4i8], CCAssignToReg<[R0, R1, R2, R3]>>,
+ CCIfType<[v2i16, v4i8], CCAssignToStack<4, 4>>,
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+ CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+ CCIfType<[i32], CCAssignToStack<4, 4>>,
+ CCIfType<[f32], CCAssignToReg<[F0_32, F1_32, F2_32, F3_32]>>,
+ CCIfType<[f32], CCAssignToStack<4, 4>>,
+ CCIfType<[f64], CCAssignToReg<[F0_64, F1_64, F2_64, F3_64]>>,
+ CCIfType<[f64], CCAssignToStack<8, 4>>
+]>;
+
+def RetCC_CSKY_ABIV2_FP : CallingConv<[
+ // DSP types
+ CCIfType<[v2i16, v4i8], CCAssignToReg<[R0, R1]>>,
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+ CCIfType<[i32], CCAssignToReg<[R0, R1]>>,
+ CCIfType<[f32], CCAssignToReg<[F0_32]>>,
+ CCIfType<[f64], CCAssignToReg<[F0_64]>>
+]>;
\ No newline at end of file
diff --git a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp
new file mode 100644
index 000000000000..9b22c95cfe21
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp
@@ -0,0 +1,57 @@
+//===-- CSKYFrameLowering.cpp - CSKY Frame Information ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the CSKY implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSKYFrameLowering.h"
+#include "CSKYSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/MC/MCDwarf.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "csky-frame-lowering"
+
+// Returns the register used to hold the frame pointer.
+static Register getFPReg(const CSKYSubtarget &STI) { return CSKY::R8; }
+
+// To avoid the BP value being clobbered by a function call, we need to choose
+// a callee-saved register to hold it.
+static Register getBPReg(const CSKYSubtarget &STI) { return CSKY::R7; }
+
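+// A frame pointer is needed when frame-pointer elimination is disabled, the
+// stack needs realignment, or the frame has variable-sized or address-taken
+// objects.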
+bool CSKYFrameLowering::hasFP(const MachineFunction &MF) const {
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+ RegInfo->hasStackRealignment(MF) || MFI.hasVarSizedObjects() ||
+ MFI.isFrameAddressTaken();
+}
+
+bool CSKYFrameLowering::hasBP(const MachineFunction &MF) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ return MFI.hasVarSizedObjects();
+}
+
+void CSKYFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ // FIXME: Implement this when we have function calls
+}
+
+void CSKYFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ // FIXME: Implement this when we have function calls
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/CSKY/CSKYFrameLowering.h b/llvm/lib/Target/CSKY/CSKYFrameLowering.h
new file mode 100644
index 000000000000..49921a1866bc
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYFrameLowering.h
@@ -0,0 +1,38 @@
+//===-- CSKYFrameLowering.h - Define frame lowering for CSKY -*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements CSKY-specific bits of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_CSKYFRAMELOWERING_H
+#define LLVM_LIB_TARGET_CSKY_CSKYFRAMELOWERING_H
+
+#include "llvm/CodeGen/TargetFrameLowering.h"
+
+namespace llvm {
+class CSKYSubtarget;
+
+class CSKYFrameLowering : public TargetFrameLowering {
+ const CSKYSubtarget &STI;
+
+public:
+ explicit CSKYFrameLowering(const CSKYSubtarget &STI)
+ : TargetFrameLowering(StackGrowsDown,
+ /*StackAlignment=*/Align(4),
+ /*LocalAreaOffset=*/0),
+ STI(STI) {}
+
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ bool hasFP(const MachineFunction &MF) const override;
+ bool hasBP(const MachineFunction &MF) const;
+};
+} // namespace llvm
+#endif
diff --git a/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp b/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp
new file mode 100644
index 000000000000..fc9ef8bfd9d9
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp
@@ -0,0 +1,75 @@
+//===-- CSKYISelDAGToDAG.cpp - A dag to dag inst selector for CSKY---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the CSKY target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSKY.h"
+#include "CSKYSubtarget.h"
+#include "CSKYTargetMachine.h"
+#include "MCTargetDesc/CSKYMCTargetDesc.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "csky-isel"
+
+namespace {
+class CSKYDAGToDAGISel : public SelectionDAGISel {
+ const CSKYSubtarget *Subtarget;
+
+public:
+ explicit CSKYDAGToDAGISel(CSKYTargetMachine &TM) : SelectionDAGISel(TM) {}
+
+ StringRef getPassName() const override {
+ return "CSKY DAG->DAG Pattern Instruction Selection";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ // Reset the subtarget each time through.
+ Subtarget = &MF.getSubtarget<CSKYSubtarget>();
+ SelectionDAGISel::runOnMachineFunction(MF);
+ return true;
+ }
+
+ void Select(SDNode *N) override;
+
+#include "CSKYGenDAGISel.inc"
+};
+} // namespace
+
+void CSKYDAGToDAGISel::Select(SDNode *N) {
+  // If the node already has a machine opcode, it has already been selected;
+  // nothing more to do.
+ if (N->isMachineOpcode()) {
+ LLVM_DEBUG(dbgs() << "== "; N->dump(CurDAG); dbgs() << "\n");
+ N->setNodeId(-1);
+ return;
+ }
+
+ SDLoc Dl(N);
+ unsigned Opcode = N->getOpcode();
+ bool IsSelected = false;
+
+ switch (Opcode) {
+ default:
+ break;
+ // FIXME: Add selection nodes needed later.
+ }
+
+ if (IsSelected)
+ return;
+
+ // Select the default instruction.
+ SelectCode(N);
+}
+
+FunctionPass *llvm::createCSKYISelDag(CSKYTargetMachine &TM) {
+ return new CSKYDAGToDAGISel(TM);
+}
diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
new file mode 100644
index 000000000000..ac6d069e592c
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
@@ -0,0 +1,346 @@
+//===-- CSKYISelLowering.cpp - CSKY DAG Lowering Implementation ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that CSKY uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSKYISelLowering.h"
+#include "CSKYCallingConv.h"
+#include "CSKYMachineFunctionInfo.h"
+#include "CSKYRegisterInfo.h"
+#include "CSKYSubtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "csky-isel-lowering"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
+#include "CSKYGenCallingConv.inc"
+
+static const MCPhysReg GPRArgRegs[] = {CSKY::R0, CSKY::R1, CSKY::R2, CSKY::R3};
+
+CSKYTargetLowering::CSKYTargetLowering(const TargetMachine &TM,
+ const CSKYSubtarget &STI)
+ : TargetLowering(TM), Subtarget(STI) {
+ // Register Class
+ addRegisterClass(MVT::i32, &CSKY::GPRRegClass);
+
+ // Compute derived properties from the register classes.
+ computeRegisterProperties(STI.getRegisterInfo());
+
+ setBooleanContents(UndefinedBooleanContent);
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+ // TODO: Add atomic support fully.
+ setMaxAtomicSizeInBitsSupported(0);
+
+ setStackPointerRegisterToSaveRestore(CSKY::R14);
+ const Align FunctionAlignment(2);
+ setMinFunctionAlignment(FunctionAlignment);
+ setSchedulingPreference(Sched::Source);
+}
+
+EVT CSKYTargetLowering::getSetCCResultType(const DataLayout &DL,
+ LLVMContext &Context, EVT VT) const {
+ if (!VT.isVector())
+ return MVT::i32;
+
+ return VT.changeVectorElementTypeToInteger();
+}
+
+static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val,
+ const CCValAssign &VA, const SDLoc &DL) {
+ EVT LocVT = VA.getLocVT();
+
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unexpected CCValAssign::LocInfo");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, DL, LocVT, Val);
+ break;
+ }
+ return Val;
+}
+
+static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val,
+ const CCValAssign &VA, const SDLoc &DL) {
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unexpected CCValAssign::LocInfo");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
+ break;
+ }
+ return Val;
+}
+
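+// Copy a register-assigned argument out of its physical register: create a
+// virtual register of the matching class, mark the physreg as live-in and emit
+// a CopyFromReg.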
+static SDValue unpackFromRegLoc(const CSKYSubtarget &Subtarget,
+ SelectionDAG &DAG, SDValue Chain,
+ const CCValAssign &VA, const SDLoc &DL) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ EVT LocVT = VA.getLocVT();
+ SDValue Val;
+ const TargetRegisterClass *RC;
+
+ switch (LocVT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Unexpected register type");
+ case MVT::i32:
+ RC = &CSKY::GPRRegClass;
+ break;
+ }
+
+ Register VReg = RegInfo.createVirtualRegister(RC);
+ RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ Val = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
+
+ return convertLocVTToValVT(DAG, Val, VA, DL);
+}
+
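+// Load a stack-assigned argument: create a fixed frame object at the
+// caller-provided offset and load the value from it.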
+static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain,
+ const CCValAssign &VA, const SDLoc &DL) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ EVT LocVT = VA.getLocVT();
+ EVT ValVT = VA.getValVT();
+ EVT PtrVT = MVT::getIntegerVT(DAG.getDataLayout().getPointerSizeInBits(0));
+ int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
+ VA.getLocMemOffset(), /*Immutable=*/true);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ SDValue Val;
+
+ ISD::LoadExtType ExtType;
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unexpected CCValAssign::LocInfo");
+ case CCValAssign::Full:
+ case CCValAssign::BCvt:
+ ExtType = ISD::NON_EXTLOAD;
+ break;
+ }
+ Val = DAG.getExtLoad(
+ ExtType, DL, LocVT, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), ValVT);
+ return Val;
+}
+
+// Transform physical registers into virtual registers.
+SDValue CSKYTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+
+ switch (CallConv) {
+ default:
+ report_fatal_error("Unsupported calling convention");
+ case CallingConv::C:
+ case CallingConv::Fast:
+ break;
+ }
+
+ MachineFunction &MF = DAG.getMachineFunction();
+
+  // Used with varargs to accumulate store chains.
+ std::vector<SDValue> OutChains;
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+
+ CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, IsVarArg));
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue ArgValue;
+
+ if (VA.isRegLoc())
+ ArgValue = unpackFromRegLoc(Subtarget, DAG, Chain, VA, DL);
+ else
+ ArgValue = unpackFromMemLoc(DAG, Chain, VA, DL);
+
+ InVals.push_back(ArgValue);
+ }
+
+ if (IsVarArg) {
+ const unsigned XLenInBytes = 4;
+ const MVT XLenVT = MVT::i32;
+
+ ArrayRef<MCPhysReg> ArgRegs = makeArrayRef(GPRArgRegs);
+ unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs);
+ const TargetRegisterClass *RC = &CSKY::GPRRegClass;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ CSKYMachineFunctionInfo *CSKYFI = MF.getInfo<CSKYMachineFunctionInfo>();
+
+ // Offset of the first variable argument from stack pointer, and size of
+ // the vararg save area. For now, the varargs save area is either zero or
+    // large enough to hold r0-r3.
+ int VaArgOffset, VarArgsSaveSize;
+
+ // If all registers are allocated, then all varargs must be passed on the
+ // stack and we don't need to save any argregs.
+ if (ArgRegs.size() == Idx) {
+ VaArgOffset = CCInfo.getNextStackOffset();
+ VarArgsSaveSize = 0;
+ } else {
+ VarArgsSaveSize = XLenInBytes * (ArgRegs.size() - Idx);
+ VaArgOffset = -VarArgsSaveSize;
+ }
+
+    // Record the frame index of the first variable argument, which is
+    // needed by VASTART.
+ int FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
+ CSKYFI->setVarArgsFrameIndex(FI);
+
+ // Copy the integer registers that may have been used for passing varargs
+ // to the vararg save area.
+ for (unsigned I = Idx; I < ArgRegs.size();
+ ++I, VaArgOffset += XLenInBytes) {
+ const Register Reg = RegInfo.createVirtualRegister(RC);
+ RegInfo.addLiveIn(ArgRegs[I], Reg);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, XLenVT);
+ FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
+ SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+ SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff,
+ MachinePointerInfo::getFixedStack(MF, FI));
+ cast<StoreSDNode>(Store.getNode())
+ ->getMemOperand()
+ ->setValue((Value *)nullptr);
+ OutChains.push_back(Store);
+ }
+ CSKYFI->setVarArgsSaveSize(VarArgsSaveSize);
+ }
+
+ // All stores are grouped in one node to allow the matching between
+ // the size of Ins and InVals. This only happens for vararg functions.
+ if (!OutChains.empty()) {
+ OutChains.push_back(Chain);
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
+ }
+
+ return Chain;
+}
+
+bool CSKYTargetLowering::CanLowerReturn(
+ CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+ SmallVector<CCValAssign, 16> CSKYLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, CSKYLocs, Context);
+ return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
+}
+
+SDValue
+CSKYTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const {
+ // Stores the assignment of the return value to a location.
+ SmallVector<CCValAssign, 16> CSKYLocs;
+
+ // Info about the registers and stack slot.
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), CSKYLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
+
+ SDValue Glue;
+ SmallVector<SDValue, 4> RetOps(1, Chain);
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0, e = CSKYLocs.size(); i < e; ++i) {
+ SDValue Val = OutVals[i];
+ CCValAssign &VA = CSKYLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+ bool IsF64OnCSKY = VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64;
+
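+    // An f64 assigned to i32 locations by the custom handler is split into its
+    // lo/hi halves and returned in a consecutive GPR pair.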
+ if (IsF64OnCSKY) {
+
+ assert(VA.isRegLoc() && "Expected return via registers");
+ SDValue Split64 = DAG.getNode(CSKYISD::BITCAST_TO_LOHI, DL,
+ DAG.getVTList(MVT::i32, MVT::i32), Val);
+ SDValue Lo = Split64.getValue(0);
+ SDValue Hi = Split64.getValue(1);
+
+ Register RegLo = VA.getLocReg();
+ assert(RegLo < CSKY::R31 && "Invalid register pair");
+ Register RegHi = RegLo + 1;
+
+ Chain = DAG.getCopyToReg(Chain, DL, RegLo, Lo, Glue);
+ Glue = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(RegLo, MVT::i32));
+ Chain = DAG.getCopyToReg(Chain, DL, RegHi, Hi, Glue);
+ Glue = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(RegHi, MVT::i32));
+ } else {
+ // Handle a 'normal' return.
+ Val = convertValVTToLocVT(DAG, Val, VA, DL);
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Glue);
+
+ // Guarantee that all emitted copies are stuck together.
+ Glue = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+ }
+
+ RetOps[0] = Chain; // Update chain.
+
+ // Add the glue node if we have it.
+ if (Glue.getNode()) {
+ RetOps.push_back(Glue);
+ }
+
+ // Interrupt service routines use different return instructions.
+ if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt"))
+ return DAG.getNode(CSKYISD::NIR, DL, MVT::Other, RetOps);
+
+ return DAG.getNode(CSKYISD::RET, DL, MVT::Other, RetOps);
+}
+
+CCAssignFn *CSKYTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
+ bool IsVarArg) const {
+ if (IsVarArg || !Subtarget.useHardFloatABI())
+ return RetCC_CSKY_ABIV2_SOFT;
+ else
+ return RetCC_CSKY_ABIV2_FP;
+}
+
+CCAssignFn *CSKYTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
+ bool IsVarArg) const {
+ if (IsVarArg || !Subtarget.useHardFloatABI())
+ return CC_CSKY_ABIV2_SOFT;
+ else
+ return CC_CSKY_ABIV2_FP;
+}
+
+const char *CSKYTargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default:
+ llvm_unreachable("unknown CSKYISD node");
+ case CSKYISD::NIE:
+ return "CSKYISD::NIE";
+ case CSKYISD::NIR:
+ return "CSKYISD::NIR";
+ case CSKYISD::RET:
+ return "CSKYISD::RET";
+ case CSKYISD::BITCAST_TO_LOHI:
+ return "CSKYISD::BITCAST_TO_LOHI";
+ }
+}
diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.h b/llvm/lib/Target/CSKY/CSKYISelLowering.h
new file mode 100644
index 000000000000..7557c11f50a8
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYISelLowering.h
@@ -0,0 +1,69 @@
+//===-- CSKYISelLowering.h - CSKY DAG Lowering Interface --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that CSKY uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_CSKYISELLOWERING_H
+#define LLVM_LIB_TARGET_CSKY_CSKYISELLOWERING_H
+
+#include "MCTargetDesc/CSKYBaseInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/TargetLowering.h"
+
+namespace llvm {
+class CSKYSubtarget;
+
+namespace CSKYISD {
+enum NodeType : unsigned {
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+ NIE,
+ NIR,
+ RET,
+ BITCAST_TO_LOHI
+};
+}
+
+class CSKYTargetLowering : public TargetLowering {
+ const CSKYSubtarget &Subtarget;
+
+public:
+ explicit CSKYTargetLowering(const TargetMachine &TM,
+ const CSKYSubtarget &STI);
+
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+
+private:
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+ SelectionDAG &DAG) const override;
+
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
+ CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg) const;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_CSKY_CSKYISELLOWERING_H
diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormats.td b/llvm/lib/Target/CSKY/CSKYInstrFormats.td
index dd71b693bbbb..9b6ef9ca23db 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrFormats.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrFormats.td
@@ -24,7 +24,7 @@ class CSKYInst<AddrMode am, int sz, dag outs, dag ins, string asmstr,
let Namespace = "CSKY";
int Size = sz;
AddrMode AM = am;
-
+ field bits<32> SoftFail = 0;
let OutOperandList = outs;
let InOperandList = ins;
let AsmString = asmstr;
@@ -46,6 +46,11 @@ class CSKY32Inst<AddrMode am, bits<6> opcode, dag outs, dag ins, string asmstr,
let Inst{31 - 26} = opcode;
}
+class CSKY16Inst<AddrMode am, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : CSKYInst<am, 2, outs, ins, asmstr, pattern> {
+ field bits<16> Inst;
+}
+
// CSKY 32-bit instruction
// Format< OP[6] | Offset[26] >
// Instruction(1): bsr32
@@ -157,19 +162,7 @@ class I_16_RET<bits<5> sop, bits<5> pcode, string op, list<dag> pattern>
let isTerminator = 1;
let isReturn = 1;
let isBarrier = 1;
-}
-
-// Instructions(1): rte32
-class I_16_RET_I<bits<5> sop, bits<5> pcode, string op, list<dag> pattern>
- : CSKY32Inst<AddrModeNone, 0x30, (outs), (ins), op, pattern> {
- let Inst{25 - 21} = sop;
- let Inst{20 - 16} = pcode;
- let Inst{15 - 10} = 0x10;
- let Inst{9 - 5} = 1;
- let Inst{4 - 0} = 0;
- let isTerminator = 1;
- let isReturn = 1;
- let isBarrier = 1;
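+  // Return instructions implicitly read the link register R15.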
+ let Uses = [ R15 ];
}
// Format< OP[6] | SOP[5] | RX[5] | IMM16[16] >
@@ -227,14 +220,27 @@ class I_LDST<AddrMode am, bits<6> opcode, bits<4> sop, dag outs, dag ins,
let Inst{11 - 0} = imm12;
}
+class I_PLDR<AddrMode am, bits<6> opcode, bits<4> sop, dag outs, dag ins,
+ string op, list<dag> pattern>
+ : CSKY32Inst<am, opcode, outs, ins, !strconcat(op, "\t($rx, ${imm12})"),
+ pattern> {
+ bits<5> rx;
+ bits<12> imm12;
+ let Inst{25 - 21} = 0;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 12} = sop;
+ let Inst{11 - 0} = imm12;
+}
+
+
// Format< OP[6] | RZ[5] | RX[5] | SOP[4] | OFFSET[12] >
-// Instructions(6): ld32.b, ld32.bs, ld32.h, ld32.hs, ld32.w, ld32.d
+// Instructions(6): ld32.b, ld32.bs, ld32.h, ld32.hs, ld32.w
class I_LD<AddrMode am, bits<4> sop, string op, Operand operand>
: I_LDST<am, 0x36, sop,
(outs GPR:$rz), (ins GPR:$rx, operand:$imm12), op, []>;
// Format< OP[6] | RZ[5] | RX[5] | SOP[4] | OFFSET[12] >
-// Instructions(4): st32.b, st32.h, st32.w, st32.d
+// Instructions(4): st32.b, st32.h, st32.w
class I_ST<AddrMode am, bits<4> sop, string op, Operand operand>
: I_LDST<am, 0x37, sop, (outs),
(ins GPR:$rz, GPR:$rx, operand:$imm12), op, []>;
@@ -249,6 +255,8 @@ class I_12_PP<bits<5> sop, bits<5> pcode, dag outs, dag ins, string op>
let Inst{20 - 16} = pcode;
let Inst{15 - 12} = 0;
let Inst{11 - 0} = regs;
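+  // Push/pop implicitly read and update the stack pointer (R14).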
+ let Uses = [R14];
+ let Defs = [R14];
}
// Format< OP[6] | RZ[5] | RX[5] | SOP[6] | PCODE[5] | IMM[5]>
@@ -256,7 +264,7 @@ class I_12_PP<bits<5> sop, bits<5> pcode, dag outs, dag ins, string op>
class I_5_ZX<bits<6> sop, bits<5> pcode, string op, ImmLeaf ImmType,
list<dag> pattern>
: CSKY32Inst<AddrModeNone, 0x31, (outs GPR:$rz),
- (ins GPR:$false, GPR:$rx, ImmType:$imm5),
+ (ins CARRY:$cond, GPR:$false, GPR:$rx, ImmType:$imm5),
!strconcat(op, "\t$rz, $rx, $imm5"), pattern> {
bits<5> rz;
bits<5> rx;
@@ -272,9 +280,9 @@ class I_5_ZX<bits<6> sop, bits<5> pcode, string op, ImmLeaf ImmType,
// Format< OP[6] | IMM[5] | RX[5] | SOP[6] | PCODE[5] | RZ[5]>
// Instructions(13): decgt32, declt32, decne32, lsli32, lslc32, lsri32
// lsrc32, asri32, asrc32, rotli32, xsr32, bclri32, bseti32
-class I_5_XZ<bits<6> sop, bits<5> pcode, string op, dag ins, dag outs,
+class I_5_XZ<bits<6> sop, bits<5> pcode, string op, dag outs, dag ins,
list<dag> pattern>
- : CSKY32Inst<AddrModeNone, 0x31, ins, outs,
+ : CSKY32Inst<AddrModeNone, 0x31, outs, ins,
!strconcat(op, "\t$rz, $rx, $imm5"), pattern> {
bits<5> imm5;
bits<5> rx;
@@ -286,19 +294,107 @@ class I_5_XZ<bits<6> sop, bits<5> pcode, string op, dag ins, dag outs,
let Inst{4 - 0} = rz;
}
+// mtcr32, mfcr32
+class I_5_XZ_CR<bits<6> sop, bits<5> pcode, string opStr, dag outs, dag ins,
+ list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x30, outs, ins, opStr, pattern> {
+ bits<5> sel;
+ bits<5> rx;
+ bits<5> cr;
+ let Inst{25 - 21} = sel;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = pcode;
+ let Inst{4 - 0} = cr;
+}
+
+// sync
+class I_5_XZ_SYNC<bits<6> sop, bits<5> pcode, string opStr, bits<1> S, bits<1> I>
+ : CSKY32Inst<AddrModeNone, 0x30, (outs), (ins), opStr, []> {
+ let Inst{25 - 21} = 0;
+ let Inst{20 - 16} = 0;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = pcode;
+ let Inst{4 - 0} = 0;
+ let Inst{25} = S;
+ let Inst{21} = I;
+
+}
+
+// Privileged Instructions
+class I_5_XZ_PRIVI<bits<6> sop, bits<5> pcode, string opStr>
+ : CSKY32Inst<AddrModeNone, 0x30, (outs), (ins), opStr, []> {
+ let Inst{25 - 21} = 0;
+ let Inst{20 - 16} = 0;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = pcode;
+ let Inst{4 - 0} = 0;
+}
+
+class I_CP<bits<4> sop, dag outs, dag ins, string opStr>
+ : CSKY32Inst<AddrModeNone, 0x3f, outs, ins, opStr, []> {
+ bits<5> cpid;
+ bits<12> usdef;
+ let Inst{25 - 21} = cpid;
+ let Inst{20 - 16} = 0;
+ let Inst{15 - 12} = sop;
+ let Inst{11 - 0} = usdef;
+}
+
+class I_CPOP<dag outs, dag ins, string opStr>
+ : CSKY32Inst<AddrModeNone, 0x3f, outs, ins, opStr, []> {
+ bits<5> cpid;
+ bits<20> usdef;
+ let Inst{25 - 21} = cpid;
+ let Inst{20 - 16} = usdef{19-15};
+ let Inst{15} = 1;
+ let Inst{14 - 0} = usdef{14-0};
+}
+
+class I_CP_Z<bits<4> sop, dag outs, dag ins, string opStr>
+ : CSKY32Inst<AddrModeNone, 0x3f, outs, ins, opStr, []> {
+ bits<5> cpid;
+ bits<12> usdef;
+ bits<5> rz;
+
+ let Inst{25 - 21} = cpid;
+ let Inst{20 - 16} = rz;
+ let Inst{15 - 12} = sop;
+ let Inst{11 - 0} = usdef;
+}
+
+class I_5_CACHE<bits<6> sop, bits<5> pcode, string opStr>
+ : CSKY32Inst<AddrModeNone, 0x30, (outs), (ins), opStr, []> {
+ let Inst{25 - 21} = pcode;
+ let Inst{20 - 16} = 0;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = 0b00001;
+ let Inst{4 - 0} = 0;
+}
+
+class I_5_X_CACHE<bits<6> sop, bits<5> pcode, string opStr>
+ : CSKY32Inst<AddrModeNone, 0x30, (outs), (ins GPR:$rx), opStr #"\t$rx", []> {
+ bits<5> rx;
+
+ let Inst{25 - 21} = pcode;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = 0b00001;
+ let Inst{4 - 0} = 0;
+}
+
// Format< OP[6] | RY[5] | RX[5] | SOP[6] | PCODE[5] | IMM[5]>
// Instructions(2): ldm32, (ldq32), stm32, (stq32)
-class I_5_YX<bits<6> opcode, dag outs, dag ins, string op, list<dag> pattern,
- bits<5> imm5>
- : CSKY32Inst<AddrModeNone, opcode, outs, ins,
- op #"\t${ry}, (${rx}), " #!cast<int>(imm5), pattern> {
+class I_5_YX<bits<6> opcode, bits<6> sop, dag outs, dag ins, string opStr, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, opcode, outs, ins, opStr, pattern> {
+ bits<10> regs;
bits<5> rx;
- bits<5> ry;
- let Inst{25 - 21} = ry; // ry
+
+ let Inst{25 - 21} = regs{9 - 5}; // ry
let Inst{20 - 16} = rx;
- let Inst{15 - 10} = 0b000111;
+ let Inst{15 - 10} = sop;
let Inst{9 - 5} = 0b00001;
- let Inst{4 - 0} = imm5{4 - 0}; // imm5
+ let Inst{4 - 0} = regs{4 - 0}; // imm5
}
// Format< OP[6] | LSB[5] | RX[5] | SOP[6] | MSB[5] | RZ[5]>
@@ -317,14 +413,33 @@ class I_5_XZ_U<bits<6> sop, dag outs, dag ins, string op, list<dag> pattern>
let Inst{4 - 0} = rz;
}
-// sextb, sexth
-class I_5_XZ_US<bits<6> sop, string op, SDNode opnode,
- ValueType type> : I_5_XZ_U<sop, (outs GPR:$rz), (ins GPR:$rx, uimm5:$msb, uimm5:$lsb), op,
- [(set GPR:$rz, (opnode GPR:$rx, type))]>;
+class I_5_XZ_INS<bits<6> sop, dag outs, dag ins, string op, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x31, outs, ins, op #"\t$rz, $rx, $msb, $lsb",
+ pattern> {
+ bits<5> rx;
+ bits<5> rz;
+ bits<5> msb;
+ bits<5> lsb;
+ let Inst{25 - 21} = rz;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = msb;
+ let Inst{4 - 0} = lsb;
+}
-class I_5_XZ_UZ<bits<6> sop, string op, int v>
- : I_5_XZ_U<sop, (outs GPR:$rz), (ins GPR:$rx, uimm5:$msb, uimm5:$lsb), op,
- [(set GPR:$rz, (and GPR:$rx, (i32 v)))]>;
+// Format< OP[6] | LSB[5] | RX[5] | SOP[6] | MSB[5] | RZ[5]>
+// Instructions(6): zext32, zextb32, zexth32, sext32, sextb32, sexth32
+class I_5_XZ_U2<bits<6> sop, bits<5> lsb, bits<5> msb, dag outs, dag ins,
+ string op, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x31, outs, ins, !strconcat(op, "\t$rz, $rx"), pattern> {
+ bits<5> rx;
+ bits<5> rz;
+ let Inst{25 - 21} = lsb; // lsb
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = msb; // msb
+ let Inst{4 - 0} = rz;
+}
// Format< OP[6] | RZ[5] | RX[5] | SOP[6] | SIZE[5] | LSB[5]>
// Instructions(1): ins32
@@ -341,6 +456,16 @@ class I_5_ZX_U<bits<6> sop, string op, Operand operand, list<dag> pattern>
let Inst{4 - 0} = size_lsb{4 - 0}; // lsb
}
+// sextb, sexth
+class I_5_XZ_US<bits<6> sop, bits<5> lsb, bits<5> msb, string op,
+ SDNode opnode, ValueType type>
+ : I_5_XZ_U2<sop, lsb, msb, (outs GPR:$rz), (ins GPR:$rx), op,
+ [(set GPR:$rz, (opnode GPR:$rx, type))]>;
+
+class I_5_XZ_UZ<bits<6> sop, bits<5> lsb, bits<5> msb, string op, int v>
+ : I_5_XZ_U2<sop, lsb, msb, (outs GPR:$rz), (ins GPR:$rx), op,
+ [(set GPR:$rz, (and GPR:$rx, (i32 v)))]>;
+
// Format< OP[6] | IMM[5] | RX[5] | SOP[6] | PCODE[5] | 00000 >
// Instructions(1): btsti32
class I_5_X<bits<6> sop, bits<5> pcode, string op, ImmLeaf ImmType,
@@ -373,6 +498,18 @@ class I_5_Z<bits<6> sop, bits<5> pcode, string op, ImmLeaf ImmType,
let Inst{4 - 0} = rz;
}
+class I_5_IMM5<bits<6> opcode, bits<6> sop, bits<5> pcode, string op, ImmLeaf ImmType,
+ list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, opcode, (outs), (ins ImmType:$imm5),
+ !strconcat(op, "\t$imm5"), pattern> {
+ bits<5> imm5;
+ let Inst{25 - 21} = imm5;
+ let Inst{20 - 16} = 0;
+ let Inst{15 - 10} = sop;
+ let Inst{9 - 5} = pcode;
+ let Inst{4 - 0} = 0;
+}
+
// Format< OP[6] | RY[5] | RX[5] | SOP[6] | PCODE[5] | RZ[5] >
// Instructions(24): addu32, addc32, subu32, subc32, (rsub32), ixh32, ixw32,
// ixd32, and32, andn32, or32, xor32, nor32, lsl32, lsr32, asr32, rotl32
@@ -493,9 +630,8 @@ class R_ZX<bits<6> sop, bits<5> pcode, string op, list<dag> pattern>
// Format< OP[6] | 00000[5] | RX[5] | SOP[6] | PCODE[5] | 00000[5] >
// Instructions:(1) tstnbz32
-class R_X<bits<6> sop, bits<5> pcode, string op, list<dag> pattern>
- : CSKY32Inst<AddrModeNone, 0x31, (outs CARRY:$ca),(ins GPR:$rx),
- !strconcat(op, "\t$rx"), pattern> {
+class R_X<bits<6> sop, bits<5> pcode, dag outs, dag ins, string op, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x31, outs, ins, !strconcat(op, "\t$rx"), pattern> {
bits<5> rx;
let Inst{25 - 21} = 0;
let Inst{20 - 16} = rx;
@@ -530,3 +666,14 @@ class R_Z_2<bits<6> sop, bits<5> pcode, string op, list<dag> pattern>
let Inst{4 - 0} = 0;
let Constraints = "$rz = $false";
}
+
+class BAR<bits<5> sop, string op, bits<1> signed>
+ : CSKY32Inst<AddrModeNone, 0x30, (outs), (ins), op, []> {
+ let Inst{25} = signed;
+ let Inst{24 - 16} = 0;
+ let Inst{15 - 5} = 0x421;
+ let Inst{4 - 0} = sop;
+ let hasSideEffects = 1;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td b/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td
new file mode 100644
index 000000000000..6d42bddcdd78
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td
@@ -0,0 +1,219 @@
+//===- CSKYInstrFormats16Instr.td - 16-bit Instr. Formats -*- tablegen --*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+class J16<bits<5> sop, string opstr, dag ins>
+ : CSKY16Inst<AddrModeNone, (outs), ins,
+ !strconcat(opstr, "\t$offset"), []> {
+ bits<10> offset;
+ let Inst{15} = 0;
+ let Inst{14 - 10} = sop;
+ let Inst{9 - 0} = offset;
+}
+
+class J16_B<bits<5> sop, string opstr>
+ : CSKY16Inst<AddrModeNone, (outs), (ins CARRY:$ca, br_symbol_16bit:$offset),
+ !strconcat(opstr, "\t$offset"), []> {
+ bits<10> offset;
+ let Inst{15} = 0;
+ let Inst{14 - 10} = sop;
+ let Inst{9 - 0} = offset;
+}
+
+class R16_XYZ<bits<2> sop, string opstr, SDNode opnode> : CSKY16Inst<AddrModeNone,
+ (outs mGPR:$rz), (ins mGPR:$rx, mGPR:$ry), !strconcat(opstr, "\t$rz, $rx, $ry"),
+ [(set mGPR:$rz, (opnode mGPR:$rx, mGPR:$ry)) ]> {
+ bits<3> rz;
+ bits<3> rx;
+ bits<3> ry;
+ let Inst{15 - 11} = 0b01011;
+ let Inst{10 - 8} = rx;
+ let Inst{7 - 5} = rz;
+ let Inst{4 - 2} = ry;
+ let Inst{1, 0} = sop;
+}
+
+class R16_XZ_BINOP<bits<4> op, bits<2> sop, string opstr, PatFrag opnode> : CSKY16Inst<
+ AddrModeNone, (outs sGPR:$rz), (ins sGPR:$rZ, sGPR:$rx), !strconcat(opstr, "\t$rz, $rx"),
+ [(set sGPR:$rz, (opnode sGPR:$rZ, sGPR:$rx))]> {
+ bits<4> rz;
+ bits<4> rx;
+ let Inst{15, 14} = 0b01;
+ let Inst{13 - 10} = op;
+ let Inst{9 - 6} = rz;
+ let Inst{5 - 2} = rx;
+ let Inst{1, 0} = sop;
+ let Constraints = "$rz = $rZ";
+}
+
+class R16_XZ_BINOP_NOPat<bits<4> op, bits<2> sop, string opstr> : CSKY16Inst<
+ AddrModeNone, (outs sGPR:$rz), (ins sGPR:$rZ, sGPR:$rx), !strconcat(opstr, "\t$rz, $rx"),
+ []> {
+ bits<4> rz;
+ bits<4> rx;
+ let Inst{15, 14} = 0b01;
+ let Inst{13 - 10} = op;
+ let Inst{9 - 6} = rz;
+ let Inst{5 - 2} = rx;
+ let Inst{1, 0} = sop;
+ let Constraints = "$rz = $rZ";
+}
+
+class R16_XZ_BINOP_C<bits<4> op, bits<2> sop, string opstr> : CSKY16Inst<
+ AddrModeNone, (outs sGPR:$rz, CARRY:$cout),
+ (ins sGPR:$rZ, sGPR:$rx, CARRY:$cin), !strconcat(opstr, "\t$rz, $rx"), []> {
+ bits<4> rz;
+ bits<4> rx;
+ let Inst{15, 14} = 0b01;
+ let Inst{13 - 10} = op;
+ let Inst{9 - 6} = rz;
+ let Inst{5 - 2} = rx;
+ let Inst{1, 0} = sop;
+ let Constraints = "$rz = $rZ";
+}
+
+class R16_XZ_UNOP<bits<4> op, bits<2> sop, string opstr> : CSKY16Inst<
+ AddrModeNone, (outs sGPR:$rz), (ins sGPR:$rx), !strconcat(opstr, "\t$rz, $rx"),
+ []> {
+ bits<4> rz;
+ bits<4> rx;
+ let Inst{15, 14} = 0b01;
+ let Inst{13 - 10} = op;
+ let Inst{9 - 6} = rz;
+ let Inst{5 - 2} = rx;
+ let Inst{1, 0} = sop;
+}
+
+class R16_XY_CMP<bits<2> sop, string opstr> : CSKY16Inst<
+ AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx, sGPR:$ry), !strconcat(opstr, "\t$rx, $ry"),
+ []> {
+ bits<4> ry;
+ bits<4> rx;
+ let Inst{15, 14} = 0b01;
+ let Inst{13 - 10} = 0b1001;
+ let Inst{9 - 6} = ry;
+ let Inst{5 - 2} = rx;
+ let Inst{1, 0} = sop;
+ let isCompare = 1;
+}
+
+class R16_X_J<bits<8> op_rz, bits<2> sop, string opstr> : CSKY16Inst<
+ AddrModeNone, (outs), (ins sGPR:$rx), !strconcat(opstr, "\t$rx"), []> {
+ bits<4> rx;
+ let Inst{15, 14} = 0b01;
+ let Inst{13 - 6} = op_rz;
+ let Inst{5 - 2} = rx;
+ let Inst{1, 0} = sop;
+}
+
+class I16_Z_8<bits<3> op, dag ins, string asmstr>
+ : CSKY16Inst<AddrModeNone, (outs mGPR:$rz), ins, asmstr, []> {
+ bits<3> rz;
+ bits<8> imm8;
+ let Inst{15, 14} = 0b00;
+ let Inst{13 - 11} = op;
+ let Inst{10 - 8} = rz;
+ let Inst{7 - 0} = imm8;
+}
+
+class I16_Z_5<bits<3> sop, dag outs, dag ins,string opstr>
+ : CSKY16Inst<AddrModeNone, outs, ins,
+ !strconcat(opstr, "\t$rz, $imm5"), []> {
+ bits<3> rz;
+ bits<5> imm5;
+ let Inst{15, 14} = 0b00;
+ let Inst{13 - 11} = 0b111;
+ let Inst{10 - 8} = rz;
+ let Inst{7 - 5} = sop;
+ let Inst{4 - 0} = imm5;
+}
+
+class I16_X_CMP<bits<3> sop, string opstr, Operand Immoperand> : CSKY16Inst<
+ AddrModeNone, (outs CARRY:$ca), (ins mGPR:$rx, Immoperand:$imm5),
+ !strconcat(opstr, "\t$rx, $imm5"), []> {
+ bits<3> rx;
+ bits<5> imm5;
+ let Inst{15, 14} = 0b00;
+ let Inst{13 - 11} = 0b111;
+ let Inst{10 - 8} = rx;
+ let Inst{7 - 5} = sop;
+ let Inst{4 - 0} = imm5;
+ let isCompare = 1;
+}
+
+class I16_SP_IMM7<bits<3> sop, string opstr> : CSKY16Inst<
+ AddrModeNone, (outs SPOp:$sp2), (ins SPOp:$sp1, uimm7_2:$imm7),
+ !strconcat(opstr, "\t$sp2, $sp1, $imm7"), []> {
+ bits<7> imm7;
+ let Inst{15, 14} = 0b00;
+ let Inst{13 - 10} = 0b0101;
+ let Inst{9, 8} = imm7{6,5};
+ let Inst{7 - 5} = sop;
+ let Inst{4 - 0} = imm7{4 - 0};
+}
+
+class I16_XZ_IMM5<bits<3> sop, string opstr, SDNode opnode> : CSKY16Inst<
+ AddrModeNone, (outs mGPR:$rz), (ins mGPR:$rx, uimm5:$imm5),
+ !strconcat(opstr, "\t$rz, $rx, $imm5"), [(set mGPR:$rz, (opnode mGPR:$rx, uimm5:$imm5))]> {
+ bits<3> rx;
+ bits<3> rz;
+ bits<5> imm5;
+ let Inst{15, 14} = 0b01;
+ let Inst{13 - 11} = sop;
+ let Inst{10 - 8} = rx;
+ let Inst{7 - 5} = rz;
+ let Inst{4 - 0} = imm5;
+}
+
+class I16_XZ_LDST<AddrMode am, bits<3> sop, string opstr, dag outs, dag ins>
+ : CSKY16Inst<am, outs, ins, !strconcat(opstr, "\t$rz, ($rx, ${imm})"),
+ []> {
+ bits<3> rx;
+ bits<3> rz;
+ bits<5> imm;
+ let Inst{15, 14} = 0b10;
+ let Inst{13 - 11} = sop;
+ let Inst{10 - 8} = rx;
+ let Inst{7 - 5} = rz;
+ let Inst{4 - 0} = imm;
+}
+
+class I16_ZSP_LDST<AddrMode am, bits<3> sop, string opstr, dag outs, dag ins> : CSKY16Inst<
+ am, outs, ins, !strconcat(opstr, "\t$rz, ($sp, ${addr})"),
+ []> {
+ bits<3> rz;
+ bits<8> addr;
+ let Inst{15, 14} = 0b10;
+ let Inst{13 - 11} = sop;
+ let Inst{10 - 8} = addr{7 - 5};
+ let Inst{7 - 5} = rz;
+ let Inst{4 - 0} = addr{4 - 0};
+}
+
+class I16_XZ_IMM3<bits<2> sop, string opstr, SDNode opnode> : CSKY16Inst<
+ AddrModeNone, (outs mGPR:$rz), (ins mGPR:$rx, oimm3:$oimm3),
+ !strconcat(opstr, "\t$rz, $rx, $oimm3"), [(set mGPR:$rz, (opnode mGPR:$rx, oimm3:$oimm3))]> {
+ bits<3> rx;
+ bits<3> rz;
+ bits<3> oimm3;
+ let Inst{15, 14} = 0b01;
+ let Inst{13 - 11} = 0b011;
+ let Inst{10 - 8} = rx;
+ let Inst{7 - 5} = rz;
+ let Inst{4 - 2} = oimm3;
+ let Inst{1, 0} = sop;
+}
+
+class I16_BPushPop<bits<11> op, bits<2> uop, dag out, dag ins, string opstr> :
+ CSKY16Inst<AddrModeNone, out, ins, opstr, []>{
+ bits<3> rz;
+ let Inst{15- 5} = op;
+ let Inst{4 -2} = rz;
+ let Inst{1,0} = uop;
+ let Predicates = [HasJAVA];
+ let hasSideEffects = 1;
+}
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
new file mode 100644
index 000000000000..e12235cf9478
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
@@ -0,0 +1,25 @@
+//===-- CSKYInstrInfo.cpp - CSKY Instruction Information ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the CSKY implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSKYInstrInfo.h"
+#include "llvm/MC/MCContext.h"
+
+#define DEBUG_TYPE "csky-instr-info"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "CSKYGenInstrInfo.inc"
+
+CSKYInstrInfo::CSKYInstrInfo(CSKYSubtarget &STI)
+ : CSKYGenInstrInfo(CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), STI(STI) {
+}
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.h b/llvm/lib/Target/CSKY/CSKYInstrInfo.h
new file mode 100644
index 000000000000..04be9da27b57
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.h
@@ -0,0 +1,36 @@
+//===-- CSKYInstrInfo.h - CSKY Instruction Information --------*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the CSKY implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_CSKYINSTRINFO_H
+#define LLVM_LIB_TARGET_CSKY_CSKYINSTRINFO_H
+
+#include "MCTargetDesc/CSKYMCTargetDesc.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "CSKYGenInstrInfo.inc"
+
+namespace llvm {
+
+class CSKYSubtarget;
+
+class CSKYInstrInfo : public CSKYGenInstrInfo {
+protected:
+ const CSKYSubtarget &STI;
+
+public:
+ explicit CSKYInstrInfo(CSKYSubtarget &STI);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_CSKY_CSKYINSTRINFO_H
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.td b/llvm/lib/Target/CSKY/CSKYInstrInfo.td
index 20adda4f9ca2..9dda3159e446 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.td
@@ -15,6 +15,18 @@
// CSKY specific DAG Nodes.
//===----------------------------------------------------------------------===//
+def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+
+def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_CallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
// Target-dependent nodes.
def CSKY_RET : SDNode<"CSKYISD::RET", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
@@ -44,6 +56,7 @@ class oimm<int num> : Operand<i32>,
ImmLeaf<i32, "return isUInt<"#num#">(Imm - 1);"> {
let EncoderMethod = "getOImmOpValue";
let ParserMatchClass = OImmAsmOperand<num>;
+ let DecoderMethod = "decodeOImmOperand<"#num#">";
}
class uimm<int num, int shift = 0> : Operand<i32>,
@@ -53,12 +66,14 @@ class uimm<int num, int shift = 0> : Operand<i32>,
!if(!ne(shift, 0),
UImmAsmOperand<num, "Shift"#shift>,
UImmAsmOperand<num>);
+ let DecoderMethod = "decodeUImmOperand<"#num#", "#shift#">";
}
class simm<int num, int shift = 0> : Operand<i32>,
ImmLeaf<i32, "return isShiftedInt<"#num#", "#shift#">(Imm);"> {
let EncoderMethod = "getImmOpValue<"#shift#">";
let ParserMatchClass = SImmAsmOperand<num>;
+ let DecoderMethod = "decodeSImmOperand<"#num#", "#shift#">";
}
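The DecoderMethod strings added to these operand classes name templated helpers that the generated disassembler calls for each immediate field. Those helpers live in the CSKY disassembler sources, which are not part of this hunk; the sketch below shows the conventional shape of one of them, with the exact signature treated as an assumption.

// Conventional shape of the helper named by decodeUImmOperand<num, shift>;
// the signature and error handling are assumptions, not this patch.
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>

template <unsigned N, unsigned S = 0>
static llvm::MCDisassembler::DecodeStatus
decodeUImmOperand(llvm::MCInst &Inst, uint64_t Imm, int64_t /*Address*/,
                  const void * /*Decoder*/) {
  if (!llvm::isUInt<N>(Imm))
    return llvm::MCDisassembler::Fail;
  // The encoded field holds the operand value shifted right by S bits.
  Inst.addOperand(llvm::MCOperand::createImm(Imm << S));
  return llvm::MCDisassembler::Success;
}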
def nimm_XFORM : SDNodeXForm<imm, [{
@@ -73,14 +88,19 @@ def uimm32_hi16 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant((N->getZExtValue() >> 16) & 0xFFFF,
SDLoc(N), MVT::i32);
}]>;
+def uimm32_lo16 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & 0xFFFF, SDLoc(N), MVT::i32);
+}]>;
def uimm16_16_xform : Operand<i32>,
ImmLeaf<i32, "return isShiftedUInt<16, 16>(Imm);", uimm32_hi16> {
let ParserMatchClass = UImmAsmOperand<16>;
+ let EncoderMethod = "getImmOpValue";
}
def uimm_shift : Operand<i32>, ImmLeaf<i32, "return isUInt<2>(Imm);"> {
let EncoderMethod = "getImmShiftOpValue";
let ParserMatchClass = UImmAsmOperand<2>;
+ let DecoderMethod = "decodeImmShiftOpValue";
}
def CSKYSymbol : AsmOperandClass {
@@ -94,16 +114,22 @@ def br_symbol : Operand<iPTR> {
let EncoderMethod =
"getBranchSymbolOpValue<CSKY::fixup_csky_pcrel_imm16_scale2>";
let ParserMatchClass = CSKYSymbol;
+ let DecoderMethod = "decodeSImmOperand<16, 1>";
+ let PrintMethod = "printCSKYSymbolOperand";
+ let OperandType = "OPERAND_PCREL";
}
def call_symbol : Operand<iPTR> {
let ParserMatchClass = CSKYSymbol;
let EncoderMethod = "getCallSymbolOpValue";
+ let DecoderMethod = "decodeSImmOperand<26, 1>";
+ let PrintMethod = "printCSKYSymbolOperand";
+ let OperandType = "OPERAND_PCREL";
}
def Constpool : AsmOperandClass {
- let Name = "ConstpoolSymbol";
- let RenderMethod = "addImmOperands";
+ let Name = "Constpool";
+ let RenderMethod = "addConstpoolOperands";
let DiagnosticType = "InvalidConstpool";
let ParserMethod = "parseConstpoolSymbol";
}
@@ -112,24 +138,132 @@ def constpool_symbol : Operand<iPTR> {
let ParserMatchClass = Constpool;
let EncoderMethod =
"getConstpoolSymbolOpValue<CSKY::fixup_csky_pcrel_uimm16_scale4>";
+ let DecoderMethod = "decodeUImmOperand<16, 2>";
+ let PrintMethod = "printConstpool";
+ let OperandType = "OPERAND_PCREL";
+}
+
+def DataAsmClass : AsmOperandClass {
+ let Name = "DataSymbol";
+ let RenderMethod = "addConstpoolOperands";
+ let DiagnosticType = "InvalidConstpool";
+ let ParserMethod = "parseDataSymbol";
+}
+
+class data_symbol<string reloc, int shift> : Operand<iPTR> {
+ let ParserMatchClass = Constpool;
+ let EncoderMethod =
+ "getDataSymbolOpValue<"#reloc#">";
+ let DecoderMethod = "decodeUImmOperand<18, "#shift#">";
+ let PrintMethod = "printDataSymbol";
}
def bare_symbol : Operand<iPTR> {
let ParserMatchClass = CSKYSymbol;
let EncoderMethod = "getBareSymbolOpValue";
+ let PrintMethod = "printCSKYSymbolOperand";
+ let DecoderMethod = "decodeSImmOperand<18, 1>";
+ let OperandType = "OPERAND_PCREL";
+}
+
+def oimm3 : oimm<3>;
+def oimm4 : oimm<4>;
+def oimm5 : oimm<5>;
+def oimm6 : oimm<6>;
+
+def imm5_idly : Operand<i32>, ImmLeaf<i32,
+ "return Imm <= 32 && Imm >= 0;"> {
+ let EncoderMethod = "getImmOpValueIDLY";
+ let DecoderMethod = "decodeOImmOperand<5>";
}
+def oimm8 : oimm<8>;
def oimm12 : oimm<12>;
def oimm16 : oimm<16>;
def nimm12 : nimm<12>;
+def uimm1 : uimm<1>;
+def uimm2 : uimm<2>;
+
+
+def uimm2_jmpix : Operand<i32>,
+ ImmLeaf<i32, "return Imm == 16 || Imm == 24 || Imm == 32 || Imm == 40;"> {
+ let EncoderMethod = "getImmJMPIX";
+ let DecoderMethod = "decodeJMPIXImmOperand";
+}
+
+def uimm3 : uimm<3>;
+def uimm4 : uimm<4>;
def uimm5 : uimm<5>;
+def uimm5_msb_size : uimm<5> {
+ let EncoderMethod = "getImmOpValueMSBSize";
+}
+
+def uimm5_1 : uimm<5, 1>;
+def uimm5_2 : uimm<5, 2>;
+def uimm6 : uimm<6>;
+def uimm7 : uimm<7>;
+def uimm7_1 : uimm<7, 1>;
+def uimm7_2 : uimm<7, 2>;
+def uimm7_3 : uimm<7, 3>;
+def uimm8 : uimm<8>;
+def uimm8_2 : uimm<8, 2>;
+def uimm8_3 : uimm<8, 3>;
+def uimm8_8 : uimm<8, 8>;
+def uimm8_16 : uimm<8, 16>;
+def uimm8_24 : uimm<8, 24>;
def uimm12 : uimm<12>;
def uimm12_1 : uimm<12, 1>;
def uimm12_2 : uimm<12, 2>;
def uimm16 : uimm<16>;
+def uimm16_8 : uimm<16, 8>;
+def uimm16_16 : uimm<16, 16>;
+def uimm20 : uimm<20>;
+def uimm24 : uimm<24>;
+def uimm24_8 : uimm<24, 8>;
+
+def simm8_2 : simm<8, 2>;
+
+class RegSeqAsmOperand<string Suffix = ""> : AsmOperandClass {
+ let Name = "RegSeq"#Suffix;
+ let RenderMethod = "addRegSeqOperands";
+ let DiagnosticType = "InvalidRegSeq";
+ let ParserMethod = "parseRegSeq";
+}
+
+def regseq : Operand<iPTR> {
+ let EncoderMethod = "getRegisterSeqOpValue";
+ let ParserMatchClass = RegSeqAsmOperand<"">;
+ let PrintMethod = "printRegisterSeq";
+ let DecoderMethod = "DecodeRegSeqOperand";
+ let MIOperandInfo = (ops GPR, uimm5);
+}
+def RegListAsmOperand : AsmOperandClass {
+ let Name = "RegList";
+ let RenderMethod = "addRegListOperands";
+ let DiagnosticType = "InvalidRegList";
+ let ParserMethod = "parseRegList";
+}
+
+def reglist : Operand<iPTR> {
+ let ParserMatchClass = RegListAsmOperand;
+ let PrintMethod = "printRegisterList";
+}
+
+def PSRFlag : AsmOperandClass {
+ let Name = "PSRFlag";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = "InvalidPSRFlag";
+ let ParserMethod = "parsePSRFlag";
+}
+
+def psrflag : Operand<i32>, ImmLeaf<i32, "return isShiftedUInt<5, 0>(Imm);"> {
+ let EncoderMethod = "getImmOpValue";
+ let ParserMatchClass = PSRFlag;
+ let PrintMethod = "printPSRFlag";
+}
//===----------------------------------------------------------------------===//
// Instruction Formats
@@ -145,12 +279,33 @@ class TriOpFrag<dag res> : PatFrag<(ops node: $LHS, node:$MHS, node:$RHS), res>;
class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
class UnOpFrag<dag res> : PatFrag<(ops node:$Src), res>;
+def eqToAdd : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs), [{
+ return isOrEquivalentToAdd(N);
+}]>;
+
+def BaseAddr : ComplexPattern<iPTR, 1, "SelectBaseAddr">;
+
+
+//===----------------------------------------------------------------------===//
+// CSKYPseudo
+//===----------------------------------------------------------------------===//
+
+// Pessimistically assume the stack pointer will be clobbered
+let Defs = [R14], Uses = [R14] in {
+def ADJCALLSTACKDOWN : CSKYPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "!ADJCALLSTACKDOWN $amt1, $amt2", [(callseq_start timm:$amt1, timm:$amt2)]>;
+def ADJCALLSTACKUP : CSKYPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "!ADJCALLSTACKUP $amt1, $amt2", [(callseq_end timm:$amt1, timm:$amt2)]>;
+} // Defs = [R14], Uses = [R14]
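These pseudos are matched from the CALLSEQ_START/CALLSEQ_END nodes that call lowering wraps around every call; the producer side is not in this hunk. A hedged sketch of it, with the helper name purely illustrative:

// Illustrative producer for the callseq_start pattern above: call lowering
// emits ISD::CALLSEQ_START, which ISel then matches to ADJCALLSTACKDOWN.
#include "llvm/CodeGen/SelectionDAG.h"

static llvm::SDValue beginCallSequence(llvm::SelectionDAG &DAG,
                                       llvm::SDValue Chain, uint64_t NumBytes,
                                       const llvm::SDLoc &DL) {
  return DAG.getCALLSEQ_START(Chain, NumBytes, /*OutSize=*/0, DL);
}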
//===----------------------------------------------------------------------===//
// Basic ALU instructions.
//===----------------------------------------------------------------------===//
+let Predicates = [iHasE2] in {
+ let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+ let isAdd = 1 in
def ADDI32 : I_12<0x0, "addi32", add, oimm12>;
def SUBI32 : I_12<0x1, "subi32", sub, oimm12>;
def ORI32 : I_16_ZX<"ori32", uimm16,
@@ -171,11 +326,15 @@ class UnOpFrag<dag res> : PatFrag<(ops node:$Src), res>;
(outs GPR:$rz), (ins GPR:$rx, uimm5:$imm5),
[(set GPR:$rz, (rotl GPR:$rx, uimm5:$imm5))]>;
-
+ def ROTRI32 : CSKYPseudo<(outs GPR:$rz), (ins GPR:$rx, oimm5:$imm5),
+ "rotri32 $rz, $rx, $imm5", []>;
+ }
+ let isAdd = 1 in
def ADDU32 : R_YXZ_SP_F1<0x0, 0x1,
BinOpFrag<(add node:$LHS, node:$RHS)>, "addu32", 1>;
def SUBU32 : R_YXZ_SP_F1<0x0, 0x4,
BinOpFrag<(sub node:$LHS, node:$RHS)>, "subu32">;
+
def MULT32 : R_YXZ_SP_F1<0x21, 0x1,
BinOpFrag<(mul node:$LHS, node:$RHS)>, "mult32", 1>;
def AND32 : R_YXZ_SP_F1<0x8, 0x1,
@@ -188,8 +347,16 @@ class UnOpFrag<dag res> : PatFrag<(ops node:$Src), res>;
BinOpFrag<(xor node:$LHS, node:$RHS)>, "xor32", 1>;
def NOR32 : R_YXZ_SP_F1<0x9, 0x4,
BinOpFrag<(not (or node:$LHS, node:$RHS))>, "nor32", 1>;
+ let isCodeGenOnly = 1 in
def NOT32 : R_XXZ<0b001001, 0b00100, (outs GPR:$rz), (ins GPR:$rx),
"not32", [(set GPR:$rz, (not GPR:$rx))]>;
+
+ let Size = 8 in
+ def NEG32 : CSKYPseudo<(outs GPR:$rd), (ins GPR:$rx), "neg32 $rd, $rx", []>;
+
+ let Size = 8 in
+ def RSUBI32 : CSKYPseudo<(outs GPR:$rd), (ins GPR:$rx, uimm12:$imm12), "rsubi32 $rd, $rx, $imm12", []>;
+
def LSL32 : R_YXZ_SP_F1<0x10, 0x1,
BinOpFrag<(shl node:$LHS, node:$RHS)>, "lsl32">;
def LSR32 : R_YXZ_SP_F1<0x10, 0x2,
@@ -199,23 +366,37 @@ class UnOpFrag<dag res> : PatFrag<(ops node:$Src), res>;
def ROTL32 : R_YXZ_SP_F1<0x10, 0x8,
BinOpFrag<(rotl node:$LHS, (and node:$RHS, 0x1f))>, "rotl32">;
- // TODO: Shift series instr. with carry.
+ def BMASKI32 : I_5_Z<0b010100, 0x1, "bmaski32", oimm5, []>;
+ def LSLC32 : I_5_XZ<0x13, 0x1, "lslc32",
+ (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, oimm5:$imm5), []>;
+ def LSRC32 : I_5_XZ<0x13, 0x2, "lsrc32",
+ (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, oimm5:$imm5), []>;
+ def ASRC32 : I_5_XZ<0x13, 0x4, "asrc32",
+ (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, oimm5:$imm5), []>;
+ def XSR32 : I_5_XZ<0x13, 0x8, "xsr32",
+ (outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, oimm5:$imm5, CARRY:$cin), []>;
def IXH32 : R_YXZ_SP_F1<0x2, 0x1,
BinOpFrag<(add node:$LHS, (shl node:$RHS, (i32 1)))>, "ixh32">;
def IXW32 : R_YXZ_SP_F1<0x2, 0x2,
BinOpFrag<(add node:$LHS, (shl node:$RHS, (i32 2)))>, "ixw32">;
-
+ let Predicates = [iHas2E3] in
def IXD32 : R_YXZ_SP_F1<0x2, 0x4,
BinOpFrag<(add node:$LHS, (shl node:$RHS, (i32 3)))>, "ixd32">;
- let isCommutable = 1 in
+ let isCommutable = 1, isAdd = 1 in
def ADDC32 : R_YXZ<0x31, 0x0, 0x2, (outs GPR:$rz, CARRY:$cout),
(ins GPR:$rx, GPR:$ry, CARRY:$cin), "addc32", []>;
def SUBC32 : R_YXZ<0x31, 0x0, 0x8, (outs GPR:$rz, CARRY:$cout),
(ins GPR:$rx, GPR:$ry, CARRY:$cin), "subc32", []>;
- // TODO: incf32.
+ def INCF32 : I_5_ZX<0x3, 0x1, "incf32", uimm5, []>;
+ def INCT32 : I_5_ZX<0x3, 0x2, "inct32", uimm5, []>;
+ def DECF32 : I_5_ZX<0x3, 0x4, "decf32", uimm5, []>;
+ def DECT32 : I_5_ZX<0x3, 0x8, "dect32", uimm5, []>;
+}
+
+let Predicates = [iHas2E3] in {
def DIVS32 : R_YXZ_SP_F1<0x20, 0x2,
BinOpFrag<(sdiv node:$LHS, node:$RHS)>, "divs32">;
def DIVU32 : R_YXZ_SP_F1<0x20, 0x1,
@@ -228,11 +409,35 @@ class UnOpFrag<dag res> : PatFrag<(ops node:$Src), res>;
def DECNE32 : I_5_XZ<0x4, 0x4, "decne32",
(outs GPR:$rz, CARRY:$cout), (ins GPR:$rx, uimm5:$imm5), []>;
- // TODO: s/zext.
- def ZEXT32 : I_5_XZ_U<0x15, (outs GPR:$rz),
- (ins GPR:$rx, uimm5:$msb, uimm5:$lsb), "zext32",[]>;
- def SEXT32 : I_5_XZ_U<0x16, (outs GPR:$rz),
- (ins GPR:$rx, uimm5:$msb, uimm5:$lsb), "sext32", []>;
+ def SEXT32 : I_5_XZ_U<0x16, (outs GPR:$rz), (ins GPR:$rx, uimm5:$msb, uimm5:$lsb), "sext32", []>;
+ let isCodeGenOnly = 1 in {
+ def SEXTB32 : I_5_XZ_US<0x16, 0, 7, "sextb32", sext_inreg, i8>;
+ def SEXTH32 : I_5_XZ_US<0x16, 0, 15, "sexth32", sext_inreg, i16>;
+ def ZEXTB32 : I_5_XZ_UZ<0x15, 0, 7, "zextb32", 255>;
+ def ZEXTH32 : I_5_XZ_UZ<0x15, 0, 15, "zexth32", 65535>;
+ }
+ def ZEXT32 : I_5_XZ_U<0x15, (outs GPR:$rz), (ins GPR:$rx, uimm5:$msb, uimm5:$lsb), "zext32",[]>;
+
+ let Constraints = "$rZ = $rz" in
+ def INS32 : I_5_XZ_INS<0b010111, (outs GPR:$rz), (ins GPR:$rZ, GPR:$rx, uimm5_msb_size:$msb, uimm5:$lsb), "ins32", []>;
+}
+
+let Predicates = [iHas3E3r1] in {
+def MULTS32 : R_YXZ<0x3e, 0x20, 0x10, (outs GPRPair:$rz),
+ (ins GPR:$rx, GPR:$ry), "mul.s32", []>;
+def MULTU32 : R_YXZ<0x3e, 0x20, 0x00, (outs GPRPair:$rz),
+ (ins GPR:$rx, GPR:$ry), "mul.u32", []>;
+
+let Constraints = "$rZ = $rz" in {
+def MULATS32 : R_YXZ<0x3e, 0x20, 0x14, (outs GPRPair:$rZ),
+ (ins GPRPair:$rz, GPR:$rx, GPR:$ry), "mula.s32", []>;
+def MULATU32 : R_YXZ<0x3e, 0x20, 0x04, (outs GPRPair:$rZ),
+ (ins GPRPair:$rz, GPR:$rx, GPR:$ry), "mula.u32", []>;
+}
+}
+
+def MULSH32 : R_YXZ<0x31, 0b100100, 0b00001, (outs GPR:$rz),
+ (ins GPR:$rx, GPR:$ry), "mulsh32", []>;
//===----------------------------------------------------------------------===//
// Load & Store instructions.
@@ -242,18 +447,35 @@ def LD32B : I_LD<AddrMode32B, 0x0, "ld32.b", uimm12>;
def LD32H : I_LD<AddrMode32H, 0x1, "ld32.h", uimm12_1>;
def LD32W : I_LD<AddrMode32WD, 0x2, "ld32.w", uimm12_2>;
+let OutOperandList = (outs GPRPair:$rz) in
+def LD32D : I_LD<AddrMode32WD, 0x3, "ld32.d", uimm12_2>;
+let Predicates = [iHasE2] in {
def LD32BS : I_LD<AddrMode32B, 0x4, "ld32.bs", uimm12>;
def LD32HS : I_LD<AddrMode32H, 0x5, "ld32.hs", uimm12_1>;
- // TODO: LDM and STM.
+ def LDM32 : I_5_YX<0b110100, 0b000111,
+ (outs), (ins GPR:$rx, regseq:$regs, variable_ops), "ldm32\t$regs, (${rx})", []>;
+ def STM32 : I_5_YX<0b110101, 0b000111,
+ (outs), (ins GPR:$rx, regseq:$regs, variable_ops), "stm32\t$regs, (${rx})", []>;
+ let Size = 4, isCodeGenOnly = 0 in {
+ def LDQ32 : CSKYPseudo<(outs), (ins GPR:$rx, regseq:$regs, variable_ops),
+ "ldq32\t$regs, (${rx})", []>;
+ def STQ32 : CSKYPseudo<(outs), (ins GPR:$rx, regseq:$regs, variable_ops),
+ "stq32\t$regs, (${rx})", []>;
+ }
+
+}
def ST32B : I_ST<AddrMode32B, 0x0, "st32.b", uimm12>;
def ST32H : I_ST<AddrMode32H, 0x1, "st32.h", uimm12_1>;
def ST32W : I_ST<AddrMode32WD, 0x2, "st32.w", uimm12_2>;
+let InOperandList = (ins GPRPair:$rz, GPR:$rx, uimm12_2:$imm12 ) in
+def ST32D : I_ST<AddrMode32WD, 0x3, "st32.d", uimm12_2>;
+let Predicates = [iHas2E3] in {
def LDR32B : I_LDR<0x0, "ldr32.b">;
def LDR32BS : I_LDR<0x4, "ldr32.bs">;
def LDR32H : I_LDR<0x1, "ldr32.h">;
@@ -262,42 +484,100 @@ def ST32W : I_ST<AddrMode32WD, 0x2, "st32.w", uimm12_2>;
def STR32B : I_STR<0x0, "str32.b">;
def STR32H : I_STR<0x1, "str32.h">;
def STR32W : I_STR<0x2, "str32.w">;
+}
+
+// Indicate that we're spilling the CR (carry) register, so we'll need to
+// scavenge a GPR to stage its value.
+let mayStore = 1 in {
+def SPILL_CARRY : CSKYPseudo<(outs), (ins CARRY:$cond, GPR:$rx, uimm12_2:$imm),
+ "!SPILL_CARRY $cond, $rx, $imm", []>;
+}
+
+// Indicate that we're restoring the CR register (previously
+// spilled), so we'll need to scavenge a register for it.
+let mayLoad = 1 in {
+def RESTORE_CARRY : CSKYPseudo<(outs CARRY:$cond), (ins GPR:$rx, uimm12_2:$imm),
+ "!RESTORE_CARRY $cond, $rx, $imm", []>;
+}
- //TODO: SPILL_CARRY and RESTORE_CARRY.
+let mayStore = 1 in {
+def STORE_PAIR : CSKYPseudo<(outs), (ins GPRPair:$rz, GPR:$rx, uimm12_2:$imm),
+ "!STORE_PAIR $rz, $rx, $imm", []>;
+}
+
+let mayLoad = 1 in {
+def LOAD_PAIR : CSKYPseudo<(outs GPRPair:$rz), (ins GPR:$rx, uimm12_2:$imm),
+ "!LOAD_PAIR $rz, $rx, $imm", []>;
+}
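As the comments above note, SPILL_CARRY and RESTORE_CARRY move the carry bit through memory, which means a GPR has to be scavenged to stage the value (mvc32/mvcv32 paired with an ordinary st32.w/ld32.w). The expansion itself is not in this hunk; one hypothetical way the scratch register could be obtained during frame-index elimination:

// Sketch only: obtaining the staging GPR that the carry spill/restore pseudos
// need. Where and how the expansion runs is an assumption, not this patch.
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

static llvm::Register
scavengeCarryScratch(llvm::RegScavenger &RS,
                     llvm::MachineBasicBlock::iterator MI,
                     const llvm::TargetRegisterClass &GPRClass) {
  // Picks a GPR that is free at MI (spilling one if necessary); the caller
  // then copies the carry through it with mvc32/mvcv32 and st32.w/ld32.w.
  return RS.scavengeRegister(&GPRClass, MI, /*SPAdj=*/0);
}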
//===----------------------------------------------------------------------===//
// Compare instructions.
//===----------------------------------------------------------------------===//
-
+let Predicates = [iHasE2] in {
def CMPNEI32 : I_16_X<0x1A, "cmpnei32", uimm16>;
def CMPHSI32 : I_16_X<0x18, "cmphsi32", oimm16>;
def CMPLTI32 : I_16_X<0x19, "cmplti32", oimm16>;
-
-
+ def CMPLEI32 : CSKYPseudo<(outs CARRY:$ca), (ins GPR:$rx, uimm16:$imm16),
+ "cmplei32\t$rx, $imm16", []>;
+}
+let Predicates = [iHas2E3] in {
def CMPNE32 : R_YX<0x1, 0x4, "cmpne32">;
def CMPHS32 : R_YX<0x1, 0x1, "cmphs32">;
def CMPLT32 : R_YX<0x1, 0x2, "cmplt32">;
- // TODO: setc and clrc.
- // TODO: test32 and tstnbz.
+ def SETC32 : CSKY32Inst<AddrModeNone, 0x31,
+ (outs CARRY:$ca), (ins), "setc32", []> {
+ let Inst{25 - 21} = 0; //rx
+ let Inst{20 - 16} = 0; //ry
+ let Inst{15 - 10} = 0x1;
+ let Inst{9 - 5} = 0x1;
+ let Inst{4 - 0} = 0;
+ let isCompare = 1;
+ }
+ def CLRC32 : CSKY32Inst<AddrModeNone, 0x31,
+ (outs CARRY:$ca), (ins), "clrc32", []> {
+ let Inst{25 - 21} = 0; //rx
+ let Inst{20 - 16} = 0; //ry
+ let Inst{15 - 10} = 0x1;
+ let Inst{9 - 5} = 0x4;
+ let Inst{4 - 0} = 0;
+ let isCompare = 1;
+ }
+
+ def TST32 : R_YX<0x8, 0x4, "tst32">;
+ def TSTNBZ32 : R_X<0x8, 0x8,
+ (outs CARRY:$ca), (ins GPR:$rx), "tstnbz32", []>;
+}
//===----------------------------------------------------------------------===//
// Data move instructions.
//===----------------------------------------------------------------------===//
+let Predicates= [iHasE2] in {
+ let isCodeGenOnly = 1 in {
def MOVT32 : R_ZX<0x3, 0x2, "movt32", []>;
def MOVF32 : R_ZX<0x3, 0x1, "movf32", []>;
+ }
def MOVI32 : I_16_MOV<0x10, "movi32", uimm16>;
+ let Size = 4, isCodeGenOnly = 0 in
+ def BGENI : CSKYPseudo<(outs GPR:$dst), (ins uimm5:$imm), "bgeni\t$dst, $imm", []>;
+ def : InstAlias<"bgeni16 $dst, $imm", (BGENI GPR:$dst, uimm5:$imm)>;
+ def : InstAlias<"bgeni32 $dst, $imm", (BGENI GPR:$dst, uimm5:$imm)>;
def MOVIH32 : I_16_MOV<0x11, "movih32", uimm16_16_xform>;
def MVC32 : R_Z_1<0x1, 0x8, "mvc32">;
+ let isCodeGenOnly = 1 in
def MOV32 : R_XZ<0x12, 0x1, "mov32">;
- // TODO: ISEL Pseudo.
+ let usesCustomInserter = 1 in
+ def ISEL32 : CSKYPseudo<(outs GPR:$dst), (ins CARRY:$cond, GPR:$src1, GPR:$src2),
+ "!isel32\t$dst, $src1, src2", [(set GPR:$dst, (select CARRY:$cond, GPR:$src1, GPR:$src2))]>;
+}
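ISEL32 sets usesCustomInserter, so the select is expanded after instruction selection into a small branch diamond that ends in a PHI. The real EmitInstrWithCustomInserter code is not in this hunk; the sketch below shows only the control-flow surgery such an expansion typically performs, and deliberately leaves out the BT32 branch and the PHI construction because their exact operand order is not visible here.

// Illustrative CFG surgery for a select-on-carry pseudo such as ISEL32.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include <iterator>

static llvm::MachineBasicBlock *splitForSelect(llvm::MachineInstr &MI,
                                               llvm::MachineBasicBlock *BB) {
  using namespace llvm;
  // HeadBB:  conditional branch over FalseBB when the carry picks src1.
  // FalseBB: empty block supplying the edge that carries src2.
  // TailBB:  dst = PHI(src1 from HeadBB, src2 from FalseBB), rest of HeadBB.
  const BasicBlock *LLVMBB = BB->getBasicBlock();
  MachineFunction *MF = BB->getParent();
  MachineFunction::iterator It = ++BB->getIterator();

  MachineBasicBlock *FalseBB = MF->CreateMachineBasicBlock(LLVMBB);
  MachineBasicBlock *TailBB = MF->CreateMachineBasicBlock(LLVMBB);
  MF->insert(It, FalseBB);
  MF->insert(It, TailBB);

  // Everything after the select moves into the tail block.
  TailBB->splice(TailBB->begin(), BB,
                 std::next(MachineBasicBlock::iterator(MI)), BB->end());
  TailBB->transferSuccessorsAndUpdatePHIs(BB);

  BB->addSuccessor(FalseBB);
  BB->addSuccessor(TailBB);
  FalseBB->addSuccessor(TailBB);

  // BuildMI(...) calls for the conditional branch and the PHI would go here.
  MI.eraseFromParent();
  return TailBB;
}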
+let Predicates = [iHas2E3] in {
def MVCV32 : R_Z_1<0x1, 0x10, "mvcv32">;
- // TODO: clrf and clrt.
def CLRF32 : R_Z_2<0xB, 0x1, "clrf32", []>;
def CLRT32 : R_Z_2<0xB, 0x2, "clrt32", []>;
+}
//===----------------------------------------------------------------------===//
// Branch and call instructions.
@@ -309,12 +589,12 @@ let isBranch = 1, isTerminator = 1 in {
[(br bb:$imm16)]>;
def BT32 : I_16_L<0x3, (outs), (ins CARRY:$ca, br_symbol:$imm16),
- "bt32\t$imm16", [(brcond CARRY:$ca, bb:$imm16)]>;
+ "bt32\t$imm16", [(brcond CARRY:$ca, bb:$imm16)]>, Requires<[iHasE2]>;
def BF32 : I_16_L<0x2, (outs), (ins CARRY:$ca, br_symbol:$imm16),
- "bf32\t$imm16", []>;
+ "bf32\t$imm16", []>, Requires<[iHasE2]>;
}
-
+let Predicates = [iHas2E3] in {
def BEZ32 : I_16_X_L<0x8, "bez32", br_symbol>;
def BNEZ32 : I_16_X_L<0x9, "bnez32", br_symbol>;
def BHZ32 : I_16_X_L<0xA, "bhz32", br_symbol>;
@@ -334,10 +614,25 @@ let isBranch = 1, isTerminator = 1 in {
let isCall = 1, Defs = [ R15 ] , mayLoad = 1 in
def JSRI32: I_16_L<0x17, (outs),
(ins constpool_symbol:$imm16), "jsri32\t$imm16", []>;
+}
+def BNEZAD32 : CSKY32Inst<AddrModeNone, 0x3a,
+ (outs GPR:$rx_u), (ins GPR:$rx, br_symbol:$imm16), "bnezad32\t$rx, $imm16", []> {
+ bits<5> rx;
+ bits<16> imm16;
+ let Inst{25 - 21} = 0x1;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 0} = imm16;
+ let isBranch = 1;
+ let isTerminator = 1;
+ let Constraints = "$rx_u = $rx";
+ let Predicates = [iHas2E3, iHas10E60];
+}
def BSR32 : J<0x38, (outs), (ins call_symbol:$offset), "bsr32", []>;
+def : InstAlias<"bsr $dst", (BSR32 call_symbol:$dst)>;
+
def BSR32_BR : J<0x38, (outs), (ins call_symbol:$offset), "bsr32", []>{
let isCodeGenOnly = 1;
let isBranch = 1;
@@ -347,27 +642,310 @@ def BSR32_BR : J<0x38, (outs), (ins call_symbol:$offset), "bsr32", []>{
let Defs = [ R15 ];
}
-
+let Predicates = [iHasE2], isCodeGenOnly = 1 in {
def RTS32 : I_16_RET<0x6, 0xF, "rts32", [(CSKY_RET)]>;
+}
-def RTE32 : I_16_RET_I<0, 0, "rte32", []>;
-
//===----------------------------------------------------------------------===//
// Symbol address instructions.
//===----------------------------------------------------------------------===//
+def data_symbol_b : data_symbol<"CSKY::fixup_csky_doffset_imm18", 0>;
+def data_symbol_h : data_symbol<"CSKY::fixup_csky_doffset_imm18_scale2", 1>;
+def data_symbol_w : data_symbol<"CSKY::fixup_csky_doffset_imm18_scale4", 2> {
+ let ParserMatchClass = DataAsmClass;
+}
+
+let Predicates = [iHas2E3] in {
+
def GRS32 : I_18_Z_L<0x3, "grs32\t$rz, $offset",
(outs GPR:$rz), (ins bare_symbol:$offset), []>;
+def : InstAlias<"grs\t$rz, $offset", (GRS32 GPR:$rz, bare_symbol:$offset)>;
+
+let Uses = [R28] in {
+def LRS32B : I_18_Z_L<0x0, "lrs32.b\t$rz, $offset",
+ (outs GPR:$rz), (ins data_symbol_b:$offset), []>;
+def LRS32H : I_18_Z_L<0x1, "lrs32.h\t$rz, $offset",
+ (outs GPR:$rz), (ins data_symbol_h:$offset), []>;
+def LRS32W : I_18_Z_L<0x2, "lrs32.w\t$rz, $offset",
+ (outs GPR:$rz), (ins data_symbol_w:$offset), []>;
+def SRS32B : I_18_Z_L<0x4, "srs32.b\t$rz, $offset",
+ (outs), (ins GPR:$rz, data_symbol_b:$offset), []>;
+def SRS32H : I_18_Z_L<0x5, "srs32.h\t$rz, $offset",
+ (outs), (ins GPR:$rz, data_symbol_h:$offset), []>;
+def SRS32W : I_18_Z_L<0x6, "srs32.w\t$rz, $offset",
+ (outs), (ins GPR:$rz, data_symbol_w:$offset), []>;
+}
+
+def PUSH32 : I_12_PP<0b11111, 0b00000, (outs), (ins reglist:$regs, variable_ops), "push32 $regs">;
+
+let Uses = [R14, R15], isReturn = 1, isTerminator = 1, isBarrier = 1 in
+def POP32 : I_12_PP<0b11110, 0b00000, (outs), (ins reglist:$regs, variable_ops), "pop32 $regs">;
+
+}
let mayLoad = 1, mayStore = 0 in {
def LRW32 : I_16_Z_L<0x14, "lrw32", (ins constpool_symbol:$imm16), []>;
let isCodeGenOnly = 1 in
-def LRW32_Gen : I_16_Z_L<0x14, "lrw32",
- (ins bare_symbol:$src1, constpool_symbol:$imm16), []>;
+def LRW32_Gen : I_16_Z_L<0x14, "lrw32", (ins bare_symbol:$src1, constpool_symbol:$imm16), []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Atomic and fence instructions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [iHasMP1E2] in {
+ def BRWARW : BAR<0b01111, "bar.brwarw", 0>;
+ def BRWARWS : BAR<0b01111, "bar.brwarws", 1>;
+ def BRARW : BAR<0b00111, "bar.brarw", 0>;
+ def BRARWS : BAR<0b00111, "bar.brarws", 1>;
+ def BRWAW : BAR<0b01110, "bar.brwaw", 0>;
+ def BRWAWS : BAR<0b01110, "bar.brwaws", 1>;
+ def BRAR : BAR<0b00101, "bar.brar", 0>;
+ def BRARS : BAR<0b00101, "bar.brars", 1>;
+ def BWAW : BAR<0b01010, "bar.bwaw", 0>;
+ def BWAWS : BAR<0b01010, "bar.bwaws", 1>;
+
+ def LDEX32W : I_LD<AddrMode32WD, 0x7, "ldex32.w", uimm12_2>;
+ let Constraints = "$rd = $rz" in
+ def STEX32W : I_LDST<AddrMode32WD, 0x37, 7,
+ (outs GPR:$rd), (ins GPR:$rz, GPR:$rx, uimm12_2:$imm12), "stex32.w", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Other operation instructions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [iHas2E3] in {
+ def BREV32 : R_XZ<0x18, 0x10, "brev32">;
+ def ABS32 : R_XZ<0x0, 0x10, "abs32">;
+ def BGENR32 : R_XZ<0x14, 0x2, "bgenr32">;
+}
+
+let Predicates = [iHasE2] in {
+ def REVB32 : R_XZ<0x18, 0x4, "revb32">;
+ def REVH32 : R_XZ<0x18, 0x8, "revh32">;
+ def FF0 : R_XZ<0x1F, 0x1, "ff0.32">;
+ def FF1 : R_XZ<0x1F, 0x2, "ff1.32">;
+ def XTRB0 : R_XZ<0x1C, 0x1, "xtrb0.32">;
+ def XTRB1 : R_XZ<0x1C, 0x2, "xtrb1.32">;
+ def XTRB2 : R_XZ<0x1C, 0x4, "xtrb2.32">;
+ def XTRB3 : R_XZ<0x1C, 0x8, "xtrb3.32">;
+ def BTSTI32 : I_5_X<0x0A, 0x4, "btsti32", uimm5, []>;
+ def BCLRI32 : I_5_XZ<0xA, 0x1, "bclri32",
+ (outs GPR:$rz), (ins GPR:$rx, uimm5:$imm5), []>;
+ def BSETI32 : I_5_XZ<0xA, 0x2, "bseti32",
+ (outs GPR:$rz), (ins GPR:$rx, uimm5:$imm5), []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Special instructions.
+//===----------------------------------------------------------------------===//
+
+def MFFCR : CSKY32Inst<AddrModeNone, 0x30,
+ (outs GPR:$rx), (ins), "mfcr\t$rx, fcr", []> {
+ bits<5> rx;
+
+ let Inst{25 - 21} = 0b00010;
+ let Inst{20 - 16} = 0b00001;
+ let Inst{15 - 10} = 0b011000;
+ let Inst{9 - 5} = 0b00001;
+ let Inst{4 - 0} = rx;
+ let hasSideEffects = 1;
+ let isCodeGenOnly = 1;
+}
+
+def MTFCR : CSKY32Inst<AddrModeNone, 0x30,
+ (outs), (ins GPR:$rx), "mtcr\t$rx, fcr", []> {
+ bits<5> rx;
+
+ let Inst{25 - 21} = 0b00010;
+ let Inst{20 - 16} = rx;
+ let Inst{15 - 10} = 0b011001;
+ let Inst{9 - 5} = 0b00001;
+ let Inst{4 - 0} = 0b00001;
+ let hasSideEffects = 1;
+ let isCodeGenOnly = 1;
+}
+
+def SYNC32 : I_5_IMM5<0x30, 0b000001, 0b00001, "sync32", uimm5, []>;
+
+def SYNC0_32 : CSKY32Inst<AddrModeNone, 0x30, (outs), (ins),
+ "sync32", []> {
+ let Inst{25 - 21} = 0;
+ let Inst{20 - 16} = 0;
+ let Inst{15 - 10} = 0b000001;
+ let Inst{9 - 5} = 0b00001;
+ let Inst{4 - 0} = 0;
+}
+
+def SYNC_32_I : CSKY32Inst<AddrModeNone, 0x30, (outs), (ins),
+ "sync32.i", []> {
+ let Inst{25 - 21} = 1;
+ let Inst{20 - 16} = 0;
+ let Inst{15 - 10} = 0b000001;
+ let Inst{9 - 5} = 0b00001;
+ let Inst{4 - 0} = 0;
+}
+
+def SYNC_32_S : CSKY32Inst<AddrModeNone, 0x30, (outs), (ins),
+ "sync32.s", []> {
+ let Inst{25 - 21} = 0b10000;
+ let Inst{20 - 16} = 0;
+ let Inst{15 - 10} = 0b000001;
+ let Inst{9 - 5} = 0b00001;
+ let Inst{4 - 0} = 0;
+}
+
+def SYNC_32_IS : CSKY32Inst<AddrModeNone, 0x30, (outs), (ins),
+ "sync32.is", []> {
+ let Inst{25 - 21} = 0b10001;
+ let Inst{20 - 16} = 0;
+ let Inst{15 - 10} = 0b000001;
+ let Inst{9 - 5} = 0b00001;
+ let Inst{4 - 0} = 0;
}
-// TODO: Atomic and fence instructions.
-// TODO: Other operations.
-// TODO: Special instructions.
-// TODO: Pseudo for assembly.
+let Predicates = [iHas2E3] in {
+ def RFI32 : I_5_XZ_PRIVI<0x11, 0x1, "rfi32">;
+ def SCE32 : I_5_IMM5<0x30, 0b000110, 0b00001, "sce32", uimm4, []>;
+}
+let Predicates = [HasExtendLrw] in
+def IDLY32 : I_5_IMM5<0x30, 0b000111, 0b00001, "idly32", imm5_idly, []>;
+def STOP32 : I_5_XZ_PRIVI<0x12, 0x1, "stop32">;
+def WAIT32 : I_5_XZ_PRIVI<0x13, 0x1, "wait32">;
+def DOZE32 : I_5_XZ_PRIVI<0x14, 0x1, "doze32">;
+def WE32 : I_5_XZ_PRIVI<0b010101, 0x1, "we32">;
+def SE32 : I_5_XZ_PRIVI<0b010110, 0x1, "se32">;
+def WSC32 : I_5_XZ_PRIVI<0b001111, 0x1, "wsc32">;
+
+def CPOP32 : I_CPOP<(outs), (ins uimm5:$cpid, uimm20:$usdef), "cpop32 <$cpid, ${usdef}>">;
+def CPRC32 : I_CP<0b0100, (outs CARRY:$ca), (ins uimm5:$cpid, uimm12:$usdef), "cprc32 <$cpid, ${usdef}>">;
+def CPRCR32 : I_CP_Z<0b0010, (outs GPR:$rz), (ins uimm5:$cpid, uimm12:$usdef), "cprcr32 $rz, <$cpid, ${usdef}>">;
+def CPRGR32 : I_CP_Z<0b0000, (outs GPR:$rz), (ins uimm5:$cpid, uimm12:$usdef), "cprgr32 $rz, <$cpid, ${usdef}>">;
+def CPWCR32 : I_CP_Z<0b0011, (outs), (ins GPR:$rz, uimm5:$cpid, uimm12:$usdef), "cpwcr32 $rz, <$cpid, ${usdef}>">;
+def CPWGR32 : I_CP_Z<0b0001, (outs), (ins GPR:$rz, uimm5:$cpid, uimm12:$usdef), "cpwgr32 $rz, <$cpid, ${usdef}>">;
+
+let Predicates = [iHas3r2E3r3] in {
+def DCACHE_IALL32 : I_5_CACHE<0b100101, 0b01000, "dcache32.iall">;
+def DCACHE_CALL32 : I_5_CACHE<0b100101, 0b00100, "dcache32.call">;
+def DCACHE_CIALL32 : I_5_CACHE<0b100101, 0b01100, "dcache32.ciall">;
+def DCACHE_IVA32 : I_5_X_CACHE<0b100101, 0b01011, "dcache32.iva">;
+def DCACHE_ISW32: I_5_X_CACHE<0b100101, 0b01010, "dcache32.isw">;
+def DCACHE_CVA32 : I_5_X_CACHE<0b100101, 0b00111, "dcache32.cva">;
+def DCACHE_CVAL32 : I_5_X_CACHE<0b100101, 0b10111, "dcache32.cval1">;
+def DCACHE_CSW32 : I_5_X_CACHE<0b100101, 0b00110, "dcache32.csw">;
+def DCACHE_CIVA32 : I_5_X_CACHE<0b100101, 0b01111, "dcache32.civa">;
+def DCACHE_CISW32 : I_5_X_CACHE<0b100101, 0b01110, "dcache32.cisw">;
+
+def ICACHE_IALL32 : I_5_CACHE<0b100100, 0b01000, "icache32.iall">;
+def ICACHE_IALLS32 : I_5_CACHE<0b100100, 0b11000, "icache32.ialls">;
+def ICACHE_IVA32 : I_5_X_CACHE<0b100100, 0b01011, "icache32.iva">;
+
+def TLBI_VAA32 : I_5_X_CACHE<0b100010, 0b00010, "tlbi32.vaa">;
+def TLBI_VAAS32 : I_5_X_CACHE<0b100010, 0b10010, "tlbi32.vaas">;
+def TLBI_ASID32 : I_5_X_CACHE<0b100010, 0b00001, "tlbi32.asid">;
+def TLBI_ASIDS32 : I_5_X_CACHE<0b100010, 0b10001, "tlbi32.asids">;
+def TLBI_VA32 : I_5_X_CACHE<0b100010, 0b00011, "tlbi32.va">;
+def TLBI_VAS32 : I_5_X_CACHE<0b100010, 0b10011, "tlbi32.vas">;
+def TLBI_ALL32 : I_5_CACHE<0b100010, 0b00000, "tlbi32.all">;
+def TLBI_ALLS32 : I_5_CACHE<0b100010, 0b10000, "tlbi32.alls">;
+
+def L2CACHE_IALL : I_5_CACHE<0b100110, 0b01000, "l2cache.iall">;
+def L2CACHE_CALL : I_5_CACHE<0b100110, 0b00100, "l2cache.call">;
+def L2CACHE_CIALL : I_5_CACHE<0b100110, 0b01100, "l2cache.ciall">;
+}
+
+def PLDR32 :I_PLDR<AddrMode32WD, 0x36, 0b0110, (outs), (ins GPR:$rx, uimm12_2:$imm12), "pldr32", []>;
+def PLDW32 :I_PLDR<AddrMode32WD, 0x37, 0b0110, (outs), (ins GPR:$rx, uimm12_2:$imm12), "pldw32", []>;
+
+def TRAP32 : CSKY32Inst<AddrModeNone, 0x30, (outs), (ins uimm2:$imm2), "trap32 ${imm2}", []> {
+ bits<2> imm2;
+
+ let Inst{25 - 21} = 0;
+ let Inst{20 - 16} = 0;
+ let Inst{15 - 12} = 0b0010;
+ let Inst{11 - 10} = imm2;
+ let Inst{9 - 5} = 0b00001;
+ let Inst{4 - 0} = 0;
+
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pseudo for assembly
+//===----------------------------------------------------------------------===//
+
+let isCall = 1, Defs = [ R15 ], mayLoad = 1, Size = 4, isCodeGenOnly = 0 in
+def JBSR32 : CSKYPseudo<(outs), (ins call_symbol:$src1), "jbsr32\t$src1", []>;
+
+def : InstAlias<"jbsr\t$src1", (JBSR32 call_symbol:$src1)>;
+
+def JBR32 : CSKYPseudo<(outs), (ins br_symbol:$src1), "jbr32\t$src1", []> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isIndirectBranch = 1;
+ let mayLoad = 1;
+ let Size = 4;
+}
+
+def JBT32 : CSKYPseudo<(outs), (ins CARRY:$ca, br_symbol:$src1), "jbt32\t$src1", []> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let isIndirectBranch = 1;
+ let mayLoad = 1;
+ let Size = 4;
+}
+
+def JBF32 : CSKYPseudo<(outs), (ins CARRY:$ca, br_symbol:$src1), "jbf32\t$src1", []> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let isIndirectBranch = 1;
+ let mayLoad = 1;
+ let Size = 4;
+}
+
+def JBT_E : CSKYPseudo<(outs), (ins CARRY:$ca, br_symbol:$src1), "!jbt_e\t$src1", []> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let isIndirectBranch = 1;
+ let mayLoad = 1;
+ let Size = 6;
+}
+
+def JBF_E : CSKYPseudo<(outs), (ins CARRY:$ca, br_symbol:$src1), "!jbf_e\t$src1", []> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let isIndirectBranch = 1;
+ let mayLoad = 1;
+ let Size = 6;
+}
+
+let mayLoad = 1, Size = 2, isCodeGenOnly = 0 in
+def PseudoLRW32 : CSKYPseudo<(outs GPR:$rz), (ins bare_symbol:$src), "lrw32 $rz, $src", []>;
+
+
+def : InstAlias<"lrw $rz, $src", (PseudoLRW32 GPR:$rz, bare_symbol:$src)>;
+def : InstAlias<"lrw $rz, $src", (LRW32 GPR:$rz, constpool_symbol:$src)>;
+
+let mayLoad = 1, Size = 4, isCodeGenOnly = 0 in
+def PseudoJSRI32 : CSKYPseudo<(outs), (ins call_symbol:$src), "jsri32 $src", []>;
+def : InstAlias<"jsri $dst", (PseudoJSRI32 call_symbol:$dst)>;
+def : InstAlias<"jsri $dst", (JSRI32 constpool_symbol:$dst)>;
+
+let mayLoad = 1, Size = 4, isCodeGenOnly = 0 in
+def PseudoJMPI32 : CSKYPseudo<(outs), (ins br_symbol:$src), "jmpi32 $src", []>;
+def : InstAlias<"jmpi $dst", (PseudoJMPI32 br_symbol:$dst)>;
+def : InstAlias<"jmpi $dst", (JMPI32 constpool_symbol:$dst)>;
+
+let isNotDuplicable = 1, mayLoad = 1, mayStore = 0, Size = 8 in
+def PseudoTLSLA32 : CSKYPseudo<(outs GPR:$dst1, GPR:$dst2),
+ (ins constpool_symbol:$src, i32imm:$label), "!tlslrw32\t$dst1, $dst2, $src, $label", []>;
+
+let hasSideEffects = 0, isNotDuplicable = 1 in
+def CONSTPOOL_ENTRY : CSKYPseudo<(outs),
+ (ins i32imm:$instid, i32imm:$cpidx, i32imm:$size), "", []>;
+
+include "CSKYInstrInfo16Instr.td"
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td
new file mode 100644
index 000000000000..c98f43622155
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td
@@ -0,0 +1,452 @@
+//===-- CSKYInstrInfo16Instr.td - CSKY 16-bit Instruction --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the CSKY 16-bit instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// CSKY specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+// Target-dependent nodes.
+def CSKY_NIE : SDNode<"CSKYISD::NIE", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
+def CSKY_NIR : SDNode<"CSKYISD::NIR", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
+
+//===----------------------------------------------------------------------===//
+// Operand and SDNode transformation definitions.
+//===----------------------------------------------------------------------===//
+
+def br_symbol_16bit : Operand<iPTR> {
+ let EncoderMethod =
+ "getBranchSymbolOpValue<CSKY::fixup_csky_pcrel_imm10_scale2>";
+ let ParserMatchClass = CSKYSymbol;
+ let DecoderMethod = "decodeSImmOperand<10, 1>";
+ let PrintMethod = "printCSKYSymbolOperand";
+ let OperandType = "OPERAND_PCREL";
+}
+
+def SPOperand : AsmOperandClass {
+ let Name = "SPOperand";
+ let RenderMethod = "addRegOperands";
+ let DiagnosticType = !strconcat("Invalid", Name);
+}
+
+def SPOp : RegisterOperand<GPR> {
+ let ParserMatchClass = SPOperand;
+}
+
+def constpool_symbol_16bit : Operand<iPTR> {
+ let ParserMatchClass = Constpool;
+ let EncoderMethod =
+ "getConstpoolSymbolOpValue<CSKY::fixup_csky_pcrel_uimm7_scale4>";
+ let DecoderMethod = "decodeLRW16Imm8";
+ let PrintMethod = "printConstpool";
+ let OperandType = "OPERAND_PCREL";
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Formats
+//===----------------------------------------------------------------------===//
+
+include "CSKYInstrFormats16Instr.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Basic ALU instructions.
+//===----------------------------------------------------------------------===//
+
+let isCommutable = 1, isAdd = 1 in
+ def ADDU16 : R16_XYZ<0, "addu16", add>;
+let Pattern = [(set mGPR:$rz, (sub mGPR:$rx, mGPR:$ry))] in
+ def SUBU16 : R16_XYZ<1, "subu16", sub>;
+
+let isCommutable = 1, isAdd = 1 in
+ def ADDC16 : R16_XZ_BINOP_C<0b1000, 0b01, "addc16">;
+def SUBC16 : R16_XZ_BINOP_C<0b1000, 0b11, "subc16">;
+
+let isCommutable = 1 in {
+ let isAdd = 1 in
+ def ADDU16XZ : R16_XZ_BINOP<0b1000, 0b00, "addu16", BinOpFrag<(add node:$LHS, node:$RHS)>>;
+ def AND16 : R16_XZ_BINOP<0b1010, 0b00, "and16", BinOpFrag<(and node:$LHS, node:$RHS)>>;
+ def OR16 : R16_XZ_BINOP<0b1011, 0b00, "or16", BinOpFrag<(or node:$LHS, node:$RHS)>>;
+ def XOR16 : R16_XZ_BINOP<0b1011, 0b01, "xor16", BinOpFrag<(xor node:$LHS, node:$RHS)>>;
+ def NOR16 : R16_XZ_BINOP<0b1011, 0b10, "nor16", BinOpFrag<(not (or node:$LHS, node:$RHS))>>;
+ let isCodeGenOnly = 1 in
+ def NOT16 : R16_XZ_UNOP<0b1011, 0b10, "not16">;
+ def MULT16 : R16_XZ_BINOP<0b1111, 0b00, "mult16", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+}
+def SUBU16XZ : R16_XZ_BINOP<0b1000, 0b10, "subu16", BinOpFrag<(sub node:$LHS, node:$RHS)>>;
+def ANDN16 : R16_XZ_BINOP<0b1010, 0b01, "andn16", BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+def LSL16 : R16_XZ_BINOP<0b1100, 0b00, "lsl16", BinOpFrag<(shl node:$LHS, node:$RHS)>>;
+def LSR16 : R16_XZ_BINOP<0b1100, 0b01, "lsr16", BinOpFrag<(srl node:$LHS, node:$RHS)>>;
+def ASR16 : R16_XZ_BINOP<0b1100, 0b10, "asr16", BinOpFrag<(sra node:$LHS, node:$RHS)>>;
+def ROTL16 : R16_XZ_BINOP<0b1100, 0b11, "rotl16", BinOpFrag<(rotl node:$LHS, (and node:$RHS, 0x1f))>>;
+
+def MULSH16 : R16_XZ_BINOP_NOPat<0b1111, 0b01, "mulsh16">;
+
+def ZEXTB16 : R16_XZ_UNOP<0b1101, 0b00, "zextb16">;
+def ZEXTH16 : R16_XZ_UNOP<0b1101, 0b01, "zexth16">;
+def SEXTB16 : R16_XZ_UNOP<0b1101, 0b10, "sextb16">;
+def SEXTH16 : R16_XZ_UNOP<0b1101, 0b11, "sexth16">;
+
+let Constraints = "$rZ = $rz", isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+ let isAdd = 1, Pattern = [(set mGPR:$rz, (add mGPR:$rZ, oimm8:$imm8))] in
+ def ADDI16 : I16_Z_8<0b100, (ins mGPR:$rZ, oimm8:$imm8), "addi16\t$rz, $imm8">;
+ let Pattern = [(set mGPR:$rz, (sub mGPR:$rZ, oimm8:$imm8))] in
+ def SUBI16 : I16_Z_8<0b101, (ins mGPR:$rZ, oimm8:$imm8), "subi16\t$rz, $imm8">;
+}
+
+let isAdd = 1 in
+def ADDI16ZSP : I16_Z_8<0b011, (ins SPOp:$sp, uimm8_2:$imm8),
+ "addi16\t$rz, $sp, $imm8">;
+
+let isAdd = 1 in
+def ADDI16SPSP : I16_SP_IMM7<0b000,"addi16">;
+def SUBI16SPSP : I16_SP_IMM7<0b001,"subi16">;
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+ def LSLI16 : I16_XZ_IMM5<0, "lsli16", shl>;
+ def LSRI16 : I16_XZ_IMM5<1, "lsri16", srl>;
+ def ASRI16 : I16_XZ_IMM5<2, "asri16", sra>;
+}
+
+let isAdd = 1 in
+def ADDI16XZ : I16_XZ_IMM3<0b10, "addi16", add>;
+def SUBI16XZ : I16_XZ_IMM3<0b11, "subi16", sub>;
+
+let Size = 4 in
+def NEG16 : CSKYPseudo<(outs mGPR:$rd), (ins mGPR:$rx), "neg16 $rd, $rx", []>;
+
+let Size = 4 in
+def RSUBI16 : CSKYPseudo<(outs mGPR:$rd),
+ (ins mGPR:$rx, uimm8:$imm8), "rsubi16 $rd, $rx, $imm8", []>;
+
+//===----------------------------------------------------------------------===//
+// Load & Store instructions.
+//===----------------------------------------------------------------------===//
+
+def LD16B : I16_XZ_LDST<AddrMode16B, 0b000, "ld16.b",
+ (outs mGPR:$rz), (ins mGPR:$rx, uimm5:$imm)>;
+def LD16H : I16_XZ_LDST<AddrMode16H, 0b001, "ld16.h",
+ (outs mGPR:$rz), (ins mGPR:$rx, uimm5_1:$imm)>;
+def LD16W : I16_XZ_LDST<AddrMode16W, 0b010, "ld16.w",
+ (outs mGPR:$rz), (ins mGPR:$rx, uimm5_2:$imm)>;
+def ST16B : I16_XZ_LDST<AddrMode16B, 0b100, "st16.b",
+ (outs), (ins mGPR:$rz, mGPR:$rx, uimm5:$imm)>;
+def ST16H : I16_XZ_LDST<AddrMode16H, 0b101, "st16.h",
+ (outs), (ins mGPR:$rz, mGPR:$rx, uimm5_1:$imm)>;
+def ST16W : I16_XZ_LDST<AddrMode16W, 0b110, "st16.w",
+ (outs), (ins mGPR:$rz, mGPR:$rx, uimm5_2:$imm)>;
+
+def LD16WSP : I16_ZSP_LDST<AddrMode16W, 0b011, "ld16.w",
+ (outs mGPR:$rz), (ins SPOp:$sp, uimm8_2:$addr)>;
+def ST16WSP : I16_ZSP_LDST<AddrMode16W, 0b111, "st16.w",
+ (outs), (ins mGPR:$rz, SPOp:$sp, uimm8_2:$addr)>;
+
+//===----------------------------------------------------------------------===//
+// Compare instructions.
+//===----------------------------------------------------------------------===//
+
+def CMPHS16 : R16_XY_CMP<0, "cmphs16">;
+def CMPLT16 : R16_XY_CMP<1, "cmplt16">;
+let isCommutable = 1 in
+def CMPNE16 : R16_XY_CMP<2, "cmpne16">;
+
+
+def CMPHSI16 : I16_X_CMP<0, "cmphsi16", oimm5>;
+def CMPLTI16 : I16_X_CMP<1, "cmplti16", oimm5>;
+def CMPLEI16 : CSKYPseudo<(outs CARRY:$ca), (ins mGPR:$rx, uimm5:$imm5),
+ "cmplei16\t$rx, $imm5", []>;
+def CMPNEI16 : I16_X_CMP<2, "cmpnei16", uimm5>;
+
+//===----------------------------------------------------------------------===//
+// Data move instructions.
+//===----------------------------------------------------------------------===//
+
+
+def MOVI16 : I16_Z_8<0b110, (ins uimm8:$imm8), "movi16\t$rz, $imm8"> {
+ let isReMaterializable = 1;
+ let isAsCheapAsAMove = 1;
+ let isMoveImm = 1;
+ let Pattern = [(set mGPR:$rz, uimm8:$imm8)];
+}
+
+def MOV16 : CSKY16Inst<AddrModeNone, (outs sGPR:$rz), (ins sGPR:$rx),
+ "mov16\t$rz, $rx", []> {
+ bits<4> rz;
+ bits<4> rx;
+ let Inst{15,14} = 0b01;
+ let Inst{13 - 10} = 0b1011;
+ let Inst{9 - 6} = rz;
+ let Inst{5 - 2} = rx;
+ let Inst{1,0} = 0b11;
+}
+
+// MVC16 is not in the "cskyv2 instructions reference manual".
+def MVCV16 : CSKY16Inst<AddrModeNone,
+ (outs sGPR:$rz), (ins CARRY:$ca), "mvcv16\t$rz", []> {
+ bits<4> rz;
+ let Inst{15,14} = 0b01;
+ let Inst{13 - 10} = 0b1001;
+ let Inst{9 - 6} = rz;
+ let Inst{5 - 2} = 0;
+ let Inst{1,0} = 0b11;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Branch and call instructions.
+//===----------------------------------------------------------------------===//
+
+let isBranch = 1, isTerminator = 1 in {
+ let isBarrier = 1, isPredicable = 1 in
+ def BR16 : J16<1, "br16", (ins br_symbol_16bit:$offset)>;
+
+ def BT16 : J16_B<2, "bt16">;
+ def BF16 : J16_B<3, "bf16">;
+}
+
+def JMP16 : R16_X_J<0b11100000, 0b00, "jmp16"> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isIndirectBranch = 1;
+ let Pattern = [(brind sGPR:$rx)];
+}
+
+def JSR16 : R16_X_J<0b11101111, 0b01, "jsr16"> {
+ let isCall = 1;
+ let Defs = [ R15 ];
+}
+
+def RTS16 : CSKY16Inst<AddrModeNone, (outs), (ins), "rts16", [(CSKY_RET)]> {
+ let isTerminator = 1;
+ let isReturn = 1;
+ let isBarrier = 1;
+ let Inst = 0b0111100000111100;
+ let Uses = [R15];
+ let isCodeGenOnly = 1;
+}
+
+def JMPIX16 : CSKY16Inst<AddrModeNone, (outs),
+ (ins mGPR:$rx, uimm2_jmpix:$indeximm2), "jmpix16\t$rx, $indeximm2", []> {
+ bits<3> rx;
+ bits<2> indeximm2;
+ let Inst{15,14} = 0b00;
+ let Inst{13 - 11} = 0b111;
+ let Inst{10 - 8} = rx;
+ let Inst{7 - 2} = 0b111000;
+ let Inst{1,0} = indeximm2;
+ let Predicates = [HasJAVA];
+ let Uses = [R30];
+}
+
+//===----------------------------------------------------------------------===//
+// Symbol address instructions.
+//===----------------------------------------------------------------------===//
+
+def LRW16 : CSKY16Inst<AddrModeNone, (outs mGPR:$rz),
+ (ins constpool_symbol_16bit:$label), "lrw16\t$rz, $label", []> {
+ bits<3> rz;
+ bits<8> label;
+ let Inst{15 - 13} = 0b000;
+ let Inst{12} = label{7};
+ let Inst{11,10} = 0b00;
+ let Inst{9,8} = label{6,5};
+ let Inst{7 - 5} = rz;
+ let Inst{4 - 0} = label{4-0};
+ let mayLoad = 1;
+ let mayStore = 0;
+}
+
+def LRW16_Gen : CSKY16Inst<AddrModeNone, (outs mGPR:$rz),
+ (ins bare_symbol:$src, constpool_symbol_16bit:$label),
+ "lrw16\t$rz, $label", []> {
+ bits<3> rz;
+ bits<8> label;
+ let Inst{15 - 13} = 0b000;
+ let Inst{12} = label{7};
+ let Inst{11,10} = 0b00;
+ let Inst{9,8} = label{6,5};
+ let Inst{7 - 5} = rz;
+ let Inst{4 - 0} = label{4-0};
+ let mayLoad = 1;
+ let mayStore = 0;
+ let isCodeGenOnly = 1;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Other operation instructions.
+//===----------------------------------------------------------------------===//
+
+def REVB16 : R16_XZ_UNOP<0b1110, 0b10, "revb16">;
+def REVH16 : R16_XZ_UNOP<0b1110, 0b11, "revh16">;
+
+let isCodeGenOnly = 1 in
+def SETC16 : CSKY16Inst<AddrModeNone,
+ (outs CARRY:$ca), (ins), "setc16", []> {
+ let Inst{15, 14} = 0b01;
+ let Inst{13 - 10} = 0b1001;
+ let Inst{9 - 6} = 0;
+ let Inst{5 - 2} = 0;
+ let Inst{1, 0} = 0;
+ let isCompare = 1;
+}
+
+let isCodeGenOnly = 1 in
+def CLRC16 : CSKY16Inst<AddrModeNone,
+ (outs CARRY:$ca), (ins), "clrc16", []> {
+ let Inst{15, 14} = 0b01;
+ let Inst{13 - 10} = 0b1001;
+ let Inst{9 - 6} = 0;
+ let Inst{5 - 2} = 0;
+ let Inst{1, 0} = 2;
+ let isCompare = 1;
+}
+
+let Constraints = "$rZ = $rz" in {
+ def BCLRI16 : I16_Z_5<0b100, (outs mGPR:$rz), (ins mGPR:$rZ, uimm5:$imm5),
+ "bclri16">;
+ def BSETI16 : I16_Z_5<0b101, (outs mGPR:$rz), (ins mGPR:$rZ, uimm5:$imm5),
+ "bseti16">;
+}
+
+let Predicates = [HasBTST16] in
+ def BTSTI16 : I16_Z_5<0b110, (outs CARRY:$ca), (ins mGPR:$rz, uimm5:$imm5),
+ "btsti16">;
+
+def TST16 : CSKY16Inst<AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx, sGPR:$ry),
+ "tst16\t$rx, $ry", []> {
+ bits<4> ry;
+ bits<4> rx;
+ let Inst{15,14} = 0b01;
+ let Inst{13 - 10} = 0b1010;
+ let Inst{9 - 6} = ry;
+ let Inst{5 - 2} = rx;
+ let Inst{1,0} = 0b10;
+ let isCompare = 1;
+}
+
+def TSTNBZ16 : CSKY16Inst<AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx),
+ "tstnbz16\t$rx", []> {
+ bits<4> rx;
+ let Inst{15,14} = 0b01;
+ let Inst{13 - 10} = 0b1010;
+ let Inst{9 - 6} = 0b0000;
+ let Inst{5 - 2} = rx;
+ let Inst{1,0} = 0b11;
+ let isCompare = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Special instructions.
+//===----------------------------------------------------------------------===//
+
+def BKPT : CSKY16Inst<AddrModeNone, (outs), (ins), "bkpt", []> {
+ let Inst = 0;
+}
+
+let mayStore = 1 in {
+def BPUSHH : I16_BPushPop<0b00010100111, 0, (outs), (ins mGPR:$rz), "bpush.h $rz">;
+def BPUSHW : I16_BPushPop<0b00010100111, 0b10, (outs), (ins mGPR:$rz), "bpush.w $rz">;
+}
+
+let mayLoad = 1 in {
+def BPOPH : I16_BPushPop<0b00010100101, 0, (outs mGPR:$rz), (ins), "bpop.h $rz">;
+def BPOPW : I16_BPushPop<0b00010100101, 0b10, (outs mGPR:$rz), (ins), "bpop.w $rz">;
+}
+
+def NIE : CSKY16Inst<AddrModeNone, (outs), (ins), "nie", [(CSKY_NIE)]> {
+ let Inst = 0b0001010001100000;
+}
+
+let isBarrier = 1, isReturn = 1, isTerminator = 1 in
+def NIR : CSKY16Inst<AddrModeNone, (outs), (ins), "nir", [(CSKY_NIR)]> {
+ let Inst = 0b0001010001100001;
+}
+
+def IPUSH16 : CSKY16Inst<AddrModeNone, (outs), (ins), "ipush16", []> {
+ let Inst{15- 5} = 0b00010100011;
+ let Inst{4-0} = 0b00010;
+ let Predicates = [iHasE1];
+ let Defs = [R14];
+ let Uses = [R14, R0, R1, R2, R3, R12, R13];
+ let mayStore = 1;
+}
+
+def IPOP16 : CSKY16Inst<AddrModeNone, (outs), (ins), "ipop16", []> {
+ let Inst{15- 5} = 0b00010100011;
+ let Inst{4-0} = 0b00011;
+ let Predicates = [iHasE1];
+ let Defs = [R14, R0, R1, R2, R3, R12, R13];
+ let Uses = [R14];
+ let mayLoad = 1;
+}
+
+def PUSH16 : CSKY16Inst<AddrModeNone, (outs),
+ (ins reglist:$regs, variable_ops), "push16 $regs", []> {
+ bits<5> regs;
+
+ let Inst{15- 5} = 0b00010100110;
+ let Inst{4-0} = regs;
+ let Predicates = [iHasE1];
+ let Defs = [R14];
+ let Uses = [R14];
+ let mayStore = 1;
+}
+
+def POP16 : CSKY16Inst<AddrModeNone, (outs),
+ (ins reglist:$regs, variable_ops), "pop16 $regs", []> {
+ bits<5> regs;
+
+ let Inst{15- 5} = 0b00010100100;
+ let Inst{4-0} = regs;
+ let Predicates = [iHasE1];
+ let Defs = [R14];
+ let Uses = [R14];
+ let mayLoad = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// CSKYPseudo
+//===----------------------------------------------------------------------===//
+
+let usesCustomInserter = 1 in {
+ def ISEL16 : CSKYPseudo<(outs sGPR:$dst),
+ (ins CARRY:$cond, sGPR:$src1, sGPR:$src2),
+ "!isel16\t$dst, $src1, src2",
+ [(set sGPR:$dst, (select CARRY:$cond, sGPR:$src1, sGPR:$src2))]>;
+}
+
+class JBranchPseudo<dag out, dag ins, string opstr> :
+ CSKYPseudo<out, ins, opstr, []> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let isIndirectBranch = 1;
+ let mayLoad = 1;
+ let Size = 2;
+}
+
+let isBarrier = 1 in
+def JBR16 : JBranchPseudo<(outs),
+ (ins br_symbol_16bit:$src1), "jbr16\t$src1">;
+def JBT16 : JBranchPseudo<(outs),
+ (ins CARRY:$ca, br_symbol_16bit:$src1), "jbt16\t$src1">;
+def JBF16 : JBranchPseudo<(outs),
+ (ins CARRY:$ca, br_symbol_16bit:$src1), "jbf16\t$src1">;
+
+let mayLoad = 1, Size = 2, isCodeGenOnly = 0 in
+def PseudoLRW16 : CSKYPseudo<(outs mGPR:$rz),
+ (ins bare_symbol:$src), "lrw16 $rz, $src", []>;
diff --git a/llvm/lib/Target/CSKY/CSKYMCInstLower.cpp b/llvm/lib/Target/CSKY/CSKYMCInstLower.cpp
new file mode 100644
index 000000000000..c42a56bfb04e
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYMCInstLower.cpp
@@ -0,0 +1,117 @@
+//===-- CSKYMCInstLower.cpp - Convert CSKY MachineInstr to an MCInst --------=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower CSKY MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSKYMCInstLower.h"
+#include "MCTargetDesc/CSKYBaseInfo.h"
+#include "MCTargetDesc/CSKYMCExpr.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCExpr.h"
+
+#define DEBUG_TYPE "csky-mcinst-lower"
+
+using namespace llvm;
+
+CSKYMCInstLower::CSKYMCInstLower(MCContext &Ctx, AsmPrinter &Printer)
+ : Ctx(Ctx), Printer(Printer) {}
+
+void CSKYMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+ OutMI.setOpcode(MI->getOpcode());
+
+ for (const MachineOperand &MO : MI->operands()) {
+ MCOperand MCOp;
+ if (lowerOperand(MO, MCOp))
+ OutMI.addOperand(MCOp);
+ }
+}
+
+MCOperand CSKYMCInstLower::lowerSymbolOperand(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ CSKYMCExpr::VariantKind Kind;
+ MCContext &Ctx = Printer.OutContext;
+
+ switch (MO.getTargetFlags()) {
+ default:
+ llvm_unreachable("Unknown target flag.");
+ case CSKYII::MO_None:
+ Kind = CSKYMCExpr::VK_CSKY_None;
+ break;
+ case CSKYII::MO_GOT32:
+ Kind = CSKYMCExpr::VK_CSKY_GOT;
+ break;
+ case CSKYII::MO_GOTOFF:
+ Kind = CSKYMCExpr::VK_CSKY_GOTOFF;
+ break;
+ case CSKYII::MO_ADDR32:
+ Kind = CSKYMCExpr::VK_CSKY_ADDR;
+ break;
+ case CSKYII::MO_PLT32:
+ Kind = CSKYMCExpr::VK_CSKY_PLT;
+ break;
+ case CSKYII::MO_ADDR_HI16:
+ Kind = CSKYMCExpr::VK_CSKY_ADDR_HI16;
+ break;
+ case CSKYII::MO_ADDR_LO16:
+ Kind = CSKYMCExpr::VK_CSKY_ADDR_LO16;
+ break;
+ }
+ const MCExpr *ME =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx);
+
+ if (Kind != CSKYMCExpr::VK_CSKY_None)
+ ME = CSKYMCExpr::create(ME, Kind, Ctx);
+
+ return MCOperand::createExpr(ME);
+}
+
+bool CSKYMCInstLower::lowerOperand(const MachineOperand &MO,
+ MCOperand &MCOp) const {
+ switch (MO.getType()) {
+ default:
+ llvm_unreachable("unknown operand type");
+ case MachineOperand::MO_RegisterMask:
+ break;
+ case MachineOperand::MO_Immediate:
+ MCOp = MCOperand::createImm(MO.getImm());
+ break;
+ case MachineOperand::MO_Register:
+ if (MO.isImplicit())
+ return false;
+ MCOp = MCOperand::createReg(MO.getReg());
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ MCOp = MCOperand::createExpr(
+ MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ MCOp = lowerSymbolOperand(MO, Printer.getSymbol(MO.getGlobal()));
+ break;
+ case MachineOperand::MO_BlockAddress:
+ MCOp = lowerSymbolOperand(
+ MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress()));
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ MCOp = lowerSymbolOperand(
+ MO, Printer.GetExternalSymbolSymbol(MO.getSymbolName()));
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ MCOp = lowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex()));
+ break;
+ case MachineOperand::MO_JumpTableIndex:
+ MCOp = lowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex()));
+ break;
+ case MachineOperand::MO_MCSymbol:
+ MCOp = lowerSymbolOperand(MO, MO.getMCSymbol());
+ break;
+ }
+ return true;
+}
\ No newline at end of file
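This lowering helper is driven by the target AsmPrinter, which is not included in this hunk. A minimal sketch of the expected call pattern (helper name and per-call construction are illustrative only):

// Illustrative caller; a real CSKYAsmPrinter would keep a single lowering
// object around instead of rebuilding it for every instruction.
#include "CSKYMCInstLower.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/MC/MCInst.h"

static void emitLoweredInstruction(llvm::AsmPrinter &Printer,
                                   const llvm::MachineInstr *MI) {
  llvm::CSKYMCInstLower Lowering(Printer.OutContext, Printer);
  llvm::MCInst TmpInst;
  Lowering.Lower(MI, TmpInst);
  Printer.EmitToStreamer(*Printer.OutStreamer, TmpInst);
}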
diff --git a/llvm/lib/Target/CSKY/CSKYMCInstLower.h b/llvm/lib/Target/CSKY/CSKYMCInstLower.h
new file mode 100644
index 000000000000..ea76bd129d30
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYMCInstLower.h
@@ -0,0 +1,35 @@
+//===-- CSKYMCInstLower.h - Convert CSKY MachineInstr to an MCInst ----------=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_CSKYMCINSTLOWER_H
+#define LLVM_LIB_TARGET_CSKY_CSKYMCINSTLOWER_H
+
+namespace llvm {
+class AsmPrinter;
+class MCContext;
+class MachineInstr;
+class MCInst;
+class MachineOperand;
+class MCOperand;
+class MCSymbol;
+
+class CSKYMCInstLower {
+ MCContext &Ctx;
+ AsmPrinter &Printer;
+
+public:
+ CSKYMCInstLower(MCContext &Ctx, AsmPrinter &Printer);
+
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
+ MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_CSKY_CSKYMCINSTLOWER_H
diff --git a/llvm/lib/Target/CSKY/CSKYMachineFunctionInfo.h b/llvm/lib/Target/CSKY/CSKYMachineFunctionInfo.h
new file mode 100644
index 000000000000..b6e303f8ccfb
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYMachineFunctionInfo.h
@@ -0,0 +1,62 @@
+//=- CSKYMachineFunctionInfo.h - CSKY machine function info -------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares CSKY-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_CSKYMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_CSKY_CSKYMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+class CSKYMachineFunctionInfo : public MachineFunctionInfo {
+ MachineFunction &MF;
+
+ Register GlobalBaseReg = 0;
+ bool SpillsCR = false;
+
+ int VarArgsFrameIndex = 0;
+ unsigned VarArgsSaveSize = 0;
+
+ int spillAreaSize = 0;
+
+ bool LRSpilled = false;
+
+ unsigned PICLabelUId = 0;
+
+public:
+ CSKYMachineFunctionInfo(MachineFunction &MF) : MF(MF) {}
+
+ Register getGlobalBaseReg() const { return GlobalBaseReg; }
+ void setGlobalBaseReg(Register Reg) { GlobalBaseReg = Reg; }
+
+ void setSpillsCR() { SpillsCR = true; }
+ bool isCRSpilled() const { return SpillsCR; }
+
+ void setVarArgsFrameIndex(int v) { VarArgsFrameIndex = v; }
+ int getVarArgsFrameIndex() { return VarArgsFrameIndex; }
+
+ unsigned getVarArgsSaveSize() const { return VarArgsSaveSize; }
+ void setVarArgsSaveSize(int Size) { VarArgsSaveSize = Size; }
+
+ bool isLRSpilled() const { return LRSpilled; }
+ void setLRIsSpilled(bool s) { LRSpilled = s; }
+
+ void setCalleeSaveAreaSize(int v) { spillAreaSize = v; }
+ int getCalleeSaveAreaSize() const { return spillAreaSize; }
+
+ unsigned createPICLabelUId() { return ++PICLabelUId; }
+ void initPICLabelUId(unsigned UId) { PICLabelUId = UId; }
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_CSKY_CSKYMACHINEFUNCTIONINFO_H
diff --git a/llvm/lib/Target/CSKY/CSKYRegisterInfo.cpp b/llvm/lib/Target/CSKY/CSKYRegisterInfo.cpp
new file mode 100644
index 000000000000..a1d45fea534b
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYRegisterInfo.cpp
@@ -0,0 +1,95 @@
+//===-- CSKYRegisterInfo.cpp - CSKY Register Information Impl -*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the CSKY implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSKYRegisterInfo.h"
+#include "CSKY.h"
+#include "CSKYSubtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/MC/MCContext.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "CSKYGenRegisterInfo.inc"
+
+using namespace llvm;
+
+CSKYRegisterInfo::CSKYRegisterInfo()
+ : CSKYGenRegisterInfo(CSKY::R15, 0, 0, 0) {}
+
+const uint32_t *
+CSKYRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID Id) const {
+ const CSKYSubtarget &STI = MF.getSubtarget<CSKYSubtarget>();
+ return CSR_I32_RegMask;
+}
+
+Register CSKYRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ const TargetFrameLowering *TFI = getFrameLowering(MF);
+ return TFI->hasFP(MF) ? CSKY::R8 : CSKY::R14;
+}
+
+BitVector CSKYRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ const CSKYFrameLowering *TFI = getFrameLowering(MF);
+ const CSKYSubtarget &STI = MF.getSubtarget<CSKYSubtarget>();
+ BitVector Reserved(getNumRegs());
+
+ // Reserve the base register if we need to allocate
+ // variable-sized objects at runtime.
+ if (TFI->hasBP(MF))
+ markSuperRegs(Reserved, CSKY::R7); // bp
+
+ if (TFI->hasFP(MF))
+ markSuperRegs(Reserved, CSKY::R8); // fp
+
+ if (!STI.hasE2()) {
+ for (unsigned i = 0; i < 6; i++)
+ markSuperRegs(Reserved, CSKY::R8 + i); // R8 - R13
+ }
+
+ markSuperRegs(Reserved, CSKY::R14); // sp
+ markSuperRegs(Reserved, CSKY::R15); // lr
+
+ if (!STI.hasHighRegisters()) {
+ for (unsigned i = 0; i < 10; i++)
+ markSuperRegs(Reserved, CSKY::R16 + i); // R16 - R25
+ }
+
+ markSuperRegs(Reserved, CSKY::R26);
+ markSuperRegs(Reserved, CSKY::R27);
+ markSuperRegs(Reserved, CSKY::R28); // gp
+ markSuperRegs(Reserved, CSKY::R29);
+ markSuperRegs(Reserved, CSKY::R30);
+ markSuperRegs(Reserved, CSKY::R31); // tp
+
+ assert(checkAllSuperRegsMarked(Reserved));
+ return Reserved;
+}
+
+const uint32_t *CSKYRegisterInfo::getNoPreservedMask() const {
+ return CSR_NoRegs_RegMask;
+}
+
+const MCPhysReg *
+CSKYRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ const CSKYSubtarget &STI = MF->getSubtarget<CSKYSubtarget>();
+ if (MF->getFunction().hasFnAttribute("interrupt")) {
+ return CSR_GPR_ISR_SaveList;
+ }
+
+ return CSR_I32_SaveList;
+}
+
+void CSKYRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ assert(SPAdj == 0 && "Unexpected non-zero SPAdj value");
+}
\ No newline at end of file
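eliminateFrameIndex is left as an asserting stub at this stage. For context, the shape it usually grows into, replacing the abstract frame index with a frame register plus a concrete byte offset, is sketched below; the offset policy and the operand layout are assumptions, not part of this patch.

// Sketch of a typical frame-index rewrite; assumes the frame-index operand is
// immediately followed by an immediate offset operand.
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"

static void resolveFrameIndex(llvm::MachineInstr &MI, unsigned FIOperandNum,
                              llvm::Register FrameReg) {
  llvm::MachineFunction &MF = *MI.getParent()->getParent();
  const llvm::MachineFrameInfo &MFI = MF.getFrameInfo();

  int FI = MI.getOperand(FIOperandNum).getIndex();
  int64_t Offset =
      MFI.getObjectOffset(FI) + MI.getOperand(FIOperandNum + 1).getImm();

  MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, /*isDef=*/false);
  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
}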
diff --git a/llvm/lib/Target/CSKY/CSKYRegisterInfo.h b/llvm/lib/Target/CSKY/CSKYRegisterInfo.h
new file mode 100644
index 000000000000..779ea6493c7e
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYRegisterInfo.h
@@ -0,0 +1,45 @@
+//===-- CSKYRegisterInfo.h - CSKY Register Information Impl ---*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the CSKY implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_CSKYREGISTERINFO_H
+#define LLVM_LIB_TARGET_CSKY_CSKYREGISTERINFO_H
+
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "CSKYGenRegisterInfo.inc"
+
+namespace llvm {
+class CSKYInstrInfo;
+
+class CSKYRegisterInfo : public CSKYGenRegisterInfo {
+public:
+ CSKYRegisterInfo();
+
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID id) const override;
+ const uint32_t *getNoPreservedMask() const override;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ Register getFrameRegister(const MachineFunction &MF) const override;
+
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS) const override;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_CSKY_CSKYREGISTERINFO_H
diff --git a/llvm/lib/Target/CSKY/CSKYRegisterInfo.td b/llvm/lib/Target/CSKY/CSKYRegisterInfo.td
index aef4589a67f2..7548c22bb2c5 100644
--- a/llvm/lib/Target/CSKY/CSKYRegisterInfo.td
+++ b/llvm/lib/Target/CSKY/CSKYRegisterInfo.td
@@ -153,6 +153,21 @@ def GPR : RegisterClass<"CSKY", [i32], 32,
let Size = 32;
}
+// Register class for R0 - R15.
+// Some 16-bit integer instructions can only access R0 - R15.
+def sGPR : RegisterClass<"CSKY", [i32], 32,
+ (add (sequence "R%u", 0, 3), (sequence "R%u", 12, 13), R15,
+ (sequence "R%u", 4, 11), R14)> {
+ let Size = 32;
+}
+
+// Register class for R0 - R7.
+// Some 16-bit integer instructions can only access R0 - R7.
+def mGPR : RegisterClass<"CSKY", [i32], 32,
+ (add (sequence "R%u", 0, 7))> {
+ let Size = 32;
+}
+
def GPRPair : RegisterClass<"CSKY", [untyped], 32, (add GPRTuple)> {
let Size = 64;
}
diff --git a/llvm/lib/Target/CSKY/CSKYSubtarget.cpp b/llvm/lib/Target/CSKY/CSKYSubtarget.cpp
new file mode 100644
index 000000000000..963c2ede9c44
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYSubtarget.cpp
@@ -0,0 +1,74 @@
+//===-- CSKYSubtarget.cpp - CSKY Subtarget Information -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CSKY specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSKYSubtarget.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "csky-subtarget"
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "CSKYGenSubtargetInfo.inc"
+
+void CSKYSubtarget::anchor() {}
+
+CSKYSubtarget &CSKYSubtarget::initializeSubtargetDependencies(
+ const Triple &TT, StringRef CPUName, StringRef TuneCPUName, StringRef FS) {
+
+ if (CPUName.empty())
+ CPUName = "generic";
+ if (TuneCPUName.empty())
+ TuneCPUName = CPUName;
+
+ UseHardFloat = false;
+ UseHardFloatABI = false;
+ HasFPUv2SingleFloat = false;
+ HasFPUv2DoubleFloat = false;
+ HasFPUv3SingleFloat = false;
+ HasFPUv3DoubleFloat = false;
+
+ HasBTST16 = false;
+ HasJAVA = false;
+ HasExtendLrw = false;
+ HasDoloop = false;
+ HasHighRegisters = false;
+
+ HasE1 = false;
+ HasE2 = false;
+ Has2E3 = false;
+ HasMP = false;
+ Has3E3r1 = false;
+ Has3r1E3r2 = false;
+ Has3r2E3r3 = false;
+ Has3E7 = false;
+ HasMP1E2 = false;
+ Has7E10 = false;
+ Has10E60 = false;
+
+ ParseSubtargetFeatures(CPUName, TuneCPUName, FS);
+ return *this;
+}
+
+CSKYSubtarget::CSKYSubtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU,
+ StringRef FS, const TargetMachine &TM)
+ : CSKYGenSubtargetInfo(TT, CPU, TuneCPU, FS),
+ FrameLowering(initializeSubtargetDependencies(TT, CPU, TuneCPU, FS)),
+ InstrInfo(*this), RegInfo(), TLInfo(TM, *this) {}
+
+bool CSKYSubtarget::useHardFloatABI() const {
+ auto FloatABI = getTargetLowering()->getTargetMachine().Options.FloatABIType;
+
+ if (FloatABI == FloatABI::Default)
+ return UseHardFloatABI;
+ else
+ return FloatABI == FloatABI::Hard;
+}
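useHardFloatABI above resolves the effective float ABI: an explicit -float-abi=hard/soft option overrides the feature bit, and only the Default setting falls back to UseHardFloatABI from the feature string. A minimal sketch of that resolution, with a hypothetical FloatABIKind standing in for llvm::FloatABI::ABIType:

#include <cassert>

// Illustrative sketch of the ABI resolution in CSKYSubtarget::useHardFloatABI.
enum class FloatABIKind { Default, Soft, Hard };

static bool useHardFloatABI(FloatABIKind Option, bool SubtargetHardFloatABI) {
  // Only an unset (Default) option defers to the feature string.
  if (Option == FloatABIKind::Default)
    return SubtargetHardFloatABI;
  return Option == FloatABIKind::Hard;
}

int main() {
  assert(useHardFloatABI(FloatABIKind::Default, true));
  assert(!useHardFloatABI(FloatABIKind::Soft, true));
  assert(useHardFloatABI(FloatABIKind::Hard, false));
}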
diff --git a/llvm/lib/Target/CSKY/CSKYSubtarget.h b/llvm/lib/Target/CSKY/CSKYSubtarget.h
new file mode 100644
index 000000000000..4cd590e8e76e
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYSubtarget.h
@@ -0,0 +1,120 @@
+//===-- CSKYSubtarget.h - Define Subtarget for the CSKY----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the CSKY specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_CSKYSUBTARGET_H
+#define LLVM_LIB_TARGET_CSKY_CSKYSUBTARGET_H
+
+#include "CSKYFrameLowering.h"
+#include "CSKYISelLowering.h"
+#include "CSKYInstrInfo.h"
+#include "CSKYRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "CSKYGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class CSKYSubtarget : public CSKYGenSubtargetInfo {
+ virtual void anchor();
+
+ CSKYFrameLowering FrameLowering;
+ CSKYInstrInfo InstrInfo;
+ CSKYRegisterInfo RegInfo;
+ CSKYTargetLowering TLInfo;
+ SelectionDAGTargetInfo TSInfo;
+
+ bool UseHardFloat;
+ bool UseHardFloatABI;
+ bool HasFPUv2SingleFloat;
+ bool HasFPUv2DoubleFloat;
+ bool HasFPUv3SingleFloat;
+ bool HasFPUv3DoubleFloat;
+
+ bool HasBTST16;
+ bool HasJAVA;
+ bool HasExtendLrw;
+ bool HasDoloop;
+ bool HasHighRegisters;
+
+ bool HasE1;
+ bool HasE2;
+ bool Has2E3;
+ bool HasMP;
+ bool Has3E3r1;
+ bool Has3r1E3r2;
+ bool Has3r2E3r3;
+ bool Has3E7;
+ bool HasMP1E2;
+ bool Has7E10;
+ bool Has10E60;
+
+public:
+ CSKYSubtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU,
+ StringRef FS, const TargetMachine &TM);
+
+ const CSKYFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+ const CSKYInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const CSKYRegisterInfo *getRegisterInfo() const override { return &RegInfo; }
+ const CSKYTargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
+ /// Initializes using the passed in CPU and feature strings so that we can
+ /// use initializer lists for subtarget initialization.
+ CSKYSubtarget &initializeSubtargetDependencies(const Triple &TT,
+ StringRef CPU,
+ StringRef TuneCPU,
+ StringRef FS);
+
+ // Generated by inc file
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
+
+ bool useHardFloatABI() const;
+ bool useHardFloat() const { return UseHardFloat; }
+ bool hasFPUv2SingleFloat() const { return HasFPUv2SingleFloat; }
+ bool hasFPUv2DoubleFloat() const { return HasFPUv2DoubleFloat; }
+ bool hasFPUv2() const { return HasFPUv2SingleFloat || HasFPUv2DoubleFloat; }
+ bool hasFPUv3SingleFloat() const { return HasFPUv3SingleFloat; }
+ bool hasFPUv3DoubleFloat() const { return HasFPUv3DoubleFloat; }
+ bool hasFPUv3() const { return HasFPUv3SingleFloat || HasFPUv3DoubleFloat; }
+ bool hasAnyFloatExt() const { return hasFPUv2() || hasFPUv3(); };
+
+ bool hasBTST16() const { return HasBTST16; }
+ bool hasJAVA() const { return HasJAVA; }
+ bool hasExtendLrw() const { return HasExtendLrw; }
+ bool hasDoloop() const { return HasDoloop; }
+ bool hasHighRegisters() const { return HasHighRegisters; }
+
+ bool hasE1() const { return HasE1; }
+ bool hasE2() const { return HasE2; }
+ bool has2E3() const { return Has2E3; }
+ bool has3r1E3r2() const { return Has3r1E3r2; }
+ bool has3r2E3r3() const { return Has3r2E3r3; }
+ bool has3E3r1() const { return Has3E3r1; }
+ bool has3E7() const { return Has3E7; }
+ bool hasMP() const { return HasMP; }
+ bool hasMP1E2() const { return HasMP1E2; }
+ bool has7E10() const { return Has7E10; }
+ bool has10E60() const { return Has10E60; }
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_CSKY_CSKYSUBTARGET_H
diff --git a/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp b/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp
index 1c13796e84b6..8f61feb6506d 100644
--- a/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp
+++ b/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp
@@ -11,10 +11,13 @@
//===----------------------------------------------------------------------===//
#include "CSKYTargetMachine.h"
+#include "CSKY.h"
+#include "CSKYSubtarget.h"
#include "TargetInfo/CSKYTargetInfo.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
@@ -50,6 +53,34 @@ CSKYTargetMachine::CSKYTargetMachine(const Target &T, const Triple &TT,
initAsmInfo();
}
+const CSKYSubtarget *
+CSKYTargetMachine::getSubtargetImpl(const Function &F) const {
+ Attribute CPUAttr = F.getFnAttribute("target-cpu");
+ Attribute TuneAttr = F.getFnAttribute("tune-cpu");
+ Attribute FSAttr = F.getFnAttribute("target-features");
+
+ std::string CPU =
+ CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
+ std::string TuneCPU =
+ TuneAttr.isValid() ? TuneAttr.getValueAsString().str() : CPU;
+ std::string FS =
+ FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS;
+
+ std::string Key = CPU + TuneCPU + FS;
+ auto &I = SubtargetMap[Key];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = std::make_unique<CSKYSubtarget>(TargetTriple, CPU, TuneCPU, FS, *this);
+ if (I->useHardFloat() && !I->hasAnyFloatExt())
+      errs() << "Hard-float can't be used with the current CPU,"
+                " falling back to soft-float\n";
+ }
+ return I.get();
+}
+
namespace {
class CSKYPassConfig : public TargetPassConfig {
public:
@@ -59,6 +90,8 @@ public:
CSKYTargetMachine &getCSKYTargetMachine() const {
return getTM<CSKYTargetMachine>();
}
+
+ bool addInstSelector() override;
};
} // namespace
@@ -66,3 +99,9 @@ public:
TargetPassConfig *CSKYTargetMachine::createPassConfig(PassManagerBase &PM) {
return new CSKYPassConfig(*this, PM);
}
+
+bool CSKYPassConfig::addInstSelector() {
+ addPass(createCSKYISelDag(getCSKYTargetMachine()));
+
+ return false;
+}
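getSubtargetImpl above creates at most one CSKYSubtarget per distinct target-cpu/tune-cpu/target-features combination and caches it in a StringMap keyed by the concatenated strings, so functions with identical attributes share one subtarget object. A rough standalone equivalent of that caching scheme, using std::map and a hypothetical Subtarget struct instead of the LLVM types:

#include <cassert>
#include <map>
#include <memory>
#include <string>

// Illustrative stand-in for the real CSKYSubtarget.
struct Subtarget {
  std::string CPU, TuneCPU, FS;
};

class SubtargetCache {
  std::map<std::string, std::unique_ptr<Subtarget>> Map;

public:
  const Subtarget *get(const std::string &CPU, const std::string &TuneCPU,
                       const std::string &FS) {
    // Same key scheme as the patch: plain concatenation of the three strings.
    auto &Slot = Map[CPU + TuneCPU + FS];
    if (!Slot)
      Slot = std::make_unique<Subtarget>(Subtarget{CPU, TuneCPU, FS});
    return Slot.get();
  }
};

int main() {
  SubtargetCache Cache;
  const Subtarget *A = Cache.get("ck810", "ck810", "+fpuv2_sf");
  const Subtarget *B = Cache.get("ck810", "ck810", "+fpuv2_sf");
  assert(A == B && "identical attribute sets share one subtarget");
}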
diff --git a/llvm/lib/Target/CSKY/CSKYTargetMachine.h b/llvm/lib/Target/CSKY/CSKYTargetMachine.h
index d50e3877b550..ecb9fe953077 100644
--- a/llvm/lib/Target/CSKY/CSKYTargetMachine.h
+++ b/llvm/lib/Target/CSKY/CSKYTargetMachine.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_CSKY_CSKYTARGETMACHINE_H
#define LLVM_LIB_TARGET_CSKY_CSKYTARGETMACHINE_H
+#include "CSKYSubtarget.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
@@ -20,6 +21,7 @@ namespace llvm {
class CSKYTargetMachine : public LLVMTargetMachine {
std::unique_ptr<TargetLoweringObjectFile> TLOF;
+ mutable StringMap<std::unique_ptr<CSKYSubtarget>> SubtargetMap;
public:
CSKYTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
@@ -29,6 +31,12 @@ public:
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+ const CSKYSubtarget *getSubtargetImpl(const Function &F) const override;
+ // DO NOT IMPLEMENT: There is no such thing as a valid default subtarget,
+ // subtargets are per-function entities based on the target-specific
+ // attributes of each function.
+ const CSKYSubtarget *getSubtargetImpl() const = delete;
+
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
index 7fb5f35548b4..daa655416c47 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp
@@ -30,25 +30,57 @@ CSKYAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
static llvm::DenseMap<unsigned, MCFixupKindInfo> Infos = {
{CSKY::Fixups::fixup_csky_addr32, {"fixup_csky_addr32", 0, 32, 0}},
+ {CSKY::Fixups::fixup_csky_addr_hi16, {"fixup_csky_addr_hi16", 0, 32, 0}},
+ {CSKY::Fixups::fixup_csky_addr_lo16, {"fixup_csky_addr_lo16", 0, 32, 0}},
{CSKY::Fixups::fixup_csky_pcrel_imm16_scale2,
{"fixup_csky_pcrel_imm16_scale2", 0, 32, MCFixupKindInfo::FKF_IsPCRel}},
{CSKY::Fixups::fixup_csky_pcrel_uimm16_scale4,
- {"fixup_csky_pcrel_uimm16_scale4", 0, 32, MCFixupKindInfo::FKF_IsPCRel}},
+ {"fixup_csky_pcrel_uimm16_scale4", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}},
+ {CSKY::Fixups::fixup_csky_pcrel_uimm8_scale4,
+ {"fixup_csky_pcrel_uimm8_scale4", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}},
{CSKY::Fixups::fixup_csky_pcrel_imm26_scale2,
{"fixup_csky_pcrel_imm26_scale2", 0, 32, MCFixupKindInfo::FKF_IsPCRel}},
{CSKY::Fixups::fixup_csky_pcrel_imm18_scale2,
- {"fixup_csky_pcrel_imm18_scale2", 0, 32, MCFixupKindInfo::FKF_IsPCRel}}};
+ {"fixup_csky_pcrel_imm18_scale2", 0, 32, MCFixupKindInfo::FKF_IsPCRel}},
+ {CSKY::Fixups::fixup_csky_got32, {"fixup_csky_got32", 0, 32, 0}},
+ {CSKY::Fixups::fixup_csky_got_imm18_scale4,
+ {"fixup_csky_got_imm18_scale4", 0, 32, 0}},
+ {CSKY::Fixups::fixup_csky_gotoff, {"fixup_csky_gotoff", 0, 32, 0}},
+ {CSKY::Fixups::fixup_csky_gotpc,
+ {"fixup_csky_gotpc", 0, 32, MCFixupKindInfo::FKF_IsPCRel}},
+ {CSKY::Fixups::fixup_csky_plt32, {"fixup_csky_plt32", 0, 32, 0}},
+ {CSKY::Fixups::fixup_csky_plt_imm18_scale4,
+ {"fixup_csky_plt_imm18_scale4", 0, 32, 0}},
+ {CSKY::Fixups::fixup_csky_pcrel_imm10_scale2,
+ {"fixup_csky_pcrel_imm10_scale2", 0, 16, MCFixupKindInfo::FKF_IsPCRel}},
+ {CSKY::Fixups::fixup_csky_pcrel_uimm7_scale4,
+ {"fixup_csky_pcrel_uimm7_scale4", 0, 16,
+ MCFixupKindInfo::FKF_IsPCRel |
+ MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}},
+ {CSKY::Fixups::fixup_csky_doffset_imm18,
+ {"fixup_csky_doffset_imm18", 0, 18, 0}},
+ {CSKY::Fixups::fixup_csky_doffset_imm18_scale2,
+ {"fixup_csky_doffset_imm18_scale2", 0, 18, 0}},
+ {CSKY::Fixups::fixup_csky_doffset_imm18_scale4,
+ {"fixup_csky_doffset_imm18_scale4", 0, 18, 0}}};
+
assert(Infos.size() == CSKY::NumTargetFixupKinds &&
"Not all fixup kinds added to Infos array");
- assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
- "Invalid kind!");
- if (FirstTargetFixupKind <= Kind && Kind < FirstLiteralRelocationKind)
+ if (FirstTargetFixupKind <= Kind && Kind < FirstLiteralRelocationKind) {
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+
return Infos[Kind];
- else if (Kind < FirstTargetFixupKind)
+ } else if (Kind < FirstTargetFixupKind) {
return MCAsmBackend::getFixupKindInfo(Kind);
- else
+ } else {
return MCAsmBackend::getFixupKindInfo(FK_NONE);
+ }
}
static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
@@ -145,7 +177,8 @@ void CSKYAsmBackend::relaxInstruction(MCInst &Inst,
llvm_unreachable("CSKYAsmBackend::relaxInstruction() unimplemented");
}
-bool CSKYAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+bool CSKYAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const {
if (Count % 2)
return false;
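The reshuffled getFixupKindInfo above now classifies the kind before asserting: target-specific kinds index the Infos table, kinds below FirstTargetFixupKind defer to the generic MCAsmBackend table, and literal relocation kinds get the FK_NONE entry. A sketch of that three-way dispatch with made-up boundary constants (in LLVM they come from the MCFixupKind enum):

#include <cassert>
#include <cstdio>
#include <map>
#include <string>

constexpr unsigned FirstTargetFixupKind = 64;        // hypothetical value
constexpr unsigned FirstLiteralRelocationKind = 128; // hypothetical value

static std::string fixupName(unsigned Kind) {
  static const std::map<unsigned, std::string> Infos = {
      {FirstTargetFixupKind + 0, "fixup_csky_addr32"},
      {FirstTargetFixupKind + 1, "fixup_csky_addr_hi16"},
  };
  if (Kind >= FirstTargetFixupKind && Kind < FirstLiteralRelocationKind) {
    // Only assert validity once the kind is known to be in the target range.
    auto It = Infos.find(Kind);
    assert(It != Infos.end() && "Invalid kind!");
    return It->second;
  }
  if (Kind < FirstTargetFixupKind)
    return "<generic fixup>"; // MCAsmBackend::getFixupKindInfo(Kind)
  return "<none>";            // literal relocation: FK_NONE entry
}

int main() {
  std::printf("%s\n", fixupName(FirstTargetFixupKind + 1).c_str());
  std::printf("%s\n", fixupName(3).c_str());
  std::printf("%s\n", fixupName(200).c_str());
}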
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
index cdf688e9032a..e710954e9df8 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
@@ -39,7 +39,8 @@ public:
void relaxInstruction(MCInst &Inst,
const MCSubtargetInfo &STI) const override;
- bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const override;
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override;
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYBaseInfo.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYBaseInfo.h
new file mode 100644
index 000000000000..fbfca4b6b85f
--- /dev/null
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYBaseInfo.h
@@ -0,0 +1,70 @@
+//===-- CSKYBaseInfo.h - Top level definitions for CSKY ---*- C++ -*-------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the CSKY target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYBASEINFO_H
+#define LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYBASEINFO_H
+
+#include "MCTargetDesc/CSKYMCTargetDesc.h"
+#include "llvm/MC/MCInstrDesc.h"
+
+namespace llvm {
+
+// CSKYII - This namespace holds all of the target specific flags that
+// instruction info tracks. All definitions must match CSKYInstrFormats.td.
+namespace CSKYII {
+
+enum AddrMode {
+ AddrModeNone = 0,
+ AddrMode32B = 1, // ld32.b, ld32.bs, st32.b, st32.bs, +4kb
+ AddrMode32H = 2, // ld32.h, ld32.hs, st32.h, st32.hs, +8kb
+ AddrMode32WD = 3, // ld32.w, st32.w, ld32.d, st32.d, +16kb
+ AddrMode16B = 4, // ld16.b, +32b
+ AddrMode16H = 5, // ld16.h, +64b
+ AddrMode16W = 6, // ld16.w, +128b or +1kb
+ AddrMode32SDF = 7, // flds, fldd, +1kb
+};
+
+// CSKY Specific MachineOperand Flags.
+enum TOF {
+ MO_None = 0,
+ MO_ADDR32,
+ MO_GOT32,
+ MO_GOTOFF,
+ MO_PLT32,
+ MO_ADDR_HI16,
+ MO_ADDR_LO16,
+
+ // Used to differentiate between target-specific "direct" flags and "bitmask"
+ // flags. A machine operand can only have one "direct" flag, but can have
+ // multiple "bitmask" flags.
+ MO_DIRECT_FLAG_MASK = 15
+};
+
+enum {
+ AddrModeMask = 0x1f,
+};
+
+} // namespace CSKYII
+
+namespace CSKYOp {
+enum OperandType : unsigned {
+ OPERAND_BARESYMBOL = MCOI::OPERAND_FIRST_TARGET,
+ OPERAND_CONSTPOOL
+};
+} // namespace CSKYOp
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYBASEINFO_H
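CSKYBaseInfo.h above pairs the AddrMode enum with an AddrModeMask of 0x1f, which suggests the addressing mode is meant to live in the low bits of an instruction's TSFlags word and be recovered by masking (the packing itself would happen in CSKYInstrFormats.td, which is not part of this hunk). A hypothetical pack/unpack sketch under that assumption:

#include <cassert>
#include <cstdint>

// Values mirror the first entries of the AddrMode enum in the patch.
enum AddrMode : uint64_t {
  AddrModeNone = 0,
  AddrMode32B = 1,  // ld32.b/st32.b, +4kb range
  AddrMode32H = 2,  // ld32.h/st32.h, +8kb range
  AddrMode32WD = 3, // ld32.w/st32.w/ld32.d/st32.d, +16kb range
};

constexpr uint64_t AddrModeMask = 0x1f;

// OtherFlags is a hypothetical placeholder for any higher-bit flags.
constexpr uint64_t packTSFlags(AddrMode AM, uint64_t OtherFlags) {
  return (OtherFlags << 5) | (AM & AddrModeMask);
}

constexpr AddrMode unpackAddrMode(uint64_t TSFlags) {
  return static_cast<AddrMode>(TSFlags & AddrModeMask);
}

int main() {
  uint64_t Flags = packTSFlags(AddrMode32WD, /*OtherFlags=*/0b101);
  assert(unpackAddrMode(Flags) == AddrMode32WD);
}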
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYFixupKinds.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYFixupKinds.h
index 917f940fcad4..434fd5481626 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYFixupKinds.h
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYFixupKinds.h
@@ -16,6 +16,10 @@ namespace CSKY {
enum Fixups {
fixup_csky_addr32 = FirstTargetFixupKind,
+ fixup_csky_addr_hi16,
+
+ fixup_csky_addr_lo16,
+
fixup_csky_pcrel_imm16_scale2,
fixup_csky_pcrel_uimm16_scale4,
@@ -24,6 +28,29 @@ enum Fixups {
fixup_csky_pcrel_imm18_scale2,
+ fixup_csky_gotpc,
+
+ fixup_csky_gotoff,
+
+ fixup_csky_got32,
+
+ fixup_csky_got_imm18_scale4,
+
+ fixup_csky_plt32,
+
+ fixup_csky_plt_imm18_scale4,
+
+ fixup_csky_pcrel_imm10_scale2,
+
+ fixup_csky_pcrel_uimm7_scale4,
+
+ fixup_csky_pcrel_uimm8_scale4,
+
+ fixup_csky_doffset_imm18,
+
+ fixup_csky_doffset_imm18_scale2,
+
+ fixup_csky_doffset_imm18_scale4,
// Marker
fixup_csky_invalid,
NumTargetFixupKinds = fixup_csky_invalid - FirstTargetFixupKind
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp
index c8920fbb4b4c..7001de999a51 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "CSKYInstPrinter.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -95,6 +96,107 @@ void CSKYInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
MO.getExpr()->print(O, &MAI);
}
+void CSKYInstPrinter::printDataSymbol(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNo);
+
+ O << "[";
+ if (MO.isImm())
+ O << MO.getImm();
+ else
+ MO.getExpr()->print(O, &MAI);
+ O << "]";
+}
+
+void CSKYInstPrinter::printConstpool(const MCInst *MI, uint64_t Address,
+ unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNo);
+
+ if (MO.isImm()) {
+ if (PrintBranchImmAsAddress) {
+ uint64_t Target = Address + MO.getImm();
+ Target &= 0xfffffffc;
+ O << formatHex(Target);
+ } else {
+ O << MO.getImm();
+ }
+ return;
+ }
+
+ assert(MO.isExpr() && "Unknown operand kind in printConstpool");
+
+ O << "[";
+ MO.getExpr()->print(O, &MAI);
+ O << "]";
+}
+
+void CSKYInstPrinter::printCSKYSymbolOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNo);
+ if (!MO.isImm()) {
+ return printOperand(MI, OpNo, STI, O);
+ }
+
+ if (PrintBranchImmAsAddress) {
+ uint64_t Target = Address + MO.getImm();
+ Target &= 0xffffffff;
+ O << formatHex(Target);
+ } else {
+ O << MO.getImm();
+ }
+}
+
+void CSKYInstPrinter::printRegisterSeq(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printRegName(O, MI->getOperand(OpNum).getReg());
+ O << "-";
+ printRegName(O, MI->getOperand(OpNum + 1).getReg());
+}
+
+void CSKYInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ auto V = MI->getOperand(OpNum).getImm();
+ ListSeparator LS;
+
+ if (V & 0xf) {
+ O << LS;
+ printRegName(O, CSKY::R4);
+ auto Offset = (V & 0xf) - 1;
+ if (Offset) {
+ O << "-";
+ printRegName(O, CSKY::R4 + Offset);
+ }
+ }
+
+ if ((V >> 4) & 0x1) {
+ O << LS;
+ printRegName(O, CSKY::R15);
+ }
+
+ if ((V >> 5) & 0x7) {
+ O << LS;
+ printRegName(O, CSKY::R16);
+
+ auto Offset = ((V >> 5) & 0x7) - 1;
+
+ if (Offset) {
+ O << "-";
+ printRegName(O, CSKY::R16 + Offset);
+ }
+ }
+
+ if ((V >> 8) & 0x1) {
+ O << LS;
+ printRegName(O, CSKY::R28);
+ }
+}
+
const char *CSKYInstPrinter::getRegisterName(unsigned RegNo) {
return getRegisterName(RegNo, ArchRegNames ? CSKY::NoRegAltName
: CSKY::ABIRegAltName);
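printRegisterList above unpacks a push/pop register-list immediate: bits [3:0] give a run length starting at r4, bit 4 selects r15 (lr), bits [7:5] give a run length starting at r16, and bit 8 selects r28 (gp). A standalone decoder mirroring the same field layout, returning a string instead of printing through MCInstPrinter:

#include <cstdio>
#include <string>

static std::string decodeRegList(unsigned V) {
  std::string Out;
  auto Append = [&Out](const std::string &Piece) {
    if (!Out.empty())
      Out += ", ";
    Out += Piece;
  };
  if (unsigned N = V & 0xf)          // run of N registers starting at r4
    Append(N == 1 ? "r4" : "r4-r" + std::to_string(4 + N - 1));
  if ((V >> 4) & 0x1)                // lr
    Append("r15");
  if (unsigned N = (V >> 5) & 0x7)   // run of N registers starting at r16
    Append(N == 1 ? "r16" : "r16-r" + std::to_string(16 + N - 1));
  if ((V >> 8) & 0x1)                // gp
    Append("r28");
  return Out;
}

int main() {
  // 0x194: four registers from r4, lr, four registers from r16, and gp.
  std::printf("%s\n", decodeRegList(0x194).c_str()); // r4-r7, r15, r16-r19, r28
}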
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.h
index a28791a6d8e9..f93a342ec6a3 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.h
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.h
@@ -19,6 +19,9 @@
namespace llvm {
class CSKYInstPrinter : public MCInstPrinter {
+private:
+ bool ABIRegNames = false;
+
public:
CSKYInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
const MCRegisterInfo &MRI)
@@ -43,6 +46,20 @@ public:
unsigned OpIdx, unsigned PrintMethodIdx,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printDataSymbol(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printConstpool(const MCInst *MI, uint64_t Address, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printPSRFlag(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printRegisterSeq(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printRegisterList(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printCSKYSymbolOperand(const MCInst *MI, uint64_t Address, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSPAddr(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
static const char *getRegisterName(unsigned RegNo, unsigned AltIdx);
};
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp
index 1a5b0225e0b9..1d220b749cb1 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "CSKYMCCodeEmitter.h"
+#include "CSKYMCExpr.h"
#include "MCTargetDesc/CSKYMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/MC/MCInstBuilder.h"
@@ -31,11 +32,46 @@ unsigned CSKYMCCodeEmitter::getOImmOpValue(const MCInst &MI, unsigned Idx,
return MO.getImm() - 1;
}
+unsigned
+CSKYMCCodeEmitter::getImmOpValueIDLY(const MCInst &MI, unsigned Idx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(Idx);
+ assert(MO.isImm() && "Unexpected MO type.");
+
+ auto V = (MO.getImm() <= 3) ? 4 : MO.getImm();
+ return V - 1;
+}
+
+unsigned
+CSKYMCCodeEmitter::getImmOpValueMSBSize(const MCInst &MI, unsigned Idx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MSB = MI.getOperand(Idx);
+ const MCOperand &LSB = MI.getOperand(Idx + 1);
+ assert(MSB.isImm() && LSB.isImm() && "Unexpected MO type.");
+
+ return MSB.getImm() - LSB.getImm();
+}
+
+static void writeData(uint32_t Bin, unsigned Size, raw_ostream &OS) {
+ uint16_t LO16 = static_cast<uint16_t>(Bin);
+ uint16_t HI16 = static_cast<uint16_t>(Bin >> 16);
+
+ if (Size == 4)
+ support::endian::write<uint16_t>(OS, HI16, support::little);
+
+ support::endian::write<uint16_t>(OS, LO16, support::little);
+}
+
void CSKYMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
const MCInstrDesc &Desc = MII.get(MI.getOpcode());
unsigned Size = Desc.getSize();
+
+ ++MCNumEmitted;
+
uint32_t Bin = getBinaryCodeForInstr(MI, Fixups, STI);
uint16_t LO16 = static_cast<uint16_t>(Bin);
@@ -45,7 +81,6 @@ void CSKYMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
support::endian::write<uint16_t>(OS, HI16, support::little);
support::endian::write<uint16_t>(OS, LO16, support::little);
- ++MCNumEmitted; // Keep track of the # of mi's emitted.
}
unsigned
@@ -62,6 +97,51 @@ CSKYMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO,
return 0;
}
+unsigned
+CSKYMCCodeEmitter::getRegSeqImmOpValue(const MCInst &MI, unsigned Idx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ assert(MI.getOperand(Idx).isReg() && "Unexpected MO type.");
+ assert(MI.getOperand(Idx + 1).isImm() && "Unexpected MO type.");
+
+ unsigned Ry = MI.getOperand(Idx).getReg();
+ unsigned Rz = MI.getOperand(Idx + 1).getImm();
+
+ unsigned Imm = Ctx.getRegisterInfo()->getEncodingValue(Rz) -
+ Ctx.getRegisterInfo()->getEncodingValue(Ry);
+
+ return ((Ctx.getRegisterInfo()->getEncodingValue(Ry) << 5) | Imm);
+}
+
+unsigned
+CSKYMCCodeEmitter::getRegisterSeqOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ unsigned Reg1 =
+ Ctx.getRegisterInfo()->getEncodingValue(MI.getOperand(Op).getReg());
+ unsigned Reg2 =
+ Ctx.getRegisterInfo()->getEncodingValue(MI.getOperand(Op + 1).getReg());
+
+ unsigned Binary = ((Reg1 & 0x1f) << 5) | (Reg2 - Reg1);
+
+ return Binary;
+}
+
+unsigned CSKYMCCodeEmitter::getImmJMPIX(const MCInst &MI, unsigned Idx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ if (MI.getOperand(Idx).getImm() == 16)
+ return 0;
+ else if (MI.getOperand(Idx).getImm() == 24)
+ return 1;
+ else if (MI.getOperand(Idx).getImm() == 32)
+ return 2;
+ else if (MI.getOperand(Idx).getImm() == 40)
+ return 3;
+ else
+    llvm_unreachable("Unhandled JMPIX immediate.");
+}
+
MCFixupKind CSKYMCCodeEmitter::getTargetFixup(const MCExpr *Expr) const {
const CSKYMCExpr *CSKYExpr = cast<CSKYMCExpr>(Expr);
@@ -70,6 +150,22 @@ MCFixupKind CSKYMCCodeEmitter::getTargetFixup(const MCExpr *Expr) const {
llvm_unreachable("Unhandled fixup kind!");
case CSKYMCExpr::VK_CSKY_ADDR:
return MCFixupKind(CSKY::fixup_csky_addr32);
+ case CSKYMCExpr::VK_CSKY_ADDR_HI16:
+ return MCFixupKind(CSKY::fixup_csky_addr_hi16);
+ case CSKYMCExpr::VK_CSKY_ADDR_LO16:
+ return MCFixupKind(CSKY::fixup_csky_addr_lo16);
+ case CSKYMCExpr::VK_CSKY_GOT:
+ return MCFixupKind(CSKY::fixup_csky_got32);
+ case CSKYMCExpr::VK_CSKY_GOTPC:
+ return MCFixupKind(CSKY::fixup_csky_gotpc);
+ case CSKYMCExpr::VK_CSKY_GOTOFF:
+ return MCFixupKind(CSKY::fixup_csky_gotoff);
+ case CSKYMCExpr::VK_CSKY_PLT:
+ return MCFixupKind(CSKY::fixup_csky_plt32);
+ case CSKYMCExpr::VK_CSKY_PLT_IMM18_BY4:
+ return MCFixupKind(CSKY::fixup_csky_plt_imm18_scale4);
+ case CSKYMCExpr::VK_CSKY_GOT_IMM18_BY4:
+ return MCFixupKind(CSKY::fixup_csky_got_imm18_scale4);
}
}
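The writeData helper added above captures how CSKY encodings are byte-ordered: a 32-bit instruction is split into two 16-bit parcels, each written little-endian, with the high parcel emitted first, while a 16-bit instruction is a single parcel. A standalone mirror of that ordering using a byte vector instead of raw_ostream:

#include <cassert>
#include <cstdint>
#include <vector>

static void writeData(uint32_t Bin, unsigned Size, std::vector<uint8_t> &OS) {
  auto WriteLE16 = [&OS](uint16_t V) {
    OS.push_back(static_cast<uint8_t>(V & 0xff));
    OS.push_back(static_cast<uint8_t>(V >> 8));
  };
  uint16_t LO16 = static_cast<uint16_t>(Bin);
  uint16_t HI16 = static_cast<uint16_t>(Bin >> 16);
  if (Size == 4)
    WriteLE16(HI16); // high parcel first for 32-bit instructions
  WriteLE16(LO16);
}

int main() {
  std::vector<uint8_t> Out;
  writeData(0x11223344, 4, Out);
  // Byte stream: 22 11 44 33 (each parcel little-endian, high parcel first).
  assert(Out.size() == 4);
  assert(Out[0] == 0x22 && Out[1] == 0x11 && Out[2] == 0x44 && Out[3] == 0x33);
}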
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h
index a4c50d992a07..bfba07bcb32a 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCCodeEmitter.h
@@ -13,8 +13,8 @@
#ifndef LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYMCCODEEMITTER_H
#define LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYMCCODEEMITTER_H
-#include "CSKYMCExpr.h"
#include "MCTargetDesc/CSKYFixupKinds.h"
+#include "MCTargetDesc/CSKYMCExpr.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
@@ -49,14 +49,40 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(Idx);
- assert(MO.isImm() && "Unexpected MO type.");
- return (MO.getImm() >> shift);
+ if (MO.isImm())
+ return (MO.getImm() >> shift);
+
+ assert(MO.isExpr() && "Unexpected MO type.");
+
+ MCFixupKind Kind = getTargetFixup(MO.getExpr());
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
+ return 0;
}
+ unsigned getRegSeqImmOpValue(const MCInst &MI, unsigned Idx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getRegisterSeqOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
unsigned getOImmOpValue(const MCInst &MI, unsigned Idx,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ unsigned getImmOpValueIDLY(const MCInst &MI, unsigned Idx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getImmJMPIX(const MCInst &MI, unsigned Idx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ unsigned getImmOpValueMSBSize(const MCInst &MI, unsigned Idx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
unsigned getImmShiftOpValue(const MCInst &MI, unsigned Idx,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -101,6 +127,21 @@ public:
return 0;
}
+ template <llvm::CSKY::Fixups FIXUP>
+ unsigned getDataSymbolOpValue(const MCInst &MI, unsigned Idx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(Idx);
+ assert(MO.isExpr() && "Unexpected MO type.");
+
+ MCFixupKind Kind = MCFixupKind(FIXUP);
+ if (MO.getExpr()->getKind() == MCExpr::Target)
+ Kind = getTargetFixup(MO.getExpr());
+
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
+ return 0;
+ }
+
unsigned getCallSymbolOpValue(const MCInst &MI, unsigned Idx,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp
index 59e630f43a42..7987613b0608 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp
@@ -26,22 +26,33 @@ StringRef CSKYMCExpr::getVariantKindName(VariantKind Kind) {
switch (Kind) {
default:
llvm_unreachable("Invalid ELF symbol kind");
+ case VK_CSKY_None:
case VK_CSKY_ADDR:
return "";
- case VK_CSKY_PCREL:
- return "";
+ case VK_CSKY_ADDR_HI16:
+ return "@HI16";
+ case VK_CSKY_ADDR_LO16:
+ return "@LO16";
+ case VK_CSKY_GOT_IMM18_BY4:
case VK_CSKY_GOT:
return "@GOT";
case VK_CSKY_GOTPC:
return "@GOTPC";
case VK_CSKY_GOTOFF:
return "@GOTOFF";
+ case VK_CSKY_PLT_IMM18_BY4:
case VK_CSKY_PLT:
return "@PLT";
- case VK_CSKY_TPOFF:
+ case VK_CSKY_TLSLE:
return "@TPOFF";
+ case VK_CSKY_TLSIE:
+ return "@GOTTPOFF";
case VK_CSKY_TLSGD:
- return "@TLSGD";
+ return "@TLSGD32";
+ case VK_CSKY_TLSLDO:
+ return "@TLSLDO32";
+ case VK_CSKY_TLSLDM:
+ return "@TLSLDM32";
}
}
@@ -87,7 +98,8 @@ void CSKYMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
switch (getKind()) {
default:
return;
- case VK_CSKY_TPOFF:
+ case VK_CSKY_TLSLE:
+ case VK_CSKY_TLSIE:
case VK_CSKY_TLSGD:
break;
}
@@ -106,17 +118,20 @@ bool CSKYMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
switch (getKind()) {
default:
return true;
-
- case VK_CSKY_ADDR:
- case VK_CSKY_PCREL:
case VK_CSKY_GOT:
+ case VK_CSKY_GOT_IMM18_BY4:
case VK_CSKY_GOTPC:
case VK_CSKY_GOTOFF:
- case VK_CSKY_TPOFF:
+ case VK_CSKY_PLT:
+ case VK_CSKY_PLT_IMM18_BY4:
+ case VK_CSKY_TLSIE:
+ case VK_CSKY_TLSLE:
case VK_CSKY_TLSGD:
+ case VK_CSKY_TLSLDO:
+ case VK_CSKY_TLSLDM:
return false;
}
}
return true;
-}
\ No newline at end of file
+}
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.h
index 06fccada53ce..9e5b4ca7d9bb 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.h
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.h
@@ -19,13 +19,20 @@ public:
enum VariantKind {
VK_CSKY_None,
VK_CSKY_ADDR,
+ VK_CSKY_ADDR_HI16,
+ VK_CSKY_ADDR_LO16,
VK_CSKY_PCREL,
VK_CSKY_GOT,
+ VK_CSKY_GOT_IMM18_BY4,
VK_CSKY_GOTPC,
VK_CSKY_GOTOFF,
VK_CSKY_PLT,
- VK_CSKY_TPOFF,
+ VK_CSKY_PLT_IMM18_BY4,
+ VK_CSKY_TLSIE,
+ VK_CSKY_TLSLE,
VK_CSKY_TLSGD,
+ VK_CSKY_TLSLDO,
+ VK_CSKY_TLSLDM,
VK_CSKY_Invalid
};
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
index 169e1e14eb0a..0901c0993607 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
@@ -19,7 +19,7 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
#include "CSKYGenInstrInfo.inc"
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h
index da8a3b63a2f9..25bbd635fc58 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h
@@ -45,4 +45,7 @@ MCCodeEmitter *createCSKYMCCodeEmitter(const MCInstrInfo &MCII,
#define GET_INSTRINFO_ENUM
#include "CSKYGenInstrInfo.inc"
+#define GET_SUBTARGETINFO_ENUM
+#include "CSKYGenSubtargetInfo.inc"
+
#endif // LLVM_LIB_TARGET_CSKY_MCTARGETDESC_CSKYMCTARGETDESC_H
diff --git a/llvm/lib/Target/CSKY/TargetInfo/CSKYTargetInfo.cpp b/llvm/lib/Target/CSKY/TargetInfo/CSKYTargetInfo.cpp
index 1af2e672ff42..40b7d493652d 100644
--- a/llvm/lib/Target/CSKY/TargetInfo/CSKYTargetInfo.cpp
+++ b/llvm/lib/Target/CSKY/TargetInfo/CSKYTargetInfo.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/CSKYTargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
Target &llvm::getTheCSKYTarget() {
diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index 7edc2a01eeb8..d131cf896834 100644
--- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -37,6 +37,7 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -45,7 +46,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
@@ -141,12 +141,6 @@ class HexagonAsmParser : public MCTargetAsmParser {
int processInstruction(MCInst &Inst, OperandVector const &Operands,
SMLoc IDLoc);
- // Check if we have an assembler and, if so, set the ELF e_header flags.
- void chksetELFHeaderEFlags(unsigned flags) {
- if (getAssembler())
- getAssembler()->setELFHeaderEFlags(flags);
- }
-
unsigned matchRegister(StringRef Name);
/// @name Auto-generated Match Functions
@@ -211,10 +205,6 @@ struct HexagonOperand : public MCParsedAsmOperand {
const MCExpr *Val;
};
- struct InstTy {
- OperandVector *SubInsts;
- };
-
union {
struct TokTy Tok;
struct RegTy Reg;
@@ -1498,7 +1488,7 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
MES->SwitchSection(mySection);
unsigned byteSize = is32bit ? 4 : 8;
- getStreamer().emitCodeAlignment(byteSize, byteSize);
+ getStreamer().emitCodeAlignment(byteSize, &getSTI(), byteSize);
MCSymbol *Sym;
diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 80a987c3a549..3c742c98077b 100644
--- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -21,9 +21,9 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstddef>
@@ -131,6 +131,9 @@ static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeGuestRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeSysRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
@@ -140,6 +143,10 @@ static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeGuestRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeSysRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp,
uint64_t Address, const void *Decoder);
@@ -760,6 +767,78 @@ static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
return MCDisassembler::Success;
}
+static const uint16_t SysRegDecoderTable[] = {
+ Hexagon::SGP0, Hexagon::SGP1, Hexagon::STID,
+ Hexagon::ELR, Hexagon::BADVA0, Hexagon::BADVA1,
+ Hexagon::SSR, Hexagon::CCR, Hexagon::HTID,
+ Hexagon::BADVA, Hexagon::IMASK, Hexagon::S11,
+ Hexagon::S12, Hexagon::S13, Hexagon::S14,
+ Hexagon::S15, Hexagon::EVB, Hexagon::MODECTL,
+ Hexagon::SYSCFG, Hexagon::S19, Hexagon::S20,
+ Hexagon::VID, Hexagon::S22, Hexagon::S23,
+ Hexagon::S24, Hexagon::S25, Hexagon::S26,
+ Hexagon::CFGBASE, Hexagon::DIAG, Hexagon::REV,
+ Hexagon::PCYCLELO, Hexagon::PCYCLEHI, Hexagon::ISDBST,
+ Hexagon::ISDBCFG0, Hexagon::ISDBCFG1, Hexagon::S35,
+ Hexagon::BRKPTPC0, Hexagon::BRKPTCFG0, Hexagon::BRKPTPC1,
+ Hexagon::BRKPTCFG1, Hexagon::ISDBMBXIN, Hexagon::ISDBMBXOUT,
+ Hexagon::ISDBEN, Hexagon::ISDBGPR, Hexagon::S44,
+ Hexagon::S45, Hexagon::S46, Hexagon::S47,
+ Hexagon::PMUCNT0, Hexagon::PMUCNT1, Hexagon::PMUCNT2,
+ Hexagon::PMUCNT3, Hexagon::PMUEVTCFG, Hexagon::PMUCFG,
+ Hexagon::S54, Hexagon::S55, Hexagon::S56,
+ Hexagon::S57, Hexagon::S58, Hexagon::S59,
+ Hexagon::S60, Hexagon::S61, Hexagon::S62,
+ Hexagon::S63, Hexagon::S64, Hexagon::S65,
+ Hexagon::S66, Hexagon::S67, Hexagon::S68,
+ Hexagon::S69, Hexagon::S70, Hexagon::S71,
+ Hexagon::S72, Hexagon::S73, Hexagon::S74,
+ Hexagon::S75, Hexagon::S76, Hexagon::S77,
+ Hexagon::S78, Hexagon::S79, Hexagon::S80,
+};
+
+static DecodeStatus DecodeSysRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t /*Address*/,
+ const void *Decoder) {
+ if (RegNo >= sizeof(SysRegDecoderTable) / sizeof(SysRegDecoderTable[0]))
+ return MCDisassembler::Fail;
+
+ if (SysRegDecoderTable[RegNo] == Hexagon::NoRegister)
+ return MCDisassembler::Fail;
+
+ unsigned Register = SysRegDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
+
+static const uint16_t SysReg64DecoderTable[] = {
+ Hexagon::SGP1_0, Hexagon::S3_2, Hexagon::S5_4, Hexagon::S7_6,
+ Hexagon::S9_8, Hexagon::S11_10, Hexagon::S13_12, Hexagon::S15_14,
+ Hexagon::S17_16, Hexagon::S19_18, Hexagon::S21_20, Hexagon::S23_22,
+ Hexagon::S25_24, Hexagon::S27_26, Hexagon::S29_28, Hexagon::S31_30,
+ Hexagon::S33_32, Hexagon::S35_34, Hexagon::S37_36, Hexagon::S39_38,
+ Hexagon::S41_40, Hexagon::S43_42, Hexagon::S45_44, Hexagon::S47_46,
+ Hexagon::S49_48, Hexagon::S51_50, Hexagon::S53_52, Hexagon::S55_54,
+ Hexagon::S57_56, Hexagon::S59_58, Hexagon::S61_60, Hexagon::S63_62,
+ Hexagon::S65_64, Hexagon::S67_66, Hexagon::S69_68, Hexagon::S71_70,
+ Hexagon::S73_72, Hexagon::S75_74, Hexagon::S77_76, Hexagon::S79_78,
+};
+
+static DecodeStatus DecodeSysRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t /*Address*/,
+ const void *Decoder) {
+ RegNo = RegNo >> 1;
+ if (RegNo >= sizeof(SysReg64DecoderTable) / sizeof(SysReg64DecoderTable[0]))
+ return MCDisassembler::Fail;
+
+ if (SysReg64DecoderTable[RegNo] == Hexagon::NoRegister)
+ return MCDisassembler::Fail;
+
+ unsigned Register = SysReg64DecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeGuestRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t /*Address*/,
const void *Decoder) {
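The new Hexagon system-register decoders above follow the usual table-driven pattern: the encoded field indexes a fixed table, and both out-of-range indices and table holes (NoRegister) make the decode fail rather than fabricate a register. A generic sketch of that pattern with hypothetical register numbers:

#include <cassert>
#include <cstddef>
#include <optional>

constexpr unsigned NoRegister = 0; // stand-in for Hexagon::NoRegister

static std::optional<unsigned> decodeFromTable(const unsigned *Table,
                                               size_t Size, unsigned RegNo) {
  if (RegNo >= Size)
    return std::nullopt; // field value outside the table
  if (Table[RegNo] == NoRegister)
    return std::nullopt; // hole in the table: invalid encoding
  return Table[RegNo];
}

int main() {
  // Hypothetical register numbers; index 2 is deliberately a hole.
  static const unsigned SysRegTable[] = {101, 102, NoRegister, 104};
  assert(decodeFromTable(SysRegTable, 4, 1).value() == 102);
  assert(!decodeFromTable(SysRegTable, 4, 2).has_value());
  assert(!decodeFromTable(SysRegTable, 4, 9).has_value());
}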
diff --git a/llvm/lib/Target/Hexagon/HexagonArch.h b/llvm/lib/Target/Hexagon/HexagonArch.h
index e5d528390c51..4a42ec98feb1 100644
--- a/llvm/lib/Target/Hexagon/HexagonArch.h
+++ b/llvm/lib/Target/Hexagon/HexagonArch.h
@@ -19,12 +19,6 @@ namespace llvm {
namespace Hexagon {
template <class ArchCont, typename Val>
-bool ValidArch(ArchCont const &ArchList, Val HexArch) {
- return std::any_of(std::begin(ArchList), std::end(ArchList),
- [HexArch](Val V) { return V == HexArch; });
-}
-
-template <class ArchCont, typename Val>
llvm::Optional<ArchEnum> GetCpu(ArchCont const &ArchList, Val CPUString) {
llvm::Optional<ArchEnum> Res;
auto Entry = ArchList.find(CPUString);
diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index f3017d02995e..8e6a01e3a186 100644
--- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -41,10 +41,10 @@
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
@@ -179,7 +179,7 @@ bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
static MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI,
MCStreamer &OutStreamer, const MCOperand &Imm,
- int AlignSize) {
+ int AlignSize, const MCSubtargetInfo& STI) {
MCSymbol *Sym;
int64_t Value;
if (Imm.getExpr()->evaluateAsAbsolute(Value)) {
@@ -209,7 +209,7 @@ static MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI,
OutStreamer.emitLabel(Sym);
OutStreamer.emitSymbolAttribute(Sym, MCSA_Global);
OutStreamer.emitIntValue(Value, AlignSize);
- OutStreamer.emitCodeAlignment(AlignSize);
+ OutStreamer.emitCodeAlignment(AlignSize, &STI);
}
} else {
assert(Imm.isExpr() && "Expected expression and found none");
@@ -237,7 +237,7 @@ static MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI,
OutStreamer.emitLabel(Sym);
OutStreamer.emitSymbolAttribute(Sym, MCSA_Local);
OutStreamer.emitValue(Imm.getExpr(), AlignSize);
- OutStreamer.emitCodeAlignment(AlignSize);
+ OutStreamer.emitCodeAlignment(AlignSize, &STI);
}
}
return Sym;
@@ -328,7 +328,8 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
const MCOperand &Imm = MappedInst.getOperand(1);
MCSectionSubPair Current = OutStreamer->getCurrentSection();
- MCSymbol *Sym = smallData(*this, MI, *OutStreamer, Imm, 8);
+ MCSymbol *Sym =
+ smallData(*this, MI, *OutStreamer, Imm, 8, getSubtargetInfo());
OutStreamer->SwitchSection(Current.first, Current.second);
MCInst TmpInst;
@@ -345,7 +346,8 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
if (!OutStreamer->hasRawTextSupport()) {
MCOperand &Imm = MappedInst.getOperand(1);
MCSectionSubPair Current = OutStreamer->getCurrentSection();
- MCSymbol *Sym = smallData(*this, MI, *OutStreamer, Imm, 4);
+ MCSymbol *Sym =
+ smallData(*this, MI, *OutStreamer, Imm, 4, getSubtargetInfo());
OutStreamer->SwitchSection(Current.first, Current.second);
MCInst TmpInst;
MCOperand &Reg = MappedInst.getOperand(0);
diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 54aa14849dd9..2c5ad3b589d2 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -972,8 +972,8 @@ namespace {
} // end anonymous namespace
bool DeadCodeElimination::isDead(unsigned R) const {
- for (auto I = MRI.use_begin(R), E = MRI.use_end(); I != E; ++I) {
- MachineInstr *UseI = I->getParent();
+ for (const MachineOperand &MO : MRI.use_operands(R)) {
+ const MachineInstr *UseI = MO.getParent();
if (UseI->isDebugValue())
continue;
if (UseI->isPHI()) {
@@ -1305,8 +1305,7 @@ bool RedundantInstrElimination::processBlock(MachineBasicBlock &B,
return false;
bool Changed = false;
- for (auto I = B.begin(), E = B.end(), NextI = I; I != E; ++I) {
- NextI = std::next(I);
+ for (auto I = B.begin(), E = B.end(); I != E; ++I) {
MachineInstr *MI = &*I;
if (MI->getOpcode() == TargetOpcode::COPY)
@@ -1598,9 +1597,7 @@ bool CopyGeneration::processBlock(MachineBasicBlock &B,
bool Changed = false;
RegisterSet Defs;
- for (auto I = B.begin(), E = B.end(), NextI = I; I != E;
- ++I, AVB.insert(Defs)) {
- NextI = std::next(I);
+ for (auto I = B.begin(), E = B.end(); I != E; ++I, AVB.insert(Defs)) {
Defs.clear();
HBS::getInstrDefs(*I, Defs);
@@ -1726,8 +1723,8 @@ bool CopyPropagation::propagateRegCopy(MachineInstr &MI) {
bool CopyPropagation::processBlock(MachineBasicBlock &B, const RegisterSet&) {
std::vector<MachineInstr*> Instrs;
- for (auto I = B.rbegin(), E = B.rend(); I != E; ++I)
- Instrs.push_back(&*I);
+ for (MachineInstr &MI : llvm::reverse(B))
+ Instrs.push_back(&MI);
bool Changed = false;
for (auto I : Instrs) {
@@ -3123,8 +3120,8 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) {
if (isConst(PR))
continue;
bool BadUse = false, GoodUse = false;
- for (auto UI = MRI->use_begin(PR), UE = MRI->use_end(); UI != UE; ++UI) {
- MachineInstr *UseI = UI->getParent();
+ for (const MachineOperand &MO : MRI->use_operands(PR)) {
+ const MachineInstr *UseI = MO.getParent();
if (UseI->getParent() != C.LB) {
BadUse = true;
break;
@@ -3252,7 +3249,7 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) {
auto LoopInpEq = [G] (const PhiInfo &P) -> bool {
return G.Out.Reg == P.LR.Reg;
};
- if (llvm::find_if(Phis, LoopInpEq) == Phis.end())
+ if (llvm::none_of(Phis, LoopInpEq))
continue;
G.Inp.Reg = Inputs.find_first();
@@ -3338,9 +3335,9 @@ bool HexagonLoopRescheduling::runOnMachineFunction(MachineFunction &MF) {
continue;
MachineBasicBlock *PB = nullptr;
bool IsLoop = false;
- for (auto PI = B.pred_begin(), PE = B.pred_end(); PI != PE; ++PI) {
- if (*PI != &B)
- PB = *PI;
+ for (MachineBasicBlock *Pred : B.predecessors()) {
+ if (Pred != &B)
+ PB = Pred;
else
IsLoop = true;
}
@@ -3348,13 +3345,13 @@ bool HexagonLoopRescheduling::runOnMachineFunction(MachineFunction &MF) {
continue;
MachineBasicBlock *EB = nullptr;
- for (auto SI = B.succ_begin(), SE = B.succ_end(); SI != SE; ++SI) {
- if (*SI == &B)
+ for (MachineBasicBlock *Succ : B.successors()) {
+ if (Succ == &B)
continue;
// Set EP to the epilog block, if it has only 1 predecessor (i.e. the
// edge from B to EP is non-critical.
- if ((*SI)->pred_size() == 1)
- EB = *SI;
+ if (Succ->pred_size() == 1)
+ EB = Succ;
break;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
index 9f18d0b3162c..43f0758f6598 100644
--- a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -386,9 +386,8 @@ void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI,
// dereferences the pointer operand.
GepNode *PN = N;
Type *PtrTy = GepI->getSourceElementType();
- for (User::op_iterator OI = GepI->idx_begin()+1, OE = GepI->idx_end();
- OI != OE; ++OI) {
- Value *Op = *OI;
+ for (Use &U : llvm::drop_begin(GepI->indices())) {
+ Value *Op = U;
GepNode *Nx = new (*Mem) GepNode;
Nx->Parent = PN; // Link Nx to the previous node.
Nx->Flags |= GepNode::Internal | InBounds;
diff --git a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
index 954e61563697..daf311fc49d4 100644
--- a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
@@ -863,14 +863,13 @@ void MachineConstPropagator::removeCFGEdge(MachineBasicBlock *From,
// First, remove the CFG successor/predecessor information.
From->removeSuccessor(To);
// Remove all corresponding PHI operands in the To block.
- for (auto I = To->begin(), E = To->getFirstNonPHI(); I != E; ++I) {
- MachineInstr *PN = &*I;
+ for (MachineInstr &PN : To->phis()) {
// reg0 = PHI reg1, bb2, reg3, bb4, ...
- int N = PN->getNumOperands()-2;
+ int N = PN.getNumOperands() - 2;
while (N > 0) {
- if (PN->getOperand(N+1).getMBB() == From) {
- PN->RemoveOperand(N+1);
- PN->RemoveOperand(N);
+ if (PN.getOperand(N + 1).getMBB() == From) {
+ PN.RemoveOperand(N + 1);
+ PN.RemoveOperand(N);
}
N -= 2;
}
@@ -996,8 +995,7 @@ bool MachineConstPropagator::rewrite(MachineFunction &MF) {
bool HaveTargets = computeBlockSuccessors(B, Targets);
// Rewrite the executable instructions. Skip branches if we don't
// have block successor information.
- for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) {
- MachineInstr &MI = *I;
+ for (MachineInstr &MI : llvm::reverse(*B)) {
if (InstrExec.count(&MI)) {
if (MI.isBranch() && !HaveTargets)
continue;
@@ -1046,13 +1044,9 @@ bool MachineConstPropagator::rewrite(MachineFunction &MF) {
// erase instructions during rewriting, so this needs to be delayed until
// now.
for (MachineBasicBlock &B : MF) {
- MachineBasicBlock::iterator I = B.begin(), E = B.end();
- while (I != E) {
- auto Next = std::next(I);
- if (I->isBranch() && !InstrExec.count(&*I))
- B.erase(I);
- I = Next;
- }
+ for (MachineInstr &MI : llvm::make_early_inc_range(B))
+ if (MI.isBranch() && !InstrExec.count(&MI))
+ B.erase(&MI);
}
return Changed;
}
@@ -3133,11 +3127,9 @@ void HexagonConstEvaluator::replaceAllRegUsesWith(Register FromReg,
Register ToReg) {
assert(FromReg.isVirtual());
assert(ToReg.isVirtual());
- for (auto I = MRI->use_begin(FromReg), E = MRI->use_end(); I != E;) {
- MachineOperand &O = *I;
- ++I;
+ for (MachineOperand &O :
+ llvm::make_early_inc_range(MRI->use_operands(FromReg)))
O.setReg(ToReg);
- }
}
bool HexagonConstEvaluator::rewriteHexBranch(MachineInstr &BrI,
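Several of the hunks above replace hand-rolled "remember the next iterator" loops with llvm::make_early_inc_range, which advances the iterator before the loop body runs so the current element can be erased safely. The same idea in plain C++ over a std::list:

#include <cassert>
#include <list>

int main() {
  std::list<int> Instrs = {1, 2, 3, 4, 5, 6};

  for (auto It = Instrs.begin(), End = Instrs.end(); It != End;) {
    auto Cur = It++;        // advance first, exactly what early-inc does
    if (*Cur % 2 == 0)
      Instrs.erase(Cur);    // erasing Cur does not invalidate It
  }

  assert(Instrs == std::list<int>({1, 3, 5}));
}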
diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td b/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td
index 4dd0110c4fed..b3f1b6638193 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td
@@ -1700,6 +1700,12 @@ class Enc_7b7ba8 : OpcodeHexagon {
bits <5> Vd32;
let Inst{4-0} = Vd32{4-0};
}
+class Enc_7d1542 : OpcodeHexagon {
+ bits <7> Ss128;
+ let Inst{22-16} = Ss128{6-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
class Enc_7e5a82 : OpcodeHexagon {
bits <5> Ii;
let Inst{12-8} = Ii{4-0};
@@ -2011,6 +2017,12 @@ class Enc_8e583a : OpcodeHexagon {
let Inst{25-23} = n1{3-1};
let Inst{13-13} = n1{0-0};
}
+class Enc_8f7633 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <7> Sd128;
+ let Inst{6-0} = Sd128{6-0};
+}
class Enc_90cd8b : OpcodeHexagon {
bits <5> Rss32;
let Inst{20-16} = Rss32{4-0};
@@ -2346,6 +2358,12 @@ class Enc_a6ce9c : OpcodeHexagon {
bits <4> Rs16;
let Inst{7-4} = Rs16{3-0};
}
+class Enc_a705fc : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <7> Sdd128;
+ let Inst{6-0} = Sdd128{6-0};
+}
class Enc_a7341a : OpcodeHexagon {
bits <5> Vu32;
let Inst{12-8} = Vu32{4-0};
@@ -3127,6 +3145,12 @@ class Enc_e26546 : OpcodeHexagon {
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
+class Enc_e32517 : OpcodeHexagon {
+ bits <7> Sss128;
+ let Inst{22-16} = Sss128{6-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
class Enc_e38e1f : OpcodeHexagon {
bits <8> Ii;
let Inst{12-5} = Ii{7-0};
diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
index bba36352815e..4f00409c336c 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
@@ -38870,6 +38870,26 @@ let Inst{13-0} = 0b00000000000000;
let Inst{31-16} = 0b1010100001000000;
let isSolo = 1;
}
+def Y2_tfrscrr : HInst<
+(outs IntRegs:$Rd32),
+(ins SysRegs:$Ss128),
+"$Rd32 = $Ss128",
+tc_fae9dfa5, TypeCR>, Enc_7d1542 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-23} = 0b011011101;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def Y2_tfrsrcr : HInst<
+(outs SysRegs:$Sd128),
+(ins IntRegs:$Rs32),
+"$Sd128 = $Rs32",
+tc_6ae3426b, TypeCR>, Enc_8f7633 {
+let Inst{13-7} = 0b0000000;
+let Inst{31-21} = 0b01100111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
def Y2_wait : HInst<
(outs),
(ins IntRegs:$Rs32),
@@ -38891,6 +38911,24 @@ let isSoloAX = 1;
let hasSideEffects = 1;
let mayStore = 1;
}
+def Y4_tfrscpp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins SysRegs64:$Sss128),
+"$Rdd32 = $Sss128",
+tc_fae9dfa5, TypeCR>, Enc_e32517 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-23} = 0b011011110;
+}
+def Y4_tfrspcp : HInst<
+(outs SysRegs64:$Sdd128),
+(ins DoubleRegs:$Rss32),
+"$Sdd128 = $Rss32",
+tc_6ae3426b, TypeCR>, Enc_a705fc {
+let Inst{13-7} = 0b0000000;
+let Inst{31-21} = 0b01101101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
def Y4_trace : HInst<
(outs),
(ins IntRegs:$Rs32),
diff --git a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
index d36ffc3da641..9a3feb5b6af1 100644
--- a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -364,7 +364,7 @@ bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B)
return true;
if (B->isEHPad() || B->hasAddressTaken())
return false;
- if (B->succ_size() == 0)
+ if (B->succ_empty())
return false;
for (auto &MI : *B) {
@@ -390,8 +390,8 @@ bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B)
continue;
if (!isPredicate(R))
continue;
- for (auto U = MRI->use_begin(R); U != MRI->use_end(); ++U)
- if (U->getParent()->isPHI())
+ for (const MachineOperand &U : MRI->use_operands(R))
+ if (U.getParent()->isPHI())
return false;
}
}
@@ -570,12 +570,12 @@ bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
TotalPh = computePhiCost(FP.JoinB, FP);
PredDefs += countPredicateDefs(FP.JoinB);
} else {
- if (FP.TrueB && FP.TrueB->succ_size() > 0) {
+ if (FP.TrueB && !FP.TrueB->succ_empty()) {
MachineBasicBlock *SB = *FP.TrueB->succ_begin();
TotalPh += computePhiCost(SB, FP);
PredDefs += countPredicateDefs(SB);
}
- if (FP.FalseB && FP.FalseB->succ_size() > 0) {
+ if (FP.FalseB && !FP.FalseB->succ_empty()) {
MachineBasicBlock *SB = *FP.FalseB->succ_begin();
TotalPh += computePhiCost(SB, FP);
PredDefs += countPredicateDefs(SB);
@@ -877,7 +877,7 @@ void HexagonEarlyIfConversion::convert(const FlowPattern &FP) {
// existing terminators/successors from the split block.
MachineBasicBlock *SSB = nullptr;
FP.SplitB->erase(OldTI, FP.SplitB->end());
- while (FP.SplitB->succ_size() > 0) {
+ while (!FP.SplitB->succ_empty()) {
MachineBasicBlock *T = *FP.SplitB->succ_begin();
// It's possible that the split block had a successor that is not a pre-
// dicated block. This could only happen if there was only one block to
@@ -970,7 +970,7 @@ void HexagonEarlyIfConversion::removeBlock(MachineBasicBlock *B) {
}
}
- while (B->succ_size() > 0)
+ while (!B->succ_empty())
B->removeSuccessor(B->succ_begin());
for (auto I = B->pred_begin(), E = B->pred_end(); I != E; ++I)
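The succ_size() == 0 to succ_empty() and use_begin()/use_end() to use_operands() rewrites in this file are behavior-preserving cleanups; the emptiness and range forms state the intent directly instead of going through iterator arithmetic. A tiny sketch of the style, using a hypothetical helper that is not part of the patch:

  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineInstr.h"

  using namespace llvm;

  // A block with no successors whose last instruction returns ends the
  // function. succ_empty() reads more directly than succ_size() == 0.
  static bool isExitBlock(const MachineBasicBlock &MBB) {
    return MBB.succ_empty() && !MBB.empty() && MBB.back().isReturn();
  }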
diff --git a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
index fcc880463925..c444cf557c21 100644
--- a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
@@ -1070,20 +1070,18 @@ bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond,
bool HexagonExpandCondsets::predicateInBlock(MachineBasicBlock &B,
std::set<Register> &UpdRegs) {
bool Changed = false;
- MachineBasicBlock::iterator I, E, NextI;
- for (I = B.begin(), E = B.end(); I != E; I = NextI) {
- NextI = std::next(I);
- unsigned Opc = I->getOpcode();
+ for (MachineInstr &MI : llvm::make_early_inc_range(B)) {
+ unsigned Opc = MI.getOpcode();
if (Opc == Hexagon::A2_tfrt || Opc == Hexagon::A2_tfrf) {
- bool Done = predicate(*I, (Opc == Hexagon::A2_tfrt), UpdRegs);
+ bool Done = predicate(MI, (Opc == Hexagon::A2_tfrt), UpdRegs);
if (!Done) {
// If we didn't predicate I, we may need to remove it in case it is
// an "identity" copy, e.g. %1 = A2_tfrt %2, %1.
- if (RegisterRef(I->getOperand(0)) == RegisterRef(I->getOperand(2))) {
- for (auto &Op : I->operands())
+ if (RegisterRef(MI.getOperand(0)) == RegisterRef(MI.getOperand(2))) {
+ for (auto &Op : MI.operands())
if (Op.isReg())
UpdRegs.insert(Op.getReg());
- removeInstr(*I);
+ removeInstr(MI);
}
}
Changed |= Done;
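Several files in this patch replace the manual pattern of caching std::next(I) before a possible erase with llvm::make_early_inc_range, which advances the underlying iterator before the loop body runs, so the current element can be deleted safely. A minimal sketch of the idiom with a toy cleanup (the helper below is illustrative, not part of the patch):

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineInstr.h"

  using namespace llvm;

  // Erase COPY instructions whose destination is already marked dead.
  // make_early_inc_range increments its iterator before each body execution,
  // so erasing MI does not invalidate the traversal.
  static void removeDeadCopies(MachineBasicBlock &MBB) {
    for (MachineInstr &MI : make_early_inc_range(MBB)) {
      if (MI.isCopy() && MI.getOperand(0).isDead())
        MI.eraseFromParent();
    }
  }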
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 5b782543b3b4..bff596e69efd 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -281,11 +281,10 @@ static unsigned getMaxCalleeSavedReg(ArrayRef<CalleeSavedInfo> CSI,
/// frame to be already in place.
static bool needsStackFrame(const MachineBasicBlock &MBB, const BitVector &CSR,
const HexagonRegisterInfo &HRI) {
- for (auto &I : MBB) {
- const MachineInstr *MI = &I;
- if (MI->isCall())
+ for (const MachineInstr &MI : MBB) {
+ if (MI.isCall())
return true;
- unsigned Opc = MI->getOpcode();
+ unsigned Opc = MI.getOpcode();
switch (Opc) {
case Hexagon::PS_alloca:
case Hexagon::PS_aligna:
@@ -294,7 +293,7 @@ static bool needsStackFrame(const MachineBasicBlock &MBB, const BitVector &CSR,
break;
}
// Check individual operands.
- for (const MachineOperand &MO : MI->operands()) {
+ for (const MachineOperand &MO : MI.operands()) {
// While the presence of a frame index does not prove that a stack
// frame will be required, all frame indexes should be within alloc-
// frame/deallocframe. Otherwise, the code that translates a frame
@@ -343,8 +342,8 @@ static bool hasTailCall(const MachineBasicBlock &MBB) {
/// Returns true if MBB contains an instruction that returns.
static bool hasReturn(const MachineBasicBlock &MBB) {
- for (auto I = MBB.getFirstTerminator(), E = MBB.end(); I != E; ++I)
- if (I->isReturn())
+ for (const MachineInstr &MI : MBB.terminators())
+ if (MI.isReturn())
return true;
return false;
}
@@ -425,11 +424,10 @@ void HexagonFrameLowering::findShrunkPrologEpilog(MachineFunction &MF,
// city don't do it right now.
for (auto &I : MF) {
unsigned BN = RPO[I.getNumber()];
- for (auto SI = I.succ_begin(), SE = I.succ_end(); SI != SE; ++SI) {
+ for (MachineBasicBlock *Succ : I.successors())
// If found a back-edge, return.
- if (RPO[(*SI)->getNumber()] <= BN)
+ if (RPO[Succ->getNumber()] <= BN)
return;
- }
}
// Collect the set of blocks that need a stack frame to execute. Scan
diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
index f2026877b22c..02da2f29591a 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -765,10 +765,7 @@ unsigned HexagonGenInsert::distance(const MachineBasicBlock *FromB,
unsigned MaxD = 0;
- using pred_iterator = MachineBasicBlock::const_pred_iterator;
-
- for (pred_iterator I = ToB->pred_begin(), E = ToB->pred_end(); I != E; ++I) {
- const MachineBasicBlock *PB = *I;
+ for (const MachineBasicBlock *PB : ToB->predecessors()) {
// Skip back edges. Also, if FromB is a predecessor of ToB, the distance
// along that path will be 0, and we don't need to do any calculations
// on it.
diff --git a/llvm/lib/Target/Hexagon/HexagonGenMux.cpp b/llvm/lib/Target/Hexagon/HexagonGenMux.cpp
index 07f85e69abba..cf4f13fb8c0d 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenMux.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenMux.cpp
@@ -183,12 +183,11 @@ void HexagonGenMux::buildMaps(MachineBasicBlock &B, InstrIndexMap &I2X,
unsigned NR = HRI->getNumRegs();
BitVector Defs(NR), Uses(NR);
- for (MachineBasicBlock::iterator I = B.begin(), E = B.end(); I != E; ++I) {
- MachineInstr *MI = &*I;
- I2X.insert(std::make_pair(MI, Index));
+ for (MachineInstr &MI : B) {
+ I2X.insert(std::make_pair(&MI, Index));
Defs.reset();
Uses.reset();
- getDefsUses(MI, Defs, Uses);
+ getDefsUses(&MI, Defs, Uses);
DUM.insert(std::make_pair(Index, DefUseInfo(Defs, Uses)));
Index++;
}
@@ -232,22 +231,19 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
CondsetMap CM;
MuxInfoList ML;
- MachineBasicBlock::iterator NextI, End = B.end();
- for (MachineBasicBlock::iterator I = B.begin(); I != End; I = NextI) {
- MachineInstr *MI = &*I;
- NextI = std::next(I);
- unsigned Opc = MI->getOpcode();
+ for (MachineInstr &MI : llvm::make_early_inc_range(B)) {
+ unsigned Opc = MI.getOpcode();
if (!isCondTransfer(Opc))
continue;
- Register DR = MI->getOperand(0).getReg();
+ Register DR = MI.getOperand(0).getReg();
if (isRegPair(DR))
continue;
- MachineOperand &PredOp = MI->getOperand(1);
+ MachineOperand &PredOp = MI.getOperand(1);
if (PredOp.isUndef())
continue;
Register PR = PredOp.getReg();
- unsigned Idx = I2X.lookup(MI);
+ unsigned Idx = I2X.lookup(&MI);
CondsetMap::iterator F = CM.find(DR);
bool IfTrue = HII->isPredicatedTrue(Opc);
@@ -360,21 +356,21 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
return true;
return false;
};
- for (auto I = B.rbegin(), E = B.rend(); I != E; ++I) {
- if (I->isDebugInstr())
+ for (MachineInstr &I : llvm::reverse(B)) {
+ if (I.isDebugInstr())
continue;
// This isn't 100% accurate, but it's safe.
// It won't detect (as a kill) a case like this
// r0 = add r0, 1 <-- r0 should be "killed"
// ... = r0
- for (MachineOperand &Op : I->operands()) {
+ for (MachineOperand &Op : I.operands()) {
if (!Op.isReg() || !Op.isUse())
continue;
assert(Op.getSubReg() == 0 && "Should have physical registers only");
bool Live = IsLive(Op.getReg());
Op.setIsKill(!Live);
}
- LPR.stepBackward(*I);
+ LPR.stepBackward(I);
}
return Changed;
diff --git a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 58f3cd55ee9f..a4971ad712eb 100644
--- a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -468,7 +468,7 @@ bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L,
return false;
Register CmpReg1, CmpReg2;
- int CmpImm = 0, CmpMask = 0;
+ int64_t CmpImm = 0, CmpMask = 0;
bool CmpAnalyzed =
TII->analyzeCompare(*PredI, CmpReg1, CmpReg2, CmpMask, CmpImm);
// Fail if the compare was not analyzed, or it's not comparing a register
@@ -652,7 +652,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
unsigned CondOpc = CondI->getOpcode();
Register CmpReg1, CmpReg2;
- int Mask = 0, ImmValue = 0;
+ int64_t Mask = 0, ImmValue = 0;
bool AnalyzedCmp =
TII->analyzeCompare(*CondI, CmpReg1, CmpReg2, Mask, ImmValue);
if (!AnalyzedCmp)
@@ -1094,15 +1094,15 @@ void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) {
if (!MO.isReg() || !MO.isDef())
continue;
Register Reg = MO.getReg();
- MachineRegisterInfo::use_iterator nextI;
- for (MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg),
- E = MRI->use_end(); I != E; I = nextI) {
- nextI = std::next(I); // I is invalidated by the setReg
- MachineInstr *UseMI = I->getParent();
+ // We use make_early_inc_range here because setReg below invalidates the
+ // iterator.
+ for (MachineOperand &MO :
+ llvm::make_early_inc_range(MRI->use_operands(Reg))) {
+ MachineInstr *UseMI = MO.getParent();
if (UseMI == MI)
continue;
- if (I->isDebug())
- I->setReg(0U);
+ if (MO.isDebug())
+ MO.setReg(0U);
}
}
@@ -1453,7 +1453,7 @@ bool HexagonHardwareLoops::loopCountMayWrapOrUnderFlow(
E = MRI->use_instr_nodbg_end(); I != E; ++I) {
MachineInstr *MI = &*I;
Register CmpReg1, CmpReg2;
- int CmpMask = 0, CmpValue = 0;
+ int64_t CmpMask = 0, CmpValue = 0;
if (!TII->analyzeCompare(*MI, CmpReg1, CmpReg2, CmpMask, CmpValue))
continue;
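The int to int64_t changes in this file follow the widened TargetInstrInfo::analyzeCompare interface (see the HexagonInstrInfo hunks below), which keeps 64-bit compare immediates from being truncated through the out-parameters. A hedged sketch of a caller written against the new signature; the helper name and the SrcReg2 convention check are illustrative:

  #include "llvm/CodeGen/MachineInstr.h"
  #include "llvm/CodeGen/TargetInstrInfo.h"

  using namespace llvm;

  // Fill CmpValue when MI is an analyzable compare against an immediate.
  // Note the int64_t out-parameters required by the updated hook.
  static bool getCompareImmediate(const TargetInstrInfo &TII,
                                  const MachineInstr &MI, int64_t &CmpValue) {
    Register SrcReg, SrcReg2;
    int64_t Mask = 0, Value = 0;
    if (!TII.analyzeCompare(MI, SrcReg, SrcReg2, Mask, Value))
      return false;
    // Targets commonly report register-register compares by setting SrcReg2;
    // treat those as not having an immediate.
    if (SrcReg2.isValid())
      return false;
    CmpValue = Value;
    return true;
  }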
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index fd404a156903..2679e399852f 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -990,7 +990,7 @@ void HexagonDAGToDAGISel::ppSimplifyOrSelect0(std::vector<SDNode*> &&Nodes) {
auto IsZero = [] (const SDValue &V) -> bool {
if (ConstantSDNode *SC = dyn_cast<ConstantSDNode>(V.getNode()))
- return SC->isNullValue();
+ return SC->isZero();
return false;
};
auto IsSelect0 = [IsZero] (const SDValue &Op) -> bool {
@@ -2247,8 +2247,8 @@ SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
}
void HexagonDAGToDAGISel::rebalanceAddressTrees() {
- for (auto I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E;) {
- SDNode *N = &*I++;
+ for (SDNode &Node : llvm::make_early_inc_range(CurDAG->allnodes())) {
+ SDNode *N = &Node;
if (N->getOpcode() != ISD::LOAD && N->getOpcode() != ISD::STORE)
continue;
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 6ded323a34c3..29572e3106d1 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -1231,7 +1231,7 @@ HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const {
if (RM == Reloc::Static) {
SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
- const GlobalObject *GO = GV->getBaseObject();
+ const GlobalObject *GO = GV->getAliaseeObject();
if (GO && Subtarget.useSmallData() && HLOF.isGlobalInSmallSection(GO, HTM))
return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, GA);
return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, GA);
@@ -2556,7 +2556,7 @@ HexagonTargetLowering::extractVector(SDValue VecV, SDValue IdxV,
// Extracting the lowest bit is a no-op, but it changes the type,
// so it must be kept as an operation to avoid errors related to
// type mismatches.
- if (IdxN->isNullValue() && ValTy.getSizeInBits() == 1)
+ if (IdxN->isZero() && ValTy.getSizeInBits() == 1)
return DAG.getNode(HexagonISD::TYPECAST, dl, MVT::i1, VecV);
}
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index e7d3c7c24f34..8900fca8bb78 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -525,7 +525,7 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef<SDValue> Values,
if (IsSplat) {
assert(SplatV.getNode());
auto *IdxN = dyn_cast<ConstantSDNode>(SplatV.getNode());
- if (IdxN && IdxN->isNullValue())
+ if (IdxN && IdxN->isZero())
return getZero(dl, VecTy, DAG);
MVT WordTy = MVT::getVectorVT(MVT::i32, HwLen/4);
SDValue S = DAG.getNode(ISD::SPLAT_VECTOR, dl, WordTy, SplatV);
@@ -743,12 +743,12 @@ HexagonTargetLowering::buildHvxVectorPred(ArrayRef<SDValue> Values,
auto IsTrue = [] (SDValue V) {
if (const auto *N = dyn_cast<ConstantSDNode>(V.getNode()))
- return !N->isNullValue();
+ return !N->isZero();
return false;
};
auto IsFalse = [] (SDValue V) {
if (const auto *N = dyn_cast<ConstantSDNode>(V.getNode()))
- return N->isNullValue();
+ return N->isZero();
return false;
};
@@ -1065,7 +1065,7 @@ HexagonTargetLowering::insertHvxSubvectorReg(SDValue VecV, SDValue SubV,
assert(SubTy.getSizeInBits() == 32 || SubTy.getSizeInBits() == 64);
// Convert IdxV to be index in bytes.
auto *IdxN = dyn_cast<ConstantSDNode>(IdxV.getNode());
- if (!IdxN || !IdxN->isNullValue()) {
+ if (!IdxN || !IdxN->isZero()) {
IdxV = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV,
DAG.getConstant(ElemWidth/8, dl, MVT::i32));
SingleV = DAG.getNode(HexagonISD::VROR, dl, SingleTy, SingleV, IdxV);
@@ -1088,7 +1088,7 @@ HexagonTargetLowering::insertHvxSubvectorReg(SDValue VecV, SDValue SubV,
RolBase = HwLen-4;
}
// If the vector wasn't ror'ed, don't ror it back.
- if (RolBase != 4 || !IdxN || !IdxN->isNullValue()) {
+ if (RolBase != 4 || !IdxN || !IdxN->isZero()) {
SDValue RolV = DAG.getNode(ISD::SUB, dl, MVT::i32,
DAG.getConstant(RolBase, dl, MVT::i32), IdxV);
SingleV = DAG.getNode(HexagonISD::VROR, dl, SingleTy, SingleV, RolV);
@@ -1125,7 +1125,7 @@ HexagonTargetLowering::insertHvxSubvectorPred(SDValue VecV, SDValue SubV,
SDValue ByteIdx;
auto *IdxN = dyn_cast<ConstantSDNode>(IdxV.getNode());
- if (!IdxN || !IdxN->isNullValue()) {
+ if (!IdxN || !IdxN->isZero()) {
ByteIdx = DAG.getNode(ISD::MUL, dl, MVT::i32, IdxV,
DAG.getConstant(BitBytes, dl, MVT::i32));
ByteVec = DAG.getNode(HexagonISD::VROR, dl, ByteTy, ByteVec, ByteIdx);
@@ -1140,7 +1140,7 @@ HexagonTargetLowering::insertHvxSubvectorPred(SDValue VecV, SDValue SubV,
{DAG.getConstant(BlockLen, dl, MVT::i32)}, DAG);
ByteVec = getInstr(Hexagon::V6_vmux, dl, ByteTy, {Q, ByteSub, ByteVec}, DAG);
// Rotate ByteVec back, and convert to a vector predicate.
- if (!IdxN || !IdxN->isNullValue()) {
+ if (!IdxN || !IdxN->isZero()) {
SDValue HwLenV = DAG.getConstant(HwLen, dl, MVT::i32);
SDValue ByteXdi = DAG.getNode(ISD::SUB, dl, MVT::i32, HwLenV, ByteIdx);
ByteVec = DAG.getNode(HexagonISD::VROR, dl, ByteTy, ByteVec, ByteXdi);
@@ -1594,15 +1594,15 @@ HexagonTargetLowering::LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const {
SDValue
HexagonTargetLowering::LowerHvxBitcast(SDValue Op, SelectionDAG &DAG) const {
- SDValue ValQ = Op.getOperand(0);
+ SDValue Val = Op.getOperand(0);
MVT ResTy = ty(Op);
- MVT VecTy = ty(ValQ);
+ MVT ValTy = ty(Val);
const SDLoc &dl(Op);
- if (isHvxBoolTy(VecTy) && ResTy.isScalarInteger()) {
+ if (isHvxBoolTy(ValTy) && ResTy.isScalarInteger()) {
unsigned HwLen = Subtarget.getVectorLength();
MVT WordTy = MVT::getVectorVT(MVT::i32, HwLen/4);
- SDValue VQ = compressHvxPred(ValQ, dl, WordTy, DAG);
+ SDValue VQ = compressHvxPred(Val, dl, WordTy, DAG);
unsigned BitWidth = ResTy.getSizeInBits();
if (BitWidth < 64) {
@@ -1635,6 +1635,39 @@ HexagonTargetLowering::LowerHvxBitcast(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::BUILD_PAIR, dl, ResTy, Combines);
}
+ if (isHvxBoolTy(ResTy) && ValTy.isScalarInteger()) {
+ // Handle bitcast from i128 -> v128i1 and i64 -> v64i1.
+ unsigned BitWidth = ValTy.getSizeInBits();
+ unsigned HwLen = Subtarget.getVectorLength();
+ assert(BitWidth == HwLen);
+
+ MVT ValAsVecTy = MVT::getVectorVT(MVT::i8, BitWidth / 8);
+ SDValue ValAsVec = DAG.getBitcast(ValAsVecTy, Val);
+ // Splat each byte of Val 8 times.
+ // Bytes = [(b0)x8, (b1)x8, ...., (b15)x8]
+ // where b0, b1,..., b15 are least to most significant bytes of I.
+ SmallVector<SDValue, 128> Bytes;
+ // Tmp: 0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80, 0x01,0x02,0x04,0x08,...
+ // These are bytes with the LSB rotated left with respect to their index.
+ SmallVector<SDValue, 128> Tmp;
+ for (unsigned I = 0; I != HwLen / 8; ++I) {
+ SDValue Idx = DAG.getConstant(I, dl, MVT::i32);
+ SDValue Byte =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, ValAsVec, Idx);
+ for (unsigned J = 0; J != 8; ++J) {
+ Bytes.push_back(Byte);
+ Tmp.push_back(DAG.getConstant(1ull << J, dl, MVT::i8));
+ }
+ }
+
+ MVT ConstantVecTy = MVT::getVectorVT(MVT::i8, HwLen);
+ SDValue ConstantVec = DAG.getBuildVector(ConstantVecTy, dl, Tmp);
+ SDValue I2V = buildHvxVectorReg(Bytes, dl, ConstantVecTy, DAG);
+
+ // Each Byte in the I2V will be set iff corresponding bit is set in Val.
+ I2V = DAG.getNode(ISD::AND, dl, ConstantVecTy, {I2V, ConstantVec});
+ return DAG.getNode(HexagonISD::V2Q, dl, ResTy, I2V);
+ }
return Op;
}
@@ -2255,8 +2288,8 @@ HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
case HexagonISD::V2Q:
if (Ops[0].getOpcode() == ISD::SPLAT_VECTOR) {
if (const auto *C = dyn_cast<ConstantSDNode>(Ops[0].getOperand(0)))
- return C->isNullValue() ? DAG.getNode(HexagonISD::QFALSE, dl, ty(Op))
- : DAG.getNode(HexagonISD::QTRUE, dl, ty(Op));
+ return C->isZero() ? DAG.getNode(HexagonISD::QFALSE, dl, ty(Op))
+ : DAG.getNode(HexagonISD::QTRUE, dl, ty(Op));
}
break;
case HexagonISD::Q2V:
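The new scalar-to-predicate bitcast path above turns each bit of the i64/i128 input into one byte of an HVX vector: every input byte is replicated eight times, ANDed with the repeating constant 0x01,0x02,...,0x80, and the resulting non-zero bytes become the set lanes handed to V2Q. A small scalar model of that expansion, written in plain C++ only to illustrate the arithmetic:

  #include <cstdint>
  #include <vector>

  // Expand the low NumBits bits of Val into one byte per bit: output byte I is
  // non-zero exactly when bit I of Val is set. Byte I of the splatted input is
  // Val's byte I/8, and the mask byte for lane I is 1 << (I % 8).
  std::vector<uint8_t> bitsToBytes(uint64_t Val, unsigned NumBits) {
    std::vector<uint8_t> Lanes(NumBits);
    for (unsigned I = 0; I != NumBits; ++I) {
      uint8_t SplatByte = (Val >> (8 * (I / 8))) & 0xFF;
      uint8_t MaskByte = uint8_t(1) << (I % 8);
      Lanes[I] = SplatByte & MaskByte;
    }
    return Lanes;
  }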
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrFormats.td b/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
index ef2b3040931d..45adaf50774f 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -268,8 +268,7 @@ class OpcodeDuplex {
let Inst{12-0} = ISubLo;
}
-class InstDuplex<bits<4> iClass, list<dag> pattern = [],
- string cstr = "">
+class InstDuplex<bits<4> iClass, string cstr = "">
: Instruction, OpcodeDuplex {
let Namespace = "Hexagon";
IType Type = TypeDUPLEX; // uses slot 0,1
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index f14eaacbf071..76220eff4d51 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -169,13 +169,13 @@ MachineInstr *HexagonInstrInfo::findLoopInstr(MachineBasicBlock *BB,
continue;
if (PB == BB)
continue;
- for (auto I = PB->instr_rbegin(), E = PB->instr_rend(); I != E; ++I) {
- unsigned Opc = I->getOpcode();
+ for (MachineInstr &I : llvm::reverse(PB->instrs())) {
+ unsigned Opc = I.getOpcode();
if (Opc == LOOPi || Opc == LOOPr)
- return &*I;
+ return &I;
// We've reached a different loop, which means the loop01 has been
// removed.
- if (Opc == EndLoopOp && I->getOperand(0).getMBB() != TargetBB)
+ if (Opc == EndLoopOp && I.getOperand(0).getMBB() != TargetBB)
return nullptr;
}
// Check the predecessors for the LOOP instruction.
@@ -1791,8 +1791,8 @@ HexagonInstrInfo::CreateTargetPostRAHazardRecognizer(
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
bool HexagonInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
- Register &SrcReg2, int &Mask,
- int &Value) const {
+ Register &SrcReg2, int64_t &Mask,
+ int64_t &Value) const {
unsigned Opc = MI.getOpcode();
// Set mask and the first source register.
@@ -3627,8 +3627,8 @@ int HexagonInstrInfo::getDotNewOp(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default:
- report_fatal_error(std::string("Unknown .new type: ") +
- std::to_string(MI.getOpcode()));
+ report_fatal_error(Twine("Unknown .new type: ") +
+ std::to_string(MI.getOpcode()));
case Hexagon::S4_storerb_ur:
return Hexagon::S4_storerbnew_ur;
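The report_fatal_error change above routes the message through a Twine rather than a concatenated std::string. Twine can also format integers itself, which would drop the std::to_string temporary entirely; a hedged alternative formulation (the helper name is illustrative):

  #include "llvm/ADT/Twine.h"
  #include "llvm/Support/ErrorHandling.h"

  using namespace llvm;

  // Twine's integer constructors render the value lazily when the message is
  // materialized, so no temporary std::string is built.
  [[noreturn]] static void unknownDotNewOpcode(unsigned Opcode) {
    report_fatal_error(Twine("Unknown .new type: ") + Twine(Opcode));
  }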
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index 11717996935d..eaaf9f7046c7 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -270,7 +270,8 @@ public:
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
- Register &SrcReg2, int &Mask, int &Value) const override;
+ Register &SrcReg2, int64_t &Mask,
+ int64_t &Value) const override;
/// Compute the instruction latency of a given instruction.
/// If the instruction has higher cost when predicated, it's returned via
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index 76cc8f402c5a..ccaf1aac1ce0 100644
--- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -1351,8 +1351,8 @@ bool PolynomialMultiplyRecognize::convertShiftsToLeft(BasicBlock *LoopB,
// be unshifted.
if (!commutesWithShift(R))
return false;
- for (auto I = R->user_begin(), E = R->user_end(); I != E; ++I) {
- auto *T = cast<Instruction>(*I);
+ for (User *U : R->users()) {
+ auto *T = cast<Instruction>(U);
// Skip users from outside of the loop. They will be handled later.
// Also, skip the right-shifts and phi nodes, since they mix early
// and late values.
@@ -1490,10 +1490,8 @@ void PolynomialMultiplyRecognize::cleanupLoopBody(BasicBlock *LoopB) {
if (Value *SV = SimplifyInstruction(&I, {DL, &TLI, &DT}))
I.replaceAllUsesWith(SV);
- for (auto I = LoopB->begin(), N = I; I != LoopB->end(); I = N) {
- N = std::next(I);
- RecursivelyDeleteTriviallyDeadInstructions(&*I, &TLI);
- }
+ for (Instruction &I : llvm::make_early_inc_range(*LoopB))
+ RecursivelyDeleteTriviallyDeadInstructions(&I, &TLI);
}
unsigned PolynomialMultiplyRecognize::getInverseMxN(unsigned QP) {
@@ -2247,8 +2245,7 @@ CleanupAndExit:
DT->addNewBlock(MemmoveB, Preheader);
// Find the new immediate dominator of the exit block.
BasicBlock *ExitD = Preheader;
- for (auto PI = pred_begin(ExitB), PE = pred_end(ExitB); PI != PE; ++PI) {
- BasicBlock *PB = *PI;
+ for (BasicBlock *PB : predecessors(ExitB)) {
ExitD = DT->findNearestCommonDominator(ExitD, PB);
if (!ExitD)
break;
diff --git a/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 0e6555024303..47bebf77b31b 100644
--- a/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -286,9 +286,6 @@ void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) {
}
void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) {
- if (SU->isScheduled)
- return;
-
for (const SDep &PI : SU->Preds) {
unsigned PredReadyCycle = PI.getSUnit()->TopReadyCycle;
unsigned MinLatency = PI.getLatency();
@@ -298,13 +295,12 @@ void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) {
if (SU->TopReadyCycle < PredReadyCycle + MinLatency)
SU->TopReadyCycle = PredReadyCycle + MinLatency;
}
- Top.releaseNode(SU, SU->TopReadyCycle);
+
+ if (!SU->isScheduled)
+ Top.releaseNode(SU, SU->TopReadyCycle);
}
void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) {
- if (SU->isScheduled)
- return;
-
assert(SU->getInstr() && "Scheduled SUnit must have instr");
for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
@@ -317,7 +313,9 @@ void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) {
if (SU->BotReadyCycle < SuccReadyCycle + MinLatency)
SU->BotReadyCycle = SuccReadyCycle + MinLatency;
}
- Bot.releaseNode(SU, SU->BotReadyCycle);
+
+ if (!SU->isScheduled)
+ Bot.releaseNode(SU, SU->BotReadyCycle);
}
/// Does this SU have a hazard within the current instruction group.
diff --git a/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp b/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
index 8dc1113194a8..8edcb745d654 100644
--- a/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -535,13 +535,9 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
// I am doing this only because LLVM does not provide LiveOut
// at the BB level.
bool predLive = false;
- for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
- SIE = MBB->succ_end();
- SI != SIE; ++SI) {
- MachineBasicBlock *succMBB = *SI;
- if (succMBB->isLiveIn(predReg))
+ for (const MachineBasicBlock *SuccMBB : MBB->successors())
+ if (SuccMBB->isLiveIn(predReg))
predLive = true;
- }
if (predLive)
break;
diff --git a/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp b/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp
index e026bb6d601d..bfd02802b782 100644
--- a/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp
@@ -67,26 +67,23 @@ bool HexagonOptimizeSZextends::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
- unsigned Idx = 1;
+ unsigned Idx = 0;
// Try to optimize sign extends in formal parameters. It's relying on
// callee already sign extending the values. I'm not sure if our ABI
// requires callee to sign extend though.
for (auto &Arg : F.args()) {
- if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) {
+ if (F.getAttributes().hasParamAttr(Idx, Attribute::SExt)) {
if (!isa<PointerType>(Arg.getType())) {
- for (auto UI = Arg.use_begin(); UI != Arg.use_end();) {
- if (isa<SExtInst>(*UI)) {
- Instruction* Use = cast<Instruction>(*UI);
+ for (Use &U : llvm::make_early_inc_range(Arg.uses())) {
+ if (isa<SExtInst>(U)) {
+ Instruction* Use = cast<Instruction>(U);
SExtInst* SI = new SExtInst(&Arg, Use->getType());
assert (EVT::getEVT(SI->getType()) ==
(EVT::getEVT(Use->getType())));
- ++UI;
Use->replaceAllUsesWith(SI);
Instruction* First = &F.getEntryBlock().front();
SI->insertBefore(First);
Use->eraseFromParent();
- } else {
- ++UI;
}
}
}
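The Idx adjustment above (starting at 0 instead of 1) goes with the move from the index-based hasAttribute call to hasParamAttr: the old attribute-list indices reserve 0 for return-value attributes and start parameters at 1, while hasParamAttr takes the zero-based argument number directly. A minimal sketch of the newer query (assuming an llvm::Function F):

  #include "llvm/IR/Attributes.h"
  #include "llvm/IR/Function.h"

  using namespace llvm;

  // True if the ArgNo-th formal parameter (zero-based) carries the signext
  // attribute. No +1 offset is needed with hasParamAttr.
  static bool paramIsSExt(const Function &F, unsigned ArgNo) {
    return F.getAttributes().hasParamAttr(ArgNo, Attribute::SExt);
  }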
diff --git a/llvm/lib/Target/Hexagon/HexagonPseudo.td b/llvm/lib/Target/Hexagon/HexagonPseudo.td
index 20c939577586..11f8af7c41a0 100644
--- a/llvm/lib/Target/Hexagon/HexagonPseudo.td
+++ b/llvm/lib/Target/Hexagon/HexagonPseudo.td
@@ -198,7 +198,7 @@ def PS_callr_nr: InstHexagon<(outs), (ins IntRegs:$Rs),
let isCall = 1, hasSideEffects = 1,
isExtended = 0, isExtendable = 1, opExtendable = 0, isCodeGenOnly = 1,
BaseOpcode = "PS_call_nr", isExtentSigned = 1, opExtentAlign = 2 in
-class Call_nr<bits<5> nbits, bit isPred, bit isFalse, dag iops,
+class Call_nr<bits<5> nbits, bit isFalse, dag iops,
InstrItinClass itin>
: Pseudo<(outs), iops, "">, PredRel {
bits<2> Pu;
@@ -210,7 +210,7 @@ class Call_nr<bits<5> nbits, bit isPred, bit isFalse, dag iops,
let Itinerary = itin;
}
-def PS_call_nr : Call_nr<24, 0, 0, (ins s32_0Imm:$Ii), J2_call.Itinerary>;
+def PS_call_nr : Call_nr<24, 0, (ins s32_0Imm:$Ii), J2_call.Itinerary>;
//def PS_call_nrt: Call_nr<17, 1, 0, (ins PredRegs:$Pu, s32_0Imm:$dst),
// J2_callt.Itinerary>;
//def PS_call_nrf: Call_nr<17, 1, 1, (ins PredRegs:$Pu, s32_0Imm:$dst),
@@ -267,7 +267,7 @@ let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1,
class CondStr<string CReg, bit True, bit New> {
string S = "if (" # !if(True,"","!") # CReg # !if(New,".new","") # ") ";
}
-class JumpOpcStr<string Mnemonic, bit New, bit Taken> {
+class JumpOpcStr<string Mnemonic, bit Taken> {
string S = Mnemonic # !if(Taken, ":t", ":nt");
}
let isBranch = 1, isIndirectBranch = 1, Defs = [PC], isPredicated = 1,
@@ -275,7 +275,7 @@ let isBranch = 1, isIndirectBranch = 1, Defs = [PC], isPredicated = 1,
class T_JMPr_c <bit PredNot, bit isPredNew, bit isTak, InstHexagon rootInst>
: InstHexagon<(outs), (ins PredRegs:$src, IntRegs:$dst),
CondStr<"$src", !if(PredNot,0,1), isPredNew>.S #
- JumpOpcStr<"jumpr", isPredNew, isTak>.S # " $dst",
+ JumpOpcStr<"jumpr", isTak>.S # " $dst",
[], "", rootInst.Itinerary, rootInst.Type>, OpcodeHexagon {
let isTaken = isTak;
diff --git a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
index 49428db223a1..8b7138d3c809 100644
--- a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -31,6 +31,19 @@ let Namespace = "Hexagon" in {
let HWEncoding{4-0} = num;
}
+ class HexagonSys<bits<7> num, string n, list<string> alt = [],
+ list<Register> alias = []> : Register<n, alt> {
+ let Aliases = alias;
+ let HWEncoding{6-0} = num;
+ }
+
+ class HexagonDoubleSys<bits<7> num, string n, list<Register> subregs,
+ list<string> alt = []> :
+ RegisterWithSubRegs<n, subregs> {
+ let AltNames = alt;
+ let HWEncoding{6-0} = num;
+ }
+
// Registers are identified with 5-bit ID numbers.
// Ri - 32-bit integer registers.
class Ri<bits<5> num, string n, list<string> alt = []> :
@@ -74,6 +87,18 @@ let Namespace = "Hexagon" in {
let SubRegs = subregs;
}
+ // Rs - system registers
+ class Rs<bits<7> num, string n,
+ list<string> alt = [], list<Register> alias = []> :
+ HexagonSys<num, n, alt, alias>;
+
+ // Rss - 64-bit system registers.
+ class Rss<bits<7> num, string n, list<Register> subregs,
+ list<string> alt = []> :
+ HexagonDoubleSys<num, n, subregs, alt> {
+ let SubRegs = subregs;
+ }
+
// Mx - address modifier registers
class Mx<bits<1> num, string n> : Register<n, []> {
let HWEncoding{0} = num;
@@ -260,6 +285,134 @@ let Namespace = "Hexagon" in {
def Q2 : Rq<2, "q2">, DwarfRegNum<[133]>;
def Q3 : Rq<3, "q3">, DwarfRegNum<[134]>;
+ // System registers.
+ def SGP0 : Rs<0, "sgp0", ["s0"]>, DwarfRegNum<[144]>;
+ def SGP1 : Rs<1, "sgp1", ["s1"]>, DwarfRegNum<[145]>;
+ def STID : Rs<2, "stid", ["s2"]>, DwarfRegNum<[146]>;
+ def ELR : Rs<3, "elr", ["s3"]>, DwarfRegNum<[147]>;
+ def BADVA0 : Rs<4, "badva0", ["s4"]>, DwarfRegNum<[148]>;
+ def BADVA1 : Rs<5, "badva1", ["s5"]>, DwarfRegNum<[149]>;
+ def SSR : Rs<6, "ssr", ["s6"]>, DwarfRegNum<[150]>;
+ def CCR : Rs<7, "ccr", ["s7"]>, DwarfRegNum<[151]>;
+ def HTID : Rs<8, "htid", ["s8"]>, DwarfRegNum<[152]>;
+ def BADVA : Rs<9, "badva", ["s9"]>, DwarfRegNum<[153]>;
+ def IMASK : Rs<10, "imask", ["s10"]>, DwarfRegNum<[154]>;
+ def S11 : Rs<11, "s11">, DwarfRegNum<[155]>;
+ def S12 : Rs<12, "s12">, DwarfRegNum<[156]>;
+ def S13 : Rs<13, "s13">, DwarfRegNum<[157]>;
+ def S14 : Rs<14, "s14">, DwarfRegNum<[158]>;
+ def S15 : Rs<15, "s15">, DwarfRegNum<[159]>;
+ def EVB : Rs<16, "evb", ["s16"]>, DwarfRegNum<[160]>;
+ def MODECTL : Rs<17, "modectl", ["s17"]>, DwarfRegNum<[161]>;
+ def SYSCFG : Rs<18, "syscfg", ["s18"]>, DwarfRegNum<[162]>;
+ def S19 : Rs<19, "s19", ["s19"]>, DwarfRegNum<[163]>;
+ def S20 : Rs<20, "s20", ["s20"]>, DwarfRegNum<[164]>;
+ def VID : Rs<21, "vid", ["s21"]>, DwarfRegNum<[165]>;
+ def S22 : Rs<22, "s22", ["s22"]>, DwarfRegNum<[166]>;
+ def S23 : Rs<23, "s23">, DwarfRegNum<[167]>;
+ def S24 : Rs<24, "s24">, DwarfRegNum<[168]>;
+ def S25 : Rs<25, "s25">, DwarfRegNum<[169]>;
+ def S26 : Rs<26, "s26">, DwarfRegNum<[170]>;
+ def CFGBASE : Rs<27, "cfgbase", ["s27"]>, DwarfRegNum<[171]>;
+ def DIAG : Rs<28, "diag", ["s28"]>, DwarfRegNum<[172]>;
+ def REV : Rs<29, "rev", ["s29"]>, DwarfRegNum<[173]>;
+ def PCYCLELO : Rs<30, "pcyclelo", ["s30"]>, DwarfRegNum<[174]>;
+ def PCYCLEHI : Rs<31, "pcyclehi", ["s31"]>, DwarfRegNum<[175]>;
+ def ISDBST : Rs<32, "isdbst", ["s32"]>, DwarfRegNum<[176]>;
+ def ISDBCFG0 : Rs<33, "isdbcfg0", ["s33"]>, DwarfRegNum<[177]>;
+ def ISDBCFG1 : Rs<34, "isdbcfg1", ["s34"]>, DwarfRegNum<[178]>;
+ def S35 : Rs<35, "s35">, DwarfRegNum<[179]>;
+ def BRKPTPC0 : Rs<36, "brkptpc0", ["s36"]>, DwarfRegNum<[180]>;
+ def BRKPTCFG0: Rs<37, "brkptcfg0", ["s37"]>, DwarfRegNum<[181]>;
+ def BRKPTPC1 : Rs<38, "brkptpc1", ["s38"]>, DwarfRegNum<[182]>;
+ def BRKPTCFG1: Rs<39, "brkptcfg1", ["s39"]>, DwarfRegNum<[183]>;
+ def ISDBMBXIN: Rs<40, "isdbmbxin", ["s40"]>, DwarfRegNum<[184]>;
+ def ISDBMBXOUT: Rs<41, "isdbmbxout", ["s41"]>, DwarfRegNum<[185]>;
+ def ISDBEN: Rs<42, "isdben", ["s42"]>, DwarfRegNum<[186]>;
+ def ISDBGPR: Rs<43, "isdbgpr", ["s43"]>, DwarfRegNum<[187]>;
+ def S44: Rs<44, "s44">, DwarfRegNum<[188]>;
+ def S45: Rs<45, "s45">, DwarfRegNum<[189]>;
+ def S46: Rs<46, "s46">, DwarfRegNum<[190]>;
+ def S47: Rs<47, "s47">, DwarfRegNum<[191]>;
+ def PMUCNT0: Rs<48, "pmucnt0", ["s48"]>, DwarfRegNum<[192]>;
+ def PMUCNT1: Rs<49, "pmucnt1", ["s49"]>, DwarfRegNum<[193]>;
+ def PMUCNT2: Rs<50, "pmucnt2", ["s50"]>, DwarfRegNum<[194]>;
+ def PMUCNT3: Rs<51, "pmucnt3", ["s51"]>, DwarfRegNum<[195]>;
+ def PMUEVTCFG: Rs<52, "pmuevtcfg", ["s52"]>, DwarfRegNum<[196]>;
+ def PMUCFG: Rs<53, "pmucfg", ["s53"]>, DwarfRegNum<[197]>;
+ def S54: Rs<54, "s54">, DwarfRegNum<[198]>;
+ def S55: Rs<55, "s55">, DwarfRegNum<[199]>;
+ def S56: Rs<56, "s56">, DwarfRegNum<[200]>;
+ def S57: Rs<57, "s57">, DwarfRegNum<[201]>;
+ def S58: Rs<58, "s58">, DwarfRegNum<[202]>;
+ def S59: Rs<59, "s59">, DwarfRegNum<[203]>;
+ def S60: Rs<60, "s60">, DwarfRegNum<[204]>;
+ def S61: Rs<61, "s61">, DwarfRegNum<[205]>;
+ def S62: Rs<62, "s62">, DwarfRegNum<[206]>;
+ def S63: Rs<63, "s63">, DwarfRegNum<[207]>;
+ def S64: Rs<64, "s64">, DwarfRegNum<[208]>;
+ def S65: Rs<65, "s65">, DwarfRegNum<[209]>;
+ def S66: Rs<66, "s66">, DwarfRegNum<[210]>;
+ def S67: Rs<67, "s67">, DwarfRegNum<[211]>;
+ def S68: Rs<68, "s68">, DwarfRegNum<[212]>;
+ def S69: Rs<69, "s69">, DwarfRegNum<[213]>;
+ def S70: Rs<70, "s70">, DwarfRegNum<[214]>;
+ def S71: Rs<71, "s71">, DwarfRegNum<[215]>;
+ def S72: Rs<72, "s72">, DwarfRegNum<[216]>;
+ def S73: Rs<73, "s73">, DwarfRegNum<[217]>;
+ def S74: Rs<74, "s74">, DwarfRegNum<[218]>;
+ def S75: Rs<75, "s75">, DwarfRegNum<[219]>;
+ def S76: Rs<76, "s76">, DwarfRegNum<[220]>;
+ def S77: Rs<77, "s77">, DwarfRegNum<[221]>;
+ def S78: Rs<78, "s78">, DwarfRegNum<[222]>;
+ def S79: Rs<79, "s79">, DwarfRegNum<[223]>;
+ def S80: Rs<80, "s80">, DwarfRegNum<[224]>;
+
+ // System Register Pair
+ let SubRegIndices = [isub_lo, isub_hi], CoveredBySubRegs = 1 in {
+ def SGP1_0 : Rss<0, "s1:0", [SGP0, SGP1], ["sgp1:0"]>, DwarfRegNum<[144]>;
+ def S3_2 : Rss<2, "s3:2", [STID, ELR]>, DwarfRegNum<[146]>;
+ def S5_4 : Rss<4, "s5:4", [BADVA0, BADVA1], ["badva1:0"]>,
+ DwarfRegNum<[148]>;
+ def S7_6 : Rss<6, "s7:6", [SSR, CCR], ["ccr:ssr"]>, DwarfRegNum<[150]>;
+ def S9_8 : Rss<8, "s9:8", [HTID, BADVA]>, DwarfRegNum<[152]>;
+ def S11_10 : Rss<10, "s11:10", [IMASK, S11]>, DwarfRegNum<[154]>;
+ def S13_12 : Rss<12, "s13:12", [S12, S13]>, DwarfRegNum<[156]>;
+ def S15_14 : Rss<14, "s15:14", [S14, S15]>, DwarfRegNum<[158]>;
+ def S17_16 : Rss<16, "s17:16", [EVB, MODECTL]>, DwarfRegNum<[160]>;
+ def S19_18 : Rss<18, "s19:18", [SYSCFG, S19]>, DwarfRegNum<[162]>;
+ def S21_20 : Rss<20, "s21:20", [S20, VID]>, DwarfRegNum<[164]>;
+ def S23_22 : Rss<22, "s23:22", [S22, S23]>, DwarfRegNum<[166]>;
+ def S25_24 : Rss<24, "s25:24", [S24, S25]>, DwarfRegNum<[168]>;
+ def S27_26 : Rss<26, "s27:26", [S26, CFGBASE]>, DwarfRegNum<[170]>;
+ def S29_28 : Rss<28, "s29:28", [DIAG, REV]>, DwarfRegNum<[172]>;
+ def S31_30 : Rss<30, "s31:30", [PCYCLELO, PCYCLEHI], ["pcycle"]>, DwarfRegNum<[174]>;
+ def S33_32 : Rss<32, "s33:32", [ISDBST, ISDBCFG0]>, DwarfRegNum<[176]>;
+ def S35_34 : Rss<34, "s35:34", [ISDBCFG1, S35]>, DwarfRegNum<[178]>;
+ def S37_36 : Rss<36, "s37:36", [BRKPTPC0, BRKPTCFG0]>, DwarfRegNum<[180]>;
+ def S39_38 : Rss<38, "s39:38", [BRKPTPC1, BRKPTCFG1]>, DwarfRegNum<[182]>;
+ def S41_40 : Rss<40, "s41:40", [ISDBMBXIN, ISDBMBXOUT]>, DwarfRegNum<[184]>;
+ def S43_42 : Rss<42, "s43:42", [ISDBEN, ISDBGPR]>, DwarfRegNum<[186]>;
+ def S45_44 : Rss<44, "s45:44", [S44, S45]>, DwarfRegNum<[188]>;
+ def S47_46 : Rss<46, "s47:46", [S46, S47]>, DwarfRegNum<[190]>;
+ def S49_48 : Rss<48, "s49:48", [PMUCNT0, PMUCNT1]>, DwarfRegNum<[192]>;
+ def S51_50 : Rss<50, "s51:50", [PMUCNT2, PMUCNT3]>, DwarfRegNum<[194]>;
+ def S53_52 : Rss<52, "s53:52", [PMUEVTCFG, PMUCFG]>, DwarfRegNum<[196]>;
+ def S55_54 : Rss<54, "s55:54", [S54, S55]>, DwarfRegNum<[198]>;
+ def S57_56 : Rss<56, "s57:56", [S56, S57]>, DwarfRegNum<[200]>;
+ def S59_58 : Rss<58, "s59:58", [S58, S59]>, DwarfRegNum<[202]>;
+ def S61_60 : Rss<60, "s61:60", [S60, S61]>, DwarfRegNum<[204]>;
+ def S63_62 : Rss<62, "s63:62", [S62, S63]>, DwarfRegNum<[206]>;
+ def S65_64 : Rss<64, "s65:64", [S64, S65]>, DwarfRegNum<[208]>;
+ def S67_66 : Rss<66, "s67:66", [S66, S67]>, DwarfRegNum<[210]>;
+ def S69_68 : Rss<68, "s69:68", [S68, S69]>, DwarfRegNum<[212]>;
+ def S71_70 : Rss<70, "s71:70", [S70, S71]>, DwarfRegNum<[214]>;
+ def S73_72 : Rss<72, "s73:72", [S72, S73]>, DwarfRegNum<[216]>;
+ def S75_74 : Rss<74, "s75:74", [S74, S75]>, DwarfRegNum<[218]>;
+ def S77_76 : Rss<76, "s77:76", [S77, S76]>, DwarfRegNum<[219]>;
+ def S79_78 : Rss<78, "s79:78", [S79, S78]>, DwarfRegNum<[220]>;
+ }
+
// Guest Registers
def GELR: Rg<0, "gelr", ["g0"]>, DwarfRegNum<[220]>;
def GSR: Rg<1, "gsr", ["g1"]>, DwarfRegNum<[221]>;
@@ -432,6 +585,40 @@ def GuestRegs64 : RegisterClass<"Hexagon", [i64], 64,
G25_24, G27_26, G29_28,
G31_30)>;
+let Size = 32, isAllocatable = 0 in
+def SysRegs : RegisterClass<"Hexagon", [i32], 32,
+ (add SGP0, SGP1, STID, ELR, BADVA0, BADVA1,
+ SSR, CCR, HTID, BADVA, IMASK,
+ S11, S12, S13, S14, S15,
+ S19, S23, S25,
+ EVB, MODECTL, SYSCFG, S20, VID, S22, S24,
+ S26, CFGBASE, DIAG, REV, PCYCLEHI,
+ PCYCLELO, ISDBST, ISDBCFG0, ISDBCFG1, S35,
+ BRKPTPC0, BRKPTCFG0, BRKPTPC1, BRKPTCFG1,
+ ISDBMBXIN, ISDBMBXOUT, ISDBEN, ISDBGPR,
+ S44, S45, S46, S47,
+ PMUCNT0, PMUCNT1, PMUCNT2, PMUCNT3,
+ PMUEVTCFG, PMUCFG, S54, S55, S56, S57,
+ S58, S59, S60, S61, S62, S63, S64, S65, S66, S67,
+ S68, S69, S70, S71, S72, S73, S74, S75, S76, S77,
+ S78, S79, S80
+ )>;
+
+let Size = 64, isAllocatable = 0 in
+def SysRegs64 : RegisterClass<"Hexagon", [i64], 64,
+ (add SGP1_0,
+ S3_2, S5_4, S7_6, S9_8,
+ S11_10, S13_12, S15_14,
+ S17_16, S19_18, S21_20,
+ S23_22, S25_24,
+ S27_26, S29_28, S31_30, S33_32, S35_34,
+ S37_36, S39_38, S41_40, S43_42, S45_44,
+ S47_46, S49_48, S51_50, S53_52,
+ S55_54, S57_56, S59_58,
+ S61_60, S63_62, S65_64, S67_66, S69_68,
+ S71_70, S73_72, S75_74, S77_76, S79_78
+ )>;
+
// These registers are new for v62 and onward.
// The function RegisterMatchesArch() uses this list for validation.
let isAllocatable = 0 in
diff --git a/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
index f9fb14c190ff..4890c3dbb7bc 100644
--- a/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -70,9 +70,7 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
// Loop over all of the basic blocks
for (MachineBasicBlock &B : Fn) {
- for (auto I = B.begin(), E = B.end(); I != E; ) {
- MachineInstr &MI = *I;
- ++I;
+ for (MachineInstr &MI : llvm::make_early_inc_range(B)) {
unsigned Opc = MI.getOpcode();
if (Opc == Hexagon::CONST32) {
diff --git a/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp b/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
index 52452e9c6cd5..9a0f57fce97d 100644
--- a/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
@@ -508,7 +508,7 @@ void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L,
while (CmpI->getOpcode() == Hexagon::C2_not)
CmpI = MRI->getVRegDef(CmpI->getOperand(1).getReg());
- int Mask = 0, Val = 0;
+ int64_t Mask = 0, Val = 0;
bool OkCI = TII->analyzeCompare(*CmpI, CmpR1, CmpR2, Mask, Val);
if (!OkCI)
return;
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 3cbb4b591f8c..66de698182d7 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -23,9 +23,9 @@
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
@@ -238,9 +238,9 @@ const HexagonSubtarget *
HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
AttributeList FnAttrs = F.getAttributes();
Attribute CPUAttr =
- FnAttrs.getAttribute(AttributeList::FunctionIndex, "target-cpu");
+ FnAttrs.getFnAttr("target-cpu");
Attribute FSAttr =
- FnAttrs.getAttribute(AttributeList::FunctionIndex, "target-features");
+ FnAttrs.getFnAttr("target-features");
std::string CPU =
CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
@@ -280,11 +280,11 @@ void HexagonTargetMachine::adjustPassManager(PassManagerBuilder &PMB) {
void HexagonTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PB.registerLateLoopOptimizationsEPCallback(
- [=](LoopPassManager &LPM, PassBuilder::OptimizationLevel Level) {
+ [=](LoopPassManager &LPM, OptimizationLevel Level) {
LPM.addPass(HexagonLoopIdiomRecognitionPass());
});
PB.registerLoopOptimizerEndEPCallback(
- [=](LoopPassManager &LPM, PassBuilder::OptimizationLevel Level) {
+ [=](LoopPassManager &LPM, OptimizationLevel Level) {
LPM.addPass(HexagonVectorLoopCarriedReusePass());
});
}
@@ -447,11 +447,11 @@ void HexagonPassConfig::addPreEmitPass() {
}
// Packetization is mandatory: it handles gather/scatter at all opt levels.
- addPass(createHexagonPacketizer(NoOpt), false);
+ addPass(createHexagonPacketizer(NoOpt));
if (EnableVectorPrint)
- addPass(createHexagonVectorPrint(), false);
+ addPass(createHexagonVectorPrint());
// Add CFI instructions if necessary.
- addPass(createHexagonCallFrameInformation(), false);
+ addPass(createHexagonCallFrameInformation());
}
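The hunks above track two tree-wide API moves: TargetRegistry.h now lives under llvm/MC, and the pass-builder extension-point callbacks take the standalone llvm::OptimizationLevel type instead of the old nested PassBuilder::OptimizationLevel. A hedged sketch of registering a callback against the updated signature; LoopRotatePass is only a stand-in for whichever loop pass a target wants to schedule:

  #include "llvm/Passes/PassBuilder.h"
  #include "llvm/Transforms/Scalar/LoopPassManager.h"
  #include "llvm/Transforms/Scalar/LoopRotation.h"

  using namespace llvm;

  void registerExampleCallbacks(PassBuilder &PB) {
    PB.registerLateLoopOptimizationsEPCallback(
        [](LoopPassManager &LPM, OptimizationLevel Level) {
          // The level is available for gating; this sketch adds the pass
          // unconditionally.
          LPM.addPass(LoopRotatePass());
        });
  }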
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index 25466786ee41..7df32e4072e3 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -90,9 +90,8 @@ static bool isSmallDataSection(StringRef Sec) {
return true;
// If either ".sdata." or ".sbss." is a substring of the section name
// then put the symbol in small data.
- return Sec.find(".sdata.") != StringRef::npos ||
- Sec.find(".sbss.") != StringRef::npos ||
- Sec.find(".scommon.") != StringRef::npos;
+ return Sec.contains(".sdata.") || Sec.contains(".sbss.") ||
+ Sec.contains(".scommon.");
}
static const char *getSectionSuffixForSize(unsigned Size) {
@@ -178,10 +177,10 @@ MCSection *HexagonTargetObjectFile::getExplicitSectionGlobal(
if (GO->hasSection()) {
StringRef Section = GO->getSection();
- if (Section.find(".access.text.group") != StringRef::npos)
+ if (Section.contains(".access.text.group"))
return getContext().getELFSection(GO->getSection(), ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_EXECINSTR);
- if (Section.find(".access.data.group") != StringRef::npos)
+ if (Section.contains(".access.data.group"))
return getContext().getELFSection(GO->getSection(), ELF::SHT_PROGBITS,
ELF::SHF_WRITE | ELF::SHF_ALLOC);
}
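The find(X) != StringRef::npos to contains(X) rewrites here are purely cosmetic; StringRef::contains is shorthand for the same test and reads as the membership check it is. A trivial sketch:

  #include "llvm/ADT/StringRef.h"

  using namespace llvm;

  // StringRef::contains(X) is equivalent to find(X) != StringRef::npos.
  static bool isSmallDataLikeSection(StringRef Sec) {
    return Sec.contains(".sdata.") || Sec.contains(".sbss.");
  }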
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h b/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h
index a5b14a7e0764..a99aa4f16a08 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h
@@ -16,6 +16,7 @@ class HexagonTargetStreamer : public MCTargetStreamer {
public:
HexagonTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
virtual void emitCodeAlignment(unsigned ByteAlignment,
+ const MCSubtargetInfo *STI,
unsigned MaxBytesToEmit = 0){};
virtual void emitFAlign(unsigned Size, unsigned MaxBytesToEmit){};
virtual void emitCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 108027d79754..1bdd8c3c513a 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -64,7 +64,8 @@ HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
// The Hexagon target can unroll loops with run-time trip counts.
void HexagonTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP) {
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE) {
UP.Runtime = UP.Partial = true;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 2144fb27eb67..9e637dfc3e16 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -61,7 +61,8 @@ public:
// The Hexagon target can unroll loops with run-time trip counts.
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP);
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE);
void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP);
@@ -120,10 +121,9 @@ public:
MaybeAlign Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
- InstructionCost
- getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
- unsigned AddressSpace,
- TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
+ InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind);
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
ArrayRef<int> Mask, int Index, Type *SubTp);
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
@@ -133,16 +133,14 @@ public:
const Instruction *I);
InstructionCost getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
- Align Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
InstructionCost getArithmeticInstrCost(
- unsigned Opcode, Type *Ty,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index fa1ba4f2e469..1d325553f45a 100644
--- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -203,6 +203,10 @@ static MachineBasicBlock::iterator moveInstrOut(MachineInstr &MI,
}
bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) {
+ // FIXME: This pass causes verification failures.
+ MF.getProperties().set(
+ MachineFunctionProperties::Property::FailsVerification);
+
auto &HST = MF.getSubtarget<HexagonSubtarget>();
HII = HST.getInstrInfo();
HRI = HST.getRegisterInfo();
@@ -230,16 +234,9 @@ bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) {
// dependence between Insn 0 and Insn 2. This can lead to incorrect
// packetization
for (MachineBasicBlock &MB : MF) {
- auto End = MB.end();
- auto MI = MB.begin();
- while (MI != End) {
- auto NextI = std::next(MI);
- if (MI->isKill()) {
- MB.erase(MI);
- End = MB.end();
- }
- MI = NextI;
- }
+ for (MachineInstr &MI : llvm::make_early_inc_range(MB))
+ if (MI.isKill())
+ MB.erase(&MI);
}
// TinyCore with Duplexes: Translate to big-instructions.
@@ -1156,12 +1153,9 @@ bool HexagonPacketizerList::cannotCoexist(const MachineInstr &MI,
void HexagonPacketizerList::unpacketizeSoloInstrs(MachineFunction &MF) {
for (auto &B : MF) {
MachineBasicBlock::iterator BundleIt;
- MachineBasicBlock::instr_iterator NextI;
- for (auto I = B.instr_begin(), E = B.instr_end(); I != E; I = NextI) {
- NextI = std::next(I);
- MachineInstr &MI = *I;
+ for (MachineInstr &MI : llvm::make_early_inc_range(B.instrs())) {
if (MI.isBundle())
- BundleIt = I;
+ BundleIt = MI.getIterator();
if (!MI.isInsideBundle())
continue;
diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
index f949a9327f7a..897fb209a8bf 100644
--- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
@@ -82,6 +82,7 @@ public:
int getSizeOf(const Value *Val) const;
int getSizeOf(const Type *Ty) const;
+ int getAllocSizeOf(const Type *Ty) const;
int getTypeAlignment(Type *Ty) const;
VectorType *getByteVectorTy(int ScLen) const;
@@ -443,8 +444,8 @@ auto AlignVectors::createAdjustedPointer(IRBuilder<> &Builder, Value *Ptr,
auto *PtrTy = cast<PointerType>(Ptr->getType());
if (!PtrTy->isOpaque()) {
Type *ElemTy = PtrTy->getElementType();
- int ElemSize = HVC.getSizeOf(ElemTy);
- if (Adjust % ElemSize == 0) {
+ int ElemSize = HVC.getAllocSizeOf(ElemTy);
+ if (Adjust % ElemSize == 0 && Adjust != 0) {
Value *Tmp0 =
Builder.CreateGEP(ElemTy, Ptr, HVC.getConstInt(Adjust / ElemSize));
return Builder.CreatePointerCast(Tmp0, ValTy->getPointerTo());
@@ -979,6 +980,10 @@ auto HexagonVectorCombine::getSizeOf(const Type *Ty) const -> int {
return DL.getTypeStoreSize(const_cast<Type *>(Ty)).getFixedValue();
}
+auto HexagonVectorCombine::getAllocSizeOf(const Type *Ty) const -> int {
+ return DL.getTypeAllocSize(const_cast<Type *>(Ty)).getFixedValue();
+}
+
auto HexagonVectorCombine::getTypeAlignment(Type *Ty) const -> int {
// The actual type may be shorter than the HVX vector, so determine
// the alignment based on subtarget info.
@@ -1326,7 +1331,7 @@ auto HexagonVectorCombine::calculatePointerDifference(Value *Ptr0,
return None;
Builder B(Gep0->getParent());
- int Scale = DL.getTypeStoreSize(Gep0->getSourceElementType());
+ int Scale = getAllocSizeOf(Gep0->getSourceElementType());
// FIXME: for now only check GEPs with a single index.
if (Gep0->getNumOperands() != 2 || Gep1->getNumOperands() != 2)
@@ -1343,7 +1348,7 @@ auto HexagonVectorCombine::calculatePointerDifference(Value *Ptr0,
KnownBits Known0 = computeKnownBits(Idx0, DL, 0, &AC, Gep0, &DT);
KnownBits Known1 = computeKnownBits(Idx1, DL, 0, &AC, Gep1, &DT);
APInt Unknown = ~(Known0.Zero | Known0.One) | ~(Known1.Zero | Known1.One);
- if (Unknown.isAllOnesValue())
+ if (Unknown.isAllOnes())
return None;
Value *MaskU = ConstantInt::get(Idx0->getType(), Unknown);
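The new getAllocSizeOf wrapper matters because GEP arithmetic scales indices by a type's alloc size (the store size rounded up to its alignment), while getSizeOf/getTypeStoreSize report the unpadded size; using the store size for pointer-difference and pointer-adjustment math goes wrong for padded types. A standalone illustration of the distinction using DataLayout directly (the struct type is just an example):

  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    DataLayout DL("");  // default layout: i32 is 4-byte aligned
    // {i32, i8} occupies 5 bytes when stored, but GEP steps in units of 8.
    auto *STy =
        StructType::get(Ctx, {Type::getInt32Ty(Ctx), Type::getInt8Ty(Ctx)});
    outs() << "store size: " << DL.getTypeStoreSize(STy).getFixedValue() << "\n";
    outs() << "alloc size: " << DL.getTypeAllocSize(STy).getFixedValue() << "\n";
    return 0;
  }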
diff --git a/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp b/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
index 310536458de9..f973862a0c9b 100644
--- a/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
@@ -386,8 +386,7 @@ void HexagonVectorLoopCarriedReuse::findValueToReuse() {
<< " can be reused\n");
SmallVector<Instruction *, 4> PNUsers;
- for (auto UI = PN->use_begin(), E = PN->use_end(); UI != E; ++UI) {
- Use &U = *UI;
+ for (Use &U : PN->uses()) {
Instruction *User = cast<Instruction>(U.getUser());
if (User->getParent() != BB)
@@ -415,9 +414,7 @@ void HexagonVectorLoopCarriedReuse::findValueToReuse() {
// rematerialized in OtherBB, we may find more such "fixup" opportunities
// in this block. So, we'll start over again.
for (Instruction *I : PNUsers) {
- for (auto UI = BEInst->use_begin(), E = BEInst->use_end(); UI != E;
- ++UI) {
- Use &U = *UI;
+ for (Use &U : BEInst->uses()) {
Instruction *BEUser = cast<Instruction>(U.getUser());
if (BEUser->getParent() != BB)
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index 627c53cadd84..5e5a26fea076 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -21,9 +21,9 @@
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/EndianStream.h"
-#include "llvm/Support/TargetRegistry.h"
#include <sstream>
@@ -686,10 +686,11 @@ public:
assert(Update && "Didn't find relaxation target");
}
- bool writeNopData(raw_ostream &OS, uint64_t Count) const override {
- static const uint32_t Nopcode = 0x7f000000, // Hard-coded NOP.
- ParseIn = 0x00004000, // In packet parse-bits.
- ParseEnd = 0x0000c000; // End of packet parse-bits.
+ bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const override {
+ static const uint32_t Nopcode = 0x7f000000, // Hard-coded NOP.
+ ParseIn = 0x00004000, // In packet parse-bits.
+ ParseEnd = 0x0000c000; // End of packet parse-bits.
while (Count % HEXAGON_INSTR_SIZE) {
LLVM_DEBUG(dbgs() << "Alignment not a multiple of the instruction size:"
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
index 24169c83bdb9..33b2e9a9e302 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
@@ -448,13 +448,12 @@ void HexagonMCCodeEmitter::EncodeSingleInstruction(const MCInst &MI,
++MCNumEmitted;
}
-LLVM_ATTRIBUTE_NORETURN
-static void raise_relocation_error(unsigned Width, unsigned Kind) {
+[[noreturn]] static void raise_relocation_error(unsigned Width, unsigned Kind) {
std::string Text;
raw_string_ostream Stream(Text);
Stream << "Unrecognized relocation combination: width=" << Width
<< " kind=" << Kind;
- report_fatal_error(Stream.str());
+ report_fatal_error(Twine(Stream.str()));
}
/// Some insns are not extended and thus have no bits. These cases require
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 32b0c610d63d..d832a756cb92 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -10,13 +10,13 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "HexagonArch.h"
#include "HexagonTargetStreamer.h"
#include "MCTargetDesc/HexagonInstPrinter.h"
#include "MCTargetDesc/HexagonMCAsmInfo.h"
#include "MCTargetDesc/HexagonMCELFStreamer.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
-#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "TargetInfo/HexagonTargetInfo.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
@@ -32,8 +32,8 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
diff --git a/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp b/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
index 48770be3e301..ef9f9fd337fa 100644
--- a/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
+++ b/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/HexagonTargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
Target &llvm::getTheHexagonTarget() {
diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
index e2642ddf722b..a994bd7e57a4 100644
--- a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
+++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -24,11 +24,11 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SMLoc.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
diff --git a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
index b6f372657d59..57343784237d 100644
--- a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
+++ b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
@@ -19,17 +19,13 @@
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
typedef MCDisassembler::DecodeStatus DecodeStatus;
-namespace llvm {
-Target &getTheLanaiTarget();
-}
-
static MCDisassembler *createLanaiDisassembler(const Target & /*T*/,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
diff --git a/llvm/lib/Target/Lanai/LanaiAluCode.h b/llvm/lib/Target/Lanai/LanaiAluCode.h
index 728332bff00b..69be05542723 100644
--- a/llvm/lib/Target/Lanai/LanaiAluCode.h
+++ b/llvm/lib/Target/Lanai/LanaiAluCode.h
@@ -70,7 +70,7 @@ inline static unsigned makePostOp(unsigned AluOp) {
}
inline static bool modifiesOp(unsigned AluOp) {
- return isPreOp(AluOp) | isPostOp(AluOp);
+ return isPreOp(AluOp) || isPostOp(AluOp);
}
inline static const char *lanaiAluCodeToString(unsigned AluOp) {
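The | to || change in modifiesOp is behavior-neutral here, since both operands are bools with no side effects, but the logical form short-circuits and avoids bitwise-vs-logical warnings. A self-contained illustration of the difference when the operands do have side effects:

  #include <cstdio>

  static bool loud(const char *Name) {
    std::printf("%s\n", Name);
    return true;
  }

  int main() {
    (void)(loud("a") || loud("b")); // short-circuits: prints only "a"
    (void)(loud("c") | loud("d"));  // bitwise on bools: prints "c" and "d"
    return 0;
  }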
diff --git a/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp b/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
index 6bac7c75853d..c0b7fd3fdd5d 100644
--- a/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
+++ b/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
@@ -11,12 +11,12 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/LanaiInstPrinter.h"
#include "LanaiAluCode.h"
#include "LanaiCondCode.h"
#include "LanaiInstrInfo.h"
#include "LanaiMCInstLower.h"
#include "LanaiTargetMachine.h"
+#include "MCTargetDesc/LanaiInstPrinter.h"
#include "TargetInfo/LanaiTargetInfo.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -32,7 +32,7 @@
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "asm-printer"
diff --git a/llvm/lib/Target/Lanai/LanaiISelDAGToDAG.cpp b/llvm/lib/Target/Lanai/LanaiISelDAGToDAG.cpp
index aadcdc43f560..45af250b1410 100644
--- a/llvm/lib/Target/Lanai/LanaiISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Lanai/LanaiISelDAGToDAG.cpp
@@ -287,14 +287,14 @@ void LanaiDAGToDAGISel::Select(SDNode *Node) {
ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
// Materialize zero constants as copies from R0. This allows the coalescer
// to propagate these into other instructions.
- if (ConstNode->isNullValue()) {
+ if (ConstNode->isZero()) {
SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
SDLoc(Node), Lanai::R0, MVT::i32);
return ReplaceNode(Node, New.getNode());
}
// Materialize all ones constants as copies from R1. This allows the
// coalescer to propagate these into other instructions.
- if (ConstNode->isAllOnesValue()) {
+ if (ConstNode->isAllOnes()) {
SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
SDLoc(Node), Lanai::R1, MVT::i32);
return ReplaceNode(Node, New.getNode());
diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
index b96e178109d0..0d9e63c112fb 100644
--- a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -486,7 +486,7 @@ SDValue LanaiTargetLowering::LowerCCCArguments(
llvm_unreachable("unhandled argument type");
}
} else {
- // Sanity check
+ // Only arguments passed on the stack should make it here.
assert(VA.isMemLoc());
// Load the argument to a virtual register
unsigned ObjSize = VA.getLocVT().getSizeInBits() / 8;
@@ -530,6 +530,15 @@ SDValue LanaiTargetLowering::LowerCCCArguments(
return Chain;
}
+bool LanaiTargetLowering::CanLowerReturn(
+ CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
+
+ return CCInfo.CheckReturn(Outs, RetCC_Lanai32);
+}
+
SDValue
LanaiTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
@@ -1167,7 +1176,7 @@ SDValue LanaiTargetLowering::LowerGlobalAddress(SDValue Op,
// If the code model is small or global variable will be placed in the small
// section, then assume address will fit in 21-bits.
- const GlobalObject *GO = GV->getBaseObject();
+ const GlobalObject *GO = GV->getAliaseeObject();
if (TLOF->isGlobalInSmallSection(GO, getTargetMachine())) {
SDValue Small = DAG.getTargetGlobalAddress(
GV, DL, getPointerTy(DAG.getDataLayout()), Offset, LanaiII::MO_NO_FLAG);
@@ -1391,8 +1400,7 @@ static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC,
// value is 0.
OtherOp = DAG.getConstant(0, dl, VT);
else
- OtherOp =
- DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, VT);
+ OtherOp = DAG.getAllOnesConstant(dl, VT);
return true;
}
}
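The last hunk above replaces DAG.getConstant(APInt::getAllOnesValue(...)) with DAG.getAllOnesConstant(dl, VT); the same migration to APInt::getAllOnes appears again in M68kISelLowering.cpp further down. Independent of the LLVM API, the value involved is simply a mask of BitWidth one-bits; a small standalone sketch with an invented helper name:

    #include <cassert>
    #include <cstdint>

    // Returns the low BitWidth bits set, i.e. the "all ones" value of that width.
    // Guard the shift so BitWidth == 64 does not shift by the full word size (UB).
    static uint64_t allOnes(unsigned BitWidth) {
      assert(BitWidth >= 1 && BitWidth <= 64 && "unsupported width");
      return BitWidth == 64 ? ~0ULL : ((1ULL << BitWidth) - 1);
    }

    int main() {
      assert(allOnes(1) == 0x1);
      assert(allOnes(8) == 0xFF);
      assert(allOnes(32) == 0xFFFFFFFFULL);
      assert(allOnes(64) == ~0ULL);
      return 0;
    }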
diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.h b/llvm/lib/Target/Lanai/LanaiISelLowering.h
index d29d69eaadb0..2f58560f4efe 100644
--- a/llvm/lib/Target/Lanai/LanaiISelLowering.h
+++ b/llvm/lib/Target/Lanai/LanaiISelLowering.h
@@ -90,6 +90,11 @@ public:
SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+
Register getRegisterByName(const char *RegName, LLT VT,
const MachineFunction &MF) const override;
std::pair<unsigned, const TargetRegisterClass *>
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
index c82142970357..21d035c7ee9c 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -19,8 +19,8 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -175,8 +175,8 @@ LanaiInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
}
bool LanaiInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
- Register &SrcReg2, int &CmpMask,
- int &CmpValue) const {
+ Register &SrcReg2, int64_t &CmpMask,
+ int64_t &CmpValue) const {
switch (MI.getOpcode()) {
default:
break;
@@ -203,7 +203,7 @@ bool LanaiInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
// * SFSUB_F_RR can be made redundant by SUB_RI if the operands are the same.
// * SFSUB_F_RI can be made redundant by SUB_I if the operands are the same.
inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
- unsigned SrcReg2, int ImmValue,
+ unsigned SrcReg2, int64_t ImmValue,
MachineInstr *OI) {
if (CmpI->getOpcode() == Lanai::SFSUB_F_RR &&
OI->getOpcode() == Lanai::SUB_R &&
@@ -281,8 +281,9 @@ inline static unsigned flagSettingOpcodeVariant(unsigned OldOpcode) {
}
bool LanaiInstrInfo::optimizeCompareInstr(
- MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int /*CmpMask*/,
- int CmpValue, const MachineRegisterInfo *MRI) const {
+ MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2,
+ int64_t /*CmpMask*/, int64_t CmpValue,
+ const MachineRegisterInfo *MRI) const {
// Get the unique definition of SrcReg.
MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
if (!MI)
@@ -418,10 +419,8 @@ bool LanaiInstrInfo::optimizeCompareInstr(
// live-out. If it is live-out, do not optimize.
if (!isSafe) {
MachineBasicBlock *MBB = CmpInstr.getParent();
- for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
- SE = MBB->succ_end();
- SI != SE; ++SI)
- if ((*SI)->isLiveIn(Lanai::SR))
+ for (const MachineBasicBlock *Succ : MBB->successors())
+ if (Succ->isLiveIn(Lanai::SR))
return false;
}
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.h b/llvm/lib/Target/Lanai/LanaiInstrInfo.h
index 44c1e629a8e6..5eef4474801d 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.h
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.h
@@ -96,14 +96,14 @@ public:
// SrcReg2 if having two register operands, and the value it compares against
// in CmpValue. Return true if the comparison instruction can be analyzed.
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
- Register &SrcReg2, int &CmpMask,
- int &CmpValue) const override;
+ Register &SrcReg2, int64_t &CmpMask,
+ int64_t &CmpValue) const override;
// See if the comparison instruction can be converted into something more
// efficient. E.g., on Lanai register-register instructions can set the flag
// register, obviating the need for a separate compare.
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
- Register SrcReg2, int CmpMask, int CmpValue,
+ Register SrcReg2, int64_t CmpMask, int64_t CmpValue,
const MachineRegisterInfo *MRI) const override;
// Analyze the given select instruction, returning true if it cannot be
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.td b/llvm/lib/Target/Lanai/LanaiInstrInfo.td
index fcf89a0b52f6..d1fd327722ef 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.td
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.td
@@ -269,7 +269,7 @@ def splsIdempotent : InstrMapping {
// -------------------------------------------------- //
// ALU instructions
// -------------------------------------------------- //
-multiclass ALUbase<bits<3> subOp, string AsmStr, SDNode OpNode,
+multiclass ALUbase<bits<3> subOp, string AsmStr,
PatLeaf LoExt, PatLeaf HiExt,
list<dag> loPattern, list<dag> hiPattern> {
// Register Immediate
@@ -286,7 +286,7 @@ multiclass ALUbase<bits<3> subOp, string AsmStr, SDNode OpNode,
multiclass ALUarith<bits<3> subOp, string AsmStr, SDNode OpNode,
PatLeaf LoExt, PatLeaf HiExt> {
- defm I_ : ALUbase<subOp, AsmStr, OpNode, LoExt, HiExt, [], []>;
+ defm I_ : ALUbase<subOp, AsmStr, LoExt, HiExt, [], []>;
// Register Register
let JJJJJ = 0 in
@@ -297,7 +297,7 @@ multiclass ALUarith<bits<3> subOp, string AsmStr, SDNode OpNode,
multiclass ALUlogic<bits<3> subOp, string AsmStr, SDNode OpNode,
PatLeaf LoExt, PatLeaf HiExt> {
- defm I_ : ALUbase<subOp, AsmStr, OpNode, LoExt, HiExt,
+ defm I_ : ALUbase<subOp, AsmStr, LoExt, HiExt,
[(set GPR:$Rd, (OpNode GPR:$Rs1, LoExt:$imm16))],
[(set GPR:$Rd, (OpNode GPR:$Rs1, HiExt:$imm16))]>;
diff --git a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
index a31f59214ec7..70b6fd2c185d 100644
--- a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
+++ b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
@@ -20,8 +20,8 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
diff --git a/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h b/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
index f1fcbe4f418a..19a3bf4455ad 100644
--- a/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
+++ b/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
@@ -52,6 +52,16 @@ public:
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ // There is no cost model for constants with a bit size of 0. Return
+ // TCC_Free here, so that constant hoisting will ignore this constant.
+ if (BitSize == 0)
+ return TTI::TCC_Free;
+ // There is no cost model yet for operations on integers wider than 64 bits.
+ if (BitSize > 64)
+ return TTI::TCC_Free;
+
if (Imm == 0)
return TTI::TCC_Free;
if (isInt<16>(Imm.getSExtValue()))
@@ -81,8 +91,7 @@ public:
}
InstructionCost getArithmeticInstrCost(
- unsigned Opcode, Type *Ty,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
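The getIntImmCost hunk above returns TCC_Free for zero-width and wider-than-64-bit immediates before looking at the value itself. A standalone model of that guard order (the enum and helper here are simplified stand-ins, not the TTI interface):

    #include <cassert>
    #include <cstdint>

    enum Cost { TCC_Free = 0, TCC_Basic = 1 };

    // Mirrors the guard order in the hunk: width checks first, then value checks.
    static Cost intImmCost(int64_t Imm, unsigned BitSize) {
      if (BitSize == 0)   // no cost model for zero-width constants
        return TCC_Free;
      if (BitSize > 64)   // no cost model for wide integers yet
        return TCC_Free;
      if (Imm == 0)       // materializing zero is free
        return TCC_Free;
      if (Imm >= INT16_MIN && Imm <= INT16_MAX) // fits a 16-bit immediate field
        return TCC_Free;
      return TCC_Basic;
    }

    int main() {
      assert(intImmCost(0, 0) == TCC_Free);
      assert(intImmCost(42, 128) == TCC_Free);
      assert(intImmCost(1234, 32) == TCC_Free);     // fits in 16 bits
      assert(intImmCost(1 << 20, 32) == TCC_Basic); // needs a wider encoding
      return 0;
    }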
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
index a17afe5e62f6..3c2a3ac69224 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
@@ -69,10 +69,12 @@ public:
return Lanai::NumTargetFixupKinds;
}
- bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const override;
};
-bool LanaiAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+bool LanaiAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const {
if ((Count % 4) != 0)
return false;
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
index e850b98de806..eb6bf8d3836c 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
@@ -22,8 +22,8 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include <cstdint>
#include <string>
@@ -97,6 +97,9 @@ public:
uint64_t &Target) const override {
if (Inst.getNumOperands() == 0)
return false;
+ if (!isConditionalBranch(Inst) && !isUnconditionalBranch(Inst) &&
+ !isCall(Inst))
+ return false;
if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType ==
MCOI::OPERAND_PCREL) {
diff --git a/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp b/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
index 2bb9f6ed1e97..5c63df670938 100644
--- a/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
+++ b/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/LanaiTargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp
index d8465f6d682b..4db879c34ad9 100644
--- a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp
+++ b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp
@@ -14,7 +14,7 @@
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
#include <sstream>
@@ -52,6 +52,7 @@ class M68kAsmParser : public MCTargetAsmParser {
bool isExpr();
OperandMatchResultTy parseImm(OperandVector &Operands);
OperandMatchResultTy parseMemOp(OperandVector &Operands);
+ OperandMatchResultTy parseRegOrMoveMask(OperandVector &Operands);
public:
M68kAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
@@ -80,6 +81,7 @@ public:
struct M68kMemOp {
enum class Kind {
Addr,
+ RegMask,
Reg,
RegIndirect,
RegPostIncrement,
@@ -90,6 +92,7 @@ struct M68kMemOp {
// These variables are used for the following forms:
// Addr: (OuterDisp)
+ // RegMask: RegMask (as register mask)
// Reg: %OuterReg
// RegIndirect: (%OuterReg)
// RegPostIncrement: (%OuterReg)+
@@ -106,6 +109,7 @@ struct M68kMemOp {
uint8_t Size : 4;
uint8_t Scale : 4;
const MCExpr *Expr;
+ uint16_t RegMask;
M68kMemOp() {}
M68kMemOp(Kind Op) : Op(Op) {}
@@ -117,14 +121,14 @@ struct M68kMemOp {
class M68kOperand : public MCParsedAsmOperand {
typedef MCParsedAsmOperand Base;
- enum class Kind {
+ enum class KindTy {
Invalid,
Token,
Imm,
MemOp,
};
- Kind Kind;
+ KindTy Kind;
SMLoc Start, End;
union {
StringRef Token;
@@ -133,8 +137,10 @@ class M68kOperand : public MCParsedAsmOperand {
M68kMemOp MemOp;
};
+ template <unsigned N> bool isAddrN() const;
+
public:
- M68kOperand(enum Kind Kind, SMLoc Start, SMLoc End)
+ M68kOperand(KindTy Kind, SMLoc Start, SMLoc End)
: Base(), Kind(Kind), Start(Start), End(End) {}
SMLoc getStartLoc() const override { return Start; }
@@ -143,12 +149,14 @@ public:
void print(raw_ostream &OS) const override;
bool isMem() const override { return false; }
- bool isMemOp() const { return Kind == Kind::MemOp; }
+ bool isMemOp() const { return Kind == KindTy::MemOp; }
static void addExpr(MCInst &Inst, const MCExpr *Expr);
// Reg
bool isReg() const override;
+ bool isAReg() const;
+ bool isDReg() const;
unsigned getReg() const override;
void addRegOperands(MCInst &Inst, unsigned N) const;
@@ -168,8 +176,15 @@ public:
static std::unique_ptr<M68kOperand> createImm(const MCExpr *Expr, SMLoc Start,
SMLoc End);
+ // MoveMask
+ bool isMoveMask() const;
+ void addMoveMaskOperands(MCInst &Inst, unsigned N) const;
+
// Addr
bool isAddr() const;
+ bool isAddr8() const { return isAddrN<8>(); }
+ bool isAddr16() const { return isAddrN<16>(); }
+ bool isAddr32() const { return isAddrN<32>(); }
void addAddrOperands(MCInst &Inst, unsigned N) const;
// ARI
@@ -210,11 +225,45 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeM68kAsmParser() {
#define GET_MATCHER_IMPLEMENTATION
#include "M68kGenAsmMatcher.inc"
+static inline unsigned getRegisterByIndex(unsigned RegisterIndex) {
+ static unsigned RegistersByIndex[] = {
+ M68k::D0, M68k::D1, M68k::D2, M68k::D3, M68k::D4, M68k::D5,
+ M68k::D6, M68k::D7, M68k::A0, M68k::A1, M68k::A2, M68k::A3,
+ M68k::A4, M68k::A5, M68k::A6, M68k::SP,
+ };
+ assert(RegisterIndex <
+ sizeof(RegistersByIndex) / sizeof(RegistersByIndex[0]));
+ return RegistersByIndex[RegisterIndex];
+}
+
+static inline unsigned getRegisterIndex(unsigned Register) {
+ if (Register >= M68k::D0 && Register <= M68k::D7)
+ return Register - M68k::D0;
+ if (Register >= M68k::A0 && Register <= M68k::A6)
+ return Register - M68k::A0 + 8;
+
+ switch (Register) {
+ case M68k::SP:
+ // SP is sadly not contiguous with the rest of the An registers
+ return 15;
+
+ case M68k::PC:
+ case M68k::CCR:
+ return 16;
+
+ default:
+ llvm_unreachable("unexpected register number");
+ }
+}
+
void M68kMemOp::print(raw_ostream &OS) const {
switch (Op) {
case Kind::Addr:
OS << OuterDisp;
break;
+ case Kind::RegMask:
+ OS << "RegMask(" << format("%04x", RegMask) << ")";
+ break;
case Kind::Reg:
OS << '%' << OuterReg;
break;
@@ -248,7 +297,7 @@ void M68kOperand::addExpr(MCInst &Inst, const MCExpr *Expr) {
// Reg
bool M68kOperand::isReg() const {
- return Kind == Kind::MemOp && MemOp.Op == M68kMemOp::Kind::Reg;
+ return Kind == KindTy::MemOp && MemOp.Op == M68kMemOp::Kind::Reg;
}
unsigned M68kOperand::getReg() const {
@@ -265,13 +314,13 @@ void M68kOperand::addRegOperands(MCInst &Inst, unsigned N) const {
std::unique_ptr<M68kOperand> M68kOperand::createMemOp(M68kMemOp MemOp,
SMLoc Start, SMLoc End) {
- auto Op = std::make_unique<M68kOperand>(Kind::MemOp, Start, End);
+ auto Op = std::make_unique<M68kOperand>(KindTy::MemOp, Start, End);
Op->MemOp = MemOp;
return Op;
}
// Token
-bool M68kOperand::isToken() const { return Kind == Kind::Token; }
+bool M68kOperand::isToken() const { return Kind == KindTy::Token; }
StringRef M68kOperand::getToken() const {
assert(isToken());
return Token;
@@ -279,15 +328,15 @@ StringRef M68kOperand::getToken() const {
std::unique_ptr<M68kOperand> M68kOperand::createToken(StringRef Token,
SMLoc Start, SMLoc End) {
- auto Op = std::make_unique<M68kOperand>(Kind::Token, Start, End);
+ auto Op = std::make_unique<M68kOperand>(KindTy::Token, Start, End);
Op->Token = Token;
return Op;
}
// Imm
-bool M68kOperand::isImm() const { return Kind == Kind::Imm; }
+bool M68kOperand::isImm() const { return Kind == KindTy::Imm; }
void M68kOperand::addImmOperands(MCInst &Inst, unsigned N) const {
- assert(isImm() && "wrong oeprand kind");
+ assert(isImm() && "wrong operand kind");
assert((N == 1) && "can only handle one register operand");
M68kOperand::addExpr(Inst, Expr);
@@ -295,15 +344,53 @@ void M68kOperand::addImmOperands(MCInst &Inst, unsigned N) const {
std::unique_ptr<M68kOperand> M68kOperand::createImm(const MCExpr *Expr,
SMLoc Start, SMLoc End) {
- auto Op = std::make_unique<M68kOperand>(Kind::Imm, Start, End);
+ auto Op = std::make_unique<M68kOperand>(KindTy::Imm, Start, End);
Op->Expr = Expr;
return Op;
}
+// MoveMask
+bool M68kOperand::isMoveMask() const {
+ if (!isMemOp())
+ return false;
+
+ if (MemOp.Op == M68kMemOp::Kind::RegMask)
+ return true;
+
+ if (MemOp.Op != M68kMemOp::Kind::Reg)
+ return false;
+
+ // Only regular address / data registers are allowed to be used
+ // in register masks.
+ return getRegisterIndex(MemOp.OuterReg) < 16;
+}
+
+void M68kOperand::addMoveMaskOperands(MCInst &Inst, unsigned N) const {
+ assert(isMoveMask() && "wrong operand kind");
+ assert((N == 1) && "can only handle one immediate operand");
+
+ uint16_t MoveMask = MemOp.RegMask;
+ if (MemOp.Op == M68kMemOp::Kind::Reg)
+ MoveMask = 1 << getRegisterIndex(MemOp.OuterReg);
+
+ Inst.addOperand(MCOperand::createImm(MoveMask));
+}
+
// Addr
bool M68kOperand::isAddr() const {
return isMemOp() && MemOp.Op == M68kMemOp::Kind::Addr;
}
+// TODO: Maybe we can also store the size of OuterDisp
+// in Size?
+template <unsigned N> bool M68kOperand::isAddrN() const {
+ if (isAddr()) {
+ int64_t Res;
+ if (MemOp.OuterDisp->evaluateAsAbsolute(Res))
+ return isInt<N>(Res);
+ return true;
+ }
+ return false;
+}
void M68kOperand::addAddrOperands(MCInst &Inst, unsigned N) const {
M68kOperand::addExpr(Inst, MemOp.OuterDisp);
}
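isAddrN<N>() above accepts an absolute address whose displacement, when it folds to a constant, fits in N signed bits (the llvm::isInt<N> check). A standalone version of that range test, with an invented helper name:

    #include <cassert>
    #include <cstdint>

    // True when Value is representable as an N-bit two's-complement integer,
    // i.e. -2^(N-1) <= Value <= 2^(N-1) - 1. Matches the semantics of llvm::isInt<N>.
    template <unsigned N> static bool fitsInSignedBits(int64_t Value) {
      static_assert(N >= 1 && N < 64, "width out of range for this sketch");
      const int64_t Lo = -(INT64_C(1) << (N - 1));
      const int64_t Hi = (INT64_C(1) << (N - 1)) - 1;
      return Value >= Lo && Value <= Hi;
    }

    int main() {
      assert(fitsInSignedBits<8>(127) && !fitsInSignedBits<8>(128));
      assert(fitsInSignedBits<16>(-32768) && !fitsInSignedBits<16>(40000));
      assert(fitsInSignedBits<32>(INT64_C(2147483647)));
      assert(!fitsInSignedBits<32>(INT64_C(2147483648)));
      return 0;
    }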
@@ -412,6 +499,18 @@ static inline bool checkRegisterClass(unsigned RegNo, bool Data, bool Address,
}
}
+bool M68kOperand::isAReg() const {
+ return isReg() && checkRegisterClass(getReg(),
+ /*Data=*/false,
+ /*Address=*/true, /*SP=*/true);
+}
+
+bool M68kOperand::isDReg() const {
+ return isReg() && checkRegisterClass(getReg(),
+ /*Data=*/true,
+ /*Address=*/false, /*SP=*/false);
+}
+
unsigned M68kAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
unsigned Kind) {
M68kOperand &Operand = (M68kOperand &)Op;
@@ -487,11 +586,6 @@ bool M68kAsmParser::parseRegisterName(unsigned &RegNo, SMLoc Loc,
// Parse simple general-purpose registers.
if (RegisterNameLower.size() == 2) {
- static unsigned RegistersByIndex[] = {
- M68k::D0, M68k::D1, M68k::D2, M68k::D3, M68k::D4, M68k::D5,
- M68k::D6, M68k::D7, M68k::A0, M68k::A1, M68k::A2, M68k::A3,
- M68k::A4, M68k::A5, M68k::A6, M68k::SP,
- };
switch (RegisterNameLower[0]) {
case 'd':
@@ -500,7 +594,7 @@ bool M68kAsmParser::parseRegisterName(unsigned &RegNo, SMLoc Loc,
unsigned IndexOffset = (RegisterNameLower[0] == 'a') ? 8 : 0;
unsigned RegIndex = (unsigned)(RegisterNameLower[1] - '0');
if (RegIndex < 8) {
- RegNo = RegistersByIndex[IndexOffset + RegIndex];
+ RegNo = getRegisterByIndex(IndexOffset + RegIndex);
return true;
}
}
@@ -616,16 +710,9 @@ OperandMatchResultTy M68kAsmParser::parseMemOp(OperandVector &Operands) {
bool IsPD = false;
M68kMemOp MemOp;
- // Check for a plain register.
- auto Result = parseRegister(MemOp.OuterReg);
- if (Result == MatchOperand_Success) {
- MemOp.Op = M68kMemOp::Kind::Reg;
- Operands.push_back(
- M68kOperand::createMemOp(MemOp, Start, getLexer().getLoc()));
- return MatchOperand_Success;
- }
-
- if (Result == MatchOperand_ParseFail) {
+ // Check for a plain register or register mask.
+ auto Result = parseRegOrMoveMask(Operands);
+ if (Result != llvm::MatchOperand_NoMatch) {
return Result;
}
@@ -743,6 +830,87 @@ OperandMatchResultTy M68kAsmParser::parseMemOp(OperandVector &Operands) {
return MatchOperand_Success;
}
+OperandMatchResultTy
+M68kAsmParser::parseRegOrMoveMask(OperandVector &Operands) {
+ SMLoc Start = getLexer().getLoc();
+ M68kMemOp MemOp(M68kMemOp::Kind::RegMask);
+ MemOp.RegMask = 0;
+
+ for (;;) {
+ bool IsFirstRegister =
+ (MemOp.Op == M68kMemOp::Kind::RegMask) && (MemOp.RegMask == 0);
+
+ unsigned FirstRegister;
+ auto Result = parseRegister(FirstRegister);
+ if (IsFirstRegister && (Result == llvm::MatchOperand_NoMatch)) {
+ return MatchOperand_NoMatch;
+ }
+ if (Result != llvm::MatchOperand_Success) {
+ Error(getLexer().getLoc(), "expected start register");
+ return MatchOperand_ParseFail;
+ }
+
+ unsigned LastRegister = FirstRegister;
+ if (getLexer().is(AsmToken::Minus)) {
+ getLexer().Lex();
+ Result = parseRegister(LastRegister);
+ if (Result != llvm::MatchOperand_Success) {
+ Error(getLexer().getLoc(), "expected end register");
+ return MatchOperand_ParseFail;
+ }
+ }
+
+ unsigned FirstRegisterIndex = getRegisterIndex(FirstRegister);
+ unsigned LastRegisterIndex = getRegisterIndex(LastRegister);
+
+ uint16_t NumNewBits = LastRegisterIndex - FirstRegisterIndex + 1;
+ uint16_t NewMaskBits = ((1 << NumNewBits) - 1) << FirstRegisterIndex;
+
+ if (IsFirstRegister && (FirstRegister == LastRegister)) {
+ // First register range is a single register, simplify to just Reg
+ // so that it matches more operands.
+ MemOp.Op = M68kMemOp::Kind::Reg;
+ MemOp.OuterReg = FirstRegister;
+ } else {
+ if (MemOp.Op == M68kMemOp::Kind::Reg) {
+ // This is the second register being specified - expand the Reg operand
+ // into a mask first.
+ MemOp.Op = M68kMemOp::Kind::RegMask;
+ MemOp.RegMask = 1 << getRegisterIndex(MemOp.OuterReg);
+
+ if (MemOp.RegMask == 0) {
+ Error(getLexer().getLoc(),
+ "special registers cannot be used in register masks");
+ return MatchOperand_ParseFail;
+ }
+ }
+
+ if ((FirstRegisterIndex >= 16) || (LastRegisterIndex >= 16)) {
+ Error(getLexer().getLoc(),
+ "special registers cannot be used in register masks");
+ return MatchOperand_ParseFail;
+ }
+
+ if (NewMaskBits & MemOp.RegMask) {
+ Error(getLexer().getLoc(), "conflicting masked registers");
+ return MatchOperand_ParseFail;
+ }
+
+ MemOp.RegMask |= NewMaskBits;
+ }
+
+ if (getLexer().isNot(AsmToken::Slash)) {
+ break;
+ }
+
+ getLexer().Lex();
+ }
+
+ Operands.push_back(
+ M68kOperand::createMemOp(MemOp, Start, getLexer().getLoc()));
+ return MatchOperand_Success;
+}
+
void M68kAsmParser::eatComma() {
if (Parser.getTok().is(AsmToken::Comma)) {
Parser.Lex();
@@ -842,19 +1010,19 @@ bool M68kAsmParser::MatchAndEmitInstruction(SMLoc Loc, unsigned &Opcode,
void M68kOperand::print(raw_ostream &OS) const {
switch (Kind) {
- case Kind::Invalid:
+ case KindTy::Invalid:
OS << "invalid";
break;
- case Kind::Token:
+ case KindTy::Token:
OS << "token '" << Token << "'";
break;
- case Kind::Imm:
+ case KindTy::Imm:
OS << "immediate " << Imm;
break;
- case Kind::MemOp:
+ case KindTy::MemOp:
MemOp.print(OS);
break;
}
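parseRegOrMoveMask() above builds the 16-bit MOVEM mask by mapping D0-D7 to bits 0-7 and A0-A6/SP to bits 8-15, expanding a range first-last into ((1 << n) - 1) << firstIndex and rejecting overlaps. A standalone sketch of just that mask arithmetic, with registers reduced to plain indices:

    #include <cassert>
    #include <cstdint>

    // Add the registers with indices [First, Last] (0-15) to a MOVEM-style mask.
    // Returns false on an overlap with bits already present, mirroring the
    // "conflicting masked registers" diagnostic in the parser.
    static bool addRegisterRange(uint16_t &Mask, unsigned First, unsigned Last) {
      assert(First <= Last && Last < 16 && "only D0-D7/A0-A6/SP are maskable");
      uint16_t NumNewBits = Last - First + 1;
      uint16_t NewMaskBits = ((1u << NumNewBits) - 1) << First;
      if (NewMaskBits & Mask)
        return false; // conflicting masked registers
      Mask |= NewMaskBits;
      return true;
    }

    int main() {
      uint16_t Mask = 0;
      assert(addRegisterRange(Mask, 0, 3));   // d0-d3 -> bits 0..3
      assert(addRegisterRange(Mask, 10, 10)); // a2    -> bit 10
      assert(Mask == 0x040F);
      assert(!addRegisterRange(Mask, 3, 5));  // d3 already present
      return 0;
    }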
diff --git a/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp b/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp
index a8453c838493..a08ffa787095 100644
--- a/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp
+++ b/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp
@@ -21,7 +21,7 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
@@ -451,7 +451,8 @@ void M68kDisassembler::decodeImm(MCInst &Instr, unsigned Bead,
llvm_unreachable("invalid imm");
}
- Scratch = (Scratch << NumToRead) | Reader.readBits(NumToRead);
+ Scratch = (NumToRead < 32) ? (Scratch << NumToRead) : 0;
+ Scratch |= Reader.readBits(NumToRead);
}
DecodeStatus M68kDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
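The decodeImm fix above avoids shifting the accumulator by its full width (undefined behaviour in C++) when NumToRead is 32; the accumulated value is cleared instead of shifted. A minimal standalone illustration of the guarded accumulate step, with the bead reader replaced by a plain argument:

    #include <cassert>
    #include <cstdint>

    // Append NumToRead freshly read bits to the low end of Scratch.
    // Shifting a 32-bit value by 32 is undefined, so the shift is guarded
    // exactly as in the disassembler hunk. ReadBits is assumed to hold only
    // NumToRead significant bits, as the real bit reader guarantees.
    static uint32_t appendBits(uint32_t Scratch, unsigned NumToRead,
                               uint32_t ReadBits) {
      assert(NumToRead <= 32 && "cannot read more than the accumulator width");
      Scratch = (NumToRead < 32) ? (Scratch << NumToRead) : 0;
      Scratch |= ReadBits;
      return Scratch;
    }

    int main() {
      assert(appendBits(0x1, 8, 0xAB) == 0x1AB);
      // A full-width read replaces the accumulator rather than invoking UB.
      assert(appendBits(0xDEADBEEF, 32, 0x12345678) == 0x12345678);
      return 0;
    }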
diff --git a/llvm/lib/Target/M68k/GlSel/M68kCallLowering.cpp b/llvm/lib/Target/M68k/GISel/M68kCallLowering.cpp
index c5931cbfe04f..9cd959012e6f 100644
--- a/llvm/lib/Target/M68k/GlSel/M68kCallLowering.cpp
+++ b/llvm/lib/Target/M68k/GISel/M68kCallLowering.cpp
@@ -33,7 +33,7 @@ struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler {
: OutgoingValueHandler(MIRBuilder, MRI), MIB(MIB) {}
void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign &VA) override {
+ CCValAssign VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
Register ExtReg = extendRegister(ValVReg, VA);
MIRBuilder.buildCopy(PhysReg, ExtReg);
@@ -110,7 +110,7 @@ bool M68kCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
void M68kIncomingValueHandler::assignValueToReg(Register ValVReg,
Register PhysReg,
- CCValAssign &VA) {
+ CCValAssign VA) {
MIRBuilder.getMRI()->addLiveIn(PhysReg);
MIRBuilder.getMBB().addLiveIn(PhysReg);
IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
diff --git a/llvm/lib/Target/M68k/GlSel/M68kCallLowering.h b/llvm/lib/Target/M68k/GISel/M68kCallLowering.h
index 9e0d462db677..47cdefdba100 100644
--- a/llvm/lib/Target/M68k/GlSel/M68kCallLowering.h
+++ b/llvm/lib/Target/M68k/GISel/M68kCallLowering.h
@@ -52,7 +52,7 @@ struct M68kIncomingValueHandler : public CallLowering::IncomingValueHandler {
private:
void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign &VA) override;
+ CCValAssign VA) override;
void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
MachinePointerInfo &MPO, CCValAssign &VA) override;
diff --git a/llvm/lib/Target/M68k/GlSel/M68kInstructionSelector.cpp b/llvm/lib/Target/M68k/GISel/M68kInstructionSelector.cpp
index 9ac4ab9a5ba1..9ac4ab9a5ba1 100644
--- a/llvm/lib/Target/M68k/GlSel/M68kInstructionSelector.cpp
+++ b/llvm/lib/Target/M68k/GISel/M68kInstructionSelector.cpp
diff --git a/llvm/lib/Target/M68k/GlSel/M68kLegalizerInfo.cpp b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp
index bcbe62816beb..bcbe62816beb 100644
--- a/llvm/lib/Target/M68k/GlSel/M68kLegalizerInfo.cpp
+++ b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp
diff --git a/llvm/lib/Target/M68k/GlSel/M68kLegalizerInfo.h b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h
index 205aa81aedcc..205aa81aedcc 100644
--- a/llvm/lib/Target/M68k/GlSel/M68kLegalizerInfo.h
+++ b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h
diff --git a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp
new file mode 100644
index 000000000000..5c0f5dae8e37
--- /dev/null
+++ b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp
@@ -0,0 +1,105 @@
+//===-- M68kRegisterBankInfo.cpp -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for M68k.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "M68kRegisterBankInfo.h"
+#include "M68kInstrInfo.h" // For the register classes
+#include "M68kSubtarget.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+
+#define GET_TARGET_REGBANK_IMPL
+#include "M68kGenRegisterBank.inc"
+
+using namespace llvm;
+
+// FIXME: TableGen this.
+// If it grows too much and TableGen still isn't ready to do the job, extract it
+// into an M68kGenRegisterBankInfo.def (similar to AArch64).
+namespace llvm {
+namespace M68k {
+enum PartialMappingIdx {
+ PMI_GPR,
+ PMI_Min = PMI_GPR,
+};
+
+RegisterBankInfo::PartialMapping PartMappings[]{
+ // GPR Partial Mapping
+ {0, 32, GPRRegBank},
+};
+
+enum ValueMappingIdx {
+ InvalidIdx = 0,
+ GPR3OpsIdx = 1,
+};
+
+RegisterBankInfo::ValueMapping ValueMappings[] = {
+ // invalid
+ {nullptr, 0},
+ // 3 operands in GPRs
+ {&PartMappings[PMI_GPR - PMI_Min], 1},
+ {&PartMappings[PMI_GPR - PMI_Min], 1},
+ {&PartMappings[PMI_GPR - PMI_Min], 1},
+
+};
+} // end namespace M68k
+} // end namespace llvm
+
+M68kRegisterBankInfo::M68kRegisterBankInfo(const TargetRegisterInfo &TRI)
+ : M68kGenRegisterBankInfo() {}
+
+const RegisterBank &
+M68kRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
+ LLT) const {
+ return getRegBank(M68k::GPRRegBankID);
+}
+
+const RegisterBankInfo::InstructionMapping &
+M68kRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
+ auto Opc = MI.getOpcode();
+
+ if (!isPreISelGenericOpcode(Opc)) {
+ const InstructionMapping &Mapping = getInstrMappingImpl(MI);
+ if (Mapping.isValid())
+ return Mapping;
+ }
+
+ using namespace TargetOpcode;
+
+ unsigned NumOperands = MI.getNumOperands();
+ const ValueMapping *OperandsMapping = &M68k::ValueMappings[M68k::GPR3OpsIdx];
+
+ switch (Opc) {
+ case G_ADD:
+ case G_SUB:
+ case G_MUL:
+ case G_SDIV:
+ case G_UDIV:
+ case G_LOAD:
+ case G_STORE: {
+ OperandsMapping = &M68k::ValueMappings[M68k::GPR3OpsIdx];
+ break;
+ }
+
+ case G_CONSTANT:
+ case G_FRAME_INDEX:
+ OperandsMapping =
+ getOperandsMapping({&M68k::ValueMappings[M68k::GPR3OpsIdx], nullptr});
+ break;
+ default:
+ return getInvalidInstructionMapping();
+ }
+
+ return getInstructionMapping(DefaultMappingID, /*Cost=*/1, OperandsMapping,
+ NumOperands);
+}
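The new M68kRegisterBankInfo.cpp describes every generic operand with the same table entry: a single partial mapping covering bits [0, 32) of the GPR bank, referenced three times for three-operand instructions. A stripped-down standalone model of that table layout (the structs are simplified stand-ins for the RegisterBankInfo types, not the LLVM API):

    #include <cassert>

    struct PartialMapping {
      unsigned StartIdx; // first bit of the value covered by this mapping
      unsigned Length;   // number of bits covered
      unsigned BankID;   // which register bank holds those bits
    };

    struct ValueMapping {
      const PartialMapping *Parts;
      unsigned NumParts;
    };

    enum { GPRBankID = 0 };
    enum PartialMappingIdx { PMI_GPR, PMI_Min = PMI_GPR };
    enum ValueMappingIdx { InvalidIdx = 0, GPR3OpsIdx = 1 };

    static const PartialMapping PartMappings[] = {
        {0, 32, GPRBankID}, // a 32-bit value living entirely in the GPR bank
    };

    static const ValueMapping ValueMappings[] = {
        {nullptr, 0},                          // invalid
        {&PartMappings[PMI_GPR - PMI_Min], 1}, // operand 0 in GPRs
        {&PartMappings[PMI_GPR - PMI_Min], 1}, // operand 1 in GPRs
        {&PartMappings[PMI_GPR - PMI_Min], 1}, // operand 2 in GPRs
    };

    int main() {
      // Three consecutive entries starting at GPR3OpsIdx describe a
      // three-operand instruction whose operands all map to the GPR bank.
      for (unsigned Op = 0; Op < 3; ++Op) {
        const ValueMapping &VM = ValueMappings[GPR3OpsIdx + Op];
        assert(VM.NumParts == 1 && VM.Parts[0].BankID == GPRBankID);
        assert(VM.Parts[0].StartIdx == 0 && VM.Parts[0].Length == 32);
      }
      return 0;
    }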
diff --git a/llvm/lib/Target/M68k/GlSel/M68kRegisterBankInfo.h b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h
index 9b97cc4a6dd4..853c75df2bb3 100644
--- a/llvm/lib/Target/M68k/GlSel/M68kRegisterBankInfo.h
+++ b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h
@@ -34,6 +34,12 @@ protected:
class M68kRegisterBankInfo final : public M68kGenRegisterBankInfo {
public:
M68kRegisterBankInfo(const TargetRegisterInfo &TRI);
+
+ const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
+ LLT) const override;
+
+ const InstructionMapping &
+ getInstrMapping(const MachineInstr &MI) const override;
};
} // end namespace llvm
#endif
diff --git a/llvm/lib/Target/M68k/GlSel/M68kRegisterBanks.td b/llvm/lib/Target/M68k/GISel/M68kRegisterBanks.td
index 2d1e74f78480..942677a60e6c 100644
--- a/llvm/lib/Target/M68k/GlSel/M68kRegisterBanks.td
+++ b/llvm/lib/Target/M68k/GISel/M68kRegisterBanks.td
@@ -12,4 +12,4 @@
//===----------------------------------------------------------------------===//
/// General Purpose Registers. Here we define a register bank with name AnyGPR
-def GPRRegBank : RegisterBank<"AnyGPR", [DR8]>;
+def GPRRegBank : RegisterBank<"AnyGPR", [XR32]>;
diff --git a/llvm/lib/Target/M68k/GlSel/M68kRegisterBankInfo.cpp b/llvm/lib/Target/M68k/GlSel/M68kRegisterBankInfo.cpp
deleted file mode 100644
index d12478624655..000000000000
--- a/llvm/lib/Target/M68k/GlSel/M68kRegisterBankInfo.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-//===-- M68kRegisterBankInfo.cpp -------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This file implements the targeting of the RegisterBankInfo class for M68k.
-/// \todo This should be generated by TableGen.
-//===----------------------------------------------------------------------===//
-
-#include "M68kRegisterBankInfo.h"
-#include "MCTargetDesc/M68kMCTargetDesc.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-
-#define GET_TARGET_REGBANK_IMPL
-#include "M68kGenRegisterBank.inc"
-#undef GET_TARGET_REGBANK_IMPL
-
-using namespace llvm;
-
-M68kRegisterBankInfo::M68kRegisterBankInfo(const TargetRegisterInfo &TRI)
- : M68kGenRegisterBankInfo() {}
diff --git a/llvm/lib/Target/M68k/M68k.td b/llvm/lib/Target/M68k/M68k.td
index 669eb32f46f1..fde491e1b6d5 100644
--- a/llvm/lib/Target/M68k/M68k.td
+++ b/llvm/lib/Target/M68k/M68k.td
@@ -78,7 +78,7 @@ def : Proc<"M68060", [ FeatureISA60 ]>;
//===----------------------------------------------------------------------===//
include "M68kRegisterInfo.td"
-include "GlSel/M68kRegisterBanks.td"
+include "GISel/M68kRegisterBanks.td"
//===----------------------------------------------------------------------===//
// Instruction Descriptions
diff --git a/llvm/lib/Target/M68k/M68kAsmPrinter.cpp b/llvm/lib/Target/M68k/M68kAsmPrinter.cpp
index a6fc58b5a277..08b7153632b4 100644
--- a/llvm/lib/Target/M68k/M68kAsmPrinter.cpp
+++ b/llvm/lib/Target/M68k/M68kAsmPrinter.cpp
@@ -21,7 +21,7 @@
#include "MCTargetDesc/M68kInstPrinter.h"
#include "TargetInfo/M68kTargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/M68k/M68kCallingConv.h b/llvm/lib/Target/M68k/M68kCallingConv.h
index 18f72c95cedb..20ffa993897f 100644
--- a/llvm/lib/Target/M68k/M68kCallingConv.h
+++ b/llvm/lib/Target/M68k/M68kCallingConv.h
@@ -24,14 +24,13 @@
namespace llvm {
/// Custom state to propagate llvm type info to register CC assigner
-class M68kCCState : public CCState {
-public:
- const llvm::Function &F;
+struct M68kCCState : public CCState {
+ ArrayRef<Type *> ArgTypeList;
- M68kCCState(const llvm::Function &F, CallingConv::ID CC, bool IsVarArg,
+ M68kCCState(ArrayRef<Type *> ArgTypes, CallingConv::ID CC, bool IsVarArg,
MachineFunction &MF, SmallVectorImpl<CCValAssign> &Locs,
LLVMContext &C)
- : CCState(CC, IsVarArg, MF, Locs, C), F(F) {}
+ : CCState(CC, IsVarArg, MF, Locs, C), ArgTypeList(ArgTypes) {}
};
/// NOTE this function is used to select registers for formal arguments and call
@@ -39,7 +38,7 @@ public:
inline bool CC_M68k_Any_AssignToReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags, CCState &State) {
- M68kCCState CCInfo = static_cast<M68kCCState &>(State);
+ const M68kCCState &CCInfo = static_cast<M68kCCState &>(State);
static const MCPhysReg DataRegList[] = {M68k::D0, M68k::D1, M68k::A0,
M68k::A1};
@@ -52,14 +51,15 @@ inline bool CC_M68k_Any_AssignToReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
M68k::D1,
};
- auto I = CCInfo.F.arg_begin();
+ const auto &ArgTypes = CCInfo.ArgTypeList;
+ auto I = ArgTypes.begin(), End = ArgTypes.end();
int No = ValNo;
- while (No > 0) {
- No -= I->getType()->isIntegerTy(64) ? 2 : 1;
- I++;
+ while (No > 0 && I != End) {
+ No -= (*I)->isIntegerTy(64) ? 2 : 1;
+ ++I;
}
- bool IsPtr = I != CCInfo.F.arg_end() && I->getType()->isPointerTy();
+ bool IsPtr = I != End && (*I)->isPointerTy();
unsigned Reg =
IsPtr ? State.AllocateReg(AddrRegList) : State.AllocateReg(DataRegList);
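CC_M68k_Any_AssignToReg above recovers which argument a value number belongs to by walking the argument type list and letting 64-bit integers consume two value slots. A standalone model of that walk, with types reduced to a width flag and a pointer flag:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct ArgType {
      bool IsInt64;   // occupies two value slots, like i64 in the hunk
      bool IsPointer; // selects the address-register list in the hunk
    };

    // Returns the index of the argument that value number ValNo belongs to,
    // or ArgTypes.size() if ValNo runs past the argument list.
    static size_t argIndexForValNo(const std::vector<ArgType> &ArgTypes,
                                   int ValNo) {
      size_t I = 0;
      int No = ValNo;
      while (No > 0 && I != ArgTypes.size()) {
        No -= ArgTypes[I].IsInt64 ? 2 : 1;
        ++I;
      }
      return I;
    }

    int main() {
      // Arguments: i32, i64, ptr  -> value slots: [0], [1,2], [3]
      std::vector<ArgType> Args = {{false, false}, {true, false}, {false, true}};
      assert(argIndexForValNo(Args, 0) == 0); // i32
      assert(argIndexForValNo(Args, 1) == 1); // first half of the i64
      assert(argIndexForValNo(Args, 3) == 2); // the pointer argument
      assert(argIndexForValNo(Args, 4) == 3); // past the end
      return 0;
    }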
diff --git a/llvm/lib/Target/M68k/M68kFrameLowering.cpp b/llvm/lib/Target/M68k/M68kFrameLowering.cpp
index 26262b9b573d..66ea6ae38f43 100644
--- a/llvm/lib/Target/M68k/M68kFrameLowering.cpp
+++ b/llvm/lib/Target/M68k/M68kFrameLowering.cpp
@@ -357,7 +357,7 @@ void M68kFrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
if (Reg) {
unsigned Opc = M68k::MOV32ri;
BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg).addImm(Offset);
- Opc = IsSub ? M68k::SUB32rr : M68k::ADD32rr;
+ Opc = IsSub ? M68k::SUB32ar : M68k::ADD32ar;
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
.addReg(StackPtr)
.addReg(Reg);
@@ -400,13 +400,13 @@ int M68kFrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
return Offset;
}
- if (Opc == M68k::ADD32ri && PI->getOperand(0).getReg() == StackPtr) {
+ if (Opc == M68k::ADD32ai && PI->getOperand(0).getReg() == StackPtr) {
assert(PI->getOperand(1).getReg() == StackPtr);
Offset += PI->getOperand(2).getImm();
MBB.erase(PI);
if (!MergeWithPrevious)
MBBI = NI;
- } else if (Opc == M68k::SUB32ri && PI->getOperand(0).getReg() == StackPtr) {
+ } else if (Opc == M68k::SUB32ai && PI->getOperand(0).getReg() == StackPtr) {
assert(PI->getOperand(1).getReg() == StackPtr);
Offset -= PI->getOperand(2).getImm();
MBB.erase(PI);
@@ -426,7 +426,7 @@ MachineInstrBuilder M68kFrameLowering::BuildStackAdjustment(
bool IsSub = Offset < 0;
uint64_t AbsOffset = IsSub ? -Offset : Offset;
- unsigned Opc = IsSub ? M68k::SUB32ri : M68k::ADD32ri;
+ unsigned Opc = IsSub ? M68k::SUB32ai : M68k::ADD32ai;
MachineInstrBuilder MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
.addReg(StackPtr)
diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp
index 3e7cee9889d7..79b395f8f984 100644
--- a/llvm/lib/Target/M68k/M68kISelLowering.cpp
+++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp
@@ -519,9 +519,10 @@ SDValue M68kTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- // It is empty for LibCall
- const Function *CalleeFunc = CLI.CB ? CLI.CB->getCalledFunction() : nullptr;
- M68kCCState CCInfo(*CalleeFunc, CallConv, IsVarArg, MF, ArgLocs,
+ SmallVector<Type *, 4> ArgTypes;
+ for (const auto &Arg : CLI.getArgs())
+ ArgTypes.emplace_back(Arg.Ty);
+ M68kCCState CCInfo(ArgTypes, CallConv, IsVarArg, MF, ArgLocs,
*DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CC_M68k);
@@ -876,8 +877,10 @@ SDValue M68kTargetLowering::LowerFormalArguments(
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- M68kCCState CCInfo(MF.getFunction(), CCID, IsVarArg, MF, ArgLocs,
- *DAG.getContext());
+ SmallVector<Type *, 4> ArgTypes;
+ for (const Argument &Arg : MF.getFunction().args())
+ ArgTypes.emplace_back(Arg.getType());
+ M68kCCState CCInfo(ArgTypes, CCID, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CC_M68k);
@@ -1975,7 +1978,7 @@ SDValue M68kTargetLowering::LowerSETCCCARRY(SDValue Op,
M68k::CondCode CC = TranslateIntegerM68kCC(cast<CondCodeSDNode>(Cond)->get());
EVT CarryVT = Carry.getValueType();
- APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
+ APInt NegOne = APInt::getAllOnes(CarryVT.getScalarSizeInBits());
Carry = DAG.getNode(M68kISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry,
DAG.getConstant(NegOne, DL, CarryVT));
@@ -2199,7 +2202,7 @@ SDValue M68kTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Op2.getOpcode() == ISD::TRUNCATE) {
SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
if (T1.getValueType() == T2.getValueType() &&
- // Blacklist CopyFromReg to avoid partial register stalls.
+ // Block CopyFromReg so partial register stalls are avoided.
T1.getOpcode() != ISD::CopyFromReg &&
T2.getOpcode() != ISD::CopyFromReg) {
SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
diff --git a/llvm/lib/Target/M68k/M68kInstrArithmetic.td b/llvm/lib/Target/M68k/M68kInstrArithmetic.td
index f65ad5729eb4..b2c05365d30b 100644
--- a/llvm/lib/Target/M68k/M68kInstrArithmetic.td
+++ b/llvm/lib/Target/M68k/M68kInstrArithmetic.td
@@ -88,14 +88,15 @@ let Defs = [CCR] in {
let Constraints = "$src = $dst" in {
// $reg, $ccr <- $reg op $reg
-class MxBiArOp_RFRR_xEA<string MN, SDNode NODE, MxType TYPE, bits<4> CMD, MxBead REG>
- : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src, TYPE.ROp:$opd),
- MN#"."#TYPE.Prefix#"\t$opd, $dst",
- [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, TYPE.VT:$opd))],
+class MxBiArOp_RFRR_xEA<string MN, SDNode NODE, MxType DST_TYPE, MxType SRC_TYPE,
+ bits<4> CMD, MxBead REG>
+ : MxInst<(outs DST_TYPE.ROp:$dst), (ins DST_TYPE.ROp:$src, SRC_TYPE.ROp:$opd),
+ MN#"."#DST_TYPE.Prefix#"\t$opd, $dst",
+ [(set DST_TYPE.VT:$dst, CCR, (NODE DST_TYPE.VT:$src, SRC_TYPE.VT:$opd))],
MxArithEncoding<MxBead4Bits<CMD>,
- !cast<MxEncOpMode>("MxOpMode"#TYPE.Size#TYPE.RLet#"EA"),
+ !cast<MxEncOpMode>("MxOpMode"#DST_TYPE.Size#DST_TYPE.RLet#"EA"),
REG,
- !cast<MxEncEA>("MxEncEA"#TYPE.RLet#"_2"),
+ !cast<MxEncEA>("MxEncEA"#SRC_TYPE.RLet#"_2"),
MxExtEmpty>>;
/// This Op is similar to the one above except it uses reversed opmode, some
@@ -260,11 +261,19 @@ multiclass MxBiArOp_DF<string MN, SDNode NODE, bit isComm,
def NAME#"32ji" : MxBiArOp_FMI<MN, NODE, MxType32, MxType32.JOp, MxType32.JPat,
CMDI, MxEncEAj_0, MxExtEmpty>;
+ def NAME#"16dr" : MxBiArOp_RFRR_xEA<MN, NODE, MxType16d, MxType16r,
+ CMD, MxBeadDReg<0>>;
+ def NAME#"32dr" : MxBiArOp_RFRR_xEA<MN, NODE, MxType32d, MxType32r,
+ CMD, MxBeadDReg<0>>;
+
let isCommutable = isComm in {
- def NAME#"8dd" : MxBiArOp_RFRR_xEA<MN, NODE, MxType8d, CMD, MxBeadDReg<0>>;
- def NAME#"16dd" : MxBiArOp_RFRR_xEA<MN, NODE, MxType16d, CMD, MxBeadDReg<0>>;
- def NAME#"32dd" : MxBiArOp_RFRR_xEA<MN, NODE, MxType32d, CMD, MxBeadDReg<0>>;
+ def NAME#"8dd" : MxBiArOp_RFRR_xEA<MN, NODE, MxType8d, MxType8d,
+ CMD, MxBeadDReg<0>>;
+ def NAME#"16dd" : MxBiArOp_RFRR_xEA<MN, NODE, MxType16d, MxType16d,
+ CMD, MxBeadDReg<0>>;
+ def NAME#"32dd" : MxBiArOp_RFRR_xEA<MN, NODE, MxType32d, MxType32d,
+ CMD, MxBeadDReg<0>>;
} // isComm
@@ -278,29 +287,29 @@ let Pattern = [(null_frag)] in
multiclass MxBiArOp_AF<string MN, SDNode NODE, bit isComm,
bits<4> CMD, bits<4> CMDI> {
- def NAME#"32rk" : MxBiArOp_RFRM<MN, NODE, MxType32r, MxType32.KOp, MxType32.KPat,
+ def NAME#"32ak" : MxBiArOp_RFRM<MN, NODE, MxType32a, MxType32.KOp, MxType32.KPat,
CMD, MxEncEAk, MxExtBrief_2>;
- def NAME#"32rq" : MxBiArOp_RFRM<MN, NODE, MxType32r, MxType32.QOp, MxType32.QPat,
+ def NAME#"32aq" : MxBiArOp_RFRM<MN, NODE, MxType32a, MxType32.QOp, MxType32.QPat,
CMD, MxEncEAq, MxExtI16_2>;
- def NAME#"32rf" : MxBiArOp_RFRM<MN, NODE, MxType32r, MxType32.FOp, MxType32.FPat,
+ def NAME#"32af" : MxBiArOp_RFRM<MN, NODE, MxType32a, MxType32.FOp, MxType32.FPat,
CMD, MxEncEAf_2, MxExtBrief_2>;
- def NAME#"32rp" : MxBiArOp_RFRM<MN, NODE, MxType32r, MxType32.POp, MxType32.PPat,
+ def NAME#"32ap" : MxBiArOp_RFRM<MN, NODE, MxType32a, MxType32.POp, MxType32.PPat,
CMD, MxEncEAp_2, MxExtI16_2>;
- def NAME#"32rj" : MxBiArOp_RFRM<MN, NODE, MxType32r, MxType32.JOp, MxType32.JPat,
+ def NAME#"32aj" : MxBiArOp_RFRM<MN, NODE, MxType32a, MxType32.JOp, MxType32.JPat,
CMD, MxEncEAj_2, MxExtEmpty>;
- def NAME#"32ri" : MxBiArOp_RFRI_xEA<MN, NODE, MxType32r, CMD>;
+ def NAME#"32ai" : MxBiArOp_RFRI_xEA<MN, NODE, MxType32a, CMD>;
- let isCommutable = isComm in
- def NAME#"32rr" : MxBiArOp_RFRR_xEA<MN, NODE, MxType32r, CMD, MxBeadReg<0>>;
+ def NAME#"32ar" : MxBiArOp_RFRR_xEA<MN, NODE, MxType32a, MxType32r,
+ CMD, MxBeadReg<0>>;
} // MxBiArOp_AF
// NOTE These naturally produce CCR
-defm ADD : MxBiArOp_DF<"add", MxAdd, 1, 0xD, 0x6>;
-defm ADD : MxBiArOp_AF<"add", MxAdd, 1, 0xD, 0x6>;
-defm SUB : MxBiArOp_DF<"sub", MxSub, 0, 0x9, 0x4>;
-defm SUB : MxBiArOp_AF<"sub", MxSub, 0, 0x9, 0x4>;
+defm ADD : MxBiArOp_DF<"add", MxAdd, 1, 0xD, 0x6>;
+defm ADD : MxBiArOp_AF<"adda", MxAdd, 1, 0xD, 0x6>;
+defm SUB : MxBiArOp_DF<"sub", MxSub, 0, 0x9, 0x4>;
+defm SUB : MxBiArOp_AF<"suba", MxSub, 0, 0x9, 0x4>;
let Uses = [CCR], Defs = [CCR] in {
@@ -366,13 +375,16 @@ defm XOR : MxBiArOp_DF_EAd<"eor", MxXor, 0xB, 0xA>;
//===----------------------------------------------------------------------===//
let Defs = [CCR] in {
-class MxCmp_RR<MxType TYPE>
- : MxInst<(outs), (ins TYPE.ROp:$lhs, TYPE.ROp:$rhs),
- "cmp."#TYPE.Prefix#"\t$lhs, $rhs",
- [(set CCR, (MxCmp TYPE.VT:$lhs, TYPE.VT:$rhs))],
+class MxCmp_RR<MxType LHS_TYPE, MxType RHS_TYPE = LHS_TYPE,
+ MxBead REG = MxBeadDReg<1>>
+ : MxInst<(outs), (ins LHS_TYPE.ROp:$lhs, RHS_TYPE.ROp:$rhs),
+ "cmp."#RHS_TYPE.Prefix#"\t$lhs, $rhs",
+ [(set CCR, (MxCmp LHS_TYPE.VT:$lhs, RHS_TYPE.VT:$rhs))],
MxArithEncoding<MxBead4Bits<0xB>,
- !cast<MxEncOpMode>("MxOpMode"#TYPE.Size#"dEA"),
- MxBeadDReg<1>, MxEncEAd_0, MxExtEmpty>>;
+ !cast<MxEncOpMode>("MxOpMode"#RHS_TYPE.Size#RHS_TYPE.RLet#"EA"),
+ REG,
+ !cast<MxEncEA>("MxEncEA"#LHS_TYPE.RLet#"_0"),
+ MxExtEmpty>>;
class MxCmp_RI<MxType TYPE>
: MxInst<(outs), (ins TYPE.IOp:$imm, TYPE.ROp:$reg),
@@ -444,11 +456,16 @@ multiclass MMxCmp_MI<MxType TYPE> {
}
foreach S = [8, 16, 32] in {
- def CMP#S#dd : MxCmp_RR<!cast<MxType>("MxType"#S#"d")>;
def CMP#S#di : MxCmp_RI<!cast<MxType>("MxType"#S#"d")>;
def CMP#S#bi : MxCmp_BI<!cast<MxType>("MxType"#S#"d")>;
} // foreach
+def CMP8dd : MxCmp_RR<MxType8d>;
+foreach S = [16, 32] in {
+ def CMP#S#dr : MxCmp_RR<!cast<MxType>("MxType"#S#"r"),
+ !cast<MxType>("MxType"#S#"d")>;
+}
+
// cmp mem, Dn
defm CMP8d : MMxCmp_RM<MxType8d>;
defm CMP16d : MMxCmp_RM<MxType16d>;
@@ -737,9 +754,9 @@ foreach N = ["add", "addc"] in {
def : Pat<(!cast<SDNode>(N) i8 :$src, i8 :$opd),
(ADD8dd MxDRD8 :$src, MxDRD8 :$opd)>;
def : Pat<(!cast<SDNode>(N) i16:$src, i16:$opd),
- (ADD16dd MxDRD16:$src, MxDRD16:$opd)>;
+ (ADD16dr MxXRD16:$src, MxDRD16:$opd)>;
def : Pat<(!cast<SDNode>(N) i32:$src, i32:$opd),
- (ADD32rr MxXRD32:$src, MxXRD32:$opd)>;
+ (ADD32dr MxXRD32:$src, MxDRD32:$opd)>;
// add (An), reg
def : Pat<(!cast<SDNode>(N) MxType8.VT:$src, (Mxloadi8 MxType8.JPat:$opd)),
@@ -747,7 +764,7 @@ foreach N = ["add", "addc"] in {
def : Pat<(!cast<SDNode>(N) MxType16.VT:$src, (Mxloadi16 MxType16.JPat:$opd)),
(ADD16dj MxDRD16:$src, MxType16.JOp:$opd)>;
def : Pat<(!cast<SDNode>(N) MxType32.VT:$src, (Mxloadi32 MxType32.JPat:$opd)),
- (ADD32rj MxXRD32:$src, MxType32.JOp:$opd)>;
+ (ADD32dj MxDRD32:$src, MxType32.JOp:$opd)>;
// add (i,An), reg
def : Pat<(!cast<SDNode>(N) MxType8.VT:$src, (Mxloadi8 MxType8.PPat:$opd)),
@@ -755,7 +772,7 @@ foreach N = ["add", "addc"] in {
def : Pat<(!cast<SDNode>(N) MxType16.VT:$src, (Mxloadi16 MxType16.PPat:$opd)),
(ADD16dp MxDRD16:$src, MxType16.POp:$opd)>;
def : Pat<(!cast<SDNode>(N) MxType32.VT:$src, (Mxloadi32 MxType32.PPat:$opd)),
- (ADD32rp MxXRD32:$src, MxType32.POp:$opd)>;
+ (ADD32dp MxDRD32:$src, MxType32.POp:$opd)>;
// add (i,An,Xn), reg
def : Pat<(!cast<SDNode>(N) MxType8.VT:$src, (Mxloadi8 MxType8.FPat:$opd)),
@@ -763,7 +780,7 @@ foreach N = ["add", "addc"] in {
def : Pat<(!cast<SDNode>(N) MxType16.VT:$src, (Mxloadi16 MxType16.FPat:$opd)),
(ADD16df MxDRD16:$src, MxType16.FOp:$opd)>;
def : Pat<(!cast<SDNode>(N) MxType32.VT:$src, (Mxloadi32 MxType32.FPat:$opd)),
- (ADD32rf MxXRD32:$src, MxType32.FOp:$opd)>;
+ (ADD32df MxDRD32:$src, MxType32.FOp:$opd)>;
// add reg, imm
def : Pat<(!cast<SDNode>(N) i8: $src, MximmSExt8:$opd),
@@ -776,7 +793,7 @@ foreach N = ["add", "addc"] in {
// we make sure it will be selected over LEAp
let AddedComplexity = 15 in {
def : Pat<(!cast<SDNode>(N) i32:$src, MximmSExt32:$opd),
- (ADD32ri MxXRD32:$src, imm:$opd)>;
+ (ADD32di MxDRD32:$src, imm:$opd)>;
} // AddedComplexity = 15
// add imm, (An)
@@ -806,7 +823,7 @@ foreach N = ["sub", "subc"] in {
def : Pat<(!cast<SDNode>(N) i16:$src, i16:$opd),
(SUB16dd MxDRD16:$src, MxDRD16:$opd)>;
def : Pat<(!cast<SDNode>(N) i32:$src, i32:$opd),
- (SUB32rr MxXRD32:$src, MxXRD32:$opd)>;
+ (SUB32dd MxDRD32:$src, MxDRD32:$opd)>;
// sub (An), reg
@@ -815,7 +832,7 @@ foreach N = ["sub", "subc"] in {
def : Pat<(!cast<SDNode>(N) MxType16.VT:$src, (Mxloadi16 MxType16.JPat:$opd)),
(SUB16dj MxDRD16:$src, MxType16.JOp:$opd)>;
def : Pat<(!cast<SDNode>(N) MxType32.VT:$src, (Mxloadi32 MxType32.JPat:$opd)),
- (SUB32rj MxXRD32:$src, MxType32.JOp:$opd)>;
+ (SUB32dj MxDRD32:$src, MxType32.JOp:$opd)>;
// sub (i,An), reg
def : Pat<(!cast<SDNode>(N) MxType8.VT:$src, (Mxloadi8 MxType8.PPat:$opd)),
@@ -823,7 +840,7 @@ foreach N = ["sub", "subc"] in {
def : Pat<(!cast<SDNode>(N) MxType16.VT:$src, (Mxloadi16 MxType16.PPat:$opd)),
(SUB16dp MxDRD16:$src, MxType16.POp:$opd)>;
def : Pat<(!cast<SDNode>(N) MxType32.VT:$src, (Mxloadi32 MxType32.PPat:$opd)),
- (SUB32rp MxXRD32:$src, MxType32.POp:$opd)>;
+ (SUB32dp MxDRD32:$src, MxType32.POp:$opd)>;
// sub (i,An,Xn), reg
def : Pat<(!cast<SDNode>(N) MxType8.VT:$src, (Mxloadi8 MxType8.FPat:$opd)),
@@ -831,7 +848,7 @@ foreach N = ["sub", "subc"] in {
def : Pat<(!cast<SDNode>(N) MxType16.VT:$src, (Mxloadi16 MxType16.FPat:$opd)),
(SUB16df MxDRD16:$src, MxType16.FOp:$opd)>;
def : Pat<(!cast<SDNode>(N) MxType32.VT:$src, (Mxloadi32 MxType32.FPat:$opd)),
- (SUB32rf MxXRD32:$src, MxType32.FOp:$opd)>;
+ (SUB32df MxDRD32:$src, MxType32.FOp:$opd)>;
// sub reg, imm
def : Pat<(!cast<SDNode>(N) i8 :$src, MximmSExt8 :$opd),
@@ -839,7 +856,7 @@ foreach N = ["sub", "subc"] in {
def : Pat<(!cast<SDNode>(N) i16:$src, MximmSExt16:$opd),
(SUB16di MxDRD16:$src, imm:$opd)>;
def : Pat<(!cast<SDNode>(N) i32:$src, MximmSExt32:$opd),
- (SUB32ri MxXRD32:$src, imm:$opd)>;
+ (SUB32di MxDRD32:$src, imm:$opd)>;
// sub imm, (An)
def : Pat<(store (!cast<SDNode>(N) (load MxType8.JPat:$dst), MxType8.IPat:$opd),
diff --git a/llvm/lib/Target/M68k/M68kInstrCompiler.td b/llvm/lib/Target/M68k/M68kInstrCompiler.td
index bcb815dbc4eb..8fb331dec0e9 100644
--- a/llvm/lib/Target/M68k/M68kInstrCompiler.td
+++ b/llvm/lib/Target/M68k/M68kInstrCompiler.td
@@ -23,15 +23,15 @@ def : Pat<(i32 (MxWrapper tjumptable :$src)), (MOV32ri tjumptable :$src)>;
def : Pat<(i32 (MxWrapper tblockaddress :$src)), (MOV32ri tblockaddress :$src)>;
def : Pat<(add MxDRD32:$src, (MxWrapper tconstpool:$opd)),
- (ADD32ri MxDRD32:$src, tconstpool:$opd)>;
+ (ADD32di MxDRD32:$src, tconstpool:$opd)>;
def : Pat<(add MxARD32:$src, (MxWrapper tjumptable:$opd)),
- (ADD32ri MxARD32:$src, tjumptable:$opd)>;
+ (ADD32ai MxARD32:$src, tjumptable:$opd)>;
def : Pat<(add MxARD32:$src, (MxWrapper tglobaladdr :$opd)),
- (ADD32ri MxARD32:$src, tglobaladdr:$opd)>;
+ (ADD32ai MxARD32:$src, tglobaladdr:$opd)>;
def : Pat<(add MxARD32:$src, (MxWrapper texternalsym:$opd)),
- (ADD32ri MxARD32:$src, texternalsym:$opd)>;
+ (ADD32ai MxARD32:$src, texternalsym:$opd)>;
def : Pat<(add MxARD32:$src, (MxWrapper tblockaddress:$opd)),
- (ADD32ri MxARD32:$src, tblockaddress:$opd)>;
+ (ADD32ai MxARD32:$src, tblockaddress:$opd)>;
def : Pat<(store (i32 (MxWrapper tglobaladdr:$src)), iPTR:$dst),
(MOV32ji MxARI32:$dst, tglobaladdr:$src)>;
diff --git a/llvm/lib/Target/M68k/M68kInstrFormats.td b/llvm/lib/Target/M68k/M68kInstrFormats.td
index 1d950bd0377a..99b7ffd17971 100644
--- a/llvm/lib/Target/M68k/M68kInstrFormats.td
+++ b/llvm/lib/Target/M68k/M68kInstrFormats.td
@@ -250,7 +250,7 @@ def MxOpMode16dEA : MxEncOpMode<MxBead3Bits<0b001>>;
def MxOpMode32dEA : MxEncOpMode<MxBead3Bits<0b010>>;
// op EA, An
-def MxOpMode16aEA : MxEncOpMode<MxBead3Bits<0b110>>;
+def MxOpMode16aEA : MxEncOpMode<MxBead3Bits<0b011>>;
def MxOpMode32aEA : MxEncOpMode<MxBead3Bits<0b111>>;
// op EA, Rn
diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.cpp b/llvm/lib/Target/M68k/M68kInstrInfo.cpp
index 0eddd8ce5f4c..639bcd455687 100644
--- a/llvm/lib/Target/M68k/M68kInstrInfo.cpp
+++ b/llvm/lib/Target/M68k/M68kInstrInfo.cpp
@@ -24,8 +24,8 @@
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include <functional>
diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.h b/llvm/lib/Target/M68k/M68kInstrInfo.h
index a503b02c5a82..6aced1487365 100644
--- a/llvm/lib/Target/M68k/M68kInstrInfo.h
+++ b/llvm/lib/Target/M68k/M68kInstrInfo.h
@@ -173,7 +173,7 @@ static inline unsigned IsCMP(unsigned Op) {
case M68k::CMP8di:
case M68k::CMP8dj:
case M68k::CMP8dp:
- case M68k::CMP16dd:
+ case M68k::CMP16dr:
case M68k::CMP16df:
case M68k::CMP16di:
case M68k::CMP16dj:
diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.td b/llvm/lib/Target/M68k/M68kInstrInfo.td
index e743213830de..ed6cd9ecf442 100644
--- a/llvm/lib/Target/M68k/M68kInstrInfo.td
+++ b/llvm/lib/Target/M68k/M68kInstrInfo.td
@@ -165,12 +165,23 @@ def MxSize8 : MxSize<8, "b", "byte">;
def MxSize16 : MxSize<16, "w", "word">;
def MxSize32 : MxSize<32, "l", "long">;
-class MxOpClass<string name> : AsmOperandClass {
+class MxOpClass<string name,
+ list<AsmOperandClass> superClasses = []> : AsmOperandClass {
let Name = name;
let ParserMethod = "parseMemOp";
+ let SuperClasses = superClasses;
}
def MxRegClass : MxOpClass<"Reg">;
+// Split the asm register class to avoid ambiguity in the operands'
+// MatchClassKind. For instance, without this separation,
+// both ADD32dd and ADD32dr have {MCK_RegClass, MCK_RegClass} for
+// their operands, which leaves the AsmParser unable to pick the correct
+// one deterministically.
+let RenderMethod = "addRegOperands", SuperClasses = [MxRegClass] in {
+ def MxARegClass : MxOpClass<"AReg">;
+ def MxDRegClass : MxOpClass<"DReg">;
+}
class MxOperand<ValueType vt, MxSize size, string letter, RegisterClass rc, dag pat = (null_frag)> {
ValueType VT = vt;
@@ -200,20 +211,24 @@ def MxXRD32_TC : MxRegOp<i32, XR32_TC, MxSize32, "r">;
// DATA REGISTER DIRECT. The operand is in the data register specified by
// the effective address register field.
-def MxDRD8 : MxRegOp<i8, DR8, MxSize8, "d">;
-def MxDRD16 : MxRegOp<i16, DR16, MxSize16, "d">;
-def MxDRD32 : MxRegOp<i32, DR32, MxSize32, "d">;
+let ParserMatchClass = MxDRegClass in {
+ def MxDRD8 : MxRegOp<i8, DR8, MxSize8, "d">;
+ def MxDRD16 : MxRegOp<i16, DR16, MxSize16, "d">;
+ def MxDRD32 : MxRegOp<i32, DR32, MxSize32, "d">;
-def MxDRD16_TC : MxRegOp<i16, DR16_TC, MxSize16, "d">;
-def MxDRD32_TC : MxRegOp<i32, DR32_TC, MxSize32, "d">;
+ def MxDRD16_TC : MxRegOp<i16, DR16_TC, MxSize16, "d">;
+ def MxDRD32_TC : MxRegOp<i32, DR32_TC, MxSize32, "d">;
+}
// ADDRESS REGISTER DIRECT. The operand is in the address register specified by
// the effective address register field.
-def MxARD16 : MxRegOp<i16, AR16, MxSize16, "a">;
-def MxARD32 : MxRegOp<i32, AR32, MxSize32, "a">;
+let ParserMatchClass = MxARegClass in {
+ def MxARD16 : MxRegOp<i16, AR16, MxSize16, "a">;
+ def MxARD32 : MxRegOp<i32, AR32, MxSize32, "a">;
-def MxARD16_TC : MxRegOp<i16, AR16_TC, MxSize16, "a">;
-def MxARD32_TC : MxRegOp<i32, AR32_TC, MxSize32, "a">;
+ def MxARD16_TC : MxRegOp<i16, AR16_TC, MxSize16, "a">;
+ def MxARD32_TC : MxRegOp<i32, AR32_TC, MxSize32, "a">;
+}
class MxMemOp<dag ops, MxSize size, string letter,
string printMethod = "printOperand",
@@ -304,9 +319,17 @@ def MxARII32_TC : MxMemOp<(ops i8imm, AR32_TC, XR32_TC), MxSize32, "f", "printA
// extended before it is used. The reference is classified as a data reference
// with the exception of the jump and jump-tosubroutine instructions.
def MxAddr : MxOpClass<"Addr">;
-def MxAS8 : MxMemOp<(ops OtherVT), MxSize8, "B", "printAS8Mem", MxAddr>;
-def MxAS16 : MxMemOp<(ops OtherVT), MxSize16, "B", "printAS16Mem", MxAddr>;
-def MxAS32 : MxMemOp<(ops OtherVT), MxSize32, "B", "printAS32Mem", MxAddr>;
+let RenderMethod = "addAddrOperands" in {
+ // This hierarchy ensures Addr8 will always be parsed
+ // before other larger-width variants.
+ def MxAddr32 : MxOpClass<"Addr32", [MxAddr]>;
+ def MxAddr16 : MxOpClass<"Addr16", [MxAddr32]>;
+ def MxAddr8 : MxOpClass<"Addr8", [MxAddr16]>;
+}
+
+def MxAS8 : MxMemOp<(ops OtherVT), MxSize8, "B", "printAS8Mem", MxAddr8>;
+def MxAS16 : MxMemOp<(ops OtherVT), MxSize16, "B", "printAS16Mem", MxAddr16>;
+def MxAS32 : MxMemOp<(ops OtherVT), MxSize32, "B", "printAS32Mem", MxAddr32>;
// ABSOLUTE LONG ADDRESS. This addressing mode requires two words of extension.
// The address of the operand is developed by the concatenation of the extension
@@ -314,9 +337,9 @@ def MxAS32 : MxMemOp<(ops OtherVT), MxSize32, "B", "printAS32Mem", MxAddr>;
// order part of the address is the second extension word. The reference is
// classified as a data reference with the exception of the jump and jump
// to-subroutine instructions.
-def MxAL8 : MxMemOp<(ops OtherVT), MxSize8, "b", "printAL8Mem", MxAddr>;
-def MxAL16 : MxMemOp<(ops OtherVT), MxSize16, "b", "printAL16Mem", MxAddr>;
-def MxAL32 : MxMemOp<(ops OtherVT), MxSize32, "b", "printAL32Mem", MxAddr>;
+def MxAL8 : MxMemOp<(ops OtherVT), MxSize8, "b", "printAL8Mem", MxAddr8>;
+def MxAL16 : MxMemOp<(ops OtherVT), MxSize16, "b", "printAL16Mem", MxAddr16>;
+def MxAL32 : MxMemOp<(ops OtherVT), MxSize32, "b", "printAL32Mem", MxAddr32>;
def MxPCD : MxOpClass<"PCD">;
def MxPCI : MxOpClass<"PCI">;
@@ -370,21 +393,22 @@ def Mxi16imm : MxOp<i16, MxSize16, "i">;
def Mxi32imm : MxOp<i32, MxSize32, "i">;
} // OPERAND_IMMEDIATE
-let OperandType = "OPERAND_PCREL",
- ParserMatchClass = MxAddr,
- PrintMethod = "printPCRelImm" in {
-
+class MxBrTargetOperand<int N> : Operand<OtherVT> {
+ let OperandType = "OPERAND_PCREL";
+ let PrintMethod = "printPCRelImm";
+ let ParserMatchClass = !cast<AsmOperandClass>("MxAddr"#N);
+}
// Branch targets have OtherVT type and print as pc-relative values.
-def MxBrTarget8 : Operand<OtherVT>;
-def MxBrTarget16 : Operand<OtherVT>;
-def MxBrTarget32 : Operand<OtherVT>;
-
-} // OPERAND_PCREL
+def MxBrTarget8 : MxBrTargetOperand<8>;
+def MxBrTarget16 : MxBrTargetOperand<16>;
+def MxBrTarget32 : MxBrTargetOperand<32>;
// Used with MOVEM
+def MxMoveMaskClass : MxOpClass<"MoveMask">;
def MxMoveMask : MxOp<i16, MxSize16, "m"> {
let OperandType = "OPERAND_IMMEDIATE";
let PrintMethod = "printMoveMask";
+ let ParserMatchClass = MxMoveMaskClass;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/M68k/M68kSubtarget.cpp b/llvm/lib/Target/M68k/M68kSubtarget.cpp
index 963e83cfbb07..991889706e67 100644
--- a/llvm/lib/Target/M68k/M68kSubtarget.cpp
+++ b/llvm/lib/Target/M68k/M68kSubtarget.cpp
@@ -12,9 +12,9 @@
//===----------------------------------------------------------------------===//
#include "M68kSubtarget.h"
-#include "GlSel/M68kCallLowering.h"
-#include "GlSel/M68kLegalizerInfo.h"
-#include "GlSel/M68kRegisterBankInfo.h"
+#include "GISel/M68kCallLowering.h"
+#include "GISel/M68kLegalizerInfo.h"
+#include "GISel/M68kRegisterBankInfo.h"
#include "M68k.h"
#include "M68kMachineFunction.h"
@@ -24,9 +24,9 @@
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/M68k/M68kTargetMachine.cpp b/llvm/lib/Target/M68k/M68kTargetMachine.cpp
index 5b8fd3d41b14..e8126c6219e8 100644
--- a/llvm/lib/Target/M68k/M68kTargetMachine.cpp
+++ b/llvm/lib/Target/M68k/M68kTargetMachine.cpp
@@ -24,8 +24,8 @@
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/InitializePasses.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/PassRegistry.h"
-#include "llvm/Support/TargetRegistry.h"
#include <memory>
using namespace llvm;
@@ -49,10 +49,14 @@ std::string computeDataLayout(const Triple &TT, StringRef CPU,
// FIXME how to wire it with the used object format?
Ret += "-m:e";
- // M68k pointers are always 32 bit wide even for 16 bit cpus
- Ret += "-p:32:32";
+ // M68k pointers are always 32 bits wide even for 16-bit CPUs.
+ // The ABI only specifies 16-bit alignment.
+ // On at least the 68020+ with a 32-bit bus, there is a performance benefit
+ // to having 32-bit alignment.
+ Ret += "-p:32:16:32";
- // M68k requires i8 to align on 2 byte boundry
+ // Bytes do not require special alignment; words are word-aligned, and
+ // long words are word-aligned at minimum.
Ret += "-i8:8:8-i16:16:16-i32:16:32";
// FIXME no floats at the moment
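The hunk above changes the pointer specification from "-p:32:32" to "-p:32:16:32": 32-bit pointers with 16-bit ABI alignment and 32-bit preferred alignment. As a minimal standalone sketch (not part of the patch), the resulting layout can be inspected with llvm::DataLayout; the full layout string below is only an illustrative approximation of what computeDataLayout() assembles:

#include "llvm/IR/DataLayout.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  // Illustrative M68k-style layout string; alignments are reported in bytes.
  llvm::DataLayout DL("E-m:e-p:32:16:32-i8:8:8-i16:16:16-i32:16:32");
  llvm::outs() << "pointer size (bits): " << DL.getPointerSizeInBits(0) << "\n";            // 32
  llvm::outs() << "pointer ABI align:   " << DL.getPointerABIAlignment(0).value() << "\n";  // 2
  llvm::outs() << "pointer pref align:  " << DL.getPointerPrefAlignment(0).value() << "\n"; // 4
  return 0;
}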
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
index 8a0f32b58da4..c1f88fb78ee1 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
@@ -29,9 +29,9 @@
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -82,7 +82,8 @@ public:
/// Write a sequence of optimal nops to the output, covering \p Count bytes.
/// \return - true on success, false on failure
- bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const override;
};
} // end anonymous namespace
@@ -200,7 +201,8 @@ void M68kAsmBackend::relaxInstruction(MCInst &Inst,
Inst.setOpcode(RelaxedOp);
}
-bool M68kAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+bool M68kAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const {
// Cannot emit NOP with size being not multiple of 16 bits.
if (Count % 2 != 0)
return false;
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp
index e5f5909b5d79..a2e41437ee21 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp
@@ -109,7 +109,7 @@ void M68kInstPrinter::printMoveMask(const MCInst *MI, unsigned opNum,
// Print separation comma only if
// both data & register parts have bit(s) set
if (s != 0 && (Mask & 0xFF) && HalfMask)
- O << ',';
+ O << '/';
for (int i = 0; HalfMask; ++i) {
if ((HalfMask >> i) & 0b1) {
@@ -130,7 +130,7 @@ void M68kInstPrinter::printMoveMask(const MCInst *MI, unsigned opNum,
i = j;
if (HalfMask)
- O << ',';
+ O << '/';
}
}
}
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp
index 0a438ea042be..9f4db895a821 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp
@@ -23,10 +23,10 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MachineLocation.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.cpp b/llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.cpp
index 5f08b9044b4e..2a225b8a43cd 100644
--- a/llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.cpp
+++ b/llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.cpp
@@ -10,7 +10,7 @@
/// This file contains M68k target initializer.
///
//===----------------------------------------------------------------------===//
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
index 4bad0368505a..c1677baf52a7 100644
--- a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
+++ b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
@@ -23,9 +23,9 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetRegistry.h"
#define DEBUG_TYPE "msp430-asm-parser"
diff --git a/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp b/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
index d2902189ec40..9bbb2938ab75 100644
--- a/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
+++ b/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
@@ -10,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#include "MSP430.h"
#include "MCTargetDesc/MSP430MCTargetDesc.h"
+#include "MSP430.h"
#include "TargetInfo/MSP430TargetInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
@@ -19,8 +19,8 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Endian.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
index 071e1484196b..953916776c57 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
@@ -90,7 +90,8 @@ public:
return Infos[Kind - FirstTargetFixupKind];
}
- bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const override;
};
uint64_t MSP430AsmBackend::adjustFixupValue(const MCFixup &Fixup,
@@ -147,7 +148,8 @@ void MSP430AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
}
}
-bool MSP430AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+bool MSP430AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const {
if ((Count % 2) != 0)
return false;
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
index 87ee312424c8..087045ccb1df 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
@@ -17,8 +17,10 @@
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MSP430Attributes.h"
using namespace llvm;
+using namespace llvm::MSP430Attrs;
namespace llvm {
@@ -54,15 +56,14 @@ MSP430TargetELFStreamer::MSP430TargetELFStreamer(MCStreamer &S,
Streamer.emitInt8(1);
// Attribute vector length.
Streamer.emitInt32(11);
- // OFBA_MSPABI_Tag_ISA(4) = 1, MSP430
- Streamer.emitInt8(4);
- Streamer.emitInt8(1);
- // OFBA_MSPABI_Tag_Code_Model(6) = 1, Small
- Streamer.emitInt8(6);
- Streamer.emitInt8(1);
- // OFBA_MSPABI_Tag_Data_Model(8) = 1, Small
- Streamer.emitInt8(8);
- Streamer.emitInt8(1);
+
+ Streamer.emitInt8(TagISA);
+ Streamer.emitInt8(STI.hasFeature(MSP430::FeatureX) ? ISAMSP430X : ISAMSP430);
+ Streamer.emitInt8(TagCodeModel);
+ Streamer.emitInt8(CMSmall);
+ Streamer.emitInt8(TagDataModel);
+ Streamer.emitInt8(DMSmall);
+ // Don't emit TagEnumSize, for full GCC compatibility.
}
MCELFStreamer &MSP430TargetELFStreamer::getStreamer() {
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
index c352ea563454..3f006056955d 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
@@ -17,7 +17,7 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
index 459188434f2c..8eb3fbd58328 100644
--- a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -32,7 +32,7 @@
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
index 4be8d0760e68..a83a5d2dfcc9 100644
--- a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
+++ b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -71,9 +71,8 @@ void MSP430FrameLowering::emitPrologue(MachineFunction &MF,
.addReg(MSP430::SP);
// Mark the FramePtr as live-in in every block except the entry.
- for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
- I != E; ++I)
- I->addLiveIn(MSP430::R4);
+ for (MachineBasicBlock &MBBJ : llvm::drop_begin(MF))
+ MBBJ.addLiveIn(MSP430::R4);
} else
NumBytes = StackSize - MSP430FI->getCalleeSavedFrameSize();
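The loop rewrite above relies on llvm::drop_begin, which returns a view over a range minus its first N elements (N defaults to 1), here skipping the entry block. A small self-contained sketch of the same helper on an ordinary container, assuming nothing beyond ADT/STLExtras.h:

#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <vector>

int main() {
  std::vector<int> Blocks = {0, 1, 2, 3};
  // drop_begin(Range) skips the first element, so block 0 (the "entry") is not visited.
  for (int B : llvm::drop_begin(Blocks))
    llvm::outs() << B << ' ';   // prints: 1 2 3
  llvm::outs() << '\n';
  return 0;
}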
diff --git a/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index 7dabb9b4abae..abd48dfd5139 100644
--- a/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -304,13 +304,11 @@ static bool isValidIndexedLoad(const LoadSDNode *LD) {
switch (VT.getSimpleVT().SimpleTy) {
case MVT::i8:
- // Sanity check
if (cast<ConstantSDNode>(LD->getOffset())->getZExtValue() != 1)
return false;
break;
case MVT::i16:
- // Sanity check
if (cast<ConstantSDNode>(LD->getOffset())->getZExtValue() != 2)
return false;
diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
index 9c6d44bf92de..c64a44a0ef95 100644
--- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -670,7 +670,7 @@ SDValue MSP430TargetLowering::LowerCCCArguments(
InVals.push_back(ArgValue);
}
} else {
- // Sanity check
+ // Only arguments passed on the stack should make it here.
assert(VA.isMemLoc());
SDValue InVal;
@@ -1150,7 +1150,7 @@ SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// lowering & isel wouldn't diverge.
bool andCC = false;
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
- if (RHSC->isNullValue() && LHS.hasOneUse() &&
+ if (RHSC->isZero() && LHS.hasOneUse() &&
(LHS.getOpcode() == ISD::AND ||
(LHS.getOpcode() == ISD::TRUNCATE &&
LHS.getOperand(0).getOpcode() == ISD::AND))) {
diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
index 130211878be1..e9e26e295fd5 100644
--- a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -18,8 +18,8 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -116,6 +116,7 @@ unsigned MSP430InstrInfo::removeBranch(MachineBasicBlock &MBB,
continue;
if (I->getOpcode() != MSP430::JMP &&
I->getOpcode() != MSP430::JCC &&
+ I->getOpcode() != MSP430::Bi &&
I->getOpcode() != MSP430::Br &&
I->getOpcode() != MSP430::Bm)
break;
@@ -189,7 +190,7 @@ bool MSP430InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
return true;
// Handle unconditional branches.
- if (I->getOpcode() == MSP430::JMP) {
+ if (I->getOpcode() == MSP430::JMP || I->getOpcode() == MSP430::Bi) {
if (!AllowModify) {
TBB = I->getOperand(0).getMBB();
continue;
diff --git a/llvm/lib/Target/MSP430/MSP430Subtarget.cpp b/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
index 5a117404d772..2fd58717c4db 100644
--- a/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
+++ b/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -12,7 +12,7 @@
#include "MSP430Subtarget.h"
#include "MSP430.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
index 827f24daad16..a33146ce2239 100644
--- a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -18,7 +18,7 @@
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430Target() {
@@ -81,5 +81,5 @@ bool MSP430PassConfig::addInstSelector() {
void MSP430PassConfig::addPreEmitPass() {
// Must run branch selection immediately preceding the asm printer.
- addPass(createMSP430BranchSelectionPass(), false);
+ addPass(createMSP430BranchSelectionPass());
}
diff --git a/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp b/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
index 9d4a8f141cc4..fc2b38f41c14 100644
--- a/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
+++ b/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/MSP430TargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
Target &llvm::getTheMSP430Target() {
diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index e4d61f8c210e..01b5dff2e448 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -39,6 +39,7 @@
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
@@ -48,7 +49,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
diff --git a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index 6f197e424561..9a66dd77c0d3 100644
--- a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -20,11 +20,11 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
@@ -455,14 +455,6 @@ static DecodeStatus DecodeDAHIDATI(MCInst &MI, InsnType insn, uint64_t Address,
const void *Decoder);
template <typename InsnType>
-static DecodeStatus DecodeDAHIDATIMMR6(MCInst &MI, InsnType insn,
- uint64_t Address, const void *Decoder);
-
-template <typename InsnType>
-static DecodeStatus DecodeDAHIDATI(MCInst &MI, InsnType insn, uint64_t Address,
- const void *Decoder);
-
-template <typename InsnType>
static DecodeStatus
DecodeAddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
const void *Decoder);
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 94d338746a6c..bfe413a152b6 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -518,7 +518,8 @@ getFixupKindInfo(MCFixupKind Kind) const {
/// it should return an error.
///
/// \return - True on success.
-bool MipsAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+bool MipsAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const {
// Check for a less than instruction size number of bytes
// FIXME: 16 bit instructions are not handled yet here.
// We shouldn't be using a hard coded number for instruction size.
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index 16c7befb2670..5a0da3bc49bf 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -63,7 +63,8 @@ public:
return false;
}
- bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const override;
bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target) override;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index 454f79926dd0..6fc8fcb482cd 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -29,9 +29,9 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MachineLocation.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 232d0eb33164..57cd016da4dc 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -902,7 +902,7 @@ void MipsTargetELFStreamer::finish() {
if (Alignment) {
OS.SwitchSection(&Section);
if (Section.UseCodeAlign())
- OS.emitCodeAlignment(Alignment, Alignment);
+ OS.emitCodeAlignment(Alignment, &STI, Alignment);
else
OS.emitValueToAlignment(Alignment, 0, 1, Alignment);
}
diff --git a/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td b/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td
index da8a06b0cff8..00ac9bf99c92 100644
--- a/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td
+++ b/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td
@@ -958,7 +958,7 @@ class POOL32A_DVPEVP_FM_MMR6<string instr_asm, bits<10> funct>
let Inst{5-0} = 0b111100;
}
-class CMP_BRANCH_OFF21_FM_MMR6<string opstr, bits<6> funct> : MipsR6Inst {
+class CMP_BRANCH_OFF21_FM_MMR6<bits<6> funct> : MipsR6Inst {
bits<5> rs;
bits<21> offset;
diff --git a/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td b/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td
index 832124cb3f57..b1a05388884b 100644
--- a/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td
+++ b/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td
@@ -62,8 +62,8 @@ class BEQZC16_MMR6_ENC : BEQZC_BNEZC_FM_MM16R6<0x23>;
class BNEZC16_MMR6_ENC : BEQZC_BNEZC_FM_MM16R6<0x2b>;
class BITSWAP_MMR6_ENC : POOL32A_BITSWAP_FM_MMR6<0b101100>;
class BRK_MMR6_ENC : BREAK_MMR6_ENC<"break">;
-class BEQZC_MMR6_ENC : CMP_BRANCH_OFF21_FM_MMR6<"beqzc", 0b100000>;
-class BNEZC_MMR6_ENC : CMP_BRANCH_OFF21_FM_MMR6<"bnezc", 0b101000>;
+class BEQZC_MMR6_ENC : CMP_BRANCH_OFF21_FM_MMR6<0b100000>;
+class BNEZC_MMR6_ENC : CMP_BRANCH_OFF21_FM_MMR6<0b101000>;
class BGEC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"bgec", 0b111101>,
DecodeDisambiguates<"POP75GroupBranchMMR6">;
class BGEUC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"bgeuc", 0b110000>,
@@ -406,7 +406,7 @@ class BITSWAP_MMR6_DESC : BITSWAP_MMR6_DESC_BASE<"bitswap", GPR32Opnd>;
class BRK_MMR6_DESC : BRK_FT<"break">;
class CACHE_HINT_MMR6_DESC<string instr_asm, Operand MemOpnd,
- RegisterOperand GPROpnd, InstrItinClass Itin>
+ InstrItinClass Itin>
: MMR6Arch<instr_asm> {
dag OutOperandList = (outs);
dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint);
@@ -416,10 +416,8 @@ class CACHE_HINT_MMR6_DESC<string instr_asm, Operand MemOpnd,
InstrItinClass Itinerary = Itin;
}
-class CACHE_MMR6_DESC : CACHE_HINT_MMR6_DESC<"cache", mem_mm_12, GPR32Opnd,
- II_CACHE>;
-class PREF_MMR6_DESC : CACHE_HINT_MMR6_DESC<"pref", mem_mm_12, GPR32Opnd,
- II_PREF>;
+class CACHE_MMR6_DESC : CACHE_HINT_MMR6_DESC<"cache", mem_mm_12, II_CACHE>;
+class PREF_MMR6_DESC : CACHE_HINT_MMR6_DESC<"pref", mem_mm_12, II_PREF>;
class LB_LBU_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd,
RegisterOperand GPROpnd, InstrItinClass Itin>
@@ -1197,21 +1195,21 @@ class SWM16_MMR6_DESC
ComplexPattern Addr = addr;
}
-class SB16_MMR6_DESC_BASE<string opstr, DAGOperand RTOpnd, DAGOperand RO,
- SDPatternOperator OpNode, InstrItinClass Itin,
- Operand MemOpnd>
+class SB16_MMR6_DESC_BASE<string opstr, DAGOperand RTOpnd,
+ InstrItinClass Itin, Operand MemOpnd>
: MicroMipsInst16<(outs), (ins RTOpnd:$rt, MemOpnd:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI>,
MMR6Arch<opstr> {
let DecoderMethod = "DecodeMemMMImm4";
let mayStore = 1;
}
-class SB16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sb16", GPRMM16OpndZero, GPRMM16Opnd,
- truncstorei8, II_SB, mem_mm_4>;
-class SH16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sh16", GPRMM16OpndZero, GPRMM16Opnd,
- truncstorei16, II_SH, mem_mm_4_lsl1>;
-class SW16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sw16", GPRMM16OpndZero, GPRMM16Opnd,
- store, II_SW, mem_mm_4_lsl2>;
+
+class SB16_MMR6_DESC
+ : SB16_MMR6_DESC_BASE<"sb16", GPRMM16OpndZero, II_SB, mem_mm_4>;
+class SH16_MMR6_DESC
+ : SB16_MMR6_DESC_BASE<"sh16", GPRMM16OpndZero, II_SH, mem_mm_4_lsl1>;
+class SW16_MMR6_DESC
+ : SB16_MMR6_DESC_BASE<"sw16", GPRMM16OpndZero, II_SW, mem_mm_4_lsl2>;
class SWSP_MMR6_DESC
: MicroMipsInst16<(outs), (ins GPR32Opnd:$rt, mem_mm_sp_imm5_lsl2:$offset),
diff --git a/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td
index 9a1e47e5ecca..8950de230a01 100644
--- a/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td
+++ b/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td
@@ -281,57 +281,46 @@ class SHRLV_PH_MMR2_DESC : SHLLV_R3_MM_DESC_BASE<
class SHRLV_QB_MM_DESC : SHLLV_R3_MM_DESC_BASE<
"shrlv.qb", int_mips_shrl_qb, NoItinerary, DSPROpnd>;
-class EXT_MM_2R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- InstrItinClass itin> {
+class EXT_MM_2R_DESC_BASE<string instr_asm> {
dag OutOperandList = (outs GPR32Opnd:$rt);
dag InOperandList = (ins ACC64DSPOpnd:$ac, GPR32Opnd:$rs);
string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $rs");
- InstrItinClass Itinerary = itin;
+ InstrItinClass Itinerary = NoItinerary;
}
-class EXT_MM_1R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- InstrItinClass itin> {
+class EXT_MM_1R_DESC_BASE<string instr_asm> {
dag OutOperandList = (outs GPR32Opnd:$rt);
dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm5:$imm);
string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $imm");
- InstrItinClass Itinerary = itin;
+ InstrItinClass Itinerary = NoItinerary;
}
-class EXTP_MM_DESC
- : EXT_MM_1R_DESC_BASE<"extp", MipsEXTP, NoItinerary>,
- Uses<[DSPPos]>, Defs<[DSPEFI]>;
-class EXTPDP_MM_DESC
- : EXT_MM_1R_DESC_BASE<"extpdp", MipsEXTPDP, NoItinerary>,
- Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>;
-class EXTPDPV_MM_DESC
- : EXT_MM_2R_DESC_BASE<"extpdpv", MipsEXTPDP, NoItinerary>,
- Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>;
-class EXTPV_MM_DESC
- : EXT_MM_2R_DESC_BASE<"extpv", MipsEXTP, NoItinerary>,
- Uses<[DSPPos]>, Defs<[DSPEFI]>;
-class EXTR_W_MM_DESC
- : EXT_MM_1R_DESC_BASE<"extr.w", MipsEXTR_W, NoItinerary>,
- Defs<[DSPOutFlag23]>;
-class EXTR_R_W_MM_DESC
- : EXT_MM_1R_DESC_BASE<"extr_r.w", MipsEXTR_R_W, NoItinerary>,
- Defs<[DSPOutFlag23]>;
-class EXTR_RS_W_MM_DESC
- : EXT_MM_1R_DESC_BASE<"extr_rs.w", MipsEXTR_RS_W, NoItinerary>,
- Defs<[DSPOutFlag23]>;
-class EXTR_S_H_MM_DESC
- : EXT_MM_1R_DESC_BASE<"extr_s.h", MipsEXTR_S_H, NoItinerary>,
- Defs<[DSPOutFlag23]>;
-class EXTRV_W_MM_DESC
- : EXT_MM_2R_DESC_BASE<"extrv.w", MipsEXTR_W, NoItinerary>,
- Defs<[DSPOutFlag23]>;
-class EXTRV_R_W_MM_DESC
- : EXT_MM_2R_DESC_BASE<"extrv_r.w", MipsEXTR_R_W, NoItinerary>,
- Defs<[DSPOutFlag23]>;
-class EXTRV_RS_W_MM_DESC
- : EXT_MM_2R_DESC_BASE<"extrv_rs.w", MipsEXTR_RS_W, NoItinerary>,
- Defs<[DSPOutFlag23]>;
-class EXTRV_S_H_MM_DESC
- : EXT_MM_2R_DESC_BASE<"extrv_s.h", MipsEXTR_S_H, NoItinerary>,
- Defs<[DSPOutFlag23]>;
+class EXTP_MM_DESC : EXT_MM_1R_DESC_BASE<"extp">,
+ Uses<[DSPPos]>,
+ Defs<[DSPEFI]>;
+class EXTPDP_MM_DESC : EXT_MM_1R_DESC_BASE<"extpdp">,
+ Uses<[DSPPos]>,
+ Defs<[DSPPos, DSPEFI]>;
+class EXTPDPV_MM_DESC : EXT_MM_2R_DESC_BASE<"extpdpv">,
+ Uses<[DSPPos]>,
+ Defs<[DSPPos, DSPEFI]>;
+class EXTPV_MM_DESC : EXT_MM_2R_DESC_BASE<"extpv">,
+ Uses<[DSPPos]>,
+ Defs<[DSPEFI]>;
+class EXTR_W_MM_DESC : EXT_MM_1R_DESC_BASE<"extr.w">,
+ Defs<[DSPOutFlag23]>;
+class EXTR_R_W_MM_DESC : EXT_MM_1R_DESC_BASE<"extr_r.w">,
+ Defs<[DSPOutFlag23]>;
+class EXTR_RS_W_MM_DESC : EXT_MM_1R_DESC_BASE<"extr_rs.w">,
+ Defs<[DSPOutFlag23]>;
+class EXTR_S_H_MM_DESC : EXT_MM_1R_DESC_BASE<"extr_s.h">,
+ Defs<[DSPOutFlag23]>;
+class EXTRV_W_MM_DESC : EXT_MM_2R_DESC_BASE<"extrv.w">, Defs<[DSPOutFlag23]>;
+class EXTRV_R_W_MM_DESC : EXT_MM_2R_DESC_BASE<"extrv_r.w">,
+ Defs<[DSPOutFlag23]>;
+class EXTRV_RS_W_MM_DESC : EXT_MM_2R_DESC_BASE<"extrv_rs.w">,
+ Defs<[DSPOutFlag23]>;
+class EXTRV_S_H_MM_DESC : EXT_MM_2R_DESC_BASE<"extrv_s.h">,
+ Defs<[DSPOutFlag23]>;
class MFHI_MM_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode,
InstrItinClass itin> {
diff --git a/llvm/lib/Target/Mips/MicroMipsInstrInfo.td b/llvm/lib/Target/Mips/MicroMipsInstrInfo.td
index 269ad8b548a4..5f6354e19ebc 100644
--- a/llvm/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/llvm/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -195,8 +195,7 @@ def simm23_lsl2 : Operand<i32> {
let DecoderMethod = "DecodeSimm23Lsl2";
}
-class CompactBranchMM<string opstr, DAGOperand opnd, PatFrag cond_op,
- RegisterOperand RO> :
+class CompactBranchMM<string opstr, DAGOperand opnd, RegisterOperand RO> :
InstSE<(outs), (ins RO:$rs, opnd:$offset),
!strconcat(opstr, "\t$rs, $offset"), [], II_BCCZC, FrmI> {
let isBranch = 1;
@@ -240,7 +239,7 @@ MicroMipsInst16<(outs RO1:$rd1, RO2:$rd2), (ins RO3:$rs, RO3:$rt),
let DecoderMethod = "DecodeMovePOperands";
}
-class StorePairMM<string opstr, ComplexPattern Addr = addr>
+class StorePairMM<string opstr>
: InstSE<(outs), (ins GPR32Opnd:$rt, GPR32Opnd:$rt2, mem_simm12:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], II_SWP, FrmI, opstr> {
let DecoderMethod = "DecodeMemMMImm12";
@@ -248,7 +247,7 @@ class StorePairMM<string opstr, ComplexPattern Addr = addr>
let AsmMatchConverter = "ConvertXWPOperands";
}
-class LoadPairMM<string opstr, ComplexPattern Addr = addr>
+class LoadPairMM<string opstr>
: InstSE<(outs GPR32Opnd:$rt, GPR32Opnd:$rt2), (ins mem_simm12:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], II_LWP, FrmI, opstr> {
let DecoderMethod = "DecodeMemMMImm12";
@@ -332,7 +331,7 @@ class ShiftIMM16<string opstr, Operand ImmOpnd, RegisterOperand RO,
MicroMipsInst16<(outs RO:$rd), (ins RO:$rt, ImmOpnd:$shamt),
!strconcat(opstr, "\t$rd, $rt, $shamt"), [], Itin, FrmR>;
-class LoadMM16<string opstr, DAGOperand RO, SDPatternOperator OpNode,
+class LoadMM16<string opstr, DAGOperand RO,
InstrItinClass Itin, Operand MemOpnd> :
MicroMipsInst16<(outs RO:$rt), (ins MemOpnd:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI> {
@@ -341,8 +340,7 @@ class LoadMM16<string opstr, DAGOperand RO, SDPatternOperator OpNode,
let mayLoad = 1;
}
-class StoreMM16<string opstr, DAGOperand RTOpnd, DAGOperand RO,
- SDPatternOperator OpNode, InstrItinClass Itin,
+class StoreMM16<string opstr, DAGOperand RTOpnd, InstrItinClass Itin,
Operand MemOpnd> :
MicroMipsInst16<(outs), (ins RTOpnd:$rt, MemOpnd:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI> {
@@ -499,8 +497,7 @@ let isCall = 1, hasDelaySlot = 1, Defs = [RA] in {
!strconcat(opstr, "\t$rs, $offset"), [], II_BCCZALS, FrmI, opstr>;
}
-class LoadWordIndexedScaledMM<string opstr, RegisterOperand RO,
- SDPatternOperator OpNode = null_frag> :
+class LoadWordIndexedScaledMM<string opstr, RegisterOperand RO> :
InstSE<(outs RO:$rd), (ins PtrRC:$base, PtrRC:$index),
!strconcat(opstr, "\t$rd, ${index}(${base})"), [], II_LWXS, FrmFI>;
@@ -540,34 +537,28 @@ def reglist16 : Operand<i32> {
let ParserMatchClass = RegList16AsmOperand;
}
-class StoreMultMM<string opstr,
- InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+class StoreMultMM<string opstr, InstrItinClass Itin> :
InstSE<(outs), (ins reglist:$rt, mem_mm_12:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI, opstr> {
let DecoderMethod = "DecodeMemMMImm12";
let mayStore = 1;
}
-class LoadMultMM<string opstr,
- InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+class LoadMultMM<string opstr, InstrItinClass Itin> :
InstSE<(outs reglist:$rt), (ins mem_mm_12:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI, opstr> {
let DecoderMethod = "DecodeMemMMImm12";
let mayLoad = 1;
}
-class StoreMultMM16<string opstr,
- InstrItinClass Itin = NoItinerary,
- ComplexPattern Addr = addr> :
+class StoreMultMM16<string opstr, InstrItinClass Itin> :
MicroMipsInst16<(outs), (ins reglist16:$rt, mem_mm_4sp:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI> {
let DecoderMethod = "DecodeMemMMReglistImm4Lsl2";
let mayStore = 1;
}
-class LoadMultMM16<string opstr,
- InstrItinClass Itin = NoItinerary,
- ComplexPattern Addr = addr> :
+class LoadMultMM16<string opstr, InstrItinClass Itin> :
MicroMipsInst16<(outs reglist16:$rt), (ins mem_mm_4sp:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI> {
let DecoderMethod = "DecodeMemMMReglistImm4Lsl2";
@@ -636,21 +627,21 @@ let FastISelShouldIgnore = 1 in {
def XOR16_MM : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>,
LOGIC_FM_MM16<0x1>, ISA_MICROMIPS32_NOT_MIPS32R6;
}
-def LBU16_MM : LoadMM16<"lbu16", GPRMM16Opnd, zextloadi8, II_LBU,
- mem_mm_4>, LOAD_STORE_FM_MM16<0x02>, ISA_MICROMIPS;
-def LHU16_MM : LoadMM16<"lhu16", GPRMM16Opnd, zextloadi16, II_LHU,
- mem_mm_4_lsl1>, LOAD_STORE_FM_MM16<0x0a>, ISA_MICROMIPS;
-def LW16_MM : LoadMM16<"lw16", GPRMM16Opnd, load, II_LW, mem_mm_4_lsl2>,
+def LBU16_MM : LoadMM16<"lbu16", GPRMM16Opnd, II_LBU, mem_mm_4>,
+ LOAD_STORE_FM_MM16<0x02>, ISA_MICROMIPS;
+def LHU16_MM : LoadMM16<"lhu16", GPRMM16Opnd, II_LHU, mem_mm_4_lsl1>,
+ LOAD_STORE_FM_MM16<0x0a>, ISA_MICROMIPS;
+def LW16_MM : LoadMM16<"lw16", GPRMM16Opnd, II_LW, mem_mm_4_lsl2>,
LOAD_STORE_FM_MM16<0x1a>, ISA_MICROMIPS;
-def SB16_MM : StoreMM16<"sb16", GPRMM16OpndZero, GPRMM16Opnd, truncstorei8,
- II_SB, mem_mm_4>, LOAD_STORE_FM_MM16<0x22>,
- ISA_MICROMIPS32_NOT_MIPS32R6;
-def SH16_MM : StoreMM16<"sh16", GPRMM16OpndZero, GPRMM16Opnd, truncstorei16,
- II_SH, mem_mm_4_lsl1>,
- LOAD_STORE_FM_MM16<0x2a>, ISA_MICROMIPS32_NOT_MIPS32R6;
-def SW16_MM : StoreMM16<"sw16", GPRMM16OpndZero, GPRMM16Opnd, store, II_SW,
- mem_mm_4_lsl2>, LOAD_STORE_FM_MM16<0x3a>,
- ISA_MICROMIPS32_NOT_MIPS32R6;
+def SB16_MM : StoreMM16<"sb16", GPRMM16OpndZero, II_SB, mem_mm_4>,
+ LOAD_STORE_FM_MM16<0x22>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+def SH16_MM : StoreMM16<"sh16", GPRMM16OpndZero, II_SH, mem_mm_4_lsl1>,
+ LOAD_STORE_FM_MM16<0x2a>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
+def SW16_MM : StoreMM16<"sw16", GPRMM16OpndZero, II_SW, mem_mm_4_lsl2>,
+ LOAD_STORE_FM_MM16<0x3a>,
+ ISA_MICROMIPS32_NOT_MIPS32R6;
def LWGP_MM : LoadGPMM16<"lw", GPRMM16Opnd, II_LW, mem_mm_gp_simm7_lsl2>,
LOAD_GP_FM_MM16<0x19>, ISA_MICROMIPS;
def LWSP_MM : LoadSPMM16<"lw", GPR32Opnd, II_LW, mem_mm_sp_imm5_lsl2>,
@@ -713,9 +704,9 @@ let DecoderNamespace = "MicroMips" in {
POOL32A_CFTC2_FM_MM<0b1101110100>, ISA_MICROMIPS;
/// Compact Branch Instructions
- def BEQZC_MM : CompactBranchMM<"beqzc", brtarget_mm, seteq, GPR32Opnd>,
+ def BEQZC_MM : CompactBranchMM<"beqzc", brtarget_mm, GPR32Opnd>,
COMPACT_BRANCH_FM_MM<0x7>, ISA_MICROMIPS32_NOT_MIPS32R6;
- def BNEZC_MM : CompactBranchMM<"bnezc", brtarget_mm, setne, GPR32Opnd>,
+ def BNEZC_MM : CompactBranchMM<"bnezc", brtarget_mm, GPR32Opnd>,
COMPACT_BRANCH_FM_MM<0x5>, ISA_MICROMIPS32_NOT_MIPS32R6;
/// Arithmetic Instructions (ALU Immediate)
diff --git a/llvm/lib/Target/Mips/Mips16HardFloat.cpp b/llvm/lib/Target/Mips/Mips16HardFloat.cpp
index 6c5f63804d19..203e05dde7ad 100644
--- a/llvm/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/llvm/lib/Target/Mips/Mips16HardFloat.cpp
@@ -408,12 +408,9 @@ static bool fixupFPReturnAndCall(Function &F, Module *M,
// during call setup, the proper call lowering to the helper
// functions will take place.
//
- A = A.addAttribute(C, AttributeList::FunctionIndex,
- "__Mips16RetHelper");
- A = A.addAttribute(C, AttributeList::FunctionIndex,
- Attribute::ReadNone);
- A = A.addAttribute(C, AttributeList::FunctionIndex,
- Attribute::NoInline);
+ A = A.addFnAttribute(C, "__Mips16RetHelper");
+ A = A.addFnAttribute(C, Attribute::ReadNone);
+ A = A.addFnAttribute(C, Attribute::NoInline);
FunctionCallee F = (M->getOrInsertFunction(Name, A, MyVoid, T));
CallInst::Create(F, Params, "", &I);
} else if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
@@ -485,11 +482,11 @@ static void removeUseSoftFloat(Function &F) {
AttrBuilder B;
LLVM_DEBUG(errs() << "removing -use-soft-float\n");
B.addAttribute("use-soft-float", "false");
- F.removeAttributes(AttributeList::FunctionIndex, B);
+ F.removeFnAttrs(B);
if (F.hasFnAttribute("use-soft-float")) {
LLVM_DEBUG(errs() << "still has -use-soft-float\n");
}
- F.addAttributes(AttributeList::FunctionIndex, B);
+ F.addFnAttrs(B);
}
// This pass only makes sense when the underlying chip has floating point but
diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.td b/llvm/lib/Target/Mips/Mips16InstrInfo.td
index 990202b23bc0..3410fcd85fdc 100644
--- a/llvm/lib/Target/Mips/Mips16InstrInfo.td
+++ b/llvm/lib/Target/Mips/Mips16InstrInfo.td
@@ -304,14 +304,14 @@ class FI8_MOV32R16_ins<string asmstr, InstrItinClass itin>:
//
// MULT
//
-class FMULT16_ins<string asmstr, InstrItinClass itin> :
+class FMULT16_ins<string asmstr> :
MipsPseudo16<(outs), (ins CPU16Regs:$rx, CPU16Regs:$ry),
!strconcat(asmstr, "\t$rx, $ry"), []>;
//
// MULT-LO
//
-class FMULT16_LO_ins<string asmstr, InstrItinClass itin> :
+class FMULT16_LO_ins<string asmstr> :
MipsPseudo16<(outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
!strconcat(asmstr, "\t$rx, $ry\n\tmflo\t$rz"), []> {
let isCodeGenOnly=1;
@@ -895,13 +895,13 @@ def Mflo16: FRR16_M_ins<0b10010, "mflo", IIM16Alu> {
//
// Pseudo Instruction for mult
//
-def MultRxRy16: FMULT16_ins<"mult", IIM16Alu> {
+def MultRxRy16: FMULT16_ins<"mult"> {
let isCommutable = 1;
let hasSideEffects = 0;
let Defs = [HI0, LO0];
}
-def MultuRxRy16: FMULT16_ins<"multu", IIM16Alu> {
+def MultuRxRy16: FMULT16_ins<"multu"> {
let isCommutable = 1;
let hasSideEffects = 0;
let Defs = [HI0, LO0];
@@ -912,7 +912,7 @@ def MultuRxRy16: FMULT16_ins<"multu", IIM16Alu> {
// Purpose: Multiply Word
// To multiply 32-bit signed integers.
//
-def MultRxRyRz16: FMULT16_LO_ins<"mult", IIM16Alu> {
+def MultRxRyRz16: FMULT16_LO_ins<"mult"> {
let isCommutable = 1;
let hasSideEffects = 0;
let Defs = [HI0, LO0];
@@ -923,7 +923,7 @@ def MultRxRyRz16: FMULT16_LO_ins<"mult", IIM16Alu> {
// Purpose: Multiply Unsigned Word
// To multiply 32-bit unsigned integers.
//
-def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIM16Alu> {
+def MultuRxRyRz16: FMULT16_LO_ins<"multu"> {
let isCommutable = 1;
let hasSideEffects = 0;
let Defs = [HI0, LO0];
diff --git a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
index 9607d008bc97..192d0013d89c 100644
--- a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -700,8 +700,7 @@ class RINT_D_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd, II_RINT_D>;
class CLASS_S_DESC : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd, II_CLASS_S>;
class CLASS_D_DESC : CLASS_RINT_DESC_BASE<"class.d", FGR64Opnd, II_CLASS_D>;
-class CACHE_HINT_DESC<string instr_asm, Operand MemOpnd,
- RegisterOperand GPROpnd, InstrItinClass itin>
+class CACHE_HINT_DESC<string instr_asm, Operand MemOpnd, InstrItinClass itin>
: MipsR6Arch<instr_asm> {
dag OutOperandList = (outs);
dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint);
@@ -711,8 +710,8 @@ class CACHE_HINT_DESC<string instr_asm, Operand MemOpnd,
InstrItinClass Itinerary = itin;
}
-class CACHE_DESC : CACHE_HINT_DESC<"cache", mem_simm9, GPR32Opnd, II_CACHE>;
-class PREF_DESC : CACHE_HINT_DESC<"pref", mem_simm9, GPR32Opnd, II_PREF>;
+class CACHE_DESC : CACHE_HINT_DESC<"cache", mem_simm9, II_CACHE>;
+class PREF_DESC : CACHE_HINT_DESC<"pref", mem_simm9, II_PREF>;
class COP2LD_DESC_BASE<string instr_asm, RegisterOperand COPOpnd,
InstrItinClass itin> {
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index b460bc71b11f..6d3f3adb2b7a 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -52,9 +52,9 @@
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
@@ -1203,7 +1203,7 @@ void MipsAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) {
// LD RA, 8(SP)
// DADDIU SP, SP, 16
//
- OutStreamer->emitCodeAlignment(4);
+ OutStreamer->emitCodeAlignment(4, &getSubtargetInfo());
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
OutStreamer->emitLabel(CurSled);
auto Target = OutContext.createTempSymbol();
diff --git a/llvm/lib/Target/Mips/MipsCallLowering.cpp b/llvm/lib/Target/Mips/MipsCallLowering.cpp
index 5c2549ee176b..f6ec34c7f403 100644
--- a/llvm/lib/Target/Mips/MipsCallLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsCallLowering.cpp
@@ -24,6 +24,7 @@ using namespace llvm;
MipsCallLowering::MipsCallLowering(const MipsTargetLowering &TLI)
: CallLowering(&TLI) {}
+namespace {
struct MipsOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner {
/// This is the name of the function being called
/// FIXME: Relying on this is unsound
@@ -80,7 +81,6 @@ struct MipsIncomingValueAssigner : public CallLowering::IncomingValueAssigner {
}
};
-namespace {
class MipsIncomingValueHandler : public CallLowering::IncomingValueHandler {
const MipsSubtarget &STI;
@@ -92,7 +92,7 @@ public:
private:
void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign &VA) override;
+ CCValAssign VA) override;
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO,
@@ -101,7 +101,8 @@ private:
MachinePointerInfo &MPO, CCValAssign &VA) override;
unsigned assignCustomValue(CallLowering::ArgInfo &Arg,
- ArrayRef<CCValAssign> VAs) override;
+ ArrayRef<CCValAssign> VAs,
+ std::function<void()> *Thunk = nullptr) override;
virtual void markPhysRegUsed(unsigned PhysReg) {
MIRBuilder.getMRI()->addLiveIn(PhysReg);
@@ -127,7 +128,7 @@ private:
void MipsIncomingValueHandler::assignValueToReg(Register ValVReg,
Register PhysReg,
- CCValAssign &VA) {
+ CCValAssign VA) {
markPhysRegUsed(PhysReg);
IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
}
@@ -163,7 +164,8 @@ void MipsIncomingValueHandler::assignValueToAddress(Register ValVReg,
/// dependent on other arguments.
unsigned
MipsIncomingValueHandler::assignCustomValue(CallLowering::ArgInfo &Arg,
- ArrayRef<CCValAssign> VAs) {
+ ArrayRef<CCValAssign> VAs,
+ std::function<void()> *Thunk) {
const CCValAssign &VALo = VAs[0];
const CCValAssign &VAHi = VAs[1];
@@ -197,7 +199,7 @@ public:
private:
void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign &VA) override;
+ CCValAssign VA) override;
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO,
@@ -206,7 +208,8 @@ private:
void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
MachinePointerInfo &MPO, CCValAssign &VA) override;
unsigned assignCustomValue(CallLowering::ArgInfo &Arg,
- ArrayRef<CCValAssign> VAs) override;
+ ArrayRef<CCValAssign> VAs,
+ std::function<void()> *Thunk) override;
MachineInstrBuilder &MIB;
};
@@ -214,7 +217,7 @@ private:
void MipsOutgoingValueHandler::assignValueToReg(Register ValVReg,
Register PhysReg,
- CCValAssign &VA) {
+ CCValAssign VA) {
Register ExtReg = extendRegister(ValVReg, VA);
MIRBuilder.buildCopy(PhysReg, ExtReg);
MIB.addUse(PhysReg, RegState::Implicit);
@@ -253,7 +256,8 @@ void MipsOutgoingValueHandler::assignValueToAddress(Register ValVReg,
unsigned
MipsOutgoingValueHandler::assignCustomValue(CallLowering::ArgInfo &Arg,
- ArrayRef<CCValAssign> VAs) {
+ ArrayRef<CCValAssign> VAs,
+ std::function<void()> *Thunk) {
const CCValAssign &VALo = VAs[0];
const CCValAssign &VAHi = VAs[1];
@@ -271,6 +275,15 @@ MipsOutgoingValueHandler::assignCustomValue(CallLowering::ArgInfo &Arg,
if (!STI.isLittle())
std::swap(Lo, Hi);
+ // If we can return a thunk, just include the register copies. The unmerge can
+ // be emitted earlier.
+ if (Thunk) {
+ *Thunk = [=]() {
+ MIRBuilder.buildCopy(VALo.getLocReg(), Lo);
+ MIRBuilder.buildCopy(VAHi.getLocReg(), Hi);
+ };
+ return 2;
+ }
MIRBuilder.buildCopy(VALo.getLocReg(), Lo);
MIRBuilder.buildCopy(VAHi.getLocReg(), Hi);
return 2;
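The new Thunk parameter above lets assignCustomValue defer the two physical-register copies until after the unmerge has been emitted. A generic sketch of that deferred-emission pattern, with hypothetical names and plain standard C++ rather than the actual CallLowering API:

#include <functional>
#include <iostream>

// Hypothetical stand-in for a handler that either emits its copies in place or
// hands them back to the caller as a thunk to run later.
static unsigned assignPair(std::function<void()> *Thunk) {
  auto EmitCopies = [] {
    std::cout << "copy lo half\n";
    std::cout << "copy hi half\n";
  };
  if (Thunk) {
    *Thunk = EmitCopies;  // defer: the caller picks the emission point
    return 2;
  }
  EmitCopies();           // no thunk requested: emit immediately
  return 2;
}

int main() {
  std::function<void()> Thunk;
  unsigned Handled = assignPair(&Thunk);
  std::cout << "emit preparatory instructions (e.g. the unmerge)\n";
  Thunk();                // now run the deferred copies
  return Handled == 2 ? 0 : 1;
}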
diff --git a/llvm/lib/Target/Mips/MipsDSPInstrInfo.td b/llvm/lib/Target/Mips/MipsDSPInstrInfo.td
index 727d47d06ad4..dd0b48573ef6 100644
--- a/llvm/lib/Target/Mips/MipsDSPInstrInfo.td
+++ b/llvm/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -398,8 +398,7 @@ class APPEND_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string BaseOpcode = instr_asm;
}
-class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- InstrItinClass itin> {
+class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, InstrItinClass itin> {
dag OutOperandList = (outs GPR32Opnd:$rt);
dag InOperandList = (ins ACC64DSPOpnd:$ac, GPR32Opnd:$shift_rs);
string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
@@ -407,8 +406,7 @@ class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string BaseOpcode = instr_asm;
}
-class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- InstrItinClass itin> {
+class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, InstrItinClass itin> {
dag OutOperandList = (outs GPR32Opnd:$rt);
dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm5:$shift_rs);
string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
@@ -522,7 +520,7 @@ class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO,
bit isMoveReg = 1;
}
-class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> :
+class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode> :
MipsPseudo<(outs GPR32Opnd:$dst), (ins), [(set GPR32Opnd:$dst, (OpNode))]> {
bit hasNoSchedulingInfo = 1;
bit usesCustomInserter = 1;
@@ -891,47 +889,40 @@ class LBUX_DESC : LX_DESC_BASE<"lbux", int_mips_lbux, NoItinerary>;
class BPOSGE32_DESC : BPOSGE32_DESC_BASE<"bposge32", brtarget, NoItinerary>;
// Extr
-class EXTP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extp", MipsEXTP, NoItinerary>,
+class EXTP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extp", NoItinerary>,
Uses<[DSPPos]>, Defs<[DSPEFI]>;
-class EXTPV_DESC : EXTR_W_TY1_R2_DESC_BASE<"extpv", MipsEXTP, NoItinerary>,
+class EXTPV_DESC : EXTR_W_TY1_R2_DESC_BASE<"extpv", NoItinerary>,
Uses<[DSPPos]>, Defs<[DSPEFI]>;
-class EXTPDP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extpdp", MipsEXTPDP, NoItinerary>,
+class EXTPDP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extpdp", NoItinerary>,
Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>;
-class EXTPDPV_DESC : EXTR_W_TY1_R2_DESC_BASE<"extpdpv", MipsEXTPDP,
- NoItinerary>,
+class EXTPDPV_DESC : EXTR_W_TY1_R2_DESC_BASE<"extpdpv", NoItinerary>,
Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>;
-class EXTR_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr.w", MipsEXTR_W, NoItinerary>,
+class EXTR_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr.w", NoItinerary>,
Defs<[DSPOutFlag23]>;
-class EXTRV_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv.w", MipsEXTR_W,
- NoItinerary>, Defs<[DSPOutFlag23]>;
+class EXTRV_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv.w", NoItinerary>,
+ Defs<[DSPOutFlag23]>;
-class EXTR_R_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_r.w", MipsEXTR_R_W,
- NoItinerary>,
+class EXTR_R_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_r.w", NoItinerary>,
Defs<[DSPOutFlag23]>;
-class EXTRV_R_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_r.w", MipsEXTR_R_W,
- NoItinerary>,
+class EXTRV_R_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_r.w", NoItinerary>,
Defs<[DSPOutFlag23]>;
-class EXTR_RS_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_rs.w", MipsEXTR_RS_W,
- NoItinerary>,
+class EXTR_RS_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_rs.w", NoItinerary>,
Defs<[DSPOutFlag23]>;
-class EXTRV_RS_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_rs.w", MipsEXTR_RS_W,
- NoItinerary>,
+class EXTRV_RS_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_rs.w", NoItinerary>,
Defs<[DSPOutFlag23]>;
-class EXTR_S_H_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_s.h", MipsEXTR_S_H,
- NoItinerary>,
+class EXTR_S_H_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_s.h", NoItinerary>,
Defs<[DSPOutFlag23]>;
-class EXTRV_S_H_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_s.h", MipsEXTR_S_H,
- NoItinerary>,
+class EXTRV_S_H_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_s.h", NoItinerary>,
Defs<[DSPOutFlag23]>;
class SHILO_DESC : SHILO_R1_DESC_BASE<"shilo", MipsSHILO>;
@@ -1115,8 +1106,8 @@ class PREPEND_DESC : APPEND_DESC_BASE<"prepend", int_mips_prepend, uimm5,
timmZExt5, NoItinerary>;
// Pseudos.
-def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32,
- NoItinerary>, Uses<[DSPPos]>;
+def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32>,
+ Uses<[DSPPos]>;
// Instruction defs.
// MIPS DSP Rev 1
diff --git a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
index 797d81204305..c2e3d7393a6d 100644
--- a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -401,10 +401,9 @@ void RegDefsUses::setUnallocatableRegs(const MachineFunction &MF) {
void RegDefsUses::addLiveOut(const MachineBasicBlock &MBB,
const MachineBasicBlock &SuccBB) {
- for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(),
- SE = MBB.succ_end(); SI != SE; ++SI)
- if (*SI != &SuccBB)
- for (const auto &LI : (*SI)->liveins())
+ for (const MachineBasicBlock *S : MBB.successors())
+ if (S != &SuccBB)
+ for (const auto &LI : S->liveins())
Uses.set(LI.PhysReg);
}
@@ -839,9 +838,8 @@ bool MipsDelaySlotFiller::searchSuccBBs(MachineBasicBlock &MBB,
auto *Fn = MBB.getParent();
// Iterate over SuccBB's predecessor list.
- for (MachineBasicBlock::pred_iterator PI = SuccBB->pred_begin(),
- PE = SuccBB->pred_end(); PI != PE; ++PI)
- if (!examinePred(**PI, *SuccBB, RegDU, HasMultipleSuccs, BrMap))
+ for (MachineBasicBlock *Pred : SuccBB->predecessors())
+ if (!examinePred(*Pred, *SuccBB, RegDU, HasMultipleSuccs, BrMap))
return false;
// Do not allow moving instructions which have unallocatable register operands
diff --git a/llvm/lib/Target/Mips/MipsEVAInstrInfo.td b/llvm/lib/Target/Mips/MipsEVAInstrInfo.td
index 73cca8cfa5d9..c697dc90c14c 100644
--- a/llvm/lib/Target/Mips/MipsEVAInstrInfo.td
+++ b/llvm/lib/Target/Mips/MipsEVAInstrInfo.td
@@ -70,8 +70,7 @@ class LHuE_DESC : LOAD_EVA_DESC_BASE<"lhue", GPR32Opnd, II_LHUE>;
class LWE_DESC : LOAD_EVA_DESC_BASE<"lwe", GPR32Opnd, II_LWE>;
class STORE_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
- SDPatternOperator OpNode = null_frag,
- InstrItinClass itin = NoItinerary> {
+ InstrItinClass itin> {
dag OutOperandList = (outs);
dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
@@ -82,9 +81,9 @@ class STORE_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
InstrItinClass Itinerary = itin;
}
-class SBE_DESC : STORE_EVA_DESC_BASE<"sbe", GPR32Opnd, null_frag, II_SBE>;
-class SHE_DESC : STORE_EVA_DESC_BASE<"she", GPR32Opnd, null_frag, II_SHE>;
-class SWE_DESC : STORE_EVA_DESC_BASE<"swe", GPR32Opnd, null_frag, II_SWE>;
+class SBE_DESC : STORE_EVA_DESC_BASE<"sbe", GPR32Opnd, II_SBE>;
+class SHE_DESC : STORE_EVA_DESC_BASE<"she", GPR32Opnd, II_SHE>;
+class SWE_DESC : STORE_EVA_DESC_BASE<"swe", GPR32Opnd, II_SWE>;
// Load/Store Left/Right EVA descriptions
class LOAD_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp
index e963185eaeaa..05c1c06ffefe 100644
--- a/llvm/lib/Target/Mips/MipsFastISel.cpp
+++ b/llvm/lib/Target/Mips/MipsFastISel.cpp
@@ -1660,7 +1660,7 @@ bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (!MTI->getLength()->getType()->isIntegerTy(32))
return false;
const char *IntrMemName = isa<MemCpyInst>(II) ? "memcpy" : "memmove";
- return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 1);
+ return lowerCallTo(II, IntrMemName, II->arg_size() - 1);
}
case Intrinsic::memset: {
const MemSetInst *MSI = cast<MemSetInst>(II);
@@ -1669,7 +1669,7 @@ bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
return false;
if (!MSI->getLength()->getType()->isIntegerTy(32))
return false;
- return lowerCallTo(II, "memset", II->getNumArgOperands() - 1);
+ return lowerCallTo(II, "memset", II->arg_size() - 1);
}
}
return false;
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 9399c949a3f2..4f364ef6afc7 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -509,6 +509,9 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
+ setLibcallName(RTLIB::MUL_I128, nullptr);
+ setLibcallName(RTLIB::MULO_I64, nullptr);
+ setLibcallName(RTLIB::MULO_I128, nullptr);
}
setMinFunctionAlignment(Subtarget.isGP64bit() ? Align(8) : Align(4));
@@ -2073,7 +2076,7 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
const MipsTargetObjectFile *TLOF =
static_cast<const MipsTargetObjectFile *>(
getTargetMachine().getObjFileLowering());
- const GlobalObject *GO = GV->getBaseObject();
+ const GlobalObject *GO = GV->getAliaseeObject();
if (GO && TLOF->IsGlobalInSmallSection(GO, getTargetMachine()))
// %gp_rel relocation
return getAddrGPRel(N, SDLoc(N), Ty, DAG, ABI.IsN64());
@@ -3714,7 +3717,7 @@ SDValue MipsTargetLowering::LowerFormalArguments(
LocVT = VA.getValVT();
}
- // sanity check
+ // Only arguments passed on the stack should make it here.
assert(VA.isMemLoc());
// The stack pointer offset is relative to the caller stack frame.
diff --git a/llvm/lib/Target/Mips/MipsInstructionSelector.cpp b/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
index 256fb74c1d6c..6d44ce2ab563 100644
--- a/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
+++ b/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
@@ -145,14 +145,14 @@ bool MipsInstructionSelector::materialize32BitImm(Register DestReg, APInt Imm,
MachineIRBuilder &B) const {
assert(Imm.getBitWidth() == 32 && "Unsupported immediate size.");
// Ori zero extends immediate. Used for values with zeros in high 16 bits.
- if (Imm.getHiBits(16).isNullValue()) {
+ if (Imm.getHiBits(16).isZero()) {
MachineInstr *Inst =
B.buildInstr(Mips::ORi, {DestReg}, {Register(Mips::ZERO)})
.addImm(Imm.getLoBits(16).getLimitedValue());
return constrainSelectedInstRegOperands(*Inst, TII, TRI, RBI);
}
// Lui places immediate in high 16 bits and sets low 16 bits to zero.
- if (Imm.getLoBits(16).isNullValue()) {
+ if (Imm.getLoBits(16).isZero()) {
MachineInstr *Inst = B.buildInstr(Mips::LUi, {DestReg}, {})
.addImm(Imm.getHiBits(16).getLimitedValue());
return constrainSelectedInstRegOperands(*Inst, TII, TRI, RBI);
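The comments above state when a single instruction suffices: ORi when the high 16 bits of the immediate are zero, LUi when the low 16 bits are zero. A short sketch of the same predicates using llvm::APInt, mirroring the checks in the hunk (the sample constants are arbitrary):

#include "llvm/ADT/APInt.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::APInt LowOnly(32, 0x00001234);  // high 16 bits zero -> one ORi
  llvm::APInt HighOnly(32, 0x56780000); // low 16 bits zero  -> one LUi
  llvm::outs() << "ORi candidate: " << (LowOnly.getHiBits(16).isZero() ? "yes" : "no") << "\n";
  llvm::outs() << "LUi candidate: " << (HighOnly.getLoBits(16).isZero() ? "yes" : "no") << "\n";
  // getHiBits(16) shifts the upper half down, giving the value for LUi's immediate field.
  llvm::outs() << "LUi immediate: " << HighOnly.getHiBits(16).getLimitedValue() << "\n"; // 22136 (0x5678)
  return 0;
}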
diff --git a/llvm/lib/Target/Mips/MipsMSAInstrInfo.td b/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
index 301f1c158010..c4abccb24c6f 100644
--- a/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
+++ b/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -1308,8 +1308,8 @@ class MSA_2R_FILL_DESC_BASE<string instr_asm, ValueType VT,
InstrItinClass Itinerary = itin;
}
-class MSA_2R_FILL_PSEUDO_BASE<ValueType VT, SDPatternOperator OpNode,
- RegisterClass RCWD, RegisterClass RCWS = RCWD> :
+class MSA_2R_FILL_PSEUDO_BASE<SDPatternOperator OpNode,
+ RegisterClass RCWD, RegisterClass RCWS> :
MSAPseudo<(outs RCWD:$wd), (ins RCWS:$fs),
[(set RCWD:$wd, (OpNode RCWS:$fs))]> {
let usesCustomInserter = 1;
@@ -2091,10 +2091,8 @@ class FILL_W_DESC : MSA_2R_FILL_DESC_BASE<"fill.w", v4i32, vsplati32,
class FILL_D_DESC : MSA_2R_FILL_DESC_BASE<"fill.d", v2i64, vsplati64,
MSA128DOpnd, GPR64Opnd>;
-class FILL_FW_PSEUDO_DESC : MSA_2R_FILL_PSEUDO_BASE<v4f32, vsplatf32, MSA128W,
- FGR32>;
-class FILL_FD_PSEUDO_DESC : MSA_2R_FILL_PSEUDO_BASE<v2f64, vsplatf64, MSA128D,
- FGR64>;
+class FILL_FW_PSEUDO_DESC : MSA_2R_FILL_PSEUDO_BASE<vsplatf32, MSA128W, FGR32>;
+class FILL_FD_PSEUDO_DESC : MSA_2R_FILL_PSEUDO_BASE<vsplatf64, MSA128D, FGR64>;
class FLOG2_W_DESC : MSA_2RF_DESC_BASE<"flog2.w", flog2, MSA128WOpnd>;
class FLOG2_D_DESC : MSA_2RF_DESC_BASE<"flog2.d", flog2, MSA128DOpnd>;
@@ -3755,8 +3753,7 @@ def : MSABitconvertReverseWInDPat<v4f32, v2f64, MSA128W>;
// Pseudos used to implement BNZ.df, and BZ.df
class MSA_CBRANCH_PSEUDO_DESC_BASE<SDPatternOperator OpNode, ValueType TyNode,
- RegisterClass RCWS,
- InstrItinClass itin = NoItinerary> :
+ RegisterClass RCWS> :
MipsPseudo<(outs GPR32:$dst),
(ins RCWS:$ws),
[(set GPR32:$dst, (OpNode (TyNode RCWS:$ws)))]> {
@@ -3764,27 +3761,22 @@ class MSA_CBRANCH_PSEUDO_DESC_BASE<SDPatternOperator OpNode, ValueType TyNode,
bit hasNoSchedulingInfo = 1;
}
-def SNZ_B_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v16i8,
- MSA128B, NoItinerary>;
-def SNZ_H_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v8i16,
- MSA128H, NoItinerary>;
-def SNZ_W_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v4i32,
- MSA128W, NoItinerary>;
-def SNZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v2i64,
- MSA128D, NoItinerary>;
-def SNZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyNonZero, v16i8,
- MSA128B, NoItinerary>;
-
-def SZ_B_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v16i8,
- MSA128B, NoItinerary>;
-def SZ_H_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v8i16,
- MSA128H, NoItinerary>;
-def SZ_W_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v4i32,
- MSA128W, NoItinerary>;
-def SZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v2i64,
- MSA128D, NoItinerary>;
-def SZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyZero, v16i8,
- MSA128B, NoItinerary>;
+def SNZ_B_PSEUDO
+ : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v16i8, MSA128B>;
+def SNZ_H_PSEUDO
+ : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v8i16, MSA128H>;
+def SNZ_W_PSEUDO
+ : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v4i32, MSA128W>;
+def SNZ_D_PSEUDO
+ : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v2i64, MSA128D>;
+def SNZ_V_PSEUDO
+ : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyNonZero, v16i8, MSA128B>;
+
+def SZ_B_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v16i8, MSA128B>;
+def SZ_H_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v8i16, MSA128H>;
+def SZ_W_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v4i32, MSA128W>;
+def SZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v2i64, MSA128D>;
+def SZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyZero, v16i8, MSA128B>;
// Pseudos used to implement transparent fp16 support.
diff --git a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 7be5fc33a0af..03a545605fe1 100644
--- a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -964,7 +964,7 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
// match the instruction.
case MipsISD::Ins: {
- // Sanity checking for the node operands.
+ // Validating the node operands.
if (Node->getValueType(0) != MVT::i32 && Node->getValueType(0) != MVT::i64)
return false;
@@ -1027,12 +1027,13 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
}
SDNode *Rdhwr =
- CurDAG->getMachineNode(RdhwrOpc, DL, Node->getValueType(0),
+ CurDAG->getMachineNode(RdhwrOpc, DL, Node->getValueType(0), MVT::Glue,
CurDAG->getRegister(Mips::HWR29, MVT::i32),
CurDAG->getTargetConstant(0, DL, MVT::i32));
SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, DestReg,
- SDValue(Rdhwr, 0));
- SDValue ResNode = CurDAG->getCopyFromReg(Chain, DL, DestReg, PtrVT);
+ SDValue(Rdhwr, 0), SDValue(Rdhwr, 1));
+ SDValue ResNode = CurDAG->getCopyFromReg(Chain, DL, DestReg, PtrVT,
+ Chain.getValue(1));
ReplaceNode(Node, ResNode.getNode());
return true;
}
diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index 37d4313cc506..1fe6ab09804b 100644
--- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -569,7 +569,7 @@ static bool isVectorAllOnes(SDValue N) {
// Endianness doesn't matter in this context because we are looking for
// an all-ones value.
if (BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs))
- return SplatValue.isAllOnesValue();
+ return SplatValue.isAllOnes();
return false;
}
@@ -701,7 +701,7 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
// Fold degenerate cases.
if (IsConstantMask) {
- if (Mask.isAllOnesValue())
+ if (Mask.isAllOnes())
return IfSet;
else if (Mask == 0)
return IfClr;
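Aside: the isNullValue()/isAllOnesValue() to isZero()/isAllOnes() renames applied throughout these Mips files are pure spelling changes on APInt predicates. A minimal standalone sketch (not part of the patch; it only assumes llvm/ADT/APInt.h from the same tree is on the include path):

#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  llvm::APInt Zero(32, 0);
  llvm::APInt Ones(32, 0xFFFFFFFFu);
  assert(Zero.isZero());             // formerly isNullValue()
  assert(Ones.isAllOnes());          // formerly isAllOnesValue()
  assert(Zero.getHiBits(16).isZero()); // the pattern used in materialize32BitImm
  return 0;
}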
diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
index 901a4fe4e2ac..26b31cfa9f2a 100644
--- a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -18,9 +18,9 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/Mips/MipsSubtarget.cpp b/llvm/lib/Target/Mips/MipsSubtarget.cpp
index 8bb9d75e9173..c285385a19dd 100644
--- a/llvm/lib/Target/Mips/MipsSubtarget.cpp
+++ b/llvm/lib/Target/Mips/MipsSubtarget.cpp
@@ -12,17 +12,17 @@
#include "MipsSubtarget.h"
#include "Mips.h"
-#include "MipsMachineFunction.h"
-#include "MipsRegisterInfo.h"
-#include "MipsTargetMachine.h"
#include "MipsCallLowering.h"
#include "MipsLegalizerInfo.h"
+#include "MipsMachineFunction.h"
#include "MipsRegisterBankInfo.h"
+#include "MipsRegisterInfo.h"
+#include "MipsTargetMachine.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -78,7 +78,7 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
HasMips3_32(false), HasMips3_32r2(false), HasMips4_32(false),
HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false),
InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false),
- HasDSPR2(false), HasDSPR3(false), AllowMixed16_32(Mixed16_32 | Mips_Os16),
+ HasDSPR2(false), HasDSPR3(false), AllowMixed16_32(Mixed16_32 || Mips_Os16),
Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasSym32(false),
HasEVA(false), DisableMadd4(false), HasMT(false), HasCRC(false),
HasVirt(false), HasGINV(false), UseIndirectJumpsHazard(false),
diff --git a/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/llvm/lib/Target/Mips/MipsTargetMachine.cpp
index 7dd030f73d55..8de3c9fd25bd 100644
--- a/llvm/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/llvm/lib/Target/Mips/MipsTargetMachine.cpp
@@ -34,9 +34,9 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/InitializePasses.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
#include <string>
diff --git a/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp b/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
index 44041987ec76..db5f607bbb4f 100644
--- a/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
+++ b/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/MipsTargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
Target &llvm::getTheMipsTarget() {
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
index d69166feb042..856d03f0b210 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -10,15 +10,15 @@
//
//===----------------------------------------------------------------------===//
+#include "NVPTXMCTargetDesc.h"
#include "NVPTXInstPrinter.h"
#include "NVPTXMCAsmInfo.h"
-#include "NVPTXMCTargetDesc.h"
#include "NVPTXTargetStreamer.h"
#include "TargetInfo/NVPTXTargetInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
index fe335f154703..1cbd650bdf06 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
@@ -26,7 +26,7 @@ NVPTXTargetStreamer::~NVPTXTargetStreamer() = default;
void NVPTXTargetStreamer::outputDwarfFileDirectives() {
for (const std::string &S : DwarfFiles)
- getStreamer().emitRawText(S.data());
+ getStreamer().emitRawText(S);
DwarfFiles.clear();
}
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index 2b0972b8531e..7af927aba64e 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -89,6 +89,12 @@ def PTX71 : SubtargetFeature<"ptx71", "PTXVersion", "71",
"Use PTX version 7.1">;
def PTX72 : SubtargetFeature<"ptx72", "PTXVersion", "72",
"Use PTX version 7.2">;
+def PTX73 : SubtargetFeature<"ptx73", "PTXVersion", "73",
+ "Use PTX version 7.3">;
+def PTX74 : SubtargetFeature<"ptx74", "PTXVersion", "74",
+ "Use PTX version 7.4">;
+def PTX75 : SubtargetFeature<"ptx75", "PTXVersion", "75",
+ "Use PTX version 7.5">;
//===----------------------------------------------------------------------===//
// NVPTX supported processors.
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 38844ff4ddf9..aab6d2034f11 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -71,12 +71,12 @@
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
@@ -417,8 +417,7 @@ bool NVPTXAsmPrinter::isLoopHeaderOfNoUnroll(
// llvm.loop.unroll.disable is marked on the back edges of a loop. Therefore,
// we iterate through each back edge of the loop with header MBB, and check
// whether its metadata contains llvm.loop.unroll.disable.
- for (auto I = MBB.pred_begin(); I != MBB.pred_end(); ++I) {
- const MachineBasicBlock *PMBB = *I;
+ for (const MachineBasicBlock *PMBB : MBB.predecessors()) {
if (LI.getLoopFor(PMBB) != LI.getLoopFor(&MBB)) {
// Edges from other loops to MBB are not back edges.
continue;
@@ -703,7 +702,7 @@ void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) {
for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) {
const Function *F = &*FI;
- if (F->getAttributes().hasFnAttribute("nvptx-libcall-callee")) {
+ if (F->getAttributes().hasFnAttr("nvptx-libcall-callee")) {
emitDeclaration(F, O);
continue;
}
@@ -1457,7 +1456,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
}
}
- if (!PAL.hasParamAttribute(paramIndex, Attribute::ByVal)) {
+ if (!PAL.hasParamAttr(paramIndex, Attribute::ByVal)) {
if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
// Just print .param .align <a> .b8 .param[size];
// <a> = PAL.getparamalignment
@@ -1748,135 +1747,63 @@ void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
llvm_unreachable("Not scalar type found in printScalarConstant()");
}
-// These utility functions assure we get the right sequence of bytes for a given
-// type even for big-endian machines
-template <typename T> static void ConvertIntToBytes(unsigned char *p, T val) {
- int64_t vp = (int64_t)val;
- for (unsigned i = 0; i < sizeof(T); ++i) {
- p[i] = (unsigned char)vp;
- vp >>= 8;
- }
-}
-static void ConvertFloatToBytes(unsigned char *p, float val) {
- int32_t *vp = (int32_t *)&val;
- for (unsigned i = 0; i < sizeof(int32_t); ++i) {
- p[i] = (unsigned char)*vp;
- *vp >>= 8;
- }
-}
-static void ConvertDoubleToBytes(unsigned char *p, double val) {
- int64_t *vp = (int64_t *)&val;
- for (unsigned i = 0; i < sizeof(int64_t); ++i) {
- p[i] = (unsigned char)*vp;
- *vp >>= 8;
- }
-}
-
void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
- AggBuffer *aggBuffer) {
+ AggBuffer *AggBuffer) {
const DataLayout &DL = getDataLayout();
-
+ int AllocSize = DL.getTypeAllocSize(CPV->getType());
if (isa<UndefValue>(CPV) || CPV->isNullValue()) {
- int s = DL.getTypeAllocSize(CPV->getType());
- if (s < Bytes)
- s = Bytes;
- aggBuffer->addZeros(s);
+ // A non-zero Bytes value means the whole Bytes span must be zero-filled;
+ // otherwise, zero-fill only the space allocated for CPV.
+ AggBuffer->addZeros(Bytes ? Bytes : AllocSize);
return;
}
- unsigned char ptr[8];
- switch (CPV->getType()->getTypeID()) {
+ // Helper for filling AggBuffer with APInts.
+ auto AddIntToBuffer = [AggBuffer, Bytes](const APInt &Val) {
+ size_t NumBytes = (Val.getBitWidth() + 7) / 8;
+ SmallVector<unsigned char, 16> Buf(NumBytes);
+ for (unsigned I = 0; I < NumBytes; ++I) {
+ Buf[I] = Val.extractBitsAsZExtValue(8, I * 8);
+ }
+ AggBuffer->addBytes(Buf.data(), NumBytes, Bytes);
+ };
- case Type::IntegerTyID: {
- Type *ETy = CPV->getType();
- if (ETy == Type::getInt8Ty(CPV->getContext())) {
- unsigned char c = (unsigned char)cast<ConstantInt>(CPV)->getZExtValue();
- ConvertIntToBytes<>(ptr, c);
- aggBuffer->addBytes(ptr, 1, Bytes);
- } else if (ETy == Type::getInt16Ty(CPV->getContext())) {
- short int16 = (short)cast<ConstantInt>(CPV)->getZExtValue();
- ConvertIntToBytes<>(ptr, int16);
- aggBuffer->addBytes(ptr, 2, Bytes);
- } else if (ETy == Type::getInt32Ty(CPV->getContext())) {
- if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
- int int32 = (int)(constInt->getZExtValue());
- ConvertIntToBytes<>(ptr, int32);
- aggBuffer->addBytes(ptr, 4, Bytes);
+ switch (CPV->getType()->getTypeID()) {
+ case Type::IntegerTyID:
+ if (const auto CI = dyn_cast<ConstantInt>(CPV)) {
+ AddIntToBuffer(CI->getValue());
+ break;
+ }
+ if (const auto *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+ if (const auto *CI =
+ dyn_cast<ConstantInt>(ConstantFoldConstant(Cexpr, DL))) {
+ AddIntToBuffer(CI->getValue());
break;
- } else if (const auto *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
- if (const auto *constInt = dyn_cast<ConstantInt>(
- ConstantFoldConstant(Cexpr, DL))) {
- int int32 = (int)(constInt->getZExtValue());
- ConvertIntToBytes<>(ptr, int32);
- aggBuffer->addBytes(ptr, 4, Bytes);
- break;
- }
- if (Cexpr->getOpcode() == Instruction::PtrToInt) {
- Value *v = Cexpr->getOperand(0)->stripPointerCasts();
- aggBuffer->addSymbol(v, Cexpr->getOperand(0));
- aggBuffer->addZeros(4);
- break;
- }
}
- llvm_unreachable("unsupported integer const type");
- } else if (ETy == Type::getInt64Ty(CPV->getContext())) {
- if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
- long long int64 = (long long)(constInt->getZExtValue());
- ConvertIntToBytes<>(ptr, int64);
- aggBuffer->addBytes(ptr, 8, Bytes);
+ if (Cexpr->getOpcode() == Instruction::PtrToInt) {
+ Value *V = Cexpr->getOperand(0)->stripPointerCasts();
+ AggBuffer->addSymbol(V, Cexpr->getOperand(0));
+ AggBuffer->addZeros(AllocSize);
break;
- } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
- if (const auto *constInt = dyn_cast<ConstantInt>(
- ConstantFoldConstant(Cexpr, DL))) {
- long long int64 = (long long)(constInt->getZExtValue());
- ConvertIntToBytes<>(ptr, int64);
- aggBuffer->addBytes(ptr, 8, Bytes);
- break;
- }
- if (Cexpr->getOpcode() == Instruction::PtrToInt) {
- Value *v = Cexpr->getOperand(0)->stripPointerCasts();
- aggBuffer->addSymbol(v, Cexpr->getOperand(0));
- aggBuffer->addZeros(8);
- break;
- }
}
- llvm_unreachable("unsupported integer const type");
- } else
- llvm_unreachable("unsupported integer const type");
+ }
+ llvm_unreachable("unsupported integer const type");
break;
- }
+
case Type::HalfTyID:
case Type::FloatTyID:
- case Type::DoubleTyID: {
- const auto *CFP = cast<ConstantFP>(CPV);
- Type *Ty = CFP->getType();
- if (Ty == Type::getHalfTy(CPV->getContext())) {
- APInt API = CFP->getValueAPF().bitcastToAPInt();
- uint16_t float16 = API.getLoBits(16).getZExtValue();
- ConvertIntToBytes<>(ptr, float16);
- aggBuffer->addBytes(ptr, 2, Bytes);
- } else if (Ty == Type::getFloatTy(CPV->getContext())) {
- float float32 = (float) CFP->getValueAPF().convertToFloat();
- ConvertFloatToBytes(ptr, float32);
- aggBuffer->addBytes(ptr, 4, Bytes);
- } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
- double float64 = CFP->getValueAPF().convertToDouble();
- ConvertDoubleToBytes(ptr, float64);
- aggBuffer->addBytes(ptr, 8, Bytes);
- } else {
- llvm_unreachable("unsupported fp const type");
- }
+ case Type::DoubleTyID:
+ AddIntToBuffer(cast<ConstantFP>(CPV)->getValueAPF().bitcastToAPInt());
break;
- }
+
case Type::PointerTyID: {
if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
- aggBuffer->addSymbol(GVar, GVar);
+ AggBuffer->addSymbol(GVar, GVar);
} else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
const Value *v = Cexpr->stripPointerCasts();
- aggBuffer->addSymbol(v, Cexpr);
+ AggBuffer->addSymbol(v, Cexpr);
}
- unsigned int s = DL.getTypeAllocSize(CPV->getType());
- aggBuffer->addZeros(s);
+ AggBuffer->addZeros(AllocSize);
break;
}
@@ -1884,12 +1811,11 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
case Type::FixedVectorTyID:
case Type::StructTyID: {
if (isa<ConstantAggregate>(CPV) || isa<ConstantDataSequential>(CPV)) {
- int ElementSize = DL.getTypeAllocSize(CPV->getType());
- bufferAggregateConstant(CPV, aggBuffer);
- if (Bytes > ElementSize)
- aggBuffer->addZeros(Bytes - ElementSize);
+ bufferAggregateConstant(CPV, AggBuffer);
+ if (Bytes > AllocSize)
+ AggBuffer->addZeros(Bytes - AllocSize);
} else if (isa<ConstantAggregateZero>(CPV))
- aggBuffer->addZeros(Bytes);
+ AggBuffer->addZeros(Bytes);
else
llvm_unreachable("Unexpected Constant type");
break;
@@ -1996,7 +1922,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric)
OS << "Unsupported expression in static initializer: ";
CE->printAsOperand(OS, /*PrintType=*/false,
!MF ? nullptr : MF->getFunction().getParent());
- report_fatal_error(OS.str());
+ report_fatal_error(Twine(OS.str()));
}
case Instruction::AddrSpaceCast: {
@@ -2010,7 +1936,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric)
OS << "Unsupported expression in static initializer: ";
CE->printAsOperand(OS, /*PrintType=*/ false,
!MF ? nullptr : MF->getFunction().getParent());
- report_fatal_error(OS.str());
+ report_fatal_error(Twine(OS.str()));
}
case Instruction::GetElementPtr: {
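Aside: the new AddIntToBuffer lambda replaces the deleted ConvertIntToBytes/ConvertFloatToBytes/ConvertDoubleToBytes helpers with a single APInt-based path. A standalone sketch of the same byte-extraction idiom (not part of the patch; it only assumes the LLVM ADT headers are available):

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include <cstdio>

int main() {
  llvm::APInt Val(32, 0x11223344u);
  size_t NumBytes = (Val.getBitWidth() + 7) / 8;
  llvm::SmallVector<unsigned char, 16> Buf(NumBytes);
  for (unsigned I = 0; I < NumBytes; ++I)
    Buf[I] = Val.extractBitsAsZExtValue(8, I * 8); // byte I, least significant first
  for (unsigned char B : Buf)
    std::printf("%02x ", B); // prints: 44 33 22 11 (little-endian order)
  return 0;
}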
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 5c3a4eb470c1..5d680e731e4a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -106,6 +106,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
EmitGeneric = AP.EmitGeneric;
}
+ // Copy Num bytes from Ptr.
+ // If Bytes > Num, zero-fill up to Bytes.
unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) {
assert((curpos + Num) <= size);
assert((curpos + Bytes) <= size);
diff --git a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 024e51e5f488..1e19ef4116c3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -36,6 +36,9 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF,
MachineInstr *MI = &MBB.front();
MachineRegisterInfo &MR = MF.getRegInfo();
+ const NVPTXRegisterInfo *NRI =
+ MF.getSubtarget<NVPTXSubtarget>().getRegisterInfo();
+
// This instruction really occurs before first instruction
// in the BB, so giving it no debug location.
DebugLoc dl = DebugLoc();
@@ -50,15 +53,15 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF,
(Is64Bit ? NVPTX::cvta_local_yes_64 : NVPTX::cvta_local_yes);
unsigned MovDepotOpcode =
(Is64Bit ? NVPTX::MOV_DEPOT_ADDR_64 : NVPTX::MOV_DEPOT_ADDR);
- if (!MR.use_empty(NVPTX::VRFrame)) {
+ if (!MR.use_empty(NRI->getFrameRegister(MF))) {
// If %SP is not used, do not bother emitting "cvta.local %SP, %SPL".
MI = BuildMI(MBB, MI, dl,
MF.getSubtarget().getInstrInfo()->get(CvtaLocalOpcode),
- NVPTX::VRFrame)
- .addReg(NVPTX::VRFrameLocal);
+ NRI->getFrameRegister(MF))
+ .addReg(NRI->getFrameLocalRegister(MF));
}
BuildMI(MBB, MI, dl, MF.getSubtarget().getInstrInfo()->get(MovDepotOpcode),
- NVPTX::VRFrameLocal)
+ NRI->getFrameLocalRegister(MF))
.addImm(MF.getFunctionNumber());
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index 9078ff8cfb97..a9a5eae42c1d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -74,19 +74,16 @@ bool GenericToNVVM::runOnModule(Module &M) {
// of original global variable and its clone is placed in the GVMap for later
// use.
- for (Module::global_iterator I = M.global_begin(), E = M.global_end();
- I != E;) {
- GlobalVariable *GV = &*I++;
- if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC &&
- !llvm::isTexture(*GV) && !llvm::isSurface(*GV) &&
- !llvm::isSampler(*GV) && !GV->getName().startswith("llvm.")) {
+ for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) {
+ if (GV.getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC &&
+ !llvm::isTexture(GV) && !llvm::isSurface(GV) && !llvm::isSampler(GV) &&
+ !GV.getName().startswith("llvm.")) {
GlobalVariable *NewGV = new GlobalVariable(
- M, GV->getValueType(), GV->isConstant(),
- GV->getLinkage(),
- GV->hasInitializer() ? GV->getInitializer() : nullptr,
- "", GV, GV->getThreadLocalMode(), llvm::ADDRESS_SPACE_GLOBAL);
- NewGV->copyAttributesFrom(GV);
- GVMap[GV] = NewGV;
+ M, GV.getValueType(), GV.isConstant(), GV.getLinkage(),
+ GV.hasInitializer() ? GV.getInitializer() : nullptr, "", &GV,
+ GV.getThreadLocalMode(), llvm::ADDRESS_SPACE_GLOBAL);
+ NewGV->copyAttributesFrom(&GV);
+ GVMap[&GV] = NewGV;
}
}
@@ -215,7 +212,7 @@ Value *GenericToNVVM::remapConstantVectorOrConstantAggregate(
// If any of the elements has been modified, construct the equivalent
// vector or aggregate value with a set instructions and the converted
// elements.
- Value *NewValue = UndefValue::get(C->getType());
+ Value *NewValue = PoisonValue::get(C->getType());
if (isa<ConstantVector>(C)) {
for (unsigned i = 0; i < NumOperands; ++i) {
Value *Idx = ConstantInt::get(Type::getInt32Ty(M->getContext()), i);
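Aside: the range-based rewrite of the globals loop above relies on llvm::make_early_inc_range, which advances the underlying iterator before the loop body runs, so the element just visited can be removed or replaced safely. A standalone sketch (not part of the patch; it assumes llvm/ADT/STLExtras.h is available and uses a std::map purely for illustration):

#include "llvm/ADT/STLExtras.h"
#include <cstdio>
#include <map>

int main() {
  std::map<int, char> M = {{1, 'a'}, {2, 'b'}, {3, 'c'}, {4, 'd'}};
  for (auto &KV : llvm::make_early_inc_range(M)) {
    int Key = KV.first; // copy the key; erasing invalidates KV itself
    if (Key % 2 == 0)
      M.erase(Key); // safe: the range's iterator already points past this node
  }
  for (auto &KV : M)
    std::printf("%d:%c ", KV.first, KV.second); // prints: 1:a 3:c
  return 0;
}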
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 00913e93cfd3..dd4290a605a9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2348,508 +2348,508 @@ bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
switch (N->getOpcode()) {
default: return false;
case NVPTXISD::Tex1DFloatS32:
- Opc = NVPTX::TEX_1D_F32_S32;
+ Opc = NVPTX::TEX_1D_F32_S32_RR;
break;
case NVPTXISD::Tex1DFloatFloat:
- Opc = NVPTX::TEX_1D_F32_F32;
+ Opc = NVPTX::TEX_1D_F32_F32_RR;
break;
case NVPTXISD::Tex1DFloatFloatLevel:
- Opc = NVPTX::TEX_1D_F32_F32_LEVEL;
+ Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR;
break;
case NVPTXISD::Tex1DFloatFloatGrad:
- Opc = NVPTX::TEX_1D_F32_F32_GRAD;
+ Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR;
break;
case NVPTXISD::Tex1DS32S32:
- Opc = NVPTX::TEX_1D_S32_S32;
+ Opc = NVPTX::TEX_1D_S32_S32_RR;
break;
case NVPTXISD::Tex1DS32Float:
- Opc = NVPTX::TEX_1D_S32_F32;
+ Opc = NVPTX::TEX_1D_S32_F32_RR;
break;
case NVPTXISD::Tex1DS32FloatLevel:
- Opc = NVPTX::TEX_1D_S32_F32_LEVEL;
+ Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR;
break;
case NVPTXISD::Tex1DS32FloatGrad:
- Opc = NVPTX::TEX_1D_S32_F32_GRAD;
+ Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR;
break;
case NVPTXISD::Tex1DU32S32:
- Opc = NVPTX::TEX_1D_U32_S32;
+ Opc = NVPTX::TEX_1D_U32_S32_RR;
break;
case NVPTXISD::Tex1DU32Float:
- Opc = NVPTX::TEX_1D_U32_F32;
+ Opc = NVPTX::TEX_1D_U32_F32_RR;
break;
case NVPTXISD::Tex1DU32FloatLevel:
- Opc = NVPTX::TEX_1D_U32_F32_LEVEL;
+ Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR;
break;
case NVPTXISD::Tex1DU32FloatGrad:
- Opc = NVPTX::TEX_1D_U32_F32_GRAD;
+ Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR;
break;
case NVPTXISD::Tex1DArrayFloatS32:
- Opc = NVPTX::TEX_1D_ARRAY_F32_S32;
+ Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR;
break;
case NVPTXISD::Tex1DArrayFloatFloat:
- Opc = NVPTX::TEX_1D_ARRAY_F32_F32;
+ Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR;
break;
case NVPTXISD::Tex1DArrayFloatFloatLevel:
- Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL;
+ Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR;
break;
case NVPTXISD::Tex1DArrayFloatFloatGrad:
- Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD;
+ Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR;
break;
case NVPTXISD::Tex1DArrayS32S32:
- Opc = NVPTX::TEX_1D_ARRAY_S32_S32;
+ Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR;
break;
case NVPTXISD::Tex1DArrayS32Float:
- Opc = NVPTX::TEX_1D_ARRAY_S32_F32;
+ Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR;
break;
case NVPTXISD::Tex1DArrayS32FloatLevel:
- Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL;
+ Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR;
break;
case NVPTXISD::Tex1DArrayS32FloatGrad:
- Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD;
+ Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR;
break;
case NVPTXISD::Tex1DArrayU32S32:
- Opc = NVPTX::TEX_1D_ARRAY_U32_S32;
+ Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR;
break;
case NVPTXISD::Tex1DArrayU32Float:
- Opc = NVPTX::TEX_1D_ARRAY_U32_F32;
+ Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR;
break;
case NVPTXISD::Tex1DArrayU32FloatLevel:
- Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL;
+ Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR;
break;
case NVPTXISD::Tex1DArrayU32FloatGrad:
- Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD;
+ Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR;
break;
case NVPTXISD::Tex2DFloatS32:
- Opc = NVPTX::TEX_2D_F32_S32;
+ Opc = NVPTX::TEX_2D_F32_S32_RR;
break;
case NVPTXISD::Tex2DFloatFloat:
- Opc = NVPTX::TEX_2D_F32_F32;
+ Opc = NVPTX::TEX_2D_F32_F32_RR;
break;
case NVPTXISD::Tex2DFloatFloatLevel:
- Opc = NVPTX::TEX_2D_F32_F32_LEVEL;
+ Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR;
break;
case NVPTXISD::Tex2DFloatFloatGrad:
- Opc = NVPTX::TEX_2D_F32_F32_GRAD;
+ Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR;
break;
case NVPTXISD::Tex2DS32S32:
- Opc = NVPTX::TEX_2D_S32_S32;
+ Opc = NVPTX::TEX_2D_S32_S32_RR;
break;
case NVPTXISD::Tex2DS32Float:
- Opc = NVPTX::TEX_2D_S32_F32;
+ Opc = NVPTX::TEX_2D_S32_F32_RR;
break;
case NVPTXISD::Tex2DS32FloatLevel:
- Opc = NVPTX::TEX_2D_S32_F32_LEVEL;
+ Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR;
break;
case NVPTXISD::Tex2DS32FloatGrad:
- Opc = NVPTX::TEX_2D_S32_F32_GRAD;
+ Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR;
break;
case NVPTXISD::Tex2DU32S32:
- Opc = NVPTX::TEX_2D_U32_S32;
+ Opc = NVPTX::TEX_2D_U32_S32_RR;
break;
case NVPTXISD::Tex2DU32Float:
- Opc = NVPTX::TEX_2D_U32_F32;
+ Opc = NVPTX::TEX_2D_U32_F32_RR;
break;
case NVPTXISD::Tex2DU32FloatLevel:
- Opc = NVPTX::TEX_2D_U32_F32_LEVEL;
+ Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR;
break;
case NVPTXISD::Tex2DU32FloatGrad:
- Opc = NVPTX::TEX_2D_U32_F32_GRAD;
+ Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR;
break;
case NVPTXISD::Tex2DArrayFloatS32:
- Opc = NVPTX::TEX_2D_ARRAY_F32_S32;
+ Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR;
break;
case NVPTXISD::Tex2DArrayFloatFloat:
- Opc = NVPTX::TEX_2D_ARRAY_F32_F32;
+ Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR;
break;
case NVPTXISD::Tex2DArrayFloatFloatLevel:
- Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL;
+ Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR;
break;
case NVPTXISD::Tex2DArrayFloatFloatGrad:
- Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD;
+ Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR;
break;
case NVPTXISD::Tex2DArrayS32S32:
- Opc = NVPTX::TEX_2D_ARRAY_S32_S32;
+ Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR;
break;
case NVPTXISD::Tex2DArrayS32Float:
- Opc = NVPTX::TEX_2D_ARRAY_S32_F32;
+ Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR;
break;
case NVPTXISD::Tex2DArrayS32FloatLevel:
- Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL;
+ Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR;
break;
case NVPTXISD::Tex2DArrayS32FloatGrad:
- Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD;
+ Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR;
break;
case NVPTXISD::Tex2DArrayU32S32:
- Opc = NVPTX::TEX_2D_ARRAY_U32_S32;
+ Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR;
break;
case NVPTXISD::Tex2DArrayU32Float:
- Opc = NVPTX::TEX_2D_ARRAY_U32_F32;
+ Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR;
break;
case NVPTXISD::Tex2DArrayU32FloatLevel:
- Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL;
+ Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR;
break;
case NVPTXISD::Tex2DArrayU32FloatGrad:
- Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD;
+ Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR;
break;
case NVPTXISD::Tex3DFloatS32:
- Opc = NVPTX::TEX_3D_F32_S32;
+ Opc = NVPTX::TEX_3D_F32_S32_RR;
break;
case NVPTXISD::Tex3DFloatFloat:
- Opc = NVPTX::TEX_3D_F32_F32;
+ Opc = NVPTX::TEX_3D_F32_F32_RR;
break;
case NVPTXISD::Tex3DFloatFloatLevel:
- Opc = NVPTX::TEX_3D_F32_F32_LEVEL;
+ Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR;
break;
case NVPTXISD::Tex3DFloatFloatGrad:
- Opc = NVPTX::TEX_3D_F32_F32_GRAD;
+ Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR;
break;
case NVPTXISD::Tex3DS32S32:
- Opc = NVPTX::TEX_3D_S32_S32;
+ Opc = NVPTX::TEX_3D_S32_S32_RR;
break;
case NVPTXISD::Tex3DS32Float:
- Opc = NVPTX::TEX_3D_S32_F32;
+ Opc = NVPTX::TEX_3D_S32_F32_RR;
break;
case NVPTXISD::Tex3DS32FloatLevel:
- Opc = NVPTX::TEX_3D_S32_F32_LEVEL;
+ Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR;
break;
case NVPTXISD::Tex3DS32FloatGrad:
- Opc = NVPTX::TEX_3D_S32_F32_GRAD;
+ Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR;
break;
case NVPTXISD::Tex3DU32S32:
- Opc = NVPTX::TEX_3D_U32_S32;
+ Opc = NVPTX::TEX_3D_U32_S32_RR;
break;
case NVPTXISD::Tex3DU32Float:
- Opc = NVPTX::TEX_3D_U32_F32;
+ Opc = NVPTX::TEX_3D_U32_F32_RR;
break;
case NVPTXISD::Tex3DU32FloatLevel:
- Opc = NVPTX::TEX_3D_U32_F32_LEVEL;
+ Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR;
break;
case NVPTXISD::Tex3DU32FloatGrad:
- Opc = NVPTX::TEX_3D_U32_F32_GRAD;
+ Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR;
break;
case NVPTXISD::TexCubeFloatFloat:
- Opc = NVPTX::TEX_CUBE_F32_F32;
+ Opc = NVPTX::TEX_CUBE_F32_F32_RR;
break;
case NVPTXISD::TexCubeFloatFloatLevel:
- Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL;
+ Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR;
break;
case NVPTXISD::TexCubeS32Float:
- Opc = NVPTX::TEX_CUBE_S32_F32;
+ Opc = NVPTX::TEX_CUBE_S32_F32_RR;
break;
case NVPTXISD::TexCubeS32FloatLevel:
- Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL;
+ Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR;
break;
case NVPTXISD::TexCubeU32Float:
- Opc = NVPTX::TEX_CUBE_U32_F32;
+ Opc = NVPTX::TEX_CUBE_U32_F32_RR;
break;
case NVPTXISD::TexCubeU32FloatLevel:
- Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL;
+ Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR;
break;
case NVPTXISD::TexCubeArrayFloatFloat:
- Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32;
+ Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR;
break;
case NVPTXISD::TexCubeArrayFloatFloatLevel:
- Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL;
+ Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR;
break;
case NVPTXISD::TexCubeArrayS32Float:
- Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32;
+ Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR;
break;
case NVPTXISD::TexCubeArrayS32FloatLevel:
- Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL;
+ Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR;
break;
case NVPTXISD::TexCubeArrayU32Float:
- Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32;
+ Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR;
break;
case NVPTXISD::TexCubeArrayU32FloatLevel:
- Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL;
+ Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR;
break;
case NVPTXISD::Tld4R2DFloatFloat:
- Opc = NVPTX::TLD4_R_2D_F32_F32;
+ Opc = NVPTX::TLD4_R_2D_F32_F32_RR;
break;
case NVPTXISD::Tld4G2DFloatFloat:
- Opc = NVPTX::TLD4_G_2D_F32_F32;
+ Opc = NVPTX::TLD4_G_2D_F32_F32_RR;
break;
case NVPTXISD::Tld4B2DFloatFloat:
- Opc = NVPTX::TLD4_B_2D_F32_F32;
+ Opc = NVPTX::TLD4_B_2D_F32_F32_RR;
break;
case NVPTXISD::Tld4A2DFloatFloat:
- Opc = NVPTX::TLD4_A_2D_F32_F32;
+ Opc = NVPTX::TLD4_A_2D_F32_F32_RR;
break;
case NVPTXISD::Tld4R2DS64Float:
- Opc = NVPTX::TLD4_R_2D_S32_F32;
+ Opc = NVPTX::TLD4_R_2D_S32_F32_RR;
break;
case NVPTXISD::Tld4G2DS64Float:
- Opc = NVPTX::TLD4_G_2D_S32_F32;
+ Opc = NVPTX::TLD4_G_2D_S32_F32_RR;
break;
case NVPTXISD::Tld4B2DS64Float:
- Opc = NVPTX::TLD4_B_2D_S32_F32;
+ Opc = NVPTX::TLD4_B_2D_S32_F32_RR;
break;
case NVPTXISD::Tld4A2DS64Float:
- Opc = NVPTX::TLD4_A_2D_S32_F32;
+ Opc = NVPTX::TLD4_A_2D_S32_F32_RR;
break;
case NVPTXISD::Tld4R2DU64Float:
- Opc = NVPTX::TLD4_R_2D_U32_F32;
+ Opc = NVPTX::TLD4_R_2D_U32_F32_RR;
break;
case NVPTXISD::Tld4G2DU64Float:
- Opc = NVPTX::TLD4_G_2D_U32_F32;
+ Opc = NVPTX::TLD4_G_2D_U32_F32_RR;
break;
case NVPTXISD::Tld4B2DU64Float:
- Opc = NVPTX::TLD4_B_2D_U32_F32;
+ Opc = NVPTX::TLD4_B_2D_U32_F32_RR;
break;
case NVPTXISD::Tld4A2DU64Float:
- Opc = NVPTX::TLD4_A_2D_U32_F32;
+ Opc = NVPTX::TLD4_A_2D_U32_F32_RR;
break;
case NVPTXISD::TexUnified1DFloatS32:
- Opc = NVPTX::TEX_UNIFIED_1D_F32_S32;
+ Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R;
break;
case NVPTXISD::TexUnified1DFloatFloat:
- Opc = NVPTX::TEX_UNIFIED_1D_F32_F32;
+ Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R;
break;
case NVPTXISD::TexUnified1DFloatFloatLevel:
- Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnified1DFloatFloatGrad:
- Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD;
+ Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R;
break;
case NVPTXISD::TexUnified1DS32S32:
- Opc = NVPTX::TEX_UNIFIED_1D_S32_S32;
+ Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R;
break;
case NVPTXISD::TexUnified1DS32Float:
- Opc = NVPTX::TEX_UNIFIED_1D_S32_F32;
+ Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R;
break;
case NVPTXISD::TexUnified1DS32FloatLevel:
- Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnified1DS32FloatGrad:
- Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD;
+ Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R;
break;
case NVPTXISD::TexUnified1DU32S32:
- Opc = NVPTX::TEX_UNIFIED_1D_U32_S32;
+ Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R;
break;
case NVPTXISD::TexUnified1DU32Float:
- Opc = NVPTX::TEX_UNIFIED_1D_U32_F32;
+ Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R;
break;
case NVPTXISD::TexUnified1DU32FloatLevel:
- Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnified1DU32FloatGrad:
- Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD;
+ Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R;
break;
case NVPTXISD::TexUnified1DArrayFloatS32:
- Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32;
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R;
break;
case NVPTXISD::TexUnified1DArrayFloatFloat:
- Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32;
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R;
break;
case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
- Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
- Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD;
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R;
break;
case NVPTXISD::TexUnified1DArrayS32S32:
- Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32;
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R;
break;
case NVPTXISD::TexUnified1DArrayS32Float:
- Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32;
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R;
break;
case NVPTXISD::TexUnified1DArrayS32FloatLevel:
- Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnified1DArrayS32FloatGrad:
- Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD;
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R;
break;
case NVPTXISD::TexUnified1DArrayU32S32:
- Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32;
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R;
break;
case NVPTXISD::TexUnified1DArrayU32Float:
- Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32;
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R;
break;
case NVPTXISD::TexUnified1DArrayU32FloatLevel:
- Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnified1DArrayU32FloatGrad:
- Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD;
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R;
break;
case NVPTXISD::TexUnified2DFloatS32:
- Opc = NVPTX::TEX_UNIFIED_2D_F32_S32;
+ Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R;
break;
case NVPTXISD::TexUnified2DFloatFloat:
- Opc = NVPTX::TEX_UNIFIED_2D_F32_F32;
+ Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R;
break;
case NVPTXISD::TexUnified2DFloatFloatLevel:
- Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnified2DFloatFloatGrad:
- Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD;
+ Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R;
break;
case NVPTXISD::TexUnified2DS32S32:
- Opc = NVPTX::TEX_UNIFIED_2D_S32_S32;
+ Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R;
break;
case NVPTXISD::TexUnified2DS32Float:
- Opc = NVPTX::TEX_UNIFIED_2D_S32_F32;
+ Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R;
break;
case NVPTXISD::TexUnified2DS32FloatLevel:
- Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnified2DS32FloatGrad:
- Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD;
+ Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R;
break;
case NVPTXISD::TexUnified2DU32S32:
- Opc = NVPTX::TEX_UNIFIED_2D_U32_S32;
+ Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R;
break;
case NVPTXISD::TexUnified2DU32Float:
- Opc = NVPTX::TEX_UNIFIED_2D_U32_F32;
+ Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R;
break;
case NVPTXISD::TexUnified2DU32FloatLevel:
- Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnified2DU32FloatGrad:
- Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD;
+ Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R;
break;
case NVPTXISD::TexUnified2DArrayFloatS32:
- Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32;
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R;
break;
case NVPTXISD::TexUnified2DArrayFloatFloat:
- Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32;
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R;
break;
case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
- Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
- Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD;
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R;
break;
case NVPTXISD::TexUnified2DArrayS32S32:
- Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32;
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R;
break;
case NVPTXISD::TexUnified2DArrayS32Float:
- Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32;
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R;
break;
case NVPTXISD::TexUnified2DArrayS32FloatLevel:
- Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnified2DArrayS32FloatGrad:
- Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD;
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R;
break;
case NVPTXISD::TexUnified2DArrayU32S32:
- Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32;
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R;
break;
case NVPTXISD::TexUnified2DArrayU32Float:
- Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32;
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R;
break;
case NVPTXISD::TexUnified2DArrayU32FloatLevel:
- Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnified2DArrayU32FloatGrad:
- Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD;
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R;
break;
case NVPTXISD::TexUnified3DFloatS32:
- Opc = NVPTX::TEX_UNIFIED_3D_F32_S32;
+ Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R;
break;
case NVPTXISD::TexUnified3DFloatFloat:
- Opc = NVPTX::TEX_UNIFIED_3D_F32_F32;
+ Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R;
break;
case NVPTXISD::TexUnified3DFloatFloatLevel:
- Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnified3DFloatFloatGrad:
- Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD;
+ Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R;
break;
case NVPTXISD::TexUnified3DS32S32:
- Opc = NVPTX::TEX_UNIFIED_3D_S32_S32;
+ Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R;
break;
case NVPTXISD::TexUnified3DS32Float:
- Opc = NVPTX::TEX_UNIFIED_3D_S32_F32;
+ Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R;
break;
case NVPTXISD::TexUnified3DS32FloatLevel:
- Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnified3DS32FloatGrad:
- Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD;
+ Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R;
break;
case NVPTXISD::TexUnified3DU32S32:
- Opc = NVPTX::TEX_UNIFIED_3D_U32_S32;
+ Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R;
break;
case NVPTXISD::TexUnified3DU32Float:
- Opc = NVPTX::TEX_UNIFIED_3D_U32_F32;
+ Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R;
break;
case NVPTXISD::TexUnified3DU32FloatLevel:
- Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnified3DU32FloatGrad:
- Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD;
+ Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R;
break;
case NVPTXISD::TexUnifiedCubeFloatFloat:
- Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32;
+ Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R;
break;
case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
- Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnifiedCubeS32Float:
- Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32;
+ Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R;
break;
case NVPTXISD::TexUnifiedCubeS32FloatLevel:
- Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnifiedCubeU32Float:
- Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32;
+ Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R;
break;
case NVPTXISD::TexUnifiedCubeU32FloatLevel:
- Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
- Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32;
+ Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R;
break;
case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
- Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnifiedCubeArrayS32Float:
- Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32;
+ Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R;
break;
case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
- Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R;
break;
case NVPTXISD::TexUnifiedCubeArrayU32Float:
- Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32;
+ Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R;
break;
case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
- Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL;
+ Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R;
break;
case NVPTXISD::Tld4UnifiedR2DFloatFloat:
- Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32;
+ Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R;
break;
case NVPTXISD::Tld4UnifiedG2DFloatFloat:
- Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32;
+ Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R;
break;
case NVPTXISD::Tld4UnifiedB2DFloatFloat:
- Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32;
+ Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R;
break;
case NVPTXISD::Tld4UnifiedA2DFloatFloat:
- Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32;
+ Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R;
break;
case NVPTXISD::Tld4UnifiedR2DS64Float:
- Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32;
+ Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R;
break;
case NVPTXISD::Tld4UnifiedG2DS64Float:
- Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32;
+ Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R;
break;
case NVPTXISD::Tld4UnifiedB2DS64Float:
- Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32;
+ Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R;
break;
case NVPTXISD::Tld4UnifiedA2DS64Float:
- Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32;
+ Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R;
break;
case NVPTXISD::Tld4UnifiedR2DU64Float:
- Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32;
+ Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R;
break;
case NVPTXISD::Tld4UnifiedG2DU64Float:
- Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32;
+ Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R;
break;
case NVPTXISD::Tld4UnifiedB2DU64Float:
- Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32;
+ Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R;
break;
case NVPTXISD::Tld4UnifiedA2DU64Float:
- Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32;
+ Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R;
break;
}
@@ -2866,499 +2866,499 @@ bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
switch (N->getOpcode()) {
default: return false;
case NVPTXISD::Suld1DI8Clamp:
- Opc = NVPTX::SULD_1D_I8_CLAMP;
+ Opc = NVPTX::SULD_1D_I8_CLAMP_R;
break;
case NVPTXISD::Suld1DI16Clamp:
- Opc = NVPTX::SULD_1D_I16_CLAMP;
+ Opc = NVPTX::SULD_1D_I16_CLAMP_R;
break;
case NVPTXISD::Suld1DI32Clamp:
- Opc = NVPTX::SULD_1D_I32_CLAMP;
+ Opc = NVPTX::SULD_1D_I32_CLAMP_R;
break;
case NVPTXISD::Suld1DI64Clamp:
- Opc = NVPTX::SULD_1D_I64_CLAMP;
+ Opc = NVPTX::SULD_1D_I64_CLAMP_R;
break;
case NVPTXISD::Suld1DV2I8Clamp:
- Opc = NVPTX::SULD_1D_V2I8_CLAMP;
+ Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;
break;
case NVPTXISD::Suld1DV2I16Clamp:
- Opc = NVPTX::SULD_1D_V2I16_CLAMP;
+ Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;
break;
case NVPTXISD::Suld1DV2I32Clamp:
- Opc = NVPTX::SULD_1D_V2I32_CLAMP;
+ Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;
break;
case NVPTXISD::Suld1DV2I64Clamp:
- Opc = NVPTX::SULD_1D_V2I64_CLAMP;
+ Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;
break;
case NVPTXISD::Suld1DV4I8Clamp:
- Opc = NVPTX::SULD_1D_V4I8_CLAMP;
+ Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;
break;
case NVPTXISD::Suld1DV4I16Clamp:
- Opc = NVPTX::SULD_1D_V4I16_CLAMP;
+ Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;
break;
case NVPTXISD::Suld1DV4I32Clamp:
- Opc = NVPTX::SULD_1D_V4I32_CLAMP;
+ Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;
break;
case NVPTXISD::Suld1DArrayI8Clamp:
- Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP;
+ Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;
break;
case NVPTXISD::Suld1DArrayI16Clamp:
- Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP;
+ Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;
break;
case NVPTXISD::Suld1DArrayI32Clamp:
- Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP;
+ Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;
break;
case NVPTXISD::Suld1DArrayI64Clamp:
- Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP;
+ Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;
break;
case NVPTXISD::Suld1DArrayV2I8Clamp:
- Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP;
+ Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;
break;
case NVPTXISD::Suld1DArrayV2I16Clamp:
- Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP;
+ Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;
break;
case NVPTXISD::Suld1DArrayV2I32Clamp:
- Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP;
+ Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;
break;
case NVPTXISD::Suld1DArrayV2I64Clamp:
- Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP;
+ Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;
break;
case NVPTXISD::Suld1DArrayV4I8Clamp:
- Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP;
+ Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;
break;
case NVPTXISD::Suld1DArrayV4I16Clamp:
- Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP;
+ Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;
break;
case NVPTXISD::Suld1DArrayV4I32Clamp:
- Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP;
+ Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;
break;
case NVPTXISD::Suld2DI8Clamp:
- Opc = NVPTX::SULD_2D_I8_CLAMP;
+ Opc = NVPTX::SULD_2D_I8_CLAMP_R;
break;
case NVPTXISD::Suld2DI16Clamp:
- Opc = NVPTX::SULD_2D_I16_CLAMP;
+ Opc = NVPTX::SULD_2D_I16_CLAMP_R;
break;
case NVPTXISD::Suld2DI32Clamp:
- Opc = NVPTX::SULD_2D_I32_CLAMP;
+ Opc = NVPTX::SULD_2D_I32_CLAMP_R;
break;
case NVPTXISD::Suld2DI64Clamp:
- Opc = NVPTX::SULD_2D_I64_CLAMP;
+ Opc = NVPTX::SULD_2D_I64_CLAMP_R;
break;
case NVPTXISD::Suld2DV2I8Clamp:
- Opc = NVPTX::SULD_2D_V2I8_CLAMP;
+ Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;
break;
case NVPTXISD::Suld2DV2I16Clamp:
- Opc = NVPTX::SULD_2D_V2I16_CLAMP;
+ Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;
break;
case NVPTXISD::Suld2DV2I32Clamp:
- Opc = NVPTX::SULD_2D_V2I32_CLAMP;
+ Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;
break;
case NVPTXISD::Suld2DV2I64Clamp:
- Opc = NVPTX::SULD_2D_V2I64_CLAMP;
+ Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;
break;
case NVPTXISD::Suld2DV4I8Clamp:
- Opc = NVPTX::SULD_2D_V4I8_CLAMP;
+ Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;
break;
case NVPTXISD::Suld2DV4I16Clamp:
- Opc = NVPTX::SULD_2D_V4I16_CLAMP;
+ Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;
break;
case NVPTXISD::Suld2DV4I32Clamp:
- Opc = NVPTX::SULD_2D_V4I32_CLAMP;
+ Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;
break;
case NVPTXISD::Suld2DArrayI8Clamp:
- Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP;
+ Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;
break;
case NVPTXISD::Suld2DArrayI16Clamp:
- Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP;
+ Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;
break;
case NVPTXISD::Suld2DArrayI32Clamp:
- Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP;
+ Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;
break;
case NVPTXISD::Suld2DArrayI64Clamp:
- Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP;
+ Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;
break;
case NVPTXISD::Suld2DArrayV2I8Clamp:
- Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP;
+ Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;
break;
case NVPTXISD::Suld2DArrayV2I16Clamp:
- Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP;
+ Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;
break;
case NVPTXISD::Suld2DArrayV2I32Clamp:
- Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP;
+ Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;
break;
case NVPTXISD::Suld2DArrayV2I64Clamp:
- Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP;
+ Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;
break;
case NVPTXISD::Suld2DArrayV4I8Clamp:
- Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP;
+ Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;
break;
case NVPTXISD::Suld2DArrayV4I16Clamp:
- Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP;
+ Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;
break;
case NVPTXISD::Suld2DArrayV4I32Clamp:
- Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP;
+ Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;
break;
case NVPTXISD::Suld3DI8Clamp:
- Opc = NVPTX::SULD_3D_I8_CLAMP;
+ Opc = NVPTX::SULD_3D_I8_CLAMP_R;
break;
case NVPTXISD::Suld3DI16Clamp:
- Opc = NVPTX::SULD_3D_I16_CLAMP;
+ Opc = NVPTX::SULD_3D_I16_CLAMP_R;
break;
case NVPTXISD::Suld3DI32Clamp:
- Opc = NVPTX::SULD_3D_I32_CLAMP;
+ Opc = NVPTX::SULD_3D_I32_CLAMP_R;
break;
case NVPTXISD::Suld3DI64Clamp:
- Opc = NVPTX::SULD_3D_I64_CLAMP;
+ Opc = NVPTX::SULD_3D_I64_CLAMP_R;
break;
case NVPTXISD::Suld3DV2I8Clamp:
- Opc = NVPTX::SULD_3D_V2I8_CLAMP;
+ Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;
break;
case NVPTXISD::Suld3DV2I16Clamp:
- Opc = NVPTX::SULD_3D_V2I16_CLAMP;
+ Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;
break;
case NVPTXISD::Suld3DV2I32Clamp:
- Opc = NVPTX::SULD_3D_V2I32_CLAMP;
+ Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;
break;
case NVPTXISD::Suld3DV2I64Clamp:
- Opc = NVPTX::SULD_3D_V2I64_CLAMP;
+ Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;
break;
case NVPTXISD::Suld3DV4I8Clamp:
- Opc = NVPTX::SULD_3D_V4I8_CLAMP;
+ Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;
break;
case NVPTXISD::Suld3DV4I16Clamp:
- Opc = NVPTX::SULD_3D_V4I16_CLAMP;
+ Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;
break;
case NVPTXISD::Suld3DV4I32Clamp:
- Opc = NVPTX::SULD_3D_V4I32_CLAMP;
+ Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;
break;
case NVPTXISD::Suld1DI8Trap:
- Opc = NVPTX::SULD_1D_I8_TRAP;
+ Opc = NVPTX::SULD_1D_I8_TRAP_R;
break;
case NVPTXISD::Suld1DI16Trap:
- Opc = NVPTX::SULD_1D_I16_TRAP;
+ Opc = NVPTX::SULD_1D_I16_TRAP_R;
break;
case NVPTXISD::Suld1DI32Trap:
- Opc = NVPTX::SULD_1D_I32_TRAP;
+ Opc = NVPTX::SULD_1D_I32_TRAP_R;
break;
case NVPTXISD::Suld1DI64Trap:
- Opc = NVPTX::SULD_1D_I64_TRAP;
+ Opc = NVPTX::SULD_1D_I64_TRAP_R;
break;
case NVPTXISD::Suld1DV2I8Trap:
- Opc = NVPTX::SULD_1D_V2I8_TRAP;
+ Opc = NVPTX::SULD_1D_V2I8_TRAP_R;
break;
case NVPTXISD::Suld1DV2I16Trap:
- Opc = NVPTX::SULD_1D_V2I16_TRAP;
+ Opc = NVPTX::SULD_1D_V2I16_TRAP_R;
break;
case NVPTXISD::Suld1DV2I32Trap:
- Opc = NVPTX::SULD_1D_V2I32_TRAP;
+ Opc = NVPTX::SULD_1D_V2I32_TRAP_R;
break;
case NVPTXISD::Suld1DV2I64Trap:
- Opc = NVPTX::SULD_1D_V2I64_TRAP;
+ Opc = NVPTX::SULD_1D_V2I64_TRAP_R;
break;
case NVPTXISD::Suld1DV4I8Trap:
- Opc = NVPTX::SULD_1D_V4I8_TRAP;
+ Opc = NVPTX::SULD_1D_V4I8_TRAP_R;
break;
case NVPTXISD::Suld1DV4I16Trap:
- Opc = NVPTX::SULD_1D_V4I16_TRAP;
+ Opc = NVPTX::SULD_1D_V4I16_TRAP_R;
break;
case NVPTXISD::Suld1DV4I32Trap:
- Opc = NVPTX::SULD_1D_V4I32_TRAP;
+ Opc = NVPTX::SULD_1D_V4I32_TRAP_R;
break;
case NVPTXISD::Suld1DArrayI8Trap:
- Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP;
+ Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;
break;
case NVPTXISD::Suld1DArrayI16Trap:
- Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP;
+ Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;
break;
case NVPTXISD::Suld1DArrayI32Trap:
- Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP;
+ Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;
break;
case NVPTXISD::Suld1DArrayI64Trap:
- Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP;
+ Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;
break;
case NVPTXISD::Suld1DArrayV2I8Trap:
- Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP;
+ Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;
break;
case NVPTXISD::Suld1DArrayV2I16Trap:
- Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP;
+ Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;
break;
case NVPTXISD::Suld1DArrayV2I32Trap:
- Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP;
+ Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;
break;
case NVPTXISD::Suld1DArrayV2I64Trap:
- Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP;
+ Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;
break;
case NVPTXISD::Suld1DArrayV4I8Trap:
- Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP;
+ Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;
break;
case NVPTXISD::Suld1DArrayV4I16Trap:
- Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP;
+ Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;
break;
case NVPTXISD::Suld1DArrayV4I32Trap:
- Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP;
+ Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;
break;
case NVPTXISD::Suld2DI8Trap:
- Opc = NVPTX::SULD_2D_I8_TRAP;
+ Opc = NVPTX::SULD_2D_I8_TRAP_R;
break;
case NVPTXISD::Suld2DI16Trap:
- Opc = NVPTX::SULD_2D_I16_TRAP;
+ Opc = NVPTX::SULD_2D_I16_TRAP_R;
break;
case NVPTXISD::Suld2DI32Trap:
- Opc = NVPTX::SULD_2D_I32_TRAP;
+ Opc = NVPTX::SULD_2D_I32_TRAP_R;
break;
case NVPTXISD::Suld2DI64Trap:
- Opc = NVPTX::SULD_2D_I64_TRAP;
+ Opc = NVPTX::SULD_2D_I64_TRAP_R;
break;
case NVPTXISD::Suld2DV2I8Trap:
- Opc = NVPTX::SULD_2D_V2I8_TRAP;
+ Opc = NVPTX::SULD_2D_V2I8_TRAP_R;
break;
case NVPTXISD::Suld2DV2I16Trap:
- Opc = NVPTX::SULD_2D_V2I16_TRAP;
+ Opc = NVPTX::SULD_2D_V2I16_TRAP_R;
break;
case NVPTXISD::Suld2DV2I32Trap:
- Opc = NVPTX::SULD_2D_V2I32_TRAP;
+ Opc = NVPTX::SULD_2D_V2I32_TRAP_R;
break;
case NVPTXISD::Suld2DV2I64Trap:
- Opc = NVPTX::SULD_2D_V2I64_TRAP;
+ Opc = NVPTX::SULD_2D_V2I64_TRAP_R;
break;
case NVPTXISD::Suld2DV4I8Trap:
- Opc = NVPTX::SULD_2D_V4I8_TRAP;
+ Opc = NVPTX::SULD_2D_V4I8_TRAP_R;
break;
case NVPTXISD::Suld2DV4I16Trap:
- Opc = NVPTX::SULD_2D_V4I16_TRAP;
+ Opc = NVPTX::SULD_2D_V4I16_TRAP_R;
break;
case NVPTXISD::Suld2DV4I32Trap:
- Opc = NVPTX::SULD_2D_V4I32_TRAP;
+ Opc = NVPTX::SULD_2D_V4I32_TRAP_R;
break;
case NVPTXISD::Suld2DArrayI8Trap:
- Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP;
+ Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;
break;
case NVPTXISD::Suld2DArrayI16Trap:
- Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP;
+ Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;
break;
case NVPTXISD::Suld2DArrayI32Trap:
- Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP;
+ Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;
break;
case NVPTXISD::Suld2DArrayI64Trap:
- Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP;
+ Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;
break;
case NVPTXISD::Suld2DArrayV2I8Trap:
- Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP;
+ Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;
break;
case NVPTXISD::Suld2DArrayV2I16Trap:
- Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP;
+ Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;
break;
case NVPTXISD::Suld2DArrayV2I32Trap:
- Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP;
+ Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;
break;
case NVPTXISD::Suld2DArrayV2I64Trap:
- Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP;
+ Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;
break;
case NVPTXISD::Suld2DArrayV4I8Trap:
- Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP;
+ Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;
break;
case NVPTXISD::Suld2DArrayV4I16Trap:
- Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP;
+ Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;
break;
case NVPTXISD::Suld2DArrayV4I32Trap:
- Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP;
+ Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;
break;
case NVPTXISD::Suld3DI8Trap:
- Opc = NVPTX::SULD_3D_I8_TRAP;
+ Opc = NVPTX::SULD_3D_I8_TRAP_R;
break;
case NVPTXISD::Suld3DI16Trap:
- Opc = NVPTX::SULD_3D_I16_TRAP;
+ Opc = NVPTX::SULD_3D_I16_TRAP_R;
break;
case NVPTXISD::Suld3DI32Trap:
- Opc = NVPTX::SULD_3D_I32_TRAP;
+ Opc = NVPTX::SULD_3D_I32_TRAP_R;
break;
case NVPTXISD::Suld3DI64Trap:
- Opc = NVPTX::SULD_3D_I64_TRAP;
+ Opc = NVPTX::SULD_3D_I64_TRAP_R;
break;
case NVPTXISD::Suld3DV2I8Trap:
- Opc = NVPTX::SULD_3D_V2I8_TRAP;
+ Opc = NVPTX::SULD_3D_V2I8_TRAP_R;
break;
case NVPTXISD::Suld3DV2I16Trap:
- Opc = NVPTX::SULD_3D_V2I16_TRAP;
+ Opc = NVPTX::SULD_3D_V2I16_TRAP_R;
break;
case NVPTXISD::Suld3DV2I32Trap:
- Opc = NVPTX::SULD_3D_V2I32_TRAP;
+ Opc = NVPTX::SULD_3D_V2I32_TRAP_R;
break;
case NVPTXISD::Suld3DV2I64Trap:
- Opc = NVPTX::SULD_3D_V2I64_TRAP;
+ Opc = NVPTX::SULD_3D_V2I64_TRAP_R;
break;
case NVPTXISD::Suld3DV4I8Trap:
- Opc = NVPTX::SULD_3D_V4I8_TRAP;
+ Opc = NVPTX::SULD_3D_V4I8_TRAP_R;
break;
case NVPTXISD::Suld3DV4I16Trap:
- Opc = NVPTX::SULD_3D_V4I16_TRAP;
+ Opc = NVPTX::SULD_3D_V4I16_TRAP_R;
break;
case NVPTXISD::Suld3DV4I32Trap:
- Opc = NVPTX::SULD_3D_V4I32_TRAP;
+ Opc = NVPTX::SULD_3D_V4I32_TRAP_R;
break;
case NVPTXISD::Suld1DI8Zero:
- Opc = NVPTX::SULD_1D_I8_ZERO;
+ Opc = NVPTX::SULD_1D_I8_ZERO_R;
break;
case NVPTXISD::Suld1DI16Zero:
- Opc = NVPTX::SULD_1D_I16_ZERO;
+ Opc = NVPTX::SULD_1D_I16_ZERO_R;
break;
case NVPTXISD::Suld1DI32Zero:
- Opc = NVPTX::SULD_1D_I32_ZERO;
+ Opc = NVPTX::SULD_1D_I32_ZERO_R;
break;
case NVPTXISD::Suld1DI64Zero:
- Opc = NVPTX::SULD_1D_I64_ZERO;
+ Opc = NVPTX::SULD_1D_I64_ZERO_R;
break;
case NVPTXISD::Suld1DV2I8Zero:
- Opc = NVPTX::SULD_1D_V2I8_ZERO;
+ Opc = NVPTX::SULD_1D_V2I8_ZERO_R;
break;
case NVPTXISD::Suld1DV2I16Zero:
- Opc = NVPTX::SULD_1D_V2I16_ZERO;
+ Opc = NVPTX::SULD_1D_V2I16_ZERO_R;
break;
case NVPTXISD::Suld1DV2I32Zero:
- Opc = NVPTX::SULD_1D_V2I32_ZERO;
+ Opc = NVPTX::SULD_1D_V2I32_ZERO_R;
break;
case NVPTXISD::Suld1DV2I64Zero:
- Opc = NVPTX::SULD_1D_V2I64_ZERO;
+ Opc = NVPTX::SULD_1D_V2I64_ZERO_R;
break;
case NVPTXISD::Suld1DV4I8Zero:
- Opc = NVPTX::SULD_1D_V4I8_ZERO;
+ Opc = NVPTX::SULD_1D_V4I8_ZERO_R;
break;
case NVPTXISD::Suld1DV4I16Zero:
- Opc = NVPTX::SULD_1D_V4I16_ZERO;
+ Opc = NVPTX::SULD_1D_V4I16_ZERO_R;
break;
case NVPTXISD::Suld1DV4I32Zero:
- Opc = NVPTX::SULD_1D_V4I32_ZERO;
+ Opc = NVPTX::SULD_1D_V4I32_ZERO_R;
break;
case NVPTXISD::Suld1DArrayI8Zero:
- Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO;
+ Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;
break;
case NVPTXISD::Suld1DArrayI16Zero:
- Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO;
+ Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;
break;
case NVPTXISD::Suld1DArrayI32Zero:
- Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO;
+ Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;
break;
case NVPTXISD::Suld1DArrayI64Zero:
- Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO;
+ Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;
break;
case NVPTXISD::Suld1DArrayV2I8Zero:
- Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO;
+ Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;
break;
case NVPTXISD::Suld1DArrayV2I16Zero:
- Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO;
+ Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;
break;
case NVPTXISD::Suld1DArrayV2I32Zero:
- Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO;
+ Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;
break;
case NVPTXISD::Suld1DArrayV2I64Zero:
- Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO;
+ Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;
break;
case NVPTXISD::Suld1DArrayV4I8Zero:
- Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO;
+ Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;
break;
case NVPTXISD::Suld1DArrayV4I16Zero:
- Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO;
+ Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;
break;
case NVPTXISD::Suld1DArrayV4I32Zero:
- Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO;
+ Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;
break;
case NVPTXISD::Suld2DI8Zero:
- Opc = NVPTX::SULD_2D_I8_ZERO;
+ Opc = NVPTX::SULD_2D_I8_ZERO_R;
break;
case NVPTXISD::Suld2DI16Zero:
- Opc = NVPTX::SULD_2D_I16_ZERO;
+ Opc = NVPTX::SULD_2D_I16_ZERO_R;
break;
case NVPTXISD::Suld2DI32Zero:
- Opc = NVPTX::SULD_2D_I32_ZERO;
+ Opc = NVPTX::SULD_2D_I32_ZERO_R;
break;
case NVPTXISD::Suld2DI64Zero:
- Opc = NVPTX::SULD_2D_I64_ZERO;
+ Opc = NVPTX::SULD_2D_I64_ZERO_R;
break;
case NVPTXISD::Suld2DV2I8Zero:
- Opc = NVPTX::SULD_2D_V2I8_ZERO;
+ Opc = NVPTX::SULD_2D_V2I8_ZERO_R;
break;
case NVPTXISD::Suld2DV2I16Zero:
- Opc = NVPTX::SULD_2D_V2I16_ZERO;
+ Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
break;
case NVPTXISD::Suld2DV2I32Zero:
- Opc = NVPTX::SULD_2D_V2I32_ZERO;
+ Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
break;
case NVPTXISD::Suld2DV2I64Zero:
- Opc = NVPTX::SULD_2D_V2I64_ZERO;
+ Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
break;
case NVPTXISD::Suld2DV4I8Zero:
- Opc = NVPTX::SULD_2D_V4I8_ZERO;
+ Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
break;
case NVPTXISD::Suld2DV4I16Zero:
- Opc = NVPTX::SULD_2D_V4I16_ZERO;
+ Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
break;
case NVPTXISD::Suld2DV4I32Zero:
- Opc = NVPTX::SULD_2D_V4I32_ZERO;
+ Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
break;
case NVPTXISD::Suld2DArrayI8Zero:
- Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO;
+ Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
break;
case NVPTXISD::Suld2DArrayI16Zero:
- Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO;
+ Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
break;
case NVPTXISD::Suld2DArrayI32Zero:
- Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO;
+ Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
break;
case NVPTXISD::Suld2DArrayI64Zero:
- Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO;
+ Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
break;
case NVPTXISD::Suld2DArrayV2I8Zero:
- Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO;
+ Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
break;
case NVPTXISD::Suld2DArrayV2I16Zero:
- Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO;
+ Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
break;
case NVPTXISD::Suld2DArrayV2I32Zero:
- Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO;
+ Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
break;
case NVPTXISD::Suld2DArrayV2I64Zero:
- Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO;
+ Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
break;
case NVPTXISD::Suld2DArrayV4I8Zero:
- Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO;
+ Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
break;
case NVPTXISD::Suld2DArrayV4I16Zero:
- Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO;
+ Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
break;
case NVPTXISD::Suld2DArrayV4I32Zero:
- Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO;
+ Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
break;
case NVPTXISD::Suld3DI8Zero:
- Opc = NVPTX::SULD_3D_I8_ZERO;
+ Opc = NVPTX::SULD_3D_I8_ZERO_R;
break;
case NVPTXISD::Suld3DI16Zero:
- Opc = NVPTX::SULD_3D_I16_ZERO;
+ Opc = NVPTX::SULD_3D_I16_ZERO_R;
break;
case NVPTXISD::Suld3DI32Zero:
- Opc = NVPTX::SULD_3D_I32_ZERO;
+ Opc = NVPTX::SULD_3D_I32_ZERO_R;
break;
case NVPTXISD::Suld3DI64Zero:
- Opc = NVPTX::SULD_3D_I64_ZERO;
+ Opc = NVPTX::SULD_3D_I64_ZERO_R;
break;
case NVPTXISD::Suld3DV2I8Zero:
- Opc = NVPTX::SULD_3D_V2I8_ZERO;
+ Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
break;
case NVPTXISD::Suld3DV2I16Zero:
- Opc = NVPTX::SULD_3D_V2I16_ZERO;
+ Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
break;
case NVPTXISD::Suld3DV2I32Zero:
- Opc = NVPTX::SULD_3D_V2I32_ZERO;
+ Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
break;
case NVPTXISD::Suld3DV2I64Zero:
- Opc = NVPTX::SULD_3D_V2I64_ZERO;
+ Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
break;
case NVPTXISD::Suld3DV4I8Zero:
- Opc = NVPTX::SULD_3D_V4I8_ZERO;
+ Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
break;
case NVPTXISD::Suld3DV4I16Zero:
- Opc = NVPTX::SULD_3D_V4I16_ZERO;
+ Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
break;
case NVPTXISD::Suld3DV4I32Zero:
- Opc = NVPTX::SULD_3D_V4I32_ZERO;
+ Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
break;
}
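
The block above renames every surface-load (SULD) opcode to its `_R` form. Judging by the `_RR`/`_RI`/`_IR`/`_II` texture variants defined later in this patch, the suffix appears to mark the form that takes its handle operand in a register, as opposed to an immediate form. A minimal sketch of that kind of opcode selection, with every enum and function name hypothetical rather than taken from the NVPTX backend:

```cpp
#include <optional>

// Hypothetical opcode enums, loosely modeled on the switch above; the real
// NVPTX opcode tables are generated by TableGen and not reproduced here.
enum class SurfLoadISD { Suld1DI32Trap };
enum class SurfLoadMI { SULD_1D_I32_TRAP_R, SULD_1D_I32_TRAP_I };

// Pick the register ("_R") or immediate ("_I") form of a surface load,
// depending on how the surface-handle operand is materialized.
std::optional<SurfLoadMI> pickSurfLoadOpcode(SurfLoadISD Op, bool HandleIsImm) {
  switch (Op) {
  case SurfLoadISD::Suld1DI32Trap:
    return HandleIsImm ? SurfLoadMI::SULD_1D_I32_TRAP_I
                       : SurfLoadMI::SULD_1D_I32_TRAP_R;
  }
  return std::nullopt; // unknown node kind: let the caller fall back
}
```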
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d4842c953ce7..e2f6b69fc530 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2530,7 +2530,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
// to newly created nodes. The SDNodes for params have to
// appear in the same order as their order of appearance
// in the original function. "idx+1" holds that order.
- if (!PAL.hasParamAttribute(i, Attribute::ByVal)) {
+ if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
bool aggregateIsPacked = false;
if (StructType *STy = dyn_cast<StructType>(Ty))
aggregateIsPacked = STy->isPacked();
@@ -3547,7 +3547,9 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
- case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride: {
+ case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
+ case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
+ case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v4i32;
Info.ptrVal = I.getArgOperand(0);
@@ -3585,7 +3587,9 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
- case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: {
+ case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
+ case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
+ case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i32;
Info.ptrVal = I.getArgOperand(0);
@@ -3679,7 +3683,9 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
- case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: {
+ case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
+ case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
+ case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::v2i32;
Info.ptrVal = I.getArgOperand(0);
@@ -4441,11 +4447,8 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
//
int numUses = 0;
int nonAddCount = 0;
- for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
- UE = N0.getNode()->use_end();
- UI != UE; ++UI) {
+ for (const SDNode *User : N0.getNode()->uses()) {
numUses++;
- SDNode *User = *UI;
if (User->getOpcode() != ISD::FADD)
++nonAddCount;
}
@@ -4471,8 +4474,7 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
opIsLive = true;
if (!opIsLive)
- for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) {
- SDNode *User = *UI;
+ for (const SDNode *User : left->uses()) {
int orderNo3 = User->getIROrder();
if (orderNo3 > orderNo) {
opIsLive = true;
@@ -4481,8 +4483,7 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
}
if (!opIsLive)
- for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) {
- SDNode *User = *UI;
+ for (const SDNode *User : right->uses()) {
int orderNo3 = User->getIROrder();
if (orderNo3 > orderNo) {
opIsLive = true;
diff --git a/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
index a846c2fada26..fc0d5cc6fbfa 100644
--- a/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -148,9 +148,8 @@ void NVPTXImageOptimizer::replaceWith(Instruction *From, ConstantInt *To) {
// We implement "poor man's DCE" here to make sure any code that is no longer
// live is actually unreachable and can be trivially eliminated by the
// unreachable block elimination pass.
- for (CallInst::use_iterator UI = From->use_begin(), UE = From->use_end();
- UI != UE; ++UI) {
- if (BranchInst *BI = dyn_cast<BranchInst>(*UI)) {
+ for (Use &U : From->uses()) {
+ if (BranchInst *BI = dyn_cast<BranchInst>(U)) {
if (BI->isUnconditional()) continue;
BasicBlock *Dest;
if (To->isZero())
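
The two hunks above swap hand-written `use_iterator` loops for range-based `for` over `uses()`, which also lets the body drop the manual `SDNode *User = *UI;` step. A standalone sketch of the same before/after shape, using a hypothetical `Node` type rather than LLVM's `SDNode`/`Use` classes:

```cpp
#include <cstdio>
#include <vector>

// Stand-in for a DAG/IR node with a use list; LLVM's SDNode and Value expose
// the same idea through use_begin()/use_end() and a uses() range.
struct Node {
  int Opcode;
  std::vector<Node *> Users;
  const std::vector<Node *> &uses() const { return Users; }
};

// Old style: explicit iterator pair, user extracted by hand inside the loop.
int countNonAddUsersOld(const Node &N, int AddOpcode) {
  int NonAdd = 0;
  for (std::vector<Node *>::const_iterator UI = N.Users.begin(),
                                           UE = N.Users.end();
       UI != UE; ++UI) {
    const Node *User = *UI;
    if (User->Opcode != AddOpcode)
      ++NonAdd;
  }
  return NonAdd;
}

// New style: range-based for over uses(), as in the hunks above.
int countNonAddUsers(const Node &N, int AddOpcode) {
  int NonAdd = 0;
  for (const Node *User : N.uses())
    if (User->Opcode != AddOpcode)
      ++NonAdd;
  return NonAdd;
}

int main() {
  Node Add{1, {}}, Mul{2, {}}, Root{0, {&Add, &Mul}};
  std::printf("%d\n", countNonAddUsers(Root, /*AddOpcode=*/1)); // prints 1
  (void)countNonAddUsersOld;
  return 0;
}
```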
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index ec0c92ccf5c5..953d95e55f65 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -195,13 +195,12 @@ unsigned NVPTXInstrInfo::insertBranch(MachineBasicBlock &MBB,
if (Cond.empty()) // Unconditional branch
BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(TBB);
else // Conditional branch
- BuildMI(&MBB, DL, get(NVPTX::CBranch)).addReg(Cond[0].getReg())
- .addMBB(TBB);
+ BuildMI(&MBB, DL, get(NVPTX::CBranch)).add(Cond[0]).addMBB(TBB);
return 1;
}
// Two-way Conditional Branch.
- BuildMI(&MBB, DL, get(NVPTX::CBranch)).addReg(Cond[0].getReg()).addMBB(TBB);
+ BuildMI(&MBB, DL, get(NVPTX::CBranch)).add(Cond[0]).addMBB(TBB);
BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB);
return 2;
}
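
Here the conditional-branch builder switches from `.addReg(Cond[0].getReg())` to `.add(Cond[0])`, forwarding the existing operand instead of rebuilding a bare register operand; presumably this keeps whatever flags the operand already carries. A self-contained analogy with hypothetical `Operand`/`InstBuilder` types (not LLVM's `MachineOperand`/`MachineInstrBuilder`):

```cpp
#include <cassert>
#include <vector>

// Toy operand carrying a flag alongside the register number.
struct Operand {
  unsigned Reg = 0;
  bool IsKill = false; // example of state that lives on the operand itself
};

struct InstBuilder {
  std::vector<Operand> Ops;
  // Rebuilds a bare register operand: any flags on the source are lost.
  InstBuilder &addReg(unsigned Reg) {
    Ops.push_back(Operand{Reg, false});
    return *this;
  }
  // Forwards the operand as-is, flags included (what .add() does above).
  InstBuilder &add(const Operand &Op) {
    Ops.push_back(Op);
    return *this;
  }
};

int main() {
  Operand Cond{/*Reg=*/5, /*IsKill=*/true};
  InstBuilder A, B;
  A.addReg(Cond.Reg); // drops IsKill
  B.add(Cond);        // preserves IsKill
  assert(!A.Ops[0].IsKill && B.Ops[0].IsKill);
  return 0;
}
```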
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 4834985b1019..96386af569de 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2247,8 +2247,18 @@ class MoveParamInst<NVPTXRegClass regclass, string asmstr> :
!strconcat("mov", asmstr, " \t$dst, $src;"),
[(set regclass:$dst, (MoveParam regclass:$src))]>;
+class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty,
+ string asmstr> :
+ NVPTXInst<(outs regclass:$dst), (ins srcty:$src),
+ !strconcat("mov", asmstr, " \t$dst, $src;"),
+ [(set regclass:$dst, (MoveParam texternalsym:$src))]>;
+
def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
+
+def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, ".b64">;
+def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, ".b32">;
+
def MoveParamI16 :
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
"cvt.u16.u32 \t$dst, $src;",
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index de4bf2ef3055..511cd875ac55 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1232,7 +1232,7 @@ multiclass F_ATOMIC_2<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
// has 2 operands, neg the second one
multiclass F_ATOMIC_2_NEG_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
- Operand IMMType, list<Predicate> Pred> {
+ list<Predicate> Pred> {
def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
!strconcat(
"{{ \n\t",
@@ -1244,12 +1244,11 @@ multiclass F_ATOMIC_2_NEG_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
Requires<Pred>;
}
multiclass F_ATOMIC_2_NEG<NVPTXRegClass regclass, string SpaceStr,
- string TypeStr, string OpcStr, PatFrag IntOp, Operand IMMType,
- list<Predicate> Pred = []> {
+ string TypeStr, string OpcStr, PatFrag IntOp, list<Predicate> Pred = []> {
defm p32: F_ATOMIC_2_NEG_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
- IntOp, IMMType, Pred> ;
+ IntOp, Pred> ;
defm p64: F_ATOMIC_2_NEG_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
- IntOp, IMMType, Pred> ;
+ IntOp, Pred> ;
}
// has 3 operands
@@ -1357,21 +1356,21 @@ def atomic_load_sub_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_sub_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_SUB_G_32 : F_ATOMIC_2_NEG<Int32Regs, ".global", "32", ".add",
- atomic_load_sub_32_g, i32imm>;
+ atomic_load_sub_32_g>;
defm INT_PTX_ATOM_SUB_G_64 : F_ATOMIC_2_NEG<Int64Regs, ".global", "64", ".add",
- atomic_load_sub_64_g, i64imm>;
+ atomic_load_sub_64_g>;
defm INT_PTX_ATOM_SUB_GEN_32 : F_ATOMIC_2_NEG<Int32Regs, "", "32", ".add",
- atomic_load_sub_32_gen, i32imm>;
+ atomic_load_sub_32_gen>;
defm INT_PTX_ATOM_SUB_GEN_32_USE_G : F_ATOMIC_2_NEG<Int32Regs, ".global", "32",
- ".add", atomic_load_sub_32_gen, i32imm>;
+ ".add", atomic_load_sub_32_gen>;
defm INT_PTX_ATOM_SUB_S_32 : F_ATOMIC_2_NEG<Int32Regs, ".shared", "32", ".add",
- atomic_load_sub_32_s, i32imm>;
+ atomic_load_sub_32_s>;
defm INT_PTX_ATOM_SUB_S_64 : F_ATOMIC_2_NEG<Int64Regs, ".shared", "64", ".add",
- atomic_load_sub_64_s, i64imm>;
+ atomic_load_sub_64_s>;
defm INT_PTX_ATOM_SUB_GEN_64 : F_ATOMIC_2_NEG<Int64Regs, "", "64", ".add",
- atomic_load_sub_64_gen, i64imm>;
+ atomic_load_sub_64_gen>;
defm INT_PTX_ATOM_SUB_GEN_64_USE_G : F_ATOMIC_2_NEG<Int64Regs, ".global", "64",
- ".add", atomic_load_sub_64_gen, i64imm>;
+ ".add", atomic_load_sub_64_gen>;
// atom_swap
@@ -2465,2303 +2464,1563 @@ def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
// texmode_independent
let IsTex = true, IsTexModeUnified = false in {
// Texture fetch instructions using handles
-def TEX_1D_F32_S32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
- "tex.1d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
- []>;
-def TEX_1D_F32_F32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
- "tex.1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
- []>;
-def TEX_1D_F32_F32_LEVEL
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$lod),
- "tex.level.1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x\\}], $lod;",
- []>;
-def TEX_1D_F32_F32_GRAD
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
- Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
- []>;
-def TEX_1D_S32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
- "tex.1d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
- []>;
-def TEX_1D_S32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
- "tex.1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
- []>;
-def TEX_1D_S32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
- Float32Regs:$lod),
- "tex.level.1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x\\}], $lod;",
- []>;
-def TEX_1D_S32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
- Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
- []>;
-def TEX_1D_U32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
- "tex.1d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
- []>;
-def TEX_1D_U32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
- "tex.1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
- []>;
-def TEX_1D_U32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
- Float32Regs:$lod),
- "tex.level.1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x\\}], $lod;",
- []>;
-def TEX_1D_U32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
- Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
- []>;
-def TEX_1D_ARRAY_F32_S32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "tex.a1d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x\\}];",
- []>;
-def TEX_1D_ARRAY_F32_F32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
- "tex.a1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x\\}];",
- []>;
-def TEX_1D_ARRAY_F32_F32_LEVEL
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$lod),
- "tex.level.a1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x\\}], $lod;",
- []>;
-def TEX_1D_ARRAY_F32_F32_GRAD
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.a1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
- []>;
-def TEX_1D_ARRAY_S32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "tex.a1d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x\\}];",
- []>;
-def TEX_1D_ARRAY_S32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
- "tex.a1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x\\}];",
- []>;
-def TEX_1D_ARRAY_S32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$lod),
- "tex.level.a1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x\\}], $lod;",
- []>;
-def TEX_1D_ARRAY_S32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.a1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
- []>;
-def TEX_1D_ARRAY_U32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "tex.a1d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x\\}];",
- []>;
-def TEX_1D_ARRAY_U32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
- "tex.a1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x\\}];",
- []>;
-def TEX_1D_ARRAY_U32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$lod),
- "tex.level.a1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x\\}], $lod;",
- []>;
-def TEX_1D_ARRAY_U32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.a1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
- []>;
-
-def TEX_2D_F32_S32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "tex.2d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y\\}];",
- []>;
-def TEX_2D_F32_F32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tex.2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y\\}];",
- []>;
-def TEX_2D_F32_F32_LEVEL
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$lod),
- "tex.level.2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y\\}], $lod;",
- []>;
-def TEX_2D_F32_F32_GRAD
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$gradx0, Float32Regs:$gradx1,
- Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
- "\\{$grady0, $grady1\\};",
- []>;
-def TEX_2D_S32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "tex.2d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y\\}];",
- []>;
-def TEX_2D_S32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tex.2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y\\}];",
- []>;
-def TEX_2D_S32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$lod),
- "tex.level.2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y\\}], $lod;",
- []>;
-def TEX_2D_S32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$gradx0, Float32Regs:$gradx1,
- Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
- "\\{$grady0, $grady1\\};",
- []>;
-def TEX_2D_U32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "tex.2d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y\\}];",
- []>;
-def TEX_2D_U32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tex.2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y\\}];",
- []>;
-def TEX_2D_U32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$lod),
- "tex.level.2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y\\}], $lod;",
- []>;
-def TEX_2D_U32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$gradx0, Float32Regs:$gradx1,
- Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
- "\\{$grady0, $grady1\\};",
- []>;
+class TEX_1D_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins intype:$x)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ []>;
+
+multiclass TEX_1D<string inst, NVPTXRegClass outtype, NVPTXRegClass intype> {
+ def _RR : TEX_1D_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_1D_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_1D_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_1D_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
+}
-def TEX_2D_ARRAY_F32_S32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
- Int32Regs:$y),
- "tex.a2d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x, $y, $y\\}];",
- []>;
-def TEX_2D_ARRAY_F32_F32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$y),
- "tex.a2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x, $y, $y\\}];",
- []>;
-def TEX_2D_ARRAY_F32_F32_LEVEL
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$y, Float32Regs:$lod),
- "tex.level.a2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
- []>;
-def TEX_2D_ARRAY_F32_F32_GRAD
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1,
- Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.a2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
- "\\{$grady0, $grady1\\};",
- []>;
-def TEX_2D_ARRAY_S32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
- Int32Regs:$y),
- "tex.a2d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x, $y, $y\\}];",
- []>;
-def TEX_2D_ARRAY_S32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$y),
- "tex.a2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x, $y, $y\\}];",
- []>;
-def TEX_2D_ARRAY_S32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$y, Float32Regs:$lod),
- "tex.level.a2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
- []>;
-def TEX_2D_ARRAY_S32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$y,
- Float32Regs:$gradx0, Float32Regs:$gradx1,
- Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.a2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
- "\\{$grady0, $grady1\\};",
- []>;
-def TEX_2D_ARRAY_U32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
- Int32Regs:$y),
- "tex.a2d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x, $y, $y\\}];",
- []>;
-def TEX_2D_ARRAY_U32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$y),
- "tex.a2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x, $y, $y\\}];",
- []>;
-def TEX_2D_ARRAY_U32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$y, Float32Regs:$lod),
- "tex.level.a2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
- []>;
-def TEX_2D_ARRAY_U32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$y,
- Float32Regs:$gradx0, Float32Regs:$gradx1,
- Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.a2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
- "\\{$grady0, $grady1\\};",
- []>;
+defm TEX_1D_F32_S32 : TEX_1D<"tex.1d.v4.f32.s32", Float32Regs, Int32Regs>;
+defm TEX_1D_F32_F32 : TEX_1D<"tex.1d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_1D_S32_S32 : TEX_1D<"tex.1d.v4.s32.s32", Int32Regs, Int32Regs>;
+defm TEX_1D_S32_F32 : TEX_1D<"tex.1d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_1D_U32_S32 : TEX_1D<"tex.1d.v4.u32.s32", Int32Regs, Int32Regs>;
+defm TEX_1D_U32_F32 : TEX_1D<"tex.1d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_1D_LEVEL_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins intype:$x, intype:$lod)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}], $lod;",
+ []>;
+
+multiclass TEX_1D_LEVEL<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _RR : TEX_1D_LEVEL_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_1D_LEVEL_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_1D_LEVEL_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_1D_LEVEL_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
+}
-def TEX_3D_F32_S32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
- Int32Regs:$z),
- "tex.3d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y, $z, $z\\}];",
- []>;
-def TEX_3D_F32_F32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$z),
- "tex.3d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y, $z, $z\\}];",
- []>;
-def TEX_3D_F32_F32_LEVEL
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$z, Float32Regs:$lod),
- "tex.level.3d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
- []>;
-def TEX_3D_F32_F32_GRAD
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$z,
- Float32Regs:$gradx0, Float32Regs:$gradx1,
- Float32Regs:$gradx2, Float32Regs:$grady0,
- Float32Regs:$grady1, Float32Regs:$grady2),
- "tex.grad.3d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y, $z, $z\\}], "
- "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
- "\\{$grady0, $grady1, $grady2, $grady2\\};",
- []>;
-def TEX_3D_S32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
- Int32Regs:$z),
- "tex.3d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y, $z, $z\\}];",
- []>;
-def TEX_3D_S32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$z),
- "tex.3d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y, $z, $z\\}];",
- []>;
-def TEX_3D_S32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$z, Float32Regs:$lod),
- "tex.level.3d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
- []>;
-def TEX_3D_S32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$z,
- Float32Regs:$gradx0, Float32Regs:$gradx1,
- Float32Regs:$gradx2, Float32Regs:$grady0,
- Float32Regs:$grady1, Float32Regs:$grady2),
- "tex.grad.3d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y, $z, $z\\}], "
- "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
- "\\{$grady0, $grady1, $grady2, $grady2\\};",
- []>;
-def TEX_3D_U32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
- Int32Regs:$z),
- "tex.3d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y, $z, $z\\}];",
- []>;
-def TEX_3D_U32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$z),
- "tex.3d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y, $z, $z\\}];",
- []>;
-def TEX_3D_U32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$z, Float32Regs:$lod),
- "tex.level.3d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
- []>;
-def TEX_3D_U32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$z,
- Float32Regs:$gradx0, Float32Regs:$gradx1,
- Float32Regs:$gradx2, Float32Regs:$grady0,
- Float32Regs:$grady1, Float32Regs:$grady2),
- "tex.grad.3d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y, $z, $z\\}], "
- "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
- "\\{$grady0, $grady1, $grady2, $grady2\\};",
- []>;
+defm TEX_1D_F32_F32_LEVEL :
+ TEX_1D_LEVEL<"tex.level.1d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_1D_S32_F32_LEVEL :
+ TEX_1D_LEVEL<"tex.level.1d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_1D_U32_F32_LEVEL :
+ TEX_1D_LEVEL<"tex.level.1d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_1D_GRAD_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins intype:$x, intype:$gradx, intype:$grady)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}],"
+ " \\{$gradx\\}, \\{$grady\\};",
+ []>;
+
+multiclass TEX_1D_GRAD<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _RR : TEX_1D_GRAD_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_1D_GRAD_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_1D_GRAD_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_1D_GRAD_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
+}
-def TEX_CUBE_F32_F32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.cube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y, $z, $z\\}];",
- []>;
-def TEX_CUBE_F32_F32_LEVEL
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
- Float32Regs:$lod),
- "tex.level.cube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
- []>;
-def TEX_CUBE_S32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.cube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y, $z, $z\\}];",
- []>;
-def TEX_CUBE_S32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
- Float32Regs:$lod),
- "tex.level.cube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
- []>;
-def TEX_CUBE_U32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.cube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y, $z, $z\\}];",
- []>;
-def TEX_CUBE_U32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
- Float32Regs:$lod),
- "tex.level.cube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
- []>;
+defm TEX_1D_F32_F32_GRAD
+ : TEX_1D_GRAD<"tex.grad.1d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_1D_S32_F32_GRAD
+ : TEX_1D_GRAD<"tex.grad.1d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_1D_U32_F32_GRAD
+ : TEX_1D_GRAD<"tex.grad.1d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_1D_ARRAY_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins Int32Regs:$l, intype:$x)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$l, $x\\}];",
+ []>;
+
+multiclass TEX_1D_ARRAY<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _RR : TEX_1D_ARRAY_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_1D_ARRAY_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_1D_ARRAY_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_1D_ARRAY_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
+}
-def TEX_CUBE_ARRAY_F32_F32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.acube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x, $y, $z\\}];",
- []>;
-def TEX_CUBE_ARRAY_F32_F32_LEVEL
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
- Float32Regs:$lod),
- "tex.level.acube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
- []>;
-def TEX_CUBE_ARRAY_S32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.acube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x, $y, $z\\}];",
- []>;
-def TEX_CUBE_ARRAY_S32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
- Float32Regs:$lod),
- "tex.level.acube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
- []>;
-def TEX_CUBE_ARRAY_U32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.acube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x, $y, $z\\}];",
- []>;
-def TEX_CUBE_ARRAY_U32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
- Float32Regs:$lod),
- "tex.level.acube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
- []>;
+defm TEX_1D_ARRAY_F32_F32
+ : TEX_1D_ARRAY<"tex.a1d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_1D_ARRAY_F32_S32
+ : TEX_1D_ARRAY<"tex.a1d.v4.f32.s32", Float32Regs, Int32Regs>;
+defm TEX_1D_ARRAY_S32_S32
+ : TEX_1D_ARRAY<"tex.a1d.v4.s32.s32", Int32Regs, Int32Regs>;
+defm TEX_1D_ARRAY_S32_F32
+ : TEX_1D_ARRAY<"tex.a1d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_1D_ARRAY_U32_S32
+ : TEX_1D_ARRAY<"tex.a1d.v4.u32.s32", Int32Regs, Int32Regs>;
+defm TEX_1D_ARRAY_U32_F32
+ : TEX_1D_ARRAY<"tex.a1d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_1D_ARRAY_LEVEL_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins Int32Regs:$l, intype:$x, intype:$lod)),
+ inst # " \t\\{$r, $g, $b, $a\\},"
+ " [$t, $s, \\{$l, $x\\}], $lod;",
+ []>;
+
+multiclass TEX_1D_ARRAY_LEVEL<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _RR : TEX_1D_ARRAY_LEVEL_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_1D_ARRAY_LEVEL_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_1D_ARRAY_LEVEL_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_1D_ARRAY_LEVEL_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
+}
-def TLD4_R_2D_F32_F32
- : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
- Float32Regs:$v2, Float32Regs:$v3),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.r.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, $s, \\{$x, $y\\}];",
- []>;
-def TLD4_G_2D_F32_F32
- : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
- Float32Regs:$v2, Float32Regs:$v3),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.g.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, $s, \\{$x, $y\\}];",
- []>;
-def TLD4_B_2D_F32_F32
- : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
- Float32Regs:$v2, Float32Regs:$v3),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.b.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, $s, \\{$x, $y\\}];",
- []>;
-def TLD4_A_2D_F32_F32
- : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
- Float32Regs:$v2, Float32Regs:$v3),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.a.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, $s, \\{$x, $y\\}];",
- []>;
-def TLD4_R_2D_S32_F32
- : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
- Int32Regs:$v2, Int32Regs:$v3),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.r.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, $s, \\{$x, $y\\}];",
- []>;
-def TLD4_G_2D_S32_F32
- : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
- Int32Regs:$v2, Int32Regs:$v3),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.g.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, $s, \\{$x, $y\\}];",
- []>;
-def TLD4_B_2D_S32_F32
- : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
- Int32Regs:$v2, Int32Regs:$v3),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.b.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, $s, \\{$x, $y\\}];",
- []>;
-def TLD4_A_2D_S32_F32
- : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
- Int32Regs:$v2, Int32Regs:$v3),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.a.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, $s, \\{$x, $y\\}];",
- []>;
-def TLD4_R_2D_U32_F32
- : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
- Int32Regs:$v2, Int32Regs:$v3),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.r.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, $s, \\{$x, $y\\}];",
- []>;
-def TLD4_G_2D_U32_F32
- : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
- Int32Regs:$v2, Int32Regs:$v3),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.g.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, $s, \\{$x, $y\\}];",
- []>;
-def TLD4_B_2D_U32_F32
- : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
- Int32Regs:$v2, Int32Regs:$v3),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.b.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, $s, \\{$x, $y\\}];",
- []>;
-def TLD4_A_2D_U32_F32
- : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
- Int32Regs:$v2, Int32Regs:$v3),
- (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.a.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, $s, \\{$x, $y\\}];",
- []>;
+defm TEX_1D_ARRAY_F32_F32_LEVEL
+ : TEX_1D_ARRAY_LEVEL<"tex.level.a1d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_1D_ARRAY_S32_F32_LEVEL
+ : TEX_1D_ARRAY_LEVEL<"tex.level.a1d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_1D_ARRAY_U32_F32_LEVEL
+ : TEX_1D_ARRAY_LEVEL<"tex.level.a1d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_1D_ARRAY_GRAD_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins Int32Regs:$l, intype:$x,
+ intype:$gradx, intype:$grady)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$l, $x\\}],"
+ " \\{$gradx\\}, \\{$grady\\};",
+ []>;
+
+multiclass TEX_1D_ARRAY_GRAD<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _RR : TEX_1D_ARRAY_GRAD_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_1D_ARRAY_GRAD_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_1D_ARRAY_GRAD_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_1D_ARRAY_GRAD_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
}
+defm TEX_1D_ARRAY_F32_F32_GRAD
+ : TEX_1D_ARRAY_GRAD<"tex.grad.a1d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_1D_ARRAY_S32_F32_GRAD
+ : TEX_1D_ARRAY_GRAD<"tex.grad.a1d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_1D_ARRAY_U32_F32_GRAD
+ : TEX_1D_ARRAY_GRAD<"tex.grad.a1d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_2D_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins intype:$x, intype:$y)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x, $y\\}];",
+ []>;
+
+multiclass TEX_2D<string inst, NVPTXRegClass outtype, NVPTXRegClass intype> {
+ def _RR : TEX_2D_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_2D_base<inst, outtype, intype, (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_2D_base<inst, outtype, intype, (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_2D_base<inst, outtype, intype, (ins i64imm:$t, i64imm:$s)>;
+}
-// texmode_unified
-let IsTex = true, IsTexModeUnified = true in {
-// Texture fetch instructions using handles
-def TEX_UNIFIED_1D_F32_S32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$x),
- "tex.1d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
- []>;
-def TEX_UNIFIED_1D_F32_F32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x),
- "tex.1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
- []>;
-def TEX_UNIFIED_1D_F32_F32_LEVEL
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$lod),
- "tex.level.1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x\\}], $lod;",
- []>;
-def TEX_UNIFIED_1D_F32_F32_GRAD
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x,
- Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
- []>;
-def TEX_UNIFIED_1D_S32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$x),
- "tex.1d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
- []>;
-def TEX_UNIFIED_1D_S32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x),
- "tex.1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
- []>;
-def TEX_UNIFIED_1D_S32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x,
- Float32Regs:$lod),
- "tex.level.1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x\\}], $lod;",
- []>;
-def TEX_UNIFIED_1D_S32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x,
- Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
- []>;
-def TEX_UNIFIED_1D_U32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$x),
- "tex.1d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
- []>;
-def TEX_UNIFIED_1D_U32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x),
- "tex.1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
- []>;
-def TEX_UNIFIED_1D_U32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x,
- Float32Regs:$lod),
- "tex.level.1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x\\}], $lod;",
- []>;
-def TEX_UNIFIED_1D_U32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x,
- Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
- []>;
+defm TEX_2D_F32_F32 : TEX_2D<"tex.2d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_2D_F32_S32 : TEX_2D<"tex.2d.v4.f32.s32", Float32Regs, Int32Regs>;
+defm TEX_2D_S32_S32 : TEX_2D<"tex.2d.v4.s32.s32", Int32Regs, Int32Regs>;
+defm TEX_2D_S32_F32 : TEX_2D<"tex.2d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_2D_U32_S32 : TEX_2D<"tex.2d.v4.u32.s32", Int32Regs, Int32Regs>;
+defm TEX_2D_U32_F32 : TEX_2D<"tex.2d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_2D_LEVEL_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins intype:$x, intype:$y, intype:$lod)),
+ inst # " \t\\{$r, $g, $b, $a\\},"
+ " [$t, $s, \\{$x, $y\\}], $lod;",
+ []>;
+
+multiclass TEX_2D_LEVEL<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _RR : TEX_2D_LEVEL_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_2D_LEVEL_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_2D_LEVEL_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_2D_LEVEL_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
+}
-def TEX_UNIFIED_1D_ARRAY_F32_S32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
- "tex.a1d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x\\}];",
- []>;
-def TEX_UNIFIED_1D_ARRAY_F32_F32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
- "tex.a1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x\\}];",
- []>;
-def TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$lod),
- "tex.level.a1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x\\}], $lod;",
- []>;
-def TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.a1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
- []>;
-def TEX_UNIFIED_1D_ARRAY_S32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
- "tex.a1d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x\\}];",
- []>;
-def TEX_UNIFIED_1D_ARRAY_S32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
- "tex.a1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x\\}];",
- []>;
-def TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$lod),
- "tex.level.a1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x\\}], $lod;",
- []>;
-def TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.a1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
- []>;
-def TEX_UNIFIED_1D_ARRAY_U32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
- "tex.a1d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x\\}];",
- []>;
-def TEX_UNIFIED_1D_ARRAY_U32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
- "tex.a1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x\\}];",
- []>;
-def TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$lod),
- "tex.level.a1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x\\}], $lod;",
- []>;
-def TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.a1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
- []>;
+defm TEX_2D_F32_F32_LEVEL :
+ TEX_2D_LEVEL<"tex.level.2d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_2D_S32_F32_LEVEL :
+ TEX_2D_LEVEL<"tex.level.2d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_2D_U32_F32_LEVEL :
+ TEX_2D_LEVEL<"tex.level.2d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_2D_GRAD_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins intype:$x, intype:$y,
+ intype:$gradx0, intype:$gradx1,
+ intype:$grady0, intype:$grady1)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x, $y\\}],"
+ " \\{$gradx0, $gradx1\\}, \\{$grady0, $grady1\\};",
+ []>;
+
+multiclass TEX_2D_GRAD<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _RR : TEX_2D_GRAD_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_2D_GRAD_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_2D_GRAD_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_2D_GRAD_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
+}
-def TEX_UNIFIED_2D_F32_S32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
- "tex.2d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y\\}];",
- []>;
-def TEX_UNIFIED_2D_F32_F32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tex.2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y\\}];",
- []>;
-def TEX_UNIFIED_2D_F32_F32_LEVEL
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$lod),
- "tex.level.2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y\\}], $lod;",
- []>;
-def TEX_UNIFIED_2D_F32_F32_GRAD
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$gradx0, Float32Regs:$gradx1,
- Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
- "\\{$grady0, $grady1\\};",
- []>;
-def TEX_UNIFIED_2D_S32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
- "tex.2d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y\\}];",
- []>;
-def TEX_UNIFIED_2D_S32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tex.2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y\\}];",
- []>;
-def TEX_UNIFIED_2D_S32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$lod),
- "tex.level.2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y\\}], $lod;",
- []>;
-def TEX_UNIFIED_2D_S32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$gradx0, Float32Regs:$gradx1,
- Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
- "\\{$grady0, $grady1\\};",
- []>;
-def TEX_UNIFIED_2D_U32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
- "tex.2d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y\\}];",
- []>;
-def TEX_UNIFIED_2D_U32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tex.2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y\\}];",
- []>;
-def TEX_UNIFIED_2D_U32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$lod),
- "tex.level.2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y\\}], $lod;",
- []>;
-def TEX_UNIFIED_2D_U32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$gradx0, Float32Regs:$gradx1,
- Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
- "\\{$grady0, $grady1\\};",
- []>;
+defm TEX_2D_F32_F32_GRAD :
+ TEX_2D_GRAD<"tex.grad.2d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_2D_S32_F32_GRAD :
+ TEX_2D_GRAD<"tex.grad.2d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_2D_U32_F32_GRAD :
+ TEX_2D_GRAD<"tex.grad.2d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_2D_ARRAY_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins Int32Regs:$l, intype:$x, intype:$y)),
+ inst # " \t\\{$r, $g, $b, $a\\},"
+ " [$t, $s, \\{$l, $x, $y, $y\\}];",
+ []>;
+
+multiclass TEX_2D_ARRAY<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _RR : TEX_2D_ARRAY_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_2D_ARRAY_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_2D_ARRAY_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_2D_ARRAY_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
+}
-def TEX_UNIFIED_2D_ARRAY_F32_S32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
- Int32Regs:$y),
- "tex.a2d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x, $y, $y\\}];",
- []>;
-def TEX_UNIFIED_2D_ARRAY_F32_F32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$y),
- "tex.a2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x, $y, $y\\}];",
- []>;
-def TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$y, Float32Regs:$lod),
- "tex.level.a2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x, $y, $y\\}], $lod;",
- []>;
-def TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1,
- Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.a2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
- "\\{$grady0, $grady1\\};",
- []>;
-def TEX_UNIFIED_2D_ARRAY_S32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
- Int32Regs:$y),
- "tex.a2d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x, $y, $y\\}];",
- []>;
-def TEX_UNIFIED_2D_ARRAY_S32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$y),
- "tex.a2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x, $y, $y\\}];",
- []>;
-def TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$y, Float32Regs:$lod),
- "tex.level.a2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x, $y, $y\\}], $lod;",
- []>;
-def TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$y,
- Float32Regs:$gradx0, Float32Regs:$gradx1,
- Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.a2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
- "\\{$grady0, $grady1\\};",
- []>;
-def TEX_UNIFIED_2D_ARRAY_U32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
- Int32Regs:$y),
- "tex.a2d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x, $y, $y\\}];",
- []>;
-def TEX_UNIFIED_2D_ARRAY_U32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$y),
- "tex.a2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x, $y, $y\\}];",
- []>;
-def TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$y, Float32Regs:$lod),
- "tex.level.a2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x, $y, $y\\}], $lod;",
- []>;
-def TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
- Float32Regs:$y,
- Float32Regs:$gradx0, Float32Regs:$gradx1,
- Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.a2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
- "\\{$grady0, $grady1\\};",
- []>;
+defm TEX_2D_ARRAY_F32_F32
+ : TEX_2D_ARRAY<"tex.a2d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_2D_ARRAY_F32_S32
+ : TEX_2D_ARRAY<"tex.a2d.v4.f32.s32", Float32Regs, Int32Regs>;
+defm TEX_2D_ARRAY_S32_S32
+ : TEX_2D_ARRAY<"tex.a2d.v4.s32.s32", Int32Regs, Int32Regs>;
+defm TEX_2D_ARRAY_S32_F32
+ : TEX_2D_ARRAY<"tex.a2d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_2D_ARRAY_U32_S32
+ : TEX_2D_ARRAY<"tex.a2d.v4.u32.s32", Int32Regs, Int32Regs>;
+defm TEX_2D_ARRAY_U32_F32
+ : TEX_2D_ARRAY<"tex.a2d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_2D_ARRAY_LEVEL_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins Int32Regs:$l, intype:$x, intype:$y,
+ intype:$lod)),
+ inst # " \t\\{$r, $g, $b, $a\\},"
+ " [$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
+ []>;
+
+multiclass TEX_2D_ARRAY_LEVEL<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _RR : TEX_2D_ARRAY_LEVEL_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_2D_ARRAY_LEVEL_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_2D_ARRAY_LEVEL_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_2D_ARRAY_LEVEL_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
+}
-def TEX_UNIFIED_3D_F32_S32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
- Int32Regs:$z),
- "tex.3d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y, $z, $z\\}];",
- []>;
-def TEX_UNIFIED_3D_F32_F32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$z),
- "tex.3d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y, $z, $z\\}];",
- []>;
-def TEX_UNIFIED_3D_F32_F32_LEVEL
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$z, Float32Regs:$lod),
- "tex.level.3d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y, $z, $z\\}], $lod;",
- []>;
-def TEX_UNIFIED_3D_F32_F32_GRAD
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$z,
- Float32Regs:$gradx0, Float32Regs:$gradx1,
- Float32Regs:$gradx2, Float32Regs:$grady0,
- Float32Regs:$grady1, Float32Regs:$grady2),
- "tex.grad.3d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y, $z, $z\\}], "
- "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
- "\\{$grady0, $grady1, $grady2, $grady2\\};",
- []>;
-def TEX_UNIFIED_3D_S32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
- Int32Regs:$z),
- "tex.3d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y, $z, $z\\}];",
- []>;
-def TEX_UNIFIED_3D_S32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$z),
- "tex.3d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y, $z, $z\\}];",
- []>;
-def TEX_UNIFIED_3D_S32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$z, Float32Regs:$lod),
- "tex.level.3d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y, $z, $z\\}], $lod;",
- []>;
-def TEX_UNIFIED_3D_S32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$z,
- Float32Regs:$gradx0, Float32Regs:$gradx1,
- Float32Regs:$gradx2, Float32Regs:$grady0,
- Float32Regs:$grady1, Float32Regs:$grady2),
- "tex.grad.3d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y, $z, $z\\}], "
- "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
- "\\{$grady0, $grady1, $grady2, $grady2\\};",
- []>;
-def TEX_UNIFIED_3D_U32_S32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
- Int32Regs:$z),
- "tex.3d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y, $z, $z\\}];",
- []>;
-def TEX_UNIFIED_3D_U32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$z),
- "tex.3d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y, $z, $z\\}];",
- []>;
-def TEX_UNIFIED_3D_U32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$z, Float32Regs:$lod),
- "tex.level.3d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y, $z, $z\\}], $lod;",
- []>;
-def TEX_UNIFIED_3D_U32_F32_GRAD
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
- Float32Regs:$z,
- Float32Regs:$gradx0, Float32Regs:$gradx1,
- Float32Regs:$gradx2, Float32Regs:$grady0,
- Float32Regs:$grady1, Float32Regs:$grady2),
- "tex.grad.3d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y, $z, $z\\}], "
- "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
- "\\{$grady0, $grady1, $grady2, $grady2\\};",
- []>;
+defm TEX_2D_ARRAY_F32_F32_LEVEL
+ : TEX_2D_ARRAY_LEVEL<"tex.level.a2d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_2D_ARRAY_S32_F32_LEVEL
+ : TEX_2D_ARRAY_LEVEL<"tex.level.a2d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_2D_ARRAY_U32_F32_LEVEL
+ : TEX_2D_ARRAY_LEVEL<"tex.level.a2d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_2D_ARRAY_GRAD_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins Int32Regs:$l, intype:$x, intype:$y,
+ intype:$gradx0, intype:$gradx1,
+ intype:$grady0, intype:$grady1)),
+ inst # " \t\\{$r, $g, $b, $a\\},"
+ " [$t, $s, \\{$l, $x, $y, $y\\}],"
+ " \\{$gradx0, $gradx1\\}, \\{$grady0, $grady1\\};",
+ []>;
+
+multiclass TEX_2D_ARRAY_GRAD<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _RR : TEX_2D_ARRAY_GRAD_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_2D_ARRAY_GRAD_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_2D_ARRAY_GRAD_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_2D_ARRAY_GRAD_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
+}
-def TEX_UNIFIED_CUBE_F32_F32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.cube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y, $z, $z\\}];",
- []>;
-def TEX_UNIFIED_CUBE_F32_F32_LEVEL
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
- Float32Regs:$lod),
- "tex.level.cube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y, $z, $z\\}], $lod;",
- []>;
-def TEX_UNIFIED_CUBE_S32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.cube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y, $z, $z\\}];",
- []>;
-def TEX_UNIFIED_CUBE_S32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
- Float32Regs:$lod),
- "tex.level.cube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y, $z, $z\\}], $lod;",
- []>;
-def TEX_UNIFIED_CUBE_U32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.cube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y, $z, $z\\}];",
- []>;
-def TEX_UNIFIED_CUBE_U32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
- Float32Regs:$lod),
- "tex.level.cube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$x, $y, $z, $z\\}], $lod;",
- []>;
+defm TEX_2D_ARRAY_F32_F32_GRAD
+ : TEX_2D_ARRAY_GRAD<"tex.grad.a2d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_2D_ARRAY_S32_F32_GRAD
+ : TEX_2D_ARRAY_GRAD<"tex.grad.a2d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_2D_ARRAY_U32_F32_GRAD
+ : TEX_2D_ARRAY_GRAD<"tex.grad.a2d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_3D_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins intype:$x, intype:$y, intype:$z)),
+ inst # " \t\\{$r, $g, $b, $a\\},"
+ " [$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+
+multiclass TEX_3D<string inst, NVPTXRegClass outtype, NVPTXRegClass intype> {
+ def _RR : TEX_3D_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_3D_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_3D_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_3D_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
+}
-def TEX_UNIFIED_CUBE_ARRAY_F32_F32
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.acube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x, $y, $z\\}];",
- []>;
-def TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL
- : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
- Float32Regs:$b, Float32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
- Float32Regs:$lod),
- "tex.level.acube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x, $y, $z\\}], $lod;",
- []>;
-def TEX_UNIFIED_CUBE_ARRAY_S32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.acube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x, $y, $z\\}];",
- []>;
-def TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
- Float32Regs:$lod),
- "tex.level.acube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x, $y, $z\\}], $lod;",
- []>;
-def TEX_UNIFIED_CUBE_ARRAY_U32_F32
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.acube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x, $y, $z\\}];",
- []>;
-def TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$t, Int32Regs:$l,
- Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
- Float32Regs:$lod),
- "tex.level.acube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
- "[$t, \\{$l, $x, $y, $z\\}], $lod;",
- []>;
+defm TEX_3D_F32_F32 : TEX_3D<"tex.3d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_3D_F32_S32 : TEX_3D<"tex.3d.v4.f32.s32", Float32Regs, Int32Regs>;
+defm TEX_3D_S32_S32 : TEX_3D<"tex.3d.v4.s32.s32", Int32Regs, Int32Regs>;
+defm TEX_3D_S32_F32 : TEX_3D<"tex.3d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_3D_U32_S32 : TEX_3D<"tex.3d.v4.u32.s32", Int32Regs, Int32Regs>;
+defm TEX_3D_U32_F32 : TEX_3D<"tex.3d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_3D_LEVEL_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins intype:$x, intype:$y, intype:$z,
+ intype:$lod)),
+ inst # " \t\\{$r, $g, $b, $a\\},"
+ " [$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+
+multiclass TEX_3D_LEVEL<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _RR : TEX_3D_LEVEL_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_3D_LEVEL_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_3D_LEVEL_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_3D_LEVEL_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
+}
-def TLD4_UNIFIED_R_2D_F32_F32
- : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
- Float32Regs:$v2, Float32Regs:$v3),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.r.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, \\{$x, $y\\}];",
- []>;
-def TLD4_UNIFIED_G_2D_F32_F32
- : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
- Float32Regs:$v2, Float32Regs:$v3),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.g.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, \\{$x, $y\\}];",
- []>;
-def TLD4_UNIFIED_B_2D_F32_F32
- : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
- Float32Regs:$v2, Float32Regs:$v3),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.b.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, \\{$x, $y\\}];",
- []>;
-def TLD4_UNIFIED_A_2D_F32_F32
- : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
- Float32Regs:$v2, Float32Regs:$v3),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.a.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, \\{$x, $y\\}];",
- []>;
-def TLD4_UNIFIED_R_2D_S32_F32
- : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
- Int32Regs:$v2, Int32Regs:$v3),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.r.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, \\{$x, $y\\}];",
- []>;
-def TLD4_UNIFIED_G_2D_S32_F32
- : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
- Int32Regs:$v2, Int32Regs:$v3),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.g.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, \\{$x, $y\\}];",
- []>;
-def TLD4_UNIFIED_B_2D_S32_F32
- : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
- Int32Regs:$v2, Int32Regs:$v3),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.b.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, \\{$x, $y\\}];",
- []>;
-def TLD4_UNIFIED_A_2D_S32_F32
- : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
- Int32Regs:$v2, Int32Regs:$v3),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.a.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, \\{$x, $y\\}];",
- []>;
-def TLD4_UNIFIED_R_2D_U32_F32
- : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
- Int32Regs:$v2, Int32Regs:$v3),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.r.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, \\{$x, $y\\}];",
- []>;
-def TLD4_UNIFIED_G_2D_U32_F32
- : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
- Int32Regs:$v2, Int32Regs:$v3),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.g.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, \\{$x, $y\\}];",
- []>;
-def TLD4_UNIFIED_B_2D_U32_F32
- : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
- Int32Regs:$v2, Int32Regs:$v3),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.b.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, \\{$x, $y\\}];",
- []>;
-def TLD4_UNIFIED_A_2D_U32_F32
- : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
- Int32Regs:$v2, Int32Regs:$v3),
- (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.a.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
- "[$t, \\{$x, $y\\}];",
- []>;
+defm TEX_3D_F32_F32_LEVEL
+ : TEX_3D_LEVEL<"tex.level.3d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_3D_S32_F32_LEVEL
+ : TEX_3D_LEVEL<"tex.level.3d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_3D_U32_F32_LEVEL
+ : TEX_3D_LEVEL<"tex.level.3d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_3D_GRAD_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins intype:$x, intype:$y, intype:$z,
+ intype:$gradx0, intype:$gradx1,
+ intype:$gradx2, intype:$grady0,
+ intype:$grady1, intype:$grady2)),
+ inst # " \t\\{$r, $g, $b, $a\\},"
+ " [$t, $s, \\{$x, $y, $z, $z\\}],"
+ " \\{$gradx0, $gradx1, $gradx2, $gradx2\\},"
+ " \\{$grady0, $grady1, $grady2, $grady2\\};",
+ []>;
+
+multiclass TEX_3D_GRAD<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _RR : TEX_3D_GRAD_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_3D_GRAD_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_3D_GRAD_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_3D_GRAD_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
}
+defm TEX_3D_F32_F32_GRAD
+ : TEX_3D_GRAD<"tex.grad.3d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_3D_S32_F32_GRAD
+ : TEX_3D_GRAD<"tex.grad.3d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_3D_U32_F32_GRAD
+ : TEX_3D_GRAD<"tex.grad.3d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_CUBE_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins intype:$x, intype:$y, intype:$z)),
+ inst # " \t\\{$r, $g, $b, $a\\},"
+ " [$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+
+multiclass TEX_CUBE<string inst, NVPTXRegClass outtype, NVPTXRegClass intype> {
+ def _RR : TEX_CUBE_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_CUBE_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_CUBE_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_CUBE_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
+}
+defm TEX_CUBE_F32_F32
+ : TEX_CUBE<"tex.cube.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_CUBE_S32_F32
+ : TEX_CUBE<"tex.cube.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_CUBE_U32_F32
+ : TEX_CUBE<"tex.cube.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_CUBE_LEVEL_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins intype:$x, intype:$y, intype:$z,
+ intype:$lod)),
+ inst # " \t\\{$r, $g, $b, $a\\},"
+ " [$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+
+multiclass TEX_CUBE_LEVEL<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _RR : TEX_CUBE_LEVEL_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_CUBE_LEVEL_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_CUBE_LEVEL_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_CUBE_LEVEL_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
+}
-//=== Surface load instructions
-// .clamp variant
-let IsSuld = true in {
-def SULD_1D_I8_CLAMP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.b8.clamp \\{$r\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_I16_CLAMP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.b16.clamp \\{$r\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_I32_CLAMP
- : NVPTXInst<(outs Int32Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.b32.clamp \\{$r\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_I64_CLAMP
- : NVPTXInst<(outs Int64Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.b64.clamp \\{$r\\}, [$s, \\{$x\\}];",
- []>;
+defm TEX_CUBE_F32_F32_LEVEL
+ : TEX_CUBE_LEVEL<"tex.level.cube.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_CUBE_S32_F32_LEVEL
+ : TEX_CUBE_LEVEL<"tex.level.cube.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_CUBE_U32_F32_LEVEL
+ : TEX_CUBE_LEVEL<"tex.level.cube.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_CUBE_ARRAY_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins Int32Regs:$l, intype:$x, intype:$y,
+ intype:$z)),
+ inst # " \t\\{$r, $g, $b, $a\\},"
+ " [$t, $s, \\{$l, $x, $y, $z\\}];",
+ []>;
+
+multiclass TEX_CUBE_ARRAY<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _RR : TEX_CUBE_ARRAY_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_CUBE_ARRAY_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_CUBE_ARRAY_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_CUBE_ARRAY_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
+}
-def SULD_1D_ARRAY_I8_CLAMP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.b8.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_I16_CLAMP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.b16.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_I32_CLAMP
- : NVPTXInst<(outs Int32Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.b32.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_I64_CLAMP
- : NVPTXInst<(outs Int64Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.b64.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
- []>;
+defm TEX_CUBE_ARRAY_F32_F32
+ : TEX_CUBE_ARRAY<"tex.acube.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_CUBE_ARRAY_S32_F32
+ : TEX_CUBE_ARRAY<"tex.acube.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_CUBE_ARRAY_U32_F32
+ : TEX_CUBE_ARRAY<"tex.acube.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_CUBE_ARRAY_LEVEL_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(texsamp, (ins Int32Regs:$l, intype:$x, intype:$y,
+ intype:$z, intype:$lod)),
+ inst # " \t\\{$r, $g, $b, $a\\},"
+ " [$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
+ []>;
+
+multiclass TEX_CUBE_ARRAY_LEVEL<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _RR : TEX_CUBE_ARRAY_LEVEL_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TEX_CUBE_ARRAY_LEVEL_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TEX_CUBE_ARRAY_LEVEL_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TEX_CUBE_ARRAY_LEVEL_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
+}
-def SULD_2D_I8_CLAMP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.b8.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_I16_CLAMP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.b16.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_I32_CLAMP
- : NVPTXInst<(outs Int32Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.b32.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_I64_CLAMP
- : NVPTXInst<(outs Int64Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.b64.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
- []>;
+defm TEX_CUBE_ARRAY_F32_F32_LEVEL
+ : TEX_CUBE_ARRAY_LEVEL<"tex.level.acube.v4.f32.f32",
+ Float32Regs, Float32Regs>;
+defm TEX_CUBE_ARRAY_S32_F32_LEVEL
+ : TEX_CUBE_ARRAY_LEVEL<"tex.level.acube.v4.s32.f32",
+ Int32Regs, Float32Regs>;
+defm TEX_CUBE_ARRAY_U32_F32_LEVEL
+ : TEX_CUBE_ARRAY_LEVEL<"tex.level.acube.v4.u32.f32",
+ Int32Regs, Float32Regs>;
+
+class TLD4_2D_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag texsamp>
+ : NVPTXInst<(outs outtype:$v0, outtype:$v1,
+ outtype:$v2, outtype:$v3),
+ !con(texsamp, (ins intype:$x, intype:$y)),
+ inst # " \t\\{$v0, $v1, $v2, $v3\\}, [$t, $s, \\{$x, $y\\}];",
+ []>;
+
+multiclass TLD4_2D<string inst, NVPTXRegClass outtype, NVPTXRegClass intype> {
+ def _RR : TLD4_2D_base<inst, outtype, intype,
+ (ins Int64Regs:$t, Int64Regs:$s)>;
+ def _RI : TLD4_2D_base<inst, outtype, intype,
+ (ins Int64Regs:$t, i64imm:$s)>;
+ def _IR : TLD4_2D_base<inst, outtype, intype,
+ (ins i64imm:$t, Int64Regs:$s)>;
+ def _II : TLD4_2D_base<inst, outtype, intype,
+ (ins i64imm:$t, i64imm:$s)>;
+}
-def SULD_2D_ARRAY_I8_CLAMP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.b8.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_I16_CLAMP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.b16.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_I32_CLAMP
- : NVPTXInst<(outs Int32Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.b32.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_I64_CLAMP
- : NVPTXInst<(outs Int64Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.b64.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
- []>;
+defm TLD4_R_2D_F32_F32
+ : TLD4_2D<"tld4.r.2d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TLD4_G_2D_F32_F32
+ : TLD4_2D<"tld4.g.2d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TLD4_B_2D_F32_F32
+ : TLD4_2D<"tld4.b.2d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TLD4_A_2D_F32_F32
+ : TLD4_2D<"tld4.a.2d.v4.f32.f32", Float32Regs, Float32Regs>;
+
+defm TLD4_R_2D_S32_F32
+ : TLD4_2D<"tld4.r.2d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TLD4_G_2D_S32_F32
+ : TLD4_2D<"tld4.g.2d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TLD4_B_2D_S32_F32
+ : TLD4_2D<"tld4.b.2d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TLD4_A_2D_S32_F32
+ : TLD4_2D<"tld4.a.2d.v4.s32.f32", Int32Regs, Float32Regs>;
+
+defm TLD4_R_2D_U32_F32
+ : TLD4_2D<"tld4.r.2d.v4.u32.f32", Int32Regs, Float32Regs>;
+defm TLD4_G_2D_U32_F32
+ : TLD4_2D<"tld4.g.2d.v4.u32.f32", Int32Regs, Float32Regs>;
+defm TLD4_B_2D_U32_F32
+ : TLD4_2D<"tld4.b.2d.v4.u32.f32", Int32Regs, Float32Regs>;
+defm TLD4_A_2D_U32_F32
+ : TLD4_2D<"tld4.a.2d.v4.u32.f32", Int32Regs, Float32Regs>;
-def SULD_3D_I8_CLAMP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.b8.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_I16_CLAMP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.b16.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_I32_CLAMP
- : NVPTXInst<(outs Int32Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.b32.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_I64_CLAMP
- : NVPTXInst<(outs Int64Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.b64.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
}
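The cleanup above leans entirely on TableGen's multiclass/defm expansion plus !con to splice the per-variant texture/sampler operands onto a shared coordinate list. As a minimal, standalone sketch of that mechanism (every name here -- Inst, ins, GPR32, GPR64, i64imm, TEX_DEMO_base, TEX_DEMO, TEX_DEMO_F32_F32 -- is a hypothetical stand-in that only mirrors the shape of the patch, not the real NVPTX definitions), the snippet below can be fed directly to llvm-tblgen, which prints the four expanded records TEX_DEMO_F32_F32_RR/_RI/_IR/_II:

def ins;          // dag operator; normally provided by Target.td
def GPR64;        // stand-in for a 64-bit register class (texref/sampler handles)
def GPR32;        // stand-in for a 32-bit register class (coordinates)
def i64imm;       // stand-in for a 64-bit immediate operand

class Inst<dag iops, string asm> {
  dag InOperandList = iops;
  string AsmString = asm;
}

// Shared body: !con appends the coordinate operands to whatever
// texture/sampler operands a concrete variant passes in.
class TEX_DEMO_base<string inst, dag texsamp>
  : Inst<!con(texsamp, (ins GPR32:$x, GPR32:$y)),
         inst # " [$t, $s, $x, $y];">;

// One record per register/immediate combination of texture and sampler.
multiclass TEX_DEMO<string inst> {
  def _RR : TEX_DEMO_base<inst, (ins GPR64:$t, GPR64:$s)>;
  def _RI : TEX_DEMO_base<inst, (ins GPR64:$t, i64imm:$s)>;
  def _IR : TEX_DEMO_base<inst, (ins i64imm:$t, GPR64:$s)>;
  def _II : TEX_DEMO_base<inst, (ins i64imm:$t, i64imm:$s)>;
}

defm TEX_DEMO_F32_F32 : TEX_DEMO<"tex.2d.v4.f32.f32">;

The same shape is what lets each TEX_*/TLD4_* family in the patch collapse from four hand-written defs (two for the unified _R/_I forms, which take no sampler operand) into one multiclass plus a defm per output/coordinate type combination.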
-let IsSuld = 2 in {
-def SULD_1D_V2I8_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_V2I16_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_V2I32_CLAMP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_V2I64_CLAMP
- : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_ARRAY_V2I8_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_V2I16_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_V2I32_CLAMP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_V2I64_CLAMP
- : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
- []>;
+// texmode_unified
+let IsTex = true, IsTexModeUnified = true in {
+// Texture fetch instructions using handles
-def SULD_2D_V2I8_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_V2I16_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_V2I32_CLAMP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_V2I64_CLAMP
- : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
- []>;
+class TEX_UNIFIED_1D_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins intype:$x)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ []>;
+
+multiclass TEX_UNIFIED_1D<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_1D_base<inst, outtype, intype, (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_1D_base<inst, outtype, intype, (ins i64imm:$t)>;
+}
-def SULD_2D_ARRAY_V2I8_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v2.b8.clamp \\{$r, $g\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_V2I16_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v2.b16.clamp \\{$r, $g\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_V2I32_CLAMP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v2.b32.clamp \\{$r, $g\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_V2I64_CLAMP
- : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v2.b64.clamp \\{$r, $g\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
+defm TEX_UNIFIED_1D_F32_S32
+ : TEX_UNIFIED_1D<"tex.1d.v4.f32.s32", Float32Regs, Int32Regs>;
+defm TEX_UNIFIED_1D_F32_F32
+ : TEX_UNIFIED_1D<"tex.1d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_1D_S32_S32
+ : TEX_UNIFIED_1D<"tex.1d.v4.s32.s32", Int32Regs, Int32Regs>;
+defm TEX_UNIFIED_1D_S32_F32
+ : TEX_UNIFIED_1D<"tex.1d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_1D_U32_S32
+ : TEX_UNIFIED_1D<"tex.1d.v4.u32.s32", Int32Regs, Int32Regs>;
+defm TEX_UNIFIED_1D_U32_F32
+ : TEX_UNIFIED_1D<"tex.1d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_UNIFIED_1D_LEVEL_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins intype:$x, intype:$lod)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}], $lod;",
+ []>;
+
+multiclass TEX_UNIFIED_1D_LEVEL<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_1D_LEVEL_base<inst, outtype, intype, (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_1D_LEVEL_base<inst, outtype, intype, (ins i64imm:$t)>;
+}
-def SULD_3D_V2I8_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_V2I16_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_V2I32_CLAMP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_V2I64_CLAMP
- : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
+defm TEX_UNIFIED_1D_F32_F32_LEVEL
+ : TEX_UNIFIED_1D_LEVEL<"tex.level.1d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_1D_S32_F32_LEVEL
+ : TEX_UNIFIED_1D_LEVEL<"tex.level.1d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_1D_U32_F32_LEVEL
+ : TEX_UNIFIED_1D_LEVEL<"tex.level.1d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_UNIFIED_1D_GRAD_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins intype:$x, intype:$gradx, intype:$grady)),
+ inst # " \t\\{$r, $g, $b, $a\\},"
+ " [$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+
+multiclass TEX_UNIFIED_1D_GRAD<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_1D_GRAD_base<inst, outtype, intype, (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_1D_GRAD_base<inst, outtype, intype, (ins i64imm:$t)>;
}
-let IsSuld = 3 in {
-def SULD_1D_V4I8_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v4.b8.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_V4I16_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v4.b16.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_V4I32_CLAMP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v4.b32.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
- []>;
+defm TEX_UNIFIED_1D_F32_F32_GRAD
+ : TEX_UNIFIED_1D_GRAD<"tex.grad.1d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_1D_S32_F32_GRAD
+ : TEX_UNIFIED_1D_GRAD<"tex.grad.1d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_1D_U32_F32_GRAD
+ : TEX_UNIFIED_1D_GRAD<"tex.grad.1d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_UNIFIED_1D_ARRAY_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins Int32Regs:$l, intype:$x)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$l, $x\\}];",
+ []>;
+
+multiclass TEX_UNIFIED_1D_ARRAY<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_1D_ARRAY_base<inst, outtype, intype, (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_1D_ARRAY_base<inst, outtype, intype, (ins i64imm:$t)>;
+}
-def SULD_1D_ARRAY_V4I8_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v4.b8.clamp \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_V4I16_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v4.b16.clamp \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_V4I32_CLAMP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v4.b32.clamp \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$l, $x\\}];",
- []>;
+defm TEX_UNIFIED_1D_ARRAY_F32_S32
+ : TEX_UNIFIED_1D_ARRAY<"tex.a1d.v4.f32.s32", Float32Regs, Int32Regs>;
+defm TEX_UNIFIED_1D_ARRAY_F32_F32
+ : TEX_UNIFIED_1D_ARRAY<"tex.a1d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_1D_ARRAY_S32_S32
+ : TEX_UNIFIED_1D_ARRAY<"tex.a1d.v4.s32.s32", Int32Regs, Int32Regs>;
+defm TEX_UNIFIED_1D_ARRAY_S32_F32
+ : TEX_UNIFIED_1D_ARRAY<"tex.a1d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_1D_ARRAY_U32_S32
+ : TEX_UNIFIED_1D_ARRAY<"tex.a1d.v4.u32.s32", Int32Regs, Int32Regs>;
+defm TEX_UNIFIED_1D_ARRAY_U32_F32
+ : TEX_UNIFIED_1D_ARRAY<"tex.a1d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_UNIFIED_1D_ARRAY_LEVEL_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins Int32Regs:$l, intype:$x, intype:$lod)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$l, $x\\}], $lod;",
+ []>;
+
+multiclass TEX_UNIFIED_1D_ARRAY_LEVEL<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_1D_ARRAY_LEVEL_base<inst, outtype, intype,
+ (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_1D_ARRAY_LEVEL_base<inst, outtype, intype,
+ (ins i64imm:$t)>;
+}
-def SULD_2D_V4I8_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v4.b8.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_V4I16_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v4.b16.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_V4I32_CLAMP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v4.b32.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
- []>;
+defm TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL
+ : TEX_UNIFIED_1D_ARRAY_LEVEL<"tex.level.a1d.v4.f32.f32",
+ Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL
+ : TEX_UNIFIED_1D_ARRAY_LEVEL<"tex.level.a1d.v4.s32.f32",
+ Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL
+ : TEX_UNIFIED_1D_ARRAY_LEVEL<"tex.level.a1d.v4.u32.f32",
+ Int32Regs, Float32Regs>;
+
+class TEX_UNIFIED_1D_ARRAY_GRAD_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins Int32Regs:$l, intype:$x,
+ intype:$gradx, intype:$grady)),
+ inst # " \t\\{$r, $g, $b, $a\\},"
+ " [$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+
+multiclass TEX_UNIFIED_1D_ARRAY_GRAD<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_1D_ARRAY_GRAD_base<inst, outtype, intype,
+ (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_1D_ARRAY_GRAD_base<inst, outtype, intype,
+ (ins i64imm:$t)>;
+}
-def SULD_2D_ARRAY_V4I8_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v4.b8.clamp \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_V4I16_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v4.b16.clamp \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_V4I32_CLAMP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v4.b32.clamp \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
+defm TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD
+ : TEX_UNIFIED_1D_ARRAY_GRAD<"tex.grad.a1d.v4.f32.f32",
+ Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD
+ : TEX_UNIFIED_1D_ARRAY_GRAD<"tex.grad.a1d.v4.s32.f32",
+ Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD
+ : TEX_UNIFIED_1D_ARRAY_GRAD<"tex.grad.a1d.v4.u32.f32",
+ Int32Regs, Float32Regs>;
+
+class TEX_UNIFIED_2D_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins intype:$x, intype:$y)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$x, $y\\}];",
+ []>;
+
+multiclass TEX_UNIFIED_2D<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_2D_base<inst, outtype, intype, (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_2D_base<inst, outtype, intype, (ins i64imm:$t)>;
+}
+defm TEX_UNIFIED_2D_F32_S32
+ : TEX_UNIFIED_2D<"tex.2d.v4.f32.s32", Float32Regs, Int32Regs>;
+defm TEX_UNIFIED_2D_F32_F32
+ : TEX_UNIFIED_2D<"tex.2d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_2D_S32_S32
+ : TEX_UNIFIED_2D<"tex.2d.v4.s32.s32", Int32Regs, Int32Regs>;
+defm TEX_UNIFIED_2D_S32_F32
+ : TEX_UNIFIED_2D<"tex.2d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_2D_U32_S32
+ : TEX_UNIFIED_2D<"tex.2d.v4.u32.s32", Int32Regs, Int32Regs>;
+defm TEX_UNIFIED_2D_U32_F32
+ : TEX_UNIFIED_2D<"tex.2d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_UNIFIED_2D_LEVEL_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins intype:$x, intype:$y, intype:$lod)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$x, $y\\}], $lod;",
+ []>;
+
+multiclass TEX_UNIFIED_2D_LEVEL<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_2D_LEVEL_base<inst, outtype, intype, (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_2D_LEVEL_base<inst, outtype, intype, (ins i64imm:$t)>;
+}
-def SULD_3D_V4I8_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v4.b8.clamp \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_V4I16_CLAMP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v4.b16.clamp \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_V4I32_CLAMP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v4.b32.clamp \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$x, $y, $z, $z\\}];",
- []>;
+defm TEX_UNIFIED_2D_F32_F32_LEVEL
+ : TEX_UNIFIED_2D_LEVEL<"tex.level.2d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_2D_S32_F32_LEVEL
+ : TEX_UNIFIED_2D_LEVEL<"tex.level.2d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_2D_U32_F32_LEVEL
+ : TEX_UNIFIED_2D_LEVEL<"tex.level.2d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_UNIFIED_2D_GRAD_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins intype:$x, intype:$y,
+ intype:$gradx0, intype:$gradx1,
+ intype:$grady0, intype:$grady1)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$x, $y\\}],"
+ " \\{$gradx0, $gradx1\\}, \\{$grady0, $grady1\\};",
+ []>;
+multiclass TEX_UNIFIED_2D_GRAD<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_2D_GRAD_base<inst, outtype, intype, (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_2D_GRAD_base<inst, outtype, intype, (ins i64imm:$t)>;
}
+defm TEX_UNIFIED_2D_F32_F32_GRAD
+ : TEX_UNIFIED_2D_GRAD<"tex.grad.2d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_2D_S32_F32_GRAD
+ : TEX_UNIFIED_2D_GRAD<"tex.grad.2d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_2D_U32_F32_GRAD
+ : TEX_UNIFIED_2D_GRAD<"tex.grad.2d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_UNIFIED_2D_ARRAY_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins Int32Regs:$l, intype:$x, intype:$y)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$l, $x, $y, $y\\}];",
+ []>;
+multiclass TEX_UNIFIED_2D_ARRAY<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_2D_ARRAY_base<inst, outtype, intype, (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_2D_ARRAY_base<inst, outtype, intype, (ins i64imm:$t)>;
+}
-// .trap variant
-let IsSuld = true in {
-def SULD_1D_I8_TRAP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.b8.trap \\{$r\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_I16_TRAP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.b16.trap \\{$r\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_I32_TRAP
- : NVPTXInst<(outs Int32Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.b32.trap \\{$r\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_I64_TRAP
- : NVPTXInst<(outs Int64Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.b64.trap \\{$r\\}, [$s, \\{$x\\}];",
- []>;
+defm TEX_UNIFIED_2D_ARRAY_F32_S32
+ : TEX_UNIFIED_2D_ARRAY<"tex.a2d.v4.f32.s32", Float32Regs, Int32Regs>;
+defm TEX_UNIFIED_2D_ARRAY_F32_F32
+ : TEX_UNIFIED_2D_ARRAY<"tex.a2d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_2D_ARRAY_S32_S32
+ : TEX_UNIFIED_2D_ARRAY<"tex.a2d.v4.s32.s32", Int32Regs, Int32Regs>;
+defm TEX_UNIFIED_2D_ARRAY_S32_F32
+ : TEX_UNIFIED_2D_ARRAY<"tex.a2d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_2D_ARRAY_U32_S32
+ : TEX_UNIFIED_2D_ARRAY<"tex.a2d.v4.u32.s32", Int32Regs, Int32Regs>;
+defm TEX_UNIFIED_2D_ARRAY_U32_F32
+ : TEX_UNIFIED_2D_ARRAY<"tex.a2d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_UNIFIED_2D_ARRAY_LEVEL_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins Int32Regs:$l, intype:$x, intype:$y,
+ intype:$lod)),
+ inst # " \t\\{$r, $g, $b, $a\\},"
+ " [$t, \\{$l, $x, $y, $y\\}], $lod;",
+ []>;
+multiclass TEX_UNIFIED_2D_ARRAY_LEVEL<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_2D_ARRAY_LEVEL_base<inst, outtype, intype,
+ (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_2D_ARRAY_LEVEL_base<inst, outtype, intype,
+ (ins i64imm:$t)>;
+}
-def SULD_1D_ARRAY_I8_TRAP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.b8.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_I16_TRAP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.b16.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_I32_TRAP
- : NVPTXInst<(outs Int32Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.b32.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_I64_TRAP
- : NVPTXInst<(outs Int64Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.b64.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
- []>;
+defm TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL
+ : TEX_UNIFIED_2D_ARRAY_LEVEL<"tex.level.a2d.v4.f32.f32",
+ Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL
+ : TEX_UNIFIED_2D_ARRAY_LEVEL<"tex.level.a2d.v4.s32.f32",
+ Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL
+ : TEX_UNIFIED_2D_ARRAY_LEVEL<"tex.level.a2d.v4.u32.f32",
+ Int32Regs, Float32Regs>;
+
+class TEX_UNIFIED_2D_ARRAY_GRAD_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins Int32Regs:$l, intype:$x, intype:$y,
+ intype:$gradx0, intype:$gradx1,
+ intype:$grady0, intype:$grady1)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$l, $x, $y, $y\\}],"
+ " \\{$gradx0, $gradx1\\}, \\{$grady0, $grady1\\};",
+ []>;
+multiclass TEX_UNIFIED_2D_ARRAY_GRAD<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_2D_ARRAY_GRAD_base<inst, outtype, intype,
+ (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_2D_ARRAY_GRAD_base<inst, outtype, intype,
+ (ins i64imm:$t)>;
+}
-def SULD_2D_I8_TRAP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.b8.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_I16_TRAP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.b16.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_I32_TRAP
- : NVPTXInst<(outs Int32Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.b32.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_I64_TRAP
- : NVPTXInst<(outs Int64Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.b64.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
- []>;
+defm TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD
+ : TEX_UNIFIED_2D_ARRAY_GRAD<"tex.grad.a2d.v4.f32.f32",
+ Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD
+ : TEX_UNIFIED_2D_ARRAY_GRAD<"tex.grad.a2d.v4.s32.f32",
+ Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD
+ : TEX_UNIFIED_2D_ARRAY_GRAD<"tex.grad.a2d.v4.u32.f32",
+ Int32Regs, Float32Regs>;
+
+class TEX_UNIFIED_3D_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins intype:$x, intype:$y, intype:$z)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+multiclass TEX_UNIFIED_3D<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_3D_base<inst, outtype, intype, (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_3D_base<inst, outtype, intype, (ins i64imm:$t)>;
+}
-def SULD_2D_ARRAY_I8_TRAP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.b8.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_I16_TRAP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.b16.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_I32_TRAP
- : NVPTXInst<(outs Int32Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.b32.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_I64_TRAP
- : NVPTXInst<(outs Int64Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.b64.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
- []>;
+defm TEX_UNIFIED_3D_F32_S32
+ : TEX_UNIFIED_3D<"tex.3d.v4.f32.s32", Float32Regs, Int32Regs>;
+defm TEX_UNIFIED_3D_F32_F32
+ : TEX_UNIFIED_3D<"tex.3d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_3D_S32_S32
+ : TEX_UNIFIED_3D<"tex.3d.v4.s32.s32", Int32Regs, Int32Regs>;
+defm TEX_UNIFIED_3D_S32_F32
+ : TEX_UNIFIED_3D<"tex.3d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_3D_U32_S32
+ : TEX_UNIFIED_3D<"tex.3d.v4.u32.s32", Int32Regs, Int32Regs>;
+defm TEX_UNIFIED_3D_U32_F32
+ : TEX_UNIFIED_3D<"tex.3d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_UNIFIED_3D_LEVEL_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins intype:$x, intype:$y, intype:$z, intype:$lod)),
+ inst # " \t\\{$r, $g, $b, $a\\},"
+ " [$t, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+multiclass TEX_UNIFIED_3D_LEVEL<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_3D_LEVEL_base<inst, outtype, intype, (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_3D_LEVEL_base<inst, outtype, intype, (ins i64imm:$t)>;
+}
-def SULD_3D_I8_TRAP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.b8.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_I16_TRAP
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.b16.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_I32_TRAP
- : NVPTXInst<(outs Int32Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.b32.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_I64_TRAP
- : NVPTXInst<(outs Int64Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.b64.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
+defm TEX_UNIFIED_3D_F32_F32_LEVEL
+ : TEX_UNIFIED_3D_LEVEL<"tex.level.3d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_3D_S32_F32_LEVEL
+ : TEX_UNIFIED_3D_LEVEL<"tex.level.3d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_3D_U32_F32_LEVEL
+ : TEX_UNIFIED_3D_LEVEL<"tex.level.3d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_UNIFIED_3D_GRAD_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins intype:$x, intype:$y, intype:$z,
+ intype:$gradx0, intype:$gradx1,
+ intype:$gradx2, intype:$grady0,
+ intype:$grady1, intype:$grady2)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$x, $y, $z, $z\\}],"
+ " \\{$gradx0, $gradx1, $gradx2, $gradx2\\},"
+ " \\{$grady0, $grady1, $grady2, $grady2\\};",
+ []>;
+multiclass TEX_UNIFIED_3D_GRAD<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_3D_GRAD_base<inst, outtype, intype, (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_3D_GRAD_base<inst, outtype, intype, (ins i64imm:$t)>;
}
-let IsSuld = 2 in {
-def SULD_1D_V2I8_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_V2I16_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_V2I32_TRAP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_V2I64_TRAP
- : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
- []>;
+defm TEX_UNIFIED_3D_F32_F32_GRAD
+ : TEX_UNIFIED_3D_GRAD<"tex.grad.3d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_3D_S32_F32_GRAD
+ : TEX_UNIFIED_3D_GRAD<"tex.grad.3d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_3D_U32_F32_GRAD
+ : TEX_UNIFIED_3D_GRAD<"tex.grad.3d.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_UNIFIED_CUBE_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins intype:$x, intype:$y, intype:$z)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+multiclass TEX_UNIFIED_CUBE<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_CUBE_base<inst, outtype, intype, (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_CUBE_base<inst, outtype, intype, (ins i64imm:$t)>;
+}
-def SULD_1D_ARRAY_V2I8_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_V2I16_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_V2I32_TRAP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_V2I64_TRAP
- : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
- []>;
+defm TEX_UNIFIED_CUBE_F32_F32
+ : TEX_UNIFIED_CUBE<"tex.cube.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_CUBE_S32_F32
+ : TEX_UNIFIED_CUBE<"tex.cube.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_CUBE_U32_F32
+ : TEX_UNIFIED_CUBE<"tex.cube.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_UNIFIED_CUBE_LEVEL_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins intype:$x, intype:$y, intype:$z, intype:$lod)),
+ inst # " \t\\{$r, $g, $b, $a\\},"
+ " [$t, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+multiclass TEX_UNIFIED_CUBE_LEVEL<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_CUBE_LEVEL_base<inst, outtype, intype,
+ (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_CUBE_LEVEL_base<inst, outtype, intype,
+ (ins i64imm:$t)>;
+}
-def SULD_2D_V2I8_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_V2I16_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_V2I32_TRAP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_V2I64_TRAP
- : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
- []>;
+defm TEX_UNIFIED_CUBE_F32_F32_LEVEL
+ : TEX_UNIFIED_CUBE_LEVEL<"tex.level.cube.v4.f32.f32",
+ Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_CUBE_S32_F32_LEVEL
+ : TEX_UNIFIED_CUBE_LEVEL<"tex.level.cube.v4.s32.f32",
+ Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_CUBE_U32_F32_LEVEL
+ : TEX_UNIFIED_CUBE_LEVEL<"tex.level.cube.v4.u32.f32",
+ Int32Regs, Float32Regs>;
+
+class TEX_UNIFIED_CUBE_ARRAY_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins Int32Regs:$l, intype:$x, intype:$y, intype:$z)),
+ inst # " \t\\{$r, $g, $b, $a\\}, [$t, \\{$l, $x, $y, $z\\}];",
+ []>;
+multiclass TEX_UNIFIED_CUBE_ARRAY<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_CUBE_ARRAY_base<inst, outtype, intype,
+ (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_CUBE_ARRAY_base<inst, outtype, intype,
+ (ins i64imm:$t)>;
+}
-def SULD_2D_ARRAY_V2I8_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v2.b8.trap \\{$r, $g\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_V2I16_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v2.b16.trap \\{$r, $g\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_V2I32_TRAP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v2.b32.trap \\{$r, $g\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_V2I64_TRAP
- : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v2.b64.trap \\{$r, $g\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
+defm TEX_UNIFIED_CUBE_ARRAY_F32_F32
+ : TEX_UNIFIED_CUBE_ARRAY<"tex.acube.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_CUBE_ARRAY_S32_F32
+ : TEX_UNIFIED_CUBE_ARRAY<"tex.acube.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_CUBE_ARRAY_U32_F32
+ : TEX_UNIFIED_CUBE_ARRAY<"tex.acube.v4.u32.f32", Int32Regs, Float32Regs>;
+
+class TEX_UNIFIED_CUBE_ARRAY_LEVEL_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$r, outtype:$g,
+ outtype:$b, outtype:$a),
+ !con(tex, (ins Int32Regs:$l, intype:$x, intype:$y, intype:$z,
+ intype:$lod)),
+ inst # " \t\\{$r, $g, $b, $a\\},"
+ " [$t, \\{$l, $x, $y, $z\\}], $lod;",
+ []>;
+multiclass TEX_UNIFIED_CUBE_ARRAY_LEVEL<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TEX_UNIFIED_CUBE_ARRAY_LEVEL_base<inst, outtype, intype,
+ (ins Int64Regs:$t)>;
+ def _I : TEX_UNIFIED_CUBE_ARRAY_LEVEL_base<inst, outtype, intype,
+ (ins i64imm:$t)>;
+}
-def SULD_3D_V2I8_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_V2I16_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_V2I32_TRAP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_V2I64_TRAP
- : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
+defm TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL
+ : TEX_UNIFIED_CUBE_ARRAY_LEVEL<"tex.level.acube.v4.f32.f32",
+ Float32Regs, Float32Regs>;
+defm TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL
+ : TEX_UNIFIED_CUBE_ARRAY_LEVEL<"tex.level.acube.v4.s32.f32",
+ Int32Regs, Float32Regs>;
+defm TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL
+ : TEX_UNIFIED_CUBE_ARRAY_LEVEL<"tex.level.acube.v4.u32.f32",
+ Int32Regs, Float32Regs>;
+
+class TLD4_UNIFIED_2D_base<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype, dag tex>
+ : NVPTXInst<(outs outtype:$v0, outtype:$v1,
+ outtype:$v2, outtype:$v3),
+ !con(tex, (ins intype:$x, intype:$y)),
+ inst # " \t\\{$v0, $v1, $v2, $v3\\}, [$t, \\{$x, $y\\}];",
+ []>;
+multiclass TLD4_UNIFIED_2D<string inst, NVPTXRegClass outtype,
+ NVPTXRegClass intype> {
+ def _R : TLD4_UNIFIED_2D_base<inst, outtype, intype, (ins Int64Regs:$t)>;
+ def _I : TLD4_UNIFIED_2D_base<inst, outtype, intype, (ins i64imm:$t)>;
}
-let IsSuld = 3 in {
-def SULD_1D_V4I8_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_V4I16_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_V4I32_TRAP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
- []>;
+defm TLD4_UNIFIED_R_2D_F32_F32
+ : TLD4_UNIFIED_2D<"tld4.r.2d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TLD4_UNIFIED_G_2D_F32_F32
+ : TLD4_UNIFIED_2D<"tld4.g.2d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TLD4_UNIFIED_B_2D_F32_F32
+ : TLD4_UNIFIED_2D<"tld4.b.2d.v4.f32.f32", Float32Regs, Float32Regs>;
+defm TLD4_UNIFIED_A_2D_F32_F32
+ : TLD4_UNIFIED_2D<"tld4.a.2d.v4.f32.f32", Float32Regs, Float32Regs>;
+
+defm TLD4_UNIFIED_R_2D_S32_F32
+ : TLD4_UNIFIED_2D<"tld4.r.2d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TLD4_UNIFIED_G_2D_S32_F32
+ : TLD4_UNIFIED_2D<"tld4.g.2d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TLD4_UNIFIED_B_2D_S32_F32
+ : TLD4_UNIFIED_2D<"tld4.b.2d.v4.s32.f32", Int32Regs, Float32Regs>;
+defm TLD4_UNIFIED_A_2D_S32_F32
+ : TLD4_UNIFIED_2D<"tld4.a.2d.v4.s32.f32", Int32Regs, Float32Regs>;
+
+defm TLD4_UNIFIED_R_2D_U32_F32
+ : TLD4_UNIFIED_2D<"tld4.r.2d.v4.u32.f32", Int32Regs, Float32Regs>;
+defm TLD4_UNIFIED_G_2D_U32_F32
+ : TLD4_UNIFIED_2D<"tld4.g.2d.v4.u32.f32", Int32Regs, Float32Regs>;
+defm TLD4_UNIFIED_B_2D_U32_F32
+ : TLD4_UNIFIED_2D<"tld4.b.2d.v4.u32.f32", Int32Regs, Float32Regs>;
+defm TLD4_UNIFIED_A_2D_U32_F32
+ : TLD4_UNIFIED_2D<"tld4.a.2d.v4.u32.f32", Int32Regs, Float32Regs>;
-def SULD_1D_ARRAY_V4I8_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v4.b8.trap \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_V4I16_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v4.b16.trap \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_V4I32_TRAP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$l, $x\\}];",
- []>;
+}
-def SULD_2D_V4I8_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_V4I16_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_V4I32_TRAP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_ARRAY_V4I8_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v4.b8.trap \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_V4I16_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v4.b16.trap \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_V4I32_TRAP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
+//=== Surface load instructions
-def SULD_3D_V4I8_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v4.b8.trap \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_V4I16_TRAP
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v4.b16.trap \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_V4I32_TRAP
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$x, $y, $z, $z\\}];",
- []>;
+let IsSuld = true in {
+
+class SULD_1D_base<string inst, NVPTXRegClass outtype, dag surf>
+ : NVPTXInst<(outs outtype:$r),
+ !con(surf, (ins Int32Regs:$x)),
+ inst # " \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+multiclass SULD_1D<string inst, NVPTXRegClass outtype> {
+ def _R : SULD_1D_base<inst, outtype, (ins Int64Regs:$s)>;
+ def _I : SULD_1D_base<inst, outtype, (ins i64imm:$s)>;
}
-// .zero variant
-let IsSuld = true in {
-def SULD_1D_I8_ZERO
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.b8.zero \\{$r\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_I16_ZERO
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.b16.zero \\{$r\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_I32_ZERO
- : NVPTXInst<(outs Int32Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.b32.zero \\{$r\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_I64_ZERO
- : NVPTXInst<(outs Int64Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.b64.zero \\{$r\\}, [$s, \\{$x\\}];",
- []>;
+defm SULD_1D_I8_CLAMP : SULD_1D<"suld.b.1d.b8.clamp", Int16Regs>;
+defm SULD_1D_I16_CLAMP : SULD_1D<"suld.b.1d.b16.clamp", Int16Regs>;
+defm SULD_1D_I32_CLAMP : SULD_1D<"suld.b.1d.b32.clamp", Int32Regs>;
+defm SULD_1D_I64_CLAMP : SULD_1D<"suld.b.1d.b64.clamp", Int64Regs>;
+
+defm SULD_1D_I8_TRAP : SULD_1D<"suld.b.1d.b8.trap", Int16Regs>;
+defm SULD_1D_I16_TRAP : SULD_1D<"suld.b.1d.b16.trap", Int16Regs>;
+defm SULD_1D_I32_TRAP : SULD_1D<"suld.b.1d.b32.trap", Int32Regs>;
+defm SULD_1D_I64_TRAP : SULD_1D<"suld.b.1d.b64.trap", Int64Regs>;
+
+defm SULD_1D_I8_ZERO : SULD_1D<"suld.b.1d.b8.zero", Int16Regs>;
+defm SULD_1D_I16_ZERO : SULD_1D<"suld.b.1d.b16.zero", Int16Regs>;
+defm SULD_1D_I32_ZERO : SULD_1D<"suld.b.1d.b32.zero", Int32Regs>;
+defm SULD_1D_I64_ZERO : SULD_1D<"suld.b.1d.b64.zero", Int64Regs>;
+
+class SULD_1D_ARRAY_base<string inst, NVPTXRegClass outtype, dag surf>
+ : NVPTXInst<(outs outtype:$r),
+ !con(surf, (ins Int32Regs:$l, Int32Regs:$x)),
+ inst # " \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+multiclass SULD_1D_ARRAY<string inst, NVPTXRegClass outtype> {
+ def _R : SULD_1D_ARRAY_base<inst, outtype, (ins Int64Regs:$s)>;
+ def _I : SULD_1D_ARRAY_base<inst, outtype, (ins i64imm:$s)>;
+}
-def SULD_1D_ARRAY_I8_ZERO
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.b8.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_I16_ZERO
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.b16.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_I32_ZERO
- : NVPTXInst<(outs Int32Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.b32.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_I64_ZERO
- : NVPTXInst<(outs Int64Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.b64.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
- []>;
+defm SULD_1D_ARRAY_I8_CLAMP
+ : SULD_1D_ARRAY<"suld.b.a1d.b8.clamp", Int16Regs>;
+defm SULD_1D_ARRAY_I16_CLAMP
+ : SULD_1D_ARRAY<"suld.b.a1d.b16.clamp", Int16Regs>;
+defm SULD_1D_ARRAY_I32_CLAMP
+ : SULD_1D_ARRAY<"suld.b.a1d.b32.clamp", Int32Regs>;
+defm SULD_1D_ARRAY_I64_CLAMP
+ : SULD_1D_ARRAY<"suld.b.a1d.b64.clamp", Int64Regs>;
+
+defm SULD_1D_ARRAY_I8_TRAP
+ : SULD_1D_ARRAY<"suld.b.a1d.b8.trap", Int16Regs>;
+defm SULD_1D_ARRAY_I16_TRAP
+ : SULD_1D_ARRAY<"suld.b.a1d.b16.trap", Int16Regs>;
+defm SULD_1D_ARRAY_I32_TRAP
+ : SULD_1D_ARRAY<"suld.b.a1d.b32.trap", Int32Regs>;
+defm SULD_1D_ARRAY_I64_TRAP
+ : SULD_1D_ARRAY<"suld.b.a1d.b64.trap", Int64Regs>;
+
+defm SULD_1D_ARRAY_I8_ZERO
+ : SULD_1D_ARRAY<"suld.b.a1d.b8.zero", Int16Regs>;
+defm SULD_1D_ARRAY_I16_ZERO
+ : SULD_1D_ARRAY<"suld.b.a1d.b16.zero", Int16Regs>;
+defm SULD_1D_ARRAY_I32_ZERO
+ : SULD_1D_ARRAY<"suld.b.a1d.b32.zero", Int32Regs>;
+defm SULD_1D_ARRAY_I64_ZERO
+ : SULD_1D_ARRAY<"suld.b.a1d.b64.zero", Int64Regs>;
+
+class SULD_2D_base<string inst, NVPTXRegClass outtype, dag surf>
+ : NVPTXInst<(outs outtype:$r),
+ !con(surf, (ins Int32Regs:$x, Int32Regs:$y)),
+ inst # " \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+multiclass SULD_2D<string inst, NVPTXRegClass outtype> {
+ def _R : SULD_2D_base<inst, outtype, (ins Int64Regs:$s)>;
+ def _I : SULD_2D_base<inst, outtype, (ins i64imm:$s)>;
+}
-def SULD_2D_I8_ZERO
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.b8.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_I16_ZERO
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.b16.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_I32_ZERO
- : NVPTXInst<(outs Int32Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.b32.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_I64_ZERO
- : NVPTXInst<(outs Int64Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.b64.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
- []>;
+defm SULD_2D_I8_CLAMP : SULD_2D<"suld.b.2d.b8.clamp", Int16Regs>;
+defm SULD_2D_I16_CLAMP : SULD_2D<"suld.b.2d.b16.clamp", Int16Regs>;
+defm SULD_2D_I32_CLAMP : SULD_2D<"suld.b.2d.b32.clamp", Int32Regs>;
+defm SULD_2D_I64_CLAMP : SULD_2D<"suld.b.2d.b64.clamp", Int64Regs>;
+
+defm SULD_2D_I8_TRAP : SULD_2D<"suld.b.2d.b8.trap", Int16Regs>;
+defm SULD_2D_I16_TRAP : SULD_2D<"suld.b.2d.b16.trap", Int16Regs>;
+defm SULD_2D_I32_TRAP : SULD_2D<"suld.b.2d.b32.trap", Int32Regs>;
+defm SULD_2D_I64_TRAP : SULD_2D<"suld.b.2d.b64.trap", Int64Regs>;
+
+defm SULD_2D_I8_ZERO : SULD_2D<"suld.b.2d.b8.zero", Int16Regs>;
+defm SULD_2D_I16_ZERO : SULD_2D<"suld.b.2d.b16.zero", Int16Regs>;
+defm SULD_2D_I32_ZERO : SULD_2D<"suld.b.2d.b32.zero", Int32Regs>;
+defm SULD_2D_I64_ZERO : SULD_2D<"suld.b.2d.b64.zero", Int64Regs>;
+
+class SULD_2D_ARRAY_base<string inst, NVPTXRegClass outtype, dag surf>
+ : NVPTXInst<(outs outtype:$r),
+ !con(surf, (ins Int32Regs:$l, Int32Regs:$x, Int32Regs:$y)),
+ inst # " \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+multiclass SULD_2D_ARRAY<string inst, NVPTXRegClass outtype> {
+ def _R : SULD_2D_ARRAY_base<inst, outtype, (ins Int64Regs:$s)>;
+ def _I : SULD_2D_ARRAY_base<inst, outtype, (ins i64imm:$s)>;
+}
-def SULD_2D_ARRAY_I8_ZERO
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.b8.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_I16_ZERO
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.b16.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_I32_ZERO
- : NVPTXInst<(outs Int32Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.b32.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_I64_ZERO
- : NVPTXInst<(outs Int64Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.b64.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
- []>;
+defm SULD_2D_ARRAY_I8_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b8.clamp", Int16Regs>;
+defm SULD_2D_ARRAY_I16_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b16.clamp", Int16Regs>;
+defm SULD_2D_ARRAY_I32_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b32.clamp", Int32Regs>;
+defm SULD_2D_ARRAY_I64_CLAMP : SULD_2D_ARRAY<"suld.b.a2d.b64.clamp", Int64Regs>;
+
+defm SULD_2D_ARRAY_I8_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b8.trap", Int16Regs>;
+defm SULD_2D_ARRAY_I16_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b16.trap", Int16Regs>;
+defm SULD_2D_ARRAY_I32_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b32.trap", Int32Regs>;
+defm SULD_2D_ARRAY_I64_TRAP : SULD_2D_ARRAY<"suld.b.a2d.b64.trap", Int64Regs>;
+
+defm SULD_2D_ARRAY_I8_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b8.zero", Int16Regs>;
+defm SULD_2D_ARRAY_I16_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b16.zero", Int16Regs>;
+defm SULD_2D_ARRAY_I32_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b32.zero", Int32Regs>;
+defm SULD_2D_ARRAY_I64_ZERO : SULD_2D_ARRAY<"suld.b.a2d.b64.zero", Int64Regs>;
+
+class SULD_3D_base<string inst, NVPTXRegClass outtype, dag surf>
+ : NVPTXInst<(outs outtype:$r),
+ !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z)),
+ inst # " \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+multiclass SULD_3D<string inst, NVPTXRegClass outtype> {
+ def _R : SULD_3D_base<inst, outtype, (ins Int64Regs:$s)>;
+ def _I : SULD_3D_base<inst, outtype, (ins i64imm:$s)>;
+}
-def SULD_3D_I8_ZERO
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.b8.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_I16_ZERO
- : NVPTXInst<(outs Int16Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.b16.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_I32_ZERO
- : NVPTXInst<(outs Int32Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.b32.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_I64_ZERO
- : NVPTXInst<(outs Int64Regs:$r),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.b64.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
+defm SULD_3D_I8_CLAMP : SULD_3D<"suld.b.3d.b8.clamp", Int16Regs>;
+defm SULD_3D_I16_CLAMP : SULD_3D<"suld.b.3d.b16.clamp", Int16Regs>;
+defm SULD_3D_I32_CLAMP : SULD_3D<"suld.b.3d.b32.clamp", Int32Regs>;
+defm SULD_3D_I64_CLAMP : SULD_3D<"suld.b.3d.b64.clamp", Int64Regs>;
+
+defm SULD_3D_I8_TRAP : SULD_3D<"suld.b.3d.b8.trap", Int16Regs>;
+defm SULD_3D_I16_TRAP : SULD_3D<"suld.b.3d.b16.trap", Int16Regs>;
+defm SULD_3D_I32_TRAP : SULD_3D<"suld.b.3d.b32.trap", Int32Regs>;
+defm SULD_3D_I64_TRAP : SULD_3D<"suld.b.3d.b64.trap", Int64Regs>;
+
+defm SULD_3D_I8_ZERO : SULD_3D<"suld.b.3d.b8.zero", Int16Regs>;
+defm SULD_3D_I16_ZERO : SULD_3D<"suld.b.3d.b16.zero", Int16Regs>;
+defm SULD_3D_I32_ZERO : SULD_3D<"suld.b.3d.b32.zero", Int32Regs>;
+defm SULD_3D_I64_ZERO : SULD_3D<"suld.b.3d.b64.zero", Int64Regs>;
}
let IsSuld = 2 in {
-def SULD_1D_V2I8_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_V2I16_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_V2I32_ZERO
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_V2I64_ZERO
- : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_ARRAY_V2I8_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_V2I16_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_V2I32_ZERO
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_V2I64_ZERO
- : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
- []>;
+class SULD_1D_V2_base<string inst, NVPTXRegClass outtype, dag surf>
+ : NVPTXInst<(outs outtype:$r, outtype:$g),
+ !con(surf, (ins Int32Regs:$x)),
+ inst # " \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+multiclass SULD_1D_V2<string inst, NVPTXRegClass outtype> {
+ def _R : SULD_1D_V2_base<inst, outtype, (ins Int64Regs:$s)>;
+ def _I : SULD_1D_V2_base<inst, outtype, (ins i64imm:$s)>;
+}
-def SULD_2D_V2I8_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_V2I16_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_V2I32_ZERO
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_V2I64_ZERO
- : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
- []>;
+defm SULD_1D_V2I8_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b8.clamp", Int16Regs>;
+defm SULD_1D_V2I16_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b16.clamp", Int16Regs>;
+defm SULD_1D_V2I32_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b32.clamp", Int32Regs>;
+defm SULD_1D_V2I64_CLAMP : SULD_1D_V2<"suld.b.1d.v2.b64.clamp", Int64Regs>;
+
+defm SULD_1D_V2I8_TRAP : SULD_1D_V2<"suld.b.1d.v2.b8.trap", Int16Regs>;
+defm SULD_1D_V2I16_TRAP : SULD_1D_V2<"suld.b.1d.v2.b16.trap", Int16Regs>;
+defm SULD_1D_V2I32_TRAP : SULD_1D_V2<"suld.b.1d.v2.b32.trap", Int32Regs>;
+defm SULD_1D_V2I64_TRAP : SULD_1D_V2<"suld.b.1d.v2.b64.trap", Int64Regs>;
+
+defm SULD_1D_V2I8_ZERO : SULD_1D_V2<"suld.b.1d.v2.b8.zero", Int16Regs>;
+defm SULD_1D_V2I16_ZERO : SULD_1D_V2<"suld.b.1d.v2.b16.zero", Int16Regs>;
+defm SULD_1D_V2I32_ZERO : SULD_1D_V2<"suld.b.1d.v2.b32.zero", Int32Regs>;
+defm SULD_1D_V2I64_ZERO : SULD_1D_V2<"suld.b.1d.v2.b64.zero", Int64Regs>;
+
+class SULD_1D_ARRAY_V2_base<string inst, NVPTXRegClass outtype, dag surf>
+ : NVPTXInst<(outs outtype:$r, outtype:$g),
+ !con(surf, (ins Int32Regs:$l, Int32Regs:$x)),
+ inst # " \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+multiclass SULD_1D_ARRAY_V2<string inst, NVPTXRegClass outtype> {
+ def _R : SULD_1D_ARRAY_V2_base<inst, outtype, (ins Int64Regs:$s)>;
+ def _I : SULD_1D_ARRAY_V2_base<inst, outtype, (ins i64imm:$s)>;
+}
-def SULD_2D_ARRAY_V2I8_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v2.b8.zero \\{$r, $g\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_V2I16_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v2.b16.zero \\{$r, $g\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_V2I32_ZERO
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v2.b32.zero \\{$r, $g\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_V2I64_ZERO
- : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v2.b64.zero \\{$r, $g\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
+defm SULD_1D_ARRAY_V2I8_CLAMP
+ : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b8.clamp", Int16Regs>;
+defm SULD_1D_ARRAY_V2I16_CLAMP
+ : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b16.clamp", Int16Regs>;
+defm SULD_1D_ARRAY_V2I32_CLAMP
+ : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b32.clamp", Int32Regs>;
+defm SULD_1D_ARRAY_V2I64_CLAMP
+ : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b64.clamp", Int64Regs>;
+
+defm SULD_1D_ARRAY_V2I8_TRAP
+ : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b8.trap", Int16Regs>;
+defm SULD_1D_ARRAY_V2I16_TRAP
+ : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b16.trap", Int16Regs>;
+defm SULD_1D_ARRAY_V2I32_TRAP
+ : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b32.trap", Int32Regs>;
+defm SULD_1D_ARRAY_V2I64_TRAP
+ : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b64.trap", Int64Regs>;
+
+defm SULD_1D_ARRAY_V2I8_ZERO
+ : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b8.zero", Int16Regs>;
+defm SULD_1D_ARRAY_V2I16_ZERO
+ : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b16.zero", Int16Regs>;
+defm SULD_1D_ARRAY_V2I32_ZERO
+ : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b32.zero", Int32Regs>;
+defm SULD_1D_ARRAY_V2I64_ZERO
+ : SULD_1D_ARRAY_V2<"suld.b.a1d.v2.b64.zero", Int64Regs>;
+
+class SULD_2D_V2_base<string inst, NVPTXRegClass outtype, dag surf>
+ : NVPTXInst<(outs outtype:$r, outtype:$g),
+ !con(surf, (ins Int32Regs:$x, Int32Regs:$y)),
+ inst # " \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+multiclass SULD_2D_V2<string inst, NVPTXRegClass outtype> {
+ def _R : SULD_2D_V2_base<inst, outtype, (ins Int64Regs:$s)>;
+ def _I : SULD_2D_V2_base<inst, outtype, (ins i64imm:$s)>;
+}
+
+defm SULD_2D_V2I8_CLAMP
+ : SULD_2D_V2<"suld.b.2d.v2.b8.clamp", Int16Regs>;
+defm SULD_2D_V2I16_CLAMP
+ : SULD_2D_V2<"suld.b.2d.v2.b16.clamp", Int16Regs>;
+defm SULD_2D_V2I32_CLAMP
+ : SULD_2D_V2<"suld.b.2d.v2.b32.clamp", Int32Regs>;
+defm SULD_2D_V2I64_CLAMP
+ : SULD_2D_V2<"suld.b.2d.v2.b64.clamp", Int64Regs>;
+
+defm SULD_2D_V2I8_TRAP
+ : SULD_2D_V2<"suld.b.2d.v2.b8.trap", Int16Regs>;
+defm SULD_2D_V2I16_TRAP
+ : SULD_2D_V2<"suld.b.2d.v2.b16.trap", Int16Regs>;
+defm SULD_2D_V2I32_TRAP
+ : SULD_2D_V2<"suld.b.2d.v2.b32.trap", Int32Regs>;
+defm SULD_2D_V2I64_TRAP
+ : SULD_2D_V2<"suld.b.2d.v2.b64.trap", Int64Regs>;
+
+defm SULD_2D_V2I8_ZERO
+ : SULD_2D_V2<"suld.b.2d.v2.b8.zero", Int16Regs>;
+defm SULD_2D_V2I16_ZERO
+ : SULD_2D_V2<"suld.b.2d.v2.b16.zero", Int16Regs>;
+defm SULD_2D_V2I32_ZERO
+ : SULD_2D_V2<"suld.b.2d.v2.b32.zero", Int32Regs>;
+defm SULD_2D_V2I64_ZERO
+ : SULD_2D_V2<"suld.b.2d.v2.b64.zero", Int64Regs>;
+
+class SULD_2D_ARRAY_V2_base<string inst, NVPTXRegClass outtype, dag surf>
+ : NVPTXInst<(outs outtype:$r, outtype:$g),
+ !con(surf, (ins Int32Regs:$l, Int32Regs:$x, Int32Regs:$y)),
+ inst # " \\{$r, $g\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+multiclass SULD_2D_ARRAY_V2<string inst, NVPTXRegClass outtype> {
+ def _R : SULD_2D_ARRAY_V2_base<inst, outtype, (ins Int64Regs:$s)>;
+ def _I : SULD_2D_ARRAY_V2_base<inst, outtype, (ins i64imm:$s)>;
+}
+
+defm SULD_2D_ARRAY_V2I8_CLAMP
+ : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b8.clamp", Int16Regs>;
+defm SULD_2D_ARRAY_V2I16_CLAMP
+ : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b16.clamp", Int16Regs>;
+defm SULD_2D_ARRAY_V2I32_CLAMP
+ : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b32.clamp", Int32Regs>;
+defm SULD_2D_ARRAY_V2I64_CLAMP
+ : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b64.clamp", Int64Regs>;
+
+defm SULD_2D_ARRAY_V2I8_TRAP
+ : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b8.trap", Int16Regs>;
+defm SULD_2D_ARRAY_V2I16_TRAP
+ : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b16.trap", Int16Regs>;
+defm SULD_2D_ARRAY_V2I32_TRAP
+ : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b32.trap", Int32Regs>;
+defm SULD_2D_ARRAY_V2I64_TRAP
+ : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b64.trap", Int64Regs>;
+
+defm SULD_2D_ARRAY_V2I8_ZERO
+ : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b8.zero", Int16Regs>;
+defm SULD_2D_ARRAY_V2I16_ZERO
+ : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b16.zero", Int16Regs>;
+defm SULD_2D_ARRAY_V2I32_ZERO
+ : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b32.zero", Int32Regs>;
+defm SULD_2D_ARRAY_V2I64_ZERO
+ : SULD_2D_ARRAY_V2<"suld.b.a2d.v2.b64.zero", Int64Regs>;
+
+class SULD_3D_V2_base<string inst, NVPTXRegClass outtype, dag surf>
+ : NVPTXInst<(outs outtype:$r, outtype:$g),
+ !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z)),
+ inst # " \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+multiclass SULD_3D_V2<string inst, NVPTXRegClass outtype> {
+ def _R : SULD_3D_V2_base<inst, outtype, (ins Int64Regs:$s)>;
+ def _I : SULD_3D_V2_base<inst, outtype, (ins i64imm:$s)>;
+}
+
+defm SULD_3D_V2I8_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b8.clamp", Int16Regs>;
+defm SULD_3D_V2I16_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b16.clamp", Int16Regs>;
+defm SULD_3D_V2I32_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b32.clamp", Int32Regs>;
+defm SULD_3D_V2I64_CLAMP : SULD_3D_V2<"suld.b.3d.v2.b64.clamp", Int64Regs>;
+
+defm SULD_3D_V2I8_TRAP : SULD_3D_V2<"suld.b.3d.v2.b8.trap", Int16Regs>;
+defm SULD_3D_V2I16_TRAP : SULD_3D_V2<"suld.b.3d.v2.b16.trap", Int16Regs>;
+defm SULD_3D_V2I32_TRAP : SULD_3D_V2<"suld.b.3d.v2.b32.trap", Int32Regs>;
+defm SULD_3D_V2I64_TRAP : SULD_3D_V2<"suld.b.3d.v2.b64.trap", Int64Regs>;
+
+defm SULD_3D_V2I8_ZERO : SULD_3D_V2<"suld.b.3d.v2.b8.zero", Int16Regs>;
+defm SULD_3D_V2I16_ZERO : SULD_3D_V2<"suld.b.3d.v2.b16.zero", Int16Regs>;
+defm SULD_3D_V2I32_ZERO : SULD_3D_V2<"suld.b.3d.v2.b32.zero", Int32Regs>;
+defm SULD_3D_V2I64_ZERO : SULD_3D_V2<"suld.b.3d.v2.b64.zero", Int64Regs>;
-def SULD_3D_V2I8_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_V2I16_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_V2I32_ZERO
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_V2I64_ZERO
- : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
- []>;
}
let IsSuld = 3 in {
-def SULD_1D_V4I8_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v4.b8.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_V4I16_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v4.b16.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_V4I32_ZERO
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x),
- "suld.b.1d.v4.b32.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
- []>;
-def SULD_1D_ARRAY_V4I8_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v4.b8.zero \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_V4I16_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v4.b16.zero \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$l, $x\\}];",
- []>;
-def SULD_1D_ARRAY_V4I32_ZERO
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "suld.b.a1d.v4.b32.zero \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$l, $x\\}];",
- []>;
+class SULD_1D_V4_base<string inst, NVPTXRegClass outtype, dag surf>
+ : NVPTXInst<(outs outtype:$r, outtype:$g, outtype:$b, outtype:$a),
+ !con(surf, (ins Int32Regs:$x)),
+ inst # " \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+multiclass SULD_1D_V4<string inst, NVPTXRegClass outtype> {
+ def _R : SULD_1D_V4_base<inst, outtype, (ins Int64Regs:$s)>;
+ def _I : SULD_1D_V4_base<inst, outtype, (ins i64imm:$s)>;
+}
-def SULD_2D_V4I8_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v4.b8.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_V4I16_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v4.b16.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
- []>;
-def SULD_2D_V4I32_ZERO
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "suld.b.2d.v4.b32.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
- []>;
+defm SULD_1D_V4I8_CLAMP : SULD_1D_V4<"suld.b.1d.v4.b8.clamp", Int16Regs>;
+defm SULD_1D_V4I16_CLAMP : SULD_1D_V4<"suld.b.1d.v4.b16.clamp", Int16Regs>;
+defm SULD_1D_V4I32_CLAMP : SULD_1D_V4<"suld.b.1d.v4.b32.clamp", Int32Regs>;
+
+defm SULD_1D_V4I8_TRAP : SULD_1D_V4<"suld.b.1d.v4.b8.trap", Int16Regs>;
+defm SULD_1D_V4I16_TRAP : SULD_1D_V4<"suld.b.1d.v4.b16.trap", Int16Regs>;
+defm SULD_1D_V4I32_TRAP : SULD_1D_V4<"suld.b.1d.v4.b32.trap", Int32Regs>;
+
+defm SULD_1D_V4I8_ZERO : SULD_1D_V4<"suld.b.1d.v4.b8.zero", Int16Regs>;
+defm SULD_1D_V4I16_ZERO : SULD_1D_V4<"suld.b.1d.v4.b16.zero", Int16Regs>;
+defm SULD_1D_V4I32_ZERO : SULD_1D_V4<"suld.b.1d.v4.b32.zero", Int32Regs>;
+
+class SULD_1D_ARRAY_V4_base<string inst, NVPTXRegClass outtype, dag surf>
+ : NVPTXInst<(outs outtype:$r, outtype:$g, outtype:$b, outtype:$a),
+ !con(surf, (ins Int32Regs:$l, Int32Regs:$x)),
+ inst # " \\{$r, $g, $b, $a\\}, [$s, \\{$l, $x\\}];",
+ []>;
+multiclass SULD_1D_ARRAY_V4<string inst, NVPTXRegClass outtype> {
+ def _R : SULD_1D_ARRAY_V4_base<inst, outtype, (ins Int64Regs:$s)>;
+ def _I : SULD_1D_ARRAY_V4_base<inst, outtype, (ins i64imm:$s)>;
+}
-def SULD_2D_ARRAY_V4I8_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v4.b8.zero \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_V4I16_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v4.b16.zero \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
-def SULD_2D_ARRAY_V4I32_ZERO
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
- "suld.b.a2d.v4.b32.zero \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$l, $x, $y, $y\\}];",
- []>;
+defm SULD_1D_ARRAY_V4I8_CLAMP
+ : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b8.clamp", Int16Regs>;
+defm SULD_1D_ARRAY_V4I16_CLAMP
+ : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b16.clamp", Int16Regs>;
+defm SULD_1D_ARRAY_V4I32_CLAMP
+ : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b32.clamp", Int32Regs>;
+
+defm SULD_1D_ARRAY_V4I8_TRAP
+ : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b8.trap", Int16Regs>;
+defm SULD_1D_ARRAY_V4I16_TRAP
+ : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b16.trap", Int16Regs>;
+defm SULD_1D_ARRAY_V4I32_TRAP
+ : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b32.trap", Int32Regs>;
+
+defm SULD_1D_ARRAY_V4I8_ZERO
+ : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b8.zero", Int16Regs>;
+defm SULD_1D_ARRAY_V4I16_ZERO
+ : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b16.zero", Int16Regs>;
+defm SULD_1D_ARRAY_V4I32_ZERO
+ : SULD_1D_ARRAY_V4<"suld.b.a1d.v4.b32.zero", Int32Regs>;
+
+class SULD_2D_V4_base<string inst, NVPTXRegClass outtype, dag surf>
+ : NVPTXInst<(outs outtype:$r, outtype:$g, outtype:$b, outtype:$a),
+ !con(surf, (ins Int32Regs:$x, Int32Regs:$y)),
+ inst # " \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+multiclass SULD_2D_V4<string inst, NVPTXRegClass outtype> {
+ def _R : SULD_2D_V4_base<inst, outtype, (ins Int64Regs:$s)>;
+ def _I : SULD_2D_V4_base<inst, outtype, (ins i64imm:$s)>;
+}
+defm SULD_2D_V4I8_CLAMP : SULD_2D_V4<"suld.b.2d.v4.b8.clamp", Int16Regs>;
+defm SULD_2D_V4I16_CLAMP : SULD_2D_V4<"suld.b.2d.v4.b16.clamp", Int16Regs>;
+defm SULD_2D_V4I32_CLAMP : SULD_2D_V4<"suld.b.2d.v4.b32.clamp", Int32Regs>;
+
+defm SULD_2D_V4I8_TRAP : SULD_2D_V4<"suld.b.2d.v4.b8.trap", Int16Regs>;
+defm SULD_2D_V4I16_TRAP : SULD_2D_V4<"suld.b.2d.v4.b16.trap", Int16Regs>;
+defm SULD_2D_V4I32_TRAP : SULD_2D_V4<"suld.b.2d.v4.b32.trap", Int32Regs>;
+
+defm SULD_2D_V4I8_ZERO : SULD_2D_V4<"suld.b.2d.v4.b8.zero", Int16Regs>;
+defm SULD_2D_V4I16_ZERO : SULD_2D_V4<"suld.b.2d.v4.b16.zero", Int16Regs>;
+defm SULD_2D_V4I32_ZERO : SULD_2D_V4<"suld.b.2d.v4.b32.zero", Int32Regs>;
+
+class SULD_2D_ARRAY_V4_base<string inst, NVPTXRegClass outtype, dag surf>
+ : NVPTXInst<(outs outtype:$r, outtype:$g, outtype:$b, outtype:$a),
+ !con(surf, (ins Int32Regs:$l, Int32Regs:$x, Int32Regs:$y)),
+ inst # " \\{$r, $g, $b, $a\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+multiclass SULD_2D_ARRAY_V4<string inst, NVPTXRegClass outtype> {
+ def _R : SULD_2D_ARRAY_V4_base<inst, outtype, (ins Int64Regs:$s)>;
+ def _I : SULD_2D_ARRAY_V4_base<inst, outtype, (ins i64imm:$s)>;
+}
+
+defm SULD_2D_ARRAY_V4I8_CLAMP
+ : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b8.clamp", Int16Regs>;
+defm SULD_2D_ARRAY_V4I16_CLAMP
+ : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b16.clamp", Int16Regs>;
+defm SULD_2D_ARRAY_V4I32_CLAMP
+ : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b32.clamp", Int32Regs>;
+
+defm SULD_2D_ARRAY_V4I8_TRAP
+ : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b8.trap", Int16Regs>;
+defm SULD_2D_ARRAY_V4I16_TRAP
+ : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b16.trap", Int16Regs>;
+defm SULD_2D_ARRAY_V4I32_TRAP
+ : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b32.trap", Int32Regs>;
+
+defm SULD_2D_ARRAY_V4I8_ZERO
+ : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b8.zero", Int16Regs>;
+defm SULD_2D_ARRAY_V4I16_ZERO
+ : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b16.zero", Int16Regs>;
+defm SULD_2D_ARRAY_V4I32_ZERO
+ : SULD_2D_ARRAY_V4<"suld.b.a2d.v4.b32.zero", Int32Regs>;
+
+class SULD_3D_V4_base<string inst, NVPTXRegClass outtype, dag surf>
+ : NVPTXInst<(outs outtype:$r, outtype:$g, outtype:$b, outtype:$a),
+ !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z)),
+ inst # " \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+multiclass SULD_3D_V4<string inst, NVPTXRegClass outtype> {
+ def _R : SULD_3D_V4_base<inst, outtype, (ins Int64Regs:$s)>;
+ def _I : SULD_3D_V4_base<inst, outtype, (ins i64imm:$s)>;
+}
+
+defm SULD_3D_V4I8_CLAMP : SULD_3D_V4<"suld.b.3d.v4.b8.clamp", Int16Regs>;
+defm SULD_3D_V4I16_CLAMP : SULD_3D_V4<"suld.b.3d.v4.b16.clamp", Int16Regs>;
+defm SULD_3D_V4I32_CLAMP : SULD_3D_V4<"suld.b.3d.v4.b32.clamp", Int32Regs>;
+
+defm SULD_3D_V4I8_TRAP : SULD_3D_V4<"suld.b.3d.v4.b8.trap", Int16Regs>;
+defm SULD_3D_V4I16_TRAP : SULD_3D_V4<"suld.b.3d.v4.b16.trap", Int16Regs>;
+defm SULD_3D_V4I32_TRAP : SULD_3D_V4<"suld.b.3d.v4.b32.trap", Int32Regs>;
+
+defm SULD_3D_V4I8_ZERO : SULD_3D_V4<"suld.b.3d.v4.b8.zero", Int16Regs>;
+defm SULD_3D_V4I16_ZERO : SULD_3D_V4<"suld.b.3d.v4.b16.zero", Int16Regs>;
+defm SULD_3D_V4I32_ZERO : SULD_3D_V4<"suld.b.3d.v4.b32.zero", Int32Regs>;
-def SULD_3D_V4I8_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v4.b8.zero \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_V4I16_ZERO
- : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v4.b16.zero \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$x, $y, $z, $z\\}];",
- []>;
-def SULD_3D_V4I32_ZERO
- : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
- "suld.b.3d.v4.b32.zero \\{$r, $g, $b, $a\\}, "
- "[$s, \\{$x, $y, $z, $z\\}];",
- []>;
}
//-----------------------------------
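
The hunk above applies a single TableGen idiom throughout: each former standalone def becomes a small _base class that takes the texture or surface handle as a dag parameter, and a multiclass instantiates it twice, once as _R with the handle in a 64-bit register and once as _I with the handle as a 64-bit immediate, while !con splices the handle operand onto the remaining coordinate operands. A minimal sketch of that shape, assuming the surrounding NVPTX TableGen definitions; the names EXAMPLE_base, EXAMPLE and EXAMPLE_I32_TRAP are illustrative only and not part of this change:

class EXAMPLE_base<string inst, NVPTXRegClass outtype, dag surf>
  : NVPTXInst<(outs outtype:$r),
              !con(surf, (ins Int32Regs:$x)), // append the coordinate to the handle operand
              inst # " \\{$r\\}, [$s, \\{$x\\}];",
              []>;
multiclass EXAMPLE<string inst, NVPTXRegClass outtype> {
  def _R : EXAMPLE_base<inst, outtype, (ins Int64Regs:$s)>; // handle held in a register
  def _I : EXAMPLE_base<inst, outtype, (ins i64imm:$s)>;    // handle as an immediate
}
// The defm expands to two records, EXAMPLE_I32_TRAP_R and EXAMPLE_I32_TRAP_I,
// which is why later selection patterns in this change name the _R forms explicitly.
defm EXAMPLE_I32_TRAP : EXAMPLE<"suld.b.1d.b32.trap", Int32Regs>;
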
@@ -4769,56 +4028,88 @@ def SULD_3D_V4I32_ZERO
//-----------------------------------
let IsSurfTexQuery = true in {
-def TXQ_CHANNEL_ORDER
+def TXQ_CHANNEL_ORDER_R
: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
"txq.channel_order.b32 \t$d, [$a];",
[]>;
-def TXQ_CHANNEL_DATA_TYPE
+def TXQ_CHANNEL_ORDER_I
+ : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
+ "txq.channel_order.b32 \t$d, [$a];",
+ []>;
+def TXQ_CHANNEL_DATA_TYPE_R
: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
"txq.channel_data_type.b32 \t$d, [$a];",
[]>;
-def TXQ_WIDTH
+def TXQ_CHANNEL_DATA_TYPE_I
+ : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
+ "txq.channel_data_type.b32 \t$d, [$a];",
+ []>;
+def TXQ_WIDTH_R
: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
"txq.width.b32 \t$d, [$a];",
[]>;
-def TXQ_HEIGHT
+def TXQ_WIDTH_I
+ : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
+ "txq.width.b32 \t$d, [$a];",
+ []>;
+def TXQ_HEIGHT_R
: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
"txq.height.b32 \t$d, [$a];",
[]>;
-def TXQ_DEPTH
+def TXQ_HEIGHT_I
+ : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
+ "txq.height.b32 \t$d, [$a];",
+ []>;
+def TXQ_DEPTH_R
: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
"txq.depth.b32 \t$d, [$a];",
[]>;
-def TXQ_ARRAY_SIZE
+def TXQ_DEPTH_I
+ : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
+ "txq.depth.b32 \t$d, [$a];",
+ []>;
+def TXQ_ARRAY_SIZE_R
: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
"txq.array_size.b32 \t$d, [$a];",
[]>;
-def TXQ_NUM_SAMPLES
+def TXQ_ARRAY_SIZE_I
+ : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
+ "txq.array_size.b32 \t$d, [$a];",
+ []>;
+def TXQ_NUM_SAMPLES_R
: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
"txq.num_samples.b32 \t$d, [$a];",
[]>;
-def TXQ_NUM_MIPMAP_LEVELS
+def TXQ_NUM_SAMPLES_I
+ : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
+ "txq.num_samples.b32 \t$d, [$a];",
+ []>;
+def TXQ_NUM_MIPMAP_LEVELS_R
: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
"txq.num_mipmap_levels.b32 \t$d, [$a];",
[]>;
+def TXQ_NUM_MIPMAP_LEVELS_I
+ : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
+ "txq.num_mipmap_levels.b32 \t$d, [$a];",
+ []>;
}
def : Pat<(int_nvvm_txq_channel_order Int64Regs:$a),
- (TXQ_CHANNEL_ORDER Int64Regs:$a)>;
+ (TXQ_CHANNEL_ORDER_R Int64Regs:$a)>;
def : Pat<(int_nvvm_txq_channel_data_type Int64Regs:$a),
- (TXQ_CHANNEL_DATA_TYPE Int64Regs:$a)>;
+ (TXQ_CHANNEL_DATA_TYPE_R Int64Regs:$a)>;
def : Pat<(int_nvvm_txq_width Int64Regs:$a),
- (TXQ_WIDTH Int64Regs:$a)>;
+ (TXQ_WIDTH_R Int64Regs:$a)>;
def : Pat<(int_nvvm_txq_height Int64Regs:$a),
- (TXQ_HEIGHT Int64Regs:$a)>;
+ (TXQ_HEIGHT_R Int64Regs:$a)>;
def : Pat<(int_nvvm_txq_depth Int64Regs:$a),
- (TXQ_DEPTH Int64Regs:$a)>;
+ (TXQ_DEPTH_R Int64Regs:$a)>;
def : Pat<(int_nvvm_txq_array_size Int64Regs:$a),
- (TXQ_ARRAY_SIZE Int64Regs:$a)>;
+ (TXQ_ARRAY_SIZE_R Int64Regs:$a)>;
def : Pat<(int_nvvm_txq_num_samples Int64Regs:$a),
- (TXQ_NUM_SAMPLES Int64Regs:$a)>;
+ (TXQ_NUM_SAMPLES_R Int64Regs:$a)>;
def : Pat<(int_nvvm_txq_num_mipmap_levels Int64Regs:$a),
- (TXQ_NUM_MIPMAP_LEVELS Int64Regs:$a)>;
+ (TXQ_NUM_MIPMAP_LEVELS_R Int64Regs:$a)>;
//-----------------------------------
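
For the texture query instructions, the rewritten def : Pat lines in the hunk above keep lowering the nvvm query intrinsics to the register (_R) variants; no pattern in this excerpt matches the new _I forms. The shape, restated verbatim from the hunk above so the names are real:

def : Pat<(int_nvvm_txq_width Int64Regs:$a),
          (TXQ_WIDTH_R Int64Regs:$a)>;
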
@@ -4826,44 +4117,68 @@ def : Pat<(int_nvvm_txq_num_mipmap_levels Int64Regs:$a),
//-----------------------------------
let IsSurfTexQuery = true in {
-def SUQ_CHANNEL_ORDER
+def SUQ_CHANNEL_ORDER_R
: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
"suq.channel_order.b32 \t$d, [$a];",
[]>;
-def SUQ_CHANNEL_DATA_TYPE
+def SUQ_CHANNEL_ORDER_I
+ : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
+ "suq.channel_order.b32 \t$d, [$a];",
+ []>;
+def SUQ_CHANNEL_DATA_TYPE_R
: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
"suq.channel_data_type.b32 \t$d, [$a];",
[]>;
-def SUQ_WIDTH
+def SUQ_CHANNEL_DATA_TYPE_I
+ : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
+ "suq.channel_data_type.b32 \t$d, [$a];",
+ []>;
+def SUQ_WIDTH_R
: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
"suq.width.b32 \t$d, [$a];",
[]>;
-def SUQ_HEIGHT
+def SUQ_WIDTH_I
+ : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
+ "suq.width.b32 \t$d, [$a];",
+ []>;
+def SUQ_HEIGHT_R
: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
"suq.height.b32 \t$d, [$a];",
[]>;
-def SUQ_DEPTH
+def SUQ_HEIGHT_I
+ : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
+ "suq.height.b32 \t$d, [$a];",
+ []>;
+def SUQ_DEPTH_R
: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
"suq.depth.b32 \t$d, [$a];",
[]>;
-def SUQ_ARRAY_SIZE
+def SUQ_DEPTH_I
+ : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
+ "suq.depth.b32 \t$d, [$a];",
+ []>;
+def SUQ_ARRAY_SIZE_R
: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
"suq.array_size.b32 \t$d, [$a];",
[]>;
+def SUQ_ARRAY_SIZE_I
+ : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
+ "suq.array_size.b32 \t$d, [$a];",
+ []>;
}
def : Pat<(int_nvvm_suq_channel_order Int64Regs:$a),
- (SUQ_CHANNEL_ORDER Int64Regs:$a)>;
+ (SUQ_CHANNEL_ORDER_R Int64Regs:$a)>;
def : Pat<(int_nvvm_suq_channel_data_type Int64Regs:$a),
- (SUQ_CHANNEL_DATA_TYPE Int64Regs:$a)>;
+ (SUQ_CHANNEL_DATA_TYPE_R Int64Regs:$a)>;
def : Pat<(int_nvvm_suq_width Int64Regs:$a),
- (SUQ_WIDTH Int64Regs:$a)>;
+ (SUQ_WIDTH_R Int64Regs:$a)>;
def : Pat<(int_nvvm_suq_height Int64Regs:$a),
- (SUQ_HEIGHT Int64Regs:$a)>;
+ (SUQ_HEIGHT_R Int64Regs:$a)>;
def : Pat<(int_nvvm_suq_depth Int64Regs:$a),
- (SUQ_DEPTH Int64Regs:$a)>;
+ (SUQ_DEPTH_R Int64Regs:$a)>;
def : Pat<(int_nvvm_suq_array_size Int64Regs:$a),
- (SUQ_ARRAY_SIZE Int64Regs:$a)>;
+ (SUQ_ARRAY_SIZE_R Int64Regs:$a)>;
//===- Handle Query -------------------------------------------------------===//
@@ -4885,1329 +4200,522 @@ def ISTYPEP_TEXTURE
//===- Surface Stores -----------------------------------------------------===//
let IsSust = true in {
-// Unformatted
-// .clamp variant
-def SUST_B_1D_B8_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
- "sust.b.1d.b8.clamp \t[$s, \\{$x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_B16_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
- "sust.b.1d.b16.clamp \t[$s, \\{$x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_B32_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
- "sust.b.1d.b32.clamp \t[$s, \\{$x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_B64_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
- "sust.b.1d.b64.clamp \t[$s, \\{$x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_V2B8_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- "sust.b.1d.v2.b8.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_V2B16_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- "sust.b.1d.v2.b16.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_V2B32_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
- "sust.b.1d.v2.b32.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_V2B64_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
- "sust.b.1d.v2.b64.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_V4B8_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
- Int16Regs:$b, Int16Regs:$a),
- "sust.b.1d.v4.b8.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_1D_V4B16_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
- Int16Regs:$b, Int16Regs:$a),
- "sust.b.1d.v4.b16.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_1D_V4B32_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- "sust.b.1d.v4.b32.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
- []>;
-
-
-def SUST_B_1D_ARRAY_B8_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
- "sust.b.a1d.b8.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_ARRAY_B16_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
- "sust.b.a1d.b16.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_ARRAY_B32_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
- "sust.b.a1d.b32.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_ARRAY_B64_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r),
- "sust.b.a1d.b64.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_ARRAY_V2B8_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
- Int16Regs:$g),
- "sust.b.a1d.v2.b8.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_ARRAY_V2B16_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
- Int16Regs:$g),
- "sust.b.a1d.v2.b16.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_ARRAY_V2B32_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
- Int32Regs:$g),
- "sust.b.a1d.v2.b32.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_ARRAY_V2B64_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r,
- Int64Regs:$g),
- "sust.b.a1d.v2.b64.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_ARRAY_V4B8_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
- Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.a1d.v4.b8.clamp \t[$s, \\{$idx, $x\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_1D_ARRAY_V4B16_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
- Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.a1d.v4.b16.clamp \t[$s, \\{$idx, $x\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_1D_ARRAY_V4B32_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
- Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- "sust.b.a1d.v4.b32.clamp \t[$s, \\{$idx, $x\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-
-
-def SUST_B_2D_B8_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- "sust.b.2d.b8.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_B16_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- "sust.b.2d.b16.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_B32_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
- "sust.b.2d.b32.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_B64_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
- "sust.b.2d.b64.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_V2B8_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
- Int16Regs:$g),
- "sust.b.2d.v2.b8.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_2D_V2B16_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
- Int16Regs:$g),
- "sust.b.2d.v2.b16.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_2D_V2B32_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
- Int32Regs:$g),
- "sust.b.2d.v2.b32.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_2D_V2B64_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
- Int64Regs:$g),
- "sust.b.2d.v2.b64.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_2D_V4B8_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
- Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.2d.v4.b8.clamp \t[$s, \\{$x, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_2D_V4B16_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
- Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.2d.v4.b16.clamp \t[$s, \\{$x, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_2D_V4B32_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
- Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- "sust.b.2d.v4.b32.clamp \t[$s, \\{$x, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-
-
-def SUST_B_2D_ARRAY_B8_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r),
- "sust.b.a2d.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_ARRAY_B16_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r),
- "sust.b.a2d.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_ARRAY_B32_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int32Regs:$r),
- "sust.b.a2d.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_ARRAY_B64_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int64Regs:$r),
- "sust.b.a2d.b64.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_ARRAY_V2B8_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r, Int16Regs:$g),
- "sust.b.a2d.v2.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_2D_ARRAY_V2B16_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r, Int16Regs:$g),
- "sust.b.a2d.v2.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_2D_ARRAY_V2B32_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int32Regs:$r, Int32Regs:$g),
- "sust.b.a2d.v2.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_2D_ARRAY_V2B64_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int64Regs:$r, Int64Regs:$g),
- "sust.b.a2d.v2.b64.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_2D_ARRAY_V4B8_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.a2d.v4.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_2D_ARRAY_V4B16_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.a2d.v4.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_2D_ARRAY_V4B32_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- "sust.b.a2d.v4.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-
-
-def SUST_B_3D_B8_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r),
- "sust.b.3d.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
- []>;
-def SUST_B_3D_B16_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r),
- "sust.b.3d.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
- []>;
-def SUST_B_3D_B32_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int32Regs:$r),
- "sust.b.3d.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
- []>;
-def SUST_B_3D_B64_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int64Regs:$r),
- "sust.b.3d.b64.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
- []>;
-def SUST_B_3D_V2B8_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r, Int16Regs:$g),
- "sust.b.3d.v2.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_3D_V2B16_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r, Int16Regs:$g),
- "sust.b.3d.v2.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_3D_V2B32_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int32Regs:$r, Int32Regs:$g),
- "sust.b.3d.v2.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_3D_V2B64_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int64Regs:$r, Int64Regs:$g),
- "sust.b.3d.v2.b64.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_3D_V4B8_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.3d.v4.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_3D_V4B16_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.3d.v4.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_3D_V4B32_CLAMP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- "sust.b.3d.v4.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-
-
-// .trap variant
-def SUST_B_1D_B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
- "sust.b.1d.b8.trap \t[$s, \\{$x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
- "sust.b.1d.b16.trap \t[$s, \\{$x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
- "sust.b.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_B64_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
- "sust.b.1d.b64.trap \t[$s, \\{$x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_V2B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- "sust.b.1d.v2.b8.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_V2B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- "sust.b.1d.v2.b16.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_V2B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
- "sust.b.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_V2B64_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
- "sust.b.1d.v2.b64.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_V4B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
- Int16Regs:$b, Int16Regs:$a),
- "sust.b.1d.v4.b8.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_1D_V4B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
- Int16Regs:$b, Int16Regs:$a),
- "sust.b.1d.v4.b16.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_1D_V4B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- "sust.b.1d.v4.b32.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
- []>;
-
-
-def SUST_B_1D_ARRAY_B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
- "sust.b.a1d.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_ARRAY_B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
- "sust.b.a1d.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_ARRAY_B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
- "sust.b.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_ARRAY_B64_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r),
- "sust.b.a1d.b64.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_ARRAY_V2B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
- Int16Regs:$g),
- "sust.b.a1d.v2.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_ARRAY_V2B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
- Int16Regs:$g),
- "sust.b.a1d.v2.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_ARRAY_V2B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
- Int32Regs:$g),
- "sust.b.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_ARRAY_V2B64_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r,
- Int64Regs:$g),
- "sust.b.a1d.v2.b64.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_ARRAY_V4B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
- Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.a1d.v4.b8.trap \t[$s, \\{$idx, $x\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_1D_ARRAY_V4B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
- Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.a1d.v4.b16.trap \t[$s, \\{$idx, $x\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_1D_ARRAY_V4B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
- Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- "sust.b.a1d.v4.b32.trap \t[$s, \\{$idx, $x\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-
-
-def SUST_B_2D_B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- "sust.b.2d.b8.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- "sust.b.2d.b16.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
- "sust.b.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_B64_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
- "sust.b.2d.b64.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_V2B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
- Int16Regs:$g),
- "sust.b.2d.v2.b8.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_2D_V2B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
- Int16Regs:$g),
- "sust.b.2d.v2.b16.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_2D_V2B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
- Int32Regs:$g),
- "sust.b.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_2D_V2B64_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
- Int64Regs:$g),
- "sust.b.2d.v2.b64.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_2D_V4B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
- Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.2d.v4.b8.trap \t[$s, \\{$x, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_2D_V4B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
- Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.2d.v4.b16.trap \t[$s, \\{$x, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_2D_V4B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
- Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- "sust.b.2d.v4.b32.trap \t[$s, \\{$x, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-
-
-def SUST_B_2D_ARRAY_B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r),
- "sust.b.a2d.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_ARRAY_B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r),
- "sust.b.a2d.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_ARRAY_B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int32Regs:$r),
- "sust.b.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_ARRAY_B64_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int64Regs:$r),
- "sust.b.a2d.b64.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_ARRAY_V2B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r, Int16Regs:$g),
- "sust.b.a2d.v2.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_2D_ARRAY_V2B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r, Int16Regs:$g),
- "sust.b.a2d.v2.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_2D_ARRAY_V2B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int32Regs:$r, Int32Regs:$g),
- "sust.b.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_2D_ARRAY_V2B64_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int64Regs:$r, Int64Regs:$g),
- "sust.b.a2d.v2.b64.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_2D_ARRAY_V4B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.a2d.v4.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_2D_ARRAY_V4B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.a2d.v4.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_2D_ARRAY_V4B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- "sust.b.a2d.v4.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-
-
-def SUST_B_3D_B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r),
- "sust.b.3d.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
- []>;
-def SUST_B_3D_B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r),
- "sust.b.3d.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
- []>;
-def SUST_B_3D_B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int32Regs:$r),
- "sust.b.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
- []>;
-def SUST_B_3D_B64_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int64Regs:$r),
- "sust.b.3d.b64.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
- []>;
-def SUST_B_3D_V2B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r, Int16Regs:$g),
- "sust.b.3d.v2.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_3D_V2B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r, Int16Regs:$g),
- "sust.b.3d.v2.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_3D_V2B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int32Regs:$r, Int32Regs:$g),
- "sust.b.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_3D_V2B64_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int64Regs:$r, Int64Regs:$g),
- "sust.b.3d.v2.b64.trap \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_3D_V4B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.3d.v4.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_3D_V4B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.3d.v4.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_3D_V4B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- "sust.b.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-
-
-// .zero variant
-def SUST_B_1D_B8_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
- "sust.b.1d.b8.zero \t[$s, \\{$x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_B16_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
- "sust.b.1d.b16.zero \t[$s, \\{$x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_B32_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
- "sust.b.1d.b32.zero \t[$s, \\{$x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_B64_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
- "sust.b.1d.b64.zero \t[$s, \\{$x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_V2B8_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- "sust.b.1d.v2.b8.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_V2B16_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- "sust.b.1d.v2.b16.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_V2B32_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
- "sust.b.1d.v2.b32.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_V2B64_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
- "sust.b.1d.v2.b64.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_V4B8_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
- Int16Regs:$b, Int16Regs:$a),
- "sust.b.1d.v4.b8.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_1D_V4B16_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
- Int16Regs:$b, Int16Regs:$a),
- "sust.b.1d.v4.b16.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_1D_V4B32_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- "sust.b.1d.v4.b32.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
- []>;
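+// Generic surface store classes. Each SUST_* multiclass below expands to a
+// _R variant, which takes the surface handle in a 64-bit register, and an
+// _I variant, which takes it as a 64-bit immediate.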
+class SUST_1D_base<string inst, NVPTXRegClass intype, dag surf>
+ : NVPTXInst<(outs),
+ !con(surf, (ins Int32Regs:$x, intype:$r)),
+ inst # " \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+multiclass SUST_1D<string inst, NVPTXRegClass intype> {
+ def _R : SUST_1D_base<inst, intype, (ins Int64Regs:$s)>;
+ def _I : SUST_1D_base<inst, intype, (ins i64imm:$s)>;
+}
-def SUST_B_1D_ARRAY_B8_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
- "sust.b.a1d.b8.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_ARRAY_B16_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
- "sust.b.a1d.b16.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_ARRAY_B32_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
- "sust.b.a1d.b32.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_ARRAY_B64_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r),
- "sust.b.a1d.b64.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
- []>;
-def SUST_B_1D_ARRAY_V2B8_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
- Int16Regs:$g),
- "sust.b.a1d.v2.b8.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_ARRAY_V2B16_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
- Int16Regs:$g),
- "sust.b.a1d.v2.b16.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_ARRAY_V2B32_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
- Int32Regs:$g),
- "sust.b.a1d.v2.b32.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_ARRAY_V2B64_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r,
- Int64Regs:$g),
- "sust.b.a1d.v2.b64.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_1D_ARRAY_V4B8_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
- Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.a1d.v4.b8.zero \t[$s, \\{$idx, $x\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_1D_ARRAY_V4B16_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
- Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.a1d.v4.b16.zero \t[$s, \\{$idx, $x\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_1D_ARRAY_V4B32_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
- Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- "sust.b.a1d.v4.b32.zero \t[$s, \\{$idx, $x\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
+defm SUST_B_1D_B8_CLAMP : SUST_1D<"sust.b.1d.b8.clamp", Int16Regs>;
+defm SUST_B_1D_B16_CLAMP : SUST_1D<"sust.b.1d.b16.clamp", Int16Regs>;
+defm SUST_B_1D_B32_CLAMP : SUST_1D<"sust.b.1d.b32.clamp", Int32Regs>;
+defm SUST_B_1D_B64_CLAMP : SUST_1D<"sust.b.1d.b64.clamp", Int64Regs>;
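+
+// For illustration, the first defm above,
+//   defm SUST_B_1D_B8_CLAMP : SUST_1D<"sust.b.1d.b8.clamp", Int16Regs>;
+// expands to two records roughly equivalent to:
+//   def SUST_B_1D_B8_CLAMP_R
+//     : NVPTXInst<(outs), (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+//                 "sust.b.1d.b8.clamp \t[$s, \\{$x\\}], \\{$r\\};", []>;
+//   def SUST_B_1D_B8_CLAMP_I
+//     : NVPTXInst<(outs), (ins i64imm:$s, Int32Regs:$x, Int16Regs:$r),
+//                 "sust.b.1d.b8.clamp \t[$s, \\{$x\\}], \\{$r\\};", []>;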
+
+defm SUST_B_1D_B8_TRAP : SUST_1D<"sust.b.1d.b8.trap", Int16Regs>;
+defm SUST_B_1D_B16_TRAP : SUST_1D<"sust.b.1d.b16.trap", Int16Regs>;
+defm SUST_B_1D_B32_TRAP : SUST_1D<"sust.b.1d.b32.trap", Int32Regs>;
+defm SUST_B_1D_B64_TRAP : SUST_1D<"sust.b.1d.b64.trap", Int64Regs>;
+
+defm SUST_B_1D_B8_ZERO : SUST_1D<"sust.b.1d.b8.zero", Int16Regs>;
+defm SUST_B_1D_B16_ZERO : SUST_1D<"sust.b.1d.b16.zero", Int16Regs>;
+defm SUST_B_1D_B32_ZERO : SUST_1D<"sust.b.1d.b32.zero", Int32Regs>;
+defm SUST_B_1D_B64_ZERO : SUST_1D<"sust.b.1d.b64.zero", Int64Regs>;
+
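+// Formatted (sust.p) stores are defined here only for the .trap variant and
+// the b8/b16/b32 component types.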
+defm SUST_P_1D_B8_TRAP : SUST_1D<"sust.p.1d.b8.trap", Int16Regs>;
+defm SUST_P_1D_B16_TRAP : SUST_1D<"sust.p.1d.b16.trap", Int16Regs>;
+defm SUST_P_1D_B32_TRAP : SUST_1D<"sust.p.1d.b32.trap", Int32Regs>;
+
+class SUST_1D_V2_base<string inst, NVPTXRegClass intype, dag surf>
+ : NVPTXInst<(outs),
+ !con(surf, (ins Int32Regs:$x, intype:$r, intype:$g)),
+ inst # " \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+multiclass SUST_1D_V2<string inst, NVPTXRegClass intype> {
+ def _R : SUST_1D_V2_base<inst, intype, (ins Int64Regs:$s)>;
+ def _I : SUST_1D_V2_base<inst, intype, (ins i64imm:$s)>;
+}
+defm SUST_B_1D_V2B8_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b8.clamp", Int16Regs>;
+defm SUST_B_1D_V2B16_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b16.clamp", Int16Regs>;
+defm SUST_B_1D_V2B32_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b32.clamp", Int32Regs>;
+defm SUST_B_1D_V2B64_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b64.clamp", Int64Regs>;
+
+defm SUST_B_1D_V2B8_TRAP : SUST_1D_V2<"sust.b.1d.v2.b8.trap", Int16Regs>;
+defm SUST_B_1D_V2B16_TRAP : SUST_1D_V2<"sust.b.1d.v2.b16.trap", Int16Regs>;
+defm SUST_B_1D_V2B32_TRAP : SUST_1D_V2<"sust.b.1d.v2.b32.trap", Int32Regs>;
+defm SUST_B_1D_V2B64_TRAP : SUST_1D_V2<"sust.b.1d.v2.b64.trap", Int64Regs>;
+
+defm SUST_B_1D_V2B8_ZERO : SUST_1D_V2<"sust.b.1d.v2.b8.zero", Int16Regs>;
+defm SUST_B_1D_V2B16_ZERO : SUST_1D_V2<"sust.b.1d.v2.b16.zero", Int16Regs>;
+defm SUST_B_1D_V2B32_ZERO : SUST_1D_V2<"sust.b.1d.v2.b32.zero", Int32Regs>;
+defm SUST_B_1D_V2B64_ZERO : SUST_1D_V2<"sust.b.1d.v2.b64.zero", Int64Regs>;
+
+defm SUST_P_1D_V2B8_TRAP : SUST_1D_V2<"sust.p.1d.v2.b8.trap", Int16Regs>;
+defm SUST_P_1D_V2B16_TRAP : SUST_1D_V2<"sust.p.1d.v2.b16.trap", Int16Regs>;
+defm SUST_P_1D_V2B32_TRAP : SUST_1D_V2<"sust.p.1d.v2.b32.trap", Int32Regs>;
+
+class SUST_1D_V4_base<string inst, NVPTXRegClass intype, dag surf>
+ : NVPTXInst<(outs),
+ !con(surf, (ins Int32Regs:$x, intype:$r, intype:$g,
+ intype:$b, intype:$a)),
+ inst # " \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+multiclass SUST_1D_V4<string inst, NVPTXRegClass intype> {
+ def _R : SUST_1D_V4_base<inst, intype, (ins Int64Regs:$s)>;
+ def _I : SUST_1D_V4_base<inst, intype, (ins i64imm:$s)>;
+}
-def SUST_B_2D_B8_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- "sust.b.2d.b8.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_B16_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- "sust.b.2d.b16.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_B32_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
- "sust.b.2d.b32.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_B64_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
- "sust.b.2d.b64.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_V2B8_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
- Int16Regs:$g),
- "sust.b.2d.v2.b8.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_2D_V2B16_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
- Int16Regs:$g),
- "sust.b.2d.v2.b16.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_2D_V2B32_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
- Int32Regs:$g),
- "sust.b.2d.v2.b32.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_2D_V2B64_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
- Int64Regs:$g),
- "sust.b.2d.v2.b64.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
- []>;
-def SUST_B_2D_V4B8_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
- Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.2d.v4.b8.zero \t[$s, \\{$x, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_2D_V4B16_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
- Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.2d.v4.b16.zero \t[$s, \\{$x, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_2D_V4B32_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
- Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- "sust.b.2d.v4.b32.zero \t[$s, \\{$x, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
+defm SUST_B_1D_V4B8_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b8.clamp", Int16Regs>;
+defm SUST_B_1D_V4B16_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b16.clamp", Int16Regs>;
+defm SUST_B_1D_V4B32_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b32.clamp", Int32Regs>;
+
+defm SUST_B_1D_V4B8_TRAP : SUST_1D_V4<"sust.b.1d.v4.b8.trap", Int16Regs>;
+defm SUST_B_1D_V4B16_TRAP : SUST_1D_V4<"sust.b.1d.v4.b16.trap", Int16Regs>;
+defm SUST_B_1D_V4B32_TRAP : SUST_1D_V4<"sust.b.1d.v4.b32.trap", Int32Regs>;
+
+defm SUST_B_1D_V4B8_ZERO : SUST_1D_V4<"sust.b.1d.v4.b8.zero", Int16Regs>;
+defm SUST_B_1D_V4B16_ZERO : SUST_1D_V4<"sust.b.1d.v4.b16.zero", Int16Regs>;
+defm SUST_B_1D_V4B32_ZERO : SUST_1D_V4<"sust.b.1d.v4.b32.zero", Int32Regs>;
+
+defm SUST_P_1D_V4B8_TRAP : SUST_1D_V4<"sust.p.1d.v4.b8.trap", Int16Regs>;
+defm SUST_P_1D_V4B16_TRAP : SUST_1D_V4<"sust.p.1d.v4.b16.trap", Int16Regs>;
+defm SUST_P_1D_V4B32_TRAP : SUST_1D_V4<"sust.p.1d.v4.b32.trap", Int32Regs>;
+
+class SUST_1D_ARRAY_base<string inst, NVPTXRegClass intype, dag surf>
+ : NVPTXInst<(outs),
+ !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, intype:$r)),
+ inst # " \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+multiclass SUST_1D_ARRAY<string inst, NVPTXRegClass intype> {
+ def _R : SUST_1D_ARRAY_base<inst, intype, (ins Int64Regs:$s)>;
+ def _I : SUST_1D_ARRAY_base<inst, intype, (ins i64imm:$s)>;
+}
+defm SUST_B_1D_ARRAY_B8_CLAMP
+ : SUST_1D_ARRAY<"sust.b.a1d.b8.clamp", Int16Regs>;
+defm SUST_B_1D_ARRAY_B16_CLAMP
+ : SUST_1D_ARRAY<"sust.b.a1d.b16.clamp", Int16Regs>;
+defm SUST_B_1D_ARRAY_B32_CLAMP
+ : SUST_1D_ARRAY<"sust.b.a1d.b32.clamp", Int32Regs>;
+defm SUST_B_1D_ARRAY_B64_CLAMP
+ : SUST_1D_ARRAY<"sust.b.a1d.b64.clamp", Int64Regs>;
+
+defm SUST_B_1D_ARRAY_B8_TRAP
+ : SUST_1D_ARRAY<"sust.b.a1d.b8.trap", Int16Regs>;
+defm SUST_B_1D_ARRAY_B16_TRAP
+ : SUST_1D_ARRAY<"sust.b.a1d.b16.trap", Int16Regs>;
+defm SUST_B_1D_ARRAY_B32_TRAP
+ : SUST_1D_ARRAY<"sust.b.a1d.b32.trap", Int32Regs>;
+defm SUST_B_1D_ARRAY_B64_TRAP
+ : SUST_1D_ARRAY<"sust.b.a1d.b64.trap", Int64Regs>;
+
+defm SUST_B_1D_ARRAY_B8_ZERO
+ : SUST_1D_ARRAY<"sust.b.a1d.b8.zero", Int16Regs>;
+defm SUST_B_1D_ARRAY_B16_ZERO
+ : SUST_1D_ARRAY<"sust.b.a1d.b16.zero", Int16Regs>;
+defm SUST_B_1D_ARRAY_B32_ZERO
+ : SUST_1D_ARRAY<"sust.b.a1d.b32.zero", Int32Regs>;
+defm SUST_B_1D_ARRAY_B64_ZERO
+ : SUST_1D_ARRAY<"sust.b.a1d.b64.zero", Int64Regs>;
+
+defm SUST_P_1D_ARRAY_B8_TRAP
+ : SUST_1D_ARRAY<"sust.p.a1d.b8.trap", Int16Regs>;
+defm SUST_P_1D_ARRAY_B16_TRAP
+ : SUST_1D_ARRAY<"sust.p.a1d.b16.trap", Int16Regs>;
+defm SUST_P_1D_ARRAY_B32_TRAP
+ : SUST_1D_ARRAY<"sust.p.a1d.b32.trap", Int32Regs>;
+
+class SUST_1D_ARRAY_V2_base<string inst, NVPTXRegClass intype, dag surf>
+ : NVPTXInst<(outs),
+ !con(surf, (ins Int32Regs:$idx, Int32Regs:$x,
+ intype:$r, intype:$g)),
+ inst # " \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+multiclass SUST_1D_ARRAY_V2<string inst, NVPTXRegClass intype> {
+ def _R : SUST_1D_ARRAY_V2_base<inst, intype, (ins Int64Regs:$s)>;
+ def _I : SUST_1D_ARRAY_V2_base<inst, intype, (ins i64imm:$s)>;
+}
-def SUST_B_2D_ARRAY_B8_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r),
- "sust.b.a2d.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_ARRAY_B16_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r),
- "sust.b.a2d.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_ARRAY_B32_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int32Regs:$r),
- "sust.b.a2d.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_ARRAY_B64_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int64Regs:$r),
- "sust.b.a2d.b64.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
- []>;
-def SUST_B_2D_ARRAY_V2B8_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r, Int16Regs:$g),
- "sust.b.a2d.v2.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_2D_ARRAY_V2B16_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r, Int16Regs:$g),
- "sust.b.a2d.v2.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_2D_ARRAY_V2B32_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int32Regs:$r, Int32Regs:$g),
- "sust.b.a2d.v2.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_2D_ARRAY_V2B64_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int64Regs:$r, Int64Regs:$g),
- "sust.b.a2d.v2.b64.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_2D_ARRAY_V4B8_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.a2d.v4.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_2D_ARRAY_V4B16_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.a2d.v4.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_2D_ARRAY_V4B32_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- "sust.b.a2d.v4.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
+defm SUST_B_1D_ARRAY_V2B8_CLAMP
+ : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.clamp", Int16Regs>;
+defm SUST_B_1D_ARRAY_V2B16_CLAMP
+ : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.clamp", Int16Regs>;
+defm SUST_B_1D_ARRAY_V2B32_CLAMP
+ : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.clamp", Int32Regs>;
+defm SUST_B_1D_ARRAY_V2B64_CLAMP
+ : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.clamp", Int64Regs>;
+
+defm SUST_B_1D_ARRAY_V2B8_TRAP
+ : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.trap", Int16Regs>;
+defm SUST_B_1D_ARRAY_V2B16_TRAP
+ : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.trap", Int16Regs>;
+defm SUST_B_1D_ARRAY_V2B32_TRAP
+ : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.trap", Int32Regs>;
+defm SUST_B_1D_ARRAY_V2B64_TRAP
+ : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.trap", Int64Regs>;
+
+defm SUST_B_1D_ARRAY_V2B8_ZERO
+ : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.zero", Int16Regs>;
+defm SUST_B_1D_ARRAY_V2B16_ZERO
+ : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.zero", Int16Regs>;
+defm SUST_B_1D_ARRAY_V2B32_ZERO
+ : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.zero", Int32Regs>;
+defm SUST_B_1D_ARRAY_V2B64_ZERO
+ : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.zero", Int64Regs>;
+
+defm SUST_P_1D_ARRAY_V2B8_TRAP
+ : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b8.trap", Int16Regs>;
+defm SUST_P_1D_ARRAY_V2B16_TRAP
+ : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b16.trap", Int16Regs>;
+defm SUST_P_1D_ARRAY_V2B32_TRAP
+ : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b32.trap", Int32Regs>;
+
+class SUST_1D_ARRAY_V4_base<string inst, NVPTXRegClass intype, dag surf>
+ : NVPTXInst<(outs),
+ !con(surf, (ins Int32Regs:$idx, Int32Regs:$x,
+ intype:$r, intype:$g, intype:$b, intype:$a)),
+ inst # " \t[$s, \\{$idx, $x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+multiclass SUST_1D_ARRAY_V4<string inst, NVPTXRegClass intype> {
+ def _R : SUST_1D_ARRAY_V4_base<inst, intype, (ins Int64Regs:$s)>;
+ def _I : SUST_1D_ARRAY_V4_base<inst, intype, (ins i64imm:$s)>;
+}
+defm SUST_B_1D_ARRAY_V4B8_CLAMP
+ : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.clamp", Int16Regs>;
+defm SUST_B_1D_ARRAY_V4B16_CLAMP
+ : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.clamp", Int16Regs>;
+defm SUST_B_1D_ARRAY_V4B32_CLAMP
+ : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.clamp", Int32Regs>;
+
+defm SUST_B_1D_ARRAY_V4B8_TRAP
+ : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.trap", Int16Regs>;
+defm SUST_B_1D_ARRAY_V4B16_TRAP
+ : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.trap", Int16Regs>;
+defm SUST_B_1D_ARRAY_V4B32_TRAP
+ : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.trap", Int32Regs>;
+
+defm SUST_B_1D_ARRAY_V4B8_ZERO
+ : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.zero", Int16Regs>;
+defm SUST_B_1D_ARRAY_V4B16_ZERO
+ : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.zero", Int16Regs>;
+defm SUST_B_1D_ARRAY_V4B32_ZERO
+ : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.zero", Int32Regs>;
+
+defm SUST_P_1D_ARRAY_V4B8_TRAP
+ : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b8.trap", Int16Regs>;
+defm SUST_P_1D_ARRAY_V4B16_TRAP
+ : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b16.trap", Int16Regs>;
+defm SUST_P_1D_ARRAY_V4B32_TRAP
+ : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b32.trap", Int32Regs>;
+
+class SUST_2D_base<string inst, NVPTXRegClass intype, dag surf>
+ : NVPTXInst<(outs),
+ !con(surf, (ins Int32Regs:$x, Int32Regs:$y, intype:$r)),
+ inst # " \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+multiclass SUST_2D<string inst, NVPTXRegClass intype> {
+ def _R : SUST_2D_base<inst, intype, (ins Int64Regs:$s)>;
+ def _I : SUST_2D_base<inst, intype, (ins i64imm:$s)>;
+}
-def SUST_B_3D_B8_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r),
- "sust.b.3d.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
- []>;
-def SUST_B_3D_B16_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r),
- "sust.b.3d.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
- []>;
-def SUST_B_3D_B32_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int32Regs:$r),
- "sust.b.3d.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
- []>;
-def SUST_B_3D_B64_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int64Regs:$r),
- "sust.b.3d.b64.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
- []>;
-def SUST_B_3D_V2B8_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r, Int16Regs:$g),
- "sust.b.3d.v2.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_3D_V2B16_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r, Int16Regs:$g),
- "sust.b.3d.v2.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_3D_V2B32_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int32Regs:$r, Int32Regs:$g),
- "sust.b.3d.v2.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_3D_V2B64_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int64Regs:$r, Int64Regs:$g),
- "sust.b.3d.v2.b64.zero \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_B_3D_V4B8_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.3d.v4.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_3D_V4B16_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.b.3d.v4.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_B_3D_V4B32_ZERO
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- "sust.b.3d.v4.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
+defm SUST_B_2D_B8_CLAMP : SUST_2D<"sust.b.2d.b8.clamp", Int16Regs>;
+defm SUST_B_2D_B16_CLAMP : SUST_2D<"sust.b.2d.b16.clamp", Int16Regs>;
+defm SUST_B_2D_B32_CLAMP : SUST_2D<"sust.b.2d.b32.clamp", Int32Regs>;
+defm SUST_B_2D_B64_CLAMP : SUST_2D<"sust.b.2d.b64.clamp", Int64Regs>;
+
+defm SUST_B_2D_B8_TRAP : SUST_2D<"sust.b.2d.b8.trap", Int16Regs>;
+defm SUST_B_2D_B16_TRAP : SUST_2D<"sust.b.2d.b16.trap", Int16Regs>;
+defm SUST_B_2D_B32_TRAP : SUST_2D<"sust.b.2d.b32.trap", Int32Regs>;
+defm SUST_B_2D_B64_TRAP : SUST_2D<"sust.b.2d.b64.trap", Int64Regs>;
+
+defm SUST_B_2D_B8_ZERO : SUST_2D<"sust.b.2d.b8.zero", Int16Regs>;
+defm SUST_B_2D_B16_ZERO : SUST_2D<"sust.b.2d.b16.zero", Int16Regs>;
+defm SUST_B_2D_B32_ZERO : SUST_2D<"sust.b.2d.b32.zero", Int32Regs>;
+defm SUST_B_2D_B64_ZERO : SUST_2D<"sust.b.2d.b64.zero", Int64Regs>;
+
+defm SUST_P_2D_B8_TRAP : SUST_2D<"sust.p.2d.b8.trap", Int16Regs>;
+defm SUST_P_2D_B16_TRAP : SUST_2D<"sust.p.2d.b16.trap", Int16Regs>;
+defm SUST_P_2D_B32_TRAP : SUST_2D<"sust.p.2d.b32.trap", Int32Regs>;
+
+class SUST_2D_V2_base<string inst, NVPTXRegClass intype, dag surf>
+ : NVPTXInst<(outs),
+ !con(surf, (ins Int32Regs:$x, Int32Regs:$y,
+ intype:$r, intype:$g)),
+ inst # " \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+multiclass SUST_2D_V2<string inst, NVPTXRegClass intype> {
+ def _R : SUST_2D_V2_base<inst, intype, (ins Int64Regs:$s)>;
+ def _I : SUST_2D_V2_base<inst, intype, (ins i64imm:$s)>;
+}
+defm SUST_B_2D_V2B8_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b8.clamp", Int16Regs>;
+defm SUST_B_2D_V2B16_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b16.clamp", Int16Regs>;
+defm SUST_B_2D_V2B32_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b32.clamp", Int32Regs>;
+defm SUST_B_2D_V2B64_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b64.clamp", Int64Regs>;
+
+defm SUST_B_2D_V2B8_TRAP : SUST_2D_V2<"sust.b.2d.v2.b8.trap", Int16Regs>;
+defm SUST_B_2D_V2B16_TRAP : SUST_2D_V2<"sust.b.2d.v2.b16.trap", Int16Regs>;
+defm SUST_B_2D_V2B32_TRAP : SUST_2D_V2<"sust.b.2d.v2.b32.trap", Int32Regs>;
+defm SUST_B_2D_V2B64_TRAP : SUST_2D_V2<"sust.b.2d.v2.b64.trap", Int64Regs>;
+
+defm SUST_B_2D_V2B8_ZERO : SUST_2D_V2<"sust.b.2d.v2.b8.zero", Int16Regs>;
+defm SUST_B_2D_V2B16_ZERO : SUST_2D_V2<"sust.b.2d.v2.b16.zero", Int16Regs>;
+defm SUST_B_2D_V2B32_ZERO : SUST_2D_V2<"sust.b.2d.v2.b32.zero", Int32Regs>;
+defm SUST_B_2D_V2B64_ZERO : SUST_2D_V2<"sust.b.2d.v2.b64.zero", Int64Regs>;
+
+defm SUST_P_2D_V2B8_TRAP : SUST_2D_V2<"sust.p.2d.v2.b8.trap", Int16Regs>;
+defm SUST_P_2D_V2B16_TRAP : SUST_2D_V2<"sust.p.2d.v2.b16.trap", Int16Regs>;
+defm SUST_P_2D_V2B32_TRAP : SUST_2D_V2<"sust.p.2d.v2.b32.trap", Int32Regs>;
+
+class SUST_2D_V4_base<string inst, NVPTXRegClass intype, dag surf>
+ : NVPTXInst<(outs),
+ !con(surf, (ins Int32Regs:$x, Int32Regs:$y,
+ intype:$r, intype:$g, intype:$b, intype:$a)),
+ inst # " \t[$s, \\{$x, $y\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+multiclass SUST_2D_V4<string inst, NVPTXRegClass intype> {
+ def _R : SUST_2D_V4_base<inst, intype, (ins Int64Regs:$s)>;
+ def _I : SUST_2D_V4_base<inst, intype, (ins i64imm:$s)>;
+}
+defm SUST_B_2D_V4B8_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b8.clamp", Int16Regs>;
+defm SUST_B_2D_V4B16_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b16.clamp", Int16Regs>;
+defm SUST_B_2D_V4B32_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b32.clamp", Int32Regs>;
+
+defm SUST_B_2D_V4B8_TRAP : SUST_2D_V4<"sust.b.2d.v4.b8.trap", Int16Regs>;
+defm SUST_B_2D_V4B16_TRAP : SUST_2D_V4<"sust.b.2d.v4.b16.trap", Int16Regs>;
+defm SUST_B_2D_V4B32_TRAP : SUST_2D_V4<"sust.b.2d.v4.b32.trap", Int32Regs>;
+
+defm SUST_B_2D_V4B8_ZERO : SUST_2D_V4<"sust.b.2d.v4.b8.zero", Int16Regs>;
+defm SUST_B_2D_V4B16_ZERO : SUST_2D_V4<"sust.b.2d.v4.b16.zero", Int16Regs>;
+defm SUST_B_2D_V4B32_ZERO : SUST_2D_V4<"sust.b.2d.v4.b32.zero", Int32Regs>;
+
+defm SUST_P_2D_V4B8_TRAP : SUST_2D_V4<"sust.p.2d.v4.b8.trap", Int16Regs>;
+defm SUST_P_2D_V4B16_TRAP : SUST_2D_V4<"sust.p.2d.v4.b16.trap", Int16Regs>;
+defm SUST_P_2D_V4B32_TRAP : SUST_2D_V4<"sust.p.2d.v4.b32.trap", Int32Regs>;
+
+class SUST_2D_ARRAY_base<string inst, NVPTXRegClass intype, dag surf>
+ : NVPTXInst<(outs),
+ !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ intype:$r)),
+ inst # " \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+multiclass SUST_2D_ARRAY<string inst, NVPTXRegClass intype> {
+ def _R : SUST_2D_ARRAY_base<inst, intype, (ins Int64Regs:$s)>;
+ def _I : SUST_2D_ARRAY_base<inst, intype, (ins i64imm:$s)>;
+}
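+// In the a2d and 3d forms the last coordinate appears twice ($y, $y and
+// $z, $z): PTX takes a four-component address vector for these geometries
+// and ignores the final component, so the last coordinate is repeated as
+// padding.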
-// Formatted
+defm SUST_B_2D_ARRAY_B8_CLAMP
+ : SUST_2D_ARRAY<"sust.b.a2d.b8.clamp", Int16Regs>;
+defm SUST_B_2D_ARRAY_B16_CLAMP
+ : SUST_2D_ARRAY<"sust.b.a2d.b16.clamp", Int16Regs>;
+defm SUST_B_2D_ARRAY_B32_CLAMP
+ : SUST_2D_ARRAY<"sust.b.a2d.b32.clamp", Int32Regs>;
+defm SUST_B_2D_ARRAY_B64_CLAMP
+ : SUST_2D_ARRAY<"sust.b.a2d.b64.clamp", Int64Regs>;
+
+defm SUST_B_2D_ARRAY_B8_TRAP
+ : SUST_2D_ARRAY<"sust.b.a2d.b8.trap", Int16Regs>;
+defm SUST_B_2D_ARRAY_B16_TRAP
+ : SUST_2D_ARRAY<"sust.b.a2d.b16.trap", Int16Regs>;
+defm SUST_B_2D_ARRAY_B32_TRAP
+ : SUST_2D_ARRAY<"sust.b.a2d.b32.trap", Int32Regs>;
+defm SUST_B_2D_ARRAY_B64_TRAP
+ : SUST_2D_ARRAY<"sust.b.a2d.b64.trap", Int64Regs>;
+
+defm SUST_B_2D_ARRAY_B8_ZERO
+ : SUST_2D_ARRAY<"sust.b.a2d.b8.zero", Int16Regs>;
+defm SUST_B_2D_ARRAY_B16_ZERO
+ : SUST_2D_ARRAY<"sust.b.a2d.b16.zero", Int16Regs>;
+defm SUST_B_2D_ARRAY_B32_ZERO
+ : SUST_2D_ARRAY<"sust.b.a2d.b32.zero", Int32Regs>;
+defm SUST_B_2D_ARRAY_B64_ZERO
+ : SUST_2D_ARRAY<"sust.b.a2d.b64.zero", Int64Regs>;
+
+defm SUST_P_2D_ARRAY_B8_TRAP
+ : SUST_2D_ARRAY<"sust.p.a2d.b8.trap", Int16Regs>;
+defm SUST_P_2D_ARRAY_B16_TRAP
+ : SUST_2D_ARRAY<"sust.p.a2d.b16.trap", Int16Regs>;
+defm SUST_P_2D_ARRAY_B32_TRAP
+ : SUST_2D_ARRAY<"sust.p.a2d.b32.trap", Int32Regs>;
+
+class SUST_2D_ARRAY_V2_base<string inst, NVPTXRegClass intype, dag surf>
+ : NVPTXInst<(outs),
+ !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ intype:$r, intype:$g)),
+ inst # " \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r, $g\\};",
+ []>;
+multiclass SUST_2D_ARRAY_V2<string inst, NVPTXRegClass intype> {
+ def _R : SUST_2D_ARRAY_V2_base<inst, intype, (ins Int64Regs:$s)>;
+ def _I : SUST_2D_ARRAY_V2_base<inst, intype, (ins i64imm:$s)>;
+}
-def SUST_P_1D_B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
- "sust.p.1d.b8.trap \t[$s, \\{$x\\}], \\{$r\\};",
- []>;
-def SUST_P_1D_B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
- "sust.p.1d.b16.trap \t[$s, \\{$x\\}], \\{$r\\};",
- []>;
-def SUST_P_1D_B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
- "sust.p.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};",
- []>;
-def SUST_P_1D_V2B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- "sust.p.1d.v2.b8.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
- []>;
-def SUST_P_1D_V2B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- "sust.p.1d.v2.b16.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
- []>;
-def SUST_P_1D_V2B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
- "sust.p.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
- []>;
-def SUST_P_1D_V4B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
- Int16Regs:$b, Int16Regs:$a),
- "sust.p.1d.v4.b8.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
- []>;
-def SUST_P_1D_V4B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
- Int16Regs:$b, Int16Regs:$a),
- "sust.p.1d.v4.b16.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
- []>;
-def SUST_P_1D_V4B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
- Int32Regs:$b, Int32Regs:$a),
- "sust.p.1d.v4.b32.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
- []>;
+defm SUST_B_2D_ARRAY_V2B8_CLAMP
+ : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.clamp", Int16Regs>;
+defm SUST_B_2D_ARRAY_V2B16_CLAMP
+ : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.clamp", Int16Regs>;
+defm SUST_B_2D_ARRAY_V2B32_CLAMP
+ : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.clamp", Int32Regs>;
+defm SUST_B_2D_ARRAY_V2B64_CLAMP
+ : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.clamp", Int64Regs>;
+
+defm SUST_B_2D_ARRAY_V2B8_TRAP
+ : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.trap", Int16Regs>;
+defm SUST_B_2D_ARRAY_V2B16_TRAP
+ : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.trap", Int16Regs>;
+defm SUST_B_2D_ARRAY_V2B32_TRAP
+ : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.trap", Int32Regs>;
+defm SUST_B_2D_ARRAY_V2B64_TRAP
+ : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.trap", Int64Regs>;
+
+defm SUST_B_2D_ARRAY_V2B8_ZERO
+ : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.zero", Int16Regs>;
+defm SUST_B_2D_ARRAY_V2B16_ZERO
+ : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.zero", Int16Regs>;
+defm SUST_B_2D_ARRAY_V2B32_ZERO
+ : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.zero", Int32Regs>;
+defm SUST_B_2D_ARRAY_V2B64_ZERO
+ : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.zero", Int64Regs>;
+
+defm SUST_P_2D_ARRAY_V2B8_TRAP
+ : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b8.trap", Int16Regs>;
+defm SUST_P_2D_ARRAY_V2B16_TRAP
+ : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b16.trap", Int16Regs>;
+defm SUST_P_2D_ARRAY_V2B32_TRAP
+ : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b32.trap", Int32Regs>;
+
+class SUST_2D_ARRAY_V4_base<string inst, NVPTXRegClass intype, dag surf>
+ : NVPTXInst<(outs),
+ !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ intype:$r, intype:$g, intype:$b, intype:$a)),
+ inst # " \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+multiclass SUST_2D_ARRAY_V4<string inst, NVPTXRegClass intype> {
+ def _R : SUST_2D_ARRAY_V4_base<inst, intype, (ins Int64Regs:$s)>;
+ def _I : SUST_2D_ARRAY_V4_base<inst, intype, (ins i64imm:$s)>;
+}
+defm SUST_B_2D_ARRAY_V4B8_CLAMP
+ : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.clamp", Int16Regs>;
+defm SUST_B_2D_ARRAY_V4B16_CLAMP
+ : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.clamp", Int16Regs>;
+defm SUST_B_2D_ARRAY_V4B32_CLAMP
+ : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.clamp", Int32Regs>;
+
+defm SUST_B_2D_ARRAY_V4B8_TRAP
+ : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.trap", Int16Regs>;
+defm SUST_B_2D_ARRAY_V4B16_TRAP
+ : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.trap", Int16Regs>;
+defm SUST_B_2D_ARRAY_V4B32_TRAP
+ : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.trap", Int32Regs>;
+
+defm SUST_B_2D_ARRAY_V4B8_ZERO
+ : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.zero", Int16Regs>;
+defm SUST_B_2D_ARRAY_V4B16_ZERO
+ : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.zero", Int16Regs>;
+defm SUST_B_2D_ARRAY_V4B32_ZERO
+ : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.zero", Int32Regs>;
+
+defm SUST_P_2D_ARRAY_V4B8_TRAP
+ : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b8.trap", Int16Regs>;
+defm SUST_P_2D_ARRAY_V4B16_TRAP
+ : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b16.trap", Int16Regs>;
+defm SUST_P_2D_ARRAY_V4B32_TRAP
+ : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b32.trap", Int32Regs>;
+
+class SUST_3D_base<string inst, NVPTXRegClass intype, dag surf>
+ : NVPTXInst<(outs),
+ !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ intype:$r)),
+ inst # " \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+multiclass SUST_3D<string inst, NVPTXRegClass intype> {
+ def _R : SUST_3D_base<inst, intype, (ins Int64Regs:$s)>;
+ def _I : SUST_3D_base<inst, intype, (ins i64imm:$s)>;
+}
-def SUST_P_1D_ARRAY_B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
- "sust.p.a1d.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
- []>;
-def SUST_P_1D_ARRAY_B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
- "sust.p.a1d.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
- []>;
-def SUST_P_1D_ARRAY_B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
- "sust.p.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
- []>;
-def SUST_P_1D_ARRAY_V2B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
- Int16Regs:$g),
- "sust.p.a1d.v2.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
- []>;
-def SUST_P_1D_ARRAY_V2B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
- Int16Regs:$g),
- "sust.p.a1d.v2.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
- []>;
-def SUST_P_1D_ARRAY_V2B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
- Int32Regs:$g),
- "sust.p.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
- []>;
-def SUST_P_1D_ARRAY_V4B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
- Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.p.a1d.v4.b8.trap \t[$s, \\{$idx, $x\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_P_1D_ARRAY_V4B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
- Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.p.a1d.v4.b16.trap \t[$s, \\{$idx, $x\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_P_1D_ARRAY_V4B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
- Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- "sust.p.a1d.v4.b32.trap \t[$s, \\{$idx, $x\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
+defm SUST_B_3D_B8_CLAMP : SUST_3D<"sust.b.3d.b8.clamp", Int16Regs>;
+defm SUST_B_3D_B16_CLAMP : SUST_3D<"sust.b.3d.b16.clamp", Int16Regs>;
+defm SUST_B_3D_B32_CLAMP : SUST_3D<"sust.b.3d.b32.clamp", Int32Regs>;
+defm SUST_B_3D_B64_CLAMP : SUST_3D<"sust.b.3d.b64.clamp", Int64Regs>;
+
+defm SUST_B_3D_B8_TRAP : SUST_3D<"sust.b.3d.b8.trap", Int16Regs>;
+defm SUST_B_3D_B16_TRAP : SUST_3D<"sust.b.3d.b16.trap", Int16Regs>;
+defm SUST_B_3D_B32_TRAP : SUST_3D<"sust.b.3d.b32.trap", Int32Regs>;
+defm SUST_B_3D_B64_TRAP : SUST_3D<"sust.b.3d.b64.trap", Int64Regs>;
+
+defm SUST_B_3D_B8_ZERO : SUST_3D<"sust.b.3d.b8.zero", Int16Regs>;
+defm SUST_B_3D_B16_ZERO : SUST_3D<"sust.b.3d.b16.zero", Int16Regs>;
+defm SUST_B_3D_B32_ZERO : SUST_3D<"sust.b.3d.b32.zero", Int32Regs>;
+defm SUST_B_3D_B64_ZERO : SUST_3D<"sust.b.3d.b64.zero", Int64Regs>;
+
+defm SUST_P_3D_B8_TRAP : SUST_3D<"sust.p.3d.b8.trap", Int16Regs>;
+defm SUST_P_3D_B16_TRAP : SUST_3D<"sust.p.3d.b16.trap", Int16Regs>;
+defm SUST_P_3D_B32_TRAP : SUST_3D<"sust.p.3d.b32.trap", Int32Regs>;
+
+class SUST_3D_V2_base<string inst, NVPTXRegClass intype, dag surf>
+ : NVPTXInst<(outs),
+ !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ intype:$r, intype:$g)),
+ inst # " \t[$s, \\{$x, $y, $z, $z\\}], \\{$r, $g\\};",
+ []>;
+multiclass SUST_3D_V2<string inst, NVPTXRegClass intype> {
+ def _R : SUST_3D_V2_base<inst, intype, (ins Int64Regs:$s)>;
+ def _I : SUST_3D_V2_base<inst, intype, (ins i64imm:$s)>;
+}
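+// The asm string above repeats $z on purpose: PTX 3d surface accesses take a
+// four-component coordinate vector whose last element is ignored, so the
+// third coordinate is simply duplicated as padding.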
+defm SUST_B_3D_V2B8_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b8.clamp", Int16Regs>;
+defm SUST_B_3D_V2B16_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b16.clamp", Int16Regs>;
+defm SUST_B_3D_V2B32_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b32.clamp", Int32Regs>;
+defm SUST_B_3D_V2B64_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b64.clamp", Int64Regs>;
+
+defm SUST_B_3D_V2B8_TRAP : SUST_3D_V2<"sust.b.3d.v2.b8.trap", Int16Regs>;
+defm SUST_B_3D_V2B16_TRAP : SUST_3D_V2<"sust.b.3d.v2.b16.trap", Int16Regs>;
+defm SUST_B_3D_V2B32_TRAP : SUST_3D_V2<"sust.b.3d.v2.b32.trap", Int32Regs>;
+defm SUST_B_3D_V2B64_TRAP : SUST_3D_V2<"sust.b.3d.v2.b64.trap", Int64Regs>;
+
+defm SUST_B_3D_V2B8_ZERO : SUST_3D_V2<"sust.b.3d.v2.b8.zero", Int16Regs>;
+defm SUST_B_3D_V2B16_ZERO : SUST_3D_V2<"sust.b.3d.v2.b16.zero", Int16Regs>;
+defm SUST_B_3D_V2B32_ZERO : SUST_3D_V2<"sust.b.3d.v2.b32.zero", Int32Regs>;
+defm SUST_B_3D_V2B64_ZERO : SUST_3D_V2<"sust.b.3d.v2.b64.zero", Int64Regs>;
+
+defm SUST_P_3D_V2B8_TRAP : SUST_3D_V2<"sust.p.3d.v2.b8.trap", Int16Regs>;
+defm SUST_P_3D_V2B16_TRAP : SUST_3D_V2<"sust.p.3d.v2.b16.trap", Int16Regs>;
+defm SUST_P_3D_V2B32_TRAP : SUST_3D_V2<"sust.p.3d.v2.b32.trap", Int32Regs>;
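+
+// For reference, a sketch of what one of the defms above expands to (the _I
+// record is identical except that the surface operand is i64imm:$s; shown
+// only as illustration, not as a separate definition):
+//
+//   def SUST_B_3D_V2B32_TRAP_R
+//     : NVPTXInst<(outs),
+//                 (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+//                      Int32Regs:$r, Int32Regs:$g),
+//                 "sust.b.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r, $g\\};",
+//                 []>;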
+
+class SUST_3D_V4_base<string inst, NVPTXRegClass intype, dag surf>
+ : NVPTXInst<(outs),
+ !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ intype:$r, intype:$g, intype:$b, intype:$a)),
+ inst # " \t[$s, \\{$x, $y, $z, $z\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+multiclass SUST_3D_V4<string inst, NVPTXRegClass intype> {
+ def _R : SUST_3D_V4_base<inst, intype, (ins Int64Regs:$s)>;
+ def _I : SUST_3D_V4_base<inst, intype, (ins i64imm:$s)>;
+}
-def SUST_P_2D_B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- "sust.p.2d.b8.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
- []>;
-def SUST_P_2D_B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- "sust.p.2d.b16.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
- []>;
-def SUST_P_2D_B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
- "sust.p.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
- []>;
-def SUST_P_2D_V2B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
- Int16Regs:$g),
- "sust.p.2d.v2.b8.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
- []>;
-def SUST_P_2D_V2B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
- Int16Regs:$g),
- "sust.p.2d.v2.b16.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
- []>;
-def SUST_P_2D_V2B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
- Int32Regs:$g),
- "sust.p.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
- []>;
-def SUST_P_2D_V4B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
- Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.p.2d.v4.b8.trap \t[$s, \\{$x, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_P_2D_V4B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
- Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.p.2d.v4.b16.trap \t[$s, \\{$x, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_P_2D_V4B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
- Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- "sust.p.2d.v4.b32.trap \t[$s, \\{$x, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
+defm SUST_B_3D_V4B8_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b8.clamp", Int16Regs>;
+defm SUST_B_3D_V4B16_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b16.clamp", Int16Regs>;
+defm SUST_B_3D_V4B32_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b32.clamp", Int32Regs>;
+defm SUST_B_3D_V4B8_TRAP : SUST_3D_V4<"sust.b.3d.v4.b8.trap", Int16Regs>;
+defm SUST_B_3D_V4B16_TRAP : SUST_3D_V4<"sust.b.3d.v4.b16.trap", Int16Regs>;
+defm SUST_B_3D_V4B32_TRAP : SUST_3D_V4<"sust.b.3d.v4.b32.trap", Int32Regs>;
-def SUST_P_2D_ARRAY_B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r),
- "sust.p.a2d.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
- []>;
-def SUST_P_2D_ARRAY_B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r),
- "sust.p.a2d.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
- []>;
-def SUST_P_2D_ARRAY_B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int32Regs:$r),
- "sust.p.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
- []>;
-def SUST_P_2D_ARRAY_V2B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r, Int16Regs:$g),
- "sust.p.a2d.v2.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_P_2D_ARRAY_V2B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r, Int16Regs:$g),
- "sust.p.a2d.v2.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_P_2D_ARRAY_V2B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int32Regs:$r, Int32Regs:$g),
- "sust.p.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_P_2D_ARRAY_V4B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.p.a2d.v4.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_P_2D_ARRAY_V4B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.p.a2d.v4.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_P_2D_ARRAY_V4B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
- Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- "sust.p.a2d.v4.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
+defm SUST_B_3D_V4B8_ZERO : SUST_3D_V4<"sust.b.3d.v4.b8.zero", Int16Regs>;
+defm SUST_B_3D_V4B16_ZERO : SUST_3D_V4<"sust.b.3d.v4.b16.zero", Int16Regs>;
+defm SUST_B_3D_V4B32_ZERO : SUST_3D_V4<"sust.b.3d.v4.b32.zero", Int32Regs>;
+defm SUST_P_3D_V4B8_TRAP : SUST_3D_V4<"sust.p.3d.v4.b8.trap", Int16Regs>;
+defm SUST_P_3D_V4B16_TRAP : SUST_3D_V4<"sust.p.3d.v4.b16.trap", Int16Regs>;
+defm SUST_P_3D_V4B32_TRAP : SUST_3D_V4<"sust.p.3d.v4.b32.trap", Int32Regs>;
-def SUST_P_3D_B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r),
- "sust.p.3d.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
- []>;
-def SUST_P_3D_B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r),
- "sust.p.3d.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
- []>;
-def SUST_P_3D_B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int32Regs:$r),
- "sust.p.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
- []>;
-def SUST_P_3D_V2B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r, Int16Regs:$g),
- "sust.p.3d.v2.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_P_3D_V2B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r, Int16Regs:$g),
- "sust.p.3d.v2.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_P_3D_V2B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int32Regs:$r, Int32Regs:$g),
- "sust.p.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g\\};",
- []>;
-def SUST_P_3D_V4B8_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.p.3d.v4.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_P_3D_V4B16_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- "sust.p.3d.v4.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
-def SUST_P_3D_V4B32_TRAP
- : NVPTXInst<(outs),
- (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
- Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- "sust.p.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
- "\\{$r, $g, $b, $a\\};",
- []>;
}
// Surface store instruction patterns
@@ -6217,248 +4725,248 @@ def SUST_P_3D_V4B32_TRAP
// .clamp variant
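// These patterns bind the NVVM surface-store intrinsics to the
// register-handle (_R) instruction variants defined above.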
def : Pat<(int_nvvm_sust_b_1d_i8_clamp
Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
- (SUST_B_1D_B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+ (SUST_B_1D_B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_i16_clamp
Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
- (SUST_B_1D_B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+ (SUST_B_1D_B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_i32_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
- (SUST_B_1D_B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+ (SUST_B_1D_B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_i64_clamp
Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
- (SUST_B_1D_B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+ (SUST_B_1D_B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_v2i8_clamp
Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- (SUST_B_1D_V2B8_CLAMP Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_v2i16_clamp
Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- (SUST_B_1D_V2B16_CLAMP Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_v2i32_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
- (SUST_B_1D_V2B32_CLAMP Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_v2i64_clamp
Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
- (SUST_B_1D_V2B64_CLAMP Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$x,
Int64Regs:$r, Int64Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_v4i8_clamp
Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_1D_V4B8_CLAMP Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V4B8_CLAMP_R Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_1d_v4i16_clamp
Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_1D_V4B16_CLAMP Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V4B16_CLAMP_R Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_1d_v4i32_clamp
Int64Regs:$s, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_B_1D_V4B32_CLAMP Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
def : Pat<(int_nvvm_sust_b_1d_array_i8_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
- (SUST_B_1D_ARRAY_B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_array_i16_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
- (SUST_B_1D_ARRAY_B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_array_i32_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
- (SUST_B_1D_ARRAY_B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_array_i64_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
- (SUST_B_1D_ARRAY_B64_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_B64_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int64Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_array_v2i8_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- (SUST_B_1D_ARRAY_V2B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_array_v2i16_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- (SUST_B_1D_ARRAY_V2B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_array_v2i32_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
- (SUST_B_1D_ARRAY_V2B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_array_v2i64_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
- (SUST_B_1D_ARRAY_V2B64_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int64Regs:$r, Int64Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_array_v4i8_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_1D_ARRAY_V4B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V4B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_1d_array_v4i16_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_1D_ARRAY_V4B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V4B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_1d_array_v4i32_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_B_1D_ARRAY_V4B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
def : Pat<(int_nvvm_sust_b_2d_i8_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- (SUST_B_2D_B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_i16_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- (SUST_B_2D_B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_i32_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
- (SUST_B_2D_B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_i64_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
- (SUST_B_2D_B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int64Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_v2i8_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
- (SUST_B_2D_V2B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_v2i16_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
- (SUST_B_2D_V2B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_v2i32_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
- (SUST_B_2D_V2B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_v2i64_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
- (SUST_B_2D_V2B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int64Regs:$r, Int64Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_v4i8_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_2D_V4B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V4B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_2d_v4i16_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_2D_V4B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V4B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_2d_v4i32_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_B_2D_V4B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
def : Pat<(int_nvvm_sust_b_2d_array_i8_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- (SUST_B_2D_ARRAY_B8_CLAMP Int64Regs:$s,
+ (SUST_B_2D_ARRAY_B8_CLAMP_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_array_i16_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- (SUST_B_2D_ARRAY_B16_CLAMP Int64Regs:$s,
+ (SUST_B_2D_ARRAY_B16_CLAMP_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_array_i32_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
- (SUST_B_2D_ARRAY_B32_CLAMP Int64Regs:$s,
+ (SUST_B_2D_ARRAY_B32_CLAMP_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_array_i64_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
- (SUST_B_2D_ARRAY_B64_CLAMP Int64Regs:$s,
+ (SUST_B_2D_ARRAY_B64_CLAMP_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int64Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_array_v2i8_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g),
- (SUST_B_2D_ARRAY_V2B8_CLAMP Int64Regs:$s, Int32Regs:$l,
+ (SUST_B_2D_ARRAY_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_array_v2i16_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g),
- (SUST_B_2D_ARRAY_V2B16_CLAMP Int64Regs:$s, Int32Regs:$l,
+ (SUST_B_2D_ARRAY_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_array_v2i32_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
Int32Regs:$g),
- (SUST_B_2D_ARRAY_V2B32_CLAMP Int64Regs:$s, Int32Regs:$l,
+ (SUST_B_2D_ARRAY_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_array_v2i64_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
Int64Regs:$g),
- (SUST_B_2D_ARRAY_V2B64_CLAMP Int64Regs:$s, Int32Regs:$l,
+ (SUST_B_2D_ARRAY_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_array_v4i8_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_2D_ARRAY_V4B8_CLAMP Int64Regs:$s,
+ (SUST_B_2D_ARRAY_V4B8_CLAMP_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_2d_array_v4i16_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_2D_ARRAY_V4B16_CLAMP Int64Regs:$s,
+ (SUST_B_2D_ARRAY_V4B16_CLAMP_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_2d_array_v4i32_clamp
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_B_2D_ARRAY_V4B32_CLAMP Int64Regs:$s, Int32Regs:$l,
+ (SUST_B_2D_ARRAY_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
@@ -6467,77 +4975,77 @@ def : Pat<(int_nvvm_sust_b_2d_array_v4i32_clamp
def : Pat<(int_nvvm_sust_b_3d_i8_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r),
- (SUST_B_3D_B8_CLAMP Int64Regs:$s,
+ (SUST_B_3D_B8_CLAMP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_3d_i16_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r),
- (SUST_B_3D_B16_CLAMP Int64Regs:$s,
+ (SUST_B_3D_B16_CLAMP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_3d_i32_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r),
- (SUST_B_3D_B32_CLAMP Int64Regs:$s,
+ (SUST_B_3D_B32_CLAMP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_b_3d_i64_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int64Regs:$r),
- (SUST_B_3D_B64_CLAMP Int64Regs:$s,
+ (SUST_B_3D_B64_CLAMP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int64Regs:$r)>;
def : Pat<(int_nvvm_sust_b_3d_v2i8_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g),
- (SUST_B_3D_V2B8_CLAMP Int64Regs:$s,
+ (SUST_B_3D_V2B8_CLAMP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_3d_v2i16_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g),
- (SUST_B_3D_V2B16_CLAMP Int64Regs:$s,
+ (SUST_B_3D_V2B16_CLAMP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_3d_v2i32_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r, Int32Regs:$g),
- (SUST_B_3D_V2B32_CLAMP Int64Regs:$s,
+ (SUST_B_3D_V2B32_CLAMP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_b_3d_v2i64_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int64Regs:$r, Int64Regs:$g),
- (SUST_B_3D_V2B64_CLAMP Int64Regs:$s,
+ (SUST_B_3D_V2B64_CLAMP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int64Regs:$r, Int64Regs:$g)>;
def : Pat<(int_nvvm_sust_b_3d_v4i8_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_3D_V4B8_CLAMP Int64Regs:$s,
+ (SUST_B_3D_V4B8_CLAMP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_3d_v4i16_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_3D_V4B16_CLAMP Int64Regs:$s,
+ (SUST_B_3D_V4B16_CLAMP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_3d_v4i32_clamp
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_B_3D_V4B32_CLAMP Int64Regs:$s,
+ (SUST_B_3D_V4B32_CLAMP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
@@ -6545,248 +5053,248 @@ def : Pat<(int_nvvm_sust_b_3d_v4i32_clamp
// .trap variant
def : Pat<(int_nvvm_sust_b_1d_i8_trap
Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
- (SUST_B_1D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+ (SUST_B_1D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_i16_trap
Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
- (SUST_B_1D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+ (SUST_B_1D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_i32_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
- (SUST_B_1D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+ (SUST_B_1D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_i64_trap
Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
- (SUST_B_1D_B64_TRAP Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+ (SUST_B_1D_B64_TRAP_R Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_v2i8_trap
Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- (SUST_B_1D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_v2i16_trap
Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- (SUST_B_1D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_v2i32_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
- (SUST_B_1D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_v2i64_trap
Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
- (SUST_B_1D_V2B64_TRAP Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$x,
Int64Regs:$r, Int64Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_v4i8_trap
Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_1D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_1d_v4i16_trap
Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_1D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_1d_v4i32_trap
Int64Regs:$s, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_B_1D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
def : Pat<(int_nvvm_sust_b_1d_array_i8_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
- (SUST_B_1D_ARRAY_B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_array_i16_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
- (SUST_B_1D_ARRAY_B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_array_i32_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
- (SUST_B_1D_ARRAY_B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_array_i64_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
- (SUST_B_1D_ARRAY_B64_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_B64_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int64Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_array_v2i8_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- (SUST_B_1D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_array_v2i16_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- (SUST_B_1D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_array_v2i32_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
- (SUST_B_1D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_array_v2i64_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
- (SUST_B_1D_ARRAY_V2B64_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int64Regs:$r, Int64Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_array_v4i8_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_1D_ARRAY_V4B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_1d_array_v4i16_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_1D_ARRAY_V4B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_1d_array_v4i32_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_B_1D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
def : Pat<(int_nvvm_sust_b_2d_i8_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- (SUST_B_2D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_i16_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- (SUST_B_2D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_i32_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
- (SUST_B_2D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_i64_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
- (SUST_B_2D_B64_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_B64_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int64Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_v2i8_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
- (SUST_B_2D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_v2i16_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
- (SUST_B_2D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_v2i32_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
- (SUST_B_2D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_v2i64_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
- (SUST_B_2D_V2B64_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int64Regs:$r, Int64Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_v4i8_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_2D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_2d_v4i16_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_2D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_2d_v4i32_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_B_2D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
def : Pat<(int_nvvm_sust_b_2d_array_i8_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- (SUST_B_2D_ARRAY_B8_TRAP Int64Regs:$s,
+ (SUST_B_2D_ARRAY_B8_TRAP_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_array_i16_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- (SUST_B_2D_ARRAY_B16_TRAP Int64Regs:$s,
+ (SUST_B_2D_ARRAY_B16_TRAP_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_array_i32_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
- (SUST_B_2D_ARRAY_B32_TRAP Int64Regs:$s,
+ (SUST_B_2D_ARRAY_B32_TRAP_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_array_i64_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
- (SUST_B_2D_ARRAY_B64_TRAP Int64Regs:$s,
+ (SUST_B_2D_ARRAY_B64_TRAP_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int64Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_array_v2i8_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g),
- (SUST_B_2D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l,
+ (SUST_B_2D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_array_v2i16_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g),
- (SUST_B_2D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l,
+ (SUST_B_2D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_array_v2i32_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
Int32Regs:$g),
- (SUST_B_2D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l,
+ (SUST_B_2D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_array_v2i64_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
Int64Regs:$g),
- (SUST_B_2D_ARRAY_V2B64_TRAP Int64Regs:$s, Int32Regs:$l,
+ (SUST_B_2D_ARRAY_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_array_v4i8_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_2D_ARRAY_V4B8_TRAP Int64Regs:$s,
+ (SUST_B_2D_ARRAY_V4B8_TRAP_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_2d_array_v4i16_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_2D_ARRAY_V4B16_TRAP Int64Regs:$s,
+ (SUST_B_2D_ARRAY_V4B16_TRAP_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_2d_array_v4i32_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_B_2D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l,
+ (SUST_B_2D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
@@ -6795,77 +5303,77 @@ def : Pat<(int_nvvm_sust_b_2d_array_v4i32_trap
def : Pat<(int_nvvm_sust_b_3d_i8_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r),
- (SUST_B_3D_B8_TRAP Int64Regs:$s,
+ (SUST_B_3D_B8_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_3d_i16_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r),
- (SUST_B_3D_B16_TRAP Int64Regs:$s,
+ (SUST_B_3D_B16_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_3d_i32_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r),
- (SUST_B_3D_B32_TRAP Int64Regs:$s,
+ (SUST_B_3D_B32_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_b_3d_i64_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int64Regs:$r),
- (SUST_B_3D_B64_TRAP Int64Regs:$s,
+ (SUST_B_3D_B64_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int64Regs:$r)>;
def : Pat<(int_nvvm_sust_b_3d_v2i8_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g),
- (SUST_B_3D_V2B8_TRAP Int64Regs:$s,
+ (SUST_B_3D_V2B8_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_3d_v2i16_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g),
- (SUST_B_3D_V2B16_TRAP Int64Regs:$s,
+ (SUST_B_3D_V2B16_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_3d_v2i32_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r, Int32Regs:$g),
- (SUST_B_3D_V2B32_TRAP Int64Regs:$s,
+ (SUST_B_3D_V2B32_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_b_3d_v2i64_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int64Regs:$r, Int64Regs:$g),
- (SUST_B_3D_V2B64_TRAP Int64Regs:$s,
+ (SUST_B_3D_V2B64_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int64Regs:$r, Int64Regs:$g)>;
def : Pat<(int_nvvm_sust_b_3d_v4i8_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_3D_V4B8_TRAP Int64Regs:$s,
+ (SUST_B_3D_V4B8_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_3d_v4i16_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_3D_V4B16_TRAP Int64Regs:$s,
+ (SUST_B_3D_V4B16_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_3d_v4i32_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_B_3D_V4B32_TRAP Int64Regs:$s,
+ (SUST_B_3D_V4B32_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
@@ -6873,248 +5381,248 @@ def : Pat<(int_nvvm_sust_b_3d_v4i32_trap
// .zero variant
def : Pat<(int_nvvm_sust_b_1d_i8_zero
Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
- (SUST_B_1D_B8_ZERO Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+ (SUST_B_1D_B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_i16_zero
Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
- (SUST_B_1D_B16_ZERO Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+ (SUST_B_1D_B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_i32_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
- (SUST_B_1D_B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+ (SUST_B_1D_B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_i64_zero
Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
- (SUST_B_1D_B64_ZERO Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+ (SUST_B_1D_B64_ZERO_R Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_v2i8_zero
Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- (SUST_B_1D_V2B8_ZERO Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_v2i16_zero
Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- (SUST_B_1D_V2B16_ZERO Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_v2i32_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
- (SUST_B_1D_V2B32_ZERO Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_v2i64_zero
Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
- (SUST_B_1D_V2B64_ZERO Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$x,
Int64Regs:$r, Int64Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_v4i8_zero
Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_1D_V4B8_ZERO Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V4B8_ZERO_R Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_1d_v4i16_zero
Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_1D_V4B16_ZERO Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V4B16_ZERO_R Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_1d_v4i32_zero
Int64Regs:$s, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_B_1D_V4B32_ZERO Int64Regs:$s, Int32Regs:$x,
+ (SUST_B_1D_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
def : Pat<(int_nvvm_sust_b_1d_array_i8_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
- (SUST_B_1D_ARRAY_B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_B8_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_array_i16_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
- (SUST_B_1D_ARRAY_B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_B16_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_array_i32_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
- (SUST_B_1D_ARRAY_B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_B32_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_array_i64_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
- (SUST_B_1D_ARRAY_B64_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_B64_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int64Regs:$r)>;
def : Pat<(int_nvvm_sust_b_1d_array_v2i8_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- (SUST_B_1D_ARRAY_V2B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_array_v2i16_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- (SUST_B_1D_ARRAY_V2B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_array_v2i32_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
- (SUST_B_1D_ARRAY_V2B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_array_v2i64_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
- (SUST_B_1D_ARRAY_V2B64_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int64Regs:$r, Int64Regs:$g)>;
def : Pat<(int_nvvm_sust_b_1d_array_v4i8_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_1D_ARRAY_V4B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V4B8_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_1d_array_v4i16_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_1D_ARRAY_V4B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V4B16_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_1d_array_v4i32_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_B_1D_ARRAY_V4B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_B_1D_ARRAY_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
def : Pat<(int_nvvm_sust_b_2d_i8_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- (SUST_B_2D_B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_i16_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- (SUST_B_2D_B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_i32_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
- (SUST_B_2D_B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_i64_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
- (SUST_B_2D_B64_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_B64_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int64Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_v2i8_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
- (SUST_B_2D_V2B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_v2i16_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
- (SUST_B_2D_V2B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_v2i32_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
- (SUST_B_2D_V2B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_v2i64_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
- (SUST_B_2D_V2B64_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int64Regs:$r, Int64Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_v4i8_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_2D_V4B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V4B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_2d_v4i16_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_2D_V4B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V4B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_2d_v4i32_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_B_2D_V4B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_B_2D_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
def : Pat<(int_nvvm_sust_b_2d_array_i8_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- (SUST_B_2D_ARRAY_B8_ZERO Int64Regs:$s,
+ (SUST_B_2D_ARRAY_B8_ZERO_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_array_i16_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- (SUST_B_2D_ARRAY_B16_ZERO Int64Regs:$s,
+ (SUST_B_2D_ARRAY_B16_ZERO_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_array_i32_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
- (SUST_B_2D_ARRAY_B32_ZERO Int64Regs:$s,
+ (SUST_B_2D_ARRAY_B32_ZERO_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_array_i64_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
- (SUST_B_2D_ARRAY_B64_ZERO Int64Regs:$s,
+ (SUST_B_2D_ARRAY_B64_ZERO_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int64Regs:$r)>;
def : Pat<(int_nvvm_sust_b_2d_array_v2i8_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g),
- (SUST_B_2D_ARRAY_V2B8_ZERO Int64Regs:$s, Int32Regs:$l,
+ (SUST_B_2D_ARRAY_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_array_v2i16_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g),
- (SUST_B_2D_ARRAY_V2B16_ZERO Int64Regs:$s, Int32Regs:$l,
+ (SUST_B_2D_ARRAY_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_array_v2i32_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
Int32Regs:$g),
- (SUST_B_2D_ARRAY_V2B32_ZERO Int64Regs:$s, Int32Regs:$l,
+ (SUST_B_2D_ARRAY_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_array_v2i64_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
Int64Regs:$g),
- (SUST_B_2D_ARRAY_V2B64_ZERO Int64Regs:$s, Int32Regs:$l,
+ (SUST_B_2D_ARRAY_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
def : Pat<(int_nvvm_sust_b_2d_array_v4i8_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_2D_ARRAY_V4B8_ZERO Int64Regs:$s,
+ (SUST_B_2D_ARRAY_V4B8_ZERO_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_2d_array_v4i16_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_2D_ARRAY_V4B16_ZERO Int64Regs:$s,
+ (SUST_B_2D_ARRAY_V4B16_ZERO_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_2d_array_v4i32_zero
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_B_2D_ARRAY_V4B32_ZERO Int64Regs:$s, Int32Regs:$l,
+ (SUST_B_2D_ARRAY_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
@@ -7123,77 +5631,77 @@ def : Pat<(int_nvvm_sust_b_2d_array_v4i32_zero
def : Pat<(int_nvvm_sust_b_3d_i8_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r),
- (SUST_B_3D_B8_ZERO Int64Regs:$s,
+ (SUST_B_3D_B8_ZERO_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_3d_i16_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r),
- (SUST_B_3D_B16_ZERO Int64Regs:$s,
+ (SUST_B_3D_B16_ZERO_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_b_3d_i32_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r),
- (SUST_B_3D_B32_ZERO Int64Regs:$s,
+ (SUST_B_3D_B32_ZERO_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_b_3d_i64_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int64Regs:$r),
- (SUST_B_3D_B64_ZERO Int64Regs:$s,
+ (SUST_B_3D_B64_ZERO_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int64Regs:$r)>;
def : Pat<(int_nvvm_sust_b_3d_v2i8_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g),
- (SUST_B_3D_V2B8_ZERO Int64Regs:$s,
+ (SUST_B_3D_V2B8_ZERO_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_3d_v2i16_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g),
- (SUST_B_3D_V2B16_ZERO Int64Regs:$s,
+ (SUST_B_3D_V2B16_ZERO_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_b_3d_v2i32_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r, Int32Regs:$g),
- (SUST_B_3D_V2B32_ZERO Int64Regs:$s,
+ (SUST_B_3D_V2B32_ZERO_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_b_3d_v2i64_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int64Regs:$r, Int64Regs:$g),
- (SUST_B_3D_V2B64_ZERO Int64Regs:$s,
+ (SUST_B_3D_V2B64_ZERO_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int64Regs:$r, Int64Regs:$g)>;
def : Pat<(int_nvvm_sust_b_3d_v4i8_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_3D_V4B8_ZERO Int64Regs:$s,
+ (SUST_B_3D_V4B8_ZERO_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_3d_v4i16_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_B_3D_V4B16_ZERO Int64Regs:$s,
+ (SUST_B_3D_V4B16_ZERO_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_b_3d_v4i32_zero
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_B_3D_V4B32_ZERO Int64Regs:$s,
+ (SUST_B_3D_V4B32_ZERO_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
@@ -7202,207 +5710,207 @@ def : Pat<(int_nvvm_sust_b_3d_v4i32_zero
def : Pat<(int_nvvm_sust_p_1d_i8_trap
Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
- (SUST_P_1D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+ (SUST_P_1D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_p_1d_i16_trap
Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
- (SUST_P_1D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+ (SUST_P_1D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_p_1d_i32_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
- (SUST_P_1D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+ (SUST_P_1D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_p_1d_v2i8_trap
Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- (SUST_P_1D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x,
+ (SUST_P_1D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_p_1d_v2i16_trap
Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- (SUST_P_1D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x,
+ (SUST_P_1D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_p_1d_v2i32_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
- (SUST_P_1D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x,
+ (SUST_P_1D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_p_1d_v4i8_trap
Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_P_1D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x,
+ (SUST_P_1D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_p_1d_v4i16_trap
Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_P_1D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x,
+ (SUST_P_1D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_p_1d_v4i32_trap
Int64Regs:$s, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_P_1D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x,
+ (SUST_P_1D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
def : Pat<(int_nvvm_sust_p_1d_array_i8_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
- (SUST_P_1D_ARRAY_B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_P_1D_ARRAY_B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_p_1d_array_i16_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
- (SUST_P_1D_ARRAY_B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_P_1D_ARRAY_B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_p_1d_array_i32_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
- (SUST_P_1D_ARRAY_B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_P_1D_ARRAY_B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_p_1d_array_v2i8_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- (SUST_P_1D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_P_1D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_p_1d_array_v2i16_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
- (SUST_P_1D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_P_1D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_p_1d_array_v2i32_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
- (SUST_P_1D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_P_1D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_p_1d_array_v4i8_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_P_1D_ARRAY_V4B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_P_1D_ARRAY_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_p_1d_array_v4i16_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_P_1D_ARRAY_V4B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_P_1D_ARRAY_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_p_1d_array_v4i32_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_P_1D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ (SUST_P_1D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
def : Pat<(int_nvvm_sust_p_2d_i8_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- (SUST_P_2D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_P_2D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_p_2d_i16_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- (SUST_P_2D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_P_2D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_p_2d_i32_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
- (SUST_P_2D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_P_2D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_p_2d_v2i8_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
- (SUST_P_2D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_P_2D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_p_2d_v2i16_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
- (SUST_P_2D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_P_2D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_p_2d_v2i32_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
- (SUST_P_2D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_P_2D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_p_2d_v4i8_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_P_2D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_P_2D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_p_2d_v4i16_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_P_2D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_P_2D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_p_2d_v4i32_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_P_2D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ (SUST_P_2D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
def : Pat<(int_nvvm_sust_p_2d_array_i8_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- (SUST_P_2D_ARRAY_B8_TRAP Int64Regs:$s,
+ (SUST_P_2D_ARRAY_B8_TRAP_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_p_2d_array_i16_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
- (SUST_P_2D_ARRAY_B16_TRAP Int64Regs:$s,
+ (SUST_P_2D_ARRAY_B16_TRAP_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_p_2d_array_i32_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
- (SUST_P_2D_ARRAY_B32_TRAP Int64Regs:$s,
+ (SUST_P_2D_ARRAY_B32_TRAP_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_p_2d_array_v2i8_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g),
- (SUST_P_2D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l,
+ (SUST_P_2D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_p_2d_array_v2i16_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g),
- (SUST_P_2D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l,
+ (SUST_P_2D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_p_2d_array_v2i32_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
Int32Regs:$g),
- (SUST_P_2D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l,
+ (SUST_P_2D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_p_2d_array_v4i8_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_P_2D_ARRAY_V4B8_TRAP Int64Regs:$s,
+ (SUST_P_2D_ARRAY_V4B8_TRAP_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_p_2d_array_v4i16_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_P_2D_ARRAY_V4B16_TRAP Int64Regs:$s,
+ (SUST_P_2D_ARRAY_V4B16_TRAP_R Int64Regs:$s,
Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_p_2d_array_v4i32_trap
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_P_2D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l,
+ (SUST_P_2D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l,
Int32Regs:$x, Int32Regs:$y,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
@@ -7411,63 +5919,63 @@ def : Pat<(int_nvvm_sust_p_2d_array_v4i32_trap
def : Pat<(int_nvvm_sust_p_3d_i8_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r),
- (SUST_P_3D_B8_TRAP Int64Regs:$s,
+ (SUST_P_3D_B8_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_p_3d_i16_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r),
- (SUST_P_3D_B16_TRAP Int64Regs:$s,
+ (SUST_P_3D_B16_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r)>;
def : Pat<(int_nvvm_sust_p_3d_i32_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r),
- (SUST_P_3D_B32_TRAP Int64Regs:$s,
+ (SUST_P_3D_B32_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r)>;
def : Pat<(int_nvvm_sust_p_3d_v2i8_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g),
- (SUST_P_3D_V2B8_TRAP Int64Regs:$s,
+ (SUST_P_3D_V2B8_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_p_3d_v2i16_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g),
- (SUST_P_3D_V2B16_TRAP Int64Regs:$s,
+ (SUST_P_3D_V2B16_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g)>;
def : Pat<(int_nvvm_sust_p_3d_v2i32_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r, Int32Regs:$g),
- (SUST_P_3D_V2B32_TRAP Int64Regs:$s,
+ (SUST_P_3D_V2B32_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r, Int32Regs:$g)>;
def : Pat<(int_nvvm_sust_p_3d_v4i8_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_P_3D_V4B8_TRAP Int64Regs:$s,
+ (SUST_P_3D_V4B8_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_p_3d_v4i16_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
- (SUST_P_3D_V4B16_TRAP Int64Regs:$s,
+ (SUST_P_3D_V4B16_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
def : Pat<(int_nvvm_sust_p_3d_v4i32_trap
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
- (SUST_P_3D_V4B32_TRAP Int64Regs:$s,
+ (SUST_P_3D_V4B32_TRAP_R Int64Regs:$s,
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
@@ -7578,6 +6086,7 @@ class WMMA_REGINFO<WMMA_REGS r, string op>
!eq(ptx_elt_type, "bf16") : Int32Regs,
!eq(ptx_elt_type, "tf32") : Int32Regs,
!eq(ptx_elt_type, "s32") : Int32Regs,
+ !eq(ptx_elt_type, "b16") : Int32Regs,
!eq(ptx_elt_type, "s8") : Int32Regs,
!eq(ptx_elt_type, "u8") : Int32Regs,
!eq(ptx_elt_type, "s4") : Int32Regs,
@@ -7661,7 +6170,11 @@ class WMMA_REGINFO<WMMA_REGS r, string op>
!eq(geom, "m16n8k64"),
!eq(geom, "m8n8k128"),
!eq(geom, "m16n8k128"),
- !eq(geom, "m16n8k256"))) : [hasSM80, hasPTX70]);
+ !eq(geom, "m16n8k256"))) : [hasSM80, hasPTX70],
+
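+ // Assumed reading of the predicate below: ldmatrix with b16 elements and
+ // m8n8 geometry requires sm_75 and PTX ISA 6.5 (hasSM75, hasPTX65).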
+ !and(!eq(op,"ldmatrix"),
+ !eq(ptx_elt_type,"b16"),
+ !eq(geom, "m8n8")) : [hasSM75, hasPTX65]);
// template DAGs for instruction inputs/output.
dag Outs = !dag(outs, ptx_regs, reg_names);
@@ -7910,6 +6423,44 @@ defset list<WMMA_INSTR> MMAs = {
} // layout_a
} // defset
+//
+// ldmatrix.sync.aligned.m8n8[|.trans][|.shared].b16
+//
+class LDMATRIX<WMMA_REGINFO Frag, bit Transposed, string Space,
+ DAGOperand SrcOp>
+ : WMMA_INSTR<LDMATRIX_NAME<Frag, Transposed>.record, [(ins SrcOp:$src)]>,
+ Requires<Frag.Predicates> {
+ // Build PatFrag that only matches particular address space.
+ PatFrag IntrFrag = PatFrag<(ops node:$src), (Intr node:$src),
+ !cond(!eq(Space, ".shared"): AS_match.shared,
+ true: AS_match.generic)>;
+ // Build AS-constrained pattern.
+ let IntrinsicPattern = BuildPatternPF<IntrFrag, Args>.ret;
+
+ let OutOperandList = Frag.Outs;
+ let InOperandList = !con(Args, (ins MmaCode:$ptx));
+ let AsmString = "ldmatrix.sync.aligned."
+ # Frag.geom
+ # "." # Frag.frag
+ # !if(Transposed, ".trans", "")
+ # Space
+ # "." # Frag.ptx_elt_type
+ # " " # Frag.regstring # ", [$src];";
+}
+
+// Create all ldmatrix variants
+defset list<WMMA_INSTR> LDMATRIXs = {
+ foreach transposed = [false, true] in {
+ foreach space = [".shared", ""] in {
+ foreach addr = [imem, Int32Regs, Int64Regs, MEMri, MEMri64] in {
+ foreach frag = NVVM_MMA_OPS.all_ldmatrix_ops in
+ if NVVM_LDMATRIX_SUPPORTED<frag>.ret then
+ def : LDMATRIX<WMMA_REGINFO<frag, "ldmatrix">, transposed, space,
+ addr>;
+ } // addr
+ } // space
+ } // transposed
+} // defset
// Constructing non-flat DAGs is still a pain. I can't !subst a dag node with a
// dag, so the ptx.version must be appended *after* foreach replaces 'ins' with
@@ -7921,5 +6472,5 @@ class MMA_PAT<WMMA_INSTR wi>
Requires<wi.Predicates>;
// Build intrinsic->instruction patterns for all MMA instructions.
-foreach mma = !listconcat(MMAs, WMMAs, MMA_LDSTs) in
+foreach mma = !listconcat(MMAs, WMMAs, MMA_LDSTs, LDMATRIXs) in
def : MMA_PAT<mma>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
index 1bd02552b666..369238436083 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
@@ -78,15 +78,12 @@ bool NVPTXLowerAlloca::runOnFunction(Function &F) {
new AddrSpaceCastInst(NewASCToLocal, GenericAddrTy, "");
NewASCToLocal->insertAfter(allocaInst);
NewASCToGeneric->insertAfter(NewASCToLocal);
- for (Value::use_iterator UI = allocaInst->use_begin(),
- UE = allocaInst->use_end();
- UI != UE;) {
+ for (Use &AllocaUse : llvm::make_early_inc_range(allocaInst->uses())) {
// Check Load, Store, GEP, and BitCast Uses on alloca and make them
// use the converted generic address, in order to expose non-generic
// addrspacecast to NVPTXInferAddressSpaces. For other types
// of instructions this is unnecessary and may introduce redundant
// address cast.
- const auto &AllocaUse = *UI++;
auto LI = dyn_cast<LoadInst>(AllocaUse.getUser());
if (LI && LI->getPointerOperand() == allocaInst &&
!LI->isVolatile()) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp b/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp
index 5e6411c61eab..1f3b4c9440d8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp
@@ -21,17 +21,19 @@
// This peephole pass optimizes these cases, for example
//
// It will transform the following pattern
-// %0 = LEA_ADDRi64 %VRFrame, 4
+// %0 = LEA_ADDRi64 %VRFrame64, 4
// %1 = cvta_to_local_yes_64 %0
//
// into
-// %1 = LEA_ADDRi64 %VRFrameLocal, 4
+// %1 = LEA_ADDRi64 %VRFrameLocal64, 4
//
-// %VRFrameLocal is the virtual register name of %SPL
+// %VRFrameLocal64 is the virtual register name of %SPL
//
//===----------------------------------------------------------------------===//
#include "NVPTX.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXSubtarget.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -92,9 +94,12 @@ static bool isCVTAToLocalCombinationCandidate(MachineInstr &Root) {
return false;
}
+ const NVPTXRegisterInfo *NRI =
+ MF.getSubtarget<NVPTXSubtarget>().getRegisterInfo();
+
// Check the LEA_ADDRi operand is Frame index
auto &BaseAddrOp = GenericAddrDef->getOperand(1);
- if (BaseAddrOp.isReg() && BaseAddrOp.getReg() == NVPTX::VRFrame) {
+ if (BaseAddrOp.isReg() && BaseAddrOp.getReg() == NRI->getFrameRegister(MF)) {
return true;
}
@@ -108,10 +113,13 @@ static void CombineCVTAToLocal(MachineInstr &Root) {
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
auto &Prev = *MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+ const NVPTXRegisterInfo *NRI =
+ MF.getSubtarget<NVPTXSubtarget>().getRegisterInfo();
+
MachineInstrBuilder MIB =
BuildMI(MF, Root.getDebugLoc(), TII->get(Prev.getOpcode()),
Root.getOperand(0).getReg())
- .addReg(NVPTX::VRFrameLocal)
+ .addReg(NRI->getFrameLocalRegister(MF))
.add(Prev.getOperand(2));
MBB.insert((MachineBasicBlock::iterator)&Root, MIB);
@@ -142,10 +150,13 @@ bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) {
} // Instruction
} // Basic Block
+ const NVPTXRegisterInfo *NRI =
+ MF.getSubtarget<NVPTXSubtarget>().getRegisterInfo();
+
// Remove unnecessary %VRFrame = cvta.local %VRFrameLocal
const auto &MRI = MF.getRegInfo();
- if (MRI.use_empty(NVPTX::VRFrame)) {
- if (auto MI = MRI.getUniqueVRegDef(NVPTX::VRFrame)) {
+ if (MRI.use_empty(NRI->getFrameRegister(MF))) {
+ if (auto MI = MRI.getUniqueVRegDef(NRI->getFrameRegister(MF))) {
MI->eraseFromParentAndMarkDBGValuesForRemoval();
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 8e2299e65222..16fbe1a65562 100644
--- a/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -74,7 +74,6 @@ bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
auto Offset =
TFI.getFrameIndexReference(MF, Op.getIndex(), Reg);
Op.ChangeToRegister(Reg, /*isDef=*/false);
- Op.setIsDebug();
const DIExpression *DIExpr = MI.getDebugExpression();
if (MI.isNonListDebugValue()) {
DIExpr = TRI.prependOffsetExpression(MI.getDebugExpression(), DIExpression::ApplyOffset, Offset);
diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index 5cdec0925b26..ec7307265bca 100644
--- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -13,6 +13,7 @@
#include "NVPTXRegisterInfo.h"
#include "NVPTX.h"
#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -106,6 +107,14 @@ NVPTXRegisterInfo::getCalleeSavedRegs(const MachineFunction *) const {
BitVector NVPTXRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
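+ // Mark the environment registers and the virtual frame, frame-local, and
+ // depot registers as reserved.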
+ for (unsigned Reg = NVPTX::ENVREG0; Reg <= NVPTX::ENVREG31; ++Reg) {
+ markSuperRegs(Reserved, Reg);
+ }
+ markSuperRegs(Reserved, NVPTX::VRFrame32);
+ markSuperRegs(Reserved, NVPTX::VRFrameLocal32);
+ markSuperRegs(Reserved, NVPTX::VRFrame64);
+ markSuperRegs(Reserved, NVPTX::VRFrameLocal64);
+ markSuperRegs(Reserved, NVPTX::VRDepot);
return Reserved;
}
@@ -122,10 +131,19 @@ void NVPTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(FIOperandNum + 1).getImm();
// Using I0 as the frame pointer
- MI.getOperand(FIOperandNum).ChangeToRegister(NVPTX::VRFrame, false);
+ MI.getOperand(FIOperandNum).ChangeToRegister(getFrameRegister(MF), false);
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
}
Register NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- return NVPTX::VRFrame;
+ const NVPTXTargetMachine &TM =
+ static_cast<const NVPTXTargetMachine &>(MF.getTarget());
+ return TM.is64Bit() ? NVPTX::VRFrame64 : NVPTX::VRFrame32;
+}
+
+Register
+NVPTXRegisterInfo::getFrameLocalRegister(const MachineFunction &MF) const {
+ const NVPTXTargetMachine &TM =
+ static_cast<const NVPTXTargetMachine &>(MF.getTarget());
+ return TM.is64Bit() ? NVPTX::VRFrameLocal64 : NVPTX::VRFrameLocal32;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h
index 9ef6940daf86..c6dd647f4637 100644
--- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h
@@ -43,6 +43,7 @@ public:
RegScavenger *RS = nullptr) const override;
Register getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameLocalRegister(const MachineFunction &MF) const;
ManagedStringPool *getStrPool() const {
return const_cast<ManagedStringPool *>(&ManagedStrPool);
diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
index 19895a20bacf..162167e8720d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -22,8 +22,10 @@ class NVPTXRegClass<list<ValueType> regTypes, int alignment, dag regList>
//===----------------------------------------------------------------------===//
// Special Registers used as stack pointer
-def VRFrame : NVPTXReg<"%SP">;
-def VRFrameLocal : NVPTXReg<"%SPL">;
+def VRFrame32 : NVPTXReg<"%SP">;
+def VRFrame64 : NVPTXReg<"%SP">;
+def VRFrameLocal32 : NVPTXReg<"%SPL">;
+def VRFrameLocal64 : NVPTXReg<"%SPL">;
// Special Registers used as the stack
def VRDepot : NVPTXReg<"%Depot">;
@@ -56,8 +58,8 @@ foreach i = 0...31 in {
//===----------------------------------------------------------------------===//
def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 4))>;
def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 4))>;
-def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 4))>;
-def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4))>;
+def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 4), VRFrame32, VRFrameLocal32)>;
+def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>;
def Float16Regs : NVPTXRegClass<[f16], 16, (add (sequence "H%u", 0, 4))>;
def Float16x2Regs : NVPTXRegClass<[v2f16], 32, (add (sequence "HH%u", 0, 4))>;
def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;
@@ -68,5 +70,5 @@ def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 4))>;
def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>;
// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
-def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRFrameLocal, VRDepot,
+def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame32, VRFrameLocal32, VRDepot,
(sequence "ENVREG%u", 0, 31))>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
index 8ae542130a14..e404cead344b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -41,7 +41,7 @@ public:
}
private:
bool processInstr(MachineInstr &MI);
- void replaceImageHandle(MachineOperand &Op, MachineFunction &MF);
+ bool replaceImageHandle(MachineOperand &Op, MachineFunction &MF);
bool findIndexForHandle(MachineOperand &Op, MachineFunction &MF,
unsigned &Idx);
};
@@ -76,19 +76,1675 @@ bool NVPTXReplaceImageHandles::runOnMachineFunction(MachineFunction &MF) {
return Changed;
}
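+// Map a surface-load (SULD) opcode taking its surface handle in a register
+// (_R suffix) to the variant taking an immediate handle (_I suffix).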
+static unsigned suldRegisterToIndexOpcode(unsigned RegOC) {
+ switch (RegOC) {
+ case NVPTX::SULD_1D_I8_CLAMP_R:
+ return NVPTX::SULD_1D_I8_CLAMP_I;
+ case NVPTX::SULD_1D_I16_CLAMP_R:
+ return NVPTX::SULD_1D_I16_CLAMP_I;
+ case NVPTX::SULD_1D_I32_CLAMP_R:
+ return NVPTX::SULD_1D_I32_CLAMP_I;
+ case NVPTX::SULD_1D_I64_CLAMP_R:
+ return NVPTX::SULD_1D_I64_CLAMP_I;
+ case NVPTX::SULD_1D_ARRAY_I8_CLAMP_R:
+ return NVPTX::SULD_1D_ARRAY_I8_CLAMP_I;
+ case NVPTX::SULD_1D_ARRAY_I16_CLAMP_R:
+ return NVPTX::SULD_1D_ARRAY_I16_CLAMP_I;
+ case NVPTX::SULD_1D_ARRAY_I32_CLAMP_R:
+ return NVPTX::SULD_1D_ARRAY_I32_CLAMP_I;
+ case NVPTX::SULD_1D_ARRAY_I64_CLAMP_R:
+ return NVPTX::SULD_1D_ARRAY_I64_CLAMP_I;
+ case NVPTX::SULD_2D_I8_CLAMP_R:
+ return NVPTX::SULD_2D_I8_CLAMP_I;
+ case NVPTX::SULD_2D_I16_CLAMP_R:
+ return NVPTX::SULD_2D_I16_CLAMP_I;
+ case NVPTX::SULD_2D_I32_CLAMP_R:
+ return NVPTX::SULD_2D_I32_CLAMP_I;
+ case NVPTX::SULD_2D_I64_CLAMP_R:
+ return NVPTX::SULD_2D_I64_CLAMP_I;
+ case NVPTX::SULD_2D_ARRAY_I8_CLAMP_R:
+ return NVPTX::SULD_2D_ARRAY_I8_CLAMP_I;
+ case NVPTX::SULD_2D_ARRAY_I16_CLAMP_R:
+ return NVPTX::SULD_2D_ARRAY_I16_CLAMP_I;
+ case NVPTX::SULD_2D_ARRAY_I32_CLAMP_R:
+ return NVPTX::SULD_2D_ARRAY_I32_CLAMP_I;
+ case NVPTX::SULD_2D_ARRAY_I64_CLAMP_R:
+ return NVPTX::SULD_2D_ARRAY_I64_CLAMP_I;
+ case NVPTX::SULD_3D_I8_CLAMP_R:
+ return NVPTX::SULD_3D_I8_CLAMP_I;
+ case NVPTX::SULD_3D_I16_CLAMP_R:
+ return NVPTX::SULD_3D_I16_CLAMP_I;
+ case NVPTX::SULD_3D_I32_CLAMP_R:
+ return NVPTX::SULD_3D_I32_CLAMP_I;
+ case NVPTX::SULD_3D_I64_CLAMP_R:
+ return NVPTX::SULD_3D_I64_CLAMP_I;
+ case NVPTX::SULD_1D_V2I8_CLAMP_R:
+ return NVPTX::SULD_1D_V2I8_CLAMP_I;
+ case NVPTX::SULD_1D_V2I16_CLAMP_R:
+ return NVPTX::SULD_1D_V2I16_CLAMP_I;
+ case NVPTX::SULD_1D_V2I32_CLAMP_R:
+ return NVPTX::SULD_1D_V2I32_CLAMP_I;
+ case NVPTX::SULD_1D_V2I64_CLAMP_R:
+ return NVPTX::SULD_1D_V2I64_CLAMP_I;
+ case NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R:
+ return NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_I;
+ case NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R:
+ return NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_I;
+ case NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R:
+ return NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_I;
+ case NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R:
+ return NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_I;
+ case NVPTX::SULD_2D_V2I8_CLAMP_R:
+ return NVPTX::SULD_2D_V2I8_CLAMP_I;
+ case NVPTX::SULD_2D_V2I16_CLAMP_R:
+ return NVPTX::SULD_2D_V2I16_CLAMP_I;
+ case NVPTX::SULD_2D_V2I32_CLAMP_R:
+ return NVPTX::SULD_2D_V2I32_CLAMP_I;
+ case NVPTX::SULD_2D_V2I64_CLAMP_R:
+ return NVPTX::SULD_2D_V2I64_CLAMP_I;
+ case NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R:
+ return NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_I;
+ case NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R:
+ return NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_I;
+ case NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R:
+ return NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_I;
+ case NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R:
+ return NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_I;
+ case NVPTX::SULD_3D_V2I8_CLAMP_R:
+ return NVPTX::SULD_3D_V2I8_CLAMP_I;
+ case NVPTX::SULD_3D_V2I16_CLAMP_R:
+ return NVPTX::SULD_3D_V2I16_CLAMP_I;
+ case NVPTX::SULD_3D_V2I32_CLAMP_R:
+ return NVPTX::SULD_3D_V2I32_CLAMP_I;
+ case NVPTX::SULD_3D_V2I64_CLAMP_R:
+ return NVPTX::SULD_3D_V2I64_CLAMP_I;
+ case NVPTX::SULD_1D_V4I8_CLAMP_R:
+ return NVPTX::SULD_1D_V4I8_CLAMP_I;
+ case NVPTX::SULD_1D_V4I16_CLAMP_R:
+ return NVPTX::SULD_1D_V4I16_CLAMP_I;
+ case NVPTX::SULD_1D_V4I32_CLAMP_R:
+ return NVPTX::SULD_1D_V4I32_CLAMP_I;
+ case NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R:
+ return NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_I;
+ case NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R:
+ return NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_I;
+ case NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R:
+ return NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_I;
+ case NVPTX::SULD_2D_V4I8_CLAMP_R:
+ return NVPTX::SULD_2D_V4I8_CLAMP_I;
+ case NVPTX::SULD_2D_V4I16_CLAMP_R:
+ return NVPTX::SULD_2D_V4I16_CLAMP_I;
+ case NVPTX::SULD_2D_V4I32_CLAMP_R:
+ return NVPTX::SULD_2D_V4I32_CLAMP_I;
+ case NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R:
+ return NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_I;
+ case NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R:
+ return NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_I;
+ case NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R:
+ return NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_I;
+ case NVPTX::SULD_3D_V4I8_CLAMP_R:
+ return NVPTX::SULD_3D_V4I8_CLAMP_I;
+ case NVPTX::SULD_3D_V4I16_CLAMP_R:
+ return NVPTX::SULD_3D_V4I16_CLAMP_I;
+ case NVPTX::SULD_3D_V4I32_CLAMP_R:
+ return NVPTX::SULD_3D_V4I32_CLAMP_I;
+ case NVPTX::SULD_1D_I8_TRAP_R:
+ return NVPTX::SULD_1D_I8_TRAP_I;
+ case NVPTX::SULD_1D_I16_TRAP_R:
+ return NVPTX::SULD_1D_I16_TRAP_I;
+ case NVPTX::SULD_1D_I32_TRAP_R:
+ return NVPTX::SULD_1D_I32_TRAP_I;
+ case NVPTX::SULD_1D_I64_TRAP_R:
+ return NVPTX::SULD_1D_I64_TRAP_I;
+ case NVPTX::SULD_1D_ARRAY_I8_TRAP_R:
+ return NVPTX::SULD_1D_ARRAY_I8_TRAP_I;
+ case NVPTX::SULD_1D_ARRAY_I16_TRAP_R:
+ return NVPTX::SULD_1D_ARRAY_I16_TRAP_I;
+ case NVPTX::SULD_1D_ARRAY_I32_TRAP_R:
+ return NVPTX::SULD_1D_ARRAY_I32_TRAP_I;
+ case NVPTX::SULD_1D_ARRAY_I64_TRAP_R:
+ return NVPTX::SULD_1D_ARRAY_I64_TRAP_I;
+ case NVPTX::SULD_2D_I8_TRAP_R:
+ return NVPTX::SULD_2D_I8_TRAP_I;
+ case NVPTX::SULD_2D_I16_TRAP_R:
+ return NVPTX::SULD_2D_I16_TRAP_I;
+ case NVPTX::SULD_2D_I32_TRAP_R:
+ return NVPTX::SULD_2D_I32_TRAP_I;
+ case NVPTX::SULD_2D_I64_TRAP_R:
+ return NVPTX::SULD_2D_I64_TRAP_I;
+ case NVPTX::SULD_2D_ARRAY_I8_TRAP_R:
+ return NVPTX::SULD_2D_ARRAY_I8_TRAP_I;
+ case NVPTX::SULD_2D_ARRAY_I16_TRAP_R:
+ return NVPTX::SULD_2D_ARRAY_I16_TRAP_I;
+ case NVPTX::SULD_2D_ARRAY_I32_TRAP_R:
+ return NVPTX::SULD_2D_ARRAY_I32_TRAP_I;
+ case NVPTX::SULD_2D_ARRAY_I64_TRAP_R:
+ return NVPTX::SULD_2D_ARRAY_I64_TRAP_I;
+ case NVPTX::SULD_3D_I8_TRAP_R:
+ return NVPTX::SULD_3D_I8_TRAP_I;
+ case NVPTX::SULD_3D_I16_TRAP_R:
+ return NVPTX::SULD_3D_I16_TRAP_I;
+ case NVPTX::SULD_3D_I32_TRAP_R:
+ return NVPTX::SULD_3D_I32_TRAP_I;
+ case NVPTX::SULD_3D_I64_TRAP_R:
+ return NVPTX::SULD_3D_I64_TRAP_I;
+ case NVPTX::SULD_1D_V2I8_TRAP_R:
+ return NVPTX::SULD_1D_V2I8_TRAP_I;
+ case NVPTX::SULD_1D_V2I16_TRAP_R:
+ return NVPTX::SULD_1D_V2I16_TRAP_I;
+ case NVPTX::SULD_1D_V2I32_TRAP_R:
+ return NVPTX::SULD_1D_V2I32_TRAP_I;
+ case NVPTX::SULD_1D_V2I64_TRAP_R:
+ return NVPTX::SULD_1D_V2I64_TRAP_I;
+ case NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R:
+ return NVPTX::SULD_1D_ARRAY_V2I8_TRAP_I;
+ case NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R:
+ return NVPTX::SULD_1D_ARRAY_V2I16_TRAP_I;
+ case NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R:
+ return NVPTX::SULD_1D_ARRAY_V2I32_TRAP_I;
+ case NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R:
+ return NVPTX::SULD_1D_ARRAY_V2I64_TRAP_I;
+ case NVPTX::SULD_2D_V2I8_TRAP_R:
+ return NVPTX::SULD_2D_V2I8_TRAP_I;
+ case NVPTX::SULD_2D_V2I16_TRAP_R:
+ return NVPTX::SULD_2D_V2I16_TRAP_I;
+ case NVPTX::SULD_2D_V2I32_TRAP_R:
+ return NVPTX::SULD_2D_V2I32_TRAP_I;
+ case NVPTX::SULD_2D_V2I64_TRAP_R:
+ return NVPTX::SULD_2D_V2I64_TRAP_I;
+ case NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R:
+ return NVPTX::SULD_2D_ARRAY_V2I8_TRAP_I;
+ case NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R:
+ return NVPTX::SULD_2D_ARRAY_V2I16_TRAP_I;
+ case NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R:
+ return NVPTX::SULD_2D_ARRAY_V2I32_TRAP_I;
+ case NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R:
+ return NVPTX::SULD_2D_ARRAY_V2I64_TRAP_I;
+ case NVPTX::SULD_3D_V2I8_TRAP_R:
+ return NVPTX::SULD_3D_V2I8_TRAP_I;
+ case NVPTX::SULD_3D_V2I16_TRAP_R:
+ return NVPTX::SULD_3D_V2I16_TRAP_I;
+ case NVPTX::SULD_3D_V2I32_TRAP_R:
+ return NVPTX::SULD_3D_V2I32_TRAP_I;
+ case NVPTX::SULD_3D_V2I64_TRAP_R:
+ return NVPTX::SULD_3D_V2I64_TRAP_I;
+ case NVPTX::SULD_1D_V4I8_TRAP_R:
+ return NVPTX::SULD_1D_V4I8_TRAP_I;
+ case NVPTX::SULD_1D_V4I16_TRAP_R:
+ return NVPTX::SULD_1D_V4I16_TRAP_I;
+ case NVPTX::SULD_1D_V4I32_TRAP_R:
+ return NVPTX::SULD_1D_V4I32_TRAP_I;
+ case NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R:
+ return NVPTX::SULD_1D_ARRAY_V4I8_TRAP_I;
+ case NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R:
+ return NVPTX::SULD_1D_ARRAY_V4I16_TRAP_I;
+ case NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R:
+ return NVPTX::SULD_1D_ARRAY_V4I32_TRAP_I;
+ case NVPTX::SULD_2D_V4I8_TRAP_R:
+ return NVPTX::SULD_2D_V4I8_TRAP_I;
+ case NVPTX::SULD_2D_V4I16_TRAP_R:
+ return NVPTX::SULD_2D_V4I16_TRAP_I;
+ case NVPTX::SULD_2D_V4I32_TRAP_R:
+ return NVPTX::SULD_2D_V4I32_TRAP_I;
+ case NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R:
+ return NVPTX::SULD_2D_ARRAY_V4I8_TRAP_I;
+ case NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R:
+ return NVPTX::SULD_2D_ARRAY_V4I16_TRAP_I;
+ case NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R:
+ return NVPTX::SULD_2D_ARRAY_V4I32_TRAP_I;
+ case NVPTX::SULD_3D_V4I8_TRAP_R:
+ return NVPTX::SULD_3D_V4I8_TRAP_I;
+ case NVPTX::SULD_3D_V4I16_TRAP_R:
+ return NVPTX::SULD_3D_V4I16_TRAP_I;
+ case NVPTX::SULD_3D_V4I32_TRAP_R:
+ return NVPTX::SULD_3D_V4I32_TRAP_I;
+ case NVPTX::SULD_1D_I8_ZERO_R:
+ return NVPTX::SULD_1D_I8_ZERO_I;
+ case NVPTX::SULD_1D_I16_ZERO_R:
+ return NVPTX::SULD_1D_I16_ZERO_I;
+ case NVPTX::SULD_1D_I32_ZERO_R:
+ return NVPTX::SULD_1D_I32_ZERO_I;
+ case NVPTX::SULD_1D_I64_ZERO_R:
+ return NVPTX::SULD_1D_I64_ZERO_I;
+ case NVPTX::SULD_1D_ARRAY_I8_ZERO_R:
+ return NVPTX::SULD_1D_ARRAY_I8_ZERO_I;
+ case NVPTX::SULD_1D_ARRAY_I16_ZERO_R:
+ return NVPTX::SULD_1D_ARRAY_I16_ZERO_I;
+ case NVPTX::SULD_1D_ARRAY_I32_ZERO_R:
+ return NVPTX::SULD_1D_ARRAY_I32_ZERO_I;
+ case NVPTX::SULD_1D_ARRAY_I64_ZERO_R:
+ return NVPTX::SULD_1D_ARRAY_I64_ZERO_I;
+ case NVPTX::SULD_2D_I8_ZERO_R:
+ return NVPTX::SULD_2D_I8_ZERO_I;
+ case NVPTX::SULD_2D_I16_ZERO_R:
+ return NVPTX::SULD_2D_I16_ZERO_I;
+ case NVPTX::SULD_2D_I32_ZERO_R:
+ return NVPTX::SULD_2D_I32_ZERO_I;
+ case NVPTX::SULD_2D_I64_ZERO_R:
+ return NVPTX::SULD_2D_I64_ZERO_I;
+ case NVPTX::SULD_2D_ARRAY_I8_ZERO_R:
+ return NVPTX::SULD_2D_ARRAY_I8_ZERO_I;
+ case NVPTX::SULD_2D_ARRAY_I16_ZERO_R:
+ return NVPTX::SULD_2D_ARRAY_I16_ZERO_I;
+ case NVPTX::SULD_2D_ARRAY_I32_ZERO_R:
+ return NVPTX::SULD_2D_ARRAY_I32_ZERO_I;
+ case NVPTX::SULD_2D_ARRAY_I64_ZERO_R:
+ return NVPTX::SULD_2D_ARRAY_I64_ZERO_I;
+ case NVPTX::SULD_3D_I8_ZERO_R:
+ return NVPTX::SULD_3D_I8_ZERO_I;
+ case NVPTX::SULD_3D_I16_ZERO_R:
+ return NVPTX::SULD_3D_I16_ZERO_I;
+ case NVPTX::SULD_3D_I32_ZERO_R:
+ return NVPTX::SULD_3D_I32_ZERO_I;
+ case NVPTX::SULD_3D_I64_ZERO_R:
+ return NVPTX::SULD_3D_I64_ZERO_I;
+ case NVPTX::SULD_1D_V2I8_ZERO_R:
+ return NVPTX::SULD_1D_V2I8_ZERO_I;
+ case NVPTX::SULD_1D_V2I16_ZERO_R:
+ return NVPTX::SULD_1D_V2I16_ZERO_I;
+ case NVPTX::SULD_1D_V2I32_ZERO_R:
+ return NVPTX::SULD_1D_V2I32_ZERO_I;
+ case NVPTX::SULD_1D_V2I64_ZERO_R:
+ return NVPTX::SULD_1D_V2I64_ZERO_I;
+ case NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R:
+ return NVPTX::SULD_1D_ARRAY_V2I8_ZERO_I;
+ case NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R:
+ return NVPTX::SULD_1D_ARRAY_V2I16_ZERO_I;
+ case NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R:
+ return NVPTX::SULD_1D_ARRAY_V2I32_ZERO_I;
+ case NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R:
+ return NVPTX::SULD_1D_ARRAY_V2I64_ZERO_I;
+ case NVPTX::SULD_2D_V2I8_ZERO_R:
+ return NVPTX::SULD_2D_V2I8_ZERO_I;
+ case NVPTX::SULD_2D_V2I16_ZERO_R:
+ return NVPTX::SULD_2D_V2I16_ZERO_I;
+ case NVPTX::SULD_2D_V2I32_ZERO_R:
+ return NVPTX::SULD_2D_V2I32_ZERO_I;
+ case NVPTX::SULD_2D_V2I64_ZERO_R:
+ return NVPTX::SULD_2D_V2I64_ZERO_I;
+ case NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R:
+ return NVPTX::SULD_2D_ARRAY_V2I8_ZERO_I;
+ case NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R:
+ return NVPTX::SULD_2D_ARRAY_V2I16_ZERO_I;
+ case NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R:
+ return NVPTX::SULD_2D_ARRAY_V2I32_ZERO_I;
+ case NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R:
+ return NVPTX::SULD_2D_ARRAY_V2I64_ZERO_I;
+ case NVPTX::SULD_3D_V2I8_ZERO_R:
+ return NVPTX::SULD_3D_V2I8_ZERO_I;
+ case NVPTX::SULD_3D_V2I16_ZERO_R:
+ return NVPTX::SULD_3D_V2I16_ZERO_I;
+ case NVPTX::SULD_3D_V2I32_ZERO_R:
+ return NVPTX::SULD_3D_V2I32_ZERO_I;
+ case NVPTX::SULD_3D_V2I64_ZERO_R:
+ return NVPTX::SULD_3D_V2I64_ZERO_I;
+ case NVPTX::SULD_1D_V4I8_ZERO_R:
+ return NVPTX::SULD_1D_V4I8_ZERO_I;
+ case NVPTX::SULD_1D_V4I16_ZERO_R:
+ return NVPTX::SULD_1D_V4I16_ZERO_I;
+ case NVPTX::SULD_1D_V4I32_ZERO_R:
+ return NVPTX::SULD_1D_V4I32_ZERO_I;
+ case NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R:
+ return NVPTX::SULD_1D_ARRAY_V4I8_ZERO_I;
+ case NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R:
+ return NVPTX::SULD_1D_ARRAY_V4I16_ZERO_I;
+ case NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R:
+ return NVPTX::SULD_1D_ARRAY_V4I32_ZERO_I;
+ case NVPTX::SULD_2D_V4I8_ZERO_R:
+ return NVPTX::SULD_2D_V4I8_ZERO_I;
+ case NVPTX::SULD_2D_V4I16_ZERO_R:
+ return NVPTX::SULD_2D_V4I16_ZERO_I;
+ case NVPTX::SULD_2D_V4I32_ZERO_R:
+ return NVPTX::SULD_2D_V4I32_ZERO_I;
+ case NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R:
+ return NVPTX::SULD_2D_ARRAY_V4I8_ZERO_I;
+ case NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R:
+ return NVPTX::SULD_2D_ARRAY_V4I16_ZERO_I;
+ case NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R:
+ return NVPTX::SULD_2D_ARRAY_V4I32_ZERO_I;
+ case NVPTX::SULD_3D_V4I8_ZERO_R:
+ return NVPTX::SULD_3D_V4I8_ZERO_I;
+ case NVPTX::SULD_3D_V4I16_ZERO_R:
+ return NVPTX::SULD_3D_V4I16_ZERO_I;
+ case NVPTX::SULD_3D_V4I32_ZERO_R:
+ return NVPTX::SULD_3D_V4I32_ZERO_I;
+ default:
+ llvm_unreachable("Unhandled SULD opcode");
+ }
+}
+
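+// Map a surface-store (SUST) opcode taking its surface handle in a register
+// (_R suffix) to the variant taking an immediate handle (_I suffix).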
+static unsigned sustRegisterToIndexOpcode(unsigned RegOC) {
+ switch (RegOC) {
+ case NVPTX::SUST_B_1D_B8_CLAMP_R:
+ return NVPTX::SUST_B_1D_B8_CLAMP_I;
+ case NVPTX::SUST_B_1D_B16_CLAMP_R:
+ return NVPTX::SUST_B_1D_B16_CLAMP_I;
+ case NVPTX::SUST_B_1D_B32_CLAMP_R:
+ return NVPTX::SUST_B_1D_B32_CLAMP_I;
+ case NVPTX::SUST_B_1D_B64_CLAMP_R:
+ return NVPTX::SUST_B_1D_B64_CLAMP_I;
+ case NVPTX::SUST_B_1D_V2B8_CLAMP_R:
+ return NVPTX::SUST_B_1D_V2B8_CLAMP_I;
+ case NVPTX::SUST_B_1D_V2B16_CLAMP_R:
+ return NVPTX::SUST_B_1D_V2B16_CLAMP_I;
+ case NVPTX::SUST_B_1D_V2B32_CLAMP_R:
+ return NVPTX::SUST_B_1D_V2B32_CLAMP_I;
+ case NVPTX::SUST_B_1D_V2B64_CLAMP_R:
+ return NVPTX::SUST_B_1D_V2B64_CLAMP_I;
+ case NVPTX::SUST_B_1D_V4B8_CLAMP_R:
+ return NVPTX::SUST_B_1D_V4B8_CLAMP_I;
+ case NVPTX::SUST_B_1D_V4B16_CLAMP_R:
+ return NVPTX::SUST_B_1D_V4B16_CLAMP_I;
+ case NVPTX::SUST_B_1D_V4B32_CLAMP_R:
+ return NVPTX::SUST_B_1D_V4B32_CLAMP_I;
+ case NVPTX::SUST_B_1D_ARRAY_B8_CLAMP_R:
+ return NVPTX::SUST_B_1D_ARRAY_B8_CLAMP_I;
+ case NVPTX::SUST_B_1D_ARRAY_B16_CLAMP_R:
+ return NVPTX::SUST_B_1D_ARRAY_B16_CLAMP_I;
+ case NVPTX::SUST_B_1D_ARRAY_B32_CLAMP_R:
+ return NVPTX::SUST_B_1D_ARRAY_B32_CLAMP_I;
+ case NVPTX::SUST_B_1D_ARRAY_B64_CLAMP_R:
+ return NVPTX::SUST_B_1D_ARRAY_B64_CLAMP_I;
+ case NVPTX::SUST_B_1D_ARRAY_V2B8_CLAMP_R:
+ return NVPTX::SUST_B_1D_ARRAY_V2B8_CLAMP_I;
+ case NVPTX::SUST_B_1D_ARRAY_V2B16_CLAMP_R:
+ return NVPTX::SUST_B_1D_ARRAY_V2B16_CLAMP_I;
+ case NVPTX::SUST_B_1D_ARRAY_V2B32_CLAMP_R:
+ return NVPTX::SUST_B_1D_ARRAY_V2B32_CLAMP_I;
+ case NVPTX::SUST_B_1D_ARRAY_V2B64_CLAMP_R:
+ return NVPTX::SUST_B_1D_ARRAY_V2B64_CLAMP_I;
+ case NVPTX::SUST_B_1D_ARRAY_V4B8_CLAMP_R:
+ return NVPTX::SUST_B_1D_ARRAY_V4B8_CLAMP_I;
+ case NVPTX::SUST_B_1D_ARRAY_V4B16_CLAMP_R:
+ return NVPTX::SUST_B_1D_ARRAY_V4B16_CLAMP_I;
+ case NVPTX::SUST_B_1D_ARRAY_V4B32_CLAMP_R:
+ return NVPTX::SUST_B_1D_ARRAY_V4B32_CLAMP_I;
+ case NVPTX::SUST_B_2D_B8_CLAMP_R:
+ return NVPTX::SUST_B_2D_B8_CLAMP_I;
+ case NVPTX::SUST_B_2D_B16_CLAMP_R:
+ return NVPTX::SUST_B_2D_B16_CLAMP_I;
+ case NVPTX::SUST_B_2D_B32_CLAMP_R:
+ return NVPTX::SUST_B_2D_B32_CLAMP_I;
+ case NVPTX::SUST_B_2D_B64_CLAMP_R:
+ return NVPTX::SUST_B_2D_B64_CLAMP_I;
+ case NVPTX::SUST_B_2D_V2B8_CLAMP_R:
+ return NVPTX::SUST_B_2D_V2B8_CLAMP_I;
+ case NVPTX::SUST_B_2D_V2B16_CLAMP_R:
+ return NVPTX::SUST_B_2D_V2B16_CLAMP_I;
+ case NVPTX::SUST_B_2D_V2B32_CLAMP_R:
+ return NVPTX::SUST_B_2D_V2B32_CLAMP_I;
+ case NVPTX::SUST_B_2D_V2B64_CLAMP_R:
+ return NVPTX::SUST_B_2D_V2B64_CLAMP_I;
+ case NVPTX::SUST_B_2D_V4B8_CLAMP_R:
+ return NVPTX::SUST_B_2D_V4B8_CLAMP_I;
+ case NVPTX::SUST_B_2D_V4B16_CLAMP_R:
+ return NVPTX::SUST_B_2D_V4B16_CLAMP_I;
+ case NVPTX::SUST_B_2D_V4B32_CLAMP_R:
+ return NVPTX::SUST_B_2D_V4B32_CLAMP_I;
+ case NVPTX::SUST_B_2D_ARRAY_B8_CLAMP_R:
+ return NVPTX::SUST_B_2D_ARRAY_B8_CLAMP_I;
+ case NVPTX::SUST_B_2D_ARRAY_B16_CLAMP_R:
+ return NVPTX::SUST_B_2D_ARRAY_B16_CLAMP_I;
+ case NVPTX::SUST_B_2D_ARRAY_B32_CLAMP_R:
+ return NVPTX::SUST_B_2D_ARRAY_B32_CLAMP_I;
+ case NVPTX::SUST_B_2D_ARRAY_B64_CLAMP_R:
+ return NVPTX::SUST_B_2D_ARRAY_B64_CLAMP_I;
+ case NVPTX::SUST_B_2D_ARRAY_V2B8_CLAMP_R:
+ return NVPTX::SUST_B_2D_ARRAY_V2B8_CLAMP_I;
+ case NVPTX::SUST_B_2D_ARRAY_V2B16_CLAMP_R:
+ return NVPTX::SUST_B_2D_ARRAY_V2B16_CLAMP_I;
+ case NVPTX::SUST_B_2D_ARRAY_V2B32_CLAMP_R:
+ return NVPTX::SUST_B_2D_ARRAY_V2B32_CLAMP_I;
+ case NVPTX::SUST_B_2D_ARRAY_V2B64_CLAMP_R:
+ return NVPTX::SUST_B_2D_ARRAY_V2B64_CLAMP_I;
+ case NVPTX::SUST_B_2D_ARRAY_V4B8_CLAMP_R:
+ return NVPTX::SUST_B_2D_ARRAY_V4B8_CLAMP_I;
+ case NVPTX::SUST_B_2D_ARRAY_V4B16_CLAMP_R:
+ return NVPTX::SUST_B_2D_ARRAY_V4B16_CLAMP_I;
+ case NVPTX::SUST_B_2D_ARRAY_V4B32_CLAMP_R:
+ return NVPTX::SUST_B_2D_ARRAY_V4B32_CLAMP_I;
+ case NVPTX::SUST_B_3D_B8_CLAMP_R:
+ return NVPTX::SUST_B_3D_B8_CLAMP_I;
+ case NVPTX::SUST_B_3D_B16_CLAMP_R:
+ return NVPTX::SUST_B_3D_B16_CLAMP_I;
+ case NVPTX::SUST_B_3D_B32_CLAMP_R:
+ return NVPTX::SUST_B_3D_B32_CLAMP_I;
+ case NVPTX::SUST_B_3D_B64_CLAMP_R:
+ return NVPTX::SUST_B_3D_B64_CLAMP_I;
+ case NVPTX::SUST_B_3D_V2B8_CLAMP_R:
+ return NVPTX::SUST_B_3D_V2B8_CLAMP_I;
+ case NVPTX::SUST_B_3D_V2B16_CLAMP_R:
+ return NVPTX::SUST_B_3D_V2B16_CLAMP_I;
+ case NVPTX::SUST_B_3D_V2B32_CLAMP_R:
+ return NVPTX::SUST_B_3D_V2B32_CLAMP_I;
+ case NVPTX::SUST_B_3D_V2B64_CLAMP_R:
+ return NVPTX::SUST_B_3D_V2B64_CLAMP_I;
+ case NVPTX::SUST_B_3D_V4B8_CLAMP_R:
+ return NVPTX::SUST_B_3D_V4B8_CLAMP_I;
+ case NVPTX::SUST_B_3D_V4B16_CLAMP_R:
+ return NVPTX::SUST_B_3D_V4B16_CLAMP_I;
+ case NVPTX::SUST_B_3D_V4B32_CLAMP_R:
+ return NVPTX::SUST_B_3D_V4B32_CLAMP_I;
+ case NVPTX::SUST_B_1D_B8_TRAP_R:
+ return NVPTX::SUST_B_1D_B8_TRAP_I;
+ case NVPTX::SUST_B_1D_B16_TRAP_R:
+ return NVPTX::SUST_B_1D_B16_TRAP_I;
+ case NVPTX::SUST_B_1D_B32_TRAP_R:
+ return NVPTX::SUST_B_1D_B32_TRAP_I;
+ case NVPTX::SUST_B_1D_B64_TRAP_R:
+ return NVPTX::SUST_B_1D_B64_TRAP_I;
+ case NVPTX::SUST_B_1D_V2B8_TRAP_R:
+ return NVPTX::SUST_B_1D_V2B8_TRAP_I;
+ case NVPTX::SUST_B_1D_V2B16_TRAP_R:
+ return NVPTX::SUST_B_1D_V2B16_TRAP_I;
+ case NVPTX::SUST_B_1D_V2B32_TRAP_R:
+ return NVPTX::SUST_B_1D_V2B32_TRAP_I;
+ case NVPTX::SUST_B_1D_V2B64_TRAP_R:
+ return NVPTX::SUST_B_1D_V2B64_TRAP_I;
+ case NVPTX::SUST_B_1D_V4B8_TRAP_R:
+ return NVPTX::SUST_B_1D_V4B8_TRAP_I;
+ case NVPTX::SUST_B_1D_V4B16_TRAP_R:
+ return NVPTX::SUST_B_1D_V4B16_TRAP_I;
+ case NVPTX::SUST_B_1D_V4B32_TRAP_R:
+ return NVPTX::SUST_B_1D_V4B32_TRAP_I;
+ case NVPTX::SUST_B_1D_ARRAY_B8_TRAP_R:
+ return NVPTX::SUST_B_1D_ARRAY_B8_TRAP_I;
+ case NVPTX::SUST_B_1D_ARRAY_B16_TRAP_R:
+ return NVPTX::SUST_B_1D_ARRAY_B16_TRAP_I;
+ case NVPTX::SUST_B_1D_ARRAY_B32_TRAP_R:
+ return NVPTX::SUST_B_1D_ARRAY_B32_TRAP_I;
+ case NVPTX::SUST_B_1D_ARRAY_B64_TRAP_R:
+ return NVPTX::SUST_B_1D_ARRAY_B64_TRAP_I;
+ case NVPTX::SUST_B_1D_ARRAY_V2B8_TRAP_R:
+ return NVPTX::SUST_B_1D_ARRAY_V2B8_TRAP_I;
+ case NVPTX::SUST_B_1D_ARRAY_V2B16_TRAP_R:
+ return NVPTX::SUST_B_1D_ARRAY_V2B16_TRAP_I;
+ case NVPTX::SUST_B_1D_ARRAY_V2B32_TRAP_R:
+ return NVPTX::SUST_B_1D_ARRAY_V2B32_TRAP_I;
+ case NVPTX::SUST_B_1D_ARRAY_V2B64_TRAP_R:
+ return NVPTX::SUST_B_1D_ARRAY_V2B64_TRAP_I;
+ case NVPTX::SUST_B_1D_ARRAY_V4B8_TRAP_R:
+ return NVPTX::SUST_B_1D_ARRAY_V4B8_TRAP_I;
+ case NVPTX::SUST_B_1D_ARRAY_V4B16_TRAP_R:
+ return NVPTX::SUST_B_1D_ARRAY_V4B16_TRAP_I;
+ case NVPTX::SUST_B_1D_ARRAY_V4B32_TRAP_R:
+ return NVPTX::SUST_B_1D_ARRAY_V4B32_TRAP_I;
+ case NVPTX::SUST_B_2D_B8_TRAP_R:
+ return NVPTX::SUST_B_2D_B8_TRAP_I;
+ case NVPTX::SUST_B_2D_B16_TRAP_R:
+ return NVPTX::SUST_B_2D_B16_TRAP_I;
+ case NVPTX::SUST_B_2D_B32_TRAP_R:
+ return NVPTX::SUST_B_2D_B32_TRAP_I;
+ case NVPTX::SUST_B_2D_B64_TRAP_R:
+ return NVPTX::SUST_B_2D_B64_TRAP_I;
+ case NVPTX::SUST_B_2D_V2B8_TRAP_R:
+ return NVPTX::SUST_B_2D_V2B8_TRAP_I;
+ case NVPTX::SUST_B_2D_V2B16_TRAP_R:
+ return NVPTX::SUST_B_2D_V2B16_TRAP_I;
+ case NVPTX::SUST_B_2D_V2B32_TRAP_R:
+ return NVPTX::SUST_B_2D_V2B32_TRAP_I;
+ case NVPTX::SUST_B_2D_V2B64_TRAP_R:
+ return NVPTX::SUST_B_2D_V2B64_TRAP_I;
+ case NVPTX::SUST_B_2D_V4B8_TRAP_R:
+ return NVPTX::SUST_B_2D_V4B8_TRAP_I;
+ case NVPTX::SUST_B_2D_V4B16_TRAP_R:
+ return NVPTX::SUST_B_2D_V4B16_TRAP_I;
+ case NVPTX::SUST_B_2D_V4B32_TRAP_R:
+ return NVPTX::SUST_B_2D_V4B32_TRAP_I;
+ case NVPTX::SUST_B_2D_ARRAY_B8_TRAP_R:
+ return NVPTX::SUST_B_2D_ARRAY_B8_TRAP_I;
+ case NVPTX::SUST_B_2D_ARRAY_B16_TRAP_R:
+ return NVPTX::SUST_B_2D_ARRAY_B16_TRAP_I;
+ case NVPTX::SUST_B_2D_ARRAY_B32_TRAP_R:
+ return NVPTX::SUST_B_2D_ARRAY_B32_TRAP_I;
+ case NVPTX::SUST_B_2D_ARRAY_B64_TRAP_R:
+ return NVPTX::SUST_B_2D_ARRAY_B64_TRAP_I;
+ case NVPTX::SUST_B_2D_ARRAY_V2B8_TRAP_R:
+ return NVPTX::SUST_B_2D_ARRAY_V2B8_TRAP_I;
+ case NVPTX::SUST_B_2D_ARRAY_V2B16_TRAP_R:
+ return NVPTX::SUST_B_2D_ARRAY_V2B16_TRAP_I;
+ case NVPTX::SUST_B_2D_ARRAY_V2B32_TRAP_R:
+ return NVPTX::SUST_B_2D_ARRAY_V2B32_TRAP_I;
+ case NVPTX::SUST_B_2D_ARRAY_V2B64_TRAP_R:
+ return NVPTX::SUST_B_2D_ARRAY_V2B64_TRAP_I;
+ case NVPTX::SUST_B_2D_ARRAY_V4B8_TRAP_R:
+ return NVPTX::SUST_B_2D_ARRAY_V4B8_TRAP_I;
+ case NVPTX::SUST_B_2D_ARRAY_V4B16_TRAP_R:
+ return NVPTX::SUST_B_2D_ARRAY_V4B16_TRAP_I;
+ case NVPTX::SUST_B_2D_ARRAY_V4B32_TRAP_R:
+ return NVPTX::SUST_B_2D_ARRAY_V4B32_TRAP_I;
+ case NVPTX::SUST_B_3D_B8_TRAP_R:
+ return NVPTX::SUST_B_3D_B8_TRAP_I;
+ case NVPTX::SUST_B_3D_B16_TRAP_R:
+ return NVPTX::SUST_B_3D_B16_TRAP_I;
+ case NVPTX::SUST_B_3D_B32_TRAP_R:
+ return NVPTX::SUST_B_3D_B32_TRAP_I;
+ case NVPTX::SUST_B_3D_B64_TRAP_R:
+ return NVPTX::SUST_B_3D_B64_TRAP_I;
+ case NVPTX::SUST_B_3D_V2B8_TRAP_R:
+ return NVPTX::SUST_B_3D_V2B8_TRAP_I;
+ case NVPTX::SUST_B_3D_V2B16_TRAP_R:
+ return NVPTX::SUST_B_3D_V2B16_TRAP_I;
+ case NVPTX::SUST_B_3D_V2B32_TRAP_R:
+ return NVPTX::SUST_B_3D_V2B32_TRAP_I;
+ case NVPTX::SUST_B_3D_V2B64_TRAP_R:
+ return NVPTX::SUST_B_3D_V2B64_TRAP_I;
+ case NVPTX::SUST_B_3D_V4B8_TRAP_R:
+ return NVPTX::SUST_B_3D_V4B8_TRAP_I;
+ case NVPTX::SUST_B_3D_V4B16_TRAP_R:
+ return NVPTX::SUST_B_3D_V4B16_TRAP_I;
+ case NVPTX::SUST_B_3D_V4B32_TRAP_R:
+ return NVPTX::SUST_B_3D_V4B32_TRAP_I;
+ case NVPTX::SUST_B_1D_B8_ZERO_R:
+ return NVPTX::SUST_B_1D_B8_ZERO_I;
+ case NVPTX::SUST_B_1D_B16_ZERO_R:
+ return NVPTX::SUST_B_1D_B16_ZERO_I;
+ case NVPTX::SUST_B_1D_B32_ZERO_R:
+ return NVPTX::SUST_B_1D_B32_ZERO_I;
+ case NVPTX::SUST_B_1D_B64_ZERO_R:
+ return NVPTX::SUST_B_1D_B64_ZERO_I;
+ case NVPTX::SUST_B_1D_V2B8_ZERO_R:
+ return NVPTX::SUST_B_1D_V2B8_ZERO_I;
+ case NVPTX::SUST_B_1D_V2B16_ZERO_R:
+ return NVPTX::SUST_B_1D_V2B16_ZERO_I;
+ case NVPTX::SUST_B_1D_V2B32_ZERO_R:
+ return NVPTX::SUST_B_1D_V2B32_ZERO_I;
+ case NVPTX::SUST_B_1D_V2B64_ZERO_R:
+ return NVPTX::SUST_B_1D_V2B64_ZERO_I;
+ case NVPTX::SUST_B_1D_V4B8_ZERO_R:
+ return NVPTX::SUST_B_1D_V4B8_ZERO_I;
+ case NVPTX::SUST_B_1D_V4B16_ZERO_R:
+ return NVPTX::SUST_B_1D_V4B16_ZERO_I;
+ case NVPTX::SUST_B_1D_V4B32_ZERO_R:
+ return NVPTX::SUST_B_1D_V4B32_ZERO_I;
+ case NVPTX::SUST_B_1D_ARRAY_B8_ZERO_R:
+ return NVPTX::SUST_B_1D_ARRAY_B8_ZERO_I;
+ case NVPTX::SUST_B_1D_ARRAY_B16_ZERO_R:
+ return NVPTX::SUST_B_1D_ARRAY_B16_ZERO_I;
+ case NVPTX::SUST_B_1D_ARRAY_B32_ZERO_R:
+ return NVPTX::SUST_B_1D_ARRAY_B32_ZERO_I;
+ case NVPTX::SUST_B_1D_ARRAY_B64_ZERO_R:
+ return NVPTX::SUST_B_1D_ARRAY_B64_ZERO_I;
+ case NVPTX::SUST_B_1D_ARRAY_V2B8_ZERO_R:
+ return NVPTX::SUST_B_1D_ARRAY_V2B8_ZERO_I;
+ case NVPTX::SUST_B_1D_ARRAY_V2B16_ZERO_R:
+ return NVPTX::SUST_B_1D_ARRAY_V2B16_ZERO_I;
+ case NVPTX::SUST_B_1D_ARRAY_V2B32_ZERO_R:
+ return NVPTX::SUST_B_1D_ARRAY_V2B32_ZERO_I;
+ case NVPTX::SUST_B_1D_ARRAY_V2B64_ZERO_R:
+ return NVPTX::SUST_B_1D_ARRAY_V2B64_ZERO_I;
+ case NVPTX::SUST_B_1D_ARRAY_V4B8_ZERO_R:
+ return NVPTX::SUST_B_1D_ARRAY_V4B8_ZERO_I;
+ case NVPTX::SUST_B_1D_ARRAY_V4B16_ZERO_R:
+ return NVPTX::SUST_B_1D_ARRAY_V4B16_ZERO_I;
+ case NVPTX::SUST_B_1D_ARRAY_V4B32_ZERO_R:
+ return NVPTX::SUST_B_1D_ARRAY_V4B32_ZERO_I;
+ case NVPTX::SUST_B_2D_B8_ZERO_R:
+ return NVPTX::SUST_B_2D_B8_ZERO_I;
+ case NVPTX::SUST_B_2D_B16_ZERO_R:
+ return NVPTX::SUST_B_2D_B16_ZERO_I;
+ case NVPTX::SUST_B_2D_B32_ZERO_R:
+ return NVPTX::SUST_B_2D_B32_ZERO_I;
+ case NVPTX::SUST_B_2D_B64_ZERO_R:
+ return NVPTX::SUST_B_2D_B64_ZERO_I;
+ case NVPTX::SUST_B_2D_V2B8_ZERO_R:
+ return NVPTX::SUST_B_2D_V2B8_ZERO_I;
+ case NVPTX::SUST_B_2D_V2B16_ZERO_R:
+ return NVPTX::SUST_B_2D_V2B16_ZERO_I;
+ case NVPTX::SUST_B_2D_V2B32_ZERO_R:
+ return NVPTX::SUST_B_2D_V2B32_ZERO_I;
+ case NVPTX::SUST_B_2D_V2B64_ZERO_R:
+ return NVPTX::SUST_B_2D_V2B64_ZERO_I;
+ case NVPTX::SUST_B_2D_V4B8_ZERO_R:
+ return NVPTX::SUST_B_2D_V4B8_ZERO_I;
+ case NVPTX::SUST_B_2D_V4B16_ZERO_R:
+ return NVPTX::SUST_B_2D_V4B16_ZERO_I;
+ case NVPTX::SUST_B_2D_V4B32_ZERO_R:
+ return NVPTX::SUST_B_2D_V4B32_ZERO_I;
+ case NVPTX::SUST_B_2D_ARRAY_B8_ZERO_R:
+ return NVPTX::SUST_B_2D_ARRAY_B8_ZERO_I;
+ case NVPTX::SUST_B_2D_ARRAY_B16_ZERO_R:
+ return NVPTX::SUST_B_2D_ARRAY_B16_ZERO_I;
+ case NVPTX::SUST_B_2D_ARRAY_B32_ZERO_R:
+ return NVPTX::SUST_B_2D_ARRAY_B32_ZERO_I;
+ case NVPTX::SUST_B_2D_ARRAY_B64_ZERO_R:
+ return NVPTX::SUST_B_2D_ARRAY_B64_ZERO_I;
+ case NVPTX::SUST_B_2D_ARRAY_V2B8_ZERO_R:
+ return NVPTX::SUST_B_2D_ARRAY_V2B8_ZERO_I;
+ case NVPTX::SUST_B_2D_ARRAY_V2B16_ZERO_R:
+ return NVPTX::SUST_B_2D_ARRAY_V2B16_ZERO_I;
+ case NVPTX::SUST_B_2D_ARRAY_V2B32_ZERO_R:
+ return NVPTX::SUST_B_2D_ARRAY_V2B32_ZERO_I;
+ case NVPTX::SUST_B_2D_ARRAY_V2B64_ZERO_R:
+ return NVPTX::SUST_B_2D_ARRAY_V2B64_ZERO_I;
+ case NVPTX::SUST_B_2D_ARRAY_V4B8_ZERO_R:
+ return NVPTX::SUST_B_2D_ARRAY_V4B8_ZERO_I;
+ case NVPTX::SUST_B_2D_ARRAY_V4B16_ZERO_R:
+ return NVPTX::SUST_B_2D_ARRAY_V4B16_ZERO_I;
+ case NVPTX::SUST_B_2D_ARRAY_V4B32_ZERO_R:
+ return NVPTX::SUST_B_2D_ARRAY_V4B32_ZERO_I;
+ case NVPTX::SUST_B_3D_B8_ZERO_R:
+ return NVPTX::SUST_B_3D_B8_ZERO_I;
+ case NVPTX::SUST_B_3D_B16_ZERO_R:
+ return NVPTX::SUST_B_3D_B16_ZERO_I;
+ case NVPTX::SUST_B_3D_B32_ZERO_R:
+ return NVPTX::SUST_B_3D_B32_ZERO_I;
+ case NVPTX::SUST_B_3D_B64_ZERO_R:
+ return NVPTX::SUST_B_3D_B64_ZERO_I;
+ case NVPTX::SUST_B_3D_V2B8_ZERO_R:
+ return NVPTX::SUST_B_3D_V2B8_ZERO_I;
+ case NVPTX::SUST_B_3D_V2B16_ZERO_R:
+ return NVPTX::SUST_B_3D_V2B16_ZERO_I;
+ case NVPTX::SUST_B_3D_V2B32_ZERO_R:
+ return NVPTX::SUST_B_3D_V2B32_ZERO_I;
+ case NVPTX::SUST_B_3D_V2B64_ZERO_R:
+ return NVPTX::SUST_B_3D_V2B64_ZERO_I;
+ case NVPTX::SUST_B_3D_V4B8_ZERO_R:
+ return NVPTX::SUST_B_3D_V4B8_ZERO_I;
+ case NVPTX::SUST_B_3D_V4B16_ZERO_R:
+ return NVPTX::SUST_B_3D_V4B16_ZERO_I;
+ case NVPTX::SUST_B_3D_V4B32_ZERO_R:
+ return NVPTX::SUST_B_3D_V4B32_ZERO_I;
+ case NVPTX::SUST_P_1D_B8_TRAP_R:
+ return NVPTX::SUST_P_1D_B8_TRAP_I;
+ case NVPTX::SUST_P_1D_B16_TRAP_R:
+ return NVPTX::SUST_P_1D_B16_TRAP_I;
+ case NVPTX::SUST_P_1D_B32_TRAP_R:
+ return NVPTX::SUST_P_1D_B32_TRAP_I;
+ case NVPTX::SUST_P_1D_V2B8_TRAP_R:
+ return NVPTX::SUST_P_1D_V2B8_TRAP_I;
+ case NVPTX::SUST_P_1D_V2B16_TRAP_R:
+ return NVPTX::SUST_P_1D_V2B16_TRAP_I;
+ case NVPTX::SUST_P_1D_V2B32_TRAP_R:
+ return NVPTX::SUST_P_1D_V2B32_TRAP_I;
+ case NVPTX::SUST_P_1D_V4B8_TRAP_R:
+ return NVPTX::SUST_P_1D_V4B8_TRAP_I;
+ case NVPTX::SUST_P_1D_V4B16_TRAP_R:
+ return NVPTX::SUST_P_1D_V4B16_TRAP_I;
+ case NVPTX::SUST_P_1D_V4B32_TRAP_R:
+ return NVPTX::SUST_P_1D_V4B32_TRAP_I;
+ case NVPTX::SUST_P_1D_ARRAY_B8_TRAP_R:
+ return NVPTX::SUST_P_1D_ARRAY_B8_TRAP_I;
+ case NVPTX::SUST_P_1D_ARRAY_B16_TRAP_R:
+ return NVPTX::SUST_P_1D_ARRAY_B16_TRAP_I;
+ case NVPTX::SUST_P_1D_ARRAY_B32_TRAP_R:
+ return NVPTX::SUST_P_1D_ARRAY_B32_TRAP_I;
+ case NVPTX::SUST_P_1D_ARRAY_V2B8_TRAP_R:
+ return NVPTX::SUST_P_1D_ARRAY_V2B8_TRAP_I;
+ case NVPTX::SUST_P_1D_ARRAY_V2B16_TRAP_R:
+ return NVPTX::SUST_P_1D_ARRAY_V2B16_TRAP_I;
+ case NVPTX::SUST_P_1D_ARRAY_V2B32_TRAP_R:
+ return NVPTX::SUST_P_1D_ARRAY_V2B32_TRAP_I;
+ case NVPTX::SUST_P_1D_ARRAY_V4B8_TRAP_R:
+ return NVPTX::SUST_P_1D_ARRAY_V4B8_TRAP_I;
+ case NVPTX::SUST_P_1D_ARRAY_V4B16_TRAP_R:
+ return NVPTX::SUST_P_1D_ARRAY_V4B16_TRAP_I;
+ case NVPTX::SUST_P_1D_ARRAY_V4B32_TRAP_R:
+ return NVPTX::SUST_P_1D_ARRAY_V4B32_TRAP_I;
+ case NVPTX::SUST_P_2D_B8_TRAP_R:
+ return NVPTX::SUST_P_2D_B8_TRAP_I;
+ case NVPTX::SUST_P_2D_B16_TRAP_R:
+ return NVPTX::SUST_P_2D_B16_TRAP_I;
+ case NVPTX::SUST_P_2D_B32_TRAP_R:
+ return NVPTX::SUST_P_2D_B32_TRAP_I;
+ case NVPTX::SUST_P_2D_V2B8_TRAP_R:
+ return NVPTX::SUST_P_2D_V2B8_TRAP_I;
+ case NVPTX::SUST_P_2D_V2B16_TRAP_R:
+ return NVPTX::SUST_P_2D_V2B16_TRAP_I;
+ case NVPTX::SUST_P_2D_V2B32_TRAP_R:
+ return NVPTX::SUST_P_2D_V2B32_TRAP_I;
+ case NVPTX::SUST_P_2D_V4B8_TRAP_R:
+ return NVPTX::SUST_P_2D_V4B8_TRAP_I;
+ case NVPTX::SUST_P_2D_V4B16_TRAP_R:
+ return NVPTX::SUST_P_2D_V4B16_TRAP_I;
+ case NVPTX::SUST_P_2D_V4B32_TRAP_R:
+ return NVPTX::SUST_P_2D_V4B32_TRAP_I;
+ case NVPTX::SUST_P_2D_ARRAY_B8_TRAP_R:
+ return NVPTX::SUST_P_2D_ARRAY_B8_TRAP_I;
+ case NVPTX::SUST_P_2D_ARRAY_B16_TRAP_R:
+ return NVPTX::SUST_P_2D_ARRAY_B16_TRAP_I;
+ case NVPTX::SUST_P_2D_ARRAY_B32_TRAP_R:
+ return NVPTX::SUST_P_2D_ARRAY_B32_TRAP_I;
+ case NVPTX::SUST_P_2D_ARRAY_V2B8_TRAP_R:
+ return NVPTX::SUST_P_2D_ARRAY_V2B8_TRAP_I;
+ case NVPTX::SUST_P_2D_ARRAY_V2B16_TRAP_R:
+ return NVPTX::SUST_P_2D_ARRAY_V2B16_TRAP_I;
+ case NVPTX::SUST_P_2D_ARRAY_V2B32_TRAP_R:
+ return NVPTX::SUST_P_2D_ARRAY_V2B32_TRAP_I;
+ case NVPTX::SUST_P_2D_ARRAY_V4B8_TRAP_R:
+ return NVPTX::SUST_P_2D_ARRAY_V4B8_TRAP_I;
+ case NVPTX::SUST_P_2D_ARRAY_V4B16_TRAP_R:
+ return NVPTX::SUST_P_2D_ARRAY_V4B16_TRAP_I;
+ case NVPTX::SUST_P_2D_ARRAY_V4B32_TRAP_R:
+ return NVPTX::SUST_P_2D_ARRAY_V4B32_TRAP_I;
+ case NVPTX::SUST_P_3D_B8_TRAP_R:
+ return NVPTX::SUST_P_3D_B8_TRAP_I;
+ case NVPTX::SUST_P_3D_B16_TRAP_R:
+ return NVPTX::SUST_P_3D_B16_TRAP_I;
+ case NVPTX::SUST_P_3D_B32_TRAP_R:
+ return NVPTX::SUST_P_3D_B32_TRAP_I;
+ case NVPTX::SUST_P_3D_V2B8_TRAP_R:
+ return NVPTX::SUST_P_3D_V2B8_TRAP_I;
+ case NVPTX::SUST_P_3D_V2B16_TRAP_R:
+ return NVPTX::SUST_P_3D_V2B16_TRAP_I;
+ case NVPTX::SUST_P_3D_V2B32_TRAP_R:
+ return NVPTX::SUST_P_3D_V2B32_TRAP_I;
+ case NVPTX::SUST_P_3D_V4B8_TRAP_R:
+ return NVPTX::SUST_P_3D_V4B8_TRAP_I;
+ case NVPTX::SUST_P_3D_V4B16_TRAP_R:
+ return NVPTX::SUST_P_3D_V4B16_TRAP_I;
+ case NVPTX::SUST_P_3D_V4B32_TRAP_R:
+ return NVPTX::SUST_P_3D_V4B32_TRAP_I;
+ default:
+ llvm_unreachable("Unhandled SUST opcode");
+ }
+}
+
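+// Map a texture (TEX) opcode whose texture handle is in a register to the
+// corresponding opcode with an immediate texture handle (first suffix letter
+// R -> I); the sampler operand kind is left unchanged.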
+static unsigned texRegisterToIndexOpcode(unsigned RegOC) {
+ switch (RegOC) {
+ case NVPTX::TEX_1D_F32_S32_RR:
+ return NVPTX::TEX_1D_F32_S32_IR;
+ case NVPTX::TEX_1D_F32_S32_RI:
+ return NVPTX::TEX_1D_F32_S32_II;
+ case NVPTX::TEX_1D_F32_F32_RR:
+ return NVPTX::TEX_1D_F32_F32_IR;
+ case NVPTX::TEX_1D_F32_F32_RI:
+ return NVPTX::TEX_1D_F32_F32_II;
+ case NVPTX::TEX_1D_F32_F32_LEVEL_RR:
+ return NVPTX::TEX_1D_F32_F32_LEVEL_IR;
+ case NVPTX::TEX_1D_F32_F32_LEVEL_RI:
+ return NVPTX::TEX_1D_F32_F32_LEVEL_II;
+ case NVPTX::TEX_1D_F32_F32_GRAD_RR:
+ return NVPTX::TEX_1D_F32_F32_GRAD_IR;
+ case NVPTX::TEX_1D_F32_F32_GRAD_RI:
+ return NVPTX::TEX_1D_F32_F32_GRAD_II;
+ case NVPTX::TEX_1D_S32_S32_RR:
+ return NVPTX::TEX_1D_S32_S32_IR;
+ case NVPTX::TEX_1D_S32_S32_RI:
+ return NVPTX::TEX_1D_S32_S32_II;
+ case NVPTX::TEX_1D_S32_F32_RR:
+ return NVPTX::TEX_1D_S32_F32_IR;
+ case NVPTX::TEX_1D_S32_F32_RI:
+ return NVPTX::TEX_1D_S32_F32_II;
+ case NVPTX::TEX_1D_S32_F32_LEVEL_RR:
+ return NVPTX::TEX_1D_S32_F32_LEVEL_IR;
+ case NVPTX::TEX_1D_S32_F32_LEVEL_RI:
+ return NVPTX::TEX_1D_S32_F32_LEVEL_II;
+ case NVPTX::TEX_1D_S32_F32_GRAD_RR:
+ return NVPTX::TEX_1D_S32_F32_GRAD_IR;
+ case NVPTX::TEX_1D_S32_F32_GRAD_RI:
+ return NVPTX::TEX_1D_S32_F32_GRAD_II;
+ case NVPTX::TEX_1D_U32_S32_RR:
+ return NVPTX::TEX_1D_U32_S32_IR;
+ case NVPTX::TEX_1D_U32_S32_RI:
+ return NVPTX::TEX_1D_U32_S32_II;
+ case NVPTX::TEX_1D_U32_F32_RR:
+ return NVPTX::TEX_1D_U32_F32_IR;
+ case NVPTX::TEX_1D_U32_F32_RI:
+ return NVPTX::TEX_1D_U32_F32_II;
+ case NVPTX::TEX_1D_U32_F32_LEVEL_RR:
+ return NVPTX::TEX_1D_U32_F32_LEVEL_IR;
+ case NVPTX::TEX_1D_U32_F32_LEVEL_RI:
+ return NVPTX::TEX_1D_U32_F32_LEVEL_II;
+ case NVPTX::TEX_1D_U32_F32_GRAD_RR:
+ return NVPTX::TEX_1D_U32_F32_GRAD_IR;
+ case NVPTX::TEX_1D_U32_F32_GRAD_RI:
+ return NVPTX::TEX_1D_U32_F32_GRAD_II;
+ case NVPTX::TEX_1D_ARRAY_F32_S32_RR:
+ return NVPTX::TEX_1D_ARRAY_F32_S32_IR;
+ case NVPTX::TEX_1D_ARRAY_F32_S32_RI:
+ return NVPTX::TEX_1D_ARRAY_F32_S32_II;
+ case NVPTX::TEX_1D_ARRAY_F32_F32_RR:
+ return NVPTX::TEX_1D_ARRAY_F32_F32_IR;
+ case NVPTX::TEX_1D_ARRAY_F32_F32_RI:
+ return NVPTX::TEX_1D_ARRAY_F32_F32_II;
+ case NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR:
+ return NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_IR;
+ case NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RI:
+ return NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_II;
+ case NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR:
+ return NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_IR;
+ case NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RI:
+ return NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_II;
+ case NVPTX::TEX_1D_ARRAY_S32_S32_RR:
+ return NVPTX::TEX_1D_ARRAY_S32_S32_IR;
+ case NVPTX::TEX_1D_ARRAY_S32_S32_RI:
+ return NVPTX::TEX_1D_ARRAY_S32_S32_II;
+ case NVPTX::TEX_1D_ARRAY_S32_F32_RR:
+ return NVPTX::TEX_1D_ARRAY_S32_F32_IR;
+ case NVPTX::TEX_1D_ARRAY_S32_F32_RI:
+ return NVPTX::TEX_1D_ARRAY_S32_F32_II;
+ case NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR:
+ return NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_IR;
+ case NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RI:
+ return NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_II;
+ case NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR:
+ return NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_IR;
+ case NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RI:
+ return NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_II;
+ case NVPTX::TEX_1D_ARRAY_U32_S32_RR:
+ return NVPTX::TEX_1D_ARRAY_U32_S32_IR;
+ case NVPTX::TEX_1D_ARRAY_U32_S32_RI:
+ return NVPTX::TEX_1D_ARRAY_U32_S32_II;
+ case NVPTX::TEX_1D_ARRAY_U32_F32_RR:
+ return NVPTX::TEX_1D_ARRAY_U32_F32_IR;
+ case NVPTX::TEX_1D_ARRAY_U32_F32_RI:
+ return NVPTX::TEX_1D_ARRAY_U32_F32_II;
+ case NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR:
+ return NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_IR;
+ case NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RI:
+ return NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_II;
+ case NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR:
+ return NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_IR;
+ case NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RI:
+ return NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_II;
+ case NVPTX::TEX_2D_F32_S32_RR:
+ return NVPTX::TEX_2D_F32_S32_IR;
+ case NVPTX::TEX_2D_F32_S32_RI:
+ return NVPTX::TEX_2D_F32_S32_II;
+ case NVPTX::TEX_2D_F32_F32_RR:
+ return NVPTX::TEX_2D_F32_F32_IR;
+ case NVPTX::TEX_2D_F32_F32_RI:
+ return NVPTX::TEX_2D_F32_F32_II;
+ case NVPTX::TEX_2D_F32_F32_LEVEL_RR:
+ return NVPTX::TEX_2D_F32_F32_LEVEL_IR;
+ case NVPTX::TEX_2D_F32_F32_LEVEL_RI:
+ return NVPTX::TEX_2D_F32_F32_LEVEL_II;
+ case NVPTX::TEX_2D_F32_F32_GRAD_RR:
+ return NVPTX::TEX_2D_F32_F32_GRAD_IR;
+ case NVPTX::TEX_2D_F32_F32_GRAD_RI:
+ return NVPTX::TEX_2D_F32_F32_GRAD_II;
+ case NVPTX::TEX_2D_S32_S32_RR:
+ return NVPTX::TEX_2D_S32_S32_IR;
+ case NVPTX::TEX_2D_S32_S32_RI:
+ return NVPTX::TEX_2D_S32_S32_II;
+ case NVPTX::TEX_2D_S32_F32_RR:
+ return NVPTX::TEX_2D_S32_F32_IR;
+ case NVPTX::TEX_2D_S32_F32_RI:
+ return NVPTX::TEX_2D_S32_F32_II;
+ case NVPTX::TEX_2D_S32_F32_LEVEL_RR:
+ return NVPTX::TEX_2D_S32_F32_LEVEL_IR;
+ case NVPTX::TEX_2D_S32_F32_LEVEL_RI:
+ return NVPTX::TEX_2D_S32_F32_LEVEL_II;
+ case NVPTX::TEX_2D_S32_F32_GRAD_RR:
+ return NVPTX::TEX_2D_S32_F32_GRAD_IR;
+ case NVPTX::TEX_2D_S32_F32_GRAD_RI:
+ return NVPTX::TEX_2D_S32_F32_GRAD_II;
+ case NVPTX::TEX_2D_U32_S32_RR:
+ return NVPTX::TEX_2D_U32_S32_IR;
+ case NVPTX::TEX_2D_U32_S32_RI:
+ return NVPTX::TEX_2D_U32_S32_II;
+ case NVPTX::TEX_2D_U32_F32_RR:
+ return NVPTX::TEX_2D_U32_F32_IR;
+ case NVPTX::TEX_2D_U32_F32_RI:
+ return NVPTX::TEX_2D_U32_F32_II;
+ case NVPTX::TEX_2D_U32_F32_LEVEL_RR:
+ return NVPTX::TEX_2D_U32_F32_LEVEL_IR;
+ case NVPTX::TEX_2D_U32_F32_LEVEL_RI:
+ return NVPTX::TEX_2D_U32_F32_LEVEL_II;
+ case NVPTX::TEX_2D_U32_F32_GRAD_RR:
+ return NVPTX::TEX_2D_U32_F32_GRAD_IR;
+ case NVPTX::TEX_2D_U32_F32_GRAD_RI:
+ return NVPTX::TEX_2D_U32_F32_GRAD_II;
+ case NVPTX::TEX_2D_ARRAY_F32_S32_RR:
+ return NVPTX::TEX_2D_ARRAY_F32_S32_IR;
+ case NVPTX::TEX_2D_ARRAY_F32_S32_RI:
+ return NVPTX::TEX_2D_ARRAY_F32_S32_II;
+ case NVPTX::TEX_2D_ARRAY_F32_F32_RR:
+ return NVPTX::TEX_2D_ARRAY_F32_F32_IR;
+ case NVPTX::TEX_2D_ARRAY_F32_F32_RI:
+ return NVPTX::TEX_2D_ARRAY_F32_F32_II;
+ case NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR:
+ return NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_IR;
+ case NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RI:
+ return NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_II;
+ case NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR:
+ return NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_IR;
+ case NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RI:
+ return NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_II;
+ case NVPTX::TEX_2D_ARRAY_S32_S32_RR:
+ return NVPTX::TEX_2D_ARRAY_S32_S32_IR;
+ case NVPTX::TEX_2D_ARRAY_S32_S32_RI:
+ return NVPTX::TEX_2D_ARRAY_S32_S32_II;
+ case NVPTX::TEX_2D_ARRAY_S32_F32_RR:
+ return NVPTX::TEX_2D_ARRAY_S32_F32_IR;
+ case NVPTX::TEX_2D_ARRAY_S32_F32_RI:
+ return NVPTX::TEX_2D_ARRAY_S32_F32_II;
+ case NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR:
+ return NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_IR;
+ case NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RI:
+ return NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_II;
+ case NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR:
+ return NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_IR;
+ case NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RI:
+ return NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_II;
+ case NVPTX::TEX_2D_ARRAY_U32_S32_RR:
+ return NVPTX::TEX_2D_ARRAY_U32_S32_IR;
+ case NVPTX::TEX_2D_ARRAY_U32_S32_RI:
+ return NVPTX::TEX_2D_ARRAY_U32_S32_II;
+ case NVPTX::TEX_2D_ARRAY_U32_F32_RR:
+ return NVPTX::TEX_2D_ARRAY_U32_F32_IR;
+ case NVPTX::TEX_2D_ARRAY_U32_F32_RI:
+ return NVPTX::TEX_2D_ARRAY_U32_F32_II;
+ case NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR:
+ return NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_IR;
+ case NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RI:
+ return NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_II;
+ case NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR:
+ return NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_IR;
+ case NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RI:
+ return NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_II;
+ case NVPTX::TEX_3D_F32_S32_RR:
+ return NVPTX::TEX_3D_F32_S32_IR;
+ case NVPTX::TEX_3D_F32_S32_RI:
+ return NVPTX::TEX_3D_F32_S32_II;
+ case NVPTX::TEX_3D_F32_F32_RR:
+ return NVPTX::TEX_3D_F32_F32_IR;
+ case NVPTX::TEX_3D_F32_F32_RI:
+ return NVPTX::TEX_3D_F32_F32_II;
+ case NVPTX::TEX_3D_F32_F32_LEVEL_RR:
+ return NVPTX::TEX_3D_F32_F32_LEVEL_IR;
+ case NVPTX::TEX_3D_F32_F32_LEVEL_RI:
+ return NVPTX::TEX_3D_F32_F32_LEVEL_II;
+ case NVPTX::TEX_3D_F32_F32_GRAD_RR:
+ return NVPTX::TEX_3D_F32_F32_GRAD_IR;
+ case NVPTX::TEX_3D_F32_F32_GRAD_RI:
+ return NVPTX::TEX_3D_F32_F32_GRAD_II;
+ case NVPTX::TEX_3D_S32_S32_RR:
+ return NVPTX::TEX_3D_S32_S32_IR;
+ case NVPTX::TEX_3D_S32_S32_RI:
+ return NVPTX::TEX_3D_S32_S32_II;
+ case NVPTX::TEX_3D_S32_F32_RR:
+ return NVPTX::TEX_3D_S32_F32_IR;
+ case NVPTX::TEX_3D_S32_F32_RI:
+ return NVPTX::TEX_3D_S32_F32_II;
+ case NVPTX::TEX_3D_S32_F32_LEVEL_RR:
+ return NVPTX::TEX_3D_S32_F32_LEVEL_IR;
+ case NVPTX::TEX_3D_S32_F32_LEVEL_RI:
+ return NVPTX::TEX_3D_S32_F32_LEVEL_II;
+ case NVPTX::TEX_3D_S32_F32_GRAD_RR:
+ return NVPTX::TEX_3D_S32_F32_GRAD_IR;
+ case NVPTX::TEX_3D_S32_F32_GRAD_RI:
+ return NVPTX::TEX_3D_S32_F32_GRAD_II;
+ case NVPTX::TEX_3D_U32_S32_RR:
+ return NVPTX::TEX_3D_U32_S32_IR;
+ case NVPTX::TEX_3D_U32_S32_RI:
+ return NVPTX::TEX_3D_U32_S32_II;
+ case NVPTX::TEX_3D_U32_F32_RR:
+ return NVPTX::TEX_3D_U32_F32_IR;
+ case NVPTX::TEX_3D_U32_F32_RI:
+ return NVPTX::TEX_3D_U32_F32_II;
+ case NVPTX::TEX_3D_U32_F32_LEVEL_RR:
+ return NVPTX::TEX_3D_U32_F32_LEVEL_IR;
+ case NVPTX::TEX_3D_U32_F32_LEVEL_RI:
+ return NVPTX::TEX_3D_U32_F32_LEVEL_II;
+ case NVPTX::TEX_3D_U32_F32_GRAD_RR:
+ return NVPTX::TEX_3D_U32_F32_GRAD_IR;
+ case NVPTX::TEX_3D_U32_F32_GRAD_RI:
+ return NVPTX::TEX_3D_U32_F32_GRAD_II;
+ case NVPTX::TEX_CUBE_F32_F32_RR:
+ return NVPTX::TEX_CUBE_F32_F32_IR;
+ case NVPTX::TEX_CUBE_F32_F32_RI:
+ return NVPTX::TEX_CUBE_F32_F32_II;
+ case NVPTX::TEX_CUBE_F32_F32_LEVEL_RR:
+ return NVPTX::TEX_CUBE_F32_F32_LEVEL_IR;
+ case NVPTX::TEX_CUBE_F32_F32_LEVEL_RI:
+ return NVPTX::TEX_CUBE_F32_F32_LEVEL_II;
+ case NVPTX::TEX_CUBE_S32_F32_RR:
+ return NVPTX::TEX_CUBE_S32_F32_IR;
+ case NVPTX::TEX_CUBE_S32_F32_RI:
+ return NVPTX::TEX_CUBE_S32_F32_II;
+ case NVPTX::TEX_CUBE_S32_F32_LEVEL_RR:
+ return NVPTX::TEX_CUBE_S32_F32_LEVEL_IR;
+ case NVPTX::TEX_CUBE_S32_F32_LEVEL_RI:
+ return NVPTX::TEX_CUBE_S32_F32_LEVEL_II;
+ case NVPTX::TEX_CUBE_U32_F32_RR:
+ return NVPTX::TEX_CUBE_U32_F32_IR;
+ case NVPTX::TEX_CUBE_U32_F32_RI:
+ return NVPTX::TEX_CUBE_U32_F32_II;
+ case NVPTX::TEX_CUBE_U32_F32_LEVEL_RR:
+ return NVPTX::TEX_CUBE_U32_F32_LEVEL_IR;
+ case NVPTX::TEX_CUBE_U32_F32_LEVEL_RI:
+ return NVPTX::TEX_CUBE_U32_F32_LEVEL_II;
+ case NVPTX::TEX_CUBE_ARRAY_F32_F32_RR:
+ return NVPTX::TEX_CUBE_ARRAY_F32_F32_IR;
+ case NVPTX::TEX_CUBE_ARRAY_F32_F32_RI:
+ return NVPTX::TEX_CUBE_ARRAY_F32_F32_II;
+ case NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR:
+ return NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_IR;
+ case NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RI:
+ return NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_II;
+ case NVPTX::TEX_CUBE_ARRAY_S32_F32_RR:
+ return NVPTX::TEX_CUBE_ARRAY_S32_F32_IR;
+ case NVPTX::TEX_CUBE_ARRAY_S32_F32_RI:
+ return NVPTX::TEX_CUBE_ARRAY_S32_F32_II;
+ case NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR:
+ return NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_IR;
+ case NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RI:
+ return NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_II;
+ case NVPTX::TEX_CUBE_ARRAY_U32_F32_RR:
+ return NVPTX::TEX_CUBE_ARRAY_U32_F32_IR;
+ case NVPTX::TEX_CUBE_ARRAY_U32_F32_RI:
+ return NVPTX::TEX_CUBE_ARRAY_U32_F32_II;
+ case NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR:
+ return NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_IR;
+ case NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RI:
+ return NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_II;
+ case NVPTX::TLD4_R_2D_F32_F32_RR:
+ return NVPTX::TLD4_R_2D_F32_F32_IR;
+ case NVPTX::TLD4_R_2D_F32_F32_RI:
+ return NVPTX::TLD4_R_2D_F32_F32_II;
+ case NVPTX::TLD4_G_2D_F32_F32_RR:
+ return NVPTX::TLD4_G_2D_F32_F32_IR;
+ case NVPTX::TLD4_G_2D_F32_F32_RI:
+ return NVPTX::TLD4_G_2D_F32_F32_II;
+ case NVPTX::TLD4_B_2D_F32_F32_RR:
+ return NVPTX::TLD4_B_2D_F32_F32_IR;
+ case NVPTX::TLD4_B_2D_F32_F32_RI:
+ return NVPTX::TLD4_B_2D_F32_F32_II;
+ case NVPTX::TLD4_A_2D_F32_F32_RR:
+ return NVPTX::TLD4_A_2D_F32_F32_IR;
+ case NVPTX::TLD4_A_2D_F32_F32_RI:
+ return NVPTX::TLD4_A_2D_F32_F32_II;
+ case NVPTX::TLD4_R_2D_S32_F32_RR:
+ return NVPTX::TLD4_R_2D_S32_F32_IR;
+ case NVPTX::TLD4_R_2D_S32_F32_RI:
+ return NVPTX::TLD4_R_2D_S32_F32_II;
+ case NVPTX::TLD4_G_2D_S32_F32_RR:
+ return NVPTX::TLD4_G_2D_S32_F32_IR;
+ case NVPTX::TLD4_G_2D_S32_F32_RI:
+ return NVPTX::TLD4_G_2D_S32_F32_II;
+ case NVPTX::TLD4_B_2D_S32_F32_RR:
+ return NVPTX::TLD4_B_2D_S32_F32_IR;
+ case NVPTX::TLD4_B_2D_S32_F32_RI:
+ return NVPTX::TLD4_B_2D_S32_F32_II;
+ case NVPTX::TLD4_A_2D_S32_F32_RR:
+ return NVPTX::TLD4_A_2D_S32_F32_IR;
+ case NVPTX::TLD4_A_2D_S32_F32_RI:
+ return NVPTX::TLD4_A_2D_S32_F32_II;
+ case NVPTX::TLD4_R_2D_U32_F32_RR:
+ return NVPTX::TLD4_R_2D_U32_F32_IR;
+ case NVPTX::TLD4_R_2D_U32_F32_RI:
+ return NVPTX::TLD4_R_2D_U32_F32_II;
+ case NVPTX::TLD4_G_2D_U32_F32_RR:
+ return NVPTX::TLD4_G_2D_U32_F32_IR;
+ case NVPTX::TLD4_G_2D_U32_F32_RI:
+ return NVPTX::TLD4_G_2D_U32_F32_II;
+ case NVPTX::TLD4_B_2D_U32_F32_RR:
+ return NVPTX::TLD4_B_2D_U32_F32_IR;
+ case NVPTX::TLD4_B_2D_U32_F32_RI:
+ return NVPTX::TLD4_B_2D_U32_F32_II;
+ case NVPTX::TLD4_A_2D_U32_F32_RR:
+ return NVPTX::TLD4_A_2D_U32_F32_IR;
+ case NVPTX::TLD4_A_2D_U32_F32_RI:
+ return NVPTX::TLD4_A_2D_U32_F32_II;
+ case NVPTX::TEX_UNIFIED_1D_F32_S32_R:
+ return NVPTX::TEX_UNIFIED_1D_F32_S32_I;
+ case NVPTX::TEX_UNIFIED_1D_F32_F32_R:
+ return NVPTX::TEX_UNIFIED_1D_F32_F32_I;
+ case NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R:
+ return NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_I;
+ case NVPTX::TEX_UNIFIED_1D_S32_S32_R:
+ return NVPTX::TEX_UNIFIED_1D_S32_S32_I;
+ case NVPTX::TEX_UNIFIED_1D_S32_F32_R:
+ return NVPTX::TEX_UNIFIED_1D_S32_F32_I;
+ case NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R:
+ return NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_I;
+ case NVPTX::TEX_UNIFIED_1D_U32_S32_R:
+ return NVPTX::TEX_UNIFIED_1D_U32_S32_I;
+ case NVPTX::TEX_UNIFIED_1D_U32_F32_R:
+ return NVPTX::TEX_UNIFIED_1D_U32_F32_I;
+ case NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R:
+ return NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_I;
+ case NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R:
+ return NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_I;
+ case NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R:
+ return NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_I;
+ case NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R:
+ return NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_I;
+ case NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R:
+ return NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_I;
+ case NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R:
+ return NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_I;
+ case NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R:
+ return NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_I;
+ case NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R:
+ return NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_I;
+ case NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R:
+ return NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_I;
+ case NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R:
+ return NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_I;
+ case NVPTX::TEX_UNIFIED_2D_F32_S32_R:
+ return NVPTX::TEX_UNIFIED_2D_F32_S32_I;
+ case NVPTX::TEX_UNIFIED_2D_F32_F32_R:
+ return NVPTX::TEX_UNIFIED_2D_F32_F32_I;
+ case NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R:
+ return NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_I;
+ case NVPTX::TEX_UNIFIED_2D_S32_S32_R:
+ return NVPTX::TEX_UNIFIED_2D_S32_S32_I;
+ case NVPTX::TEX_UNIFIED_2D_S32_F32_R:
+ return NVPTX::TEX_UNIFIED_2D_S32_F32_I;
+ case NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R:
+ return NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_I;
+ case NVPTX::TEX_UNIFIED_2D_U32_S32_R:
+ return NVPTX::TEX_UNIFIED_2D_U32_S32_I;
+ case NVPTX::TEX_UNIFIED_2D_U32_F32_R:
+ return NVPTX::TEX_UNIFIED_2D_U32_F32_I;
+ case NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R:
+ return NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_I;
+ case NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R:
+ return NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_I;
+ case NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R:
+ return NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_I;
+ case NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R:
+ return NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_I;
+ case NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R:
+ return NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_I;
+ case NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R:
+ return NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_I;
+ case NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R:
+ return NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_I;
+ case NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R:
+ return NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_I;
+ case NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R:
+ return NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_I;
+ case NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R:
+ return NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_I;
+ case NVPTX::TEX_UNIFIED_3D_F32_S32_R:
+ return NVPTX::TEX_UNIFIED_3D_F32_S32_I;
+ case NVPTX::TEX_UNIFIED_3D_F32_F32_R:
+ return NVPTX::TEX_UNIFIED_3D_F32_F32_I;
+ case NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R:
+ return NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_I;
+ case NVPTX::TEX_UNIFIED_3D_S32_S32_R:
+ return NVPTX::TEX_UNIFIED_3D_S32_S32_I;
+ case NVPTX::TEX_UNIFIED_3D_S32_F32_R:
+ return NVPTX::TEX_UNIFIED_3D_S32_F32_I;
+ case NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R:
+ return NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_I;
+ case NVPTX::TEX_UNIFIED_3D_U32_S32_R:
+ return NVPTX::TEX_UNIFIED_3D_U32_S32_I;
+ case NVPTX::TEX_UNIFIED_3D_U32_F32_R:
+ return NVPTX::TEX_UNIFIED_3D_U32_F32_I;
+ case NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R:
+ return NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_I;
+ case NVPTX::TEX_UNIFIED_CUBE_F32_F32_R:
+ return NVPTX::TEX_UNIFIED_CUBE_F32_F32_I;
+ case NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_CUBE_S32_F32_R:
+ return NVPTX::TEX_UNIFIED_CUBE_S32_F32_I;
+ case NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_CUBE_U32_F32_R:
+ return NVPTX::TEX_UNIFIED_CUBE_U32_F32_I;
+ case NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R:
+ return NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_I;
+ case NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R:
+ return NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_I;
+ case NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_I;
+ case NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R:
+ return NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_I;
+ case NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R:
+ return NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_I;
+ case NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R:
+ return NVPTX::TLD4_UNIFIED_R_2D_F32_F32_I;
+ case NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R:
+ return NVPTX::TLD4_UNIFIED_G_2D_F32_F32_I;
+ case NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R:
+ return NVPTX::TLD4_UNIFIED_B_2D_F32_F32_I;
+ case NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R:
+ return NVPTX::TLD4_UNIFIED_A_2D_F32_F32_I;
+ case NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R:
+ return NVPTX::TLD4_UNIFIED_R_2D_S32_F32_I;
+ case NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R:
+ return NVPTX::TLD4_UNIFIED_G_2D_S32_F32_I;
+ case NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R:
+ return NVPTX::TLD4_UNIFIED_B_2D_S32_F32_I;
+ case NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R:
+ return NVPTX::TLD4_UNIFIED_A_2D_S32_F32_I;
+ case NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R:
+ return NVPTX::TLD4_UNIFIED_R_2D_U32_F32_I;
+ case NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R:
+ return NVPTX::TLD4_UNIFIED_G_2D_U32_F32_I;
+ case NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R:
+ return NVPTX::TLD4_UNIFIED_B_2D_U32_F32_I;
+ case NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R:
+ return NVPTX::TLD4_UNIFIED_A_2D_U32_F32_I;
+ default:
+ llvm_unreachable("Unhandled TEX opcode");
+ }
+}
+
+static unsigned samplerRegisterToIndexOpcode(unsigned RegOC) {
+ switch (RegOC) {
+ case NVPTX::TEX_1D_F32_S32_RR:
+ return NVPTX::TEX_1D_F32_S32_RI;
+ case NVPTX::TEX_1D_F32_S32_IR:
+ return NVPTX::TEX_1D_F32_S32_II;
+ case NVPTX::TEX_1D_F32_F32_RR:
+ return NVPTX::TEX_1D_F32_F32_RI;
+ case NVPTX::TEX_1D_F32_F32_IR:
+ return NVPTX::TEX_1D_F32_F32_II;
+ case NVPTX::TEX_1D_F32_F32_LEVEL_RR:
+ return NVPTX::TEX_1D_F32_F32_LEVEL_RI;
+ case NVPTX::TEX_1D_F32_F32_LEVEL_IR:
+ return NVPTX::TEX_1D_F32_F32_LEVEL_II;
+ case NVPTX::TEX_1D_F32_F32_GRAD_RR:
+ return NVPTX::TEX_1D_F32_F32_GRAD_RI;
+ case NVPTX::TEX_1D_F32_F32_GRAD_IR:
+ return NVPTX::TEX_1D_F32_F32_GRAD_II;
+ case NVPTX::TEX_1D_S32_S32_RR:
+ return NVPTX::TEX_1D_S32_S32_RI;
+ case NVPTX::TEX_1D_S32_S32_IR:
+ return NVPTX::TEX_1D_S32_S32_II;
+ case NVPTX::TEX_1D_S32_F32_RR:
+ return NVPTX::TEX_1D_S32_F32_RI;
+ case NVPTX::TEX_1D_S32_F32_IR:
+ return NVPTX::TEX_1D_S32_F32_II;
+ case NVPTX::TEX_1D_S32_F32_LEVEL_RR:
+ return NVPTX::TEX_1D_S32_F32_LEVEL_RI;
+ case NVPTX::TEX_1D_S32_F32_LEVEL_IR:
+ return NVPTX::TEX_1D_S32_F32_LEVEL_II;
+ case NVPTX::TEX_1D_S32_F32_GRAD_RR:
+ return NVPTX::TEX_1D_S32_F32_GRAD_RI;
+ case NVPTX::TEX_1D_S32_F32_GRAD_IR:
+ return NVPTX::TEX_1D_S32_F32_GRAD_II;
+ case NVPTX::TEX_1D_U32_S32_RR:
+ return NVPTX::TEX_1D_U32_S32_RI;
+ case NVPTX::TEX_1D_U32_S32_IR:
+ return NVPTX::TEX_1D_U32_S32_II;
+ case NVPTX::TEX_1D_U32_F32_RR:
+ return NVPTX::TEX_1D_U32_F32_RI;
+ case NVPTX::TEX_1D_U32_F32_IR:
+ return NVPTX::TEX_1D_U32_F32_II;
+ case NVPTX::TEX_1D_U32_F32_LEVEL_RR:
+ return NVPTX::TEX_1D_U32_F32_LEVEL_RI;
+ case NVPTX::TEX_1D_U32_F32_LEVEL_IR:
+ return NVPTX::TEX_1D_U32_F32_LEVEL_II;
+ case NVPTX::TEX_1D_U32_F32_GRAD_RR:
+ return NVPTX::TEX_1D_U32_F32_GRAD_RI;
+ case NVPTX::TEX_1D_U32_F32_GRAD_IR:
+ return NVPTX::TEX_1D_U32_F32_GRAD_II;
+ case NVPTX::TEX_1D_ARRAY_F32_S32_RR:
+ return NVPTX::TEX_1D_ARRAY_F32_S32_RI;
+ case NVPTX::TEX_1D_ARRAY_F32_S32_IR:
+ return NVPTX::TEX_1D_ARRAY_F32_S32_II;
+ case NVPTX::TEX_1D_ARRAY_F32_F32_RR:
+ return NVPTX::TEX_1D_ARRAY_F32_F32_RI;
+ case NVPTX::TEX_1D_ARRAY_F32_F32_IR:
+ return NVPTX::TEX_1D_ARRAY_F32_F32_II;
+ case NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR:
+ return NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RI;
+ case NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_IR:
+ return NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_II;
+ case NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR:
+ return NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RI;
+ case NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_IR:
+ return NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_II;
+ case NVPTX::TEX_1D_ARRAY_S32_S32_RR:
+ return NVPTX::TEX_1D_ARRAY_S32_S32_RI;
+ case NVPTX::TEX_1D_ARRAY_S32_S32_IR:
+ return NVPTX::TEX_1D_ARRAY_S32_S32_II;
+ case NVPTX::TEX_1D_ARRAY_S32_F32_RR:
+ return NVPTX::TEX_1D_ARRAY_S32_F32_RI;
+ case NVPTX::TEX_1D_ARRAY_S32_F32_IR:
+ return NVPTX::TEX_1D_ARRAY_S32_F32_II;
+ case NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR:
+ return NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RI;
+ case NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_IR:
+ return NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_II;
+ case NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR:
+ return NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RI;
+ case NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_IR:
+ return NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_II;
+ case NVPTX::TEX_1D_ARRAY_U32_S32_RR:
+ return NVPTX::TEX_1D_ARRAY_U32_S32_RI;
+ case NVPTX::TEX_1D_ARRAY_U32_S32_IR:
+ return NVPTX::TEX_1D_ARRAY_U32_S32_II;
+ case NVPTX::TEX_1D_ARRAY_U32_F32_RR:
+ return NVPTX::TEX_1D_ARRAY_U32_F32_RI;
+ case NVPTX::TEX_1D_ARRAY_U32_F32_IR:
+ return NVPTX::TEX_1D_ARRAY_U32_F32_II;
+ case NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR:
+ return NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RI;
+ case NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_IR:
+ return NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_II;
+ case NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR:
+ return NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RI;
+ case NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_IR:
+ return NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_II;
+ case NVPTX::TEX_2D_F32_S32_RR:
+ return NVPTX::TEX_2D_F32_S32_RI;
+ case NVPTX::TEX_2D_F32_S32_IR:
+ return NVPTX::TEX_2D_F32_S32_II;
+ case NVPTX::TEX_2D_F32_F32_RR:
+ return NVPTX::TEX_2D_F32_F32_RI;
+ case NVPTX::TEX_2D_F32_F32_IR:
+ return NVPTX::TEX_2D_F32_F32_II;
+ case NVPTX::TEX_2D_F32_F32_LEVEL_RR:
+ return NVPTX::TEX_2D_F32_F32_LEVEL_RI;
+ case NVPTX::TEX_2D_F32_F32_LEVEL_IR:
+ return NVPTX::TEX_2D_F32_F32_LEVEL_II;
+ case NVPTX::TEX_2D_F32_F32_GRAD_RR:
+ return NVPTX::TEX_2D_F32_F32_GRAD_RI;
+ case NVPTX::TEX_2D_F32_F32_GRAD_IR:
+ return NVPTX::TEX_2D_F32_F32_GRAD_II;
+ case NVPTX::TEX_2D_S32_S32_RR:
+ return NVPTX::TEX_2D_S32_S32_RI;
+ case NVPTX::TEX_2D_S32_S32_IR:
+ return NVPTX::TEX_2D_S32_S32_II;
+ case NVPTX::TEX_2D_S32_F32_RR:
+ return NVPTX::TEX_2D_S32_F32_RI;
+ case NVPTX::TEX_2D_S32_F32_IR:
+ return NVPTX::TEX_2D_S32_F32_II;
+ case NVPTX::TEX_2D_S32_F32_LEVEL_RR:
+ return NVPTX::TEX_2D_S32_F32_LEVEL_RI;
+ case NVPTX::TEX_2D_S32_F32_LEVEL_IR:
+ return NVPTX::TEX_2D_S32_F32_LEVEL_II;
+ case NVPTX::TEX_2D_S32_F32_GRAD_RR:
+ return NVPTX::TEX_2D_S32_F32_GRAD_RI;
+ case NVPTX::TEX_2D_S32_F32_GRAD_IR:
+ return NVPTX::TEX_2D_S32_F32_GRAD_II;
+ case NVPTX::TEX_2D_U32_S32_RR:
+ return NVPTX::TEX_2D_U32_S32_RI;
+ case NVPTX::TEX_2D_U32_S32_IR:
+ return NVPTX::TEX_2D_U32_S32_II;
+ case NVPTX::TEX_2D_U32_F32_RR:
+ return NVPTX::TEX_2D_U32_F32_RI;
+ case NVPTX::TEX_2D_U32_F32_IR:
+ return NVPTX::TEX_2D_U32_F32_II;
+ case NVPTX::TEX_2D_U32_F32_LEVEL_RR:
+ return NVPTX::TEX_2D_U32_F32_LEVEL_RI;
+ case NVPTX::TEX_2D_U32_F32_LEVEL_IR:
+ return NVPTX::TEX_2D_U32_F32_LEVEL_II;
+ case NVPTX::TEX_2D_U32_F32_GRAD_RR:
+ return NVPTX::TEX_2D_U32_F32_GRAD_RI;
+ case NVPTX::TEX_2D_U32_F32_GRAD_IR:
+ return NVPTX::TEX_2D_U32_F32_GRAD_II;
+ case NVPTX::TEX_2D_ARRAY_F32_S32_RR:
+ return NVPTX::TEX_2D_ARRAY_F32_S32_RI;
+ case NVPTX::TEX_2D_ARRAY_F32_S32_IR:
+ return NVPTX::TEX_2D_ARRAY_F32_S32_II;
+ case NVPTX::TEX_2D_ARRAY_F32_F32_RR:
+ return NVPTX::TEX_2D_ARRAY_F32_F32_RI;
+ case NVPTX::TEX_2D_ARRAY_F32_F32_IR:
+ return NVPTX::TEX_2D_ARRAY_F32_F32_II;
+ case NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR:
+ return NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RI;
+ case NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_IR:
+ return NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_II;
+ case NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR:
+ return NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RI;
+ case NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_IR:
+ return NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_II;
+ case NVPTX::TEX_2D_ARRAY_S32_S32_RR:
+ return NVPTX::TEX_2D_ARRAY_S32_S32_RI;
+ case NVPTX::TEX_2D_ARRAY_S32_S32_IR:
+ return NVPTX::TEX_2D_ARRAY_S32_S32_II;
+ case NVPTX::TEX_2D_ARRAY_S32_F32_RR:
+ return NVPTX::TEX_2D_ARRAY_S32_F32_RI;
+ case NVPTX::TEX_2D_ARRAY_S32_F32_IR:
+ return NVPTX::TEX_2D_ARRAY_S32_F32_II;
+ case NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR:
+ return NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RI;
+ case NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_IR:
+ return NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_II;
+ case NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR:
+ return NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RI;
+ case NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_IR:
+ return NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_II;
+ case NVPTX::TEX_2D_ARRAY_U32_S32_RR:
+ return NVPTX::TEX_2D_ARRAY_U32_S32_RI;
+ case NVPTX::TEX_2D_ARRAY_U32_S32_IR:
+ return NVPTX::TEX_2D_ARRAY_U32_S32_II;
+ case NVPTX::TEX_2D_ARRAY_U32_F32_RR:
+ return NVPTX::TEX_2D_ARRAY_U32_F32_RI;
+ case NVPTX::TEX_2D_ARRAY_U32_F32_IR:
+ return NVPTX::TEX_2D_ARRAY_U32_F32_II;
+ case NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR:
+ return NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RI;
+ case NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_IR:
+ return NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_II;
+ case NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR:
+ return NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RI;
+ case NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_IR:
+ return NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_II;
+ case NVPTX::TEX_3D_F32_S32_RR:
+ return NVPTX::TEX_3D_F32_S32_RI;
+ case NVPTX::TEX_3D_F32_S32_IR:
+ return NVPTX::TEX_3D_F32_S32_II;
+ case NVPTX::TEX_3D_F32_F32_RR:
+ return NVPTX::TEX_3D_F32_F32_RI;
+ case NVPTX::TEX_3D_F32_F32_IR:
+ return NVPTX::TEX_3D_F32_F32_II;
+ case NVPTX::TEX_3D_F32_F32_LEVEL_RR:
+ return NVPTX::TEX_3D_F32_F32_LEVEL_RI;
+ case NVPTX::TEX_3D_F32_F32_LEVEL_IR:
+ return NVPTX::TEX_3D_F32_F32_LEVEL_II;
+ case NVPTX::TEX_3D_F32_F32_GRAD_RR:
+ return NVPTX::TEX_3D_F32_F32_GRAD_RI;
+ case NVPTX::TEX_3D_F32_F32_GRAD_IR:
+ return NVPTX::TEX_3D_F32_F32_GRAD_II;
+ case NVPTX::TEX_3D_S32_S32_RR:
+ return NVPTX::TEX_3D_S32_S32_RI;
+ case NVPTX::TEX_3D_S32_S32_IR:
+ return NVPTX::TEX_3D_S32_S32_II;
+ case NVPTX::TEX_3D_S32_F32_RR:
+ return NVPTX::TEX_3D_S32_F32_RI;
+ case NVPTX::TEX_3D_S32_F32_IR:
+ return NVPTX::TEX_3D_S32_F32_II;
+ case NVPTX::TEX_3D_S32_F32_LEVEL_RR:
+ return NVPTX::TEX_3D_S32_F32_LEVEL_RI;
+ case NVPTX::TEX_3D_S32_F32_LEVEL_IR:
+ return NVPTX::TEX_3D_S32_F32_LEVEL_II;
+ case NVPTX::TEX_3D_S32_F32_GRAD_RR:
+ return NVPTX::TEX_3D_S32_F32_GRAD_RI;
+ case NVPTX::TEX_3D_S32_F32_GRAD_IR:
+ return NVPTX::TEX_3D_S32_F32_GRAD_II;
+ case NVPTX::TEX_3D_U32_S32_RR:
+ return NVPTX::TEX_3D_U32_S32_RI;
+ case NVPTX::TEX_3D_U32_S32_IR:
+ return NVPTX::TEX_3D_U32_S32_II;
+ case NVPTX::TEX_3D_U32_F32_RR:
+ return NVPTX::TEX_3D_U32_F32_RI;
+ case NVPTX::TEX_3D_U32_F32_IR:
+ return NVPTX::TEX_3D_U32_F32_II;
+ case NVPTX::TEX_3D_U32_F32_LEVEL_RR:
+ return NVPTX::TEX_3D_U32_F32_LEVEL_RI;
+ case NVPTX::TEX_3D_U32_F32_LEVEL_IR:
+ return NVPTX::TEX_3D_U32_F32_LEVEL_II;
+ case NVPTX::TEX_3D_U32_F32_GRAD_RR:
+ return NVPTX::TEX_3D_U32_F32_GRAD_RI;
+ case NVPTX::TEX_3D_U32_F32_GRAD_IR:
+ return NVPTX::TEX_3D_U32_F32_GRAD_II;
+ case NVPTX::TEX_CUBE_F32_F32_RR:
+ return NVPTX::TEX_CUBE_F32_F32_RI;
+ case NVPTX::TEX_CUBE_F32_F32_IR:
+ return NVPTX::TEX_CUBE_F32_F32_II;
+ case NVPTX::TEX_CUBE_F32_F32_LEVEL_RR:
+ return NVPTX::TEX_CUBE_F32_F32_LEVEL_RI;
+ case NVPTX::TEX_CUBE_F32_F32_LEVEL_IR:
+ return NVPTX::TEX_CUBE_F32_F32_LEVEL_II;
+ case NVPTX::TEX_CUBE_S32_F32_RR:
+ return NVPTX::TEX_CUBE_S32_F32_RI;
+ case NVPTX::TEX_CUBE_S32_F32_IR:
+ return NVPTX::TEX_CUBE_S32_F32_II;
+ case NVPTX::TEX_CUBE_S32_F32_LEVEL_RR:
+ return NVPTX::TEX_CUBE_S32_F32_LEVEL_RI;
+ case NVPTX::TEX_CUBE_S32_F32_LEVEL_IR:
+ return NVPTX::TEX_CUBE_S32_F32_LEVEL_II;
+ case NVPTX::TEX_CUBE_U32_F32_RR:
+ return NVPTX::TEX_CUBE_U32_F32_RI;
+ case NVPTX::TEX_CUBE_U32_F32_IR:
+ return NVPTX::TEX_CUBE_U32_F32_II;
+ case NVPTX::TEX_CUBE_U32_F32_LEVEL_RR:
+ return NVPTX::TEX_CUBE_U32_F32_LEVEL_RI;
+ case NVPTX::TEX_CUBE_U32_F32_LEVEL_IR:
+ return NVPTX::TEX_CUBE_U32_F32_LEVEL_II;
+ case NVPTX::TEX_CUBE_ARRAY_F32_F32_RR:
+ return NVPTX::TEX_CUBE_ARRAY_F32_F32_RI;
+ case NVPTX::TEX_CUBE_ARRAY_F32_F32_IR:
+ return NVPTX::TEX_CUBE_ARRAY_F32_F32_II;
+ case NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR:
+ return NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RI;
+ case NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_IR:
+ return NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_II;
+ case NVPTX::TEX_CUBE_ARRAY_S32_F32_RR:
+ return NVPTX::TEX_CUBE_ARRAY_S32_F32_RI;
+ case NVPTX::TEX_CUBE_ARRAY_S32_F32_IR:
+ return NVPTX::TEX_CUBE_ARRAY_S32_F32_II;
+ case NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR:
+ return NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RI;
+ case NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_IR:
+ return NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_II;
+ case NVPTX::TEX_CUBE_ARRAY_U32_F32_RR:
+ return NVPTX::TEX_CUBE_ARRAY_U32_F32_RI;
+ case NVPTX::TEX_CUBE_ARRAY_U32_F32_IR:
+ return NVPTX::TEX_CUBE_ARRAY_U32_F32_II;
+ case NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR:
+ return NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RI;
+ case NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_IR:
+ return NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_II;
+ case NVPTX::TLD4_R_2D_F32_F32_RR:
+ return NVPTX::TLD4_R_2D_F32_F32_RI;
+ case NVPTX::TLD4_R_2D_F32_F32_IR:
+ return NVPTX::TLD4_R_2D_F32_F32_II;
+ case NVPTX::TLD4_G_2D_F32_F32_RR:
+ return NVPTX::TLD4_G_2D_F32_F32_RI;
+ case NVPTX::TLD4_G_2D_F32_F32_IR:
+ return NVPTX::TLD4_G_2D_F32_F32_II;
+ case NVPTX::TLD4_B_2D_F32_F32_RR:
+ return NVPTX::TLD4_B_2D_F32_F32_RI;
+ case NVPTX::TLD4_B_2D_F32_F32_IR:
+ return NVPTX::TLD4_B_2D_F32_F32_II;
+ case NVPTX::TLD4_A_2D_F32_F32_RR:
+ return NVPTX::TLD4_A_2D_F32_F32_RI;
+ case NVPTX::TLD4_A_2D_F32_F32_IR:
+ return NVPTX::TLD4_A_2D_F32_F32_II;
+ case NVPTX::TLD4_R_2D_S32_F32_RR:
+ return NVPTX::TLD4_R_2D_S32_F32_RI;
+ case NVPTX::TLD4_R_2D_S32_F32_IR:
+ return NVPTX::TLD4_R_2D_S32_F32_II;
+ case NVPTX::TLD4_G_2D_S32_F32_RR:
+ return NVPTX::TLD4_G_2D_S32_F32_RI;
+ case NVPTX::TLD4_G_2D_S32_F32_IR:
+ return NVPTX::TLD4_G_2D_S32_F32_II;
+ case NVPTX::TLD4_B_2D_S32_F32_RR:
+ return NVPTX::TLD4_B_2D_S32_F32_RI;
+ case NVPTX::TLD4_B_2D_S32_F32_IR:
+ return NVPTX::TLD4_B_2D_S32_F32_II;
+ case NVPTX::TLD4_A_2D_S32_F32_RR:
+ return NVPTX::TLD4_A_2D_S32_F32_RI;
+ case NVPTX::TLD4_A_2D_S32_F32_IR:
+ return NVPTX::TLD4_A_2D_S32_F32_II;
+ case NVPTX::TLD4_R_2D_U32_F32_RR:
+ return NVPTX::TLD4_R_2D_U32_F32_RI;
+ case NVPTX::TLD4_R_2D_U32_F32_IR:
+ return NVPTX::TLD4_R_2D_U32_F32_II;
+ case NVPTX::TLD4_G_2D_U32_F32_RR:
+ return NVPTX::TLD4_G_2D_U32_F32_RI;
+ case NVPTX::TLD4_G_2D_U32_F32_IR:
+ return NVPTX::TLD4_G_2D_U32_F32_II;
+ case NVPTX::TLD4_B_2D_U32_F32_RR:
+ return NVPTX::TLD4_B_2D_U32_F32_RI;
+ case NVPTX::TLD4_B_2D_U32_F32_IR:
+ return NVPTX::TLD4_B_2D_U32_F32_II;
+ case NVPTX::TLD4_A_2D_U32_F32_RR:
+ return NVPTX::TLD4_A_2D_U32_F32_RI;
+ case NVPTX::TLD4_A_2D_U32_F32_IR:
+ return NVPTX::TLD4_A_2D_U32_F32_II;
+ default:
+ llvm_unreachable("Unhandled TEX opcode");
+ }
+}
+
+static unsigned queryRegisterToIndexOpcode(unsigned RegOC) {
+ switch (RegOC) {
+ case NVPTX::TXQ_CHANNEL_ORDER_R:
+ return NVPTX::TXQ_CHANNEL_ORDER_I;
+ case NVPTX::TXQ_CHANNEL_DATA_TYPE_R:
+ return NVPTX::TXQ_CHANNEL_DATA_TYPE_I;
+ case NVPTX::TXQ_WIDTH_R:
+ return NVPTX::TXQ_WIDTH_I;
+ case NVPTX::TXQ_HEIGHT_R:
+ return NVPTX::TXQ_HEIGHT_I;
+ case NVPTX::TXQ_DEPTH_R:
+ return NVPTX::TXQ_DEPTH_I;
+ case NVPTX::TXQ_ARRAY_SIZE_R:
+ return NVPTX::TXQ_ARRAY_SIZE_I;
+ case NVPTX::TXQ_NUM_SAMPLES_R:
+ return NVPTX::TXQ_NUM_SAMPLES_I;
+ case NVPTX::TXQ_NUM_MIPMAP_LEVELS_R:
+ return NVPTX::TXQ_NUM_MIPMAP_LEVELS_I;
+ case NVPTX::SUQ_CHANNEL_ORDER_R:
+ return NVPTX::SUQ_CHANNEL_ORDER_I;
+ case NVPTX::SUQ_CHANNEL_DATA_TYPE_R:
+ return NVPTX::SUQ_CHANNEL_DATA_TYPE_I;
+ case NVPTX::SUQ_WIDTH_R:
+ return NVPTX::SUQ_WIDTH_I;
+ case NVPTX::SUQ_HEIGHT_R:
+ return NVPTX::SUQ_HEIGHT_I;
+ case NVPTX::SUQ_DEPTH_R:
+ return NVPTX::SUQ_DEPTH_I;
+ case NVPTX::SUQ_ARRAY_SIZE_R:
+ return NVPTX::SUQ_ARRAY_SIZE_I;
+ default:
+ llvm_unreachable("Unhandled TXQ/SUQ opcode");
+ }
+}
+
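(Aside, not part of the patch: a note on how the opcode suffixes in the tables above compose. For the non-unified forms, the two trailing letters name the texref and samplerref operand kinds, R for register and I for immediate index; texRegisterToIndexOpcode flips the first letter and samplerRegisterToIndexOpcode flips the second, while the TEX_UNIFIED forms carry a single letter for the texref only. A minimal sketch using opcodes that appear in the tables:

    // Illustrative only; both helpers are the static functions defined above.
    unsigned OC = NVPTX::TEX_2D_F32_F32_RR;  // texref and samplerref in registers
    OC = texRegisterToIndexOpcode(OC);       // -> NVPTX::TEX_2D_F32_F32_IR
    OC = samplerRegisterToIndexOpcode(OC);   // -> NVPTX::TEX_2D_F32_F32_II

Folding both handles to immediates therefore walks RR -> IR -> II, which is what processInstr below does one operand at a time.)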
bool NVPTXReplaceImageHandles::processInstr(MachineInstr &MI) {
MachineFunction &MF = *MI.getParent()->getParent();
const MCInstrDesc &MCID = MI.getDesc();
+ const NVPTXInstrInfo *TII = MF.getSubtarget<NVPTXSubtarget>().getInstrInfo();
if (MCID.TSFlags & NVPTXII::IsTexFlag) {
// This is a texture fetch, so operand 4 is a texref and operand 5 is
// a samplerref
MachineOperand &TexHandle = MI.getOperand(4);
- replaceImageHandle(TexHandle, MF);
+ if (replaceImageHandle(TexHandle, MF))
+ MI.setDesc(TII->get(texRegisterToIndexOpcode(MI.getOpcode())));
if (!(MCID.TSFlags & NVPTXII::IsTexModeUnifiedFlag)) {
MachineOperand &SampHandle = MI.getOperand(5);
- replaceImageHandle(SampHandle, MF);
+ if (replaceImageHandle(SampHandle, MF))
+ MI.setDesc(TII->get(samplerRegisterToIndexOpcode(MI.getOpcode())));
}
return true;
@@ -99,21 +1755,24 @@ bool NVPTXReplaceImageHandles::processInstr(MachineInstr &MI) {
// For a surface load of vector size N, the Nth operand will be the surfref
MachineOperand &SurfHandle = MI.getOperand(VecSize);
- replaceImageHandle(SurfHandle, MF);
+ if (replaceImageHandle(SurfHandle, MF))
+ MI.setDesc(TII->get(suldRegisterToIndexOpcode(MI.getOpcode())));
return true;
} else if (MCID.TSFlags & NVPTXII::IsSustFlag) {
// This is a surface store, so operand 0 is a surfref
MachineOperand &SurfHandle = MI.getOperand(0);
- replaceImageHandle(SurfHandle, MF);
+ if (replaceImageHandle(SurfHandle, MF))
+ MI.setDesc(TII->get(sustRegisterToIndexOpcode(MI.getOpcode())));
return true;
} else if (MCID.TSFlags & NVPTXII::IsSurfTexQueryFlag) {
// This is a query, so operand 1 is a surfref/texref
MachineOperand &Handle = MI.getOperand(1);
- replaceImageHandle(Handle, MF);
+ if (replaceImageHandle(Handle, MF))
+ MI.setDesc(TII->get(queryRegisterToIndexOpcode(MI.getOpcode())));
return true;
}
@@ -121,12 +1780,14 @@ bool NVPTXReplaceImageHandles::processInstr(MachineInstr &MI) {
return false;
}
-void NVPTXReplaceImageHandles::
-replaceImageHandle(MachineOperand &Op, MachineFunction &MF) {
+bool NVPTXReplaceImageHandles::replaceImageHandle(MachineOperand &Op,
+ MachineFunction &MF) {
unsigned Idx;
if (findIndexForHandle(Op, MF, Idx)) {
Op.ChangeToImmediate(Idx);
+ return true;
}
+ return false;
}
bool NVPTXReplaceImageHandles::
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index e3515f35d022..0a1c61a35795 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -23,11 +23,12 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
@@ -225,7 +226,7 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
});
PB.registerPipelineStartEPCallback(
- [this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) {
+ [this](ModulePassManager &PM, OptimizationLevel Level) {
FunctionPassManager FPM;
FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
// FIXME: NVVMIntrRangePass is causing numerical discrepancies,
@@ -240,6 +241,25 @@ NVPTXTargetMachine::getTargetTransformInfo(const Function &F) {
return TargetTransformInfo(NVPTXTTIImpl(this, F));
}
+std::pair<const Value *, unsigned>
+NVPTXTargetMachine::getPredicatedAddrSpace(const Value *V) const {
+ if (auto *II = dyn_cast<IntrinsicInst>(V)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::nvvm_isspacep_const:
+ return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_CONST);
+ case Intrinsic::nvvm_isspacep_global:
+ return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_GLOBAL);
+ case Intrinsic::nvvm_isspacep_local:
+ return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_LOCAL);
+ case Intrinsic::nvvm_isspacep_shared:
+ return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_SHARED);
+ default:
+ break;
+ }
+ }
+ return std::make_pair(nullptr, -1);
+}
+
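(Aside, not part of the patch: a hedged sketch of how a caller might consume getPredicatedAddrSpace. Everything below except the hook itself and the llvm.nvvm.isspacep.* intrinsics is a hypothetical name used for illustration.

    // If a dominating condition turns out to be one of the isspacep checks,
    // the pointer it tests can be treated as living in the returned address
    // space. TM, Cond and refineAddrSpace are assumed names.
    std::pair<const Value *, unsigned> P = TM.getPredicatedAddrSpace(Cond);
    if (P.first)                          // Cond was a recognized isspacep call
      refineAddrSpace(P.first, P.second); // hypothetical helper

On the failure path the hook returns {nullptr, -1}, so a caller only needs to test the pointer half of the pair.)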
void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
if (getOptLevel() == CodeGenOpt::Aggressive)
addPass(createGVNPass());
@@ -328,6 +348,7 @@ void NVPTXPassConfig::addIRPasses() {
addEarlyCSEOrGVNPass();
if (!DisableLoadStoreVectorizer)
addPass(createLoadStoreVectorizerPass());
+ addPass(createSROAPass());
}
}
@@ -350,7 +371,7 @@ void NVPTXPassConfig::addPreRegAlloc() {
}
void NVPTXPassConfig::addPostRegAlloc() {
- addPass(createNVPTXPrologEpilogPass(), false);
+ addPass(createNVPTXPrologEpilogPass());
if (getOptLevel() != CodeGenOpt::None) {
// NVPTXPrologEpilogPass calculates frame object offset and replace frame
// index with VRFrame register. NVPTXPeephole need to be run after that and
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
index 39647eb65c0c..7a69197abcff 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -16,6 +16,7 @@
#include "ManagedStringPool.h"
#include "NVPTXSubtarget.h"
#include "llvm/Target/TargetMachine.h"
+#include <utility>
namespace llvm {
@@ -69,6 +70,9 @@ public:
bool isMachineVerifierClean() const override {
return false;
}
+
+ std::pair<const Value *, unsigned>
+ getPredicatedAddrSpace(const Value *V) const override;
}; // NVPTXTargetMachine.
class NVPTXTargetMachine32 : public NVPTXTargetMachine {
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 20bd227b4b16..466aa7130216 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -328,7 +328,7 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
// Simplify to target-generic intrinsic.
if (Action.IID) {
- SmallVector<Value *, 4> Args(II->arg_operands());
+ SmallVector<Value *, 4> Args(II->args());
// All the target-generic intrinsics currently of interest to us have one
// type argument, equal to that of the nvvm intrinsic's argument.
Type *Tys[] = {II->getArgOperand(0)->getType()};
@@ -402,8 +402,9 @@ InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
}
void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP) {
- BaseT::getUnrollingPreferences(L, SE, UP);
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE) {
+ BaseT::getUnrollingPreferences(L, SE, UP, ORE);
// Enable partial unrolling and runtime unrolling, but reduce the
// threshold. This partially unrolls small loops which are often
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index d5a52d42ca00..bf593af68f33 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -48,6 +48,11 @@ public:
return AddressSpace::ADDRESS_SPACE_GENERIC;
}
+ bool canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const {
+ return AS != AddressSpace::ADDRESS_SPACE_SHARED &&
+ AS != AddressSpace::ADDRESS_SPACE_LOCAL && AS != ADDRESS_SPACE_PARAM;
+ }
+
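(Aside, not part of the patch: this encodes the NVPTX constraint that the shared, local and param address spaces cannot hold globals with real initializers. A hedged sketch of how an IR transform might consult the hook; TTI, AS, Ty and NewGV are assumed names for illustration.

    Constant *Init;
    if (TTI.canHaveNonUndefGlobalInitializerInAddressSpace(AS))
      Init = Constant::getNullValue(Ty); // a real (zero) initializer is allowed
    else
      Init = UndefValue::get(Ty);        // shared/local/param: leave it undef
    NewGV->setInitializer(Init);

Per the predicate above, only address spaces other than shared, local and param accept a non-undef initializer.)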
Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) const;
@@ -89,8 +94,7 @@ public:
unsigned getInliningThresholdMultiplier() { return 5; }
InstructionCost getArithmeticInstrCost(
- unsigned Opcode, Type *Ty,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -99,7 +103,8 @@ public:
const Instruction *CxtI = nullptr);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP);
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE);
void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP);
diff --git a/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp b/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
index 235be9c0dbbb..e4f0a517599f 100644
--- a/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
+++ b/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/NVPTXTargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
Target &llvm::getTheNVPTXTarget32() {
diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index 7631bb4bccfb..9e181d4052d6 100644
--- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -23,8 +23,8 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 94416fc584b5..5a12c3f22dee 100644
--- a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -12,8 +12,8 @@
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Endian.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp b/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp
index 22731bbd0f82..6b16af293244 100644
--- a/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp
+++ b/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp
@@ -80,7 +80,7 @@ bool PPCCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
void PPCIncomingValueHandler::assignValueToReg(Register ValVReg,
Register PhysReg,
- CCValAssign &VA) {
+ CCValAssign VA) {
markPhysRegUsed(PhysReg);
IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
}
diff --git a/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h b/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h
index b045032bec06..cc2cb7b26e84 100644
--- a/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h
+++ b/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h
@@ -46,7 +46,7 @@ public:
private:
void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign &VA) override;
+ CCValAssign VA) override;
void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
MachinePointerInfo &MPO, CCValAssign &VA) override;
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 28294b4c00ed..9df94edc8cdf 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -20,8 +20,8 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
@@ -196,7 +196,8 @@ public:
llvm_unreachable("relaxInstruction() unimplemented");
}
- bool writeNopData(raw_ostream &OS, uint64_t Count) const override {
+ bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const override {
uint64_t NumNops = Count / 4;
for (uint64_t i = 0; i != NumNops; ++i)
support::endian::write<uint32_t>(OS, 0x60000000, Endian);
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
index 386d59266096..0ca8587ba483 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
@@ -1,9 +1,8 @@
//===-------- PPCELFStreamer.cpp - ELF Object Output ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -57,7 +56,7 @@ void PPCELFStreamer::emitPrefixedInstruction(const MCInst &Inst,
// all of the nops required as part of the alignment operation. In the case
// where no nops are added, the fragment is still created but it remains
// empty.
- emitCodeAlignment(64, 4);
+ emitCodeAlignment(64, &STI, 4);
// Emit the instruction.
// Since the previous emit created a new fragment, adding this instruction
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h
index f44200104f32..b3e12413eacf 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h
@@ -1,9 +1,8 @@
//===- PPCELFStreamer.h - ELF Object Output --------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
index 3f6497aa0e8f..67c02c17bc46 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
@@ -158,7 +158,10 @@ void PPCInstPrinter::printInst(const MCInst *MI, uint64_t Address,
// dcbt ra, rb, th [server]
// dcbt th, ra, rb [embedded]
// where th can be omitted when it is 0. dcbtst is the same.
- if (MI->getOpcode() == PPC::DCBT || MI->getOpcode() == PPC::DCBTST) {
+ // On AIX, only emit the extended mnemonics for dcbt and dcbtst if
+ // the "modern assembler" is available.
+ if ((MI->getOpcode() == PPC::DCBT || MI->getOpcode() == PPC::DCBTST) &&
+ (!TT.isOSAIX() || STI.getFeatureBits()[PPC::FeatureModernAIXAs])) {
unsigned char TH = MI->getOperand(0).getImm();
O << "\tdcbt";
if (MI->getOpcode() == PPC::DCBTST)
@@ -628,8 +631,6 @@ const char *PPCInstPrinter::getVerboseConditionRegName(unsigned RegNum,
// showRegistersWithPrefix - This method determines whether registers
// should be number-only or include the prefix.
bool PPCInstPrinter::showRegistersWithPrefix() const {
- if (TT.getOS() == Triple::AIX)
- return false;
return FullRegNamesWithPercent || FullRegNames;
}
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index e9fc056a08f0..22b948a83c34 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -37,11 +37,11 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCSymbolXCOFF.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.cpp
index e582ddfca323..79db03b0331b 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.cpp
@@ -1,9 +1,8 @@
//===-------- PPCXCOFFStreamer.cpp - XCOFF Object Output ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -27,7 +26,7 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSectionXCOFF.h"
#include "llvm/MC/MCSymbolXCOFF.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
@@ -46,7 +45,7 @@ void PPCXCOFFStreamer::emitPrefixedInstruction(const MCInst &Inst,
// prefixed instruction. Align to 64 bytes if possible but add a maximum of 4
// bytes when trying to do that. If alignment requires adding more than 4
// bytes then the instruction won't be aligned.
- emitCodeAlignment(64, 4);
+ emitCodeAlignment(64, &STI, 4);
// Emit the instruction.
// Since the previous emit created a new fragment then adding this instruction
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.h
index f6eb5edfb7a7..5fa35127b70b 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFStreamer.h
@@ -1,9 +1,8 @@
//===- PPCXCOFFStreamer.h - XCOFF Object Output -----------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/llvm/lib/Target/PowerPC/P10InstrResources.td b/llvm/lib/Target/PowerPC/P10InstrResources.td
new file mode 100644
index 000000000000..f43ba00ec373
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/P10InstrResources.td
@@ -0,0 +1,2075 @@
+//===--- P10InstrResources.td - P10 Scheduling Definitions -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Automatically generated file, do not edit!
+//
+// This file defines the itinerary class data for the POWER10 processor.
+//
+//===----------------------------------------------------------------------===//
+// 22 Cycles Binary Floating Point operations, 2 input operands
+def : InstRW<[P10W_BF_22C, P10W_DISP_ANY, P10BF_Read, P10BF_Read],
+ (instrs
+ FDIVS,
+ XSDIVSP
+)>;
+
+// 2-way crack instructions
+// 22 Cycles Binary Floating Point operations, and 3 Cycles ALU operations, 2 input operands
+def : InstRW<[P10W_BF_22C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY],
+ (instrs
+ FDIVS_rec
+)>;
+
+// 24 Cycles Binary Floating Point operations, 2 input operands
+def : InstRW<[P10W_BF_24C, P10W_DISP_ANY, P10BF_Read, P10BF_Read],
+ (instrs
+ XVDIVSP
+)>;
+
+// 26 Cycles Binary Floating Point operations, 1 input operands
+def : InstRW<[P10W_BF_26C, P10W_DISP_ANY, P10BF_Read],
+ (instrs
+ FSQRTS,
+ XSSQRTSP
+)>;
+
+// 2-way crack instructions
+// 26 Cycles Binary Floating Point operations, and 3 Cycles ALU operations, 1 input operands
+def : InstRW<[P10W_BF_26C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY],
+ (instrs
+ FSQRTS_rec
+)>;
+
+// 27 Cycles Binary Floating Point operations, 1 input operands
+def : InstRW<[P10W_BF_27C, P10W_DISP_ANY, P10BF_Read],
+ (instrs
+ XVSQRTSP
+)>;
+
+// 27 Cycles Binary Floating Point operations, 2 input operands
+def : InstRW<[P10W_BF_27C, P10W_DISP_ANY, P10BF_Read, P10BF_Read],
+ (instrs
+ FDIV,
+ XSDIVDP,
+ XVDIVDP
+)>;
+
+// 2-way crack instructions
+// 27 Cycles Binary Floating Point operations, and 3 Cycles ALU operations, 2 input operands
+def : InstRW<[P10W_BF_27C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY],
+ (instrs
+ FDIV_rec
+)>;
+
+// 36 Cycles Binary Floating Point operations, 1 input operands
+def : InstRW<[P10W_BF_36C, P10W_DISP_ANY, P10BF_Read],
+ (instrs
+ FSQRT,
+ XSSQRTDP,
+ XVSQRTDP
+)>;
+
+// 2-way crack instructions
+// 36 Cycles Binary Floating Point operations, and 3 Cycles ALU operations, 1 input operands
+def : InstRW<[P10W_BF_36C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY],
+ (instrs
+ FSQRT_rec
+)>;
+
+// 7 Cycles Binary Floating Point operations, 1 input operands
+def : InstRW<[P10W_BF_7C, P10W_DISP_ANY, P10BF_Read],
+ (instrs
+ FCFID,
+ FCFIDS,
+ FCFIDU,
+ FCFIDUS,
+ FCTID,
+ FCTIDU,
+ FCTIDUZ,
+ FCTIDZ,
+ FCTIW,
+ FCTIWU,
+ FCTIWUZ,
+ FCTIWZ,
+ FRE,
+ FRES,
+ FRIMD, FRIMS,
+ FRIND, FRINS,
+ FRIPD, FRIPS,
+ FRIZD, FRIZS,
+ FRSP,
+ FRSQRTE,
+ FRSQRTES,
+ VCFSX, VCFSX_0,
+ VCFUX, VCFUX_0,
+ VCTSXS, VCTSXS_0,
+ VCTUXS, VCTUXS_0,
+ VLOGEFP,
+ VREFP,
+ VRFIM,
+ VRFIN,
+ VRFIP,
+ VRFIZ,
+ VRSQRTEFP,
+ XSCVDPHP,
+ XSCVDPSP,
+ XSCVDPSPN,
+ XSCVDPSXDS, XSCVDPSXDSs,
+ XSCVDPSXWS, XSCVDPSXWSs,
+ XSCVDPUXDS, XSCVDPUXDSs,
+ XSCVDPUXWS, XSCVDPUXWSs,
+ XSCVSPDP,
+ XSCVSXDDP,
+ XSCVSXDSP,
+ XSCVUXDDP,
+ XSCVUXDSP,
+ XSRDPI,
+ XSRDPIC,
+ XSRDPIM,
+ XSRDPIP,
+ XSRDPIZ,
+ XSREDP,
+ XSRESP,
+ XSRSP,
+ XSRSQRTEDP,
+ XSRSQRTESP,
+ XVCVDPSP,
+ XVCVDPSXDS,
+ XVCVDPSXWS,
+ XVCVDPUXDS,
+ XVCVDPUXWS,
+ XVCVSPBF16,
+ XVCVSPDP,
+ XVCVSPHP,
+ XVCVSPSXDS,
+ XVCVSPSXWS,
+ XVCVSPUXDS,
+ XVCVSPUXWS,
+ XVCVSXDDP,
+ XVCVSXDSP,
+ XVCVSXWDP,
+ XVCVSXWSP,
+ XVCVUXDDP,
+ XVCVUXDSP,
+ XVCVUXWDP,
+ XVCVUXWSP,
+ XVRDPI,
+ XVRDPIC,
+ XVRDPIM,
+ XVRDPIP,
+ XVRDPIZ,
+ XVREDP,
+ XVRESP,
+ XVRSPI,
+ XVRSPIC,
+ XVRSPIM,
+ XVRSPIP,
+ XVRSPIZ,
+ XVRSQRTEDP,
+ XVRSQRTESP
+)>;
+
+// 7 Cycles Binary Floating Point operations, 2 input operands
+def : InstRW<[P10W_BF_7C, P10W_DISP_ANY, P10BF_Read, P10BF_Read],
+ (instrs
+ FADD,
+ FADDS,
+ FMUL,
+ FMULS,
+ FSUB,
+ FSUBS,
+ VADDFP,
+ VSUBFP,
+ XSADDDP,
+ XSADDSP,
+ XSMULDP,
+ XSMULSP,
+ XSSUBDP,
+ XSSUBSP,
+ XVADDDP,
+ XVADDSP,
+ XVMULDP,
+ XVMULSP,
+ XVSUBDP,
+ XVSUBSP
+)>;
+
+// 7 Cycles Binary Floating Point operations, 3 input operands
+def : InstRW<[P10W_BF_7C, P10W_DISP_ANY, P10BF_Read, P10BF_Read, P10BF_Read],
+ (instrs
+ FMADD,
+ FMADDS,
+ FMSUB,
+ FMSUBS,
+ FNMADD,
+ FNMADDS,
+ FNMSUB,
+ FNMSUBS,
+ FSELD, FSELS,
+ VMADDFP,
+ VNMSUBFP,
+ XSMADDADP,
+ XSMADDASP,
+ XSMADDMDP,
+ XSMADDMSP,
+ XSMSUBADP,
+ XSMSUBASP,
+ XSMSUBMDP,
+ XSMSUBMSP,
+ XSNMADDADP,
+ XSNMADDASP,
+ XSNMADDMDP,
+ XSNMADDMSP,
+ XSNMSUBADP,
+ XSNMSUBASP,
+ XSNMSUBMDP,
+ XSNMSUBMSP,
+ XVMADDADP,
+ XVMADDASP,
+ XVMADDMDP,
+ XVMADDMSP,
+ XVMSUBADP,
+ XVMSUBASP,
+ XVMSUBMDP,
+ XVMSUBMSP,
+ XVNMADDADP,
+ XVNMADDASP,
+ XVNMADDMDP,
+ XVNMADDMSP,
+ XVNMSUBADP,
+ XVNMSUBASP,
+ XVNMSUBMDP,
+ XVNMSUBMSP
+)>;
+
+// 2-way crack instructions
+// 7 Cycles Binary Floating Point operations, and 7 Cycles Binary Floating Point operations, 1 input operands
+def : InstRW<[P10W_BF_7C, P10W_DISP_EVEN, P10W_BF_7C, P10W_DISP_ANY, P10BF_Read],
+ (instrs
+ VEXPTEFP
+)>;
+
+// 2-way crack instructions
+// 7 Cycles Binary Floating Point operations, and 3 Cycles ALU operations, 2 input operands
+def : InstRW<[P10W_BF_7C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY],
+ (instrs
+ FADD_rec,
+ FADDS_rec,
+ FMUL_rec,
+ FMULS_rec,
+ FSUB_rec,
+ FSUBS_rec
+)>;
+
+// 2-way crack instructions
+// 7 Cycles Binary Floating Point operations, and 3 Cycles ALU operations, 1 input operands
+def : InstRW<[P10W_BF_7C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY],
+ (instrs
+ FCFID_rec,
+ FCFIDS_rec,
+ FCFIDU_rec,
+ FCFIDUS_rec,
+ FCTID_rec,
+ FCTIDU_rec,
+ FCTIDUZ_rec,
+ FCTIDZ_rec,
+ FCTIW_rec,
+ FCTIWU_rec,
+ FCTIWUZ_rec,
+ FCTIWZ_rec,
+ FRE_rec,
+ FRES_rec,
+ FRIMD_rec, FRIMS_rec,
+ FRIND_rec, FRINS_rec,
+ FRIPD_rec, FRIPS_rec,
+ FRIZD_rec, FRIZS_rec,
+ FRSP_rec,
+ FRSQRTE_rec,
+ FRSQRTES_rec
+)>;
+
+// 2-way crack instructions
+// 7 Cycles Binary Floating Point operations, and 3 Cycles ALU operations, 3 input operands
+def : InstRW<[P10W_BF_7C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY],
+ (instrs
+ FMADD_rec,
+ FMADDS_rec,
+ FMSUB_rec,
+ FMSUBS_rec,
+ FNMADD_rec,
+ FNMADDS_rec,
+ FNMSUB_rec,
+ FNMSUBS_rec,
+ FSELD_rec, FSELS_rec
+)>;
+
+// 2 Cycles Branch operations, 0 input operands
+def : InstRW<[P10W_BR_2C, P10W_DISP_ANY],
+ (instrs
+ BCLR, BCLRn, BDNZLR, BDNZLR8, BDNZLRm, BDNZLRp, BDZLR, BDZLR8, BDZLRm, BDZLRp, gBCLR,
+ BCLRL, BCLRLn, BDNZLRL, BDNZLRLm, BDNZLRLp, BDZLRL, BDZLRLm, BDZLRLp, gBCLRL,
+ BL, BL8, BL8_NOP, BL8_NOP_RM, BL8_NOP_TLS, BL8_NOTOC, BL8_NOTOC_RM, BL8_NOTOC_TLS, BL8_RM, BL8_TLS, BL8_TLS_, BLR, BLR8, BLRL, BL_NOP, BL_NOP_RM, BL_RM, BL_TLS
+)>;
+
+// 2 Cycles Branch operations, 1 input operand
+def : InstRW<[P10W_BR_2C, P10W_DISP_ANY, P10BR_Read],
+ (instrs
+ B, BCC, BCCA, BCCCTR, BCCCTR8, BCCCTRL, BCCCTRL8, BCCL, BCCLA, BCCLR, BCCLRL, CTRL_DEP, TAILB, TAILB8,
+ BA, TAILBA, TAILBA8,
+ BC, BCTR, BCTR8, BCTRL, BCTRL8, BCTRL8_LDinto_toc, BCTRL8_LDinto_toc_RM, BCTRL8_RM, BCTRL_LWZinto_toc, BCTRL_LWZinto_toc_RM, BCTRL_RM, BCn, BDNZ, BDNZ8, BDNZm, BDNZp, BDZ, BDZ8, BDZm, BDZp, TAILBCTR, TAILBCTR8, gBC, gBCat,
+ BCL, BCLalways, BCLn, BDNZL, BDNZLm, BDNZLp, BDZL, BDZLm, BDZLp, gBCL, gBCLat,
+ BLA, BLA8, BLA8_NOP, BLA8_NOP_RM, BLA8_RM, BLA_RM
+)>;
+
+// 2 Cycles Branch operations, 3 input operands
+def : InstRW<[P10W_BR_2C, P10W_DISP_ANY, P10BR_Read, P10BR_Read, P10BR_Read],
+ (instrs
+ BCCTR, BCCTR8, BCCTR8n, BCCTRn, gBCCTR,
+ BCCTRL, BCCTRL8, BCCTRL8n, BCCTRLn, gBCCTRL
+)>;
+
+// 2 Cycles Branch operations, 4 input operands
+def : InstRW<[P10W_BR_2C, P10W_DISP_ANY, P10BR_Read, P10BR_Read, P10BR_Read, P10BR_Read],
+ (instrs
+ BDNZA, BDNZAm, BDNZAp, BDZA, BDZAm, BDZAp, gBCA, gBCAat,
+ BDNZLA, BDNZLAm, BDNZLAp, BDZLA, BDZLAm, BDZLAp, gBCLA, gBCLAat
+)>;
+
+// 7 Cycles Crypto operations, 1 input operand
+def : InstRW<[P10W_CY_7C, P10W_DISP_ANY, P10CY_Read],
+ (instrs
+ VSBOX
+)>;
+
+// 7 Cycles Crypto operations, 2 input operands
+def : InstRW<[P10W_CY_7C, P10W_DISP_ANY, P10CY_Read, P10CY_Read],
+ (instrs
+ CFUGED,
+ CNTLZDM,
+ CNTTZDM,
+ PDEPD,
+ PEXTD,
+ VCFUGED,
+ VCIPHER,
+ VCIPHERLAST,
+ VCLZDM,
+ VCTZDM,
+ VGNB,
+ VNCIPHER,
+ VNCIPHERLAST,
+ VPDEPD,
+ VPEXTD,
+ VPMSUMB,
+ VPMSUMD,
+ VPMSUMH,
+ VPMSUMW
+)>;
+
+// 13 Cycles Decimal Floating Point operations, 1 input operand
+def : InstRW<[P10W_DF_13C, P10W_DISP_ANY, P10DF_Read],
+ (instrs
+ XSCVDPQP,
+ XSCVQPDP,
+ XSCVQPDPO,
+ XSCVQPSDZ,
+ XSCVQPSQZ,
+ XSCVQPSWZ,
+ XSCVQPUDZ,
+ XSCVQPUQZ,
+ XSCVQPUWZ,
+ XSCVSDQP,
+ XSCVSQQP,
+ XSCVUDQP,
+ XSCVUQQP
+)>;
+
+// 13 Cycles Decimal Floating Point operations, 2 input operands
+def : InstRW<[P10W_DF_13C, P10W_DISP_ANY, P10DF_Read, P10DF_Read],
+ (instrs
+ XSADDQP,
+ XSADDQPO,
+ XSSUBQP,
+ XSSUBQPO
+)>;
+
+// 13 Cycles Decimal Floating Point operations, 3 input operands
+def : InstRW<[P10W_DF_13C, P10W_DISP_ANY, P10DF_Read, P10DF_Read, P10DF_Read],
+ (instrs
+ BCDSR_rec,
+ XSRQPI,
+ XSRQPIX,
+ XSRQPXP
+)>;
+
+// 2-way crack instructions
+// 13 Cycles Decimal Floating Point operations, and 3 Cycles Store operations, 2 input operands
+def : InstRW<[P10W_DF_13C, P10W_DISP_EVEN, P10W_ST_3C, P10W_DISP_ANY],
+ (instrs
+ HASHST,
+ HASHSTP
+)>;
+
+// 24 Cycles Decimal Floating Point operations, 1 input operand
+def : InstRW<[P10W_DF_24C, P10W_DISP_ANY, P10DF_Read],
+ (instrs
+ BCDCTSQ_rec
+)>;
+
+// 25 Cycles Decimal Floating Point operations, 2 input operands
+def : InstRW<[P10W_DF_25C, P10W_DISP_ANY, P10DF_Read, P10DF_Read],
+ (instrs
+ XSMULQP,
+ XSMULQPO
+)>;
+
+// 25 Cycles Decimal Floating Point operations, 3 input operands
+def : InstRW<[P10W_DF_25C, P10W_DISP_ANY, P10DF_Read, P10DF_Read, P10DF_Read],
+ (instrs
+ XSMADDQP,
+ XSMADDQPO,
+ XSMSUBQP,
+ XSMSUBQPO,
+ XSNMADDQP,
+ XSNMADDQPO,
+ XSNMSUBQP,
+ XSNMSUBQPO
+)>;
+
+// 38 Cycles Decimal Floating Point operations, 2 input operands
+def : InstRW<[P10W_DF_38C, P10W_DISP_ANY, P10DF_Read, P10DF_Read],
+ (instrs
+ BCDCFSQ_rec
+)>;
+
+// 59 Cycles Decimal Floating Point operations, 2 input operands
+def : InstRW<[P10W_DF_59C, P10W_DISP_ANY, P10DF_Read, P10DF_Read],
+ (instrs
+ XSDIVQP,
+ XSDIVQPO
+)>;
+
+// 61 Cycles Decimal Floating Point operations, 2 input operands
+def : InstRW<[P10W_DF_61C, P10W_DISP_ANY, P10DF_Read, P10DF_Read],
+ (instrs
+ VDIVESQ,
+ VDIVEUQ,
+ VDIVSQ,
+ VDIVUQ
+)>;
+
+// 68 Cycles Decimal Floating Point operations, 2 input operands
+def : InstRW<[P10W_DF_68C, P10W_DISP_ANY, P10DF_Read, P10DF_Read],
+ (instrs
+ VMODSQ,
+ VMODUQ
+)>;
+
+// 77 Cycles Decimal Floating Point operations, 1 input operand
+def : InstRW<[P10W_DF_77C, P10W_DISP_ANY, P10DF_Read],
+ (instrs
+ XSSQRTQP,
+ XSSQRTQPO
+)>;
+
+// 20 Cycles Scalar Fixed-Point Divide operations, 2 input operands
+def : InstRW<[P10W_DV_20C, P10W_DISP_ANY, P10DV_Read, P10DV_Read],
+ (instrs
+ DIVW,
+ DIVWO,
+ DIVWU,
+ DIVWUO,
+ MODSW
+)>;
+
+// 2-way crack instructions
+// 20 Cycles Scalar Fixed-Point Divide operations, and 3 Cycles ALU operations, 2 input operands
+def : InstRW<[P10W_DV_20C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY],
+ (instrs
+ DIVW_rec,
+ DIVWO_rec,
+ DIVWU_rec,
+ DIVWUO_rec
+)>;
+
+// 25 Cycles Scalar Fixed-Point Divide operations, 2 input operands
+def : InstRW<[P10W_DV_25C, P10W_DISP_ANY, P10DV_Read, P10DV_Read],
+ (instrs
+ DIVD,
+ DIVDO,
+ DIVDU,
+ DIVDUO,
+ DIVWE,
+ DIVWEO,
+ DIVWEU,
+ DIVWEUO
+)>;
+
+// 2-way crack instructions
+// 25 Cycles Scalar Fixed-Point Divide operations, and 3 Cycles ALU operations, 2 input operands
+def : InstRW<[P10W_DV_25C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY],
+ (instrs
+ DIVD_rec,
+ DIVDO_rec,
+ DIVDU_rec,
+ DIVDUO_rec,
+ DIVWE_rec,
+ DIVWEO_rec,
+ DIVWEU_rec,
+ DIVWEUO_rec
+)>;
+
+// 27 Cycles Scalar Fixed-Point Divide operations, 2 input operands
+def : InstRW<[P10W_DV_27C, P10W_DISP_ANY, P10DV_Read, P10DV_Read],
+ (instrs
+ MODSD,
+ MODUD,
+ MODUW
+)>;
+
+// 41 Cycles Scalar Fixed-Point Divide operations, 2 input operands
+def : InstRW<[P10W_DV_41C, P10W_DISP_ANY, P10DV_Read, P10DV_Read],
+ (instrs
+ DIVDE,
+ DIVDEO,
+ DIVDEU,
+ DIVDEUO
+)>;
+
+// 2-way crack instructions
+// 41 Cycles Scalar Fixed-Point Divide operations, and 3 Cycles ALU operations, 2 input operands
+def : InstRW<[P10W_DV_41C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY],
+ (instrs
+ DIVDE_rec,
+ DIVDEO_rec,
+ DIVDEU_rec,
+ DIVDEUO_rec
+)>;
+
+// 43 Cycles Scalar Fixed-Point Divide operations, 2 input operands
+def : InstRW<[P10W_DV_43C, P10W_DISP_ANY, P10DV_Read, P10DV_Read],
+ (instrs
+ VDIVSD,
+ VDIVUD
+)>;
+
+// 47 Cycles Scalar Fixed-Point Divide operations, 2 input operands
+def : InstRW<[P10W_DV_47C, P10W_DISP_ANY, P10DV_Read, P10DV_Read],
+ (instrs
+ VMODSD,
+ VMODUD
+)>;
+
+// 54 Cycles Scalar Fixed-Point Divide operations, 2 input operands
+def : InstRW<[P10W_DV_54C, P10W_DISP_ANY, P10DV_Read, P10DV_Read],
+ (instrs
+ VDIVSW,
+ VDIVUW
+)>;
+
+// 60 Cycles Scalar Fixed-Point Divide operations, 2 input operands
+def : InstRW<[P10W_DV_60C, P10W_DISP_ANY, P10DV_Read, P10DV_Read],
+ (instrs
+ VMODSW,
+ VMODUW
+)>;
+
+// 75 Cycles Scalar Fixed-Point Divide operations, 2 input operands
+def : InstRW<[P10W_DV_75C, P10W_DISP_ANY, P10DV_Read, P10DV_Read],
+ (instrs
+ VDIVESD,
+ VDIVEUD
+)>;
+
+// 83 Cycles Scalar Fixed-Point Divide operations, 2 input operands
+def : InstRW<[P10W_DV_83C, P10W_DISP_ANY, P10DV_Read, P10DV_Read],
+ (instrs
+ VDIVESW,
+ VDIVEUW
+)>;
+
+// 5 Cycles Fixed-Point and BCD operations, 1 input operand
+def : InstRW<[P10W_DX_5C, P10W_DISP_ANY, P10DX_Read],
+ (instrs
+ BCDCTN_rec,
+ VMUL10CUQ,
+ VMUL10UQ,
+ XSXSIGQP
+)>;
+
+// 5 Cycles Fixed-Point and BCD operations, 2 input operands
+def : InstRW<[P10W_DX_5C, P10W_DISP_ANY, P10DX_Read, P10DX_Read],
+ (instrs
+ BCDCFN_rec,
+ BCDCFZ_rec,
+ BCDCPSGN_rec,
+ BCDCTZ_rec,
+ BCDSETSGN_rec,
+ BCDUS_rec,
+ BCDUTRUNC_rec,
+ VADDCUQ,
+ VADDUQM,
+ VMUL10ECUQ,
+ VMUL10EUQ,
+ VSUBCUQ,
+ VSUBUQM,
+ XSCMPEXPQP,
+ XSCMPOQP,
+ XSCMPUQP,
+ XSTSTDCQP,
+ XXGENPCVBM
+)>;
+
+// 5 Cycles Fixed-Point and BCD operations, 3 input operands
+def : InstRW<[P10W_DX_5C, P10W_DISP_ANY, P10DX_Read, P10DX_Read, P10DX_Read],
+ (instrs
+ BCDS_rec,
+ BCDTRUNC_rec,
+ VADDECUQ,
+ VADDEUQM,
+ VSUBECUQ,
+ VSUBEUQM
+)>;
+
+// 4 Cycles ALU2 operations, 0 input operands
+def : InstRW<[P10W_F2_4C, P10W_DISP_ANY],
+ (instrs
+ TRAP, TW
+)>;
+
+// 4 Cycles ALU2 operations, 1 input operand
+def : InstRW<[P10W_F2_4C, P10W_DISP_ANY, P10F2_Read],
+ (instrs
+ CNTLZD,
+ CNTLZD_rec,
+ CNTLZW, CNTLZW8,
+ CNTLZW8_rec, CNTLZW_rec,
+ CNTTZD,
+ CNTTZD_rec,
+ CNTTZW, CNTTZW8,
+ CNTTZW8_rec, CNTTZW_rec,
+ FTSQRT,
+ MTVSRBM,
+ MTVSRBMI,
+ MTVSRDM,
+ MTVSRHM,
+ MTVSRQM,
+ MTVSRWM,
+ POPCNTB, POPCNTB8,
+ POPCNTD,
+ POPCNTW,
+ VCLZB,
+ VCLZD,
+ VCLZH,
+ VCLZW,
+ VCTZB,
+ VCTZD,
+ VCTZH,
+ VCTZW,
+ VEXPANDBM,
+ VEXPANDDM,
+ VEXPANDHM,
+ VEXPANDQM,
+ VEXPANDWM,
+ VEXTRACTBM,
+ VEXTRACTDM,
+ VEXTRACTHM,
+ VEXTRACTQM,
+ VEXTRACTWM,
+ VPOPCNTB,
+ VPOPCNTD,
+ VPOPCNTH,
+ VPOPCNTW,
+ VPRTYBD,
+ VPRTYBW,
+ XSCVHPDP,
+ XSCVSPDPN,
+ XSTSQRTDP,
+ XVCVHPSP,
+ XVTLSBB,
+ XVTSQRTDP,
+ XVTSQRTSP
+)>;
+
+// 4 Cycles ALU2 operations, 2 input operands
+def : InstRW<[P10W_F2_4C, P10W_DISP_ANY, P10F2_Read, P10F2_Read],
+ (instrs
+ CMPEQB,
+ EXTSWSLI_32_64_rec, EXTSWSLI_rec,
+ FCMPOD, FCMPOS,
+ FCMPUD, FCMPUS,
+ FTDIV,
+ SLD_rec,
+ SLW8_rec, SLW_rec,
+ SRD_rec,
+ SRW8_rec, SRW_rec,
+ VABSDUB,
+ VABSDUH,
+ VABSDUW,
+ VADDCUW,
+ VADDSBS,
+ VADDSHS,
+ VADDSWS,
+ VADDUBS,
+ VADDUHS,
+ VADDUWS,
+ VAVGSB,
+ VAVGSH,
+ VAVGSW,
+ VAVGUB,
+ VAVGUH,
+ VAVGUW,
+ VCMPBFP,
+ VCMPBFP_rec,
+ VCMPEQFP,
+ VCMPEQFP_rec,
+ VCMPEQUB_rec,
+ VCMPEQUD_rec,
+ VCMPEQUH_rec,
+ VCMPEQUQ,
+ VCMPEQUQ_rec,
+ VCMPEQUW_rec,
+ VCMPGEFP,
+ VCMPGEFP_rec,
+ VCMPGTFP,
+ VCMPGTFP_rec,
+ VCMPGTSB_rec,
+ VCMPGTSD_rec,
+ VCMPGTSH_rec,
+ VCMPGTSQ,
+ VCMPGTSQ_rec,
+ VCMPGTSW_rec,
+ VCMPGTUB_rec,
+ VCMPGTUD_rec,
+ VCMPGTUH_rec,
+ VCMPGTUQ,
+ VCMPGTUQ_rec,
+ VCMPGTUW_rec,
+ VCMPNEB_rec,
+ VCMPNEH_rec,
+ VCMPNEW_rec,
+ VCMPNEZB_rec,
+ VCMPNEZH_rec,
+ VCMPNEZW_rec,
+ VCMPSQ,
+ VCMPUQ,
+ VCNTMBB,
+ VCNTMBD,
+ VCNTMBH,
+ VCNTMBW,
+ VMAXFP,
+ VMINFP,
+ VSUBCUW,
+ VSUBSBS,
+ VSUBSHS,
+ VSUBSWS,
+ VSUBUBS,
+ VSUBUHS,
+ VSUBUWS,
+ XSCMPEQDP,
+ XSCMPEXPDP,
+ XSCMPGEDP,
+ XSCMPGTDP,
+ XSCMPODP,
+ XSCMPUDP,
+ XSMAXCDP,
+ XSMAXDP,
+ XSMAXJDP,
+ XSMINCDP,
+ XSMINDP,
+ XSMINJDP,
+ XSTDIVDP,
+ XSTSTDCDP,
+ XSTSTDCSP,
+ XVCMPEQDP,
+ XVCMPEQDP_rec,
+ XVCMPEQSP,
+ XVCMPEQSP_rec,
+ XVCMPGEDP,
+ XVCMPGEDP_rec,
+ XVCMPGESP,
+ XVCMPGESP_rec,
+ XVCMPGTDP,
+ XVCMPGTDP_rec,
+ XVCMPGTSP,
+ XVCMPGTSP_rec,
+ XVMAXDP,
+ XVMAXSP,
+ XVMINDP,
+ XVMINSP,
+ XVTDIVDP,
+ XVTDIVSP,
+ XVTSTDCDP,
+ XVTSTDCSP
+)>;
+
+// 4 Cycles ALU2 operations, 3 input operands
+def : InstRW<[P10W_F2_4C, P10W_DISP_ANY, P10F2_Read, P10F2_Read, P10F2_Read],
+ (instrs
+ CMPRB, CMPRB8,
+ RLDCL_rec,
+ RLDCR_rec,
+ RLDIC_rec,
+ RLDICL_32_rec, RLDICL_rec,
+ RLDICR_rec,
+ TD,
+ TDI,
+ TWI,
+ VSHASIGMAD,
+ VSHASIGMAW
+)>;
+
+// 4 Cycles ALU2 operations, 4 input operands
+def : InstRW<[P10W_F2_4C, P10W_DISP_ANY, P10F2_Read, P10F2_Read, P10F2_Read, P10F2_Read],
+ (instrs
+ RLDIMI_rec,
+ RLWINM8_rec, RLWINM_rec,
+ RLWNM8_rec, RLWNM_rec
+)>;
+
+// 4 Cycles ALU2 operations, 5 input operands
+def : InstRW<[P10W_F2_4C, P10W_DISP_ANY, P10F2_Read, P10F2_Read, P10F2_Read, P10F2_Read, P10F2_Read],
+ (instrs
+ RLWIMI8_rec, RLWIMI_rec
+)>;
+
+// Single crack instructions
+// 4 Cycles ALU2 operations, 2 input operands
+def : InstRW<[P10W_F2_4C, P10W_DISP_EVEN, P10W_DISP_ANY, P10F2_Read, P10F2_Read],
+ (instrs
+ SRAD_rec,
+ SRADI_rec,
+ SRAW_rec,
+ SRAWI_rec
+)>;
+
+// Single crack instructions
+// 4 Cycles ALU2 operations, 3 input operands
+def : InstRW<[P10W_F2_4C, P10W_DISP_EVEN, P10W_DISP_ANY, P10F2_Read, P10F2_Read, P10F2_Read],
+ (instrs
+ TABORTDC,
+ TABORTDCI,
+ TABORTWC,
+ TABORTWCI
+)>;
+
+// 2-way crack instructions
+// 4 Cycles ALU2 operations, and 4 Cycles Permute operations, 2 input operands
+def : InstRW<[P10W_F2_4C, P10W_DISP_EVEN, P10W_PM_4C, P10W_DISP_ANY],
+ (instrs
+ VRLQ,
+ VRLQNM,
+ VSLQ,
+ VSRAQ,
+ VSRQ
+)>;
+
+// 2-way crack instructions
+// 4 Cycles ALU2 operations, and 4 Cycles Permute operations, 3 input operands
+def : InstRW<[P10W_F2_4C, P10W_DISP_EVEN, P10W_PM_4C, P10W_DISP_ANY],
+ (instrs
+ VRLQMI
+)>;
+
+// 2-way crack instructions
+// 4 Cycles ALU2 operations, and 4 Cycles ALU2 operations, 0 input operands
+def : InstRW<[P10W_F2_4C, P10W_DISP_PAIR, P10W_F2_4C],
+ (instrs
+ MFCR, MFCR8
+)>;
+
+// 2 Cycles ALU operations, 1 input operand
+def : InstRW<[P10W_FX_2C, P10W_DISP_ANY, P10FX_Read],
+ (instrs
+ MTCTR, MTCTR8, MTCTR8loop, MTCTRloop,
+ MTLR, MTLR8
+)>;
+
+// 3 Cycles ALU operations, 0 input operands
+def : InstRW<[P10W_FX_3C, P10W_DISP_ANY],
+ (instrs
+ CR6SET, CREQV, CRSET,
+ DSS, DSSALL,
+ MCRXRX,
+ MFCTR, MFCTR8,
+ MFLR, MFLR8,
+ NOP, NOP_GT_PWR6, NOP_GT_PWR7, ORI, ORI8,
+ VXOR, V_SET0, V_SET0B, V_SET0H,
+ XXLEQV, XXLEQVOnes,
+ XXLXOR, XXLXORdpz, XXLXORspz, XXLXORz
+)>;
+
+// 3 Cycles ALU operations, 1 input operand
+def : InstRW<[P10W_FX_3C, P10W_DISP_ANY, P10FX_Read],
+ (instrs
+ ADDI, ADDI8, ADDIdtprelL32, ADDItlsldLADDR32, ADDItocL, LI, LI8,
+ ADDIS, ADDIS8, ADDISdtprelHA32, ADDIStocHA, ADDIStocHA8, LIS, LIS8,
+ ADDME, ADDME8,
+ ADDME8O, ADDMEO,
+ ADDZE, ADDZE8,
+ ADDZE8O, ADDZEO,
+ EXTSB, EXTSB8, EXTSB8_32_64,
+ EXTSB8_rec, EXTSB_rec,
+ EXTSH, EXTSH8, EXTSH8_32_64,
+ EXTSH8_rec, EXTSH_rec,
+ EXTSW, EXTSW_32, EXTSW_32_64,
+ EXTSW_32_64_rec, EXTSW_rec,
+ FABSD, FABSS,
+ FMR,
+ FNABSD, FNABSS,
+ FNEGD, FNEGS,
+ MCRF,
+ MFOCRF, MFOCRF8,
+ MFVRD, MFVSRD,
+ MFVRWZ, MFVSRWZ,
+ MTOCRF, MTOCRF8,
+ MTVRD, MTVSRD,
+ MTVRWA, MTVSRWA,
+ MTVRWZ, MTVSRWZ,
+ NEG, NEG8,
+ NEG8_rec, NEG_rec,
+ NEG8O, NEGO,
+ SETB, SETB8,
+ SETBC, SETBC8,
+ SETBCR, SETBCR8,
+ SETNBC, SETNBC8,
+ SETNBCR, SETNBCR8,
+ SUBFME, SUBFME8,
+ SUBFME8O, SUBFMEO,
+ SUBFZE, SUBFZE8,
+ SUBFZE8O, SUBFZEO,
+ VEXTSB2D, VEXTSB2Ds,
+ VEXTSB2W, VEXTSB2Ws,
+ VEXTSD2Q,
+ VEXTSH2D, VEXTSH2Ds,
+ VEXTSH2W, VEXTSH2Ws,
+ VEXTSW2D, VEXTSW2Ds,
+ VNEGD,
+ VNEGW,
+ WAIT,
+ XSABSDP,
+ XSABSQP,
+ XSNABSDP,
+ XSNABSQP,
+ XSNEGDP,
+ XSNEGQP,
+ XSXEXPDP,
+ XSXEXPQP,
+ XSXSIGDP,
+ XVABSDP,
+ XVABSSP,
+ XVNABSDP,
+ XVNABSSP,
+ XVNEGDP,
+ XVNEGSP,
+ XVXEXPDP,
+ XVXEXPSP,
+ XVXSIGDP,
+ XVXSIGSP
+)>;
+
+// 3 Cycles ALU operations, 2 input operands
+def : InstRW<[P10W_FX_3C, P10W_DISP_ANY, P10FX_Read, P10FX_Read],
+ (instrs
+ ADD4, ADD4TLS, ADD8, ADD8TLS, ADD8TLS_,
+ ADD4_rec, ADD8_rec,
+ ADDE, ADDE8,
+ ADDE8O, ADDEO,
+ ADDIC, ADDIC8,
+ ADD4O, ADD8O,
+ AND, AND8,
+ AND8_rec, AND_rec,
+ ANDC, ANDC8,
+ ANDC8_rec, ANDC_rec,
+ ANDI8_rec, ANDI_rec,
+ ANDIS8_rec, ANDIS_rec,
+ CMPD, CMPW,
+ CMPB, CMPB8,
+ CMPDI, CMPWI,
+ CMPLD, CMPLW,
+ CMPLDI, CMPLWI,
+ CRAND,
+ CRANDC,
+ CRNAND,
+ CRNOR,
+ CROR,
+ CRORC,
+ CR6UNSET, CRUNSET, CRXOR,
+ EQV, EQV8,
+ EQV8_rec, EQV_rec,
+ EXTSWSLI, EXTSWSLI_32_64,
+ FCPSGND, FCPSGNS,
+ NAND, NAND8,
+ NAND8_rec, NAND_rec,
+ NOR, NOR8,
+ NOR8_rec, NOR_rec,
+ COPY, OR, OR8,
+ OR8_rec, OR_rec,
+ ORC, ORC8,
+ ORC8_rec, ORC_rec,
+ ORIS, ORIS8,
+ SLD,
+ SLW, SLW8,
+ SRAD,
+ SRADI, SRADI_32,
+ SRAW,
+ SRAWI,
+ SRD,
+ SRW, SRW8,
+ SUBF, SUBF8,
+ SUBF8_rec, SUBF_rec,
+ SUBFE, SUBFE8,
+ SUBFE8O, SUBFEO,
+ SUBFIC, SUBFIC8,
+ SUBF8O, SUBFO,
+ VADDUBM,
+ VADDUDM,
+ VADDUHM,
+ VADDUWM,
+ VAND,
+ VANDC,
+ VCMPEQUB,
+ VCMPEQUD,
+ VCMPEQUH,
+ VCMPEQUW,
+ VCMPGTSB,
+ VCMPGTSD,
+ VCMPGTSH,
+ VCMPGTSW,
+ VCMPGTUB,
+ VCMPGTUD,
+ VCMPGTUH,
+ VCMPGTUW,
+ VCMPNEB,
+ VCMPNEH,
+ VCMPNEW,
+ VCMPNEZB,
+ VCMPNEZH,
+ VCMPNEZW,
+ VEQV,
+ VMAXSB,
+ VMAXSD,
+ VMAXSH,
+ VMAXSW,
+ VMAXUB,
+ VMAXUD,
+ VMAXUH,
+ VMAXUW,
+ VMINSB,
+ VMINSD,
+ VMINSH,
+ VMINSW,
+ VMINUB,
+ VMINUD,
+ VMINUH,
+ VMINUW,
+ VMRGEW,
+ VMRGOW,
+ VNAND,
+ VNOR,
+ VOR,
+ VORC,
+ VRLB,
+ VRLD,
+ VRLDNM,
+ VRLH,
+ VRLW,
+ VRLWNM,
+ VSLB,
+ VSLD,
+ VSLH,
+ VSLW,
+ VSRAB,
+ VSRAD,
+ VSRAH,
+ VSRAW,
+ VSRB,
+ VSRD,
+ VSRH,
+ VSRW,
+ VSUBUBM,
+ VSUBUDM,
+ VSUBUHM,
+ VSUBUWM,
+ XOR, XOR8,
+ XOR8_rec, XOR_rec,
+ XORI, XORI8,
+ XORIS, XORIS8,
+ XSCPSGNDP,
+ XSCPSGNQP,
+ XSIEXPDP,
+ XSIEXPQP,
+ XVCPSGNDP,
+ XVCPSGNSP,
+ XVIEXPDP,
+ XVIEXPSP,
+ XXLAND,
+ XXLANDC,
+ XXLNAND,
+ XXLNOR,
+ XXLOR, XXLORf,
+ XXLORC
+)>;
+
+// 3 Cycles ALU operations, 3 input operands
+def : InstRW<[P10W_FX_3C, P10W_DISP_ANY, P10FX_Read, P10FX_Read, P10FX_Read],
+ (instrs
+ ADDEX, ADDEX8,
+ DST, DST64, DSTT, DSTT64,
+ DSTST, DSTST64, DSTSTT, DSTSTT64,
+ ISEL, ISEL8,
+ RLDCL,
+ RLDCR,
+ RLDIC,
+ RLDICL, RLDICL_32, RLDICL_32_64,
+ RLDICR, RLDICR_32,
+ VRLDMI,
+ VRLWMI,
+ VSEL,
+ XXSEL
+)>;
+
+// 3 Cycles ALU operations, 4 input operands
+def : InstRW<[P10W_FX_3C, P10W_DISP_ANY, P10FX_Read, P10FX_Read, P10FX_Read, P10FX_Read],
+ (instrs
+ RLDIMI,
+ RLWINM, RLWINM8,
+ RLWNM, RLWNM8
+)>;
+
+// 3 Cycles ALU operations, 5 input operands
+def : InstRW<[P10W_FX_3C, P10W_DISP_ANY, P10FX_Read, P10FX_Read, P10FX_Read, P10FX_Read, P10FX_Read],
+ (instrs
+ RLWIMI, RLWIMI8
+)>;
+
+// Single crack instructions
+// 3 Cycles ALU operations, 0 input operands
+def : InstRW<[P10W_FX_3C, P10W_DISP_EVEN, P10W_DISP_ANY],
+ (instrs
+ MFFS,
+ MFFS_rec,
+ MFFSL,
+ MFVSCR,
+ TRECHKPT
+)>;
+
+// Single crack instructions
+// 3 Cycles ALU operations, 1 input operand
+def : InstRW<[P10W_FX_3C, P10W_DISP_EVEN, P10W_DISP_ANY, P10FX_Read],
+ (instrs
+ ADDME8_rec, ADDME_rec,
+ ADDME8O_rec, ADDMEO_rec,
+ ADDZE8_rec, ADDZE_rec,
+ ADDZE8O_rec, ADDZEO_rec,
+ MCRFS,
+ MFFSCDRN,
+ MFFSCDRNI,
+ MFFSCRN,
+ MFFSCRNI,
+ MTFSB0,
+ MTVSCR,
+ NEG8O_rec, NEGO_rec,
+ SUBFME8_rec, SUBFME_rec,
+ SUBFME8O_rec, SUBFMEO_rec,
+ SUBFZE8_rec, SUBFZE_rec,
+ SUBFZE8O_rec, SUBFZEO_rec,
+ TABORT,
+ TBEGIN,
+ TRECLAIM,
+ TSR
+)>;
+
+// Single crack instructions
+// 3 Cycles ALU operations, 2 input operands
+def : InstRW<[P10W_FX_3C, P10W_DISP_EVEN, P10W_DISP_ANY, P10FX_Read, P10FX_Read],
+ (instrs
+ ADDE8_rec, ADDE_rec,
+ ADDE8O_rec, ADDEO_rec,
+ ADDIC_rec,
+ ADD4O_rec, ADD8O_rec,
+ SUBFE8_rec, SUBFE_rec,
+ SUBFE8O_rec, SUBFEO_rec,
+ SUBF8O_rec, SUBFO_rec
+)>;
+
+// 2-way crack instructions
+// 3 Cycles ALU operations, and 3 Cycles ALU operations, 0 input operands
+def : InstRW<[P10W_FX_3C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY],
+ (instrs
+ HRFID,
+ MFFSCE,
+ RFID,
+ STOP
+)>;
+
+// 2-way crack instructions
+// 3 Cycles ALU operations, and 3 Cycles ALU operations, 1 input operand
+def : InstRW<[P10W_FX_3C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY, P10FX_Read],
+ (instrs
+ FABSD_rec, FABSS_rec,
+ FMR_rec,
+ FNABSD_rec, FNABSS_rec,
+ FNEGD_rec, FNEGS_rec,
+ MTFSB1,
+ RFEBB,
+ SC
+)>;
+
+// 2-way crack instructions
+// 3 Cycles ALU operations, and 3 Cycles ALU operations, 2 input operands
+def : InstRW<[P10W_FX_3C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY, P10FX_Read, P10FX_Read],
+ (instrs
+ ADDC, ADDC8,
+ ADDC8_rec, ADDC_rec,
+ ADDC8O, ADDCO,
+ FCPSGND_rec, FCPSGNS_rec,
+ MTFSF, MTFSFb,
+ MTFSFI, MTFSFIb,
+ SUBFC, SUBFC8,
+ SUBFC8_rec, SUBFC_rec,
+ SUBFC8O, SUBFCO
+)>;
+
+// 2-way crack instructions
+// 3 Cycles ALU operations, and 3 Cycles ALU operations, 3 input operands
+def : InstRW<[P10W_FX_3C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY, P10FX_Read, P10FX_Read, P10FX_Read],
+ (instrs
+ MTFSFI_rec
+)>;
+
+// 2-way crack instructions
+// 3 Cycles ALU operations, and 3 Cycles ALU operations, 4 input operands
+def : InstRW<[P10W_FX_3C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY, P10FX_Read, P10FX_Read, P10FX_Read, P10FX_Read],
+ (instrs
+ MTFSF_rec
+)>;
+
+// 4-way crack instructions
+// 3 Cycles ALU operations, 3 Cycles ALU operations, 3 Cycles ALU operations, and 3 Cycles ALU operations, 2 input operands
+def : InstRW<[P10W_FX_3C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY, P10W_FX_3C, P10W_DISP_ANY, P10W_FX_3C, P10W_DISP_ANY, P10FX_Read, P10FX_Read],
+ (instrs
+ ADDC8O_rec, ADDCO_rec,
+ SUBFC8O_rec, SUBFCO_rec
+)>;
+
+// 2-way crack instructions
+// 3 Cycles ALU operations, and 4 Cycles Permute operations, 1 input operand
+def : InstRW<[P10W_FX_3C, P10W_DISP_EVEN, P10W_PM_4C, P10W_DISP_ANY],
+ (instrs
+ VSTRIBL_rec,
+ VSTRIBR_rec,
+ VSTRIHL_rec,
+ VSTRIHR_rec
+)>;
+
+// 2-way crack instructions
+// 3 Cycles ALU operations, and 3 Cycles ALU operations, 2 input operands
+def : InstRW<[P10W_FX_3C, P10W_DISP_PAIR, P10W_FX_3C, P10FX_Read, P10FX_Read],
+ (instrs
+ MTCRF, MTCRF8
+)>;
+
+// 6 Cycles Load operations, 1 input operand
+def : InstRW<[P10W_LD_6C, P10W_DISP_ANY, P10LD_Read],
+ (instrs
+ LBZ, LBZ8,
+ LD, LDtoc, LDtocBA, LDtocCPT, LDtocJTI, LDtocL, SPILLTOVSR_LD,
+ LDBRX,
+ DFLOADf32, DFLOADf64, LFD,
+ LFDX, XFLOADf32, XFLOADf64,
+ LFIWAX, LIWAX,
+ LFIWZX, LIWZX,
+ LHA, LHA8,
+ LHAX, LHAX8,
+ LHBRX, LHBRX8,
+ LHZ, LHZ8,
+ LVEBX,
+ LVEHX,
+ LVEWX,
+ LVX,
+ LVXL,
+ LWA, LWA_32,
+ LWAX, LWAX_32,
+ LWBRX, LWBRX8,
+ LWZ, LWZ8, LWZtoc, LWZtocL,
+ LXSD,
+ LXSDX,
+ LXSIBZX,
+ LXSIHZX,
+ LXSIWAX,
+ LXSIWZX,
+ LXV,
+ LXVB16X,
+ LXVD2X,
+ LXVDSX,
+ LXVH8X,
+ LXVRBX,
+ LXVRDX,
+ LXVRHX,
+ LXVRWX,
+ LXVW4X,
+ LXVWSX,
+ LXVX
+)>;
+
+// 6 Cycles Load operations, 2 input operands
+def : InstRW<[P10W_LD_6C, P10W_DISP_ANY, P10LD_Read, P10LD_Read],
+ (instrs
+ DCBT,
+ DCBTST,
+ ICBT,
+ LBZX, LBZX8, LBZXTLS, LBZXTLS_, LBZXTLS_32,
+ LDX, LDXTLS, LDXTLS_, SPILLTOVSR_LDX,
+ LHZX, LHZX8, LHZXTLS, LHZXTLS_, LHZXTLS_32,
+ LWZX, LWZX8, LWZXTLS, LWZXTLS_, LWZXTLS_32,
+ LXVL,
+ LXVLL
+)>;
+
+// 2-way crack instructions
+// 6 Cycles Load operations, and 13 Cycles Decimal Floating Point operations, 2 input operands
+def : InstRW<[P10W_LD_6C, P10W_DISP_EVEN, P10W_DF_13C, P10W_DISP_ANY],
+ (instrs
+ HASHCHK,
+ HASHCHKP
+)>;
+
+// Single crack instructions
+// 6 Cycles Load operations, 0 input operands
+def : InstRW<[P10W_LD_6C, P10W_DISP_EVEN, P10W_DISP_ANY],
+ (instrs
+ SLBIA
+)>;
+
+// Single crack instructions
+// 6 Cycles Load operations, 1 input operand
+def : InstRW<[P10W_LD_6C, P10W_DISP_EVEN, P10W_DISP_ANY, P10LD_Read],
+ (instrs
+ DARN,
+ LBARX, LBARXL,
+ LDARX, LDARXL,
+ LHARX, LHARXL,
+ LWARX, LWARXL,
+ SLBFEE_rec,
+ SLBIE,
+ SLBMFEE,
+ SLBMFEV
+)>;
+
+// Single crack instructions
+// 6 Cycles Load operations, 2 input operands
+def : InstRW<[P10W_LD_6C, P10W_DISP_EVEN, P10W_DISP_ANY, P10LD_Read, P10LD_Read],
+ (instrs
+ LBZCIX,
+ LDCIX,
+ LHZCIX,
+ LWZCIX,
+ MTSPR, MTSPR8, MTSR, MTVRSAVE, MTVRSAVEv
+)>;
+
+// Expand instructions
+// 6 Cycles Load operations, 6 Cycles Load operations, 6 Cycles Load operations, and 6 Cycles Load operations, 1 input operand
+def : InstRW<[P10W_LD_6C, P10W_DISP_EVEN, P10W_LD_6C, P10W_DISP_ANY, P10W_LD_6C, P10W_DISP_ANY, P10W_LD_6C, P10W_DISP_ANY, P10LD_Read],
+ (instrs
+ LMW
+)>;
+
+// Expand instructions
+// 6 Cycles Load operations, 6 Cycles Load operations, 6 Cycles Load operations, and 6 Cycles Load operations, 2 input operands
+def : InstRW<[P10W_LD_6C, P10W_DISP_EVEN, P10W_LD_6C, P10W_DISP_ANY, P10W_LD_6C, P10W_DISP_ANY, P10W_LD_6C, P10W_DISP_ANY, P10LD_Read, P10LD_Read],
+ (instrs
+ LSWI
+)>;
+
+// 2-way crack instructions
+// 6 Cycles Load operations, and 3 Cycles Simple Fixed-point (SFX) operations, 1 input operand
+def : InstRW<[P10W_LD_6C, P10W_DISP_EVEN, P10W_SX_3C, P10W_DISP_ANY],
+ (instrs
+ LBZU, LBZU8,
+ LBZUX, LBZUX8,
+ LDU,
+ LDUX,
+ LFDU,
+ LFDUX,
+ LHAU, LHAU8,
+ LHAUX, LHAUX8,
+ LHZU, LHZU8,
+ LHZUX, LHZUX8,
+ LWAUX,
+ LWZU, LWZU8,
+ LWZUX, LWZUX8
+)>;
+
+// 6 Cycles Load operations, 1 input operand
+def : InstRW<[P10W_LD_6C, P10W_DISP_PAIR, P10LD_Read],
+ (instrs
+ PLBZ, PLBZ8, PLBZ8pc, PLBZpc,
+ PLD, PLDpc,
+ PLFD, PLFDpc,
+ PLFS, PLFSpc,
+ PLHA, PLHA8, PLHA8pc, PLHApc,
+ PLHZ, PLHZ8, PLHZ8pc, PLHZpc,
+ PLWA, PLWA8, PLWA8pc, PLWApc,
+ PLWZ, PLWZ8, PLWZ8pc, PLWZpc,
+ PLXSD, PLXSDpc,
+ PLXSSP, PLXSSPpc,
+ PLXV, PLXVpc,
+ PLXVP, PLXVPpc
+)>;
+
+// 2-way crack instructions
+// 6 Cycles Load operations, and 4 Cycles ALU2 operations, 1 input operand
+def : InstRW<[P10W_LD_6C, P10W_DISP_PAIR, P10W_F2_4C],
+ (instrs
+ LFS,
+ LFSX,
+ LXSSP,
+ LXSSPX
+)>;
+
+// 4-way crack instructions
+// 6 Cycles Load operations, 4 Cycles ALU2 operations, 3 Cycles Simple Fixed-point (SFX) operations, and 3 Cycles ALU operations, 1 input operand
+def : InstRW<[P10W_LD_6C, P10W_DISP_PAIR, P10W_F2_4C, P10W_SX_3C, P10W_DISP_ANY, P10W_FX_3C, P10W_DISP_ANY],
+ (instrs
+ LFSU,
+ LFSUX
+)>;
+
+// 2-way crack instructions
+// 6 Cycles Load operations, and 6 Cycles Load operations, 1 input operand
+def : InstRW<[P10W_LD_6C, P10W_DISP_PAIR, P10W_LD_6C, P10W_DISP_PAIR, P10LD_Read],
+ (instrs
+ TLBIEL
+)>;
+
+// 2-way crack instructions
+// 6 Cycles Load operations, and 6 Cycles Load operations, 2 input operands
+def : InstRW<[P10W_LD_6C, P10W_DISP_PAIR, P10W_LD_6C, P10W_DISP_PAIR, P10LD_Read, P10LD_Read],
+ (instrs
+ SLBMTE
+)>;
+
+// 2-way crack instructions
+// 6 Cycles Load operations, and 3 Cycles Simple Fixed-point (SFX) operations, 1 input operand
+def : InstRW<[P10W_LD_6C, P10W_DISP_PAIR, P10W_SX_3C],
+ (instrs
+ LXVP,
+ LXVPX
+)>;
+
+// Single crack instructions
+// 13 Cycles Unknown operations, 1 input operand
+def : InstRW<[P10W_MFL_13C, P10W_DISP_EVEN, P10W_DISP_ANY],
+ (instrs
+ MFSPR, MFSPR8, MFSR, MFTB8, MFVRSAVE, MFVRSAVEv
+)>;
+
+// 10 Cycles SIMD Matrix Multiply Engine operations, 0 input operands
+def : InstRW<[P10W_MM_10C, P10W_DISP_ANY],
+ (instrs
+ XXSETACCZ
+)>;
+
+// 10 Cycles SIMD Matrix Multiply Engine operations, 2 input operands
+def : InstRW<[P10W_MM_10C, P10W_DISP_ANY, P10MM_Read, P10MM_Read],
+ (instrs
+ XVBF16GER2,
+ XVF16GER2,
+ XVF32GER,
+ XVF64GER,
+ XVI16GER2,
+ XVI16GER2S,
+ XVI4GER8,
+ XVI8GER4
+)>;
+
+// 10 Cycles SIMD Matrix Multiply Engine operations, 3 input operands
+def : InstRW<[P10W_MM_10C, P10W_DISP_ANY, P10MM_Read, P10MM_Read, P10MM_Read],
+ (instrs
+ XVBF16GER2NN,
+ XVBF16GER2NP,
+ XVBF16GER2PN,
+ XVBF16GER2PP,
+ XVF16GER2NN,
+ XVF16GER2NP,
+ XVF16GER2PN,
+ XVF16GER2PP,
+ XVF32GERNN,
+ XVF32GERNP,
+ XVF32GERPN,
+ XVF32GERPP,
+ XVF64GERNN,
+ XVF64GERNP,
+ XVF64GERPN,
+ XVF64GERPP,
+ XVI16GER2PP,
+ XVI16GER2SPP,
+ XVI4GER8PP,
+ XVI8GER4PP,
+ XVI8GER4SPP
+)>;
+
+// 10 Cycles SIMD Matrix Multiply Engine operations, 4 input operands
+def : InstRW<[P10W_MM_10C, P10W_DISP_PAIR, P10MM_Read, P10MM_Read, P10MM_Read, P10MM_Read],
+ (instrs
+ PMXVF32GER,
+ PMXVF64GER
+)>;
+
+// 10 Cycles SIMD Matrix Multiply Engine operations, 5 input operands
+def : InstRW<[P10W_MM_10C, P10W_DISP_PAIR, P10MM_Read, P10MM_Read, P10MM_Read, P10MM_Read, P10MM_Read],
+ (instrs
+ PMXVBF16GER2,
+ PMXVF16GER2,
+ PMXVF32GERNN,
+ PMXVF32GERNP,
+ PMXVF32GERPN,
+ PMXVF32GERPP,
+ PMXVF64GERNN,
+ PMXVF64GERNP,
+ PMXVF64GERPN,
+ PMXVF64GERPP,
+ PMXVI16GER2,
+ PMXVI16GER2S,
+ PMXVI4GER8,
+ PMXVI8GER4
+)>;
+
+// 10 Cycles SIMD Matrix Multiply Engine operations, 6 input operands
+def : InstRW<[P10W_MM_10C, P10W_DISP_PAIR, P10MM_Read, P10MM_Read, P10MM_Read, P10MM_Read, P10MM_Read, P10MM_Read],
+ (instrs
+ PMXVBF16GER2NN,
+ PMXVBF16GER2NP,
+ PMXVBF16GER2PN,
+ PMXVBF16GER2PP,
+ PMXVF16GER2NN,
+ PMXVF16GER2NP,
+ PMXVF16GER2PN,
+ PMXVF16GER2PP,
+ PMXVI16GER2PP,
+ PMXVI16GER2SPP,
+ PMXVI4GER8PP,
+ PMXVI8GER4PP,
+ PMXVI8GER4SPP
+)>;
+
+// 2-way crack instructions
+// 10 Cycles SIMD Matrix Multiply Engine operations, and 3 Cycles ALU operations, 1 input operand
+def : InstRW<[P10W_MM_10C, P10W_DISP_PAIR, P10W_FX_3C],
+ (instrs
+ XXMTACC
+)>;
+
+// 4-way crack instructions
+// 10 Cycles SIMD Matrix Multiply Engine operations, 3 Cycles ALU operations, 10 Cycles SIMD Matrix Multiply Engine operations, and 3 Cycles ALU operations, 1 input operand
+def : InstRW<[P10W_MM_10C, P10W_DISP_PAIR, P10W_FX_3C, P10W_MM_10C, P10W_DISP_PAIR, P10W_FX_3C],
+ (instrs
+ XXMFACC
+)>;
+
+// 5 Cycles GPR Multiply operations, 2 input operands
+def : InstRW<[P10W_MU_5C, P10W_DISP_ANY, P10MU_Read, P10MU_Read],
+ (instrs
+ MULHD,
+ MULHDU,
+ MULHW,
+ MULHWU,
+ MULLD,
+ MULLDO,
+ MULLI, MULLI8,
+ MULLW,
+ MULLWO,
+ VMULHSD,
+ VMULHUD,
+ VMULLD
+)>;
+
+// 5 Cycles GPR Multiply operations, 3 input operands
+def : InstRW<[P10W_MU_5C, P10W_DISP_ANY, P10MU_Read, P10MU_Read, P10MU_Read],
+ (instrs
+ MADDHD,
+ MADDHDU,
+ MADDLD, MADDLD8
+)>;
+
+// 2-way crack instructions
+// 5 Cycles GPR Multiply operations, and 3 Cycles ALU operations, 2 input operands
+def : InstRW<[P10W_MU_5C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY],
+ (instrs
+ MULHD_rec,
+ MULHDU_rec,
+ MULHW_rec,
+ MULHWU_rec,
+ MULLD_rec,
+ MULLDO_rec,
+ MULLW_rec,
+ MULLWO_rec
+)>;
+
+// 4 Cycles Permute operations, 0 input operands
+def : InstRW<[P10W_PM_4C, P10W_DISP_ANY],
+ (instrs
+ VSPLTISW, V_SETALLONES, V_SETALLONESB, V_SETALLONESH
+)>;
+
+// 4 Cycles Permute operations, 1 input operand
+def : InstRW<[P10W_PM_4C, P10W_DISP_ANY, P10PM_Read],
+ (instrs
+ LVSL,
+ LVSR,
+ MFVSRLD,
+ MTVSRWS,
+ VCLZLSBB,
+ VCTZLSBB,
+ VGBBD,
+ VPRTYBQ,
+ VSPLTISB,
+ VSPLTISH,
+ VSTRIBL,
+ VSTRIBR,
+ VSTRIHL,
+ VSTRIHR,
+ VUPKHPX,
+ VUPKHSB,
+ VUPKHSH,
+ VUPKHSW,
+ VUPKLPX,
+ VUPKLSB,
+ VUPKLSH,
+ VUPKLSW,
+ XVCVBF16SPN,
+ XXBRD,
+ XXBRH,
+ XXBRQ,
+ XXBRW,
+ XXSPLTIB
+)>;
+
+// 4 Cycles Permute operations, 2 input operands
+def : InstRW<[P10W_PM_4C, P10W_DISP_ANY, P10PM_Read, P10PM_Read],
+ (instrs
+ BPERMD,
+ MTVSRDD,
+ VBPERMD,
+ VBPERMQ,
+ VCLRLB,
+ VCLRRB,
+ VEXTRACTD,
+ VEXTRACTUB,
+ VEXTRACTUH,
+ VEXTRACTUW,
+ VEXTUBLX,
+ VEXTUBRX,
+ VEXTUHLX,
+ VEXTUHRX,
+ VEXTUWLX,
+ VEXTUWRX,
+ VINSERTD,
+ VINSERTW,
+ VMRGHB,
+ VMRGHH,
+ VMRGHW,
+ VMRGLB,
+ VMRGLH,
+ VMRGLW,
+ VPKPX,
+ VPKSDSS,
+ VPKSDUS,
+ VPKSHSS,
+ VPKSHUS,
+ VPKSWSS,
+ VPKSWUS,
+ VPKUDUM,
+ VPKUDUS,
+ VPKUHUM,
+ VPKUHUS,
+ VPKUWUM,
+ VPKUWUS,
+ VSL,
+ VSLO,
+ VSLV,
+ VSPLTB, VSPLTBs,
+ VSPLTH, VSPLTHs,
+ VSPLTW,
+ VSR,
+ VSRO,
+ VSRV,
+ XXEXTRACTUW,
+ XXGENPCVDM,
+ XXGENPCVHM,
+ XXGENPCVWM,
+ XXMRGHW,
+ XXMRGLW,
+ XXPERM,
+ XXPERMDI, XXPERMDIs,
+ XXPERMR,
+ XXSLDWI, XXSLDWIs,
+ XXSPLTW, XXSPLTWs
+)>;
+
+// 4 Cycles Permute operations, 3 input operands
+def : InstRW<[P10W_PM_4C, P10W_DISP_ANY, P10PM_Read, P10PM_Read, P10PM_Read],
+ (instrs
+ VEXTDDVLX,
+ VEXTDDVRX,
+ VEXTDUBVLX,
+ VEXTDUBVRX,
+ VEXTDUHVLX,
+ VEXTDUHVRX,
+ VEXTDUWVLX,
+ VEXTDUWVRX,
+ VINSBLX,
+ VINSBRX,
+ VINSBVLX,
+ VINSBVRX,
+ VINSD,
+ VINSDLX,
+ VINSDRX,
+ VINSERTB,
+ VINSERTH,
+ VINSHLX,
+ VINSHRX,
+ VINSHVLX,
+ VINSHVRX,
+ VINSW,
+ VINSWLX,
+ VINSWRX,
+ VINSWVLX,
+ VINSWVRX,
+ VPERM,
+ VPERMR,
+ VPERMXOR,
+ VSLDBI,
+ VSLDOI,
+ VSRDBI,
+ XXINSERTW
+)>;
+
+// 2-way crack instructions
+// 4 Cycles Permute operations, and 7 Cycles VMX Multiply operations, 2 input operands
+def : InstRW<[P10W_PM_4C, P10W_DISP_EVEN, P10W_vMU_7C, P10W_DISP_ANY],
+ (instrs
+ VSUMSWS
+)>;
+
+// 4 Cycles Permute operations, 1 input operand
+def : InstRW<[P10W_PM_4C, P10W_DISP_PAIR, P10PM_Read],
+ (instrs
+ XXSPLTIDP,
+ XXSPLTIW
+)>;
+
+// 4 Cycles Permute operations, 3 input operands
+def : InstRW<[P10W_PM_4C, P10W_DISP_PAIR, P10PM_Read, P10PM_Read, P10PM_Read],
+ (instrs
+ XXBLENDVB,
+ XXBLENDVD,
+ XXBLENDVH,
+ XXBLENDVW,
+ XXSPLTI32DX
+)>;
+
+// 4 Cycles Permute operations, 4 input operands
+def : InstRW<[P10W_PM_4C, P10W_DISP_PAIR, P10PM_Read, P10PM_Read, P10PM_Read, P10PM_Read],
+ (instrs
+ XXEVAL,
+ XXPERMX
+)>;
+
+// 3 Cycles Store operations, 1 input operand
+def : InstRW<[P10W_ST_3C, P10W_DISP_ANY, P10ST_Read],
+ (instrs
+ DCBST,
+ DCBZ,
+ ICBI
+)>;
+
+// 3 Cycles Store operations, 2 input operands
+def : InstRW<[P10W_ST_3C, P10W_DISP_ANY, P10ST_Read, P10ST_Read],
+ (instrs
+ DCBF,
+ PSTXVP, PSTXVPpc,
+ STB, STB8,
+ STBU, STBU8,
+ STBUX, STBUX8,
+ SPILLTOVSR_ST, STD,
+ STDBRX,
+ STDU,
+ STDUX,
+ DFSTOREf32, DFSTOREf64, STFD,
+ STFDU,
+ STFDUX,
+ STFDX,
+ STFIWX, STIWX,
+ STFS,
+ STFSU,
+ STFSUX,
+ STFSX,
+ STH, STH8,
+ STHBRX,
+ STHU, STHU8,
+ STHUX, STHUX8,
+ STVEBX,
+ STVEHX,
+ STVEWX,
+ STVX,
+ STVXL,
+ STW, STW8,
+ STWBRX,
+ STWU, STWU8,
+ STWUX, STWUX8,
+ STXSD,
+ STXSDX,
+ STXSIBX, STXSIBXv,
+ STXSIHX, STXSIHXv,
+ STXSIWX,
+ STXSSP,
+ STXSSPX,
+ STXV,
+ STXVB16X,
+ STXVD2X,
+ STXVH8X,
+ STXVRBX,
+ STXVRDX,
+ STXVRHX,
+ STXVRWX,
+ STXVW4X,
+ STXVX
+)>;
+
+// 3 Cycles Store operations, 3 input operands
+def : InstRW<[P10W_ST_3C, P10W_DISP_ANY, P10ST_Read, P10ST_Read, P10ST_Read],
+ (instrs
+ CP_COPY, CP_COPY8,
+ STBX, STBX8, STBXTLS, STBXTLS_, STBXTLS_32,
+ SPILLTOVSR_STX, STDX, STDXTLS, STDXTLS_,
+ STHX, STHX8, STHXTLS, STHXTLS_, STHXTLS_32,
+ STWX, STWX8, STWXTLS, STWXTLS_, STWXTLS_32,
+ STXVL,
+ STXVLL
+)>;
+
+// Single crack instructions
+// 3 Cycles Store operations, 0 input operands
+def : InstRW<[P10W_ST_3C, P10W_DISP_EVEN, P10W_DISP_ANY],
+ (instrs
+ EnforceIEIO,
+ MSGSYNC,
+ SLBSYNC,
+ TCHECK,
+ TLBSYNC
+)>;
+
+// Single crack instructions
+// 3 Cycles Store operations, 1 input operand
+def : InstRW<[P10W_ST_3C, P10W_DISP_EVEN, P10W_DISP_ANY, P10ST_Read],
+ (instrs
+ TEND
+)>;
+
+// Single crack instructions
+// 3 Cycles Store operations, 2 input operands
+def : InstRW<[P10W_ST_3C, P10W_DISP_EVEN, P10W_DISP_ANY, P10ST_Read, P10ST_Read],
+ (instrs
+ SLBIEG,
+ STBCX,
+ STDCX,
+ STHCX,
+ STWCX,
+ TLBIE
+)>;
+
+// Single crack instructions
+// 3 Cycles Store operations, 3 input operands
+def : InstRW<[P10W_ST_3C, P10W_DISP_EVEN, P10W_DISP_ANY, P10ST_Read, P10ST_Read, P10ST_Read],
+ (instrs
+ CP_PASTE8_rec, CP_PASTE_rec,
+ STBCIX,
+ STDCIX,
+ STHCIX,
+ STWCIX
+)>;
+
+// 2-way crack instructions
+// 3 Cycles Store operations, and 3 Cycles ALU operations, 0 input operands
+def : InstRW<[P10W_ST_3C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY],
+ (instrs
+ ISYNC
+)>;
+
+// 2-way crack instructions
+// 3 Cycles Store operations, and 3 Cycles ALU operations, 1 input operand
+def : InstRW<[P10W_ST_3C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY],
+ (instrs
+ SYNC
+)>;
+
+// Expand instructions
+// 3 Cycles Store operations, 3 Cycles ALU operations, 3 Cycles Store operations, 3 Cycles ALU operations, 3 Cycles Store operations, 3 Cycles ALU operations, 6 Cycles Load operations, and 3 Cycles Store operations, 2 input operands
+def : InstRW<[P10W_ST_3C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY, P10W_ST_3C, P10W_DISP_ANY, P10W_FX_3C, P10W_DISP_ANY, P10W_ST_3C, P10W_DISP_ANY, P10W_FX_3C, P10W_DISP_ANY, P10W_LD_6C, P10W_DISP_ANY, P10W_ST_3C, P10W_DISP_ANY],
+ (instrs
+ LDAT,
+ LWAT
+)>;
+
+// 4-way crack instructions
+// 3 Cycles Store operations, 3 Cycles ALU operations, 3 Cycles Store operations, and 3 Cycles Store operations, 3 input operands
+def : InstRW<[P10W_ST_3C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY, P10W_ST_3C, P10W_DISP_ANY, P10W_ST_3C, P10W_DISP_ANY],
+ (instrs
+ STDAT,
+ STWAT
+)>;
+
+// Expand instructions
+// 3 Cycles Store operations, 3 Cycles Store operations, 3 Cycles Store operations, and 3 Cycles Store operations, 2 input operands
+def : InstRW<[P10W_ST_3C, P10W_DISP_EVEN, P10W_ST_3C, P10W_DISP_ANY, P10W_ST_3C, P10W_DISP_ANY, P10W_ST_3C, P10W_DISP_ANY, P10ST_Read, P10ST_Read],
+ (instrs
+ STMW
+)>;
+
+// Expand instructions
+// 3 Cycles Store operations, 3 Cycles Store operations, 3 Cycles Store operations, and 3 Cycles Store operations, 3 input operands
+def : InstRW<[P10W_ST_3C, P10W_DISP_EVEN, P10W_ST_3C, P10W_DISP_ANY, P10W_ST_3C, P10W_DISP_ANY, P10W_ST_3C, P10W_DISP_ANY, P10ST_Read, P10ST_Read, P10ST_Read],
+ (instrs
+ STSWI
+)>;
+
+// 3 Cycles Store operations, 2 input operands
+def : InstRW<[P10W_ST_3C, P10W_DISP_PAIR, P10ST_Read, P10ST_Read],
+ (instrs
+ PSTB, PSTB8, PSTB8pc, PSTBpc,
+ PSTD, PSTDpc,
+ PSTFD, PSTFDpc,
+ PSTFS, PSTFSpc,
+ PSTH, PSTH8, PSTH8pc, PSTHpc,
+ PSTW, PSTW8, PSTW8pc, PSTWpc,
+ PSTXSD, PSTXSDpc,
+ PSTXSSP, PSTXSSPpc,
+ PSTXV, PSTXVpc
+)>;
+
+// 2-way crack instructions
+// 3 Cycles Store operations, and 3 Cycles Store operations, 2 input operands
+def : InstRW<[P10W_ST_3C, P10W_DISP_PAIR, P10W_ST_3C, P10ST_Read, P10ST_Read],
+ (instrs
+ STXVP,
+ STXVPX
+)>;
+
+// FIXME - Missing scheduling information from the datasheet.
+// Temporarily set as 1 Cycle Simple Fixed-point (SFX) operations, 0 input operands
+def : InstRW<[P10W_SX, P10W_DISP_ANY],
+ (instrs
+ ATTN,
+ CP_ABORT,
+ DCBA,
+ DCBI,
+ DCBZL,
+ DCCCI,
+ ICBLC,
+ ICBLQ,
+ ICBTLS,
+ ICCCI,
+ LA,
+ LDMX,
+ MFDCR,
+ MFPMR,
+ MFSRIN,
+ MSYNC,
+ MTDCR,
+ MTPMR,
+ MTSRIN,
+ NAP,
+ TLBIA,
+ TLBLD,
+ TLBLI,
+ TLBRE2,
+ TLBSX2,
+ TLBSX2D,
+ TLBWE2
+)>;
+
+// Single crack instructions
+// 3 Cycles Simple Fixed-point (SFX) operations, 0 input operands
+def : InstRW<[P10W_SX_3C, P10W_DISP_EVEN, P10W_DISP_ANY],
+ (instrs
+ CLRBHRB,
+ MFMSR
+)>;
+
+// Single crack instructions
+// 3 Cycles Simple Fixed-point (SFX) operations, 1 input operand
+def : InstRW<[P10W_SX_3C, P10W_DISP_EVEN, P10W_DISP_ANY, P10SX_Read],
+ (instrs
+ MFTB
+)>;
+
+// Single crack instructions
+// 3 Cycles Simple Fixed-point (SFX) operations, 2 input operands
+def : InstRW<[P10W_SX_3C, P10W_DISP_EVEN, P10W_DISP_ANY, P10SX_Read, P10SX_Read],
+ (instrs
+ MFBHRBE,
+ MTMSR,
+ MTMSRD
+)>;
+
+// 2-way crack instructions
+// 3 Cycles Simple Fixed-point (SFX) operations, and 3 Cycles ALU operations, 1 input operand
+def : InstRW<[P10W_SX_3C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY],
+ (instrs
+ ADDPCIS
+)>;
+
+// 3 Cycles Simple Fixed-point (SFX) operations, 1 input operand
+def : InstRW<[P10W_SX_3C, P10W_DISP_PAIR, P10SX_Read],
+ (instrs
+ PADDI, PADDI8, PADDI8pc, PADDIpc, PLI, PLI8
+)>;
+
+// 7 Cycles VMX Multiply operations, 2 input operands
+def : InstRW<[P10W_vMU_7C, P10W_DISP_ANY, P10vMU_Read, P10vMU_Read],
+ (instrs
+ VMULESB,
+ VMULESD,
+ VMULESH,
+ VMULESW,
+ VMULEUB,
+ VMULEUD,
+ VMULEUH,
+ VMULEUW,
+ VMULHSW,
+ VMULHUW,
+ VMULOSB,
+ VMULOSD,
+ VMULOSH,
+ VMULOSW,
+ VMULOUB,
+ VMULOUD,
+ VMULOUH,
+ VMULOUW,
+ VMULUWM,
+ VSUM2SWS,
+ VSUM4SBS,
+ VSUM4SHS,
+ VSUM4UBS
+)>;
+
+// 7 Cycles VMX Multiply operations, 3 input operands
+def : InstRW<[P10W_vMU_7C, P10W_DISP_ANY, P10vMU_Read, P10vMU_Read, P10vMU_Read],
+ (instrs
+ VMHADDSHS,
+ VMHRADDSHS,
+ VMLADDUHM,
+ VMSUMCUD,
+ VMSUMMBM,
+ VMSUMSHM,
+ VMSUMSHS,
+ VMSUMUBM,
+ VMSUMUDM,
+ VMSUMUHM,
+ VMSUMUHS
+)>;
diff --git a/llvm/lib/Target/PowerPC/P9InstrResources.td b/llvm/lib/Target/PowerPC/P9InstrResources.td
index 76663acf4782..c4f4a2b3d796 100644
--- a/llvm/lib/Target/PowerPC/P9InstrResources.td
+++ b/llvm/lib/Target/PowerPC/P9InstrResources.td
@@ -1302,15 +1302,15 @@ def : InstRW<[P9_BR_2C, DISP_BR_1C],
(instregex "BCCTR(L)?(8)?(n)?$"),
(instregex "BD(N)?Z(8|A|Am|Ap|m|p)?$"),
(instregex "BD(N)?ZL(A|Am|Ap|R|R8|RL|RLm|RLp|Rm|Rp|m|p)?$"),
- (instregex "BL(_TLS|_NOP)?$"),
- (instregex "BL8(_TLS|_NOP|_NOP_TLS|_TLS_)?$"),
- (instregex "BLA(8|8_NOP)?$"),
+ (instregex "BL(_TLS|_NOP)?(_RM)?$"),
+ (instregex "BL8(_TLS|_NOP|_NOP_TLS|_TLS_)?(_RM)?$"),
+ (instregex "BLA(8|8_NOP)?(_RM)?$"),
(instregex "BLR(8|L)?$"),
(instregex "TAILB(A)?(8)?$"),
(instregex "TAILBCTR(8)?$"),
(instregex "gBC(A|Aat|CTR|CTRL|L|LA|LAat|LR|LRL|Lat|at)?$"),
(instregex "BCLR(L)?(n)?$"),
- (instregex "BCTR(L)?(8)?$"),
+ (instregex "BCTR(L)?(8)?(_RM)?$"),
B,
BA,
BC,
@@ -1321,6 +1321,8 @@ def : InstRW<[P9_BR_2C, DISP_BR_1C],
BCLn,
BCTRL8_LDinto_toc,
BCTRL_LWZinto_toc,
+ BCTRL8_LDinto_toc_RM,
+ BCTRL_LWZinto_toc_RM,
BCn,
CTRL_DEP
)>;
@@ -1430,5 +1432,6 @@ def : InstRW<[],
DCBI,
DCCCI,
ICCCI,
- ADDEX
+ ADDEX,
+ ADDEX8
)> { let Unsupported = 1; }
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index ce43ced57560..a1ff20bb3612 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -187,6 +187,22 @@ def FeatureAddisLoadFusion : SubtargetFeature<"fuse-addis-load",
def FeatureStoreFusion : SubtargetFeature<"fuse-store", "HasStoreFusion", "true",
"Target supports store clustering",
[FeatureFusion]>;
+def FeatureArithAddFusion :
+ SubtargetFeature<"fuse-arith-add", "HasArithAddFusion", "true",
+ "Target supports Arithmetic Operations with Add fusion",
+ [FeatureFusion]>;
+def FeatureAddLogicalFusion :
+ SubtargetFeature<"fuse-add-logical", "HasAddLogicalFusion", "true",
+ "Target supports Add with Logical Operations fusion",
+ [FeatureFusion]>;
+def FeatureLogicalAddFusion :
+ SubtargetFeature<"fuse-logical-add", "HasLogicalAddFusion", "true",
+ "Target supports Logical with Add Operations fusion",
+ [FeatureFusion]>;
+def FeatureLogicalFusion :
+ SubtargetFeature<"fuse-logical", "HasLogicalFusion", "true",
+ "Target supports Logical Operations fusion",
+ [FeatureFusion]>;
def FeatureUnalignedFloats :
SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess",
"true", "CPU does not trap on unaligned FP access">;
@@ -213,6 +229,9 @@ def FeatureSlowPOPCNTD : SubtargetFeature<"slow-popcntd","HasPOPCNTD",
def DeprecatedDST : SubtargetFeature<"", "DeprecatedDST", "true",
"Treat vector data stream cache control instructions as deprecated">;
+def FeatureISA2_06 : SubtargetFeature<"isa-v206-instructions", "IsISA2_06",
+ "true",
+ "Enable instructions in ISA 2.06.">;
def FeatureISA2_07 : SubtargetFeature<"isa-v207-instructions", "IsISA2_07",
"true",
"Enable instructions in ISA 2.07.">;
@@ -319,7 +338,8 @@ def ProcessorFeatures {
FeatureMFTB,
DeprecatedDST,
FeatureTwoConstNR,
- FeatureUnalignedFloats];
+ FeatureUnalignedFloats,
+ FeatureISA2_06];
list<SubtargetFeature> P7SpecificFeatures = [];
list<SubtargetFeature> P7Features =
!listconcat(P7InheritableFeatures, P7SpecificFeatures);
@@ -371,7 +391,10 @@ def ProcessorFeatures {
// Power10
// For P10 CPU we assume that all of the existing features from Power9
// still exist with the exception of those we know are Power9 specific.
- list<SubtargetFeature> FusionFeatures = [FeatureStoreFusion];
+ list<SubtargetFeature> FusionFeatures = [
+ FeatureStoreFusion, FeatureAddLogicalFusion, FeatureLogicalAddFusion,
+ FeatureLogicalFusion, FeatureArithAddFusion
+ ];
list<SubtargetFeature> P10AdditionalFeatures =
!listconcat(FusionFeatures, [
DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs,
@@ -586,8 +609,7 @@ def : ProcessorModel<"pwr6x", G5Model,
def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.P7Features>;
def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.P8Features>;
def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.P9Features>;
-// No scheduler model yet.
-def : ProcessorModel<"pwr10", P9Model, ProcessorFeatures.P10Features>;
+def : ProcessorModel<"pwr10", P10Model, ProcessorFeatures.P10Features>;
// No scheduler model for future CPU.
def : ProcessorModel<"future", NoSchedModel,
ProcessorFeatures.FutureFeatures>;
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index d0109f968446..a76963abb8e4 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -58,13 +58,13 @@
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCSymbolXCOFF.h"
#include "llvm/MC/SectionKind.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Process.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -1494,7 +1494,7 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) {
//
// Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number
// of instructions change.
- OutStreamer->emitCodeAlignment(8);
+ OutStreamer->emitCodeAlignment(8, &getSubtargetInfo());
MCSymbol *BeginOfSled = OutContext.createTempSymbol();
OutStreamer->emitLabel(BeginOfSled);
EmitToStreamer(*OutStreamer, RetInst);
@@ -2023,9 +2023,10 @@ void PPCAIXAsmPrinter::emitTracebackTable() {
// Set the 4th byte of the mandatory field.
FirstHalfOfMandatoryField |= TracebackTable::IsFunctionNamePresentMask;
- static_assert(XCOFF::AllocRegNo == 31, "Unexpected register usage!");
- if (MRI.isPhysRegUsed(Subtarget->isPPC64() ? PPC::X31 : PPC::R31,
- /* SkipRegMaskTest */ true))
+ const PPCRegisterInfo *RegInfo =
+ static_cast<const PPCRegisterInfo *>(Subtarget->getRegisterInfo());
+ Register FrameReg = RegInfo->getFrameRegister(*MF);
+ if (FrameReg == (Subtarget->isPPC64() ? PPC::X31 : PPC::R31))
FirstHalfOfMandatoryField |= TracebackTable::IsAllocaUsedMask;
const SmallVectorImpl<Register> &MustSaveCRs = FI->getMustSaveCRs();
@@ -2527,7 +2528,7 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) {
// Construct an aliasing list for each GlobalObject.
for (const auto &Alias : M.aliases()) {
- const GlobalObject *Base = Alias.getBaseObject();
+ const GlobalObject *Base = Alias.getAliaseeObject();
if (!Base)
report_fatal_error(
"alias without a base object is not yet supported on AIX");
diff --git a/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp
index 50ae4450a837..786a3e163540 100644
--- a/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp
+++ b/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp
@@ -291,7 +291,7 @@ bool PPCBranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) {
return false;
}
- // Sanity check - the block must be able to fall through
+ // The block must be able to fall through.
assert(Cand.BranchBlock->canFallThrough() &&
"Expecting the block to fall through!");
@@ -751,9 +751,8 @@ bool PPCBranchCoalescing::runOnMachineFunction(MachineFunction &MF) {
if (!canCoalesceBranch(Cand2))
break;
- // Sanity check
// The branch-taken block of the second candidate should post-dominate the
- // first candidate
+ // first candidate.
assert(MPDT->dominates(Cand2.BranchTargetBlock, Cand1.BranchBlock) &&
"Branch-taken block should post-dominate first candidate");
diff --git a/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp b/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
index 08b7bdb3ac1e..ff3d36d39fb2 100644
--- a/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
+++ b/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
@@ -25,9 +25,9 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -62,15 +62,14 @@ protected:
return Changed;
SmallVector<MachineBasicBlock*, 8> PredToRemove;
- for (MachineBasicBlock::pred_iterator PI = ReturnMBB.pred_begin(),
- PIE = ReturnMBB.pred_end(); PI != PIE; ++PI) {
+ for (MachineBasicBlock *Pred : ReturnMBB.predecessors()) {
bool OtherReference = false, BlockChanged = false;
- if ((*PI)->empty())
+ if (Pred->empty())
continue;
- for (MachineBasicBlock::iterator J = (*PI)->getLastNonDebugInstr();;) {
- if (J == (*PI)->end())
+ for (MachineBasicBlock::iterator J = Pred->getLastNonDebugInstr();;) {
+ if (J == Pred->end())
break;
if (J->getOpcode() == PPC::B) {
@@ -78,7 +77,7 @@ protected:
// This is an unconditional branch to the return. Replace the
// branch with a blr.
MachineInstr *MI = ReturnMBB.getParent()->CloneMachineInstr(&*I);
- (*PI)->insert(J, MI);
+ Pred->insert(J, MI);
MachineBasicBlock::iterator K = J--;
K->eraseFromParent();
@@ -95,7 +94,7 @@ protected:
MachineInstrBuilder(*ReturnMBB.getParent(), MI)
.add(J->getOperand(0))
.add(J->getOperand(1));
- (*PI)->insert(J, MI);
+ Pred->insert(J, MI);
MachineBasicBlock::iterator K = J--;
K->eraseFromParent();
@@ -112,7 +111,7 @@ protected:
TII->get(J->getOpcode() == PPC::BC ? PPC::BCLR : PPC::BCLRn));
MachineInstrBuilder(*ReturnMBB.getParent(), MI)
.add(J->getOperand(0));
- (*PI)->insert(J, MI);
+ Pred->insert(J, MI);
MachineBasicBlock::iterator K = J--;
K->eraseFromParent();
@@ -132,18 +131,18 @@ protected:
} else if (!J->isTerminator() && !J->isDebugInstr())
break;
- if (J == (*PI)->begin())
+ if (J == Pred->begin())
break;
--J;
}
- if ((*PI)->canFallThrough() && (*PI)->isLayoutSuccessor(&ReturnMBB))
+ if (Pred->canFallThrough() && Pred->isLayoutSuccessor(&ReturnMBB))
OtherReference = true;
// Predecessors are stored in a vector and can't be removed here.
if (!OtherReference && BlockChanged) {
- PredToRemove.push_back(*PI);
+ PredToRemove.push_back(Pred);
}
if (BlockChanged)
@@ -185,12 +184,9 @@ public:
// nothing to do.
if (MF.size() < 2)
return Changed;
-
- // We can't use a range-based for loop due to clobbering the iterator.
- for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E;) {
- MachineBasicBlock &B = *I++;
+
+ for (MachineBasicBlock &B : llvm::make_early_inc_range(MF))
Changed |= processBlock(B);
- }
return Changed;
}
diff --git a/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
index 9daef26ede47..be4c9dd60b00 100644
--- a/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
+++ b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
@@ -102,6 +102,16 @@ bool PPCExpandAtomicPseudo::expandMI(MachineBasicBlock &MBB, MachineInstr &MI,
return expandAtomicRMW128(MBB, MI, NMBBI);
case PPC::ATOMIC_CMP_SWAP_I128:
return expandAtomicCmpSwap128(MBB, MI, NMBBI);
+ case PPC::BUILD_QUADWORD: {
+ Register Dst = MI.getOperand(0).getReg();
+ Register DstHi = TRI->getSubReg(Dst, PPC::sub_gp8_x0);
+ Register DstLo = TRI->getSubReg(Dst, PPC::sub_gp8_x1);
+ Register Lo = MI.getOperand(1).getReg();
+ Register Hi = MI.getOperand(2).getReg();
+ PairedCopy(TII, MBB, MI, MI.getDebugLoc(), DstHi, DstLo, Hi, Lo);
+ MI.eraseFromParent();
+ return true;
+ }
default:
return false;
}
diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
index dfb2c1e5c0f5..856569bc8a73 100644
--- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -987,15 +987,16 @@ bool PPCFastISel::SelectFPTrunc(const Instruction *I) {
auto RC = MRI.getRegClass(SrcReg);
if (Subtarget->hasSPE()) {
DestReg = createResultReg(&PPC::GPRCRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(PPC::EFSCFD), DestReg)
- .addReg(SrcReg);
- } else if (isVSFRCRegClass(RC)) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::EFSCFD),
+ DestReg)
+ .addReg(SrcReg);
+ } else if (Subtarget->hasP8Vector() && isVSFRCRegClass(RC)) {
DestReg = createResultReg(&PPC::VSSRCRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(PPC::XSRSP), DestReg)
- .addReg(SrcReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::XSRSP),
+ DestReg)
+ .addReg(SrcReg);
} else {
+ SrcReg = copyRegToRegClass(&PPC::F8RCRegClass, SrcReg);
DestReg = createResultReg(&PPC::F4RCRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(PPC::FRSP), DestReg)
@@ -2467,9 +2468,9 @@ namespace llvm {
// Create the fast instruction selector for PowerPC64 ELF.
FastISel *PPC::createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo) {
- // Only available on 64-bit ELF for now.
+ // Only available on 64-bit for now.
const PPCSubtarget &Subtarget = FuncInfo.MF->getSubtarget<PPCSubtarget>();
- if (Subtarget.is64BitELFABI())
+ if (Subtarget.isPPC64())
return new PPCFastISel(FuncInfo, LibInfo);
return nullptr;
}
diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index 1de6b633d20a..fc3c7ec35b8d 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -279,11 +279,11 @@ static bool MustSaveLR(const MachineFunction &MF, unsigned LR) {
/// determineFrameLayoutAndUpdate - Determine the size of the frame and maximum
/// call frame size. Update the MachineFunction object with the stack size.
-unsigned
+uint64_t
PPCFrameLowering::determineFrameLayoutAndUpdate(MachineFunction &MF,
bool UseEstimate) const {
unsigned NewMaxCallFrameSize = 0;
- unsigned FrameSize = determineFrameLayout(MF, UseEstimate,
+ uint64_t FrameSize = determineFrameLayout(MF, UseEstimate,
&NewMaxCallFrameSize);
MF.getFrameInfo().setStackSize(FrameSize);
MF.getFrameInfo().setMaxCallFrameSize(NewMaxCallFrameSize);
@@ -292,7 +292,7 @@ PPCFrameLowering::determineFrameLayoutAndUpdate(MachineFunction &MF,
/// determineFrameLayout - Determine the size of the frame and maximum call
/// frame size.
-unsigned
+uint64_t
PPCFrameLowering::determineFrameLayout(const MachineFunction &MF,
bool UseEstimate,
unsigned *NewMaxCallFrameSize) const {
@@ -300,7 +300,7 @@ PPCFrameLowering::determineFrameLayout(const MachineFunction &MF,
const PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
// Get the number of bytes to allocate from the FrameInfo
- unsigned FrameSize =
+ uint64_t FrameSize =
UseEstimate ? MFI.estimateStackSize(MF) : MFI.getStackSize();
// Get stack alignments. The frame must be aligned to the greatest of these:
@@ -624,9 +624,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
assert((isSVR4ABI || Subtarget.isAIXABI()) && "Unsupported PPC ABI.");
// Work out frame sizes.
- unsigned FrameSize = determineFrameLayoutAndUpdate(MF);
- int NegFrameSize = -FrameSize;
- if (!isInt<32>(NegFrameSize))
+ uint64_t FrameSize = determineFrameLayoutAndUpdate(MF);
+ int64_t NegFrameSize = -FrameSize;
+ if (!isInt<32>(FrameSize) || !isInt<32>(NegFrameSize))
llvm_unreachable("Unhandled stack size!");
if (MFI.isFrameAddressTaken())
@@ -692,9 +692,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
SingleScratchReg = ScratchReg == TempReg;
- int LROffset = getReturnSaveOffset();
+ int64_t LROffset = getReturnSaveOffset();
- int FPOffset = 0;
+ int64_t FPOffset = 0;
if (HasFP) {
MachineFrameInfo &MFI = MF.getFrameInfo();
int FPIndex = FI->getFramePointerSaveIndex();
@@ -702,7 +702,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
FPOffset = MFI.getObjectOffset(FPIndex);
}
- int BPOffset = 0;
+ int64_t BPOffset = 0;
if (HasBP) {
MachineFrameInfo &MFI = MF.getFrameInfo();
int BPIndex = FI->getBasePointerSaveIndex();
@@ -710,7 +710,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
BPOffset = MFI.getObjectOffset(BPIndex);
}
- int PBPOffset = 0;
+ int64_t PBPOffset = 0;
if (FI->usesPICBase()) {
MachineFrameInfo &MFI = MF.getFrameInfo();
int PBPIndex = FI->getPICBasePointerSaveIndex();
@@ -854,7 +854,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
// ABI.
if (HasROPProtect) {
const int SaveIndex = FI->getROPProtectionHashSaveIndex();
- const int ImmOffset = MFI.getObjectOffset(SaveIndex);
+ const int64_t ImmOffset = MFI.getObjectOffset(SaveIndex);
assert((ImmOffset <= -8 && ImmOffset >= -512) &&
"ROP hash save offset out of range.");
assert(((ImmOffset & 0x7) == 0) &&
@@ -1212,7 +1212,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIRegister);
} else {
- int Offset = MFI.getObjectOffset(CSI[I].getFrameIdx());
+ int64_t Offset = MFI.getObjectOffset(CSI[I].getFrameIdx());
// We have changed the object offset above but we do not want to change
// the actual offsets in the CFI instruction so we have to undo the
// offset change here.
@@ -1550,7 +1550,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
const MachineFrameInfo &MFI = MF.getFrameInfo();
// Get the number of bytes allocated from the FrameInfo.
- int FrameSize = MFI.getStackSize();
+ int64_t FrameSize = MFI.getStackSize();
// Get processor type.
bool isPPC64 = Subtarget.isPPC64();
@@ -1592,9 +1592,9 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
: PPC::MTOCRF);
const MCInstrDesc &HashChk =
TII.get(HasPrivileged ? PPC::HASHCHKP : PPC::HASHCHK);
- int LROffset = getReturnSaveOffset();
+ int64_t LROffset = getReturnSaveOffset();
- int FPOffset = 0;
+ int64_t FPOffset = 0;
// Using the same bool variable as below to suppress compiler warnings.
bool SingleScratchReg = findScratchRegister(&MBB, true, false, &ScratchReg,
@@ -1610,14 +1610,14 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
FPOffset = MFI.getObjectOffset(FPIndex);
}
- int BPOffset = 0;
+ int64_t BPOffset = 0;
if (HasBP) {
int BPIndex = FI->getBasePointerSaveIndex();
assert(BPIndex && "No Base Pointer Save Slot!");
BPOffset = MFI.getObjectOffset(BPIndex);
}
- int PBPOffset = 0;
+ int64_t PBPOffset = 0;
if (FI->usesPICBase()) {
int PBPIndex = FI->getPICBasePointerSaveIndex();
assert(PBPIndex && "No PIC Base Pointer Save Slot!");
@@ -1865,7 +1865,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
// hash and then compare it to the hash stored in the prologue.
if (HasROPProtect) {
const int SaveIndex = FI->getROPProtectionHashSaveIndex();
- const int ImmOffset = MFI.getObjectOffset(SaveIndex);
+ const int64_t ImmOffset = MFI.getObjectOffset(SaveIndex);
assert((ImmOffset <= -8 && ImmOffset >= -512) &&
"ROP hash check location offset out of range.");
assert(((ImmOffset & 0x7) == 0) &&
@@ -2680,15 +2680,15 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters(
return true;
}
-unsigned PPCFrameLowering::getTOCSaveOffset() const {
+uint64_t PPCFrameLowering::getTOCSaveOffset() const {
return TOCSaveOffset;
}
-unsigned PPCFrameLowering::getFramePointerSaveOffset() const {
+uint64_t PPCFrameLowering::getFramePointerSaveOffset() const {
return FramePointerSaveOffset;
}
-unsigned PPCFrameLowering::getBasePointerSaveOffset() const {
+uint64_t PPCFrameLowering::getBasePointerSaveOffset() const {
return BasePointerSaveOffset;
}
diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/llvm/lib/Target/PowerPC/PPCFrameLowering.h
index b378c2739925..21883b19a575 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.h
@@ -21,12 +21,12 @@ class PPCSubtarget;
class PPCFrameLowering: public TargetFrameLowering {
const PPCSubtarget &Subtarget;
- const unsigned ReturnSaveOffset;
- const unsigned TOCSaveOffset;
- const unsigned FramePointerSaveOffset;
+ const uint64_t ReturnSaveOffset;
+ const uint64_t TOCSaveOffset;
+ const uint64_t FramePointerSaveOffset;
const unsigned LinkageSize;
- const unsigned BasePointerSaveOffset;
- const unsigned CRSaveOffset;
+ const uint64_t BasePointerSaveOffset;
+ const uint64_t CRSaveOffset;
// Map each group of one or two GPRs to corresponding VSR for spilling.
// TODO: Use local table in methods to avoid this mutable member.
@@ -88,7 +88,7 @@ public:
/**
* Determine the frame layout and update the machine function.
*/
- unsigned determineFrameLayoutAndUpdate(MachineFunction &MF,
+ uint64_t determineFrameLayoutAndUpdate(MachineFunction &MF,
bool UseEstimate = false) const;
/**
@@ -96,7 +96,7 @@ public:
* The MachineFunction object can be const in this case as it is not
* modified.
*/
- unsigned determineFrameLayout(const MachineFunction &MF,
+ uint64_t determineFrameLayout(const MachineFunction &MF,
bool UseEstimate = false,
unsigned *NewMaxCallFrameSize = nullptr) const;
@@ -146,19 +146,19 @@ public:
/// getReturnSaveOffset - Return the previous frame offset to save the
/// return address.
- unsigned getReturnSaveOffset() const { return ReturnSaveOffset; }
+ uint64_t getReturnSaveOffset() const { return ReturnSaveOffset; }
/// getTOCSaveOffset - Return the previous frame offset to save the
/// TOC register -- 64-bit SVR4 ABI only.
- unsigned getTOCSaveOffset() const;
+ uint64_t getTOCSaveOffset() const;
/// getFramePointerSaveOffset - Return the previous frame offset to save the
/// frame pointer.
- unsigned getFramePointerSaveOffset() const;
+ uint64_t getFramePointerSaveOffset() const;
/// getBasePointerSaveOffset - Return the previous frame offset to save the
/// base pointer.
- unsigned getBasePointerSaveOffset() const;
+ uint64_t getBasePointerSaveOffset() const;
/// getLinkageSize - Return the size of the PowerPC ABI linkage area.
///
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 606aae66196c..0abdf81d0908 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -262,6 +262,21 @@ namespace {
None) == PPC::AM_DForm;
}
+ /// SelectPCRelForm - Returns true if address N can be represented by
+ /// PC-Relative addressing mode.
+ bool SelectPCRelForm(SDNode *Parent, SDValue N, SDValue &Disp,
+ SDValue &Base) {
+ return PPCLowering->SelectOptimalAddrMode(Parent, N, Disp, Base, *CurDAG,
+ None) == PPC::AM_PCRel;
+ }
+
+ /// SelectPDForm - Returns true if address N can be represented by Prefixed
+ /// DForm addressing mode (a base register plus a signed 34-bit immediate).
+ bool SelectPDForm(SDNode *Parent, SDValue N, SDValue &Disp, SDValue &Base) {
+ return PPCLowering->SelectOptimalAddrMode(Parent, N, Disp, Base, *CurDAG,
+ None) == PPC::AM_PrefixDForm;
+ }
+
/// SelectXForm - Returns true if address N can be represented by the
/// addressing mode of XForm instructions (an indexed [r+r] operation).
bool SelectXForm(SDNode *Parent, SDValue N, SDValue &Disp, SDValue &Base) {
@@ -3186,7 +3201,7 @@ IntegerCompareEliminator::get32BitZExtCompare(SDValue LHS, SDValue RHS,
// by swapping inputs and falling through.
std::swap(LHS, RHS);
ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
- IsRHSZero = RHSConst && RHSConst->isNullValue();
+ IsRHSZero = RHSConst && RHSConst->isZero();
LLVM_FALLTHROUGH;
}
case ISD::SETLE: {
@@ -3236,7 +3251,7 @@ IntegerCompareEliminator::get32BitZExtCompare(SDValue LHS, SDValue RHS,
// (%b < %a) by swapping inputs and falling through.
std::swap(LHS, RHS);
ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
- IsRHSZero = RHSConst && RHSConst->isNullValue();
+ IsRHSZero = RHSConst && RHSConst->isZero();
IsRHSOne = RHSConst && RHSConst->getSExtValue() == 1;
LLVM_FALLTHROUGH;
}
@@ -3370,7 +3385,7 @@ IntegerCompareEliminator::get32BitSExtCompare(SDValue LHS, SDValue RHS,
// by swapping inputs and falling through.
std::swap(LHS, RHS);
ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
- IsRHSZero = RHSConst && RHSConst->isNullValue();
+ IsRHSZero = RHSConst && RHSConst->isZero();
LLVM_FALLTHROUGH;
}
case ISD::SETLE: {
@@ -3415,7 +3430,7 @@ IntegerCompareEliminator::get32BitSExtCompare(SDValue LHS, SDValue RHS,
// (%b < %a) by swapping inputs and falling through.
std::swap(LHS, RHS);
ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
- IsRHSZero = RHSConst && RHSConst->isNullValue();
+ IsRHSZero = RHSConst && RHSConst->isZero();
IsRHSOne = RHSConst && RHSConst->getSExtValue() == 1;
LLVM_FALLTHROUGH;
}
@@ -3528,7 +3543,7 @@ IntegerCompareEliminator::get64BitZExtCompare(SDValue LHS, SDValue RHS,
return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GEZExt);
std::swap(LHS, RHS);
ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
- IsRHSZero = RHSConst && RHSConst->isNullValue();
+ IsRHSZero = RHSConst && RHSConst->isZero();
LLVM_FALLTHROUGH;
}
case ISD::SETLE: {
@@ -3570,7 +3585,7 @@ IntegerCompareEliminator::get64BitZExtCompare(SDValue LHS, SDValue RHS,
}
std::swap(LHS, RHS);
ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
- IsRHSZero = RHSConst && RHSConst->isNullValue();
+ IsRHSZero = RHSConst && RHSConst->isZero();
IsRHSOne = RHSConst && RHSConst->getSExtValue() == 1;
LLVM_FALLTHROUGH;
}
@@ -3687,7 +3702,7 @@ IntegerCompareEliminator::get64BitSExtCompare(SDValue LHS, SDValue RHS,
return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GESExt);
std::swap(LHS, RHS);
ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
- IsRHSZero = RHSConst && RHSConst->isNullValue();
+ IsRHSZero = RHSConst && RHSConst->isZero();
LLVM_FALLTHROUGH;
}
case ISD::SETLE: {
@@ -3730,7 +3745,7 @@ IntegerCompareEliminator::get64BitSExtCompare(SDValue LHS, SDValue RHS,
}
std::swap(LHS, RHS);
ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
- IsRHSZero = RHSConst && RHSConst->isNullValue();
+ IsRHSZero = RHSConst && RHSConst->isZero();
IsRHSOne = RHSConst && RHSConst->getSExtValue() == 1;
LLVM_FALLTHROUGH;
}
@@ -4982,6 +4997,51 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
}
break;
+ case ISD::INTRINSIC_VOID: {
+ auto IntrinsicID = N->getConstantOperandVal(1);
+ if (IntrinsicID == Intrinsic::ppc_tdw || IntrinsicID == Intrinsic::ppc_tw) {
+ unsigned Opcode = IntrinsicID == Intrinsic::ppc_tdw ? PPC::TDI : PPC::TWI;
+ SDValue Ops[] = {N->getOperand(4), N->getOperand(2), N->getOperand(3)};
+ int16_t SImmOperand2;
+ int16_t SImmOperand3;
+ int16_t SImmOperand4;
+ bool isOperand2IntS16Immediate =
+ isIntS16Immediate(N->getOperand(2), SImmOperand2);
+ bool isOperand3IntS16Immediate =
+ isIntS16Immediate(N->getOperand(3), SImmOperand3);
+ // We will emit PPC::TD or PPC::TW if the 2nd and 3rd operands are reg +
+ // reg or imm + imm. The imm + imm form will be optimized to either an
+ // unconditional trap or a nop in a later pass.
+ if (isOperand2IntS16Immediate == isOperand3IntS16Immediate)
+ Opcode = IntrinsicID == Intrinsic::ppc_tdw ? PPC::TD : PPC::TW;
+ else if (isOperand3IntS16Immediate)
+ // The 2nd and 3rd operands are reg + imm.
+ Ops[2] = getI32Imm(int(SImmOperand3) & 0xFFFF, dl);
+ else {
+ // The 2nd and 3rd operands are imm + reg.
+ bool isOperand4IntS16Immediate =
+ isIntS16Immediate(N->getOperand(4), SImmOperand4);
+ (void)isOperand4IntS16Immediate;
+ assert(isOperand4IntS16Immediate &&
+ "The 4th operand is not an Immediate");
+ // We need to flip the condition immediate TO.
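+ // Swapping the compared operands also requires exchanging the less-than and
+ // greater-than trap conditions in TO (for both the signed and the unsigned
+ // pair); the equality bit is unaffected.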
+ int16_t TO = int(SImmOperand4) & 0x1F;
+ // We swap the first and second bits of TO if they are not the same.
+ if ((TO & 0x1) != ((TO & 0x2) >> 1))
+ TO = (TO & 0x1) ? TO + 1 : TO - 1;
+ // We swap the fourth and fifth bits of TO if they are not the same.
+ if ((TO & 0x8) != ((TO & 0x10) >> 1))
+ TO = (TO & 0x8) ? TO + 8 : TO - 8;
+ Ops[0] = getI32Imm(TO, dl);
+ Ops[1] = N->getOperand(3);
+ Ops[2] = getI32Imm(int(SImmOperand2) & 0xFFFF, dl);
+ }
+ CurDAG->SelectNodeTo(N, Opcode, MVT::Other, Ops);
+ return;
+ }
+ break;
+ }
+
case ISD::INTRINSIC_WO_CHAIN: {
// We emit the PPC::FSELS instruction here because of type conflicts with
// the comparison operand. The FSELS instruction is defined to use an 8-byte
@@ -5423,8 +5483,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N->getOperand(2)))
if (ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N->getOperand(3)))
- if (N1C->isNullValue() && N3C->isNullValue() &&
- N2C->getZExtValue() == 1ULL && CC == ISD::SETNE &&
+ if (N1C->isZero() && N3C->isZero() && N2C->getZExtValue() == 1ULL &&
+ CC == ISD::SETNE &&
// FIXME: Implement this optzn for PPC64.
N->getValueType(0) == MVT::i32) {
SDNode *Tmp =
@@ -5810,6 +5870,69 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
return;
}
}
+ case PPCISD::LD_SPLAT: {
+ // Here we want to handle splat loads of type v16i8 and v8i16 when there is
+ // no direct move; we don't need to use the stack for this case. If the target
+ // has direct move, we should be able to get the best selection in the .td file.
+ if (!Subtarget->hasAltivec() || Subtarget->hasDirectMove())
+ break;
+
+ EVT Type = N->getValueType(0);
+ if (Type != MVT::v16i8 && Type != MVT::v8i16)
+ break;
+
+ SDValue ZeroReg =
+ CurDAG->getRegister(Subtarget->isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+ Subtarget->isPPC64() ? MVT::i64 : MVT::i32);
+ unsigned LIOpcode = Subtarget->isPPC64() ? PPC::LI8 : PPC::LI;
+ // v16i8 LD_SPLAT addr
+ // ======>
+ // Mask = LVSR/LVSL 0, addr
+ // LoadLow = LVX 0, addr
+ // Perm = VPERM LoadLow, LoadLow, Mask
+ // Splat = VSPLTB 15/0, Perm
+ //
+ // v8i16 LD_SPLAT addr
+ // ======>
+ // Mask = LVSR/LVSL 0, addr
+ // LoadLow = LVX 0, addr
+ // LoadHigh = LVX (LI, 1), addr
+ // Perm = VPERM LoadLow, LoadHigh, Mask
+ // Splat = VSPLTH 7/0, Perm
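+ // For v8i16 the halfword at addr may straddle a 16-byte boundary, so a
+ // second LVX at addr + 1 fetches the following quadword (lvx ignores the
+ // low four bits of the effective address) and VPERM assembles the element
+ // from the two loads.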
+ unsigned SplatOp = (Type == MVT::v16i8) ? PPC::VSPLTB : PPC::VSPLTH;
+ unsigned SplatElemIndex =
+ Subtarget->isLittleEndian() ? ((Type == MVT::v16i8) ? 15 : 7) : 0;
+
+ SDNode *Mask = CurDAG->getMachineNode(
+ Subtarget->isLittleEndian() ? PPC::LVSR : PPC::LVSL, dl, Type, ZeroReg,
+ N->getOperand(1));
+
+ SDNode *LoadLow =
+ CurDAG->getMachineNode(PPC::LVX, dl, MVT::v16i8, MVT::Other,
+ {ZeroReg, N->getOperand(1), N->getOperand(0)});
+
+ SDNode *LoadHigh = LoadLow;
+ if (Type == MVT::v8i16) {
+ LoadHigh = CurDAG->getMachineNode(
+ PPC::LVX, dl, MVT::v16i8, MVT::Other,
+ {SDValue(CurDAG->getMachineNode(
+ LIOpcode, dl, MVT::i32,
+ CurDAG->getTargetConstant(1, dl, MVT::i8)),
+ 0),
+ N->getOperand(1), SDValue(LoadLow, 1)});
+ }
+
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(LoadHigh, 1));
+ transferMemOperands(N, LoadHigh);
+
+ SDNode *Perm =
+ CurDAG->getMachineNode(PPC::VPERM, dl, Type, SDValue(LoadLow, 0),
+ SDValue(LoadHigh, 0), SDValue(Mask, 0));
+ CurDAG->SelectNodeTo(N, SplatOp, Type,
+ CurDAG->getTargetConstant(SplatElemIndex, dl, MVT::i8),
+ SDValue(Perm, 0));
+ return;
+ }
}
SelectCode(N);
@@ -6153,9 +6276,7 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
// be folded with the isel so that we don't need to materialize a register
// containing zero.
bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) {
- for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
- UI != UE; ++UI) {
- SDNode *User = *UI;
+ for (const SDNode *User : N->uses()) {
if (!User->isMachineOpcode())
return false;
if (User->getMachineOpcode() != PPC::SELECT_I4 &&
@@ -6180,7 +6301,7 @@ bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) {
if (!C)
return false;
- if (!C->isNullValue())
+ if (!C->isZero())
return false;
}
@@ -6189,18 +6310,14 @@ bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) {
void PPCDAGToDAGISel::SwapAllSelectUsers(SDNode *N) {
SmallVector<SDNode *, 4> ToReplace;
- for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
- UI != UE; ++UI) {
- SDNode *User = *UI;
+ for (SDNode *User : N->uses()) {
assert((User->getMachineOpcode() == PPC::SELECT_I4 ||
User->getMachineOpcode() == PPC::SELECT_I8) &&
"Must have all select users");
ToReplace.push_back(User);
}
- for (SmallVector<SDNode *, 4>::iterator UI = ToReplace.begin(),
- UE = ToReplace.end(); UI != UE; ++UI) {
- SDNode *User = *UI;
+ for (SDNode *User : ToReplace) {
SDNode *ResNode =
CurDAG->getMachineNode(User->getMachineOpcode(), SDLoc(User),
User->getValueType(0), User->getOperand(0),
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 37358176f35e..ac952b240a48 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -601,6 +601,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::ppcf128, Custom);
// To handle counter-based loop conditions.
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);
@@ -1245,9 +1247,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
}
if (Subtarget.hasP9Altivec()) {
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
-
+ if (Subtarget.isISA3_1()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Legal);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Legal);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Legal);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Legal);
+ } else {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+ }
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
@@ -1256,9 +1265,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);
}
-
- if (Subtarget.isISA3_1())
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
}
if (Subtarget.pairedVectorMemops()) {
@@ -1286,8 +1292,12 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
}
- if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics())
+ if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics()) {
setMaxAtomicSizeInBitsSupported(128);
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::i128, Custom);
+ }
setBooleanContents(ZeroOrOneBooleanContent);
@@ -1301,6 +1311,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
+ setLibcallName(RTLIB::MULO_I64, nullptr);
}
if (!isPPC64)
@@ -1513,10 +1524,10 @@ void PPCTargetLowering::initializeAddrModeMap() {
PPC::MOF_RPlusSImm16Mult16 | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
PPC::MOF_NotAddNorCst | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
PPC::MOF_AddrIsSImm32 | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
- PPC::MOF_RPlusSImm16Mult16 | PPC::MOF_Vector256 | PPC::MOF_SubtargetP10,
- PPC::MOF_NotAddNorCst | PPC::MOF_Vector256 | PPC::MOF_SubtargetP10,
- PPC::MOF_AddrIsSImm32 | PPC::MOF_Vector256 | PPC::MOF_SubtargetP10,
};
+ AddrModesMap[PPC::AM_PrefixDForm] = {PPC::MOF_RPlusSImm34 |
+ PPC::MOF_SubtargetP10};
+ // TODO: Add mapping for quadword load/store.
}
/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
@@ -1550,7 +1561,7 @@ static void getMaxByValAlign(Type *Ty, Align &MaxAlign, Align MaxMaxAlign) {
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area.
-unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
+uint64_t PPCTargetLowering::getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const {
// 16byte and wider vectors are passed on 16byte boundary.
// The rest is 8 on PPC64 and 4 on PPC32 boundary.
@@ -1623,9 +1634,19 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::CALL: return "PPCISD::CALL";
case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP";
case PPCISD::CALL_NOTOC: return "PPCISD::CALL_NOTOC";
+ case PPCISD::CALL_RM:
+ return "PPCISD::CALL_RM";
+ case PPCISD::CALL_NOP_RM:
+ return "PPCISD::CALL_NOP_RM";
+ case PPCISD::CALL_NOTOC_RM:
+ return "PPCISD::CALL_NOTOC_RM";
case PPCISD::MTCTR: return "PPCISD::MTCTR";
case PPCISD::BCTRL: return "PPCISD::BCTRL";
case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC";
+ case PPCISD::BCTRL_RM:
+ return "PPCISD::BCTRL_RM";
+ case PPCISD::BCTRL_LOAD_TOC_RM:
+ return "PPCISD::BCTRL_LOAD_TOC_RM";
case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG";
case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE";
case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP";
@@ -1707,6 +1728,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::EXTRACT_VSX_REG: return "PPCISD::EXTRACT_VSX_REG";
case PPCISD::XXMFACC: return "PPCISD::XXMFACC";
case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
+ case PPCISD::ZEXT_LD_SPLAT: return "PPCISD::ZEXT_LD_SPLAT";
+ case PPCISD::SEXT_LD_SPLAT: return "PPCISD::SEXT_LD_SPLAT";
case PPCISD::FNMSUB: return "PPCISD::FNMSUB";
case PPCISD::STRICT_FADDRTZ:
return "PPCISD::STRICT_FADDRTZ";
@@ -2551,9 +2574,8 @@ static bool provablyDisjointOr(SelectionDAG &DAG, const SDValue &N) {
bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,
SDValue &Index,
SelectionDAG &DAG) const {
- for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
- UI != E; ++UI) {
- if (MemSDNode *Memop = dyn_cast<MemSDNode>(*UI)) {
+ for (SDNode *U : N->uses()) {
+ if (MemSDNode *Memop = dyn_cast<MemSDNode>(U)) {
if (Memop->getMemoryVT() == MVT::f64) {
Base = N.getOperand(0);
Index = N.getOperand(1);
@@ -3503,7 +3525,7 @@ SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// Leave comparisons against 0 and -1 alone for now, since they're usually
// optimized. FIXME: revisit this when we can custom lower all setcc
// optimizations.
- if (C->isAllOnesValue() || C->isNullValue())
+ if (C->isAllOnes() || C->isZero())
return SDValue();
}
@@ -4364,21 +4386,10 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
FuncInfo->addLiveInAttr(VReg, Flags);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
- SDValue Store;
-
- if (ObjSize==1 || ObjSize==2 || ObjSize==4) {
- EVT ObjType = (ObjSize == 1 ? MVT::i8 :
- (ObjSize == 2 ? MVT::i16 : MVT::i32));
- Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg,
- MachinePointerInfo(&*FuncArg), ObjType);
- } else {
- // For sizes that don't fit a truncating store (3, 5, 6, 7),
- // store the whole register as-is to the parameter save area
- // slot.
- Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(&*FuncArg));
- }
-
+ EVT ObjType = EVT::getIntegerVT(*DAG.getContext(), ObjSize * 8);
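+ // A truncating store of the iN type matching ObjSize writes exactly ObjSize
+ // bytes, so this also covers the 3-, 5-, 6- and 7-byte cases that previously
+ // stored the whole register.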
+ SDValue Store =
+ DAG.getTruncStore(Val.getValue(1), dl, Val, Arg,
+ MachinePointerInfo(&*FuncArg), ObjType);
MemOps.push_back(Store);
}
// Whether we copied from a register or not, advance the offset
@@ -4649,7 +4660,7 @@ static bool callsShareTOCBase(const Function *Caller, SDValue Callee,
// If we have an Alias we can try to get the function from there.
if (Alias) {
- const GlobalObject *GlobalObj = Alias->getBaseObject();
+ const GlobalObject *GlobalObj = Alias->getAliaseeObject();
F = dyn_cast<Function>(GlobalObj);
}
@@ -5174,13 +5185,14 @@ static inline bool isTOCSaveRestoreRequired(const PPCSubtarget &Subtarget) {
}
static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
- const Function &Caller,
- const SDValue &Callee,
+ const Function &Caller, const SDValue &Callee,
const PPCSubtarget &Subtarget,
- const TargetMachine &TM) {
+ const TargetMachine &TM,
+ bool IsStrictFPCall = false) {
if (CFlags.IsTailCall)
return PPCISD::TC_RETURN;
+ unsigned RetOpc = 0;
// This is a call through a function pointer.
if (CFlags.IsIndirect) {
// AIX and the 64-bit ELF ABIs need to maintain the TOC pointer accross
@@ -5191,28 +5203,46 @@ static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
// immediately followed by a load of the TOC pointer from the the stack save
// slot into gpr2. For 64-bit ELFv2 ABI with PCRel, do not restore the TOC
// as it is not saved or used.
- return isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
- : PPCISD::BCTRL;
- }
-
- if (Subtarget.isUsingPCRelativeCalls()) {
+ RetOpc = isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
+ : PPCISD::BCTRL;
+ } else if (Subtarget.isUsingPCRelativeCalls()) {
assert(Subtarget.is64BitELFABI() && "PC Relative is only on ELF ABI.");
- return PPCISD::CALL_NOTOC;
+ RetOpc = PPCISD::CALL_NOTOC;
+ } else if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
+ // The ABIs that maintain a TOC pointer across calls need to have a nop
+ // immediately following the call instruction if the caller and callee may
+ // have different TOC bases. At link time, if the linker determines the calls
+ // may not share a TOC base, the call is redirected to a trampoline inserted
+ // by the linker. The trampoline will (among other things) save the caller's
+ // TOC pointer at an ABI-designated offset in the linkage area and the
+ // linker will rewrite the nop to be a load of the TOC pointer from the
+ // linkage area into gpr2.
+ RetOpc = callsShareTOCBase(&Caller, Callee, TM) ? PPCISD::CALL
+ : PPCISD::CALL_NOP;
+ else
+ RetOpc = PPCISD::CALL;
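+ // Calls with strictfp semantics use the *_RM variants, which additionally
+ // mark the rounding-mode register (RM) as defined by the call.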
+ if (IsStrictFPCall) {
+ switch (RetOpc) {
+ default:
+ llvm_unreachable("Unknown call opcode");
+ case PPCISD::BCTRL_LOAD_TOC:
+ RetOpc = PPCISD::BCTRL_LOAD_TOC_RM;
+ break;
+ case PPCISD::BCTRL:
+ RetOpc = PPCISD::BCTRL_RM;
+ break;
+ case PPCISD::CALL_NOTOC:
+ RetOpc = PPCISD::CALL_NOTOC_RM;
+ break;
+ case PPCISD::CALL:
+ RetOpc = PPCISD::CALL_RM;
+ break;
+ case PPCISD::CALL_NOP:
+ RetOpc = PPCISD::CALL_NOP_RM;
+ break;
+ }
}
-
- // The ABIs that maintain a TOC pointer accross calls need to have a nop
- // immediately following the call instruction if the caller and callee may
- // have different TOC bases. At link time if the linker determines the calls
- // may not share a TOC base, the call is redirected to a trampoline inserted
- // by the linker. The trampoline will (among other things) save the callers
- // TOC pointer at an ABI designated offset in the linkage area and the linker
- // will rewrite the nop to be a load of the TOC pointer from the linkage area
- // into gpr2.
- if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
- return callsShareTOCBase(&Caller, Callee, TM) ? PPCISD::CALL
- : PPCISD::CALL_NOP;
-
- return PPCISD::CALL;
+ return RetOpc;
}
static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
@@ -5228,7 +5258,7 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
const GlobalValue *GV = G ? G->getGlobal() : nullptr;
return DAG.getTarget().shouldAssumeDSOLocal(*Mod, GV) &&
- !dyn_cast_or_null<GlobalIFunc>(GV);
+ !isa_and_nonnull<GlobalIFunc>(GV);
};
// The PLT is only used in 32-bit ELF PIC mode. Attempting to use the PLT in
@@ -5508,7 +5538,7 @@ SDValue PPCTargetLowering::FinishCall(
unsigned CallOpc =
getCallOpcode(CFlags, DAG.getMachineFunction().getFunction(), Callee,
- Subtarget, DAG.getTarget());
+ Subtarget, DAG.getTarget(), CB ? CB->isStrictFP() : false);
if (!CFlags.IsIndirect)
Callee = transformCallee(Callee, DAG, dl, Subtarget);
@@ -9066,6 +9096,34 @@ bool llvm::checkConvertToNonDenormSingle(APFloat &ArgAPFloat) {
return (!LosesInfo && !APFloatToConvert.isDenormal());
}
+static bool isValidSplatLoad(const PPCSubtarget &Subtarget, const SDValue &Op,
+ unsigned &Opcode) {
+ const SDNode *InputNode = Op.getOperand(0).getNode();
+ if (!InputNode || !ISD::isUNINDEXEDLoad(InputNode))
+ return false;
+
+ if (!Subtarget.hasVSX())
+ return false;
+
+ EVT Ty = Op->getValueType(0);
+ if (Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32 ||
+ Ty == MVT::v8i16 || Ty == MVT::v16i8)
+ return true;
+
+ if (Ty == MVT::v2i64) {
+ // Check the extension type when the input type is i32 and the output
+ // vector type is v2i64.
+ if (cast<LoadSDNode>(Op.getOperand(0))->getMemoryVT() == MVT::i32) {
+ if (ISD::isZEXTLoad(InputNode))
+ Opcode = PPCISD::ZEXT_LD_SPLAT;
+ if (ISD::isSEXTLoad(InputNode))
+ Opcode = PPCISD::SEXT_LD_SPLAT;
+ }
+ return true;
+ }
+ return false;
+}
+
// If this is a case we can't handle, return null and let the default
// expansion code take care of it. If we CAN select this case, and if it
// selects to a single instruction, return Op. Otherwise, if we can codegen
@@ -9129,17 +9187,26 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
}
if (!BVNIsConstantSplat || SplatBitSize > 32) {
+ unsigned NewOpcode = PPCISD::LD_SPLAT;
- bool IsPermutedLoad = false;
- const SDValue *InputLoad =
- getNormalLoadInput(Op.getOperand(0), IsPermutedLoad);
// Handle load-and-splat patterns as we have instructions that will do this
// in one go.
- if (InputLoad && DAG.isSplatValue(Op, true)) {
+ if (DAG.isSplatValue(Op, true) &&
+ isValidSplatLoad(Subtarget, Op, NewOpcode)) {
+ const SDValue *InputLoad = &Op.getOperand(0);
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
- // We have handling for 4 and 8 byte elements.
- unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits();
+ // If the input load is an extending load, it will be an i32 -> i64
+ // extending load and isValidSplatLoad() will update NewOpcode.
+ unsigned MemorySize = LD->getMemoryVT().getScalarSizeInBits();
+ unsigned ElementSize =
+ MemorySize * ((NewOpcode == PPCISD::LD_SPLAT) ? 1 : 2);
+
+ assert(((ElementSize == 2 * MemorySize)
+ ? (NewOpcode == PPCISD::ZEXT_LD_SPLAT ||
+ NewOpcode == PPCISD::SEXT_LD_SPLAT)
+ : (NewOpcode == PPCISD::LD_SPLAT)) &&
+ "Unmatched element size and opcode!\n");
// Checking for a single use of this load, we have to check for vector
// width (128 bits) / ElementSize uses (since each operand of the
@@ -9148,18 +9215,45 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
for (SDValue BVInOp : Op->ops())
if (BVInOp.isUndef())
NumUsesOfInputLD--;
+
+ // Exclude some cases where LD_SPLAT is worse than scalar_to_vector:
+ // The cases below would also apply to "lfiwzx/lfiwax + LE target + index
+ // 1", "lxvrhx + BE target + index 7" and "lxvrbx + BE target + index
+ // 15", but isValidSplatLoad() currently only returns true when the element
+ // at index 0 is a valid load, so we will not get into trouble for these
+ // cases.
+ //
+ // case 1 - lfiwzx/lfiwax
+ // 1.1: load result is i32 and is sign/zero extend to i64;
+ // 1.2: build a v2i64 vector type with above loaded value;
+ // 1.3: the vector has only one value at index 0, others are all undef;
+ // 1.4: on BE target, so that lfiwzx/lfiwax does not need any permute.
+ if (NumUsesOfInputLD == 1 &&
+ (Op->getValueType(0) == MVT::v2i64 && NewOpcode != PPCISD::LD_SPLAT &&
+ !Subtarget.isLittleEndian() && Subtarget.hasVSX() &&
+ Subtarget.hasLFIWAX()))
+ return SDValue();
+
+ // case 2 - lxvr[hb]x
+ // 2.1: load result is at most i16;
+ // 2.2: build a vector with above loaded value;
+ // 2.3: the vector has only one value at index 0, others are all undef;
+ // 2.4: on LE target, so that lxvr[hb]x does not need any permute.
+ if (NumUsesOfInputLD == 1 && Subtarget.isLittleEndian() &&
+ Subtarget.isISA3_1() && ElementSize <= 16)
+ return SDValue();
+
assert(NumUsesOfInputLD > 0 && "No uses of input LD of a build_vector?");
if (InputLoad->getNode()->hasNUsesOfValue(NumUsesOfInputLD, 0) &&
- ((Subtarget.hasVSX() && ElementSize == 64) ||
- (Subtarget.hasP9Vector() && ElementSize == 32))) {
+ Subtarget.hasVSX()) {
SDValue Ops[] = {
LD->getChain(), // Chain
LD->getBasePtr(), // Ptr
DAG.getValueType(Op.getValueType()) // VT
};
SDValue LdSplt = DAG.getMemIntrinsicNode(
- PPCISD::LD_SPLAT, dl, DAG.getVTList(Op.getValueType(), MVT::Other),
- Ops, LD->getMemoryVT(), LD->getMemOperand());
+ NewOpcode, dl, DAG.getVTList(Op.getValueType(), MVT::Other), Ops,
+ LD->getMemoryVT(), LD->getMemOperand());
// Replace all uses of the output chain of the original load with the
// output chain of the new load.
DAG.ReplaceAllUsesOfValueWith(InputLoad->getValue(1),
@@ -10368,6 +10462,71 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
return DAG.getMergeValues(RetOps, dl);
}
+
+ case Intrinsic::ppc_unpack_longdouble: {
+ auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&
+ "Argument of long double unpack must be 0 or 1!");
+ return DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::f64, Op.getOperand(1),
+ DAG.getConstant(!!(Idx->getSExtValue()), dl,
+ Idx->getValueType(0)));
+ }
+
+ case Intrinsic::ppc_compare_exp_lt:
+ case Intrinsic::ppc_compare_exp_gt:
+ case Intrinsic::ppc_compare_exp_eq:
+ case Intrinsic::ppc_compare_exp_uo: {
+ unsigned Pred;
+ switch (IntrinsicID) {
+ case Intrinsic::ppc_compare_exp_lt:
+ Pred = PPC::PRED_LT;
+ break;
+ case Intrinsic::ppc_compare_exp_gt:
+ Pred = PPC::PRED_GT;
+ break;
+ case Intrinsic::ppc_compare_exp_eq:
+ Pred = PPC::PRED_EQ;
+ break;
+ case Intrinsic::ppc_compare_exp_uo:
+ Pred = PPC::PRED_UN;
+ break;
+ }
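+ // Compare the exponents with XSCMPEXPDP and then materialize an i32 0/1
+ // result by selecting on the requested CR predicate.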
+ return SDValue(
+ DAG.getMachineNode(
+ PPC::SELECT_CC_I4, dl, MVT::i32,
+ {SDValue(DAG.getMachineNode(PPC::XSCMPEXPDP, dl, MVT::i32,
+ Op.getOperand(1), Op.getOperand(2)),
+ 0),
+ DAG.getConstant(1, dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32),
+ DAG.getTargetConstant(Pred, dl, MVT::i32)}),
+ 0);
+ }
+ case Intrinsic::ppc_test_data_class_d:
+ case Intrinsic::ppc_test_data_class_f: {
+ unsigned CmprOpc = PPC::XSTSTDCDP;
+ if (IntrinsicID == Intrinsic::ppc_test_data_class_f)
+ CmprOpc = PPC::XSTSTDCSP;
+ return SDValue(
+ DAG.getMachineNode(
+ PPC::SELECT_CC_I4, dl, MVT::i32,
+ {SDValue(DAG.getMachineNode(CmprOpc, dl, MVT::i32, Op.getOperand(2),
+ Op.getOperand(1)),
+ 0),
+ DAG.getConstant(1, dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32),
+ DAG.getTargetConstant(PPC::PRED_EQ, dl, MVT::i32)}),
+ 0);
+ }
+ case Intrinsic::ppc_convert_f128_to_ppcf128:
+ case Intrinsic::ppc_convert_ppcf128_to_f128: {
+ RTLIB::Libcall LC = IntrinsicID == Intrinsic::ppc_convert_ppcf128_to_f128
+ ? RTLIB::CONVERT_PPCF128_F128
+ : RTLIB::CONVERT_F128_PPCF128;
+ MakeLibCallOptions CallOptions;
+ std::pair<SDValue, SDValue> Result =
+ makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(1), CallOptions,
+ dl, SDValue());
+ return Result.first;
+ }
}
// If this is a lowered altivec predicate compare, CompareOpc is set to the
@@ -10443,11 +10602,18 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
case Intrinsic::ppc_cfence: {
assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument.");
assert(Subtarget.isPPC64() && "Only 64-bit is supported for now.");
- return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other,
- DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64,
- Op.getOperand(ArgStart + 1)),
- Op.getOperand(0)),
- 0);
+ SDValue Val = Op.getOperand(ArgStart + 1);
+ EVT Ty = Val.getValueType();
+ if (Ty == MVT::i128) {
+ // FIXME: Testing one of two paired registers is sufficient to guarantee
+ // ordering?
+ Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, Val);
+ }
+ return SDValue(
+ DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other,
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Val),
+ Op.getOperand(0)),
+ 0);
}
default:
break;
@@ -10510,6 +10676,59 @@ SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO);
}
+SDValue PPCTargetLowering::LowerATOMIC_LOAD_STORE(SDValue Op,
+ SelectionDAG &DAG) const {
+ AtomicSDNode *N = cast<AtomicSDNode>(Op.getNode());
+ EVT MemVT = N->getMemoryVT();
+ assert(MemVT.getSimpleVT() == MVT::i128 &&
+ "Expect quadword atomic operations");
+ SDLoc dl(N);
+ unsigned Opc = N->getOpcode();
+ switch (Opc) {
+ case ISD::ATOMIC_LOAD: {
+ // Lower a quadword atomic load to int_ppc_atomic_load_i128, which will be
+ // lowered to PPC instructions by the pattern-matching instruction selector.
+ SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
+ SmallVector<SDValue, 4> Ops{
+ N->getOperand(0),
+ DAG.getConstant(Intrinsic::ppc_atomic_load_i128, dl, MVT::i32)};
+ for (int I = 1, E = N->getNumOperands(); I < E; ++I)
+ Ops.push_back(N->getOperand(I));
+ SDValue LoadedVal = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, Tys,
+ Ops, MemVT, N->getMemOperand());
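+ // The intrinsic yields the quadword as two i64 halves plus a chain; treat
+ // result 0 as the low doubleword and result 1 as the high doubleword,
+ // zero-extend both to i128 and OR in the high half shifted left by 64 to
+ // rebuild the value.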
+ SDValue ValLo = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i128, LoadedVal);
+ SDValue ValHi =
+ DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i128, LoadedVal.getValue(1));
+ ValHi = DAG.getNode(ISD::SHL, dl, MVT::i128, ValHi,
+ DAG.getConstant(64, dl, MVT::i32));
+ SDValue Val =
+ DAG.getNode(ISD::OR, dl, {MVT::i128, MVT::Other}, {ValLo, ValHi});
+ return DAG.getNode(ISD::MERGE_VALUES, dl, {MVT::i128, MVT::Other},
+ {Val, LoadedVal.getValue(2)});
+ }
+ case ISD::ATOMIC_STORE: {
+ // Lower a quadword atomic store to int_ppc_atomic_store_i128, which will be
+ // lowered to PPC instructions by the pattern-matching instruction selector.
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ SmallVector<SDValue, 4> Ops{
+ N->getOperand(0),
+ DAG.getConstant(Intrinsic::ppc_atomic_store_i128, dl, MVT::i32)};
+ SDValue Val = N->getOperand(2);
+ SDValue ValLo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, Val);
+ SDValue ValHi = DAG.getNode(ISD::SRL, dl, MVT::i128, Val,
+ DAG.getConstant(64, dl, MVT::i32));
+ ValHi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, ValHi);
+ Ops.push_back(ValLo);
+ Ops.push_back(ValHi);
+ Ops.push_back(N->getOperand(1));
+ return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, Tys, Ops, MemVT,
+ N->getMemOperand());
+ }
+ default:
+ llvm_unreachable("Unexpected atomic opcode");
+ }
+}
+
SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -10537,7 +10756,6 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SDLoc dl(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
- SDValue V3 = Op.getOperand(2);
if (VT == MVT::v2f64 && C)
return Op;
@@ -10546,18 +10764,10 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
if ((VT == MVT::v2i64 || VT == MVT::v2f64) && !Subtarget.isPPC64())
return SDValue();
// On P10, we have legal lowering for constant and variable indices for
- // integer vectors.
+ // all vectors.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
- VT == MVT::v2i64)
- return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, V2, V3);
- // For f32 and f64 vectors, we have legal lowering for variable indices.
- // For f32 we also have legal lowering when the element is loaded from
- // memory.
- if (VT == MVT::v4f32 || VT == MVT::v2f64) {
- if (!C || (VT == MVT::v4f32 && dyn_cast<LoadSDNode>(V2)))
- return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, V2, V3);
+ VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
return Op;
- }
}
// Before P10, we have legal lowering for constant indices but not for
@@ -10901,6 +11111,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerBSWAP(Op, DAG);
case ISD::ATOMIC_CMP_SWAP:
return LowerATOMIC_CMP_SWAP(Op, DAG);
+ case ISD::ATOMIC_STORE:
+ return LowerATOMIC_LOAD_STORE(Op, DAG);
}
}
@@ -10911,6 +11123,12 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
switch (N->getOpcode()) {
default:
llvm_unreachable("Do not know how to custom type legalize this operation!");
+ case ISD::ATOMIC_LOAD: {
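+ // i128 is not a legal type, so quadword atomic loads are legalized here;
+ // forward the lowered value and its chain as the results.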
+ SDValue Res = LowerATOMIC_LOAD_STORE(SDValue(N, 0), DAG);
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ break;
+ }
case ISD::READCYCLECOUNTER: {
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));
@@ -10937,6 +11155,18 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(NewInt.getValue(1));
break;
}
+ case ISD::INTRINSIC_WO_CHAIN: {
+ switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
+ case Intrinsic::ppc_pack_longdouble:
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128,
+ N->getOperand(2), N->getOperand(1)));
+ break;
+ case Intrinsic::ppc_convert_f128_to_ppcf128:
+ Results.push_back(LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG));
+ break;
+ }
+ break;
+ }
case ISD::VAARG: {
if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64())
return;
@@ -12647,6 +12877,24 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY))
.addDef(Hi)
.addUse(Src, 0, PPC::sub_gp8_x0);
+ } else if (MI.getOpcode() == PPC::LQX_PSEUDO ||
+ MI.getOpcode() == PPC::STQX_PSEUDO) {
+ DebugLoc DL = MI.getDebugLoc();
+ // Ptr is used as the ptr_rc_no_r0 part of LQ/STQ's memory operand and
+ // holds the result of adding RA and RB, so it has to be
+ // g8rc_and_g8rc_nox0.
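+ // lq/stq only accept a base-plus-displacement memory operand, so the indexed
+ // address is first materialized with an add and a zero displacement is used.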
+ Register Ptr =
+ F->getRegInfo().createVirtualRegister(&PPC::G8RC_and_G8RC_NOX0RegClass);
+ Register Val = MI.getOperand(0).getReg();
+ Register RA = MI.getOperand(1).getReg();
+ Register RB = MI.getOperand(2).getReg();
+ BuildMI(*BB, MI, DL, TII->get(PPC::ADD8), Ptr).addReg(RA).addReg(RB);
+ BuildMI(*BB, MI, DL,
+ MI.getOpcode() == PPC::LQX_PSEUDO ? TII->get(PPC::LQ)
+ : TII->get(PPC::STQ))
+ .addReg(Val, MI.getOpcode() == PPC::LQX_PSEUDO ? RegState::Define : 0)
+ .addImm(0)
+ .addReg(Ptr);
} else {
llvm_unreachable("Unexpected instr type to insert");
}
@@ -12951,12 +13199,12 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
return true;
- for (SDNode::use_iterator UI = LoadRoot->use_begin(),
- UE = LoadRoot->use_end(); UI != UE; ++UI)
- if (((isa<MemSDNode>(*UI) &&
- cast<MemSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
- UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI))
- Queue.push_back(*UI);
+ for (SDNode *U : LoadRoot->uses())
+ if (((isa<MemSDNode>(U) &&
+ cast<MemSDNode>(U)->getChain().getNode() == LoadRoot) ||
+ U->getOpcode() == ISD::TokenFactor) &&
+ !Visited.count(U))
+ Queue.push_back(U);
}
}
@@ -13013,11 +13261,9 @@ SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
// If all users of SETCC extend its value to a legal integer type
// then we replace SETCC with a subtraction
- for (SDNode::use_iterator UI = N->use_begin(),
- UE = N->use_end(); UI != UE; ++UI) {
- if (UI->getOpcode() != ISD::ZERO_EXTEND)
+ for (const SDNode *U : N->uses())
+ if (U->getOpcode() != ISD::ZERO_EXTEND)
return SDValue();
- }
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
auto OpSize = N->getOperand(0).getValueSizeInBits();
@@ -13194,10 +13440,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
if (isa<ConstantSDNode>(Inputs[i]))
continue;
- for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
- UE = Inputs[i].getNode()->use_end();
- UI != UE; ++UI) {
- SDNode *User = *UI;
+ for (const SDNode *User : Inputs[i].getNode()->uses()) {
if (User != N && !Visited.count(User))
return SDValue();
@@ -13218,10 +13461,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
}
for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
- for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
- UE = PromOps[i].getNode()->use_end();
- UI != UE; ++UI) {
- SDNode *User = *UI;
+ for (const SDNode *User : PromOps[i].getNode()->uses()) {
if (User != N && !Visited.count(User))
return SDValue();
@@ -13406,10 +13646,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
if (isa<ConstantSDNode>(Inputs[i]))
continue;
- for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
- UE = Inputs[i].getNode()->use_end();
- UI != UE; ++UI) {
- SDNode *User = *UI;
+ for (SDNode *User : Inputs[i].getNode()->uses()) {
if (User != N && !Visited.count(User))
return SDValue();
@@ -13431,10 +13668,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
}
for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
- for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
- UE = PromOps[i].getNode()->use_end();
- UI != UE; ++UI) {
- SDNode *User = *UI;
+ for (SDNode *User : PromOps[i].getNode()->uses()) {
if (User != N && !Visited.count(User))
return SDValue();
@@ -14753,8 +14987,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
break;
case PPCISD::SRA:
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
- if (C->isNullValue() || // 0 >>s V -> 0.
- C->isAllOnesValue()) // -1 >>s V -> -1.
+ if (C->isZero() || // 0 >>s V -> 0.
+ C->isAllOnes()) // -1 >>s V -> -1.
return N->getOperand(0);
}
break;
@@ -15126,39 +15360,36 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
int Bits = 4 /* 16 byte alignment */;
if (DAG.MaskedValueIsZero(Add->getOperand(1),
- APInt::getAllOnesValue(Bits /* alignment */)
+ APInt::getAllOnes(Bits /* alignment */)
.zext(Add.getScalarValueSizeInBits()))) {
SDNode *BasePtr = Add->getOperand(0).getNode();
- for (SDNode::use_iterator UI = BasePtr->use_begin(),
- UE = BasePtr->use_end();
- UI != UE; ++UI) {
- if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
- cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() ==
- IID) {
+ for (SDNode *U : BasePtr->uses()) {
+ if (U->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+ cast<ConstantSDNode>(U->getOperand(0))->getZExtValue() == IID) {
// We've found another LVSL/LVSR, and this address is an aligned
// multiple of that one. The results will be the same, so use the
// one we've just found instead.
- return SDValue(*UI, 0);
+ return SDValue(U, 0);
}
}
}
if (isa<ConstantSDNode>(Add->getOperand(1))) {
SDNode *BasePtr = Add->getOperand(0).getNode();
- for (SDNode::use_iterator UI = BasePtr->use_begin(),
- UE = BasePtr->use_end(); UI != UE; ++UI) {
- if (UI->getOpcode() == ISD::ADD &&
- isa<ConstantSDNode>(UI->getOperand(1)) &&
+ for (SDNode *U : BasePtr->uses()) {
+ if (U->getOpcode() == ISD::ADD &&
+ isa<ConstantSDNode>(U->getOperand(1)) &&
(cast<ConstantSDNode>(Add->getOperand(1))->getZExtValue() -
- cast<ConstantSDNode>(UI->getOperand(1))->getZExtValue()) %
- (1ULL << Bits) == 0) {
- SDNode *OtherAdd = *UI;
- for (SDNode::use_iterator VI = OtherAdd->use_begin(),
- VE = OtherAdd->use_end(); VI != VE; ++VI) {
- if (VI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
- cast<ConstantSDNode>(VI->getOperand(0))->getZExtValue() == IID) {
- return SDValue(*VI, 0);
+ cast<ConstantSDNode>(U->getOperand(1))->getZExtValue()) %
+ (1ULL << Bits) ==
+ 0) {
+ SDNode *OtherAdd = U;
+ for (SDNode *V : OtherAdd->uses()) {
+ if (V->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+ cast<ConstantSDNode>(V->getOperand(0))->getZExtValue() ==
+ IID) {
+ return SDValue(V, 0);
}
}
}
@@ -15482,13 +15713,13 @@ PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
if (VT == MVT::i64 && !Subtarget.isPPC64())
return SDValue();
if ((VT != MVT::i32 && VT != MVT::i64) ||
- !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
+ !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
- bool IsNegPow2 = (-Divisor).isPowerOf2();
+ bool IsNegPow2 = Divisor.isNegatedPowerOf2();
unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countTrailingZeros();
SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT);
@@ -15546,6 +15777,18 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known.Zero = ~1U; // All bits but the low one are known to be zero.
break;
}
+ break;
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) {
+ default:
+ break;
+ case Intrinsic::ppc_load2r:
+ // Top bits are cleared for load2r (which is the same as lhbrx).
+ Known.Zero = 0xFFFF0000;
+ break;
+ }
+ break;
}
}
}
@@ -15960,7 +16203,12 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
auto PtrVT = getPointerTy(MF.getDataLayout());
if (Depth > 0) {
- SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+ // The link register (return address) is saved in the caller's frame
+ // not the callee's stack frame. So we must get the caller's frame
+ // address and load the return address at the LR offset from there.
+ SDValue FrameAddr =
+ DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
+ LowerFRAMEADDR(Op, DAG), MachinePointerInfo());
SDValue Offset =
DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl,
isPPC64 ? MVT::i64 : MVT::i32);
@@ -16077,6 +16325,22 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
MachineMemOperand::MOVolatile;
return true;
+ case Intrinsic::ppc_atomic_load_i128:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::i128;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = Align(16);
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
+ return true;
+ case Intrinsic::ppc_atomic_store_i128:
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::i128;
+ Info.ptrVal = I.getArgOperand(2);
+ Info.offset = 0;
+ Info.align = Align(16);
+ Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
+ return true;
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
case Intrinsic::ppc_altivec_lvebx:
@@ -17146,6 +17410,9 @@ PPC::AddrMode PPCTargetLowering::getAddrModeForFlags(unsigned Flags) const {
for (auto FlagSet : AddrModesMap.at(PPC::AM_DQForm))
if ((Flags & FlagSet) == FlagSet)
return PPC::AM_DQForm;
+ for (auto FlagSet : AddrModesMap.at(PPC::AM_PrefixDForm))
+ if ((Flags & FlagSet) == FlagSet)
+ return PPC::AM_PrefixDForm;
// If no other forms are selected, return an X-Form as it is the most
// general addressing mode.
return PPC::AM_XForm;
@@ -17236,6 +17503,14 @@ static void computeFlagsForAddressComputation(SDValue N, unsigned &FlagSet,
}
}
+static bool isPCRelNode(SDValue N) {
+ return (N.getOpcode() == PPCISD::MAT_PCREL_ADDR ||
+ isValidPCRelNode<ConstantPoolSDNode>(N) ||
+ isValidPCRelNode<GlobalAddressSDNode>(N) ||
+ isValidPCRelNode<JumpTableSDNode>(N) ||
+ isValidPCRelNode<BlockAddressSDNode>(N));
+}
+
/// computeMOFlags - Given a node N and it's Parent (a MemSDNode), compute
/// the address flags of the load/store instruction that is to be matched.
unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N,
@@ -17253,6 +17528,26 @@ unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N,
if (Subtarget.hasSPE())
FlagSet |= PPC::MOF_SubtargetSPE;
+ // Check if we have a PCRel node and return early.
+ if ((FlagSet & PPC::MOF_SubtargetP10) && isPCRelNode(N))
+ return FlagSet;
+
+ // If the node is the paired load/store intrinsics, compute flags for
+ // address computation and return early.
+ unsigned ParentOp = Parent->getOpcode();
+ if (Subtarget.isISA3_1() && ((ParentOp == ISD::INTRINSIC_W_CHAIN) ||
+ (ParentOp == ISD::INTRINSIC_VOID))) {
+ unsigned ID = cast<ConstantSDNode>(Parent->getOperand(1))->getZExtValue();
+ assert(
+ ((ID == Intrinsic::ppc_vsx_lxvp) || (ID == Intrinsic::ppc_vsx_stxvp)) &&
+ "Only the paired load and store (lxvp/stxvp) intrinsics are valid.");
+ SDValue IntrinOp = (ID == Intrinsic::ppc_vsx_lxvp) ? Parent->getOperand(2)
+ : Parent->getOperand(3);
+ computeFlagsForAddressComputation(IntrinOp, FlagSet, DAG);
+ FlagSet |= PPC::MOF_Vector;
+ return FlagSet;
+ }
+
// Mark this as something we don't want to handle here if it is atomic
// or pre-increment instruction.
if (const LSBaseSDNode *LSB = dyn_cast<LSBaseSDNode>(Parent))
@@ -17266,7 +17561,8 @@ unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N,
EVT MemVT = MN->getMemoryVT();
unsigned Size = MemVT.getSizeInBits();
if (MemVT.isScalarInteger()) {
- assert(Size <= 64 && "Not expecting scalar integers larger than 8 bytes!");
+ assert(Size <= 128 &&
+ "Not expecting scalar integers larger than 16 bytes!");
if (Size < 32)
FlagSet |= PPC::MOF_SubWordInt;
else if (Size == 32)
@@ -17276,9 +17572,12 @@ unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N,
} else if (MemVT.isVector() && !MemVT.isFloatingPoint()) { // Integer vectors.
if (Size == 128)
FlagSet |= PPC::MOF_Vector;
- else if (Size == 256)
- FlagSet |= PPC::MOF_Vector256;
- else
+ else if (Size == 256) {
+ assert(Subtarget.pairedVectorMemops() &&
+ "256-bit vectors are only available when paired vector memops is "
+ "enabled!");
+ FlagSet |= PPC::MOF_Vector;
+ } else
llvm_unreachable("Not expecting illegal vectors!");
} else { // Floating point type: can be scalar, f128 or vector types.
if (Size == 32 || Size == 64)
@@ -17396,6 +17695,14 @@ PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent,
// Select an X-Form load if it is not.
setXFormForUnalignedFI(N, Flags, Mode);
+ // Set the mode to PC-Relative addressing mode if we have a valid PC-Rel node.
+ if ((Mode == PPC::AM_XForm) && isPCRelNode(N)) {
+ assert(Subtarget.isUsingPCRelativeCalls() &&
+ "Must be using PC-Relative calls when a valid PC-Relative node is "
+ "present!");
+ Mode = PPC::AM_PCRel;
+ }
+
// Set Base and Disp accordingly depending on the address mode.
switch (Mode) {
case PPC::AM_DForm:
@@ -17467,6 +17774,30 @@ PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent,
Base = N;
break;
}
+ case PPC::AM_PrefixDForm: {
+ int64_t Imm34 = 0;
+ unsigned Opcode = N.getOpcode();
+ if (((Opcode == ISD::ADD) || (Opcode == ISD::OR)) &&
+ (isIntS34Immediate(N.getOperand(1), Imm34))) {
+ // N is an ADD/OR node and its second operand is a 34-bit signed immediate.
+ Disp = DAG.getTargetConstant(Imm34, DL, N.getValueType());
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
+ Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+ else
+ Base = N.getOperand(0);
+ } else if (isIntS34Immediate(N, Imm34)) {
+ // The address is a 34-bit signed immediate.
+ Disp = DAG.getTargetConstant(Imm34, DL, N.getValueType());
+ Base = DAG.getRegister(PPC::ZERO8, N.getValueType());
+ }
+ break;
+ }
+ case PPC::AM_PCRel: {
+ // When selecting PC-Relative instructions, "Base" is not utilized as
+ // we select the address as [PC+imm].
+ Disp = N;
+ break;
+ }
case PPC::AM_None:
break;
default: { // By default, X-Form is always available to be selected.
@@ -17503,10 +17834,7 @@ PPCTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
TargetLowering::AtomicExpansionKind
PPCTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
- unsigned Size = AI->getPointerOperand()
- ->getType()
- ->getPointerElementType()
- ->getPrimitiveSizeInBits();
+ unsigned Size = AI->getNewValOperand()->getType()->getPrimitiveSizeInBits();
if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && Size == 128)
return AtomicExpansionKind::MaskedIntrinsic;
return TargetLowering::shouldExpandAtomicCmpXchgInIR(AI);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 87579bad118f..34dce2c3172d 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -200,6 +200,14 @@ namespace llvm {
/// and 64-bit AIX.
BCTRL_LOAD_TOC,
+ /// The variants that implicitly define rounding mode for calls with
+ /// strictfp semantics.
+ CALL_RM,
+ CALL_NOP_RM,
+ CALL_NOTOC_RM,
+ BCTRL_RM,
+ BCTRL_LOAD_TOC_RM,
+
/// Return with a flag operand, matched by 'blr'
RET_FLAG,
@@ -494,6 +502,11 @@ namespace llvm {
/// Constrained floating point add in round-to-zero mode.
STRICT_FADDRTZ,
+ // NOTE: The nodes below may require PC-Rel specific patterns if the
+ // address could be PC-Relative. When adding new nodes below, consider
+ // whether or not the address can be PC-Relative and add the corresponding
+ // PC-relative patterns and tests.
+
/// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
/// byte-swapping store instruction. It byte-swaps the low "Type" bits of
/// the GPRC input, then stores it through Ptr. Type can be either i16 or
@@ -554,6 +567,14 @@ namespace llvm {
/// instructions such as LXVDSX, LXVWSX.
LD_SPLAT,
+ /// VSRC, CHAIN = ZEXT_LD_SPLAT, CHAIN, Ptr - a splatting memory load
+ /// that zero-extends.
+ ZEXT_LD_SPLAT,
+
+ /// VSRC, CHAIN = SEXT_LD_SPLAT, CHAIN, Ptr - a splatting memory load
+ /// that sign-extends.
+ SEXT_LD_SPLAT,
+
/// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
/// Maps directly to an stxvd2x instruction that will be preceded by
/// an xxswapd.
@@ -712,7 +733,9 @@ namespace llvm {
AM_DForm,
AM_DSForm,
AM_DQForm,
+ AM_PrefixDForm,
AM_XForm,
+ AM_PCRel
};
} // end namespace PPC
@@ -936,7 +959,7 @@ namespace llvm {
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. This is the actual
/// alignment, not its logarithm.
- unsigned getByValTypeAlignment(Type *Ty,
+ uint64_t getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const override;
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
@@ -1246,6 +1269,7 @@ namespace llvm {
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index 92712c5c072b..417a6ce7e522 100644
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -73,7 +73,7 @@ def SRL64 : SDNodeXForm<imm, [{
//
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
-let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, hasSideEffects = 0 in {
let isReturn = 1, isPredicable = 1, Uses = [LR8, RM] in
def BLR8 : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", IIC_BrB,
[(retflag)]>, Requires<[In64BitMode]>;
@@ -100,7 +100,7 @@ let Defs = [LR8] in
def MovePCtoLR8 : PPCEmitTimePseudo<(outs), (ins), "#MovePCtoLR8", []>,
PPC970_Unit_BRU;
-let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7, hasSideEffects = 0 in {
let Defs = [CTR8], Uses = [CTR8] in {
def BDZ8 : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
"bdz $dst">;
@@ -118,7 +118,7 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
-let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
+let isCall = 1, PPC970_Unit = 7, Defs = [LR8], hasSideEffects = 0 in {
// Convenient aliases for call instructions
let Uses = [RM] in {
def BL8 : IForm<18, 0, 1, (outs), (ins calltarget:$func),
@@ -178,6 +178,39 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
}
}
+let isCall = 1, PPC970_Unit = 7, Defs = [LR8, RM], hasSideEffects = 0,
+ isCodeGenOnly = 1, Uses = [RM] in {
+ // Convenient aliases for call instructions
+ def BL8_RM : IForm<18, 0, 1, (outs), (ins calltarget:$func),
+ "bl $func", IIC_BrB, []>; // See Pat patterns below.
+
+ def BLA8_RM : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
+ "bla $func", IIC_BrB, [(PPCcall_rm (i64 imm:$func))]>;
+ def BL8_NOP_RM : IForm_and_DForm_4_zero<18, 0, 1, 24,
+ (outs), (ins calltarget:$func),
+ "bl $func\n\tnop", IIC_BrB, []>;
+
+ def BLA8_NOP_RM : IForm_and_DForm_4_zero<18, 1, 1, 24,
+ (outs), (ins abscalltarget:$func),
+ "bla $func\n\tnop", IIC_BrB,
+ [(PPCcall_nop_rm (i64 imm:$func))]>;
+ let Predicates = [PCRelativeMemops] in {
+ // BL8_NOTOC means that the caller does not use the TOC pointer and if
+ // it does use R2 then it is just a caller saved register. Therefore it is
+ // safe to emit only the bl and not the nop for this instruction. The
+ // linker will not try to restore R2 after the call.
+ def BL8_NOTOC_RM : IForm<18, 0, 1, (outs),
+ (ins calltarget:$func),
+ "bl $func", IIC_BrB, []>;
+ }
+ let Uses = [CTR8, RM] in {
+ let isPredicable = 1 in
+ def BCTRL8_RM : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
+ "bctrl", IIC_BrB, [(PPCbctrl_rm)]>,
+ Requires<[In64BitMode]>;
+ }
+}
+
let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1,
Defs = [LR8, X2], Uses = [CTR8, RM], RST = 2 in {
def BCTRL8_LDinto_toc :
@@ -188,12 +221,22 @@ let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1,
Requires<[In64BitMode]>;
}
+let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1,
+ Defs = [LR8, X2, RM], Uses = [CTR8, RM], RST = 2 in {
+ def BCTRL8_LDinto_toc_RM :
+ XLForm_2_ext_and_DSForm_1<19, 528, 20, 0, 1, 58, 0, (outs),
+ (ins memrix:$src),
+ "bctrl\n\tld 2, $src", IIC_BrB,
+ [(PPCbctrl_load_toc_rm iaddrX4:$src)]>,
+ Requires<[In64BitMode]>;
+}
+
} // Interpretation64Bit
// FIXME: Duplicating this for the asm parser should be unnecessary, but the
// previous definition must be marked as CodeGen only to prevent decoding
// conflicts.
-let Interpretation64Bit = 1, isAsmParserOnly = 1 in
+let Interpretation64Bit = 1, isAsmParserOnly = 1, hasSideEffects = 0 in
let isCall = 1, PPC970_Unit = 7, Defs = [LR8], Uses = [RM] in
def BL8_TLS_ : IForm<18, 0, 1, (outs), (ins tlscall:$func),
"bl $func", IIC_BrB, []>;
@@ -214,12 +257,32 @@ def : Pat<(PPCcall_notoc (i64 tglobaladdr:$dst)),
def : Pat<(PPCcall_notoc (i64 texternalsym:$dst)),
(BL8_NOTOC texternalsym:$dst)>;
+def : Pat<(PPCcall_rm (i64 tglobaladdr:$dst)),
+ (BL8_RM tglobaladdr:$dst)>;
+def : Pat<(PPCcall_nop_rm (i64 tglobaladdr:$dst)),
+ (BL8_NOP_RM tglobaladdr:$dst)>;
+
+def : Pat<(PPCcall_rm (i64 texternalsym:$dst)),
+ (BL8_RM texternalsym:$dst)>;
+def : Pat<(PPCcall_nop_rm (i64 texternalsym:$dst)),
+ (BL8_NOP_RM texternalsym:$dst)>;
+
+def : Pat<(PPCcall_notoc_rm (i64 tglobaladdr:$dst)),
+ (BL8_NOTOC_RM tglobaladdr:$dst)>;
+def : Pat<(PPCcall_notoc_rm (i64 texternalsym:$dst)),
+ (BL8_NOTOC_RM texternalsym:$dst)>;
+
// Calls for AIX
def : Pat<(PPCcall (i64 mcsym:$dst)),
(BL8 mcsym:$dst)>;
def : Pat<(PPCcall_nop (i64 mcsym:$dst)),
(BL8_NOP mcsym:$dst)>;
+def : Pat<(PPCcall_rm (i64 mcsym:$dst)),
+ (BL8_RM mcsym:$dst)>;
+def : Pat<(PPCcall_nop_rm (i64 mcsym:$dst)),
+ (BL8_NOP_RM mcsym:$dst)>;
+
// Atomic operations
// FIXME: some of these might be used with constant operands. This will result
// in constant materialization instructions that may be redundant. We currently
@@ -408,6 +471,7 @@ def TCRETURNri8 : PPCEmitTimePseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset),
"#TC_RETURNr8 $dst $offset",
[]>;
+let hasSideEffects = 0 in {
let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR8, RM] in
def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
@@ -425,6 +489,7 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
def TAILBA8 : IForm<18, 0, 0, (outs), (ins abscalltarget:$dst),
"ba $dst", IIC_BrB,
[]>;
+}
} // Interpretation64Bit
def : Pat<(PPCtc_return (i64 tglobaladdr:$dst), imm:$imm),
@@ -638,7 +703,7 @@ def XORIS8 : DForm_4<27, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
[(set i64:$dst, (xor i64:$src1, imm16ShiftedZExt:$src2))]>;
let isCommutable = 1 in
-defm ADD8 : XOForm_1rx<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+defm ADD8 : XOForm_1rx<31, 266, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"add", "$rT, $rA, $rB", IIC_IntSimple,
[(set i64:$rT, (add i64:$rA, i64:$rB))]>;
// ADD8 has a special form: reg = ADD8(reg, sym@tls) for use by the
@@ -717,7 +782,7 @@ defm SUBFC8 : XOForm_1rc<31, 8, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"subfc", "$rT, $rA, $rB", IIC_IntGeneral,
[(set i64:$rT, (subc i64:$rB, i64:$rA))]>,
PPC970_DGroup_Cracked;
-defm SUBF8 : XOForm_1rx<31, 40, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+defm SUBF8 : XOForm_1rx<31, 40, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"subf", "$rT, $rA, $rB", IIC_IntGeneral,
[(set i64:$rT, (sub i64:$rB, i64:$rA))]>;
defm NEG8 : XOForm_3r<31, 104, 0, (outs g8rc:$rT), (ins g8rc:$rA),
@@ -961,7 +1026,7 @@ defm DIVDEU : XOForm_1rcr<31, 393, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
[(set i64:$rT, (int_ppc_divdeu g8rc:$rA, g8rc:$rB))]>,
isPPC64, Requires<[HasExtDiv]>;
let isCommutable = 1 in
-defm MULLD : XOForm_1rx<31, 233, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+defm MULLD : XOForm_1rx<31, 233, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"mulld", "$rT, $rA, $rB", IIC_IntMulHD,
[(set i64:$rT, (mul i64:$rA, i64:$rB))]>, isPPC64;
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
@@ -1300,9 +1365,12 @@ def LDtocBA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
def LDX : XForm_1_memOp<31, 21, (outs g8rc:$rD), (ins memrr:$src),
"ldx $rD, $src", IIC_LdStLD,
[(set i64:$rD, (load XForm:$src))]>, isPPC64;
+
+let Predicates = [IsISA2_06] in {
def LDBRX : XForm_1_memOp<31, 532, (outs g8rc:$rD), (ins memrr:$src),
"ldbrx $rD, $src", IIC_LdStLoad,
[(set i64:$rD, (PPClbrx ForceXForm:$src, i64))]>, isPPC64;
+}
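Guarding `LDBRX` behind the new `IsISA2_06` predicate reflects that the byte-reversed doubleword load only exists from ISA 2.06 onward; older subtargets must expand the byte swap. A hedged sketch of source whose bswap-of-load shape the `PPClbrx` pattern can turn into a single `ldbrx` when the predicate holds:

```cpp
// Sketch only: a 64-bit big-endian load on a little-endian target. The
// bswap-of-load shape is what the PPClbrx pattern matches into ldbrx when the
// subtarget implements ISA 2.06; otherwise it is expanded with shifts/rotates.
#include <cstdint>
#include <cstring>

uint64_t load_be64(const void *p) {
  uint64_t v;
  std::memcpy(&v, p, sizeof v);  // plain 64-bit load
  return __builtin_bswap64(v);   // byte reversal, foldable into ldbrx
}
```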
let mayLoad = 1, hasSideEffects = 0, isCodeGenOnly = 1 in {
def LHBRX8 : XForm_1_memOp<31, 790, (outs g8rc:$rD), (ins memrr:$src),
@@ -1340,12 +1408,25 @@ def LQ : DQForm_RTp5_RA17_MEM<56, 0,
[]>,
RegConstraint<"@earlyclobber $RTp">,
isPPC64;
+// The ISA does not provide an LQX instruction, so define a pseudo that lets
+// us handle the X-form during isel. Keeping it a pre-RA pseudo may expose
+// opportunities for optimizations (CSE, LICM, etc.) on the result of adding
+// RA and RB.
+def LQX_PSEUDO : PPCCustomInserterPseudo<(outs g8prc:$RTp),
+ (ins memrr:$src), "#LQX_PSEUDO", []>;
+
def RESTORE_QUADWORD : PPCEmitTimePseudo<(outs g8prc:$RTp), (ins memrix:$src),
"#RESTORE_QUADWORD", []>;
}
}
+def : Pat<(int_ppc_atomic_load_i128 iaddrX16:$src),
+ (SPLIT_QUADWORD (LQ memrix16:$src))>;
+
+def : Pat<(int_ppc_atomic_load_i128 ForceXForm:$src),
+ (SPLIT_QUADWORD (LQX_PSEUDO memrr:$src))>;
+
// Support for medium and large code model.
let hasSideEffects = 0 in {
let isReMaterializable = 1 in {
@@ -1523,10 +1604,13 @@ def STDX : XForm_8_memOp<31, 149, (outs), (ins g8rc:$rS, memrr:$dst),
"stdx $rS, $dst", IIC_LdStSTD,
[(store i64:$rS, XForm:$dst)]>, isPPC64,
PPC970_DGroup_Cracked;
+
+let Predicates = [IsISA2_06] in {
def STDBRX: XForm_8_memOp<31, 660, (outs), (ins g8rc:$rS, memrr:$dst),
"stdbrx $rS, $dst", IIC_LdStStore,
[(PPCstbrx i64:$rS, ForceXForm:$dst, i64)]>, isPPC64,
PPC970_DGroup_Cracked;
+}
let mayStore = 1, hasNoSchedulingInfo = 1 in {
// Normal 16-byte stores.
@@ -1534,12 +1618,28 @@ let mayStore = 1, hasNoSchedulingInfo = 1 in {
def STQ : DSForm_1<62, 2, (outs), (ins g8prc:$RSp, memrix:$dst),
"stq $RSp, $dst", IIC_LdStSTQ,
[]>, isPPC64;
+
+def STQX_PSEUDO : PPCCustomInserterPseudo<(outs),
+ (ins g8prc:$RSp, memrr:$dst),
+ "#STQX_PSEUDO", []>;
+
def SPILL_QUADWORD : PPCEmitTimePseudo<(outs), (ins g8prc:$RSp, memrix:$dst),
"#SPILL_QUADWORD", []>;
}
}
+def BUILD_QUADWORD : PPCPostRAExpPseudo<
+ (outs g8prc:$RTp),
+ (ins g8rc:$lo, g8rc:$hi),
+ "#BUILD_QUADWORD", []>;
+
+def : Pat<(int_ppc_atomic_store_i128 i64:$lo, i64:$hi, DSForm:$dst),
+ (STQ (BUILD_QUADWORD g8rc:$lo, g8rc:$hi), memrix:$dst)>;
+
+def : Pat<(int_ppc_atomic_store_i128 i64:$lo, i64:$hi, ForceXForm:$dst),
+ (STQX_PSEUDO (BUILD_QUADWORD g8rc:$lo, g8rc:$hi), memrr:$dst)>;
+
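Together with the `lq`/`LQX_PSEUDO` patterns above, these `stq`/`STQX_PSEUDO` patterns give the 128-bit atomic intrinsics a register-pair lowering via `SPLIT_QUADWORD` and `BUILD_QUADWORD`. A hedged user-level sketch, assuming the subtarget enables quadword atomics so that clang emits the `ppc.atomic.{load,store}.i128` intrinsics rather than calling libatomic:

```cpp
// Sketch only: 16-byte atomic accesses that the patterns above can lower to
// lq/stq (or their X-form pseudos) plus SPLIT_QUADWORD/BUILD_QUADWORD.
#include <atomic>

__int128 read_pair(const std::atomic<__int128> *p) {
  return p->load(std::memory_order_relaxed);  // -> ppc.atomic.load.i128
}

void write_pair(std::atomic<__int128> *p, __int128 v) {
  p->store(v, std::memory_order_relaxed);     // -> ppc.atomic.store.i128
}
```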
// Stores with Update (pre-inc).
let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
@@ -1670,6 +1770,13 @@ def HASHCHKP : XForm_XD6_RA5_RB5<31, 690, (outs),
"hashchkp $RB, $D_RA_XD", IIC_IntGeneral, []>;
}
+let Interpretation64Bit = 1, isCodeGenOnly = 1, hasSideEffects = 1 in
+def ADDEX8 : Z23Form_RTAB5_CY2<31, 170, (outs g8rc:$rT),
+ (ins g8rc:$rA, g8rc:$rB, u2imm:$CY),
+ "addex $rT, $rA, $rB, $CY", IIC_IntGeneral,
+ [(set i64:$rT, (int_ppc_addex i64:$rA, i64:$rB,
+ timm:$CY))]>;
+
//===----------------------------------------------------------------------===//
// Instruction Patterns
//
@@ -1833,8 +1940,6 @@ def SLBSYNC : XForm_0<31, 338, (outs), (ins), "slbsync", IIC_SprSLBSYNC, []>;
def : Pat<(int_ppc_stdcx ForceXForm:$dst, g8rc:$A),
(STDCX g8rc:$A, ForceXForm:$dst)>;
-def : Pat<(int_ppc_tdw g8rc:$A, g8rc:$B, i32:$IMM),
- (TD $IMM, $A, $B)>;
// trapd
def : Pat<(int_ppc_trapd g8rc:$A),
diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
index 2bc7fb2a1a5f..1e0e2d88e54b 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -1518,8 +1518,8 @@ def VPRTYBQ : VX_VT5_EO5_VB5<1538, 10, "vprtybq", [(set v1i128:$vD,
(int_ppc_altivec_vprtybq v1i128:$vB))]>;
// Vector (Bit) Permute (Right-indexed)
-def VBPERMD : VXForm_1<1484, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vbpermd $vD, $vA, $vB", IIC_VecFP, []>;
+def VBPERMD : VX1_Int_Ty3<1484, "vbpermd", int_ppc_altivec_vbpermd,
+ v2i64, v2i64, v16i8>;
def VPERMR : VAForm_1a<59, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
"vpermr $vD, $vA, $vB, $vC", IIC_VecFP, []>;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
index 91b507ea6c4c..f7e4c0708d7d 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -889,7 +889,7 @@ class XForm_16b<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
}
class XForm_htm0<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
- string asmstr, InstrItinClass itin, list<dag> pattern>
+ string asmstr, InstrItinClass itin>
: I<opcode, OOL, IOL, asmstr, itin> {
bit R;
@@ -903,7 +903,7 @@ class XForm_htm0<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
}
class XForm_htm1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
- string asmstr, InstrItinClass itin, list<dag> pattern>
+ string asmstr, InstrItinClass itin>
: I<opcode, OOL, IOL, asmstr, itin> {
bit A;
@@ -916,7 +916,7 @@ class XForm_htm1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
}
class XForm_htm2<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin, list<dag> pattern>
+ InstrItinClass itin>
: I<opcode, OOL, IOL, asmstr, itin> {
bit L;
@@ -930,7 +930,7 @@ class XForm_htm2<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
}
class XForm_htm3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin, list<dag> pattern>
+ InstrItinClass itin>
: I<opcode, OOL, IOL, asmstr, itin> {
bits<3> BF;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrHTM.td b/llvm/lib/Target/PowerPC/PPCInstrHTM.td
index e59a08774dc5..ec1c397ff57f 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrHTM.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrHTM.td
@@ -29,10 +29,10 @@ let Predicates = [HasHTM] in {
let Defs = [CR0] in {
def TBEGIN : XForm_htm0 <31, 654,
- (outs), (ins u1imm:$R), "tbegin. $R", IIC_SprMTSPR, []>;
+ (outs), (ins u1imm:$R), "tbegin. $R", IIC_SprMTSPR>;
def TEND : XForm_htm1 <31, 686,
- (outs), (ins u1imm:$A), "tend. $A", IIC_SprMTSPR, []>;
+ (outs), (ins u1imm:$A), "tend. $A", IIC_SprMTSPR>;
def TABORT : XForm_base_r3xo <31, 910,
(outs), (ins gprc:$A), "tabort. $A", IIC_SprMTSPR,
@@ -62,7 +62,7 @@ def TABORTDCI : XForm_base_r3xo <31, 878,
isRecordForm;
def TSR : XForm_htm2 <31, 750,
- (outs), (ins u1imm:$L), "tsr. $L", IIC_SprMTSPR, []>,
+ (outs), (ins u1imm:$L), "tsr. $L", IIC_SprMTSPR>,
isRecordForm;
def TRECLAIM : XForm_base_r3xo <31, 942,
@@ -84,7 +84,7 @@ def TRECHKPT : XForm_base_r3xo <31, 1006,
}
def TCHECK : XForm_htm3 <31, 718,
- (outs crrc:$BF), (ins), "tcheck $BF", IIC_SprMTSPR, []>;
+ (outs crrc:$BF), (ins), "tcheck $BF", IIC_SprMTSPR>;
// Builtins
// All HTM instructions, with the exception of tcheck, set CR0 with the
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 9dd35d5f44d1..649a150866b4 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -35,10 +35,10 @@
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -1109,6 +1109,8 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case PPC::XXLXORdpz:
case PPC::XXLEQVOnes:
case PPC::XXSPLTI32DX:
+ case PPC::XXSPLTIW:
+ case PPC::XXSPLTIDP:
case PPC::V_SET0B:
case PPC::V_SET0H:
case PPC::V_SET0:
@@ -1541,6 +1543,11 @@ bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8)
return false;
+ // If the conditional branch uses a physical register, then it cannot be
+ // turned into a select.
+ if (Register::isPhysicalRegister(Cond[1].getReg()))
+ return false;
+
// Check register classes.
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *RC =
@@ -2239,11 +2246,13 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
return true;
} else if (OpC == PPC::BCTR || OpC == PPC::BCTR8 || OpC == PPC::BCTRL ||
- OpC == PPC::BCTRL8) {
+ OpC == PPC::BCTRL8 || OpC == PPC::BCTRL_RM ||
+ OpC == PPC::BCTRL8_RM) {
if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR)
llvm_unreachable("Cannot predicate bctr[l] on the ctr register");
- bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8;
+ bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8 ||
+ OpC == PPC::BCTRL_RM || OpC == PPC::BCTRL8_RM;
bool isPPC64 = Subtarget.isPPC64();
if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
@@ -2267,6 +2276,9 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
MachineInstrBuilder(*MI.getParent()->getParent(), MI)
.addReg(isPPC64 ? PPC::LR8 : PPC::LR, RegState::Implicit)
.addReg(isPPC64 ? PPC::LR8 : PPC::LR, RegState::ImplicitDefine);
+ if (OpC == PPC::BCTRL_RM || OpC == PPC::BCTRL8_RM)
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addReg(PPC::RM, RegState::ImplicitDefine);
return true;
}
@@ -2343,8 +2355,8 @@ bool PPCInstrInfo::ClobbersPredicate(MachineInstr &MI,
}
bool PPCInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
- Register &SrcReg2, int &Mask,
- int &Value) const {
+ Register &SrcReg2, int64_t &Mask,
+ int64_t &Value) const {
unsigned Opc = MI.getOpcode();
switch (Opc) {
@@ -2373,7 +2385,8 @@ bool PPCInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
}
bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
- Register SrcReg2, int Mask, int Value,
+ Register SrcReg2, int64_t Mask,
+ int64_t Value,
const MachineRegisterInfo *MRI) const {
if (DisableCmpOpt)
return false;
@@ -3009,7 +3022,7 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
.addReg(SrcVSR + VecNo)
.addReg(SrcVSR + VecNo);
}
- // BUILD_UACC is expanded to 4 copies of the underlying vsx regisers.
+ // BUILD_UACC is expanded to 4 copies of the underlying vsx registers.
// So after building the 4 copies, we can replace the BUILD_UACC instruction
// with a NOP.
LLVM_FALLTHROUGH;
@@ -3103,6 +3116,7 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
+  // FIXME: Maybe we can expand this in the 'PowerPC Expand Atomic' pass.
case PPC::CFENCE8: {
auto Val = MI.getOperand(0).getReg();
BuildMI(MBB, MI, DL, get(PPC::CMPD), PPC::CR7).addReg(Val).addReg(Val);
@@ -3770,7 +3784,7 @@ bool PPCInstrInfo::combineRLWINM(MachineInstr &MI,
bool Simplified = false;
// If final mask is 0, MI result should be 0 too.
- if (FinalMask.isNullValue()) {
+ if (FinalMask.isZero()) {
bool Is64Bit =
(MI.getOpcode() == PPC::RLWINM8 || MI.getOpcode() == PPC::RLWINM8_rec);
Simplified = true;
@@ -5241,8 +5255,7 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
return false;
const IntegerType *IntTy =
dyn_cast<IntegerType>(CalleeFn->getReturnType());
- const AttributeSet &Attrs =
- CalleeFn->getAttributes().getRetAttributes();
+ const AttributeSet &Attrs = CalleeFn->getAttributes().getRetAttrs();
if (IntTy && IntTy->getBitWidth() <= 32)
return Attrs.hasAttribute(SignExt ? Attribute::SExt :
Attribute::ZExt);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index 404156de232f..2cfd53de3290 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -524,10 +524,11 @@ public:
// Comparison optimization.
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
- Register &SrcReg2, int &Mask, int &Value) const override;
+ Register &SrcReg2, int64_t &Mask,
+ int64_t &Value) const override;
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
- Register SrcReg2, int Mask, int Value,
+ Register SrcReg2, int64_t Mask, int64_t Value,
const MachineRegisterInfo *MRI) const override;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index f53e1b89626f..d83ecc699b19 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -316,6 +316,24 @@ def PPCbctrl_load_toc : SDNode<"PPCISD::BCTRL_LOAD_TOC",
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
+// Call nodes for strictfp calls (that define RM).
+def PPCcall_rm : SDNode<"PPCISD::CALL_RM", SDT_PPCCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def PPCcall_nop_rm : SDNode<"PPCISD::CALL_NOP_RM", SDT_PPCCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def PPCcall_notoc_rm : SDNode<"PPCISD::CALL_NOTOC_RM", SDT_PPCCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def PPCbctrl_rm : SDNode<"PPCISD::BCTRL_RM", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def PPCbctrl_load_toc_rm : SDNode<"PPCISD::BCTRL_LOAD_TOC_RM",
+ SDTypeProfile<0, 1, []>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+
def retflag : SDNode<"PPCISD::RET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
@@ -1152,15 +1170,14 @@ def addr : ComplexPattern<iPTR, 1, "SelectAddr",[], []>;
/// This is just the offset part of iaddr, used for preinc.
def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>;
-// PC Relative Address
-def pcreladdr : ComplexPattern<iPTR, 1, "SelectAddrPCRel", [], []>;
-
// Load and Store Instruction Selection addressing modes.
def DForm : ComplexPattern<iPTR, 2, "SelectDForm", [], [SDNPWantParent]>;
def DSForm : ComplexPattern<iPTR, 2, "SelectDSForm", [], [SDNPWantParent]>;
def DQForm : ComplexPattern<iPTR, 2, "SelectDQForm", [], [SDNPWantParent]>;
def XForm : ComplexPattern<iPTR, 2, "SelectXForm", [], [SDNPWantParent]>;
def ForceXForm : ComplexPattern<iPTR, 2, "SelectForceXForm", [], [SDNPWantParent]>;
+def PCRelForm : ComplexPattern<iPTR, 2, "SelectPCRelForm", [], [SDNPWantParent]>;
+def PDForm : ComplexPattern<iPTR, 2, "SelectPDForm", [], [SDNPWantParent]>;
//===----------------------------------------------------------------------===//
// PowerPC Instruction Predicate Definitions.
@@ -1183,6 +1200,7 @@ def NaNsFPMath
: Predicate<"!Subtarget->getTargetMachine().Options.NoNaNsFPMath">;
def HasBPERMD : Predicate<"Subtarget->hasBPERMD()">;
def HasExtDiv : Predicate<"Subtarget->hasExtDiv()">;
+def IsISA2_06 : Predicate<"Subtarget->isISA2_06()">;
def IsISA2_07 : Predicate<"Subtarget->isISA2_07()">;
def IsISA3_0 : Predicate<"Subtarget->isISA3_0()">;
def HasFPU : Predicate<"Subtarget->hasFPU()">;
@@ -1272,7 +1290,7 @@ multiclass XOForm_1r<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
// Multiclass for instructions which have a record overflow form as well
// as a record form but no carry (i.e. mulld, mulldo, subf, subfo, etc.)
-multiclass XOForm_1rx<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
+multiclass XOForm_1rx<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
string asmbase, string asmstr, InstrItinClass itin,
list<dag> pattern> {
let BaseName = asmbase in {
@@ -1649,7 +1667,7 @@ def RESTORE_CRBIT : PPCEmitTimePseudo<(outs crbitrc:$cond), (ins memri:$F),
"#RESTORE_CRBIT", []>;
}
-let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, hasSideEffects = 0 in {
let isPredicable = 1, isReturn = 1, Uses = [LR, RM] in
def BLR : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", IIC_BrB,
[(retflag)]>, Requires<[In32BitMode]>;
@@ -1690,7 +1708,8 @@ let Defs = [LR] in
def MoveGOTtoLR : PPCEmitTimePseudo<(outs), (ins), "#MoveGOTtoLR", []>,
PPC970_Unit_BRU;
-let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+ hasSideEffects = 0 in {
let isBarrier = 1 in {
let isPredicable = 1 in
def B : IForm<18, 0, 0, (outs), (ins directbrtarget:$dst),
@@ -1782,7 +1801,8 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
}
// The unconditional BCL used by the SjLj setjmp code.
-let isCall = 1, hasCtrlDep = 1, isCodeGenOnly = 1, PPC970_Unit = 7 in {
+let isCall = 1, hasCtrlDep = 1, isCodeGenOnly = 1, PPC970_Unit = 7,
+ hasSideEffects = 0 in {
let Defs = [LR], Uses = [RM] in {
def BCLalways : BForm_2<16, 20, 31, 0, 1, (outs), (ins condbrtarget:$dst),
"bcl 20, 31, $dst">;
@@ -1890,6 +1910,26 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in {
}
}
+let isCall = 1, PPC970_Unit = 7, Defs = [LR, RM], isCodeGenOnly = 1 in {
+ // Convenient aliases for call instructions
+ let Uses = [RM] in {
+ def BL_RM : IForm<18, 0, 1, (outs), (ins calltarget:$func),
+ "bl $func", IIC_BrB, []>; // See Pat patterns below.
+ def BLA_RM : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
+ "bla $func", IIC_BrB, [(PPCcall_rm (i32 imm:$func))]>;
+
+ def BL_NOP_RM : IForm_and_DForm_4_zero<18, 0, 1, 24,
+ (outs), (ins calltarget:$func),
+ "bl $func\n\tnop", IIC_BrB, []>;
+ }
+ let Uses = [CTR, RM] in {
+ let isPredicable = 1 in
+ def BCTRL_RM : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
+ "bctrl", IIC_BrB, [(PPCbctrl_rm)]>,
+ Requires<[In32BitMode]>;
+ }
+}
+
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
def TCRETURNdi :PPCEmitTimePseudo< (outs),
(ins calltarget:$dst, i32imm:$offset),
@@ -1916,8 +1956,16 @@ let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1,
}
+let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1,
+ Defs = [LR, R2, RM], Uses = [CTR, RM], RST = 2 in {
+ def BCTRL_LWZinto_toc_RM:
+ XLForm_2_ext_and_DForm_1<19, 528, 20, 0, 1, 32, (outs),
+ (ins memri:$src), "bctrl\n\tlwz 2, $src", IIC_BrB,
+ [(PPCbctrl_load_toc_rm iaddr:$src)]>, Requires<[In32BitMode]>;
-let isCodeGenOnly = 1 in {
+}
+
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR, RM] in
@@ -3106,14 +3154,14 @@ def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM),
PPC970_DGroup_Single, PPC970_Unit_FPU;
}
-let Defs = [RM] in {
+let Defs = [RM], hasSideEffects = 1 in {
let isCodeGenOnly = 1 in
def MTFSFb : XFLForm<63, 711, (outs), (ins i32imm:$FM, f8rc:$rT),
"mtfsf $FM, $rT", IIC_IntMTFSB0,
[(int_ppc_mtfsf timm:$FM, f64:$rT)]>,
PPC970_DGroup_Single, PPC970_Unit_FPU;
}
-let Uses = [RM] in {
+let Uses = [RM], hasSideEffects = 1 in {
def MFFS : XForm_42<63, 583, (outs f8rc:$rT), (ins),
"mffs $rT", IIC_IntMFFS,
[(set f64:$rT, (PPCmffs))]>,
@@ -3170,7 +3218,7 @@ def ADDEX : Z23Form_RTAB5_CY2<31, 170, (outs gprc:$rT),
let PPC970_Unit = 1, hasSideEffects = 0 in { // FXU Operations.
// XO-Form instructions. Arithmetic instructions that can set overflow bit
let isCommutable = 1 in
-defm ADD4 : XOForm_1rx<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+defm ADD4 : XOForm_1rx<31, 266, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
"add", "$rT, $rA, $rB", IIC_IntSimple,
[(set i32:$rT, (add i32:$rA, i32:$rB))]>;
let isCodeGenOnly = 1 in
@@ -3204,11 +3252,11 @@ defm MULHW : XOForm_1r<31, 75, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
defm MULHWU : XOForm_1r<31, 11, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
"mulhwu", "$rT, $rA, $rB", IIC_IntMulHWU,
[(set i32:$rT, (mulhu i32:$rA, i32:$rB))]>;
-defm MULLW : XOForm_1rx<31, 235, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+defm MULLW : XOForm_1rx<31, 235, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
"mullw", "$rT, $rA, $rB", IIC_IntMulHW,
[(set i32:$rT, (mul i32:$rA, i32:$rB))]>;
} // isCommutable
-defm SUBF : XOForm_1rx<31, 40, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+defm SUBF : XOForm_1rx<31, 40, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
"subf", "$rT, $rA, $rB", IIC_IntGeneral,
[(set i32:$rT, (sub i32:$rB, i32:$rA))]>;
defm SUBFC : XOForm_1rc<31, 8, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
@@ -3433,6 +3481,12 @@ def : Pat<(PPCcall (i32 tglobaladdr:$dst)),
def : Pat<(PPCcall (i32 texternalsym:$dst)),
(BL texternalsym:$dst)>;
+def : Pat<(PPCcall_rm (i32 tglobaladdr:$dst)),
+ (BL_RM tglobaladdr:$dst)>;
+
+def : Pat<(PPCcall_rm (i32 texternalsym:$dst)),
+ (BL_RM texternalsym:$dst)>;
+
// Calls for AIX only
def : Pat<(PPCcall (i32 mcsym:$dst)),
(BL mcsym:$dst)>;
@@ -3443,6 +3497,15 @@ def : Pat<(PPCcall_nop (i32 mcsym:$dst)),
def : Pat<(PPCcall_nop (i32 texternalsym:$dst)),
(BL_NOP texternalsym:$dst)>;
+def : Pat<(PPCcall_rm (i32 mcsym:$dst)),
+ (BL_RM mcsym:$dst)>;
+
+def : Pat<(PPCcall_nop_rm (i32 mcsym:$dst)),
+ (BL_NOP_RM mcsym:$dst)>;
+
+def : Pat<(PPCcall_nop_rm (i32 texternalsym:$dst)),
+ (BL_NOP_RM texternalsym:$dst)>;
+
def : Pat<(PPCtc_return (i32 tglobaladdr:$dst), imm:$imm),
(TCRETURNdi tglobaladdr:$dst, imm:$imm)>;
@@ -4501,7 +4564,7 @@ def MCRFS : XLForm_3<63, 64, (outs crrc:$BF), (ins crrc:$BFA),
// All MTFSF variants may change the rounding mode so conservatively set it
// as an implicit def for all of them.
let Predicates = [HasFPU] in {
-let Defs = [RM] in {
+let Defs = [RM], hasSideEffects = 1 in {
let isCodeGenOnly = 1,
Pattern = [(int_ppc_mtfsfi timm:$BF, timm:$U)], W = 0 in
def MTFSFIb : XLForm_4<63, 134, (outs), (ins u3imm:$BF, u4imm:$U),
@@ -5059,7 +5122,7 @@ def RLWNMbm_rec : PPCAsmPseudo<"rlwnm. $rA, $rS, $n, $b",
// These generic branch instruction forms are used for the assembler parser only.
// Defs and Uses are conservative, since we don't know the BO value.
-let PPC970_Unit = 7, isBranch = 1 in {
+let PPC970_Unit = 7, isBranch = 1, hasSideEffects = 0 in {
let Defs = [CTR], Uses = [CTR, RM] in {
def gBC : BForm_3<16, 0, 0, (outs),
(ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst),
@@ -5475,8 +5538,6 @@ def : Pat<(int_ppc_stwcx ForceXForm:$dst, gprc:$A),
(STWCX gprc:$A, ForceXForm:$dst)>;
def : Pat<(int_ppc_stbcx ForceXForm:$dst, gprc:$A),
(STBCX gprc:$A, ForceXForm:$dst)>;
-def : Pat<(int_ppc_tw gprc:$A, gprc:$B, i32:$IMM),
- (TW $IMM, $A, $B)>;
def : Pat<(int_ppc_trap gprc:$A),
(TWI 24, $A, 0)>;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
index b183dbd4b3bb..a19289e96b3e 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -29,9 +29,6 @@ def SDT_PPCPairExtractVsx : SDTypeProfile<1, 2, [
def SDT_PPCxxmfacc : SDTypeProfile<1, 1, [
SDTCisVT<0, v512i1>, SDTCisVT<1, v512i1>
]>;
-def SDT_PPCVecInsertElt : SDTypeProfile<1, 3, [
- SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<3>
-]>;
//===----------------------------------------------------------------------===//
// ISA 3.1 specific PPCISD nodes.
@@ -45,7 +42,6 @@ def PPCAccExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCAccExtractVsx,
def PPCPairExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCPairExtractVsx,
[]>;
def PPCxxmfacc : SDNode<"PPCISD::XXMFACC", SDT_PPCxxmfacc, []>;
-def PPCvecinsertelt : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsertElt, []>;
//===----------------------------------------------------------------------===//
@@ -621,7 +617,7 @@ class 8LS_DForm_R_XTp5_SI34_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr,
let Inst{48-63} = D_RA{15-0}; // D
}
-multiclass 8LS_DForm_R_XTp5_SI34_MEM_p<bits<6> pref, bits<6> opcode, dag OOL,
+multiclass 8LS_DForm_R_XTp5_SI34_MEM_p<bits<6> opcode, dag OOL,
dag IOL, dag PCRel_IOL,
string asmstr, InstrItinClass itin> {
def NAME : 8LS_DForm_R_XTp5_SI34_MEM<opcode, OOL, IOL,
@@ -1652,208 +1648,201 @@ let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops] in {
let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops, PrefixInstrs] in {
defm PLXVP :
- 8LS_DForm_R_XTp5_SI34_MEM_p<1, 58, (outs vsrprc:$XTp), (ins memri34:$D_RA),
+ 8LS_DForm_R_XTp5_SI34_MEM_p<58, (outs vsrprc:$XTp), (ins memri34:$D_RA),
(ins memri34_pcrel:$D_RA), "plxvp $XTp, $D_RA",
IIC_LdStLFD>;
}
let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops, PrefixInstrs] in {
defm PSTXVP :
- 8LS_DForm_R_XTp5_SI34_MEM_p<1, 62, (outs), (ins vsrprc:$XTp, memri34:$D_RA),
+ 8LS_DForm_R_XTp5_SI34_MEM_p<62, (outs), (ins vsrprc:$XTp, memri34:$D_RA),
(ins vsrprc:$XTp, memri34_pcrel:$D_RA),
"pstxvp $XTp, $D_RA", IIC_LdStLFD>;
}
let Predicates = [PairedVectorMemops] in {
// Intrinsics for Paired Vector Loads.
- def : Pat<(v256i1 (int_ppc_vsx_lxvp iaddrX16:$src)), (LXVP memrix16:$src)>;
- def : Pat<(v256i1 (int_ppc_vsx_lxvp xaddrX16:$src)), (LXVPX xaddrX16:$src)>;
+ def : Pat<(v256i1 (int_ppc_vsx_lxvp DQForm:$src)), (LXVP memrix16:$src)>;
+ def : Pat<(v256i1 (int_ppc_vsx_lxvp XForm:$src)), (LXVPX XForm:$src)>;
let Predicates = [PairedVectorMemops, PrefixInstrs] in {
- def : Pat<(v256i1 (int_ppc_vsx_lxvp iaddrX34:$src)), (PLXVP memri34:$src)>;
+ def : Pat<(v256i1 (int_ppc_vsx_lxvp PDForm:$src)), (PLXVP memri34:$src)>;
}
// Intrinsics for Paired Vector Stores.
- def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, iaddrX16:$dst),
+ def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, DQForm:$dst),
(STXVP $XSp, memrix16:$dst)>;
- def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, xaddrX16:$dst),
- (STXVPX $XSp, xaddrX16:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, XForm:$dst),
+ (STXVPX $XSp, XForm:$dst)>;
let Predicates = [PairedVectorMemops, PrefixInstrs] in {
- def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, iaddrX34:$dst),
+ def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, PDForm:$dst),
(PSTXVP $XSp, memri34:$dst)>;
}
}
-// TODO: We have an added complexity of 500 here. This is only a temporary
-// solution to have tablegen consider these patterns first. The way we do
-// addressing for PowerPC is complex depending on available D form, X form, or
-// aligned D form loads/stores like DS and DQ forms. The prefixed
-// instructions in this file also add additional PC Relative loads/stores
-// and D form loads/stores with 34 bit immediates. It is very difficult to force
-// instruction selection to consistently pick these first without the current
-// added complexity. Once pc-relative implementation is complete, a set of
-// follow-up patches will address this refactoring and the AddedComplexity will
-// be removed.
-let Predicates = [PCRelativeMemops], AddedComplexity = 500 in {
+let Predicates = [PCRelativeMemops] in {
// Load i32
- def : Pat<(i32 (zextloadi1 (PPCmatpcreladdr pcreladdr:$ga))),
+ def : Pat<(i32 (zextloadi1 (PPCmatpcreladdr PCRelForm:$ga))),
(PLBZpc $ga, 0)>;
- def : Pat<(i32 (extloadi1 (PPCmatpcreladdr pcreladdr:$ga))),
+ def : Pat<(i32 (extloadi1 (PPCmatpcreladdr PCRelForm:$ga))),
(PLBZpc $ga, 0)>;
- def : Pat<(i32 (zextloadi8 (PPCmatpcreladdr pcreladdr:$ga))),
+ def : Pat<(i32 (zextloadi8 (PPCmatpcreladdr PCRelForm:$ga))),
(PLBZpc $ga, 0)>;
- def : Pat<(i32 (extloadi8 (PPCmatpcreladdr pcreladdr:$ga))),
+ def : Pat<(i32 (extloadi8 (PPCmatpcreladdr PCRelForm:$ga))),
(PLBZpc $ga, 0)>;
- def : Pat<(i32 (sextloadi16 (PPCmatpcreladdr pcreladdr:$ga))),
+ def : Pat<(i32 (sextloadi16 (PPCmatpcreladdr PCRelForm:$ga))),
(PLHApc $ga, 0)>;
- def : Pat<(i32 (zextloadi16 (PPCmatpcreladdr pcreladdr:$ga))),
+ def : Pat<(i32 (zextloadi16 (PPCmatpcreladdr PCRelForm:$ga))),
(PLHZpc $ga, 0)>;
- def : Pat<(i32 (extloadi16 (PPCmatpcreladdr pcreladdr:$ga))),
+ def : Pat<(i32 (extloadi16 (PPCmatpcreladdr PCRelForm:$ga))),
(PLHZpc $ga, 0)>;
- def : Pat<(i32 (load (PPCmatpcreladdr pcreladdr:$ga))), (PLWZpc $ga, 0)>;
+ def : Pat<(i32 (load (PPCmatpcreladdr PCRelForm:$ga))), (PLWZpc $ga, 0)>;
// Store i32
- def : Pat<(truncstorei8 i32:$RS, (PPCmatpcreladdr pcreladdr:$ga)),
+ def : Pat<(truncstorei8 i32:$RS, (PPCmatpcreladdr PCRelForm:$ga)),
(PSTBpc $RS, $ga, 0)>;
- def : Pat<(truncstorei16 i32:$RS, (PPCmatpcreladdr pcreladdr:$ga)),
+ def : Pat<(truncstorei16 i32:$RS, (PPCmatpcreladdr PCRelForm:$ga)),
(PSTHpc $RS, $ga, 0)>;
- def : Pat<(store i32:$RS, (PPCmatpcreladdr pcreladdr:$ga)),
+ def : Pat<(store i32:$RS, (PPCmatpcreladdr PCRelForm:$ga)),
(PSTWpc $RS, $ga, 0)>;
// Load i64
- def : Pat<(i64 (zextloadi1 (PPCmatpcreladdr pcreladdr:$ga))),
+ def : Pat<(i64 (zextloadi1 (PPCmatpcreladdr PCRelForm:$ga))),
(PLBZ8pc $ga, 0)>;
- def : Pat<(i64 (extloadi1 (PPCmatpcreladdr pcreladdr:$ga))),
+ def : Pat<(i64 (extloadi1 (PPCmatpcreladdr PCRelForm:$ga))),
(PLBZ8pc $ga, 0)>;
- def : Pat<(i64 (zextloadi8 (PPCmatpcreladdr pcreladdr:$ga))),
+ def : Pat<(i64 (zextloadi8 (PPCmatpcreladdr PCRelForm:$ga))),
(PLBZ8pc $ga, 0)>;
- def : Pat<(i64 (extloadi8 (PPCmatpcreladdr pcreladdr:$ga))),
+ def : Pat<(i64 (extloadi8 (PPCmatpcreladdr PCRelForm:$ga))),
(PLBZ8pc $ga, 0)>;
- def : Pat<(i64 (sextloadi16 (PPCmatpcreladdr pcreladdr:$ga))),
+ def : Pat<(i64 (sextloadi16 (PPCmatpcreladdr PCRelForm:$ga))),
(PLHA8pc $ga, 0)>;
- def : Pat<(i64 (zextloadi16 (PPCmatpcreladdr pcreladdr:$ga))),
+ def : Pat<(i64 (zextloadi16 (PPCmatpcreladdr PCRelForm:$ga))),
(PLHZ8pc $ga, 0)>;
- def : Pat<(i64 (extloadi16 (PPCmatpcreladdr pcreladdr:$ga))),
+ def : Pat<(i64 (extloadi16 (PPCmatpcreladdr PCRelForm:$ga))),
(PLHZ8pc $ga, 0)>;
- def : Pat<(i64 (zextloadi32 (PPCmatpcreladdr pcreladdr:$ga))),
+ def : Pat<(i64 (zextloadi32 (PPCmatpcreladdr PCRelForm:$ga))),
(PLWZ8pc $ga, 0)>;
- def : Pat<(i64 (sextloadi32 (PPCmatpcreladdr pcreladdr:$ga))),
+ def : Pat<(i64 (sextloadi32 (PPCmatpcreladdr PCRelForm:$ga))),
(PLWA8pc $ga, 0)>;
- def : Pat<(i64 (extloadi32 (PPCmatpcreladdr pcreladdr:$ga))),
+ def : Pat<(i64 (extloadi32 (PPCmatpcreladdr PCRelForm:$ga))),
(PLWZ8pc $ga, 0)>;
- def : Pat<(i64 (load (PPCmatpcreladdr pcreladdr:$ga))), (PLDpc $ga, 0)>;
+ def : Pat<(i64 (load (PPCmatpcreladdr PCRelForm:$ga))), (PLDpc $ga, 0)>;
// Store i64
- def : Pat<(truncstorei8 i64:$RS, (PPCmatpcreladdr pcreladdr:$ga)),
+ def : Pat<(truncstorei8 i64:$RS, (PPCmatpcreladdr PCRelForm:$ga)),
(PSTB8pc $RS, $ga, 0)>;
- def : Pat<(truncstorei16 i64:$RS, (PPCmatpcreladdr pcreladdr:$ga)),
+ def : Pat<(truncstorei16 i64:$RS, (PPCmatpcreladdr PCRelForm:$ga)),
(PSTH8pc $RS, $ga, 0)>;
- def : Pat<(truncstorei32 i64:$RS, (PPCmatpcreladdr pcreladdr:$ga)),
+ def : Pat<(truncstorei32 i64:$RS, (PPCmatpcreladdr PCRelForm:$ga)),
(PSTW8pc $RS, $ga, 0)>;
- def : Pat<(store i64:$RS, (PPCmatpcreladdr pcreladdr:$ga)),
+ def : Pat<(store i64:$RS, (PPCmatpcreladdr PCRelForm:$ga)),
(PSTDpc $RS, $ga, 0)>;
// Load f32
- def : Pat<(f32 (load (PPCmatpcreladdr pcreladdr:$addr))), (PLFSpc $addr, 0)>;
+ def : Pat<(f32 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLFSpc $addr, 0)>;
// Store f32
- def : Pat<(store f32:$FRS, (PPCmatpcreladdr pcreladdr:$ga)),
+ def : Pat<(store f32:$FRS, (PPCmatpcreladdr PCRelForm:$ga)),
(PSTFSpc $FRS, $ga, 0)>;
// Load f64
- def : Pat<(f64 (extloadf32 (PPCmatpcreladdr pcreladdr:$addr))),
+ def : Pat<(f64 (extloadf32 (PPCmatpcreladdr PCRelForm:$addr))),
(COPY_TO_REGCLASS (PLFSpc $addr, 0), VSFRC)>;
- def : Pat<(f64 (load (PPCmatpcreladdr pcreladdr:$addr))), (PLFDpc $addr, 0)>;
+ def : Pat<(f64 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLFDpc $addr, 0)>;
// Store f64
- def : Pat<(store f64:$FRS, (PPCmatpcreladdr pcreladdr:$ga)),
+ def : Pat<(store f64:$FRS, (PPCmatpcreladdr PCRelForm:$ga)),
(PSTFDpc $FRS, $ga, 0)>;
// Load f128
- def : Pat<(f128 (load (PPCmatpcreladdr pcreladdr:$addr))),
+ def : Pat<(f128 (load (PPCmatpcreladdr PCRelForm:$addr))),
(COPY_TO_REGCLASS (PLXVpc $addr, 0), VRRC)>;
// Store f128
- def : Pat<(store f128:$XS, (PPCmatpcreladdr pcreladdr:$ga)),
+ def : Pat<(store f128:$XS, (PPCmatpcreladdr PCRelForm:$ga)),
(PSTXVpc (COPY_TO_REGCLASS $XS, VSRC), $ga, 0)>;
// Load v4i32
- def : Pat<(v4i32 (load (PPCmatpcreladdr pcreladdr:$addr))), (PLXVpc $addr, 0)>;
+ def : Pat<(v4i32 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLXVpc $addr, 0)>;
// Store v4i32
- def : Pat<(store v4i32:$XS, (PPCmatpcreladdr pcreladdr:$ga)),
+ def : Pat<(store v4i32:$XS, (PPCmatpcreladdr PCRelForm:$ga)),
(PSTXVpc $XS, $ga, 0)>;
// Load v2i64
- def : Pat<(v2i64 (load (PPCmatpcreladdr pcreladdr:$addr))), (PLXVpc $addr, 0)>;
+ def : Pat<(v2i64 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLXVpc $addr, 0)>;
// Store v2i64
- def : Pat<(store v2i64:$XS, (PPCmatpcreladdr pcreladdr:$ga)),
+ def : Pat<(store v2i64:$XS, (PPCmatpcreladdr PCRelForm:$ga)),
(PSTXVpc $XS, $ga, 0)>;
// Load v4f32
- def : Pat<(v4f32 (load (PPCmatpcreladdr pcreladdr:$addr))), (PLXVpc $addr, 0)>;
+ def : Pat<(v4f32 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLXVpc $addr, 0)>;
// Store v4f32
- def : Pat<(store v4f32:$XS, (PPCmatpcreladdr pcreladdr:$ga)),
+ def : Pat<(store v4f32:$XS, (PPCmatpcreladdr PCRelForm:$ga)),
(PSTXVpc $XS, $ga, 0)>;
// Load v2f64
- def : Pat<(v2f64 (load (PPCmatpcreladdr pcreladdr:$addr))), (PLXVpc $addr, 0)>;
+ def : Pat<(v2f64 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLXVpc $addr, 0)>;
// Store v2f64
- def : Pat<(store v2f64:$XS, (PPCmatpcreladdr pcreladdr:$ga)),
+ def : Pat<(store v2f64:$XS, (PPCmatpcreladdr PCRelForm:$ga)),
(PSTXVpc $XS, $ga, 0)>;
// Atomic Load
- def : Pat<(atomic_load_8 (PPCmatpcreladdr pcreladdr:$ga)),
+ def : Pat<(atomic_load_8 (PPCmatpcreladdr PCRelForm:$ga)),
(PLBZpc $ga, 0)>;
- def : Pat<(atomic_load_16 (PPCmatpcreladdr pcreladdr:$ga)),
+ def : Pat<(atomic_load_16 (PPCmatpcreladdr PCRelForm:$ga)),
(PLHZpc $ga, 0)>;
- def : Pat<(atomic_load_32 (PPCmatpcreladdr pcreladdr:$ga)),
+ def : Pat<(atomic_load_32 (PPCmatpcreladdr PCRelForm:$ga)),
(PLWZpc $ga, 0)>;
- def : Pat<(atomic_load_64 (PPCmatpcreladdr pcreladdr:$ga)),
+ def : Pat<(atomic_load_64 (PPCmatpcreladdr PCRelForm:$ga)),
(PLDpc $ga, 0)>;
// Atomic Store
- def : Pat<(atomic_store_8 (PPCmatpcreladdr pcreladdr:$ga), i32:$RS),
+ def : Pat<(atomic_store_8 (PPCmatpcreladdr PCRelForm:$ga), i32:$RS),
(PSTBpc $RS, $ga, 0)>;
- def : Pat<(atomic_store_16 (PPCmatpcreladdr pcreladdr:$ga), i32:$RS),
+ def : Pat<(atomic_store_16 (PPCmatpcreladdr PCRelForm:$ga), i32:$RS),
(PSTHpc $RS, $ga, 0)>;
- def : Pat<(atomic_store_32 (PPCmatpcreladdr pcreladdr:$ga), i32:$RS),
+ def : Pat<(atomic_store_32 (PPCmatpcreladdr PCRelForm:$ga), i32:$RS),
(PSTWpc $RS, $ga, 0)>;
- def : Pat<(atomic_store_8 (PPCmatpcreladdr pcreladdr:$ga), i64:$RS),
+ def : Pat<(atomic_store_8 (PPCmatpcreladdr PCRelForm:$ga), i64:$RS),
(PSTB8pc $RS, $ga, 0)>;
- def : Pat<(atomic_store_16 (PPCmatpcreladdr pcreladdr:$ga), i64:$RS),
+ def : Pat<(atomic_store_16 (PPCmatpcreladdr PCRelForm:$ga), i64:$RS),
(PSTH8pc $RS, $ga, 0)>;
- def : Pat<(atomic_store_32 (PPCmatpcreladdr pcreladdr:$ga), i64:$RS),
+ def : Pat<(atomic_store_32 (PPCmatpcreladdr PCRelForm:$ga), i64:$RS),
(PSTW8pc $RS, $ga, 0)>;
- def : Pat<(atomic_store_64 (PPCmatpcreladdr pcreladdr:$ga), i64:$RS),
+ def : Pat<(atomic_store_64 (PPCmatpcreladdr PCRelForm:$ga), i64:$RS),
(PSTDpc $RS, $ga, 0)>;
// Special Cases For PPCstore_scal_int_from_vsr
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_sint_in_vsr f64:$src)),
- (PPCmatpcreladdr pcreladdr:$dst), 8),
+ (PPCmatpcreladdr PCRelForm:$dst), 8),
(PSTXSDpc (XSCVDPSXDS f64:$src), $dst, 0)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_sint_in_vsr f128:$src)),
- (PPCmatpcreladdr pcreladdr:$dst), 8),
+ (PPCmatpcreladdr PCRelForm:$dst), 8),
(PSTXSDpc (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC), $dst, 0)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_uint_in_vsr f64:$src)),
- (PPCmatpcreladdr pcreladdr:$dst), 8),
+ (PPCmatpcreladdr PCRelForm:$dst), 8),
(PSTXSDpc (XSCVDPUXDS f64:$src), $dst, 0)>;
def : Pat<(PPCstore_scal_int_from_vsr
(f64 (PPCcv_fp_to_uint_in_vsr f128:$src)),
- (PPCmatpcreladdr pcreladdr:$dst), 8),
+ (PPCmatpcreladdr PCRelForm:$dst), 8),
(PSTXSDpc (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC), $dst, 0)>;
+ def : Pat<(v4f32 (PPCldvsxlh (PPCmatpcreladdr PCRelForm:$addr))),
+ (SUBREG_TO_REG (i64 1), (PLFDpc $addr, 0), sub_64)>;
+
// If the PPCmatpcreladdr node is not caught by any other pattern it should be
// caught here and turned into a paddi instruction to materialize the address.
- def : Pat<(PPCmatpcreladdr pcreladdr:$addr), (PADDI8pc 0, $addr)>;
+ def : Pat<(PPCmatpcreladdr PCRelForm:$addr), (PADDI8pc 0, $addr)>;
// PPCtlsdynamatpcreladdr node is used for TLS dynamic models to materialize
// tls global address with paddi instruction.
- def : Pat<(PPCtlsdynamatpcreladdr pcreladdr:$addr), (PADDI8pc 0, $addr)>;
+ def : Pat<(PPCtlsdynamatpcreladdr PCRelForm:$addr), (PADDI8pc 0, $addr)>;
// PPCtlslocalexecmataddr node is used for TLS local exec models to
// materialize tls global address with paddi instruction.
def : Pat<(PPCaddTls i64:$in, (PPCtlslocalexecmataddr tglobaltlsaddr:$addr)),
@@ -1861,15 +1850,6 @@ let Predicates = [PCRelativeMemops], AddedComplexity = 500 in {
}
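Switching these selections from `pcreladdr` to the `PCRelForm` complex pattern keeps PC-relative folding working under the new unified addressing-mode selection. As a reminder of what they cover, a hedged sketch with a hypothetical external global whose accesses can fold into `@pcrel` loads and stores on an ISA 3.1 target:

```cpp
// Sketch only: with PC-relative addressing available, the PPCmatpcreladdr
// patterns above can fold the address of a global directly into
// plwz/pstw @pcrel instead of going through the TOC.
extern int counter;  // hypothetical external global

int bump_counter(void) {
  return ++counter;  // load and store are both @pcrel candidates
}
```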
let Predicates = [PrefixInstrs] in {
- def XXSPLTIW : 8RR_DForm_IMM32_XT6<32, 3, (outs vsrc:$XT),
- (ins i32imm:$IMM32),
- "xxspltiw $XT, $IMM32", IIC_VecGeneral,
- []>;
- def XXSPLTIDP : 8RR_DForm_IMM32_XT6<32, 2, (outs vsrc:$XT),
- (ins i32imm:$IMM32),
- "xxspltidp $XT, $IMM32", IIC_VecGeneral,
- [(set v2f64:$XT,
- (PPCxxspltidp i32:$IMM32))]>;
def XXPERMX :
8RR_XX4Form_IMM3_XTABC6<34, 0, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB,
vsrc:$XC, u3imm:$UIM),
@@ -1893,9 +1873,18 @@ let Predicates = [PrefixInstrs] in {
IIC_VecGeneral, []>;
}
-// XXSPLI32DX needs extra flags to make sure the compiler does not attempt
+// XXSPLTIW/DP/32DX need extra flags to make sure the compiler does not attempt
// to spill part of the instruction when the values are similar.
-let isReMaterializable = 1, isMoveImm = 1, Predicates = [PrefixInstrs] in {
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1, Predicates = [PrefixInstrs] in {
+ def XXSPLTIW : 8RR_DForm_IMM32_XT6<32, 3, (outs vsrc:$XT),
+ (ins i32imm:$IMM32),
+ "xxspltiw $XT, $IMM32", IIC_VecGeneral,
+ []>;
+ def XXSPLTIDP : 8RR_DForm_IMM32_XT6<32, 2, (outs vsrc:$XT),
+ (ins i32imm:$IMM32),
+ "xxspltidp $XT, $IMM32", IIC_VecGeneral,
+ [(set v2f64:$XT,
+ (PPCxxspltidp i32:$IMM32))]>;
def XXSPLTI32DX :
8RR_DForm_IMM32_XT6_IX<32, 0, (outs vsrc:$XT),
(ins vsrc:$XTi, u1imm:$IX, i32imm:$IMM32),
@@ -1934,7 +1923,7 @@ let Predicates = [IsISA3_1] in {
[(set v16i8:$VRT,
(int_ppc_altivec_vsldbi v16i8:$VRA,
v16i8:$VRB,
- i32:$SH))]>;
+ timm:$SH))]>;
def VSRDBI : VNForm_VTAB5_SD3<22, 1, (outs vrrc:$VRT),
(ins vrrc:$VRA, vrrc:$VRB, u3imm:$SH),
"vsrdbi $VRT, $VRA, $VRB, $SH",
@@ -1942,7 +1931,7 @@ let Predicates = [IsISA3_1] in {
[(set v16i8:$VRT,
(int_ppc_altivec_vsrdbi v16i8:$VRA,
v16i8:$VRB,
- i32:$SH))]>;
+ timm:$SH))]>;
defm VSTRIBR : VXForm_VTB5_RCr<13, 1, (outs vrrc:$vT), (ins vrrc:$vB),
"vstribr", "$vT, $vB", IIC_VecGeneral,
[(set v16i8:$vT,
@@ -2678,6 +2667,45 @@ def : Pat<(f64 nzFPImmAsi64:$A),
// nand(A, nand(B, C))
def : xxevalPattern<(or (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)),
!sub(255, 14)>;
+
+ // Anonymous patterns to select prefixed VSX loads and stores.
+ // Load / Store f128
+ def : Pat<(f128 (load PDForm:$src)),
+ (COPY_TO_REGCLASS (PLXV memri34:$src), VRRC)>;
+ def : Pat<(store f128:$XS, PDForm:$dst),
+ (PSTXV (COPY_TO_REGCLASS $XS, VSRC), memri34:$dst)>;
+
+ // Load / Store v4i32
+ def : Pat<(v4i32 (load PDForm:$src)), (PLXV memri34:$src)>;
+ def : Pat<(store v4i32:$XS, PDForm:$dst), (PSTXV $XS, memri34:$dst)>;
+
+ // Load / Store v2i64
+ def : Pat<(v2i64 (load PDForm:$src)), (PLXV memri34:$src)>;
+ def : Pat<(store v2i64:$XS, PDForm:$dst), (PSTXV $XS, memri34:$dst)>;
+
+ // Load / Store v4f32
+ def : Pat<(v4f32 (load PDForm:$src)), (PLXV memri34:$src)>;
+ def : Pat<(store v4f32:$XS, PDForm:$dst), (PSTXV $XS, memri34:$dst)>;
+
+ // Load / Store v2f64
+ def : Pat<(v2f64 (load PDForm:$src)), (PLXV memri34:$src)>;
+ def : Pat<(store v2f64:$XS, PDForm:$dst), (PSTXV $XS, memri34:$dst)>;
+
+ // Cases For PPCstore_scal_int_from_vsr
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), PDForm:$dst, 8),
+ (PSTXSD (XSCVDPUXDS f64:$src), PDForm:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), PDForm:$dst, 8),
+ (PSTXSD (XSCVDPSXDS f64:$src), PDForm:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), PDForm:$dst, 8),
+ (PSTXSD (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC),
+ PDForm:$dst)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), PDForm:$dst, 8),
+ (PSTXSD (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC),
+ PDForm:$dst)>;
}
let Predicates = [PrefixInstrs] in {
@@ -2701,138 +2729,192 @@ let Predicates = [PrefixInstrs] in {
(XXBLENDVW $A, $B, $C)>;
def : Pat<(int_ppc_vsx_xxblendvd v2i64:$A, v2i64:$B, v2i64:$C),
(XXBLENDVD $A, $B, $C)>;
+
+ // Anonymous patterns to select prefixed loads and stores.
+ // Load i32
+ def : Pat<(i32 (extloadi1 PDForm:$src)), (PLBZ memri34:$src)>;
+ def : Pat<(i32 (zextloadi1 PDForm:$src)), (PLBZ memri34:$src)>;
+ def : Pat<(i32 (extloadi8 PDForm:$src)), (PLBZ memri34:$src)>;
+ def : Pat<(i32 (zextloadi8 PDForm:$src)), (PLBZ memri34:$src)>;
+ def : Pat<(i32 (extloadi16 PDForm:$src)), (PLHZ memri34:$src)>;
+ def : Pat<(i32 (zextloadi16 PDForm:$src)), (PLHZ memri34:$src)>;
+ def : Pat<(i32 (sextloadi16 PDForm:$src)), (PLHA memri34:$src)>;
+ def : Pat<(i32 (load PDForm:$src)), (PLWZ memri34:$src)>;
+
+ // Store i32
+ def : Pat<(truncstorei8 i32:$rS, PDForm:$dst), (PSTB gprc:$rS, memri34:$dst)>;
+ def : Pat<(truncstorei16 i32:$rS, PDForm:$dst), (PSTH gprc:$rS, memri34:$dst)>;
+ def : Pat<(store i32:$rS, PDForm:$dst), (PSTW gprc:$rS, memri34:$dst)>;
+
+ // Load i64
+ def : Pat<(i64 (extloadi1 PDForm:$src)), (PLBZ8 memri34:$src)>;
+ def : Pat<(i64 (zextloadi1 PDForm:$src)), (PLBZ8 memri34:$src)>;
+ def : Pat<(i64 (extloadi8 PDForm:$src)), (PLBZ8 memri34:$src)>;
+ def : Pat<(i64 (zextloadi8 PDForm:$src)), (PLBZ8 memri34:$src)>;
+ def : Pat<(i64 (extloadi16 PDForm:$src)), (PLHZ8 memri34:$src)>;
+ def : Pat<(i64 (zextloadi16 PDForm:$src)), (PLHZ8 memri34:$src)>;
+ def : Pat<(i64 (sextloadi16 PDForm:$src)), (PLHA8 memri34:$src)>;
+ def : Pat<(i64 (extloadi32 PDForm:$src)), (PLWZ8 memri34:$src)>;
+ def : Pat<(i64 (zextloadi32 PDForm:$src)), (PLWZ8 memri34:$src)>;
+ def : Pat<(i64 (sextloadi32 PDForm:$src)), (PLWA8 memri34:$src)>;
+ def : Pat<(i64 (load PDForm:$src)), (PLD memri34:$src)>;
+
+ // Store i64
+ def : Pat<(truncstorei8 i64:$rS, PDForm:$dst), (PSTB8 g8rc:$rS, memri34:$dst)>;
+ def : Pat<(truncstorei16 i64:$rS, PDForm:$dst), (PSTH8 g8rc:$rS, memri34:$dst)>;
+ def : Pat<(truncstorei32 i64:$rS, PDForm:$dst), (PSTW8 g8rc:$rS, memri34:$dst)>;
+ def : Pat<(store i64:$rS, PDForm:$dst), (PSTD g8rc:$rS, memri34:$dst)>;
+
+ // Load / Store f32
+ def : Pat<(f32 (load PDForm:$src)), (PLFS memri34:$src)>;
+ def : Pat<(store f32:$FRS, PDForm:$dst), (PSTFS $FRS, memri34:$dst)>;
+
+ // Load / Store f64
+ def : Pat<(f64 (extloadf32 PDForm:$src)),
+ (COPY_TO_REGCLASS (PLFS memri34:$src), VSFRC)>;
+ def : Pat<(f64 (load PDForm:$src)), (PLFD memri34:$src)>;
+ def : Pat<(store f64:$FRS, PDForm:$dst), (PSTFD $FRS, memri34:$dst)>;
+
+ // Atomic Load
+ def : Pat<(atomic_load_8 PDForm:$src), (PLBZ memri34:$src)>;
+ def : Pat<(atomic_load_16 PDForm:$src), (PLHZ memri34:$src)>;
+ def : Pat<(atomic_load_32 PDForm:$src), (PLWZ memri34:$src)>;
+ def : Pat<(atomic_load_64 PDForm:$src), (PLD memri34:$src)>;
+
+ // Atomic Store
+ def : Pat<(atomic_store_8 PDForm:$dst, i32:$RS), (PSTB $RS, memri34:$dst)>;
+ def : Pat<(atomic_store_16 PDForm:$dst, i32:$RS), (PSTH $RS, memri34:$dst)>;
+ def : Pat<(atomic_store_32 PDForm:$dst, i32:$RS), (PSTW $RS, memri34:$dst)>;
+ def : Pat<(atomic_store_64 PDForm:$dst, i64:$RS), (PSTD $RS, memri34:$dst)>;
+
+ // Prefixed fpext to v2f64
+ def : Pat<(v4f32 (PPCldvsxlh PDForm:$src)),
+ (SUBREG_TO_REG (i64 1), (PLFD PDForm:$src), sub_64)>;
}
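These `PDForm` patterns, together with the VSX ones added a little earlier, let instruction selection use the prefixed 34-bit displacement encodings directly. A hedged sketch (hypothetical helper names) of an access whose offset is too large for a 16-bit D-form but fits the prefixed form:

```cpp
// Sketch only: a 400000-byte displacement exceeds the 16-bit D-form range but
// fits the 34-bit prefixed form, so a pwr10 build can fold it into plwz/pstw
// instead of materializing the offset into a register first.
int read_far(const int *base) { return base[100000]; }

void write_far(int *base, int v) { base[100001] = v; }
```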
def InsertEltShift {
- dag Sub32Left0 = (EXTRACT_SUBREG $rB, sub_32);
+ dag Sub32 = (i32 (EXTRACT_SUBREG $rB, sub_32));
dag Sub32Left1 = (RLWINM (EXTRACT_SUBREG $rB, sub_32), 1, 0, 30);
dag Sub32Left2 = (RLWINM (EXTRACT_SUBREG $rB, sub_32), 2, 0, 29);
+ dag Left1 = (RLWINM $rB, 1, 0, 30);
+ dag Left2 = (RLWINM $rB, 2, 0, 29);
dag Left3 = (RLWINM8 $rB, 3, 0, 28);
}
let Predicates = [IsISA3_1, HasVSX, IsLittleEndian] in {
// Indexed vector insert element
- def : Pat<(v16i8 (PPCvecinsertelt v16i8:$vDi, i32:$rA, i64:$rB)),
- (VINSBRX $vDi, InsertEltShift.Sub32Left0, $rA)>;
- def : Pat<(v8i16 (PPCvecinsertelt v8i16:$vDi, i32:$rA, i64:$rB)),
+ def : Pat<(v16i8 (vector_insert v16i8:$vDi, i32:$rA, i64:$rB)),
+ (VINSBRX $vDi, InsertEltShift.Sub32, $rA)>;
+ def : Pat<(v8i16 (vector_insert v8i16:$vDi, i32:$rA, i64:$rB)),
(VINSHRX $vDi, InsertEltShift.Sub32Left1, $rA)>;
- def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, i64:$rB)),
+ def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, i64:$rB)),
(VINSWRX $vDi, InsertEltShift.Sub32Left2, $rA)>;
- def : Pat<(v2i64 (PPCvecinsertelt v2i64:$vDi, i64:$rA, i64:$rB)),
+ def : Pat<(v2i64 (insertelt v2i64:$vDi, i64:$rA, i64:$rB)),
(VINSDRX $vDi, InsertEltShift.Left3, $rA)>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, f32:$A, i64:$rB)),
- (VINSWRX $vDi, InsertEltShift.Sub32Left2, Bitcast.FltToInt)>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)), i64:$rB)),
+ def : Pat<(v4f32 (insertelt v4f32:$vDi, f32:$rA, i64:$rB)),
+ (VINSWVRX $vDi, InsertEltShift.Sub32Left2, (XSCVDPSPN $rA))>;
+ def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddr:$rA)), i64:$rB)),
(VINSWRX $vDi, InsertEltShift.Sub32Left2, (LWZ memri:$rA))>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), i64:$rB)),
+ def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), i64:$rB)),
(VINSWRX $vDi, InsertEltShift.Sub32Left2, (PLWZ memri34:$rA))>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)), i64:$rB)),
+ def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load xaddr:$rA)), i64:$rB)),
(VINSWRX $vDi, InsertEltShift.Sub32Left2, (LWZX memrr:$rA))>;
- def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, f64:$A, i64:$rB)),
+ def : Pat<(v2f64 (insertelt v2f64:$vDi, f64:$A, i64:$rB)),
(VINSDRX $vDi, InsertEltShift.Left3, Bitcast.DblToLong)>;
- def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load iaddrX4:$rA)), i64:$rB)),
+ def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load iaddrX4:$rA)), i64:$rB)),
(VINSDRX $vDi, InsertEltShift.Left3, (LD memrix:$rA))>;
- def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load iaddrX34:$rA)), i64:$rB)),
+ def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load iaddrX34:$rA)), i64:$rB)),
(VINSDRX $vDi, InsertEltShift.Left3, (PLD memri34:$rA))>;
- def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load xaddrX4:$rA)), i64:$rB)),
+ def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load xaddrX4:$rA)), i64:$rB)),
(VINSDRX $vDi, InsertEltShift.Left3, (LDX memrr:$rA))>;
-
- // Immediate vector insert element
- foreach i = [0, 1, 2, 3] in {
- def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, (i64 i))),
- (VINSW $vDi, !mul(!sub(3, i), 4), $rA)>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)), (i64 i))),
- (VINSW $vDi, !mul(!sub(3, i), 4), (LWZ memri:$rA))>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), (i64 i))),
- (VINSW $vDi, !mul(!sub(3, i), 4), (PLWZ memri34:$rA))>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)), (i64 i))),
- (VINSW $vDi, !mul(!sub(3, i), 4), (LWZX memrr:$rA))>;
+ let AddedComplexity = 400 in {
+ // Immediate vector insert element
+ foreach Idx = [0, 1, 2, 3] in {
+ def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, Idx)),
+ (VINSW $vDi, !mul(!sub(3, Idx), 4), $rA)>;
+ def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddr:$rA)), Idx)),
+ (VINSW $vDi, !mul(!sub(3, Idx), 4), (LWZ memri:$rA))>;
+ def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), Idx)),
+ (VINSW $vDi, !mul(!sub(3, Idx), 4), (PLWZ memri34:$rA))>;
+ def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load xaddr:$rA)), Idx)),
+ (VINSW $vDi, !mul(!sub(3, Idx), 4), (LWZX memrr:$rA))>;
+ }
+ foreach i = [0, 1] in
+ def : Pat<(v2i64 (insertelt v2i64:$vDi, i64:$rA, (i64 i))),
+ (VINSD $vDi, !mul(!sub(1, i), 8), $rA)>;
}
- foreach i = [0, 1] in
- def : Pat<(v2i64 (PPCvecinsertelt v2i64:$vDi, i64:$rA, (i64 i))),
- (VINSD $vDi, !mul(!sub(1, i), 8), $rA)>;
}
let Predicates = [IsISA3_1, HasVSX, IsBigEndian, IsPPC32] in {
// Indexed vector insert element
- def : Pat<(v16i8 (PPCvecinsertelt v16i8:$vDi, i32:$rA, i32:$rB)),
+ def : Pat<(v16i8 (vector_insert v16i8:$vDi, i32:$rA, i32:$rB)),
(VINSBLX $vDi, $rB, $rA)>;
- def : Pat<(v8i16 (PPCvecinsertelt v8i16:$vDi, i32:$rA, i32:$rB)),
- (VINSHLX $vDi, $rB, $rA)>;
- def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, i32:$rB)),
- (VINSWLX $vDi, $rB, $rA)>;
-
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, f32:$A, i32:$rB)),
- (VINSWLX $vDi, $rB, Bitcast.FltToInt)>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)),
- i32:$rB)),
- (VINSWLX $vDi, $rB, (LWZ memri:$rA))>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)),
- i32:$rB)),
- (VINSWLX $vDi, $rB, (PLWZ memri34:$rA))>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)),
- i32:$rB)),
- (VINSWLX $vDi, $rB, (LWZX memrr:$rA))>;
+ def : Pat<(v8i16 (vector_insert v8i16:$vDi, i32:$rA, i32:$rB)),
+ (VINSHLX $vDi, InsertEltShift.Left1, $rA)>;
+ def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, i32:$rB)),
+ (VINSWLX $vDi, InsertEltShift.Left2, $rA)>;
- // Immediate vector insert element
- foreach i = [0, 1, 2, 3] in {
- def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, (i32 i))),
- (VINSW $vDi, !mul(i, 4), $rA)>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)),
- (i32 i))),
- (VINSW $vDi, !mul(i, 4), (LWZ memri:$rA))>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)),
- (i32 i))),
- (VINSW $vDi, !mul(i, 4), (PLWZ memri34:$rA))>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)),
- (i32 i))),
- (VINSW $vDi, !mul(i, 4), (LWZX memrr:$rA))>;
- }
+ def : Pat<(v4f32 (insertelt v4f32:$vDi, f32:$rA, i32:$rB)),
+ (VINSWVLX $vDi, InsertEltShift.Left2, (XSCVDPSPN $rA))>;
+ def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddr:$rA)), i32:$rB)),
+ (VINSWLX v4f32:$vDi, InsertEltShift.Left2, (LWZ memri:$rA))>;
+ def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), i32:$rB)),
+ (VINSWLX v4f32:$vDi, InsertEltShift.Left2, (PLWZ memri34:$rA))>;
+  def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load xaddr:$rA)), i32:$rB)),
+            (VINSWLX v4f32:$vDi, InsertEltShift.Left2, (LWZX memrr:$rA))>;
}
let Predicates = [IsISA3_1, HasVSX, IsBigEndian, IsPPC64] in {
// Indexed vector insert element
- def : Pat<(v16i8 (PPCvecinsertelt v16i8:$vDi, i32:$rA, i64:$rB)),
- (VINSBLX $vDi, InsertEltShift.Sub32Left0, $rA)>;
- def : Pat<(v8i16 (PPCvecinsertelt v8i16:$vDi, i32:$rA, i64:$rB)),
+ def : Pat<(v16i8 (vector_insert v16i8:$vDi, i32:$rA, i64:$rB)),
+ (VINSBLX $vDi, InsertEltShift.Sub32, $rA)>;
+ def : Pat<(v8i16 (vector_insert v8i16:$vDi, i32:$rA, i64:$rB)),
(VINSHLX $vDi, InsertEltShift.Sub32Left1, $rA)>;
- def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, i64:$rB)),
+ def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, i64:$rB)),
(VINSWLX $vDi, InsertEltShift.Sub32Left2, $rA)>;
- def : Pat<(v2i64 (PPCvecinsertelt v2i64:$vDi, i64:$rA, i64:$rB)),
+ def : Pat<(v2i64 (insertelt v2i64:$vDi, i64:$rA, i64:$rB)),
(VINSDLX $vDi, InsertEltShift.Left3, $rA)>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, f32:$A, i64:$rB)),
- (VINSWLX $vDi, InsertEltShift.Sub32Left2, Bitcast.FltToInt)>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)), i64:$rB)),
+ def : Pat<(v4f32 (insertelt v4f32:$vDi, f32:$rA, i64:$rB)),
+ (VINSWVLX $vDi, InsertEltShift.Sub32Left2, (XSCVDPSPN $rA))>;
+ def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddr:$rA)), i64:$rB)),
(VINSWLX $vDi, InsertEltShift.Sub32Left2, (LWZ memri:$rA))>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), i64:$rB)),
+ def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), i64:$rB)),
(VINSWLX $vDi, InsertEltShift.Sub32Left2, (PLWZ memri34:$rA))>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)), i64:$rB)),
+ def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load xaddr:$rA)), i64:$rB)),
(VINSWLX $vDi, InsertEltShift.Sub32Left2, (LWZX memrr:$rA))>;
- def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, f64:$A, i64:$rB)),
+ def : Pat<(v2f64 (insertelt v2f64:$vDi, f64:$A, i64:$rB)),
(VINSDLX $vDi, InsertEltShift.Left3, Bitcast.DblToLong)>;
- def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load iaddrX4:$rA)), i64:$rB)),
+ def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load iaddrX4:$rA)), i64:$rB)),
(VINSDLX $vDi, InsertEltShift.Left3, (LD memrix:$rA))>;
- def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load iaddrX34:$rA)), i64:$rB)),
+ def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load iaddrX34:$rA)), i64:$rB)),
(VINSDLX $vDi, InsertEltShift.Left3, (PLD memri34:$rA))>;
- def : Pat<(v2f64 (PPCvecinsertelt v2f64:$vDi, (f64 (load xaddrX4:$rA)), i64:$rB)),
+ def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load xaddrX4:$rA)), i64:$rB)),
(VINSDLX $vDi, InsertEltShift.Left3, (LDX memrr:$rA))>;
+}
+let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX, IsBigEndian] in {
// Immediate vector insert element
- foreach i = [0, 1, 2, 3] in {
- def : Pat<(v4i32 (PPCvecinsertelt v4i32:$vDi, i32:$rA, (i64 i))),
- (VINSW $vDi, !mul(i, 4), $rA)>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddr:$rA)), (i64 i))),
- (VINSW $vDi, !mul(i, 4), (LWZ memri:$rA))>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), (i64 i))),
- (VINSW $vDi, !mul(i, 4), (PLWZ memri34:$rA))>;
- def : Pat<(v4f32 (PPCvecinsertelt v4f32:$vDi, (f32 (load xaddr:$rA)), (i64 i))),
- (VINSW $vDi, !mul(i, 4), (LWZX memrr:$rA))>;
+ foreach Ty = [i32, i64] in {
+ foreach Idx = [0, 1, 2, 3] in {
+ def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, (Ty Idx))),
+ (VINSW $vDi, !mul(Idx, 4), $rA)>;
+ def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddr:$rA)),
+ (Ty Idx))),
+ (VINSW $vDi, !mul(Idx, 4), (LWZ memri:$rA))>;
+ def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)),
+ (Ty Idx))),
+ (VINSW $vDi, !mul(Idx, 4), (PLWZ memri34:$rA))>;
+ def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load xaddr:$rA)),
+ (Ty Idx))),
+ (VINSW $vDi, !mul(Idx, 4), (LWZX memrr:$rA))>;
+ }
}
- foreach i = [0, 1] in
- def : Pat<(v2i64 (PPCvecinsertelt v2i64:$vDi, i64:$rA, (i64 i))),
- (VINSD $vDi, !mul(i, 8), $rA)>;
+
+ foreach Idx = [0, 1] in
+ def : Pat<(v2i64 (insertelt v2i64:$vDi, i64:$rA, Idx)),
+ (VINSD $vDi, !mul(Idx, 8), $rA)>;
}
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index c0f2aed43a4d..d2d5ca92ca1c 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -138,6 +138,10 @@ def PPCldvsxlh : SDNode<"PPCISD::LD_VSX_LH", SDT_PPCldvsxlh,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCldsplat : SDNode<"PPCISD::LD_SPLAT", SDT_PPCldsplat,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def PPCzextldsplat : SDNode<"PPCISD::ZEXT_LD_SPLAT", SDT_PPCldsplat,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def PPCsextldsplat : SDNode<"PPCISD::SEXT_LD_SPLAT", SDT_PPCldsplat,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCSToV : SDNode<"PPCISD::SCALAR_TO_VECTOR_PERMUTED",
SDTypeProfile<1, 1, []>, []>;
@@ -902,16 +906,13 @@ let hasSideEffects = 0 in {
// Rounding Instructions respecting current rounding mode
def XSRDPIC : XX2Form<60, 107,
(outs vsfrc:$XT), (ins vsfrc:$XB),
- "xsrdpic $XT, $XB", IIC_VecFP,
- [(set f64:$XT, (fnearbyint f64:$XB))]>;
+ "xsrdpic $XT, $XB", IIC_VecFP, []>;
def XVRDPIC : XX2Form<60, 235,
(outs vsrc:$XT), (ins vsrc:$XB),
- "xvrdpic $XT, $XB", IIC_VecFP,
- [(set v2f64:$XT, (fnearbyint v2f64:$XB))]>;
+ "xvrdpic $XT, $XB", IIC_VecFP, []>;
def XVRSPIC : XX2Form<60, 171,
(outs vsrc:$XT), (ins vsrc:$XB),
- "xvrspic $XT, $XB", IIC_VecFP,
- [(set v4f32:$XT, (fnearbyint v4f32:$XB))]>;
+ "xvrspic $XT, $XB", IIC_VecFP, []>;
// Max/Min Instructions
let isCommutable = 1 in {
def XSMAXDP : XX3Form<60, 160,
@@ -1062,6 +1063,14 @@ let hasSideEffects = 0 in {
[(set v2i64:$XT, (PPCxxpermdi v2i64:$XA, v2i64:$XB,
imm32SExt16:$DM))]>;
let isCodeGenOnly = 1 in
+ // Note that the input register class for `$XA` of XXPERMDIs is `vsfrc`, which
+ // is not the same as the input register class (`vsrc`) of the XXPERMDI
+ // instruction. This is done on purpose because:
+ // 1: The input is primarily for loads that load a partial vector (LFIWZX,
+ // etc.), so there is no need for SUBREG_TO_REG.
+ // 2: With the `vsfrc` register class, the final assembly uses float registers
+ // like `f0` instead of vector-scalar registers like `vs0`, which helps
+ // readability.
def XXPERMDIs : XX3Form_2s<60, 10, (outs vsrc:$XT), (ins vsfrc:$XA, u2imm:$DM),
"xxpermdi $XT, $XA, $XA, $DM", IIC_VecPerm, []>;
def XXSEL : XX4Form<60, 3,
@@ -2771,9 +2780,6 @@ def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be ForceXForm:$src)), (LXVD2X ForceXForm:$s
def : Pat<(f32 (any_fround f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPI
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
-def : Pat<(f32 (fnearbyint f32:$S)),
- (f32 (COPY_TO_REGCLASS (XSRDPIC
- (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
def : Pat<(f32 (any_ffloor f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPIM
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
@@ -2792,6 +2798,19 @@ def : Pat<(v4f32 (any_frint v4f32:$S)), (v4f32 (XVRSPIC $S))>;
def : Pat<(f64 (any_frint f64:$S)), (f64 (XSRDPIC $S))>;
def : Pat<(v2f64 (any_frint v2f64:$S)), (v2f64 (XVRDPIC $S))>;
+// Rounding without exceptions (nearbyint). Due to strange tblgen behaviour,
+// these need to be defined after the any_frint versions so ISEL will correctly
+// add the chain to the strict versions.
+def : Pat<(f32 (fnearbyint f32:$S)),
+ (f32 (COPY_TO_REGCLASS (XSRDPIC
+ (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
+def : Pat<(f64 (fnearbyint f64:$S)),
+ (f64 (XSRDPIC $S))>;
+def : Pat<(v2f64 (fnearbyint v2f64:$S)),
+ (v2f64 (XVRDPIC $S))>;
+def : Pat<(v4f32 (fnearbyint v4f32:$S)),
+ (v4f32 (XVRSPIC $S))>;
+
// Materialize a zero-vector of long long
def : Pat<(v2i64 immAllZerosV),
(v2i64 (XXLXORz))>;
@@ -2809,6 +2828,10 @@ def : Pat<(v2i64 (build_vector DblToLong.A, DblToLong.A)),
def : Pat<(v2i64 (build_vector DblToULong.A, DblToULong.A)),
(v2i64 (XXPERMDI (SUBREG_TO_REG (i64 1), (XSCVDPUXDS $A), sub_64),
(SUBREG_TO_REG (i64 1), (XSCVDPUXDS $A), sub_64), 0))>;
+def : Pat<(v4i32 (PPCSToV DblToInt.A)),
+ (v4i32 (SUBREG_TO_REG (i64 1), (XSCVDPSXWS f64:$A), sub_64))>;
+def : Pat<(v4i32 (PPCSToV DblToUInt.A)),
+ (v4i32 (SUBREG_TO_REG (i64 1), (XSCVDPUXWS f64:$A), sub_64))>;
defm : ScalToVecWPermute<
v4i32, FltToIntLoad.A,
(XXSPLTW (SUBREG_TO_REG (i64 1), (XSCVDPSXWSs (XFLOADf32 ForceXForm:$A)), sub_64), 1),
@@ -2823,10 +2846,20 @@ def : Pat<(v4f32 (build_vector (f32 (fpround f64:$A)), (f32 (fpround f64:$A)),
def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)),
(v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>;
+
+// Splat loads.
def : Pat<(v2f64 (PPCldsplat ForceXForm:$A)),
(v2f64 (LXVDSX ForceXForm:$A))>;
+def : Pat<(v4f32 (PPCldsplat ForceXForm:$A)),
+ (v4f32 (XXSPLTW (SUBREG_TO_REG (i64 1), (LFIWZX ForceXForm:$A), sub_64), 1))>;
def : Pat<(v2i64 (PPCldsplat ForceXForm:$A)),
(v2i64 (LXVDSX ForceXForm:$A))>;
+def : Pat<(v4i32 (PPCldsplat ForceXForm:$A)),
+ (v4i32 (XXSPLTW (SUBREG_TO_REG (i64 1), (LFIWZX ForceXForm:$A), sub_64), 1))>;
+def : Pat<(v2i64 (PPCzextldsplat ForceXForm:$A)),
+ (v2i64 (XXPERMDIs (LFIWZX ForceXForm:$A), 0))>;
+def : Pat<(v2i64 (PPCsextldsplat ForceXForm:$A)),
+ (v2i64 (XXPERMDIs (LFIWAX ForceXForm:$A), 0))>;
// Build vectors of floating point converted to i64.
def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)),
@@ -2962,11 +2995,11 @@ def : Pat<(v2i64 (fp_to_uint
def : Pat<WToDPExtractConv.BV02S,
(v2f64 (XVCVSXWDP $A))>;
def : Pat<WToDPExtractConv.BV13S,
- (v2f64 (XVCVSXWDP (XXSLDWI $A, $A, 3)))>;
+ (v2f64 (XVCVSXWDP (XXSLDWI $A, $A, 1)))>;
def : Pat<WToDPExtractConv.BV02U,
(v2f64 (XVCVUXWDP $A))>;
def : Pat<WToDPExtractConv.BV13U,
- (v2f64 (XVCVUXWDP (XXSLDWI $A, $A, 3)))>;
+ (v2f64 (XVCVUXWDP (XXSLDWI $A, $A, 1)))>;
def : Pat<(v2f64 (insertelt v2f64:$A, f64:$B, 0)),
(v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), $B, sub_64), $A, 1))>;
def : Pat<(v2f64 (insertelt v2f64:$A, f64:$B, 1)),
@@ -3536,6 +3569,12 @@ def : Pat<(v16i8 (PPCmtvsrz i32:$A)),
def : Pat<(v4i32 (build_vector immSExt5NonZero:$A, immSExt5NonZero:$A,
immSExt5NonZero:$A, immSExt5NonZero:$A)),
(v4i32 (VSPLTISW imm:$A))>;
+
+// Splat loads.
+def : Pat<(v8i16 (PPCldsplat ForceXForm:$A)),
+ (v8i16 (VSPLTHs 3, (MTVSRWZ (LHZX ForceXForm:$A))))>;
+def : Pat<(v16i8 (PPCldsplat ForceXForm:$A)),
+ (v16i8 (VSPLTBs 7, (MTVSRWZ (LBZX ForceXForm:$A))))>;
} // HasVSX, HasDirectMove
// Big endian VSX subtarget with direct moves.
@@ -3547,7 +3586,7 @@ defm : ScalToVecWPermute<
(SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64)>;
defm : ScalToVecWPermute<
v8i16, (i32 i32:$A),
- (SUBREG_TO_REG (i64 1), MovesToVSR.BE_BYTE_0, sub_64),
+ (SUBREG_TO_REG (i64 1), MovesToVSR.BE_HALF_0, sub_64),
(SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64)>;
defm : ScalToVecWPermute<
v4i32, (i32 i32:$A),
@@ -4083,6 +4122,10 @@ def : Pat<(v4f32 (PPCldsplat ForceXForm:$A)),
(v4f32 (LXVWSX ForceXForm:$A))>;
def : Pat<(v4i32 (PPCldsplat ForceXForm:$A)),
(v4i32 (LXVWSX ForceXForm:$A))>;
+def : Pat<(v8i16 (PPCldsplat ForceXForm:$A)),
+ (v8i16 (VSPLTHs 3, (LXSIHZX ForceXForm:$A)))>;
+def : Pat<(v16i8 (PPCldsplat ForceXForm:$A)),
+ (v16i8 (VSPLTBs 7, (LXSIBZX ForceXForm:$A)))>;
} // HasVSX, HasP9Vector
// Any Power9 VSX subtarget with equivalent length but better Power10 VSX
@@ -4138,12 +4181,52 @@ def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, DblToInt.B, 0)),
+ (v4i32 (XXINSERTW v4i32:$A,
+ (SUBREG_TO_REG (i64 1),
+ (XSCVDPSXWS f64:$B), sub_64),
+ 0))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, DblToUInt.B, 0)),
+ (v4i32 (XXINSERTW v4i32:$A,
+ (SUBREG_TO_REG (i64 1),
+ (XSCVDPUXWS f64:$B), sub_64),
+ 0))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 4))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, DblToInt.B, 1)),
+ (v4i32 (XXINSERTW v4i32:$A,
+ (SUBREG_TO_REG (i64 1),
+ (XSCVDPSXWS f64:$B), sub_64),
+ 4))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, DblToUInt.B, 1)),
+ (v4i32 (XXINSERTW v4i32:$A,
+ (SUBREG_TO_REG (i64 1),
+ (XSCVDPUXWS f64:$B), sub_64),
+ 4))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 2)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 8))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, DblToInt.B, 2)),
+ (v4i32 (XXINSERTW v4i32:$A,
+ (SUBREG_TO_REG (i64 1),
+ (XSCVDPSXWS f64:$B), sub_64),
+ 8))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, DblToUInt.B, 2)),
+ (v4i32 (XXINSERTW v4i32:$A,
+ (SUBREG_TO_REG (i64 1),
+ (XSCVDPUXWS f64:$B), sub_64),
+ 8))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 3)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, DblToInt.B, 3)),
+ (v4i32 (XXINSERTW v4i32:$A,
+ (SUBREG_TO_REG (i64 1),
+ (XSCVDPSXWS f64:$B), sub_64),
+ 12))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, DblToUInt.B, 3)),
+ (v4i32 (XXINSERTW v4i32:$A,
+ (SUBREG_TO_REG (i64 1),
+ (XSCVDPUXWS f64:$B), sub_64),
+ 12))>;
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 0)),
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>;
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 1)),
@@ -4382,12 +4465,52 @@ def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
(f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, DblToInt.B, 0)),
+ (v4i32 (XXINSERTW v4i32:$A,
+ (SUBREG_TO_REG (i64 1),
+ (XSCVDPSXWS f64:$B), sub_64),
+ 12))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, DblToUInt.B, 0)),
+ (v4i32 (XXINSERTW v4i32:$A,
+ (SUBREG_TO_REG (i64 1),
+ (XSCVDPUXWS f64:$B), sub_64),
+ 12))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 8))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, DblToInt.B, 1)),
+ (v4i32 (XXINSERTW v4i32:$A,
+ (SUBREG_TO_REG (i64 1),
+ (XSCVDPSXWS f64:$B), sub_64),
+ 8))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, DblToUInt.B, 1)),
+ (v4i32 (XXINSERTW v4i32:$A,
+ (SUBREG_TO_REG (i64 1),
+ (XSCVDPUXWS f64:$B), sub_64),
+ 8))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 2)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 4))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, DblToInt.B, 2)),
+ (v4i32 (XXINSERTW v4i32:$A,
+ (SUBREG_TO_REG (i64 1),
+ (XSCVDPSXWS f64:$B), sub_64),
+ 4))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, DblToUInt.B, 2)),
+ (v4i32 (XXINSERTW v4i32:$A,
+ (SUBREG_TO_REG (i64 1),
+ (XSCVDPUXWS f64:$B), sub_64),
+ 4))>;
def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 3)),
(v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, DblToInt.B, 3)),
+ (v4i32 (XXINSERTW v4i32:$A,
+ (SUBREG_TO_REG (i64 1),
+ (XSCVDPSXWS f64:$B), sub_64),
+ 0))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, DblToUInt.B, 3)),
+ (v4i32 (XXINSERTW v4i32:$A,
+ (SUBREG_TO_REG (i64 1),
+ (XSCVDPUXWS f64:$B), sub_64),
+ 0))>;
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 0)),
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 1)),
diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
index 1d2b1ed3f626..7f63827afbd6 100644
--- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
+++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
@@ -39,6 +39,40 @@
// T *p = array[-1];
// for (int i = 0; i < n; ++i)
// *++p = c;
+//
+// 3: Common multiple chains of loads/stores with the same offsets in the loop,
+// so that the offsets can be reused and the register pressure in the loop is
+// reduced. This transformation can also increase loop ILP, since each chain
+// now uses its own loop induction add/addi, at the cost of more add/addi
+// instructions in the loop.
+//
+// Generically, this means transforming loops like this:
+//
+// char *p;
+// A1 = p + base1
+// A2 = p + base1 + offset
+// B1 = p + base2
+// B2 = p + base2 + offset
+//
+// for (int i = 0; i < n; i++) {
+// unsigned long x1 = *(unsigned long *)(A1 + i);
+// unsigned long x2 = *(unsigned long *)(A2 + i);
+// unsigned long x3 = *(unsigned long *)(B1 + i);
+// unsigned long x4 = *(unsigned long *)(B2 + i);
+// }
+//
+// to look like this:
+//
+// A1_new = p + base1 // chain 1
+// B1_new = p + base2 // chain 2, now inside the loop, common offset is
+// // reused.
+//
+// for (long long i = 0; i < n; i+=count) {
+// unsigned long x1 = *(unsigned long *)(A1_new + i);
+// unsigned long x2 = *(unsigned long *)((A1_new + i) + offset);
+// unsigned long x3 = *(unsigned long *)(B1_new + i);
+// unsigned long x4 = *(unsigned long *)((B1_new + i) + offset);
+// }
//===----------------------------------------------------------------------===//
#include "PPC.h"
@@ -81,15 +115,25 @@
using namespace llvm;
-static cl::opt<unsigned> MaxVarsPrep("ppc-formprep-max-vars",
- cl::Hidden, cl::init(24),
- cl::desc("Potential common base number threshold per function for PPC loop "
- "prep"));
+static cl::opt<unsigned>
+ MaxVarsPrep("ppc-formprep-max-vars", cl::Hidden, cl::init(24),
+ cl::ZeroOrMore,
+ cl::desc("Potential common base number threshold per function "
+ "for PPC loop prep"));
static cl::opt<bool> PreferUpdateForm("ppc-formprep-prefer-update",
cl::init(true), cl::Hidden,
cl::desc("prefer update form when ds form is also a update form"));
+static cl::opt<bool> EnableUpdateFormForNonConstInc(
+ "ppc-formprep-update-nonconst-inc", cl::init(false), cl::Hidden,
+ cl::desc("prepare update form when the load/store increment is a loop "
+ "invariant non-const value."));
+
+static cl::opt<bool> EnableChainCommoning(
+ "ppc-formprep-chain-commoning", cl::init(false), cl::Hidden,
+ cl::desc("Enable chain commoning in PPC loop prepare pass."));
+
// Sum of following 3 per loop thresholds for all loops can not be larger
// than MaxVarsPrep.
// Now the thresholds for each kind of prep are experimental values on Power9.
@@ -106,6 +150,16 @@ static cl::opt<unsigned> MaxVarsDQForm("ppc-dqprep-max-vars",
cl::Hidden, cl::init(8),
cl::desc("Potential PHI threshold per loop for PPC loop prep of DQ form"));
+// Chain commoning reduces register pressure, so the number of PHI nodes is not
+// taken into account here.
+// However, chain commoning increases the number of add/addi instructions in
+// the loop and also increases loop ILP. The maximum number of chains should
+// match the hardware IssueWidth, because there is no ILP benefit once the
+// number of parallel chains exceeds the IssueWidth. Assuming 2 chains per
+// bucket, there would be at most 4 buckets on P9 (IssueWidth is 8).
+static cl::opt<unsigned> MaxVarsChainCommon(
+ "ppc-chaincommon-max-vars", cl::Hidden, cl::init(4),
+ cl::desc("Bucket number per loop for PPC loop chain common"));
// It would not be profitable if the common base has only one load/store; ISEL
// should already be able to choose best load/store form based on offset for
@@ -116,35 +170,54 @@ static cl::opt<unsigned> DispFormPrepMinThreshold("ppc-dispprep-min-threshold",
cl::desc("Minimal common base load/store instructions triggering DS/DQ form "
"preparation"));
+static cl::opt<unsigned> ChainCommonPrepMinThreshold(
+ "ppc-chaincommon-min-threshold", cl::Hidden, cl::init(4),
+ cl::desc("Minimal common base load/store instructions triggering chain "
+ "commoning preparation. Must be not smaller than 4"));
+
STATISTIC(PHINodeAlreadyExistsUpdate, "PHI node already in pre-increment form");
STATISTIC(PHINodeAlreadyExistsDS, "PHI node already in DS form");
STATISTIC(PHINodeAlreadyExistsDQ, "PHI node already in DQ form");
STATISTIC(DSFormChainRewritten, "Num of DS form chain rewritten");
STATISTIC(DQFormChainRewritten, "Num of DQ form chain rewritten");
STATISTIC(UpdFormChainRewritten, "Num of update form chain rewritten");
+STATISTIC(ChainCommoningRewritten, "Num of commoning chains");
namespace {
struct BucketElement {
- BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {}
+ BucketElement(const SCEV *O, Instruction *I) : Offset(O), Instr(I) {}
BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {}
- const SCEVConstant *Offset;
+ const SCEV *Offset;
Instruction *Instr;
};
struct Bucket {
- Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B),
- Elements(1, BucketElement(I)) {}
+ Bucket(const SCEV *B, Instruction *I)
+ : BaseSCEV(B), Elements(1, BucketElement(I)) {
+ ChainSize = 0;
+ }
+ // The base of the whole bucket.
const SCEV *BaseSCEV;
+
+ // All elements in the bucket. In the bucket, the element with the BaseSCEV
+ // has no offset and all other elements are stored as offsets to the
+ // BaseSCEV.
SmallVector<BucketElement, 16> Elements;
+
+ // The size of each potential chain. This is used for chain commoning only.
+ unsigned ChainSize;
+
+ // The base for each potential chain. This is used for chain commoning only.
+ SmallVector<BucketElement, 16> ChainBases;
};
// "UpdateForm" is not a real PPC instruction form, it stands for dform
// load/store with update like ldu/stdu, or Prefetch intrinsic.
// For DS form instructions, their displacements must be multiple of 4.
// For DQ form instructions, their displacements must be multiple of 16.
- enum InstrForm { UpdateForm = 1, DSForm = 4, DQForm = 16 };
+ enum PrepForm { UpdateForm = 1, DSForm = 4, DQForm = 16, ChainCommoning };
class PPCLoopInstrFormPrep : public FunctionPass {
public:
@@ -169,11 +242,12 @@ namespace {
private:
PPCTargetMachine *TM = nullptr;
- const PPCSubtarget *ST;
+ const PPCSubtarget *ST;
DominatorTree *DT;
LoopInfo *LI;
ScalarEvolution *SE;
bool PreserveLCSSA;
+ bool HasCandidateForPrepare;
/// Successful preparation number for Update/DS/DQ form in all inner most
/// loops. One successful preparation will put one common base out of loop,
@@ -184,22 +258,39 @@ namespace {
bool runOnLoop(Loop *L);
/// Check if required PHI node is already exist in Loop \p L.
- bool alreadyPrepared(Loop *L, Instruction* MemI,
+ bool alreadyPrepared(Loop *L, Instruction *MemI,
const SCEV *BasePtrStartSCEV,
- const SCEVConstant *BasePtrIncSCEV,
- InstrForm Form);
+ const SCEV *BasePtrIncSCEV, PrepForm Form);
+
+ /// Get the value which defines the increment SCEV \p BasePtrIncSCEV.
+ Value *getNodeForInc(Loop *L, Instruction *MemI,
+ const SCEV *BasePtrIncSCEV);
+
+ /// Common chains to reuse offsets for a loop to reduce register pressure.
+ bool chainCommoning(Loop *L, SmallVector<Bucket, 16> &Buckets);
+
+ /// Find out the potential commoning chains and their bases.
+ bool prepareBasesForCommoningChains(Bucket &BucketChain);
+
+ /// Rewrite load/store according to the common chains.
+ bool
+ rewriteLoadStoresForCommoningChains(Loop *L, Bucket &Bucket,
+ SmallSet<BasicBlock *, 16> &BBChanged);
/// Collect condition matched(\p isValidCandidate() returns true)
/// candidates in Loop \p L.
SmallVector<Bucket, 16> collectCandidates(
Loop *L,
- std::function<bool(const Instruction *, const Value *, const Type *)>
+ std::function<bool(const Instruction *, Value *, const Type *)>
isValidCandidate,
+ std::function<bool(const SCEV *)> isValidDiff,
unsigned MaxCandidateNum);
- /// Add a candidate to candidates \p Buckets.
+ /// Add a candidate to \p Buckets if the diff between the candidate and one
+ /// base in \p Buckets satisfies \p isValidDiff.
void addOneCandidate(Instruction *MemI, const SCEV *LSCEV,
SmallVector<Bucket, 16> &Buckets,
+ std::function<bool(const SCEV *)> isValidDiff,
unsigned MaxCandidateNum);
/// Prepare all candidates in \p Buckets for update form.
@@ -207,8 +298,7 @@ namespace {
/// Prepare all candidates in \p Buckets for displacement form, now for
/// ds/dq.
- bool dispFormPrep(Loop *L, SmallVector<Bucket, 16> &Buckets,
- InstrForm Form);
+ bool dispFormPrep(Loop *L, SmallVector<Bucket, 16> &Buckets, PrepForm Form);
/// Prepare for one chain \p BucketChain, find the best base element and
/// update all other elements in \p BucketChain accordingly.
@@ -216,8 +306,7 @@ namespace {
/// If success, best base element must be stored as the first element of
/// \p BucketChain.
/// Return false if no base element found, otherwise return true.
- bool prepareBaseForDispFormChain(Bucket &BucketChain,
- InstrForm Form);
+ bool prepareBaseForDispFormChain(Bucket &BucketChain, PrepForm Form);
/// Prepare for one chain \p BucketChain, find the best base element and
/// update all other elements in \p BucketChain accordingly.
@@ -230,7 +319,20 @@ namespace {
/// preparation.
bool rewriteLoadStores(Loop *L, Bucket &BucketChain,
SmallSet<BasicBlock *, 16> &BBChanged,
- InstrForm Form);
+ PrepForm Form);
+
+ /// Rewrite for the base load/store of a chain.
+ std::pair<Instruction *, Instruction *>
+ rewriteForBase(Loop *L, const SCEVAddRecExpr *BasePtrSCEV,
+ Instruction *BaseMemI, bool CanPreInc, PrepForm Form,
+ SCEVExpander &SCEVE, SmallPtrSet<Value *, 16> &DeletedPtrs);
+
+ /// Rewrite for the other load/stores of a chain according to the new \p
+ /// Base.
+ Instruction *
+ rewriteForBucketElement(std::pair<Instruction *, Instruction *> Base,
+ const BucketElement &Element, Value *OffToBase,
+ SmallPtrSet<Value *, 16> &DeletedPtrs);
};
} // end anonymous namespace
@@ -266,23 +368,35 @@ static std::string getInstrName(const Value *I, StringRef Suffix) {
if (I->hasName())
return (I->getName() + Suffix).str();
else
- return "";
+ return "";
}
-static Value *GetPointerOperand(Value *MemI) {
+static Value *getPointerOperandAndType(Value *MemI,
+ Type **PtrElementType = nullptr) {
+
+ Value *PtrValue = nullptr;
+ Type *PointerElementType = nullptr;
+
if (LoadInst *LMemI = dyn_cast<LoadInst>(MemI)) {
- return LMemI->getPointerOperand();
+ PtrValue = LMemI->getPointerOperand();
+ PointerElementType = LMemI->getType();
} else if (StoreInst *SMemI = dyn_cast<StoreInst>(MemI)) {
- return SMemI->getPointerOperand();
+ PtrValue = SMemI->getPointerOperand();
+ PointerElementType = SMemI->getValueOperand()->getType();
} else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(MemI)) {
+ PointerElementType = Type::getInt8Ty(MemI->getContext());
if (IMemI->getIntrinsicID() == Intrinsic::prefetch ||
- IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp)
- return IMemI->getArgOperand(0);
- if (IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp)
- return IMemI->getArgOperand(1);
+ IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) {
+ PtrValue = IMemI->getArgOperand(0);
+ } else if (IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp) {
+ PtrValue = IMemI->getArgOperand(1);
+ }
}
+ // Get the element type if PtrElementType is not null.
+ if (PtrElementType)
+ *PtrElementType = PointerElementType;
- return nullptr;
+ return PtrValue;
}
bool PPCLoopInstrFormPrep::runOnFunction(Function &F) {
@@ -306,58 +420,460 @@ bool PPCLoopInstrFormPrep::runOnFunction(Function &F) {
return MadeChange;
}
-void PPCLoopInstrFormPrep::addOneCandidate(Instruction *MemI, const SCEV *LSCEV,
- SmallVector<Bucket, 16> &Buckets,
- unsigned MaxCandidateNum) {
- assert((MemI && GetPointerOperand(MemI)) &&
+// Finding the minimal (chain_number + reusable_offset_number) is a complicated
+// algorithmic problem.
+// For now, the algorithm used here only handles the common pattern produced by
+// manually unrolled loops.
+// FIXME: use a more powerful algorithm to find minimal sum of chain_number and
+// reusable_offset_number for one base with multiple offsets.
+bool PPCLoopInstrFormPrep::prepareBasesForCommoningChains(Bucket &CBucket) {
+ // The minimal size for profitable chain commoning:
+ // A1 = base + offset1
+ // A2 = base + offset2 (offset2 - offset1 = X)
+ // A3 = base + offset3
+ // A4 = base + offset4 (offset4 - offset3 = X)
+ // ======>
+ // base1 = base + offset1
+ // base2 = base + offset3
+ // A1 = base1
+ // A2 = base1 + X
+ // A3 = base2
+ // A4 = base2 + X
+ //
+ // The benefit comes from the reuse of offset 'X'.
+
+ assert(ChainCommonPrepMinThreshold >= 4 &&
+ "Thredhold can not be smaller than 4!\n");
+ if (CBucket.Elements.size() < ChainCommonPrepMinThreshold)
+ return false;
+
+ // We simply take FirstOffset, the offset between element 1 and element 0, as
+ // the reusable offset for every chain.
+ const SCEV *FirstOffset = CBucket.Elements[1].Offset;
+
+ // Figure out how many times FirstOffset is reused across the whole bucket.
+ // For a successful chain commoning candidate, the offset difference between
+ // element 1 and element 0 of each chain must also be FirstOffset.
+ unsigned FirstOffsetReusedCount = 1;
+
+ // Figure out how many times FirstOffset is reused within the first chain.
+ // The chain number is FirstOffsetReusedCount / FirstOffsetReusedCountInFirstChain.
+ unsigned FirstOffsetReusedCountInFirstChain = 1;
+
+ unsigned EleNum = CBucket.Elements.size();
+ bool SawChainSeparater = false;
+ for (unsigned j = 2; j != EleNum; ++j) {
+ if (SE->getMinusSCEV(CBucket.Elements[j].Offset,
+ CBucket.Elements[j - 1].Offset) == FirstOffset) {
+ if (!SawChainSeparater)
+ FirstOffsetReusedCountInFirstChain++;
+ FirstOffsetReusedCount++;
+ } else
+ // For now, if we meet any offset which is not FirstOffset, we assume we
+ // have found a new chain.
+ // This makes us miss some opportunities.
+ // For example, we can common:
+ //
+ // {OffsetA, OffsetA, OffsetB, OffsetA, OffsetA, OffsetB}
+ //
+ // as two chains:
+ // {{OffsetA, OffsetA, OffsetB}, {OffsetA, OffsetA, OffsetB}}
+ // FirstOffsetReusedCount = 4; FirstOffsetReusedCountInFirstChain = 2
+ //
+ // But we fail to common:
+ //
+ // {OffsetA, OffsetB, OffsetA, OffsetA, OffsetB, OffsetA}
+ // FirstOffsetReusedCount = 4; FirstOffsetReusedCountInFirstChain = 1
+
+ SawChainSeparater = true;
+ }
+
+ // FirstOffset is not reused, skip this bucket.
+ if (FirstOffsetReusedCount == 1)
+ return false;
+
+ unsigned ChainNum =
+ FirstOffsetReusedCount / FirstOffsetReusedCountInFirstChain;
+
+ // All elements are increased by FirstOffset.
+ // The number of chains should be sqrt(EleNum).
+ if (!SawChainSeparater)
+ ChainNum = (unsigned)sqrt((double)EleNum);
+
+ CBucket.ChainSize = (unsigned)(EleNum / ChainNum);
+
+ // If this is not a perfect chain (e.g., not all elements can be put into
+ // commoning chains), skip it for now.
+ if (CBucket.ChainSize * ChainNum != EleNum)
+ return false;
+
+ if (SawChainSeparater) {
+ // Check that the offset seqs are the same for all chains.
+ for (unsigned i = 1; i < CBucket.ChainSize; i++)
+ for (unsigned j = 1; j < ChainNum; j++)
+ if (CBucket.Elements[i].Offset !=
+ SE->getMinusSCEV(CBucket.Elements[i + j * CBucket.ChainSize].Offset,
+ CBucket.Elements[j * CBucket.ChainSize].Offset))
+ return false;
+ }
+
+ for (unsigned i = 0; i < ChainNum; i++)
+ CBucket.ChainBases.push_back(CBucket.Elements[i * CBucket.ChainSize]);
+
+ LLVM_DEBUG(dbgs() << "Bucket has " << ChainNum << " chains.\n");
+
+ return true;
+}
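A self-contained sketch of the chain-splitting bookkeeping above, with plain integers standing in for SCEV offsets; the function name, the integer simplification, and the omission of the final per-chain offset-sequence check are the editor's assumptions rather than part of the patch. It reproduces ChainNum = 2 and ChainSize = 2 for the A1..A4 bucket used in the profitability comment:

#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>

// Offsets are relative to the bucket base; element 0 is the base itself (0).
// Returns {ChainNum, ChainSize}, or {0, 0} if the bucket does not split into
// perfect chains. Mirrors prepareBasesForCommoningChains, minus the final
// check that every chain repeats the same offset sequence.
static std::pair<unsigned, unsigned>
splitIntoChains(const std::vector<long> &Off) {
  unsigned EleNum = Off.size();
  if (EleNum < 4) // ChainCommonPrepMinThreshold
    return {0, 0};
  long FirstOffset = Off[1] - Off[0];
  unsigned FirstOffsetReusedCount = 1;
  unsigned FirstOffsetReusedCountInFirstChain = 1;
  bool SawChainSeparator = false;
  for (unsigned j = 2; j != EleNum; ++j) {
    if (Off[j] - Off[j - 1] == FirstOffset) {
      if (!SawChainSeparator)
        ++FirstOffsetReusedCountInFirstChain;
      ++FirstOffsetReusedCount;
    } else
      SawChainSeparator = true;
  }
  if (FirstOffsetReusedCount == 1)
    return {0, 0};
  unsigned ChainNum =
      FirstOffsetReusedCount / FirstOffsetReusedCountInFirstChain;
  if (!SawChainSeparator)
    ChainNum = (unsigned)std::sqrt((double)EleNum);
  unsigned ChainSize = EleNum / ChainNum;
  if (ChainSize * ChainNum != EleNum)
    return {0, 0};
  return {ChainNum, ChainSize};
}

int main() {
  // {A1, A2, A3, A4} = {base, base+16, base+1024, base+1040}: one reusable
  // offset (16) and two chains of two elements each.
  std::pair<unsigned, unsigned> R = splitIntoChains({0, 16, 1024, 1040});
  std::printf("ChainNum=%u ChainSize=%u\n", R.first, R.second); // 2 and 2
  return 0;
}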
+
+bool PPCLoopInstrFormPrep::chainCommoning(Loop *L,
+ SmallVector<Bucket, 16> &Buckets) {
+ bool MadeChange = false;
+
+ if (Buckets.empty())
+ return MadeChange;
+
+ SmallSet<BasicBlock *, 16> BBChanged;
+
+ for (auto &Bucket : Buckets) {
+ if (prepareBasesForCommoningChains(Bucket))
+ MadeChange |= rewriteLoadStoresForCommoningChains(L, Bucket, BBChanged);
+ }
+
+ if (MadeChange)
+ for (auto *BB : BBChanged)
+ DeleteDeadPHIs(BB);
+ return MadeChange;
+}
+
+bool PPCLoopInstrFormPrep::rewriteLoadStoresForCommoningChains(
+ Loop *L, Bucket &Bucket, SmallSet<BasicBlock *, 16> &BBChanged) {
+ bool MadeChange = false;
+
+ assert(Bucket.Elements.size() ==
+ Bucket.ChainBases.size() * Bucket.ChainSize &&
+ "invalid bucket for chain commoning!\n");
+ SmallPtrSet<Value *, 16> DeletedPtrs;
+
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *LoopPredecessor = L->getLoopPredecessor();
+
+ SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(),
+ "loopprepare-chaincommon");
+
+ for (unsigned ChainIdx = 0; ChainIdx < Bucket.ChainBases.size(); ++ChainIdx) {
+ unsigned BaseElemIdx = Bucket.ChainSize * ChainIdx;
+ const SCEV *BaseSCEV =
+ ChainIdx ? SE->getAddExpr(Bucket.BaseSCEV,
+ Bucket.Elements[BaseElemIdx].Offset)
+ : Bucket.BaseSCEV;
+ const SCEVAddRecExpr *BasePtrSCEV = cast<SCEVAddRecExpr>(BaseSCEV);
+
+ // Make sure the base is able to expand.
+ if (!isSafeToExpand(BasePtrSCEV->getStart(), *SE))
+ return MadeChange;
+
+ assert(BasePtrSCEV->isAffine() &&
+ "Invalid SCEV type for the base ptr for a candidate chain!\n");
+
+ std::pair<Instruction *, Instruction *> Base = rewriteForBase(
+ L, BasePtrSCEV, Bucket.Elements[BaseElemIdx].Instr,
+ false /* CanPreInc */, ChainCommoning, SCEVE, DeletedPtrs);
+
+ if (!Base.first || !Base.second)
+ return MadeChange;
+
+ // Keep track of the replacement pointer values we've inserted so that we
+ // don't generate more pointer values than necessary.
+ SmallPtrSet<Value *, 16> NewPtrs;
+ NewPtrs.insert(Base.first);
+
+ for (unsigned Idx = BaseElemIdx + 1; Idx < BaseElemIdx + Bucket.ChainSize;
+ ++Idx) {
+ BucketElement &I = Bucket.Elements[Idx];
+ Value *Ptr = getPointerOperandAndType(I.Instr);
+ assert(Ptr && "No pointer operand");
+ if (NewPtrs.count(Ptr))
+ continue;
+
+ const SCEV *OffsetSCEV =
+ BaseElemIdx ? SE->getMinusSCEV(Bucket.Elements[Idx].Offset,
+ Bucket.Elements[BaseElemIdx].Offset)
+ : Bucket.Elements[Idx].Offset;
+
+ // Make sure the offset can be expanded. This only needs to be checked once,
+ // as the offsets are reused between different chains.
+ if (!BaseElemIdx)
+ if (!isSafeToExpand(OffsetSCEV, *SE))
+ return false;
+
+ Value *OffsetValue = SCEVE.expandCodeFor(
+ OffsetSCEV, OffsetSCEV->getType(), LoopPredecessor->getTerminator());
+
+ Instruction *NewPtr = rewriteForBucketElement(Base, Bucket.Elements[Idx],
+ OffsetValue, DeletedPtrs);
+
+ assert(NewPtr && "Wrong rewrite!\n");
+ NewPtrs.insert(NewPtr);
+ }
+
+ ++ChainCommoningRewritten;
+ }
+
+ // Clear the rewriter cache, because values that are in the rewriter's cache
+ // can be deleted below, causing the AssertingVH in the cache to trigger.
+ SCEVE.clear();
+
+ for (auto *Ptr : DeletedPtrs) {
+ if (Instruction *IDel = dyn_cast<Instruction>(Ptr))
+ BBChanged.insert(IDel->getParent());
+ RecursivelyDeleteTriviallyDeadInstructions(Ptr);
+ }
+
+ MadeChange = true;
+ return MadeChange;
+}
+
+// Rewrite the new base according to BasePtrSCEV.
+// bb.loop.preheader:
+// %newstart = ...
+// bb.loop.body:
+// %phinode = phi [ %newstart, %bb.loop.preheader ], [ %add, %bb.loop.body ]
+// ...
+// %add = getelementptr %phinode, %inc
+//
+// The first returned instruction is %phinode (or a type cast of %phinode); the
+// caller needs this value to rewrite the other load/stores in the same chain.
+// The second returned instruction is %add; the caller needs this value to
+// rewrite the other load/stores in the same chain.
+std::pair<Instruction *, Instruction *>
+PPCLoopInstrFormPrep::rewriteForBase(Loop *L, const SCEVAddRecExpr *BasePtrSCEV,
+ Instruction *BaseMemI, bool CanPreInc,
+ PrepForm Form, SCEVExpander &SCEVE,
+ SmallPtrSet<Value *, 16> &DeletedPtrs) {
+
+ LLVM_DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n");
+
+ assert(BasePtrSCEV->getLoop() == L && "AddRec for the wrong loop?");
+
+ Value *BasePtr = getPointerOperandAndType(BaseMemI);
+ assert(BasePtr && "No pointer operand");
+
+ Type *I8Ty = Type::getInt8Ty(BaseMemI->getParent()->getContext());
+ Type *I8PtrTy =
+ Type::getInt8PtrTy(BaseMemI->getParent()->getContext(),
+ BasePtr->getType()->getPointerAddressSpace());
+
+ bool IsConstantInc = false;
+ const SCEV *BasePtrIncSCEV = BasePtrSCEV->getStepRecurrence(*SE);
+ Value *IncNode = getNodeForInc(L, BaseMemI, BasePtrIncSCEV);
+
+ const SCEVConstant *BasePtrIncConstantSCEV =
+ dyn_cast<SCEVConstant>(BasePtrIncSCEV);
+ if (BasePtrIncConstantSCEV)
+ IsConstantInc = true;
+
+ // No valid representation for the increment.
+ if (!IncNode) {
+ LLVM_DEBUG(dbgs() << "Loop Increasement can not be represented!\n");
+ return std::make_pair(nullptr, nullptr);
+ }
+
+ if (Form == UpdateForm && !IsConstantInc && !EnableUpdateFormForNonConstInc) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Update form prepare for non-const increment is not enabled!\n");
+ return std::make_pair(nullptr, nullptr);
+ }
+
+ const SCEV *BasePtrStartSCEV = nullptr;
+ if (CanPreInc) {
+ assert(SE->isLoopInvariant(BasePtrIncSCEV, L) &&
+ "Increment is not loop invariant!\n");
+ BasePtrStartSCEV = SE->getMinusSCEV(BasePtrSCEV->getStart(),
+ IsConstantInc ? BasePtrIncConstantSCEV
+ : BasePtrIncSCEV);
+ } else
+ BasePtrStartSCEV = BasePtrSCEV->getStart();
+
+ if (alreadyPrepared(L, BaseMemI, BasePtrStartSCEV, BasePtrIncSCEV, Form)) {
+ LLVM_DEBUG(dbgs() << "Instruction form is already prepared!\n");
+ return std::make_pair(nullptr, nullptr);
+ }
+
+ LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n");
+
+ BasicBlock *Header = L->getHeader();
+ unsigned HeaderLoopPredCount = pred_size(Header);
+ BasicBlock *LoopPredecessor = L->getLoopPredecessor();
+
+ PHINode *NewPHI = PHINode::Create(I8PtrTy, HeaderLoopPredCount,
+ getInstrName(BaseMemI, PHINodeNameSuffix),
+ Header->getFirstNonPHI());
+
+ Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy,
+ LoopPredecessor->getTerminator());
+
+ // Note that LoopPredecessor might occur in the predecessor list multiple
+ // times, and we need to add it the right number of times.
+ for (auto PI : predecessors(Header)) {
+ if (PI != LoopPredecessor)
+ continue;
+
+ NewPHI->addIncoming(BasePtrStart, LoopPredecessor);
+ }
+
+ Instruction *PtrInc = nullptr;
+ Instruction *NewBasePtr = nullptr;
+ if (CanPreInc) {
+ Instruction *InsPoint = &*Header->getFirstInsertionPt();
+ PtrInc = GetElementPtrInst::Create(
+ I8Ty, NewPHI, IncNode, getInstrName(BaseMemI, GEPNodeIncNameSuffix),
+ InsPoint);
+ cast<GetElementPtrInst>(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr));
+ for (auto PI : predecessors(Header)) {
+ if (PI == LoopPredecessor)
+ continue;
+
+ NewPHI->addIncoming(PtrInc, PI);
+ }
+ if (PtrInc->getType() != BasePtr->getType())
+ NewBasePtr =
+ new BitCastInst(PtrInc, BasePtr->getType(),
+ getInstrName(PtrInc, CastNodeNameSuffix), InsPoint);
+ else
+ NewBasePtr = PtrInc;
+ } else {
+ // Note that LoopPredecessor might occur in the predecessor list multiple
+ // times, and we need to make sure no more incoming value for them in PHI.
+ for (auto PI : predecessors(Header)) {
+ if (PI == LoopPredecessor)
+ continue;
+
+ // For the latch predecessor, we need to insert a GEP just before the
+ // terminator to increase the address.
+ BasicBlock *BB = PI;
+ Instruction *InsPoint = BB->getTerminator();
+ PtrInc = GetElementPtrInst::Create(
+ I8Ty, NewPHI, IncNode, getInstrName(BaseMemI, GEPNodeIncNameSuffix),
+ InsPoint);
+ cast<GetElementPtrInst>(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr));
+
+ NewPHI->addIncoming(PtrInc, PI);
+ }
+ PtrInc = NewPHI;
+ if (NewPHI->getType() != BasePtr->getType())
+ NewBasePtr = new BitCastInst(NewPHI, BasePtr->getType(),
+ getInstrName(NewPHI, CastNodeNameSuffix),
+ &*Header->getFirstInsertionPt());
+ else
+ NewBasePtr = NewPHI;
+ }
+
+ BasePtr->replaceAllUsesWith(NewBasePtr);
+
+ DeletedPtrs.insert(BasePtr);
+
+ return std::make_pair(NewBasePtr, PtrInc);
+}
+
+Instruction *PPCLoopInstrFormPrep::rewriteForBucketElement(
+ std::pair<Instruction *, Instruction *> Base, const BucketElement &Element,
+ Value *OffToBase, SmallPtrSet<Value *, 16> &DeletedPtrs) {
+ Instruction *NewBasePtr = Base.first;
+ Instruction *PtrInc = Base.second;
+ assert((NewBasePtr && PtrInc) && "base does not exist!\n");
+
+ Type *I8Ty = Type::getInt8Ty(PtrInc->getParent()->getContext());
+
+ Value *Ptr = getPointerOperandAndType(Element.Instr);
+ assert(Ptr && "No pointer operand");
+
+ Instruction *RealNewPtr;
+ if (!Element.Offset ||
+ (isa<SCEVConstant>(Element.Offset) &&
+ cast<SCEVConstant>(Element.Offset)->getValue()->isZero())) {
+ RealNewPtr = NewBasePtr;
+ } else {
+ Instruction *PtrIP = dyn_cast<Instruction>(Ptr);
+ if (PtrIP && isa<Instruction>(NewBasePtr) &&
+ cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent())
+ PtrIP = nullptr;
+ else if (PtrIP && isa<PHINode>(PtrIP))
+ PtrIP = &*PtrIP->getParent()->getFirstInsertionPt();
+ else if (!PtrIP)
+ PtrIP = Element.Instr;
+
+ assert(OffToBase && "There should be an offset for non base element!\n");
+ GetElementPtrInst *NewPtr = GetElementPtrInst::Create(
+ I8Ty, PtrInc, OffToBase,
+ getInstrName(Element.Instr, GEPNodeOffNameSuffix), PtrIP);
+ if (!PtrIP)
+ NewPtr->insertAfter(cast<Instruction>(PtrInc));
+ NewPtr->setIsInBounds(IsPtrInBounds(Ptr));
+ RealNewPtr = NewPtr;
+ }
+
+ Instruction *ReplNewPtr;
+ if (Ptr->getType() != RealNewPtr->getType()) {
+ ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(),
+ getInstrName(Ptr, CastNodeNameSuffix));
+ ReplNewPtr->insertAfter(RealNewPtr);
+ } else
+ ReplNewPtr = RealNewPtr;
+
+ Ptr->replaceAllUsesWith(ReplNewPtr);
+ DeletedPtrs.insert(Ptr);
+
+ return ReplNewPtr;
+}
+
+void PPCLoopInstrFormPrep::addOneCandidate(
+ Instruction *MemI, const SCEV *LSCEV, SmallVector<Bucket, 16> &Buckets,
+ std::function<bool(const SCEV *)> isValidDiff, unsigned MaxCandidateNum) {
+ assert((MemI && getPointerOperandAndType(MemI)) &&
"Candidate should be a memory instruction.");
assert(LSCEV && "Invalid SCEV for Ptr value.");
+
bool FoundBucket = false;
for (auto &B : Buckets) {
+ if (cast<SCEVAddRecExpr>(B.BaseSCEV)->getStepRecurrence(*SE) !=
+ cast<SCEVAddRecExpr>(LSCEV)->getStepRecurrence(*SE))
+ continue;
const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV);
- if (const auto *CDiff = dyn_cast<SCEVConstant>(Diff)) {
- B.Elements.push_back(BucketElement(CDiff, MemI));
+ if (isValidDiff(Diff)) {
+ B.Elements.push_back(BucketElement(Diff, MemI));
FoundBucket = true;
break;
}
}
if (!FoundBucket) {
- if (Buckets.size() == MaxCandidateNum)
+ if (Buckets.size() == MaxCandidateNum) {
+ LLVM_DEBUG(dbgs() << "Can not prepare more chains, reach maximum limit "
+ << MaxCandidateNum << "\n");
return;
+ }
Buckets.push_back(Bucket(LSCEV, MemI));
}
}
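With the step-recurrence check added to addOneCandidate above, two accesses that share a base pointer but advance by different amounts per iteration no longer land in the same bucket. A hedged illustration; the function and names are hypothetical, not taken from the patch or its tests:

extern void use(unsigned long, unsigned long);

// p[i] steps by 8 bytes per iteration while p[2 * i] steps by 16, so their
// address AddRecs have different step recurrences and addOneCandidate keeps
// them in separate buckets; only same-stride accesses are grouped together.
void strides(unsigned long *p, long n) {
  for (long i = 0; i < n; ++i)
    use(p[i], p[2 * i]);
}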
SmallVector<Bucket, 16> PPCLoopInstrFormPrep::collectCandidates(
Loop *L,
- std::function<bool(const Instruction *, const Value *, const Type *)>
+ std::function<bool(const Instruction *, Value *, const Type *)>
isValidCandidate,
- unsigned MaxCandidateNum) {
+ std::function<bool(const SCEV *)> isValidDiff, unsigned MaxCandidateNum) {
SmallVector<Bucket, 16> Buckets;
+
for (const auto &BB : L->blocks())
for (auto &J : *BB) {
- Value *PtrValue;
- Type *PointerElementType;
-
- if (LoadInst *LMemI = dyn_cast<LoadInst>(&J)) {
- PtrValue = LMemI->getPointerOperand();
- PointerElementType = LMemI->getType();
- } else if (StoreInst *SMemI = dyn_cast<StoreInst>(&J)) {
- PtrValue = SMemI->getPointerOperand();
- PointerElementType = SMemI->getValueOperand()->getType();
- } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(&J)) {
- PointerElementType = Type::getInt8Ty(J.getContext());
- if (IMemI->getIntrinsicID() == Intrinsic::prefetch ||
- IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) {
- PtrValue = IMemI->getArgOperand(0);
- } else if (IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp) {
- PtrValue = IMemI->getArgOperand(1);
- } else continue;
- } else continue;
-
- unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
- if (PtrAddrSpace)
+ Value *PtrValue = nullptr;
+ Type *PointerElementType = nullptr;
+ PtrValue = getPointerOperandAndType(&J, &PointerElementType);
+
+ if (!PtrValue)
+ continue;
+
+ if (PtrValue->getType()->getPointerAddressSpace())
continue;
if (L->isLoopInvariant(PtrValue))
@@ -368,14 +884,17 @@ SmallVector<Bucket, 16> PPCLoopInstrFormPrep::collectCandidates(
if (!LARSCEV || LARSCEV->getLoop() != L)
continue;
+ // Mark that we have candidates for preparing.
+ HasCandidateForPrepare = true;
+
if (isValidCandidate(&J, PtrValue, PointerElementType))
- addOneCandidate(&J, LSCEV, Buckets, MaxCandidateNum);
+ addOneCandidate(&J, LSCEV, Buckets, isValidDiff, MaxCandidateNum);
}
return Buckets;
}
bool PPCLoopInstrFormPrep::prepareBaseForDispFormChain(Bucket &BucketChain,
- InstrForm Form) {
+ PrepForm Form) {
// RemainderOffsetInfo details:
// key: value of (Offset urem DispConstraint). For DSForm, it can
// be [0, 4).
@@ -388,8 +907,9 @@ bool PPCLoopInstrFormPrep::prepareBaseForDispFormChain(Bucket &BucketChain,
if (!BucketChain.Elements[j].Offset)
RemainderOffsetInfo[0] = std::make_pair(0, 1);
else {
- unsigned Remainder =
- BucketChain.Elements[j].Offset->getAPInt().urem(Form);
+ unsigned Remainder = cast<SCEVConstant>(BucketChain.Elements[j].Offset)
+ ->getAPInt()
+ .urem(Form);
if (RemainderOffsetInfo.find(Remainder) == RemainderOffsetInfo.end())
RemainderOffsetInfo[Remainder] = std::make_pair(j, 1);
else
@@ -404,13 +924,13 @@ bool PPCLoopInstrFormPrep::prepareBaseForDispFormChain(Bucket &BucketChain,
// contains following load/stores with different remainders:
// 1: 10 load/store whose remainder is 1;
// 2: 9 load/store whose remainder is 2;
- // 3: 1 for remainder 3 and 0 for remainder 0;
+ // 3: 1 for remainder 3 and 0 for remainder 0;
// Now we will choose the first load/store whose remainder is 1 as base and
// adjust all other load/stores according to new base, so we will get 10 DS
// form and 10 X form.
// But we should be more clever, for this case we could use two bases, one for
- // remainder 1 and the other for remainder 2, thus we could get 19 DS form and 1
- // X form.
+ // remainder 1 and the other for remainder 2, thus we could get 19 DS form and
+ // 1 X form.
unsigned MaxCountRemainder = 0;
for (unsigned j = 0; j < (unsigned)Form; j++)
if ((RemainderOffsetInfo.find(j) != RemainderOffsetInfo.end()) &&
@@ -471,7 +991,7 @@ bool PPCLoopInstrFormPrep::prepareBaseForUpdateFormChain(Bucket &BucketChain) {
// If our chosen element has no offset from the base pointer, there's
// nothing to do.
if (!BucketChain.Elements[j].Offset ||
- BucketChain.Elements[j].Offset->isZero())
+ cast<SCEVConstant>(BucketChain.Elements[j].Offset)->isZero())
break;
const SCEV *Offset = BucketChain.Elements[j].Offset;
@@ -489,191 +1009,76 @@ bool PPCLoopInstrFormPrep::prepareBaseForUpdateFormChain(Bucket &BucketChain) {
return true;
}
-bool PPCLoopInstrFormPrep::rewriteLoadStores(Loop *L, Bucket &BucketChain,
- SmallSet<BasicBlock *, 16> &BBChanged,
- InstrForm Form) {
+bool PPCLoopInstrFormPrep::rewriteLoadStores(
+ Loop *L, Bucket &BucketChain, SmallSet<BasicBlock *, 16> &BBChanged,
+ PrepForm Form) {
bool MadeChange = false;
+
const SCEVAddRecExpr *BasePtrSCEV =
cast<SCEVAddRecExpr>(BucketChain.BaseSCEV);
if (!BasePtrSCEV->isAffine())
return MadeChange;
- LLVM_DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n");
-
- assert(BasePtrSCEV->getLoop() == L && "AddRec for the wrong loop?");
-
- // The instruction corresponding to the Bucket's BaseSCEV must be the first
- // in the vector of elements.
- Instruction *MemI = BucketChain.Elements.begin()->Instr;
- Value *BasePtr = GetPointerOperand(MemI);
- assert(BasePtr && "No pointer operand");
-
- Type *I8Ty = Type::getInt8Ty(MemI->getParent()->getContext());
- Type *I8PtrTy = Type::getInt8PtrTy(MemI->getParent()->getContext(),
- BasePtr->getType()->getPointerAddressSpace());
-
- if (!SE->isLoopInvariant(BasePtrSCEV->getStart(), L))
+ if (!isSafeToExpand(BasePtrSCEV->getStart(), *SE))
return MadeChange;
- const SCEVConstant *BasePtrIncSCEV =
- dyn_cast<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE));
- if (!BasePtrIncSCEV)
- return MadeChange;
+ SmallPtrSet<Value *, 16> DeletedPtrs;
+
+ BasicBlock *Header = L->getHeader();
+ SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(),
+ "loopprepare-formrewrite");
// For some DS form load/store instructions, it can also be an update form,
- // if the stride is a multipler of 4. Use update form if prefer it.
+ // if the stride is constant and is a multiple of 4. Use the update form if it
+ // is preferred.
bool CanPreInc = (Form == UpdateForm ||
- ((Form == DSForm) && !BasePtrIncSCEV->getAPInt().urem(4) &&
+ ((Form == DSForm) &&
+ isa<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE)) &&
+ !cast<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE))
+ ->getAPInt()
+ .urem(4) &&
PreferUpdateForm));
- const SCEV *BasePtrStartSCEV = nullptr;
- if (CanPreInc)
- BasePtrStartSCEV =
- SE->getMinusSCEV(BasePtrSCEV->getStart(), BasePtrIncSCEV);
- else
- BasePtrStartSCEV = BasePtrSCEV->getStart();
- if (!isSafeToExpand(BasePtrStartSCEV, *SE))
- return MadeChange;
+ std::pair<Instruction *, Instruction *> Base =
+ rewriteForBase(L, BasePtrSCEV, BucketChain.Elements.begin()->Instr,
+ CanPreInc, Form, SCEVE, DeletedPtrs);
- if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV, Form))
+ if (!Base.first || !Base.second)
return MadeChange;
- LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n");
-
- BasicBlock *Header = L->getHeader();
- unsigned HeaderLoopPredCount = pred_size(Header);
- BasicBlock *LoopPredecessor = L->getLoopPredecessor();
-
- PHINode *NewPHI =
- PHINode::Create(I8PtrTy, HeaderLoopPredCount,
- getInstrName(MemI, PHINodeNameSuffix),
- Header->getFirstNonPHI());
-
- SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart");
- Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy,
- LoopPredecessor->getTerminator());
-
- // Note that LoopPredecessor might occur in the predecessor list multiple
- // times, and we need to add it the right number of times.
- for (auto PI : predecessors(Header)) {
- if (PI != LoopPredecessor)
- continue;
-
- NewPHI->addIncoming(BasePtrStart, LoopPredecessor);
- }
-
- Instruction *PtrInc = nullptr;
- Instruction *NewBasePtr = nullptr;
- if (CanPreInc) {
- Instruction *InsPoint = &*Header->getFirstInsertionPt();
- PtrInc = GetElementPtrInst::Create(
- I8Ty, NewPHI, BasePtrIncSCEV->getValue(),
- getInstrName(MemI, GEPNodeIncNameSuffix), InsPoint);
- cast<GetElementPtrInst>(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr));
- for (auto PI : predecessors(Header)) {
- if (PI == LoopPredecessor)
- continue;
-
- NewPHI->addIncoming(PtrInc, PI);
- }
- if (PtrInc->getType() != BasePtr->getType())
- NewBasePtr = new BitCastInst(
- PtrInc, BasePtr->getType(),
- getInstrName(PtrInc, CastNodeNameSuffix), InsPoint);
- else
- NewBasePtr = PtrInc;
- } else {
- // Note that LoopPredecessor might occur in the predecessor list multiple
- // times, and we need to make sure no more incoming value for them in PHI.
- for (auto PI : predecessors(Header)) {
- if (PI == LoopPredecessor)
- continue;
-
- // For the latch predecessor, we need to insert a GEP just before the
- // terminator to increase the address.
- BasicBlock *BB = PI;
- Instruction *InsPoint = BB->getTerminator();
- PtrInc = GetElementPtrInst::Create(
- I8Ty, NewPHI, BasePtrIncSCEV->getValue(),
- getInstrName(MemI, GEPNodeIncNameSuffix), InsPoint);
-
- cast<GetElementPtrInst>(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr));
-
- NewPHI->addIncoming(PtrInc, PI);
- }
- PtrInc = NewPHI;
- if (NewPHI->getType() != BasePtr->getType())
- NewBasePtr =
- new BitCastInst(NewPHI, BasePtr->getType(),
- getInstrName(NewPHI, CastNodeNameSuffix),
- &*Header->getFirstInsertionPt());
- else
- NewBasePtr = NewPHI;
- }
-
- // Clear the rewriter cache, because values that are in the rewriter's cache
- // can be deleted below, causing the AssertingVH in the cache to trigger.
- SCEVE.clear();
-
- if (Instruction *IDel = dyn_cast<Instruction>(BasePtr))
- BBChanged.insert(IDel->getParent());
- BasePtr->replaceAllUsesWith(NewBasePtr);
- RecursivelyDeleteTriviallyDeadInstructions(BasePtr);
-
// Keep track of the replacement pointer values we've inserted so that we
// don't generate more pointer values than necessary.
SmallPtrSet<Value *, 16> NewPtrs;
- NewPtrs.insert(NewBasePtr);
+ NewPtrs.insert(Base.first);
for (auto I = std::next(BucketChain.Elements.begin()),
IE = BucketChain.Elements.end(); I != IE; ++I) {
- Value *Ptr = GetPointerOperand(I->Instr);
+ Value *Ptr = getPointerOperandAndType(I->Instr);
assert(Ptr && "No pointer operand");
if (NewPtrs.count(Ptr))
continue;
- Instruction *RealNewPtr;
- if (!I->Offset || I->Offset->getValue()->isZero()) {
- RealNewPtr = NewBasePtr;
- } else {
- Instruction *PtrIP = dyn_cast<Instruction>(Ptr);
- if (PtrIP && isa<Instruction>(NewBasePtr) &&
- cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent())
- PtrIP = nullptr;
- else if (PtrIP && isa<PHINode>(PtrIP))
- PtrIP = &*PtrIP->getParent()->getFirstInsertionPt();
- else if (!PtrIP)
- PtrIP = I->Instr;
-
- GetElementPtrInst *NewPtr = GetElementPtrInst::Create(
- I8Ty, PtrInc, I->Offset->getValue(),
- getInstrName(I->Instr, GEPNodeOffNameSuffix), PtrIP);
- if (!PtrIP)
- NewPtr->insertAfter(cast<Instruction>(PtrInc));
- NewPtr->setIsInBounds(IsPtrInBounds(Ptr));
- RealNewPtr = NewPtr;
- }
+ Instruction *NewPtr = rewriteForBucketElement(
+ Base, *I,
+ I->Offset ? cast<SCEVConstant>(I->Offset)->getValue() : nullptr,
+ DeletedPtrs);
+ assert(NewPtr && "wrong rewrite!\n");
+ NewPtrs.insert(NewPtr);
+ }
+
+ // Clear the rewriter cache, because values that are in the rewriter's cache
+ // can be deleted below, causing the AssertingVH in the cache to trigger.
+ SCEVE.clear();
+ for (auto *Ptr : DeletedPtrs) {
if (Instruction *IDel = dyn_cast<Instruction>(Ptr))
BBChanged.insert(IDel->getParent());
-
- Instruction *ReplNewPtr;
- if (Ptr->getType() != RealNewPtr->getType()) {
- ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(),
- getInstrName(Ptr, CastNodeNameSuffix));
- ReplNewPtr->insertAfter(RealNewPtr);
- } else
- ReplNewPtr = RealNewPtr;
-
- Ptr->replaceAllUsesWith(ReplNewPtr);
RecursivelyDeleteTriviallyDeadInstructions(Ptr);
-
- NewPtrs.insert(RealNewPtr);
}
MadeChange = true;
- SuccPrepCount++;
+ SuccPrepCount++;
if (Form == DSForm && !CanPreInc)
DSFormChainRewritten++;
@@ -698,14 +1103,14 @@ bool PPCLoopInstrFormPrep::updateFormPrep(Loop *L,
MadeChange |= rewriteLoadStores(L, Bucket, BBChanged, UpdateForm);
if (MadeChange)
- for (auto &BB : L->blocks())
- if (BBChanged.count(BB))
- DeleteDeadPHIs(BB);
+ for (auto *BB : BBChanged)
+ DeleteDeadPHIs(BB);
return MadeChange;
}
-bool PPCLoopInstrFormPrep::dispFormPrep(Loop *L, SmallVector<Bucket, 16> &Buckets,
- InstrForm Form) {
+bool PPCLoopInstrFormPrep::dispFormPrep(Loop *L,
+ SmallVector<Bucket, 16> &Buckets,
+ PrepForm Form) {
bool MadeChange = false;
if (Buckets.empty())
@@ -720,20 +1125,95 @@ bool PPCLoopInstrFormPrep::dispFormPrep(Loop *L, SmallVector<Bucket, 16> &Bucket
}
if (MadeChange)
- for (auto &BB : L->blocks())
- if (BBChanged.count(BB))
- DeleteDeadPHIs(BB);
+ for (auto *BB : BBChanged)
+ DeleteDeadPHIs(BB);
return MadeChange;
}
+// Find the loop invariant increment node for SCEV BasePtrIncSCEV.
+// bb.loop.preheader:
+// %start = ...
+// bb.loop.body:
+// %phinode = phi [ %start, %bb.loop.preheader ], [ %add, %bb.loop.body ]
+// ...
+// %add = add %phinode, %inc ; %inc is what we want to get.
+//
+Value *PPCLoopInstrFormPrep::getNodeForInc(Loop *L, Instruction *MemI,
+ const SCEV *BasePtrIncSCEV) {
+ // If the increment is a constant, no definition is needed.
+ // Return the value directly.
+ if (isa<SCEVConstant>(BasePtrIncSCEV))
+ return cast<SCEVConstant>(BasePtrIncSCEV)->getValue();
+
+ if (!SE->isLoopInvariant(BasePtrIncSCEV, L))
+ return nullptr;
+
+ BasicBlock *BB = MemI->getParent();
+ if (!BB)
+ return nullptr;
+
+ BasicBlock *LatchBB = L->getLoopLatch();
+
+ if (!LatchBB)
+ return nullptr;
+
+ // Run through the PHIs and check their operands to find valid representation
+ // for the increment SCEV.
+ iterator_range<BasicBlock::phi_iterator> PHIIter = BB->phis();
+ for (auto &CurrentPHI : PHIIter) {
+ PHINode *CurrentPHINode = dyn_cast<PHINode>(&CurrentPHI);
+ if (!CurrentPHINode)
+ continue;
+
+ if (!SE->isSCEVable(CurrentPHINode->getType()))
+ continue;
+
+ const SCEV *PHISCEV = SE->getSCEVAtScope(CurrentPHINode, L);
+
+ const SCEVAddRecExpr *PHIBasePtrSCEV = dyn_cast<SCEVAddRecExpr>(PHISCEV);
+ if (!PHIBasePtrSCEV)
+ continue;
+
+ const SCEV *PHIBasePtrIncSCEV = PHIBasePtrSCEV->getStepRecurrence(*SE);
+
+ if (!PHIBasePtrIncSCEV || (PHIBasePtrIncSCEV != BasePtrIncSCEV))
+ continue;
+
+ // Get the incoming value from the loop latch and check if the value has
+ // the add form with the required increment.
+ if (Instruction *I = dyn_cast<Instruction>(
+ CurrentPHINode->getIncomingValueForBlock(LatchBB))) {
+ Value *StrippedBaseI = I;
+ while (BitCastInst *BC = dyn_cast<BitCastInst>(StrippedBaseI))
+ StrippedBaseI = BC->getOperand(0);
+
+ Instruction *StrippedI = dyn_cast<Instruction>(StrippedBaseI);
+ if (!StrippedI)
+ continue;
+
+ // The LSR pass may add a getelementptr instruction to perform the loop
+ // increment, so also search within that getelementptr instruction.
+ if (StrippedI->getOpcode() == Instruction::Add ||
+ (StrippedI->getOpcode() == Instruction::GetElementPtr &&
+ StrippedI->getNumOperands() == 2)) {
+ if (SE->getSCEVAtScope(StrippedI->getOperand(0), L) == BasePtrIncSCEV)
+ return StrippedI->getOperand(0);
+ if (SE->getSCEVAtScope(StrippedI->getOperand(1), L) == BasePtrIncSCEV)
+ return StrippedI->getOperand(1);
+ }
+ }
+ }
+ return nullptr;
+}
+
// In order to prepare for the preferred instruction form, a PHI is added.
// This function will check to see if that PHI already exists and will return
// true if it found an existing PHI with the matched start and increment as the
// one we wanted to create.
-bool PPCLoopInstrFormPrep::alreadyPrepared(Loop *L, Instruction* MemI,
- const SCEV *BasePtrStartSCEV,
- const SCEVConstant *BasePtrIncSCEV,
- InstrForm Form) {
+bool PPCLoopInstrFormPrep::alreadyPrepared(Loop *L, Instruction *MemI,
+ const SCEV *BasePtrStartSCEV,
+ const SCEV *BasePtrIncSCEV,
+ PrepForm Form) {
BasicBlock *BB = MemI->getParent();
if (!BB)
return false;
@@ -773,11 +1253,11 @@ bool PPCLoopInstrFormPrep::alreadyPrepared(Loop *L, Instruction* MemI,
if (PHIBasePtrIncSCEV == BasePtrIncSCEV) {
// The existing PHI (CurrentPHINode) has the same start and increment
// as the PHI that we wanted to create.
- if (Form == UpdateForm &&
+ if ((Form == UpdateForm || Form == ChainCommoning) &&
PHIBasePtrSCEV->getStart() == BasePtrStartSCEV) {
++PHINodeAlreadyExistsUpdate;
return true;
- }
+ }
if (Form == DSForm || Form == DQForm) {
const SCEVConstant *Diff = dyn_cast<SCEVConstant>(
SE->getMinusSCEV(PHIBasePtrSCEV->getStart(), BasePtrStartSCEV));
@@ -788,7 +1268,7 @@ bool PPCLoopInstrFormPrep::alreadyPrepared(Loop *L, Instruction* MemI,
++PHINodeAlreadyExistsDQ;
return true;
}
- }
+ }
}
}
}
@@ -825,7 +1305,7 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {
}
// Check if a load/store has update form. This lambda is used by function
// collectCandidates which can collect candidates for types defined by lambda.
- auto isUpdateFormCandidate = [&](const Instruction *I, const Value *PtrValue,
+ auto isUpdateFormCandidate = [&](const Instruction *I, Value *PtrValue,
const Type *PointerElementType) {
assert((PtrValue && I) && "Invalid parameter!");
// There are no update forms for Altivec vector load/stores.
@@ -857,7 +1337,7 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {
};
// Check if a load/store has DS form.
- auto isDSFormCandidate = [](const Instruction *I, const Value *PtrValue,
+ auto isDSFormCandidate = [](const Instruction *I, Value *PtrValue,
const Type *PointerElementType) {
assert((PtrValue && I) && "Invalid parameter!");
if (isa<IntrinsicInst>(I))
@@ -871,7 +1351,7 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {
};
// Check if a load/store has DQ form.
- auto isDQFormCandidate = [&](const Instruction *I, const Value *PtrValue,
+ auto isDQFormCandidate = [&](const Instruction *I, Value *PtrValue,
const Type *PointerElementType) {
assert((PtrValue && I) && "Invalid parameter!");
// Check if it is a P10 lxvp/stxvp intrinsic.
@@ -883,31 +1363,131 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {
return ST && ST->hasP9Vector() && (PointerElementType->isVectorTy());
};
- // intrinsic for update form.
- SmallVector<Bucket, 16> UpdateFormBuckets =
- collectCandidates(L, isUpdateFormCandidate, MaxVarsUpdateForm);
+ // Check if a load/store is a candidate for chain commoning.
+ // If the start of the SCEV contains only one pointer operand, we can use
+ // that start as a chain separator. Mark this load/store as a candidate.
+ auto isChainCommoningCandidate = [&](const Instruction *I, Value *PtrValue,
+ const Type *PointerElementType) {
+ const SCEVAddRecExpr *ARSCEV =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEVAtScope(PtrValue, L));
+ if (!ARSCEV)
+ return false;
+
+ if (!ARSCEV->isAffine())
+ return false;
+
+ const SCEV *Start = ARSCEV->getStart();
+
+ // A single pointer. We can treat it as offset 0.
+ if (isa<SCEVUnknown>(Start) && Start->getType()->isPointerTy())
+ return true;
+
+ const SCEVAddExpr *ASCEV = dyn_cast<SCEVAddExpr>(Start);
+
+ // We need a SCEVAddExpr to include both base and offset.
+ if (!ASCEV)
+ return false;
+
+ // Make sure there is only one pointer operand (base) and all other
+ // operands are of integer type.
+ bool SawPointer = false;
+ for (const SCEV *Op : ASCEV->operands()) {
+ if (Op->getType()->isPointerTy()) {
+ if (SawPointer)
+ return false;
+ SawPointer = true;
+ } else if (!Op->getType()->isIntegerTy())
+ return false;
+ }
+
+ return SawPointer;
+ };
+
+ // Check if the diff is a constant. This is used for update/DS/DQ form
+ // preparation.
+ auto isValidConstantDiff = [](const SCEV *Diff) {
+ return dyn_cast<SCEVConstant>(Diff) != nullptr;
+ };
+
+ // Make sure the diff between the base and the new candidate has the
+ // required form. This is used for chain commoning preparation.
+ auto isValidChainCommoningDiff = [](const SCEV *Diff) {
+ assert(Diff && "Invalid Diff!\n");
+
+ // Don't interfere with the earlier D-form preparation.
+ if (isa<SCEVConstant>(Diff))
+ return false;
+
+ // A single integer type offset.
+ if (isa<SCEVUnknown>(Diff) && Diff->getType()->isIntegerTy())
+ return true;
+
+ const SCEVNAryExpr *ADiff = dyn_cast<SCEVNAryExpr>(Diff);
+ if (!ADiff)
+ return false;
+
+ for (const SCEV *Op : ADiff->operands())
+ if (!Op->getType()->isIntegerTy())
+ return false;
+
+ return true;
+ };
+
+ HasCandidateForPrepare = false;
+
+ LLVM_DEBUG(dbgs() << "Start to prepare for update form.\n");
+ // Collect buckets of comparable addresses used by loads and stores for update
+ // form.
+ SmallVector<Bucket, 16> UpdateFormBuckets = collectCandidates(
+ L, isUpdateFormCandidate, isValidConstantDiff, MaxVarsUpdateForm);
// Prepare for update form.
if (!UpdateFormBuckets.empty())
MadeChange |= updateFormPrep(L, UpdateFormBuckets);
+ else if (!HasCandidateForPrepare) {
+ LLVM_DEBUG(
+ dbgs()
+ << "No prepare candidates found, stop preparation for current loop!\n");
+ // If there is no candidate to prepare, return early.
+ return MadeChange;
+ }
+ LLVM_DEBUG(dbgs() << "Start to prepare for DS form.\n");
// Collect buckets of comparable addresses used by loads and stores for DS
// form.
- SmallVector<Bucket, 16> DSFormBuckets =
- collectCandidates(L, isDSFormCandidate, MaxVarsDSForm);
+ SmallVector<Bucket, 16> DSFormBuckets = collectCandidates(
+ L, isDSFormCandidate, isValidConstantDiff, MaxVarsDSForm);
// Prepare for DS form.
if (!DSFormBuckets.empty())
MadeChange |= dispFormPrep(L, DSFormBuckets, DSForm);
+ LLVM_DEBUG(dbgs() << "Start to prepare for DQ form.\n");
// Collect buckets of comparable addresses used by loads and stores for DQ
// form.
- SmallVector<Bucket, 16> DQFormBuckets =
- collectCandidates(L, isDQFormCandidate, MaxVarsDQForm);
+ SmallVector<Bucket, 16> DQFormBuckets = collectCandidates(
+ L, isDQFormCandidate, isValidConstantDiff, MaxVarsDQForm);
// Prepare for DQ form.
if (!DQFormBuckets.empty())
MadeChange |= dispFormPrep(L, DQFormBuckets, DQForm);
+ // Collect buckets of comparable addresses used by loads and stores for chain
+ // commoning. With chain commoning, we reuse offsets between the chains, so
+ // the register pressure will be reduced.
+ if (!EnableChainCommoning) {
+ LLVM_DEBUG(dbgs() << "Chain commoning is not enabled.\n");
+ return MadeChange;
+ }
+
+ LLVM_DEBUG(dbgs() << "Start to prepare for chain commoning.\n");
+ SmallVector<Bucket, 16> Buckets =
+ collectCandidates(L, isChainCommoningCandidate, isValidChainCommoningDiff,
+ MaxVarsChainCommon);
+
+ // Prepare for chain commoning.
+ if (!Buckets.empty())
+ MadeChange |= chainCommoning(L, Buckets);
+
return MadeChange;
}
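
To make the chain-commoning path above concrete, here is a hypothetical loop shape (illustration only; the names Base, Stride and sumChains are made up, not taken from the patch or its tests) of the kind isChainCommoningCandidate accepts: several accesses share one pointer base, and the differences between their addresses are loop-invariant but not compile-time constants, so the prepared form can materialize those offsets once and reuse them across the chains.

    // Each address is an affine AddRec whose start is Base plus an integer
    // expression in Stride; the pairwise differences are multiples of Stride,
    // which is loop-invariant but not a SCEVConstant, so the buckets are built
    // with isValidChainCommoningDiff rather than isValidConstantDiff.
    double sumChains(const double *Base, long Stride, long N) {
      double S = 0.0;
      for (long I = 0; I < N; ++I)
        S += Base[I] + Base[I + Stride] + Base[I + 2 * Stride] +
             Base[I + 3 * Stride];
      return S;
    }
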
diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
index 4bbb6ed85a6c..d12a9b806fd0 100644
--- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -79,6 +79,11 @@ static cl::opt<bool>
cl::desc("enable elimination of zero-extensions"),
cl::init(false), cl::Hidden);
+static cl::opt<bool>
+ EnableTrapOptimization("ppc-opt-conditional-trap",
+ cl::desc("enable optimization of conditional traps"),
+ cl::init(false), cl::Hidden);
+
namespace {
struct PPCMIPeephole : public MachineFunctionPass {
@@ -322,8 +327,7 @@ static void convertUnprimedAccPHIs(const PPCInstrInfo *TII,
SmallVectorImpl<MachineInstr *> &PHIs,
Register Dst) {
DenseMap<MachineInstr *, MachineInstr *> ChangedPHIMap;
- for (auto It = PHIs.rbegin(), End = PHIs.rend(); It != End; ++It) {
- MachineInstr *PHI = *It;
+ for (MachineInstr *PHI : llvm::reverse(PHIs)) {
SmallVector<std::pair<MachineOperand, MachineOperand>, 4> PHIOps;
// We check if the current PHI node can be changed by looking at its
// operands. If all the operands are either copies from primed
@@ -379,6 +383,7 @@ static void convertUnprimedAccPHIs(const PPCInstrInfo *TII,
// Perform peephole optimizations.
bool PPCMIPeephole::simplifyCode(void) {
bool Simplified = false;
+ bool TrapOpt = false;
MachineInstr* ToErase = nullptr;
std::map<MachineInstr *, bool> TOCSaves;
const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
@@ -420,6 +425,13 @@ bool PPCMIPeephole::simplifyCode(void) {
ToErase->eraseFromParent();
ToErase = nullptr;
}
+ // If a conditional trap instruction got optimized to an
+ // unconditional trap, eliminate all the instructions after
+ // the trap.
+ if (EnableTrapOptimization && TrapOpt) {
+ ToErase = &MI;
+ continue;
+ }
// Ignore debug instructions.
if (MI.isDebugInstr())
@@ -603,14 +615,24 @@ bool PPCMIPeephole::simplifyCode(void) {
ToErase = &MI;
Simplified = true;
}
- } else if ((Immed == 0 || Immed == 3) && DefOpc == PPC::XXPERMDIs &&
+ } else if ((Immed == 0 || Immed == 3 || Immed == 2) &&
+ DefOpc == PPC::XXPERMDIs &&
(DefMI->getOperand(2).getImm() == 0 ||
DefMI->getOperand(2).getImm() == 3)) {
+ ToErase = &MI;
+ Simplified = true;
+ // Swap of a splat, convert to copy.
+ if (Immed == 2) {
+ LLVM_DEBUG(dbgs() << "Optimizing swap(splat) => copy(splat): ");
+ LLVM_DEBUG(MI.dump());
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(1));
+ break;
+ }
// Splat fed by another splat - switch the output of the first
// and remove the second.
DefMI->getOperand(0).setReg(MI.getOperand(0).getReg());
- ToErase = &MI;
- Simplified = true;
LLVM_DEBUG(dbgs() << "Removing redundant splat: ");
LLVM_DEBUG(MI.dump());
}
@@ -997,6 +1019,51 @@ bool PPCMIPeephole::simplifyCode(void) {
++NumRotatesCollapsed;
break;
}
+ // We will replace TD/TW/TDI/TWI with an unconditional trap if it will
+ // always trap, and we will delete the node if it will never trap.
+ case PPC::TDI:
+ case PPC::TWI:
+ case PPC::TD:
+ case PPC::TW: {
+ if (!EnableTrapOptimization) break;
+ MachineInstr *LiMI1 = getVRegDefOrNull(&MI.getOperand(1), MRI);
+ MachineInstr *LiMI2 = getVRegDefOrNull(&MI.getOperand(2), MRI);
+ bool IsOperand2Immediate = MI.getOperand(2).isImm();
+ // We can only do the optimization if we can get immediates
+ // from both operands
+ if (!(LiMI1 && (LiMI1->getOpcode() == PPC::LI ||
+ LiMI1->getOpcode() == PPC::LI8)))
+ break;
+ if (!IsOperand2Immediate &&
+ !(LiMI2 && (LiMI2->getOpcode() == PPC::LI ||
+ LiMI2->getOpcode() == PPC::LI8)))
+ break;
+
+ auto ImmOperand0 = MI.getOperand(0).getImm();
+ auto ImmOperand1 = LiMI1->getOperand(1).getImm();
+ auto ImmOperand2 = IsOperand2Immediate ? MI.getOperand(2).getImm()
+ : LiMI2->getOperand(1).getImm();
+
+ // We will replace the MI with an unconditional trap if it will always
+ // trap.
+ if ((ImmOperand0 == 31) ||
+ ((ImmOperand0 & 0x10) &&
+ ((int64_t)ImmOperand1 < (int64_t)ImmOperand2)) ||
+ ((ImmOperand0 & 0x8) &&
+ ((int64_t)ImmOperand1 > (int64_t)ImmOperand2)) ||
+ ((ImmOperand0 & 0x2) &&
+ ((uint64_t)ImmOperand1 < (uint64_t)ImmOperand2)) ||
+ ((ImmOperand0 & 0x1) &&
+ ((uint64_t)ImmOperand1 > (uint64_t)ImmOperand2)) ||
+ ((ImmOperand0 & 0x4) && (ImmOperand1 == ImmOperand2))) {
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::TRAP));
+ TrapOpt = true;
+ }
+ // Delete the original conditional trap either way: it has been replaced
+ // by an unconditional trap above, or it can never trap.
+ ToErase = &MI;
+ Simplified = true;
+ break;
+ }
}
}
@@ -1006,6 +1073,9 @@ bool PPCMIPeephole::simplifyCode(void) {
ToErase->eraseFromParent();
ToErase = nullptr;
}
+ // Reset TrapOpt to false at the end of the basic block.
+ if (EnableTrapOptimization)
+ TrapOpt = false;
}
// Eliminate all the TOC save instructions which are redundant.
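
A note on the conditional-trap peephole above: the first operand of TD/TW/TDI/TWI is the TO field, whose bits select which comparisons cause a trap. The following standalone sketch restates the always-traps test the peephole performs (alwaysTraps is my own name; the bit meanings mirror the code above):

    #include <cstdint>

    // TO-field bits as used above: 0x10 signed <, 0x8 signed >, 0x4 equal,
    // 0x2 unsigned <, 0x1 unsigned >. TO == 31 sets every comparison bit,
    // so the trap fires unconditionally.
    static bool alwaysTraps(uint64_t TO, int64_t A, int64_t B) {
      if (TO == 31)
        return true;
      return ((TO & 0x10) && A < B) || ((TO & 0x8) && A > B) ||
             ((TO & 0x2) && (uint64_t)A < (uint64_t)B) ||
             ((TO & 0x1) && (uint64_t)A > (uint64_t)B) ||
             ((TO & 0x4) && A == B);
    }

When both compared values are known immediates, the trap either always fires (and is replaced by an unconditional TRAP) or never fires; in both cases the original conditional trap is erased, which is why ToErase is set unconditionally.
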
diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp b/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp
index d12c6d9cd406..bdff5109c1e1 100644
--- a/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp
@@ -75,6 +75,19 @@ static bool matchingRegOps(const MachineInstr &FirstMI,
return Op1.getReg() == Op2.getReg();
}
+static bool matchingImmOps(const MachineInstr &MI,
+ int MIOpIndex,
+ int64_t Expect,
+ unsigned ExtendFrom = 64) {
+ const MachineOperand &Op = MI.getOperand(MIOpIndex);
+ if (!Op.isImm())
+ return false;
+ int64_t Imm = Op.getImm();
+ if (ExtendFrom < 64)
+ Imm = SignExtend64(Imm, ExtendFrom);
+ return Imm == Expect;
+}
+
// Return true if the FirstMI meets the constraints of SecondMI according to
// fusion specification.
static bool checkOpConstraints(FusionFeature::FusionKind Kd,
@@ -116,7 +129,7 @@ static bool checkOpConstraints(FusionFeature::FusionKind Kd,
if (((Imm & 0xFFF0) != 0) && ((Imm & 0xFFF0) != 0xFFF0))
return false;
- // If si = 1111111111110000 and the msb of the d/ds field of the load equals
+ // If si = 1111111111110000 and the msb of the d/ds field of the load equals
// 1, then fusion does not occur.
if ((Imm & 0xFFF0) == 0xFFF0) {
const MachineOperand &D = SecondMI.getOperand(1);
@@ -132,6 +145,10 @@ static bool checkOpConstraints(FusionFeature::FusionKind Kd,
}
return true;
}
+
+ case FusionFeature::FK_SldiAdd:
+ return (matchingImmOps(FirstMI, 2, 3) && matchingImmOps(FirstMI, 3, 60)) ||
+ (matchingImmOps(FirstMI, 2, 6) && matchingImmOps(FirstMI, 3, 57));
}
llvm_unreachable("All the cases should have been handled");
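
For the FK_SldiAdd constraint just above, the immediate pairs (3, 60) and (6, 57) are the rldicr encodings of sldi by 3 and by 6, since sldi rx, ra, n is an alias of rldicr rx, ra, n, 63-n (as the .def comment below also notes). A small self-contained check of that equivalence (my own sketch; rotl64 and the test values are made up):

    #include <cassert>
    #include <cstdint>

    // rldicr rotates left by SH and keeps bits 0..ME (IBM numbering, bit 0 is
    // the MSB), i.e. it clears the low 63-ME bits after the rotate.
    static uint64_t rotl64(uint64_t V, unsigned N) {
      return N == 0 ? V : (V << N) | (V >> (64 - N));
    }
    static uint64_t rldicr(uint64_t V, unsigned SH, unsigned ME) {
      uint64_t Mask = ~uint64_t(0) << (63 - ME);
      return rotl64(V, SH) & Mask;
    }

    int main() {
      for (unsigned N : {3u, 6u})
        for (uint64_t V : {0x123456789abcdef0ULL, ~0ULL, 1ULL})
          assert(rldicr(V, N, 63 - N) == (V << N)); // same as sldi V, N
      return 0;
    }
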
diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.def b/llvm/lib/Target/PowerPC/PPCMacroFusion.def
index c7e4e7c22e0a..469a24800423 100644
--- a/llvm/lib/Target/PowerPC/PPCMacroFusion.def
+++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.def
@@ -41,5 +41,42 @@ FUSION_FEATURE(AddisLoad, hasAddisLoadFusion, 2, \
FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8), \
FUSION_OP_SET(LD, LBZ, LBZ8, LHZ, LHZ8, LWZ, LWZ8))
+// Power10 User Manual Section 19.1.5.4, Fusion
+// {add, mulld} - add
+FUSION_FEATURE(ArithAdd, hasArithAddFusion, -1,
+ FUSION_OP_SET(ADD4, ADD8, MULLD), FUSION_OP_SET(ADD4, ADD8))
+
+// {add, subf} - {and, nand, nor, or}
+FUSION_FEATURE(ArithLogical, hasAddLogicalFusion, -1,
+ FUSION_OP_SET(ADD4, ADD8, SUBF, SUBF8),
+ FUSION_OP_SET(AND, AND8, OR, OR8, NAND, NAND8, NOR, NOR8))
+
+// {and, andc, eqv, nand, nor, or, orc, xor} - {add, subf}
+FUSION_FEATURE(LogicalArith, hasLogicalAddFusion, -1,
+ FUSION_OP_SET(AND, ANDC, EQV, NAND, NOR, OR, ORC, XOR, AND8,
+ ANDC8, EQV8, NAND8, NOR8, OR8, ORC8, XOR8),
+ FUSION_OP_SET(ADD4, ADD8, SUBF, SUBF8))
+
+// Either of {and, andc, eqv, nand, nor, or, orc, xor}
+FUSION_FEATURE(Logical, hasLogicalFusion, -1,
+ FUSION_OP_SET(AND, ANDC, EQV, NAND, NOR, OR, ORC, XOR, AND8,
+ ANDC8, EQV8, NAND8, NOR8, OR8, ORC8, XOR8),
+ FUSION_OP_SET(AND, ANDC, EQV, NAND, NOR, OR, ORC, XOR, AND8,
+ ANDC8, EQV8, NAND8, NOR8, OR8, ORC8, XOR8))
+
+// vaddudm - vaddudm
+FUSION_FEATURE(VecAdd, hasArithAddFusion, -1, FUSION_OP_SET(VADDUDM),
+ FUSION_OP_SET(VADDUDM))
+
+// Either of {vand, vandc, veqv, vnand, vnor, vor, vorc, vxor}
+FUSION_FEATURE(VecLogical, hasLogicalFusion, -1,
+ FUSION_OP_SET(VAND, VANDC, VEQV, VNAND, VNOR, VOR, VORC, VXOR),
+ FUSION_OP_SET(VAND, VANDC, VEQV, VNAND, VNOR, VOR, VORC, VXOR))
+
+// sldi rx, ra, {3, 6} - {add, subf}
+// sldi rx, ra, n is an alias of rldicr rx, ra, n, 63-n
+FUSION_FEATURE(SldiAdd, hasArithAddFusion, -1, FUSION_OP_SET(RLDICR, RLDICR_32),
+ FUSION_OP_SET(ADD4, ADD8, SUBF, SUBF8))
+
#undef FUSION_FEATURE
#undef FUSION_OP_SET
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 4f16c7f5ff17..4bccc5596d2b 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -135,6 +135,23 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
ImmToIdxMap[PPC::SPELWZ] = PPC::SPELWZX;
// Power10
+ ImmToIdxMap[PPC::PLBZ] = PPC::LBZX; ImmToIdxMap[PPC::PLBZ8] = PPC::LBZX8;
+ ImmToIdxMap[PPC::PLHZ] = PPC::LHZX; ImmToIdxMap[PPC::PLHZ8] = PPC::LHZX8;
+ ImmToIdxMap[PPC::PLHA] = PPC::LHAX; ImmToIdxMap[PPC::PLHA8] = PPC::LHAX8;
+ ImmToIdxMap[PPC::PLWZ] = PPC::LWZX; ImmToIdxMap[PPC::PLWZ8] = PPC::LWZX8;
+ ImmToIdxMap[PPC::PLWA] = PPC::LWAX; ImmToIdxMap[PPC::PLWA8] = PPC::LWAX;
+ ImmToIdxMap[PPC::PLD] = PPC::LDX; ImmToIdxMap[PPC::PSTD] = PPC::STDX;
+
+ ImmToIdxMap[PPC::PSTB] = PPC::STBX; ImmToIdxMap[PPC::PSTB8] = PPC::STBX8;
+ ImmToIdxMap[PPC::PSTH] = PPC::STHX; ImmToIdxMap[PPC::PSTH8] = PPC::STHX8;
+ ImmToIdxMap[PPC::PSTW] = PPC::STWX; ImmToIdxMap[PPC::PSTW8] = PPC::STWX8;
+
+ ImmToIdxMap[PPC::PLFS] = PPC::LFSX; ImmToIdxMap[PPC::PSTFS] = PPC::STFSX;
+ ImmToIdxMap[PPC::PLFD] = PPC::LFDX; ImmToIdxMap[PPC::PSTFD] = PPC::STFDX;
+ ImmToIdxMap[PPC::PLXSSP] = PPC::LXSSPX; ImmToIdxMap[PPC::PSTXSSP] = PPC::STXSSPX;
+ ImmToIdxMap[PPC::PLXSD] = PPC::LXSDX; ImmToIdxMap[PPC::PSTXSD] = PPC::STXSDX;
+ ImmToIdxMap[PPC::PLXV] = PPC::LXVX; ImmToIdxMap[PPC::PSTXV] = PPC::STXVX;
+
ImmToIdxMap[PPC::LXVP] = PPC::LXVPX;
ImmToIdxMap[PPC::STXVP] = PPC::STXVPX;
ImmToIdxMap[PPC::PLXVP] = PPC::LXVPX;
@@ -506,7 +523,9 @@ bool PPCRegisterInfo::getRegAllocationHints(Register VirtReg,
VRM->hasPhys(ResultReg)) {
Register UACCPhys = VRM->getPhys(ResultReg);
Register HintReg = getSubReg(UACCPhys, ResultOp->getSubReg());
- Hints.push_back(HintReg);
+ // Ensure that the hint is a VSRp register.
+ if (HintReg >= PPC::VSRp0 && HintReg <= PPC::VSRp31)
+ Hints.push_back(HintReg);
}
break;
}
@@ -1345,7 +1364,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MachineFunction &MF = *MBB.getParent();
const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
// Get the instruction info.
- const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
// Get the frame info.
MachineFrameInfo &MFI = MF.getFrameInfo();
DebugLoc dl = MI.getDebugLoc();
@@ -1457,7 +1476,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
bool OffsetFitsMnemonic = (OpC == PPC::EVSTDD || OpC == PPC::EVLDD) ?
isUInt<8>(Offset) :
isInt<16>(Offset);
- if (OpC == PPC::PLXVP || OpC == PPC::PSTXVP)
+ if (TII.isPrefixed(MI.getOpcode()))
OffsetFitsMnemonic = isInt<34>(Offset);
if (!noImmForm && ((OffsetFitsMnemonic &&
((Offset % offsetMinAlign(MI)) == 0)) ||
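
The isPrefixed change above generalizes the old PLXVP/PSTXVP special case: every Power10 prefixed load/store carries a 34-bit signed displacement. A one-line sketch of that range test (fitsPrefixedDisp is an illustrative name; isInt is the real llvm::isInt from Support/MathExtras.h):

    #include "llvm/Support/MathExtras.h"
    #include <cstdint>

    // Prefixed D-forms accept displacements in [-2^33, 2^33 - 1]; anything
    // larger falls back to the X-form chosen through ImmToIdxMap above.
    static bool fitsPrefixedDisp(int64_t Offset) {
      return llvm::isInt<34>(Offset);
    }
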
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
index c22a5826337b..2e534dd1bcd5 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -147,8 +147,6 @@ public:
unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
- bool addAllocPriorityToGlobalRanges() const override { return true; }
-
// Support for virtual base registers.
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx,
diff --git a/llvm/lib/Target/PowerPC/PPCSchedPredicates.td b/llvm/lib/Target/PowerPC/PPCSchedPredicates.td
new file mode 100644
index 000000000000..18f325e99a60
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCSchedPredicates.td
@@ -0,0 +1,294 @@
+//===--- PPCSchedPredicates.td - PowerPC Scheduling Preds -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Automatically generated file, do not edit!
+//
+// This file defines scheduling predicate definitions that are used by the
+// PowerPC subtargets.
+//===----------------------------------------------------------------------===//
+// Identify instructions that write BF pipelines with 7 cycles.
+def P10W_BF_7C_Pred : MCSchedPredicate<
+ CheckOpcode<[FADD,
+ FADDS,
+ FADDS_rec,
+ FADD_rec,
+ FCFID,
+ FCFIDS,
+ FCFIDS_rec,
+ FCFIDU,
+ FCFIDUS,
+ FCFIDUS_rec,
+ FCFIDU_rec,
+ FCFID_rec,
+ FCTID,
+ FCTIDU,
+ FCTIDUZ,
+ FCTIDUZ_rec,
+ FCTIDU_rec,
+ FCTIDZ,
+ FCTIDZ_rec,
+ FCTID_rec,
+ FCTIW,
+ FCTIWU,
+ FCTIWUZ,
+ FCTIWUZ_rec,
+ FCTIWU_rec,
+ FCTIWZ,
+ FCTIWZ_rec,
+ FCTIW_rec,
+ FMADD,
+ FMADDS,
+ FMADDS_rec,
+ FMADD_rec,
+ FMSUB,
+ FMSUBS,
+ FMSUBS_rec,
+ FMSUB_rec,
+ FMUL,
+ FMULS,
+ FMULS_rec,
+ FMUL_rec,
+ FNMADD,
+ FNMADDS,
+ FNMADDS_rec,
+ FNMADD_rec,
+ FNMSUB,
+ FNMSUBS,
+ FNMSUBS_rec,
+ FNMSUB_rec,
+ FRE,
+ FRES,
+ FRES_rec,
+ FRE_rec,
+ FRIMD, FRIMS,
+ FRIMD_rec, FRIMS_rec,
+ FRIND, FRINS,
+ FRIND_rec, FRINS_rec,
+ FRIPD, FRIPS,
+ FRIPD_rec, FRIPS_rec,
+ FRIZD, FRIZS,
+ FRIZD_rec, FRIZS_rec,
+ FRSP,
+ FRSP_rec,
+ FRSQRTE,
+ FRSQRTES,
+ FRSQRTES_rec,
+ FRSQRTE_rec,
+ FSELD, FSELS,
+ FSELD_rec, FSELS_rec,
+ FSUB,
+ FSUBS,
+ FSUBS_rec,
+ FSUB_rec,
+ VADDFP,
+ VCFSX, VCFSX_0,
+ VCFUX, VCFUX_0,
+ VCTSXS, VCTSXS_0,
+ VCTUXS, VCTUXS_0,
+ VEXPTEFP,
+ VEXPTEFP,
+ VLOGEFP,
+ VMADDFP,
+ VNMSUBFP,
+ VREFP,
+ VRFIM,
+ VRFIN,
+ VRFIP,
+ VRFIZ,
+ VRSQRTEFP,
+ VSUBFP,
+ XSADDDP,
+ XSADDSP,
+ XSCVDPHP,
+ XSCVDPSP,
+ XSCVDPSPN,
+ XSCVDPSXDS, XSCVDPSXDSs,
+ XSCVDPSXWS, XSCVDPSXWSs,
+ XSCVDPUXDS, XSCVDPUXDSs,
+ XSCVDPUXWS, XSCVDPUXWSs,
+ XSCVSPDP,
+ XSCVSXDDP,
+ XSCVSXDSP,
+ XSCVUXDDP,
+ XSCVUXDSP,
+ XSMADDADP,
+ XSMADDASP,
+ XSMADDMDP,
+ XSMADDMSP,
+ XSMSUBADP,
+ XSMSUBASP,
+ XSMSUBMDP,
+ XSMSUBMSP,
+ XSMULDP,
+ XSMULSP,
+ XSNMADDADP,
+ XSNMADDASP,
+ XSNMADDMDP,
+ XSNMADDMSP,
+ XSNMSUBADP,
+ XSNMSUBASP,
+ XSNMSUBMDP,
+ XSNMSUBMSP,
+ XSRDPI,
+ XSRDPIC,
+ XSRDPIM,
+ XSRDPIP,
+ XSRDPIZ,
+ XSREDP,
+ XSRESP,
+ XSRSP,
+ XSRSQRTEDP,
+ XSRSQRTESP,
+ XSSUBDP,
+ XSSUBSP,
+ XVADDDP,
+ XVADDSP,
+ XVCVDPSP,
+ XVCVDPSXDS,
+ XVCVDPSXWS,
+ XVCVDPUXDS,
+ XVCVDPUXWS,
+ XVCVSPBF16,
+ XVCVSPDP,
+ XVCVSPHP,
+ XVCVSPSXDS,
+ XVCVSPSXWS,
+ XVCVSPUXDS,
+ XVCVSPUXWS,
+ XVCVSXDDP,
+ XVCVSXDSP,
+ XVCVSXWDP,
+ XVCVSXWSP,
+ XVCVUXDDP,
+ XVCVUXDSP,
+ XVCVUXWDP,
+ XVCVUXWSP,
+ XVMADDADP,
+ XVMADDASP,
+ XVMADDMDP,
+ XVMADDMSP,
+ XVMSUBADP,
+ XVMSUBASP,
+ XVMSUBMDP,
+ XVMSUBMSP,
+ XVMULDP,
+ XVMULSP,
+ XVNMADDADP,
+ XVNMADDASP,
+ XVNMADDMDP,
+ XVNMADDMSP,
+ XVNMSUBADP,
+ XVNMSUBASP,
+ XVNMSUBMDP,
+ XVNMSUBMSP,
+ XVRDPI,
+ XVRDPIC,
+ XVRDPIM,
+ XVRDPIP,
+ XVRDPIZ,
+ XVREDP,
+ XVRESP,
+ XVRSPI,
+ XVRSPIC,
+ XVRSPIM,
+ XVRSPIP,
+ XVRSPIZ,
+ XVRSQRTEDP,
+ XVRSQRTESP,
+ XVSUBDP,
+ XVSUBSP]>
+>;
+
+// Identify instructions that write CY pipelines with 7 cycles.
+def P10W_CY_7C_Pred : MCSchedPredicate<
+ CheckOpcode<[CFUGED,
+ CNTLZDM,
+ CNTTZDM,
+ PDEPD,
+ PEXTD,
+ VCFUGED,
+ VCIPHER,
+ VCIPHERLAST,
+ VCLZDM,
+ VCTZDM,
+ VGNB,
+ VNCIPHER,
+ VNCIPHERLAST,
+ VPDEPD,
+ VPEXTD,
+ VPMSUMB,
+ VPMSUMD,
+ VPMSUMH,
+ VPMSUMW,
+ VSBOX]>
+>;
+
+// Identify instructions that write MM pipelines with 10 cycles.
+def P10W_MM_10C_Pred : MCSchedPredicate<
+ CheckOpcode<[PMXVBF16GER2,
+ PMXVBF16GER2NN,
+ PMXVBF16GER2NP,
+ PMXVBF16GER2PN,
+ PMXVBF16GER2PP,
+ PMXVF16GER2,
+ PMXVF16GER2NN,
+ PMXVF16GER2NP,
+ PMXVF16GER2PN,
+ PMXVF16GER2PP,
+ PMXVF32GER,
+ PMXVF32GERNN,
+ PMXVF32GERNP,
+ PMXVF32GERPN,
+ PMXVF32GERPP,
+ PMXVF64GER,
+ PMXVF64GERNN,
+ PMXVF64GERNP,
+ PMXVF64GERPN,
+ PMXVF64GERPP,
+ PMXVI16GER2,
+ PMXVI16GER2PP,
+ PMXVI16GER2S,
+ PMXVI16GER2SPP,
+ PMXVI4GER8,
+ PMXVI4GER8PP,
+ PMXVI8GER4,
+ PMXVI8GER4PP,
+ PMXVI8GER4SPP,
+ XVBF16GER2,
+ XVBF16GER2NN,
+ XVBF16GER2NP,
+ XVBF16GER2PN,
+ XVBF16GER2PP,
+ XVF16GER2,
+ XVF16GER2NN,
+ XVF16GER2NP,
+ XVF16GER2PN,
+ XVF16GER2PP,
+ XVF32GER,
+ XVF32GERNN,
+ XVF32GERNP,
+ XVF32GERPN,
+ XVF32GERPP,
+ XVF64GER,
+ XVF64GERNN,
+ XVF64GERNP,
+ XVF64GERPN,
+ XVF64GERPP,
+ XVI16GER2,
+ XVI16GER2PP,
+ XVI16GER2S,
+ XVI16GER2SPP,
+ XVI4GER8,
+ XVI4GER8PP,
+ XVI8GER4,
+ XVI8GER4PP,
+ XVI8GER4SPP,
+ XXMFACC,
+ XXMFACC,
+ XXMTACC,
+ XXSETACCZ]>
+>;
diff --git a/llvm/lib/Target/PowerPC/PPCSchedule.td b/llvm/lib/Target/PowerPC/PPCSchedule.td
index e378d57d325e..f65dbae16d3a 100644
--- a/llvm/lib/Target/PowerPC/PPCSchedule.td
+++ b/llvm/lib/Target/PowerPC/PPCSchedule.td
@@ -128,7 +128,9 @@ def IIC_SprMTPMR : InstrItinClass;
//===----------------------------------------------------------------------===//
// Processor instruction itineraries.
+include "PPCInstrInfo.td"
+include "PPCSchedPredicates.td"
include "PPCScheduleG3.td"
include "PPCSchedule440.td"
include "PPCScheduleG4.td"
@@ -137,6 +139,7 @@ include "PPCScheduleG5.td"
include "PPCScheduleP7.td"
include "PPCScheduleP8.td"
include "PPCScheduleP9.td"
+include "PPCScheduleP10.td"
include "PPCScheduleA2.td"
include "PPCScheduleE500.td"
include "PPCScheduleE500mc.td"
diff --git a/llvm/lib/Target/PowerPC/PPCScheduleP10.td b/llvm/lib/Target/PowerPC/PPCScheduleP10.td
new file mode 100644
index 000000000000..bf56491f373a
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCScheduleP10.td
@@ -0,0 +1,416 @@
+//===--- PPCScheduleP10.td - P10 Scheduling Definitions -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Automatically generated file, do not edit!
+//
+// This file defines the resources required by P10 instructions.
+//===----------------------------------------------------------------------===//
+// Modeling pipeline forwarding logic.
+def P10BR_Read : SchedRead;
+def P10DF_Read : SchedRead;
+def P10DV_Read : SchedRead;
+def P10DX_Read : SchedRead;
+def P10F2_Read : SchedRead;
+def P10FX_Read : SchedRead;
+def P10LD_Read : SchedRead;
+def P10MU_Read : SchedRead;
+def P10PM_Read : SchedRead;
+def P10ST_Read : SchedRead;
+def P10SX_Read : SchedRead;
+def P10vMU_Read : SchedRead;
+
+def P10Model : SchedMachineModel {
+ let IssueWidth = 8;
+
+ // TODO - Need to be updated according to P10 UM.
+ let MicroOpBufferSize = 44;
+
+ // TODO - tune this on real HW once it arrives. For now, we will use the same
+ // value as we do on P9.
+ let LoopMicroOpBufferSize = 60;
+
+ let CompleteModel = 1;
+
+ // Do not support SPE (Signal Processing Engine) on Power 10.
+ let UnsupportedFeatures = [HasSPE, IsE500, IsBookE];
+}
+
+let SchedModel = P10Model in {
+
+ // ***************** Processor Resources *****************
+
+ // Pipeline Groups
+
+ def P10_BF : ProcResource<4>; // Four Binary Floating Point pipelines.
+ def P10_BR : ProcResource<2>; // Two Branch pipelines.
+ def P10_CY : ProcResource<4>; // Four Crypto pipelines.
+ def P10_DF : ProcResource<1>; // One Decimal Floating Point pipeline.
+ def P10_DV : ProcResource<2>; // Two Fixed-point divide (DIV) pipelines.
+ def P10_DX : ProcResource<2>; // Two 128-bit fixed-point and BCD pipelines.
+ def P10_FX : ProcResource<4>; // Four ALU pipelines.
+ def P10_LD : ProcResource<2>; // Two Load pipelines.
+ def P10_MM : ProcResource<2>; // Two 512-bit SIMD matrix multiply engine pipelines.
+ def P10_PM : ProcResource<4>; // Four 128-bit permute (PM) pipelines.
+ def P10_ST : ProcResource<2>; // Two ST-D pipelines.
+ def P10_SX : ProcResource<2>; // Two Simple Fixed-point (SFX) pipelines.
+
+ // Dispatch Groups
+
+ // Dispatch to any slots
+ def P10_ANY_SLOT : ProcResource<8>;
+
+ let Super = P10_ANY_SLOT in {
+
+ // Dispatch to even slots
+ def P10_EVEN_SLOT : ProcResource<4>;
+
+ // Dispatch to odd slots
+ def P10_ODD_SLOT : ProcResource<4>;
+ }
+
+ // Dispatch Rules
+ let NumMicroOps = 0, Latency = 1 in {
+ // Dispatch Rule '-'
+ def P10W_DISP_ANY : SchedWriteRes<[P10_ANY_SLOT]>;
+
+ // Dispatch Rule '-', even slot
+ def P10W_DISP_EVEN : SchedWriteRes<[P10_EVEN_SLOT]>;
+
+ // Dispatch Rule 'P'
+ def P10W_DISP_PAIR : SchedWriteRes<[P10_EVEN_SLOT, P10_ODD_SLOT]>;
+ }
+
+ // ***************** SchedWriteRes Definitions *****************
+
+ // A BF pipeline may take from 7 to 36 cycles to complete.
+ // Some BF operations may keep the pipeline busy for up to 10 cycles.
+ def P10W_BF_7C : SchedWriteRes<[P10_BF]> {
+ let Latency = 7;
+ }
+
+ def P10W_BF_22C : SchedWriteRes<[P10_BF]> {
+ let ResourceCycles = [ 5 ];
+ let Latency = 22;
+ }
+
+ def P10W_BF_24C : SchedWriteRes<[P10_BF]> {
+ let ResourceCycles = [ 8 ];
+ let Latency = 24;
+ }
+
+ def P10W_BF_26C : SchedWriteRes<[P10_BF]> {
+ let ResourceCycles = [ 5 ];
+ let Latency = 26;
+ }
+
+ def P10W_BF_27C : SchedWriteRes<[P10_BF]> {
+ let ResourceCycles = [ 7 ];
+ let Latency = 27;
+ }
+
+ def P10W_BF_36C : SchedWriteRes<[P10_BF]> {
+ let ResourceCycles = [ 10 ];
+ let Latency = 36;
+ }
+
+ // A BR pipeline may take 2 cycles to complete.
+ def P10W_BR_2C : SchedWriteRes<[P10_BR]> {
+ let Latency = 2;
+ }
+
+ // A CY pipeline may take 7 cycles to complete.
+ def P10W_CY_7C : SchedWriteRes<[P10_CY]> {
+ let Latency = 7;
+ }
+
+ // A DF pipeline may take from 13 to 174 cycles to complete.
+ // Some DF operations may keep the pipeline busy for up to 67 cycles.
+ def P10W_DF_13C : SchedWriteRes<[P10_DF]> {
+ let Latency = 13;
+ }
+
+ def P10W_DF_24C : SchedWriteRes<[P10_DF]> {
+ let ResourceCycles = [ 16 ];
+ let Latency = 24;
+ }
+
+ def P10W_DF_25C : SchedWriteRes<[P10_DF]> {
+ let ResourceCycles = [ 17 ];
+ let Latency = 25;
+ }
+
+ def P10W_DF_26C : SchedWriteRes<[P10_DF]> {
+ let ResourceCycles = [ 18 ];
+ let Latency = 26;
+ }
+
+ def P10W_DF_32C : SchedWriteRes<[P10_DF]> {
+ let ResourceCycles = [ 22 ];
+ let Latency = 32;
+ }
+
+ def P10W_DF_33C : SchedWriteRes<[P10_DF]> {
+ let ResourceCycles = [ 25 ];
+ let Latency = 33;
+ }
+
+ def P10W_DF_34C : SchedWriteRes<[P10_DF]> {
+ let ResourceCycles = [ 25 ];
+ let Latency = 34;
+ }
+
+ def P10W_DF_38C : SchedWriteRes<[P10_DF]> {
+ let ResourceCycles = [ 30 ];
+ let Latency = 38;
+ }
+
+ def P10W_DF_40C : SchedWriteRes<[P10_DF]> {
+ let ResourceCycles = [ 17 ];
+ let Latency = 40;
+ }
+
+ def P10W_DF_43C : SchedWriteRes<[P10_DF]> {
+ let ResourceCycles = [ 34 ];
+ let Latency = 43;
+ }
+
+ def P10W_DF_59C : SchedWriteRes<[P10_DF]> {
+ let ResourceCycles = [ 49 ];
+ let Latency = 59;
+ }
+
+ def P10W_DF_61C : SchedWriteRes<[P10_DF]> {
+ let ResourceCycles = [ 12 ];
+ let Latency = 61;
+ }
+
+ def P10W_DF_68C : SchedWriteRes<[P10_DF]> {
+ let ResourceCycles = [ 15 ];
+ let Latency = 68;
+ }
+
+ def P10W_DF_77C : SchedWriteRes<[P10_DF]> {
+ let ResourceCycles = [ 67 ];
+ let Latency = 77;
+ }
+
+ def P10W_DF_87C : SchedWriteRes<[P10_DF]> {
+ let ResourceCycles = [ 12 ];
+ let Latency = 87;
+ }
+
+ def P10W_DF_100C : SchedWriteRes<[P10_DF]> {
+ let ResourceCycles = [ 32 ];
+ let Latency = 100;
+ }
+
+ def P10W_DF_174C : SchedWriteRes<[P10_DF]> {
+ let ResourceCycles = [ 33 ];
+ let Latency = 174;
+ }
+
+ // A DV pipeline may take from 20 to 83 cycles to complete.
+ // Some DV operations may keep the pipeline busy for up to 33 cycles.
+ def P10W_DV_20C : SchedWriteRes<[P10_DV]> {
+ let ResourceCycles = [ 10 ];
+ let Latency = 20;
+ }
+
+ def P10W_DV_25C : SchedWriteRes<[P10_DV]> {
+ let ResourceCycles = [ 10 ];
+ let Latency = 25;
+ }
+
+ def P10W_DV_27C : SchedWriteRes<[P10_DV]> {
+ let ResourceCycles = [ 10 ];
+ let Latency = 27;
+ }
+
+ def P10W_DV_41C : SchedWriteRes<[P10_DV]> {
+ let ResourceCycles = [ 10 ];
+ let Latency = 41;
+ }
+
+ def P10W_DV_43C : SchedWriteRes<[P10_DV]> {
+ let ResourceCycles = [ 21 ];
+ let Latency = 43;
+ }
+
+ def P10W_DV_47C : SchedWriteRes<[P10_DV]> {
+ let ResourceCycles = [ 21 ];
+ let Latency = 47;
+ }
+
+ def P10W_DV_54C : SchedWriteRes<[P10_DV]> {
+ let ResourceCycles = [ 33 ];
+ let Latency = 54;
+ }
+
+ def P10W_DV_60C : SchedWriteRes<[P10_DV]> {
+ let ResourceCycles = [ 33 ];
+ let Latency = 60;
+ }
+
+ def P10W_DV_75C : SchedWriteRes<[P10_DV]> {
+ let ResourceCycles = [ 21 ];
+ let Latency = 75;
+ }
+
+ def P10W_DV_83C : SchedWriteRes<[P10_DV]> {
+ let ResourceCycles = [ 33 ];
+ let Latency = 83;
+ }
+
+ // A DX pipeline may take 5 cycles to complete.
+ def P10W_DX_5C : SchedWriteRes<[P10_DX]> {
+ let Latency = 5;
+ }
+
+ // A F2 pipeline may take 4 cycles to complete.
+ def P10W_F2_4C : SchedWriteRes<[P10_FX]> {
+ let Latency = 4;
+ }
+
+ // A FX pipeline may take from 2 to 3 cycles to complete.
+ def P10W_FX_2C : SchedWriteRes<[P10_FX]> {
+ let Latency = 2;
+ }
+
+ def P10W_FX_3C : SchedWriteRes<[P10_FX]> {
+ let Latency = 3;
+ }
+
+ // A LD pipeline may take 6 cycles to complete.
+ def P10W_LD_6C : SchedWriteRes<[P10_LD]> {
+ let Latency = 6;
+ }
+
+ // A MF pipeline may take 13 cycles to complete.
+ def P10W_MF_13C : SchedWriteRes<[P10_SX]> {
+ let Latency = 13;
+ }
+
+ // A MFL pipeline may take 13 cycles to complete.
+ def P10W_MFL_13C : SchedWriteRes<[P10_SX]> {
+ let Latency = 13;
+ }
+
+ // A MM pipeline may take 10 cycles to complete.
+ def P10W_MM_10C : SchedWriteRes<[P10_MM]> {
+ let Latency = 10;
+ }
+
+ // A MU pipeline may take 5 cycles to complete.
+ def P10W_MU_5C : SchedWriteRes<[P10_BF]> {
+ let Latency = 5;
+ }
+
+ // A PM pipeline may take 4 cycles to complete.
+ def P10W_PM_4C : SchedWriteRes<[P10_PM]> {
+ let Latency = 4;
+ }
+
+ // A ST pipeline may take 3 cycles to complete.
+ def P10W_ST_3C : SchedWriteRes<[P10_ST]> {
+ let Latency = 3;
+ }
+
+ // A SX pipeline may take from 0 to 3 cycles to complete.
+ def P10W_SX : SchedWriteRes<[P10_SX]> {
+ let Latency = 0;
+ }
+
+ def P10W_SX_3C : SchedWriteRes<[P10_SX]> {
+ let Latency = 3;
+ }
+
+ // A vMU pipeline may take 7 cycles to complete.
+ def P10W_vMU_7C : SchedWriteRes<[P10_BF]> {
+ let Latency = 7;
+ }
+
+ // ***************** Read Advance Definitions *****************
+
+ // Modeling pipeline forwarding logic.
+ def P10BF_Read_1C : SchedReadAdvance<1, [P10W_DX_5C, P10W_MU_5C, P10W_vMU_7C, P10W_CY_7C, P10W_DF_13C, P10W_MM_10C]>;
+ def P10BF_Read_2C : SchedReadAdvance<2, [P10W_BF_7C]>;
+ def P10BR_Read_1C : SchedReadAdvance<1, [P10W_FX_3C, P10W_F2_4C]>;
+ def P10CY_Read_1C : SchedReadAdvance<1, [P10W_DX_5C, P10W_MU_5C, P10W_vMU_7C, P10W_BF_7C, P10W_DF_13C, P10W_MM_10C]>;
+ def P10CY_Read_3C : SchedReadAdvance<3, [P10W_CY_7C]>;
+ def P10DF_Read_1C : SchedReadAdvance<1, [P10W_DX_5C, P10W_MU_5C, P10W_vMU_7C, P10W_BF_7C, P10W_CY_7C, P10W_DF_13C, P10W_MM_10C]>;
+ def P10DV_Read_1C : SchedReadAdvance<1, [P10W_DX_5C, P10W_MU_5C, P10W_vMU_7C, P10W_BF_7C, P10W_CY_7C, P10W_DF_13C, P10W_MM_10C]>;
+ def P10DX_Read_1C : SchedReadAdvance<1, [P10W_DX_5C, P10W_MU_5C, P10W_vMU_7C, P10W_BF_7C, P10W_CY_7C, P10W_DF_13C, P10W_MM_10C]>;
+ def P10F2_Read_1C : SchedReadAdvance<1, [P10W_ST_3C, P10W_SX_3C, P10W_FX_3C, P10W_F2_4C, P10W_PM_4C]>;
+ def P10FX_Read_1C : SchedReadAdvance<1, [P10W_ST_3C, P10W_SX_3C, P10W_FX_3C, P10W_F2_4C, P10W_PM_4C]>;
+ def P10LD_Read_1C : SchedReadAdvance<1, [P10W_ST_3C, P10W_SX_3C, P10W_FX_3C, P10W_F2_4C]>;
+ def P10MM_Read_1C : SchedReadAdvance<1, [P10W_DX_5C, P10W_MU_5C, P10W_vMU_7C, P10W_BF_7C, P10W_CY_7C, P10W_DF_13C]>;
+ def P10MM_Read_6C : SchedReadAdvance<6, [P10W_MM_10C]>;
+ def P10MU_Read_1C : SchedReadAdvance<1, [P10W_DX_5C, P10W_MU_5C, P10W_DF_13C]>;
+ def P10PM_Read_1C : SchedReadAdvance<1, [P10W_ST_3C, P10W_SX_3C, P10W_FX_3C, P10W_F2_4C, P10W_PM_4C]>;
+ def P10ST_Read_1C : SchedReadAdvance<1, [P10W_ST_3C, P10W_SX_3C, P10W_FX_3C, P10W_F2_4C, P10W_PM_4C]>;
+ def P10SX_Read_1C : SchedReadAdvance<1, [P10W_ST_3C, P10W_SX_3C, P10W_FX_3C, P10W_F2_4C, P10W_PM_4C, P10W_MM_10C]>;
+ def P10vMU_Read_1C : SchedReadAdvance<1, [P10W_DX_5C, P10W_MU_5C, P10W_vMU_7C, P10W_BF_7C, P10W_CY_7C, P10W_DF_13C, P10W_MM_10C]>;
+
+ // Save 1 cycle if pipeline BF reads the data from pipelines DX, MU, vMU, CY, DF, MM.
+ // Save 2 cycles if pipeline BF reads the data from pipeline BF.
+ def P10BF_Read : SchedReadVariant<[
+ SchedVar<P10W_BF_7C_Pred, [P10BF_Read_2C]>,
+ SchedVar<NoSchedPred, [P10BF_Read_1C]>
+ ]>;
+
+ // Save 1 cycle if pipeline CY reads the data from pipelines DX, MU, vMU, BF, DF, MM.
+ // Save 3 cycles if pipeline CY reads the data from pipeline CY.
+ def P10CY_Read : SchedReadVariant<[
+ SchedVar<P10W_CY_7C_Pred, [P10CY_Read_3C]>,
+ SchedVar<NoSchedPred, [P10CY_Read_1C]>
+ ]>;
+
+ // Save 1 cycle if pipeline MM reads the data from pipelines DX, MU, vMU, BF, CY, DF.
+ // Save 6 cycles if pipeline MM reads the data from pipeline MM.
+ def P10MM_Read : SchedReadVariant<[
+ SchedVar<P10W_MM_10C_Pred, [P10MM_Read_6C]>,
+ SchedVar<NoSchedPred, [P10MM_Read_1C]>
+ ]>;
+
+ // Save 1 cycle if pipeline BR reads the data from pipelines FX, F2.
+ def : SchedAlias<P10BR_Read, P10BR_Read_1C>;
+
+ // Save 1 cycle if pipeline DF reads the data from pipelines DX, MU, vMU, BF, CY, DF, MM.
+ def : SchedAlias<P10DF_Read, P10DF_Read_1C>;
+
+ // Save 1 cycle if pipeline DV reads the data from pipelines DX, MU, vMU, BF, CY, DF, MM.
+ def : SchedAlias<P10DV_Read, P10DV_Read_1C>;
+
+ // Save 1 cycle if pipeline DX reads the data from pipelines DX, MU, vMU, BF, CY, DF, MM.
+ def : SchedAlias<P10DX_Read, P10DX_Read_1C>;
+
+ // Save 1 cycle if pipeline F2 reads the data from pipelines ST, SX, FX, F2, PM.
+ def : SchedAlias<P10F2_Read, P10F2_Read_1C>;
+
+ // Save 1 cycle if pipeline FX reads the data from pipelines ST, SX, FX, F2, PM.
+ def : SchedAlias<P10FX_Read, P10FX_Read_1C>;
+
+ // Save 1 cycle if pipeline LD reads the data from pipelines ST, SX, FX, F2.
+ def : SchedAlias<P10LD_Read, P10LD_Read_1C>;
+
+ // Save 1 cycle if pipeline MU reads the data from pipelines DX, MU, DF.
+ def : SchedAlias<P10MU_Read, P10MU_Read_1C>;
+
+ // Save 1 cycle if pipeline PM reads the data from pipelines ST, SX, FX, F2, PM.
+ def : SchedAlias<P10PM_Read, P10PM_Read_1C>;
+
+ // Save 1 cycle if pipeline ST reads the data from pipelines ST, SX, FX, F2, PM.
+ def : SchedAlias<P10ST_Read, P10ST_Read_1C>;
+
+ // Save 1 cycle if pipeline SX reads the data from pipelines ST, SX, FX, F2, PM, MM.
+ def : SchedAlias<P10SX_Read, P10SX_Read_1C>;
+
+ // Save 1 cycle if pipeline vMU reads the data from pipelines DX, MU, vMU, BF, CY, DF, MM.
+ def : SchedAlias<P10vMU_Read, P10vMU_Read_1C>;
+
+ include "P10InstrResources.td"
+}
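
A worked reading of the forwarding entries above (my interpretation of the SchedReadAdvance semantics, not a statement from the P10 UM): the latency a consumer sees is the producer's SchedWriteRes latency minus the consumer's SchedReadAdvance, so a P10W_BF_7C producer feeding another BF operation through P10BF_Read_2C costs 7 - 2 = 5 cycles, while feeding a CY consumer through P10CY_Read_1C costs 7 - 1 = 6. The arithmetic only, as a compile-time check:

    // effectiveLatency is an illustrative helper, not part of the model.
    constexpr int effectiveLatency(int WriteLatency, int ReadAdvance) {
      return WriteLatency - ReadAdvance;
    }
    static_assert(effectiveLatency(7, 2) == 5, "P10W_BF_7C through P10BF_Read_2C");
    static_assert(effectiveLatency(7, 1) == 6, "P10W_BF_7C through P10CY_Read_1C");
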
diff --git a/llvm/lib/Target/PowerPC/PPCScheduleP9.td b/llvm/lib/Target/PowerPC/PPCScheduleP9.td
index 571cc219ff2b..3dc069ecad8a 100644
--- a/llvm/lib/Target/PowerPC/PPCScheduleP9.td
+++ b/llvm/lib/Target/PowerPC/PPCScheduleP9.td
@@ -9,8 +9,6 @@
// This file defines the itinerary class data for the POWER9 processor.
//
//===----------------------------------------------------------------------===//
-include "PPCInstrInfo.td"
-
def P9Model : SchedMachineModel {
// The maximum number of instructions to be issued at the same time.
// While a value of 8 is technically correct since 8 instructions can be
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index 87ce32f027ab..dfc29dbb10f1 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -23,8 +23,8 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include <cstdlib>
@@ -127,6 +127,11 @@ void PPCSubtarget::initializeEnvironment() {
HasStoreFusion = false;
HasAddiLoadFusion = false;
HasAddisLoadFusion = false;
+ HasArithAddFusion = false;
+ HasAddLogicalFusion = false;
+ HasLogicalAddFusion = false;
+ HasLogicalFusion = false;
+ IsISA2_06 = false;
IsISA2_07 = false;
IsISA3_0 = false;
IsISA3_1 = false;
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index e916b0c02000..783ea121ccb8 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -147,6 +147,11 @@ protected:
bool HasStoreFusion;
bool HasAddiLoadFusion;
bool HasAddisLoadFusion;
+ bool HasArithAddFusion;
+ bool HasAddLogicalFusion;
+ bool HasLogicalAddFusion;
+ bool HasLogicalFusion;
+ bool IsISA2_06;
bool IsISA2_07;
bool IsISA3_0;
bool IsISA3_1;
@@ -322,6 +327,7 @@ public:
bool hasHTM() const { return HasHTM; }
bool hasFloat128() const { return HasFloat128; }
+ bool isISA2_06() const { return IsISA2_06; }
bool isISA2_07() const { return IsISA2_07; }
bool isISA3_0() const { return IsISA3_0; }
bool isISA3_1() const { return IsISA3_1; }
@@ -330,6 +336,10 @@ public:
bool hasStoreFusion() const { return HasStoreFusion; }
bool hasAddiLoadFusion() const { return HasAddiLoadFusion; }
bool hasAddisLoadFusion() const { return HasAddisLoadFusion; }
+ bool hasArithAddFusion() const { return HasArithAddFusion; }
+ bool hasAddLogicalFusion() const { return HasAddLogicalFusion; }
+ bool hasLogicalAddFusion() const { return HasLogicalAddFusion; }
+ bool hasLogicalFusion() const { return HasLogicalFusion; }
bool needsSwapsForVSXMemOps() const {
return hasVSX() && isLittleEndian() && !hasP9Vector();
}
diff --git a/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index 3186d197931d..fbd487fbcfd5 100644
--- a/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -208,11 +208,9 @@ public:
bool Changed = false;
- for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
- MachineBasicBlock &B = *I++;
+ for (MachineBasicBlock &B : llvm::make_early_inc_range(MF))
if (processBlock(B))
Changed = true;
- }
return Changed;
}
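
The llvm::make_early_inc_range rewrite here (and in the passes below) works because the adaptor advances the wrapped iterator when the element is dereferenced, so the body may erase or split the current element without invalidating the traversal. A small self-contained illustration against a std::map (the container and values are made up; make_early_inc_range is the real API from llvm/ADT/STLExtras.h):

    #include "llvm/ADT/STLExtras.h"
    #include <iostream>
    #include <map>

    int main() {
      std::map<int, const char *> M = {{1, "one"}, {2, "two"}, {3, "three"}};
      // Erasing the current entry is safe: the underlying iterator has already
      // moved past it by the time the loop body runs.
      for (auto &Entry : llvm::make_early_inc_range(M))
        if (Entry.first % 2 == 0)
          M.erase(Entry.first);
      for (const auto &Entry : M)
        std::cout << Entry.first << " -> " << Entry.second << "\n";
      return 0;
    }
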
diff --git a/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp b/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp
index 895ae6744421..8120975c4fb2 100644
--- a/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp
@@ -73,9 +73,9 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -131,11 +131,9 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override {
bool Changed = false;
- for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
- MachineBasicBlock &B = *I++;
+ for (MachineBasicBlock &B : llvm::make_early_inc_range(MF))
if (processBlock(B))
Changed = true;
- }
return Changed;
}
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 82717300a480..3eff00fc3c05 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -36,10 +36,10 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/InitializePasses.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.h b/llvm/lib/Target/PowerPC/PPCTargetMachine.h
index ed9e74b72d1e..d3fe5362ccdc 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.h
@@ -68,6 +68,8 @@ public:
}
bool isLittleEndian() const;
+
+ int unqualifiedInlineAsmVariant() const override { return 1; }
};
} // end namespace llvm
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index d5a7873bd056..77d5a2668b60 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -485,6 +485,9 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo,
case Intrinsic::experimental_constrained_sin:
case Intrinsic::experimental_constrained_cos:
return true;
+ // There is no corresponding FMA instruction for PPC double double.
+ // Thus, we need to disable CTR loop generation for this type.
+ case Intrinsic::fmuladd:
case Intrinsic::copysign:
if (CI->getArgOperand(0)->getType()->getScalarType()->
isPPC_FP128Ty())
@@ -787,7 +790,8 @@ bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
}
void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP) {
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE) {
if (ST->getCPUDirective() == PPC::DIR_A2) {
// The A2 is in-order with a deep pipeline, and concatenation unrolling
// helps expose latency-hiding opportunities to the instruction scheduler.
@@ -798,7 +802,7 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
UP.AllowExpensiveTripCount = true;
}
- BaseT::getUnrollingPreferences(L, SE, UP);
+ BaseT::getUnrollingPreferences(L, SE, UP, ORE);
}
void PPCTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
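
The new Intrinsic::fmuladd case shares the ppc_fp128 check with copysign: the IBM double-double type has no single FMA instruction, so the intrinsic may end up expanded through a call, and a call in the loop body rules out a CTR (bdnz) hardware loop. A hypothetical C/C++ illustration (my own; the function name dot and the build flags are assumptions, not from the patch or its tests):

    // Compile for a powerpc64 target where long double is IBM double-double,
    // with FP contraction enabled (e.g. -ffp-contract=fast); the multiply-add
    // may then be emitted as llvm.fmuladd on ppc_fp128.
    long double dot(const long double *A, const long double *B, int N) {
      long double S = 0.0L;
      for (int I = 0; I < N; ++I)
        S += A[I] * B[I];
      return S;
    }
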
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 8ac3038d51d6..aa84013803af 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -72,7 +72,8 @@ public:
TargetLibraryInfo *LibInfo);
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP);
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE);
void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP);
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
@@ -102,8 +103,7 @@ public:
InstructionCost vectorCostAdjustment(InstructionCost Cost, unsigned Opcode,
Type *Ty1, Type *Ty2);
InstructionCost getArithmeticInstrCost(
- unsigned Opcode, Type *Ty,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -130,8 +130,7 @@ public:
const Instruction *I = nullptr);
InstructionCost getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
- Align Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
diff --git a/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp b/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp
index 3463bbbdc5f0..7272e6edefc5 100644
--- a/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp
+++ b/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp
@@ -27,9 +27,9 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -148,11 +148,9 @@ public:
bool Changed = false;
- for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
- MachineBasicBlock &B = *I++;
+ for (MachineBasicBlock &B : llvm::make_early_inc_range(MF))
if (processBlock(B))
Changed = true;
- }
return Changed;
}
@@ -169,4 +167,3 @@ INITIALIZE_PASS(PPCVSXCopy, DEBUG_TYPE,
char PPCVSXCopy::ID = 0;
FunctionPass*
llvm::createPPCVSXCopyPass() { return new PPCVSXCopy(); }
-
diff --git a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index e72e29112da7..0be35adc35c7 100644
--- a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -31,10 +31,10 @@
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -361,11 +361,9 @@ public:
if (DisableVSXFMAMutate)
return Changed;
- for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
- MachineBasicBlock &B = *I++;
+ for (MachineBasicBlock &B : llvm::make_early_inc_range(MF))
if (processBlock(B))
Changed = true;
- }
return Changed;
}
diff --git a/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
index 6bb952f27fee..0bfa0bd5ec0e 100644
--- a/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
+++ b/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/PowerPCTargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
Target &llvm::getThePPC32Target() {
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 87496e0b9330..f00813f1301a 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -32,10 +32,11 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/RISCVAttributes.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/RISCVISAInfo.h"
#include <limits>
@@ -50,6 +51,10 @@ using namespace llvm;
STATISTIC(RISCVNumInstrsCompressed,
"Number of RISC-V Compressed instructions emitted");
+namespace llvm {
+extern const SubtargetFeatureKV RISCVFeatureKV[RISCV::NumSubtargetFeatures];
+} // namespace llvm
+
namespace {
struct RISCVOperand;
@@ -169,6 +174,7 @@ class RISCVAsmParser : public MCTargetAsmParser {
bool parseDirectiveOption();
bool parseDirectiveAttribute();
+ bool parseDirectiveInsn(SMLoc L);
void setFeatureBits(uint64_t Feature, StringRef FeatureString) {
if (!(getSTI().getFeatureBits()[Feature])) {
@@ -504,6 +510,24 @@ public:
return (isRV64() && isUInt<5>(Imm)) || isUInt<4>(Imm);
}
+ bool isUImm2() const {
+ int64_t Imm;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
+ if (!isImm())
+ return false;
+ bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+ return IsConstantImm && isUInt<2>(Imm) && VK == RISCVMCExpr::VK_RISCV_None;
+ }
+
+ bool isUImm3() const {
+ int64_t Imm;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
+ if (!isImm())
+ return false;
+ bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+ return IsConstantImm && isUInt<3>(Imm) && VK == RISCVMCExpr::VK_RISCV_None;
+ }
+
bool isUImm5() const {
int64_t Imm;
RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
@@ -513,6 +537,15 @@ public:
return IsConstantImm && isUInt<5>(Imm) && VK == RISCVMCExpr::VK_RISCV_None;
}
+ bool isUImm7() const {
+ int64_t Imm;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
+ if (!isImm())
+ return false;
+ bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+ return IsConstantImm && isUInt<7>(Imm) && VK == RISCVMCExpr::VK_RISCV_None;
+ }
+
bool isSImm5() const {
if (!isImm())
return false;
@@ -960,10 +993,6 @@ bool RISCVAsmParser::generateImmOutOfRangeError(
return Error(ErrorLoc, Msg + " [" + Twine(Lower) + ", " + Twine(Upper) + "]");
}
-static std::string RISCVMnemonicSpellCheck(StringRef S,
- const FeatureBitset &FBS,
- unsigned VariantID = 0);
-
bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
@@ -996,13 +1025,13 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
}
case Match_MnemonicFail: {
FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
- std::string Suggestion =
- RISCVMnemonicSpellCheck(((RISCVOperand &)*Operands[0]).getToken(), FBS);
+ std::string Suggestion = RISCVMnemonicSpellCheck(
+ ((RISCVOperand &)*Operands[0]).getToken(), FBS, 0);
return Error(IDLoc, "unrecognized instruction mnemonic" + Suggestion);
}
case Match_InvalidOperand: {
SMLoc ErrorLoc = IDLoc;
- if (ErrorInfo != ~0U) {
+ if (ErrorInfo != ~0ULL) {
if (ErrorInfo >= Operands.size())
return Error(ErrorLoc, "too few operands for instruction");
@@ -1019,7 +1048,7 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// corresponding operand is missing.
if (Result > FIRST_TARGET_MATCH_RESULT_TY) {
SMLoc ErrorLoc = IDLoc;
- if (ErrorInfo != ~0U && ErrorInfo >= Operands.size())
+ if (ErrorInfo != ~0ULL && ErrorInfo >= Operands.size())
return Error(ErrorLoc, "too few operands for instruction");
}
@@ -1050,8 +1079,14 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if (isRV64())
return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 5) - 1);
return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 4) - 1);
+ case Match_InvalidUImm2:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 2) - 1);
+ case Match_InvalidUImm3:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 3) - 1);
case Match_InvalidUImm5:
return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 5) - 1);
+ case Match_InvalidUImm7:
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 7) - 1);
case Match_InvalidSImm5:
return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 4),
(1 << 4) - 1);
@@ -1835,8 +1870,10 @@ bool RISCVAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".option")
return parseDirectiveOption();
- else if (IDVal == ".attribute")
+ if (IDVal == ".attribute")
return parseDirectiveAttribute();
+ if (IDVal == ".insn")
+ return parseDirectiveInsn(DirectiveID.getLoc());
return true;
}
@@ -2027,113 +2064,35 @@ bool RISCVAsmParser::parseDirectiveAttribute() {
if (Tag == RISCVAttrs::ARCH) {
StringRef Arch = StringValue;
- if (Arch.consume_front("rv32"))
+ for (auto Feature : RISCVFeatureKV)
+ if (llvm::RISCVISAInfo::isSupportedExtensionFeature(Feature.Key))
+ clearFeatureBits(Feature.Value, Feature.Key);
+
+ auto ParseResult = llvm::RISCVISAInfo::parseArchString(
+ StringValue, /*EnableExperimentalExtension=*/true,
+ /*ExperimentalExtensionVersionCheck=*/false);
+ if (!ParseResult) {
+ std::string Buffer;
+ raw_string_ostream OutputErrMsg(Buffer);
+ handleAllErrors(ParseResult.takeError(), [&](llvm::StringError &ErrMsg) {
+ OutputErrMsg << "invalid arch name '" << Arch << "', "
+ << ErrMsg.getMessage();
+ });
+
+ return Error(ValueExprLoc, OutputErrMsg.str());
+ }
+ auto &ISAInfo = *ParseResult;
+
+ for (auto Feature : RISCVFeatureKV)
+ if (ISAInfo->hasExtension(Feature.Key))
+ setFeatureBits(Feature.Value, Feature.Key);
+
+ if (ISAInfo->getXLen() == 32)
clearFeatureBits(RISCV::Feature64Bit, "64bit");
- else if (Arch.consume_front("rv64"))
+ else if (ISAInfo->getXLen() == 64)
setFeatureBits(RISCV::Feature64Bit, "64bit");
else
return Error(ValueExprLoc, "bad arch string " + Arch);
-
- // .attribute arch overrides the current architecture, so unset all
- // currently enabled extensions
- clearFeatureBits(RISCV::FeatureRV32E, "e");
- clearFeatureBits(RISCV::FeatureStdExtM, "m");
- clearFeatureBits(RISCV::FeatureStdExtA, "a");
- clearFeatureBits(RISCV::FeatureStdExtF, "f");
- clearFeatureBits(RISCV::FeatureStdExtD, "d");
- clearFeatureBits(RISCV::FeatureStdExtC, "c");
- clearFeatureBits(RISCV::FeatureStdExtB, "experimental-b");
- clearFeatureBits(RISCV::FeatureStdExtV, "experimental-v");
- clearFeatureBits(RISCV::FeatureExtZfh, "experimental-zfh");
- clearFeatureBits(RISCV::FeatureExtZba, "experimental-zba");
- clearFeatureBits(RISCV::FeatureExtZbb, "experimental-zbb");
- clearFeatureBits(RISCV::FeatureExtZbc, "experimental-zbc");
- clearFeatureBits(RISCV::FeatureExtZbe, "experimental-zbe");
- clearFeatureBits(RISCV::FeatureExtZbf, "experimental-zbf");
- clearFeatureBits(RISCV::FeatureExtZbm, "experimental-zbm");
- clearFeatureBits(RISCV::FeatureExtZbp, "experimental-zbp");
- clearFeatureBits(RISCV::FeatureExtZbproposedc, "experimental-zbproposedc");
- clearFeatureBits(RISCV::FeatureExtZbr, "experimental-zbr");
- clearFeatureBits(RISCV::FeatureExtZbs, "experimental-zbs");
- clearFeatureBits(RISCV::FeatureExtZbt, "experimental-zbt");
- clearFeatureBits(RISCV::FeatureExtZvamo, "experimental-zvamo");
- clearFeatureBits(RISCV::FeatureStdExtZvlsseg, "experimental-zvlsseg");
-
- while (!Arch.empty()) {
- bool DropFirst = true;
- if (Arch[0] == 'i')
- clearFeatureBits(RISCV::FeatureRV32E, "e");
- else if (Arch[0] == 'e')
- setFeatureBits(RISCV::FeatureRV32E, "e");
- else if (Arch[0] == 'g') {
- clearFeatureBits(RISCV::FeatureRV32E, "e");
- setFeatureBits(RISCV::FeatureStdExtM, "m");
- setFeatureBits(RISCV::FeatureStdExtA, "a");
- setFeatureBits(RISCV::FeatureStdExtF, "f");
- setFeatureBits(RISCV::FeatureStdExtD, "d");
- } else if (Arch[0] == 'm')
- setFeatureBits(RISCV::FeatureStdExtM, "m");
- else if (Arch[0] == 'a')
- setFeatureBits(RISCV::FeatureStdExtA, "a");
- else if (Arch[0] == 'f')
- setFeatureBits(RISCV::FeatureStdExtF, "f");
- else if (Arch[0] == 'd') {
- setFeatureBits(RISCV::FeatureStdExtF, "f");
- setFeatureBits(RISCV::FeatureStdExtD, "d");
- } else if (Arch[0] == 'c') {
- setFeatureBits(RISCV::FeatureStdExtC, "c");
- } else if (Arch[0] == 'b') {
- setFeatureBits(RISCV::FeatureStdExtB, "experimental-b");
- } else if (Arch[0] == 'v') {
- setFeatureBits(RISCV::FeatureStdExtV, "experimental-v");
- } else if (Arch[0] == 's' || Arch[0] == 'x' || Arch[0] == 'z') {
- StringRef Ext =
- Arch.take_until([](char c) { return ::isdigit(c) || c == '_'; });
- if (Ext == "zba")
- setFeatureBits(RISCV::FeatureExtZba, "experimental-zba");
- else if (Ext == "zbb")
- setFeatureBits(RISCV::FeatureExtZbb, "experimental-zbb");
- else if (Ext == "zbc")
- setFeatureBits(RISCV::FeatureExtZbc, "experimental-zbc");
- else if (Ext == "zbe")
- setFeatureBits(RISCV::FeatureExtZbe, "experimental-zbe");
- else if (Ext == "zbf")
- setFeatureBits(RISCV::FeatureExtZbf, "experimental-zbf");
- else if (Ext == "zbm")
- setFeatureBits(RISCV::FeatureExtZbm, "experimental-zbm");
- else if (Ext == "zbp")
- setFeatureBits(RISCV::FeatureExtZbp, "experimental-zbp");
- else if (Ext == "zbproposedc")
- setFeatureBits(RISCV::FeatureExtZbproposedc,
- "experimental-zbproposedc");
- else if (Ext == "zbr")
- setFeatureBits(RISCV::FeatureExtZbr, "experimental-zbr");
- else if (Ext == "zbs")
- setFeatureBits(RISCV::FeatureExtZbs, "experimental-zbs");
- else if (Ext == "zbt")
- setFeatureBits(RISCV::FeatureExtZbt, "experimental-zbt");
- else if (Ext == "zfh")
- setFeatureBits(RISCV::FeatureExtZfh, "experimental-zfh");
- else if (Ext == "zvamo")
- setFeatureBits(RISCV::FeatureExtZvamo, "experimental-zvamo");
- else if (Ext == "zvlsseg")
- setFeatureBits(RISCV::FeatureStdExtZvlsseg, "experimental-zvlsseg");
- else
- return Error(ValueExprLoc, "bad arch string " + Ext);
- Arch = Arch.drop_until([](char c) { return ::isdigit(c) || c == '_'; });
- DropFirst = false;
- } else
- return Error(ValueExprLoc, "bad arch string " + Arch);
-
- if (DropFirst)
- Arch = Arch.drop_front(1);
- int major = 0;
- int minor = 0;
- Arch.consumeInteger(10, major);
- Arch.consume_front("p");
- Arch.consumeInteger(10, minor);
- Arch = Arch.drop_while([](char c) { return c == '_'; });
- }
}
if (IsIntegerValue)
@@ -2142,64 +2101,63 @@ bool RISCVAsmParser::parseDirectiveAttribute() {
if (Tag != RISCVAttrs::ARCH) {
getTargetStreamer().emitTextAttribute(Tag, StringValue);
} else {
- std::string formalArchStr = "rv32";
- if (getFeatureBits(RISCV::Feature64Bit))
- formalArchStr = "rv64";
- if (getFeatureBits(RISCV::FeatureRV32E))
- formalArchStr = (Twine(formalArchStr) + "e1p9").str();
- else
- formalArchStr = (Twine(formalArchStr) + "i2p0").str();
-
- if (getFeatureBits(RISCV::FeatureStdExtM))
- formalArchStr = (Twine(formalArchStr) + "_m2p0").str();
- if (getFeatureBits(RISCV::FeatureStdExtA))
- formalArchStr = (Twine(formalArchStr) + "_a2p0").str();
- if (getFeatureBits(RISCV::FeatureStdExtF))
- formalArchStr = (Twine(formalArchStr) + "_f2p0").str();
- if (getFeatureBits(RISCV::FeatureStdExtD))
- formalArchStr = (Twine(formalArchStr) + "_d2p0").str();
- if (getFeatureBits(RISCV::FeatureStdExtC))
- formalArchStr = (Twine(formalArchStr) + "_c2p0").str();
- if (getFeatureBits(RISCV::FeatureStdExtB))
- formalArchStr = (Twine(formalArchStr) + "_b0p93").str();
- if (getFeatureBits(RISCV::FeatureStdExtV))
- formalArchStr = (Twine(formalArchStr) + "_v0p10").str();
- if (getFeatureBits(RISCV::FeatureExtZfh))
- formalArchStr = (Twine(formalArchStr) + "_zfh0p1").str();
- if (getFeatureBits(RISCV::FeatureExtZba))
- formalArchStr = (Twine(formalArchStr) + "_zba0p93").str();
- if (getFeatureBits(RISCV::FeatureExtZbb))
- formalArchStr = (Twine(formalArchStr) + "_zbb0p93").str();
- if (getFeatureBits(RISCV::FeatureExtZbc))
- formalArchStr = (Twine(formalArchStr) + "_zbc0p93").str();
- if (getFeatureBits(RISCV::FeatureExtZbe))
- formalArchStr = (Twine(formalArchStr) + "_zbe0p93").str();
- if (getFeatureBits(RISCV::FeatureExtZbf))
- formalArchStr = (Twine(formalArchStr) + "_zbf0p93").str();
- if (getFeatureBits(RISCV::FeatureExtZbm))
- formalArchStr = (Twine(formalArchStr) + "_zbm0p93").str();
- if (getFeatureBits(RISCV::FeatureExtZbp))
- formalArchStr = (Twine(formalArchStr) + "_zbp0p93").str();
- if (getFeatureBits(RISCV::FeatureExtZbproposedc))
- formalArchStr = (Twine(formalArchStr) + "_zbproposedc0p93").str();
- if (getFeatureBits(RISCV::FeatureExtZbr))
- formalArchStr = (Twine(formalArchStr) + "_zbr0p93").str();
- if (getFeatureBits(RISCV::FeatureExtZbs))
- formalArchStr = (Twine(formalArchStr) + "_zbs0p93").str();
- if (getFeatureBits(RISCV::FeatureExtZbt))
- formalArchStr = (Twine(formalArchStr) + "_zbt0p93").str();
- if (getFeatureBits(RISCV::FeatureExtZvamo))
- formalArchStr = (Twine(formalArchStr) + "_zvamo0p10").str();
- if (getFeatureBits(RISCV::FeatureStdExtZvlsseg))
- formalArchStr = (Twine(formalArchStr) + "_zvlsseg0p10").str();
-
- getTargetStreamer().emitTextAttribute(Tag, formalArchStr);
+ std::vector<std::string> FeatureVector;
+ RISCVFeatures::toFeatureVector(FeatureVector, getSTI().getFeatureBits());
+
+ // Parse the feature list with RISCVISAInfo.
+ unsigned XLen = getFeatureBits(RISCV::Feature64Bit) ? 64 : 32;
+ auto ParseResult = llvm::RISCVISAInfo::parseFeatures(XLen, FeatureVector);
+ if (!ParseResult) {
+ std::string Buffer;
+ raw_string_ostream OutputErrMsg(Buffer);
+ handleAllErrors(ParseResult.takeError(),
+ [&](llvm::StringError &ErrMsg) {
+ OutputErrMsg << ErrMsg.getMessage();
+ });
+
+ return Error(ValueExprLoc, OutputErrMsg.str());
+ }
+ auto &ISAInfo = *ParseResult;
+
+ // Then emit the arch string.
+ getTargetStreamer().emitTextAttribute(Tag, ISAInfo->toString());
}
}
return false;
}
+/// parseDirectiveInsn
+/// ::= .insn [ format encoding, (operands (, operands)*) ]
+bool RISCVAsmParser::parseDirectiveInsn(SMLoc L) {
+ MCAsmParser &Parser = getParser();
+
+ // Expect instruction format as identifier.
+ StringRef Format;
+ SMLoc ErrorLoc = Parser.getTok().getLoc();
+ if (Parser.parseIdentifier(Format))
+ return Error(ErrorLoc, "expected instruction format");
+
+ if (Format != "r" && Format != "r4" && Format != "i" && Format != "b" &&
+ Format != "sb" && Format != "u" && Format != "j" && Format != "uj" &&
+ Format != "s")
+ return Error(ErrorLoc, "invalid instruction format");
+
+ std::string FormatName = (".insn_" + Format).str();
+
+ ParseInstructionInfo Info;
+ SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> Operands;
+
+ if (ParseInstruction(Info, FormatName, L, Operands))
+ return true;
+
+ unsigned Opcode;
+ uint64_t ErrorInfo;
+ return MatchAndEmitInstruction(L, Opcode, Operands, Parser.getStreamer(),
+ ErrorInfo,
+ /*MatchingInlineAsm=*/false);
+}
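Editorial note: a usage sketch for this directive, assuming the operand order mirrors the GNU assembler's compact .insn forms (an assumption, not taken from this patch), is ".insn r 0x33, 0x0, 0x0, a0, a1, a2", which would assemble the same R-type encoding as "add a0, a1, a2".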
+
void RISCVAsmParser::emitToStreamer(MCStreamer &S, const MCInst &Inst) {
MCInst CInst;
bool Res = compressInst(CInst, Inst, getSTI(), S.getContext());
@@ -2223,6 +2181,11 @@ void RISCVAsmParser::emitLoadImm(MCRegister DestReg, int64_t Value,
.addReg(DestReg)
.addReg(SrcReg)
.addReg(RISCV::X0));
+ } else if (Inst.Opc == RISCV::SH1ADD || Inst.Opc == RISCV::SH2ADD ||
+ Inst.Opc == RISCV::SH3ADD) {
+ emitToStreamer(
+ Out, MCInstBuilder(Inst.Opc).addReg(DestReg).addReg(SrcReg).addReg(
+ SrcReg));
} else {
emitToStreamer(
Out, MCInstBuilder(Inst.Opc).addReg(DestReg).addReg(SrcReg).addImm(
@@ -2339,10 +2302,10 @@ void RISCVAsmParser::emitLoadStoreSymbol(MCInst &Inst, unsigned Opcode,
//
// TmpLabel: AUIPC tmp, %pcrel_hi(symbol)
// [S|L]X rd, %pcrel_lo(TmpLabel)(tmp)
- MCOperand DestReg = Inst.getOperand(0);
+ unsigned DestRegOpIdx = HasTmpReg ? 1 : 0;
+ MCOperand DestReg = Inst.getOperand(DestRegOpIdx);
unsigned SymbolOpIdx = HasTmpReg ? 2 : 1;
- unsigned TmpRegOpIdx = HasTmpReg ? 1 : 0;
- MCOperand TmpReg = Inst.getOperand(TmpRegOpIdx);
+ MCOperand TmpReg = Inst.getOperand(0);
const MCExpr *Symbol = Inst.getOperand(SymbolOpIdx).getExpr();
emitAuipcInstPair(DestReg, TmpReg, Symbol, RISCVMCExpr::VK_RISCV_PCREL_HI,
Opcode, IDLoc, Out);
@@ -2414,7 +2377,7 @@ void RISCVAsmParser::emitVMSGE(MCInst &Inst, unsigned Opcode, SMLoc IDLoc,
// masked va >= x, vd == v0
//
// pseudoinstruction: vmsge{u}.vx vd, va, x, v0.t, vt
- // expansion: vmslt{u}.vx vt, va, x; vmandnot.mm vd, vd, vt
+ // expansion: vmslt{u}.vx vt, va, x; vmandn.mm vd, vd, vt
assert(Inst.getOperand(0).getReg() == RISCV::V0 &&
"The destination register should be V0.");
assert(Inst.getOperand(1).getReg() != RISCV::V0 &&
@@ -2424,7 +2387,7 @@ void RISCVAsmParser::emitVMSGE(MCInst &Inst, unsigned Opcode, SMLoc IDLoc,
.addOperand(Inst.getOperand(2))
.addOperand(Inst.getOperand(3))
.addOperand(Inst.getOperand(4)));
- emitToStreamer(Out, MCInstBuilder(RISCV::VMANDNOT_MM)
+ emitToStreamer(Out, MCInstBuilder(RISCV::VMANDN_MM)
.addOperand(Inst.getOperand(0))
.addOperand(Inst.getOperand(0))
.addOperand(Inst.getOperand(1)));
@@ -2432,7 +2395,7 @@ void RISCVAsmParser::emitVMSGE(MCInst &Inst, unsigned Opcode, SMLoc IDLoc,
// masked va >= x, any vd
//
// pseudoinstruction: vmsge{u}.vx vd, va, x, v0.t, vt
- // expansion: vmslt{u}.vx vt, va, x; vmandnot.mm vt, v0, vt; vmandnot.mm vd,
+ // expansion: vmslt{u}.vx vt, va, x; vmandn.mm vt, v0, vt; vmandn.mm vd,
// vd, v0; vmor.mm vd, vt, vd
assert(Inst.getOperand(1).getReg() != RISCV::V0 &&
"The temporary vector register should not be V0.");
@@ -2441,11 +2404,11 @@ void RISCVAsmParser::emitVMSGE(MCInst &Inst, unsigned Opcode, SMLoc IDLoc,
.addOperand(Inst.getOperand(2))
.addOperand(Inst.getOperand(3))
.addReg(RISCV::NoRegister));
- emitToStreamer(Out, MCInstBuilder(RISCV::VMANDNOT_MM)
+ emitToStreamer(Out, MCInstBuilder(RISCV::VMANDN_MM)
.addOperand(Inst.getOperand(1))
.addReg(RISCV::V0)
.addOperand(Inst.getOperand(1)));
- emitToStreamer(Out, MCInstBuilder(RISCV::VMANDNOT_MM)
+ emitToStreamer(Out, MCInstBuilder(RISCV::VMANDN_MM)
.addOperand(Inst.getOperand(0))
.addOperand(Inst.getOperand(0))
.addReg(RISCV::V0));
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 504a78d91f32..ff96b2b254ca 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -20,8 +20,8 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Endian.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -449,19 +449,6 @@ DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
}
- if (STI.getFeatureBits()[RISCV::FeatureExtZbproposedc] &&
- STI.getFeatureBits()[RISCV::FeatureStdExtC]) {
- LLVM_DEBUG(
- dbgs() << "Trying RVBC32 table (BitManip 16-bit Instruction):\n");
- // Calling the auto-generated decoder function.
- Result = decodeInstruction(DecoderTableRVBC16, MI, Insn, Address,
- this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 2;
- return Result;
- }
- }
-
LLVM_DEBUG(dbgs() << "Trying RISCV_C table (16-bit Instruction):\n");
// Calling the auto-generated decoder function.
Result = decodeInstruction(DecoderTable16, MI, Insn, Address, this, STI);
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index b93197e713e5..514789b3f645 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -352,8 +352,9 @@ bool RISCVAsmBackend::mayNeedRelaxation(const MCInst &Inst,
return getRelaxedOpcode(Inst.getOpcode()) != Inst.getOpcode();
}
-bool RISCVAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
- bool HasStdExtC = STI.getFeatureBits()[RISCV::FeatureStdExtC];
+bool RISCVAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const {
+ bool HasStdExtC = STI->getFeatureBits()[RISCV::FeatureStdExtC];
unsigned MinNopLen = HasStdExtC ? 2 : 4;
if ((Count % MinNopLen) != 0)
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
index e1628673419a..f04d2912f09d 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
@@ -99,7 +99,8 @@ public:
bool relaxDwarfCFA(MCDwarfCallFrameFragment &DF, MCAsmLayout &Layout,
bool &WasRelaxed) const override;
- bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const override;
const MCTargetOptions &getTargetOptions() const { return TargetOptions; }
RISCVABI::ABI getTargetABI() const { return TargetABI; }
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
index 60e86093d9f4..0aba18b20f0d 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
@@ -14,9 +14,14 @@
#include "RISCVBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/RISCVISAInfo.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
+
+extern const SubtargetFeatureKV RISCVFeatureKV[RISCV::NumSubtargetFeatures];
+
namespace RISCVSysReg {
#define GET_SysRegsList_IMPL
#include "RISCVGenSearchableTables.inc"
@@ -96,6 +101,15 @@ void validate(const Triple &TT, const FeatureBitset &FeatureBits) {
report_fatal_error("RV32E can't be enabled for an RV64 target");
}
+void toFeatureVector(std::vector<std::string> &FeatureVector,
+ const FeatureBitset &FeatureBits) {
+ for (auto Feature : RISCVFeatureKV) {
+ if (FeatureBits[Feature.Value] &&
+ llvm::RISCVISAInfo::isSupportedExtensionFeature(Feature.Key))
+ FeatureVector.push_back(std::string("+") + Feature.Key);
+ }
+}
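A minimal sketch of the result, assuming an RV64GC-style feature set (illustration only, not part of the patch):

  std::vector<std::string> FV;
  RISCVFeatures::toFeatureVector(FV, STI.getFeatureBits());
  // FV now holds roughly {"+m", "+a", "+f", "+d", "+c"}; non-extension
  // features such as "64bit" are dropped by isSupportedExtensionFeature().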
+
} // namespace RISCVFeatures
// Encode VTYPE into the binary format used by the VSETVLI instruction which
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index 9bdd2003cb15..d8f4403c824f 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -68,14 +68,25 @@ enum {
HasMergeOpMask = 1 << HasMergeOpShift,
// Does this instruction have a SEW operand. It will be the last explicit
- // operand. Used by RVV Pseudos.
+ // operand unless there is a vector policy operand. Used by RVV Pseudos.
HasSEWOpShift = HasMergeOpShift + 1,
HasSEWOpMask = 1 << HasSEWOpShift,
// Does this instruction have a VL operand. It will be the second to last
- // explicit operand. Used by RVV Pseudos.
+ // explicit operand unless there is a vector policy operand. Used by RVV
+ // Pseudos.
HasVLOpShift = HasSEWOpShift + 1,
HasVLOpMask = 1 << HasVLOpShift,
+
+ // Does this instruction have a vector policy operand. It will be the last
+ // explicit operand. Used by RVV Pseudos.
+ HasVecPolicyOpShift = HasVLOpShift + 1,
+ HasVecPolicyOpMask = 1 << HasVecPolicyOpShift,
+
+ // Is this instruction a vector widening reduction instruction. Used by RVV
+ // Pseudos.
+ IsRVVWideningReductionShift = HasVecPolicyOpShift + 1,
+ IsRVVWideningReductionMask = 1 << IsRVVWideningReductionShift,
};
// Match with the definitions in RISCVInstrFormatsV.td
@@ -97,6 +108,11 @@ enum VLMUL : uint8_t {
LMUL_F2
};
+enum {
+ TAIL_UNDISTURBED = 0,
+ TAIL_AGNOSTIC = 1,
+};
+
// Helper functions to read TSFlags.
/// \returns the format of the instruction.
static inline unsigned getFormat(uint64_t TSFlags) {
@@ -131,6 +147,14 @@ static inline bool hasSEWOp(uint64_t TSFlags) {
static inline bool hasVLOp(uint64_t TSFlags) {
return TSFlags & HasVLOpMask;
}
+/// \returns true if there is a vector policy operand for this instruction.
+static inline bool hasVecPolicyOp(uint64_t TSFlags) {
+ return TSFlags & HasVecPolicyOpMask;
+}
+/// \returns true if it is a vector widening reduction instruction.
+static inline bool isRVVWideningReduction(uint64_t TSFlags) {
+ return TSFlags & IsRVVWideningReductionMask;
+}
// RISC-V Specific Machine Operand Flags
enum {
@@ -158,8 +182,11 @@ enum {
namespace RISCVOp {
enum OperandType : unsigned {
OPERAND_FIRST_RISCV_IMM = MCOI::OPERAND_FIRST_TARGET,
- OPERAND_UIMM4 = OPERAND_FIRST_RISCV_IMM,
+ OPERAND_UIMM2 = OPERAND_FIRST_RISCV_IMM,
+ OPERAND_UIMM3,
+ OPERAND_UIMM4,
OPERAND_UIMM5,
+ OPERAND_UIMM7,
OPERAND_UIMM12,
OPERAND_SIMM12,
OPERAND_UIMM20,
@@ -306,6 +333,10 @@ namespace RISCVFeatures {
// triple. Exits with report_fatal_error if not.
void validate(const Triple &TT, const FeatureBitset &FeatureBits);
+// Convert FeatureBitset to FeatureVector.
+void toFeatureVector(std::vector<std::string> &FeatureVector,
+ const FeatureBitset &FeatureBits);
+
} // namespace RISCVFeatures
namespace RISCVVType {
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 1ef276b10100..14d0191a505f 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -358,7 +358,7 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
}
} else if (Kind == MCExpr::SymbolRef &&
cast<MCSymbolRefExpr>(Expr)->getKind() == MCSymbolRefExpr::VK_None) {
- if (Desc.getOpcode() == RISCV::JAL) {
+ if (MIFrm == RISCVII::InstFormatJ) {
FixupKind = RISCV::fixup_riscv_jal;
} else if (MIFrm == RISCVII::InstFormatB) {
FixupKind = RISCV::fixup_riscv_branch;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCObjectFileInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCObjectFileInfo.cpp
new file mode 100644
index 000000000000..9c9d9221578c
--- /dev/null
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCObjectFileInfo.cpp
@@ -0,0 +1,22 @@
+//===-- RISCVMCObjectFileInfo.cpp - RISCV object file properties ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the RISCVMCObjectFileInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCVMCObjectFileInfo.h"
+#include "RISCVMCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+
+using namespace llvm;
+
+unsigned RISCVMCObjectFileInfo::getTextSectionAlignment() const {
+ const MCSubtargetInfo *STI = getContext().getSubtargetInfo();
+ return STI->hasFeature(RISCV::FeatureStdExtC) ? 2 : 4;
+}
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCObjectFileInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCObjectFileInfo.h
new file mode 100644
index 000000000000..2f6b10229864
--- /dev/null
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCObjectFileInfo.h
@@ -0,0 +1,27 @@
+//===-- RISCVMCObjectFileInfo.h - RISCV object file Info -------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the RISCVMCObjectFileInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCOBJECTFILEINFO_H
+#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCOBJECTFILEINFO_H
+
+#include "llvm/MC/MCObjectFileInfo.h"
+
+namespace llvm {
+
+class RISCVMCObjectFileInfo : public MCObjectFileInfo {
+public:
+ unsigned getTextSectionAlignment() const override;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index 38c32539833c..07c2be624932 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -15,6 +15,7 @@
#include "RISCVELFStreamer.h"
#include "RISCVInstPrinter.h"
#include "RISCVMCAsmInfo.h"
+#include "RISCVMCObjectFileInfo.h"
#include "RISCVTargetStreamer.h"
#include "TargetInfo/RISCVTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
@@ -23,12 +24,13 @@
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
#include "RISCVGenInstrInfo.inc"
@@ -65,6 +67,14 @@ static MCAsmInfo *createRISCVMCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
+static MCObjectFileInfo *
+createRISCVMCObjectFileInfo(MCContext &Ctx, bool PIC,
+ bool LargeCodeModel = false) {
+ MCObjectFileInfo *MOFI = new RISCVMCObjectFileInfo();
+ MOFI->initMCObjectFileInfo(Ctx, PIC, LargeCodeModel);
+ return MOFI;
+}
+
static MCSubtargetInfo *createRISCVMCSubtargetInfo(const Triple &TT,
StringRef CPU, StringRef FS) {
if (CPU.empty())
@@ -155,6 +165,7 @@ MCStreamer *createRISCVELFStreamer(const Triple &T, MCContext &Context,
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTargetMC() {
for (Target *T : {&getTheRISCV32Target(), &getTheRISCV64Target()}) {
TargetRegistry::RegisterMCAsmInfo(*T, createRISCVMCAsmInfo);
+ TargetRegistry::RegisterMCObjectFileInfo(*T, createRISCVMCObjectFileInfo);
TargetRegistry::RegisterMCInstrInfo(*T, createRISCVMCInstrInfo);
TargetRegistry::RegisterMCRegInfo(*T, createRISCVMCRegisterInfo);
TargetRegistry::RegisterMCAsmBackend(*T, createRISCVAsmBackend);
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
index 2ca5eeb8392e..0ee6d8de78c9 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
@@ -20,7 +20,8 @@ static int getInstSeqCost(RISCVMatInt::InstSeq &Res, bool HasRVC) {
for (auto Instr : Res) {
bool Compressed;
switch (Instr.Opc) {
- default: llvm_unreachable("Unexpected opcode");
+ default:
+ llvm_unreachable("Unexpected opcode");
case RISCV::SLLI:
case RISCV::SRLI:
Compressed = true;
@@ -77,7 +78,7 @@ static void generateInstSeqImpl(int64_t Val,
assert(IsRV64 && "Can't emit >32-bit imm for non-RV64 target");
// In the worst case, for a full 64-bit constant, a sequence of 8 instructions
- // (i.e., LUI+ADDIW+SLLI+ADDI+SLLI+ADDI+SLLI+ADDI) has to be emmitted. Note
+ // (i.e., LUI+ADDIW+SLLI+ADDI+SLLI+ADDI+SLLI+ADDI) has to be emitted. Note
// that the first two instructions (LUI+ADDIW) can contribute up to 32 bits
// while the following ADDI instructions contribute up to 12 bits each.
//
@@ -106,15 +107,36 @@ static void generateInstSeqImpl(int64_t Val,
// If the remaining bits don't fit in 12 bits, we might be able to reduce the
// shift amount in order to use LUI which will zero the lower 12 bits.
- if (ShiftAmount > 12 && !isInt<12>(Hi52) && isInt<32>((uint64_t)Hi52 << 12)) {
- // Reduce the shift amount and add zeros to the LSBs so it will match LUI.
- ShiftAmount -= 12;
- Hi52 = (uint64_t)Hi52 << 12;
+ bool Unsigned = false;
+ if (ShiftAmount > 12 && !isInt<12>(Hi52)) {
+ if (isInt<32>((uint64_t)Hi52 << 12)) {
+ // Reduce the shift amount and add zeros to the LSBs so it will match LUI.
+ ShiftAmount -= 12;
+ Hi52 = (uint64_t)Hi52 << 12;
+ } else if (isUInt<32>((uint64_t)Hi52 << 12) &&
+ ActiveFeatures[RISCV::FeatureStdExtZba]) {
+ // Reduce the shift amount and add zeros to the LSBs so it will match
+ // LUI, then shift left with SLLI.UW to clear the upper 32 set bits.
+ ShiftAmount -= 12;
+ Hi52 = ((uint64_t)Hi52 << 12) | (0xffffffffull << 32);
+ Unsigned = true;
+ }
+ }
+
+ // Try to use SLLIUW for Hi52 when it is uint32 but not int32.
+ if (isUInt<32>((uint64_t)Hi52) && !isInt<32>((uint64_t)Hi52) &&
+ ActiveFeatures[RISCV::FeatureStdExtZba]) {
+ // Use LUI+ADDI or LUI to compose, then clear the upper 32 bits with SLLIUW.
+ Hi52 = ((uint64_t)Hi52) | (0xffffffffull << 32);
+ Unsigned = true;
}
generateInstSeqImpl(Hi52, ActiveFeatures, Res);
- Res.push_back(RISCVMatInt::Inst(RISCV::SLLI, ShiftAmount));
+ if (Unsigned)
+ Res.push_back(RISCVMatInt::Inst(RISCV::SLLIUW, ShiftAmount));
+ else
+ Res.push_back(RISCVMatInt::Inst(RISCV::SLLI, ShiftAmount));
if (Lo12)
Res.push_back(RISCVMatInt::Inst(RISCV::ADDI, Lo12));
}
@@ -165,7 +187,7 @@ InstSeq generateInstSeq(int64_t Val, const FeatureBitset &ActiveFeatures) {
// If we have exactly 32 leading zeros and Zba, we can try using zext.w at
// the end of the sequence.
- if (LeadingZeros == 32 && ActiveFeatures[RISCV::FeatureExtZba]) {
+ if (LeadingZeros == 32 && ActiveFeatures[RISCV::FeatureStdExtZba]) {
// Try replacing upper bits with 1.
uint64_t LeadingOnesVal = Val | maskLeadingOnes<uint64_t>(LeadingZeros);
TmpSeq.clear();
@@ -182,12 +204,119 @@ InstSeq generateInstSeq(int64_t Val, const FeatureBitset &ActiveFeatures) {
}
}
+ // Perform optimization with BCLRI/BSETI in the Zbs extension.
+ if (Res.size() > 2 && ActiveFeatures[RISCV::FeatureStdExtZbs]) {
+ assert(ActiveFeatures[RISCV::Feature64Bit] &&
+ "Expected RV32 to only need 2 instructions");
+
+ // 1. For values in range 0xffffffff 7fffffff ~ 0xffffffff 00000000,
+ // call generateInstSeqImpl with Val|0x80000000 (which is expected to be
+ // an int32), then emit (BCLRI r, 31).
+ // 2. For values in range 0x80000000 ~ 0xffffffff, call generateInstSeqImpl
+ // with Val&~0x80000000 (which is expected to be an int32), then
+ // emit (BSETI r, 31).
+ int64_t NewVal;
+ unsigned Opc;
+ if (Val < 0) {
+ Opc = RISCV::BCLRI;
+ NewVal = Val | 0x80000000ll;
+ } else {
+ Opc = RISCV::BSETI;
+ NewVal = Val & ~0x80000000ll;
+ }
+ if (isInt<32>(NewVal)) {
+ RISCVMatInt::InstSeq TmpSeq;
+ generateInstSeqImpl(NewVal, ActiveFeatures, TmpSeq);
+ TmpSeq.push_back(RISCVMatInt::Inst(Opc, 31));
+ if (TmpSeq.size() < Res.size())
+ Res = TmpSeq;
+ }
+
+ // Try to use BCLRI for upper 32 bits if the original lower 32 bits are
+ // negative int32, or use BSETI for upper 32 bits if the original lower
+ // 32 bits are positive int32.
+ int32_t Lo = Val;
+ uint32_t Hi = Val >> 32;
+ Opc = 0;
+ RISCVMatInt::InstSeq TmpSeq;
+ generateInstSeqImpl(Lo, ActiveFeatures, TmpSeq);
+ // Check if it is profitable to use BCLRI/BSETI.
+ if (Lo > 0 && TmpSeq.size() + countPopulation(Hi) < Res.size()) {
+ Opc = RISCV::BSETI;
+ } else if (Lo < 0 && TmpSeq.size() + countPopulation(~Hi) < Res.size()) {
+ Opc = RISCV::BCLRI;
+ Hi = ~Hi;
+ }
+ // Search for each bit and build corresponding BCLRI/BSETI.
+ if (Opc > 0) {
+ while (Hi != 0) {
+ unsigned Bit = countTrailingZeros(Hi);
+ TmpSeq.push_back(RISCVMatInt::Inst(Opc, Bit + 32));
+ Hi &= ~(1 << Bit);
+ }
+ if (TmpSeq.size() < Res.size())
+ Res = TmpSeq;
+ }
+ }
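A worked example of the Zbs path (editorial sketch, not from the patch): for Val = 0x100000001 the generic expansion is ADDI rd, x0, 1; SLLI rd, rd, 32; ADDI rd, rd, 1, while the code above materializes the low half with ADDI rd, x0, 1 and sets the single upper bit with BSETI rd, rd, 32, saving one instruction.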
+
+ // Perform optimization with SH*ADD in the Zba extension.
+ if (Res.size() > 2 && ActiveFeatures[RISCV::FeatureStdExtZba]) {
+ assert(ActiveFeatures[RISCV::Feature64Bit] &&
+ "Expected RV32 to only need 2 instructions");
+ int64_t Div = 0;
+ unsigned Opc = 0;
+ RISCVMatInt::InstSeq TmpSeq;
+ // Select the opcode and divisor.
+ if ((Val % 3) == 0 && isInt<32>(Val / 3)) {
+ Div = 3;
+ Opc = RISCV::SH1ADD;
+ } else if ((Val % 5) == 0 && isInt<32>(Val / 5)) {
+ Div = 5;
+ Opc = RISCV::SH2ADD;
+ } else if ((Val % 9) == 0 && isInt<32>(Val / 9)) {
+ Div = 9;
+ Opc = RISCV::SH3ADD;
+ }
+ // Build the new instruction sequence.
+ if (Div > 0) {
+ generateInstSeqImpl(Val / Div, ActiveFeatures, TmpSeq);
+ TmpSeq.push_back(RISCVMatInt::Inst(Opc, 0));
+ if (TmpSeq.size() < Res.size())
+ Res = TmpSeq;
+ }
+ // Try to use LUI+SH*ADD+ADDI.
+ int64_t Hi52 = ((uint64_t)Val + 0x800ull) & ~0xfffull;
+ int64_t Lo12 = SignExtend64<12>(Val);
+ Div = 0;
+ if (isInt<32>(Hi52 / 3) && (Hi52 % 3) == 0) {
+ Div = 3;
+ Opc = RISCV::SH1ADD;
+ } else if (isInt<32>(Hi52 / 5) && (Hi52 % 5) == 0) {
+ Div = 5;
+ Opc = RISCV::SH2ADD;
+ } else if (isInt<32>(Hi52 / 9) && (Hi52 % 9) == 0) {
+ Div = 9;
+ Opc = RISCV::SH3ADD;
+ }
+ // Build the new instruction sequence.
+ if (Div > 0) {
+ // A Val whose Lo12 is zero (i.e. Val equals Hi52) should already have been
+ // lowered to LUI+SH*ADD by the previous optimization.
+ assert(Lo12 != 0 &&
+ "unexpected instruction sequence for immediate materialisation");
+ generateInstSeqImpl(Hi52 / Div, ActiveFeatures, TmpSeq);
+ TmpSeq.push_back(RISCVMatInt::Inst(Opc, 0));
+ TmpSeq.push_back(RISCVMatInt::Inst(RISCV::ADDI, Lo12));
+ if (TmpSeq.size() < Res.size())
+ Res = TmpSeq;
+ }
+ }
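A worked example of the SH*ADD path (editorial sketch): Val = 0x17FFFD003 = 3 * 0x7FFFF001 is divisible by 3 with an int32 quotient, so the sequence becomes LUI 0x7ffff; ADDIW 1; SH1ADD rd, rd, rd (rd + (rd << 1) = 3 * rd), three instructions instead of a four-instruction LUI/ADDIW/SLLI/ADDI expansion.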
+
return Res;
}
int getIntMatCost(const APInt &Val, unsigned Size,
- const FeatureBitset &ActiveFeatures,
- bool CompressionCost) {
+ const FeatureBitset &ActiveFeatures, bool CompressionCost) {
bool IsRV64 = ActiveFeatures[RISCV::Feature64Bit];
bool HasRVC = CompressionCost && ActiveFeatures[RISCV::FeatureStdExtC];
int PlatRegSize = IsRV64 ? 64 : 32;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
index 13c4b84aa300..2f016374e6a2 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
@@ -11,9 +11,11 @@
//===----------------------------------------------------------------------===//
#include "RISCVTargetStreamer.h"
+#include "RISCVBaseInfo.h"
#include "RISCVMCTargetDesc.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/RISCVAttributes.h"
+#include "llvm/Support/RISCVISAInfo.h"
using namespace llvm;
@@ -43,57 +45,19 @@ void RISCVTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) {
else
emitAttribute(RISCVAttrs::STACK_ALIGN, RISCVAttrs::ALIGN_16);
- std::string Arch = "rv32";
- if (STI.hasFeature(RISCV::Feature64Bit))
- Arch = "rv64";
- if (STI.hasFeature(RISCV::FeatureRV32E))
- Arch += "e1p9";
- else
- Arch += "i2p0";
- if (STI.hasFeature(RISCV::FeatureStdExtM))
- Arch += "_m2p0";
- if (STI.hasFeature(RISCV::FeatureStdExtA))
- Arch += "_a2p0";
- if (STI.hasFeature(RISCV::FeatureStdExtF))
- Arch += "_f2p0";
- if (STI.hasFeature(RISCV::FeatureStdExtD))
- Arch += "_d2p0";
- if (STI.hasFeature(RISCV::FeatureStdExtC))
- Arch += "_c2p0";
- if (STI.hasFeature(RISCV::FeatureStdExtB))
- Arch += "_b0p93";
- if (STI.hasFeature(RISCV::FeatureStdExtV))
- Arch += "_v0p10";
- if (STI.hasFeature(RISCV::FeatureExtZfh))
- Arch += "_zfh0p1";
- if (STI.hasFeature(RISCV::FeatureExtZba))
- Arch += "_zba0p93";
- if (STI.hasFeature(RISCV::FeatureExtZbb))
- Arch += "_zbb0p93";
- if (STI.hasFeature(RISCV::FeatureExtZbc))
- Arch += "_zbc0p93";
- if (STI.hasFeature(RISCV::FeatureExtZbe))
- Arch += "_zbe0p93";
- if (STI.hasFeature(RISCV::FeatureExtZbf))
- Arch += "_zbf0p93";
- if (STI.hasFeature(RISCV::FeatureExtZbm))
- Arch += "_zbm0p93";
- if (STI.hasFeature(RISCV::FeatureExtZbp))
- Arch += "_zbp0p93";
- if (STI.hasFeature(RISCV::FeatureExtZbproposedc))
- Arch += "_zbproposedc0p93";
- if (STI.hasFeature(RISCV::FeatureExtZbr))
- Arch += "_zbr0p93";
- if (STI.hasFeature(RISCV::FeatureExtZbs))
- Arch += "_zbs0p93";
- if (STI.hasFeature(RISCV::FeatureExtZbt))
- Arch += "_zbt0p93";
- if (STI.hasFeature(RISCV::FeatureExtZvamo))
- Arch += "_zvamo0p10";
- if (STI.hasFeature(RISCV::FeatureStdExtZvlsseg))
- Arch += "_zvlsseg0p10";
-
- emitTextAttribute(RISCVAttrs::ARCH, Arch);
+ unsigned XLen = STI.hasFeature(RISCV::Feature64Bit) ? 64 : 32;
+ std::vector<std::string> FeatureVector;
+ RISCVFeatures::toFeatureVector(FeatureVector, STI.getFeatureBits());
+
+ auto ParseResult = llvm::RISCVISAInfo::parseFeatures(XLen, FeatureVector);
+ if (!ParseResult) {
+ /* Any errors in the feature list should have been handled earlier. */
+ consumeError(ParseResult.takeError());
+ llvm_unreachable("Parsing feature error when emitTargetAttributes?");
+ } else {
+ auto &ISAInfo = *ParseResult;
+ emitTextAttribute(RISCVAttrs::ARCH, ISAInfo->toString());
+ }
}
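For an RV64GC subtarget the emitted attribute remains an ISA string of the same shape the deleted code concatenated by hand, for example rv64i2p0_m2p0_a2p0_f2p0_d2p0_c2p0; the canonical ordering and version numbers are now owned by RISCVISAInfo::toString().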
// This part is for ascii assembly output
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index ef1f97067e12..b415c9f35e7f 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -37,6 +37,9 @@ bool LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM);
+FunctionPass *createRISCVGatherScatterLoweringPass();
+void initializeRISCVGatherScatterLoweringPass(PassRegistry &);
+
FunctionPass *createRISCVMergeBaseOffsetOptPass();
void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &);
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index 52e8d8cdc774..772a4f8ecd53 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -41,12 +41,20 @@ def HasStdExtD : Predicate<"Subtarget->hasStdExtD()">,
AssemblerPredicate<(all_of FeatureStdExtD),
"'D' (Double-Precision Floating-Point)">;
-def FeatureExtZfh
+def FeatureStdExtZfhmin
+ : SubtargetFeature<"experimental-zfhmin", "HasStdExtZfhmin", "true",
+ "'Zfhmin' (Half-Precision Floating-Point Minimal)",
+ [FeatureStdExtF]>;
+def HasStdExtZfhmin : Predicate<"Subtarget->hasStdExtZfhmin()">,
+ AssemblerPredicate<(all_of FeatureStdExtZfhmin),
+ "'Zfhmin' (Half-Precision Floating-Point Minimal)">;
+
+def FeatureStdExtZfh
: SubtargetFeature<"experimental-zfh", "HasStdExtZfh", "true",
"'Zfh' (Half-Precision Floating-Point)",
- [FeatureStdExtF]>;
+ [FeatureStdExtZfhmin, FeatureStdExtF]>;
def HasStdExtZfh : Predicate<"Subtarget->hasStdExtZfh()">,
- AssemblerPredicate<(all_of FeatureExtZfh),
+ AssemblerPredicate<(all_of FeatureStdExtZfh),
"'Zfh' (Half-Precision Floating-Point)">;
def FeatureStdExtC
@@ -56,109 +64,85 @@ def HasStdExtC : Predicate<"Subtarget->hasStdExtC()">,
AssemblerPredicate<(all_of FeatureStdExtC),
"'C' (Compressed Instructions)">;
-def FeatureExtZba
+def FeatureStdExtZba
: SubtargetFeature<"experimental-zba", "HasStdExtZba", "true",
"'Zba' (Address calculation 'B' Instructions)">;
def HasStdExtZba : Predicate<"Subtarget->hasStdExtZba()">,
- AssemblerPredicate<(all_of FeatureExtZba),
+ AssemblerPredicate<(all_of FeatureStdExtZba),
"'Zba' (Address calculation 'B' Instructions)">;
def NotHasStdExtZba : Predicate<"!Subtarget->hasStdExtZba()">;
-def FeatureExtZbb
+def FeatureStdExtZbb
: SubtargetFeature<"experimental-zbb", "HasStdExtZbb", "true",
"'Zbb' (Base 'B' Instructions)">;
def HasStdExtZbb : Predicate<"Subtarget->hasStdExtZbb()">,
- AssemblerPredicate<(all_of FeatureExtZbb),
+ AssemblerPredicate<(all_of FeatureStdExtZbb),
"'Zbb' (Base 'B' Instructions)">;
-def FeatureExtZbc
+def FeatureStdExtZbc
: SubtargetFeature<"experimental-zbc", "HasStdExtZbc", "true",
"'Zbc' (Carry-Less 'B' Instructions)">;
def HasStdExtZbc : Predicate<"Subtarget->hasStdExtZbc()">,
- AssemblerPredicate<(all_of FeatureExtZbc),
+ AssemblerPredicate<(all_of FeatureStdExtZbc),
"'Zbc' (Carry-Less 'B' Instructions)">;
-def FeatureExtZbe
+def FeatureStdExtZbe
: SubtargetFeature<"experimental-zbe", "HasStdExtZbe", "true",
"'Zbe' (Extract-Deposit 'B' Instructions)">;
def HasStdExtZbe : Predicate<"Subtarget->hasStdExtZbe()">,
- AssemblerPredicate<(all_of FeatureExtZbe),
+ AssemblerPredicate<(all_of FeatureStdExtZbe),
"'Zbe' (Extract-Deposit 'B' Instructions)">;
-def FeatureExtZbf
+def FeatureStdExtZbf
: SubtargetFeature<"experimental-zbf", "HasStdExtZbf", "true",
"'Zbf' (Bit-Field 'B' Instructions)">;
def HasStdExtZbf : Predicate<"Subtarget->hasStdExtZbf()">,
- AssemblerPredicate<(all_of FeatureExtZbf),
+ AssemblerPredicate<(all_of FeatureStdExtZbf),
"'Zbf' (Bit-Field 'B' Instructions)">;
-def FeatureExtZbm
+def FeatureStdExtZbm
: SubtargetFeature<"experimental-zbm", "HasStdExtZbm", "true",
"'Zbm' (Matrix 'B' Instructions)">;
def HasStdExtZbm : Predicate<"Subtarget->hasStdExtZbm()">,
- AssemblerPredicate<(all_of FeatureExtZbm),
+ AssemblerPredicate<(all_of FeatureStdExtZbm),
"'Zbm' (Matrix 'B' Instructions)">;
-def FeatureExtZbp
+def FeatureStdExtZbp
: SubtargetFeature<"experimental-zbp", "HasStdExtZbp", "true",
"'Zbp' (Permutation 'B' Instructions)">;
def HasStdExtZbp : Predicate<"Subtarget->hasStdExtZbp()">,
- AssemblerPredicate<(all_of FeatureExtZbp),
+ AssemblerPredicate<(all_of FeatureStdExtZbp),
"'Zbp' (Permutation 'B' Instructions)">;
-def FeatureExtZbr
+def FeatureStdExtZbr
: SubtargetFeature<"experimental-zbr", "HasStdExtZbr", "true",
"'Zbr' (Polynomial Reduction 'B' Instructions)">;
def HasStdExtZbr : Predicate<"Subtarget->hasStdExtZbr()">,
- AssemblerPredicate<(all_of FeatureExtZbr),
+ AssemblerPredicate<(all_of FeatureStdExtZbr),
"'Zbr' (Polynomial Reduction 'B' Instructions)">;
-def FeatureExtZbs
+def FeatureStdExtZbs
: SubtargetFeature<"experimental-zbs", "HasStdExtZbs", "true",
"'Zbs' (Single-Bit 'B' Instructions)">;
def HasStdExtZbs : Predicate<"Subtarget->hasStdExtZbs()">,
- AssemblerPredicate<(all_of FeatureExtZbs),
+ AssemblerPredicate<(all_of FeatureStdExtZbs),
"'Zbs' (Single-Bit 'B' Instructions)">;
-def FeatureExtZbt
+def FeatureStdExtZbt
: SubtargetFeature<"experimental-zbt", "HasStdExtZbt", "true",
"'Zbt' (Ternary 'B' Instructions)">;
def HasStdExtZbt : Predicate<"Subtarget->hasStdExtZbt()">,
- AssemblerPredicate<(all_of FeatureExtZbt),
+ AssemblerPredicate<(all_of FeatureStdExtZbt),
"'Zbt' (Ternary 'B' Instructions)">;
// Some instructions belong to both the basic and the permutation
// subextensions. They should be enabled if either has been specified.
def HasStdExtZbbOrZbp
: Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp()">,
- AssemblerPredicate<(any_of FeatureExtZbb, FeatureExtZbp),
+ AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtZbp),
"'Zbb' (Base 'B' Instructions) or "
"'Zbp' (Permutation 'B' Instructions)">;
-def FeatureExtZbproposedc
- : SubtargetFeature<"experimental-zbproposedc", "HasStdExtZbproposedc", "true",
- "'Zbproposedc' (Proposed Compressed 'B' Instructions)">;
-def HasStdExtZbproposedc : Predicate<"Subtarget->hasStdExtZbproposedc()">,
- AssemblerPredicate<(all_of FeatureExtZbproposedc),
- "'Zbproposedc' (Proposed Compressed 'B' Instructions)">;
-
-def FeatureStdExtB
- : SubtargetFeature<"experimental-b", "HasStdExtB", "true",
- "'B' (Bit Manipulation Instructions)",
- [FeatureExtZba,
- FeatureExtZbb,
- FeatureExtZbc,
- FeatureExtZbe,
- FeatureExtZbf,
- FeatureExtZbm,
- FeatureExtZbp,
- FeatureExtZbr,
- FeatureExtZbs,
- FeatureExtZbt]>;
-def HasStdExtB : Predicate<"Subtarget->hasStdExtB()">,
- AssemblerPredicate<(all_of FeatureStdExtB),
- "'B' (Bit Manipulation Instructions)">;
-
def FeatureNoRVCHints
: SubtargetFeature<"no-rvc-hints", "EnableRVCHintInstrs", "false",
"Disable RVC Hint Instructions.">;
@@ -173,6 +157,9 @@ def HasStdExtV : Predicate<"Subtarget->hasStdExtV()">,
AssemblerPredicate<(all_of FeatureStdExtV),
"'V' (Vector Instructions)">;
+def HasVInstructions : Predicate<"Subtarget->hasVInstructions()">;
+def HasVInstructionsAnyF : Predicate<"Subtarget->hasVInstructionsAnyF()">;
+
def FeatureStdExtZvlsseg
: SubtargetFeature<"experimental-zvlsseg", "HasStdExtZvlsseg", "true",
"'Zvlsseg' (Vector segment load/store instructions)",
@@ -181,12 +168,12 @@ def HasStdExtZvlsseg : Predicate<"Subtarget->hasStdExtZvlsseg()">,
AssemblerPredicate<(all_of FeatureStdExtZvlsseg),
"'Zvlsseg' (Vector segment load/store instructions)">;
-def FeatureExtZvamo
+def FeatureStdExtZvamo
: SubtargetFeature<"experimental-zvamo", "HasStdExtZvamo", "true",
"'Zvamo' (Vector AMO Operations)",
[FeatureStdExtV]>;
def HasStdExtZvamo : Predicate<"Subtarget->hasStdExtZvamo()">,
- AssemblerPredicate<(all_of FeatureExtZvamo),
+ AssemblerPredicate<(all_of FeatureStdExtZvamo),
"'Zvamo' (Vector AMO Operations)">;
def Feature64Bit
@@ -250,22 +237,63 @@ def : ProcessorModel<"rocket-rv64", RocketModel, [Feature64Bit]>;
def : ProcessorModel<"sifive-7-rv32", SiFive7Model, []>;
def : ProcessorModel<"sifive-7-rv64", SiFive7Model, [Feature64Bit]>;
+def : ProcessorModel<"sifive-e20", RocketModel, [FeatureStdExtM,
+ FeatureStdExtC]>;
+
+def : ProcessorModel<"sifive-e21", RocketModel, [FeatureStdExtM,
+ FeatureStdExtA,
+ FeatureStdExtC]>;
+
+def : ProcessorModel<"sifive-e24", RocketModel, [FeatureStdExtM,
+ FeatureStdExtA,
+ FeatureStdExtF,
+ FeatureStdExtC]>;
+
def : ProcessorModel<"sifive-e31", RocketModel, [FeatureStdExtM,
FeatureStdExtA,
FeatureStdExtC]>;
-def : ProcessorModel<"sifive-u54", RocketModel, [Feature64Bit,
+def : ProcessorModel<"sifive-e34", RocketModel, [FeatureStdExtM,
+ FeatureStdExtA,
+ FeatureStdExtF,
+ FeatureStdExtC]>;
+
+def : ProcessorModel<"sifive-e76", SiFive7Model, [FeatureStdExtM,
+ FeatureStdExtA,
+ FeatureStdExtF,
+ FeatureStdExtC]>;
+
+def : ProcessorModel<"sifive-s21", RocketModel, [Feature64Bit,
+ FeatureStdExtM,
+ FeatureStdExtA,
+ FeatureStdExtC]>;
+
+def : ProcessorModel<"sifive-s51", RocketModel, [Feature64Bit,
+ FeatureStdExtM,
+ FeatureStdExtA,
+ FeatureStdExtC]>;
+
+def : ProcessorModel<"sifive-s54", RocketModel, [Feature64Bit,
FeatureStdExtM,
FeatureStdExtA,
FeatureStdExtF,
FeatureStdExtD,
FeatureStdExtC]>;
-def : ProcessorModel<"sifive-e76", SiFive7Model, [FeatureStdExtM,
+def : ProcessorModel<"sifive-s76", SiFive7Model, [Feature64Bit,
+ FeatureStdExtM,
FeatureStdExtA,
FeatureStdExtF,
+ FeatureStdExtD,
FeatureStdExtC]>;
+def : ProcessorModel<"sifive-u54", RocketModel, [Feature64Bit,
+ FeatureStdExtM,
+ FeatureStdExtA,
+ FeatureStdExtF,
+ FeatureStdExtD,
+ FeatureStdExtC]>;
+
def : ProcessorModel<"sifive-u74", SiFive7Model, [Feature64Bit,
FeatureStdExtM,
FeatureStdExtA,
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index bdf30f8eb1b3..9fed6e7baadc 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -27,7 +27,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index 31ef752967cc..80340ee81509 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -105,6 +105,7 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
case RISCV::PseudoLA_TLS_GD:
return expandLoadTLSGDAddress(MBB, MBBI, NextMBBI);
case RISCV::PseudoVSETVLI:
+ case RISCV::PseudoVSETVLIX0:
case RISCV::PseudoVSETIVLI:
return expandVSetVL(MBB, MBBI);
case RISCV::PseudoVMCLR_M_B1:
@@ -246,13 +247,14 @@ bool RISCVExpandPseudo::expandVSetVL(MachineBasicBlock &MBB,
DebugLoc DL = MBBI->getDebugLoc();
assert((MBBI->getOpcode() == RISCV::PseudoVSETVLI ||
+ MBBI->getOpcode() == RISCV::PseudoVSETVLIX0 ||
MBBI->getOpcode() == RISCV::PseudoVSETIVLI) &&
"Unexpected pseudo instruction");
unsigned Opcode;
- if (MBBI->getOpcode() == RISCV::PseudoVSETVLI)
- Opcode = RISCV::VSETVLI;
- else
+ if (MBBI->getOpcode() == RISCV::PseudoVSETIVLI)
Opcode = RISCV::VSETIVLI;
+ else
+ Opcode = RISCV::VSETVLI;
const MCInstrDesc &Desc = TII->get(Opcode);
assert(Desc.getNumOperands() == 3 && "Unexpected instruction format");
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 188bd49595a5..595c3cdfbb1d 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -220,6 +220,10 @@ getRestoreLibCallName(const MachineFunction &MF,
return RestoreLibCalls[LibCallID];
}
+// Return true if the specified function should have a dedicated frame
+// pointer register. This is true if frame pointer elimination is
+// disabled, if it needs dynamic stack realignment, if the function has
+// variable sized allocas, or if the frame address is taken.
bool RISCVFrameLowering::hasFP(const MachineFunction &MF) const {
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
@@ -671,15 +675,15 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
// |--------------------------| -- |
// | Padding after RVV | | |
// | (not counted in | | |
- // | MFI.getStackSize() | | |
+ // | MFI.getStackSize()) | | |
// |--------------------------| -- |-- MFI.getStackSize()
// | RVV objects | | |
// | (not counted in | | |
- // | MFI.getStackSize() | | |
+ // | MFI.getStackSize()) | | |
// |--------------------------| -- |
// | Padding before RVV | | |
// | (not counted in | | |
- // | MFI.getStackSize() | | |
+ // | MFI.getStackSize()) | | |
// |--------------------------| -- |
// | scalar local variables | | <----'
// |--------------------------| -- <-- BP
@@ -696,15 +700,15 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
// |--------------------------| -- |
// | Padding after RVV | | |
// | (not counted in | | |
- // | MFI.getStackSize() | | |
+ // | MFI.getStackSize()) | | |
// |--------------------------| -- |-- MFI.getStackSize()
// | RVV objects | | |
// | (not counted in | | |
- // | MFI.getStackSize() | | |
+ // | MFI.getStackSize()) | | |
// |--------------------------| -- |
// | Padding before RVV | | |
// | (not counted in | | |
- // | MFI.getStackSize() | | |
+ // | MFI.getStackSize()) | | |
// |--------------------------| -- |
// | scalar local variables | | <----'
// |--------------------------| -- <-- SP
@@ -749,15 +753,15 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
// |--------------------------| -- |
// | Padding after RVV | | |
// | (not counted in | | |
- // | MFI.getStackSize() | | |
+ // | MFI.getStackSize()) | | |
// |--------------------------| -- |
// | RVV objects | | |-- MFI.getStackSize()
// | (not counted in | | |
- // | MFI.getStackSize() | | |
+ // | MFI.getStackSize()) | | |
// |--------------------------| -- |
// | Padding before RVV | | |
// | (not counted in | | |
- // | MFI.getStackSize() | | |
+ // | MFI.getStackSize()) | | |
// |--------------------------| -- |
// | scalar local variables | | <----'
// |--------------------------| -- <-- SP
@@ -767,8 +771,10 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
// objects to 8 bytes.
if (MFI.getStackID(FI) == TargetStackID::Default) {
if (MFI.isFixedObjectIndex(FI)) {
- Offset += StackOffset::get(MFI.getStackSize() + RVFI->getRVVPadding()
- + RVFI->getLibCallStackSize(), RVFI->getRVVStackSize());
+ Offset +=
+ StackOffset::get(MFI.getStackSize() + RVFI->getRVVPadding() +
+ RVFI->getLibCallStackSize(),
+ RVFI->getRVVStackSize());
} else {
Offset += StackOffset::getFixed(MFI.getStackSize());
}
@@ -860,7 +866,7 @@ RISCVFrameLowering::assignRVVStackObjectOffsets(MachineFrameInfo &MFI) const {
}
static bool hasRVVSpillWithFIs(MachineFunction &MF, const RISCVInstrInfo &TII) {
- if (!MF.getSubtarget<RISCVSubtarget>().hasStdExtV())
+ if (!MF.getSubtarget<RISCVSubtarget>().hasVInstructions())
return false;
return any_of(MF, [&TII](const MachineBasicBlock &MBB) {
return any_of(MBB, [&TII](const MachineInstr &MI) {
@@ -1040,7 +1046,8 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters(
// Insert the spill to the stack frame.
Register Reg = CS.getReg();
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.storeRegToStackSlot(MBB, MI, Reg, true, CS.getFrameIdx(), RC, TRI);
+ TII.storeRegToStackSlot(MBB, MI, Reg, !MBB.isLiveIn(Reg), CS.getFrameIdx(),
+ RC, TRI);
}
return true;
@@ -1087,6 +1094,14 @@ bool RISCVFrameLowering::restoreCalleeSavedRegisters(
return true;
}
+bool RISCVFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
+ // Keep the conventional code flow when not optimizing.
+ if (MF.getFunction().hasOptNone())
+ return false;
+
+ return true;
+}
+
bool RISCVFrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const {
MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
const MachineFunction *MF = MBB.getParent();
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index bc3ace786272..1e94e34acf2f 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -65,6 +65,8 @@ public:
bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
+ bool enableShrinkWrapping(const MachineFunction &MF) const override;
+
bool isSupportedStackID(TargetStackID::Value ID) const override;
TargetStackID::Value getStackIDForScalableVectors() const override;
diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
new file mode 100644
index 000000000000..d47bd739235f
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -0,0 +1,475 @@
+//===- RISCVGatherScatterLowering.cpp - Gather/Scatter lowering -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass custom lowers llvm.masked.gather and llvm.masked.scatter
+// intrinsics to RISCV masked strided load/store intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVTargetMachine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-gather-scatter-lowering"
+
+namespace {
+
+class RISCVGatherScatterLowering : public FunctionPass {
+ const RISCVSubtarget *ST = nullptr;
+ const RISCVTargetLowering *TLI = nullptr;
+ LoopInfo *LI = nullptr;
+ const DataLayout *DL = nullptr;
+
+ SmallVector<WeakTrackingVH> MaybeDeadPHIs;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ RISCVGatherScatterLowering() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetPassConfig>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ }
+
+ StringRef getPassName() const override {
+ return "RISCV gather/scatter lowering";
+ }
+
+private:
+ bool isLegalTypeAndAlignment(Type *DataType, Value *AlignOp);
+
+ bool tryCreateStridedLoadStore(IntrinsicInst *II, Type *DataType, Value *Ptr,
+ Value *AlignOp);
+
+ std::pair<Value *, Value *> determineBaseAndStride(GetElementPtrInst *GEP,
+ IRBuilder<> &Builder);
+
+ bool matchStridedRecurrence(Value *Index, Loop *L, Value *&Stride,
+ PHINode *&BasePtr, BinaryOperator *&Inc,
+ IRBuilder<> &Builder);
+};
+
+} // end anonymous namespace
+
+char RISCVGatherScatterLowering::ID = 0;
+
+INITIALIZE_PASS(RISCVGatherScatterLowering, DEBUG_TYPE,
+ "RISCV gather/scatter lowering pass", false, false)
+
+FunctionPass *llvm::createRISCVGatherScatterLoweringPass() {
+ return new RISCVGatherScatterLowering();
+}
+
+bool RISCVGatherScatterLowering::isLegalTypeAndAlignment(Type *DataType,
+ Value *AlignOp) {
+ Type *ScalarType = DataType->getScalarType();
+ if (!TLI->isLegalElementTypeForRVV(ScalarType))
+ return false;
+
+ MaybeAlign MA = cast<ConstantInt>(AlignOp)->getMaybeAlignValue();
+ if (MA && MA->value() < DL->getTypeStoreSize(ScalarType).getFixedSize())
+ return false;
+
+ // FIXME: Let the backend type legalize by splitting/widening?
+ EVT DataVT = TLI->getValueType(*DL, DataType);
+ if (!TLI->isTypeLegal(DataVT))
+ return false;
+
+ return true;
+}
+
+// TODO: Should we consider the mask when looking for a stride?
+static std::pair<Value *, Value *> matchStridedConstant(Constant *StartC) {
+ unsigned NumElts = cast<FixedVectorType>(StartC->getType())->getNumElements();
+
+ // Check that the start value is a strided constant.
+ auto *StartVal =
+ dyn_cast_or_null<ConstantInt>(StartC->getAggregateElement((unsigned)0));
+ if (!StartVal)
+ return std::make_pair(nullptr, nullptr);
+ APInt StrideVal(StartVal->getValue().getBitWidth(), 0);
+ ConstantInt *Prev = StartVal;
+ for (unsigned i = 1; i != NumElts; ++i) {
+ auto *C = dyn_cast_or_null<ConstantInt>(StartC->getAggregateElement(i));
+ if (!C)
+ return std::make_pair(nullptr, nullptr);
+
+ APInt LocalStride = C->getValue() - Prev->getValue();
+ if (i == 1)
+ StrideVal = LocalStride;
+ else if (StrideVal != LocalStride)
+ return std::make_pair(nullptr, nullptr);
+
+ Prev = C;
+ }
+
+ Value *Stride = ConstantInt::get(StartVal->getType(), StrideVal);
+
+ return std::make_pair(StartVal, Stride);
+}
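For instance (editorial note), a start constant such as <4 x i32> <i32 0, i32 4, i32 8, i32 12> yields the pair (0, 4), while <i32 0, i32 1, i32 4, i32 5> fails the equal-difference check and returns (nullptr, nullptr).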
+
+// Recursively walk up the use-def chain until we find a Phi with a strided
+// start value. Build and update a scalar recurrence as we unwind the recursion.
+// We also update the Stride as we unwind. Our goal is to move all of the
+// arithmetic out of the loop.
+bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L,
+ Value *&Stride,
+ PHINode *&BasePtr,
+ BinaryOperator *&Inc,
+ IRBuilder<> &Builder) {
+ // Our base case is a Phi.
+ if (auto *Phi = dyn_cast<PHINode>(Index)) {
+ // A phi node we want to perform this function on should be from the
+ // loop header.
+ if (Phi->getParent() != L->getHeader())
+ return false;
+
+ Value *Step, *Start;
+ if (!matchSimpleRecurrence(Phi, Inc, Start, Step) ||
+ Inc->getOpcode() != Instruction::Add)
+ return false;
+ assert(Phi->getNumIncomingValues() == 2 && "Expected 2 operand phi.");
+ unsigned IncrementingBlock = Phi->getIncomingValue(0) == Inc ? 0 : 1;
+ assert(Phi->getIncomingValue(IncrementingBlock) == Inc &&
+ "Expected one operand of phi to be Inc");
+
+ // Only proceed if the step is loop invariant.
+ if (!L->isLoopInvariant(Step))
+ return false;
+
+ // Step should be a splat.
+ Step = getSplatValue(Step);
+ if (!Step)
+ return false;
+
+ // Start should be a strided constant.
+ auto *StartC = dyn_cast<Constant>(Start);
+ if (!StartC)
+ return false;
+
+ std::tie(Start, Stride) = matchStridedConstant(StartC);
+ if (!Start)
+ return false;
+ assert(Stride != nullptr);
+
+ // Build scalar phi and increment.
+ BasePtr =
+ PHINode::Create(Start->getType(), 2, Phi->getName() + ".scalar", Phi);
+ Inc = BinaryOperator::CreateAdd(BasePtr, Step, Inc->getName() + ".scalar",
+ Inc);
+ BasePtr->addIncoming(Start, Phi->getIncomingBlock(1 - IncrementingBlock));
+ BasePtr->addIncoming(Inc, Phi->getIncomingBlock(IncrementingBlock));
+
+ // Note that this Phi might be eligible for removal.
+ MaybeDeadPHIs.push_back(Phi);
+ return true;
+ }
+
+ // Otherwise look for binary operator.
+ auto *BO = dyn_cast<BinaryOperator>(Index);
+ if (!BO)
+ return false;
+
+ if (BO->getOpcode() != Instruction::Add &&
+ BO->getOpcode() != Instruction::Or &&
+ BO->getOpcode() != Instruction::Mul &&
+ BO->getOpcode() != Instruction::Shl)
+ return false;
+
+ // Only support shift by constant.
+ if (BO->getOpcode() == Instruction::Shl && !isa<Constant>(BO->getOperand(1)))
+ return false;
+
+ // We need to be able to treat Or as Add.
+ if (BO->getOpcode() == Instruction::Or &&
+ !haveNoCommonBitsSet(BO->getOperand(0), BO->getOperand(1), *DL))
+ return false;
+
+ // We should have one operand in the loop and one splat.
+ Value *OtherOp;
+ if (isa<Instruction>(BO->getOperand(0)) &&
+ L->contains(cast<Instruction>(BO->getOperand(0)))) {
+ Index = cast<Instruction>(BO->getOperand(0));
+ OtherOp = BO->getOperand(1);
+ } else if (isa<Instruction>(BO->getOperand(1)) &&
+ L->contains(cast<Instruction>(BO->getOperand(1)))) {
+ Index = cast<Instruction>(BO->getOperand(1));
+ OtherOp = BO->getOperand(0);
+ } else {
+ return false;
+ }
+
+ // Make sure other op is loop invariant.
+ if (!L->isLoopInvariant(OtherOp))
+ return false;
+
+ // Make sure we have a splat.
+ Value *SplatOp = getSplatValue(OtherOp);
+ if (!SplatOp)
+ return false;
+
+ // Recurse up the use-def chain.
+ if (!matchStridedRecurrence(Index, L, Stride, BasePtr, Inc, Builder))
+ return false;
+
+ // Locate the Step and Start values from the recurrence.
+ unsigned StepIndex = Inc->getOperand(0) == BasePtr ? 1 : 0;
+ unsigned StartBlock = BasePtr->getOperand(0) == Inc ? 1 : 0;
+ Value *Step = Inc->getOperand(StepIndex);
+ Value *Start = BasePtr->getOperand(StartBlock);
+
+ // We need to adjust the start value in the preheader.
+ Builder.SetInsertPoint(
+ BasePtr->getIncomingBlock(StartBlock)->getTerminator());
+ Builder.SetCurrentDebugLocation(DebugLoc());
+
+ switch (BO->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+ case Instruction::Add:
+ case Instruction::Or: {
+ // An add only affects the start value. It's ok to do this for Or because
+ // we already checked that there are no common set bits.
+
+ // If the start value is Zero, just take the SplatOp.
+ if (isa<ConstantInt>(Start) && cast<ConstantInt>(Start)->isZero())
+ Start = SplatOp;
+ else
+ Start = Builder.CreateAdd(Start, SplatOp, "start");
+ BasePtr->setIncomingValue(StartBlock, Start);
+ break;
+ }
+ case Instruction::Mul: {
+ // If the start is zero we don't need to multiply.
+ if (!isa<ConstantInt>(Start) || !cast<ConstantInt>(Start)->isZero())
+ Start = Builder.CreateMul(Start, SplatOp, "start");
+
+ Step = Builder.CreateMul(Step, SplatOp, "step");
+
+ // If the Stride is 1 just take the SplatOp.
+ if (isa<ConstantInt>(Stride) && cast<ConstantInt>(Stride)->isOne())
+ Stride = SplatOp;
+ else
+ Stride = Builder.CreateMul(Stride, SplatOp, "stride");
+ Inc->setOperand(StepIndex, Step);
+ BasePtr->setIncomingValue(StartBlock, Start);
+ break;
+ }
+ case Instruction::Shl: {
+ // If the start is zero we don't need to shift.
+ if (!isa<ConstantInt>(Start) || !cast<ConstantInt>(Start)->isZero())
+ Start = Builder.CreateShl(Start, SplatOp, "start");
+ Step = Builder.CreateShl(Step, SplatOp, "step");
+ Stride = Builder.CreateShl(Stride, SplatOp, "stride");
+ Inc->setOperand(StepIndex, Step);
+ BasePtr->setIncomingValue(StartBlock, Start);
+ break;
+ }
+ }
+
+ return true;
+}
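As a sketch of the net effect (values illustrative): for a vector index phi starting at <0, 1, 2, 3> and incremented each iteration by a splat of 4, the routine builds a scalar phi starting at 0 with a scalar increment of 4, reports a Stride of 1, and records the original vector phi in MaybeDeadPHIs for later cleanup.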
+
+std::pair<Value *, Value *>
+RISCVGatherScatterLowering::determineBaseAndStride(GetElementPtrInst *GEP,
+ IRBuilder<> &Builder) {
+
+ SmallVector<Value *, 2> Ops(GEP->operands());
+
+ // Base pointer needs to be a scalar.
+ if (Ops[0]->getType()->isVectorTy())
+ return std::make_pair(nullptr, nullptr);
+
+ // Make sure we're in a loop and it is in loop simplify form.
+ Loop *L = LI->getLoopFor(GEP->getParent());
+ if (!L || !L->isLoopSimplifyForm())
+ return std::make_pair(nullptr, nullptr);
+
+ Optional<unsigned> VecOperand;
+ unsigned TypeScale = 0;
+
+ // Look for a vector operand and scale.
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ for (unsigned i = 1, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
+ if (!Ops[i]->getType()->isVectorTy())
+ continue;
+
+ if (VecOperand)
+ return std::make_pair(nullptr, nullptr);
+
+ VecOperand = i;
+
+ TypeSize TS = DL->getTypeAllocSize(GTI.getIndexedType());
+ if (TS.isScalable())
+ return std::make_pair(nullptr, nullptr);
+
+ TypeScale = TS.getFixedSize();
+ }
+
+ // We need to find a vector index to simplify.
+ if (!VecOperand)
+ return std::make_pair(nullptr, nullptr);
+
+ // We can't extract the stride if the arithmetic is done at a different size
+ // than the pointer type. Adding the stride later may not wrap correctly.
+ // Technically we could handle wider indices, but I don't expect that in
+ // practice.
+ Value *VecIndex = Ops[*VecOperand];
+ Type *VecIntPtrTy = DL->getIntPtrType(GEP->getType());
+ if (VecIndex->getType() != VecIntPtrTy)
+ return std::make_pair(nullptr, nullptr);
+
+ Value *Stride;
+ BinaryOperator *Inc;
+ PHINode *BasePhi;
+ if (!matchStridedRecurrence(VecIndex, L, Stride, BasePhi, Inc, Builder))
+ return std::make_pair(nullptr, nullptr);
+
+ assert(BasePhi->getNumIncomingValues() == 2 && "Expected 2 operand phi.");
+ unsigned IncrementingBlock = BasePhi->getOperand(0) == Inc ? 0 : 1;
+ assert(BasePhi->getIncomingValue(IncrementingBlock) == Inc &&
+ "Expected one operand of phi to be Inc");
+
+ Builder.SetInsertPoint(GEP);
+
+ // Replace the vector index with the scalar phi and build a scalar GEP.
+ Ops[*VecOperand] = BasePhi;
+ Type *SourceTy = GEP->getSourceElementType();
+ Value *BasePtr =
+ Builder.CreateGEP(SourceTy, Ops[0], makeArrayRef(Ops).drop_front());
+
+ // Cast the GEP to an i8*.
+ LLVMContext &Ctx = GEP->getContext();
+ Type *I8PtrTy =
+ Type::getInt8PtrTy(Ctx, GEP->getType()->getPointerAddressSpace());
+ if (BasePtr->getType() != I8PtrTy)
+ BasePtr = Builder.CreatePointerCast(BasePtr, I8PtrTy);
+
+ // Final adjustments to stride should go in the start block.
+ Builder.SetInsertPoint(
+ BasePhi->getIncomingBlock(1 - IncrementingBlock)->getTerminator());
+
+ // Convert stride to pointer size if needed.
+ Type *IntPtrTy = DL->getIntPtrType(BasePtr->getType());
+ assert(Stride->getType() == IntPtrTy && "Unexpected type");
+
+ // Scale the stride by the size of the indexed type.
+ if (TypeScale != 1)
+ Stride = Builder.CreateMul(Stride, ConstantInt::get(IntPtrTy, TypeScale));
+
+ return std::make_pair(BasePtr, Stride);
+}
+
+bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II,
+ Type *DataType,
+ Value *Ptr,
+ Value *AlignOp) {
+ // Make sure the operation will be supported by the backend.
+ if (!isLegalTypeAndAlignment(DataType, AlignOp))
+ return false;
+
+ // Pointer should be a GEP.
+ auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (!GEP)
+ return false;
+
+ IRBuilder<> Builder(GEP);
+
+ Value *BasePtr, *Stride;
+ std::tie(BasePtr, Stride) = determineBaseAndStride(GEP, Builder);
+ if (!BasePtr)
+ return false;
+ assert(Stride != nullptr);
+
+ Builder.SetInsertPoint(II);
+
+ CallInst *Call;
+ if (II->getIntrinsicID() == Intrinsic::masked_gather)
+ Call = Builder.CreateIntrinsic(
+ Intrinsic::riscv_masked_strided_load,
+ {DataType, BasePtr->getType(), Stride->getType()},
+ {II->getArgOperand(3), BasePtr, Stride, II->getArgOperand(2)});
+ else
+ Call = Builder.CreateIntrinsic(
+ Intrinsic::riscv_masked_strided_store,
+ {DataType, BasePtr->getType(), Stride->getType()},
+ {II->getArgOperand(0), BasePtr, Stride, II->getArgOperand(3)});
+
+ Call->takeName(II);
+ II->replaceAllUsesWith(Call);
+ II->eraseFromParent();
+
+ if (GEP->use_empty())
+ RecursivelyDeleteTriviallyDeadInstructions(GEP);
+
+ return true;
+}
+
+bool RISCVGatherScatterLowering::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto &TPC = getAnalysis<TargetPassConfig>();
+ auto &TM = TPC.getTM<RISCVTargetMachine>();
+ ST = &TM.getSubtarget<RISCVSubtarget>(F);
+ if (!ST->hasVInstructions() || !ST->useRVVForFixedLengthVectors())
+ return false;
+
+ TLI = ST->getTargetLowering();
+ DL = &F.getParent()->getDataLayout();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+ SmallVector<IntrinsicInst *, 4> Gathers;
+ SmallVector<IntrinsicInst *, 4> Scatters;
+
+ bool Changed = false;
+
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+ if (II && II->getIntrinsicID() == Intrinsic::masked_gather &&
+ isa<FixedVectorType>(II->getType())) {
+ Gathers.push_back(II);
+ } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter &&
+ isa<FixedVectorType>(II->getArgOperand(0)->getType())) {
+ Scatters.push_back(II);
+ }
+ }
+ }
+
+ // Rewrite gather/scatter to form strided load/store if possible.
+ for (auto *II : Gathers)
+ Changed |= tryCreateStridedLoadStore(
+ II, II->getType(), II->getArgOperand(0), II->getArgOperand(1));
+ for (auto *II : Scatters)
+ Changed |=
+ tryCreateStridedLoadStore(II, II->getArgOperand(0)->getType(),
+ II->getArgOperand(1), II->getArgOperand(2));
+
+ // Remove any dead phis.
+ while (!MaybeDeadPHIs.empty()) {
+ if (auto *Phi = dyn_cast_or_null<PHINode>(MaybeDeadPHIs.pop_back_val()))
+ RecursivelyDeleteDeadPHINode(Phi);
+ }
+
+ return Changed;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 9866567ac1ee..66a34d73dd37 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -108,7 +108,21 @@ void RISCVDAGToDAGISel::PreprocessISelDAG() {
}
void RISCVDAGToDAGISel::PostprocessISelDAG() {
- doPeepholeLoadStoreADDI();
+ SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
+
+ bool MadeChange = false;
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ // Skip dead nodes and any non-machine opcodes.
+ if (N->use_empty() || !N->isMachineOpcode())
+ continue;
+
+ MadeChange |= doPeepholeSExtW(N);
+ MadeChange |= doPeepholeLoadStoreADDI(N);
+ }
+
+ if (MadeChange)
+ CurDAG->RemoveDeadNodes();
}
static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, int64_t Imm,
@@ -126,6 +140,9 @@ static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, int64_t Imm,
else if (Inst.Opc == RISCV::ADDUW)
Result = CurDAG->getMachineNode(RISCV::ADDUW, DL, XLenVT, SrcReg,
CurDAG->getRegister(RISCV::X0, XLenVT));
+ else if (Inst.Opc == RISCV::SH1ADD || Inst.Opc == RISCV::SH2ADD ||
+ Inst.Opc == RISCV::SH3ADD)
+ Result = CurDAG->getMachineNode(Inst.Opc, DL, XLenVT, SrcReg, SrcReg);
else
Result = CurDAG->getMachineNode(Inst.Opc, DL, XLenVT, SrcReg, SDImm);
@@ -199,7 +216,7 @@ static SDValue createTuple(SelectionDAG &CurDAG, ArrayRef<SDValue> Regs,
void RISCVDAGToDAGISel::addVectorLoadStoreOperands(
SDNode *Node, unsigned Log2SEW, const SDLoc &DL, unsigned CurOp,
bool IsMasked, bool IsStridedOrIndexed, SmallVectorImpl<SDValue> &Operands,
- MVT *IndexVT) {
+ bool IsLoad, MVT *IndexVT) {
SDValue Chain = Node->getOperand(0);
SDValue Glue;
@@ -228,6 +245,14 @@ void RISCVDAGToDAGISel::addVectorLoadStoreOperands(
SDValue SEWOp = CurDAG->getTargetConstant(Log2SEW, DL, XLenVT);
Operands.push_back(SEWOp);
+ // Masked load has the tail policy argument.
+ if (IsMasked && IsLoad) {
+ // Policy must be a constant.
+ uint64_t Policy = Node->getConstantOperandVal(CurOp++);
+ SDValue PolicyOp = CurDAG->getTargetConstant(Policy, DL, XLenVT);
+ Operands.push_back(PolicyOp);
+ }
+
Operands.push_back(Chain); // Chain.
if (Glue)
Operands.push_back(Glue);
@@ -252,7 +277,7 @@ void RISCVDAGToDAGISel::selectVLSEG(SDNode *Node, bool IsMasked,
}
addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked, IsStrided,
- Operands);
+ Operands, /*IsLoad=*/true);
const RISCV::VLSEGPseudo *P =
RISCV::getVLSEGPseudo(NF, IsMasked, IsStrided, /*FF*/ false, Log2SEW,
@@ -293,7 +318,8 @@ void RISCVDAGToDAGISel::selectVLSEGFF(SDNode *Node, bool IsMasked) {
}
addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked,
- /*IsStridedOrIndexed*/ false, Operands);
+ /*IsStridedOrIndexed*/ false, Operands,
+ /*IsLoad=*/true);
const RISCV::VLSEGPseudo *P =
RISCV::getVLSEGPseudo(NF, IsMasked, /*Strided*/ false, /*FF*/ true,
@@ -338,7 +364,8 @@ void RISCVDAGToDAGISel::selectVLXSEG(SDNode *Node, bool IsMasked,
MVT IndexVT;
addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked,
- /*IsStridedOrIndexed*/ true, Operands, &IndexVT);
+ /*IsStridedOrIndexed*/ true, Operands,
+ /*IsLoad=*/true, &IndexVT);
assert(VT.getVectorElementCount() == IndexVT.getVectorElementCount() &&
"Element count mismatch");
@@ -415,7 +442,8 @@ void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, bool IsMasked,
MVT IndexVT;
addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked,
- /*IsStridedOrIndexed*/ true, Operands, &IndexVT);
+ /*IsStridedOrIndexed*/ true, Operands,
+ /*IsLoad=*/false, &IndexVT);
assert(VT.getVectorElementCount() == IndexVT.getVectorElementCount() &&
"Element count mismatch");
@@ -453,14 +481,24 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
switch (Opcode) {
case ISD::Constant: {
auto *ConstNode = cast<ConstantSDNode>(Node);
- if (VT == XLenVT && ConstNode->isNullValue()) {
+ if (VT == XLenVT && ConstNode->isZero()) {
SDValue New =
CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, RISCV::X0, XLenVT);
ReplaceNode(Node, New.getNode());
return;
}
- ReplaceNode(Node,
- selectImm(CurDAG, DL, ConstNode->getSExtValue(), *Subtarget));
+ int64_t Imm = ConstNode->getSExtValue();
+ // If the upper XLen-16 bits are not used, try to convert this to a simm12
+ // by sign extending bit 15.
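+ // e.g. 0xffff normally needs two instructions to materialize, but if only
+ // the lower 16 bits are consumed it can be treated as -1 (a single ADDI).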
+ if (isUInt<16>(Imm) && isInt<12>(SignExtend64(Imm, 16)) &&
+ hasAllHUsers(Node))
+ Imm = SignExtend64(Imm, 16);
+ // If the upper 32 bits are not used, try to convert this into a simm32 by
+ // sign extending bit 31.
+ if (!isInt<32>(Imm) && isUInt<32>(Imm) && hasAllWUsers(Node))
+ Imm = SignExtend64(Imm, 32);
+
+ ReplaceNode(Node, selectImm(CurDAG, DL, Imm, *Subtarget));
return;
}
case ISD::FrameIndex: {
@@ -591,7 +629,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
}
}
- // Turn (and (shl x, c2) c1) -> (srli (slli c2+c3), c3) if c1 is a mask
+ // Turn (and (shl x, c2), c1) -> (srli (slli x, c2+c3), c3) if c1 is a mask
// shifted by c2 bits with c3 leading zeros.
if (LeftShift && isShiftedMask_64(C1)) {
uint64_t C3 = XLen - (64 - countLeadingZeros(C1));
@@ -621,6 +659,63 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
}
}
+ // Turn (and (shr x, c2), c1) -> (slli (srli x, c2+c3), c3) if c1 is a
+ // shifted mask with c2 leading zeros and c3 trailing zeros.
+ if (!LeftShift && isShiftedMask_64(C1)) {
+ uint64_t Leading = XLen - (64 - countLeadingZeros(C1));
+ uint64_t C3 = countTrailingZeros(C1);
+ if (Leading == C2 && C2 + C3 < XLen && OneUseOrZExtW && !ZExtOrANDI) {
+ SDNode *SRLI = CurDAG->getMachineNode(
+ RISCV::SRLI, DL, XLenVT, X,
+ CurDAG->getTargetConstant(C2 + C3, DL, XLenVT));
+ SDNode *SLLI =
+ CurDAG->getMachineNode(RISCV::SLLI, DL, XLenVT, SDValue(SRLI, 0),
+ CurDAG->getTargetConstant(C3, DL, XLenVT));
+ ReplaceNode(Node, SLLI);
+ return;
+ }
+ // If the leading zero count is C2+32, we can use SRLIW instead of SRLI.
+ if (Leading > 32 && (Leading - 32) == C2 && C2 + C3 < 32 &&
+ OneUseOrZExtW && !ZExtOrANDI) {
+ SDNode *SRLIW = CurDAG->getMachineNode(
+ RISCV::SRLIW, DL, XLenVT, X,
+ CurDAG->getTargetConstant(C2 + C3, DL, XLenVT));
+ SDNode *SLLI =
+ CurDAG->getMachineNode(RISCV::SLLI, DL, XLenVT, SDValue(SRLIW, 0),
+ CurDAG->getTargetConstant(C3, DL, XLenVT));
+ ReplaceNode(Node, SLLI);
+ return;
+ }
+ }
+
+ // Turn (and (shl x, c2), c1) -> (slli (srli x, c3-c2), c3) if c1 is a
+ // shifted mask with no leading zeros and c3 trailing zeros.
+ if (LeftShift && isShiftedMask_64(C1)) {
+ uint64_t Leading = XLen - (64 - countLeadingZeros(C1));
+ uint64_t C3 = countTrailingZeros(C1);
+ if (Leading == 0 && C2 < C3 && OneUseOrZExtW && !ZExtOrANDI) {
+ SDNode *SRLI = CurDAG->getMachineNode(
+ RISCV::SRLI, DL, XLenVT, X,
+ CurDAG->getTargetConstant(C3 - C2, DL, XLenVT));
+ SDNode *SLLI =
+ CurDAG->getMachineNode(RISCV::SLLI, DL, XLenVT, SDValue(SRLI, 0),
+ CurDAG->getTargetConstant(C3, DL, XLenVT));
+ ReplaceNode(Node, SLLI);
+ return;
+ }
+ // If we have (32-C2) leading zeros, we can use SRLIW instead of SRLI.
+ if (C2 < C3 && Leading + C2 == 32 && OneUseOrZExtW && !ZExtOrANDI) {
+ SDNode *SRLIW = CurDAG->getMachineNode(
+ RISCV::SRLIW, DL, XLenVT, X,
+ CurDAG->getTargetConstant(C3 - C2, DL, XLenVT));
+ SDNode *SLLI =
+ CurDAG->getMachineNode(RISCV::SLLI, DL, XLenVT, SDValue(SRLIW, 0),
+ CurDAG->getTargetConstant(C3, DL, XLenVT));
+ ReplaceNode(Node, SLLI);
+ return;
+ }
+ }
+
break;
}
case ISD::INTRINSIC_WO_CHAIN: {
@@ -713,7 +808,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
}
bool IsUnsigned = IntNo == Intrinsic::riscv_vmsgeu_mask;
MVT Src1VT = Src1.getSimpleValueType();
- unsigned VMSLTOpcode, VMSLTMaskOpcode, VMXOROpcode, VMANDNOTOpcode;
+ unsigned VMSLTOpcode, VMSLTMaskOpcode, VMXOROpcode, VMANDNOpcode;
switch (RISCVTargetLowering::getLMUL(Src1VT)) {
default:
llvm_unreachable("Unexpected LMUL!");
@@ -766,31 +861,31 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
llvm_unreachable("Unexpected LMUL!");
case RISCVII::VLMUL::LMUL_F8:
VMXOROpcode = RISCV::PseudoVMXOR_MM_MF8;
- VMANDNOTOpcode = RISCV::PseudoVMANDNOT_MM_MF8;
+ VMANDNOpcode = RISCV::PseudoVMANDN_MM_MF8;
break;
case RISCVII::VLMUL::LMUL_F4:
VMXOROpcode = RISCV::PseudoVMXOR_MM_MF4;
- VMANDNOTOpcode = RISCV::PseudoVMANDNOT_MM_MF4;
+ VMANDNOpcode = RISCV::PseudoVMANDN_MM_MF4;
break;
case RISCVII::VLMUL::LMUL_F2:
VMXOROpcode = RISCV::PseudoVMXOR_MM_MF2;
- VMANDNOTOpcode = RISCV::PseudoVMANDNOT_MM_MF2;
+ VMANDNOpcode = RISCV::PseudoVMANDN_MM_MF2;
break;
case RISCVII::VLMUL::LMUL_1:
VMXOROpcode = RISCV::PseudoVMXOR_MM_M1;
- VMANDNOTOpcode = RISCV::PseudoVMANDNOT_MM_M1;
+ VMANDNOpcode = RISCV::PseudoVMANDN_MM_M1;
break;
case RISCVII::VLMUL::LMUL_2:
VMXOROpcode = RISCV::PseudoVMXOR_MM_M2;
- VMANDNOTOpcode = RISCV::PseudoVMANDNOT_MM_M2;
+ VMANDNOpcode = RISCV::PseudoVMANDN_MM_M2;
break;
case RISCVII::VLMUL::LMUL_4:
VMXOROpcode = RISCV::PseudoVMXOR_MM_M4;
- VMANDNOTOpcode = RISCV::PseudoVMANDNOT_MM_M4;
+ VMANDNOpcode = RISCV::PseudoVMANDN_MM_M4;
break;
case RISCVII::VLMUL::LMUL_8:
VMXOROpcode = RISCV::PseudoVMXOR_MM_M8;
- VMANDNOTOpcode = RISCV::PseudoVMANDNOT_MM_M8;
+ VMANDNOpcode = RISCV::PseudoVMANDN_MM_M8;
break;
}
SDValue SEW = CurDAG->getTargetConstant(
@@ -801,13 +896,13 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
SDValue MaskedOff = Node->getOperand(1);
SDValue Mask = Node->getOperand(4);
// If the MaskedOff value and the Mask are the same value use
- // vmslt{u}.vx vt, va, x; vmandnot.mm vd, vd, vt
+ // vmslt{u}.vx vt, va, x; vmandn.mm vd, vd, vt
// This avoids needing to copy v0 to vd before starting the next sequence.
if (Mask == MaskedOff) {
SDValue Cmp = SDValue(
CurDAG->getMachineNode(VMSLTOpcode, DL, VT, {Src1, Src2, VL, SEW}),
0);
- ReplaceNode(Node, CurDAG->getMachineNode(VMANDNOTOpcode, DL, VT,
+ ReplaceNode(Node, CurDAG->getMachineNode(VMANDNOpcode, DL, VT,
{Mask, Cmp, VL, MaskSEW}));
return;
}
@@ -840,7 +935,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::riscv_vsetvli:
case Intrinsic::riscv_vsetvlimax: {
- if (!Subtarget->hasStdExtV())
+ if (!Subtarget->hasVInstructions())
break;
bool VLMax = IntNo == Intrinsic::riscv_vsetvlimax;
@@ -859,8 +954,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
SDValue VTypeIOp = CurDAG->getTargetConstant(VTypeI, DL, XLenVT);
SDValue VLOperand;
+ unsigned Opcode = RISCV::PseudoVSETVLI;
if (VLMax) {
VLOperand = CurDAG->getRegister(RISCV::X0, XLenVT);
+ Opcode = RISCV::PseudoVSETVLIX0;
} else {
VLOperand = Node->getOperand(2);
@@ -878,7 +975,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
}
ReplaceNode(Node,
- CurDAG->getMachineNode(RISCV::PseudoVSETVLI, DL, XLenVT,
+ CurDAG->getMachineNode(Opcode, DL, XLenVT,
MVT::Other, VLOperand, VTypeIOp,
/* Chain */ Node->getOperand(0)));
return;
@@ -999,7 +1096,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
MVT IndexVT;
addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked,
/*IsStridedOrIndexed*/ true, Operands,
- &IndexVT);
+ /*IsLoad=*/true, &IndexVT);
assert(VT.getVectorElementCount() == IndexVT.getVectorElementCount() &&
"Element count mismatch");
@@ -1019,7 +1116,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, Load);
return;
}
- case Intrinsic::riscv_vle1:
+ case Intrinsic::riscv_vlm:
case Intrinsic::riscv_vle:
case Intrinsic::riscv_vle_mask:
case Intrinsic::riscv_vlse:
@@ -1038,7 +1135,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
Operands.push_back(Node->getOperand(CurOp++));
addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked, IsStrided,
- Operands);
+ Operands, /*IsLoad=*/true);
RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
const RISCV::VLEPseudo *P =
@@ -1066,7 +1163,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
Operands.push_back(Node->getOperand(CurOp++));
addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked,
- /*IsStridedOrIndexed*/ false, Operands);
+ /*IsStridedOrIndexed*/ false, Operands,
+ /*IsLoad=*/true);
RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
const RISCV::VLEPseudo *P =
@@ -1188,7 +1286,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
MVT IndexVT;
addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked,
/*IsStridedOrIndexed*/ true, Operands,
- &IndexVT);
+ /*IsLoad=*/false, &IndexVT);
assert(VT.getVectorElementCount() == IndexVT.getVectorElementCount() &&
"Element count mismatch");
@@ -1208,7 +1306,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, Store);
return;
}
- case Intrinsic::riscv_vse1:
+ case Intrinsic::riscv_vsm:
case Intrinsic::riscv_vse:
case Intrinsic::riscv_vse_mask:
case Intrinsic::riscv_vsse:
@@ -1496,6 +1594,97 @@ bool RISCVDAGToDAGISel::selectZExti32(SDValue N, SDValue &Val) {
return false;
}
+// Return true if all users of this SDNode* only consume the lower \p Bits.
+// This can be used to form W instructions for add/sub/mul/shl even when the
+// root isn't a sext_inreg. This can allow the ADDW/SUBW/MULW/SLLIW to CSE if
+// SimplifyDemandedBits has made it so some users see a sext_inreg and some
+// don't. The sext_inreg+add/sub/mul/shl will get selected, but still leave
+// the add/sub/mul/shl to become non-W instructions. By checking the users we
+// may be able to use a W instruction and CSE with the other instruction if
+// this has happened. We could try to detect that the CSE opportunity exists
+// before doing this, but that would be more complicated.
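+// For example, if every use of an ADD feeds an ADDW, an SRAIW, or the value
+// operand of an SW, only the lower 32 bits of the ADD are observed, so it is
+// safe to select the ADD itself as ADDW.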
+// TODO: Does this need to look through AND/OR/XOR to their users to find more
+// opportunities?
+bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits) const {
+ assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB ||
+ Node->getOpcode() == ISD::MUL || Node->getOpcode() == ISD::SHL ||
+ Node->getOpcode() == ISD::SRL ||
+ Node->getOpcode() == ISD::SIGN_EXTEND_INREG ||
+ isa<ConstantSDNode>(Node)) &&
+ "Unexpected opcode");
+
+ for (auto UI = Node->use_begin(), UE = Node->use_end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+ // Users of this node should have already been instruction selected
+ if (!User->isMachineOpcode())
+ return false;
+
+ // TODO: Add more opcodes?
+ switch (User->getMachineOpcode()) {
+ default:
+ return false;
+ case RISCV::ADDW:
+ case RISCV::ADDIW:
+ case RISCV::SUBW:
+ case RISCV::MULW:
+ case RISCV::SLLW:
+ case RISCV::SLLIW:
+ case RISCV::SRAW:
+ case RISCV::SRAIW:
+ case RISCV::SRLW:
+ case RISCV::SRLIW:
+ case RISCV::DIVW:
+ case RISCV::DIVUW:
+ case RISCV::REMW:
+ case RISCV::REMUW:
+ case RISCV::ROLW:
+ case RISCV::RORW:
+ case RISCV::RORIW:
+ case RISCV::CLZW:
+ case RISCV::CTZW:
+ case RISCV::CPOPW:
+ case RISCV::SLLIUW:
+ case RISCV::FCVT_H_W:
+ case RISCV::FCVT_H_WU:
+ case RISCV::FCVT_S_W:
+ case RISCV::FCVT_S_WU:
+ case RISCV::FCVT_D_W:
+ case RISCV::FCVT_D_WU:
+ if (Bits < 32)
+ return false;
+ break;
+ case RISCV::SLLI:
+ // SLLI only uses the lower (XLen - ShAmt) bits.
+ if (Bits < Subtarget->getXLen() - User->getConstantOperandVal(1))
+ return false;
+ break;
+ case RISCV::ADDUW:
+ case RISCV::SH1ADDUW:
+ case RISCV::SH2ADDUW:
+ case RISCV::SH3ADDUW:
+ // The first operand to add.uw/shXadd.uw is implicitly zero extended from
+ // 32 bits.
+ if (UI.getOperandNo() != 0 || Bits < 32)
+ return false;
+ break;
+ case RISCV::SB:
+ if (UI.getOperandNo() != 0 || Bits < 8)
+ return false;
+ break;
+ case RISCV::SH:
+ if (UI.getOperandNo() != 0 || Bits < 16)
+ return false;
+ break;
+ case RISCV::SW:
+ if (UI.getOperandNo() != 0 || Bits < 32)
+ return false;
+ break;
+ }
+ }
+
+ return true;
+}
+
// Select VL as a 5 bit immediate or a value that will become a register. This
// allows us to choose between VSETIVLI and VSETVLI later.
bool RISCVDAGToDAGISel::selectVLOp(SDValue N, SDValue &VL) {
@@ -1609,113 +1798,162 @@ bool RISCVDAGToDAGISel::selectRVVSimm5(SDValue N, unsigned Width,
// (load (addi base, off1), off2) -> (load base, off1+off2)
// (store val, (addi base, off1), off2) -> (store val, base, off1+off2)
// This is possible when off1+off2 fits a 12-bit immediate.
-void RISCVDAGToDAGISel::doPeepholeLoadStoreADDI() {
- SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
- ++Position;
+bool RISCVDAGToDAGISel::doPeepholeLoadStoreADDI(SDNode *N) {
+ int OffsetOpIdx;
+ int BaseOpIdx;
- while (Position != CurDAG->allnodes_begin()) {
- SDNode *N = &*--Position;
- // Skip dead nodes and any non-machine opcodes.
- if (N->use_empty() || !N->isMachineOpcode())
- continue;
+ // Only attempt this optimisation for I-type loads and S-type stores.
+ switch (N->getMachineOpcode()) {
+ default:
+ return false;
+ case RISCV::LB:
+ case RISCV::LH:
+ case RISCV::LW:
+ case RISCV::LBU:
+ case RISCV::LHU:
+ case RISCV::LWU:
+ case RISCV::LD:
+ case RISCV::FLH:
+ case RISCV::FLW:
+ case RISCV::FLD:
+ BaseOpIdx = 0;
+ OffsetOpIdx = 1;
+ break;
+ case RISCV::SB:
+ case RISCV::SH:
+ case RISCV::SW:
+ case RISCV::SD:
+ case RISCV::FSH:
+ case RISCV::FSW:
+ case RISCV::FSD:
+ BaseOpIdx = 1;
+ OffsetOpIdx = 2;
+ break;
+ }
- int OffsetOpIdx;
- int BaseOpIdx;
+ if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx)))
+ return false;
- // Only attempt this optimisation for I-type loads and S-type stores.
- switch (N->getMachineOpcode()) {
- default:
- continue;
- case RISCV::LB:
- case RISCV::LH:
- case RISCV::LW:
- case RISCV::LBU:
- case RISCV::LHU:
- case RISCV::LWU:
- case RISCV::LD:
- case RISCV::FLH:
- case RISCV::FLW:
- case RISCV::FLD:
- BaseOpIdx = 0;
- OffsetOpIdx = 1;
- break;
- case RISCV::SB:
- case RISCV::SH:
- case RISCV::SW:
- case RISCV::SD:
- case RISCV::FSH:
- case RISCV::FSW:
- case RISCV::FSD:
- BaseOpIdx = 1;
- OffsetOpIdx = 2;
- break;
- }
+ SDValue Base = N->getOperand(BaseOpIdx);
- if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx)))
- continue;
+ // If the base is an ADDI, we can merge it into the load/store.
+ if (!Base.isMachineOpcode() || Base.getMachineOpcode() != RISCV::ADDI)
+ return false;
- SDValue Base = N->getOperand(BaseOpIdx);
+ SDValue ImmOperand = Base.getOperand(1);
+ uint64_t Offset2 = N->getConstantOperandVal(OffsetOpIdx);
- // If the base is an ADDI, we can merge it in to the load/store.
- if (!Base.isMachineOpcode() || Base.getMachineOpcode() != RISCV::ADDI)
- continue;
+ if (auto *Const = dyn_cast<ConstantSDNode>(ImmOperand)) {
+ int64_t Offset1 = Const->getSExtValue();
+ int64_t CombinedOffset = Offset1 + Offset2;
+ if (!isInt<12>(CombinedOffset))
+ return false;
+ ImmOperand = CurDAG->getTargetConstant(CombinedOffset, SDLoc(ImmOperand),
+ ImmOperand.getValueType());
+ } else if (auto *GA = dyn_cast<GlobalAddressSDNode>(ImmOperand)) {
+ // If the off1 in (addi base, off1) is a global variable's address (its
+ // low part, really), then we can rely on the alignment of that variable
+ // to provide a margin of safety before off1 can overflow the 12 bits.
+ // Check if off2 falls within that margin; if so off1+off2 can't overflow.
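+ // e.g. for a 16-byte-aligned global, any off2 up to 15 stays within the
+ // margin and is accepted below.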
+ const DataLayout &DL = CurDAG->getDataLayout();
+ Align Alignment = GA->getGlobal()->getPointerAlignment(DL);
+ if (Offset2 != 0 && Alignment <= Offset2)
+ return false;
+ int64_t Offset1 = GA->getOffset();
+ int64_t CombinedOffset = Offset1 + Offset2;
+ ImmOperand = CurDAG->getTargetGlobalAddress(
+ GA->getGlobal(), SDLoc(ImmOperand), ImmOperand.getValueType(),
+ CombinedOffset, GA->getTargetFlags());
+ } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(ImmOperand)) {
+ // Ditto.
+ Align Alignment = CP->getAlign();
+ if (Offset2 != 0 && Alignment <= Offset2)
+ return false;
+ int64_t Offset1 = CP->getOffset();
+ int64_t CombinedOffset = Offset1 + Offset2;
+ ImmOperand = CurDAG->getTargetConstantPool(
+ CP->getConstVal(), ImmOperand.getValueType(), CP->getAlign(),
+ CombinedOffset, CP->getTargetFlags());
+ } else {
+ return false;
+ }
- SDValue ImmOperand = Base.getOperand(1);
- uint64_t Offset2 = N->getConstantOperandVal(OffsetOpIdx);
-
- if (auto *Const = dyn_cast<ConstantSDNode>(ImmOperand)) {
- int64_t Offset1 = Const->getSExtValue();
- int64_t CombinedOffset = Offset1 + Offset2;
- if (!isInt<12>(CombinedOffset))
- continue;
- ImmOperand = CurDAG->getTargetConstant(CombinedOffset, SDLoc(ImmOperand),
- ImmOperand.getValueType());
- } else if (auto *GA = dyn_cast<GlobalAddressSDNode>(ImmOperand)) {
- // If the off1 in (addi base, off1) is a global variable's address (its
- // low part, really), then we can rely on the alignment of that variable
- // to provide a margin of safety before off1 can overflow the 12 bits.
- // Check if off2 falls within that margin; if so off1+off2 can't overflow.
- const DataLayout &DL = CurDAG->getDataLayout();
- Align Alignment = GA->getGlobal()->getPointerAlignment(DL);
- if (Offset2 != 0 && Alignment <= Offset2)
- continue;
- int64_t Offset1 = GA->getOffset();
- int64_t CombinedOffset = Offset1 + Offset2;
- ImmOperand = CurDAG->getTargetGlobalAddress(
- GA->getGlobal(), SDLoc(ImmOperand), ImmOperand.getValueType(),
- CombinedOffset, GA->getTargetFlags());
- } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(ImmOperand)) {
- // Ditto.
- Align Alignment = CP->getAlign();
- if (Offset2 != 0 && Alignment <= Offset2)
- continue;
- int64_t Offset1 = CP->getOffset();
- int64_t CombinedOffset = Offset1 + Offset2;
- ImmOperand = CurDAG->getTargetConstantPool(
- CP->getConstVal(), ImmOperand.getValueType(), CP->getAlign(),
- CombinedOffset, CP->getTargetFlags());
- } else {
- continue;
+ LLVM_DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: ");
+ LLVM_DEBUG(Base->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\nN: ");
+ LLVM_DEBUG(N->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
+
+ // Modify the offset operand of the load/store.
+ if (BaseOpIdx == 0) // Load
+ CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand,
+ N->getOperand(2));
+ else // Store
+ CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
+ ImmOperand, N->getOperand(3));
+
+ return true;
+}
+
+// Try to remove sext.w if the input is a W instruction or can be made into
+// a W instruction cheaply.
+bool RISCVDAGToDAGISel::doPeepholeSExtW(SDNode *N) {
+ // Look for the sext.w pattern, addiw rd, rs1, 0.
+ if (N->getMachineOpcode() != RISCV::ADDIW ||
+ !isNullConstant(N->getOperand(1)))
+ return false;
+
+ SDValue N0 = N->getOperand(0);
+ if (!N0.isMachineOpcode())
+ return false;
+
+ switch (N0.getMachineOpcode()) {
+ default:
+ break;
+ case RISCV::ADD:
+ case RISCV::ADDI:
+ case RISCV::SUB:
+ case RISCV::MUL:
+ case RISCV::SLLI: {
+ // Convert sext.w+add/sub/mul to their W instructions. This will create
+ // a new independent instruction. This improves latency.
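+ // e.g. (addiw (add rs1, rs2), 0) becomes (addw rs1, rs2); the original add
+ // is left behind for any users that still need all XLen bits.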
+ unsigned Opc;
+ switch (N0.getMachineOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+ case RISCV::ADD: Opc = RISCV::ADDW; break;
+ case RISCV::ADDI: Opc = RISCV::ADDIW; break;
+ case RISCV::SUB: Opc = RISCV::SUBW; break;
+ case RISCV::MUL: Opc = RISCV::MULW; break;
+ case RISCV::SLLI: Opc = RISCV::SLLIW; break;
}
- LLVM_DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: ");
- LLVM_DEBUG(Base->dump(CurDAG));
- LLVM_DEBUG(dbgs() << "\nN: ");
- LLVM_DEBUG(N->dump(CurDAG));
- LLVM_DEBUG(dbgs() << "\n");
-
- // Modify the offset operand of the load/store.
- if (BaseOpIdx == 0) // Load
- CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand,
- N->getOperand(2));
- else // Store
- CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
- ImmOperand, N->getOperand(3));
-
- // The add-immediate may now be dead, in which case remove it.
- if (Base.getNode()->use_empty())
- CurDAG->RemoveDeadNode(Base.getNode());
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+
+ // Shift amount needs to be uimm5.
+ if (N0.getMachineOpcode() == RISCV::SLLI &&
+ !isUInt<5>(cast<ConstantSDNode>(N01)->getSExtValue()))
+ break;
+
+ SDNode *Result =
+ CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
+ N00, N01);
+ ReplaceUses(N, Result);
+ return true;
+ }
+ case RISCV::ADDW:
+ case RISCV::ADDIW:
+ case RISCV::SUBW:
+ case RISCV::MULW:
+ case RISCV::SLLIW:
+ // Result is already sign extended, so just remove the sext.w.
+ // NOTE: We only handle the nodes that are selected with hasAllWUsers.
+ ReplaceUses(N, N0.getNode());
+ return true;
}
+
+ return false;
}
// This pass converts a legalized DAG into a RISCV-specific DAG, ready
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index 56d072206316..a2770089995d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -58,6 +58,10 @@ public:
bool selectSExti32(SDValue N, SDValue &Val);
bool selectZExti32(SDValue N, SDValue &Val);
+ bool hasAllNBitUsers(SDNode *Node, unsigned Bits) const;
+ bool hasAllHUsers(SDNode *Node) const { return hasAllNBitUsers(Node, 16); }
+ bool hasAllWUsers(SDNode *Node) const { return hasAllNBitUsers(Node, 32); }
+
bool selectVLOp(SDValue N, SDValue &VL);
bool selectVSplat(SDValue N, SDValue &SplatVal);
@@ -75,7 +79,7 @@ public:
const SDLoc &DL, unsigned CurOp,
bool IsMasked, bool IsStridedOrIndexed,
SmallVectorImpl<SDValue> &Operands,
- MVT *IndexVT = nullptr);
+ bool IsLoad = false, MVT *IndexVT = nullptr);
void selectVLSEG(SDNode *Node, bool IsMasked, bool IsStrided);
void selectVLSEGFF(SDNode *Node, bool IsMasked);
@@ -83,11 +87,34 @@ public:
void selectVSSEG(SDNode *Node, bool IsMasked, bool IsStrided);
void selectVSXSEG(SDNode *Node, bool IsMasked, bool IsOrdered);
+ // Return the RISC-V condition code that matches the given DAG integer
+ // condition code. The CondCode must be one of those supported by the RISC-V
+ // ISA (see translateSetCCForBranch).
+ static RISCVCC::CondCode getRISCVCCForIntCC(ISD::CondCode CC) {
+ switch (CC) {
+ default:
+ llvm_unreachable("Unsupported CondCode");
+ case ISD::SETEQ:
+ return RISCVCC::COND_EQ;
+ case ISD::SETNE:
+ return RISCVCC::COND_NE;
+ case ISD::SETLT:
+ return RISCVCC::COND_LT;
+ case ISD::SETGE:
+ return RISCVCC::COND_GE;
+ case ISD::SETULT:
+ return RISCVCC::COND_LTU;
+ case ISD::SETUGE:
+ return RISCVCC::COND_GEU;
+ }
+ }
+
// Include the pieces autogenerated from the target description.
#include "RISCVGenDAGISel.inc"
private:
- void doPeepholeLoadStoreADDI();
+ bool doPeepholeLoadStoreADDI(SDNode *Node);
+ bool doPeepholeSExtW(SDNode *Node);
};
namespace RISCV {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index d37ed584d9d2..0f1a6e5f9154 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -20,6 +20,7 @@
#include "RISCVTargetMachine.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -28,8 +29,9 @@
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
-#include "llvm/IR/IntrinsicsRISCV.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
@@ -107,7 +109,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
static const MVT::SimpleValueType F64VecVTs[] = {
MVT::nxv1f64, MVT::nxv2f64, MVT::nxv4f64, MVT::nxv8f64};
- if (Subtarget.hasStdExtV()) {
+ if (Subtarget.hasVInstructions()) {
auto addRegClassForRVV = [this](MVT VT) {
unsigned Size = VT.getSizeInBits().getKnownMinValue();
assert(Size <= 512 && isPowerOf2_32(Size));
@@ -126,18 +128,22 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
for (MVT VT : BoolVecVTs)
addRegClassForRVV(VT);
- for (MVT VT : IntVecVTs)
+ for (MVT VT : IntVecVTs) {
+ if (VT.getVectorElementType() == MVT::i64 &&
+ !Subtarget.hasVInstructionsI64())
+ continue;
addRegClassForRVV(VT);
+ }
- if (Subtarget.hasStdExtZfh())
+ if (Subtarget.hasVInstructionsF16())
for (MVT VT : F16VecVTs)
addRegClassForRVV(VT);
- if (Subtarget.hasStdExtF())
+ if (Subtarget.hasVInstructionsF32())
for (MVT VT : F32VecVTs)
addRegClassForRVV(VT);
- if (Subtarget.hasStdExtD())
+ if (Subtarget.hasVInstructionsF64())
for (MVT VT : F64VecVTs)
addRegClassForRVV(VT);
@@ -199,6 +205,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::USUBO, MVT::i32, Custom);
setOperationAction(ISD::UADDSAT, MVT::i32, Custom);
setOperationAction(ISD::USUBSAT, MVT::i32, Custom);
+ } else {
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
+ setLibcallName(RTLIB::MUL_I128, nullptr);
+ setLibcallName(RTLIB::MULO_I64, nullptr);
}
if (!Subtarget.hasStdExtM()) {
@@ -299,14 +311,14 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, XLenVT, Custom);
}
- ISD::CondCode FPCCToExpand[] = {
+ static const ISD::CondCode FPCCToExpand[] = {
ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGT,
ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE, ISD::SETGT,
ISD::SETGE, ISD::SETNE, ISD::SETO, ISD::SETUO};
- ISD::NodeType FPOpToExpand[] = {
- ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW, ISD::FREM, ISD::FP16_TO_FP,
- ISD::FP_TO_FP16};
+ static const ISD::NodeType FPOpToExpand[] = {
+ ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW,
+ ISD::FREM, ISD::FP16_TO_FP, ISD::FP_TO_FP16};
if (Subtarget.hasStdExtZfh())
setOperationAction(ISD::BITCAST, MVT::i16, Custom);
@@ -325,6 +337,15 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BR_CC, MVT::f16, Expand);
for (auto Op : FPOpToExpand)
setOperationAction(Op, MVT::f16, Expand);
+
+ setOperationAction(ISD::FREM, MVT::f16, Promote);
+ setOperationAction(ISD::FCEIL, MVT::f16, Promote);
+ setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
+ setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
+ setOperationAction(ISD::FRINT, MVT::f16, Promote);
+ setOperationAction(ISD::FROUND, MVT::f16, Promote);
+ setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote);
+ setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
}
if (Subtarget.hasStdExtF()) {
@@ -376,6 +397,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
}
if (Subtarget.hasStdExtF()) {
+ setOperationAction(ISD::FP_TO_UINT_SAT, XLenVT, Custom);
+ setOperationAction(ISD::FP_TO_SINT_SAT, XLenVT, Custom);
+
setOperationAction(ISD::FLT_ROUNDS_, XLenVT, Custom);
setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
}
@@ -407,7 +431,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setBooleanContents(ZeroOrOneBooleanContent);
- if (Subtarget.hasStdExtV()) {
+ if (Subtarget.hasVInstructions()) {
setBooleanVectorContents(ZeroOrOneBooleanContent);
setOperationAction(ISD::VSCALE, XLenVT, Custom);
@@ -426,14 +450,21 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
}
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
-
- static unsigned IntegerVPOps[] = {
- ISD::VP_ADD, ISD::VP_SUB, ISD::VP_MUL, ISD::VP_SDIV, ISD::VP_UDIV,
- ISD::VP_SREM, ISD::VP_UREM, ISD::VP_AND, ISD::VP_OR, ISD::VP_XOR,
- ISD::VP_ASHR, ISD::VP_LSHR, ISD::VP_SHL};
-
- static unsigned FloatingPointVPOps[] = {ISD::VP_FADD, ISD::VP_FSUB,
- ISD::VP_FMUL, ISD::VP_FDIV};
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+
+ static const unsigned IntegerVPOps[] = {
+ ISD::VP_ADD, ISD::VP_SUB, ISD::VP_MUL,
+ ISD::VP_SDIV, ISD::VP_UDIV, ISD::VP_SREM,
+ ISD::VP_UREM, ISD::VP_AND, ISD::VP_OR,
+ ISD::VP_XOR, ISD::VP_ASHR, ISD::VP_LSHR,
+ ISD::VP_SHL, ISD::VP_REDUCE_ADD, ISD::VP_REDUCE_AND,
+ ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR, ISD::VP_REDUCE_SMAX,
+ ISD::VP_REDUCE_SMIN, ISD::VP_REDUCE_UMAX, ISD::VP_REDUCE_UMIN};
+
+ static const unsigned FloatingPointVPOps[] = {
+ ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL,
+ ISD::VP_FDIV, ISD::VP_REDUCE_FADD, ISD::VP_REDUCE_SEQ_FADD,
+ ISD::VP_REDUCE_FMIN, ISD::VP_REDUCE_FMAX};
if (!Subtarget.is64Bit()) {
// We must custom-lower certain vXi64 operations on RV32 due to the vector
@@ -449,6 +480,15 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_SMIN, MVT::i64, Custom);
setOperationAction(ISD::VECREDUCE_UMAX, MVT::i64, Custom);
setOperationAction(ISD::VECREDUCE_UMIN, MVT::i64, Custom);
+
+ setOperationAction(ISD::VP_REDUCE_ADD, MVT::i64, Custom);
+ setOperationAction(ISD::VP_REDUCE_AND, MVT::i64, Custom);
+ setOperationAction(ISD::VP_REDUCE_OR, MVT::i64, Custom);
+ setOperationAction(ISD::VP_REDUCE_XOR, MVT::i64, Custom);
+ setOperationAction(ISD::VP_REDUCE_SMAX, MVT::i64, Custom);
+ setOperationAction(ISD::VP_REDUCE_SMIN, MVT::i64, Custom);
+ setOperationAction(ISD::VP_REDUCE_UMAX, MVT::i64, Custom);
+ setOperationAction(ISD::VP_REDUCE_UMIN, MVT::i64, Custom);
}
for (MVT VT : BoolVecVTs) {
@@ -471,6 +511,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+ setOperationAction(ISD::VP_REDUCE_AND, VT, Custom);
+ setOperationAction(ISD::VP_REDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VP_REDUCE_XOR, VT, Custom);
+
// RVV has native int->float & float->int conversions where the
// element type sizes are within one power-of-two of each other. Any
// wider distances between type sizes have to be lowered as sequences
@@ -491,9 +535,17 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
}
for (MVT VT : IntVecVTs) {
+ if (VT.getVectorElementType() == MVT::i64 &&
+ !Subtarget.hasVInstructionsI64())
+ continue;
+
setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
setOperationAction(ISD::SPLAT_VECTOR_PARTS, VT, Custom);
+ // Vectors implement MULHS/MULHU.
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
@@ -502,6 +554,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
+ setOperationAction(ISD::CTTZ, VT, Expand);
+ setOperationAction(ISD::CTLZ, VT, Expand);
+ setOperationAction(ISD::CTPOP, VT, Expand);
+
+ setOperationAction(ISD::BSWAP, VT, Expand);
+
// Custom-lower extensions and truncations from/to mask types.
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
@@ -551,6 +609,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
+ setOperationAction(ISD::VP_LOAD, VT, Custom);
+ setOperationAction(ISD::VP_STORE, VT, Custom);
+ setOperationAction(ISD::VP_GATHER, VT, Custom);
+ setOperationAction(ISD::VP_SCATTER, VT, Custom);
+
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
@@ -567,6 +630,18 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::SEXTLOAD, OtherVT, VT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, OtherVT, VT, Expand);
}
+
+ // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if we have a floating point
+ // type that can represent the value exactly.
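+ // (If the conversion to FP is exact, the exponent field of the result
+ // encodes floor(log2(x)), from which the zero count can be derived.)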
+ if (VT.getVectorElementType() != MVT::i64) {
+ MVT FloatEltVT =
+ VT.getVectorElementType() == MVT::i32 ? MVT::f64 : MVT::f32;
+ EVT FloatVT = MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount());
+ if (isTypeLegal(FloatVT)) {
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
+ }
+ }
}
// Expand various CCs to best match the RVV ISA, which natively supports UNE
@@ -576,7 +651,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// and we pattern-match those back to the "original", swapping operands once
// more. This way we catch both operations and both "vf" and "fv" forms with
// fewer patterns.
- ISD::CondCode VFPCCToExpand[] = {
+ static const ISD::CondCode VFPCCToExpand[] = {
ISD::SETO, ISD::SETONE, ISD::SETUEQ, ISD::SETUGT,
ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUO,
ISD::SETGT, ISD::SETOGT, ISD::SETGE, ISD::SETOGE,
@@ -605,6 +680,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
+
setOperationAction(ISD::FCOPYSIGN, VT, Legal);
setOperationAction(ISD::LOAD, VT, Custom);
@@ -615,6 +691,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
+ setOperationAction(ISD::VP_LOAD, VT, Custom);
+ setOperationAction(ISD::VP_STORE, VT, Custom);
+ setOperationAction(ISD::VP_GATHER, VT, Custom);
+ setOperationAction(ISD::VP_SCATTER, VT, Custom);
+
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SELECT_CC, VT, Expand);
@@ -638,18 +719,18 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
}
};
- if (Subtarget.hasStdExtZfh())
+ if (Subtarget.hasVInstructionsF16())
for (MVT VT : F16VecVTs)
SetCommonVFPActions(VT);
for (MVT VT : F32VecVTs) {
- if (Subtarget.hasStdExtF())
+ if (Subtarget.hasVInstructionsF32())
SetCommonVFPActions(VT);
SetCommonVFPExtLoadTruncStoreActions(VT, F16VecVTs);
}
for (MVT VT : F64VecVTs) {
- if (Subtarget.hasStdExtD())
+ if (Subtarget.hasVInstructionsF64())
SetCommonVFPActions(VT);
SetCommonVFPExtLoadTruncStoreActions(VT, F16VecVTs);
SetCommonVFPExtLoadTruncStoreActions(VT, F32VecVTs);
@@ -695,6 +776,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+ setOperationAction(ISD::VP_REDUCE_AND, VT, Custom);
+ setOperationAction(ISD::VP_REDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VP_REDUCE_XOR, VT, Custom);
+
setOperationAction(ISD::SINT_TO_FP, VT, Custom);
setOperationAction(ISD::UINT_TO_FP, VT, Custom);
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
@@ -724,6 +809,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MSTORE, VT, Custom);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
+
+ setOperationAction(ISD::VP_LOAD, VT, Custom);
+ setOperationAction(ISD::VP_STORE, VT, Custom);
+ setOperationAction(ISD::VP_GATHER, VT, Custom);
+ setOperationAction(ISD::VP_SCATTER, VT, Custom);
+
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
@@ -769,6 +860,19 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
for (unsigned VPOpc : IntegerVPOps)
setOperationAction(VPOpc, VT, Custom);
+
+ // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if we have a floating point
+ // type that can represent the value exactly.
+ if (VT.getVectorElementType() != MVT::i64) {
+ MVT FloatEltVT =
+ VT.getVectorElementType() == MVT::i32 ? MVT::f64 : MVT::f32;
+ EVT FloatVT =
+ MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount());
+ if (isTypeLegal(FloatVT)) {
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
+ }
+ }
}
for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) {
@@ -788,6 +892,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -798,6 +903,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MSTORE, VT, Custom);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
+
+ setOperationAction(ISD::VP_LOAD, VT, Custom);
+ setOperationAction(ISD::VP_STORE, VT, Custom);
+ setOperationAction(ISD::VP_GATHER, VT, Custom);
+ setOperationAction(ISD::VP_SCATTER, VT, Custom);
+
setOperationAction(ISD::FADD, VT, Custom);
setOperationAction(ISD::FSUB, VT, Custom);
setOperationAction(ISD::FMUL, VT, Custom);
@@ -852,21 +963,23 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// Jumps are expensive, compared to logic
setJumpIsExpensive();
- // We can use any register for comparisons
- setHasMultipleConditionRegisters();
-
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
- if (Subtarget.hasStdExtV()) {
+ if (Subtarget.hasVInstructions()) {
setTargetDAGCombine(ISD::FCOPYSIGN);
setTargetDAGCombine(ISD::MGATHER);
setTargetDAGCombine(ISD::MSCATTER);
+ setTargetDAGCombine(ISD::VP_GATHER);
+ setTargetDAGCombine(ISD::VP_SCATTER);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::STORE);
}
}
@@ -875,7 +988,7 @@ EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL,
EVT VT) const {
if (!VT.isVector())
return getPointerTy(DL);
- if (Subtarget.hasStdExtV() &&
+ if (Subtarget.hasVInstructions() &&
(VT.isScalableVector() || Subtarget.useRVVForFixedLengthVectors()))
return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
return VT.changeVectorElementTypeToInteger();
@@ -889,6 +1002,7 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const {
+ auto &DL = I.getModule()->getDataLayout();
switch (Intrinsic) {
default:
return false;
@@ -911,6 +1025,25 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MOVolatile;
return true;
}
+ case Intrinsic::riscv_masked_strided_load:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = I.getArgOperand(1);
+ Info.memVT = getValueType(DL, I.getType()->getScalarType());
+ Info.align = Align(DL.getTypeSizeInBits(I.getType()->getScalarType()) / 8);
+ Info.size = MemoryLocation::UnknownSize;
+ Info.flags |= MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::riscv_masked_strided_store:
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.ptrVal = I.getArgOperand(1);
+ Info.memVT =
+ getValueType(DL, I.getArgOperand(0)->getType()->getScalarType());
+ Info.align = Align(
+ DL.getTypeSizeInBits(I.getArgOperand(0)->getType()->getScalarType()) /
+ 8);
+ Info.size = MemoryLocation::UnknownSize;
+ Info.flags |= MachineMemOperand::MOStore;
+ return true;
}
}
@@ -994,9 +1127,91 @@ bool RISCVTargetLowering::isCheapToSpeculateCtlz() const {
return Subtarget.hasStdExtZbb();
}
+bool RISCVTargetLowering::hasAndNot(SDValue Y) const {
+ EVT VT = Y.getValueType();
+
+ // FIXME: Support vectors once we have tests.
+ if (VT.isVector())
+ return false;
+
+ return Subtarget.hasStdExtZbb() && !isa<ConstantSDNode>(Y);
+}
+
+/// Check if sinking \p I's operands to I's basic block is profitable, because
+/// the operands can be folded into a target instruction, e.g.
+/// splats of scalars can fold into vector instructions.
+bool RISCVTargetLowering::shouldSinkOperands(
+ Instruction *I, SmallVectorImpl<Use *> &Ops) const {
+ using namespace llvm::PatternMatch;
+
+ if (!I->getType()->isVectorTy() || !Subtarget.hasVInstructions())
+ return false;
+
+ auto IsSinker = [&](Instruction *I, int Operand) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return true;
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ return Operand == 1;
+ case Instruction::Call:
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::fma:
+ return Operand == 0 || Operand == 1;
+ default:
+ return false;
+ }
+ }
+ return false;
+ default:
+ return false;
+ }
+ };
+
+ for (auto OpIdx : enumerate(I->operands())) {
+ if (!IsSinker(I, OpIdx.index()))
+ continue;
+
+ Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
+ // Make sure we are not already sinking this operand
+ if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
+ continue;
+
+ // We are looking for a splat that can be sunk.
+ if (!match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
+ m_Undef(), m_ZeroMask())))
+ continue;
+
+ // All uses of the shuffle should be sunk to avoid duplicating it across gpr
+ // and vector registers
+ for (Use &U : Op->uses()) {
+ Instruction *Insn = cast<Instruction>(U.getUser());
+ if (!IsSinker(Insn, U.getOperandNo()))
+ return false;
+ }
+
+ Ops.push_back(&Op->getOperandUse(0));
+ Ops.push_back(&OpIdx.value());
+ }
+ return true;
+}
+
bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
- if (VT == MVT::f16 && !Subtarget.hasStdExtZfh())
+ if (VT == MVT::f16 && !Subtarget.hasStdExtZfhmin())
return false;
if (VT == MVT::f32 && !Subtarget.hasStdExtF())
return false;
@@ -1016,9 +1231,9 @@ bool RISCVTargetLowering::hasBitPreservingFPLogic(EVT VT) const {
MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- // Use f32 to pass f16 if it is legal and Zfh is not enabled. We might still
- // end up using a GPR but that will be decided based on ABI.
- if (VT == MVT::f16 && Subtarget.hasStdExtF() && !Subtarget.hasStdExtZfh())
+ // Use f32 to pass f16 if it is legal and Zfhmin/Zfh is not enabled.
+ // We might still end up using a GPR but that will be decided based on ABI.
+ if (VT == MVT::f16 && Subtarget.hasStdExtF() && !Subtarget.hasStdExtZfhmin())
return MVT::f32;
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
@@ -1027,9 +1242,9 @@ MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
unsigned RISCVTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- // Use f32 to pass f16 if it is legal and Zfh is not enabled. We might still
- // end up using a GPR but that will be decided based on ABI.
- if (VT == MVT::f16 && Subtarget.hasStdExtF() && !Subtarget.hasStdExtZfh())
+ // Use f32 to pass f16 if it is legal and Zfhmin/Zfh is not enabled.
+ // We might still end up using a GPR but that will be decided based on ABI.
+ if (VT == MVT::f16 && Subtarget.hasStdExtF() && !Subtarget.hasStdExtZfhmin())
return 1;
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
@@ -1068,28 +1283,6 @@ static void translateSetCCForBranch(const SDLoc &DL, SDValue &LHS, SDValue &RHS,
}
}
-// Return the RISC-V branch opcode that matches the given DAG integer
-// condition code. The CondCode must be one of those supported by the RISC-V
-// ISA (see translateSetCCForBranch).
-static unsigned getBranchOpcodeForIntCondCode(ISD::CondCode CC) {
- switch (CC) {
- default:
- llvm_unreachable("Unsupported CondCode");
- case ISD::SETEQ:
- return RISCV::BEQ;
- case ISD::SETNE:
- return RISCV::BNE;
- case ISD::SETLT:
- return RISCV::BLT;
- case ISD::SETGE:
- return RISCV::BGE;
- case ISD::SETULT:
- return RISCV::BLTU;
- case ISD::SETUGE:
- return RISCV::BGEU;
- }
-}
-
RISCVII::VLMUL RISCVTargetLowering::getLMUL(MVT VT) {
assert(VT.isScalableVector() && "Expecting a scalable vector type");
unsigned KnownSize = VT.getSizeInBits().getKnownMinValue();
@@ -1206,6 +1399,27 @@ bool RISCVTargetLowering::mergeStoresAfterLegalization(EVT VT) const {
(VT.isFixedLengthVector() && VT.getVectorElementType() == MVT::i1);
}
+bool RISCVTargetLowering::isLegalElementTypeForRVV(Type *ScalarTy) const {
+ if (ScalarTy->isPointerTy())
+ return true;
+
+ if (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
+ ScalarTy->isIntegerTy(32))
+ return true;
+
+ if (ScalarTy->isIntegerTy(64))
+ return Subtarget.hasVInstructionsI64();
+
+ if (ScalarTy->isHalfTy())
+ return Subtarget.hasVInstructionsF16();
+ if (ScalarTy->isFloatTy())
+ return Subtarget.hasVInstructionsF32();
+ if (ScalarTy->isDoubleTy())
+ return Subtarget.hasVInstructionsF64();
+
+ return false;
+}
+
static bool useRVVForFixedLengthVectorVT(MVT VT,
const RISCVSubtarget &Subtarget) {
assert(VT.isFixedLengthVector() && "Expected a fixed length vector type!");
@@ -1221,8 +1435,10 @@ static bool useRVVForFixedLengthVectorVT(MVT VT,
unsigned MinVLen = Subtarget.getMinRVVVectorSizeInBits();
+ MVT EltVT = VT.getVectorElementType();
+
// Don't use RVV for vectors we cannot scalarize if required.
- switch (VT.getVectorElementType().SimpleTy) {
+ switch (EltVT.SimpleTy) {
// i1 is supported but has different rules.
default:
return false;
@@ -1235,22 +1451,29 @@ static bool useRVVForFixedLengthVectorVT(MVT VT,
case MVT::i8:
case MVT::i16:
case MVT::i32:
+ break;
case MVT::i64:
+ if (!Subtarget.hasVInstructionsI64())
+ return false;
break;
case MVT::f16:
- if (!Subtarget.hasStdExtZfh())
+ if (!Subtarget.hasVInstructionsF16())
return false;
break;
case MVT::f32:
- if (!Subtarget.hasStdExtF())
+ if (!Subtarget.hasVInstructionsF32())
return false;
break;
case MVT::f64:
- if (!Subtarget.hasStdExtD())
+ if (!Subtarget.hasVInstructionsF64())
return false;
break;
}
+ // Reject elements larger than ELEN.
+ if (EltVT.getSizeInBits() > Subtarget.getMaxELENForFixedLengthVectors())
+ return false;
+
unsigned LMul = divideCeil(VT.getSizeInBits(), MinVLen);
// Don't use RVV for types that don't fit.
if (LMul > Subtarget.getMaxLMULForFixedLengthVectors())
@@ -1277,6 +1500,7 @@ static MVT getContainerForFixedLengthVector(const TargetLowering &TLI, MVT VT,
"Expected legal fixed length vector!");
unsigned MinVLen = Subtarget.getMinRVVVectorSizeInBits();
+ unsigned MaxELen = Subtarget.getMaxELENForFixedLengthVectors();
MVT EltVT = VT.getVectorElementType();
switch (EltVT.SimpleTy) {
@@ -1291,10 +1515,12 @@ static MVT getContainerForFixedLengthVector(const TargetLowering &TLI, MVT VT,
case MVT::f32:
case MVT::f64: {
// We prefer to use LMUL=1 for VLEN sized types. Use fractional lmuls for
- // narrower types, but we can't have a fractional LMUL with demoninator less
- // than 64/SEW.
+ // narrower types. The smallest fractional LMUL we support is 8/ELEN. Within
+ // each fractional LMUL we support SEW between 8 and LMUL*ELEN.
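+ // e.g. with ELEN=64 the smallest container uses LMUL=1/8 and therefore only
+ // supports SEW=8.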
unsigned NumElts =
- divideCeil(VT.getVectorNumElements(), MinVLen / RISCV::RVVBitsPerBlock);
+ (VT.getVectorNumElements() * RISCV::RVVBitsPerBlock) / MinVLen;
+ NumElts = std::max(NumElts, RISCV::RVVBitsPerBlock / MaxELen);
+ assert(isPowerOf2_32(NumElts) && "Expected power of 2 NumElts");
return MVT::getScalableVectorVT(EltVT, NumElts);
}
}
@@ -1344,7 +1570,7 @@ getDefaultVLOps(MVT VecVT, MVT ContainerVT, SDLoc DL, SelectionDAG &DAG,
MVT XLenVT = Subtarget.getXLenVT();
SDValue VL = VecVT.isFixedLengthVector()
? DAG.getConstant(VecVT.getVectorNumElements(), DL, XLenVT)
- : DAG.getRegister(RISCV::X0, XLenVT);
+ : DAG.getTargetConstant(RISCV::VLMaxSentinel, DL, XLenVT);
MVT MaskVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
return {Mask, VL};
@@ -1379,6 +1605,32 @@ bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
return false;
}
+static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) {
+ // RISCV FP-to-int conversions saturate to the destination register size, but
+ // don't produce 0 for NaN. We can use a conversion instruction and fix the
+ // NaN case with a compare and a select.
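+ // i.e. produce select (setuo src, src), 0, (fcvt src); the unordered
+ // self-compare is true exactly when src is a NaN.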
+ SDValue Src = Op.getOperand(0);
+
+ EVT DstVT = Op.getValueType();
+ EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+
+ bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
+ unsigned Opc;
+ if (SatVT == DstVT)
+ Opc = IsSigned ? RISCVISD::FCVT_X_RTZ : RISCVISD::FCVT_XU_RTZ;
+ else if (DstVT == MVT::i64 && SatVT == MVT::i32)
+ Opc = IsSigned ? RISCVISD::FCVT_W_RTZ_RV64 : RISCVISD::FCVT_WU_RTZ_RV64;
+ else
+ return SDValue();
+ // FIXME: Support other SatVTs by clamping before or after the conversion.
+
+ SDLoc DL(Op);
+ SDValue FpToInt = DAG.getNode(Opc, DL, DstVT, Src);
+
+ SDValue ZeroInt = DAG.getConstant(0, DL, DstVT);
+ return DAG.getSelectCC(DL, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
+}
+
static SDValue lowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
@@ -1397,13 +1649,18 @@ static SDValue lowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG,
}
struct VIDSequence {
- int64_t Step;
+ int64_t StepNumerator;
+ unsigned StepDenominator;
int64_t Addend;
};
// Try to match an arithmetic-sequence BUILD_VECTOR [X,X+S,X+2*S,...,X+(N-1)*S]
// to the (non-zero) step S and start value X. This can be then lowered as the
// RVV sequence (VID * S) + X, for example.
+// The step S is represented as an integer numerator divided by a positive
+// denominator. Note that the implementation currently only identifies
+// sequences in which either the numerator is +/- 1 or the denominator is 1. It
+// cannot detect 2/3, for example.
// Note that this method will also match potentially unappealing index
// sequences, like <i32 0, i32 50939494>, however it is left to the caller to
// determine whether this is worth generating code for.
@@ -1413,7 +1670,8 @@ static Optional<VIDSequence> isSimpleVIDSequence(SDValue Op) {
if (!Op.getValueType().isInteger())
return None;
- Optional<int64_t> SeqStep, SeqAddend;
+ Optional<unsigned> SeqStepDenom;
+ Optional<int64_t> SeqStepNum, SeqAddend;
Optional<std::pair<uint64_t, unsigned>> PrevElt;
unsigned EltSizeInBits = Op.getValueType().getScalarSizeInBits();
for (unsigned Idx = 0; Idx < NumElts; Idx++) {
@@ -1431,26 +1689,40 @@ static Optional<VIDSequence> isSimpleVIDSequence(SDValue Op) {
if (PrevElt) {
// Calculate the step since the last non-undef element, and ensure
// it's consistent across the entire sequence.
- int64_t Diff = SignExtend64(Val - PrevElt->first, EltSizeInBits);
- // The difference must cleanly divide the element span.
- if (Diff % (Idx - PrevElt->second) != 0)
- return None;
- int64_t Step = Diff / (Idx - PrevElt->second);
- // A zero step indicates we're either a not an index sequence, or we
- // have a fractional step. This must be handled by a more complex
- // pattern recognition (undefs complicate things here).
- if (Step == 0)
- return None;
- if (!SeqStep)
- SeqStep = Step;
- else if (Step != SeqStep)
- return None;
+ unsigned IdxDiff = Idx - PrevElt->second;
+ int64_t ValDiff = SignExtend64(Val - PrevElt->first, EltSizeInBits);
+
+ // A value difference of zero means that we're somewhere in the middle
+ // of a fractional step, e.g. <0,0,0*,0,1,1,1,1>. Wait until we notice a
+ // step change before evaluating the sequence.
+ if (ValDiff != 0) {
+ int64_t Remainder = ValDiff % IdxDiff;
+ // Normalize the step if it's greater than 1.
+ if (Remainder != ValDiff) {
+ // The difference must cleanly divide the element span.
+ if (Remainder != 0)
+ return None;
+ ValDiff /= IdxDiff;
+ IdxDiff = 1;
+ }
+
+ if (!SeqStepNum)
+ SeqStepNum = ValDiff;
+ else if (ValDiff != SeqStepNum)
+ return None;
+
+ if (!SeqStepDenom)
+ SeqStepDenom = IdxDiff;
+ else if (IdxDiff != *SeqStepDenom)
+ return None;
+ }
}
// Record and/or check any addend.
- if (SeqStep) {
- int64_t Addend =
- SignExtend64(Val - (Idx * (uint64_t)*SeqStep), EltSizeInBits);
+ if (SeqStepNum && SeqStepDenom) {
+ uint64_t ExpectedVal =
+ (int64_t)(Idx * (uint64_t)*SeqStepNum) / *SeqStepDenom;
+ int64_t Addend = SignExtend64(Val - ExpectedVal, EltSizeInBits);
if (!SeqAddend)
SeqAddend = Addend;
else if (SeqAddend != Addend)
@@ -1458,14 +1730,15 @@ static Optional<VIDSequence> isSimpleVIDSequence(SDValue Op) {
}
// Record this non-undef element for later.
- PrevElt = std::make_pair(Val, Idx);
+ if (!PrevElt || PrevElt->first != Val)
+ PrevElt = std::make_pair(Val, Idx);
}
// We need to have logged both a step and an addend for this to count as
// a legal index sequence.
- if (!SeqStep || !SeqAddend)
+ if (!SeqStepNum || !SeqStepDenom || !SeqAddend)
return None;
- return VIDSequence{*SeqStep, *SeqAddend};
+ return VIDSequence{*SeqStepNum, *SeqStepDenom, *SeqAddend};
}
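Two worked examples of what the extended matcher accepts, and one it still rejects (illustrative values, not taken from the patch):

//   <0, 0, 1, 1, 2, 2, 3, 3> -> StepNumerator 1, StepDenominator 2, Addend 0
//                               (lowered below as vid >> 1)
//   <1, 4, 7, 10>            -> StepNumerator 3, StepDenominator 1, Addend 1
//                               (lowered below as vid * 3 + 1)
//   <0, 0, 1, 2, 2, 3>       -> rejected: the 2/3 step needs both a non-unit
//                               numerator and a non-unit denominator.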
static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
@@ -1599,31 +1872,38 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
// with optional modifications. An all-undef vector is matched by
// getSplatValue, above.
if (auto SimpleVID = isSimpleVIDSequence(Op)) {
- int64_t Step = SimpleVID->Step;
+ int64_t StepNumerator = SimpleVID->StepNumerator;
+ unsigned StepDenominator = SimpleVID->StepDenominator;
int64_t Addend = SimpleVID->Addend;
    // Only emit VIDs with suitably-small steps/addends. We use imm5 as a
// threshold since it's the immediate value many RVV instructions accept.
- if (isInt<5>(Step) && isInt<5>(Addend)) {
+ if (isInt<5>(StepNumerator) && isPowerOf2_32(StepDenominator) &&
+ isInt<5>(Addend)) {
SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, ContainerVT, Mask, VL);
// Convert right out of the scalable type so we can use standard ISD
// nodes for the rest of the computation. If we used scalable types with
// these, we'd lose the fixed-length vector info and generate worse
// vsetvli code.
VID = convertFromScalableVector(VT, VID, DAG, Subtarget);
- assert(Step != 0 && "Invalid step");
+ assert(StepNumerator != 0 && "Invalid step");
bool Negate = false;
- if (Step != 1) {
- int64_t SplatStepVal = Step;
+ if (StepNumerator != 1) {
+ int64_t SplatStepVal = StepNumerator;
unsigned Opcode = ISD::MUL;
- if (isPowerOf2_64(std::abs(Step))) {
- Negate = Step < 0;
+ if (isPowerOf2_64(std::abs(StepNumerator))) {
+ Negate = StepNumerator < 0;
Opcode = ISD::SHL;
- SplatStepVal = Log2_64(std::abs(Step));
+ SplatStepVal = Log2_64(std::abs(StepNumerator));
}
SDValue SplatStep = DAG.getSplatVector(
VT, DL, DAG.getConstant(SplatStepVal, DL, XLenVT));
VID = DAG.getNode(Opcode, DL, VT, VID, SplatStep);
}
+ if (StepDenominator != 1) {
+ SDValue SplatStep = DAG.getSplatVector(
+ VT, DL, DAG.getConstant(Log2_64(StepDenominator), DL, XLenVT));
+ VID = DAG.getNode(ISD::SRL, DL, VT, VID, SplatStep);
+ }
if (Addend != 0 || Negate) {
SDValue SplatAddend =
DAG.getSplatVector(VT, DL, DAG.getConstant(Addend, DL, XLenVT));
@@ -1704,6 +1984,13 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
unsigned NumUndefElts =
count_if(Op->op_values(), [](const SDValue &V) { return V.isUndef(); });
+ // Track the number of scalar loads we know we'd be inserting, estimated as
+ // any non-zero floating-point constant. Other kinds of element are either
+ // already in registers or are materialized on demand. The threshold at which
+ // a vector load is more desirable than several scalar materialization and
+ // vector-insertion instructions is not known.
+ unsigned NumScalarLoads = 0;
+
for (SDValue V : Op->op_values()) {
if (V.isUndef())
continue;
@@ -1711,6 +1998,9 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
ValueCounts.insert(std::make_pair(V, 0));
unsigned &Count = ValueCounts[V];
+ if (auto *CFP = dyn_cast<ConstantFPSDNode>(V))
+ NumScalarLoads += !CFP->isExactlyValue(+0.0);
+
// Is this value dominant? In case of a tie, prefer the highest element as
// it's cheaper to insert near the beginning of a vector than it is at the
// end.
@@ -1726,7 +2016,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
// Don't perform this optimization when optimizing for size, since
// materializing elements and inserting them tends to cause code bloat.
- if (!DAG.shouldOptForSize() &&
+ if (!DAG.shouldOptForSize() && NumScalarLoads < NumElts &&
((MostCommonCount > DominantValueCountThreshold) ||
(ValueCounts.size() <= Log2_32(NumDefElts)))) {
// Start by splatting the most common element.
@@ -1926,6 +2216,10 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1);
bool InvertMask = IsSelect == SwapOps;
+ // Keep track of which non-undef indices are used by each LHS/RHS shuffle
+ // half.
+ DenseMap<int, unsigned> LHSIndexCounts, RHSIndexCounts;
+
// Now construct the mask that will be used by the vselect or blended
// vrgather operation. For vrgathers, construct the appropriate indices into
// each vector.
@@ -1940,6 +2234,10 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
GatherIndicesRHS.push_back(
IsLHSOrUndefIndex ? DAG.getUNDEF(XLenVT)
: DAG.getConstant(MaskIndex - NumElts, DL, XLenVT));
+ if (IsLHSOrUndefIndex && MaskIndex >= 0)
+ ++LHSIndexCounts[MaskIndex];
+ if (!IsLHSOrUndefIndex)
+ ++RHSIndexCounts[MaskIndex - NumElts];
}
}
@@ -1963,13 +2261,14 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
- unsigned GatherOpc = RISCVISD::VRGATHER_VV_VL;
+ unsigned GatherVXOpc = RISCVISD::VRGATHER_VX_VL;
+ unsigned GatherVVOpc = RISCVISD::VRGATHER_VV_VL;
MVT IndexVT = VT.changeTypeToInteger();
// Since we can't introduce illegal index types at this stage, use i16 and
// vrgatherei16 if the corresponding index type for plain vrgather is greater
// than XLenVT.
if (IndexVT.getScalarType().bitsGT(XLenVT)) {
- GatherOpc = RISCVISD::VRGATHEREI16_VV_VL;
+ GatherVVOpc = RISCVISD::VRGATHEREI16_VV_VL;
IndexVT = IndexVT.changeVectorElementType(MVT::i16);
}
@@ -1982,28 +2281,48 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
if (SDValue SplatValue = DAG.getSplatValue(V1, /*LegalTypes*/ true)) {
Gather = lowerScalarSplat(SplatValue, VL, ContainerVT, DL, DAG, Subtarget);
} else {
- SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
- LHSIndices =
- convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget);
-
V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
- Gather =
- DAG.getNode(GatherOpc, DL, ContainerVT, V1, LHSIndices, TrueMask, VL);
+ // If only one index is used, we can use a "splat" vrgather.
+ // TODO: We can splat the most-common index and fix up any stragglers, if
+ // that's beneficial.
+ if (LHSIndexCounts.size() == 1) {
+ int SplatIndex = LHSIndexCounts.begin()->getFirst();
+ Gather =
+ DAG.getNode(GatherVXOpc, DL, ContainerVT, V1,
+ DAG.getConstant(SplatIndex, DL, XLenVT), TrueMask, VL);
+ } else {
+ SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
+ LHSIndices =
+ convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget);
+
+ Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
+ TrueMask, VL);
+ }
}
// If a second vector operand is used by this shuffle, blend it in with an
// additional vrgather.
if (!V2.isUndef()) {
+ V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
+ // If only one index is used, we can use a "splat" vrgather.
+ // TODO: We can splat the most-common index and fix up any stragglers, if
+ // that's beneficial.
+ if (RHSIndexCounts.size() == 1) {
+ int SplatIndex = RHSIndexCounts.begin()->getFirst();
+ V2 = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
+ DAG.getConstant(SplatIndex, DL, XLenVT), TrueMask, VL);
+ } else {
+ SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
+ RHSIndices =
+ convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
+ V2 = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, TrueMask,
+ VL);
+ }
+
MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
SelectMask =
convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
- SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
- RHSIndices =
- convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
-
- V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
- V2 = DAG.getNode(GatherOpc, DL, ContainerVT, V2, RHSIndices, TrueMask, VL);
Gather = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, SelectMask, V2,
Gather, VL);
}
@@ -2026,6 +2345,57 @@ static SDValue getRVVFPExtendOrRound(SDValue Op, MVT VT, MVT ContainerVT,
return DAG.getNode(RVVOpc, DL, ContainerVT, Op, Mask, VL);
}
+// Lower CTLZ_ZERO_UNDEF or CTTZ_ZERO_UNDEF by converting to FP and extracting
+// the exponent.
+static SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ unsigned EltSize = VT.getScalarSizeInBits();
+ SDValue Src = Op.getOperand(0);
+ SDLoc DL(Op);
+
+ // We need a FP type that can represent the value.
+ // TODO: Use f16 for i8 when possible?
+ MVT FloatEltVT = EltSize == 32 ? MVT::f64 : MVT::f32;
+ MVT FloatVT = MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount());
+
+ // Legal types should have been checked in the RISCVTargetLowering
+ // constructor.
+ // TODO: Splitting may make sense in some cases.
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(FloatVT) &&
+ "Expected legal float type!");
+
+ // For CTTZ_ZERO_UNDEF, we need to extract the lowest set bit using X & -X.
+ // The trailing zero count is equal to log2 of this single bit value.
+ if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
+ SDValue Neg =
+ DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
+ Src = DAG.getNode(ISD::AND, DL, VT, Src, Neg);
+ }
+
+ // We have a legal FP type, convert to it.
+ SDValue FloatVal = DAG.getNode(ISD::UINT_TO_FP, DL, FloatVT, Src);
+ // Bitcast to integer and shift the exponent to the LSB.
+ EVT IntVT = FloatVT.changeVectorElementTypeToInteger();
+ SDValue Bitcast = DAG.getBitcast(IntVT, FloatVal);
+ unsigned ShiftAmt = FloatEltVT == MVT::f64 ? 52 : 23;
+ SDValue Shift = DAG.getNode(ISD::SRL, DL, IntVT, Bitcast,
+ DAG.getConstant(ShiftAmt, DL, IntVT));
+ // Truncate back to original type to allow vnsrl.
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Shift);
+ // The exponent contains log2 of the value in biased form.
+ unsigned ExponentBias = FloatEltVT == MVT::f64 ? 1023 : 127;
+
+ // For trailing zeros, we just need to subtract the bias.
+ if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF)
+ return DAG.getNode(ISD::SUB, DL, VT, Trunc,
+ DAG.getConstant(ExponentBias, DL, VT));
+
+ // For leading zeros, we need to remove the bias and convert from log2 to
+ // leading zeros. We can do this by subtracting from (Bias + (EltSize - 1)).
+ unsigned Adjust = ExponentBias + (EltSize - 1);
+ return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(Adjust, DL, VT), Trunc);
+}
+
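A standalone scalar sketch of the same exponent trick (my own code, assuming IEEE-754 doubles; it mirrors the UINT_TO_FP / shift / subtract sequence above but is not part of the patch):

#include <cassert>
#include <cstdint>
#include <cstring>

// Count trailing/leading zeros of a non-zero 32-bit value (the ZERO_UNDEF
// contract) by reading the exponent field of the value converted to double.
static unsigned cttzViaDouble(uint32_t X) {
  uint32_t LowBit = X & (~X + 1);          // isolate the lowest set bit
  double D = static_cast<double>(LowBit);  // exact for any 32-bit value
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));
  return unsigned(Bits >> 52) - 1023;      // unbiased exponent == log2(LowBit)
}

static unsigned ctlzViaDouble(uint32_t X) {
  double D = static_cast<double>(X);
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));
  return (1023 + 31) - unsigned(Bits >> 52);  // 31 - floor(log2(X))
}

int main() {
  assert(cttzViaDouble(16) == 4);   // 16 == 1 << 4
  assert(ctlzViaDouble(16) == 27);  // 32-bit clz of 16
  return 0;
}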
// While RVV has alignment restrictions, we should always be able to load as a
// legal equivalently-sized byte-typed vector instead. This method is
// responsible for re-expressing a ISD::LOAD via a correctly-aligned type. If
@@ -2132,7 +2502,9 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
// into a one-element vector of the result type, and perform a vector
// bitcast.
if (!Op0VT.isVector()) {
- auto BVT = EVT::getVectorVT(*DAG.getContext(), Op0VT, 1);
+ EVT BVT = EVT::getVectorVT(*DAG.getContext(), Op0VT, 1);
+ if (!isTypeLegal(BVT))
+ return SDValue();
return DAG.getBitcast(VT, DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, BVT,
DAG.getUNDEF(BVT), Op0,
DAG.getConstant(0, DL, XLenVT)));
@@ -2143,8 +2515,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
// thus: bitcast the vector to a one-element vector type whose element type
// is the same as the result type, and extract the first element.
if (!VT.isVector() && Op0VT.isFixedLengthVector()) {
- LLVMContext &Context = *DAG.getContext();
- SDValue BVec = DAG.getBitcast(EVT::getVectorVT(Context, VT, 1), Op0);
+ EVT BVT = EVT::getVectorVT(*DAG.getContext(), VT, 1);
+ if (!isTypeLegal(BVT))
+ return SDValue();
+ SDValue BVec = DAG.getBitcast(BVT, Op0);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, BVec,
DAG.getConstant(0, DL, XLenVT));
}
@@ -2166,6 +2540,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_W_CHAIN:
return LowerINTRINSIC_W_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_VOID:
+ return LowerINTRINSIC_VOID(Op, DAG);
case ISD::BSWAP:
case ISD::BITREVERSE: {
// Convert BSWAP/BITREVERSE to GREVI to enable GREVI combinining.
@@ -2479,6 +2855,9 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
Src = DAG.getNode(RVVOpc, DL, ContainerVT, Src, Mask, VL);
return convertFromScalableVector(VT, Src, DAG, Subtarget);
}
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ return lowerFP_TO_INT_SAT(Op, DAG);
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_SMAX:
@@ -2489,13 +2868,29 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::VECREDUCE_OR:
case ISD::VECREDUCE_XOR:
if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
- return lowerVectorMaskVECREDUCE(Op, DAG);
+ return lowerVectorMaskVecReduction(Op, DAG, /*IsVP*/ false);
return lowerVECREDUCE(Op, DAG);
case ISD::VECREDUCE_FADD:
case ISD::VECREDUCE_SEQ_FADD:
case ISD::VECREDUCE_FMIN:
case ISD::VECREDUCE_FMAX:
return lowerFPVECREDUCE(Op, DAG);
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_UMIN:
+ case ISD::VP_REDUCE_SMIN:
+ case ISD::VP_REDUCE_FADD:
+ case ISD::VP_REDUCE_SEQ_FADD:
+ case ISD::VP_REDUCE_FMIN:
+ case ISD::VP_REDUCE_FMAX:
+ return lowerVPREDUCE(Op, DAG);
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ if (Op.getOperand(1).getValueType().getVectorElementType() == MVT::i1)
+ return lowerVectorMaskVecReduction(Op, DAG, /*IsVP*/ true);
+ return lowerVPREDUCE(Op, DAG);
case ISD::INSERT_SUBVECTOR:
return lowerINSERT_SUBVECTOR(Op, DAG);
case ISD::EXTRACT_SUBVECTOR:
@@ -2538,9 +2933,11 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerFixedLengthVectorStoreToRVV(Op, DAG);
return Op;
case ISD::MLOAD:
- return lowerMLOAD(Op, DAG);
+ case ISD::VP_LOAD:
+ return lowerMaskedLoad(Op, DAG);
case ISD::MSTORE:
- return lowerMSTORE(Op, DAG);
+ case ISD::VP_STORE:
+ return lowerMaskedStore(Op, DAG);
case ISD::SETCC:
return lowerFixedLengthVectorSetccToRVV(Op, DAG);
case ISD::ADD:
@@ -2617,14 +3014,19 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerToScalableOp(Op, DAG, RISCVISD::FMAXNUM_VL);
case ISD::ABS:
return lowerABS(Op, DAG);
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTTZ_ZERO_UNDEF:
+ return lowerCTLZ_CTTZ_ZERO_UNDEF(Op, DAG);
case ISD::VSELECT:
return lowerFixedLengthVectorSelectToRVV(Op, DAG);
case ISD::FCOPYSIGN:
return lowerFixedLengthVectorFCOPYSIGNToRVV(Op, DAG);
case ISD::MGATHER:
- return lowerMGATHER(Op, DAG);
+ case ISD::VP_GATHER:
+ return lowerMaskedGather(Op, DAG);
case ISD::MSCATTER:
- return lowerMSCATTER(Op, DAG);
+ case ISD::VP_SCATTER:
+ return lowerMaskedScatter(Op, DAG);
case ISD::FLT_ROUNDS_:
return lowerGET_ROUNDING(Op, DAG);
case ISD::SET_ROUNDING:
@@ -2932,7 +3334,7 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG);
- SDValue TargetCC = DAG.getTargetConstant(CCVal, DL, XLenVT);
+ SDValue TargetCC = DAG.getCondCode(CCVal);
SDValue Ops[] = {LHS, RHS, TargetCC, TrueV, FalseV};
return DAG.getNode(RISCVISD::SELECT_CC, DL, Op.getValueType(), Ops);
}
@@ -2941,7 +3343,7 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// (select condv, truev, falsev)
// -> (riscvisd::select_cc condv, zero, setne, truev, falsev)
SDValue Zero = DAG.getConstant(0, DL, XLenVT);
- SDValue SetNE = DAG.getTargetConstant(ISD::SETNE, DL, XLenVT);
+ SDValue SetNE = DAG.getCondCode(ISD::SETNE);
SDValue Ops[] = {CondV, Zero, SetNE, TrueV, FalseV};
@@ -3200,7 +3602,7 @@ SDValue RISCVTargetLowering::lowerSPLAT_VECTOR_PARTS(SDValue Op,
// Fall back to use a stack store and stride x0 vector load. Use X0 as VL.
return DAG.getNode(RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL, DL, VecVT, Lo, Hi,
- DAG.getRegister(RISCV::X0, MVT::i64));
+ DAG.getTargetConstant(RISCV::VLMaxSentinel, DL, MVT::i64));
}
// Custom-lower extensions from mask vectors by using a vselect either with 1
@@ -3483,7 +3885,7 @@ static SDValue lowerVectorIntrinsicSplats(SDValue Op, SelectionDAG &DAG,
Op.getOpcode() == ISD::INTRINSIC_W_CHAIN) &&
"Unexpected opcode");
- if (!Subtarget.hasStdExtV())
+ if (!Subtarget.hasVInstructions())
return SDValue();
bool HasChain = Op.getOpcode() == ISD::INTRINSIC_W_CHAIN;
@@ -3645,7 +4047,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::riscv_vslide1down_mask: {
// We need to special case these when the scalar is larger than XLen.
unsigned NumOps = Op.getNumOperands();
- bool IsMasked = NumOps == 6;
+ bool IsMasked = NumOps == 7;
unsigned OpOffset = IsMasked ? 1 : 0;
SDValue Scalar = Op.getOperand(2 + OpOffset);
if (Scalar.getValueType().bitsLE(XLenVT))
@@ -3670,7 +4072,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
DAG.getConstant(1, DL, XLenVT));
// Double the VL since we halved SEW.
- SDValue VL = Op.getOperand(NumOps - 1);
+ SDValue VL = Op.getOperand(NumOps - (1 + OpOffset));
SDValue I32VL =
DAG.getNode(ISD::SHL, DL, XLenVT, VL, DAG.getConstant(1, DL, XLenVT));
@@ -3699,7 +4101,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return Vec;
// Apply mask after the operation.
- SDValue Mask = Op.getOperand(NumOps - 2);
+ SDValue Mask = Op.getOperand(NumOps - 3);
SDValue MaskedOff = Op.getOperand(1);
return DAG.getNode(RISCVISD::VSELECT_VL, DL, VT, Mask, Vec, MaskedOff, VL);
}
@@ -3710,9 +4112,113 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
+ unsigned IntNo = Op.getConstantOperandVal(1);
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::riscv_masked_strided_load: {
+ SDLoc DL(Op);
+ MVT XLenVT = Subtarget.getXLenVT();
+
+ // If the mask is known to be all ones, optimize to an unmasked intrinsic;
+ // the selection of the masked intrinsics doesn't do this for us.
+ SDValue Mask = Op.getOperand(5);
+ bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
+
+ MVT VT = Op->getSimpleValueType(0);
+ MVT ContainerVT = getContainerForFixedLengthVector(VT);
+
+ SDValue PassThru = Op.getOperand(2);
+ if (!IsUnmasked) {
+ MVT MaskVT =
+ MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
+ Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
+ PassThru = convertToScalableVector(ContainerVT, PassThru, DAG, Subtarget);
+ }
+
+ SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
+
+ SDValue IntID = DAG.getTargetConstant(
+ IsUnmasked ? Intrinsic::riscv_vlse : Intrinsic::riscv_vlse_mask, DL,
+ XLenVT);
+
+ auto *Load = cast<MemIntrinsicSDNode>(Op);
+ SmallVector<SDValue, 8> Ops{Load->getChain(), IntID};
+ if (!IsUnmasked)
+ Ops.push_back(PassThru);
+ Ops.push_back(Op.getOperand(3)); // Ptr
+ Ops.push_back(Op.getOperand(4)); // Stride
+ if (!IsUnmasked)
+ Ops.push_back(Mask);
+ Ops.push_back(VL);
+ if (!IsUnmasked) {
+ SDValue Policy = DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT);
+ Ops.push_back(Policy);
+ }
+
+ SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
+ SDValue Result =
+ DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
+ Load->getMemoryVT(), Load->getMemOperand());
+ SDValue Chain = Result.getValue(1);
+ Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
+ return DAG.getMergeValues({Result, Chain}, DL);
+ }
+ }
+
return lowerVectorIntrinsicSplats(Op, DAG, Subtarget);
}
+SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned IntNo = Op.getConstantOperandVal(1);
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::riscv_masked_strided_store: {
+ SDLoc DL(Op);
+ MVT XLenVT = Subtarget.getXLenVT();
+
+ // If the mask is known to be all ones, optimize to an unmasked intrinsic;
+ // the selection of the masked intrinsics doesn't do this for us.
+ SDValue Mask = Op.getOperand(5);
+ bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
+
+ SDValue Val = Op.getOperand(2);
+ MVT VT = Val.getSimpleValueType();
+ MVT ContainerVT = getContainerForFixedLengthVector(VT);
+
+ Val = convertToScalableVector(ContainerVT, Val, DAG, Subtarget);
+ if (!IsUnmasked) {
+ MVT MaskVT =
+ MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
+ Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
+ }
+
+ SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
+
+ SDValue IntID = DAG.getTargetConstant(
+ IsUnmasked ? Intrinsic::riscv_vsse : Intrinsic::riscv_vsse_mask, DL,
+ XLenVT);
+
+ auto *Store = cast<MemIntrinsicSDNode>(Op);
+ SmallVector<SDValue, 8> Ops{Store->getChain(), IntID};
+ Ops.push_back(Val);
+ Ops.push_back(Op.getOperand(3)); // Ptr
+ Ops.push_back(Op.getOperand(4)); // Stride
+ if (!IsUnmasked)
+ Ops.push_back(Mask);
+ Ops.push_back(VL);
+
+ return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, Store->getVTList(),
+ Ops, Store->getMemoryVT(),
+ Store->getMemOperand());
+ }
+ }
+
+ return SDValue();
+}
+
static MVT getLMUL1VT(MVT VT) {
assert(VT.getVectorElementType().getSizeInBits() <= 64 &&
"Unexpected vector MVT");
@@ -3744,14 +4250,18 @@ static unsigned getRVVReductionOp(unsigned ISDOpcode) {
}
}
-SDValue RISCVTargetLowering::lowerVectorMaskVECREDUCE(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue RISCVTargetLowering::lowerVectorMaskVecReduction(SDValue Op,
+ SelectionDAG &DAG,
+ bool IsVP) const {
SDLoc DL(Op);
- SDValue Vec = Op.getOperand(0);
+ SDValue Vec = Op.getOperand(IsVP ? 1 : 0);
MVT VecVT = Vec.getSimpleValueType();
assert((Op.getOpcode() == ISD::VECREDUCE_AND ||
Op.getOpcode() == ISD::VECREDUCE_OR ||
- Op.getOpcode() == ISD::VECREDUCE_XOR) &&
+ Op.getOpcode() == ISD::VECREDUCE_XOR ||
+ Op.getOpcode() == ISD::VP_REDUCE_AND ||
+ Op.getOpcode() == ISD::VP_REDUCE_OR ||
+ Op.getOpcode() == ISD::VP_REDUCE_XOR) &&
"Unexpected reduction lowering");
MVT XLenVT = Subtarget.getXLenVT();
@@ -3765,29 +4275,62 @@ SDValue RISCVTargetLowering::lowerVectorMaskVECREDUCE(SDValue Op,
}
SDValue Mask, VL;
- std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
+ if (IsVP) {
+ Mask = Op.getOperand(2);
+ VL = Op.getOperand(3);
+ } else {
+ std::tie(Mask, VL) =
+ getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
+ }
+
+ unsigned BaseOpc;
+ ISD::CondCode CC;
SDValue Zero = DAG.getConstant(0, DL, XLenVT);
switch (Op.getOpcode()) {
default:
llvm_unreachable("Unhandled reduction");
case ISD::VECREDUCE_AND:
- // vpopc ~x == 0
- Vec = DAG.getNode(RISCVISD::VMXOR_VL, DL, ContainerVT, Vec, Mask, VL);
- Vec = DAG.getNode(RISCVISD::VPOPC_VL, DL, XLenVT, Vec, Mask, VL);
- return DAG.getSetCC(DL, XLenVT, Vec, Zero, ISD::SETEQ);
+ case ISD::VP_REDUCE_AND: {
+ // vcpop ~x == 0
+ SDValue TrueMask = DAG.getNode(RISCVISD::VMSET_VL, DL, ContainerVT, VL);
+ Vec = DAG.getNode(RISCVISD::VMXOR_VL, DL, ContainerVT, Vec, TrueMask, VL);
+ Vec = DAG.getNode(RISCVISD::VCPOP_VL, DL, XLenVT, Vec, Mask, VL);
+ CC = ISD::SETEQ;
+ BaseOpc = ISD::AND;
+ break;
+ }
case ISD::VECREDUCE_OR:
- // vpopc x != 0
- Vec = DAG.getNode(RISCVISD::VPOPC_VL, DL, XLenVT, Vec, Mask, VL);
- return DAG.getSetCC(DL, XLenVT, Vec, Zero, ISD::SETNE);
- case ISD::VECREDUCE_XOR: {
- // ((vpopc x) & 1) != 0
+ case ISD::VP_REDUCE_OR:
+ // vcpop x != 0
+ Vec = DAG.getNode(RISCVISD::VCPOP_VL, DL, XLenVT, Vec, Mask, VL);
+ CC = ISD::SETNE;
+ BaseOpc = ISD::OR;
+ break;
+ case ISD::VECREDUCE_XOR:
+ case ISD::VP_REDUCE_XOR: {
+ // ((vcpop x) & 1) != 0
SDValue One = DAG.getConstant(1, DL, XLenVT);
- Vec = DAG.getNode(RISCVISD::VPOPC_VL, DL, XLenVT, Vec, Mask, VL);
+ Vec = DAG.getNode(RISCVISD::VCPOP_VL, DL, XLenVT, Vec, Mask, VL);
Vec = DAG.getNode(ISD::AND, DL, XLenVT, Vec, One);
- return DAG.getSetCC(DL, XLenVT, Vec, Zero, ISD::SETNE);
+ CC = ISD::SETNE;
+ BaseOpc = ISD::XOR;
+ break;
}
}
+
+ SDValue SetCC = DAG.getSetCC(DL, XLenVT, Vec, Zero, CC);
+
+ if (!IsVP)
+ return SetCC;
+
+ // Now include the start value in the operation.
+ // Note that we must return the start value when no elements are operated
+ // upon. The vcpop instructions we've emitted in each case above will return
+ // 0 for an inactive vector, and so we've already received the neutral value:
+ // AND gives us (0 == 0) -> 1 and OR/XOR give us (0 != 0) -> 0. Therefore we
+ // can simply include the start value.
+ return DAG.getNode(BaseOpc, DL, XLenVT, SetCC, Op.getOperand(0));
}
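In effect the VP forms compute the following (illustrative pseudo-formulas; vcpop(m, v, evl) stands for the masked population count emitted above):

//   vp.reduce.and(start, x, m, evl) -> start & (vcpop(m, ~x, evl) == 0)
//   vp.reduce.or (start, x, m, evl) -> start | (vcpop(m,  x, evl) != 0)
//   vp.reduce.xor(start, x, m, evl) -> start ^ ((vcpop(m, x, evl) & 1) != 0)
// With evl == 0 every count is 0 and each formula collapses to the start
// value, which is exactly the behaviour the comment above requires.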
SDValue RISCVTargetLowering::lowerVECREDUCE(SDValue Op,
@@ -3833,8 +4376,8 @@ SDValue RISCVTargetLowering::lowerVECREDUCE(SDValue Op,
SDValue NeutralElem =
DAG.getNeutralElement(BaseOpc, DL, VecEltVT, SDNodeFlags());
SDValue IdentitySplat = DAG.getSplatVector(M1VT, DL, NeutralElem);
- SDValue Reduction =
- DAG.getNode(RVVOpcode, DL, M1VT, Vec, IdentitySplat, Mask, VL);
+ SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, DAG.getUNDEF(M1VT), Vec,
+ IdentitySplat, Mask, VL);
SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Reduction,
DAG.getConstant(0, DL, Subtarget.getXLenVT()));
return DAG.getSExtOrTrunc(Elt0, DL, Op.getValueType());
@@ -3892,12 +4435,83 @@ SDValue RISCVTargetLowering::lowerFPVECREDUCE(SDValue Op,
// FIXME: This is a VLMAX splat which might be too large and can prevent
// vsetvli removal.
SDValue ScalarSplat = DAG.getSplatVector(M1VT, DL, ScalarVal);
- SDValue Reduction =
- DAG.getNode(RVVOpcode, DL, M1VT, VectorVal, ScalarSplat, Mask, VL);
+ SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, DAG.getUNDEF(M1VT),
+ VectorVal, ScalarSplat, Mask, VL);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Reduction,
DAG.getConstant(0, DL, Subtarget.getXLenVT()));
}
+static unsigned getRVVVPReductionOp(unsigned ISDOpcode) {
+ switch (ISDOpcode) {
+ default:
+ llvm_unreachable("Unhandled reduction");
+ case ISD::VP_REDUCE_ADD:
+ return RISCVISD::VECREDUCE_ADD_VL;
+ case ISD::VP_REDUCE_UMAX:
+ return RISCVISD::VECREDUCE_UMAX_VL;
+ case ISD::VP_REDUCE_SMAX:
+ return RISCVISD::VECREDUCE_SMAX_VL;
+ case ISD::VP_REDUCE_UMIN:
+ return RISCVISD::VECREDUCE_UMIN_VL;
+ case ISD::VP_REDUCE_SMIN:
+ return RISCVISD::VECREDUCE_SMIN_VL;
+ case ISD::VP_REDUCE_AND:
+ return RISCVISD::VECREDUCE_AND_VL;
+ case ISD::VP_REDUCE_OR:
+ return RISCVISD::VECREDUCE_OR_VL;
+ case ISD::VP_REDUCE_XOR:
+ return RISCVISD::VECREDUCE_XOR_VL;
+ case ISD::VP_REDUCE_FADD:
+ return RISCVISD::VECREDUCE_FADD_VL;
+ case ISD::VP_REDUCE_SEQ_FADD:
+ return RISCVISD::VECREDUCE_SEQ_FADD_VL;
+ case ISD::VP_REDUCE_FMAX:
+ return RISCVISD::VECREDUCE_FMAX_VL;
+ case ISD::VP_REDUCE_FMIN:
+ return RISCVISD::VECREDUCE_FMIN_VL;
+ }
+}
+
+SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Vec = Op.getOperand(1);
+ EVT VecEVT = Vec.getValueType();
+
+ // TODO: The type may need to be widened rather than split. Or widened before
+ // it can be split.
+ if (!isTypeLegal(VecEVT))
+ return SDValue();
+
+ MVT VecVT = VecEVT.getSimpleVT();
+ MVT VecEltVT = VecVT.getVectorElementType();
+ unsigned RVVOpcode = getRVVVPReductionOp(Op.getOpcode());
+
+ MVT ContainerVT = VecVT;
+ if (VecVT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VecVT);
+ Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+ }
+
+ SDValue VL = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(2);
+
+ MVT M1VT = getLMUL1VT(ContainerVT);
+ MVT XLenVT = Subtarget.getXLenVT();
+ MVT ResVT = !VecVT.isInteger() || VecEltVT.bitsGE(XLenVT) ? VecEltVT : XLenVT;
+
+ // FIXME: This is a VLMAX splat which might be too large and can prevent
+ // vsetvli removal.
+ SDValue StartSplat = DAG.getSplatVector(M1VT, DL, Op.getOperand(0));
+ SDValue Reduction =
+ DAG.getNode(RVVOpcode, DL, M1VT, StartSplat, Vec, StartSplat, Mask, VL);
+ SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Reduction,
+ DAG.getConstant(0, DL, Subtarget.getXLenVT()));
+ if (!VecVT.isInteger())
+ return Elt0;
+ return DAG.getSExtOrTrunc(Elt0, DL, Op.getValueType());
+}
+
SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDValue Vec = Op.getOperand(0);
@@ -4338,36 +4952,63 @@ RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op,
Store->getMemoryVT(), Store->getMemOperand());
}
-SDValue RISCVTargetLowering::lowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
- auto *Load = cast<MaskedLoadSDNode>(Op);
-
+SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op,
+ SelectionDAG &DAG) const {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
- MVT XLenVT = Subtarget.getXLenVT();
- SDValue Mask = Load->getMask();
- SDValue PassThru = Load->getPassThru();
- SDValue VL;
+ const auto *MemSD = cast<MemSDNode>(Op);
+ EVT MemVT = MemSD->getMemoryVT();
+ MachineMemOperand *MMO = MemSD->getMemOperand();
+ SDValue Chain = MemSD->getChain();
+ SDValue BasePtr = MemSD->getBasePtr();
+
+ SDValue Mask, PassThru, VL;
+ if (const auto *VPLoad = dyn_cast<VPLoadSDNode>(Op)) {
+ Mask = VPLoad->getMask();
+ PassThru = DAG.getUNDEF(VT);
+ VL = VPLoad->getVectorLength();
+ } else {
+ const auto *MLoad = cast<MaskedLoadSDNode>(Op);
+ Mask = MLoad->getMask();
+ PassThru = MLoad->getPassThru();
+ }
+
+ bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
+
+ MVT XLenVT = Subtarget.getXLenVT();
MVT ContainerVT = VT;
if (VT.isFixedLengthVector()) {
ContainerVT = getContainerForFixedLengthVector(VT);
- MVT MaskVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
-
- Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
PassThru = convertToScalableVector(ContainerVT, PassThru, DAG, Subtarget);
- VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
- } else
- VL = DAG.getRegister(RISCV::X0, XLenVT);
+ if (!IsUnmasked) {
+ MVT MaskVT =
+ MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
+ Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
+ }
+ }
+
+ if (!VL)
+ VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
+
+ unsigned IntID =
+ IsUnmasked ? Intrinsic::riscv_vle : Intrinsic::riscv_vle_mask;
+ SmallVector<SDValue, 8> Ops{Chain, DAG.getTargetConstant(IntID, DL, XLenVT)};
+ if (!IsUnmasked)
+ Ops.push_back(PassThru);
+ Ops.push_back(BasePtr);
+ if (!IsUnmasked)
+ Ops.push_back(Mask);
+ Ops.push_back(VL);
+ if (!IsUnmasked)
+ Ops.push_back(DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT));
SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
- SDValue IntID = DAG.getTargetConstant(Intrinsic::riscv_vle_mask, DL, XLenVT);
- SDValue Ops[] = {Load->getChain(), IntID, PassThru,
- Load->getBasePtr(), Mask, VL};
+
SDValue Result =
- DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
- Load->getMemoryVT(), Load->getMemOperand());
- SDValue Chain = Result.getValue(1);
+ DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, MemVT, MMO);
+ Chain = Result.getValue(1);
if (VT.isFixedLengthVector())
Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
@@ -4375,32 +5016,58 @@ SDValue RISCVTargetLowering::lowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMergeValues({Result, Chain}, DL);
}
-SDValue RISCVTargetLowering::lowerMSTORE(SDValue Op, SelectionDAG &DAG) const {
- auto *Store = cast<MaskedStoreSDNode>(Op);
-
+SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op,
+ SelectionDAG &DAG) const {
SDLoc DL(Op);
- SDValue Val = Store->getValue();
- SDValue Mask = Store->getMask();
+
+ const auto *MemSD = cast<MemSDNode>(Op);
+ EVT MemVT = MemSD->getMemoryVT();
+ MachineMemOperand *MMO = MemSD->getMemOperand();
+ SDValue Chain = MemSD->getChain();
+ SDValue BasePtr = MemSD->getBasePtr();
+ SDValue Val, Mask, VL;
+
+ if (const auto *VPStore = dyn_cast<VPStoreSDNode>(Op)) {
+ Val = VPStore->getValue();
+ Mask = VPStore->getMask();
+ VL = VPStore->getVectorLength();
+ } else {
+ const auto *MStore = cast<MaskedStoreSDNode>(Op);
+ Val = MStore->getValue();
+ Mask = MStore->getMask();
+ }
+
+ bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
+
MVT VT = Val.getSimpleValueType();
MVT XLenVT = Subtarget.getXLenVT();
- SDValue VL;
MVT ContainerVT = VT;
if (VT.isFixedLengthVector()) {
ContainerVT = getContainerForFixedLengthVector(VT);
- MVT MaskVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
Val = convertToScalableVector(ContainerVT, Val, DAG, Subtarget);
- Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
- VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
- } else
- VL = DAG.getRegister(RISCV::X0, XLenVT);
+ if (!IsUnmasked) {
+ MVT MaskVT =
+ MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
+ Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
+ }
+ }
- SDValue IntID = DAG.getTargetConstant(Intrinsic::riscv_vse_mask, DL, XLenVT);
- return DAG.getMemIntrinsicNode(
- ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other),
- {Store->getChain(), IntID, Val, Store->getBasePtr(), Mask, VL},
- Store->getMemoryVT(), Store->getMemOperand());
+ if (!VL)
+ VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
+
+ unsigned IntID =
+ IsUnmasked ? Intrinsic::riscv_vse : Intrinsic::riscv_vse_mask;
+ SmallVector<SDValue, 8> Ops{Chain, DAG.getTargetConstant(IntID, DL, XLenVT)};
+ Ops.push_back(Val);
+ Ops.push_back(BasePtr);
+ if (!IsUnmasked)
+ Ops.push_back(Mask);
+ Ops.push_back(VL);
+
+ return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL,
+ DAG.getVTList(MVT::Other), Ops, MemVT, MMO);
}
SDValue
@@ -4596,36 +5263,57 @@ SDValue RISCVTargetLowering::lowerVPOp(SDValue Op, SelectionDAG &DAG,
return convertFromScalableVector(VT, VPOp, DAG, Subtarget);
}
-// Custom lower MGATHER to a legalized form for RVV. It will then be matched to
-// a RVV indexed load. The RVV indexed load instructions only support the
-// "unsigned unscaled" addressing mode; indices are implicitly zero-extended or
-// truncated to XLEN and are treated as byte offsets. Any signed or scaled
-// indexing is extended to the XLEN value type and scaled accordingly.
-SDValue RISCVTargetLowering::lowerMGATHER(SDValue Op, SelectionDAG &DAG) const {
- auto *MGN = cast<MaskedGatherSDNode>(Op.getNode());
+// Custom lower MGATHER/VP_GATHER to a legalized form for RVV. It will then be
+// matched to a RVV indexed load. The RVV indexed load instructions only
+// support the "unsigned unscaled" addressing mode; indices are implicitly
+// zero-extended or truncated to XLEN and are treated as byte offsets. Any
+// signed or scaled indexing is extended to the XLEN value type and scaled
+// accordingly.
+SDValue RISCVTargetLowering::lowerMaskedGather(SDValue Op,
+ SelectionDAG &DAG) const {
SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
- SDValue Index = MGN->getIndex();
- SDValue Mask = MGN->getMask();
- SDValue PassThru = MGN->getPassThru();
+ const auto *MemSD = cast<MemSDNode>(Op.getNode());
+ EVT MemVT = MemSD->getMemoryVT();
+ MachineMemOperand *MMO = MemSD->getMemOperand();
+ SDValue Chain = MemSD->getChain();
+ SDValue BasePtr = MemSD->getBasePtr();
+
+ ISD::LoadExtType LoadExtType;
+ SDValue Index, Mask, PassThru, VL;
+
+ if (auto *VPGN = dyn_cast<VPGatherSDNode>(Op.getNode())) {
+ Index = VPGN->getIndex();
+ Mask = VPGN->getMask();
+ PassThru = DAG.getUNDEF(VT);
+ VL = VPGN->getVectorLength();
+ // VP doesn't support extending loads.
+ LoadExtType = ISD::NON_EXTLOAD;
+ } else {
+ // Else it must be a MGATHER.
+ auto *MGN = cast<MaskedGatherSDNode>(Op.getNode());
+ Index = MGN->getIndex();
+ Mask = MGN->getMask();
+ PassThru = MGN->getPassThru();
+ LoadExtType = MGN->getExtensionType();
+ }
- MVT VT = Op.getSimpleValueType();
MVT IndexVT = Index.getSimpleValueType();
MVT XLenVT = Subtarget.getXLenVT();
assert(VT.getVectorElementCount() == IndexVT.getVectorElementCount() &&
"Unexpected VTs!");
- assert(MGN->getBasePtr().getSimpleValueType() == XLenVT &&
- "Unexpected pointer type");
+ assert(BasePtr.getSimpleValueType() == XLenVT && "Unexpected pointer type");
// Targets have to explicitly opt-in for extending vector loads.
- assert(MGN->getExtensionType() == ISD::NON_EXTLOAD &&
- "Unexpected extending MGATHER");
+ assert(LoadExtType == ISD::NON_EXTLOAD &&
+ "Unexpected extending MGATHER/VP_GATHER");
+ (void)LoadExtType;
// If the mask is known to be all ones, optimize to an unmasked intrinsic;
// the selection of the masked intrinsics doesn't do this for us.
bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
- SDValue VL;
MVT ContainerVT = VT;
if (VT.isFixedLengthVector()) {
// We need to use the larger of the result and index type to determine the
@@ -4648,28 +5336,28 @@ SDValue RISCVTargetLowering::lowerMGATHER(SDValue Op, SelectionDAG &DAG) const {
Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
PassThru = convertToScalableVector(ContainerVT, PassThru, DAG, Subtarget);
}
+ }
- VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
- } else
- VL = DAG.getRegister(RISCV::X0, XLenVT);
+ if (!VL)
+ VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
unsigned IntID =
IsUnmasked ? Intrinsic::riscv_vluxei : Intrinsic::riscv_vluxei_mask;
- SmallVector<SDValue, 8> Ops{MGN->getChain(),
- DAG.getTargetConstant(IntID, DL, XLenVT)};
+ SmallVector<SDValue, 8> Ops{Chain, DAG.getTargetConstant(IntID, DL, XLenVT)};
if (!IsUnmasked)
Ops.push_back(PassThru);
- Ops.push_back(MGN->getBasePtr());
+ Ops.push_back(BasePtr);
Ops.push_back(Index);
if (!IsUnmasked)
Ops.push_back(Mask);
Ops.push_back(VL);
+ if (!IsUnmasked)
+ Ops.push_back(DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT));
SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
SDValue Result =
- DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
- MGN->getMemoryVT(), MGN->getMemOperand());
- SDValue Chain = Result.getValue(1);
+ DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, MemVT, MMO);
+ Chain = Result.getValue(1);
if (VT.isFixedLengthVector())
Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
@@ -4677,18 +5365,39 @@ SDValue RISCVTargetLowering::lowerMGATHER(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMergeValues({Result, Chain}, DL);
}
-// Custom lower MSCATTER to a legalized form for RVV. It will then be matched to
-// a RVV indexed store. The RVV indexed store instructions only support the
-// "unsigned unscaled" addressing mode; indices are implicitly zero-extended or
-// truncated to XLEN and are treated as byte offsets. Any signed or scaled
-// indexing is extended to the XLEN value type and scaled accordingly.
-SDValue RISCVTargetLowering::lowerMSCATTER(SDValue Op,
- SelectionDAG &DAG) const {
- auto *MSN = cast<MaskedScatterSDNode>(Op.getNode());
+// Custom lower MSCATTER/VP_SCATTER to a legalized form for RVV. It will then be
+// matched to a RVV indexed store. The RVV indexed store instructions only
+// support the "unsigned unscaled" addressing mode; indices are implicitly
+// zero-extended or truncated to XLEN and are treated as byte offsets. Any
+// signed or scaled indexing is extended to the XLEN value type and scaled
+// accordingly.
+SDValue RISCVTargetLowering::lowerMaskedScatter(SDValue Op,
+ SelectionDAG &DAG) const {
SDLoc DL(Op);
- SDValue Index = MSN->getIndex();
- SDValue Mask = MSN->getMask();
- SDValue Val = MSN->getValue();
+ const auto *MemSD = cast<MemSDNode>(Op.getNode());
+ EVT MemVT = MemSD->getMemoryVT();
+ MachineMemOperand *MMO = MemSD->getMemOperand();
+ SDValue Chain = MemSD->getChain();
+ SDValue BasePtr = MemSD->getBasePtr();
+
+ bool IsTruncatingStore = false;
+ SDValue Index, Mask, Val, VL;
+
+ if (auto *VPSN = dyn_cast<VPScatterSDNode>(Op.getNode())) {
+ Index = VPSN->getIndex();
+ Mask = VPSN->getMask();
+ Val = VPSN->getValue();
+ VL = VPSN->getVectorLength();
+ // VP doesn't support truncating stores.
+ IsTruncatingStore = false;
+ } else {
+ // Else it must be a MSCATTER.
+ auto *MSN = cast<MaskedScatterSDNode>(Op.getNode());
+ Index = MSN->getIndex();
+ Mask = MSN->getMask();
+ Val = MSN->getValue();
+ IsTruncatingStore = MSN->isTruncatingStore();
+ }
MVT VT = Val.getSimpleValueType();
MVT IndexVT = Index.getSimpleValueType();
@@ -4696,21 +5405,20 @@ SDValue RISCVTargetLowering::lowerMSCATTER(SDValue Op,
assert(VT.getVectorElementCount() == IndexVT.getVectorElementCount() &&
"Unexpected VTs!");
- assert(MSN->getBasePtr().getSimpleValueType() == XLenVT &&
- "Unexpected pointer type");
+ assert(BasePtr.getSimpleValueType() == XLenVT && "Unexpected pointer type");
// Targets have to explicitly opt-in for extending vector loads and
// truncating vector stores.
- assert(!MSN->isTruncatingStore() && "Unexpected extending MSCATTER");
+ assert(!IsTruncatingStore && "Unexpected truncating MSCATTER/VP_SCATTER");
+ (void)IsTruncatingStore;
// If the mask is known to be all ones, optimize to an unmasked intrinsic;
// the selection of the masked intrinsics doesn't do this for us.
bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
- SDValue VL;
+ MVT ContainerVT = VT;
if (VT.isFixedLengthVector()) {
// We need to use the larger of the value and index type to determine the
// scalable type to use so we don't increase LMUL for any operand/result.
- MVT ContainerVT;
if (VT.bitsGE(IndexVT)) {
ContainerVT = getContainerForFixedLengthVector(VT);
IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(),
@@ -4729,24 +5437,23 @@ SDValue RISCVTargetLowering::lowerMSCATTER(SDValue Op,
MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
}
+ }
- VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
- } else
- VL = DAG.getRegister(RISCV::X0, XLenVT);
+ if (!VL)
+ VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
unsigned IntID =
IsUnmasked ? Intrinsic::riscv_vsoxei : Intrinsic::riscv_vsoxei_mask;
- SmallVector<SDValue, 8> Ops{MSN->getChain(),
- DAG.getTargetConstant(IntID, DL, XLenVT)};
+ SmallVector<SDValue, 8> Ops{Chain, DAG.getTargetConstant(IntID, DL, XLenVT)};
Ops.push_back(Val);
- Ops.push_back(MSN->getBasePtr());
+ Ops.push_back(BasePtr);
Ops.push_back(Index);
if (!IsUnmasked)
Ops.push_back(Mask);
Ops.push_back(VL);
- return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, MSN->getVTList(), Ops,
- MSN->getMemoryVT(), MSN->getMemOperand());
+ return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL,
+ DAG.getVTList(MVT::Other), Ops, MemVT, MMO);
}
SDValue RISCVTargetLowering::lowerGET_ROUNDING(SDValue Op,
@@ -4754,7 +5461,7 @@ SDValue RISCVTargetLowering::lowerGET_ROUNDING(SDValue Op,
const MVT XLenVT = Subtarget.getXLenVT();
SDLoc DL(Op);
SDValue Chain = Op->getOperand(0);
- SDValue SysRegNo = DAG.getConstant(
+ SDValue SysRegNo = DAG.getTargetConstant(
RISCVSysReg::lookupSysRegByName("FRM")->Encoding, DL, XLenVT);
SDVTList VTs = DAG.getVTList(XLenVT, MVT::Other);
SDValue RM = DAG.getNode(RISCVISD::READ_CSR, DL, VTs, Chain, SysRegNo);
@@ -4786,7 +5493,7 @@ SDValue RISCVTargetLowering::lowerSET_ROUNDING(SDValue Op,
SDLoc DL(Op);
SDValue Chain = Op->getOperand(0);
SDValue RMValue = Op->getOperand(1);
- SDValue SysRegNo = DAG.getConstant(
+ SDValue SysRegNo = DAG.getTargetConstant(
RISCVSysReg::lookupSysRegByName("FRM")->Encoding, DL, XLenVT);
// Encoding used for rounding mode in RISCV differs from that used in
@@ -4891,7 +5598,8 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
return;
if (!isTypeLegal(Op0.getValueType()))
return;
- unsigned Opc = IsSigned ? RISCVISD::FCVT_W_RV64 : RISCVISD::FCVT_WU_RV64;
+ unsigned Opc =
+ IsSigned ? RISCVISD::FCVT_W_RTZ_RV64 : RISCVISD::FCVT_WU_RTZ_RV64;
SDValue Res = DAG.getNode(Opc, DL, MVT::i64, Op0);
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
return;
@@ -4973,8 +5681,6 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::SUB:
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
- if (N->getOperand(1).getOpcode() == ISD::Constant)
- return;
Results.push_back(customLegalizeToWOpWithSExt(N, DAG));
break;
case ISD::SHL:
@@ -4982,9 +5688,26 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::SRL:
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
- if (N->getOperand(1).getOpcode() == ISD::Constant)
- return;
- Results.push_back(customLegalizeToWOp(N, DAG));
+ if (N->getOperand(1).getOpcode() != ISD::Constant) {
+ Results.push_back(customLegalizeToWOp(N, DAG));
+ break;
+ }
+
+ // Custom legalize ISD::SHL by placing a SIGN_EXTEND_INREG after. This is
+ // similar to customLegalizeToWOpWithSExt, but we must zero_extend the
+ // shift amount.
+ if (N->getOpcode() == ISD::SHL) {
+ SDLoc DL(N);
+ SDValue NewOp0 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
+ SDValue NewOp1 =
+ DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1));
+ SDValue NewWOp = DAG.getNode(ISD::SHL, DL, MVT::i64, NewOp0, NewOp1);
+ SDValue NewRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, NewWOp,
+ DAG.getValueType(MVT::i32));
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes));
+ }
+
break;
case ISD::ROTL:
case ISD::ROTR:
@@ -5098,10 +5821,12 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
// scalar types in order to improve codegen. Bitcast the vector to a
// one-element vector type whose element type is the same as the result
// type, and extract the first element.
- LLVMContext &Context = *DAG.getContext();
- SDValue BVec = DAG.getBitcast(EVT::getVectorVT(Context, VT, 1), Op0);
- Results.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, BVec,
- DAG.getConstant(0, DL, XLenVT)));
+ EVT BVT = EVT::getVectorVT(*DAG.getContext(), VT, 1);
+ if (isTypeLegal(BVT)) {
+ SDValue BVec = DAG.getBitcast(BVT, Op0);
+ Results.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, BVec,
+ DAG.getConstant(0, DL, XLenVT)));
+ }
}
break;
}
@@ -5211,7 +5936,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
MVT XLenVT = Subtarget.getXLenVT();
// Use a VL of 1 to avoid processing more elements than we need.
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VecVT.getVectorElementCount());
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
SDValue VL = DAG.getConstant(1, DL, XLenVT);
SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
@@ -5354,6 +6079,17 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
if (SDValue V = lowerVECREDUCE(SDValue(N, 0), DAG))
Results.push_back(V);
break;
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_SMIN:
+ case ISD::VP_REDUCE_UMIN:
+ if (SDValue V = lowerVPREDUCE(SDValue(N, 0), DAG))
+ Results.push_back(V);
+ break;
case ISD::FLT_ROUNDS_: {
SDVTList VTs = DAG.getVTList(Subtarget.getXLenVT(), MVT::Other);
SDValue Res = DAG.getNode(ISD::FLT_ROUNDS_, DL, VTs, N->getOperand(0));
@@ -5656,6 +6392,52 @@ static SDValue combineORToSHFL(SDValue Op, SelectionDAG &DAG,
DAG.getConstant(Match1->ShAmt, DL, VT));
}
+// Optimize (add (shl x, c0), (shl y, c1)) ->
+// (SLLI (SH*ADD x, y), c0), if c1-c0 is 1, 2, or 3.
+static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ // Perform this optimization only in the zba extension.
+ if (!Subtarget.hasStdExtZba())
+ return SDValue();
+
+ // Skip for vector types and larger types.
+ EVT VT = N->getValueType(0);
+ if (VT.isVector() || VT.getSizeInBits() > Subtarget.getXLen())
+ return SDValue();
+
+ // The two operand nodes must be SHL and have no other use.
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ if (N0->getOpcode() != ISD::SHL || N1->getOpcode() != ISD::SHL ||
+ !N0->hasOneUse() || !N1->hasOneUse())
+ return SDValue();
+
+ // Check c0 and c1.
+ auto *N0C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ auto *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(1));
+ if (!N0C || !N1C)
+ return SDValue();
+ int64_t C0 = N0C->getSExtValue();
+ int64_t C1 = N1C->getSExtValue();
+ if (C0 <= 0 || C1 <= 0)
+ return SDValue();
+
+ // Skip if SH1ADD/SH2ADD/SH3ADD are not applicable.
+ int64_t Bits = std::min(C0, C1);
+ int64_t Diff = std::abs(C0 - C1);
+ if (Diff != 1 && Diff != 2 && Diff != 3)
+ return SDValue();
+
+ // Build nodes.
+ SDLoc DL(N);
+ SDValue NS = (C0 < C1) ? N0->getOperand(0) : N1->getOperand(0);
+ SDValue NL = (C0 > C1) ? N0->getOperand(0) : N1->getOperand(0);
+ SDValue NA0 =
+ DAG.getNode(ISD::SHL, DL, VT, NL, DAG.getConstant(Diff, DL, VT));
+ SDValue NA1 = DAG.getNode(ISD::ADD, DL, VT, NA0, NS);
+ return DAG.getNode(ISD::SHL, DL, VT, NA1, DAG.getConstant(Bits, DL, VT));
+}
+
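A worked instance (illustrative): with c0 = 5 and c1 = 6, so Diff = 1 and Bits = 5:

//   (add (shl x, 5), (shl y, 6)) -> (shl (add (shl y, 1), x), 5)
// which selects to sh1add + slli, one instruction fewer than the
// slli + slli + add sequence it replaces.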
// Combine (GREVI (GREVI x, C2), C1) -> (GREVI x, C1^C2) when C1^C2 is
// non-zero, and to x when it is. Any repeated GREVI stage undoes itself.
// Combine (GORCI (GORCI x, C2), C1) -> (GORCI x, C1|C2). Repeated stage does
@@ -5691,17 +6473,27 @@ static SDValue combineGREVI_GORCI(SDNode *N, SelectionDAG &DAG) {
// Combine a constant select operand into its use:
//
-// (and (select_cc lhs, rhs, cc, -1, c), x)
-// -> (select_cc lhs, rhs, cc, x, (and, x, c)) [AllOnes=1]
-// (or (select_cc lhs, rhs, cc, 0, c), x)
-// -> (select_cc lhs, rhs, cc, x, (or, x, c)) [AllOnes=0]
-// (xor (select_cc lhs, rhs, cc, 0, c), x)
-// -> (select_cc lhs, rhs, cc, x, (xor, x, c)) [AllOnes=0]
-static SDValue combineSelectCCAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
- SelectionDAG &DAG, bool AllOnes) {
+// (and (select cond, -1, c), x)
+// -> (select cond, x, (and x, c)) [AllOnes=1]
+// (or (select cond, 0, c), x)
+// -> (select cond, x, (or x, c)) [AllOnes=0]
+// (xor (select cond, 0, c), x)
+// -> (select cond, x, (xor x, c)) [AllOnes=0]
+// (add (select cond, 0, c), x)
+// -> (select cond, x, (add x, c)) [AllOnes=0]
+// (sub x, (select cond, 0, c))
+// -> (select cond, x, (sub x, c)) [AllOnes=0]
+static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
+ SelectionDAG &DAG, bool AllOnes) {
EVT VT = N->getValueType(0);
- if (Slct.getOpcode() != RISCVISD::SELECT_CC || !Slct.hasOneUse())
+ // Skip vectors.
+ if (VT.isVector())
+ return SDValue();
+
+ if ((Slct.getOpcode() != ISD::SELECT &&
+ Slct.getOpcode() != RISCVISD::SELECT_CC) ||
+ !Slct.hasOneUse())
return SDValue();
auto isZeroOrAllOnes = [](SDValue N, bool AllOnes) {
@@ -5709,8 +6501,9 @@ static SDValue combineSelectCCAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
};
bool SwapSelectOps;
- SDValue TrueVal = Slct.getOperand(3);
- SDValue FalseVal = Slct.getOperand(4);
+ unsigned OpOffset = Slct.getOpcode() == RISCVISD::SELECT_CC ? 2 : 0;
+ SDValue TrueVal = Slct.getOperand(1 + OpOffset);
+ SDValue FalseVal = Slct.getOperand(2 + OpOffset);
SDValue NonConstantVal;
if (isZeroOrAllOnes(TrueVal, AllOnes)) {
SwapSelectOps = false;
@@ -5724,40 +6517,120 @@ static SDValue combineSelectCCAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
  // Slct is now known to be the desired identity constant when CC is true.
TrueVal = OtherOp;
FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, OtherOp, NonConstantVal);
- // Unless SwapSelectOps says CC should be false.
+ // Unless SwapSelectOps says the condition should be false.
if (SwapSelectOps)
std::swap(TrueVal, FalseVal);
- return DAG.getNode(RISCVISD::SELECT_CC, SDLoc(N), VT,
- {Slct.getOperand(0), Slct.getOperand(1),
- Slct.getOperand(2), TrueVal, FalseVal});
+ if (Slct.getOpcode() == RISCVISD::SELECT_CC)
+ return DAG.getNode(RISCVISD::SELECT_CC, SDLoc(N), VT,
+ {Slct.getOperand(0), Slct.getOperand(1),
+ Slct.getOperand(2), TrueVal, FalseVal});
+
+ return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
+ {Slct.getOperand(0), TrueVal, FalseVal});
}
// Attempt combineSelectAndUse on each operand of a commutative operator N.
-static SDValue combineSelectCCAndUseCommutative(SDNode *N, SelectionDAG &DAG,
- bool AllOnes) {
+static SDValue combineSelectAndUseCommutative(SDNode *N, SelectionDAG &DAG,
+ bool AllOnes) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- if (SDValue Result = combineSelectCCAndUse(N, N0, N1, DAG, AllOnes))
+ if (SDValue Result = combineSelectAndUse(N, N0, N1, DAG, AllOnes))
return Result;
- if (SDValue Result = combineSelectCCAndUse(N, N1, N0, DAG, AllOnes))
+ if (SDValue Result = combineSelectAndUse(N, N1, N0, DAG, AllOnes))
return Result;
return SDValue();
}
-static SDValue performANDCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
+// Transform (add (mul x, c0), c1) ->
+// (add (mul (add x, c1/c0), c0), c1%c0).
+// if c1/c0 and c1%c0 are simm12, while c1 is not. A special corner case
+// that should be excluded is when c0*(c1/c0) is simm12, which will lead
+// to an infinite loop in DAGCombine if transformed.
+// Or transform (add (mul x, c0), c1) ->
+// (add (mul (add x, c1/c0+1), c0), c1%c0-c0),
+// if c1/c0+1 and c1%c0-c0 are simm12, while c1 is not. A special corner
+// case that should be excluded is when c0*(c1/c0+1) is simm12, which will
+// lead to an infinite loop in DAGCombine if transformed.
+// Or transform (add (mul x, c0), c1) ->
+// (add (mul (add x, c1/c0-1), c0), c1%c0+c0),
+// if c1/c0-1 and c1%c0+c0 are simm12, while c1 is not. A special corner
+// case that should be excluded is when c0*(c1/c0-1) is simm12, which will
+// lead to an infinite loop in DAGCombine if transformed.
+// Or transform (add (mul x, c0), c1) ->
+// (mul (add x, c1/c0), c0).
+// if c1%c0 is zero, and c1/c0 is simm12 while c1 is not.
+static SDValue transformAddImmMulImm(SDNode *N, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ // Skip for vector types and larger types.
+ EVT VT = N->getValueType(0);
+ if (VT.isVector() || VT.getSizeInBits() > Subtarget.getXLen())
+ return SDValue();
+ // The first operand node must be a MUL and have no other uses.
+ SDValue N0 = N->getOperand(0);
+ if (!N0->hasOneUse() || N0->getOpcode() != ISD::MUL)
+ return SDValue();
+ // Check whether c0 and c1 satisfy the conditions described above.
+ auto *N0C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ auto *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!N0C || !N1C)
+ return SDValue();
+ int64_t C0 = N0C->getSExtValue();
+ int64_t C1 = N1C->getSExtValue();
+ int64_t CA, CB;
+ if (C0 == -1 || C0 == 0 || C0 == 1 || isInt<12>(C1))
+ return SDValue();
+ // Search for a suitable non-zero CA and a CB such that both are simm12.
+ if ((C1 / C0) != 0 && isInt<12>(C1 / C0) && isInt<12>(C1 % C0) &&
+ !isInt<12>(C0 * (C1 / C0))) {
+ CA = C1 / C0;
+ CB = C1 % C0;
+ } else if ((C1 / C0 + 1) != 0 && isInt<12>(C1 / C0 + 1) &&
+ isInt<12>(C1 % C0 - C0) && !isInt<12>(C0 * (C1 / C0 + 1))) {
+ CA = C1 / C0 + 1;
+ CB = C1 % C0 - C0;
+ } else if ((C1 / C0 - 1) != 0 && isInt<12>(C1 / C0 - 1) &&
+ isInt<12>(C1 % C0 + C0) && !isInt<12>(C0 * (C1 / C0 - 1))) {
+ CA = C1 / C0 - 1;
+ CB = C1 % C0 + C0;
+ } else
+ return SDValue();
+ // Build new nodes (add (mul (add x, c1/c0), c0), c1%c0).
+ SDLoc DL(N);
+ SDValue New0 = DAG.getNode(ISD::ADD, DL, VT, N0->getOperand(0),
+ DAG.getConstant(CA, DL, VT));
+ SDValue New1 =
+ DAG.getNode(ISD::MUL, DL, VT, New0, DAG.getConstant(C0, DL, VT));
+ return DAG.getNode(ISD::ADD, DL, VT, New1, DAG.getConstant(CB, DL, VT));
+}
+
+static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
- SelectionDAG &DAG = DCI.DAG;
+ if (SDValue V = transformAddImmMulImm(N, DAG, Subtarget))
+ return V;
+ if (SDValue V = transformAddShlImm(N, DAG, Subtarget))
+ return V;
+ // fold (add (select lhs, rhs, cc, 0, y), x) ->
+ // (select lhs, rhs, cc, x, (add x, y))
+ return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false);
+}
+
+static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG) {
+ // fold (sub x, (select lhs, rhs, cc, 0, y)) ->
+ // (select lhs, rhs, cc, x, (sub x, y))
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ return combineSelectAndUse(N, N1, N0, DAG, /*AllOnes*/ false);
+}
- // fold (and (select_cc lhs, rhs, cc, -1, y), x) ->
+static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG) {
+ // fold (and (select lhs, rhs, cc, -1, y), x) ->
// (select lhs, rhs, cc, x, (and x, y))
- return combineSelectCCAndUseCommutative(N, DAG, true);
+ return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ true);
}
-static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
- SelectionDAG &DAG = DCI.DAG;
if (Subtarget.hasStdExtZbp()) {
if (auto GREV = combineORToGREV(SDValue(N, 0), DAG, Subtarget))
return GREV;
@@ -5767,19 +6640,15 @@ static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return SHFL;
}
- // fold (or (select_cc lhs, rhs, cc, 0, y), x) ->
- // (select lhs, rhs, cc, x, (or x, y))
- return combineSelectCCAndUseCommutative(N, DAG, false);
+ // fold (or (select cond, 0, y), x) ->
+ // (select cond, x, (or x, y))
+ return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false);
}
-static SDValue performXORCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- const RISCVSubtarget &Subtarget) {
- SelectionDAG &DAG = DCI.DAG;
-
- // fold (xor (select_cc lhs, rhs, cc, 0, y), x) ->
- // (select lhs, rhs, cc, x, (xor x, y))
- return combineSelectCCAndUseCommutative(N, DAG, false);
+static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG) {
+ // fold (xor (select cond, 0, y), x) ->
+ // (select cond, x, (xor x, y))
+ return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false);
}
// Attempt to turn ANY_EXTEND into SIGN_EXTEND if the input to the ANY_EXTEND
@@ -5814,6 +6683,13 @@ static SDValue performANY_EXTENDCombine(SDNode *N,
break;
}
+ // Only handle cases where the result is used by a CopyToReg. That likely
+ // means the value is a liveout of the basic block. This helps prevent
+ // infinite combine loops like PR51206.
+ if (none_of(N->uses(),
+ [](SDNode *User) { return User->getOpcode() == ISD::CopyToReg; }))
+ return SDValue();
+
SmallVector<SDNode *, 4> SetCCs;
for (SDNode::use_iterator UI = Src.getNode()->use_begin(),
UE = Src.getNode()->use_end();
@@ -5859,10 +6735,105 @@ static SDValue performANY_EXTENDCombine(SDNode *N,
return SDValue(N, 0);
}
+// Try to form VWMUL or VWMULU.
+// FIXME: Support VWMULSU.
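+// Roughly, the pattern being matched is (writing the VL nodes informally):
+//   (mul_vl (vsext_vl a, mask, vl), (vsext_vl b, mask, vl), mask, vl)
+//     -> (vwmul_vl a, b, mask, vl)
+// and likewise a pair of vzext_vl operands becomes vwmulu_vl; a splatted
+// scalar with enough sign/zero bits can stand in for one of the extends.
+// This is only a sketch; the exact operand checks are performed below.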
+static SDValue combineMUL_VLToVWMUL(SDNode *N, SDValue Op0, SDValue Op1,
+ SelectionDAG &DAG) {
+ assert(N->getOpcode() == RISCVISD::MUL_VL && "Unexpected opcode");
+ bool IsSignExt = Op0.getOpcode() == RISCVISD::VSEXT_VL;
+ bool IsZeroExt = Op0.getOpcode() == RISCVISD::VZEXT_VL;
+ if ((!IsSignExt && !IsZeroExt) || !Op0.hasOneUse())
+ return SDValue();
+
+ SDValue Mask = N->getOperand(2);
+ SDValue VL = N->getOperand(3);
+
+ // Make sure the mask and VL match.
+ if (Op0.getOperand(1) != Mask || Op0.getOperand(2) != VL)
+ return SDValue();
+
+ MVT VT = N->getSimpleValueType(0);
+
+ // Determine the narrow size for a widening multiply.
+ unsigned NarrowSize = VT.getScalarSizeInBits() / 2;
+ MVT NarrowVT = MVT::getVectorVT(MVT::getIntegerVT(NarrowSize),
+ VT.getVectorElementCount());
+
+ SDLoc DL(N);
+
+ // See if the other operand is the same opcode.
+ if (Op0.getOpcode() == Op1.getOpcode()) {
+ if (!Op1.hasOneUse())
+ return SDValue();
+
+ // Make sure the mask and VL match.
+ if (Op1.getOperand(1) != Mask || Op1.getOperand(2) != VL)
+ return SDValue();
+
+ Op1 = Op1.getOperand(0);
+ } else if (Op1.getOpcode() == RISCVISD::VMV_V_X_VL) {
+ // The operand is a splat of a scalar.
+
+ // The VL must be the same.
+ if (Op1.getOperand(1) != VL)
+ return SDValue();
+
+ // Get the scalar value.
+ Op1 = Op1.getOperand(0);
+
+ // See if we have enough sign bits or zero bits in the scalar to use a
+ // widening multiply by splatting to a smaller element size.
+ unsigned EltBits = VT.getScalarSizeInBits();
+ unsigned ScalarBits = Op1.getValueSizeInBits();
+ // Make sure we're getting all element bits from the scalar register.
+ // FIXME: Support implicit sign extension of vmv.v.x?
+ if (ScalarBits < EltBits)
+ return SDValue();
+
+ if (IsSignExt) {
+ if (DAG.ComputeNumSignBits(Op1) <= (ScalarBits - NarrowSize))
+ return SDValue();
+ } else {
+ APInt Mask = APInt::getBitsSetFrom(ScalarBits, NarrowSize);
+ if (!DAG.MaskedValueIsZero(Op1, Mask))
+ return SDValue();
+ }
+
+ Op1 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, NarrowVT, Op1, VL);
+ } else
+ return SDValue();
+
+ Op0 = Op0.getOperand(0);
+
+ // Re-introduce narrower extends if needed.
+ unsigned ExtOpc = IsSignExt ? RISCVISD::VSEXT_VL : RISCVISD::VZEXT_VL;
+ if (Op0.getValueType() != NarrowVT)
+ Op0 = DAG.getNode(ExtOpc, DL, NarrowVT, Op0, Mask, VL);
+ if (Op1.getValueType() != NarrowVT)
+ Op1 = DAG.getNode(ExtOpc, DL, NarrowVT, Op1, Mask, VL);
+
+ unsigned WMulOpc = IsSignExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL;
+ return DAG.getNode(WMulOpc, DL, VT, Op0, Op1, Mask, VL);
+}
+
SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
+ // Helper to call SimplifyDemandedBits on an operand of N where only some low
+ // bits are demanded. N will be added to the Worklist if it was not deleted.
+ // Caller should return SDValue(N, 0) if this returns true.
+ auto SimplifyDemandedLowBitsHelper = [&](unsigned OpNo, unsigned LowBits) {
+ SDValue Op = N->getOperand(OpNo);
+ APInt Mask = APInt::getLowBitsSet(Op.getValueSizeInBits(), LowBits);
+ if (!SimplifyDemandedBits(Op, Mask, DCI))
+ return false;
+
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return true;
+ };
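+ // For instance, the ROLW/RORW case below uses
+ // SimplifyDemandedLowBitsHelper(1, 5) since only the low 5 bits of the
+ // rotate amount are read, which lets SimplifyDemandedBits strip redundant
+ // masking of that operand.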
+
switch (N->getOpcode()) {
default:
break;
@@ -5914,147 +6885,101 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case RISCVISD::ROLW:
case RISCVISD::RORW: {
// Only the lower 32 bits of LHS and lower 5 bits of RHS are read.
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- APInt LHSMask = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 32);
- APInt RHSMask = APInt::getLowBitsSet(RHS.getValueSizeInBits(), 5);
- if (SimplifyDemandedBits(N->getOperand(0), LHSMask, DCI) ||
- SimplifyDemandedBits(N->getOperand(1), RHSMask, DCI)) {
- if (N->getOpcode() != ISD::DELETED_NODE)
- DCI.AddToWorklist(N);
+ if (SimplifyDemandedLowBitsHelper(0, 32) ||
+ SimplifyDemandedLowBitsHelper(1, 5))
return SDValue(N, 0);
- }
break;
}
case RISCVISD::CLZW:
case RISCVISD::CTZW: {
// Only the lower 32 bits of the first operand are read
- SDValue Op0 = N->getOperand(0);
- APInt Mask = APInt::getLowBitsSet(Op0.getValueSizeInBits(), 32);
- if (SimplifyDemandedBits(Op0, Mask, DCI)) {
- if (N->getOpcode() != ISD::DELETED_NODE)
- DCI.AddToWorklist(N);
+ if (SimplifyDemandedLowBitsHelper(0, 32))
return SDValue(N, 0);
- }
break;
}
case RISCVISD::FSL:
case RISCVISD::FSR: {
// Only the lower log2(Bitwidth)+1 bits of the shift amount are read.
- SDValue ShAmt = N->getOperand(2);
- unsigned BitWidth = ShAmt.getValueSizeInBits();
+ unsigned BitWidth = N->getOperand(2).getValueSizeInBits();
assert(isPowerOf2_32(BitWidth) && "Unexpected bit width");
- APInt ShAmtMask(BitWidth, (BitWidth * 2) - 1);
- if (SimplifyDemandedBits(ShAmt, ShAmtMask, DCI)) {
- if (N->getOpcode() != ISD::DELETED_NODE)
- DCI.AddToWorklist(N);
+ if (SimplifyDemandedLowBitsHelper(2, Log2_32(BitWidth) + 1))
return SDValue(N, 0);
- }
break;
}
case RISCVISD::FSLW:
case RISCVISD::FSRW: {
// Only the lower 32 bits of Values and lower 6 bits of shift amount are
// read.
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
- SDValue ShAmt = N->getOperand(2);
- APInt OpMask = APInt::getLowBitsSet(Op0.getValueSizeInBits(), 32);
- APInt ShAmtMask = APInt::getLowBitsSet(ShAmt.getValueSizeInBits(), 6);
- if (SimplifyDemandedBits(Op0, OpMask, DCI) ||
- SimplifyDemandedBits(Op1, OpMask, DCI) ||
- SimplifyDemandedBits(ShAmt, ShAmtMask, DCI)) {
- if (N->getOpcode() != ISD::DELETED_NODE)
- DCI.AddToWorklist(N);
+ if (SimplifyDemandedLowBitsHelper(0, 32) ||
+ SimplifyDemandedLowBitsHelper(1, 32) ||
+ SimplifyDemandedLowBitsHelper(2, 6))
return SDValue(N, 0);
- }
break;
}
case RISCVISD::GREV:
case RISCVISD::GORC: {
// Only the lower log2(Bitwidth) bits of the shift amount are read.
- SDValue ShAmt = N->getOperand(1);
- unsigned BitWidth = ShAmt.getValueSizeInBits();
+ unsigned BitWidth = N->getOperand(1).getValueSizeInBits();
assert(isPowerOf2_32(BitWidth) && "Unexpected bit width");
- APInt ShAmtMask(BitWidth, BitWidth - 1);
- if (SimplifyDemandedBits(ShAmt, ShAmtMask, DCI)) {
- if (N->getOpcode() != ISD::DELETED_NODE)
- DCI.AddToWorklist(N);
+ if (SimplifyDemandedLowBitsHelper(1, Log2_32(BitWidth)))
return SDValue(N, 0);
- }
return combineGREVI_GORCI(N, DCI.DAG);
}
case RISCVISD::GREVW:
case RISCVISD::GORCW: {
// Only the lower 32 bits of LHS and lower 5 bits of RHS are read.
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- APInt LHSMask = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 32);
- APInt RHSMask = APInt::getLowBitsSet(RHS.getValueSizeInBits(), 5);
- if (SimplifyDemandedBits(LHS, LHSMask, DCI) ||
- SimplifyDemandedBits(RHS, RHSMask, DCI)) {
- if (N->getOpcode() != ISD::DELETED_NODE)
- DCI.AddToWorklist(N);
+ if (SimplifyDemandedLowBitsHelper(0, 32) ||
+ SimplifyDemandedLowBitsHelper(1, 5))
return SDValue(N, 0);
- }
return combineGREVI_GORCI(N, DCI.DAG);
}
case RISCVISD::SHFL:
case RISCVISD::UNSHFL: {
- // Only the lower log2(Bitwidth) bits of the the shift amount are read.
- SDValue ShAmt = N->getOperand(1);
- unsigned BitWidth = ShAmt.getValueSizeInBits();
+ // Only the lower log2(Bitwidth)-1 bits of the shift amount are read.
+ unsigned BitWidth = N->getOperand(1).getValueSizeInBits();
assert(isPowerOf2_32(BitWidth) && "Unexpected bit width");
- APInt ShAmtMask(BitWidth, (BitWidth / 2) - 1);
- if (SimplifyDemandedBits(ShAmt, ShAmtMask, DCI)) {
- if (N->getOpcode() != ISD::DELETED_NODE)
- DCI.AddToWorklist(N);
+ if (SimplifyDemandedLowBitsHelper(1, Log2_32(BitWidth) - 1))
return SDValue(N, 0);
- }
break;
}
case RISCVISD::SHFLW:
case RISCVISD::UNSHFLW: {
- // Only the lower 32 bits of LHS and lower 5 bits of RHS are read.
+ // Only the lower 32 bits of LHS and lower 4 bits of RHS are read.
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
APInt LHSMask = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 32);
APInt RHSMask = APInt::getLowBitsSet(RHS.getValueSizeInBits(), 4);
- if (SimplifyDemandedBits(LHS, LHSMask, DCI) ||
- SimplifyDemandedBits(RHS, RHSMask, DCI)) {
- if (N->getOpcode() != ISD::DELETED_NODE)
- DCI.AddToWorklist(N);
+ if (SimplifyDemandedLowBitsHelper(0, 32) ||
+ SimplifyDemandedLowBitsHelper(1, 4))
return SDValue(N, 0);
- }
break;
}
case RISCVISD::BCOMPRESSW:
case RISCVISD::BDECOMPRESSW: {
// Only the lower 32 bits of LHS and RHS are read.
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- APInt Mask = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 32);
- if (SimplifyDemandedBits(LHS, Mask, DCI) ||
- SimplifyDemandedBits(RHS, Mask, DCI)) {
- if (N->getOpcode() != ISD::DELETED_NODE)
- DCI.AddToWorklist(N);
+ if (SimplifyDemandedLowBitsHelper(0, 32) ||
+ SimplifyDemandedLowBitsHelper(1, 32))
return SDValue(N, 0);
- }
break;
}
+ case RISCVISD::FMV_X_ANYEXTH:
case RISCVISD::FMV_X_ANYEXTW_RV64: {
SDLoc DL(N);
SDValue Op0 = N->getOperand(0);
+ MVT VT = N->getSimpleValueType(0);
// If the input to FMV_X_ANYEXTW_RV64 is just FMV_W_X_RV64 then the
- // conversion is unnecessary and can be replaced with an ANY_EXTEND
- // of the FMV_W_X_RV64 operand.
- if (Op0->getOpcode() == RISCVISD::FMV_W_X_RV64) {
- assert(Op0.getOperand(0).getValueType() == MVT::i64 &&
+ // conversion is unnecessary and can be replaced with the FMV_W_X_RV64
+ // operand. Similar for FMV_X_ANYEXTH and FMV_H_X.
+ if ((N->getOpcode() == RISCVISD::FMV_X_ANYEXTW_RV64 &&
+ Op0->getOpcode() == RISCVISD::FMV_W_X_RV64) ||
+ (N->getOpcode() == RISCVISD::FMV_X_ANYEXTH &&
+ Op0->getOpcode() == RISCVISD::FMV_H_X)) {
+ assert(Op0.getOperand(0).getValueType() == VT &&
"Unexpected value type!");
return Op0.getOperand(0);
}
@@ -6066,23 +6991,27 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
if (!(Op0.getOpcode() == ISD::FNEG || Op0.getOpcode() == ISD::FABS) ||
!Op0.getNode()->hasOneUse())
break;
- SDValue NewFMV = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64,
- Op0.getOperand(0));
- APInt SignBit = APInt::getSignMask(32).sext(64);
+ SDValue NewFMV = DAG.getNode(N->getOpcode(), DL, VT, Op0.getOperand(0));
+ unsigned FPBits = N->getOpcode() == RISCVISD::FMV_X_ANYEXTW_RV64 ? 32 : 16;
+ APInt SignBit = APInt::getSignMask(FPBits).sextOrSelf(VT.getSizeInBits());
if (Op0.getOpcode() == ISD::FNEG)
- return DAG.getNode(ISD::XOR, DL, MVT::i64, NewFMV,
- DAG.getConstant(SignBit, DL, MVT::i64));
+ return DAG.getNode(ISD::XOR, DL, VT, NewFMV,
+ DAG.getConstant(SignBit, DL, VT));
assert(Op0.getOpcode() == ISD::FABS);
- return DAG.getNode(ISD::AND, DL, MVT::i64, NewFMV,
- DAG.getConstant(~SignBit, DL, MVT::i64));
+ return DAG.getNode(ISD::AND, DL, VT, NewFMV,
+ DAG.getConstant(~SignBit, DL, VT));
}
+ case ISD::ADD:
+ return performADDCombine(N, DAG, Subtarget);
+ case ISD::SUB:
+ return performSUBCombine(N, DAG);
case ISD::AND:
- return performANDCombine(N, DCI, Subtarget);
+ return performANDCombine(N, DAG);
case ISD::OR:
- return performORCombine(N, DCI, Subtarget);
+ return performORCombine(N, DAG, Subtarget);
case ISD::XOR:
- return performXORCombine(N, DCI, Subtarget);
+ return performXORCombine(N, DAG);
case ISD::ANY_EXTEND:
return performANY_EXTENDCombine(N, DCI, Subtarget);
case ISD::ZERO_EXTEND:
@@ -6099,7 +7028,14 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
// Transform
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
- auto CCVal = static_cast<ISD::CondCode>(N->getConstantOperandVal(2));
+ SDValue TrueV = N->getOperand(3);
+ SDValue FalseV = N->getOperand(4);
+
+ // If the True and False values are the same, we don't need a select_cc.
+ if (TrueV == FalseV)
+ return TrueV;
+
+ ISD::CondCode CCVal = cast<CondCodeSDNode>(N->getOperand(2))->get();
if (!ISD::isIntEqualitySetCC(CCVal))
break;
@@ -6120,11 +7056,9 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
LHS = LHS.getOperand(0);
translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG);
- SDValue TargetCC =
- DAG.getTargetConstant(CCVal, DL, Subtarget.getXLenVT());
- return DAG.getNode(
- RISCVISD::SELECT_CC, DL, N->getValueType(0),
- {LHS, RHS, TargetCC, N->getOperand(3), N->getOperand(4)});
+ SDValue TargetCC = DAG.getCondCode(CCVal);
+ return DAG.getNode(RISCVISD::SELECT_CC, DL, N->getValueType(0),
+ {LHS, RHS, TargetCC, TrueV, FalseV});
}
// Fold (select_cc (xor X, Y), 0, eq/ne, trueV, falseV) ->
@@ -6132,8 +7066,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
if (LHS.getOpcode() == ISD::XOR && isNullConstant(RHS))
return DAG.getNode(RISCVISD::SELECT_CC, SDLoc(N), N->getValueType(0),
{LHS.getOperand(0), LHS.getOperand(1),
- N->getOperand(2), N->getOperand(3),
- N->getOperand(4)});
+ N->getOperand(2), TrueV, FalseV});
// (select_cc X, 1, setne, trueV, falseV) ->
// (select_cc X, 0, seteq, trueV, falseV) if we can prove X is 0/1.
// This can occur when legalizing some floating point comparisons.
@@ -6141,12 +7074,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
if (isOneConstant(RHS) && DAG.MaskedValueIsZero(LHS, Mask)) {
SDLoc DL(N);
CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
- SDValue TargetCC =
- DAG.getTargetConstant(CCVal, DL, Subtarget.getXLenVT());
+ SDValue TargetCC = DAG.getCondCode(CCVal);
RHS = DAG.getConstant(0, DL, LHS.getValueType());
- return DAG.getNode(
- RISCVISD::SELECT_CC, DL, N->getValueType(0),
- {LHS, RHS, TargetCC, N->getOperand(3), N->getOperand(4)});
+ return DAG.getNode(RISCVISD::SELECT_CC, DL, N->getValueType(0),
+ {LHS, RHS, TargetCC, TrueV, FalseV});
}
break;
@@ -6227,18 +7158,33 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
DAG.getNode(ISD::FNEG, DL, VT, NewFPExtRound));
}
case ISD::MGATHER:
- case ISD::MSCATTER: {
+ case ISD::MSCATTER:
+ case ISD::VP_GATHER:
+ case ISD::VP_SCATTER: {
if (!DCI.isBeforeLegalize())
break;
- MaskedGatherScatterSDNode *MGSN = cast<MaskedGatherScatterSDNode>(N);
- SDValue Index = MGSN->getIndex();
+ SDValue Index, ScaleOp;
+ bool IsIndexScaled = false;
+ bool IsIndexSigned = false;
+ if (const auto *VPGSN = dyn_cast<VPGatherScatterSDNode>(N)) {
+ Index = VPGSN->getIndex();
+ ScaleOp = VPGSN->getScale();
+ IsIndexScaled = VPGSN->isIndexScaled();
+ IsIndexSigned = VPGSN->isIndexSigned();
+ } else {
+ const auto *MGSN = cast<MaskedGatherScatterSDNode>(N);
+ Index = MGSN->getIndex();
+ ScaleOp = MGSN->getScale();
+ IsIndexScaled = MGSN->isIndexScaled();
+ IsIndexSigned = MGSN->isIndexSigned();
+ }
EVT IndexVT = Index.getValueType();
MVT XLenVT = Subtarget.getXLenVT();
// RISCV indexed loads only support the "unsigned unscaled" addressing
// mode, so anything else must be manually legalized.
- bool NeedsIdxLegalization = MGSN->isIndexScaled() ||
- (MGSN->isIndexSigned() &&
- IndexVT.getVectorElementType().bitsLT(XLenVT));
+ bool NeedsIdxLegalization =
+ IsIndexScaled ||
+ (IsIndexSigned && IndexVT.getVectorElementType().bitsLT(XLenVT));
if (!NeedsIdxLegalization)
break;
@@ -6247,36 +7193,48 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
// Any index legalization should first promote to XLenVT, so we don't lose
// bits when scaling. This may create an illegal index type so we let
// LLVM's legalization take care of the splitting.
+ // FIXME: LLVM can't split VP_GATHER or VP_SCATTER yet.
if (IndexVT.getVectorElementType().bitsLT(XLenVT)) {
IndexVT = IndexVT.changeVectorElementType(XLenVT);
- Index = DAG.getNode(MGSN->isIndexSigned() ? ISD::SIGN_EXTEND
- : ISD::ZERO_EXTEND,
+ Index = DAG.getNode(IsIndexSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
DL, IndexVT, Index);
}
- unsigned Scale = N->getConstantOperandVal(5);
- if (MGSN->isIndexScaled() && Scale != 1) {
+ unsigned Scale = cast<ConstantSDNode>(ScaleOp)->getZExtValue();
+ if (IsIndexScaled && Scale != 1) {
// Manually scale the indices by the element size.
// TODO: Sanitize the scale operand here?
+ // TODO: For VP nodes, should we use VP_SHL here?
assert(isPowerOf2_32(Scale) && "Expecting power-of-two types");
SDValue SplatScale = DAG.getConstant(Log2_32(Scale), DL, IndexVT);
Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index, SplatScale);
}
ISD::MemIndexType NewIndexTy = ISD::UNSIGNED_UNSCALED;
- if (const auto *MGN = dyn_cast<MaskedGatherSDNode>(N)) {
+ if (const auto *VPGN = dyn_cast<VPGatherSDNode>(N))
+ return DAG.getGatherVP(N->getVTList(), VPGN->getMemoryVT(), DL,
+ {VPGN->getChain(), VPGN->getBasePtr(), Index,
+ VPGN->getScale(), VPGN->getMask(),
+ VPGN->getVectorLength()},
+ VPGN->getMemOperand(), NewIndexTy);
+ if (const auto *VPSN = dyn_cast<VPScatterSDNode>(N))
+ return DAG.getScatterVP(N->getVTList(), VPSN->getMemoryVT(), DL,
+ {VPSN->getChain(), VPSN->getValue(),
+ VPSN->getBasePtr(), Index, VPSN->getScale(),
+ VPSN->getMask(), VPSN->getVectorLength()},
+ VPSN->getMemOperand(), NewIndexTy);
+ if (const auto *MGN = dyn_cast<MaskedGatherSDNode>(N))
return DAG.getMaskedGather(
- N->getVTList(), MGSN->getMemoryVT(), DL,
- {MGSN->getChain(), MGN->getPassThru(), MGSN->getMask(),
- MGSN->getBasePtr(), Index, MGN->getScale()},
+ N->getVTList(), MGN->getMemoryVT(), DL,
+ {MGN->getChain(), MGN->getPassThru(), MGN->getMask(),
+ MGN->getBasePtr(), Index, MGN->getScale()},
MGN->getMemOperand(), NewIndexTy, MGN->getExtensionType());
- }
const auto *MSN = cast<MaskedScatterSDNode>(N);
return DAG.getMaskedScatter(
- N->getVTList(), MGSN->getMemoryVT(), DL,
- {MGSN->getChain(), MSN->getValue(), MGSN->getMask(), MGSN->getBasePtr(),
- Index, MGSN->getScale()},
- MGSN->getMemOperand(), NewIndexTy, MSN->isTruncatingStore());
+ N->getVTList(), MSN->getMemoryVT(), DL,
+ {MSN->getChain(), MSN->getValue(), MSN->getMask(), MSN->getBasePtr(),
+ Index, MSN->getScale()},
+ MSN->getMemOperand(), NewIndexTy, MSN->isTruncatingStore());
}
case RISCVISD::SRA_VL:
case RISCVISD::SRL_VL:
@@ -6309,45 +7267,37 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case RISCVISD::MUL_VL: {
- // Try to form VWMUL or VWMULU.
- // FIXME: Look for splat of extended scalar as well.
- // FIXME: Support VWMULSU.
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
- bool IsSignExt = Op0.getOpcode() == RISCVISD::VSEXT_VL;
- bool IsZeroExt = Op0.getOpcode() == RISCVISD::VZEXT_VL;
- if ((!IsSignExt && !IsZeroExt) || Op0.getOpcode() != Op1.getOpcode())
- return SDValue();
-
- // Make sure the extends have a single use.
- if (!Op0.hasOneUse() || !Op1.hasOneUse())
- return SDValue();
-
- SDValue Mask = N->getOperand(2);
- SDValue VL = N->getOperand(3);
- if (Op0.getOperand(1) != Mask || Op1.getOperand(1) != Mask ||
- Op0.getOperand(2) != VL || Op1.getOperand(2) != VL)
- return SDValue();
-
- Op0 = Op0.getOperand(0);
- Op1 = Op1.getOperand(0);
-
- MVT VT = N->getSimpleValueType(0);
- MVT NarrowVT =
- MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() / 2),
- VT.getVectorElementCount());
-
- SDLoc DL(N);
-
- // Re-introduce narrower extends if needed.
- unsigned ExtOpc = IsSignExt ? RISCVISD::VSEXT_VL : RISCVISD::VZEXT_VL;
- if (Op0.getValueType() != NarrowVT)
- Op0 = DAG.getNode(ExtOpc, DL, NarrowVT, Op0, Mask, VL);
- if (Op1.getValueType() != NarrowVT)
- Op1 = DAG.getNode(ExtOpc, DL, NarrowVT, Op1, Mask, VL);
+ if (SDValue V = combineMUL_VLToVWMUL(N, Op0, Op1, DAG))
+ return V;
+ if (SDValue V = combineMUL_VLToVWMUL(N, Op1, Op0, DAG))
+ return V;
+ return SDValue();
+ }
+ case ISD::STORE: {
+ auto *Store = cast<StoreSDNode>(N);
+ SDValue Val = Store->getValue();
+ // Combine store of vmv.x.s to vse with VL of 1.
+ // FIXME: Support FP.
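+ // Sketch of the intent: for
+ //   t = (vmv_x_s v) ; store t, ptr
+ // where the vector element type matches the memory type, store the vector
+ // source directly with a VL-of-1 vse instead, avoiding the scalar move.
+ // The checks below guard the cases where this is legal.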
+ if (Val.getOpcode() == RISCVISD::VMV_X_S) {
+ SDValue Src = Val.getOperand(0);
+ EVT VecVT = Src.getValueType();
+ EVT MemVT = Store->getMemoryVT();
+ // The memory VT and the element type must match.
+ if (VecVT.getVectorElementType() == MemVT) {
+ SDLoc DL(N);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VecVT.getVectorElementCount());
+ return DAG.getStoreVP(Store->getChain(), DL, Src, Store->getBasePtr(),
+ DAG.getConstant(1, DL, MaskVT),
+ DAG.getConstant(1, DL, Subtarget.getXLenVT()),
+ Store->getPointerInfo(),
+ Store->getOriginalAlign(),
+ Store->getMemOperand()->getFlags());
+ }
+ }
- unsigned WMulOpc = IsSignExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL;
- return DAG.getNode(WMulOpc, DL, VT, Op0, Op1, Mask, VL);
+ break;
}
}
@@ -6479,7 +7429,7 @@ bool RISCVTargetLowering::targetShrinkDemandedConstant(
else
return false;
- // Sanity check that our new mask is a subset of the demanded mask.
+ // Check that our new mask is a subset of the demanded mask.
assert(IsLegalMask(NewMask));
return UseMask(NewMask);
}
@@ -6609,6 +7559,12 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
switch (Op.getOpcode()) {
default:
break;
+ case RISCVISD::SELECT_CC: {
+ unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(3), DemandedElts, Depth + 1);
+ if (Tmp == 1) return 1; // Early out.
+ unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(4), DemandedElts, Depth + 1);
+ return std::min(Tmp, Tmp2);
+ }
case RISCVISD::SLLW:
case RISCVISD::SRAW:
case RISCVISD::SRLW:
@@ -6625,8 +7581,8 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
case RISCVISD::UNSHFLW:
case RISCVISD::BCOMPRESSW:
case RISCVISD::BDECOMPRESSW:
- case RISCVISD::FCVT_W_RV64:
- case RISCVISD::FCVT_WU_RV64:
+ case RISCVISD::FCVT_W_RTZ_RV64:
+ case RISCVISD::FCVT_WU_RTZ_RV64:
// TODO: As the result is sign-extended, this is conservatively correct. A
// more precise answer could be calculated for SRAW depending on known
// bits in the shift amount.
@@ -6803,7 +7759,8 @@ static bool isSelectPseudo(MachineInstr &MI) {
}
static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
- MachineBasicBlock *BB) {
+ MachineBasicBlock *BB,
+ const RISCVSubtarget &Subtarget) {
// To "insert" Select_* instructions, we actually have to insert the triangle
// control-flow pattern. The incoming instructions know the destination vreg
// to set, the condition code register to branch on, the true/false values to
@@ -6830,7 +7787,7 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
// related approach and more information.
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
- auto CC = static_cast<ISD::CondCode>(MI.getOperand(3).getImm());
+ auto CC = static_cast<RISCVCC::CondCode>(MI.getOperand(3).getImm());
SmallVector<MachineInstr *, 4> SelectDebugValues;
SmallSet<Register, 4> SelectDests;
@@ -6863,7 +7820,7 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
}
}
- const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+ const RISCVInstrInfo &TII = *Subtarget.getInstrInfo();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
DebugLoc DL = MI.getDebugLoc();
MachineFunction::iterator I = ++BB->getIterator();
@@ -6892,9 +7849,7 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
HeadMBB->addSuccessor(TailMBB);
// Insert appropriate branch.
- unsigned Opcode = getBranchOpcodeForIntCondCode(CC);
-
- BuildMI(HeadMBB, DL, TII.get(Opcode))
+ BuildMI(HeadMBB, DL, TII.getBrCond(CC))
.addReg(LHS)
.addReg(RHS)
.addMBB(TailMBB);
@@ -6939,7 +7894,7 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case RISCV::Select_FPR16_Using_CC_GPR:
case RISCV::Select_FPR32_Using_CC_GPR:
case RISCV::Select_FPR64_Using_CC_GPR:
- return emitSelectPseudo(MI, BB);
+ return emitSelectPseudo(MI, BB, Subtarget);
case RISCV::BuildPairF64Pseudo:
return emitBuildPairF64Pseudo(MI, BB);
case RISCV::SplitF64Pseudo:
@@ -7258,7 +8213,7 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
}
assert((!UseGPRForF16_F32 || !UseGPRForF64 || LocVT == XLenVT ||
- (TLI.getSubtarget().hasStdExtV() && ValVT.isVector())) &&
+ (TLI.getSubtarget().hasVInstructions() && ValVT.isVector())) &&
"Expected an XLenVT or vector types at this stage");
if (Reg) {
@@ -7294,7 +8249,7 @@ void RISCVTargetLowering::analyzeInputArgs(
FunctionType *FType = MF.getFunction().getFunctionType();
Optional<unsigned> FirstMaskArgument;
- if (Subtarget.hasStdExtV())
+ if (Subtarget.hasVInstructions())
FirstMaskArgument = preAssignMask(Ins);
for (unsigned i = 0; i != NumArgs; ++i) {
@@ -7325,7 +8280,7 @@ void RISCVTargetLowering::analyzeOutputArgs(
unsigned NumArgs = Outs.size();
Optional<unsigned> FirstMaskArgument;
- if (Subtarget.hasStdExtV())
+ if (Subtarget.hasVInstructions())
FirstMaskArgument = preAssignMask(Outs);
for (unsigned i = 0; i != NumArgs; i++) {
@@ -8170,7 +9125,7 @@ bool RISCVTargetLowering::CanLowerReturn(
CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
Optional<unsigned> FirstMaskArgument;
- if (Subtarget.hasStdExtV())
+ if (Subtarget.hasVInstructions())
FirstMaskArgument = preAssignMask(Outs);
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
@@ -8339,8 +9294,10 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FMV_X_ANYEXTH)
NODE_NAME_CASE(FMV_W_X_RV64)
NODE_NAME_CASE(FMV_X_ANYEXTW_RV64)
- NODE_NAME_CASE(FCVT_W_RV64)
- NODE_NAME_CASE(FCVT_WU_RV64)
+ NODE_NAME_CASE(FCVT_X_RTZ)
+ NODE_NAME_CASE(FCVT_XU_RTZ)
+ NODE_NAME_CASE(FCVT_W_RTZ_RV64)
+ NODE_NAME_CASE(FCVT_WU_RTZ_RV64)
NODE_NAME_CASE(READ_CYCLE_WIDE)
NODE_NAME_CASE(GREV)
NODE_NAME_CASE(GREVW)
@@ -8435,7 +9392,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(VRGATHEREI16_VV_VL)
NODE_NAME_CASE(VSEXT_VL)
NODE_NAME_CASE(VZEXT_VL)
- NODE_NAME_CASE(VPOPC_VL)
+ NODE_NAME_CASE(VCPOP_VL)
NODE_NAME_CASE(VLE_VL)
NODE_NAME_CASE(VSE_VL)
NODE_NAME_CASE(READ_CSR)
@@ -8456,7 +9413,6 @@ RISCVTargetLowering::getConstraintType(StringRef Constraint) const {
default:
break;
case 'f':
- case 'v':
return C_RegisterClass;
case 'I':
case 'J':
@@ -8467,6 +9423,9 @@ RISCVTargetLowering::getConstraintType(StringRef Constraint) const {
case 'S': // A symbolic address
return C_Other;
}
+ } else {
+ if (Constraint == "vr" || Constraint == "vm")
+ return C_RegisterClass;
}
return TargetLowering::getConstraintType(Constraint);
}
@@ -8489,16 +9448,19 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (Subtarget.hasStdExtD() && VT == MVT::f64)
return std::make_pair(0U, &RISCV::FPR64RegClass);
break;
- case 'v':
- for (const auto *RC :
- {&RISCV::VMRegClass, &RISCV::VRRegClass, &RISCV::VRM2RegClass,
- &RISCV::VRM4RegClass, &RISCV::VRM8RegClass}) {
+ default:
+ break;
+ }
+ } else {
+ if (Constraint == "vr") {
+ for (const auto *RC : {&RISCV::VRRegClass, &RISCV::VRM2RegClass,
+ &RISCV::VRM4RegClass, &RISCV::VRM8RegClass}) {
if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy))
return std::make_pair(0U, RC);
}
- break;
- default:
- break;
+ } else if (Constraint == "vm") {
+ if (TRI->isTypeLegalForClass(RISCV::VMRegClass, VT.SimpleTy))
+ return std::make_pair(0U, &RISCV::VMRegClass);
}
}
@@ -8596,7 +9558,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
}
}
- if (Subtarget.hasStdExtV()) {
+ if (Subtarget.hasVInstructions()) {
Register VReg = StringSwitch<Register>(Constraint.lower())
.Case("{v0}", RISCV::V0)
.Case("{v1}", RISCV::V1)
@@ -8934,6 +9896,11 @@ bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2() ||
(1 - Imm).isPowerOf2() || (-1 - Imm).isPowerOf2())
return true;
+ // Optimize the MUL to (SH*ADD x, (SLLI x, bits)) if Imm is not simm12.
+ if (Subtarget.hasStdExtZba() && !Imm.isSignedIntN(12) &&
+ ((Imm - 2).isPowerOf2() || (Imm - 4).isPowerOf2() ||
+ (Imm - 8).isPowerOf2()))
+ return true;
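+ // For example (illustrative values only), Imm = 2050 = 2 + 2048 decomposes
+ // as (mul x, 2050) -> (SH1ADD x, (SLLI x, 11)), i.e. (x << 1) + (x << 11),
+ // avoiding materializing a constant that does not fit in simm12.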
// Omit the following optimization if the subtarget has the M extension
// and the data size >= XLen.
if (Subtarget.hasStdExtM() && VT.getSizeInBits() >= Subtarget.getXLen())
@@ -8952,6 +9919,29 @@ bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
return false;
}
+bool RISCVTargetLowering::isMulAddWithConstProfitable(
+ const SDValue &AddNode, const SDValue &ConstNode) const {
+ // Let the DAGCombiner decide for vectors.
+ EVT VT = AddNode.getValueType();
+ if (VT.isVector())
+ return true;
+
+ // Let the DAGCombiner decide for larger types.
+ if (VT.getScalarSizeInBits() > Subtarget.getXLen())
+ return true;
+
+ // It is worse if c1 is simm12 while c1*c2 is not.
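+ // For example (values for illustration only): with c1 = 2000 and c2 = 3,
+ // (mul (add x, 2000), 3) keeps 2000 in an ADDI, whereas folding to
+ // (add (mul x, 3), 6000) would need 6000 materialized separately since it
+ // does not fit in simm12.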
+ ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
+ ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
+ const APInt &C1 = C1Node->getAPIntValue();
+ const APInt &C2 = C2Node->getAPIntValue();
+ if (C1.isSignedIntN(12) && !(C1 * C2).isSignedIntN(12))
+ return false;
+
+ // Default to true and let the DAGCombiner decide.
+ return true;
+}
+
bool RISCVTargetLowering::allowsMisalignedMemoryAccesses(
EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
bool *Fast) const {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 0e71220da3b3..8e3d716ae919 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -84,10 +84,16 @@ enum NodeType : unsigned {
FMV_X_ANYEXTH,
FMV_W_X_RV64,
FMV_X_ANYEXTW_RV64,
+ // FP to XLen int conversions. Corresponds to fcvt.l(u).s/d/h on RV64 and
+ // fcvt.w(u).s/d/h on RV32. Unlike FP_TO_S/UINT these saturate out of
+ // range inputs. These are used for FP_TO_S/UINT_SAT lowering.
+ FCVT_X_RTZ,
+ FCVT_XU_RTZ,
// FP to 32 bit int conversions for RV64. These are used to keep track of the
- // result being sign extended to 64 bit.
- FCVT_W_RV64,
- FCVT_WU_RV64,
+ // result being sign extended to 64 bit. These saturate out of range inputs.
+ // Used for FP_TO_S/UINT and FP_TO_S/UINT_SAT lowering.
+ FCVT_W_RTZ_RV64,
+ FCVT_WU_RTZ_RV64,
// READ_CYCLE_WIDE - A read of the 64-bit cycle CSR on a 32-bit target
// (returns (Lo, Hi)). It takes a chain operand.
READ_CYCLE_WIDE,
@@ -158,12 +164,13 @@ enum NodeType : unsigned {
VFNCVT_ROD_VL,
// These nodes match the semantics of the corresponding RVV vector reduction
// instructions. They produce a vector result which is the reduction
- // performed over the first vector operand plus the first element of the
- // second vector operand. The first operand is an unconstrained vector type,
- // and the result and second operand's types are expected to be the
- // corresponding full-width LMUL=1 type for the first operand:
- // nxv8i8 = vecreduce_add nxv32i8, nxv8i8
- // nxv2i32 = vecreduce_add nxv8i32, nxv2i32
+ // performed over the second vector operand plus the first element of the
+ // third vector operand. The first operand is the pass-thru operand. The
+ // second operand is an unconstrained vector type, and the result, first, and
+ // third operand's types are expected to be the corresponding full-width
+ // LMUL=1 type for the second operand:
+ // nxv8i8 = vecreduce_add nxv8i8, nxv32i8, nxv8i8
+ // nxv2i32 = vecreduce_add nxv2i32, nxv8i32, nxv2i32
// The difference in types does introduce extra vsetvli instructions, but
// similarly it reduces the number of registers consumed per reduction.
// Also has a mask and VL operand.
@@ -256,8 +263,8 @@ enum NodeType : unsigned {
VSEXT_VL,
VZEXT_VL,
- // vpopc.m with additional mask and VL operands.
- VPOPC_VL,
+ // vcpop.m with additional mask and VL operands.
+ VCPOP_VL,
// Reads value of CSR.
// The first operand is a chain pointer. The second specifies address of the
@@ -308,6 +315,9 @@ public:
bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override;
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
+ bool hasAndNot(SDValue Y) const override;
+ bool shouldSinkOperands(Instruction *I,
+ SmallVectorImpl<Use *> &Ops) const override;
bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
@@ -455,6 +465,9 @@ public:
bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
SDValue C) const override;
+ bool isMulAddWithConstProfitable(const SDValue &AddNode,
+ const SDValue &ConstNode) const override;
+
TargetLowering::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
Value *emitMaskedAtomicRMWIntrinsic(IRBuilderBase &Builder, AtomicRMWInst *AI,
@@ -499,6 +512,8 @@ public:
bool shouldRemoveExtendFromGSIndex(EVT VT) const override;
+ bool isLegalElementTypeForRVV(Type *ScalarTy) const;
+
private:
/// RISCVCCAssignFn - This target-specific function extends the default
/// CCValAssign with additional information used to lower RISC-V calling
@@ -547,20 +562,23 @@ private:
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVPREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerVectorMaskVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVectorMaskVecReduction(SDValue Op, SelectionDAG &DAG,
+ bool IsVP) const;
SDValue lowerFPVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSTEP_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECTOR_REVERSE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerABS(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerMLOAD(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerMSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerMaskedLoad(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerMaskedStore(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFixedLengthVectorFCOPYSIGNToRVV(SDValue Op,
SelectionDAG &DAG) const;
- SDValue lowerMGATHER(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerMSCATTER(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerMaskedGather(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerMaskedScatter(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFixedLengthVectorLoadToRVV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFixedLengthVectorStoreToRVV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFixedLengthVectorSetccToRVV(SDValue Op, SelectionDAG &DAG) const;
@@ -602,6 +620,14 @@ private:
/// NOTE: Once BUILD_VECTOR can be custom lowered for all legal vector types,
/// this override can be removed.
bool mergeStoresAfterLegalization(EVT VT) const override;
+
+ /// Disable normalizing
+ /// select(N0&N1, X, Y) => select(N0, select(N1, X, Y), Y) and
+ /// select(N0|N1, X, Y) => select(N0, X, select(N1, X, Y)).
+ /// RISCV doesn't have flags so it's better to perform the and/or in a GPR.
+ bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override {
+ return false;
+ };
};
namespace RISCV {
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index fb7cb408cade..dbfc90f36f80 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -58,12 +58,13 @@ class VSETVLIInfo {
uint8_t TailAgnostic : 1;
uint8_t MaskAgnostic : 1;
uint8_t MaskRegOp : 1;
+ uint8_t StoreOp : 1;
uint8_t SEWLMULRatioOnly : 1;
public:
VSETVLIInfo()
: AVLImm(0), TailAgnostic(false), MaskAgnostic(false), MaskRegOp(false),
- SEWLMULRatioOnly(false) {}
+ StoreOp(false), SEWLMULRatioOnly(false) {}
static VSETVLIInfo getUnknown() {
VSETVLIInfo Info;
@@ -118,7 +119,8 @@ public:
TailAgnostic = RISCVVType::isTailAgnostic(VType);
MaskAgnostic = RISCVVType::isMaskAgnostic(VType);
}
- void setVTYPE(RISCVII::VLMUL L, unsigned S, bool TA, bool MA, bool MRO) {
+ void setVTYPE(RISCVII::VLMUL L, unsigned S, bool TA, bool MA, bool MRO,
+ bool IsStore) {
assert(isValid() && !isUnknown() &&
"Can't set VTYPE for uninitialized or unknown");
VLMul = L;
@@ -126,6 +128,7 @@ public:
TailAgnostic = TA;
MaskAgnostic = MA;
MaskRegOp = MRO;
+ StoreOp = IsStore;
}
unsigned encodeVTYPE() const {
@@ -148,10 +151,7 @@ public:
Other.MaskAgnostic);
}
- // Convert VLMUL to a fixed point value with 3 bits of fraction.
- unsigned getSEWLMULRatio() const {
- assert(isValid() && !isUnknown() &&
- "Can't use VTYPE for uninitialized or unknown");
+ static unsigned getSEWLMULRatio(unsigned SEW, RISCVII::VLMUL VLMul) {
unsigned LMul;
bool Fractional;
std::tie(LMul, Fractional) = RISCVVType::decodeVLMUL(VLMul);
@@ -163,6 +163,12 @@ public:
return (SEW * 8) / LMul;
}
+ unsigned getSEWLMULRatio() const {
+ assert(isValid() && !isUnknown() &&
+ "Can't use VTYPE for uninitialized or unknown");
+ return getSEWLMULRatio(SEW, VLMul);
+ }
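+ // For example, SEW=32/LMUL=1 and SEW=16/LMUL=1/2 both give the same
+ // SEW/LMUL ratio (and therefore the same VLMAX), while SEW=32/LMUL=2 does
+ // not.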
+
// Check if the VTYPE for these two VSETVLIInfos produce the same VLMAX.
bool hasSameVLMAX(const VSETVLIInfo &Other) const {
assert(isValid() && Other.isValid() &&
@@ -172,10 +178,30 @@ public:
return getSEWLMULRatio() == Other.getSEWLMULRatio();
}
+ bool hasCompatibleVTYPE(const VSETVLIInfo &InstrInfo, bool Strict) const {
+ // Simple case, see if full VTYPE matches.
+ if (hasSameVTYPE(InstrInfo))
+ return true;
+
+ if (Strict)
+ return false;
+
+ // If this is a mask reg operation, it only cares about VLMAX.
+ // FIXME: Mask reg operations are probably ok if "this" VLMAX is larger
+ // than "InstrInfo".
+ // FIXME: The policy bits can probably be ignored for mask reg operations.
+ if (InstrInfo.MaskRegOp && hasSameVLMAX(InstrInfo) &&
+ TailAgnostic == InstrInfo.TailAgnostic &&
+ MaskAgnostic == InstrInfo.MaskAgnostic)
+ return true;
+
+ return false;
+ }
+
// Determine whether the vector instructions requirements represented by
// InstrInfo are compatible with the previous vsetvli instruction represented
// by this.
- bool isCompatible(const VSETVLIInfo &InstrInfo) const {
+ bool isCompatible(const VSETVLIInfo &InstrInfo, bool Strict) const {
assert(isValid() && InstrInfo.isValid() &&
"Can't compare invalid VSETVLIInfos");
assert(!InstrInfo.SEWLMULRatioOnly &&
@@ -190,22 +216,52 @@ public:
// If the instruction doesn't need an AVLReg and the SEW matches, consider
// it compatible.
- if (InstrInfo.hasAVLReg() && InstrInfo.AVLReg == RISCV::NoRegister) {
+ if (!Strict && InstrInfo.hasAVLReg() &&
+ InstrInfo.AVLReg == RISCV::NoRegister) {
if (SEW == InstrInfo.SEW)
return true;
}
- // VTypes must match unless the instruction is a mask reg operation, then it
- // only care about VLMAX.
- // FIXME: Mask reg operations are probably ok if "this" VLMAX is larger
- // than "InstrInfo".
- if (!hasSameVTYPE(InstrInfo) &&
- !(InstrInfo.MaskRegOp && hasSameVLMAX(InstrInfo) &&
- TailAgnostic == InstrInfo.TailAgnostic &&
- MaskAgnostic == InstrInfo.MaskAgnostic))
+ // The AVL must match.
+ if (!hasSameAVL(InstrInfo))
return false;
- return hasSameAVL(InstrInfo);
+ if (hasCompatibleVTYPE(InstrInfo, Strict))
+ return true;
+
+ // Strict matches must ensure a full VTYPE match.
+ if (Strict)
+ return false;
+
+ // Store instructions don't use the policy fields.
+ // TODO: Move into hasCompatibleVTYPE?
+ if (InstrInfo.StoreOp && VLMul == InstrInfo.VLMul && SEW == InstrInfo.SEW)
+ return true;
+
+ // Anything else is not compatible.
+ return false;
+ }
+
+ bool isCompatibleWithLoadStoreEEW(unsigned EEW,
+ const VSETVLIInfo &InstrInfo) const {
+ assert(isValid() && InstrInfo.isValid() &&
+ "Can't compare invalid VSETVLIInfos");
+ assert(!InstrInfo.SEWLMULRatioOnly &&
+ "Expected a valid VTYPE for instruction!");
+ assert(EEW == InstrInfo.SEW && "Mismatched EEW/SEW for store");
+
+ if (isUnknown() || hasSEWLMULRatioOnly())
+ return false;
+
+ if (!hasSameAVL(InstrInfo))
+ return false;
+
+ // Stores can ignore the tail and mask policies.
+ if (!InstrInfo.StoreOp && (TailAgnostic != InstrInfo.TailAgnostic ||
+ MaskAgnostic != InstrInfo.MaskAgnostic))
+ return false;
+
+ return getSEWLMULRatio() == getSEWLMULRatio(EEW, InstrInfo.VLMul);
}
bool operator==(const VSETVLIInfo &Other) const {
@@ -278,7 +334,7 @@ public:
// If the change is compatible with the input, we won't create a VSETVLI
// and should keep the predecessor.
- if (isCompatible(Other))
+ if (isCompatible(Other, /*Strict*/ true))
return *this;
// Otherwise just use whatever is in this block.
@@ -362,14 +418,7 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
const MachineRegisterInfo *MRI) {
VSETVLIInfo InstrInfo;
unsigned NumOperands = MI.getNumExplicitOperands();
-
- RISCVII::VLMUL VLMul = RISCVII::getLMul(TSFlags);
-
- unsigned Log2SEW = MI.getOperand(NumOperands - 1).getImm();
- // A Log2SEW of 0 is an operation on mask registers only.
- bool MaskRegOp = Log2SEW == 0;
- unsigned SEW = Log2SEW ? 1 << Log2SEW : 8;
- assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW");
+ bool HasPolicy = RISCVII::hasVecPolicyOp(TSFlags);
// Default to tail agnostic unless the destination is tied to a source.
// Unless the source is undef. In that case the user would have some control
@@ -377,8 +426,15 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
// despite having a tied def.
bool ForceTailAgnostic = RISCVII::doesForceTailAgnostic(TSFlags);
bool TailAgnostic = true;
+ // If the instruction has a policy argument, use it.
+ if (HasPolicy) {
+ const MachineOperand &Op = MI.getOperand(MI.getNumExplicitOperands() - 1);
+ TailAgnostic = Op.getImm() & 0x1;
+ }
+
unsigned UseOpIdx;
- if (!ForceTailAgnostic && MI.isRegTiedToUseOperand(0, &UseOpIdx)) {
+ if (!(ForceTailAgnostic || (HasPolicy && TailAgnostic)) &&
+ MI.isRegTiedToUseOperand(0, &UseOpIdx)) {
TailAgnostic = false;
// If the tied operand is an IMPLICIT_DEF we can keep TailAgnostic.
const MachineOperand &UseMO = MI.getOperand(UseOpIdx);
@@ -390,16 +446,38 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
}
}
+ // Remove the tail policy so we can find the SEW and VL.
+ if (HasPolicy)
+ --NumOperands;
+
+ RISCVII::VLMUL VLMul = RISCVII::getLMul(TSFlags);
+
+ unsigned Log2SEW = MI.getOperand(NumOperands - 1).getImm();
+ // A Log2SEW of 0 is an operation on mask registers only.
+ bool MaskRegOp = Log2SEW == 0;
+ unsigned SEW = Log2SEW ? 1 << Log2SEW : 8;
+ assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW");
+
+ // If there are no explicit defs, this is a store instruction which can
+ // ignore the tail and mask policies.
+ bool StoreOp = MI.getNumExplicitDefs() == 0;
+
if (RISCVII::hasVLOp(TSFlags)) {
- const MachineOperand &VLOp = MI.getOperand(MI.getNumExplicitOperands() - 2);
- if (VLOp.isImm())
- InstrInfo.setAVLImm(VLOp.getImm());
- else
+ const MachineOperand &VLOp = MI.getOperand(NumOperands - 2);
+ if (VLOp.isImm()) {
+ int64_t Imm = VLOp.getImm();
+ // Convert the VLMax sentinel to the X0 register.
+ if (Imm == RISCV::VLMaxSentinel)
+ InstrInfo.setAVLReg(RISCV::X0);
+ else
+ InstrInfo.setAVLImm(Imm);
+ } else {
InstrInfo.setAVLReg(VLOp.getReg());
+ }
} else
InstrInfo.setAVLReg(RISCV::NoRegister);
InstrInfo.setVTYPE(VLMul, SEW, /*TailAgnostic*/ TailAgnostic,
- /*MaskAgnostic*/ false, MaskRegOp);
+ /*MaskAgnostic*/ false, MaskRegOp, StoreOp);
return InstrInfo;
}
@@ -413,7 +491,7 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, MachineInstr &MI,
// VLMAX.
if (PrevInfo.isValid() && !PrevInfo.isUnknown() &&
Info.hasSameAVL(PrevInfo) && Info.hasSameVLMAX(PrevInfo)) {
- BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoVSETVLI))
+ BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoVSETVLIX0))
.addReg(RISCV::X0, RegState::Define | RegState::Dead)
.addReg(RISCV::X0, RegState::Kill)
.addImm(Info.encodeVTYPE())
@@ -435,7 +513,7 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, MachineInstr &MI,
// the previous vl to become invalid.
if (PrevInfo.isValid() && !PrevInfo.isUnknown() &&
Info.hasSameVLMAX(PrevInfo)) {
- BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoVSETVLI))
+ BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoVSETVLIX0))
.addReg(RISCV::X0, RegState::Define | RegState::Dead)
.addReg(RISCV::X0, RegState::Kill)
.addImm(Info.encodeVTYPE())
@@ -450,11 +528,19 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, MachineInstr &MI,
return;
}
- // Use X0 as the DestReg unless AVLReg is X0.
+ if (AVLReg.isVirtual())
+ MRI->constrainRegClass(AVLReg, &RISCV::GPRNoX0RegClass);
+
+ // Use X0 as the DestReg unless AVLReg is X0. We also need to change the
+ // opcode if the AVLReg is X0 as they have different register classes for
+ // the AVL operand.
Register DestReg = RISCV::X0;
- if (AVLReg == RISCV::X0)
+ unsigned Opcode = RISCV::PseudoVSETVLI;
+ if (AVLReg == RISCV::X0) {
DestReg = MRI->createVirtualRegister(&RISCV::GPRRegClass);
- BuildMI(MBB, MI, DL, TII->get(RISCV::PseudoVSETVLI))
+ Opcode = RISCV::PseudoVSETVLIX0;
+ }
+ BuildMI(MBB, MI, DL, TII->get(Opcode))
.addReg(DestReg, RegState::Define | RegState::Dead)
.addReg(AVLReg)
.addImm(Info.encodeVTYPE());
@@ -464,14 +550,15 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, MachineInstr &MI,
// VSETIVLI instruction.
static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) {
VSETVLIInfo NewInfo;
- if (MI.getOpcode() == RISCV::PseudoVSETVLI) {
+ if (MI.getOpcode() == RISCV::PseudoVSETIVLI) {
+ NewInfo.setAVLImm(MI.getOperand(1).getImm());
+ } else {
+ assert(MI.getOpcode() == RISCV::PseudoVSETVLI ||
+ MI.getOpcode() == RISCV::PseudoVSETVLIX0);
Register AVLReg = MI.getOperand(1).getReg();
assert((AVLReg != RISCV::X0 || MI.getOperand(0).getReg() != RISCV::X0) &&
"Can't handle X0, X0 vsetvli yet");
NewInfo.setAVLReg(AVLReg);
- } else {
- assert(MI.getOpcode() == RISCV::PseudoVSETIVLI);
- NewInfo.setAVLImm(MI.getOperand(1).getImm());
}
NewInfo.setVTYPE(MI.getOperand(2).getImm());
@@ -480,7 +567,7 @@ static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) {
bool RISCVInsertVSETVLI::needVSETVLI(const VSETVLIInfo &Require,
const VSETVLIInfo &CurInfo) {
- if (CurInfo.isCompatible(Require))
+ if (CurInfo.isCompatible(Require, /*Strict*/ false))
return false;
// We didn't find a compatible value. If our AVL is a virtual register,
@@ -489,9 +576,10 @@ bool RISCVInsertVSETVLI::needVSETVLI(const VSETVLIInfo &Require,
// VSETVLI here.
if (!CurInfo.isUnknown() && Require.hasAVLReg() &&
Require.getAVLReg().isVirtual() && !CurInfo.hasSEWLMULRatioOnly() &&
- Require.hasSameVTYPE(CurInfo)) {
+ CurInfo.hasCompatibleVTYPE(Require, /*Strict*/ false)) {
if (MachineInstr *DefMI = MRI->getVRegDef(Require.getAVLReg())) {
if (DefMI->getOpcode() == RISCV::PseudoVSETVLI ||
+ DefMI->getOpcode() == RISCV::PseudoVSETVLIX0 ||
DefMI->getOpcode() == RISCV::PseudoVSETIVLI) {
VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI);
if (DefInfo.hasSameAVL(CurInfo) && DefInfo.hasSameVTYPE(CurInfo))
@@ -503,6 +591,202 @@ bool RISCVInsertVSETVLI::needVSETVLI(const VSETVLIInfo &Require,
return true;
}
+bool canSkipVSETVLIForLoadStore(const MachineInstr &MI,
+ const VSETVLIInfo &Require,
+ const VSETVLIInfo &CurInfo) {
+ unsigned EEW;
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case RISCV::PseudoVLE8_V_M1:
+ case RISCV::PseudoVLE8_V_M1_MASK:
+ case RISCV::PseudoVLE8_V_M2:
+ case RISCV::PseudoVLE8_V_M2_MASK:
+ case RISCV::PseudoVLE8_V_M4:
+ case RISCV::PseudoVLE8_V_M4_MASK:
+ case RISCV::PseudoVLE8_V_M8:
+ case RISCV::PseudoVLE8_V_M8_MASK:
+ case RISCV::PseudoVLE8_V_MF2:
+ case RISCV::PseudoVLE8_V_MF2_MASK:
+ case RISCV::PseudoVLE8_V_MF4:
+ case RISCV::PseudoVLE8_V_MF4_MASK:
+ case RISCV::PseudoVLE8_V_MF8:
+ case RISCV::PseudoVLE8_V_MF8_MASK:
+ case RISCV::PseudoVLSE8_V_M1:
+ case RISCV::PseudoVLSE8_V_M1_MASK:
+ case RISCV::PseudoVLSE8_V_M2:
+ case RISCV::PseudoVLSE8_V_M2_MASK:
+ case RISCV::PseudoVLSE8_V_M4:
+ case RISCV::PseudoVLSE8_V_M4_MASK:
+ case RISCV::PseudoVLSE8_V_M8:
+ case RISCV::PseudoVLSE8_V_M8_MASK:
+ case RISCV::PseudoVLSE8_V_MF2:
+ case RISCV::PseudoVLSE8_V_MF2_MASK:
+ case RISCV::PseudoVLSE8_V_MF4:
+ case RISCV::PseudoVLSE8_V_MF4_MASK:
+ case RISCV::PseudoVLSE8_V_MF8:
+ case RISCV::PseudoVLSE8_V_MF8_MASK:
+ case RISCV::PseudoVSE8_V_M1:
+ case RISCV::PseudoVSE8_V_M1_MASK:
+ case RISCV::PseudoVSE8_V_M2:
+ case RISCV::PseudoVSE8_V_M2_MASK:
+ case RISCV::PseudoVSE8_V_M4:
+ case RISCV::PseudoVSE8_V_M4_MASK:
+ case RISCV::PseudoVSE8_V_M8:
+ case RISCV::PseudoVSE8_V_M8_MASK:
+ case RISCV::PseudoVSE8_V_MF2:
+ case RISCV::PseudoVSE8_V_MF2_MASK:
+ case RISCV::PseudoVSE8_V_MF4:
+ case RISCV::PseudoVSE8_V_MF4_MASK:
+ case RISCV::PseudoVSE8_V_MF8:
+ case RISCV::PseudoVSE8_V_MF8_MASK:
+ case RISCV::PseudoVSSE8_V_M1:
+ case RISCV::PseudoVSSE8_V_M1_MASK:
+ case RISCV::PseudoVSSE8_V_M2:
+ case RISCV::PseudoVSSE8_V_M2_MASK:
+ case RISCV::PseudoVSSE8_V_M4:
+ case RISCV::PseudoVSSE8_V_M4_MASK:
+ case RISCV::PseudoVSSE8_V_M8:
+ case RISCV::PseudoVSSE8_V_M8_MASK:
+ case RISCV::PseudoVSSE8_V_MF2:
+ case RISCV::PseudoVSSE8_V_MF2_MASK:
+ case RISCV::PseudoVSSE8_V_MF4:
+ case RISCV::PseudoVSSE8_V_MF4_MASK:
+ case RISCV::PseudoVSSE8_V_MF8:
+ case RISCV::PseudoVSSE8_V_MF8_MASK:
+ EEW = 8;
+ break;
+ case RISCV::PseudoVLE16_V_M1:
+ case RISCV::PseudoVLE16_V_M1_MASK:
+ case RISCV::PseudoVLE16_V_M2:
+ case RISCV::PseudoVLE16_V_M2_MASK:
+ case RISCV::PseudoVLE16_V_M4:
+ case RISCV::PseudoVLE16_V_M4_MASK:
+ case RISCV::PseudoVLE16_V_M8:
+ case RISCV::PseudoVLE16_V_M8_MASK:
+ case RISCV::PseudoVLE16_V_MF2:
+ case RISCV::PseudoVLE16_V_MF2_MASK:
+ case RISCV::PseudoVLE16_V_MF4:
+ case RISCV::PseudoVLE16_V_MF4_MASK:
+ case RISCV::PseudoVLSE16_V_M1:
+ case RISCV::PseudoVLSE16_V_M1_MASK:
+ case RISCV::PseudoVLSE16_V_M2:
+ case RISCV::PseudoVLSE16_V_M2_MASK:
+ case RISCV::PseudoVLSE16_V_M4:
+ case RISCV::PseudoVLSE16_V_M4_MASK:
+ case RISCV::PseudoVLSE16_V_M8:
+ case RISCV::PseudoVLSE16_V_M8_MASK:
+ case RISCV::PseudoVLSE16_V_MF2:
+ case RISCV::PseudoVLSE16_V_MF2_MASK:
+ case RISCV::PseudoVLSE16_V_MF4:
+ case RISCV::PseudoVLSE16_V_MF4_MASK:
+ case RISCV::PseudoVSE16_V_M1:
+ case RISCV::PseudoVSE16_V_M1_MASK:
+ case RISCV::PseudoVSE16_V_M2:
+ case RISCV::PseudoVSE16_V_M2_MASK:
+ case RISCV::PseudoVSE16_V_M4:
+ case RISCV::PseudoVSE16_V_M4_MASK:
+ case RISCV::PseudoVSE16_V_M8:
+ case RISCV::PseudoVSE16_V_M8_MASK:
+ case RISCV::PseudoVSE16_V_MF2:
+ case RISCV::PseudoVSE16_V_MF2_MASK:
+ case RISCV::PseudoVSE16_V_MF4:
+ case RISCV::PseudoVSE16_V_MF4_MASK:
+ case RISCV::PseudoVSSE16_V_M1:
+ case RISCV::PseudoVSSE16_V_M1_MASK:
+ case RISCV::PseudoVSSE16_V_M2:
+ case RISCV::PseudoVSSE16_V_M2_MASK:
+ case RISCV::PseudoVSSE16_V_M4:
+ case RISCV::PseudoVSSE16_V_M4_MASK:
+ case RISCV::PseudoVSSE16_V_M8:
+ case RISCV::PseudoVSSE16_V_M8_MASK:
+ case RISCV::PseudoVSSE16_V_MF2:
+ case RISCV::PseudoVSSE16_V_MF2_MASK:
+ case RISCV::PseudoVSSE16_V_MF4:
+ case RISCV::PseudoVSSE16_V_MF4_MASK:
+ EEW = 16;
+ break;
+ case RISCV::PseudoVLE32_V_M1:
+ case RISCV::PseudoVLE32_V_M1_MASK:
+ case RISCV::PseudoVLE32_V_M2:
+ case RISCV::PseudoVLE32_V_M2_MASK:
+ case RISCV::PseudoVLE32_V_M4:
+ case RISCV::PseudoVLE32_V_M4_MASK:
+ case RISCV::PseudoVLE32_V_M8:
+ case RISCV::PseudoVLE32_V_M8_MASK:
+ case RISCV::PseudoVLE32_V_MF2:
+ case RISCV::PseudoVLE32_V_MF2_MASK:
+ case RISCV::PseudoVLSE32_V_M1:
+ case RISCV::PseudoVLSE32_V_M1_MASK:
+ case RISCV::PseudoVLSE32_V_M2:
+ case RISCV::PseudoVLSE32_V_M2_MASK:
+ case RISCV::PseudoVLSE32_V_M4:
+ case RISCV::PseudoVLSE32_V_M4_MASK:
+ case RISCV::PseudoVLSE32_V_M8:
+ case RISCV::PseudoVLSE32_V_M8_MASK:
+ case RISCV::PseudoVLSE32_V_MF2:
+ case RISCV::PseudoVLSE32_V_MF2_MASK:
+ case RISCV::PseudoVSE32_V_M1:
+ case RISCV::PseudoVSE32_V_M1_MASK:
+ case RISCV::PseudoVSE32_V_M2:
+ case RISCV::PseudoVSE32_V_M2_MASK:
+ case RISCV::PseudoVSE32_V_M4:
+ case RISCV::PseudoVSE32_V_M4_MASK:
+ case RISCV::PseudoVSE32_V_M8:
+ case RISCV::PseudoVSE32_V_M8_MASK:
+ case RISCV::PseudoVSE32_V_MF2:
+ case RISCV::PseudoVSE32_V_MF2_MASK:
+ case RISCV::PseudoVSSE32_V_M1:
+ case RISCV::PseudoVSSE32_V_M1_MASK:
+ case RISCV::PseudoVSSE32_V_M2:
+ case RISCV::PseudoVSSE32_V_M2_MASK:
+ case RISCV::PseudoVSSE32_V_M4:
+ case RISCV::PseudoVSSE32_V_M4_MASK:
+ case RISCV::PseudoVSSE32_V_M8:
+ case RISCV::PseudoVSSE32_V_M8_MASK:
+ case RISCV::PseudoVSSE32_V_MF2:
+ case RISCV::PseudoVSSE32_V_MF2_MASK:
+ EEW = 32;
+ break;
+ case RISCV::PseudoVLE64_V_M1:
+ case RISCV::PseudoVLE64_V_M1_MASK:
+ case RISCV::PseudoVLE64_V_M2:
+ case RISCV::PseudoVLE64_V_M2_MASK:
+ case RISCV::PseudoVLE64_V_M4:
+ case RISCV::PseudoVLE64_V_M4_MASK:
+ case RISCV::PseudoVLE64_V_M8:
+ case RISCV::PseudoVLE64_V_M8_MASK:
+ case RISCV::PseudoVLSE64_V_M1:
+ case RISCV::PseudoVLSE64_V_M1_MASK:
+ case RISCV::PseudoVLSE64_V_M2:
+ case RISCV::PseudoVLSE64_V_M2_MASK:
+ case RISCV::PseudoVLSE64_V_M4:
+ case RISCV::PseudoVLSE64_V_M4_MASK:
+ case RISCV::PseudoVLSE64_V_M8:
+ case RISCV::PseudoVLSE64_V_M8_MASK:
+ case RISCV::PseudoVSE64_V_M1:
+ case RISCV::PseudoVSE64_V_M1_MASK:
+ case RISCV::PseudoVSE64_V_M2:
+ case RISCV::PseudoVSE64_V_M2_MASK:
+ case RISCV::PseudoVSE64_V_M4:
+ case RISCV::PseudoVSE64_V_M4_MASK:
+ case RISCV::PseudoVSE64_V_M8:
+ case RISCV::PseudoVSE64_V_M8_MASK:
+ case RISCV::PseudoVSSE64_V_M1:
+ case RISCV::PseudoVSSE64_V_M1_MASK:
+ case RISCV::PseudoVSSE64_V_M2:
+ case RISCV::PseudoVSSE64_V_M2_MASK:
+ case RISCV::PseudoVSSE64_V_M4:
+ case RISCV::PseudoVSSE64_V_M4_MASK:
+ case RISCV::PseudoVSSE64_V_M8:
+ case RISCV::PseudoVSSE64_V_M8_MASK:
+ EEW = 64;
+ break;
+ }
+
+ return CurInfo.isCompatibleWithLoadStoreEEW(EEW, Require);
+}
+
bool RISCVInsertVSETVLI::computeVLVTYPEChanges(const MachineBasicBlock &MBB) {
bool HadVectorOp = false;
@@ -510,6 +794,7 @@ bool RISCVInsertVSETVLI::computeVLVTYPEChanges(const MachineBasicBlock &MBB) {
for (const MachineInstr &MI : MBB) {
// If this is an explicit VSETVLI or VSETIVLI, update our state.
if (MI.getOpcode() == RISCV::PseudoVSETVLI ||
+ MI.getOpcode() == RISCV::PseudoVSETVLIX0 ||
MI.getOpcode() == RISCV::PseudoVSETIVLI) {
HadVectorOp = true;
BBInfo.Change = getInfoForVSETVLI(MI);
@@ -527,7 +812,13 @@ bool RISCVInsertVSETVLI::computeVLVTYPEChanges(const MachineBasicBlock &MBB) {
} else {
// If this instruction isn't compatible with the previous VL/VTYPE
// we need to insert a VSETVLI.
- if (needVSETVLI(NewInfo, BBInfo.Change))
+ // If this is a unit-stride or strided load/store, we may be able to use
+ // the EMUL=(EEW/SEW)*LMUL relationship to avoid changing vtype.
+ // NOTE: We only do this if the vtype we're comparing against was
+ // created in this block. We need the first and third phase to treat
+ // the store the same way.
+ if (!canSkipVSETVLIForLoadStore(MI, NewInfo, BBInfo.Change) &&
+ needVSETVLI(NewInfo, BBInfo.Change))
BBInfo.Change = NewInfo;
}
}
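
As an aside on the EMUL=(EEW/SEW)*LMUL relationship the comment above relies on: a unit-stride or strided vector load/store encodes its own element width (EEW), and the hardware derives EMUL from the current vtype, so any vtype with the same SEW:LMUL ratio yields the same EMUL and the same VLMAX. The standalone C++ sketch below is illustrative only (VLEN and the helper names are made up, not the pass's code); it works through one example.

#include <cassert>

// Model LMUL in eighths so fractional LMUL (1/8..1/2) stays integral.
// EMUL*8 = (EEW / SEW) * (LMUL*8); VLMAX = VLEN * LMUL / SEW.
constexpr unsigned emulEighths(unsigned EEW, unsigned SEW, unsigned LMulEighths) {
  return EEW * LMulEighths / SEW;
}
constexpr unsigned vlmax(unsigned VLEN, unsigned SEW, unsigned LMulEighths) {
  return VLEN * LMulEighths / (SEW * 8);
}

int main() {
  constexpr unsigned VLEN = 128; // illustrative; any legal VLEN works
  // vtype A: SEW=32, LMUL=2; vtype B: SEW=8, LMUL=1/2 -- same SEW:LMUL ratio.
  // An EEW=16 load gets the same EMUL (=1) and the same VLMAX under either,
  // which is why the vtype change can be skipped for it.
  assert(emulEighths(16, 32, 16) == emulEighths(16, 8, 4)); // both LMUL 1
  assert(vlmax(VLEN, 32, 16) == vlmax(VLEN, 8, 4));         // both 8
  return 0;
}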
@@ -609,12 +900,14 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
const BlockData &PBBInfo = BlockInfo[PBB->getNumber()];
// If the exit from the predecessor has the VTYPE we are looking for
// we might be able to avoid a VSETVLI.
- if (PBBInfo.Exit.isUnknown() || !PBBInfo.Exit.hasSameVTYPE(Require))
+ if (PBBInfo.Exit.isUnknown() ||
+ !PBBInfo.Exit.hasCompatibleVTYPE(Require, /*Strict*/ false))
return true;
// We need the PHI input to the be the output of a VSET(I)VLI.
MachineInstr *DefMI = MRI->getVRegDef(InReg);
if (!DefMI || (DefMI->getOpcode() != RISCV::PseudoVSETVLI &&
+ DefMI->getOpcode() != RISCV::PseudoVSETVLIX0 &&
DefMI->getOpcode() != RISCV::PseudoVSETIVLI))
return true;
@@ -633,10 +926,13 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
VSETVLIInfo CurInfo;
+ // Only set if the current VSETVLIInfo comes from an explicit VSET(I)VLI.
+ MachineInstr *PrevVSETVLIMI = nullptr;
for (MachineInstr &MI : MBB) {
// If this is an explicit VSETVLI or VSETIVLI, update our state.
if (MI.getOpcode() == RISCV::PseudoVSETVLI ||
+ MI.getOpcode() == RISCV::PseudoVSETVLIX0 ||
MI.getOpcode() == RISCV::PseudoVSETIVLI) {
// Conservatively, mark the VL and VTYPE as live.
assert(MI.getOperand(3).getReg() == RISCV::VL &&
@@ -645,6 +941,7 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
MI.getOperand(3).setIsDead(false);
MI.getOperand(4).setIsDead(false);
CurInfo = getInfoForVSETVLI(MI);
+ PrevVSETVLIMI = &MI;
continue;
}
@@ -652,7 +949,11 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
if (RISCVII::hasSEWOp(TSFlags)) {
VSETVLIInfo NewInfo = computeInfoForInstr(MI, TSFlags, MRI);
if (RISCVII::hasVLOp(TSFlags)) {
- MachineOperand &VLOp = MI.getOperand(MI.getNumExplicitOperands() - 2);
+ unsigned Offset = 2;
+ if (RISCVII::hasVecPolicyOp(TSFlags))
+ Offset = 3;
+ MachineOperand &VLOp =
+ MI.getOperand(MI.getNumExplicitOperands() - Offset);
if (VLOp.isReg()) {
// Erase the AVL operand from the instruction.
VLOp.setReg(RISCV::NoRegister);
@@ -677,11 +978,35 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
} else {
// If this instruction isn't compatible with the previous VL/VTYPE
// we need to insert a VSETVLI.
- if (needVSETVLI(NewInfo, CurInfo)) {
- insertVSETVLI(MBB, MI, NewInfo, CurInfo);
+ // If this is a unit-stride or strided load/store, we may be able to use
+ // the EMUL=(EEW/SEW)*LMUL relationship to avoid changing vtype.
+ // NOTE: We can't use predecessor information for the store. We must
+ // treat it the same as the first phase so that we produce the correct
+ // vl/vtype for successor blocks.
+ if (!canSkipVSETVLIForLoadStore(MI, NewInfo, CurInfo) &&
+ needVSETVLI(NewInfo, CurInfo)) {
+ // If the previous VL/VTYPE was set by a VSETVLI whose result is unused,
+ // merge it with the current VL/VTYPE instead of inserting a new VSETVLI.
+ bool NeedInsertVSETVLI = true;
+ if (PrevVSETVLIMI) {
+ bool HasSameAVL =
+ CurInfo.hasSameAVL(NewInfo) ||
+ (NewInfo.hasAVLReg() && NewInfo.getAVLReg().isVirtual() &&
+ NewInfo.getAVLReg() == PrevVSETVLIMI->getOperand(0).getReg());
+ // If the two VSETVLIs have the same AVL and the same VLMAX,
+ // we can merge them.
+ if (HasSameAVL &&
+ CurInfo.getSEWLMULRatio() == NewInfo.getSEWLMULRatio()) {
+ PrevVSETVLIMI->getOperand(2).setImm(NewInfo.encodeVTYPE());
+ NeedInsertVSETVLI = false;
+ }
+ }
+ if (NeedInsertVSETVLI)
+ insertVSETVLI(MBB, MI, NewInfo, CurInfo);
CurInfo = NewInfo;
}
}
+ PrevVSETVLIMI = nullptr;
}
// If this is something updates VL/VTYPE that we don't know about, set
@@ -689,6 +1014,7 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
if (MI.isCall() || MI.isInlineAsm() || MI.modifiesRegister(RISCV::VL) ||
MI.modifiesRegister(RISCV::VTYPE)) {
CurInfo = VSETVLIInfo::getUnknown();
+ PrevVSETVLIMI = nullptr;
}
}
}
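
The PrevVSETVLIMI->getOperand(2).setImm(NewInfo.encodeVTYPE()) rewrite in the hunk above works because vtype is a plain immediate: vlmul in bits 2:0, vsew in bits 5:3, vta in bit 6, vma in bit 7. A minimal sketch of that packing, assuming the standard field layout (the helper name is made up):

#include <cassert>
#include <cstdint>

// vsew encoding: e8=0, e16=1, e32=2, e64=3.
// vlmul encoding: m1=0, m2=1, m4=2, m8=3 (fractional LMULs use 5..7).
static uint8_t encodeVType(unsigned VSew, unsigned VLMul, bool TailAgnostic,
                           bool MaskAgnostic) {
  return (MaskAgnostic << 7) | (TailAgnostic << 6) | (VSew << 3) | VLMul;
}

int main() {
  // Matches the "PseudoVSETIVLI 4, 73  // vsetivli zero, 4, e16,m2,ta,mu"
  // example that appears later in this patch: e16 (1), m2 (1), ta, mu -> 73.
  assert(encodeVType(/*e16*/ 1, /*m2*/ 1, /*ta*/ true, /*mu*/ false) == 73);
  return 0;
}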
@@ -696,7 +1022,7 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
// Skip if the vector extension is not enabled.
const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>();
- if (!ST.hasStdExtV())
+ if (!ST.hasVInstructions())
return false;
TII = ST.getInstrInfo();
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
index 8e9d245f13eb..cfad4cdb9364 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -178,6 +178,12 @@ class RVInst<dag outs, dag ins, string opcodestr, string argstr,
bit HasVLOp = 0;
let TSFlags{15} = HasVLOp;
+
+ bit HasVecPolicyOp = 0;
+ let TSFlags{16} = HasVecPolicyOp;
+
+ bit IsRVVWideningReduction = 0;
+ let TSFlags{17} = IsRVVWideningReduction;
}
// Pseudo instructions
@@ -199,7 +205,7 @@ class PseudoLoad<string opcodestr, RegisterClass rdty = GPR>
}
class PseudoFloatLoad<string opcodestr, RegisterClass rdty = GPR>
- : Pseudo<(outs rdty:$rd, GPR:$tmp), (ins bare_symbol:$addr), [], opcodestr, "$rd, $addr, $tmp"> {
+ : Pseudo<(outs GPR:$tmp, rdty:$rd), (ins bare_symbol:$addr), [], opcodestr, "$rd, $addr, $tmp"> {
let hasSideEffects = 0;
let mayLoad = 1;
let mayStore = 0;
@@ -209,7 +215,7 @@ class PseudoFloatLoad<string opcodestr, RegisterClass rdty = GPR>
// Pseudo store instructions.
class PseudoStore<string opcodestr, RegisterClass rsty = GPR>
- : Pseudo<(outs rsty:$rs, GPR:$tmp), (ins bare_symbol:$addr), [], opcodestr, "$rs, $addr, $tmp"> {
+ : Pseudo<(outs GPR:$tmp), (ins rsty:$rs, bare_symbol:$addr), [], opcodestr, "$rs, $addr, $tmp"> {
let hasSideEffects = 0;
let mayLoad = 0;
let mayStore = 1;
@@ -406,3 +412,135 @@ class RVInstJ<RISCVOpcode opcode, dag outs, dag ins, string opcodestr,
let Inst{11-7} = rd;
let Opcode = opcode.Value;
}
+
+//===----------------------------------------------------------------------===//
+// Instruction classes for .insn directives
+//===----------------------------------------------------------------------===//
+
+class DirectiveInsnR<dag outs, dag ins, string argstr>
+ : RVInst<outs, ins, "", "", [], InstFormatR> {
+ bits<7> opcode;
+ bits<7> funct7;
+ bits<3> funct3;
+
+ bits<5> rs2;
+ bits<5> rs1;
+ bits<5> rd;
+
+ let Inst{31-25} = funct7;
+ let Inst{24-20} = rs2;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = funct3;
+ let Inst{11-7} = rd;
+ let Opcode = opcode;
+
+ let AsmString = ".insn r " # argstr;
+}
+
+class DirectiveInsnR4<dag outs, dag ins, string argstr>
+ : RVInst<outs, ins, "", "", [], InstFormatR4> {
+ bits<7> opcode;
+ bits<2> funct2;
+ bits<3> funct3;
+
+ bits<5> rs3;
+ bits<5> rs2;
+ bits<5> rs1;
+ bits<5> rd;
+
+ let Inst{31-27} = rs3;
+ let Inst{26-25} = funct2;
+ let Inst{24-20} = rs2;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = funct3;
+ let Inst{11-7} = rd;
+ let Opcode = opcode;
+
+ let AsmString = ".insn r4 " # argstr;
+}
+
+class DirectiveInsnI<dag outs, dag ins, string argstr>
+ : RVInst<outs, ins, "", "", [], InstFormatI> {
+ bits<7> opcode;
+ bits<3> funct3;
+
+ bits<12> imm12;
+ bits<5> rs1;
+ bits<5> rd;
+
+ let Inst{31-20} = imm12;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = funct3;
+ let Inst{11-7} = rd;
+ let Opcode = opcode;
+
+ let AsmString = ".insn i " # argstr;
+}
+
+class DirectiveInsnS<dag outs, dag ins, string argstr>
+ : RVInst<outs, ins, "", "", [], InstFormatS> {
+ bits<7> opcode;
+ bits<3> funct3;
+
+ bits<12> imm12;
+ bits<5> rs2;
+ bits<5> rs1;
+
+ let Inst{31-25} = imm12{11-5};
+ let Inst{24-20} = rs2;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = funct3;
+ let Inst{11-7} = imm12{4-0};
+ let Opcode = opcode;
+
+ let AsmString = ".insn s " # argstr;
+}
+
+class DirectiveInsnB<dag outs, dag ins, string argstr>
+ : RVInst<outs, ins, "", "", [], InstFormatB> {
+ bits<7> opcode;
+ bits<3> funct3;
+
+ bits<12> imm12;
+ bits<5> rs2;
+ bits<5> rs1;
+
+ let Inst{31} = imm12{11};
+ let Inst{30-25} = imm12{9-4};
+ let Inst{24-20} = rs2;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = funct3;
+ let Inst{11-8} = imm12{3-0};
+ let Inst{7} = imm12{10};
+ let Opcode = opcode;
+
+ let AsmString = ".insn b " # argstr;
+}
+
+class DirectiveInsnU<dag outs, dag ins, string argstr>
+ : RVInst<outs, ins, "", "", [], InstFormatU> {
+ bits<7> opcode;
+
+ bits<20> imm20;
+ bits<5> rd;
+
+ let Inst{31-12} = imm20;
+ let Inst{11-7} = rd;
+ let Opcode = opcode;
+
+ let AsmString = ".insn u " # argstr;
+}
+
+class DirectiveInsnJ<dag outs, dag ins, string argstr>
+ : RVInst<outs, ins, "", "", [], InstFormatJ> {
+ bits<7> opcode;
+
+ bits<20> imm20;
+ bits<5> rd;
+
+ let Inst{31-12} = imm20;
+ let Inst{11-7} = rd;
+ let Opcode = opcode;
+
+ let AsmString = ".insn j " # argstr;
+}
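
For reference, the R-format layout that DirectiveInsnR describes (funct7 | rs2 | rs1 | funct3 | rd | opcode) can be packed by hand. The sketch below is an illustrative encoder, not the assembler's code; it reproduces the encoding of "add a0, a1, a2" from the ".insn r" operand order introduced later in this patch.

#include <cassert>
#include <cstdint>

// Pack an R-type word exactly as DirectiveInsnR lays it out:
// Inst{31-25}=funct7, {24-20}=rs2, {19-15}=rs1, {14-12}=funct3,
// {11-7}=rd, {6-0}=opcode.
static uint32_t encodeInsnR(uint32_t Opcode, uint32_t Funct3, uint32_t Funct7,
                            uint32_t Rd, uint32_t Rs1, uint32_t Rs2) {
  return (Funct7 << 25) | (Rs2 << 20) | (Rs1 << 15) | (Funct3 << 12) |
         (Rd << 7) | Opcode;
}

int main() {
  // ".insn r 0x33, 0, 0, a0, a1, a2" carries the same fields as
  // "add a0, a1, a2", which encodes to 0x00C58533 (a0=x10, a1=x11, a2=x12).
  assert(encodeInsnR(0x33, 0, 0, /*rd=*/10, /*rs1=*/11, /*rs2=*/12) ==
         0x00C58533u);
  return 0;
}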
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index a541daaff9f4..547d82550cac 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -19,14 +19,15 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -36,6 +37,10 @@ using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "RISCVGenInstrInfo.inc"
+static cl::opt<bool> PreferWholeRegisterMove(
+ "riscv-prefer-whole-register-move", cl::init(false), cl::Hidden,
+ cl::desc("Prefer whole register move for vector registers."));
+
namespace llvm {
namespace RISCVVPseudosTable {
@@ -113,9 +118,137 @@ unsigned RISCVInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
static bool forwardCopyWillClobberTuple(unsigned DstReg, unsigned SrcReg,
unsigned NumRegs) {
- // We really want the positive remainder mod 32 here, that happens to be
- // easily obtainable with a mask.
- return ((DstReg - SrcReg) & 0x1f) < NumRegs;
+ return DstReg > SrcReg && (DstReg - SrcReg) < NumRegs;
+}
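
The tightened forwardCopyWillClobberTuple check above asks a simple question: when copying NumRegs consecutive registers in ascending order, does the destination range start inside the source range, so that an early sub-copy overwrites a source register that has not been read yet? The standalone sketch below (registers modeled as array slots, names made up) shows why the reversed copy order is needed in that case.

#include <cassert>

static bool forwardCopyWillClobber(unsigned Dst, unsigned Src, unsigned N) {
  return Dst > Src && (Dst - Src) < N;
}

// Copy N consecutive "registers" starting at Src to Dst, forwards or backwards.
static void copyTuple(int *RegFile, unsigned Dst, unsigned Src, unsigned N,
                      bool Reverse) {
  if (Reverse)
    for (unsigned I = N; I-- > 0;)
      RegFile[Dst + I] = RegFile[Src + I];
  else
    for (unsigned I = 0; I < N; ++I)
      RegFile[Dst + I] = RegFile[Src + I];
}

int main() {
  // Copy the 4-register tuple v4..v7 into v6..v9 (overlapping, Dst > Src).
  int Regs[16] = {0};
  for (unsigned I = 0; I < 4; ++I)
    Regs[4 + I] = 100 + I;

  assert(forwardCopyWillClobber(6, 4, 4)); // so the pass reverses the order
  copyTuple(Regs, 6, 4, 4, /*Reverse=*/true);
  for (unsigned I = 0; I < 4; ++I)
    assert(Regs[6 + I] == 100 + (int)I); // values survive intact

  // A forward copy would have clobbered slots 6 and 7 before reading them.
  return 0;
}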
+
+static bool isConvertibleToVMV_V_V(const RISCVSubtarget &STI,
+ const MachineBasicBlock &MBB,
+ MachineBasicBlock::const_iterator MBBI,
+ MachineBasicBlock::const_iterator &DefMBBI,
+ RISCVII::VLMUL LMul) {
+ if (PreferWholeRegisterMove)
+ return false;
+
+ assert(MBBI->getOpcode() == TargetOpcode::COPY &&
+ "Unexpected COPY instruction.");
+ Register SrcReg = MBBI->getOperand(1).getReg();
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+
+ bool FoundDef = false;
+ bool FirstVSetVLI = false;
+ unsigned FirstSEW = 0;
+ while (MBBI != MBB.begin()) {
+ --MBBI;
+ if (MBBI->isMetaInstruction())
+ continue;
+
+ if (MBBI->getOpcode() == RISCV::PseudoVSETVLI ||
+ MBBI->getOpcode() == RISCV::PseudoVSETVLIX0 ||
+ MBBI->getOpcode() == RISCV::PseudoVSETIVLI) {
+ // There is a vsetvli between the COPY and the source-defining instruction.
+ // vy = def_vop ... (producing instruction)
+ // ...
+ // vsetvli
+ // ...
+ // vx = COPY vy
+ if (!FoundDef) {
+ if (!FirstVSetVLI) {
+ FirstVSetVLI = true;
+ unsigned FirstVType = MBBI->getOperand(2).getImm();
+ RISCVII::VLMUL FirstLMul = RISCVVType::getVLMUL(FirstVType);
+ FirstSEW = RISCVVType::getSEW(FirstVType);
+ // The first encountered vsetvli must have the same lmul as the
+ // register class of COPY.
+ if (FirstLMul != LMul)
+ return false;
+ }
+ // Only permit `vsetvli x0, x0, vtype` between the COPY and the
+ // source-defining instruction.
+ if (MBBI->getOperand(0).getReg() != RISCV::X0)
+ return false;
+ if (MBBI->getOperand(1).isImm())
+ return false;
+ if (MBBI->getOperand(1).getReg() != RISCV::X0)
+ return false;
+ continue;
+ }
+
+ // MBBI is the first vsetvli before the producing instruction.
+ unsigned VType = MBBI->getOperand(2).getImm();
+ // If there is a vsetvli between COPY and the producing instruction.
+ if (FirstVSetVLI) {
+ // If SEW is different, return false.
+ if (RISCVVType::getSEW(VType) != FirstSEW)
+ return false;
+ }
+
+ // If the vsetvli is tail undisturbed, keep the whole register move.
+ if (!RISCVVType::isTailAgnostic(VType))
+ return false;
+
+ // The check is conservative. We only have register classes for
+ // LMUL = 1/2/4/8. We should be able to convert vmv1r.v to vmv.v.v
+ // for fractional LMUL operations as well. However, we cannot use the
+ // vsetvli's lmul for widening operations, because the result of a
+ // widening operation is 2 x LMUL.
+ return LMul == RISCVVType::getVLMUL(VType);
+ } else if (MBBI->isInlineAsm() || MBBI->isCall()) {
+ return false;
+ } else if (MBBI->getNumDefs()) {
+ // Check all the instructions which will change VL.
+ // For example, vleff has implicit def VL.
+ if (MBBI->modifiesRegister(RISCV::VL))
+ return false;
+
+ // Go through all defined operands, including implicit defines.
+ for (const MachineOperand &MO : MBBI->operands()) {
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ if (!FoundDef && TRI->isSubRegisterEq(MO.getReg(), SrcReg)) {
+ // We only permit the source of the COPY to have the same LMUL as the
+ // defined operand.
+ // There are cases where we need to keep the whole register copy if the
+ // LMUL is different.
+ // For example,
+ // $x0 = PseudoVSETIVLI 4, 73 // vsetivli zero, 4, e16,m2,ta,mu
+ // $v28m4 = PseudoVWADD_VV_M2 $v26m2, $v8m2
+ // # The COPY may be created by vlmul_trunc intrinsic.
+ // $v26m2 = COPY renamable $v28m2, implicit killed $v28m4
+ //
+ // After widening, the valid value will be 4 x e32 elements. If we
+ // convert the COPY to vmv.v.v, it will only copy 4 x e16 elements.
+ // FIXME: The COPY of subregister of Zvlsseg register will not be able
+ // to convert to vmv.v.[v|i] under the constraint.
+ if (MO.getReg() != SrcReg)
+ return false;
+
+ // For widening reduction instructions with an LMUL_1 input vector,
+ // checking only the LMUL is insufficient because the reduction result
+ // is always LMUL_1.
+ // For example,
+ // $x11 = PseudoVSETIVLI 1, 64 // vsetivli a1, 1, e8, m1, ta, mu
+ // $v8m1 = PseudoVWREDSUM_VS_M1 $v26, $v27
+ // $v26 = COPY killed renamable $v8
+ // After widening, the valid value will be 1 x e16 elements. If we
+ // convert the COPY to vmv.v.v, it will only copy 1 x e8 elements.
+ uint64_t TSFlags = MBBI->getDesc().TSFlags;
+ if (RISCVII::isRVVWideningReduction(TSFlags))
+ return false;
+
+ // Found the definition.
+ FoundDef = true;
+ DefMBBI = MBBI;
+ // If the producing instruction does not depend on vsetvli, do not
+ // convert COPY to vmv.v.v. For example, VL1R_V or PseudoVRELOAD.
+ if (!RISCVII::hasSEWOp(TSFlags))
+ return false;
+ break;
+ }
+ }
+ }
+ }
+
+ return false;
}
void RISCVInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
@@ -133,7 +266,7 @@ void RISCVInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned Opc;
bool IsScalableVector = true;
unsigned NF = 1;
- unsigned LMul = 1;
+ RISCVII::VLMUL LMul = RISCVII::LMUL_1;
unsigned SubRegIdx = RISCV::sub_vrm1_0;
if (RISCV::FPR16RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::FSGNJ_H;
@@ -146,91 +279,157 @@ void RISCVInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
IsScalableVector = false;
} else if (RISCV::VRRegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV1R_V;
+ LMul = RISCVII::LMUL_1;
} else if (RISCV::VRM2RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV2R_V;
+ LMul = RISCVII::LMUL_2;
} else if (RISCV::VRM4RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV4R_V;
+ LMul = RISCVII::LMUL_4;
} else if (RISCV::VRM8RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV8R_V;
+ LMul = RISCVII::LMUL_8;
} else if (RISCV::VRN2M1RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV1R_V;
SubRegIdx = RISCV::sub_vrm1_0;
NF = 2;
- LMul = 1;
+ LMul = RISCVII::LMUL_1;
} else if (RISCV::VRN2M2RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV2R_V;
SubRegIdx = RISCV::sub_vrm2_0;
NF = 2;
- LMul = 2;
+ LMul = RISCVII::LMUL_2;
} else if (RISCV::VRN2M4RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV4R_V;
SubRegIdx = RISCV::sub_vrm4_0;
NF = 2;
- LMul = 4;
+ LMul = RISCVII::LMUL_4;
} else if (RISCV::VRN3M1RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV1R_V;
SubRegIdx = RISCV::sub_vrm1_0;
NF = 3;
- LMul = 1;
+ LMul = RISCVII::LMUL_1;
} else if (RISCV::VRN3M2RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV2R_V;
SubRegIdx = RISCV::sub_vrm2_0;
NF = 3;
- LMul = 2;
+ LMul = RISCVII::LMUL_2;
} else if (RISCV::VRN4M1RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV1R_V;
SubRegIdx = RISCV::sub_vrm1_0;
NF = 4;
- LMul = 1;
+ LMul = RISCVII::LMUL_1;
} else if (RISCV::VRN4M2RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV2R_V;
SubRegIdx = RISCV::sub_vrm2_0;
NF = 4;
- LMul = 2;
+ LMul = RISCVII::LMUL_2;
} else if (RISCV::VRN5M1RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV1R_V;
SubRegIdx = RISCV::sub_vrm1_0;
NF = 5;
- LMul = 1;
+ LMul = RISCVII::LMUL_1;
} else if (RISCV::VRN6M1RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV1R_V;
SubRegIdx = RISCV::sub_vrm1_0;
NF = 6;
- LMul = 1;
+ LMul = RISCVII::LMUL_1;
} else if (RISCV::VRN7M1RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV1R_V;
SubRegIdx = RISCV::sub_vrm1_0;
NF = 7;
- LMul = 1;
+ LMul = RISCVII::LMUL_1;
} else if (RISCV::VRN8M1RegClass.contains(DstReg, SrcReg)) {
Opc = RISCV::PseudoVMV1R_V;
SubRegIdx = RISCV::sub_vrm1_0;
NF = 8;
- LMul = 1;
+ LMul = RISCVII::LMUL_1;
} else {
llvm_unreachable("Impossible reg-to-reg copy");
}
if (IsScalableVector) {
+ bool UseVMV_V_V = false;
+ MachineBasicBlock::const_iterator DefMBBI;
+ unsigned DefExplicitOpNum;
+ unsigned VIOpc;
+ if (isConvertibleToVMV_V_V(STI, MBB, MBBI, DefMBBI, LMul)) {
+ UseVMV_V_V = true;
+ DefExplicitOpNum = DefMBBI->getNumExplicitOperands();
+ // We only need to handle LMUL = 1/2/4/8 here because we only define
+ // vector register classes for LMUL = 1/2/4/8.
+ switch (LMul) {
+ default:
+ llvm_unreachable("Impossible LMUL for vector register copy.");
+ case RISCVII::LMUL_1:
+ Opc = RISCV::PseudoVMV_V_V_M1;
+ VIOpc = RISCV::PseudoVMV_V_I_M1;
+ break;
+ case RISCVII::LMUL_2:
+ Opc = RISCV::PseudoVMV_V_V_M2;
+ VIOpc = RISCV::PseudoVMV_V_I_M2;
+ break;
+ case RISCVII::LMUL_4:
+ Opc = RISCV::PseudoVMV_V_V_M4;
+ VIOpc = RISCV::PseudoVMV_V_I_M4;
+ break;
+ case RISCVII::LMUL_8:
+ Opc = RISCV::PseudoVMV_V_V_M8;
+ VIOpc = RISCV::PseudoVMV_V_I_M8;
+ break;
+ }
+ }
+
+ bool UseVMV_V_I = false;
+ if (UseVMV_V_V && (DefMBBI->getOpcode() == VIOpc)) {
+ UseVMV_V_I = true;
+ Opc = VIOpc;
+ }
+
if (NF == 1) {
- BuildMI(MBB, MBBI, DL, get(Opc), DstReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ auto MIB = BuildMI(MBB, MBBI, DL, get(Opc), DstReg);
+ if (UseVMV_V_I)
+ MIB = MIB.add(DefMBBI->getOperand(1));
+ else
+ MIB = MIB.addReg(SrcReg, getKillRegState(KillSrc));
+ if (UseVMV_V_V) {
+ // The last two explicit operands of the vector pseudo are AVL and SEW.
+ // We also need to append the implicit uses of vl and vtype.
+ MIB.add(DefMBBI->getOperand(DefExplicitOpNum - 2)); // AVL
+ MIB.add(DefMBBI->getOperand(DefExplicitOpNum - 1)); // SEW
+ MIB.addReg(RISCV::VL, RegState::Implicit);
+ MIB.addReg(RISCV::VTYPE, RegState::Implicit);
+ }
} else {
const TargetRegisterInfo *TRI = STI.getRegisterInfo();
int I = 0, End = NF, Incr = 1;
unsigned SrcEncoding = TRI->getEncodingValue(SrcReg);
unsigned DstEncoding = TRI->getEncodingValue(DstReg);
- if (forwardCopyWillClobberTuple(DstEncoding, SrcEncoding, NF * LMul)) {
+ unsigned LMulVal;
+ bool Fractional;
+ std::tie(LMulVal, Fractional) = RISCVVType::decodeVLMUL(LMul);
+ assert(!Fractional && "Fractional LMUL is impossible here.");
+ if (forwardCopyWillClobberTuple(DstEncoding, SrcEncoding, NF * LMulVal)) {
I = NF - 1;
End = -1;
Incr = -1;
}
for (; I != End; I += Incr) {
- BuildMI(MBB, MBBI, DL, get(Opc), TRI->getSubReg(DstReg, SubRegIdx + I))
- .addReg(TRI->getSubReg(SrcReg, SubRegIdx + I),
- getKillRegState(KillSrc));
+ auto MIB = BuildMI(MBB, MBBI, DL, get(Opc),
+ TRI->getSubReg(DstReg, SubRegIdx + I));
+ if (UseVMV_V_I)
+ MIB = MIB.add(DefMBBI->getOperand(1));
+ else
+ MIB = MIB.addReg(TRI->getSubReg(SrcReg, SubRegIdx + I),
+ getKillRegState(KillSrc));
+ if (UseVMV_V_V) {
+ MIB.add(DefMBBI->getOperand(DefExplicitOpNum - 2)); // AVL
+ MIB.add(DefMBBI->getOperand(DefExplicitOpNum - 1)); // SEW
+ MIB.addReg(RISCV::VL, RegState::Implicit);
+ MIB.addReg(RISCV::VTYPE, RegState::Implicit);
+ }
}
}
} else {
@@ -458,6 +657,12 @@ void RISCVInstrInfo::movImm(MachineBasicBlock &MBB,
.addReg(SrcReg, RegState::Kill)
.addReg(RISCV::X0)
.setMIFlag(Flag);
+ } else if (Inst.Opc == RISCV::SH1ADD || Inst.Opc == RISCV::SH2ADD ||
+ Inst.Opc == RISCV::SH3ADD) {
+ BuildMI(MBB, MBBI, DL, get(Inst.Opc), Result)
+ .addReg(SrcReg, RegState::Kill)
+ .addReg(SrcReg, RegState::Kill)
+ .setMIFlag(Flag);
} else {
BuildMI(MBB, MBBI, DL, get(Inst.Opc), Result)
.addReg(SrcReg, RegState::Kill)
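
The SH1ADD/SH2ADD/SH3ADD special case added above exists because "shNadd rd, rs, rs" with both source operands equal computes (rs << N) + rs = rs * (2^N + 1), i.e. a multiply by 3, 5 or 9, which is presumably a step in the Zba-aware materialization sequences this movImm loop expands. A quick arithmetic check (plain C++, nothing RISC-V specific):

#include <cassert>
#include <cstdint>

// shNadd rd, rs1, rs2 computes (rs1 << N) + rs2. With rs1 == rs2 == x it is
// x * (2^N + 1), i.e. x*3, x*5 or x*9.
static int64_t shNadd(int64_t Rs1, int64_t Rs2, unsigned N) {
  return (Rs1 << N) + Rs2;
}

int main() {
  for (int64_t X : {1, 7, 42, 123456789}) {
    assert(shNadd(X, X, 1) == X * 3);
    assert(shNadd(X, X, 2) == X * 5);
    assert(shNadd(X, X, 3) == X * 9);
  }
  return 0;
}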
@@ -469,6 +674,25 @@ void RISCVInstrInfo::movImm(MachineBasicBlock &MBB,
}
}
+static RISCVCC::CondCode getCondFromBranchOpc(unsigned Opc) {
+ switch (Opc) {
+ default:
+ return RISCVCC::COND_INVALID;
+ case RISCV::BEQ:
+ return RISCVCC::COND_EQ;
+ case RISCV::BNE:
+ return RISCVCC::COND_NE;
+ case RISCV::BLT:
+ return RISCVCC::COND_LT;
+ case RISCV::BGE:
+ return RISCVCC::COND_GE;
+ case RISCV::BLTU:
+ return RISCVCC::COND_LTU;
+ case RISCV::BGEU:
+ return RISCVCC::COND_GEU;
+ }
+}
+
// The contents of values added to Cond are not examined outside of
// RISCVInstrInfo, giving us flexibility in what to push to it. For RISCV, we
// push BranchOpcode, Reg1, Reg2.
@@ -478,27 +702,47 @@ static void parseCondBranch(MachineInstr &LastInst, MachineBasicBlock *&Target,
assert(LastInst.getDesc().isConditionalBranch() &&
"Unknown conditional branch");
Target = LastInst.getOperand(2).getMBB();
- Cond.push_back(MachineOperand::CreateImm(LastInst.getOpcode()));
+ unsigned CC = getCondFromBranchOpc(LastInst.getOpcode());
+ Cond.push_back(MachineOperand::CreateImm(CC));
Cond.push_back(LastInst.getOperand(0));
Cond.push_back(LastInst.getOperand(1));
}
-static unsigned getOppositeBranchOpcode(int Opc) {
- switch (Opc) {
+const MCInstrDesc &RISCVInstrInfo::getBrCond(RISCVCC::CondCode CC) const {
+ switch (CC) {
+ default:
+ llvm_unreachable("Unknown condition code!");
+ case RISCVCC::COND_EQ:
+ return get(RISCV::BEQ);
+ case RISCVCC::COND_NE:
+ return get(RISCV::BNE);
+ case RISCVCC::COND_LT:
+ return get(RISCV::BLT);
+ case RISCVCC::COND_GE:
+ return get(RISCV::BGE);
+ case RISCVCC::COND_LTU:
+ return get(RISCV::BLTU);
+ case RISCVCC::COND_GEU:
+ return get(RISCV::BGEU);
+ }
+}
+
+RISCVCC::CondCode RISCVCC::getOppositeBranchCondition(RISCVCC::CondCode CC) {
+ switch (CC) {
default:
llvm_unreachable("Unrecognized conditional branch");
- case RISCV::BEQ:
- return RISCV::BNE;
- case RISCV::BNE:
- return RISCV::BEQ;
- case RISCV::BLT:
- return RISCV::BGE;
- case RISCV::BGE:
- return RISCV::BLT;
- case RISCV::BLTU:
- return RISCV::BGEU;
- case RISCV::BGEU:
- return RISCV::BLTU;
+ case RISCVCC::COND_EQ:
+ return RISCVCC::COND_NE;
+ case RISCVCC::COND_NE:
+ return RISCVCC::COND_EQ;
+ case RISCVCC::COND_LT:
+ return RISCVCC::COND_GE;
+ case RISCVCC::COND_GE:
+ return RISCVCC::COND_LT;
+ case RISCVCC::COND_LTU:
+ return RISCVCC::COND_GEU;
+ case RISCVCC::COND_GEU:
+ return RISCVCC::COND_LTU;
}
}
@@ -624,9 +868,9 @@ unsigned RISCVInstrInfo::insertBranch(
}
// Either a one or two-way conditional branch.
- unsigned Opc = Cond[0].getImm();
+ auto CC = static_cast<RISCVCC::CondCode>(Cond[0].getImm());
MachineInstr &CondMI =
- *BuildMI(&MBB, DL, get(Opc)).add(Cond[1]).add(Cond[2]).addMBB(TBB);
+ *BuildMI(&MBB, DL, getBrCond(CC)).add(Cond[1]).add(Cond[2]).addMBB(TBB);
if (BytesAdded)
*BytesAdded += getInstSizeInBytes(CondMI);
@@ -641,11 +885,11 @@ unsigned RISCVInstrInfo::insertBranch(
return 2;
}
-unsigned RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
- MachineBasicBlock &DestBB,
- const DebugLoc &DL,
- int64_t BrOffset,
- RegScavenger *RS) const {
+void RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock &DestBB,
+ MachineBasicBlock &RestoreBB,
+ const DebugLoc &DL, int64_t BrOffset,
+ RegScavenger *RS) const {
assert(RS && "RegScavenger required for long branching");
assert(MBB.empty() &&
"new block should be inserted for expanding unconditional branch");
@@ -671,16 +915,18 @@ unsigned RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
RS->enterBasicBlockEnd(MBB);
unsigned Scav = RS->scavengeRegisterBackwards(RISCV::GPRRegClass,
MI.getIterator(), false, 0);
+ // TODO: The case when there is no scavenged register needs special handling.
+ assert(Scav != RISCV::NoRegister && "No register is scavenged!");
MRI.replaceRegWith(ScratchReg, Scav);
MRI.clearVirtRegs();
RS->setRegUsed(Scav);
- return 8;
}
bool RISCVInstrInfo::reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const {
assert((Cond.size() == 3) && "Invalid branch condition!");
- Cond[0].setImm(getOppositeBranchOpcode(Cond[0].getImm()));
+ auto CC = static_cast<RISCVCC::CondCode>(Cond[0].getImm());
+ Cond[0].setImm(getOppositeBranchCondition(CC));
return false;
}
@@ -866,12 +1112,21 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
switch (OpType) {
default:
llvm_unreachable("Unexpected operand type");
+ case RISCVOp::OPERAND_UIMM2:
+ Ok = isUInt<2>(Imm);
+ break;
+ case RISCVOp::OPERAND_UIMM3:
+ Ok = isUInt<3>(Imm);
+ break;
case RISCVOp::OPERAND_UIMM4:
Ok = isUInt<4>(Imm);
break;
case RISCVOp::OPERAND_UIMM5:
Ok = isUInt<5>(Imm);
break;
+ case RISCVOp::OPERAND_UIMM7:
+ Ok = isUInt<7>(Imm);
+ break;
case RISCVOp::OPERAND_UIMM12:
Ok = isUInt<12>(Imm);
break;
@@ -1086,7 +1341,7 @@ RISCVInstrInfo::getOutliningType(MachineBasicBlock::iterator &MBBI,
// Make sure the operands don't reference something unsafe.
for (const auto &MO : MI.operands())
- if (MO.isMBB() || MO.isBlockAddress() || MO.isCPI())
+ if (MO.isMBB() || MO.isBlockAddress() || MO.isCPI() || MO.isJTI())
return outliner::InstrType::Illegal;
// Don't allow instructions which won't be materialized to impact outlining
@@ -1139,7 +1394,7 @@ MachineBasicBlock::iterator RISCVInstrInfo::insertOutlinedCall(
// clang-format off
#define CASE_VFMA_OPCODE_COMMON(OP, TYPE, LMUL) \
- RISCV::PseudoV##OP##_##TYPE##_##LMUL##_COMMUTABLE
+ RISCV::PseudoV##OP##_##TYPE##_##LMUL
#define CASE_VFMA_OPCODE_LMULS(OP, TYPE) \
CASE_VFMA_OPCODE_COMMON(OP, TYPE, MF8): \
@@ -1182,6 +1437,11 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
case CASE_VFMA_OPCODE_LMULS(NMSAC, VX):
case CASE_VFMA_OPCODE_LMULS(MACC, VV):
case CASE_VFMA_OPCODE_LMULS(NMSAC, VV): {
+ // If the tail policy is undisturbed we can't commute.
+ assert(RISCVII::hasVecPolicyOp(MI.getDesc().TSFlags));
+ if ((MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 1) == 0)
+ return false;
+
// For these instructions we can only swap operand 1 and operand 3 by
// changing the opcode.
unsigned CommutableOpIdx1 = 1;
@@ -1197,6 +1457,11 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
case CASE_VFMA_OPCODE_LMULS(FNMSUB, VV):
case CASE_VFMA_OPCODE_LMULS(MADD, VV):
case CASE_VFMA_OPCODE_LMULS(NMSUB, VV): {
+ // If the tail policy is undisturbed we can't commute.
+ assert(RISCVII::hasVecPolicyOp(MI.getDesc().TSFlags));
+ if ((MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 1) == 0)
+ return false;
+
// For these instructions we have more freedom. We can commute with the
// other multiplicand or with the addend/subtrahend/minuend.
@@ -1223,7 +1488,7 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
// Both of operands are not fixed. Set one of commutable
// operands to the tied source.
CommutableOpIdx1 = 1;
- } else if (SrcOpIdx1 == CommutableOpIdx1) {
+ } else if (SrcOpIdx1 == CommuteAnyOperandIndex) {
// Only one of the operands is not fixed.
CommutableOpIdx1 = SrcOpIdx2;
}
@@ -1261,8 +1526,8 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
}
#define CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, LMUL) \
- case RISCV::PseudoV##OLDOP##_##TYPE##_##LMUL##_COMMUTABLE: \
- Opc = RISCV::PseudoV##NEWOP##_##TYPE##_##LMUL##_COMMUTABLE; \
+ case RISCV::PseudoV##OLDOP##_##TYPE##_##LMUL: \
+ Opc = RISCV::PseudoV##NEWOP##_##TYPE##_##LMUL; \
break;
#define CASE_VFMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, TYPE) \
@@ -1409,8 +1674,9 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, M2) \
CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4)
-MachineInstr *RISCVInstrInfo::convertToThreeAddress(
- MachineFunction::iterator &MBB, MachineInstr &MI, LiveVariables *LV) const {
+MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI,
+ LiveVariables *LV,
+ LiveIntervals *LIS) const {
switch (MI.getOpcode()) {
default:
break;
@@ -1434,7 +1700,8 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(
}
//clang-format on
- MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(MI.getOperand(0))
.add(MI.getOperand(1))
.add(MI.getOperand(2))
@@ -1451,6 +1718,20 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(
}
}
+ if (LIS) {
+ SlotIndex Idx = LIS->ReplaceMachineInstrInMaps(MI, *MIB);
+
+ if (MI.getOperand(0).isEarlyClobber()) {
+ // Use operand 1 was tied to the early-clobber def operand 0, so its live
+ // interval could have ended at an early-clobber slot. Now that they are
+ // no longer tied, we need to update it to the normal register slot.
+ LiveInterval &LI = LIS->getInterval(MI.getOperand(1).getReg());
+ LiveRange::Segment *S = LI.getSegmentContaining(Idx);
+ if (S->end == Idx.getRegSlot(true))
+ S->end = Idx.getRegSlot();
+ }
+ }
+
return MIB;
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index d80fc483826f..2bfad7844c43 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -24,12 +24,29 @@ namespace llvm {
class RISCVSubtarget;
+namespace RISCVCC {
+
+enum CondCode {
+ COND_EQ,
+ COND_NE,
+ COND_LT,
+ COND_GE,
+ COND_LTU,
+ COND_GEU,
+ COND_INVALID
+};
+
+CondCode getOppositeBranchCondition(CondCode);
+
+} // end of namespace RISCVCC
+
class RISCVInstrInfo : public RISCVGenInstrInfo {
public:
explicit RISCVInstrInfo(RISCVSubtarget &STI);
MCInst getNop() const override;
+ const MCInstrDesc &getBrCond(RISCVCC::CondCode CC) const;
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
@@ -68,10 +85,10 @@ public:
const DebugLoc &dl,
int *BytesAdded = nullptr) const override;
- unsigned insertIndirectBranch(MachineBasicBlock &MBB,
- MachineBasicBlock &NewDestBB,
- const DebugLoc &DL, int64_t BrOffset,
- RegScavenger *RS = nullptr) const override;
+ void insertIndirectBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock &NewDestBB,
+ MachineBasicBlock &RestoreBB, const DebugLoc &DL,
+ int64_t BrOffset, RegScavenger *RS) const override;
unsigned removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved = nullptr) const override;
@@ -143,9 +160,8 @@ public:
unsigned OpIdx1,
unsigned OpIdx2) const override;
- MachineInstr *convertToThreeAddress(MachineFunction::iterator &MBB,
- MachineInstr &MI,
- LiveVariables *LV) const override;
+ MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
+ LiveIntervals *LIS) const override;
Register getVLENFactoredAmount(
MachineFunction &MF, MachineBasicBlock &MBB,
@@ -164,6 +180,11 @@ protected:
const RISCVSubtarget &STI;
};
+namespace RISCV {
+// Special immediate for AVL operand of V pseudo instructions to indicate VLMax.
+static constexpr int64_t VLMaxSentinel = -1LL;
+} // namespace RISCV
+
namespace RISCVVPseudosTable {
struct PseudoInfo {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 949fff25e9e0..b653928ccea9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -23,6 +23,7 @@ def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
// Target-dependent type requirements.
def SDT_RISCVCall : SDTypeProfile<0, -1, [SDTCisVT<0, XLenVT>]>;
def SDT_RISCVSelectCC : SDTypeProfile<1, 5, [SDTCisSameAs<1, 2>,
+ SDTCisVT<3, OtherVT>,
SDTCisSameAs<0, 4>,
SDTCisSameAs<4, 5>]>;
def SDT_RISCVBrCC : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>,
@@ -152,6 +153,20 @@ def uimmlog2xlen : Operand<XLenVT>, ImmLeaf<XLenVT, [{
let OperandNamespace = "RISCVOp";
}
+def uimm2 : Operand<XLenVT> {
+ let ParserMatchClass = UImmAsmOperand<2>;
+ let DecoderMethod = "decodeUImmOperand<2>";
+ let OperandType = "OPERAND_UIMM2";
+ let OperandNamespace = "RISCVOp";
+}
+
+def uimm3 : Operand<XLenVT> {
+ let ParserMatchClass = UImmAsmOperand<3>;
+ let DecoderMethod = "decodeUImmOperand<3>";
+ let OperandType = "OPERAND_UIMM3";
+ let OperandNamespace = "RISCVOp";
+}
+
def uimm5 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isUInt<5>(Imm);}]> {
let ParserMatchClass = UImmAsmOperand<5>;
let DecoderMethod = "decodeUImmOperand<5>";
@@ -159,6 +174,13 @@ def uimm5 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isUInt<5>(Imm);}]> {
let OperandNamespace = "RISCVOp";
}
+def uimm7 : Operand<XLenVT> {
+ let ParserMatchClass = UImmAsmOperand<7>;
+ let DecoderMethod = "decodeUImmOperand<7>";
+ let OperandType = "OPERAND_UIMM7";
+ let OperandNamespace = "RISCVOp";
+}
+
def simm12 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isInt<12>(Imm);}]> {
let ParserMatchClass = SImmAsmOperand<12>;
let EncoderMethod = "getImmOpValue";
@@ -849,6 +871,87 @@ def : MnemonicAlias<"sbreak", "ebreak">;
def : InstAlias<"zext.b $rd, $rs", (ANDI GPR:$rd, GPR:$rs, 0xFF), 0>;
//===----------------------------------------------------------------------===//
+// .insn directive instructions
+//===----------------------------------------------------------------------===//
+
+// isCodeGenOnly = 1 to hide them from the tablegened assembly parser.
+let isCodeGenOnly = 1, hasSideEffects = 1, mayLoad = 1, mayStore = 1,
+ hasNoSchedulingInfo = 1 in {
+def InsnR : DirectiveInsnR<(outs AnyReg:$rd), (ins uimm7:$opcode, uimm3:$funct3,
+ uimm7:$funct7, AnyReg:$rs1,
+ AnyReg:$rs2),
+ "$opcode, $funct3, $funct7, $rd, $rs1, $rs2">;
+def InsnR4 : DirectiveInsnR4<(outs AnyReg:$rd), (ins uimm7:$opcode,
+ uimm3:$funct3,
+ uimm2:$funct2,
+ AnyReg:$rs1, AnyReg:$rs2,
+ AnyReg:$rs3),
+ "$opcode, $funct3, $funct2, $rd, $rs1, $rs2, $rs3">;
+def InsnI : DirectiveInsnI<(outs AnyReg:$rd), (ins uimm7:$opcode, uimm3:$funct3,
+ AnyReg:$rs1, simm12:$imm12),
+ "$opcode, $funct3, $rd, $rs1, $imm12">;
+def InsnI_Mem : DirectiveInsnI<(outs AnyReg:$rd), (ins uimm7:$opcode,
+ uimm3:$funct3,
+ AnyReg:$rs1,
+ simm12:$imm12),
+ "$opcode, $funct3, $rd, ${imm12}(${rs1})">;
+def InsnB : DirectiveInsnB<(outs), (ins uimm7:$opcode, uimm3:$funct3,
+ AnyReg:$rs1, AnyReg:$rs2,
+ simm13_lsb0:$imm12),
+ "$opcode, $funct3, $rs1, $rs2, $imm12">;
+def InsnU : DirectiveInsnU<(outs AnyReg:$rd), (ins uimm7:$opcode,
+ uimm20_lui:$imm20),
+ "$opcode, $rd, $imm20">;
+def InsnJ : DirectiveInsnJ<(outs AnyReg:$rd), (ins uimm7:$opcode,
+ simm21_lsb0_jal:$imm20),
+ "$opcode, $rd, $imm20">;
+def InsnS : DirectiveInsnS<(outs), (ins uimm7:$opcode, uimm3:$funct3,
+ AnyReg:$rs2, AnyReg:$rs1,
+ simm12:$imm12),
+ "$opcode, $funct3, $rs2, ${imm12}(${rs1})">;
+}
+
+// Use InstAliases to match these so that we can combine the insn and format
+// into a mnemonic to use as the key for the tablegened asm matcher table. The
+// parser will take care of creating these fake mnemonics and will only do it
+// for known formats.
+let EmitPriority = 0 in {
+def : InstAlias<".insn_r $opcode, $funct3, $funct7, $rd, $rs1, $rs2",
+ (InsnR AnyReg:$rd, uimm7:$opcode, uimm3:$funct3, uimm7:$funct7,
+ AnyReg:$rs1, AnyReg:$rs2)>;
+// Accept 4 register form of ".insn r" as alias for ".insn r4".
+def : InstAlias<".insn_r $opcode, $funct3, $funct2, $rd, $rs1, $rs2, $rs3",
+ (InsnR4 AnyReg:$rd, uimm7:$opcode, uimm3:$funct3, uimm2:$funct2,
+ AnyReg:$rs1, AnyReg:$rs2, AnyReg:$rs3)>;
+def : InstAlias<".insn_r4 $opcode, $funct3, $funct2, $rd, $rs1, $rs2, $rs3",
+ (InsnR4 AnyReg:$rd, uimm7:$opcode, uimm3:$funct3, uimm2:$funct2,
+ AnyReg:$rs1, AnyReg:$rs2, AnyReg:$rs3)>;
+def : InstAlias<".insn_i $opcode, $funct3, $rd, $rs1, $imm12",
+ (InsnI AnyReg:$rd, uimm7:$opcode, uimm3:$funct3, AnyReg:$rs1,
+ simm12:$imm12)>;
+def : InstAlias<".insn_i $opcode, $funct3, $rd, ${imm12}(${rs1})",
+ (InsnI_Mem AnyReg:$rd, uimm7:$opcode, uimm3:$funct3,
+ AnyReg:$rs1, simm12:$imm12)>;
+def : InstAlias<".insn_b $opcode, $funct3, $rs1, $rs2, $imm12",
+ (InsnB uimm7:$opcode, uimm3:$funct3, AnyReg:$rs1,
+ AnyReg:$rs2, simm13_lsb0:$imm12)>;
+// Accept sb as an alias for b.
+def : InstAlias<".insn_sb $opcode, $funct3, $rs1, $rs2, $imm12",
+ (InsnB uimm7:$opcode, uimm3:$funct3, AnyReg:$rs1,
+ AnyReg:$rs2, simm13_lsb0:$imm12)>;
+def : InstAlias<".insn_u $opcode, $rd, $imm20",
+ (InsnU AnyReg:$rd, uimm7:$opcode, uimm20_lui:$imm20)>;
+def : InstAlias<".insn_j $opcode, $rd, $imm20",
+ (InsnJ AnyReg:$rd, uimm7:$opcode, simm21_lsb0_jal:$imm20)>;
+// Accept uj as an alias for j.
+def : InstAlias<".insn_uj $opcode, $rd, $imm20",
+ (InsnJ AnyReg:$rd, uimm7:$opcode, simm21_lsb0_jal:$imm20)>;
+def : InstAlias<".insn_s $opcode, $funct3, $rs2, ${imm12}(${rs1})",
+ (InsnS uimm7:$opcode, uimm3:$funct3, AnyReg:$rs2,
+ AnyReg:$rs1, simm12:$imm12)>;
+}
+
+//===----------------------------------------------------------------------===//
// Pseudo-instructions and codegen patterns
//
// Naming convention: For 'generic' pattern classes, we use the naming
@@ -893,6 +996,14 @@ def mul_oneuse : PatFrag<(ops node:$A, node:$B), (mul node:$A, node:$B), [{
return N->hasOneUse();
}]>;
+def mul_const_oneuse : PatFrag<(ops node:$A, node:$B),
+ (mul node:$A, node:$B), [{
+ if (auto *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+ if (N1C->hasOneUse())
+ return true;
+ return false;
+}]>;
+
/// Simple arithmetic operations
def : PatGprGpr<add, ADD>;
@@ -966,13 +1077,27 @@ def : Pat<(setgt GPR:$rs1, GPR:$rs2), (SLT GPR:$rs2, GPR:$rs1)>;
def : Pat<(setge GPR:$rs1, GPR:$rs2), (XORI (SLT GPR:$rs1, GPR:$rs2), 1)>;
def : Pat<(setle GPR:$rs1, GPR:$rs2), (XORI (SLT GPR:$rs2, GPR:$rs1), 1)>;
+def IntCCtoRISCVCC : SDNodeXForm<riscv_selectcc, [{
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ RISCVCC::CondCode BrCC = getRISCVCCForIntCC(CC);
+ return CurDAG->getTargetConstant(BrCC, SDLoc(N), Subtarget->getXLenVT());
+}]>;
+
+def riscv_selectcc_frag : PatFrag<(ops node:$lhs, node:$rhs, node:$cc,
+ node:$truev, node:$falsev),
+ (riscv_selectcc node:$lhs, node:$rhs,
+ node:$cc, node:$truev,
+ node:$falsev), [{}],
+ IntCCtoRISCVCC>;
+
let usesCustomInserter = 1 in
class SelectCC_rrirr<RegisterClass valty, RegisterClass cmpty>
: Pseudo<(outs valty:$dst),
(ins cmpty:$lhs, cmpty:$rhs, ixlenimm:$imm,
valty:$truev, valty:$falsev),
- [(set valty:$dst, (riscv_selectcc cmpty:$lhs, cmpty:$rhs,
- (XLenVT timm:$imm), valty:$truev, valty:$falsev))]>;
+ [(set valty:$dst,
+ (riscv_selectcc_frag:$imm cmpty:$lhs, cmpty:$rhs, cond,
+ valty:$truev, valty:$falsev))]>;
def Select_GPR_Using_CC_GPR : SelectCC_rrirr<GPR, GPR>;
@@ -1231,22 +1356,30 @@ def : Pat<(i64 (shl (and GPR:$rs1, 0xffffffff), uimm5:$shamt)),
(SRLI (SLLI GPR:$rs1, 32), (ImmSubFrom32 uimm5:$shamt))>;
}
+// PatFrag to allow ADDW/SUBW/MULW/SLLW to be selected from i64 add/sub/mul/shl
+// if only the lower 32 bits of their result are used.
+class binop_allwusers<SDPatternOperator operator>
+ : PatFrag<(ops node:$lhs, node:$rhs),
+ (operator node:$lhs, node:$rhs), [{
+ return hasAllWUsers(Node);
+}]>;
+
+def sexti32_allwusers : PatFrag<(ops node:$src),
+ (sext_inreg node:$src, i32), [{
+ return hasAllWUsers(Node);
+}]>;
+
let Predicates = [IsRV64] in {
/// sext and zext
+// Sign extend is not needed if all users are W instructions.
+def : Pat<(sexti32_allwusers GPR:$rs1), (XLenVT GPR:$rs1)>;
+
def : Pat<(sext_inreg GPR:$rs1, i32), (ADDIW GPR:$rs1, 0)>;
/// ALU operations
-def : Pat<(sext_inreg (add GPR:$rs1, GPR:$rs2), i32),
- (ADDW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(sext_inreg (add GPR:$rs1, simm12:$imm12), i32),
- (ADDIW GPR:$rs1, simm12:$imm12)>;
-def : Pat<(sext_inreg (sub GPR:$rs1, GPR:$rs2), i32),
- (SUBW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(sext_inreg (shl GPR:$rs1, uimm5:$shamt), i32),
- (SLLIW GPR:$rs1, uimm5:$shamt)>;
def : Pat<(i64 (srl (and GPR:$rs1, 0xffffffff), uimm5:$shamt)),
(SRLIW GPR:$rs1, uimm5:$shamt)>;
def : Pat<(i64 (srl (shl GPR:$rs1, (i64 32)), uimm6gt32:$shamt)),
@@ -1260,6 +1393,18 @@ def : PatGprGpr<shiftopw<riscv_sllw>, SLLW>;
def : PatGprGpr<shiftopw<riscv_srlw>, SRLW>;
def : PatGprGpr<shiftopw<riscv_sraw>, SRAW>;
+// Select W instructions if only the lower 32 bits of the result are used.
+def : PatGprGpr<binop_allwusers<add>, ADDW>;
+def : PatGprSimm12<binop_allwusers<add>, ADDIW>;
+def : PatGprGpr<binop_allwusers<sub>, SUBW>;
+def : PatGprImm<binop_allwusers<shl>, SLLIW, uimm5>;
+
+// If this is a shr of a value sign extended from i32, and all the users only
+// use the lower 32 bits, we can use an sraiw to remove the sext_inreg. This
+// occurs because SimplifyDemandedBits prefers srl over sra.
+def : Pat<(binop_allwusers<srl> (sext_inreg GPR:$rs1, i32), uimm5:$shamt),
+ (SRAIW GPR:$rs1, uimm5:$shamt)>;
+
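
The binop_allwusers patterns above rest on a modular-arithmetic fact: the low 32 bits of a 64-bit add/sub/mul/shl depend only on the low 32 bits of the inputs, and ADDW/SUBW/MULW/SLLIW compute exactly that (then sign-extend), so they are interchangeable whenever every user reads only the low 32 bits. A small check of the add case (hasAllWUsers itself is the DAG-side query; this only demonstrates the arithmetic):

#include <cassert>
#include <cstdint>

// What ADDW produces: sign-extend of the 32-bit sum of the low halves.
static uint64_t addw(uint64_t A, uint64_t B) {
  return (uint64_t)(int64_t)(int32_t)((uint32_t)A + (uint32_t)B);
}

int main() {
  const uint64_t A = 0x123456789abcdef0ULL, B = 0xfedcba9876543210ULL;
  // The low 32 bits match the full 64-bit add, which is all a "W user" sees.
  assert((uint32_t)addw(A, B) == (uint32_t)(A + B));
  return 0;
}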
/// Loads
defm : LdPat<sextloadi32, LW, i64>;
@@ -1300,7 +1445,8 @@ def : Pat<(add GPR:$rs1, (AddiPair:$rs2)),
(AddiPairImmA GPR:$rs2))>;
let Predicates = [IsRV64] in {
-def : Pat<(sext_inreg (add_oneuse GPR:$rs1, (AddiPair:$rs2)), i32),
+// Select W instructions if only the lower 32 bits of the result are used.
+def : Pat<(binop_allwusers<add> GPR:$rs1, (AddiPair:$rs2)),
(ADDIW (ADDIW GPR:$rs1, (AddiPairImmB AddiPair:$rs2)),
(AddiPairImmA AddiPair:$rs2))>;
}
@@ -1314,6 +1460,6 @@ include "RISCVInstrInfoA.td"
include "RISCVInstrInfoF.td"
include "RISCVInstrInfoD.td"
include "RISCVInstrInfoC.td"
-include "RISCVInstrInfoB.td"
+include "RISCVInstrInfoZb.td"
include "RISCVInstrInfoV.td"
include "RISCVInstrInfoZfh.td"
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
index 86f96c1529b1..d204c85d6179 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -745,13 +745,6 @@ def : InstAlias<"c.sdsp $rs2, (${rs1})", (C_SDSP GPRC:$rs2, SP:$rs1, 0)>;
// Compress Instruction tablegen backend.
//===----------------------------------------------------------------------===//
-class CompressPat<dag input, dag output> {
- dag Input = input;
- dag Output = output;
- list<Predicate> Predicates = [];
- bit isCompressOnly = false;
-}
-
// Patterns are defined in the same order the compressed instructions appear
// on page 82 of the ISA manual.
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
index 41eff2ef7607..2cd011a02345 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -331,6 +331,10 @@ def : Pat<(f64 (fpimm0)), (FCVT_D_W (i32 X0))>;
def : Pat<(i32 (fp_to_sint FPR64:$rs1)), (FCVT_W_D FPR64:$rs1, 0b001)>;
def : Pat<(i32 (fp_to_uint FPR64:$rs1)), (FCVT_WU_D FPR64:$rs1, 0b001)>;
+// Saturating double->[u]int32.
+def : Pat<(i32 (riscv_fcvt_x_rtz FPR64:$rs1)), (FCVT_W_D $rs1, 0b001)>;
+def : Pat<(i32 (riscv_fcvt_xu_rtz FPR64:$rs1)), (FCVT_WU_D $rs1, 0b001)>;
+
// float->int32 with current rounding mode.
def : Pat<(i32 (lrint FPR64:$rs1)), (FCVT_W_D $rs1, 0b111)>;
@@ -354,13 +358,17 @@ def : Pat<(i64 (bitconvert FPR64:$rs1)), (FMV_X_D FPR64:$rs1)>;
// Use target specific isd nodes to help us remember the result is sign
// extended. Matching sext_inreg+fptoui/fptosi may cause the conversion to be
// duplicated if it has another user that didn't need the sign_extend.
-def : Pat<(riscv_fcvt_w_rv64 FPR64:$rs1), (FCVT_W_D $rs1, 0b001)>;
-def : Pat<(riscv_fcvt_wu_rv64 FPR64:$rs1), (FCVT_WU_D $rs1, 0b001)>;
+def : Pat<(riscv_fcvt_w_rtz_rv64 FPR64:$rs1), (FCVT_W_D $rs1, 0b001)>;
+def : Pat<(riscv_fcvt_wu_rtz_rv64 FPR64:$rs1), (FCVT_WU_D $rs1, 0b001)>;
// [u]int32->fp
def : Pat<(sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_D_W $rs1)>;
def : Pat<(uint_to_fp (i64 (zexti32 (i64 GPR:$rs1)))), (FCVT_D_WU $rs1)>;
+// Saturating double->[u]int64.
+def : Pat<(i64 (riscv_fcvt_x_rtz FPR64:$rs1)), (FCVT_L_D $rs1, 0b001)>;
+def : Pat<(i64 (riscv_fcvt_xu_rtz FPR64:$rs1)), (FCVT_LU_D $rs1, 0b001)>;
+
// double->[u]int64. Round-to-zero must be used.
def : Pat<(i64 (fp_to_sint FPR64:$rs1)), (FCVT_L_D FPR64:$rs1, 0b001)>;
def : Pat<(i64 (fp_to_uint FPR64:$rs1)), (FCVT_LU_D FPR64:$rs1, 0b001)>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
index 6b5c9617426a..3400c3be52bf 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
@@ -21,15 +21,21 @@ def SDT_RISCVFMV_X_ANYEXTW_RV64
: SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, f32>]>;
def STD_RISCVFCVT_W_RV64
: SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisFP<1>]>;
+def STD_RISCVFCVT_X
+ : SDTypeProfile<1, 1, [SDTCisVT<0, XLenVT>, SDTCisFP<1>]>;
def riscv_fmv_w_x_rv64
: SDNode<"RISCVISD::FMV_W_X_RV64", SDT_RISCVFMV_W_X_RV64>;
def riscv_fmv_x_anyextw_rv64
: SDNode<"RISCVISD::FMV_X_ANYEXTW_RV64", SDT_RISCVFMV_X_ANYEXTW_RV64>;
-def riscv_fcvt_w_rv64
- : SDNode<"RISCVISD::FCVT_W_RV64", STD_RISCVFCVT_W_RV64>;
-def riscv_fcvt_wu_rv64
- : SDNode<"RISCVISD::FCVT_WU_RV64", STD_RISCVFCVT_W_RV64>;
+def riscv_fcvt_w_rtz_rv64
+ : SDNode<"RISCVISD::FCVT_W_RTZ_RV64", STD_RISCVFCVT_W_RV64>;
+def riscv_fcvt_wu_rtz_rv64
+ : SDNode<"RISCVISD::FCVT_WU_RTZ_RV64", STD_RISCVFCVT_W_RV64>;
+def riscv_fcvt_x_rtz
+ : SDNode<"RISCVISD::FCVT_X_RTZ", STD_RISCVFCVT_X>;
+def riscv_fcvt_xu_rtz
+ : SDNode<"RISCVISD::FCVT_XU_RTZ", STD_RISCVFCVT_X>;
//===----------------------------------------------------------------------===//
// Operand and SDNode transformation definitions.
@@ -379,6 +385,10 @@ def : Pat<(i32 (bitconvert FPR32:$rs1)), (FMV_X_W FPR32:$rs1)>;
def : Pat<(i32 (fp_to_sint FPR32:$rs1)), (FCVT_W_S $rs1, 0b001)>;
def : Pat<(i32 (fp_to_uint FPR32:$rs1)), (FCVT_WU_S $rs1, 0b001)>;
+// Saturating float->[u]int32.
+def : Pat<(i32 (riscv_fcvt_x_rtz FPR32:$rs1)), (FCVT_W_S $rs1, 0b001)>;
+def : Pat<(i32 (riscv_fcvt_xu_rtz FPR32:$rs1)), (FCVT_WU_S $rs1, 0b001)>;
+
// float->int32 with current rounding mode.
def : Pat<(i32 (lrint FPR32:$rs1)), (FCVT_W_S $rs1, 0b111)>;
@@ -400,13 +410,17 @@ def : Pat<(sext_inreg (riscv_fmv_x_anyextw_rv64 FPR32:$src), i32),
// Use target specific isd nodes to help us remember the result is sign
// extended. Matching sext_inreg+fptoui/fptosi may cause the conversion to be
// duplicated if it has another user that didn't need the sign_extend.
-def : Pat<(riscv_fcvt_w_rv64 FPR32:$rs1), (FCVT_W_S $rs1, 0b001)>;
-def : Pat<(riscv_fcvt_wu_rv64 FPR32:$rs1), (FCVT_WU_S $rs1, 0b001)>;
+def : Pat<(riscv_fcvt_w_rtz_rv64 FPR32:$rs1), (FCVT_W_S $rs1, 0b001)>;
+def : Pat<(riscv_fcvt_wu_rtz_rv64 FPR32:$rs1), (FCVT_WU_S $rs1, 0b001)>;
// float->[u]int64. Round-to-zero must be used.
def : Pat<(i64 (fp_to_sint FPR32:$rs1)), (FCVT_L_S $rs1, 0b001)>;
def : Pat<(i64 (fp_to_uint FPR32:$rs1)), (FCVT_LU_S $rs1, 0b001)>;
+// Saturating float->[u]int64.
+def : Pat<(i64 (riscv_fcvt_x_rtz FPR32:$rs1)), (FCVT_L_S $rs1, 0b001)>;
+def : Pat<(i64 (riscv_fcvt_xu_rtz FPR32:$rs1)), (FCVT_LU_S $rs1, 0b001)>;
+
// float->int64 with current rounding mode.
def : Pat<(i64 (lrint FPR32:$rs1)), (FCVT_L_S $rs1, 0b111)>;
def : Pat<(i64 (llrint FPR32:$rs1)), (FCVT_L_S $rs1, 0b111)>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
index f654ed1949a4..a037dbf585ce 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
@@ -72,8 +72,8 @@ def : PatGprGpr<urem, REMU>;
} // Predicates = [HasStdExtM]
let Predicates = [HasStdExtM, IsRV64] in {
-def : Pat<(sext_inreg (mul GPR:$rs1, GPR:$rs2), i32),
- (MULW GPR:$rs1, GPR:$rs2)>;
+// Select W instructions if only the lower 32 bits of the result are used.
+def : PatGprGpr<binop_allwusers<mul>, MULW>;
def : PatGprGpr<riscv_divw, DIVW>;
def : PatGprGpr<riscv_divuw, DIVUW>;
@@ -96,20 +96,24 @@ def : Pat<(srem (sexti32 (i64 GPR:$rs1)), (sexti32 (i64 GPR:$rs2))),
(REMW GPR:$rs1, GPR:$rs2)>;
} // Predicates = [HasStdExtM, IsRV64]
+// Pattern to detect constants with no more than 32 active bits that can't
+// be materialized with lui+addiw.
+def uimm32_not_simm32 : PatLeaf<(XLenVT GPR:$a), [{
+ auto *C = dyn_cast<ConstantSDNode>(N);
+ return C && C->hasOneUse() && isUInt<32>(C->getZExtValue()) &&
+ !isInt<32>(C->getSExtValue());
+}]>;
+
let Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] in {
// Special case for calculating the full 64-bit product of a 32x32 unsigned
// multiply where the inputs aren't known to be zero extended. We can shift the
// inputs left by 32 and use a MULHU. This saves two SRLIs needed to finish
// zeroing the upper 32 bits.
-// TODO: If one of the operands is zero extended and the other isn't, we might
-// still be better off shifting both left by 32.
def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff))),
(MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>;
-// Prevent matching the first part of this pattern to mulw. The mul here has
-// additionals users or the ANDs would have been removed. The above pattern
-// will be used for the other users. If we form a mulw we'll keep the ANDs alive
-// and they'll still become SLLI+SRLI.
-def : Pat<(sext_inreg (mul (and GPR:$rs1, 0xffffffff),
- (and GPR:$rs2, 0xffffffff)), i32),
- (ADDIW (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32)), 0)>;
+// The RHS could also be a constant that is hard to materialize. By shifting
+// it left we can allow constant materialization to use LUI+ADDIW via
+// hasAllWUsers.
+def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), uimm32_not_simm32:$rs2)),
+ (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>;
} // Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba]
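
The MULHU trick above works because SLLI by 32 leaves exactly the (possibly not zero-extended) low 32 bits of each operand in the high half, so the 128-bit product of the two shifted values is (a mod 2^32)*(b mod 2^32) shifted up by 64, and MULHU's upper 64 bits are precisely the full 64-bit product of the 32-bit inputs. A quick check using the GCC/Clang unsigned __int128 extension:

#include <cassert>
#include <cstdint>

// MULHU: upper 64 bits of the 128-bit product of two 64-bit operands.
static uint64_t mulhu(uint64_t X, uint64_t Y) {
  return (uint64_t)(((unsigned __int128)X * Y) >> 64);
}

int main() {
  const uint64_t A = 0xdeadbeef12345678ULL, B = 0x0000000df0f0f0f0ULL;
  // Neither operand is known zero-extended; shift both left by 32 first.
  uint64_t Full = mulhu(A << 32, B << 32);
  assert(Full == (A & 0xffffffffULL) * (B & 0xffffffffULL));
  return 0;
}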
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index 342497150d49..3d5f9bc54731 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -78,65 +78,105 @@ def simm5_plus1 : Operand<XLenVT>, ImmLeaf<XLenVT,
}
//===----------------------------------------------------------------------===//
+// Scheduling definitions.
+//===----------------------------------------------------------------------===//
+
+class VMVRSched<int n>: Sched <[!cast<SchedReadWrite>("WriteVMov" # n # "V"),
+ !cast<SchedReadWrite>("ReadVMov" # n # "V")]>;
+
+class VLESched<int n> : Sched <[!cast<SchedReadWrite>("WriteVLDE" # n),
+ ReadVLDX, ReadVMask]>;
+
+class VSESched<int n> : Sched <[!cast<SchedReadWrite>("WriteVSTE" # n),
+ !cast<SchedReadWrite>("ReadVSTE" # n # "V"),
+ ReadVSTX, ReadVMask]>;
+
+class VLSSched<int n> : Sched <[!cast<SchedReadWrite>("WriteVLDS" # n),
+ ReadVLDX, ReadVLDSX, ReadVMask]>;
+
+class VSSSched<int n> : Sched <[!cast<SchedReadWrite>("WriteVSTS" # n),
+ !cast<SchedReadWrite>("ReadVSTS" # n # "V"),
+ ReadVSTX, ReadVSTSX, ReadVMask]>;
+
+class VLXSched<int n, string o> :
+ Sched <[!cast<SchedReadWrite>("WriteVLD" # o # "X" # n),
+ ReadVLDX, !cast<SchedReadWrite>("ReadVLD" # o # "XV"), ReadVMask]>;
+
+class VSXSched<int n, string o> :
+ Sched <[!cast<SchedReadWrite>("WriteVST" # o # "X" # n),
+ !cast<SchedReadWrite>("ReadVST" # o # "X" # n),
+ ReadVSTX, !cast<SchedReadWrite>("ReadVST" # o # "XV"), ReadVMask]>;
+
+class VLFSched<int n> : Sched <[!cast<SchedReadWrite>("WriteVLDFF" # n),
+ ReadVLDX, ReadVMask]>;
+
+//===----------------------------------------------------------------------===//
// Instruction class templates
//===----------------------------------------------------------------------===//
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
-// load vd, (rs1)
+// unit-stride load vd, (rs1), vm
+class VUnitStrideLoad<RISCVWidth width, string opcodestr>
+ : RVInstVLU<0b000, width.Value{3}, LUMOPUnitStride, width.Value{2-0},
+ (outs VR:$vd),
+ (ins GPR:$rs1, VMaskOp:$vm), opcodestr, "$vd, (${rs1})$vm">;
+
+let vm = 1, RVVConstraint = NoConstraint in {
+// unit-stride whole register load vl<nf>r.v vd, (rs1)
+class VWholeLoad<bits<3> nf, RISCVWidth width, string opcodestr, RegisterClass VRC>
+ : RVInstVLU<nf, width.Value{3}, LUMOPUnitStrideWholeReg,
+ width.Value{2-0}, (outs VRC:$vd), (ins GPR:$rs1),
+ opcodestr, "$vd, (${rs1})"> {
+ let Uses = [];
+}
+
+// unit-stride mask load vd, (rs1)
class VUnitStrideLoadMask<string opcodestr>
: RVInstVLU<0b000, LSWidth8.Value{3}, LUMOPUnitStrideMask, LSWidth8.Value{2-0},
(outs VR:$vd),
- (ins GPR:$rs1), opcodestr, "$vd, (${rs1})"> {
- let vm = 1;
- let RVVConstraint = NoConstraint;
-}
+ (ins GPR:$rs1), opcodestr, "$vd, (${rs1})">;
+} // vm = 1, RVVConstraint = NoConstraint
-// load vd, (rs1), vm
-class VUnitStrideLoad<RISCVLSUMOP lumop, RISCVWidth width,
- string opcodestr>
- : RVInstVLU<0b000, width.Value{3}, lumop, width.Value{2-0},
+// unit-stride fault-only-first load vd, (rs1), vm
+class VUnitStrideLoadFF<RISCVWidth width, string opcodestr>
+ : RVInstVLU<0b000, width.Value{3}, LUMOPUnitStrideFF, width.Value{2-0},
(outs VR:$vd),
(ins GPR:$rs1, VMaskOp:$vm), opcodestr, "$vd, (${rs1})$vm">;
-// load vd, (rs1), rs2, vm
+// strided load vd, (rs1), rs2, vm
class VStridedLoad<RISCVWidth width, string opcodestr>
: RVInstVLS<0b000, width.Value{3}, width.Value{2-0},
(outs VR:$vd),
(ins GPR:$rs1, GPR:$rs2, VMaskOp:$vm), opcodestr,
"$vd, (${rs1}), $rs2$vm">;
-// load vd, (rs1), vs2, vm
+// indexed load vd, (rs1), vs2, vm
class VIndexedLoad<RISCVMOP mop, RISCVWidth width, string opcodestr>
: RVInstVLX<0b000, width.Value{3}, mop, width.Value{2-0},
(outs VR:$vd),
(ins GPR:$rs1, VR:$vs2, VMaskOp:$vm), opcodestr,
"$vd, (${rs1}), $vs2$vm">;
-// vl<nf>r.v vd, (rs1)
-class VWholeLoad<bits<3> nf, RISCVWidth width, string opcodestr, RegisterClass VRC>
- : RVInstVLU<nf, width.Value{3}, LUMOPUnitStrideWholeReg,
- width.Value{2-0}, (outs VRC:$vd), (ins GPR:$rs1),
- opcodestr, "$vd, (${rs1})"> {
- let vm = 1;
- let Uses = [];
- let RVVConstraint = NoConstraint;
-}
+// unit-stride segment load vd, (rs1), vm
+class VUnitStrideSegmentLoad<bits<3> nf, RISCVWidth width, string opcodestr>
+ : RVInstVLU<nf, width.Value{3}, LUMOPUnitStride, width.Value{2-0},
+ (outs VR:$vd),
+ (ins GPR:$rs1, VMaskOp:$vm), opcodestr, "$vd, (${rs1})$vm">;
-// segment load vd, (rs1), vm
-class VUnitStrideSegmentLoad<bits<3> nf, RISCVLSUMOP lumop,
- RISCVWidth width, string opcodestr>
- : RVInstVLU<nf, width.Value{3}, lumop, width.Value{2-0},
+// segment fault-only-first load vd, (rs1), vm
+class VUnitStrideSegmentLoadFF<bits<3> nf, RISCVWidth width, string opcodestr>
+ : RVInstVLU<nf, width.Value{3}, LUMOPUnitStrideFF, width.Value{2-0},
(outs VR:$vd),
(ins GPR:$rs1, VMaskOp:$vm), opcodestr, "$vd, (${rs1})$vm">;
-// segment load vd, (rs1), rs2, vm
+// strided segment load vd, (rs1), rs2, vm
class VStridedSegmentLoad<bits<3> nf, RISCVWidth width, string opcodestr>
: RVInstVLS<nf, width.Value{3}, width.Value{2-0},
(outs VR:$vd),
(ins GPR:$rs1, GPR:$rs2, VMaskOp:$vm), opcodestr,
"$vd, (${rs1}), $rs2$vm">;
-// segment load vd, (rs1), vs2, vm
+// indexed segment load vd, (rs1), vs2, vm
class VIndexedSegmentLoad<bits<3> nf, RISCVMOP mop, RISCVWidth width,
string opcodestr>
: RVInstVLX<nf, width.Value{3}, mop, width.Value{2-0},
@@ -146,42 +186,40 @@ class VIndexedSegmentLoad<bits<3> nf, RISCVMOP mop, RISCVWidth width,
} // hasSideEffects = 0, mayLoad = 1, mayStore = 0
let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
-// store vd, vs3, (rs1)
+// unit-stride store vd, vs3, (rs1), vm
+class VUnitStrideStore<RISCVWidth width, string opcodestr>
+ : RVInstVSU<0b000, width.Value{3}, SUMOPUnitStride, width.Value{2-0},
+ (outs), (ins VR:$vs3, GPR:$rs1, VMaskOp:$vm), opcodestr,
+ "$vs3, (${rs1})$vm">;
+
+let vm = 1 in {
+// vs<nf>r.v vd, (rs1)
+class VWholeStore<bits<3> nf, string opcodestr, RegisterClass VRC>
+ : RVInstVSU<nf, 0, SUMOPUnitStrideWholeReg,
+ 0b000, (outs), (ins VRC:$vs3, GPR:$rs1),
+ opcodestr, "$vs3, (${rs1})"> {
+ let Uses = [];
+}
+
+// unit-stride mask store vd, vs3, (rs1)
class VUnitStrideStoreMask<string opcodestr>
: RVInstVSU<0b000, LSWidth8.Value{3}, SUMOPUnitStrideMask, LSWidth8.Value{2-0},
(outs), (ins VR:$vs3, GPR:$rs1), opcodestr,
- "$vs3, (${rs1})"> {
- let vm = 1;
-}
-
-// store vd, vs3, (rs1), vm
-class VUnitStrideStore<RISCVLSUMOP sumop, RISCVWidth width,
- string opcodestr>
- : RVInstVSU<0b000, width.Value{3}, sumop, width.Value{2-0},
- (outs), (ins VR:$vs3, GPR:$rs1, VMaskOp:$vm), opcodestr,
- "$vs3, (${rs1})$vm">;
+ "$vs3, (${rs1})">;
+} // vm = 1
-// store vd, vs3, (rs1), rs2, vm
+// strided store vd, vs3, (rs1), rs2, vm
class VStridedStore<RISCVWidth width, string opcodestr>
: RVInstVSS<0b000, width.Value{3}, width.Value{2-0}, (outs),
(ins VR:$vs3, GPR:$rs1, GPR:$rs2, VMaskOp:$vm),
opcodestr, "$vs3, (${rs1}), $rs2$vm">;
-// store vd, vs3, (rs1), vs2, vm
+// indexed store vd, vs3, (rs1), vs2, vm
class VIndexedStore<RISCVMOP mop, RISCVWidth width, string opcodestr>
: RVInstVSX<0b000, width.Value{3}, mop, width.Value{2-0}, (outs),
(ins VR:$vs3, GPR:$rs1, VR:$vs2, VMaskOp:$vm),
opcodestr, "$vs3, (${rs1}), $vs2$vm">;
-// vs<nf>r.v vd, (rs1)
-class VWholeStore<bits<3> nf, string opcodestr, RegisterClass VRC>
- : RVInstVSU<nf, 0, SUMOPUnitStrideWholeReg,
- 0b000, (outs), (ins VRC:$vs3, GPR:$rs1),
- opcodestr, "$vs3, (${rs1})"> {
- let vm = 1;
- let Uses = [];
-}
-
// segment store vd, vs3, (rs1), vm
class VUnitStrideSegmentStore<bits<3> nf, RISCVWidth width, string opcodestr>
: RVInstVSU<nf, width.Value{3}, SUMOPUnitStride, width.Value{2-0},
@@ -328,106 +366,417 @@ class VAMONoWd<RISCVAMOOP amoop, RISCVWidth width, string opcodestr>
// Use these multiclasses to define instructions more easily.
//===----------------------------------------------------------------------===//
multiclass VALU_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
- def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">;
- def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">;
- def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>;
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVIALUV, ReadVIALUV, ReadVIALUV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIALUX, ReadVIALUV, ReadVIALUX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVIALUI, ReadVIALUV, ReadVMask]>;
}
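// Illustrative sketch, not part of the patch: given the defm uses further down
// (e.g. `defm VADD_V : VALU_IV_V_X_I<"vadd", 0b000000>;`), the multiclass above
// is expected to expand roughly to the following records, with the scheduling
// classes taken from the Sched<> lists shown above:
//
//   def VADD_VV : VALUVV<0b000000, OPIVV, "vadd.vv">,
//                 Sched<[WriteVIALUV, ReadVIALUV, ReadVIALUV, ReadVMask]>;
//   def VADD_VX : VALUVX<0b000000, OPIVX, "vadd.vx">,
//                 Sched<[WriteVIALUX, ReadVIALUV, ReadVIALUX, ReadVMask]>;
//   def VADD_VI : VALUVI<0b000000, "vadd.vi", simm5>,
//                 Sched<[WriteVIALUI, ReadVIALUV, ReadVMask]>;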
multiclass VALU_IV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
- def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">;
- def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">;
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVIALUV, ReadVIALUV, ReadVIALUV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIALUX, ReadVIALUV, ReadVIALUX, ReadVMask]>;
}
-multiclass VALUr_IV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
- def V : VALUrVV<funct6, OPIVV, opcodestr # "." # vw # "v">;
- def X : VALUrVX<funct6, OPIVX, opcodestr # "." # vw # "x">;
+multiclass VALU_IV_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIALUV, ReadVIALUV, ReadVIALUX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVIALUI, ReadVIALUV, ReadVMask]>;
}
-multiclass VALU_IV_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
- def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">;
- def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>;
+multiclass VALU_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPMVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVIWALUV, ReadVIWALUV, ReadVIWALUV, ReadVMask]>;
+ def X : VALUVX<funct6, OPMVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIWALUX, ReadVIWALUV, ReadVIWALUX, ReadVMask]>;
}
-multiclass VALU_IV_V<string opcodestr, bits<6> funct6> {
- def _VS : VALUVV<funct6, OPIVV, opcodestr # ".vs">;
+multiclass VMAC_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUrVV<funct6, OPMVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVIMulAddV, ReadVIMulAddV, ReadVIMulAddV, ReadVMask]>;
+ def X : VALUrVX<funct6, OPMVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIMulAddX, ReadVIMulAddV, ReadVIMulAddX, ReadVMask]>;
}
-multiclass VALUr_IV_X<string opcodestr, bits<6> funct6, string vw = "v"> {
- def X : VALUrVX<funct6, OPIVX, opcodestr # "." # vw # "x">;
+multiclass VWMAC_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUrVV<funct6, OPMVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVIWMulAddV, ReadVIWMulAddV, ReadVIWMulAddV, ReadVMask]>;
+ def X : VALUrVX<funct6, OPMVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIWMulAddX, ReadVIWMulAddV, ReadVIWMulAddX, ReadVMask]>;
}
-multiclass VALU_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
- def V : VALUVV<funct6, OPMVV, opcodestr # "." # vw # "v">;
- def X : VALUVX<funct6, OPMVX, opcodestr # "." # vw # "x">;
+multiclass VWMAC_MV_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def X : VALUrVX<funct6, OPMVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIWMulAddX, ReadVIWMulAddV, ReadVIWMulAddX, ReadVMask]>;
}
-multiclass VALU_MV_V<string opcodestr, bits<6> funct6> {
- def _VS : VALUVV<funct6, OPMVV, opcodestr # ".vs">;
+multiclass VALU_MV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPMVV, opcodestr>,
+ Sched<[WriteVExtV, ReadVExtV, ReadVMask]>;
}
-multiclass VALU_MV_Mask<string opcodestr, bits<6> funct6, string vm = "v"> {
- def M : VALUVVNoVm<funct6, OPMVV, opcodestr # "." # vm # "m">;
+multiclass VALUm_IV_V_X_I<string opcodestr, bits<6> funct6> {
+ def VM : VALUmVV<funct6, OPIVV, opcodestr # ".vvm">,
+ Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>;
+ def XM : VALUmVX<funct6, OPIVX, opcodestr # ".vxm">,
+ Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>;
+ def IM : VALUmVI<funct6, opcodestr # ".vim">,
+ Sched<[WriteVICALUI, ReadVIALUCV, ReadVMask]>;
}
-multiclass VALU_MV_X<string opcodestr, bits<6> funct6, string vw = "v"> {
- def X : VALUVX<funct6, OPMVX, opcodestr # "." # vw # "x">;
+multiclass VMRG_IV_V_X_I<string opcodestr, bits<6> funct6> {
+ def VM : VALUmVV<funct6, OPIVV, opcodestr # ".vvm">,
+ Sched<[WriteVIMergeV, ReadVIMergeV, ReadVIMergeV, ReadVMask]>;
+ def XM : VALUmVX<funct6, OPIVX, opcodestr # ".vxm">,
+ Sched<[WriteVIMergeX, ReadVIMergeV, ReadVIMergeX, ReadVMask]>;
+ def IM : VALUmVI<funct6, opcodestr # ".vim">,
+ Sched<[WriteVIMergeI, ReadVIMergeV, ReadVMask]>;
}
-multiclass VALUr_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
- def V : VALUrVV<funct6, OPMVV, opcodestr # "." # vw # "v">;
- def X : VALUrVX<funct6, OPMVX, opcodestr # "." # vw # "x">;
+multiclass VALUm_IV_V_X<string opcodestr, bits<6> funct6> {
+ def VM : VALUmVV<funct6, OPIVV, opcodestr # ".vvm">,
+ Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>;
+ def XM : VALUmVX<funct6, OPIVX, opcodestr # ".vxm">,
+ Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>;
}
-multiclass VALUr_MV_X<string opcodestr, bits<6> funct6, string vw = "v"> {
- def X : VALUrVX<funct6, OPMVX, opcodestr # "." # vw # "x">;
+multiclass VALUNoVm_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5> {
+ def V : VALUVVNoVm<funct6, OPIVV, opcodestr # ".vv">,
+ Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV]>;
+ def X : VALUVXNoVm<funct6, OPIVX, opcodestr # ".vx">,
+ Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX]>;
+ def I : VALUVINoVm<funct6, opcodestr # ".vi", optype>,
+ Sched<[WriteVICALUI, ReadVIALUCV]>;
}
-multiclass VALU_MV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
- def "" : VALUVs2<funct6, vs1, OPMVV, opcodestr>;
+multiclass VALUNoVm_IV_V_X<string opcodestr, bits<6> funct6> {
+ def V : VALUVVNoVm<funct6, OPIVV, opcodestr # ".vv">,
+ Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV]>;
+ def X : VALUVXNoVm<funct6, OPIVX, opcodestr # ".vx">,
+ Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX]>;
}
-multiclass VALUm_IV_V_X_I<string opcodestr, bits<6> funct6> {
- def VM : VALUmVV<funct6, OPIVV, opcodestr # ".vvm">;
- def XM : VALUmVX<funct6, OPIVX, opcodestr # ".vxm">;
- def IM : VALUmVI<funct6, opcodestr # ".vim">;
+multiclass VALU_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPFVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVFALUV, ReadVFALUV, ReadVFALUV, ReadVMask]>;
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFALUF, ReadVFALUV, ReadVFALUF, ReadVMask]>;
}
-multiclass VALUm_IV_V_X<string opcodestr, bits<6> funct6> {
- def VM : VALUmVV<funct6, OPIVV, opcodestr # ".vvm">;
- def XM : VALUmVX<funct6, OPIVX, opcodestr # ".vxm">;
+multiclass VALU_FV_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFALUF, ReadVFALUV, ReadVFALUF, ReadVMask]>;
}
-multiclass VALUNoVm_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5> {
- def V : VALUVVNoVm<funct6, OPIVV, opcodestr # ".vv">;
- def X : VALUVXNoVm<funct6, OPIVX, opcodestr # ".vx">;
- def I : VALUVINoVm<funct6, opcodestr # ".vi", optype>;
+multiclass VWALU_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPFVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVFWALUV, ReadVFWALUV, ReadVFWALUV, ReadVMask]>;
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFWALUF, ReadVFWALUV, ReadVFWALUF, ReadVMask]>;
}
-multiclass VALUNoVm_IV_V_X<string opcodestr, bits<6> funct6> {
- def V : VALUVVNoVm<funct6, OPIVV, opcodestr # ".vv">;
- def X : VALUVXNoVm<funct6, OPIVX, opcodestr # ".vx">;
+multiclass VMUL_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPFVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVFMulV, ReadVFMulV, ReadVFMulV, ReadVMask]>;
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFMulF, ReadVFMulV, ReadVFMulF, ReadVMask]>;
}
-multiclass VALU_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
- def V : VALUVV<funct6, OPFVV, opcodestr # "." # vw # "v">;
- def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">;
+multiclass VDIV_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPFVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVFDivV, ReadVFDivV, ReadVFDivV, ReadVMask]>;
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFDivF, ReadVFDivV, ReadVFDivF, ReadVMask]>;
}
-multiclass VALU_FV_F<string opcodestr, bits<6> funct6, string vw = "v"> {
- def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">;
+multiclass VRDIV_FV_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFDivF, ReadVFDivV, ReadVFDivF, ReadVMask]>;
}
-multiclass VALUr_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
- def V : VALUrVV<funct6, OPFVV, opcodestr # "." # vw # "v">;
- def F : VALUrVF<funct6, OPFVF, opcodestr # "." # vw # "f">;
+multiclass VWMUL_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPFVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVFWMulV, ReadVFWMulV, ReadVFWMulV, ReadVMask]>;
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFWMulF, ReadVFWMulV, ReadVFWMulF, ReadVMask]>;
}
-multiclass VALU_FV_V<string opcodestr, bits<6> funct6> {
- def _VS : VALUVV<funct6, OPFVV, opcodestr # ".vs">;
+multiclass VMAC_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUrVV<funct6, OPFVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVFMulAddV, ReadVFMulAddV, ReadVFMulAddV, ReadVMask]>;
+ def F : VALUrVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFMulAddF, ReadVFMulAddV, ReadVFMulAddF, ReadVMask]>;
}
-multiclass VALU_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
- def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>;
+multiclass VWMAC_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUrVV<funct6, OPFVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVFWMulAddV, ReadVFWMulAddV, ReadVFWMulAddV, ReadVMask]>;
+ def F : VALUrVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFWMulAddF, ReadVFWMulAddV, ReadVFWMulAddF, ReadVMask]>;
+}
+
+multiclass VSQR_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFSqrtV, ReadVFSqrtV, ReadVMask]>;
+}
+
+multiclass VRCP_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFRecpV, ReadVFRecpV, ReadVMask]>;
+}
+
+multiclass VCMP_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPFVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVFCmpV, ReadVFCmpV, ReadVFCmpV, ReadVMask]>;
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFCmpF, ReadVFCmpV, ReadVFCmpF, ReadVMask]>;
+}
+
+multiclass VCMP_FV_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFCmpF, ReadVFCmpV, ReadVFCmpF, ReadVMask]>;
+}
+
+multiclass VSGNJ_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPFVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVFSgnjV, ReadVFSgnjV, ReadVFSgnjV, ReadVMask]>;
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFSgnjF, ReadVFSgnjV, ReadVFSgnjF, ReadVMask]>;
+}
+
+multiclass VCLS_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFClassV, ReadVFClassV, ReadVMask]>;
+}
+
+multiclass VCVTF_IV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFCvtIToFV, ReadVFCvtIToFV, ReadVMask]>;
+}
+
+multiclass VCVTI_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFCvtFToIV, ReadVFCvtFToIV, ReadVMask]>;
+}
+
+multiclass VWCVTF_IV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFWCvtIToFV, ReadVFWCvtIToFV, ReadVMask]>;
+}
+
+multiclass VWCVTI_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFWCvtFToIV, ReadVFWCvtFToIV, ReadVMask]>;
+}
+
+multiclass VWCVTF_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFWCvtFToFV, ReadVFWCvtFToFV, ReadVMask]>;
+}
+
+multiclass VNCVTF_IV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFNCvtIToFV, ReadVFNCvtIToFV, ReadVMask]>;
+}
+
+multiclass VNCVTI_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFNCvtFToIV, ReadVFNCvtFToIV, ReadVMask]>;
+}
+
+multiclass VNCVTF_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>,
+ Sched<[WriteVFNCvtFToFV, ReadVFNCvtFToFV, ReadVMask]>;
+}
+
+multiclass VRED_MV_V<string opcodestr, bits<6> funct6> {
+ def _VS : VALUVV<funct6, OPMVV, opcodestr # ".vs">,
+ Sched<[WriteVIRedV, ReadVIRedV, ReadVIRedV0, ReadVMask]>;
+}
+
+multiclass VWRED_IV_V<string opcodestr, bits<6> funct6> {
+ def _VS : VALUVV<funct6, OPIVV, opcodestr # ".vs">,
+ Sched<[WriteVIWRedV, ReadVIWRedV, ReadVIWRedV0, ReadVMask]>;
+}
+
+multiclass VRED_FV_V<string opcodestr, bits<6> funct6> {
+ def _VS : VALUVV<funct6, OPFVV, opcodestr # ".vs">,
+ Sched<[WriteVFRedV, ReadVFRedV, ReadVFRedV0, ReadVMask]>;
+}
+
+multiclass VREDO_FV_V<string opcodestr, bits<6> funct6> {
+ def _VS : VALUVV<funct6, OPFVV, opcodestr # ".vs">,
+ Sched<[WriteVFRedOV, ReadVFRedOV, ReadVFRedOV0, ReadVMask]>;
+}
+
+multiclass VWRED_FV_V<string opcodestr, bits<6> funct6> {
+ def _VS : VALUVV<funct6, OPFVV, opcodestr # ".vs">,
+ Sched<[WriteVFWRedV, ReadVFWRedV, ReadVFWRedV0, ReadVMask]>;
+}
+
+multiclass VWREDO_FV_V<string opcodestr, bits<6> funct6> {
+ def _VS : VALUVV<funct6, OPFVV, opcodestr # ".vs">,
+ Sched<[WriteVFWRedOV, ReadVFWRedOV, ReadVFWRedOV0, ReadVMask]>;
+}
+
+multiclass VMALU_MV_Mask<string opcodestr, bits<6> funct6, string vm = "v"> {
+ def M : VALUVVNoVm<funct6, OPMVV, opcodestr # "." # vm # "m">,
+ Sched<[WriteVMALUV, ReadVMALUV, ReadVMALUV]>;
+}
+
+multiclass VMSFS_MV_V<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPMVV, opcodestr>,
+ Sched<[WriteVMSFSV, ReadVMSFSV, ReadVMask]>;
+}
+
+multiclass VMIOT_MV_V<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPMVV, opcodestr>,
+ Sched<[WriteVMIotV, ReadVMIotV, ReadVMask]>;
+}
+
+multiclass VSHT_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVShiftV, ReadVShiftV, ReadVShiftV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVShiftX, ReadVShiftV, ReadVShiftX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVShiftI, ReadVShiftV, ReadVMask]>;
+}
+
+multiclass VNSHT_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVNShiftV, ReadVNShiftV, ReadVNShiftV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVNShiftX, ReadVNShiftV, ReadVNShiftX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVNShiftI, ReadVNShiftV, ReadVMask]>;
+}
+
+multiclass VCMP_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVICmpV, ReadVICmpV, ReadVICmpV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVICmpX, ReadVICmpV, ReadVICmpX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVICmpI, ReadVICmpV, ReadVMask]>;
+}
+
+multiclass VCMP_IV_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVICmpV, ReadVICmpV, ReadVICmpX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVICmpI, ReadVICmpV, ReadVMask]>;
+}
+
+multiclass VCMP_IV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVICmpV, ReadVICmpV, ReadVICmpV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVICmpX, ReadVICmpV, ReadVICmpX, ReadVMask]>;
+}
+
+multiclass VMUL_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPMVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVIMulV, ReadVIMulV, ReadVIMulV, ReadVMask]>;
+ def X : VALUVX<funct6, OPMVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIMulX, ReadVIMulV, ReadVIMulX, ReadVMask]>;
+}
+
+multiclass VWMUL_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPMVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVIWMulV, ReadVIWMulV, ReadVIWMulV, ReadVMask]>;
+ def X : VALUVX<funct6, OPMVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIWMulX, ReadVIWMulV, ReadVIWMulX, ReadVMask]>;
+}
+
+multiclass VDIV_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPMVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVIDivV, ReadVIDivV, ReadVIDivV, ReadVMask]>;
+ def X : VALUVX<funct6, OPMVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVIDivX, ReadVIDivV, ReadVIDivX, ReadVMask]>;
+}
+
+multiclass VSALU_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVSALUV, ReadVSALUV, ReadVSALUV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVSALUX, ReadVSALUV, ReadVSALUX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVSALUI, ReadVSALUV, ReadVMask]>;
+}
+
+multiclass VSALU_IV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVSALUV, ReadVSALUV, ReadVSALUV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVSALUX, ReadVSALUV, ReadVSALUX, ReadVMask]>;
+}
+
+multiclass VAALU_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPMVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVAALUV, ReadVAALUV, ReadVAALUV, ReadVMask]>;
+ def X : VALUVX<funct6, OPMVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVAALUX, ReadVAALUV, ReadVAALUX, ReadVMask]>;
+}
+
+multiclass VSMUL_IV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVSMulV, ReadVSMulV, ReadVSMulV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVSMulX, ReadVSMulV, ReadVSMulX, ReadVMask]>;
+}
+
+multiclass VSSHF_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVSShiftV, ReadVSShiftV, ReadVSShiftV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVSShiftX, ReadVSShiftV, ReadVSShiftX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVSShiftI, ReadVSShiftV, ReadVMask]>;
+}
+
+multiclass VNCLP_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVNClipV, ReadVNClipV, ReadVNClipV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVNClipX, ReadVNClipV, ReadVNClipX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVNClipI, ReadVNClipV, ReadVMask]>;
+}
+
+multiclass VSLD_IV_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVISlideX, ReadVISlideV, ReadVISlideX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVISlideI, ReadVISlideV, ReadVMask]>;
+}
+
+multiclass VSLD1_MV_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def X : VALUVX<funct6, OPMVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVISlide1X, ReadVISlideV, ReadVISlideX, ReadVMask]>;
+}
+
+multiclass VSLD1_FV_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">,
+ Sched<[WriteVFSlide1F, ReadVFSlideV, ReadVFSlideF, ReadVMask]>;
+}
+
+multiclass VGTR_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
+ Sched<[WriteVGatherV, ReadVGatherV, ReadVGatherV, ReadVMask]>;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">,
+ Sched<[WriteVGatherX, ReadVGatherV, ReadVGatherX, ReadVMask]>;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>,
+ Sched<[WriteVGatherI, ReadVGatherV, ReadVMask]>;
+}
+
+multiclass VCPR_MV_Mask<string opcodestr, bits<6> funct6, string vm = "v"> {
+ def M : VALUVVNoVm<funct6, OPMVV, opcodestr # "." # vm # "m">,
+ Sched<[WriteVCompressV, ReadVCompressV, ReadVCompressV]>;
}
multiclass VAMO<RISCVAMOOP amoop, RISCVWidth width, string opcodestr> {
@@ -435,11 +784,14 @@ multiclass VAMO<RISCVAMOOP amoop, RISCVWidth width, string opcodestr> {
def _UNWD : VAMONoWd<amoop, width, opcodestr>;
}
-multiclass VWholeLoad<bits<3> nf, string opcodestr, RegisterClass VRC> {
- def E8_V : VWholeLoad<nf, LSWidth8, opcodestr # "e8.v", VRC>;
- def E16_V : VWholeLoad<nf, LSWidth16, opcodestr # "e16.v", VRC>;
- def E32_V : VWholeLoad<nf, LSWidth32, opcodestr # "e32.v", VRC>;
- def E64_V : VWholeLoad<nf, LSWidth64, opcodestr # "e64.v", VRC>;
+multiclass VWholeLoadN<bits<3> nf, string opcodestr, RegisterClass VRC> {
+ foreach l = [8, 16, 32, 64] in {
+ defvar w = !cast<RISCVWidth>("LSWidth" # l);
+ defvar s = !cast<SchedWrite>("WriteVLD" # !add(nf, 1) # "R" # l);
+
+ def E # l # _V : VWholeLoad<nf, w, opcodestr # "e" # l # ".v", VRC>,
+ Sched<[s, ReadVLDX]>;
+ }
}
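// Illustrative sketch, not part of the patch: for
// `defm VL1R : VWholeLoadN<0, "vl1r", VR>;` (defined further down), the
// foreach above is expected to expand roughly to
//
//   def VL1RE8_V  : VWholeLoad<0, LSWidth8,  "vl1re8.v",  VR>, Sched<[WriteVLD1R8,  ReadVLDX]>;
//   def VL1RE16_V : VWholeLoad<0, LSWidth16, "vl1re16.v", VR>, Sched<[WriteVLD1R16, ReadVLDX]>;
//   def VL1RE32_V : VWholeLoad<0, LSWidth32, "vl1re32.v", VR>, Sched<[WriteVLD1R32, ReadVLDX]>;
//   def VL1RE64_V : VWholeLoad<0, LSWidth64, "vl1re64.v", VR>, Sched<[WriteVLD1R64, ReadVLDX]>;
//
// assuming SchedWrite records named "WriteVLD" # (nf + 1) # "R" # eew exist in
// the scheduling model.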
//===----------------------------------------------------------------------===//
@@ -457,71 +809,58 @@ def VSETIVLI : RVInstSetiVLi<(outs GPR:$rd), (ins uimm5:$uimm, VTypeIOp:$vtypei)
def VSETVL : RVInstSetVL<(outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2),
"vsetvl", "$rd, $rs1, $rs2">;
} // hasSideEffects = 1, mayLoad = 0, mayStore = 0
+foreach eew = [8, 16, 32, 64] in {
+ defvar w = !cast<RISCVWidth>("LSWidth" # eew);
+
+ // Vector Unit-Stride Instructions
+ def VLE#eew#_V : VUnitStrideLoad<w, "vle"#eew#".v">, VLESched<eew>;
+ def VSE#eew#_V : VUnitStrideStore<w, "vse"#eew#".v">, VSESched<eew>;
+
+ // Vector Unit-Stride Fault-only-First Loads
+ def VLE#eew#FF_V : VUnitStrideLoadFF<w, "vle"#eew#"ff.v">, VLFSched<eew>;
+
+ // Vector Strided Instructions
+ def VLSE#eew#_V : VStridedLoad<w, "vlse"#eew#".v">, VLSSched<eew>;
+ def VSSE#eew#_V : VStridedStore<w, "vsse"#eew#".v">, VSSSched<eew>;
+
+ // Vector Indexed Instructions
+ def VLUXEI#eew#_V :
+ VIndexedLoad<MOPLDIndexedUnord, w, "vluxei"#eew#".v">, VLXSched<eew, "U">;
+ def VLOXEI#eew#_V :
+ VIndexedLoad<MOPLDIndexedOrder, w, "vloxei"#eew#".v">, VLXSched<eew, "O">;
+ def VSUXEI#eew#_V :
+ VIndexedStore<MOPSTIndexedUnord, w, "vsuxei"#eew#".v">, VSXSched<eew, "U">;
+ def VSOXEI#eew#_V :
+ VIndexedStore<MOPSTIndexedOrder, w, "vsoxei"#eew#".v">, VSXSched<eew, "O">;
+}
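// Illustrative sketch, not part of the patch: for eew = 8 the loop above is
// expected to produce the same instructions the removed block below listed
// explicitly, now with per-EEW scheduling annotations, roughly:
//
//   def VLE8_V    : VUnitStrideLoad<LSWidth8, "vle8.v">,     VLESched<8>;
//   def VSE8_V    : VUnitStrideStore<LSWidth8, "vse8.v">,    VSESched<8>;
//   def VLE8FF_V  : VUnitStrideLoadFF<LSWidth8, "vle8ff.v">, VLFSched<8>;
//   def VLSE8_V   : VStridedLoad<LSWidth8, "vlse8.v">,       VLSSched<8>;
//   def VLUXEI8_V : VIndexedLoad<MOPLDIndexedUnord, LSWidth8, "vluxei8.v">, VLXSched<8, "U">;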
+
+def VLM_V : VUnitStrideLoadMask<"vlm.v">,
+ Sched<[WriteVLDM, ReadVLDX]>;
+def VSM_V : VUnitStrideStoreMask<"vsm.v">,
+ Sched<[WriteVSTM, ReadVSTM, ReadVSTX]>;
+def : InstAlias<"vle1.v $vd, (${rs1})",
+ (VLM_V VR:$vd, GPR:$rs1), 0>;
+def : InstAlias<"vse1.v $vs3, (${rs1})",
+ (VSM_V VR:$vs3, GPR:$rs1), 0>;
+
+defm VL1R : VWholeLoadN<0, "vl1r", VR>;
+defm VL2R : VWholeLoadN<1, "vl2r", VRM2>;
+defm VL4R : VWholeLoadN<3, "vl4r", VRM4>;
+defm VL8R : VWholeLoadN<7, "vl8r", VRM8>;
-// Vector Unit-Stride Instructions
-def VLE8_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth8, "vle8.v">;
-def VLE16_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth16, "vle16.v">;
-def VLE32_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth32, "vle32.v">;
-def VLE64_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth64, "vle64.v">;
-
-def VLE8FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth8, "vle8ff.v">;
-def VLE16FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth16, "vle16ff.v">;
-def VLE32FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth32, "vle32ff.v">;
-def VLE64FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth64, "vle64ff.v">;
-
-def VLE1_V : VUnitStrideLoadMask<"vle1.v">;
-def VSE1_V : VUnitStrideStoreMask<"vse1.v">;
-
-def VSE8_V : VUnitStrideStore<SUMOPUnitStride, LSWidth8, "vse8.v">;
-def VSE16_V : VUnitStrideStore<SUMOPUnitStride, LSWidth16, "vse16.v">;
-def VSE32_V : VUnitStrideStore<SUMOPUnitStride, LSWidth32, "vse32.v">;
-def VSE64_V : VUnitStrideStore<SUMOPUnitStride, LSWidth64, "vse64.v">;
-
-// Vector Strided Instructions
-def VLSE8_V : VStridedLoad<LSWidth8, "vlse8.v">;
-def VLSE16_V : VStridedLoad<LSWidth16, "vlse16.v">;
-def VLSE32_V : VStridedLoad<LSWidth32, "vlse32.v">;
-def VLSE64_V : VStridedLoad<LSWidth64, "vlse64.v">;
-
-def VSSE8_V : VStridedStore<LSWidth8, "vsse8.v">;
-def VSSE16_V : VStridedStore<LSWidth16, "vsse16.v">;
-def VSSE32_V : VStridedStore<LSWidth32, "vsse32.v">;
-def VSSE64_V : VStridedStore<LSWidth64, "vsse64.v">;
-
-// Vector Indexed Instructions
-def VLUXEI8_V : VIndexedLoad<MOPLDIndexedUnord, LSWidth8, "vluxei8.v">;
-def VLUXEI16_V : VIndexedLoad<MOPLDIndexedUnord, LSWidth16, "vluxei16.v">;
-def VLUXEI32_V : VIndexedLoad<MOPLDIndexedUnord, LSWidth32, "vluxei32.v">;
-def VLUXEI64_V : VIndexedLoad<MOPLDIndexedUnord, LSWidth64, "vluxei64.v">;
-
-def VLOXEI8_V : VIndexedLoad<MOPLDIndexedOrder, LSWidth8, "vloxei8.v">;
-def VLOXEI16_V : VIndexedLoad<MOPLDIndexedOrder, LSWidth16, "vloxei16.v">;
-def VLOXEI32_V : VIndexedLoad<MOPLDIndexedOrder, LSWidth32, "vloxei32.v">;
-def VLOXEI64_V : VIndexedLoad<MOPLDIndexedOrder, LSWidth64, "vloxei64.v">;
-
-def VSUXEI8_V : VIndexedStore<MOPSTIndexedUnord, LSWidth8, "vsuxei8.v">;
-def VSUXEI16_V : VIndexedStore<MOPSTIndexedUnord, LSWidth16, "vsuxei16.v">;
-def VSUXEI32_V : VIndexedStore<MOPSTIndexedUnord, LSWidth32, "vsuxei32.v">;
-def VSUXEI64_V : VIndexedStore<MOPSTIndexedUnord, LSWidth64, "vsuxei64.v">;
-
-def VSOXEI8_V : VIndexedStore<MOPSTIndexedOrder, LSWidth8, "vsoxei8.v">;
-def VSOXEI16_V : VIndexedStore<MOPSTIndexedOrder, LSWidth16, "vsoxei16.v">;
-def VSOXEI32_V : VIndexedStore<MOPSTIndexedOrder, LSWidth32, "vsoxei32.v">;
-def VSOXEI64_V : VIndexedStore<MOPSTIndexedOrder, LSWidth64, "vsoxei64.v">;
-
-defm VL1R : VWholeLoad<0, "vl1r", VR>;
-defm VL2R : VWholeLoad<1, "vl2r", VRM2>;
-defm VL4R : VWholeLoad<3, "vl4r", VRM4>;
-defm VL8R : VWholeLoad<7, "vl8r", VRM8>;
def : InstAlias<"vl1r.v $vd, (${rs1})", (VL1RE8_V VR:$vd, GPR:$rs1)>;
def : InstAlias<"vl2r.v $vd, (${rs1})", (VL2RE8_V VRM2:$vd, GPR:$rs1)>;
def : InstAlias<"vl4r.v $vd, (${rs1})", (VL4RE8_V VRM4:$vd, GPR:$rs1)>;
def : InstAlias<"vl8r.v $vd, (${rs1})", (VL8RE8_V VRM8:$vd, GPR:$rs1)>;
-def VS1R_V : VWholeStore<0, "vs1r.v", VR>;
-def VS2R_V : VWholeStore<1, "vs2r.v", VRM2>;
-def VS4R_V : VWholeStore<3, "vs4r.v", VRM4>;
-def VS8R_V : VWholeStore<7, "vs8r.v", VRM8>;
+def VS1R_V : VWholeStore<0, "vs1r.v", VR>,
+ Sched<[WriteVST1R, ReadVST1R, ReadVSTX]>;
+def VS2R_V : VWholeStore<1, "vs2r.v", VRM2>,
+ Sched<[WriteVST2R, ReadVST2R, ReadVSTX]>;
+def VS4R_V : VWholeStore<3, "vs4r.v", VRM4>,
+ Sched<[WriteVST4R, ReadVST4R, ReadVSTX]>;
+def VS8R_V : VWholeStore<7, "vs8r.v", VRM8>,
+ Sched<[WriteVST8R, ReadVST8R, ReadVSTX]>;
// Vector Single-Width Integer Add and Subtract
defm VADD_V : VALU_IV_V_X_I<"vadd", 0b000000>;
@@ -588,9 +927,9 @@ def : InstAlias<"vnot.v $vd, $vs$vm",
(VXOR_VI VR:$vd, VR:$vs, -1, VMaskOp:$vm)>;
// Vector Single-Width Bit Shift Instructions
-defm VSLL_V : VALU_IV_V_X_I<"vsll", 0b100101, uimm5>;
-defm VSRL_V : VALU_IV_V_X_I<"vsrl", 0b101000, uimm5>;
-defm VSRA_V : VALU_IV_V_X_I<"vsra", 0b101001, uimm5>;
+defm VSLL_V : VSHT_IV_V_X_I<"vsll", 0b100101, uimm5>;
+defm VSRL_V : VSHT_IV_V_X_I<"vsrl", 0b101000, uimm5>;
+defm VSRA_V : VSHT_IV_V_X_I<"vsra", 0b101001, uimm5>;
// Vector Narrowing Integer Right Shift Instructions
// Refer to 11.3. Narrowing Vector Arithmetic Instructions
@@ -598,8 +937,8 @@ defm VSRA_V : VALU_IV_V_X_I<"vsra", 0b101001, uimm5>;
// vector register group (specified by vs2). The destination vector register
// group cannot overlap the mask register if used, unless LMUL=1.
let Constraints = "@earlyclobber $vd" in {
-defm VNSRL_W : VALU_IV_V_X_I<"vnsrl", 0b101100, uimm5, "w">;
-defm VNSRA_W : VALU_IV_V_X_I<"vnsra", 0b101101, uimm5, "w">;
+defm VNSRL_W : VNSHT_IV_V_X_I<"vnsrl", 0b101100, uimm5, "w">;
+defm VNSRA_W : VNSHT_IV_V_X_I<"vnsra", 0b101101, uimm5, "w">;
} // Constraints = "@earlyclobber $vd"
def : InstAlias<"vncvt.x.x.w $vd, $vs$vm",
@@ -607,14 +946,14 @@ def : InstAlias<"vncvt.x.x.w $vd, $vs$vm",
// Vector Integer Comparison Instructions
let RVVConstraint = NoConstraint in {
-defm VMSEQ_V : VALU_IV_V_X_I<"vmseq", 0b011000>;
-defm VMSNE_V : VALU_IV_V_X_I<"vmsne", 0b011001>;
-defm VMSLTU_V : VALU_IV_V_X<"vmsltu", 0b011010>;
-defm VMSLT_V : VALU_IV_V_X<"vmslt", 0b011011>;
-defm VMSLEU_V : VALU_IV_V_X_I<"vmsleu", 0b011100>;
-defm VMSLE_V : VALU_IV_V_X_I<"vmsle", 0b011101>;
-defm VMSGTU_V : VALU_IV_X_I<"vmsgtu", 0b011110>;
-defm VMSGT_V : VALU_IV_X_I<"vmsgt", 0b011111>;
+defm VMSEQ_V : VCMP_IV_V_X_I<"vmseq", 0b011000>;
+defm VMSNE_V : VCMP_IV_V_X_I<"vmsne", 0b011001>;
+defm VMSLTU_V : VCMP_IV_V_X<"vmsltu", 0b011010>;
+defm VMSLT_V : VCMP_IV_V_X<"vmslt", 0b011011>;
+defm VMSLEU_V : VCMP_IV_V_X_I<"vmsleu", 0b011100>;
+defm VMSLE_V : VCMP_IV_V_X_I<"vmsle", 0b011101>;
+defm VMSGTU_V : VCMP_IV_X_I<"vmsgtu", 0b011110>;
+defm VMSGT_V : VCMP_IV_X_I<"vmsgt", 0b011111>;
} // RVVConstraint = NoConstraint
def : InstAlias<"vmsgtu.vv $vd, $va, $vb$vm",
@@ -672,84 +1011,87 @@ def PseudoVMSGE_VX_M_T : Pseudo<(outs VR:$vd, VRNoV0:$scratch),
}
// Vector Integer Min/Max Instructions
-defm VMINU_V : VALU_IV_V_X<"vminu", 0b000100>;
-defm VMIN_V : VALU_IV_V_X<"vmin", 0b000101>;
-defm VMAXU_V : VALU_IV_V_X<"vmaxu", 0b000110>;
-defm VMAX_V : VALU_IV_V_X<"vmax", 0b000111>;
+defm VMINU_V : VCMP_IV_V_X<"vminu", 0b000100>;
+defm VMIN_V : VCMP_IV_V_X<"vmin", 0b000101>;
+defm VMAXU_V : VCMP_IV_V_X<"vmaxu", 0b000110>;
+defm VMAX_V : VCMP_IV_V_X<"vmax", 0b000111>;
// Vector Single-Width Integer Multiply Instructions
-defm VMUL_V : VALU_MV_V_X<"vmul", 0b100101>;
-defm VMULH_V : VALU_MV_V_X<"vmulh", 0b100111>;
-defm VMULHU_V : VALU_MV_V_X<"vmulhu", 0b100100>;
-defm VMULHSU_V : VALU_MV_V_X<"vmulhsu", 0b100110>;
+defm VMUL_V : VMUL_MV_V_X<"vmul", 0b100101>;
+defm VMULH_V : VMUL_MV_V_X<"vmulh", 0b100111>;
+defm VMULHU_V : VMUL_MV_V_X<"vmulhu", 0b100100>;
+defm VMULHSU_V : VMUL_MV_V_X<"vmulhsu", 0b100110>;
// Vector Integer Divide Instructions
-defm VDIVU_V : VALU_MV_V_X<"vdivu", 0b100000>;
-defm VDIV_V : VALU_MV_V_X<"vdiv", 0b100001>;
-defm VREMU_V : VALU_MV_V_X<"vremu", 0b100010>;
-defm VREM_V : VALU_MV_V_X<"vrem", 0b100011>;
+defm VDIVU_V : VDIV_MV_V_X<"vdivu", 0b100000>;
+defm VDIV_V : VDIV_MV_V_X<"vdiv", 0b100001>;
+defm VREMU_V : VDIV_MV_V_X<"vremu", 0b100010>;
+defm VREM_V : VDIV_MV_V_X<"vrem", 0b100011>;
// Vector Widening Integer Multiply Instructions
let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in {
-defm VWMUL_V : VALU_MV_V_X<"vwmul", 0b111011>;
-defm VWMULU_V : VALU_MV_V_X<"vwmulu", 0b111000>;
-defm VWMULSU_V : VALU_MV_V_X<"vwmulsu", 0b111010>;
+defm VWMUL_V : VWMUL_MV_V_X<"vwmul", 0b111011>;
+defm VWMULU_V : VWMUL_MV_V_X<"vwmulu", 0b111000>;
+defm VWMULSU_V : VWMUL_MV_V_X<"vwmulsu", 0b111010>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV
// Vector Single-Width Integer Multiply-Add Instructions
-defm VMACC_V : VALUr_MV_V_X<"vmacc", 0b101101>;
-defm VNMSAC_V : VALUr_MV_V_X<"vnmsac", 0b101111>;
-defm VMADD_V : VALUr_MV_V_X<"vmadd", 0b101001>;
-defm VNMSUB_V : VALUr_MV_V_X<"vnmsub", 0b101011>;
+defm VMACC_V : VMAC_MV_V_X<"vmacc", 0b101101>;
+defm VNMSAC_V : VMAC_MV_V_X<"vnmsac", 0b101111>;
+defm VMADD_V : VMAC_MV_V_X<"vmadd", 0b101001>;
+defm VNMSUB_V : VMAC_MV_V_X<"vnmsub", 0b101011>;
// Vector Widening Integer Multiply-Add Instructions
let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in {
-defm VWMACCU_V : VALUr_MV_V_X<"vwmaccu", 0b111100>;
-defm VWMACC_V : VALUr_MV_V_X<"vwmacc", 0b111101>;
-defm VWMACCSU_V : VALUr_MV_V_X<"vwmaccsu", 0b111111>;
-defm VWMACCUS_V : VALUr_MV_X<"vwmaccus", 0b111110>;
+defm VWMACCU_V : VWMAC_MV_V_X<"vwmaccu", 0b111100>;
+defm VWMACC_V : VWMAC_MV_V_X<"vwmacc", 0b111101>;
+defm VWMACCSU_V : VWMAC_MV_V_X<"vwmaccsu", 0b111111>;
+defm VWMACCUS_V : VWMAC_MV_X<"vwmaccus", 0b111110>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV
// Vector Integer Merge Instructions
-defm VMERGE_V : VALUm_IV_V_X_I<"vmerge", 0b010111>;
+defm VMERGE_V : VMRG_IV_V_X_I<"vmerge", 0b010111>;
// Vector Integer Move Instructions
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, vs2 = 0, vm = 1,
RVVConstraint = NoConstraint in {
// op vd, vs1
def VMV_V_V : RVInstVV<0b010111, OPIVV, (outs VR:$vd),
- (ins VR:$vs1), "vmv.v.v", "$vd, $vs1">;
+ (ins VR:$vs1), "vmv.v.v", "$vd, $vs1">,
+ Sched<[WriteVIMovV, ReadVIMovV]>;
// op vd, rs1
def VMV_V_X : RVInstVX<0b010111, OPIVX, (outs VR:$vd),
- (ins GPR:$rs1), "vmv.v.x", "$vd, $rs1">;
+ (ins GPR:$rs1), "vmv.v.x", "$vd, $rs1">,
+ Sched<[WriteVIMovX, ReadVIMovX]>;
// op vd, imm
def VMV_V_I : RVInstIVI<0b010111, (outs VR:$vd),
- (ins simm5:$imm), "vmv.v.i", "$vd, $imm">;
+ (ins simm5:$imm), "vmv.v.i", "$vd, $imm">,
+ Sched<[WriteVIMovI]>;
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
// Vector Fixed-Point Arithmetic Instructions
-defm VSADDU_V : VALU_IV_V_X_I<"vsaddu", 0b100000>;
-defm VSADD_V : VALU_IV_V_X_I<"vsadd", 0b100001>;
-defm VSSUBU_V : VALU_IV_V_X<"vssubu", 0b100010>;
-defm VSSUB_V : VALU_IV_V_X<"vssub", 0b100011>;
+defm VSADDU_V : VSALU_IV_V_X_I<"vsaddu", 0b100000>;
+defm VSADD_V : VSALU_IV_V_X_I<"vsadd", 0b100001>;
+defm VSSUBU_V : VSALU_IV_V_X<"vssubu", 0b100010>;
+defm VSSUB_V : VSALU_IV_V_X<"vssub", 0b100011>;
// Vector Single-Width Averaging Add and Subtract
-defm VAADDU_V : VALU_MV_V_X<"vaaddu", 0b001000>;
-defm VAADD_V : VALU_MV_V_X<"vaadd", 0b001001>;
-defm VASUBU_V : VALU_MV_V_X<"vasubu", 0b001010>;
-defm VASUB_V : VALU_MV_V_X<"vasub", 0b001011>;
+defm VAADDU_V : VAALU_MV_V_X<"vaaddu", 0b001000>;
+defm VAADD_V : VAALU_MV_V_X<"vaadd", 0b001001>;
+defm VASUBU_V : VAALU_MV_V_X<"vasubu", 0b001010>;
+defm VASUB_V : VAALU_MV_V_X<"vasub", 0b001011>;
// Vector Single-Width Fractional Multiply with Rounding and Saturation
-defm VSMUL_V : VALU_IV_V_X<"vsmul", 0b100111>;
+defm VSMUL_V : VSMUL_IV_V_X<"vsmul", 0b100111>;
// Vector Single-Width Scaling Shift Instructions
-defm VSSRL_V : VALU_IV_V_X_I<"vssrl", 0b101010, uimm5>;
-defm VSSRA_V : VALU_IV_V_X_I<"vssra", 0b101011, uimm5>;
+defm VSSRL_V : VSSHF_IV_V_X_I<"vssrl", 0b101010, uimm5>;
+defm VSSRA_V : VSSHF_IV_V_X_I<"vssra", 0b101011, uimm5>;
// Vector Narrowing Fixed-Point Clip Instructions
let Constraints = "@earlyclobber $vd" in {
-defm VNCLIPU_W : VALU_IV_V_X_I<"vnclipu", 0b101110, uimm5, "w">;
-defm VNCLIP_W : VALU_IV_V_X_I<"vnclip", 0b101111, uimm5, "w">;
+defm VNCLIPU_W : VNCLP_IV_V_X_I<"vnclipu", 0b101110, uimm5, "w">;
+defm VNCLIP_W : VNCLP_IV_V_X_I<"vnclip", 0b101111, uimm5, "w">;
} // Constraints = "@earlyclobber $vd"
} // Predicates = [HasStdExtV]
@@ -762,60 +1104,60 @@ defm VFRSUB_V : VALU_FV_F<"vfrsub", 0b100111>;
// Vector Widening Floating-Point Add/Subtract Instructions
let Constraints = "@earlyclobber $vd" in {
let RVVConstraint = WidenV in {
-defm VFWADD_V : VALU_FV_V_F<"vfwadd", 0b110000>;
-defm VFWSUB_V : VALU_FV_V_F<"vfwsub", 0b110010>;
+defm VFWADD_V : VWALU_FV_V_F<"vfwadd", 0b110000>;
+defm VFWSUB_V : VWALU_FV_V_F<"vfwsub", 0b110010>;
} // RVVConstraint = WidenV
// Set earlyclobber for following instructions for second and mask operands.
// This has the downside that the earlyclobber constraint is too coarse and
// will impose unnecessary restrictions by not allowing the destination to
// overlap with the first (wide) operand.
let RVVConstraint = WidenW in {
-defm VFWADD_W : VALU_FV_V_F<"vfwadd", 0b110100, "w">;
-defm VFWSUB_W : VALU_FV_V_F<"vfwsub", 0b110110, "w">;
+defm VFWADD_W : VWALU_FV_V_F<"vfwadd", 0b110100, "w">;
+defm VFWSUB_W : VWALU_FV_V_F<"vfwsub", 0b110110, "w">;
} // RVVConstraint = WidenW
} // Constraints = "@earlyclobber $vd"
// Vector Single-Width Floating-Point Multiply/Divide Instructions
-defm VFMUL_V : VALU_FV_V_F<"vfmul", 0b100100>;
-defm VFDIV_V : VALU_FV_V_F<"vfdiv", 0b100000>;
-defm VFRDIV_V : VALU_FV_F<"vfrdiv", 0b100001>;
+defm VFMUL_V : VMUL_FV_V_F<"vfmul", 0b100100>;
+defm VFDIV_V : VDIV_FV_V_F<"vfdiv", 0b100000>;
+defm VFRDIV_V : VRDIV_FV_F<"vfrdiv", 0b100001>;
// Vector Widening Floating-Point Multiply
let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in {
-defm VFWMUL_V : VALU_FV_V_F<"vfwmul", 0b111000>;
+defm VFWMUL_V : VWMUL_FV_V_F<"vfwmul", 0b111000>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV
// Vector Single-Width Floating-Point Fused Multiply-Add Instructions
-defm VFMACC_V : VALUr_FV_V_F<"vfmacc", 0b101100>;
-defm VFNMACC_V : VALUr_FV_V_F<"vfnmacc", 0b101101>;
-defm VFMSAC_V : VALUr_FV_V_F<"vfmsac", 0b101110>;
-defm VFNMSAC_V : VALUr_FV_V_F<"vfnmsac", 0b101111>;
-defm VFMADD_V : VALUr_FV_V_F<"vfmadd", 0b101000>;
-defm VFNMADD_V : VALUr_FV_V_F<"vfnmadd", 0b101001>;
-defm VFMSUB_V : VALUr_FV_V_F<"vfmsub", 0b101010>;
-defm VFNMSUB_V : VALUr_FV_V_F<"vfnmsub", 0b101011>;
+defm VFMACC_V : VMAC_FV_V_F<"vfmacc", 0b101100>;
+defm VFNMACC_V : VMAC_FV_V_F<"vfnmacc", 0b101101>;
+defm VFMSAC_V : VMAC_FV_V_F<"vfmsac", 0b101110>;
+defm VFNMSAC_V : VMAC_FV_V_F<"vfnmsac", 0b101111>;
+defm VFMADD_V : VMAC_FV_V_F<"vfmadd", 0b101000>;
+defm VFNMADD_V : VMAC_FV_V_F<"vfnmadd", 0b101001>;
+defm VFMSUB_V : VMAC_FV_V_F<"vfmsub", 0b101010>;
+defm VFNMSUB_V : VMAC_FV_V_F<"vfnmsub", 0b101011>;
// Vector Widening Floating-Point Fused Multiply-Add Instructions
let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in {
-defm VFWMACC_V : VALUr_FV_V_F<"vfwmacc", 0b111100>;
-defm VFWNMACC_V : VALUr_FV_V_F<"vfwnmacc", 0b111101>;
-defm VFWMSAC_V : VALUr_FV_V_F<"vfwmsac", 0b111110>;
-defm VFWNMSAC_V : VALUr_FV_V_F<"vfwnmsac", 0b111111>;
+defm VFWMACC_V : VWMAC_FV_V_F<"vfwmacc", 0b111100>;
+defm VFWNMACC_V : VWMAC_FV_V_F<"vfwnmacc", 0b111101>;
+defm VFWMSAC_V : VWMAC_FV_V_F<"vfwmsac", 0b111110>;
+defm VFWNMSAC_V : VWMAC_FV_V_F<"vfwnmsac", 0b111111>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV
// Vector Floating-Point Square-Root Instruction
-defm VFSQRT_V : VALU_FV_VS2<"vfsqrt.v", 0b010011, 0b00000>;
-defm VFRSQRT7_V : VALU_FV_VS2<"vfrsqrt7.v", 0b010011, 0b00100>;
-defm VFREC7_V : VALU_FV_VS2<"vfrec7.v", 0b010011, 0b00101>;
+defm VFSQRT_V : VSQR_FV_VS2<"vfsqrt.v", 0b010011, 0b00000>;
+defm VFRSQRT7_V : VRCP_FV_VS2<"vfrsqrt7.v", 0b010011, 0b00100>;
+defm VFREC7_V : VRCP_FV_VS2<"vfrec7.v", 0b010011, 0b00101>;
// Vector Floating-Point MIN/MAX Instructions
-defm VFMIN_V : VALU_FV_V_F<"vfmin", 0b000100>;
-defm VFMAX_V : VALU_FV_V_F<"vfmax", 0b000110>;
+defm VFMIN_V : VCMP_FV_V_F<"vfmin", 0b000100>;
+defm VFMAX_V : VCMP_FV_V_F<"vfmax", 0b000110>;
// Vector Floating-Point Sign-Injection Instructions
-defm VFSGNJ_V : VALU_FV_V_F<"vfsgnj", 0b001000>;
-defm VFSGNJN_V : VALU_FV_V_F<"vfsgnjn", 0b001001>;
-defm VFSGNJX_V : VALU_FV_V_F<"vfsgnjx", 0b001010>;
+defm VFSGNJ_V : VSGNJ_FV_V_F<"vfsgnj", 0b001000>;
+defm VFSGNJN_V : VSGNJ_FV_V_F<"vfsgnjn", 0b001001>;
+defm VFSGNJX_V : VSGNJ_FV_V_F<"vfsgnjx", 0b001010>;
def : InstAlias<"vfneg.v $vd, $vs$vm",
(VFSGNJN_VV VR:$vd, VR:$vs, VR:$vs, VMaskOp:$vm)>;
@@ -824,12 +1166,12 @@ def : InstAlias<"vfabs.v $vd, $vs$vm",
// Vector Floating-Point Compare Instructions
let RVVConstraint = NoConstraint in {
-defm VMFEQ_V : VALU_FV_V_F<"vmfeq", 0b011000>;
-defm VMFNE_V : VALU_FV_V_F<"vmfne", 0b011100>;
-defm VMFLT_V : VALU_FV_V_F<"vmflt", 0b011011>;
-defm VMFLE_V : VALU_FV_V_F<"vmfle", 0b011001>;
-defm VMFGT_V : VALU_FV_F<"vmfgt", 0b011101>;
-defm VMFGE_V : VALU_FV_F<"vmfge", 0b011111>;
+defm VMFEQ_V : VCMP_FV_V_F<"vmfeq", 0b011000>;
+defm VMFNE_V : VCMP_FV_V_F<"vmfne", 0b011100>;
+defm VMFLT_V : VCMP_FV_V_F<"vmflt", 0b011011>;
+defm VMFLE_V : VCMP_FV_V_F<"vmfle", 0b011001>;
+defm VMFGT_V : VCMP_FV_F<"vmfgt", 0b011101>;
+defm VMFGE_V : VCMP_FV_F<"vmfge", 0b011111>;
} // RVVConstraint = NoConstraint
def : InstAlias<"vmfgt.vv $vd, $va, $vb$vm",
@@ -838,68 +1180,70 @@ def : InstAlias<"vmfge.vv $vd, $va, $vb$vm",
(VMFLE_VV VR:$vd, VR:$vb, VR:$va, VMaskOp:$vm), 0>;
// Vector Floating-Point Classify Instruction
-defm VFCLASS_V : VALU_FV_VS2<"vfclass.v", 0b010011, 0b10000>;
+defm VFCLASS_V : VCLS_FV_VS2<"vfclass.v", 0b010011, 0b10000>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+
// Vector Floating-Point Merge Instruction
+let vm = 0 in
def VFMERGE_VFM : RVInstVX<0b010111, OPFVF, (outs VR:$vd),
(ins VR:$vs2, FPR32:$rs1, VMV0:$v0),
- "vfmerge.vfm", "$vd, $vs2, $rs1, v0"> {
- let vm = 0;
-}
+ "vfmerge.vfm", "$vd, $vs2, $rs1, v0">,
+ Sched<[WriteVFMergeV, ReadVFMergeV, ReadVFMergeF, ReadVMask]>;
// Vector Floating-Point Move Instruction
let RVVConstraint = NoConstraint in
+let vm = 1, vs2 = 0 in
def VFMV_V_F : RVInstVX<0b010111, OPFVF, (outs VR:$vd),
- (ins FPR32:$rs1), "vfmv.v.f", "$vd, $rs1"> {
- let vs2 = 0;
- let vm = 1;
-}
+ (ins FPR32:$rs1), "vfmv.v.f", "$vd, $rs1">,
+ Sched<[WriteVFMovV, ReadVFMovF]>;
+
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
// Single-Width Floating-Point/Integer Type-Convert Instructions
-defm VFCVT_XU_F_V : VALU_FV_VS2<"vfcvt.xu.f.v", 0b010010, 0b00000>;
-defm VFCVT_X_F_V : VALU_FV_VS2<"vfcvt.x.f.v", 0b010010, 0b00001>;
-defm VFCVT_RTZ_XU_F_V : VALU_FV_VS2<"vfcvt.rtz.xu.f.v", 0b010010, 0b00110>;
-defm VFCVT_RTZ_X_F_V : VALU_FV_VS2<"vfcvt.rtz.x.f.v", 0b010010, 0b00111>;
-defm VFCVT_F_XU_V : VALU_FV_VS2<"vfcvt.f.xu.v", 0b010010, 0b00010>;
-defm VFCVT_F_X_V : VALU_FV_VS2<"vfcvt.f.x.v", 0b010010, 0b00011>;
+defm VFCVT_XU_F_V : VCVTI_FV_VS2<"vfcvt.xu.f.v", 0b010010, 0b00000>;
+defm VFCVT_X_F_V : VCVTI_FV_VS2<"vfcvt.x.f.v", 0b010010, 0b00001>;
+defm VFCVT_RTZ_XU_F_V : VCVTI_FV_VS2<"vfcvt.rtz.xu.f.v", 0b010010, 0b00110>;
+defm VFCVT_RTZ_X_F_V : VCVTI_FV_VS2<"vfcvt.rtz.x.f.v", 0b010010, 0b00111>;
+defm VFCVT_F_XU_V : VCVTF_IV_VS2<"vfcvt.f.xu.v", 0b010010, 0b00010>;
+defm VFCVT_F_X_V : VCVTF_IV_VS2<"vfcvt.f.x.v", 0b010010, 0b00011>;
// Widening Floating-Point/Integer Type-Convert Instructions
let Constraints = "@earlyclobber $vd", RVVConstraint = WidenCvt in {
-defm VFWCVT_XU_F_V : VALU_FV_VS2<"vfwcvt.xu.f.v", 0b010010, 0b01000>;
-defm VFWCVT_X_F_V : VALU_FV_VS2<"vfwcvt.x.f.v", 0b010010, 0b01001>;
-defm VFWCVT_RTZ_XU_F_V : VALU_FV_VS2<"vfwcvt.rtz.xu.f.v", 0b010010, 0b01110>;
-defm VFWCVT_RTZ_X_F_V : VALU_FV_VS2<"vfwcvt.rtz.x.f.v", 0b010010, 0b01111>;
-defm VFWCVT_F_XU_V : VALU_FV_VS2<"vfwcvt.f.xu.v", 0b010010, 0b01010>;
-defm VFWCVT_F_X_V : VALU_FV_VS2<"vfwcvt.f.x.v", 0b010010, 0b01011>;
-defm VFWCVT_F_F_V : VALU_FV_VS2<"vfwcvt.f.f.v", 0b010010, 0b01100>;
+defm VFWCVT_XU_F_V : VWCVTI_FV_VS2<"vfwcvt.xu.f.v", 0b010010, 0b01000>;
+defm VFWCVT_X_F_V : VWCVTI_FV_VS2<"vfwcvt.x.f.v", 0b010010, 0b01001>;
+defm VFWCVT_RTZ_XU_F_V : VWCVTI_FV_VS2<"vfwcvt.rtz.xu.f.v", 0b010010, 0b01110>;
+defm VFWCVT_RTZ_X_F_V : VWCVTI_FV_VS2<"vfwcvt.rtz.x.f.v", 0b010010, 0b01111>;
+defm VFWCVT_F_XU_V : VWCVTF_IV_VS2<"vfwcvt.f.xu.v", 0b010010, 0b01010>;
+defm VFWCVT_F_X_V : VWCVTF_IV_VS2<"vfwcvt.f.x.v", 0b010010, 0b01011>;
+defm VFWCVT_F_F_V : VWCVTF_FV_VS2<"vfwcvt.f.f.v", 0b010010, 0b01100>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenCvt
// Narrowing Floating-Point/Integer Type-Convert Instructions
let Constraints = "@earlyclobber $vd" in {
-defm VFNCVT_XU_F_W : VALU_FV_VS2<"vfncvt.xu.f.w", 0b010010, 0b10000>;
-defm VFNCVT_X_F_W : VALU_FV_VS2<"vfncvt.x.f.w", 0b010010, 0b10001>;
-defm VFNCVT_RTZ_XU_F_W : VALU_FV_VS2<"vfncvt.rtz.xu.f.w", 0b010010, 0b10110>;
-defm VFNCVT_RTZ_X_F_W : VALU_FV_VS2<"vfncvt.rtz.x.f.w", 0b010010, 0b10111>;
-defm VFNCVT_F_XU_W : VALU_FV_VS2<"vfncvt.f.xu.w", 0b010010, 0b10010>;
-defm VFNCVT_F_X_W : VALU_FV_VS2<"vfncvt.f.x.w", 0b010010, 0b10011>;
-defm VFNCVT_F_F_W : VALU_FV_VS2<"vfncvt.f.f.w", 0b010010, 0b10100>;
-defm VFNCVT_ROD_F_F_W : VALU_FV_VS2<"vfncvt.rod.f.f.w", 0b010010, 0b10101>;
+defm VFNCVT_XU_F_W : VNCVTI_FV_VS2<"vfncvt.xu.f.w", 0b010010, 0b10000>;
+defm VFNCVT_X_F_W : VNCVTI_FV_VS2<"vfncvt.x.f.w", 0b010010, 0b10001>;
+defm VFNCVT_RTZ_XU_F_W : VNCVTI_FV_VS2<"vfncvt.rtz.xu.f.w", 0b010010, 0b10110>;
+defm VFNCVT_RTZ_X_F_W : VNCVTI_FV_VS2<"vfncvt.rtz.x.f.w", 0b010010, 0b10111>;
+defm VFNCVT_F_XU_W : VNCVTF_IV_VS2<"vfncvt.f.xu.w", 0b010010, 0b10010>;
+defm VFNCVT_F_X_W : VNCVTF_IV_VS2<"vfncvt.f.x.w", 0b010010, 0b10011>;
+defm VFNCVT_F_F_W : VNCVTF_FV_VS2<"vfncvt.f.f.w", 0b010010, 0b10100>;
+defm VFNCVT_ROD_F_F_W : VNCVTF_FV_VS2<"vfncvt.rod.f.f.w", 0b010010, 0b10101>;
} // Constraints = "@earlyclobber $vd"
} // Predicates = [HasStdExtV, HasStdExtF]
let Predicates = [HasStdExtV] in {
+
// Vector Single-Width Integer Reduction Instructions
let RVVConstraint = NoConstraint in {
-defm VREDSUM : VALU_MV_V<"vredsum", 0b000000>;
-defm VREDMAXU : VALU_MV_V<"vredmaxu", 0b000110>;
-defm VREDMAX : VALU_MV_V<"vredmax", 0b000111>;
-defm VREDMINU : VALU_MV_V<"vredminu", 0b000100>;
-defm VREDMIN : VALU_MV_V<"vredmin", 0b000101>;
-defm VREDAND : VALU_MV_V<"vredand", 0b000001>;
-defm VREDOR : VALU_MV_V<"vredor", 0b000010>;
-defm VREDXOR : VALU_MV_V<"vredxor", 0b000011>;
+defm VREDSUM : VRED_MV_V<"vredsum", 0b000000>;
+defm VREDMAXU : VRED_MV_V<"vredmaxu", 0b000110>;
+defm VREDMAX : VRED_MV_V<"vredmax", 0b000111>;
+defm VREDMINU : VRED_MV_V<"vredminu", 0b000100>;
+defm VREDMIN : VRED_MV_V<"vredmin", 0b000101>;
+defm VREDAND : VRED_MV_V<"vredand", 0b000001>;
+defm VREDOR : VRED_MV_V<"vredor", 0b000010>;
+defm VREDXOR : VRED_MV_V<"vredxor", 0b000011>;
} // RVVConstraint = NoConstraint
// Vector Widening Integer Reduction Instructions
@@ -908,42 +1252,49 @@ let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint in {
// This has the downside that the earlyclobber constraint is too coarse and
// will impose unnecessary restrictions by not allowing the destination to
// overlap with the first (wide) operand.
-defm VWREDSUMU : VALU_IV_V<"vwredsumu", 0b110000>;
-defm VWREDSUM : VALU_IV_V<"vwredsum", 0b110001>;
+defm VWREDSUMU : VWRED_IV_V<"vwredsumu", 0b110000>;
+defm VWREDSUM : VWRED_IV_V<"vwredsum", 0b110001>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint
+
} // Predicates = [HasStdExtV]
let Predicates = [HasStdExtV, HasStdExtF] in {
// Vector Single-Width Floating-Point Reduction Instructions
let RVVConstraint = NoConstraint in {
-defm VFREDOSUM : VALU_FV_V<"vfredosum", 0b000011>;
-defm VFREDSUM : VALU_FV_V<"vfredsum", 0b000001>;
-defm VFREDMAX : VALU_FV_V<"vfredmax", 0b000111>;
-defm VFREDMIN : VALU_FV_V<"vfredmin", 0b000101>;
+defm VFREDOSUM : VREDO_FV_V<"vfredosum", 0b000011>;
+defm VFREDUSUM : VRED_FV_V<"vfredusum", 0b000001>;
+defm VFREDMAX : VRED_FV_V<"vfredmax", 0b000111>;
+defm VFREDMIN : VRED_FV_V<"vfredmin", 0b000101>;
} // RVVConstraint = NoConstraint
+def : InstAlias<"vfredsum.vs $vd, $vs2, $vs1$vm",
+ (VFREDUSUM_VS VR:$vd, VR:$vs2, VR:$vs1, VMaskOp:$vm), 0>;
+
// Vector Widening Floating-Point Reduction Instructions
let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint in {
// Set earlyclobber for following instructions for second and mask operands.
// This has the downside that the earlyclobber constraint is too coarse and
// will impose unnecessary restrictions by not allowing the destination to
// overlap with the first (wide) operand.
-defm VFWREDOSUM : VALU_FV_V<"vfwredosum", 0b110011>;
-defm VFWREDSUM : VALU_FV_V<"vfwredsum", 0b110001>;
+defm VFWREDOSUM : VWREDO_FV_V<"vfwredosum", 0b110011>;
+defm VFWREDUSUM : VWRED_FV_V<"vfwredusum", 0b110001>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint
+
+def : InstAlias<"vfwredsum.vs $vd, $vs2, $vs1$vm",
+ (VFWREDUSUM_VS VR:$vd, VR:$vs2, VR:$vs1, VMaskOp:$vm), 0>;
} // Predicates = [HasStdExtV, HasStdExtF]
let Predicates = [HasStdExtV] in {
// Vector Mask-Register Logical Instructions
let RVVConstraint = NoConstraint in {
-defm VMAND_M : VALU_MV_Mask<"vmand", 0b011001, "m">;
-defm VMNAND_M : VALU_MV_Mask<"vmnand", 0b011101, "m">;
-defm VMANDNOT_M : VALU_MV_Mask<"vmandnot", 0b011000, "m">;
-defm VMXOR_M : VALU_MV_Mask<"vmxor", 0b011011, "m">;
-defm VMOR_M : VALU_MV_Mask<"vmor", 0b011010, "m">;
-defm VMNOR_M : VALU_MV_Mask<"vmnor", 0b011110, "m">;
-defm VMORNOT_M : VALU_MV_Mask<"vmornot", 0b011100, "m">;
-defm VMXNOR_M : VALU_MV_Mask<"vmxnor", 0b011111, "m">;
+defm VMAND_M : VMALU_MV_Mask<"vmand", 0b011001, "m">;
+defm VMNAND_M : VMALU_MV_Mask<"vmnand", 0b011101, "m">;
+defm VMANDN_M : VMALU_MV_Mask<"vmandn", 0b011000, "m">;
+defm VMXOR_M : VMALU_MV_Mask<"vmxor", 0b011011, "m">;
+defm VMOR_M : VMALU_MV_Mask<"vmor", 0b011010, "m">;
+defm VMNOR_M : VMALU_MV_Mask<"vmnor", 0b011110, "m">;
+defm VMORN_M : VMALU_MV_Mask<"vmorn", 0b011100, "m">;
+defm VMXNOR_M : VMALU_MV_Mask<"vmxnor", 0b011111, "m">;
}
def : InstAlias<"vmmv.m $vd, $vs",
@@ -955,207 +1306,175 @@ def : InstAlias<"vmset.m $vd",
def : InstAlias<"vmnot.m $vd, $vs",
(VMNAND_MM VR:$vd, VR:$vs, VR:$vs)>;
+def : InstAlias<"vmandnot.mm $vd, $vs2, $vs1",
+ (VMANDN_MM VR:$vd, VR:$vs2, VR:$vs1), 0>;
+def : InstAlias<"vmornot.mm $vd, $vs2, $vs1",
+ (VMORN_MM VR:$vd, VR:$vs2, VR:$vs1), 0>;
+
let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
RVVConstraint = NoConstraint in {
-// Vector mask population count vpopc
-def VPOPC_M : RVInstV<0b010000, 0b10000, OPMVV, (outs GPR:$vd),
- (ins VR:$vs2, VMaskOp:$vm),
- "vpopc.m", "$vd, $vs2$vm">;
+
+// Vector mask population count vcpop
+def VCPOP_M : RVInstV<0b010000, 0b10000, OPMVV, (outs GPR:$vd),
+ (ins VR:$vs2, VMaskOp:$vm),
+ "vcpop.m", "$vd, $vs2$vm">,
+ Sched<[WriteVMPopV, ReadVMPopV, ReadVMask]>;
// vfirst find-first-set mask bit
def VFIRST_M : RVInstV<0b010000, 0b10001, OPMVV, (outs GPR:$vd),
- (ins VR:$vs2, VMaskOp:$vm),
- "vfirst.m", "$vd, $vs2$vm">;
+ (ins VR:$vs2, VMaskOp:$vm),
+ "vfirst.m", "$vd, $vs2$vm">,
+ Sched<[WriteVMFFSV, ReadVMFFSV, ReadVMask]>;
+
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
+def : InstAlias<"vpopc.m $vd, $vs2$vm",
+ (VCPOP_M GPR:$vd, VR:$vs2, VMaskOp:$vm), 0>;
+
let Constraints = "@earlyclobber $vd", RVVConstraint = Iota in {
+
// vmsbf.m set-before-first mask bit
-defm VMSBF_M : VALU_MV_VS2<"vmsbf.m", 0b010100, 0b00001>;
+defm VMSBF_M : VMSFS_MV_V<"vmsbf.m", 0b010100, 0b00001>;
// vmsif.m set-including-first mask bit
-defm VMSIF_M : VALU_MV_VS2<"vmsif.m", 0b010100, 0b00011>;
+defm VMSIF_M : VMSFS_MV_V<"vmsif.m", 0b010100, 0b00011>;
// vmsof.m set-only-first mask bit
-defm VMSOF_M : VALU_MV_VS2<"vmsof.m", 0b010100, 0b00010>;
+defm VMSOF_M : VMSFS_MV_V<"vmsof.m", 0b010100, 0b00010>;
// Vector Iota Instruction
-defm VIOTA_M : VALU_MV_VS2<"viota.m", 0b010100, 0b10000>;
+defm VIOTA_M : VMIOT_MV_V<"viota.m", 0b010100, 0b10000>;
+
} // Constraints = "@earlyclobber $vd", RVVConstraint = Iota
// Vector Element Index Instruction
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+
+let vs2 = 0 in
def VID_V : RVInstV<0b010100, 0b10001, OPMVV, (outs VR:$vd),
- (ins VMaskOp:$vm), "vid.v", "$vd$vm"> {
- let vs2 = 0;
-}
+ (ins VMaskOp:$vm), "vid.v", "$vd$vm">,
+ Sched<[WriteVMIdxV, ReadVMask]>;
// Integer Scalar Move Instructions
let vm = 1, RVVConstraint = NoConstraint in {
def VMV_X_S : RVInstV<0b010000, 0b00000, OPMVV, (outs GPR:$vd),
- (ins VR:$vs2), "vmv.x.s", "$vd, $vs2">;
+ (ins VR:$vs2), "vmv.x.s", "$vd, $vs2">,
+ Sched<[WriteVIMovVX, ReadVIMovVX]>;
let Constraints = "$vd = $vd_wb" in
def VMV_S_X : RVInstV2<0b010000, 0b00000, OPMVX, (outs VR:$vd_wb),
- (ins VR:$vd, GPR:$rs1), "vmv.s.x", "$vd, $rs1">;
-
+ (ins VR:$vd, GPR:$rs1), "vmv.s.x", "$vd, $rs1">,
+ Sched<[WriteVIMovXV, ReadVIMovXV, ReadVIMovXX]>;
}
+
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
+
} // Predicates = [HasStdExtV]
let Predicates = [HasStdExtV, HasStdExtF] in {
+
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, vm = 1,
RVVConstraint = NoConstraint in {
// Floating-Point Scalar Move Instructions
def VFMV_F_S : RVInstV<0b010000, 0b00000, OPFVV, (outs FPR32:$vd),
- (ins VR:$vs2), "vfmv.f.s", "$vd, $vs2">;
+ (ins VR:$vs2), "vfmv.f.s", "$vd, $vs2">,
+ Sched<[WriteVFMovVF, ReadVFMovVF]>;
let Constraints = "$vd = $vd_wb" in
def VFMV_S_F : RVInstV2<0b010000, 0b00000, OPFVF, (outs VR:$vd_wb),
- (ins VR:$vd, FPR32:$rs1), "vfmv.s.f", "$vd, $rs1">;
+ (ins VR:$vd, FPR32:$rs1), "vfmv.s.f", "$vd, $rs1">,
+ Sched<[WriteVFMovFV, ReadVFMovFV, ReadVFMovFX]>;
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0, vm = 1
+
} // Predicates = [HasStdExtV, HasStdExtF]
let Predicates = [HasStdExtV] in {
// Vector Slide Instructions
let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in {
-defm VSLIDEUP_V : VALU_IV_X_I<"vslideup", 0b001110, uimm5>;
-defm VSLIDE1UP_V : VALU_MV_X<"vslide1up", 0b001110>;
+defm VSLIDEUP_V : VSLD_IV_X_I<"vslideup", 0b001110, uimm5>;
+defm VSLIDE1UP_V : VSLD1_MV_X<"vslide1up", 0b001110>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp
-defm VSLIDEDOWN_V : VALU_IV_X_I<"vslidedown", 0b001111, uimm5>;
-defm VSLIDE1DOWN_V : VALU_MV_X<"vslide1down", 0b001111>;
+defm VSLIDEDOWN_V : VSLD_IV_X_I<"vslidedown", 0b001111, uimm5>;
+defm VSLIDE1DOWN_V : VSLD1_MV_X<"vslide1down", 0b001111>;
} // Predicates = [HasStdExtV]
let Predicates = [HasStdExtV, HasStdExtF] in {
let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in {
-defm VFSLIDE1UP_V : VALU_FV_F<"vfslide1up", 0b001110>;
+defm VFSLIDE1UP_V : VSLD1_FV_F<"vfslide1up", 0b001110>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp
-defm VFSLIDE1DOWN_V : VALU_FV_F<"vfslide1down", 0b001111>;
+defm VFSLIDE1DOWN_V : VSLD1_FV_F<"vfslide1down", 0b001111>;
} // Predicates = [HasStdExtV, HasStdExtF]
let Predicates = [HasStdExtV] in {
// Vector Register Gather Instruction
let Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather in {
-defm VRGATHER_V : VALU_IV_V_X_I<"vrgather", 0b001100, uimm5>;
-def VRGATHEREI16_VV : VALUVV<0b001110, OPIVV, "vrgatherei16.vv">;
+defm VRGATHER_V : VGTR_IV_V_X_I<"vrgather", 0b001100, uimm5>;
+def VRGATHEREI16_VV : VALUVV<0b001110, OPIVV, "vrgatherei16.vv">,
+ Sched<[WriteVGatherV, ReadVGatherV, ReadVGatherV]>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather
// Vector Compress Instruction
let Constraints = "@earlyclobber $vd", RVVConstraint = Vcompress in {
-defm VCOMPRESS_V : VALU_MV_Mask<"vcompress", 0b010111>;
+defm VCOMPRESS_V : VCPR_MV_Mask<"vcompress", 0b010111>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = Vcompress
let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
RVVConstraint = NoConstraint in {
-foreach nf = [1, 2, 4, 8] in {
- def VMV#nf#R_V : RVInstV<0b100111, !add(nf, -1), OPIVI, (outs VR:$vd),
- (ins VR:$vs2), "vmv" # nf # "r.v",
- "$vd, $vs2"> {
- let Uses = [];
- let vm = 1;
- }
+foreach n = [1, 2, 4, 8] in {
+ def VMV#n#R_V : RVInstV<0b100111, !add(n, -1), OPIVI, (outs VR:$vd),
+ (ins VR:$vs2), "vmv" # n # "r.v", "$vd, $vs2">,
+ VMVRSched<n> {
+ let Uses = [];
+ let vm = 1;
+}
}
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
} // Predicates = [HasStdExtV]
let Predicates = [HasStdExtZvlsseg] in {
foreach nf=2-8 in {
- def VLSEG#nf#E8_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth8, "vlseg"#nf#"e8.v">;
- def VLSEG#nf#E16_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth16, "vlseg"#nf#"e16.v">;
- def VLSEG#nf#E32_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth32, "vlseg"#nf#"e32.v">;
- def VLSEG#nf#E64_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth64, "vlseg"#nf#"e64.v">;
-
- def VLSEG#nf#E8FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth8, "vlseg"#nf#"e8ff.v">;
- def VLSEG#nf#E16FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth16, "vlseg"#nf#"e16ff.v">;
- def VLSEG#nf#E32FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth32, "vlseg"#nf#"e32ff.v">;
- def VLSEG#nf#E64FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth64, "vlseg"#nf#"e64ff.v">;
-
- def VSSEG#nf#E8_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth8, "vsseg"#nf#"e8.v">;
- def VSSEG#nf#E16_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth16, "vsseg"#nf#"e16.v">;
- def VSSEG#nf#E32_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth32, "vsseg"#nf#"e32.v">;
- def VSSEG#nf#E64_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth64, "vsseg"#nf#"e64.v">;
-
- // Vector Strided Instructions
- def VLSSEG#nf#E8_V : VStridedSegmentLoad<!add(nf, -1), LSWidth8, "vlsseg"#nf#"e8.v">;
- def VLSSEG#nf#E16_V : VStridedSegmentLoad<!add(nf, -1), LSWidth16, "vlsseg"#nf#"e16.v">;
- def VLSSEG#nf#E32_V : VStridedSegmentLoad<!add(nf, -1), LSWidth32, "vlsseg"#nf#"e32.v">;
- def VLSSEG#nf#E64_V : VStridedSegmentLoad<!add(nf, -1), LSWidth64, "vlsseg"#nf#"e64.v">;
-
- def VSSSEG#nf#E8_V : VStridedSegmentStore<!add(nf, -1), LSWidth8, "vssseg"#nf#"e8.v">;
- def VSSSEG#nf#E16_V : VStridedSegmentStore<!add(nf, -1), LSWidth16, "vssseg"#nf#"e16.v">;
- def VSSSEG#nf#E32_V : VStridedSegmentStore<!add(nf, -1), LSWidth32, "vssseg"#nf#"e32.v">;
- def VSSSEG#nf#E64_V : VStridedSegmentStore<!add(nf, -1), LSWidth64, "vssseg"#nf#"e64.v">;
-
- // Vector Indexed Instructions
- def VLUXSEG#nf#EI8_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord,
- LSWidth8, "vluxseg"#nf#"ei8.v">;
- def VLUXSEG#nf#EI16_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord,
- LSWidth16, "vluxseg"#nf#"ei16.v">;
- def VLUXSEG#nf#EI32_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord,
- LSWidth32, "vluxseg"#nf#"ei32.v">;
- def VLUXSEG#nf#EI64_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord,
- LSWidth64, "vluxseg"#nf#"ei64.v">;
-
- def VLOXSEG#nf#EI8_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder,
- LSWidth8, "vloxseg"#nf#"ei8.v">;
- def VLOXSEG#nf#EI16_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder,
- LSWidth16, "vloxseg"#nf#"ei16.v">;
- def VLOXSEG#nf#EI32_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder,
- LSWidth32, "vloxseg"#nf#"ei32.v">;
- def VLOXSEG#nf#EI64_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder,
- LSWidth64, "vloxseg"#nf#"ei64.v">;
-
- def VSUXSEG#nf#EI8_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord,
- LSWidth8, "vsuxseg"#nf#"ei8.v">;
- def VSUXSEG#nf#EI16_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord,
- LSWidth16, "vsuxseg"#nf#"ei16.v">;
- def VSUXSEG#nf#EI32_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord,
- LSWidth32, "vsuxseg"#nf#"ei32.v">;
- def VSUXSEG#nf#EI64_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord,
- LSWidth64, "vsuxseg"#nf#"ei64.v">;
-
- def VSOXSEG#nf#EI8_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder,
- LSWidth8, "vsoxseg"#nf#"ei8.v">;
- def VSOXSEG#nf#EI16_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder,
- LSWidth16, "vsoxseg"#nf#"ei16.v">;
- def VSOXSEG#nf#EI32_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder,
- LSWidth32, "vsoxseg"#nf#"ei32.v">;
- def VSOXSEG#nf#EI64_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder,
- LSWidth64, "vsoxseg"#nf#"ei64.v">;
+ foreach eew = [8, 16, 32, 64] in {
+ defvar w = !cast<RISCVWidth>("LSWidth"#eew);
+
+ def VLSEG#nf#E#eew#_V :
+ VUnitStrideSegmentLoad<!add(nf, -1), w, "vlseg"#nf#"e"#eew#".v">;
+ def VLSEG#nf#E#eew#FF_V :
+ VUnitStrideSegmentLoadFF<!add(nf, -1), w, "vlseg"#nf#"e"#eew#"ff.v">;
+ def VSSEG#nf#E#eew#_V :
+ VUnitStrideSegmentStore<!add(nf, -1), w, "vsseg"#nf#"e"#eew#".v">;
+
+ // Vector Strided Instructions
+ def VLSSEG#nf#E#eew#_V :
+ VStridedSegmentLoad<!add(nf, -1), w, "vlsseg"#nf#"e"#eew#".v">;
+ def VSSSEG#nf#E#eew#_V :
+ VStridedSegmentStore<!add(nf, -1), w, "vssseg"#nf#"e"#eew#".v">;
+
+ // Vector Indexed Instructions
+ def VLUXSEG#nf#EI#eew#_V :
+ VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord, w,
+ "vluxseg"#nf#"ei"#eew#".v">;
+ def VLOXSEG#nf#EI#eew#_V :
+ VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder, w,
+ "vloxseg"#nf#"ei"#eew#".v">;
+ def VSUXSEG#nf#EI#eew#_V :
+ VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord, w,
+ "vsuxseg"#nf#"ei"#eew#".v">;
+ def VSOXSEG#nf#EI#eew#_V :
+ VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder, w,
+ "vsoxseg"#nf#"ei"#eew#".v">;
+ }
}
} // Predicates = [HasStdExtZvlsseg]
let Predicates = [HasStdExtZvamo, HasStdExtA] in {
- defm VAMOSWAPEI8 : VAMO<AMOOPVamoSwap, LSWidth8, "vamoswapei8.v">;
- defm VAMOSWAPEI16 : VAMO<AMOOPVamoSwap, LSWidth16, "vamoswapei16.v">;
- defm VAMOSWAPEI32 : VAMO<AMOOPVamoSwap, LSWidth32, "vamoswapei32.v">;
-
- defm VAMOADDEI8 : VAMO<AMOOPVamoAdd, LSWidth8, "vamoaddei8.v">;
- defm VAMOADDEI16 : VAMO<AMOOPVamoAdd, LSWidth16, "vamoaddei16.v">;
- defm VAMOADDEI32 : VAMO<AMOOPVamoAdd, LSWidth32, "vamoaddei32.v">;
-
- defm VAMOXOREI8 : VAMO<AMOOPVamoXor, LSWidth8, "vamoxorei8.v">;
- defm VAMOXOREI16 : VAMO<AMOOPVamoXor, LSWidth16, "vamoxorei16.v">;
- defm VAMOXOREI32 : VAMO<AMOOPVamoXor, LSWidth32, "vamoxorei32.v">;
-
- defm VAMOANDEI8 : VAMO<AMOOPVamoAnd, LSWidth8, "vamoandei8.v">;
- defm VAMOANDEI16 : VAMO<AMOOPVamoAnd, LSWidth16, "vamoandei16.v">;
- defm VAMOANDEI32 : VAMO<AMOOPVamoAnd, LSWidth32, "vamoandei32.v">;
-
- defm VAMOOREI8 : VAMO<AMOOPVamoOr, LSWidth8, "vamoorei8.v">;
- defm VAMOOREI16 : VAMO<AMOOPVamoOr, LSWidth16, "vamoorei16.v">;
- defm VAMOOREI32 : VAMO<AMOOPVamoOr, LSWidth32, "vamoorei32.v">;
-
- defm VAMOMINEI8 : VAMO<AMOOPVamoMin, LSWidth8, "vamominei8.v">;
- defm VAMOMINEI16 : VAMO<AMOOPVamoMin, LSWidth16, "vamominei16.v">;
- defm VAMOMINEI32 : VAMO<AMOOPVamoMin, LSWidth32, "vamominei32.v">;
-
- defm VAMOMAXEI8 : VAMO<AMOOPVamoMax, LSWidth8, "vamomaxei8.v">;
- defm VAMOMAXEI16 : VAMO<AMOOPVamoMax, LSWidth16, "vamomaxei16.v">;
- defm VAMOMAXEI32 : VAMO<AMOOPVamoMax, LSWidth32, "vamomaxei32.v">;
-
- defm VAMOMINUEI8 : VAMO<AMOOPVamoMinu, LSWidth8, "vamominuei8.v">;
- defm VAMOMINUEI16 : VAMO<AMOOPVamoMinu, LSWidth16, "vamominuei16.v">;
- defm VAMOMINUEI32 : VAMO<AMOOPVamoMinu, LSWidth32, "vamominuei32.v">;
-
- defm VAMOMAXUEI8 : VAMO<AMOOPVamoMaxu, LSWidth8, "vamomaxuei8.v">;
- defm VAMOMAXUEI16 : VAMO<AMOOPVamoMaxu, LSWidth16, "vamomaxuei16.v">;
- defm VAMOMAXUEI32 : VAMO<AMOOPVamoMaxu, LSWidth32, "vamomaxuei32.v">;
+ foreach eew = [8, 16, 32] in {
+ defvar w = !cast<RISCVWidth>("LSWidth"#eew);
+ defm VAMOSWAPEI#eew : VAMO<AMOOPVamoSwap, w, "vamoswapei"#eew#".v">;
+ defm VAMOADDEI#eew : VAMO<AMOOPVamoAdd, w, "vamoaddei"#eew#".v">;
+ defm VAMOXOREI#eew : VAMO<AMOOPVamoXor, w, "vamoxorei"#eew#".v">;
+ defm VAMOANDEI#eew : VAMO<AMOOPVamoAnd, w, "vamoandei"#eew#".v">;
+ defm VAMOOREI#eew : VAMO<AMOOPVamoOr, w, "vamoorei"#eew#".v">;
+ defm VAMOMINEI#eew : VAMO<AMOOPVamoMin, w, "vamominei"#eew#".v">;
+ defm VAMOMAXEI#eew : VAMO<AMOOPVamoMax, w, "vamomaxei"#eew#".v">;
+ defm VAMOMINUEI#eew : VAMO<AMOOPVamoMinu, w, "vamominuei"#eew#".v">;
+ defm VAMOMAXUEI#eew : VAMO<AMOOPVamoMaxu, w, "vamomaxuei"#eew#".v">;
+ }
} // Predicates = [HasStdExtZvamo, HasStdExtA]
let Predicates = [HasStdExtZvamo, HasStdExtA, IsRV64] in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 0284ff6d1c6b..a82e333e6bab 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -23,7 +23,7 @@ def riscv_read_vlenb : SDNode<"RISCVISD::READ_VLENB",
// Operand that is allowed to be a register or a 5 bit immediate.
// This allows us to pick between VSETIVLI and VSETVLI opcodes using the same
// pseudo instructions.
-def AVL : RegisterOperand<GPR> {
+def AVL : RegisterOperand<GPRNoX0> {
let OperandNamespace = "RISCVOp";
let OperandType = "OPERAND_AVL";
}
@@ -40,6 +40,9 @@ def DecImm : SDNodeXForm<imm, [{
N->getValueType(0));
}]>;
+defvar TAIL_UNDISTURBED = 0;
+defvar TAIL_AGNOSTIC = 1;
+
//===----------------------------------------------------------------------===//
// Utilities.
//===----------------------------------------------------------------------===//
@@ -137,7 +140,9 @@ class octuple_to_str<int octuple> {
def VLOpFrag : PatFrag<(ops), (XLenVT (VLOp (XLenVT AVL:$vl)))>;
// Output pattern for X0 used to represent VLMAX in the pseudo instructions.
-def VLMax : OutPatFrag<(ops), (XLenVT X0)>;
+// We can't use the X0 register because the AVL operands use GPRNoX0.
+// This must be kept in sync with RISCV::VLMaxSentinel.
+def VLMax : OutPatFrag<(ops), (XLenVT -1)>;
// List of EEW.
defvar EEWList = [8, 16, 32, 64];
@@ -577,13 +582,11 @@ class PseudoToVInst<string PseudoInst> {
!subst("_B32", "",
!subst("_B64", "",
!subst("_MASK", "",
- !subst("_COMMUTABLE", "",
- !subst("_TA", "",
!subst("_TIED", "",
!subst("F16", "F",
!subst("F32", "F",
!subst("F64", "F",
- !subst("Pseudo", "", PseudoInst))))))))))))))))))))));
+ !subst("Pseudo", "", PseudoInst))))))))))))))))))));
}
// The destination vector register group for a masked vector instruction cannot
@@ -643,7 +646,7 @@ class VPseudoUSLoadMask<VReg RetClass, int EEW, bit isFF> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge,
GPR:$rs1,
- VMaskOp:$vm, AVL:$vl, ixlenimm:$sew),[]>,
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy),[]>,
RISCVVPseudo,
RISCVVLE</*Masked*/1, /*Strided*/0, /*FF*/isFF, log2<EEW>.val, VLMul> {
let mayLoad = 1;
@@ -653,6 +656,7 @@ class VPseudoUSLoadMask<VReg RetClass, int EEW, bit isFF> :
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 1;
+ let HasVecPolicyOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
@@ -674,7 +678,7 @@ class VPseudoSLoadMask<VReg RetClass, int EEW>:
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge,
GPR:$rs1, GPR:$rs2,
- VMaskOp:$vm, AVL:$vl, ixlenimm:$sew),[]>,
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy),[]>,
RISCVVPseudo,
RISCVVLE</*Masked*/1, /*Strided*/1, /*FF*/0, log2<EEW>.val, VLMul> {
let mayLoad = 1;
@@ -684,6 +688,7 @@ class VPseudoSLoadMask<VReg RetClass, int EEW>:
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 1;
+ let HasVecPolicyOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
@@ -708,7 +713,7 @@ class VPseudoILoadMask<VReg RetClass, VReg IdxClass, int EEW, bits<3> LMUL,
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge,
GPR:$rs1, IdxClass:$rs2,
- VMaskOp:$vm, AVL:$vl, ixlenimm:$sew),[]>,
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy),[]>,
RISCVVPseudo,
RISCVVLX</*Masked*/1, Ordered, log2<EEW>.val, VLMul, LMUL> {
let mayLoad = 1;
@@ -718,6 +723,7 @@ class VPseudoILoadMask<VReg RetClass, VReg IdxClass, int EEW, bits<3> LMUL,
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 1;
+ let HasVecPolicyOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
@@ -861,6 +867,22 @@ class VPseudoUnaryMask<VReg RetClass, VReg OpClass, string Constraint = ""> :
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
+class VPseudoUnaryMaskTA<VReg RetClass, VReg OpClass, string Constraint = ""> :
+ Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$merge, OpClass:$rs2,
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let Constraints = Join<[Constraint, "$rd = $merge"], ",">.ret;
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 1;
+ let HasVecPolicyOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
// mask unary operation without maskedoff
class VPseudoMaskUnarySOutMask:
Pseudo<(outs GPR:$rd),
@@ -976,6 +998,26 @@ class VPseudoBinaryMask<VReg RetClass,
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
+class VPseudoBinaryMaskTA<VReg RetClass,
+ RegisterClass Op1Class,
+ DAGOperand Op2Class,
+ string Constraint> :
+ Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
+ (ins GetVRegNoV0<RetClass>.R:$merge,
+ Op1Class:$rs2, Op2Class:$rs1,
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let Constraints = Join<[Constraint, "$rd = $merge"], ",">.ret;
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 1;
+ let HasVecPolicyOp = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
// Like VPseudoBinaryMask, but output can be V0.
class VPseudoBinaryMOutMask<VReg RetClass,
RegisterClass Op1Class,
@@ -1005,7 +1047,7 @@ class VPseudoTiedBinaryMask<VReg RetClass,
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge,
Op2Class:$rs1,
- VMaskOp:$vm, AVL:$vl, ixlenimm:$sew), []>,
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
@@ -1014,6 +1056,7 @@ class VPseudoTiedBinaryMask<VReg RetClass,
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 0; // Merge is also rs2.
+ let HasVecPolicyOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
@@ -1060,6 +1103,27 @@ class VPseudoTernaryNoMask<VReg RetClass,
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
+class VPseudoTernaryNoMaskWithPolicy<VReg RetClass,
+ RegisterClass Op1Class,
+ DAGOperand Op2Class,
+ string Constraint> :
+ Pseudo<(outs RetClass:$rd),
+ (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2,
+ AVL:$vl, ixlenimm:$sew, ixlenimm:$policy),
+ []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let Constraints = Join<[Constraint, "$rd = $rs3"], ",">.ret;
+ let HasVecPolicyOp = 1;
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 1;
+ let HasDummyMask = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
class VPseudoAMOWDNoMask<VReg RetClass,
VReg Op1Class> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$vd_wd),
@@ -1139,7 +1203,7 @@ class VPseudoUSSegLoadNoMask<VReg RetClass, int EEW, bits<4> NF, bit isFF>:
class VPseudoUSSegLoadMask<VReg RetClass, int EEW, bits<4> NF, bit isFF>:
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge, GPR:$rs1,
- VMaskOp:$vm, AVL:$vl, ixlenimm:$sew),[]>,
+ VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy),[]>,
RISCVVPseudo,
RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/isFF, log2<EEW>.val, VLMul> {
let mayLoad = 1;
@@ -1149,6 +1213,7 @@ class VPseudoUSSegLoadMask<VReg RetClass, int EEW, bits<4> NF, bit isFF>:
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 1;
+ let HasVecPolicyOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
@@ -1170,7 +1235,8 @@ class VPseudoSSegLoadNoMask<VReg RetClass, int EEW, bits<4> NF>:
class VPseudoSSegLoadMask<VReg RetClass, int EEW, bits<4> NF>:
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge, GPR:$rs1,
- GPR:$offset, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew),[]>,
+ GPR:$offset, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew,
+ ixlenimm:$policy),[]>,
RISCVVPseudo,
RISCVVLSEG<NF, /*Masked*/1, /*Strided*/1, /*FF*/0, log2<EEW>.val, VLMul> {
let mayLoad = 1;
@@ -1180,6 +1246,7 @@ class VPseudoSSegLoadMask<VReg RetClass, int EEW, bits<4> NF>:
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 1;
+ let HasVecPolicyOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
@@ -1205,7 +1272,8 @@ class VPseudoISegLoadMask<VReg RetClass, VReg IdxClass, int EEW, bits<3> LMUL,
bits<4> NF, bit Ordered>:
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge, GPR:$rs1,
- IdxClass:$offset, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew),[]>,
+ IdxClass:$offset, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew,
+ ixlenimm:$policy),[]>,
RISCVVPseudo,
RISCVVLXSEG<NF, /*Masked*/1, Ordered, log2<EEW>.val, VLMul, LMUL> {
let mayLoad = 1;
@@ -1217,6 +1285,7 @@ class VPseudoISegLoadMask<VReg RetClass, VReg IdxClass, int EEW, bits<3> LMUL,
let HasVLOp = 1;
let HasSEWOp = 1;
let HasMergeOp = 1;
+ let HasVecPolicyOp = 1;
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
@@ -1492,8 +1561,8 @@ multiclass VPseudoBinary<VReg RetClass,
let VLMul = MInfo.value in {
def "_" # MInfo.MX : VPseudoBinaryNoMask<RetClass, Op1Class, Op2Class,
Constraint>;
- def "_" # MInfo.MX # "_MASK" : VPseudoBinaryMask<RetClass, Op1Class, Op2Class,
- Constraint>;
+ def "_" # MInfo.MX # "_MASK" : VPseudoBinaryMaskTA<RetClass, Op1Class, Op2Class,
+ Constraint>;
}
}
@@ -1520,8 +1589,8 @@ multiclass VPseudoBinaryEmul<VReg RetClass,
let VLMul = lmul.value in {
def "_" # lmul.MX # "_" # emul.MX : VPseudoBinaryNoMask<RetClass, Op1Class, Op2Class,
Constraint>;
- def "_" # lmul.MX # "_" # emul.MX # "_MASK" : VPseudoBinaryMask<RetClass, Op1Class, Op2Class,
- Constraint>;
+ def "_" # lmul.MX # "_" # emul.MX # "_MASK" : VPseudoBinaryMaskTA<RetClass, Op1Class, Op2Class,
+ Constraint>;
}
}
@@ -1713,6 +1782,15 @@ multiclass VPseudoUnaryV_F_NoDummyMask {
}
}
+multiclass VPseudoUnaryTAV_V {
+ foreach m = MxList.m in {
+ let VLMul = m.value in {
+ def "_V_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.vrclass>;
+ def "_V_" # m.MX # "_MASK" : VPseudoUnaryMaskTA<m.vrclass, m.vrclass>;
+ }
+ }
+}
+
multiclass VPseudoUnaryV_V {
foreach m = MxList.m in {
let VLMul = m.value in {
@@ -1728,8 +1806,8 @@ multiclass PseudoUnaryV_VF2 {
{
let VLMul = m.value in {
def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f2vrclass, constraints>;
- def "_" # m.MX # "_MASK" : VPseudoUnaryMask<m.vrclass, m.f2vrclass,
- constraints>;
+ def "_" # m.MX # "_MASK" : VPseudoUnaryMaskTA<m.vrclass, m.f2vrclass,
+ constraints>;
}
}
}
@@ -1740,8 +1818,8 @@ multiclass PseudoUnaryV_VF4 {
{
let VLMul = m.value in {
def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f4vrclass, constraints>;
- def "_" # m.MX # "_MASK" : VPseudoUnaryMask<m.vrclass, m.f4vrclass,
- constraints>;
+ def "_" # m.MX # "_MASK" : VPseudoUnaryMaskTA<m.vrclass, m.f4vrclass,
+ constraints>;
}
}
}
@@ -1752,8 +1830,8 @@ multiclass PseudoUnaryV_VF8 {
{
let VLMul = m.value in {
def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f8vrclass, constraints>;
- def "_" # m.MX # "_MASK" : VPseudoUnaryMask<m.vrclass, m.f8vrclass,
- constraints>;
+ def "_" # m.MX # "_MASK" : VPseudoUnaryMaskTA<m.vrclass, m.f8vrclass,
+ constraints>;
}
}
}
@@ -1887,16 +1965,23 @@ multiclass VPseudoTernary<VReg RetClass,
}
}
-multiclass VPseudoTernaryV_VV<string Constraint = ""> {
+multiclass VPseudoTernaryWithPolicy<VReg RetClass,
+ RegisterClass Op1Class,
+ DAGOperand Op2Class,
+ LMULInfo MInfo,
+ string Constraint = "",
+ bit Commutable = 0> {
+ let VLMul = MInfo.value in {
+ let isCommutable = Commutable in
+ def "_" # MInfo.MX : VPseudoTernaryNoMaskWithPolicy<RetClass, Op1Class, Op2Class, Constraint>;
+ def "_" # MInfo.MX # "_MASK" : VPseudoBinaryMask<RetClass, Op1Class, Op2Class, Constraint>;
+ }
+}
+
+multiclass VPseudoTernaryV_VV_AAXA<string Constraint = ""> {
foreach m = MxList.m in {
- defm _VV : VPseudoTernary<m.vrclass, m.vrclass, m.vrclass, m, Constraint>;
-
- // Add a commutable version for use by IR mul+add.
- let isCommutable = 1, ForceTailAgnostic = true, VLMul = m.value in
- def "_VV_" # m.MX # "_COMMUTABLE" : VPseudoTernaryNoMask<m.vrclass,
- m.vrclass,
- m.vrclass,
- Constraint>;
+ defm _VV : VPseudoTernaryWithPolicy<m.vrclass, m.vrclass, m.vrclass, m,
+ Constraint, /*Commutable*/1>;
}
}
@@ -1906,68 +1991,39 @@ multiclass VPseudoTernaryV_VX<string Constraint = ""> {
}
multiclass VPseudoTernaryV_VX_AAXA<string Constraint = ""> {
- foreach m = MxList.m in {
- defm "_VX" : VPseudoTernary<m.vrclass, GPR, m.vrclass, m, Constraint>;
-
- // Add a commutable version for use by IR mul+add.
- let isCommutable = 1, ForceTailAgnostic = true, VLMul = m.value in
- def "_VX_" # m.MX # "_COMMUTABLE" :
- VPseudoTernaryNoMask<m.vrclass, GPR, m.vrclass, Constraint>;
- }
+ foreach m = MxList.m in
+ defm "_VX" : VPseudoTernaryWithPolicy<m.vrclass, GPR, m.vrclass, m,
+ Constraint, /*Commutable*/1>;
}
multiclass VPseudoTernaryV_VF_AAXA<string Constraint = ""> {
- foreach m = MxList.m in {
- foreach f = FPList.fpinfo in {
- defm "_V" # f.FX : VPseudoTernary<m.vrclass, f.fprclass, m.vrclass,
- m, Constraint>;
-
- // Add a commutable version for use by IR mul+add.
- let isCommutable = 1, ForceTailAgnostic = true, VLMul = m.value in
- def "_V" # f.FX # "_" # m.MX # "_COMMUTABLE" :
- VPseudoTernaryNoMask<m.vrclass, f.fprclass, m.vrclass, Constraint>;
- }
- }
+ foreach m = MxList.m in
+ foreach f = FPList.fpinfo in
+ defm "_V" # f.FX : VPseudoTernaryWithPolicy<m.vrclass, f.fprclass,
+ m.vrclass, m, Constraint,
+ /*Commutable*/1>;
}
multiclass VPseudoTernaryW_VV {
defvar constraint = "@earlyclobber $rd";
- foreach m = MxListW.m in {
- defm _VV : VPseudoTernary<m.wvrclass, m.vrclass, m.vrclass, m, constraint>;
-
- // Add a tail agnostic version for us by IR mul+add.
- let ForceTailAgnostic = true, VLMul = m.value in
- def "_VV_" # m.MX # "_TA" : VPseudoTernaryNoMask<m.wvrclass,
- m.vrclass,
- m.vrclass,
- constraint>;
- }
+ foreach m = MxListW.m in
+ defm _VV : VPseudoTernaryWithPolicy<m.wvrclass, m.vrclass, m.vrclass, m,
+ constraint>;
}
multiclass VPseudoTernaryW_VX {
defvar constraint = "@earlyclobber $rd";
- foreach m = MxListW.m in {
- defm "_VX" : VPseudoTernary<m.wvrclass, GPR, m.vrclass, m, constraint>;
-
- // Add a tail agnostic version for use by IR mul+add.
- let ForceTailAgnostic = true, VLMul = m.value in
- def "_VX_" # m.MX # "_TA" :
- VPseudoTernaryNoMask<m.wvrclass, GPR, m.vrclass, constraint>;
- }
+ foreach m = MxListW.m in
+ defm "_VX" : VPseudoTernaryWithPolicy<m.wvrclass, GPR, m.vrclass, m,
+ constraint>;
}
multiclass VPseudoTernaryW_VF {
defvar constraint = "@earlyclobber $rd";
foreach m = MxListW.m in
- foreach f = FPListW.fpinfo in {
- defm "_V" # f.FX : VPseudoTernary<m.wvrclass, f.fprclass, m.vrclass, m,
- constraint>;
-
- // Add a tail agnostic version for use by IR mul+add.
- let ForceTailAgnostic = true, VLMul = m.value in
- def "_V" # f.FX # "_" # m.MX # "_TA" :
- VPseudoTernaryNoMask<m.vrclass, f.fprclass, m.vrclass, constraint>;
- }
+ foreach f = FPListW.fpinfo in
+ defm "_V" # f.FX : VPseudoTernaryWithPolicy<m.wvrclass, f.fprclass,
+ m.vrclass, m, constraint>;
}
multiclass VPseudoTernaryV_VI<Operand ImmType = simm5, string Constraint = ""> {
@@ -1976,12 +2032,12 @@ multiclass VPseudoTernaryV_VI<Operand ImmType = simm5, string Constraint = ""> {
}
multiclass VPseudoTernaryV_VV_VX_AAXA<string Constraint = ""> {
- defm "" : VPseudoTernaryV_VV<Constraint>;
+ defm "" : VPseudoTernaryV_VV_AAXA<Constraint>;
defm "" : VPseudoTernaryV_VX_AAXA<Constraint>;
}
multiclass VPseudoTernaryV_VV_VF_AAXA<string Constraint = ""> {
- defm "" : VPseudoTernaryV_VV<Constraint>;
+ defm "" : VPseudoTernaryV_VV_AAXA<Constraint>;
defm "" : VPseudoTernaryV_VF_AAXA<Constraint>;
}
@@ -2033,8 +2089,8 @@ multiclass VPseudoConversion<VReg RetClass,
string Constraint = ""> {
let VLMul = MInfo.value in {
def "_" # MInfo.MX : VPseudoUnaryNoMask<RetClass, Op1Class, Constraint>;
- def "_" # MInfo.MX # "_MASK" : VPseudoUnaryMask<RetClass, Op1Class,
- Constraint>;
+ def "_" # MInfo.MX # "_MASK" : VPseudoUnaryMaskTA<RetClass, Op1Class,
+ Constraint>;
}
}
@@ -2217,6 +2273,26 @@ class VPatUnaryMask<string intrinsic_name,
(op2_type op2_reg_class:$rs2),
(mask_type V0), GPR:$vl, sew)>;
+class VPatUnaryMaskTA<string intrinsic_name,
+ string inst,
+ string kind,
+ ValueType result_type,
+ ValueType op2_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg result_reg_class,
+ VReg op2_reg_class> :
+ Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
+ (result_type result_reg_class:$merge),
+ (op2_type op2_reg_class:$rs2),
+ (mask_type V0),
+ VLOpFrag, (XLenVT timm:$policy))),
+ (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX#"_MASK")
+ (result_type result_reg_class:$merge),
+ (op2_type op2_reg_class:$rs2),
+ (mask_type V0), GPR:$vl, sew, (XLenVT timm:$policy))>;
+
class VPatMaskUnaryNoMask<string intrinsic_name,
string inst,
MTypeInfo mti> :
@@ -2318,6 +2394,28 @@ class VPatBinaryMask<string intrinsic_name,
(op2_type op2_kind:$rs2),
(mask_type V0), GPR:$vl, sew)>;
+class VPatBinaryMaskTA<string intrinsic_name,
+ string inst,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ ValueType mask_type,
+ int sew,
+ VReg result_reg_class,
+ VReg op1_reg_class,
+ DAGOperand op2_kind> :
+ Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
+ (result_type result_reg_class:$merge),
+ (op1_type op1_reg_class:$rs1),
+ (op2_type op2_kind:$rs2),
+ (mask_type V0),
+ VLOpFrag, (XLenVT timm:$policy))),
+ (!cast<Instruction>(inst#"_MASK")
+ (result_type result_reg_class:$merge),
+ (op1_type op1_reg_class:$rs1),
+ (op2_type op2_kind:$rs2),
+ (mask_type V0), GPR:$vl, sew, (XLenVT timm:$policy))>;
+
// Same as above but source operands are swapped.
class VPatBinaryMaskSwapped<string intrinsic_name,
string inst,
@@ -2370,11 +2468,11 @@ class VPatTiedBinaryMask<string intrinsic_name,
(result_type result_reg_class:$merge),
(op2_type op2_kind:$rs2),
(mask_type V0),
- VLOpFrag)),
+ VLOpFrag, (XLenVT timm:$policy))),
(!cast<Instruction>(inst#"_MASK_TIED")
(result_type result_reg_class:$merge),
(op2_type op2_kind:$rs2),
- (mask_type V0), GPR:$vl, sew)>;
+ (mask_type V0), GPR:$vl, sew, (XLenVT timm:$policy))>;
class VPatTernaryNoMask<string intrinsic,
string inst,
@@ -2382,7 +2480,6 @@ class VPatTernaryNoMask<string intrinsic,
ValueType result_type,
ValueType op1_type,
ValueType op2_type,
- ValueType mask_type,
int sew,
LMULInfo vlmul,
VReg result_reg_class,
@@ -2399,6 +2496,28 @@ class VPatTernaryNoMask<string intrinsic,
op2_kind:$rs2,
GPR:$vl, sew)>;
+class VPatTernaryNoMaskWithPolicy<string intrinsic,
+ string inst,
+ string kind,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg result_reg_class,
+ RegisterClass op1_reg_class,
+ DAGOperand op2_kind> :
+ Pat<(result_type (!cast<Intrinsic>(intrinsic)
+ (result_type result_reg_class:$rs3),
+ (op1_type op1_reg_class:$rs1),
+ (op2_type op2_kind:$rs2),
+ VLOpFrag)),
+ (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
+ result_reg_class:$rs3,
+ (op1_type op1_reg_class:$rs1),
+ op2_kind:$rs2,
+ GPR:$vl, sew, TAIL_UNDISTURBED)>;
+
class VPatTernaryMask<string intrinsic,
string inst,
string kind,
@@ -2514,9 +2633,9 @@ multiclass VPatUnaryV_VF<string intrinsic, string instruction, string suffix,
def : VPatUnaryNoMask<intrinsic, instruction, suffix,
vti.Vector, fti.Vector,
vti.Log2SEW, vti.LMul, fti.RegClass>;
- def : VPatUnaryMask<intrinsic, instruction, suffix,
- vti.Vector, fti.Vector, vti.Mask,
- vti.Log2SEW, vti.LMul, vti.RegClass, fti.RegClass>;
+ def : VPatUnaryMaskTA<intrinsic, instruction, suffix,
+ vti.Vector, fti.Vector, vti.Mask,
+ vti.Log2SEW, vti.LMul, vti.RegClass, fti.RegClass>;
}
}
@@ -2526,9 +2645,9 @@ multiclass VPatUnaryV_V<string intrinsic, string instruction,
def : VPatUnaryNoMask<intrinsic, instruction, "V",
vti.Vector, vti.Vector,
vti.Log2SEW, vti.LMul, vti.RegClass>;
- def : VPatUnaryMask<intrinsic, instruction, "V",
- vti.Vector, vti.Vector, vti.Mask,
- vti.Log2SEW, vti.LMul, vti.RegClass, vti.RegClass>;
+ def : VPatUnaryMaskTA<intrinsic, instruction, "V",
+ vti.Vector, vti.Vector, vti.Mask,
+ vti.Log2SEW, vti.LMul, vti.RegClass, vti.RegClass>;
}
}
@@ -2574,6 +2693,24 @@ multiclass VPatBinary<string intrinsic,
op2_kind>;
}
+multiclass VPatBinaryTA<string intrinsic,
+ string inst,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ ValueType mask_type,
+ int sew,
+ VReg result_reg_class,
+ VReg op1_reg_class,
+ DAGOperand op2_kind>
+{
+ def : VPatBinaryNoMask<intrinsic, inst, result_type, op1_type, op2_type,
+ sew, op1_reg_class, op2_kind>;
+ def : VPatBinaryMaskTA<intrinsic, inst, result_type, op1_type, op2_type,
+ mask_type, sew, result_reg_class, op1_reg_class,
+ op2_kind>;
+}
+
multiclass VPatBinarySwapped<string intrinsic,
string inst,
ValueType result_type,
@@ -2653,23 +2790,40 @@ multiclass VPatConversion<string intrinsic,
mask_type, sew, vlmul, result_reg_class, op1_reg_class>;
}
+multiclass VPatConversionTA<string intrinsic,
+ string inst,
+ string kind,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg result_reg_class,
+ VReg op1_reg_class>
+{
+ def : VPatUnaryNoMask<intrinsic, inst, kind, result_type, op1_type,
+ sew, vlmul, op1_reg_class>;
+ def : VPatUnaryMaskTA<intrinsic, inst, kind, result_type, op1_type,
+ mask_type, sew, vlmul, result_reg_class, op1_reg_class>;
+}
+
multiclass VPatBinaryV_VV<string intrinsic, string instruction,
list<VTypeInfo> vtilist> {
foreach vti = vtilist in
- defm : VPatBinary<intrinsic, instruction # "_VV_" # vti.LMul.MX,
- vti.Vector, vti.Vector, vti.Vector,vti.Mask,
- vti.Log2SEW, vti.RegClass,
- vti.RegClass, vti.RegClass>;
+ defm : VPatBinaryTA<intrinsic, instruction # "_VV_" # vti.LMul.MX,
+ vti.Vector, vti.Vector, vti.Vector, vti.Mask,
+ vti.Log2SEW, vti.RegClass,
+ vti.RegClass, vti.RegClass>;
}
multiclass VPatBinaryV_VV_INT<string intrinsic, string instruction,
list<VTypeInfo> vtilist> {
foreach vti = vtilist in {
defvar ivti = GetIntVTypeInfo<vti>.Vti;
- defm : VPatBinary<intrinsic, instruction # "_VV_" # vti.LMul.MX,
- vti.Vector, vti.Vector, ivti.Vector, vti.Mask,
- vti.Log2SEW, vti.RegClass,
- vti.RegClass, vti.RegClass>;
+ defm : VPatBinaryTA<intrinsic, instruction # "_VV_" # vti.LMul.MX,
+ vti.Vector, vti.Vector, ivti.Vector, vti.Mask,
+ vti.Log2SEW, vti.RegClass,
+ vti.RegClass, vti.RegClass>;
}
}
@@ -2684,10 +2838,10 @@ multiclass VPatBinaryV_VV_INT_EEW<string intrinsic, string instruction,
defvar emul_str = octuple_to_str<octuple_emul>.ret;
defvar ivti = !cast<VTypeInfo>("VI" # eew # emul_str);
defvar inst = instruction # "_VV_" # vti.LMul.MX # "_" # emul_str;
- defm : VPatBinary<intrinsic, inst,
- vti.Vector, vti.Vector, ivti.Vector, vti.Mask,
- vti.Log2SEW, vti.RegClass,
- vti.RegClass, ivti.RegClass>;
+ defm : VPatBinaryTA<intrinsic, inst,
+ vti.Vector, vti.Vector, ivti.Vector, vti.Mask,
+ vti.Log2SEW, vti.RegClass,
+ vti.RegClass, ivti.RegClass>;
}
}
}
@@ -2696,29 +2850,29 @@ multiclass VPatBinaryV_VX<string intrinsic, string instruction,
list<VTypeInfo> vtilist> {
foreach vti = vtilist in {
defvar kind = "V"#vti.ScalarSuffix;
- defm : VPatBinary<intrinsic, instruction#"_"#kind#"_"#vti.LMul.MX,
- vti.Vector, vti.Vector, vti.Scalar, vti.Mask,
- vti.Log2SEW, vti.RegClass,
- vti.RegClass, vti.ScalarRegClass>;
+ defm : VPatBinaryTA<intrinsic, instruction#"_"#kind#"_"#vti.LMul.MX,
+ vti.Vector, vti.Vector, vti.Scalar, vti.Mask,
+ vti.Log2SEW, vti.RegClass,
+ vti.RegClass, vti.ScalarRegClass>;
}
}
multiclass VPatBinaryV_VX_INT<string intrinsic, string instruction,
list<VTypeInfo> vtilist> {
foreach vti = vtilist in
- defm : VPatBinary<intrinsic, instruction # "_VX_" # vti.LMul.MX,
- vti.Vector, vti.Vector, XLenVT, vti.Mask,
- vti.Log2SEW, vti.RegClass,
- vti.RegClass, GPR>;
+ defm : VPatBinaryTA<intrinsic, instruction # "_VX_" # vti.LMul.MX,
+ vti.Vector, vti.Vector, XLenVT, vti.Mask,
+ vti.Log2SEW, vti.RegClass,
+ vti.RegClass, GPR>;
}
multiclass VPatBinaryV_VI<string intrinsic, string instruction,
list<VTypeInfo> vtilist, Operand imm_type> {
foreach vti = vtilist in
- defm : VPatBinary<intrinsic, instruction # "_VI_" # vti.LMul.MX,
- vti.Vector, vti.Vector, XLenVT, vti.Mask,
- vti.Log2SEW, vti.RegClass,
- vti.RegClass, imm_type>;
+ defm : VPatBinaryTA<intrinsic, instruction # "_VI_" # vti.LMul.MX,
+ vti.Vector, vti.Vector, XLenVT, vti.Mask,
+ vti.Log2SEW, vti.RegClass,
+ vti.RegClass, imm_type>;
}
multiclass VPatBinaryM_MM<string intrinsic, string instruction> {
@@ -2733,10 +2887,10 @@ multiclass VPatBinaryW_VV<string intrinsic, string instruction,
foreach VtiToWti = vtilist in {
defvar Vti = VtiToWti.Vti;
defvar Wti = VtiToWti.Wti;
- defm : VPatBinary<intrinsic, instruction # "_VV_" # Vti.LMul.MX,
- Wti.Vector, Vti.Vector, Vti.Vector, Vti.Mask,
- Vti.Log2SEW, Wti.RegClass,
- Vti.RegClass, Vti.RegClass>;
+ defm : VPatBinaryTA<intrinsic, instruction # "_VV_" # Vti.LMul.MX,
+ Wti.Vector, Vti.Vector, Vti.Vector, Vti.Mask,
+ Vti.Log2SEW, Wti.RegClass,
+ Vti.RegClass, Vti.RegClass>;
}
}
@@ -2746,10 +2900,10 @@ multiclass VPatBinaryW_VX<string intrinsic, string instruction,
defvar Vti = VtiToWti.Vti;
defvar Wti = VtiToWti.Wti;
defvar kind = "V"#Vti.ScalarSuffix;
- defm : VPatBinary<intrinsic, instruction#"_"#kind#"_"#Vti.LMul.MX,
- Wti.Vector, Vti.Vector, Vti.Scalar, Vti.Mask,
- Vti.Log2SEW, Wti.RegClass,
- Vti.RegClass, Vti.ScalarRegClass>;
+ defm : VPatBinaryTA<intrinsic, instruction#"_"#kind#"_"#Vti.LMul.MX,
+ Wti.Vector, Vti.Vector, Vti.Scalar, Vti.Mask,
+ Vti.Log2SEW, Wti.RegClass,
+ Vti.RegClass, Vti.ScalarRegClass>;
}
}
@@ -2765,10 +2919,10 @@ multiclass VPatBinaryW_WV<string intrinsic, string instruction,
def : VPatTiedBinaryMask<intrinsic, instruction # "_WV_" # Vti.LMul.MX,
Wti.Vector, Vti.Vector, Vti.Mask,
Vti.Log2SEW, Wti.RegClass, Vti.RegClass>;
- def : VPatBinaryMask<intrinsic, instruction # "_WV_" # Vti.LMul.MX,
- Wti.Vector, Wti.Vector, Vti.Vector, Vti.Mask,
- Vti.Log2SEW, Wti.RegClass,
- Wti.RegClass, Vti.RegClass>;
+ def : VPatBinaryMaskTA<intrinsic, instruction # "_WV_" # Vti.LMul.MX,
+ Wti.Vector, Wti.Vector, Vti.Vector, Vti.Mask,
+ Vti.Log2SEW, Wti.RegClass,
+ Wti.RegClass, Vti.RegClass>;
}
}
@@ -2778,10 +2932,10 @@ multiclass VPatBinaryW_WX<string intrinsic, string instruction,
defvar Vti = VtiToWti.Vti;
defvar Wti = VtiToWti.Wti;
defvar kind = "W"#Vti.ScalarSuffix;
- defm : VPatBinary<intrinsic, instruction#"_"#kind#"_"#Vti.LMul.MX,
- Wti.Vector, Wti.Vector, Vti.Scalar, Vti.Mask,
- Vti.Log2SEW, Wti.RegClass,
- Wti.RegClass, Vti.ScalarRegClass>;
+ defm : VPatBinaryTA<intrinsic, instruction#"_"#kind#"_"#Vti.LMul.MX,
+ Wti.Vector, Wti.Vector, Vti.Scalar, Vti.Mask,
+ Vti.Log2SEW, Wti.RegClass,
+ Wti.RegClass, Vti.ScalarRegClass>;
}
}
@@ -2790,10 +2944,10 @@ multiclass VPatBinaryV_WV<string intrinsic, string instruction,
foreach VtiToWti = vtilist in {
defvar Vti = VtiToWti.Vti;
defvar Wti = VtiToWti.Wti;
- defm : VPatBinary<intrinsic, instruction # "_WV_" # Vti.LMul.MX,
- Vti.Vector, Wti.Vector, Vti.Vector, Vti.Mask,
- Vti.Log2SEW, Vti.RegClass,
- Wti.RegClass, Vti.RegClass>;
+ defm : VPatBinaryTA<intrinsic, instruction # "_WV_" # Vti.LMul.MX,
+ Vti.Vector, Wti.Vector, Vti.Vector, Vti.Mask,
+ Vti.Log2SEW, Vti.RegClass,
+ Wti.RegClass, Vti.RegClass>;
}
}
@@ -2803,10 +2957,10 @@ multiclass VPatBinaryV_WX<string intrinsic, string instruction,
defvar Vti = VtiToWti.Vti;
defvar Wti = VtiToWti.Wti;
defvar kind = "W"#Vti.ScalarSuffix;
- defm : VPatBinary<intrinsic, instruction#"_"#kind#"_"#Vti.LMul.MX,
- Vti.Vector, Wti.Vector, Vti.Scalar, Vti.Mask,
- Vti.Log2SEW, Vti.RegClass,
- Wti.RegClass, Vti.ScalarRegClass>;
+ defm : VPatBinaryTA<intrinsic, instruction#"_"#kind#"_"#Vti.LMul.MX,
+ Vti.Vector, Wti.Vector, Vti.Scalar, Vti.Mask,
+ Vti.Log2SEW, Vti.RegClass,
+ Wti.RegClass, Vti.ScalarRegClass>;
}
}
@@ -2815,10 +2969,10 @@ multiclass VPatBinaryV_WI<string intrinsic, string instruction,
foreach VtiToWti = vtilist in {
defvar Vti = VtiToWti.Vti;
defvar Wti = VtiToWti.Wti;
- defm : VPatBinary<intrinsic, instruction # "_WI_" # Vti.LMul.MX,
- Vti.Vector, Wti.Vector, XLenVT, Vti.Mask,
- Vti.Log2SEW, Vti.RegClass,
- Wti.RegClass, uimm5>;
+ defm : VPatBinaryTA<intrinsic, instruction # "_WI_" # Vti.LMul.MX,
+ Vti.Vector, Wti.Vector, XLenVT, Vti.Mask,
+ Vti.Log2SEW, Vti.RegClass,
+ Wti.RegClass, uimm5>;
}
}
@@ -2989,20 +3143,40 @@ multiclass VPatTernary<string intrinsic,
RegisterClass op1_reg_class,
DAGOperand op2_kind> {
def : VPatTernaryNoMask<intrinsic, inst, kind, result_type, op1_type, op2_type,
- mask_type, sew, vlmul, result_reg_class, op1_reg_class,
- op2_kind>;
+ sew, vlmul, result_reg_class, op1_reg_class,
+ op2_kind>;
def : VPatTernaryMask<intrinsic, inst, kind, result_type, op1_type, op2_type,
mask_type, sew, vlmul, result_reg_class, op1_reg_class,
op2_kind>;
}
-multiclass VPatTernaryV_VV<string intrinsic, string instruction,
- list<VTypeInfo> vtilist> {
+multiclass VPatTernaryWithPolicy<string intrinsic,
+ string inst,
+ string kind,
+ ValueType result_type,
+ ValueType op1_type,
+ ValueType op2_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg result_reg_class,
+ RegisterClass op1_reg_class,
+ DAGOperand op2_kind> {
+ def : VPatTernaryNoMaskWithPolicy<intrinsic, inst, kind, result_type, op1_type,
+ op2_type, sew, vlmul, result_reg_class,
+ op1_reg_class, op2_kind>;
+ def : VPatTernaryMask<intrinsic, inst, kind, result_type, op1_type, op2_type,
+ mask_type, sew, vlmul, result_reg_class, op1_reg_class,
+ op2_kind>;
+}
+
+multiclass VPatTernaryV_VV_AAXA<string intrinsic, string instruction,
+ list<VTypeInfo> vtilist> {
foreach vti = vtilist in
- defm : VPatTernary<intrinsic, instruction, "VV",
- vti.Vector, vti.Vector, vti.Vector, vti.Mask,
- vti.Log2SEW, vti.LMul, vti.RegClass,
- vti.RegClass, vti.RegClass>;
+ defm : VPatTernaryWithPolicy<intrinsic, instruction, "VV",
+ vti.Vector, vti.Vector, vti.Vector, vti.Mask,
+ vti.Log2SEW, vti.LMul, vti.RegClass,
+ vti.RegClass, vti.RegClass>;
}
multiclass VPatTernaryV_VX<string intrinsic, string instruction,
@@ -3017,11 +3191,11 @@ multiclass VPatTernaryV_VX<string intrinsic, string instruction,
multiclass VPatTernaryV_VX_AAXA<string intrinsic, string instruction,
list<VTypeInfo> vtilist> {
foreach vti = vtilist in
- defm : VPatTernary<intrinsic, instruction,
- "V"#vti.ScalarSuffix,
- vti.Vector, vti.Scalar, vti.Vector, vti.Mask,
- vti.Log2SEW, vti.LMul, vti.RegClass,
- vti.ScalarRegClass, vti.RegClass>;
+ defm : VPatTernaryWithPolicy<intrinsic, instruction,
+ "V"#vti.ScalarSuffix,
+ vti.Vector, vti.Scalar, vti.Vector, vti.Mask,
+ vti.Log2SEW, vti.LMul, vti.RegClass,
+ vti.ScalarRegClass, vti.RegClass>;
}
multiclass VPatTernaryV_VI<string intrinsic, string instruction,
@@ -3038,10 +3212,10 @@ multiclass VPatTernaryW_VV<string intrinsic, string instruction,
foreach vtiToWti = vtilist in {
defvar vti = vtiToWti.Vti;
defvar wti = vtiToWti.Wti;
- defm : VPatTernary<intrinsic, instruction, "VV",
- wti.Vector, vti.Vector, vti.Vector,
- vti.Mask, vti.Log2SEW, vti.LMul,
- wti.RegClass, vti.RegClass, vti.RegClass>;
+ defm : VPatTernaryWithPolicy<intrinsic, instruction, "VV",
+ wti.Vector, vti.Vector, vti.Vector,
+ vti.Mask, vti.Log2SEW, vti.LMul,
+ wti.RegClass, vti.RegClass, vti.RegClass>;
}
}
@@ -3050,17 +3224,17 @@ multiclass VPatTernaryW_VX<string intrinsic, string instruction,
foreach vtiToWti = vtilist in {
defvar vti = vtiToWti.Vti;
defvar wti = vtiToWti.Wti;
- defm : VPatTernary<intrinsic, instruction,
- "V"#vti.ScalarSuffix,
- wti.Vector, vti.Scalar, vti.Vector,
- vti.Mask, vti.Log2SEW, vti.LMul,
- wti.RegClass, vti.ScalarRegClass, vti.RegClass>;
+ defm : VPatTernaryWithPolicy<intrinsic, instruction,
+ "V"#vti.ScalarSuffix,
+ wti.Vector, vti.Scalar, vti.Vector,
+ vti.Mask, vti.Log2SEW, vti.LMul,
+ wti.RegClass, vti.ScalarRegClass, vti.RegClass>;
}
}
multiclass VPatTernaryV_VV_VX_AAXA<string intrinsic, string instruction,
list<VTypeInfo> vtilist>
- : VPatTernaryV_VV<intrinsic, instruction, vtilist>,
+ : VPatTernaryV_VV_AAXA<intrinsic, instruction, vtilist>,
VPatTernaryV_VX_AAXA<intrinsic, instruction, vtilist>;
multiclass VPatTernaryV_VX_VI<string intrinsic, string instruction,
@@ -3131,8 +3305,8 @@ multiclass VPatReductionW_VS<string intrinsic, string instruction, bit IsFloat =
}
}
-multiclass VPatConversionVI_VF<string intrinsic,
- string instruction>
+multiclass VPatClassifyVI_VF<string intrinsic,
+ string instruction>
{
foreach fvti = AllFloatVectors in
{
@@ -3144,6 +3318,19 @@ multiclass VPatConversionVI_VF<string intrinsic,
}
}
+multiclass VPatConversionVI_VF<string intrinsic,
+ string instruction>
+{
+ foreach fvti = AllFloatVectors in
+ {
+ defvar ivti = GetIntVTypeInfo<fvti>.Vti;
+
+ defm : VPatConversionTA<intrinsic, instruction, "V",
+ ivti.Vector, fvti.Vector, ivti.Mask, fvti.Log2SEW,
+ fvti.LMul, ivti.RegClass, fvti.RegClass>;
+ }
+}
+
multiclass VPatConversionVF_VI<string intrinsic,
string instruction>
{
@@ -3151,9 +3338,9 @@ multiclass VPatConversionVF_VI<string intrinsic,
{
defvar ivti = GetIntVTypeInfo<fvti>.Vti;
- defm : VPatConversion<intrinsic, instruction, "V",
- fvti.Vector, ivti.Vector, fvti.Mask, ivti.Log2SEW,
- ivti.LMul, fvti.RegClass, ivti.RegClass>;
+ defm : VPatConversionTA<intrinsic, instruction, "V",
+ fvti.Vector, ivti.Vector, fvti.Mask, ivti.Log2SEW,
+ ivti.LMul, fvti.RegClass, ivti.RegClass>;
}
}
@@ -3163,9 +3350,9 @@ multiclass VPatConversionWI_VF<string intrinsic, string instruction> {
defvar fvti = fvtiToFWti.Vti;
defvar iwti = GetIntVTypeInfo<fvtiToFWti.Wti>.Vti;
- defm : VPatConversion<intrinsic, instruction, "V",
- iwti.Vector, fvti.Vector, iwti.Mask, fvti.Log2SEW,
- fvti.LMul, iwti.RegClass, fvti.RegClass>;
+ defm : VPatConversionTA<intrinsic, instruction, "V",
+ iwti.Vector, fvti.Vector, iwti.Mask, fvti.Log2SEW,
+ fvti.LMul, iwti.RegClass, fvti.RegClass>;
}
}
@@ -3175,9 +3362,9 @@ multiclass VPatConversionWF_VI<string intrinsic, string instruction> {
defvar vti = vtiToWti.Vti;
defvar fwti = vtiToWti.Wti;
- defm : VPatConversion<intrinsic, instruction, "V",
- fwti.Vector, vti.Vector, fwti.Mask, vti.Log2SEW,
- vti.LMul, fwti.RegClass, vti.RegClass>;
+ defm : VPatConversionTA<intrinsic, instruction, "V",
+ fwti.Vector, vti.Vector, fwti.Mask, vti.Log2SEW,
+ vti.LMul, fwti.RegClass, vti.RegClass>;
}
}
@@ -3187,9 +3374,9 @@ multiclass VPatConversionWF_VF <string intrinsic, string instruction> {
defvar fvti = fvtiToFWti.Vti;
defvar fwti = fvtiToFWti.Wti;
- defm : VPatConversion<intrinsic, instruction, "V",
- fwti.Vector, fvti.Vector, fwti.Mask, fvti.Log2SEW,
- fvti.LMul, fwti.RegClass, fvti.RegClass>;
+ defm : VPatConversionTA<intrinsic, instruction, "V",
+ fwti.Vector, fvti.Vector, fwti.Mask, fvti.Log2SEW,
+ fvti.LMul, fwti.RegClass, fvti.RegClass>;
}
}
@@ -3199,9 +3386,9 @@ multiclass VPatConversionVI_WF <string intrinsic, string instruction> {
defvar vti = vtiToWti.Vti;
defvar fwti = vtiToWti.Wti;
- defm : VPatConversion<intrinsic, instruction, "W",
- vti.Vector, fwti.Vector, vti.Mask, vti.Log2SEW,
- vti.LMul, vti.RegClass, fwti.RegClass>;
+ defm : VPatConversionTA<intrinsic, instruction, "W",
+ vti.Vector, fwti.Vector, vti.Mask, vti.Log2SEW,
+ vti.LMul, vti.RegClass, fwti.RegClass>;
}
}
@@ -3211,9 +3398,9 @@ multiclass VPatConversionVF_WI <string intrinsic, string instruction> {
defvar fvti = fvtiToFWti.Vti;
defvar iwti = GetIntVTypeInfo<fvtiToFWti.Wti>.Vti;
- defm : VPatConversion<intrinsic, instruction, "W",
- fvti.Vector, iwti.Vector, fvti.Mask, fvti.Log2SEW,
- fvti.LMul, fvti.RegClass, iwti.RegClass>;
+ defm : VPatConversionTA<intrinsic, instruction, "W",
+ fvti.Vector, iwti.Vector, fvti.Mask, fvti.Log2SEW,
+ fvti.LMul, fvti.RegClass, iwti.RegClass>;
}
}
@@ -3223,9 +3410,9 @@ multiclass VPatConversionVF_WF <string intrinsic, string instruction> {
defvar fvti = fvtiToFWti.Vti;
defvar fwti = fvtiToFWti.Wti;
- defm : VPatConversion<intrinsic, instruction, "W",
- fvti.Vector, fwti.Vector, fvti.Mask, fvti.Log2SEW,
- fvti.LMul, fvti.RegClass, fwti.RegClass>;
+ defm : VPatConversionTA<intrinsic, instruction, "W",
+ fvti.Vector, fwti.Vector, fvti.Mask, fvti.Log2SEW,
+ fvti.LMul, fvti.RegClass, fwti.RegClass>;
}
}
@@ -3271,7 +3458,7 @@ multiclass VPatAMOV_WD<string intrinsic,
// Pseudo instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
//===----------------------------------------------------------------------===//
// Pseudo Instructions for CodeGen
@@ -3326,7 +3513,12 @@ foreach lmul = MxList.m in {
// Pseudos.
let hasSideEffects = 1, mayLoad = 0, mayStore = 0, Defs = [VL, VTYPE] in {
-def PseudoVSETVLI : Pseudo<(outs GPR:$rd), (ins GPR:$rs1, VTypeIOp:$vtypei), []>;
+// Because rs1=X0 has a special meaning, we need a GPRNoX0 register class for
+// the case when we aren't using one of the special X0 encodings. Otherwise it
+// could accidentally be made X0 by MachineIR optimizations. To satisfy the
+// verifier, we also need a GPRX0 pseudo instruction for the special encodings.
+def PseudoVSETVLI : Pseudo<(outs GPR:$rd), (ins GPRNoX0:$rs1, VTypeIOp:$vtypei), []>;
+def PseudoVSETVLIX0 : Pseudo<(outs GPR:$rd), (ins GPRX0:$rs1, VTypeIOp:$vtypei), []>;
def PseudoVSETIVLI : Pseudo<(outs GPR:$rd), (ins uimm5:$rs1, VTypeIOp:$vtypei), []>;
}
@@ -3342,8 +3534,8 @@ def PseudoVSETIVLI : Pseudo<(outs GPR:$rd), (ins uimm5:$rs1, VTypeIOp:$vtypei),
defm PseudoVL : VPseudoUSLoad</*isFF=*/false>;
defm PseudoVS : VPseudoUSStore;
-defm PseudoVLE1 : VPseudoLoadMask;
-defm PseudoVSE1 : VPseudoStoreMask;
+defm PseudoVLM : VPseudoLoadMask;
+defm PseudoVSM : VPseudoStoreMask;
//===----------------------------------------------------------------------===//
// 7.5 Vector Strided Instructions
@@ -3427,14 +3619,16 @@ foreach vti = AllIntegerVectors in {
(vti.Vector vti.RegClass:$rs2),
(vti.Vector vti.RegClass:$rs1),
(vti.Mask V0),
- VLOpFrag)),
+ VLOpFrag,
+ (XLenVT timm:$policy))),
(!cast<Instruction>("PseudoVSUB_VV_"#vti.LMul.MX#"_MASK")
vti.RegClass:$merge,
vti.RegClass:$rs1,
vti.RegClass:$rs2,
(vti.Mask V0),
GPR:$vl,
- vti.Log2SEW)>;
+ vti.Log2SEW,
+ (XLenVT timm:$policy))>;
// Match VSUB with a small immediate to vadd.vi by negating the immediate.
def : Pat<(vti.Vector (int_riscv_vsub (vti.Vector vti.RegClass:$rs1),
@@ -3448,14 +3642,16 @@ foreach vti = AllIntegerVectors in {
(vti.Vector vti.RegClass:$rs1),
(vti.Scalar simm5_plus1:$rs2),
(vti.Mask V0),
- VLOpFrag)),
+ VLOpFrag,
+ (XLenVT timm:$policy))),
(!cast<Instruction>("PseudoVADD_VI_"#vti.LMul.MX#"_MASK")
vti.RegClass:$merge,
vti.RegClass:$rs1,
(NegImm simm5_plus1:$rs2),
(vti.Mask V0),
GPR:$vl,
- vti.Log2SEW)>;
+ vti.Log2SEW,
+ (XLenVT timm:$policy))>;
}
//===----------------------------------------------------------------------===//
@@ -3623,9 +3819,9 @@ let Uses = [VXRM], Defs = [VXSAT], hasSideEffects = 1 in {
defm PseudoVNCLIPU : VPseudoBinaryV_WV_WX_WI;
}
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
//===----------------------------------------------------------------------===//
// 14.2. Vector Single-Width Floating-Point Add/Subtract Instructions
//===----------------------------------------------------------------------===//
@@ -3676,17 +3872,17 @@ defm PseudoVFWNMSAC : VPseudoTernaryW_VV_VF;
//===----------------------------------------------------------------------===//
// 14.8. Vector Floating-Point Square-Root Instruction
//===----------------------------------------------------------------------===//
-defm PseudoVFSQRT : VPseudoUnaryV_V;
+defm PseudoVFSQRT : VPseudoUnaryTAV_V;
//===----------------------------------------------------------------------===//
// 14.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction
//===----------------------------------------------------------------------===//
-defm PseudoVFRSQRT7 : VPseudoUnaryV_V;
+defm PseudoVFRSQRT7 : VPseudoUnaryTAV_V;
//===----------------------------------------------------------------------===//
// 14.10. Vector Floating-Point Reciprocal Estimate Instruction
//===----------------------------------------------------------------------===//
-defm PseudoVFREC7 : VPseudoUnaryV_V;
+defm PseudoVFREC7 : VPseudoUnaryTAV_V;
//===----------------------------------------------------------------------===//
// 14.11. Vector Floating-Point Min/Max Instructions
@@ -3758,9 +3954,9 @@ defm PseudoVFNCVT_F_XU : VPseudoConversionV_W;
defm PseudoVFNCVT_F_X : VPseudoConversionV_W;
defm PseudoVFNCVT_F_F : VPseudoConversionV_W;
defm PseudoVFNCVT_ROD_F_F : VPseudoConversionV_W;
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
//===----------------------------------------------------------------------===//
// 15.1. Vector Single-Width Integer Reduction Instructions
//===----------------------------------------------------------------------===//
@@ -3776,26 +3972,30 @@ defm PseudoVREDMAX : VPseudoReductionV_VS;
//===----------------------------------------------------------------------===//
// 15.2. Vector Widening Integer Reduction Instructions
//===----------------------------------------------------------------------===//
+let IsRVVWideningReduction = 1 in {
defm PseudoVWREDSUMU : VPseudoReductionV_VS;
defm PseudoVWREDSUM : VPseudoReductionV_VS;
-} // Predicates = [HasStdExtV]
+}
+} // Predicates = [HasVInstructions]
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
//===----------------------------------------------------------------------===//
// 15.3. Vector Single-Width Floating-Point Reduction Instructions
//===----------------------------------------------------------------------===//
defm PseudoVFREDOSUM : VPseudoReductionV_VS;
-defm PseudoVFREDSUM : VPseudoReductionV_VS;
+defm PseudoVFREDUSUM : VPseudoReductionV_VS;
defm PseudoVFREDMIN : VPseudoReductionV_VS;
defm PseudoVFREDMAX : VPseudoReductionV_VS;
//===----------------------------------------------------------------------===//
// 15.4. Vector Widening Floating-Point Reduction Instructions
//===----------------------------------------------------------------------===//
-defm PseudoVFWREDSUM : VPseudoReductionV_VS;
+let IsRVVWideningReduction = 1 in {
+defm PseudoVFWREDUSUM : VPseudoReductionV_VS;
defm PseudoVFWREDOSUM : VPseudoReductionV_VS;
+}
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
//===----------------------------------------------------------------------===//
// 16. Vector Mask Instructions
@@ -3807,11 +4007,11 @@ defm PseudoVFWREDOSUM : VPseudoReductionV_VS;
defm PseudoVMAND: VPseudoBinaryM_MM;
defm PseudoVMNAND: VPseudoBinaryM_MM;
-defm PseudoVMANDNOT: VPseudoBinaryM_MM;
+defm PseudoVMANDN: VPseudoBinaryM_MM;
defm PseudoVMXOR: VPseudoBinaryM_MM;
defm PseudoVMOR: VPseudoBinaryM_MM;
defm PseudoVMNOR: VPseudoBinaryM_MM;
-defm PseudoVMORNOT: VPseudoBinaryM_MM;
+defm PseudoVMORN: VPseudoBinaryM_MM;
defm PseudoVMXNOR: VPseudoBinaryM_MM;
// Pseudo instructions
@@ -3819,10 +4019,10 @@ defm PseudoVMCLR : VPseudoNullaryPseudoM<"VMXOR">;
defm PseudoVMSET : VPseudoNullaryPseudoM<"VMXNOR">;
//===----------------------------------------------------------------------===//
-// 16.2. Vector mask population count vpopc
+// 16.2. Vector mask population count vcpop
//===----------------------------------------------------------------------===//
-defm PseudoVPOPC: VPseudoUnaryS_M;
+defm PseudoVCPOP: VPseudoUnaryS_M;
//===----------------------------------------------------------------------===//
// 16.3. vfirst find-first-set mask bit
@@ -3863,7 +4063,7 @@ defm PseudoVID : VPseudoMaskNullaryV;
// 17.1. Integer Scalar Move Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
foreach m = MxList.m in {
let VLMul = m.value in {
@@ -3880,13 +4080,13 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
}
}
}
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
//===----------------------------------------------------------------------===//
// 17.2. Floating-Point Scalar Move Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
foreach m = MxList.m in {
foreach f = FPList.fpinfo in {
@@ -3908,22 +4108,22 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
}
}
}
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
//===----------------------------------------------------------------------===//
// 17.3. Vector Slide Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
defm PseudoVSLIDEUP : VPseudoTernaryV_VX_VI<uimm5, "@earlyclobber $rd">;
defm PseudoVSLIDEDOWN : VPseudoTernaryV_VX_VI<uimm5>;
defm PseudoVSLIDE1UP : VPseudoBinaryV_VX<"@earlyclobber $rd">;
defm PseudoVSLIDE1DOWN : VPseudoBinaryV_VX;
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
defm PseudoVFSLIDE1UP : VPseudoBinaryV_VF<"@earlyclobber $rd">;
defm PseudoVFSLIDE1DOWN : VPseudoBinaryV_VF;
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
//===----------------------------------------------------------------------===//
// 17.4. Vector Register Gather Instructions
@@ -3955,15 +4155,15 @@ let Predicates = [HasStdExtZvamo] in {
defm : VPatAMOV_WD<"int_riscv_vamomaxu", "PseudoVAMOMAXU", AllIntegerVectors>;
} // Predicates = [HasStdExtZvamo]
-let Predicates = [HasStdExtZvamo, HasStdExtF] in {
+let Predicates = [HasStdExtZvamo, HasVInstructionsAnyF] in {
defm : VPatAMOV_WD<"int_riscv_vamoswap", "PseudoVAMOSWAP", AllFloatVectors>;
-} // Predicates = [HasStdExtZvamo, HasStdExtF]
+} // Predicates = [HasStdExtZvamo, HasVInstructionsAnyF]
//===----------------------------------------------------------------------===//
// 12. Vector Integer Arithmetic Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
//===----------------------------------------------------------------------===//
// 12.1. Vector Single-Width Integer Add and Subtract
//===----------------------------------------------------------------------===//
@@ -4279,9 +4479,9 @@ defm : VPatBinaryV_VV_VX_VI<"int_riscv_vssra", "PseudoVSSRA", AllIntegerVectors,
defm : VPatBinaryV_WV_WX_WI<"int_riscv_vnclipu", "PseudoVNCLIPU", AllWidenableIntVectors>;
defm : VPatBinaryV_WV_WX_WI<"int_riscv_vnclip", "PseudoVNCLIP", AllWidenableIntVectors>;
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
//===----------------------------------------------------------------------===//
// 14.2. Vector Single-Width Floating-Point Add/Subtract Instructions
//===----------------------------------------------------------------------===//
@@ -4372,12 +4572,16 @@ defm : VPatBinarySwappedM_VV<"int_riscv_vmfge", "PseudoVMFLE", AllFloatVectors>;
//===----------------------------------------------------------------------===//
// 14.14. Vector Floating-Point Classify Instruction
//===----------------------------------------------------------------------===//
-defm : VPatConversionVI_VF<"int_riscv_vfclass", "PseudoVFCLASS">;
+defm : VPatClassifyVI_VF<"int_riscv_vfclass", "PseudoVFCLASS">;
//===----------------------------------------------------------------------===//
// 14.15. Vector Floating-Point Merge Instruction
//===----------------------------------------------------------------------===//
// We can use vmerge.vvm to support vector-vector vfmerge.
+// NOTE: Clang previously used int_riscv_vfmerge for vector-vector, but now uses
+// int_riscv_vmerge. Support both for compatibility.
+defm : VPatBinaryV_VM<"int_riscv_vmerge", "PseudoVMERGE",
+ /*CarryOut = */0, /*vtilist=*/AllFloatVectors>;
defm : VPatBinaryV_VM<"int_riscv_vfmerge", "PseudoVMERGE",
/*CarryOut = */0, /*vtilist=*/AllFloatVectors>;
defm : VPatBinaryV_XM<"int_riscv_vfmerge", "PseudoVFMERGE",
@@ -4423,9 +4627,9 @@ defm : VPatConversionVF_WI <"int_riscv_vfncvt_f_xu_w", "PseudoVFNCVT_F_XU">;
defm : VPatConversionVF_WI <"int_riscv_vfncvt_f_x_w", "PseudoVFNCVT_F_X">;
defm : VPatConversionVF_WF<"int_riscv_vfncvt_f_f_w", "PseudoVFNCVT_F_F">;
defm : VPatConversionVF_WF<"int_riscv_vfncvt_rod_f_f_w", "PseudoVFNCVT_ROD_F_F">;
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
//===----------------------------------------------------------------------===//
// 15.1. Vector Single-Width Integer Reduction Instructions
//===----------------------------------------------------------------------===//
@@ -4443,40 +4647,40 @@ defm : VPatReductionV_VS<"int_riscv_vredmax", "PseudoVREDMAX">;
//===----------------------------------------------------------------------===//
defm : VPatReductionW_VS<"int_riscv_vwredsumu", "PseudoVWREDSUMU">;
defm : VPatReductionW_VS<"int_riscv_vwredsum", "PseudoVWREDSUM">;
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
//===----------------------------------------------------------------------===//
// 15.3. Vector Single-Width Floating-Point Reduction Instructions
//===----------------------------------------------------------------------===//
defm : VPatReductionV_VS<"int_riscv_vfredosum", "PseudoVFREDOSUM", /*IsFloat=*/1>;
-defm : VPatReductionV_VS<"int_riscv_vfredsum", "PseudoVFREDSUM", /*IsFloat=*/1>;
+defm : VPatReductionV_VS<"int_riscv_vfredusum", "PseudoVFREDUSUM", /*IsFloat=*/1>;
defm : VPatReductionV_VS<"int_riscv_vfredmin", "PseudoVFREDMIN", /*IsFloat=*/1>;
defm : VPatReductionV_VS<"int_riscv_vfredmax", "PseudoVFREDMAX", /*IsFloat=*/1>;
//===----------------------------------------------------------------------===//
// 15.4. Vector Widening Floating-Point Reduction Instructions
//===----------------------------------------------------------------------===//
-defm : VPatReductionW_VS<"int_riscv_vfwredsum", "PseudoVFWREDSUM", /*IsFloat=*/1>;
+defm : VPatReductionW_VS<"int_riscv_vfwredusum", "PseudoVFWREDUSUM", /*IsFloat=*/1>;
defm : VPatReductionW_VS<"int_riscv_vfwredosum", "PseudoVFWREDOSUM", /*IsFloat=*/1>;
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
//===----------------------------------------------------------------------===//
// 16. Vector Mask Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
//===----------------------------------------------------------------------===//
// 16.1 Vector Mask-Register Logical Instructions
//===----------------------------------------------------------------------===//
defm : VPatBinaryM_MM<"int_riscv_vmand", "PseudoVMAND">;
defm : VPatBinaryM_MM<"int_riscv_vmnand", "PseudoVMNAND">;
-defm : VPatBinaryM_MM<"int_riscv_vmandnot", "PseudoVMANDNOT">;
+defm : VPatBinaryM_MM<"int_riscv_vmandn", "PseudoVMANDN">;
defm : VPatBinaryM_MM<"int_riscv_vmxor", "PseudoVMXOR">;
defm : VPatBinaryM_MM<"int_riscv_vmor", "PseudoVMOR">;
defm : VPatBinaryM_MM<"int_riscv_vmnor", "PseudoVMNOR">;
-defm : VPatBinaryM_MM<"int_riscv_vmornot", "PseudoVMORNOT">;
+defm : VPatBinaryM_MM<"int_riscv_vmorn", "PseudoVMORN">;
defm : VPatBinaryM_MM<"int_riscv_vmxnor", "PseudoVMXNOR">;
// pseudo instructions
@@ -4484,9 +4688,9 @@ defm : VPatNullaryM<"int_riscv_vmclr", "PseudoVMCLR">;
defm : VPatNullaryM<"int_riscv_vmset", "PseudoVMSET">;
//===----------------------------------------------------------------------===//
-// 16.2. Vector mask population count vpopc
+// 16.2. Vector count population in mask vcpop.m
//===----------------------------------------------------------------------===//
-defm : VPatUnaryS_M<"int_riscv_vpopc", "PseudoVPOPC">;
+defm : VPatUnaryS_M<"int_riscv_vcpop", "PseudoVCPOP">;
//===----------------------------------------------------------------------===//
// 16.3. vfirst find-first-set mask bit
@@ -4518,7 +4722,7 @@ defm : VPatUnaryV_M<"int_riscv_viota", "PseudoVIOTA">;
//===----------------------------------------------------------------------===//
defm : VPatNullaryV<"int_riscv_vid", "PseudoVID">;
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
//===----------------------------------------------------------------------===//
// 17. Vector Permutation Instructions
@@ -4528,19 +4732,19 @@ defm : VPatNullaryV<"int_riscv_vid", "PseudoVID">;
// 17.1. Integer Scalar Move Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
foreach vti = AllIntegerVectors in {
def : Pat<(riscv_vmv_x_s (vti.Vector vti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVMV_X_S_" # vti.LMul.MX) $rs2, vti.Log2SEW)>;
// vmv.s.x is handled with a custom node in RISCVInstrInfoVVLPatterns.td
}
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
//===----------------------------------------------------------------------===//
// 17.2. Floating-Point Scalar Move Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
foreach fvti = AllFloatVectors in {
defvar instr = !cast<Instruction>("PseudoVFMV_"#fvti.ScalarSuffix#"_S_" #
fvti.LMul.MX);
@@ -4555,52 +4759,52 @@ foreach fvti = AllFloatVectors in {
(fvti.Scalar fvti.ScalarRegClass:$rs2),
GPR:$vl, fvti.Log2SEW)>;
}
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
//===----------------------------------------------------------------------===//
// 17.3. Vector Slide Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllIntegerVectors, uimm5>;
defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllIntegerVectors, uimm5>;
defm : VPatBinaryV_VX<"int_riscv_vslide1up", "PseudoVSLIDE1UP", AllIntegerVectors>;
defm : VPatBinaryV_VX<"int_riscv_vslide1down", "PseudoVSLIDE1DOWN", AllIntegerVectors>;
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllFloatVectors, uimm5>;
defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllFloatVectors, uimm5>;
defm : VPatBinaryV_VX<"int_riscv_vfslide1up", "PseudoVFSLIDE1UP", AllFloatVectors>;
defm : VPatBinaryV_VX<"int_riscv_vfslide1down", "PseudoVFSLIDE1DOWN", AllFloatVectors>;
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
//===----------------------------------------------------------------------===//
// 17.4. Vector Register Gather Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
AllIntegerVectors, uimm5>;
defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
/* eew */ 16, AllIntegerVectors>;
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
AllFloatVectors, uimm5>;
defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
/* eew */ 16, AllFloatVectors>;
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
//===----------------------------------------------------------------------===//
// 17.5. Vector Compress Instruction
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllIntegerVectors>;
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllFloatVectors>;
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
// Include the non-intrinsic ISel patterns
include "RISCVInstrInfoVSDPatterns.td"
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index 483fc8bfecda..711ad4335ece 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -89,8 +89,8 @@ multiclass VPatUSLoadStoreWholeVRSDNode<ValueType type,
multiclass VPatUSLoadStoreMaskSDNode<MTypeInfo m>
{
- defvar load_instr = !cast<Instruction>("PseudoVLE1_V_"#m.BX);
- defvar store_instr = !cast<Instruction>("PseudoVSE1_V_"#m.BX);
+ defvar load_instr = !cast<Instruction>("PseudoVLM_V_"#m.BX);
+ defvar store_instr = !cast<Instruction>("PseudoVSM_V_"#m.BX);
// Load
def : Pat<(m.Mask (load BaseAddr:$rs1)),
(load_instr BaseAddr:$rs1, m.AVL, m.Log2SEW)>;
@@ -103,11 +103,9 @@ class VPatBinarySDNode_VV<SDNode vop,
string instruction_name,
ValueType result_type,
ValueType op_type,
- ValueType mask_type,
int sew,
LMULInfo vlmul,
OutPatFrag avl,
- VReg RetClass,
VReg op_reg_class> :
Pat<(result_type (vop
(op_type op_reg_class:$rs1),
@@ -122,11 +120,9 @@ class VPatBinarySDNode_XI<SDNode vop,
string suffix,
ValueType result_type,
ValueType vop_type,
- ValueType mask_type,
int sew,
LMULInfo vlmul,
OutPatFrag avl,
- VReg RetClass,
VReg vop_reg_class,
ComplexPattern SplatPatKind,
DAGOperand xop_kind> :
@@ -141,11 +137,11 @@ class VPatBinarySDNode_XI<SDNode vop,
multiclass VPatBinarySDNode_VV_VX<SDNode vop, string instruction_name> {
foreach vti = AllIntegerVectors in {
def : VPatBinarySDNode_VV<vop, instruction_name,
- vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
- vti.LMul, vti.AVL, vti.RegClass, vti.RegClass>;
+ vti.Vector, vti.Vector, vti.Log2SEW,
+ vti.LMul, vti.AVL, vti.RegClass>;
def : VPatBinarySDNode_XI<vop, instruction_name, "VX",
- vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
- vti.LMul, vti.AVL, vti.RegClass, vti.RegClass,
+ vti.Vector, vti.Vector, vti.Log2SEW,
+ vti.LMul, vti.AVL, vti.RegClass,
SplatPat, GPR>;
}
}
@@ -155,8 +151,8 @@ multiclass VPatBinarySDNode_VV_VX_VI<SDNode vop, string instruction_name,
: VPatBinarySDNode_VV_VX<vop, instruction_name> {
foreach vti = AllIntegerVectors in {
def : VPatBinarySDNode_XI<vop, instruction_name, "VI",
- vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
- vti.LMul, vti.AVL, vti.RegClass, vti.RegClass,
+ vti.Vector, vti.Vector, vti.Log2SEW,
+ vti.LMul, vti.AVL, vti.RegClass,
!cast<ComplexPattern>(SplatPat#_#ImmType),
ImmType>;
}
@@ -167,11 +163,9 @@ class VPatBinarySDNode_VF<SDNode vop,
ValueType result_type,
ValueType vop_type,
ValueType xop_type,
- ValueType mask_type,
int sew,
LMULInfo vlmul,
OutPatFrag avl,
- VReg RetClass,
VReg vop_reg_class,
DAGOperand xop_kind> :
Pat<(result_type (vop (vop_type vop_reg_class:$rs1),
@@ -184,11 +178,11 @@ class VPatBinarySDNode_VF<SDNode vop,
multiclass VPatBinaryFPSDNode_VV_VF<SDNode vop, string instruction_name> {
foreach vti = AllFloatVectors in {
def : VPatBinarySDNode_VV<vop, instruction_name,
- vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
- vti.LMul, vti.AVL, vti.RegClass, vti.RegClass>;
+ vti.Vector, vti.Vector, vti.Log2SEW,
+ vti.LMul, vti.AVL, vti.RegClass>;
def : VPatBinarySDNode_VF<vop, instruction_name#"_V"#vti.ScalarSuffix,
- vti.Vector, vti.Vector, vti.Scalar, vti.Mask,
- vti.Log2SEW, vti.LMul, vti.AVL, vti.RegClass, vti.RegClass,
+ vti.Vector, vti.Vector, vti.Scalar,
+ vti.Log2SEW, vti.LMul, vti.AVL, vti.RegClass,
vti.ScalarRegClass>;
}
}
@@ -373,7 +367,7 @@ multiclass VPatNConvertFP2ISDNode_V<SDNode vop, string instruction_name> {
// Patterns.
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
// 7.4. Vector Unit-Stride Instructions
foreach vti = !listconcat(FractionalGroupIntegerVectors,
@@ -491,17 +485,17 @@ defm : VPatBinarySDNode_VV_VX<srem, "PseudoVREM">;
foreach vti = AllIntegerVectors in {
// NOTE: We choose VMADD because it has the most commuting freedom. So it
// works best with how TwoAddressInstructionPass tries commuting.
- defvar suffix = vti.LMul.MX # "_COMMUTABLE";
+ defvar suffix = vti.LMul.MX;
def : Pat<(vti.Vector (add vti.RegClass:$rs2,
(mul_oneuse vti.RegClass:$rs1, vti.RegClass:$rd))),
(!cast<Instruction>("PseudoVMADD_VV_"# suffix)
vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
- vti.AVL, vti.Log2SEW)>;
+ vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(vti.Vector (sub vti.RegClass:$rs2,
(mul_oneuse vti.RegClass:$rs1, vti.RegClass:$rd))),
(!cast<Instruction>("PseudoVNMSUB_VV_"# suffix)
vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
- vti.AVL, vti.Log2SEW)>;
+ vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>;
// The choice of VMADD here is arbitrary, vmadd.vx and vmacc.vx are equally
// commutable.
@@ -510,32 +504,32 @@ foreach vti = AllIntegerVectors in {
vti.RegClass:$rd))),
(!cast<Instruction>("PseudoVMADD_VX_" # suffix)
vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
- vti.AVL, vti.Log2SEW)>;
+ vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(vti.Vector (sub vti.RegClass:$rs2,
(mul_oneuse (SplatPat XLenVT:$rs1),
vti.RegClass:$rd))),
(!cast<Instruction>("PseudoVNMSUB_VX_" # suffix)
vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
- vti.AVL, vti.Log2SEW)>;
+ vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
// 12.15. Vector Integer Merge Instructions
foreach vti = AllIntegerVectors in {
- def : Pat<(vti.Vector (vselect (vti.Mask VMV0:$vm), vti.RegClass:$rs1,
+ def : Pat<(vti.Vector (vselect (vti.Mask V0), vti.RegClass:$rs1,
vti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVMERGE_VVM_"#vti.LMul.MX)
- vti.RegClass:$rs2, vti.RegClass:$rs1, VMV0:$vm,
+ vti.RegClass:$rs2, vti.RegClass:$rs1, (vti.Mask V0),
vti.AVL, vti.Log2SEW)>;
- def : Pat<(vti.Vector (vselect (vti.Mask VMV0:$vm), (SplatPat XLenVT:$rs1),
+ def : Pat<(vti.Vector (vselect (vti.Mask V0), (SplatPat XLenVT:$rs1),
vti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVMERGE_VXM_"#vti.LMul.MX)
- vti.RegClass:$rs2, GPR:$rs1, VMV0:$vm, vti.AVL, vti.Log2SEW)>;
+ vti.RegClass:$rs2, GPR:$rs1, (vti.Mask V0), vti.AVL, vti.Log2SEW)>;
- def : Pat<(vti.Vector (vselect (vti.Mask VMV0:$vm), (SplatPat_simm5 simm5:$rs1),
+ def : Pat<(vti.Vector (vselect (vti.Mask V0), (SplatPat_simm5 simm5:$rs1),
vti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVMERGE_VIM_"#vti.LMul.MX)
- vti.RegClass:$rs2, simm5:$rs1, VMV0:$vm, vti.AVL, vti.Log2SEW)>;
+ vti.RegClass:$rs2, simm5:$rs1, (vti.Mask V0), vti.AVL, vti.Log2SEW)>;
}
// 12.1. Vector Single-Width Saturating Add and Subtract
@@ -567,10 +561,10 @@ foreach mti = AllMasks in {
VR:$rs1, VR:$rs2, mti.AVL, mti.Log2SEW)>;
def : Pat<(mti.Mask (and VR:$rs1, (rvv_vnot VR:$rs2))),
- (!cast<Instruction>("PseudoVMANDNOT_MM_"#mti.LMul.MX)
+ (!cast<Instruction>("PseudoVMANDN_MM_"#mti.LMul.MX)
VR:$rs1, VR:$rs2, mti.AVL, mti.Log2SEW)>;
def : Pat<(mti.Mask (or VR:$rs1, (rvv_vnot VR:$rs2))),
- (!cast<Instruction>("PseudoVMORNOT_MM_"#mti.LMul.MX)
+ (!cast<Instruction>("PseudoVMORN_MM_"#mti.LMul.MX)
VR:$rs1, VR:$rs2, mti.AVL, mti.Log2SEW)>;
// Handle rvv_vnot the same as the vmnot.m pseudoinstruction.
@@ -579,9 +573,9 @@ foreach mti = AllMasks in {
VR:$rs, VR:$rs, mti.AVL, mti.Log2SEW)>;
}
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
// 14.2. Vector Single-Width Floating-Point Add/Subtract Instructions
defm : VPatBinaryFPSDNode_VV_VF<fadd, "PseudoVFADD">;
@@ -597,27 +591,27 @@ defm : VPatBinaryFPSDNode_R_VF<fdiv, "PseudoVFRDIV">;
foreach fvti = AllFloatVectors in {
// NOTE: We choose VFMADD because it has the most commuting freedom. So it
// works best with how TwoAddressInstructionPass tries commuting.
- defvar suffix = fvti.LMul.MX # "_COMMUTABLE";
+ defvar suffix = fvti.LMul.MX;
def : Pat<(fvti.Vector (fma fvti.RegClass:$rs1, fvti.RegClass:$rd,
fvti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVFMADD_VV_"# suffix)
fvti.RegClass:$rd, fvti.RegClass:$rs1, fvti.RegClass:$rs2,
- fvti.AVL, fvti.Log2SEW)>;
+ fvti.AVL, fvti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(fvti.Vector (fma fvti.RegClass:$rs1, fvti.RegClass:$rd,
(fneg fvti.RegClass:$rs2))),
(!cast<Instruction>("PseudoVFMSUB_VV_"# suffix)
fvti.RegClass:$rd, fvti.RegClass:$rs1, fvti.RegClass:$rs2,
- fvti.AVL, fvti.Log2SEW)>;
+ fvti.AVL, fvti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(fvti.Vector (fma (fneg fvti.RegClass:$rs1), fvti.RegClass:$rd,
(fneg fvti.RegClass:$rs2))),
(!cast<Instruction>("PseudoVFNMADD_VV_"# suffix)
fvti.RegClass:$rd, fvti.RegClass:$rs1, fvti.RegClass:$rs2,
- fvti.AVL, fvti.Log2SEW)>;
+ fvti.AVL, fvti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(fvti.Vector (fma (fneg fvti.RegClass:$rs1), fvti.RegClass:$rd,
fvti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVFNMSUB_VV_"# suffix)
fvti.RegClass:$rd, fvti.RegClass:$rs1, fvti.RegClass:$rs2,
- fvti.AVL, fvti.Log2SEW)>;
+ fvti.AVL, fvti.Log2SEW, TAIL_AGNOSTIC)>;
// The choice of VFMADD here is arbitrary, vfmadd.vf and vfmacc.vf are equally
// commutable.
@@ -625,35 +619,35 @@ foreach fvti = AllFloatVectors in {
fvti.RegClass:$rd, fvti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVFMADD_V" # fvti.ScalarSuffix # "_" # suffix)
fvti.RegClass:$rd, fvti.ScalarRegClass:$rs1, fvti.RegClass:$rs2,
- fvti.AVL, fvti.Log2SEW)>;
+ fvti.AVL, fvti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(fvti.Vector (fma (splat_vector fvti.ScalarRegClass:$rs1),
fvti.RegClass:$rd, (fneg fvti.RegClass:$rs2))),
(!cast<Instruction>("PseudoVFMSUB_V" # fvti.ScalarSuffix # "_" # suffix)
fvti.RegClass:$rd, fvti.ScalarRegClass:$rs1, fvti.RegClass:$rs2,
- fvti.AVL, fvti.Log2SEW)>;
+ fvti.AVL, fvti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(fvti.Vector (fma (splat_vector fvti.ScalarRegClass:$rs1),
(fneg fvti.RegClass:$rd), (fneg fvti.RegClass:$rs2))),
(!cast<Instruction>("PseudoVFNMADD_V" # fvti.ScalarSuffix # "_" # suffix)
fvti.RegClass:$rd, fvti.ScalarRegClass:$rs1, fvti.RegClass:$rs2,
- fvti.AVL, fvti.Log2SEW)>;
+ fvti.AVL, fvti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(fvti.Vector (fma (splat_vector fvti.ScalarRegClass:$rs1),
(fneg fvti.RegClass:$rd), fvti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVFNMSUB_V" # fvti.ScalarSuffix # "_" # suffix)
fvti.RegClass:$rd, fvti.ScalarRegClass:$rs1, fvti.RegClass:$rs2,
- fvti.AVL, fvti.Log2SEW)>;
+ fvti.AVL, fvti.Log2SEW, TAIL_AGNOSTIC)>;
// The splat might be negated.
def : Pat<(fvti.Vector (fma (fneg (splat_vector fvti.ScalarRegClass:$rs1)),
fvti.RegClass:$rd, (fneg fvti.RegClass:$rs2))),
(!cast<Instruction>("PseudoVFNMADD_V" # fvti.ScalarSuffix # "_" # suffix)
fvti.RegClass:$rd, fvti.ScalarRegClass:$rs1, fvti.RegClass:$rs2,
- fvti.AVL, fvti.Log2SEW)>;
+ fvti.AVL, fvti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(fvti.Vector (fma (fneg (splat_vector fvti.ScalarRegClass:$rs1)),
fvti.RegClass:$rd, fvti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVFNMSUB_V" # fvti.ScalarSuffix # "_" # suffix)
fvti.RegClass:$rd, fvti.ScalarRegClass:$rs1, fvti.RegClass:$rs2,
- fvti.AVL, fvti.Log2SEW)>;
+ fvti.AVL, fvti.Log2SEW, TAIL_AGNOSTIC)>;
}
foreach vti = AllFloatVectors in {
@@ -711,25 +705,25 @@ defm : VPatFPSetCCSDNode_VV_VF_FV<SETOLE, "PseudoVMFLE", "PseudoVMFGE">;
// 12.15. Vector Integer Merge Instructions
// 14.15. Vector Floating-Point Merge Instruction
foreach fvti = AllFloatVectors in {
- def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), fvti.RegClass:$rs1,
+ def : Pat<(fvti.Vector (vselect (fvti.Mask V0), fvti.RegClass:$rs1,
fvti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX)
- fvti.RegClass:$rs2, fvti.RegClass:$rs1, VMV0:$vm,
+ fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0),
fvti.AVL, fvti.Log2SEW)>;
- def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm),
+ def : Pat<(fvti.Vector (vselect (fvti.Mask V0),
(splat_vector fvti.ScalarRegClass:$rs1),
fvti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX)
fvti.RegClass:$rs2,
(fvti.Scalar fvti.ScalarRegClass:$rs1),
- VMV0:$vm, fvti.AVL, fvti.Log2SEW)>;
+ (fvti.Mask V0), fvti.AVL, fvti.Log2SEW)>;
- def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm),
+ def : Pat<(fvti.Vector (vselect (fvti.Mask V0),
(splat_vector (fvti.Scalar fpimm0)),
fvti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX)
- fvti.RegClass:$rs2, 0, VMV0:$vm, fvti.AVL, fvti.Log2SEW)>;
+ fvti.RegClass:$rs2, 0, (fvti.Mask V0), fvti.AVL, fvti.Log2SEW)>;
}
// 14.17. Vector Single-Width Floating-Point/Integer Type-Convert Instructions
@@ -763,13 +757,13 @@ foreach fvtiToFWti = AllWidenableFloatVectors in {
(!cast<Instruction>("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX)
fwti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW)>;
}
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
//===----------------------------------------------------------------------===//
// Vector Splats
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
foreach vti = AllIntegerVectors in {
def : Pat<(vti.Vector (SplatPat GPR:$rs1)),
(!cast<Instruction>("PseudoVMV_V_X_" # vti.LMul.MX)
@@ -778,9 +772,9 @@ foreach vti = AllIntegerVectors in {
(!cast<Instruction>("PseudoVMV_V_I_" # vti.LMul.MX)
simm5:$rs1, vti.AVL, vti.Log2SEW)>;
}
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
foreach fvti = AllFloatVectors in {
def : Pat<(fvti.Vector (splat_vector fvti.ScalarRegClass:$rs1)),
(!cast<Instruction>("PseudoVFMV_V_"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)
@@ -791,12 +785,12 @@ foreach fvti = AllFloatVectors in {
(!cast<Instruction>("PseudoVMV_V_I_"#fvti.LMul.MX)
0, fvti.AVL, fvti.Log2SEW)>;
}
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
//===----------------------------------------------------------------------===//
// Vector Element Extracts
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtV, HasStdExtF] in
+let Predicates = [HasVInstructionsAnyF] in
foreach vti = AllFloatVectors in {
defvar vmv_f_s_inst = !cast<Instruction>(!strconcat("PseudoVFMV_",
vti.ScalarSuffix,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index c9c42152c47b..73b97e1c3675 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -199,7 +199,7 @@ def true_mask : PatLeaf<(riscv_vmset_vl (XLenVT srcvalue))>;
def riscv_vmnot_vl : PatFrag<(ops node:$rs, node:$vl),
(riscv_vmxor_vl node:$rs, true_mask, node:$vl)>;
-def riscv_vpopc_vl : SDNode<"RISCVISD::VPOPC_VL",
+def riscv_vcpop_vl : SDNode<"RISCVISD::VCPOP_VL",
SDTypeProfile<1, 3, [SDTCisVT<0, XLenVT>,
SDTCisVec<1>, SDTCisInt<1>,
SDTCVecEltisVT<2, i1>,
@@ -230,9 +230,9 @@ def SDT_RISCVVWMUL_VL : SDTypeProfile<1, 4, [SDTCisVec<0>,
def riscv_vwmul_vl : SDNode<"RISCVISD::VWMUL_VL", SDT_RISCVVWMUL_VL, [SDNPCommutative]>;
def riscv_vwmulu_vl : SDNode<"RISCVISD::VWMULU_VL", SDT_RISCVVWMUL_VL, [SDNPCommutative]>;
-def SDTRVVVecReduce : SDTypeProfile<1, 4, [
- SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCVecEltisVT<3, i1>,
- SDTCisSameNumEltsAs<1, 3>, SDTCisVT<4, XLenVT>
+def SDTRVVVecReduce : SDTypeProfile<1, 5, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisSameAs<0, 3>,
+ SDTCVecEltisVT<4, i1>, SDTCisSameNumEltsAs<2, 4>, SDTCisVT<5, XLenVT>
]>;
def riscv_mul_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D),
@@ -273,7 +273,6 @@ multiclass VPatBinaryVL_VV<SDNode vop,
ValueType mask_type,
int sew,
LMULInfo vlmul,
- VReg RetClass,
VReg op_reg_class> {
def : Pat<(result_type (vop
(op_type op_reg_class:$rs1),
@@ -287,13 +286,13 @@ multiclass VPatBinaryVL_VV<SDNode vop,
def : Pat<(result_type (vop
(op_type op_reg_class:$rs1),
(op_type op_reg_class:$rs2),
- (mask_type VMV0:$vm),
+ (mask_type V0),
VLOpFrag)),
(!cast<Instruction>(instruction_name#"_VV_"# vlmul.MX#"_MASK")
(result_type (IMPLICIT_DEF)),
op_reg_class:$rs1,
op_reg_class:$rs2,
- VMV0:$vm, GPR:$vl, sew)>;
+ (mask_type V0), GPR:$vl, sew, TAIL_AGNOSTIC)>;
}
multiclass VPatBinaryVL_XI<SDNode vop,
@@ -304,7 +303,6 @@ multiclass VPatBinaryVL_XI<SDNode vop,
ValueType mask_type,
int sew,
LMULInfo vlmul,
- VReg RetClass,
VReg vop_reg_class,
ComplexPattern SplatPatKind,
DAGOperand xop_kind> {
@@ -320,24 +318,23 @@ multiclass VPatBinaryVL_XI<SDNode vop,
def : Pat<(result_type (vop
(vop_type vop_reg_class:$rs1),
(vop_type (SplatPatKind (XLenVT xop_kind:$rs2))),
- (mask_type VMV0:$vm),
+ (mask_type V0),
VLOpFrag)),
(!cast<Instruction>(instruction_name#_#suffix#_# vlmul.MX#"_MASK")
(result_type (IMPLICIT_DEF)),
vop_reg_class:$rs1,
xop_kind:$rs2,
- VMV0:$vm, GPR:$vl, sew)>;
+ (mask_type V0), GPR:$vl, sew, TAIL_AGNOSTIC)>;
}
multiclass VPatBinaryVL_VV_VX<SDNode vop, string instruction_name> {
foreach vti = AllIntegerVectors in {
defm : VPatBinaryVL_VV<vop, instruction_name,
vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
- vti.LMul, vti.RegClass, vti.RegClass>;
+ vti.LMul, vti.RegClass>;
defm : VPatBinaryVL_XI<vop, instruction_name, "VX",
vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
- vti.LMul, vti.RegClass, vti.RegClass,
- SplatPat, GPR>;
+ vti.LMul, vti.RegClass, SplatPat, GPR>;
}
}
@@ -347,7 +344,7 @@ multiclass VPatBinaryVL_VV_VX_VI<SDNode vop, string instruction_name,
foreach vti = AllIntegerVectors in {
defm : VPatBinaryVL_XI<vop, instruction_name, "VI",
vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
- vti.LMul, vti.RegClass, vti.RegClass,
+ vti.LMul, vti.RegClass,
!cast<ComplexPattern>(SplatPat#_#ImmType),
ImmType>;
}
@@ -359,11 +356,10 @@ multiclass VPatBinaryWVL_VV_VX<SDNode vop, string instruction_name> {
defvar wti = VtiToWti.Wti;
defm : VPatBinaryVL_VV<vop, instruction_name,
wti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
- vti.LMul, wti.RegClass, vti.RegClass>;
+ vti.LMul, vti.RegClass>;
defm : VPatBinaryVL_XI<vop, instruction_name, "VX",
wti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
- vti.LMul, wti.RegClass, vti.RegClass,
- SplatPat, GPR>;
+ vti.LMul, vti.RegClass, SplatPat, GPR>;
}
}
@@ -374,7 +370,6 @@ class VPatBinaryVL_VF<SDNode vop,
ValueType mask_type,
int sew,
LMULInfo vlmul,
- VReg RetClass,
VReg vop_reg_class,
RegisterClass scalar_reg_class> :
Pat<(result_type (vop (vop_type vop_reg_class:$rs1),
@@ -390,11 +385,10 @@ multiclass VPatBinaryFPVL_VV_VF<SDNode vop, string instruction_name> {
foreach vti = AllFloatVectors in {
defm : VPatBinaryVL_VV<vop, instruction_name,
vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
- vti.LMul, vti.RegClass, vti.RegClass>;
+ vti.LMul, vti.RegClass>;
def : VPatBinaryVL_VF<vop, instruction_name#"_V"#vti.ScalarSuffix,
vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
- vti.LMul, vti.RegClass, vti.RegClass,
- vti.ScalarRegClass>;
+ vti.LMul, vti.RegClass, vti.ScalarRegClass>;
}
}
@@ -589,14 +583,22 @@ multiclass VPatNConvertI2FPSDNode_V_VL<SDNode vop, string instruction_name> {
multiclass VPatReductionVL<SDNode vop, string instruction_name, bit is_float> {
foreach vti = !if(is_float, AllFloatVectors, AllIntegerVectors) in {
defvar vti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # vti.SEW # "M1");
- def: Pat<(vti_m1.Vector (vop (vti.Vector vti.RegClass:$rs1), VR:$rs2,
+ def: Pat<(vti_m1.Vector (vop (vti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1), VR:$rs2,
(vti.Mask true_mask),
VLOpFrag)),
(!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX)
- (vti_m1.Vector (IMPLICIT_DEF)),
+ (vti_m1.Vector VR:$merge),
(vti.Vector vti.RegClass:$rs1),
(vti_m1.Vector VR:$rs2),
GPR:$vl, vti.Log2SEW)>;
+
+ def: Pat<(vti_m1.Vector (vop (vti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1), VR:$rs2,
+ (vti.Mask V0), VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_MASK")
+ (vti_m1.Vector VR:$merge),
+ (vti.Vector vti.RegClass:$rs1),
+ (vti_m1.Vector VR:$rs2),
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
}
}
@@ -604,7 +606,7 @@ multiclass VPatReductionVL<SDNode vop, string instruction_name, bit is_float> {
// Patterns.
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
// 7.4. Vector Unit-Stride Instructions
foreach vti = AllVectors in {
@@ -620,8 +622,8 @@ foreach vti = AllVectors in {
}
foreach mti = AllMasks in {
- defvar load_instr = !cast<Instruction>("PseudoVLE1_V_"#mti.BX);
- defvar store_instr = !cast<Instruction>("PseudoVSE1_V_"#mti.BX);
+ defvar load_instr = !cast<Instruction>("PseudoVLM_V_"#mti.BX);
+ defvar store_instr = !cast<Instruction>("PseudoVSM_V_"#mti.BX);
def : Pat<(mti.Mask (riscv_vle_vl BaseAddr:$rs1, VLOpFrag)),
(load_instr BaseAddr:$rs1, GPR:$vl, mti.Log2SEW)>;
def : Pat<(riscv_vse_vl (mti.Mask VR:$rs2), BaseAddr:$rs1,
@@ -641,22 +643,22 @@ foreach vti = AllIntegerVectors in {
(!cast<Instruction>("PseudoVRSUB_VX_"# vti.LMul.MX)
vti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>;
def : Pat<(riscv_sub_vl (vti.Vector (SplatPat (XLenVT GPR:$rs2))),
- (vti.Vector vti.RegClass:$rs1), (vti.Mask VMV0:$vm),
+ (vti.Vector vti.RegClass:$rs1), (vti.Mask V0),
VLOpFrag),
(!cast<Instruction>("PseudoVRSUB_VX_"# vti.LMul.MX#"_MASK")
(vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, GPR:$rs2,
- VMV0:$vm, GPR:$vl, vti.Log2SEW)>;
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(riscv_sub_vl (vti.Vector (SplatPat_simm5 simm5:$rs2)),
(vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask),
VLOpFrag),
(!cast<Instruction>("PseudoVRSUB_VI_"# vti.LMul.MX)
vti.RegClass:$rs1, simm5:$rs2, GPR:$vl, vti.Log2SEW)>;
def : Pat<(riscv_sub_vl (vti.Vector (SplatPat_simm5 simm5:$rs2)),
- (vti.Vector vti.RegClass:$rs1), (vti.Mask VMV0:$vm),
+ (vti.Vector vti.RegClass:$rs1), (vti.Mask V0),
VLOpFrag),
(!cast<Instruction>("PseudoVRSUB_VI_"# vti.LMul.MX#"_MASK")
(vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, simm5:$rs2,
- VMV0:$vm, GPR:$vl, vti.Log2SEW)>;
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
// 12.3. Vector Integer Extension
@@ -794,7 +796,7 @@ defm : VPatBinaryWVL_VV_VX<riscv_vwmulu_vl, "PseudoVWMULU">;
foreach vti = AllIntegerVectors in {
// NOTE: We choose VMADD because it has the most commuting freedom. So it
// works best with how TwoAddressInstructionPass tries commuting.
- defvar suffix = vti.LMul.MX # "_COMMUTABLE";
+ defvar suffix = vti.LMul.MX;
def : Pat<(vti.Vector
(riscv_add_vl vti.RegClass:$rs2,
(riscv_mul_vl_oneuse vti.RegClass:$rs1,
@@ -803,7 +805,7 @@ foreach vti = AllIntegerVectors in {
(vti.Mask true_mask), VLOpFrag)),
(!cast<Instruction>("PseudoVMADD_VV_"# suffix)
vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
- GPR:$vl, vti.Log2SEW)>;
+ GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(vti.Vector
(riscv_sub_vl vti.RegClass:$rs2,
(riscv_mul_vl_oneuse vti.RegClass:$rs1,
@@ -812,7 +814,7 @@ foreach vti = AllIntegerVectors in {
(vti.Mask true_mask), VLOpFrag)),
(!cast<Instruction>("PseudoVNMSUB_VV_"# suffix)
vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
- GPR:$vl, vti.Log2SEW)>;
+ GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
// The choice of VMADD here is arbitrary, vmadd.vx and vmacc.vx are equally
// commutable.
@@ -824,7 +826,7 @@ foreach vti = AllIntegerVectors in {
(vti.Mask true_mask), VLOpFrag)),
(!cast<Instruction>("PseudoVMADD_VX_" # suffix)
vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
- GPR:$vl, vti.Log2SEW)>;
+ GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(vti.Vector
(riscv_sub_vl vti.RegClass:$rs2,
(riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1),
@@ -834,7 +836,7 @@ foreach vti = AllIntegerVectors in {
(vti.Mask true_mask), VLOpFrag)),
(!cast<Instruction>("PseudoVNMSUB_VX_" # suffix)
vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
- GPR:$vl, vti.Log2SEW)>;
+ GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
// 12.14. Vector Widening Integer Multiply-Add Instructions
@@ -847,18 +849,18 @@ foreach vtiTowti = AllWidenableIntVectors in {
(vti.Vector vti.RegClass:$rs2),
(vti.Mask true_mask), VLOpFrag),
(vti.Mask true_mask), VLOpFrag)),
- (!cast<Instruction>("PseudoVWMACC_VV_" # vti.LMul.MX # "_TA")
+ (!cast<Instruction>("PseudoVWMACC_VV_" # vti.LMul.MX)
wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
- GPR:$vl, vti.Log2SEW)>;
+ GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(wti.Vector
(riscv_add_vl wti.RegClass:$rd,
(riscv_vwmulu_vl_oneuse vti.RegClass:$rs1,
(vti.Vector vti.RegClass:$rs2),
(vti.Mask true_mask), VLOpFrag),
(vti.Mask true_mask), VLOpFrag)),
- (!cast<Instruction>("PseudoVWMACCU_VV_" # vti.LMul.MX # "_TA")
+ (!cast<Instruction>("PseudoVWMACCU_VV_" # vti.LMul.MX)
wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
- GPR:$vl, vti.Log2SEW)>;
+ GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(wti.Vector
(riscv_add_vl wti.RegClass:$rd,
@@ -866,43 +868,43 @@ foreach vtiTowti = AllWidenableIntVectors in {
(vti.Vector vti.RegClass:$rs2),
(vti.Mask true_mask), VLOpFrag),
(vti.Mask true_mask), VLOpFrag)),
- (!cast<Instruction>("PseudoVWMACC_VX_" # vti.LMul.MX # "_TA")
+ (!cast<Instruction>("PseudoVWMACC_VX_" # vti.LMul.MX)
wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
- GPR:$vl, vti.Log2SEW)>;
+ GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(wti.Vector
(riscv_add_vl wti.RegClass:$rd,
(riscv_vwmulu_vl_oneuse (SplatPat XLenVT:$rs1),
(vti.Vector vti.RegClass:$rs2),
(vti.Mask true_mask), VLOpFrag),
(vti.Mask true_mask), VLOpFrag)),
- (!cast<Instruction>("PseudoVWMACCU_VX_" # vti.LMul.MX # "_TA")
+ (!cast<Instruction>("PseudoVWMACCU_VX_" # vti.LMul.MX)
wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
- GPR:$vl, vti.Log2SEW)>;
+ GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
// 12.15. Vector Integer Merge Instructions
foreach vti = AllIntegerVectors in {
- def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+ def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0),
vti.RegClass:$rs1,
vti.RegClass:$rs2,
VLOpFrag)),
(!cast<Instruction>("PseudoVMERGE_VVM_"#vti.LMul.MX)
- vti.RegClass:$rs2, vti.RegClass:$rs1, VMV0:$vm,
+ vti.RegClass:$rs2, vti.RegClass:$rs1, (vti.Mask V0),
GPR:$vl, vti.Log2SEW)>;
- def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+ def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0),
(SplatPat XLenVT:$rs1),
vti.RegClass:$rs2,
VLOpFrag)),
(!cast<Instruction>("PseudoVMERGE_VXM_"#vti.LMul.MX)
- vti.RegClass:$rs2, GPR:$rs1, VMV0:$vm, GPR:$vl, vti.Log2SEW)>;
+ vti.RegClass:$rs2, GPR:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
- def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+ def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0),
(SplatPat_simm5 simm5:$rs1),
vti.RegClass:$rs2,
VLOpFrag)),
(!cast<Instruction>("PseudoVMERGE_VIM_"#vti.LMul.MX)
- vti.RegClass:$rs2, simm5:$rs1, VMV0:$vm, GPR:$vl, vti.Log2SEW)>;
+ vti.RegClass:$rs2, simm5:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
}
// 12.16. Vector Integer Move Instructions
@@ -923,10 +925,10 @@ defm : VPatBinaryVL_VV_VX_VI<riscv_uaddsat_vl, "PseudoVSADDU">;
defm : VPatBinaryVL_VV_VX<riscv_ssubsat_vl, "PseudoVSSUB">;
defm : VPatBinaryVL_VV_VX<riscv_usubsat_vl, "PseudoVSSUBU">;
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
// 15.1. Vector Single-Width Integer Reduction Instructions
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
defm : VPatReductionVL<rvv_vecreduce_ADD_vl, "PseudoVREDSUM", /*is_float*/0>;
defm : VPatReductionVL<rvv_vecreduce_UMAX_vl, "PseudoVREDMAXU", /*is_float*/0>;
defm : VPatReductionVL<rvv_vecreduce_SMAX_vl, "PseudoVREDMAX", /*is_float*/0>;
@@ -935,17 +937,17 @@ defm : VPatReductionVL<rvv_vecreduce_SMIN_vl, "PseudoVREDMIN", /*is_float*/0>;
defm : VPatReductionVL<rvv_vecreduce_AND_vl, "PseudoVREDAND", /*is_float*/0>;
defm : VPatReductionVL<rvv_vecreduce_OR_vl, "PseudoVREDOR", /*is_float*/0>;
defm : VPatReductionVL<rvv_vecreduce_XOR_vl, "PseudoVREDXOR", /*is_float*/0>;
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
// 15.3. Vector Single-Width Floating-Point Reduction Instructions
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
defm : VPatReductionVL<rvv_vecreduce_SEQ_FADD_vl, "PseudoVFREDOSUM", /*is_float*/1>;
-defm : VPatReductionVL<rvv_vecreduce_FADD_vl, "PseudoVFREDSUM", /*is_float*/1>;
+defm : VPatReductionVL<rvv_vecreduce_FADD_vl, "PseudoVFREDUSUM", /*is_float*/1>;
defm : VPatReductionVL<rvv_vecreduce_FMIN_vl, "PseudoVFREDMIN", /*is_float*/1>;
defm : VPatReductionVL<rvv_vecreduce_FMAX_vl, "PseudoVFREDMAX", /*is_float*/1>;
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
// 14.2. Vector Single-Width Floating-Point Add/Subtract Instructions
defm : VPatBinaryFPVL_VV_VF<riscv_fadd_vl, "PseudoVFADD">;
@@ -961,13 +963,13 @@ defm : VPatBinaryFPVL_R_VF<riscv_fdiv_vl, "PseudoVFRDIV">;
foreach vti = AllFloatVectors in {
// NOTE: We choose VFMADD because it has the most commuting freedom. So it
// works best with how TwoAddressInstructionPass tries commuting.
- defvar suffix = vti.LMul.MX # "_COMMUTABLE";
+ defvar suffix = vti.LMul.MX;
def : Pat<(vti.Vector (riscv_fma_vl vti.RegClass:$rs1, vti.RegClass:$rd,
vti.RegClass:$rs2, (vti.Mask true_mask),
VLOpFrag)),
(!cast<Instruction>("PseudoVFMADD_VV_"# suffix)
vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
- GPR:$vl, vti.Log2SEW)>;
+ GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(vti.Vector (riscv_fma_vl vti.RegClass:$rs1, vti.RegClass:$rd,
(riscv_fneg_vl vti.RegClass:$rs2,
(vti.Mask true_mask),
@@ -976,7 +978,7 @@ foreach vti = AllFloatVectors in {
VLOpFrag)),
(!cast<Instruction>("PseudoVFMSUB_VV_"# suffix)
vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
- GPR:$vl, vti.Log2SEW)>;
+ GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(vti.Vector (riscv_fma_vl (riscv_fneg_vl vti.RegClass:$rs1,
(vti.Mask true_mask),
VLOpFrag),
@@ -988,7 +990,7 @@ foreach vti = AllFloatVectors in {
VLOpFrag)),
(!cast<Instruction>("PseudoVFNMADD_VV_"# suffix)
vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
- GPR:$vl, vti.Log2SEW)>;
+ GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(vti.Vector (riscv_fma_vl (riscv_fneg_vl vti.RegClass:$rs1,
(vti.Mask true_mask),
VLOpFrag),
@@ -997,7 +999,7 @@ foreach vti = AllFloatVectors in {
VLOpFrag)),
(!cast<Instruction>("PseudoVFNMSUB_VV_"# suffix)
vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
- GPR:$vl, vti.Log2SEW)>;
+ GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
// The choice of VFMADD here is arbitrary, vfmadd.vf and vfmacc.vf are equally
// commutable.
@@ -1007,7 +1009,7 @@ foreach vti = AllFloatVectors in {
VLOpFrag)),
(!cast<Instruction>("PseudoVFMADD_V" # vti.ScalarSuffix # "_" # suffix)
vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
- GPR:$vl, vti.Log2SEW)>;
+ GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(vti.Vector (riscv_fma_vl (SplatFPOp vti.ScalarRegClass:$rs1),
vti.RegClass:$rd,
(riscv_fneg_vl vti.RegClass:$rs2,
@@ -1017,7 +1019,7 @@ foreach vti = AllFloatVectors in {
VLOpFrag)),
(!cast<Instruction>("PseudoVFMSUB_V" # vti.ScalarSuffix # "_" # suffix)
vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
- GPR:$vl, vti.Log2SEW)>;
+ GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(vti.Vector (riscv_fma_vl (SplatFPOp vti.ScalarRegClass:$rs1),
(riscv_fneg_vl vti.RegClass:$rd,
(vti.Mask true_mask),
@@ -1029,7 +1031,7 @@ foreach vti = AllFloatVectors in {
VLOpFrag)),
(!cast<Instruction>("PseudoVFNMADD_V" # vti.ScalarSuffix # "_" # suffix)
vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
- GPR:$vl, vti.Log2SEW)>;
+ GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(vti.Vector (riscv_fma_vl (SplatFPOp vti.ScalarRegClass:$rs1),
(riscv_fneg_vl vti.RegClass:$rd,
(vti.Mask true_mask),
@@ -1039,7 +1041,7 @@ foreach vti = AllFloatVectors in {
VLOpFrag)),
(!cast<Instruction>("PseudoVFNMSUB_V" # vti.ScalarSuffix # "_" # suffix)
vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
- GPR:$vl, vti.Log2SEW)>;
+ GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
// The splat might be negated.
def : Pat<(vti.Vector (riscv_fma_vl (riscv_fneg_vl (SplatFPOp vti.ScalarRegClass:$rs1),
@@ -1053,7 +1055,7 @@ foreach vti = AllFloatVectors in {
VLOpFrag)),
(!cast<Instruction>("PseudoVFNMADD_V" # vti.ScalarSuffix # "_" # suffix)
vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
- GPR:$vl, vti.Log2SEW)>;
+ GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(vti.Vector (riscv_fma_vl (riscv_fneg_vl (SplatFPOp vti.ScalarRegClass:$rs1),
(vti.Mask true_mask),
VLOpFrag),
@@ -1062,7 +1064,7 @@ foreach vti = AllFloatVectors in {
VLOpFrag)),
(!cast<Instruction>("PseudoVFNMSUB_V" # vti.ScalarSuffix # "_" # suffix)
vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
- GPR:$vl, vti.Log2SEW)>;
+ GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
// 14.11. Vector Floating-Point MIN/MAX Instructions
@@ -1126,29 +1128,29 @@ foreach fvti = AllFloatVectors in {
// Floating-point vselects:
// 12.15. Vector Integer Merge Instructions
// 14.15. Vector Floating-Point Merge Instruction
- def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask VMV0:$vm),
+ def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0),
fvti.RegClass:$rs1,
fvti.RegClass:$rs2,
VLOpFrag)),
(!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX)
- fvti.RegClass:$rs2, fvti.RegClass:$rs1, VMV0:$vm,
+ fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0),
GPR:$vl, fvti.Log2SEW)>;
- def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask VMV0:$vm),
+ def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0),
(SplatFPOp fvti.ScalarRegClass:$rs1),
fvti.RegClass:$rs2,
VLOpFrag)),
(!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX)
fvti.RegClass:$rs2,
(fvti.Scalar fvti.ScalarRegClass:$rs1),
- VMV0:$vm, GPR:$vl, fvti.Log2SEW)>;
+ (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
- def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask VMV0:$vm),
+ def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0),
(SplatFPOp (fvti.Scalar fpimm0)),
fvti.RegClass:$rs2,
VLOpFrag)),
(!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX)
- fvti.RegClass:$rs2, 0, VMV0:$vm, GPR:$vl, fvti.Log2SEW)>;
+ fvti.RegClass:$rs2, 0, (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
// 14.16. Vector Floating-Point Move Instruction
// If we're splatting fpimm0, use vmv.v.x vd, x0.
@@ -1207,9 +1209,9 @@ foreach fvti = AllFloatVectors in {
}
}
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
foreach mti = AllMasks in {
// 16.1 Vector Mask-Register Logical Instructions
@@ -1231,12 +1233,12 @@ foreach mti = AllMasks in {
def : Pat<(mti.Mask (riscv_vmand_vl VR:$rs1,
(riscv_vmnot_vl VR:$rs2, VLOpFrag),
VLOpFrag)),
- (!cast<Instruction>("PseudoVMANDNOT_MM_" # mti.LMul.MX)
+ (!cast<Instruction>("PseudoVMANDN_MM_" # mti.LMul.MX)
VR:$rs1, VR:$rs2, GPR:$vl, mti.Log2SEW)>;
def : Pat<(mti.Mask (riscv_vmor_vl VR:$rs1,
(riscv_vmnot_vl VR:$rs2, VLOpFrag),
VLOpFrag)),
- (!cast<Instruction>("PseudoVMORNOT_MM_" # mti.LMul.MX)
+ (!cast<Instruction>("PseudoVMORN_MM_" # mti.LMul.MX)
VR:$rs1, VR:$rs2, GPR:$vl, mti.Log2SEW)>;
// XOR is associative so we need 2 patterns for VMXNOR.
def : Pat<(mti.Mask (riscv_vmxor_vl (riscv_vmnot_vl VR:$rs1,
@@ -1266,16 +1268,20 @@ foreach mti = AllMasks in {
(!cast<Instruction>("PseudoVMNAND_MM_" # mti.LMul.MX)
VR:$rs, VR:$rs, GPR:$vl, mti.Log2SEW)>;
- // 16.2 Vector Mask Population Count vpopc
- def : Pat<(XLenVT (riscv_vpopc_vl (mti.Mask VR:$rs2), (mti.Mask true_mask),
+ // 16.2 Vector count population in mask vcpop.m
+ def : Pat<(XLenVT (riscv_vcpop_vl (mti.Mask VR:$rs2), (mti.Mask true_mask),
VLOpFrag)),
- (!cast<Instruction>("PseudoVPOPC_M_" # mti.BX)
+ (!cast<Instruction>("PseudoVCPOP_M_" # mti.BX)
VR:$rs2, GPR:$vl, mti.Log2SEW)>;
+ def : Pat<(XLenVT (riscv_vcpop_vl (mti.Mask VR:$rs2), (mti.Mask V0),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVCPOP_M_" # mti.BX # "_MASK")
+ VR:$rs2, (mti.Mask V0), GPR:$vl, mti.Log2SEW)>;
}
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
// 17.1. Integer Scalar Move Instructions
// 17.4. Vector Register Gather Instruction
foreach vti = AllIntegerVectors in {
@@ -1302,7 +1308,7 @@ foreach vti = AllIntegerVectors in {
(!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX)
vti.RegClass:$rs2, uimm5:$imm, GPR:$vl, vti.Log2SEW)>;
- def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+ def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0),
(riscv_vrgather_vv_vl
vti.RegClass:$rs2,
vti.RegClass:$rs1,
@@ -1312,7 +1318,19 @@ foreach vti = AllIntegerVectors in {
VLOpFrag)),
(!cast<Instruction>("PseudoVRGATHER_VV_"# vti.LMul.MX#"_MASK")
vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
- vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+
+ def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0),
+ (riscv_vrgather_vx_vl
+ vti.RegClass:$rs2,
+ uimm5:$imm,
+ (vti.Mask true_mask),
+ VLOpFrag),
+ vti.RegClass:$merge,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX#"_MASK")
+ vti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$imm,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
// emul = lmul * 16 / sew
defvar vlmul = vti.LMul;
@@ -1329,7 +1347,7 @@ foreach vti = AllIntegerVectors in {
(!cast<Instruction>(inst)
vti.RegClass:$rs2, ivti.RegClass:$rs1, GPR:$vl, vti.Log2SEW)>;
- def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+ def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0),
(riscv_vrgatherei16_vv_vl
vti.RegClass:$rs2,
(ivti.Vector ivti.RegClass:$rs1),
@@ -1339,13 +1357,13 @@ foreach vti = AllIntegerVectors in {
VLOpFrag)),
(!cast<Instruction>(inst#"_MASK")
vti.RegClass:$merge, vti.RegClass:$rs2, ivti.RegClass:$rs1,
- vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
}
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
// 17.2. Floating-Point Scalar Move Instructions
foreach vti = AllFloatVectors in {
@@ -1373,7 +1391,7 @@ foreach vti = AllFloatVectors in {
(!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX)
vti.RegClass:$rs2, uimm5:$imm, GPR:$vl, vti.Log2SEW)>;
- def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+ def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0),
(riscv_vrgather_vv_vl
vti.RegClass:$rs2,
(ivti.Vector vti.RegClass:$rs1),
@@ -1383,7 +1401,19 @@ foreach vti = AllFloatVectors in {
VLOpFrag)),
(!cast<Instruction>("PseudoVRGATHER_VV_"# vti.LMul.MX#"_MASK")
vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
- vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+
+ def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0),
+ (riscv_vrgather_vx_vl
+ vti.RegClass:$rs2,
+ uimm5:$imm,
+ (vti.Mask true_mask),
+ VLOpFrag),
+ vti.RegClass:$merge,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX#"_MASK")
+ vti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$imm,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
defvar vlmul = vti.LMul;
defvar octuple_lmul = vlmul.octuple;
@@ -1399,7 +1429,7 @@ foreach vti = AllFloatVectors in {
(!cast<Instruction>(inst)
vti.RegClass:$rs2, ivti.RegClass:$rs1, GPR:$vl, vti.Log2SEW)>;
- def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+ def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0),
(riscv_vrgatherei16_vv_vl
vti.RegClass:$rs2,
(ivti.Vector ivti.RegClass:$rs1),
@@ -1409,11 +1439,11 @@ foreach vti = AllFloatVectors in {
VLOpFrag)),
(!cast<Instruction>(inst#"_MASK")
vti.RegClass:$merge, vti.RegClass:$rs2, ivti.RegClass:$rs1,
- vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
}
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
//===----------------------------------------------------------------------===//
// Miscellaneous RISCVISD SDNodes
@@ -1437,7 +1467,7 @@ def riscv_slide1up_vl : SDNode<"RISCVISD::VSLIDE1UP_VL", SDTRVVSlide1, []>;
def riscv_slidedown_vl : SDNode<"RISCVISD::VSLIDEDOWN_VL", SDTRVVSlide, []>;
def riscv_slide1down_vl : SDNode<"RISCVISD::VSLIDE1DOWN_VL", SDTRVVSlide1, []>;
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
foreach vti = AllIntegerVectors in {
def : Pat<(vti.Vector (riscv_vid_vl (vti.Mask true_mask),
@@ -1490,4 +1520,4 @@ foreach vti = !listconcat(AllIntegerVectors, AllFloatVectors) in {
GPR:$vl, vti.Log2SEW)>;
}
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 7359e567a58d..461bdd348934 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -1,4 +1,4 @@
-//===-- RISCVInstrInfoB.td - RISC-V 'B' instructions -------*- tablegen -*-===//
+//===-- RISCVInstrInfoZb.td - RISC-V Bitmanip instructions -*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,9 +6,19 @@
//
//===----------------------------------------------------------------------===//
//
-// This file describes the RISC-V instructions from the standard 'B' Bitmanip
-// extension, version 0.93.
-// This version is still experimental as the 'B' extension hasn't been
+// This file describes the RISC-V instructions from the standard Bitmanip
+// extensions, versions:
+// Zba - 1.0
+// Zbb - 1.0
+// Zbc - 1.0
+// Zbs - 1.0
+// Zbe - 0.93
+// Zbf - 0.93
+// Zbm - 0.93
+// Zbp - 0.93
+// Zbr - 0.93
+// Zbt - 0.93
+// This version is still experimental as the Bitmanip extensions haven't been
// ratified yet.
//
//===----------------------------------------------------------------------===//
@@ -186,6 +196,32 @@ def C9LeftShift : PatLeaf<(imm), [{
return C > 9 && ((C % 9) == 0) && isPowerOf2_64(C / 9);
}]>;
+def CSImm12MulBy4 : PatLeaf<(imm), [{
+ if (!N->hasOneUse())
+ return false;
+ int64_t C = N->getSExtValue();
+ // Skip if C is simm12 or can be optimized by the PatLeaf AddiPair.
+ return !isInt<13>(C) && isInt<14>(C) && (C & 3) == 0;
+}]>;
+
+def CSImm12MulBy8 : PatLeaf<(imm), [{
+ if (!N->hasOneUse())
+ return false;
+ int64_t C = N->getSExtValue();
+ // Skip if C is simm12 or can be optimized by the PatLeaf AddiPair.
+ return !isInt<13>(C) && isInt<15>(C) && (C & 7) == 0;
+}]>;
+
+def SimmShiftRightBy2XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue() >> 2, SDLoc(N),
+ N->getValueType(0));
+}]>;
+
+def SimmShiftRightBy3XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue() >> 3, SDLoc(N),
+ N->getValueType(0));
+}]>;
+
//===----------------------------------------------------------------------===//
// Instruction class templates
//===----------------------------------------------------------------------===//
@@ -459,15 +495,6 @@ def RORW : ALUW_rr<0b0110000, 0b101, "rorw">,
Sched<[WriteRotateReg32, ReadRotateReg32, ReadRotateReg32]>;
} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
-let Predicates = [HasStdExtZbs, IsRV64] in {
-// NOTE: These instructions have been removed from the 0.94 spec. As a result
-// we have no isel patterns for them.
-def BCLRW : ALUW_rr<0b0100100, 0b001, "bclrw">, Sched<[]>;
-def BSETW : ALUW_rr<0b0010100, 0b001, "bsetw">, Sched<[]>;
-def BINVW : ALUW_rr<0b0110100, 0b001, "binvw">, Sched<[]>;
-def BEXTW : ALUW_rr<0b0100100, 0b101, "bextw">, Sched<[]>;
-} // Predicates = [HasStdExtZbs, IsRV64]
-
let Predicates = [HasStdExtZbp, IsRV64] in {
def GORCW : ALUW_rr<0b0010100, 0b101, "gorcw">, Sched<[]>;
def GREVW : ALUW_rr<0b0110100, 0b101, "grevw">, Sched<[]>;
@@ -481,17 +508,6 @@ let Predicates = [HasStdExtZbbOrZbp, IsRV64] in
def RORIW : RVBShiftW_ri<0b0110000, 0b101, OPC_OP_IMM_32, "roriw">,
Sched<[WriteRotateImm32, ReadRotateImm32]>;
-let Predicates = [HasStdExtZbs, IsRV64] in {
-// NOTE: These instructions have been removed from the 0.94 spec. As a result
-// we have no isel patterns for them.
-def BCLRIW : RVBShiftW_ri<0b0100100, 0b001, OPC_OP_IMM_32, "bclriw">,
- Sched<[]>;
-def BSETIW : RVBShiftW_ri<0b0010100, 0b001, OPC_OP_IMM_32, "bsetiw">,
- Sched<[]>;
-def BINVIW : RVBShiftW_ri<0b0110100, 0b001, OPC_OP_IMM_32, "binviw">,
- Sched<[]>;
-} // Predicates = [HasStdExtZbs, IsRV64]
-
let Predicates = [HasStdExtZbp, IsRV64] in {
def GORCIW : RVBShiftW_ri<0b0010100, 0b101, OPC_OP_IMM_32, "gorciw">, Sched<[]>;
def GREVIW : RVBShiftW_ri<0b0110100, 0b101, OPC_OP_IMM_32, "greviw">, Sched<[]>;
@@ -585,43 +601,10 @@ def ORCB : RVInstI<0b101, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1),
} // Predicates = [HasStdExtZbbOrZbp]
//===----------------------------------------------------------------------===//
-// Future compressed instructions
-//===----------------------------------------------------------------------===//
-
-// The presence of these instructions in the B extension is purely experimental
-// and they should be moved to the C extension as soon as they are ratified.
-
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-class RVBInstC<bits<2> funct2, string opcodestr>
- : RVInst16<(outs GPRC:$rs_wb), (ins GPRC:$rs), opcodestr, "$rs", [],
- InstFormatCR> {
- bits<3> rs;
- let Constraints = "$rs = $rs_wb";
-
- let Inst{15-12} = 0b0110;
- let Inst{11-10} = funct2;
- let Inst{9-7} = rs;
- let Inst{6-0} = 0b0000001;
-}
-
-// The namespace RVBC exists to avoid encoding conflicts with the compressed
-// instructions c.addi16sp and c.lui already implemented in the C extension.
-
-let DecoderNamespace = "RVBC", Predicates = [HasStdExtZbproposedc, HasStdExtC] in {
-def C_NOT : RVBInstC<0b00, "c.not">, Sched<[]>;
-def C_NEG : RVBInstC<0b01, "c.neg">, Sched<[]>;
-} // DecoderNamespace = "RVBC", Predicates = [HasStdExtZbproposedc, HasStdExtC]
-
-let DecoderNamespace = "RVBC", Predicates = [HasStdExtZbproposedc, HasStdExtZba, HasStdExtC, IsRV64] in
-def C_ZEXTW : RVBInstC<0b10, "c.zext.w">, Sched<[]>;
-
-//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtZba, IsRV64] in {
-// NOTE: The 0.93 spec shows zext.w as an alias of pack/packw. It has been
-// changed to add.uw in a draft after 0.94.
def : InstAlias<"zext.w $rd, $rs", (ADDUW GPR:$rd, GPR:$rs, X0)>;
}
@@ -770,21 +753,6 @@ def : InstAlias<"bext $rd, $rs1, $shamt",
} // Predicates = [HasStdExtZbs]
//===----------------------------------------------------------------------===//
-// Compressed Instruction patterns
-//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtZbproposedc, HasStdExtC] in {
-def : CompressPat<(XORI GPRC:$rs1, GPRC:$rs1, -1),
- (C_NOT GPRC:$rs1)>;
-def : CompressPat<(SUB GPRC:$rs1, X0, GPRC:$rs1),
- (C_NEG GPRC:$rs1)>;
-} // Predicates = [HasStdExtZbproposedc, HasStdExtC]
-
-let Predicates = [HasStdExtZbproposedc, HasStdExtZba, HasStdExtC, IsRV64] in {
-def : CompressPat<(ADDUW GPRC:$rs1, GPRC:$rs1, X0),
- (C_ZEXTW GPRC:$rs1)>;
-} // Predicates = [HasStdExtZbproposedc, HasStdExtC, IsRV64]
-
-//===----------------------------------------------------------------------===//
// Codegen patterns
//===----------------------------------------------------------------------===//
@@ -1011,6 +979,13 @@ def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 40)), GPR:$rs2),
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 72)), GPR:$rs2),
(SH3ADD (SH3ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
+def : Pat<(add GPR:$r, CSImm12MulBy4:$i),
+ (SH2ADD (ADDI X0, (SimmShiftRightBy2XForm CSImm12MulBy4:$i)),
+ GPR:$r)>;
+def : Pat<(add GPR:$r, CSImm12MulBy8:$i),
+ (SH3ADD (ADDI X0, (SimmShiftRightBy3XForm CSImm12MulBy8:$i)),
+ GPR:$r)>;
+
def : Pat<(mul GPR:$r, C3LeftShift:$i),
(SLLI (SH1ADD GPR:$r, GPR:$r),
(TrailingZerosXForm C3LeftShift:$i))>;
@@ -1020,6 +995,29 @@ def : Pat<(mul GPR:$r, C5LeftShift:$i),
def : Pat<(mul GPR:$r, C9LeftShift:$i),
(SLLI (SH3ADD GPR:$r, GPR:$r),
(TrailingZerosXForm C9LeftShift:$i))>;
+
+def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 11)),
+ (SH1ADD (SH2ADD GPR:$r, GPR:$r), GPR:$r)>;
+def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 19)),
+ (SH1ADD (SH3ADD GPR:$r, GPR:$r), GPR:$r)>;
+def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 13)),
+ (SH2ADD (SH1ADD GPR:$r, GPR:$r), GPR:$r)>;
+def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 21)),
+ (SH2ADD (SH2ADD GPR:$r, GPR:$r), GPR:$r)>;
+def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 37)),
+ (SH2ADD (SH3ADD GPR:$r, GPR:$r), GPR:$r)>;
+def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 25)),
+ (SH3ADD (SH1ADD GPR:$r, GPR:$r), GPR:$r)>;
+def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 41)),
+ (SH3ADD (SH2ADD GPR:$r, GPR:$r), GPR:$r)>;
+def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 73)),
+ (SH3ADD (SH3ADD GPR:$r, GPR:$r), GPR:$r)>;
+def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 27)),
+ (SH1ADD (SH3ADD GPR:$r, GPR:$r), (SH3ADD GPR:$r, GPR:$r))>;
+def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 45)),
+ (SH2ADD (SH3ADD GPR:$r, GPR:$r), (SH3ADD GPR:$r, GPR:$r))>;
+def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 81)),
+ (SH3ADD (SH3ADD GPR:$r, GPR:$r), (SH3ADD GPR:$r, GPR:$r))>;
} // Predicates = [HasStdExtZba]
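[Editor's note] The shXadd chains above follow from simple identities: shNadd(a, b) = (a << N) + b, so SH1ADD(SH2ADD(r, r), r) = 2*(4r + r) + r = 11r and SH3ADD(SH3ADD(r, r), SH3ADD(r, r)) = 8*9r + 9r = 81r. A throwaway C++ check of the eleven constants, added here for illustration only (the shadd helper is made up for the example, not an LLVM API):

#include <cassert>

// shadd(N, A, B) mirrors the Zba shNadd semantics: (A << N) + B.
static long shadd(int N, long A, long B) { return (A << N) + B; }

int main() {
  long r = 7; // any value works; the identities are linear in r
  assert(shadd(1, shadd(2, r, r), r) == 11 * r);
  assert(shadd(2, shadd(1, r, r), r) == 13 * r);
  assert(shadd(1, shadd(3, r, r), r) == 19 * r);
  assert(shadd(2, shadd(2, r, r), r) == 21 * r);
  assert(shadd(3, shadd(1, r, r), r) == 25 * r);
  assert(shadd(2, shadd(3, r, r), r) == 37 * r);
  assert(shadd(3, shadd(2, r, r), r) == 41 * r);
  assert(shadd(3, shadd(3, r, r), r) == 73 * r);
  assert(shadd(1, shadd(3, r, r), shadd(3, r, r)) == 27 * r);
  assert(shadd(2, shadd(3, r, r), shadd(3, r, r)) == 45 * r);
  assert(shadd(3, shadd(3, r, r), shadd(3, r, r)) == 81 * r);
  return 0;
}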
let Predicates = [HasStdExtZba, IsRV64] in {
@@ -1085,6 +1083,9 @@ def : Pat<(i64 (sext_inreg (or (shl GPR:$rs2, (i64 16)),
(and GPR:$rs1, 0x000000000000FFFF)),
i32)),
(PACKW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32),
+ (and GPR:$rs1, 0x000000000000FFFF))),
+ (PACKW GPR:$rs1, GPR:$rs2)>;
def : Pat<(i64 (or (and (assertsexti32 GPR:$rs2), 0xFFFFFFFFFFFF0000),
(srl (and GPR:$rs1, 0xFFFFFFFF), (i64 16)))),
(PACKUW GPR:$rs1, GPR:$rs2)>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
index 7316b7ad7674..a33494461869 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
@@ -66,7 +66,7 @@ class FPCmpH_rr<bits<3> funct3, string opcodestr>
// Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtZfh] in {
+let Predicates = [HasStdExtZfhmin] in {
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
def FLH : RVInstI<0b001, OPC_LOAD_FP, (outs FPR16:$rd),
(ins GPR:$rs1, simm12:$imm12),
@@ -81,7 +81,9 @@ def FSH : RVInstS<0b001, OPC_STORE_FP, (outs),
(ins FPR16:$rs2, GPR:$rs1, simm12:$imm12),
"fsh", "$rs2, ${imm12}(${rs1})">,
Sched<[WriteFST16, ReadStoreData, ReadFMemBase]>;
+} // Predicates = [HasStdExtZfhmin]
+let Predicates = [HasStdExtZfh] in {
def FMADD_H : FPFMAH_rrr_frm<OPC_MADD, "fmadd.h">,
Sched<[WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16]>;
def : FPFMAHDynFrmAlias<FMADD_H, "fmadd.h">;
@@ -148,7 +150,9 @@ def FCVT_H_WU : FPUnaryOp_r_frm<0b1101010, FPR16, GPR, "fcvt.h.wu">,
let rs2 = 0b00001;
}
def : FPUnaryOpDynFrmAlias<FCVT_H_WU, "fcvt.h.wu", FPR16, GPR>;
+} // Predicates = [HasStdExtZfh]
+let Predicates = [HasStdExtZfhmin] in {
def FCVT_H_S : FPUnaryOp_r_frm<0b0100010, FPR16, FPR32, "fcvt.h.s">,
Sched<[WriteFCvtF32ToF16, ReadFCvtF32ToF16]> {
let rs2 = 0b00000;
@@ -169,7 +173,9 @@ def FMV_H_X : FPUnaryOp_r<0b1111010, 0b000, FPR16, GPR, "fmv.h.x">,
Sched<[WriteFMovI16ToF16, ReadFMovI16ToF16]> {
let rs2 = 0b00000;
}
+} // Predicates = [HasStdExtZfhmin]
+let Predicates = [HasStdExtZfh] in {
def FEQ_H : FPCmpH_rr<0b010, "feq.h">;
def FLT_H : FPCmpH_rr<0b001, "flt.h">;
def FLE_H : FPCmpH_rr<0b000, "fle.h">;
@@ -206,7 +212,7 @@ def FCVT_H_LU : FPUnaryOp_r_frm<0b1101010, FPR16, GPR, "fcvt.h.lu">,
def : FPUnaryOpDynFrmAlias<FCVT_H_LU, "fcvt.h.lu", FPR16, GPR>;
} // Predicates = [HasStdExtZfh, IsRV64]
-let Predicates = [HasStdExtZfh, HasStdExtD] in {
+let Predicates = [HasStdExtZfhmin, HasStdExtD] in {
def FCVT_H_D : FPUnaryOp_r_frm<0b0100010, FPR16, FPR64, "fcvt.h.d">,
Sched<[WriteFCvtF64ToF16, ReadFCvtF64ToF16]> {
let rs2 = 0b00001;
@@ -217,16 +223,18 @@ def FCVT_D_H : FPUnaryOp_r<0b0100001, 0b000, FPR64, FPR16, "fcvt.d.h">,
Sched<[WriteFCvtF16ToF64, ReadFCvtF16ToF64]> {
let rs2 = 0b00010;
}
-} // Predicates = [HasStdExtZfh, HasStdExtD]
+} // Predicates = [HasStdExtZfhmin, HasStdExtD]
//===----------------------------------------------------------------------===//
// Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20)
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtZfh] in {
+let Predicates = [HasStdExtZfhmin] in {
def : InstAlias<"flh $rd, (${rs1})", (FLH FPR16:$rd, GPR:$rs1, 0), 0>;
def : InstAlias<"fsh $rs2, (${rs1})", (FSH FPR16:$rs2, GPR:$rs1, 0), 0>;
+} // Predicates = [HasStdExtZfhmin]
+let Predicates = [HasStdExtZfh] in {
def : InstAlias<"fmv.h $rd, $rs", (FSGNJ_H FPR16:$rd, FPR16:$rs, FPR16:$rs)>;
def : InstAlias<"fabs.h $rd, $rs", (FSGNJX_H FPR16:$rd, FPR16:$rs, FPR16:$rs)>;
def : InstAlias<"fneg.h $rd, $rs", (FSGNJN_H FPR16:$rd, FPR16:$rs, FPR16:$rs)>;
@@ -237,10 +245,12 @@ def : InstAlias<"fgt.h $rd, $rs, $rt",
(FLT_H GPR:$rd, FPR16:$rt, FPR16:$rs), 0>;
def : InstAlias<"fge.h $rd, $rs, $rt",
(FLE_H GPR:$rd, FPR16:$rt, FPR16:$rs), 0>;
+} // Predicates = [HasStdExtZfh]
+let Predicates = [HasStdExtZfhmin] in {
def PseudoFLH : PseudoFloatLoad<"flh", FPR16>;
def PseudoFSH : PseudoStore<"fsh", FPR16>;
-} // Predicates = [HasStdExtZfh]
+} // Predicates = [HasStdExtZfhmin]
//===----------------------------------------------------------------------===//
// Pseudo-instructions and codegen patterns
@@ -313,7 +323,9 @@ def : PatFpr16Fpr16<setle, FLE_H>;
def : PatFpr16Fpr16<setole, FLE_H>;
def Select_FPR16_Using_CC_GPR : SelectCC_rrirr<FPR16, GPR>;
+} // Predicates = [HasStdExtZfh]
+let Predicates = [HasStdExtZfhmin] in {
/// Loads
defm : LdPat<load, FLH, f16>;
@@ -331,13 +343,17 @@ def : Pat<(fpextend FPR16:$rs1), (FCVT_S_H FPR16:$rs1)>;
// Moves (no conversion)
def : Pat<(riscv_fmv_h_x GPR:$src), (FMV_H_X GPR:$src)>;
def : Pat<(riscv_fmv_x_anyexth FPR16:$src), (FMV_X_H FPR16:$src)>;
-} // Predicates = [HasStdExtZfh]
+} // Predicates = [HasStdExtZfhmin]
let Predicates = [HasStdExtZfh, IsRV32] in {
// half->[u]int. Round-to-zero must be used.
def : Pat<(i32 (fp_to_sint FPR16:$rs1)), (FCVT_W_H $rs1, 0b001)>;
def : Pat<(i32 (fp_to_uint FPR16:$rs1)), (FCVT_WU_H $rs1, 0b001)>;
+// Saturating float->[u]int32.
+def : Pat<(i32 (riscv_fcvt_x_rtz FPR16:$rs1)), (FCVT_W_H $rs1, 0b001)>;
+def : Pat<(i32 (riscv_fcvt_xu_rtz FPR16:$rs1)), (FCVT_WU_H $rs1, 0b001)>;
+
// half->int32 with current rounding mode.
def : Pat<(i32 (lrint FPR16:$rs1)), (FCVT_W_H $rs1, 0b111)>;
@@ -353,13 +369,17 @@ let Predicates = [HasStdExtZfh, IsRV64] in {
// Use target specific isd nodes to help us remember the result is sign
// extended. Matching sext_inreg+fptoui/fptosi may cause the conversion to be
// duplicated if it has another user that didn't need the sign_extend.
-def : Pat<(riscv_fcvt_w_rv64 FPR16:$rs1), (FCVT_W_H $rs1, 0b001)>;
-def : Pat<(riscv_fcvt_wu_rv64 FPR16:$rs1), (FCVT_WU_H $rs1, 0b001)>;
+def : Pat<(riscv_fcvt_w_rtz_rv64 FPR16:$rs1), (FCVT_W_H $rs1, 0b001)>;
+def : Pat<(riscv_fcvt_wu_rtz_rv64 FPR16:$rs1), (FCVT_WU_H $rs1, 0b001)>;
// half->[u]int64. Round-to-zero must be used.
def : Pat<(i64 (fp_to_sint FPR16:$rs1)), (FCVT_L_H $rs1, 0b001)>;
def : Pat<(i64 (fp_to_uint FPR16:$rs1)), (FCVT_LU_H $rs1, 0b001)>;
+// Saturating float->[u]int64.
+def : Pat<(i64 (riscv_fcvt_x_rtz FPR16:$rs1)), (FCVT_L_H $rs1, 0b001)>;
+def : Pat<(i64 (riscv_fcvt_xu_rtz FPR16:$rs1)), (FCVT_LU_H $rs1, 0b001)>;
+
// half->int64 with current rounding mode.
def : Pat<(i64 (lrint FPR16:$rs1)), (FCVT_L_H $rs1, 0b111)>;
def : Pat<(i64 (llrint FPR16:$rs1)), (FCVT_L_H $rs1, 0b111)>;
@@ -375,7 +395,7 @@ def : Pat<(sint_to_fp (i64 GPR:$rs1)), (FCVT_H_L $rs1, 0b111)>;
def : Pat<(uint_to_fp (i64 GPR:$rs1)), (FCVT_H_LU $rs1, 0b111)>;
} // Predicates = [HasStdExtZfh, IsRV64]
-let Predicates = [HasStdExtZfh, HasStdExtD] in {
+let Predicates = [HasStdExtZfhmin, HasStdExtD] in {
/// Float conversion operations
// f64 -> f16, f16 -> f64
def : Pat<(fpround FPR64:$rs1), (FCVT_H_D FPR64:$rs1, 0b111)>;
@@ -385,4 +405,4 @@ def : Pat<(fpextend FPR16:$rs1), (FCVT_D_H FPR16:$rs1)>;
def : Pat<(fcopysign FPR16:$rs1, FPR64:$rs2),
(FSGNJ_H $rs1, (FCVT_H_D $rs2, 0b111))>;
def : Pat<(fcopysign FPR64:$rs1, FPR16:$rs2), (FSGNJ_D $rs1, (FCVT_D_H $rs2))>;
-}
+} // Predicates = [HasStdExtZfhmin, HasStdExtD]
diff --git a/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp b/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp
index 74d92468b9b9..dd084f53e511 100644
--- a/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp
@@ -148,17 +148,18 @@ static bool lowerRISCVVMachineInstrToMCInst(const MachineInstr *MI,
assert(TRI && "TargetRegisterInfo expected");
uint64_t TSFlags = MI->getDesc().TSFlags;
- int NumOps = MI->getNumExplicitOperands();
-
- for (const MachineOperand &MO : MI->explicit_operands()) {
- int OpNo = (int)MI->getOperandNo(&MO);
- assert(OpNo >= 0 && "Operand number doesn't fit in an 'int' type");
-
- // Skip VL and SEW operands which are the last two operands if present.
- if (RISCVII::hasVLOp(TSFlags) && OpNo == (NumOps - 2))
- continue;
- if (RISCVII::hasSEWOp(TSFlags) && OpNo == (NumOps - 1))
- continue;
+ unsigned NumOps = MI->getNumExplicitOperands();
+
+ // Skip policy, VL and SEW operands which are the last operands if present.
+ if (RISCVII::hasVecPolicyOp(TSFlags))
+ --NumOps;
+ if (RISCVII::hasVLOp(TSFlags))
+ --NumOps;
+ if (RISCVII::hasSEWOp(TSFlags))
+ --NumOps;
+
+ for (unsigned OpNo = 0; OpNo != NumOps; ++OpNo) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
// Skip merge op. It should be the first operand after the result.
if (RISCVII::hasMergeOp(TSFlags) && OpNo == 1) {
diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
index 87586023caa4..5f4022439abb 100644
--- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
@@ -26,8 +26,8 @@
#include "RISCV.h"
#include "RISCVTargetMachine.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
#include <set>
using namespace llvm;
@@ -38,7 +38,6 @@ namespace {
struct RISCVMergeBaseOffsetOpt : public MachineFunctionPass {
static char ID;
- const MachineFunction *MF;
bool runOnMachineFunction(MachineFunction &Fn) override;
bool detectLuiAddiGlobal(MachineInstr &LUI, MachineInstr *&ADDI);
@@ -53,6 +52,11 @@ struct RISCVMergeBaseOffsetOpt : public MachineFunctionPass {
MachineFunctionProperties::Property::IsSSA);
}
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
StringRef getPassName() const override {
return RISCV_MERGE_BASE_OFFSET_NAME;
}
@@ -193,7 +197,7 @@ bool RISCVMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &HiLUI,
LLVM_DEBUG(dbgs() << " Offset Instr: " << Tail);
foldOffset(HiLUI, LoADDI, Tail, Offset);
return true;
- } break;
+ }
case RISCV::ADD: {
// The offset is too large to fit in the immediate field of ADDI.
// This can be in two forms:
@@ -208,7 +212,7 @@ bool RISCVMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &HiLUI,
return false;
foldOffset(HiLUI, LoADDI, Tail, Offset);
return true;
- } break;
+ }
case RISCV::LB:
case RISCV::LH:
case RISCV::LW:
@@ -252,7 +256,7 @@ bool RISCVMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &HiLUI,
Tail.getOperand(1).setReg(HiLUI.getOperand(0).getReg());
DeadInstrs.insert(&LoADDI);
return true;
- } break;
+ }
}
return false;
}
@@ -261,6 +265,7 @@ bool RISCVMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {
if (skipFunction(Fn.getFunction()))
return false;
+ bool MadeChange = false;
DeadInstrs.clear();
MRI = &Fn.getRegInfo();
for (MachineBasicBlock &MBB : Fn) {
@@ -272,13 +277,13 @@ bool RISCVMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {
LLVM_DEBUG(dbgs() << " Found lowered global address with one use: "
<< *LoADDI->getOperand(2).getGlobal() << "\n");
// If the use count is only one, merge the offset
- detectAndFoldOffset(HiLUI, *LoADDI);
+ MadeChange |= detectAndFoldOffset(HiLUI, *LoADDI);
}
}
// Delete dead instructions.
for (auto *MI : DeadInstrs)
MI->eraseFromParent();
- return true;
+ return MadeChange;
}
/// Returns an instance of the Merge Base Offset Optimization pass.
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index fde75206889c..a915a572f3b7 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -372,7 +372,7 @@ class NFList<int lmul> {
}
// Generate [start, end) SubRegIndex list.
-class SubRegSet<list<SubRegIndex> LIn, int start, int nf, int lmul> {
+class SubRegSet<int nf, int lmul> {
list<SubRegIndex> L = !foldl([]<SubRegIndex>,
[0, 1, 2, 3, 4, 5, 6, 7],
AccList, i,
@@ -382,39 +382,61 @@ class SubRegSet<list<SubRegIndex> LIn, int start, int nf, int lmul> {
[])));
}
-class IndexSet<int index, int nf, int lmul> {
+// Collect the valid indexes into 'R' for the given NF and LMUL values,
+// starting at TUPLE_INDEX. When NF = 2, the valid TUPLE_INDEX values are 0
+// and 1. For example, when LMUL = 4, the potential valid indexes are
+// [8, 12, 16, 20, 24, 28, 4]. However, not all of these are valid for every
+// NF and TUPLE_INDEX; for instance, 28 is not valid under LMUL = 4, NF = 2
+// and TUPLE_INDEX = 0. The filter is
+// (tuple_index + i) x lmul <= (tuple_index x lmul) + 32 - (nf x lmul)
+//
+// Using TUPLE_INDEX = 0, LMUL = 4 and NF = 2 as the example,
+// i x 4 <= 24
+// the class will return [8, 12, 16, 20, 24, 4].
+// Using TUPLE_INDEX = 1, LMUL = 4 and NF = 2 as the example,
+// (1 + i) x 4 <= 28
+// the class will return [12, 16, 20, 24, 28, 8].
+//
+class IndexSet<int tuple_index, int nf, int lmul, bit isV0 = false> {
list<int> R =
!foldl([]<int>,
- [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
- 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
- 23, 24, 25, 26, 27, 28, 29, 30, 31],
+ !if(isV0, [0],
+ !cond(
+ !eq(lmul, 1):
+ [8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 1, 2, 3, 4, 5, 6, 7],
+ !eq(lmul, 2):
+ [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3],
+ !eq(lmul, 4):
+ [2, 3, 4, 5, 6, 7, 1])),
L, i,
!listconcat(L,
- !if(!and(
- !le(!mul(index, lmul), !mul(i, lmul)),
- !le(!mul(i, lmul),
- !sub(!add(32, !mul(index, lmul)), !mul(nf, lmul)))
- ), [!mul(i, lmul)], [])));
+ !if(!le(!mul(!add(i, tuple_index), lmul),
+ !sub(!add(32, !mul(tuple_index, lmul)), !mul(nf, lmul))),
+ [!mul(!add(i, tuple_index), lmul)], [])));
}
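[Editor's note] To make the TableGen fold above easier to follow, here is a standalone C++ sketch of the same index filter; the function and names are illustrative only and are not part of the patch:

#include <cstdio>
#include <vector>

// Mirrors the IndexSet filter:
// (tuple_index + i) * lmul <= tuple_index * lmul + 32 - nf * lmul.
static std::vector<int> indexSet(int TupleIndex, int NF, int LMUL,
                                 const std::vector<int> &Candidates) {
  std::vector<int> R;
  for (int I : Candidates)
    if ((TupleIndex + I) * LMUL <= TupleIndex * LMUL + 32 - NF * LMUL)
      R.push_back((TupleIndex + I) * LMUL);
  return R;
}

int main() {
  std::vector<int> Cand = {2, 3, 4, 5, 6, 7, 1}; // LMUL = 4 candidate order
  for (int V : indexSet(/*TupleIndex=*/0, /*NF=*/2, /*LMUL=*/4, Cand))
    printf("%d ", V); // prints: 8 12 16 20 24 4
  printf("\n");
  return 0;
}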
-class VRegList<list<dag> LIn, int start, int nf, int lmul, bit NoV0> {
+// This class returns a list of vector register collections.
+// For example, for NF = 2 and LMUL = 4,
+// it will return
+// ([ V8M4, V12M4, V16M4, V20M4, V24M4, V4M4],
+// [V12M4, V16M4, V20M4, V24M4, V28M4, V8M4])
+//
+class VRegList<list<dag> LIn, int start, int nf, int lmul, bit isV0> {
list<dag> L =
!if(!ge(start, nf),
LIn,
!listconcat(
[!dag(add,
- !foreach(i,
- !if(NoV0,
- !tail(IndexSet<start, nf, lmul>.R),
- [!head(IndexSet<start, nf, lmul>.R)]),
+ !foreach(i, IndexSet<start, nf, lmul, isV0>.R,
!cast<Register>("V" # i # !cond(!eq(lmul, 2): "M2",
!eq(lmul, 4): "M4",
true: ""))),
!listsplat("",
- !if(NoV0,
- !size(!tail(IndexSet<start, nf, lmul>.R)),
- !size([!head(IndexSet<start, nf, lmul>.R)]))))],
- VRegList<LIn, !add(start, 1), nf, lmul, NoV0>.L));
+ !size(IndexSet<start, nf, lmul, isV0>.R)))],
+ VRegList<LIn, !add(start, 1), nf, lmul, isV0>.L));
}
// Vector registers
@@ -463,11 +485,11 @@ let RegAltNameIndices = [ABIRegAltName] in {
foreach m = [1, 2, 4] in {
foreach n = NFList<m>.L in {
def "VN" # n # "M" # m # "NoV0": RegisterTuples<
- SubRegSet<[], 0, n, m>.L,
- VRegList<[], 0, n, m, 1>.L>;
+ SubRegSet<n, m>.L,
+ VRegList<[], 0, n, m, false>.L>;
def "VN" # n # "M" # m # "V0" : RegisterTuples<
- SubRegSet<[], 0, n, m>.L,
- VRegList<[], 0, n, m, 0>.L>;
+ SubRegSet<n, m>.L,
+ VRegList<[], 0, n, m, true>.L>;
}
}
@@ -487,8 +509,7 @@ def VR : VReg<[vint8m1_t, vint16m1_t, vint32m1_t, vint64m1_t,
vfloat16mf4_t, vfloat16mf2_t, vfloat32mf2_t,
vbool64_t, vbool32_t, vbool16_t, vbool8_t, vbool4_t,
vbool2_t, vbool1_t],
- (add (sequence "V%u", 25, 31),
- (sequence "V%u", 8, 24),
+ (add (sequence "V%u", 8, 31),
(sequence "V%u", 0, 7)), 1>;
def VRNoV0 : VReg<[vint8m1_t, vint16m1_t, vint32m1_t, vint64m1_t,
@@ -498,27 +519,26 @@ def VRNoV0 : VReg<[vint8m1_t, vint16m1_t, vint32m1_t, vint64m1_t,
vfloat16mf4_t, vfloat16mf2_t, vfloat32mf2_t,
vbool64_t, vbool32_t, vbool16_t, vbool8_t, vbool4_t,
vbool2_t, vbool1_t],
- (add (sequence "V%u", 25, 31),
- (sequence "V%u", 8, 24),
+ (add (sequence "V%u", 8, 31),
(sequence "V%u", 1, 7)), 1>;
def VRM2 : VReg<[vint8m2_t, vint16m2_t, vint32m2_t, vint64m2_t,
vfloat16m2_t, vfloat32m2_t, vfloat64m2_t],
- (add V26M2, V28M2, V30M2, V8M2, V10M2, V12M2, V14M2, V16M2,
- V18M2, V20M2, V22M2, V24M2, V0M2, V2M2, V4M2, V6M2), 2>;
+ (add (sequence "V%uM2", 8, 31, 2),
+ (sequence "V%uM2", 0, 7, 2)), 2>;
def VRM2NoV0 : VReg<[vint8m2_t, vint16m2_t, vint32m2_t, vint64m2_t,
vfloat16m2_t, vfloat32m2_t, vfloat64m2_t],
- (add V26M2, V28M2, V30M2, V8M2, V10M2, V12M2, V14M2, V16M2,
- V18M2, V20M2, V22M2, V24M2, V2M2, V4M2, V6M2), 2>;
+ (add (sequence "V%uM2", 8, 31, 2),
+ (sequence "V%uM2", 2, 7, 2)), 2>;
def VRM4 : VReg<[vint8m4_t, vint16m4_t, vint32m4_t, vint64m4_t,
vfloat16m4_t, vfloat32m4_t, vfloat64m4_t],
- (add V28M4, V8M4, V12M4, V16M4, V20M4, V24M4, V0M4, V4M4), 4>;
+ (add V8M4, V12M4, V16M4, V20M4, V24M4, V28M4, V0M4, V4M4), 4>;
def VRM4NoV0 : VReg<[vint8m4_t, vint16m4_t, vint32m4_t, vint64m4_t,
vfloat16m4_t, vfloat32m4_t, vfloat64m4_t],
- (add V28M4, V8M4, V12M4, V16M4, V20M4, V24M4, V4M4), 4>;
+ (add V8M4, V12M4, V16M4, V20M4, V24M4, V28M4, V4M4), 4>;
def VRM8 : VReg<[vint8m8_t, vint16m8_t, vint32m8_t, vint64m8_t,
vfloat16m8_t, vfloat32m8_t, vfloat64m8_t],
@@ -526,7 +546,7 @@ def VRM8 : VReg<[vint8m8_t, vint16m8_t, vint32m8_t, vint64m8_t,
def VRM8NoV0 : VReg<[vint8m8_t, vint16m8_t, vint32m8_t, vint64m8_t,
vfloat16m8_t, vfloat32m8_t, vfloat64m8_t],
- (add V8M8, V16M8, V24M8), 8>;
+ (add V8M8, V16M8, V24M8), 8>;
defvar VMaskVTs = [vbool64_t, vbool32_t, vbool16_t, vbool8_t,
vbool4_t, vbool2_t, vbool1_t];
@@ -538,18 +558,18 @@ def VMV0 : RegisterClass<"RISCV", VMaskVTs, 64, (add V0)> {
// The register class is added for inline assembly for vector mask types.
def VM : VReg<[vbool1_t, vbool2_t, vbool4_t, vbool8_t, vbool16_t,
vbool32_t, vbool64_t],
- (add (sequence "V%u", 25, 31),
- (sequence "V%u", 8, 24),
+ (add (sequence "V%u", 8, 31),
(sequence "V%u", 0, 7)), 1>;
foreach m = LMULList.m in {
foreach nf = NFList<m>.L in {
- def "VRN" # nf # "M" # m: VReg<[untyped],
- (add !cast<RegisterTuples>("VN" # nf # "M" # m # "V0"), !cast<RegisterTuples>("VN" # nf # "M" # m # "NoV0")),
- !mul(nf, m)>;
def "VRN" # nf # "M" # m # "NoV0": VReg<[untyped],
(add !cast<RegisterTuples>("VN" # nf # "M" # m # "NoV0")),
!mul(nf, m)>;
+ def "VRN" # nf # "M" # m: VReg<[untyped],
+ (add !cast<RegisterTuples>("VN" # nf # "M" # m # "NoV0"),
+ !cast<RegisterTuples>("VN" # nf # "M" # m # "V0")),
+ !mul(nf, m)>;
}
}
@@ -557,3 +577,15 @@ foreach m = LMULList.m in {
def FFLAGS : RISCVReg<0, "fflags">;
def FRM : RISCVReg<0, "frm">;
def FCSR : RISCVReg<0, "fcsr">;
+
+// Any type register. Used for .insn directives when we don't know what the
+// register types could be.
+// NOTE: The alignment and size are bogus values. The Size needs to be non-zero
+// or tablegen will use "untyped" to determine the size which will assert.
+let isAllocatable = 0 in
+def AnyReg : RegisterClass<"RISCV", [untyped], 32,
+ (add (sequence "X%u", 0, 31),
+ (sequence "F%u_D", 0, 31),
+ (sequence "V%u", 0, 31))> {
+ let Size = 32;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVSchedRocket.td b/llvm/lib/Target/RISCV/RISCVSchedRocket.td
index ed26a5026114..14f59152ed42 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedRocket.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedRocket.td
@@ -231,6 +231,9 @@ def : ReadAdvance<ReadFMovI64ToF64, 0>;
def : ReadAdvance<ReadFClass32, 0>;
def : ReadAdvance<ReadFClass64, 0>;
+//===----------------------------------------------------------------------===//
+// Unsupported extensions
+defm : UnsupportedSchedV;
defm : UnsupportedSchedZba;
defm : UnsupportedSchedZbb;
defm : UnsupportedSchedZfh;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 314af180aca1..5b435fcb16a2 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -18,7 +18,7 @@ def SiFive7Model : SchedMachineModel {
let UnsupportedFeatures = [HasStdExtV, HasStdExtZvamo, HasStdExtZvlsseg];
}
-// The SiFive7 microarchitecure has two pipelines: A and B.
+// The SiFive7 microarchitecture has two pipelines: A and B.
// Pipe A can handle memory, integer alu and vector operations.
// Pipe B can handle integer alu, control flow, integer multiply and divide,
// and floating point computation.
@@ -219,6 +219,9 @@ def : ReadAdvance<ReadFMovI64ToF64, 0>;
def : ReadAdvance<ReadFClass32, 0>;
def : ReadAdvance<ReadFClass64, 0>;
+//===----------------------------------------------------------------------===//
+// Unsupported extensions
+defm : UnsupportedSchedV;
defm : UnsupportedSchedZba;
defm : UnsupportedSchedZbb;
defm : UnsupportedSchedZfh;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td
index f31e4af46c1b..4971ca1d4e3e 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedule.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedule.td
@@ -230,3 +230,4 @@ def : ReadAdvance<ReadFSqrt16, 0>;
// Include the scheduler resources for other instruction extensions.
include "RISCVScheduleB.td"
+include "RISCVScheduleV.td"
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td
new file mode 100644
index 000000000000..43af1802d706
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td
@@ -0,0 +1,820 @@
+//===-- RISCVScheduleV.td - RISCV Scheduling Definitions V -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+/// Define scheduler resources associated with def operands.
+
+// 7. Vector Loads and Stores
+// 7.4. Vector Unit-Stride Instructions
+def WriteVLDE8 : SchedWrite;
+def WriteVLDE16 : SchedWrite;
+def WriteVLDE32 : SchedWrite;
+def WriteVLDE64 : SchedWrite;
+def WriteVSTE8 : SchedWrite;
+def WriteVSTE16 : SchedWrite;
+def WriteVSTE32 : SchedWrite;
+def WriteVSTE64 : SchedWrite;
+// 7.4.1. Vector Unit-Strided Mask
+def WriteVLDM : SchedWrite;
+def WriteVSTM : SchedWrite;
+// 7.5. Vector Strided Instructions
+def WriteVLDS8 : SchedWrite;
+def WriteVLDS16 : SchedWrite;
+def WriteVLDS32 : SchedWrite;
+def WriteVLDS64 : SchedWrite;
+def WriteVSTS8 : SchedWrite;
+def WriteVSTS16 : SchedWrite;
+def WriteVSTS32 : SchedWrite;
+def WriteVSTS64 : SchedWrite;
+// 7.6. Vector Indexed Instructions
+def WriteVLDUX8 : SchedWrite;
+def WriteVLDUX16 : SchedWrite;
+def WriteVLDUX32 : SchedWrite;
+def WriteVLDUX64 : SchedWrite;
+def WriteVLDOX8 : SchedWrite;
+def WriteVLDOX16 : SchedWrite;
+def WriteVLDOX32 : SchedWrite;
+def WriteVLDOX64 : SchedWrite;
+def WriteVSTUX8 : SchedWrite;
+def WriteVSTUX16 : SchedWrite;
+def WriteVSTUX32 : SchedWrite;
+def WriteVSTUX64 : SchedWrite;
+def WriteVSTOX8 : SchedWrite;
+def WriteVSTOX16 : SchedWrite;
+def WriteVSTOX32 : SchedWrite;
+def WriteVSTOX64 : SchedWrite;
+// 7.7. Vector Unit-stride Fault-Only-First Loads
+def WriteVLDFF8 : SchedWrite;
+def WriteVLDFF16 : SchedWrite;
+def WriteVLDFF32 : SchedWrite;
+def WriteVLDFF64 : SchedWrite;
+// 7.9. Vector Whole Register Instructions
+def WriteVLD1R8 : SchedWrite;
+def WriteVLD1R16 : SchedWrite;
+def WriteVLD1R32 : SchedWrite;
+def WriteVLD1R64 : SchedWrite;
+def WriteVLD2R8 : SchedWrite;
+def WriteVLD2R16 : SchedWrite;
+def WriteVLD2R32 : SchedWrite;
+def WriteVLD2R64 : SchedWrite;
+def WriteVLD4R8 : SchedWrite;
+def WriteVLD4R16 : SchedWrite;
+def WriteVLD4R32 : SchedWrite;
+def WriteVLD4R64 : SchedWrite;
+def WriteVLD8R8 : SchedWrite;
+def WriteVLD8R16 : SchedWrite;
+def WriteVLD8R32 : SchedWrite;
+def WriteVLD8R64 : SchedWrite;
+def WriteVST1R : SchedWrite;
+def WriteVST2R : SchedWrite;
+def WriteVST4R : SchedWrite;
+def WriteVST8R : SchedWrite;
+
+// 11. Vector Integer Arithmetic Instructions
+// 11.1. Vector Single-Width Integer Add and Subtract
+// 11.5. Vector Bitwise Logical Instructions
+def WriteVIALUV : SchedWrite;
+def WriteVIALUX : SchedWrite;
+def WriteVIALUI : SchedWrite;
+// 11.2. Vector Widening Integer Add/Subtract
+def WriteVIWALUV : SchedWrite;
+def WriteVIWALUX : SchedWrite;
+def WriteVIWALUI : SchedWrite;
+// 11.3. Vector Integer Extension
+def WriteVExtV : SchedWrite;
+// 11.4. Vector Integer Arithmetic with Carry or Borrow Instructions
+def WriteVICALUV : SchedWrite;
+def WriteVICALUX : SchedWrite;
+def WriteVICALUI : SchedWrite;
+// 11.6. Vector Single-Width Bit Shift Instructions
+def WriteVShiftV : SchedWrite;
+def WriteVShiftX : SchedWrite;
+def WriteVShiftI : SchedWrite;
+// 11.7. Vector Narrowing Integer Right Shift Instructions
+def WriteVNShiftV : SchedWrite;
+def WriteVNShiftX : SchedWrite;
+def WriteVNShiftI : SchedWrite;
+// 11.8. Vector Integer Comparison Instructions
+// 11.9. Vector Integer Min/Max Instructions
+def WriteVICmpV : SchedWrite;
+def WriteVICmpX : SchedWrite;
+def WriteVICmpI : SchedWrite;
+// 11.10. Vector Single-Width Integer Multiply Instructions
+def WriteVIMulV : SchedWrite;
+def WriteVIMulX : SchedWrite;
+// 11.11. Vector Integer Divide Instructions
+def WriteVIDivV : SchedWrite;
+def WriteVIDivX : SchedWrite;
+// 11.12. Vector Widening Integer Multiply Instructions
+def WriteVIWMulV : SchedWrite;
+def WriteVIWMulX : SchedWrite;
+// 11.13. Vector Single-Width Integer Multiply-Add Instructions
+def WriteVIMulAddV : SchedWrite;
+def WriteVIMulAddX : SchedWrite;
+// 11.14. Vector Widening Integer Multiply-Add Instructions
+def WriteVIWMulAddV : SchedWrite;
+def WriteVIWMulAddX : SchedWrite;
+// 11.15. Vector Integer Merge Instructions
+def WriteVIMergeV : SchedWrite;
+def WriteVIMergeX : SchedWrite;
+def WriteVIMergeI : SchedWrite;
+// 11.16. Vector Integer Move Instructions
+def WriteVIMovV : SchedWrite;
+def WriteVIMovX : SchedWrite;
+def WriteVIMovI : SchedWrite;
+
+// 12. Vector Fixed-Point Arithmetic Instructions
+// 12.1. Vector Single-Width Saturating Add and Subtract
+def WriteVSALUV : SchedWrite;
+def WriteVSALUX : SchedWrite;
+def WriteVSALUI : SchedWrite;
+// 12.2. Vector Single-Width Averaging Add and Subtract
+def WriteVAALUV : SchedWrite;
+def WriteVAALUX : SchedWrite;
+// 12.3. Vector Single-Width Fractional Multiply with Rounding and Saturation
+def WriteVSMulV : SchedWrite;
+def WriteVSMulX : SchedWrite;
+// 12.4. Vector Single-Width Scaling Shift Instructions
+def WriteVSShiftV : SchedWrite;
+def WriteVSShiftX : SchedWrite;
+def WriteVSShiftI : SchedWrite;
+// 12.5. Vector Narrowing Fixed-Point Clip Instructions
+def WriteVNClipV : SchedWrite;
+def WriteVNClipX : SchedWrite;
+def WriteVNClipI : SchedWrite;
+
+// 13. Vector Floating-Point Instructions
+// 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions
+def WriteVFALUV : SchedWrite;
+def WriteVFALUF : SchedWrite;
+// 13.3. Vector Widening Floating-Point Add/Subtract Instructions
+def WriteVFWALUV : SchedWrite;
+def WriteVFWALUF : SchedWrite;
+// 13.4. Vector Single-Width Floating-Point Multiply/Divide Instructions
+def WriteVFMulV : SchedWrite;
+def WriteVFMulF : SchedWrite;
+def WriteVFDivV : SchedWrite;
+def WriteVFDivF : SchedWrite;
+// 13.5. Vector Widening Floating-Point Multiply
+def WriteVFWMulV : SchedWrite;
+def WriteVFWMulF : SchedWrite;
+// 13.6. Vector Single-Width Floating-Point Fused Multiply-Add Instructions
+def WriteVFMulAddV : SchedWrite;
+def WriteVFMulAddF : SchedWrite;
+// 13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions
+def WriteVFWMulAddV : SchedWrite;
+def WriteVFWMulAddF : SchedWrite;
+// 13.8. Vector Floating-Point Square-Root Instruction
+def WriteVFSqrtV : SchedWrite;
+// 13.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction
+// 13.10. Vector Floating-Point Reciprocal Estimate Instruction
+def WriteVFRecpV : SchedWrite;
+// 13.11. Vector Floating-Point MIN/MAX Instructions
+// 13.13. Vector Floating-Point Compare Instructions
+def WriteVFCmpV : SchedWrite;
+def WriteVFCmpF : SchedWrite;
+// 13.12. Vector Floating-Point Sign-Injection Instructions
+def WriteVFSgnjV : SchedWrite;
+def WriteVFSgnjF : SchedWrite;
+// 13.14. Vector Floating-Point Classify Instruction
+def WriteVFClassV : SchedWrite;
+// 13.15. Vector Floating-Point Merge Instruction
+def WriteVFMergeV : SchedWrite;
+// 13.16. Vector Floating-Point Move Instruction
+def WriteVFMovV : SchedWrite;
+// 13.17. Single-Width Floating-Point/Integer Type-Convert Instructions
+def WriteVFCvtIToFV : SchedWrite;
+def WriteVFCvtFToIV : SchedWrite;
+def WriteVFCvtFToFV : SchedWrite;
+// 13.18. Widening Floating-Point/Integer Type-Convert Instructions
+def WriteVFWCvtIToFV : SchedWrite;
+def WriteVFWCvtFToIV : SchedWrite;
+def WriteVFWCvtFToFV : SchedWrite;
+// 13.19. Narrowing Floating-Point/Integer Type-Convert Instructions
+def WriteVFNCvtIToFV : SchedWrite;
+def WriteVFNCvtFToIV : SchedWrite;
+def WriteVFNCvtFToFV : SchedWrite;
+
+// 14. Vector Reduction Operations
+// 14.1. Vector Single-Width Integer Reduction Instructions
+def WriteVIRedV : SchedWrite;
+// 14.2. Vector Widening Integer Reduction Instructions
+def WriteVIWRedV : SchedWrite;
+// 14.3. Vector Single-Width Floating-Point Reduction Instructions
+def WriteVFRedV : SchedWrite;
+def WriteVFRedOV : SchedWrite;
+// 14.4. Vector Widening Floating-Point Reduction Instructions
+def WriteVFWRedV : SchedWrite;
+def WriteVFWRedOV : SchedWrite;
+
+// 15. Vector Mask Instructions
+// 15.1. Vector Mask-Register Logical Instructions
+def WriteVMALUV : SchedWrite;
+// 15.2. Vector Mask Population Count
+def WriteVMPopV : SchedWrite;
+// 15.3. Vector Find-First-Set Mask Bit
+def WriteVMFFSV : SchedWrite;
+// 15.4. Vector Set-Before-First Mask Bit
+// 15.5. Vector Set-Including-First Mask Bit
+// 15.6. Vector Set-only-First Mask Bit
+def WriteVMSFSV : SchedWrite;
+// 15.8. Vector Iota Instruction
+def WriteVMIotV : SchedWrite;
+// 15.9. Vector Element Index Instruction
+def WriteVMIdxV : SchedWrite;
+
+// 16. Vector Permutation Instructions
+// 16.1. Integer Scalar Move Instructions
+def WriteVIMovVX : SchedWrite;
+def WriteVIMovXV : SchedWrite;
+// 16.2. Floating-Point Scalar Move Instructions
+def WriteVFMovVF : SchedWrite;
+def WriteVFMovFV : SchedWrite;
+// 16.3. Vector Slide Instructions
+def WriteVISlideX : SchedWrite;
+def WriteVISlideI : SchedWrite;
+def WriteVISlide1X : SchedWrite;
+def WriteVFSlide1F : SchedWrite;
+// 16.4. Vector Register Gather Instructions
+def WriteVGatherV : SchedWrite;
+def WriteVGatherX : SchedWrite;
+def WriteVGatherI : SchedWrite;
+// 16.5. Vector Compress Instruction
+def WriteVCompressV : SchedWrite;
+// 16.6. Whole Vector Register Move
+def WriteVMov1V : SchedWrite;
+def WriteVMov2V : SchedWrite;
+def WriteVMov4V : SchedWrite;
+def WriteVMov8V : SchedWrite;
+
+//===----------------------------------------------------------------------===//
+/// Define scheduler resources associated with use operands.
+
+// 7. Vector Loads and Stores
+def ReadVLDX : SchedRead;
+def ReadVSTX : SchedRead;
+// 7.4. Vector Unit-Stride Instructions
+def ReadVSTE8V : SchedRead;
+def ReadVSTE16V : SchedRead;
+def ReadVSTE32V : SchedRead;
+def ReadVSTE64V : SchedRead;
+// 7.4.1. Vector Unit-Strided Mask
+def ReadVSTM : SchedRead;
+// 7.5. Vector Strided Instructions
+def ReadVLDSX : SchedRead;
+def ReadVSTSX : SchedRead;
+def ReadVSTS8V : SchedRead;
+def ReadVSTS16V : SchedRead;
+def ReadVSTS32V : SchedRead;
+def ReadVSTS64V : SchedRead;
+// 7.6. Vector Indexed Instructions
+def ReadVLDUXV : SchedRead;
+def ReadVLDOXV : SchedRead;
+def ReadVSTUX8 : SchedRead;
+def ReadVSTUX16 : SchedRead;
+def ReadVSTUX32 : SchedRead;
+def ReadVSTUX64 : SchedRead;
+def ReadVSTUXV : SchedRead;
+def ReadVSTUX8V : SchedRead;
+def ReadVSTUX16V : SchedRead;
+def ReadVSTUX32V : SchedRead;
+def ReadVSTUX64V : SchedRead;
+def ReadVSTOX8 : SchedRead;
+def ReadVSTOX16 : SchedRead;
+def ReadVSTOX32 : SchedRead;
+def ReadVSTOX64 : SchedRead;
+def ReadVSTOXV : SchedRead;
+def ReadVSTOX8V : SchedRead;
+def ReadVSTOX16V : SchedRead;
+def ReadVSTOX32V : SchedRead;
+def ReadVSTOX64V : SchedRead;
+// 7.9. Vector Whole Register Instructions
+def ReadVST1R : SchedRead;
+def ReadVST2R : SchedRead;
+def ReadVST4R : SchedRead;
+def ReadVST8R : SchedRead;
+
+// 11. Vector Integer Arithmetic Instructions
+// 11.1. Vector Single-Width Integer Add and Subtract
+// 11.5. Vector Bitwise Logical Instructions
+def ReadVIALUV : SchedRead;
+def ReadVIALUX : SchedRead;
+// 11.2. Vector Widening Integer Add/Subtract
+def ReadVIWALUV : SchedRead;
+def ReadVIWALUX : SchedRead;
+// 11.3. Vector Integer Extension
+def ReadVExtV : SchedRead;
+// 11.4. Vector Integer Arithmetic with Carry or Borrow Instructions
+def ReadVIALUCV : SchedRead;
+def ReadVIALUCX : SchedRead;
+// 11.6. Vector Single-Width Bit Shift Instructions
+def ReadVShiftV : SchedRead;
+def ReadVShiftX : SchedRead;
+// 11.7. Vector Narrowing Integer Right Shift Instructions
+def ReadVNShiftV : SchedRead;
+def ReadVNShiftX : SchedRead;
+// 11.8. Vector Integer Comparison Instructions
+// 11.9. Vector Integer Min/Max Instructions
+def ReadVICmpV : SchedRead;
+def ReadVICmpX : SchedRead;
+// 11.10. Vector Single-Width Integer Multiply Instructions
+def ReadVIMulV : SchedRead;
+def ReadVIMulX : SchedRead;
+// 11.11. Vector Integer Divide Instructions
+def ReadVIDivV : SchedRead;
+def ReadVIDivX : SchedRead;
+// 11.12. Vector Widening Integer Multiply Instructions
+def ReadVIWMulV : SchedRead;
+def ReadVIWMulX : SchedRead;
+// 11.13. Vector Single-Width Integer Multiply-Add Instructions
+def ReadVIMulAddV : SchedRead;
+def ReadVIMulAddX : SchedRead;
+// 11.14. Vector Widening Integer Multiply-Add Instructions
+def ReadVIWMulAddV : SchedRead;
+def ReadVIWMulAddX : SchedRead;
+// 11.15. Vector Integer Merge Instructions
+def ReadVIMergeV : SchedRead;
+def ReadVIMergeX : SchedRead;
+// 11.16. Vector Integer Move Instructions
+def ReadVIMovV : SchedRead;
+def ReadVIMovX : SchedRead;
+
+// 12. Vector Fixed-Point Arithmetic Instructions
+// 12.1. Vector Single-Width Saturating Add and Subtract
+def ReadVSALUV : SchedRead;
+def ReadVSALUX : SchedRead;
+// 12.2. Vector Single-Width Averaging Add and Subtract
+def ReadVAALUV : SchedRead;
+def ReadVAALUX : SchedRead;
+// 12.3. Vector Single-Width Fractional Multiply with Rounding and Saturation
+def ReadVSMulV : SchedRead;
+def ReadVSMulX : SchedRead;
+// 12.4. Vector Single-Width Scaling Shift Instructions
+def ReadVSShiftV : SchedRead;
+def ReadVSShiftX : SchedRead;
+// 12.5. Vector Narrowing Fixed-Point Clip Instructions
+def ReadVNClipV : SchedRead;
+def ReadVNClipX : SchedRead;
+
+// 13. Vector Floating-Point Instructions
+// 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions
+def ReadVFALUV : SchedRead;
+def ReadVFALUF : SchedRead;
+// 13.3. Vector Widening Floating-Point Add/Subtract Instructions
+def ReadVFWALUV : SchedRead;
+def ReadVFWALUF : SchedRead;
+// 13.4. Vector Single-Width Floating-Point Multiply/Divide Instructions
+def ReadVFMulV : SchedRead;
+def ReadVFMulF : SchedRead;
+def ReadVFDivV : SchedRead;
+def ReadVFDivF : SchedRead;
+// 13.5. Vector Widening Floating-Point Multiply
+def ReadVFWMulV : SchedRead;
+def ReadVFWMulF : SchedRead;
+// 13.6. Vector Single-Width Floating-Point Fused Multiply-Add Instructions
+def ReadVFMulAddV : SchedRead;
+def ReadVFMulAddF : SchedRead;
+// 13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions
+def ReadVFWMulAddV : SchedRead;
+def ReadVFWMulAddF : SchedRead;
+// 13.8. Vector Floating-Point Square-Root Instruction
+def ReadVFSqrtV : SchedRead;
+// 13.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction
+// 13.10. Vector Floating-Point Reciprocal Estimate Instruction
+def ReadVFRecpV : SchedRead;
+// 13.11. Vector Floating-Point MIN/MAX Instructions
+// 13.13. Vector Floating-Point Compare Instructions
+def ReadVFCmpV : SchedRead;
+def ReadVFCmpF : SchedRead;
+// 13.12. Vector Floating-Point Sign-Injection Instructions
+def ReadVFSgnjV : SchedRead;
+def ReadVFSgnjF : SchedRead;
+// 13.14. Vector Floating-Point Classify Instruction
+def ReadVFClassV : SchedRead;
+// 13.15. Vector Floating-Point Merge Instruction
+def ReadVFMergeV : SchedRead;
+def ReadVFMergeF : SchedRead;
+// 13.16. Vector Floating-Point Move Instruction
+def ReadVFMovF : SchedRead;
+// 13.17. Single-Width Floating-Point/Integer Type-Convert Instructions
+def ReadVFCvtIToFV : SchedRead;
+def ReadVFCvtFToIV : SchedRead;
+// 13.18. Widening Floating-Point/Integer Type-Convert Instructions
+def ReadVFWCvtIToFV : SchedRead;
+def ReadVFWCvtFToIV : SchedRead;
+def ReadVFWCvtFToFV : SchedRead;
+// 13.19. Narrowing Floating-Point/Integer Type-Convert Instructions
+def ReadVFNCvtIToFV : SchedRead;
+def ReadVFNCvtFToIV : SchedRead;
+def ReadVFNCvtFToFV : SchedRead;
+
+// 14. Vector Reduction Operations
+// 14.1. Vector Single-Width Integer Reduction Instructions
+def ReadVIRedV : SchedRead;
+def ReadVIRedV0 : SchedRead;
+// 14.2. Vector Widening Integer Reduction Instructions
+def ReadVIWRedV : SchedRead;
+def ReadVIWRedV0 : SchedRead;
+// 14.3. Vector Single-Width Floating-Point Reduction Instructions
+def ReadVFRedV : SchedRead;
+def ReadVFRedV0 : SchedRead;
+def ReadVFRedOV : SchedRead;
+def ReadVFRedOV0 : SchedRead;
+// 14.4. Vector Widening Floating-Point Reduction Instructions
+def ReadVFWRedV : SchedRead;
+def ReadVFWRedV0 : SchedRead;
+def ReadVFWRedOV : SchedRead;
+def ReadVFWRedOV0 : SchedRead;
+
+// 15. Vector Mask Instructions
+// 15.1. Vector Mask-Register Logical Instructions
+def ReadVMALUV : SchedRead;
+// 15.2. Vector Mask Population Count
+def ReadVMPopV : SchedRead;
+// 15.3. Vector Find-First-Set Mask Bit
+def ReadVMFFSV : SchedRead;
+// 15.4. Vector Set-Before-First Mask Bit
+// 15.5. Vector Set-Including-First Mask Bit
+// 15.6. Vector Set-only-First Mask Bit
+def ReadVMSFSV : SchedRead;
+// 15.8. Vector Iota Instruction
+def ReadVMIotV : SchedRead;
+
+// 16. Vector Permutation Instructions
+// 16.1. Integer Scalar Move Instructions
+def ReadVIMovVX : SchedRead;
+def ReadVIMovXV : SchedRead;
+def ReadVIMovXX : SchedRead;
+// 16.2. Floating-Point Scalar Move Instructions
+def ReadVFMovVF : SchedRead;
+def ReadVFMovFV : SchedRead;
+def ReadVFMovFX : SchedRead;
+// 16.3. Vector Slide Instructions
+def ReadVISlideV : SchedRead;
+def ReadVISlideX : SchedRead;
+def ReadVFSlideV : SchedRead;
+def ReadVFSlideF : SchedRead;
+// 16.4. Vector Register Gather Instructions
+def ReadVGatherV : SchedRead;
+def ReadVGatherX : SchedRead;
+// 16.5. Vector Compress Instruction
+def ReadVCompressV : SchedRead;
+// 16.6. Whole Vector Register Move
+def ReadVMov1V : SchedRead;
+def ReadVMov2V : SchedRead;
+def ReadVMov4V : SchedRead;
+def ReadVMov8V : SchedRead;
+
+// Others
+def ReadVMask : SchedRead;
+
+//===----------------------------------------------------------------------===//
+/// Define default scheduler resources for V.
+
+multiclass UnsupportedSchedV {
+let Unsupported = true in {
+
+// 7. Vector Loads and Stores
+def : WriteRes<WriteVLDE8, []>;
+def : WriteRes<WriteVLDE16, []>;
+def : WriteRes<WriteVLDE32, []>;
+def : WriteRes<WriteVLDE64, []>;
+def : WriteRes<WriteVSTE8, []>;
+def : WriteRes<WriteVSTE16, []>;
+def : WriteRes<WriteVSTE32, []>;
+def : WriteRes<WriteVSTE64, []>;
+def : WriteRes<WriteVLDM, []>;
+def : WriteRes<WriteVSTM, []>;
+def : WriteRes<WriteVLDS8, []>;
+def : WriteRes<WriteVLDS16, []>;
+def : WriteRes<WriteVLDS32, []>;
+def : WriteRes<WriteVLDS64, []>;
+def : WriteRes<WriteVSTS8, []>;
+def : WriteRes<WriteVSTS16, []>;
+def : WriteRes<WriteVSTS32, []>;
+def : WriteRes<WriteVSTS64, []>;
+def : WriteRes<WriteVLDUX8, []>;
+def : WriteRes<WriteVLDUX16, []>;
+def : WriteRes<WriteVLDUX32, []>;
+def : WriteRes<WriteVLDUX64, []>;
+def : WriteRes<WriteVLDOX8, []>;
+def : WriteRes<WriteVLDOX16, []>;
+def : WriteRes<WriteVLDOX32, []>;
+def : WriteRes<WriteVLDOX64, []>;
+def : WriteRes<WriteVSTUX8, []>;
+def : WriteRes<WriteVSTUX16, []>;
+def : WriteRes<WriteVSTUX32, []>;
+def : WriteRes<WriteVSTUX64, []>;
+def : WriteRes<WriteVSTOX8, []>;
+def : WriteRes<WriteVSTOX16, []>;
+def : WriteRes<WriteVSTOX32, []>;
+def : WriteRes<WriteVSTOX64, []>;
+def : WriteRes<WriteVLDFF8, []>;
+def : WriteRes<WriteVLDFF16, []>;
+def : WriteRes<WriteVLDFF32, []>;
+def : WriteRes<WriteVLDFF64, []>;
+def : WriteRes<WriteVLD1R8, []>;
+def : WriteRes<WriteVLD1R16, []>;
+def : WriteRes<WriteVLD1R32, []>;
+def : WriteRes<WriteVLD1R64, []>;
+def : WriteRes<WriteVLD2R8, []>;
+def : WriteRes<WriteVLD2R16, []>;
+def : WriteRes<WriteVLD2R32, []>;
+def : WriteRes<WriteVLD2R64, []>;
+def : WriteRes<WriteVLD4R8, []>;
+def : WriteRes<WriteVLD4R16, []>;
+def : WriteRes<WriteVLD4R32, []>;
+def : WriteRes<WriteVLD4R64, []>;
+def : WriteRes<WriteVLD8R8, []>;
+def : WriteRes<WriteVLD8R16, []>;
+def : WriteRes<WriteVLD8R32, []>;
+def : WriteRes<WriteVLD8R64, []>;
+def : WriteRes<WriteVST1R, []>;
+def : WriteRes<WriteVST2R, []>;
+def : WriteRes<WriteVST4R, []>;
+def : WriteRes<WriteVST8R, []>;
+
+// 12. Vector Integer Arithmetic Instructions
+def : WriteRes<WriteVIALUV, []>;
+def : WriteRes<WriteVIALUX, []>;
+def : WriteRes<WriteVIALUI, []>;
+def : WriteRes<WriteVIWALUV, []>;
+def : WriteRes<WriteVIWALUX, []>;
+def : WriteRes<WriteVIWALUI, []>;
+def : WriteRes<WriteVExtV, []>;
+def : WriteRes<WriteVICALUV, []>;
+def : WriteRes<WriteVICALUX, []>;
+def : WriteRes<WriteVICALUI, []>;
+def : WriteRes<WriteVShiftV, []>;
+def : WriteRes<WriteVShiftX, []>;
+def : WriteRes<WriteVShiftI, []>;
+def : WriteRes<WriteVNShiftV, []>;
+def : WriteRes<WriteVNShiftX, []>;
+def : WriteRes<WriteVNShiftI, []>;
+def : WriteRes<WriteVICmpV, []>;
+def : WriteRes<WriteVICmpX, []>;
+def : WriteRes<WriteVICmpI, []>;
+def : WriteRes<WriteVIMulV, []>;
+def : WriteRes<WriteVIMulX, []>;
+def : WriteRes<WriteVIDivV, []>;
+def : WriteRes<WriteVIDivX, []>;
+def : WriteRes<WriteVIWMulV, []>;
+def : WriteRes<WriteVIWMulX, []>;
+def : WriteRes<WriteVIMulAddV, []>;
+def : WriteRes<WriteVIMulAddX, []>;
+def : WriteRes<WriteVIWMulAddV, []>;
+def : WriteRes<WriteVIWMulAddX, []>;
+def : WriteRes<WriteVIMergeV, []>;
+def : WriteRes<WriteVIMergeX, []>;
+def : WriteRes<WriteVIMergeI, []>;
+def : WriteRes<WriteVIMovV, []>;
+def : WriteRes<WriteVIMovX, []>;
+def : WriteRes<WriteVIMovI, []>;
+
+// 13. Vector Fixed-Point Arithmetic Instructions
+def : WriteRes<WriteVSALUV, []>;
+def : WriteRes<WriteVSALUX, []>;
+def : WriteRes<WriteVSALUI, []>;
+def : WriteRes<WriteVAALUV, []>;
+def : WriteRes<WriteVAALUX, []>;
+def : WriteRes<WriteVSMulV, []>;
+def : WriteRes<WriteVSMulX, []>;
+def : WriteRes<WriteVSShiftV, []>;
+def : WriteRes<WriteVSShiftX, []>;
+def : WriteRes<WriteVSShiftI, []>;
+def : WriteRes<WriteVNClipV, []>;
+def : WriteRes<WriteVNClipX, []>;
+def : WriteRes<WriteVNClipI, []>;
+
+// 14. Vector Floating-Point Instructions
+def : WriteRes<WriteVFALUV, []>;
+def : WriteRes<WriteVFALUF, []>;
+def : WriteRes<WriteVFWALUV, []>;
+def : WriteRes<WriteVFWALUF, []>;
+def : WriteRes<WriteVFMulV, []>;
+def : WriteRes<WriteVFMulF, []>;
+def : WriteRes<WriteVFDivV, []>;
+def : WriteRes<WriteVFDivF, []>;
+def : WriteRes<WriteVFWMulV, []>;
+def : WriteRes<WriteVFWMulF, []>;
+def : WriteRes<WriteVFMulAddV, []>;
+def : WriteRes<WriteVFMulAddF, []>;
+def : WriteRes<WriteVFWMulAddV, []>;
+def : WriteRes<WriteVFWMulAddF, []>;
+def : WriteRes<WriteVFSqrtV, []>;
+def : WriteRes<WriteVFRecpV, []>;
+def : WriteRes<WriteVFCmpV, []>;
+def : WriteRes<WriteVFCmpF, []>;
+def : WriteRes<WriteVFSgnjV, []>;
+def : WriteRes<WriteVFSgnjF, []>;
+def : WriteRes<WriteVFClassV, []>;
+def : WriteRes<WriteVFMergeV, []>;
+def : WriteRes<WriteVFMovV, []>;
+def : WriteRes<WriteVFCvtIToFV, []>;
+def : WriteRes<WriteVFCvtFToIV, []>;
+def : WriteRes<WriteVFCvtFToFV, []>;
+def : WriteRes<WriteVFWCvtIToFV, []>;
+def : WriteRes<WriteVFWCvtFToIV, []>;
+def : WriteRes<WriteVFWCvtFToFV, []>;
+def : WriteRes<WriteVFNCvtIToFV, []>;
+def : WriteRes<WriteVFNCvtFToIV, []>;
+def : WriteRes<WriteVFNCvtFToFV, []>;
+
+// 15. Vector Reduction Operations
+def : WriteRes<WriteVIRedV, []>;
+def : WriteRes<WriteVIWRedV, []>;
+def : WriteRes<WriteVFRedV, []>;
+def : WriteRes<WriteVFRedOV, []>;
+def : WriteRes<WriteVFWRedV, []>;
+def : WriteRes<WriteVFWRedOV, []>;
+
+// 16. Vector Mask Instructions
+def : WriteRes<WriteVMALUV, []>;
+def : WriteRes<WriteVMPopV, []>;
+def : WriteRes<WriteVMFFSV, []>;
+def : WriteRes<WriteVMSFSV, []>;
+def : WriteRes<WriteVMIotV, []>;
+def : WriteRes<WriteVMIdxV, []>;
+
+// 17. Vector Permutation Instructions
+def : WriteRes<WriteVIMovVX, []>;
+def : WriteRes<WriteVIMovXV, []>;
+def : WriteRes<WriteVFMovVF, []>;
+def : WriteRes<WriteVFMovFV, []>;
+def : WriteRes<WriteVISlideX, []>;
+def : WriteRes<WriteVISlideI, []>;
+def : WriteRes<WriteVISlide1X, []>;
+def : WriteRes<WriteVFSlide1F, []>;
+def : WriteRes<WriteVGatherV, []>;
+def : WriteRes<WriteVGatherX, []>;
+def : WriteRes<WriteVGatherI, []>;
+def : WriteRes<WriteVCompressV, []>;
+def : WriteRes<WriteVMov1V, []>;
+def : WriteRes<WriteVMov2V, []>;
+def : WriteRes<WriteVMov4V, []>;
+def : WriteRes<WriteVMov8V, []>;
+
+// 7. Vector Loads and Stores
+def : ReadAdvance<ReadVLDX, 0>;
+def : ReadAdvance<ReadVSTX, 0>;
+def : ReadAdvance<ReadVSTE8V, 0>;
+def : ReadAdvance<ReadVSTE16V, 0>;
+def : ReadAdvance<ReadVSTE32V, 0>;
+def : ReadAdvance<ReadVSTE64V, 0>;
+def : ReadAdvance<ReadVSTM, 0>;
+def : ReadAdvance<ReadVLDSX, 0>;
+def : ReadAdvance<ReadVSTSX, 0>;
+def : ReadAdvance<ReadVSTS8V, 0>;
+def : ReadAdvance<ReadVSTS16V, 0>;
+def : ReadAdvance<ReadVSTS32V, 0>;
+def : ReadAdvance<ReadVSTS64V, 0>;
+def : ReadAdvance<ReadVLDUXV, 0>;
+def : ReadAdvance<ReadVLDOXV, 0>;
+def : ReadAdvance<ReadVSTUXV, 0>;
+def : ReadAdvance<ReadVSTUX8, 0>;
+def : ReadAdvance<ReadVSTUX16, 0>;
+def : ReadAdvance<ReadVSTUX32, 0>;
+def : ReadAdvance<ReadVSTUX64, 0>;
+def : ReadAdvance<ReadVSTUX8V, 0>;
+def : ReadAdvance<ReadVSTUX16V, 0>;
+def : ReadAdvance<ReadVSTUX32V, 0>;
+def : ReadAdvance<ReadVSTUX64V, 0>;
+def : ReadAdvance<ReadVSTOX8, 0>;
+def : ReadAdvance<ReadVSTOX16, 0>;
+def : ReadAdvance<ReadVSTOX32, 0>;
+def : ReadAdvance<ReadVSTOX64, 0>;
+def : ReadAdvance<ReadVSTOXV, 0>;
+def : ReadAdvance<ReadVSTOX8V, 0>;
+def : ReadAdvance<ReadVSTOX16V, 0>;
+def : ReadAdvance<ReadVSTOX32V, 0>;
+def : ReadAdvance<ReadVSTOX64V, 0>;
+def : ReadAdvance<ReadVST1R, 0>;
+def : ReadAdvance<ReadVST2R, 0>;
+def : ReadAdvance<ReadVST4R, 0>;
+def : ReadAdvance<ReadVST8R, 0>;
+
+// 12. Vector Integer Arithmetic Instructions
+def : ReadAdvance<ReadVIALUV, 0>;
+def : ReadAdvance<ReadVIALUX, 0>;
+def : ReadAdvance<ReadVIWALUV, 0>;
+def : ReadAdvance<ReadVIWALUX, 0>;
+def : ReadAdvance<ReadVExtV, 0>;
+def : ReadAdvance<ReadVIALUCV, 0>;
+def : ReadAdvance<ReadVIALUCX, 0>;
+def : ReadAdvance<ReadVShiftV, 0>;
+def : ReadAdvance<ReadVShiftX, 0>;
+def : ReadAdvance<ReadVNShiftV, 0>;
+def : ReadAdvance<ReadVNShiftX, 0>;
+def : ReadAdvance<ReadVICmpV, 0>;
+def : ReadAdvance<ReadVICmpX, 0>;
+def : ReadAdvance<ReadVIMulV, 0>;
+def : ReadAdvance<ReadVIMulX, 0>;
+def : ReadAdvance<ReadVIDivV, 0>;
+def : ReadAdvance<ReadVIDivX, 0>;
+def : ReadAdvance<ReadVIWMulV, 0>;
+def : ReadAdvance<ReadVIWMulX, 0>;
+def : ReadAdvance<ReadVIMulAddV, 0>;
+def : ReadAdvance<ReadVIMulAddX, 0>;
+def : ReadAdvance<ReadVIWMulAddV, 0>;
+def : ReadAdvance<ReadVIWMulAddX, 0>;
+def : ReadAdvance<ReadVIMergeV, 0>;
+def : ReadAdvance<ReadVIMergeX, 0>;
+def : ReadAdvance<ReadVIMovV, 0>;
+def : ReadAdvance<ReadVIMovX, 0>;
+
+// 13. Vector Fixed-Point Arithmetic Instructions
+def : ReadAdvance<ReadVSALUV, 0>;
+def : ReadAdvance<ReadVSALUX, 0>;
+def : ReadAdvance<ReadVAALUV, 0>;
+def : ReadAdvance<ReadVAALUX, 0>;
+def : ReadAdvance<ReadVSMulV, 0>;
+def : ReadAdvance<ReadVSMulX, 0>;
+def : ReadAdvance<ReadVSShiftV, 0>;
+def : ReadAdvance<ReadVSShiftX, 0>;
+def : ReadAdvance<ReadVNClipV, 0>;
+def : ReadAdvance<ReadVNClipX, 0>;
+
+// 14. Vector Floating-Point Instructions
+def : ReadAdvance<ReadVFALUV, 0>;
+def : ReadAdvance<ReadVFALUF, 0>;
+def : ReadAdvance<ReadVFWALUV, 0>;
+def : ReadAdvance<ReadVFWALUF, 0>;
+def : ReadAdvance<ReadVFMulV, 0>;
+def : ReadAdvance<ReadVFMulF, 0>;
+def : ReadAdvance<ReadVFDivV, 0>;
+def : ReadAdvance<ReadVFDivF, 0>;
+def : ReadAdvance<ReadVFWMulV, 0>;
+def : ReadAdvance<ReadVFWMulF, 0>;
+def : ReadAdvance<ReadVFMulAddV, 0>;
+def : ReadAdvance<ReadVFMulAddF, 0>;
+def : ReadAdvance<ReadVFWMulAddV, 0>;
+def : ReadAdvance<ReadVFWMulAddF, 0>;
+def : ReadAdvance<ReadVFSqrtV, 0>;
+def : ReadAdvance<ReadVFRecpV, 0>;
+def : ReadAdvance<ReadVFCmpV, 0>;
+def : ReadAdvance<ReadVFCmpF, 0>;
+def : ReadAdvance<ReadVFSgnjV, 0>;
+def : ReadAdvance<ReadVFSgnjF, 0>;
+def : ReadAdvance<ReadVFClassV, 0>;
+def : ReadAdvance<ReadVFMergeV, 0>;
+def : ReadAdvance<ReadVFMergeF, 0>;
+def : ReadAdvance<ReadVFMovF, 0>;
+def : ReadAdvance<ReadVFCvtIToFV, 0>;
+def : ReadAdvance<ReadVFCvtFToIV, 0>;
+def : ReadAdvance<ReadVFWCvtIToFV, 0>;
+def : ReadAdvance<ReadVFWCvtFToIV, 0>;
+def : ReadAdvance<ReadVFWCvtFToFV, 0>;
+def : ReadAdvance<ReadVFNCvtIToFV, 0>;
+def : ReadAdvance<ReadVFNCvtFToIV, 0>;
+def : ReadAdvance<ReadVFNCvtFToFV, 0>;
+
+// 15. Vector Reduction Operations
+def : ReadAdvance<ReadVIRedV, 0>;
+def : ReadAdvance<ReadVIRedV0, 0>;
+def : ReadAdvance<ReadVIWRedV, 0>;
+def : ReadAdvance<ReadVIWRedV0, 0>;
+def : ReadAdvance<ReadVFRedV, 0>;
+def : ReadAdvance<ReadVFRedV0, 0>;
+def : ReadAdvance<ReadVFRedOV, 0>;
+def : ReadAdvance<ReadVFRedOV0, 0>;
+def : ReadAdvance<ReadVFWRedV, 0>;
+def : ReadAdvance<ReadVFWRedV0, 0>;
+def : ReadAdvance<ReadVFWRedOV, 0>;
+def : ReadAdvance<ReadVFWRedOV0, 0>;
+
+// 16. Vector Mask Instructions
+def : ReadAdvance<ReadVMALUV, 0>;
+def : ReadAdvance<ReadVMPopV, 0>;
+def : ReadAdvance<ReadVMFFSV, 0>;
+def : ReadAdvance<ReadVMSFSV, 0>;
+def : ReadAdvance<ReadVMIotV, 0>;
+
+// 17. Vector Permutation Instructions
+def : ReadAdvance<ReadVIMovVX, 0>;
+def : ReadAdvance<ReadVIMovXV, 0>;
+def : ReadAdvance<ReadVIMovXX, 0>;
+def : ReadAdvance<ReadVFMovVF, 0>;
+def : ReadAdvance<ReadVFMovFV, 0>;
+def : ReadAdvance<ReadVFMovFX, 0>;
+def : ReadAdvance<ReadVISlideV, 0>;
+def : ReadAdvance<ReadVISlideX, 0>;
+def : ReadAdvance<ReadVFSlideV, 0>;
+def : ReadAdvance<ReadVFSlideF, 0>;
+def : ReadAdvance<ReadVGatherV, 0>;
+def : ReadAdvance<ReadVGatherX, 0>;
+def : ReadAdvance<ReadVCompressV, 0>;
+def : ReadAdvance<ReadVMov1V, 0>;
+def : ReadAdvance<ReadVMov2V, 0>;
+def : ReadAdvance<ReadVMov4V, 0>;
+def : ReadAdvance<ReadVMov8V, 0>;
+
+// Others
+def : ReadAdvance<ReadVMask, 0>;
+
+} // Unsupported
+} // UnsupportedSchedV
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index b19fdcb0082b..1063134b8a6c 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -17,7 +17,7 @@
#include "RISCVLegalizerInfo.h"
#include "RISCVRegisterBankInfo.h"
#include "RISCVTargetMachine.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
@@ -45,6 +45,11 @@ static cl::opt<unsigned> RVVVectorLMULMax(
"Fractional LMUL values are not supported."),
cl::init(8), cl::Hidden);
+static cl::opt<unsigned> RVVVectorELENMax(
+ "riscv-v-fixed-length-vector-elen-max",
+ cl::desc("The maximum ELEN value to use for fixed length vectors."),
+ cl::init(64), cl::Hidden);
+
void RISCVSubtarget::anchor() {}
RISCVSubtarget &
@@ -106,7 +111,8 @@ const RegisterBankInfo *RISCVSubtarget::getRegBankInfo() const {
}
unsigned RISCVSubtarget::getMaxRVVVectorSizeInBits() const {
- assert(hasStdExtV() && "Tried to get vector length without V support!");
+ assert(hasVInstructions() &&
+ "Tried to get vector length without Zve or V extension support!");
if (RVVVectorBitsMax == 0)
return 0;
assert(RVVVectorBitsMax >= 128 && RVVVectorBitsMax <= 65536 &&
@@ -121,8 +127,8 @@ unsigned RISCVSubtarget::getMaxRVVVectorSizeInBits() const {
}
unsigned RISCVSubtarget::getMinRVVVectorSizeInBits() const {
- assert(hasStdExtV() &&
- "Tried to get vector length without V extension support!");
+ assert(hasVInstructions() &&
+ "Tried to get vector length without Zve or V extension support!");
assert((RVVVectorBitsMin == 0 ||
(RVVVectorBitsMin >= 128 && RVVVectorBitsMax <= 65536 &&
isPowerOf2_32(RVVVectorBitsMin))) &&
@@ -138,13 +144,24 @@ unsigned RISCVSubtarget::getMinRVVVectorSizeInBits() const {
}
unsigned RISCVSubtarget::getMaxLMULForFixedLengthVectors() const {
- assert(hasStdExtV() &&
- "Tried to get maximum LMUL without V extension support!");
+ assert(hasVInstructions() &&
+ "Tried to get vector length without Zve or V extension support!");
assert(RVVVectorLMULMax <= 8 && isPowerOf2_32(RVVVectorLMULMax) &&
"V extension requires a LMUL to be at most 8 and a power of 2!");
- return PowerOf2Floor(std::max<unsigned>(RVVVectorLMULMax, 1));
+ return PowerOf2Floor(
+ std::max<unsigned>(std::min<unsigned>(RVVVectorLMULMax, 8), 1));
+}
+
+unsigned RISCVSubtarget::getMaxELENForFixedLengthVectors() const {
+ assert(hasVInstructions() &&
+ "Tried to get maximum ELEN without Zve or V extension support!");
+ assert(RVVVectorELENMax <= 64 && RVVVectorELENMax >= 8 &&
+ isPowerOf2_32(RVVVectorELENMax) &&
+ "V extension requires a ELEN to be a power of 2 between 8 and 64!");
+ return PowerOf2Floor(
+ std::max<unsigned>(std::min<unsigned>(RVVVectorELENMax, 64), 8));
}
bool RISCVSubtarget::useRVVForFixedLengthVectors() const {
- return hasStdExtV() && getMinRVVVectorSizeInBits() != 0;
+ return hasVInstructions() && getMinRVVVectorSizeInBits() != 0;
}
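[Editor's note] The new ELEN handling above reduces to "keep the flag value in [8, 64] and round down to a power of two". A minimal C++ sketch of that arithmetic, with PowerOf2Floor reimplemented locally rather than pulled from llvm/Support/MathExtras.h (sketch only, not the actual subtarget code):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Local stand-in for llvm::PowerOf2Floor: largest power of two <= X (X >= 1).
static uint64_t powerOf2Floor(uint64_t X) {
  uint64_t P = 1;
  while (P * 2 <= X)
    P *= 2;
  return P;
}

static unsigned maxELENForFixedLengthVectors(unsigned RVVVectorELENMax) {
  // Same expression as the subtarget change above, minus the asserts.
  return powerOf2Floor(
      std::max<unsigned>(std::min<unsigned>(RVVVectorELENMax, 64), 8));
}

int main() {
  // In the real code an assert rejects values outside [8, 64] or values that
  // are not powers of two before this clamp is ever reached.
  printf("%u %u %u\n", maxELENForFixedLengthVectors(64),
         maxELENForFixedLengthVectors(16), maxELENForFixedLengthVectors(8));
  // prints: 64 16 8
  return 0;
}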
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index ce36331e044d..deb2a11f98f1 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -39,7 +39,6 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
bool HasStdExtF = false;
bool HasStdExtD = false;
bool HasStdExtC = false;
- bool HasStdExtB = false;
bool HasStdExtZba = false;
bool HasStdExtZbb = false;
bool HasStdExtZbc = false;
@@ -50,10 +49,10 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
bool HasStdExtZbr = false;
bool HasStdExtZbs = false;
bool HasStdExtZbt = false;
- bool HasStdExtZbproposedc = false;
bool HasStdExtV = false;
bool HasStdExtZvlsseg = false;
bool HasStdExtZvamo = false;
+ bool HasStdExtZfhmin = false;
bool HasStdExtZfh = false;
bool HasRV64 = false;
bool IsRV32E = false;
@@ -107,7 +106,6 @@ public:
bool hasStdExtF() const { return HasStdExtF; }
bool hasStdExtD() const { return HasStdExtD; }
bool hasStdExtC() const { return HasStdExtC; }
- bool hasStdExtB() const { return HasStdExtB; }
bool hasStdExtZba() const { return HasStdExtZba; }
bool hasStdExtZbb() const { return HasStdExtZbb; }
bool hasStdExtZbc() const { return HasStdExtZbc; }
@@ -118,10 +116,10 @@ public:
bool hasStdExtZbr() const { return HasStdExtZbr; }
bool hasStdExtZbs() const { return HasStdExtZbs; }
bool hasStdExtZbt() const { return HasStdExtZbt; }
- bool hasStdExtZbproposedc() const { return HasStdExtZbproposedc; }
bool hasStdExtV() const { return HasStdExtV; }
bool hasStdExtZvlsseg() const { return HasStdExtZvlsseg; }
bool hasStdExtZvamo() const { return HasStdExtZvamo; }
+ bool hasStdExtZfhmin() const { return HasStdExtZfhmin; }
bool hasStdExtZfh() const { return HasStdExtZfh; }
bool is64Bit() const { return HasRV64; }
bool isRV32E() const { return IsRV32E; }
@@ -135,8 +133,17 @@ public:
assert(i < RISCV::NUM_TARGET_REGS && "Register out of range");
return UserReservedRegister[i];
}
+
+ // Vector codegen related methods.
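+ // Note: these currently all key off HasStdExtV; the finer-grained names
+ // anticipate the Zve* subsets mentioned in the subtarget asserts.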
+ bool hasVInstructions() const { return HasStdExtV; }
+ bool hasVInstructionsI64() const { return HasStdExtV; }
+ bool hasVInstructionsF16() const { return HasStdExtV && hasStdExtZfh(); }
+ bool hasVInstructionsF32() const { return HasStdExtV && hasStdExtF(); }
+ bool hasVInstructionsF64() const { return HasStdExtV && hasStdExtD(); }
+ // F16 and F64 both require F32.
+ bool hasVInstructionsAnyF() const { return hasVInstructionsF32(); }
unsigned getMaxInterleaveFactor() const {
- return hasStdExtV() ? MaxInterleaveFactor : 1;
+ return hasVInstructions() ? MaxInterleaveFactor : 1;
}
protected:
@@ -158,6 +165,7 @@ public:
unsigned getMaxRVVVectorSizeInBits() const;
unsigned getMinRVVVectorSizeInBits() const;
unsigned getMaxLMULForFixedLengthVectors() const;
+ unsigned getMaxELENForFixedLengthVectors() const;
bool useRVVForFixedLengthVectors() const;
};
} // End llvm namespace
diff --git a/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
index a561772b650b..41599dd8bb3f 100644
--- a/llvm/lib/Target/RISCV/RISCVSystemOperands.td
+++ b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
@@ -385,6 +385,7 @@ def : SysReg<"dscratch1", 0x7B3>;
def : SysReg<"vstart", 0x008>;
def : SysReg<"vxsat", 0x009>;
def : SysReg<"vxrm", 0x00A>;
+def : SysReg<"vcsr", 0x00F>;
def : SysReg<"vl", 0xC20>;
def : SysReg<"vtype", 0xC21>;
def : SysReg<"vlenb", 0xC22>;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index b18ee6009217..b421eba8d442 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -27,8 +27,8 @@
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/InitializePasses.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
@@ -37,6 +37,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
auto *PR = PassRegistry::getPassRegistry();
initializeGlobalISel(*PR);
+ initializeRISCVGatherScatterLoweringPass(*PR);
initializeRISCVMergeBaseOffsetOptPass(*PR);
initializeRISCVExpandPseudoPass(*PR);
initializeRISCVInsertVSETVLIPass(*PR);
@@ -149,6 +150,9 @@ TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) {
void RISCVPassConfig::addIRPasses() {
addPass(createAtomicExpandPass());
+
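+ // RISCVGatherScatterLowering rewrites masked gathers/scatters whose indices
+ // form a strided pattern into RVV strided accesses; run it before the
+ // generic IR pipeline (descriptive note; see the pass for details).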
+ addPass(createRISCVGatherScatterLoweringPass());
+
TargetPassConfig::addIRPasses();
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index fd110db1064b..56f0952fafc9 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -52,8 +52,15 @@ InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
// split up large offsets in GEP into better parts than ConstantHoisting
// can.
return TTI::TCC_Free;
- case Instruction::Add:
case Instruction::And:
+ // zext.h
+ if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
+ return TTI::TCC_Free;
+ // zext.w (part of Zba)
+ if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
+ return TTI::TCC_Free;
+ LLVM_FALLTHROUGH;
+ case Instruction::Add:
case Instruction::Or:
case Instruction::Xor:
case Instruction::Mul:
@@ -125,7 +132,7 @@ Optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
// know whether the LoopVectorizer is safe to do or not.
// We only consider to use single vector register (LMUL = 1) to vectorize.
unsigned MaxVectorSizeInBits = ST->getMaxRVVVectorSizeInBits();
- if (ST->hasStdExtV() && MaxVectorSizeInBits != 0)
+ if (ST->hasVInstructions() && MaxVectorSizeInBits != 0)
return MaxVectorSizeInBits / RISCV::RVVBitsPerBlock;
return BaseT::getMaxVScale();
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 95dacb1e6285..675681616d6e 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -18,6 +18,7 @@
#include "RISCVSubtarget.h"
#include "RISCVTargetMachine.h"
+#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Function.h"
@@ -54,7 +55,7 @@ public:
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
bool shouldExpandReduction(const IntrinsicInst *II) const;
- bool supportsScalableVectors() const { return ST->hasStdExtV(); }
+ bool supportsScalableVectors() const { return ST->hasVInstructions(); }
Optional<unsigned> getMaxVScale() const;
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
@@ -63,52 +64,44 @@ public:
return TypeSize::getFixed(ST->getXLen());
case TargetTransformInfo::RGK_FixedWidthVector:
return TypeSize::getFixed(
- ST->hasStdExtV() ? ST->getMinRVVVectorSizeInBits() : 0);
+ ST->hasVInstructions() ? ST->getMinRVVVectorSizeInBits() : 0);
case TargetTransformInfo::RGK_ScalableVector:
return TypeSize::getScalable(
- ST->hasStdExtV() ? ST->getMinRVVVectorSizeInBits() : 0);
+ ST->hasVInstructions() ? RISCV::RVVBitsPerBlock : 0);
}
llvm_unreachable("Unsupported register kind");
}
+ unsigned getMinVectorRegisterBitWidth() const {
+ return ST->hasVInstructions() ? ST->getMinRVVVectorSizeInBits() : 0;
+ }
+
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
const Value *Ptr, bool VariableMask,
Align Alignment,
TTI::TargetCostKind CostKind,
const Instruction *I);
- bool isLegalElementTypeForRVV(Type *ScalarTy) const {
- if (ScalarTy->isPointerTy())
- return true;
-
- if (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
- ScalarTy->isIntegerTy(32) || ScalarTy->isIntegerTy(64))
- return true;
-
- if (ScalarTy->isHalfTy())
- return ST->hasStdExtZfh();
- if (ScalarTy->isFloatTy())
- return ST->hasStdExtF();
- if (ScalarTy->isDoubleTy())
- return ST->hasStdExtD();
-
- return false;
- }
-
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) {
- if (!ST->hasStdExtV())
+ if (!ST->hasVInstructions())
return false;
// Only support fixed vectors if we know the minimum vector size.
if (isa<FixedVectorType>(DataType) && ST->getMinRVVVectorSizeInBits() == 0)
return false;
+ // Don't allow elements larger than the ELEN.
+ // FIXME: How to limit for scalable vectors?
+ if (isa<FixedVectorType>(DataType) &&
+ DataType->getScalarSizeInBits() > ST->getMaxELENForFixedLengthVectors())
+ return false;
+
if (Alignment <
DL.getTypeStoreSize(DataType->getScalarType()).getFixedSize())
return false;
- return isLegalElementTypeForRVV(DataType->getScalarType());
+ return TLI->isLegalElementTypeForRVV(DataType->getScalarType());
}
bool isLegalMaskedLoad(Type *DataType, Align Alignment) {
@@ -119,18 +112,24 @@ public:
}
bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment) {
- if (!ST->hasStdExtV())
+ if (!ST->hasVInstructions())
return false;
// Only support fixed vectors if we know the minimum vector size.
if (isa<FixedVectorType>(DataType) && ST->getMinRVVVectorSizeInBits() == 0)
return false;
+ // Don't allow elements larger than the ELEN.
+ // FIXME: How to limit for scalable vectors?
+ if (isa<FixedVectorType>(DataType) &&
+ DataType->getScalarSizeInBits() > ST->getMaxELENForFixedLengthVectors())
+ return false;
+
if (Alignment <
DL.getTypeStoreSize(DataType->getScalarType()).getFixedSize())
return false;
- return isLegalElementTypeForRVV(DataType->getScalarType());
+ return TLI->isLegalElementTypeForRVV(DataType->getScalarType());
}
bool isLegalMaskedGather(Type *DataType, Align Alignment) {
@@ -150,14 +149,14 @@ public:
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc,
ElementCount VF) const {
- if (!ST->hasStdExtV())
+ if (!ST->hasVInstructions())
return false;
if (!VF.isScalable())
return true;
Type *Ty = RdxDesc.getRecurrenceType();
- if (!isLegalElementTypeForRVV(Ty))
+ if (!TLI->isLegalElementTypeForRVV(Ty))
return false;
switch (RdxDesc.getRecurrenceKind()) {
diff --git a/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp b/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
index 4f265d556380..27d1326d5f6c 100644
--- a/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
+++ b/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/RISCVTargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
Target &llvm::getTheRISCV32Target() {
diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 9a2df8ca7fe9..48e6903bd1b1 100644
--- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -25,10 +25,10 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SMLoc.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
diff --git a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
index 5c4419c108c0..142124a8e0d9 100644
--- a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
+++ b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
@@ -17,7 +17,7 @@
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index 51eccfa52359..e950f9582f09 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -15,8 +15,8 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/EndianStream.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -274,7 +274,8 @@ namespace {
llvm_unreachable("relaxInstruction() unimplemented");
}
- bool writeNopData(raw_ostream &OS, uint64_t Count) const override {
+ bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const override {
// Cannot emit NOP with size not multiple of 32 bits.
if (Count % 4 != 0)
return false;
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index 9531e3105fe2..49b75b7e0bd1 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -18,8 +18,8 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index 2006c9bede34..f6f9c0a1de81 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -29,7 +29,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index 2007303d9903..ed1faf6b1fe8 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1614,11 +1614,14 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
if (!Subtarget->is64Bit()) {
// These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::MULO_I64, nullptr);
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
}
+ setLibcallName(RTLIB::MULO_I128, nullptr);
+
if (!Subtarget->isV9()) {
// SparcV8 does not have FNEGD and FABSD.
setOperationAction(ISD::FNEG, MVT::f64, Custom);
@@ -2957,8 +2960,15 @@ static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG,
SDValue ShiftAmt = DAG.getConstant(63, dl, VT);
SDValue RHS = Op.getOperand(1);
- SDValue HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, ShiftAmt);
- SDValue HiRHS = DAG.getNode(ISD::SRA, dl, MVT::i64, RHS, ShiftAmt);
+ SDValue HiLHS, HiRHS;
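+ // Build the high 64-bit halves of the 128-bit multiplicands: an arithmetic
+ // shift reproduces the sign bit for the signed case, while the unsigned
+ // case simply uses zero.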
+ if (isSigned) {
+ HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, ShiftAmt);
+ HiRHS = DAG.getNode(ISD::SRA, dl, MVT::i64, RHS, ShiftAmt);
+ } else {
+ HiLHS = DAG.getConstant(0, dl, VT);
+ HiRHS = DAG.getConstant(0, dl, MVT::i64);
+ }
+
SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };
TargetLowering::MakeLibCallOptions CallOptions;
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
index dc3a41c63098..a8a0b2cc9e67 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -20,8 +20,8 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td
index b161e2a9d087..5e305fc9df71 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.td
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td
@@ -369,8 +369,7 @@ multiclass Load<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
// TODO: Instructions of the LoadASI class are currently asm only; hooking up
// CodeGen's address spaces to use these is a future task.
-class LoadASI<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
- RegisterClass RC, ValueType Ty, InstrItinClass itin = NoItinerary> :
+class LoadASI<string OpcStr, bits<6> Op3Val, RegisterClass RC> :
F3_1_asi<3, Op3Val, (outs RC:$dst), (ins MEMrr:$addr, i8imm:$asi),
!strconcat(OpcStr, "a [$addr] $asi, $dst"),
[]>;
@@ -380,7 +379,7 @@ multiclass LoadA<string OpcStr, bits<6> Op3Val, bits<6> LoadAOp3Val,
SDPatternOperator OpNode, RegisterClass RC, ValueType Ty,
InstrItinClass itin = NoItinerary> :
Load<OpcStr, Op3Val, OpNode, RC, Ty, itin> {
- def Arr : LoadASI<OpcStr, LoadAOp3Val, OpNode, RC, Ty>;
+ def Arr : LoadASI<OpcStr, LoadAOp3Val, RC>;
}
// The LDSTUB instruction is supported for asm only.
@@ -411,8 +410,7 @@ multiclass Store<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
// TODO: Instructions of the StoreASI class are currently asm only; hooking up
// CodeGen's address spaces to use these is a future task.
-class StoreASI<string OpcStr, bits<6> Op3Val,
- SDPatternOperator OpNode, RegisterClass RC, ValueType Ty,
+class StoreASI<string OpcStr, bits<6> Op3Val, RegisterClass RC,
InstrItinClass itin = IIC_st> :
F3_1_asi<3, Op3Val, (outs), (ins MEMrr:$addr, RC:$rd, i8imm:$asi),
!strconcat(OpcStr, "a $rd, [$addr] $asi"),
@@ -420,10 +418,9 @@ class StoreASI<string OpcStr, bits<6> Op3Val,
itin>;
multiclass StoreA<string OpcStr, bits<6> Op3Val, bits<6> StoreAOp3Val,
- SDPatternOperator OpNode, RegisterClass RC, ValueType Ty,
- InstrItinClass itin = IIC_st> :
+ SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> :
Store<OpcStr, Op3Val, OpNode, RC, Ty> {
- def Arr : StoreASI<OpcStr, StoreAOp3Val, OpNode, RC, Ty, itin>;
+ def Arr : StoreASI<OpcStr, StoreAOp3Val, RC>;
}
//===----------------------------------------------------------------------===//
@@ -523,12 +520,12 @@ let DecoderMethod = "DecodeLoadIntPair" in
// Section B.2 - Load Floating-point Instructions, p. 92
let DecoderMethod = "DecodeLoadFP" in {
defm LDF : Load<"ld", 0b100000, load, FPRegs, f32, IIC_iu_or_fpu_instr>;
- def LDFArr : LoadASI<"ld", 0b110000, load, FPRegs, f32, IIC_iu_or_fpu_instr>,
+ def LDFArr : LoadASI<"ld", 0b110000, FPRegs>,
Requires<[HasV9]>;
}
let DecoderMethod = "DecodeLoadDFP" in {
defm LDDF : Load<"ldd", 0b100011, load, DFPRegs, f64, IIC_ldd>;
- def LDDFArr : LoadASI<"ldd", 0b110011, load, DFPRegs, f64>,
+ def LDDFArr : LoadASI<"ldd", 0b110011, DFPRegs>,
Requires<[HasV9]>;
}
let DecoderMethod = "DecodeLoadQFP" in
@@ -573,17 +570,17 @@ let DecoderMethod = "DecodeStoreInt" in {
}
let DecoderMethod = "DecodeStoreIntPair" in
- defm STD : StoreA<"std", 0b000111, 0b010111, store, IntPair, v2i32, IIC_std>;
+ defm STD : StoreA<"std", 0b000111, 0b010111, store, IntPair, v2i32>;
// Section B.5 - Store Floating-point Instructions, p. 97
let DecoderMethod = "DecodeStoreFP" in {
defm STF : Store<"st", 0b100100, store, FPRegs, f32>;
- def STFArr : StoreASI<"st", 0b110100, store, FPRegs, f32>,
+ def STFArr : StoreASI<"st", 0b110100, FPRegs>,
Requires<[HasV9]>;
}
let DecoderMethod = "DecodeStoreDFP" in {
defm STDF : Store<"std", 0b100111, store, DFPRegs, f64, IIC_std>;
- def STDFArr : StoreASI<"std", 0b110111, store, DFPRegs, f64>,
+ def STDFArr : StoreASI<"std", 0b110111, DFPRegs>,
Requires<[HasV9]>;
}
let DecoderMethod = "DecodeStoreQFP" in
@@ -1623,6 +1620,17 @@ let hasSideEffects = 1 in {
}
}
+// Section A.42 - Prefetch Data
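+// These are asm-only for now; the 5-bit fcn field is exposed as a raw
+// immediate in $rd.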
+let Predicates = [HasV9] in {
+ def PREFETCHr : F3_1<3, 0b101101,
+ (outs), (ins MEMrr:$addr, shift_imm5:$rd),
+ "prefetch [$addr], $rd", []>;
+ def PREFETCHi : F3_2<3, 0b101101,
+ (outs), (ins MEMri:$addr, shift_imm5:$rd),
+ "prefetch [$addr], $rd", []>;
+}
+
+
// Section A.43 - Read Privileged Register Instructions
let Predicates = [HasV9] in {
diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/llvm/lib/Target/Sparc/SparcSubtarget.cpp
index abc47ef51563..618a8633f0a9 100644
--- a/llvm/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/llvm/lib/Target/Sparc/SparcSubtarget.cpp
@@ -12,8 +12,8 @@
#include "SparcSubtarget.h"
#include "Sparc.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
index 083339bc157c..27c49a408a02 100644
--- a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -17,7 +17,7 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcTarget() {
diff --git a/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
index 3bf5907012da..1138788ac7fa 100644
--- a/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
+++ b/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/SparcTargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
Target &llvm::getTheSparcTarget() {
diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 0de24245cfcc..40ed417d0817 100644
--- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -9,6 +9,7 @@
#include "MCTargetDesc/SystemZInstPrinter.h"
#include "MCTargetDesc/SystemZMCAsmInfo.h"
#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "SystemZTargetStreamer.h"
#include "TargetInfo/SystemZTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
@@ -25,10 +26,10 @@
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SMLoc.h"
-#include "llvm/Support/TargetRegistry.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -39,13 +40,15 @@
using namespace llvm;
-// Return true if Expr is in the range [MinValue, MaxValue].
-static bool inRange(const MCExpr *Expr, int64_t MinValue, int64_t MaxValue) {
+// Return true if Expr is in the range [MinValue, MaxValue]. If AllowSymbol
+// is true, any non-constant MCExpr is also accepted (symbolic address
+// displacement).
+static bool inRange(const MCExpr *Expr, int64_t MinValue, int64_t MaxValue,
+ bool AllowSymbol = false) {
if (auto *CE = dyn_cast<MCConstantExpr>(Expr)) {
int64_t Value = CE->getValue();
return Value >= MinValue && Value <= MaxValue;
}
- return false;
+ return AllowSymbol;
}
namespace {
@@ -264,10 +267,10 @@ public:
return isMem(MemKind) && Mem.RegKind == RegKind;
}
bool isMemDisp12(MemoryKind MemKind, RegisterKind RegKind) const {
- return isMem(MemKind, RegKind) && inRange(Mem.Disp, 0, 0xfff);
+ return isMem(MemKind, RegKind) && inRange(Mem.Disp, 0, 0xfff, true);
}
bool isMemDisp20(MemoryKind MemKind, RegisterKind RegKind) const {
- return isMem(MemKind, RegKind) && inRange(Mem.Disp, -524288, 524287);
+ return isMem(MemKind, RegKind) && inRange(Mem.Disp, -524288, 524287, true);
}
bool isMemDisp12Len4(RegisterKind RegKind) const {
return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x10);
@@ -405,6 +408,13 @@ private:
SMLoc StartLoc, EndLoc;
};
+ SystemZTargetStreamer &getTargetStreamer() {
+ assert(getParser().getStreamer().getTargetStreamer() &&
+ "do not have a target streamer");
+ MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
+ return static_cast<SystemZTargetStreamer &>(TS);
+ }
+
bool parseRegister(Register &Reg, bool RestoreOnFailure = false);
bool parseIntegerRegister(Register &Reg, RegisterGroup Group);
@@ -420,6 +430,7 @@ private:
bool parseAddressRegister(Register &Reg);
bool ParseDirectiveInsn(SMLoc L);
+ bool ParseDirectiveMachine(SMLoc L);
OperandMatchResultTy parseAddress(OperandVector &Operands,
MemoryKind MemKind,
@@ -1210,6 +1221,8 @@ bool SystemZAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".insn")
return ParseDirectiveInsn(DirectiveID.getLoc());
+ if (IDVal == ".machine")
+ return ParseDirectiveMachine(DirectiveID.getLoc());
return true;
}
@@ -1322,6 +1335,28 @@ bool SystemZAsmParser::ParseDirectiveInsn(SMLoc L) {
return false;
}
+/// ParseDirectiveMachine
+/// ::= .machine [ mcpu ]
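+/// e.g. .machine z13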
+bool SystemZAsmParser::ParseDirectiveMachine(SMLoc L) {
+ MCAsmParser &Parser = getParser();
+ if (Parser.getTok().isNot(AsmToken::Identifier) &&
+ Parser.getTok().isNot(AsmToken::String))
+ return Error(L, "unexpected token in '.machine' directive");
+
+ StringRef CPU = Parser.getTok().getIdentifier();
+ Parser.Lex();
+ if (parseToken(AsmToken::EndOfStatement))
+ return addErrorSuffix(" in '.machine' directive");
+
+ MCSubtargetInfo &STI = copySTI();
+ STI.setDefaultFeatures(CPU, /*TuneCPU*/ CPU, "");
+ setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+
+ getTargetStreamer().emitMachine(CPU);
+
+ return false;
+}
+
bool SystemZAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc, bool RestoreOnFailure) {
Register Reg;
@@ -1486,10 +1521,6 @@ bool SystemZAsmParser::parseOperand(OperandVector &Operands,
return false;
}
-static std::string SystemZMnemonicSpellCheck(StringRef S,
- const FeatureBitset &FBS,
- unsigned VariantID = 0);
-
bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
diff --git a/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
index e81db1030c01..5eba150dadc3 100644
--- a/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
+++ b/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -13,8 +13,8 @@
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetRegistry.h"
#include <cassert>
#include <cstdint>
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp
index f3f3f096da33..0cb6bfaaebfb 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp
@@ -24,9 +24,9 @@ using namespace llvm;
#include "SystemZGenAsmWriter.inc"
void SystemZInstPrinter::printAddress(const MCAsmInfo *MAI, unsigned Base,
- int64_t Disp, unsigned Index,
+ const MCOperand &DispMO, unsigned Index,
raw_ostream &O) {
- O << Disp;
+ printOperand(DispMO, MAI, O);
if (Base || Index) {
O << '(';
if (Index) {
@@ -194,23 +194,23 @@ void SystemZInstPrinter::printOperand(const MCInst *MI, int OpNum,
void SystemZInstPrinter::printBDAddrOperand(const MCInst *MI, int OpNum,
raw_ostream &O) {
- printAddress(&MAI, MI->getOperand(OpNum).getReg(),
- MI->getOperand(OpNum + 1).getImm(), 0, O);
+ printAddress(&MAI, MI->getOperand(OpNum).getReg(), MI->getOperand(OpNum + 1),
+ 0, O);
}
void SystemZInstPrinter::printBDXAddrOperand(const MCInst *MI, int OpNum,
raw_ostream &O) {
- printAddress(&MAI, MI->getOperand(OpNum).getReg(),
- MI->getOperand(OpNum + 1).getImm(),
+ printAddress(&MAI, MI->getOperand(OpNum).getReg(), MI->getOperand(OpNum + 1),
MI->getOperand(OpNum + 2).getReg(), O);
}
void SystemZInstPrinter::printBDLAddrOperand(const MCInst *MI, int OpNum,
raw_ostream &O) {
unsigned Base = MI->getOperand(OpNum).getReg();
- uint64_t Disp = MI->getOperand(OpNum + 1).getImm();
+ const MCOperand &DispMO = MI->getOperand(OpNum + 1);
uint64_t Length = MI->getOperand(OpNum + 2).getImm();
- O << Disp << '(' << Length;
+ printOperand(DispMO, &MAI, O);
+ O << '(' << Length;
if (Base) {
O << ",";
printRegName(O, Base);
@@ -221,9 +221,10 @@ void SystemZInstPrinter::printBDLAddrOperand(const MCInst *MI, int OpNum,
void SystemZInstPrinter::printBDRAddrOperand(const MCInst *MI, int OpNum,
raw_ostream &O) {
unsigned Base = MI->getOperand(OpNum).getReg();
- uint64_t Disp = MI->getOperand(OpNum + 1).getImm();
+ const MCOperand &DispMO = MI->getOperand(OpNum + 1);
unsigned Length = MI->getOperand(OpNum + 2).getReg();
- O << Disp << "(";
+ printOperand(DispMO, &MAI, O);
+ O << "(";
printRegName(O, Length);
if (Base) {
O << ",";
@@ -234,8 +235,7 @@ void SystemZInstPrinter::printBDRAddrOperand(const MCInst *MI, int OpNum,
void SystemZInstPrinter::printBDVAddrOperand(const MCInst *MI, int OpNum,
raw_ostream &O) {
- printAddress(&MAI, MI->getOperand(OpNum).getReg(),
- MI->getOperand(OpNum + 1).getImm(),
+ printAddress(&MAI, MI->getOperand(OpNum).getReg(), MI->getOperand(OpNum + 1),
MI->getOperand(OpNum + 2).getReg(), O);
}
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h
index 0a57ca0082e6..008bf747e5a1 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h
@@ -33,8 +33,9 @@ public:
static const char *getRegisterName(unsigned RegNo);
// Print an address with the given base, displacement and index.
- static void printAddress(const MCAsmInfo *MAI, unsigned Base, int64_t Disp,
- unsigned Index, raw_ostream &O);
+ static void printAddress(const MCAsmInfo *MAI, unsigned Base,
+ const MCOperand &DispMO, unsigned Index,
+ raw_ostream &O);
// Print the given operand.
static void printOperand(const MCOperand &MO, const MCAsmInfo *MAI,
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index 134c85e822be..0f5e0b9672a9 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -10,6 +10,8 @@
#include "MCTargetDesc/SystemZMCTargetDesc.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCInst.h"
@@ -21,7 +23,8 @@ using namespace llvm;
// Value is a fully-resolved relocation value: Symbol + Addend [- Pivot].
// Return the bits that should be installed in a relocation field for
// fixup kind Kind.
-static uint64_t extractBitsForFixup(MCFixupKind Kind, uint64_t Value) {
+static uint64_t extractBitsForFixup(MCFixupKind Kind, uint64_t Value,
+ const MCFixup &Fixup, MCContext &Ctx) {
if (Kind < FirstTargetFixupKind)
return Value;
@@ -32,6 +35,24 @@ static uint64_t extractBitsForFixup(MCFixupKind Kind, uint64_t Value) {
case SystemZ::FK_390_PC32DBL:
return (int64_t)Value / 2;
+ case SystemZ::FK_390_12:
+ if (!isUInt<12>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "displacement exceeds uint12");
+ return 0;
+ }
+ return Value;
+
+ case SystemZ::FK_390_20: {
+ if (!isInt<20>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "displacement exceeds int20");
+ return 0;
+ }
+ // The high byte of a 20-bit displacement value comes first.
+ uint64_t DLo = Value & 0xfff;
+ uint64_t DHi = (Value >> 12) & 0xff;
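+ // e.g. Value = 0x2345A gives DLo = 0x45A and DHi = 0x23, encoded as 0x45A23.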
+ return (DLo << 8) | DHi;
+ }
+
case SystemZ::FK_390_TLS_CALL:
return 0;
}
@@ -63,7 +84,8 @@ public:
const MCAsmLayout &Layout) const override {
return false;
}
- bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const override;
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
return createSystemZObjectWriter(OSABI);
@@ -94,7 +116,9 @@ SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{ "FK_390_PC16DBL", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
{ "FK_390_PC24DBL", 0, 24, MCFixupKindInfo::FKF_IsPCRel },
{ "FK_390_PC32DBL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "FK_390_TLS_CALL", 0, 0, 0 }
+ { "FK_390_TLS_CALL", 0, 0, 0 },
+ { "FK_390_12", 4, 12, 0 },
+ { "FK_390_20", 4, 20, 0 }
};
// Fixup kinds from .reloc directive are like R_390_NONE. They
@@ -132,7 +156,7 @@ void SystemZMCAsmBackend::applyFixup(const MCAssembler &Asm,
assert(Offset + Size <= Data.size() && "Invalid fixup offset!");
// Big-endian insertion of Size bytes.
- Value = extractBitsForFixup(Kind, Value);
+ Value = extractBitsForFixup(Kind, Value, Fixup, Asm.getContext());
if (BitSize < 64)
Value &= ((uint64_t)1 << BitSize) - 1;
unsigned ShiftValue = (Size * 8) - 8;
@@ -142,7 +166,8 @@ void SystemZMCAsmBackend::applyFixup(const MCAssembler &Asm,
}
}
-bool SystemZMCAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+bool SystemZMCAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const {
for (uint64_t I = 0; I != Count; ++I)
OS << '\x7';
return true;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index fa4864299586..e61b07e973e9 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -12,37 +12,39 @@
using namespace llvm;
-SystemZMCAsmInfo::SystemZMCAsmInfo(const Triple &TT) {
- CodePointerSize = 8;
+SystemZMCAsmInfoELF::SystemZMCAsmInfoELF(const Triple &TT) {
+ AssemblerDialect = AD_ATT;
CalleeSaveStackSlotSize = 8;
+ CodePointerSize = 8;
+ Data64bitsDirective = "\t.quad\t";
+ ExceptionsType = ExceptionHandling::DwarfCFI;
IsLittleEndian = false;
-
- AssemblerDialect = TT.isOSzOS() ? AD_HLASM : AD_ATT;
-
MaxInstLength = 6;
-
- CommentString = AssemblerDialect == AD_HLASM ? "*" : "#";
- RestrictCommentStringToStartOfStatement = (AssemblerDialect == AD_HLASM);
- AllowAdditionalComments = (AssemblerDialect == AD_ATT);
- AllowAtAtStartOfIdentifier = (AssemblerDialect == AD_HLASM);
- AllowDollarAtStartOfIdentifier = (AssemblerDialect == AD_HLASM);
- AllowHashAtStartOfIdentifier = (AssemblerDialect == AD_HLASM);
- DotIsPC = (AssemblerDialect == AD_ATT);
- StarIsPC = (AssemblerDialect == AD_HLASM);
- EmitGNUAsmStartIndentationMarker = (AssemblerDialect == AD_ATT);
- AllowAtInName = (AssemblerDialect == AD_HLASM);
- EmitLabelsInUpperCase = (AssemblerDialect == AD_HLASM);
-
- ZeroDirective = "\t.space\t";
- Data64bitsDirective = "\t.quad\t";
- UsesELFSectionDirectiveForBSS = true;
SupportsDebugInformation = true;
- ExceptionsType = ExceptionHandling::DwarfCFI;
+ UsesELFSectionDirectiveForBSS = true;
+ ZeroDirective = "\t.space\t";
}
-bool SystemZMCAsmInfo::isAcceptableChar(char C) const {
- if (AssemblerDialect == AD_ATT)
- return MCAsmInfo::isAcceptableChar(C);
+SystemZMCAsmInfoGOFF::SystemZMCAsmInfoGOFF(const Triple &TT) {
+ AllowAdditionalComments = false;
+ AllowAtInName = true;
+ AllowAtAtStartOfIdentifier = true;
+ AllowDollarAtStartOfIdentifier = true;
+ AllowHashAtStartOfIdentifier = true;
+ AssemblerDialect = AD_HLASM;
+ CalleeSaveStackSlotSize = 8;
+ CodePointerSize = 8;
+ CommentString = "*";
+ DotIsPC = false;
+ EmitGNUAsmStartIndentationMarker = false;
+ EmitLabelsInUpperCase = true;
+ IsLittleEndian = false;
+ MaxInstLength = 6;
+ RestrictCommentStringToStartOfStatement = true;
+ StarIsPC = true;
+ SupportsDebugInformation = true;
+}
+bool SystemZMCAsmInfoGOFF::isAcceptableChar(char C) const {
return MCAsmInfo::isAcceptableChar(C) || C == '#';
}
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
index 389575d14679..b2f191424d01 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
@@ -10,15 +10,21 @@
#define LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCASMINFO_H
#include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/MC/MCAsmInfoGOFF.h"
#include "llvm/Support/Compiler.h"
namespace llvm {
class Triple;
enum SystemZAsmDialect { AD_ATT = 0, AD_HLASM = 1 };
-class SystemZMCAsmInfo : public MCAsmInfoELF {
+class SystemZMCAsmInfoELF : public MCAsmInfoELF {
public:
- explicit SystemZMCAsmInfo(const Triple &TT);
+ explicit SystemZMCAsmInfoELF(const Triple &TT);
+};
+
+class SystemZMCAsmInfoGOFF : public MCAsmInfoGOFF {
+public:
+ explicit SystemZMCAsmInfoGOFF(const Triple &TT);
bool isAcceptableChar(char C) const override;
};
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
index a5ccf4f68ffd..e280e4aaf3d8 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -12,6 +12,7 @@
#include "MCTargetDesc/SystemZMCFixups.h"
#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "SystemZInstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
@@ -60,6 +61,12 @@ private:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ // Return the displacement value for the OpNum operand. If it is a symbol,
+ // add a fixup for it and return 0.
+ uint64_t getDispOpValue(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ SystemZ::FixupKind Kind) const;
+
// Called by the TableGen code to get the binary encoding of an address.
// The index or length, if any, is encoded first, followed by the base,
// followed by the displacement. In a 20-bit displacement,
@@ -180,11 +187,29 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
}
uint64_t SystemZMCCodeEmitter::
+getDispOpValue(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ SystemZ::FixupKind Kind) const {
+ const MCOperand &MO = MI.getOperand(OpNum);
+ if (MO.isImm())
+ return static_cast<uint64_t>(MO.getImm());
+ if (MO.isExpr()) {
+ // All instructions follow the pattern where the first displacement field
+ // sits at a 2-byte offset into the instruction and the second at a
+ // 4-byte offset.
+ unsigned ByteOffs = Fixups.size() == 0 ? 2 : 4;
+ Fixups.push_back(MCFixup::create(ByteOffs, MO.getExpr(), (MCFixupKind)Kind));
+ assert(Fixups.size() <= 2 && "More than two memory operands in MI?");
+ return 0;
+ }
+ llvm_unreachable("Unexpected operand type!");
+}
+
+uint64_t SystemZMCCodeEmitter::
getBDAddr12Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
- uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Disp = getDispOpValue(MI, OpNum + 1, Fixups, SystemZ::FK_390_12);
assert(isUInt<4>(Base) && isUInt<12>(Disp));
return (Base << 12) | Disp;
}
@@ -194,7 +219,7 @@ getBDAddr20Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
- uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Disp = getDispOpValue(MI, OpNum + 1, Fixups, SystemZ::FK_390_20);
assert(isUInt<4>(Base) && isInt<20>(Disp));
return (Base << 20) | ((Disp & 0xfff) << 8) | ((Disp & 0xff000) >> 12);
}
@@ -204,7 +229,7 @@ getBDXAddr12Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
- uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Disp = getDispOpValue(MI, OpNum + 1, Fixups, SystemZ::FK_390_12);
uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI);
assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<4>(Index));
return (Index << 16) | (Base << 12) | Disp;
@@ -215,7 +240,7 @@ getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
- uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Disp = getDispOpValue(MI, OpNum + 1, Fixups, SystemZ::FK_390_20);
uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI);
assert(isUInt<4>(Base) && isInt<20>(Disp) && isUInt<4>(Index));
return (Index << 24) | (Base << 20) | ((Disp & 0xfff) << 8)
@@ -227,7 +252,7 @@ getBDLAddr12Len4Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
- uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Disp = getDispOpValue(MI, OpNum + 1, Fixups, SystemZ::FK_390_12);
uint64_t Len = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI) - 1;
assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<4>(Len));
return (Len << 16) | (Base << 12) | Disp;
@@ -238,7 +263,7 @@ getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
- uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Disp = getDispOpValue(MI, OpNum + 1, Fixups, SystemZ::FK_390_12);
uint64_t Len = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI) - 1;
assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<8>(Len));
return (Len << 16) | (Base << 12) | Disp;
@@ -249,7 +274,7 @@ getBDRAddr12Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
- uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Disp = getDispOpValue(MI, OpNum + 1, Fixups, SystemZ::FK_390_12);
uint64_t Len = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI);
assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<4>(Len));
return (Len << 16) | (Base << 12) | Disp;
@@ -260,7 +285,7 @@ getBDVAddr12Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
- uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Disp = getDispOpValue(MI, OpNum + 1, Fixups, SystemZ::FK_390_12);
uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI);
assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<5>(Index));
return (Index << 16) | (Base << 12) | Disp;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
index 14f6198183b9..1f62baabb9e7 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
@@ -20,6 +20,8 @@ enum FixupKind {
FK_390_PC24DBL,
FK_390_PC32DBL,
FK_390_TLS_CALL,
+ FK_390_12,
+ FK_390_20,
// Marker
LastTargetFixupKind,
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
index 0b3e7b15df13..c23463ab9bde 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
@@ -46,6 +46,8 @@ static unsigned getAbsoluteReloc(unsigned Kind) {
case FK_Data_2: return ELF::R_390_16;
case FK_Data_4: return ELF::R_390_32;
case FK_Data_8: return ELF::R_390_64;
+ case SystemZ::FK_390_12: return ELF::R_390_12;
+ case SystemZ::FK_390_20: return ELF::R_390_20;
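+ // These back symbolic displacements such as "l %r1,sym(%r2)" (12-bit)
+ // and "lg %r1,sym(%r2)" (20-bit).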
}
llvm_unreachable("Unsupported absolute address");
}
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 2a53dda84144..c7b73fd3b805 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -9,13 +9,15 @@
#include "SystemZMCTargetDesc.h"
#include "SystemZInstPrinter.h"
#include "SystemZMCAsmInfo.h"
+#include "SystemZTargetStreamer.h"
#include "TargetInfo/SystemZTargetInfo.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
@@ -149,7 +151,10 @@ unsigned SystemZMC::getFirstReg(unsigned Reg) {
static MCAsmInfo *createSystemZMCAsmInfo(const MCRegisterInfo &MRI,
const Triple &TT,
const MCTargetOptions &Options) {
- MCAsmInfo *MAI = new SystemZMCAsmInfo(TT);
+ if (TT.isOSzOS())
+ return new SystemZMCAsmInfoGOFF(TT);
+
+ MCAsmInfo *MAI = new SystemZMCAsmInfoELF(TT);
MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(
nullptr, MRI.getDwarfRegNum(SystemZ::R15D, true),
SystemZMC::ELFCFAOffsetFromInitialSP);
@@ -182,6 +187,53 @@ static MCInstPrinter *createSystemZMCInstPrinter(const Triple &T,
return new SystemZInstPrinter(MAI, MII, MRI);
}
+void SystemZTargetStreamer::emitConstantPools() {
+ // Emit EXRL target instructions.
+ if (EXRLTargets2Sym.empty())
+ return;
+ // Switch to the .text section.
+ const MCObjectFileInfo &OFI = *Streamer.getContext().getObjectFileInfo();
+ Streamer.SwitchSection(OFI.getTextSection());
+ for (auto &I : EXRLTargets2Sym) {
+ Streamer.emitLabel(I.second);
+ const MCInstSTIPair &MCI_STI = I.first;
+ Streamer.emitInstruction(MCI_STI.first, *MCI_STI.second);
+ }
+ EXRLTargets2Sym.clear();
+}
+
+namespace {
+class SystemZTargetAsmStreamer : public SystemZTargetStreamer {
+ formatted_raw_ostream &OS;
+
+public:
+ SystemZTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS)
+ : SystemZTargetStreamer(S), OS(OS) {}
+ void emitMachine(StringRef CPU) override {
+ OS << "\t.machine " << CPU << "\n";
+ }
+};
+
+class SystemZTargetELFStreamer : public SystemZTargetStreamer {
+public:
+ SystemZTargetELFStreamer(MCStreamer &S) : SystemZTargetStreamer(S) {}
+ void emitMachine(StringRef CPU) override {}
+};
+} // end namespace
+
+static MCTargetStreamer *
+createAsmTargetStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter *InstPrint,
+ bool isVerboseAsm) {
+ return new SystemZTargetAsmStreamer(S, OS);
+}
+
+static MCTargetStreamer *
+createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+ return new SystemZTargetELFStreamer(S);
+}
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTargetMC() {
// Register the MCAsmInfo.
TargetRegistry::RegisterMCAsmInfo(getTheSystemZTarget(),
@@ -210,4 +262,12 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTargetMC() {
// Register the MCInstPrinter.
TargetRegistry::RegisterMCInstPrinter(getTheSystemZTarget(),
createSystemZMCInstPrinter);
+
+ // Register the asm streamer.
+ TargetRegistry::RegisterAsmTargetStreamer(getTheSystemZTarget(),
+ createAsmTargetStreamer);
+
+ // Register the object streamer.
+ TargetRegistry::RegisterObjectTargetStreamer(getTheSystemZTarget(),
+ createObjectTargetStreamer);
}
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 46ccd2129969..defab665f924 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -24,7 +24,7 @@
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
@@ -549,15 +549,17 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
Register SrcReg = MI->getOperand(4).getReg();
int64_t SrcDisp = MI->getOperand(5).getImm();
+ SystemZTargetStreamer *TS = getTargetStreamer();
MCSymbol *DotSym = nullptr;
MCInst ET = MCInstBuilder(TargetInsOpc).addReg(DestReg)
.addImm(DestDisp).addImm(1).addReg(SrcReg).addImm(SrcDisp);
- MCInstSTIPair ET_STI(ET, &MF->getSubtarget());
- EXRLT2SymMap::iterator I = EXRLTargets2Sym.find(ET_STI);
- if (I != EXRLTargets2Sym.end())
+ SystemZTargetStreamer::MCInstSTIPair ET_STI(ET, &MF->getSubtarget());
+ SystemZTargetStreamer::EXRLT2SymMap::iterator I =
+ TS->EXRLTargets2Sym.find(ET_STI);
+ if (I != TS->EXRLTargets2Sym.end())
DotSym = I->second;
else
- EXRLTargets2Sym[ET_STI] = DotSym = OutContext.createTempSymbol();
+ TS->EXRLTargets2Sym[ET_STI] = DotSym = OutContext.createTempSymbol();
const MCSymbolRefExpr *Dot = MCSymbolRefExpr::create(DotSym, OutContext);
EmitToStreamer(
*OutStreamer,
@@ -722,19 +724,6 @@ void SystemZAsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
getSubtargetInfo());
}
-void SystemZAsmPrinter::emitEXRLTargetInstructions() {
- if (EXRLTargets2Sym.empty())
- return;
- // Switch to the .text section.
- OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
- for (auto &I : EXRLTargets2Sym) {
- OutStreamer->emitLabel(I.second);
- const MCInstSTIPair &MCI_STI = I.first;
- OutStreamer->emitInstruction(MCI_STI.first, *MCI_STI.second);
- }
- EXRLTargets2Sym.clear();
-}
-
// Convert a SystemZ-specific constant pool modifier into the associated
// MCSymbolRefExpr variant kind.
static MCSymbolRefExpr::VariantKind
@@ -786,14 +775,14 @@ bool SystemZAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
unsigned OpNo,
const char *ExtraCode,
raw_ostream &OS) {
- SystemZInstPrinter::printAddress(MAI, MI->getOperand(OpNo).getReg(),
- MI->getOperand(OpNo + 1).getImm(),
- MI->getOperand(OpNo + 2).getReg(), OS);
+ SystemZInstPrinter::
+ printAddress(MAI, MI->getOperand(OpNo).getReg(),
+ MCOperand::createImm(MI->getOperand(OpNo + 1).getImm()),
+ MI->getOperand(OpNo + 2).getReg(), OS);
return false;
}
void SystemZAsmPrinter::emitEndOfAsmFile(Module &M) {
- emitEXRLTargetInstructions();
emitStackMaps(SM);
}
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
index 11b731103c17..6cfd7bd4c486 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
@@ -11,6 +11,7 @@
#include "SystemZMCInstLower.h"
#include "SystemZTargetMachine.h"
+#include "SystemZTargetStreamer.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/MC/MCInstBuilder.h"
@@ -27,32 +28,11 @@ class LLVM_LIBRARY_VISIBILITY SystemZAsmPrinter : public AsmPrinter {
private:
StackMaps SM;
- typedef std::pair<MCInst, const MCSubtargetInfo *> MCInstSTIPair;
- struct CmpMCInst {
- bool operator()(const MCInstSTIPair &MCI_STI_A,
- const MCInstSTIPair &MCI_STI_B) const {
- if (MCI_STI_A.second != MCI_STI_B.second)
- return uintptr_t(MCI_STI_A.second) < uintptr_t(MCI_STI_B.second);
- const MCInst &A = MCI_STI_A.first;
- const MCInst &B = MCI_STI_B.first;
- assert(A.getNumOperands() == B.getNumOperands() &&
- A.getNumOperands() == 5 && A.getOperand(2).getImm() == 1 &&
- B.getOperand(2).getImm() == 1 && "Unexpected EXRL target MCInst");
- if (A.getOpcode() != B.getOpcode())
- return A.getOpcode() < B.getOpcode();
- if (A.getOperand(0).getReg() != B.getOperand(0).getReg())
- return A.getOperand(0).getReg() < B.getOperand(0).getReg();
- if (A.getOperand(1).getImm() != B.getOperand(1).getImm())
- return A.getOperand(1).getImm() < B.getOperand(1).getImm();
- if (A.getOperand(3).getReg() != B.getOperand(3).getReg())
- return A.getOperand(3).getReg() < B.getOperand(3).getReg();
- if (A.getOperand(4).getImm() != B.getOperand(4).getImm())
- return A.getOperand(4).getImm() < B.getOperand(4).getImm();
- return false;
- }
- };
- typedef std::map<MCInstSTIPair, MCSymbol *, CmpMCInst> EXRLT2SymMap;
- EXRLT2SymMap EXRLTargets2Sym;
+ SystemZTargetStreamer *getTargetStreamer() {
+ MCTargetStreamer *TS = OutStreamer->getTargetStreamer();
+ assert(TS && "do not have a target streamer");
+ return static_cast<SystemZTargetStreamer *>(TS);
+ }
public:
SystemZAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
@@ -77,7 +57,6 @@ private:
void LowerFENTRY_CALL(const MachineInstr &MI, SystemZMCInstLower &MCIL);
void LowerSTACKMAP(const MachineInstr &MI);
void LowerPATCHPOINT(const MachineInstr &MI, SystemZMCInstLower &Lower);
- void emitEXRLTargetInstructions();
};
} // end namespace llvm
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp b/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
index 86eb8365d527..9c73757d7f5c 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
@@ -28,3 +28,7 @@ const MCPhysReg SystemZ::XPLINK64ArgGPRs[SystemZ::XPLINK64NumArgGPRs] = {
const MCPhysReg SystemZ::XPLINK64ArgFPRs[SystemZ::XPLINK64NumArgFPRs] = {
SystemZ::F0D, SystemZ::F2D, SystemZ::F4D, SystemZ::F6D
};
+
+const MCPhysReg SystemZ::XPLINK64ArgVRs[SystemZ::XPLINK64NumArgVRs] = {
+ SystemZ::V24, SystemZ::V25, SystemZ::V26, SystemZ::V27,
+ SystemZ::V28, SystemZ::V29, SystemZ::V30, SystemZ::V31};
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/llvm/lib/Target/SystemZ/SystemZCallingConv.h
index 96c1080d5237..f82c61c0f344 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.h
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.h
@@ -27,6 +27,9 @@ namespace SystemZ {
const unsigned XPLINK64NumArgFPRs = 4;
extern const MCPhysReg XPLINK64ArgFPRs[XPLINK64NumArgFPRs];
+
+ const unsigned XPLINK64NumArgVRs = 8;
+ extern const MCPhysReg XPLINK64ArgVRs[XPLINK64NumArgVRs];
} // end namespace SystemZ
class SystemZCCState : public CCState {
@@ -124,7 +127,9 @@ inline bool CC_SystemZ_I128Indirect(unsigned &ValNo, MVT &ValVT,
else
llvm_unreachable("Unknown Calling Convention!");
- unsigned Offset = Reg ? 0 : State.AllocateStack(8, Align(8));
+ unsigned Offset = Reg && !Subtarget.isTargetXPLINK64()
+ ? 0
+ : State.AllocateStack(8, Align(8));
// Use that same location for all the pending parts.
for (auto &It : PendingMembers) {
@@ -167,12 +172,6 @@ inline bool CC_XPLINK64_Allocate128BitVararg(unsigned &ValNo, MVT &ValVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State) {
- if (LocVT.getSizeInBits() < 128)
- return false;
-
- if (static_cast<SystemZCCState *>(&State)->IsFixed(ValNo))
- return false;
-
// For any C or C++ program, this should always be
// false, since it is illegal to have a function
// where the first argument is variadic. Therefore
@@ -185,21 +184,59 @@ inline bool CC_XPLINK64_Allocate128BitVararg(unsigned &ValNo, MVT &ValVT,
bool AllocGPR3 = State.AllocateReg(SystemZ::R3D);
// If GPR2 and GPR3 are available, then we may pass vararg in R2Q.
- if (AllocGPR2 && AllocGPR3) {
- State.addLoc(
- CCValAssign::getReg(ValNo, ValVT, SystemZ::R2Q, LocVT, LocInfo));
+ // If only GPR3 is available, we need to set custom handling to copy
+ // hi bits into GPR3.
+ // Either way, we allocate on the stack.
+ if (AllocGPR3) {
+ // For f128 and vector var arg case, set the bitcast flag to bitcast to
+ // i128.
+ LocVT = MVT::i128;
+ LocInfo = CCValAssign::BCvt;
+ auto Offset = State.AllocateStack(16, Align(8));
+ if (AllocGPR2)
+ State.addLoc(
+ CCValAssign::getReg(ValNo, ValVT, SystemZ::R2Q, LocVT, LocInfo));
+ else
+ State.addLoc(
+ CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
return true;
}
- // If only GPR3 is available, we allocate on stack but need to
- // set custom handling to copy hi bits into GPR3.
- if (!AllocGPR2 && AllocGPR3) {
- auto Offset = State.AllocateStack(16, Align(8));
- State.addLoc(
- CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return true;
+ return false;
+}
+
+inline bool CC_XPLINK64_Shadow_Stack(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ ArrayRef<MCPhysReg> RegList;
+
+ switch (LocVT.SimpleTy) {
+ case MVT::i64:
+ RegList = SystemZ::XPLINK64ArgGPRs;
+ break;
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v4f32:
+ case MVT::v2f64:
+ RegList = SystemZ::XPLINK64ArgVRs;
+ break;
+ case MVT::f32:
+ case MVT::f64:
+ case MVT::f128:
+ RegList = SystemZ::XPLINK64ArgFPRs;
+ break;
+ default:
+ return false;
}
+ unsigned UnallocatedRegisterIndex = State.getFirstUnallocated(RegList);
+ // Whenever a register is still available for this argument, also reserve
+ // its shadow slot on the stack.
+ if (UnallocatedRegisterIndex < RegList.size())
+ State.AllocateStack(LocVT.getSizeInBits() / 8, Align(8));
+
return false;
}
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index 45e22b07be30..373023effb4a 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -162,12 +162,14 @@ def CSR_SystemZ_NoRegs : CalleeSavedRegs<(add)>;
//===----------------------------------------------------------------------===//
// z/OS XPLINK64 callee-saved registers
//===----------------------------------------------------------------------===//
-def CSR_SystemZ_XPLINK64 : CalleeSavedRegs<(add (sequence "R%dD", 8, 15),
- (sequence "F%dD", 8, 15))>;
+// %R7D is volatile by the spec, but it must be saved in the prologue by
+// any non-leaf function and restored in the epilogue for use by the
+// return instruction, so it functions exactly like a callee-saved register.
+def CSR_SystemZ_XPLINK64 : CalleeSavedRegs<(add (sequence "R%dD", 7, 15),
+ (sequence "F%dD", 15, 8))>;
-def CSR_SystemZ_XPLINK64_Vector : CalleeSavedRegs<(add (sequence "R%dD", 8, 15),
- (sequence "F%dD", 15, 8),
- (sequence "V%d", 23, 16))>;
+def CSR_SystemZ_XPLINK64_Vector : CalleeSavedRegs<(add CSR_SystemZ_XPLINK64,
+ (sequence "V%d", 23, 16))>;
//===----------------------------------------------------------------------===//
// z/OS XPLINK64 return value calling convention
@@ -222,6 +224,17 @@ def CC_SystemZ_XPLINK64 : CallingConv<[
// XPLINK64 ABI compliant code widens integral types smaller than i64
// to i64 before placing the parameters either on the stack or in registers.
CCIfType<[i32], CCIfExtend<CCPromoteToType<i64>>>,
+ // Promote f32 to f64 and bitcast f64 to i64 if it needs to be passed in GPRs.
+ CCIfType<[f32], CCIfNotFixed<CCPromoteToType<f64>>>,
+ CCIfType<[f64], CCIfNotFixed<CCBitConvertToType<i64>>>,
+ // A long double can only be passed in GPR2 and GPR3 (if available),
+ // hence R2Q.
+ CCIfType<[f128], CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>,
+ // Non-fixed vector arguments are treated the same way as long doubles.
+ CCIfSubtarget<"hasVector()",
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>>,
// A SwiftSelf is passed in callee-saved R10.
CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[R10D]>>>,
@@ -236,7 +249,7 @@ def CC_SystemZ_XPLINK64 : CallingConv<[
// The first 3 integer arguments are passed in registers R1D-R3D.
// The rest will be passed in the user area. The address offset of the user
// area can be found in register R4D.
- CCIfType<[i32], CCAssignToReg<[R1L, R2L, R3L]>>,
+ CCIfType<[i64], CCCustom<"CC_XPLINK64_Shadow_Stack">>,
CCIfType<[i64], CCAssignToReg<[R1D, R2D, R3D]>>,
// The first 8 named vector arguments are passed in V24-V31. Sub-128 vectors
@@ -247,34 +260,24 @@ def CC_SystemZ_XPLINK64 : CallingConv<[
CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>,
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Stack">>>>,
+ CCIfSubtarget<"hasVector()",
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCIfFixed<CCAssignToReg<[V24, V25, V26, V27,
V28, V29, V30, V31]>>>>,
// The first 4 named float and double arguments are passed in registers FPR0-FPR6.
// The rest will be passed in the user area.
CCIfType<[f32, f64], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
+ CCIfType<[f32, f64], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Stack">>>,
CCIfType<[f32], CCIfFixed<CCAssignToReg<[F0S, F2S, F4S, F6S]>>>,
CCIfType<[f64], CCIfFixed<CCAssignToReg<[F0D, F2D, F4D, F6D]>>>,
// The first 2 long double arguments are passed in register FPR0/FPR2
// and FPR4/FPR6. The rest will be passed in the user area.
CCIfType<[f128], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
+ CCIfType<[f128], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Stack">>>,
CCIfType<[f128], CCIfFixed<CCAssignToReg<[F0Q, F4Q]>>>,
- // Non fixed floats are passed in GPRs
- // Promote f32 to f64, if it needs to be passed in GPRs.
- CCIfType<[f32], CCIfNotFixed<CCPromoteToType<f64>>>,
- // Assign f64 varargs to their proper GPRs.
- CCIfType<[f64], CCIfNotFixed<CCAssignToReg<[R1D, R2D, R3D]>>>,
- // long double, can only be passed in GPR2 and GPR3, if available,
- // hence R2Q
- CCIfType<[f128], CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>,
-
- // Non fixed vector arguments are treated in the same way as long
- // doubles.
- CCIfSubtarget<"hasVector()",
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>>,
-
// Other arguments are passed in 8-byte-aligned 8-byte stack slots.
CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
// Other f128 arguments are passed in 8-byte-aligned 16-byte stack slots.
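The CC_SystemZ_XPLINK64 changes route non-fixed f32/f64 varargs through GPRs by promoting to f64 and reinterpreting the bits as i64 (CCBitConvertToType). Below is a tiny host-side illustration of that bit reinterpretation, not the SelectionDAG lowering itself:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Reinterpret a double's bit pattern as a 64-bit integer, the same idea as
// CCBitConvertToType<i64> applied to an f64 vararg before it lands in a GPR.
static uint64_t bitsOfDouble(double D) {
  uint64_t Bits;
  static_assert(sizeof(Bits) == sizeof(D), "f64 and i64 must match in size");
  std::memcpy(&Bits, &D, sizeof(Bits)); // bit-preserving, no value conversion
  return Bits;
}

int main() {
  double Vararg = 1.0f; // an f32 vararg, promoted to f64 first
  std::printf("f64 bits carried in a GPR: 0x%016llx\n",
              (unsigned long long)bitsOfDouble(Vararg));
}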
diff --git a/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp b/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
index 19b703bbb226..ac94570e568f 100644
--- a/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -571,10 +571,9 @@ bool SystemZElimCompare::optimizeCompareZero(
// Also do a forward search to handle cases where an instruction after the
// compare can be converted, like
// LTEBRCompare %f0s, %f0s; %f2s = LER %f0s => LTEBRCompare %f2s, %f0s
- for (MachineBasicBlock::iterator MBBI =
- std::next(MachineBasicBlock::iterator(&Compare)), MBBE = MBB.end();
- MBBI != MBBE;) {
- MachineInstr &MI = *MBBI++;
+ auto MIRange = llvm::make_range(
+ std::next(MachineBasicBlock::iterator(&Compare)), MBB.end());
+ for (MachineInstr &MI : llvm::make_early_inc_range(MIRange)) {
if (preservesValueOf(MI, SrcReg)) {
// Try to eliminate Compare by reusing a CC result from MI.
if (convertToLoadAndTest(MI, Compare, CCUsers)) {
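llvm::make_early_inc_range advances its underlying iterator before the loop body runs, which is what lets the pass above erase or rewrite the instruction it is currently visiting. A small self-contained sketch of the same property over a std::set (it assumes an LLVM tree so llvm/ADT/STLExtras.h is on the include path):

#include "llvm/ADT/STLExtras.h"
#include <iostream>
#include <set>

int main() {
  std::set<int> Values = {1, 2, 3, 4, 5};

  // The iterator has already moved past the current element when the body
  // runs, so erasing that element does not break the traversal - the same
  // property SystemZElimCompare relies on when it deletes or converts the
  // instruction it is looking at.
  for (int V : llvm::make_early_inc_range(Values))
    if (V % 2 == 0)
      Values.erase(V); // erases only the node that was just visited

  for (int V : Values)
    std::cout << V << ' '; // prints: 1 3 5
  std::cout << '\n';
}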
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index d2f6ff96158d..d11d118fb8ee 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -24,7 +24,7 @@ using namespace llvm;
namespace {
// The ABI-defined register save slots, relative to the CFA (i.e.
// incoming stack pointer + SystemZMC::ELFCallFrameSize).
-static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = {
+static const TargetFrameLowering::SpillSlot ELFSpillOffsetTable[] = {
{ SystemZ::R2D, 0x10 },
{ SystemZ::R3D, 0x18 },
{ SystemZ::R4D, 0x20 },
@@ -44,29 +44,55 @@ static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = {
{ SystemZ::F4D, 0x90 },
{ SystemZ::F6D, 0x98 }
};
+
+static const TargetFrameLowering::SpillSlot XPLINKSpillOffsetTable[] = {
+ {SystemZ::R4D, 0x00}, {SystemZ::R5D, 0x08}, {SystemZ::R6D, 0x10},
+ {SystemZ::R7D, 0x18}, {SystemZ::R8D, 0x20}, {SystemZ::R9D, 0x28},
+ {SystemZ::R10D, 0x30}, {SystemZ::R11D, 0x38}, {SystemZ::R12D, 0x40},
+ {SystemZ::R13D, 0x48}, {SystemZ::R14D, 0x50}, {SystemZ::R15D, 0x58}};
} // end anonymous namespace
-SystemZFrameLowering::SystemZFrameLowering()
- : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(8),
- 0, Align(8), false /* StackRealignable */),
- RegSpillOffsets(0) {
- // Due to the SystemZ ABI, the DWARF CFA (Canonical Frame Address) is not
- // equal to the incoming stack pointer, but to incoming stack pointer plus
- // 160. Instead of using a Local Area Offset, the Register save area will
- // be occupied by fixed frame objects, and all offsets are actually
- // relative to CFA.
+SystemZFrameLowering::SystemZFrameLowering(StackDirection D, Align StackAl,
+ int LAO, Align TransAl,
+ bool StackReal)
+ : TargetFrameLowering(D, StackAl, LAO, TransAl, StackReal) {}
- // Create a mapping from register number to save slot offset.
- // These offsets are relative to the start of the register save area.
- RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS);
- for (unsigned I = 0, E = array_lengthof(SpillOffsetTable); I != E; ++I)
- RegSpillOffsets[SpillOffsetTable[I].Reg] = SpillOffsetTable[I].Offset;
+std::unique_ptr<SystemZFrameLowering>
+SystemZFrameLowering::create(const SystemZSubtarget &STI) {
+ if (STI.isTargetXPLINK64())
+ return std::make_unique<SystemZXPLINKFrameLowering>();
+ return std::make_unique<SystemZELFFrameLowering>();
}
-bool SystemZFrameLowering::
-assignCalleeSavedSpillSlots(MachineFunction &MF,
- const TargetRegisterInfo *TRI,
- std::vector<CalleeSavedInfo> &CSI) const {
+MachineBasicBlock::iterator SystemZFrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const {
+ switch (MI->getOpcode()) {
+ case SystemZ::ADJCALLSTACKDOWN:
+ case SystemZ::ADJCALLSTACKUP:
+ assert(hasReservedCallFrame(MF) &&
+ "ADJSTACKDOWN and ADJSTACKUP should be no-ops");
+ return MBB.erase(MI);
+ break;
+
+ default:
+ llvm_unreachable("Unexpected call frame instruction");
+ }
+}
+
+bool SystemZFrameLowering::hasReservedCallFrame(
+ const MachineFunction &MF) const {
+ // The ELF ABI requires us to allocate 160 bytes of stack space for the
+ // callee, with any outgoing stack arguments being placed above that. It
+ // seems better to make that area a permanent feature of the frame even if
+ // we're using a frame pointer. Similarly, 64-bit XPLINK requires 96 bytes
+ // of stack space for the register save area.
+ return true;
+}
+
+bool SystemZELFFrameLowering::assignCalleeSavedSpillSlots(
+ MachineFunction &MF, const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const {
SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
MachineFrameInfo &MFFrame = MF.getFrameInfo();
bool IsVarArg = MF.getFunction().isVarArg();
@@ -130,9 +156,9 @@ assignCalleeSavedSpillSlots(MachineFunction &MF,
return true;
}
-void SystemZFrameLowering::determineCalleeSaves(MachineFunction &MF,
- BitVector &SavedRegs,
- RegScavenger *RS) const {
+void SystemZELFFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
MachineFrameInfo &MFFrame = MF.getFrameInfo();
@@ -179,6 +205,24 @@ void SystemZFrameLowering::determineCalleeSaves(MachineFunction &MF,
}
}
+SystemZELFFrameLowering::SystemZELFFrameLowering()
+ : SystemZFrameLowering(TargetFrameLowering::StackGrowsDown, Align(8), 0,
+ Align(8), /* StackRealignable */ false),
+ RegSpillOffsets(0) {
+
+ // Due to the SystemZ ABI, the DWARF CFA (Canonical Frame Address) is not
+ // equal to the incoming stack pointer, but to incoming stack pointer plus
+ // 160. Instead of using a Local Area Offset, the Register save area will
+ // be occupied by fixed frame objects, and all offsets are actually
+ // relative to CFA.
+
+ // Create a mapping from register number to save slot offset.
+ // These offsets are relative to the start of the register save area.
+ RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS);
+ for (unsigned I = 0, E = array_lengthof(ELFSpillOffsetTable); I != E; ++I)
+ RegSpillOffsets[ELFSpillOffsetTable[I].Reg] = ELFSpillOffsetTable[I].Offset;
+}
+
// Add GPR64 to the save instruction being built by MIB, which is in basic
// block MBB. IsImplicit says whether this is an explicit operand to the
// instruction, or an implicit one that comes between the explicit start
@@ -196,7 +240,7 @@ static void addSavedGPR(MachineBasicBlock &MBB, MachineInstrBuilder &MIB,
}
}
-bool SystemZFrameLowering::spillCalleeSavedRegisters(
+bool SystemZELFFrameLowering::spillCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty())
@@ -256,7 +300,7 @@ bool SystemZFrameLowering::spillCalleeSavedRegisters(
return true;
}
-bool SystemZFrameLowering::restoreCalleeSavedRegisters(
+bool SystemZELFFrameLowering::restoreCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty())
@@ -312,9 +356,8 @@ bool SystemZFrameLowering::restoreCalleeSavedRegisters(
return true;
}
-void SystemZFrameLowering::
-processFunctionBeforeFrameFinalized(MachineFunction &MF,
- RegScavenger *RS) const {
+void SystemZELFFrameLowering::processFunctionBeforeFrameFinalized(
+ MachineFunction &MF, RegScavenger *RS) const {
MachineFrameInfo &MFFrame = MF.getFrameInfo();
SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
MachineRegisterInfo *MRI = &MF.getRegInfo();
@@ -410,8 +453,8 @@ static void buildDefCFAReg(MachineBasicBlock &MBB,
.addCFIIndex(CFIIndex);
}
-void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
+void SystemZELFFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
const SystemZSubtarget &STI = MF.getSubtarget<SystemZSubtarget>();
const SystemZTargetLowering &TLI = *STI.getTargetLowering();
@@ -530,8 +573,8 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
// Mark the FramePtr as live at the beginning of every block except
// the entry block. (We'll have marked R11 as live on entry when
// saving the GPRs.)
- for (auto I = std::next(MF.begin()), E = MF.end(); I != E; ++I)
- I->addLiveIn(SystemZ::R11D);
+ for (MachineBasicBlock &MBBJ : llvm::drop_begin(MF))
+ MBBJ.addLiveIn(SystemZ::R11D);
}
// Skip over the FPR/VR saves.
@@ -573,15 +616,15 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
}
}
-void SystemZFrameLowering::emitEpilogue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
+void SystemZELFFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
auto *ZII =
static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
MachineFrameInfo &MFFrame = MF.getFrameInfo();
- // See SystemZFrameLowering::emitPrologue
+ // See SystemZELFFrameLowering::emitPrologue
if (MF.getFunction().getCallingConv() == CallingConv::GHC)
return;
@@ -619,8 +662,8 @@ void SystemZFrameLowering::emitEpilogue(MachineFunction &MF,
}
}
-void SystemZFrameLowering::inlineStackProbe(MachineFunction &MF,
- MachineBasicBlock &PrologMBB) const {
+void SystemZELFFrameLowering::inlineStackProbe(
+ MachineFunction &MF, MachineBasicBlock &PrologMBB) const {
auto *ZII =
static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
const SystemZSubtarget &STI = MF.getSubtarget<SystemZSubtarget>();
@@ -719,24 +762,14 @@ void SystemZFrameLowering::inlineStackProbe(MachineFunction &MF,
}
}
-bool SystemZFrameLowering::hasFP(const MachineFunction &MF) const {
+bool SystemZELFFrameLowering::hasFP(const MachineFunction &MF) const {
return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
MF.getFrameInfo().hasVarSizedObjects() ||
MF.getInfo<SystemZMachineFunctionInfo>()->getManipulatesSP());
}
-bool
-SystemZFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
- // The ABI requires us to allocate 160 bytes of stack space for the callee,
- // with any outgoing stack arguments being placed above that. It seems
- // better to make that area a permanent feature of the frame even if
- // we're using a frame pointer.
- return true;
-}
-
-StackOffset
-SystemZFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const {
+StackOffset SystemZELFFrameLowering::getFrameIndexReference(
+ const MachineFunction &MF, int FI, Register &FrameReg) const {
// Our incoming SP is actually SystemZMC::ELFCallFrameSize below the CFA, so
// add that difference here.
StackOffset Offset =
@@ -744,25 +777,8 @@ SystemZFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
return Offset + StackOffset::getFixed(SystemZMC::ELFCallFrameSize);
}
-MachineBasicBlock::iterator SystemZFrameLowering::
-eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const {
- switch (MI->getOpcode()) {
- case SystemZ::ADJCALLSTACKDOWN:
- case SystemZ::ADJCALLSTACKUP:
- assert(hasReservedCallFrame(MF) &&
- "ADJSTACKDOWN and ADJSTACKUP should be no-ops");
- return MBB.erase(MI);
- break;
-
- default:
- llvm_unreachable("Unexpected call frame instruction");
- }
-}
-
-unsigned SystemZFrameLowering::getRegSpillOffset(MachineFunction &MF,
- Register Reg) const {
+unsigned SystemZELFFrameLowering::getRegSpillOffset(MachineFunction &MF,
+ Register Reg) const {
bool IsVarArg = MF.getFunction().isVarArg();
bool BackChain = MF.getFunction().hasFnAttribute("backchain");
bool SoftFloat = MF.getSubtarget<SystemZSubtarget>().hasSoftFloat();
@@ -778,8 +794,8 @@ unsigned SystemZFrameLowering::getRegSpillOffset(MachineFunction &MF,
return Offset;
}
-int SystemZFrameLowering::
-getOrCreateFramePointerSaveIndex(MachineFunction &MF) const {
+int SystemZELFFrameLowering::getOrCreateFramePointerSaveIndex(
+ MachineFunction &MF) const {
SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
int FI = ZFI->getFramePointerSaveIndex();
if (!FI) {
@@ -791,7 +807,7 @@ getOrCreateFramePointerSaveIndex(MachineFunction &MF) const {
return FI;
}
-bool SystemZFrameLowering::usePackedStack(MachineFunction &MF) const {
+bool SystemZELFFrameLowering::usePackedStack(MachineFunction &MF) const {
bool HasPackedStackAttr = MF.getFunction().hasFnAttribute("packed-stack");
bool BackChain = MF.getFunction().hasFnAttribute("backchain");
bool SoftFloat = MF.getSubtarget<SystemZSubtarget>().hasSoftFloat();
@@ -800,3 +816,186 @@ bool SystemZFrameLowering::usePackedStack(MachineFunction &MF) const {
bool CallConv = MF.getFunction().getCallingConv() != CallingConv::GHC;
return HasPackedStackAttr && CallConv;
}
+
+SystemZXPLINKFrameLowering::SystemZXPLINKFrameLowering()
+ : SystemZFrameLowering(TargetFrameLowering::StackGrowsUp, Align(32), 128,
+ Align(32), /* StackRealignable */ false),
+ RegSpillOffsets(-1) {
+
+ // Create a mapping from register number to save slot offset.
+ // These offsets are relative to the start of the local area.
+ RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS);
+ for (unsigned I = 0, E = array_lengthof(XPLINKSpillOffsetTable); I != E; ++I)
+ RegSpillOffsets[XPLINKSpillOffsetTable[I].Reg] =
+ XPLINKSpillOffsetTable[I].Offset;
+}
+
+bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots(
+ MachineFunction &MF, const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const {
+ MachineFrameInfo &MFFrame = MF.getFrameInfo();
+ SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>();
+ const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+ auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
+
+ // Scan the call-saved GPRs and find the bounds of the register spill area.
+ unsigned LowGPR = 0;
+ int LowOffset = INT32_MAX;
+ unsigned HighGPR = LowGPR;
+ int HighOffset = -1;
+
+ unsigned RegSP = Regs.getStackPointerRegister();
+ auto &GRRegClass = SystemZ::GR64BitRegClass;
+ const unsigned RegSize = 8;
+
+ auto ProcessCSI = [&](std::vector<CalleeSavedInfo> &CSIList) {
+ for (auto &CS : CSIList) {
+ unsigned Reg = CS.getReg();
+ int Offset = RegSpillOffsets[Reg];
+ if (Offset >= 0) {
+ if (GRRegClass.contains(Reg)) {
+ if (LowOffset > Offset) {
+ LowOffset = Offset;
+ LowGPR = Reg;
+ }
+
+ if (Offset > HighOffset) {
+ HighOffset = Offset;
+ HighGPR = Reg;
+ }
+ }
+ int FrameIdx = MFFrame.CreateFixedSpillStackObject(RegSize, Offset);
+ CS.setFrameIdx(FrameIdx);
+ } else
+ CS.setFrameIdx(INT32_MAX);
+ }
+ };
+
+ std::vector<CalleeSavedInfo> Spills;
+
+ // For non-leaf functions:
+ // - the address of callee (entry point) register R6 must be saved
+ Spills.push_back(CalleeSavedInfo(Regs.getAddressOfCalleeRegister()));
+
+ // If the function needs a frame pointer, or if the backchain pointer should
+ // be stored, then save the stack pointer register R4.
+ if (hasFP(MF) || MF.getFunction().hasFnAttribute("backchain"))
+ Spills.push_back(CalleeSavedInfo(RegSP));
+
+ // Save the range of call-saved registers, for use by the
+ // epilogue inserter.
+ ProcessCSI(CSI);
+ MFI->setRestoreGPRRegs(LowGPR, HighGPR, LowOffset);
+
+ // Save the range of registers to be spilled, for use by the prologue inserter.
+ ProcessCSI(Spills);
+ MFI->setSpillGPRRegs(LowGPR, HighGPR, LowOffset);
+
+ // Create spill slots for the remaining registers.
+ for (auto &CS : CSI) {
+ if (CS.getFrameIdx() != INT32_MAX)
+ continue;
+ unsigned Reg = CS.getReg();
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ Align Alignment = TRI->getSpillAlign(*RC);
+ unsigned Size = TRI->getSpillSize(*RC);
+ Alignment = std::min(Alignment, getStackAlign());
+ int FrameIdx = MFFrame.CreateStackObject(Size, Alignment, true);
+ CS.setFrameIdx(FrameIdx);
+ }
+
+ return true;
+}
+
+void SystemZXPLINKFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+ bool HasFP = hasFP(MF);
+ const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+ auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
+
+ // If the function requires a frame pointer, record that the hard
+ // frame pointer will be clobbered.
+ if (HasFP)
+ SavedRegs.set(Regs.getFramePointerRegister());
+
+ // If the function is not an XPLeaf function, we need to save the
+ // return address register. We also always use that register for
+ // the return instruction, so it needs to be restored in the
+ // epilogue even though that register is considered to be volatile.
+ // TODO: Implement leaf detection.
+ SavedRegs.set(Regs.getReturnFunctionAddressRegister());
+}
+
+bool SystemZXPLINKFrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+ if (CSI.empty())
+ return true;
+
+ MachineFunction &MF = *MBB.getParent();
+ SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+ const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
+ SystemZ::GPRRegs SpillGPRs = ZFI->getSpillGPRRegs();
+ DebugLoc DL;
+
+ // Save GPRs
+ if (SpillGPRs.LowGPR) {
+ assert(SpillGPRs.LowGPR != SpillGPRs.HighGPR &&
+ "Should be saving multiple registers");
+
+ // Build an STM/STMG instruction.
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(SystemZ::STMG));
+
+ // Add the explicit register operands.
+ addSavedGPR(MBB, MIB, SpillGPRs.LowGPR, false);
+ addSavedGPR(MBB, MIB, SpillGPRs.HighGPR, false);
+
+ // Add the address r4
+ MIB.addReg(Regs.getStackPointerRegister());
+
+ // Add the partial offset.
+ // We cannot add the actual offset yet because the stack is not finalized.
+ MIB.addImm(SpillGPRs.GPROffset);
+
+ // Make sure all call-saved GPRs are included as operands and are
+ // marked as live on entry.
+ auto &GRRegClass = SystemZ::GR64BitRegClass;
+ for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+ unsigned Reg = CSI[I].getReg();
+ if (GRRegClass.contains(Reg))
+ addSavedGPR(MBB, MIB, Reg, true);
+ }
+ }
+
+ // Spill FPRs to the stack in the normal TargetInstrInfo way
+ for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+ unsigned Reg = CSI[I].getReg();
+ if (SystemZ::FP64BitRegClass.contains(Reg)) {
+ MBB.addLiveIn(Reg);
+ TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(),
+ &SystemZ::FP64BitRegClass, TRI);
+ }
+ if (SystemZ::VR128BitRegClass.contains(Reg)) {
+ MBB.addLiveIn(Reg);
+ TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(),
+ &SystemZ::VR128BitRegClass, TRI);
+ }
+ }
+
+ return true;
+}
+
+void SystemZXPLINKFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {}
+
+void SystemZXPLINKFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {}
+
+bool SystemZXPLINKFrameLowering::hasFP(const MachineFunction &MF) const {
+ return false;
+}
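SystemZFrameLowering now only carries the ABI-independent pieces, and the static create() hook picks the ELF or XPLINK implementation from the subtarget. Below is a minimal sketch of that factory shape, with illustrative class names rather than the real LLVM types:

#include <iostream>
#include <memory>

// Illustrative stand-in for the subtarget query; not the real LLVM class.
struct FakeSubtarget {
  bool IsXPLINK64;
  bool isTargetXPLINK64() const { return IsXPLINK64; }
};

struct FrameLoweringBase {
  virtual ~FrameLoweringBase() = default;
  virtual const char *name() const = 0;

  // Factory mirroring SystemZFrameLowering::create(): callers never need to
  // know which ABI-specific subclass they are holding.
  static std::unique_ptr<FrameLoweringBase> create(const FakeSubtarget &STI);
};

struct ELFFrameLowering : FrameLoweringBase {
  const char *name() const override { return "ELF"; }
};

struct XPLINKFrameLowering : FrameLoweringBase {
  const char *name() const override { return "XPLINK64"; }
};

std::unique_ptr<FrameLoweringBase>
FrameLoweringBase::create(const FakeSubtarget &STI) {
  if (STI.isTargetXPLINK64())
    return std::make_unique<XPLINKFrameLowering>();
  return std::make_unique<ELFFrameLowering>();
}

int main() {
  for (bool XPLINK : {false, true}) {
    auto TFL = FrameLoweringBase::create(FakeSubtarget{XPLINK});
    std::cout << "selected frame lowering: " << TFL->name() << "\n";
  }
}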
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
index c8312b836e57..6fddb4f81c41 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -10,6 +10,8 @@
#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZFRAMELOWERING_H
#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "SystemZInstrBuilder.h"
+#include "SystemZMachineFunctionInfo.h"
#include "llvm/ADT/IndexedMap.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Support/TypeSize.h"
@@ -19,10 +21,26 @@ class SystemZTargetMachine;
class SystemZSubtarget;
class SystemZFrameLowering : public TargetFrameLowering {
+public:
+ SystemZFrameLowering(StackDirection D, Align StackAl, int LAO, Align TransAl,
+ bool StackReal);
+
+ static std::unique_ptr<SystemZFrameLowering>
+ create(const SystemZSubtarget &STI);
+
+ // Override TargetFrameLowering.
+ bool isFPCloseToIncomingSP() const override { return false; }
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
+};
+
+class SystemZELFFrameLowering : public SystemZFrameLowering {
IndexedMap<unsigned> RegSpillOffsets;
public:
- SystemZFrameLowering();
+ SystemZELFFrameLowering();
// Override TargetFrameLowering.
bool isFPCloseToIncomingSP() const override { return false; }
@@ -48,21 +66,14 @@ public:
void inlineStackProbe(MachineFunction &MF,
MachineBasicBlock &PrologMBB) const override;
bool hasFP(const MachineFunction &MF) const override;
- bool hasReservedCallFrame(const MachineFunction &MF) const override;
StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
Register &FrameReg) const override;
- MachineBasicBlock::iterator
- eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const override;
// Return the byte offset from the incoming stack pointer of Reg's
// ABI-defined save slot. Return 0 if no slot is defined for Reg. Adjust
// the offset in case MF has packed-stack.
unsigned getRegSpillOffset(MachineFunction &MF, Register Reg) const;
- // Get or create the frame index of where the old frame pointer is stored.
- int getOrCreateFramePointerSaveIndex(MachineFunction &MF) const;
-
bool usePackedStack(MachineFunction &MF) const;
// Return the offset of the backchain.
@@ -70,6 +81,35 @@ public:
// The back chain is stored topmost with packed-stack.
return usePackedStack(MF) ? SystemZMC::ELFCallFrameSize - 8 : 0;
}
+
+ // Get or create the frame index of where the old frame pointer is stored.
+ int getOrCreateFramePointerSaveIndex(MachineFunction &MF) const;
+};
+
+class SystemZXPLINKFrameLowering : public SystemZFrameLowering {
+ IndexedMap<unsigned> RegSpillOffsets;
+
+public:
+ SystemZXPLINKFrameLowering();
+
+ bool
+ assignCalleeSavedSpillSlots(MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const override;
+
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS) const override;
+
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ ArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ bool hasFP(const MachineFunction &MF) const override;
};
} // end namespace llvm
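Call sites that only run on one ABI ask the subtarget for the concrete subclass, as the ISelLowering changes below do with Subtarget.getFrameLowering<SystemZELFFrameLowering>(). The sketch below shows the templated-accessor pattern that call implies; the classes here are stand-ins, and the cast is assumed to be valid only on the matching ABI path:

#include <iostream>
#include <memory>

struct FrameLoweringBase { virtual ~FrameLoweringBase() = default; };
struct ELFFrameLowering : FrameLoweringBase { int BackchainOffset = 0; };
struct XPLINKFrameLowering : FrameLoweringBase {};

class FakeSubtarget {
  std::unique_ptr<FrameLoweringBase> FrameLowering;

public:
  explicit FakeSubtarget(std::unique_ptr<FrameLoweringBase> TFL)
      : FrameLowering(std::move(TFL)) {}

  // Callers that only execute for one ABI request the concrete subclass; the
  // cast is unchecked, so it is only meaningful on that ABI's code path.
  template <typename T> const T *getFrameLowering() const {
    return static_cast<const T *>(FrameLowering.get());
  }
};

int main() {
  FakeSubtarget ELF(std::make_unique<ELFFrameLowering>());
  const auto *TFL = ELF.getFrameLowering<ELFFrameLowering>();
  std::cout << "backchain offset: " << TFL->BackchainOffset << "\n";
}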
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index d70d48638b14..71432218068e 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -82,6 +82,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
: TargetLowering(TM), Subtarget(STI) {
MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize(0));
+ auto *Regs = STI.getSpecialRegisters();
+
// Set up the register classes.
if (Subtarget.hasHighWord())
addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass);
@@ -115,7 +117,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
computeRegisterProperties(Subtarget.getRegisterInfo());
// Set up special registers.
- setStackPointerRegisterToSaveRestore(SystemZ::R15D);
+ setStackPointerRegisterToSaveRestore(Regs->getStackPointerRegister());
// TODO: It may be better to default to latency-oriented scheduling, however
// LLVM's current latency-oriented scheduler can't handle physreg definitions
@@ -293,6 +295,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
+ // Handle bitcast from fp128 to i128.
+ setOperationAction(ISD::BITCAST, MVT::i128, Custom);
+
// We have native instructions for i8, i16 and i32 extensions, but not i1.
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
for (MVT VT : MVT::integer_valuetypes()) {
@@ -1353,14 +1358,21 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, const SDLoc &DL,
return DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Value);
case CCValAssign::AExt:
return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value);
- case CCValAssign::BCvt:
- // If this is a short vector argument to be stored to the stack,
+ case CCValAssign::BCvt: {
+ assert(VA.getLocVT() == MVT::i64 || VA.getLocVT() == MVT::i128);
+ assert(VA.getValVT().isVector() || VA.getValVT() == MVT::f64 ||
+ VA.getValVT() == MVT::f128);
+ MVT BitCastToType = VA.getValVT().isVector() && VA.getLocVT() == MVT::i64
+ ? MVT::v2i64
+ : VA.getLocVT();
+ Value = DAG.getNode(ISD::BITCAST, DL, BitCastToType, Value);
+ // For ELF, this is a short vector argument to be stored to the stack,
// bitcast to v2i64 and then extract first element.
- assert(VA.getLocVT() == MVT::i64);
- assert(VA.getValVT().isVector());
- Value = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Value);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value,
- DAG.getConstant(0, DL, MVT::i32));
+ if (BitCastToType == MVT::v2i64)
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value,
+ DAG.getConstant(0, DL, MVT::i32));
+ return Value;
+ }
case CCValAssign::Full:
return Value;
default:
@@ -1426,8 +1438,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
MachineRegisterInfo &MRI = MF.getRegInfo();
SystemZMachineFunctionInfo *FuncInfo =
MF.getInfo<SystemZMachineFunctionInfo>();
- auto *TFL =
- static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering());
+ auto *TFL = Subtarget.getFrameLowering<SystemZELFFrameLowering>();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
// Detect unsupported vector argument types.
@@ -1468,6 +1479,10 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
NumFixedFPRs += 1;
RC = &SystemZ::FP64BitRegClass;
break;
+ case MVT::f128:
+ NumFixedFPRs += 2;
+ RC = &SystemZ::FP128BitRegClass;
+ break;
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
@@ -1521,7 +1536,8 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, ArgValue));
}
- if (IsVarArg) {
+ // FIXME: Add support for lowering varargs for XPLINK64 in a later patch.
+ if (IsVarArg && Subtarget.isTargetELF()) {
// Save the number of non-varargs registers for later use by va_start, etc.
FuncInfo->setVarArgsFirstGPR(NumFixedGPRs);
FuncInfo->setVarArgsFirstFPR(NumFixedFPRs);
@@ -1560,6 +1576,8 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
}
}
+ // FIXME: For XPLINK64, add support for handling the incoming "ADA" special
+ // register (R5).
return Chain;
}
@@ -1600,6 +1618,11 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
MachineFunction &MF = DAG.getMachineFunction();
EVT PtrVT = getPointerTy(MF.getDataLayout());
LLVMContext &Ctx = *DAG.getContext();
+ SystemZCallingConventionRegisters *Regs = Subtarget.getSpecialRegisters();
+
+ // FIXME: z/OS tail call support to be added in a later patch.
+ if (Subtarget.isTargetXPLINK64())
+ IsTailCall = false;
// Detect unsupported vector argument and return types.
if (Subtarget.hasVector()) {
@@ -1620,6 +1643,13 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = ArgCCInfo.getNextStackOffset();
+ if (Subtarget.isTargetXPLINK64())
+ // Although the XPLINK specifications for AMODE64 state that the minimum
+ // size of the param area is 32 bytes and that no other rounding is
+ // specified, we round this area up in 64-byte increments to be compatible
+ // with existing compilers.
+ NumBytes = std::max(64U, (unsigned)alignTo(NumBytes, 64));
+
// Mark the start of the call.
if (!IsTailCall)
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
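A quick check of the new parameter-area rounding: the outgoing byte count is clamped to at least 64 and rounded up to a multiple of 64. The helper below re-implements llvm::alignTo locally so the sketch runs standalone:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Local stand-in for llvm::alignTo: round Value up to the next multiple of
// Align. Enough for this illustration; the real helper lives in MathExtras.h.
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  for (unsigned NumBytes : {0u, 24u, 64u, 65u, 200u}) {
    unsigned Rounded = std::max(64u, (unsigned)alignTo(NumBytes, 64));
    std::printf("outgoing bytes %3u -> param area %3u\n", NumBytes, Rounded);
  }
  // Prints 0->64, 24->64, 64->64, 65->128, 200->256.
}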
@@ -1670,17 +1700,24 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
} else
ArgValue = convertValVTToLocVT(DAG, DL, VA, ArgValue);
- if (VA.isRegLoc())
+ if (VA.isRegLoc()) {
+ // In XPLINK64, for the 128-bit vararg case, ArgValue is bitcast to an
+ // MVT::i128 type. We decompose the 128-bit type into a pair of its high
+ // and low values.
+ if (VA.getLocVT() == MVT::i128)
+ ArgValue = lowerI128ToGR128(DAG, ArgValue);
// Queue up the argument copies and emit them at the end.
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
- else {
+ } else {
assert(VA.isMemLoc() && "Argument not register or memory");
// Work out the address of the stack slot. Unpromoted ints and
// floats are passed as right-justified 8-byte values.
if (!StackPtr.getNode())
- StackPtr = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, PtrVT);
- unsigned Offset = SystemZMC::ELFCallFrameSize + VA.getLocMemOffset();
+ StackPtr = DAG.getCopyFromReg(Chain, DL,
+ Regs->getStackPointerRegister(), PtrVT);
+ unsigned Offset = Regs->getStackPointerBias() + Regs->getCallFrameSize() +
+ VA.getLocMemOffset();
if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
Offset += 4;
SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
@@ -1689,6 +1726,17 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Emit the store.
MemOpChains.push_back(
DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
+
+ // Although long doubles or vectors are passed through the stack when
+ // they are vararg (non-fixed arguments), if a long double or vector
+ // occupies the third and fourth slot of the argument list, GPR3 should
+ // still shadow the third slot of the argument list.
+ if (Subtarget.isTargetXPLINK64() && VA.needsCustom()) {
+ SDValue ShadowArgValue =
+ DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, ArgValue,
+ DAG.getIntPtrConstant(1, DL));
+ RegsToPass.push_back(std::make_pair(SystemZ::R3D, ShadowArgValue));
+ }
}
}
@@ -1700,6 +1748,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
// associated Target* opcodes. Force %r1 to be used for indirect
// tail calls.
SDValue Glue;
+ // FIXME: Add support for XPLINK using the ADA register.
if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT);
Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
@@ -2282,8 +2331,7 @@ static void adjustForSubtraction(SelectionDAG &DAG, const SDLoc &DL,
Comparison &C) {
if (C.CCMask == SystemZ::CCMASK_CMP_EQ ||
C.CCMask == SystemZ::CCMASK_CMP_NE) {
- for (auto I = C.Op0->use_begin(), E = C.Op0->use_end(); I != E; ++I) {
- SDNode *N = *I;
+ for (SDNode *N : C.Op0->uses()) {
if (N->getOpcode() == ISD::SUB &&
((N->getOperand(0) == C.Op0 && N->getOperand(1) == C.Op1) ||
(N->getOperand(0) == C.Op1 && N->getOperand(1) == C.Op0))) {
@@ -2306,8 +2354,7 @@ static void adjustForFNeg(Comparison &C) {
return;
auto *C1 = dyn_cast<ConstantFPSDNode>(C.Op1);
if (C1 && C1->isZero()) {
- for (auto I = C.Op0->use_begin(), E = C.Op0->use_end(); I != E; ++I) {
- SDNode *N = *I;
+ for (SDNode *N : C.Op0->uses()) {
if (N->getOpcode() == ISD::FNEG) {
C.Op0 = SDValue(N, 0);
C.CCMask = SystemZ::reverseCCMask(C.CCMask);
@@ -2333,8 +2380,7 @@ static void adjustForLTGFR(Comparison &C) {
if (C1 && C1->getZExtValue() == 32) {
SDValue ShlOp0 = C.Op0.getOperand(0);
// See whether X has any SIGN_EXTEND_INREG uses.
- for (auto I = ShlOp0->use_begin(), E = ShlOp0->use_end(); I != E; ++I) {
- SDNode *N = *I;
+ for (SDNode *N : ShlOp0->uses()) {
if (N->getOpcode() == ISD::SIGN_EXTEND_INREG &&
cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i32) {
C.Op0 = SDValue(N, 0);
@@ -3320,8 +3366,7 @@ SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
SDValue SystemZTargetLowering::lowerFRAMEADDR(SDValue Op,
SelectionDAG &DAG) const {
- auto *TFL =
- static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering());
+ auto *TFL = Subtarget.getFrameLowering<SystemZELFFrameLowering>();
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MFI.setFrameAddressIsTaken(true);
@@ -4139,17 +4184,21 @@ SystemZTargetLowering::getTargetMMOFlags(const Instruction &I) const {
SDValue SystemZTargetLowering::lowerSTACKSAVE(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
+ const SystemZSubtarget *Subtarget = &MF.getSubtarget<SystemZSubtarget>();
+ auto *Regs = Subtarget->getSpecialRegisters();
MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
if (MF.getFunction().getCallingConv() == CallingConv::GHC)
report_fatal_error("Variable-sized stack allocations are not supported "
"in GHC calling convention");
return DAG.getCopyFromReg(Op.getOperand(0), SDLoc(Op),
- SystemZ::R15D, Op.getValueType());
+ Regs->getStackPointerRegister(), Op.getValueType());
}
SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
+ const SystemZSubtarget *Subtarget = &MF.getSubtarget<SystemZSubtarget>();
+ auto *Regs = Subtarget->getSpecialRegisters();
MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain");
@@ -4163,12 +4212,13 @@ SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
SDLoc DL(Op);
if (StoreBackchain) {
- SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, MVT::i64);
+ SDValue OldSP = DAG.getCopyFromReg(
+ Chain, DL, Regs->getStackPointerRegister(), MVT::i64);
Backchain = DAG.getLoad(MVT::i64, DL, Chain, getBackchainAddress(OldSP, DAG),
MachinePointerInfo());
}
- Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R15D, NewSP);
+ Chain = DAG.getCopyToReg(Chain, DL, Regs->getStackPointerRegister(), NewSP);
if (StoreBackchain)
Chain = DAG.getStore(Chain, DL, Backchain, getBackchainAddress(NewSP, DAG),
@@ -5589,6 +5639,32 @@ SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
Results.push_back(Res.getValue(2));
break;
}
+ case ISD::BITCAST: {
+ SDValue Src = N->getOperand(0);
+ if (N->getValueType(0) == MVT::i128 && Src.getValueType() == MVT::f128 &&
+ !useSoftFloat()) {
+ SDLoc DL(N);
+ SDValue Lo, Hi;
+ if (getRepRegClassFor(MVT::f128) == &SystemZ::VR128BitRegClass) {
+ SDValue VecBC = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Src);
+ Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, VecBC,
+ DAG.getConstant(1, DL, MVT::i32));
+ Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, VecBC,
+ DAG.getConstant(0, DL, MVT::i32));
+ } else {
+ assert(getRepRegClassFor(MVT::f128) == &SystemZ::FP128BitRegClass &&
+ "Unrecognized register class for f128.");
+ SDValue LoFP = DAG.getTargetExtractSubreg(SystemZ::subreg_l64,
+ DL, MVT::f64, Src);
+ SDValue HiFP = DAG.getTargetExtractSubreg(SystemZ::subreg_h64,
+ DL, MVT::f64, Src);
+ Lo = DAG.getNode(ISD::BITCAST, DL, MVT::i64, LoFP);
+ Hi = DAG.getNode(ISD::BITCAST, DL, MVT::i64, HiFP);
+ }
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Lo, Hi));
+ }
+ break;
+ }
default:
llvm_unreachable("Unexpected node to lower");
}
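The new ISD::BITCAST path splits an f128 into two i64 halves and rebuilds an i128 with BUILD_PAIR, whose second operand supplies the most-significant bits. The sketch below recombines two halves the same way using unsigned __int128 (a GCC/Clang extension, used purely for illustration):

#include <cstdint>
#include <cstdio>

// Recombine two 64-bit halves the way ISD::BUILD_PAIR(Lo, Hi) produces an
// i128: Hi provides the most-significant 64 bits.
static unsigned __int128 buildPair(uint64_t Lo, uint64_t Hi) {
  return ((unsigned __int128)Hi << 64) | Lo;
}

int main() {
  uint64_t Hi = 0x3fff000000000000ULL; // high 64 bits of an IEEE binary128 1.0
  uint64_t Lo = 0x0000000000000000ULL; // low 64 bits
  unsigned __int128 V = buildPair(Lo, Hi);
  std::printf("i128 = 0x%016llx%016llx\n",
              (unsigned long long)(V >> 64), (unsigned long long)V);
}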
@@ -5634,15 +5710,10 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(SUBCARRY);
OPCODE(GET_CCMASK);
OPCODE(MVC);
- OPCODE(MVC_LOOP);
OPCODE(NC);
- OPCODE(NC_LOOP);
OPCODE(OC);
- OPCODE(OC_LOOP);
OPCODE(XC);
- OPCODE(XC_LOOP);
OPCODE(CLC);
- OPCODE(CLC_LOOP);
OPCODE(STPCPY);
OPCODE(STRCMP);
OPCODE(SEARCH_STRING);
@@ -7071,13 +7142,19 @@ SystemZTargetLowering::getStackProbeSize(MachineFunction &MF) const {
// Force base value Base into a register before MI. Return the register.
static Register forceReg(MachineInstr &MI, MachineOperand &Base,
const SystemZInstrInfo *TII) {
- if (Base.isReg())
- return Base.getReg();
-
MachineBasicBlock *MBB = MI.getParent();
MachineFunction &MF = *MBB->getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (Base.isReg()) {
+ // Copy Base into a new virtual register to help register coalescing in
+ // cases with multiple uses.
+ Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::COPY), Reg)
+ .add(Base);
+ return Reg;
+ }
+
Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LA), Reg)
.add(Base)
@@ -7103,8 +7180,8 @@ static bool checkCCKill(MachineInstr &MI, MachineBasicBlock *MBB) {
// If we hit the end of the block, check whether CC is live into a
// successor.
if (miI == MBB->end()) {
- for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI)
- if ((*SI)->isLiveIn(SystemZ::CC))
+ for (const MachineBasicBlock *Succ : MBB->successors())
+ if (Succ->isLiveIn(SystemZ::CC))
return false;
}
@@ -7796,26 +7873,67 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
uint64_t SrcDisp = MI.getOperand(3).getImm();
MachineOperand &LengthMO = MI.getOperand(4);
- uint64_t ImmLength = LengthMO.isImm() ? LengthMO.getImm() : 0;
- Register LenMinus1Reg =
- LengthMO.isReg() ? LengthMO.getReg() : SystemZ::NoRegister;
+ bool IsImmForm = LengthMO.isImm();
+ bool IsRegForm = !IsImmForm;
+
+ bool NeedsLoop = false;
+ uint64_t ImmLength = 0;
+ Register LenMinus1Reg = SystemZ::NoRegister;
+ if (IsImmForm) {
+ ImmLength = LengthMO.getImm();
+ ImmLength++; // Add back the '1' subtracted originally.
+ if (ImmLength == 0) {
+ MI.eraseFromParent();
+ return MBB;
+ }
+ if (Opcode == SystemZ::CLC) {
+ if (ImmLength > 3 * 256)
+ // A two-CLC sequence is a clear win over a loop, not least because
+ // it needs only one branch. A three-CLC sequence needs the same
+ // number of branches as a loop (i.e. 2), but is shorter. That
+ // brings us to lengths greater than 768 bytes. It seems relatively
+ // likely that a difference will be found within the first 768 bytes,
+ // so we just optimize for the smallest number of branch
+ // instructions, in order to avoid polluting the prediction buffer
+ // too much.
+ NeedsLoop = true;
+ } else if (ImmLength > 6 * 256)
+ // The heuristic we use is to prefer loops for anything that would
+ // require 7 or more MVCs. With these kinds of sizes there isn't much
+ // to choose between straight-line code and looping code, since the
+ // time will be dominated by the MVCs themselves.
+ NeedsLoop = true;
+ } else {
+ NeedsLoop = true;
+ LenMinus1Reg = LengthMO.getReg();
+ }
// When generating more than one CLC, all but the last will need to
// branch to the end when a difference is found.
- MachineBasicBlock *EndMBB = (ImmLength > 256 && Opcode == SystemZ::CLC
- ? SystemZ::splitBlockAfter(MI, MBB)
- : nullptr);
-
- // Check for the loop form, in which operand 5 is the trip count.
- if (MI.getNumExplicitOperands() > 5) {
- Register StartCountReg = MI.getOperand(5).getReg();
- bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
+ MachineBasicBlock *EndMBB =
+ (Opcode == SystemZ::CLC && (ImmLength > 256 || NeedsLoop)
+ ? SystemZ::splitBlockAfter(MI, MBB)
+ : nullptr);
+
+ if (NeedsLoop) {
+ Register StartCountReg =
+ MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
+ if (IsImmForm) {
+ TII->loadImmediate(*MBB, MI, StartCountReg, ImmLength / 256);
+ ImmLength &= 255;
+ } else {
+ BuildMI(*MBB, MI, DL, TII->get(SystemZ::SRLG), StartCountReg)
+ .addReg(LenMinus1Reg)
+ .addReg(0)
+ .addImm(8);
+ }
auto loadZeroAddress = [&]() -> MachineOperand {
Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(*MBB, MI, DL, TII->get(SystemZ::LGHI), Reg).addImm(0);
return MachineOperand::CreateReg(Reg, false);
};
+ bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
if (DestBase.isReg() && DestBase.getReg() == SystemZ::NoRegister)
DestBase = loadZeroAddress();
if (SrcBase.isReg() && SrcBase.getReg() == SystemZ::NoRegister)
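The immediate form above decides between straight-line expansion and a loop from the (length - 1) operand: more than 3*256 bytes for CLC, more than 6*256 bytes for the other block operations. A tiny sketch of that decision, including the add-back of the subtracted 1:

#include <cstdint>
#include <cstdio>

// Mirror of the NeedsLoop decision in emitMemMemWrapper for the immediate
// form. LengthMinus1 is the pseudo's operand (one less than the byte count).
static bool needsLoop(uint64_t LengthMinus1, bool IsCLC) {
  uint64_t Bytes = LengthMinus1 + 1;    // add back the subtracted 1
  uint64_t Threshold = IsCLC ? 3 * 256  // > 768 bytes: loop for CLC
                             : 6 * 256; // > 1536 bytes: loop for MVC/NC/OC/XC
  return Bytes > Threshold;
}

int main() {
  std::printf("CLC 768 bytes  -> %s\n", needsLoop(767, true) ? "loop" : "inline");
  std::printf("CLC 769 bytes  -> %s\n", needsLoop(768, true) ? "loop" : "inline");
  std::printf("MVC 1536 bytes -> %s\n", needsLoop(1535, false) ? "loop" : "inline");
  std::printf("MVC 1537 bytes -> %s\n", needsLoop(1536, false) ? "loop" : "inline");
}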
@@ -7842,12 +7960,12 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
Register ThisCountReg = MRI.createVirtualRegister(RC);
Register NextCountReg = MRI.createVirtualRegister(RC);
- if (LengthMO.isReg()) {
+ if (IsRegForm) {
AllDoneMBB = SystemZ::splitBlockBefore(MI, MBB);
StartMBB = SystemZ::emitBlockAfter(MBB);
LoopMBB = SystemZ::emitBlockAfter(StartMBB);
- NextMBB = LoopMBB;
- DoneMBB = SystemZ::emitBlockAfter(LoopMBB);
+ NextMBB = (EndMBB ? SystemZ::emitBlockAfter(LoopMBB) : LoopMBB);
+ DoneMBB = SystemZ::emitBlockAfter(NextMBB);
// MBB:
// # Jump to AllDoneMBB if LenMinus1Reg is -1, or fall thru to StartMBB.
@@ -7882,7 +8000,6 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
DestBase = MachineOperand::CreateReg(NextDestReg, false);
SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
- ImmLength &= 255;
if (EndMBB && !ImmLength)
// If the loop handled the whole CLC range, DoneMBB will be empty with
// CC live-through into EndMBB, so add it as live-in.
@@ -7953,7 +8070,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
MBB->addSuccessor(DoneMBB);
MBB = DoneMBB;
- if (LengthMO.isReg()) {
+ if (IsRegForm) {
// DoneMBB:
// # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run.
// # Use EXecute Relative Long for the remainder of the bytes. The target
@@ -7966,19 +8083,23 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
: MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemDestReg)
.addReg(StartDestReg).addMBB(StartMBB)
- .addReg(NextDestReg).addMBB(LoopMBB);
+ .addReg(NextDestReg).addMBB(NextMBB);
if (!HaveSingleBase)
BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg)
.addReg(StartSrcReg).addMBB(StartMBB)
- .addReg(NextSrcReg).addMBB(LoopMBB);
- MRI.constrainRegClass(LenMinus1Reg, &SystemZ::ADDR64BitRegClass);
- BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo))
- .addImm(Opcode)
- .addReg(LenMinus1Reg)
- .addReg(RemDestReg).addImm(DestDisp)
- .addReg(RemSrcReg).addImm(SrcDisp);
+ .addReg(NextSrcReg).addMBB(NextMBB);
+ MachineInstrBuilder EXRL_MIB =
+ BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo))
+ .addImm(Opcode)
+ .addReg(LenMinus1Reg)
+ .addReg(RemDestReg).addImm(DestDisp)
+ .addReg(RemSrcReg).addImm(SrcDisp);
MBB->addSuccessor(AllDoneMBB);
MBB = AllDoneMBB;
+ if (EndMBB) {
+ EXRL_MIB.addReg(SystemZ::CC, RegState::ImplicitDefine);
+ MBB->addLiveIn(SystemZ::CC);
+ }
}
}
@@ -8264,8 +8385,7 @@ MachineBasicBlock *SystemZTargetLowering::emitProbedAlloca(
SDValue SystemZTargetLowering::
getBackchainAddress(SDValue SP, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- auto *TFL =
- static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering());
+ auto *TFL = Subtarget.getFrameLowering<SystemZELFFrameLowering>();
SDLoc DL(SP);
return DAG.getNode(ISD::ADD, DL, MVT::i64, SP,
DAG.getIntPtrConstant(TFL->getBackchainOffset(MF), DL));
@@ -8497,21 +8617,18 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
case SystemZ::ATOMIC_CMP_SWAPW:
return emitAtomicCmpSwapW(MI, MBB);
- case SystemZ::MVCSequence:
- case SystemZ::MVCLoop:
+ case SystemZ::MVCImm:
+ case SystemZ::MVCReg:
return emitMemMemWrapper(MI, MBB, SystemZ::MVC);
- case SystemZ::NCSequence:
- case SystemZ::NCLoop:
+ case SystemZ::NCImm:
return emitMemMemWrapper(MI, MBB, SystemZ::NC);
- case SystemZ::OCSequence:
- case SystemZ::OCLoop:
+ case SystemZ::OCImm:
return emitMemMemWrapper(MI, MBB, SystemZ::OC);
- case SystemZ::XCSequence:
- case SystemZ::XCLoop:
- case SystemZ::XCLoopVarLen:
+ case SystemZ::XCImm:
+ case SystemZ::XCReg:
return emitMemMemWrapper(MI, MBB, SystemZ::XC);
- case SystemZ::CLCSequence:
- case SystemZ::CLCLoop:
+ case SystemZ::CLCImm:
+ case SystemZ::CLCReg:
return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
case SystemZ::CLSTLoop:
return emitStringWrapper(MI, MBB, SystemZ::CLST);
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 248efc11b87f..461f804ca55e 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -117,23 +117,14 @@ enum NodeType : unsigned {
// MachineMemOperands rather than one.
MVC,
- // Like MVC, but implemented as a loop that handles X*256 bytes
- // followed by straight-line code to handle the rest (if any).
- // The value of X is passed as an additional operand.
- MVC_LOOP,
-
- // Similar to MVC and MVC_LOOP, but for logic operations (AND, OR, XOR).
+ // Similar to MVC, but for logic operations (AND, OR, XOR).
NC,
- NC_LOOP,
OC,
- OC_LOOP,
XC,
- XC_LOOP,
// Use CLC to compare two blocks of memory, with the same comments
- // as for MVC and MVC_LOOP.
+ // as for MVC.
CLC,
- CLC_LOOP,
// Use an MVST-based sequence to implement stpcpy().
STPCPY,
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index 337164d55e5f..7cbe125533d3 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -128,9 +128,10 @@ let Predicates = [FeatureNoVectorEnhancements1] in {
(EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
}
-defm LoadStoreF32 : MVCLoadStore<load, f32, MVCSequence, 4>;
-defm LoadStoreF64 : MVCLoadStore<load, f64, MVCSequence, 8>;
-defm LoadStoreF128 : MVCLoadStore<load, f128, MVCSequence, 16>;
+// The length is given as one less for MVCImm.
+defm LoadStoreF32 : MVCLoadStore<load, f32, MVCImm, 3>;
+defm LoadStoreF64 : MVCLoadStore<load, f64, MVCImm, 7>;
+defm LoadStoreF128 : MVCLoadStore<load, f128, MVCImm, 15>;
//===----------------------------------------------------------------------===//
// Load instructions
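The 3/7/15 lengths follow the new convention that the pseudos carry the byte count minus one, matching the SS-format length field (and the EXRL'd target instruction), which encodes bytes - 1. This standalone snippet simply prints that mapping for the three float widths:

#include <cstdio>

// The MVCImm pseudos carry (bytes - 1); the expansion adds the 1 back, so
// f32/f64/f128 copies use length operands 3, 7 and 15 respectively.
int main() {
  struct { const char *Ty; unsigned Bytes; } Cases[] = {
      {"f32", 4}, {"f64", 8}, {"f128", 16}};
  for (const auto &C : Cases)
    std::printf("%-4s : %2u bytes -> pseudo length operand %2u\n",
                C.Ty, C.Bytes, C.Bytes - 1);
}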
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index 5cb46cdb36a6..cd60fff1ab11 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -5329,42 +5329,37 @@ multiclass CondUnaryRSYPseudoAndMemFold<string mnemonic,
// Define an instruction that operates on two fixed-length blocks of memory,
// and associated pseudo instructions for operating on blocks of any size.
-// The Sequence form uses a straight-line sequence of instructions and
-// the Loop form uses a loop of length-256 instructions followed by
-// another instruction to handle the excess.
-// The LoopVarLen form is for a loop with a non-constant length parameter.
-multiclass MemorySS<string mnemonic, bits<8> opcode,
- SDPatternOperator sequence, SDPatternOperator loop> {
+// There are two pseudos, one for each of the cases where the length is
+// constant or variable. The length operand of a pseudo is actually one less
+// than the intended number of bytes, since the register case needs to use an
+// EXRL with a target instruction that always adds one to the length.
+multiclass MemorySS<string mnemonic, bits<8> opcode, SDPatternOperator memop> {
def "" : SideEffectBinarySSa<mnemonic, opcode>;
let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [CC] in {
- def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
- imm64:$length),
- [(sequence bdaddr12only:$dest, bdaddr12only:$src,
- imm64:$length)]>;
- def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
- imm64:$length, GR64:$count256),
- [(loop bdaddr12only:$dest, bdaddr12only:$src,
- imm64:$length, GR64:$count256)]>;
- def LoopVarLen : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
- GR64:$length, GR64:$count256),
- [(loop bdaddr12only:$dest, bdaddr12only:$src,
- GR64:$length, GR64:$count256)]>;
+ def Imm : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length),
+ [(memop bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length)]>;
+ def Reg : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ ADDR64:$length),
+ [(memop bdaddr12only:$dest, bdaddr12only:$src,
+ ADDR64:$length)]>;
}
}
// The same, but setting a CC result as comparison operator.
multiclass CompareMemorySS<string mnemonic, bits<8> opcode,
- SDPatternOperator sequence, SDPatternOperator loop> {
+ SDPatternOperator memop> {
def "" : SideEffectBinarySSa<mnemonic, opcode>;
let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
- def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
- imm64:$length),
- [(set CC, (sequence bdaddr12only:$dest, bdaddr12only:$src,
- imm64:$length))]>;
- def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
- imm64:$length, GR64:$count256),
- [(set CC, (loop bdaddr12only:$dest, bdaddr12only:$src,
- imm64:$length, GR64:$count256))]>;
+ def Imm : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length),
+ [(set CC, (memop bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length))]>;
+ def Reg : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ ADDR64:$length),
+ [(set CC, (memop bdaddr12only:$dest, bdaddr12only:$src,
+ ADDR64:$length))]>;
}
}
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index b9f64198f4e5..2bf80882fa61 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -27,6 +27,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrDesc.h"
@@ -514,8 +515,8 @@ unsigned SystemZInstrInfo::insertBranch(MachineBasicBlock &MBB,
}
bool SystemZInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
- Register &SrcReg2, int &Mask,
- int &Value) const {
+ Register &SrcReg2, int64_t &Mask,
+ int64_t &Value) const {
assert(MI.isCompare() && "Caller should have checked for a comparison");
if (MI.getNumExplicitOperands() == 2 && MI.getOperand(0).isReg() &&
@@ -942,8 +943,9 @@ static void transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
NewMI->setFlag(Flag);
}
-MachineInstr *SystemZInstrInfo::convertToThreeAddress(
- MachineFunction::iterator &MFI, MachineInstr &MI, LiveVariables *LV) const {
+MachineInstr *
+SystemZInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
+ LiveIntervals *LIS) const {
MachineBasicBlock *MBB = MI.getParent();
// Try to convert an AND into an RISBG-type instruction.
@@ -984,6 +986,8 @@ MachineInstr *SystemZInstrInfo::convertToThreeAddress(
LV->replaceKillInstruction(Op.getReg(), MI, *MIB);
}
}
+ if (LIS)
+ LIS->ReplaceMachineInstrInMaps(MI, *MIB);
transferDeadCC(&MI, MIB);
return MIB;
}
@@ -1515,6 +1519,13 @@ unsigned SystemZInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
const char *AsmStr = MI.getOperand(0).getSymbolName();
return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
}
+ else if (MI.getOpcode() == SystemZ::PATCHPOINT)
+ return PatchPointOpers(&MI).getNumPatchBytes();
+ else if (MI.getOpcode() == SystemZ::STACKMAP)
+ return MI.getOperand(1).getImm();
+ else if (MI.getOpcode() == SystemZ::FENTRY_CALL)
+ return 6;
+
return MI.getDesc().getSize();
}
@@ -1923,7 +1934,7 @@ void SystemZInstrInfo::loadImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned Reg, uint64_t Value) const {
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
- unsigned Opcode;
+ unsigned Opcode = 0;
if (isInt<16>(Value))
Opcode = SystemZ::LGHI;
else if (SystemZ::isImmLL(Value))
@@ -1931,11 +1942,23 @@ void SystemZInstrInfo::loadImmediate(MachineBasicBlock &MBB,
else if (SystemZ::isImmLH(Value)) {
Opcode = SystemZ::LLILH;
Value >>= 16;
- } else {
- assert(isInt<32>(Value) && "Huge values not handled yet");
+ } else if (isInt<32>(Value))
Opcode = SystemZ::LGFI;
+ if (Opcode) {
+ BuildMI(MBB, MBBI, DL, get(Opcode), Reg).addImm(Value);
+ return;
}
- BuildMI(MBB, MBBI, DL, get(Opcode), Reg).addImm(Value);
+
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ assert(MRI.isSSA() && "Huge values only handled before reg-alloc.");
+ Register Reg0 = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
+ Register Reg1 = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::IMPLICIT_DEF), Reg0);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::IIHF64), Reg1)
+ .addReg(Reg0).addImm(Value >> 32);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::IILF64), Reg)
+ .addReg(Reg1).addImm(Value & ((uint64_t(1) << 32) - 1));
}
bool SystemZInstrInfo::verifyInstruction(const MachineInstr &MI,
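For 64-bit constants that fit none of the single-instruction forms, the extended loadImmediate now materializes the value with an IIHF64/IILF64 pair over the two 32-bit halves. The sketch below shows the same split and recombination on the host:

#include <cstdint>
#include <cstdio>

// Split a 64-bit immediate into the two 32-bit halves that IIHF64 and IILF64
// insert, mirroring the new fallback path in loadImmediate().
int main() {
  uint64_t Value = 0x123456789abcdef0ULL;
  uint64_t HighHalf = Value >> 32;                      // goes into IIHF64
  uint64_t LowHalf = Value & ((uint64_t(1) << 32) - 1); // goes into IILF64
  std::printf("IIHF immediate: 0x%08llx\n", (unsigned long long)HighHalf);
  std::printf("IILF immediate: 0x%08llx\n", (unsigned long long)LowHalf);
  std::printf("recombined:     0x%016llx\n",
              (unsigned long long)((HighHalf << 32) | LowHalf));
}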
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index 72dafc3c93c2..396f56c7f59c 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -47,7 +47,8 @@ enum {
CCMaskFirst = (1 << 18),
CCMaskLast = (1 << 19),
IsLogical = (1 << 20),
- CCIfNoSignedWrap = (1 << 21)
+ CCIfNoSignedWrap = (1 << 21),
+ MemMemOp = (1 << 22)
};
static inline unsigned getAccessSize(unsigned int Flags) {
@@ -234,7 +235,8 @@ public:
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
- Register &SrcReg2, int &Mask, int &Value) const override;
+ Register &SrcReg2, int64_t &Mask,
+ int64_t &Value) const override;
bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
Register, Register, Register, int &, int &,
int &) const override;
@@ -270,9 +272,8 @@ public:
Register DestReg, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
- MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineInstr &MI,
- LiveVariables *LV) const override;
+ MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
+ LiveIntervals *LIS) const override;
MachineInstr *
foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
ArrayRef<unsigned> Ops,
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
index 7df7cc93d6eb..e4760229fd6b 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -503,7 +503,7 @@ def MVGHI : StoreSIL<"mvghi", 0xE548, store, imm64sx16>;
// Memory-to-memory moves.
let mayLoad = 1, mayStore = 1 in
- defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>;
+ defm MVC : MemorySS<"mvc", 0xD2, z_mvc>;
let mayLoad = 1, mayStore = 1, Defs = [CC] in {
def MVCL : SideEffectBinaryMemMemRR<"mvcl", 0x0E, GR128, GR128>;
def MVCLE : SideEffectTernaryMemMemRS<"mvcle", 0xA8, GR128, GR128>;
@@ -1200,7 +1200,7 @@ let Defs = [CC] in {
// Block AND.
let mayLoad = 1, mayStore = 1 in
- defm NC : MemorySS<"nc", 0xD4, z_nc, z_nc_loop>;
+ defm NC : MemorySS<"nc", 0xD4, z_nc>;
}
defm : RMWIByte<and, bdaddr12pair, NI>;
defm : RMWIByte<and, bdaddr20pair, NIY>;
@@ -1257,7 +1257,7 @@ let Defs = [CC] in {
// Block OR.
let mayLoad = 1, mayStore = 1 in
- defm OC : MemorySS<"oc", 0xD6, z_oc, z_oc_loop>;
+ defm OC : MemorySS<"oc", 0xD6, z_oc>;
}
defm : RMWIByte<or, bdaddr12pair, OI>;
defm : RMWIByte<or, bdaddr20pair, OIY>;
@@ -1297,7 +1297,7 @@ let Defs = [CC] in {
// Block XOR.
let mayLoad = 1, mayStore = 1 in
- defm XC : MemorySS<"xc", 0xD7, z_xc, z_xc_loop>;
+ defm XC : MemorySS<"xc", 0xD7, z_xc>;
}
defm : RMWIByte<xor, bdaddr12pair, XI>;
defm : RMWIByte<xor, bdaddr20pair, XIY>;
@@ -1624,7 +1624,7 @@ defm : ZXB<z_ucmp, GR64, CLGFR>;
// Memory-to-memory comparison.
let mayLoad = 1, Defs = [CC] in {
- defm CLC : CompareMemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
+ defm CLC : CompareMemorySS<"clc", 0xD5, z_clc>;
def CLCL : SideEffectBinaryMemMemRR<"clcl", 0x0F, GR128, GR128>;
def CLCLE : SideEffectTernaryMemMemRS<"clcle", 0xA9, GR128, GR128>;
def CLCLU : SideEffectTernaryMemMemRSY<"clclu", 0xEB8F, GR128, GR128>;
@@ -2173,7 +2173,7 @@ let hasSideEffects = 1 in {
def EX : SideEffectBinaryRX<"ex", 0x44, ADDR64>;
def EXRL : SideEffectBinaryRILPC<"exrl", 0xC60, ADDR64>;
let hasNoSchedulingInfo = 1 in
- def EXRL_Pseudo : Pseudo<(outs), (ins i64imm:$TargetOpc, ADDR64:$lenMinus1,
+ def EXRL_Pseudo : Alias<6, (outs), (ins i64imm:$TargetOpc, ADDR64:$lenMinus1,
bdaddr12only:$bdl1, bdaddr12only:$bd2),
[]>;
}
@@ -2355,21 +2355,15 @@ let AddedComplexity = 4 in {
(RLLG GR64:$val, (LCR GR32:$shift), 0)>;
}
-// Peepholes for turning scalar operations into block operations.
-defm : BlockLoadStore<anyextloadi8, i32, MVCSequence, NCSequence, OCSequence,
- XCSequence, 1>;
-defm : BlockLoadStore<anyextloadi16, i32, MVCSequence, NCSequence, OCSequence,
- XCSequence, 2>;
-defm : BlockLoadStore<load, i32, MVCSequence, NCSequence, OCSequence,
- XCSequence, 4>;
-defm : BlockLoadStore<anyextloadi8, i64, MVCSequence, NCSequence,
- OCSequence, XCSequence, 1>;
-defm : BlockLoadStore<anyextloadi16, i64, MVCSequence, NCSequence, OCSequence,
- XCSequence, 2>;
-defm : BlockLoadStore<anyextloadi32, i64, MVCSequence, NCSequence, OCSequence,
- XCSequence, 4>;
-defm : BlockLoadStore<load, i64, MVCSequence, NCSequence, OCSequence,
- XCSequence, 8>;
+// Peepholes for turning scalar operations into block operations. The length
+// is given as one less for these pseudos.
+defm : BlockLoadStore<anyextloadi8, i32, MVCImm, NCImm, OCImm, XCImm, 0>;
+defm : BlockLoadStore<anyextloadi16, i32, MVCImm, NCImm, OCImm, XCImm, 1>;
+defm : BlockLoadStore<load, i32, MVCImm, NCImm, OCImm, XCImm, 3>;
+defm : BlockLoadStore<anyextloadi8, i64, MVCImm, NCImm, OCImm, XCImm, 0>;
+defm : BlockLoadStore<anyextloadi16, i64, MVCImm, NCImm, OCImm, XCImm, 1>;
+defm : BlockLoadStore<anyextloadi32, i64, MVCImm, NCImm, OCImm, XCImm, 3>;
+defm : BlockLoadStore<load, i64, MVCImm, NCImm, OCImm, XCImm, 7>;
//===----------------------------------------------------------------------===//
// Mnemonic Aliases
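The table above encodes the new convention directly: the MVCImm/NCImm/OCImm/XCImm pseudos carry the access length minus one, and the expansion adds the one back. A trivial standalone sketch of that round trip (illustrative helper names, not LLVM code):

    #include <cassert>
    #include <cstdint>

    // 1-byte accesses use length operand 0, 2-byte -> 1, 4-byte -> 3, 8-byte -> 7.
    static uint64_t pseudoLenOperand(uint64_t Bytes) { return Bytes - 1; }
    static uint64_t expandedLength(uint64_t LenMinus1) { return LenMinus1 + 1; }

    int main() {
      assert(pseudoLenOperand(4) == 3);
      assert(expandedLength(pseudoLenOperand(8)) == 8);
      return 0;
    }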
diff --git a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
index b1964321c78a..9c985c16f082 100644
--- a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -209,10 +209,24 @@ void SystemZLongBranch::skipTerminator(BlockPosition &Position,
Position.Address += Terminator.ExtraRelaxSize;
}
+static unsigned getInstSizeInBytes(const MachineInstr &MI,
+ const SystemZInstrInfo *TII) {
+ unsigned Size = TII->getInstSizeInBytes(MI);
+ assert((Size ||
+ // These do not have a size:
+ MI.isDebugOrPseudoInstr() || MI.isPosition() || MI.isKill() ||
+ MI.isImplicitDef() || MI.getOpcode() == SystemZ::MemBarrier ||
+ // These have a size that may be zero:
+ MI.isInlineAsm() || MI.getOpcode() == SystemZ::STACKMAP ||
+ MI.getOpcode() == SystemZ::PATCHPOINT) &&
+ "Missing size value for instruction.");
+ return Size;
+}
+
// Return a description of terminator instruction MI.
TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr &MI) {
TerminatorInfo Terminator;
- Terminator.Size = TII->getInstSizeInBytes(MI);
+ Terminator.Size = getInstSizeInBytes(MI, TII);
if (MI.isConditionalBranch() || MI.isUnconditionalBranch()) {
switch (MI.getOpcode()) {
case SystemZ::J:
@@ -287,7 +301,7 @@ uint64_t SystemZLongBranch::initMBBInfo() {
MachineBasicBlock::iterator MI = MBB->begin();
MachineBasicBlock::iterator End = MBB->end();
while (MI != End && !MI->isTerminator()) {
- Block.Size += TII->getInstSizeInBytes(*MI);
+ Block.Size += getInstSizeInBytes(*MI, TII);
++MI;
}
skipNonTerminators(Position, Block);
diff --git a/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
index 9bee5e8d1864..4bc979de795d 100644
--- a/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
@@ -46,9 +46,9 @@ static MachineBasicBlock *getSingleSchedPred(MachineBasicBlock *MBB,
// The loop header has two predecessors, return the latch, but not for a
// single block loop.
if (MBB->pred_size() == 2 && Loop != nullptr && Loop->getHeader() == MBB) {
- for (auto I = MBB->pred_begin(); I != MBB->pred_end(); ++I)
- if (Loop->contains(*I))
- PredMBB = (*I == MBB ? nullptr : *I);
+ for (MachineBasicBlock *Pred : MBB->predecessors())
+ if (Loop->contains(Pred))
+ PredMBB = (Pred == MBB ? nullptr : Pred);
}
assert ((PredMBB == nullptr || !Loop || Loop->contains(PredMBB))
@@ -106,13 +106,12 @@ void SystemZPostRASchedStrategy::enterMBB(MachineBasicBlock *NextMBB) {
// Emit incoming terminator(s). Be optimistic and assume that branch
// prediction will generally do "the right thing".
- for (MachineBasicBlock::iterator I = SinglePredMBB->getFirstTerminator();
- I != SinglePredMBB->end(); I++) {
- LLVM_DEBUG(dbgs() << "** Emitting incoming branch: "; I->dump(););
- bool TakenBranch = (I->isBranch() &&
- (TII->getBranchInfo(*I).isIndirect() ||
- TII->getBranchInfo(*I).getMBBTarget() == MBB));
- HazardRec->emitInstruction(&*I, TakenBranch);
+ for (MachineInstr &MI : SinglePredMBB->terminators()) {
+ LLVM_DEBUG(dbgs() << "** Emitting incoming branch: "; MI.dump(););
+ bool TakenBranch = (MI.isBranch() &&
+ (TII->getBranchInfo(MI).isIndirect() ||
+ TII->getBranchInfo(MI).getMBBTarget() == MBB));
+ HazardRec->emitInstruction(&MI, TakenBranch);
if (TakenBranch)
break;
}
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index 992b1512a077..927d97233286 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -102,17 +102,6 @@ def SDT_ZMemMemLengthCC : SDTypeProfile<1, 3,
SDTCisPtrTy<1>,
SDTCisPtrTy<2>,
SDTCisVT<3, i64>]>;
-def SDT_ZMemMemLoop : SDTypeProfile<0, 4,
- [SDTCisPtrTy<0>,
- SDTCisPtrTy<1>,
- SDTCisVT<2, i64>,
- SDTCisVT<3, i64>]>;
-def SDT_ZMemMemLoopCC : SDTypeProfile<1, 4,
- [SDTCisVT<0, i32>,
- SDTCisPtrTy<1>,
- SDTCisPtrTy<2>,
- SDTCisVT<3, i64>,
- SDTCisVT<4, i64>]>;
def SDT_ZString : SDTypeProfile<1, 3,
[SDTCisPtrTy<0>,
SDTCisPtrTy<1>,
@@ -416,24 +405,14 @@ def z_atomic_cmp_swap_128 : SDNode<"SystemZISD::ATOMIC_CMP_SWAP_128",
def z_mvc : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
-def z_mvc_loop : SDNode<"SystemZISD::MVC_LOOP", SDT_ZMemMemLoop,
- [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
def z_nc : SDNode<"SystemZISD::NC", SDT_ZMemMemLength,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
-def z_nc_loop : SDNode<"SystemZISD::NC_LOOP", SDT_ZMemMemLoop,
- [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
def z_oc : SDNode<"SystemZISD::OC", SDT_ZMemMemLength,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
-def z_oc_loop : SDNode<"SystemZISD::OC_LOOP", SDT_ZMemMemLoop,
- [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
def z_xc : SDNode<"SystemZISD::XC", SDT_ZMemMemLength,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
-def z_xc_loop : SDNode<"SystemZISD::XC_LOOP", SDT_ZMemMemLoop,
- [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC,
[SDNPHasChain, SDNPMayLoad]>;
-def z_clc_loop : SDNode<"SystemZISD::CLC_LOOP", SDT_ZMemMemLoopCC,
- [SDNPHasChain, SDNPMayLoad]>;
def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC,
[SDNPHasChain, SDNPMayLoad]>;
def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString,
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 0062e39602f5..48cec176b006 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -190,7 +190,9 @@ bool SystemZRegisterInfo::getRegAllocationHints(
const MCPhysReg *
SystemZXPLINK64Registers::getCalleeSavedRegs(const MachineFunction *MF) const {
- return CSR_SystemZ_XPLINK64_SaveList;
+ const SystemZSubtarget &Subtarget = MF->getSubtarget<SystemZSubtarget>();
+ return Subtarget.hasVector() ? CSR_SystemZ_XPLINK64_Vector_SaveList
+ : CSR_SystemZ_XPLINK64_SaveList;
}
const MCPhysReg *
@@ -211,7 +213,9 @@ SystemZELFRegisters::getCalleeSavedRegs(const MachineFunction *MF) const {
const uint32_t *
SystemZXPLINK64Registers::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
- return CSR_SystemZ_XPLINK64_RegMask;
+ const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+ return Subtarget.hasVector() ? CSR_SystemZ_XPLINK64_Vector_RegMask
+ : CSR_SystemZ_XPLINK64_RegMask;
}
const uint32_t *
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
index 122504d4b44b..8ce01074873a 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -10,6 +10,7 @@
#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZREGISTERINFO_H
#include "SystemZ.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#define GET_REGINFO_HEADER
@@ -44,9 +45,9 @@ inline bool isHighReg(unsigned int Reg) {
/// It is abstract; all calling conventions must override and
/// define the pure virtual member functions declared in this class.
class SystemZCallingConventionRegisters {
+
public:
- /// \returns the register that keeps the
- /// return function address.
+ /// \returns the register that keeps the return function address.
virtual int getReturnFunctionAddressRegister() = 0;
/// \returns the register that keeps the
@@ -65,6 +66,12 @@ public:
virtual const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const = 0;
+ /// \returns the offset to the locals area.
+ virtual int getCallFrameSize() = 0;
+
+ /// \returns the stack pointer bias.
+ virtual int getStackPointerBias() = 0;
+
/// Destroys the object. Bogus destructor allowing derived classes
/// to override it.
virtual ~SystemZCallingConventionRegisters(){};
@@ -82,12 +89,18 @@ public:
int getFramePointerRegister() override final { return SystemZ::R8D; };
+ int getAddressOfCalleeRegister() { return SystemZ::R6D; };
+
const MCPhysReg *
getCalleeSavedRegs(const MachineFunction *MF) const override final;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const override final;
+ int getCallFrameSize() override final { return 128; }
+
+ int getStackPointerBias() override final { return 2048; }
+
/// Destroys the object. Bogus destructor overriding base class destructor
~SystemZXPLINK64Registers(){};
};
@@ -110,6 +123,10 @@ public:
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const override final;
+ int getCallFrameSize() override final { return SystemZMC::ELFCallFrameSize; }
+
+ int getStackPointerBias() override final { return 0; }
+
/// Destroys the object. Bogus destructor overriding base class destructor
~SystemZELFRegisters(){};
};
diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index 4a9ea69d101c..f38e93109967 100644
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -17,32 +17,29 @@ using namespace llvm;
#define DEBUG_TYPE "systemz-selectiondag-info"
-// Decide whether it is best to use a loop or straight-line code for
-// a block operation of Size bytes with source address Src and destination
-// address Dest. Sequence is the opcode to use for straight-line code
-// (such as MVC) and Loop is the opcode to use for loops (such as MVC_LOOP).
-// Return the chain for the completed operation.
-static SDValue emitMemMem(SelectionDAG &DAG, const SDLoc &DL, unsigned Sequence,
- unsigned Loop, SDValue Chain, SDValue Dst,
- SDValue Src, uint64_t Size) {
- EVT PtrVT = Src.getValueType();
- // The heuristic we use is to prefer loops for anything that would
- // require 7 or more MVCs. With these kinds of sizes there isn't
- // much to choose between straight-line code and looping code,
- // since the time will be dominated by the MVCs themselves.
- // However, the loop has 4 or 5 instructions (depending on whether
- // the base addresses can be proved equal), so there doesn't seem
- // much point using a loop for 5 * 256 bytes or fewer. Anything in
- // the range (5 * 256, 6 * 256) will need another instruction after
- // the loop, so it doesn't seem worth using a loop then either.
- // The next value up, 6 * 256, can be implemented in the same
- // number of straight-line MVCs as 6 * 256 - 1.
- if (Size > 6 * 256)
- return DAG.getNode(Loop, DL, MVT::Other, Chain, Dst, Src,
- DAG.getConstant(Size, DL, PtrVT),
- DAG.getConstant(Size / 256, DL, PtrVT));
- return DAG.getNode(Sequence, DL, MVT::Other, Chain, Dst, Src,
- DAG.getConstant(Size, DL, PtrVT));
+static SDVTList getMemMemVTs(unsigned Op, SelectionDAG &DAG) {
+ return Op == SystemZISD::CLC ? DAG.getVTList(MVT::i32, MVT::Other)
+ : DAG.getVTList(MVT::Other);
+}
+
+// Emit a mem-mem operation after subtracting one from the size; the one is
+// added back during pseudo expansion. Since the Reg case emitted here may be
+// converted by DAGCombiner into an Imm length, both cases are emitted in the
+// same way.
+static SDValue emitMemMemImm(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ uint64_t Size) {
+ return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src,
+ DAG.getConstant(Size - 1, DL, Src.getValueType()));
+}
+
+static SDValue emitMemMemReg(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size) {
+ SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64,
+ DAG.getZExtOrTrunc(Size, DL, MVT::i64),
+ DAG.getConstant(-1, DL, MVT::i64));
+ return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src, LenMinus1);
}
SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy(
@@ -53,9 +50,10 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy(
return SDValue();
if (auto *CSize = dyn_cast<ConstantSDNode>(Size))
- return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
- Chain, Dst, Src, CSize->getZExtValue());
- return SDValue();
+ return emitMemMemImm(DAG, DL, SystemZISD::MVC, Chain, Dst, Src,
+ CSize->getZExtValue());
+
+ return emitMemMemReg(DAG, DL, SystemZISD::MVC, Chain, Dst, Src, Size);
}
// Handle a memset of 1, 2, 4 or 8 bytes with the operands given by
@@ -127,52 +125,23 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
// Handle the special case of a memset of 0, which can use XC.
if (CByte && CByte->getZExtValue() == 0)
- return emitMemMem(DAG, DL, SystemZISD::XC, SystemZISD::XC_LOOP,
- Chain, Dst, Dst, Bytes);
+ return emitMemMemImm(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Bytes);
// Copy the byte to the first location and then use MVC to copy
// it to the rest.
Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Alignment);
SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
DAG.getConstant(1, DL, PtrVT));
- return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
- Chain, DstPlus1, Dst, Bytes - 1);
+ return emitMemMemImm(DAG, DL, SystemZISD::MVC, Chain, DstPlus1, Dst,
+ Bytes - 1);
}
// Variable length
- if (CByte && CByte->getZExtValue() == 0) {
+ if (CByte && CByte->getZExtValue() == 0)
// Handle the special case of a variable length memset of 0 with XC.
- SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64,
- DAG.getZExtOrTrunc(Size, DL, MVT::i64),
- DAG.getConstant(-1, DL, MVT::i64));
- SDValue TripC = DAG.getNode(ISD::SRL, DL, MVT::i64, LenMinus1,
- DAG.getConstant(8, DL, MVT::i64));
- return DAG.getNode(SystemZISD::XC_LOOP, DL, MVT::Other, Chain, Dst, Dst,
- LenMinus1, TripC);
- }
- return SDValue();
-}
+ return emitMemMemReg(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Size);
-// Use CLC to compare [Src1, Src1 + Size) with [Src2, Src2 + Size),
-// deciding whether to use a loop or straight-line code.
-static SDValue emitCLC(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
- SDValue Src1, SDValue Src2, uint64_t Size) {
- SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
- EVT PtrVT = Src1.getValueType();
- // A two-CLC sequence is a clear win over a loop, not least because it
- // needs only one branch. A three-CLC sequence needs the same number
- // of branches as a loop (i.e. 2), but is shorter. That brings us to
- // lengths greater than 768 bytes. It seems relatively likely that
- // a difference will be found within the first 768 bytes, so we just
- // optimize for the smallest number of branch instructions, in order
- // to avoid polluting the prediction buffer too much. A loop only ever
- // needs 2 branches, whereas a straight-line sequence would need 3 or more.
- if (Size > 3 * 256)
- return DAG.getNode(SystemZISD::CLC_LOOP, DL, VTs, Chain, Src1, Src2,
- DAG.getConstant(Size, DL, PtrVT),
- DAG.getConstant(Size / 256, DL, PtrVT));
- return DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, Src1, Src2,
- DAG.getConstant(Size, DL, PtrVT));
+ return SDValue();
}
// Convert the current CC value into an integer that is 0 if CC == 0,
@@ -193,15 +162,16 @@ std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemcmp(
SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src1,
SDValue Src2, SDValue Size, MachinePointerInfo Op1PtrInfo,
MachinePointerInfo Op2PtrInfo) const {
+ SDValue CCReg;
+ // Swap operands to invert CC == 1 vs. CC == 2 cases.
if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) {
uint64_t Bytes = CSize->getZExtValue();
assert(Bytes > 0 && "Caller should have handled 0-size case");
- // Swap operands to invert CC == 1 vs. CC == 2 cases.
- SDValue CCReg = emitCLC(DAG, DL, Chain, Src2, Src1, Bytes);
- Chain = CCReg.getValue(1);
- return std::make_pair(addIPMSequence(DL, CCReg, DAG), Chain);
- }
- return std::make_pair(SDValue(), SDValue());
+ CCReg = emitMemMemImm(DAG, DL, SystemZISD::CLC, Chain, Src2, Src1, Bytes);
+ } else
+ CCReg = emitMemMemReg(DAG, DL, SystemZISD::CLC, Chain, Src2, Src1, Size);
+ Chain = CCReg.getValue(1);
+ return std::make_pair(addIPMSequence(DL, CCReg, DAG), Chain);
}
std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemchr(
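The memcmp lowering above keeps the long-standing operand swap ("Swap operands to invert CC == 1 vs. CC == 2 cases"). A standalone reminder of what that swap buys, shown with plain memcmp rather than the CLC condition code:

    #include <cassert>
    #include <cstring>

    // Comparing (Src2, Src1) instead of (Src1, Src2) flips which input is
    // reported as "lower", i.e. it negates the sign of the result. The
    // lowering relies on exactly this when converting CC into a return value.
    int main() {
      const char A[] = "abc", B[] = "abd";
      assert(std::memcmp(A, B, 3) < 0); // A compares lower
      assert(std::memcmp(B, A, 3) > 0); // swapped operands: sign inverted
      return 0;
    }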
diff --git a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
index 3d27b70d6ef9..254e5e92449b 100644
--- a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -211,8 +211,7 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
LiveRegs.addLiveOuts(MBB);
// Iterate backwards through the block looking for instructions to change.
- for (auto MBBI = MBB.rbegin(), MBBE = MBB.rend(); MBBI != MBBE; ++MBBI) {
- MachineInstr &MI = *MBBI;
+ for (MachineInstr &MI : llvm::reverse(MBB)) {
switch (MI.getOpcode()) {
case SystemZ::IILF:
Changed |= shortenIIF(MI, SystemZ::LLILL, SystemZ::LLILH);
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
index bfcdee270f29..0f03d96655bf 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -89,9 +89,7 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
HasSoftFloat(false), TargetTriple(TT),
SpecialRegisters(initializeSpecialRegisters()),
InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
- TSInfo(), FrameLowering() {}
-
-SystemZSubtarget::~SystemZSubtarget() { delete getSpecialRegisters(); }
+ TSInfo(), FrameLowering(SystemZFrameLowering::create(*this)) {}
bool SystemZSubtarget::enableSubRegLiveness() const {
return UseSubRegLiveness;
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.h b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
index f6c155de44a0..67c5b8eb09b6 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
@@ -77,11 +77,11 @@ protected:
private:
Triple TargetTriple;
- SystemZCallingConventionRegisters *SpecialRegisters;
+ std::unique_ptr<SystemZCallingConventionRegisters> SpecialRegisters;
SystemZInstrInfo InstrInfo;
SystemZTargetLowering TLInfo;
SystemZSelectionDAGInfo TSInfo;
- SystemZFrameLowering FrameLowering;
+ std::unique_ptr<const SystemZFrameLowering> FrameLowering;
SystemZSubtarget &initializeSubtargetDependencies(StringRef CPU,
StringRef FS);
@@ -91,16 +91,23 @@ public:
SystemZSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM);
- ~SystemZSubtarget();
-
SystemZCallingConventionRegisters *getSpecialRegisters() const {
assert(SpecialRegisters && "Unsupported SystemZ calling convention");
- return SpecialRegisters;
+ return SpecialRegisters.get();
+ }
+
+ template <class SR> SR &getSpecialRegisters() const {
+ return *static_cast<SR *>(getSpecialRegisters());
}
const TargetFrameLowering *getFrameLowering() const override {
- return &FrameLowering;
+ return FrameLowering.get();
}
+
+ template <class TFL> const TFL *getFrameLowering() const {
+ return static_cast<const TFL *>(getFrameLowering());
+ }
+
const SystemZInstrInfo *getInstrInfo() const override { return &InstrInfo; }
const SystemZRegisterInfo *getRegisterInfo() const override {
return &InstrInfo.getRegisterInfo();
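The new templated accessors exist so callers that already know the concrete calling convention or frame lowering can avoid a static_cast at every use site. A self-contained mock of the pattern, written on the assumption that it mirrors the accessors above (the class names here are stand-ins, not the LLVM types; the 128/2048 values come from the XPLINK64 overrides added in SystemZRegisterInfo.h):

    #include <cassert>
    #include <memory>

    struct CallingConvRegs {
      virtual ~CallingConvRegs() = default;
      virtual int getCallFrameSize() = 0;
      virtual int getStackPointerBias() = 0;
    };

    struct XPLINK64Regs final : CallingConvRegs {
      int getCallFrameSize() override { return 128; }
      int getStackPointerBias() override { return 2048; }
    };

    struct ToySubtarget {
      std::unique_ptr<CallingConvRegs> SpecialRegisters =
          std::make_unique<XPLINK64Regs>();
      CallingConvRegs *getSpecialRegisters() const {
        return SpecialRegisters.get();
      }
      // Typed view for callers that already know the calling convention.
      template <class SR> SR &getSpecialRegisters() const {
        return *static_cast<SR *>(getSpecialRegisters());
      }
    };

    int main() {
      ToySubtarget ST;
      auto &Regs = ST.getSpecialRegisters<XPLINK64Regs>();
      assert(Regs.getCallFrameSize() == 128 && Regs.getStackPointerBias() == 2048);
      return 0;
    }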
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index a886f9b9d814..deb3358102ed 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -21,8 +21,8 @@
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CodeGen.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/Scalar.h"
#include <string>
@@ -84,8 +84,9 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU,
// 128-bit floats are aligned only to 64 bits.
Ret += "-f128:64";
- // When using the vector ABI, 128-bit vectors are also aligned to 64 bits.
- if (VectorABI)
+ // When using the vector ABI on Linux, 128-bit vectors are also aligned to 64
+ // bits. On z/OS, vector types are always aligned to 64 bits.
+ if (VectorABI || TT.isOSzOS())
Ret += "-v128:64";
// We prefer 16-bit alignment for all globals; see above.
@@ -284,7 +285,7 @@ void SystemZPassConfig::addPreEmitPass() {
// vector instructions will be shortened into opcodes that compare
// elimination recognizes.
if (getOptLevel() != CodeGenOpt::None)
- addPass(createSystemZShortenInstPass(getSystemZTargetMachine()), false);
+ addPass(createSystemZShortenInstPass(getSystemZTargetMachine()));
// We eliminate comparisons here rather than earlier because some
// transformations can change the set of available CC values and we
@@ -310,7 +311,7 @@ void SystemZPassConfig::addPreEmitPass() {
// between the comparison and the branch, but it isn't clear whether
// preventing that would be a win or not.
if (getOptLevel() != CodeGenOpt::None)
- addPass(createSystemZElimComparePass(getSystemZTargetMachine()), false);
+ addPass(createSystemZElimComparePass(getSystemZTargetMachine()));
addPass(createSystemZLongBranchPass(getSystemZTargetMachine()));
// Do final scheduling after all other optimizations, to get an
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetStreamer.h b/llvm/lib/Target/SystemZ/SystemZTargetStreamer.h
new file mode 100644
index 000000000000..a610a90d2069
--- /dev/null
+++ b/llvm/lib/Target/SystemZ/SystemZTargetStreamer.h
@@ -0,0 +1,55 @@
+//=- SystemZTargetStreamer.h - SystemZ Target Streamer ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETSTREAMER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+
+class SystemZTargetStreamer : public MCTargetStreamer {
+public:
+ SystemZTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
+ typedef std::pair<MCInst, const MCSubtargetInfo *> MCInstSTIPair;
+ struct CmpMCInst {
+ bool operator()(const MCInstSTIPair &MCI_STI_A,
+ const MCInstSTIPair &MCI_STI_B) const {
+ if (MCI_STI_A.second != MCI_STI_B.second)
+ return uintptr_t(MCI_STI_A.second) < uintptr_t(MCI_STI_B.second);
+ const MCInst &A = MCI_STI_A.first;
+ const MCInst &B = MCI_STI_B.first;
+ assert(A.getNumOperands() == B.getNumOperands() &&
+ A.getNumOperands() == 5 && A.getOperand(2).getImm() == 1 &&
+ B.getOperand(2).getImm() == 1 && "Unexpected EXRL target MCInst");
+ if (A.getOpcode() != B.getOpcode())
+ return A.getOpcode() < B.getOpcode();
+ if (A.getOperand(0).getReg() != B.getOperand(0).getReg())
+ return A.getOperand(0).getReg() < B.getOperand(0).getReg();
+ if (A.getOperand(1).getImm() != B.getOperand(1).getImm())
+ return A.getOperand(1).getImm() < B.getOperand(1).getImm();
+ if (A.getOperand(3).getReg() != B.getOperand(3).getReg())
+ return A.getOperand(3).getReg() < B.getOperand(3).getReg();
+ if (A.getOperand(4).getImm() != B.getOperand(4).getImm())
+ return A.getOperand(4).getImm() < B.getOperand(4).getImm();
+ return false;
+ }
+ };
+ typedef std::map<MCInstSTIPair, MCSymbol *, CmpMCInst> EXRLT2SymMap;
+ EXRLT2SymMap EXRLTargets2Sym;
+
+ void emitConstantPools() override;
+
+ virtual void emitMachine(StringRef CPU) = 0;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETSTREAMER_H
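EXRLTargets2Sym maps each distinct (MCInst, MCSubtargetInfo) pair to a single out-of-line EXRL target symbol, so CmpMCInst only has to supply a strict weak ordering over the fields that distinguish targets. A self-contained mock of that deduplication (stand-in types, not the real MC classes):

    #include <cassert>
    #include <map>
    #include <string>
    #include <utility>

    struct FakeInst { unsigned Opcode; unsigned Reg; long Imm; };

    // Order first by the subtarget pointer, then by the instruction fields.
    struct CmpFake {
      bool operator()(const std::pair<FakeInst, const void *> &A,
                      const std::pair<FakeInst, const void *> &B) const {
        if (A.second != B.second)
          return A.second < B.second;
        if (A.first.Opcode != B.first.Opcode)
          return A.first.Opcode < B.first.Opcode;
        if (A.first.Reg != B.first.Reg)
          return A.first.Reg < B.first.Reg;
        return A.first.Imm < B.first.Imm;
      }
    };

    int main() {
      std::map<std::pair<FakeInst, const void *>, std::string, CmpFake> Targets;
      std::pair<FakeInst, const void *> K{{0xD2, 1, 7}, nullptr};
      Targets.emplace(K, ".Ltgt0");           // first use creates a symbol
      auto R = Targets.emplace(K, ".Ltgt1");  // duplicate reuses the first one
      assert(!R.second && Targets.size() == 1);
      return 0;
    }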
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 03c4da8495ab..6d66ebfced05 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -243,7 +243,8 @@ SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
}
void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP) {
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE) {
// Find out if L contains a call, what the machine instruction count
// estimate is, and how many stores there are.
bool HasCall = false;
@@ -423,8 +424,8 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
(C->getType()->isVectorTy()
? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
: dyn_cast<const ConstantInt>(C));
- if (CVal != nullptr &&
- (CVal->getValue().isPowerOf2() || (-CVal->getValue()).isPowerOf2()))
+ if (CVal && (CVal->getValue().isPowerOf2() ||
+ CVal->getValue().isNegatedPowerOf2()))
DivRemConstPow2 = true;
else
DivRemConst = true;
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 51cf557ae99b..db4ec794b3e4 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -52,7 +52,8 @@ public:
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP);
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE);
void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP);
@@ -82,8 +83,7 @@ public:
bool enableInterleavedAccessVectorization() { return true; }
InstructionCost getArithmeticInstrCost(
- unsigned Opcode, Type *Ty,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -115,8 +115,7 @@ public:
InstructionCost getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
- Align Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
diff --git a/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp b/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
index 36291e079882..acfafd91bc17 100644
--- a/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
+++ b/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/SystemZTargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index 0a655a82b889..390457dbb2bc 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -101,46 +101,41 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
// dso_preemptable. At this point in time, the various IR producers
// have not been transitioned to always produce a dso_local when it
// is possible to do so.
- // In the case of ExternalSymbolSDNode, GV is null and we should just return
- // false. However, COFF currently relies on this to be true
//
// As a result we still have some logic in here to improve the quality of the
// generated code.
- // FIXME: Add a module level metadata for whether intrinsics should be assumed
- // local.
if (!GV)
- return TT.isOSBinFormatCOFF();
+ return false;
// If the IR producer requested that this GV be treated as dso local, obey.
if (GV->isDSOLocal())
return true;
- // DLLImport explicitly marks the GV as external.
- if (GV->hasDLLImportStorageClass())
- return false;
-
- // On MinGW, variables that haven't been declared with DLLImport may still
- // end up automatically imported by the linker. To make this feasible,
- // don't assume the variables to be DSO local unless we actually know
- // that for sure. This only has to be done for variables; for functions
- // the linker can insert thunks for calling functions from another DLL.
- if (TT.isWindowsGNUEnvironment() && TT.isOSBinFormatCOFF() &&
- GV->isDeclarationForLinker() && isa<GlobalVariable>(GV))
- return false;
-
- // On COFF, don't mark 'extern_weak' symbols as DSO local. If these symbols
- // remain unresolved in the link, they can be resolved to zero, which is
- // outside the current DSO.
- if (TT.isOSBinFormatCOFF() && GV->hasExternalWeakLinkage())
- return false;
+ if (TT.isOSBinFormatCOFF()) {
+ // DLLImport explicitly marks the GV as external.
+ if (GV->hasDLLImportStorageClass())
+ return false;
+
+ // On MinGW, variables that haven't been declared with DLLImport may still
+ // end up automatically imported by the linker. To make this feasible,
+ // don't assume the variables to be DSO local unless we actually know
+ // that for sure. This only has to be done for variables; for functions
+ // the linker can insert thunks for calling functions from another DLL.
+ if (TT.isWindowsGNUEnvironment() && GV->isDeclarationForLinker() &&
+ isa<GlobalVariable>(GV))
+ return false;
+
+ // Don't mark 'extern_weak' symbols as DSO local. If these symbols remain
+ // unresolved in the link, they can be resolved to zero, which is outside
+ // the current DSO.
+ if (GV->hasExternalWeakLinkage())
+ return false;
+
+ // Every other GV is local on COFF.
+ return true;
+ }
- // Every other GV is local on COFF.
- // Make an exception for windows OS in the triple: Some firmware builds use
- // *-win32-macho triples. This (accidentally?) produced windows relocations
- // without GOT tables in older clang versions; Keep this behaviour.
- // Some JIT users use *-win32-elf triples; these shouldn't use GOT tables
- // either.
- if (TT.isOSBinFormatCOFF() || TT.isOSWindows())
+ if (TT.isOSBinFormatGOFF())
return true;
if (TT.isOSBinFormatMachO()) {
@@ -149,13 +144,8 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
return GV->isStrongDefinitionForLinker();
}
- // Due to the AIX linkage model, any global with default visibility is
- // considered non-local.
- if (TT.isOSBinFormatXCOFF())
- return false;
-
- assert(TT.isOSBinFormatELF() || TT.isOSBinFormatWasm());
- assert(RM != Reloc::DynamicNoPIC);
+ assert(TT.isOSBinFormatELF() || TT.isOSBinFormatWasm() ||
+ TT.isOSBinFormatXCOFF());
return false;
}
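A compressed restatement of the reorganized logic above, as a sketch only (a toy function with illustrative parameters, not the TargetMachine API; the ELF, Mach-O, and GOFF details are elided): a null GV now answers false for every object format, and all COFF special cases sit under one branch that falls through to "local by default".

    #include <cassert>

    enum class Fmt { COFF, ELF };

    static bool toyShouldAssumeDSOLocal(Fmt F, bool HasGV, bool MarkedDSOLocal,
                                        bool DLLImport, bool MinGWDeclaredVar,
                                        bool ExternWeak) {
      if (!HasGV)
        return false; // previously COFF answered true for a null GV
      if (MarkedDSOLocal)
        return true;  // obey the IR producer
      if (F == Fmt::COFF)
        return !DLLImport && !MinGWDeclaredVar && !ExternWeak;
      return false;   // ELF and friends: left to the remaining checks
    }

    int main() {
      assert(!toyShouldAssumeDSOLocal(Fmt::COFF, false, false, false, false, false));
      assert(toyShouldAssumeDSOLocal(Fmt::COFF, true, false, false, false, false));
      assert(!toyShouldAssumeDSOLocal(Fmt::COFF, true, false, true, false, false));
      return 0;
    }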
diff --git a/llvm/lib/Target/TargetMachineC.cpp b/llvm/lib/Target/TargetMachineC.cpp
index 60fe84cadacc..55047a1bb3cd 100644
--- a/llvm/lib/Target/TargetMachineC.cpp
+++ b/llvm/lib/Target/TargetMachineC.cpp
@@ -18,10 +18,10 @@
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/Host.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/CodeGenCWrappers.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
index a3309a68c76d..7e92e4b33812 100644
--- a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
+++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
@@ -25,7 +25,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <memory>
diff --git a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
index 20d609bc6b32..72c40cbe78c4 100644
--- a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
+++ b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
@@ -18,7 +18,7 @@
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
index 9a6ae90b5c73..29c209934680 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
@@ -15,8 +15,8 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/EndianStream.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -164,7 +164,8 @@ public:
llvm_unreachable("relaxInstruction() should not be called");
}
- bool writeNopData(raw_ostream &OS, uint64_t Count) const override {
+ bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const override {
if ((Count % 8) != 0)
return false;
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
index 76824335239b..9f29fc092c69 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
@@ -37,5 +37,4 @@ VEELFMCAsmInfo::VEELFMCAsmInfo(const Triple &TheTriple) {
UsesELFSectionDirectiveForBSS = true;
SupportsDebugInformation = true;
- UseIntegratedAssembler = false;
}
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
index 4c480c050274..f4fbf763e59c 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
@@ -18,8 +18,8 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp b/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp
index a95a299def88..7c4bf1cfd672 100644
--- a/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp
+++ b/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/VETargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/VE/VEAsmPrinter.cpp b/llvm/lib/Target/VE/VEAsmPrinter.cpp
index 08a75b6b8c55..af69d04a17ca 100644
--- a/llvm/lib/Target/VE/VEAsmPrinter.cpp
+++ b/llvm/lib/Target/VE/VEAsmPrinter.cpp
@@ -30,7 +30,7 @@
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index b297e0fcd1a2..32315543826a 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -2508,13 +2508,12 @@ static bool isI32Insn(const SDNode *User, const SDNode *N) {
case ISD::CopyToReg:
// Check all uses of selections, bit operations, and copies. If all of them
// are safe, optimize the truncate to extract_subreg.
- for (SDNode::use_iterator UI = User->use_begin(), UE = User->use_end();
- UI != UE; ++UI) {
- switch ((*UI)->getOpcode()) {
+ for (const SDNode *U : User->uses()) {
+ switch (U->getOpcode()) {
default:
// If the use is an instruction which treats the source operand as i32,
// it is safe to avoid truncate here.
- if (isI32Insn(*UI, N))
+ if (isI32Insn(U, N))
continue;
break;
case ISD::ANY_EXTEND:
@@ -2561,10 +2560,7 @@ SDValue VETargetLowering::combineTRUNCATE(SDNode *N,
return SDValue();
// Check all uses of this TRUNCATE.
- for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
- ++UI) {
- SDNode *User = *UI;
-
+ for (const SDNode *User : N->uses()) {
// Make sure that we're not going to replace TRUNCATE for non i32
// instructions.
//
diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp
index 9770052ff913..ddcfb9da8249 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.cpp
+++ b/llvm/lib/Target/VE/VEInstrInfo.cpp
@@ -20,10 +20,10 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#define DEBUG_TYPE "ve-instr-info"
@@ -99,7 +99,7 @@ static bool isUncondBranchOpcode(int Opc) {
#define BRKIND(NAME) (Opc == NAME##a || Opc == NAME##a_nt || Opc == NAME##a_t)
// VE has other branch relative always instructions for word/double/float,
- // but we use only long branches in our lower. So, sanity check it here.
+ // but we use only long branches in our lowering, so check it here.
assert(!BRKIND(BRCFW) && !BRKIND(BRCFD) && !BRKIND(BRCFS) &&
"Branch relative word/double/float always instructions should not be "
"used!");
@@ -127,7 +127,7 @@ static bool isIndirectBranchOpcode(int Opc) {
#define BRKIND(NAME) \
(Opc == NAME##ari || Opc == NAME##ari_nt || Opc == NAME##ari_t)
// VE has other branch always instructions for word/double/float, but
- // we use only long branches in our lower. So, sanity check it here.
+ // we use only long branches in our lowering, so check it here.
assert(!BRKIND(BCFW) && !BRKIND(BCFD) && !BRKIND(BCFS) &&
"Branch word/double/float always instructions should not be used!");
return BRKIND(BCFL);
diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td
index 2f77daae7130..c3abbe2cafab 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/llvm/lib/Target/VE/VEInstrInfo.td
@@ -634,9 +634,7 @@ multiclass RRIm<string opcStr, bits<8>opc,
// Special RR multiclass for 128 bits shift left instruction.
// e.g. SLD
let Constraints = "$hi = $sx", DisableEncoding = "$hi", hasSideEffects = 0 in
-multiclass RRILDm<string opcStr, bits<8>opc,
- RegisterClass RC, ValueType Ty,
- SDPatternOperator OpNode = null_frag> {
+multiclass RRILDm<string opcStr, bits<8>opc, RegisterClass RC> {
def rrr : RR<opc, (outs RC:$sx), (ins RC:$hi, RC:$sz, I32:$sy),
!strconcat(opcStr, " $sx, $sz, $sy")>;
let cz = 0 in
@@ -653,9 +651,7 @@ multiclass RRILDm<string opcStr, bits<8>opc,
// Special RR multiclass for 128 bits shift right instruction.
// e.g. SRD
let Constraints = "$low = $sx", DisableEncoding = "$low", hasSideEffects = 0 in
-multiclass RRIRDm<string opcStr, bits<8>opc,
- RegisterClass RC, ValueType Ty,
- SDPatternOperator OpNode = null_frag> {
+multiclass RRIRDm<string opcStr, bits<8>opc, RegisterClass RC> {
def rrr : RR<opc, (outs RC:$sx), (ins RC:$sz, RC:$low, I32:$sy),
!strconcat(opcStr, " $sx, $sz, $sy")>;
let cz = 0 in
@@ -685,7 +681,7 @@ multiclass RRI1m<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty,
// Special RR multiclass for MRG instruction.
// e.g. MRG
let Constraints = "$sx = $sd", DisableEncoding = "$sd", hasSideEffects = 0 in
-multiclass RRMRGm<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty> {
+multiclass RRMRGm<string opcStr, bits<8>opc, RegisterClass RC> {
def rr : RR<opc, (outs RC:$sx), (ins RC:$sy, RC:$sz, RC:$sd),
!strconcat(opcStr, " $sx, $sy, $sz")>;
let cy = 0 in
@@ -719,7 +715,7 @@ multiclass RRSWPm<string opcStr, bits<8>opc,
// e.g. CMOVL, CMOVW, CMOVD, and etc.
let Constraints = "$sx = $sd", DisableEncoding = "$sd", hasSideEffects = 0,
cfw = ? in
-multiclass RRCMOVm<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty> {
+multiclass RRCMOVm<string opcStr, bits<8>opc, RegisterClass RC> {
def rr : RR<opc, (outs I64:$sx), (ins CCOp:$cfw, RC:$sy, I64:$sz, I64:$sd),
!strconcat(opcStr, " $sx, $sz, $sy")>;
let cy = 0 in
@@ -740,8 +736,8 @@ multiclass RRCMOVm<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty> {
// e.g. CVTWDSX, CVTWDZX, CVTWSSX, and etc.
// sz{3-0} = rounding mode
let cz = 0, hasSideEffects = 0 in
-multiclass CVTRDm<string opcStr, bits<8> opc, RegisterClass RCo, ValueType Tyo,
- RegisterClass RCi, ValueType Tyi> {
+multiclass CVTRDm<string opcStr, bits<8> opc, RegisterClass RCo,
+ RegisterClass RCi> {
def r : RR<opc, (outs RCo:$sx), (ins RDOp:$rd, RCi:$sy),
!strconcat(opcStr, "${rd} $sx, $sy")> {
bits<4> rd;
@@ -1265,7 +1261,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm NND : RRNCm<"nnd", 0x54, I64, i64, and_not>;
// Section 8.5.6 - MRG (Merge)
-defm MRG : RRMRGm<"mrg", 0x56, I64, i64>;
+defm MRG : RRMRGm<"mrg", 0x56, I64>;
// Section 8.5.7 - LDZ (Leading Zero Count)
def ctlz_pat : PatFrags<(ops node:$src),
@@ -1297,10 +1293,10 @@ def : Pat<(i32 (bswap (i32 mimm:$src))),
(EXTRACT_SUBREG (BSWPmi (MIMM $src), 1), sub_i32)>;
// Section 8.5.11 - CMOV (Conditional Move)
-let cw = 0, cw2 = 0 in defm CMOVL : RRCMOVm<"cmov.l.${cfw}", 0x3B, I64, i64>;
-let cw = 1, cw2 = 0 in defm CMOVW : RRCMOVm<"cmov.w.${cfw}", 0x3B, I32, i32>;
-let cw = 0, cw2 = 1 in defm CMOVD : RRCMOVm<"cmov.d.${cfw}", 0x3B, I64, f64>;
-let cw = 1, cw2 = 1 in defm CMOVS : RRCMOVm<"cmov.s.${cfw}", 0x3B, F32, f32>;
+let cw = 0, cw2 = 0 in defm CMOVL : RRCMOVm<"cmov.l.${cfw}", 0x3B, I64>;
+let cw = 1, cw2 = 0 in defm CMOVW : RRCMOVm<"cmov.w.${cfw}", 0x3B, I32>;
+let cw = 0, cw2 = 1 in defm CMOVD : RRCMOVm<"cmov.d.${cfw}", 0x3B, I64>;
+let cw = 1, cw2 = 1 in defm CMOVS : RRCMOVm<"cmov.s.${cfw}", 0x3B, F32>;
def : MnemonicAlias<"cmov.l", "cmov.l.at">;
def : MnemonicAlias<"cmov.w", "cmov.w.at">;
def : MnemonicAlias<"cmov.d", "cmov.d.at">;
@@ -1315,14 +1311,14 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm SLL : RRIm<"sll", 0x65, I64, i64, shl>;
// Section 8.6.2 - SLD (Shift Left Double)
-defm SLD : RRILDm<"sld", 0x64, I64, i64>;
+defm SLD : RRILDm<"sld", 0x64, I64>;
// Section 8.6.3 - SRL (Shift Right Logical)
let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm SRL : RRIm<"srl", 0x75, I64, i64, srl>;
// Section 8.6.4 - SRD (Shift Right Double)
-defm SRD : RRIRDm<"srd", 0x74, I64, i64>;
+defm SRD : RRIRDm<"srd", 0x74, I64>;
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
@@ -1405,16 +1401,16 @@ defm FCMPQ : RRNCbm<"fcmp.q", 0x7D, I64, f64, F128, f128, null_frag, simm7fp,
// Section 8.7.11 - FIX (Convert to Fixed Point)
// cx: double/float, cw: sx/zx, sz{0-3} = round
let cx = 0, cw = 0 /* sign extend */ in
-defm CVTWDSX : CVTRDm<"cvt.w.d.sx", 0x4E, I32, i32, I64, f64>;
+defm CVTWDSX : CVTRDm<"cvt.w.d.sx", 0x4E, I32, I64>;
let cx = 0, cw = 1 /* zero extend */ in
-defm CVTWDZX : CVTRDm<"cvt.w.d.zx", 0x4E, I32, i32, I64, f64>;
+defm CVTWDZX : CVTRDm<"cvt.w.d.zx", 0x4E, I32, I64>;
let cx = 1, cw = 0 /* sign extend */ in
-defm CVTWSSX : CVTRDm<"cvt.w.s.sx", 0x4E, I32, i32, F32, f32>;
+defm CVTWSSX : CVTRDm<"cvt.w.s.sx", 0x4E, I32, F32>;
let cx = 1, cw = 1 /* zero extend */ in
-defm CVTWSZX : CVTRDm<"cvt.w.s.zx", 0x4E, I32, i32, F32, f32>;
+defm CVTWSZX : CVTRDm<"cvt.w.s.zx", 0x4E, I32, F32>;
// Section 8.7.12 - FIXX (Convert to Fixed Point)
-defm CVTLD : CVTRDm<"cvt.l.d", 0x4F, I64, i64, I64, f64>;
+defm CVTLD : CVTRDm<"cvt.l.d", 0x4F, I64, I64>;
// Section 8.7.13 - FLT (Convert to Floating Point)
defm CVTDW : CVTm<"cvt.d.w", 0x5E, I64, f64, I32, i32, sint_to_fp>;
@@ -1836,7 +1832,7 @@ multiclass ZXATMLDm<SDPatternOperator from, int VAL,
def : Pat<(i64 (and (anyext (from ADDRzii:$addr)), VAL)),
(i2l (tozii MEMzii:$addr))>;
}
-multiclass ZXATMLD32m<SDPatternOperator from, int VAL,
+multiclass ZXATMLD32m<SDPatternOperator from,
RM torri, RM torii,
RM tozri, RM tozii> {
def : Pat<(i64 (zext (from ADDRrri:$addr))),
@@ -1852,8 +1848,7 @@ defm : ZXATMLDm<atomic_load_8, 0xFF, LD1BZXrri, LD1BZXrii, LD1BZXzri,
LD1BZXzii>;
defm : ZXATMLDm<atomic_load_16, 0xFFFF, LD2BZXrri, LD2BZXrii, LD2BZXzri,
LD2BZXzii>;
-defm : ZXATMLD32m<atomic_load_32, 0xFFFFFFFF, LDLZXrri, LDLZXrii, LDLZXzri,
- LDLZXzii>;
+defm : ZXATMLD32m<atomic_load_32, LDLZXrri, LDLZXrii, LDLZXzri, LDLZXzii>;
// Atomic stores
multiclass ATMSTm<SDPatternOperator from, ValueType ty,
@@ -1871,7 +1866,6 @@ defm : ATMSTm<atomic_store_64, i64, STrri, STrii, STzri, STzii>;
// Optimized atomic stores with truncate
multiclass TRATMSTm<SDPatternOperator from,
- ValueType ty,
RM torri,
RM torii,
RM tozri,
@@ -1885,9 +1879,9 @@ multiclass TRATMSTm<SDPatternOperator from,
def : Pat<(from ADDRzii:$addr, (i32 (trunc i64:$src))),
(tozii MEMzii:$addr, (EXTRACT_SUBREG $src, sub_i32))>;
}
-defm : TRATMSTm<atomic_store_8, i32, ST1Brri, ST1Brii, ST1Bzri, ST1Bzii>;
-defm : TRATMSTm<atomic_store_16, i32, ST2Brri, ST2Brii, ST2Bzri, ST2Bzii>;
-defm : TRATMSTm<atomic_store_32, i32, STLrri, STLrii, STLzri, STLzii>;
+defm : TRATMSTm<atomic_store_8, ST1Brri, ST1Brii, ST1Bzri, ST1Bzii>;
+defm : TRATMSTm<atomic_store_16, ST2Brri, ST2Brii, ST2Bzri, ST2Bzii>;
+defm : TRATMSTm<atomic_store_32, STLrri, STLrii, STLzri, STLzii>;
// Atomic swaps
def : Pat<(i32 (ts1am i64:$src, i32:$flag, i32:$new)),
diff --git a/llvm/lib/Target/VE/VESubtarget.cpp b/llvm/lib/Target/VE/VESubtarget.cpp
index daa6cfb8aa84..78ac742ebf52 100644
--- a/llvm/lib/Target/VE/VESubtarget.cpp
+++ b/llvm/lib/Target/VE/VESubtarget.cpp
@@ -12,8 +12,8 @@
#include "VESubtarget.h"
#include "VE.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/VE/VETargetMachine.cpp b/llvm/lib/Target/VE/VETargetMachine.cpp
index 414ae09431c0..9f294f15da91 100644
--- a/llvm/lib/Target/VE/VETargetMachine.cpp
+++ b/llvm/lib/Target/VE/VETargetMachine.cpp
@@ -17,7 +17,7 @@
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/VE/VVPInstrPatternsVec.td b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
index 7003fb387670..ac03e0bf627e 100644
--- a/llvm/lib/Target/VE/VVPInstrPatternsVec.td
+++ b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
@@ -20,8 +20,7 @@ include "VVPInstrInfo.td"
multiclass VectorBinaryArith<
SDPatternOperator OpNode,
ValueType ScalarVT, ValueType DataVT, ValueType MaskVT,
- string OpBaseName,
- SDPatternOperator ImmOp, SDNodeXForm ImmCast> {
+ string OpBaseName> {
// No mask.
def : Pat<(OpNode
(any_broadcast ScalarVT:$sx),
@@ -56,10 +55,10 @@ multiclass VectorBinaryArith_ShortLong<
ValueType ShortScalarVT, ValueType ShortDataVT, string ShortOpBaseName> {
defm : VectorBinaryArith<OpNode,
LongScalarVT, LongDataVT, v256i1,
- LongOpBaseName, simm7, LO7>;
+ LongOpBaseName>;
defm : VectorBinaryArith<OpNode,
ShortScalarVT, ShortDataVT, v256i1,
- ShortOpBaseName, simm7, LO7>;
+ ShortOpBaseName>;
}
diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index eb1dd879941a..7d1e6c553f81 100644
--- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -31,9 +31,9 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -431,10 +431,10 @@ public:
bool checkForP2AlignIfLoadStore(OperandVector &Operands, StringRef InstName) {
// FIXME: there is probably a cleaner way to do this.
- auto IsLoadStore = InstName.find(".load") != StringRef::npos ||
- InstName.find(".store") != StringRef::npos ||
- InstName.find("prefetch") != StringRef::npos;
- auto IsAtomic = InstName.find("atomic.") != StringRef::npos;
+ auto IsLoadStore = InstName.contains(".load") ||
+ InstName.contains(".store") ||
+ InstName.contains("prefetch");
+ auto IsAtomic = InstName.contains("atomic.");
if (IsLoadStore || IsAtomic) {
// Parse load/store operands of the form: offset:p2align=align
if (IsLoadStore && isNext(AsmToken::Colon)) {
@@ -450,7 +450,7 @@ public:
// v128.{load,store}{8,16,32,64}_lane has both a memarg and a lane
// index. We need to avoid parsing an extra alignment operand for the
// lane index.
- auto IsLoadStoreLane = InstName.find("_lane") != StringRef::npos;
+ auto IsLoadStoreLane = InstName.contains("_lane");
if (IsLoadStoreLane && Operands.size() == 4)
return false;
// Alignment not specified (or atomics, must use default alignment).
@@ -1114,6 +1114,8 @@ public:
void onEndOfFunction(SMLoc ErrorLoc) {
TC.endOfFunction(ErrorLoc);
+ // Reset the type checker state.
+ TC.Clear();
// Automatically output a .size directive, so it becomes optional for the
// user.
diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp
index 2f9245a7c66c..a6b5d4252f2f 100644
--- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp
+++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp
@@ -31,10 +31,10 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -74,6 +74,9 @@ bool WebAssemblyAsmTypeCheck::typeError(SMLoc ErrorLoc, const Twine &Msg) {
// which are mostly not helpful.
if (TypeErrorThisFunction)
return true;
+ // If we're currently in unreachable code, we suppress errors as well.
+ if (Unreachable)
+ return true;
TypeErrorThisFunction = true;
dumpTypeStack("current stack: ");
return Parser.Error(ErrorLoc, Msg);
@@ -89,8 +92,7 @@ bool WebAssemblyAsmTypeCheck::popType(SMLoc ErrorLoc,
: StringRef(
"empty stack while popping value"));
}
- auto PVT = Stack.back();
- Stack.pop_back();
+ auto PVT = Stack.pop_back_val();
if (EVT.hasValue() && EVT.getValue() != PVT) {
return typeError(
ErrorLoc, StringRef("popped ") + WebAssembly::typeToString(PVT) +
@@ -155,8 +157,12 @@ bool WebAssemblyAsmTypeCheck::getGlobal(SMLoc ErrorLoc, const MCInst &Inst,
break;
case wasm::WASM_SYMBOL_TYPE_FUNCTION:
case wasm::WASM_SYMBOL_TYPE_DATA:
- if (SymRef->getKind() == MCSymbolRefExpr::VK_GOT) {
+ switch (SymRef->getKind()) {
+ case MCSymbolRefExpr::VK_GOT:
+ case MCSymbolRefExpr::VK_WASM_GOT_TLS:
Type = is64 ? wasm::ValType::I64 : wasm::ValType::I32;
+ return false;
+ default:
break;
}
LLVM_FALLTHROUGH;
@@ -167,17 +173,18 @@ bool WebAssemblyAsmTypeCheck::getGlobal(SMLoc ErrorLoc, const MCInst &Inst,
return false;
}
-void WebAssemblyAsmTypeCheck::endOfFunction(SMLoc ErrorLoc) {
+bool WebAssemblyAsmTypeCheck::endOfFunction(SMLoc ErrorLoc) {
// Check the return types.
for (auto RVT : llvm::reverse(ReturnTypes)) {
- popType(ErrorLoc, RVT);
+ if (popType(ErrorLoc, RVT))
+ return true;
}
if (!Stack.empty()) {
- typeError(ErrorLoc,
- std::to_string(Stack.size()) + " superfluous return values");
+ return typeError(ErrorLoc, std::to_string(Stack.size()) +
+ " superfluous return values");
}
- // Reset the type checker state.
- Clear();
+ Unreachable = true;
+ return false;
}
bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst) {
@@ -213,13 +220,20 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst) {
if (popType(ErrorLoc, {}))
return true;
} else if (Name == "end_block" || Name == "end_loop" || Name == "end_if" ||
- Name == "else") {
+ Name == "else" || Name == "end_try") {
if (checkEnd(ErrorLoc))
return true;
+ if (Name == "end_block")
+ Unreachable = false;
+ } else if (Name == "return") {
+ if (endOfFunction(ErrorLoc))
+ return true;
} else if (Name == "call_indirect" || Name == "return_call_indirect") {
// Function value.
if (popType(ErrorLoc, wasm::ValType::I32)) return true;
if (checkSig(ErrorLoc, LastSig)) return true;
+ if (Name == "return_call_indirect" && endOfFunction(ErrorLoc))
+ return true;
} else if (Name == "call" || Name == "return_call") {
const MCSymbolRefExpr *SymRef;
if (getSymRef(ErrorLoc, Inst, SymRef))
@@ -230,9 +244,25 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst) {
return typeError(ErrorLoc, StringRef("symbol ") + WasmSym->getName() +
" missing .functype");
if (checkSig(ErrorLoc, *Sig)) return true;
+ if (Name == "return_call" && endOfFunction(ErrorLoc))
+ return true;
+ } else if (Name == "catch") {
+ const MCSymbolRefExpr *SymRef;
+ if (getSymRef(ErrorLoc, Inst, SymRef))
+ return true;
+ const auto *WasmSym = cast<MCSymbolWasm>(&SymRef->getSymbol());
+ const auto *Sig = WasmSym->getSignature();
+ if (!Sig || WasmSym->getType() != wasm::WASM_SYMBOL_TYPE_TAG)
+ return typeError(ErrorLoc, StringRef("symbol ") + WasmSym->getName() +
+ " missing .tagtype");
+ // The catch instruction pushes values whose types are specified in the
+ // tag's "params" part.
+ Stack.insert(Stack.end(), Sig->Params.begin(), Sig->Params.end());
} else if (Name == "ref.null") {
auto VT = static_cast<wasm::ValType>(Inst.getOperand(0).getImm());
Stack.push_back(VT);
+ } else if (Name == "unreachable") {
+ Unreachable = true;
} else {
// The current instruction is a stack instruction which doesn't have
// explicit operands that indicate push/pop types, so we get those from
diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h
index a15a69b50418..aa35213ccca3 100644
--- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h
+++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h
@@ -32,15 +32,9 @@ class WebAssemblyAsmTypeCheck final {
SmallVector<wasm::ValType, 4> ReturnTypes;
wasm::WasmSignature LastSig;
bool TypeErrorThisFunction = false;
+ bool Unreachable = false;
bool is64;
- void Clear() {
- Stack.clear();
- LocalTypes.clear();
- ReturnTypes.clear();
- TypeErrorThisFunction = false;
- }
-
void dumpTypeStack(Twine Msg);
bool typeError(SMLoc ErrorLoc, const Twine &Msg);
bool popType(SMLoc ErrorLoc, Optional<wasm::ValType> EVT);
@@ -57,8 +51,16 @@ public:
void funcDecl(const wasm::WasmSignature &Sig);
void localDecl(const SmallVector<wasm::ValType, 4> &Locals);
void setLastSig(const wasm::WasmSignature &Sig) { LastSig = Sig; }
- void endOfFunction(SMLoc ErrorLoc);
+ bool endOfFunction(SMLoc ErrorLoc);
bool typeCheck(SMLoc ErrorLoc, const MCInst &Inst);
+
+ void Clear() {
+ Stack.clear();
+ LocalTypes.clear();
+ ReturnTypes.clear();
+ TypeErrorThisFunction = false;
+ Unreachable = false;
+ }
};
} // end namespace llvm
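The two hunks above convert the assembler type checker to an error-propagation style: every check returns true on failure so callers can bail out early, and a new Unreachable flag marks the value stack as meaningless after return, return_call, or unreachable. A minimal standalone sketch of that style follows (simplified stand-in types, not LLVM code):

#include <cstdio>
#include <string>
#include <vector>

struct MiniTypeChecker {
  std::vector<int> Stack;        // stand-in for the wasm value-type stack
  std::vector<int> ReturnTypes;  // declared result types of the function
  bool Unreachable = false;

  bool typeError(const std::string &Msg) {
    std::fprintf(stderr, "error: %s\n", Msg.c_str());
    return true;                 // true == an error was reported
  }

  bool popType(int Expected) {
    if (Stack.empty())
      return typeError("empty stack while popping a value");
    int Got = Stack.back();
    Stack.pop_back();
    if (Got != Expected)
      return typeError("popped wrong type");
    return false;
  }

  // Mirrors the new bool-returning endOfFunction(): pop the declared results,
  // complain about leftovers, and mark the following code as unreachable.
  bool endOfFunction() {
    for (auto It = ReturnTypes.rbegin(); It != ReturnTypes.rend(); ++It)
      if (popType(*It))
        return true;
    if (!Stack.empty())
      return typeError(std::to_string(Stack.size()) +
                       " superfluous return values");
    Unreachable = true;
    return false;
  }
};

int main() {
  MiniTypeChecker TC;
  TC.ReturnTypes = {1};               // one result expected
  TC.Stack = {1};                     // and one value of that type on the stack
  return TC.endOfFunction() ? 1 : 0;  // succeeds and sets Unreachable
}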
diff --git a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index 6770ccc9df6a..2e1e4f061219 100644
--- a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -24,9 +24,9 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/LEB128.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
index d88311197c1a..85bb52c03e80 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -59,7 +59,8 @@ public:
return false;
}
- bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const override;
};
const MCFixupKindInfo &
@@ -83,8 +84,8 @@ WebAssemblyAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
return Infos[Kind - FirstTargetFixupKind];
}
-bool WebAssemblyAsmBackend::writeNopData(raw_ostream &OS,
- uint64_t Count) const {
+bool WebAssemblyAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const {
for (uint64_t I = 0; I < Count; ++I)
OS << char(WebAssembly::Nop);
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index 31cccb24d798..8f670ec88897 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -19,8 +19,8 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
#define DEBUG_TYPE "wasm-mc-target-desc"
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 99defb42e380..d07bfce9abc1 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -95,6 +95,9 @@ enum TOF {
// platforms.
MO_GOT,
+ // Same as MO_GOT but the address stored in the global is a TLS address.
+ MO_GOT_TLS,
+
// On a symbol operand this indicates that the immediate is the symbol
// address relative the __memory_base wasm global.
// Only applicable to data symbols.
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
index f67fab946746..405712906c40 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
@@ -74,6 +74,7 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(
switch (Modifier) {
case MCSymbolRefExpr::VK_GOT:
+ case MCSymbolRefExpr::VK_WASM_GOT_TLS:
return wasm::R_WASM_GLOBAL_INDEX_LEB;
case MCSymbolRefExpr::VK_WASM_TBREL:
assert(SymA.isFunction());
@@ -88,7 +89,10 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(
: wasm::R_WASM_MEMORY_ADDR_REL_SLEB;
case MCSymbolRefExpr::VK_WASM_TYPEINDEX:
return wasm::R_WASM_TYPE_INDEX_LEB;
+ case MCSymbolRefExpr::VK_None:
+ break;
default:
+ report_fatal_error("unknown VariantKind");
break;
}
diff --git a/llvm/lib/Target/WebAssembly/README.txt b/llvm/lib/Target/WebAssembly/README.txt
index 934a3ba3bc4a..ab1cd8f0f84a 100644
--- a/llvm/lib/Target/WebAssembly/README.txt
+++ b/llvm/lib/Target/WebAssembly/README.txt
@@ -2,11 +2,11 @@
The object format emitted by the WebAssembly backend is documented in:
- * https://github.com/WebAssembly/tool-conventions/blob/master/Linking.md
+ * https://github.com/WebAssembly/tool-conventions/blob/main/Linking.md
The C ABI is described in:
- * https://github.com/WebAssembly/tool-conventions/blob/master/BasicCABI.md
+ * https://github.com/WebAssembly/tool-conventions/blob/main/BasicCABI.md
For more information on WebAssembly itself, see the home page:
@@ -31,8 +31,8 @@ For more information, see:
The following documents contain some information on the semantics and binary
encoding of WebAssembly itself:
- * https://github.com/WebAssembly/design/blob/master/Semantics.md
- * https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md
+ * https://github.com/WebAssembly/design/blob/main/Semantics.md
+ * https://github.com/WebAssembly/design/blob/main/BinaryEncoding.md
Some notes on ways that the generated code could be improved follow:
diff --git a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
index f9a96819905f..e3daf6bfa72e 100644
--- a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
@@ -12,7 +12,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/WebAssemblyTargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
#define DEBUG_TYPE "wasm-target-info"
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h
index 673dc9521ced..f6e96d9b2877 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h
@@ -15,6 +15,8 @@
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_UTILS_WEBASSEMBLYUTILITIES_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_UTILS_WEBASSEMBLYUTILITIES_H
+#include "llvm/IR/DerivedTypes.h"
+
namespace llvm {
class MachineBasicBlock;
@@ -35,18 +37,35 @@ enum WasmAddressSpace : unsigned {
// linear memory: WebAssembly globals or WebAssembly locals. Loads and stores
// to these pointers are lowered to global.get / global.set or local.get /
// local.set, as appropriate.
- WASM_ADDRESS_SPACE_WASM_VAR = 1
+ WASM_ADDRESS_SPACE_VAR = 1,
+ // A non-integral address space for externref values
+ WASM_ADDRESS_SPACE_EXTERNREF = 10,
+ // A non-integral address space for funcref values
+ WASM_ADDRESS_SPACE_FUNCREF = 20,
};
inline bool isDefaultAddressSpace(unsigned AS) {
return AS == WASM_ADDRESS_SPACE_DEFAULT;
}
inline bool isWasmVarAddressSpace(unsigned AS) {
- return AS == WASM_ADDRESS_SPACE_WASM_VAR;
+ return AS == WASM_ADDRESS_SPACE_VAR;
}
inline bool isValidAddressSpace(unsigned AS) {
return isDefaultAddressSpace(AS) || isWasmVarAddressSpace(AS);
}
+inline bool isFuncrefType(const Type *Ty) {
+ return isa<PointerType>(Ty) &&
+ Ty->getPointerAddressSpace() ==
+ WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF;
+}
+inline bool isExternrefType(const Type *Ty) {
+ return isa<PointerType>(Ty) &&
+ Ty->getPointerAddressSpace() ==
+ WasmAddressSpace::WASM_ADDRESS_SPACE_EXTERNREF;
+}
+inline bool isRefType(const Type *Ty) {
+ return isFuncrefType(Ty) || isExternrefType(Ty);
+}
bool isChild(const MachineInstr &MI, const WebAssemblyFunctionInfo &MFI);
bool mayThrow(const MachineInstr &MI);
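The helpers added above classify IR pointer types purely by address space: externref values live in address space 10 and funcref values in address space 20. A small sketch of how such types would be built and recognised, assuming LLVM headers on the include path (the i8 pointee is arbitrary and chosen only for illustration):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cstdio>

int main() {
  llvm::LLVMContext Ctx;
  llvm::Type *I8 = llvm::Type::getInt8Ty(Ctx);
  // Reference values are modelled as pointers in dedicated address spaces.
  auto *ExternrefTy = llvm::PointerType::get(I8, /*AddressSpace=*/10);
  auto *FuncrefTy = llvm::PointerType::get(I8, /*AddressSpace=*/20);
  // isExternrefType/isFuncrefType from the patch would return true for these.
  std::printf("externref AS = %u, funcref AS = %u\n",
              ExternrefTy->getPointerAddressSpace(),
              FuncrefTy->getPointerAddressSpace()); // prints 10 and 20
  return 0;
}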
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.h b/llvm/lib/Target/WebAssembly/WebAssembly.h
index 9eb960d018d3..803786e0c9c2 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.h
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.h
@@ -25,12 +25,12 @@ class ModulePass;
class FunctionPass;
// LLVM IR passes.
-ModulePass *createWebAssemblyLowerEmscriptenEHSjLj(bool EnableEH,
- bool EnableSjLj);
+ModulePass *createWebAssemblyLowerEmscriptenEHSjLj();
ModulePass *createWebAssemblyLowerGlobalDtors();
ModulePass *createWebAssemblyAddMissingPrototypes();
ModulePass *createWebAssemblyFixFunctionBitcasts();
FunctionPass *createWebAssemblyOptimizeReturned();
+FunctionPass *createWebAssemblyLowerRefTypesIntPtrConv();
// ISel and immediate followup passes.
FunctionPass *createWebAssemblyISelDag(WebAssemblyTargetMachine &TM,
@@ -56,7 +56,7 @@ FunctionPass *createWebAssemblyLowerBrUnless();
FunctionPass *createWebAssemblyRegNumbering();
FunctionPass *createWebAssemblyDebugFixup();
FunctionPass *createWebAssemblyPeephole();
-FunctionPass *createWebAssemblyMCLowerPrePass();
+ModulePass *createWebAssemblyMCLowerPrePass();
// PassRegistry initialization declarations.
void initializeWebAssemblyAddMissingPrototypesPass(PassRegistry &);
@@ -85,6 +85,7 @@ void initializeWebAssemblyRegNumberingPass(PassRegistry &);
void initializeWebAssemblyDebugFixupPass(PassRegistry &);
void initializeWebAssemblyPeepholePass(PassRegistry &);
void initializeWebAssemblyMCLowerPrePassPass(PassRegistry &);
+void initializeWebAssemblyLowerRefTypesIntPtrConvPass(PassRegistry &);
namespace WebAssembly {
enum TargetIndex {
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td
index c1872dd91c58..a529c6217189 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.td
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.td
@@ -25,6 +25,9 @@ include "llvm/Target/Target.td"
def FeatureSIMD128 : SubtargetFeature<"simd128", "SIMDLevel", "SIMD128",
"Enable 128-bit SIMD">;
+def FeatureRelaxedSIMD : SubtargetFeature<"relaxed-simd", "SIMDLevel", "RelaxedSIMD",
+ "Enable relaxed-simd instructions">;
+
def FeatureAtomics : SubtargetFeature<"atomics", "HasAtomics", "true",
"Enable Atomics">;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
index 530a55cda0e5..90e819912847 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
@@ -86,27 +86,37 @@ bool WebAssemblyAddMissingPrototypes::runOnModule(Module &M) {
F.getName());
}
- // Create a function prototype based on the first call site (first bitcast)
- // that we find.
+ // Find calls of this function, looking through bitcasts.
+ SmallVector<CallBase *> Calls;
+ SmallVector<Value *> Worklist;
+ Worklist.push_back(&F);
+ while (!Worklist.empty()) {
+ Value *V = Worklist.pop_back_val();
+ for (User *U : V->users()) {
+ if (auto *BC = dyn_cast<BitCastOperator>(U))
+ Worklist.push_back(BC);
+ else if (auto *CB = dyn_cast<CallBase>(U))
+ if (CB->getCalledOperand() == V)
+ Calls.push_back(CB);
+ }
+ }
+
+ // Create a function prototype based on the first call site that we find.
FunctionType *NewType = nullptr;
- for (Use &U : F.uses()) {
- LLVM_DEBUG(dbgs() << "prototype-less use: " << F.getName() << "\n");
- LLVM_DEBUG(dbgs() << *U.getUser() << "\n");
- if (auto *BC = dyn_cast<BitCastOperator>(U.getUser())) {
- if (auto *DestType = dyn_cast<FunctionType>(
- BC->getDestTy()->getPointerElementType())) {
- if (!NewType) {
- // Create a new function with the correct type
- NewType = DestType;
- LLVM_DEBUG(dbgs() << "found function type: " << *NewType << "\n");
- } else if (NewType != DestType) {
- errs() << "warning: prototype-less function used with "
- "conflicting signatures: "
- << F.getName() << "\n";
- LLVM_DEBUG(dbgs() << " " << *DestType << "\n");
- LLVM_DEBUG(dbgs() << " "<< *NewType << "\n");
- }
- }
+ for (CallBase *CB : Calls) {
+ LLVM_DEBUG(dbgs() << "prototype-less call of " << F.getName() << ":\n");
+ LLVM_DEBUG(dbgs() << *CB << "\n");
+ FunctionType *DestType = CB->getFunctionType();
+ if (!NewType) {
+ // Create a new function with the correct type
+ NewType = DestType;
+ LLVM_DEBUG(dbgs() << "found function type: " << *NewType << "\n");
+ } else if (NewType != DestType) {
+ errs() << "warning: prototype-less function used with "
+ "conflicting signatures: "
+ << F.getName() << "\n";
+ LLVM_DEBUG(dbgs() << " " << *DestType << "\n");
+ LLVM_DEBUG(dbgs() << " " << *NewType << "\n");
}
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 56829eb45e21..0d3f51693261 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -42,8 +42,8 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -51,8 +51,8 @@ using namespace llvm;
#define DEBUG_TYPE "asm-printer"
extern cl::opt<bool> WasmKeepRegisters;
-extern cl::opt<bool> EnableEmException;
-extern cl::opt<bool> EnableEmSjLj;
+extern cl::opt<bool> WasmEnableEmEH;
+extern cl::opt<bool> WasmEnableEmSjLj;
//===----------------------------------------------------------------------===//
// Helpers.
@@ -161,7 +161,7 @@ MCSymbolWasm *WebAssemblyAsmPrinter::getMCSymbolForFunction(
"Emscripten EH/SjLj does not support multivalue returns: " +
std::string(F->getName()) + ": " +
WebAssembly::signatureToString(Sig);
- report_fatal_error(Msg);
+ report_fatal_error(Twine(Msg));
}
WasmSym = cast<MCSymbolWasm>(
GetExternalSymbolSymbol(getEmscriptenInvokeSymbolName(Sig)));
@@ -234,26 +234,32 @@ MCSymbol *WebAssemblyAsmPrinter::getOrCreateWasmSymbol(StringRef Name) {
return WasmSym;
}
+ if (Name.startswith("GCC_except_table")) {
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_DATA);
+ return WasmSym;
+ }
+
SmallVector<wasm::ValType, 4> Returns;
SmallVector<wasm::ValType, 4> Params;
- if (Name == "__cpp_exception") {
+ if (Name == "__cpp_exception" || Name == "__c_longjmp") {
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_TAG);
- // We can't confirm its signature index for now because there can be
- // imported exceptions. Set it to be 0 for now.
- WasmSym->setTagType(
- {wasm::WASM_TAG_ATTRIBUTE_EXCEPTION, /* SigIndex */ 0});
- // We may have multiple C++ compilation units to be linked together, each of
- // which defines the exception symbol. To resolve them, we declare them as
- // weak.
- WasmSym->setWeak(true);
+ // In static linking we define tag symbols in WasmException::endModule().
+ // But we may have multiple objects to be linked together, each of which
+ // defines the tag symbols. To resolve them, we declare them as weak. In
+ // dynamic linking we make tag symbols undefined in the backend, define it
+ // in JS, and feed them to each importing module.
+ if (!isPositionIndependent())
+ WasmSym->setWeak(true);
WasmSym->setExternal(true);
- // All C++ exceptions are assumed to have a single i32 (for wasm32) or i64
- // (for wasm64) param type and void return type. The reaon is, all C++
- // exception values are pointers, and to share the type section with
- // functions, exceptions are assumed to have void return type.
- Params.push_back(Subtarget.hasAddr64() ? wasm::ValType::I64
- : wasm::ValType::I32);
+ // Currently both C++ exceptions and C longjmps have a single pointer type
+ // param. For C++ exceptions it is a pointer to an exception object, and for
+ // C longjmps it is a pointer to a struct that contains a setjmp buffer and a
+ // longjmp return value. We may consider using multiple value parameters for
+ // longjmps later when multivalue support is ready.
+ wasm::ValType AddrType =
+ Subtarget.hasAddr64() ? wasm::ValType::I64 : wasm::ValType::I32;
+ Params.push_back(AddrType);
} else { // Function symbols
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
getLibcallSignature(Subtarget, Name, Returns, Params);
@@ -309,7 +315,7 @@ void WebAssemblyAsmPrinter::emitExternalDecls(const Module &M) {
// will discard it later if it turns out not to be necessary.
auto Signature = signatureFromMVTs(Results, Params);
bool InvokeDetected = false;
- auto *Sym = getMCSymbolForFunction(&F, EnableEmException || EnableEmSjLj,
+ auto *Sym = getMCSymbolForFunction(&F, WasmEnableEmEH || WasmEnableEmSjLj,
Signature.get(), InvokeDetected);
// Multiple functions can be mapped to the same invoke symbol. For
@@ -497,6 +503,15 @@ void WebAssemblyAsmPrinter::EmitTargetFeatures(Module &M) {
// This pseudo-feature tells the linker whether shared memory would be safe
EmitFeature("shared-mem");
+ // This is an "architecture", not a "feature", but we emit it as such for
+ // the benefit of tools like Binaryen and consistency with other producers.
+ // FIXME: Subtarget is null here, so we can't use Subtarget->hasAddr64()?
+ if (M.getDataLayout().getPointerSize() == 8) {
+ // Can't use EmitFeature since "wasm-feature-memory64" is not a module
+ // flag.
+ EmittedFeatures.push_back({wasm::WASM_FEATURE_PREFIX_USED, "memory64"});
+ }
+
if (EmittedFeatures.size() == 0)
return;
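The comments in the tag-symbol hunk above say the __cpp_exception and __c_longjmp tags take a single address-sized parameter and return nothing. A short sketch of that signature, assuming the WasmSignature layout from llvm/BinaryFormat/Wasm.h (the helper name is made up for illustration):

#include "llvm/BinaryFormat/Wasm.h"

// Builds the signature described above: one address-sized parameter (i32 on
// wasm32, i64 on wasm64) and no results, so tags can share the type section
// encoding with void-returning functions.
llvm::wasm::WasmSignature makeTagSignature(bool IsWasm64) {
  llvm::wasm::WasmSignature Sig;
  Sig.Params.push_back(IsWasm64 ? llvm::wasm::ValType::I64
                                : llvm::wasm::ValType::I32);
  return Sig;
}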
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index 59d69e48b775..7832f199a2cc 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -173,7 +173,7 @@ static bool explicitlyBranchesTo(MachineBasicBlock *Pred,
// satisfying the restrictions given by BeforeSet and AfterSet. BeforeSet
// contains instructions that should go before the marker, and AfterSet contains
// ones that should go after the marker. In this function, AfterSet is only
-// used for sanity checking.
+// used for validation checking.
template <typename Container>
static MachineBasicBlock::iterator
getEarliestInsertPos(MachineBasicBlock *MBB, const Container &BeforeSet,
@@ -182,7 +182,7 @@ getEarliestInsertPos(MachineBasicBlock *MBB, const Container &BeforeSet,
while (InsertPos != MBB->begin()) {
if (BeforeSet.count(&*std::prev(InsertPos))) {
#ifndef NDEBUG
- // Sanity check
+ // Validation check
for (auto Pos = InsertPos, E = MBB->begin(); Pos != E; --Pos)
assert(!AfterSet.count(&*std::prev(Pos)));
#endif
@@ -197,7 +197,7 @@ getEarliestInsertPos(MachineBasicBlock *MBB, const Container &BeforeSet,
// satisfying the restrictions given by BeforeSet and AfterSet. BeforeSet
// contains instructions that should go before the marker, and AfterSet contains
// ones that should go after the marker. In this function, BeforeSet is only
-// used for sanity checking.
+// used for validation checking.
template <typename Container>
static MachineBasicBlock::iterator
getLatestInsertPos(MachineBasicBlock *MBB, const Container &BeforeSet,
@@ -206,7 +206,7 @@ getLatestInsertPos(MachineBasicBlock *MBB, const Container &BeforeSet,
while (InsertPos != MBB->end()) {
if (AfterSet.count(&*InsertPos)) {
#ifndef NDEBUG
- // Sanity check
+ // Validation check
for (auto Pos = InsertPos, E = MBB->end(); Pos != E; ++Pos)
assert(!BeforeSet.count(&*Pos));
#endif
@@ -842,8 +842,7 @@ static void unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB,
// INST ..., TeeReg, ...
// INST ..., Reg, ...
// INST ..., Reg, ...
- for (auto I = MBB.begin(), E = MBB.end(); I != E;) {
- MachineInstr &MI = *I++;
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
if (!WebAssembly::isTee(MI.getOpcode()))
continue;
Register TeeReg = MI.getOperand(0).getReg();
@@ -1671,8 +1670,7 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
SmallVector<EndMarkerInfo, 8> Stack;
SmallVector<const MachineBasicBlock *, 8> EHPadStack;
for (auto &MBB : reverse(MF)) {
- for (auto I = MBB.rbegin(), E = MBB.rend(); I != E; ++I) {
- MachineInstr &MI = *I;
+ for (MachineInstr &MI : llvm::reverse(MBB)) {
switch (MI.getOpcode()) {
case WebAssembly::BLOCK:
case WebAssembly::TRY:
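The tee-handling loop above, and a similar loop in WebAssemblyExplicitLocals.cpp below, now use llvm::make_early_inc_range, which advances the underlying iterator before the loop body runs so that the current element can be erased safely. A self-contained sketch of the same idiom with a standard container, assuming only LLVM's header-only ADT:

#include "llvm/ADT/STLExtras.h"
#include <cstdio>
#include <map>

int main() {
  std::map<int, int> M = {{1, 10}, {2, 20}, {3, 30}, {4, 40}};
  // Safe erase-while-iterating: the range has already stepped past the
  // current element, and std::map::erase only invalidates that element.
  for (auto &KV : llvm::make_early_inc_range(M))
    if (KV.first % 2 == 0)
      M.erase(KV.first);
  for (auto &KV : M)
    std::printf("%d -> %d\n", KV.first, KV.second); // prints 1 -> 10, 3 -> 30
  return 0;
}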
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index 4a0738dc3b7a..910a4e5e0d1a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -252,8 +252,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// Visit each instruction in the function.
for (MachineBasicBlock &MBB : MF) {
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
- MachineInstr &MI = *I++;
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
assert(!WebAssembly::isArgument(MI.getOpcode()));
if (MI.isDebugInstr() || MI.isLabel())
@@ -380,9 +379,14 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
Register NewReg = MRI.createVirtualRegister(RC);
unsigned Opc = getLocalGetOpcode(RC);
- InsertPt =
- BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc), NewReg)
- .addImm(LocalId);
+ // Use InsertPt's DebugLoc rather than MI's, since MI may be discontinuous
+ // from where this local is being inserted, causing non-linear stepping in
+ // the debugger or at function entry points where variables aren't live
+ // yet. The alternative is the previous instruction's location, but that is
+ // strictly worse since it can point at the previous statement.
+ // See crbug.com/1251909, crbug.com/1249745
+ InsertPt = BuildMI(MBB, InsertPt, InsertPt->getDebugLoc(),
+ TII->get(Opc), NewReg).addImm(LocalId);
MO.setReg(NewReg);
MFI.stackifyVReg(MRI, NewReg);
Changed = true;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 171d59ae4c6b..642aa6b4028a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -157,7 +157,7 @@ private:
void addLoadStoreOperands(const Address &Addr, const MachineInstrBuilder &MIB,
MachineMemOperand *MMO);
unsigned maskI1Value(unsigned Reg, const Value *V);
- unsigned getRegForI1Value(const Value *V, bool &Not);
+ unsigned getRegForI1Value(const Value *V, const BasicBlock *BB, bool &Not);
unsigned zeroExtendToI32(unsigned Reg, const Value *V,
MVT::SimpleValueType From);
unsigned signExtendToI32(unsigned Reg, const Value *V,
@@ -418,20 +418,17 @@ unsigned WebAssemblyFastISel::maskI1Value(unsigned Reg, const Value *V) {
return zeroExtendToI32(Reg, V, MVT::i1);
}
-unsigned WebAssemblyFastISel::getRegForI1Value(const Value *V, bool &Not) {
+unsigned WebAssemblyFastISel::getRegForI1Value(const Value *V,
+ const BasicBlock *BB,
+ bool &Not) {
if (const auto *ICmp = dyn_cast<ICmpInst>(V))
if (const ConstantInt *C = dyn_cast<ConstantInt>(ICmp->getOperand(1)))
- if (ICmp->isEquality() && C->isZero() && C->getType()->isIntegerTy(32)) {
+ if (ICmp->isEquality() && C->isZero() && C->getType()->isIntegerTy(32) &&
+ ICmp->getParent() == BB) {
Not = ICmp->isTrueWhenEqual();
return getRegForValue(ICmp->getOperand(0));
}
- Value *NotV;
- if (match(V, m_Not(m_Value(NotV))) && V->getType()->isIntegerTy(32)) {
- Not = true;
- return getRegForValue(NotV);
- }
-
Not = false;
unsigned Reg = getRegForValue(V);
if (Reg == 0)
@@ -648,11 +645,11 @@ bool WebAssemblyFastISel::fastLowerArguments() {
unsigned I = 0;
for (auto const &Arg : F->args()) {
const AttributeList &Attrs = F->getAttributes();
- if (Attrs.hasParamAttribute(I, Attribute::ByVal) ||
- Attrs.hasParamAttribute(I, Attribute::SwiftSelf) ||
- Attrs.hasParamAttribute(I, Attribute::SwiftError) ||
- Attrs.hasParamAttribute(I, Attribute::InAlloca) ||
- Attrs.hasParamAttribute(I, Attribute::Nest))
+ if (Attrs.hasParamAttr(I, Attribute::ByVal) ||
+ Attrs.hasParamAttr(I, Attribute::SwiftSelf) ||
+ Attrs.hasParamAttr(I, Attribute::SwiftError) ||
+ Attrs.hasParamAttr(I, Attribute::InAlloca) ||
+ Attrs.hasParamAttr(I, Attribute::Nest))
return false;
Type *ArgTy = Arg.getType();
@@ -825,25 +822,25 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
}
SmallVector<unsigned, 8> Args;
- for (unsigned I = 0, E = Call->getNumArgOperands(); I < E; ++I) {
+ for (unsigned I = 0, E = Call->arg_size(); I < E; ++I) {
Value *V = Call->getArgOperand(I);
MVT::SimpleValueType ArgTy = getSimpleType(V->getType());
if (ArgTy == MVT::INVALID_SIMPLE_VALUE_TYPE)
return false;
const AttributeList &Attrs = Call->getAttributes();
- if (Attrs.hasParamAttribute(I, Attribute::ByVal) ||
- Attrs.hasParamAttribute(I, Attribute::SwiftSelf) ||
- Attrs.hasParamAttribute(I, Attribute::SwiftError) ||
- Attrs.hasParamAttribute(I, Attribute::InAlloca) ||
- Attrs.hasParamAttribute(I, Attribute::Nest))
+ if (Attrs.hasParamAttr(I, Attribute::ByVal) ||
+ Attrs.hasParamAttr(I, Attribute::SwiftSelf) ||
+ Attrs.hasParamAttr(I, Attribute::SwiftError) ||
+ Attrs.hasParamAttr(I, Attribute::InAlloca) ||
+ Attrs.hasParamAttr(I, Attribute::Nest))
return false;
unsigned Reg;
- if (Attrs.hasParamAttribute(I, Attribute::SExt))
+ if (Attrs.hasParamAttr(I, Attribute::SExt))
Reg = getRegForSignedValue(V);
- else if (Attrs.hasParamAttribute(I, Attribute::ZExt))
+ else if (Attrs.hasParamAttr(I, Attribute::ZExt))
Reg = getRegForUnsignedValue(V);
else
Reg = getRegForValue(V);
@@ -912,7 +909,8 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) {
const auto *Select = cast<SelectInst>(I);
bool Not;
- unsigned CondReg = getRegForI1Value(Select->getCondition(), Not);
+ unsigned CondReg =
+ getRegForI1Value(Select->getCondition(), I->getParent(), Not);
if (CondReg == 0)
return false;
@@ -1312,7 +1310,7 @@ bool WebAssemblyFastISel::selectBr(const Instruction *I) {
MachineBasicBlock *FBB = FuncInfo.MBBMap[Br->getSuccessor(1)];
bool Not;
- unsigned CondReg = getRegForI1Value(Br->getCondition(), Not);
+ unsigned CondReg = getRegForI1Value(Br->getCondition(), Br->getParent(), Not);
if (CondReg == 0)
return false;
@@ -1370,9 +1368,9 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) {
}
unsigned Reg;
- if (FuncInfo.Fn->getAttributes().hasAttribute(0, Attribute::SExt))
+ if (FuncInfo.Fn->getAttributes().hasRetAttr(Attribute::SExt))
Reg = getRegForSignedValue(RV);
- else if (FuncInfo.Fn->getAttributes().hasAttribute(0, Attribute::ZExt))
+ else if (FuncInfo.Fn->getAttributes().hasRetAttr(Attribute::ZExt))
Reg = getRegForUnsignedValue(RV);
else
Reg = getRegForValue(RV);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp
index 52aa3534c78e..5bdec89f1125 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp
@@ -61,9 +61,13 @@ void fixBrTableIndex(MachineInstr &MI, MachineBasicBlock *MBB,
auto ExtMI = MF.getRegInfo().getVRegDef(MI.getOperand(0).getReg());
if (ExtMI->getOpcode() == WebAssembly::I64_EXTEND_U_I32) {
// Unnecessarily extending a 32-bit value to 64, remove it.
- assert(MI.getOperand(0).getReg() == ExtMI->getOperand(0).getReg());
+ auto ExtDefReg = ExtMI->getOperand(0).getReg();
+ assert(MI.getOperand(0).getReg() == ExtDefReg);
MI.getOperand(0).setReg(ExtMI->getOperand(1).getReg());
- ExtMI->eraseFromParent();
+ if (MF.getRegInfo().use_nodbg_empty(ExtDefReg)) {
+ // No more users of extend, delete it.
+ ExtMI->eraseFromParent();
+ }
} else {
// Incoming 64-bit value that needs to be truncated.
Register Reg32 =
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
index 7abb6fa8905c..2a4349e02f1b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
@@ -64,29 +64,21 @@ ModulePass *llvm::createWebAssemblyFixFunctionBitcasts() {
// Recursively descend the def-use lists from V to find non-bitcast users of
// bitcasts of V.
static void findUses(Value *V, Function &F,
- SmallVectorImpl<std::pair<Use *, Function *>> &Uses,
- SmallPtrSetImpl<Constant *> &ConstantBCs) {
- for (Use &U : V->uses()) {
- if (auto *BC = dyn_cast<BitCastOperator>(U.getUser()))
- findUses(BC, F, Uses, ConstantBCs);
- else if (auto *A = dyn_cast<GlobalAlias>(U.getUser()))
- findUses(A, F, Uses, ConstantBCs);
- else if (U.get()->getType() != F.getType()) {
- CallBase *CB = dyn_cast<CallBase>(U.getUser());
- if (!CB)
- // Skip uses that aren't immediately called
- continue;
+ SmallVectorImpl<std::pair<CallBase *, Function *>> &Uses) {
+ for (User *U : V->users()) {
+ if (auto *BC = dyn_cast<BitCastOperator>(U))
+ findUses(BC, F, Uses);
+ else if (auto *A = dyn_cast<GlobalAlias>(U))
+ findUses(A, F, Uses);
+ else if (auto *CB = dyn_cast<CallBase>(U)) {
Value *Callee = CB->getCalledOperand();
if (Callee != V)
// Skip calls where the function isn't the callee
continue;
- if (isa<Constant>(U.get())) {
- // Only add constant bitcasts to the list once; they get RAUW'd
- auto C = ConstantBCs.insert(cast<Constant>(U.get()));
- if (!C.second)
- continue;
- }
- Uses.push_back(std::make_pair(&U, &F));
+ if (CB->getFunctionType() == F.getValueType())
+ // Skip call sites whose function type already matches the callee
+ continue;
+ Uses.push_back(std::make_pair(CB, &F));
}
}
}
@@ -238,8 +230,7 @@ bool FixFunctionBitcasts::runOnModule(Module &M) {
Function *Main = nullptr;
CallInst *CallMain = nullptr;
- SmallVector<std::pair<Use *, Function *>, 0> Uses;
- SmallPtrSet<Constant *, 2> ConstantBCs;
+ SmallVector<std::pair<CallBase *, Function *>, 0> Uses;
// Collect all the places that need wrappers.
for (Function &F : M) {
@@ -247,7 +238,7 @@ bool FixFunctionBitcasts::runOnModule(Module &M) {
// bitcast type difference for swiftself and swifterror.
if (F.getCallingConv() == CallingConv::Swift)
continue;
- findUses(&F, F, Uses, ConstantBCs);
+ findUses(&F, F, Uses);
// If we have a "main" function, and its type isn't
// "int main(int argc, char *argv[])", create an artificial call with it
@@ -268,8 +259,7 @@ bool FixFunctionBitcasts::runOnModule(Module &M) {
Value *Casted =
ConstantExpr::getBitCast(Main, PointerType::get(MainTy, 0));
CallMain = CallInst::Create(MainTy, Casted, Args, "call_main");
- Use *UseMain = &CallMain->getOperandUse(2);
- Uses.push_back(std::make_pair(UseMain, &F));
+ Uses.push_back(std::make_pair(CallMain, &F));
}
}
}
@@ -277,16 +267,9 @@ bool FixFunctionBitcasts::runOnModule(Module &M) {
DenseMap<std::pair<Function *, FunctionType *>, Function *> Wrappers;
for (auto &UseFunc : Uses) {
- Use *U = UseFunc.first;
+ CallBase *CB = UseFunc.first;
Function *F = UseFunc.second;
- auto *PTy = cast<PointerType>(U->get()->getType());
- auto *Ty = dyn_cast<FunctionType>(PTy->getElementType());
-
- // If the function is casted to something like i8* as a "generic pointer"
- // to be later casted to something else, we can't generate a wrapper for it.
- // Just ignore such casts for now.
- if (!Ty)
- continue;
+ FunctionType *Ty = CB->getFunctionType();
auto Pair = Wrappers.insert(std::make_pair(std::make_pair(F, Ty), nullptr));
if (Pair.second)
@@ -296,10 +279,7 @@ bool FixFunctionBitcasts::runOnModule(Module &M) {
if (!Wrapper)
continue;
- if (isa<Constant>(U->get()))
- U->get()->replaceAllUsesWith(Wrapper);
- else
- U->set(Wrapper);
+ CB->setCalledOperand(Wrapper);
}
// If we created a wrapper for main, rename the wrapper so that it's the
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
index 21519d6135b7..1fa0ea3867c7 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -21,9 +21,9 @@ HANDLE_NODETYPE(LOCAL_GET)
HANDLE_NODETYPE(LOCAL_SET)
// A wrapper node for TargetExternalSymbol, TargetGlobalAddress, and MCSymbol
HANDLE_NODETYPE(Wrapper)
-// A special wapper used in PIC code for __memory_base/__table_base relative
-// access.
-HANDLE_NODETYPE(WrapperPIC)
+// A special node for TargetGlobalAddress used in PIC code for
+// __memory_base/__table_base relative access.
+HANDLE_NODETYPE(WrapperREL)
HANDLE_NODETYPE(BR_IF)
HANDLE_NODETYPE(BR_TABLE)
HANDLE_NODETYPE(SHUFFLE)
@@ -41,8 +41,6 @@ HANDLE_NODETYPE(PROMOTE_LOW)
HANDLE_NODETYPE(TRUNC_SAT_ZERO_S)
HANDLE_NODETYPE(TRUNC_SAT_ZERO_U)
HANDLE_NODETYPE(DEMOTE_ZERO)
-HANDLE_NODETYPE(THROW)
-HANDLE_NODETYPE(CATCH)
HANDLE_NODETYPE(MEMORY_COPY)
HANDLE_NODETYPE(MEMORY_FILL)
@@ -50,4 +48,5 @@ HANDLE_NODETYPE(MEMORY_FILL)
HANDLE_MEM_NODETYPE(LOAD_SPLAT)
HANDLE_MEM_NODETYPE(GLOBAL_GET)
HANDLE_MEM_NODETYPE(GLOBAL_SET)
+HANDLE_MEM_NODETYPE(TABLE_GET)
HANDLE_MEM_NODETYPE(TABLE_SET)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index f4bae59132e6..7e75989d3def 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -17,6 +17,7 @@
#include "WebAssemblyTargetMachine.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/WasmEHFuncInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h" // To access function attributes.
#include "llvm/IR/IntrinsicsWebAssembly.h"
@@ -24,6 +25,7 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+
using namespace llvm;
#define DEBUG_TYPE "wasm-isel"
@@ -48,32 +50,11 @@ public:
return "WebAssembly Instruction Selection";
}
- void checkForInvalidNodes(const Function &F) {
- // This function will check for uses of ptrtoint on reference types and
- // report a fatal error if these are found.
- for (const BasicBlock &BB : F) {
- for (const Instruction &I : BB) {
- if (const PtrToIntInst *PTI = dyn_cast<const PtrToIntInst>(&I)) {
- const Value *V = PTI->getPointerOperand();
- if (WebAssemblyTargetLowering::isFuncrefType(V->getType()) ||
- WebAssemblyTargetLowering::isExternrefType(V->getType()))
- report_fatal_error("ptrtoint not allowed on reference types");
- } else if (const IntToPtrInst *ITP = dyn_cast<const IntToPtrInst>(&I)) {
- if (WebAssemblyTargetLowering::isFuncrefType(ITP->getDestTy()) ||
- WebAssemblyTargetLowering::isExternrefType(ITP->getDestTy()))
- report_fatal_error("inttoptr not allowed on reference types");
- }
- }
- }
- }
-
bool runOnMachineFunction(MachineFunction &MF) override {
LLVM_DEBUG(dbgs() << "********** ISelDAGToDAG **********\n"
"********** Function: "
<< MF.getName() << '\n');
- checkForInvalidNodes(MF.getFunction());
-
Subtarget = &MF.getSubtarget<WebAssemblySubtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
@@ -85,7 +66,6 @@ public:
bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
- bool SelectExternRefAddr(const SDValue &Addr, const SDValue &Base);
// Include the pieces autogenerated from the target description.
#include "WebAssemblyGenDAGISel.inc"
@@ -107,6 +87,17 @@ void WebAssemblyDAGToDAGISel::PreprocessISelDAG() {
SelectionDAGISel::PreprocessISelDAG();
}
+static SDValue getTagSymNode(int Tag, SelectionDAG *DAG) {
+ assert(Tag == WebAssembly::CPP_EXCEPTION || Tag == WebAssembly::C_LONGJMP);
+ auto &MF = DAG->getMachineFunction();
+ const auto &TLI = DAG->getTargetLoweringInfo();
+ MVT PtrVT = TLI.getPointerTy(DAG->getDataLayout());
+ const char *SymName = Tag == WebAssembly::CPP_EXCEPTION
+ ? MF.createExternalSymbolName("__cpp_exception")
+ : MF.createExternalSymbolName("__c_longjmp");
+ return DAG->getTargetExternalSymbol(SymName, PtrVT);
+}
+
void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
@@ -127,8 +118,7 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
if (!MF.getSubtarget<WebAssemblySubtarget>().hasAtomics())
break;
- uint64_t SyncScopeID =
- cast<ConstantSDNode>(Node->getOperand(2).getNode())->getZExtValue();
+ uint64_t SyncScopeID = Node->getConstantOperandVal(2);
MachineSDNode *Fence = nullptr;
switch (SyncScopeID) {
case SyncScope::SingleThread:
@@ -162,7 +152,7 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
}
case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+ unsigned IntNo = Node->getConstantOperandVal(0);
switch (IntNo) {
case Intrinsic::wasm_tls_size: {
MachineSDNode *TLSSize = CurDAG->getMachineNode(
@@ -171,6 +161,7 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, TLSSize);
return;
}
+
case Intrinsic::wasm_tls_align: {
MachineSDNode *TLSAlign = CurDAG->getMachineNode(
GlobalGetIns, DL, PtrVT,
@@ -181,8 +172,11 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
}
break;
}
+
case ISD::INTRINSIC_W_CHAIN: {
- unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ unsigned IntNo = Node->getConstantOperandVal(1);
+ const auto &TLI = CurDAG->getTargetLoweringInfo();
+ MVT PtrVT = TLI.getPointerTy(CurDAG->getDataLayout());
switch (IntNo) {
case Intrinsic::wasm_tls_base: {
MachineSDNode *TLSBase = CurDAG->getMachineNode(
@@ -192,9 +186,48 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, TLSBase);
return;
}
+
+ case Intrinsic::wasm_catch: {
+ int Tag = Node->getConstantOperandVal(2);
+ SDValue SymNode = getTagSymNode(Tag, CurDAG);
+ MachineSDNode *Catch =
+ CurDAG->getMachineNode(WebAssembly::CATCH, DL,
+ {
+ PtrVT, // exception pointer
+ MVT::Other // outchain type
+ },
+ {
+ SymNode, // exception symbol
+ Node->getOperand(0) // inchain
+ });
+ ReplaceNode(Node, Catch);
+ return;
+ }
+ }
+ break;
+ }
+
+ case ISD::INTRINSIC_VOID: {
+ unsigned IntNo = Node->getConstantOperandVal(1);
+ switch (IntNo) {
+ case Intrinsic::wasm_throw: {
+ int Tag = Node->getConstantOperandVal(2);
+ SDValue SymNode = getTagSymNode(Tag, CurDAG);
+ MachineSDNode *Throw =
+ CurDAG->getMachineNode(WebAssembly::THROW, DL,
+ MVT::Other, // outchain type
+ {
+ SymNode, // exception symbol
+ Node->getOperand(3), // thrown value
+ Node->getOperand(0) // inchain
+ });
+ ReplaceNode(Node, Throw);
+ return;
+ }
}
break;
}
+
case WebAssemblyISD::CALL:
case WebAssemblyISD::RET_CALL: {
// CALL has both variable operands and variable results, but ISel only
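The new INTRINSIC_W_CHAIN and INTRINSIC_VOID cases above select @llvm.wasm.catch and @llvm.wasm.throw into CATCH/THROW machine nodes that carry the tag symbol. Roughly how a frontend could emit the throw intrinsic that feeds this path (a sketch only, not the actual Clang lowering; it assumes an existing IRBuilder, an i8* exception pointer, and that tag value 0 corresponds to WebAssembly::CPP_EXCEPTION):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsWebAssembly.h"

// Emits: call void @llvm.wasm.throw(i32 0, i8* %exn)
// The i32 tag operand is what getTagSymNode() above maps to "__cpp_exception".
void emitCppThrow(llvm::IRBuilder<> &B, llvm::Value *ExnPtr) {
  B.CreateIntrinsic(llvm::Intrinsic::wasm_throw, /*Types=*/{},
                    {B.getInt32(0), ExnPtr});
}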
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 62c53c0051ae..0df8f3e0e09c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -25,7 +25,6 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/CodeGen/WasmEHFuncInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Function.h"
@@ -33,6 +32,7 @@
#include "llvm/IR/IntrinsicsWebAssembly.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
@@ -88,7 +88,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
}
}
if (Subtarget->hasReferenceTypes()) {
- for (auto T : {MVT::externref, MVT::funcref}) {
+ // We need custom load and store lowering for externref, funcref, and
+ // Other. The MVT::Other here represents tables of reference types.
+ for (auto T : {MVT::externref, MVT::funcref, MVT::Other}) {
setOperationAction(ISD::LOAD, T, Custom);
setOperationAction(ISD::STORE, T, Custom);
}
@@ -213,8 +215,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setOperationAction(ISD::SELECT_CC, T, Expand);
// Expand integer operations supported for scalars but not SIMD
- for (auto Op : {ISD::CTLZ, ISD::CTTZ, ISD::CTPOP, ISD::SDIV, ISD::UDIV,
- ISD::SREM, ISD::UREM, ISD::ROTL, ISD::ROTR})
+ for (auto Op :
+ {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, ISD::ROTL, ISD::ROTR})
for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64})
setOperationAction(Op, T, Expand);
@@ -223,8 +225,15 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
setOperationAction(Op, T, Legal);
- // And we have popcnt for i8x16
+ // And we have popcnt for i8x16. It can be used to expand ctlz/cttz.
setOperationAction(ISD::CTPOP, MVT::v16i8, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v16i8, Expand);
+ setOperationAction(ISD::CTTZ, MVT::v16i8, Expand);
+
+ // Custom lower bit counting operations for other types to scalarize them.
+ for (auto Op : {ISD::CTLZ, ISD::CTTZ, ISD::CTPOP})
+ for (auto T : {MVT::v8i16, MVT::v4i32, MVT::v2i64})
+ setOperationAction(Op, T, Custom);
// Expand float operations supported for scalars but not SIMD
for (auto Op : {ISD::FCOPYSIGN, ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
@@ -303,9 +312,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setLoadExtAction(Ext, MVT::v4i32, MVT::v4i16, Legal);
setLoadExtAction(Ext, MVT::v2i64, MVT::v2i32, Legal);
}
- // And some truncating stores are legal as well
- setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
- setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Legal);
}
// Don't do anything clever with build_pairs
@@ -338,6 +345,24 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setMinimumJumpTableEntries(2);
}
+MVT WebAssemblyTargetLowering::getPointerTy(const DataLayout &DL,
+ uint32_t AS) const {
+ if (AS == WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_EXTERNREF)
+ return MVT::externref;
+ if (AS == WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF)
+ return MVT::funcref;
+ return TargetLowering::getPointerTy(DL, AS);
+}
+
+MVT WebAssemblyTargetLowering::getPointerMemTy(const DataLayout &DL,
+ uint32_t AS) const {
+ if (AS == WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_EXTERNREF)
+ return MVT::externref;
+ if (AS == WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF)
+ return MVT::funcref;
+ return TargetLowering::getPointerMemTy(DL, AS);
+}
+
TargetLowering::AtomicExpansionKind
WebAssemblyTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
// We have wasm instructions for these
@@ -551,7 +576,21 @@ LowerCallResults(MachineInstr &CallResults, DebugLoc DL, MachineBasicBlock *BB,
if (IsIndirect) {
auto FnPtr = CallParams.getOperand(0);
CallParams.RemoveOperand(0);
- CallParams.addOperand(FnPtr);
+
+ // For funcrefs, call_indirect is done through __funcref_call_table and the
+ // funcref is always installed in slot 0 of the table. Therefore, instead of
+ // adding the function pointer at the end of the params list, we add a zero
+ // (the index into __funcref_call_table).
+ if (IsFuncrefCall) {
+ Register RegZero =
+ MF.getRegInfo().createVirtualRegister(&WebAssembly::I32RegClass);
+ MachineInstrBuilder MIBC0 =
+ BuildMI(MF, DL, TII.get(WebAssembly::CONST_I32), RegZero).addImm(0);
+
+ BB->insert(CallResults.getIterator(), MIBC0);
+ MachineInstrBuilder(MF, CallParams).addReg(RegZero);
+ } else
+ CallParams.addOperand(FnPtr);
}
for (auto Def : CallResults.defs())
@@ -770,6 +809,13 @@ bool WebAssemblyTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
(ExtT == MVT::v2i64 && MemT == MVT::v2i32);
}
+bool WebAssemblyTargetLowering::isOffsetFoldingLegal(
+ const GlobalAddressSDNode *GA) const {
+ // Wasm doesn't support function addresses with offsets
+ const GlobalValue *GV = GA->getGlobal();
+ return isa<Function>(GV) ? false : TargetLowering::isOffsetFoldingLegal(GA);
+}
+
EVT WebAssemblyTargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext &C,
EVT VT) const {
@@ -823,6 +869,45 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
}
}
+void WebAssemblyTargetLowering::computeKnownBitsForTargetNode(
+ const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+ const SelectionDAG &DAG, unsigned Depth) const {
+ switch (Op.getOpcode()) {
+ default:
+ break;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntNo = Op.getConstantOperandVal(0);
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::wasm_bitmask: {
+ unsigned BitWidth = Known.getBitWidth();
+ EVT VT = Op.getOperand(1).getSimpleValueType();
+ unsigned PossibleBits = VT.getVectorNumElements();
+ APInt ZeroMask = APInt::getHighBitsSet(BitWidth, BitWidth - PossibleBits);
+ Known.Zero |= ZeroMask;
+ break;
+ }
+ }
+ }
+ }
+}
+
+TargetLoweringBase::LegalizeTypeAction
+WebAssemblyTargetLowering::getPreferredVectorAction(MVT VT) const {
+ if (VT.isFixedLengthVector()) {
+ MVT EltVT = VT.getVectorElementType();
+ // We have legal vector types with these lane types, so widening the
+ // vector would let us use some of the lanes directly without having to
+ // extend or truncate values.
+ if (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 ||
+ EltVT == MVT::i64 || EltVT == MVT::f32 || EltVT == MVT::f64)
+ return TypeWidenVector;
+ }
+
+ return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
//===----------------------------------------------------------------------===//
// WebAssembly Lowering private implementation.
//===----------------------------------------------------------------------===//
@@ -1088,7 +1173,8 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Lastly, if this is a call to a funcref we need to add an instruction
// table.set to the chain and transform the call.
- if (CLI.CB && isFuncrefType(CLI.CB->getCalledOperand()->getType())) {
+ if (CLI.CB &&
+ WebAssembly::isFuncrefType(CLI.CB->getCalledOperand()->getType())) {
// In the absence of function references proposal where a funcref call is
// lowered to call_ref, using reference types we generate a table.set to set
// the funcref to a special table used solely for this purpose, followed by
@@ -1106,7 +1192,8 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
WebAssemblyISD::TABLE_SET, DL, DAG.getVTList(MVT::Other), TableSetOps,
MVT::funcref,
// Machine Mem Operand args
- MachinePointerInfo(WasmAddressSpace::FUNCREF),
+ MachinePointerInfo(
+ WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF),
CLI.CB->getCalledOperand()->getPointerAlignment(DAG.getDataLayout()),
MachineMemOperand::MOStore);
@@ -1325,6 +1412,10 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
return LowerLoad(Op, DAG);
case ISD::STORE:
return LowerStore(Op, DAG);
+ case ISD::CTPOP:
+ case ISD::CTLZ:
+ case ISD::CTTZ:
+ return DAG.UnrollVectorOp(Op.getNode());
}
}
@@ -1344,14 +1435,78 @@ static Optional<unsigned> IsWebAssemblyLocal(SDValue Op, SelectionDAG &DAG) {
return WebAssemblyFrameLowering::getLocalForStackObject(MF, FI->getIndex());
}
-bool WebAssemblyTargetLowering::isFuncrefType(const Type *Ty) {
- return isa<PointerType>(Ty) &&
- Ty->getPointerAddressSpace() == WasmAddressSpace::FUNCREF;
+static bool IsWebAssemblyTable(SDValue Op) {
+ const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op);
+ if (GA && WebAssembly::isWasmVarAddressSpace(GA->getAddressSpace())) {
+ const GlobalValue *Value = GA->getGlobal();
+ const Type *Ty = Value->getValueType();
+
+ if (Ty->isArrayTy() && WebAssembly::isRefType(Ty->getArrayElementType()))
+ return true;
+ }
+ return false;
+}
+
+// This function will accept as Op any access to a table, so Op can
+// be the actual table or an offset into the table.
+static bool IsWebAssemblyTableWithOffset(SDValue Op) {
+ if (Op->getOpcode() == ISD::ADD && Op->getNumOperands() == 2)
+ return (Op->getOperand(1).getSimpleValueType() == MVT::i32 &&
+ IsWebAssemblyTableWithOffset(Op->getOperand(0))) ||
+ (Op->getOperand(0).getSimpleValueType() == MVT::i32 &&
+ IsWebAssemblyTableWithOffset(Op->getOperand(1)));
+
+ return IsWebAssemblyTable(Op);
}
-bool WebAssemblyTargetLowering::isExternrefType(const Type *Ty) {
- return isa<PointerType>(Ty) &&
- Ty->getPointerAddressSpace() == WasmAddressSpace::EXTERNREF;
+// Helper for table pattern matching used in LowerStore and LowerLoad
+bool WebAssemblyTargetLowering::MatchTableForLowering(SelectionDAG &DAG,
+ const SDLoc &DL,
+ const SDValue &Base,
+ GlobalAddressSDNode *&GA,
+ SDValue &Idx) const {
+ // We expect the following graph for a load of the form:
+ // table[<var> + <constant offset>]
+ //
+ // Case 1:
+ // externref = load t1
+ // t1: i32 = add t2, i32:<constant offset>
+ // t2: i32 = add tX, table
+ //
+ // This is in some cases simplified to just:
+ // Case 2:
+ // externref = load t1
+ // t1: i32 = add t2, i32:tX
+ //
+ // So, unfortunately, we need to check for both cases. If we are in the
+ // first case, we extract the table GlobalAddressSDNode and build a new node
+ // tY such that tY: i32 = add i32:<constant offset>, i32:tX
+ //
+ if (IsWebAssemblyTable(Base)) {
+ GA = cast<GlobalAddressSDNode>(Base);
+ Idx = DAG.getConstant(0, DL, MVT::i32);
+ } else {
+ GA = dyn_cast<GlobalAddressSDNode>(Base->getOperand(0));
+ if (GA) {
+ // We are in Case 2 above.
+ Idx = Base->getOperand(1);
+ if (!Idx || GA->getNumValues() != 1 || Idx->getNumValues() != 1)
+ return false;
+ } else {
+ // This might be Case 1 above (or an error)
+ SDValue V = Base->getOperand(0);
+ GA = dyn_cast<GlobalAddressSDNode>(V->getOperand(1));
+
+ if (V->getOpcode() != ISD::ADD || V->getNumOperands() != 2 || !GA)
+ return false;
+
+ SDValue IdxV = DAG.getNode(ISD::ADD, DL, MVT::i32, Base->getOperand(1),
+ V->getOperand(0));
+ Idx = IdxV;
+ }
+ }
+
+ return true;
}
SDValue WebAssemblyTargetLowering::LowerStore(SDValue Op,
@@ -1362,6 +1517,26 @@ SDValue WebAssemblyTargetLowering::LowerStore(SDValue Op,
const SDValue &Base = SN->getBasePtr();
const SDValue &Offset = SN->getOffset();
+ if (IsWebAssemblyTableWithOffset(Base)) {
+ if (!Offset->isUndef())
+ report_fatal_error(
+ "unexpected offset when loading from webassembly table", false);
+
+ SDValue Idx;
+ GlobalAddressSDNode *GA;
+
+ if (!MatchTableForLowering(DAG, DL, Base, GA, Idx))
+ report_fatal_error("failed pattern matching for lowering table store",
+ false);
+
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ SDValue TableSetOps[] = {SN->getChain(), SDValue(GA, 0), Idx, Value};
+ SDValue TableSet =
+ DAG.getMemIntrinsicNode(WebAssemblyISD::TABLE_SET, DL, Tys, TableSetOps,
+ SN->getMemoryVT(), SN->getMemOperand());
+ return TableSet;
+ }
+
if (IsWebAssemblyGlobal(Base)) {
if (!Offset->isUndef())
report_fatal_error("unexpected offset when storing to webassembly global",
@@ -1394,6 +1569,26 @@ SDValue WebAssemblyTargetLowering::LowerLoad(SDValue Op,
const SDValue &Base = LN->getBasePtr();
const SDValue &Offset = LN->getOffset();
+ if (IsWebAssemblyTableWithOffset(Base)) {
+ if (!Offset->isUndef())
+ report_fatal_error(
+ "unexpected offset when loading from webassembly table", false);
+
+ GlobalAddressSDNode *GA;
+ SDValue Idx;
+
+ if (!MatchTableForLowering(DAG, DL, Base, GA, Idx))
+ report_fatal_error("failed pattern matching for lowering table load",
+ false);
+
+ SDVTList Tys = DAG.getVTList(LN->getValueType(0), MVT::Other);
+ SDValue TableGetOps[] = {LN->getChain(), SDValue(GA, 0), Idx};
+ SDValue TableGet =
+ DAG.getMemIntrinsicNode(WebAssemblyISD::TABLE_GET, DL, Tys, TableGetOps,
+ LN->getMemoryVT(), LN->getMemOperand());
+ return TableGet;
+ }
+
if (IsWebAssemblyGlobal(Base)) {
if (!Offset->isUndef())
report_fatal_error(
@@ -1468,7 +1663,7 @@ SDValue WebAssemblyTargetLowering::LowerRETURNADDR(SDValue Op,
if (verifyReturnAddressArgumentIsConstant(Op, DAG))
return SDValue();
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
MakeLibCallOptions CallOptions;
return makeLibCall(DAG, RTLIB::RETURN_ADDRESS, Op.getValueType(),
{DAG.getConstant(Depth, DL, MVT::i32)}, CallOptions, DL)
@@ -1495,7 +1690,6 @@ WebAssemblyTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
const auto *GA = cast<GlobalAddressSDNode>(Op);
- MVT PtrVT = getPointerTy(DAG.getDataLayout());
MachineFunction &MF = DAG.getMachineFunction();
if (!MF.getSubtarget<WebAssemblySubtarget>().hasBulkMemory())
@@ -1517,20 +1711,43 @@ WebAssemblyTargetLowering::LowerGlobalTLSAddress(SDValue Op,
false);
}
- auto GlobalGet = PtrVT == MVT::i64 ? WebAssembly::GLOBAL_GET_I64
- : WebAssembly::GLOBAL_GET_I32;
- const char *BaseName = MF.createExternalSymbolName("__tls_base");
+ auto model = GV->getThreadLocalMode();
- SDValue BaseAddr(
- DAG.getMachineNode(GlobalGet, DL, PtrVT,
- DAG.getTargetExternalSymbol(BaseName, PtrVT)),
- 0);
+ // Unsupported TLS modes
+ assert(model != GlobalValue::NotThreadLocal);
+ assert(model != GlobalValue::InitialExecTLSModel);
+
+ if (model == GlobalValue::LocalExecTLSModel ||
+ model == GlobalValue::LocalDynamicTLSModel ||
+ (model == GlobalValue::GeneralDynamicTLSModel &&
+ getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV))) {
+ // For DSO-local TLS variables we use an offset from __tls_base
+
+ MVT PtrVT = getPointerTy(DAG.getDataLayout());
+ auto GlobalGet = PtrVT == MVT::i64 ? WebAssembly::GLOBAL_GET_I64
+ : WebAssembly::GLOBAL_GET_I32;
+ const char *BaseName = MF.createExternalSymbolName("__tls_base");
+
+ SDValue BaseAddr(
+ DAG.getMachineNode(GlobalGet, DL, PtrVT,
+ DAG.getTargetExternalSymbol(BaseName, PtrVT)),
+ 0);
+
+ SDValue TLSOffset = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, GA->getOffset(), WebAssemblyII::MO_TLS_BASE_REL);
+ SDValue SymOffset =
+ DAG.getNode(WebAssemblyISD::WrapperREL, DL, PtrVT, TLSOffset);
+
+ return DAG.getNode(ISD::ADD, DL, PtrVT, BaseAddr, SymOffset);
+ }
- SDValue TLSOffset = DAG.getTargetGlobalAddress(
- GV, DL, PtrVT, GA->getOffset(), WebAssemblyII::MO_TLS_BASE_REL);
- SDValue SymAddr = DAG.getNode(WebAssemblyISD::Wrapper, DL, PtrVT, TLSOffset);
+ assert(model == GlobalValue::GeneralDynamicTLSModel);
- return DAG.getNode(ISD::ADD, DL, PtrVT, BaseAddr, SymAddr);
+ EVT VT = Op.getValueType();
+ return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT,
+ DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT,
+ GA->getOffset(),
+ WebAssemblyII::MO_GOT_TLS));
}
SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op,
@@ -1563,14 +1780,13 @@ SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op,
DAG.getTargetExternalSymbol(BaseName, PtrVT));
SDValue SymAddr = DAG.getNode(
- WebAssemblyISD::WrapperPIC, DL, VT,
+ WebAssemblyISD::WrapperREL, DL, VT,
DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT, GA->getOffset(),
OperandFlags));
return DAG.getNode(ISD::ADD, DL, VT, BaseAddr, SymAddr);
- } else {
- OperandFlags = WebAssemblyII::MO_GOT;
}
+ OperandFlags = WebAssemblyII::MO_GOT;
}
return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT,
@@ -1640,21 +1856,6 @@ SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op,
MachinePointerInfo(SV));
}
-static SDValue getCppExceptionSymNode(SDValue Op, unsigned TagIndex,
- SelectionDAG &DAG) {
- // We only support C++ exceptions for now
- int Tag =
- cast<ConstantSDNode>(Op.getOperand(TagIndex).getNode())->getZExtValue();
- if (Tag != WebAssembly::CPP_EXCEPTION)
- llvm_unreachable("Invalid tag: We only support C++ exceptions for now");
- auto &MF = DAG.getMachineFunction();
- const auto &TLI = DAG.getTargetLoweringInfo();
- MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
- const char *SymName = MF.createExternalSymbolName("__cpp_exception");
- return DAG.getNode(WebAssemblyISD::Wrapper, SDLoc(Op), PtrVT,
- DAG.getTargetExternalSymbol(SymName, PtrVT));
-}
-
SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -1662,10 +1863,10 @@ SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op,
switch (Op.getOpcode()) {
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
- IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ IntNo = Op.getConstantOperandVal(1);
break;
case ISD::INTRINSIC_WO_CHAIN:
- IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ IntNo = Op.getConstantOperandVal(0);
break;
default:
llvm_unreachable("Invalid intrinsic");
@@ -1677,38 +1878,22 @@ SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op,
return SDValue(); // Don't custom lower most intrinsics.
case Intrinsic::wasm_lsda: {
- EVT VT = Op.getValueType();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
- auto &Context = MF.getMMI().getContext();
- MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
- Twine(MF.getFunctionNumber()));
- return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT,
- DAG.getMCSymbol(S, PtrVT));
- }
-
- case Intrinsic::wasm_throw: {
- SDValue SymNode = getCppExceptionSymNode(Op, 2, DAG);
- return DAG.getNode(WebAssemblyISD::THROW, DL,
- MVT::Other, // outchain type
- {
- Op.getOperand(0), // inchain
- SymNode, // exception symbol
- Op.getOperand(3) // thrown value
- });
- }
-
- case Intrinsic::wasm_catch: {
- SDValue SymNode = getCppExceptionSymNode(Op, 2, DAG);
- return DAG.getNode(WebAssemblyISD::CATCH, DL,
- {
- MVT::i32, // outchain type
- MVT::Other // return value
- },
- {
- Op.getOperand(0), // inchain
- SymNode // exception symbol
- });
+ auto PtrVT = getPointerTy(MF.getDataLayout());
+ const char *SymName = MF.createExternalSymbolName(
+ "GCC_except_table" + std::to_string(MF.getFunctionNumber()));
+ if (isPositionIndependent()) {
+ SDValue Node = DAG.getTargetExternalSymbol(
+ SymName, PtrVT, WebAssemblyII::MO_MEMORY_BASE_REL);
+ const char *BaseName = MF.createExternalSymbolName("__memory_base");
+ SDValue BaseAddr =
+ DAG.getNode(WebAssemblyISD::Wrapper, DL, PtrVT,
+ DAG.getTargetExternalSymbol(BaseName, PtrVT));
+ SDValue SymAddr =
+ DAG.getNode(WebAssemblyISD::WrapperREL, DL, PtrVT, Node);
+ return DAG.getNode(ISD::ADD, DL, PtrVT, BaseAddr, SymAddr);
+ }
+ SDValue Node = DAG.getTargetExternalSymbol(SymName, PtrVT);
+ return DAG.getNode(WebAssemblyISD::Wrapper, DL, PtrVT, Node);
}
case Intrinsic::wasm_shuffle: {
@@ -1774,8 +1959,76 @@ WebAssemblyTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
Op.getOperand(1));
}
+static SDValue LowerConvertLow(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ if (Op.getValueType() != MVT::v2f64)
+ return SDValue();
+
+ auto GetConvertedLane = [](SDValue Op, unsigned &Opcode, SDValue &SrcVec,
+ unsigned &Index) -> bool {
+ switch (Op.getOpcode()) {
+ case ISD::SINT_TO_FP:
+ Opcode = WebAssemblyISD::CONVERT_LOW_S;
+ break;
+ case ISD::UINT_TO_FP:
+ Opcode = WebAssemblyISD::CONVERT_LOW_U;
+ break;
+ case ISD::FP_EXTEND:
+ Opcode = WebAssemblyISD::PROMOTE_LOW;
+ break;
+ default:
+ return false;
+ }
+
+ auto ExtractVector = Op.getOperand(0);
+ if (ExtractVector.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return false;
+
+ if (!isa<ConstantSDNode>(ExtractVector.getOperand(1).getNode()))
+ return false;
+
+ SrcVec = ExtractVector.getOperand(0);
+ Index = ExtractVector.getConstantOperandVal(1);
+ return true;
+ };
+
+ unsigned LHSOpcode, RHSOpcode, LHSIndex, RHSIndex;
+ SDValue LHSSrcVec, RHSSrcVec;
+ if (!GetConvertedLane(Op.getOperand(0), LHSOpcode, LHSSrcVec, LHSIndex) ||
+ !GetConvertedLane(Op.getOperand(1), RHSOpcode, RHSSrcVec, RHSIndex))
+ return SDValue();
+
+ if (LHSOpcode != RHSOpcode)
+ return SDValue();
+
+ MVT ExpectedSrcVT;
+ switch (LHSOpcode) {
+ case WebAssemblyISD::CONVERT_LOW_S:
+ case WebAssemblyISD::CONVERT_LOW_U:
+ ExpectedSrcVT = MVT::v4i32;
+ break;
+ case WebAssemblyISD::PROMOTE_LOW:
+ ExpectedSrcVT = MVT::v4f32;
+ break;
+ }
+ if (LHSSrcVec.getValueType() != ExpectedSrcVT)
+ return SDValue();
+
+ auto Src = LHSSrcVec;
+ if (LHSIndex != 0 || RHSIndex != 1 || LHSSrcVec != RHSSrcVec) {
+ // Shuffle the source vector so that the converted lanes are the low lanes.
+ Src = DAG.getVectorShuffle(
+ ExpectedSrcVT, DL, LHSSrcVec, RHSSrcVec,
+ {static_cast<int>(LHSIndex), static_cast<int>(RHSIndex) + 4, -1, -1});
+ }
+ return DAG.getNode(LHSOpcode, DL, MVT::v2f64, Src);
+}
+
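When the two converted lanes are not already lanes 0 and 1 of a single source, LowerConvertLow shuffles them into place first; the mask indexes into the concatenation of both source vectors and marks the unused lanes as undefined. A small standalone sketch of that mask construction (the helper name is made up for illustration):

#include <array>
#include <cstdio>

// Mirrors the mask built above: entry 0 picks lane LHSIndex of the first
// source, entry 1 picks lane RHSIndex of the second source (offset by the
// 4-lane width), and -1 marks don't-care lanes.
static std::array<int, 4> convertLowMask(unsigned LHSIndex, unsigned RHSIndex) {
  return {static_cast<int>(LHSIndex), static_cast<int>(RHSIndex) + 4, -1, -1};
}

int main() {
  auto M = convertLowMask(2, 3); // e.g. lanes 2 and 3 of the same source vector
  std::printf("{%d, %d, %d, %d}\n", M[0], M[1], M[2], M[3]); // {2, 7, -1, -1}
}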
SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
+ if (auto ConvertLow = LowerConvertLow(Op, DAG))
+ return ConvertLow;
+
SDLoc DL(Op);
const EVT VecT = Op.getValueType();
const EVT LaneT = Op.getOperand(0).getValueType();
@@ -1901,12 +2154,8 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
size_t NumShuffleLanes = 0;
if (ShuffleCounts.size()) {
std::tie(ShuffleSrc1, NumShuffleLanes) = GetMostCommon(ShuffleCounts);
- ShuffleCounts.erase(std::remove_if(ShuffleCounts.begin(),
- ShuffleCounts.end(),
- [&](const auto &Pair) {
- return Pair.first == ShuffleSrc1;
- }),
- ShuffleCounts.end());
+ llvm::erase_if(ShuffleCounts,
+ [&](const auto &Pair) { return Pair.first == ShuffleSrc1; });
}
if (ShuffleCounts.size()) {
size_t AdditionalShuffleLanes;
@@ -1974,7 +2223,23 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
SmallVector<SDValue, 16> ConstLanes;
for (const SDValue &Lane : Op->op_values()) {
if (IsConstant(Lane)) {
- ConstLanes.push_back(Lane);
+ // Values may need to be fixed so that they will sign extend to be
+ // within the expected range during ISel. Check whether the value is in
+ // bounds based on the lane bit width and if it is out of bounds, lop
+ // off the extra bits and subtract 2^n to reflect giving the high bit
+ // value -2^(n-1) rather than +2^(n-1). Skip the i64 case because it
+ // cannot possibly be out of range.
+ auto *Const = dyn_cast<ConstantSDNode>(Lane.getNode());
+ int64_t Val = Const ? Const->getSExtValue() : 0;
+ uint64_t LaneBits = 128 / Lanes;
+ assert((LaneBits == 64 || Val >= -(1ll << (LaneBits - 1))) &&
+ "Unexpected out of bounds negative value");
+ if (Const && LaneBits != 64 && Val > (1ll << (LaneBits - 1)) - 1) {
+ auto NewVal = ((uint64_t)Val % (1ll << LaneBits)) - (1ll << LaneBits);
+ ConstLanes.push_back(DAG.getConstant(NewVal, SDLoc(Lane), LaneT));
+ } else {
+ ConstLanes.push_back(Lane);
+ }
} else if (LaneT.isFloatingPoint()) {
ConstLanes.push_back(DAG.getConstantFP(0, DL, LaneT));
} else {
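The wrap-around described in the comment above is easiest to see with numbers: for i8 lanes (LaneBits == 8), a lane constant of 200 becomes 200 % 256 - 256 = -56, whose low 8 bits are the intended pattern. A self-contained sketch of the same computation, with a hypothetical helper name:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Constants too large for the signed lane range are reduced mod 2^n and then
// shifted down by 2^n, so the high bit reads as -2^(n-1) instead of +2^(n-1).
static int64_t wrapToSignedLane(int64_t Val, unsigned LaneBits) {
  assert(LaneBits < 64 && "i64 lanes cannot be out of range");
  assert(Val >= -(1ll << (LaneBits - 1)) && "unexpected out-of-bounds negative");
  if (Val <= (1ll << (LaneBits - 1)) - 1)
    return Val; // already representable
  return static_cast<int64_t>(static_cast<uint64_t>(Val) % (1ull << LaneBits)) -
         (1ll << LaneBits);
}

int main() {
  std::printf("%lld\n", (long long)wrapToSignedLane(200, 8)); // -56
  std::printf("%lld\n", (long long)wrapToSignedLane(100, 8)); // 100
}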
@@ -2227,120 +2492,6 @@ performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
}
static SDValue
-performVectorConvertLowCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- auto &DAG = DCI.DAG;
-
- EVT ResVT = N->getValueType(0);
- if (ResVT != MVT::v2f64)
- return SDValue();
-
- auto GetWasmConversionOp = [](unsigned Op) {
- switch (Op) {
- case ISD::SINT_TO_FP:
- return WebAssemblyISD::CONVERT_LOW_S;
- case ISD::UINT_TO_FP:
- return WebAssemblyISD::CONVERT_LOW_U;
- case ISD::FP_EXTEND:
- return WebAssemblyISD::PROMOTE_LOW;
- }
- llvm_unreachable("unexpected op");
- };
-
- if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
- // Combine this:
- //
- // (v2f64 (extract_subvector
- // (v4f64 ({s,u}int_to_fp (v4i32 $x))), 0))
- //
- // into (f64x2.convert_low_i32x4_{s,u} $x).
- //
- // Or this:
- //
- // (v2f64 (extract_subvector
- // (v4f64 (fp_extend (v4f32 $x))), 0))
- //
- // into (f64x2.promote_low_f32x4 $x).
- auto Conversion = N->getOperand(0);
- auto ConversionOp = Conversion.getOpcode();
- MVT ExpectedSourceType;
- switch (ConversionOp) {
- case ISD::SINT_TO_FP:
- case ISD::UINT_TO_FP:
- ExpectedSourceType = MVT::v4i32;
- break;
- case ISD::FP_EXTEND:
- ExpectedSourceType = MVT::v4f32;
- break;
- default:
- return SDValue();
- }
-
- if (Conversion.getValueType() != MVT::v4f64)
- return SDValue();
-
- auto Source = Conversion.getOperand(0);
- if (Source.getValueType() != ExpectedSourceType)
- return SDValue();
-
- auto IndexNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (IndexNode == nullptr || IndexNode->getZExtValue() != 0)
- return SDValue();
-
- auto Op = GetWasmConversionOp(ConversionOp);
- return DAG.getNode(Op, SDLoc(N), ResVT, Source);
- }
-
- // Combine this:
- //
- // (v2f64 ({s,u}int_to_fp
- // (v2i32 (extract_subvector (v4i32 $x), 0))))
- //
- // into (f64x2.convert_low_i32x4_{s,u} $x).
- //
- // Or this:
- //
- // (v2f64 (fp_extend
- // (v2f32 (extract_subvector (v4f32 $x), 0))))
- //
- // into (f64x2.promote_low_f32x4 $x).
- auto ConversionOp = N->getOpcode();
- MVT ExpectedExtractType;
- MVT ExpectedSourceType;
- switch (ConversionOp) {
- case ISD::SINT_TO_FP:
- case ISD::UINT_TO_FP:
- ExpectedExtractType = MVT::v2i32;
- ExpectedSourceType = MVT::v4i32;
- break;
- case ISD::FP_EXTEND:
- ExpectedExtractType = MVT::v2f32;
- ExpectedSourceType = MVT::v4f32;
- break;
- default:
- llvm_unreachable("unexpected opcode");
- }
-
- auto Extract = N->getOperand(0);
- if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
- return SDValue();
-
- if (Extract.getValueType() != ExpectedExtractType)
- return SDValue();
-
- auto Source = Extract.getOperand(0);
- if (Source.getValueType() != ExpectedSourceType)
- return SDValue();
-
- auto *IndexNode = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
- if (IndexNode == nullptr || IndexNode->getZExtValue() != 0)
- return SDValue();
-
- unsigned Op = GetWasmConversionOp(ConversionOp);
- return DAG.getNode(Op, SDLoc(N), ResVT, Source);
-}
-
-static SDValue
performVectorTruncZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
auto &DAG = DCI.DAG;
@@ -2470,11 +2621,6 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
return performVectorExtendCombine(N, DCI);
- case ISD::SINT_TO_FP:
- case ISD::UINT_TO_FP:
- case ISD::FP_EXTEND:
- case ISD::EXTRACT_SUBVECTOR:
- return performVectorConvertLowCombine(N, DCI);
case ISD::FP_TO_SINT_SAT:
case ISD::FP_TO_UINT_SAT:
case ISD::FP_ROUND:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 5d813fefb96b..f7b460f61dbb 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -45,35 +45,8 @@ public:
WebAssemblyTargetLowering(const TargetMachine &TM,
const WebAssemblySubtarget &STI);
- enum WasmAddressSpace : unsigned {
- // WebAssembly uses the following address spaces:
- // AS 0 : is the default address space for values in linear memory
- DEFAULT = 0,
- // AS 1 : is a non-integral address space for global variables
- GLOBAL = 1,
- // AS 10 : is a non-integral address space for externref values
- EXTERNREF = 10,
- // AS 20 : is a non-integral address space for funcref values
- FUNCREF = 20,
- };
-
- MVT getPointerTy(const DataLayout &DL, uint32_t AS = 0) const override {
- if (AS == WasmAddressSpace::EXTERNREF)
- return MVT::externref;
- if (AS == WasmAddressSpace::FUNCREF)
- return MVT::funcref;
- return TargetLowering::getPointerTy(DL, AS);
- }
- MVT getPointerMemTy(const DataLayout &DL, uint32_t AS = 0) const override {
- if (AS == WasmAddressSpace::EXTERNREF)
- return MVT::externref;
- if (AS == WasmAddressSpace::FUNCREF)
- return MVT::funcref;
- return TargetLowering::getPointerMemTy(DL, AS);
- }
-
- static bool isFuncrefType(const Type *Ty);
- static bool isExternrefType(const Type *Ty);
+ MVT getPointerTy(const DataLayout &DL, uint32_t AS = 0) const override;
+ MVT getPointerMemTy(const DataLayout &DL, uint32_t AS = 0) const override;
private:
/// Keep a pointer to the WebAssemblySubtarget around so that we can make the
@@ -102,12 +75,21 @@ private:
bool *Fast) const override;
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const override;
+ void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const override;
+
+ TargetLoweringBase::LegalizeTypeAction
+ getPreferredVectorAction(MVT VT) const override;
+
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
@@ -154,6 +136,11 @@ private:
SDValue LowerLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerStore(SDValue Op, SelectionDAG &DAG) const;
+ // Helper for LowerLoad and LowerStore
+ bool MatchTableForLowering(SelectionDAG &DAG, const SDLoc &DL,
+ const SDValue &Base, GlobalAddressSDNode *&GA,
+ SDValue &Idx) const;
+
// Custom DAG combine hooks
SDValue
PerformDAGCombine(SDNode *N,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
index 1ee6ae196d02..42183d1645e1 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -114,13 +114,13 @@ def NotifyPatOffsetOnly_A64 :
Requires<[HasAddr64, HasAtomics]>;
def NotifyPatGlobalAddrOffOnly_A32 :
- Pat<(i32 (int_wasm_memory_atomic_notify (WebAssemblywrapper tglobaladdr:$off),
+ Pat<(i32 (int_wasm_memory_atomic_notify (WebAssemblyWrapper tglobaladdr:$off),
I32:$count)),
(MEMORY_ATOMIC_NOTIFY_A32 0, tglobaladdr:$off, (CONST_I32 0), I32:$count)
>,
Requires<[HasAddr32, HasAtomics, IsNotPIC]>;
def NotifyPatGlobalAddrOffOnly_A64 :
- Pat<(i32 (int_wasm_memory_atomic_notify (WebAssemblywrapper tglobaladdr:$off),
+ Pat<(i32 (int_wasm_memory_atomic_notify (WebAssemblyWrapper tglobaladdr:$off),
I32:$count)),
(MEMORY_ATOMIC_NOTIFY_A64 0, tglobaladdr:$off, (CONST_I64 0), I32:$count)
>,
@@ -185,12 +185,12 @@ defm : WaitPatOffsetOnly<i64, int_wasm_memory_atomic_wait64,
"MEMORY_ATOMIC_WAIT64">;
multiclass WaitPatGlobalAddrOffOnly<ValueType ty, Intrinsic kind, string inst> {
- def : Pat<(i32 (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp,
+ def : Pat<(i32 (kind (WebAssemblyWrapper tglobaladdr:$off), ty:$exp,
I64:$timeout)),
(!cast<NI>(inst#_A32) 0, tglobaladdr:$off, (CONST_I32 0), ty:$exp,
I64:$timeout)>,
Requires<[HasAddr32, HasAtomics, IsNotPIC]>;
- def : Pat<(i32 (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp,
+ def : Pat<(i32 (kind (WebAssemblyWrapper tglobaladdr:$off), ty:$exp,
I64:$timeout)),
(!cast<NI>(inst#_A64) 0, tglobaladdr:$off, (CONST_I64 0), ty:$exp,
I64:$timeout)>,
@@ -390,10 +390,10 @@ defm : AStorePatOffsetOnly<i32, atomic_store_32, "ATOMIC_STORE_I32">;
defm : AStorePatOffsetOnly<i64, atomic_store_64, "ATOMIC_STORE_I64">;
multiclass AStorePatGlobalAddrOffOnly<ValueType ty, PatFrag kind, string inst> {
- def : Pat<(kind (WebAssemblywrapper tglobaladdr:$off), ty:$val),
+ def : Pat<(kind (WebAssemblyWrapper tglobaladdr:$off), ty:$val),
(!cast<NI>(inst#_A32) 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>,
Requires<[HasAddr32, HasAtomics, IsNotPIC]>;
- def : Pat<(kind (WebAssemblywrapper tglobaladdr:$off), ty:$val),
+ def : Pat<(kind (WebAssemblyWrapper tglobaladdr:$off), ty:$val),
(!cast<NI>(inst#_A64) 0, tglobaladdr:$off, (CONST_I64 0), ty:$val)>,
Requires<[HasAddr64, HasAtomics, IsNotPIC]>;
}
@@ -592,10 +592,10 @@ multiclass BinRMWPatOffsetOnly<ValueType ty, PatFrag kind, string inst> {
}
multiclass BinRMWPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, string inst> {
- def : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$val)),
+ def : Pat<(ty (kind (WebAssemblyWrapper tglobaladdr:$off), ty:$val)),
(!cast<NI>(inst#_A32) 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>,
Requires<[HasAddr32, HasAtomics, IsNotPIC]>;
- def : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$val)),
+ def : Pat<(ty (kind (WebAssemblyWrapper tglobaladdr:$off), ty:$val)),
(!cast<NI>(inst#_A64) 0, tglobaladdr:$off, (CONST_I64 0), ty:$val)>,
Requires<[HasAddr64, HasAtomics, IsNotPIC]>;
}
@@ -659,7 +659,7 @@ class sext_bin_rmw_16_64<PatFrag kind> : sext_bin_rmw_8_64<kind>;
// Patterns for various addressing modes for truncating-extending binary RMWs.
multiclass BinRMWTruncExtPattern<
- PatFrag rmw_8, PatFrag rmw_16, PatFrag rmw_32, PatFrag rmw_64,
+ PatFrag rmw_8, PatFrag rmw_16, PatFrag rmw_32,
string inst8_32, string inst16_32, string inst8_64, string inst16_64, string inst32_64> {
// Truncating-extending binary RMWs with no constant offset
defm : BinRMWPatNoOffset<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
@@ -724,27 +724,27 @@ multiclass BinRMWTruncExtPattern<
}
defm : BinRMWTruncExtPattern<
- atomic_load_add_8, atomic_load_add_16, atomic_load_add_32, atomic_load_add_64,
+ atomic_load_add_8, atomic_load_add_16, atomic_load_add_32,
"ATOMIC_RMW8_U_ADD_I32", "ATOMIC_RMW16_U_ADD_I32",
"ATOMIC_RMW8_U_ADD_I64", "ATOMIC_RMW16_U_ADD_I64", "ATOMIC_RMW32_U_ADD_I64">;
defm : BinRMWTruncExtPattern<
- atomic_load_sub_8, atomic_load_sub_16, atomic_load_sub_32, atomic_load_sub_64,
+ atomic_load_sub_8, atomic_load_sub_16, atomic_load_sub_32,
"ATOMIC_RMW8_U_SUB_I32", "ATOMIC_RMW16_U_SUB_I32",
"ATOMIC_RMW8_U_SUB_I64", "ATOMIC_RMW16_U_SUB_I64", "ATOMIC_RMW32_U_SUB_I64">;
defm : BinRMWTruncExtPattern<
- atomic_load_and_8, atomic_load_and_16, atomic_load_and_32, atomic_load_and_64,
+ atomic_load_and_8, atomic_load_and_16, atomic_load_and_32,
"ATOMIC_RMW8_U_AND_I32", "ATOMIC_RMW16_U_AND_I32",
"ATOMIC_RMW8_U_AND_I64", "ATOMIC_RMW16_U_AND_I64", "ATOMIC_RMW32_U_AND_I64">;
defm : BinRMWTruncExtPattern<
- atomic_load_or_8, atomic_load_or_16, atomic_load_or_32, atomic_load_or_64,
+ atomic_load_or_8, atomic_load_or_16, atomic_load_or_32,
"ATOMIC_RMW8_U_OR_I32", "ATOMIC_RMW16_U_OR_I32",
"ATOMIC_RMW8_U_OR_I64", "ATOMIC_RMW16_U_OR_I64", "ATOMIC_RMW32_U_OR_I64">;
defm : BinRMWTruncExtPattern<
- atomic_load_xor_8, atomic_load_xor_16, atomic_load_xor_32, atomic_load_xor_64,
+ atomic_load_xor_8, atomic_load_xor_16, atomic_load_xor_32,
"ATOMIC_RMW8_U_XOR_I32", "ATOMIC_RMW16_U_XOR_I32",
"ATOMIC_RMW8_U_XOR_I64", "ATOMIC_RMW16_U_XOR_I64", "ATOMIC_RMW32_U_XOR_I64">;
defm : BinRMWTruncExtPattern<
- atomic_swap_8, atomic_swap_16, atomic_swap_32, atomic_swap_64,
+ atomic_swap_8, atomic_swap_16, atomic_swap_32,
"ATOMIC_RMW8_U_XCHG_I32", "ATOMIC_RMW16_U_XCHG_I32",
"ATOMIC_RMW8_U_XCHG_I64", "ATOMIC_RMW16_U_XCHG_I64",
"ATOMIC_RMW32_U_XCHG_I64">;
@@ -826,11 +826,11 @@ multiclass TerRMWPatOffsetOnly<ValueType ty, PatFrag kind, string inst> {
}
multiclass TerRMWPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, string inst> {
- def : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp, ty:$new)),
+ def : Pat<(ty (kind (WebAssemblyWrapper tglobaladdr:$off), ty:$exp, ty:$new)),
(!cast<NI>(inst#_A32) 0, tglobaladdr:$off, (CONST_I32 0), ty:$exp,
ty:$new)>,
Requires<[HasAddr32, HasAtomics, IsNotPIC]>;
- def : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp, ty:$new)),
+ def : Pat<(ty (kind (WebAssemblyWrapper tglobaladdr:$off), ty:$exp, ty:$new)),
(!cast<NI>(inst#_A64) 0, tglobaladdr:$off, (CONST_I64 0), ty:$exp,
ty:$new)>,
Requires<[HasAddr64, HasAtomics, IsNotPIC]>;
@@ -895,7 +895,7 @@ class sext_ter_rmw_16_64<PatFrag kind> : sext_ter_rmw_8_64<kind>;
// Patterns for various addressing modes for truncating-extending ternary RMWs.
multiclass TerRMWTruncExtPattern<
- PatFrag rmw_8, PatFrag rmw_16, PatFrag rmw_32, PatFrag rmw_64,
+ PatFrag rmw_8, PatFrag rmw_16, PatFrag rmw_32,
string inst8_32, string inst16_32, string inst8_64, string inst16_64,
string inst32_64> {
// Truncating-extending ternary RMWs with no constant offset
@@ -961,7 +961,7 @@ multiclass TerRMWTruncExtPattern<
}
defm : TerRMWTruncExtPattern<
- atomic_cmp_swap_8, atomic_cmp_swap_16, atomic_cmp_swap_32, atomic_cmp_swap_64,
+ atomic_cmp_swap_8, atomic_cmp_swap_16, atomic_cmp_swap_32,
"ATOMIC_RMW8_U_CMPXCHG_I32", "ATOMIC_RMW16_U_CMPXCHG_I32",
"ATOMIC_RMW8_U_CMPXCHG_I64", "ATOMIC_RMW16_U_CMPXCHG_I64",
"ATOMIC_RMW32_U_CMPXCHG_I64">;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index 437b07bf8baf..be6547007aaf 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -130,8 +130,7 @@ let Predicates = [HasExceptionHandling] in {
// Throwing an exception: throw / rethrow
let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
defm THROW : I<(outs), (ins tag_op:$tag, variable_ops),
- (outs), (ins tag_op:$tag),
- [(WebAssemblythrow (WebAssemblywrapper texternalsym:$tag))],
+ (outs), (ins tag_op:$tag), [],
"throw \t$tag", "throw \t$tag", 0x08>;
defm RETHROW : NRI<(outs), (ins i32imm:$depth), [], "rethrow \t$depth", 0x09>;
} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
@@ -147,14 +146,10 @@ defm END_TRY : NRI<(outs), (ins), [], "end_try", 0x0b>;
// Catching an exception: catch / catch_all
let hasCtrlDep = 1, hasSideEffects = 1 in {
-// Currently 'catch' can only extract an i32, which is sufficient for C++
-// support, but according to the spec 'catch' can extract any number of values
-// based on the tag type.
-defm CATCH : I<(outs I32:$dst), (ins tag_op:$tag),
- (outs), (ins tag_op:$tag),
- [(set I32:$dst,
- (WebAssemblycatch (WebAssemblywrapper texternalsym:$tag)))],
- "catch \t$dst, $tag", "catch \t$tag", 0x07>;
+let variadicOpsAreDefs = 1 in
+defm CATCH : I<(outs), (ins tag_op:$tag, variable_ops),
+ (outs), (ins tag_op:$tag), [],
+ "catch", "catch \t$tag", 0x07>;
defm CATCH_ALL : NRI<(outs), (ins), [], "catch_all", 0x19>;
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 15748067f123..ee9247a8bef9 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -26,6 +26,10 @@ def HasSIMD128 :
Predicate<"Subtarget->hasSIMD128()">,
AssemblerPredicate<(all_of FeatureSIMD128), "simd128">;
+def HasRelaxedSIMD :
+ Predicate<"Subtarget->hasRelaxedSIMD()">,
+ AssemblerPredicate<(all_of FeatureRelaxedSIMD), "relaxed-simd">;
+
def HasAtomics :
Predicate<"Subtarget->hasAtomics()">,
AssemblerPredicate<(all_of FeatureAtomics), "atomics">;
@@ -77,10 +81,6 @@ def SDT_WebAssemblyLocalSet : SDTypeProfile<0, 2, [SDTCisVT<0, i32>]>;
def SDT_WebAssemblyReturn : SDTypeProfile<0, -1, []>;
def SDT_WebAssemblyWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisPtrTy<0>]>;
-def SDT_WebAssemblyWrapperPIC : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
- SDTCisPtrTy<0>]>;
-def SDT_WebAssemblyThrow : SDTypeProfile<0, -1, []>;
-def SDT_WebAssemblyCatch : SDTypeProfile<1, 1, [SDTCisPtrTy<0>]>;
def SDT_WebAssemblyGlobalGet : SDTypeProfile<1, 1, [SDTCisPtrTy<1>]>;
def SDT_WebAssemblyGlobalSet : SDTypeProfile<0, 2, [SDTCisPtrTy<1>]>;
@@ -102,14 +102,10 @@ def WebAssemblyargument : SDNode<"WebAssemblyISD::ARGUMENT",
def WebAssemblyreturn : SDNode<"WebAssemblyISD::RETURN",
SDT_WebAssemblyReturn,
[SDNPHasChain, SDNPVariadic]>;
-def WebAssemblywrapper : SDNode<"WebAssemblyISD::Wrapper",
+def WebAssemblyWrapper : SDNode<"WebAssemblyISD::Wrapper",
SDT_WebAssemblyWrapper>;
-def WebAssemblywrapperPIC : SDNode<"WebAssemblyISD::WrapperPIC",
- SDT_WebAssemblyWrapperPIC>;
-def WebAssemblythrow : SDNode<"WebAssemblyISD::THROW", SDT_WebAssemblyThrow,
- [SDNPHasChain, SDNPVariadic]>;
-def WebAssemblycatch : SDNode<"WebAssemblyISD::CATCH", SDT_WebAssemblyCatch,
- [SDNPHasChain, SDNPSideEffect]>;
+def WebAssemblyWrapperREL : SDNode<"WebAssemblyISD::WrapperREL",
+ SDT_WebAssemblyWrapper>;
def WebAssemblyglobal_get :
SDNode<"WebAssemblyISD::GLOBAL_GET", SDT_WebAssemblyGlobalGet,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
@@ -348,10 +344,10 @@ multiclass LOCAL<WebAssemblyRegClass rc, Operand global_op> {
} // hasSideEffects = 0
foreach vt = rc.RegTypes in {
def : Pat<(vt (WebAssemblyglobal_get
- (WebAssemblywrapper tglobaladdr:$addr))),
+ (WebAssemblyWrapper tglobaladdr:$addr))),
(!cast<NI>("GLOBAL_GET_" # rc) tglobaladdr:$addr)>;
def : Pat<(WebAssemblyglobal_set
- vt:$src, (WebAssemblywrapper tglobaladdr:$addr)),
+ vt:$src, (WebAssemblyWrapper tglobaladdr:$addr)),
(!cast<NI>("GLOBAL_SET_" # rc) tglobaladdr:$addr, vt:$src)>;
def : Pat<(vt (WebAssemblylocal_get (i32 timm:$local))),
(!cast<NI>("LOCAL_GET_" # rc) timm:$local)>;
@@ -386,38 +382,45 @@ defm CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm),
"f64.const\t$res, $imm", "f64.const\t$imm", 0x44>;
} // isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1
-def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)),
+def : Pat<(i32 (WebAssemblyWrapper tglobaladdr:$addr)),
(CONST_I32 tglobaladdr:$addr)>, Requires<[IsNotPIC, HasAddr32]>;
-def : Pat<(i64 (WebAssemblywrapper tglobaladdr:$addr)),
+def : Pat<(i64 (WebAssemblyWrapper tglobaladdr:$addr)),
(CONST_I64 tglobaladdr:$addr)>, Requires<[IsNotPIC, HasAddr64]>;
-def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)),
+def : Pat<(i32 (WebAssemblyWrapper tglobaladdr:$addr)),
(GLOBAL_GET_I32 tglobaladdr:$addr)>, Requires<[IsPIC, HasAddr32]>;
-def : Pat<(i64 (WebAssemblywrapper tglobaladdr:$addr)),
+def : Pat<(i64 (WebAssemblyWrapper tglobaladdr:$addr)),
(GLOBAL_GET_I64 tglobaladdr:$addr)>, Requires<[IsPIC, HasAddr64]>;
-def : Pat<(i32 (WebAssemblywrapperPIC tglobaladdr:$addr)),
+def : Pat<(i32 (WebAssemblyWrapperREL tglobaladdr:$addr)),
(CONST_I32 tglobaladdr:$addr)>, Requires<[IsPIC, HasAddr32]>;
-def : Pat<(i64 (WebAssemblywrapperPIC tglobaladdr:$addr)),
+def : Pat<(i64 (WebAssemblyWrapperREL tglobaladdr:$addr)),
(CONST_I64 tglobaladdr:$addr)>, Requires<[IsPIC, HasAddr64]>;
-def : Pat<(i32 (WebAssemblywrapper tglobaltlsaddr:$addr)),
+def : Pat<(i32 (WebAssemblyWrapperREL tglobaltlsaddr:$addr)),
(CONST_I32 tglobaltlsaddr:$addr)>, Requires<[HasAddr32]>;
-def : Pat<(i64 (WebAssemblywrapper tglobaltlsaddr:$addr)),
+def : Pat<(i64 (WebAssemblyWrapperREL tglobaltlsaddr:$addr)),
(CONST_I64 tglobaltlsaddr:$addr)>, Requires<[HasAddr64]>;
-def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)),
+def : Pat<(i32 (WebAssemblyWrapper tglobaltlsaddr:$addr)),
+ (GLOBAL_GET_I32 tglobaltlsaddr:$addr)>, Requires<[HasAddr32]>;
+def : Pat<(i64 (WebAssemblyWrapper tglobaltlsaddr:$addr)),
+ (GLOBAL_GET_I64 tglobaltlsaddr:$addr)>, Requires<[HasAddr64]>;
+
+def : Pat<(i32 (WebAssemblyWrapper texternalsym:$addr)),
(GLOBAL_GET_I32 texternalsym:$addr)>, Requires<[IsPIC, HasAddr32]>;
-def : Pat<(i64 (WebAssemblywrapper texternalsym:$addr)),
+def : Pat<(i64 (WebAssemblyWrapper texternalsym:$addr)),
(GLOBAL_GET_I64 texternalsym:$addr)>, Requires<[IsPIC, HasAddr64]>;
-def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)),
+def : Pat<(i32 (WebAssemblyWrapper texternalsym:$addr)),
(CONST_I32 texternalsym:$addr)>, Requires<[IsNotPIC, HasAddr32]>;
-def : Pat<(i64 (WebAssemblywrapper texternalsym:$addr)),
+def : Pat<(i64 (WebAssemblyWrapper texternalsym:$addr)),
(CONST_I64 texternalsym:$addr)>, Requires<[IsNotPIC, HasAddr64]>;
-def : Pat<(i32 (WebAssemblywrapper mcsym:$sym)), (CONST_I32 mcsym:$sym)>;
-def : Pat<(i64 (WebAssemblywrapper mcsym:$sym)), (CONST_I64 mcsym:$sym)>;
+def : Pat<(i32 (WebAssemblyWrapperREL texternalsym:$addr)),
+ (CONST_I32 texternalsym:$addr)>, Requires<[IsPIC, HasAddr32]>;
+def : Pat<(i64 (WebAssemblyWrapperREL texternalsym:$addr)),
+ (CONST_I64 texternalsym:$addr)>, Requires<[IsPIC, HasAddr64]>;
//===----------------------------------------------------------------------===//
// Additional sets of instructions.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index 82f5e985c558..a70f62dde845 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -117,10 +117,10 @@ defm : LoadPatOffsetOnly<f32, load, "LOAD_F32">;
defm : LoadPatOffsetOnly<f64, load, "LOAD_F64">;
multiclass LoadPatGlobalAddrOffOnly<ValueType ty, SDPatternOperator kind, string inst> {
- def : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off))),
+ def : Pat<(ty (kind (WebAssemblyWrapper tglobaladdr:$off))),
(!cast<NI>(inst # "_A32") 0, tglobaladdr:$off, (CONST_I32 0))>,
Requires<[IsNotPIC, HasAddr32]>;
- def : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off))),
+ def : Pat<(ty (kind (WebAssemblyWrapper tglobaladdr:$off))),
(!cast<NI>(inst # "_A64") 0, tglobaladdr:$off, (CONST_I64 0))>,
Requires<[IsNotPIC, HasAddr64]>;
}
@@ -313,11 +313,11 @@ defm : StorePatOffsetOnly<f32, store, "STORE_F32">;
defm : StorePatOffsetOnly<f64, store, "STORE_F64">;
multiclass StorePatGlobalAddrOffOnly<ValueType ty, PatFrag kind, string inst> {
- def : Pat<(kind ty:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ def : Pat<(kind ty:$val, (WebAssemblyWrapper tglobaladdr:$off)),
(!cast<NI>(inst # "_A32") 0, tglobaladdr:$off, (CONST_I32 0),
ty:$val)>,
Requires<[IsNotPIC, HasAddr32]>;
- def : Pat<(kind ty:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ def : Pat<(kind ty:$val, (WebAssemblyWrapper tglobaladdr:$off)),
(!cast<NI>(inst # "_A64") 0, tglobaladdr:$off, (CONST_I64 0),
ty:$val)>,
Requires<[IsNotPIC, HasAddr64]>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 6429b46673a6..30b99c3a69a9 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -11,17 +11,34 @@
///
//===----------------------------------------------------------------------===//
-// Instructions requiring HasSIMD128 and the simd128 prefix byte
-multiclass SIMD_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
- list<dag> pattern_r, string asmstr_r = "",
- string asmstr_s = "", bits<32> simdop = -1> {
+// Instructions using the SIMD opcode prefix and requiring one of the SIMD
+// feature predicates.
+multiclass ABSTRACT_SIMD_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
+ list<dag> pattern_r, string asmstr_r,
+ string asmstr_s, bits<32> simdop,
+ Predicate simd_level> {
defm "" : I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r, asmstr_s,
!if(!ge(simdop, 0x100),
!or(0xfd0000, !and(0xffff, simdop)),
!or(0xfd00, !and(0xff, simdop)))>,
- Requires<[HasSIMD128]>;
+ Requires<[simd_level]>;
+}
+
+multiclass SIMD_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
+ list<dag> pattern_r, string asmstr_r = "",
+ string asmstr_s = "", bits<32> simdop = -1> {
+ defm "" : ABSTRACT_SIMD_I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r,
+ asmstr_s, simdop, HasSIMD128>;
}
+multiclass RELAXED_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
+ list<dag> pattern_r, string asmstr_r = "",
+ string asmstr_s = "", bits<32> simdop = -1> {
+ defm "" : ABSTRACT_SIMD_I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r,
+ asmstr_s, simdop, HasRelaxedSIMD>;
+}
+
+
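The !if in ABSTRACT_SIMD_I folds the 0xfd SIMD prefix into the stored opcode, giving sub-opcodes of 0x100 and above a two-byte slot. The same fold in plain C++, as a sketch (foldSimdPrefix is a made-up name, and 0x102 below is an invented sub-opcode used only to exercise the second branch):

#include <cstdint>
#include <cstdio>

// One-byte SIMD sub-opcodes follow a 0xfd prefix byte; sub-opcodes >= 0x100
// are kept in the low two bytes after the prefix instead.
static uint32_t foldSimdPrefix(uint32_t SimdOp) {
  return SimdOp >= 0x100 ? (0xfd0000u | (SimdOp & 0xffffu))
                         : (0xfd00u | (SimdOp & 0xffu));
}

int main() {
  std::printf("%#x\n", foldSimdPrefix(0x5c));  // 0xfd5c (the load_zero sub-opcode used below)
  std::printf("%#x\n", foldSimdPrefix(0x102)); // 0xfd0102
}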
defm "" : ARGUMENT<V128, v16i8>;
defm "" : ARGUMENT<V128, v8i16>;
defm "" : ARGUMENT<V128, v4i32>;
@@ -267,6 +284,16 @@ multiclass SIMDLoadZero<Vec vec, bits<32> simdop> {
defm "" : SIMDLoadZero<I32x4, 0x5c>;
defm "" : SIMDLoadZero<I64x2, 0x5d>;
+// Use load_zero to load scalars into vectors as well where possible.
+// TODO: i32, i16, and i8 scalars
+def load_scalar :
+ PatFrag<(ops node:$addr), (scalar_to_vector (i64 (load $addr)))>;
+defm : LoadPatNoOffset<v2i64, load_scalar, "LOAD_ZERO_I64x2">;
+defm : LoadPatImmOff<v2i64, load_scalar, regPlusImm, "LOAD_ZERO_I64x2">;
+defm : LoadPatImmOff<v2i64, load_scalar, or_is_add, "LOAD_ZERO_I64x2">;
+defm : LoadPatOffsetOnly<v2i64, load_scalar, "LOAD_ZERO_I64x2">;
+defm : LoadPatGlobalAddrOffOnly<v2i64, load_scalar, "LOAD_ZERO_I64x2">;
+
// TODO: f32x4 and f64x2 as well
foreach vec = [I32x4, I64x2] in {
defvar inst = "LOAD_ZERO_"#vec;
@@ -1165,6 +1192,16 @@ def : Pat<(vec.int_vt (vselect
(pmax $lhs, $rhs)>;
}
+// And match the pmin/pmax LLVM intrinsics as well
+def : Pat<(v4f32 (int_wasm_pmin (v4f32 V128:$lhs), (v4f32 V128:$rhs))),
+ (PMIN_F32x4 V128:$lhs, V128:$rhs)>;
+def : Pat<(v4f32 (int_wasm_pmax (v4f32 V128:$lhs), (v4f32 V128:$rhs))),
+ (PMAX_F32x4 V128:$lhs, V128:$rhs)>;
+def : Pat<(v2f64 (int_wasm_pmin (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
+ (PMIN_F64x2 V128:$lhs, V128:$rhs)>;
+def : Pat<(v2f64 (int_wasm_pmax (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
+ (PMAX_F64x2 V128:$lhs, V128:$rhs)>;
+
//===----------------------------------------------------------------------===//
// Conversions
//===----------------------------------------------------------------------===//
@@ -1241,87 +1278,6 @@ multiclass SIMDNarrow<Vec vec, bits<32> baseInst> {
defm "" : SIMDNarrow<I16x8, 101>;
defm "" : SIMDNarrow<I32x4, 133>;
-// Use narrowing operations for truncating stores. Since the narrowing
-// operations are saturating instead of truncating, we need to mask
-// the stored values first.
-def store_v8i8_trunc_v8i16 :
- OutPatFrag<(ops node:$val),
- (EXTRACT_LANE_I64x2
- (NARROW_U_I8x16
- (AND
- (CONST_V128_I16x8
- 0x00ff, 0x00ff, 0x00ff, 0x00ff,
- 0x00ff, 0x00ff, 0x00ff, 0x00ff),
- node:$val),
- $val), // Unused input
- 0)>;
-
-def store_v4i16_trunc_v4i32 :
- OutPatFrag<(ops node:$val),
- (EXTRACT_LANE_I64x2
- (NARROW_U_I16x8
- (AND
- (CONST_V128_I32x4
- 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff),
- node:$val),
- $val), // Unused input
- 0)>;
-
-// Store patterns adapted from WebAssemblyInstrMemory.td
-multiclass NarrowingStorePatNoOffset<Vec vec, OutPatFrag out> {
- defvar node = !cast<PatFrag>("truncstorevi"#vec.split.lane_bits);
- def : Pat<(node vec.vt:$val, I32:$addr),
- (STORE_I64_A32 0, 0, $addr, (out $val))>,
- Requires<[HasAddr32]>;
- def : Pat<(node vec.vt:$val, I64:$addr),
- (STORE_I64_A64 0, 0, $addr, (out $val))>,
- Requires<[HasAddr64]>;
-}
-
-defm : NarrowingStorePatNoOffset<I16x8, store_v8i8_trunc_v8i16>;
-defm : NarrowingStorePatNoOffset<I32x4, store_v4i16_trunc_v4i32>;
-
-multiclass NarrowingStorePatImmOff<Vec vec, PatFrag operand, OutPatFrag out> {
- defvar node = !cast<PatFrag>("truncstorevi"#vec.split.lane_bits);
- def : Pat<(node vec.vt:$val, (operand I32:$addr, imm:$off)),
- (STORE_I64_A32 0, imm:$off, $addr, (out $val))>,
- Requires<[HasAddr32]>;
- def : Pat<(node vec.vt:$val, (operand I64:$addr, imm:$off)),
- (STORE_I64_A64 0, imm:$off, $addr, (out $val))>,
- Requires<[HasAddr64]>;
-}
-
-defm : NarrowingStorePatImmOff<I16x8, regPlusImm, store_v8i8_trunc_v8i16>;
-defm : NarrowingStorePatImmOff<I32x4, regPlusImm, store_v4i16_trunc_v4i32>;
-defm : NarrowingStorePatImmOff<I16x8, or_is_add, store_v8i8_trunc_v8i16>;
-defm : NarrowingStorePatImmOff<I32x4, or_is_add, store_v4i16_trunc_v4i32>;
-
-multiclass NarrowingStorePatOffsetOnly<Vec vec, OutPatFrag out> {
- defvar node = !cast<PatFrag>("truncstorevi"#vec.split.lane_bits);
- def : Pat<(node vec.vt:$val, imm:$off),
- (STORE_I64_A32 0, imm:$off, (CONST_I32 0), (out $val))>,
- Requires<[HasAddr32]>;
- def : Pat<(node vec.vt:$val, imm:$off),
- (STORE_I64_A64 0, imm:$off, (CONST_I64 0), (out $val))>,
- Requires<[HasAddr64]>;
-}
-
-defm : NarrowingStorePatOffsetOnly<I16x8, store_v8i8_trunc_v8i16>;
-defm : NarrowingStorePatOffsetOnly<I32x4, store_v4i16_trunc_v4i32>;
-
-multiclass NarrowingStorePatGlobalAddrOffOnly<Vec vec, OutPatFrag out> {
- defvar node = !cast<PatFrag>("truncstorevi"#vec.split.lane_bits);
- def : Pat<(node vec.vt:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE_I64_A32 0, tglobaladdr:$off, (CONST_I32 0), (out $val))>,
- Requires<[IsNotPIC, HasAddr32]>;
- def : Pat<(node vec.vt:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (STORE_I64_A64 0, tglobaladdr:$off, (CONST_I64 0), (out $val))>,
- Requires<[IsNotPIC, HasAddr64]>;
-}
-
-defm : NarrowingStorePatGlobalAddrOffOnly<I16x8, store_v8i8_trunc_v8i16>;
-defm : NarrowingStorePatGlobalAddrOffOnly<I32x4, store_v4i16_trunc_v4i32>;
-
// Bitcasts are nops
// Matching bitcast t1 to t1 causes strange errors, so avoid repeating types
foreach t1 = AllVecs in
@@ -1349,9 +1305,107 @@ def promote_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
def promote_low : SDNode<"WebAssemblyISD::PROMOTE_LOW", promote_t>;
defm "" : SIMDConvert<F64x2, F32x4, promote_low, "promote_low_f32x4", 0x5f>;
+// Lower extending loads to load64_zero + promote_low
+def extloadv2f32 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
+ let MemoryVT = v2f32;
+}
+// Adapted from the body of LoadPatNoOffset
+// TODO: other addressing patterns
+def : Pat<(v2f64 (extloadv2f32 (i32 I32:$addr))),
+ (promote_low_F64x2 (LOAD_ZERO_I64x2_A32 0, 0, I32:$addr))>,
+ Requires<[HasAddr32]>;
+def : Pat<(v2f64 (extloadv2f32 (i64 I64:$addr))),
+ (promote_low_F64x2 (LOAD_ZERO_I64x2_A64 0, 0, I64:$addr))>,
+ Requires<[HasAddr64]>;
+
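Whether the extending-load pattern actually appears depends on earlier IR transforms, but a typical source shape that can produce it (an assumption, shown only for illustration) is widening a pair of floats to doubles:

#include <cstddef>

// May be vectorized into a v2f32 extending load (fpext to v2f64), which the
// patterns above then select as v128.load64_zero + f64x2.promote_low_f32x4.
void widen2(const float *In, double *Out) {
  Out[0] = static_cast<double>(In[0]);
  Out[1] = static_cast<double>(In[1]);
}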
//===----------------------------------------------------------------------===//
// Saturating Rounding Q-Format Multiplication
//===----------------------------------------------------------------------===//
defm Q15MULR_SAT_S :
SIMDBinary<I16x8, int_wasm_q15mulr_sat_signed, "q15mulr_sat_s", 0x82>;
+
+//===----------------------------------------------------------------------===//
+// Fused Multiply-Add and Subtract (FMA/FMS)
+//===----------------------------------------------------------------------===//
+
+multiclass SIMDFM<Vec vec, bits<32> simdopA, bits<32> simdopS> {
+ defm FMA_#vec :
+ RELAXED_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
+ [(set (vec.vt V128:$dst), (int_wasm_fma
+ (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))],
+ vec.prefix#".fma\t$dst, $a, $b, $c", vec.prefix#".fma", simdopA>;
+ defm FMS_#vec :
+ RELAXED_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
+ [(set (vec.vt V128:$dst), (int_wasm_fms
+ (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))],
+ vec.prefix#".fms\t$dst, $a, $b, $c", vec.prefix#".fms", simdopS>;
+}
+
+defm "" : SIMDFM<F32x4, 0xaf, 0xb0>;
+defm "" : SIMDFM<F64x2, 0xcf, 0xd0>;
+
+//===----------------------------------------------------------------------===//
+// Laneselect
+//===----------------------------------------------------------------------===//
+
+multiclass SIMDLANESELECT<Vec vec, bits<32> op> {
+ defm LANESELECT_#vec :
+ RELAXED_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
+ [(set (vec.vt V128:$dst), (int_wasm_laneselect
+ (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))],
+ vec.prefix#".laneselect\t$dst, $a, $b, $c", vec.prefix#".laneselect", op>;
+}
+
+defm "" : SIMDLANESELECT<I8x16, 0xb2>;
+defm "" : SIMDLANESELECT<I16x8, 0xb3>;
+defm "" : SIMDLANESELECT<I32x4, 0xd2>;
+defm "" : SIMDLANESELECT<I64x2, 0xd3>;
+
+
+//===----------------------------------------------------------------------===//
+// Relaxed swizzle
+//===----------------------------------------------------------------------===//
+
+defm RELAXED_SWIZZLE :
+ RELAXED_I<(outs V128:$dst), (ins V128:$src, V128:$mask), (outs), (ins),
+ [(set (v16i8 V128:$dst),
+ (int_wasm_relaxed_swizzle (v16i8 V128:$src), (v16i8 V128:$mask)))],
+ "i8x16.relaxed_swizzle\t$dst, $src, $mask", "i8x16.relaxed_swizzle", 162>;
+
+//===----------------------------------------------------------------------===//
+// Relaxed floating-point min and max.
+//===----------------------------------------------------------------------===//
+
+multiclass SIMD_RELAXED_FMINMAX<Vec vec, bits<32> simdopMin, bits<32> simdopMax> {
+ defm RELAXED_FMIN_#vec :
+ RELAXED_I<(outs V128:$dst), (ins V128:$a, V128:$b), (outs), (ins),
+ [(set (vec.vt V128:$dst), (int_wasm_relaxed_min
+ (vec.vt V128:$a), (vec.vt V128:$b)))],
+ vec.prefix#".relaxed_min\t$dst, $a, $b", vec.prefix#".relaxed_min", simdopMin>;
+ defm RELAXED_FMAX_#vec :
+ RELAXED_I<(outs V128:$dst), (ins V128:$a, V128:$b), (outs), (ins),
+ [(set (vec.vt V128:$dst), (int_wasm_relaxed_max
+ (vec.vt V128:$a), (vec.vt V128:$b)))],
+ vec.prefix#".relaxed_max\t$dst, $a, $b", vec.prefix#".relaxed_max", simdopMax>;
+}
+
+defm "" : SIMD_RELAXED_FMINMAX<F32x4, 0xb4, 0xe2>;
+defm "" : SIMD_RELAXED_FMINMAX<F64x2, 0xd4, 0xee>;
+
+//===----------------------------------------------------------------------===//
+// Relaxed floating-point to int conversions
+//===----------------------------------------------------------------------===//
+
+multiclass SIMD_RELAXED_CONVERT<Vec vec, Vec arg, SDPatternOperator op, string name, bits<32> simdop> {
+ defm op#_#vec :
+ RELAXED_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins),
+ [(set (vec.vt V128:$dst), (vec.vt (op (arg.vt V128:$vec))))],
+ vec.prefix#"."#name#"\t$dst, $vec", vec.prefix#"."#name, simdop>;
+}
+
+defm "" : SIMD_RELAXED_CONVERT<I32x4, F32x4, int_wasm_relaxed_trunc_signed, "relaxed_trunc_f32x4_s", 0xa5>;
+defm "" : SIMD_RELAXED_CONVERT<I32x4, F32x4, int_wasm_relaxed_trunc_unsigned, "relaxed_trunc_f32x4_u", 0xa6>;
+
+defm "" : SIMD_RELAXED_CONVERT<I32x4, F64x2, int_wasm_relaxed_trunc_zero_signed, "relaxed_trunc_f64x2_s_zero", 0xc5>;
+defm "" : SIMD_RELAXED_CONVERT<I32x4, F64x2, int_wasm_relaxed_trunc_zero_unsigned, "relaxed_trunc_f64x2_u_zero", 0xc6>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td
index 2348bb165daf..e44c2073eaeb 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td
@@ -11,9 +11,18 @@
/// Instructions that handle tables
//===----------------------------------------------------------------------===//
-multiclass TABLE<WebAssemblyRegClass rt> {
+def WebAssemblyTableSet_t : SDTypeProfile<0, 3, [SDTCisPtrTy<1>]>;
+def WebAssemblyTableSet : SDNode<"WebAssemblyISD::TABLE_SET", WebAssemblyTableSet_t,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def WebAssemblyTableGet_t : SDTypeProfile<1, 2, [SDTCisPtrTy<1>]>;
+def WebAssemblyTableGet : SDNode<"WebAssemblyISD::TABLE_GET", WebAssemblyTableGet_t,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+
+multiclass TABLE<WebAssemblyRegClass rc> {
let mayLoad = 1 in
- defm TABLE_GET_#rt : I<(outs rt:$res), (ins table32_op:$table, I32:$i),
+ defm TABLE_GET_#rc : I<(outs rc:$res), (ins table32_op:$table, I32:$i),
(outs), (ins table32_op:$table),
[],
"table.get\t$res, $table, $i",
@@ -21,41 +30,43 @@ multiclass TABLE<WebAssemblyRegClass rt> {
0x25>;
let mayStore = 1 in
- defm TABLE_SET_#rt : I<(outs), (ins table32_op:$table, I32:$i, rt:$val),
+ defm TABLE_SET_#rc : I<(outs), (ins table32_op:$table, I32:$i, rc:$val),
(outs), (ins table32_op:$table),
[],
"table.set\t$table, $i, $val",
"table.set\t$table",
0x26>;
- defm TABLE_GROW_#rt : I<(outs I32:$sz), (ins table32_op:$table, rt:$val, I32:$n),
+ defm TABLE_GROW_#rc : I<(outs I32:$sz), (ins table32_op:$table, rc:$val, I32:$n),
(outs), (ins table32_op:$table),
[],
"table.grow\t$sz, $table, $val, $n",
"table.grow\t$table",
0xfc0f>;
- defm TABLE_FILL_#rt : I<(outs), (ins table32_op:$table, I32:$i, rt:$val, I32:$n),
+ defm TABLE_FILL_#rc : I<(outs), (ins table32_op:$table, I32:$i, rc:$val, I32:$n),
(outs), (ins table32_op:$table),
[],
"table.fill\t$table, $i, $val, $n",
"table.fill\t$table",
0xfc11>;
+ foreach vt = rc.RegTypes in {
+ def : Pat<(vt (WebAssemblyTableGet (WebAssemblyWrapper tglobaladdr:$table), i32:$idx)),
+ (!cast<NI>("TABLE_GET_" # rc) tglobaladdr:$table, i32:$idx)>;
+ def : Pat<(WebAssemblyTableSet
+ (WebAssemblyWrapper tglobaladdr:$table),
+ i32:$idx,
+ vt:$src),
+ (!cast<NI>("TABLE_SET_" # rc) tglobaladdr:$table, i32:$idx, vt:$src)>;
+ }
}
defm "" : TABLE<FUNCREF>, Requires<[HasReferenceTypes]>;
defm "" : TABLE<EXTERNREF>, Requires<[HasReferenceTypes]>;
-def wasm_table_set_t : SDTypeProfile<0, 3, []>;
-def wasm_table_set : SDNode<"WebAssemblyISD::TABLE_SET", wasm_table_set_t,
- [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
-
-def : Pat<(wasm_table_set i32:$table, i32:$idx, funcref:$r),
- (TABLE_SET_FUNCREF i32:$table, i32:$idx, funcref:$r)>,
- Requires<[HasReferenceTypes]>;
-def : Pat<(wasm_table_set i32:$table, i32:$idx, externref:$r),
- (TABLE_SET_EXTERNREF i32:$table, i32:$idx, externref:$r)>,
+def : Pat<(WebAssemblyTableSet mcsym:$table, i32:$idx, funcref:$r),
+ (TABLE_SET_FUNCREF mcsym:$table, i32:$idx, funcref:$r)>,
Requires<[HasReferenceTypes]>;
defm TABLE_SIZE : I<(outs I32:$sz), (ins table32_op:$table),
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
index 01b3aa887738..52226206eb32 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
@@ -63,12 +63,11 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
auto &MRI = MF.getRegInfo();
for (auto &MBB : MF) {
- for (auto MII = MBB.begin(); MII != MBB.end();) {
- MachineInstr *MI = &*MII++;
- if (MI->getOpcode() != WebAssembly::BR_UNLESS)
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+ if (MI.getOpcode() != WebAssembly::BR_UNLESS)
continue;
- Register Cond = MI->getOperand(1).getReg();
+ Register Cond = MI.getOperand(1).getReg();
bool Inverted = false;
// Attempt to invert the condition in place.
@@ -189,7 +188,7 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
// instruction to invert it.
if (!Inverted) {
Register Tmp = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
- BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::EQZ_I32), Tmp)
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII.get(WebAssembly::EQZ_I32), Tmp)
.addReg(Cond);
MFI.stackifyVReg(MRI, Tmp);
Cond = Tmp;
@@ -199,10 +198,10 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
// The br_unless condition has now been inverted. Insert a br_if and
// delete the br_unless.
assert(Inverted);
- BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::BR_IF))
- .add(MI->getOperand(0))
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII.get(WebAssembly::BR_IF))
+ .add(MI.getOperand(0))
.addReg(Cond);
- MBB.erase(MI);
+ MBB.erase(&MI);
}
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 599829a9e474..4eacc921b6cd 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -7,15 +7,12 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// This file lowers exception-related instructions and setjmp/longjmp
-/// function calls in order to use Emscripten's JavaScript try and catch
-/// mechanism.
+/// This file lowers exception-related instructions and setjmp/longjmp function
+/// calls to use Emscripten's library functions. The pass uses JavaScript's try
+/// and catch mechanism in the case of Emscripten EH/SjLj, and Wasm EH intrinsics in
+/// the case of Wasm SjLj.
///
-/// To handle exceptions and setjmp/longjmps, this scheme relies on JavaScript's
-/// try and catch syntax and relevant exception-related libraries implemented
-/// in JavaScript glue code that will be produced by Emscripten.
-///
-/// * Exception handling
+/// * Emscripten exception handling
/// This pass lowers invokes and landingpads into library functions in JS glue
/// code. Invokes are lowered into function wrappers called invoke wrappers that
/// exist in JS side, which wraps the original function call with JS try-catch.
@@ -23,7 +20,7 @@
/// variables (see below) so we can check whether an exception occurred from
/// wasm code and handle it appropriately.
///
-/// * Setjmp-longjmp handling
+/// * Emscripten setjmp-longjmp handling
/// This pass lowers setjmp to a reasonably-performant approach for emscripten.
/// The idea is that each block with a setjmp is broken up into two parts: the
/// part containing setjmp and the part right after the setjmp. The latter part
@@ -52,7 +49,7 @@
/// __threwValue is 0 for exceptions, and the argument to longjmp in case of
/// longjmp.
///
-/// * Exception handling
+/// * Emscripten exception handling
///
/// 2) We assume the existence of setThrew and setTempRet0/getTempRet0 functions
/// at link time. setThrew exists in Emscripten's compiler-rt:
@@ -121,16 +118,16 @@
/// call @llvm_eh_typeid_for(type)
/// llvm_eh_typeid_for function will be generated in JS glue code.
///
-/// * Setjmp / Longjmp handling
+/// * Emscripten setjmp / longjmp handling
///
-/// In case calls to longjmp() exists
+/// If there are calls to longjmp()
///
/// 1) Lower
-/// longjmp(buf, value)
+/// longjmp(env, val)
/// into
-/// emscripten_longjmp(buf, value)
+/// emscripten_longjmp(env, val)
///
-/// In case calls to setjmp() exists
+/// If there are calls to setjmp()
///
/// 2) In the function entry that calls setjmp, initialize setjmpTable and
/// setjmpTableSize as follows:
@@ -141,9 +138,9 @@
/// Emscripten compiler-rt.
///
/// 3) Lower
-/// setjmp(buf)
+/// setjmp(env)
/// into
-/// setjmpTable = saveSetjmp(buf, label, setjmpTable, setjmpTableSize);
+/// setjmpTable = saveSetjmp(env, label, setjmpTable, setjmpTableSize);
/// setjmpTableSize = getTempRet0();
/// For each dynamic setjmp call, setjmpTable stores its ID (a number which
/// is incrementally assigned from 0) and its label (a unique number that
@@ -151,10 +148,9 @@
/// setjmpTable, it is reallocated in saveSetjmp() in Emscripten's
/// compiler-rt and it will return the new table address, and assign the new
/// table size in setTempRet0(). saveSetjmp also stores the setjmp's ID into
-/// the buffer buf. A BB with setjmp is split into two after setjmp call in
+/// the buffer 'env'. A BB with setjmp is split into two after the setjmp call in
/// order to make the post-setjmp BB the possible destination of longjmp BB.
///
-///
/// 4) Lower every call that might longjmp into
/// __THREW__ = 0;
/// call @__invoke_SIG(func, arg1, arg2)
@@ -171,7 +167,7 @@
/// %label = -1;
/// }
/// longjmp_result = getTempRet0();
-/// switch label {
+/// switch %label {
/// label 1: goto post-setjmp BB 1
/// label 2: goto post-setjmp BB 2
/// ...
@@ -188,23 +184,114 @@
/// occurred. Otherwise we jump to the right post-setjmp BB based on the
/// label.
///
+/// * Wasm setjmp / longjmp handling
+/// This mode still uses some Emscripten library functions but not JavaScript's
+/// try-catch mechanism. It instead uses Wasm exception handling intrinsics,
+/// which will be lowered to exception handling instructions.
+///
+/// If there are calls to longjmp()
+///
+/// 1) Lower
+/// longjmp(env, val)
+/// into
+/// __wasm_longjmp(env, val)
+///
+/// If there are calls to setjmp()
+///
+/// 2) and 3): The same as 2) and 3) in Emscripten SjLj.
+/// (setjmpTable/setjmpTableSize initialization + setjmp callsite
+/// transformation)
+///
+/// 4) Create a catchpad with a wasm.catch() intrinsic, which returns the value
+/// thrown by the __wasm_longjmp function. In the Emscripten library, we have this
+/// struct:
+///
+/// struct __WasmLongjmpArgs {
+/// void *env;
+/// int val;
+/// };
+/// struct __WasmLongjmpArgs __wasm_longjmp_args;
+///
+/// The thrown value here is a pointer to a __wasm_longjmp_args struct object. We
+/// use this struct to transfer two values by throwing a single value. Wasm
+/// throw and catch instructions are capable of throwing and catching multiple
+/// values, but that also requires multivalue support, which is currently not
+/// very reliable.
+/// TODO Switch to throwing and catching two values without using the struct
+///
+/// All longjmpable function calls will be converted to an invoke that will
+/// unwind to this catchpad in case a longjmp occurs. Within the catchpad, we
+/// test the thrown values using the testSetjmp function, as we do for Emscripten
+/// SjLj. The main difference is that in Emscripten SjLj we need to transform
+/// every longjmpable callsite into a sequence of code including a testSetjmp()
+/// call; in Wasm SjLj we do the testing in only one place, in this catchpad.
+///
+/// After calling testSetjmp(), if the longjmp does not correspond to one of
+/// the setjmps within the current function, we rethrow the longjmp by calling
+/// __wasm_longjmp(). If it corresponds to one of the setjmps in the
+/// function, we jump to the beginning of the function, which contains a switch
+/// to each post-setjmp BB. Again, in Emscripten SjLj, this switch is added for
+/// every longjmpable callsite; in Wasm SjLj we do this only once at the top of
+/// the function (after setjmpTable/setjmpTableSize initialization).
+///
+/// Below is the pseudocode for what we have described:
+///
+/// entry:
+/// Initialize setjmpTable and setjmpTableSize
+///
+/// setjmp.dispatch:
+/// switch %label {
+/// label 1: goto post-setjmp BB 1
+/// label 2: goto post-setjmp BB 2
+/// ...
+/// default: goto split next BB
+/// }
+/// ...
+///
+/// bb:
+/// invoke void @foo() ;; foo is a longjmpable function
+/// to label %next unwind label %catch.dispatch.longjmp
+/// ...
+///
+/// catch.dispatch.longjmp:
+/// %0 = catchswitch within none [label %catch.longjmp] unwind to caller
+///
+/// catch.longjmp:
+/// %longjmp.args = wasm.catch() ;; struct __WasmLongjmpArgs
+/// %env = load 'env' field from __WasmLongjmpArgs
+/// %val = load 'val' field from __WasmLongjmpArgs
+/// %label = testSetjmp(mem[%env], setjmpTable, setjmpTableSize);
+/// if (%label == 0)
+/// __wasm_longjmp(%env, %val)
+/// catchret to %setjmp.dispatch
+///
///===----------------------------------------------------------------------===//
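For orientation, the snippet below shows the kind of source-level setjmp/longjmp use that ends up in the shapes described above; it is illustrative only (the pass operates on IR, not source), and the comments map each construct to the Wasm SjLj lowering:

#include <csetjmp>
#include <cstdio>

static std::jmp_buf Env;

static void MayLongjmp(bool Fail) {
  if (Fail)
    std::longjmp(Env, 1); // becomes a call to __wasm_longjmp(env, val)
}

int main() {
  if (setjmp(Env)) {  // becomes saveSetjmp(...) plus the setjmp.dispatch switch
    std::puts("came back through longjmp");
    return 0;
  }
  MayLongjmp(true);   // longjmpable call: turned into an invoke that unwinds
                      // to catch.dispatch.longjmp
  return 1;
}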
#include "WebAssembly.h"
#include "WebAssemblyTargetMachine.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/WasmEHFuncInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicsWebAssembly.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/SSAUpdaterBulk.h"
using namespace llvm;
#define DEBUG_TYPE "wasm-lower-em-ehsjlj"
+// Emscripten's asm.js-style exception handling
+extern cl::opt<bool> WasmEnableEmEH;
+// Emscripten's asm.js-style setjmp/longjmp handling
+extern cl::opt<bool> WasmEnableEmSjLj;
+// Wasm setjmp/longjmp handling using wasm EH instructions
+extern cl::opt<bool> WasmEnableSjLj;
+
static cl::list<std::string>
EHAllowlist("emscripten-cxx-exceptions-allowed",
cl::desc("The list of function names in which Emscripten-style "
@@ -214,19 +301,25 @@ static cl::list<std::string>
namespace {
class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass {
- bool EnableEH; // Enable exception handling
- bool EnableSjLj; // Enable setjmp/longjmp handling
- bool DoSjLj; // Whether we actually perform setjmp/longjmp handling
-
- GlobalVariable *ThrewGV = nullptr;
- GlobalVariable *ThrewValueGV = nullptr;
- Function *GetTempRet0Func = nullptr;
- Function *SetTempRet0Func = nullptr;
- Function *ResumeF = nullptr;
- Function *EHTypeIDF = nullptr;
- Function *EmLongjmpF = nullptr;
- Function *SaveSetjmpF = nullptr;
- Function *TestSetjmpF = nullptr;
+ bool EnableEmEH; // Enable Emscripten exception handling
+ bool EnableEmSjLj; // Enable Emscripten setjmp/longjmp handling
+ bool EnableWasmSjLj; // Enable Wasm setjmp/longjmp handling
+ bool DoSjLj; // Whether we actually perform setjmp/longjmp handling
+
+ GlobalVariable *ThrewGV = nullptr; // __THREW__ (Emscripten)
+ GlobalVariable *ThrewValueGV = nullptr; // __threwValue (Emscripten)
+ Function *GetTempRet0F = nullptr; // getTempRet0() (Emscripten)
+ Function *SetTempRet0F = nullptr; // setTempRet0() (Emscripten)
+ Function *ResumeF = nullptr; // __resumeException() (Emscripten)
+ Function *EHTypeIDF = nullptr; // llvm.eh.typeid.for() (intrinsic)
+ Function *EmLongjmpF = nullptr; // emscripten_longjmp() (Emscripten)
+ Function *SaveSetjmpF = nullptr; // saveSetjmp() (Emscripten)
+ Function *TestSetjmpF = nullptr; // testSetjmp() (Emscripten)
+ Function *WasmLongjmpF = nullptr; // __wasm_longjmp() (Emscripten)
+ Function *CatchF = nullptr; // wasm.catch() (intrinsic)
+
+ // type of 'struct __WasmLongjmpArgs' defined in emscripten
+ Type *LongjmpArgsTy = nullptr;
// __cxa_find_matching_catch_N functions.
// Indexed by the number of clauses in an original landingpad instruction.
@@ -242,31 +335,47 @@ class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass {
return "WebAssembly Lower Emscripten Exceptions";
}
+ using InstVector = SmallVectorImpl<Instruction *>;
bool runEHOnFunction(Function &F);
bool runSjLjOnFunction(Function &F);
+ void handleLongjmpableCallsForEmscriptenSjLj(
+ Function &F, InstVector &SetjmpTableInsts,
+ InstVector &SetjmpTableSizeInsts,
+ SmallVectorImpl<PHINode *> &SetjmpRetPHIs);
+ void
+ handleLongjmpableCallsForWasmSjLj(Function &F, InstVector &SetjmpTableInsts,
+ InstVector &SetjmpTableSizeInsts,
+ SmallVectorImpl<PHINode *> &SetjmpRetPHIs);
Function *getFindMatchingCatch(Module &M, unsigned NumClauses);
Value *wrapInvoke(CallBase *CI);
void wrapTestSetjmp(BasicBlock *BB, DebugLoc DL, Value *Threw,
Value *SetjmpTable, Value *SetjmpTableSize, Value *&Label,
- Value *&LongjmpResult, BasicBlock *&EndBB);
+ Value *&LongjmpResult, BasicBlock *&CallEmLongjmpBB,
+ PHINode *&CallEmLongjmpBBThrewPHI,
+ PHINode *&CallEmLongjmpBBThrewValuePHI,
+ BasicBlock *&EndBB);
Function *getInvokeWrapper(CallBase *CI);
bool areAllExceptionsAllowed() const { return EHAllowlistSet.empty(); }
- bool canLongjmp(Module &M, const Value *Callee) const;
- bool isEmAsmCall(Module &M, const Value *Callee) const;
bool supportsException(const Function *F) const {
- return EnableEH && (areAllExceptionsAllowed() ||
- EHAllowlistSet.count(std::string(F->getName())));
+ return EnableEmEH && (areAllExceptionsAllowed() ||
+ EHAllowlistSet.count(std::string(F->getName())));
}
+ void replaceLongjmpWith(Function *LongjmpF, Function *NewF);
void rebuildSSA(Function &F);
public:
static char ID;
- WebAssemblyLowerEmscriptenEHSjLj(bool EnableEH = true, bool EnableSjLj = true)
- : ModulePass(ID), EnableEH(EnableEH), EnableSjLj(EnableSjLj) {
+ WebAssemblyLowerEmscriptenEHSjLj()
+ : ModulePass(ID), EnableEmEH(WasmEnableEmEH),
+ EnableEmSjLj(WasmEnableEmSjLj), EnableWasmSjLj(WasmEnableSjLj) {
+ assert(!(EnableEmSjLj && EnableWasmSjLj) &&
+ "Two SjLj modes cannot be turned on at the same time");
+ assert(!(EnableEmEH && EnableWasmSjLj) &&
+ "Wasm SjLj should be only used with Wasm EH");
EHAllowlistSet.insert(EHAllowlist.begin(), EHAllowlist.end());
}
bool runOnModule(Module &M) override;
@@ -282,9 +391,8 @@ INITIALIZE_PASS(WebAssemblyLowerEmscriptenEHSjLj, DEBUG_TYPE,
"WebAssembly Lower Emscripten Exceptions / Setjmp / Longjmp",
false, false)
-ModulePass *llvm::createWebAssemblyLowerEmscriptenEHSjLj(bool EnableEH,
- bool EnableSjLj) {
- return new WebAssemblyLowerEmscriptenEHSjLj(EnableEH, EnableSjLj);
+ModulePass *llvm::createWebAssemblyLowerEmscriptenEHSjLj() {
+ return new WebAssemblyLowerEmscriptenEHSjLj();
}
static bool canThrow(const Value *V) {
@@ -353,12 +461,12 @@ static Function *getEmscriptenFunction(FunctionType *Ty, const Twine &Name,
if (!F->hasFnAttribute("wasm-import-module")) {
llvm::AttrBuilder B;
B.addAttribute("wasm-import-module", "env");
- F->addAttributes(llvm::AttributeList::FunctionIndex, B);
+ F->addFnAttrs(B);
}
if (!F->hasFnAttribute("wasm-import-name")) {
llvm::AttrBuilder B;
B.addAttribute("wasm-import-name", F->getName());
- F->addAttributes(llvm::AttributeList::FunctionIndex, B);
+ F->addFnAttrs(B);
}
return F;
}
@@ -415,15 +523,6 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallBase *CI) {
Module *M = CI->getModule();
LLVMContext &C = M->getContext();
- // If we are calling a function that is noreturn, we must remove that
- // attribute. The code we insert here does expect it to return, after we
- // catch the exception.
- if (CI->doesNotReturn()) {
- if (auto *F = CI->getCalledFunction())
- F->removeFnAttr(Attribute::NoReturn);
- CI->removeAttribute(AttributeList::FunctionIndex, Attribute::NoReturn);
- }
-
IRBuilder<> IRB(C);
IRB.SetInsertPoint(CI);
@@ -450,10 +549,10 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallBase *CI) {
// No attributes for the callee pointer.
ArgAttributes.push_back(AttributeSet());
// Copy the argument attributes from the original
- for (unsigned I = 0, E = CI->getNumArgOperands(); I < E; ++I)
- ArgAttributes.push_back(InvokeAL.getParamAttributes(I));
+ for (unsigned I = 0, E = CI->arg_size(); I < E; ++I)
+ ArgAttributes.push_back(InvokeAL.getParamAttrs(I));
- AttrBuilder FnAttrs(InvokeAL.getFnAttributes());
+ AttrBuilder FnAttrs(InvokeAL.getFnAttrs());
if (FnAttrs.contains(Attribute::AllocSize)) {
// The allocsize attribute (if any) referes to parameters by index and needs
// to be adjusted.
@@ -467,9 +566,8 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallBase *CI) {
}
// Reconstruct the AttributesList based on the vector we constructed.
- AttributeList NewCallAL =
- AttributeList::get(C, AttributeSet::get(C, FnAttrs),
- InvokeAL.getRetAttributes(), ArgAttributes);
+ AttributeList NewCallAL = AttributeList::get(
+ C, AttributeSet::get(C, FnAttrs), InvokeAL.getRetAttrs(), ArgAttributes);
NewCall->setAttributes(NewCallAL);
CI->replaceAllUsesWith(NewCall);
@@ -504,8 +602,7 @@ Function *WebAssemblyLowerEmscriptenEHSjLj::getInvokeWrapper(CallBase *CI) {
return F;
}
-bool WebAssemblyLowerEmscriptenEHSjLj::canLongjmp(Module &M,
- const Value *Callee) const {
+static bool canLongjmp(const Value *Callee) {
if (auto *CalleeF = dyn_cast<Function>(Callee))
if (CalleeF->isIntrinsic())
return false;
@@ -543,8 +640,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::canLongjmp(Module &M,
return true;
}
-bool WebAssemblyLowerEmscriptenEHSjLj::isEmAsmCall(Module &M,
- const Value *Callee) const {
+static bool isEmAsmCall(const Value *Callee) {
StringRef CalleeName = Callee->getName();
// This is an exhaustive list from Emscripten's <emscripten/em_asm.h>.
return CalleeName == "emscripten_asm_const_int" ||
@@ -558,7 +654,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::isEmAsmCall(Module &M,
// The code this generates is equivalent to the following JavaScript code:
// %__threwValue.val = __threwValue;
// if (%__THREW__.val != 0 & %__threwValue.val != 0) {
-// %label = _testSetjmp(mem[%__THREW__.val], setjmpTable, setjmpTableSize);
+// %label = testSetjmp(mem[%__THREW__.val], setjmpTable, setjmpTableSize);
// if (%label == 0)
// emscripten_longjmp(%__THREW__.val, %__threwValue.val);
// setTempRet0(%__threwValue.val);
@@ -572,7 +668,8 @@ bool WebAssemblyLowerEmscriptenEHSjLj::isEmAsmCall(Module &M,
void WebAssemblyLowerEmscriptenEHSjLj::wrapTestSetjmp(
BasicBlock *BB, DebugLoc DL, Value *Threw, Value *SetjmpTable,
Value *SetjmpTableSize, Value *&Label, Value *&LongjmpResult,
- BasicBlock *&EndBB) {
+ BasicBlock *&CallEmLongjmpBB, PHINode *&CallEmLongjmpBBThrewPHI,
+ PHINode *&CallEmLongjmpBBThrewValuePHI, BasicBlock *&EndBB) {
Function *F = BB->getParent();
Module *M = F->getParent();
LLVMContext &C = M->getContext();
@@ -591,10 +688,27 @@ void WebAssemblyLowerEmscriptenEHSjLj::wrapTestSetjmp(
Value *Cmp1 = IRB.CreateAnd(ThrewCmp, ThrewValueCmp, "cmp1");
IRB.CreateCondBr(Cmp1, ThenBB1, ElseBB1);
- // %label = _testSetjmp(mem[%__THREW__.val], _setjmpTable, _setjmpTableSize);
+ // Generate call.em.longjmp BB once and share it within the function
+ if (!CallEmLongjmpBB) {
+ // emscripten_longjmp(%__THREW__.val, %__threwValue.val);
+ CallEmLongjmpBB = BasicBlock::Create(C, "call.em.longjmp", F);
+ IRB.SetInsertPoint(CallEmLongjmpBB);
+ CallEmLongjmpBBThrewPHI = IRB.CreatePHI(getAddrIntType(M), 4, "threw.phi");
+ CallEmLongjmpBBThrewValuePHI =
+ IRB.CreatePHI(IRB.getInt32Ty(), 4, "threwvalue.phi");
+ CallEmLongjmpBBThrewPHI->addIncoming(Threw, ThenBB1);
+ CallEmLongjmpBBThrewValuePHI->addIncoming(ThrewValue, ThenBB1);
+ IRB.CreateCall(EmLongjmpF,
+ {CallEmLongjmpBBThrewPHI, CallEmLongjmpBBThrewValuePHI});
+ IRB.CreateUnreachable();
+ } else {
+ CallEmLongjmpBBThrewPHI->addIncoming(Threw, ThenBB1);
+ CallEmLongjmpBBThrewValuePHI->addIncoming(ThrewValue, ThenBB1);
+ }
+
+ // %label = testSetjmp(mem[%__THREW__.val], setjmpTable, setjmpTableSize);
// if (%label == 0)
IRB.SetInsertPoint(ThenBB1);
- BasicBlock *ThenBB2 = BasicBlock::Create(C, "if.then2", F);
BasicBlock *EndBB2 = BasicBlock::Create(C, "if.end2", F);
Value *ThrewPtr =
IRB.CreateIntToPtr(Threw, getAddrPtrType(M), Threw->getName() + ".p");
@@ -603,16 +717,11 @@ void WebAssemblyLowerEmscriptenEHSjLj::wrapTestSetjmp(
Value *ThenLabel = IRB.CreateCall(
TestSetjmpF, {LoadedThrew, SetjmpTable, SetjmpTableSize}, "label");
Value *Cmp2 = IRB.CreateICmpEQ(ThenLabel, IRB.getInt32(0));
- IRB.CreateCondBr(Cmp2, ThenBB2, EndBB2);
-
- // emscripten_longjmp(%__THREW__.val, %__threwValue.val);
- IRB.SetInsertPoint(ThenBB2);
- IRB.CreateCall(EmLongjmpF, {Threw, ThrewValue});
- IRB.CreateUnreachable();
+ IRB.CreateCondBr(Cmp2, CallEmLongjmpBB, EndBB2);
// setTempRet0(%__threwValue.val);
IRB.SetInsertPoint(EndBB2);
- IRB.CreateCall(SetTempRet0Func, ThrewValue);
+ IRB.CreateCall(SetTempRet0F, ThrewValue);
IRB.CreateBr(EndBB1);
IRB.SetInsertPoint(ElseBB1);
@@ -628,53 +737,67 @@ void WebAssemblyLowerEmscriptenEHSjLj::wrapTestSetjmp(
// Output parameter assignment
Label = LabelPHI;
EndBB = EndBB1;
- LongjmpResult = IRB.CreateCall(GetTempRet0Func, None, "longjmp_result");
+ LongjmpResult = IRB.CreateCall(GetTempRet0F, None, "longjmp_result");
}
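
To make the control flow that wrapTestSetjmp() emits easier to follow, here is a minimal standalone C++ sketch of the same logic; it is not part of the patch. The testSetjmp, emscripten_longjmp, setTempRet0, and getTempRet0 stubs below are stand-ins for the Emscripten runtime helpers, and the real pass additionally loads the setjmp id from the address held in __THREW__ rather than using the raw value as this sketch does.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Stand-ins for the Emscripten runtime helpers; the real ones live in
// Emscripten's support library, not in this sketch.
static int TempRet0 = 0;
static void setTempRet0(int V) { TempRet0 = V; }
static int getTempRet0() { return TempRet0; }
static int testSetjmp(uintptr_t SetjmpId, int *Table, int Size) {
  (void)Table;
  (void)Size;
  return SetjmpId == 42 ? 1 : 0; // pretend id 42 is one of our setjmps
}
[[noreturn]] static void emscripten_longjmp(uintptr_t Env, int Val) {
  std::printf("rethrowing longjmp(env=%zu, val=%d)\n", (size_t)Env, Val);
  std::exit(0);
}

// Same branch structure as the IR built above: only when both __THREW__ and
// __threwValue are nonzero do we consult testSetjmp(); label 0 means the
// longjmp belongs to a caller and is rethrown via emscripten_longjmp().
static int checkCallResult(uintptr_t Threw, int ThrewValue, int *SetjmpTable,
                           int SetjmpTableSize, int &Label) {
  if (Threw != 0 && ThrewValue != 0) {
    Label = testSetjmp(/*mem[Threw]=*/Threw, SetjmpTable, SetjmpTableSize);
    if (Label == 0)
      emscripten_longjmp(Threw, ThrewValue);
    setTempRet0(ThrewValue);
  } else {
    Label = -1; // no longjmp: continue normally
  }
  return getTempRet0(); // the "longjmp_result" output parameter
}

int main() {
  int Table[4] = {};
  int Label = 0;
  int Result = checkCallResult(/*Threw=*/42, /*ThrewValue=*/7, Table, 4, Label);
  std::printf("label=%d longjmp_result=%d\n", Label, Result);
}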
void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) {
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
DT.recalculate(F); // CFG has been changed
- SSAUpdater SSA;
+
+ SSAUpdaterBulk SSA;
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
- SSA.Initialize(I.getType(), I.getName());
- SSA.AddAvailableValue(&BB, &I);
- for (auto UI = I.use_begin(), UE = I.use_end(); UI != UE;) {
- Use &U = *UI;
- ++UI;
+ unsigned VarID = SSA.AddVariable(I.getName(), I.getType());
+ // If a value is defined by an invoke instruction, it is only available in
+ // its normal destination and not in its unwind destination.
+ if (auto *II = dyn_cast<InvokeInst>(&I))
+ SSA.AddAvailableValue(VarID, II->getNormalDest(), II);
+ else
+ SSA.AddAvailableValue(VarID, &BB, &I);
+ for (auto &U : I.uses()) {
auto *User = cast<Instruction>(U.getUser());
if (auto *UserPN = dyn_cast<PHINode>(User))
if (UserPN->getIncomingBlock(U) == &BB)
continue;
-
if (DT.dominates(&I, User))
continue;
- SSA.RewriteUseAfterInsertions(U);
+ SSA.AddUse(VarID, &U);
}
}
}
+ SSA.RewriteAllUses(&DT);
}
-// Replace uses of longjmp with emscripten_longjmp. emscripten_longjmp takes
-// arguments of type {i32, i32} (wasm32) / {i64, i32} (wasm64) and longjmp takes
-// {jmp_buf*, i32}, so we need a ptrtoint instruction here to make the type
-// match. jmp_buf* will eventually be lowered to i32 in the wasm backend.
-static void replaceLongjmpWithEmscriptenLongjmp(Function *LongjmpF,
- Function *EmLongjmpF) {
+// Replace uses of longjmp with a new longjmp function in the Emscripten
+// library. In Emscripten SjLj, the new function is
+// void emscripten_longjmp(uintptr_t, i32)
+// In Wasm SjLj, the new function is
+// void __wasm_longjmp(i8*, i32)
+// Because the original libc longjmp function takes (jmp_buf*, i32), we need a
+// ptrtoint/bitcast instruction here to make the type match. jmp_buf* will
+// eventually be lowered to i32/i64 in the wasm backend.
+void WebAssemblyLowerEmscriptenEHSjLj::replaceLongjmpWith(Function *LongjmpF,
+ Function *NewF) {
+ assert(NewF == EmLongjmpF || NewF == WasmLongjmpF);
Module *M = LongjmpF->getParent();
SmallVector<CallInst *, 8> ToErase;
LLVMContext &C = LongjmpF->getParent()->getContext();
IRBuilder<> IRB(C);
- // For calls to longjmp, replace it with emscripten_longjmp and cast its first
- // argument (jmp_buf*) to int
+ // For calls to longjmp, replace it with emscripten_longjmp/__wasm_longjmp and
+ // cast its first argument (jmp_buf*) appropriately
for (User *U : LongjmpF->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (CI && CI->getCalledFunction() == LongjmpF) {
IRB.SetInsertPoint(CI);
- Value *Jmpbuf =
- IRB.CreatePtrToInt(CI->getArgOperand(0), getAddrIntType(M), "jmpbuf");
- IRB.CreateCall(EmLongjmpF, {Jmpbuf, CI->getArgOperand(1)});
+ Value *Env = nullptr;
+ if (NewF == EmLongjmpF)
+ Env =
+ IRB.CreatePtrToInt(CI->getArgOperand(0), getAddrIntType(M), "env");
+ else // WasmLongjmpF
+ Env =
+ IRB.CreateBitCast(CI->getArgOperand(0), IRB.getInt8PtrTy(), "env");
+ IRB.CreateCall(NewF, {Env, CI->getArgOperand(1)});
ToErase.push_back(CI);
}
}
@@ -682,14 +805,23 @@ static void replaceLongjmpWithEmscriptenLongjmp(Function *LongjmpF,
I->eraseFromParent();
// If we have any remaining uses of longjmp's function pointer, replace it
- // with (int(*)(jmp_buf*, int))emscripten_longjmp.
+ // with (void(*)(jmp_buf*, int))emscripten_longjmp / __wasm_longjmp.
if (!LongjmpF->uses().empty()) {
- Value *EmLongjmp =
- IRB.CreateBitCast(EmLongjmpF, LongjmpF->getType(), "em_longjmp");
- LongjmpF->replaceAllUsesWith(EmLongjmp);
+ Value *NewLongjmp =
+ IRB.CreateBitCast(NewF, LongjmpF->getType(), "longjmp.cast");
+ LongjmpF->replaceAllUsesWith(NewLongjmp);
}
}
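
As a concrete illustration of the rewrite described in the comment above replaceLongjmpWith(), the following standalone C++ sketch shows the source-level equivalent; it is not part of the patch. The two replacement functions are stubbed out here, whereas the real emscripten_longjmp and __wasm_longjmp are provided by the Emscripten runtime and do not return.

#include <csetjmp>
#include <cstdint>
#include <cstdio>

// Stubs standing in for the runtime entry points; the real ones never return.
static void emscripten_longjmp(uintptr_t Env, int Val) {
  std::printf("emscripten_longjmp(env=%p, val=%d)\n", (void *)Env, Val);
}
static void wasm_longjmp(void *Env, int Val) { // __wasm_longjmp in the pass
  std::printf("__wasm_longjmp(env=%p, val=%d)\n", Env, Val);
}

int main() {
  std::jmp_buf Buf;
  if (setjmp(Buf) == 0) {
    // Original user code:       longjmp(Buf, 1);
    // Emscripten SjLj rewrite:  jmp_buf* is ptrtoint-ed to the address-sized
    //                           integer expected by emscripten_longjmp.
    emscripten_longjmp(reinterpret_cast<uintptr_t>(&Buf), 1);
    // Wasm SjLj rewrite:        jmp_buf* is bitcast to i8* (void* here).
    wasm_longjmp(static_cast<void *>(&Buf), 1);
  }
  return 0;
}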
+static bool containsLongjmpableCalls(const Function *F) {
+ for (const auto &BB : *F)
+ for (const auto &I : BB)
+ if (const auto *CB = dyn_cast<CallBase>(&I))
+ if (canLongjmp(CB->getCalledOperand()))
+ return true;
+ return false;
+}
+
bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
LLVM_DEBUG(dbgs() << "********** Lower Emscripten EH & SjLj **********\n");
@@ -698,39 +830,60 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
Function *SetjmpF = M.getFunction("setjmp");
Function *LongjmpF = M.getFunction("longjmp");
- bool SetjmpUsed = SetjmpF && !SetjmpF->use_empty();
- bool LongjmpUsed = LongjmpF && !LongjmpF->use_empty();
- DoSjLj = EnableSjLj && (SetjmpUsed || LongjmpUsed);
+
+  // On some platforms _setjmp and _longjmp are used instead. Redirect these
+  // to setjmp/longjmp, because we later detect these functions by their
+  // names.
+ Function *SetjmpF2 = M.getFunction("_setjmp");
+ Function *LongjmpF2 = M.getFunction("_longjmp");
+ if (SetjmpF2) {
+ if (SetjmpF) {
+ if (SetjmpF->getFunctionType() != SetjmpF2->getFunctionType())
+ report_fatal_error("setjmp and _setjmp have different function types");
+ } else {
+ SetjmpF = Function::Create(SetjmpF2->getFunctionType(),
+ GlobalValue::ExternalLinkage, "setjmp", M);
+ }
+ SetjmpF2->replaceAllUsesWith(SetjmpF);
+ }
+ if (LongjmpF2) {
+ if (LongjmpF) {
+ if (LongjmpF->getFunctionType() != LongjmpF2->getFunctionType())
+ report_fatal_error(
+ "longjmp and _longjmp have different function types");
+ } else {
+ LongjmpF = Function::Create(LongjmpF2->getFunctionType(),
+                                  GlobalValue::ExternalLinkage, "longjmp", M);
+ }
+ LongjmpF2->replaceAllUsesWith(LongjmpF);
+ }
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
assert(TPC && "Expected a TargetPassConfig");
auto &TM = TPC->getTM<WebAssemblyTargetMachine>();
- if (EnableEH && TM.Options.ExceptionModel == ExceptionHandling::Wasm)
- report_fatal_error("-exception-model=wasm not allowed with "
- "-enable-emscripten-cxx-exceptions");
-
// Declare (or get) global variables __THREW__, __threwValue, and
// getTempRet0/setTempRet0 function which are used in common for both
// exception handling and setjmp/longjmp handling
ThrewGV = getGlobalVariable(M, getAddrIntType(&M), TM, "__THREW__");
ThrewValueGV = getGlobalVariable(M, IRB.getInt32Ty(), TM, "__threwValue");
- GetTempRet0Func = getEmscriptenFunction(
+ GetTempRet0F = getEmscriptenFunction(
FunctionType::get(IRB.getInt32Ty(), false), "getTempRet0", &M);
- SetTempRet0Func = getEmscriptenFunction(
+ SetTempRet0F = getEmscriptenFunction(
FunctionType::get(IRB.getVoidTy(), IRB.getInt32Ty(), false),
"setTempRet0", &M);
- GetTempRet0Func->setDoesNotThrow();
- SetTempRet0Func->setDoesNotThrow();
+ GetTempRet0F->setDoesNotThrow();
+ SetTempRet0F->setDoesNotThrow();
bool Changed = false;
// Function registration for exception handling
- if (EnableEH) {
+ if (EnableEmEH) {
// Register __resumeException function
FunctionType *ResumeFTy =
FunctionType::get(IRB.getVoidTy(), IRB.getInt8PtrTy(), false);
ResumeF = getEmscriptenFunction(ResumeFTy, "__resumeException", &M);
+ ResumeF->addFnAttr(Attribute::NoReturn);
// Register llvm_eh_typeid_for function
FunctionType *EHTypeIDTy =
@@ -738,20 +891,55 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
EHTypeIDF = getEmscriptenFunction(EHTypeIDTy, "llvm_eh_typeid_for", &M);
}
+ if ((EnableEmSjLj || EnableWasmSjLj) && SetjmpF) {
+ // Precompute setjmp users
+ for (User *U : SetjmpF->users()) {
+ if (auto *CB = dyn_cast<CallBase>(U)) {
+ auto *UserF = CB->getFunction();
+ // If a function that calls setjmp does not contain any other calls that
+ // can longjmp, we don't need to do any transformation on that function,
+        // so we can ignore it
+ if (containsLongjmpableCalls(UserF))
+ SetjmpUsers.insert(UserF);
+ } else {
+ std::string S;
+ raw_string_ostream SS(S);
+ SS << *U;
+ report_fatal_error(Twine("Indirect use of setjmp is not supported: ") +
+ SS.str());
+ }
+ }
+ }
+
+ bool SetjmpUsed = SetjmpF && !SetjmpUsers.empty();
+ bool LongjmpUsed = LongjmpF && !LongjmpF->use_empty();
+ DoSjLj = (EnableEmSjLj | EnableWasmSjLj) && (SetjmpUsed || LongjmpUsed);
+
// Function registration and data pre-gathering for setjmp/longjmp handling
if (DoSjLj) {
- // Register emscripten_longjmp function
- FunctionType *FTy = FunctionType::get(
- IRB.getVoidTy(), {getAddrIntType(&M), IRB.getInt32Ty()}, false);
- EmLongjmpF = getEmscriptenFunction(FTy, "emscripten_longjmp", &M);
+ assert(EnableEmSjLj || EnableWasmSjLj);
+ if (EnableEmSjLj) {
+ // Register emscripten_longjmp function
+ FunctionType *FTy = FunctionType::get(
+ IRB.getVoidTy(), {getAddrIntType(&M), IRB.getInt32Ty()}, false);
+ EmLongjmpF = getEmscriptenFunction(FTy, "emscripten_longjmp", &M);
+ EmLongjmpF->addFnAttr(Attribute::NoReturn);
+ } else { // EnableWasmSjLj
+ // Register __wasm_longjmp function, which calls __builtin_wasm_longjmp.
+ FunctionType *FTy = FunctionType::get(
+ IRB.getVoidTy(), {IRB.getInt8PtrTy(), IRB.getInt32Ty()}, false);
+ WasmLongjmpF = getEmscriptenFunction(FTy, "__wasm_longjmp", &M);
+ WasmLongjmpF->addFnAttr(Attribute::NoReturn);
+ }
if (SetjmpF) {
// Register saveSetjmp function
FunctionType *SetjmpFTy = SetjmpF->getFunctionType();
- FTy = FunctionType::get(Type::getInt32PtrTy(C),
- {SetjmpFTy->getParamType(0), IRB.getInt32Ty(),
- Type::getInt32PtrTy(C), IRB.getInt32Ty()},
- false);
+ FunctionType *FTy =
+ FunctionType::get(Type::getInt32PtrTy(C),
+ {SetjmpFTy->getParamType(0), IRB.getInt32Ty(),
+ Type::getInt32PtrTy(C), IRB.getInt32Ty()},
+ false);
SaveSetjmpF = getEmscriptenFunction(FTy, "saveSetjmp", &M);
// Register testSetjmp function
@@ -761,16 +949,18 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
false);
TestSetjmpF = getEmscriptenFunction(FTy, "testSetjmp", &M);
- // Precompute setjmp users
- for (User *U : SetjmpF->users()) {
- auto *UI = cast<Instruction>(U);
- SetjmpUsers.insert(UI->getFunction());
- }
+ // wasm.catch() will be lowered down to wasm 'catch' instruction in
+ // instruction selection.
+ CatchF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_catch);
+ // Type for struct __WasmLongjmpArgs
+ LongjmpArgsTy = StructType::get(IRB.getInt8PtrTy(), // env
+ IRB.getInt32Ty() // val
+ );
}
}
// Exception handling transformation
- if (EnableEH) {
+ if (EnableEmEH) {
for (Function &F : M) {
if (F.isDeclaration())
continue;
@@ -782,7 +972,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
if (DoSjLj) {
Changed = true; // We have setjmp or longjmp somewhere
if (LongjmpF)
- replaceLongjmpWithEmscriptenLongjmp(LongjmpF, EmLongjmpF);
+ replaceLongjmpWith(LongjmpF, EnableEmSjLj ? EmLongjmpF : WasmLongjmpF);
     // Only traverse functions that use setjmp in order not to insert
// unnecessary prep / cleanup code in every function
if (SetjmpF)
@@ -816,6 +1006,12 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
SmallVector<Instruction *, 64> ToErase;
SmallPtrSet<LandingPadInst *, 32> LandingPads;
+ // rethrow.longjmp BB that will be shared within the function.
+ BasicBlock *RethrowLongjmpBB = nullptr;
+ // PHI node for the loaded value of __THREW__ global variable in
+ // rethrow.longjmp BB
+ PHINode *RethrowLongjmpBBThrewPHI = nullptr;
+
for (BasicBlock &BB : F) {
auto *II = dyn_cast<InvokeInst>(BB.getTerminator());
if (!II)
@@ -836,37 +1032,48 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
// setjmp, it will be appropriately handled in runSjLjOnFunction. But even
// if the function does not contain setjmp calls, we shouldn't silently
// ignore longjmps; we should rethrow them so they can be correctly
- // handled in somewhere up the call chain where setjmp is.
- // __THREW__'s value is 0 when nothing happened, 1 when an exception is
- // thrown, other values when longjmp is thrown.
+    // handled somewhere up the call chain where setjmp is. __THREW__'s
+ // value is 0 when nothing happened, 1 when an exception is thrown, and
+ // other values when longjmp is thrown.
//
// if (%__THREW__.val == 0 || %__THREW__.val == 1)
// goto %tail
// else
// goto %longjmp.rethrow
//
- // longjmp.rethrow: ;; This is longjmp. Rethrow it
+ // rethrow.longjmp: ;; This is longjmp. Rethrow it
// %__threwValue.val = __threwValue
// emscripten_longjmp(%__THREW__.val, %__threwValue.val);
//
// tail: ;; Nothing happened or an exception is thrown
// ... Continue exception handling ...
- if (DoSjLj && !SetjmpUsers.count(&F) && canLongjmp(M, Callee)) {
+ if (DoSjLj && EnableEmSjLj && !SetjmpUsers.count(&F) &&
+ canLongjmp(Callee)) {
+ // Create longjmp.rethrow BB once and share it within the function
+ if (!RethrowLongjmpBB) {
+ RethrowLongjmpBB = BasicBlock::Create(C, "rethrow.longjmp", &F);
+ IRB.SetInsertPoint(RethrowLongjmpBB);
+ RethrowLongjmpBBThrewPHI =
+ IRB.CreatePHI(getAddrIntType(&M), 4, "threw.phi");
+ RethrowLongjmpBBThrewPHI->addIncoming(Threw, &BB);
+ Value *ThrewValue = IRB.CreateLoad(IRB.getInt32Ty(), ThrewValueGV,
+ ThrewValueGV->getName() + ".val");
+ IRB.CreateCall(EmLongjmpF, {RethrowLongjmpBBThrewPHI, ThrewValue});
+ IRB.CreateUnreachable();
+ } else {
+ RethrowLongjmpBBThrewPHI->addIncoming(Threw, &BB);
+ }
+
+ IRB.SetInsertPoint(II); // Restore the insert point back
BasicBlock *Tail = BasicBlock::Create(C, "tail", &F);
- BasicBlock *RethrowBB = BasicBlock::Create(C, "longjmp.rethrow", &F);
Value *CmpEqOne =
IRB.CreateICmpEQ(Threw, getAddrSizeInt(&M, 1), "cmp.eq.one");
Value *CmpEqZero =
IRB.CreateICmpEQ(Threw, getAddrSizeInt(&M, 0), "cmp.eq.zero");
Value *Or = IRB.CreateOr(CmpEqZero, CmpEqOne, "or");
- IRB.CreateCondBr(Or, Tail, RethrowBB);
- IRB.SetInsertPoint(RethrowBB);
- Value *ThrewValue = IRB.CreateLoad(IRB.getInt32Ty(), ThrewValueGV,
- ThrewValueGV->getName() + ".val");
- IRB.CreateCall(EmLongjmpF, {Threw, ThrewValue});
-
- IRB.CreateUnreachable();
+ IRB.CreateCondBr(Or, Tail, RethrowLongjmpBB);
IRB.SetInsertPoint(Tail);
+ BB.replaceSuccessorsPhiUsesWith(&BB, Tail);
}
// Insert a branch based on __THREW__ variable
@@ -961,7 +1168,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
CallInst *FMCI = IRB.CreateCall(FMCF, FMCArgs, "fmc");
Value *Undef = UndefValue::get(LPI->getType());
Value *Pair0 = IRB.CreateInsertValue(Undef, FMCI, 0, "pair0");
- Value *TempRet0 = IRB.CreateCall(GetTempRet0Func, None, "tempret0");
+ Value *TempRet0 = IRB.CreateCall(GetTempRet0F, None, "tempret0");
Value *Pair1 = IRB.CreateInsertValue(Pair0, TempRet0, 1, "pair1");
LPI->replaceAllUsesWith(Pair1);
@@ -997,14 +1204,15 @@ static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore,
}
bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
+ assert(EnableEmSjLj || EnableWasmSjLj);
Module &M = *F.getParent();
LLVMContext &C = F.getContext();
IRBuilder<> IRB(C);
SmallVector<Instruction *, 64> ToErase;
// Vector of %setjmpTable values
- std::vector<Instruction *> SetjmpTableInsts;
+ SmallVector<Instruction *, 4> SetjmpTableInsts;
// Vector of %setjmpTableSize values
- std::vector<Instruction *> SetjmpTableSizeInsts;
+ SmallVector<Instruction *, 4> SetjmpTableSizeInsts;
// Setjmp preparation
@@ -1012,11 +1220,13 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
// We create this as an instruction intentionally, and we don't want to fold
// this instruction to a constant 4, because this value will be used in
// SSAUpdater.AddAvailableValue(...) later.
- BasicBlock &EntryBB = F.getEntryBlock();
- DebugLoc FirstDL = getOrCreateDebugLoc(&*EntryBB.begin(), F.getSubprogram());
- BinaryOperator *SetjmpTableSize = BinaryOperator::Create(
- Instruction::Add, IRB.getInt32(4), IRB.getInt32(0), "setjmpTableSize",
- &*EntryBB.getFirstInsertionPt());
+ BasicBlock *Entry = &F.getEntryBlock();
+ DebugLoc FirstDL = getOrCreateDebugLoc(&*Entry->begin(), F.getSubprogram());
+ SplitBlock(Entry, &*Entry->getFirstInsertionPt());
+
+ BinaryOperator *SetjmpTableSize =
+ BinaryOperator::Create(Instruction::Add, IRB.getInt32(4), IRB.getInt32(0),
+ "setjmpTableSize", Entry->getTerminator());
SetjmpTableSize->setDebugLoc(FirstDL);
// setjmpTable = (int *) malloc(40);
Instruction *SetjmpTable = CallInst::CreateMalloc(
@@ -1036,13 +1246,14 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
SetjmpTableSizeInsts.push_back(SetjmpTableSize);
// Setjmp transformation
- std::vector<PHINode *> SetjmpRetPHIs;
+ SmallVector<PHINode *, 4> SetjmpRetPHIs;
Function *SetjmpF = M.getFunction("setjmp");
for (User *U : SetjmpF->users()) {
auto *CI = dyn_cast<CallInst>(U);
+    // FIXME 'invoke' to setjmp can happen when we use Wasm EH + Wasm SjLj, but
+    // we don't support the two being used together yet.
if (!CI)
- report_fatal_error("Does not support indirect calls to setjmp");
-
+ report_fatal_error("Wasm EH + Wasm SjLj is not fully supported yet");
BasicBlock *BB = CI->getParent();
if (BB->getParent() != &F) // in other function
continue;
@@ -1072,14 +1283,136 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
Instruction *NewSetjmpTable =
IRB.CreateCall(SaveSetjmpF, Args, "setjmpTable");
Instruction *NewSetjmpTableSize =
- IRB.CreateCall(GetTempRet0Func, None, "setjmpTableSize");
+ IRB.CreateCall(GetTempRet0F, None, "setjmpTableSize");
SetjmpTableInsts.push_back(NewSetjmpTable);
SetjmpTableSizeInsts.push_back(NewSetjmpTableSize);
ToErase.push_back(CI);
}
- // Update each call that can longjmp so it can return to a setjmp where
- // relevant.
+ // Handle longjmpable calls.
+ if (EnableEmSjLj)
+ handleLongjmpableCallsForEmscriptenSjLj(
+ F, SetjmpTableInsts, SetjmpTableSizeInsts, SetjmpRetPHIs);
+ else // EnableWasmSjLj
+ handleLongjmpableCallsForWasmSjLj(F, SetjmpTableInsts, SetjmpTableSizeInsts,
+ SetjmpRetPHIs);
+
+ // Erase everything we no longer need in this function
+ for (Instruction *I : ToErase)
+ I->eraseFromParent();
+
+ // Free setjmpTable buffer before each return instruction + function-exiting
+ // call
+ SmallVector<Instruction *, 16> ExitingInsts;
+ for (BasicBlock &BB : F) {
+ Instruction *TI = BB.getTerminator();
+ if (isa<ReturnInst>(TI))
+ ExitingInsts.push_back(TI);
+ // Any 'call' instruction with 'noreturn' attribute exits the function at
+ // this point. If this throws but unwinds to another EH pad within this
+ // function instead of exiting, this would have been an 'invoke', which
+    // happens if we use Wasm EH or Wasm SjLj.
+ for (auto &I : BB) {
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ bool IsNoReturn = CI->hasFnAttr(Attribute::NoReturn);
+ if (Function *CalleeF = CI->getCalledFunction())
+ IsNoReturn |= CalleeF->hasFnAttribute(Attribute::NoReturn);
+ if (IsNoReturn)
+ ExitingInsts.push_back(&I);
+ }
+ }
+ }
+ for (auto *I : ExitingInsts) {
+ DebugLoc DL = getOrCreateDebugLoc(I, F.getSubprogram());
+    // If this exiting instruction is a call within a catchpad, we should add
+    // it as "funclet" to the operand bundle of the 'free' call
+ SmallVector<OperandBundleDef, 1> Bundles;
+ if (auto *CB = dyn_cast<CallBase>(I))
+ if (auto Bundle = CB->getOperandBundle(LLVMContext::OB_funclet))
+ Bundles.push_back(OperandBundleDef(*Bundle));
+ auto *Free = CallInst::CreateFree(SetjmpTable, Bundles, I);
+ Free->setDebugLoc(DL);
+ // CallInst::CreateFree may create a bitcast instruction if its argument
+ // types mismatch. We need to set the debug loc for the bitcast too.
+ if (auto *FreeCallI = dyn_cast<CallInst>(Free)) {
+ if (auto *BitCastI = dyn_cast<BitCastInst>(FreeCallI->getArgOperand(0)))
+ BitCastI->setDebugLoc(DL);
+ }
+ }
+
+ // Every call to saveSetjmp can change setjmpTable and setjmpTableSize
+ // (when buffer reallocation occurs)
+ // entry:
+ // setjmpTableSize = 4;
+ // setjmpTable = (int *) malloc(40);
+ // setjmpTable[0] = 0;
+ // ...
+ // somebb:
+ // setjmpTable = saveSetjmp(env, label, setjmpTable, setjmpTableSize);
+ // setjmpTableSize = getTempRet0();
+ // So we need to make sure the SSA for these variables is valid so that every
+ // saveSetjmp and testSetjmp calls have the correct arguments.
+ SSAUpdater SetjmpTableSSA;
+ SSAUpdater SetjmpTableSizeSSA;
+ SetjmpTableSSA.Initialize(Type::getInt32PtrTy(C), "setjmpTable");
+ SetjmpTableSizeSSA.Initialize(Type::getInt32Ty(C), "setjmpTableSize");
+ for (Instruction *I : SetjmpTableInsts)
+ SetjmpTableSSA.AddAvailableValue(I->getParent(), I);
+ for (Instruction *I : SetjmpTableSizeInsts)
+ SetjmpTableSizeSSA.AddAvailableValue(I->getParent(), I);
+
+ for (auto &U : make_early_inc_range(SetjmpTable->uses()))
+ if (auto *I = dyn_cast<Instruction>(U.getUser()))
+ if (I->getParent() != Entry)
+ SetjmpTableSSA.RewriteUse(U);
+ for (auto &U : make_early_inc_range(SetjmpTableSize->uses()))
+ if (auto *I = dyn_cast<Instruction>(U.getUser()))
+ if (I->getParent() != Entry)
+ SetjmpTableSizeSSA.RewriteUse(U);
+
+ // Finally, our modifications to the cfg can break dominance of SSA variables.
+ // For example, in this code,
+ // if (x()) { .. setjmp() .. }
+ // if (y()) { .. longjmp() .. }
+  // We must split the longjmp block, and it can jump into the block split
+  // from the setjmp one. But that means that when we split the setjmp block,
+  // its first part no longer dominates its second part - there is a
+  // theoretically possible control flow path where x() is false, then y() is
+  // true and we reach the second part of the setjmp block without ever
+  // reaching the first part. So we rebuild SSA form here.
+ rebuildSSA(F);
+ return true;
+}
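
The comments above describe the buffer that runSjLjOnFunction() threads through a function. A rough standalone C++ sketch of that shape is shown below; it is illustrative only, and the saveSetjmp/getTempRet0 pair are stubs standing in for the Emscripten helpers registered by the pass.

#include <cstdlib>

// Stubs for the Emscripten helpers. The real saveSetjmp() records
// (env, label) in the table, reallocating it if needed, and reports the
// (possibly grown) size through setTempRet0()/getTempRet0().
static int TempRet0 = 0;
static int getTempRet0() { return TempRet0; }
static int *saveSetjmp(void *Env, int Label, int *Table, int Size) {
  (void)Env;
  (void)Label;
  TempRet0 = Size; // no reallocation in this sketch
  return Table;
}

// Shape of a function after the transformation: the entry block sets up
// setjmpTable/setjmpTableSize, each setjmp call site registers itself, and
// the buffer is freed before every function exit.
int transformedFunctionShape() {
  int SetjmpTableSize = 4;                   // entry block
  int *SetjmpTable = (int *)std::malloc(40); // entry block
  SetjmpTable[0] = 0;

  int FakeJmpBuf = 0;                        // stands in for a jmp_buf
  SetjmpTable =
      saveSetjmp(&FakeJmpBuf, /*Label=*/1, SetjmpTable, SetjmpTableSize);
  SetjmpTableSize = getTempRet0();

  std::free(SetjmpTable);                    // before each return
  return SetjmpTableSize;
}

int main() { return transformedFunctionShape() == 4 ? 0 : 1; }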
+
+// Update each call that can longjmp so it can return to the corresponding
+// setjmp. Refer to 4) of the "Emscripten setjmp/longjmp handling" section in
+// the comments at the top of the file for details.
+void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForEmscriptenSjLj(
+ Function &F, InstVector &SetjmpTableInsts, InstVector &SetjmpTableSizeInsts,
+ SmallVectorImpl<PHINode *> &SetjmpRetPHIs) {
+ Module &M = *F.getParent();
+ LLVMContext &C = F.getContext();
+ IRBuilder<> IRB(C);
+ SmallVector<Instruction *, 64> ToErase;
+
+  // We need to pass setjmpTable and setjmpTableSize to the testSetjmp
+  // function. These values are defined at the beginning of the function and
+  // also at each setjmp callsite, but we don't know which values we should
+  // use at this point. So here we arbitrarily use the ones defined at the
+  // beginning of the function, and SSAUpdater will later update them to the
+  // correct values.
+ Instruction *SetjmpTable = *SetjmpTableInsts.begin();
+ Instruction *SetjmpTableSize = *SetjmpTableSizeInsts.begin();
+
+ // call.em.longjmp BB that will be shared within the function.
+ BasicBlock *CallEmLongjmpBB = nullptr;
+ // PHI node for the loaded value of __THREW__ global variable in
+ // call.em.longjmp BB
+ PHINode *CallEmLongjmpBBThrewPHI = nullptr;
+ // PHI node for the loaded value of __threwValue global variable in
+ // call.em.longjmp BB
+ PHINode *CallEmLongjmpBBThrewValuePHI = nullptr;
+ // rethrow.exn BB that will be shared within the function.
+ BasicBlock *RethrowExnBB = nullptr;
// Because we are creating new BBs while processing and don't want to make
// all these newly created BBs candidates again for longjmp processing, we
@@ -1092,15 +1425,18 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
for (unsigned I = 0; I < BBs.size(); I++) {
BasicBlock *BB = BBs[I];
for (Instruction &I : *BB) {
- assert(!isa<InvokeInst>(&I));
+ if (isa<InvokeInst>(&I))
+ report_fatal_error("When using Wasm EH with Emscripten SjLj, there is "
+ "a restriction that `setjmp` function call and "
+ "exception cannot be used within the same function");
auto *CI = dyn_cast<CallInst>(&I);
if (!CI)
continue;
const Value *Callee = CI->getCalledOperand();
- if (!canLongjmp(M, Callee))
+ if (!canLongjmp(Callee))
continue;
- if (isEmAsmCall(M, Callee))
+ if (isEmAsmCall(Callee))
report_fatal_error("Cannot use EM_ASM* alongside setjmp/longjmp in " +
F.getName() +
". Please consider using EM_JS, or move the "
@@ -1171,19 +1507,26 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
// tail:
// ...
if (supportsException(&F) && canThrow(Callee)) {
- IRB.SetInsertPoint(CI);
// We will add a new conditional branch. So remove the branch created
// when we split the BB
ToErase.push_back(BB->getTerminator());
+
+ // Generate rethrow.exn BB once and share it within the function
+ if (!RethrowExnBB) {
+ RethrowExnBB = BasicBlock::Create(C, "rethrow.exn", &F);
+ IRB.SetInsertPoint(RethrowExnBB);
+ CallInst *Exn =
+ IRB.CreateCall(getFindMatchingCatch(M, 0), {}, "exn");
+ IRB.CreateCall(ResumeF, {Exn});
+ IRB.CreateUnreachable();
+ }
+
+ IRB.SetInsertPoint(CI);
BasicBlock *NormalBB = BasicBlock::Create(C, "normal", &F);
- BasicBlock *RethrowBB = BasicBlock::Create(C, "eh.rethrow", &F);
Value *CmpEqOne =
IRB.CreateICmpEQ(Threw, getAddrSizeInt(&M, 1), "cmp.eq.one");
- IRB.CreateCondBr(CmpEqOne, RethrowBB, NormalBB);
- IRB.SetInsertPoint(RethrowBB);
- CallInst *Exn = IRB.CreateCall(getFindMatchingCatch(M, 0), {}, "exn");
- IRB.CreateCall(ResumeF, {Exn});
- IRB.CreateUnreachable();
+ IRB.CreateCondBr(CmpEqOne, RethrowExnBB, NormalBB);
+
IRB.SetInsertPoint(NormalBB);
IRB.CreateBr(Tail);
BB = NormalBB; // New insertion point to insert testSetjmp()
@@ -1202,7 +1545,9 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
Value *LongjmpResult = nullptr;
BasicBlock *EndBB = nullptr;
wrapTestSetjmp(BB, CI->getDebugLoc(), Threw, SetjmpTable, SetjmpTableSize,
- Label, LongjmpResult, EndBB);
+ Label, LongjmpResult, CallEmLongjmpBB,
+ CallEmLongjmpBBThrewPHI, CallEmLongjmpBBThrewValuePHI,
+ EndBB);
assert(Label && LongjmpResult && EndBB);
// Create switch instruction
@@ -1224,76 +1569,184 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
}
}
- // Erase everything we no longer need in this function
for (Instruction *I : ToErase)
I->eraseFromParent();
+}
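
The switch that handleLongjmpableCallsForEmscriptenSjLj() appends after each longjmpable call can be summarized by the following standalone C++ sketch. Only the label convention and the idea that label N resumes the N-th setjmp with the longjmp value as its return value are taken from the code above; the function name and case bodies are illustrative.

#include <cstdio>

// After each wrapped call, the pass inserts roughly this dispatch: label -1
// (or any label with no matching case) falls through to the split "tail"
// block, while label N branches to the PHI of the N-th setjmp call site and
// feeds it the longjmp value obtained from getTempRet0().
static int dispatchAfterCall(int Label, int LongjmpResult) {
  switch (Label) {
  case 1:
    return 100 + LongjmpResult; // "post-setjmp BB 1" in this sketch
  case 2:
    return 200 + LongjmpResult; // "post-setjmp BB 2" in this sketch
  default:
    return 0;                   // tail block: no longjmp aimed at us
  }
}

int main() {
  std::printf("%d %d\n", dispatchAfterCall(1, 7), dispatchAfterCall(-1, 0));
}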
- // Free setjmpTable buffer before each return instruction
- for (BasicBlock &BB : F) {
- Instruction *TI = BB.getTerminator();
- if (isa<ReturnInst>(TI)) {
- DebugLoc DL = getOrCreateDebugLoc(TI, F.getSubprogram());
- auto *Free = CallInst::CreateFree(SetjmpTable, TI);
- Free->setDebugLoc(DL);
- // CallInst::CreateFree may create a bitcast instruction if its argument
- // types mismatch. We need to set the debug loc for the bitcast too.
- if (auto *FreeCallI = dyn_cast<CallInst>(Free)) {
- if (auto *BitCastI = dyn_cast<BitCastInst>(FreeCallI->getArgOperand(0)))
- BitCastI->setDebugLoc(DL);
- }
- }
+// Create a catchpad in which we catch a longjmp's env and val arguments, test
+// if the longjmp corresponds to one of the setjmps in the current function,
+// and if so, jump to the setjmp dispatch BB from which we go to one of the
+// post-setjmp BBs. Refer to 4) of the "Wasm setjmp/longjmp handling" section
+// in the comments at the top of the file for details.
+void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj(
+ Function &F, InstVector &SetjmpTableInsts, InstVector &SetjmpTableSizeInsts,
+ SmallVectorImpl<PHINode *> &SetjmpRetPHIs) {
+ Module &M = *F.getParent();
+ LLVMContext &C = F.getContext();
+ IRBuilder<> IRB(C);
+
+ // A function with catchswitch/catchpad instruction should have a personality
+ // function attached to it. Search for the wasm personality function, and if
+ // it exists, use it, and if it doesn't, create a dummy personality function.
+ // (SjLj is not going to call it anyway.)
+ if (!F.hasPersonalityFn()) {
+ StringRef PersName = getEHPersonalityName(EHPersonality::Wasm_CXX);
+ FunctionType *PersType =
+ FunctionType::get(IRB.getInt32Ty(), /* isVarArg */ true);
+ Value *PersF = M.getOrInsertFunction(PersName, PersType).getCallee();
+ F.setPersonalityFn(
+ cast<Constant>(IRB.CreateBitCast(PersF, IRB.getInt8PtrTy())));
}
- // Every call to saveSetjmp can change setjmpTable and setjmpTableSize
- // (when buffer reallocation occurs)
+ // Use the entry BB's debugloc as a fallback
+ BasicBlock *Entry = &F.getEntryBlock();
+ DebugLoc FirstDL = getOrCreateDebugLoc(&*Entry->begin(), F.getSubprogram());
+ IRB.SetCurrentDebugLocation(FirstDL);
+
+ // Arbitrarily use the ones defined in the beginning of the function.
+ // SSAUpdater will later update them to the correct values.
+ Instruction *SetjmpTable = *SetjmpTableInsts.begin();
+ Instruction *SetjmpTableSize = *SetjmpTableSizeInsts.begin();
+
+ // Add setjmp.dispatch BB right after the entry block. Because we have
+ // initialized setjmpTable/setjmpTableSize in the entry block and split the
+ // rest into another BB, here 'OrigEntry' is the function's original entry
+ // block before the transformation.
+ //
// entry:
- // setjmpTableSize = 4;
- // setjmpTable = (int *) malloc(40);
- // setjmpTable[0] = 0;
- // ...
- // somebb:
- // setjmpTable = saveSetjmp(buf, label, setjmpTable, setjmpTableSize);
- // setjmpTableSize = getTempRet0();
- // So we need to make sure the SSA for these variables is valid so that every
- // saveSetjmp and testSetjmp calls have the correct arguments.
- SSAUpdater SetjmpTableSSA;
- SSAUpdater SetjmpTableSizeSSA;
- SetjmpTableSSA.Initialize(Type::getInt32PtrTy(C), "setjmpTable");
- SetjmpTableSizeSSA.Initialize(Type::getInt32Ty(C), "setjmpTableSize");
- for (Instruction *I : SetjmpTableInsts)
- SetjmpTableSSA.AddAvailableValue(I->getParent(), I);
- for (Instruction *I : SetjmpTableSizeInsts)
- SetjmpTableSizeSSA.AddAvailableValue(I->getParent(), I);
+ // setjmpTable / setjmpTableSize initialization
+ // setjmp.dispatch:
+ // switch will be inserted here later
+ // entry.split: (OrigEntry)
+ // the original function starts here
+ BasicBlock *OrigEntry = Entry->getNextNode();
+ BasicBlock *SetjmpDispatchBB =
+ BasicBlock::Create(C, "setjmp.dispatch", &F, OrigEntry);
+ cast<BranchInst>(Entry->getTerminator())->setSuccessor(0, SetjmpDispatchBB);
+
+  // Create catch.dispatch.longjmp BB and a catchswitch instruction
+ BasicBlock *CatchSwitchBB =
+ BasicBlock::Create(C, "catch.dispatch.longjmp", &F);
+ IRB.SetInsertPoint(CatchSwitchBB);
+ CatchSwitchInst *CatchSwitch =
+ IRB.CreateCatchSwitch(ConstantTokenNone::get(C), nullptr, 1);
+
+ // Create catch.longjmp BB and a catchpad instruction
+ BasicBlock *CatchLongjmpBB = BasicBlock::Create(C, "catch.longjmp", &F);
+ CatchSwitch->addHandler(CatchLongjmpBB);
+ IRB.SetInsertPoint(CatchLongjmpBB);
+ CatchPadInst *CatchPad = IRB.CreateCatchPad(CatchSwitch, {});
+
+ // Wasm throw and catch instructions can throw and catch multiple values, but
+ // that requires multivalue support in the toolchain, which is currently not
+ // very reliable. We instead throw and catch a pointer to a struct value of
+ // type 'struct __WasmLongjmpArgs', which is defined in Emscripten.
+ Instruction *CatchCI =
+ IRB.CreateCall(CatchF, {IRB.getInt32(WebAssembly::C_LONGJMP)}, "thrown");
+ Value *LongjmpArgs =
+ IRB.CreateBitCast(CatchCI, LongjmpArgsTy->getPointerTo(), "longjmp.args");
+ Value *EnvField =
+ IRB.CreateConstGEP2_32(LongjmpArgsTy, LongjmpArgs, 0, 0, "env_gep");
+ Value *ValField =
+ IRB.CreateConstGEP2_32(LongjmpArgsTy, LongjmpArgs, 0, 1, "val_gep");
+ // void *env = __wasm_longjmp_args.env;
+ Instruction *Env = IRB.CreateLoad(IRB.getInt8PtrTy(), EnvField, "env");
+ // int val = __wasm_longjmp_args.val;
+ Instruction *Val = IRB.CreateLoad(IRB.getInt32Ty(), ValField, "val");
+
+ // %label = testSetjmp(mem[%env], setjmpTable, setjmpTableSize);
+ // if (%label == 0)
+ // __wasm_longjmp(%env, %val)
+ // catchret to %setjmp.dispatch
+ BasicBlock *ThenBB = BasicBlock::Create(C, "if.then", &F);
+ BasicBlock *EndBB = BasicBlock::Create(C, "if.end", &F);
+ Value *EnvP = IRB.CreateBitCast(Env, getAddrPtrType(&M), "env.p");
+ Value *SetjmpID = IRB.CreateLoad(getAddrIntType(&M), EnvP, "setjmp.id");
+ Value *Label =
+ IRB.CreateCall(TestSetjmpF, {SetjmpID, SetjmpTable, SetjmpTableSize},
+ OperandBundleDef("funclet", CatchPad), "label");
+ Value *Cmp = IRB.CreateICmpEQ(Label, IRB.getInt32(0));
+ IRB.CreateCondBr(Cmp, ThenBB, EndBB);
+
+ IRB.SetInsertPoint(ThenBB);
+ CallInst *WasmLongjmpCI = IRB.CreateCall(
+ WasmLongjmpF, {Env, Val}, OperandBundleDef("funclet", CatchPad));
+ IRB.CreateUnreachable();
- for (auto UI = SetjmpTable->use_begin(), UE = SetjmpTable->use_end();
- UI != UE;) {
- // Grab the use before incrementing the iterator.
- Use &U = *UI;
- // Increment the iterator before removing the use from the list.
- ++UI;
- if (auto *I = dyn_cast<Instruction>(U.getUser()))
- if (I->getParent() != &EntryBB)
- SetjmpTableSSA.RewriteUse(U);
+ IRB.SetInsertPoint(EndBB);
+ // Jump to setjmp.dispatch block
+ IRB.CreateCatchRet(CatchPad, SetjmpDispatchBB);
+
+ // Go back to setjmp.dispatch BB
+ // setjmp.dispatch:
+ // switch %label {
+ // label 1: goto post-setjmp BB 1
+ // label 2: goto post-setjmp BB 2
+ // ...
+  //     default: goto the split-off next BB
+ // }
+ IRB.SetInsertPoint(SetjmpDispatchBB);
+ PHINode *LabelPHI = IRB.CreatePHI(IRB.getInt32Ty(), 2, "label.phi");
+ LabelPHI->addIncoming(Label, EndBB);
+ LabelPHI->addIncoming(IRB.getInt32(-1), Entry);
+ SwitchInst *SI = IRB.CreateSwitch(LabelPHI, OrigEntry, SetjmpRetPHIs.size());
+ // -1 means no longjmp happened, continue normally (will hit the default
+ // switch case). 0 means a longjmp that is not ours to handle, needs a
+  // rethrow. Otherwise the case value is the index of the corresponding
+  // setjmp call site plus 1 (so that 0 is not used).
+ for (unsigned I = 0; I < SetjmpRetPHIs.size(); I++) {
+ SI->addCase(IRB.getInt32(I + 1), SetjmpRetPHIs[I]->getParent());
+ SetjmpRetPHIs[I]->addIncoming(Val, SetjmpDispatchBB);
}
- for (auto UI = SetjmpTableSize->use_begin(), UE = SetjmpTableSize->use_end();
- UI != UE;) {
- Use &U = *UI;
- ++UI;
- if (auto *I = dyn_cast<Instruction>(U.getUser()))
- if (I->getParent() != &EntryBB)
- SetjmpTableSizeSSA.RewriteUse(U);
+
+ // Convert all longjmpable call instructions to invokes that unwind to the
+ // newly created catch.dispatch.longjmp BB.
+ SmallVector<Instruction *, 64> ToErase;
+ for (auto *BB = &*F.begin(); BB; BB = BB->getNextNode()) {
+ for (Instruction &I : *BB) {
+ auto *CI = dyn_cast<CallInst>(&I);
+ if (!CI)
+ continue;
+ const Value *Callee = CI->getCalledOperand();
+ if (!canLongjmp(Callee))
+ continue;
+ if (isEmAsmCall(Callee))
+ report_fatal_error("Cannot use EM_ASM* alongside setjmp/longjmp in " +
+ F.getName() +
+ ". Please consider using EM_JS, or move the "
+ "EM_ASM into another function.",
+ false);
+      // This is the __wasm_longjmp() call we inserted in this function, which
+      // rethrows the longjmp when the longjmp does not correspond to one of
+      // the setjmps in this function. We should not convert this call to an
+      // invoke.
+ if (CI == WasmLongjmpCI)
+ continue;
+ ToErase.push_back(CI);
+
+ // Even if the callee function has attribute 'nounwind', which is true for
+ // all C functions, it can longjmp, which means it can throw a Wasm
+ // exception now.
+ CI->removeFnAttr(Attribute::NoUnwind);
+ if (Function *CalleeF = CI->getCalledFunction()) {
+ CalleeF->removeFnAttr(Attribute::NoUnwind);
+ }
+
+ IRB.SetInsertPoint(CI);
+ BasicBlock *Tail = SplitBlock(BB, CI->getNextNode());
+ // We will add a new invoke. So remove the branch created when we split
+ // the BB
+ ToErase.push_back(BB->getTerminator());
+ SmallVector<Value *, 8> Args(CI->args());
+ InvokeInst *II =
+ IRB.CreateInvoke(CI->getFunctionType(), CI->getCalledOperand(), Tail,
+ CatchSwitchBB, Args);
+ II->takeName(CI);
+ II->setDebugLoc(CI->getDebugLoc());
+ II->setAttributes(CI->getAttributes());
+ CI->replaceAllUsesWith(II);
+ }
}
- // Finally, our modifications to the cfg can break dominance of SSA variables.
- // For example, in this code,
- // if (x()) { .. setjmp() .. }
- // if (y()) { .. longjmp() .. }
- // We must split the longjmp block, and it can jump into the block splitted
- // from setjmp one. But that means that when we split the setjmp block, it's
- // first part no longer dominates its second part - there is a theoretically
- // possible control flow path where x() is false, then y() is true and we
- // reach the second part of the setjmp block, without ever reaching the first
- // part. So, we rebuild SSA form here.
- rebuildSSA(F);
- return true;
+ for (Instruction *I : ToErase)
+ I->eraseFromParent();
}
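
For the Wasm SjLj path, the payload thrown by __wasm_longjmp() and caught in catch.longjmp is a pointer to a two-field struct. The standalone sketch below mirrors the layout assumed by LongjmpArgsTy above; the authoritative definition lives in Emscripten's library code, the helper names and lambdas here are illustrative, and the real pass first loads the setjmp id stored at env before calling testSetjmp.

#include <cstdio>

// Layout matching LongjmpArgsTy: { i8* env, i32 val }.
struct WasmLongjmpArgs { // spelled __WasmLongjmpArgs in Emscripten
  void *env;             // jmp_buf* passed to longjmp
  int val;               // value passed to longjmp
};

// What catch.longjmp does with the caught pointer: decide via testSetjmp()
// whether the longjmp targets one of this function's setjmps, and either feed
// the label to the setjmp.dispatch switch or rethrow with __wasm_longjmp().
static int handleCaught(WasmLongjmpArgs *Args, int (*TestSetjmp)(void *Env),
                        void (*WasmLongjmp)(void *Env, int Val)) {
  int Label = TestSetjmp(Args->env);
  if (Label == 0) {
    WasmLongjmp(Args->env, Args->val); // not one of ours: rethrow
    return 0;
  }
  return Label; // ours: the dispatch switch resumes setjmp #Label
}

int main() {
  WasmLongjmpArgs Args{nullptr, 3};
  int Label = handleCaught(
      &Args, [](void *) { return 1; },
      [](void *, int) { std::printf("rethrown\n"); });
  std::printf("label=%d val=%d\n", Label, Args.val);
}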
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
index 9ccbee819c35..3a0bef8c765c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
@@ -68,7 +68,7 @@ bool LowerGlobalDtors::runOnModule(Module &M) {
if (!InitList)
return false;
- // Sanity-check @llvm.global_dtor's type.
+ // Validate @llvm.global_dtor's type.
auto *ETy = dyn_cast<StructType>(InitList->getType()->getElementType());
if (!ETy || ETy->getNumElements() != 3 ||
!ETy->getTypeAtIndex(0U)->isIntegerTy() ||
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp
new file mode 100644
index 000000000000..8ff916c28c4e
--- /dev/null
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp
@@ -0,0 +1,84 @@
+//=== WebAssemblyLowerRefTypesIntPtrConv.cpp -
+// Lower IntToPtr and PtrToInt on Reference Types ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Lowers IntToPtr and PtrToInt instructions on reference types to
+/// Trap instructions since they have been allowed to operate
+/// on non-integral pointers.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Utils/WebAssemblyUtilities.h"
+#include "WebAssembly.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-lower-reftypes-intptr-conv"
+
+namespace {
+class WebAssemblyLowerRefTypesIntPtrConv final : public FunctionPass {
+ StringRef getPassName() const override {
+ return "WebAssembly Lower RefTypes Int-Ptr Conversions";
+ }
+
+ bool runOnFunction(Function &MF) override;
+
+public:
+ static char ID; // Pass identification
+ WebAssemblyLowerRefTypesIntPtrConv() : FunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyLowerRefTypesIntPtrConv::ID = 0;
+INITIALIZE_PASS(WebAssemblyLowerRefTypesIntPtrConv, DEBUG_TYPE,
+ "WebAssembly Lower RefTypes Int-Ptr Conversions", false, false)
+
+FunctionPass *llvm::createWebAssemblyLowerRefTypesIntPtrConv() {
+ return new WebAssemblyLowerRefTypesIntPtrConv();
+}
+
+bool WebAssemblyLowerRefTypesIntPtrConv::runOnFunction(Function &F) {
+ LLVM_DEBUG(dbgs() << "********** Lower RefTypes IntPtr Convs **********\n"
+ "********** Function: "
+ << F.getName() << '\n');
+
+  // This function will check for uses of ptrtoint and inttoptr on reference
+  // types and replace them with a trap instruction.
+  //
+  // We replace each such instruction with a call to llvm.debugtrap and
+  // replace its uses with an undef value of the same type.
+ std::set<Instruction *> worklist;
+
+ for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+ PtrToIntInst *PTI = dyn_cast<PtrToIntInst>(&*I);
+ IntToPtrInst *ITP = dyn_cast<IntToPtrInst>(&*I);
+ if (!(PTI && WebAssembly::isRefType(PTI->getPointerOperand()->getType())) &&
+ !(ITP && WebAssembly::isRefType(ITP->getDestTy())))
+ continue;
+
+ UndefValue *U = UndefValue::get(I->getType());
+ I->replaceAllUsesWith(U);
+
+ Function *TrapIntrin =
+ Intrinsic::getDeclaration(F.getParent(), Intrinsic::debugtrap);
+ CallInst::Create(TrapIntrin, {}, "", &*I);
+
+ worklist.insert(&*I);
+ }
+
+ // erase each instruction replaced by trap
+ for (Instruction *I : worklist)
+ I->eraseFromParent();
+
+ return !worklist.empty();
+}
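
The new pass collects the offending instructions into a worklist and erases them only after the scan, so the instruction iteration is never invalidated. The standalone sketch below shows the same idiom on a plain std::list, with negative numbers standing in for ptrtoint/inttoptr on reference types; nothing in it is WebAssembly-specific.

#include <cstdio>
#include <list>
#include <vector>

int main() {
  std::list<int> Insts = {1, -2, 3, -4}; // stand-ins for instructions
  std::vector<std::list<int>::iterator> Worklist;

  // First pass: record what must go, but do not mutate the list yet.
  for (auto It = Insts.begin(); It != Insts.end(); ++It)
    if (*It < 0) // "is a ptrtoint/inttoptr on a reference type"
      Worklist.push_back(It);

  // Second pass: erase. std::list iterators to other elements stay valid,
  // just as the Instruction pointers collected by the pass stay valid until
  // eraseFromParent() is called on them.
  for (auto It : Worklist)
    Insts.erase(It);

  for (int V : Insts)
    std::printf("%d ", V); // prints: 1 3
  std::printf("\n");
}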
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index ec2380a501ab..0b953a90aeab 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -17,6 +17,7 @@
#include "Utils/WebAssemblyTypeUtilities.h"
#include "Utils/WebAssemblyUtilities.h"
#include "WebAssemblyAsmPrinter.h"
+#include "WebAssemblyISelLowering.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -28,6 +29,7 @@
#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+
using namespace llvm;
// This disables the removal of registers when lowering into MC, as required
@@ -38,8 +40,8 @@ cl::opt<bool>
" instruction output for test purposes only."),
cl::init(false));
-extern cl::opt<bool> EnableEmException;
-extern cl::opt<bool> EnableEmSjLj;
+extern cl::opt<bool> WasmEnableEmEH;
+extern cl::opt<bool> WasmEnableEmSjLj;
static void removeRegisterOperands(const MachineInstr *MI, MCInst &OutMI);
@@ -56,15 +58,36 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
const MachineFunction &MF = *MO.getParent()->getParent()->getParent();
const TargetMachine &TM = MF.getTarget();
const Function &CurrentFunc = MF.getFunction();
+ Type *GlobalVT = Global->getValueType();
SmallVector<MVT, 1> VTs;
- computeLegalValueVTs(CurrentFunc, TM, Global->getValueType(), VTs);
- if (VTs.size() != 1)
+ computeLegalValueVTs(CurrentFunc, TM, GlobalVT, VTs);
+
+  // Tables are represented as Arrays in LLVM IR; therefore they reach this
+  // point as aggregate Array types with an element type that is a reference
+  // type.
+ wasm::ValType Type;
+ if (GlobalVT->isArrayTy() &&
+ WebAssembly::isRefType(GlobalVT->getArrayElementType())) {
+ MVT VT;
+ switch (GlobalVT->getArrayElementType()->getPointerAddressSpace()) {
+ case WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF:
+ VT = MVT::funcref;
+ break;
+ case WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_EXTERNREF:
+ VT = MVT::externref;
+ break;
+ default:
+ report_fatal_error("unhandled address space type");
+ }
+ Type = WebAssembly::toValType(VT);
+ } else if (VTs.size() == 1) {
+ Type = WebAssembly::toValType(VTs[0]);
+ } else
report_fatal_error("Aggregate globals not yet implemented");
- bool Mutable = true;
- wasm::ValType Type = WebAssembly::toValType(VTs[0]);
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
- WasmSym->setGlobalType(wasm::WasmGlobalType{uint8_t(Type), Mutable});
+ WasmSym->setGlobalType(
+ wasm::WasmGlobalType{uint8_t(Type), /*Mutable=*/true});
}
return WasmSym;
}
@@ -82,7 +105,7 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
bool InvokeDetected = false;
auto *WasmSym = Printer.getMCSymbolForFunction(
- F, EnableEmException || EnableEmSjLj, Signature.get(), InvokeDetected);
+ F, WasmEnableEmEH || WasmEnableEmSjLj, Signature.get(), InvokeDetected);
WasmSym->setSignature(Signature.get());
Printer.addSignature(std::move(Signature));
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
@@ -102,6 +125,9 @@ MCOperand WebAssemblyMCInstLower::lowerSymbolOperand(const MachineOperand &MO,
switch (TargetFlags) {
case WebAssemblyII::MO_NO_FLAG:
break;
+ case WebAssemblyII::MO_GOT_TLS:
+ Kind = MCSymbolRefExpr::VK_WASM_GOT_TLS;
+ break;
case WebAssemblyII::MO_GOT:
Kind = MCSymbolRefExpr::VK_GOT;
break;
@@ -275,15 +301,9 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI,
MCOp = lowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
break;
case MachineOperand::MO_ExternalSymbol:
- // The target flag indicates whether this is a symbol for a
- // variable or a function.
- assert(MO.getTargetFlags() == 0 &&
- "WebAssembly uses only symbol flags on ExternalSymbols");
MCOp = lowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
break;
case MachineOperand::MO_MCSymbol:
- // This is currently used only for LSDA symbols (GCC_except_table),
- // because global addresses or other external symbols are handled above.
assert(MO.getTargetFlags() == 0 &&
"WebAssembly does not use target flags on MCSymbol");
MCOp = lowerSymbolOperand(MO, MO.getMCSymbol());
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCLowerPrePass.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCLowerPrePass.cpp
index 3daffd1c23a2..37ac8e75f4b7 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMCLowerPrePass.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCLowerPrePass.cpp
@@ -33,21 +33,21 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-mclower-prepass"
namespace {
-class WebAssemblyMCLowerPrePass final : public MachineFunctionPass {
+class WebAssemblyMCLowerPrePass final : public ModulePass {
StringRef getPassName() const override {
return "WebAssembly MC Lower Pre Pass";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
+ ModulePass::getAnalysisUsage(AU);
}
- bool runOnMachineFunction(MachineFunction &MF) override;
+ bool runOnModule(Module &M) override;
public:
static char ID; // Pass identification, replacement for typeid
- WebAssemblyMCLowerPrePass() : MachineFunctionPass(ID) {}
+ WebAssemblyMCLowerPrePass() : ModulePass(ID) {}
};
} // end anonymous namespace
@@ -57,30 +57,43 @@ INITIALIZE_PASS(
"Collects information ahead of time for MC lowering",
false, false)
-FunctionPass *llvm::createWebAssemblyMCLowerPrePass() {
+ModulePass *llvm::createWebAssemblyMCLowerPrePass() {
return new WebAssemblyMCLowerPrePass();
}
-bool WebAssemblyMCLowerPrePass::runOnMachineFunction(MachineFunction &MF) {
- LLVM_DEBUG(dbgs() << "********** MC Lower Pre Pass **********\n"
- "********** Function: "
- << MF.getName() << '\n');
+// NOTE: this is a ModulePass since we need to enforce that this code has run
+// for all functions before AsmPrinter. If this way of doing things is ever
+// suboptimal, we could opt to make it a MachineFunctionPass and instead use
+// something like createBarrierNoopPass() to enforce ordering.
+bool WebAssemblyMCLowerPrePass::runOnModule(Module &M) {
+ auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
+ if (!MMIWP)
+ return true;
- MachineModuleInfo &MMI = MF.getMMI();
+ MachineModuleInfo &MMI = MMIWP->getMMI();
MachineModuleInfoWasm &MMIW = MMI.getObjFileInfo<MachineModuleInfoWasm>();
- for (MachineBasicBlock &MBB : MF) {
- for (auto &MI : MBB) {
- // FIXME: what should all be filtered out beyond these?
- if (MI.isDebugInstr() || MI.isInlineAsm())
- continue;
- for (MachineOperand &MO : MI.uses()) {
- if (MO.isSymbol()) {
- MMIW.MachineSymbolsUsed.insert(MO.getSymbolName());
+ for (Function &F : M) {
+ MachineFunction *MF = MMI.getMachineFunction(F);
+ if (!MF)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "********** MC Lower Pre Pass **********\n"
+ "********** Function: "
+ << MF->getName() << '\n');
+
+ for (MachineBasicBlock &MBB : *MF) {
+ for (auto &MI : MBB) {
+ // FIXME: what should all be filtered out beyond these?
+ if (MI.isDebugInstr() || MI.isInlineAsm())
+ continue;
+ for (MachineOperand &MO : MI.uses()) {
+ if (MO.isSymbol()) {
+ MMIW.MachineSymbolsUsed.insert(MO.getSymbolName());
+ }
}
}
}
}
-
return true;
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
index 9aea65cba280..2180f57c106a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
@@ -96,9 +96,8 @@ static bool replaceDominatedUses(MachineBasicBlock &MBB, MachineInstr &MI,
SmallVector<SlotIndex, 4> Indices;
- for (auto I = MRI.use_nodbg_begin(FromReg), E = MRI.use_nodbg_end();
- I != E;) {
- MachineOperand &O = *I++;
+ for (MachineOperand &O :
+ llvm::make_early_inc_range(MRI.use_nodbg_operands(FromReg))) {
MachineInstr *Where = O.getParent();
// Check that MI dominates the instruction in the normal way.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
index 6bfed1a7195c..9d83a75a8247 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
@@ -106,13 +106,12 @@ bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction(
// instructions to satisfy LiveIntervals' requirement that all uses be
// dominated by defs. Now that LiveIntervals has computed which of these
// defs are actually needed and which are dead, remove the dead ones.
- for (auto MII = MF.begin()->begin(), MIE = MF.begin()->end(); MII != MIE;) {
- MachineInstr *MI = &*MII++;
- if (MI->isImplicitDef() && MI->getOperand(0).isDead()) {
- LiveInterval &LI = LIS.getInterval(MI->getOperand(0).getReg());
- LIS.removeVRegDefAt(LI, LIS.getInstructionIndex(*MI).getRegSlot());
- LIS.RemoveMachineInstrFromMaps(*MI);
- MI->eraseFromParent();
+ for (MachineInstr &MI : llvm::make_early_inc_range(MF.front())) {
+ if (MI.isImplicitDef() && MI.getOperand(0).isDead()) {
+ LiveInterval &LI = LIS.getInterval(MI.getOperand(0).getReg());
+ LIS.removeVRegDefAt(LI, LIS.getInstructionIndex(MI).getRegSlot());
+ LIS.RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
}
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
index 96390de8f5e7..7912aeb4f502 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
@@ -56,7 +56,7 @@ FunctionPass *llvm::createWebAssemblyOptimizeReturned() {
}
void OptimizeReturned::visitCallBase(CallBase &CB) {
- for (unsigned I = 0, E = CB.getNumArgOperands(); I < E; ++I)
+ for (unsigned I = 0, E = CB.arg_size(); I < E; ++I)
if (CB.paramHasAttr(I, Attribute::Returned)) {
Value *Arg = CB.getArgOperand(I);
// Ignore constants, globals, undef, etc.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
index ed5f7ccc854f..8b8593ddcbdd 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
@@ -112,8 +112,7 @@ bool WebAssemblyPrepareForLiveIntervals::runOnMachineFunction(
// Move ARGUMENT_* instructions to the top of the entry block, so that their
// liveness reflects the fact that these really are live-in values.
- for (auto MII = Entry.begin(), MIE = Entry.end(); MII != MIE;) {
- MachineInstr &MI = *MII++;
+ for (MachineInstr &MI : llvm::make_early_inc_range(Entry)) {
if (WebAssembly::isArgument(MI.getOpcode())) {
MI.removeFromParent();
Entry.insert(Entry.begin(), &MI);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index d6adc2fd155c..42419259802e 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -497,6 +497,10 @@ static unsigned getTeeOpcode(const TargetRegisterClass *RC) {
return WebAssembly::TEE_F64;
if (RC == &WebAssembly::V128RegClass)
return WebAssembly::TEE_V128;
+ if (RC == &WebAssembly::EXTERNREFRegClass)
+ return WebAssembly::TEE_EXTERNREF;
+ if (RC == &WebAssembly::FUNCREFRegClass)
+ return WebAssembly::TEE_FUNCREF;
llvm_unreachable("Unexpected register class");
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
index 9f5d6b2a9a47..71f0bd28e1be 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
@@ -85,8 +85,8 @@ bool WebAssemblyReplacePhysRegs::runOnMachineFunction(MachineFunction &MF) {
// Replace explicit uses of the physical register with a virtual register.
const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(PReg);
unsigned VReg = WebAssembly::NoRegister;
- for (auto I = MRI.reg_begin(PReg), E = MRI.reg_end(); I != E;) {
- MachineOperand &MO = *I++;
+ for (MachineOperand &MO :
+ llvm::make_early_inc_range(MRI.reg_operands(PReg))) {
if (!MO.isImplicit()) {
if (VReg == WebAssembly::NoRegister) {
VReg = MRI.createVirtualRegister(RC);
@@ -101,8 +101,6 @@ bool WebAssemblyReplacePhysRegs::runOnMachineFunction(MachineFunction &MF) {
}
}
MO.setReg(VReg);
- if (MO.getParent()->isDebugValue())
- MO.setIsDebug();
Changed = true;
}
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
index 7943e1ecc8e1..add3c799f4aa 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
@@ -15,7 +15,7 @@
#include "WebAssemblySubtarget.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "WebAssemblyInstrInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
#define DEBUG_TYPE "wasm-subtarget"
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
index 43d5871f0aa0..b553c8150652 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
@@ -36,6 +36,7 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
enum SIMDEnum {
NoSIMD,
SIMD128,
+ RelaxedSIMD,
} SIMDLevel = NoSIMD;
bool HasAtomics = false;
@@ -89,6 +90,7 @@ public:
// Predicates used by WebAssemblyInstrInfo.td.
bool hasAddr64() const { return TargetTriple.isArch64Bit(); }
bool hasSIMD128() const { return SIMDLevel >= SIMD128; }
+ bool hasRelaxedSIMD() const { return SIMDLevel >= RelaxedSIMD; }
bool hasAtomics() const { return HasAtomics; }
bool hasNontrappingFPToInt() const { return HasNontrappingFPToInt; }
bool hasSignExt() const { return HasSignExt; }
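
A small hedged illustration of why the enumerator order matters for the predicate added above (the snippet is not from the patch):

// SIMDEnum is ordered NoSIMD < SIMD128 < RelaxedSIMD, so a subtarget built
// with relaxed SIMD also reports hasSIMD128(); both predicates are ordered
// comparisons against SIMDLevel.
enum SIMDEnum { NoSIMD, SIMD128, RelaxedSIMD } SIMDLevel = RelaxedSIMD;
bool HasSIMD128 = SIMDLevel >= SIMD128;         // true
bool HasRelaxedSIMD = SIMDLevel >= RelaxedSIMD; // true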
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 746a7599c58c..80abccd74782 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -24,7 +24,7 @@
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LowerAtomic.h"
@@ -34,17 +34,27 @@ using namespace llvm;
#define DEBUG_TYPE "wasm"
// Emscripten's asm.js-style exception handling
-cl::opt<bool> EnableEmException(
- "enable-emscripten-cxx-exceptions",
- cl::desc("WebAssembly Emscripten-style exception handling"),
- cl::init(false));
+cl::opt<bool>
+ WasmEnableEmEH("enable-emscripten-cxx-exceptions",
+ cl::desc("WebAssembly Emscripten-style exception handling"),
+ cl::init(false));
// Emscripten's asm.js-style setjmp/longjmp handling
-cl::opt<bool> EnableEmSjLj(
+cl::opt<bool> WasmEnableEmSjLj(
"enable-emscripten-sjlj",
cl::desc("WebAssembly Emscripten-style setjmp/longjmp handling"),
cl::init(false));
+// Exception handling using wasm EH instructions
+cl::opt<bool> WasmEnableEH("wasm-enable-eh",
+ cl::desc("WebAssembly exception handling"),
+ cl::init(false));
+
+// setjmp/longjmp handling using wasm EH instructions
+cl::opt<bool> WasmEnableSjLj("wasm-enable-sjlj",
+ cl::desc("WebAssembly setjmp/longjmp handling"),
+ cl::init(false));
+
// A command-line option to keep implicit locals
// for the purpose of testing with lit/llc ONLY.
// This produces output which is not valid WebAssembly, and is not supported
@@ -123,12 +133,14 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
: LLVMTargetMachine(
T,
TT.isArch64Bit()
- ? (TT.isOSEmscripten()
- ? "e-m:e-p:64:64-i64:64-f128:64-n32:64-S128-ni:1:10:20"
- : "e-m:e-p:64:64-i64:64-n32:64-S128-ni:1:10:20")
- : (TT.isOSEmscripten()
- ? "e-m:e-p:32:32-i64:64-f128:64-n32:64-S128-ni:1:10:20"
- : "e-m:e-p:32:32-i64:64-n32:64-S128-ni:1:10:20"),
+ ? (TT.isOSEmscripten() ? "e-m:e-p:64:64-p10:8:8-p20:8:8-i64:64-"
+ "f128:64-n32:64-S128-ni:1:10:20"
+ : "e-m:e-p:64:64-p10:8:8-p20:8:8-i64:64-"
+ "n32:64-S128-ni:1:10:20")
+ : (TT.isOSEmscripten() ? "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-"
+ "f128:64-n32:64-S128-ni:1:10:20"
+ : "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-"
+ "n32:64-S128-ni:1:10:20"),
TT, CPU, FS, Options, getEffectiveRelocModel(RM, TT),
getEffectiveCodeModel(CM, CodeModel::Large), OL),
TLOF(new WebAssemblyTargetObjectFile()) {
@@ -332,6 +344,7 @@ public:
void addPostRegAlloc() override;
bool addGCPasses() override { return false; }
void addPreEmitPass() override;
+ bool addPreISel() override;
// No reg alloc
bool addRegAssignAndRewriteFast() override { return false; }
@@ -355,6 +368,43 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) {
return nullptr; // No reg alloc
}
+static void checkSanityForEHAndSjLj(const TargetMachine *TM) {
+ // Sanity checking related to -exception-model
+ if (TM->Options.ExceptionModel != ExceptionHandling::None &&
+ TM->Options.ExceptionModel != ExceptionHandling::Wasm)
+ report_fatal_error("-exception-model should be either 'none' or 'wasm'");
+ if (WasmEnableEmEH && TM->Options.ExceptionModel == ExceptionHandling::Wasm)
+ report_fatal_error("-exception-model=wasm not allowed with "
+ "-enable-emscripten-cxx-exceptions");
+ if (WasmEnableEH && TM->Options.ExceptionModel != ExceptionHandling::Wasm)
+ report_fatal_error(
+ "-wasm-enable-eh only allowed with -exception-model=wasm");
+ if (WasmEnableSjLj && TM->Options.ExceptionModel != ExceptionHandling::Wasm)
+ report_fatal_error(
+ "-wasm-enable-sjlj only allowed with -exception-model=wasm");
+ if ((!WasmEnableEH && !WasmEnableSjLj) &&
+ TM->Options.ExceptionModel == ExceptionHandling::Wasm)
+ report_fatal_error(
+ "-exception-model=wasm only allowed with at least one of "
+ "-wasm-enable-eh or -wasm-enable-sjj");
+
+ // You can't enable two modes of EH at the same time
+ if (WasmEnableEmEH && WasmEnableEH)
+ report_fatal_error(
+ "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-eh");
+ // You can't enable two modes of SjLj at the same time
+ if (WasmEnableEmSjLj && WasmEnableSjLj)
+ report_fatal_error(
+ "-enable-emscripten-sjlj not allowed with -wasm-enable-sjlj");
+ // You can't mix Emscripten EH with Wasm SjLj.
+ if (WasmEnableEmEH && WasmEnableSjLj)
+ report_fatal_error(
+ "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj");
+ // Currently it is allowed to mix Wasm EH with Emscripten SjLj as an interim
+ // measure, but some code will error out at compile time in this combination.
+ // See WebAssemblyLowerEmscriptenEHSjLj pass for details.
+}
+
//===----------------------------------------------------------------------===//
// The following functions are called from lib/CodeGen/Passes.cpp to modify
// the CodeGen pass sequence.
@@ -381,23 +431,27 @@ void WebAssemblyPassConfig::addIRPasses() {
if (getOptLevel() != CodeGenOpt::None)
addPass(createWebAssemblyOptimizeReturned());
+ checkSanityForEHAndSjLj(TM);
+
// If exception handling is not enabled and setjmp/longjmp handling is
// enabled, we lower invokes into calls and delete unreachable landingpad
// blocks. Lowering invokes when there is no EH support is done in
- // TargetPassConfig::addPassesToHandleExceptions, but this runs after this
- // function and SjLj handling expects all invokes to be lowered before.
- if (!EnableEmException &&
- TM->Options.ExceptionModel == ExceptionHandling::None) {
+ // TargetPassConfig::addPassesToHandleExceptions, but that runs after these IR
+ // passes and Emscripten SjLj handling expects all invokes to be lowered
+ // before.
+ if (!WasmEnableEmEH && !WasmEnableEH) {
addPass(createLowerInvokePass());
// The lower invoke pass may create unreachable code. Remove it in order not
// to process dead blocks in setjmp/longjmp handling.
addPass(createUnreachableBlockEliminationPass());
}
- // Handle exceptions and setjmp/longjmp if enabled.
- if (EnableEmException || EnableEmSjLj)
- addPass(createWebAssemblyLowerEmscriptenEHSjLj(EnableEmException,
- EnableEmSjLj));
+ // Handle exceptions and setjmp/longjmp if enabled. Unlike Wasm EH preparation
+ // done in WasmEHPrepare pass, Wasm SjLj preparation shares libraries and
+ // transformation algorithms with Emscripten SjLj, so we run
+ // LowerEmscriptenEHSjLj pass also when Wasm SjLj is enabled.
+ if (WasmEnableEmEH || WasmEnableEmSjLj || WasmEnableSjLj)
+ addPass(createWebAssemblyLowerEmscriptenEHSjLj());
// Expand indirectbr instructions to switches.
addPass(createIndirectBrExpandPass());
@@ -518,6 +572,12 @@ void WebAssemblyPassConfig::addPreEmitPass() {
addPass(createWebAssemblyMCLowerPrePass());
}
+bool WebAssemblyPassConfig::addPreISel() {
+ TargetPassConfig::addPreISel();
+ addPass(createWebAssemblyLowerRefTypesIntPtrConv());
+ return false;
+}
+
yaml::MachineFunctionInfo *
WebAssemblyTargetMachine::createDefaultFuncInfoYAML() const {
return new yaml::WebAssemblyFunctionInfo();
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index d9bc7c6d2c3f..f1ebcbc6fc51 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -114,7 +114,8 @@ bool WebAssemblyTTIImpl::areInlineCompatible(const Function *Caller,
}
void WebAssemblyTTIImpl::getUnrollingPreferences(
- Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) const {
+ Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE) const {
// Scan the loop: don't unroll loops with calls. This is a standard approach
// for most (all?) targets.
for (BasicBlock *BB : L->blocks())
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 1a33bd20d027..50036f7f7e98 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -49,7 +49,8 @@ public:
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP) const;
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE) const;
/// @}
@@ -59,8 +60,7 @@ public:
unsigned getNumberOfRegisters(unsigned ClassID) const;
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
InstructionCost getArithmeticInstrCost(
- unsigned Opcode, Type *Ty,
- TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 67ca67d6cee6..8ce6b47d10e8 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -31,10 +31,10 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <memory>
@@ -1758,8 +1758,8 @@ bool X86AsmParser::CreateMemForMSInlineAsm(
// It is widely common for MS InlineAsm to use a global variable and one/two
// registers in a memory expression, even though it is inaccessible via rip/eip.
if (IsGlobalLV && (BaseReg || IndexReg)) {
- Operands.push_back(
- X86Operand::CreateMem(getPointerWidth(), Disp, Start, End));
+ Operands.push_back(X86Operand::CreateMem(getPointerWidth(), Disp, Start,
+ End, Size, Identifier, Decl));
return false;
}
// Otherwise, we set the base register to a non-zero value
@@ -2551,6 +2551,8 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) {
StringRef ErrMsg;
unsigned BaseReg = SM.getBaseReg();
unsigned IndexReg = SM.getIndexReg();
+ if (IndexReg && BaseReg == X86::RIP)
+ BaseReg = 0;
unsigned Scale = SM.getScale();
if (!PtrInOperand)
Size = SM.getElementSize() << 3;
@@ -2655,7 +2657,7 @@ bool X86AsmParser::ParseATTOperand(OperandVector &Operands) {
Expr = nullptr;
Reg = RE->getRegNo();
- // Sanity check register.
+ // Check the register.
if (Reg == X86::EIZ || Reg == X86::RIZ)
return Error(
Loc, "%eiz and %riz can only be used as index registers",
@@ -2753,6 +2755,7 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands) {
.Case("1to4", "{1to4}")
.Case("1to8", "{1to8}")
.Case("1to16", "{1to16}")
+ .Case("1to32", "{1to32}")
.Default(nullptr);
if (!BroadcastPrimitive)
return TokError("Invalid memory broadcast primitive.");
@@ -2914,7 +2917,7 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp,
check(!isa<X86MCExpr>(E), BaseLoc, "expected register here"))
return true;
- // Sanity check register.
+ // Check the register.
BaseReg = cast<X86MCExpr>(E)->getRegNo();
if (BaseReg == X86::EIZ || BaseReg == X86::RIZ)
return Error(BaseLoc, "eiz and riz can only be used as index registers",
@@ -3126,9 +3129,10 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
unsigned ComparisonPredicate = ~0U;
- // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}.
+ // FIXME: Hack to recognize cmp<comparison code>{sh,ss,sd,ph,ps,pd}.
if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) &&
(PatchedName.endswith("ss") || PatchedName.endswith("sd") ||
+ PatchedName.endswith("sh") || PatchedName.endswith("ph") ||
PatchedName.endswith("ps") || PatchedName.endswith("pd"))) {
bool IsVCMP = PatchedName[0] == 'v';
unsigned CCIdx = IsVCMP ? 4 : 3;
@@ -3182,7 +3186,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
.Case("gt_oq", 0x1E)
.Case("true_us", 0x1F)
.Default(~0U);
- if (CC != ~0U && (IsVCMP || CC < 8)) {
+ if (CC != ~0U && (IsVCMP || CC < 8) &&
+ (IsVCMP || PatchedName.back() != 'h')) {
if (PatchedName.endswith("ss"))
PatchedName = IsVCMP ? "vcmpss" : "cmpss";
else if (PatchedName.endswith("sd"))
@@ -3191,6 +3196,10 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
PatchedName = IsVCMP ? "vcmpps" : "cmpps";
else if (PatchedName.endswith("pd"))
PatchedName = IsVCMP ? "vcmppd" : "cmppd";
+ else if (PatchedName.endswith("sh"))
+ PatchedName = "vcmpsh";
+ else if (PatchedName.endswith("ph"))
+ PatchedName = "vcmpph";
else
llvm_unreachable("Unexpected suffix!");
@@ -3859,6 +3868,176 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
}
break;
}
+ case X86::VFCMADDCPHZ128m:
+ case X86::VFCMADDCPHZ256m:
+ case X86::VFCMADDCPHZm:
+ case X86::VFCMADDCPHZ128mb:
+ case X86::VFCMADDCPHZ256mb:
+ case X86::VFCMADDCPHZmb:
+ case X86::VFCMADDCPHZ128mbk:
+ case X86::VFCMADDCPHZ256mbk:
+ case X86::VFCMADDCPHZmbk:
+ case X86::VFCMADDCPHZ128mbkz:
+ case X86::VFCMADDCPHZ256mbkz:
+ case X86::VFCMADDCPHZmbkz:
+ case X86::VFCMADDCPHZ128mk:
+ case X86::VFCMADDCPHZ256mk:
+ case X86::VFCMADDCPHZmk:
+ case X86::VFCMADDCPHZ128mkz:
+ case X86::VFCMADDCPHZ256mkz:
+ case X86::VFCMADDCPHZmkz:
+ case X86::VFCMADDCPHZ128r:
+ case X86::VFCMADDCPHZ256r:
+ case X86::VFCMADDCPHZr:
+ case X86::VFCMADDCPHZ128rk:
+ case X86::VFCMADDCPHZ256rk:
+ case X86::VFCMADDCPHZrk:
+ case X86::VFCMADDCPHZ128rkz:
+ case X86::VFCMADDCPHZ256rkz:
+ case X86::VFCMADDCPHZrkz:
+ case X86::VFCMADDCPHZrb:
+ case X86::VFCMADDCPHZrbk:
+ case X86::VFCMADDCPHZrbkz:
+ case X86::VFCMADDCSHZm:
+ case X86::VFCMADDCSHZmk:
+ case X86::VFCMADDCSHZmkz:
+ case X86::VFCMADDCSHZr:
+ case X86::VFCMADDCSHZrb:
+ case X86::VFCMADDCSHZrbk:
+ case X86::VFCMADDCSHZrbkz:
+ case X86::VFCMADDCSHZrk:
+ case X86::VFCMADDCSHZrkz:
+ case X86::VFMADDCPHZ128m:
+ case X86::VFMADDCPHZ256m:
+ case X86::VFMADDCPHZm:
+ case X86::VFMADDCPHZ128mb:
+ case X86::VFMADDCPHZ256mb:
+ case X86::VFMADDCPHZmb:
+ case X86::VFMADDCPHZ128mbk:
+ case X86::VFMADDCPHZ256mbk:
+ case X86::VFMADDCPHZmbk:
+ case X86::VFMADDCPHZ128mbkz:
+ case X86::VFMADDCPHZ256mbkz:
+ case X86::VFMADDCPHZmbkz:
+ case X86::VFMADDCPHZ128mk:
+ case X86::VFMADDCPHZ256mk:
+ case X86::VFMADDCPHZmk:
+ case X86::VFMADDCPHZ128mkz:
+ case X86::VFMADDCPHZ256mkz:
+ case X86::VFMADDCPHZmkz:
+ case X86::VFMADDCPHZ128r:
+ case X86::VFMADDCPHZ256r:
+ case X86::VFMADDCPHZr:
+ case X86::VFMADDCPHZ128rk:
+ case X86::VFMADDCPHZ256rk:
+ case X86::VFMADDCPHZrk:
+ case X86::VFMADDCPHZ128rkz:
+ case X86::VFMADDCPHZ256rkz:
+ case X86::VFMADDCPHZrkz:
+ case X86::VFMADDCPHZrb:
+ case X86::VFMADDCPHZrbk:
+ case X86::VFMADDCPHZrbkz:
+ case X86::VFMADDCSHZm:
+ case X86::VFMADDCSHZmk:
+ case X86::VFMADDCSHZmkz:
+ case X86::VFMADDCSHZr:
+ case X86::VFMADDCSHZrb:
+ case X86::VFMADDCSHZrbk:
+ case X86::VFMADDCSHZrbkz:
+ case X86::VFMADDCSHZrk:
+ case X86::VFMADDCSHZrkz: {
+ unsigned Dest = Inst.getOperand(0).getReg();
+ for (unsigned i = 2; i < Inst.getNumOperands(); i++)
+ if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg())
+ return Warning(Ops[0]->getStartLoc(), "Destination register should be "
+ "distinct from source registers");
+ break;
+ }
+ case X86::VFCMULCPHZ128rm:
+ case X86::VFCMULCPHZ256rm:
+ case X86::VFCMULCPHZrm:
+ case X86::VFCMULCPHZ128rmb:
+ case X86::VFCMULCPHZ256rmb:
+ case X86::VFCMULCPHZrmb:
+ case X86::VFCMULCPHZ128rmbk:
+ case X86::VFCMULCPHZ256rmbk:
+ case X86::VFCMULCPHZrmbk:
+ case X86::VFCMULCPHZ128rmbkz:
+ case X86::VFCMULCPHZ256rmbkz:
+ case X86::VFCMULCPHZrmbkz:
+ case X86::VFCMULCPHZ128rmk:
+ case X86::VFCMULCPHZ256rmk:
+ case X86::VFCMULCPHZrmk:
+ case X86::VFCMULCPHZ128rmkz:
+ case X86::VFCMULCPHZ256rmkz:
+ case X86::VFCMULCPHZrmkz:
+ case X86::VFCMULCPHZ128rr:
+ case X86::VFCMULCPHZ256rr:
+ case X86::VFCMULCPHZrr:
+ case X86::VFCMULCPHZ128rrk:
+ case X86::VFCMULCPHZ256rrk:
+ case X86::VFCMULCPHZrrk:
+ case X86::VFCMULCPHZ128rrkz:
+ case X86::VFCMULCPHZ256rrkz:
+ case X86::VFCMULCPHZrrkz:
+ case X86::VFCMULCPHZrrb:
+ case X86::VFCMULCPHZrrbk:
+ case X86::VFCMULCPHZrrbkz:
+ case X86::VFCMULCSHZrm:
+ case X86::VFCMULCSHZrmk:
+ case X86::VFCMULCSHZrmkz:
+ case X86::VFCMULCSHZrr:
+ case X86::VFCMULCSHZrrb:
+ case X86::VFCMULCSHZrrbk:
+ case X86::VFCMULCSHZrrbkz:
+ case X86::VFCMULCSHZrrk:
+ case X86::VFCMULCSHZrrkz:
+ case X86::VFMULCPHZ128rm:
+ case X86::VFMULCPHZ256rm:
+ case X86::VFMULCPHZrm:
+ case X86::VFMULCPHZ128rmb:
+ case X86::VFMULCPHZ256rmb:
+ case X86::VFMULCPHZrmb:
+ case X86::VFMULCPHZ128rmbk:
+ case X86::VFMULCPHZ256rmbk:
+ case X86::VFMULCPHZrmbk:
+ case X86::VFMULCPHZ128rmbkz:
+ case X86::VFMULCPHZ256rmbkz:
+ case X86::VFMULCPHZrmbkz:
+ case X86::VFMULCPHZ128rmk:
+ case X86::VFMULCPHZ256rmk:
+ case X86::VFMULCPHZrmk:
+ case X86::VFMULCPHZ128rmkz:
+ case X86::VFMULCPHZ256rmkz:
+ case X86::VFMULCPHZrmkz:
+ case X86::VFMULCPHZ128rr:
+ case X86::VFMULCPHZ256rr:
+ case X86::VFMULCPHZrr:
+ case X86::VFMULCPHZ128rrk:
+ case X86::VFMULCPHZ256rrk:
+ case X86::VFMULCPHZrrk:
+ case X86::VFMULCPHZ128rrkz:
+ case X86::VFMULCPHZ256rrkz:
+ case X86::VFMULCPHZrrkz:
+ case X86::VFMULCPHZrrb:
+ case X86::VFMULCPHZrrbk:
+ case X86::VFMULCPHZrrbkz:
+ case X86::VFMULCSHZrm:
+ case X86::VFMULCSHZrmk:
+ case X86::VFMULCSHZrmkz:
+ case X86::VFMULCSHZrr:
+ case X86::VFMULCSHZrrb:
+ case X86::VFMULCSHZrrbk:
+ case X86::VFMULCSHZrrbkz:
+ case X86::VFMULCSHZrrk:
+ case X86::VFMULCSHZrrkz: {
+ unsigned Dest = Inst.getOperand(0).getReg();
+ for (unsigned i = 1; i < Inst.getNumOperands(); i++)
+ if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg())
+ return Warning(Ops[0]->getStartLoc(), "Destination register should be "
+ "distinct from source registers");
+ break;
+ }
}
const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
@@ -3916,12 +4095,12 @@ void X86AsmParser::applyLVICFIMitigation(MCInst &Inst, MCStreamer &Out) {
// be found here:
// https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions
switch (Inst.getOpcode()) {
- case X86::RETW:
- case X86::RETL:
- case X86::RETQ:
- case X86::RETIL:
- case X86::RETIQ:
- case X86::RETIW: {
+ case X86::RET16:
+ case X86::RET32:
+ case X86::RET64:
+ case X86::RETI16:
+ case X86::RETI32:
+ case X86::RETI64: {
MCInst ShlInst, FenceInst;
bool Parse32 = is32BitMode() || Code16GCC;
unsigned Basereg =
@@ -4093,24 +4272,6 @@ unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) {
ForcedVEXEncoding != VEXEncoding_VEX3))
return Match_Unsupported;
- // These instructions match ambiguously with their VEX encoded counterparts
- // and appear first in the matching table. Reject them unless we're forcing
- // EVEX encoding.
- // FIXME: We really need a way to break the ambiguity.
- switch (Opc) {
- case X86::VCVTSD2SIZrm_Int:
- case X86::VCVTSD2SI64Zrm_Int:
- case X86::VCVTSS2SIZrm_Int:
- case X86::VCVTSS2SI64Zrm_Int:
- case X86::VCVTTSD2SIZrm: case X86::VCVTTSD2SIZrm_Int:
- case X86::VCVTTSD2SI64Zrm: case X86::VCVTTSD2SI64Zrm_Int:
- case X86::VCVTTSS2SIZrm: case X86::VCVTTSS2SIZrm_Int:
- case X86::VCVTTSS2SI64Zrm: case X86::VCVTTSS2SI64Zrm_Int:
- if (ForcedVEXEncoding != VEXEncoding_EVEX)
- return Match_Unsupported;
- break;
- }
-
return Match_Success;
}
@@ -4678,7 +4839,7 @@ bool X86AsmParser::parseDirectiveArch() {
bool X86AsmParser::parseDirectiveNops(SMLoc L) {
int64_t NumBytes = 0, Control = 0;
SMLoc NumBytesLoc, ControlLoc;
- const MCSubtargetInfo STI = getSTI();
+ const MCSubtargetInfo& STI = getSTI();
NumBytesLoc = getTok().getLoc();
if (getParser().checkForValidSection() ||
getParser().parseAbsoluteExpression(NumBytes))
@@ -4704,7 +4865,7 @@ bool X86AsmParser::parseDirectiveNops(SMLoc L) {
}
/// Emit nops
- getParser().getStreamer().emitNops(NumBytes, Control, L);
+ getParser().getStreamer().emitNops(NumBytes, Control, L, STI);
return false;
}
@@ -4717,11 +4878,11 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) {
const MCSection *Section = getStreamer().getCurrentSectionOnly();
if (!Section) {
- getStreamer().InitSections(false);
+ getStreamer().initSections(false, getSTI());
Section = getStreamer().getCurrentSectionOnly();
}
if (Section->UseCodeAlign())
- getStreamer().emitCodeAlignment(2, 0);
+ getStreamer().emitCodeAlignment(2, &getSTI(), 0);
else
getStreamer().emitValueToAlignment(2, 0, 1, 0);
return false;
diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h
index 2bc6492483c0..9164c699b569 100644
--- a/llvm/lib/Target/X86/AsmParser/X86Operand.h
+++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h
@@ -34,7 +34,6 @@ struct X86Operand final : public MCParsedAsmOperand {
StringRef SymName;
void *OpDecl;
bool AddressOf;
- bool CallOperand;
struct TokOp {
const char *Data;
@@ -79,7 +78,7 @@ struct X86Operand final : public MCParsedAsmOperand {
X86Operand(KindTy K, SMLoc Start, SMLoc End)
: Kind(K), StartLoc(Start), EndLoc(End), OpDecl(nullptr),
- AddressOf(false), CallOperand(false) {}
+ AddressOf(false) {}
StringRef getSymName() override { return SymName; }
void *getOpDecl() override { return OpDecl; }
diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 82581eb3c30a..908eb6d1fab1 100644
--- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -83,9 +83,9 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -150,6 +150,12 @@ static InstrUID decode(OpcodeType type, InstructionContext insnContext,
dec =
&THREEDNOW_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
break;
+ case MAP5:
+ dec = &MAP5_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case MAP6:
+ dec = &MAP6_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
}
switch (dec->modrm_type) {
@@ -332,7 +338,7 @@ static int readPrefixes(struct InternalInstruction *insn) {
}
if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) &&
- ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) {
+ ((~byte1 & 0x8) == 0x8) && ((byte2 & 0x4) == 0x4)) {
insn->vectorExtensionType = TYPE_EVEX;
} else {
--insn->readerCursor; // unconsume byte1
@@ -800,10 +806,6 @@ static int readModRM(struct InternalInstruction *insn) {
return prefix##_DR0 + index; \
case TYPE_CONTROLREG: \
return prefix##_CR0 + index; \
- case TYPE_BNDR: \
- if (index > 3) \
- *valid = 0; \
- return prefix##_BND0 + index; \
case TYPE_MVSIBX: \
return prefix##_XMM0 + index; \
case TYPE_MVSIBY: \
@@ -876,11 +878,11 @@ static bool readOpcode(struct InternalInstruction *insn) {
insn->opcodeType = ONEBYTE;
if (insn->vectorExtensionType == TYPE_EVEX) {
- switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) {
+ switch (mmmFromEVEX2of4(insn->vectorExtensionPrefix[1])) {
default:
LLVM_DEBUG(
- dbgs() << format("Unhandled mm field for instruction (0x%hhx)",
- mmFromEVEX2of4(insn->vectorExtensionPrefix[1])));
+ dbgs() << format("Unhandled mmm field for instruction (0x%hhx)",
+ mmmFromEVEX2of4(insn->vectorExtensionPrefix[1])));
return true;
case VEX_LOB_0F:
insn->opcodeType = TWOBYTE;
@@ -891,6 +893,12 @@ static bool readOpcode(struct InternalInstruction *insn) {
case VEX_LOB_0F3A:
insn->opcodeType = THREEBYTE_3A;
return consume(insn, insn->opcode);
+ case VEX_LOB_MAP5:
+ insn->opcodeType = MAP5;
+ return consume(insn, insn->opcode);
+ case VEX_LOB_MAP6:
+ insn->opcodeType = MAP6;
+ return consume(insn, insn->opcode);
}
} else if (insn->vectorExtensionType == TYPE_VEX_3B) {
switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) {
@@ -908,6 +916,12 @@ static bool readOpcode(struct InternalInstruction *insn) {
case VEX_LOB_0F3A:
insn->opcodeType = THREEBYTE_3A;
return consume(insn, insn->opcode);
+ case VEX_LOB_MAP5:
+ insn->opcodeType = MAP5;
+ return consume(insn, insn->opcode);
+ case VEX_LOB_MAP6:
+ insn->opcodeType = MAP6;
+ return consume(insn, insn->opcode);
}
} else if (insn->vectorExtensionType == TYPE_VEX_2B) {
insn->opcodeType = TWOBYTE;
@@ -1043,6 +1057,12 @@ static int getInstructionIDWithAttrMask(uint16_t *instructionID,
case THREEDNOW_MAP:
decision = &THREEDNOW_MAP_SYM;
break;
+ case MAP5:
+ decision = &MAP5_SYM;
+ break;
+ case MAP6:
+ decision = &MAP6_SYM;
+ break;
}
if (decision->opcodeDecisions[insnCtx]
diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index 4318c17f03a0..95d3c8ede366 100644
--- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -37,7 +37,7 @@ namespace X86Disassembler {
#define xFromEVEX2of4(evex) (((~(evex)) & 0x40) >> 6)
#define bFromEVEX2of4(evex) (((~(evex)) & 0x20) >> 5)
#define r2FromEVEX2of4(evex) (((~(evex)) & 0x10) >> 4)
-#define mmFromEVEX2of4(evex) ((evex) & 0x3)
+#define mmmFromEVEX2of4(evex) ((evex) & 0x7)
#define wFromEVEX3of4(evex) (((evex) & 0x80) >> 7)
#define vvvvFromEVEX3of4(evex) (((~(evex)) & 0x78) >> 3)
#define ppFromEVEX3of4(evex) ((evex) & 0x3)
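
A hedged sketch of what the widened mmm field buys the disassembler; the case values come from readOpcode() above and the VEXLeadingOpcodeByte enum below, everything else is illustrative:

// Illustrative only: decode the opcode map from the second EVEX byte using the
// 3-bit mmm field (mmmFromEVEX2of4). Values 0x5/0x6 select the new MAP5/MAP6.
static const char *evexMapName(uint8_t EvexByte2) {
  switch (EvexByte2 & 0x7) { // mmmFromEVEX2of4
  case 0x1: return "0F";     // VEX_LOB_0F
  case 0x2: return "0F38";   // VEX_LOB_0F38
  case 0x3: return "0F3A";   // VEX_LOB_0F3A
  case 0x5: return "MAP5";   // VEX_LOB_MAP5
  case 0x6: return "MAP6";   // VEX_LOB_MAP6
  default:  return "reserved";
  }
}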
@@ -374,12 +374,6 @@ namespace X86Disassembler {
ENTRY(CR14) \
ENTRY(CR15)
-#define REGS_BOUND \
- ENTRY(BND0) \
- ENTRY(BND1) \
- ENTRY(BND2) \
- ENTRY(BND3)
-
#undef REGS_TMM
#define REGS_TMM \
ENTRY(TMM0) \
@@ -414,7 +408,6 @@ namespace X86Disassembler {
REGS_SEGMENT \
REGS_DEBUG \
REGS_CONTROL \
- REGS_BOUND \
REGS_TMM \
ENTRY(RIP)
@@ -489,7 +482,9 @@ enum SegmentOverride {
enum VEXLeadingOpcodeByte {
VEX_LOB_0F = 0x1,
VEX_LOB_0F38 = 0x2,
- VEX_LOB_0F3A = 0x3
+ VEX_LOB_0F3A = 0x3,
+ VEX_LOB_MAP5 = 0x5,
+ VEX_LOB_MAP6 = 0x6
};
enum XOPMapSelect {
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
index c685d7e0db81..baacf2f46183 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
@@ -153,6 +153,20 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI,
case X86::VCMPPSZrrib: case X86::VCMPPSZrribk:
case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk:
case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk:
+ case X86::VCMPPHZ128rmi: case X86::VCMPPHZ128rri:
+ case X86::VCMPPHZ256rmi: case X86::VCMPPHZ256rri:
+ case X86::VCMPPHZrmi: case X86::VCMPPHZrri:
+ case X86::VCMPSHZrm: case X86::VCMPSHZrr:
+ case X86::VCMPSHZrm_Int: case X86::VCMPSHZrr_Int:
+ case X86::VCMPPHZ128rmik: case X86::VCMPPHZ128rrik:
+ case X86::VCMPPHZ256rmik: case X86::VCMPPHZ256rrik:
+ case X86::VCMPPHZrmik: case X86::VCMPPHZrrik:
+ case X86::VCMPSHZrm_Intk: case X86::VCMPSHZrr_Intk:
+ case X86::VCMPPHZ128rmbi: case X86::VCMPPHZ128rmbik:
+ case X86::VCMPPHZ256rmbi: case X86::VCMPPHZ256rmbik:
+ case X86::VCMPPHZrmbi: case X86::VCMPPHZrmbik:
+ case X86::VCMPPHZrrib: case X86::VCMPPHZrribk:
+ case X86::VCMPSHZrrb_Int: case X86::VCMPSHZrrb_Intk:
if (Imm >= 0 && Imm <= 31) {
OS << '\t';
printCMPMnemonic(MI, /*IsVCMP*/true, OS);
@@ -162,11 +176,15 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI,
if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
if (Desc.TSFlags & X86II::EVEX_B) {
// Broadcast form.
- // Load size is based on W-bit.
- if (Desc.TSFlags & X86II::VEX_W)
+ // Load size is word for TA map. Otherwise it is based on W-bit.
+ if ((Desc.TSFlags & X86II::OpMapMask) == X86II::TA) {
+ assert(!(Desc.TSFlags & X86II::VEX_W) && "Unknown W-bit value!");
+ printwordmem(MI, CurOp--, OS);
+ } else if (Desc.TSFlags & X86II::VEX_W) {
printqwordmem(MI, CurOp--, OS);
- else
+ } else {
printdwordmem(MI, CurOp--, OS);
+ }
// Print the number of elements broadcasted.
unsigned NumElts;
@@ -176,18 +194,28 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI,
NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8;
else
NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4;
+ if ((Desc.TSFlags & X86II::OpMapMask) == X86II::TA) {
+ assert(!(Desc.TSFlags & X86II::VEX_W) && "Unknown W-bit value!");
+ NumElts *= 2;
+ }
OS << "{1to" << NumElts << "}";
} else {
- if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS)
- printdwordmem(MI, CurOp--, OS);
- else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD)
+ if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS) {
+ if ((Desc.TSFlags & X86II::OpMapMask) == X86II::TA)
+ printwordmem(MI, CurOp--, OS);
+ else
+ printdwordmem(MI, CurOp--, OS);
+ } else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD) {
+ assert((Desc.TSFlags & X86II::OpMapMask) != X86II::TA &&
+ "Unexpected op map!");
printqwordmem(MI, CurOp--, OS);
- else if (Desc.TSFlags & X86II::EVEX_L2)
+ } else if (Desc.TSFlags & X86II::EVEX_L2) {
printzmmwordmem(MI, CurOp--, OS);
- else if (Desc.TSFlags & X86II::VEX_L)
+ } else if (Desc.TSFlags & X86II::VEX_L) {
printymmwordmem(MI, CurOp--, OS);
- else
+ } else {
printxmmwordmem(MI, CurOp--, OS);
+ }
}
} else {
if (Desc.TSFlags & X86II::EVEX_B)
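
As a worked example of the broadcast-count logic above (hedged, reconstructed from the printed flags rather than taken from the patch):

// For the 512-bit fp16 broadcast compare VCMPPHZrmbi, EVEX_L2 is set and VEX_W
// is clear, so NumElts starts at 16; the TA opcode map marks the fp16 form, so
// it doubles to 32 and the printer emits "{1to32}", matching the "1to32"
// broadcast primitive added to the assembler earlier in this commit.
bool VEX_W = false;                // fp16 forms keep the W bit clear
unsigned NumElts = VEX_W ? 8 : 16; // EVEX_L2 (512-bit) branch above
bool IsTAMap = true;               // (TSFlags & OpMapMask) == X86II::TA
if (IsTAMap)
  NumElts *= 2;                    // 32, printed as "{1to32}"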
@@ -391,7 +419,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
uint64_t Target;
if (MIA->evaluateBranch(*MI, 0, 0, Target))
return;
- if (MIA->evaluateMemoryOperandAddress(*MI, 0, 0))
+ if (MIA->evaluateMemoryOperandAddress(*MI, /*STI=*/nullptr, 0, 0))
return;
}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 83f3614ded1a..d4f39b571394 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -29,9 +29,9 @@
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -116,13 +116,6 @@ cl::opt<bool> X86PadForBranchAlign(
"x86-pad-for-branch-align", cl::init(true), cl::Hidden,
cl::desc("Pad previous instructions to implement branch alignment"));
-class X86ELFObjectWriter : public MCELFObjectTargetWriter {
-public:
- X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine,
- bool HasRelocationAddend, bool foobar)
- : MCELFObjectTargetWriter(is64Bit, OSABI, EMachine, HasRelocationAddend) {}
-};
-
class X86AsmBackend : public MCAsmBackend {
const MCSubtargetInfo &STI;
std::unique_ptr<const MCInstrInfo> MCII;
@@ -166,7 +159,8 @@ public:
bool allowAutoPadding() const override;
bool allowEnhancedRelaxation() const override;
- void emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst) override;
+ void emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst,
+ const MCSubtargetInfo &STI) override;
void emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst) override;
unsigned getNumFixupKinds() const override {
@@ -207,9 +201,10 @@ public:
void finishLayout(MCAssembler const &Asm, MCAsmLayout &Layout) const override;
- unsigned getMaximumNopSize() const override;
+ unsigned getMaximumNopSize(const MCSubtargetInfo &STI) const override;
- bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+ bool writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const override;
};
} // end anonymous namespace
@@ -598,7 +593,7 @@ bool X86AsmBackend::needAlign(const MCInst &Inst) const {
/// Insert BoundaryAlignFragment before instructions to align branches.
void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
- const MCInst &Inst) {
+ const MCInst &Inst, const MCSubtargetInfo &STI) {
CanPadInst = canPadInst(Inst, OS);
if (!canPadBranches(OS))
@@ -637,7 +632,7 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
isFirstMacroFusibleInst(Inst, *MCII))) {
// If we meet an unfused branch or the first instruction in a fusible pair,
// insert a BoundaryAlign fragment.
- OS.insert(PendingBA = new MCBoundaryAlignFragment(AlignBoundary));
+ OS.insert(PendingBA = new MCBoundaryAlignFragment(AlignBoundary, STI));
}
}
@@ -1081,16 +1076,16 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm,
}
}
-unsigned X86AsmBackend::getMaximumNopSize() const {
+unsigned X86AsmBackend::getMaximumNopSize(const MCSubtargetInfo &STI) const {
if (STI.hasFeature(X86::Mode16Bit))
return 4;
if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit))
return 1;
- if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP])
+ if (STI.getFeatureBits()[X86::TuningFast7ByteNOP])
return 7;
- if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
+ if (STI.getFeatureBits()[X86::TuningFast15ByteNOP])
return 15;
- if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
+ if (STI.getFeatureBits()[X86::TuningFast11ByteNOP])
return 11;
// FIXME: handle 32-bit mode
// 15-bytes is the longest single NOP instruction, but 10-bytes is
@@ -1101,7 +1096,8 @@ unsigned X86AsmBackend::getMaximumNopSize() const {
/// Write a sequence of optimal nops to the output, covering \p Count
/// bytes.
/// \return - true on success, false on failure
-bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
+ const MCSubtargetInfo *STI) const {
static const char Nops32Bit[10][11] = {
// nop
"\x90",
@@ -1138,9 +1134,9 @@ bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
};
const char(*Nops)[11] =
- STI.getFeatureBits()[X86::Mode16Bit] ? Nops16Bit : Nops32Bit;
+ STI->getFeatureBits()[X86::Mode16Bit] ? Nops16Bit : Nops32Bit;
- uint64_t MaxNopLength = (uint64_t)getMaximumNopSize();
+ uint64_t MaxNopLength = (uint64_t)getMaximumNopSize(*STI);
// Emit as many MaxNopLength NOPs as needed, then emit a NOP of the remaining
// length.
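
A minimal sketch of the chunking strategy the comment above describes, assuming only the names that appear in this hunk; the real pass additionally prepends 0x66 prefixes to form NOPs longer than the ten-byte table entries:

// Illustrative only: cover Count bytes with NOPs drawn from the 10-entry table,
// emitting the longest usable entry each time and a shorter tail at the end.
uint64_t Remaining = Count;          // bytes still to cover
const uint64_t LongestTableNop = 10; // Nops32Bit/Nops16Bit hold 1..10-byte NOPs
uint64_t MaxLen = std::min<uint64_t>(getMaximumNopSize(*STI), LongestTableNop);
while (Remaining != 0) {
  uint64_t ThisNop = std::min(Remaining, MaxLen);
  OS.write(Nops[ThisNop - 1], ThisNop); // table entry of exactly ThisNop bytes
  Remaining -= ThisNop;
}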
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 58e233d86da1..4161765fc1ae 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -441,6 +441,11 @@ namespace X86II {
/// SYMBOL_LABEL @GOTPCREL
MO_GOTPCREL,
+ /// MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL
+ /// relocations are guaranteed to be emitted by the integrated assembler
+ /// instead of the relaxable R_X86_64[_REX]_GOTPCRELX relocations.
+ MO_GOTPCREL_NORELAX,
+
/// MO_PLT - On a symbol operand this indicates that the immediate is
/// offset to the PLT entry of symbol name from the current code location.
///
@@ -790,7 +795,7 @@ namespace X86II {
// belongs to. i.e. one-byte, two-byte, 0x0f 0x38, 0x0f 0x3a, etc.
//
OpMapShift = OpPrefixShift + 2,
- OpMapMask = 0x7 << OpMapShift,
+ OpMapMask = 0xF << OpMapShift,
// OB - OneByte - Set if this instruction has a one byte opcode.
OB = 0 << OpMapShift,
@@ -819,13 +824,17 @@ namespace X86II {
/// this flag to indicate that the encoder should do the wacky 3DNow! thing.
ThreeDNow = 7 << OpMapShift,
+ // MAP5, MAP6 - Prefix after the 0x0F prefix.
+ T_MAP5 = 8 << OpMapShift,
+ T_MAP6 = 9 << OpMapShift,
+
//===------------------------------------------------------------------===//
// REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
// They are used to specify GPRs and SSE registers, 64-bit operand size,
// etc. We only care about the REX.W and REX.R bits, and only the former is
// statically determined.
//
- REXShift = OpMapShift + 3,
+ REXShift = OpMapShift + 4,
REX_W = 1 << REXShift,
//===------------------------------------------------------------------===//
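
A hedged illustration of what the widened field means for TSFlags packing; the enum values are the ones declared above, the surrounding code is not from the patch:

// Illustrative only: with OpMapMask widened to four bits, the new map values
// 8 (T_MAP5) and 9 (T_MAP6) survive a mask-and-compare; under the old 3-bit
// mask they would alias lower maps. REXShift moves to OpMapShift + 4 so the
// REX_W bit no longer overlaps the widened map field.
uint64_t TSFlags = X86II::T_MAP5;            // 8 << OpMapShift
uint64_t OpMap = TSFlags & X86II::OpMapMask; // == X86II::T_MAP5
bool IsMap5or6 = OpMap == X86II::T_MAP5 || OpMap == X86II::T_MAP6;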
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index fa937d381613..8ab86f46ffe6 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -218,6 +218,9 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
return ELF::R_X86_64_REX_GOTPCRELX;
}
llvm_unreachable("unexpected relocation type!");
+ case MCSymbolRefExpr::VK_GOTPCREL_NORELAX:
+ checkIs32(Ctx, Loc, Type);
+ return ELF::R_X86_64_GOTPCREL;
case MCSymbolRefExpr::VK_X86_PLTOFF:
checkIs64(Ctx, Loc, Type);
return ELF::R_X86_64_PLTOFF64;
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index d8dbbbbf2779..167580ec1ed0 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -264,6 +264,24 @@ void X86InstPrinterCommon::printCMPMnemonic(const MCInst *MI, bool IsVCmp,
case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk:
OS << "ss\t";
break;
+ case X86::VCMPPHZ128rmi: case X86::VCMPPHZ128rri:
+ case X86::VCMPPHZ256rmi: case X86::VCMPPHZ256rri:
+ case X86::VCMPPHZrmi: case X86::VCMPPHZrri:
+ case X86::VCMPPHZ128rmik: case X86::VCMPPHZ128rrik:
+ case X86::VCMPPHZ256rmik: case X86::VCMPPHZ256rrik:
+ case X86::VCMPPHZrmik: case X86::VCMPPHZrrik:
+ case X86::VCMPPHZ128rmbi: case X86::VCMPPHZ128rmbik:
+ case X86::VCMPPHZ256rmbi: case X86::VCMPPHZ256rmbik:
+ case X86::VCMPPHZrmbi: case X86::VCMPPHZrmbik:
+ case X86::VCMPPHZrrib: case X86::VCMPPHZrribk:
+ OS << "ph\t";
+ break;
+ case X86::VCMPSHZrm: case X86::VCMPSHZrr:
+ case X86::VCMPSHZrm_Int: case X86::VCMPSHZrr_Int:
+ case X86::VCMPSHZrrb_Int: case X86::VCMPSHZrrb_Intk:
+ case X86::VCMPSHZrm_Intk: case X86::VCMPSHZrr_Intk:
+ OS << "sh\t";
+ break;
}
}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
index d5b205ad9a63..48c335f9a777 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
@@ -132,6 +132,20 @@ bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS
case X86::VCMPPSZrrib: case X86::VCMPPSZrribk:
case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk:
case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk:
+ case X86::VCMPPHZ128rmi: case X86::VCMPPHZ128rri:
+ case X86::VCMPPHZ256rmi: case X86::VCMPPHZ256rri:
+ case X86::VCMPPHZrmi: case X86::VCMPPHZrri:
+ case X86::VCMPSHZrm: case X86::VCMPSHZrr:
+ case X86::VCMPSHZrm_Int: case X86::VCMPSHZrr_Int:
+ case X86::VCMPPHZ128rmik: case X86::VCMPPHZ128rrik:
+ case X86::VCMPPHZ256rmik: case X86::VCMPPHZ256rrik:
+ case X86::VCMPPHZrmik: case X86::VCMPPHZrrik:
+ case X86::VCMPSHZrm_Intk: case X86::VCMPSHZrr_Intk:
+ case X86::VCMPPHZ128rmbi: case X86::VCMPPHZ128rmbik:
+ case X86::VCMPPHZ256rmbi: case X86::VCMPPHZ256rmbik:
+ case X86::VCMPPHZrmbi: case X86::VCMPPHZrmbik:
+ case X86::VCMPPHZrrib: case X86::VCMPPHZrribk:
+ case X86::VCMPSHZrrb_Int: case X86::VCMPSHZrrb_Intk:
if (Imm >= 0 && Imm <= 31) {
OS << '\t';
printCMPMnemonic(MI, /*IsVCMP*/true, OS);
@@ -152,11 +166,15 @@ bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS
if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) {
if (Desc.TSFlags & X86II::EVEX_B) {
// Broadcast form.
- // Load size is based on W-bit.
- if (Desc.TSFlags & X86II::VEX_W)
+ // Load size is word for TA map. Otherwise it is based on W-bit.
+ if ((Desc.TSFlags & X86II::OpMapMask) == X86II::TA) {
+ assert(!(Desc.TSFlags & X86II::VEX_W) && "Unknown W-bit value!");
+ printwordmem(MI, CurOp++, OS);
+ } else if (Desc.TSFlags & X86II::VEX_W) {
printqwordmem(MI, CurOp++, OS);
- else
+ } else {
printdwordmem(MI, CurOp++, OS);
+ }
// Print the number of elements broadcasted.
unsigned NumElts;
@@ -166,18 +184,28 @@ bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS
NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8;
else
NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4;
+ if ((Desc.TSFlags & X86II::OpMapMask) == X86II::TA) {
+ assert(!(Desc.TSFlags & X86II::VEX_W) && "Unknown W-bit value!");
+ NumElts *= 2;
+ }
OS << "{1to" << NumElts << "}";
} else {
- if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS)
- printdwordmem(MI, CurOp++, OS);
- else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD)
+ if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS) {
+ if ((Desc.TSFlags & X86II::OpMapMask) == X86II::TA)
+ printwordmem(MI, CurOp++, OS);
+ else
+ printdwordmem(MI, CurOp++, OS);
+ } else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD) {
+ assert((Desc.TSFlags & X86II::OpMapMask) != X86II::TA &&
+ "Unexpected op map!");
printqwordmem(MI, CurOp++, OS);
- else if (Desc.TSFlags & X86II::EVEX_L2)
+ } else if (Desc.TSFlags & X86II::EVEX_L2) {
printzmmwordmem(MI, CurOp++, OS);
- else if (Desc.TSFlags & X86II::VEX_L)
+ } else if (Desc.TSFlags & X86II::VEX_L) {
printymmwordmem(MI, CurOp++, OS);
- else
+ } else {
printxmmwordmem(MI, CurOp++, OS);
+ }
}
} else {
printOperand(MI, CurOp++, OS);
@@ -349,7 +377,7 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
uint64_t Target;
if (MIA->evaluateBranch(*MI, 0, 0, Target))
return;
- if (MIA->evaluateMemoryOperandAddress(*MI, 0, 0))
+ if (MIA->evaluateMemoryOperandAddress(*MI, /*STI=*/nullptr, 0, 0))
return;
}
const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 714d2d839054..4fa8bc64b245 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -799,7 +799,10 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
// 0b00001: implied 0F leading opcode
// 0b00010: implied 0F 38 leading opcode bytes
// 0b00011: implied 0F 3A leading opcode bytes
- // 0b00100-0b11111: Reserved for future use
+ // 0b00100: Reserved for future use
+ // 0b00101: VEX MAP5
+ // 0b00110: VEX MAP6
+ // 0b00111-0b11111: Reserved for future use
// 0b01000: XOP map select - 08h instructions with imm byte
// 0b01001: XOP map select - 09h instructions with no imm byte
// 0b01010: XOP map select - 0Ah instructions with imm dword
@@ -825,6 +828,12 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
case X86II::XOPA:
VEX_5M = 0xA;
break;
+ case X86II::T_MAP5:
+ VEX_5M = 0x5;
+ break;
+ case X86II::T_MAP6:
+ VEX_5M = 0x6;
+ break;
}
// VEX_4V (VEX vvvv field): a register specifier
@@ -1173,10 +1182,10 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
// EVEX opcode prefix can have 4 bytes
//
// +-----+ +--------------+ +-------------------+ +------------------------+
- // | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa |
+ // | 62h | | RXBR' | 0mmm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa |
// +-----+ +--------------+ +-------------------+ +------------------------+
- assert((VEX_5M & 0x3) == VEX_5M &&
- "More than 2 significant bits in VEX.m-mmmm fields for EVEX!");
+ assert((VEX_5M & 0x7) == VEX_5M &&
+ "More than 3 significant bits in VEX.m-mmmm fields for EVEX!");
emitByte(0x62, OS);
emitByte((VEX_R << 7) | (VEX_X << 6) | (VEX_B << 5) | (EVEX_R2 << 4) |
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 12dc053cd970..9da0a8129f23 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -26,9 +26,9 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MachineLocation.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Host.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -405,8 +405,12 @@ public:
bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
uint64_t &Target) const override;
Optional<uint64_t> evaluateMemoryOperandAddress(const MCInst &Inst,
+ const MCSubtargetInfo *STI,
uint64_t Addr,
uint64_t Size) const override;
+ Optional<uint64_t>
+ getMemoryOperandRelocationOffset(const MCInst &Inst,
+ uint64_t Size) const override;
};
#define GET_STIPREDICATE_DEFS_FOR_MC_ANALYSIS
@@ -532,7 +536,8 @@ bool X86MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
}
Optional<uint64_t> X86MCInstrAnalysis::evaluateMemoryOperandAddress(
- const MCInst &Inst, uint64_t Addr, uint64_t Size) const {
+ const MCInst &Inst, const MCSubtargetInfo *STI, uint64_t Addr,
+ uint64_t Size) const {
const MCInstrDesc &MCID = Info->get(Inst.getOpcode());
int MemOpStart = X86II::getMemoryOperandNo(MCID.TSFlags);
if (MemOpStart == -1)
@@ -555,6 +560,30 @@ Optional<uint64_t> X86MCInstrAnalysis::evaluateMemoryOperandAddress(
return None;
}
+Optional<uint64_t>
+X86MCInstrAnalysis::getMemoryOperandRelocationOffset(const MCInst &Inst,
+ uint64_t Size) const {
+ if (Inst.getOpcode() != X86::LEA64r)
+ return None;
+ const MCInstrDesc &MCID = Info->get(Inst.getOpcode());
+ int MemOpStart = X86II::getMemoryOperandNo(MCID.TSFlags);
+ if (MemOpStart == -1)
+ return None;
+ MemOpStart += X86II::getOperandBias(MCID);
+ const MCOperand &SegReg = Inst.getOperand(MemOpStart + X86::AddrSegmentReg);
+ const MCOperand &BaseReg = Inst.getOperand(MemOpStart + X86::AddrBaseReg);
+ const MCOperand &IndexReg = Inst.getOperand(MemOpStart + X86::AddrIndexReg);
+ const MCOperand &ScaleAmt = Inst.getOperand(MemOpStart + X86::AddrScaleAmt);
+ const MCOperand &Disp = Inst.getOperand(MemOpStart + X86::AddrDisp);
+ // Must be a simple rip-relative address.
+ if (BaseReg.getReg() != X86::RIP || SegReg.getReg() != 0 ||
+ IndexReg.getReg() != 0 || ScaleAmt.getImm() != 1 || !Disp.isImm())
+ return None;
+ // rip-relative ModR/M immediate is 32 bits.
+ assert(Size > 4 && "invalid instruction size for rip-relative lea");
+ return Size - 4;
+}
+
} // end of namespace X86_MC
} // end of namespace llvm
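
A hedged usage sketch for the new hook; the MIA analysis pointer, the 7-byte instruction length, and the variable names are assumptions for illustration, only the hook itself comes from the patch:

// For a rip-relative "lea 0x0(%rip), %rax" of 7 bytes, the 32-bit displacement
// occupies the last 4 bytes, so the hook returns 7 - 4 = 3: the offset within
// the instruction where a relocation against the displacement would apply.
if (Optional<uint64_t> Off =
        MIA->getMemoryOperandRelocationOffset(Inst, /*Size=*/7)) {
  uint64_t RelocOffsetInInst = *Off; // == 3 for the example above
}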
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp
index 201b22d6232d..82f4460a42e7 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/MathExtras.h"
//===----------------------------------------------------------------------===//
// Vector Mask Decoding
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index 075e85f4e243..10fc176b59d8 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -100,7 +100,7 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx,
if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
return COFF::IMAGE_REL_I386_DIR32NB;
if (Modifier == MCSymbolRefExpr::VK_SECREL)
- return COFF::IMAGE_REL_AMD64_SECREL;
+ return COFF::IMAGE_REL_I386_SECREL;
return COFF::IMAGE_REL_I386_DIR32;
case FK_SecRel_2:
return COFF::IMAGE_REL_I386_SECTION;
diff --git a/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
index 18cda8f591c3..7490703251e9 100644
--- a/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
+++ b/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/X86TargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
Target &llvm::getTheX86_32Target() {
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index eba5b6ce7836..10e1c5d6ed38 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -73,8 +73,8 @@ FunctionPass *createX86AvoidStoreForwardingBlocks();
/// Return a pass that lowers EFLAGS copy pseudo instructions.
FunctionPass *createX86FlagsCopyLoweringPass();
-/// Return a pass that expands WinAlloca pseudo-instructions.
-FunctionPass *createX86WinAllocaExpander();
+/// Return a pass that expands DynAlloca pseudo-instructions.
+FunctionPass *createX86DynAllocaExpander();
/// Return a pass that config the tile registers.
FunctionPass *createX86TileConfigPass();
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 53bbd93798ac..380507308c3d 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -27,7 +27,7 @@ def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true",
"16-bit mode (i8086)">;
//===----------------------------------------------------------------------===//
-// X86 Subtarget features
+// X86 Subtarget ISA features
//===----------------------------------------------------------------------===//
def FeatureX87 : SubtargetFeature<"x87","HasX87", "true",
@@ -42,6 +42,9 @@ def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true",
def FeatureCMPXCHG8B : SubtargetFeature<"cx8", "HasCmpxchg8b", "true",
"Support CMPXCHG8B instructions">;
+def FeatureCRC32 : SubtargetFeature<"crc32", "HasCRC32", "true",
+ "Enable SSE 4.2 CRC32 instruction">;
+
def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true",
"Support POPCNT instruction">;
@@ -100,20 +103,6 @@ def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
"64-bit with cmpxchg16b",
[FeatureCMPXCHG8B]>;
-def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
- "SHLD instruction is slow">;
-def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
- "PMULLD instruction is slow">;
-def FeatureSlowPMADDWD : SubtargetFeature<"slow-pmaddwd", "IsPMADDWDSlow",
- "true",
- "PMADDWD is slower than PMULLD">;
-// FIXME: This should not apply to CPUs that do not have SSE.
-def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
- "IsUAMem16Slow", "true",
- "Slow unaligned 16-byte memory access">;
-def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
- "IsUAMem32Slow", "true",
- "Slow unaligned 32-byte memory access">;
def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
"Support SSE 4a instructions",
[FeatureSSE3]>;
@@ -184,6 +173,14 @@ def FeatureVP2INTERSECT : SubtargetFeature<"avx512vp2intersect",
"HasVP2INTERSECT", "true",
"Enable AVX-512 vp2intersect",
[FeatureAVX512]>;
+// FIXME: FP16 scalar intrinsics use the type v8f16, which is supposed to be
+// guarded under condition hasVLX. So we imply it in FeatureFP16 currently.
+// FIXME: FP16 conversion between f16 and i64 customize type v8i64, which is
+// supposed to be guarded under condition hasDQI. So we imply it in FeatureFP16
+// currently.
+def FeatureFP16 : SubtargetFeature<"avx512fp16", "HasFP16", "true",
+ "Support 16-bit floating point",
+ [FeatureBWI, FeatureVLX, FeatureDQI]>;
def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
"Enable packed carry-less multiplication instructions",
[FeatureSSE2]>;
@@ -255,17 +252,6 @@ def FeatureAMXINT8 : SubtargetFeature<"amx-int8", "HasAMXINT8", "true",
def FeatureAMXBF16 : SubtargetFeature<"amx-bf16", "HasAMXBF16", "true",
"Support AMX-BF16 instructions",
[FeatureAMXTILE]>;
-def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
- "Use LEA for adjusting the stack pointer">;
-def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
- "HasSlowDivide32", "true",
- "Use 8-bit divide for positive values less than 256">;
-def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl",
- "HasSlowDivide64", "true",
- "Use 32-bit divide for positive values less than 2^32">;
-def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
- "PadShortFunctions", "true",
- "Pad short functions">;
def FeatureINVPCID : SubtargetFeature<"invpcid", "HasINVPCID", "true",
"Invalidate Process-Context Identifier">;
def FeatureSGX : SubtargetFeature<"sgx", "HasSGX", "true",
@@ -296,116 +282,244 @@ def FeatureTSXLDTRK : SubtargetFeature<"tsxldtrk", "HasTSXLDTRK", "true",
"Support TSXLDTRK instructions">;
def FeatureUINTR : SubtargetFeature<"uintr", "HasUINTR", "true",
"Has UINTR Instructions">;
+def FeaturePCONFIG : SubtargetFeature<"pconfig", "HasPCONFIG", "true",
+ "platform configuration instruction">;
+def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",
+ "Support movdiri instruction">;
+def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true",
+ "Support movdir64b instruction">;
+
+// Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka
+// "string operations"). See "REP String Enhancement" in the Intel Software
+// Development Manual. This feature essentially means that REP MOVSB will copy
+// using the largest available size instead of copying bytes one by one, making
+// it at least as fast as REP MOVS{W,D,Q}.
+def FeatureERMSB
+ : SubtargetFeature<
+ "ermsb", "HasERMSB", "true",
+ "REP MOVS/STOS are fast">;
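+// Illustrative sketch (an assumption, not part of the original change): with
+// ERMSB available the backend may lower a memcpy intrinsic such as
+//   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %n, i1 false)
+// to a "rep movsb" sequence instead of an unrolled or SSE copy loop, since the
+// enhanced string operation copies in large chunks internally.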
+
+// Icelake and newer processors have Fast Short REP MOV.
+def FeatureFSRM
+ : SubtargetFeature<
+ "fsrm", "HasFSRM", "true",
+ "REP MOVSB of short lengths is faster">;
+
+def FeatureSoftFloat
+ : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
+ "Use software floating point features">;
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget Security Mitigation features
+//===----------------------------------------------------------------------===//
+
+// Lower indirect calls using a special construct called a `retpoline` to
+// mitigate potential Spectre v2 attacks against them.
+def FeatureRetpolineIndirectCalls
+ : SubtargetFeature<
+ "retpoline-indirect-calls", "UseRetpolineIndirectCalls", "true",
+ "Remove speculation of indirect calls from the generated code">;
+
+// Lower indirect branches and switches either using conditional branch trees
+// or using a special construct called a `retpoline` to mitigate potential
+// Spectre v2 attacks against them.
+def FeatureRetpolineIndirectBranches
+ : SubtargetFeature<
+ "retpoline-indirect-branches", "UseRetpolineIndirectBranches", "true",
+ "Remove speculation of indirect branches from the generated code">;
+
+// Deprecated umbrella feature for enabling both `retpoline-indirect-calls` and
+// `retpoline-indirect-branches` above.
+def FeatureRetpoline
+ : SubtargetFeature<"retpoline", "DeprecatedUseRetpoline", "true",
+ "Remove speculation of indirect branches from the "
+ "generated code, either by avoiding them entirely or "
+ "lowering them with a speculation blocking construct",
+ [FeatureRetpolineIndirectCalls,
+ FeatureRetpolineIndirectBranches]>;
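+// Illustrative usage (an assumption, not part of the original change): these
+// mitigations are ordinary subtarget features, so they can be enabled per
+// module or per function via the usual feature strings, e.g.
+//   llc -mattr=+retpoline-indirect-calls,+retpoline-indirect-branches foo.ll
+// or an IR attribute
+//   "target-features"="+retpoline-indirect-calls,+retpoline-indirect-branches"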
+
+// Rely on external thunks for the emitted retpoline calls. This allows users
+// to provide their own custom thunk definitions in highly specialized
+// environments such as a kernel that does boot-time hot patching.
+def FeatureRetpolineExternalThunk
+ : SubtargetFeature<
+ "retpoline-external-thunk", "UseRetpolineExternalThunk", "true",
+ "When lowering an indirect call or branch using a `retpoline`, rely "
+ "on the specified user provided thunk rather than emitting one "
+ "ourselves. Only has effect when combined with some other retpoline "
+ "feature", [FeatureRetpolineIndirectCalls]>;
+
+// Mitigate LVI attacks against indirect calls/branches and call returns
+def FeatureLVIControlFlowIntegrity
+ : SubtargetFeature<
+ "lvi-cfi", "UseLVIControlFlowIntegrity", "true",
+ "Prevent indirect calls/branches from using a memory operand, and "
+ "precede all indirect calls/branches from a register with an "
+ "LFENCE instruction to serialize control flow. Also decompose RET "
+ "instructions into a POP+LFENCE+JMP sequence.">;
+
+// Enable SESES to mitigate speculative execution attacks
+def FeatureSpeculativeExecutionSideEffectSuppression
+ : SubtargetFeature<
+ "seses", "UseSpeculativeExecutionSideEffectSuppression", "true",
+ "Prevent speculative execution side channel timing attacks by "
+ "inserting a speculation barrier before memory reads, memory writes, "
+ "and conditional branches. Implies LVI Control Flow integrity.",
+ [FeatureLVIControlFlowIntegrity]>;
+
+// Mitigate LVI attacks against data loads
+def FeatureLVILoadHardening
+ : SubtargetFeature<
+ "lvi-load-hardening", "UseLVILoadHardening", "true",
+ "Insert LFENCE instructions to prevent data speculatively injected "
+ "into loads from being used maliciously.">;
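+// Illustrative usage (an assumption, not part of the original change): as with
+// the retpoline features, a single function can opt into these mitigations via
+// its attributes, e.g.
+//   define void @f() "target-features"="+lvi-cfi,+lvi-load-hardening" {
+//     ret void
+//   }
+// and "+seses" pulls in "+lvi-cfi" through the implication list above.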
+
+def FeatureTaggedGlobals
+ : SubtargetFeature<
+ "tagged-globals", "AllowTaggedGlobals", "true",
+ "Use an instruction sequence for taking the address of a global "
+ "that allows a memory tag in the upper address bits.">;
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget Tuning features
+//===----------------------------------------------------------------------===//
+
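+// Illustrative note (not part of the original change): tuning features carry
+// no ISA semantics. They go in the trailing tuning list of Proc/ProcModel,
+// separate from the ISA feature list, e.g. a hypothetical CPU would look like
+//   def : ProcModel<"mycpu", SandyBridgeModel,
+//                   [FeatureX87, FeatureCMPXCHG8B, Feature64Bit],
+//                   [TuningMacroFusion, TuningInsertVZEROUPPER]>;
+// so tuning choices never change which instructions may be emitted.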
+def TuningSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
+ "SHLD instruction is slow">;
+
+def TuningSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
+ "PMULLD instruction is slow">;
+
+def TuningSlowPMADDWD : SubtargetFeature<"slow-pmaddwd", "IsPMADDWDSlow",
+ "true",
+ "PMADDWD is slower than PMULLD">;
+
+// FIXME: This should not apply to CPUs that do not have SSE.
+def TuningSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
+ "IsUAMem16Slow", "true",
+ "Slow unaligned 16-byte memory access">;
+
+def TuningSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
+ "IsUAMem32Slow", "true",
+ "Slow unaligned 32-byte memory access">;
+
+def TuningLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
+ "Use LEA for adjusting the stack pointer">;
+
+def TuningSlowDivide32 : SubtargetFeature<"idivl-to-divb",
+ "HasSlowDivide32", "true",
+ "Use 8-bit divide for positive values less than 256">;
+
+def TuningSlowDivide64 : SubtargetFeature<"idivq-to-divl",
+ "HasSlowDivide64", "true",
+ "Use 32-bit divide for positive values less than 2^32">;
+
+def TuningPadShortFunctions : SubtargetFeature<"pad-short-functions",
+ "PadShortFunctions", "true",
+ "Pad short functions">;
+
// On some processors, instructions that implicitly take two memory operands are
// slow. In practice, this means that CALL, PUSH, and POP with memory operands
// should be avoided in favor of a MOV + register CALL/PUSH/POP.
-def FeatureSlowTwoMemOps : SubtargetFeature<"slow-two-mem-ops",
+def TuningSlowTwoMemOps : SubtargetFeature<"slow-two-mem-ops",
"SlowTwoMemOps", "true",
"Two memory operand instructions are slow">;
-def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
+
+def TuningLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
"LEA instruction needs inputs at AG stage">;
-def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
+
+def TuningSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
"LEA instruction with certain arguments is slow">;
-def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
+
+def TuningSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
"LEA instruction with 3 ops or certain registers is slow">;
-def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
+
+def TuningSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
"INC and DEC instructions are slower than ADD and SUB">;
-def FeatureSoftFloat
- : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
- "Use software floating point features">;
-def FeaturePOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt",
+
+def TuningPOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt",
"HasPOPCNTFalseDeps", "true",
"POPCNT has a false dependency on dest register">;
-def FeatureLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
+
+def TuningLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
"HasLZCNTFalseDeps", "true",
"LZCNT/TZCNT have a false dependency on dest register">;
-def FeaturePCONFIG : SubtargetFeature<"pconfig", "HasPCONFIG", "true",
- "platform configuration instruction">;
+
// On recent (port-bound) X86 processors, it's preferable to combine into a
// single shuffle using a variable mask rather than multiple fixed shuffles.
-def FeatureFastVariableCrossLaneShuffle
+def TuningFastVariableCrossLaneShuffle
: SubtargetFeature<"fast-variable-crosslane-shuffle",
"HasFastVariableCrossLaneShuffle",
"true", "Cross-lane shuffles with variable masks are fast">;
-def FeatureFastVariablePerLaneShuffle
+def TuningFastVariablePerLaneShuffle
: SubtargetFeature<"fast-variable-perlane-shuffle",
"HasFastVariablePerLaneShuffle",
"true", "Per-lane shuffles with variable masks are fast">;
// On some X86 processors, a vzeroupper instruction should be inserted after
// using ymm/zmm registers before executing code that may use SSE instructions.
-def FeatureInsertVZEROUPPER
+def TuningInsertVZEROUPPER
: SubtargetFeature<"vzeroupper",
"InsertVZEROUPPER",
"true", "Should insert vzeroupper instructions">;
-// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
-// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
+
+// TuningFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
+// than the corresponding NR code. TuningFastVectorFSQRT should be enabled if
// vector FSQRT has higher throughput than the corresponding NR code.
// The idea is that throughput bound code is likely to be vectorized, so for
// vectorized code we should care about the throughput of SQRT operations.
// But if the code is scalar that probably means that the code has some kind of
// dependency and we should care more about reducing the latency.
-def FeatureFastScalarFSQRT
+def TuningFastScalarFSQRT
: SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT",
"true", "Scalar SQRT is fast (disable Newton-Raphson)">;
-def FeatureFastVectorFSQRT
+def TuningFastVectorFSQRT
: SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
"true", "Vector SQRT is fast (disable Newton-Raphson)">;
+
// If lzcnt has equivalent latency/throughput to most simple integer ops, it can
// be used to replace test/set sequences.
-def FeatureFastLZCNT
+def TuningFastLZCNT
: SubtargetFeature<
"fast-lzcnt", "HasFastLZCNT", "true",
"LZCNT instructions are as fast as most simple integer ops">;
+
// Set if the target can efficiently decode NOPs up to 7 bytes in length.
-def FeatureFast7ByteNOP
+def TuningFast7ByteNOP
: SubtargetFeature<
"fast-7bytenop", "HasFast7ByteNOP", "true",
"Target can quickly decode up to 7 byte NOPs">;
+
// Set if the target can efficiently decode NOPs up to 11 bytes in length.
-def FeatureFast11ByteNOP
+def TuningFast11ByteNOP
: SubtargetFeature<
"fast-11bytenop", "HasFast11ByteNOP", "true",
"Target can quickly decode up to 11 byte NOPs">;
+
// Set if the target can efficiently decode NOPs up to 15 bytes in length.
-def FeatureFast15ByteNOP
+def TuningFast15ByteNOP
: SubtargetFeature<
"fast-15bytenop", "HasFast15ByteNOP", "true",
"Target can quickly decode up to 15 byte NOPs">;
+
// Sandy Bridge and newer processors can use SHLD with the same source on both
// inputs to implement rotate to avoid the partial flag update of the normal
// rotate instructions.
-def FeatureFastSHLDRotate
+def TuningFastSHLDRotate
: SubtargetFeature<
"fast-shld-rotate", "HasFastSHLDRotate", "true",
"SHLD can be used as a faster rotate">;
-// Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka
-// "string operations"). See "REP String Enhancement" in the Intel Software
-// Development Manual. This feature essentially means that REP MOVSB will copy
-// using the largest available size instead of copying bytes one by one, making
-// it at least as fast as REPMOVS{W,D,Q}.
-def FeatureERMSB
- : SubtargetFeature<
- "ermsb", "HasERMSB", "true",
- "REP MOVS/STOS are fast">;
-
-// Icelake and newer processors have Fast Short REP MOV.
-def FeatureFSRM
- : SubtargetFeature<
- "fsrm", "HasFSRM", "true",
- "REP MOVSB of short lengths is faster">;
-
// Bulldozer and newer processors can merge CMP/TEST (but not other
// instructions) with conditional branches.
-def FeatureBranchFusion
+def TuningBranchFusion
: SubtargetFeature<"branchfusion", "HasBranchFusion", "true",
"CMP/TEST can be fused with conditional branches">;
// Sandy Bridge and newer processors have many instructions that can be
// fused with conditional branches and pass through the CPU as a single
// operation.
-def FeatureMacroFusion
+def TuningMacroFusion
: SubtargetFeature<"macrofusion", "HasMacroFusion", "true",
"Various instructions can be fused with conditional branches">;
@@ -413,117 +527,54 @@ def FeatureMacroFusion
// generate Gathers on all AVX2 processors. But the overhead on HSW is high.
// Skylake Client processor has faster Gathers than HSW and performance is
// similar to Skylake Server (AVX-512).
-def FeatureHasFastGather
+def TuningFastGather
: SubtargetFeature<"fast-gather", "HasFastGather", "true",
"Indicates if gather is reasonably fast">;
-def FeaturePrefer128Bit
+def TuningPrefer128Bit
: SubtargetFeature<"prefer-128-bit", "Prefer128Bit", "true",
"Prefer 128-bit AVX instructions">;
-def FeaturePrefer256Bit
+def TuningPrefer256Bit
: SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true",
"Prefer 256-bit AVX instructions">;
-def FeaturePreferMaskRegisters
+def TuningPreferMaskRegisters
: SubtargetFeature<"prefer-mask-registers", "PreferMaskRegisters", "true",
"Prefer AVX512 mask registers over PTEST/MOVMSK">;
-// Lower indirect calls using a special construct called a `retpoline` to
-// mitigate potential Spectre v2 attacks against them.
-def FeatureRetpolineIndirectCalls
- : SubtargetFeature<
- "retpoline-indirect-calls", "UseRetpolineIndirectCalls", "true",
- "Remove speculation of indirect calls from the generated code">;
-
-// Lower indirect branches and switches either using conditional branch trees
-// or using a special construct called a `retpoline` to mitigate potential
-// Spectre v2 attacks against them.
-def FeatureRetpolineIndirectBranches
- : SubtargetFeature<
- "retpoline-indirect-branches", "UseRetpolineIndirectBranches", "true",
- "Remove speculation of indirect branches from the generated code">;
-
-// Deprecated umbrella feature for enabling both `retpoline-indirect-calls` and
-// `retpoline-indirect-branches` above.
-def FeatureRetpoline
- : SubtargetFeature<"retpoline", "DeprecatedUseRetpoline", "true",
- "Remove speculation of indirect branches from the "
- "generated code, either by avoiding them entirely or "
- "lowering them with a speculation blocking construct",
- [FeatureRetpolineIndirectCalls,
- FeatureRetpolineIndirectBranches]>;
-
-// Rely on external thunks for the emitted retpoline calls. This allows users
-// to provide their own custom thunk definitions in highly specialized
-// environments such as a kernel that does boot-time hot patching.
-def FeatureRetpolineExternalThunk
- : SubtargetFeature<
- "retpoline-external-thunk", "UseRetpolineExternalThunk", "true",
- "When lowering an indirect call or branch using a `retpoline`, rely "
- "on the specified user provided thunk rather than emitting one "
- "ourselves. Only has effect when combined with some other retpoline "
- "feature", [FeatureRetpolineIndirectCalls]>;
-
-// Mitigate LVI attacks against indirect calls/branches and call returns
-def FeatureLVIControlFlowIntegrity
- : SubtargetFeature<
- "lvi-cfi", "UseLVIControlFlowIntegrity", "true",
- "Prevent indirect calls/branches from using a memory operand, and "
- "precede all indirect calls/branches from a register with an "
- "LFENCE instruction to serialize control flow. Also decompose RET "
- "instructions into a POP+LFENCE+JMP sequence.">;
-
-// Enable SESES to mitigate speculative execution attacks
-def FeatureSpeculativeExecutionSideEffectSuppression
- : SubtargetFeature<
- "seses", "UseSpeculativeExecutionSideEffectSuppression", "true",
- "Prevent speculative execution side channel timing attacks by "
- "inserting a speculation barrier before memory reads, memory writes, "
- "and conditional branches. Implies LVI Control Flow integrity.",
- [FeatureLVIControlFlowIntegrity]>;
-
-// Mitigate LVI attacks against data loads
-def FeatureLVILoadHardening
- : SubtargetFeature<
- "lvi-load-hardening", "UseLVILoadHardening", "true",
- "Insert LFENCE instructions to prevent data speculatively injected "
- "into loads from being used maliciously.">;
-
-// Direct Move instructions.
-def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",
- "Support movdiri instruction">;
-def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true",
- "Support movdir64b instruction">;
-
-def FeatureFastBEXTR : SubtargetFeature<"fast-bextr", "HasFastBEXTR", "true",
+def TuningFastBEXTR : SubtargetFeature<"fast-bextr", "HasFastBEXTR", "true",
"Indicates that the BEXTR instruction is implemented as a single uop "
"with good throughput">;
// Combine vector math operations with shuffles into horizontal math
// instructions if a CPU implements horizontal operations (introduced with
// SSE3) with better latency/throughput than the alternative sequence.
-def FeatureFastHorizontalOps
+def TuningFastHorizontalOps
: SubtargetFeature<
"fast-hops", "HasFastHorizontalOps", "true",
"Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
"normal vector instructions with shuffles">;
-def FeatureFastScalarShiftMasks
+def TuningFastScalarShiftMasks
: SubtargetFeature<
"fast-scalar-shift-masks", "HasFastScalarShiftMasks", "true",
"Prefer a left/right scalar logical shift pair over a shift+and pair">;
-def FeatureFastVectorShiftMasks
+def TuningFastVectorShiftMasks
: SubtargetFeature<
"fast-vector-shift-masks", "HasFastVectorShiftMasks", "true",
"Prefer a left/right vector logical shift pair over a shift+and pair">;
-def FeatureFastMOVBE
+def TuningFastMOVBE
: SubtargetFeature<"fast-movbe", "HasFastMOVBE", "true",
"Prefer a movbe over a single-use load + bswap / single-use bswap + store">;
-def FeatureUseGLMDivSqrtCosts
+def TuningUseSLMArithCosts
+ : SubtargetFeature<"use-slm-arith-costs", "UseSLMArithCosts", "true",
+ "Use Silvermont specific arithmetic costs">;
+
+def TuningUseGLMDivSqrtCosts
: SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true",
"Use Goldmont specific floating point div/sqrt costs">;
@@ -531,10 +582,13 @@ def FeatureUseGLMDivSqrtCosts
def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
"Use alias analysis during codegen">;
+//===----------------------------------------------------------------------===//
+// X86 CPU Families
+// TODO: Remove these - use general tuning features to determine codegen.
+//===----------------------------------------------------------------------===//
+
// Bonnell
def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">;
-// Silvermont
-def ProcIntelSLM : SubtargetFeature<"", "X86ProcFamily", "IntelSLM", "">;
//===----------------------------------------------------------------------===//
// Register File Description
@@ -569,6 +623,7 @@ include "X86ScheduleBdVer2.td"
include "X86ScheduleBtVer2.td"
include "X86SchedSkylakeClient.td"
include "X86SchedSkylakeServer.td"
+include "X86SchedIceLake.td"
//===----------------------------------------------------------------------===//
// X86 Processor Feature Lists
@@ -580,9 +635,10 @@ def ProcessorFeatures {
FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSE2,
FeatureFXSR, FeatureNOPL, Feature64Bit
];
- list<SubtargetFeature> X86_64V2Features = !listconcat(
- X86_64V1Features,
- [FeatureCMPXCHG16B, FeatureLAHFSAHF, FeaturePOPCNT, FeatureSSE42]);
+ list<SubtargetFeature> X86_64V2Features = !listconcat(X86_64V1Features, [
+ FeatureCMPXCHG16B, FeatureLAHFSAHF, FeatureCRC32, FeaturePOPCNT,
+ FeatureSSE42
+ ]);
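+  // Illustrative note (not part of the original change): !listconcat simply
+  // appends its operands, so X86_64V2Features is everything in
+  // X86_64V1Features followed by [FeatureCMPXCHG16B, FeatureLAHFSAHF,
+  // FeatureCRC32, FeaturePOPCNT, FeatureSSE42], and each later x86-64 level
+  // keeps stacking on the previous one in the same way.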
list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [
FeatureAVX2, FeatureBMI, FeatureBMI2, FeatureF16C, FeatureFMA, FeatureLZCNT,
FeatureMOVBE, FeatureXSAVE
@@ -596,8 +652,8 @@ def ProcessorFeatures {
// Nehalem
list<SubtargetFeature> NHMFeatures = X86_64V2Features;
- list<SubtargetFeature> NHMTuning = [FeatureMacroFusion,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
+ TuningInsertVZEROUPPER];
// Westmere
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
@@ -609,15 +665,15 @@ def ProcessorFeatures {
list<SubtargetFeature> SNBAdditionalFeatures = [FeatureAVX,
FeatureXSAVE,
FeatureXSAVEOPT];
- list<SubtargetFeature> SNBTuning = [FeatureMacroFusion,
- FeatureSlow3OpsLEA,
- FeatureSlowDivide64,
- FeatureSlowUAMem32,
- FeatureFastScalarFSQRT,
- FeatureFastSHLDRotate,
- FeatureFast15ByteNOP,
- FeaturePOPCNTFalseDeps,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> SNBTuning = [TuningMacroFusion,
+ TuningSlow3OpsLEA,
+ TuningSlowDivide64,
+ TuningSlowUAMem32,
+ TuningFastScalarFSQRT,
+ TuningFastSHLDRotate,
+ TuningFast15ByteNOP,
+ TuningPOPCNTFalseDeps,
+ TuningInsertVZEROUPPER];
list<SubtargetFeature> SNBFeatures =
!listconcat(WSMFeatures, SNBAdditionalFeatures);
@@ -638,17 +694,17 @@ def ProcessorFeatures {
FeatureINVPCID,
FeatureLZCNT,
FeatureMOVBE];
- list<SubtargetFeature> HSWTuning = [FeatureMacroFusion,
- FeatureSlow3OpsLEA,
- FeatureSlowDivide64,
- FeatureFastScalarFSQRT,
- FeatureFastSHLDRotate,
- FeatureFast15ByteNOP,
- FeatureFastVariableCrossLaneShuffle,
- FeatureFastVariablePerLaneShuffle,
- FeaturePOPCNTFalseDeps,
- FeatureLZCNTFalseDeps,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> HSWTuning = [TuningMacroFusion,
+ TuningSlow3OpsLEA,
+ TuningSlowDivide64,
+ TuningFastScalarFSQRT,
+ TuningFastSHLDRotate,
+ TuningFast15ByteNOP,
+ TuningFastVariableCrossLaneShuffle,
+ TuningFastVariablePerLaneShuffle,
+ TuningPOPCNTFalseDeps,
+ TuningLZCNTFalseDeps,
+ TuningInsertVZEROUPPER];
list<SubtargetFeature> HSWFeatures =
!listconcat(IVBFeatures, HSWAdditionalFeatures);
@@ -665,18 +721,18 @@ def ProcessorFeatures {
FeatureXSAVEC,
FeatureXSAVES,
FeatureCLFLUSHOPT];
- list<SubtargetFeature> SKLTuning = [FeatureHasFastGather,
- FeatureMacroFusion,
- FeatureSlow3OpsLEA,
- FeatureSlowDivide64,
- FeatureFastScalarFSQRT,
- FeatureFastVectorFSQRT,
- FeatureFastSHLDRotate,
- FeatureFast15ByteNOP,
- FeatureFastVariableCrossLaneShuffle,
- FeatureFastVariablePerLaneShuffle,
- FeaturePOPCNTFalseDeps,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> SKLTuning = [TuningFastGather,
+ TuningMacroFusion,
+ TuningSlow3OpsLEA,
+ TuningSlowDivide64,
+ TuningFastScalarFSQRT,
+ TuningFastVectorFSQRT,
+ TuningFastSHLDRotate,
+ TuningFast15ByteNOP,
+ TuningFastVariableCrossLaneShuffle,
+ TuningFastVariablePerLaneShuffle,
+ TuningPOPCNTFalseDeps,
+ TuningInsertVZEROUPPER];
list<SubtargetFeature> SKLFeatures =
!listconcat(BDWFeatures, SKLAdditionalFeatures);
@@ -692,19 +748,19 @@ def ProcessorFeatures {
FeatureVLX,
FeaturePKU,
FeatureCLWB];
- list<SubtargetFeature> SKXTuning = [FeatureHasFastGather,
- FeatureMacroFusion,
- FeatureSlow3OpsLEA,
- FeatureSlowDivide64,
- FeatureFastScalarFSQRT,
- FeatureFastVectorFSQRT,
- FeatureFastSHLDRotate,
- FeatureFast15ByteNOP,
- FeatureFastVariableCrossLaneShuffle,
- FeatureFastVariablePerLaneShuffle,
- FeaturePrefer256Bit,
- FeaturePOPCNTFalseDeps,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> SKXTuning = [TuningFastGather,
+ TuningMacroFusion,
+ TuningSlow3OpsLEA,
+ TuningSlowDivide64,
+ TuningFastScalarFSQRT,
+ TuningFastVectorFSQRT,
+ TuningFastSHLDRotate,
+ TuningFast15ByteNOP,
+ TuningFastVariableCrossLaneShuffle,
+ TuningFastVariablePerLaneShuffle,
+ TuningPrefer256Bit,
+ TuningPOPCNTFalseDeps,
+ TuningInsertVZEROUPPER];
list<SubtargetFeature> SKXFeatures =
!listconcat(BDWFeatures, SKXAdditionalFeatures);
@@ -730,18 +786,18 @@ def ProcessorFeatures {
FeatureVBMI,
FeatureIFMA,
FeatureSHA];
- list<SubtargetFeature> CNLTuning = [FeatureHasFastGather,
- FeatureMacroFusion,
- FeatureSlow3OpsLEA,
- FeatureSlowDivide64,
- FeatureFastScalarFSQRT,
- FeatureFastVectorFSQRT,
- FeatureFastSHLDRotate,
- FeatureFast15ByteNOP,
- FeatureFastVariableCrossLaneShuffle,
- FeatureFastVariablePerLaneShuffle,
- FeaturePrefer256Bit,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> CNLTuning = [TuningFastGather,
+ TuningMacroFusion,
+ TuningSlow3OpsLEA,
+ TuningSlowDivide64,
+ TuningFastScalarFSQRT,
+ TuningFastVectorFSQRT,
+ TuningFastSHLDRotate,
+ TuningFast15ByteNOP,
+ TuningFastVariableCrossLaneShuffle,
+ TuningFastVariablePerLaneShuffle,
+ TuningPrefer256Bit,
+ TuningInsertVZEROUPPER];
list<SubtargetFeature> CNLFeatures =
!listconcat(SKLFeatures, CNLAdditionalFeatures);
@@ -755,7 +811,18 @@ def ProcessorFeatures {
FeatureGFNI,
FeatureRDPID,
FeatureFSRM];
- list<SubtargetFeature> ICLTuning = CNLTuning;
+ list<SubtargetFeature> ICLTuning = [TuningFastGather,
+ TuningMacroFusion,
+ TuningSlow3OpsLEA,
+ TuningSlowDivide64,
+ TuningFastScalarFSQRT,
+ TuningFastVectorFSQRT,
+ TuningFastSHLDRotate,
+ TuningFast15ByteNOP,
+ TuningFastVariableCrossLaneShuffle,
+ TuningFastVariablePerLaneShuffle,
+ TuningPrefer256Bit,
+ TuningInsertVZEROUPPER];
list<SubtargetFeature> ICLFeatures =
!listconcat(CNLFeatures, ICLAdditionalFeatures);
@@ -763,7 +830,7 @@ def ProcessorFeatures {
list<SubtargetFeature> ICXAdditionalFeatures = [FeaturePCONFIG,
FeatureCLWB,
FeatureWBNOINVD];
- list<SubtargetFeature> ICXTuning = CNLTuning;
+ list<SubtargetFeature> ICXTuning = ICLTuning;
list<SubtargetFeature> ICXFeatures =
!listconcat(ICLFeatures, ICXAdditionalFeatures);
@@ -773,7 +840,7 @@ def ProcessorFeatures {
FeatureMOVDIRI,
FeatureMOVDIR64B,
FeatureSHSTK];
- list<SubtargetFeature> TGLTuning = CNLTuning;
+ list<SubtargetFeature> TGLTuning = ICLTuning;
list<SubtargetFeature> TGLFeatures =
!listconcat(ICLFeatures, TGLAdditionalFeatures );
@@ -786,6 +853,7 @@ def ProcessorFeatures {
FeatureCLDEMOTE,
FeatureWAITPKG,
FeaturePTWRITE,
+ FeatureFP16,
FeatureAVXVNNI,
FeatureTSXLDTRK,
FeatureENQCMD,
@@ -811,31 +879,32 @@ def ProcessorFeatures {
FeatureMOVBE,
FeatureLAHFSAHF];
list<SubtargetFeature> AtomTuning = [ProcIntelAtom,
- FeatureSlowUAMem16,
- FeatureLEAForSP,
- FeatureSlowDivide32,
- FeatureSlowDivide64,
- FeatureSlowTwoMemOps,
- FeatureLEAUsesAG,
- FeaturePadShortFunctions,
- FeatureInsertVZEROUPPER];
+ TuningSlowUAMem16,
+ TuningLEAForSP,
+ TuningSlowDivide32,
+ TuningSlowDivide64,
+ TuningSlowTwoMemOps,
+ TuningLEAUsesAG,
+ TuningPadShortFunctions,
+ TuningInsertVZEROUPPER];
// Silvermont
list<SubtargetFeature> SLMAdditionalFeatures = [FeatureSSE42,
+ FeatureCRC32,
FeaturePOPCNT,
FeaturePCLMUL,
FeaturePRFCHW,
FeatureRDRAND];
- list<SubtargetFeature> SLMTuning = [ProcIntelSLM,
- FeatureSlowTwoMemOps,
- FeatureSlowLEA,
- FeatureSlowIncDec,
- FeatureSlowDivide64,
- FeatureSlowPMULLD,
- FeatureFast7ByteNOP,
- FeatureFastMOVBE,
- FeaturePOPCNTFalseDeps,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> SLMTuning = [TuningUseSLMArithCosts,
+ TuningSlowTwoMemOps,
+ TuningSlowLEA,
+ TuningSlowIncDec,
+ TuningSlowDivide64,
+ TuningSlowPMULLD,
+ TuningFast7ByteNOP,
+ TuningFastMOVBE,
+ TuningPOPCNTFalseDeps,
+ TuningInsertVZEROUPPER];
list<SubtargetFeature> SLMFeatures =
!listconcat(AtomFeatures, SLMAdditionalFeatures);
@@ -849,25 +918,25 @@ def ProcessorFeatures {
FeatureXSAVES,
FeatureCLFLUSHOPT,
FeatureFSGSBase];
- list<SubtargetFeature> GLMTuning = [FeatureUseGLMDivSqrtCosts,
- FeatureSlowTwoMemOps,
- FeatureSlowLEA,
- FeatureSlowIncDec,
- FeatureFastMOVBE,
- FeaturePOPCNTFalseDeps,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> GLMTuning = [TuningUseGLMDivSqrtCosts,
+ TuningSlowTwoMemOps,
+ TuningSlowLEA,
+ TuningSlowIncDec,
+ TuningFastMOVBE,
+ TuningPOPCNTFalseDeps,
+ TuningInsertVZEROUPPER];
list<SubtargetFeature> GLMFeatures =
!listconcat(SLMFeatures, GLMAdditionalFeatures);
// Goldmont Plus
list<SubtargetFeature> GLPAdditionalFeatures = [FeaturePTWRITE,
FeatureRDPID];
- list<SubtargetFeature> GLPTuning = [FeatureUseGLMDivSqrtCosts,
- FeatureSlowTwoMemOps,
- FeatureSlowLEA,
- FeatureSlowIncDec,
- FeatureFastMOVBE,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> GLPTuning = [TuningUseGLMDivSqrtCosts,
+ TuningSlowTwoMemOps,
+ TuningSlowLEA,
+ TuningSlowIncDec,
+ TuningFastMOVBE,
+ TuningInsertVZEROUPPER];
list<SubtargetFeature> GLPFeatures =
!listconcat(GLMFeatures, GLPAdditionalFeatures);
@@ -912,6 +981,7 @@ def ProcessorFeatures {
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B,
+ FeatureCRC32,
FeaturePOPCNT,
FeaturePCLMUL,
FeatureXSAVE,
@@ -934,14 +1004,14 @@ def ProcessorFeatures {
FeatureBMI2,
FeatureFMA,
FeaturePRFCHW];
- list<SubtargetFeature> KNLTuning = [FeatureSlowDivide64,
- FeatureSlow3OpsLEA,
- FeatureSlowIncDec,
- FeatureSlowTwoMemOps,
- FeaturePreferMaskRegisters,
- FeatureHasFastGather,
- FeatureFastMOVBE,
- FeatureSlowPMADDWD];
+ list<SubtargetFeature> KNLTuning = [TuningSlowDivide64,
+ TuningSlow3OpsLEA,
+ TuningSlowIncDec,
+ TuningSlowTwoMemOps,
+ TuningPreferMaskRegisters,
+ TuningFastGather,
+ TuningFastMOVBE,
+ TuningSlowPMADDWD];
// TODO Add AVX5124FMAPS/AVX5124VNNIW features
list<SubtargetFeature> KNMFeatures =
!listconcat(KNLFeatures, [FeatureVPOPCNTDQ]);
@@ -960,9 +1030,9 @@ def ProcessorFeatures {
FeatureLAHFSAHF,
FeatureCMOV,
Feature64Bit];
- list<SubtargetFeature> BarcelonaTuning = [FeatureFastScalarShiftMasks,
- FeatureSlowSHLD,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> BarcelonaTuning = [TuningFastScalarShiftMasks,
+ TuningSlowSHLD,
+ TuningInsertVZEROUPPER];
// Bobcat
list<SubtargetFeature> BtVer1Features = [FeatureX87,
@@ -979,29 +1049,30 @@ def ProcessorFeatures {
FeatureLZCNT,
FeaturePOPCNT,
FeatureLAHFSAHF];
- list<SubtargetFeature> BtVer1Tuning = [FeatureFast15ByteNOP,
- FeatureFastScalarShiftMasks,
- FeatureFastVectorShiftMasks,
- FeatureSlowSHLD,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> BtVer1Tuning = [TuningFast15ByteNOP,
+ TuningFastScalarShiftMasks,
+ TuningFastVectorShiftMasks,
+ TuningSlowSHLD,
+ TuningInsertVZEROUPPER];
// Jaguar
list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX,
FeatureAES,
+ FeatureCRC32,
FeaturePCLMUL,
FeatureBMI,
FeatureF16C,
FeatureMOVBE,
FeatureXSAVE,
FeatureXSAVEOPT];
- list<SubtargetFeature> BtVer2Tuning = [FeatureFastLZCNT,
- FeatureFastBEXTR,
- FeatureFastHorizontalOps,
- FeatureFast15ByteNOP,
- FeatureFastScalarShiftMasks,
- FeatureFastVectorShiftMasks,
- FeatureFastMOVBE,
- FeatureSlowSHLD];
+ list<SubtargetFeature> BtVer2Tuning = [TuningFastLZCNT,
+ TuningFastBEXTR,
+ TuningFastHorizontalOps,
+ TuningFast15ByteNOP,
+ TuningFastScalarShiftMasks,
+ TuningFastVectorShiftMasks,
+ TuningFastMOVBE,
+ TuningSlowSHLD];
list<SubtargetFeature> BtVer2Features =
!listconcat(BtVer1Features, BtVer2AdditionalFeatures);
@@ -1013,6 +1084,7 @@ def ProcessorFeatures {
Feature64Bit,
FeatureCMPXCHG16B,
FeatureAES,
+ FeatureCRC32,
FeaturePRFCHW,
FeaturePCLMUL,
FeatureMMX,
@@ -1023,19 +1095,19 @@ def ProcessorFeatures {
FeatureXSAVE,
FeatureLWP,
FeatureLAHFSAHF];
- list<SubtargetFeature> BdVer1Tuning = [FeatureSlowSHLD,
- FeatureFast11ByteNOP,
- FeatureFastScalarShiftMasks,
- FeatureBranchFusion,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> BdVer1Tuning = [TuningSlowSHLD,
+ TuningFast11ByteNOP,
+ TuningFastScalarShiftMasks,
+ TuningBranchFusion,
+ TuningInsertVZEROUPPER];
// PileDriver
list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
FeatureBMI,
FeatureTBM,
- FeatureFMA,
- FeatureFastBEXTR];
- list<SubtargetFeature> BdVer2AdditionalTuning = [FeatureFastMOVBE];
+ FeatureFMA];
+ list<SubtargetFeature> BdVer2AdditionalTuning = [TuningFastBEXTR,
+ TuningFastMOVBE];
list<SubtargetFeature> BdVer2Tuning =
!listconcat(BdVer1Tuning, BdVer2AdditionalTuning);
list<SubtargetFeature> BdVer2Features =
@@ -1070,6 +1142,7 @@ def ProcessorFeatures {
FeatureCMOV,
Feature64Bit,
FeatureCMPXCHG16B,
+ FeatureCRC32,
FeatureF16C,
FeatureFMA,
FeatureFSGSBase,
@@ -1092,14 +1165,14 @@ def ProcessorFeatures {
FeatureXSAVEC,
FeatureXSAVEOPT,
FeatureXSAVES];
- list<SubtargetFeature> ZNTuning = [FeatureFastLZCNT,
- FeatureFastBEXTR,
- FeatureFast15ByteNOP,
- FeatureBranchFusion,
- FeatureFastScalarShiftMasks,
- FeatureFastMOVBE,
- FeatureSlowSHLD,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> ZNTuning = [TuningFastLZCNT,
+ TuningFastBEXTR,
+ TuningFast15ByteNOP,
+ TuningBranchFusion,
+ TuningFastScalarShiftMasks,
+ TuningFastMOVBE,
+ TuningSlowSHLD,
+ TuningInsertVZEROUPPER];
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID,
FeatureWBNOINVD];
@@ -1112,8 +1185,8 @@ def ProcessorFeatures {
FeatureVAES,
FeatureVPCLMULQDQ];
list<SubtargetFeature> ZN3AdditionalTuning =
- [FeatureMacroFusion,
- FeatureFastVariablePerLaneShuffle];
+ [TuningMacroFusion,
+ TuningFastVariablePerLaneShuffle];
list<SubtargetFeature> ZN3Tuning =
!listconcat(ZNTuning, ZN3AdditionalTuning);
list<SubtargetFeature> ZN3Features =
@@ -1140,37 +1213,37 @@ class ProcModel<string Name, SchedMachineModel Model,
// It has no effect on code generation.
def : ProcModel<"generic", SandyBridgeModel,
[FeatureX87, FeatureCMPXCHG8B, Feature64Bit],
- [FeatureSlow3OpsLEA,
- FeatureSlowDivide64,
- FeatureSlowIncDec,
- FeatureMacroFusion,
- FeatureInsertVZEROUPPER]>;
+ [TuningSlow3OpsLEA,
+ TuningSlowDivide64,
+ TuningSlowIncDec,
+ TuningMacroFusion,
+ TuningInsertVZEROUPPER]>;
def : Proc<"i386", [FeatureX87],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"i486", [FeatureX87],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"i586", [FeatureX87, FeatureCMPXCHG8B],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"pentium", [FeatureX87, FeatureCMPXCHG8B],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"pentium-mmx", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"i686", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"pentiumpro", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV,
FeatureNOPL],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"pentium2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureCMOV,
FeatureFXSR, FeatureNOPL],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
foreach P = ["pentium3", "pentium3m"] in {
def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureMMX,
FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
}
// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
@@ -1186,30 +1259,30 @@ foreach P = ["pentium3", "pentium3m"] in {
def : ProcModel<"pentium-m", GenericPostRAModel,
[FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2,
FeatureFXSR, FeatureNOPL, FeatureCMOV],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
foreach P = ["pentium4", "pentium4m"] in {
def : ProcModel<P, GenericPostRAModel,
[FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2,
FeatureFXSR, FeatureNOPL, FeatureCMOV],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
}
// Intel Quark.
def : Proc<"lakemont", [FeatureCMPXCHG8B],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
// Intel Core Duo.
def : ProcModel<"yonah", SandyBridgeModel,
[FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3,
FeatureFXSR, FeatureNOPL, FeatureCMOV],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
// NetBurst.
def : ProcModel<"prescott", GenericPostRAModel,
[FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3,
FeatureFXSR, FeatureNOPL, FeatureCMOV],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : ProcModel<"nocona", GenericPostRAModel, [
FeatureX87,
FeatureCMPXCHG8B,
@@ -1222,8 +1295,8 @@ def : ProcModel<"nocona", GenericPostRAModel, [
FeatureCMPXCHG16B,
],
[
- FeatureSlowUAMem16,
- FeatureInsertVZEROUPPER
+ TuningSlowUAMem16,
+ TuningInsertVZEROUPPER
]>;
// Intel Core 2 Solo/Duo.
@@ -1240,9 +1313,9 @@ def : ProcModel<"core2", SandyBridgeModel, [
FeatureLAHFSAHF
],
[
- FeatureMacroFusion,
- FeatureSlowUAMem16,
- FeatureInsertVZEROUPPER
+ TuningMacroFusion,
+ TuningSlowUAMem16,
+ TuningInsertVZEROUPPER
]>;
def : ProcModel<"penryn", SandyBridgeModel, [
FeatureX87,
@@ -1257,9 +1330,9 @@ def : ProcModel<"penryn", SandyBridgeModel, [
FeatureLAHFSAHF
],
[
- FeatureMacroFusion,
- FeatureSlowUAMem16,
- FeatureInsertVZEROUPPER
+ TuningMacroFusion,
+ TuningSlowUAMem16,
+ TuningInsertVZEROUPPER
]>;
// Atom CPUs.
@@ -1328,13 +1401,13 @@ def : ProcModel<"cooperlake", SkylakeServerModel,
ProcessorFeatures.CPXFeatures, ProcessorFeatures.CPXTuning>;
def : ProcModel<"cannonlake", SkylakeServerModel,
ProcessorFeatures.CNLFeatures, ProcessorFeatures.CNLTuning>;
-def : ProcModel<"icelake-client", SkylakeServerModel,
+def : ProcModel<"icelake-client", IceLakeModel,
ProcessorFeatures.ICLFeatures, ProcessorFeatures.ICLTuning>;
-def : ProcModel<"rocketlake", SkylakeServerModel,
+def : ProcModel<"rocketlake", IceLakeModel,
ProcessorFeatures.ICLFeatures, ProcessorFeatures.ICLTuning>;
-def : ProcModel<"icelake-server", SkylakeServerModel,
+def : ProcModel<"icelake-server", IceLakeModel,
ProcessorFeatures.ICXFeatures, ProcessorFeatures.ICXTuning>;
-def : ProcModel<"tigerlake", SkylakeServerModel,
+def : ProcModel<"tigerlake", IceLakeModel,
ProcessorFeatures.TGLFeatures, ProcessorFeatures.TGLTuning>;
def : ProcModel<"sapphirerapids", SkylakeServerModel,
ProcessorFeatures.SPRFeatures, ProcessorFeatures.SPRTuning>;
@@ -1344,37 +1417,37 @@ def : ProcModel<"alderlake", SkylakeClientModel,
// AMD CPUs.
def : Proc<"k6", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"k6-2", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"k6-3", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
foreach P = ["athlon", "athlon-tbird"] in {
def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, Feature3DNowA,
FeatureNOPL],
- [FeatureSlowSHLD, FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowSHLD, TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
}
foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV,
FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL],
- [FeatureSlowSHLD, FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowSHLD, TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
}
foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE2, Feature3DNowA,
FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureCMOV],
- [FeatureFastScalarShiftMasks, FeatureSlowSHLD, FeatureSlowUAMem16,
- FeatureInsertVZEROUPPER]>;
+ [TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16,
+ TuningInsertVZEROUPPER]>;
}
foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE3, Feature3DNowA,
FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureCMOV,
Feature64Bit],
- [FeatureFastScalarShiftMasks, FeatureSlowSHLD, FeatureSlowUAMem16,
- FeatureInsertVZEROUPPER]>;
+ [TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16,
+ TuningInsertVZEROUPPER]>;
}
foreach P = ["amdfam10", "barcelona"] in {
@@ -1410,17 +1483,17 @@ def : ProcModel<"znver3", Znver3Model, ProcessorFeatures.ZN3Features,
ProcessorFeatures.ZN3Tuning>;
def : Proc<"geode", [FeatureX87, FeatureCMPXCHG8B, Feature3DNowA],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"winchip-c6", [FeatureX87, FeatureMMX],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"winchip2", [FeatureX87, Feature3DNow],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"c3", [FeatureX87, Feature3DNow],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
def : Proc<"c3-2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX,
FeatureSSE1, FeatureFXSR, FeatureCMOV],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
// We also provide a generic 64-bit specific x86 processor model which tries to
// be good for modern chips without enabling instruction set encodings past the
@@ -1434,11 +1507,11 @@ def : Proc<"c3-2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX,
// forming a common base for them.
def : ProcModel<"x86-64", SandyBridgeModel, ProcessorFeatures.X86_64V1Features,
[
- FeatureSlow3OpsLEA,
- FeatureSlowDivide64,
- FeatureSlowIncDec,
- FeatureMacroFusion,
- FeatureInsertVZEROUPPER
+ TuningSlow3OpsLEA,
+ TuningSlowDivide64,
+ TuningSlowIncDec,
+ TuningMacroFusion,
+ TuningInsertVZEROUPPER
]>;
// x86-64 micro-architecture levels.
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index a27645389dd4..2e08482e4ff6 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -37,10 +37,10 @@
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -190,6 +190,7 @@ void X86AsmPrinter::PrintSymbolOperand(const MachineOperand &MO,
case X86II::MO_NTPOFF: O << "@NTPOFF"; break;
case X86II::MO_GOTNTPOFF: O << "@GOTNTPOFF"; break;
case X86II::MO_GOTPCREL: O << "@GOTPCREL"; break;
+ case X86II::MO_GOTPCREL_NORELAX: O << "@GOTPCREL_NORELAX"; break;
case X86II::MO_GOT: O << "@GOT"; break;
case X86II::MO_GOTOFF: O << "@GOTOFF"; break;
case X86II::MO_PLT: O << "@PLT"; break;
@@ -753,6 +754,8 @@ static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) {
void X86AsmPrinter::emitEndOfAsmFile(Module &M) {
const Triple &TT = TM.getTargetTriple();
+ emitAsanMemaccessSymbols(M);
+
if (TT.isOSBinFormatMachO()) {
// Mach-O uses non-lazy symbol stubs to encode per-TU information into
// global table for symbol lookup.
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h
index a3b74c8ee387..3b0983a7d935 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -23,6 +23,7 @@ class MCCodeEmitter;
class MCStreamer;
class X86Subtarget;
class TargetMachine;
+struct ASanAccessInfo;
class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
const X86Subtarget *Subtarget = nullptr;
@@ -30,7 +31,6 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
FaultMaps FM;
std::unique_ptr<MCCodeEmitter> CodeEmitter;
bool EmitFPOData = false;
- bool NeedsRetpoline = false;
// This utility class tracks the length of a stackmap instruction's 'shadow'.
// It is used by the X86AsmPrinter to ensure that the stackmap shadow
@@ -98,6 +98,23 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
void LowerFENTRY_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
+ // Address sanitizer specific lowering for X86.
+ void LowerASAN_CHECK_MEMACCESS(const MachineInstr &MI);
+ void emitAsanMemaccessSymbols(Module &M);
+ void emitAsanMemaccessPartial(Module &M, unsigned Reg,
+ const ASanAccessInfo &AccessInfo,
+ MCSubtargetInfo &STI);
+ void emitAsanMemaccessFull(Module &M, unsigned Reg,
+ const ASanAccessInfo &AccessInfo,
+ MCSubtargetInfo &STI);
+ void emitAsanReportError(Module &M, unsigned Reg,
+ const ASanAccessInfo &AccessInfo,
+ MCSubtargetInfo &STI);
+
+ typedef std::tuple<unsigned /*Reg*/, uint32_t /*AccessInfo*/>
+ AsanMemaccessTuple;
+ std::map<AsanMemaccessTuple, MCSymbol *> AsanMemaccessSymbols;
+
// Choose between emitting .seh_ directives and .cv_fpo_ directives.
void EmitSEHInstruction(const MachineInstr *MI);
diff --git a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index b6a37f08d7e9..04931afdec51 100644
--- a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -360,22 +360,17 @@ findPotentialBlockers(MachineInstr *LoadInst) {
if (BlockCount < InspectionLimit) {
MachineBasicBlock *MBB = LoadInst->getParent();
int LimitLeft = InspectionLimit - BlockCount;
- for (MachineBasicBlock::pred_iterator PB = MBB->pred_begin(),
- PE = MBB->pred_end();
- PB != PE; ++PB) {
- MachineBasicBlock *PMBB = *PB;
+ for (MachineBasicBlock *PMBB : MBB->predecessors()) {
int PredCount = 0;
- for (MachineBasicBlock::reverse_iterator PBInst = PMBB->rbegin(),
- PME = PMBB->rend();
- PBInst != PME; ++PBInst) {
- if (PBInst->isMetaInstruction())
+ for (MachineInstr &PBInst : llvm::reverse(*PMBB)) {
+ if (PBInst.isMetaInstruction())
continue;
PredCount++;
if (PredCount >= LimitLeft)
break;
- if (PBInst->getDesc().isCall())
+ if (PBInst.getDesc().isCall())
break;
- PotentialBlockers.push_back(&*PBInst);
+ PotentialBlockers.push_back(&PBInst);
}
}
}
@@ -542,9 +537,8 @@ void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
int DefVR = MI.getOperand(0).getReg();
if (!MRI->hasOneNonDBGUse(DefVR))
continue;
- for (auto UI = MRI->use_nodbg_begin(DefVR), UE = MRI->use_nodbg_end();
- UI != UE;) {
- MachineOperand &StoreMO = *UI++;
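+    // llvm::make_early_inc_range advances the iterator before each body runs,
+    // mirroring the old "*UI++" pattern, so the walk stays valid even if the
+    // current use's instruction is touched inside the loop.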
+ for (MachineOperand &StoreMO :
+ llvm::make_early_inc_range(MRI->use_nodbg_operands(DefVR))) {
MachineInstr &StoreMI = *StoreMO.getParent();
// Skip cases where the memcpy may overlap.
if (StoreMI.getParent() == MI.getParent() &&
diff --git a/llvm/lib/Target/X86/X86CallLowering.cpp b/llvm/lib/Target/X86/X86CallLowering.cpp
index c8bffb4d4d37..a14ce82313cb 100644
--- a/llvm/lib/Target/X86/X86CallLowering.cpp
+++ b/llvm/lib/Target/X86/X86CallLowering.cpp
@@ -105,7 +105,7 @@ struct X86OutgoingValueHandler : public CallLowering::OutgoingValueHandler {
}
void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign &VA) override {
+ CCValAssign VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
Register ExtReg = extendRegister(ValVReg, VA);
MIRBuilder.buildCopy(PhysReg, ExtReg);
@@ -195,7 +195,7 @@ struct X86IncomingValueHandler : public CallLowering::IncomingValueHandler {
}
void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign &VA) override {
+ CCValAssign VA) override {
markPhysRegUsed(PhysReg);
IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
}
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 98883bbf59a8..4dd8a6cdd898 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -23,6 +23,13 @@ class CCIfNotSubtarget<string F, CCAction A>
"(State.getMachineFunction().getSubtarget()).", F),
A>;
+/// CCIfIsVarArgOnWin - Match if this is a vararg call on 32-bit Windows.
+class CCIfIsVarArgOnWin<CCAction A>
+ : CCIf<"State.isVarArg() && "
+ "State.getMachineFunction().getSubtarget().getTargetTriple()."
+ "isWindowsMSVCEnvironment()",
+ A>;
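+
+// Illustrative note (not part of the original change): this predicate is used
+// below as CCIfIsVarArgOnWin<CCDelegateTo<CC_X86_Win32_Vector>> so that vector
+// arguments of variadic calls on 32-bit Windows fall back to 4-byte-aligned
+// stack slots instead of XMM/YMM/ZMM registers.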
+
// Register classes for RegCall
class RC_X86_RegCall {
list<Register> GPR_8 = [];
@@ -233,19 +240,19 @@ def RetCC_X86Common : CallingConv<[
// Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3
// can only be used by ABI non-compliant code. If the target doesn't have XMM
// registers, it won't have vector types.
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
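+  // Illustrative note (not part of the original change): the v8f16 / v16f16 /
+  // v32f16 types added throughout this file are the AVX512FP16 vector types,
+  // so e.g. an IR function returning <8 x half> is now assigned XMM0 here just
+  // like one returning <4 x float>.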
// 256-bit vectors are returned in YMM0 and XMM1, when they fit. YMM2 and YMM3
// can only be used by ABI non-compliant code. This vector type is only
// supported while using the AVX target feature.
- CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64],
CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
// 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3
// can only be used by ABI non-compliant code. This vector type is only
// supported while using the AVX-512 target feature.
- CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64],
CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
// MMX vector types are always returned in MM0. If the target doesn't have
@@ -266,7 +273,11 @@ def RetCC_X86_32_C : CallingConv<[
// conv.
CCIfInReg<CCIfSubtarget<"hasSSE2()",
CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
- CCIfType<[f32,f64], CCAssignToReg<[FP0, FP1]>>,
+ CCIfSubtarget<"hasX87()",
+ CCIfType<[f32, f64], CCAssignToReg<[FP0, FP1]>>>,
+ CCIfNotSubtarget<"hasX87()",
+ CCIfType<[f32], CCAssignToReg<[EAX, EDX, ECX]>>>,
+ CCIfType<[f16], CCAssignToReg<[XMM0,XMM1,XMM2]>>,
CCDelegateTo<RetCC_X86Common>
]>;
@@ -329,6 +340,7 @@ def RetCC_X86_32_VectorCall : CallingConv<[
// X86-64 C return-value convention.
def RetCC_X86_64_C : CallingConv<[
// The X86-64 calling convention always returns FP values in XMM0.
+ CCIfType<[f16], CCAssignToReg<[XMM0, XMM1]>>,
CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>,
CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>,
CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>,
@@ -552,7 +564,7 @@ def CC_X86_64_C : CallingConv<[
CCIfType<[v64i1], CCPromoteToType<v64i8>>,
// The first 8 FP/Vector arguments are passed in XMM registers.
- CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfType<[f16, f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
CCIfSubtarget<"hasSSE1()",
CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
@@ -561,33 +573,33 @@ def CC_X86_64_C : CallingConv<[
// FIXME: This isn't precisely correct; the x86-64 ABI document says that
// fixed arguments to vararg functions are supposed to be passed in
// registers. Actually modeling that would be a lot of work, though.
- CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64],
CCIfSubtarget<"hasAVX()",
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3,
YMM4, YMM5, YMM6, YMM7]>>>>,
// The first 8 512-bit vector arguments are passed in ZMM registers.
- CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64],
CCIfSubtarget<"hasAVX512()",
CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]>>>>,
// Integer/FP values get stored in stack slots that are 8 bytes in size and
// 8-byte aligned if there are no more registers to hold them.
- CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+ CCIfType<[i32, i64, f16, f32, f64], CCAssignToStack<8, 8>>,
// Long doubles get stack slots whose size and alignment depends on the
// subtarget.
CCIfType<[f80, f128], CCAssignToStack<0, 0>>,
// Vectors get 16-byte stack slots that are 16-byte aligned.
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCAssignToStack<16, 16>>,
// 256-bit vectors get 32-byte stack slots that are 32-byte aligned.
- CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64],
CCAssignToStack<32, 32>>,
// 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
- CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64],
CCAssignToStack<64, 64>>
]>;
@@ -635,13 +647,13 @@ def CC_X86_Win64_C : CallingConv<[
CCIfCFGuardTarget<CCAssignToReg<[RAX]>>,
// 128 bit vectors are passed by pointer
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>,
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCPassIndirect<i64>>,
// 256 bit vectors are passed by pointer
- CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>,
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], CCPassIndirect<i64>>,
// 512 bit vectors are passed by pointer
- CCIfType<[v64i8, v32i16, v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
+ CCIfType<[v64i8, v32i16, v16i32, v32f16, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
// Long doubles are passed by pointer
CCIfType<[f80], CCPassIndirect<i64>>,
@@ -655,7 +667,7 @@ def CC_X86_Win64_C : CallingConv<[
CCIfType<[f64], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i64>>>,
// The first 4 FP/Vector arguments are passed in XMM registers.
- CCIfType<[f32, f64],
+ CCIfType<[f16, f32, f64],
CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3],
[RCX , RDX , R8 , R9 ]>>,
@@ -678,7 +690,7 @@ def CC_X86_Win64_C : CallingConv<[
// Integer/FP values get stored in stack slots that are 8 bytes in size and
// 8-byte aligned if there are no more registers to hold them.
- CCIfType<[i8, i16, i32, i64, f32, f64], CCAssignToStack<8, 8>>
+ CCIfType<[i8, i16, i32, i64, f16, f32, f64], CCAssignToStack<8, 8>>
]>;
def CC_X86_Win64_VectorCall : CallingConv<[
@@ -757,33 +769,51 @@ def CC_X86_64_AnyReg : CallingConv<[
/// values are spilled on the stack.
def CC_X86_32_Vector_Common : CallingConv<[
// Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
+ CCAssignToStack<16, 16>>,
// 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned.
- CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64],
CCAssignToStack<32, 32>>,
// 512-bit AVX-512 vectors get 64-byte stack slots that are 64-byte aligned.
- CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64],
CCAssignToStack<64, 64>>
]>;
+/// CC_X86_Win32_Vector - In X86 Win32 calling conventions, extra vector
+/// values are spilled on the stack.
+def CC_X86_Win32_Vector : CallingConv<[
+ // Other SSE vectors get 16-byte stack slots that are 4-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
+ CCAssignToStack<16, 4>>,
+
+ // 256-bit AVX vectors get 32-byte stack slots that are 4-byte aligned.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64],
+ CCAssignToStack<32, 4>>,
+
+  // 512-bit AVX-512 vectors get 64-byte stack slots that are 4-byte aligned.
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64],
+ CCAssignToStack<64, 4>>
+]>;
+
// CC_X86_32_Vector_Standard - The first 3 vector arguments are passed in
// vector registers
def CC_X86_32_Vector_Standard : CallingConv<[
// SSE vector arguments are passed in XMM registers.
- CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
CCAssignToReg<[XMM0, XMM1, XMM2]>>>,
// AVX 256-bit vector arguments are passed in YMM registers.
- CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64],
CCIfSubtarget<"hasAVX()",
CCAssignToReg<[YMM0, YMM1, YMM2]>>>>,
// AVX 512-bit vector arguments are passed in ZMM registers.
- CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64],
CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>>,
+ CCIfIsVarArgOnWin<CCDelegateTo<CC_X86_Win32_Vector>>,
CCDelegateTo<CC_X86_32_Vector_Common>
]>;
@@ -791,16 +821,16 @@ def CC_X86_32_Vector_Standard : CallingConv<[
// vector registers.
def CC_X86_32_Vector_Darwin : CallingConv<[
// SSE vector arguments are passed in XMM registers.
- CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>,
// AVX 256-bit vector arguments are passed in YMM registers.
- CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64],
CCIfSubtarget<"hasAVX()",
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
// AVX 512-bit vector arguments are passed in ZMM registers.
- CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64],
CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>,
CCDelegateTo<CC_X86_32_Vector_Common>
@@ -819,11 +849,15 @@ def CC_X86_32_Common : CallingConv<[
CCIfSubtarget<"hasSSE2()",
CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>,
+ CCIfNotVarArg<CCIfInReg<CCIfType<[f16], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+
// The first 3 __m64 vector arguments are passed in mmx registers if the
// call is not a vararg call.
CCIfNotVarArg<CCIfType<[x86mmx],
CCAssignToReg<[MM0, MM1, MM2]>>>,
+ CCIfType<[f16], CCAssignToStack<4, 4>>,
+
// Integer/Float values get stored in stack slots that are 4 bytes in
// size and 4-byte aligned.
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
@@ -831,8 +865,8 @@ def CC_X86_32_Common : CallingConv<[
// Doubles get 8-byte slots that are 4-byte aligned.
CCIfType<[f64], CCAssignToStack<8, 4>>,
- // Long doubles get slots whose size depends on the subtarget.
- CCIfType<[f80], CCAssignToStack<0, 4>>,
+ // Long doubles get slots whose size and alignment depends on the subtarget.
+ CCIfType<[f80], CCAssignToStack<0, 0>>,
// Boolean vectors of AVX-512 are passed in SIMD registers.
// The call from AVX to AVX-512 function should work,
diff --git a/llvm/lib/Target/X86/X86CmovConversion.cpp b/llvm/lib/Target/X86/X86CmovConversion.cpp
index 05349a7c01f8..863438793acf 100644
--- a/llvm/lib/Target/X86/X86CmovConversion.cpp
+++ b/llvm/lib/Target/X86/X86CmovConversion.cpp
@@ -582,10 +582,9 @@ static bool checkEFLAGSLive(MachineInstr *MI) {
}
// We hit the end of the block, check whether EFLAGS is live into a successor.
- for (auto I = BB->succ_begin(), E = BB->succ_end(); I != E; ++I) {
- if ((*I)->isLiveIn(X86::EFLAGS))
+ for (MachineBasicBlock *Succ : BB->successors())
+ if (Succ->isLiveIn(X86::EFLAGS))
return true;
- }
return false;
}
@@ -797,8 +796,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
MOp.setIsKill(false);
}
}
- MBB->erase(MachineBasicBlock::iterator(MI),
- std::next(MachineBasicBlock::iterator(MI)));
+ MBB->erase(&MI);
// Add this PHI to the rewrite table.
FalseBBRegRewriteTable[NewCMOV->getOperand(0).getReg()] = TmpReg;
diff --git a/llvm/lib/Target/X86/X86WinAllocaExpander.cpp b/llvm/lib/Target/X86/X86DynAllocaExpander.cpp
index 9ada0a8dd412..df8df1e3a65d 100644
--- a/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
+++ b/llvm/lib/Target/X86/X86DynAllocaExpander.cpp
@@ -1,4 +1,4 @@
-//===----- X86WinAllocaExpander.cpp - Expand WinAlloca pseudo instruction -===//
+//===----- X86DynAllocaExpander.cpp - Expand DynAlloca pseudo instruction -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines a pass that expands WinAlloca pseudo-instructions.
+// This file defines a pass that expands DynAlloca pseudo-instructions.
//
// It performs a conservative analysis to determine whether each allocation
// falls within a region of the stack that is safe to use, or whether stack
@@ -33,26 +33,26 @@ using namespace llvm;
namespace {
-class X86WinAllocaExpander : public MachineFunctionPass {
+class X86DynAllocaExpander : public MachineFunctionPass {
public:
- X86WinAllocaExpander() : MachineFunctionPass(ID) {}
+ X86DynAllocaExpander() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
private:
- /// Strategies for lowering a WinAlloca.
+ /// Strategies for lowering a DynAlloca.
enum Lowering { TouchAndSub, Sub, Probe };
- /// Deterministic-order map from WinAlloca instruction to desired lowering.
+ /// Deterministic-order map from DynAlloca instruction to desired lowering.
typedef MapVector<MachineInstr*, Lowering> LoweringMap;
- /// Compute which lowering to use for each WinAlloca instruction.
+ /// Compute which lowering to use for each DynAlloca instruction.
void computeLowerings(MachineFunction &MF, LoweringMap& Lowerings);
/// Get the appropriate lowering based on current offset and amount.
Lowering getLowering(int64_t CurrentOffset, int64_t AllocaAmount);
- /// Lower a WinAlloca instruction.
+ /// Lower a DynAlloca instruction.
void lower(MachineInstr* MI, Lowering L);
MachineRegisterInfo *MRI = nullptr;
@@ -64,22 +64,22 @@ private:
int64_t StackProbeSize = 0;
bool NoStackArgProbe = false;
- StringRef getPassName() const override { return "X86 WinAlloca Expander"; }
+ StringRef getPassName() const override { return "X86 DynAlloca Expander"; }
static char ID;
};
-char X86WinAllocaExpander::ID = 0;
+char X86DynAllocaExpander::ID = 0;
} // end anonymous namespace
-FunctionPass *llvm::createX86WinAllocaExpander() {
- return new X86WinAllocaExpander();
+FunctionPass *llvm::createX86DynAllocaExpander() {
+ return new X86DynAllocaExpander();
}
-/// Return the allocation amount for a WinAlloca instruction, or -1 if unknown.
-static int64_t getWinAllocaAmount(MachineInstr *MI, MachineRegisterInfo *MRI) {
- assert(MI->getOpcode() == X86::WIN_ALLOCA_32 ||
- MI->getOpcode() == X86::WIN_ALLOCA_64);
+/// Return the allocation amount for a DynAlloca instruction, or -1 if unknown.
+static int64_t getDynAllocaAmount(MachineInstr *MI, MachineRegisterInfo *MRI) {
+ assert(MI->getOpcode() == X86::DYN_ALLOCA_32 ||
+ MI->getOpcode() == X86::DYN_ALLOCA_64);
assert(MI->getOperand(0).isReg());
Register AmountReg = MI->getOperand(0).getReg();
@@ -93,8 +93,8 @@ static int64_t getWinAllocaAmount(MachineInstr *MI, MachineRegisterInfo *MRI) {
return Def->getOperand(1).getImm();
}
-X86WinAllocaExpander::Lowering
-X86WinAllocaExpander::getLowering(int64_t CurrentOffset,
+X86DynAllocaExpander::Lowering
+X86DynAllocaExpander::getLowering(int64_t CurrentOffset,
int64_t AllocaAmount) {
// For a non-constant amount or a large amount, we have to probe.
if (AllocaAmount < 0 || AllocaAmount > StackProbeSize)
@@ -128,11 +128,11 @@ static bool isPushPop(const MachineInstr &MI) {
}
}
-void X86WinAllocaExpander::computeLowerings(MachineFunction &MF,
+void X86DynAllocaExpander::computeLowerings(MachineFunction &MF,
LoweringMap &Lowerings) {
// Do a one-pass reverse post-order walk of the CFG to conservatively estimate
// the offset between the stack pointer and the lowest touched part of the
- // stack, and use that to decide how to lower each WinAlloca instruction.
+ // stack, and use that to decide how to lower each DynAlloca instruction.
// Initialize OutOffset[B], the stack offset at exit from B, to something big.
DenseMap<MachineBasicBlock *, int64_t> OutOffset;
@@ -153,10 +153,10 @@ void X86WinAllocaExpander::computeLowerings(MachineFunction &MF,
if (Offset == -1) Offset = INT32_MAX;
for (MachineInstr &MI : *MBB) {
- if (MI.getOpcode() == X86::WIN_ALLOCA_32 ||
- MI.getOpcode() == X86::WIN_ALLOCA_64) {
- // A WinAlloca moves StackPtr, and potentially touches it.
- int64_t Amount = getWinAllocaAmount(&MI, MRI);
+ if (MI.getOpcode() == X86::DYN_ALLOCA_32 ||
+ MI.getOpcode() == X86::DYN_ALLOCA_64) {
+ // A DynAlloca moves StackPtr, and potentially touches it.
+ int64_t Amount = getDynAllocaAmount(&MI, MRI);
Lowering L = getLowering(Offset, Amount);
Lowerings[&MI] = L;
switch (L) {
@@ -195,12 +195,12 @@ static unsigned getSubOpcode(bool Is64Bit, int64_t Amount) {
return isInt<8>(Amount) ? X86::SUB32ri8 : X86::SUB32ri;
}
-void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
+void X86DynAllocaExpander::lower(MachineInstr *MI, Lowering L) {
const DebugLoc &DL = MI->getDebugLoc();
MachineBasicBlock *MBB = MI->getParent();
MachineBasicBlock::iterator I = *MI;
- int64_t Amount = getWinAllocaAmount(MI, MRI);
+ int64_t Amount = getDynAllocaAmount(MI, MRI);
if (Amount == 0) {
MI->eraseFromParent();
return;
@@ -209,7 +209,7 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
// These two variables differ on x32, which is a 64-bit target with a
// 32-bit alloca.
bool Is64Bit = STI->is64Bit();
- bool Is64BitAlloca = MI->getOpcode() == X86::WIN_ALLOCA_64;
+ bool Is64BitAlloca = MI->getOpcode() == X86::DYN_ALLOCA_64;
assert(SlotSize == 4 || SlotSize == 8);
switch (L) {
@@ -271,8 +271,8 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
AmountDef->eraseFromParent();
}
-bool X86WinAllocaExpander::runOnMachineFunction(MachineFunction &MF) {
- if (!MF.getInfo<X86MachineFunctionInfo>()->hasWinAlloca())
+bool X86DynAllocaExpander::runOnMachineFunction(MachineFunction &MF) {
+ if (!MF.getInfo<X86MachineFunctionInfo>()->hasDynAlloca())
return false;
MRI = &MF.getRegInfo();
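
For context on the three Lowering strategies named above, here is a simplified, self-contained sketch of the kind of decision the pass makes; the 4096-byte probe size and the exact comparisons are assumptions for illustration, not the pass's code:

  #include <cstdint>

  enum class Lowering { TouchAndSub, Sub, Probe };

  // CurrentOffset is the conservatively estimated distance between the stack
  // pointer and the lowest part of the stack that has already been touched.
  Lowering pickLowering(int64_t CurrentOffset, int64_t AllocaAmount,
                        int64_t StackProbeSize = 4096) {
    if (AllocaAmount < 0 || AllocaAmount > StackProbeSize)
      return Lowering::Probe;       // unknown or large allocation: must probe
    if (CurrentOffset + AllocaAmount <= StackProbeSize)
      return Lowering::Sub;         // stays within the already-safe region
    return Lowering::TouchAndSub;   // touch the stack tip first, then subtract
  }

  int main() {
    return pickLowering(/*CurrentOffset=*/0, /*AllocaAmount=*/128) == Lowering::Sub ? 0 : 1;
  }
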
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 4add8d30e010..01dc509df795 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -236,19 +236,10 @@ void X86ExpandPseudo::expandCALL_RVMARKER(MachineBasicBlock &MBB,
MBB.getParent()->moveCallSiteInfo(&MI, Marker);
// Emit call to ObjC runtime.
- unsigned RuntimeCallType = MI.getOperand(0).getImm();
- assert(RuntimeCallType <= 1 && "objc runtime call type must be 0 or 1");
- Module *M = MBB.getParent()->getFunction().getParent();
- auto &Context = M->getContext();
- auto *I8PtrTy = PointerType::get(IntegerType::get(Context, 8), 0);
- FunctionCallee Fn = M->getOrInsertFunction(
- RuntimeCallType == 0 ? "objc_retainAutoreleasedReturnValue"
- : "objc_unsafeClaimAutoreleasedReturnValue",
- FunctionType::get(I8PtrTy, {I8PtrTy}, false));
const uint32_t *RegMask =
TRI->getCallPreservedMask(*MBB.getParent(), CallingConv::C);
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(X86::CALL64pcrel32))
- .addGlobalAddress(cast<GlobalValue>(Fn.getCallee()), 0, 0)
+ .addGlobalAddress(MI.getOperand(0).getGlobal(), 0, 0)
.addRegMask(RegMask)
.addReg(X86::RAX,
RegState::Implicit |
@@ -403,10 +394,10 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MachineInstrBuilder MIB;
if (StackAdj == 0) {
MIB = BuildMI(MBB, MBBI, DL,
- TII->get(STI->is64Bit() ? X86::RETQ : X86::RETL));
+ TII->get(STI->is64Bit() ? X86::RET64 : X86::RET32));
} else if (isUInt<16>(StackAdj)) {
MIB = BuildMI(MBB, MBBI, DL,
- TII->get(STI->is64Bit() ? X86::RETIQ : X86::RETIL))
+ TII->get(STI->is64Bit() ? X86::RETI64 : X86::RETI32))
.addImm(StackAdj);
} else {
assert(!STI->is64Bit() &&
@@ -416,7 +407,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
BuildMI(MBB, MBBI, DL, TII->get(X86::POP32r)).addReg(X86::ECX, RegState::Define);
X86FL->emitSPUpdate(MBB, MBBI, DL, StackAdj, /*InEpilogue=*/true);
BuildMI(MBB, MBBI, DL, TII->get(X86::PUSH32r)).addReg(X86::ECX);
- MIB = BuildMI(MBB, MBBI, DL, TII->get(X86::RETL));
+ MIB = BuildMI(MBB, MBBI, DL, TII->get(X86::RET32));
}
for (unsigned I = 1, E = MBBI->getNumOperands(); I != E; ++I)
MIB.add(MBBI->getOperand(I));
@@ -657,35 +648,24 @@ void X86ExpandPseudo::ExpandVastartSaveXmmRegs(
EntryBlk->end());
TailBlk->transferSuccessorsAndUpdatePHIs(EntryBlk);
- int64_t FrameIndex = VAStartPseudoInstr->getOperand(1).getImm();
- Register BaseReg;
- uint64_t FrameOffset =
- X86FL->getFrameIndexReference(*Func, FrameIndex, BaseReg).getFixed();
- uint64_t VarArgsRegsOffset = VAStartPseudoInstr->getOperand(2).getImm();
+ uint64_t FrameOffset = VAStartPseudoInstr->getOperand(4).getImm();
+ uint64_t VarArgsRegsOffset = VAStartPseudoInstr->getOperand(6).getImm();
// TODO: add support for YMM and ZMM here.
unsigned MOVOpc = STI->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
// In the XMM save block, save all the XMM argument registers.
- for (int64_t OpndIdx = 3, RegIdx = 0;
+ for (int64_t OpndIdx = 7, RegIdx = 0;
OpndIdx < VAStartPseudoInstr->getNumOperands() - 1;
OpndIdx++, RegIdx++) {
-
- int64_t Offset = FrameOffset + VarArgsRegsOffset + RegIdx * 16;
-
- MachineMemOperand *MMO = Func->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(*Func, FrameIndex, Offset),
- MachineMemOperand::MOStore,
- /*Size=*/16, Align(16));
-
- BuildMI(GuardedRegsBlk, DL, TII->get(MOVOpc))
- .addReg(BaseReg)
- .addImm(/*Scale=*/1)
- .addReg(/*IndexReg=*/0)
- .addImm(/*Disp=*/Offset)
- .addReg(/*Segment=*/0)
- .addReg(VAStartPseudoInstr->getOperand(OpndIdx).getReg())
- .addMemOperand(MMO);
+ auto NewMI = BuildMI(GuardedRegsBlk, DL, TII->get(MOVOpc));
+ for (int i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ NewMI.addImm(FrameOffset + VarArgsRegsOffset + RegIdx * 16);
+ else
+ NewMI.add(VAStartPseudoInstr->getOperand(i + 1));
+ }
+ NewMI.addReg(VAStartPseudoInstr->getOperand(OpndIdx).getReg());
assert(Register::isPhysicalRegister(
VAStartPseudoInstr->getOperand(OpndIdx).getReg()));
}
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index bb95ed3ccdc5..1ac998b7ff7e 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -55,6 +55,7 @@ class X86FastISel final : public FastISel {
/// When SSE2 is available, use it for f64 operations.
bool X86ScalarSSEf64;
bool X86ScalarSSEf32;
+ bool X86ScalarSSEf16;
public:
explicit X86FastISel(FunctionLoweringInfo &funcInfo,
@@ -63,6 +64,7 @@ public:
Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>();
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
+ X86ScalarSSEf16 = Subtarget->hasFP16();
}
bool fastSelectInstruction(const Instruction *I) override;
@@ -157,7 +159,8 @@ private:
/// computed in an SSE register, not on the X87 floating point stack.
bool isScalarFPTypeInSSEReg(EVT VT) const {
return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
- (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
+ (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1
+ (VT == MVT::f16 && X86ScalarSSEf16); // f16 is when AVX512FP16
}
bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
@@ -786,7 +789,8 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
RC = &X86::GR32RegClass;
}
- if (Subtarget->isPICStyleRIPRel() || GVFlags == X86II::MO_GOTPCREL)
+ if (Subtarget->isPICStyleRIPRel() || GVFlags == X86II::MO_GOTPCREL ||
+ GVFlags == X86II::MO_GOTPCREL_NORELAX)
StubAM.Base.Reg = X86::RIP;
LoadReg = createResultReg(RC);
@@ -1301,11 +1305,11 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
MachineInstrBuilder MIB;
if (X86MFInfo->getBytesToPopOnReturn()) {
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Subtarget->is64Bit() ? X86::RETIQ : X86::RETIL))
+ TII.get(Subtarget->is64Bit() ? X86::RETI64 : X86::RETI32))
.addImm(X86MFInfo->getBytesToPopOnReturn());
} else {
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
+ TII.get(Subtarget->is64Bit() ? X86::RET64 : X86::RET32));
}
for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
MIB.addReg(RetRegs[i], RegState::Implicit);
@@ -2283,9 +2287,10 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
unsigned Opc;
switch (RetVT.SimpleTy) {
default: return false;
- case MVT::i8: Opc = X86::CMOV_GR8; break;
- case MVT::i16: Opc = X86::CMOV_GR16; break;
- case MVT::i32: Opc = X86::CMOV_GR32; break;
+ case MVT::i8: Opc = X86::CMOV_GR8; break;
+ case MVT::i16: Opc = X86::CMOV_GR16; break;
+ case MVT::f16: Opc = X86::CMOV_FR16X; break;
+ case MVT::i32: Opc = X86::CMOV_GR32; break;
case MVT::f32: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X
: X86::CMOV_FR32; break;
case MVT::f64: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X
@@ -2741,7 +2746,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
return false;
- return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 1);
+ return lowerCallTo(II, "memcpy", II->arg_size() - 1);
}
case Intrinsic::memset: {
const MemSetInst *MSI = cast<MemSetInst>(II);
@@ -2756,7 +2761,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (MSI->getDestAddressSpace() > 255)
return false;
- return lowerCallTo(II, "memset", II->getNumArgOperands() - 1);
+ return lowerCallTo(II, "memset", II->arg_size() - 1);
}
case Intrinsic::stackprotector: {
// Emit code to store the stack guard onto the stack.
@@ -2780,8 +2785,6 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (!X86SelectAddress(DI->getAddress(), AM))
return false;
const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
- // FIXME may need to add RegState::Debug to any registers produced,
- // although ESP/EBP should be the only ones at the moment.
assert(DI->getVariable()->isValidLocationForIntrinsic(DbgLoc) &&
"Expected inlined-at fields to agree");
addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM)
@@ -3484,6 +3487,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// NonLazyBind calls or dllimport calls.
bool NeedLoad = OpFlags == X86II::MO_DLLIMPORT ||
OpFlags == X86II::MO_GOTPCREL ||
+ OpFlags == X86II::MO_GOTPCREL_NORELAX ||
OpFlags == X86II::MO_COFFSTUB;
unsigned CallOpc = NeedLoad
? (Is64Bit ? X86::CALL64m : X86::CALL32m)
@@ -3838,11 +3842,11 @@ unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
if (const auto *CI = dyn_cast<ConstantInt>(C))
return X86MaterializeInt(CI, VT);
- else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+ if (const auto *CFP = dyn_cast<ConstantFP>(C))
return X86MaterializeFP(CFP, VT);
- else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ if (const auto *GV = dyn_cast<GlobalValue>(C))
return X86MaterializeGV(GV, VT);
- else if (isa<UndefValue>(C)) {
+ if (isa<UndefValue>(C)) {
unsigned Opc = 0;
switch (VT.SimpleTy) {
default:
diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp
index 7031bd40215d..87c04a07cd13 100644
--- a/llvm/lib/Target/X86/X86FastTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp
@@ -44,6 +44,7 @@ class X86FastTileConfig : public MachineFunctionPass {
const TargetRegisterInfo *TRI = nullptr;
const TargetInstrInfo *TII = nullptr;
MachineRegisterInfo *MRI = nullptr;
+ X86MachineFunctionInfo *X86FI = nullptr;
MachineInstr *getTileConfigPoint();
void tileConfig();
@@ -289,6 +290,8 @@ bool X86FastTileConfig::fastTileConfig() {
if (!CFGs.empty())
Changed = true;
}
+ if (Changed)
+ X86FI->setHasVirtualTileReg(true);
return Changed;
}
@@ -298,6 +301,7 @@ bool X86FastTileConfig::runOnMachineFunction(MachineFunction &MFunc) {
ST = &MFunc.getSubtarget<X86Subtarget>();
TRI = ST->getRegisterInfo();
TII = MFunc.getSubtarget().getInstrInfo();
+ X86FI = MFunc.getInfo<X86MachineFunctionInfo>();
return fastTileConfig();
}
diff --git a/llvm/lib/Target/X86/X86FixupLEAs.cpp b/llvm/lib/Target/X86/X86FixupLEAs.cpp
index 05cab776e0b7..9a63cffe0a09 100644
--- a/llvm/lib/Target/X86/X86FixupLEAs.cpp
+++ b/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -212,8 +212,7 @@ FixupLEAPass::postRAConvertToLEA(MachineBasicBlock &MBB,
// These instructions are all fine to convert.
break;
}
- MachineFunction::iterator MFI = MBB.getIterator();
- return TII->convertToThreeAddress(MFI, MI, nullptr);
+ return TII->convertToThreeAddress(MI, nullptr, nullptr);
}
FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); }
diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
index 2d9886e3f238..f24dbcfe972d 100644
--- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -964,7 +964,11 @@ void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
if (!SetCCI.mayStore()) {
assert(SetCCI.getOperand(0).isReg() &&
"Cannot have a non-register defined operand to SETcc!");
- MRI->replaceRegWith(SetCCI.getOperand(0).getReg(), CondReg);
+ Register OldReg = SetCCI.getOperand(0).getReg();
+ // Drop Kill flags on the old register before replacing. CondReg may have
+ // a longer live range.
+ MRI->clearKillFlags(OldReg);
+ MRI->replaceRegWith(OldReg, CondReg);
SetCCI.eraseFromParent();
return;
}
diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp
index e0f30f090171..60e1b37ed61c 100644
--- a/llvm/lib/Target/X86/X86FloatingPoint.cpp
+++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -832,6 +832,24 @@ static const TableEntry PopTable[] = {
{ X86::UCOM_Fr , X86::UCOM_FPr },
};
+static bool doesInstructionSetFPSW(MachineInstr &MI) {
+ if (const MachineOperand *MO = MI.findRegisterDefOperand(X86::FPSW))
+ if (!MO->isDead())
+ return true;
+ return false;
+}
+
+static MachineBasicBlock::iterator
+getNextFPInstruction(MachineBasicBlock::iterator I) {
+ MachineBasicBlock &MBB = *I->getParent();
+ while (++I != MBB.end()) {
+ MachineInstr &MI = *I;
+ if (X86::isX87Instruction(MI))
+ return I;
+ }
+ return MBB.end();
+}
+
/// popStackAfter - Pop the current value off of the top of the FP stack after
/// the specified instruction. This attempts to be sneaky and combine the pop
/// into the instruction itself if possible. The iterator is left pointing to
@@ -853,6 +871,14 @@ void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
I->RemoveOperand(0);
MI.dropDebugNumber();
} else { // Insert an explicit pop
+ // If this instruction sets FPSW, which is read by the following instruction,
+ // insert the pop after that reader.
+ if (doesInstructionSetFPSW(MI)) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineBasicBlock::iterator Next = getNextFPInstruction(I);
+ if (Next != MBB.end() && Next->readsRegister(X86::FPSW))
+ I = Next;
+ }
I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0);
}
}
@@ -1038,9 +1064,10 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) {
for (unsigned I = 0; I < N; ++I)
pushReg(N - I - 1);
- // Drop all variable values defined by this call -- we can't track them
- // once they've been stackified.
- I->dropDebugNumber();
+ // If this call has been modified, drop all variable values defined by it.
+ // We can't track them once they've been stackified.
+ if (STReturns)
+ I->dropDebugNumber();
}
/// If RET has an FP register use operand, pass the first one in ST(0) and
@@ -1732,16 +1759,14 @@ void FPS::setKillFlags(MachineBasicBlock &MBB) const {
LPR.addLiveOuts(MBB);
- for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
- I != E; ++I) {
- if (I->isDebugInstr())
+ for (MachineInstr &MI : llvm::reverse(MBB)) {
+ if (MI.isDebugInstr())
continue;
std::bitset<8> Defs;
SmallVector<MachineOperand *, 2> Uses;
- MachineInstr &MI = *I;
- for (auto &MO : I->operands()) {
+ for (auto &MO : MI.operands()) {
if (!MO.isReg())
continue;
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 4cde7971e597..bd780273509f 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -671,7 +671,9 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop(
MF.insert(MBBIter, testMBB);
MF.insert(MBBIter, tailMBB);
- Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : X86::R11D;
+ Register FinalStackProbed = Uses64BitFramePtr ? X86::R11
+ : Is64Bit ? X86::R11D
+ : X86::EAX;
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)
.addReg(StackPtr)
.setMIFlag(MachineInstr::FrameSetup);
@@ -1092,7 +1094,9 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
MF.insert(MBBIter, bodyMBB);
MF.insert(MBBIter, footMBB);
const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
- Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : X86::R11D;
+ Register FinalStackProbed = Uses64BitFramePtr ? X86::R11
+ : Is64Bit ? X86::R11D
+ : X86::EAX;
// Setup entry block
{
@@ -1349,25 +1353,44 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// to determine the end of the prologue.
DebugLoc DL;
- // Add RETADDR move area to callee saved frame size.
- int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
- if (TailCallReturnAddrDelta && IsWin64Prologue)
+ // Space reserved for stack-based arguments when making an (ABI-guaranteed)
+ // tail call.
+ unsigned TailCallArgReserveSize = -X86FI->getTCReturnAddrDelta();
+ if (TailCallArgReserveSize && IsWin64Prologue)
report_fatal_error("Can't handle guaranteed tail call under win64 yet");
- if (TailCallReturnAddrDelta < 0)
- X86FI->setCalleeSavedFrameSize(
- X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
-
const bool EmitStackProbeCall =
STI.getTargetLowering()->hasStackProbeSymbol(MF);
unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF);
if (HasFP && X86FI->hasSwiftAsyncContext()) {
- BuildMI(MBB, MBBI, DL, TII.get(X86::BTS64ri8),
- MachineFramePtr)
- .addUse(MachineFramePtr)
- .addImm(60)
- .setMIFlag(MachineInstr::FrameSetup);
+ switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
+ case SwiftAsyncFramePointerMode::DeploymentBased:
+ if (STI.swiftAsyncContextIsDynamicallySet()) {
+ // The special symbol below is absolute and has a *value* suitable to be
+ // combined with the frame pointer directly.
+ BuildMI(MBB, MBBI, DL, TII.get(X86::OR64rm), MachineFramePtr)
+ .addUse(MachineFramePtr)
+ .addUse(X86::RIP)
+ .addImm(1)
+ .addUse(X86::NoRegister)
+ .addExternalSymbol("swift_async_extendedFramePointerFlags",
+ X86II::MO_GOTPCREL)
+ .addUse(X86::NoRegister);
+ break;
+ }
+ LLVM_FALLTHROUGH;
+
+ case SwiftAsyncFramePointerMode::Always:
+ BuildMI(MBB, MBBI, DL, TII.get(X86::BTS64ri8), MachineFramePtr)
+ .addUse(MachineFramePtr)
+ .addImm(60)
+ .setMIFlag(MachineInstr::FrameSetup);
+ break;
+
+ case SwiftAsyncFramePointerMode::Never:
+ break;
+ }
}
// Re-align the stack on 64-bit if the x86-interrupt calling convention is
@@ -1391,7 +1414,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
!EmitStackProbeCall && // No stack probes.
!MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
!MF.shouldSplitStack()) { // Regular stack
- uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
+ uint64_t MinSize =
+ X86FI->getCalleeSavedFrameSize() - X86FI->getTCReturnAddrDelta();
if (HasFP) MinSize += SlotSize;
X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0);
StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
@@ -1401,8 +1425,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// Insert stack pointer adjustment for later moving of return addr. Only
// applies to tail call optimized functions where the callee argument stack
// size is bigger than the callers.
- if (TailCallReturnAddrDelta < 0) {
- BuildStackAdjustment(MBB, MBBI, DL, TailCallReturnAddrDelta,
+ if (TailCallArgReserveSize != 0) {
+ BuildStackAdjustment(MBB, MBBI, DL, -(int)TailCallArgReserveSize,
/*InEpilogue=*/false)
.setMIFlag(MachineInstr::FrameSetup);
}
@@ -1451,7 +1475,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
if (X86FI->getRestoreBasePointer())
FrameSize += SlotSize;
- NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+ NumBytes = FrameSize -
+ (X86FI->getCalleeSavedFrameSize() + TailCallArgReserveSize);
// Callee-saved registers are pushed on stack before the stack is realigned.
if (TRI->hasStackRealignment(MF) && !IsWin64Prologue)
@@ -1554,7 +1579,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
}
} else {
assert(!IsFunclet && "funclets without FPs not yet implemented");
- NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
+ NumBytes = StackSize -
+ (X86FI->getCalleeSavedFrameSize() + TailCallArgReserveSize);
}
// Update the offset adjustment, which is mainly used by codeview to translate
@@ -2011,6 +2037,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
uint64_t StackSize = MFI.getStackSize();
uint64_t MaxAlign = calculateMaxStackAlign(MF);
unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+ unsigned TailCallArgReserveSize = -X86FI->getTCReturnAddrDelta();
bool HasFP = hasFP(MF);
uint64_t NumBytes = 0;
@@ -2024,14 +2051,14 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
} else if (HasFP) {
// Calculate required stack adjustment.
uint64_t FrameSize = StackSize - SlotSize;
- NumBytes = FrameSize - CSSize;
+ NumBytes = FrameSize - CSSize - TailCallArgReserveSize;
// Callee-saved registers were pushed on stack before the stack was
// realigned.
if (TRI->hasStackRealignment(MF) && !IsWin64Prologue)
NumBytes = alignTo(FrameSize, MaxAlign);
} else {
- NumBytes = StackSize - CSSize;
+ NumBytes = StackSize - CSSize - TailCallArgReserveSize;
}
uint64_t SEHStackAllocAmt = NumBytes;
@@ -2098,7 +2125,6 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (MBBI != MBB.end())
DL = MBBI->getDebugLoc();
-
// If there is an ADD32ri or SUB32ri of ESP immediately before this
// instruction, merge the two instructions.
if (NumBytes || MFI.hasVarSizedObjects())
@@ -2140,10 +2166,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
} else if (NumBytes) {
// Adjust stack pointer back: ESP += numbytes.
emitSPUpdate(MBB, MBBI, DL, NumBytes, /*InEpilogue=*/true);
- if (!hasFP(MF) && NeedsDwarfCFI) {
+ if (!HasFP && NeedsDwarfCFI) {
// Define the current CFA rule to use the provided offset.
BuildCFI(MBB, MBBI, DL,
- MCCFIInstruction::cfiDefCfaOffset(nullptr, CSSize + SlotSize));
+ MCCFIInstruction::cfiDefCfaOffset(
+ nullptr, CSSize + TailCallArgReserveSize + SlotSize));
}
--MBBI;
}
@@ -2157,7 +2184,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (NeedsWin64CFI && MF.hasWinCFI())
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
- if (!hasFP(MF) && NeedsDwarfCFI) {
+ if (!HasFP && NeedsDwarfCFI) {
MBBI = FirstCSPop;
int64_t Offset = -CSSize - SlotSize;
// Mark callee-saved pop instruction.
@@ -2177,9 +2204,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// Emit DWARF info specifying the restores of the callee-saved registers.
// For epilogue with return inside or being other block without successor,
// no need to generate .cfi_restore for callee-saved registers.
- if (NeedsDwarfCFI && !MBB.succ_empty() && !MBB.isReturnBlock()) {
+ if (NeedsDwarfCFI && !MBB.succ_empty())
emitCalleeSavedFrameMoves(MBB, AfterPop, DL, false);
- }
if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
// Add the return addr area delta back since we are not tail calling.
@@ -2193,13 +2219,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
// Emit tilerelease for AMX kernel.
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
- for (unsigned I = 0; I < RC->getNumRegs(); I++)
- if (!MRI.reg_nodbg_empty(X86::TMM0 + I)) {
- BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
- break;
- }
+ if (X86FI->hasVirtualTileReg())
+ BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
}
StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
@@ -2226,7 +2247,6 @@ StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
unsigned CSSize = X86FI->getCalleeSavedFrameSize();
uint64_t StackSize = MFI.getStackSize();
- bool HasFP = hasFP(MF);
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
int64_t FPDelta = 0;
@@ -2262,39 +2282,27 @@ StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
"FPDelta isn't aligned per the Win64 ABI!");
}
-
- if (TRI->hasBasePointer(MF)) {
- assert(HasFP && "VLAs and dynamic stack realign, but no FP?!");
- if (FI < 0) {
- // Skip the saved EBP.
- return StackOffset::getFixed(Offset + SlotSize + FPDelta);
- } else {
- assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize)));
- return StackOffset::getFixed(Offset + StackSize);
- }
- } else if (TRI->hasStackRealignment(MF)) {
- if (FI < 0) {
- // Skip the saved EBP.
- return StackOffset::getFixed(Offset + SlotSize + FPDelta);
- } else {
- assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize)));
- return StackOffset::getFixed(Offset + StackSize);
- }
- // FIXME: Support tail calls
- } else {
- if (!HasFP)
- return StackOffset::getFixed(Offset + StackSize);
-
- // Skip the saved EBP.
+ if (FrameReg == TRI->getFramePtr()) {
+ // Skip saved EBP/RBP
Offset += SlotSize;
+ // Account for restricted Windows prologue.
+ Offset += FPDelta;
+
// Skip the RETADDR move area
int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
if (TailCallReturnAddrDelta < 0)
Offset -= TailCallReturnAddrDelta;
+
+ return StackOffset::getFixed(Offset);
}
- return StackOffset::getFixed(Offset + FPDelta);
+ // FrameReg is either the stack pointer or a base pointer. But the base is
+ // located at the end of the statically known StackSize so the distinction
+ // doesn't really matter.
+ if (TRI->hasStackRealignment(MF) || TRI->hasBasePointer(MF))
+ assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize)));
+ return StackOffset::getFixed(Offset + StackSize);
}
int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, int FI,
@@ -3091,8 +3099,7 @@ void X86FrameLowering::adjustForHiPEPrologue(
// having a ".", such as a simple <Module>.<Function>.<Arity>, or an
// "_", such as the BIF "suspend_0") as they are executed on another
// stack.
- if (F->getName().find("erlang.") != StringRef::npos ||
- F->getName().find("bif_") != StringRef::npos ||
+ if (F->getName().contains("erlang.") || F->getName().contains("bif_") ||
F->getName().find_first_of("._") == StringRef::npos)
continue;
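
To make the reworked tail-call bookkeeping above concrete, here is a small worked example with assumed numbers (purely illustrative); it only re-derives the arithmetic visible in the hunks above:

  #include <cstdint>
  #include <cstdio>

  int main() {
    // Assumed inputs for a 32-bit frame with a guaranteed tail call.
    uint64_t StackSize = 96;            // MFI.getStackSize()
    uint64_t SlotSize = 4;
    uint64_t CSSize = 12;               // callee-saved frame size
    int64_t  TCReturnAddrDelta = -16;   // 16 bytes reserved for tail-call args
    uint64_t TailCallArgReserveSize = -TCReturnAddrDelta;

    // Prologue (frame pointer case): the reserve is no longer folded into
    // CSSize; it is subtracted from the frame size explicitly.
    uint64_t FrameSize = StackSize - SlotSize;
    uint64_t NumBytes = FrameSize - (CSSize + TailCallArgReserveSize);
    std::printf("prologue local allocation: %llu bytes\n",
                (unsigned long long)NumBytes);

    // Epilogue: the CFA offset emitted after restoring the stack pointer also
    // accounts for the reserve.
    uint64_t CfaOffset = CSSize + TailCallArgReserveSize + SlotSize;
    std::printf("cfi_def_cfa_offset %llu\n", (unsigned long long)CfaOffset);
    return 0;
  }
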
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index e9c7ba44b524..7ed05fd0331d 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -216,6 +216,8 @@ namespace {
bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
unsigned Depth);
+ bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+ unsigned Depth);
bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
@@ -336,10 +338,9 @@ namespace {
return false;
// Walk all the users of the immediate.
- for (SDNode::use_iterator UI = N->use_begin(),
- UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
-
- SDNode *User = *UI;
+ for (const SDNode *User : N->uses()) {
+ if (UseCount >= 2)
+ break;
// This user is already selected. Count it as a legitimate use and
// move on.
@@ -433,6 +434,18 @@ namespace {
return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
}
+ SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
+ const SDLoc &DL) {
+ assert(VecWidth == 128 && "Unexpected vector width");
+ uint64_t Index = N->getConstantOperandVal(2);
+ MVT VecVT = N->getSimpleValueType(0);
+ uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
+ assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
+ // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
+ // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
+ return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
+ }
+
// Helper to detect unneeded and instructions on shift amounts. Called
// from PatFrags in tablegen.
bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
@@ -504,8 +517,9 @@ namespace {
bool tryShiftAmountMod(SDNode *N);
bool tryShrinkShlLogicImm(SDNode *N);
bool tryVPTERNLOG(SDNode *N);
- bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentBC,
- SDValue A, SDValue B, SDValue C, uint8_t Imm);
+ bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
+ SDNode *ParentC, SDValue A, SDValue B, SDValue C,
+ uint8_t Imm);
bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
bool tryMatchBitSelect(SDNode *N);
@@ -877,19 +891,34 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
continue;
}
- /// Convert vector increment or decrement to sub/add with an all-ones
- /// constant:
- /// add X, <1, 1...> --> sub X, <-1, -1...>
- /// sub X, <1, 1...> --> add X, <-1, -1...>
- /// The all-ones vector constant can be materialized using a pcmpeq
- /// instruction that is commonly recognized as an idiom (has no register
- /// dependency), so that's better/smaller than loading a splat 1 constant.
+ // Convert vector increment or decrement to sub/add with an all-ones
+ // constant:
+ // add X, <1, 1...> --> sub X, <-1, -1...>
+ // sub X, <1, 1...> --> add X, <-1, -1...>
+ // The all-ones vector constant can be materialized using a pcmpeq
+ // instruction that is commonly recognized as an idiom (has no register
+ // dependency), so that's better/smaller than loading a splat 1 constant.
+ //
+ // But don't do this if it would inhibit a potentially profitable load
+ // folding opportunity for the other operand. That only occurs with the
+ // intersection of:
+ // (1) The other operand (op0) is load foldable.
+ // (2) The op is an add (otherwise, we are *creating* an add and can still
+ // load fold the other op).
+ // (3) The target has AVX (otherwise, we have a destructive add and can't
+ // load fold the other op without killing the constant op).
+ // (4) The constant 1 vector has multiple uses (so it is profitable to load
+ // into a register anyway).
+ auto mayPreventLoadFold = [&]() {
+ return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
+ N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
+ !N->getOperand(1).hasOneUse();
+ };
if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
- N->getSimpleValueType(0).isVector()) {
-
+ N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
APInt SplatVal;
if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
- SplatVal.isOneValue()) {
+ SplatVal.isOne()) {
SDLoc DL(N);
MVT VT = N->getSimpleValueType(0);
@@ -1121,7 +1150,10 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
if (VT.isVector() || VT == MVT::f128)
break;
- MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
+ MVT VecVT = VT == MVT::f64 ? MVT::v2f64
+ : VT == MVT::f32 ? MVT::v4f32
+ : MVT::v8f16;
+
SDLoc dl(N);
SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
N->getOperand(0));
@@ -2464,10 +2496,18 @@ bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
return false;
}
-/// Helper for selectVectorAddr. Handles things that can be folded into a
-/// gather scatter address. The index register and scale should have already
-/// been handled.
-bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
+bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
+ X86ISelAddressMode &AM,
+ unsigned Depth) {
+ SDLoc dl(N);
+ LLVM_DEBUG({
+ dbgs() << "MatchVectorAddress: ";
+ AM.dump(CurDAG);
+ });
+ // Limit recursion.
+ if (Depth > 5)
+ return matchAddressBase(N, AM);
+
// TODO: Support other operations.
switch (N.getOpcode()) {
case ISD::Constant: {
@@ -2480,11 +2520,41 @@ bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
if (!matchWrapper(N, AM))
return false;
break;
+ case ISD::ADD: {
+ // Add an artificial use to this node so that we can keep track of
+ // it if it gets CSE'd with a different node.
+ HandleSDNode Handle(N);
+
+ X86ISelAddressMode Backup = AM;
+ if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
+ !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
+ Depth + 1))
+ return false;
+ AM = Backup;
+
+ // Try again after commuting the operands.
+ if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
+ Depth + 1) &&
+ !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
+ Depth + 1))
+ return false;
+ AM = Backup;
+
+ N = Handle.getValue();
+ break;
+ }
}
return matchAddressBase(N, AM);
}
+/// Helper for selectVectorAddr. Handles things that can be folded into a
+/// gather/scatter address. The index register and scale should have already
+/// been handled.
+bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
+ return matchVectorAddressRecursively(N, AM, 0);
+}
+
bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
SDValue IndexOp, SDValue ScaleOp,
SDValue &Base, SDValue &Scale,
@@ -3387,16 +3457,24 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
return false;
SDValue NBits;
+ bool NegateNBits;
// If we have BMI2's BZHI, we are ok with multi-use patterns.
// Else, if we only have BMI1's BEXTR, we require one-use.
- const bool CanHaveExtraUses = Subtarget->hasBMI2();
- auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
- return CanHaveExtraUses ||
+ const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
+ auto checkUses = [AllowExtraUsesByDefault](SDValue Op, unsigned NUses,
+ Optional<bool> AllowExtraUses) {
+ return AllowExtraUses.getValueOr(AllowExtraUsesByDefault) ||
Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
};
- auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
- auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };
+ auto checkOneUse = [checkUses](SDValue Op,
+ Optional<bool> AllowExtraUses = None) {
+ return checkUses(Op, 1, AllowExtraUses);
+ };
+ auto checkTwoUse = [checkUses](SDValue Op,
+ Optional<bool> AllowExtraUses = None) {
+ return checkUses(Op, 2, AllowExtraUses);
+ };
auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
@@ -3409,8 +3487,8 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
};
// a) x & ((1 << nbits) + (-1))
- auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation,
- &NBits](SDValue Mask) -> bool {
+ auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
+ &NegateNBits](SDValue Mask) -> bool {
// Match `add`. Must only have one use!
if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
return false;
@@ -3424,6 +3502,7 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
if (!isOneConstant(M0->getOperand(0)))
return false;
NBits = M0->getOperand(1);
+ NegateNBits = false;
return true;
};
@@ -3436,7 +3515,7 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
// b) x & ~(-1 << nbits)
auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
- &NBits](SDValue Mask) -> bool {
+ &NBits, &NegateNBits](SDValue Mask) -> bool {
// Match `~()`. Must only have one use!
if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
return false;
@@ -3451,32 +3530,35 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
if (!isAllOnes(M0->getOperand(0)))
return false;
NBits = M0->getOperand(1);
+ NegateNBits = false;
return true;
};
- // Match potentially-truncated (bitwidth - y)
- auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt,
- unsigned Bitwidth) {
- // Skip over a truncate of the shift amount.
- if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
- ShiftAmt = ShiftAmt.getOperand(0);
- // The trunc should have been the only user of the real shift amount.
- if (!checkOneUse(ShiftAmt))
- return false;
- }
- // Match the shift amount as: (bitwidth - y). It should go away, too.
- if (ShiftAmt.getOpcode() != ISD::SUB)
- return false;
- auto *V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
+ // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
+ // or leave the shift amount as-is, but then we'll have to negate it.
+ auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
+ unsigned Bitwidth) {
+ NBits = ShiftAmt;
+ NegateNBits = true;
+ // Skip over a truncate of the shift amount, if any.
+ if (NBits.getOpcode() == ISD::TRUNCATE)
+ NBits = NBits.getOperand(0);
+ // Try to match the shift amount as (bitwidth - y). It should go away, too.
+ // If it doesn't match, that's fine, we'll just negate it ourselves.
+ if (NBits.getOpcode() != ISD::SUB)
+ return;
+ auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
if (!V0 || V0->getZExtValue() != Bitwidth)
- return false;
- NBits = ShiftAmt.getOperand(1);
- return true;
+ return;
+ NBits = NBits.getOperand(1);
+ NegateNBits = false;
};
+ // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
+ // or
// c) x & (-1 >> (32 - y))
- auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation,
- matchShiftAmt](SDValue Mask) -> bool {
+ auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
+ canonicalizeShiftAmt](SDValue Mask) -> bool {
// The mask itself may be truncated.
Mask = peekThroughOneUseTruncation(Mask);
unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
@@ -3490,27 +3572,39 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
// The shift amount should not be used externally.
if (!checkOneUse(M1))
return false;
- return matchShiftAmt(M1, Bitwidth);
+ canonicalizeShiftAmt(M1, Bitwidth);
+ // Pattern c. is non-canonical, and is expanded into pattern d. iff there
+ // is no extra use of the mask. Clearly, there was one since we are here.
+ // But at the same time, if we need to negate the shift amount,
+ // then we don't want the mask to stick around, else it's unprofitable.
+ return !NegateNBits;
};
SDValue X;
+ // d) x << z >> z but then we'll have to subtract z from bitwidth
+ // or
// d) x << (32 - y) >> (32 - y)
- auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt,
+ auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
+ AllowExtraUsesByDefault, &NegateNBits,
&X](SDNode *Node) -> bool {
if (Node->getOpcode() != ISD::SRL)
return false;
SDValue N0 = Node->getOperand(0);
- if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
+ if (N0->getOpcode() != ISD::SHL)
return false;
unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
SDValue N1 = Node->getOperand(1);
SDValue N01 = N0->getOperand(1);
// Both of the shifts must be by the exact same value.
- // There should not be any uses of the shift amount outside of the pattern.
- if (N1 != N01 || !checkTwoUse(N1))
+ if (N1 != N01)
return false;
- if (!matchShiftAmt(N1, Bitwidth))
+ canonicalizeShiftAmt(N1, Bitwidth);
+ // There should not be any external uses of the inner shift / shift amount.
+ // Note that while we are generally okay with external uses given BMI2,
+ // if we need to negate the shift amount, we are not okay with extra uses.
+ const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
+ if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
return false;
X = N0->getOperand(0);
return true;
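
The four shapes matched above (patterns a) through d)) are all ways of keeping the low n bits of a value; a quick stand-alone check of that equivalence, illustrative only and separate from the patch itself:

  #include <cassert>
  #include <cstdint>

  static uint32_t lowBitsA(uint32_t X, unsigned N) { return X & ((1u << N) - 1u); }        // a) x & ((1 << n) + (-1))
  static uint32_t lowBitsB(uint32_t X, unsigned N) { return X & ~(~0u << N); }             // b) x & ~(-1 << n)
  static uint32_t lowBitsC(uint32_t X, unsigned N) { return X & (~0u >> (32 - N)); }       // c) x & (-1 >> (32 - n))
  static uint32_t lowBitsD(uint32_t X, unsigned N) { return (X << (32 - N)) >> (32 - N); } // d) x << (32 - n) >> (32 - n)

  int main() {
    const uint32_t X = 0xDEADBEEFu;
    for (unsigned N = 1; N < 32; ++N) { // N == 0 and N == 32 would be UB for some forms
      uint32_t R = lowBitsA(X, N);
      assert(R == lowBitsB(X, N) && R == lowBitsC(X, N) && R == lowBitsD(X, N));
    }
    return 0;
  }
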
@@ -3535,6 +3629,11 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
} else if (!matchPatternD(Node))
return false;
+ // If we need to negate the shift amount, require BMI2 BZHI support.
+ // It's just too unprofitable for BMI1 BEXTR.
+ if (NegateNBits && !Subtarget->hasBMI2())
+ return false;
+
SDLoc DL(Node);
// Truncate the shift amount.
@@ -3549,11 +3648,21 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
- NBits = SDValue(
- CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef,
- NBits, SRIdxVal), 0);
+ NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
+ MVT::i32, ImplDef, NBits, SRIdxVal),
+ 0);
insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
+ // We might have matched the number of high bits to be cleared,
+ // but we want the number of low bits to be kept, so negate it.
+ if (NegateNBits) {
+ SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
+
+ NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
+ }
+
if (Subtarget->hasBMI2()) {
// Great, just emit the BZHI.
if (NVT != MVT::i32) {
@@ -4040,11 +4149,11 @@ bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
}
bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
- SDNode *ParentBC, SDValue A, SDValue B,
- SDValue C, uint8_t Imm) {
- assert(A.isOperandOf(ParentA));
- assert(B.isOperandOf(ParentBC));
- assert(C.isOperandOf(ParentBC));
+ SDNode *ParentB, SDNode *ParentC,
+ SDValue A, SDValue B, SDValue C,
+ uint8_t Imm) {
+ assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
+ C.isOperandOf(ParentC) && "Incorrect parent node");
auto tryFoldLoadOrBCast =
[this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
@@ -4072,7 +4181,7 @@ bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
bool FoldedLoad = false;
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
- if (tryFoldLoadOrBCast(Root, ParentBC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
FoldedLoad = true;
} else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
Tmp4)) {
@@ -4085,7 +4194,7 @@ bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
if (OldImm & 0x10) Imm |= 0x02;
if (OldImm & 0x08) Imm |= 0x40;
if (OldImm & 0x40) Imm |= 0x08;
- } else if (tryFoldLoadOrBCast(Root, ParentBC, B, Tmp0, Tmp1, Tmp2, Tmp3,
+ } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
Tmp4)) {
FoldedLoad = true;
std::swap(B, C);
@@ -4163,7 +4272,6 @@ bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
}
// Try to match two logic ops to a VPTERNLOG.
-// FIXME: Handle inverted inputs?
// FIXME: Handle more complex patterns that use an operand more than once?
bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
MVT NVT = N->getSimpleValueType(0);
@@ -4206,12 +4314,31 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
SDValue B = FoldableOp.getOperand(0);
SDValue C = FoldableOp.getOperand(1);
+ SDNode *ParentA = N;
+ SDNode *ParentB = FoldableOp.getNode();
+ SDNode *ParentC = FoldableOp.getNode();
// We can build the appropriate control immediate by performing the logic
// operation we're matching using these constants for A, B, and C.
- const uint8_t TernlogMagicA = 0xf0;
- const uint8_t TernlogMagicB = 0xcc;
- const uint8_t TernlogMagicC = 0xaa;
+ uint8_t TernlogMagicA = 0xf0;
+ uint8_t TernlogMagicB = 0xcc;
+ uint8_t TernlogMagicC = 0xaa;
+
+ // Some of the inputs may be inverted, peek through them and invert the
+ // magic values accordingly.
+ // TODO: There may be a bitcast before the xor that we should peek through.
+ auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
+ if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
+ ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
+ Magic = ~Magic;
+ Parent = Op.getNode();
+ Op = Op.getOperand(0);
+ }
+ };
+
+ PeekThroughNot(A, ParentA, TernlogMagicA);
+ PeekThroughNot(B, ParentB, TernlogMagicB);
+ PeekThroughNot(C, ParentC, TernlogMagicC);
uint8_t Imm;
switch (FoldableOp.getOpcode()) {
@@ -4235,7 +4362,7 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
case ISD::XOR: Imm ^= TernlogMagicA; break;
}
- return matchVPTERNLOG(N, N, FoldableOp.getNode(), A, B, C, Imm);
+ return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
}
/// If the high bits of an 'and' operand are known zero, try setting the
@@ -4295,7 +4422,7 @@ bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
// Check if the mask is -1. In that case, this is an unnecessary instruction
// that escaped earlier analysis.
- if (NegMaskVal.isAllOnesValue()) {
+ if (NegMaskVal.isAllOnes()) {
ReplaceNode(And, And0.getNode());
return true;
}
@@ -4572,7 +4699,7 @@ bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
ReplaceNode(N, Ternlog.getNode());
return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
- A, B, C, 0xCA);
+ Ternlog.getNode(), A, B, C, 0xCA);
}
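
The magic constants introduced above make it easy to derive a VPTERNLOG immediate by hand: evaluating the desired boolean expression bitwise on A = 0xF0, B = 0xCC, C = 0xAA yields the 8-bit truth table the instruction expects, and peeking through a NOT is the same as inverting that input's magic value. A minimal check, separate from the patch:

  #include <cassert>
  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint8_t A = 0xF0, B = 0xCC, C = 0xAA; // TernlogMagicA/B/C
    // Bit-select (A & B) | (~A & C) evaluates to 0xCA, the immediate used by
    // tryMatchBitSelect above.
    uint8_t Select = (A & B) | (uint8_t(~A) & C);
    assert(Select == 0xCA);
    // Inverting an input corresponds to inverting its magic constant (what
    // PeekThroughNot does); selecting "not C" gives 0xC5 instead.
    uint8_t SelectNotC = (A & B) | (uint8_t(~A) & uint8_t(~C));
    assert(SelectNotC == 0xC5);
    std::printf("0x%02X 0x%02X\n", Select, SelectNotC);
    return 0;
  }
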
void X86DAGToDAGISel::Select(SDNode *Node) {
@@ -4807,7 +4934,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
case X86ISD::VPTERNLOG: {
uint8_t Imm = cast<ConstantSDNode>(Node->getOperand(3))->getZExtValue();
- if (matchVPTERNLOG(Node, Node, Node, Node->getOperand(0),
+ if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
Node->getOperand(1), Node->getOperand(2), Imm))
return;
break;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3a64b3460030..dba0321d9431 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48,9 +48,10 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -71,14 +72,6 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
-static cl::opt<int> ExperimentalPrefLoopAlignment(
- "x86-experimental-pref-loop-alignment", cl::init(4),
- cl::desc(
- "Sets the preferable loop alignment for experiments (as log2 bytes)"
- "(the last x86-experimental-pref-loop-alignment bits"
- " of the loop header PC will be 0)."),
- cl::Hidden);
-
static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
"x86-experimental-pref-innermost-loop-alignment", cl::init(4),
cl::desc(
@@ -117,6 +110,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
X86ScalarSSEf64 = Subtarget.hasSSE2();
X86ScalarSSEf32 = Subtarget.hasSSE1();
+ X86ScalarSSEf16 = Subtarget.hasFP16();
MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
// Set up the TargetLowering object.
@@ -213,6 +207,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ABS , MVT::i64 , Custom);
}
+ // Signed saturation subtraction.
+ setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom);
+ setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom);
+ setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom);
+ if (Subtarget.is64Bit())
+ setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom);
+
// Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
// For slow shld targets we only lower for code size.
@@ -424,8 +425,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::f128, MVT::f16, Expand);
setOperationAction(ISD::PARITY, MVT::i8, Custom);
+ setOperationAction(ISD::PARITY, MVT::i16, Custom);
+ setOperationAction(ISD::PARITY, MVT::i32, Custom);
+ if (Subtarget.is64Bit())
+ setOperationAction(ISD::PARITY, MVT::i64, Custom);
if (Subtarget.hasPOPCNT()) {
setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
+ // popcntw is longer to encode than popcntl and also has a false dependency
+ // on the destination that popcntl has not had since Cannon Lake.
+ setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
} else {
setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
@@ -434,11 +442,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
else
setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
-
- setOperationAction(ISD::PARITY, MVT::i16, Custom);
- setOperationAction(ISD::PARITY, MVT::i32, Custom);
- if (Subtarget.is64Bit())
- setOperationAction(ISD::PARITY, MVT::i64, Custom);
}
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
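
A user-level illustration of that promotion (a hedged sketch, not part of the patch): zero-extending to 32 bits does not change the population count, so the i16 case can be computed with the 32-bit instruction.

  #include <cassert>
  #include <cstdint>

  // Zero-extension preserves the number of set bits, so a 16-bit popcount can
  // use the 32-bit form (popcntl rather than popcntw), which is what the
  // promotion above aims for when POPCNT is available.
  static unsigned popcount16(uint16_t X) {
    return __builtin_popcount(static_cast<uint32_t>(X));
  }

  int main() {
    assert(popcount16(0xF00F) == 8);
    return 0;
  }
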
@@ -532,7 +535,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRAP, MVT::Other, Legal);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
- setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
+ if (Subtarget.getTargetTriple().isPS4CPU())
+ setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
+ else
+ setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
@@ -968,6 +974,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
@@ -1147,6 +1154,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
+ setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
+ setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
@@ -1172,10 +1181,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
}
- // i8 vectors are custom because the source register and source
- // source memory operand types are not the same width.
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
-
if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
// We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
// do the pre and post work in the vector domain.
@@ -1677,6 +1682,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
}
+ // With BWI, expanding (and promoting the shifts) is the better option.
+ if (!Subtarget.useBWIRegs())
+ setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
+
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
@@ -1903,6 +1912,155 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
+ if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
+ auto setGroup = [&] (MVT VT) {
+ setOperationAction(ISD::FADD, VT, Legal);
+ setOperationAction(ISD::STRICT_FADD, VT, Legal);
+ setOperationAction(ISD::FSUB, VT, Legal);
+ setOperationAction(ISD::STRICT_FSUB, VT, Legal);
+ setOperationAction(ISD::FMUL, VT, Legal);
+ setOperationAction(ISD::STRICT_FMUL, VT, Legal);
+ setOperationAction(ISD::FDIV, VT, Legal);
+ setOperationAction(ISD::STRICT_FDIV, VT, Legal);
+ setOperationAction(ISD::FSQRT, VT, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
+
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
+
+ setOperationAction(ISD::LOAD, VT, Legal);
+ setOperationAction(ISD::STORE, VT, Legal);
+
+ setOperationAction(ISD::FMA, VT, Legal);
+ setOperationAction(ISD::STRICT_FMA, VT, Legal);
+ setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+
+ setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Custom);
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ };
+
+ // AVX512_FP16 scalar operations
+ setGroup(MVT::f16);
+ addRegisterClass(MVT::f16, &X86::FR16XRegClass);
+ setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f16, Expand);
+ setOperationAction(ISD::SETCC, MVT::f16, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
+ setOperationAction(ISD::FROUND, MVT::f16, Custom);
+ setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
+ setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
+ setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
+ if (isTypeLegal(MVT::f80)) {
+ setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
+ }
+
+ setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
+
+ if (Subtarget.useAVX512Regs()) {
+ setGroup(MVT::v32f16);
+ addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
+ MVT::v32i16);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
+ MVT::v32i16);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
+ MVT::v32i16);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
+ MVT::v32i16);
+
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
+
+ setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom);
+ }
+
+ if (Subtarget.hasVLX()) {
+ addRegisterClass(MVT::v8f16, &X86::VR128XRegClass);
+ addRegisterClass(MVT::v16f16, &X86::VR256XRegClass);
+ setGroup(MVT::v8f16);
+ setGroup(MVT::v16f16);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
+
+ // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
+
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
+
+ // Need to custom widen these to prevent scalarization.
+ setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
+ setOperationAction(ISD::STORE, MVT::v4f16, Custom);
+ }
+
+ // Support the fp16 zero immediate.
+ addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
+ }
+
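At the source level, the FP16 legalization above is what lets half-precision arithmetic select native AVX512-FP16 instructions instead of being promoted to float. A minimal sketch, assuming a compiler targeting AVX512-FP16 (e.g. built with -mavx512fp16):

// Scalar half-precision add; with f16 legal as above this can select vaddsh.
_Float16 addh(_Float16 a, _Float16 b) { return a + b; }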
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
@@ -1921,6 +2079,37 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
}
+ if (Subtarget.hasFP16()) {
+ // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
+ // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
+ // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
+ setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
+ // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
+ }
+
setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
@@ -1969,7 +2158,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
setLibcallName(RTLIB::MUL_I128, nullptr);
+ // The MULO libcall is not part of libgcc, only compiler-rt.
+ setLibcallName(RTLIB::MULO_I64, nullptr);
}
+ // The MULO libcall is not part of libgcc, only compiler-rt.
+ setLibcallName(RTLIB::MULO_I128, nullptr);
// Combine sin / cos into _sincos_stret if it is available.
if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
@@ -1983,6 +2176,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UDIV, MVT::i128, Custom);
setOperationAction(ISD::SREM, MVT::i128, Custom);
setOperationAction(ISD::UREM, MVT::i128, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
}
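A minimal source-level sketch of what exercises the i128 handling above, assuming a 64-bit target with __int128 support (function names are illustrative):

// 128-bit signed division has no native instruction on x86-64 and is
// expected to lower to a call to the __divti3 runtime helper.
__int128 div128(__int128 a, __int128 b) { return a / b; }

// i128 -> double conversion typically goes through a runtime helper as well
// (__floattidf in compiler-rt/libgcc).
double toDouble(__int128 a) { return (double)a; }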
// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
@@ -2070,8 +2271,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
MaxLoadsPerMemcmp = 2;
MaxLoadsPerMemcmpOptSize = 2;
- // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
- setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
+ // Default loop alignment, which can be overridden by -align-loops.
+ setPrefLoopAlignment(Align(16));
// An out-of-order CPU can speculatively execute past a predictable branch,
// but a conditional move could be stalled by an expensive earlier operation.
@@ -2165,6 +2366,16 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
return RegisterVT;
}
+ // v3f16 will be widened to v4f16, but we don't assign a register class for
+ // v4f16, so its default register type is f16. Override the type to v8f16 here.
+ if (VT == MVT::v3f16 && Subtarget.hasFP16())
+ return MVT::v8f16;
+
+ // We use more GPRs for f64 and f80 on 32-bit targets when x87 is disabled.
+ if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
+ !Subtarget.hasX87())
+ return MVT::i32;
+
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
@@ -2183,6 +2394,20 @@ unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
return NumRegisters;
}
+ // v3f16 will be widened to v4f16, but we don't assign a register class for
+ // v4f16, so its default register count is 3. Override the count to 1 here.
+ if (VT == MVT::v3f16 && Subtarget.hasFP16())
+ return 1;
+
+ // We have to split f64 into 2 registers and f80 into 3 registers on 32-bit
+ // targets if x87 is disabled.
+ if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
+ if (VT == MVT::f64)
+ return 2;
+ if (VT == MVT::f80)
+ return 3;
+ }
+
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
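A quick check of the register counts used above: with 32-bit GPRs a value needs ceil(bits/32) registers, which a sketch can assert directly:

static_assert((64 + 31) / 32 == 2, "f64 splits into two 32-bit GPRs");
static_assert((80 + 31) / 32 == 3, "f80 splits into three 32-bit GPRs");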
@@ -2272,7 +2497,7 @@ static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
-unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
+uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const {
if (Subtarget.is64Bit()) {
// Max of 8 and alignment of type.
@@ -2294,7 +2519,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
/// preferred vector width.
EVT X86TargetLowering::getOptimalMemOpType(
const MemOp &Op, const AttributeList &FuncAttributes) const {
- if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
+ if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
if (Op.size() >= 16 &&
(!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
// FIXME: Check if unaligned 64-byte accesses are slow.
@@ -2547,7 +2772,7 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {
Type::getInt8PtrTy(M.getContext()));
if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
F->setCallingConv(CallingConv::X86_FastCall);
- F->addAttribute(1, Attribute::AttrKind::InReg);
+ F->addParamAttr(0, Attribute::AttrKind::InReg);
}
return;
}
@@ -2898,16 +3123,15 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
return false;
bool HasRet = false;
- for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
- UI != UE; ++UI) {
- if (UI->getOpcode() != X86ISD::RET_FLAG)
+ for (const SDNode *U : Copy->uses()) {
+ if (U->getOpcode() != X86ISD::RET_FLAG)
return false;
// If we are returning more than one value, we can definitely
// not make a tail call see PR19530
- if (UI->getNumOperands() > 4)
+ if (U->getNumOperands() > 4)
return false;
- if (UI->getNumOperands() == 4 &&
- UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
+ if (U->getNumOperands() == 4 &&
+ U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
return false;
HasRet = true;
}
@@ -3137,38 +3361,40 @@ SDValue X86TargetLowering::LowerCallResult(
// For info on fast calling convention see Fast Calling Convention (tail call)
// implementation LowerX86_32FastCCCallTo.
-/// CallIsStructReturn - Determines whether a call uses struct return
-/// semantics.
-enum StructReturnType {
- NotStructReturn,
- RegStructReturn,
- StackStructReturn
-};
-static StructReturnType
-callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
- if (Outs.empty())
- return NotStructReturn;
+/// Determines whether Args, either the outgoing arguments of a call or the
+/// incoming arguments of a callee, contains an sret pointer that the callee
+/// pops.
+template <typename T>
+static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
+ const X86Subtarget &Subtarget) {
+ // Not C++20 (yet), so no concepts available.
+ static_assert(std::is_same<T, ISD::OutputArg>::value ||
+ std::is_same<T, ISD::InputArg>::value,
+ "requires ISD::OutputArg or ISD::InputArg");
- const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
- if (!Flags.isSRet())
- return NotStructReturn;
- if (Flags.isInReg() || IsMCU)
- return RegStructReturn;
- return StackStructReturn;
-}
+ // Only 32-bit targets pop the sret. It's a 64-bit world these days, so
+ // early-out for most compilations.
+ if (!Subtarget.is32Bit())
+ return false;
+
+ if (Args.empty())
+ return false;
-/// Determines whether a function uses struct return semantics.
-static StructReturnType
-argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
- if (Ins.empty())
- return NotStructReturn;
+ // Most calls do not have an sret argument, check the arg next.
+ const ISD::ArgFlagsTy &Flags = Args[0].Flags;
+ if (!Flags.isSRet() || Flags.isInReg())
+ return false;
+
+ // The MSVC ABI does not pop the sret.
+ if (Subtarget.getTargetTriple().isOSMSVCRT())
+ return false;
+
+ // MCUs don't pop the sret either.
+ if (Subtarget.isTargetMCU())
+ return false;
- const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
- if (!Flags.isSRet())
- return NotStructReturn;
- if (Flags.isInReg() || IsMCU)
- return RegStructReturn;
- return StackStructReturn;
+ // Otherwise, the callee pops the sret argument.
+ return true;
}
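The static_assert above stands in for the C++20 concept the comment alludes to; a sketch of the equivalent constraint once concepts are available (declaration only, using the same names as the helper above):

#include <type_traits>

template <typename T>
concept CallArg = std::is_same_v<T, ISD::OutputArg> ||
                  std::is_same_v<T, ISD::InputArg>;

template <CallArg T>
static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
                             const X86Subtarget &Subtarget);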
/// Make a copy of an aggregate at address specified by "Src" to address
@@ -3533,13 +3759,19 @@ void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
SmallVector<SDValue, 12> SaveXMMOps;
SaveXMMOps.push_back(Chain);
SaveXMMOps.push_back(ALVal);
- SaveXMMOps.push_back(
- DAG.getTargetConstant(FuncInfo->getRegSaveFrameIndex(), DL, MVT::i32));
+ SaveXMMOps.push_back(RSFIN);
SaveXMMOps.push_back(
DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
llvm::append_range(SaveXMMOps, LiveXMMRegs);
- MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
- MVT::Other, SaveXMMOps));
+ MachineMemOperand *StoreMMO =
+ DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
+ Offset),
+ MachineMemOperand::MOStore, 128, Align(16));
+ MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
+ DL, DAG.getVTList(MVT::Other),
+ SaveXMMOps, MVT::i8, StoreMMO));
}
if (!MemOps.empty())
@@ -3670,6 +3902,8 @@ SDValue X86TargetLowering::LowerFormalArguments(
RC = &X86::GR32RegClass;
else if (Is64Bit && RegVT == MVT::i64)
RC = &X86::GR64RegClass;
+ else if (RegVT == MVT::f16)
+ RC = &X86::FR16XRegClass;
else if (RegVT == MVT::f32)
RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
else if (RegVT == MVT::f64)
@@ -3767,12 +4001,12 @@ SDValue X86TargetLowering::LowerFormalArguments(
// the argument into a virtual register so that we can access it from the
// return points.
if (Ins[I].Flags.isSRet()) {
- Register Reg = FuncInfo->getSRetReturnReg();
- if (!Reg) {
- MVT PtrTy = getPointerTy(DAG.getDataLayout());
- Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
- FuncInfo->setSRetReturnReg(Reg);
- }
+ assert(!FuncInfo->getSRetReturnReg() &&
+ "SRet return has already been set");
+ MVT PtrTy = getPointerTy(DAG.getDataLayout());
+ Register Reg =
+ MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
+ FuncInfo->setSRetReturnReg(Reg);
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
break;
@@ -3800,9 +4034,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
- if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
- !Subtarget.getTargetTriple().isOSMSVCRT() &&
- argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
+ if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
FuncInfo->setBytesToPopOnReturn(4);
}
@@ -3921,10 +4153,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget.is64Bit();
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
- StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
bool IsSibcall = false;
bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
+ bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
bool HasNCSR = (CB && isa<CallInst>(CB) &&
CB->hasFnAttr("no_caller_saved_registers"));
@@ -3950,13 +4182,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
isTailCall = false;
}
-
if (isTailCall && !IsMustTail) {
// Check if it's really possible to do a tail call.
- isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
- isVarArg, SR != NotStructReturn,
- MF.getFunction().hasStructRetAttr(), CLI.RetTy,
- Outs, OutVals, Ins, DAG);
+ isTailCall = IsEligibleForTailCallOptimization(
+ Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
+ Ins, DAG);
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
@@ -4199,7 +4429,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
}
- if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
+ if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
+ (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
// From AMD64 ABI document:
// For calls that may call functions that use varargs or stdargs
// (prototype-less calls or calls to functions containing ellipsis (...) in
@@ -4324,7 +4555,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// address into a register.
Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
} else if (Subtarget.isTarget64BitILP32() &&
- Callee->getValueType(0) == MVT::i32) {
+ Callee.getValueType() == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
}
@@ -4436,14 +4667,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
"tail calls cannot be marked with clang.arc.attachedcall");
assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
- // Add target constant to select ObjC runtime call just before the call
- // target. RuntimeCallType == 0 selects objc_retainAutoreleasedReturnValue,
- // RuntimeCallType == 0 selects objc_unsafeClaimAutoreleasedReturnValue when
- // epxanding the pseudo.
- unsigned RuntimeCallType =
- objcarc::hasAttachedCallOpBundle(CLI.CB, true) ? 0 : 1;
- Ops.insert(Ops.begin() + 1,
- DAG.getTargetConstant(RuntimeCallType, dl, MVT::i32));
+ // Add a target global address for the retainRV/claimRV runtime function
+ // just before the call target.
+ Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
+ Ops.insert(Ops.begin() + 1, GA);
Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
} else {
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
@@ -4459,20 +4688,14 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
// Create the CALLSEQ_END node.
- unsigned NumBytesForCalleeToPop;
+ unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
DAG.getTarget().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPop = NumBytes; // Callee pops everything
- else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
- !Subtarget.getTargetTriple().isOSMSVCRT() &&
- SR == StackStructReturn)
- // If this is a call to a struct-return function, the callee
- // pops the hidden struct pointer, so we have to push it back.
- // This is common for Darwin/X86, Linux & Mingw32 targets.
- // For MSVC Win32 targets, the caller pops the hidden struct pointer.
+ else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
+ // If this call passes a struct-return pointer, the callee
+ // pops that struct pointer.
NumBytesForCalleeToPop = 4;
- else
- NumBytesForCalleeToPop = 0; // Callee pops nothing.
// Returns a flag for retval copy to use.
if (!IsSibcall) {
@@ -4631,9 +4854,8 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
bool X86TargetLowering::IsEligibleForTailCallOptimization(
- SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
- bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
+ SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
+ bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
if (!mayTailCallThisCC(CalleeCC))
@@ -4677,9 +4899,17 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
if (RegInfo->hasStackRealignment(MF))
return false;
- // Also avoid sibcall optimization if either caller or callee uses struct
- // return semantics.
- if (isCalleeStructRet || isCallerStructRet)
+ // Also avoid sibcall optimization if we're an sret-returning function and
+ // the callee is incompatible. See the comment in LowerReturn about why
+ // hasStructRetAttr is insufficient.
+ if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
+ // For a compatible tail call the callee must return our sret pointer. So it
+ // needs to be (a) an sret function itself and (b) we pass our sret as its
+ // sret. Condition #b is harder to determine.
+ return false;
+ } else if (IsCalleePopSRet)
+ // The callee pops an sret, so we cannot tail-call, as our caller doesn't
+ // expect that.
return false;
// Do not sibcall optimize vararg calls unless all arguments are passed via
@@ -4833,15 +5063,44 @@ X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
// Other Lowering Hooks
//===----------------------------------------------------------------------===//
-static bool MayFoldLoad(SDValue Op) {
- return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
+bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
+ bool AssumeSingleUse) {
+ if (!AssumeSingleUse && !Op.hasOneUse())
+ return false;
+ if (!ISD::isNormalLoad(Op.getNode()))
+ return false;
+
+ // If this is an unaligned vector load, make sure the target supports folding it.
+ auto *Ld = cast<LoadSDNode>(Op.getNode());
+ if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
+ Ld->getValueSizeInBits(0) == 128 && Ld->getAlignment() < 16)
+ return false;
+
+ // TODO: If this is a non-temporal load and the target has an instruction
+ // for it, it should not be folded. See "useNonTemporalLoad()".
+
+ return true;
+}
+
+bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
+ const X86Subtarget &Subtarget,
+ bool AssumeSingleUse) {
+ assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
+ if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
+ return false;
+
+ // We cannot replace a wide volatile load with a broadcast-from-memory,
+ // because that would narrow the load, which isn't legal for volatiles.
+ auto *Ld = cast<LoadSDNode>(Op.getNode());
+ return !Ld->isVolatile() ||
+ Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
}
-static bool MayFoldIntoStore(SDValue Op) {
+bool X86::mayFoldIntoStore(SDValue Op) {
return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
}
-static bool MayFoldIntoZeroExtend(SDValue Op) {
+bool X86::mayFoldIntoZeroExtend(SDValue Op) {
if (Op.hasOneUse()) {
unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
return (ISD::ZERO_EXTEND == Opcode);
@@ -4872,6 +5131,7 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::MOVDDUP:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
+ case X86ISD::MOVSH:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::VBROADCAST:
@@ -5023,20 +5283,20 @@ static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
- bool isFP, SDValue &LHS, SDValue &RHS,
- SelectionDAG &DAG) {
+ bool isFP, SDValue &LHS, SDValue &RHS,
+ SelectionDAG &DAG) {
if (!isFP) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
- if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
+ if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
// X > -1 -> X == 0, jump !sign.
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_NS;
}
- if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+ if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
// X < 0 -> X == 0, jump on sign.
return X86::COND_S;
}
- if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
+ if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
// X >= 0 -> X == 0, jump on !sign.
return X86::COND_NS;
}
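A scalar sanity check of the first fold above: x > -1 holds exactly when x >= 0, i.e. when the sign bit is clear, which is why the compare collapses into a sign test (COND_NS). A sketch with made-up helper names:

#include <cstdint>

static bool gtMinusOne(int32_t x) { return x > -1; }
// Same predicate expressed as "sign bit is clear".
static bool signClear(int32_t x) { return (x & 0x80000000u) == 0; }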
@@ -5119,6 +5379,10 @@ static bool hasFPCMov(unsigned X86CC) {
}
}
+static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
+ return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
+ VT.is512BitVector();
+}
bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
@@ -5312,10 +5576,13 @@ bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
VT = getTypeToTransformTo(Context, VT);
// If vector multiply is legal, assume that's faster than shl + add/sub.
- // TODO: Multiply is a complex op with higher latency and lower throughput in
- // most implementations, so this check could be loosened based on type
- // and/or a CPU attribute.
- if (isOperationLegal(ISD::MUL, VT))
+ // Multiply is a complex op with higher latency and lower throughput in
+ // most implementations; sub-vXi32 vector multiplies are always fast,
+ // vXi32 is fast as long as PMULLD isn't slow on the target, and anything
+ // larger (vXi64) is always going to be slow.
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
+ (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
return false;
// shl+add, shl+sub, shl+add+neg
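The decompositions listed above correspond to scalar rewrites of the following shape (a sketch using wrapping unsigned arithmetic):

#include <cstdint>

static uint64_t mulBy9(uint64_t x) { return (x << 3) + x; }            // shl+add
static uint64_t mulBy7(uint64_t x) { return (x << 3) - x; }            // shl+sub
static uint64_t mulByMinus9(uint64_t x) { return 0 - ((x << 3) + x); } // shl+add+neg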
@@ -5393,11 +5660,10 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
}
bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
- const SelectionDAG &DAG) const {
+ const MachineFunction &MF) const {
// Do not merge to float value size (128 bytes) if no implicit
// float attribute is set.
- bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
- Attribute::NoImplicitFloat);
+ bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
if (NoFloat) {
unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
@@ -5731,7 +5997,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
// Here we do not set undef elements as zeroable.
SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
if (V2IsZero) {
- assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
+ assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
for (int i = 0, Size = Mask.size(); i != Size; ++i)
if (Mask[i] != SM_SentinelUndef && Zeroable[i])
ZeroableMask[i] = SM_SentinelZero;
@@ -6037,62 +6303,67 @@ static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
"Can't split odd sized vector");
+ // If this is a splat value (with no undefs), then use the lower subvector,
+ // which should be a free extraction.
SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
+ if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
+ return std::make_pair(Lo, Lo);
+
SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
return std::make_pair(Lo, Hi);
}
-// Split an unary integer op into 2 half sized ops.
-static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
+/// Break an operation into 2 half-sized ops and then concatenate the results.
+static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) {
+ unsigned NumOps = Op.getNumOperands();
EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+ // Split each vector operand into Lo/Hi halves; scalar operands pass through.
+ SmallVector<SDValue> LoOps(NumOps, SDValue());
+ SmallVector<SDValue> HiOps(NumOps, SDValue());
+ for (unsigned I = 0; I != NumOps; ++I) {
+ SDValue SrcOp = Op.getOperand(I);
+ if (!SrcOp.getValueType().isVector()) {
+ LoOps[I] = HiOps[I] = SrcOp;
+ continue;
+ }
+ std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
+ }
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
+ DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
+}
+
+/// Break a unary integer operation into 2 half-sized ops and then
+/// concatenate the result back.
+static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
// Make sure we only try to split 256/512-bit types to avoid creating
// narrow vectors.
+ EVT VT = Op.getValueType();
+ (void)VT;
assert((Op.getOperand(0).getValueType().is256BitVector() ||
Op.getOperand(0).getValueType().is512BitVector()) &&
(VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
VT.getVectorNumElements() &&
"Unexpected VTs!");
-
- SDLoc dl(Op);
-
- // Extract the Lo/Hi vectors
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);
-
- EVT LoVT, HiVT;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
- DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),
- DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));
+ return splitVectorOp(Op, DAG);
}
/// Break a binary integer operation into 2 half sized ops and then
/// concatenate the result back.
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
+ // Assert that all the types match.
EVT VT = Op.getValueType();
-
- // Sanity check that all the types match.
+ (void)VT;
assert(Op.getOperand(0).getValueType() == VT &&
Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
-
- SDLoc dl(Op);
-
- // Extract the LHS Lo/Hi vectors
- SDValue LHS1, LHS2;
- std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
-
- // Extract the RHS Lo/Hi vectors
- SDValue RHS1, RHS2;
- std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
-
- EVT LoVT, HiVT;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
- DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),
- DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));
+ return splitVectorOp(Op, DAG);
}
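The new splitVectorOp generalizes splitVectorIntUnary/splitVectorIntBinary above: every vector operand is split in half, scalars pass through, the op runs per half, and the halves are concatenated. A container analogue as a sketch:

#include <vector>

template <typename BinaryOp>
static std::vector<int> splitAndApply(const std::vector<int> &A,
                                      const std::vector<int> &B, BinaryOp Op) {
  const auto Half = A.size() / 2;
  std::vector<int> ALo(A.begin(), A.begin() + Half), AHi(A.begin() + Half, A.end());
  std::vector<int> BLo(B.begin(), B.begin() + Half), BHi(B.begin() + Half, B.end());
  std::vector<int> Res = Op(ALo, BLo), Hi = Op(AHi, BHi);
  Res.insert(Res.end(), Hi.begin(), Hi.end()); // concatenate the halves
  return Res;
}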
// Helper for splitting operands of an operation to legal target size and
@@ -6143,6 +6414,71 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
}
+// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
+// targets.
+static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
+ ArrayRef<SDValue> Ops, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(Subtarget.hasAVX512() && "AVX512 target expected");
+ MVT SVT = VT.getScalarType();
+
+ // If we have a 32/64-bit splatted constant, splat it to DstTy to
+ // encourage a foldable broadcasted operand.
+ auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
+ unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
+ // AVX512 broadcasts 32/64-bit operands.
+ // TODO: Support float once getAVX512Node is used by fp-ops.
+ if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
+ !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
+ return SDValue();
+ // If we're not widening, don't bother if we're not bitcasting.
+ if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
+ return SDValue();
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
+ APInt SplatValue, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+ HasAnyUndefs, OpEltSizeInBits) &&
+ !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
+ return DAG.getConstant(SplatValue, DL, DstVT);
+ }
+ return SDValue();
+ };
+
+ bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
+
+ MVT DstVT = VT;
+ if (Widen)
+ DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
+
+ // Canonicalize src operands.
+ SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
+ for (SDValue &Op : SrcOps) {
+ MVT OpVT = Op.getSimpleValueType();
+ // Just pass through scalar operands.
+ if (!OpVT.isVector())
+ continue;
+ assert(OpVT == VT && "Vector type mismatch");
+
+ if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
+ Op = BroadcastOp;
+ continue;
+ }
+
+ // Just widen the subvector by inserting into an undef wide vector.
+ if (Widen)
+ Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
+ }
+
+ SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
+
+ // Perform the 512-bit op then extract the bottom subvector.
+ if (Widen)
+ Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
+ return Res;
+}
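A hypothetical call-site sketch for getAVX512Node (the opcode and operands are placeholders, not taken from this change): on an AVX512F-only target a 256-bit op is widened to 512 bits, executed there, and the low 256 bits extracted.

// Src and Amt are assumed to be v8i32 SDValues inside some lowering routine.
SDValue Res =
    getAVX512Node(X86ISD::VSHLV, DL, MVT::v8i32, {Src, Amt}, DAG, Subtarget);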
+
/// Insert i1-subvector to i1-vector.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -6214,14 +6550,21 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
assert(IdxVal != 0 && "Unexpected index");
- NumElems = WideOpVT.getVectorNumElements();
- unsigned ShiftLeft = NumElems - SubVecNumElems;
- unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
- SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
- DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
- if (ShiftRight != 0)
- SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
- DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
+ // If upper elements of Vec are known undef, then just shift into place.
+ if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
+ [](SDValue V) { return V.isUndef(); })) {
+ SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
+ } else {
+ NumElems = WideOpVT.getVectorNumElements();
+ unsigned ShiftLeft = NumElems - SubVecNumElems;
+ unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
+ SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
+ if (ShiftRight != 0)
+ SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
+ }
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
@@ -6323,7 +6666,7 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Expected a 128/256/512-bit vector type");
- APInt Ones = APInt::getAllOnesValue(32);
+ APInt Ones = APInt::getAllOnes(32);
unsigned NumElts = VT.getSizeInBits() / 32;
SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
return DAG.getBitcast(VT, Vec);
@@ -6461,6 +6804,58 @@ static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
+/// Returns a node that packs the LHS + RHS nodes together at half width.
+/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
+/// TODO: Add vXi64 -> vXi32 pack support with vector_shuffle node.
+/// TODO: Add subvector splitting if/when we have a need for it.
+static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
+ bool PackHiHalf = false) {
+ MVT OpVT = LHS.getSimpleValueType();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
+ assert(OpVT == RHS.getSimpleValueType() &&
+ VT.getSizeInBits() == OpVT.getSizeInBits() &&
+ (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
+ "Unexpected PACK operand types");
+ assert((EltSizeInBits == 8 || EltSizeInBits == 16) &&
+ "Unexpected PACK result type");
+
+ // See if we already have sufficient leading bits for PACKSS/PACKUS.
+ if (!PackHiHalf) {
+ if (UsePackUS &&
+ DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
+ DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
+
+ if (DAG.ComputeMinSignedBits(LHS) <= EltSizeInBits &&
+ DAG.ComputeMinSignedBits(RHS) <= EltSizeInBits)
+ return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
+ }
+
+ // Fall back to sign/zero-extending the requested half and packing.
+ SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
+ if (UsePackUS) {
+ if (PackHiHalf) {
+ LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
+ RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
+ } else {
+ SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
+ LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
+ RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
+ };
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
+ };
+
+ if (!PackHiHalf) {
+ LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
+ RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
+ }
+ LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
+ RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
+ return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
+}
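As a scalar model of the signed fallback path in getPack for 32->16-bit lanes (real PACKSS works lane-wise on vectors; two's-complement assumed): shifting left then arithmetic-shifting right sign-extends the low 16 bits, after which the PACKSS saturation never clamps, so the low half is kept exactly.

#include <algorithm>
#include <cstdint>

static int16_t packLowHalf(int32_t Lane) {
  // VSHLI + VSRAI: sign-extend the low 16 bits of the 32-bit lane.
  int32_t SignExtended = (int32_t)((uint32_t)Lane << 16) >> 16;
  // PACKSS saturates to the i16 range; a no-op after the sign-extension.
  return (int16_t)std::clamp(SignExtended, -32768, 32767);
}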
+
/// Return a vector_shuffle of the specified vector of zero or undef vector.
/// This produces a shuffle where the low element of V2 is swizzled into the
/// zero/undef vector, landing at element Idx.
@@ -6563,7 +6958,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
// Only treat an element as UNDEF if all bits are UNDEF.
- if (UndefEltBits.isAllOnesValue()) {
+ if (UndefEltBits.isAllOnes()) {
if (!AllowWholeUndefs)
return false;
UndefElts.setBit(i);
@@ -6602,59 +6997,36 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
// Handle UNDEFs.
if (Op.isUndef()) {
- APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
+ APInt UndefSrcElts = APInt::getAllOnes(NumElts);
SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract scalar constant bits.
if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
- APInt UndefSrcElts = APInt::getNullValue(1);
+ APInt UndefSrcElts = APInt::getZero(1);
SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
return CastBitData(UndefSrcElts, SrcEltBits);
}
if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
- APInt UndefSrcElts = APInt::getNullValue(1);
+ APInt UndefSrcElts = APInt::getZero(1);
APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
SmallVector<APInt, 64> SrcEltBits(1, RawBits);
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from build vector.
- if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
+ BitVector Undefs;
+ SmallVector<APInt> SrcEltBits;
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
- unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
-
- APInt UndefSrcElts(NumSrcElts, 0);
- SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
- for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
- const SDValue &Src = Op.getOperand(i);
- if (Src.isUndef()) {
- UndefSrcElts.setBit(i);
- continue;
- }
- auto *Cst = cast<ConstantSDNode>(Src);
- SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
+ if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
+ APInt UndefSrcElts = APInt::getNullValue(SrcEltBits.size());
+ for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
+ if (Undefs[I])
+ UndefSrcElts.setBit(I);
+ return CastBitData(UndefSrcElts, SrcEltBits);
}
- return CastBitData(UndefSrcElts, SrcEltBits);
- }
- if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
- unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
- unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
-
- APInt UndefSrcElts(NumSrcElts, 0);
- SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
- for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
- const SDValue &Src = Op.getOperand(i);
- if (Src.isUndef()) {
- UndefSrcElts.setBit(i);
- continue;
- }
- auto *Cst = cast<ConstantFPSDNode>(Src);
- APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
- SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
- }
- return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from constant pool vector.
@@ -6704,17 +7076,21 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
SDValue Ptr = MemIntr->getBasePtr();
+ // The source constant may be larger than the subvector broadcast, so ensure
+ // we extract the correct subvector constants.
if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
Type *CstTy = Cst->getType();
unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
- if (!CstTy->isVectorTy() || (SizeInBits % CstSizeInBits) != 0)
+ unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
+ if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
+ (SizeInBits % SubVecSizeInBits) != 0)
return false;
- unsigned SubEltSizeInBits = CstTy->getScalarSizeInBits();
- unsigned NumSubElts = CstSizeInBits / SubEltSizeInBits;
- unsigned NumSubVecs = SizeInBits / CstSizeInBits;
+ unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
+ unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
+ unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
APInt UndefSubElts(NumSubElts, 0);
SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
- APInt(SubEltSizeInBits, 0));
+ APInt(CstEltSizeInBits, 0));
for (unsigned i = 0; i != NumSubElts; ++i) {
if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
UndefSubElts, i))
@@ -6814,12 +7190,12 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
AllowPartialUndefs))
return false;
- UndefElts = APInt::getNullValue(NumElts);
+ UndefElts = APInt::getZero(NumElts);
for (int i = 0; i != (int)NumElts; ++i) {
int M = Mask[i];
if (M < 0) {
UndefElts.setBit(i);
- EltBits.push_back(APInt::getNullValue(EltSizeInBits));
+ EltBits.push_back(APInt::getZero(EltSizeInBits));
} else if (M < (int)NumElts) {
if (UndefElts0[M])
UndefElts.setBit(i);
@@ -6916,8 +7292,8 @@ static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
int NumEltsPerLane = NumElts / NumLanes;
int NumInnerEltsPerLane = NumInnerElts / NumLanes;
- DemandedLHS = APInt::getNullValue(NumInnerElts);
- DemandedRHS = APInt::getNullValue(NumInnerElts);
+ DemandedLHS = APInt::getZero(NumInnerElts);
+ DemandedRHS = APInt::getZero(NumInnerElts);
// Map DemandedElts to the packed operands.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
@@ -6940,8 +7316,8 @@ static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
int NumEltsPerLane = NumElts / NumLanes;
int HalfEltsPerLane = NumEltsPerLane / 2;
- DemandedLHS = APInt::getNullValue(NumElts);
- DemandedRHS = APInt::getNullValue(NumElts);
+ DemandedLHS = APInt::getZero(NumElts);
+ DemandedRHS = APInt::getZero(NumElts);
// Map DemandedElts to the horizontal operands.
for (int Idx = 0; Idx != NumElts; ++Idx) {
@@ -7148,6 +7524,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
break;
case X86ISD::MOVSS:
case X86ISD::MOVSD:
+ case X86ISD::MOVSH:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
@@ -7287,7 +7664,7 @@ static void computeZeroableShuffleElements(ArrayRef<int> Mask,
SDValue V1, SDValue V2,
APInt &KnownUndef, APInt &KnownZero) {
int Size = Mask.size();
- KnownUndef = KnownZero = APInt::getNullValue(Size);
+ KnownUndef = KnownZero = APInt::getZero(Size);
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
@@ -7380,7 +7757,7 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
int Size = Mask.size();
SDValue V1 = Ops[0];
SDValue V2 = IsUnary ? V1 : Ops[1];
- KnownUndef = KnownZero = APInt::getNullValue(Size);
+ KnownUndef = KnownZero = APInt::getZero(Size);
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
@@ -7487,7 +7864,7 @@ static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
APInt &KnownUndef,
APInt &KnownZero) {
unsigned NumElts = Mask.size();
- KnownUndef = KnownZero = APInt::getNullValue(NumElts);
+ KnownUndef = KnownZero = APInt::getZero(NumElts);
for (unsigned i = 0; i != NumElts; ++i) {
int M = Mask[i];
@@ -7760,9 +8137,9 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
// lanes), we can treat this as a truncation shuffle.
bool Offset0 = false, Offset1 = false;
if (Opcode == X86ISD::PACKSS) {
- if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
+ if ((!(N0.isUndef() || EltsLHS.isZero()) &&
DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
- (!(N1.isUndef() || EltsRHS.isNullValue()) &&
+ (!(N1.isUndef() || EltsRHS.isZero()) &&
DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
return false;
// We can't easily fold ASHR into a shuffle, but if it was feeding a
@@ -7780,9 +8157,9 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
}
} else {
APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
- if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
+ if ((!(N0.isUndef() || EltsLHS.isZero()) &&
!DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
- (!(N1.isUndef() || EltsRHS.isNullValue()) &&
+ (!(N1.isUndef() || EltsRHS.isZero()) &&
!DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
return false;
}
@@ -7983,7 +8360,7 @@ static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
APInt KnownUndef, KnownZero;
unsigned NumElts = Op.getValueType().getVectorNumElements();
- APInt DemandedElts = APInt::getAllOnesValue(NumElts);
+ APInt DemandedElts = APInt::getAllOnes(NumElts);
return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
KnownZero, DAG, Depth, ResolveKnownElts);
}
@@ -8467,10 +8844,10 @@ static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
case ISD::SCALAR_TO_VECTOR:
return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
case ISD::SRL:
- if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
- uint64_t Idx = IdxC->getZExtValue();
- if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
- ByteOffset += Idx / 8;
+ if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
+ uint64_t Amt = AmtC->getZExtValue();
+ if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
+ ByteOffset += Amt / 8;
return true;
}
}
@@ -8508,9 +8885,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
unsigned NumElems = Elts.size();
int LastLoadedElt = -1;
- APInt LoadMask = APInt::getNullValue(NumElems);
- APInt ZeroMask = APInt::getNullValue(NumElems);
- APInt UndefMask = APInt::getNullValue(NumElems);
+ APInt LoadMask = APInt::getZero(NumElems);
+ APInt ZeroMask = APInt::getZero(NumElems);
+ APInt UndefMask = APInt::getZero(NumElems);
SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
@@ -8671,7 +9048,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
// If the upper half of a ymm/zmm load is undef then just load the lower half.
if (VT.is256BitVector() || VT.is512BitVector()) {
unsigned HalfNumElems = NumElems / 2;
- if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
+ if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
EVT HalfVT =
EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
SDValue HalfLD =
@@ -8685,7 +9062,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
// VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
- (LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
+ ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
+ LoadSizeInBits == 64) &&
((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
: MVT::getIntegerVT(LoadSizeInBits);
@@ -8709,7 +9087,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
// BROADCAST - match the smallest possible repetition pattern, load that
// scalar/subvector element and then broadcast to the entire vector.
- if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
+ if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
unsigned RepeatSize = SubElems * BaseSizeInBits;
@@ -8758,6 +9136,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
} else {
+ if (!Subtarget.hasAVX2() &&
+ !X86::mayFoldLoadIntoBroadcastFromMem(
+ RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
+ Subtarget,
+ /*AssumeSingleUse=*/true))
+ return SDValue();
Broadcast =
DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
}
@@ -8800,7 +9184,9 @@ static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
Constant *Const;
if (VT.isFloatingPoint()) {
- if (ScalarSize == 32) {
+ if (ScalarSize == 16) {
+ Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
+ } else if (ScalarSize == 32) {
Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
} else {
assert(ScalarSize == 64 && "Unsupported floating point scalar size");
@@ -9009,6 +9395,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
// with AVX2, also splat i8 and i16.
// With pattern matching, the VBROADCAST node may become a VMOVDDUP.
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
+ (ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) ||
(OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
const Constant *C = nullptr;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
@@ -9071,6 +9458,9 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
return BCast;
}
+ if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+
// Unsupported broadcast.
return SDValue();
}
@@ -9760,7 +10150,7 @@ static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
unsigned NumElts = VT.getVectorNumElements();
- APInt DemandedElts = APInt::getAllOnesValue(NumElts);
+ APInt DemandedElts = APInt::getAllOnes(NumElts);
for (unsigned i = 0; i != NumElts; ++i)
if (BV->getOperand(i).isUndef())
DemandedElts.clearBit(i);
@@ -10335,9 +10725,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return VectorConstant;
unsigned EVTBits = EltVT.getSizeInBits();
- APInt UndefMask = APInt::getNullValue(NumElems);
- APInt ZeroMask = APInt::getNullValue(NumElems);
- APInt NonZeroMask = APInt::getNullValue(NumElems);
+ APInt UndefMask = APInt::getZero(NumElems);
+ APInt ZeroMask = APInt::getZero(NumElems);
+ APInt NonZeroMask = APInt::getZero(NumElems);
bool IsAllConstants = true;
SmallSet<SDValue, 8> Values;
unsigned NumConstants = NumElems;
@@ -10361,7 +10751,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// All undef vector. Return an UNDEF. All zero vectors were handled above.
if (NonZeroMask == 0) {
- assert(UndefMask.isAllOnesValue() && "Fully undef mask expected");
+ assert(UndefMask.isAllOnes() && "Fully undef mask expected");
return DAG.getUNDEF(VT);
}
@@ -10471,13 +10861,15 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (NumZero == 0)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
- if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
- (EltVT == MVT::i64 && Subtarget.is64Bit())) {
+ if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
+ EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
+ (EltVT == MVT::i16 && Subtarget.hasFP16())) {
assert((VT.is128BitVector() || VT.is256BitVector() ||
VT.is512BitVector()) &&
"Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
- // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
+ // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
+ // zero vector.
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}
@@ -10607,7 +10999,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
DAG, Subtarget))
return V;
- if (EVTBits == 16 && NumElems == 8)
+ if (EltVT == MVT::i16 && NumElems == 8)
if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
DAG, Subtarget))
return V;
@@ -10664,7 +11056,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return Sh;
// For SSE 4.1, use insertps to put the high elements into the low element.
- if (Subtarget.hasSSE41()) {
+ if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
SDValue Result;
if (!Op.getOperand(0).isUndef())
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
@@ -11206,7 +11598,7 @@ static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
// Arbitrarily choose from the 2nd operand if the select condition element
// is undef.
// TODO: Can we do better by matching patterns such as even/odd?
- if (UndefElts[i] || EltBits[i].isNullValue())
+ if (UndefElts[i] || EltBits[i].isZero())
Mask[i] += NumElts;
}
@@ -11575,7 +11967,7 @@ static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
continue;
unsigned UpperElts = NumElts - NumSrcElts;
- if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
+ if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
continue;
SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
@@ -11672,7 +12064,7 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
unsigned NumSrcElts = NumElts / Scale;
unsigned UpperElts = NumElts - NumSrcElts;
if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
- !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
+ !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
continue;
SDValue Src = V1;
@@ -11729,7 +12121,7 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
// The elements beyond the truncation must be undef/zero.
unsigned UpperElts = NumElts - NumSrcElts;
if (UpperElts > 0 &&
- !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
+ !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
continue;
bool UndefUppers =
UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
@@ -11955,8 +12347,8 @@ static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
MVT LogicVT = VT;
if (EltVT == MVT::f32 || EltVT == MVT::f64) {
Zero = DAG.getConstantFP(0.0, DL, EltVT);
- APFloat AllOnesValue = APFloat::getAllOnesValue(
- SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());
+ APFloat AllOnesValue =
+ APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
LogicVT =
MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
@@ -12038,10 +12430,15 @@ static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
- if (M == i)
+ if (M == i ||
+ (0 <= M && M < Size && IsElementEquivalent(Size, V1, V1, M, i))) {
+ Mask[i] = i;
continue;
- if (M == i + Size) {
+ }
+ if (M == (i + Size) ||
+ (Size <= M && IsElementEquivalent(Size, V2, V2, M - Size, i))) {
BlendMask |= 1ull << i;
+ Mask[i] = i + Size;
continue;
}
if (Zeroable[i]) {
@@ -12424,6 +12821,14 @@ static SDValue lowerShuffleAsByteRotateAndPermute(
return SDValue();
}
+static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
+ return isUndefOrEqual(Mask, 0);
+}
+
+static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
+ return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
+}
+
/// Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
@@ -12457,6 +12862,38 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
}
}
+ // If \p InputMask demands only the 0'th element of \p Input (i.e. it is a
+ // broadcast mask), and is not already a no-op, then broadcast said input
+ // and change \p InputMask to be a no-op (identity) mask.
+ auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
+ &DAG](SDValue &Input,
+ MutableArrayRef<int> InputMask) {
+ unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
+ if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
+ !X86::mayFoldLoad(Input, Subtarget)))
+ return;
+ if (isNoopShuffleMask(InputMask))
+ return;
+ assert(isBroadcastShuffleMask(InputMask) &&
+ "Expected to demand only the 0'th element.");
+ Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
+ for (auto I : enumerate(InputMask)) {
+ int &InputMaskElt = I.value();
+ if (InputMaskElt >= 0)
+ InputMaskElt = I.index();
+ }
+ };
+
+ // Currently, we may need to produce one shuffle per input, and blend results.
+ // It is possible that the shuffle for one of the inputs is already a no-op.
+ // See if we can simplify non-no-op shuffles into broadcasts,
+ // which we consider to be strictly better than an arbitrary shuffle.
+ if (isNoopOrBroadcastShuffleMask(V1Mask) &&
+ isNoopOrBroadcastShuffleMask(V2Mask)) {
+ canonicalizeBroadcastableInput(V1, V1Mask);
+ canonicalizeBroadcastableInput(V2, V2Mask);
+ }
+
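// A standalone sketch (not from this patch) of the mask rewrite performed by
// canonicalizeBroadcastableInput: a broadcast mask such as {0, -1, 0, 0}
// (only element 0 demanded, -1 = undef) becomes the identity mask {0, -1, 2, 3}
// once the input itself has been broadcast.
#include <vector>
static void rewriteBroadcastMaskToIdentity(std::vector<int> &Mask) {
  for (unsigned I = 0, E = Mask.size(); I != E; ++I)
    if (Mask[I] >= 0)   // Leave undef (-1) lanes untouched.
      Mask[I] = (int)I; // Every demanded lane now reads its own element.
}
// Usage: std::vector<int> M = {0, -1, 0, 0};
//        rewriteBroadcastMaskToIdentity(M); // M == {0, -1, 2, 3}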
// Try to lower with the simpler initial blend/unpack/rotate strategies unless
// one of the input shuffles would be a no-op. We prefer to shuffle inputs as
// the shuffle may be able to fold with a load or other benefit. However, when
@@ -12974,7 +13411,7 @@ static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
- assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
+ assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
// Upper half must be undefined.
if (!isUndefUpperHalf(Mask))
@@ -13462,7 +13899,7 @@ static SDValue lowerShuffleAsElementInsertion(
if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
// We need to zext the scalar if it is smaller than an i32.
V2S = DAG.getBitcast(EltVT, V2S);
- if (EltVT == MVT::i8 || EltVT == MVT::i16) {
+ if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
// Using zext to expand a narrow element won't work for non-zero
// insertions.
if (!IsV1Zeroable)
@@ -13494,11 +13931,17 @@ static SDValue lowerShuffleAsElementInsertion(
if (!VT.is128BitVector())
return SDValue();
- // Otherwise, use MOVSD or MOVSS.
- assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
- "Only two types of floating point element types to handle!");
- return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
- ExtVT, V1, V2);
+ // Otherwise, use MOVSD, MOVSS or MOVSH.
+ unsigned MovOpc = 0;
+ if (EltVT == MVT::f16)
+ MovOpc = X86ISD::MOVSH;
+ else if (EltVT == MVT::f32)
+ MovOpc = X86ISD::MOVSS;
+ else if (EltVT == MVT::f64)
+ MovOpc = X86ISD::MOVSD;
+ else
+ llvm_unreachable("Unsupported floating point element type to handle!");
+ return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
}
// This lowering only works for the low element with floating point vectors.
@@ -15264,14 +15707,28 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
!Subtarget.hasVLX()) {
- SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
- for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
- DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
- SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
- V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
- DWordClearMask);
- V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
- DWordClearMask);
+ // Check if this is part of a 256-bit vector truncation.
+ if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
+ peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
+ V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
+ getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
+ DAG.getTargetConstant(0xEE, DL, MVT::i8));
+ V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
+ V1 = extract128BitVector(V1V2, 0, DAG, DL);
+ V2 = extract128BitVector(V1V2, 4, DAG, DL);
+ } else {
+ SmallVector<SDValue> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
+ for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
+ DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
+ SDValue DWordClearMask =
+ DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
+ V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
+ DWordClearMask);
+ V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
+ DWordClearMask);
+ }
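// For illustration (not part of the change): why clearing the upper half of
// each 32-bit lane makes PACKUS act as a plain truncation. PACKUS packs
// i32 -> i16 with unsigned saturation; once the high 16 bits are masked off,
// every lane already fits in 16 bits, so the saturation never fires.
#include <cassert>
#include <cstdint>
static uint16_t packus32to16(uint32_t V) { // scalar model of one PACKUS lane
  return V > 0xFFFFu ? (uint16_t)0xFFFFu : (uint16_t)V;
}
static void checkDWordClearThenPack(uint32_t Lane) {
  assert(packus32to16(Lane & 0xFFFFu) == (uint16_t)Lane &&
         "masked PACKUS keeps exactly the low i16 element of each dword");
}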
// Now pack things back together.
SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
if (NumEvenDrops == 2) {
@@ -15300,6 +15757,33 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask, Subtarget, DAG);
}
+/// Lower 8-lane 16-bit floating point shuffles.
+static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
+
+ if (NumV2Elements == 0) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+ }
+ if (NumV2Elements == 1 && Mask[0] >= 8)
+ if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v8f16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return V;
+
+ V1 = DAG.getBitcast(MVT::v8i16, V1);
+ V2 = DAG.getBitcast(MVT::v8i16, V2);
+ return DAG.getBitcast(MVT::v8f16,
+ DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
+}
+
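// A minimal sketch (not from this patch) of why the v8f16 shuffle can be done
// on v8i16: a shuffle only moves lanes, so operating on the raw 16-bit
// patterns (the analogue of the bitcasts above) is lossless and involves no
// floating-point semantics.
#include <array>
#include <cstdint>
static std::array<uint16_t, 8> shuffleF16Bits(const std::array<uint16_t, 8> &V1,
                                              const std::array<uint16_t, 8> &V2,
                                              const std::array<int, 8> &Mask) {
  std::array<uint16_t, 8> R{};
  for (int I = 0; I != 8; ++I) {
    int M = Mask[I];
    if (M < 0)
      continue; // undef lane: any value is acceptable
    R[I] = M < 8 ? V1[M] : V2[M - 8];
  }
  return R;
}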
// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
// the active subvector is extracted.
@@ -15705,6 +16189,8 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i16:
return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+ case MVT::v8f16:
+ return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i8:
return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
@@ -16083,22 +16569,13 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
- MayFoldLoad(peekThroughOneUseBitcasts(V1))) {
+ X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
+ MVT MemVT = VT.getHalfNumVectorElementsVT();
+ unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
- if (!Ld->isNonTemporal()) {
- MVT MemVT = VT.getHalfNumVectorElementsVT();
- unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
- SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ptr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
- TypeSize::Fixed(Ofs), DL);
- SDValue Ops[] = {Ld->getChain(), Ptr};
- SDValue BcastLd = DAG.getMemIntrinsicNode(
- X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops, MemVT,
- DAG.getMachineFunction().getMachineMemOperand(
- Ld->getMemOperand(), Ofs, MemVT.getStoreSize()));
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
- return BcastLd;
- }
+ if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
+ VT, MemVT, Ld, Ofs, DAG))
+ return BcstLd;
}
// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
@@ -17569,6 +18046,13 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
}
+ if (VT == MVT::v16f16) {
+ V1 = DAG.getBitcast(MVT::v16i16, V1);
+ V2 = DAG.getBitcast(MVT::v16i16, V2);
+ return DAG.getBitcast(MVT::v16f16,
+ DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
+ }
+
switch (VT.SimpleTy) {
case MVT::v4f64:
return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
@@ -18135,6 +18619,13 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
+ if (VT == MVT::v32f16) {
+ V1 = DAG.getBitcast(MVT::v32i16, V1);
+ V2 = DAG.getBitcast(MVT::v32i16, V2);
+ return DAG.getBitcast(MVT::v32f16,
+ DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
+ }
+
// Dispatch to each element type for lowering. If we don't have support for
// specific element type shuffles at 512 bits, immediately split them and
// lower them. Each lowering routine of a given type is allowed to assume that
@@ -18431,7 +18922,13 @@ static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
return false;
}
-/// Top-level lowering for x86 vector shuffles.
+// Forward declaration.
+static SDValue canonicalizeShuffleMaskWithHorizOp(
+ MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
+ unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget);
+
+/// Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
/// vector shuffles. Most of the specific lowering strategies are encapsulated
@@ -18489,7 +18986,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
APInt Zeroable = KnownUndef | KnownZero;
- if (Zeroable.isAllOnesValue())
+ if (Zeroable.isAllOnes())
return getZeroVector(VT, Subtarget, DAG, DL);
bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
@@ -18540,8 +19037,22 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
}
}
+ SmallVector<SDValue> Ops = {V1, V2};
+ SmallVector<int> Mask(OrigMask.begin(), OrigMask.end());
+
+ // Canonicalize the shuffle with any horizontal ops inputs.
+ // NOTE: This may update Ops and Mask.
+ if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
+ Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
+ return DAG.getBitcast(VT, HOp);
+
+ V1 = DAG.getBitcast(VT, Ops[0]);
+ V2 = DAG.getBitcast(VT, Ops[1]);
+ assert(NumElements == (int)Mask.size() &&
+ "canonicalizeShuffleMaskWithHorizOp "
+ "shouldn't alter the shuffle mask size");
+
// Commute the shuffle if it will improve canonicalization.
- SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
if (canonicalizeShuffleMaskWithCommute(Mask)) {
ShuffleVectorSDNode::commuteMask(Mask);
std::swap(V1, V2);
@@ -18686,8 +19197,8 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
if (VT.getSizeInBits() == 8) {
// If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
// we're going to zero extend the register or fold the store.
- if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) &&
- !MayFoldIntoStore(Op))
+ if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
+ !X86::mayFoldIntoStore(Op))
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
@@ -18840,14 +19351,18 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
MVT VT = Op.getSimpleValueType();
- if (VT.getSizeInBits() == 16) {
+ if (VT == MVT::i16) {
// If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
// we're going to zero extend the register or fold the store (SSE41 only).
- if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
- !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
+ if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
+ !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
+ if (Subtarget.hasFP16())
+ return Op;
+
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
+ }
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
@@ -18886,12 +19401,13 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
- if (VT.getSizeInBits() == 32) {
+ if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
if (IdxVal == 0)
return Op;
- // SHUFPS the element to the lowest double word, then movss.
- int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
+ // Shuffle the element to the lowest element, then movss or movsh.
+ SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
+ Mask[0] = static_cast<int>(IdxVal);
Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0, dl));
@@ -18994,17 +19510,28 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
bool IsZeroElt = X86::isZeroNode(N1);
bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
- // If we are inserting a element, see if we can do this more efficiently with
- // a blend shuffle with a rematerializable vector than a costly integer
- // insertion.
- if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
- (16 <= EltSizeInBits || (IsZeroElt && !VT.is128BitVector()))) {
- SmallVector<int, 8> BlendMask;
- for (unsigned i = 0; i != NumElts; ++i)
- BlendMask.push_back(i == IdxVal ? i + NumElts : i);
- SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
- : getOnesVector(VT, DAG, dl);
- return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
+ if (IsZeroElt || IsAllOnesElt) {
+ // Lower insertion of i8 -1 as an 'OR' blend.
+ // We don't deal with i8 0 since it appears to be handled elsewhere.
+ if (IsAllOnesElt && EltSizeInBits == 8 && !Subtarget.hasSSE41()) {
+ SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
+ SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
+ SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
+ CstVectorElts[IdxVal] = OnesCst;
+ SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
+ return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
+ }
+ // See if we can do this more efficiently with a blend shuffle with a
+ // rematerializable vector.
+ if (Subtarget.hasSSE41() &&
+ (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
+ SmallVector<int, 8> BlendMask;
+ for (unsigned i = 0; i != NumElts; ++i)
+ BlendMask.push_back(i == IdxVal ? i + NumElts : i);
+ SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
+ : getOnesVector(VT, DAG, dl);
+ return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
+ }
}
// If the vector is wider than 128 bits, extract the 128-bit subvector, insert
@@ -19024,12 +19551,28 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
}
}
+ unsigned NumEltsIn128 = 128 / EltSizeInBits;
+ assert(isPowerOf2_32(NumEltsIn128) &&
+ "Vectors will always have power-of-two number of elements.");
+
+ // If we are not inserting into the low 128-bit vector chunk,
+ // then prefer the broadcast+blend sequence.
+ // FIXME: relax the profitability check iff all N1 uses are insertions.
+ if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 &&
+ ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
+ (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
+ X86::mayFoldLoad(N1, Subtarget)))) {
+ SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
+ SmallVector<int, 8> BlendMask;
+ for (unsigned i = 0; i != NumElts; ++i)
+ BlendMask.push_back(i == IdxVal ? i + NumElts : i);
+ return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
+ }
+
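// For illustration (not part of the change): the blend mask built above picks
// lane IdxVal from the splatted scalar and every other lane from the original
// vector, e.g. for 8 elements and IdxVal == 5 it is {0, 1, 2, 3, 4, 13, 6, 7}
// (13 == 5 + NumElts).
#include <vector>
static std::vector<int> insertionBlendMask(unsigned NumElts, unsigned IdxVal) {
  std::vector<int> BlendMask;
  for (unsigned I = 0; I != NumElts; ++I)
    BlendMask.push_back(I == IdxVal ? (int)(I + NumElts) : (int)I);
  return BlendMask;
}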
// Get the desired 128-bit vector chunk.
SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
// Insert the element into the desired chunk.
- unsigned NumEltsIn128 = 128 / EltSizeInBits;
- assert(isPowerOf2_32(NumEltsIn128));
// Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
@@ -19041,10 +19584,10 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
- // This will be just movd/movq/movss/movsd.
+ // This will be just movw/movd/movq/movsh/movss/movsd.
if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
- EltVT == MVT::i64) {
+ EltVT == MVT::f16 || EltVT == MVT::i64) {
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
}
@@ -19091,7 +19634,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// combine either bitwise AND or insert of float 0.0 to set these bits.
bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
- if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
+ if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
// If this is an insertion of 32-bits into the low 32-bits of
// a vector, we prefer to generate a blend with immediate rather
// than an insertps. Blends are simpler operations in hardware and so
@@ -19143,8 +19686,9 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
"Expected an SSE type!");
- // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
- if (OpVT == MVT::v4i32)
+ // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
+ // tblgen.
+ if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
return Op;
SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
@@ -19207,7 +19751,7 @@ unsigned X86TargetLowering::getGlobalWrapperKind(
return X86ISD::WrapperRIP;
// GOTPCREL references must always use RIP.
- if (OpFlags == X86II::MO_GOTPCREL)
+ if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
return X86ISD::WrapperRIP;
return X86ISD::Wrapper;
@@ -19682,92 +20226,6 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
}
-static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
- assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
- "Unexpected funnel shift opcode!");
-
- SDLoc DL(Op);
- SDValue Op0 = Op.getOperand(0);
- SDValue Op1 = Op.getOperand(1);
- SDValue Amt = Op.getOperand(2);
-
- bool IsFSHR = Op.getOpcode() == ISD::FSHR;
-
- if (VT.isVector()) {
- assert(Subtarget.hasVBMI2() && "Expected VBMI2");
-
- if (IsFSHR)
- std::swap(Op0, Op1);
-
- // With AVX512, but not VLX we need to widen to get a 512-bit result type.
- if (!Subtarget.hasVLX() && !VT.is512BitVector()) {
- Op0 = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
- Op1 = widenSubVector(Op1, false, Subtarget, DAG, DL, 512);
- }
-
- SDValue Funnel;
- APInt APIntShiftAmt;
- MVT ResultVT = Op0.getSimpleValueType();
- if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
- uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
- Funnel =
- DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, ResultVT, Op0,
- Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
- } else {
- if (!Subtarget.hasVLX() && !VT.is512BitVector())
- Amt = widenSubVector(Amt, false, Subtarget, DAG, DL, 512);
- Funnel = DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL,
- ResultVT, Op0, Op1, Amt);
- }
- if (!Subtarget.hasVLX() && !VT.is512BitVector())
- Funnel = extractSubVector(Funnel, 0, DAG, DL, VT.getSizeInBits());
- return Funnel;
- }
- assert(
- (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
- "Unexpected funnel shift type!");
-
- // Expand slow SHLD/SHRD cases if we are not optimizing for size.
- bool OptForSize = DAG.shouldOptForSize();
- bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
-
- // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
- // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
- if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
- !isa<ConstantSDNode>(Amt)) {
- unsigned EltSizeInBits = VT.getScalarSizeInBits();
- SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
- SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
- Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
- Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
- Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
- SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
- Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
- if (IsFSHR) {
- Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
- } else {
- Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
- Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
- }
- return DAG.getZExtOrTrunc(Res, DL, VT);
- }
-
- if (VT == MVT::i8 || ExpandFunnel)
- return SDValue();
-
- // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
- if (VT == MVT::i16) {
- Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
- DAG.getConstant(15, DL, Amt.getValueType()));
- unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
- return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
- }
-
- return Op;
-}
-
// Try to use a packed vector operation to handle i64 on 32-bit targets when
// AVX512DQ is enabled.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
@@ -19811,6 +20269,43 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
DAG.getIntPtrConstant(0, dl));
}
+// Try to use a packed vector operation to handle i64 on 32-bit targets.
+static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert((Op.getOpcode() == ISD::SINT_TO_FP ||
+ Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
+ Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
+ Op.getOpcode() == ISD::UINT_TO_FP) &&
+ "Unexpected opcode!");
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT VT = Op.getSimpleValueType();
+
+ if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
+ return SDValue();
+
+ // Pack the i64 into a vector, do the operation and extract.
+
+ assert(Subtarget.hasFP16() && "Expected FP16");
+
+ SDLoc dl(Op);
+ SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
+ if (IsStrict) {
+ SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
+ {Op.getOperand(0), InVec});
+ SDValue Chain = CvtVec.getValue(1);
+ SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getMergeValues({Value, Chain}, dl);
+ }
+
+ SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
+ DAG.getIntPtrConstant(0, dl));
+}
+
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
const X86Subtarget &Subtarget) {
switch (Opcode) {
@@ -20024,6 +20519,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
+ if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
+ return LowerWin64_INT128_TO_FP(Op, DAG);
+
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
@@ -20063,6 +20561,8 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
+ if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
+ return V;
// SSE doesn't have an i16 conversion so we need to promote.
if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
@@ -20521,6 +21021,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
if (DstVT.isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
+ if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
+ return LowerWin64_INT128_TO_FP(Op, DAG);
+
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
@@ -20542,6 +21045,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
+ if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
+ return V;
// The transform for i64->f64 isn't correct for 0 when rounding to negative
// infinity. It produces -0.0, so disable under strictfp.
@@ -21323,9 +21828,11 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
MVT VT = Op->getSimpleValueType(0);
SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+ SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
MVT SrcVT = Src.getSimpleValueType();
SDLoc dl(Op);
+ SDValue Res;
if (VT.isVector()) {
if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
MVT ResVT = MVT::v4i32;
@@ -21350,10 +21857,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
DAG.getIntPtrConstant(0, dl));
}
- SDValue Res, Chain;
if (IsStrict) {
- Res =
- DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
+ Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(Opc, dl, ResVT, Src);
@@ -21367,6 +21872,67 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
return Res;
}
+ if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
+ if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
+ return Op;
+
+ MVT ResVT = VT;
+ MVT EleVT = VT.getVectorElementType();
+ if (EleVT != MVT::i64)
+ ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
+
+ if (SrcVT != MVT::v8f16) {
+ SDValue Tmp =
+ IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
+ SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
+ Ops[0] = Src;
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
+ }
+
+ if (IsStrict) {
+ Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
+ : X86ISD::STRICT_CVTTP2UI,
+ dl, {ResVT, MVT::Other}, {Chain, Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
+ ResVT, Src);
+ }
+
+ // TODO: Need to add exception check code for strict FP.
+ if (EleVT.getSizeInBits() < 16) {
+ ResVT = MVT::getVectorVT(EleVT, 8);
+ Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
+ }
+
+ if (ResVT != VT)
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
+ DAG.getIntPtrConstant(0, dl));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+ return Res;
+ }
+
+ if (VT == MVT::v8i16 && (SrcVT == MVT::v8f32 || SrcVT == MVT::v8f64)) {
+ if (IsStrict) {
+ Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
+ : ISD::STRICT_FP_TO_UINT,
+ dl, {MVT::v8i32, MVT::Other}, {Chain, Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
+ MVT::v8i32, Src);
+ }
+
+ // TODO: Need to add exception check code for strict FP.
+ Res = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i16, Res);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+ return Res;
+ }
+
// v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
assert(!IsSigned && "Expected unsigned conversion!");
@@ -21390,10 +21956,9 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
DAG.getIntPtrConstant(0, dl));
- SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
- {Op->getOperand(0), Src});
+ {Chain, Src});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
@@ -21421,10 +21986,9 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
DAG.getIntPtrConstant(0, dl));
- SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
- {Op->getOperand(0), Src});
+ {Chain, Src});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
@@ -21449,7 +22013,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
{Src, Zero, Zero, Zero});
Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
- {Op->getOperand(0), Tmp});
+ {Chain, Tmp});
SDValue Chain = Tmp.getValue(1);
Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
DAG.getIntPtrConstant(0, dl));
@@ -21532,17 +22096,16 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
// FIXME: This does not generate an invalid exception if the input does not
// fit in i32. PR44019
if (Subtarget.is64Bit()) {
- SDValue Res, Chain;
if (IsStrict) {
- Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
- { Op.getOperand(0), Src });
+ Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
+ {Chain, Src});
Chain = Res.getValue(1);
} else
Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
if (IsStrict)
- return DAG.getMergeValues({ Res, Chain }, dl);
+ return DAG.getMergeValues({Res, Chain}, dl);
return Res;
}
@@ -21557,17 +22120,16 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
// fit in i16. PR44019
if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
- SDValue Res, Chain;
if (IsStrict) {
- Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
- { Op.getOperand(0), Src });
+ Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
+ {Chain, Src});
Chain = Res.getValue(1);
} else
Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
if (IsStrict)
- return DAG.getMergeValues({ Res, Chain }, dl);
+ return DAG.getMergeValues({Res, Chain}, dl);
return Res;
}
@@ -21583,7 +22145,6 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
else
LC = RTLIB::getFPTOUINT(SrcVT, VT);
- SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
MakeLibCallOptions CallOptions;
std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
SDLoc(Op), Chain);
@@ -21595,7 +22156,6 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
}
// Fall back to X87.
- SDValue Chain;
if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
if (IsStrict)
return DAG.getMergeValues({V, Chain}, dl);
@@ -21822,6 +22382,35 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
if (VT == MVT::f128)
return SDValue();
+ if (VT == MVT::f80) {
+ if (SVT == MVT::f16) {
+ assert(Subtarget.hasFP16() && "Unexpected features!");
+ RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
+ MakeLibCallOptions CallOptions;
+ std::pair<SDValue, SDValue> Tmp =
+ makeLibCall(DAG, LC, VT, In, CallOptions, DL,
+ IsStrict ? Op.getOperand(0) : SDValue());
+ if (IsStrict)
+ return DAG.getMergeValues({Tmp.first, Tmp.second}, DL);
+ else
+ return Tmp.first;
+ }
+ return Op;
+ }
+
+ if (SVT.getVectorElementType() == MVT::f16) {
+ assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!");
+ if (SVT == MVT::v2f16)
+ In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
+ DAG.getUNDEF(MVT::v2f16));
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
+ DAG.getUNDEF(MVT::v4f16));
+ if (IsStrict)
+ return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
+ {Op->getOperand(0), Res});
+ return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
+ }
+
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
SDValue Res =
@@ -21835,8 +22424,11 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
- // It's legal except when f128 is involved
- if (In.getSimpleValueType() != MVT::f128)
+ MVT VT = Op.getSimpleValueType();
+ MVT SVT = In.getSimpleValueType();
+
+ // It's legal except when f128 is involved or we're converting f80->f16.
+ if (SVT != MVT::f128 && !(VT == MVT::f16 && SVT == MVT::f80))
return Op;
return SDValue();
@@ -22026,9 +22618,8 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
bool IsF128 = (VT == MVT::f128);
- assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
- VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
- VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
+ assert(VT.isFloatingPoint() && VT != MVT::f80 &&
+ DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
"Unexpected type in LowerFABSorFNEG");
// FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
@@ -22042,7 +22633,9 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
bool IsFakeVector = !VT.isVector() && !IsF128;
MVT LogicVT = VT;
if (IsFakeVector)
- LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+ LogicVT = (VT == MVT::f64) ? MVT::v2f64
+ : (VT == MVT::f32) ? MVT::v4f32
+ : MVT::v8f16;
unsigned EltBits = VT.getScalarSizeInBits();
// For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
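// A scalar sketch (not from this patch) of what the "fake vector" logic op
// computes: FABS clears the sign bit with an AND of 0x7f..., FNEG flips it
// with an XOR of 0x80..., shown here on the raw bits of a float.
#include <cstdint>
#include <cstring>
static float fabsViaMask(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits &= 0x7FFFFFFFu; // FABS: clear the sign bit
  std::memcpy(&X, &Bits, sizeof(X));
  return X;
}
static float fnegViaMask(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u; // FNEG: flip the sign bit
  std::memcpy(&X, &Bits, sizeof(X));
  return X;
}
// e.g. fabsViaMask(-1.5f) == 1.5f and fnegViaMask(2.0f) == -2.0f.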
@@ -22087,9 +22680,8 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
// At this point the operands and the result should have the same
// type, and that won't be f80 since that is not custom lowered.
bool IsF128 = (VT == MVT::f128);
- assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
- VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
- VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
+ assert(VT.isFloatingPoint() && VT != MVT::f80 &&
+ DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
"Unexpected type in LowerFCOPYSIGN");
const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
@@ -22102,7 +22694,9 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
bool IsFakeVector = !VT.isVector() && !IsF128;
MVT LogicVT = VT;
if (IsFakeVector)
- LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+ LogicVT = (VT == MVT::f64) ? MVT::v2f64
+ : (VT == MVT::f32) ? MVT::v4f32
+ : MVT::v8f16;
// The mask constants are automatically splatted for vector types.
unsigned EltSizeInBits = VT.getScalarSizeInBits();
@@ -22208,7 +22802,7 @@ static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
return false;
unsigned NumElts = VT.getVectorNumElements();
- APInt EltCount = APInt::getNullValue(NumElts);
+ APInt EltCount = APInt::getZero(NumElts);
M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
SrcOps.push_back(Src);
}
@@ -22227,7 +22821,7 @@ static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
} else {
// Quit if not all elements are used.
for (const auto &I : SrcOpMap)
- if (!I.second.isAllOnesValue())
+ if (!I.second.isAllOnes())
return false;
}
@@ -22250,7 +22844,7 @@ static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
auto MaskBits = [&](SDValue Src) {
- if (Mask.isAllOnesValue())
+ if (Mask.isAllOnes())
return Src;
EVT SrcVT = Src.getValueType();
SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
@@ -22288,8 +22882,8 @@ static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
// Without PTEST, a masked v2i64 or-reduction is not faster than
// scalarization.
- if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)
- return SDValue();
+ if (!Mask.isAllOnes() && VT.getScalarSizeInBits() > 32)
+ return SDValue();
V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
@@ -22312,7 +22906,7 @@ static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
// Check whether we're masking/truncating an OR-reduction result, in which
// case track the masked bits.
- APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
+ APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
switch (Op.getOpcode()) {
case ISD::TRUNCATE: {
SDValue Src = Op.getOperand(0);
@@ -22543,16 +23137,10 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
// For equality comparisons try to use SIGN_EXTEND if the input was
// truncate from something with enough sign bits.
if (Op0.getOpcode() == ISD::TRUNCATE) {
- SDValue In = Op0.getOperand(0);
- unsigned EffBits =
- In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
- if (EffBits <= 16)
+ if (DAG.ComputeMinSignedBits(Op0.getOperand(0)) <= 16)
ExtendOp = ISD::SIGN_EXTEND;
} else if (Op1.getOpcode() == ISD::TRUNCATE) {
- SDValue In = Op1.getOperand(0);
- unsigned EffBits =
- In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
- if (EffBits <= 16)
+ if (DAG.ComputeMinSignedBits(Op1.getOperand(0)) <= 16)
ExtendOp = ISD::SIGN_EXTEND;
}
}
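// A standalone sketch (not from this patch) of the "minimum signed bits" idea
// used above: if a 32-bit value needs at most 16 signed bits, sign-extending
// its 16-bit truncation reproduces it, so the equality compare can be done on
// the sign-extended form. __builtin_clrsb is the GCC/Clang builtin counting
// redundant leading sign bits.
#include <cassert>
static unsigned minSignedBits(int X) {
  return 32u - (unsigned)__builtin_clrsb(X);
}
static void checkSExtRoundTrip(int X) {
  if (minSignedBits(X) <= 16)
    assert((int)(short)X == X && "16-bit sign-extension is lossless");
}
// e.g. minSignedBits(1000) == 11, minSignedBits(-1) == 1,
//      minSignedBits(40000) == 17.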
@@ -22618,6 +23206,7 @@ SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
int &RefinementSteps,
bool &UseOneConstNR,
bool Reciprocal) const {
+ SDLoc DL(Op);
EVT VT = Op.getValueType();
// SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
@@ -22639,7 +23228,23 @@ SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
UseOneConstNR = false;
// There is no FSQRT for 512-bits, but there is RSQRT14.
unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
- return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
+ return DAG.getNode(Opcode, DL, VT, Op);
+ }
+
+ if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
+ Subtarget.hasFP16()) {
+ if (RefinementSteps == ReciprocalEstimate::Unspecified)
+ RefinementSteps = 0;
+
+ if (VT == MVT::f16) {
+ SDValue Zero = DAG.getIntPtrConstant(0, DL);
+ SDValue Undef = DAG.getUNDEF(MVT::v8f16);
+ Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
+ Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
+ }
+
+ return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
}
return SDValue();
}
@@ -22649,6 +23254,7 @@ SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
int Enabled,
int &RefinementSteps) const {
+ SDLoc DL(Op);
EVT VT = Op.getValueType();
// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
@@ -22673,7 +23279,23 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
// There is no FSQRT for 512-bits, but there is RCP14.
unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
- return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
+ return DAG.getNode(Opcode, DL, VT, Op);
+ }
+
+ if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
+ Subtarget.hasFP16()) {
+ if (RefinementSteps == ReciprocalEstimate::Unspecified)
+ RefinementSteps = 0;
+
+ if (VT == MVT::f16) {
+ SDValue Zero = DAG.getIntPtrConstant(0, DL);
+ SDValue Undef = DAG.getUNDEF(MVT::v8f16);
+ Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
+ Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
+ }
+
+ return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
}
return SDValue();
}
@@ -22696,7 +23318,7 @@ X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
if (isIntDivCheap(N->getValueType(0), Attr))
return SDValue(N,0); // Lower SDIV as SDIV
- assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
+ assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
"Unexpected divisor!");
// Only perform this transform if CMOV is supported otherwise the select
@@ -22956,7 +23578,7 @@ static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
// Avoid overflow/underflow.
const APInt &EltC = Elt->getAPIntValue();
- if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
+ if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
return SDValue();
NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
@@ -23037,7 +23659,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
if (isFP) {
#ifndef NDEBUG
MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
- assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+ assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
#endif
bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
@@ -23051,7 +23673,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
(!IsStrict || Subtarget.hasVLX() ||
Op0.getSimpleValueType().is512BitVector())) {
- assert(VT.getVectorNumElements() <= 16);
+#ifndef NDEBUG
+ unsigned Num = VT.getVectorNumElements();
+ assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
+#endif
Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
} else {
Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
@@ -23272,7 +23897,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
Cond = ISD::SETGT;
else if (ConstValue.isMaxSignedValue())
Cond = ISD::SETLT;
- else if (ConstValue.isNullValue() && DAG.SignBitIsZero(Op0))
+ else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
Cond = ISD::SETGT;
}
@@ -23625,7 +24250,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
const APInt &Op1Val = Op1C->getAPIntValue();
- if (!Op1Val.isNullValue()) {
+ if (!Op1Val.isZero()) {
// Ensure the constant+1 doesn't overflow.
if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
(CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
@@ -24053,8 +24678,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// being inserted between two CMOV's. (in i16 case too TBN)
// https://bugs.llvm.org/show_bug.cgi?id=40974
if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
- (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
- !MayFoldLoad(Op2))) {
+ (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
+ !X86::mayFoldLoad(Op2, Subtarget))) {
Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
SDValue Ops[] = { Op2, Op1, CC, Cond };
@@ -24699,8 +25324,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
DAG.getRegister(Vreg, SPTy));
} else {
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
- MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
+ Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
+ MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
Register SPReg = RegInfo->getStackRegister();
@@ -24814,7 +25439,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
}
if (ArgMode == 2) {
- // Sanity Check: Make sure using fp_offset makes sense.
+ // Make sure using fp_offset makes sense.
assert(!Subtarget.useSoftFloat() &&
!(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
Subtarget.hasSSE1());
@@ -25554,6 +26179,35 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// Swap Src1 and Src2 in the node creation
return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
}
+ case CFMA_OP_MASKZ:
+ case CFMA_OP_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ MVT VT = Op.getSimpleValueType();
+
+ SDValue PassThru = Src3;
+ if (IntrData->Type == CFMA_OP_MASKZ)
+ PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+
+ // We add rounding mode to the Node when
+ // - RC Opcode is specified and
+ // - RC is not "current direction".
+ SDValue NewOp;
+ if (IntrData->Opc1 != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ unsigned RC = 0;
+ if (isRoundModeSAEToX(Rnd, RC))
+ NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
+ DAG.getTargetConstant(RC, dl, MVT::i32));
+ else if (!isRoundModeCurDirection(Rnd))
+ return SDValue();
+ }
+ if (!NewOp)
+ NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
+ return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
+ }
case IFMA_OP:
// NOTE: We need to swizzle the operands to pass the multiply operands
// first.
@@ -26165,6 +26819,19 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
getPointerTy(DAG.getDataLayout())),
Op.getOperand(1), ShAmt);
}
+ case Intrinsic::thread_pointer: {
+ if (Subtarget.isTargetELF()) {
+ SDLoc dl(Op);
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
+ Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(
+ *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+ DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
+ }
+ report_fatal_error(
+ "Target OS doesn't support __builtin_thread_pointer() yet.");
+ }
}
}
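// A hedged sketch (not part of this patch) of what the ELF lowering above
// amounts to, assuming x86-64 Linux and GCC/Clang extended asm: the thread
// pointer is the value stored at offset 0 from the %fs segment base (or %gs
// on 32-bit), so reading it is a single segment-relative load.
static void *readThreadPointerX86_64() {
  void *TP;
  asm("movq %%fs:0, %0" : "=r"(TP));
  return TP;
}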
@@ -26469,6 +27136,12 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
DAG.getConstant(0, dl, MVT::i32),
DAG.getConstant(0, dl, MVT::i32));
}
+ case llvm::Intrinsic::asan_check_memaccess: {
+ // Mark this as adjustsStack because it will be lowered to a call.
+ DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
+ // Don't do anything here, we will expand these intrinsics out later.
+ return Op;
+ }
case llvm::Intrinsic::x86_flags_read_u32:
case llvm::Intrinsic::x86_flags_read_u64:
case llvm::Intrinsic::x86_flags_write_u32:
@@ -27044,11 +27717,11 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
if (!Attrs.isEmpty() && !Func->isVarArg()) {
unsigned InRegCount = 0;
- unsigned Idx = 1;
+ unsigned Idx = 0;
for (FunctionType::param_iterator I = FTy->param_begin(),
E = FTy->param_end(); I != E; ++I, ++Idx)
- if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
+ if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
const DataLayout &DL = DAG.getDataLayout();
// FIXME: should only count parameters that are lowered to integers.
InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
@@ -27517,15 +28190,51 @@ static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
EVT SetCCResultType =
TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
- if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
- // usubsat X, Y --> (X >u Y) ? X - Y : 0
- SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
- SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
- // TODO: Move this to DAGCombiner?
- if (SetCCResultType == VT &&
- DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
- return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
- return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
+ unsigned BitWidth = VT.getScalarSizeInBits();
+ if (Opcode == ISD::USUBSAT) {
+ if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
+ // Handle a special-case with a bit-hack instead of cmp+select:
+ // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
+ // If the target can use VPTERNLOG, DAGToDAG will match this as
+ // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
+ // "broadcast" constant load.
+ ConstantSDNode *C = isConstOrConstSplat(Y, true);
+ if (C && C->getAPIntValue().isSignMask()) {
+ SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
+ SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
+ SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
+ SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
+ return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
+ }
+ }
+ if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
+ // usubsat X, Y --> (X >u Y) ? X - Y : 0
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
+ SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
+ // TODO: Move this to DAGCombiner?
+ if (SetCCResultType == VT &&
+ DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
+ return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
+ return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
+ }
+ }
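// The bit-hack above, verified exhaustively for i8 in a standalone sketch
// (not from this patch): usubsat(X, 0x80) is X - 0x80 when the sign bit of X
// is set and 0 otherwise, and (X ^ 0x80) & (X s>> 7) computes exactly that,
// because the arithmetic shift yields an all-ones or all-zeros mask.
#include <cassert>
#include <cstdint>
static void checkUSubSatSignMaskHack() {
  for (unsigned V = 0; V != 256; ++V) {
    uint8_t X = (uint8_t)V;
    uint8_t Expected = X >= 0x80 ? (uint8_t)(X - 0x80) : 0; // usubsat X, SMIN
    uint8_t Sra = (uint8_t)((int8_t)X >> 7); // all-ones iff sign bit set
    uint8_t Hack = (uint8_t)((X ^ 0x80) & Sra);
    assert(Hack == Expected);
  }
}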
+
+ if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
+ (!VT.isVector() || VT == MVT::v2i64)) {
+ APInt MinVal = APInt::getSignedMinValue(BitWidth);
+ APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue Result =
+ DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
+ DAG.getVTList(VT, SetCCResultType), X, Y);
+ SDValue SumDiff = Result.getValue(0);
+ SDValue Overflow = Result.getValue(1);
+ SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
+ SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
+ SDValue SumNeg =
+ DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
+ Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
+ return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
}
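// A scalar sketch (not from this patch) of the saddsat/ssubsat expansion
// above, with the compiler overflow builtin standing in for ISD::SADDO: on
// overflow the sign of the wrapped sum is inverted, so "wrapped sum negative"
// selects the positive saturation value.
#include <cstdint>
#include <limits>
static int8_t saddsat8(int8_t X, int8_t Y) {
  int8_t SumDiff;
  bool Overflow = __builtin_add_overflow(X, Y, &SumDiff); // SADDO
  int8_t SatMax = std::numeric_limits<int8_t>::max();     // 127
  int8_t SatMin = std::numeric_limits<int8_t>::min();     // -128
  int8_t Saturated = SumDiff < 0 ? SatMax : SatMin; // select(SumNeg, SatMax, SatMin)
  return Overflow ? Saturated : SumDiff;            // select(Overflow, Saturated, SumDiff)
}
// e.g. saddsat8(100, 100) == 127 and saddsat8(-100, -100) == -128.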
// Use default expansion.
@@ -27542,7 +28251,7 @@ static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
SDValue N0 = Op.getOperand(0);
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
DAG.getConstant(0, DL, VT), N0);
- SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
+ SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
SDValue(Neg.getNode(), 1)};
return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
}
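// For illustration (not part of the change): the scalar select structure of
// the CMOV-based ABS lowering. COND_NS tests only the sign flag of the
// negation, i.e. "take 0 - X when that value is non-negative, otherwise keep
// X"; INT_MIN stays INT_MIN, matching ISD::ABS.
#include <cstdint>
static int32_t absViaNegCmov(int32_t X) {
  int32_t Neg = (int32_t)(0u - (uint32_t)X); // X86ISD::SUB 0, X (may wrap)
  return Neg >= 0 ? Neg : X;                 // CMOV on COND_NS of that SUB
}
// e.g. absViaNegCmov(-7) == 7, absViaNegCmov(7) == 7,
//      absViaNegCmov(INT32_MIN) == INT32_MIN.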
@@ -27646,9 +28355,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
// Multiply, mask the lower 8bits of the lo/hi results and pack.
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
- RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
- RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
- return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+ return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
}
// Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
@@ -27801,19 +28508,10 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
- if (Low) {
- // Mask the lower bits and pack the results to rejoin the halves.
- SDValue Mask = DAG.getConstant(255, dl, ExVT);
- SDValue LLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, Mask);
- SDValue LHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, Mask);
- *Low = DAG.getNode(X86ISD::PACKUS, dl, VT, LLo, LHi);
- }
-
- RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
- RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
+ if (Low)
+ *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
- // Bitcast back to VT and then pack all the even elements from Lo and Hi.
- return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+ return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
}
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
@@ -28111,9 +28809,80 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
return DAG.getBitcast(VT, CallInfo.first);
}
+SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
+ SelectionDAG &DAG,
+ SDValue &Chain) const {
+ assert(Subtarget.isTargetWin64() && "Unexpected target");
+ EVT VT = Op.getValueType();
+ bool IsStrict = Op->isStrictFPOpcode();
+
+ SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
+ EVT ArgVT = Arg.getValueType();
+
+ assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
+ "Unexpected return type for lowering");
+
+ RTLIB::Libcall LC;
+ if (Op->getOpcode() == ISD::FP_TO_SINT ||
+ Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
+ LC = RTLIB::getFPTOSINT(ArgVT, VT);
+ else
+ LC = RTLIB::getFPTOUINT(ArgVT, VT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
+
+ SDLoc dl(Op);
+ MakeLibCallOptions CallOptions;
+ Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
+
+ SDValue Result;
+ // The i128 result is returned as a v2i64 in xmm0; cast it back to the
+ // expected VT (i128).
+ std::tie(Result, Chain) =
+ makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
+ Result = DAG.getBitcast(VT, Result);
+ return Result;
+}
+
+SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget.isTargetWin64() && "Unexpected target");
+ EVT VT = Op.getValueType();
+ bool IsStrict = Op->isStrictFPOpcode();
+
+ SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
+ EVT ArgVT = Arg.getValueType();
+
+ assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
+ "Unexpected argument type for lowering");
+
+ RTLIB::Libcall LC;
+ if (Op->getOpcode() == ISD::SINT_TO_FP ||
+ Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
+ LC = RTLIB::getSINTTOFP(ArgVT, VT);
+ else
+ LC = RTLIB::getUINTTOFP(ArgVT, VT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
+
+ SDLoc dl(Op);
+ MakeLibCallOptions CallOptions;
+ SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
+
+ // Pass the i128 argument as an indirect argument on the stack.
+ SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+ Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
+
+ SDValue Result;
+ std::tie(Result, Chain) =
+ makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
+ return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
+}
+
// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget
-static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
+static bool supportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
if (VT.getScalarSizeInBits() < 16)
return false;
@@ -28133,14 +28902,14 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
// The shift amount is a variable, but it is the same for all vector lanes.
// These instructions are defined together with shift-immediate.
static
-bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
+bool supportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
- return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
+ return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
}
// Return true if the required (according to Opcode) variable-shift form is
// natively supported by the Subtarget
-static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
+static bool supportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
@@ -28216,7 +28985,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
- if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
+ if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
// i64 SRA needs to be performed as partial shifts.
@@ -28231,8 +29000,15 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// Simple i8 add case
- if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
+ if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
+ // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
+ // must be 0). (add undef, undef) however can be any value. To make this
+ // safe, we must freeze R to ensure that register allocation uses the same
+ // register for an undefined value. This ensures that the result will
+ // still be even and preserves the original semantics.
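+ // e.g. if the two uses of an unfrozen undef R were given different values
+ // (say 1 and 2), the add would produce 3, which is odd; freezing R pins
+ // both uses to the same (arbitrary) value, so the sum stays even.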
+ R = DAG.getNode(ISD::FREEZE, dl, VT, R);
return DAG.getNode(ISD::ADD, dl, VT, R, R);
+ }
// ashr(R, 7) === cmp_slt(R, 0)
if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
@@ -28293,7 +29069,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
- if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
+ if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
MVT EltVT = VT.getVectorElementType();
assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
@@ -28311,7 +29087,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
!Subtarget.hasXOP()) {
unsigned NumElts = VT.getVectorNumElements();
MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
- if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
+ if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
@@ -28363,7 +29139,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
- if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
+ if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
}
return SDValue();
@@ -28376,8 +29152,10 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
MVT VT = Amt.getSimpleValueType();
if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
(Subtarget.hasInt256() && VT == MVT::v16i16) ||
- (Subtarget.hasVBMI2() && VT == MVT::v32i16) ||
- (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
+ (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
+ (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
+ (Subtarget.hasInt256() && VT == MVT::v32i8) ||
+ (Subtarget.hasBWI() && VT == MVT::v64i8)))
return SDValue();
if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
@@ -28425,10 +29203,7 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
if (Subtarget.hasSSE41())
return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
-
- return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
- DAG.getBitcast(VT, Hi),
- {0, 2, 4, 6, 8, 10, 12, 14});
+ return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
}
return SDValue();
@@ -28456,9 +29231,23 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
return V;
- if (SupportedVectorVarShift(VT, Subtarget, Opc))
+ if (supportedVectorVarShift(VT, Subtarget, Opc))
return Op;
+ // i64 vector arithmetic shift can be emulated with the transform:
+ // M = lshr(SIGN_MASK, Amt)
+ // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
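+ // e.g. with 8-bit values for brevity, ashr(0xF0, 4):
+ // M = 0x80 >> 4 = 0x08, lshr(0xF0, 4) = 0x0F, 0x0F ^ 0x08 = 0x07,
+ // 0x07 - 0x08 = 0xFF (-1), matching ashr(-16, 4) == -1.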
+ if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
+ (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
+ Opc == ISD::SRA) {
+ SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
+ SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
+ R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+ R = DAG.getNode(ISD::XOR, dl, VT, R, M);
+ R = DAG.getNode(ISD::SUB, dl, VT, R, M);
+ return R;
+ }
+
// XOP has 128-bit variable logical/arithmetic shifts.
// +ve/-ve Amt = shift left/right.
if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
@@ -28484,19 +29273,6 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
}
- // i64 vector arithmetic shift can be emulated with the transform:
- // M = lshr(SIGN_MASK, Amt)
- // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
- if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
- Opc == ISD::SRA) {
- SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
- SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
- R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
- R = DAG.getNode(ISD::XOR, dl, VT, R, M);
- R = DAG.getNode(ISD::SUB, dl, VT, R, M);
- return R;
- }
-
// If possible, lower this shift as a sequence of two shifts by
// constant plus a BLENDing shuffle instead of scalarizing it.
// Example:
@@ -28552,7 +29328,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
- if (Opc == ISD::SHL)
+ // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
+ if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
+ Subtarget.canExtendTo512BW())))
if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
@@ -28920,6 +29698,77 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
return SDValue();
}
+static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
+ "Unexpected funnel shift opcode!");
+
+ SDLoc DL(Op);
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Amt = Op.getOperand(2);
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ bool IsFSHR = Op.getOpcode() == ISD::FSHR;
+
+ if (VT.isVector()) {
+ assert(Subtarget.hasVBMI2() && "Expected VBMI2");
+
+ if (IsFSHR)
+ std::swap(Op0, Op1);
+
+ APInt APIntShiftAmt;
+ if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
+ uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
+ SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
+ return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
+ {Op0, Op1, Imm}, DAG, Subtarget);
+ }
+ return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
+ {Op0, Op1, Amt}, DAG, Subtarget);
+ }
+ assert(
+ (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
+ "Unexpected funnel shift type!");
+
+ // Expand slow SHLD/SHRD cases if we are not optimizing for size.
+ bool OptForSize = DAG.shouldOptForSize();
+ bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
+
+ // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
+ // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
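+ // e.g. i8 fshl(0xAB, 0xCD, 3): concat = 0xABCD, (0xABCD << 3) >> 8 = 0x55E,
+ // truncated to i8 gives 0x5E == (0xAB << 3) | (0xCD >> 5).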
+ if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
+ !isa<ConstantSDNode>(Amt)) {
+ SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
+ SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
+ Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
+ Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
+ Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
+ SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
+ Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
+ if (IsFSHR) {
+ Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
+ } else {
+ Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
+ Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
+ }
+ return DAG.getZExtOrTrunc(Res, DL, VT);
+ }
+
+ if (VT == MVT::i8 || ExpandFunnel)
+ return SDValue();
+
+ // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
+ if (VT == MVT::i16) {
+ Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
+ DAG.getConstant(15, DL, Amt.getValueType()));
+ unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
+ return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
+ }
+
+ return Op;
+}
+
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
@@ -28931,6 +29780,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
unsigned Opcode = Op.getOpcode();
unsigned EltSizeInBits = VT.getScalarSizeInBits();
int NumElts = VT.getVectorNumElements();
+ bool IsROTL = Opcode == ISD::ROTL;
// Check for constant splat rotation amount.
APInt CstSplatValue;
@@ -28944,7 +29794,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
// Attempt to rotate by immediate.
if (IsCstSplat) {
- unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
+ unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
return DAG.getNode(RotOpc, DL, VT, R,
DAG.getTargetConstant(RotAmt, DL, MVT::i8));
@@ -28956,11 +29806,11 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
// AVX512 VBMI2 vXi16 - lower to funnel shifts.
if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
- unsigned FunnelOpc = (Opcode == ISD::ROTL ? ISD::FSHL : ISD::FSHR);
+ unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
}
- assert((Opcode == ISD::ROTL) && "Only ROTL supported");
+ assert(IsROTL && "Only ROTL supported");
// XOP has 128-bit vector variable + immediate rotates.
// +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
@@ -28996,16 +29846,41 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
return SDValue();
bool IsSplatAmt = DAG.isSplatValue(Amt);
+ SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
// v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
// the amount bit.
- if (EltSizeInBits == 8 && !IsSplatAmt) {
+ if (EltSizeInBits == 8) {
if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
return SDValue();
- // We don't need ModuloAmt here as we just peek at individual bits.
+ // Check for a hidden ISD::ROTR; vXi8 lowering can handle both, but we
+ // currently hit infinite loops in legalization if we allow ISD::ROTR.
+ // FIXME: Infinite ROTL<->ROTR legalization in TargetLowering::expandROT.
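+ // (rotl(x, sub(0, y)) == rotr(x, y) since rotate amounts are taken modulo
+ // the element width, which is why a zero-minus pattern hides a ROTR here.)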
+ SDValue HiddenROTRAmt;
+ if (Amt.getOpcode() == ISD::SUB &&
+ ISD::isBuildVectorAllZeros(Amt.getOperand(0).getNode()))
+ HiddenROTRAmt = Amt.getOperand(1);
+
MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+ // If the amount is a splat, attempt to fold as unpack(x,x) << zext(y):
+ // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
+ // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
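+ // e.g. i8 rotl(0xAB, 3): unpack gives 0xABAB, (0xABAB << 3) >> 8 = 0x55D,
+ // truncated to i8 gives 0x5D == (0xAB << 3) | (0xAB >> 5).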
+ if (SDValue BaseRotAmt = DAG.getSplatValue(DAG.getNode(
+ ISD::AND, DL, VT, HiddenROTRAmt ? HiddenROTRAmt : Amt, AmtMask))) {
+ unsigned ShiftX86Opc = HiddenROTRAmt ? X86ISD::VSRLI : X86ISD::VSHLI;
+ BaseRotAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseRotAmt);
+ SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
+ SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
+ Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
+ Subtarget, DAG);
+ Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
+ Subtarget, DAG);
+ return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !HiddenROTRAmt);
+ }
+
+ // We don't need ModuloAmt here as we just peek at individual bits.
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
if (Subtarget.hasSSE41()) {
// On SSE41 targets we can use PBLENDVB which selects bytes based just
@@ -29024,6 +29899,15 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getSelect(DL, SelVT, C, V0, V1);
};
+ // 'Hidden' ROTR is currently only profitable on AVX512 targets where we
+ // have VPTERNLOG.
+ unsigned ShiftLHS = ISD::SHL;
+ unsigned ShiftRHS = ISD::SRL;
+ if (HiddenROTRAmt && useVPTERNLOG(Subtarget, VT)) {
+ std::swap(ShiftLHS, ShiftRHS);
+ Amt = HiddenROTRAmt;
+ }
+
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
// We can safely do this using i16 shifts as we're only interested in
// the 3 lower bits of each byte.
@@ -29035,8 +29919,8 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
SDValue M;
M = DAG.getNode(
ISD::OR, DL, VT,
- DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
- DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
+ DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
+ DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
R = SignBitSelect(VT, Amt, M, R);
// a += a
@@ -29045,8 +29929,8 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
// r = VSELECT(r, rot(r, 2), a);
M = DAG.getNode(
ISD::OR, DL, VT,
- DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
- DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
+ DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
+ DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
R = SignBitSelect(VT, Amt, M, R);
// a += a
@@ -29055,8 +29939,8 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
// return VSELECT(r, rot(r, 1), a);
M = DAG.getNode(
ISD::OR, DL, VT,
- DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
- DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
+ DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
+ DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
return SignBitSelect(VT, Amt, M, R);
}
@@ -29065,18 +29949,16 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
// If the amount is a splat, perform the modulo BEFORE the splat,
// this helps LowerScalarVariableShift to remove the splat later.
Amt = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, BaseRotAmt);
- Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
- DAG.getConstant(EltSizeInBits - 1, DL, VT));
+ Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
Amt = DAG.getVectorShuffle(VT, DL, Amt, DAG.getUNDEF(VT),
SmallVector<int>(NumElts, 0));
} else {
- Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
- DAG.getConstant(EltSizeInBits - 1, DL, VT));
+ Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
}
bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
- bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
- SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
+ bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
+ supportedVectorVarShift(VT, Subtarget, ISD::SRL);
// Fallback for splats + all supported variable shifts.
// Fallback for non-constants AVX2 vXi16 as well.
@@ -29088,9 +29970,11 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
}
- // As with shifts, convert the rotation amount to a multiplication factor.
+ // As with shifts, attempt to convert the rotation amount to a multiplication
+ // factor; otherwise fall back to the general expansion.
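+ // (a per-lane rotate amount of k becomes a per-lane multiplier of 1 << k).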
SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
- assert(Scale && "Failed to convert ROTL amount to scale");
+ if (!Scale)
+ return SDValue();
// v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
if (EltSizeInBits == 16) {
@@ -29803,6 +30687,10 @@ static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
}
+ // If we have POPCNT, use the default expansion.
+ if (Subtarget.hasPOPCNT())
+ return SDValue();
+
if (VT == MVT::i64) {
// Xor the high and low 16-bits together using a 32-bit operation.
SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
@@ -30358,6 +31246,10 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
Mask = ExtendToType(Mask, MaskVT, DAG, true);
}
+ // Break dependency on the data register.
+ if (PassThru.isUndef())
+ PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+
SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
N->getScale() };
SDValue NewGather = DAG.getMemIntrinsicNode(
@@ -30886,6 +31778,51 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
+ if (VT.isVector() && Subtarget.hasFP16() &&
+ SrcVT.getVectorElementType() == MVT::f16) {
+ EVT EleVT = VT.getVectorElementType();
+ EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
+
+ if (SrcVT != MVT::v8f16) {
+ SDValue Tmp =
+ IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
+ SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
+ Ops[0] = Src;
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
+ }
+
+ SDValue Res, Chain;
+ if (IsStrict) {
+ unsigned Opc =
+ IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
+ Res =
+ DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+ Res = DAG.getNode(Opc, dl, ResVT, Src);
+ }
+
+ // TODO: Need to add exception check code for strict FP.
+ if (EleVT.getSizeInBits() < 16) {
+ MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
+ Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
+
+ // Now widen to 128 bits.
+ unsigned NumConcats = 128 / TmpVT.getSizeInBits();
+ MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
+ SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
+ ConcatOps[0] = Res;
+ Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
+ }
+
+ Results.push_back(Res);
+ if (IsStrict)
+ Results.push_back(Chain);
+
+ return;
+ }
+
if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
@@ -31001,8 +31938,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
assert(!VT.isVector() && "Vectors should have been handled above!");
- if (Subtarget.hasDQI() && VT == MVT::i64 &&
- (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
+ if ((Subtarget.hasDQI() && VT == MVT::i64 &&
+ (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
+ (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
assert(!Subtarget.is64Bit() && "i64 should be legal");
unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
// If we use a 128-bit result we might need to use a target specific node.
@@ -31036,6 +31974,15 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
+ if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
+ SDValue Chain;
+ SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
+ Results.push_back(V);
+ if (IsStrict)
+ Results.push_back(Chain);
+ return;
+ }
+
SDValue Chain;
if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
Results.push_back(V);
@@ -31059,9 +32006,31 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
N->getOpcode() == ISD::STRICT_SINT_TO_FP;
EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+ if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
+ Subtarget.hasVLX()) {
+ if (Src.getValueType().getVectorElementType() == MVT::i16)
+ return;
+
+ if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+ IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
+ : DAG.getUNDEF(MVT::v2i32));
+ if (IsStrict) {
+ unsigned Opc =
+ IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
+ SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
+ {N->getOperand(0), Src});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ } else {
+ unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
+ Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
+ }
+ return;
+ }
if (VT != MVT::v2f32)
return;
- SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
if (IsStrict) {
@@ -31162,14 +32131,21 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::FP_ROUND: {
bool IsStrict = N->isStrictFPOpcode();
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+ EVT VT = N->getValueType(0);
+ EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
+ if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
+ SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
+ : DAG.getUNDEF(MVT::v2f32);
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
+ }
if (!isTypeLegal(Src.getValueType()))
return;
SDValue V;
if (IsStrict)
- V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
- {N->getOperand(0), N->getOperand(1)});
+ V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
+ {N->getOperand(0), Src});
else
- V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
+ V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
Results.push_back(V);
if (IsStrict)
Results.push_back(V.getValue(1));
@@ -31181,6 +32157,21 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
// No other ValueType for FP_EXTEND should reach this point.
assert(N->getValueType(0) == MVT::v2f32 &&
"Do not know how to legalize this Node");
+ if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
+ return;
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+ SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
+ : DAG.getUNDEF(MVT::v2f16);
+ SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
+ if (IsStrict)
+ V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
+ {N->getOperand(0), V});
+ else
+ V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
+ Results.push_back(V);
+ if (IsStrict)
+ Results.push_back(V.getValue(1));
return;
}
case ISD::INTRINSIC_W_CHAIN: {
@@ -31656,6 +32647,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(MOVSLDUP)
NODE_NAME_CASE(MOVSD)
NODE_NAME_CASE(MOVSS)
+ NODE_NAME_CASE(MOVSH)
NODE_NAME_CASE(UNPCKL)
NODE_NAME_CASE(UNPCKH)
NODE_NAME_CASE(VBROADCAST)
@@ -31684,7 +32676,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
NODE_NAME_CASE(VAARG_64)
NODE_NAME_CASE(VAARG_X32)
- NODE_NAME_CASE(WIN_ALLOCA)
+ NODE_NAME_CASE(DYN_ALLOCA)
NODE_NAME_CASE(MEMBARRIER)
NODE_NAME_CASE(MFENCE)
NODE_NAME_CASE(SEG_ALLOCA)
@@ -31714,6 +32706,22 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FNMSUB_RND)
NODE_NAME_CASE(FMADDSUB_RND)
NODE_NAME_CASE(FMSUBADD_RND)
+ NODE_NAME_CASE(VFMADDC)
+ NODE_NAME_CASE(VFMADDC_RND)
+ NODE_NAME_CASE(VFCMADDC)
+ NODE_NAME_CASE(VFCMADDC_RND)
+ NODE_NAME_CASE(VFMULC)
+ NODE_NAME_CASE(VFMULC_RND)
+ NODE_NAME_CASE(VFCMULC)
+ NODE_NAME_CASE(VFCMULC_RND)
+ NODE_NAME_CASE(VFMULCSH)
+ NODE_NAME_CASE(VFMULCSH_RND)
+ NODE_NAME_CASE(VFCMULCSH)
+ NODE_NAME_CASE(VFCMULCSH_RND)
+ NODE_NAME_CASE(VFMADDCSH)
+ NODE_NAME_CASE(VFMADDCSH_RND)
+ NODE_NAME_CASE(VFCMADDCSH)
+ NODE_NAME_CASE(VFCMADDCSH_RND)
NODE_NAME_CASE(VPMADD52H)
NODE_NAME_CASE(VPMADD52L)
NODE_NAME_CASE(VRNDSCALE)
@@ -31954,6 +32962,7 @@ bool X86TargetLowering::isBinOp(unsigned Opcode) const {
bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
switch (Opcode) {
// TODO: Add more X86ISD opcodes once we have test coverage.
+ case X86ISD::AVG:
case X86ISD::PCMPEQ:
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ:
@@ -32047,6 +33056,36 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
bool X86TargetLowering::shouldSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const {
+ using namespace llvm::PatternMatch;
+
+ FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
+ if (!VTy)
+ return false;
+
+ if (I->getOpcode() == Instruction::Mul &&
+ VTy->getElementType()->isIntegerTy(64)) {
+ for (auto &Op : I->operands()) {
+ // Make sure we are not already sinking this operand
+ if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
+ continue;
+
+ // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
+ // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
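+ // i.e. (ashr (shl %x, 32), 32) is a sext_inreg from i32, and
+ // (and %x, 0xffffffff) is a zext_inreg from i32.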
+ if (Subtarget.hasSSE41() &&
+ match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
+ m_SpecificInt(32)))) {
+ Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
+ Ops.push_back(&Op);
+ } else if (Subtarget.hasSSE2() &&
+ match(Op.get(),
+ m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
+ Ops.push_back(&Op);
+ }
+ }
+
+ return !Ops.empty();
+ }
+
// A uniform shift amount in a vector shift or funnel shift may be much
// cheaper than a generic variable vector shift, so make that pattern visible
// to SDAG by sinking the shuffle instruction next to the shift.
@@ -32102,6 +33141,8 @@ bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
return false;
switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f16:
+ return Subtarget.hasFP16();
case MVT::f32:
case MVT::f64:
return true;
@@ -32180,13 +33221,9 @@ static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
// If we hit the end of the block, check whether EFLAGS is live into a
// successor.
- for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
- sEnd = BB->succ_end();
- sItr != sEnd; ++sItr) {
- MachineBasicBlock* succ = *sItr;
- if (succ->isLiveIn(X86::EFLAGS))
+ for (MachineBasicBlock *Succ : BB->successors())
+ if (Succ->isLiveIn(X86::EFLAGS))
return true;
- }
return false;
}
@@ -32576,6 +33613,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
// conditional jump around it.
static bool isCMOVPseudo(MachineInstr &MI) {
switch (MI.getOpcode()) {
+ case X86::CMOV_FR16X:
case X86::CMOV_FR32:
case X86::CMOV_FR32X:
case X86::CMOV_FR64:
@@ -32922,14 +33960,11 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
}
// Transfer any debug instructions inside the CMOV sequence to the sunk block.
- auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
- auto DbgIt = MachineBasicBlock::iterator(MI);
- while (DbgIt != DbgEnd) {
- auto Next = std::next(DbgIt);
- if (DbgIt->isDebugInstr())
- SinkMBB->push_back(DbgIt->removeFromParent());
- DbgIt = Next;
- }
+ auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
+ MachineBasicBlock::iterator(LastCMOV));
+ for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
+ if (MI.isDebugInstr())
+ SinkMBB->push_back(MI.removeFromParent());
// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
SinkMBB->splice(SinkMBB->end(), ThisMBB,
@@ -34576,6 +35611,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::PTDPBF16PS: {
unsigned Opc;
switch (MI.getOpcode()) {
+ default: llvm_unreachable("illegal opcode!");
case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
@@ -34603,6 +35639,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::PTILESTORED: {
unsigned Opc;
switch (MI.getOpcode()) {
+ default: llvm_unreachable("illegal opcode!");
case X86::PTILELOADD: Opc = X86::TILELOADD; break;
case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
case X86::PTILESTORED: Opc = X86::TILESTORED; break;
@@ -34795,8 +35832,8 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
- Known.One = APInt::getAllOnesValue(BitWidth * 2);
- Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
+ Known.One = APInt::getAllOnes(BitWidth * 2);
+ Known.Zero = APInt::getAllOnes(BitWidth * 2);
KnownBits Known2;
if (!!DemandedLHS) {
@@ -35197,17 +36234,16 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
- // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
- if (MaskEltSize == 32 && Mask[0] == 0) {
- if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
+ // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
+ if (Mask[0] == 0 &&
+ (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
+ if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
+ (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
Shuffle = X86ISD::VZEXT_MOVL;
- SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
- return true;
- }
- if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
- isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
- Shuffle = X86ISD::VZEXT_MOVL;
- SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+ SrcVT = DstVT = MaskEltSize == 16 ? MVT::v8f16
+ : !Subtarget.hasSSE2() ? MVT::v4f32
+ : MaskVT;
return true;
}
}
@@ -35251,11 +36287,14 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
// Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
- if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
+ if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
+ (MaskEltSize == 16 && Subtarget.hasFP16())) &&
isUndefOrEqual(Mask[0], 0) &&
isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
Shuffle = X86ISD::VZEXT_MOVL;
- SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+ SrcVT = DstVT = MaskEltSize == 16 ? MVT::v8f16
+ : !Subtarget.hasSSE2() ? MVT::v4f32
+ : MaskVT;
return true;
}
@@ -35501,6 +36540,12 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
SrcVT = DstVT = MVT::v4f32;
return true;
}
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7}) &&
+ Subtarget.hasFP16()) {
+ Shuffle = X86ISD::MOVSH;
+ SrcVT = DstVT = MVT::v8f16;
+ return true;
+ }
}
// Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
@@ -35538,8 +36583,8 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
unsigned Scale1 = NumV1Elts / NumMaskElts;
unsigned Scale2 = NumV2Elts / NumMaskElts;
- APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);
- APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);
+ APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
+ APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
@@ -35560,12 +36605,58 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
IsBlend = false;
break;
}
- if (IsBlend &&
- DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
- DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
- Shuffle = ISD::OR;
- SrcVT = DstVT = MaskVT.changeTypeToInteger();
- return true;
+ if (IsBlend) {
+ if (DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
+ DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
+ Shuffle = ISD::OR;
+ SrcVT = DstVT = MaskVT.changeTypeToInteger();
+ return true;
+ }
+ if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
+ // FIXME: handle mismatched sizes?
+ // TODO: investigate if `ISD::OR` handling in
+ // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
+ auto computeKnownBitsElementWise = [&DAG](SDValue V) {
+ unsigned NumElts = V.getValueType().getVectorNumElements();
+ KnownBits Known(NumElts);
+ for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
+ APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
+ KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
+ if (PeepholeKnown.isZero())
+ Known.Zero.setBit(EltIdx);
+ if (PeepholeKnown.isAllOnes())
+ Known.One.setBit(EltIdx);
+ }
+ return Known;
+ };
+
+ KnownBits V1Known = computeKnownBitsElementWise(V1);
+ KnownBits V2Known = computeKnownBitsElementWise(V2);
+
+ for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ if (M == SM_SentinelZero) {
+ IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
+ continue;
+ }
+ if (M == (int)i) {
+ IsBlend &= V2Known.Zero[i] || V1Known.One[i];
+ continue;
+ }
+ if (M == (int)(i + NumMaskElts)) {
+ IsBlend &= V1Known.Zero[i] || V2Known.One[i];
+ continue;
+ }
+ llvm_unreachable("will not get here.");
+ }
+ if (IsBlend) {
+ Shuffle = ISD::OR;
+ SrcVT = DstVT = MaskVT.changeTypeToInteger();
+ return true;
+ }
+ }
}
}
@@ -35817,13 +36908,15 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return CanonicalizeShuffleInput(RootVT, V1);
}
+ SmallVector<int, 64> Mask(BaseMask.begin(), BaseMask.end());
+
// See if the shuffle is a hidden identity shuffle - repeated args in HOPs
// etc. can be simplified.
- if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
+ if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
SmallVector<int> ScaledMask, IdentityMask;
unsigned NumElts = VT1.getVectorNumElements();
- if (BaseMask.size() <= NumElts &&
- scaleShuffleElements(BaseMask, NumElts, ScaledMask)) {
+ if (Mask.size() <= NumElts &&
+ scaleShuffleElements(Mask, NumElts, ScaledMask)) {
for (unsigned i = 0; i != NumElts; ++i)
IdentityMask.push_back(i);
if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
@@ -35837,35 +36930,36 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// If the upper subvectors are zeroable, then an extract+insert is more
// optimal than using X86ISD::SHUF128. The insertion is free, even if it has
// to zero the upper subvectors.
- if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
+ if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
return SDValue(); // Nothing to do!
- assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
+ assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
"Unexpected lane shuffle");
Res = CanonicalizeShuffleInput(RootVT, V1);
- unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts);
- bool UseZero = isAnyZero(BaseMask);
+ unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
+ bool UseZero = isAnyZero(Mask);
Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
}
// Narrow shuffle mask to v4x128.
- SmallVector<int, 4> Mask;
+ SmallVector<int, 4> ScaledMask;
assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
- narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask);
+ narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
// Try to lower to vshuf64x2/vshuf32x4.
- auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
- SDValue V1, SDValue V2, SelectionDAG &DAG) {
+ auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
+ ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
+ SelectionDAG &DAG) {
unsigned PermMask = 0;
// Insure elements came from the same Op.
SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
for (int i = 0; i < 4; ++i) {
- assert(Mask[i] >= -1 && "Illegal shuffle sentinel value");
- if (Mask[i] < 0)
+ assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
+ if (ScaledMask[i] < 0)
continue;
- SDValue Op = Mask[i] >= 4 ? V2 : V1;
+ SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
unsigned OpIndex = i / 2;
if (Ops[OpIndex].isUndef())
Ops[OpIndex] = Op;
@@ -35875,7 +36969,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Convert the 128-bit shuffle mask selection values into 128-bit
// selection bits defined by a vshuf64x2 instruction's immediate control
// byte.
- PermMask |= (Mask[i] % 4) << (i * 2);
+ PermMask |= (ScaledMask[i] % 4) << (i * 2);
}
return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
@@ -35887,18 +36981,20 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
// doesn't work because our mask is for 128 bits and we don't have an MVT
// to match that.
- bool PreferPERMQ =
- UnaryShuffle && isUndefOrInRange(Mask[0], 0, 2) &&
- isUndefOrInRange(Mask[1], 0, 2) && isUndefOrInRange(Mask[2], 2, 4) &&
- isUndefOrInRange(Mask[3], 2, 4) &&
- (Mask[0] < 0 || Mask[2] < 0 || Mask[0] == (Mask[2] % 2)) &&
- (Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));
-
- if (!isAnyZero(Mask) && !PreferPERMQ) {
+ bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
+ isUndefOrInRange(ScaledMask[1], 0, 2) &&
+ isUndefOrInRange(ScaledMask[2], 2, 4) &&
+ isUndefOrInRange(ScaledMask[3], 2, 4) &&
+ (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
+ ScaledMask[0] == (ScaledMask[2] % 2)) &&
+ (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
+ ScaledMask[1] == (ScaledMask[3] % 2));
+
+ if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
return SDValue(); // Nothing to do!
MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
- if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
+ if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
return DAG.getBitcast(RootVT, V);
}
}
@@ -35908,25 +37004,27 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// If the upper half is zeroable, then an extract+insert is more optimal
// than using X86ISD::VPERM2X128. The insertion is free, even if it has to
// zero the upper half.
- if (isUndefOrZero(BaseMask[1])) {
+ if (isUndefOrZero(Mask[1])) {
if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
return SDValue(); // Nothing to do!
- assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
+ assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
Res = CanonicalizeShuffleInput(RootVT, V1);
- Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL);
- return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
- DL, 256);
+ Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
+ return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
+ 256);
}
- // If we're splatting the low subvector, an insert-subvector 'concat'
+ // If we're inserting the low subvector, an insert-subvector 'concat'
// pattern is quicker than VPERM2X128.
// TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
- if (BaseMask[0] == 0 && BaseMask[1] == 0 && !Subtarget.hasAVX2()) {
+ if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
+ !Subtarget.hasAVX2()) {
if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
return SDValue(); // Nothing to do!
- Res = CanonicalizeShuffleInput(RootVT, V1);
- Res = extractSubVector(Res, 0, DAG, DL, 128);
- return concatSubVectors(Res, Res, DAG, DL);
+ SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
+ SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
+ Hi = extractSubVector(Hi, 0, DAG, DL, 128);
+ return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
}
if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
@@ -35936,11 +37034,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// we need to use the zeroing feature.
// Prefer blends for sequential shuffles unless we are optimizing for size.
if (UnaryShuffle &&
- !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
- (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
+ !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
+ (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
unsigned PermMask = 0;
- PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
- PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
+ PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
+ PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
return DAG.getNode(
X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
@@ -35951,16 +37049,15 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// TODO - handle AVX512VL cases with X86ISD::SHUF128.
if (!UnaryShuffle && !IsMaskedShuffle) {
- assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
+ assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
"Unexpected shuffle sentinel value");
// Prefer blends to X86ISD::VPERM2X128.
- if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
- (BaseMask[0] == 2 && BaseMask[1] == 1))) {
+ if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
unsigned PermMask = 0;
- PermMask |= ((BaseMask[0] & 3) << 0);
- PermMask |= ((BaseMask[1] & 3) << 4);
- SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
- SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
+ PermMask |= ((Mask[0] & 3) << 0);
+ PermMask |= ((Mask[1] & 3) << 4);
+ SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
+ SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
CanonicalizeShuffleInput(RootVT, LHS),
CanonicalizeShuffleInput(RootVT, RHS),
@@ -35971,13 +37068,12 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// For masks that have been widened to 128-bit elements or more,
// narrow back down to 64-bit elements.
- SmallVector<int, 64> Mask;
if (BaseMaskEltSizeInBits > 64) {
assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
int MaskScale = BaseMaskEltSizeInBits / 64;
- narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
- } else {
- Mask.assign(BaseMask.begin(), BaseMask.end());
+ SmallVector<int, 64> ScaledMask;
+ narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
+ Mask = std::move(ScaledMask);
}
// For masked shuffles, we're trying to match the root width for better
@@ -36029,7 +37125,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (isUndefOrEqual(Mask, 0)) {
if (V1.getValueType() == MaskVT &&
V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
- MayFoldLoad(V1.getOperand(0))) {
+ X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = V1.getOperand(0);
@@ -36306,8 +37402,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
- APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
- APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
+ APInt Zero = APInt::getZero(MaskEltSizeInBits);
+ APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
APInt UndefElts(NumMaskElts, 0);
SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
for (unsigned i = 0; i != NumMaskElts; ++i) {
@@ -36804,10 +37900,11 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
return SDValue();
}
- // Only fold if at least one of the constants is only used once or
- // the combined shuffle has included a variable mask shuffle, this
- // is to avoid constant pool bloat.
- if (!OneUseConstantOp && !HasVariableMask)
+ // If we're optimizing for size, only fold if at least one of the constants is
+ // only used once or the combined shuffle has included a variable mask
+ // shuffle; this avoids constant pool bloat.
+ bool IsOptimizingSize = DAG.shouldOptForSize();
+ if (IsOptimizingSize && !OneUseConstantOp && !HasVariableMask)
return SDValue();
// Shuffle the constant bits according to the mask.
@@ -36816,7 +37913,7 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
APInt ZeroElts(NumMaskElts, 0);
APInt ConstantElts(NumMaskElts, 0);
SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
- APInt::getNullValue(MaskSizeInBits));
+ APInt::getZero(MaskSizeInBits));
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
@@ -36847,10 +37944,10 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
ConstantElts.setBit(i);
ConstantBitData[i] = Bits;
}
- assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
+ assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
// Attempt to create a zero vector.
- if ((UndefElts | ZeroElts).isAllOnesValue())
+ if ((UndefElts | ZeroElts).isAllOnes())
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
// Create the constant data.
@@ -36931,6 +38028,10 @@ static SDValue combineX86ShufflesRecursively(
if (!VT.isVector() || !VT.isSimple())
return SDValue(); // Bail if we hit a non-simple non-vector.
+ // FIXME: Just bail on f16 for now.
+ if (VT.getVectorElementType() == MVT::f16)
+ return SDValue();
+
assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
"Can only combine shuffles upto size of the root op.");
@@ -36939,7 +38040,7 @@ static SDValue combineX86ShufflesRecursively(
SmallVector<int, 64> OpMask;
SmallVector<SDValue, 2> OpInputs;
APInt OpUndef, OpZero;
- APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+ APInt OpDemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
OpZero, DAG, Depth, false))
@@ -36981,14 +38082,14 @@ static SDValue combineX86ShufflesRecursively(
// Only resolve zeros if it will remove an input, otherwise we might end
// up in an infinite loop.
bool ResolveKnownZeros = true;
- if (!OpZero.isNullValue()) {
- APInt UsedInputs = APInt::getNullValue(OpInputs.size());
+ if (!OpZero.isZero()) {
+ APInt UsedInputs = APInt::getZero(OpInputs.size());
for (int i = 0, e = OpMask.size(); i != e; ++i) {
int M = OpMask[i];
if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
continue;
UsedInputs.setBit(M / OpMask.size());
- if (UsedInputs.isAllOnesValue()) {
+ if (UsedInputs.isAllOnes()) {
ResolveKnownZeros = false;
break;
}
@@ -37178,6 +38279,48 @@ static SDValue combineX86ShufflesRecursively(
Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
return DAG.getBitcast(Root.getValueType(), HOp);
+ // Try to refine our inputs given our knowledge of target shuffle mask.
+ for (auto I : enumerate(Ops)) {
+ int OpIdx = I.index();
+ SDValue &Op = I.value();
+
+ // What range of shuffle mask element values results in picking from Op?
+ int Lo = OpIdx * Mask.size();
+ int Hi = Lo + Mask.size();
+
+ // Which elements of Op do we demand, given the mask's granularity?
+ APInt OpDemandedElts(Mask.size(), 0);
+ for (int MaskElt : Mask) {
+ if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
+ int OpEltIdx = MaskElt - Lo;
+ OpDemandedElts.setBit(OpEltIdx);
+ }
+ }
+
+ // Is the shuffle result smaller than the root?
+ if (Op.getValueSizeInBits() < RootSizeInBits) {
+ // We padded the mask with undefs. But we now need to undo that.
+ unsigned NumExpectedVectorElts = Mask.size();
+ unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
+ unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
+ assert(!OpDemandedElts.extractBits(
+ NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
+ "Demanding the virtual undef widening padding?");
+ OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
+ }
+
+ // The Op itself may be of different VT, so we need to scale the mask.
+ unsigned NumOpElts = Op.getValueType().getVectorNumElements();
+ APInt OpScaledDemandedElts =
+ APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
+
+ // Can this operand be simplified any further, given its demanded elements?
+ if (SDValue NewOp =
+ DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
+ Op, OpScaledDemandedElts, DAG))
+ Op = NewOp;
+ }
+ // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
+
// Widen any subvector shuffle inputs we've collected.
if (any_of(Ops, [RootSizeInBits](SDValue Op) {
return Op.getValueSizeInBits() < RootSizeInBits;
@@ -37424,8 +38567,10 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
SDValue N0 = V.getOperand(0);
SDValue N1 = V.getOperand(1);
unsigned Imm = V.getConstantOperandVal(2);
- if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
- MayFoldLoad(peekThroughOneUseBitcasts(N1)))
+ const X86Subtarget &Subtarget =
+ static_cast<const X86Subtarget &>(DAG.getSubtarget());
+ if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
+ X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
return SDValue();
Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
@@ -37721,6 +38866,13 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
+ // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
+ if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isNullConstant(Src.getOperand(1)) &&
+ DAG.getTargetLoweringInfo().isTypeLegal(
+ Src.getOperand(0).getValueType()))
+ return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
+
// Share broadcast with the longest vector and extract low subvector (free).
// Ensure the same SDValue from the SDNode use is being used.
for (SDNode *User : Src->uses())
@@ -37988,6 +39140,41 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
return SDValue();
}
+ case X86ISD::SHUFP: {
+ // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
+ // This is a more relaxed shuffle combiner that can ignore oneuse limits.
+ // TODO: Support types other than v4f32.
+ if (VT == MVT::v4f32) {
+ bool Updated = false;
+ SmallVector<int> Mask;
+ SmallVector<SDValue> Ops;
+ if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
+ Ops.size() == 2) {
+ for (int i = 0; i != 2; ++i) {
+ SmallVector<SDValue> SubOps;
+ SmallVector<int> SubMask, SubScaledMask;
+ SDValue Sub = peekThroughBitcasts(Ops[i]);
+ // TODO: Scaling might be easier if we specify the demanded elts.
+ if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
+ scaleShuffleElements(SubMask, 4, SubScaledMask) &&
+ SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
+ int Ofs = i * 2;
+ Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
+ Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
+ Ops[i] = DAG.getBitcast(VT, SubOps[0]);
+ Updated = true;
+ }
+ }
+ }
+ if (Updated) {
+ for (int &M : Mask)
+ M %= 4;
+ Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
+ }
+ }
+ return SDValue();
+ }
case X86ISD::VPERMI: {
// vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
// TODO: Remove when we have preferred domains in combineX86ShuffleChain.
@@ -38057,6 +39244,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
assert(Mask.size() == 4);
break;
case X86ISD::MOVSD:
+ case X86ISD::MOVSH:
case X86ISD::MOVSS: {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
@@ -38441,6 +39629,12 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
if (VT.is512BitVector())
return SDValue();
+ // Do not generate an X86ISD::ADDSUB node for FP16 vector types even though
+ // the ADDSUB idiom has been successfully recognized. There are no known
+ // X86 targets with FP16 ADDSUB instructions!
+ if (VT.getVectorElementType() == MVT::f16)
+ return SDValue();
+
return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
@@ -38568,7 +39762,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// Simplify source operands based on shuffle mask.
// TODO - merge this into combineX86ShufflesRecursively.
APInt KnownUndef, KnownZero;
- APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+ APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
DCI))
return SDValue(N, 0);
@@ -38584,7 +39778,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
// If we're demanding all elements don't bother trying to simplify the mask.
unsigned NumElts = DemandedElts.getBitWidth();
- if (DemandedElts.isAllOnesValue())
+ if (DemandedElts.isAllOnes())
return false;
SDValue Mask = Op.getOperand(MaskIndex);
@@ -38671,6 +39865,58 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
KnownZero = LHSZero | RHSZero;
break;
}
+ case X86ISD::VPMADDWD: {
+ APInt LHSUndef, LHSZero;
+ APInt RHSUndef, RHSZero;
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
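+ // Each output i32 element is the sum of a pair of adjacent i16 products,
+ // so each demanded output element demands two adjacent source elements.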
+ APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
+
+ if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
+ Depth + 1))
+ return true;
+ if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
+ Depth + 1))
+ return true;
+
+ // TODO: Multiply by zero.
+
+ // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
+ APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
+ if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
+ Depth + 1))
+ return true;
+ APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
+ if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
+ Depth + 1))
+ return true;
+ break;
+ }
+ case X86ISD::PSADBW: {
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ assert(VT.getScalarType() == MVT::i64 &&
+ LHS.getValueType() == RHS.getValueType() &&
+ LHS.getValueType().getScalarType() == MVT::i8 &&
+ "Unexpected PSADBW types");
+
+ // Aggressively peek through ops to get at the demanded elts.
+ if (!DemandedElts.isAllOnes()) {
+ unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
+ APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
+ SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
+ LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
+ SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
+ RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
+ if (NewLHS || NewRHS) {
+ NewLHS = NewLHS ? NewLHS : LHS;
+ NewRHS = NewRHS ? NewRHS : RHS;
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
+ }
+ }
+ break;
+ }
case X86ISD::VSHL:
case X86ISD::VSRL:
case X86ISD::VSRA: {
@@ -38706,7 +39952,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return true;
// Aggressively peek through ops to get at the demanded elts.
- if (!DemandedElts.isAllOnesValue())
+ if (!DemandedElts.isAllOnes())
if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
Src, DemandedElts, TLO.DAG, Depth + 1))
return TLO.CombineTo(
@@ -38823,7 +40069,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// Aggressively peek through ops to get at the demanded elts.
// TODO - we should do this for all target/faux shuffles ops.
- if (!DemandedElts.isAllOnesValue()) {
+ if (!DemandedElts.isAllOnes()) {
SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
TLO.DAG, Depth + 1);
SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
@@ -38860,7 +40106,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// Aggressively peek through ops to get at the demanded elts.
// TODO: Handle repeated operands.
- if (N0 != N1 && !DemandedElts.isAllOnesValue()) {
+ if (N0 != N1 && !DemandedElts.isAllOnes()) {
SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
TLO.DAG, Depth + 1);
SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
@@ -39019,15 +40265,11 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SDLoc DL(Op);
EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
ExtSizeInBits / VT.getScalarSizeInBits());
- SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
- SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
- SDValue Bcst =
- TLO.DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys,
- Ops, MemVT, MemIntr->getMemOperand());
- TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
- Bcst.getValue(1));
- return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
- TLO.DAG, DL, ExtSizeInBits));
+ if (SDValue BcstLd =
+ getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
+ return TLO.CombineTo(Op,
+ insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
+ TLO.DAG, DL, ExtSizeInBits));
}
break;
}
@@ -39130,6 +40372,12 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
}
+ // For broadcasts, unless we *only* demand the 0th element, stop attempts
+ // at simplification here; we aren't going to improve things, and this is
+ // better than any potential shuffle.
+ if (isTargetShuffleSplat(Op) && !DemandedElts.isOne())
+ return false;
+
// Get target/faux shuffle mask.
APInt OpUndef, OpZero;
SmallVector<int, 64> OpMask;
@@ -39175,7 +40423,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
continue;
int Lo = Src * NumElts;
- APInt SrcElts = APInt::getNullValue(NumElts);
+ APInt SrcElts = APInt::getZero(NumElts);
for (int i = 0; i != NumElts; ++i)
if (DemandedElts[i]) {
int M = OpMask[i] - Lo;
@@ -39197,7 +40445,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// to match. This prevents combineX86ShuffleChain from returning a
// combined shuffle that's the same as the original root, causing an
// infinite loop.
- if (!DemandedElts.isAllOnesValue()) {
+ if (!DemandedElts.isAllOnes()) {
assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
@@ -39492,7 +40740,8 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
// Don't attempt this on AVX512 as it might affect broadcast folding.
// TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
- OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2)) {
+ OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2) &&
+ Src->hasOneUse()) {
MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
SDValue NewSrc =
TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
@@ -39697,7 +40946,7 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
// Bitmask that indicates which ops have only been accessed 'inline'.
- APInt IdentityOp = APInt::getAllOnesValue(NumOps);
+ APInt IdentityOp = APInt::getAllOnes(NumOps);
for (int i = 0; i != NumElts; ++i) {
int M = ShuffleMask[i];
if (!DemandedElts[i] || ShuffleUndef[i])
@@ -40351,9 +41600,9 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
isa<ConstantSDNode>(N0)) {
auto *C = cast<ConstantSDNode>(N0);
- if (C->isAllOnesValue())
+ if (C->isAllOnes())
return DAG.getConstant(1, SDLoc(N0), VT);
- if (C->isNullValue())
+ if (C->isZero())
return DAG.getConstant(0, SDLoc(N0), VT);
}
@@ -40419,6 +41668,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// Check if we have a bitcast from another integer type as well.
if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
(Subtarget.hasSSE2() && VT == MVT::f64) ||
+ (Subtarget.hasFP16() && VT == MVT::f16) ||
(Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
TLI.isTypeLegal(VT))))
return SDValue();
@@ -40547,7 +41797,7 @@ static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
else if (BinOp == ISD::SMIN)
Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
else if (BinOp == ISD::UMAX)
- Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
+ Mask = DAG.getAllOnesConstant(DL, SrcVT);
if (Mask)
MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
@@ -40994,7 +42244,8 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
/// Extracting a scalar FP value from vector element 0 is free, so extract each
/// operand first, then perform the math as a scalar op.
-static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
+static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
SDValue Vec = ExtElt->getOperand(0);
SDValue Index = ExtElt->getOperand(1);
@@ -41022,7 +42273,8 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
}
- if (VT != MVT::f32 && VT != MVT::f64)
+ if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
+ VT != MVT::f64)
return SDValue();
// Vector FP selects don't fit the pattern of FP math ops (because the
@@ -41277,8 +42529,8 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (IsPextr) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.SimplifyDemandedBits(
- SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0),
+ APInt::getAllOnes(VT.getSizeInBits()), DCI))
return SDValue(N, 0);
// PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
@@ -41336,7 +42588,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineArithReduction(N, DAG, Subtarget))
return V;
- if (SDValue V = scalarizeExtEltFP(N, DAG))
+ if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
return V;
// Attempt to extract a i1 element by using MOVMSK to extract the signbits
@@ -41573,11 +42825,11 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
// Multiply condition by the difference if non-one.
- if (!AbsDiff.isOneValue())
+ if (!AbsDiff.isOne())
R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
// Add the base if non-zero.
- if (!FalseC->isNullValue())
+ if (!FalseC->isZero())
R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
return R;
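
For reference, a minimal standalone sketch (not part of this patch) of the arithmetic identity the combine above relies on: select(Cond, TrueC, FalseC) can be computed as the base plus the zero-extended condition times the difference. It assumes TrueC >= FalseC so AbsDiff is their plain difference; the helper name is illustrative only.

#include <cassert>
#include <cstdint>

// Model of the select-of-two-constants lowering for unsigned 32-bit constants.
static uint32_t selectOfTwoConstants(bool Cond, uint32_t TrueC, uint32_t FalseC) {
  uint32_t AbsDiff = TrueC - FalseC; // assumes TrueC >= FalseC
  uint32_t R = Cond ? 1u : 0u;       // zero-extend the i1 condition
  if (AbsDiff != 1)                  // multiply by the difference if non-one
    R *= AbsDiff;
  if (FalseC != 0)                   // add the base if non-zero
    R += FalseC;
  return R;
}

int main() {
  assert(selectOfTwoConstants(true, 7, 3) == 7);
  assert(selectOfTwoConstants(false, 7, 3) == 3);
  assert(selectOfTwoConstants(true, 4, 3) == 4);  // AbsDiff == 1 path
  assert(selectOfTwoConstants(false, 5, 0) == 0); // FalseC == 0 path
  return 0;
}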
@@ -41794,10 +43046,15 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
int NumElts = VT.getVectorNumElements();
for (int i = 0; i != NumElts; ++i) {
- if (CondMask[i] < NumElts)
+ // getConstVector sets negative shuffle mask values as undef, so ensure
+ // we hardcode SM_SentinelZero values to zero (0x80).
+ if (CondMask[i] < NumElts) {
+ LHSMask[i] = (LHSMask[i] == SM_SentinelZero) ? 0x80 : LHSMask[i];
RHSMask[i] = 0x80;
- else
+ } else {
LHSMask[i] = 0x80;
+ RHSMask[i] = (RHSMask[i] == SM_SentinelZero) ? 0x80 : RHSMask[i];
+ }
}
LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
getConstVector(LHSMask, SimpleVT, DAG, DL, true));
@@ -42331,7 +43588,7 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
// We can handle comparisons with zero in a number of cases by manipulating
// the CC used.
- if (!Comparison.isNullValue())
+ if (!Comparison.isZero())
return SDValue();
if (CC == X86::COND_S && Addend == 1)
@@ -42737,7 +43994,7 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
unsigned NumElts = VecVT.getVectorNumElements();
unsigned NumEltBits = VecVT.getScalarSizeInBits();
- bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();
+ bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
CmpVal.isMask(NumElts);
if (!IsAnyOf && !IsAllOf)
@@ -42830,12 +44087,12 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
unsigned NumShuffleElts = ShuffleMask.size();
- APInt DemandedElts = APInt::getNullValue(NumShuffleElts);
+ APInt DemandedElts = APInt::getZero(NumShuffleElts);
for (int M : ShuffleMask) {
assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
DemandedElts.setBit(M);
}
- if (DemandedElts.isAllOnesValue()) {
+ if (DemandedElts.isAllOnes()) {
SDLoc DL(EFLAGS);
SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
@@ -43316,8 +44573,9 @@ static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// If the upper 17 bits of each element are zero then we can use PMADDWD,
-// which is always at least as quick as PMULLD, except on KNL.
+// If the upper 17 bits of either element are zero and the upper bits of the
+// other element are all zero or sign bits, then we can use PMADDWD, which is
+// always at least as quick as PMULLD, except on KNL.
static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2())
@@ -43332,33 +44590,92 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
return SDValue();
- // Make sure the type is legal or will be widened to a legal type.
- if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ // Make sure the type is legal or can split/widen to a legal type.
+ // With AVX512 but without BWI, we would need to split v32i16.
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts == 1 || !isPowerOf2_32(NumElts))
return SDValue();
- MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
+ EVT WVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, 2 * NumElts);
- // Without BWI, we would need to split v32i16.
- if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
+ // With AVX512 but without BWI, we would need to split v32i16.
+ if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- // If we are zero extending two steps without SSE4.1, its better to reduce
+ // If we are zero/sign extending two steps without SSE4.1, it's better to
+ // reduce the vmul width instead.
+ if (!Subtarget.hasSSE41() &&
+ (((N0.getOpcode() == ISD::ZERO_EXTEND &&
+ N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
+ (N1.getOpcode() == ISD::ZERO_EXTEND &&
+ N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
+ ((N0.getOpcode() == ISD::SIGN_EXTEND &&
+ N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
+ (N1.getOpcode() == ISD::SIGN_EXTEND &&
+ N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
+ return SDValue();
+
+ // If we are sign extending a wide vector without SSE4.1, it's better to reduce
// the vmul width instead.
if (!Subtarget.hasSSE41() &&
- (N0.getOpcode() == ISD::ZERO_EXTEND &&
- N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
- (N1.getOpcode() == ISD::ZERO_EXTEND &&
- N1.getOperand(0).getScalarValueSizeInBits() <= 8))
+ (N0.getOpcode() == ISD::SIGN_EXTEND &&
+ N0.getOperand(0).getValueSizeInBits() > 128) &&
+ (N1.getOpcode() == ISD::SIGN_EXTEND &&
+ N1.getOperand(0).getValueSizeInBits() > 128))
return SDValue();
- APInt Mask17 = APInt::getHighBitsSet(32, 17);
- if (!DAG.MaskedValueIsZero(N1, Mask17) ||
- !DAG.MaskedValueIsZero(N0, Mask17))
+ // Sign bits must extend down to the lowest i16.
+ if (DAG.ComputeMinSignedBits(N1) > 16 || DAG.ComputeMinSignedBits(N0) > 16)
return SDValue();
+ // At least one of the operands must have the upper 17 bits of each element
+ // zero, or those bits must be safely zeroable without altering the final
+ // result.
+ auto GetZeroableOp = [&](SDValue Op) {
+ APInt Mask17 = APInt::getHighBitsSet(32, 17);
+ if (DAG.MaskedValueIsZero(Op, Mask17))
+ return Op;
+ // Mask off upper 16-bits of sign-extended constants.
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
+ return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
+ DAG.getConstant(0xFFFF, SDLoc(N), VT));
+ if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
+ SDValue Src = Op.getOperand(0);
+ // Convert sext(vXi16) to zext(vXi16).
+ if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
+ // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
+ // which will expand the extension.
+ if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
+ EVT ExtVT = VT.changeVectorElementType(MVT::i16);
+ Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
+ }
+ }
+ // Convert SIGN_EXTEND_VECTOR_INREG to ZEXT_EXTEND_VECTOR_INREG.
+ if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
+ N->isOnlyUserOf(Op.getNode())) {
+ SDValue Src = Op.getOperand(0);
+ if (Src.getScalarValueSizeInBits() == 16)
+ return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
+ }
+ // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
+ if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
+ N->isOnlyUserOf(Op.getNode())) {
+ return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
+ Op.getOperand(1));
+ }
+ return SDValue();
+ };
+ SDValue ZeroN0 = GetZeroableOp(N0);
+ SDValue ZeroN1 = GetZeroableOp(N1);
+ if (!ZeroN0 && !ZeroN1)
+ return SDValue();
+ N0 = ZeroN0 ? ZeroN0 : N0;
+ N1 = ZeroN1 ? ZeroN1 : N1;
+
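
As an aside, a standalone sketch (not part of this patch) of why "upper 17 bits zero" is the threshold that lets a plain i32 multiply be expressed as one PMADDWD lane: with 17 leading zero bits each value is at most 32767, so its low 16 bits are non-negative as a signed i16 and the high halves contribute nothing to the pair-multiply-add. The helper name is illustrative only.

#include <cassert>
#include <cstdint>

// One 32-bit lane of (V)PMADDWD: multiply two pairs of signed i16 values and
// add the two products.
static int32_t pmaddwdLane(int16_t A0, int16_t A1, int16_t B0, int16_t B1) {
  return (int32_t)A0 * B0 + (int32_t)A1 * B1;
}

int main() {
  for (uint32_t A : {0u, 1u, 12345u, 32767u})     // upper 17 bits all zero
    for (uint32_t B : {0u, 7u, 30000u, 32767u}) {
      int16_t ALo = (int16_t)(A & 0xFFFF), AHi = (int16_t)(A >> 16); // AHi == 0
      int16_t BLo = (int16_t)(B & 0xFFFF), BHi = (int16_t)(B >> 16); // BHi == 0
      assert(pmaddwdLane(ALo, AHi, BLo, BHi) == (int32_t)(A * B));
    }
  return 0;
}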
// Use SplitOpsAndApply to handle AVX splitting.
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
@@ -43412,8 +44729,6 @@ static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// Optimize a single multiply with constant into two operations in order to
-/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -43428,8 +44743,11 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalize() && VT.isVector())
return reduceVMULWidth(N, DAG, Subtarget);
+ // Optimize a single multiply with constant into two operations in order to
+ // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
if (!MulConstantOptimization)
return SDValue();
+
// An imul is usually smaller than the alternative sequence.
if (DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
@@ -43569,9 +44887,7 @@ static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
"SRL or SRA node is required here!");
SDLoc DL(N);
- // Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand
- // the multiply.
- if (!Subtarget.hasSSE41())
+ if (!Subtarget.hasSSE2())
return SDValue();
// The operation feeding into the shift must be a multiply.
@@ -43964,7 +45280,7 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
APInt Undefs(NumDstElts, 0);
- SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
+ SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
@@ -43994,9 +45310,9 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
if (Val.isIntN(DstBitsPerElt))
Val = Val.trunc(DstBitsPerElt);
else if (Val.isNegative())
- Val = APInt::getNullValue(DstBitsPerElt);
+ Val = APInt::getZero(DstBitsPerElt);
else
- Val = APInt::getAllOnesValue(DstBitsPerElt);
+ Val = APInt::getAllOnes(DstBitsPerElt);
}
Bits[Lane * NumDstEltsPerLane + Elt] = Val;
}
@@ -44048,6 +45364,14 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
}
+
+ // Try again with pack(*_extend_vector_inreg, undef).
+ unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
+ : ISD::ZERO_EXTEND_VECTOR_INREG;
+ if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
+ N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
+ return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
+ DAG);
}
// Attempt to combine as shuffle.
@@ -44066,47 +45390,25 @@ static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
"Unexpected horizontal add/sub opcode");
if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
- // For slow-hop targets, if we have a hop with a single op, see if we already
- // have another user that we can reuse and shuffle the result.
MVT VT = N->getSimpleValueType(0);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
- if (VT.is128BitVector() && LHS == RHS) {
- for (SDNode *User : LHS->uses()) {
- if (User != N && User->getOpcode() == N->getOpcode()) {
- MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
- if (User->getOperand(0) == LHS && !User->getOperand(1).isUndef()) {
- return DAG.getBitcast(
- VT,
- DAG.getVectorShuffle(ShufVT, SDLoc(N),
- DAG.getBitcast(ShufVT, SDValue(User, 0)),
- DAG.getUNDEF(ShufVT), {0, 1, 0, 1}));
- }
- if (User->getOperand(1) == LHS && !User->getOperand(0).isUndef()) {
- return DAG.getBitcast(
- VT,
- DAG.getVectorShuffle(ShufVT, SDLoc(N),
- DAG.getBitcast(ShufVT, SDValue(User, 0)),
- DAG.getUNDEF(ShufVT), {2, 3, 2, 3}));
- }
- }
- }
- }
// HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
LHS.getOpcode() == RHS.getOpcode() &&
- LHS.getValueType() == RHS.getValueType()) {
+ LHS.getValueType() == RHS.getValueType() &&
+ N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
SDValue LHS0 = LHS.getOperand(0);
- SDValue RHS0 = LHS.getOperand(1);
- SDValue LHS1 = RHS.getOperand(0);
+ SDValue LHS1 = LHS.getOperand(1);
+ SDValue RHS0 = RHS.getOperand(0);
SDValue RHS1 = RHS.getOperand(1);
- if ((LHS0 == RHS0 || LHS0.isUndef() || RHS0.isUndef()) &&
- (LHS1 == RHS1 || LHS1.isUndef() || RHS1.isUndef())) {
+ if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
+ (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
SDLoc DL(N);
SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
- LHS0.isUndef() ? RHS0 : LHS0,
- LHS1.isUndef() ? RHS1 : LHS1);
+ LHS0.isUndef() ? LHS1 : LHS0,
+ RHS0.isUndef() ? RHS1 : RHS0);
MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
Res = DAG.getBitcast(ShufVT, Res);
SDValue NewLHS =
@@ -44115,9 +45417,8 @@ static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
SDValue NewRHS =
DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
- DAG.ReplaceAllUsesOfValueWith(LHS, DAG.getBitcast(VT, NewLHS));
- DAG.ReplaceAllUsesOfValueWith(RHS, DAG.getBitcast(VT, NewRHS));
- return SDValue(N, 0);
+ return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
+ DAG.getBitcast(VT, NewRHS));
}
}
}
@@ -44154,7 +45455,7 @@ static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+ APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
KnownZero, DCI))
return SDValue(N, 0);
@@ -44256,8 +45557,8 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
}
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.SimplifyDemandedBits(SDValue(N, 0),
- APInt::getAllOnesValue(NumBitsPerElt), DCI))
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
+ DCI))
return SDValue(N, 0);
return SDValue();
@@ -44276,7 +45577,7 @@ static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(SDValue(N, 0),
- APInt::getAllOnesValue(NumBitsPerElt), DCI))
+ APInt::getAllOnes(NumBitsPerElt), DCI))
return SDValue(N, 0);
}
@@ -44315,12 +45616,15 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
SDValue CMP01 = CMP0->getOperand(1);
EVT VT = CMP00.getValueType();
- if (VT == MVT::f32 || VT == MVT::f64) {
+ if (VT == MVT::f32 || VT == MVT::f64 ||
+ (VT == MVT::f16 && Subtarget.hasFP16())) {
bool ExpectingFlags = false;
// Check for any users that want flags:
- for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
- !ExpectingFlags && UI != UE; ++UI)
- switch (UI->getOpcode()) {
+ for (const SDNode *U : N->uses()) {
+ if (ExpectingFlags)
+ break;
+
+ switch (U->getOpcode()) {
default:
case ISD::BR_CC:
case ISD::BRCOND:
@@ -44333,6 +45637,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
case ISD::ANY_EXTEND:
break;
}
+ }
if (!ExpectingFlags) {
enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
@@ -44396,7 +45701,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
}
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
-static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::AND);
MVT VT = N->getSimpleValueType(0);
@@ -44543,17 +45848,19 @@ static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
return FPOpcode;
}
-/// If both input operands of a logic op are being cast from floating point
-/// types, try to convert this into a floating point logic node to avoid
-/// unnecessary moves from SSE to integer registers.
+/// If both input operands of a logic op are being cast from floating-point
+/// types or FP compares, try to convert this into a floating-point logic node
+/// to avoid unnecessary moves from SSE to integer registers.
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
- if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
+ if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
+ (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
return SDValue();
SDValue N00 = N0.getOperand(0);
@@ -44562,14 +45869,44 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
EVT N10Type = N10.getValueType();
// Ensure that both types are the same and are legal scalar fp types.
- if (N00Type != N10Type ||
- !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
- (Subtarget.hasSSE2() && N00Type == MVT::f64)))
+ if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
+ (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
+ (Subtarget.hasFP16() && N00Type == MVT::f16)))
return SDValue();
- unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
- SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
- return DAG.getBitcast(VT, FPLogic);
+ if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
+ unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
+ SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
+ return DAG.getBitcast(VT, FPLogic);
+ }
+
+ // The vector ISA for FP predicates is incomplete before AVX, so converting
+ // COMIS* to CMPS* may not be a win before AVX.
+ // TODO: Check types/predicates to see if they are available with SSE/SSE2.
+ if (!Subtarget.hasAVX() || VT != MVT::i1 || N0.getOpcode() != ISD::SETCC ||
+ !N0.hasOneUse() || !N1.hasOneUse())
+ return SDValue();
+
+ // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
+ // and vector logic:
+ // logic (setcc N00, N01), (setcc N10, N11) -->
+ // extelt (logic (setcc (s2v N00), (s2v N01)), (setcc (s2v N10), (s2v N11))), 0
+ unsigned NumElts = 128 / N00Type.getSizeInBits();
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
+ EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
+ SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
+ SDValue N01 = N0.getOperand(1);
+ SDValue N11 = N1.getOperand(1);
+ SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
+ SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
+ SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
+ SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
+ SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01,
+ cast<CondCodeSDNode>(N0.getOperand(2))->get());
+ SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11,
+ cast<CondCodeSDNode>(N1.getOperand(2))->get());
+ SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
}
// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
@@ -44613,12 +45950,40 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
- EVT VT0 = Op0.getValueType();
- EVT VT1 = Op1.getValueType();
-
- if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
+ EVT VT = Op0.getValueType();
+ if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
return SDValue();
+ // Try to convert an "is positive" signbit masking operation into arithmetic
+ // shift and "andn". This saves a materialization of a -1 vector constant.
+ // The "is negative" variant should be handled more generally because it only
+ // requires "and" rather than "andn":
+ // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
+ //
+ // This is limited to the original type to avoid producing even more bitcasts.
+ // If the bitcasts can't be eliminated, then it is unlikely that this fold
+ // will be profitable.
+ if (N->getValueType(0) == VT &&
+ supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRA)) {
+ SDValue X, Y;
+ if (Op1.hasOneUse() && Op1.getOpcode() == X86ISD::PCMPGT &&
+ isAllOnesOrAllOnesSplat(Op1.getOperand(1))) {
+ X = Op1.getOperand(0);
+ Y = Op0;
+ } else if (Op0.hasOneUse() && Op0.getOpcode() == X86ISD::PCMPGT &&
+ isAllOnesOrAllOnesSplat(Op0.getOperand(1))) {
+ X = Op0.getOperand(0);
+ Y = Op1;
+ }
+ if (X && Y) {
+ SDLoc DL(N);
+ SDValue Sra =
+ getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
+ VT.getScalarSizeInBits() - 1, DAG);
+ return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
+ }
+ }
+
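
A standalone sketch (not part of this patch) of the scalar identity behind the fold just above, shown for one 32-bit lane and assuming an arithmetic right shift of signed values as on x86: the all-ones mask produced by pcmpgt(X, -1) selects Y exactly when X is non-negative, which matches andn of the sign-replicated shift with Y. Helper names are illustrative only.

#include <cassert>
#include <cstdint>

static uint32_t maskIfPositiveCmp(int32_t X, uint32_t Y) {
  uint32_t Mask = (X > -1) ? 0xFFFFFFFFu : 0u; // pcmpgt X, -1
  return Mask & Y;                             // and (pcmpgt X, -1), Y
}

static uint32_t maskIfPositiveShift(int32_t X, uint32_t Y) {
  // vsrai X, 31: the arithmetic shift replicates the sign bit into every bit.
  uint32_t SignMask = (uint32_t)(X >> 31);
  return ~SignMask & Y;                        // pandn (vsrai X, 31), Y
}

int main() {
  for (int32_t X : {-5, -1, 0, 1, 12345})
    for (uint32_t Y : {0u, 0xDEADBEEFu, 0xFFFFFFFFu})
      assert(maskIfPositiveCmp(X, Y) == maskIfPositiveShift(X, Y));
  return 0;
}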
APInt SplatVal;
if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
!SplatVal.isMask())
@@ -44628,17 +45993,17 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
if (isBitwiseNot(Op0))
return SDValue();
- if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
+ if (!supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRL))
return SDValue();
- unsigned EltBitWidth = VT0.getScalarSizeInBits();
+ unsigned EltBitWidth = VT.getScalarSizeInBits();
if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
return SDValue();
SDLoc DL(N);
unsigned ShiftVal = SplatVal.countTrailingOnes();
SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
- SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
+ SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
return DAG.getBitcast(N->getValueType(0), Shift);
}
@@ -44881,16 +46246,16 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
return R;
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
+ return FPLogic;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
- if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
- return FPLogic;
-
- if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
+ if (SDValue R = combineAndNotIntoANDNP(N, DAG))
return R;
if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
@@ -44921,7 +46286,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
llvm::all_of(EltBits, [](const APInt &M) {
- return M.isNullValue() || M.isAllOnesValue();
+ return M.isZero() || M.isAllOnes();
})) {
unsigned NumElts = SrcVecVT.getVectorNumElements();
unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
@@ -44933,8 +46298,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (UndefElts[i])
continue;
int VecIdx = Scale * Idx + i;
- ShuffleMask[VecIdx] =
- EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
+ ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
}
if (SDValue Shuffle = combineX86ShufflesRecursively(
@@ -44956,7 +46320,8 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
MVT VT = N->getSimpleValueType(0);
- if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ if (!VT.isVector() || (EltSizeInBits % 8) != 0)
return SDValue();
SDValue N0 = peekThroughBitcasts(N->getOperand(0));
@@ -44966,9 +46331,7 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
// On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
// VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
- bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
- Subtarget.hasVLX();
- if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
+ if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
!N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
return SDValue();
@@ -44992,13 +46355,19 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
- if (UseVPTERNLOG) {
- // Emit a VPTERNLOG node directly.
- SDValue A = DAG.getBitcast(VT, N0.getOperand(1));
- SDValue B = DAG.getBitcast(VT, N0.getOperand(0));
- SDValue C = DAG.getBitcast(VT, N1.getOperand(0));
+ if (useVPTERNLOG(Subtarget, VT)) {
+ // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
+ // VPTERNLOG is only available for vXi32/vXi64 vector types.
+ MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64;
+ MVT OpVT =
+ MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
+ SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
+ SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
+ SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
- return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);
+ SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
+ DAG, Subtarget);
+ return DAG.getBitcast(VT, Res);
}
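
A standalone sketch (not part of this patch) of how the 0xCA immediate encodes the A?B:C bit-select: bit (A<<2 | B<<1 | C) of the VPTERNLOG immediate holds the truth-table result for that combination of input bits.

#include <cassert>
#include <cstdint>

int main() {
  uint8_t Imm = 0;
  for (unsigned A = 0; A <= 1; ++A)
    for (unsigned B = 0; B <= 1; ++B)
      for (unsigned C = 0; C <= 1; ++C) {
        unsigned Result = A ? B : C;              // the desired bit-select
        Imm |= Result << ((A << 2) | (B << 1) | C);
      }
  assert(Imm == 0xCA);
  return 0;
}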
SDValue X = N->getOperand(0);
@@ -45247,15 +46616,15 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
return R;
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
+ return FPLogic;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
- if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
- return FPLogic;
-
if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
return R;
@@ -45476,7 +46845,7 @@ static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
APInt SignedMax, SignedMin;
if (MatchPackUS) {
- SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
+ SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
SignedMin = APInt(NumSrcBits, 0);
} else {
SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
@@ -45641,6 +47010,11 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
});
};
+ auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
+ unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
+ return MaxActiveBits <= ScalarVT.getSizeInBits();
+ };
+
// Check if each element of the vector is right-shifted by one.
SDValue LHS = In.getOperand(0);
SDValue RHS = In.getOperand(1);
@@ -45659,23 +47033,25 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
};
- auto AVGSplitter = [&](SDValue Op0, SDValue Op1) {
+ auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
+ for (SDValue &Op : Ops)
+ if (Op.getValueType() != VT)
+ Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
// Pad to a power-of-2 vector, split+apply and extract the original vector.
unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
if (NumElemsPow2 != NumElems) {
- SmallVector<SDValue, 32> Ops0(NumElemsPow2, DAG.getUNDEF(ScalarVT));
- SmallVector<SDValue, 32> Ops1(NumElemsPow2, DAG.getUNDEF(ScalarVT));
- for (unsigned i = 0; i != NumElems; ++i) {
- SDValue Idx = DAG.getIntPtrConstant(i, DL);
- Ops0[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op0, Idx);
- Ops1[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op1, Idx);
+ for (SDValue &Op : Ops) {
+ SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
+ for (unsigned i = 0; i != NumElems; ++i) {
+ SDValue Idx = DAG.getIntPtrConstant(i, DL);
+ EltsOfOp[i] =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
+ }
+ Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
}
- Op0 = DAG.getBuildVector(Pow2VT, DL, Ops0);
- Op1 = DAG.getBuildVector(Pow2VT, DL, Ops1);
}
- SDValue Res =
- SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, {Op0, Op1}, AVGBuilder);
+ SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
if (NumElemsPow2 == NumElems)
return Res;
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
@@ -45685,14 +47061,12 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
// Take care of the case when one of the operands is a constant vector whose
// element is in the range [1, 256].
if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
- Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
- Operands[0].getOperand(0).getValueType() == VT) {
+ IsZExtLike(Operands[0])) {
// The pattern is detected. Subtract one from the constant vector, then
// demote it and emit X86ISD::AVG instruction.
SDValue VecOnes = DAG.getConstant(1, DL, InVT);
Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
- Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
- return AVGSplitter(Operands[0].getOperand(0), Operands[1]);
+ return AVGSplitter({Operands[0], Operands[1]});
}
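
A standalone sketch (not part of this patch) of the identity used just above for the constant case, assuming i8 elements: because PAVGB computes the rounding average (a + b + 1) >> 1 in a wider type, (x + c) >> 1 with c in [1, 256] equals pavg(x, c - 1), which is why the constant is decremented before emitting X86ISD::AVG. The helper name is illustrative only.

#include <cassert>
#include <cstdint>

// Scalar model of one PAVGB lane: rounding average computed without overflow.
static uint8_t pavgb(uint8_t A, uint8_t B) {
  return (uint8_t)(((unsigned)A + (unsigned)B + 1) >> 1);
}

int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned C = 1; C <= 256; ++C)
      assert((uint8_t)((X + C) >> 1) == pavgb((uint8_t)X, (uint8_t)(C - 1)));
  return 0;
}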
// Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
@@ -45731,15 +47105,12 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
// Check if Operands[0] and Operands[1] are results of type promotion.
for (int j = 0; j < 2; ++j)
- if (Operands[j].getValueType() != VT) {
- if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
- Operands[j].getOperand(0).getValueType() != VT)
+ if (Operands[j].getValueType() != VT)
+ if (!IsZExtLike(Operands[j]))
return SDValue();
- Operands[j] = Operands[j].getOperand(0);
- }
// The pattern is detected, emit X86ISD::AVG instruction(s).
- return AVGSplitter(Operands[0], Operands[1]);
+ return AVGSplitter({Operands[0], Operands[1]});
}
return SDValue();
@@ -46685,11 +48056,171 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Try to combine the following nodes
+// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
+// <i32 -2147483648[float -0.000000e+00]> 0
+// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
+// <(load 4 from constant-pool)> t0, t29
+// [t30: v16i32 = bitcast t27]
+// t6: v16i32 = xor t7, t27[t30]
+// t11: v16f32 = bitcast t6
+// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
+// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
+// t22: v16f32 = bitcast t7
+// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
+// t24: v32f16 = bitcast t23
+static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ int CombineOpcode =
+ N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
+ auto isConjugationConstant = [](const Constant *c) {
+ if (const auto *CI = dyn_cast<ConstantInt>(c)) {
+ APInt ConjugationInt32 = APInt(32, 0x80000000, true);
+ APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
+ switch (CI->getBitWidth()) {
+ case 16:
+ return false;
+ case 32:
+ return CI->getValue() == ConjugationInt32;
+ case 64:
+ return CI->getValue() == ConjugationInt64;
+ default:
+ llvm_unreachable("Unexpected bit width");
+ }
+ }
+ if (const auto *CF = dyn_cast<ConstantFP>(c))
+ return CF->isNegativeZeroValue();
+ return false;
+ };
+ auto combineConjugation = [&](SDValue &r) {
+ if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
+ SDValue XOR = LHS.getOperand(0);
+ if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
+ SDValue XORRHS = XOR.getOperand(1);
+ if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
+ XORRHS = XORRHS.getOperand(0);
+ if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
+ XORRHS.getOperand(1).getNumOperands()) {
+ ConstantPoolSDNode *CP =
+ dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
+ if (CP && isConjugationConstant(CP->getConstVal())) {
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N);
+ SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
+ SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
+ r = DAG.getBitcast(VT, FCMulC);
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+ };
+ SDValue Res;
+ if (combineConjugation(Res))
+ return Res;
+ std::swap(LHS, RHS);
+ if (combineConjugation(Res))
+ return Res;
+ return Res;
+}
+
+// Try to combine the following nodes:
+// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
+static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
+ return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ Flags.hasAllowContract();
+ };
+
+ auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
+ return DAG.getTarget().Options.NoSignedZerosFPMath ||
+ Flags.hasNoSignedZeros();
+ };
+ auto IsVectorAllNegativeZero = [](const SDNode *N) {
+ if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
+ return false;
+ assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
+ "Unexpected vector type!");
+ if (ConstantPoolSDNode *CP =
+ dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
+ APInt AI = APInt(32, 0x80008000, true);
+ if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
+ return CI->getValue() == AI;
+ if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
+ return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
+ }
+ return false;
+ };
+
+ if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
+ !AllowContract(N->getFlags()))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ bool IsConj;
+ SDValue FAddOp1, MulOp0, MulOp1;
+ auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
+ &IsVectorAllNegativeZero,
+ &HasNoSignedZero](SDValue N) -> bool {
+ if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
+ return false;
+ SDValue Op0 = N.getOperand(0);
+ unsigned Opcode = Op0.getOpcode();
+ if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
+ if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
+ MulOp0 = Op0.getOperand(0);
+ MulOp1 = Op0.getOperand(1);
+ IsConj = Opcode == X86ISD::VFCMULC;
+ return true;
+ }
+ if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
+ ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
+ HasNoSignedZero(Op0->getFlags())) ||
+ IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
+ MulOp0 = Op0.getOperand(0);
+ MulOp1 = Op0.getOperand(1);
+ IsConj = Opcode == X86ISD::VFCMADDC;
+ return true;
+ }
+ }
+ return false;
+ };
+
+ if (GetCFmulFrom(LHS))
+ FAddOp1 = RHS;
+ else if (GetCFmulFrom(RHS))
+ FAddOp1 = LHS;
+ else
+ return SDValue();
+
+ MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
+ FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
+ unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
+ // FIXME: How do we handle the case where the fast-math flags of FADD differ
+ // from CFMUL's?
+ SDValue CFmul =
+ DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
+ return DAG.getBitcast(VT, CFmul);
+}
+
/// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
return HOp;
+
+ if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
+ return COp;
+
return SDValue();
}
@@ -46922,7 +48453,7 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
// SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
- In, APInt::getAllOnesValue(VT.getVectorNumElements()))) {
+ In, APInt::getAllOnes(VT.getVectorNumElements()))) {
if (*ShAmt == MinSignBits) {
SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
@@ -47178,7 +48709,7 @@ static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
+ APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
return SDValue(N, 0);
@@ -47498,6 +49029,9 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
return R;
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
+ return FPLogic;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -47546,9 +49080,6 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
}
}
- if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
- return FPLogic;
-
return combineFneg(N, DAG, DCI, Subtarget);
}
@@ -47562,7 +49093,7 @@ static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
// Simplify the inputs.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- APInt DemandedMask(APInt::getAllOnesValue(NumBits));
+ APInt DemandedMask(APInt::getAllOnes(NumBits));
if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
return SDValue(N, 0);
@@ -47704,6 +49235,7 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
(Subtarget.hasSSE2() && VT == MVT::f64) ||
+ (Subtarget.hasFP16() && VT == MVT::f16) ||
(VT.isVector() && TLI.isTypeLegal(VT))))
return SDValue();
@@ -47765,7 +49297,7 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt KnownUndef, KnownZero;
- APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+ APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
KnownZero, DCI))
return SDValue(N, 0);
@@ -48265,6 +49797,9 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
return SDValue();
+ // We don't have a CMPP instruction for vXf16.
+ if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
+ return SDValue();
// We can only do this if the vector size is 256 bits or less.
unsigned Size = VT.getSizeInBits();
if (Size > 256 && Subtarget.useAVX512Regs())
@@ -48366,7 +49901,9 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
}
EVT ScalarVT = VT.getScalarType();
- if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
+ if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
+ !Subtarget.hasAnyFMA()) &&
+ !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
return SDValue();
auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
@@ -48873,7 +50410,7 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
// Simplify the inputs.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- APInt DemandedMask(APInt::getAllOnesValue(NumBits));
+ APInt DemandedMask(APInt::getAllOnes(NumBits));
if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
return SDValue(N, 0);
@@ -48881,9 +50418,44 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
}
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
+ SDValue BasePtr = MemOp->getBasePtr();
+ SDValue Index = MemOp->getIndex();
+ SDValue Scale = MemOp->getScale();
+ SDValue Mask = MemOp->getMask();
+
+ // Attempt to fold a constant scaling of the index into the scale value
+ // directly. For smaller indices, the implicit sext is performed BEFORE the
+ // scale, preventing this fold under most circumstances.
+ // TODO: Move this into X86DAGToDAGISel::matchVectorAddressRecursively?
+ if ((Index.getOpcode() == X86ISD::VSHLI ||
+ (Index.getOpcode() == ISD::ADD &&
+ Index.getOperand(0) == Index.getOperand(1))) &&
+ isa<ConstantSDNode>(Scale) &&
+ BasePtr.getScalarValueSizeInBits() == Index.getScalarValueSizeInBits()) {
+ unsigned ShiftAmt =
+ Index.getOpcode() == ISD::ADD ? 1 : Index.getConstantOperandVal(1);
+ uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
+ uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
+ if (isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8) {
+ SDValue NewIndex = Index.getOperand(0);
+ SDValue NewScale =
+ DAG.getTargetConstant(NewScaleAmt, SDLoc(N), Scale.getValueType());
+ if (N->getOpcode() == X86ISD::MGATHER)
+ return getAVX2GatherNode(N->getOpcode(), SDValue(N, 0), DAG,
+ MemOp->getOperand(1), Mask,
+ MemOp->getBasePtr(), NewIndex, NewScale,
+ MemOp->getChain(), Subtarget);
+ if (N->getOpcode() == X86ISD::MSCATTER)
+ return getScatterNode(N->getOpcode(), SDValue(N, 0), DAG,
+ MemOp->getOperand(1), Mask, MemOp->getBasePtr(),
+ NewIndex, NewScale, MemOp->getChain(), Subtarget);
+ }
+ }
+
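
A standalone sketch (not part of this patch) of the address identity used above, with illustrative helper names: shifting the index left by a constant and then scaling is the same as scaling by Scale << ShiftAmt, which is only legal to fold when the result is still one of the hardware scales {1, 2, 4, 8} and the index is already pointer-width, so no sign-extension sits between the shift and the scale.

#include <cassert>
#include <cstdint>

static uint64_t addrBefore(uint64_t Base, uint64_t Index, unsigned ShiftAmt,
                           uint64_t Scale) {
  return Base + (Index << ShiftAmt) * Scale;
}

static uint64_t addrAfter(uint64_t Base, uint64_t Index, unsigned ShiftAmt,
                          uint64_t Scale) {
  uint64_t NewScale = Scale * (1ULL << ShiftAmt); // must stay in {1,2,4,8}
  return Base + Index * NewScale;
}

int main() {
  assert(addrBefore(0x1000, 5, 1, 4) == addrAfter(0x1000, 5, 1, 4)); // 4 -> 8
  assert(addrBefore(0x2000, 9, 2, 1) == addrAfter(0x2000, 9, 2, 1)); // 1 -> 4
  return 0;
}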
// With vector masks we only demand the upper bit of the mask.
- SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
@@ -48962,6 +50534,48 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
}
}
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ // Try to move splat constant adders from the index operand to the base
+ // pointer operand, taking care to multiply by the scale. We can only do
+ // this when the index element type is the same as the pointer type;
+ // otherwise we would need to be sure the math doesn't wrap before the scale.
+ if (Index.getOpcode() == ISD::ADD &&
+ Index.getValueType().getVectorElementType() == PtrVT &&
+ isa<ConstantSDNode>(Scale)) {
+ uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
+ BitVector UndefElts;
+ if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
+ // FIXME: Allow non-constant?
+ if (UndefElts.none()) {
+ // Apply the scale.
+ APInt Adder = C->getAPIntValue() * ScaleAmt;
+ // Add it to the existing base.
+ Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
+ DAG.getConstant(Adder, DL, PtrVT));
+ Index = Index.getOperand(0);
+ return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
+ }
+ }
+
+ // It's also possible that the base is just a constant. In that case, just
+ // replace it with 0 and move the displacement into the index.
+ if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
+ isOneConstant(Scale)) {
+ SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
+ // Combine the constant build_vector and the constant base.
+ Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
+ Index.getOperand(1), Splat);
+ // Add to the LHS of the original Index add.
+ Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
+ Index.getOperand(0), Splat);
+ Base = DAG.getConstant(0, DL, Base.getValueType());
+ return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
+ }
+ }
+ }
+
if (DCI.isBeforeLegalizeOps()) {
unsigned IndexWidth = Index.getScalarValueSizeInBits();
@@ -49120,10 +50734,31 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
+ // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
+ // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
+ // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
+ if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
+ unsigned ScalarSize = InVT.getScalarSizeInBits();
+ if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
+ return SDValue();
+ SDLoc dl(N);
+ EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
+ ScalarSize < 16 ? MVT::i16
+ : ScalarSize < 32 ? MVT::i32
+ : MVT::i64,
+ InVT.getVectorNumElements());
+ SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
+ {N->getOperand(0), P});
+ return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
+ }
+
// UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
// UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
// UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
- if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
+ if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
+ VT.getScalarType() != MVT::f16) {
SDLoc dl(N);
EVT DstVT = InVT.changeVectorElementType(MVT::i32);
SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
@@ -49162,10 +50797,31 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
+ // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
+ // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
+ // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
+ if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
+ unsigned ScalarSize = InVT.getScalarSizeInBits();
+ if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
+ return SDValue();
+ SDLoc dl(N);
+ EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
+ ScalarSize < 16 ? MVT::i16
+ : ScalarSize < 32 ? MVT::i32
+ : MVT::i64,
+ InVT.getVectorNumElements());
+ SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+ {N->getOperand(0), P});
+ return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
+ }
+
// SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
// SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
// SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
- if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
+ if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
+ VT.getScalarType() != MVT::f16) {
SDLoc dl(N);
EVT DstVT = InVT.changeVectorElementType(MVT::i32);
SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
@@ -49244,10 +50900,7 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
static bool needCarryOrOverflowFlag(SDValue Flags) {
assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
- for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
- UI != UE; ++UI) {
- SDNode *User = *UI;
-
+ for (const SDNode *User : Flags->uses()) {
X86::CondCode CC;
switch (User->getOpcode()) {
default:
@@ -49282,10 +50935,7 @@ static bool needCarryOrOverflowFlag(SDValue Flags) {
static bool onlyZeroFlagUsed(SDValue Flags) {
assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
- for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
- UI != UE; ++UI) {
- SDNode *User = *UI;
-
+ for (const SDNode *User : Flags->uses()) {
unsigned CCOpNo;
switch (User->getOpcode()) {
default:
@@ -49534,8 +51184,8 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
// the general case below.
auto *ConstantX = dyn_cast<ConstantSDNode>(X);
if (ConstantX) {
- if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
- (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
+ if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
+ (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
// This is a complicated way to get -1 or 0 from the carry flag:
// -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
// 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
@@ -49544,8 +51194,8 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
Y.getOperand(1));
}
- if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
- (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
+ if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
+ (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
SDValue EFLAGS = Y->getOperand(1);
if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
@@ -49643,8 +51293,8 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
// fake operands:
// 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
// -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
- if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
- (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
+ if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
+ (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
SDValue Zero = DAG.getConstant(0, DL, ZVT);
SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
@@ -49657,8 +51307,8 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
// with fake operands:
// 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
// -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
- if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
- (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
+ if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
+ (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
SDValue One = DAG.getConstant(1, DL, ZVT);
SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
@@ -49932,6 +51582,50 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
PMADDBuilder);
}
+// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
+// If the upper element in each pair of both VPMADDWD operands is zero then we
+// can merge the operand elements and use the implicit add of VPMADDWD.
+// TODO: Add support for VPMADDUBSW (which isn't commutable).
+static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
+ const SDLoc &DL, EVT VT) {
+ if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
+ return SDValue();
+
+ // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
+ if (VT.getSizeInBits() > 128)
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT OpVT = N0.getOperand(0).getSimpleValueType();
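+ // Each VPMADDWD operand has 2 * NumElts i16 elements; splatting the 2-bit
+ // pattern 0b10 marks the upper (odd-index) element of each pair as demanded.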
+ APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
+ APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
+
+ bool Op0HiZero =
+ DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
+ DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
+ bool Op1HiZero =
+ DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
+ DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
+
+ // TODO: Check for zero lower elements once we have actual codegen that
+ // creates them.
+ if (!Op0HiZero || !Op1HiZero)
+ return SDValue();
+
+ // Create a shuffle mask packing the lower elements from each VPMADDWD.
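+ // For example, with v4i32 this builds the mask <0,8,2,10,4,12,6,14>,
+ // interleaving the even (lower) i16 lanes of the two operand vectors.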
+ SmallVector<int> Mask;
+ for (int i = 0; i != (int)NumElts; ++i) {
+ Mask.push_back(2 * i);
+ Mask.push_back(2 * (i + NumElts));
+ }
+
+ SDValue LHS =
+ DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
+ SDValue RHS =
+ DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
+ return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
+}
+
/// CMOV of constants requires materializing constant operands in registers.
/// Try to fold those constants into an 'add' instruction to reduce instruction
/// count. We do this with CMOV rather than the generic 'select' because there are
@@ -49961,11 +51655,34 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) {
if (!isSuitableCmov(Cmov))
return SDValue();
- // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
EVT VT = N->getValueType(0);
SDLoc DL(N);
SDValue FalseOp = Cmov.getOperand(0);
SDValue TrueOp = Cmov.getOperand(1);
+
+ // We will push the add through the select, but we can potentially do better
+ // if we know there is another add in the sequence and this is pointer math.
+ // In that case, we can absorb an add into the trailing memory op and avoid
+ // a 3-operand LEA which is likely slower than a 2-operand LEA.
+ // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
+ if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
+ !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
+ all_of(N->uses(), [&](SDNode *Use) {
+ auto *MemNode = dyn_cast<MemSDNode>(Use);
+ return MemNode && MemNode->getBasePtr().getNode() == N;
+ })) {
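+ // All uses of N are loads/stores addressed through N, so the trailing add
+ // can be absorbed into their memory operands.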
+ // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
+ // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
+ // it is possible that choosing op1 might be better.
+ SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
+ FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
+ TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
+ Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
+ Cmov.getOperand(2), Cmov.getOperand(3));
+ return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
+ }
+
+ // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
@@ -49978,13 +51695,16 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
+ SDLoc DL(N);
if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG))
return Select;
- if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
+ if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
+ return MAdd;
+ if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
return MAdd;
- if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
+ if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
return MAdd;
// Try to synthesize horizontal adds from adds of shuffles.
@@ -50001,7 +51721,6 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
- SDLoc DL(N);
SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
}
@@ -50009,7 +51728,6 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
- SDLoc DL(N);
SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
}
@@ -50018,6 +51736,47 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
return combineAddOrSubToADCOrSBB(N, DAG);
}
+// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
+// condition comes from the subtract node that produced -X. This matches the
+// cmov expansion for absolute value. By swapping the operands we convert abs
+// to nabs.
+static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
+ return SDValue();
+
+ X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
+ if (CC != X86::COND_S && CC != X86::COND_NS)
+ return SDValue();
+
+ // Condition should come from a negate operation.
+ SDValue Cond = N1.getOperand(3);
+ if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
+ return SDValue();
+ assert(Cond.getResNo() == 1 && "Unexpected result number");
+
+ // Get the X and -X from the negate.
+ SDValue NegX = Cond.getValue(0);
+ SDValue X = Cond.getOperand(1);
+
+ SDValue FalseOp = N1.getOperand(0);
+ SDValue TrueOp = N1.getOperand(1);
+
+ // Cmov operands should be X and NegX. Order doesn't matter.
+ if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
+ return SDValue();
+
+ // Build a new CMOV with the operands swapped.
+ SDLoc DL(N);
+ MVT VT = N->getSimpleValueType(0);
+ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
+ N1.getOperand(2), Cond);
+ // Convert sub to add.
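+ // The swapped arms are X and -X, so the new CMOV is the negation of the old
+ // one and (sub N0, OldCmov) == (add N0, NewCmov).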
+ return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
+}
+
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -50049,6 +51808,9 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
}
+ if (SDValue V = combineSubABS(N, DAG))
+ return V;
+
// Try to synthesize horizontal subs from subs of shuffles.
if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
return V;
@@ -50099,43 +51861,30 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
if (Op0.getOpcode() == X86ISD::VBROADCAST)
return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
- // If this scalar/subvector broadcast_load is inserted into both halves, use
- // a larger broadcast_load. Update other uses to use an extracted subvector.
- if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
+ // If this simple subvector load or scalar/subvector broadcast_load is
+ // inserted into both halves, use a larger broadcast_load. Update other uses
+ // to use an extracted subvector.
+ if (ISD::isNormalLoad(Op0.getNode()) ||
+ Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
- auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
- SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
- SDValue BcastLd = DAG.getMemIntrinsicNode(Op0.getOpcode(), DL, Tys, Ops,
- MemIntr->getMemoryVT(),
- MemIntr->getMemOperand());
- DAG.ReplaceAllUsesOfValueWith(
- Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
- DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
- return BcastLd;
- }
-
- // If this is a simple subvector load repeated across multiple lanes, then
- // broadcast the load. Update other uses to use an extracted subvector.
- if (auto *Ld = dyn_cast<LoadSDNode>(Op0)) {
- if (Ld->isSimple() && !Ld->isNonTemporal() &&
- Ld->getExtensionType() == ISD::NON_EXTLOAD) {
- SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
- SDValue BcastLd =
- DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops,
- Ld->getMemoryVT(), Ld->getMemOperand());
- DAG.ReplaceAllUsesOfValueWith(
- Op0,
- extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
+ auto *Mem = cast<MemSDNode>(Op0);
+ unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
+ ? X86ISD::VBROADCAST_LOAD
+ : X86ISD::SUBV_BROADCAST_LOAD;
+ if (SDValue BcastLd =
+ getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
+ SDValue BcastSrc =
+ extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
+ DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
return BcastLd;
}
}
// concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
- (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
+ (Subtarget.hasAVX2() ||
+ X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
+ VT.getScalarType(), Subtarget)))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
Op0.getOperand(0),
@@ -50144,7 +51893,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
// concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
(Subtarget.hasAVX2() ||
- (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
+ (EltSizeInBits >= 32 &&
+ X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
Op0.getOperand(0).getValueType() == VT.getScalarType())
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
@@ -50773,7 +52523,7 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
// TODO: SimplifyDemandedBits instead?
if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
- if (C->getAPIntValue().isOneValue())
+ if (C->getAPIntValue().isOne())
return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
Src.getOperand(0));
@@ -50782,7 +52532,7 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
- if (C->isNullValue())
+ if (C->isZero())
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
Src.getOperand(1));
@@ -50851,7 +52601,7 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
// PMULDQ/PMULUDQ only use the lower 32 bits from each vector element.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
return SDValue(N, 0);
// If the input is an extend_invec and the SimplifyDemandedBits call didn't
@@ -50885,6 +52635,29 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Simplify VPMADDUBSW/VPMADDWD operations.
+static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // Multiply by zero.
+ // Don't return LHS/RHS as it may contain UNDEFs.
+ if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
+ ISD::isBuildVectorAllZeros(RHS.getNode()))
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ APInt KnownUndef, KnownZero;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
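+ // Let the generic demanded-vector-elements logic simplify the operands.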
+ if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
+ KnownZero, DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -50950,7 +52723,7 @@ static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+ APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
KnownZero, DCI))
return SDValue(N, 0);
@@ -50988,6 +52761,9 @@ static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
return SDValue();
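+ // With AVX512-FP16 these f16 conversions are handled natively, so skip the
+ // F16C-based combine.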
+ if (Subtarget.hasFP16())
+ return SDValue();
+
bool IsStrict = N->isStrictFPOpcode();
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
@@ -51096,6 +52872,9 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
return SDValue();
+ if (Subtarget.hasFP16())
+ return SDValue();
+
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
EVT SrcVT = Src.getValueType();
@@ -51156,8 +52935,7 @@ static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.SimplifyDemandedBits(SDValue(N, 0),
- APInt::getAllOnesValue(NumBits), DCI))
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
return SDValue(N, 0);
return SDValue();
@@ -51215,6 +52993,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
return combineUIntToFP(N, DAG, Subtarget);
case ISD::FADD:
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
+ case X86ISD::VFCMULC:
+ case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
@@ -51289,6 +53069,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::MOVDDUP:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
+ case X86ISD::MOVSH:
case X86ISD::VBROADCAST:
case X86ISD::VPPERM:
case X86ISD::VPERMI:
@@ -51319,13 +53100,16 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
case X86ISD::MGATHER:
- case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
+ case X86ISD::MSCATTER:
+ return combineX86GatherScatter(N, DAG, DCI, Subtarget);
case ISD::MGATHER:
case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
+ case X86ISD::VPMADDUBSW:
+ case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
case X86ISD::KSHIFTL:
case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
@@ -51451,7 +53235,7 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
case ISD::SRL: {
SDValue N0 = Op.getOperand(0);
// Look out for (store (shl (load), x)).
- if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
+ if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
return false;
break;
}
@@ -51466,11 +53250,11 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
// Avoid disabling potential load folding opportunities.
- if (MayFoldLoad(N1) &&
+ if (X86::mayFoldLoad(N1, Subtarget) &&
(!Commute || !isa<ConstantSDNode>(N0) ||
(Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
return false;
- if (MayFoldLoad(N0) &&
+ if (X86::mayFoldLoad(N0, Subtarget) &&
((Commute && !isa<ConstantSDNode>(N1)) ||
(Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
return false;
@@ -51510,13 +53294,13 @@ static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
- if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
- std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
- std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
+ if (llvm::is_contained(AsmPieces, "~{cc}") &&
+ llvm::is_contained(AsmPieces, "~{flags}") &&
+ llvm::is_contained(AsmPieces, "~{fpsr}")) {
if (AsmPieces.size() == 3)
return true;
- else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
+ else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
return true;
}
}
@@ -52041,7 +53825,8 @@ static bool isGRClass(const TargetRegisterClass &RC) {
/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variant.
static bool isFRClass(const TargetRegisterClass &RC) {
- return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
+ return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
+ RC.hasSuperClassEq(&X86::FR32XRegClass) ||
RC.hasSuperClassEq(&X86::FR64XRegClass) ||
RC.hasSuperClassEq(&X86::VR128XRegClass) ||
RC.hasSuperClassEq(&X86::VR256XRegClass) ||
@@ -52166,6 +53951,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
switch (VT.SimpleTy) {
default: break;
// Scalar SSE types.
+ case MVT::f16:
+ if (VConstraint && Subtarget.hasFP16())
+ return std::make_pair(0U, &X86::FR16XRegClass);
+ break;
case MVT::f32:
case MVT::i32:
if (VConstraint && Subtarget.hasVLX())
@@ -52184,6 +53973,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
}
break;
// Vector types and fp128.
+ case MVT::v8f16:
+ if (!Subtarget.hasFP16())
+ break;
+ LLVM_FALLTHROUGH;
case MVT::f128:
case MVT::v16i8:
case MVT::v8i16:
@@ -52195,6 +53988,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &X86::VR128XRegClass);
return std::make_pair(0U, &X86::VR128RegClass);
// AVX types.
+ case MVT::v16f16:
+ if (!Subtarget.hasFP16())
+ break;
+ LLVM_FALLTHROUGH;
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
@@ -52206,6 +54003,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (Subtarget.hasAVX())
return std::make_pair(0U, &X86::VR256RegClass);
break;
+ case MVT::v32f16:
+ if (!Subtarget.hasFP16())
+ break;
+ LLVM_FALLTHROUGH;
case MVT::v64i8:
case MVT::v32i16:
case MVT::v8f64:
@@ -52235,12 +54036,20 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
switch (VT.SimpleTy) {
default: break;
// Scalar SSE types.
+ case MVT::f16:
+ if (!Subtarget.hasFP16())
+ break;
+ return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
case MVT::f32:
case MVT::i32:
return std::make_pair(X86::XMM0, &X86::FR32RegClass);
case MVT::f64:
case MVT::i64:
return std::make_pair(X86::XMM0, &X86::FR64RegClass);
+ case MVT::v8f16:
+ if (!Subtarget.hasFP16())
+ break;
+ LLVM_FALLTHROUGH;
case MVT::f128:
case MVT::v16i8:
case MVT::v8i16:
@@ -52250,6 +54059,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case MVT::v2f64:
return std::make_pair(X86::XMM0, &X86::VR128RegClass);
// AVX types.
+ case MVT::v16f16:
+ if (!Subtarget.hasFP16())
+ break;
+ LLVM_FALLTHROUGH;
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
@@ -52259,6 +54072,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (Subtarget.hasAVX())
return std::make_pair(X86::YMM0, &X86::VR256RegClass);
break;
+ case MVT::v32f16:
+ if (!Subtarget.hasFP16())
+ break;
+ LLVM_FALLTHROUGH;
case MVT::v64i8:
case MVT::v32i16:
case MVT::v8f64:
@@ -52416,7 +54233,9 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// find, ignoring the required type.
// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
- if (VT == MVT::f32 || VT == MVT::i32)
+ if (VT == MVT::f16)
+ Res.second = &X86::FR16XRegClass;
+ else if (VT == MVT::f32 || VT == MVT::i32)
Res.second = &X86::FR32XRegClass;
else if (VT == MVT::f64 || VT == MVT::i64)
Res.second = &X86::FR64XRegClass;
@@ -52489,7 +54308,7 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
- bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
+ bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
return OptSize && !VT.isVector();
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 869857bcc0d6..6805cb75f0f2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -14,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
#define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetLowering.h"
namespace llvm {
@@ -460,6 +461,7 @@ namespace llvm {
MOVHLPS,
MOVSD,
MOVSS,
+ MOVSH,
UNPCKL,
UNPCKH,
VPERMILPV,
@@ -564,6 +566,27 @@ namespace llvm {
FMADDSUB_RND,
FMSUBADD_RND,
+ // AVX512-FP16 complex addition and multiplication.
+ VFMADDC,
+ VFMADDC_RND,
+ VFCMADDC,
+ VFCMADDC_RND,
+
+ VFMULC,
+ VFMULC_RND,
+ VFCMULC,
+ VFCMULC_RND,
+
+ VFMADDCSH,
+ VFMADDCSH_RND,
+ VFCMADDCSH,
+ VFCMADDCSH_RND,
+
+ VFMULCSH,
+ VFMULCSH_RND,
+ VFCMULCSH,
+ VFCMULCSH_RND,
+
// Compress and expand.
COMPRESS,
EXPAND,
@@ -627,12 +650,8 @@ namespace llvm {
// packed single precision.
DPBF16PS,
- // Save xmm argument registers to the stack, according to %al. An operator
- // is needed so that this can be expanded with control flow.
- VASTART_SAVE_XMM_REGS,
-
- // Windows's _chkstk call to do stack probing.
- WIN_ALLOCA,
+ // A stack-checking function call. On Windows, it's the _chkstk call.
+ DYN_ALLOCA,
// For allocating variable amounts of stack space when using
// segmented stacks. Check if the current stacklet has enough space, and
@@ -848,6 +867,10 @@ namespace llvm {
AESENCWIDE256KL,
AESDECWIDE256KL,
+ // Save xmm argument registers to the stack, according to %al. An operator
+ // is needed so that this can be expanded with control flow.
+ VASTART_SAVE_XMM_REGS,
+
// WARNING: Do not add anything in the end unless you want the node to
// have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
// opcodes will be treated as target memory ops!
@@ -888,6 +911,25 @@ namespace llvm {
/// as zero if AllowPartialUndefs is set, else we fail and return false.
bool isConstantSplat(SDValue Op, APInt &SplatVal,
bool AllowPartialUndefs = true);
+
+ /// Check if Op is a load operation that could be folded into some other x86
+ /// instruction as a memory operand. Example: vpaddd (%rdi), %xmm0, %xmm0.
+ bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
+ bool AssumeSingleUse = false);
+
+ /// Check if Op is a load operation that could be folded into a vector splat
+ /// instruction as a memory operand. Example: vbroadcastss 16(%rdi), %xmm2.
+ bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
+ const X86Subtarget &Subtarget,
+ bool AssumeSingleUse = false);
+
+ /// Check if Op is a value that could be used to fold a store into some
+ /// other x86 instruction as a memory operand. Ex: pextrb $0, %xmm0, (%rdi).
+ bool mayFoldIntoStore(SDValue Op);
+
+ /// Check if Op is an operation that could be folded into a zero extend x86
+ /// instruction.
+ bool mayFoldIntoZeroExtend(SDValue Op);
} // end namespace X86
//===--------------------------------------------------------------------===//
@@ -923,7 +965,7 @@ namespace llvm {
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest are at
/// 4-byte boundaries.
- unsigned getByValTypeAlignment(Type *Ty,
+ uint64_t getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const override;
EVT getOptimalMemOpType(const MemOp &Op,
@@ -989,7 +1031,7 @@ namespace llvm {
}
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
- const SelectionDAG &DAG) const override;
+ const MachineFunction &MF) const override;
bool isCheapToSpeculateCttz() const override;
@@ -998,7 +1040,8 @@ namespace llvm {
bool isCtlzFast() const override;
bool hasBitPreservingFPLogic(EVT VT) const override {
- return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
+ return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() ||
+ (VT == MVT::f16 && X86ScalarSSEf16);
}
bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
@@ -1282,7 +1325,8 @@ namespace llvm {
/// register, not on the X87 floating point stack.
bool isScalarFPTypeInSSEReg(EVT VT) const {
return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
- (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
+ (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1
+ (VT == MVT::f16 && X86ScalarSSEf16); // f16 is when AVX512FP16
}
/// Returns true if it is beneficial to convert a load of a constant
@@ -1442,6 +1486,7 @@ namespace llvm {
/// When SSE2 is available, use it for f64 operations.
bool X86ScalarSSEf32;
bool X86ScalarSSEf64;
+ bool X86ScalarSSEf16;
/// A list of legal FP immediates.
std::vector<APFloat> LegalFPImmediates;
@@ -1472,16 +1517,11 @@ namespace llvm {
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
- bool IsEligibleForTailCallOptimization(SDValue Callee,
- CallingConv::ID CalleeCC,
- bool isVarArg,
- bool isCalleeStructRet,
- bool isCallerStructRet,
- Type *RetTy,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SelectionDAG& DAG) const;
+ bool IsEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleeStackStructRet,
+ bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
SDValue Chain, bool IsTailCall,
bool Is64Bit, int FPDiff,
@@ -1540,6 +1580,9 @@ namespace llvm {
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerWin64_FP_TO_INT128(SDValue Op, SelectionDAG &DAG,
+ SDValue &Chain) const;
+ SDValue LowerWin64_INT128_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
index 85410c54a4d2..732b2b1a5ada 100644
--- a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
+++ b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
@@ -92,7 +92,7 @@ static bool IsCallReturnTwice(llvm::MachineOperand &MOp) {
if (!CalleeFn)
return false;
AttributeList Attrs = CalleeFn->getAttributes();
- return Attrs.hasFnAttribute(Attribute::ReturnsTwice);
+ return Attrs.hasFnAttr(Attribute::ReturnsTwice);
}
bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
diff --git a/llvm/lib/Target/X86/X86IndirectThunks.cpp b/llvm/lib/Target/X86/X86IndirectThunks.cpp
index 3d96d198b409..e08b4b7c03c6 100644
--- a/llvm/lib/Target/X86/X86IndirectThunks.cpp
+++ b/llvm/lib/Target/X86/X86IndirectThunks.cpp
@@ -212,7 +212,7 @@ void RetpolineThunkInserter::populateThunk(MachineFunction &MF) {
MF.push_back(CallTarget);
const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
- const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL;
+ const unsigned RetOpc = Is64Bit ? X86::RET64 : X86::RET32;
Entry->addLiveIn(ThunkReg);
BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addSym(TargetSym);
diff --git a/llvm/lib/Target/X86/X86InsertWait.cpp b/llvm/lib/Target/X86/X86InsertWait.cpp
index 56d2709f5937..69a3d32a9314 100644
--- a/llvm/lib/Target/X86/X86InsertWait.cpp
+++ b/llvm/lib/Target/X86/X86InsertWait.cpp
@@ -55,23 +55,6 @@ char WaitInsert::ID = 0;
FunctionPass *llvm::createX86InsertX87waitPass() { return new WaitInsert(); }
-/// Return true if the Reg is X87 register.
-static bool isX87Reg(unsigned Reg) {
- return (Reg == X86::FPCW || Reg == X86::FPSW ||
- (Reg >= X86::ST0 && Reg <= X86::ST7));
-}
-
-/// check if the instruction is X87 instruction
-static bool isX87Instruction(MachineInstr &MI) {
- for (const MachineOperand &MO : MI.operands()) {
- if (!MO.isReg())
- continue;
- if (isX87Reg(MO.getReg()))
- return true;
- }
- return false;
-}
-
static bool isX87ControlInstruction(MachineInstr &MI) {
switch (MI.getOpcode()) {
case X86::FNINIT:
@@ -121,7 +104,7 @@ bool WaitInsert::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF) {
for (MachineBasicBlock::iterator MI = MBB.begin(); MI != MBB.end(); ++MI) {
// Skip non-X87 instructions.
- if (!isX87Instruction(*MI))
+ if (!X86::isX87Instruction(*MI))
continue;
// If the instruction neither has a float exception nor is
// a load/store instruction, or the instruction is an x87 control
@@ -132,7 +115,7 @@ bool WaitInsert::runOnMachineFunction(MachineFunction &MF) {
// If the following instruction is an X87 instruction and isn't an X87
// non-waiting control instruction, we can omit inserting a wait instruction.
MachineBasicBlock::iterator AfterMI = std::next(MI);
- if (AfterMI != MBB.end() && isX87Instruction(*AfterMI) &&
+ if (AfterMI != MBB.end() && X86::isX87Instruction(*AfterMI) &&
!isX87NonWaitingControlInstruction(*AfterMI))
continue;
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index 7d9466f0d181..ff8710634e89 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -239,7 +239,7 @@ static Value *simplifyX86immShift(const IntrinsicInst &II,
KnownBits KnownUpperBits = llvm::computeKnownBits(
Amt, DemandedUpper, II.getModule()->getDataLayout());
if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
- (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) {
+ (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
SmallVector<int, 16> ZeroSplat(VWidth, 0);
Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
@@ -269,7 +269,7 @@ static Value *simplifyX86immShift(const IntrinsicInst &II,
}
// If shift-by-zero then just return the original value.
- if (Count.isNullValue())
+ if (Count.isZero())
return Vec;
// Handle cases when Shift >= BitWidth.
@@ -476,7 +476,7 @@ static Value *simplifyX86pack(IntrinsicInst &II,
// PACKUS: Truncate signed value with unsigned saturation.
// Source values less than zero are saturated to zero.
// Source values greater than dst maxuint are saturated to maxuint.
- MinValue = APInt::getNullValue(SrcScalarSizeInBits);
+ MinValue = APInt::getZero(SrcScalarSizeInBits);
MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
}
@@ -1764,7 +1764,7 @@ Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
// we know that DemandedMask is non-zero already.
APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
Type *VTy = II.getType();
- if (DemandedElts.isNullValue()) {
+ if (DemandedElts.isZero()) {
return ConstantInt::getNullValue(VTy);
}
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index dd61d91c3a62..8aee96e1c504 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -44,8 +44,9 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
// It is a little bit complex for scalar types, where NumElts = 1.
// In this case we build v8f16, v4f32 or v2f64
string VTName = "v" # !if (!eq (NumElts, 1),
+ !if (!eq (EltVT.Size, 16), 8,
!if (!eq (EltVT.Size, 32), 4,
- !if (!eq (EltVT.Size, 64), 2, NumElts)), NumElts) # EltVT;
+ !if (!eq (EltVT.Size, 64), 2, NumElts))), NumElts) # EltVT;
// The vector VT.
ValueType VT = !cast<ValueType>(VTName);
@@ -65,8 +66,9 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
X86MemOperand MemOp = !cast<X86MemOperand>(TypeVariantName # Size # "mem");
X86MemOperand ScalarMemOp = !cast<X86MemOperand>(EltVT # "mem");
// FP scalar memory operand for intrinsics - shmem/ssmem/sdmem.
- Operand IntScalarMemOp = !if (!eq (EltTypeName, "f32"), !cast<Operand>("ssmem"),
- !if (!eq (EltTypeName, "f64"), !cast<Operand>("sdmem"), ?));
+ Operand IntScalarMemOp = !if (!eq (EltTypeName, "f16"), !cast<Operand>("shmem"),
+ !if (!eq (EltTypeName, "f32"), !cast<Operand>("ssmem"),
+ !if (!eq (EltTypeName, "f64"), !cast<Operand>("sdmem"), ?)));
// Load patterns
PatFrag LdFrag = !cast<PatFrag>("load" # VTName);
@@ -76,11 +78,9 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
PatFrag BroadcastLdFrag = !cast<PatFrag>("X86VBroadcastld" # EltSizeName);
- PatFrags ScalarIntMemFrags = !if (!eq (EltTypeName, "f32"),
- !cast<PatFrags>("sse_load_f32"),
- !if (!eq (EltTypeName, "f64"),
- !cast<PatFrags>("sse_load_f64"),
- ?));
+ PatFrags ScalarIntMemFrags = !if (!eq (EltTypeName, "f16"), !cast<PatFrags>("sse_load_f16"),
+ !if (!eq (EltTypeName, "f32"), !cast<PatFrags>("sse_load_f32"),
+ !if (!eq (EltTypeName, "f64"), !cast<PatFrags>("sse_load_f64"), ?)));
// The string to specify embedded broadcast in assembly.
string BroadcastStr = "{1to" # NumElts # "}";
@@ -95,9 +95,12 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
Domain ExeDomain = !if (!eq (EltTypeName, "f32"), SSEPackedSingle,
!if (!eq (EltTypeName, "f64"), SSEPackedDouble,
- SSEPackedInt));
+ !if (!eq (EltTypeName, "f16"), SSEPackedSingle, // FIXME?
+ SSEPackedInt)));
- RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X);
+ RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X,
+ !if (!eq (EltTypeName, "f16"), FR16X,
+ FR64X));
dag ImmAllZerosV = (VT immAllZerosV);
@@ -109,6 +112,7 @@ def v64i8_info : X86VectorVTInfo<64, i8, VR512, "b">;
def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">;
def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">;
def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">;
+def v32f16_info : X86VectorVTInfo<32, f16, VR512, "ph">;
def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">;
def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">;
@@ -117,6 +121,7 @@ def v32i8x_info : X86VectorVTInfo<32, i8, VR256X, "b">;
def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">;
def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">;
def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">;
+def v16f16x_info : X86VectorVTInfo<16, f16, VR256X, "ph">;
def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">;
def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">;
@@ -124,6 +129,7 @@ def v16i8x_info : X86VectorVTInfo<16, i8, VR128X, "b">;
def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">;
def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">;
def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">;
+def v8f16x_info : X86VectorVTInfo<8, f16, VR128X, "ph">;
def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">;
def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">;
@@ -131,6 +137,7 @@ def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">;
// with the appropriate element type. This allows us to use the same masking logic.
def i32x_info : X86VectorVTInfo<1, i32, GR32, "si">;
def i64x_info : X86VectorVTInfo<1, i64, GR64, "sq">;
+def f16x_info : X86VectorVTInfo<1, f16, VR128X, "sh">;
def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">;
def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">;
@@ -149,6 +156,8 @@ def avx512vl_i32_info : AVX512VLVectorVTInfo<v16i32_info, v8i32x_info,
v4i32x_info>;
def avx512vl_i64_info : AVX512VLVectorVTInfo<v8i64_info, v4i64x_info,
v2i64x_info>;
+def avx512vl_f16_info : AVX512VLVectorVTInfo<v32f16_info, v16f16x_info,
+ v8f16x_info>;
def avx512vl_f32_info : AVX512VLVectorVTInfo<v16f32_info, v8f32x_info,
v4f32x_info>;
def avx512vl_f64_info : AVX512VLVectorVTInfo<v8f64_info, v4f64x_info,
@@ -196,8 +205,9 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
string MaskingConstraint = "",
bit IsCommutable = 0,
bit IsKCommutable = 0,
- bit IsKZCommutable = IsCommutable> {
- let isCommutable = IsCommutable in
+ bit IsKZCommutable = IsCommutable,
+ string ClobberConstraint = ""> {
+ let isCommutable = IsCommutable, Constraints = ClobberConstraint in
def NAME: AVX512<O, F, Outs, Ins,
OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
"$dst, "#IntelSrcAsm#"}",
@@ -211,12 +221,15 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
MaskingPattern>,
EVEX_K {
// In case of the 3src subclass this is overridden with a let.
- string Constraints = MaskingConstraint;
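+ // Combine the clobber constraint with the masking constraint, joining them
+ // with ", " when both are present.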
+ string Constraints = !if(!eq(ClobberConstraint, ""), MaskingConstraint,
+ !if(!eq(MaskingConstraint, ""), ClobberConstraint,
+ !strconcat(ClobberConstraint, ", ", MaskingConstraint)));
}
// A zero mask does not add any restrictions to the commute-operands
// transformation, so it is OK to use IsCommutable instead of IsKCommutable.
- let isCommutable = IsKZCommutable in // Prefer over VMOV*rrkz Pat<>
+ let isCommutable = IsKZCommutable, // Prefer over VMOV*rrkz Pat<>
+ Constraints = ClobberConstraint in
def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, "#IntelSrcAsm#"}",
@@ -236,7 +249,8 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
string MaskingConstraint = "",
bit IsCommutable = 0,
bit IsKCommutable = 0,
- bit IsKZCommutable = IsCommutable> :
+ bit IsKZCommutable = IsCommutable,
+ string ClobberConstraint = ""> :
AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
AttSrcAsm, IntelSrcAsm,
[(set _.RC:$dst, RHS)],
@@ -244,7 +258,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
[(set _.RC:$dst,
(Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
MaskingConstraint, IsCommutable,
- IsKCommutable, IsKZCommutable>;
+ IsKCommutable, IsKZCommutable, ClobberConstraint>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
@@ -254,6 +268,7 @@ multiclass AVX512_maskable_split<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskRHS,
+ string ClobberConstraint = "",
bit IsCommutable = 0, bit IsKCommutable = 0,
bit IsKZCommutable = IsCommutable> :
AVX512_maskable_custom<O, F, Outs, Ins,
@@ -266,7 +281,7 @@ multiclass AVX512_maskable_split<bits<8> O, Format F, X86VectorVTInfo _,
[(set _.RC:$dst,
(vselect_mask _.KRCWM:$mask, MaskRHS, _.ImmAllZerosV))],
"$src0 = $dst", IsCommutable, IsKCommutable,
- IsKZCommutable>;
+ IsKZCommutable, ClobberConstraint>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
@@ -277,14 +292,15 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
dag RHS,
bit IsCommutable = 0, bit IsKCommutable = 0,
bit IsKZCommutable = IsCommutable,
- SDPatternOperator Select = vselect_mask> :
+ SDPatternOperator Select = vselect_mask,
+ string ClobberConstraint = ""> :
AVX512_maskable_common<O, F, _, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
(Select _.KRCWM:$mask, RHS, _.RC:$src0),
Select, "$src0 = $dst", IsCommutable, IsKCommutable,
- IsKZCommutable>;
+ IsKZCommutable, ClobberConstraint>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the scalar instruction.
@@ -501,6 +517,12 @@ def : Pat<(v8f32 immAllZerosV), (AVX512_256_SET0)>;
def : Pat<(v4f64 immAllZerosV), (AVX512_256_SET0)>;
}
+let Predicates = [HasFP16] in {
+def : Pat<(v8f16 immAllZerosV), (AVX512_128_SET0)>;
+def : Pat<(v16f16 immAllZerosV), (AVX512_256_SET0)>;
+def : Pat<(v32f16 immAllZerosV), (AVX512_512_SET0)>;
+}
+
// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
@@ -513,6 +535,12 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
[(set VR128X:$dst, fp128imm0)]>;
}
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasFP16] in {
+ def AVX512_FsFLD0SH : I<0, Pseudo, (outs FR16X:$dst), (ins), "",
+ [(set FR16X:$dst, fp16imm0)]>;
+}
+
//===----------------------------------------------------------------------===//
// AVX-512 - VECTOR INSERT
//
@@ -649,16 +677,22 @@ defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
+defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v8f16x_info, v16f16x_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasFP16, HasVLX]>;
// Codegen pattern with the alternative types insert VEC128 into VEC512
defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
+defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v8f16x_info, v32f16_info,
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasFP16]>;
// Codegen pattern with the alternative types insert VEC256 into VEC512
defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info,
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v16f16x_info, v32f16_info,
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasFP16]>;
multiclass vinsert_for_mask_cast<string InstrStr, X86VectorVTInfo From,
@@ -944,17 +978,23 @@ defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info
vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v32i8x_info, v16i8x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v16f16x_info, v8f16x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasFP16, HasVLX]>;
// Codegen pattern with the alternative types extract VEC128 from VEC512
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v32f16_info, v8f16x_info,
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasFP16]>;
// Codegen pattern with the alternative types extract VEC256 from VEC512
defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info,
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
+defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v32f16_info, v16f16x_info,
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasFP16]>;
// A 128-bit extract from bits [255:128] of a 512-bit vector should use a
@@ -1015,6 +1055,12 @@ def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))),
(iPTR 1)))>;
}
+let Predicates = [HasFP16, HasVLX] in
+def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))),
+ (v8f16 (VEXTRACTF32x4Z256rr
+ (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+
// Additional patterns for handling a bitcast between the vselect and the
// extract_subvector.
@@ -1140,9 +1186,8 @@ def VEXTRACTPSZmr : AVX512AIi8<0x17, MRMDestMem, (outs),
// AVX-512 BROADCAST
//---
// broadcast with a scalar argument.
-multiclass avx512_broadcast_scalar<bits<8> opc, string OpcodeStr,
- string Name,
- X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
+multiclass avx512_broadcast_scalar<string Name, X86VectorVTInfo DestInfo,
+ X86VectorVTInfo SrcInfo> {
def : Pat<(DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
(!cast<Instruction>(Name#DestInfo.ZSuffix#rr)
(SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>;
@@ -1162,7 +1207,6 @@ multiclass avx512_broadcast_scalar<bits<8> opc, string OpcodeStr,
// Split version to allow mask and broadcast node to be different types. This
// helps support the 32x2 broadcasts.
multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
- string Name,
SchedWrite SchedRR, SchedWrite SchedRM,
X86VectorVTInfo MaskInfo,
X86VectorVTInfo DestInfo,
@@ -1251,54 +1295,49 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
}
// Helper class to force mask and broadcast result to same type.
-multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, string Name,
+multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
SchedWrite SchedRR, SchedWrite SchedRM,
X86VectorVTInfo DestInfo,
X86VectorVTInfo SrcInfo,
bit IsConvertibleToThreeAddress> :
- avx512_broadcast_rm_split<opc, OpcodeStr, Name, SchedRR, SchedRM,
+ avx512_broadcast_rm_split<opc, OpcodeStr, SchedRR, SchedRM,
DestInfo, DestInfo, SrcInfo,
IsConvertibleToThreeAddress>;
multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _> {
+ AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
WriteFShuffle256Ld, _.info512, _.info128, 1>,
- avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512,
- _.info128>,
+ avx512_broadcast_scalar<NAME, _.info512, _.info128>,
EVEX_V512;
}
let Predicates = [HasVLX] in {
- defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
WriteFShuffle256Ld, _.info256, _.info128, 1>,
- avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256,
- _.info128>,
+ avx512_broadcast_scalar<NAME, _.info256, _.info128>,
EVEX_V256;
}
}
multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _> {
+ AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
WriteFShuffle256Ld, _.info512, _.info128, 1>,
- avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512,
- _.info128>,
+ avx512_broadcast_scalar<NAME, _.info512, _.info128>,
EVEX_V512;
}
let Predicates = [HasVLX] in {
- defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
WriteFShuffle256Ld, _.info256, _.info128, 1>,
- avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256,
- _.info128>,
+ avx512_broadcast_scalar<NAME, _.info256, _.info128>,
EVEX_V256;
- defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
+ defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
WriteFShuffle256Ld, _.info128, _.info128, 1>,
- avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info128,
- _.info128>,
+ avx512_broadcast_scalar<NAME, _.info128, _.info128>,
EVEX_V128;
}
}
@@ -1384,20 +1423,20 @@ defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info,
X86VBroadcast, GR64, HasAVX512>, VEX_W;
multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _, Predicate prd,
- bit IsConvertibleToThreeAddress> {
+ AVX512VLVectorVTInfo _, Predicate prd,
+ bit IsConvertibleToThreeAddress> {
let Predicates = [prd] in {
- defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256,
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, WriteShuffle256,
WriteShuffle256Ld, _.info512, _.info128,
IsConvertibleToThreeAddress>,
EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256,
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, WriteShuffle256,
WriteShuffle256Ld, _.info256, _.info128,
IsConvertibleToThreeAddress>,
EVEX_V256;
- defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle,
+ defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, WriteShuffle,
WriteShuffleXLd, _.info128, _.info128,
IsConvertibleToThreeAddress>,
EVEX_V128;
@@ -1439,6 +1478,31 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
Sched<[SchedWriteShuffle.YMM.Folded]>,
AVX5128IBase, EVEX;
}
+let Predicates = [HasFP16] in {
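+ // f16 broadcasts reuse the integer VPBROADCASTW instructions.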
+ def : Pat<(v32f16 (X86VBroadcastld16 addr:$src)),
+ (VPBROADCASTWZrm addr:$src)>;
+
+ def : Pat<(v32f16 (X86VBroadcast (v8f16 VR128X:$src))),
+ (VPBROADCASTWZrr VR128X:$src)>;
+ def : Pat<(v32f16 (X86VBroadcast (f16 FR16X:$src))),
+ (VPBROADCASTWZrr (COPY_TO_REGCLASS FR16X:$src, VR128X))>;
+}
+let Predicates = [HasVLX, HasFP16] in {
+ def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)),
+ (VPBROADCASTWZ128rm addr:$src)>;
+ def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)),
+ (VPBROADCASTWZ256rm addr:$src)>;
+
+ def : Pat<(v8f16 (X86VBroadcast (v8f16 VR128X:$src))),
+ (VPBROADCASTWZ128rr VR128X:$src)>;
+ def : Pat<(v16f16 (X86VBroadcast (v8f16 VR128X:$src))),
+ (VPBROADCASTWZ256rr VR128X:$src)>;
+
+ def : Pat<(v8f16 (X86VBroadcast (f16 FR16X:$src))),
+ (VPBROADCASTWZ128rr (COPY_TO_REGCLASS FR16X:$src, VR128X))>;
+ def : Pat<(v16f16 (X86VBroadcast (f16 FR16X:$src))),
+ (VPBROADCASTWZ256rr (COPY_TO_REGCLASS FR16X:$src, VR128X))>;
+}
//===----------------------------------------------------------------------===//
// AVX-512 BROADCAST SUBVECTORS
@@ -1462,6 +1526,8 @@ def : Pat<(v8f64 (X86SubVBroadcastld256 addr:$src)),
(VBROADCASTF64X4rm addr:$src)>;
def : Pat<(v16f32 (X86SubVBroadcastld256 addr:$src)),
(VBROADCASTF64X4rm addr:$src)>;
+def : Pat<(v32f16 (X86SubVBroadcastld256 addr:$src)),
+ (VBROADCASTF64X4rm addr:$src)>;
def : Pat<(v8i64 (X86SubVBroadcastld256 addr:$src)),
(VBROADCASTI64X4rm addr:$src)>;
def : Pat<(v16i32 (X86SubVBroadcastld256 addr:$src)),
@@ -1475,6 +1541,8 @@ def : Pat<(v8f64 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF32X4rm addr:$src)>;
def : Pat<(v16f32 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF32X4rm addr:$src)>;
+def : Pat<(v32f16 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF32X4rm addr:$src)>;
def : Pat<(v8i64 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTI32X4rm addr:$src)>;
def : Pat<(v16i32 (X86SubVBroadcastld128 addr:$src)),
@@ -1532,6 +1600,8 @@ def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF32X4Z256rm addr:$src)>;
def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF32X4Z256rm addr:$src)>;
+def : Pat<(v16f16 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF32X4Z256rm addr:$src)>;
def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTI32X4Z256rm addr:$src)>;
def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
@@ -1638,25 +1708,27 @@ def : Pat<(vselect_mask VK8WM:$mask,
}
multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> {
+ AVX512VLVectorVTInfo _Dst,
+ AVX512VLVectorVTInfo _Src> {
let Predicates = [HasDQI] in
- defm Z : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256,
+ defm Z : avx512_broadcast_rm_split<opc, OpcodeStr, WriteShuffle256,
WriteShuffle256Ld, _Dst.info512,
_Src.info512, _Src.info128, 0, null_frag, null_frag>,
EVEX_V512;
let Predicates = [HasDQI, HasVLX] in
- defm Z256 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256,
+ defm Z256 : avx512_broadcast_rm_split<opc, OpcodeStr, WriteShuffle256,
WriteShuffle256Ld, _Dst.info256,
_Src.info256, _Src.info128, 0, null_frag, null_frag>,
EVEX_V256;
}
multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> :
+ AVX512VLVectorVTInfo _Dst,
+ AVX512VLVectorVTInfo _Src> :
avx512_common_broadcast_32x2<opc, OpcodeStr, _Dst, _Src> {
let Predicates = [HasDQI, HasVLX] in
- defm Z128 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle,
+ defm Z128 : avx512_broadcast_rm_split<opc, OpcodeStr, WriteShuffle,
WriteShuffleXLd, _Dst.info128,
_Src.info128, _Src.info128, 0, null_frag, null_frag>,
EVEX_V128;
@@ -2099,6 +2171,10 @@ let Predicates = [HasAVX512] in {
X86cmpms_su, X86cmpmsSAE_su,
SchedWriteFCmp.Scl>, AVX512XDIi8Base, VEX_W;
}
+let Predicates = [HasFP16], ExeDomain = SSEPackedSingle in
+ defm VCMPSHZ : avx512_cmp_scalar<f16x_info, X86cmpms, X86cmpmsSAE,
+ X86cmpms_su, X86cmpmsSAE_su,
+ SchedWriteFCmp.Scl>, AVX512XSIi8Base, TA;
multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched,
@@ -2561,13 +2637,14 @@ multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> {
EVEX_B, Sched<[sched]>;
}
-multiclass avx512_vcmp<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
- let Predicates = [HasAVX512] in {
+multiclass avx512_vcmp<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _,
+ Predicate Pred = HasAVX512> {
+ let Predicates = [Pred] in {
defm Z : avx512_vcmp_common<sched.ZMM, _.info512, NAME>,
avx512_vcmp_sae<sched.ZMM, _.info512>, EVEX_V512;
}
- let Predicates = [HasAVX512,HasVLX] in {
+ let Predicates = [Pred,HasVLX] in {
defm Z128 : avx512_vcmp_common<sched.XMM, _.info128, NAME>, EVEX_V128;
defm Z256 : avx512_vcmp_common<sched.YMM, _.info256, NAME>, EVEX_V256;
}
@@ -2577,18 +2654,23 @@ defm VCMPPD : avx512_vcmp<SchedWriteFCmp, avx512vl_f64_info>,
AVX512PDIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
defm VCMPPS : avx512_vcmp<SchedWriteFCmp, avx512vl_f32_info>,
AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+defm VCMPPH : avx512_vcmp<SchedWriteFCmp, avx512vl_f16_info, HasFP16>,
+ AVX512PSIi8Base, EVEX_4V, EVEX_CD8<16, CD8VF>, TA;
// Patterns to select fp compares with load as first operand.
let Predicates = [HasAVX512] in {
- def : Pat<(v1i1 (X86cmpms (loadf64 addr:$src2), FR64X:$src1,
- timm:$cc)),
+ def : Pat<(v1i1 (X86cmpms (loadf64 addr:$src2), FR64X:$src1, timm:$cc)),
(VCMPSDZrm FR64X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>;
- def : Pat<(v1i1 (X86cmpms (loadf32 addr:$src2), FR32X:$src1,
- timm:$cc)),
+ def : Pat<(v1i1 (X86cmpms (loadf32 addr:$src2), FR32X:$src1, timm:$cc)),
(VCMPSSZrm FR32X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>;
}
+let Predicates = [HasFP16] in {
+ def : Pat<(v1i1 (X86cmpms (loadf16 addr:$src2), FR16X:$src1, timm:$cc)),
+ (VCMPSHZrm FR16X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>;
+}
+
// ----------------------------------------------------------------
// FPClass
@@ -2736,24 +2818,28 @@ multiclass avx512_vector_fpclass_all<string OpcodeStr, AVX512VLVectorVTInfo _,
}
multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec,
- bits<8> opcScalar, X86SchedWriteWidths sched,
- Predicate prd> {
+ bits<8> opcScalar, X86SchedWriteWidths sched> {
+ defm PH : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f16_info, opcVec,
+ sched, HasFP16>,
+ EVEX_CD8<16, CD8VF>, AVX512PSIi8Base, TA;
+ defm SHZ : avx512_scalar_fpclass<opcScalar, OpcodeStr,
+ sched.Scl, f16x_info, HasFP16>,
+ EVEX_CD8<16, CD8VT1>, AVX512PSIi8Base, TA;
defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec,
- sched, prd>,
- EVEX_CD8<32, CD8VF>;
+ sched, HasDQI>,
+ EVEX_CD8<32, CD8VF>, AVX512AIi8Base;
defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec,
- sched, prd>,
- EVEX_CD8<64, CD8VF> , VEX_W;
+ sched, HasDQI>,
+ EVEX_CD8<64, CD8VF>, AVX512AIi8Base, VEX_W;
defm SSZ : avx512_scalar_fpclass<opcScalar, OpcodeStr,
- sched.Scl, f32x_info, prd>, VEX_LIG,
- EVEX_CD8<32, CD8VT1>;
+ sched.Scl, f32x_info, HasDQI>, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>, AVX512AIi8Base;
defm SDZ : avx512_scalar_fpclass<opcScalar, OpcodeStr,
- sched.Scl, f64x_info, prd>, VEX_LIG,
- EVEX_CD8<64, CD8VT1>, VEX_W;
+ sched.Scl, f64x_info, HasDQI>, VEX_LIG,
+ EVEX_CD8<64, CD8VT1>, AVX512AIi8Base, VEX_W;
}
-defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, SchedWriteFCmp,
- HasDQI>, AVX512AIi8Base, EVEX;
+defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, SchedWriteFCmp>, EVEX;
//-----------------------------------------------------------------
// Mask register copy, including
@@ -3766,6 +3852,110 @@ let Predicates = [HasVLX] in {
def : Pat<(store (v32i8 VR256X:$src), addr:$dst),
(VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
}
+let Predicates = [HasFP16] in {
+ def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), (v32f16 VR512:$src0))),
+ (VMOVDQU16Zrrk VR512:$src0, VK32WM:$mask, VR512:$src1)>;
+ def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), v32f16_info.ImmAllZerosV)),
+ (VMOVDQU16Zrrkz VK32WM:$mask, VR512:$src1)>;
+ def : Pat<(v32f16 (alignedloadv32f16 addr:$src)),
+ (VMOVAPSZrm addr:$src)>;
+ def : Pat<(v32f16 (vselect VK32WM:$mask,
+ (v32f16 (alignedloadv32f16 addr:$src)), (v32f16 VR512:$src0))),
+ (VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>;
+ def : Pat<(v32f16 (vselect VK32WM:$mask,
+ (v32f16 (alignedloadv32f16 addr:$src)), v32f16_info.ImmAllZerosV)),
+ (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>;
+ def : Pat<(v32f16 (loadv32f16 addr:$src)),
+ (VMOVUPSZrm addr:$src)>;
+ def : Pat<(v32f16 (vselect VK32WM:$mask,
+ (v32f16 (loadv32f16 addr:$src)), (v32f16 VR512:$src0))),
+ (VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>;
+ def : Pat<(v32f16 (vselect VK32WM:$mask,
+ (v32f16 (loadv32f16 addr:$src)), v32f16_info.ImmAllZerosV)),
+ (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>;
+ def : Pat<(v32f16 (masked_load addr:$src, VK32WM:$mask, (v32f16 VR512:$src0))),
+ (VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>;
+ def : Pat<(v32f16 (masked_load addr:$src, VK32WM:$mask, undef)),
+ (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>;
+ def : Pat<(v32f16 (masked_load addr:$src, VK32WM:$mask, v32f16_info.ImmAllZerosV)),
+ (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>;
+
+ def : Pat<(alignedstore (v32f16 VR512:$src), addr:$dst),
+ (VMOVAPSZmr addr:$dst, VR512:$src)>;
+ def : Pat<(store (v32f16 VR512:$src), addr:$dst),
+ (VMOVUPSZmr addr:$dst, VR512:$src)>;
+ def : Pat<(masked_store (v32f16 VR512:$src), addr:$dst, VK32WM:$mask),
+ (VMOVDQU16Zmrk addr:$dst, VK32WM:$mask, VR512:$src)>;
+}
+let Predicates = [HasFP16, HasVLX] in {
+ def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), (v16f16 VR256X:$src0))),
+ (VMOVDQU16Z256rrk VR256X:$src0, VK16WM:$mask, VR256X:$src1)>;
+ def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), v16f16x_info.ImmAllZerosV)),
+ (VMOVDQU16Z256rrkz VK16WM:$mask, VR256X:$src1)>;
+ def : Pat<(v16f16 (alignedloadv16f16 addr:$src)),
+ (VMOVAPSZ256rm addr:$src)>;
+ def : Pat<(v16f16 (vselect VK16WM:$mask,
+ (v16f16 (alignedloadv16f16 addr:$src)), (v16f16 VR256X:$src0))),
+ (VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>;
+ def : Pat<(v16f16 (vselect VK16WM:$mask,
+ (v16f16 (alignedloadv16f16 addr:$src)), v16f16x_info.ImmAllZerosV)),
+ (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>;
+ def : Pat<(v16f16 (loadv16f16 addr:$src)),
+ (VMOVUPSZ256rm addr:$src)>;
+ def : Pat<(v16f16 (vselect VK16WM:$mask,
+ (v16f16 (loadv16f16 addr:$src)), (v16f16 VR256X:$src0))),
+ (VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>;
+ def : Pat<(v16f16 (vselect VK16WM:$mask,
+ (v16f16 (loadv16f16 addr:$src)), v16f16x_info.ImmAllZerosV)),
+ (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>;
+ def : Pat<(v16f16 (masked_load addr:$src, VK16WM:$mask, (v16f16 VR256X:$src0))),
+ (VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>;
+ def : Pat<(v16f16 (masked_load addr:$src, VK16WM:$mask, undef)),
+ (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>;
+ def : Pat<(v16f16 (masked_load addr:$src, VK16WM:$mask, v16f16x_info.ImmAllZerosV)),
+ (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>;
+
+ def : Pat<(alignedstore (v16f16 VR256X:$src), addr:$dst),
+ (VMOVAPSZ256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(store (v16f16 VR256X:$src), addr:$dst),
+ (VMOVUPSZ256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(masked_store (v16f16 VR256X:$src), addr:$dst, VK16WM:$mask),
+ (VMOVDQU16Z256mrk addr:$dst, VK16WM:$mask, VR256X:$src)>;
+
+ def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 VR128X:$src1), (v8f16 VR128X:$src0))),
+ (VMOVDQU16Z128rrk VR128X:$src0, VK8WM:$mask, VR128X:$src1)>;
+ def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 VR128X:$src1), v8f16x_info.ImmAllZerosV)),
+ (VMOVDQU16Z128rrkz VK8WM:$mask, VR128X:$src1)>;
+ def : Pat<(v8f16 (alignedloadv8f16 addr:$src)),
+ (VMOVAPSZ128rm addr:$src)>;
+ def : Pat<(v8f16 (vselect VK8WM:$mask,
+ (v8f16 (alignedloadv8f16 addr:$src)), (v8f16 VR128X:$src0))),
+ (VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>;
+ def : Pat<(v8f16 (vselect VK8WM:$mask,
+ (v8f16 (alignedloadv8f16 addr:$src)), v8f16x_info.ImmAllZerosV)),
+ (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>;
+ def : Pat<(v8f16 (loadv8f16 addr:$src)),
+ (VMOVUPSZ128rm addr:$src)>;
+ def : Pat<(v8f16 (vselect VK8WM:$mask,
+ (v8f16 (loadv8f16 addr:$src)), (v8f16 VR128X:$src0))),
+ (VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>;
+ def : Pat<(v8f16 (vselect VK8WM:$mask,
+ (v8f16 (loadv8f16 addr:$src)), v8f16x_info.ImmAllZerosV)),
+ (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>;
+ def : Pat<(v8f16 (masked_load addr:$src, VK8WM:$mask, (v8f16 VR128X:$src0))),
+ (VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>;
+ def : Pat<(v8f16 (masked_load addr:$src, VK8WM:$mask, undef)),
+ (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>;
+ def : Pat<(v8f16 (masked_load addr:$src, VK8WM:$mask, v8f16x_info.ImmAllZerosV)),
+ (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>;
+
+ def : Pat<(alignedstore (v8f16 VR128X:$src), addr:$dst),
+ (VMOVAPSZ128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(store (v8f16 VR128X:$src), addr:$dst),
+ (VMOVUPSZ128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(masked_store (v8f16 VR128X:$src), addr:$dst, VK8WM:$mask),
+ (VMOVDQU16Z128mrk addr:$dst, VK8WM:$mask, VR128X:$src)>;
+}
// Move Int Doubleword to Packed Double Int
//
@@ -3905,12 +4095,13 @@ def : Pat<(f64 (bitconvert VK64:$src)),
(VMOV64toSDZrr (KMOVQrk VK64:$src))>;
//===----------------------------------------------------------------------===//
-// AVX-512 MOVSS, MOVSD
+// AVX-512 MOVSH, MOVSS, MOVSD
//===----------------------------------------------------------------------===//
multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
- X86VectorVTInfo _> {
- let Predicates = [HasAVX512, OptForSize] in
+ X86VectorVTInfo _,
+ list<Predicate> prd = [HasAVX512, OptForSize]> {
+ let Predicates = prd in
def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -3976,6 +4167,9 @@ defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, X86vzload32, f32x_info>,
defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>,
VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VMOVSHZ : avx512_move_scalar<"vmovsh", X86Movsh, X86vzload16, f16x_info,
+ [HasFP16]>,
+ VEX_LIG, T_MAP5XS, EVEX_CD8<16, CD8VT1>;
multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode,
PatLeaf ZeroFP, X86VectorVTInfo _> {
@@ -4144,9 +4338,14 @@ def : Pat<(_.info128.VT (masked_load addr:$srcAddr, Mask128,
addr:$srcAddr)>;
}
+defm : avx512_move_scalar_lowering<"VMOVSHZ", X86Movsh, fp16imm0, v8f16x_info>;
defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>;
defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>;
+defm : avx512_store_scalar_lowering<"VMOVSHZ", avx512vl_f16_info,
+ (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>;
+defm : avx512_store_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info,
+ (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>;
defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
@@ -4154,6 +4353,13 @@ defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
+defm : avx512_store_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info,
+ (v32i1 (insert_subvector
+ (v32i1 immAllZerosV),
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))),
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ GR8, sub_8bit>;
defm : avx512_store_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (insert_subvector
(v16i1 immAllZerosV),
@@ -4179,6 +4385,10 @@ defm : avx512_store_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
(iPTR 0))), GR8, sub_8bit>;
+defm : avx512_load_scalar_lowering<"VMOVSHZ", avx512vl_f16_info,
+ (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>;
+defm : avx512_load_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info,
+ (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>;
defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
@@ -4186,6 +4396,13 @@ defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
+defm : avx512_load_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info,
+ (v32i1 (insert_subvector
+ (v32i1 immAllZerosV),
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))),
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ GR8, sub_8bit>;
defm : avx512_load_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (insert_subvector
(v16i1 immAllZerosV),
@@ -4211,6 +4428,16 @@ defm : avx512_load_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
(iPTR 0))), GR8, sub_8bit>;
+def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), (f16 FR16X:$src2))),
+ (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrk
+ (v8f16 (COPY_TO_REGCLASS FR16X:$src2, VR128X)),
+ VK1WM:$mask, (v8f16 (IMPLICIT_DEF)),
+ (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>;
+
+def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), fp16imm0)),
+ (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrkz VK1WM:$mask, (v8f16 (IMPLICIT_DEF)),
+ (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>;
+
def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
(COPY_TO_REGCLASS (v4f32 (VMOVSSZrrk
(v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)),
@@ -4259,6 +4486,32 @@ def : Pat<(v2f64 (X86selects VK1WM:$mask, (v2f64 VR128X:$src1), (v2f64 immAllZer
(VMOVSDZrrkz VK1WM:$mask, VR128X:$src1, VR128X:$src1)>;
let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
+ let Predicates = [HasFP16] in {
+ def VMOVSHZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2),
+ "vmovsh\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, T_MAP5XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSHZrr">,
+ Sched<[SchedWriteFShuffle.XMM]>;
+
+ let Constraints = "$src0 = $dst" in
+ def VMOVSHZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins f16x_info.RC:$src0, f16x_info.KRCWM:$mask,
+ VR128X:$src1, VR128X:$src2),
+ "vmovsh\t{$src2, $src1, $dst {${mask}}|"#
+ "$dst {${mask}}, $src1, $src2}",
+ []>, T_MAP5XS, EVEX_K, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSHZrrk">,
+ Sched<[SchedWriteFShuffle.XMM]>;
+
+ def VMOVSHZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins f16x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2),
+ "vmovsh\t{$src2, $src1, $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, $src1, $src2}",
+ []>, EVEX_KZ, T_MAP5XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSHZrrkz">,
+ Sched<[SchedWriteFShuffle.XMM]>;
+ }
def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -4311,6 +4564,16 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
Sched<[SchedWriteFShuffle.XMM]>;
}
+def : InstAlias<"vmovsh.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (VMOVSHZrr_REV VR128X:$dst, VR128X:$src1, VR128X:$src2), 0>;
+def : InstAlias<"vmovsh.s\t{$src2, $src1, $dst {${mask}}|"#
+ "$dst {${mask}}, $src1, $src2}",
+ (VMOVSHZrrk_REV VR128X:$dst, VK1WM:$mask,
+ VR128X:$src1, VR128X:$src2), 0>;
+def : InstAlias<"vmovsh.s\t{$src2, $src1, $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, $src1, $src2}",
+ (VMOVSHZrrkz_REV VR128X:$dst, VK1WM:$mask,
+ VR128X:$src1, VR128X:$src2), 0>;
def : InstAlias<"vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
(VMOVSSZrr_REV VR128X:$dst, VR128X:$src1, VR128X:$src2), 0>;
def : InstAlias<"vmovss.s\t{$src2, $src1, $dst {${mask}}|"#
@@ -4393,6 +4656,29 @@ let Predicates = [HasAVX512] in {
def : Pat<(v8f64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
}
+let Predicates = [HasFP16] in {
+ def : Pat<(v8f16 (X86vzmovl (v8f16 VR128X:$src))),
+ (VMOVSHZrr (v8f16 (AVX512_128_SET0)), VR128X:$src)>;
+
+  // FIXME: We need better canonicalization in DAG combine.
+ def : Pat<(v16f16 (X86vzmovl (v16f16 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v8f16 (VMOVSHZrr (v8f16 (AVX512_128_SET0)),
+ (v8f16 (EXTRACT_SUBREG (v16f16 VR256X:$src), sub_xmm)))), sub_xmm)>;
+ def : Pat<(v32f16 (X86vzmovl (v32f16 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v8f16 (VMOVSHZrr (v8f16 (AVX512_128_SET0)),
+ (v8f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_xmm)))), sub_xmm)>;
+
+ def : Pat<(v8f16 (X86vzload16 addr:$src)),
+ (VMOVSHZrm addr:$src)>;
+
+ def : Pat<(v16f16 (X86vzload16 addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSHZrm addr:$src), sub_xmm)>;
+
+ def : Pat<(v32f16 (X86vzload16 addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSHZrm addr:$src), sub_xmm)>;
+}
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
@@ -5295,8 +5581,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
}
multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
- SDNode VecNode, X86FoldableSchedWrite sched,
- bit IsCommutable = 0> {
+ SDNode VecNode, X86FoldableSchedWrite sched> {
let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
@@ -5357,13 +5642,19 @@ multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDPatternOperator
defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode,
sched.PS.Scl, IsCommutable>,
avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, RndNode,
- sched.PS.Scl, IsCommutable>,
+ sched.PS.Scl>,
XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode,
sched.PD.Scl, IsCommutable>,
avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, RndNode,
- sched.PD.Scl, IsCommutable>,
+ sched.PD.Scl>,
XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+ let Predicates = [HasFP16] in
+ defm SHZ : avx512_fp_scalar<opc, OpcodeStr#"sh", f16x_info, OpNode,
+ VecNode, sched.PH.Scl, IsCommutable>,
+ avx512_fp_scalar_round<opc, OpcodeStr#"sh", f16x_info, RndNode,
+ sched.PH.Scl>,
+ T_MAP5XS, EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>;
}
multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -5377,6 +5668,13 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
VecNode, SaeNode, sched.PD.Scl, IsCommutable,
NAME#"SD">,
XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+ let Predicates = [HasFP16] in {
+ defm SHZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sh", f16x_info, OpNode,
+ VecNode, SaeNode, sched.PH.Scl, IsCommutable,
+ NAME#"SH">,
+ T_MAP5XS, EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>,
+ NotEVEX2VEXConvertible;
+ }
}
defm VADD : avx512_binop_s_round<0x58, "vadd", any_fadd, X86fadds, X86faddRnds,
SchedWriteFAddSizes, 1>;
@@ -5432,47 +5730,60 @@ defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
VEX_W, EVEX_4V, VEX_LIG,
EVEX_CD8<64, CD8VT1>, SIMD_EXC;
+defm VMINCSHZ : avx512_comutable_binop_s<0x5D, "vminsh", f16x_info, X86fminc,
+ SchedWriteFCmp.Scl, "VMINCSH">, T_MAP5XS,
+ EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC,
+ NotEVEX2VEXConvertible;
+defm VMAXCSHZ : avx512_comutable_binop_s<0x5F, "vmaxsh", f16x_info, X86fmaxc,
+ SchedWriteFCmp.Scl, "VMAXCSH">, T_MAP5XS,
+ EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC,
+ NotEVEX2VEXConvertible;
+
multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDPatternOperator MaskOpNode,
X86VectorVTInfo _, X86FoldableSchedWrite sched,
bit IsCommutable,
- bit IsKCommutable = IsCommutable> {
+ bit IsKCommutable = IsCommutable,
+ string suffix = _.Suffix,
+ string ClobberConstraint = "",
+ bit MayRaiseFPException = 1> {
let ExeDomain = _.ExeDomain, hasSideEffects = 0,
- Uses = [MXCSR], mayRaiseFPException = 1 in {
+ Uses = [MXCSR], mayRaiseFPException = MayRaiseFPException in {
defm rr: AVX512_maskable_split<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix,
- "$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
- (_.VT (MaskOpNode _.RC:$src1, _.RC:$src2)), IsCommutable,
- IsKCommutable, IsKCommutable>,
- EVEX_4V, Sched<[sched]>;
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr#suffix,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+ (_.VT (MaskOpNode _.RC:$src1, _.RC:$src2)), ClobberConstraint,
+ IsCommutable, IsKCommutable, IsKCommutable>, EVEX_4V, Sched<[sched]>;
let mayLoad = 1 in {
defm rm: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#_.Suffix,
- "$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, (_.LdFrag addr:$src2)),
- (MaskOpNode _.RC:$src1, (_.LdFrag addr:$src2))>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2)),
+ (MaskOpNode _.RC:$src1, (_.LdFrag addr:$src2)),
+ ClobberConstraint>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmb: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr#_.Suffix,
- "${src2}"#_.BroadcastStr#", $src1",
- "$src1, ${src2}"#_.BroadcastStr,
- (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))),
- (MaskOpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>,
- EVEX_4V, EVEX_B,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr#suffix,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
+ (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))),
+ (MaskOpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))),
+ ClobberConstraint>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
}
multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNodeRnd,
- X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ string suffix = _.Suffix,
+ string ClobberConstraint = ""> {
let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr#_.Suffix,
+ (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr#suffix,
"$rc, $src2, $src1", "$src1, $src2, $rc",
- (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 timm:$rc)))>,
+ (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 timm:$rc))),
+ 0, 0, 0, vselect_mask, ClobberConstraint>,
EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
}
@@ -5519,9 +5830,32 @@ multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator Op
}
}
+multiclass avx512_fp_binop_ph<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ SDPatternOperator MaskOpNode,
+ X86SchedWriteSizes sched, bit IsCommutable = 0> {
+ let Predicates = [HasFP16] in {
+ defm PHZ : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v32f16_info,
+ sched.PH.ZMM, IsCommutable>, EVEX_V512, T_MAP5PS,
+ EVEX_CD8<16, CD8VF>;
+ }
+ let Predicates = [HasVLX, HasFP16] in {
+ defm PHZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f16x_info,
+ sched.PH.XMM, IsCommutable>, EVEX_V128, T_MAP5PS,
+ EVEX_CD8<16, CD8VF>;
+ defm PHZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v16f16x_info,
+ sched.PH.YMM, IsCommutable>, EVEX_V256, T_MAP5PS,
+ EVEX_CD8<16, CD8VF>;
+ }
+}
+
let Uses = [MXCSR] in
multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
X86SchedWriteSizes sched> {
+ let Predicates = [HasFP16] in {
+ defm PHZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PH.ZMM,
+ v32f16_info>,
+ EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>;
+ }
defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM,
v16f32_info>,
EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
@@ -5533,6 +5867,11 @@ multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeR
let Uses = [MXCSR] in
multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
X86SchedWriteSizes sched> {
+ let Predicates = [HasFP16] in {
+ defm PHZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PH.ZMM,
+ v32f16_info>,
+ EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>;
+ }
defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM,
v16f32_info>,
EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
@@ -5543,26 +5882,36 @@ multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd
defm VADD : avx512_fp_binop_p<0x58, "vadd", any_fadd, fadd, HasAVX512,
SchedWriteFAddSizes, 1>,
+ avx512_fp_binop_ph<0x58, "vadd", any_fadd, fadd, SchedWriteFAddSizes, 1>,
avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd, SchedWriteFAddSizes>;
defm VMUL : avx512_fp_binop_p<0x59, "vmul", any_fmul, fmul, HasAVX512,
SchedWriteFMulSizes, 1>,
+ avx512_fp_binop_ph<0x59, "vmul", any_fmul, fmul, SchedWriteFMulSizes, 1>,
avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd, SchedWriteFMulSizes>;
defm VSUB : avx512_fp_binop_p<0x5C, "vsub", any_fsub, fsub, HasAVX512,
SchedWriteFAddSizes>,
+ avx512_fp_binop_ph<0x5C, "vsub", any_fsub, fsub, SchedWriteFAddSizes>,
avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd, SchedWriteFAddSizes>;
defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", any_fdiv, fdiv, HasAVX512,
SchedWriteFDivSizes>,
+ avx512_fp_binop_ph<0x5E, "vdiv", any_fdiv, fdiv, SchedWriteFDivSizes>,
avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SchedWriteFDivSizes>;
defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, X86fmin, HasAVX512,
SchedWriteFCmpSizes, 0>,
+ avx512_fp_binop_ph<0x5D, "vmin", X86fmin, X86fmin, SchedWriteFCmpSizes, 0>,
avx512_fp_binop_p_sae<0x5D, "vmin", X86fminSAE, SchedWriteFCmpSizes>;
defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, X86fmax, HasAVX512,
SchedWriteFCmpSizes, 0>,
+ avx512_fp_binop_ph<0x5F, "vmax", X86fmax, X86fmax, SchedWriteFCmpSizes, 0>,
avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxSAE, SchedWriteFCmpSizes>;
let isCodeGenOnly = 1 in {
defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, X86fminc, HasAVX512,
+ SchedWriteFCmpSizes, 1>,
+ avx512_fp_binop_ph<0x5D, "vmin", X86fminc, X86fminc,
SchedWriteFCmpSizes, 1>;
defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, X86fmaxc, HasAVX512,
+ SchedWriteFCmpSizes, 1>,
+ avx512_fp_binop_ph<0x5F, "vmax", X86fmaxc, X86fmaxc,
SchedWriteFCmpSizes, 1>;
}
let Uses = []<Register>, mayRaiseFPException = 0 in {
@@ -5616,43 +5965,57 @@ multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr,
X86SchedWriteWidths sched> {
+ let Predicates = [HasFP16] in {
+ defm PHZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v32f16_info>,
+ avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v32f16_info>,
+ EVEX_V512, T_MAP6PD, EVEX_CD8<16, CD8VF>;
+ defm SHZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f16x_info>,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr#"sh", f16x_info, X86scalefsRnd, sched.Scl>,
+ EVEX_4V, T_MAP6PD, EVEX_CD8<16, CD8VT1>;
+ }
defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v16f32_info>,
avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v16f32_info>,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
+ EVEX_V512, EVEX_CD8<32, CD8VF>, T8PD;
defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v8f64_info>,
avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v8f64_info>,
- EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>, T8PD;
defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f32x_info>,
avx512_fp_scalar_round<opcScaler, OpcodeStr#"ss", f32x_info,
X86scalefsRnd, sched.Scl>,
- EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, T8PD;
defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f64x_info>,
avx512_fp_scalar_round<opcScaler, OpcodeStr#"sd", f64x_info,
X86scalefsRnd, sched.Scl>,
- EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, VEX_W;
+ EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, VEX_W, T8PD;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v4f32x_info>,
- EVEX_V128, EVEX_CD8<32, CD8VF>;
+ EVEX_V128, EVEX_CD8<32, CD8VF>, T8PD;
defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v8f32x_info>,
- EVEX_V256, EVEX_CD8<32, CD8VF>;
+ EVEX_V256, EVEX_CD8<32, CD8VF>, T8PD;
defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v2f64x_info>,
- EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
+ EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>, T8PD;
defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v4f64x_info>,
- EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
+ EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>, T8PD;
+ }
+
+ let Predicates = [HasFP16, HasVLX] in {
+ defm PHZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v8f16x_info>,
+ EVEX_V128, EVEX_CD8<16, CD8VF>, T_MAP6PD;
+ defm PHZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v16f16x_info>,
+ EVEX_V256, EVEX_CD8<16, CD8VF>, T_MAP6PD;
}
}
defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef",
- SchedWriteFAdd>, T8PD, NotEVEX2VEXConvertible;
+ SchedWriteFAdd>, NotEVEX2VEXConvertible;
//===----------------------------------------------------------------------===//
// AVX-512 VPTESTM instructions
//===----------------------------------------------------------------------===//
multiclass avx512_vptest<bits<8> opc, string OpcodeStr,
- X86FoldableSchedWrite sched, X86VectorVTInfo _,
- string Name> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
// NOTE: Patterns are omitted in favor of manual selection in X86ISelDAGToDAG.
// There are just too many permutations due to commutability and bitcasts.
let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
@@ -5687,13 +6050,13 @@ multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr,
X86SchedWriteWidths sched,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
- defm Z : avx512_vptest<opc, OpcodeStr, sched.ZMM, _.info512, NAME>,
+ defm Z : avx512_vptest<opc, OpcodeStr, sched.ZMM, _.info512>,
avx512_vptest_mb<opc, OpcodeStr, sched.ZMM, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_vptest<opc, OpcodeStr, sched.YMM, _.info256, NAME>,
+ defm Z256 : avx512_vptest<opc, OpcodeStr, sched.YMM, _.info256>,
avx512_vptest_mb<opc, OpcodeStr, sched.YMM, _.info256>, EVEX_V256;
- defm Z128 : avx512_vptest<opc, OpcodeStr, sched.XMM, _.info128, NAME>,
+ defm Z128 : avx512_vptest<opc, OpcodeStr, sched.XMM, _.info128>,
avx512_vptest_mb<opc, OpcodeStr, sched.XMM, _.info128>, EVEX_V128;
}
}
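Editorial aside, not part of the patch: the NOTE in avx512_vptest above defers selection to X86ISelDAGToDAG. As a rough C sketch of what gets selected, here is the per-element semantics of a 128-bit vptestmd (vptestnm inverts the test); the widths and function name are illustrative only.

#include <stdint.h>

/* Mask bit i is set when (a[i] & b[i]) != 0. */
uint8_t vptestmd_xmm(const uint32_t a[4], const uint32_t b[4]) {
  uint8_t k = 0;
  for (int i = 0; i < 4; ++i)
    if (a[i] & b[i])
      k |= (uint8_t)(1u << i);
  return k;
}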
@@ -5710,20 +6073,20 @@ multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
X86SchedWriteWidths sched> {
let Predicates = [HasBWI] in {
defm WZ: avx512_vptest<opc, OpcodeStr#"w", sched.ZMM,
- v32i16_info, NAME#"W">, EVEX_V512, VEX_W;
+ v32i16_info>, EVEX_V512, VEX_W;
defm BZ: avx512_vptest<opc, OpcodeStr#"b", sched.ZMM,
- v64i8_info, NAME#"B">, EVEX_V512;
+ v64i8_info>, EVEX_V512;
}
- let Predicates = [HasVLX, HasBWI] in {
+ let Predicates = [HasVLX, HasBWI] in {
defm WZ256: avx512_vptest<opc, OpcodeStr#"w", sched.YMM,
- v16i16x_info, NAME#"W">, EVEX_V256, VEX_W;
+ v16i16x_info>, EVEX_V256, VEX_W;
defm WZ128: avx512_vptest<opc, OpcodeStr#"w", sched.XMM,
- v8i16x_info, NAME#"W">, EVEX_V128, VEX_W;
+ v8i16x_info>, EVEX_V128, VEX_W;
defm BZ256: avx512_vptest<opc, OpcodeStr#"b", sched.YMM,
- v32i8x_info, NAME#"B">, EVEX_V256;
+ v32i8x_info>, EVEX_V256;
defm BZ128: avx512_vptest<opc, OpcodeStr#"b", sched.XMM,
- v16i8x_info, NAME#"B">, EVEX_V128;
+ v16i8x_info>, EVEX_V128;
}
}
@@ -6392,7 +6755,7 @@ let Predicates = [HasAVX512] in {
multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, X86FoldableSchedWrite sched,
- X86VectorVTInfo _, string Suff> {
+ X86VectorVTInfo _> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR], mayRaiseFPException = 1 in {
defm r: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -6400,14 +6763,14 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)),
(_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
- AVX512FMA3Base, Sched<[sched]>;
+ EVEX_4V, Sched<[sched]>;
defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))),
(_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
- AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
@@ -6417,13 +6780,13 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
_.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))),
(MaskOpNode _.RC:$src2,
_.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))), 1, 0>,
- AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched,
- X86VectorVTInfo _, string Suff> {
+ X86VectorVTInfo _> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR] in
defm rb: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -6431,38 +6794,42 @@ multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))),
(_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))), 1, 1>,
- AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
+ EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd,
X86SchedWriteWidths sched,
- AVX512VLVectorVTInfo _, string Suff> {
- let Predicates = [HasAVX512] in {
+ AVX512VLVectorVTInfo _,
+ Predicate prd = HasAVX512> {
+ let Predicates = [prd] in {
defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
- sched.ZMM, _.info512, Suff>,
+ sched.ZMM, _.info512>,
avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
- _.info512, Suff>,
+ _.info512>,
EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
- let Predicates = [HasVLX, HasAVX512] in {
+ let Predicates = [HasVLX, prd] in {
defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
- sched.YMM, _.info256, Suff>,
+ sched.YMM, _.info256>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
- sched.XMM, _.info128, Suff>,
+ sched.XMM, _.info128>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd> {
+ defm PH : avx512_fma3p_213_common<opc, OpcodeStr#"ph", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f16_info, HasFP16>, T_MAP6PD;
defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f32_info, "PS">;
+ avx512vl_f32_info>, T8PD;
defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f64_info, "PD">, VEX_W;
+ avx512vl_f64_info>, T8PD, VEX_W;
}
defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", any_fma,
@@ -6481,7 +6848,7 @@ defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86any_Fnmsub,
multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, X86FoldableSchedWrite sched,
- X86VectorVTInfo _, string Suff> {
+ X86VectorVTInfo _> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR], mayRaiseFPException = 1 in {
defm r: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -6489,14 +6856,14 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
OpcodeStr, "$src3, $src2", "$src2, $src3",
(null_frag),
(_.VT (MaskOpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
- AVX512FMA3Base, Sched<[sched]>;
+ EVEX_4V, Sched<[sched]>;
defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)),
(_.VT (MaskOpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
- AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
@@ -6507,14 +6874,14 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
_.RC:$src1)),
(_.VT (MaskOpNode _.RC:$src2,
(_.VT (_.BroadcastLdFrag addr:$src3)),
- _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B,
+ _.RC:$src1)), 1, 0>, EVEX_4V, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched,
- X86VectorVTInfo _, string Suff> {
+ X86VectorVTInfo _> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR] in
defm rb: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -6522,38 +6889,42 @@ multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(null_frag),
(_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc))),
- 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
+ 1, 1>, EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd,
X86SchedWriteWidths sched,
- AVX512VLVectorVTInfo _, string Suff> {
- let Predicates = [HasAVX512] in {
+ AVX512VLVectorVTInfo _,
+ Predicate prd = HasAVX512> {
+ let Predicates = [prd] in {
defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
- sched.ZMM, _.info512, Suff>,
+ sched.ZMM, _.info512>,
avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
- _.info512, Suff>,
+ _.info512>,
EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
- let Predicates = [HasVLX, HasAVX512] in {
+ let Predicates = [HasVLX, prd] in {
defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
- sched.YMM, _.info256, Suff>,
+ sched.YMM, _.info256>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
- sched.XMM, _.info128, Suff>,
+ sched.XMM, _.info128>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd > {
+ defm PH : avx512_fma3p_231_common<opc, OpcodeStr#"ph", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f16_info, HasFP16>, T_MAP6PD;
defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f32_info, "PS">;
+ avx512vl_f32_info>, T8PD;
defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f64_info, "PD">, VEX_W;
+ avx512vl_f64_info>, T8PD, VEX_W;
}
defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", any_fma,
@@ -6571,7 +6942,7 @@ defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86any_Fnmsub,
multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, X86FoldableSchedWrite sched,
- X86VectorVTInfo _, string Suff> {
+ X86VectorVTInfo _> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR], mayRaiseFPException = 1 in {
defm r: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -6579,7 +6950,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
OpcodeStr, "$src3, $src2", "$src2, $src3",
(null_frag),
(_.VT (MaskOpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1>,
- AVX512FMA3Base, Sched<[sched]>;
+ EVEX_4V, Sched<[sched]>;
// Pattern is 312 order so that the load is in a different place from the
// 213 and 231 patterns; this helps TableGen's duplicate pattern detection.
@@ -6588,7 +6959,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)),
(_.VT (MaskOpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), 1, 0>,
- AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
// Pattern is 312 order so that the load is in a different place from the
// 213 and 231 patterns; this helps TableGen's duplicate pattern detection.
@@ -6600,13 +6971,13 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
_.RC:$src1, _.RC:$src2)),
(_.VT (MaskOpNode (_.VT (_.BroadcastLdFrag addr:$src3)),
_.RC:$src1, _.RC:$src2)), 1, 0>,
- AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched,
- X86VectorVTInfo _, string Suff> {
+ X86VectorVTInfo _> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR] in
defm rb: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -6614,38 +6985,42 @@ multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(null_frag),
(_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 timm:$rc))),
- 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
+ 1, 1>, EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd,
X86SchedWriteWidths sched,
- AVX512VLVectorVTInfo _, string Suff> {
- let Predicates = [HasAVX512] in {
+ AVX512VLVectorVTInfo _,
+ Predicate prd = HasAVX512> {
+ let Predicates = [prd] in {
defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
- sched.ZMM, _.info512, Suff>,
+ sched.ZMM, _.info512>,
avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
- _.info512, Suff>,
+ _.info512>,
EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
- let Predicates = [HasVLX, HasAVX512] in {
+ let Predicates = [HasVLX, prd] in {
defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
- sched.YMM, _.info256, Suff>,
+ sched.YMM, _.info256>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
- sched.XMM, _.info128, Suff>,
+ sched.XMM, _.info128>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd > {
+ defm PH : avx512_fma3p_132_common<opc, OpcodeStr#"ph", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f16_info, HasFP16>, T_MAP6PD;
defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f32_info, "PS">;
+ avx512vl_f32_info>, T8PD;
defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f64_info, "PD">, VEX_W;
+ avx512vl_f64_info>, T8PD, VEX_W;
}
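Editorial aside, not part of the patch: the "312 order" comments in avx512_fma3p_132_rm refer to the three FMA operand arrangements shared by the 132/213/231 families. A scalar C sketch, with a standing for the destination ($src1) and b, c for $src2 and $src3 (the foldable memory operand is always $src3):

/* vfmadd132: dst = dst * src3 + src2 */
float fmadd132(float a, float b, float c) { return a * c + b; }
/* vfmadd213: dst = src2 * dst + src3 */
float fmadd213(float a, float b, float c) { return b * a + c; }
/* vfmadd231: dst = src2 * src3 + dst */
float fmadd231(float a, float b, float c) { return b * c + a; }

Writing the 132 memory pattern with operands in (load, $src1, $src2) order puts the load in a different fma operand slot than the 213 and 231 patterns, which is what keeps TableGen's duplicate-pattern detection quiet.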
defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", any_fma,
@@ -6668,39 +7043,39 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3), OpcodeStr,
"$src3, $src2", "$src2, $src3", (null_frag), 1, 1>,
- AVX512FMA3Base, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC;
+ EVEX_4V, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC;
let mayLoad = 1 in
defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr,
"$src3, $src2", "$src2, $src3", (null_frag), 1, 1>,
- AVX512FMA3Base, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC;
+ EVEX_4V, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC;
let Uses = [MXCSR] in
defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (null_frag), 1, 1>,
- AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>;
+ EVEX_4V, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>;
let isCodeGenOnly = 1, isCommutable = 1 in {
- def r : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst),
+ def r : AVX512<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC;
- def m : AVX512FMA3S<opc, MRMSrcMem, (outs _.FRC:$dst),
+ !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>, EVEX_4V, SIMD_EXC;
+ def m : AVX512<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [RHS_m]>, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC;
+ [RHS_m]>, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, EVEX_4V, SIMD_EXC;
let Uses = [MXCSR] in
- def rb : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst),
+ def rb : AVX512<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3, AVX512RC:$rc),
!strconcat(OpcodeStr,
"\t{$rc, $src3, $src2, $dst|$dst, $src2, $src3, $rc}"),
!if(MaskOnlyReg, [], [RHS_b])>, EVEX_B, EVEX_RC,
- Sched<[SchedWriteFMA.Scl]>;
+ Sched<[SchedWriteFMA.Scl]>, EVEX_4V;
}// isCodeGenOnly = 1
}// Constraints = "$src1 = $dst"
}
@@ -6744,10 +7119,15 @@ multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
let Predicates = [HasAVX512] in {
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
OpNodeRnd, f32x_info, "SS">,
- EVEX_CD8<32, CD8VT1>, VEX_LIG;
+ EVEX_CD8<32, CD8VT1>, VEX_LIG, T8PD;
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
OpNodeRnd, f64x_info, "SD">,
- EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
+ EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W, T8PD;
+ }
+ let Predicates = [HasFP16] in {
+ defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
+ OpNodeRnd, f16x_info, "SH">,
+ EVEX_CD8<16, CD8VT1>, VEX_LIG, T_MAP6PD;
}
}
@@ -6759,8 +7139,9 @@ defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86any_Fnmsub, X86Fnmsu
multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
SDNode RndOp, string Prefix,
string Suffix, SDNode Move,
- X86VectorVTInfo _, PatLeaf ZeroFP> {
- let Predicates = [HasAVX512] in {
+ X86VectorVTInfo _, PatLeaf ZeroFP,
+ Predicate prd = HasAVX512> {
+ let Predicates = [prd] in {
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
(Op _.FRC:$src2,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
@@ -6958,6 +7339,14 @@ multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
(_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
}
}
+defm : avx512_scalar_fma_patterns<any_fma, fma, X86FmaddRnd, "VFMADD", "SH",
+ X86Movsh, v8f16x_info, fp16imm0, HasFP16>;
+defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86Fmsub, X86FmsubRnd, "VFMSUB", "SH",
+ X86Movsh, v8f16x_info, fp16imm0, HasFP16>;
+defm : avx512_scalar_fma_patterns<X86any_Fnmadd, X86Fnmadd, X86FnmaddRnd, "VFNMADD", "SH",
+ X86Movsh, v8f16x_info, fp16imm0, HasFP16>;
+defm : avx512_scalar_fma_patterns<X86any_Fnmsub, X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SH",
+ X86Movsh, v8f16x_info, fp16imm0, HasFP16>;
defm : avx512_scalar_fma_patterns<any_fma, fma, X86FmaddRnd, "VFMADD",
"SS", X86Movss, v4f32x_info, fp32imm0>;
@@ -6990,13 +7379,13 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
- AVX512FMA3Base, Sched<[sched]>;
+ T8PD, EVEX_4V, Sched<[sched]>;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>,
- AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ T8PD, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
@@ -7005,7 +7394,7 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode _.RC:$src2,
(_.VT (_.BroadcastLdFrag addr:$src3)),
_.RC:$src1)>,
- AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ T8PD, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
} // Constraints = "$src1 = $dst"
@@ -7190,8 +7579,8 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT,
X86VectorVTInfo DstVT, SDNode OpNode,
SDNode OpNodeRnd,
X86FoldableSchedWrite sched, string asm,
- string aliasStr> {
- let Predicates = [HasAVX512], ExeDomain = SrcVT.ExeDomain in {
+ string aliasStr, Predicate prd = HasAVX512> {
+ let Predicates = [prd], ExeDomain = SrcVT.ExeDomain in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src)))]>,
@@ -7207,7 +7596,7 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT,
[(set DstVT.RC:$dst, (OpNode
(SrcVT.ScalarIntMemFrags addr:$src)))]>,
EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
- } // Predicates = [HasAVX512]
+ } // Predicates = [prd]
def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "rr_Int") DstVT.RC:$dst, SrcVT.RC:$src), 0, "att">;
@@ -7246,8 +7635,7 @@ defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info, X86cvts2u
multiclass avx512_cvt_s<bits<8> opc, string asm, X86VectorVTInfo SrcVT,
X86VectorVTInfo DstVT, SDNode OpNode,
- X86FoldableSchedWrite sched,
- string aliasStr> {
+ X86FoldableSchedWrite sched> {
let Predicates = [HasAVX512], ExeDomain = SrcVT.ExeDomain in {
let isCodeGenOnly = 1 in {
def rr : AVX512<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.FRC:$src),
@@ -7263,17 +7651,13 @@ multiclass avx512_cvt_s<bits<8> opc, string asm, X86VectorVTInfo SrcVT,
}
defm VCVTSS2SIZ: avx512_cvt_s<0x2D, "vcvtss2si", f32x_info, i32x_info,
- lrint, WriteCvtSS2I,
- "{l}">, XS, EVEX_CD8<32, CD8VT1>;
+ lrint, WriteCvtSS2I>, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTSS2SI64Z: avx512_cvt_s<0x2D, "vcvtss2si", f32x_info, i64x_info,
- llrint, WriteCvtSS2I,
- "{q}">, VEX_W, XS, EVEX_CD8<32, CD8VT1>;
+ llrint, WriteCvtSS2I>, VEX_W, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTSD2SIZ: avx512_cvt_s<0x2D, "vcvtsd2si", f64x_info, i32x_info,
- lrint, WriteCvtSD2I,
- "{l}">, XD, EVEX_CD8<64, CD8VT1>;
+ lrint, WriteCvtSD2I>, XD, EVEX_CD8<64, CD8VT1>;
defm VCVTSD2SI64Z: avx512_cvt_s<0x2D, "vcvtsd2si", f64x_info, i64x_info,
- llrint, WriteCvtSD2I,
- "{q}">, VEX_W, XD, EVEX_CD8<64, CD8VT1>;
+ llrint, WriteCvtSD2I>, VEX_W, XD, EVEX_CD8<64, CD8VT1>;
let Predicates = [HasAVX512] in {
def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64Zrr FR32:$src)>;
@@ -7371,8 +7755,9 @@ def : Pat<(v2f64 (X86Movsd
multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
X86VectorVTInfo _DstRC, SDPatternOperator OpNode,
SDNode OpNodeInt, SDNode OpNodeSAE,
- X86FoldableSchedWrite sched, string aliasStr>{
-let Predicates = [HasAVX512], ExeDomain = _SrcRC.ExeDomain in {
+ X86FoldableSchedWrite sched, string aliasStr,
+ Predicate prd = HasAVX512> {
+let Predicates = [prd], ExeDomain = _SrcRC.ExeDomain in {
let isCodeGenOnly = 1 in {
def rr : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
@@ -7399,7 +7784,7 @@ let Predicates = [HasAVX512], ExeDomain = _SrcRC.ExeDomain in {
[(set _DstRC.RC:$dst,
(OpNodeInt (_SrcRC.ScalarIntMemFrags addr:$src)))]>,
EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
-} //HasAVX512
+} // Predicates = [prd]
def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "rr_Int") _DstRC.RC:$dst, _SrcRC.RC:$src), 0, "att">;
@@ -7497,33 +7882,47 @@ multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInf
EVEX_4V, VEX_LIG, Sched<[sched]>,
EVEX_B, EVEX_RC;
}
-multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr,
+multiclass avx512_cvt_fp_scalar_trunc<bits<8> opc, string OpcodeStr,
SDNode OpNode, SDNode OpNodeRnd,
X86FoldableSchedWrite sched,
- X86VectorVTInfo _src, X86VectorVTInfo _dst> {
- let Predicates = [HasAVX512], ExeDomain = SSEPackedSingle in {
+ X86VectorVTInfo _src, X86VectorVTInfo _dst,
+ Predicate prd = HasAVX512> {
+ let Predicates = [prd], ExeDomain = SSEPackedSingle in {
defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode, sched>,
avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src,
- OpNodeRnd, sched>, VEX_W, EVEX_CD8<64, CD8VT1>, XD;
+ OpNodeRnd, sched>, EVEX_CD8<_src.EltSize, CD8VT1>;
}
}
-multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr,
- SDNode OpNode, SDNode OpNodeSAE,
- X86FoldableSchedWrite sched,
- X86VectorVTInfo _src, X86VectorVTInfo _dst> {
- let Predicates = [HasAVX512], ExeDomain = SSEPackedSingle in {
+multiclass avx512_cvt_fp_scalar_extend<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode OpNodeSAE,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _src, X86VectorVTInfo _dst,
+ Predicate prd = HasAVX512> {
+ let Predicates = [prd], ExeDomain = SSEPackedSingle in {
defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode, sched>,
avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeSAE, sched>,
- EVEX_CD8<32, CD8VT1>, XS;
+ EVEX_CD8<_src.EltSize, CD8VT1>;
}
}
-defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86frounds,
+defm VCVTSD2SS : avx512_cvt_fp_scalar_trunc<0x5A, "vcvtsd2ss", X86frounds,
X86froundsRnd, WriteCvtSD2SS, f64x_info,
- f32x_info>;
-defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpexts,
+ f32x_info>, XD, VEX_W;
+defm VCVTSS2SD : avx512_cvt_fp_scalar_extend<0x5A, "vcvtss2sd", X86fpexts,
X86fpextsSAE, WriteCvtSS2SD, f32x_info,
- f64x_info>;
+ f64x_info>, XS;
+defm VCVTSD2SH : avx512_cvt_fp_scalar_trunc<0x5A, "vcvtsd2sh", X86frounds,
+ X86froundsRnd, WriteCvtSD2SS, f64x_info,
+ f16x_info, HasFP16>, T_MAP5XD, VEX_W;
+defm VCVTSH2SD : avx512_cvt_fp_scalar_extend<0x5A, "vcvtsh2sd", X86fpexts,
+ X86fpextsSAE, WriteCvtSS2SD, f16x_info,
+ f64x_info, HasFP16>, T_MAP5XS;
+defm VCVTSS2SH : avx512_cvt_fp_scalar_trunc<0x1D, "vcvtss2sh", X86frounds,
+ X86froundsRnd, WriteCvtSD2SS, f32x_info,
+ f16x_info, HasFP16>, T_MAP5PS;
+defm VCVTSH2SS : avx512_cvt_fp_scalar_extend<0x13, "vcvtsh2ss", X86fpexts,
+ X86fpextsSAE, WriteCvtSS2SD, f16x_info,
+ f32x_info, HasFP16>, T_MAP6PS;
def : Pat<(f64 (any_fpextend FR32X:$src)),
(VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), FR32X:$src)>,
@@ -7536,6 +7935,27 @@ def : Pat<(f32 (any_fpround FR64X:$src)),
(VCVTSD2SSZrr (f32 (IMPLICIT_DEF)), FR64X:$src)>,
Requires<[HasAVX512]>;
+def : Pat<(f32 (any_fpextend FR16X:$src)),
+ (VCVTSH2SSZrr (f32 (IMPLICIT_DEF)), FR16X:$src)>,
+ Requires<[HasFP16]>;
+def : Pat<(f32 (any_fpextend (loadf16 addr:$src))),
+ (VCVTSH2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasFP16, OptForSize]>;
+
+def : Pat<(f64 (any_fpextend FR16X:$src)),
+ (VCVTSH2SDZrr (f64 (IMPLICIT_DEF)), FR16X:$src)>,
+ Requires<[HasFP16]>;
+def : Pat<(f64 (any_fpextend (loadf16 addr:$src))),
+ (VCVTSH2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasFP16, OptForSize]>;
+
+def : Pat<(f16 (any_fpround FR32X:$src)),
+ (VCVTSS2SHZrr (f16 (IMPLICIT_DEF)), FR32X:$src)>,
+ Requires<[HasFP16]>;
+def : Pat<(f16 (any_fpround FR64X:$src)),
+ (VCVTSD2SHZrr (f16 (IMPLICIT_DEF)), FR64X:$src)>,
+ Requires<[HasFP16]>;
+
def : Pat<(v4f32 (X86Movss
(v4f32 VR128X:$dst),
(v4f32 (scalar_to_vector
@@ -7649,39 +8069,76 @@ multiclass avx512_vcvt_fpextend<bits<8> opc, string OpcodeStr, X86VectorVTInfo _
(_.VT (!cast<PatFrag>("extload"#_Src.VTName) addr:$src)),
(_.VT (!cast<PatFrag>("extload"#_Src.VTName) addr:$src))>;
-// Extend Float to Double
-multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr,
- X86SchedWriteWidths sched> {
- let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fpextend<opc, OpcodeStr, v8f64_info, v8f32x_info,
+// Extend [Float to Double, Half to Float]
+multiclass avx512_cvt_extend<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _dst, AVX512VLVectorVTInfo _src,
+ X86SchedWriteWidths sched, Predicate prd = HasAVX512> {
+ let Predicates = [prd] in {
+ defm Z : avx512_vcvt_fpextend<opc, OpcodeStr, _dst.info512, _src.info256,
any_fpextend, fpextend, sched.ZMM>,
- avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, _dst.info512, _src.info256,
X86vfpextSAE, sched.ZMM>, EVEX_V512;
}
- let Predicates = [HasVLX] in {
- defm Z128 : avx512_vcvt_fpextend<opc, OpcodeStr, v2f64x_info, v4f32x_info,
- X86any_vfpext, X86vfpext, sched.XMM, "{1to2}",
+ let Predicates = [prd, HasVLX] in {
+ defm Z128 : avx512_vcvt_fpextend<opc, OpcodeStr, _dst.info128, _src.info128,
+ X86any_vfpext, X86vfpext, sched.XMM,
+ _dst.info128.BroadcastStr,
"", f64mem>, EVEX_V128;
- defm Z256 : avx512_vcvt_fpextend<opc, OpcodeStr, v4f64x_info, v4f32x_info,
- any_fpextend, fpextend, sched.YMM>, EVEX_V256;
+ defm Z256 : avx512_vcvt_fpextend<opc, OpcodeStr, _dst.info256, _src.info128,
+ any_fpextend, fpextend, sched.YMM>, EVEX_V256;
}
}
-// Truncate Double to Float
-multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> {
- let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info,
+// Truncate [Double to Float, Float to Half]
+multiclass avx512_cvt_trunc<bits<8> opc, string OpcodeStr,
+ AVX512VLVectorVTInfo _dst, AVX512VLVectorVTInfo _src,
+ X86SchedWriteWidths sched, Predicate prd = HasAVX512,
+ PatFrag bcast128 = _src.info128.BroadcastLdFrag,
+ PatFrag loadVT128 = _src.info128.LdFrag,
+ RegisterClass maskRC128 = _src.info128.KRCWM> {
+ let Predicates = [prd] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, _dst.info256, _src.info512,
X86any_vfpround, X86vfpround, sched.ZMM>,
- avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, _dst.info256, _src.info512,
X86vfproundRnd, sched.ZMM>, EVEX_V512;
}
- let Predicates = [HasVLX] in {
- defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info,
- null_frag, null_frag, sched.XMM, "{1to2}", "{x}",
- f128mem, VK2WM>, EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info,
+ let Predicates = [prd, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, _dst.info128, _src.info128,
+ null_frag, null_frag, sched.XMM,
+ _src.info128.BroadcastStr, "{x}",
+ f128mem, maskRC128>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, _dst.info128, _src.info256,
X86any_vfpround, X86vfpround,
- sched.YMM, "{1to4}", "{y}">, EVEX_V256;
+ sched.YMM, _src.info256.BroadcastStr, "{y}">, EVEX_V256;
+
+ // Special patterns to allow use of X86vmfpround for masking. Instruction
+ // patterns have been disabled with null_frag.
+ def : Pat<(_dst.info128.VT (X86any_vfpround (_src.info128.VT VR128X:$src))),
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$src)>;
+ def : Pat<(X86vmfpround (_src.info128.VT VR128X:$src), (_dst.info128.VT VR128X:$src0),
+ maskRC128:$mask),
+ (!cast<Instruction>(NAME # "Z128rrk") VR128X:$src0, maskRC128:$mask, VR128X:$src)>;
+ def : Pat<(X86vmfpround (_src.info128.VT VR128X:$src), _dst.info128.ImmAllZerosV,
+ maskRC128:$mask),
+ (!cast<Instruction>(NAME # "Z128rrkz") maskRC128:$mask, VR128X:$src)>;
+
+ def : Pat<(_dst.info128.VT (X86any_vfpround (loadVT128 addr:$src))),
+ (!cast<Instruction>(NAME # "Z128rm") addr:$src)>;
+ def : Pat<(X86vmfpround (loadVT128 addr:$src), (_dst.info128.VT VR128X:$src0),
+ maskRC128:$mask),
+ (!cast<Instruction>(NAME # "Z128rmk") VR128X:$src0, maskRC128:$mask, addr:$src)>;
+ def : Pat<(X86vmfpround (loadVT128 addr:$src), _dst.info128.ImmAllZerosV,
+ maskRC128:$mask),
+ (!cast<Instruction>(NAME # "Z128rmkz") maskRC128:$mask, addr:$src)>;
+
+ def : Pat<(_dst.info128.VT (X86any_vfpround (_src.info128.VT (bcast128 addr:$src)))),
+ (!cast<Instruction>(NAME # "Z128rmb") addr:$src)>;
+ def : Pat<(X86vmfpround (_src.info128.VT (bcast128 addr:$src)),
+ (_dst.info128.VT VR128X:$src0), maskRC128:$mask),
+ (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$src0, maskRC128:$mask, addr:$src)>;
+ def : Pat<(X86vmfpround (_src.info128.VT (bcast128 addr:$src)),
+ _dst.info128.ImmAllZerosV, maskRC128:$mask),
+ (!cast<Instruction>(NAME # "Z128rmbkz") maskRC128:$mask, addr:$src)>;
}
def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
@@ -7725,40 +8182,185 @@ multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sc
VK4WM:$mask, f64mem:$src), 0, "att">;
}
-defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SchedWriteCvtPD2PS>,
+defm VCVTPD2PS : avx512_cvt_trunc<0x5A, "vcvtpd2ps",
+ avx512vl_f32_info, avx512vl_f64_info, SchedWriteCvtPD2PS>,
VEX_W, PD, EVEX_CD8<64, CD8VF>;
-defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd", SchedWriteCvtPS2PD>,
- PS, EVEX_CD8<32, CD8VH>;
+defm VCVTPS2PD : avx512_cvt_extend<0x5A, "vcvtps2pd",
+ avx512vl_f64_info, avx512vl_f32_info, SchedWriteCvtPS2PD>,
+ PS, EVEX_CD8<32, CD8VH>;
-let Predicates = [HasVLX] in {
+// Extend Half to Double
+multiclass avx512_cvtph2pd<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasFP16] in {
+ defm Z : avx512_vcvt_fpextend<opc, OpcodeStr, v8f64_info, v8f16x_info,
+ any_fpextend, fpextend, sched.ZMM>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f16x_info,
+ X86vfpextSAE, sched.ZMM>, EVEX_V512;
+ def : Pat<(v8f64 (extloadv8f16 addr:$src)),
+ (!cast<Instruction>(NAME # "Zrm") addr:$src)>;
+ }
+ let Predicates = [HasFP16, HasVLX] in {
+ defm Z128 : avx512_vcvt_fpextend<opc, OpcodeStr, v2f64x_info, v8f16x_info,
+ X86any_vfpext, X86vfpext, sched.XMM, "{1to2}", "",
+ f32mem>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fpextend<opc, OpcodeStr, v4f64x_info, v8f16x_info,
+ X86any_vfpext, X86vfpext, sched.YMM, "{1to4}", "",
+ f64mem>, EVEX_V256;
+ }
+}
+
+// Truncate Double to Half
+multiclass avx512_cvtpd2ph<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> {
+ let Predicates = [HasFP16] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f16x_info, v8f64_info,
+ X86any_vfpround, X86vfpround, sched.ZMM, "{1to8}", "{z}">,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8f16x_info, v8f64_info,
+ X86vfproundRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasFP16, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v8f16x_info, v2f64x_info, null_frag,
+ null_frag, sched.XMM, "{1to2}", "{x}", f128mem,
+ VK2WM>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f16x_info, v4f64x_info, null_frag,
+ null_frag, sched.YMM, "{1to4}", "{y}", f256mem,
+ VK4WM>, EVEX_V256;
+ }
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
+ VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst,
+ i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
+ VK2WM:$mask, i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
+ VK2WM:$mask, i64mem:$src), 0, "att">;
+
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
+ VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}}|"
+ "$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst,
+ i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
+ VK4WM:$mask, i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
+ VK4WM:$mask, i64mem:$src), 0, "att">;
+
+ def : InstAlias<OpcodeStr#"z\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Zrr") VR128X:$dst,
+ VR512:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"z\t{$src, $dst {${mask}}|"
+ "$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Zrrk") VR128X:$dst,
+ VK8WM:$mask, VR512:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"z\t{$src, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Zrrkz") VR128X:$dst,
+ VK8WM:$mask, VR512:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"z\t{${src}{1to8}, $dst|$dst, ${src}{1to8}}",
+ (!cast<Instruction>(NAME # "Zrmb") VR128X:$dst,
+ i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"z\t{${src}{1to8}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to8}}",
+ (!cast<Instruction>(NAME # "Zrmbk") VR128X:$dst,
+ VK8WM:$mask, i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"z\t{${src}{1to8}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to8}}",
+ (!cast<Instruction>(NAME # "Zrmbkz") VR128X:$dst,
+ VK8WM:$mask, i64mem:$src), 0, "att">;
+}
+
+defm VCVTPS2PHX : avx512_cvt_trunc<0x1D, "vcvtps2phx", avx512vl_f16_info,
+ avx512vl_f32_info, SchedWriteCvtPD2PS,
+ HasFP16>, T_MAP5PD, EVEX_CD8<32, CD8VF>;
+defm VCVTPH2PSX : avx512_cvt_extend<0x13, "vcvtph2psx", avx512vl_f32_info,
+ avx512vl_f16_info, SchedWriteCvtPS2PD,
+ HasFP16>, T_MAP6PD, EVEX_CD8<16, CD8VH>;
+defm VCVTPD2PH : avx512_cvtpd2ph<0x5A, "vcvtpd2ph", SchedWriteCvtPD2PS>,
+ VEX_W, T_MAP5PD, EVEX_CD8<64, CD8VF>;
+defm VCVTPH2PD : avx512_cvtph2pd<0x5A, "vcvtph2pd", SchedWriteCvtPS2PD>,
+ T_MAP5PS, EVEX_CD8<16, CD8VQ>;
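+// These expand to the usual Z/Z256/Z128 instruction variants (e.g.
+// VCVTPD2PHZ256rr, VCVTPD2PHZ128rmb), which the masking patterns below
+// reference directly.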
+
+let Predicates = [HasFP16, HasVLX] in {
// Special patterns to allow use of X86vmfpround for masking. Instruction
// patterns have been disabled with null_frag.
- def : Pat<(X86any_vfpround (v2f64 VR128X:$src)),
- (VCVTPD2PSZ128rr VR128X:$src)>;
- def : Pat<(X86vmfpround (v2f64 VR128X:$src), (v4f32 VR128X:$src0),
+ def : Pat<(v8f16 (X86any_vfpround (v4f64 VR256X:$src))),
+ (VCVTPD2PHZ256rr VR256X:$src)>;
+ def : Pat<(v8f16 (X86vmfpround (v4f64 VR256X:$src), (v8f16 VR128X:$src0),
+ VK4WM:$mask)),
+ (VCVTPD2PHZ256rrk VR128X:$src0, VK4WM:$mask, VR256X:$src)>;
+ def : Pat<(X86vmfpround (v4f64 VR256X:$src), v8f16x_info.ImmAllZerosV,
+ VK4WM:$mask),
+ (VCVTPD2PHZ256rrkz VK4WM:$mask, VR256X:$src)>;
+
+ def : Pat<(v8f16 (X86any_vfpround (loadv4f64 addr:$src))),
+ (VCVTPD2PHZ256rm addr:$src)>;
+ def : Pat<(X86vmfpround (loadv4f64 addr:$src), (v8f16 VR128X:$src0),
+ VK4WM:$mask),
+ (VCVTPD2PHZ256rmk VR128X:$src0, VK4WM:$mask, addr:$src)>;
+ def : Pat<(X86vmfpround (loadv4f64 addr:$src), v8f16x_info.ImmAllZerosV,
+ VK4WM:$mask),
+ (VCVTPD2PHZ256rmkz VK4WM:$mask, addr:$src)>;
+
+ def : Pat<(v8f16 (X86any_vfpround (v4f64 (X86VBroadcastld64 addr:$src)))),
+ (VCVTPD2PHZ256rmb addr:$src)>;
+ def : Pat<(X86vmfpround (v4f64 (X86VBroadcastld64 addr:$src)),
+ (v8f16 VR128X:$src0), VK4WM:$mask),
+ (VCVTPD2PHZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>;
+ def : Pat<(X86vmfpround (v4f64 (X86VBroadcastld64 addr:$src)),
+ v8f16x_info.ImmAllZerosV, VK4WM:$mask),
+ (VCVTPD2PHZ256rmbkz VK4WM:$mask, addr:$src)>;
+
+ def : Pat<(v8f16 (X86any_vfpround (v2f64 VR128X:$src))),
+ (VCVTPD2PHZ128rr VR128X:$src)>;
+ def : Pat<(X86vmfpround (v2f64 VR128X:$src), (v8f16 VR128X:$src0),
VK2WM:$mask),
- (VCVTPD2PSZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>;
- def : Pat<(X86vmfpround (v2f64 VR128X:$src), v4f32x_info.ImmAllZerosV,
+ (VCVTPD2PHZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>;
+ def : Pat<(X86vmfpround (v2f64 VR128X:$src), v8f16x_info.ImmAllZerosV,
VK2WM:$mask),
- (VCVTPD2PSZ128rrkz VK2WM:$mask, VR128X:$src)>;
+ (VCVTPD2PHZ128rrkz VK2WM:$mask, VR128X:$src)>;
- def : Pat<(X86any_vfpround (loadv2f64 addr:$src)),
- (VCVTPD2PSZ128rm addr:$src)>;
- def : Pat<(X86vmfpround (loadv2f64 addr:$src), (v4f32 VR128X:$src0),
+ def : Pat<(v8f16 (X86any_vfpround (loadv2f64 addr:$src))),
+ (VCVTPD2PHZ128rm addr:$src)>;
+ def : Pat<(X86vmfpround (loadv2f64 addr:$src), (v8f16 VR128X:$src0),
VK2WM:$mask),
- (VCVTPD2PSZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86vmfpround (loadv2f64 addr:$src), v4f32x_info.ImmAllZerosV,
+ (VCVTPD2PHZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86vmfpround (loadv2f64 addr:$src), v8f16x_info.ImmAllZerosV,
VK2WM:$mask),
- (VCVTPD2PSZ128rmkz VK2WM:$mask, addr:$src)>;
+ (VCVTPD2PHZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(X86any_vfpround (v2f64 (X86VBroadcastld64 addr:$src))),
- (VCVTPD2PSZ128rmb addr:$src)>;
+ def : Pat<(v8f16 (X86any_vfpround (v2f64 (X86VBroadcastld64 addr:$src)))),
+ (VCVTPD2PHZ128rmb addr:$src)>;
def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)),
- (v4f32 VR128X:$src0), VK2WM:$mask),
- (VCVTPD2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ (v8f16 VR128X:$src0), VK2WM:$mask),
+ (VCVTPD2PHZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)),
- v4f32x_info.ImmAllZerosV, VK2WM:$mask),
- (VCVTPD2PSZ128rmbkz VK2WM:$mask, addr:$src)>;
+ v8f16x_info.ImmAllZerosV, VK2WM:$mask),
+ (VCVTPD2PHZ128rmbkz VK2WM:$mask, addr:$src)>;
}
// Convert Signed/Unsigned Doubleword to Double
@@ -8079,26 +8681,60 @@ multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
}
// Convert Signed/Unsigned Quadword to Float
-multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
- SDNode MaskOpNode, SDNode OpNodeRnd,
- X86SchedWriteWidths sched> {
- let Predicates = [HasDQI] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode,
+// Also Convert Signed/Unsigned Doubleword to Half
+multiclass avx512_cvtqq2ps_dq2ph<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ SDPatternOperator MaskOpNode, SDPatternOperator OpNode128,
+ SDPatternOperator OpNode128M, SDPatternOperator OpNodeRnd,
+ AVX512VLVectorVTInfo _dst, AVX512VLVectorVTInfo _src,
+ X86SchedWriteWidths sched, Predicate prd = HasDQI> {
+ let Predicates = [prd] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, _dst.info256, _src.info512, OpNode,
MaskOpNode, sched.ZMM>,
- avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, _dst.info256, _src.info512,
OpNodeRnd, sched.ZMM>, EVEX_V512;
}
- let Predicates = [HasDQI, HasVLX] in {
+ let Predicates = [prd, HasVLX] in {
// we need "x"/"y" suffixes in order to distinguish between 128 and 256
// memory forms of these instructions in Asm Parser. They have the same
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// for the same reason.
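// For example, the AT&T aliases "vcvtqq2psx"/"vcvtqq2psy" let the assembler
// tell the 128-bit and 256-bit memory sources apart, since both forms write
// an XMM destination.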
- defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, null_frag,
- null_frag, sched.XMM, "{1to2}", "{x}", i128mem, VK2WM>,
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, _dst.info128, _src.info128, null_frag,
+ null_frag, sched.XMM, _src.info128.BroadcastStr,
+ "{x}", i128mem, _src.info128.KRCWM>,
EVEX_V128, NotEVEX2VEXConvertible;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode,
- MaskOpNode, sched.YMM, "{1to4}", "{y}">, EVEX_V256,
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, _dst.info128, _src.info256, OpNode,
+ MaskOpNode, sched.YMM, _src.info256.BroadcastStr,
+ "{y}">, EVEX_V256,
NotEVEX2VEXConvertible;
+
+ // Special patterns to allow use of X86VM[SU]intToFP for masking. Instruction
+ // patterns have been disabled with null_frag.
+ def : Pat<(_dst.info128.VT (OpNode128 (_src.info128.VT VR128X:$src))),
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$src)>;
+ def : Pat<(OpNode128M (_src.info128.VT VR128X:$src), (_dst.info128.VT VR128X:$src0),
+ _src.info128.KRCWM:$mask),
+ (!cast<Instruction>(NAME # "Z128rrk") VR128X:$src0, _src.info128.KRCWM:$mask, VR128X:$src)>;
+ def : Pat<(OpNode128M (_src.info128.VT VR128X:$src), _dst.info128.ImmAllZerosV,
+ _src.info128.KRCWM:$mask),
+ (!cast<Instruction>(NAME # "Z128rrkz") _src.info128.KRCWM:$mask, VR128X:$src)>;
+
+ def : Pat<(_dst.info128.VT (OpNode128 (_src.info128.LdFrag addr:$src))),
+ (!cast<Instruction>(NAME # "Z128rm") addr:$src)>;
+ def : Pat<(OpNode128M (_src.info128.LdFrag addr:$src), (_dst.info128.VT VR128X:$src0),
+ _src.info128.KRCWM:$mask),
+ (!cast<Instruction>(NAME # "Z128rmk") VR128X:$src0, _src.info128.KRCWM:$mask, addr:$src)>;
+ def : Pat<(OpNode128M (_src.info128.LdFrag addr:$src), _dst.info128.ImmAllZerosV,
+ _src.info128.KRCWM:$mask),
+ (!cast<Instruction>(NAME # "Z128rmkz") _src.info128.KRCWM:$mask, addr:$src)>;
+
+ def : Pat<(_dst.info128.VT (OpNode128 (_src.info128.VT (X86VBroadcastld64 addr:$src)))),
+ (!cast<Instruction>(NAME # "Z128rmb") addr:$src)>;
+ def : Pat<(OpNode128M (_src.info128.VT (X86VBroadcastld64 addr:$src)),
+ (_dst.info128.VT VR128X:$src0), _src.info128.KRCWM:$mask),
+ (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$src0, _src.info128.KRCWM:$mask, addr:$src)>;
+ def : Pat<(OpNode128M (_src.info128.VT (X86VBroadcastld64 addr:$src)),
+ _dst.info128.ImmAllZerosV, _src.info128.KRCWM:$mask),
+ (!cast<Instruction>(NAME # "Z128rmbkz") _src.info128.KRCWM:$mask, addr:$src)>;
}
def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
@@ -8240,13 +8876,29 @@ defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", any_uint_to_fp,
uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PD>,
VEX_W, XS, EVEX_CD8<64, CD8VF>;
-defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", any_sint_to_fp,
- sint_to_fp, X86VSintToFpRnd, SchedWriteCvtDQ2PS>,
- VEX_W, PS, EVEX_CD8<64, CD8VF>;
+defm VCVTDQ2PH : avx512_cvtqq2ps_dq2ph<0x5B, "vcvtdq2ph", any_sint_to_fp, sint_to_fp,
+ X86any_VSintToFP, X86VMSintToFP,
+ X86VSintToFpRnd, avx512vl_f16_info, avx512vl_i32_info,
+ SchedWriteCvtDQ2PS, HasFP16>,
+ T_MAP5PS, EVEX_CD8<32, CD8VF>;
-defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", any_uint_to_fp,
- uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PS>,
- VEX_W, XD, EVEX_CD8<64, CD8VF>;
+defm VCVTUDQ2PH : avx512_cvtqq2ps_dq2ph<0x7A, "vcvtudq2ph", any_uint_to_fp, uint_to_fp,
+ X86any_VUintToFP, X86VMUintToFP,
+ X86VUintToFpRnd, avx512vl_f16_info, avx512vl_i32_info,
+ SchedWriteCvtDQ2PS, HasFP16>, T_MAP5XD,
+ EVEX_CD8<32, CD8VF>;
+
+defm VCVTQQ2PS : avx512_cvtqq2ps_dq2ph<0x5B, "vcvtqq2ps", any_sint_to_fp, sint_to_fp,
+ X86any_VSintToFP, X86VMSintToFP,
+ X86VSintToFpRnd, avx512vl_f32_info, avx512vl_i64_info,
+ SchedWriteCvtDQ2PS>, VEX_W, PS,
+ EVEX_CD8<64, CD8VF>;
+
+defm VCVTUQQ2PS : avx512_cvtqq2ps_dq2ph<0x7A, "vcvtuqq2ps", any_uint_to_fp, uint_to_fp,
+ X86any_VUintToFP, X86VMUintToFP,
+ X86VUintToFpRnd, avx512vl_f32_info, avx512vl_i64_info,
+ SchedWriteCvtDQ2PS>, VEX_W, XD,
+ EVEX_CD8<64, CD8VF>;
let Predicates = [HasVLX] in {
// Special patterns to allow use of X86mcvtp2Int for masking. Instruction
@@ -8436,66 +9088,6 @@ let Predicates = [HasVLX] in {
(VCVTUDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
}
-let Predicates = [HasDQI, HasVLX] in {
- // Special patterns to allow use of X86VMSintToFP for masking. Instruction
- // patterns have been disabled with null_frag.
- def : Pat<(v4f32 (X86any_VSintToFP (v2i64 VR128X:$src))),
- (VCVTQQ2PSZ128rr VR128X:$src)>;
- def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), (v4f32 VR128X:$src0),
- VK2WM:$mask),
- (VCVTQQ2PSZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>;
- def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), v4f32x_info.ImmAllZerosV,
- VK2WM:$mask),
- (VCVTQQ2PSZ128rrkz VK2WM:$mask, VR128X:$src)>;
-
- def : Pat<(v4f32 (X86any_VSintToFP (loadv2i64 addr:$src))),
- (VCVTQQ2PSZ128rm addr:$src)>;
- def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), (v4f32 VR128X:$src0),
- VK2WM:$mask),
- (VCVTQQ2PSZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), v4f32x_info.ImmAllZerosV,
- VK2WM:$mask),
- (VCVTQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>;
-
- def : Pat<(v4f32 (X86any_VSintToFP (v2i64 (X86VBroadcastld64 addr:$src)))),
- (VCVTQQ2PSZ128rmb addr:$src)>;
- def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
- (v4f32 VR128X:$src0), VK2WM:$mask),
- (VCVTQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
- v4f32x_info.ImmAllZerosV, VK2WM:$mask),
- (VCVTQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>;
-
- // Special patterns to allow use of X86VMUintToFP for masking. Instruction
- // patterns have been disabled with null_frag.
- def : Pat<(v4f32 (X86any_VUintToFP (v2i64 VR128X:$src))),
- (VCVTUQQ2PSZ128rr VR128X:$src)>;
- def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), (v4f32 VR128X:$src0),
- VK2WM:$mask),
- (VCVTUQQ2PSZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>;
- def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), v4f32x_info.ImmAllZerosV,
- VK2WM:$mask),
- (VCVTUQQ2PSZ128rrkz VK2WM:$mask, VR128X:$src)>;
-
- def : Pat<(v4f32 (X86any_VUintToFP (loadv2i64 addr:$src))),
- (VCVTUQQ2PSZ128rm addr:$src)>;
- def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), (v4f32 VR128X:$src0),
- VK2WM:$mask),
- (VCVTUQQ2PSZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), v4f32x_info.ImmAllZerosV,
- VK2WM:$mask),
- (VCVTUQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>;
-
- def : Pat<(v4f32 (X86any_VUintToFP (v2i64 (X86VBroadcastld64 addr:$src)))),
- (VCVTUQQ2PSZ128rmb addr:$src)>;
- def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
- (v4f32 VR128X:$src0), VK2WM:$mask),
- (VCVTUQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
- v4f32x_info.ImmAllZerosV, VK2WM:$mask),
- (VCVTUQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>;
-}
-
//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//===----------------------------------------------------------------------===//
@@ -8626,9 +9218,9 @@ let Predicates = [HasVLX] in {
// Unordered/Ordered scalar fp compare with Sae and set EFLAGS
multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _,
- string OpcodeStr, Domain d,
- X86FoldableSchedWrite sched = WriteFComX> {
- let hasSideEffects = 0, Uses = [MXCSR] in
+ string OpcodeStr, Domain d,
+ X86FoldableSchedWrite sched = WriteFComX> {
+ let ExeDomain = d, hasSideEffects = 0, Uses = [MXCSR] in
def rrb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"), []>,
EVEX, EVEX_B, VEX_LIG, EVEX_V128, Sched<[sched]>;
@@ -8675,10 +9267,35 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in {
}
}
-/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
+let Defs = [EFLAGS], Predicates = [HasFP16] in {
+ defm VUCOMISHZ : avx512_ord_cmp_sae<0x2E, v8f16x_info, "vucomish",
+ SSEPackedSingle>, AVX512PSIi8Base, T_MAP5PS,
+ EVEX_CD8<16, CD8VT1>;
+ defm VCOMISHZ : avx512_ord_cmp_sae<0x2F, v8f16x_info, "vcomish",
+ SSEPackedSingle>, AVX512PSIi8Base, T_MAP5PS,
+ EVEX_CD8<16, CD8VT1>;
+ defm VUCOMISHZ : sse12_ord_cmp<0x2E, FR16X, X86any_fcmp, f16, f16mem, loadf16,
+ "ucomish", SSEPackedSingle>, T_MAP5PS, EVEX,
+ VEX_LIG, EVEX_CD8<16, CD8VT1>;
+ defm VCOMISHZ : sse12_ord_cmp<0x2F, FR16X, X86strict_fcmps, f16, f16mem, loadf16,
+ "comish", SSEPackedSingle>, T_MAP5PS, EVEX,
+ VEX_LIG, EVEX_CD8<16, CD8VT1>;
+ let isCodeGenOnly = 1 in {
+ defm VUCOMISHZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v8f16, shmem,
+ sse_load_f16, "ucomish", SSEPackedSingle>,
+ T_MAP5PS, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>;
+
+ defm VCOMISHZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v8f16, shmem,
+ sse_load_f16, "comish", SSEPackedSingle>,
+ T_MAP5PS, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>;
+ }
+}
+
+/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd, rcpsh, rsqrtsh
multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86FoldableSchedWrite sched, X86VectorVTInfo _> {
- let Predicates = [HasAVX512], ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
+ X86FoldableSchedWrite sched, X86VectorVTInfo _,
+ Predicate prd = HasAVX512> {
+ let Predicates = [prd], ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -8693,6 +9310,13 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
}
+defm VRCPSHZ : avx512_fp14_s<0x4D, "vrcpsh", X86rcp14s, SchedWriteFRcp.Scl,
+ f16x_info, HasFP16>, EVEX_CD8<16, CD8VT1>,
+ T_MAP6PD;
+defm VRSQRTSHZ : avx512_fp14_s<0x4F, "vrsqrtsh", X86rsqrt14s,
+ SchedWriteFRsqrt.Scl, f16x_info, HasFP16>,
+ EVEX_CD8<16, CD8VT1>, T_MAP6PD;
+let Uses = [MXCSR] in {
defm VRCP14SSZ : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, SchedWriteFRcp.Scl,
f32x_info>, EVEX_CD8<32, CD8VT1>,
T8PD;
@@ -8705,6 +9329,7 @@ defm VRSQRT14SSZ : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s,
defm VRSQRT14SDZ : avx512_fp14_s<0x4F, "vrsqrt14sd", X86rsqrt14s,
SchedWriteFRsqrt.Scl, f64x_info>, VEX_W,
EVEX_CD8<64, CD8VT1>, T8PD;
+}
/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -8728,33 +9353,45 @@ multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
}
-let Uses = [MXCSR] in
multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86SchedWriteWidths sched> {
- defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, sched.ZMM,
- v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, sched.ZMM,
- v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-
- // Define only if AVX512VL feature is present.
- let Predicates = [HasVLX] in {
- defm PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
- OpNode, sched.XMM, v4f32x_info>,
- EVEX_V128, EVEX_CD8<32, CD8VF>;
- defm PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
- OpNode, sched.YMM, v8f32x_info>,
- EVEX_V256, EVEX_CD8<32, CD8VF>;
- defm PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
- OpNode, sched.XMM, v2f64x_info>,
- EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
- defm PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
- OpNode, sched.YMM, v4f64x_info>,
- EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
+ let Uses = [MXCSR] in {
+ defm 14PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "14ps"), OpNode, sched.ZMM,
+ v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm 14PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "14pd"), OpNode, sched.ZMM,
+ v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
}
-}
+ let Predicates = [HasFP16] in
+ defm PHZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ph"), OpNode, sched.ZMM,
+ v32f16_info>, EVEX_V512, T_MAP6PD, EVEX_CD8<16, CD8VF>;
-defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86rsqrt14, SchedWriteFRsqrt>;
-defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14, SchedWriteFRcp>;
+ // Define only if AVX512VL feature is present.
+ let Predicates = [HasVLX], Uses = [MXCSR] in {
+ defm 14PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "14ps"),
+ OpNode, sched.XMM, v4f32x_info>,
+ EVEX_V128, EVEX_CD8<32, CD8VF>;
+ defm 14PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "14ps"),
+ OpNode, sched.YMM, v8f32x_info>,
+ EVEX_V256, EVEX_CD8<32, CD8VF>;
+ defm 14PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "14pd"),
+ OpNode, sched.XMM, v2f64x_info>,
+ EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
+ defm 14PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "14pd"),
+ OpNode, sched.YMM, v4f64x_info>,
+ EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
+ }
+ let Predicates = [HasFP16, HasVLX] in {
+ defm PHZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ph"),
+ OpNode, sched.XMM, v8f16x_info>,
+ EVEX_V128, T_MAP6PD, EVEX_CD8<16, CD8VF>;
+ defm PHZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ph"),
+ OpNode, sched.YMM, v16f16x_info>,
+ EVEX_V256, T_MAP6PD, EVEX_CD8<16, CD8VF>;
+ }
+}
+
+defm VRSQRT : avx512_fp14_p_vl_all<0x4E, "vrsqrt", X86rsqrt14, SchedWriteFRsqrt>;
+defm VRCP : avx512_fp14_p_vl_all<0x4C, "vrcp", X86rcp14, SchedWriteFRcp>;
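+// The "14" moved from the outer defm names into the per-size suffixes, so
+// the existing instructions keep their names (VRCP14PSZ128, VRSQRT14PDZ,
+// etc.) while the new FP16 forms become VRCPPHZ/VRSQRTPHZ and friends.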
/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
@@ -8784,20 +9421,29 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeSAE, X86FoldableSchedWrite sched> {
defm SSZ : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, OpNodeSAE,
- sched>, EVEX_CD8<32, CD8VT1>, VEX_LIG;
+ sched>, EVEX_CD8<32, CD8VT1>, VEX_LIG, T8PD, EVEX_4V;
defm SDZ : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, OpNodeSAE,
- sched>, EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
+ sched>, EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W, T8PD, EVEX_4V;
+}
+
+multiclass avx512_vgetexpsh<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeSAE, X86FoldableSchedWrite sched> {
+ let Predicates = [HasFP16] in
+ defm SHZ : avx512_fp28_s<opc, OpcodeStr#"sh", f16x_info, OpNode, OpNodeSAE, sched>,
+ EVEX_CD8<16, CD8VT1>, T_MAP6PD, EVEX_4V;
}
let Predicates = [HasERI] in {
defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, X86rcp28SAEs,
- SchedWriteFRcp.Scl>, T8PD, EVEX_4V;
+ SchedWriteFRcp.Scl>;
defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s, X86rsqrt28SAEs,
- SchedWriteFRsqrt.Scl>, T8PD, EVEX_4V;
+ SchedWriteFRsqrt.Scl>;
}
defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs,
- SchedWriteFRnd.Scl>, T8PD, EVEX_4V;
+ SchedWriteFRnd.Scl>,
+ avx512_vgetexpsh<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs,
+ SchedWriteFRnd.Scl>;
/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
@@ -8861,6 +9507,19 @@ multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr,
}
}
+multiclass avx512_vgetexp_fp16<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeSAE, X86SchedWriteWidths sched> {
+ let Predicates = [HasFP16] in
+ defm PHZ : avx512_fp28_p<opc, OpcodeStr#"ph", v32f16_info, OpNode, sched.ZMM>,
+ avx512_fp28_p_sae<opc, OpcodeStr#"ph", v32f16_info, OpNodeSAE, sched.ZMM>,
+ T_MAP6PD, EVEX_V512, EVEX_CD8<16, CD8VF>;
+ let Predicates = [HasFP16, HasVLX] in {
+ defm PHZ128 : avx512_fp28_p<opc, OpcodeStr#"ph", v8f16x_info, OpNode, sched.XMM>,
+ EVEX_V128, T_MAP6PD, EVEX_CD8<16, CD8VF>;
+ defm PHZ256 : avx512_fp28_p<opc, OpcodeStr#"ph", v16f16x_info, OpNode, sched.YMM>,
+ EVEX_V256, T_MAP6PD, EVEX_CD8<16, CD8VF>;
+ }
+}
let Predicates = [HasERI] in {
defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, X86rsqrt28SAE,
SchedWriteFRsqrt>, EVEX;
@@ -8871,6 +9530,8 @@ let Predicates = [HasERI] in {
}
defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexp, X86fgetexpSAE,
SchedWriteFRnd>,
+ avx512_vgetexp_fp16<0x42, "vgetexp", X86fgetexp, X86fgetexpSAE,
+ SchedWriteFRnd>,
avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexp,
SchedWriteFRnd>, EVEX;
@@ -8908,6 +9569,18 @@ multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
let Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
X86SchedWriteSizes sched> {
+ let Predicates = [HasFP16] in
+ defm PHZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ph"),
+ sched.PH.ZMM, v32f16_info>,
+ EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>;
+ let Predicates = [HasFP16, HasVLX] in {
+ defm PHZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ph"),
+ sched.PH.XMM, v8f16x_info>,
+ EVEX_V128, T_MAP5PS, EVEX_CD8<16, CD8VF>;
+ defm PHZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ph"),
+ sched.PH.YMM, v16f16x_info>,
+ EVEX_V256, T_MAP5PS, EVEX_CD8<16, CD8VF>;
+ }
defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
sched.PS.ZMM, v16f32_info>,
EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
@@ -8934,6 +9607,10 @@ multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
let Uses = [MXCSR] in
multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
X86SchedWriteSizes sched> {
+ let Predicates = [HasFP16] in
+ defm PHZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ph"),
+ sched.PH.ZMM, v32f16_info>,
+ EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>;
defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"),
sched.PS.ZMM, v16f32_info>,
EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
@@ -8943,8 +9620,8 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
}
multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched,
- X86VectorVTInfo _, string Name> {
- let ExeDomain = _.ExeDomain in {
+ X86VectorVTInfo _, string Name, Predicate prd = HasAVX512> {
+ let ExeDomain = _.ExeDomain, Predicates = [prd] in {
defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -8966,7 +9643,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri
(i32 timm:$rc))>,
EVEX_B, EVEX_RC, Sched<[sched]>;
- let isCodeGenOnly = 1, hasSideEffects = 0, Predicates=[HasAVX512] in {
+ let isCodeGenOnly = 1, hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
@@ -8979,13 +9656,13 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri
}
}
- let Predicates = [HasAVX512] in {
+ let Predicates = [prd] in {
def : Pat<(_.EltVT (any_fsqrt _.FRC:$src)),
(!cast<Instruction>(Name#Zr)
(_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
}
- let Predicates = [HasAVX512, OptForSize] in {
+ let Predicates = [prd, OptForSize] in {
def : Pat<(_.EltVT (any_fsqrt (load addr:$src))),
(!cast<Instruction>(Name#Zm)
(_.EltVT (IMPLICIT_DEF)), addr:$src)>;
@@ -8994,6 +9671,8 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri
multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr,
X86SchedWriteSizes sched> {
+ defm SHZ : avx512_sqrt_scalar<opc, OpcodeStr#"sh", sched.PH.Scl, f16x_info, NAME#"SH", HasFP16>,
+ EVEX_CD8<16, CD8VT1>, EVEX_4V, T_MAP5XS;
defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", sched.PS.Scl, f32x_info, NAME#"SS">,
EVEX_CD8<32, CD8VT1>, EVEX_4V, XS;
defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", sched.PD.Scl, f64x_info, NAME#"SD">,
@@ -9058,6 +9737,12 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
}
}
+let Predicates = [HasFP16] in
+defm VRNDSCALESHZ : avx512_rndscale_scalar<0x0A, "vrndscalesh",
+ SchedWriteFRnd.Scl, f16x_info>,
+ AVX512PSIi8Base, TA, EVEX_4V,
+ EVEX_CD8<16, CD8VT1>;
+
defm VRNDSCALESSZ : avx512_rndscale_scalar<0x0A, "vrndscaless",
SchedWriteFRnd.Scl, f32x_info>,
AVX512AIi8Base, EVEX_4V, VEX_LIG,
@@ -9086,6 +9771,9 @@ multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move,
}
}
+defm : avx512_masked_scalar<fsqrt, "SQRTSHZ", X86Movsh,
+ (v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v8f16x_info,
+ fp16imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasFP16>;
defm : avx512_masked_scalar<fsqrt, "SQRTSSZ", X86Movss,
(v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v4f32x_info,
fp32imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
@@ -9154,7 +9842,6 @@ multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo,
- X86VectorVTInfo DestInfo,
PatFrag truncFrag, PatFrag mtruncFrag,
string Name> {
@@ -9184,23 +9871,22 @@ multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode128,
let Predicates = [HasVLX, prd] in {
defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode128, MaskNode128, sched,
VTSrcInfo.info128, DestInfoZ128, x86memopZ128>,
- avx512_trunc_mr_lowering<VTSrcInfo.info128, DestInfoZ128,
- truncFrag, mtruncFrag, NAME>, EVEX_V128;
+ avx512_trunc_mr_lowering<VTSrcInfo.info128, truncFrag,
+ mtruncFrag, NAME>, EVEX_V128;
defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode256, MaskNode256, sched,
VTSrcInfo.info256, DestInfoZ256, x86memopZ256>,
- avx512_trunc_mr_lowering<VTSrcInfo.info256, DestInfoZ256,
- truncFrag, mtruncFrag, NAME>, EVEX_V256;
+ avx512_trunc_mr_lowering<VTSrcInfo.info256, truncFrag,
+ mtruncFrag, NAME>, EVEX_V256;
}
let Predicates = [prd] in
defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode512, MaskNode512, sched,
VTSrcInfo.info512, DestInfoZ, x86memopZ>,
- avx512_trunc_mr_lowering<VTSrcInfo.info512, DestInfoZ,
- truncFrag, mtruncFrag, NAME>, EVEX_V512;
+ avx512_trunc_mr_lowering<VTSrcInfo.info512, truncFrag,
+ mtruncFrag, NAME>, EVEX_V512;
}
-multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDPatternOperator MaskNode,
+multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, PatFrag StoreNode,
PatFrag MaskedStoreNode, SDNode InVecNode,
SDPatternOperator InVecMaskNode> {
@@ -9271,17 +9957,16 @@ multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode,
MaskedStoreNode, HasBWI>, EVEX_CD8<16, CD8VH>;
}
-defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", trunc, select_trunc,
+defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb",
WriteShuffle256, truncstorevi8,
masked_truncstorevi8, X86vtrunc, X86vmtrunc>;
-defm VPMOVSQB : avx512_trunc_qb<0x22, "vpmovsqb", X86vtruncs, select_truncs,
+defm VPMOVSQB : avx512_trunc_qb<0x22, "vpmovsqb",
WriteShuffle256, truncstore_s_vi8,
masked_truncstore_s_vi8, X86vtruncs,
X86vmtruncs>;
-defm VPMOVUSQB : avx512_trunc_qb<0x12, "vpmovusqb", X86vtruncus,
- select_truncus, WriteShuffle256,
- truncstore_us_vi8, masked_truncstore_us_vi8,
- X86vtruncus, X86vmtruncus>;
+defm VPMOVUSQB : avx512_trunc_qb<0x12, "vpmovusqb",
+ WriteShuffle256, truncstore_us_vi8,
+ masked_truncstore_us_vi8, X86vtruncus, X86vmtruncus>;
defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", trunc, select_trunc,
WriteShuffle256, truncstorevi16,
@@ -9454,8 +10139,9 @@ multiclass WriteShuffle256_BD<bits<8> opc, string OpcodeStr,
}
multiclass WriteShuffle256_BQ<bits<8> opc, string OpcodeStr,
- SDNode OpNode, SDNode InVecNode, string ExtTy,
- X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+ SDNode InVecNode, string ExtTy,
+ X86FoldableSchedWrite sched,
+ PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info,
v16i8x_info, i16mem, LdFrag, InVecNode>,
@@ -9532,14 +10218,14 @@ multiclass WriteShuffle256_DQ<bits<8> opc, string OpcodeStr,
defm VPMOVZXBW : WriteShuffle256_BW<0x30, "vpmovzxbw", zext, zext_invec, "z", WriteShuffle256>;
defm VPMOVZXBD : WriteShuffle256_BD<0x31, "vpmovzxbd", zext, zext_invec, "z", WriteShuffle256>;
-defm VPMOVZXBQ : WriteShuffle256_BQ<0x32, "vpmovzxbq", zext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXBQ : WriteShuffle256_BQ<0x32, "vpmovzxbq", zext_invec, "z", WriteShuffle256>;
defm VPMOVZXWD : WriteShuffle256_WD<0x33, "vpmovzxwd", zext, zext_invec, "z", WriteShuffle256>;
defm VPMOVZXWQ : WriteShuffle256_WQ<0x34, "vpmovzxwq", zext, zext_invec, "z", WriteShuffle256>;
defm VPMOVZXDQ : WriteShuffle256_DQ<0x35, "vpmovzxdq", zext, zext_invec, "z", WriteShuffle256>;
defm VPMOVSXBW: WriteShuffle256_BW<0x20, "vpmovsxbw", sext, sext_invec, "s", WriteShuffle256>;
defm VPMOVSXBD: WriteShuffle256_BD<0x21, "vpmovsxbd", sext, sext_invec, "s", WriteShuffle256>;
-defm VPMOVSXBQ: WriteShuffle256_BQ<0x22, "vpmovsxbq", sext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXBQ: WriteShuffle256_BQ<0x22, "vpmovsxbq", sext_invec, "s", WriteShuffle256>;
defm VPMOVSXWD: WriteShuffle256_WD<0x23, "vpmovsxwd", sext, sext_invec, "s", WriteShuffle256>;
defm VPMOVSXWQ: WriteShuffle256_WQ<0x24, "vpmovsxwq", sext, sext_invec, "s", WriteShuffle256>;
defm VPMOVSXDQ: WriteShuffle256_DQ<0x25, "vpmovsxdq", sext, sext_invec, "s", WriteShuffle256>;
@@ -10304,24 +10990,26 @@ multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr,
bits<8> opcPs, bits<8> opcPd, SDPatternOperator OpNode,
SDPatternOperator MaskOpNode, SDNode OpNodeSAE,
X86SchedWriteWidths sched, Predicate prd>{
+ defm PH : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f16_info,
+ opcPs, OpNode, MaskOpNode, OpNodeSAE, sched, HasFP16>,
+ AVX512PSIi8Base, TA, EVEX, EVEX_CD8<16, CD8VF>;
defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info,
opcPs, OpNode, MaskOpNode, OpNodeSAE, sched, prd>,
- EVEX_CD8<32, CD8VF>;
+ AVX512AIi8Base, EVEX, EVEX_CD8<32, CD8VF>;
defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info,
opcPd, OpNode, MaskOpNode, OpNodeSAE, sched, prd>,
- EVEX_CD8<64, CD8VF>, VEX_W;
+ AVX512AIi8Base, EVEX, EVEX_CD8<64, CD8VF>, VEX_W;
}
defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56,
X86VReduce, X86VReduce, X86VReduceSAE,
- SchedWriteFRnd, HasDQI>, AVX512AIi8Base, EVEX;
+ SchedWriteFRnd, HasDQI>;
defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09,
X86any_VRndScale, X86VRndScale, X86VRndScaleSAE,
- SchedWriteFRnd, HasAVX512>,
- AVX512AIi8Base, EVEX;
+ SchedWriteFRnd, HasAVX512>;
defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26,
X86VGetMant, X86VGetMant, X86VGetMantSAE,
- SchedWriteFRnd, HasAVX512>, AVX512AIi8Base, EVEX;
+ SchedWriteFRnd, HasAVX512>;
defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info,
0x50, X86VRange, X86VRangeSAE,
@@ -10345,6 +11033,9 @@ defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info,
defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info,
0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+defm VREDUCESH: avx512_common_fp_sae_scalar_imm<"vreducesh", f16x_info,
+ 0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasFP16>,
+ AVX512PSIi8Base, TA, VEX_LIG, EVEX_4V, EVEX_CD8<16, CD8VT1>;
defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info,
0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>,
@@ -10352,6 +11043,9 @@ defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info,
defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info,
0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+defm VGETMANTSH: avx512_common_fp_sae_scalar_imm<"vgetmantsh", f16x_info,
+ 0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasFP16>,
+ AVX512PSIi8Base, TA, VEX_LIG, EVEX_4V, EVEX_CD8<16, CD8VT1>;
multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched,
@@ -10770,7 +11464,7 @@ multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr,
}
}
-multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr,
X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo> {
defm Z : avx512_unary_rm<opc, OpcodeStr, X86Movddup, sched.ZMM,
VTInfo.info512>, EVEX_V512;
@@ -10783,13 +11477,13 @@ multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
}
-multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_movddup<bits<8> opc, string OpcodeStr,
X86SchedWriteWidths sched> {
- defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode, sched,
+ defm NAME: avx512_movddup_common<opc, OpcodeStr, sched,
avx512vl_f64_info>, XD, VEX_W;
}
-defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup, SchedWriteFShuffle>;
+defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", SchedWriteFShuffle>;
let Predicates = [HasVLX] in {
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
@@ -10956,16 +11650,15 @@ defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W;
// VSHUFPS - VSHUFPD Operations
//===----------------------------------------------------------------------===//
-multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I,
- AVX512VLVectorVTInfo VTInfo_FP>{
+multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_FP>{
defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp,
SchedWriteFShuffle>,
EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>,
AVX512AIi8Base, EVEX_4V;
}
-defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_i32_info, avx512vl_f32_info>, PS;
-defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, VEX_W;
+defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_f32_info>, PS;
+defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_f64_info>, PD, VEX_W;
//===----------------------------------------------------------------------===//
// AVX-512 - Byte shift Left/Right
@@ -11598,6 +12291,11 @@ defm : AVX512_scalar_math_fp_patterns<any_fsub, fsub, "SUBSD", X86Movsd, v2f64x_
defm : AVX512_scalar_math_fp_patterns<any_fmul, fmul, "MULSD", X86Movsd, v2f64x_info, fp64imm0>;
defm : AVX512_scalar_math_fp_patterns<any_fdiv, fdiv, "DIVSD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fadd, fadd, "ADDSH", X86Movsh, v8f16x_info, fp16imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fsub, fsub, "SUBSH", X86Movsh, v8f16x_info, fp16imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fmul, fmul, "MULSH", X86Movsh, v8f16x_info, fp16imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fdiv, fdiv, "DIVSH", X86Movsh, v8f16x_info, fp16imm0>;
+
multiclass AVX512_scalar_unary_math_patterns<SDPatternOperator OpNode, string OpcPrefix,
SDNode Move, X86VectorVTInfo _> {
let Predicates = [HasAVX512] in {
@@ -11609,6 +12307,7 @@ multiclass AVX512_scalar_unary_math_patterns<SDPatternOperator OpNode, string Op
defm : AVX512_scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32x_info>;
defm : AVX512_scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64x_info>;
+defm : AVX512_scalar_unary_math_patterns<any_fsqrt, "SQRTSH", X86Movsh, v8f16x_info>;
//===----------------------------------------------------------------------===//
// AES instructions
@@ -11671,13 +12370,13 @@ multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode,
(ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3))>,
- AVX512FMA3Base, Sched<[sched]>;
+ T8PD, EVEX_4V, Sched<[sched]>;
defm m: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (VTI.LdFrag addr:$src3))))>,
- AVX512FMA3Base,
+ T8PD, EVEX_4V,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -11693,7 +12392,7 @@ multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
"$src2, ${src3}"#VTI.BroadcastStr,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
- AVX512FMA3Base, EVEX_B,
+ T8PD, EVEX_4V, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -12200,3 +12899,732 @@ let ExeDomain = SSEPackedSingle in
defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, SchedWriteFMA,
avx512vl_f32_info, avx512vl_i32_info,
HasBF16>, T8XS, EVEX_CD8<32, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX512FP16
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasFP16] in {
+// Move word (r/m16) to packed word
+def VMOVW2SHrr : AVX512<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
+ "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, Sched<[WriteVecMoveFromGpr]>;
+def VMOVWrm : AVX512<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i16mem:$src),
+ "vmovw\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v8i16 (scalar_to_vector (loadi16 addr:$src))))]>,
+ T_MAP5PD, EVEX, EVEX_CD8<16, CD8VT1>, Sched<[WriteFLoad]>;
+
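+// Lower i16-to-f16 bitcasts and GR16 scalar_to_vector through GR32 via
+// vmovw (the value is widened with INSERT_SUBREG into a 32-bit register).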
+def : Pat<(f16 (bitconvert GR16:$src)),
+ (f16 (COPY_TO_REGCLASS
+ (VMOVW2SHrr
+ (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit)),
+ FR16X))>;
+def : Pat<(v8i16 (scalar_to_vector (i16 GR16:$src))),
+ (VMOVW2SHrr (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit))>;
+def : Pat<(v4i32 (X86vzmovl (scalar_to_vector (and GR32:$src, 0xffff)))),
+ (VMOVW2SHrr GR32:$src)>;
+// FIXME: We should really find a way to improve these patterns.
+def : Pat<(v8i32 (X86vzmovl
+ (insert_subvector undef,
+ (v4i32 (scalar_to_vector
+ (and GR32:$src, 0xffff))),
+ (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVW2SHrr GR32:$src), sub_xmm)>;
+def : Pat<(v16i32 (X86vzmovl
+ (insert_subvector undef,
+ (v4i32 (scalar_to_vector
+ (and GR32:$src, 0xffff))),
+ (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVW2SHrr GR32:$src), sub_xmm)>;
+
+def : Pat<(v8i16 (X86vzmovl (v8i16 (scalar_to_vector (i16 (trunc GR32:$src)))))),
+ (VMOVW2SHrr GR32:$src)>;
+
+// The AVX512 128-bit vmovw instruction writes zeros in the upper part of the
+// destination register.
+def : Pat<(v8i16 (X86vzload16 addr:$src)),
+ (VMOVWrm addr:$src)>;
+def : Pat<(v16i16 (X86vzload16 addr:$src)),
+ (SUBREG_TO_REG (i32 0), (v8i16 (VMOVWrm addr:$src)), sub_xmm)>;
+
+// Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
+def : Pat<(v32i16 (X86vzload16 addr:$src)),
+ (SUBREG_TO_REG (i32 0), (v8i16 (VMOVWrm addr:$src)), sub_xmm)>;
+
+def : Pat<(v4i32 (scalar_to_vector (i32 (extloadi16 addr:$src)))),
+ (VMOVWrm addr:$src)>;
+def : Pat<(v4i32 (X86vzmovl (scalar_to_vector (i32 (zextloadi16 addr:$src))))),
+ (VMOVWrm addr:$src)>;
+def : Pat<(v8i32 (X86vzmovl
+ (insert_subvector undef,
+ (v4i32 (scalar_to_vector
+ (i32 (zextloadi16 addr:$src)))),
+ (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVWrm addr:$src), sub_xmm)>;
+def : Pat<(v16i32 (X86vzmovl
+ (insert_subvector undef,
+ (v4i32 (scalar_to_vector
+ (i32 (zextloadi16 addr:$src)))),
+ (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVWrm addr:$src), sub_xmm)>;
+
+// Move word from xmm register to r/m16
+def VMOVSH2Wrr : AVX512<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
+ "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, Sched<[WriteVecMoveToGpr]>;
+def VMOVWmr : AVX512<0x7E, MRMDestMem, (outs),
+ (ins i16mem:$dst, VR128X:$src),
+ "vmovw\t{$src, $dst|$dst, $src}",
+ [(store (i16 (extractelt (v8i16 VR128X:$src),
+ (iPTR 0))), addr:$dst)]>,
+ T_MAP5PD, EVEX, EVEX_CD8<16, CD8VT1>, Sched<[WriteFStore]>;
+
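+// Lower f16-to-i16 bitcasts and low-element extracts by moving through a
+// GR32 with vmovw and taking the low 16 bits via sub_16bit.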
+def : Pat<(i16 (bitconvert FR16X:$src)),
+ (i16 (EXTRACT_SUBREG
+ (VMOVSH2Wrr (COPY_TO_REGCLASS FR16X:$src, VR128X)),
+ sub_16bit))>;
+def : Pat<(i16 (extractelt (v8i16 VR128X:$src), (iPTR 0))),
+ (i16 (EXTRACT_SUBREG (VMOVSH2Wrr VR128X:$src), sub_16bit))>;
+}
+
+// Allow "vmovw" to use GR64
+let hasSideEffects = 0 in {
+ def VMOVW64toSHrr : AVX512<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src),
+ "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>;
+ def VMOVSHtoW64rr : AVX512<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
+ "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, VEX_W, Sched<[WriteVecMoveToGpr]>;
+}
+
+// Convert 16-bit float to i16/u16
+multiclass avx512_cvtph2w<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ SDPatternOperator MaskOpNode, SDNode OpNodeRnd,
+ AVX512VLVectorVTInfo _Dst,
+ AVX512VLVectorVTInfo _Src,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasFP16] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, _Dst.info512, _Src.info512,
+ OpNode, MaskOpNode, sched.ZMM>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, _Dst.info512, _Src.info512,
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasFP16, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, _Dst.info128, _Src.info128,
+ OpNode, MaskOpNode, sched.XMM>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, _Dst.info256, _Src.info256,
+ OpNode, MaskOpNode, sched.YMM>, EVEX_V256;
+ }
+}
+
+// Convert 16-bit float to i16/u16 with truncation
+multiclass avx512_cvttph2w<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ SDPatternOperator MaskOpNode, SDNode OpNodeRnd,
+ AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasFP16] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, _Dst.info512, _Src.info512,
+ OpNode, MaskOpNode, sched.ZMM>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, _Dst.info512, _Src.info512,
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasFP16, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, _Dst.info128, _Src.info128,
+ OpNode, MaskOpNode, sched.XMM>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, _Dst.info256, _Src.info256,
+ OpNode, MaskOpNode, sched.YMM>, EVEX_V256;
+ }
+}
+
+defm VCVTPH2UW : avx512_cvtph2w<0x7D, "vcvtph2uw", X86cvtp2UInt, X86cvtp2UInt,
+ X86cvtp2UIntRnd, avx512vl_i16_info,
+ avx512vl_f16_info, SchedWriteCvtPD2DQ>,
+ T_MAP5PS, EVEX_CD8<16, CD8VF>;
+defm VCVTUW2PH : avx512_cvtph2w<0x7D, "vcvtuw2ph", any_uint_to_fp, uint_to_fp,
+ X86VUintToFpRnd, avx512vl_f16_info,
+ avx512vl_i16_info, SchedWriteCvtPD2DQ>,
+ T_MAP5XD, EVEX_CD8<16, CD8VF>;
+defm VCVTTPH2W : avx512_cvttph2w<0x7C, "vcvttph2w", X86any_cvttp2si,
+ X86cvttp2si, X86cvttp2siSAE,
+ avx512vl_i16_info, avx512vl_f16_info,
+ SchedWriteCvtPD2DQ>, T_MAP5PD, EVEX_CD8<16, CD8VF>;
+defm VCVTTPH2UW : avx512_cvttph2w<0x7C, "vcvttph2uw", X86any_cvttp2ui,
+ X86cvttp2ui, X86cvttp2uiSAE,
+ avx512vl_i16_info, avx512vl_f16_info,
+ SchedWriteCvtPD2DQ>, T_MAP5PS, EVEX_CD8<16, CD8VF>;
+defm VCVTPH2W : avx512_cvtph2w<0x7D, "vcvtph2w", X86cvtp2Int, X86cvtp2Int,
+ X86cvtp2IntRnd, avx512vl_i16_info,
+ avx512vl_f16_info, SchedWriteCvtPD2DQ>,
+ T_MAP5PD, EVEX_CD8<16, CD8VF>;
+defm VCVTW2PH : avx512_cvtph2w<0x7D, "vcvtw2ph", any_sint_to_fp, sint_to_fp,
+ X86VSintToFpRnd, avx512vl_f16_info,
+ avx512vl_i16_info, SchedWriteCvtPD2DQ>,
+ T_MAP5XS, EVEX_CD8<16, CD8VF>;
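+// vcvtph2w/vcvtph2uw use the rounding conversion nodes (X86cvtp2Int,
+// X86cvtp2UInt) while vcvttph2w/vcvttph2uw use the truncating ones;
+// vcvtw2ph/vcvtuw2ph reuse the same multiclass for the int-to-half direction.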
+
+// Convert Half to Signed/Unsigned Doubleword
+multiclass avx512_cvtph2dq<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ SDPatternOperator MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasFP16] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f16x_info, OpNode,
+ MaskOpNode, sched.ZMM>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f16x_info,
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasFP16, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v8f16x_info, OpNode,
+ MaskOpNode, sched.XMM, "{1to4}", "", f64mem>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f16x_info, OpNode,
+ MaskOpNode, sched.YMM>, EVEX_V256;
+ }
+}
+
+// Convert Half to Signed/Unsigned Doubleword with truncation
+multiclass avx512_cvttph2dq<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ SDPatternOperator MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasFP16] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f16x_info, OpNode,
+ MaskOpNode, sched.ZMM>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f16x_info,
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasFP16, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v8f16x_info, OpNode,
+ MaskOpNode, sched.XMM, "{1to4}", "", f64mem>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f16x_info, OpNode,
+ MaskOpNode, sched.YMM>, EVEX_V256;
+ }
+}
+
+
+defm VCVTPH2DQ : avx512_cvtph2dq<0x5B, "vcvtph2dq", X86cvtp2Int, X86cvtp2Int,
+ X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, T_MAP5PD,
+ EVEX_CD8<16, CD8VH>;
+defm VCVTPH2UDQ : avx512_cvtph2dq<0x79, "vcvtph2udq", X86cvtp2UInt, X86cvtp2UInt,
+ X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, T_MAP5PS,
+ EVEX_CD8<16, CD8VH>;
+
+defm VCVTTPH2DQ : avx512_cvttph2dq<0x5B, "vcvttph2dq", X86any_cvttp2si,
+ X86cvttp2si, X86cvttp2siSAE,
+ SchedWriteCvtPS2DQ>, T_MAP5XS,
+ EVEX_CD8<16, CD8VH>;
+
+defm VCVTTPH2UDQ : avx512_cvttph2dq<0x78, "vcvttph2udq", X86any_cvttp2ui,
+ X86cvttp2ui, X86cvttp2uiSAE,
+ SchedWriteCvtPS2DQ>, T_MAP5PS,
+ EVEX_CD8<16, CD8VH>;
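+// The 128-bit ph2dq forms read only the low four half elements of the
+// source, hence the f64mem operand and the explicit "{1to4}" broadcast
+// string above.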
+
+// Convert Half to Signed/Unsigned Quadword
+multiclass avx512_cvtph2qq<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ SDPatternOperator MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasFP16] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f16x_info, OpNode,
+ MaskOpNode, sched.ZMM>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f16x_info,
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasFP16, HasVLX] in {
+ // Explicitly specified broadcast string, since we take only 2 elements
+ // from v8f16x_info source
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v8f16x_info, OpNode,
+ MaskOpNode, sched.XMM, "{1to2}", "", f32mem>,
+ EVEX_V128;
+ // Explicitly specified broadcast string, since we take only 4 elements
+ // from v8f16x_info source
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v8f16x_info, OpNode,
+ MaskOpNode, sched.YMM, "{1to4}", "", f64mem>,
+ EVEX_V256;
+ }
+}
+
+// Convert Half to Signed/Unsigned Quadword with truncation
+multiclass avx512_cvttph2qq<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ SDPatternOperator MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
+ let Predicates = [HasFP16] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f16x_info, OpNode,
+ MaskOpNode, sched.ZMM>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f16x_info,
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasFP16, HasVLX] in {
+ // Explicitly specified broadcast string, since we take only 2 elements
+ // from v8f16x_info source
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v8f16x_info, OpNode,
+ MaskOpNode, sched.XMM, "{1to2}", "", f32mem>, EVEX_V128;
+ // Explicitly specified broadcast string, since we take only 4 elements
+ // from v8f16x_info source
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v8f16x_info, OpNode,
+ MaskOpNode, sched.YMM, "{1to4}", "", f64mem>, EVEX_V256;
+ }
+}
+
+defm VCVTPH2QQ : avx512_cvtph2qq<0x7B, "vcvtph2qq", X86cvtp2Int, X86cvtp2Int,
+ X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, T_MAP5PD,
+ EVEX_CD8<16, CD8VQ>;
+
+defm VCVTPH2UQQ : avx512_cvtph2qq<0x79, "vcvtph2uqq", X86cvtp2UInt, X86cvtp2UInt,
+ X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, T_MAP5PD,
+ EVEX_CD8<16, CD8VQ>;
+
+defm VCVTTPH2QQ : avx512_cvttph2qq<0x7A, "vcvttph2qq", X86any_cvttp2si,
+ X86cvttp2si, X86cvttp2siSAE,
+ SchedWriteCvtPS2DQ>, T_MAP5PD,
+ EVEX_CD8<16, CD8VQ>;
+
+defm VCVTTPH2UQQ : avx512_cvttph2qq<0x78, "vcvttph2uqq", X86any_cvttp2ui,
+ X86cvttp2ui, X86cvttp2uiSAE,
+ SchedWriteCvtPS2DQ>, T_MAP5PD,
+ EVEX_CD8<16, CD8VQ>;
+
+// Convert Signed/Unsigned Quadword to Half
+multiclass avx512_cvtqq2ph<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ SDPatternOperator MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
+ // we need "x"/"y"/"z" suffixes in order to distinguish between 128, 256 and
+ // 512 memory forms of these instructions in Asm Parser. They have the same
+ // dest type - 'v8f16x_info'. We also specify the broadcast string explicitly
+ // for the same reason.
+ let Predicates = [HasFP16] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f16x_info, v8i64_info, OpNode,
+ MaskOpNode, sched.ZMM, "{1to8}", "{z}">,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8f16x_info, v8i64_info,
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasFP16, HasVLX] in {
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v8f16x_info, v2i64x_info,
+ null_frag, null_frag, sched.XMM, "{1to2}", "{x}",
+ i128mem, VK2WM>,
+ EVEX_V128, NotEVEX2VEXConvertible;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f16x_info, v4i64x_info,
+ null_frag, null_frag, sched.YMM, "{1to4}", "{y}",
+ i256mem, VK4WM>,
+ EVEX_V256, NotEVEX2VEXConvertible;
+ }
+
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
+ VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
+ VK2WM:$mask, VR128X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst,
+ i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
+ VK2WM:$mask, i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to2}}",
+ (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
+ VK2WM:$mask, i64mem:$src), 0, "att">;
+
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
+ VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}}|"
+ "$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
+ VK4WM:$mask, VR256X:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst,
+ i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
+ VK4WM:$mask, i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to4}}",
+ (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
+ VK4WM:$mask, i64mem:$src), 0, "att">;
+
+ def : InstAlias<OpcodeStr#"z\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Zrr") VR128X:$dst,
+ VR512:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"z\t{$src, $dst {${mask}}|"
+ "$dst {${mask}}, $src}",
+ (!cast<Instruction>(NAME # "Zrrk") VR128X:$dst,
+ VK8WM:$mask, VR512:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"z\t{$src, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, $src}",
+ (!cast<Instruction>(NAME # "Zrrkz") VR128X:$dst,
+ VK8WM:$mask, VR512:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"z\t{${src}{1to8}, $dst|$dst, ${src}{1to8}}",
+ (!cast<Instruction>(NAME # "Zrmb") VR128X:$dst,
+ i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"z\t{${src}{1to8}, $dst {${mask}}|"
+ "$dst {${mask}}, ${src}{1to8}}",
+ (!cast<Instruction>(NAME # "Zrmbk") VR128X:$dst,
+ VK8WM:$mask, i64mem:$src), 0, "att">;
+ def : InstAlias<OpcodeStr#"z\t{${src}{1to8}, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, ${src}{1to8}}",
+ (!cast<Instruction>(NAME # "Zrmbkz") VR128X:$dst,
+ VK8WM:$mask, i64mem:$src), 0, "att">;
+}
+
+defm VCVTQQ2PH : avx512_cvtqq2ph<0x5B, "vcvtqq2ph", any_sint_to_fp, sint_to_fp,
+ X86VSintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, T_MAP5PS,
+ EVEX_CD8<64, CD8VF>;
+
+defm VCVTUQQ2PH : avx512_cvtqq2ph<0x7A, "vcvtuqq2ph", any_uint_to_fp, uint_to_fp,
+ X86VUintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, T_MAP5XD,
+ EVEX_CD8<64, CD8VF>;
+
+// Convert half to signed/unsigned int 32/64
+defm VCVTSH2SIZ: avx512_cvt_s_int_round<0x2D, f16x_info, i32x_info, X86cvts2si,
+ X86cvts2siRnd, WriteCvtSS2I, "cvtsh2si", "{l}", HasFP16>,
+ T_MAP5XS, EVEX_CD8<16, CD8VT1>;
+defm VCVTSH2SI64Z: avx512_cvt_s_int_round<0x2D, f16x_info, i64x_info, X86cvts2si,
+ X86cvts2siRnd, WriteCvtSS2I, "cvtsh2si", "{q}", HasFP16>,
+ T_MAP5XS, VEX_W, EVEX_CD8<16, CD8VT1>;
+defm VCVTSH2USIZ: avx512_cvt_s_int_round<0x79, f16x_info, i32x_info, X86cvts2usi,
+ X86cvts2usiRnd, WriteCvtSS2I, "cvtsh2usi", "{l}", HasFP16>,
+ T_MAP5XS, EVEX_CD8<16, CD8VT1>;
+defm VCVTSH2USI64Z: avx512_cvt_s_int_round<0x79, f16x_info, i64x_info, X86cvts2usi,
+ X86cvts2usiRnd, WriteCvtSS2I, "cvtsh2usi", "{q}", HasFP16>,
+ T_MAP5XS, VEX_W, EVEX_CD8<16, CD8VT1>;
+
+defm VCVTTSH2SIZ: avx512_cvt_s_all<0x2C, "vcvttsh2si", f16x_info, i32x_info,
+ any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I,
+ "{l}", HasFP16>, T_MAP5XS, EVEX_CD8<16, CD8VT1>;
+defm VCVTTSH2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsh2si", f16x_info, i64x_info,
+ any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I,
+ "{q}", HasFP16>, VEX_W, T_MAP5XS, EVEX_CD8<16, CD8VT1>;
+defm VCVTTSH2USIZ: avx512_cvt_s_all<0x78, "vcvttsh2usi", f16x_info, i32x_info,
+ any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I,
+ "{l}", HasFP16>, T_MAP5XS, EVEX_CD8<16, CD8VT1>;
+defm VCVTTSH2USI64Z: avx512_cvt_s_all<0x78, "vcvttsh2usi", f16x_info, i64x_info,
+ any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I,
+ "{q}", HasFP16>, T_MAP5XS, VEX_W, EVEX_CD8<16, CD8VT1>;
+
+let Predicates = [HasFP16] in {
+ defm VCVTSI2SHZ : avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd, WriteCvtI2SS, GR32,
+ v8f16x_info, i32mem, loadi32, "cvtsi2sh", "l">,
+ T_MAP5XS, EVEX_CD8<32, CD8VT1>;
+ defm VCVTSI642SHZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd, WriteCvtI2SS, GR64,
+ v8f16x_info, i64mem, loadi64, "cvtsi2sh","q">,
+ T_MAP5XS, VEX_W, EVEX_CD8<64, CD8VT1>;
+ defm VCVTUSI2SHZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, WriteCvtI2SS, GR32,
+ v8f16x_info, i32mem, loadi32,
+ "cvtusi2sh","l">, T_MAP5XS, EVEX_CD8<32, CD8VT1>;
+ defm VCVTUSI642SHZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, WriteCvtI2SS, GR64,
+ v8f16x_info, i64mem, loadi64, "cvtusi2sh", "q">,
+ T_MAP5XS, VEX_W, EVEX_CD8<64, CD8VT1>;
+ def : InstAlias<"vcvtsi2sh\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SHZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
+
+ def : InstAlias<"vcvtusi2sh\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTUSI2SHZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
+
+ def : Pat<(f16 (any_sint_to_fp (loadi32 addr:$src))),
+ (VCVTSI2SHZrm (f16 (IMPLICIT_DEF)), addr:$src)>;
+ def : Pat<(f16 (any_sint_to_fp (loadi64 addr:$src))),
+ (VCVTSI642SHZrm (f16 (IMPLICIT_DEF)), addr:$src)>;
+
+ def : Pat<(f16 (any_sint_to_fp GR32:$src)),
+ (VCVTSI2SHZrr (f16 (IMPLICIT_DEF)), GR32:$src)>;
+ def : Pat<(f16 (any_sint_to_fp GR64:$src)),
+ (VCVTSI642SHZrr (f16 (IMPLICIT_DEF)), GR64:$src)>;
+
+ def : Pat<(f16 (any_uint_to_fp (loadi32 addr:$src))),
+ (VCVTUSI2SHZrm (f16 (IMPLICIT_DEF)), addr:$src)>;
+ def : Pat<(f16 (any_uint_to_fp (loadi64 addr:$src))),
+ (VCVTUSI642SHZrm (f16 (IMPLICIT_DEF)), addr:$src)>;
+
+ def : Pat<(f16 (any_uint_to_fp GR32:$src)),
+ (VCVTUSI2SHZrr (f16 (IMPLICIT_DEF)), GR32:$src)>;
+ def : Pat<(f16 (any_uint_to_fp GR64:$src)),
+ (VCVTUSI642SHZrr (f16 (IMPLICIT_DEF)), GR64:$src)>;
+
+ // Patterns used for matching vcvtsi2sh intrinsic sequences from clang
+ // which produce unnecessary vmovsh instructions
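+ // The X86Movsh merge is absorbed by selecting the _Int form, which takes the
+ // original vector as its first source operand and preserves its upper
+ // elements, so no separate vmovsh is emitted.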
+ def : Pat<(v8f16 (X86Movsh
+ (v8f16 VR128X:$dst),
+ (v8f16 (scalar_to_vector (f16 (any_sint_to_fp GR64:$src)))))),
+ (VCVTSI642SHZrr_Int VR128X:$dst, GR64:$src)>;
+
+ def : Pat<(v8f16 (X86Movsh
+ (v8f16 VR128X:$dst),
+ (v8f16 (scalar_to_vector (f16 (any_sint_to_fp (loadi64 addr:$src))))))),
+ (VCVTSI642SHZrm_Int VR128X:$dst, addr:$src)>;
+
+ def : Pat<(v8f16 (X86Movsh
+ (v8f16 VR128X:$dst),
+ (v8f16 (scalar_to_vector (f16 (any_sint_to_fp GR32:$src)))))),
+ (VCVTSI2SHZrr_Int VR128X:$dst, GR32:$src)>;
+
+ def : Pat<(v8f16 (X86Movsh
+ (v8f16 VR128X:$dst),
+ (v8f16 (scalar_to_vector (f16 (any_sint_to_fp (loadi32 addr:$src))))))),
+ (VCVTSI2SHZrm_Int VR128X:$dst, addr:$src)>;
+
+ def : Pat<(v8f16 (X86Movsh
+ (v8f16 VR128X:$dst),
+ (v8f16 (scalar_to_vector (f16 (any_uint_to_fp GR64:$src)))))),
+ (VCVTUSI642SHZrr_Int VR128X:$dst, GR64:$src)>;
+
+ def : Pat<(v8f16 (X86Movsh
+ (v8f16 VR128X:$dst),
+ (v8f16 (scalar_to_vector (f16 (any_uint_to_fp (loadi64 addr:$src))))))),
+ (VCVTUSI642SHZrm_Int VR128X:$dst, addr:$src)>;
+
+ def : Pat<(v8f16 (X86Movsh
+ (v8f16 VR128X:$dst),
+ (v8f16 (scalar_to_vector (f16 (any_uint_to_fp GR32:$src)))))),
+ (VCVTUSI2SHZrr_Int VR128X:$dst, GR32:$src)>;
+
+ def : Pat<(v8f16 (X86Movsh
+ (v8f16 VR128X:$dst),
+ (v8f16 (scalar_to_vector (f16 (any_uint_to_fp (loadi32 addr:$src))))))),
+ (VCVTUSI2SHZrm_Int VR128X:$dst, addr:$src)>;
+} // Predicates = [HasFP16]
+
+let Predicates = [HasFP16, HasVLX] in {
+ // Special patterns to allow use of X86VMSintToFP for masking. Instruction
+ // patterns have been disabled with null_frag.
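+ // The unmasked patterns below select the plain forms directly, while the
+ // masked patterns select the merge-masked (k) and zero-masked (kz) variants.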
+ def : Pat<(v8f16 (X86any_VSintToFP (v4i64 VR256X:$src))),
+ (VCVTQQ2PHZ256rr VR256X:$src)>;
+ def : Pat<(X86VMSintToFP (v4i64 VR256X:$src), (v8f16 VR128X:$src0),
+ VK4WM:$mask),
+ (VCVTQQ2PHZ256rrk VR128X:$src0, VK4WM:$mask, VR256X:$src)>;
+ def : Pat<(X86VMSintToFP (v4i64 VR256X:$src), v8f16x_info.ImmAllZerosV,
+ VK4WM:$mask),
+ (VCVTQQ2PHZ256rrkz VK4WM:$mask, VR256X:$src)>;
+
+ def : Pat<(v8f16 (X86any_VSintToFP (loadv4i64 addr:$src))),
+ (VCVTQQ2PHZ256rm addr:$src)>;
+ def : Pat<(X86VMSintToFP (loadv4i64 addr:$src), (v8f16 VR128X:$src0),
+ VK4WM:$mask),
+ (VCVTQQ2PHZ256rmk VR128X:$src0, VK4WM:$mask, addr:$src)>;
+ def : Pat<(X86VMSintToFP (loadv4i64 addr:$src), v8f16x_info.ImmAllZerosV,
+ VK4WM:$mask),
+ (VCVTQQ2PHZ256rmkz VK4WM:$mask, addr:$src)>;
+
+ def : Pat<(v8f16 (X86any_VSintToFP (v4i64 (X86VBroadcastld64 addr:$src)))),
+ (VCVTQQ2PHZ256rmb addr:$src)>;
+ def : Pat<(X86VMSintToFP (v4i64 (X86VBroadcastld64 addr:$src)),
+ (v8f16 VR128X:$src0), VK4WM:$mask),
+ (VCVTQQ2PHZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>;
+ def : Pat<(X86VMSintToFP (v4i64 (X86VBroadcastld64 addr:$src)),
+ v8f16x_info.ImmAllZerosV, VK4WM:$mask),
+ (VCVTQQ2PHZ256rmbkz VK4WM:$mask, addr:$src)>;
+
+ def : Pat<(v8f16 (X86any_VSintToFP (v2i64 VR128X:$src))),
+ (VCVTQQ2PHZ128rr VR128X:$src)>;
+ def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), (v8f16 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTQQ2PHZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>;
+ def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), v8f16x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTQQ2PHZ128rrkz VK2WM:$mask, VR128X:$src)>;
+
+ def : Pat<(v8f16 (X86any_VSintToFP (loadv2i64 addr:$src))),
+ (VCVTQQ2PHZ128rm addr:$src)>;
+ def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), (v8f16 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTQQ2PHZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), v8f16x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTQQ2PHZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v8f16 (X86any_VSintToFP (v2i64 (X86VBroadcastld64 addr:$src)))),
+ (VCVTQQ2PHZ128rmb addr:$src)>;
+ def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
+ (v8f16 VR128X:$src0), VK2WM:$mask),
+ (VCVTQQ2PHZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
+ v8f16x_info.ImmAllZerosV, VK2WM:$mask),
+ (VCVTQQ2PHZ128rmbkz VK2WM:$mask, addr:$src)>;
+
+ // Special patterns to allow use of X86VMUintToFP for masking. Instruction
+ // patterns have been disabled with null_frag.
+ def : Pat<(v8f16 (X86any_VUintToFP (v4i64 VR256X:$src))),
+ (VCVTUQQ2PHZ256rr VR256X:$src)>;
+ def : Pat<(X86VMUintToFP (v4i64 VR256X:$src), (v8f16 VR128X:$src0),
+ VK4WM:$mask),
+ (VCVTUQQ2PHZ256rrk VR128X:$src0, VK4WM:$mask, VR256X:$src)>;
+ def : Pat<(X86VMUintToFP (v4i64 VR256X:$src), v8f16x_info.ImmAllZerosV,
+ VK4WM:$mask),
+ (VCVTUQQ2PHZ256rrkz VK4WM:$mask, VR256X:$src)>;
+
+ def : Pat<(v8f16 (X86any_VUintToFP (loadv4i64 addr:$src))),
+ (VCVTUQQ2PHZ256rm addr:$src)>;
+ def : Pat<(X86VMUintToFP (loadv4i64 addr:$src), (v8f16 VR128X:$src0),
+ VK4WM:$mask),
+ (VCVTUQQ2PHZ256rmk VR128X:$src0, VK4WM:$mask, addr:$src)>;
+ def : Pat<(X86VMUintToFP (loadv4i64 addr:$src), v8f16x_info.ImmAllZerosV,
+ VK4WM:$mask),
+ (VCVTUQQ2PHZ256rmkz VK4WM:$mask, addr:$src)>;
+
+ def : Pat<(v8f16 (X86any_VUintToFP (v4i64 (X86VBroadcastld64 addr:$src)))),
+ (VCVTUQQ2PHZ256rmb addr:$src)>;
+ def : Pat<(X86VMUintToFP (v4i64 (X86VBroadcastld64 addr:$src)),
+ (v8f16 VR128X:$src0), VK4WM:$mask),
+ (VCVTUQQ2PHZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>;
+ def : Pat<(X86VMUintToFP (v4i64 (X86VBroadcastld64 addr:$src)),
+ v8f16x_info.ImmAllZerosV, VK4WM:$mask),
+ (VCVTUQQ2PHZ256rmbkz VK4WM:$mask, addr:$src)>;
+
+ def : Pat<(v8f16 (X86any_VUintToFP (v2i64 VR128X:$src))),
+ (VCVTUQQ2PHZ128rr VR128X:$src)>;
+ def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), (v8f16 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTUQQ2PHZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>;
+ def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), v8f16x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTUQQ2PHZ128rrkz VK2WM:$mask, VR128X:$src)>;
+
+ def : Pat<(v8f16 (X86any_VUintToFP (loadv2i64 addr:$src))),
+ (VCVTUQQ2PHZ128rm addr:$src)>;
+ def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), (v8f16 VR128X:$src0),
+ VK2WM:$mask),
+ (VCVTUQQ2PHZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), v8f16x_info.ImmAllZerosV,
+ VK2WM:$mask),
+ (VCVTUQQ2PHZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v8f16 (X86any_VUintToFP (v2i64 (X86VBroadcastld64 addr:$src)))),
+ (VCVTUQQ2PHZ128rmb addr:$src)>;
+ def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
+ (v8f16 VR128X:$src0), VK2WM:$mask),
+ (VCVTUQQ2PHZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+ def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
+ v8f16x_info.ImmAllZerosV, VK2WM:$mask),
+ (VCVTUQQ2PHZ128rmbkz VK2WM:$mask, addr:$src)>;
+}
+
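+// For the complex FMA instructions the destination (tied to the accumulator
+// $src1) must not overlap $src2 or $src3, which is modeled below with the
+// @earlyclobber constraint on $dst.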
+let Constraints = "@earlyclobber $dst, $src1 = $dst" in {
+ multiclass avx512_cfmaop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, bit IsCommutable> {
+ defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), IsCommutable>, EVEX_4V;
+
+ defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>, EVEX_4V;
+
+ defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr),
+ (_.VT (OpNode _.RC:$src2, (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1))>, EVEX_B, EVEX_4V;
+ }
+} // Constraints = "@earlyclobber $dst, $src1 = $dst"
+
+multiclass avx512_cfmaop_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _> {
+ let Constraints = "@earlyclobber $dst, $src1 = $dst" in
+ defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc)))>,
+ EVEX_4V, EVEX_B, EVEX_RC;
+}
+
+multiclass avx512_cfmaop_common<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, bit IsCommutable> {
+ let Predicates = [HasFP16] in {
+ defm Z : avx512_cfmaop_rm<opc, OpcodeStr, OpNode, v16f32_info, IsCommutable>,
+ avx512_cfmaop_round<opc, OpcodeStr, OpNodeRnd, v16f32_info>,
+ EVEX_V512, Sched<[WriteFMAZ]>;
+ }
+ let Predicates = [HasVLX, HasFP16] in {
+ defm Z256 : avx512_cfmaop_rm<opc, OpcodeStr, OpNode, v8f32x_info, IsCommutable>, EVEX_V256, Sched<[WriteFMAY]>;
+ defm Z128 : avx512_cfmaop_rm<opc, OpcodeStr, OpNode, v4f32x_info, IsCommutable>, EVEX_V128, Sched<[WriteFMAX]>;
+ }
+}
+
+multiclass avx512_cfmulop_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode, SDNode OpNodeRnd, bit IsCommutable> {
+ let Predicates = [HasFP16] in {
+ defm Z : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v16f32_info,
+ WriteFMAZ, IsCommutable, IsCommutable, "", "@earlyclobber $dst", 0>,
+ avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, WriteFMAZ, v16f32_info,
+ "", "@earlyclobber $dst">, EVEX_V512;
+ }
+ let Predicates = [HasVLX, HasFP16] in {
+ defm Z256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f32x_info,
+ WriteFMAY, IsCommutable, IsCommutable, "", "@earlyclobber $dst", 0>, EVEX_V256;
+ defm Z128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v4f32x_info,
+ WriteFMAX, IsCommutable, IsCommutable, "", "@earlyclobber $dst", 0>, EVEX_V128;
+ }
+}
+
+let Uses = [MXCSR] in {
+ defm VFMADDCPH : avx512_cfmaop_common<0x56, "vfmaddcph", x86vfmaddc, x86vfmaddcRnd, 1>,
+ T_MAP6XS, EVEX_CD8<32, CD8VF>;
+ defm VFCMADDCPH : avx512_cfmaop_common<0x56, "vfcmaddcph", x86vfcmaddc, x86vfcmaddcRnd, 0>,
+ T_MAP6XD, EVEX_CD8<32, CD8VF>;
+
+ defm VFMULCPH : avx512_cfmulop_common<0xD6, "vfmulcph", x86vfmulc, x86vfmulc,
+ x86vfmulcRnd, 1>, T_MAP6XS, EVEX_CD8<32, CD8VF>;
+ defm VFCMULCPH : avx512_cfmulop_common<0xD6, "vfcmulcph", x86vfcmulc,
+ x86vfcmulc, x86vfcmulcRnd, 0>, T_MAP6XD, EVEX_CD8<32, CD8VF>;
+}
+
+multiclass avx512_cfmaop_sh_common<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd,
+ bit IsCommutable> {
+ let Predicates = [HasFP16], Constraints = "@earlyclobber $dst, $src1 = $dst" in {
+ defm r : AVX512_maskable_3src<opc, MRMSrcReg, v4f32x_info, (outs VR128X:$dst),
+ (ins VR128X:$src2, VR128X:$src3), OpcodeStr,
+ "$src3, $src2", "$src2, $src3",
+ (v4f32 (OpNode VR128X:$src2, VR128X:$src3, VR128X:$src1)), IsCommutable>,
+ Sched<[WriteFMAX]>;
+ defm m : AVX512_maskable_3src<opc, MRMSrcMem, v4f32x_info, (outs VR128X:$dst),
+ (ins VR128X:$src2, ssmem:$src3), OpcodeStr,
+ "$src3, $src2", "$src2, $src3",
+ (v4f32 (OpNode VR128X:$src2, (sse_load_f32 addr:$src3), VR128X:$src1))>,
+ Sched<[WriteFMAX.Folded, WriteFMAX.ReadAfterFold]>;
+ defm rb : AVX512_maskable_3src<opc, MRMSrcReg, v4f32x_info, (outs VR128X:$dst),
+ (ins VR128X:$src2, VR128X:$src3, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src3, $src2", "$src2, $src3, $rc",
+ (v4f32 (OpNodeRnd VR128X:$src2, VR128X:$src3, VR128X:$src1, (i32 timm:$rc)))>,
+ EVEX_B, EVEX_RC, Sched<[WriteFMAX]>;
+ }
+}
+
+multiclass avx512_cfmbinop_sh_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, bit IsCommutable> {
+ let Predicates = [HasFP16] in {
+ defm rr : AVX512_maskable<opc, MRMSrcReg, f32x_info, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (v4f32 (OpNode VR128X:$src1, VR128X:$src2)),
+ IsCommutable, IsCommutable, IsCommutable,
+ X86selects, "@earlyclobber $dst">, Sched<[WriteFMAX]>;
+ defm rm : AVX512_maskable<opc, MRMSrcMem, f32x_info, (outs VR128X:$dst),
+ (ins VR128X:$src1, ssmem:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (v4f32 (OpNode VR128X:$src1, (sse_load_f32 addr:$src2))),
+ 0, 0, 0, X86selects, "@earlyclobber $dst">,
+ Sched<[WriteFMAX.Folded, WriteFMAX.ReadAfterFold]>;
+ defm rrb : AVX512_maskable<opc, MRMSrcReg, f32x_info, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2, AVX512RC:$rc), OpcodeStr,
+ "$rc, $src2, $src1", "$src1, $src2, $rc",
+ (OpNodeRnd (v4f32 VR128X:$src1), (v4f32 VR128X:$src2), (i32 timm:$rc)),
+ 0, 0, 0, X86selects, "@earlyclobber $dst">,
+ EVEX_B, EVEX_RC, Sched<[WriteFMAX]>;
+ }
+}
+
+let Uses = [MXCSR] in {
+ defm VFMADDCSHZ : avx512_cfmaop_sh_common<0x57, "vfmaddcsh", x86vfmaddcSh, x86vfmaddcShRnd, 1>,
+ T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX_4V;
+ defm VFCMADDCSHZ : avx512_cfmaop_sh_common<0x57, "vfcmaddcsh", x86vfcmaddcSh, x86vfcmaddcShRnd, 0>,
+ T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX_4V;
+
+ defm VFMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfmulcsh", x86vfmulcSh, x86vfmulcShRnd, 1>,
+ T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX_4V;
+ defm VFCMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfcmulcsh", x86vfcmulcSh, x86vfcmulcShRnd, 0>,
+ T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX_4V;
+}
diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td
index e83e1e74ff52..8337d2b37383 100644
--- a/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -708,6 +708,19 @@ class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
+// BinOpRM_ImplicitUse - Instructions like "adc reg, reg, [mem]".
+// There is an implicit register read at the end of the operand sequence.
+class BinOpRM_ImplicitUse<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
+ : ITy<opcode, MRMSrcMem, typeinfo, outlist,
+ (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
+ Sched<[sched.Folded, sched.ReadAfterFold,
+ // base, scale, index, offset, segment.
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ // implicit register read.
+ sched.ReadAfterFold]>;
+
// BinOpRM_F - Instructions like "cmp reg, [mem]".
class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
@@ -725,7 +738,7 @@ class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
// BinOpRM_RFF - Instructions like "adc reg, reg, [mem]".
class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
- : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteADC,
+ : BinOpRM_ImplicitUse<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteADC,
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2),
EFLAGS))]>;
@@ -805,7 +818,11 @@ class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode>
: BinOpMR<opcode, mnemonic, typeinfo,
[(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst),
- (implicit EFLAGS)]>, Sched<[WriteALURMW]>;
+ (implicit EFLAGS)]>, Sched<[WriteALURMW,
+ // base, scale, index, offset, segment
+ ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault, ReadDefault,
+ WriteALU.ReadAfterFold]>; // reg
// BinOpMR_RMW_FF - Instructions like "adc [mem], reg".
class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
@@ -813,7 +830,12 @@ class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
: BinOpMR<opcode, mnemonic, typeinfo,
[(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS),
addr:$dst),
- (implicit EFLAGS)]>, Sched<[WriteADCRMW]>;
+ (implicit EFLAGS)]>, Sched<[WriteADCRMW,
+ // base, scale, index, offset, segment
+ ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault, ReadDefault,
+ WriteALU.ReadAfterFold, // reg
+ WriteALU.ReadAfterFold]>; // EFLAGS
// BinOpMR_F - Instructions like "cmp [mem], reg".
class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
@@ -1475,13 +1497,17 @@ multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
let hasSideEffects = 0 in {
def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
- []>, T8XD, VEX_4V, Sched<[sched, WriteIMulH]>;
+ []>, T8XD, VEX_4V, Sched<[WriteIMulH, sched]>;
let mayLoad = 1 in
def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
-
- []>, T8XD, VEX_4V, Sched<[sched.Folded, WriteIMulH]>;
+ []>, T8XD, VEX_4V,
+ Sched<[WriteIMulHLd, sched.Folded,
+ // Memory operand.
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ // Implicit read of EDX/RDX
+ sched.ReadAfterFold]>;
// Pseudo instructions to be used when the low result isn't used. The
// instruction is defined to keep the high if both destinations are the same.
@@ -1496,9 +1522,9 @@ let hasSideEffects = 0 in {
let Predicates = [HasBMI2] in {
let Uses = [EDX] in
- defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem, WriteIMul32>;
+ defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem, WriteMULX32>;
let Uses = [RDX] in
- defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem, WriteIMul64>, VEX_W;
+ defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem, WriteMULX64>, VEX_W;
}
//===----------------------------------------------------------------------===//
@@ -1525,7 +1551,12 @@ let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS],
"adox{q}\t{$src2, $dst|$dst, $src2}", []>, T8XS;
} // SchedRW
- let mayLoad = 1, SchedRW = [WriteADC.Folded, WriteADC.ReadAfterFold] in {
+ let mayLoad = 1,
+ SchedRW = [WriteADC.Folded, WriteADC.ReadAfterFold,
+ // Memory operand.
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ // Implicit read of EFLAGS
+ WriteADC.ReadAfterFold] in {
def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$src1, i32mem:$src2),
"adcx{l}\t{$src2, $dst|$dst, $src2}", []>, T8PD;
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 202d320cd731..ba52283b570d 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -69,16 +69,12 @@ def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
let SchedRW = [WriteSystem] in {
// x86-64 va_start lowering magic.
-let hasSideEffects = 1, Defs = [EFLAGS] in {
+let hasSideEffects = 1, mayStore = 1, Defs = [EFLAGS] in {
def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
(outs),
- (ins GR8:$al,
- i32imm:$regsavefi, i32imm:$offset,
- variable_ops),
- "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
- [(X86vastart_save_xmm_regs GR8:$al,
- timm:$regsavefi,
- timm:$offset),
+ (ins GR8:$al, i8mem:$regsavefi, variable_ops),
+ "#VASTART_SAVE_XMM_REGS $al, $regsavefi",
+ [(X86vastart_save_xmm_regs GR8:$al, addr:$regsavefi),
(implicit EFLAGS)]>;
}
@@ -153,15 +149,15 @@ def STACKALLOC_W_PROBING : I<0, Pseudo, (outs), (ins i64imm:$stacksize),
// (compared to ordinary calls) like stack pointer change.
let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
-def WIN_ALLOCA_32 : I<0, Pseudo, (outs), (ins GR32:$size),
+def DYN_ALLOCA_32 : I<0, Pseudo, (outs), (ins GR32:$size),
"# dynamic stack allocation",
- [(X86WinAlloca GR32:$size)]>,
+ [(X86DynAlloca GR32:$size)]>,
Requires<[NotLP64]>;
let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
-def WIN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size),
+def DYN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size),
"# dynamic stack allocation",
- [(X86WinAlloca GR64:$size)]>,
+ [(X86DynAlloca GR64:$size)]>,
Requires<[In64BitMode]>;
} // SchedRW
@@ -261,6 +257,17 @@ let isPseudo = 1, SchedRW = [WriteSystem] in {
}
//===----------------------------------------------------------------------===//
+// Pseudo instructions used by address sanitizer.
+//===----------------------------------------------------------------------===//
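+// This pseudo is expanded during lowering into a call to an ASan check
+// routine, which may clobber R8 and EFLAGS; the address therefore uses
+// GR64NoR8 so that it is never passed in R8.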
+let Defs = [R8, EFLAGS] in {
+def ASAN_CHECK_MEMACCESS : PseudoI<
+ (outs), (ins GR64NoR8:$addr, i32imm:$accessinfo),
+ [(int_asan_check_memaccess GR64NoR8:$addr, (i32 timm:$accessinfo))]>,
+ Sched<[]>;
+}
+
+//===----------------------------------------------------------------------===//
// Pseudo instructions used by segmented stacks.
//
@@ -555,6 +562,7 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
let Predicates = [HasMMX] in
defm _VR64 : CMOVrr_PSEUDO<VR64, x86mmx>;
+ defm _FR16X : CMOVrr_PSEUDO<FR16X, f16>;
let Predicates = [HasSSE1,NoAVX512] in
defm _FR32 : CMOVrr_PSEUDO<FR32, f32>;
let Predicates = [HasSSE2,NoAVX512] in
@@ -612,6 +620,8 @@ let Predicates = [HasVLX] in {
(CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
def : Pat<(v8i16 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
(CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
+ def : Pat<(v8f16 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
def : Pat<(v4i32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
(CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
def : Pat<(v4f32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
@@ -623,6 +633,8 @@ let Predicates = [HasVLX] in {
(CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
def : Pat<(v16i16 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
(CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
+ def : Pat<(v16f16 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
def : Pat<(v8i32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
(CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
def : Pat<(v8f32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
@@ -635,6 +647,8 @@ def : Pat<(v64i8 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
(CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
def : Pat<(v32i16 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
(CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
+def : Pat<(v32f16 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
def : Pat<(v16i32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
(CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
@@ -953,7 +967,7 @@ multiclass ATOMIC_RMW_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
!strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
[(set
GR32:$dst,
- (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))]>,
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))]>,
OpSize32;
def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
(ins GR64:$val, i64mem:$ptr),
@@ -1197,10 +1211,10 @@ def : Pat<(X86call (i64 tglobaladdr:$dst)),
def : Pat<(X86call (i64 texternalsym:$dst)),
(CALL64pcrel32 texternalsym:$dst)>;
-def : Pat<(X86call_rvmarker (timm:$sel), (i64 texternalsym:$dst)),
- (CALL64pcrel32_RVMARKER timm:$sel, texternalsym:$dst)>;
-def : Pat<(X86call_rvmarker (timm:$sel), (i64 tglobaladdr:$dst)),
- (CALL64pcrel32_RVMARKER timm:$sel, tglobaladdr:$dst)>;
+def : Pat<(X86call_rvmarker (i64 tglobaladdr:$rvfunc), (i64 texternalsym:$dst)),
+ (CALL64pcrel32_RVMARKER tglobaladdr:$rvfunc, texternalsym:$dst)>;
+def : Pat<(X86call_rvmarker (i64 tglobaladdr:$rvfunc), (i64 tglobaladdr:$dst)),
+ (CALL64pcrel32_RVMARKER tglobaladdr:$rvfunc, tglobaladdr:$dst)>;
// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td
index a6cb17f17a17..6d969962afff 100644
--- a/llvm/lib/Target/X86/X86InstrControl.td
+++ b/llvm/lib/Target/X86/X86InstrControl.td
@@ -20,30 +20,30 @@
// ST1 arguments when returning values on the x87 stack.
let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in {
- def RETL : I <0xC3, RawFrm, (outs), (ins variable_ops),
+ def RET32 : I <0xC3, RawFrm, (outs), (ins variable_ops),
"ret{l}", []>, OpSize32, Requires<[Not64BitMode]>;
- def RETQ : I <0xC3, RawFrm, (outs), (ins variable_ops),
+ def RET64 : I <0xC3, RawFrm, (outs), (ins variable_ops),
"ret{q}", []>, OpSize32, Requires<[In64BitMode]>;
- def RETW : I <0xC3, RawFrm, (outs), (ins),
+ def RET16 : I <0xC3, RawFrm, (outs), (ins),
"ret{w}", []>, OpSize16;
- def RETIL : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+ def RETI32 : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
"ret{l}\t$amt", []>, OpSize32, Requires<[Not64BitMode]>;
- def RETIQ : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+ def RETI64 : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
"ret{q}\t$amt", []>, OpSize32, Requires<[In64BitMode]>;
- def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
+ def RETI16 : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
"ret{w}\t$amt", []>, OpSize16;
- def LRETL : I <0xCB, RawFrm, (outs), (ins),
+ def LRET32 : I <0xCB, RawFrm, (outs), (ins),
"{l}ret{l|f}", []>, OpSize32;
- def LRETQ : RI <0xCB, RawFrm, (outs), (ins),
+ def LRET64 : RI <0xCB, RawFrm, (outs), (ins),
"{l}ret{|f}q", []>, Requires<[In64BitMode]>;
- def LRETW : I <0xCB, RawFrm, (outs), (ins),
+ def LRET16 : I <0xCB, RawFrm, (outs), (ins),
"{l}ret{w|f}", []>, OpSize16;
- def LRETIL : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
- "{l}ret{l|f}\t$amt", []>, OpSize32;
- def LRETIQ : RIi16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
- "{l}ret{|f}q\t$amt", []>, Requires<[In64BitMode]>;
- def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
- "{l}ret{w|f}\t$amt", []>, OpSize16;
+ def LRETI32 : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+ "{l}ret{l|f}\t$amt", []>, OpSize32;
+ def LRETI64 : RIi16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+ "{l}ret{|f}q\t$amt", []>, Requires<[In64BitMode]>;
+ def LRETI16 : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+ "{l}ret{w|f}\t$amt", []>, OpSize16;
// The machine return from interrupt instruction, but sometimes we need to
// perform a post-epilogue stack adjustment. Codegen emits the pseudo form
@@ -419,15 +419,15 @@ let isPseudo = 1, isCall = 1, isCodeGenOnly = 1,
Uses = [RSP, SSP],
SchedRW = [WriteJump] in {
def CALL64m_RVMARKER :
- PseudoI<(outs), (ins i32imm:$sel, i64mem:$dst), [(X86call_rvmarker timm:$sel, (loadi64 addr:$dst))]>,
+ PseudoI<(outs), (ins i64imm:$rvfunc, i64mem:$dst), [(X86call_rvmarker tglobaladdr:$rvfunc, (loadi64 addr:$dst))]>,
Requires<[In64BitMode]>;
def CALL64r_RVMARKER :
- PseudoI<(outs), (ins i32imm:$sel, GR64:$dst), [(X86call_rvmarker timm:$sel, GR64:$dst)]>,
+ PseudoI<(outs), (ins i64imm:$rvfunc, GR64:$dst), [(X86call_rvmarker tglobaladdr:$rvfunc, GR64:$dst)]>,
Requires<[In64BitMode]>;
def CALL64pcrel32_RVMARKER :
- PseudoI<(outs), (ins i32imm:$sel, i64i32imm_brtarget:$dst), []>,
+ PseudoI<(outs), (ins i64imm:$rvfunc, i64i32imm_brtarget:$dst), []>,
Requires<[In64BitMode]>;
}
diff --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td
index 27328fe42c44..1f92293fa73f 100644
--- a/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/llvm/lib/Target/X86/X86InstrFMA.td
@@ -427,7 +427,7 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
}
multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
- ValueType VT, X86FoldableSchedWrite sched> {
+ X86FoldableSchedWrite sched> {
let isCodeGenOnly = 1, hasSideEffects = 0,
Uses = [MXCSR], mayRaiseFPException = 1 in {
def rr_Int : FMA4S_Int<opc, MRMSrcRegOp4, (outs VR128:$dst),
@@ -540,20 +540,16 @@ let ExeDomain = SSEPackedSingle in {
// Scalar Instructions
defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, any_fma, loadf32,
SchedWriteFMA.Scl>,
- fma4s_int<0x6A, "vfmaddss", ssmem, v4f32,
- SchedWriteFMA.Scl>;
+ fma4s_int<0x6A, "vfmaddss", ssmem, SchedWriteFMA.Scl>;
defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86any_Fmsub, loadf32,
SchedWriteFMA.Scl>,
- fma4s_int<0x6E, "vfmsubss", ssmem, v4f32,
- SchedWriteFMA.Scl>;
+ fma4s_int<0x6E, "vfmsubss", ssmem, SchedWriteFMA.Scl>;
defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
X86any_Fnmadd, loadf32, SchedWriteFMA.Scl>,
- fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32,
- SchedWriteFMA.Scl>;
+ fma4s_int<0x7A, "vfnmaddss", ssmem, SchedWriteFMA.Scl>;
defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
X86any_Fnmsub, loadf32, SchedWriteFMA.Scl>,
- fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32,
- SchedWriteFMA.Scl>;
+ fma4s_int<0x7E, "vfnmsubss", ssmem, SchedWriteFMA.Scl>;
// Packed Instructions
defm VFMADDPS4 : fma4p<0x68, "vfmaddps", any_fma, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
@@ -573,20 +569,16 @@ let ExeDomain = SSEPackedDouble in {
// Scalar Instructions
defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, any_fma, loadf64,
SchedWriteFMA.Scl>,
- fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64,
- SchedWriteFMA.Scl>;
+ fma4s_int<0x6B, "vfmaddsd", sdmem, SchedWriteFMA.Scl>;
defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86any_Fmsub, loadf64,
SchedWriteFMA.Scl>,
- fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64,
- SchedWriteFMA.Scl>;
+ fma4s_int<0x6F, "vfmsubsd", sdmem, SchedWriteFMA.Scl>;
defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
X86any_Fnmadd, loadf64, SchedWriteFMA.Scl>,
- fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64,
- SchedWriteFMA.Scl>;
+ fma4s_int<0x7B, "vfnmaddsd", sdmem, SchedWriteFMA.Scl>;
defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
X86any_Fnmsub, loadf64, SchedWriteFMA.Scl>,
- fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64,
- SchedWriteFMA.Scl>;
+ fma4s_int<0x7F, "vfnmsubsd", sdmem, SchedWriteFMA.Scl>;
// Packed Instructions
defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", any_fma, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
@@ -603,8 +595,8 @@ let ExeDomain = SSEPackedDouble in {
}
multiclass scalar_fma4_patterns<SDPatternOperator Op, string Name,
- ValueType VT, ValueType EltVT,
- RegisterClass RC, PatFrag mem_frag> {
+ ValueType VT, RegisterClass RC,
+ PatFrag mem_frag> {
let Predicates = [HasFMA4] in {
def : Pat<(VT (X86vzmovl (VT (scalar_to_vector
(Op RC:$src1, RC:$src2, RC:$src3))))),
@@ -629,12 +621,12 @@ multiclass scalar_fma4_patterns<SDPatternOperator Op, string Name,
}
}
-defm : scalar_fma4_patterns<any_fma, "VFMADDSS4", v4f32, f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<any_fma, "VFMADDSS4", v4f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSS4", v4f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSS4", v4f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSS4", v4f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<any_fma, "VFMADDSD4", v2f64, f64, FR64, loadf64>;
-defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>;
-defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>;
-defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<any_fma, "VFMADDSD4", v2f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSD4", v2f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSD4", v2f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSD4", v2f64, FR64, loadf64>;
diff --git a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
index 6d803e931b68..52b2a62316cd 100644
--- a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
+++ b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -28,35 +28,43 @@ using namespace llvm;
FMA3GROUP(Name, Suf##k, Attrs | X86InstrFMA3Group::KMergeMasked) \
FMA3GROUP(Name, Suf##kz, Attrs | X86InstrFMA3Group::KZeroMasked)
-#define FMA3GROUP_PACKED_WIDTHS(Name, Suf, Attrs) \
- FMA3GROUP(Name, Suf##Ym, Attrs) \
- FMA3GROUP(Name, Suf##Yr, Attrs) \
+#define FMA3GROUP_PACKED_WIDTHS_Z(Name, Suf, Attrs) \
FMA3GROUP_MASKED(Name, Suf##Z128m, Attrs) \
FMA3GROUP_MASKED(Name, Suf##Z128r, Attrs) \
FMA3GROUP_MASKED(Name, Suf##Z256m, Attrs) \
FMA3GROUP_MASKED(Name, Suf##Z256r, Attrs) \
FMA3GROUP_MASKED(Name, Suf##Zm, Attrs) \
FMA3GROUP_MASKED(Name, Suf##Zr, Attrs) \
+
+#define FMA3GROUP_PACKED_WIDTHS_ALL(Name, Suf, Attrs) \
+ FMA3GROUP(Name, Suf##Ym, Attrs) \
+ FMA3GROUP(Name, Suf##Yr, Attrs) \
+ FMA3GROUP_PACKED_WIDTHS_Z(Name, Suf, Attrs) \
FMA3GROUP(Name, Suf##m, Attrs) \
FMA3GROUP(Name, Suf##r, Attrs)
#define FMA3GROUP_PACKED(Name, Attrs) \
- FMA3GROUP_PACKED_WIDTHS(Name, PD, Attrs) \
- FMA3GROUP_PACKED_WIDTHS(Name, PS, Attrs)
+ FMA3GROUP_PACKED_WIDTHS_ALL(Name, PD, Attrs) \
+ FMA3GROUP_PACKED_WIDTHS_Z(Name, PH, Attrs) \
+ FMA3GROUP_PACKED_WIDTHS_ALL(Name, PS, Attrs)
-#define FMA3GROUP_SCALAR_WIDTHS(Name, Suf, Attrs) \
+#define FMA3GROUP_SCALAR_WIDTHS_Z(Name, Suf, Attrs) \
FMA3GROUP(Name, Suf##Zm, Attrs) \
FMA3GROUP_MASKED(Name, Suf##Zm_Int, Attrs | X86InstrFMA3Group::Intrinsic) \
FMA3GROUP(Name, Suf##Zr, Attrs) \
FMA3GROUP_MASKED(Name, Suf##Zr_Int, Attrs | X86InstrFMA3Group::Intrinsic) \
+
+#define FMA3GROUP_SCALAR_WIDTHS_ALL(Name, Suf, Attrs) \
+ FMA3GROUP_SCALAR_WIDTHS_Z(Name, Suf, Attrs) \
FMA3GROUP(Name, Suf##m, Attrs) \
FMA3GROUP(Name, Suf##m_Int, Attrs | X86InstrFMA3Group::Intrinsic) \
FMA3GROUP(Name, Suf##r, Attrs) \
FMA3GROUP(Name, Suf##r_Int, Attrs | X86InstrFMA3Group::Intrinsic)
#define FMA3GROUP_SCALAR(Name, Attrs) \
- FMA3GROUP_SCALAR_WIDTHS(Name, SD, Attrs) \
- FMA3GROUP_SCALAR_WIDTHS(Name, SS, Attrs)
+ FMA3GROUP_SCALAR_WIDTHS_ALL(Name, SD, Attrs) \
+ FMA3GROUP_SCALAR_WIDTHS_Z(Name, SH, Attrs) \
+ FMA3GROUP_SCALAR_WIDTHS_ALL(Name, SS, Attrs)
#define FMA3GROUP_FULL(Name, Attrs) \
FMA3GROUP_PACKED(Name, Attrs) \
@@ -78,15 +86,19 @@ static const X86InstrFMA3Group Groups[] = {
#define FMA3GROUP_PACKED_AVX512(Name, Suf, Attrs) \
FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PD, Suf, Attrs) \
+ FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PH, Suf, Attrs) \
FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PS, Suf, Attrs)
#define FMA3GROUP_PACKED_AVX512_ROUND(Name, Suf, Attrs) \
FMA3GROUP_MASKED(Name, PDZ##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, PHZ##Suf, Attrs) \
FMA3GROUP_MASKED(Name, PSZ##Suf, Attrs)
#define FMA3GROUP_SCALAR_AVX512_ROUND(Name, Suf, Attrs) \
FMA3GROUP(Name, SDZ##Suf, Attrs) \
FMA3GROUP_MASKED(Name, SDZ##Suf##_Int, Attrs) \
+ FMA3GROUP(Name, SHZ##Suf, Attrs) \
+ FMA3GROUP_MASKED(Name, SHZ##Suf##_Int, Attrs) \
FMA3GROUP(Name, SSZ##Suf, Attrs) \
FMA3GROUP_MASKED(Name, SSZ##Suf##_Int, Attrs)
@@ -130,14 +142,16 @@ const X86InstrFMA3Group *llvm::getFMA3Group(unsigned Opcode, uint64_t TSFlags) {
// FMA3 instructions have a well defined encoding pattern we can exploit.
uint8_t BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
- bool IsFMA3 = ((TSFlags & X86II::EncodingMask) == X86II::VEX ||
- (TSFlags & X86II::EncodingMask) == X86II::EVEX) &&
- (TSFlags & X86II::OpMapMask) == X86II::T8 &&
- (TSFlags & X86II::OpPrefixMask) == X86II::PD &&
- ((BaseOpcode >= 0x96 && BaseOpcode <= 0x9F) ||
- (BaseOpcode >= 0xA6 && BaseOpcode <= 0xAF) ||
- (BaseOpcode >= 0xB6 && BaseOpcode <= 0xBF));
- if (!IsFMA3)
+ bool IsFMA3Opcode = ((BaseOpcode >= 0x96 && BaseOpcode <= 0x9F) ||
+ (BaseOpcode >= 0xA6 && BaseOpcode <= 0xAF) ||
+ (BaseOpcode >= 0xB6 && BaseOpcode <= 0xBF));
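+ // The AVX512-FP16 FMA3 forms are EVEX-encoded in opcode map 6 (T_MAP6)
+ // rather than map 0F38 (T8), so accept EVEX with T_MAP6 as well.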
+ bool IsFMA3Encoding = ((TSFlags & X86II::EncodingMask) == X86II::VEX &&
+ (TSFlags & X86II::OpMapMask) == X86II::T8) ||
+ ((TSFlags & X86II::EncodingMask) == X86II::EVEX &&
+ ((TSFlags & X86II::OpMapMask) == X86II::T8 ||
+ (TSFlags & X86II::OpMapMask) == X86II::T_MAP6));
+ bool IsFMA3Prefix = (TSFlags & X86II::OpPrefixMask) == X86II::PD;
+ if (!IsFMA3Opcode || !IsFMA3Encoding || !IsFMA3Prefix)
return nullptr;
verifyTables();
diff --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td
index cda28d18f4aa..e310f369be08 100644
--- a/llvm/lib/Target/X86/X86InstrFPStack.td
+++ b/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -377,7 +377,7 @@ def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">;
} // SchedRW
} // Uses = [FPCW], mayRaiseFPException = 1
-let SchedRW = [WriteFTest] in {
+let SchedRW = [WriteFTest], Defs = [FPSW] in {
def XAM_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>;
def XAM_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
def XAM_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index 17fe7f0bd310..6d4ad08842c7 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -815,10 +815,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VFPCLASSPDZ128rr, X86::VFPCLASSPDZ128rm, 0 },
{ X86::VFPCLASSPDZ256rr, X86::VFPCLASSPDZ256rm, 0 },
{ X86::VFPCLASSPDZrr, X86::VFPCLASSPDZrm, 0 },
+ { X86::VFPCLASSPHZ128rr, X86::VFPCLASSPHZ128rm, 0 },
+ { X86::VFPCLASSPHZ256rr, X86::VFPCLASSPHZ256rm, 0 },
+ { X86::VFPCLASSPHZrr, X86::VFPCLASSPHZrm, 0 },
{ X86::VFPCLASSPSZ128rr, X86::VFPCLASSPSZ128rm, 0 },
{ X86::VFPCLASSPSZ256rr, X86::VFPCLASSPSZ256rm, 0 },
{ X86::VFPCLASSPSZrr, X86::VFPCLASSPSZrm, 0 },
{ X86::VFPCLASSSDZrr, X86::VFPCLASSSDZrm, TB_NO_REVERSE },
+ { X86::VFPCLASSSHZrr, X86::VFPCLASSSHZrm, TB_NO_REVERSE },
{ X86::VFPCLASSSSZrr, X86::VFPCLASSSSZrm, TB_NO_REVERSE },
{ X86::VFRCZPDYrr, X86::VFRCZPDYrm, 0 },
{ X86::VFRCZPDrr, X86::VFRCZPDrm, 0 },
@@ -829,12 +833,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VGETEXPPDZ128r, X86::VGETEXPPDZ128m, 0 },
{ X86::VGETEXPPDZ256r, X86::VGETEXPPDZ256m, 0 },
{ X86::VGETEXPPDZr, X86::VGETEXPPDZm, 0 },
+ { X86::VGETEXPPHZ128r, X86::VGETEXPPHZ128m, 0 },
+ { X86::VGETEXPPHZ256r, X86::VGETEXPPHZ256m, 0 },
+ { X86::VGETEXPPHZr, X86::VGETEXPPHZm, 0 },
{ X86::VGETEXPPSZ128r, X86::VGETEXPPSZ128m, 0 },
{ X86::VGETEXPPSZ256r, X86::VGETEXPPSZ256m, 0 },
{ X86::VGETEXPPSZr, X86::VGETEXPPSZm, 0 },
{ X86::VGETMANTPDZ128rri, X86::VGETMANTPDZ128rmi, 0 },
{ X86::VGETMANTPDZ256rri, X86::VGETMANTPDZ256rmi, 0 },
{ X86::VGETMANTPDZrri, X86::VGETMANTPDZrmi, 0 },
+ { X86::VGETMANTPHZ128rri, X86::VGETMANTPHZ128rmi, 0 },
+ { X86::VGETMANTPHZ256rri, X86::VGETMANTPHZ256rmi, 0 },
+ { X86::VGETMANTPHZrri, X86::VGETMANTPHZrmi, 0 },
{ X86::VGETMANTPSZ128rri, X86::VGETMANTPSZ128rmi, 0 },
{ X86::VGETMANTPSZ256rri, X86::VGETMANTPSZ256rmi, 0 },
{ X86::VGETMANTPSZrri, X86::VGETMANTPSZrmi, 0 },
@@ -1161,17 +1171,26 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VRCP14PSZr, X86::VRCP14PSZm, 0 },
{ X86::VRCP28PDZr, X86::VRCP28PDZm, 0 },
{ X86::VRCP28PSZr, X86::VRCP28PSZm, 0 },
+ { X86::VRCPPHZ128r, X86::VRCPPHZ128m, 0 },
+ { X86::VRCPPHZ256r, X86::VRCPPHZ256m, 0 },
+ { X86::VRCPPHZr, X86::VRCPPHZm, 0 },
{ X86::VRCPPSYr, X86::VRCPPSYm, 0 },
{ X86::VRCPPSr, X86::VRCPPSm, 0 },
{ X86::VREDUCEPDZ128rri, X86::VREDUCEPDZ128rmi, 0 },
{ X86::VREDUCEPDZ256rri, X86::VREDUCEPDZ256rmi, 0 },
{ X86::VREDUCEPDZrri, X86::VREDUCEPDZrmi, 0 },
+ { X86::VREDUCEPHZ128rri, X86::VREDUCEPHZ128rmi, 0 },
+ { X86::VREDUCEPHZ256rri, X86::VREDUCEPHZ256rmi, 0 },
+ { X86::VREDUCEPHZrri, X86::VREDUCEPHZrmi, 0 },
{ X86::VREDUCEPSZ128rri, X86::VREDUCEPSZ128rmi, 0 },
{ X86::VREDUCEPSZ256rri, X86::VREDUCEPSZ256rmi, 0 },
{ X86::VREDUCEPSZrri, X86::VREDUCEPSZrmi, 0 },
{ X86::VRNDSCALEPDZ128rri, X86::VRNDSCALEPDZ128rmi, 0 },
{ X86::VRNDSCALEPDZ256rri, X86::VRNDSCALEPDZ256rmi, 0 },
{ X86::VRNDSCALEPDZrri, X86::VRNDSCALEPDZrmi, 0 },
+ { X86::VRNDSCALEPHZ128rri, X86::VRNDSCALEPHZ128rmi, 0 },
+ { X86::VRNDSCALEPHZ256rri, X86::VRNDSCALEPHZ256rmi, 0 },
+ { X86::VRNDSCALEPHZrri, X86::VRNDSCALEPHZrmi, 0 },
{ X86::VRNDSCALEPSZ128rri, X86::VRNDSCALEPSZ128rmi, 0 },
{ X86::VRNDSCALEPSZ256rri, X86::VRNDSCALEPSZ256rmi, 0 },
{ X86::VRNDSCALEPSZrri, X86::VRNDSCALEPSZrmi, 0 },
@@ -1187,6 +1206,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VRSQRT14PSZr, X86::VRSQRT14PSZm, 0 },
{ X86::VRSQRT28PDZr, X86::VRSQRT28PDZm, 0 },
{ X86::VRSQRT28PSZr, X86::VRSQRT28PSZm, 0 },
+ { X86::VRSQRTPHZ128r, X86::VRSQRTPHZ128m, 0 },
+ { X86::VRSQRTPHZ256r, X86::VRSQRTPHZ256m, 0 },
+ { X86::VRSQRTPHZr, X86::VRSQRTPHZm, 0 },
{ X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 },
{ X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },
{ X86::VSQRTPDYr, X86::VSQRTPDYm, 0 },
@@ -1194,6 +1216,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VSQRTPDZ256r, X86::VSQRTPDZ256m, 0 },
{ X86::VSQRTPDZr, X86::VSQRTPDZm, 0 },
{ X86::VSQRTPDr, X86::VSQRTPDm, 0 },
+ { X86::VSQRTPHZ128r, X86::VSQRTPHZ128m, 0 },
+ { X86::VSQRTPHZ256r, X86::VSQRTPHZ256m, 0 },
+ { X86::VSQRTPHZr, X86::VSQRTPHZm, 0 },
{ X86::VSQRTPSYr, X86::VSQRTPSYm, 0 },
{ X86::VSQRTPSZ128r, X86::VSQRTPSZ128m, 0 },
{ X86::VSQRTPSZ256r, X86::VSQRTPSZ256m, 0 },
@@ -1550,6 +1575,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 },
{ X86::VADDPDZrr, X86::VADDPDZrm, 0 },
{ X86::VADDPDrr, X86::VADDPDrm, 0 },
+ { X86::VADDPHZ128rr, X86::VADDPHZ128rm, 0 },
+ { X86::VADDPHZ256rr, X86::VADDPHZ256rm, 0 },
+ { X86::VADDPHZrr, X86::VADDPHZrm, 0 },
{ X86::VADDPSYrr, X86::VADDPSYrm, 0 },
{ X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 },
{ X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 },
@@ -1559,6 +1587,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE },
{ X86::VADDSDrr, X86::VADDSDrm, 0 },
{ X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE },
+ { X86::VADDSHZrr, X86::VADDSHZrm, 0 },
+ { X86::VADDSHZrr_Int, X86::VADDSHZrm_Int, TB_NO_REVERSE },
{ X86::VADDSSZrr, X86::VADDSSZrm, 0 },
{ X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE },
{ X86::VADDSSrr, X86::VADDSSrm, 0 },
@@ -1642,6 +1672,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 },
{ X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 },
{ X86::VCMPPDrri, X86::VCMPPDrmi, 0 },
+ { X86::VCMPPHZ128rri, X86::VCMPPHZ128rmi, 0 },
+ { X86::VCMPPHZ256rri, X86::VCMPPHZ256rmi, 0 },
+ { X86::VCMPPHZrri, X86::VCMPPHZrmi, 0 },
{ X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 },
{ X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 },
{ X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 },
@@ -1651,6 +1684,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE },
{ X86::VCMPSDrr, X86::VCMPSDrm, 0 },
{ X86::VCMPSDrr_Int, X86::VCMPSDrm_Int, TB_NO_REVERSE },
+ { X86::VCMPSHZrr, X86::VCMPSHZrm, 0 },
+ { X86::VCMPSHZrr_Int, X86::VCMPSHZrm_Int, TB_NO_REVERSE },
{ X86::VCMPSSZrr, X86::VCMPSSZrm, 0 },
{ X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE },
{ X86::VCMPSSrr, X86::VCMPSSrm, 0 },
@@ -1782,6 +1817,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 },
{ X86::VDIVPDZrr, X86::VDIVPDZrm, 0 },
{ X86::VDIVPDrr, X86::VDIVPDrm, 0 },
+ { X86::VDIVPHZ128rr, X86::VDIVPHZ128rm, 0 },
+ { X86::VDIVPHZ256rr, X86::VDIVPHZ256rm, 0 },
+ { X86::VDIVPHZrr, X86::VDIVPHZrm, 0 },
{ X86::VDIVPSYrr, X86::VDIVPSYrm, 0 },
{ X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 },
{ X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 },
@@ -1791,6 +1829,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE },
{ X86::VDIVSDrr, X86::VDIVSDrm, 0 },
{ X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE },
+ { X86::VDIVSHZrr, X86::VDIVSHZrm, 0 },
+ { X86::VDIVSHZrr_Int, X86::VDIVSHZrm_Int, TB_NO_REVERSE },
{ X86::VDIVSSZrr, X86::VDIVSSZrm, 0 },
{ X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE },
{ X86::VDIVSSrr, X86::VDIVSSrm, 0 },
@@ -1806,6 +1846,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VEXPANDPSZ128rrkz, X86::VEXPANDPSZ128rmkz, TB_NO_REVERSE },
{ X86::VEXPANDPSZ256rrkz, X86::VEXPANDPSZ256rmkz, TB_NO_REVERSE },
{ X86::VEXPANDPSZrrkz, X86::VEXPANDPSZrmkz, TB_NO_REVERSE },
+ { X86::VFCMULCPHZ128rr, X86::VFCMULCPHZ128rm, 0 },
+ { X86::VFCMULCPHZ256rr, X86::VFCMULCPHZ256rm, 0 },
+ { X86::VFCMULCPHZrr, X86::VFCMULCPHZrm, 0 },
+ { X86::VFCMULCSHZrr, X86::VFCMULCSHZrm, TB_NO_REVERSE },
{ X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, 0 },
{ X86::VFMADDPD4rr, X86::VFMADDPD4mr, 0 },
{ X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, 0 },
@@ -1830,6 +1874,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE },
{ X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 },
{ X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE },
+ { X86::VFMULCPHZ128rr, X86::VFMULCPHZ128rm, 0 },
+ { X86::VFMULCPHZ256rr, X86::VFMULCPHZ256rm, 0 },
+ { X86::VFMULCPHZrr, X86::VFMULCPHZrm, 0 },
+ { X86::VFMULCSHZrr, X86::VFMULCSHZrm, TB_NO_REVERSE },
{ X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, 0 },
{ X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, 0 },
{ X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, 0 },
@@ -1849,26 +1897,38 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VFPCLASSPDZ128rrk, X86::VFPCLASSPDZ128rmk, 0 },
{ X86::VFPCLASSPDZ256rrk, X86::VFPCLASSPDZ256rmk, 0 },
{ X86::VFPCLASSPDZrrk, X86::VFPCLASSPDZrmk, 0 },
+ { X86::VFPCLASSPHZ128rrk, X86::VFPCLASSPHZ128rmk, 0 },
+ { X86::VFPCLASSPHZ256rrk, X86::VFPCLASSPHZ256rmk, 0 },
+ { X86::VFPCLASSPHZrrk, X86::VFPCLASSPHZrmk, 0 },
{ X86::VFPCLASSPSZ128rrk, X86::VFPCLASSPSZ128rmk, 0 },
{ X86::VFPCLASSPSZ256rrk, X86::VFPCLASSPSZ256rmk, 0 },
{ X86::VFPCLASSPSZrrk, X86::VFPCLASSPSZrmk, 0 },
{ X86::VFPCLASSSDZrrk, X86::VFPCLASSSDZrmk, TB_NO_REVERSE },
+ { X86::VFPCLASSSHZrrk, X86::VFPCLASSSHZrmk, TB_NO_REVERSE },
{ X86::VFPCLASSSSZrrk, X86::VFPCLASSSSZrmk, TB_NO_REVERSE },
{ X86::VGETEXPPDZ128rkz, X86::VGETEXPPDZ128mkz, 0 },
{ X86::VGETEXPPDZ256rkz, X86::VGETEXPPDZ256mkz, 0 },
{ X86::VGETEXPPDZrkz, X86::VGETEXPPDZmkz, 0 },
+ { X86::VGETEXPPHZ128rkz, X86::VGETEXPPHZ128mkz, 0 },
+ { X86::VGETEXPPHZ256rkz, X86::VGETEXPPHZ256mkz, 0 },
+ { X86::VGETEXPPHZrkz, X86::VGETEXPPHZmkz, 0 },
{ X86::VGETEXPPSZ128rkz, X86::VGETEXPPSZ128mkz, 0 },
{ X86::VGETEXPPSZ256rkz, X86::VGETEXPPSZ256mkz, 0 },
{ X86::VGETEXPPSZrkz, X86::VGETEXPPSZmkz, 0 },
{ X86::VGETEXPSDZr, X86::VGETEXPSDZm, TB_NO_REVERSE },
+ { X86::VGETEXPSHZr, X86::VGETEXPSHZm, TB_NO_REVERSE },
{ X86::VGETEXPSSZr, X86::VGETEXPSSZm, TB_NO_REVERSE },
{ X86::VGETMANTPDZ128rrikz, X86::VGETMANTPDZ128rmikz, 0 },
{ X86::VGETMANTPDZ256rrikz, X86::VGETMANTPDZ256rmikz, 0 },
{ X86::VGETMANTPDZrrikz, X86::VGETMANTPDZrmikz, 0 },
+ { X86::VGETMANTPHZ128rrikz, X86::VGETMANTPHZ128rmikz, 0 },
+ { X86::VGETMANTPHZ256rrikz, X86::VGETMANTPHZ256rmikz, 0 },
+ { X86::VGETMANTPHZrrikz, X86::VGETMANTPHZrmikz, 0 },
{ X86::VGETMANTPSZ128rrikz, X86::VGETMANTPSZ128rmikz, 0 },
{ X86::VGETMANTPSZ256rrikz, X86::VGETMANTPSZ256rmikz, 0 },
{ X86::VGETMANTPSZrrikz, X86::VGETMANTPSZrmikz, 0 },
{ X86::VGETMANTSDZrri, X86::VGETMANTSDZrmi, TB_NO_REVERSE },
+ { X86::VGETMANTSHZrri, X86::VGETMANTSHZrmi, TB_NO_REVERSE },
{ X86::VGETMANTSSZrri, X86::VGETMANTSSZrmi, TB_NO_REVERSE },
{ X86::VGF2P8AFFINEINVQBYrri, X86::VGF2P8AFFINEINVQBYrmi, 0 },
{ X86::VGF2P8AFFINEINVQBZ128rri, X86::VGF2P8AFFINEINVQBZ128rmi, 0 },
@@ -1912,6 +1972,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 },
{ X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 },
{ X86::VMAXCPDrr, X86::VMAXCPDrm, 0 },
+ { X86::VMAXCPHZ128rr, X86::VMAXCPHZ128rm, 0 },
+ { X86::VMAXCPHZ256rr, X86::VMAXCPHZ256rm, 0 },
+ { X86::VMAXCPHZrr, X86::VMAXCPHZrm, 0 },
{ X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 },
{ X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rm, 0 },
{ X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 },
@@ -1919,6 +1982,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VMAXCPSrr, X86::VMAXCPSrm, 0 },
{ X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 },
{ X86::VMAXCSDrr, X86::VMAXCSDrm, 0 },
+ { X86::VMAXCSHZrr, X86::VMAXCSHZrm, 0 },
{ X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 },
{ X86::VMAXCSSrr, X86::VMAXCSSrm, 0 },
{ X86::VMAXPDYrr, X86::VMAXPDYrm, 0 },
@@ -1926,6 +1990,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 },
{ X86::VMAXPDZrr, X86::VMAXPDZrm, 0 },
{ X86::VMAXPDrr, X86::VMAXPDrm, 0 },
+ { X86::VMAXPHZ128rr, X86::VMAXPHZ128rm, 0 },
+ { X86::VMAXPHZ256rr, X86::VMAXPHZ256rm, 0 },
+ { X86::VMAXPHZrr, X86::VMAXPHZrm, 0 },
{ X86::VMAXPSYrr, X86::VMAXPSYrm, 0 },
{ X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 },
{ X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 },
@@ -1935,6 +2002,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE },
{ X86::VMAXSDrr, X86::VMAXSDrm, 0 },
{ X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE },
+ { X86::VMAXSHZrr, X86::VMAXSHZrm, 0 },
+ { X86::VMAXSHZrr_Int, X86::VMAXSHZrm_Int, TB_NO_REVERSE },
{ X86::VMAXSSZrr, X86::VMAXSSZrm, 0 },
{ X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE },
{ X86::VMAXSSrr, X86::VMAXSSrm, 0 },
@@ -1944,6 +2013,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 },
{ X86::VMINCPDZrr, X86::VMINCPDZrm, 0 },
{ X86::VMINCPDrr, X86::VMINCPDrm, 0 },
+ { X86::VMINCPHZ128rr, X86::VMINCPHZ128rm, 0 },
+ { X86::VMINCPHZ256rr, X86::VMINCPHZ256rm, 0 },
+ { X86::VMINCPHZrr, X86::VMINCPHZrm, 0 },
{ X86::VMINCPSYrr, X86::VMINCPSYrm, 0 },
{ X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 },
{ X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 },
@@ -1951,6 +2023,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VMINCPSrr, X86::VMINCPSrm, 0 },
{ X86::VMINCSDZrr, X86::VMINCSDZrm, 0 },
{ X86::VMINCSDrr, X86::VMINCSDrm, 0 },
+ { X86::VMINCSHZrr, X86::VMINCSHZrm, 0 },
{ X86::VMINCSSZrr, X86::VMINCSSZrm, 0 },
{ X86::VMINCSSrr, X86::VMINCSSrm, 0 },
{ X86::VMINPDYrr, X86::VMINPDYrm, 0 },
@@ -1958,6 +2031,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 },
{ X86::VMINPDZrr, X86::VMINPDZrm, 0 },
{ X86::VMINPDrr, X86::VMINPDrm, 0 },
+ { X86::VMINPHZ128rr, X86::VMINPHZ128rm, 0 },
+ { X86::VMINPHZ256rr, X86::VMINPHZ256rm, 0 },
+ { X86::VMINPHZrr, X86::VMINPHZrm, 0 },
{ X86::VMINPSYrr, X86::VMINPSYrm, 0 },
{ X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 },
{ X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 },
@@ -1967,6 +2043,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE },
{ X86::VMINSDrr, X86::VMINSDrm, 0 },
{ X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE },
+ { X86::VMINSHZrr, X86::VMINSHZrm, 0 },
+ { X86::VMINSHZrr_Int, X86::VMINSHZrm_Int, TB_NO_REVERSE },
{ X86::VMINSSZrr, X86::VMINSSZrm, 0 },
{ X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE },
{ X86::VMINSSrr, X86::VMINSSrm, 0 },
@@ -2021,6 +2099,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 },
{ X86::VMULPDZrr, X86::VMULPDZrm, 0 },
{ X86::VMULPDrr, X86::VMULPDrm, 0 },
+ { X86::VMULPHZ128rr, X86::VMULPHZ128rm, 0 },
+ { X86::VMULPHZ256rr, X86::VMULPHZ256rm, 0 },
+ { X86::VMULPHZrr, X86::VMULPHZrm, 0 },
{ X86::VMULPSYrr, X86::VMULPSYrm, 0 },
{ X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 },
{ X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 },
@@ -2030,6 +2111,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE },
{ X86::VMULSDrr, X86::VMULSDrm, 0 },
{ X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE },
+ { X86::VMULSHZrr, X86::VMULSHZrm, 0 },
+ { X86::VMULSHZrr_Int, X86::VMULSHZrm_Int, TB_NO_REVERSE },
{ X86::VMULSSZrr, X86::VMULSSZrm, 0 },
{ X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE },
{ X86::VMULSSrr, X86::VMULSSrm, 0 },
@@ -2861,24 +2944,37 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VRCP28PSZrkz, X86::VRCP28PSZmkz, 0 },
{ X86::VRCP28SDZr, X86::VRCP28SDZm, TB_NO_REVERSE },
{ X86::VRCP28SSZr, X86::VRCP28SSZm, TB_NO_REVERSE },
+ { X86::VRCPPHZ128rkz, X86::VRCPPHZ128mkz, 0 },
+ { X86::VRCPPHZ256rkz, X86::VRCPPHZ256mkz, 0 },
+ { X86::VRCPPHZrkz, X86::VRCPPHZmkz, 0 },
+ { X86::VRCPSHZrr, X86::VRCPSHZrm, TB_NO_REVERSE },
{ X86::VRCPSSr, X86::VRCPSSm, 0 },
{ X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE },
{ X86::VREDUCEPDZ128rrikz, X86::VREDUCEPDZ128rmikz, 0 },
{ X86::VREDUCEPDZ256rrikz, X86::VREDUCEPDZ256rmikz, 0 },
{ X86::VREDUCEPDZrrikz, X86::VREDUCEPDZrmikz, 0 },
+ { X86::VREDUCEPHZ128rrikz, X86::VREDUCEPHZ128rmikz, 0 },
+ { X86::VREDUCEPHZ256rrikz, X86::VREDUCEPHZ256rmikz, 0 },
+ { X86::VREDUCEPHZrrikz, X86::VREDUCEPHZrmikz, 0 },
{ X86::VREDUCEPSZ128rrikz, X86::VREDUCEPSZ128rmikz, 0 },
{ X86::VREDUCEPSZ256rrikz, X86::VREDUCEPSZ256rmikz, 0 },
{ X86::VREDUCEPSZrrikz, X86::VREDUCEPSZrmikz, 0 },
{ X86::VREDUCESDZrri, X86::VREDUCESDZrmi, TB_NO_REVERSE },
+ { X86::VREDUCESHZrri, X86::VREDUCESHZrmi, TB_NO_REVERSE },
{ X86::VREDUCESSZrri, X86::VREDUCESSZrmi, TB_NO_REVERSE },
{ X86::VRNDSCALEPDZ128rrikz, X86::VRNDSCALEPDZ128rmikz, 0 },
{ X86::VRNDSCALEPDZ256rrikz, X86::VRNDSCALEPDZ256rmikz, 0 },
{ X86::VRNDSCALEPDZrrikz, X86::VRNDSCALEPDZrmikz, 0 },
+ { X86::VRNDSCALEPHZ128rrikz, X86::VRNDSCALEPHZ128rmikz, 0 },
+ { X86::VRNDSCALEPHZ256rrikz, X86::VRNDSCALEPHZ256rmikz, 0 },
+ { X86::VRNDSCALEPHZrrikz, X86::VRNDSCALEPHZrmikz, 0 },
{ X86::VRNDSCALEPSZ128rrikz, X86::VRNDSCALEPSZ128rmikz, 0 },
{ X86::VRNDSCALEPSZ256rrikz, X86::VRNDSCALEPSZ256rmikz, 0 },
{ X86::VRNDSCALEPSZrrikz, X86::VRNDSCALEPSZrmikz, 0 },
{ X86::VRNDSCALESDZr, X86::VRNDSCALESDZm, 0 },
{ X86::VRNDSCALESDZr_Int, X86::VRNDSCALESDZm_Int, TB_NO_REVERSE },
+ { X86::VRNDSCALESHZr, X86::VRNDSCALESHZm, 0 },
+ { X86::VRNDSCALESHZr_Int, X86::VRNDSCALESHZm_Int, TB_NO_REVERSE },
{ X86::VRNDSCALESSZr, X86::VRNDSCALESSZm, 0 },
{ X86::VRNDSCALESSZr_Int, X86::VRNDSCALESSZm_Int, TB_NO_REVERSE },
{ X86::VROUNDSDr, X86::VROUNDSDm, 0 },
@@ -2897,15 +2993,23 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VRSQRT28PSZrkz, X86::VRSQRT28PSZmkz, 0 },
{ X86::VRSQRT28SDZr, X86::VRSQRT28SDZm, TB_NO_REVERSE },
{ X86::VRSQRT28SSZr, X86::VRSQRT28SSZm, TB_NO_REVERSE },
+ { X86::VRSQRTPHZ128rkz, X86::VRSQRTPHZ128mkz, 0 },
+ { X86::VRSQRTPHZ256rkz, X86::VRSQRTPHZ256mkz, 0 },
+ { X86::VRSQRTPHZrkz, X86::VRSQRTPHZmkz, 0 },
+ { X86::VRSQRTSHZrr, X86::VRSQRTSHZrm, TB_NO_REVERSE },
{ X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
{ X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE },
{ X86::VSCALEFPDZ128rr, X86::VSCALEFPDZ128rm, 0 },
{ X86::VSCALEFPDZ256rr, X86::VSCALEFPDZ256rm, 0 },
{ X86::VSCALEFPDZrr, X86::VSCALEFPDZrm, 0 },
+ { X86::VSCALEFPHZ128rr, X86::VSCALEFPHZ128rm, 0 },
+ { X86::VSCALEFPHZ256rr, X86::VSCALEFPHZ256rm, 0 },
+ { X86::VSCALEFPHZrr, X86::VSCALEFPHZrm, 0 },
{ X86::VSCALEFPSZ128rr, X86::VSCALEFPSZ128rm, 0 },
{ X86::VSCALEFPSZ256rr, X86::VSCALEFPSZ256rm, 0 },
{ X86::VSCALEFPSZrr, X86::VSCALEFPSZrm, 0 },
{ X86::VSCALEFSDZrr, X86::VSCALEFSDZrm, TB_NO_REVERSE },
+ { X86::VSCALEFSHZrr, X86::VSCALEFSHZrm, TB_NO_REVERSE },
{ X86::VSCALEFSSZrr, X86::VSCALEFSSZrm, TB_NO_REVERSE },
{ X86::VSHUFF32X4Z256rri, X86::VSHUFF32X4Z256rmi, 0 },
{ X86::VSHUFF32X4Zrri, X86::VSHUFF32X4Zrmi, 0 },
@@ -2928,6 +3032,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VSQRTPDZ128rkz, X86::VSQRTPDZ128mkz, 0 },
{ X86::VSQRTPDZ256rkz, X86::VSQRTPDZ256mkz, 0 },
{ X86::VSQRTPDZrkz, X86::VSQRTPDZmkz, 0 },
+ { X86::VSQRTPHZ128rkz, X86::VSQRTPHZ128mkz, 0 },
+ { X86::VSQRTPHZ256rkz, X86::VSQRTPHZ256mkz, 0 },
+ { X86::VSQRTPHZrkz, X86::VSQRTPHZmkz, 0 },
{ X86::VSQRTPSZ128rkz, X86::VSQRTPSZ128mkz, 0 },
{ X86::VSQRTPSZ256rkz, X86::VSQRTPSZ256mkz, 0 },
{ X86::VSQRTPSZrkz, X86::VSQRTPSZmkz, 0 },
@@ -2935,6 +3042,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VSQRTSDZr_Int, X86::VSQRTSDZm_Int, TB_NO_REVERSE },
{ X86::VSQRTSDr, X86::VSQRTSDm, 0 },
{ X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE },
+ { X86::VSQRTSHZr, X86::VSQRTSHZm, 0 },
+ { X86::VSQRTSHZr_Int, X86::VSQRTSHZm_Int, TB_NO_REVERSE },
{ X86::VSQRTSSZr, X86::VSQRTSSZm, 0 },
{ X86::VSQRTSSZr_Int, X86::VSQRTSSZm_Int, TB_NO_REVERSE },
{ X86::VSQRTSSr, X86::VSQRTSSm, 0 },
@@ -2944,6 +3053,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 },
{ X86::VSUBPDZrr, X86::VSUBPDZrm, 0 },
{ X86::VSUBPDrr, X86::VSUBPDrm, 0 },
+ { X86::VSUBPHZ128rr, X86::VSUBPHZ128rm, 0 },
+ { X86::VSUBPHZ256rr, X86::VSUBPHZ256rm, 0 },
+ { X86::VSUBPHZrr, X86::VSUBPHZrm, 0 },
{ X86::VSUBPSYrr, X86::VSUBPSYrm, 0 },
{ X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 },
{ X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 },
@@ -2953,6 +3065,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE },
{ X86::VSUBSDrr, X86::VSUBSDrm, 0 },
{ X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE },
+ { X86::VSUBSHZrr, X86::VSUBSHZrm, 0 },
+ { X86::VSUBSHZrr_Int, X86::VSUBSHZrm_Int, TB_NO_REVERSE },
{ X86::VSUBSSZrr, X86::VSUBSSZrm, 0 },
{ X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE },
{ X86::VSUBSSrr, X86::VSUBSSrm, 0 },
@@ -2999,10 +3113,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 },
{ X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 },
{ X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 },
+ { X86::VADDPHZ128rrkz, X86::VADDPHZ128rmkz, 0 },
+ { X86::VADDPHZ256rrkz, X86::VADDPHZ256rmkz, 0 },
+ { X86::VADDPHZrrkz, X86::VADDPHZrmkz, 0 },
{ X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 },
{ X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 },
{ X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
{ X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VADDSHZrr_Intkz, X86::VADDSHZrm_Intkz, TB_NO_REVERSE },
{ X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE },
{ X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 },
{ X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 },
@@ -3041,10 +3159,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VCMPPDZ128rrik, X86::VCMPPDZ128rmik, 0 },
{ X86::VCMPPDZ256rrik, X86::VCMPPDZ256rmik, 0 },
{ X86::VCMPPDZrrik, X86::VCMPPDZrmik, 0 },
+ { X86::VCMPPHZ128rrik, X86::VCMPPHZ128rmik, 0 },
+ { X86::VCMPPHZ256rrik, X86::VCMPPHZ256rmik, 0 },
+ { X86::VCMPPHZrrik, X86::VCMPPHZrmik, 0 },
{ X86::VCMPPSZ128rrik, X86::VCMPPSZ128rmik, 0 },
{ X86::VCMPPSZ256rrik, X86::VCMPPSZ256rmik, 0 },
{ X86::VCMPPSZrrik, X86::VCMPPSZrmik, 0 },
{ X86::VCMPSDZrr_Intk, X86::VCMPSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VCMPSHZrr_Intk, X86::VCMPSHZrm_Intk, TB_NO_REVERSE },
{ X86::VCMPSSZrr_Intk, X86::VCMPSSZrm_Intk, TB_NO_REVERSE },
{ X86::VCVTDQ2PDZ128rrk, X86::VCVTDQ2PDZ128rmk, TB_NO_REVERSE },
{ X86::VCVTDQ2PDZ256rrk, X86::VCVTDQ2PDZ256rmk, 0 },
@@ -3141,10 +3263,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 },
{ X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 },
{ X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 },
+ { X86::VDIVPHZ128rrkz, X86::VDIVPHZ128rmkz, 0 },
+ { X86::VDIVPHZ256rrkz, X86::VDIVPHZ256rmkz, 0 },
+ { X86::VDIVPHZrrkz, X86::VDIVPHZrmkz, 0 },
{ X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 },
{ X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 },
{ X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
{ X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VDIVSHZrr_Intkz, X86::VDIVSHZrm_Intkz, TB_NO_REVERSE },
{ X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE },
{ X86::VDPBF16PSZ128r, X86::VDPBF16PSZ128m, 0 },
{ X86::VDPBF16PSZ256r, X86::VDPBF16PSZ256m, 0 },
@@ -3157,6 +3283,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VEXPANDPSZ128rrk, X86::VEXPANDPSZ128rmk, TB_NO_REVERSE },
{ X86::VEXPANDPSZ256rrk, X86::VEXPANDPSZ256rmk, TB_NO_REVERSE },
{ X86::VEXPANDPSZrrk, X86::VEXPANDPSZrmk, TB_NO_REVERSE },
+ { X86::VFCMADDCPHZ128r, X86::VFCMADDCPHZ128m, 0 },
+ { X86::VFCMADDCPHZ256r, X86::VFCMADDCPHZ256m, 0 },
+ { X86::VFCMADDCPHZr, X86::VFCMADDCPHZm, 0 },
+ { X86::VFCMADDCSHZr, X86::VFCMADDCSHZm, TB_NO_REVERSE },
+ { X86::VFCMULCPHZ128rrkz, X86::VFCMULCPHZ128rmkz, 0 },
+ { X86::VFCMULCPHZ256rrkz, X86::VFCMULCPHZ256rmkz, 0 },
+ { X86::VFCMULCPHZrrkz, X86::VFCMULCPHZrmkz, 0 },
+ { X86::VFCMULCSHZrrkz, X86::VFCMULCSHZrmkz, TB_NO_REVERSE },
{ X86::VFIXUPIMMPDZ128rri, X86::VFIXUPIMMPDZ128rmi, 0 },
{ X86::VFIXUPIMMPDZ256rri, X86::VFIXUPIMMPDZ256rmi, 0 },
{ X86::VFIXUPIMMPDZrri, X86::VFIXUPIMMPDZrmi, 0 },
@@ -3170,6 +3304,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256m, 0 },
{ X86::VFMADD132PDZr, X86::VFMADD132PDZm, 0 },
{ X86::VFMADD132PDr, X86::VFMADD132PDm, 0 },
+ { X86::VFMADD132PHZ128r, X86::VFMADD132PHZ128m, 0 },
+ { X86::VFMADD132PHZ256r, X86::VFMADD132PHZ256m, 0 },
+ { X86::VFMADD132PHZr, X86::VFMADD132PHZm, 0 },
{ X86::VFMADD132PSYr, X86::VFMADD132PSYm, 0 },
{ X86::VFMADD132PSZ128r, X86::VFMADD132PSZ128m, 0 },
{ X86::VFMADD132PSZ256r, X86::VFMADD132PSZ256m, 0 },
@@ -3179,6 +3316,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADD132SDZr_Int, X86::VFMADD132SDZm_Int, TB_NO_REVERSE },
{ X86::VFMADD132SDr, X86::VFMADD132SDm, 0 },
{ X86::VFMADD132SDr_Int, X86::VFMADD132SDm_Int, TB_NO_REVERSE },
+ { X86::VFMADD132SHZr, X86::VFMADD132SHZm, 0 },
+ { X86::VFMADD132SHZr_Int, X86::VFMADD132SHZm_Int, TB_NO_REVERSE },
{ X86::VFMADD132SSZr, X86::VFMADD132SSZm, 0 },
{ X86::VFMADD132SSZr_Int, X86::VFMADD132SSZm_Int, TB_NO_REVERSE },
{ X86::VFMADD132SSr, X86::VFMADD132SSm, 0 },
@@ -3188,6 +3327,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256m, 0 },
{ X86::VFMADD213PDZr, X86::VFMADD213PDZm, 0 },
{ X86::VFMADD213PDr, X86::VFMADD213PDm, 0 },
+ { X86::VFMADD213PHZ128r, X86::VFMADD213PHZ128m, 0 },
+ { X86::VFMADD213PHZ256r, X86::VFMADD213PHZ256m, 0 },
+ { X86::VFMADD213PHZr, X86::VFMADD213PHZm, 0 },
{ X86::VFMADD213PSYr, X86::VFMADD213PSYm, 0 },
{ X86::VFMADD213PSZ128r, X86::VFMADD213PSZ128m, 0 },
{ X86::VFMADD213PSZ256r, X86::VFMADD213PSZ256m, 0 },
@@ -3197,6 +3339,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADD213SDZr_Int, X86::VFMADD213SDZm_Int, TB_NO_REVERSE },
{ X86::VFMADD213SDr, X86::VFMADD213SDm, 0 },
{ X86::VFMADD213SDr_Int, X86::VFMADD213SDm_Int, TB_NO_REVERSE },
+ { X86::VFMADD213SHZr, X86::VFMADD213SHZm, 0 },
+ { X86::VFMADD213SHZr_Int, X86::VFMADD213SHZm_Int, TB_NO_REVERSE },
{ X86::VFMADD213SSZr, X86::VFMADD213SSZm, 0 },
{ X86::VFMADD213SSZr_Int, X86::VFMADD213SSZm_Int, TB_NO_REVERSE },
{ X86::VFMADD213SSr, X86::VFMADD213SSm, 0 },
@@ -3206,6 +3350,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256m, 0 },
{ X86::VFMADD231PDZr, X86::VFMADD231PDZm, 0 },
{ X86::VFMADD231PDr, X86::VFMADD231PDm, 0 },
+ { X86::VFMADD231PHZ128r, X86::VFMADD231PHZ128m, 0 },
+ { X86::VFMADD231PHZ256r, X86::VFMADD231PHZ256m, 0 },
+ { X86::VFMADD231PHZr, X86::VFMADD231PHZm, 0 },
{ X86::VFMADD231PSYr, X86::VFMADD231PSYm, 0 },
{ X86::VFMADD231PSZ128r, X86::VFMADD231PSZ128m, 0 },
{ X86::VFMADD231PSZ256r, X86::VFMADD231PSZ256m, 0 },
@@ -3215,10 +3362,16 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADD231SDZr_Int, X86::VFMADD231SDZm_Int, TB_NO_REVERSE },
{ X86::VFMADD231SDr, X86::VFMADD231SDm, 0 },
{ X86::VFMADD231SDr_Int, X86::VFMADD231SDm_Int, TB_NO_REVERSE },
+ { X86::VFMADD231SHZr, X86::VFMADD231SHZm, 0 },
+ { X86::VFMADD231SHZr_Int, X86::VFMADD231SHZm_Int, TB_NO_REVERSE },
{ X86::VFMADD231SSZr, X86::VFMADD231SSZm, 0 },
{ X86::VFMADD231SSZr_Int, X86::VFMADD231SSZm_Int, TB_NO_REVERSE },
{ X86::VFMADD231SSr, X86::VFMADD231SSm, 0 },
{ X86::VFMADD231SSr_Int, X86::VFMADD231SSm_Int, TB_NO_REVERSE },
+ { X86::VFMADDCPHZ128r, X86::VFMADDCPHZ128m, 0 },
+ { X86::VFMADDCPHZ256r, X86::VFMADDCPHZ256m, 0 },
+ { X86::VFMADDCPHZr, X86::VFMADDCPHZm, 0 },
+ { X86::VFMADDCSHZr, X86::VFMADDCSHZm, TB_NO_REVERSE },
{ X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, 0 },
{ X86::VFMADDPD4rr, X86::VFMADDPD4rm, 0 },
{ X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, 0 },
@@ -3232,6 +3385,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADDSUB132PDZ256r, X86::VFMADDSUB132PDZ256m, 0 },
{ X86::VFMADDSUB132PDZr, X86::VFMADDSUB132PDZm, 0 },
{ X86::VFMADDSUB132PDr, X86::VFMADDSUB132PDm, 0 },
+ { X86::VFMADDSUB132PHZ128r, X86::VFMADDSUB132PHZ128m, 0 },
+ { X86::VFMADDSUB132PHZ256r, X86::VFMADDSUB132PHZ256m, 0 },
+ { X86::VFMADDSUB132PHZr, X86::VFMADDSUB132PHZm, 0 },
{ X86::VFMADDSUB132PSYr, X86::VFMADDSUB132PSYm, 0 },
{ X86::VFMADDSUB132PSZ128r, X86::VFMADDSUB132PSZ128m, 0 },
{ X86::VFMADDSUB132PSZ256r, X86::VFMADDSUB132PSZ256m, 0 },
@@ -3242,6 +3398,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADDSUB213PDZ256r, X86::VFMADDSUB213PDZ256m, 0 },
{ X86::VFMADDSUB213PDZr, X86::VFMADDSUB213PDZm, 0 },
{ X86::VFMADDSUB213PDr, X86::VFMADDSUB213PDm, 0 },
+ { X86::VFMADDSUB213PHZ128r, X86::VFMADDSUB213PHZ128m, 0 },
+ { X86::VFMADDSUB213PHZ256r, X86::VFMADDSUB213PHZ256m, 0 },
+ { X86::VFMADDSUB213PHZr, X86::VFMADDSUB213PHZm, 0 },
{ X86::VFMADDSUB213PSYr, X86::VFMADDSUB213PSYm, 0 },
{ X86::VFMADDSUB213PSZ128r, X86::VFMADDSUB213PSZ128m, 0 },
{ X86::VFMADDSUB213PSZ256r, X86::VFMADDSUB213PSZ256m, 0 },
@@ -3252,6 +3411,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMADDSUB231PDZ256r, X86::VFMADDSUB231PDZ256m, 0 },
{ X86::VFMADDSUB231PDZr, X86::VFMADDSUB231PDZm, 0 },
{ X86::VFMADDSUB231PDr, X86::VFMADDSUB231PDm, 0 },
+ { X86::VFMADDSUB231PHZ128r, X86::VFMADDSUB231PHZ128m, 0 },
+ { X86::VFMADDSUB231PHZ256r, X86::VFMADDSUB231PHZ256m, 0 },
+ { X86::VFMADDSUB231PHZr, X86::VFMADDSUB231PHZm, 0 },
{ X86::VFMADDSUB231PSYr, X86::VFMADDSUB231PSYm, 0 },
{ X86::VFMADDSUB231PSZ128r, X86::VFMADDSUB231PSZ128m, 0 },
{ X86::VFMADDSUB231PSZ256r, X86::VFMADDSUB231PSZ256m, 0 },
@@ -3266,6 +3428,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256m, 0 },
{ X86::VFMSUB132PDZr, X86::VFMSUB132PDZm, 0 },
{ X86::VFMSUB132PDr, X86::VFMSUB132PDm, 0 },
+ { X86::VFMSUB132PHZ128r, X86::VFMSUB132PHZ128m, 0 },
+ { X86::VFMSUB132PHZ256r, X86::VFMSUB132PHZ256m, 0 },
+ { X86::VFMSUB132PHZr, X86::VFMSUB132PHZm, 0 },
{ X86::VFMSUB132PSYr, X86::VFMSUB132PSYm, 0 },
{ X86::VFMSUB132PSZ128r, X86::VFMSUB132PSZ128m, 0 },
{ X86::VFMSUB132PSZ256r, X86::VFMSUB132PSZ256m, 0 },
@@ -3275,6 +3440,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUB132SDZr_Int, X86::VFMSUB132SDZm_Int, TB_NO_REVERSE },
{ X86::VFMSUB132SDr, X86::VFMSUB132SDm, 0 },
{ X86::VFMSUB132SDr_Int, X86::VFMSUB132SDm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB132SHZr, X86::VFMSUB132SHZm, 0 },
+ { X86::VFMSUB132SHZr_Int, X86::VFMSUB132SHZm_Int, TB_NO_REVERSE },
{ X86::VFMSUB132SSZr, X86::VFMSUB132SSZm, 0 },
{ X86::VFMSUB132SSZr_Int, X86::VFMSUB132SSZm_Int, TB_NO_REVERSE },
{ X86::VFMSUB132SSr, X86::VFMSUB132SSm, 0 },
@@ -3284,6 +3451,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256m, 0 },
{ X86::VFMSUB213PDZr, X86::VFMSUB213PDZm, 0 },
{ X86::VFMSUB213PDr, X86::VFMSUB213PDm, 0 },
+ { X86::VFMSUB213PHZ128r, X86::VFMSUB213PHZ128m, 0 },
+ { X86::VFMSUB213PHZ256r, X86::VFMSUB213PHZ256m, 0 },
+ { X86::VFMSUB213PHZr, X86::VFMSUB213PHZm, 0 },
{ X86::VFMSUB213PSYr, X86::VFMSUB213PSYm, 0 },
{ X86::VFMSUB213PSZ128r, X86::VFMSUB213PSZ128m, 0 },
{ X86::VFMSUB213PSZ256r, X86::VFMSUB213PSZ256m, 0 },
@@ -3293,6 +3463,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUB213SDZr_Int, X86::VFMSUB213SDZm_Int, TB_NO_REVERSE },
{ X86::VFMSUB213SDr, X86::VFMSUB213SDm, 0 },
{ X86::VFMSUB213SDr_Int, X86::VFMSUB213SDm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB213SHZr, X86::VFMSUB213SHZm, 0 },
+ { X86::VFMSUB213SHZr_Int, X86::VFMSUB213SHZm_Int, TB_NO_REVERSE },
{ X86::VFMSUB213SSZr, X86::VFMSUB213SSZm, 0 },
{ X86::VFMSUB213SSZr_Int, X86::VFMSUB213SSZm_Int, TB_NO_REVERSE },
{ X86::VFMSUB213SSr, X86::VFMSUB213SSm, 0 },
@@ -3302,6 +3474,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256m, 0 },
{ X86::VFMSUB231PDZr, X86::VFMSUB231PDZm, 0 },
{ X86::VFMSUB231PDr, X86::VFMSUB231PDm, 0 },
+ { X86::VFMSUB231PHZ128r, X86::VFMSUB231PHZ128m, 0 },
+ { X86::VFMSUB231PHZ256r, X86::VFMSUB231PHZ256m, 0 },
+ { X86::VFMSUB231PHZr, X86::VFMSUB231PHZm, 0 },
{ X86::VFMSUB231PSYr, X86::VFMSUB231PSYm, 0 },
{ X86::VFMSUB231PSZ128r, X86::VFMSUB231PSZ128m, 0 },
{ X86::VFMSUB231PSZ256r, X86::VFMSUB231PSZ256m, 0 },
@@ -3311,6 +3486,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUB231SDZr_Int, X86::VFMSUB231SDZm_Int, TB_NO_REVERSE },
{ X86::VFMSUB231SDr, X86::VFMSUB231SDm, 0 },
{ X86::VFMSUB231SDr_Int, X86::VFMSUB231SDm_Int, TB_NO_REVERSE },
+ { X86::VFMSUB231SHZr, X86::VFMSUB231SHZm, 0 },
+ { X86::VFMSUB231SHZr_Int, X86::VFMSUB231SHZm_Int, TB_NO_REVERSE },
{ X86::VFMSUB231SSZr, X86::VFMSUB231SSZm, 0 },
{ X86::VFMSUB231SSZr_Int, X86::VFMSUB231SSZm_Int, TB_NO_REVERSE },
{ X86::VFMSUB231SSr, X86::VFMSUB231SSm, 0 },
@@ -3320,6 +3497,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUBADD132PDZ256r, X86::VFMSUBADD132PDZ256m, 0 },
{ X86::VFMSUBADD132PDZr, X86::VFMSUBADD132PDZm, 0 },
{ X86::VFMSUBADD132PDr, X86::VFMSUBADD132PDm, 0 },
+ { X86::VFMSUBADD132PHZ128r, X86::VFMSUBADD132PHZ128m, 0 },
+ { X86::VFMSUBADD132PHZ256r, X86::VFMSUBADD132PHZ256m, 0 },
+ { X86::VFMSUBADD132PHZr, X86::VFMSUBADD132PHZm, 0 },
{ X86::VFMSUBADD132PSYr, X86::VFMSUBADD132PSYm, 0 },
{ X86::VFMSUBADD132PSZ128r, X86::VFMSUBADD132PSZ128m, 0 },
{ X86::VFMSUBADD132PSZ256r, X86::VFMSUBADD132PSZ256m, 0 },
@@ -3330,6 +3510,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUBADD213PDZ256r, X86::VFMSUBADD213PDZ256m, 0 },
{ X86::VFMSUBADD213PDZr, X86::VFMSUBADD213PDZm, 0 },
{ X86::VFMSUBADD213PDr, X86::VFMSUBADD213PDm, 0 },
+ { X86::VFMSUBADD213PHZ128r, X86::VFMSUBADD213PHZ128m, 0 },
+ { X86::VFMSUBADD213PHZ256r, X86::VFMSUBADD213PHZ256m, 0 },
+ { X86::VFMSUBADD213PHZr, X86::VFMSUBADD213PHZm, 0 },
{ X86::VFMSUBADD213PSYr, X86::VFMSUBADD213PSYm, 0 },
{ X86::VFMSUBADD213PSZ128r, X86::VFMSUBADD213PSZ128m, 0 },
{ X86::VFMSUBADD213PSZ256r, X86::VFMSUBADD213PSZ256m, 0 },
@@ -3340,6 +3523,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUBADD231PDZ256r, X86::VFMSUBADD231PDZ256m, 0 },
{ X86::VFMSUBADD231PDZr, X86::VFMSUBADD231PDZm, 0 },
{ X86::VFMSUBADD231PDr, X86::VFMSUBADD231PDm, 0 },
+ { X86::VFMSUBADD231PHZ128r, X86::VFMSUBADD231PHZ128m, 0 },
+ { X86::VFMSUBADD231PHZ256r, X86::VFMSUBADD231PHZ256m, 0 },
+ { X86::VFMSUBADD231PHZr, X86::VFMSUBADD231PHZm, 0 },
{ X86::VFMSUBADD231PSYr, X86::VFMSUBADD231PSYm, 0 },
{ X86::VFMSUBADD231PSZ128r, X86::VFMSUBADD231PSZ128m, 0 },
{ X86::VFMSUBADD231PSZ256r, X86::VFMSUBADD231PSZ256m, 0 },
@@ -3357,11 +3543,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE },
{ X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, 0 },
{ X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE },
+ { X86::VFMULCPHZ128rrkz, X86::VFMULCPHZ128rmkz, 0 },
+ { X86::VFMULCPHZ256rrkz, X86::VFMULCPHZ256rmkz, 0 },
+ { X86::VFMULCPHZrrkz, X86::VFMULCPHZrmkz, 0 },
+ { X86::VFMULCSHZrrkz, X86::VFMULCSHZrmkz, TB_NO_REVERSE },
{ X86::VFNMADD132PDYr, X86::VFNMADD132PDYm, 0 },
{ X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128m, 0 },
{ X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256m, 0 },
{ X86::VFNMADD132PDZr, X86::VFNMADD132PDZm, 0 },
{ X86::VFNMADD132PDr, X86::VFNMADD132PDm, 0 },
+ { X86::VFNMADD132PHZ128r, X86::VFNMADD132PHZ128m, 0 },
+ { X86::VFNMADD132PHZ256r, X86::VFNMADD132PHZ256m, 0 },
+ { X86::VFNMADD132PHZr, X86::VFNMADD132PHZm, 0 },
{ X86::VFNMADD132PSYr, X86::VFNMADD132PSYm, 0 },
{ X86::VFNMADD132PSZ128r, X86::VFNMADD132PSZ128m, 0 },
{ X86::VFNMADD132PSZ256r, X86::VFNMADD132PSZ256m, 0 },
@@ -3371,6 +3564,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMADD132SDZr_Int, X86::VFNMADD132SDZm_Int, TB_NO_REVERSE },
{ X86::VFNMADD132SDr, X86::VFNMADD132SDm, 0 },
{ X86::VFNMADD132SDr_Int, X86::VFNMADD132SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD132SHZr, X86::VFNMADD132SHZm, 0 },
+ { X86::VFNMADD132SHZr_Int, X86::VFNMADD132SHZm_Int, TB_NO_REVERSE },
{ X86::VFNMADD132SSZr, X86::VFNMADD132SSZm, 0 },
{ X86::VFNMADD132SSZr_Int, X86::VFNMADD132SSZm_Int, TB_NO_REVERSE },
{ X86::VFNMADD132SSr, X86::VFNMADD132SSm, 0 },
@@ -3380,6 +3575,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256m, 0 },
{ X86::VFNMADD213PDZr, X86::VFNMADD213PDZm, 0 },
{ X86::VFNMADD213PDr, X86::VFNMADD213PDm, 0 },
+ { X86::VFNMADD213PHZ128r, X86::VFNMADD213PHZ128m, 0 },
+ { X86::VFNMADD213PHZ256r, X86::VFNMADD213PHZ256m, 0 },
+ { X86::VFNMADD213PHZr, X86::VFNMADD213PHZm, 0 },
{ X86::VFNMADD213PSYr, X86::VFNMADD213PSYm, 0 },
{ X86::VFNMADD213PSZ128r, X86::VFNMADD213PSZ128m, 0 },
{ X86::VFNMADD213PSZ256r, X86::VFNMADD213PSZ256m, 0 },
@@ -3389,6 +3587,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMADD213SDZr_Int, X86::VFNMADD213SDZm_Int, TB_NO_REVERSE },
{ X86::VFNMADD213SDr, X86::VFNMADD213SDm, 0 },
{ X86::VFNMADD213SDr_Int, X86::VFNMADD213SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD213SHZr, X86::VFNMADD213SHZm, 0 },
+ { X86::VFNMADD213SHZr_Int, X86::VFNMADD213SHZm_Int, TB_NO_REVERSE },
{ X86::VFNMADD213SSZr, X86::VFNMADD213SSZm, 0 },
{ X86::VFNMADD213SSZr_Int, X86::VFNMADD213SSZm_Int, TB_NO_REVERSE },
{ X86::VFNMADD213SSr, X86::VFNMADD213SSm, 0 },
@@ -3398,6 +3598,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256m, 0 },
{ X86::VFNMADD231PDZr, X86::VFNMADD231PDZm, 0 },
{ X86::VFNMADD231PDr, X86::VFNMADD231PDm, 0 },
+ { X86::VFNMADD231PHZ128r, X86::VFNMADD231PHZ128m, 0 },
+ { X86::VFNMADD231PHZ256r, X86::VFNMADD231PHZ256m, 0 },
+ { X86::VFNMADD231PHZr, X86::VFNMADD231PHZm, 0 },
{ X86::VFNMADD231PSYr, X86::VFNMADD231PSYm, 0 },
{ X86::VFNMADD231PSZ128r, X86::VFNMADD231PSZ128m, 0 },
{ X86::VFNMADD231PSZ256r, X86::VFNMADD231PSZ256m, 0 },
@@ -3407,6 +3610,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMADD231SDZr_Int, X86::VFNMADD231SDZm_Int, TB_NO_REVERSE },
{ X86::VFNMADD231SDr, X86::VFNMADD231SDm, 0 },
{ X86::VFNMADD231SDr_Int, X86::VFNMADD231SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMADD231SHZr, X86::VFNMADD231SHZm, 0 },
+ { X86::VFNMADD231SHZr_Int, X86::VFNMADD231SHZm_Int, TB_NO_REVERSE },
{ X86::VFNMADD231SSZr, X86::VFNMADD231SSZm, 0 },
{ X86::VFNMADD231SSZr_Int, X86::VFNMADD231SSZm_Int, TB_NO_REVERSE },
{ X86::VFNMADD231SSr, X86::VFNMADD231SSm, 0 },
@@ -3424,6 +3629,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256m, 0 },
{ X86::VFNMSUB132PDZr, X86::VFNMSUB132PDZm, 0 },
{ X86::VFNMSUB132PDr, X86::VFNMSUB132PDm, 0 },
+ { X86::VFNMSUB132PHZ128r, X86::VFNMSUB132PHZ128m, 0 },
+ { X86::VFNMSUB132PHZ256r, X86::VFNMSUB132PHZ256m, 0 },
+ { X86::VFNMSUB132PHZr, X86::VFNMSUB132PHZm, 0 },
{ X86::VFNMSUB132PSYr, X86::VFNMSUB132PSYm, 0 },
{ X86::VFNMSUB132PSZ128r, X86::VFNMSUB132PSZ128m, 0 },
{ X86::VFNMSUB132PSZ256r, X86::VFNMSUB132PSZ256m, 0 },
@@ -3433,6 +3641,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMSUB132SDZr_Int, X86::VFNMSUB132SDZm_Int, TB_NO_REVERSE },
{ X86::VFNMSUB132SDr, X86::VFNMSUB132SDm, 0 },
{ X86::VFNMSUB132SDr_Int, X86::VFNMSUB132SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB132SHZr, X86::VFNMSUB132SHZm, 0 },
+ { X86::VFNMSUB132SHZr_Int, X86::VFNMSUB132SHZm_Int, TB_NO_REVERSE },
{ X86::VFNMSUB132SSZr, X86::VFNMSUB132SSZm, 0 },
{ X86::VFNMSUB132SSZr_Int, X86::VFNMSUB132SSZm_Int, TB_NO_REVERSE },
{ X86::VFNMSUB132SSr, X86::VFNMSUB132SSm, 0 },
@@ -3442,6 +3652,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256m, 0 },
{ X86::VFNMSUB213PDZr, X86::VFNMSUB213PDZm, 0 },
{ X86::VFNMSUB213PDr, X86::VFNMSUB213PDm, 0 },
+ { X86::VFNMSUB213PHZ128r, X86::VFNMSUB213PHZ128m, 0 },
+ { X86::VFNMSUB213PHZ256r, X86::VFNMSUB213PHZ256m, 0 },
+ { X86::VFNMSUB213PHZr, X86::VFNMSUB213PHZm, 0 },
{ X86::VFNMSUB213PSYr, X86::VFNMSUB213PSYm, 0 },
{ X86::VFNMSUB213PSZ128r, X86::VFNMSUB213PSZ128m, 0 },
{ X86::VFNMSUB213PSZ256r, X86::VFNMSUB213PSZ256m, 0 },
@@ -3451,6 +3664,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMSUB213SDZr_Int, X86::VFNMSUB213SDZm_Int, TB_NO_REVERSE },
{ X86::VFNMSUB213SDr, X86::VFNMSUB213SDm, 0 },
{ X86::VFNMSUB213SDr_Int, X86::VFNMSUB213SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB213SHZr, X86::VFNMSUB213SHZm, 0 },
+ { X86::VFNMSUB213SHZr_Int, X86::VFNMSUB213SHZm_Int, TB_NO_REVERSE },
{ X86::VFNMSUB213SSZr, X86::VFNMSUB213SSZm, 0 },
{ X86::VFNMSUB213SSZr_Int, X86::VFNMSUB213SSZm_Int, TB_NO_REVERSE },
{ X86::VFNMSUB213SSr, X86::VFNMSUB213SSm, 0 },
@@ -3460,6 +3675,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256m, 0 },
{ X86::VFNMSUB231PDZr, X86::VFNMSUB231PDZm, 0 },
{ X86::VFNMSUB231PDr, X86::VFNMSUB231PDm, 0 },
+ { X86::VFNMSUB231PHZ128r, X86::VFNMSUB231PHZ128m, 0 },
+ { X86::VFNMSUB231PHZ256r, X86::VFNMSUB231PHZ256m, 0 },
+ { X86::VFNMSUB231PHZr, X86::VFNMSUB231PHZm, 0 },
{ X86::VFNMSUB231PSYr, X86::VFNMSUB231PSYm, 0 },
{ X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128m, 0 },
{ X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256m, 0 },
@@ -3469,6 +3687,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFNMSUB231SDZr_Int, X86::VFNMSUB231SDZm_Int, TB_NO_REVERSE },
{ X86::VFNMSUB231SDr, X86::VFNMSUB231SDm, 0 },
{ X86::VFNMSUB231SDr_Int, X86::VFNMSUB231SDm_Int, TB_NO_REVERSE },
+ { X86::VFNMSUB231SHZr, X86::VFNMSUB231SHZm, 0 },
+ { X86::VFNMSUB231SHZr_Int, X86::VFNMSUB231SHZm_Int, TB_NO_REVERSE },
{ X86::VFNMSUB231SSZr, X86::VFNMSUB231SSZm, 0 },
{ X86::VFNMSUB231SSZr_Int, X86::VFNMSUB231SSZm_Int, TB_NO_REVERSE },
{ X86::VFNMSUB231SSr, X86::VFNMSUB231SSm, 0 },
@@ -3484,18 +3704,26 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VGETEXPPDZ128rk, X86::VGETEXPPDZ128mk, 0 },
{ X86::VGETEXPPDZ256rk, X86::VGETEXPPDZ256mk, 0 },
{ X86::VGETEXPPDZrk, X86::VGETEXPPDZmk, 0 },
+ { X86::VGETEXPPHZ128rk, X86::VGETEXPPHZ128mk, 0 },
+ { X86::VGETEXPPHZ256rk, X86::VGETEXPPHZ256mk, 0 },
+ { X86::VGETEXPPHZrk, X86::VGETEXPPHZmk, 0 },
{ X86::VGETEXPPSZ128rk, X86::VGETEXPPSZ128mk, 0 },
{ X86::VGETEXPPSZ256rk, X86::VGETEXPPSZ256mk, 0 },
{ X86::VGETEXPPSZrk, X86::VGETEXPPSZmk, 0 },
{ X86::VGETEXPSDZrkz, X86::VGETEXPSDZmkz, TB_NO_REVERSE },
+ { X86::VGETEXPSHZrkz, X86::VGETEXPSHZmkz, TB_NO_REVERSE },
{ X86::VGETEXPSSZrkz, X86::VGETEXPSSZmkz, TB_NO_REVERSE },
{ X86::VGETMANTPDZ128rrik, X86::VGETMANTPDZ128rmik, 0 },
{ X86::VGETMANTPDZ256rrik, X86::VGETMANTPDZ256rmik, 0 },
{ X86::VGETMANTPDZrrik, X86::VGETMANTPDZrmik, 0 },
+ { X86::VGETMANTPHZ128rrik, X86::VGETMANTPHZ128rmik, 0 },
+ { X86::VGETMANTPHZ256rrik, X86::VGETMANTPHZ256rmik, 0 },
+ { X86::VGETMANTPHZrrik, X86::VGETMANTPHZrmik, 0 },
{ X86::VGETMANTPSZ128rrik, X86::VGETMANTPSZ128rmik, 0 },
{ X86::VGETMANTPSZ256rrik, X86::VGETMANTPSZ256rmik, 0 },
{ X86::VGETMANTPSZrrik, X86::VGETMANTPSZrmik, 0 },
{ X86::VGETMANTSDZrrikz, X86::VGETMANTSDZrmikz, TB_NO_REVERSE },
+ { X86::VGETMANTSHZrrikz, X86::VGETMANTSHZrmikz, TB_NO_REVERSE },
{ X86::VGETMANTSSZrrikz, X86::VGETMANTSSZrmikz, TB_NO_REVERSE },
{ X86::VGF2P8AFFINEINVQBZ128rrikz, X86::VGF2P8AFFINEINVQBZ128rmikz, 0 },
{ X86::VGF2P8AFFINEINVQBZ256rrikz, X86::VGF2P8AFFINEINVQBZ256rmikz, 0 },
@@ -3521,30 +3749,44 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 },
{ X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 },
{ X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 },
+ { X86::VMAXCPHZ128rrkz, X86::VMAXCPHZ128rmkz, 0 },
+ { X86::VMAXCPHZ256rrkz, X86::VMAXCPHZ256rmkz, 0 },
+ { X86::VMAXCPHZrrkz, X86::VMAXCPHZrmkz, 0 },
{ X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 },
{ X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 },
{ X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 },
{ X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 },
{ X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 },
{ X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 },
+ { X86::VMAXPHZ128rrkz, X86::VMAXPHZ128rmkz, 0 },
+ { X86::VMAXPHZ256rrkz, X86::VMAXPHZ256rmkz, 0 },
+ { X86::VMAXPHZrrkz, X86::VMAXPHZrmkz, 0 },
{ X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 },
{ X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 },
{ X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
{ X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMAXSHZrr_Intkz, X86::VMAXSHZrm_Intkz, TB_NO_REVERSE },
{ X86::VMAXSSZrr_Intkz, X86::VMAXSSZrm_Intkz, TB_NO_REVERSE },
{ X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 },
{ X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 },
{ X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 },
+ { X86::VMINCPHZ128rrkz, X86::VMINCPHZ128rmkz, 0 },
+ { X86::VMINCPHZ256rrkz, X86::VMINCPHZ256rmkz, 0 },
+ { X86::VMINCPHZrrkz, X86::VMINCPHZrmkz, 0 },
{ X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 },
{ X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 },
{ X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 },
{ X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 },
{ X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 },
{ X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
+ { X86::VMINPHZ128rrkz, X86::VMINPHZ128rmkz, 0 },
+ { X86::VMINPHZ256rrkz, X86::VMINPHZ256rmkz, 0 },
+ { X86::VMINPHZrrkz, X86::VMINPHZrmkz, 0 },
{ X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 },
{ X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 },
{ X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
{ X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMINSHZrr_Intkz, X86::VMINSHZrm_Intkz, TB_NO_REVERSE },
{ X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, TB_NO_REVERSE },
{ X86::VMOVAPDZ128rrk, X86::VMOVAPDZ128rmk, TB_NO_REVERSE | TB_ALIGN_16 },
{ X86::VMOVAPDZ256rrk, X86::VMOVAPDZ256rmk, TB_NO_REVERSE | TB_ALIGN_32 },
@@ -3588,10 +3830,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 },
{ X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 },
{ X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
+ { X86::VMULPHZ128rrkz, X86::VMULPHZ128rmkz, 0 },
+ { X86::VMULPHZ256rrkz, X86::VMULPHZ256rmkz, 0 },
+ { X86::VMULPHZrrkz, X86::VMULPHZrmkz, 0 },
{ X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
{ X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
{ X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
{ X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMULSHZrr_Intkz, X86::VMULSHZrm_Intkz, TB_NO_REVERSE },
{ X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE },
{ X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 },
{ X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 },
@@ -4258,21 +4504,33 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VRCP28PSZrk, X86::VRCP28PSZmk, 0 },
{ X86::VRCP28SDZrkz, X86::VRCP28SDZmkz, TB_NO_REVERSE },
{ X86::VRCP28SSZrkz, X86::VRCP28SSZmkz, TB_NO_REVERSE },
+ { X86::VRCPPHZ128rk, X86::VRCPPHZ128mk, 0 },
+ { X86::VRCPPHZ256rk, X86::VRCPPHZ256mk, 0 },
+ { X86::VRCPPHZrk, X86::VRCPPHZmk, 0 },
+ { X86::VRCPSHZrrkz, X86::VRCPSHZrmkz, TB_NO_REVERSE },
{ X86::VREDUCEPDZ128rrik, X86::VREDUCEPDZ128rmik, 0 },
{ X86::VREDUCEPDZ256rrik, X86::VREDUCEPDZ256rmik, 0 },
{ X86::VREDUCEPDZrrik, X86::VREDUCEPDZrmik, 0 },
+ { X86::VREDUCEPHZ128rrik, X86::VREDUCEPHZ128rmik, 0 },
+ { X86::VREDUCEPHZ256rrik, X86::VREDUCEPHZ256rmik, 0 },
+ { X86::VREDUCEPHZrrik, X86::VREDUCEPHZrmik, 0 },
{ X86::VREDUCEPSZ128rrik, X86::VREDUCEPSZ128rmik, 0 },
{ X86::VREDUCEPSZ256rrik, X86::VREDUCEPSZ256rmik, 0 },
{ X86::VREDUCEPSZrrik, X86::VREDUCEPSZrmik, 0 },
{ X86::VREDUCESDZrrikz, X86::VREDUCESDZrmikz, TB_NO_REVERSE },
+ { X86::VREDUCESHZrrikz, X86::VREDUCESHZrmikz, TB_NO_REVERSE },
{ X86::VREDUCESSZrrikz, X86::VREDUCESSZrmikz, TB_NO_REVERSE },
{ X86::VRNDSCALEPDZ128rrik, X86::VRNDSCALEPDZ128rmik, 0 },
{ X86::VRNDSCALEPDZ256rrik, X86::VRNDSCALEPDZ256rmik, 0 },
{ X86::VRNDSCALEPDZrrik, X86::VRNDSCALEPDZrmik, 0 },
+ { X86::VRNDSCALEPHZ128rrik, X86::VRNDSCALEPHZ128rmik, 0 },
+ { X86::VRNDSCALEPHZ256rrik, X86::VRNDSCALEPHZ256rmik, 0 },
+ { X86::VRNDSCALEPHZrrik, X86::VRNDSCALEPHZrmik, 0 },
{ X86::VRNDSCALEPSZ128rrik, X86::VRNDSCALEPSZ128rmik, 0 },
{ X86::VRNDSCALEPSZ256rrik, X86::VRNDSCALEPSZ256rmik, 0 },
{ X86::VRNDSCALEPSZrrik, X86::VRNDSCALEPSZrmik, 0 },
{ X86::VRNDSCALESDZr_Intkz, X86::VRNDSCALESDZm_Intkz, TB_NO_REVERSE },
+ { X86::VRNDSCALESHZr_Intkz, X86::VRNDSCALESHZm_Intkz, TB_NO_REVERSE },
{ X86::VRNDSCALESSZr_Intkz, X86::VRNDSCALESSZm_Intkz, TB_NO_REVERSE },
{ X86::VRSQRT14PDZ128rk, X86::VRSQRT14PDZ128mk, 0 },
{ X86::VRSQRT14PDZ256rk, X86::VRSQRT14PDZ256mk, 0 },
@@ -4286,13 +4544,21 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VRSQRT28PSZrk, X86::VRSQRT28PSZmk, 0 },
{ X86::VRSQRT28SDZrkz, X86::VRSQRT28SDZmkz, TB_NO_REVERSE },
{ X86::VRSQRT28SSZrkz, X86::VRSQRT28SSZmkz, TB_NO_REVERSE },
+ { X86::VRSQRTPHZ128rk, X86::VRSQRTPHZ128mk, 0 },
+ { X86::VRSQRTPHZ256rk, X86::VRSQRTPHZ256mk, 0 },
+ { X86::VRSQRTPHZrk, X86::VRSQRTPHZmk, 0 },
+ { X86::VRSQRTSHZrrkz, X86::VRSQRTSHZrmkz, TB_NO_REVERSE },
{ X86::VSCALEFPDZ128rrkz, X86::VSCALEFPDZ128rmkz, 0 },
{ X86::VSCALEFPDZ256rrkz, X86::VSCALEFPDZ256rmkz, 0 },
{ X86::VSCALEFPDZrrkz, X86::VSCALEFPDZrmkz, 0 },
+ { X86::VSCALEFPHZ128rrkz, X86::VSCALEFPHZ128rmkz, 0 },
+ { X86::VSCALEFPHZ256rrkz, X86::VSCALEFPHZ256rmkz, 0 },
+ { X86::VSCALEFPHZrrkz, X86::VSCALEFPHZrmkz, 0 },
{ X86::VSCALEFPSZ128rrkz, X86::VSCALEFPSZ128rmkz, 0 },
{ X86::VSCALEFPSZ256rrkz, X86::VSCALEFPSZ256rmkz, 0 },
{ X86::VSCALEFPSZrrkz, X86::VSCALEFPSZrmkz, 0 },
{ X86::VSCALEFSDZrrkz, X86::VSCALEFSDZrmkz, TB_NO_REVERSE },
+ { X86::VSCALEFSHZrrkz, X86::VSCALEFSHZrmkz, TB_NO_REVERSE },
{ X86::VSCALEFSSZrrkz, X86::VSCALEFSSZrmkz, TB_NO_REVERSE },
{ X86::VSHUFF32X4Z256rrikz, X86::VSHUFF32X4Z256rmikz, 0 },
{ X86::VSHUFF32X4Zrrikz, X86::VSHUFF32X4Zrmikz, 0 },
@@ -4311,18 +4577,26 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VSQRTPDZ128rk, X86::VSQRTPDZ128mk, 0 },
{ X86::VSQRTPDZ256rk, X86::VSQRTPDZ256mk, 0 },
{ X86::VSQRTPDZrk, X86::VSQRTPDZmk, 0 },
+ { X86::VSQRTPHZ128rk, X86::VSQRTPHZ128mk, 0 },
+ { X86::VSQRTPHZ256rk, X86::VSQRTPHZ256mk, 0 },
+ { X86::VSQRTPHZrk, X86::VSQRTPHZmk, 0 },
{ X86::VSQRTPSZ128rk, X86::VSQRTPSZ128mk, 0 },
{ X86::VSQRTPSZ256rk, X86::VSQRTPSZ256mk, 0 },
{ X86::VSQRTPSZrk, X86::VSQRTPSZmk, 0 },
{ X86::VSQRTSDZr_Intkz, X86::VSQRTSDZm_Intkz, TB_NO_REVERSE },
+ { X86::VSQRTSHZr_Intkz, X86::VSQRTSHZm_Intkz, TB_NO_REVERSE },
{ X86::VSQRTSSZr_Intkz, X86::VSQRTSSZm_Intkz, TB_NO_REVERSE },
{ X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
{ X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
{ X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
+ { X86::VSUBPHZ128rrkz, X86::VSUBPHZ128rmkz, 0 },
+ { X86::VSUBPHZ256rrkz, X86::VSUBPHZ256rmkz, 0 },
+ { X86::VSUBPHZrrkz, X86::VSUBPHZrmkz, 0 },
{ X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
{ X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
{ X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
{ X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VSUBSHZrr_Intkz, X86::VSUBSHZrm_Intkz, TB_NO_REVERSE },
{ X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE },
{ X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 },
{ X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 },
@@ -4348,10 +4622,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 },
{ X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 },
{ X86::VADDPDZrrk, X86::VADDPDZrmk, 0 },
+ { X86::VADDPHZ128rrk, X86::VADDPHZ128rmk, 0 },
+ { X86::VADDPHZ256rrk, X86::VADDPHZ256rmk, 0 },
+ { X86::VADDPHZrrk, X86::VADDPHZrmk, 0 },
{ X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 },
{ X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 },
{ X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
{ X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VADDSHZrr_Intk, X86::VADDSHZrm_Intk, TB_NO_REVERSE },
{ X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE },
{ X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 },
{ X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 },
@@ -4374,18 +4652,26 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VCVTNE2PS2BF16Z128rrk, X86::VCVTNE2PS2BF16Z128rmk, 0 },
{ X86::VCVTNE2PS2BF16Z256rrk, X86::VCVTNE2PS2BF16Z256rmk, 0 },
{ X86::VCVTNE2PS2BF16Zrrk, X86::VCVTNE2PS2BF16Zrmk, 0 },
+ { X86::VCVTSD2SHZrr_Intk, X86::VCVTSD2SHZrm_Intk, TB_NO_REVERSE },
{ X86::VCVTSD2SSZrr_Intk, X86::VCVTSD2SSZrm_Intk, TB_NO_REVERSE },
+ { X86::VCVTSH2SDZrr_Intk, X86::VCVTSH2SDZrm_Intk, TB_NO_REVERSE },
+ { X86::VCVTSH2SSZrr_Intk, X86::VCVTSH2SSZrm_Intk, TB_NO_REVERSE },
{ X86::VCVTSS2SDZrr_Intk, X86::VCVTSS2SDZrm_Intk, TB_NO_REVERSE },
+ { X86::VCVTSS2SHZrr_Intk, X86::VCVTSS2SHZrm_Intk, TB_NO_REVERSE },
{ X86::VDBPSADBWZ128rrik, X86::VDBPSADBWZ128rmik, 0 },
{ X86::VDBPSADBWZ256rrik, X86::VDBPSADBWZ256rmik, 0 },
{ X86::VDBPSADBWZrrik, X86::VDBPSADBWZrmik, 0 },
{ X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 },
{ X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 },
{ X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 },
+ { X86::VDIVPHZ128rrk, X86::VDIVPHZ128rmk, 0 },
+ { X86::VDIVPHZ256rrk, X86::VDIVPHZ256rmk, 0 },
+ { X86::VDIVPHZrrk, X86::VDIVPHZrmk, 0 },
{ X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 },
{ X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 },
{ X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
{ X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VDIVSHZrr_Intk, X86::VDIVSHZrm_Intk, TB_NO_REVERSE },
{ X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE },
{ X86::VDPBF16PSZ128rk, X86::VDPBF16PSZ128mk, 0 },
{ X86::VDPBF16PSZ128rkz, X86::VDPBF16PSZ128mkz, 0 },
@@ -4393,6 +4679,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VDPBF16PSZ256rkz, X86::VDPBF16PSZ256mkz, 0 },
{ X86::VDPBF16PSZrk, X86::VDPBF16PSZmk, 0 },
{ X86::VDPBF16PSZrkz, X86::VDPBF16PSZmkz, 0 },
+ { X86::VFCMADDCPHZ128rk, X86::VFCMADDCPHZ128mk, 0 },
+ { X86::VFCMADDCPHZ128rkz, X86::VFCMADDCPHZ128mkz, 0 },
+ { X86::VFCMADDCPHZ256rk, X86::VFCMADDCPHZ256mk, 0 },
+ { X86::VFCMADDCPHZ256rkz, X86::VFCMADDCPHZ256mkz, 0 },
+ { X86::VFCMADDCPHZrk, X86::VFCMADDCPHZmk, 0 },
+ { X86::VFCMADDCPHZrkz, X86::VFCMADDCPHZmkz, 0 },
+ { X86::VFCMADDCSHZrk, X86::VFCMADDCSHZmk, TB_NO_REVERSE },
+ { X86::VFCMADDCSHZrkz, X86::VFCMADDCSHZmkz, TB_NO_REVERSE },
+ { X86::VFCMULCPHZ128rrk, X86::VFCMULCPHZ128rmk, 0 },
+ { X86::VFCMULCPHZ256rrk, X86::VFCMULCPHZ256rmk, 0 },
+ { X86::VFCMULCPHZrrk, X86::VFCMULCPHZrmk, 0 },
+ { X86::VFCMULCSHZrrk, X86::VFCMULCSHZrmk, TB_NO_REVERSE },
{ X86::VFIXUPIMMPDZ128rrik, X86::VFIXUPIMMPDZ128rmik, 0 },
{ X86::VFIXUPIMMPDZ128rrikz, X86::VFIXUPIMMPDZ128rmikz, 0 },
{ X86::VFIXUPIMMPDZ256rrik, X86::VFIXUPIMMPDZ256rmik, 0 },
@@ -4415,6 +4713,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMADD132PDZ256rkz, X86::VFMADD132PDZ256mkz, 0 },
{ X86::VFMADD132PDZrk, X86::VFMADD132PDZmk, 0 },
{ X86::VFMADD132PDZrkz, X86::VFMADD132PDZmkz, 0 },
+ { X86::VFMADD132PHZ128rk, X86::VFMADD132PHZ128mk, 0 },
+ { X86::VFMADD132PHZ128rkz, X86::VFMADD132PHZ128mkz, 0 },
+ { X86::VFMADD132PHZ256rk, X86::VFMADD132PHZ256mk, 0 },
+ { X86::VFMADD132PHZ256rkz, X86::VFMADD132PHZ256mkz, 0 },
+ { X86::VFMADD132PHZrk, X86::VFMADD132PHZmk, 0 },
+ { X86::VFMADD132PHZrkz, X86::VFMADD132PHZmkz, 0 },
{ X86::VFMADD132PSZ128rk, X86::VFMADD132PSZ128mk, 0 },
{ X86::VFMADD132PSZ128rkz, X86::VFMADD132PSZ128mkz, 0 },
{ X86::VFMADD132PSZ256rk, X86::VFMADD132PSZ256mk, 0 },
@@ -4423,6 +4727,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMADD132PSZrkz, X86::VFMADD132PSZmkz, 0 },
{ X86::VFMADD132SDZr_Intk, X86::VFMADD132SDZm_Intk, TB_NO_REVERSE },
{ X86::VFMADD132SDZr_Intkz, X86::VFMADD132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD132SHZr_Intk, X86::VFMADD132SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD132SHZr_Intkz, X86::VFMADD132SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFMADD132SSZr_Intk, X86::VFMADD132SSZm_Intk, TB_NO_REVERSE },
{ X86::VFMADD132SSZr_Intkz, X86::VFMADD132SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFMADD213PDZ128rk, X86::VFMADD213PDZ128mk, 0 },
@@ -4431,6 +4737,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMADD213PDZ256rkz, X86::VFMADD213PDZ256mkz, 0 },
{ X86::VFMADD213PDZrk, X86::VFMADD213PDZmk, 0 },
{ X86::VFMADD213PDZrkz, X86::VFMADD213PDZmkz, 0 },
+ { X86::VFMADD213PHZ128rk, X86::VFMADD213PHZ128mk, 0 },
+ { X86::VFMADD213PHZ128rkz, X86::VFMADD213PHZ128mkz, 0 },
+ { X86::VFMADD213PHZ256rk, X86::VFMADD213PHZ256mk, 0 },
+ { X86::VFMADD213PHZ256rkz, X86::VFMADD213PHZ256mkz, 0 },
+ { X86::VFMADD213PHZrk, X86::VFMADD213PHZmk, 0 },
+ { X86::VFMADD213PHZrkz, X86::VFMADD213PHZmkz, 0 },
{ X86::VFMADD213PSZ128rk, X86::VFMADD213PSZ128mk, 0 },
{ X86::VFMADD213PSZ128rkz, X86::VFMADD213PSZ128mkz, 0 },
{ X86::VFMADD213PSZ256rk, X86::VFMADD213PSZ256mk, 0 },
@@ -4439,6 +4751,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMADD213PSZrkz, X86::VFMADD213PSZmkz, 0 },
{ X86::VFMADD213SDZr_Intk, X86::VFMADD213SDZm_Intk, TB_NO_REVERSE },
{ X86::VFMADD213SDZr_Intkz, X86::VFMADD213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD213SHZr_Intk, X86::VFMADD213SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD213SHZr_Intkz, X86::VFMADD213SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFMADD213SSZr_Intk, X86::VFMADD213SSZm_Intk, TB_NO_REVERSE },
{ X86::VFMADD213SSZr_Intkz, X86::VFMADD213SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFMADD231PDZ128rk, X86::VFMADD231PDZ128mk, 0 },
@@ -4447,6 +4761,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMADD231PDZ256rkz, X86::VFMADD231PDZ256mkz, 0 },
{ X86::VFMADD231PDZrk, X86::VFMADD231PDZmk, 0 },
{ X86::VFMADD231PDZrkz, X86::VFMADD231PDZmkz, 0 },
+ { X86::VFMADD231PHZ128rk, X86::VFMADD231PHZ128mk, 0 },
+ { X86::VFMADD231PHZ128rkz, X86::VFMADD231PHZ128mkz, 0 },
+ { X86::VFMADD231PHZ256rk, X86::VFMADD231PHZ256mk, 0 },
+ { X86::VFMADD231PHZ256rkz, X86::VFMADD231PHZ256mkz, 0 },
+ { X86::VFMADD231PHZrk, X86::VFMADD231PHZmk, 0 },
+ { X86::VFMADD231PHZrkz, X86::VFMADD231PHZmkz, 0 },
{ X86::VFMADD231PSZ128rk, X86::VFMADD231PSZ128mk, 0 },
{ X86::VFMADD231PSZ128rkz, X86::VFMADD231PSZ128mkz, 0 },
{ X86::VFMADD231PSZ256rk, X86::VFMADD231PSZ256mk, 0 },
@@ -4455,14 +4775,30 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMADD231PSZrkz, X86::VFMADD231PSZmkz, 0 },
{ X86::VFMADD231SDZr_Intk, X86::VFMADD231SDZm_Intk, TB_NO_REVERSE },
{ X86::VFMADD231SDZr_Intkz, X86::VFMADD231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADD231SHZr_Intk, X86::VFMADD231SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFMADD231SHZr_Intkz, X86::VFMADD231SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFMADD231SSZr_Intk, X86::VFMADD231SSZm_Intk, TB_NO_REVERSE },
{ X86::VFMADD231SSZr_Intkz, X86::VFMADD231SSZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMADDCPHZ128rk, X86::VFMADDCPHZ128mk, 0 },
+ { X86::VFMADDCPHZ128rkz, X86::VFMADDCPHZ128mkz, 0 },
+ { X86::VFMADDCPHZ256rk, X86::VFMADDCPHZ256mk, 0 },
+ { X86::VFMADDCPHZ256rkz, X86::VFMADDCPHZ256mkz, 0 },
+ { X86::VFMADDCPHZrk, X86::VFMADDCPHZmk, 0 },
+ { X86::VFMADDCPHZrkz, X86::VFMADDCPHZmkz, 0 },
+ { X86::VFMADDCSHZrk, X86::VFMADDCSHZmk, TB_NO_REVERSE },
+ { X86::VFMADDCSHZrkz, X86::VFMADDCSHZmkz, TB_NO_REVERSE },
{ X86::VFMADDSUB132PDZ128rk, X86::VFMADDSUB132PDZ128mk, 0 },
{ X86::VFMADDSUB132PDZ128rkz, X86::VFMADDSUB132PDZ128mkz, 0 },
{ X86::VFMADDSUB132PDZ256rk, X86::VFMADDSUB132PDZ256mk, 0 },
{ X86::VFMADDSUB132PDZ256rkz, X86::VFMADDSUB132PDZ256mkz, 0 },
{ X86::VFMADDSUB132PDZrk, X86::VFMADDSUB132PDZmk, 0 },
{ X86::VFMADDSUB132PDZrkz, X86::VFMADDSUB132PDZmkz, 0 },
+ { X86::VFMADDSUB132PHZ128rk, X86::VFMADDSUB132PHZ128mk, 0 },
+ { X86::VFMADDSUB132PHZ128rkz, X86::VFMADDSUB132PHZ128mkz, 0 },
+ { X86::VFMADDSUB132PHZ256rk, X86::VFMADDSUB132PHZ256mk, 0 },
+ { X86::VFMADDSUB132PHZ256rkz, X86::VFMADDSUB132PHZ256mkz, 0 },
+ { X86::VFMADDSUB132PHZrk, X86::VFMADDSUB132PHZmk, 0 },
+ { X86::VFMADDSUB132PHZrkz, X86::VFMADDSUB132PHZmkz, 0 },
{ X86::VFMADDSUB132PSZ128rk, X86::VFMADDSUB132PSZ128mk, 0 },
{ X86::VFMADDSUB132PSZ128rkz, X86::VFMADDSUB132PSZ128mkz, 0 },
{ X86::VFMADDSUB132PSZ256rk, X86::VFMADDSUB132PSZ256mk, 0 },
@@ -4475,6 +4811,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMADDSUB213PDZ256rkz, X86::VFMADDSUB213PDZ256mkz, 0 },
{ X86::VFMADDSUB213PDZrk, X86::VFMADDSUB213PDZmk, 0 },
{ X86::VFMADDSUB213PDZrkz, X86::VFMADDSUB213PDZmkz, 0 },
+ { X86::VFMADDSUB213PHZ128rk, X86::VFMADDSUB213PHZ128mk, 0 },
+ { X86::VFMADDSUB213PHZ128rkz, X86::VFMADDSUB213PHZ128mkz, 0 },
+ { X86::VFMADDSUB213PHZ256rk, X86::VFMADDSUB213PHZ256mk, 0 },
+ { X86::VFMADDSUB213PHZ256rkz, X86::VFMADDSUB213PHZ256mkz, 0 },
+ { X86::VFMADDSUB213PHZrk, X86::VFMADDSUB213PHZmk, 0 },
+ { X86::VFMADDSUB213PHZrkz, X86::VFMADDSUB213PHZmkz, 0 },
{ X86::VFMADDSUB213PSZ128rk, X86::VFMADDSUB213PSZ128mk, 0 },
{ X86::VFMADDSUB213PSZ128rkz, X86::VFMADDSUB213PSZ128mkz, 0 },
{ X86::VFMADDSUB213PSZ256rk, X86::VFMADDSUB213PSZ256mk, 0 },
@@ -4487,6 +4829,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMADDSUB231PDZ256rkz, X86::VFMADDSUB231PDZ256mkz, 0 },
{ X86::VFMADDSUB231PDZrk, X86::VFMADDSUB231PDZmk, 0 },
{ X86::VFMADDSUB231PDZrkz, X86::VFMADDSUB231PDZmkz, 0 },
+ { X86::VFMADDSUB231PHZ128rk, X86::VFMADDSUB231PHZ128mk, 0 },
+ { X86::VFMADDSUB231PHZ128rkz, X86::VFMADDSUB231PHZ128mkz, 0 },
+ { X86::VFMADDSUB231PHZ256rk, X86::VFMADDSUB231PHZ256mk, 0 },
+ { X86::VFMADDSUB231PHZ256rkz, X86::VFMADDSUB231PHZ256mkz, 0 },
+ { X86::VFMADDSUB231PHZrk, X86::VFMADDSUB231PHZmk, 0 },
+ { X86::VFMADDSUB231PHZrkz, X86::VFMADDSUB231PHZmkz, 0 },
{ X86::VFMADDSUB231PSZ128rk, X86::VFMADDSUB231PSZ128mk, 0 },
{ X86::VFMADDSUB231PSZ128rkz, X86::VFMADDSUB231PSZ128mkz, 0 },
{ X86::VFMADDSUB231PSZ256rk, X86::VFMADDSUB231PSZ256mk, 0 },
@@ -4499,6 +4847,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUB132PDZ256rkz, X86::VFMSUB132PDZ256mkz, 0 },
{ X86::VFMSUB132PDZrk, X86::VFMSUB132PDZmk, 0 },
{ X86::VFMSUB132PDZrkz, X86::VFMSUB132PDZmkz, 0 },
+ { X86::VFMSUB132PHZ128rk, X86::VFMSUB132PHZ128mk, 0 },
+ { X86::VFMSUB132PHZ128rkz, X86::VFMSUB132PHZ128mkz, 0 },
+ { X86::VFMSUB132PHZ256rk, X86::VFMSUB132PHZ256mk, 0 },
+ { X86::VFMSUB132PHZ256rkz, X86::VFMSUB132PHZ256mkz, 0 },
+ { X86::VFMSUB132PHZrk, X86::VFMSUB132PHZmk, 0 },
+ { X86::VFMSUB132PHZrkz, X86::VFMSUB132PHZmkz, 0 },
{ X86::VFMSUB132PSZ128rk, X86::VFMSUB132PSZ128mk, 0 },
{ X86::VFMSUB132PSZ128rkz, X86::VFMSUB132PSZ128mkz, 0 },
{ X86::VFMSUB132PSZ256rk, X86::VFMSUB132PSZ256mk, 0 },
@@ -4507,6 +4861,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUB132PSZrkz, X86::VFMSUB132PSZmkz, 0 },
{ X86::VFMSUB132SDZr_Intk, X86::VFMSUB132SDZm_Intk, TB_NO_REVERSE },
{ X86::VFMSUB132SDZr_Intkz, X86::VFMSUB132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB132SHZr_Intk, X86::VFMSUB132SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB132SHZr_Intkz, X86::VFMSUB132SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFMSUB132SSZr_Intk, X86::VFMSUB132SSZm_Intk, TB_NO_REVERSE },
{ X86::VFMSUB132SSZr_Intkz, X86::VFMSUB132SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFMSUB213PDZ128rk, X86::VFMSUB213PDZ128mk, 0 },
@@ -4515,6 +4871,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUB213PDZ256rkz, X86::VFMSUB213PDZ256mkz, 0 },
{ X86::VFMSUB213PDZrk, X86::VFMSUB213PDZmk, 0 },
{ X86::VFMSUB213PDZrkz, X86::VFMSUB213PDZmkz, 0 },
+ { X86::VFMSUB213PHZ128rk, X86::VFMSUB213PHZ128mk, 0 },
+ { X86::VFMSUB213PHZ128rkz, X86::VFMSUB213PHZ128mkz, 0 },
+ { X86::VFMSUB213PHZ256rk, X86::VFMSUB213PHZ256mk, 0 },
+ { X86::VFMSUB213PHZ256rkz, X86::VFMSUB213PHZ256mkz, 0 },
+ { X86::VFMSUB213PHZrk, X86::VFMSUB213PHZmk, 0 },
+ { X86::VFMSUB213PHZrkz, X86::VFMSUB213PHZmkz, 0 },
{ X86::VFMSUB213PSZ128rk, X86::VFMSUB213PSZ128mk, 0 },
{ X86::VFMSUB213PSZ128rkz, X86::VFMSUB213PSZ128mkz, 0 },
{ X86::VFMSUB213PSZ256rk, X86::VFMSUB213PSZ256mk, 0 },
@@ -4523,6 +4885,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUB213PSZrkz, X86::VFMSUB213PSZmkz, 0 },
{ X86::VFMSUB213SDZr_Intk, X86::VFMSUB213SDZm_Intk, TB_NO_REVERSE },
{ X86::VFMSUB213SDZr_Intkz, X86::VFMSUB213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB213SHZr_Intk, X86::VFMSUB213SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB213SHZr_Intkz, X86::VFMSUB213SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFMSUB213SSZr_Intk, X86::VFMSUB213SSZm_Intk, TB_NO_REVERSE },
{ X86::VFMSUB213SSZr_Intkz, X86::VFMSUB213SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFMSUB231PDZ128rk, X86::VFMSUB231PDZ128mk, 0 },
@@ -4531,6 +4895,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUB231PDZ256rkz, X86::VFMSUB231PDZ256mkz, 0 },
{ X86::VFMSUB231PDZrk, X86::VFMSUB231PDZmk, 0 },
{ X86::VFMSUB231PDZrkz, X86::VFMSUB231PDZmkz, 0 },
+ { X86::VFMSUB231PHZ128rk, X86::VFMSUB231PHZ128mk, 0 },
+ { X86::VFMSUB231PHZ128rkz, X86::VFMSUB231PHZ128mkz, 0 },
+ { X86::VFMSUB231PHZ256rk, X86::VFMSUB231PHZ256mk, 0 },
+ { X86::VFMSUB231PHZ256rkz, X86::VFMSUB231PHZ256mkz, 0 },
+ { X86::VFMSUB231PHZrk, X86::VFMSUB231PHZmk, 0 },
+ { X86::VFMSUB231PHZrkz, X86::VFMSUB231PHZmkz, 0 },
{ X86::VFMSUB231PSZ128rk, X86::VFMSUB231PSZ128mk, 0 },
{ X86::VFMSUB231PSZ128rkz, X86::VFMSUB231PSZ128mkz, 0 },
{ X86::VFMSUB231PSZ256rk, X86::VFMSUB231PSZ256mk, 0 },
@@ -4539,6 +4909,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUB231PSZrkz, X86::VFMSUB231PSZmkz, 0 },
{ X86::VFMSUB231SDZr_Intk, X86::VFMSUB231SDZm_Intk, TB_NO_REVERSE },
{ X86::VFMSUB231SDZr_Intkz, X86::VFMSUB231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFMSUB231SHZr_Intk, X86::VFMSUB231SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFMSUB231SHZr_Intkz, X86::VFMSUB231SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFMSUB231SSZr_Intk, X86::VFMSUB231SSZm_Intk, TB_NO_REVERSE },
{ X86::VFMSUB231SSZr_Intkz, X86::VFMSUB231SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFMSUBADD132PDZ128rk, X86::VFMSUBADD132PDZ128mk, 0 },
@@ -4547,6 +4919,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUBADD132PDZ256rkz, X86::VFMSUBADD132PDZ256mkz, 0 },
{ X86::VFMSUBADD132PDZrk, X86::VFMSUBADD132PDZmk, 0 },
{ X86::VFMSUBADD132PDZrkz, X86::VFMSUBADD132PDZmkz, 0 },
+ { X86::VFMSUBADD132PHZ128rk, X86::VFMSUBADD132PHZ128mk, 0 },
+ { X86::VFMSUBADD132PHZ128rkz, X86::VFMSUBADD132PHZ128mkz, 0 },
+ { X86::VFMSUBADD132PHZ256rk, X86::VFMSUBADD132PHZ256mk, 0 },
+ { X86::VFMSUBADD132PHZ256rkz, X86::VFMSUBADD132PHZ256mkz, 0 },
+ { X86::VFMSUBADD132PHZrk, X86::VFMSUBADD132PHZmk, 0 },
+ { X86::VFMSUBADD132PHZrkz, X86::VFMSUBADD132PHZmkz, 0 },
{ X86::VFMSUBADD132PSZ128rk, X86::VFMSUBADD132PSZ128mk, 0 },
{ X86::VFMSUBADD132PSZ128rkz, X86::VFMSUBADD132PSZ128mkz, 0 },
{ X86::VFMSUBADD132PSZ256rk, X86::VFMSUBADD132PSZ256mk, 0 },
@@ -4559,6 +4937,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUBADD213PDZ256rkz, X86::VFMSUBADD213PDZ256mkz, 0 },
{ X86::VFMSUBADD213PDZrk, X86::VFMSUBADD213PDZmk, 0 },
{ X86::VFMSUBADD213PDZrkz, X86::VFMSUBADD213PDZmkz, 0 },
+ { X86::VFMSUBADD213PHZ128rk, X86::VFMSUBADD213PHZ128mk, 0 },
+ { X86::VFMSUBADD213PHZ128rkz, X86::VFMSUBADD213PHZ128mkz, 0 },
+ { X86::VFMSUBADD213PHZ256rk, X86::VFMSUBADD213PHZ256mk, 0 },
+ { X86::VFMSUBADD213PHZ256rkz, X86::VFMSUBADD213PHZ256mkz, 0 },
+ { X86::VFMSUBADD213PHZrk, X86::VFMSUBADD213PHZmk, 0 },
+ { X86::VFMSUBADD213PHZrkz, X86::VFMSUBADD213PHZmkz, 0 },
{ X86::VFMSUBADD213PSZ128rk, X86::VFMSUBADD213PSZ128mk, 0 },
{ X86::VFMSUBADD213PSZ128rkz, X86::VFMSUBADD213PSZ128mkz, 0 },
{ X86::VFMSUBADD213PSZ256rk, X86::VFMSUBADD213PSZ256mk, 0 },
@@ -4571,18 +4955,34 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFMSUBADD231PDZ256rkz, X86::VFMSUBADD231PDZ256mkz, 0 },
{ X86::VFMSUBADD231PDZrk, X86::VFMSUBADD231PDZmk, 0 },
{ X86::VFMSUBADD231PDZrkz, X86::VFMSUBADD231PDZmkz, 0 },
+ { X86::VFMSUBADD231PHZ128rk, X86::VFMSUBADD231PHZ128mk, 0 },
+ { X86::VFMSUBADD231PHZ128rkz, X86::VFMSUBADD231PHZ128mkz, 0 },
+ { X86::VFMSUBADD231PHZ256rk, X86::VFMSUBADD231PHZ256mk, 0 },
+ { X86::VFMSUBADD231PHZ256rkz, X86::VFMSUBADD231PHZ256mkz, 0 },
+ { X86::VFMSUBADD231PHZrk, X86::VFMSUBADD231PHZmk, 0 },
+ { X86::VFMSUBADD231PHZrkz, X86::VFMSUBADD231PHZmkz, 0 },
{ X86::VFMSUBADD231PSZ128rk, X86::VFMSUBADD231PSZ128mk, 0 },
{ X86::VFMSUBADD231PSZ128rkz, X86::VFMSUBADD231PSZ128mkz, 0 },
{ X86::VFMSUBADD231PSZ256rk, X86::VFMSUBADD231PSZ256mk, 0 },
{ X86::VFMSUBADD231PSZ256rkz, X86::VFMSUBADD231PSZ256mkz, 0 },
{ X86::VFMSUBADD231PSZrk, X86::VFMSUBADD231PSZmk, 0 },
{ X86::VFMSUBADD231PSZrkz, X86::VFMSUBADD231PSZmkz, 0 },
+ { X86::VFMULCPHZ128rrk, X86::VFMULCPHZ128rmk, 0 },
+ { X86::VFMULCPHZ256rrk, X86::VFMULCPHZ256rmk, 0 },
+ { X86::VFMULCPHZrrk, X86::VFMULCPHZrmk, 0 },
+ { X86::VFMULCSHZrrk, X86::VFMULCSHZrmk, TB_NO_REVERSE },
{ X86::VFNMADD132PDZ128rk, X86::VFNMADD132PDZ128mk, 0 },
{ X86::VFNMADD132PDZ128rkz, X86::VFNMADD132PDZ128mkz, 0 },
{ X86::VFNMADD132PDZ256rk, X86::VFNMADD132PDZ256mk, 0 },
{ X86::VFNMADD132PDZ256rkz, X86::VFNMADD132PDZ256mkz, 0 },
{ X86::VFNMADD132PDZrk, X86::VFNMADD132PDZmk, 0 },
{ X86::VFNMADD132PDZrkz, X86::VFNMADD132PDZmkz, 0 },
+ { X86::VFNMADD132PHZ128rk, X86::VFNMADD132PHZ128mk, 0 },
+ { X86::VFNMADD132PHZ128rkz, X86::VFNMADD132PHZ128mkz, 0 },
+ { X86::VFNMADD132PHZ256rk, X86::VFNMADD132PHZ256mk, 0 },
+ { X86::VFNMADD132PHZ256rkz, X86::VFNMADD132PHZ256mkz, 0 },
+ { X86::VFNMADD132PHZrk, X86::VFNMADD132PHZmk, 0 },
+ { X86::VFNMADD132PHZrkz, X86::VFNMADD132PHZmkz, 0 },
{ X86::VFNMADD132PSZ128rk, X86::VFNMADD132PSZ128mk, 0 },
{ X86::VFNMADD132PSZ128rkz, X86::VFNMADD132PSZ128mkz, 0 },
{ X86::VFNMADD132PSZ256rk, X86::VFNMADD132PSZ256mk, 0 },
@@ -4591,6 +4991,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMADD132PSZrkz, X86::VFNMADD132PSZmkz, 0 },
{ X86::VFNMADD132SDZr_Intk, X86::VFNMADD132SDZm_Intk, TB_NO_REVERSE },
{ X86::VFNMADD132SDZr_Intkz, X86::VFNMADD132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD132SHZr_Intk, X86::VFNMADD132SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD132SHZr_Intkz, X86::VFNMADD132SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMADD132SSZr_Intk, X86::VFNMADD132SSZm_Intk, TB_NO_REVERSE },
{ X86::VFNMADD132SSZr_Intkz, X86::VFNMADD132SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMADD213PDZ128rk, X86::VFNMADD213PDZ128mk, 0 },
@@ -4599,6 +5001,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMADD213PDZ256rkz, X86::VFNMADD213PDZ256mkz, 0 },
{ X86::VFNMADD213PDZrk, X86::VFNMADD213PDZmk, 0 },
{ X86::VFNMADD213PDZrkz, X86::VFNMADD213PDZmkz, 0 },
+ { X86::VFNMADD213PHZ128rk, X86::VFNMADD213PHZ128mk, 0 },
+ { X86::VFNMADD213PHZ128rkz, X86::VFNMADD213PHZ128mkz, 0 },
+ { X86::VFNMADD213PHZ256rk, X86::VFNMADD213PHZ256mk, 0 },
+ { X86::VFNMADD213PHZ256rkz, X86::VFNMADD213PHZ256mkz, 0 },
+ { X86::VFNMADD213PHZrk, X86::VFNMADD213PHZmk, 0 },
+ { X86::VFNMADD213PHZrkz, X86::VFNMADD213PHZmkz, 0 },
{ X86::VFNMADD213PSZ128rk, X86::VFNMADD213PSZ128mk, 0 },
{ X86::VFNMADD213PSZ128rkz, X86::VFNMADD213PSZ128mkz, 0 },
{ X86::VFNMADD213PSZ256rk, X86::VFNMADD213PSZ256mk, 0 },
@@ -4607,6 +5015,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMADD213PSZrkz, X86::VFNMADD213PSZmkz, 0 },
{ X86::VFNMADD213SDZr_Intk, X86::VFNMADD213SDZm_Intk, TB_NO_REVERSE },
{ X86::VFNMADD213SDZr_Intkz, X86::VFNMADD213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD213SHZr_Intk, X86::VFNMADD213SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD213SHZr_Intkz, X86::VFNMADD213SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMADD213SSZr_Intk, X86::VFNMADD213SSZm_Intk, TB_NO_REVERSE },
{ X86::VFNMADD213SSZr_Intkz, X86::VFNMADD213SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMADD231PDZ128rk, X86::VFNMADD231PDZ128mk, 0 },
@@ -4615,6 +5025,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMADD231PDZ256rkz, X86::VFNMADD231PDZ256mkz, 0 },
{ X86::VFNMADD231PDZrk, X86::VFNMADD231PDZmk, 0 },
{ X86::VFNMADD231PDZrkz, X86::VFNMADD231PDZmkz, 0 },
+ { X86::VFNMADD231PHZ128rk, X86::VFNMADD231PHZ128mk, 0 },
+ { X86::VFNMADD231PHZ128rkz, X86::VFNMADD231PHZ128mkz, 0 },
+ { X86::VFNMADD231PHZ256rk, X86::VFNMADD231PHZ256mk, 0 },
+ { X86::VFNMADD231PHZ256rkz, X86::VFNMADD231PHZ256mkz, 0 },
+ { X86::VFNMADD231PHZrk, X86::VFNMADD231PHZmk, 0 },
+ { X86::VFNMADD231PHZrkz, X86::VFNMADD231PHZmkz, 0 },
{ X86::VFNMADD231PSZ128rk, X86::VFNMADD231PSZ128mk, 0 },
{ X86::VFNMADD231PSZ128rkz, X86::VFNMADD231PSZ128mkz, 0 },
{ X86::VFNMADD231PSZ256rk, X86::VFNMADD231PSZ256mk, 0 },
@@ -4623,6 +5039,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMADD231PSZrkz, X86::VFNMADD231PSZmkz, 0 },
{ X86::VFNMADD231SDZr_Intk, X86::VFNMADD231SDZm_Intk, TB_NO_REVERSE },
{ X86::VFNMADD231SDZr_Intkz, X86::VFNMADD231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMADD231SHZr_Intk, X86::VFNMADD231SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMADD231SHZr_Intkz, X86::VFNMADD231SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMADD231SSZr_Intk, X86::VFNMADD231SSZm_Intk, TB_NO_REVERSE },
{ X86::VFNMADD231SSZr_Intkz, X86::VFNMADD231SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMSUB132PDZ128rk, X86::VFNMSUB132PDZ128mk, 0 },
@@ -4631,6 +5049,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMSUB132PDZ256rkz, X86::VFNMSUB132PDZ256mkz, 0 },
{ X86::VFNMSUB132PDZrk, X86::VFNMSUB132PDZmk, 0 },
{ X86::VFNMSUB132PDZrkz, X86::VFNMSUB132PDZmkz, 0 },
+ { X86::VFNMSUB132PHZ128rk, X86::VFNMSUB132PHZ128mk, 0 },
+ { X86::VFNMSUB132PHZ128rkz, X86::VFNMSUB132PHZ128mkz, 0 },
+ { X86::VFNMSUB132PHZ256rk, X86::VFNMSUB132PHZ256mk, 0 },
+ { X86::VFNMSUB132PHZ256rkz, X86::VFNMSUB132PHZ256mkz, 0 },
+ { X86::VFNMSUB132PHZrk, X86::VFNMSUB132PHZmk, 0 },
+ { X86::VFNMSUB132PHZrkz, X86::VFNMSUB132PHZmkz, 0 },
{ X86::VFNMSUB132PSZ128rk, X86::VFNMSUB132PSZ128mk, 0 },
{ X86::VFNMSUB132PSZ128rkz, X86::VFNMSUB132PSZ128mkz, 0 },
{ X86::VFNMSUB132PSZ256rk, X86::VFNMSUB132PSZ256mk, 0 },
@@ -4639,6 +5063,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMSUB132PSZrkz, X86::VFNMSUB132PSZmkz, 0 },
{ X86::VFNMSUB132SDZr_Intk, X86::VFNMSUB132SDZm_Intk, TB_NO_REVERSE },
{ X86::VFNMSUB132SDZr_Intkz, X86::VFNMSUB132SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB132SHZr_Intk, X86::VFNMSUB132SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB132SHZr_Intkz, X86::VFNMSUB132SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMSUB132SSZr_Intk, X86::VFNMSUB132SSZm_Intk, TB_NO_REVERSE },
{ X86::VFNMSUB132SSZr_Intkz, X86::VFNMSUB132SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMSUB213PDZ128rk, X86::VFNMSUB213PDZ128mk, 0 },
@@ -4647,6 +5073,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMSUB213PDZ256rkz, X86::VFNMSUB213PDZ256mkz, 0 },
{ X86::VFNMSUB213PDZrk, X86::VFNMSUB213PDZmk, 0 },
{ X86::VFNMSUB213PDZrkz, X86::VFNMSUB213PDZmkz, 0 },
+ { X86::VFNMSUB213PHZ128rk, X86::VFNMSUB213PHZ128mk, 0 },
+ { X86::VFNMSUB213PHZ128rkz, X86::VFNMSUB213PHZ128mkz, 0 },
+ { X86::VFNMSUB213PHZ256rk, X86::VFNMSUB213PHZ256mk, 0 },
+ { X86::VFNMSUB213PHZ256rkz, X86::VFNMSUB213PHZ256mkz, 0 },
+ { X86::VFNMSUB213PHZrk, X86::VFNMSUB213PHZmk, 0 },
+ { X86::VFNMSUB213PHZrkz, X86::VFNMSUB213PHZmkz, 0 },
{ X86::VFNMSUB213PSZ128rk, X86::VFNMSUB213PSZ128mk, 0 },
{ X86::VFNMSUB213PSZ128rkz, X86::VFNMSUB213PSZ128mkz, 0 },
{ X86::VFNMSUB213PSZ256rk, X86::VFNMSUB213PSZ256mk, 0 },
@@ -4655,6 +5087,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMSUB213PSZrkz, X86::VFNMSUB213PSZmkz, 0 },
{ X86::VFNMSUB213SDZr_Intk, X86::VFNMSUB213SDZm_Intk, TB_NO_REVERSE },
{ X86::VFNMSUB213SDZr_Intkz, X86::VFNMSUB213SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB213SHZr_Intk, X86::VFNMSUB213SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB213SHZr_Intkz, X86::VFNMSUB213SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMSUB213SSZr_Intk, X86::VFNMSUB213SSZm_Intk, TB_NO_REVERSE },
{ X86::VFNMSUB213SSZr_Intkz, X86::VFNMSUB213SSZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMSUB231PDZ128rk, X86::VFNMSUB231PDZ128mk, 0 },
@@ -4663,6 +5097,12 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMSUB231PDZ256rkz, X86::VFNMSUB231PDZ256mkz, 0 },
{ X86::VFNMSUB231PDZrk, X86::VFNMSUB231PDZmk, 0 },
{ X86::VFNMSUB231PDZrkz, X86::VFNMSUB231PDZmkz, 0 },
+ { X86::VFNMSUB231PHZ128rk, X86::VFNMSUB231PHZ128mk, 0 },
+ { X86::VFNMSUB231PHZ128rkz, X86::VFNMSUB231PHZ128mkz, 0 },
+ { X86::VFNMSUB231PHZ256rk, X86::VFNMSUB231PHZ256mk, 0 },
+ { X86::VFNMSUB231PHZ256rkz, X86::VFNMSUB231PHZ256mkz, 0 },
+ { X86::VFNMSUB231PHZrk, X86::VFNMSUB231PHZmk, 0 },
+ { X86::VFNMSUB231PHZrkz, X86::VFNMSUB231PHZmkz, 0 },
{ X86::VFNMSUB231PSZ128rk, X86::VFNMSUB231PSZ128mk, 0 },
{ X86::VFNMSUB231PSZ128rkz, X86::VFNMSUB231PSZ128mkz, 0 },
{ X86::VFNMSUB231PSZ256rk, X86::VFNMSUB231PSZ256mk, 0 },
@@ -4671,11 +5111,15 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VFNMSUB231PSZrkz, X86::VFNMSUB231PSZmkz, 0 },
{ X86::VFNMSUB231SDZr_Intk, X86::VFNMSUB231SDZm_Intk, TB_NO_REVERSE },
{ X86::VFNMSUB231SDZr_Intkz, X86::VFNMSUB231SDZm_Intkz, TB_NO_REVERSE },
+ { X86::VFNMSUB231SHZr_Intk, X86::VFNMSUB231SHZm_Intk, TB_NO_REVERSE },
+ { X86::VFNMSUB231SHZr_Intkz, X86::VFNMSUB231SHZm_Intkz, TB_NO_REVERSE },
{ X86::VFNMSUB231SSZr_Intk, X86::VFNMSUB231SSZm_Intk, TB_NO_REVERSE },
{ X86::VFNMSUB231SSZr_Intkz, X86::VFNMSUB231SSZm_Intkz, TB_NO_REVERSE },
{ X86::VGETEXPSDZrk, X86::VGETEXPSDZmk, TB_NO_REVERSE },
+ { X86::VGETEXPSHZrk, X86::VGETEXPSHZmk, TB_NO_REVERSE },
{ X86::VGETEXPSSZrk, X86::VGETEXPSSZmk, TB_NO_REVERSE },
{ X86::VGETMANTSDZrrik, X86::VGETMANTSDZrmik, TB_NO_REVERSE },
+ { X86::VGETMANTSHZrrik, X86::VGETMANTSHZrmik, TB_NO_REVERSE },
{ X86::VGETMANTSSZrrik, X86::VGETMANTSSZrmik, TB_NO_REVERSE },
{ X86::VGF2P8AFFINEINVQBZ128rrik, X86::VGF2P8AFFINEINVQBZ128rmik, 0 },
{ X86::VGF2P8AFFINEINVQBZ256rrik, X86::VGF2P8AFFINEINVQBZ256rmik, 0 },
@@ -4701,38 +5145,56 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 },
{ X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 },
{ X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 },
+ { X86::VMAXCPHZ128rrk, X86::VMAXCPHZ128rmk, 0 },
+ { X86::VMAXCPHZ256rrk, X86::VMAXCPHZ256rmk, 0 },
+ { X86::VMAXCPHZrrk, X86::VMAXCPHZrmk, 0 },
{ X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 },
{ X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 },
{ X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 },
{ X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 },
{ X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 },
{ X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 },
+ { X86::VMAXPHZ128rrk, X86::VMAXPHZ128rmk, 0 },
+ { X86::VMAXPHZ256rrk, X86::VMAXPHZ256rmk, 0 },
+ { X86::VMAXPHZrrk, X86::VMAXPHZrmk, 0 },
{ X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 },
{ X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 },
{ X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
{ X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VMAXSHZrr_Intk, X86::VMAXSHZrm_Intk, TB_NO_REVERSE },
{ X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, TB_NO_REVERSE },
{ X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 },
{ X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 },
{ X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 },
+ { X86::VMINCPHZ128rrk, X86::VMINCPHZ128rmk, 0 },
+ { X86::VMINCPHZ256rrk, X86::VMINCPHZ256rmk, 0 },
+ { X86::VMINCPHZrrk, X86::VMINCPHZrmk, 0 },
{ X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 },
{ X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 },
{ X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 },
{ X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 },
{ X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 },
{ X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
+ { X86::VMINPHZ128rrk, X86::VMINPHZ128rmk, 0 },
+ { X86::VMINPHZ256rrk, X86::VMINPHZ256rmk, 0 },
+ { X86::VMINPHZrrk, X86::VMINPHZrmk, 0 },
{ X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 },
{ X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 },
{ X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
{ X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VMINSHZrr_Intk, X86::VMINSHZrm_Intk, TB_NO_REVERSE },
{ X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, TB_NO_REVERSE },
{ X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 },
{ X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 },
{ X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
+ { X86::VMULPHZ128rrk, X86::VMULPHZ128rmk, 0 },
+ { X86::VMULPHZ256rrk, X86::VMULPHZ256rmk, 0 },
+ { X86::VMULPHZrrk, X86::VMULPHZrmk, 0 },
{ X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
{ X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
{ X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
{ X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VMULSHZrr_Intk, X86::VMULSHZrm_Intk, TB_NO_REVERSE },
{ X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE },
{ X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 },
{ X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 },
@@ -5213,21 +5675,29 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VRCP14SSZrrk, X86::VRCP14SSZrmk, TB_NO_REVERSE },
{ X86::VRCP28SDZrk, X86::VRCP28SDZmk, TB_NO_REVERSE },
{ X86::VRCP28SSZrk, X86::VRCP28SSZmk, TB_NO_REVERSE },
+ { X86::VRCPSHZrrk, X86::VRCPSHZrmk, TB_NO_REVERSE },
{ X86::VREDUCESDZrrik, X86::VREDUCESDZrmik, TB_NO_REVERSE },
+ { X86::VREDUCESHZrrik, X86::VREDUCESHZrmik, TB_NO_REVERSE },
{ X86::VREDUCESSZrrik, X86::VREDUCESSZrmik, TB_NO_REVERSE },
{ X86::VRNDSCALESDZr_Intk, X86::VRNDSCALESDZm_Intk, TB_NO_REVERSE },
+ { X86::VRNDSCALESHZr_Intk, X86::VRNDSCALESHZm_Intk, TB_NO_REVERSE },
{ X86::VRNDSCALESSZr_Intk, X86::VRNDSCALESSZm_Intk, TB_NO_REVERSE },
{ X86::VRSQRT14SDZrrk, X86::VRSQRT14SDZrmk, TB_NO_REVERSE },
{ X86::VRSQRT14SSZrrk, X86::VRSQRT14SSZrmk, TB_NO_REVERSE },
{ X86::VRSQRT28SDZrk, X86::VRSQRT28SDZmk, TB_NO_REVERSE },
{ X86::VRSQRT28SSZrk, X86::VRSQRT28SSZmk, TB_NO_REVERSE },
+ { X86::VRSQRTSHZrrk, X86::VRSQRTSHZrmk, TB_NO_REVERSE },
{ X86::VSCALEFPDZ128rrk, X86::VSCALEFPDZ128rmk, 0 },
{ X86::VSCALEFPDZ256rrk, X86::VSCALEFPDZ256rmk, 0 },
{ X86::VSCALEFPDZrrk, X86::VSCALEFPDZrmk, 0 },
+ { X86::VSCALEFPHZ128rrk, X86::VSCALEFPHZ128rmk, 0 },
+ { X86::VSCALEFPHZ256rrk, X86::VSCALEFPHZ256rmk, 0 },
+ { X86::VSCALEFPHZrrk, X86::VSCALEFPHZrmk, 0 },
{ X86::VSCALEFPSZ128rrk, X86::VSCALEFPSZ128rmk, 0 },
{ X86::VSCALEFPSZ256rrk, X86::VSCALEFPSZ256rmk, 0 },
{ X86::VSCALEFPSZrrk, X86::VSCALEFPSZrmk, 0 },
{ X86::VSCALEFSDZrrk, X86::VSCALEFSDZrmk, TB_NO_REVERSE },
+ { X86::VSCALEFSHZrrk, X86::VSCALEFSHZrmk, TB_NO_REVERSE },
{ X86::VSCALEFSSZrrk, X86::VSCALEFSSZrmk, TB_NO_REVERSE },
{ X86::VSHUFF32X4Z256rrik, X86::VSHUFF32X4Z256rmik, 0 },
{ X86::VSHUFF32X4Zrrik, X86::VSHUFF32X4Zrmik, 0 },
@@ -5244,14 +5714,19 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmik, 0 },
{ X86::VSHUFPSZrrik, X86::VSHUFPSZrmik, 0 },
{ X86::VSQRTSDZr_Intk, X86::VSQRTSDZm_Intk, TB_NO_REVERSE },
+ { X86::VSQRTSHZr_Intk, X86::VSQRTSHZm_Intk, TB_NO_REVERSE },
{ X86::VSQRTSSZr_Intk, X86::VSQRTSSZm_Intk, TB_NO_REVERSE },
{ X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
{ X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
{ X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
+ { X86::VSUBPHZ128rrk, X86::VSUBPHZ128rmk, 0 },
+ { X86::VSUBPHZ256rrk, X86::VSUBPHZ256rmk, 0 },
+ { X86::VSUBPHZrrk, X86::VSUBPHZrmk, 0 },
{ X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
{ X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
{ X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
{ X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VSUBSHZrr_Intk, X86::VSUBSHZrm_Intk, TB_NO_REVERSE },
{ X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE },
{ X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 },
{ X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 },
diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td
index dba13720cbd2..0e7033fc233a 100644
--- a/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/llvm/lib/Target/X86/X86InstrFormats.td
@@ -149,8 +149,8 @@ def PS : Prefix<4>; // Similar to NoPrfx, but disassembler uses this to know
// disable to ANDPS.
// Class specifying the opcode map.
-class Map<bits<3> val> {
- bits<3> Value = val;
+class Map<bits<4> val> {
+ bits<4> Value = val;
}
def OB : Map<0>;
def TB : Map<1>;
@@ -160,6 +160,8 @@ def XOP8 : Map<4>;
def XOP9 : Map<5>;
def XOPA : Map<6>;
def ThreeDNow : Map<7>;
+def T_MAP5 : Map<8>;
+def T_MAP6 : Map<9>;
// Class specifying the encoding
class Encoding<bits<2> val> {
@@ -204,6 +206,16 @@ class XOP8 { Map OpMap = XOP8; Prefix OpPrefix = PS; }
class XOP9 { Map OpMap = XOP9; Prefix OpPrefix = PS; }
class XOPA { Map OpMap = XOPA; Prefix OpPrefix = PS; }
class ThreeDNow { Map OpMap = ThreeDNow; }
+class T_MAP5 { Map OpMap = T_MAP5; }
+class T_MAP5PS : T_MAP5 { Prefix OpPrefix = PS; } // none
+class T_MAP5PD : T_MAP5 { Prefix OpPrefix = PD; } // 0x66
+class T_MAP5XS : T_MAP5 { Prefix OpPrefix = XS; } // 0xF3
+class T_MAP5XD : T_MAP5 { Prefix OpPrefix = XD; } // 0xF2
+class T_MAP6 { Map OpMap = T_MAP6; }
+class T_MAP6PS : T_MAP6 { Prefix OpPrefix = PS; }
+class T_MAP6PD : T_MAP6 { Prefix OpPrefix = PD; }
+class T_MAP6XS : T_MAP6 { Prefix OpPrefix = XS; }
+class T_MAP6XD : T_MAP6 { Prefix OpPrefix = XD; }
class OBXS { Prefix OpPrefix = XS; }
class PS : TB { Prefix OpPrefix = PS; }
class PD : TB { Prefix OpPrefix = PD; }
@@ -284,6 +296,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
// If this is a pseudo instruction, mark it isCodeGenOnly.
let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
+ let HasPositionOrder = 1;
+
//
// Attributes specific to X86 instructions...
//
@@ -301,7 +315,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
Prefix OpPrefix = NoPrfx; // Which prefix byte does this inst have?
bits<3> OpPrefixBits = OpPrefix.Value;
Map OpMap = OB; // Which opcode map does this inst have?
- bits<3> OpMapBits = OpMap.Value;
+ bits<4> OpMapBits = OpMap.Value;
bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix?
FPFormat FPForm = NotFP; // What flavor of FP instruction is this?
bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix?
@@ -360,28 +374,28 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
let TSFlags{10-9} = AdSizeBits;
// No need for 3rd bit, we don't need to distinguish NoPrfx from PS.
let TSFlags{12-11} = OpPrefixBits{1-0};
- let TSFlags{15-13} = OpMapBits;
- let TSFlags{16} = hasREX_WPrefix;
- let TSFlags{20-17} = ImmT.Value;
- let TSFlags{23-21} = FPForm.Value;
- let TSFlags{24} = hasLockPrefix;
- let TSFlags{25} = hasREPPrefix;
- let TSFlags{27-26} = ExeDomain.Value;
- let TSFlags{29-28} = OpEncBits;
- let TSFlags{37-30} = Opcode;
+ let TSFlags{16-13} = OpMapBits;
+ let TSFlags{17} = hasREX_WPrefix;
+ let TSFlags{21-18} = ImmT.Value;
+ let TSFlags{24-22} = FPForm.Value;
+ let TSFlags{25} = hasLockPrefix;
+ let TSFlags{26} = hasREPPrefix;
+ let TSFlags{28-27} = ExeDomain.Value;
+ let TSFlags{30-29} = OpEncBits;
+ let TSFlags{38-31} = Opcode;
// Currently no need for second bit in TSFlags - W Ignore is equivalent to 0.
- let TSFlags{38} = HasVEX_W;
- let TSFlags{39} = hasVEX_4V;
- let TSFlags{40} = hasVEX_L;
- let TSFlags{41} = hasEVEX_K;
- let TSFlags{42} = hasEVEX_Z;
- let TSFlags{43} = hasEVEX_L2;
- let TSFlags{44} = hasEVEX_B;
+ let TSFlags{39} = HasVEX_W;
+ let TSFlags{40} = hasVEX_4V;
+ let TSFlags{41} = hasVEX_L;
+ let TSFlags{42} = hasEVEX_K;
+ let TSFlags{43} = hasEVEX_Z;
+ let TSFlags{44} = hasEVEX_L2;
+ let TSFlags{45} = hasEVEX_B;
// If we run out of TSFlags bits, it's possible to encode this in 3 bits.
- let TSFlags{51-45} = CD8_Scale;
- let TSFlags{52} = hasEVEX_RC;
- let TSFlags{53} = hasNoTrackPrefix;
- let TSFlags{54} = ExplicitVEXPrefix;
+ let TSFlags{52-46} = CD8_Scale;
+ let TSFlags{53} = hasEVEX_RC;
+ let TSFlags{54} = hasNoTrackPrefix;
+ let TSFlags{55} = ExplicitVEXPrefix;
}
class PseudoI<dag oops, dag iops, list<dag> pattern>
@@ -738,18 +752,19 @@ class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
: I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
Requires<[UseSSE42]>;
-// SS42FI - SSE 4.2 instructions with T8XD prefix.
-// NOTE: 'HasSSE42' is used as SS42FI is only used for CRC32 insns.
-class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern>, T8XD, Requires<[HasSSE42]>;
-
// SS42AI = SSE 4.2 instructions with TA prefix
class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
: Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
Requires<[UseSSE42]>;
+// CRC32I - SSE 4.2 CRC32 instructions.
+// NOTE: 'HasCRC32' is used as CRC32 instructions are GPR only and not directly
+// controlled by the SSE42 flag.
+class CRC32I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8XD, Requires<[HasCRC32]>;
+
// AVX Instruction Templates:
// Instructions introduced in AVX (no SSE equivalent forms)
//
@@ -870,7 +885,6 @@ class AVX512FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
: I<o, F, outs, ins, asm, pattern>, T8PD,
EVEX_4V, Requires<[HasAVX512]>;
-class AVX512FMA3Base : T8PD, EVEX_4V;
class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
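
Because the opcode-map field grew from 3 to 4 bits, every TSFlags field after it moves up by one bit in the layout above, and the C++ side that decodes TSFlags has to agree bit for bit with the new positions (OpMap now in bits 16-13, REX.W at bit 17, and so on). Below is a minimal sketch of that agreement, using the bit positions taken from the hunk above; the constant and accessor names are hypothetical, not the ones the backend actually defines:

// Minimal sketch: packing and decoding the widened OpMap field. Bit positions
// come from the TSFlags assignments above (OpMapBits in TSFlags{16-13},
// hasREX_WPrefix in TSFlags{17}); all names here are illustrative.
#include <cassert>
#include <cstdint>

namespace sketch {
constexpr unsigned OpMapShift = 13;                   // TSFlags{16-13}
constexpr uint64_t OpMapMask  = 0xFULL << OpMapShift; // now 4 bits wide
constexpr unsigned REXWShift  = OpMapShift + 4;       // was OpMapShift + 3

constexpr uint64_t getOpMap(uint64_t TSFlags) {
  return (TSFlags & OpMapMask) >> OpMapShift;
}
} // namespace sketch

int main() {
  uint64_t Flags = 0;
  Flags |= uint64_t{9} << sketch::OpMapShift; // T_MAP6 has Map value 9 above
  Flags |= uint64_t{1} << sketch::REXWShift;  // hasREX_WPrefix
  assert(sketch::getOpMap(Flags) == 9);
  return 0;
}
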
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 777c5a158b4c..166f1f8c3251 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -130,14 +130,12 @@ def X86vmtruncs : SDNode<"X86ISD::VMTRUNCS", SDTVmtrunc>;
def X86vmtruncus : SDNode<"X86ISD::VMTRUNCUS", SDTVmtrunc>;
def X86vfpext : SDNode<"X86ISD::VFPEXT",
- SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>,
- SDTCVecEltisVT<1, f32>,
- SDTCisSameSizeAs<0, 1>]>>;
+ SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisFP<1>, SDTCisVec<1>]>>;
def X86strict_vfpext : SDNode<"X86ISD::STRICT_VFPEXT",
- SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>,
- SDTCVecEltisVT<1, f32>,
- SDTCisSameSizeAs<0, 1>]>,
+ SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisFP<1>, SDTCisVec<1>]>,
[SDNPHasChain]>;
def X86any_vfpext : PatFrags<(ops node:$src),
@@ -145,13 +143,13 @@ def X86any_vfpext : PatFrags<(ops node:$src),
(X86vfpext node:$src)]>;
def X86vfpround: SDNode<"X86ISD::VFPROUND",
- SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
- SDTCVecEltisVT<1, f64>,
+ SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisFP<1>, SDTCisVec<1>,
SDTCisOpSmallerThanOp<0, 1>]>>;
def X86strict_vfpround: SDNode<"X86ISD::STRICT_VFPROUND",
- SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
- SDTCVecEltisVT<1, f64>,
+ SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisFP<1>, SDTCisVec<1>,
SDTCisOpSmallerThanOp<0, 1>]>,
[SDNPHasChain]>;
@@ -160,33 +158,32 @@ def X86any_vfpround : PatFrags<(ops node:$src),
(X86vfpround node:$src)]>;
def X86frounds : SDNode<"X86ISD::VFPROUNDS",
- SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
+ SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisVec<0>,
SDTCisSameAs<0, 1>,
- SDTCVecEltisVT<2, f64>,
+ SDTCisFP<2>, SDTCisVec<2>,
SDTCisSameSizeAs<0, 2>]>>;
def X86froundsRnd: SDNode<"X86ISD::VFPROUNDS_RND",
- SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>,
+ SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>,
SDTCisSameAs<0, 1>,
- SDTCVecEltisVT<2, f64>,
+ SDTCisFP<2>, SDTCisVec<2>,
SDTCisSameSizeAs<0, 2>,
SDTCisVT<3, i32>]>>;
def X86fpexts : SDNode<"X86ISD::VFPEXTS",
- SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>,
+ SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisVec<0>,
SDTCisSameAs<0, 1>,
- SDTCVecEltisVT<2, f32>,
+ SDTCisFP<2>, SDTCisVec<2>,
SDTCisSameSizeAs<0, 2>]>>;
def X86fpextsSAE : SDNode<"X86ISD::VFPEXTS_SAE",
- SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>,
+ SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisVec<0>,
SDTCisSameAs<0, 1>,
- SDTCVecEltisVT<2, f32>,
+ SDTCisFP<2>, SDTCisVec<2>,
SDTCisSameSizeAs<0, 2>]>>;
def X86vmfpround: SDNode<"X86ISD::VMFPROUND",
- SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>,
- SDTCVecEltisVT<1, f64>,
- SDTCisSameSizeAs<0, 1>,
+ SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisFP<1>, SDTCisVec<1>,
SDTCisSameAs<0, 2>,
SDTCVecEltisVT<3, i1>,
SDTCisSameNumEltsAs<1, 3>]>>;
@@ -417,6 +414,11 @@ def X86Movss : SDNode<"X86ISD::MOVSS",
SDTCisVT<1, v4f32>,
SDTCisVT<2, v4f32>]>>;
+def X86Movsh : SDNode<"X86ISD::MOVSH",
+ SDTypeProfile<1, 2, [SDTCisVT<0, v8f16>,
+ SDTCisVT<1, v8f16>,
+ SDTCisVT<2, v8f16>]>>;
+
def X86Movlhps : SDNode<"X86ISD::MOVLHPS",
SDTypeProfile<1, 2, [SDTCisVT<0, v4f32>,
SDTCisVT<1, v4f32>,
@@ -570,6 +572,24 @@ def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>,
def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>;
def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma, [SDNPCommutative]>;
+def x86vfmaddc : SDNode<"X86ISD::VFMADDC", SDTFPTernaryOp, [SDNPCommutative]>;
+def x86vfmaddcRnd : SDNode<"X86ISD::VFMADDC_RND", SDTFmaRound, [SDNPCommutative]>;
+def x86vfcmaddc : SDNode<"X86ISD::VFCMADDC", SDTFPTernaryOp>;
+def x86vfcmaddcRnd : SDNode<"X86ISD::VFCMADDC_RND", SDTFmaRound>;
+def x86vfmulc : SDNode<"X86ISD::VFMULC", SDTFPBinOp, [SDNPCommutative]>;
+def x86vfmulcRnd : SDNode<"X86ISD::VFMULC_RND", SDTFPBinOpRound, [SDNPCommutative]>;
+def x86vfcmulc : SDNode<"X86ISD::VFCMULC", SDTFPBinOp>;
+def x86vfcmulcRnd : SDNode<"X86ISD::VFCMULC_RND", SDTFPBinOpRound>;
+
+def x86vfmaddcSh : SDNode<"X86ISD::VFMADDCSH", SDTFPTernaryOp, [SDNPCommutative]>;
+def x86vfcmaddcSh : SDNode<"X86ISD::VFCMADDCSH", SDTFPTernaryOp>;
+def x86vfmulcSh : SDNode<"X86ISD::VFMULCSH", SDTFPBinOp, [SDNPCommutative]>;
+def x86vfcmulcSh : SDNode<"X86ISD::VFCMULCSH", SDTFPBinOp>;
+def x86vfmaddcShRnd : SDNode<"X86ISD::VFMADDCSH_RND", SDTFmaRound, [SDNPCommutative]>;
+def x86vfcmaddcShRnd : SDNode<"X86ISD::VFCMADDCSH_RND",SDTFmaRound>;
+def x86vfmulcShRnd : SDNode<"X86ISD::VFMULCSH_RND", SDTFPBinOpRound, [SDNPCommutative]>;
+def x86vfcmulcShRnd : SDNode<"X86ISD::VFCMULCSH_RND", SDTFPBinOpRound>;
+
def X86rsqrt14 : SDNode<"X86ISD::RSQRT14", SDTFPUnaryOp>;
def X86rcp14 : SDNode<"X86ISD::RCP14", SDTFPUnaryOp>;
@@ -704,7 +724,6 @@ def X86cvtp2UInt : SDNode<"X86ISD::CVTP2UI", SDTFloatToInt>;
// Masked versions of above
def SDTMVintToFP: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisFP<0>, SDTCisInt<1>,
- SDTCisSameSizeAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCVecEltisVT<3, i1>,
SDTCisSameNumEltsAs<1, 3>]>;
@@ -752,12 +771,12 @@ def X86mcvtps2ph : SDNode<"X86ISD::MCVTPS2PH",
SDTCVecEltisVT<4, i1>,
SDTCisSameNumEltsAs<1, 4>]> >;
def X86vfpextSAE : SDNode<"X86ISD::VFPEXT_SAE",
- SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>,
- SDTCVecEltisVT<1, f32>,
+ SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisFP<1>, SDTCisVec<1>,
SDTCisOpSmallerThanOp<1, 0>]>>;
def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND",
- SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
- SDTCVecEltisVT<1, f64>,
+ SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisFP<1>, SDTCisVec<1>,
SDTCisOpSmallerThanOp<0, 1>,
SDTCisVT<2, i32>]>>;
@@ -796,6 +815,7 @@ def SDTX86MaskedStore: SDTypeProfile<0, 3, [ // masked store
//===----------------------------------------------------------------------===//
// 128-bit load pattern fragments
+def loadv8f16 : PatFrag<(ops node:$ptr), (v8f16 (load node:$ptr))>;
def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
@@ -804,6 +824,7 @@ def loadv8i16 : PatFrag<(ops node:$ptr), (v8i16 (load node:$ptr))>;
def loadv16i8 : PatFrag<(ops node:$ptr), (v16i8 (load node:$ptr))>;
// 256-bit load pattern fragments
+def loadv16f16 : PatFrag<(ops node:$ptr), (v16f16 (load node:$ptr))>;
def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
@@ -812,6 +833,7 @@ def loadv16i16 : PatFrag<(ops node:$ptr), (v16i16 (load node:$ptr))>;
def loadv32i8 : PatFrag<(ops node:$ptr), (v32i8 (load node:$ptr))>;
// 512-bit load pattern fragments
+def loadv32f16 : PatFrag<(ops node:$ptr), (v32f16 (load node:$ptr))>;
def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
@@ -823,6 +845,10 @@ def loadv64i8 : PatFrag<(ops node:$ptr), (v64i8 (load node:$ptr))>;
def extloadv2f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>;
def extloadv4f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>;
def extloadv8f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>;
+def extloadv2f16 : PatFrag<(ops node:$ptr), (extloadvf16 node:$ptr)>;
+def extloadv4f16 : PatFrag<(ops node:$ptr), (extloadvf16 node:$ptr)>;
+def extloadv8f16 : PatFrag<(ops node:$ptr), (extloadvf16 node:$ptr)>;
+def extloadv16f16 : PatFrag<(ops node:$ptr), (extloadvf16 node:$ptr)>;
// Like 'store', but always requires vector size alignment.
def alignedstore : PatFrag<(ops node:$val, node:$ptr),
@@ -839,6 +865,8 @@ def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
// 128-bit aligned load pattern fragments
// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def alignedloadv8f16 : PatFrag<(ops node:$ptr),
+ (v8f16 (alignedload node:$ptr))>;
def alignedloadv4f32 : PatFrag<(ops node:$ptr),
(v4f32 (alignedload node:$ptr))>;
def alignedloadv2f64 : PatFrag<(ops node:$ptr),
@@ -854,6 +882,8 @@ def alignedloadv16i8 : PatFrag<(ops node:$ptr),
// 256-bit aligned load pattern fragments
// NOTE: all 256-bit integer vector loads are promoted to v4i64
+def alignedloadv16f16 : PatFrag<(ops node:$ptr),
+ (v16f16 (alignedload node:$ptr))>;
def alignedloadv8f32 : PatFrag<(ops node:$ptr),
(v8f32 (alignedload node:$ptr))>;
def alignedloadv4f64 : PatFrag<(ops node:$ptr),
@@ -868,6 +898,8 @@ def alignedloadv32i8 : PatFrag<(ops node:$ptr),
(v32i8 (alignedload node:$ptr))>;
// 512-bit aligned load pattern fragments
+def alignedloadv32f16 : PatFrag<(ops node:$ptr),
+ (v32f16 (alignedload node:$ptr))>;
def alignedloadv16f32 : PatFrag<(ops node:$ptr),
(v16f32 (alignedload node:$ptr))>;
def alignedloadv8f64 : PatFrag<(ops node:$ptr),
@@ -926,6 +958,11 @@ def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>;
def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>;
def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>;
+def X86vzload16 : PatFrag<(ops node:$src),
+ (X86vzld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 2;
+}]>;
+
def X86vzload32 : PatFrag<(ops node:$src),
(X86vzld node:$src), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 4;
@@ -976,6 +1013,10 @@ def X86SubVBroadcastld256 : PatFrag<(ops node:$src),
// only load a single element.
// FIXME: We should add more canonicalizing in DAGCombine. Particularly removing
// the simple_load case.
+def sse_load_f16 : PatFrags<(ops node:$ptr),
+ [(v8f16 (simple_load node:$ptr)),
+ (v8f16 (X86vzload16 node:$ptr)),
+ (v8f16 (scalar_to_vector (loadf16 node:$ptr)))]>;
def sse_load_f32 : PatFrags<(ops node:$ptr),
[(v4f32 (simple_load node:$ptr)),
(v4f32 (X86vzload32 node:$ptr)),
@@ -985,9 +1026,13 @@ def sse_load_f64 : PatFrags<(ops node:$ptr),
(v2f64 (X86vzload64 node:$ptr)),
(v2f64 (scalar_to_vector (loadf64 node:$ptr)))]>;
+def shmem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>;
def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
def sdmem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
+def fp16imm0 : PatLeaf<(f16 fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
def fp32imm0 : PatLeaf<(f32 fpimm), [{
return N->isExactlyValue(+0.0);
@@ -1013,6 +1058,12 @@ def INSERT_get_vinsert128_imm : SDNodeXForm<insert_subvector, [{
return getInsertVINSERTImmediate(N, 128, SDLoc(N));
}]>;
+// INSERT_get_vperm2x128_imm xform function: convert insert_subvector index to
+// commuted VPERM2F128/VPERM2I128 imm.
+def INSERT_get_vperm2x128_commutedimm : SDNodeXForm<insert_subvector, [{
+ return getPermuteVINSERTCommutedImmediate(N, 128, SDLoc(N));
+}]>;
+
// EXTRACT_get_vextract256_imm xform function: convert extract_subvector index
// to VEXTRACTF64x4 imm.
def EXTRACT_get_vextract256_imm : SDNodeXForm<extract_subvector, [{
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 12a2d92fd888..639aa5199ea5 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -19,6 +19,7 @@
#include "X86TargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
+#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -82,7 +83,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
(STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
: X86::ADJCALLSTACKUP32),
X86::CATCHRET,
- (STI.is64Bit() ? X86::RETQ : X86::RETL)),
+ (STI.is64Bit() ? X86::RET64 : X86::RET32)),
Subtarget(STI), RI(STI.getTargetTriple()) {
}
@@ -699,6 +700,8 @@ static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
return true;
case X86::MOV16rm:
case X86::KMOVWkm:
+ case X86::VMOVSHZrm:
+ case X86::VMOVSHZrm_alt:
MemBytes = 2;
return true;
case X86::MOV32rm:
@@ -795,6 +798,7 @@ static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) {
return true;
case X86::MOV16mr:
case X86::KMOVWmk:
+ case X86::VMOVSHZmr:
MemBytes = 2;
return true;
case X86::MOV32mr:
@@ -980,6 +984,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case X86::AVX512_512_SET0:
case X86::AVX512_512_SETALLONES:
case X86::AVX512_FsFLD0SD:
+ case X86::AVX512_FsFLD0SH:
case X86::AVX512_FsFLD0SS:
case X86::AVX512_FsFLD0F128:
case X86::AVX_SET0:
@@ -1047,6 +1052,8 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case X86::VMOVSSZrm_alt:
case X86::VMOVSDZrm:
case X86::VMOVSDZrm_alt:
+ case X86::VMOVSHZrm:
+ case X86::VMOVSHZrm_alt:
case X86::VMOVAPDZ128rm:
case X86::VMOVAPDZ256rm:
case X86::VMOVAPDZrm:
@@ -1189,7 +1196,7 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
unsigned Opc, bool AllowSP, Register &NewSrc,
bool &isKill, MachineOperand &ImplicitOp,
- LiveVariables *LV) const {
+ LiveVariables *LV, LiveIntervals *LIS) const {
MachineFunction &MF = *MI.getParent()->getParent();
const TargetRegisterClass *RC;
if (AllowSP) {
@@ -1199,12 +1206,12 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
&X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
}
Register SrcReg = Src.getReg();
+ isKill = MI.killsRegister(SrcReg);
// For both LEA64 and LEA32 the register already has essentially the right
// type (32-bit or 64-bit) we may just need to forbid SP.
if (Opc != X86::LEA64_32r) {
NewSrc = SrcReg;
- isKill = Src.isKill();
assert(!Src.isUndef() && "Undef op doesn't need optimization");
if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
@@ -1219,8 +1226,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
ImplicitOp = Src;
ImplicitOp.setImplicit();
- NewSrc = getX86SubSuperRegister(Src.getReg(), 64);
- isKill = Src.isKill();
+ NewSrc = getX86SubSuperRegister(SrcReg, 64);
assert(!Src.isUndef() && "Undef op doesn't need optimization");
} else {
// Virtual register of the wrong class, we have to create a temporary 64-bit
@@ -1229,24 +1235,36 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
MachineInstr *Copy =
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
.addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
- .add(Src);
+ .addReg(SrcReg, getKillRegState(isKill));
// Which is obviously going to be dead after we're done with it.
isKill = true;
if (LV)
LV->replaceKillInstruction(SrcReg, MI, *Copy);
+
+ if (LIS) {
+ SlotIndex CopyIdx = LIS->InsertMachineInstrInMaps(*Copy);
+ SlotIndex Idx = LIS->getInstructionIndex(MI);
+ LiveInterval &LI = LIS->getInterval(SrcReg);
+ LiveRange::Segment *S = LI.getSegmentContaining(Idx);
+ if (S->end.getBaseIndex() == Idx)
+ S->end = CopyIdx.getRegSlot();
+ }
}
// We've set all the parameters without issue.
return true;
}
-MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
- unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI,
- LiveVariables *LV, bool Is8BitOp) const {
+MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
+ MachineInstr &MI,
+ LiveVariables *LV,
+ LiveIntervals *LIS,
+ bool Is8BitOp) const {
// We handle 8-bit adds and various 16-bit opcodes in the switch below.
- MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
assert((Is8BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
*RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
"Unexpected type for LEA transform");
@@ -1264,6 +1282,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
unsigned Opcode = X86::LEA64_32r;
Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+ Register InRegLEA2;
// Build and insert into an implicit UNDEF value. This is OK because
// we will be shifting and then extracting the lower 8/16-bits.
@@ -1275,18 +1294,22 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
MachineBasicBlock::iterator MBBI = MI.getIterator();
Register Dest = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
+ Register Src2;
bool IsDead = MI.getOperand(0).isDead();
bool IsKill = MI.getOperand(1).isKill();
unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
- BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
+ MachineInstr *ImpDef =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
MachineInstr *InsMI =
- BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
.addReg(InRegLEA, RegState::Define, SubReg)
.addReg(Src, getKillRegState(IsKill));
+ MachineInstr *ImpDef2 = nullptr;
+ MachineInstr *InsMI2 = nullptr;
MachineInstrBuilder MIB =
- BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
switch (MIOpc) {
default: llvm_unreachable("Unreachable!");
case X86::SHL8ri:
@@ -1316,11 +1339,9 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
case X86::ADD8rr_DB:
case X86::ADD16rr:
case X86::ADD16rr_DB: {
- Register Src2 = MI.getOperand(2).getReg();
+ Src2 = MI.getOperand(2).getReg();
bool IsKill2 = MI.getOperand(2).isKill();
assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
- unsigned InRegLEA2 = 0;
- MachineInstr *InsMI2 = nullptr;
if (Src == Src2) {
// ADD8rr/ADD16rr killed %reg1028, %reg1028
// just a single insert_subreg.
@@ -1332,8 +1353,9 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
// Build and insert into an implicit UNDEF value. This is OK because
// we will be shifting and then extracting the lower 8/16-bits.
- BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA2);
- InsMI2 = BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
+ ImpDef2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF),
+ InRegLEA2);
+ InsMI2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
.addReg(InRegLEA2, RegState::Define, SubReg)
.addReg(Src2, getKillRegState(IsKill2));
addRegReg(MIB, InRegLEA, true, InRegLEA2, true);
@@ -1346,7 +1368,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
MachineInstr *NewMI = MIB;
MachineInstr *ExtMI =
- BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
.addReg(Dest, RegState::Define | getDeadRegState(IsDead))
.addReg(OutRegLEA, RegState::Kill, SubReg);
@@ -1360,6 +1382,45 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
LV->replaceKillInstruction(Dest, MI, *ExtMI);
}
+ if (LIS) {
+ LIS->InsertMachineInstrInMaps(*ImpDef);
+ SlotIndex InsIdx = LIS->InsertMachineInstrInMaps(*InsMI);
+ if (ImpDef2)
+ LIS->InsertMachineInstrInMaps(*ImpDef2);
+ SlotIndex Ins2Idx;
+ if (InsMI2)
+ Ins2Idx = LIS->InsertMachineInstrInMaps(*InsMI2);
+ SlotIndex NewIdx = LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
+ SlotIndex ExtIdx = LIS->InsertMachineInstrInMaps(*ExtMI);
+ LIS->getInterval(InRegLEA);
+ LIS->getInterval(OutRegLEA);
+ if (InRegLEA2)
+ LIS->getInterval(InRegLEA2);
+
+ // Move the use of Src up to InsMI.
+ LiveInterval &SrcLI = LIS->getInterval(Src);
+ LiveRange::Segment *SrcSeg = SrcLI.getSegmentContaining(NewIdx);
+ if (SrcSeg->end == NewIdx.getRegSlot())
+ SrcSeg->end = InsIdx.getRegSlot();
+
+ if (InsMI2) {
+ // Move the use of Src2 up to InsMI2.
+ LiveInterval &Src2LI = LIS->getInterval(Src2);
+ LiveRange::Segment *Src2Seg = Src2LI.getSegmentContaining(NewIdx);
+ if (Src2Seg->end == NewIdx.getRegSlot())
+ Src2Seg->end = Ins2Idx.getRegSlot();
+ }
+
+ // Move the definition of Dest down to ExtMI.
+ LiveInterval &DestLI = LIS->getInterval(Dest);
+ LiveRange::Segment *DestSeg =
+ DestLI.getSegmentContaining(NewIdx.getRegSlot());
+ assert(DestSeg->start == NewIdx.getRegSlot() &&
+ DestSeg->valno->def == NewIdx.getRegSlot());
+ DestSeg->start = ExtIdx.getRegSlot();
+ DestSeg->valno->def = ExtIdx.getRegSlot();
+ }
+
return ExtMI;
}
@@ -1373,9 +1434,9 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
/// This method returns a null pointer if the transformation cannot be
/// performed, otherwise it returns the new instruction.
///
-MachineInstr *
-X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineInstr &MI, LiveVariables *LV) const {
+MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
+ LiveVariables *LV,
+ LiveIntervals *LIS) const {
// The following opcodes also set the condition code register(s). Only
// convert them to equivalent lea if the condition code register defs
// are dead!
@@ -1398,6 +1459,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return nullptr;
MachineInstr *NewMI = nullptr;
+ Register SrcReg, SrcReg2;
bool Is64Bit = Subtarget.is64Bit();
bool Is8BitOp = false;
@@ -1432,10 +1494,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
// LEA can't handle ESP.
bool isKill;
- Register SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
- SrcReg, isKill, ImplicitOp, LV))
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
+ ImplicitOp, LV, LIS))
return nullptr;
MachineInstrBuilder MIB =
@@ -1460,7 +1521,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
if (!isTruncatedShiftCountForLEA(ShAmt))
return nullptr;
- return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
+ return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
}
case X86::INC64r:
case X86::INC32r: {
@@ -1468,10 +1529,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r :
(Is64Bit ? X86::LEA64_32r : X86::LEA32r);
bool isKill;
- Register SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill,
- ImplicitOp, LV))
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
+ ImplicitOp, LV, LIS))
return nullptr;
MachineInstrBuilder MIB =
@@ -1491,10 +1551,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
: (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
bool isKill;
- Register SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill,
- ImplicitOp, LV))
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
+ ImplicitOp, LV, LIS))
return nullptr;
MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
@@ -1513,7 +1572,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
LLVM_FALLTHROUGH;
case X86::DEC16r:
case X86::INC16r:
- return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
+ return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
case X86::ADD64rr:
case X86::ADD64rr_DB:
case X86::ADD32rr:
@@ -1525,21 +1584,26 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
else
Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
- bool isKill;
- Register SrcReg;
- MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
- SrcReg, isKill, ImplicitOp, LV))
- return nullptr;
-
const MachineOperand &Src2 = MI.getOperand(2);
bool isKill2;
- Register SrcReg2;
MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
- SrcReg2, isKill2, ImplicitOp2, LV))
+ if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/false, SrcReg2, isKill2,
+ ImplicitOp2, LV, LIS))
return nullptr;
+ bool isKill;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (Src.getReg() == Src2.getReg()) {
+ // Don't call classifyLEAReg a second time on the same register, in case
+ // the first call inserted a COPY from Src2 and marked it as killed.
+ isKill = isKill2;
+ SrcReg = SrcReg2;
+ } else {
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
+ ImplicitOp, LV, LIS))
+ return nullptr;
+ }
+
MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
if (ImplicitOp.getReg() != 0)
MIB.add(ImplicitOp);
@@ -1557,7 +1621,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
LLVM_FALLTHROUGH;
case X86::ADD16rr:
case X86::ADD16rr_DB:
- return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
+ return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
case X86::ADD64ri32:
case X86::ADD64ri8:
case X86::ADD64ri32_DB:
@@ -1575,10 +1639,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
bool isKill;
- Register SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
- SrcReg, isKill, ImplicitOp, LV))
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
+ ImplicitOp, LV, LIS))
return nullptr;
MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
@@ -1598,7 +1661,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::ADD16ri8:
case X86::ADD16ri_DB:
case X86::ADD16ri8_DB:
- return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
+ return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
case X86::SUB8ri:
case X86::SUB16ri8:
case X86::SUB16ri:
@@ -1616,10 +1679,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
bool isKill;
- Register SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
- SrcReg, isKill, ImplicitOp, LV))
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
+ ImplicitOp, LV, LIS))
return nullptr;
MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
@@ -1806,7 +1868,17 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
LV->replaceKillInstruction(Dest.getReg(), MI, *NewMI);
}
- MFI->insert(MI.getIterator(), NewMI); // Insert the new inst
+ MachineBasicBlock &MBB = *MI.getParent();
+ MBB.insert(MI.getIterator(), NewMI); // Insert the new inst
+
+ if (LIS) {
+ LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
+ if (SrcReg)
+ LIS->getInterval(SrcReg);
+ if (SrcReg2)
+ LIS->getInterval(SrcReg2);
+ }
+
return NewMI;
}
@@ -2235,6 +2307,10 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::VCMPSSZrr:
case X86::VCMPPDZrri:
case X86::VCMPPSZrri:
+ case X86::VCMPSHZrr:
+ case X86::VCMPPHZrri:
+ case X86::VCMPPHZ128rri:
+ case X86::VCMPPHZ256rri:
case X86::VCMPPDZ128rri:
case X86::VCMPPSZ128rri:
case X86::VCMPPDZ256rri:
@@ -2481,6 +2557,10 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
case X86::VCMPSSZrr:
case X86::VCMPPDZrri:
case X86::VCMPPSZrri:
+ case X86::VCMPSHZrr:
+ case X86::VCMPPHZrri:
+ case X86::VCMPPHZ128rri:
+ case X86::VCMPPHZ256rri:
case X86::VCMPPDZ128rri:
case X86::VCMPPSZ128rri:
case X86::VCMPPDZ256rri:
@@ -2606,7 +2686,19 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
case X86::VPMADD52LUQZ256rkz:
case X86::VPMADD52LUQZr:
case X86::VPMADD52LUQZrk:
- case X86::VPMADD52LUQZrkz: {
+ case X86::VPMADD52LUQZrkz:
+ case X86::VFMADDCPHZr:
+ case X86::VFMADDCPHZrk:
+ case X86::VFMADDCPHZrkz:
+ case X86::VFMADDCPHZ128r:
+ case X86::VFMADDCPHZ128rk:
+ case X86::VFMADDCPHZ128rkz:
+ case X86::VFMADDCPHZ256r:
+ case X86::VFMADDCPHZ256rk:
+ case X86::VFMADDCPHZ256rkz:
+ case X86::VFMADDCSHZr:
+ case X86::VFMADDCSHZrk:
+ case X86::VFMADDCSHZrkz: {
unsigned CommutableOpIdx1 = 2;
unsigned CommutableOpIdx2 = 3;
if (X86II::isKMasked(Desc.TSFlags)) {
@@ -2834,11 +2926,6 @@ X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
return std::make_pair(CC, NeedSwap);
}
-/// Return a setcc opcode based on whether it has memory operand.
-unsigned X86::getSETOpc(bool HasMemoryOperand) {
- return HasMemoryOperand ? X86::SETCCr : X86::SETCCm;
-}
-
/// Return a cmov opcode for the given register size in bytes, and operand type.
unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) {
switch(RegBytes) {
@@ -2919,6 +3006,23 @@ unsigned X86::getSwappedVCMPImm(unsigned Imm) {
return Imm;
}
+/// Return true if Reg is an X87 register.
+static bool isX87Reg(unsigned Reg) {
+ return (Reg == X86::FPCW || Reg == X86::FPSW ||
+ (Reg >= X86::ST0 && Reg <= X86::ST7));
+}
+
+/// Check if the instruction is an X87 instruction.
+bool X86::isX87Instruction(MachineInstr &MI) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ if (isX87Reg(MO.getReg()))
+ return true;
+ }
+ return false;
+}
+
bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case X86::TCRETURNdi:
@@ -3018,13 +3122,13 @@ static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB,
// and fallthrough MBB. If we find more than one, we cannot identify the
// fallthrough MBB and should return nullptr.
MachineBasicBlock *FallthroughBB = nullptr;
- for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
- if ((*SI)->isEHPad() || (*SI == TBB && FallthroughBB))
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ if (Succ->isEHPad() || (Succ == TBB && FallthroughBB))
continue;
// Return a nullptr if we found more than one fallthrough successor.
if (FallthroughBB && FallthroughBB != TBB)
return nullptr;
- FallthroughBB = *SI;
+ FallthroughBB = Succ;
}
return FallthroughBB;
}
@@ -3228,13 +3332,13 @@ bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
MachineInstr *ConditionDef = nullptr;
bool SingleUseCondition = true;
- for (auto I = std::next(MBB.rbegin()), E = MBB.rend(); I != E; ++I) {
- if (I->modifiesRegister(X86::EFLAGS, TRI)) {
- ConditionDef = &*I;
+ for (MachineInstr &MI : llvm::drop_begin(llvm::reverse(MBB))) {
+ if (MI.modifiesRegister(X86::EFLAGS, TRI)) {
+ ConditionDef = &MI;
break;
}
- if (I->readsRegister(X86::EFLAGS, TRI))
+ if (MI.readsRegister(X86::EFLAGS, TRI))
SingleUseCondition = false;
}
@@ -3605,6 +3709,10 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
case 2:
if (X86::VK16RegClass.hasSubClassEq(RC))
return load ? X86::KMOVWkm : X86::KMOVWmk;
+ if (X86::FR16XRegClass.hasSubClassEq(RC)) {
+ assert(STI.hasFP16());
+ return load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
+ }
assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
return load ? X86::MOV16rm : X86::MOV16mr;
case 4:
@@ -3680,12 +3788,6 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
HasAVX ? X86::VMOVUPSmr :
X86::MOVUPSmr);
}
- if (X86::BNDRRegClass.hasSubClassEq(RC)) {
- if (STI.is64Bit())
- return load ? X86::BNDMOV64rm : X86::BNDMOV64mr;
- else
- return load ? X86::BNDMOV32rm : X86::BNDMOV32mr;
- }
llvm_unreachable("Unknown 16-byte regclass");
}
case 32:
@@ -3904,8 +4006,8 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
}
bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
- Register &SrcReg2, int &CmpMask,
- int &CmpValue) const {
+ Register &SrcReg2, int64_t &CmpMask,
+ int64_t &CmpValue) const {
switch (MI.getOpcode()) {
default: break;
case X86::CMP64ri32:
@@ -3984,42 +4086,83 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
return false;
}
-/// Check whether the first instruction, whose only
-/// purpose is to update flags, can be made redundant.
-/// CMPrr can be made redundant by SUBrr if the operands are the same.
-/// This function can be extended later on.
-/// SrcReg, SrcRegs: register operands for FlagI.
-/// ImmValue: immediate for FlagI if it takes an immediate.
-inline static bool isRedundantFlagInstr(const MachineInstr &FlagI,
+bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
Register SrcReg, Register SrcReg2,
- int ImmMask, int ImmValue,
- const MachineInstr &OI) {
- if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) ||
- (FlagI.getOpcode() == X86::CMP32rr && OI.getOpcode() == X86::SUB32rr) ||
- (FlagI.getOpcode() == X86::CMP16rr && OI.getOpcode() == X86::SUB16rr) ||
- (FlagI.getOpcode() == X86::CMP8rr && OI.getOpcode() == X86::SUB8rr)) &&
- ((OI.getOperand(1).getReg() == SrcReg &&
- OI.getOperand(2).getReg() == SrcReg2) ||
- (OI.getOperand(1).getReg() == SrcReg2 &&
- OI.getOperand(2).getReg() == SrcReg)))
- return true;
-
- if (ImmMask != 0 &&
- ((FlagI.getOpcode() == X86::CMP64ri32 &&
- OI.getOpcode() == X86::SUB64ri32) ||
- (FlagI.getOpcode() == X86::CMP64ri8 &&
- OI.getOpcode() == X86::SUB64ri8) ||
- (FlagI.getOpcode() == X86::CMP32ri && OI.getOpcode() == X86::SUB32ri) ||
- (FlagI.getOpcode() == X86::CMP32ri8 &&
- OI.getOpcode() == X86::SUB32ri8) ||
- (FlagI.getOpcode() == X86::CMP16ri && OI.getOpcode() == X86::SUB16ri) ||
- (FlagI.getOpcode() == X86::CMP16ri8 &&
- OI.getOpcode() == X86::SUB16ri8) ||
- (FlagI.getOpcode() == X86::CMP8ri && OI.getOpcode() == X86::SUB8ri)) &&
- OI.getOperand(1).getReg() == SrcReg &&
- OI.getOperand(2).getImm() == ImmValue)
- return true;
- return false;
+ int64_t ImmMask, int64_t ImmValue,
+ const MachineInstr &OI, bool *IsSwapped,
+ int64_t *ImmDelta) const {
+ switch (OI.getOpcode()) {
+ case X86::CMP64rr:
+ case X86::CMP32rr:
+ case X86::CMP16rr:
+ case X86::CMP8rr:
+ case X86::SUB64rr:
+ case X86::SUB32rr:
+ case X86::SUB16rr:
+ case X86::SUB8rr: {
+ Register OISrcReg;
+ Register OISrcReg2;
+ int64_t OIMask;
+ int64_t OIValue;
+ if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
+ OIMask != ImmMask || OIValue != ImmValue)
+ return false;
+ if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
+ *IsSwapped = false;
+ return true;
+ }
+ if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
+ *IsSwapped = true;
+ return true;
+ }
+ return false;
+ }
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP8ri:
+ case X86::SUB64ri32:
+ case X86::SUB64ri8:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB8ri:
+ case X86::TEST64rr:
+ case X86::TEST32rr:
+ case X86::TEST16rr:
+ case X86::TEST8rr: {
+ if (ImmMask != 0) {
+ Register OISrcReg;
+ Register OISrcReg2;
+ int64_t OIMask;
+ int64_t OIValue;
+ if (analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) &&
+ SrcReg == OISrcReg && ImmMask == OIMask) {
+ if (OIValue == ImmValue) {
+ *ImmDelta = 0;
+ return true;
+ } else if (static_cast<uint64_t>(ImmValue) ==
+ static_cast<uint64_t>(OIValue) - 1) {
+ *ImmDelta = -1;
+ return true;
+ } else if (static_cast<uint64_t>(ImmValue) ==
+ static_cast<uint64_t>(OIValue) + 1) {
+ *ImmDelta = 1;
+ return true;
+ } else {
+ return false;
+ }
+ }
+ }
+ return FlagI.isIdenticalTo(OI);
+ }
+ default:
+ return false;
+ }
}
/// Check whether the definition can be converted
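
The immediate cases above also let a later compare reuse flags when its immediate differs from the earlier one by exactly one, reporting the difference through *ImmDelta so the caller can remap the consumer's condition codes. The underlying equivalence, for signed values with the smaller constant still representable, is that "x < C" and "x <= C - 1" test the same condition. A standalone illustration of that equivalence (not code from the patch):

// Standalone illustration (not the patch's code): why an ImmDelta of -1 is
// recoverable. For 32-bit signed X and a constant C with C - 1 representable,
// (X < C) == (X <= C - 1), so a consumer of "cmp $C, %reg ; jl" could instead
// read the flags of an earlier "cmp $(C-1), %reg" and use jle.
#include <cassert>
#include <cstdint>

int main() {
  const int32_t C = 3;
  for (int32_t X : {-1000, -1, 0, 1, 2, 3, 4, 1000}) {
    bool UsesLaterCmp   = (X < C);      // jl against the flags of cmp $3
    bool UsesEarlierCmp = (X <= C - 1); // jle against the flags of cmp $2
    assert(UsesLaterCmp == UsesEarlierCmp);
  }
  return 0;
}
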
@@ -4189,8 +4332,8 @@ static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
/// operates on the same source operands and sets flags in the same way as
/// Compare; remove Compare if possible.
bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
- Register SrcReg2, int CmpMask,
- int CmpValue,
+ Register SrcReg2, int64_t CmpMask,
+ int64_t CmpValue,
const MachineRegisterInfo *MRI) const {
// Check whether we can replace SUB with CMP.
switch (CmpInstr.getOpcode()) {
@@ -4243,114 +4386,117 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
}
}
- // Get the unique definition of SrcReg.
- MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
- if (!MI) return false;
-
- // CmpInstr is the first instruction of the BB.
- MachineBasicBlock::iterator I = CmpInstr, Def = MI;
+ // The following code tries to remove the comparison by re-using EFLAGS
+ // from earlier instructions.
- // If we are comparing against zero, check whether we can use MI to update
- // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
- if (IsCmpZero && MI->getParent() != CmpInstr.getParent())
+
+ // Transformation currently requires SSA values.
+ if (SrcReg2.isPhysical())
return false;
+ MachineInstr *SrcRegDef = MRI->getVRegDef(SrcReg);
+ assert(SrcRegDef && "Must have a definition (SSA)");
- // If we have a use of the source register between the def and our compare
- // instruction we can eliminate the compare iff the use sets EFLAGS in the
- // right way.
- bool ShouldUpdateCC = false;
+ MachineInstr *MI = nullptr;
+ MachineInstr *Sub = nullptr;
+ MachineInstr *Movr0Inst = nullptr;
bool NoSignFlag = false;
bool ClearsOverflowFlag = false;
+ bool ShouldUpdateCC = false;
+ bool IsSwapped = false;
X86::CondCode NewCC = X86::COND_INVALID;
- if (IsCmpZero && !isDefConvertible(*MI, NoSignFlag, ClearsOverflowFlag)) {
- // Scan forward from the use until we hit the use we're looking for or the
- // compare instruction.
- for (MachineBasicBlock::iterator J = MI;; ++J) {
- // Do we have a convertible instruction?
- NewCC = isUseDefConvertible(*J);
- if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() &&
- J->getOperand(1).getReg() == SrcReg) {
- assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!");
- ShouldUpdateCC = true; // Update CC later on.
- // This is not a def of SrcReg, but still a def of EFLAGS. Keep going
- // with the new def.
- Def = J;
- MI = &*Def;
- break;
- }
+ int64_t ImmDelta = 0;
- if (J == I)
+ // Search backward from CmpInstr for the next instruction defining EFLAGS.
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MachineBasicBlock &CmpMBB = *CmpInstr.getParent();
+ MachineBasicBlock::reverse_iterator From =
+ std::next(MachineBasicBlock::reverse_iterator(CmpInstr));
+ for (MachineBasicBlock *MBB = &CmpMBB;;) {
+ for (MachineInstr &Inst : make_range(From, MBB->rend())) {
+ // Try to use EFLAGS from the instruction defining %SrcReg. Example:
+ // %eax = addl ...
+ // ... // EFLAGS not changed
+ // testl %eax, %eax // <-- can be removed
+ if (&Inst == SrcRegDef) {
+ if (IsCmpZero &&
+ isDefConvertible(Inst, NoSignFlag, ClearsOverflowFlag)) {
+ MI = &Inst;
+ break;
+ }
+ // Cannot find other candidates before definition of SrcReg.
return false;
- }
- }
+ }
- // We are searching for an earlier instruction that can make CmpInstr
- // redundant and that instruction will be saved in Sub.
- MachineInstr *Sub = nullptr;
- const TargetRegisterInfo *TRI = &getRegisterInfo();
+ if (Inst.modifiesRegister(X86::EFLAGS, TRI)) {
+ // Try to use EFLAGS produced by an instruction reading %SrcReg.
+ // Example:
+ // %eax = ...
+ // ...
+ // popcntl %eax
+ // ... // EFLAGS not changed
+ // testl %eax, %eax // <-- can be removed
+ if (IsCmpZero) {
+ NewCC = isUseDefConvertible(Inst);
+ if (NewCC != X86::COND_INVALID && Inst.getOperand(1).isReg() &&
+ Inst.getOperand(1).getReg() == SrcReg) {
+ ShouldUpdateCC = true;
+ MI = &Inst;
+ break;
+ }
+ }
- // We iterate backward, starting from the instruction before CmpInstr and
- // stop when reaching the definition of a source register or done with the BB.
- // RI points to the instruction before CmpInstr.
- // If the definition is in this basic block, RE points to the definition;
- // otherwise, RE is the rend of the basic block.
- MachineBasicBlock::reverse_iterator
- RI = ++I.getReverse(),
- RE = CmpInstr.getParent() == MI->getParent()
- ? Def.getReverse() /* points to MI */
- : CmpInstr.getParent()->rend();
- MachineInstr *Movr0Inst = nullptr;
- for (; RI != RE; ++RI) {
- MachineInstr &Instr = *RI;
- // Check whether CmpInstr can be made redundant by the current instruction.
- if (!IsCmpZero && isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask,
- CmpValue, Instr)) {
- Sub = &Instr;
- break;
- }
+ // Try to use EFLAGS from an instruction with similar flag results.
+ // Example:
+ // sub x, y or cmp x, y
+ // ... // EFLAGS not changed
+ // cmp x, y // <-- can be removed
+ if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, CmpValue,
+ Inst, &IsSwapped, &ImmDelta)) {
+ Sub = &Inst;
+ break;
+ }
- if (Instr.modifiesRegister(X86::EFLAGS, TRI) ||
- Instr.readsRegister(X86::EFLAGS, TRI)) {
- // This instruction modifies or uses EFLAGS.
+      // MOV32r0 is implemented with xor, which clobbers the condition code.
+      // It is safe to move it up if the definition of EFLAGS is dead and
+      // earlier instructions do not read or write EFLAGS.
+ if (!Movr0Inst && Inst.getOpcode() == X86::MOV32r0 &&
+ Inst.registerDefIsDead(X86::EFLAGS, TRI)) {
+ Movr0Inst = &Inst;
+ continue;
+ }
- // MOV32r0 etc. are implemented with xor which clobbers condition code.
- // They are safe to move up, if the definition to EFLAGS is dead and
- // earlier instructions do not read or write EFLAGS.
- if (!Movr0Inst && Instr.getOpcode() == X86::MOV32r0 &&
- Instr.registerDefIsDead(X86::EFLAGS, TRI)) {
- Movr0Inst = &Instr;
- continue;
+      // Cannot do anything for any other EFLAGS changes.
+ return false;
}
-
- // We can't remove CmpInstr.
- return false;
}
- }
- // Return false if no candidates exist.
- if (!IsCmpZero && !Sub)
- return false;
+ if (MI || Sub)
+ break;
- bool IsSwapped =
- (SrcReg2 != 0 && Sub && Sub->getOperand(1).getReg() == SrcReg2 &&
- Sub->getOperand(2).getReg() == SrcReg);
+    // Reached the beginning of the basic block. Continue in the predecessor
+    // if there is exactly one.
+ if (MBB->pred_size() != 1)
+ return false;
+ MBB = *MBB->pred_begin();
+ From = MBB->rbegin();
+ }
// Scan forward from the instruction after CmpInstr for uses of EFLAGS.
// It is safe to remove CmpInstr if EFLAGS is redefined or killed.
// If we are done with the basic block, we need to check whether EFLAGS is
// live-out.
- bool IsSafe = false;
+ bool FlagsMayLiveOut = true;
SmallVector<std::pair<MachineInstr*, X86::CondCode>, 4> OpsToUpdate;
- MachineBasicBlock::iterator E = CmpInstr.getParent()->end();
- for (++I; I != E; ++I) {
- const MachineInstr &Instr = *I;
+ MachineBasicBlock::iterator AfterCmpInstr =
+ std::next(MachineBasicBlock::iterator(CmpInstr));
+ for (MachineInstr &Instr : make_range(AfterCmpInstr, CmpMBB.end())) {
bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
// We should check the usage if this instruction uses and updates EFLAGS.
if (!UseEFLAGS && ModifyEFLAGS) {
// It is safe to remove CmpInstr if EFLAGS is updated again.
- IsSafe = true;
+ FlagsMayLiveOut = false;
break;
}
if (!UseEFLAGS && !ModifyEFLAGS)
@@ -4358,7 +4504,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
// EFLAGS is used by this instruction.
X86::CondCode OldCC = X86::COND_INVALID;
- if (IsCmpZero || IsSwapped) {
+ if (MI || IsSwapped || ImmDelta != 0) {
// We decode the condition code from opcode.
if (Instr.isBranch())
OldCC = X86::getCondFromBranch(Instr);
@@ -4370,7 +4516,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (OldCC == X86::COND_INVALID) return false;
}
X86::CondCode ReplacementCC = X86::COND_INVALID;
- if (IsCmpZero) {
+ if (MI) {
switch (OldCC) {
default: break;
case X86::COND_A: case X86::COND_AE:
@@ -4411,43 +4557,97 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
// We swap the condition code and synthesize the new opcode.
ReplacementCC = getSwappedCondition(OldCC);
if (ReplacementCC == X86::COND_INVALID) return false;
+ ShouldUpdateCC = true;
+ } else if (ImmDelta != 0) {
+ unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg));
+        // The register width determines the signed/unsigned min/max constants
+        // used in the wrap-around checks below.
+ switch (OldCC) {
+ case X86::COND_L: // x <s (C + 1) --> x <=s C
+ if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
+ return false;
+ ReplacementCC = X86::COND_LE;
+ break;
+ case X86::COND_B: // x <u (C + 1) --> x <=u C
+ if (ImmDelta != 1 || CmpValue == 0)
+ return false;
+ ReplacementCC = X86::COND_BE;
+ break;
+ case X86::COND_GE: // x >=s (C + 1) --> x >s C
+ if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
+ return false;
+ ReplacementCC = X86::COND_G;
+ break;
+ case X86::COND_AE: // x >=u (C + 1) --> x >u C
+ if (ImmDelta != 1 || CmpValue == 0)
+ return false;
+ ReplacementCC = X86::COND_A;
+ break;
+ case X86::COND_G: // x >s (C - 1) --> x >=s C
+ if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
+ return false;
+ ReplacementCC = X86::COND_GE;
+ break;
+ case X86::COND_A: // x >u (C - 1) --> x >=u C
+ if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
+ return false;
+ ReplacementCC = X86::COND_AE;
+ break;
+ case X86::COND_LE: // x <=s (C - 1) --> x <s C
+ if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
+ return false;
+ ReplacementCC = X86::COND_L;
+ break;
+ case X86::COND_BE: // x <=u (C - 1) --> x <u C
+ if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
+ return false;
+ ReplacementCC = X86::COND_B;
+ break;
+ default:
+ return false;
+ }
+ ShouldUpdateCC = true;
}
- if ((ShouldUpdateCC || IsSwapped) && ReplacementCC != OldCC) {
+ if (ShouldUpdateCC && ReplacementCC != OldCC) {
// Push the MachineInstr to OpsToUpdate.
// If it is safe to remove CmpInstr, the condition code of these
// instructions will be modified.
- OpsToUpdate.push_back(std::make_pair(&*I, ReplacementCC));
+ OpsToUpdate.push_back(std::make_pair(&Instr, ReplacementCC));
}
if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
// It is safe to remove CmpInstr if EFLAGS is updated again or killed.
- IsSafe = true;
+ FlagsMayLiveOut = false;
break;
}
}
- // If EFLAGS is not killed nor re-defined, we should check whether it is
- // live-out. If it is live-out, do not optimize.
- if ((IsCmpZero || IsSwapped) && !IsSafe) {
- MachineBasicBlock *MBB = CmpInstr.getParent();
- for (MachineBasicBlock *Successor : MBB->successors())
+  // If we have to update users but EFLAGS is live-out, abort, since we
+  // cannot easily find all of the users.
+ if (ShouldUpdateCC && FlagsMayLiveOut) {
+ for (MachineBasicBlock *Successor : CmpMBB.successors())
if (Successor->isLiveIn(X86::EFLAGS))
return false;
}
// The instruction to be updated is either Sub or MI.
- Sub = IsCmpZero ? MI : Sub;
+ assert((MI == nullptr || Sub == nullptr) && "Should not have Sub and MI set");
+ Sub = MI != nullptr ? MI : Sub;
+ MachineBasicBlock *SubBB = Sub->getParent();
// Move Movr0Inst to the appropriate place before Sub.
if (Movr0Inst) {
+ // Only move within the same block so we don't accidentally move to a
+ // block with higher execution frequency.
+ if (&CmpMBB != SubBB)
+ return false;
// Look backwards until we find a def that doesn't use the current EFLAGS.
- Def = Sub;
- MachineBasicBlock::reverse_iterator InsertI = Def.getReverse(),
+ MachineBasicBlock::reverse_iterator InsertI = Sub,
InsertE = Sub->getParent()->rend();
for (; InsertI != InsertE; ++InsertI) {
MachineInstr *Instr = &*InsertI;
if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
Instr->modifiesRegister(X86::EFLAGS, TRI)) {
- Sub->getParent()->remove(Movr0Inst);
+ Movr0Inst->getParent()->remove(Movr0Inst);
Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
Movr0Inst);
break;
@@ -4469,6 +4669,13 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
.setImm(Op.second);
}
+  // Add EFLAGS to block live-ins between CmpMBB and the flags producer's block.
+ for (MachineBasicBlock *MBB = &CmpMBB; MBB != SubBB;
+ MBB = *MBB->pred_begin()) {
+ assert(MBB->pred_size() == 1 && "Expected exactly one predecessor");
+ if (!MBB->isLiveIn(X86::EFLAGS))
+ MBB->addLiveIn(X86::EFLAGS);
+ }
return true;
}
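As a sanity check on the new ImmDelta cases above, the following standalone snippet (illustrative only, not part of the patch; the 8-bit narrowing and all names are my own) exhaustively verifies the signed and unsigned identities behind the COND_L and COND_B rewrites, including the wrap-around cases the code rejects:

#include <cassert>
#include <cstdint>

int main() {
  for (int C = INT8_MIN; C <= INT8_MAX; ++C) {
    // The removed CMP's immediate is C + 1; the surviving flag producer
    // compared against C.
    int8_t CPlus1 = static_cast<int8_t>(C + 1);
    uint8_t UC = static_cast<uint8_t>(C);
    uint8_t UCPlus1 = static_cast<uint8_t>(UC + 1);
    for (int X = INT8_MIN; X <= INT8_MAX; ++X) {
      int8_t SX = static_cast<int8_t>(X);
      uint8_t UX = static_cast<uint8_t>(X);
      if (C != INT8_MAX) // CmpValue == signed-min is rejected by the code above
        assert((SX < CPlus1) == (SX <= C)); // x <s (C + 1)  <->  x <=s C
      if (UCPlus1 != 0)  // CmpValue == 0 is rejected by the code above
        assert((UX < UCPlus1) == (UX <= UC)); // x <u (C + 1)  <->  x <=u C
    }
  }
  return 0;
}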
@@ -4755,6 +4962,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
case X86::AVX512_128_SET0:
+ case X86::AVX512_FsFLD0SH:
case X86::AVX512_FsFLD0SS:
case X86::AVX512_FsFLD0SD:
case X86::AVX512_FsFLD0F128: {
@@ -5158,6 +5366,26 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
case X86::VCVTUSI642SDZrr_Int:
case X86::VCVTUSI642SDZrrb_Int:
case X86::VCVTUSI642SDZrm_Int:
+ case X86::VCVTSI2SHZrr:
+ case X86::VCVTSI2SHZrm:
+ case X86::VCVTSI2SHZrr_Int:
+ case X86::VCVTSI2SHZrrb_Int:
+ case X86::VCVTSI2SHZrm_Int:
+ case X86::VCVTSI642SHZrr:
+ case X86::VCVTSI642SHZrm:
+ case X86::VCVTSI642SHZrr_Int:
+ case X86::VCVTSI642SHZrrb_Int:
+ case X86::VCVTSI642SHZrm_Int:
+ case X86::VCVTUSI2SHZrr:
+ case X86::VCVTUSI2SHZrm:
+ case X86::VCVTUSI2SHZrr_Int:
+ case X86::VCVTUSI2SHZrrb_Int:
+ case X86::VCVTUSI2SHZrm_Int:
+ case X86::VCVTUSI642SHZrr:
+ case X86::VCVTUSI642SHZrm:
+ case X86::VCVTUSI642SHZrr_Int:
+ case X86::VCVTUSI642SHZrrb_Int:
+ case X86::VCVTUSI642SHZrm_Int:
    // Load folding won't affect the undef register update since the input is
// a GPR.
return OpNum == 1 && !ForLoadFold;
@@ -5230,6 +5458,29 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
case X86::VRCP14SDZrm:
case X86::VRCP14SSZrr:
case X86::VRCP14SSZrm:
+ case X86::VRCPSHZrr:
+ case X86::VRCPSHZrm:
+ case X86::VRSQRTSHZrr:
+ case X86::VRSQRTSHZrm:
+ case X86::VREDUCESHZrmi:
+ case X86::VREDUCESHZrri:
+ case X86::VREDUCESHZrrib:
+ case X86::VGETEXPSHZr:
+ case X86::VGETEXPSHZrb:
+ case X86::VGETEXPSHZm:
+ case X86::VGETMANTSHZrri:
+ case X86::VGETMANTSHZrrib:
+ case X86::VGETMANTSHZrmi:
+ case X86::VRNDSCALESHZr:
+ case X86::VRNDSCALESHZr_Int:
+ case X86::VRNDSCALESHZrb_Int:
+ case X86::VRNDSCALESHZm:
+ case X86::VRNDSCALESHZm_Int:
+ case X86::VSQRTSHZr:
+ case X86::VSQRTSHZr_Int:
+ case X86::VSQRTSHZrb_Int:
+ case X86::VSQRTSHZm:
+ case X86::VSQRTSHZm_Int:
case X86::VRCP28SDZr:
case X86::VRCP28SDZrb:
case X86::VRCP28SDZm:
@@ -5259,6 +5510,26 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
case X86::VSQRTSDZrb_Int:
case X86::VSQRTSDZm:
case X86::VSQRTSDZm_Int:
+ case X86::VCVTSD2SHZrr:
+ case X86::VCVTSD2SHZrr_Int:
+ case X86::VCVTSD2SHZrrb_Int:
+ case X86::VCVTSD2SHZrm:
+ case X86::VCVTSD2SHZrm_Int:
+ case X86::VCVTSS2SHZrr:
+ case X86::VCVTSS2SHZrr_Int:
+ case X86::VCVTSS2SHZrrb_Int:
+ case X86::VCVTSS2SHZrm:
+ case X86::VCVTSS2SHZrm_Int:
+ case X86::VCVTSH2SDZrr:
+ case X86::VCVTSH2SDZrr_Int:
+ case X86::VCVTSH2SDZrrb_Int:
+ case X86::VCVTSH2SDZrm:
+ case X86::VCVTSH2SDZrm_Int:
+ case X86::VCVTSH2SSZrr:
+ case X86::VCVTSH2SSZrr_Int:
+ case X86::VCVTSH2SSZrrb_Int:
+ case X86::VCVTSH2SSZrm:
+ case X86::VCVTSH2SSZrm_Int:
return OpNum == 1;
case X86::VMOVSSZrrk:
case X86::VMOVSDZrrk:
@@ -6036,6 +6307,49 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
}
}
+ if ((Opc == X86::VMOVSHZrm || Opc == X86::VMOVSHZrm_alt) && RegSize > 16) {
+    // These instructions only load 16 bits, so we cannot fold them if the
+    // destination register is wider than 16 bits (2 bytes) and the user
+    // instruction is not a scalar FP16 (SH) instruction.
+ switch (UserOpc) {
+ case X86::VADDSHZrr_Int:
+ case X86::VCMPSHZrr_Int:
+ case X86::VDIVSHZrr_Int:
+ case X86::VMAXSHZrr_Int:
+ case X86::VMINSHZrr_Int:
+ case X86::VMULSHZrr_Int:
+ case X86::VSUBSHZrr_Int:
+ case X86::VADDSHZrr_Intk: case X86::VADDSHZrr_Intkz:
+ case X86::VCMPSHZrr_Intk:
+ case X86::VDIVSHZrr_Intk: case X86::VDIVSHZrr_Intkz:
+ case X86::VMAXSHZrr_Intk: case X86::VMAXSHZrr_Intkz:
+ case X86::VMINSHZrr_Intk: case X86::VMINSHZrr_Intkz:
+ case X86::VMULSHZrr_Intk: case X86::VMULSHZrr_Intkz:
+ case X86::VSUBSHZrr_Intk: case X86::VSUBSHZrr_Intkz:
+ case X86::VFMADD132SHZr_Int: case X86::VFNMADD132SHZr_Int:
+ case X86::VFMADD213SHZr_Int: case X86::VFNMADD213SHZr_Int:
+ case X86::VFMADD231SHZr_Int: case X86::VFNMADD231SHZr_Int:
+ case X86::VFMSUB132SHZr_Int: case X86::VFNMSUB132SHZr_Int:
+ case X86::VFMSUB213SHZr_Int: case X86::VFNMSUB213SHZr_Int:
+ case X86::VFMSUB231SHZr_Int: case X86::VFNMSUB231SHZr_Int:
+ case X86::VFMADD132SHZr_Intk: case X86::VFNMADD132SHZr_Intk:
+ case X86::VFMADD213SHZr_Intk: case X86::VFNMADD213SHZr_Intk:
+ case X86::VFMADD231SHZr_Intk: case X86::VFNMADD231SHZr_Intk:
+ case X86::VFMSUB132SHZr_Intk: case X86::VFNMSUB132SHZr_Intk:
+ case X86::VFMSUB213SHZr_Intk: case X86::VFNMSUB213SHZr_Intk:
+ case X86::VFMSUB231SHZr_Intk: case X86::VFNMSUB231SHZr_Intk:
+ case X86::VFMADD132SHZr_Intkz: case X86::VFNMADD132SHZr_Intkz:
+ case X86::VFMADD213SHZr_Intkz: case X86::VFNMADD213SHZr_Intkz:
+ case X86::VFMADD231SHZr_Intkz: case X86::VFNMADD231SHZr_Intkz:
+ case X86::VFMSUB132SHZr_Intkz: case X86::VFNMSUB132SHZr_Intkz:
+ case X86::VFMSUB213SHZr_Intkz: case X86::VFNMSUB213SHZr_Intkz:
+ case X86::VFMSUB231SHZr_Intkz: case X86::VFNMSUB231SHZr_Intkz:
+ return false;
+ default:
+ return true;
+ }
+ }
+
return false;
}
@@ -6101,6 +6415,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::AVX512_FsFLD0SS:
Alignment = Align(4);
break;
+ case X86::AVX512_FsFLD0SH:
+ Alignment = Align(2);
+ break;
default:
return nullptr;
}
@@ -6136,6 +6453,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::AVX512_256_SET0:
case X86::AVX512_512_SET0:
case X86::AVX512_512_SETALLONES:
+ case X86::AVX512_FsFLD0SH:
case X86::FsFLD0SD:
case X86::AVX512_FsFLD0SD:
case X86::FsFLD0SS:
@@ -6174,6 +6492,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
Ty = Type::getDoubleTy(MF.getFunction().getContext());
else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128)
Ty = Type::getFP128Ty(MF.getFunction().getContext());
+ else if (Opc == X86::AVX512_FsFLD0SH)
+ Ty = Type::getHalfTy(MF.getFunction().getContext());
else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
16);
@@ -8384,6 +8704,14 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
case X86::VMINCSSrr:
case X86::VMINCSDZrr:
case X86::VMINCSSZrr:
+ case X86::VMAXCPHZ128rr:
+ case X86::VMAXCPHZ256rr:
+ case X86::VMAXCPHZrr:
+ case X86::VMAXCSHZrr:
+ case X86::VMINCPHZ128rr:
+ case X86::VMINCPHZ256rr:
+ case X86::VMINCPHZrr:
+ case X86::VMINCSHZrr:
return true;
case X86::ADDPDrr:
case X86::ADDPSrr:
@@ -8421,6 +8749,14 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
case X86::VMULSSrr:
case X86::VMULSDZrr:
case X86::VMULSSZrr:
+ case X86::VADDPHZ128rr:
+ case X86::VADDPHZ256rr:
+ case X86::VADDPHZrr:
+ case X86::VADDSHZrr:
+ case X86::VMULPHZ128rr:
+ case X86::VMULPHZ256rr:
+ case X86::VMULPHZrr:
+ case X86::VMULSHZrr:
return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
Inst.getFlag(MachineInstr::MIFlag::FmNsz);
default:
@@ -8667,6 +9003,7 @@ X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
{MO_GOT, "x86-got"},
{MO_GOTOFF, "x86-gotoff"},
{MO_GOTPCREL, "x86-gotpcrel"},
+ {MO_GOTPCREL_NORELAX, "x86-gotpcrel-norelax"},
{MO_PLT, "x86-plt"},
{MO_TLSGD, "x86-tlsgd"},
{MO_TLSLD, "x86-tlsld"},
@@ -8966,13 +9303,8 @@ outliner::OutlinedFunction X86InstrInfo::getOutliningCandidateInfo(
MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front();
for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx();
Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) {
- const std::vector<MCCFIInstruction> &CFIInstructions =
- RepeatedSequenceLocs[0].getMF()->getFrameInstructions();
- if (MBBI->isCFIInstruction()) {
- unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex();
- MCCFIInstruction CFI = CFIInstructions[CFIIndex];
+ if (MBBI->isCFIInstruction())
CFICount++;
- }
MBBI++;
}
@@ -9102,7 +9434,7 @@ void X86InstrInfo::buildOutlinedFrame(MachineBasicBlock &MBB,
// We're a normal call, so our sequence doesn't have a return instruction.
// Add it in.
- MachineInstr *retq = BuildMI(MF, DebugLoc(), get(X86::RETQ));
+ MachineInstr *retq = BuildMI(MF, DebugLoc(), get(X86::RET64));
MBB.insert(MBB.end(), retq);
}
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index c663bb32af37..33ce55bbdb2b 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -37,9 +37,6 @@ enum AsmComments {
/// the instruction operands should be swapped to match the condition code.
std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate);
-/// Return a setcc opcode based on whether it has a memory operand.
-unsigned getSETOpc(bool HasMemoryOperand = false);
-
/// Return a cmov opcode for the given register size in bytes, and operand type.
unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand = false);
@@ -68,6 +65,8 @@ unsigned getSwappedVPCOMImm(unsigned Imm);
/// Get the VCMP immediate if the opcodes are swapped.
unsigned getSwappedVCMPImm(unsigned Imm);
+/// Check if the instruction is an X87 instruction.
+bool isX87Instruction(MachineInstr &MI);
} // namespace X86
/// isGlobalStubReference - Return true if the specified TargetFlag operand is
@@ -76,6 +75,7 @@ inline static bool isGlobalStubReference(unsigned char TargetFlag) {
switch (TargetFlag) {
case X86II::MO_DLLIMPORT: // dllimport stub.
case X86II::MO_GOTPCREL: // rip-relative GOT reference.
+ case X86II::MO_GOTPCREL_NORELAX: // rip-relative GOT reference.
case X86II::MO_GOT: // normal GOT reference.
case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Normal $non_lazy_ptr ref.
case X86II::MO_DARWIN_NONLAZY: // Normal $non_lazy_ptr ref.
@@ -250,7 +250,7 @@ public:
bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
unsigned LEAOpcode, bool AllowSP, Register &NewSrc,
bool &isKill, MachineOperand &ImplicitOp,
- LiveVariables *LV) const;
+ LiveVariables *LV, LiveIntervals *LIS) const;
/// convertToThreeAddress - This method must be implemented by targets that
/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
@@ -262,9 +262,8 @@ public:
/// This method returns a null pointer if the transformation cannot be
/// performed, otherwise it returns the new instruction.
///
- MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
- MachineInstr &MI,
- LiveVariables *LV) const override;
+ MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
+ LiveIntervals *LIS) const override;
/// Returns true iff the routine could find two commutable operands in the
/// given machine instruction.
@@ -510,14 +509,14 @@ public:
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
- Register &SrcReg2, int &CmpMask,
- int &CmpValue) const override;
+ Register &SrcReg2, int64_t &CmpMask,
+ int64_t &CmpValue) const override;
/// optimizeCompareInstr - Check if there exists an earlier instruction that
/// operates on the same source operands and sets flags in the same way as
/// Compare; remove Compare if possible.
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
- Register SrcReg2, int CmpMask, int CmpValue,
+ Register SrcReg2, int64_t CmpMask, int64_t CmpValue,
const MachineRegisterInfo *MRI) const override;
/// optimizeLoadInstr - Try to remove the load by folding it to a register
@@ -591,10 +590,9 @@ private:
/// This is a helper for convertToThreeAddress for 8 and 16-bit instructions.
/// We use 32-bit LEA to form 3-address code by promoting to a 32-bit
/// super-register and then truncating back down to a 8/16-bit sub-register.
- MachineInstr *convertToThreeAddressWithLEA(unsigned MIOpc,
- MachineFunction::iterator &MFI,
- MachineInstr &MI,
+ MachineInstr *convertToThreeAddressWithLEA(unsigned MIOpc, MachineInstr &MI,
LiveVariables *LV,
+ LiveIntervals *LIS,
bool Is8BitOp) const;
/// Handles memory folding for special case instructions, for instance those
@@ -631,6 +629,22 @@ private:
unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2,
bool IsIntrinsic = false) const;
+
+ /// Returns true when instruction \p FlagI produces the same flags as \p OI.
+ /// The caller should pass in the results of calling analyzeCompare on \p OI:
+ /// \p SrcReg, \p SrcReg2, \p ImmMask, \p ImmValue.
+ /// If the flags match \p OI as if it had the input operands swapped then the
+ /// function succeeds and sets \p IsSwapped to true.
+ ///
+ /// Examples of OI, FlagI pairs returning true:
+ /// CMP %1, 42 and CMP %1, 42
+ /// CMP %1, %2 and %3 = SUB %1, %2
+ /// TEST %1, %1 and %2 = SUB %1, 0
+ /// CMP %1, %2 and %3 = SUB %2, %1 ; IsSwapped=true
+ bool isRedundantFlagInstr(const MachineInstr &FlagI, Register SrcReg,
+ Register SrcReg2, int64_t ImmMask, int64_t ImmValue,
+ const MachineInstr &OI, bool *IsSwapped,
+ int64_t *ImmDelta) const;
};
} // namespace llvm
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index 34afedb5bad2..fee9939b8dfc 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -91,8 +91,7 @@ def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
def SDT_X86NtBrind : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>,
- SDTCisVT<1, iPTR>,
- SDTCisVT<2, iPTR>]>;
+ SDTCisPtrTy<1>]>;
def SDT_X86VAARG : SDTypeProfile<1, -1, [SDTCisPtrTy<0>,
SDTCisPtrTy<1>,
@@ -112,7 +111,7 @@ def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
-def SDT_X86WIN_ALLOCA : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
+def SDT_X86DYN_ALLOCA : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
@@ -184,7 +183,7 @@ def X86iret : SDNode<"X86ISD::IRET", SDTX86Ret,
def X86vastart_save_xmm_regs :
SDNode<"X86ISD::VASTART_SAVE_XMM_REGS",
SDT_X86VASTART_SAVE_XMM_REGS,
- [SDNPHasChain, SDNPVariadic]>;
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPVariadic]>;
def X86vaarg64 :
SDNode<"X86ISD::VAARG_64", SDT_X86VAARG,
[SDNPHasChain, SDNPMayLoad, SDNPMayStore,
@@ -294,7 +293,7 @@ def X86pext : SDNode<"X86ISD::PEXT", SDTIntBinOp>;
def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
-def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
+def X86DynAlloca : SDNode<"X86ISD::DYN_ALLOCA", SDT_X86DYN_ALLOCA,
[SDNPHasChain, SDNPOutGlue]>;
def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
@@ -421,6 +420,7 @@ def i64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
def i128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>;
def i256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>;
def i512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>;
+def f16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>;
def f32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
def f64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
def f80mem : X86MemOperand<"printtbytemem", X86Mem80AsmOperand>;
@@ -919,6 +919,7 @@ def PKU : Predicate<"Subtarget->hasPKU()">;
def HasVNNI : Predicate<"Subtarget->hasVNNI()">;
def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">;
def HasBF16 : Predicate<"Subtarget->hasBF16()">;
+def HasFP16 : Predicate<"Subtarget->hasFP16()">;
def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">;
def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">;
@@ -993,6 +994,7 @@ def HasAMXTILE : Predicate<"Subtarget->hasAMXTILE()">;
def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">;
def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">;
def HasUINTR : Predicate<"Subtarget->hasUINTR()">;
+def HasCRC32 : Predicate<"Subtarget->hasCRC32()">;
def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
AssemblerPredicate<(all_of (not Mode64Bit)), "Not 64-bit mode">;
def In64BitMode : Predicate<"Subtarget->is64Bit()">,
@@ -1193,6 +1195,7 @@ def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
}]>;
def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
+def loadf16 : PatFrag<(ops node:$ptr), (f16 (load node:$ptr))>;
def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>;
@@ -3155,9 +3158,6 @@ include "X86InstrAVX512.td"
include "X86InstrMMX.td"
include "X86Instr3DNow.td"
-// MPX instructions
-include "X86InstrMPX.td"
-
include "X86InstrVMX.td"
include "X86InstrSVM.td"
include "X86InstrSNP.td"
diff --git a/llvm/lib/Target/X86/X86InstrKL.td b/llvm/lib/Target/X86/X86InstrKL.td
index b91e563a15f3..a716aab4260b 100644
--- a/llvm/lib/Target/X86/X86InstrKL.td
+++ b/llvm/lib/Target/X86/X86InstrKL.td
@@ -1,10 +1,9 @@
//===---------------------------*-tablegen-*-------------------------------===//
//===------------- X86InstrKL.td - KL Instruction Set Extension -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/llvm/lib/Target/X86/X86InstrMPX.td b/llvm/lib/Target/X86/X86InstrMPX.td
deleted file mode 100644
index 44ba071947c2..000000000000
--- a/llvm/lib/Target/X86/X86InstrMPX.td
+++ /dev/null
@@ -1,77 +0,0 @@
-//===-- X86InstrMPX.td - MPX Instruction Set ---------*- tablegen -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the X86 MPX instruction set, defining the
-// instructions, and properties of the instructions which are needed for code
-// generation, machine code emission, and analysis.
-//
-//===----------------------------------------------------------------------===//
-
-// FIXME: Investigate a better scheduler class if MPX is ever used inside LLVM.
-let SchedRW = [WriteSystem] in {
-
-multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> {
- def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
- OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
- Requires<[Not64BitMode]>;
- def 64rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
- OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
- Requires<[In64BitMode]>;
-}
-
-defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS;
-
-multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> {
- def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, anymem:$src2),
- OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
- Requires<[Not64BitMode]>;
- def 64rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, anymem:$src2),
- OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
- Requires<[In64BitMode]>;
-
- def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2),
- OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
- Requires<[Not64BitMode]>;
- def 64rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2),
- OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
- Requires<[In64BitMode]>;
-}
-defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS, NotMemoryFoldable;
-defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD, NotMemoryFoldable;
-defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD, NotMemoryFoldable;
-
-def BNDMOVrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src),
- "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
- NotMemoryFoldable;
-let mayLoad = 1 in {
-def BNDMOV32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
- "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
- Requires<[Not64BitMode]>, NotMemoryFoldable;
-def BNDMOV64rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src),
- "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
- Requires<[In64BitMode]>, NotMemoryFoldable;
-}
-let isCodeGenOnly = 1, ForceDisassemble = 1 in
-def BNDMOVrr_REV : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src),
- "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
- NotMemoryFoldable;
-let mayStore = 1 in {
-def BNDMOV32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
- "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
- Requires<[Not64BitMode]>, NotMemoryFoldable;
-def BNDMOV64mr : I<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src),
- "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
- Requires<[In64BitMode]>, NotMemoryFoldable;
-
-def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins anymem:$dst, BNDR:$src),
- "bndstx\t{$src, $dst|$dst, $src}", []>, PS;
-}
-let mayLoad = 1 in
-def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
- "bndldx\t{$src, $dst|$dst, $src}", []>, PS;
-} // SchedRW
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 41fda603d5a9..035f139e6f33 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -40,7 +40,7 @@ let isCodeGenOnly = 1 in {
}
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
-multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
+multiclass sse12_fp_scalar_int<bits<8> opc,
SDPatternOperator OpNode, RegisterClass RC,
ValueType VT, string asm, Operand memopr,
PatFrags mem_frags, Domain d,
@@ -187,8 +187,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
// don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//
-multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
- X86MemOperand x86memop, string base_opc,
+multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc,
string asm_opr, Domain d, string Name> {
let isCommutable = 1 in
def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
@@ -210,7 +209,7 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
Domain d, string Name, Predicate pred> {
// AVX
let Predicates = [UseAVX, OptForSize] in
- defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
+ defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
"V"#Name>,
VEX_4V, VEX_LIG, VEX_WIG;
@@ -222,7 +221,7 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
// SSE1 & 2
let Constraints = "$src1 = $dst" in {
let Predicates = [pred, NoSSE41_Or_OptForSize] in
- defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
+ defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
"\t{$src2, $dst|$dst, $src2}", d, Name>;
}
@@ -1747,20 +1746,20 @@ let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// XMM only
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>,
+ [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86any_vfpround (loadv2f64 addr:$src)))]>,
+ [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv2f64 addr:$src))))]>,
VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86any_vfpround VR256:$src))]>,
+ [(set VR128:$dst, (v4f32 (X86any_vfpround (v4f64 VR256:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86any_vfpround (loadv4f64 addr:$src)))]>,
+ [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv4f64 addr:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
} // Predicates = [HasAVX, NoVLX]
@@ -1771,11 +1770,11 @@ def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>,
+ [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
Sched<[WriteCvtPD2PS]>, SIMD_EXC;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86any_vfpround (memopv2f64 addr:$src)))]>,
+ [(set VR128:$dst, (v4f32 (X86any_vfpround (memopv2f64 addr:$src))))]>,
Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;
//===----------------------------------------------------------------------===//
@@ -2266,7 +2265,7 @@ defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
/// There are no patterns here because isel prefers integer versions for SSE2
/// and later. There are SSE1 v4f32 patterns later.
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
- SDNode OpNode, X86SchedWriteWidths sched> {
+ X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX] in {
defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
@@ -2296,11 +2295,11 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
}
}
-defm AND : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>;
-defm OR : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>;
-defm XOR : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
+defm AND : sse12_fp_packed_logical<0x54, "and", SchedWriteFLogic>;
+defm OR : sse12_fp_packed_logical<0x56, "or", SchedWriteFLogic>;
+defm XOR : sse12_fp_packed_logical<0x57, "xor", SchedWriteFLogic>;
let isCommutable = 0 in
- defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;
+ defm ANDN : sse12_fp_packed_logical<0x55, "andn", SchedWriteFLogic>;
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
@@ -2643,18 +2642,18 @@ multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode,
X86SchedWriteSizes sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
- defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
+ defm V#NAME#SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
!strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
- defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
+ defm V#NAME#SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
!strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
let Constraints = "$src1 = $dst" in {
- defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
+ defm SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
!strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
SSEPackedSingle, sched.PS.Scl>, XS;
- defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
+ defm SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
!strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
SSEPackedDouble, sched.PD.Scl>, XD;
}
@@ -2790,8 +2789,8 @@ defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf
/// For the non-AVX defs, we need $src1 to be tied to $dst because
/// the HW instructions are 2 operand / destructive.
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
- ValueType ScalarVT, X86MemOperand x86memop,
- Operand intmemop, SDPatternOperator OpNode, Domain d,
+ X86MemOperand x86memop, Operand intmemop,
+ SDPatternOperator OpNode, Domain d,
X86FoldableSchedWrite sched, Predicate target> {
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
@@ -2818,9 +2817,8 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
}
-multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
- PatFrags mem_frags, Intrinsic Intr,
- Predicate target, string Suffix> {
+multiclass sse_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
+ Intrinsic Intr, Predicate target> {
let Predicates = [target] in {
// These are unary operations, but they are modeled as having 2 source operands
// because the high elements of the destination are unchanged in SSE.
@@ -2841,7 +2839,7 @@ multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
}
}
-multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, PatFrags mem_frags,
+multiclass avx_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
Intrinsic Intr, Predicate target> {
let Predicates = [target] in {
def : Pat<(Intr VR128:$src),
@@ -2972,12 +2970,11 @@ let Predicates = [HasAVX, NoVLX] in {
Sched<[sched.XMM.Folded]>;
}
-multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86SchedWriteWidths sched, Predicate AVXTarget> {
- defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
+multiclass sse1_fp_unop_s_intr<string OpcodeStr, Predicate AVXTarget> {
+ defm SS : sse_fp_unop_s_intr<v4f32, sse_load_f32,
!cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
- UseSSE1, "SS">, XS;
- defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
+ UseSSE1>, XS;
+ defm V#NAME#SS : avx_fp_unop_s_intr<v4f32, sse_load_f32,
!cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
AVXTarget>,
XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
@@ -2985,7 +2982,7 @@ multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
X86SchedWriteWidths sched, Predicate AVXTarget> {
- defm SS : sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32, f32mem,
+ defm SS : sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32mem,
ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32,
f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
@@ -2994,7 +2991,7 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNod
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
X86SchedWriteWidths sched, Predicate AVXTarget> {
- defm SD : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64, f64mem,
+ defm SD : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64mem,
sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64,
f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
@@ -3010,10 +3007,10 @@ defm SQRT : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>,
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
- sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
+ sse1_fp_unop_s_intr<"rsqrt", HasAVX>,
sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
- sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
+ sse1_fp_unop_s_intr<"rcp", HasAVX>,
sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
// There is no f64 version of the reciprocal approximation instructions.
@@ -6588,14 +6585,14 @@ let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
// of r and m.
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
RegisterClass RCIn, SDPatternOperator Int> :
- SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
+ CRC32I<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
!strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
[(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
Sched<[WriteCRC32]>;
class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
X86MemOperand x86memop, SDPatternOperator Int> :
- SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
+ CRC32I<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
!strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
[(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
@@ -7049,6 +7046,50 @@ def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
}
//===----------------------------------------------------------------------===//
+// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
+//
+
+let ExeDomain = SSEPackedSingle in {
+let isCommutable = 1 in
+def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, u8imm:$src3),
+ "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
+ VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
+def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
+ "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
+ VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
+}
+
+// Immediate transform to help with commuting.
+def Perm2XCommuteImm : SDNodeXForm<timm, [{
+ return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
+}]>;
+
+multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> {
+ def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
+ (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>;
+ def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))),
+ (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>;
+ // Pattern with load in other operand.
+ def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))),
+ (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
+ (Perm2XCommuteImm timm:$imm))>;
+}
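The Perm2XCommuteImm transform in the block above relies on the vperm2f128/vperm2i128 immediate layout (as I read the encoding: bits 1:0 and 5:4 select the source 128-bit lane for each destination half, bits 3 and 7 zero a half), so XOR with 0x22 flips exactly the source-select bit of each field when the two sources are commuted. A small standalone model of that encoding (helper names are mine, illustrative only):

#include <cassert>

// One result half of VPERM2F128: 'Sel' is the 2-bit lane selector,
// 'Zero' the corresponding zeroing bit; 128-bit halves are modeled as ints.
static int Half(unsigned Sel, bool Zero, int ALo, int AHi, int BLo, int BHi) {
  if (Zero)
    return 0;
  switch (Sel & 3) {
  case 0: return ALo;
  case 1: return AHi;
  case 2: return BLo;
  default: return BHi;
  }
}

int main() {
  const int ALo = 1, AHi = 2, BLo = 3, BHi = 4;
  for (unsigned Imm = 0; Imm < 256; ++Imm) {
    unsigned Commuted = Imm ^ 0x22; // flip the source-select bit of each field
    // Low destination half: selector Imm[1:0], zero bit Imm[3].
    assert(Half(Imm & 3, Imm & 0x8, ALo, AHi, BLo, BHi) ==
           Half(Commuted & 3, Commuted & 0x8, BLo, BHi, ALo, AHi));
    // High destination half: selector Imm[5:4], zero bit Imm[7].
    assert(Half((Imm >> 4) & 3, Imm & 0x80, ALo, AHi, BLo, BHi) ==
           Half((Commuted >> 4) & 3, Commuted & 0x80, BLo, BHi, ALo, AHi));
  }
  return 0;
}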
+
+let Predicates = [HasAVX] in {
+ defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
+ defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
+}
+
+let Predicates = [HasAVX1Only] in {
+ defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>;
+ defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>;
+ defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
+ defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>;
+}
+
+//===----------------------------------------------------------------------===//
// VINSERTF128 - Insert packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
@@ -7070,29 +7111,37 @@ let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
}
-multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
- PatFrag memop_frag> {
+multiclass vinsert_lowering<string InstrStr, string PermStr,
+ ValueType From, ValueType To,
+ PatFrag frommemop_frag, PatFrag tomemop_frag> {
def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
(iPTR imm)),
(!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
(INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
- (From (memop_frag addr:$src2)),
+ (From (frommemop_frag addr:$src2)),
(iPTR imm)),
(!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
(INSERT_get_vinsert128_imm VR256:$ins))>;
+ // Folding "To" vector - convert to perm2x128 and commute inputs.
+ def : Pat<(vinsert128_insert:$ins (To (tomemop_frag addr:$src1)),
+ (From VR128:$src2),
+ (iPTR imm)),
+ (!cast<Instruction>(PermStr#rm)
+ (INSERT_SUBREG (To (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
+ addr:$src1, (INSERT_get_vperm2x128_commutedimm VR256:$ins))>;
}
let Predicates = [HasAVX, NoVLX] in {
- defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
- defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
+ defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4f32, v8f32, loadv4f32, loadv8f32>;
+ defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2f64, v4f64, loadv2f64, loadv4f64>;
}
let Predicates = [HasAVX1Only] in {
- defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>;
- defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv4i32>;
- defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>;
- defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv16i8>;
+ defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2i64, v4i64, loadv2i64, loadv4i64>;
+ defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4i32, v8i32, loadv4i32, loadv8i32>;
+ defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8i16, v16i16, loadv8i16, loadv16i16>;
+ defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8, loadv16i8, loadv32i8>;
}
//===----------------------------------------------------------------------===//
@@ -7297,50 +7346,6 @@ let ExeDomain = SSEPackedDouble in {
}
//===----------------------------------------------------------------------===//
-// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
-//
-
-let ExeDomain = SSEPackedSingle in {
-let isCommutable = 1 in
-def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, u8imm:$src3),
- "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
- VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
-def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
- "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
- VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
-}
-
-// Immediate transform to help with commuting.
-def Perm2XCommuteImm : SDNodeXForm<timm, [{
- return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
-}]>;
-
-multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> {
- def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
- (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>;
- def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))),
- (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>;
- // Pattern with load in other operand.
- def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))),
- (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
- (Perm2XCommuteImm timm:$imm))>;
-}
-
-let Predicates = [HasAVX] in {
- defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
- defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
-}
-
-let Predicates = [HasAVX1Only] in {
- defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>;
- defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>;
- defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
- defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>;
-}
-
-//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
// Note: These instructions do not affect YMM16-YMM31.
//
@@ -7625,10 +7630,18 @@ let Predicates = [HasAVX1Only] in {
(VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
(v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
(v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
+ def : Pat<(v8f32 (X86VBroadcast v4f32:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+ (v4f32 (VPERMILPSri VR128:$src, 0)), sub_xmm),
+ (v4f32 (VPERMILPSri VR128:$src, 0)), 1)>;
def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
(VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
(v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
(v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
+ def : Pat<(v4f64 (X86VBroadcast v2f64:$src)),
+ (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+ (v2f64 (VMOVDDUPrr VR128:$src)), sub_xmm),
+ (v2f64 (VMOVDDUPrr VR128:$src)), 1)>;
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
(VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
@@ -7741,10 +7754,10 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
}
let Predicates = [HasAVX2, NoVLX] in {
- defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>;
- defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv4i32>;
- defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>;
- defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv16i8>;
+ defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64, loadv2i64, loadv4i64>;
+ defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32, loadv4i32, loadv8i32>;
+ defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16, loadv16i16>;
+ defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>;
}
//===----------------------------------------------------------------------===//
@@ -7889,10 +7902,8 @@ let Predicates = [HasAVX2, NoVLX] in {
// VGATHER - GATHER Operations
// FIXME: Improve scheduling of gather instructions.
-multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
- ValueType VTy, RegisterClass RC256,
- X86MemOperand memop128, X86MemOperand memop256,
- ValueType MTx = VTx, ValueType MTy = VTy> {
+multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
+ X86MemOperand memop128, X86MemOperand memop256> {
let mayLoad = 1, hasSideEffects = 0 in {
def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
(ins VR128:$src1, memop128:$src2, VR128:$mask),
@@ -7911,27 +7922,27 @@ let Predicates = [HasAVX2] in {
let mayLoad = 1, hasSideEffects = 0, Constraints
= "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
in {
- defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64,
- VR256, vx128mem, vx256mem>, VEX_W;
- defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64,
- VR256, vx128mem, vy256mem>, VEX_W;
- defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32,
- VR256, vx128mem, vy256mem>;
- defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32,
- VR128, vx64mem, vy128mem>;
+ defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq",
+ VR256, vx128mem, vx256mem>, VEX_W;
+ defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq",
+ VR256, vx128mem, vy256mem>, VEX_W;
+ defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd",
+ VR256, vx128mem, vy256mem>;
+ defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd",
+ VR128, vx64mem, vy128mem>;
let ExeDomain = SSEPackedDouble in {
- defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64,
- VR256, vx128mem, vx256mem, v2i64, v4i64>, VEX_W;
- defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64,
- VR256, vx128mem, vy256mem, v2i64, v4i64>, VEX_W;
+ defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd",
+ VR256, vx128mem, vx256mem>, VEX_W;
+ defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd",
+ VR256, vx128mem, vy256mem>, VEX_W;
}
let ExeDomain = SSEPackedSingle in {
- defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32,
- VR256, vx128mem, vy256mem, v4i32, v8i32>;
- defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32,
- VR128, vx64mem, vy128mem, v4i32, v4i32>;
+ defm VGATHERDPS : avx2_gather<0x92, "vgatherdps",
+ VR256, vx128mem, vy256mem>;
+ defm VGATHERQPS : avx2_gather<0x93, "vgatherqps",
+ VR128, vx64mem, vy128mem>;
}
}
}
diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td
index 48c27051a872..b4dd99d08a62 100644
--- a/llvm/lib/Target/X86/X86InstrSystem.td
+++ b/llvm/lib/Target/X86/X86InstrSystem.td
@@ -529,16 +529,17 @@ let SchedRW = [WriteSystem] in {
//===----------------------------------------------------------------------===//
// XSAVE instructions
let SchedRW = [WriteSystem] in {
-let Predicates = [HasXSAVE] in {
+// NOTE: No HasXSAVE predicate, so that these can be used with _xgetbv/_xsetbv
+// on Windows without needing to enable the xsave feature, for compatibility
+// with MSVC.
let Defs = [EDX, EAX], Uses = [ECX] in
- def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, PS;
+def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, PS;
let Uses = [EDX, EAX, ECX] in
- def XSETBV : I<0x01, MRM_D1, (outs), (ins),
- "xsetbv",
- [(int_x86_xsetbv ECX, EDX, EAX)]>, PS;
+def XSETBV : I<0x01, MRM_D1, (outs), (ins),
+ "xsetbv",
+ [(int_x86_xsetbv ECX, EDX, EAX)]>, PS;
-} // HasXSAVE
let Uses = [EDX, EAX] in {
def XSAVE : I<0xAE, MRM4m, (outs), (ins opaquemem:$dst),
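For context on the note above: the point is to let MSVC-compatible intrinsic callers compile without an explicit xsave feature flag. A minimal hedged sketch of such a caller (assuming the _xgetbv intrinsic declared in <immintrin.h>/<intrin.h>; not part of the patch):

#include <immintrin.h> // MSVC also declares _xgetbv in <intrin.h>
#include <cstdio>

int main() {
  // Read XCR0 (register 0, a.k.a. _XCR_XFEATURE_ENABLED_MASK) without the
  // translation unit being compiled with the xsave feature enabled.
  unsigned long long XCR0 = _xgetbv(0);
  std::printf("XCR0 = 0x%llx\n", XCR0);
  return 0;
}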
@@ -583,7 +584,7 @@ def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaquemem:$dst),
//===----------------------------------------------------------------------===//
// VIA PadLock crypto instructions
let Defs = [RAX, RDI], Uses = [RDX, RDI], SchedRW = [WriteSystem] in
- def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB, REP;
+ def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB;
def : InstAlias<"xstorerng", (XSTORE)>;
diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td
index e98843bd3ae3..2429aa113fb1 100644
--- a/llvm/lib/Target/X86/X86InstrVecCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td
@@ -25,6 +25,8 @@ let Predicates = [NoAVX512] in {
let Predicates = [HasAVX512] in {
// A vector extract of the first f32/f64 position is a subregister copy
+ def : Pat<(f16 (extractelt (v8f16 VR128X:$src), (iPTR 0))),
+ (COPY_TO_REGCLASS (v8f16 VR128X:$src), FR16X)>;
def : Pat<(f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
(COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X)>;
def : Pat<(f64 (extractelt (v2f64 VR128X:$src), (iPTR 0))),
@@ -32,6 +34,8 @@ let Predicates = [HasAVX512] in {
}
let Predicates = [NoVLX] in {
+ def : Pat<(v8f16 (scalar_to_vector FR16X:$src)),
+ (COPY_TO_REGCLASS FR16X:$src, VR128)>;
// Implicitly promote a 32-bit scalar to a vector.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
(COPY_TO_REGCLASS FR32:$src, VR128)>;
@@ -41,6 +45,8 @@ let Predicates = [NoVLX] in {
}
let Predicates = [HasVLX] in {
+ def : Pat<(v8f16 (scalar_to_vector FR16X:$src)),
+ (COPY_TO_REGCLASS FR16X:$src, VR128X)>;
// Implicitly promote a 32-bit scalar to a vector.
def : Pat<(v4f32 (scalar_to_vector FR32X:$src)),
(COPY_TO_REGCLASS FR32X:$src, VR128X)>;
@@ -74,6 +80,7 @@ defm : subvector_subreg_lowering<VR128, v2i64, VR256, v4i64, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v2f64, VR256, v4f64, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v8i16, VR256, v16i16, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v16i8, VR256, v32i8, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v8f16, VR256, v16f16, sub_xmm>;
// A 128-bit subvector extract from the first 512-bit vector position is a
// subregister copy that needs no instruction. Likewise, a 128-bit subvector
@@ -85,6 +92,7 @@ defm : subvector_subreg_lowering<VR128, v2i64, VR512, v8i64, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v2f64, VR512, v8f64, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v8i16, VR512, v32i16, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v16i8, VR512, v64i8, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v8f16, VR512, v32f16, sub_xmm>;
// A 128-bit subvector extract from the first 512-bit vector position is a
// subregister copy that needs no instruction. Likewise, a 128-bit subvector
@@ -96,6 +104,7 @@ defm : subvector_subreg_lowering<VR256, v4i64, VR512, v8i64, sub_ymm>;
defm : subvector_subreg_lowering<VR256, v4f64, VR512, v8f64, sub_ymm>;
defm : subvector_subreg_lowering<VR256, v16i16, VR512, v32i16, sub_ymm>;
defm : subvector_subreg_lowering<VR256, v32i8, VR512, v64i8, sub_ymm>;
+defm : subvector_subreg_lowering<VR256, v16f16, VR512, v32f16, sub_ymm>;
// If we're inserting into an all zeros vector, just use a plain move which
@@ -103,8 +112,7 @@ defm : subvector_subreg_lowering<VR256, v32i8, VR512, v64i8, sub_ymm>;
// any moves that we can prove are unnecessary.
multiclass subvec_zero_lowering<string MoveStr,
RegisterClass RC, ValueType DstTy,
- ValueType SrcTy, ValueType ZeroTy,
- SubRegIndex SubIdx> {
+ ValueType SrcTy, SubRegIndex SubIdx> {
def : Pat<(DstTy (insert_subvector immAllZerosV,
(SrcTy RC:$src), (iPTR 0))),
(SUBREG_TO_REG (i64 0),
@@ -112,51 +120,57 @@ multiclass subvec_zero_lowering<string MoveStr,
}
let Predicates = [HasAVX, NoVLX] in {
- defm : subvec_zero_lowering<"APD", VR128, v4f64, v2f64, v8i32, sub_xmm>;
- defm : subvec_zero_lowering<"APS", VR128, v8f32, v4f32, v8i32, sub_xmm>;
- defm : subvec_zero_lowering<"DQA", VR128, v4i64, v2i64, v8i32, sub_xmm>;
- defm : subvec_zero_lowering<"DQA", VR128, v8i32, v4i32, v8i32, sub_xmm>;
- defm : subvec_zero_lowering<"DQA", VR128, v16i16, v8i16, v8i32, sub_xmm>;
- defm : subvec_zero_lowering<"DQA", VR128, v32i8, v16i8, v8i32, sub_xmm>;
+ defm : subvec_zero_lowering<"APD", VR128, v4f64, v2f64, sub_xmm>;
+ defm : subvec_zero_lowering<"APS", VR128, v8f32, v4f32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v4i64, v2i64, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v8i32, v4i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v16i16, v8i16, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v32i8, v16i8, sub_xmm>;
}
let Predicates = [HasVLX] in {
- defm : subvec_zero_lowering<"APDZ128", VR128X, v4f64, v2f64, v8i32, sub_xmm>;
- defm : subvec_zero_lowering<"APSZ128", VR128X, v8f32, v4f32, v8i32, sub_xmm>;
- defm : subvec_zero_lowering<"DQA64Z128", VR128X, v4i64, v2i64, v8i32, sub_xmm>;
- defm : subvec_zero_lowering<"DQA64Z128", VR128X, v8i32, v4i32, v8i32, sub_xmm>;
- defm : subvec_zero_lowering<"DQA64Z128", VR128X, v16i16, v8i16, v8i32, sub_xmm>;
- defm : subvec_zero_lowering<"DQA64Z128", VR128X, v32i8, v16i8, v8i32, sub_xmm>;
-
- defm : subvec_zero_lowering<"APDZ128", VR128X, v8f64, v2f64, v16i32, sub_xmm>;
- defm : subvec_zero_lowering<"APSZ128", VR128X, v16f32, v4f32, v16i32, sub_xmm>;
- defm : subvec_zero_lowering<"DQA64Z128", VR128X, v8i64, v2i64, v16i32, sub_xmm>;
- defm : subvec_zero_lowering<"DQA64Z128", VR128X, v16i32, v4i32, v16i32, sub_xmm>;
- defm : subvec_zero_lowering<"DQA64Z128", VR128X, v32i16, v8i16, v16i32, sub_xmm>;
- defm : subvec_zero_lowering<"DQA64Z128", VR128X, v64i8, v16i8, v16i32, sub_xmm>;
-
- defm : subvec_zero_lowering<"APDZ256", VR256X, v8f64, v4f64, v16i32, sub_ymm>;
- defm : subvec_zero_lowering<"APSZ256", VR256X, v16f32, v8f32, v16i32, sub_ymm>;
- defm : subvec_zero_lowering<"DQA64Z256", VR256X, v8i64, v4i64, v16i32, sub_ymm>;
- defm : subvec_zero_lowering<"DQA64Z256", VR256X, v16i32, v8i32, v16i32, sub_ymm>;
- defm : subvec_zero_lowering<"DQA64Z256", VR256X, v32i16, v16i16, v16i32, sub_ymm>;
- defm : subvec_zero_lowering<"DQA64Z256", VR256X, v64i8, v32i8, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"APDZ128", VR128X, v4f64, v2f64, sub_xmm>;
+ defm : subvec_zero_lowering<"APSZ128", VR128X, v8f32, v4f32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v4i64, v2i64, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v8i32, v4i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v16i16, v8i16, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v32i8, v16i8, sub_xmm>;
+
+ defm : subvec_zero_lowering<"APDZ128", VR128X, v8f64, v2f64, sub_xmm>;
+ defm : subvec_zero_lowering<"APSZ128", VR128X, v16f32, v4f32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v8i64, v2i64, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v16i32, v4i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v32i16, v8i16, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA64Z128", VR128X, v64i8, v16i8, sub_xmm>;
+
+ defm : subvec_zero_lowering<"APDZ256", VR256X, v8f64, v4f64, sub_ymm>;
+ defm : subvec_zero_lowering<"APSZ256", VR256X, v16f32, v8f32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQA64Z256", VR256X, v8i64, v4i64, sub_ymm>;
+ defm : subvec_zero_lowering<"DQA64Z256", VR256X, v16i32, v8i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQA64Z256", VR256X, v32i16, v16i16, sub_ymm>;
+ defm : subvec_zero_lowering<"DQA64Z256", VR256X, v64i8, v32i8, sub_ymm>;
}
let Predicates = [HasAVX512, NoVLX] in {
- defm : subvec_zero_lowering<"APD", VR128, v8f64, v2f64, v16i32, sub_xmm>;
- defm : subvec_zero_lowering<"APS", VR128, v16f32, v4f32, v16i32, sub_xmm>;
- defm : subvec_zero_lowering<"DQA", VR128, v8i64, v2i64, v16i32, sub_xmm>;
- defm : subvec_zero_lowering<"DQA", VR128, v16i32, v4i32, v16i32, sub_xmm>;
- defm : subvec_zero_lowering<"DQA", VR128, v32i16, v8i16, v16i32, sub_xmm>;
- defm : subvec_zero_lowering<"DQA", VR128, v64i8, v16i8, v16i32, sub_xmm>;
-
- defm : subvec_zero_lowering<"APDY", VR256, v8f64, v4f64, v16i32, sub_ymm>;
- defm : subvec_zero_lowering<"APSY", VR256, v16f32, v8f32, v16i32, sub_ymm>;
- defm : subvec_zero_lowering<"DQAY", VR256, v8i64, v4i64, v16i32, sub_ymm>;
- defm : subvec_zero_lowering<"DQAY", VR256, v16i32, v8i32, v16i32, sub_ymm>;
- defm : subvec_zero_lowering<"DQAY", VR256, v32i16, v16i16, v16i32, sub_ymm>;
- defm : subvec_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32, sub_ymm>;
+ defm : subvec_zero_lowering<"APD", VR128, v8f64, v2f64, sub_xmm>;
+ defm : subvec_zero_lowering<"APS", VR128, v16f32, v4f32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v8i64, v2i64, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v16i32, v4i32, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v32i16, v8i16, sub_xmm>;
+ defm : subvec_zero_lowering<"DQA", VR128, v64i8, v16i8, sub_xmm>;
+
+ defm : subvec_zero_lowering<"APDY", VR256, v8f64, v4f64, sub_ymm>;
+ defm : subvec_zero_lowering<"APSY", VR256, v16f32, v8f32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQAY", VR256, v8i64, v4i64, sub_ymm>;
+ defm : subvec_zero_lowering<"DQAY", VR256, v16i32, v8i32, sub_ymm>;
+ defm : subvec_zero_lowering<"DQAY", VR256, v32i16, v16i16, sub_ymm>;
+ defm : subvec_zero_lowering<"DQAY", VR256, v64i8, v32i8, sub_ymm>;
+}
+
+let Predicates = [HasFP16, HasVLX] in {
+ defm : subvec_zero_lowering<"APSZ128", VR128X, v16f16, v8f16, sub_xmm>;
+ defm : subvec_zero_lowering<"APSZ128", VR128X, v32f16, v8f16, sub_xmm>;
+ defm : subvec_zero_lowering<"APSZ256", VR256X, v32f16, v16f16, sub_ymm>;
}
class maskzeroupper<ValueType vt, RegisterClass RC> :
diff --git a/llvm/lib/Target/X86/X86InstructionSelector.cpp b/llvm/lib/Target/X86/X86InstructionSelector.cpp
index ff531713037c..8abbaa92c8cf 100644
--- a/llvm/lib/Target/X86/X86InstructionSelector.cpp
+++ b/llvm/lib/Target/X86/X86InstructionSelector.cpp
@@ -479,7 +479,7 @@ static void X86SelectAddress(const MachineInstr &I,
"unsupported type.");
if (I.getOpcode() == TargetOpcode::G_PTR_ADD) {
- if (auto COff = getConstantVRegSExtVal(I.getOperand(2).getReg(), MRI)) {
+ if (auto COff = getIConstantVRegSExtVal(I.getOperand(2).getReg(), MRI)) {
int64_t Imm = *COff;
if (isInt<32>(Imm)) { // Check for displacement overflow.
AM.Disp = static_cast<int32_t>(Imm);
@@ -1065,7 +1065,7 @@ bool X86InstructionSelector::selectUadde(MachineInstr &I,
return false;
Opcode = X86::ADC32rr;
- } else if (auto val = getConstantVRegVal(CarryInReg, MRI)) {
+ } else if (auto val = getIConstantVRegVal(CarryInReg, MRI)) {
// carry is constant, support only 0.
if (*val != 0)
return false;
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index de2500b8e1bd..1edec96bbec3 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -24,6 +24,7 @@ enum IntrinsicType : uint16_t {
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS,
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP_IMM8,
INTR_TYPE_3OP_IMM8,
+ CFMA_OP_MASK, CFMA_OP_MASKZ,
CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV, BEXTRI,
CVTPD2PS_MASK,
INTR_TYPE_1OP_SAE, INTR_TYPE_2OP_SAE,
@@ -987,6 +988,236 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_256, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0),
X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_512, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0),
X86_INTRINSIC_DATA(avx512bf16_mask_cvtneps2bf16_128, CVTNEPS2BF16_MASK, X86ISD::CVTNEPS2BF16, X86ISD::MCVTNEPS2BF16),
+ X86_INTRINSIC_DATA(avx512fp16_add_ph_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
+ X86_INTRINSIC_DATA(avx512fp16_div_ph_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND),
+ X86_INTRINSIC_DATA(avx512fp16_fpclass_ph_128, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512fp16_fpclass_ph_256, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512fp16_fpclass_ph_512, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_add_sh_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FADDS, X86ISD::FADDS_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_cmp_ph_128, CMP_MASK_CC, X86ISD::CMPMM, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_cmp_ph_256, CMP_MASK_CC, X86ISD::CMPMM, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_cmp_ph_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_cmp_sh, CMP_MASK_SCALAR_CC,
+ X86ISD::FSETCCM, X86ISD::FSETCCM_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_div_sh_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FDIVS, X86ISD::FDIVS_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_fpclass_sh, FPCLASSS, X86ISD::VFPCLASSS, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_getexp_ph_128, INTR_TYPE_1OP_MASK, X86ISD::FGETEXP, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_getexp_ph_256, INTR_TYPE_1OP_MASK, X86ISD::FGETEXP, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_getexp_ph_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::FGETEXP, X86ISD::FGETEXP_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_getexp_sh, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FGETEXPS, X86ISD::FGETEXPS_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_getmant_ph_128, INTR_TYPE_2OP_MASK_SAE,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_getmant_ph_256, INTR_TYPE_2OP_MASK_SAE,
+ X86ISD::VGETMANT, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_getmant_ph_512, INTR_TYPE_2OP_MASK_SAE,
+ X86ISD::VGETMANT, X86ISD::VGETMANT_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_getmant_sh, INTR_TYPE_3OP_SCALAR_MASK_SAE,
+ X86ISD::VGETMANTS, X86ISD::VGETMANTS_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_max_sh_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FMAXS, X86ISD::FMAXS_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_min_sh_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::FMINS, X86ISD::FMINS_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_mul_sh_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FMULS, X86ISD::FMULS_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rcp_ph_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rcp_ph_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rcp_ph_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rcp_sh, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_reduce_ph_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_reduce_ph_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_reduce_ph_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, X86ISD::VREDUCE_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_reduce_sh, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rndscale_ph_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rndscale_ph_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rndscale_ph_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rndscale_sh, INTR_TYPE_SCALAR_MASK,
+ X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rsqrt_ph_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rsqrt_ph_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rsqrt_ph_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_rsqrt_sh, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_scalef_ph_128, INTR_TYPE_2OP_MASK, X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_scalef_ph_256, INTR_TYPE_2OP_MASK, X86ISD::SCALEF, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_scalef_ph_512, INTR_TYPE_2OP_MASK,
+ X86ISD::SCALEF, X86ISD::SCALEF_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_scalef_sh, INTR_TYPE_SCALAR_MASK,
+ X86ISD::SCALEFS, X86ISD::SCALEFS_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_sqrt_sh, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FSQRTS, X86ISD::FSQRTS_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_sub_sh_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FSUBS, X86ISD::FSUBS_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtdq2ph_128, TRUNCATE_TO_REG,
+ X86ISD::CVTSI2P, X86ISD::MCVTSI2P),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtpd2ph_128, TRUNCATE_TO_REG,
+ X86ISD::VFPROUND, X86ISD::VMFPROUND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtpd2ph_256, TRUNCATE_TO_REG,
+ X86ISD::VFPROUND, X86ISD::VMFPROUND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtpd2ph_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VFPROUND, X86ISD::VFPROUND_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2dq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2dq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2dq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2pd_128, INTR_TYPE_1OP_MASK, X86ISD::VFPEXT, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2pd_256, INTR_TYPE_1OP_MASK, X86ISD::VFPEXT, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2pd_512, INTR_TYPE_1OP_MASK_SAE,
+ ISD::FP_EXTEND, X86ISD::VFPEXT_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2psx_128, INTR_TYPE_1OP_MASK, X86ISD::VFPEXT, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2psx_256, INTR_TYPE_1OP_MASK, ISD::FP_EXTEND, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2psx_512, INTR_TYPE_1OP_MASK_SAE,
+ ISD::FP_EXTEND, X86ISD::VFPEXT_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2qq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2qq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2qq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2udq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2udq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2udq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2uqq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2uqq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2uqq_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2uw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2uw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2uw_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2w_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2w_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2w_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtps2phx_128, TRUNCATE_TO_REG,
+ X86ISD::VFPROUND, X86ISD::VMFPROUND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtps2phx_256, INTR_TYPE_1OP_MASK, X86ISD::VFPROUND, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtps2phx_512, INTR_TYPE_1OP_MASK,
+ X86ISD::VFPROUND, X86ISD::VFPROUND_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtqq2ph_128, TRUNCATE_TO_REG,
+ X86ISD::CVTSI2P, X86ISD::MCVTSI2P),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtqq2ph_256, TRUNCATE_TO_REG,
+ X86ISD::CVTSI2P, X86ISD::MCVTSI2P),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtsd2sh_round, INTR_TYPE_SCALAR_MASK_RND,
+ X86ISD::VFPROUNDS, X86ISD::VFPROUNDS_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtsh2sd_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::VFPEXTS, X86ISD::VFPEXTS_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtsh2ss_round, INTR_TYPE_SCALAR_MASK_SAE,
+ X86ISD::VFPEXTS, X86ISD::VFPEXTS_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtss2sh_round, INTR_TYPE_SCALAR_MASK_RND,
+ X86ISD::VFPROUNDS, X86ISD::VFPROUNDS_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2dq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2dq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2dq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2qq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2qq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2qq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2udq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2udq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2udq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2uqq_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2uqq_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2uqq_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2uw_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2uw_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2UI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2uw_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2w_128, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2w_256, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2w_512, INTR_TYPE_1OP_MASK_SAE,
+ X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtudq2ph_128, TRUNCATE_TO_REG,
+ X86ISD::CVTUI2P, X86ISD::MCVTUI2P),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtuqq2ph_128, TRUNCATE_TO_REG,
+ X86ISD::CVTUI2P, X86ISD::MCVTUI2P),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vcvtuqq2ph_256, TRUNCATE_TO_REG,
+ X86ISD::CVTUI2P, X86ISD::MCVTUI2P),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfcmadd_cph_128, CFMA_OP_MASK, X86ISD::VFCMADDC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfcmadd_cph_256, CFMA_OP_MASK, X86ISD::VFCMADDC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfcmadd_cph_512, CFMA_OP_MASK, X86ISD::VFCMADDC, X86ISD::VFCMADDC_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfcmadd_csh, CFMA_OP_MASK, X86ISD::VFCMADDCSH, X86ISD::VFCMADDCSH_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfcmul_cph_128, INTR_TYPE_2OP_MASK, X86ISD::VFCMULC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfcmul_cph_256, INTR_TYPE_2OP_MASK, X86ISD::VFCMULC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfcmul_cph_512, INTR_TYPE_2OP_MASK, X86ISD::VFCMULC, X86ISD::VFCMULC_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfcmul_csh, INTR_TYPE_SCALAR_MASK, X86ISD::VFCMULCSH, X86ISD::VFCMULCSH_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfmadd_cph_128, CFMA_OP_MASK, X86ISD::VFMADDC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfmadd_cph_256, CFMA_OP_MASK, X86ISD::VFMADDC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfmadd_cph_512, CFMA_OP_MASK, X86ISD::VFMADDC, X86ISD::VFMADDC_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfmadd_csh, CFMA_OP_MASK, X86ISD::VFMADDCSH, X86ISD::VFMADDCSH_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfmul_cph_128, INTR_TYPE_2OP_MASK, X86ISD::VFMULC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfmul_cph_256, INTR_TYPE_2OP_MASK, X86ISD::VFMULC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfmul_cph_512, INTR_TYPE_2OP_MASK, X86ISD::VFMULC, X86ISD::VFMULC_RND),
+ X86_INTRINSIC_DATA(avx512fp16_mask_vfmul_csh, INTR_TYPE_SCALAR_MASK, X86ISD::VFMULCSH, X86ISD::VFMULCSH_RND),
+ X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmadd_cph_128, CFMA_OP_MASKZ, X86ISD::VFCMADDC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmadd_cph_256, CFMA_OP_MASKZ, X86ISD::VFCMADDC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmadd_cph_512, CFMA_OP_MASKZ, X86ISD::VFCMADDC, X86ISD::VFCMADDC_RND),
+ X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmadd_csh, CFMA_OP_MASKZ, X86ISD::VFCMADDCSH, X86ISD::VFCMADDCSH_RND),
+ X86_INTRINSIC_DATA(avx512fp16_maskz_vfmadd_cph_128, CFMA_OP_MASKZ, X86ISD::VFMADDC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_maskz_vfmadd_cph_256, CFMA_OP_MASKZ, X86ISD::VFMADDC, 0),
+ X86_INTRINSIC_DATA(avx512fp16_maskz_vfmadd_cph_512, CFMA_OP_MASKZ, X86ISD::VFMADDC, X86ISD::VFMADDC_RND),
+ X86_INTRINSIC_DATA(avx512fp16_maskz_vfmadd_csh, CFMA_OP_MASKZ, X86ISD::VFMADDCSH, X86ISD::VFMADDCSH_RND),
+ X86_INTRINSIC_DATA(avx512fp16_max_ph_128, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx512fp16_max_ph_256, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(avx512fp16_max_ph_512, INTR_TYPE_2OP_SAE, X86ISD::FMAX, X86ISD::FMAX_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_min_ph_128, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512fp16_min_ph_256, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512fp16_min_ph_512, INTR_TYPE_2OP_SAE, X86ISD::FMIN, X86ISD::FMIN_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_mul_ph_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND),
+ X86_INTRINSIC_DATA(avx512fp16_sqrt_ph_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND),
+ X86_INTRINSIC_DATA(avx512fp16_sub_ph_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND),
+ X86_INTRINSIC_DATA(avx512fp16_vcomi_sh, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
+ /*fp16 scalar convert instruction*/
+ X86_INTRINSIC_DATA(avx512fp16_vcvtsh2si32, INTR_TYPE_1OP, X86ISD::CVTS2SI, X86ISD::CVTS2SI_RND),
+ X86_INTRINSIC_DATA(avx512fp16_vcvtsh2si64, INTR_TYPE_1OP, X86ISD::CVTS2SI, X86ISD::CVTS2SI_RND),
+ X86_INTRINSIC_DATA(avx512fp16_vcvtsh2usi32, INTR_TYPE_1OP, X86ISD::CVTS2UI, X86ISD::CVTS2UI_RND),
+ X86_INTRINSIC_DATA(avx512fp16_vcvtsh2usi64, INTR_TYPE_1OP, X86ISD::CVTS2UI, X86ISD::CVTS2UI_RND),
+ X86_INTRINSIC_DATA(avx512fp16_vcvtsi2sh, INTR_TYPE_2OP,
+ X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512fp16_vcvtsi642sh, INTR_TYPE_2OP,
+ X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512fp16_vcvttsh2si32, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_vcvttsh2si64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_vcvttsh2usi32, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_vcvttsh2usi64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE),
+ X86_INTRINSIC_DATA(avx512fp16_vcvtusi2sh, INTR_TYPE_2OP,
+ X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512fp16_vcvtusi642sh, INTR_TYPE_2OP,
+ X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND),
+ X86_INTRINSIC_DATA(avx512fp16_vfmadd_f16, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512fp16_vfmadd_ph_512, INTR_TYPE_3OP, ISD::FMA, X86ISD::FMADD_RND),
+ X86_INTRINSIC_DATA(avx512fp16_vfmaddsub_ph_128, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512fp16_vfmaddsub_ph_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(avx512fp16_vfmaddsub_ph_512, INTR_TYPE_3OP, X86ISD::FMADDSUB,
+ X86ISD::FMADDSUB_RND),
X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0),
diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
index 165533eba346..4710e524931c 100644
--- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
@@ -306,7 +306,8 @@ bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction(
OptimizeDL = llvm::sys::DynamicLibrary::getPermanentLibrary(
OptimizePluginPath.c_str(), &ErrorMsg);
if (!ErrorMsg.empty())
- report_fatal_error("Failed to load opt plugin: \"" + ErrorMsg + '\"');
+ report_fatal_error(Twine("Failed to load opt plugin: \"") + ErrorMsg +
+ "\"");
OptimizeCut = (OptimizeCutT)OptimizeDL.getAddressOfSymbol("optimize_cut");
if (!OptimizeCut)
report_fatal_error("Invalid optimization plugin");
diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
index 7b6276c1d87e..e562748c98fe 100644
--- a/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
+++ b/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
@@ -76,7 +76,7 @@ bool X86LoadValueInjectionRetHardeningPass::runOnMachineFunction(
bool Modified = false;
for (auto &MBB : MF) {
for (auto MBBI = MBB.begin(); MBBI != MBB.end(); ++MBBI) {
- if (MBBI->getOpcode() != X86::RETQ)
+ if (MBBI->getOpcode() != X86::RET64)
continue;
unsigned ClobberReg = TRI->findDeadCallerSavedReg(MBB, MBBI);
diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
index 248069f4deb4..6b564a0356a6 100644
--- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
@@ -498,8 +498,8 @@ X86LowerAMXIntrinsics::lowerTileDP(Instruction *TileDP) {
Value *ResAMX =
Builder.CreateBitCast(ResVec, Type::getX86_AMXTy(Builder.getContext()));
// Delete TileDP intrinsic and do some clean-up.
- for (auto UI = TileDP->use_begin(), UE = TileDP->use_end(); UI != UE;) {
- Instruction *I = cast<Instruction>((UI++)->getUser());
+ for (Use &U : llvm::make_early_inc_range(TileDP->uses())) {
+ Instruction *I = cast<Instruction>(U.getUser());
Value *Vec;
if (match(I, m_BitCast(m_Value(Vec)))) {
I->replaceAllUsesWith(ResVec);
@@ -542,9 +542,8 @@ bool X86LowerAMXIntrinsics::lowerTileLoadStore(Instruction *TileLoadStore) {
Value *ResAMX =
Builder.CreateBitCast(ResVec, Type::getX86_AMXTy(Builder.getContext()));
// Delete tileloadd6 intrinsic and do some clean-up
- for (auto UI = TileLoadStore->use_begin(), UE = TileLoadStore->use_end();
- UI != UE;) {
- Instruction *I = cast<Instruction>((UI++)->getUser());
+ for (Use &U : llvm::make_early_inc_range(TileLoadStore->uses())) {
+ Instruction *I = cast<Instruction>(U.getUser());
Value *Vec;
if (match(I, m_BitCast(m_Value(Vec)))) {
I->replaceAllUsesWith(ResVec);
@@ -561,8 +560,8 @@ bool X86LowerAMXIntrinsics::lowerTileZero(Instruction *TileZero) {
IRBuilder<> Builder(TileZero);
FixedVectorType *V256I32Ty = FixedVectorType::get(Builder.getInt32Ty(), 256);
Value *VecZero = Constant::getNullValue(V256I32Ty);
- for (auto UI = TileZero->use_begin(), UE = TileZero->use_end(); UI != UE;) {
- Instruction *I = cast<Instruction>((UI++)->getUser());
+ for (Use &U : llvm::make_early_inc_range(TileZero->uses())) {
+ Instruction *I = cast<Instruction>(U.getUser());
Value *Vec;
if (match(I, m_BitCast(m_Value(Vec)))) {
I->replaceAllUsesWith(VecZero);
@@ -631,6 +630,7 @@ bool X86LowerAMXIntrinsics::visit() {
return C;
}
+namespace {
class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass {
public:
static char ID;
@@ -665,6 +665,7 @@ public:
AU.addRequired<TargetPassConfig>();
}
};
+} // namespace
static const char PassName[] = "Lower AMX intrinsics";
char X86LowerAMXIntrinsicsLegacyPass::ID = 0;
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 4ba44ccb6c16..7368b64efd9a 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -40,8 +40,10 @@
//
#include "X86.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -56,66 +58,44 @@
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
using namespace PatternMatch;
#define DEBUG_TYPE "lower-amx-type"
-static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder,
- BasicBlock *BB) {
+static bool isAMXCast(Instruction *II) {
+ return match(II,
+ m_Intrinsic<Intrinsic::x86_cast_vector_to_tile>(m_Value())) ||
+ match(II, m_Intrinsic<Intrinsic::x86_cast_tile_to_vector>(m_Value()));
+}
+
+static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder, BasicBlock *BB,
+ Type *Ty) {
Function &F = *BB->getParent();
Module *M = BB->getModule();
const DataLayout &DL = M->getDataLayout();
- Type *V256I32Ty = VectorType::get(Builder.getInt32Ty(), 256, false);
LLVMContext &Ctx = Builder.getContext();
auto AllocaAlignment = DL.getPrefTypeAlign(Type::getX86_AMXTy(Ctx));
unsigned AllocaAS = DL.getAllocaAddrSpace();
AllocaInst *AllocaRes =
- new AllocaInst(V256I32Ty, AllocaAS, "", &F.getEntryBlock().front());
+ new AllocaInst(Ty, AllocaAS, "", &F.getEntryBlock().front());
AllocaRes->setAlignment(AllocaAlignment);
return AllocaRes;
}
-namespace {
-class X86LowerAMXType {
- Function &Func;
- TargetMachine *TM = nullptr;
-
- // In AMX intrinsics we let Shape = {Row, Col}, but the
- // RealCol = Col / ElementSize. We may use the RealCol
- // as a new Row for other new created AMX intrinsics.
- std::map<Value *, Value *> Col2Row;
-
-public:
- X86LowerAMXType(Function &F, TargetMachine *TargetM) : Func(F), TM(TargetM) {}
- bool visit();
- void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast);
- void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST);
- bool transformBitcast(BitCastInst *Bitcast);
- std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo);
- Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity);
-};
-
-Value *X86LowerAMXType::getRowFromCol(Instruction *II, Value *V,
- unsigned Granularity) {
- if (Col2Row.count(V))
- return Col2Row[V];
- IRBuilder<> Builder(&*II->getParent()->getFirstInsertionPt());
- if (auto *I = dyn_cast<Instruction>(V)) {
- BasicBlock::iterator Iter = I->getIterator();
- ++Iter;
- Builder.SetInsertPoint(&*Iter);
- }
- ConstantInt *Gran = Builder.getInt16(Granularity);
- Value *RealRow = Builder.CreateUDiv(V, Gran);
- Col2Row[V] = RealRow;
- return RealRow;
+static Instruction *getFirstNonAllocaInTheEntryBlock(Function &F) {
+ for (Instruction &I : F.getEntryBlock())
+ if (!isa<AllocaInst>(&I))
+ return &I;
+ llvm_unreachable("No terminator in the entry block!");
}
-std::pair<Value *, Value *> X86LowerAMXType::getShape(IntrinsicInst *II,
- unsigned OpNo) {
+static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
+ IRBuilder<> Builder(II);
Value *Row = nullptr, *Col = nullptr;
switch (II->getIntrinsicID()) {
default:
@@ -144,14 +124,32 @@ std::pair<Value *, Value *> X86LowerAMXType::getShape(IntrinsicInst *II,
Col = II->getArgOperand(2);
break;
case 5:
- Row = II->getArgOperand(2);
- // FIXME: There is a design bug for AMX shape, which the Col should be
- // Col/4 if it will be used as Row, but current Greedy RA can't handle
- // this case well, it may failed if we generate a new Shape definition.
- // So Let's just do it in O0 first.
- // Row = Row / 4
- if (TM->getOptLevel() == CodeGenOpt::None)
- Row = getRowFromCol(II, Row, 4);
+ if (isa<ConstantInt>(II->getArgOperand(2)))
+ Row = Builder.getInt16(
+ (cast<ConstantInt>(II->getOperand(2))->getSExtValue()) / 4);
+ else if (isa<Instruction>(II->getArgOperand(2))) {
+ // When it is not a const value and it is not a function argument, we
+ // create Row after the definition of II->getOperand(2) instead of
+      // before II. For example, if II is %118, we get the shape for %117:
+ // %117 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x
+ // i32> %115).
+ // %118 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16
+ // %104, i16 %105, i16 %106, x86_amx %110, x86_amx %114, x86_amx
+ // %117).
+      // If we create %row = udiv i16 %106, 4 before %118 (i.e. II), then its
+      // definition is after its user (the new tileload for %117).
+ // So, the best choice is to create %row right after the definition of
+ // %106.
+ Builder.SetInsertPoint(cast<Instruction>(II->getOperand(2)));
+ Row = Builder.CreateUDiv(II->getOperand(2), Builder.getInt16(4));
+ cast<Instruction>(Row)->moveAfter(cast<Instruction>(II->getOperand(2)));
+ } else {
+ // When it is not a const value and it is a function argument, we create
+ // Row at the entry bb.
+ IRBuilder<> NewBuilder(
+ getFirstNonAllocaInTheEntryBlock(*II->getFunction()));
+ Row = NewBuilder.CreateUDiv(II->getOperand(2), NewBuilder.getInt16(4));
+ }
Col = II->getArgOperand(1);
break;
}
@@ -162,6 +160,23 @@ std::pair<Value *, Value *> X86LowerAMXType::getShape(IntrinsicInst *II,
return std::make_pair(Row, Col);
}
+namespace {
+class X86LowerAMXType {
+ Function &Func;
+
+ // In AMX intrinsics we let Shape = {Row, Col}, but the
+ // RealCol = Col / ElementSize. We may use the RealCol
+  // as a new Row for other newly created AMX intrinsics.
+ std::map<Value *, Value *> Col2Row;
+
+public:
+ X86LowerAMXType(Function &F) : Func(F) {}
+ bool visit();
+ void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast);
+ void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST);
+ bool transformBitcast(BitCastInst *Bitcast);
+};
+
// %src = load <256 x i32>, <256 x i32>* %addr, align 64
// %2 = bitcast <256 x i32> %src to x86_amx
// -->
@@ -230,8 +245,8 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) {
Value *I8Ptr, *Stride;
auto *Src = Bitcast->getOperand(0);
- auto Prepare = [&]() {
- AllocaAddr = createAllocaInstAtEntry(Builder, Bitcast->getParent());
+ auto Prepare = [&](Type *MemTy) {
+ AllocaAddr = createAllocaInstAtEntry(Builder, Bitcast->getParent(), MemTy);
I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getInt8PtrTy());
Stride = Builder.getInt64(64);
};
@@ -250,7 +265,7 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) {
auto *II = dyn_cast<IntrinsicInst>(U.getUser());
if (!II)
return false; // May be bitcast from x86amx to <256 x i32>.
- Prepare();
+ Prepare(Bitcast->getOperand(0)->getType());
Builder.CreateStore(Src, AllocaAddr);
// TODO we can pick an constant operand for the shape.
Value *Row = nullptr, *Col = nullptr;
@@ -270,7 +285,7 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) {
auto *II = dyn_cast<IntrinsicInst>(Src);
if (!II)
return false; // May be bitcast from <256 x i32> to x86amx.
- Prepare();
+ Prepare(Bitcast->getType());
Value *Row = II->getOperand(0);
Value *Col = II->getOperand(1);
std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Src};
@@ -287,9 +302,7 @@ bool X86LowerAMXType::visit() {
Col2Row.clear();
for (BasicBlock *BB : post_order(&Func)) {
- for (BasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend();
- II != IE;) {
- Instruction &Inst = *II++;
+ for (Instruction &Inst : llvm::make_early_inc_range(llvm::reverse(*BB))) {
auto *Bitcast = dyn_cast<BitCastInst>(&Inst);
if (!Bitcast)
continue;
@@ -332,10 +345,8 @@ bool X86LowerAMXType::visit() {
continue;
}
StoreInst *ST = nullptr;
- for (auto UI = Bitcast->use_begin(), UE = Bitcast->use_end();
- UI != UE;) {
- Value *I = (UI++)->getUser();
- ST = dyn_cast<StoreInst>(I);
+ for (Use &U : Bitcast->uses()) {
+ ST = dyn_cast<StoreInst>(U.getUser());
if (ST)
break;
}
@@ -637,6 +648,366 @@ bool X86VolatileTileData::volatileTileData() {
namespace {
+class X86LowerAMXCast {
+ Function &Func;
+
+public:
+ X86LowerAMXCast(Function &F) : Func(F) {}
+ bool combineAMXcast(TargetLibraryInfo *TLI);
+ bool transformAMXCast(IntrinsicInst *AMXCast);
+ bool transformAllAMXCast();
+ bool optimizeAMXCastFromPhi(IntrinsicInst *CI, PHINode *PN,
+ SmallSetVector<Instruction *, 16> &DeadInst);
+};
+
+static bool DCEInstruction(Instruction *I,
+ SmallSetVector<Instruction *, 16> &WorkList,
+ const TargetLibraryInfo *TLI) {
+ if (isInstructionTriviallyDead(I, TLI)) {
+ salvageDebugInfo(*I);
+ salvageKnowledge(I);
+
+ // Null out all of the instruction's operands to see if any operand becomes
+ // dead as we go.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ Value *OpV = I->getOperand(i);
+ I->setOperand(i, nullptr);
+
+ if (!OpV->use_empty() || I == OpV)
+ continue;
+
+ // If the operand is an instruction that became dead as we nulled out the
+ // operand, and if it is 'trivially' dead, delete it in a future loop
+ // iteration.
+ if (Instruction *OpI = dyn_cast<Instruction>(OpV)) {
+ if (isInstructionTriviallyDead(OpI, TLI)) {
+ WorkList.insert(OpI);
+ }
+ }
+ }
+ I->eraseFromParent();
+ return true;
+ }
+ return false;
+}
+
+/// This function handles the following case:
+///
+/// A -> B amxcast
+/// PHI
+/// B -> A amxcast
+///
+/// All the related PHI nodes can be replaced by new PHI nodes with type A.
+/// The uses of \p CI can be changed to the new PHI node corresponding to \p PN.
+bool X86LowerAMXCast::optimizeAMXCastFromPhi(
+ IntrinsicInst *CI, PHINode *PN,
+ SmallSetVector<Instruction *, 16> &DeadInst) {
+ IRBuilder<> Builder(CI);
+ Value *Src = CI->getOperand(0);
+ Type *SrcTy = Src->getType(); // Type B
+ Type *DestTy = CI->getType(); // Type A
+
+ SmallVector<PHINode *, 4> PhiWorklist;
+ SmallSetVector<PHINode *, 4> OldPhiNodes;
+
+ // Find all of the A->B casts and PHI nodes.
+  // We need to inspect all related PHI nodes, but PHIs can be cyclic, so
+  // OldPhiNodes is used to track all known PHI nodes; before adding a new
+  // PHI to PhiWorklist, it is checked against and added to OldPhiNodes first.
+ PhiWorklist.push_back(PN);
+ OldPhiNodes.insert(PN);
+ while (!PhiWorklist.empty()) {
+ auto *OldPN = PhiWorklist.pop_back_val();
+ for (Value *IncValue : OldPN->incoming_values()) {
+      // TODO: currently, we ignore cases where the incoming value is a
+      // constant. In the future, we might support constants.
+ if (isa<Constant>(IncValue))
+ return false;
+
+ if (auto *PNode = dyn_cast<PHINode>(IncValue)) {
+ if (OldPhiNodes.insert(PNode))
+ PhiWorklist.push_back(PNode);
+ continue;
+ }
+ Instruction *ACI = dyn_cast<Instruction>(IncValue);
+ if (ACI && isAMXCast(ACI)) {
+ // Verify it's a A->B cast.
+ Type *TyA = ACI->getOperand(0)->getType();
+ Type *TyB = ACI->getType();
+ if (TyA != DestTy || TyB != SrcTy)
+ return false;
+ continue;
+ }
+ return false;
+ }
+ }
+
+ // Check that each user of each old PHI node is something that we can
+ // rewrite, so that all of the old PHI nodes can be cleaned up afterwards.
+ for (auto *OldPN : OldPhiNodes) {
+ for (User *V : OldPN->users()) {
+ Instruction *ACI = dyn_cast<Instruction>(V);
+ if (ACI && isAMXCast(ACI)) {
+ // Verify it's a B->A cast.
+ Type *TyB = ACI->getOperand(0)->getType();
+ Type *TyA = ACI->getType();
+ if (TyA != DestTy || TyB != SrcTy)
+ return false;
+ } else if (auto *PHI = dyn_cast<PHINode>(V)) {
+ // As long as the user is another old PHI node, then even if we don't
+ // rewrite it, the PHI web we're considering won't have any users
+ // outside itself, so it'll be dead.
+ // example:
+ // bb.0:
+ // %0 = amxcast ...
+ // bb.1:
+ // %1 = amxcast ...
+ // bb.2:
+ // %goodphi = phi %0, %1
+ // %3 = amxcast %goodphi
+ // bb.3:
+ // %goodphi2 = phi %0, %goodphi
+ // %4 = amxcast %goodphi2
+        // When optimizeAMXCastFromPhi processes %3 and %goodphi, %goodphi2 is
+        // outside the phi-web, so the combination stops. When
+        // optimizeAMXCastFromPhi processes %4 and %goodphi2, the optimization
+        // will be done.
+ if (OldPhiNodes.count(PHI) == 0)
+ return false;
+ } else
+ return false;
+ }
+ }
+
+ // For each old PHI node, create a corresponding new PHI node with a type A.
+ SmallDenseMap<PHINode *, PHINode *> NewPNodes;
+ for (auto *OldPN : OldPhiNodes) {
+ Builder.SetInsertPoint(OldPN);
+ PHINode *NewPN = Builder.CreatePHI(DestTy, OldPN->getNumOperands());
+ NewPNodes[OldPN] = NewPN;
+ }
+
+ // Fill in the operands of new PHI nodes.
+ for (auto *OldPN : OldPhiNodes) {
+ PHINode *NewPN = NewPNodes[OldPN];
+ for (unsigned j = 0, e = OldPN->getNumOperands(); j != e; ++j) {
+ Value *V = OldPN->getOperand(j);
+ Value *NewV = nullptr;
+ Instruction *ACI = dyn_cast<Instruction>(V);
+      // There should not be an AMXCast from a constant.
+ if (ACI && isAMXCast(ACI))
+ NewV = ACI->getOperand(0);
+ else if (auto *PrevPN = dyn_cast<PHINode>(V))
+ NewV = NewPNodes[PrevPN];
+ assert(NewV);
+ NewPN->addIncoming(NewV, OldPN->getIncomingBlock(j));
+ }
+ }
+
+  // Traverse all accumulated PHI nodes and process their users, which are
+  // Stores and BitCasts. Without this processing, NewPHI nodes could be
+  // replicated and could lead to extra moves generated after DeSSA.
+  // If there is a store with type B, change it to type A.
+
+ // Replace users of BitCast B->A with NewPHI. These will help
+ // later to get rid of a closure formed by OldPHI nodes.
+ for (auto *OldPN : OldPhiNodes) {
+ PHINode *NewPN = NewPNodes[OldPN];
+ for (User *V : make_early_inc_range(OldPN->users())) {
+ Instruction *ACI = dyn_cast<Instruction>(V);
+ if (ACI && isAMXCast(ACI)) {
+ Type *TyB = ACI->getOperand(0)->getType();
+ Type *TyA = ACI->getType();
+ assert(TyA == DestTy && TyB == SrcTy);
+ (void)TyA;
+ (void)TyB;
+ ACI->replaceAllUsesWith(NewPN);
+ DeadInst.insert(ACI);
+ } else if (auto *PHI = dyn_cast<PHINode>(V)) {
+        // We don't need to push the PHINode into DeadInst since it is an
+        // operand of rootPN; DCE can safely delete rootPN's operands if
+        // rootPN is dead.
+ assert(OldPhiNodes.contains(PHI));
+ (void)PHI;
+ } else
+ llvm_unreachable("all uses should be handled");
+ }
+ }
+ return true;
+}
+
+bool X86LowerAMXCast::combineAMXcast(TargetLibraryInfo *TLI) {
+ bool Change = false;
+ // Collect tile cast instruction.
+ SmallVector<Instruction *, 8> Vec2TileInsts;
+ SmallVector<Instruction *, 8> Tile2VecInsts;
+ SmallVector<Instruction *, 8> PhiCastWorkList;
+ SmallSetVector<Instruction *, 16> DeadInst;
+ for (BasicBlock &BB : Func) {
+ for (Instruction &I : BB) {
+ Value *Vec;
+ if (match(&I,
+ m_Intrinsic<Intrinsic::x86_cast_vector_to_tile>(m_Value(Vec))))
+ Vec2TileInsts.push_back(&I);
+ else if (match(&I, m_Intrinsic<Intrinsic::x86_cast_tile_to_vector>(
+ m_Value(Vec))))
+ Tile2VecInsts.push_back(&I);
+ }
+ }
+
+ auto Convert = [&](SmallVectorImpl<Instruction *> &Insts, Intrinsic::ID IID) {
+ for (auto *Inst : Insts) {
+ for (User *U : Inst->users()) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
+ if (!II || II->getIntrinsicID() != IID)
+ continue;
+ // T1 = vec2tile V0
+ // V2 = tile2vec T1
+ // V3 = OP V2
+ // -->
+ // T1 = vec2tile V0
+ // V2 = tile2vec T1
+ // V3 = OP V0
+ II->replaceAllUsesWith(Inst->getOperand(0));
+ Change = true;
+ }
+ }
+ };
+
+ Convert(Vec2TileInsts, Intrinsic::x86_cast_tile_to_vector);
+ Convert(Tile2VecInsts, Intrinsic::x86_cast_vector_to_tile);
+
+ auto EraseInst = [&](SmallVectorImpl<Instruction *> &Insts) {
+ for (auto *Inst : Insts) {
+ if (Inst->use_empty()) {
+ Inst->eraseFromParent();
+ Change = true;
+ }
+ }
+ };
+
+ EraseInst(Vec2TileInsts);
+ EraseInst(Tile2VecInsts);
+
+ // Handle the A->B->A cast, and there is an intervening PHI node.
+ for (BasicBlock &BB : Func) {
+ for (Instruction &I : BB) {
+ if (isAMXCast(&I)) {
+ if (isa<PHINode>(I.getOperand(0)))
+ PhiCastWorkList.push_back(&I);
+ }
+ }
+ }
+ for (auto *I : PhiCastWorkList) {
+    // We skip the dead AMXCast.
+ if (DeadInst.contains(I))
+ continue;
+ PHINode *PN = cast<PHINode>(I->getOperand(0));
+ if (optimizeAMXCastFromPhi(cast<IntrinsicInst>(I), PN, DeadInst)) {
+ DeadInst.insert(PN);
+ Change = true;
+ }
+ }
+
+  // Since we create new PHIs and merge AMXCasts, some old PHIs and AMXCasts
+  // might have no uses. We do some dead code elimination for them.
+ while (!DeadInst.empty()) {
+ Instruction *I = DeadInst.pop_back_val();
+ Change |= DCEInstruction(I, DeadInst, TLI);
+ }
+ return Change;
+}
+
+// There might be remaining AMXCasts after combineAMXcast, and they should be
+// handled elegantly.
+bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) {
+ IRBuilder<> Builder(AMXCast);
+ AllocaInst *AllocaAddr;
+ Value *I8Ptr, *Stride;
+ auto *Src = AMXCast->getOperand(0);
+
+ auto Prepare = [&](Type *MemTy) {
+ AllocaAddr = createAllocaInstAtEntry(Builder, AMXCast->getParent(), MemTy);
+ I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getInt8PtrTy());
+ Stride = Builder.getInt64(64);
+ };
+
+ if (AMXCast->getType()->isX86_AMXTy()) {
+ // %2 = amxcast <225 x i32> %src to x86_amx
+ // call void @llvm.x86.tilestored64.internal(i16 15, i16 60,
+ // i8* %addr3, i64 60, x86_amx %2)
+ // -->
+ // %addr = alloca <225 x i32>, align 64
+ // store <225 x i32> %src, <225 x i32>* %addr, align 64
+ // %addr2 = bitcast <225 x i32>* %addr to i8*
+ // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 15, i16 60,
+ // i8* %addr2,
+ // i64 60)
+ // call void @llvm.x86.tilestored64.internal(i16 15, i16 60,
+ // i8* %addr3, i64 60, x86_amx %2)
+ Use &U = *(AMXCast->use_begin());
+ unsigned OpNo = U.getOperandNo();
+ auto *II = dyn_cast<IntrinsicInst>(U.getUser());
+ if (!II)
+ return false; // May be bitcast from x86amx to <256 x i32>.
+ Prepare(AMXCast->getOperand(0)->getType());
+ Builder.CreateStore(Src, AllocaAddr);
+    // TODO: we can pick a constant operand for the shape.
+ Value *Row = nullptr, *Col = nullptr;
+ std::tie(Row, Col) = getShape(II, OpNo);
+ std::array<Value *, 4> Args = {
+ Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty())};
+ Value *NewInst = Builder.CreateIntrinsic(
+ Intrinsic::x86_tileloadd64_internal, None, Args);
+ AMXCast->replaceAllUsesWith(NewInst);
+ AMXCast->eraseFromParent();
+ } else {
+ // %2 = amxcast x86_amx %src to <225 x i32>
+ // -->
+ // %addr = alloca <225 x i32>, align 64
+ // %addr2 = bitcast <225 x i32>* to i8*
+ // call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col,
+ // i8* %addr2, i64 %stride)
+ // %2 = load <225 x i32>, <225 x i32>* %addr, align 64
+ auto *II = dyn_cast<IntrinsicInst>(Src);
+ if (!II)
+ return false; // May be bitcast from <256 x i32> to x86amx.
+ Prepare(AMXCast->getType());
+ Value *Row = II->getOperand(0);
+ Value *Col = II->getOperand(1);
+ std::array<Value *, 5> Args = {
+ Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty()), Src};
+ Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
+ Value *NewInst = Builder.CreateLoad(AMXCast->getType(), AllocaAddr);
+ AMXCast->replaceAllUsesWith(NewInst);
+ AMXCast->eraseFromParent();
+ }
+
+ return true;
+}
+
+bool X86LowerAMXCast::transformAllAMXCast() {
+ bool Change = false;
+ // Collect tile cast instruction.
+ SmallVector<Instruction *, 8> WorkLists;
+ for (BasicBlock &BB : Func) {
+ for (Instruction &I : BB) {
+ if (isAMXCast(&I))
+ WorkLists.push_back(&I);
+ }
+ }
+
+ for (auto *Inst : WorkLists) {
+ Change |= transformAMXCast(cast<IntrinsicInst>(Inst));
+ }
+
+ return Change;
+}
+
+} // anonymous namespace
+
+namespace {
+
class X86LowerAMXTypeLegacyPass : public FunctionPass {
public:
static char ID;
@@ -646,10 +1017,18 @@ public:
}
bool runOnFunction(Function &F) override {
+ bool C = false;
TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+ TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ X86LowerAMXCast LAC(F);
+ C |= LAC.combineAMXcast(TLI);
+    // There might be remaining AMXCasts after combineAMXcast, and they should
+    // be handled elegantly.
+ C |= LAC.transformAllAMXCast();
- X86LowerAMXType LAT(F, TM);
- bool C = LAT.visit();
+ X86LowerAMXType LAT(F);
+ C |= LAT.visit();
// Prepare for fast register allocation at O0.
// Todo: May better check the volatile model of AMX code, not just
@@ -671,6 +1050,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<TargetPassConfig>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
};
@@ -681,6 +1061,7 @@ char X86LowerAMXTypeLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
false)
diff --git a/llvm/lib/Target/X86/X86LowerTileCopy.cpp b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
index 03692d195768..d6b42145859d 100644
--- a/llvm/lib/Target/X86/X86LowerTileCopy.cpp
+++ b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
@@ -75,9 +75,7 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
- for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
- MII != MIE;) {
- MachineInstr &MI = *MII++;
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
if (!MI.isCopy())
continue;
MachineOperand &DstMO = MI.getOperand(0);
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 7d916f917d5e..c3cd634612a4 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -43,8 +43,11 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Instrumentation/AddressSanitizer.h"
+#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
using namespace llvm;
@@ -274,6 +277,9 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
case X86II::MO_GOTPCREL:
RefKind = MCSymbolRefExpr::VK_GOTPCREL;
break;
+ case X86II::MO_GOTPCREL_NORELAX:
+ RefKind = MCSymbolRefExpr::VK_GOTPCREL_NORELAX;
+ break;
case X86II::MO_GOT:
RefKind = MCSymbolRefExpr::VK_GOT;
break;
@@ -418,7 +424,7 @@ static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
}
static unsigned getRetOpcode(const X86Subtarget &Subtarget) {
- return Subtarget.is64Bit() ? X86::RETQ : X86::RETL;
+ return Subtarget.is64Bit() ? X86::RET64 : X86::RET32;
}
Optional<MCOperand>
@@ -1094,11 +1100,11 @@ static unsigned emitNop(MCStreamer &OS, unsigned NumBytes,
if (Subtarget->is64Bit()) {
// FIXME: We can use NOOPL on 32-bit targets with FeatureNOPL, but the
// IndexReg/BaseReg below need to be updated.
- if (Subtarget->hasFeature(X86::FeatureFast7ByteNOP))
+ if (Subtarget->hasFeature(X86::TuningFast7ByteNOP))
MaxNopLength = 7;
- else if (Subtarget->hasFeature(X86::FeatureFast15ByteNOP))
+ else if (Subtarget->hasFeature(X86::TuningFast15ByteNOP))
MaxNopLength = 15;
- else if (Subtarget->hasFeature(X86::FeatureFast11ByteNOP))
+ else if (Subtarget->hasFeature(X86::TuningFast11ByteNOP))
MaxNopLength = 11;
else
MaxNopLength = 10;
@@ -1323,6 +1329,244 @@ void X86AsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI,
.addExpr(Op));
}
+void X86AsmPrinter::LowerASAN_CHECK_MEMACCESS(const MachineInstr &MI) {
+ // FIXME: Make this work on non-ELF.
+ if (!TM.getTargetTriple().isOSBinFormatELF()) {
+ report_fatal_error("llvm.asan.check.memaccess only supported on ELF");
+ return;
+ }
+
+ unsigned Reg = MI.getOperand(0).getReg().id();
+ ASanAccessInfo AccessInfo(MI.getOperand(1).getImm());
+
+ MCSymbol *&Sym =
+ AsanMemaccessSymbols[AsanMemaccessTuple(Reg, AccessInfo.Packed)];
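+  // The check routine is named __asan_check_{load,store}<Size>_rn<Reg> and is
+  // emitted lazily, once per (register, access info) pair, in
+  // emitAsanMemaccessSymbols().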
+ if (!Sym) {
+ std::string Name = AccessInfo.IsWrite ? "store" : "load";
+ std::string SymName = "__asan_check_" + Name +
+ utostr(1ULL << AccessInfo.AccessSizeIndex) + "_rn" +
+ utostr(Reg);
+ Sym = OutContext.getOrCreateSymbol(SymName);
+ }
+
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::CALL64pcrel32)
+ .addExpr(MCSymbolRefExpr::create(Sym, OutContext)));
+}
+
+void X86AsmPrinter::emitAsanMemaccessPartial(Module &M, unsigned Reg,
+ const ASanAccessInfo &AccessInfo,
+ MCSubtargetInfo &STI) {
+ assert(AccessInfo.AccessSizeIndex == 0 || AccessInfo.AccessSizeIndex == 1 ||
+ AccessInfo.AccessSizeIndex == 2);
+ assert(Reg != X86::R8);
+
+ uint64_t ShadowBase;
+ int MappingScale;
+ bool OrShadowOffset;
+ getAddressSanitizerParams(
+ Triple(M.getTargetTriple()), M.getDataLayout().getPointerSizeInBits(),
+ AccessInfo.CompileKernel, &ShadowBase, &MappingScale, &OrShadowOffset);
+
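+
+  // Compute the shadow byte address in R8: shadow = Addr >> MappingScale,
+  // with ShadowBase either OR-ed in or folded into the load displacement.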
+ OutStreamer->emitInstruction(
+ MCInstBuilder(X86::MOV64rr).addReg(X86::R8).addReg(X86::NoRegister + Reg),
+ STI);
+ OutStreamer->emitInstruction(MCInstBuilder(X86::SHR64ri)
+ .addReg(X86::R8)
+ .addReg(X86::R8)
+ .addImm(MappingScale),
+ STI);
+ if (OrShadowOffset) {
+ OutStreamer->emitInstruction(MCInstBuilder(X86::OR64ri32)
+ .addReg(X86::R8)
+ .addReg(X86::R8)
+ .addImm(ShadowBase),
+ STI);
+ OutStreamer->emitInstruction(MCInstBuilder(X86::MOV8rm)
+ .addReg(X86::R8B)
+ .addReg(X86::R8)
+ .addImm(1)
+ .addReg(X86::NoRegister)
+ .addImm(0)
+ .addReg(X86::NoRegister),
+ STI);
+ OutStreamer->emitInstruction(
+ MCInstBuilder(X86::TEST8rr).addReg(X86::R8B).addReg(X86::R8B), STI);
+ } else {
+ OutStreamer->emitInstruction(MCInstBuilder(X86::MOVSX32rm8)
+ .addReg(X86::R8D)
+ .addReg(X86::R8)
+ .addImm(1)
+ .addReg(X86::NoRegister)
+ .addImm(ShadowBase)
+ .addReg(X86::NoRegister),
+ STI);
+ OutStreamer->emitInstruction(
+ MCInstBuilder(X86::TEST32rr).addReg(X86::R8D).addReg(X86::R8D), STI);
+ }
+ MCSymbol *AdditionalCheck = OutContext.createTempSymbol();
+ OutStreamer->emitInstruction(
+ MCInstBuilder(X86::JCC_1)
+ .addExpr(MCSymbolRefExpr::create(AdditionalCheck, OutContext))
+ .addImm(X86::COND_NE),
+ STI);
+ MCSymbol *ReturnSym = OutContext.createTempSymbol();
+ OutStreamer->emitLabel(ReturnSym);
+ OutStreamer->emitInstruction(MCInstBuilder(getRetOpcode(*Subtarget)), STI);
+
+ // Shadow byte is non-zero so we need to perform additional checks.
+ OutStreamer->emitLabel(AdditionalCheck);
+ OutStreamer->emitInstruction(MCInstBuilder(X86::PUSH64r).addReg(X86::RCX),
+ STI);
+ OutStreamer->emitInstruction(MCInstBuilder(X86::MOV64rr)
+ .addReg(X86::RCX)
+ .addReg(X86::NoRegister + Reg),
+ STI);
+ const size_t Granularity = 1ULL << MappingScale;
+ OutStreamer->emitInstruction(MCInstBuilder(X86::AND32ri8)
+ .addReg(X86::NoRegister)
+ .addReg(X86::ECX)
+ .addImm(Granularity - 1),
+ STI);
+ if (AccessInfo.AccessSizeIndex == 1) {
+ OutStreamer->emitInstruction(MCInstBuilder(X86::ADD32ri8)
+ .addReg(X86::NoRegister)
+ .addReg(X86::ECX)
+ .addImm(1),
+ STI);
+ } else if (AccessInfo.AccessSizeIndex == 2) {
+ OutStreamer->emitInstruction(MCInstBuilder(X86::ADD32ri8)
+ .addReg(X86::NoRegister)
+ .addReg(X86::ECX)
+ .addImm(3),
+ STI);
+ }
+
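+  // ECX now holds the offset of the last byte of the access within its shadow
+  // granule; the access is fine if that offset is below the shadow value (the
+  // number of addressable bytes in the granule).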
+ OutStreamer->emitInstruction(
+ MCInstBuilder(X86::CMP32rr).addReg(X86::ECX).addReg(X86::R8D).addImm(1),
+ STI);
+ OutStreamer->emitInstruction(MCInstBuilder(X86::POP64r).addReg(X86::RCX),
+ STI);
+ OutStreamer->emitInstruction(
+ MCInstBuilder(X86::JCC_1)
+ .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext))
+ .addImm(X86::COND_L),
+ STI);
+
+ emitAsanReportError(M, Reg, AccessInfo, STI);
+}
+
+void X86AsmPrinter::emitAsanMemaccessFull(Module &M, unsigned Reg,
+ const ASanAccessInfo &AccessInfo,
+ MCSubtargetInfo &STI) {
+ assert(AccessInfo.AccessSizeIndex == 3 || AccessInfo.AccessSizeIndex == 4);
+ assert(Reg != X86::R8);
+
+ uint64_t ShadowBase;
+ int MappingScale;
+ bool OrShadowOffset;
+ getAddressSanitizerParams(
+ Triple(M.getTargetTriple()), M.getDataLayout().getPointerSizeInBits(),
+ AccessInfo.CompileKernel, &ShadowBase, &MappingScale, &OrShadowOffset);
+
+ OutStreamer->emitInstruction(
+ MCInstBuilder(X86::MOV64rr).addReg(X86::R8).addReg(X86::NoRegister + Reg),
+ STI);
+ OutStreamer->emitInstruction(MCInstBuilder(X86::SHR64ri)
+ .addReg(X86::R8)
+ .addReg(X86::R8)
+ .addImm(MappingScale),
+ STI);
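+
+  // An 8- or 16-byte access covers whole shadow granules, so the shadow byte
+  // (CMP8mi) or shadow word (CMP16mi8) must compare equal to zero; otherwise
+  // branch to the error-reporting path.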
+ if (OrShadowOffset) {
+ OutStreamer->emitInstruction(MCInstBuilder(X86::OR64ri32)
+ .addReg(X86::R8)
+ .addReg(X86::R8)
+ .addImm(ShadowBase),
+ STI);
+ auto OpCode = AccessInfo.AccessSizeIndex == 3 ? X86::CMP8mi : X86::CMP16mi8;
+ OutStreamer->emitInstruction(MCInstBuilder(OpCode)
+ .addReg(X86::R8)
+ .addImm(1)
+ .addReg(X86::NoRegister)
+ .addImm(0)
+ .addReg(X86::NoRegister)
+ .addImm(0),
+ STI);
+ } else {
+ auto OpCode = AccessInfo.AccessSizeIndex == 3 ? X86::CMP8mi : X86::CMP16mi8;
+ OutStreamer->emitInstruction(MCInstBuilder(OpCode)
+ .addReg(X86::R8)
+ .addImm(1)
+ .addReg(X86::NoRegister)
+ .addImm(ShadowBase)
+ .addReg(X86::NoRegister)
+ .addImm(0),
+ STI);
+ }
+ MCSymbol *ReportCode = OutContext.createTempSymbol();
+ OutStreamer->emitInstruction(
+ MCInstBuilder(X86::JCC_1)
+ .addExpr(MCSymbolRefExpr::create(ReportCode, OutContext))
+ .addImm(X86::COND_NE),
+ STI);
+ MCSymbol *ReturnSym = OutContext.createTempSymbol();
+ OutStreamer->emitLabel(ReturnSym);
+ OutStreamer->emitInstruction(MCInstBuilder(getRetOpcode(*Subtarget)), STI);
+
+ OutStreamer->emitLabel(ReportCode);
+ emitAsanReportError(M, Reg, AccessInfo, STI);
+}
+
+void X86AsmPrinter::emitAsanReportError(Module &M, unsigned Reg,
+ const ASanAccessInfo &AccessInfo,
+ MCSubtargetInfo &STI) {
+ std::string Name = AccessInfo.IsWrite ? "store" : "load";
+ MCSymbol *ReportError = OutContext.getOrCreateSymbol(
+ "__asan_report_" + Name + utostr(1ULL << AccessInfo.AccessSizeIndex));
+ OutStreamer->emitInstruction(MCInstBuilder(X86::MOV64rr)
+ .addReg(X86::RDI)
+ .addReg(X86::NoRegister + Reg),
+ STI);
+ OutStreamer->emitInstruction(
+ MCInstBuilder(X86::JMP_4)
+ .addExpr(MCSymbolRefExpr::create(ReportError, MCSymbolRefExpr::VK_PLT,
+ OutContext)),
+ STI);
+}
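The reporting thunk moves the faulting address into RDI (the first SysV AMD64 argument register) and tail-jumps through the PLT to the matching ASan runtime entry; the symbol name encodes the access kind plus the size 1 << AccessSizeIndex. A small illustration of the name construction used above:

#include "llvm/ADT/StringExtras.h"   // llvm::utostr, as used above
// Example: the callee built for a 4-byte write (IsWrite == true, AccessSizeIndex == 2).
std::string Name = /*IsWrite=*/true ? "store" : "load";
std::string Callee = "__asan_report_" + Name + llvm::utostr(1ULL << 2);
// Callee == "__asan_report_store4"; an 8-byte load would give "__asan_report_load8".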
+
+void X86AsmPrinter::emitAsanMemaccessSymbols(Module &M) {
+ if (AsanMemaccessSymbols.empty())
+ return;
+
+ const Triple &TT = TM.getTargetTriple();
+ assert(TT.isOSBinFormatELF());
+ std::unique_ptr<MCSubtargetInfo> STI(
+ TM.getTarget().createMCSubtargetInfo(TT.str(), "", ""));
+ assert(STI && "Unable to create subtarget info");
+
+ for (auto &P : AsanMemaccessSymbols) {
+ MCSymbol *Sym = P.second;
+ OutStreamer->SwitchSection(OutContext.getELFSection(
+ ".text.hot", ELF::SHT_PROGBITS,
+ ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0, Sym->getName(),
+ /*IsComdat=*/true));
+
+ OutStreamer->emitSymbolAttribute(Sym, MCSA_ELF_TypeFunction);
+ OutStreamer->emitSymbolAttribute(Sym, MCSA_Weak);
+ OutStreamer->emitSymbolAttribute(Sym, MCSA_Hidden);
+ OutStreamer->emitLabel(Sym);
+
+ unsigned Reg = std::get<0>(P.first);
+ ASanAccessInfo AccessInfo(std::get<1>(P.first));
+
+ if (AccessInfo.AccessSizeIndex < 3) {
+ emitAsanMemaccessPartial(M, Reg, AccessInfo, *STI);
+ } else {
+ emitAsanMemaccessFull(M, Reg, AccessInfo, *STI);
+ }
+ }
+}
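emitAsanMemaccessSymbols only materialises callbacks that were actually requested; the requests come from lowering the ASAN_CHECK_MEMACCESS pseudo (see the emitInstruction case further down). The real LowerASAN_CHECK_MEMACCESS body is not part of this excerpt, so the following is only a hedged sketch of the shape such a hook could take; the map key type and the callback-name scheme are assumptions made for illustration:

// Hypothetical sketch: intern one weak, hidden callback per
// (register, packed ASanAccessInfo) pair, then call it.
void X86AsmPrinter::LowerASAN_CHECK_MEMACCESS(const MachineInstr &MI) {
  unsigned Reg = MI.getOperand(0).getReg();      // pointer to be checked
  uint32_t Packed = MI.getOperand(1).getImm();   // packed ASanAccessInfo
  MCSymbol *&Sym = AsanMemaccessSymbols[std::make_tuple(Reg, Packed)];
  if (!Sym)
    Sym = OutContext.getOrCreateSymbol("__asan_check_r" + Twine(Reg) + "_" +
                                       Twine(Packed)); // illustrative name only
  EmitAndCountInstruction(
      MCInstBuilder(X86::CALL64pcrel32)
          .addExpr(MCSymbolRefExpr::create(Sym, OutContext)));
}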
+
void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
X86MCInstLower &MCIL) {
// PATCHABLE_OP minsize, opcode, operands
@@ -1477,7 +1721,7 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
// First we emit the label and the jump.
auto CurSled = OutContext.createTempSymbol("xray_event_sled_", true);
OutStreamer->AddComment("# XRay Custom Event Log");
- OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitCodeAlignment(2, &getSubtargetInfo());
OutStreamer->emitLabel(CurSled);
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
@@ -1573,7 +1817,7 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
// First we emit the label and the jump.
auto CurSled = OutContext.createTempSymbol("xray_typed_event_sled_", true);
OutStreamer->AddComment("# XRay Typed Event Log");
- OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitCodeAlignment(2, &getSubtargetInfo());
OutStreamer->emitLabel(CurSled);
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
@@ -1675,7 +1919,7 @@ void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
// call <relative offset, 32-bits> // 5 bytes
//
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
- OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitCodeAlignment(2, &getSubtargetInfo());
OutStreamer->emitLabel(CurSled);
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
@@ -1705,7 +1949,7 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
//
// This just makes sure that the alignment for the next instruction is 2.
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
- OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitCodeAlignment(2, &getSubtargetInfo());
OutStreamer->emitLabel(CurSled);
unsigned OpCode = MI.getOperand(0).getImm();
MCInst Ret;
@@ -1729,7 +1973,7 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
// the PATCHABLE_FUNCTION_ENTER case, followed by the lowering of the actual
// tail call much like how we have it in PATCHABLE_RET.
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
- OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitCodeAlignment(2, &getSubtargetInfo());
OutStreamer->emitLabel(CurSled);
auto Target = OutContext.createTempSymbol();
@@ -2563,6 +2807,9 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
return;
+ case X86::ASAN_CHECK_MEMACCESS:
+ return LowerASAN_CHECK_MEMACCESS(*MI);
+
case X86::MORESTACK_RET_RESTORE_R10:
// Return, then restore R10.
EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index 46d2e2a66fd6..99d1a97380dd 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -102,8 +102,8 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// True if this function uses the red zone.
bool UsesRedZone = false;
- /// True if this function has WIN_ALLOCA instructions.
- bool HasWinAlloca = false;
+ /// True if this function has DYN_ALLOCA instructions.
+ bool HasDynAlloca = false;
/// True if this function has any preallocated calls.
bool HasPreallocatedCall = false;
@@ -113,6 +113,10 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// other tools to detect the extended record.
bool HasSwiftAsyncContext = false;
+ /// True if this function has a virtual tile register. This is used to
+ /// determine whether we should insert a tilerelease in frame lowering.
+ bool HasVirtualTileReg = false;
+
Optional<int> SwiftAsyncContextFrameIdx;
ValueMap<const Value *, size_t> PreallocatedIds;
@@ -198,8 +202,8 @@ public:
bool getUsesRedZone() const { return UsesRedZone; }
void setUsesRedZone(bool V) { UsesRedZone = V; }
- bool hasWinAlloca() const { return HasWinAlloca; }
- void setHasWinAlloca(bool v) { HasWinAlloca = v; }
+ bool hasDynAlloca() const { return HasDynAlloca; }
+ void setHasDynAlloca(bool v) { HasDynAlloca = v; }
bool hasPreallocatedCall() const { return HasPreallocatedCall; }
void setHasPreallocatedCall(bool v) { HasPreallocatedCall = v; }
@@ -207,6 +211,9 @@ public:
bool hasSwiftAsyncContext() const { return HasSwiftAsyncContext; }
void setHasSwiftAsyncContext(bool v) { HasSwiftAsyncContext = v; }
+ bool hasVirtualTileReg() const { return HasVirtualTileReg; }
+ void setHasVirtualTileReg(bool v) { HasVirtualTileReg = v; }
+
Optional<int> getSwiftAsyncContextFrameIdx() const {
return SwiftAsyncContextFrameIdx;
}
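The new HasVirtualTileReg flag (set by X86PreTileConfig further down) is meant for frame lowering, so that AMX tile state can be released when leaving a function that configured tiles. The frame-lowering side is not shown in this excerpt; a hedged sketch of how an epilogue hook could consult the flag:

// Sketch only; in-tree such logic would belong in X86FrameLowering.cpp.
static void emitTileReleaseIfNeeded(MachineFunction &MF, MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MBBI,
                                    const TargetInstrInfo &TII) {
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  if (X86FI->hasVirtualTileReg())
    BuildMI(MBB, MBBI, DebugLoc(), TII.get(X86::TILERELEASE));
}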
diff --git a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
index ab4d2bd05772..6967a96ce83b 100644
--- a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -503,9 +503,7 @@ bool X86OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) {
MachineBasicBlock *MBB = (*LEAs.begin()->second.begin())->getParent();
// Process all instructions in basic block.
- for (auto I = MBB->begin(), E = MBB->end(); I != E;) {
- MachineInstr &MI = *I++;
-
+ for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) {
// Instruction must be load or store.
if (!MI.mayLoadOrStore())
continue;
@@ -655,9 +653,8 @@ bool X86OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
// isReplaceable function.
Register FirstVReg = First.getOperand(0).getReg();
Register LastVReg = Last.getOperand(0).getReg();
- for (auto UI = MRI->use_begin(LastVReg), UE = MRI->use_end();
- UI != UE;) {
- MachineOperand &MO = *UI++;
+ for (MachineOperand &MO :
+ llvm::make_early_inc_range(MRI->use_operands(LastVReg))) {
MachineInstr &MI = *MO.getParent();
if (MI.isDebugValue()) {
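Several loops in this patch are converted to llvm::make_early_inc_range, which advances the underlying iterator before yielding each element, so the current element can be erased or re-linked without invalidating the traversal. A minimal usage sketch (the isDead predicate is hypothetical):

#include "llvm/ADT/STLExtras.h"        // llvm::make_early_inc_range
// Delete instructions while walking a block, without manual iterator bookkeeping.
for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) {
  if (isDead(MI))                      // hypothetical predicate
    MI.eraseFromParent();              // safe: the range already moved past MI
}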
diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp b/llvm/lib/Target/X86/X86PadShortFunction.cpp
index e10dab72078d..47ae517ae76d 100644
--- a/llvm/lib/Target/X86/X86PadShortFunction.cpp
+++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -174,12 +174,9 @@ void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) {
}
// Follow branches in BB and look for returns
- for (MachineBasicBlock::succ_iterator I = MBB->succ_begin();
- I != MBB->succ_end(); ++I) {
- if (*I == MBB)
- continue;
- findReturns(*I, Cycles);
- }
+ for (MachineBasicBlock *Succ : MBB->successors())
+ if (Succ != MBB)
+ findReturns(Succ, Cycles);
}
/// cyclesUntilReturn - return true if the MBB has a return instruction,
diff --git a/llvm/lib/Target/X86/X86PfmCounters.td b/llvm/lib/Target/X86/X86PfmCounters.td
index 3844667ccc74..25fcba1a7581 100644
--- a/llvm/lib/Target/X86/X86PfmCounters.td
+++ b/llvm/lib/Target/X86/X86PfmCounters.td
@@ -142,8 +142,24 @@ def SkylakeServerPfmCounters : ProcPfmCounters {
def : PfmCountersBinding<"skylake-avx512", SkylakeServerPfmCounters>;
def : PfmCountersBinding<"cascadelake", SkylakeServerPfmCounters>;
def : PfmCountersBinding<"cannonlake", SkylakeServerPfmCounters>;
-def : PfmCountersBinding<"icelake-client", SkylakeServerPfmCounters>;
-def : PfmCountersBinding<"icelake-server", SkylakeServerPfmCounters>;
+
+def IceLakePfmCounters : ProcPfmCounters {
+ let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+ let UopsCounter = UopsIssuedPfmCounter;
+ let IssueCounters = [
+ PfmIssueCounter<"ICXPort0", "uops_dispatched_port:port_0">,
+ PfmIssueCounter<"ICXPort1", "uops_dispatched_port:port_1">,
+ PfmIssueCounter<"ICXPort23", "uops_dispatched_port:port_2_3">,
+ PfmIssueCounter<"ICXPort49", "uops_dispatched_port:port_4_9">,
+ PfmIssueCounter<"ICXPort5", "uops_dispatched_port:port_5">,
+ PfmIssueCounter<"ICXPort6", "uops_dispatched_port:port_6">,
+ PfmIssueCounter<"ICXPort78", "uops_dispatched_port:port_7_8">
+ ];
+}
+def : PfmCountersBinding<"icelake-client", IceLakePfmCounters>;
+def : PfmCountersBinding<"icelake-server", IceLakePfmCounters>;
+def : PfmCountersBinding<"rocketlake", IceLakePfmCounters>;
+def : PfmCountersBinding<"tigerlake", IceLakePfmCounters>;
// AMD X86 Counters.
// Set basic counters for AMD cpus that we know libpfm4 supports.
diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
index b85a0b61d6f6..5d21f8666ec6 100644
--- a/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -25,6 +25,7 @@
#include "X86.h"
#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -235,6 +236,7 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) {
const TargetInstrInfo *TII = ST.getInstrInfo();
const TargetRegisterInfo *TRI = ST.getRegisterInfo();
const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
BitVector AMXRegs(TRI->getNumRegs());
for (unsigned I = 0; I < RC->getNumRegs(); I++)
@@ -294,6 +296,7 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) {
// There's no AMX instruction if we didn't find a tile config live in point.
if (CfgNeedInsert.empty())
return false;
+ X86FI->setHasVirtualTileReg(true);
// Avoid to insert ldtilecfg before any shape defs.
SmallVector<MachineBasicBlock *, 8> WorkList;
@@ -323,7 +326,7 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) {
ST.getTileConfigSize(), ST.getTileConfigAlignment(), false);
// Try to insert for the tile config live in points.
- for (auto I : CfgNeedInsert) {
+ for (const auto &I : CfgNeedInsert) {
SmallSet<MIRef, 8> InsertPoints;
SmallVector<MIRef, 8> WorkList({I});
while (!WorkList.empty()) {
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index c4748423baea..130cb61cdde2 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -816,10 +816,10 @@ unsigned X86RegisterInfo::findDeadCallerSavedReg(
return 0;
case TargetOpcode::PATCHABLE_RET:
case X86::RET:
- case X86::RETL:
- case X86::RETQ:
- case X86::RETIL:
- case X86::RETIQ:
+ case X86::RET32:
+ case X86::RET64:
+ case X86::RETI32:
+ case X86::RETI64:
case X86::TCRETURNdi:
case X86::TCRETURNri:
case X86::TCRETURNmi:
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index 1ab9d2588a90..d835f452b67e 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -373,12 +373,6 @@ def CR15 : X86Reg<"cr15", 15>;
def EIZ : X86Reg<"eiz", 4>;
def RIZ : X86Reg<"riz", 4>;
-// Bound registers, used in MPX instructions
-def BND0 : X86Reg<"bnd0", 0>;
-def BND1 : X86Reg<"bnd1", 1>;
-def BND2 : X86Reg<"bnd2", 2>;
-def BND3 : X86Reg<"bnd3", 3>;
-
// CET registers - Shadow Stack Pointer
def SSP : X86Reg<"ssp", 0>;
@@ -436,6 +430,12 @@ def GR64 : RegisterClass<"X86", [i64], 64,
(add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
RBX, R14, R15, R12, R13, RBP, RSP, RIP)>;
+// GR64NoR8 - 64-bit GPRs without R8 and RIP. Could be used when emitting code for
+// intrinsics, which use implicit input registers.
+def GR64NoR8 : RegisterClass<"X86", [i64], 64,
+ (add RAX, RCX, RDX, RSI, RDI, R9, R10, R11,
+ RBX, R14, R15, R12, R13, RBP, RSP)>;
+
// Segment registers for use by MOV instructions (and others) that have a
// segment register as one operand. Always contain a 16-bit segment
// descriptor.
@@ -567,9 +567,9 @@ def RSTi : RegisterOperand<RST, "printSTiRegOperand">;
// Generic vector registers: VR64 and VR128.
// Ensure that float types are declared first - only float is legal on SSE1.
def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>;
-def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
+def VR128 : RegisterClass<"X86", [v4f32, v2f64, v8f16, v16i8, v8i16, v4i32, v2i64, f128],
128, (add FR32)>;
-def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
+def VR256 : RegisterClass<"X86", [v8f32, v4f64, v16f16, v32i8, v16i16, v8i32, v4i64],
256, (sequence "YMM%u", 0, 15)>;
// Status flags registers.
@@ -587,7 +587,7 @@ def DFCCR : RegisterClass<"X86", [i32], 32, (add DF)> {
}
// AVX-512 vector/mask registers.
-def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64],
+def VR512 : RegisterClass<"X86", [v16f32, v8f64, v32f16, v64i8, v32i16, v16i32, v8i64],
512, (sequence "ZMM%u", 0, 31)>;
// Represents the lower 16 registers that have VEX/legacy encodable subregs.
@@ -599,10 +599,12 @@ def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
+def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)>;
+
// Extended VR128 and VR256 for AVX-512 instructions
-def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128],
+def VR128X : RegisterClass<"X86", [v4f32, v2f64, v8f16, v16i8, v8i16, v4i32, v2i64, f128],
128, (add FR32X)>;
-def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
+def VR256X : RegisterClass<"X86", [v8f32, v4f64, v16f16, v32i8, v16i16, v8i32, v4i64],
256, (sequence "YMM%u", 0, 31)>;
// Mask registers
@@ -632,9 +634,6 @@ def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;}
def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;}
def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
-// Bound registers
-def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
-
// Tiles
let CopyCost = -1 in // Don't allow copying of tile registers
def TILE : RegisterClass<"X86", [x86amx], 8192,
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index d2ced1c67407..2827981b7fb0 100644
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -112,6 +112,25 @@ multiclass BWWriteResPair<X86FoldableSchedWrite SchedRW,
// 2/3/7 cycle to recompute the address.
def : WriteRes<WriteRMW, [BWPort237,BWPort4]>;
+// Loads, stores, and moves, not folded with other operations.
+// Store_addr on 237.
+// Store_data on 4.
+defm : X86WriteRes<WriteStore, [BWPort237, BWPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteStoreNT, [BWPort237, BWPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteLoad, [BWPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteMove, [BWPort0156], 1, [1], 1>;
+
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+def : WriteRes<WriteZero, []>;
+
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
+defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
+
// Arithmetic.
defm : BWWriteResPair<WriteALU, [BWPort0156], 1>; // Simple integer ALU op.
defm : BWWriteResPair<WriteADC, [BWPort06], 1>; // Integer ALU + flags op.
@@ -123,41 +142,41 @@ defm : X86WriteRes<WriteIMul16Imm, [BWPort1,BWPort0156], 4, [1,1], 2>;
defm : X86WriteRes<WriteIMul16ImmLd, [BWPort1,BWPort0156,BWPort23], 8, [1,1,1], 3>;
defm : BWWriteResPair<WriteIMul16Reg, [BWPort1], 3>;
defm : BWWriteResPair<WriteIMul32, [BWPort1,BWPort06,BWPort0156], 4, [1,1,1], 3>;
+defm : BWWriteResPair<WriteMULX32, [BWPort1,BWPort06,BWPort0156], 3, [1,1,1], 3>;
defm : BWWriteResPair<WriteIMul32Imm, [BWPort1], 3>;
defm : BWWriteResPair<WriteIMul32Reg, [BWPort1], 3>;
defm : BWWriteResPair<WriteIMul64, [BWPort1,BWPort5], 4, [1,1], 2>;
+defm : BWWriteResPair<WriteMULX64, [BWPort1,BWPort5], 3, [1,1], 2>;
defm : BWWriteResPair<WriteIMul64Imm, [BWPort1], 3>;
defm : BWWriteResPair<WriteIMul64Reg, [BWPort1], 3>;
-def : WriteRes<WriteIMulH, []> { let Latency = 3; }
-
-// TODO: Why isn't the BWDivider used consistently?
-defm : X86WriteRes<WriteDiv8, [BWPort0, BWDivider], 25, [1, 10], 1>;
-defm : X86WriteRes<WriteDiv16, [BWPort0,BWPort1,BWPort5,BWPort6,BWPort01,BWPort0156], 80, [7,7,3,3,1,11], 32>;
-defm : X86WriteRes<WriteDiv32, [BWPort0,BWPort1,BWPort5,BWPort6,BWPort01,BWPort0156], 80, [7,7,3,3,1,11], 32>;
-defm : X86WriteRes<WriteDiv64, [BWPort0,BWPort1,BWPort5,BWPort6,BWPort01,BWPort0156], 80, [7,7,3,3,1,11], 32>;
-defm : X86WriteRes<WriteDiv8Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 34, [2,2,2,1,1], 8>;
-defm : X86WriteRes<WriteDiv16Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 34, [2,2,2,1,1], 8>;
-defm : X86WriteRes<WriteDiv32Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 34, [2,2,2,1,1], 8>;
-defm : X86WriteRes<WriteDiv64Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 34, [2,2,2,1,1], 8>;
-
-defm : X86WriteRes<WriteIDiv8, [BWPort0, BWDivider], 25, [1,10], 1>;
-defm : X86WriteRes<WriteIDiv16, [BWPort0, BWDivider], 25, [1,10], 1>;
-defm : X86WriteRes<WriteIDiv32, [BWPort0, BWDivider], 25, [1,10], 1>;
-defm : X86WriteRes<WriteIDiv64, [BWPort0, BWDivider], 25, [1,10], 1>;
-defm : X86WriteRes<WriteIDiv8Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 35, [2,2,2,1,1], 8>;
-defm : X86WriteRes<WriteIDiv16Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 35, [2,2,2,1,1], 8>;
-defm : X86WriteRes<WriteIDiv32Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 35, [2,2,2,1,1], 8>;
-defm : X86WriteRes<WriteIDiv64Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 35, [2,2,2,1,1], 8>;
+def BWWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 4; }
+def : WriteRes<WriteIMulHLd, []> {
+ let Latency = !add(BWWriteIMulH.Latency, BroadwellModel.LoadLatency);
+}
-defm : X86WriteRes<WriteCMPXCHG,[BWPort06, BWPort0156], 5, [2, 3], 5>;
-defm : X86WriteRes<WriteCMPXCHGRMW,[BWPort23, BWPort06, BWPort0156, BWPort237, BWPort4], 8, [1, 2, 1, 1, 1], 6>;
defm : X86WriteRes<WriteBSWAP32, [BWPort15], 1, [1], 1>;
defm : X86WriteRes<WriteBSWAP64, [BWPort06, BWPort15], 2, [1, 1], 2>;
+defm : X86WriteRes<WriteCMPXCHG,[BWPort06, BWPort0156], 5, [2, 3], 5>;
+defm : X86WriteRes<WriteCMPXCHGRMW,[BWPort23, BWPort06, BWPort0156, BWPort237, BWPort4], 8, [1, 2, 1, 1, 1], 6>;
defm : X86WriteRes<WriteXCHG, [BWPort0156], 2, [3], 3>;
-defm : BWWriteResPair<WriteCRC32, [BWPort1], 3>;
+// Integer shifts and rotates.
+defm : BWWriteResPair<WriteShift, [BWPort06], 1>;
+defm : BWWriteResPair<WriteShiftCL, [BWPort06,BWPort0156], 3, [2,1], 3>;
+defm : BWWriteResPair<WriteRotate, [BWPort06], 1, [1], 1>;
+defm : BWWriteResPair<WriteRotateCL, [BWPort06,BWPort0156], 3, [2,1], 3>;
-def : WriteRes<WriteLEA, [BWPort15]>; // LEA instructions can't fold loads.
+// SHLD/SHRD.
+defm : X86WriteRes<WriteSHDrri, [BWPort1], 3, [1], 1>;
+defm : X86WriteRes<WriteSHDrrcl,[BWPort1,BWPort06,BWPort0156], 6, [1, 1, 2], 4>;
+defm : X86WriteRes<WriteSHDmri, [BWPort1,BWPort23,BWPort237,BWPort0156], 9, [1, 1, 1, 1], 4>;
+defm : X86WriteRes<WriteSHDmrcl,[BWPort1,BWPort23,BWPort237,BWPort06,BWPort0156], 11, [1, 1, 1, 1, 2], 6>;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm : BWWriteResPair<WriteJump, [BWPort06], 1>;
+
+defm : BWWriteResPair<WriteCRC32, [BWPort1], 3>;
defm : BWWriteResPair<WriteCMOV, [BWPort06], 1>; // Conditional move.
defm : X86WriteRes<WriteFCMOV, [BWPort1], 3, [1], 1>; // x87 conditional move.
@@ -176,6 +195,11 @@ defm : X86WriteRes<WriteBitTestSet, [BWPort06], 1, [1], 1>; // Bit Test + S
defm : X86WriteRes<WriteBitTestSetImmLd, [BWPort06,BWPort23], 5, [1,1], 3>;
defm : X86WriteRes<WriteBitTestSetRegLd, [BWPort0156,BWPort23], 5, [1,1], 2>;
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [BWPort15]>;
+
// Bit counts.
defm : BWWriteResPair<WriteBSF, [BWPort1], 3>;
defm : BWWriteResPair<WriteBSR, [BWPort1], 3>;
@@ -183,43 +207,29 @@ defm : BWWriteResPair<WriteLZCNT, [BWPort1], 3>;
defm : BWWriteResPair<WriteTZCNT, [BWPort1], 3>;
defm : BWWriteResPair<WritePOPCNT, [BWPort1], 3>;
-// Integer shifts and rotates.
-defm : BWWriteResPair<WriteShift, [BWPort06], 1>;
-defm : BWWriteResPair<WriteShiftCL, [BWPort06,BWPort0156], 3, [2,1], 3>;
-defm : BWWriteResPair<WriteRotate, [BWPort06], 1, [1], 1>;
-defm : BWWriteResPair<WriteRotateCL, [BWPort06,BWPort0156], 3, [2,1], 3>;
-
-// SHLD/SHRD.
-defm : X86WriteRes<WriteSHDrri, [BWPort1], 3, [1], 1>;
-defm : X86WriteRes<WriteSHDrrcl,[BWPort1,BWPort06,BWPort0156], 6, [1, 1, 2], 4>;
-defm : X86WriteRes<WriteSHDmri, [BWPort1,BWPort23,BWPort237,BWPort0156], 9, [1, 1, 1, 1], 4>;
-defm : X86WriteRes<WriteSHDmrcl,[BWPort1,BWPort23,BWPort237,BWPort06,BWPort0156], 11, [1, 1, 1, 1, 2], 6>;
-
// BMI1 BEXTR/BLS, BMI2 BZHI
defm : BWWriteResPair<WriteBEXTR, [BWPort06,BWPort15], 2, [1,1], 2>;
defm : BWWriteResPair<WriteBLS, [BWPort15], 1>;
defm : BWWriteResPair<WriteBZHI, [BWPort15], 1>;
-// Loads, stores, and moves, not folded with other operations.
-defm : X86WriteRes<WriteLoad, [BWPort23], 5, [1], 1>;
-defm : X86WriteRes<WriteStore, [BWPort237, BWPort4], 1, [1,1], 1>;
-defm : X86WriteRes<WriteStoreNT, [BWPort237, BWPort4], 1, [1,1], 2>;
-defm : X86WriteRes<WriteMove, [BWPort0156], 1, [1], 1>;
-
-// Model the effect of clobbering the read-write mask operand of the GATHER operation.
-// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
-defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
-
-// Idioms that clear a register, like xorps %xmm0, %xmm0.
-// These can often bypass execution ports completely.
-def : WriteRes<WriteZero, []>;
-
-// Treat misc copies as a move.
-def : InstRW<[WriteMove], (instrs COPY)>;
-
-// Branches don't produce values, so they have no latency, but they still
-// consume resources. Indirect branches can fold loads.
-defm : BWWriteResPair<WriteJump, [BWPort06], 1>;
+// TODO: Why isn't the BWDivider used consistently?
+defm : X86WriteRes<WriteDiv8, [BWPort0, BWDivider], 25, [1, 10], 1>;
+defm : X86WriteRes<WriteDiv16, [BWPort0,BWPort1,BWPort5,BWPort6,BWPort01,BWPort0156], 80, [7,7,3,3,1,11], 32>;
+defm : X86WriteRes<WriteDiv32, [BWPort0,BWPort1,BWPort5,BWPort6,BWPort01,BWPort0156], 80, [7,7,3,3,1,11], 32>;
+defm : X86WriteRes<WriteDiv64, [BWPort0,BWPort1,BWPort5,BWPort6,BWPort01,BWPort0156], 80, [7,7,3,3,1,11], 32>;
+defm : X86WriteRes<WriteDiv8Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 34, [2,2,2,1,1], 8>;
+defm : X86WriteRes<WriteDiv16Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 34, [2,2,2,1,1], 8>;
+defm : X86WriteRes<WriteDiv32Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 34, [2,2,2,1,1], 8>;
+defm : X86WriteRes<WriteDiv64Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 34, [2,2,2,1,1], 8>;
+
+defm : X86WriteRes<WriteIDiv8, [BWPort0, BWDivider], 25, [1,10], 1>;
+defm : X86WriteRes<WriteIDiv16, [BWPort0, BWDivider], 25, [1,10], 1>;
+defm : X86WriteRes<WriteIDiv32, [BWPort0, BWDivider], 25, [1,10], 1>;
+defm : X86WriteRes<WriteIDiv64, [BWPort0, BWDivider], 25, [1,10], 1>;
+defm : X86WriteRes<WriteIDiv8Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 35, [2,2,2,1,1], 8>;
+defm : X86WriteRes<WriteIDiv16Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 35, [2,2,2,1,1], 8>;
+defm : X86WriteRes<WriteIDiv32Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 35, [2,2,2,1,1], 8>;
+defm : X86WriteRes<WriteIDiv64Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 35, [2,2,2,1,1], 8>;
// Floating point. This covers both scalar and vector operations.
defm : X86WriteRes<WriteFLD0, [BWPort01], 1, [1], 1>;
@@ -245,6 +255,7 @@ defm : X86WriteRes<WriteFMaskedStore64Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5
defm : X86WriteRes<WriteFMove, [BWPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [BWPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [BWPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [BWPort01,BWPort15,BWPort015,BWPort0156], 31, [8,1,21,1], 31>;
defm : BWWriteResPair<WriteFAdd, [BWPort1], 3, [1], 1, 5>; // Floating point add/sub.
defm : BWWriteResPair<WriteFAddX, [BWPort1], 3, [1], 1, 5>; // Floating point add/sub (XMM).
@@ -285,6 +296,16 @@ defm : BWWriteResPair<WriteFDiv64X, [BWPort0,BWFPDivider], 14, [1,8], 1, 5>; //
defm : BWWriteResPair<WriteFDiv64Y, [BWPort0,BWPort015,BWFPDivider], 23, [2,1,16], 3, 6>; // Floating point division (YMM).
defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+defm : BWWriteResPair<WriteFRcp, [BWPort0], 5, [1], 1, 5>; // Floating point reciprocal estimate.
+defm : BWWriteResPair<WriteFRcpX, [BWPort0], 5, [1], 1, 5>; // Floating point reciprocal estimate (XMM).
+defm : BWWriteResPair<WriteFRcpY, [BWPort0,BWPort015], 11, [2,1], 3, 6>; // Floating point reciprocal estimate (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+
+defm : BWWriteResPair<WriteFRsqrt, [BWPort0], 5, [1], 1, 5>; // Floating point reciprocal square root estimate.
+defm : BWWriteResPair<WriteFRsqrtX,[BWPort0], 5, [1], 1, 5>; // Floating point reciprocal square root estimate (XMM).
+defm : BWWriteResPair<WriteFRsqrtY,[BWPort0,BWPort015], 11, [2,1], 3, 6>; // Floating point reciprocal square root estimate (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+
defm : X86WriteRes<WriteFSqrt, [BWPort0,BWFPDivider], 11, [1,4], 1>; // Floating point square root.
defm : X86WriteRes<WriteFSqrtLd, [BWPort0,BWPort23,BWFPDivider], 16, [1,1,7], 2>;
defm : BWWriteResPair<WriteFSqrtX, [BWPort0,BWFPDivider], 11, [1,7], 1, 5>; // Floating point square root (XMM).
@@ -297,16 +318,6 @@ defm : BWWriteResPair<WriteFSqrt64Y, [BWPort0,BWPort015,BWFPDivider], 29, [2,1,2
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
defm : BWWriteResPair<WriteFSqrt80, [BWPort0,BWFPDivider], 23, [1,9]>; // Floating point long double square root.
-defm : BWWriteResPair<WriteFRcp, [BWPort0], 5, [1], 1, 5>; // Floating point reciprocal estimate.
-defm : BWWriteResPair<WriteFRcpX, [BWPort0], 5, [1], 1, 5>; // Floating point reciprocal estimate (XMM).
-defm : BWWriteResPair<WriteFRcpY, [BWPort0,BWPort015], 11, [2,1], 3, 6>; // Floating point reciprocal estimate (YMM/ZMM).
-defm : X86WriteResPairUnsupported<WriteFRcpZ>;
-
-defm : BWWriteResPair<WriteFRsqrt, [BWPort0], 5, [1], 1, 5>; // Floating point reciprocal square root estimate.
-defm : BWWriteResPair<WriteFRsqrtX,[BWPort0], 5, [1], 1, 5>; // Floating point reciprocal square root estimate (XMM).
-defm : BWWriteResPair<WriteFRsqrtY,[BWPort0,BWPort015], 11, [2,1], 3, 6>; // Floating point reciprocal square root estimate (YMM/ZMM).
-defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
-
defm : BWWriteResPair<WriteFMA, [BWPort01], 5, [1], 1, 5>; // Fused Multiply Add.
defm : BWWriteResPair<WriteFMAX, [BWPort01], 5, [1], 1, 5>; // Fused Multiply Add (XMM).
defm : BWWriteResPair<WriteFMAY, [BWPort01], 5, [1], 1, 6>; // Fused Multiply Add (YMM/ZMM).
@@ -336,6 +347,8 @@ defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
defm : BWWriteResPair<WriteFBlend, [BWPort015], 1, [1], 1, 5>; // Floating point vector blends.
defm : BWWriteResPair<WriteFBlendY, [BWPort015], 1, [1], 1, 6>; // Floating point vector blends.
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : BWWriteResPair<WriteFShuffle256, [BWPort5], 3, [1], 1, 6>; // Fp 256-bit width vector shuffles.
+defm : BWWriteResPair<WriteFVarShuffle256, [BWPort5], 3, [1], 1, 6>; // Fp 256-bit width vector variable shuffles.
defm : BWWriteResPair<WriteFVarBlend, [BWPort5], 2, [2], 2, 5>; // Fp vector variable blends.
defm : BWWriteResPair<WriteFVarBlendY, [BWPort5], 2, [2], 2, 6>; // Fp vector variable blends.
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
@@ -343,6 +356,48 @@ defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
// FMA Scheduling helper class.
// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
+// Conversion between integer and float.
+defm : BWWriteResPair<WriteCvtSS2I, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPS2I, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPS2IY, [BWPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : BWWriteResPair<WriteCvtSD2I, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPD2I, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPD2IY, [BWPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+defm : BWWriteResPair<WriteCvtI2SS, [BWPort1], 4>;
+defm : BWWriteResPair<WriteCvtI2PS, [BWPort1], 4>;
+defm : BWWriteResPair<WriteCvtI2PSY, [BWPort1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : BWWriteResPair<WriteCvtI2SD, [BWPort1], 4>;
+defm : BWWriteResPair<WriteCvtI2PD, [BWPort1], 4>;
+defm : BWWriteResPair<WriteCvtI2PDY, [BWPort1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : BWWriteResPair<WriteCvtSS2SD, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPS2PD, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPS2PDY, [BWPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+defm : BWWriteResPair<WriteCvtSD2SS, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPD2PS, [BWPort1], 3>;
+defm : BWWriteResPair<WriteCvtPD2PSY, [BWPort1], 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
+
+defm : X86WriteRes<WriteCvtPH2PS, [BWPort0,BWPort5], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSY, [BWPort0,BWPort5], 2, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZ>;
+defm : X86WriteRes<WriteCvtPH2PSLd, [BWPort0,BWPort23], 6, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSYLd, [BWPort0,BWPort23], 6, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>;
+
+defm : X86WriteRes<WriteCvtPS2PH, [BWPort1,BWPort5], 4, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHY, [BWPort1,BWPort5], 6, [1,1], 2>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+defm : X86WriteRes<WriteCvtPS2PHSt, [BWPort1,BWPort4,BWPort237], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [BWPort1,BWPort4,BWPort237], 7, [1,1,1], 3>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
+
// Vector integer operations.
defm : X86WriteRes<WriteVecLoad, [BWPort23], 5, [1], 1>;
defm : X86WriteRes<WriteVecLoadX, [BWPort23], 5, [1], 1>;
@@ -366,12 +421,6 @@ defm : X86WriteRes<WriteVecMoveY, [BWPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveToGpr, [BWPort0], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [BWPort5], 1, [1], 1>;
-defm : X86WriteRes<WriteEMMS, [BWPort01,BWPort15,BWPort015,BWPort0156], 31, [8,1,21,1], 31>;
-
-defm : BWWriteResPair<WriteVecALU, [BWPort15], 1, [1], 1, 5>; // Vector integer ALU op, no logicals.
-defm : BWWriteResPair<WriteVecALUX, [BWPort15], 1, [1], 1, 5>; // Vector integer ALU op, no logicals.
-defm : BWWriteResPair<WriteVecALUY, [BWPort15], 1, [1], 1, 6>; // Vector integer ALU op, no logicals (YMM/ZMM).
-defm : X86WriteResPairUnsupported<WriteVecALUZ>;
defm : BWWriteResPair<WriteVecLogic, [BWPort015], 1, [1], 1, 5>; // Vector integer and/or/xor.
defm : BWWriteResPair<WriteVecLogicX,[BWPort015], 1, [1], 1, 5>; // Vector integer and/or/xor.
defm : BWWriteResPair<WriteVecLogicY,[BWPort015], 1, [1], 1, 6>; // Vector integer and/or/xor (YMM/ZMM).
@@ -379,6 +428,10 @@ defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
defm : BWWriteResPair<WriteVecTest, [BWPort0,BWPort5], 2, [1,1], 2, 5>; // Vector integer TEST instructions.
defm : BWWriteResPair<WriteVecTestY, [BWPort0,BWPort5], 4, [1,1], 2, 6>; // Vector integer TEST instructions (YMM/ZMM).
defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : BWWriteResPair<WriteVecALU, [BWPort15], 1, [1], 1, 5>; // Vector integer ALU op, no logicals.
+defm : BWWriteResPair<WriteVecALUX, [BWPort15], 1, [1], 1, 5>; // Vector integer ALU op, no logicals.
+defm : BWWriteResPair<WriteVecALUY, [BWPort15], 1, [1], 1, 6>; // Vector integer ALU op, no logicals (YMM/ZMM).
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
defm : BWWriteResPair<WriteVecIMul, [BWPort0], 5, [1], 1, 5>; // Vector integer multiply.
defm : BWWriteResPair<WriteVecIMulX, [BWPort0], 5, [1], 1, 5>; // Vector integer multiply.
defm : BWWriteResPair<WriteVecIMulY, [BWPort0], 5, [1], 1, 6>; // Vector integer multiply.
@@ -397,6 +450,9 @@ defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
defm : BWWriteResPair<WriteBlend, [BWPort5], 1, [1], 1, 5>; // Vector blends.
defm : BWWriteResPair<WriteBlendY, [BWPort5], 1, [1], 1, 6>; // Vector blends (YMM/ZMM).
defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : BWWriteResPair<WriteShuffle256, [BWPort5], 3, [1], 1, 6>; // 256-bit width vector shuffles.
+defm : BWWriteResPair<WriteVPMOV256, [BWPort5], 3, [1], 1, 6>; // 256-bit width packed vector width-changing move.
+defm : BWWriteResPair<WriteVarShuffle256, [BWPort5], 3, [1], 1, 6>; // 256-bit width vector variable shuffles.
defm : BWWriteResPair<WriteVarBlend, [BWPort5], 2, [2], 2, 5>; // Vector variable blends.
defm : BWWriteResPair<WriteVarBlendY, [BWPort5], 2, [2], 2, 6>; // Vector variable blends (YMM/ZMM).
defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
@@ -444,49 +500,7 @@ def : WriteRes<WriteVecExtractSt, [BWPort4,BWPort5,BWPort237]> {
let NumMicroOps = 3;
}
-// Conversion between integer and float.
-defm : BWWriteResPair<WriteCvtSS2I, [BWPort1], 3>;
-defm : BWWriteResPair<WriteCvtPS2I, [BWPort1], 3>;
-defm : BWWriteResPair<WriteCvtPS2IY, [BWPort1], 3>;
-defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
-defm : BWWriteResPair<WriteCvtSD2I, [BWPort1], 3>;
-defm : BWWriteResPair<WriteCvtPD2I, [BWPort1], 3>;
-defm : BWWriteResPair<WriteCvtPD2IY, [BWPort1], 3>;
-defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
-
-defm : BWWriteResPair<WriteCvtI2SS, [BWPort1], 4>;
-defm : BWWriteResPair<WriteCvtI2PS, [BWPort1], 4>;
-defm : BWWriteResPair<WriteCvtI2PSY, [BWPort1], 4>;
-defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
-defm : BWWriteResPair<WriteCvtI2SD, [BWPort1], 4>;
-defm : BWWriteResPair<WriteCvtI2PD, [BWPort1], 4>;
-defm : BWWriteResPair<WriteCvtI2PDY, [BWPort1], 4>;
-defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
-
-defm : BWWriteResPair<WriteCvtSS2SD, [BWPort1], 3>;
-defm : BWWriteResPair<WriteCvtPS2PD, [BWPort1], 3>;
-defm : BWWriteResPair<WriteCvtPS2PDY, [BWPort1], 3>;
-defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
-defm : BWWriteResPair<WriteCvtSD2SS, [BWPort1], 3>;
-defm : BWWriteResPair<WriteCvtPD2PS, [BWPort1], 3>;
-defm : BWWriteResPair<WriteCvtPD2PSY, [BWPort1], 3>;
-defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
-
-defm : X86WriteRes<WriteCvtPH2PS, [BWPort0,BWPort5], 2, [1,1], 2>;
-defm : X86WriteRes<WriteCvtPH2PSY, [BWPort0,BWPort5], 2, [1,1], 2>;
-defm : X86WriteResUnsupported<WriteCvtPH2PSZ>;
-defm : X86WriteRes<WriteCvtPH2PSLd, [BWPort0,BWPort23], 6, [1,1], 2>;
-defm : X86WriteRes<WriteCvtPH2PSYLd, [BWPort0,BWPort23], 6, [1,1], 2>;
-defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>;
-
-defm : X86WriteRes<WriteCvtPS2PH, [BWPort1,BWPort5], 4, [1,1], 2>;
-defm : X86WriteRes<WriteCvtPS2PHY, [BWPort1,BWPort5], 6, [1,1], 2>;
-defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
-defm : X86WriteRes<WriteCvtPS2PHSt, [BWPort1,BWPort4,BWPort237], 5, [1,1,1], 3>;
-defm : X86WriteRes<WriteCvtPS2PHYSt, [BWPort1,BWPort4,BWPort237], 7, [1,1,1], 3>;
-defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
-
-// Strings instructions.
+// String instructions.
// Packed Compare Implicit Length Strings, Return Mask
def : WriteRes<WritePCmpIStrM, [BWPort0]> {
@@ -542,7 +556,7 @@ def : WriteRes<WriteVecMOVMSK, [BWPort0]> { let Latency = 3; }
def : WriteRes<WriteVecMOVMSKY, [BWPort0]> { let Latency = 3; }
def : WriteRes<WriteMMXMOVMSK, [BWPort0]> { let Latency = 1; }
-// AES instructions.
+// AES Instructions.
def : WriteRes<WriteAESDecEnc, [BWPort5]> { // Decryption, encryption.
let Latency = 7;
let NumMicroOps = 1;
@@ -578,27 +592,19 @@ def : WriteRes<WriteAESKeyGenLd, [BWPort0, BWPort5, BWPort23, BWPort015]> {
// Carry-less multiplication instructions.
defm : BWWriteResPair<WriteCLMul, [BWPort0], 5>;
+// Load/store MXCSR.
+def : WriteRes<WriteLDMXCSR, [BWPort0,BWPort23,BWPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+def : WriteRes<WriteSTMXCSR, [BWPort4,BWPort5,BWPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
// Catch-all for expensive system instructions.
-def : WriteRes<WriteSystem, [BWPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
-
-// AVX2.
-defm : BWWriteResPair<WriteFShuffle256, [BWPort5], 3, [1], 1, 6>; // Fp 256-bit width vector shuffles.
-defm : BWWriteResPair<WriteFVarShuffle256, [BWPort5], 3, [1], 1, 6>; // Fp 256-bit width vector variable shuffles.
-defm : BWWriteResPair<WriteShuffle256, [BWPort5], 3, [1], 1, 6>; // 256-bit width vector shuffles.
-defm : BWWriteResPair<WriteVPMOV256, [BWPort5], 3, [1], 1, 6>; // 256-bit width packed vector width-changing move.
-defm : BWWriteResPair<WriteVarShuffle256, [BWPort5], 3, [1], 1, 6>; // 256-bit width vector variable shuffles.
+def : WriteRes<WriteSystem, [BWPort0156]> { let Latency = 100; }
// Old microcoded instructions that nobody uses.
-def : WriteRes<WriteMicrocoded, [BWPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;
+def : WriteRes<WriteMicrocoded, [BWPort0156]> { let Latency = 100; }
// Fence instructions.
def : WriteRes<WriteFence, [BWPort23, BWPort4]>;
-// Load/store MXCSR.
-def : WriteRes<WriteLDMXCSR, [BWPort0,BWPort23,BWPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
-def : WriteRes<WriteSTMXCSR, [BWPort4,BWPort5,BWPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
-
// Nop, not very useful except that it provides a model for nops!
def : WriteRes<WriteNop, []>;
@@ -1104,7 +1110,7 @@ def BWWriteResGroup84 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> {
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[BWWriteResGroup84], (instrs LRETQ, RETQ)>;
+def: InstRW<[BWWriteResGroup84], (instrs LRET64, RET64)>;
def BWWriteResGroup87 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06]> {
let Latency = 7;
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index 99fddcd4b2d5..68961d6245ab 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -117,12 +117,16 @@ multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW,
// 2/3/7 cycle to recompute the address.
def : WriteRes<WriteRMW, [HWPort237,HWPort4]>;
+// Loads, stores, and moves, not folded with other operations.
// Store_addr on 237.
// Store_data on 4.
defm : X86WriteRes<WriteStore, [HWPort237, HWPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteStoreNT, [HWPort237, HWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteLoad, [HWPort23], 5, [1], 1>;
defm : X86WriteRes<WriteMove, [HWPort0156], 1, [1], 1>;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
def : WriteRes<WriteZero, []>;
// Model the effect of clobbering the read-write mask operand of the GATHER operation.
@@ -140,12 +144,17 @@ defm : X86WriteRes<WriteIMul16Imm, [HWPort1,HWPort0156], 4, [1,1], 2>;
defm : X86WriteRes<WriteIMul16ImmLd, [HWPort1,HWPort0156,HWPort23], 8, [1,1,1], 3>;
defm : HWWriteResPair<WriteIMul16Reg, [HWPort1], 3>;
defm : HWWriteResPair<WriteIMul32, [HWPort1,HWPort06,HWPort0156], 4, [1,1,1], 3>;
+defm : HWWriteResPair<WriteMULX32, [HWPort1,HWPort06,HWPort0156], 3, [1,1,1], 3>;
defm : HWWriteResPair<WriteIMul32Imm, [HWPort1], 3>;
defm : HWWriteResPair<WriteIMul32Reg, [HWPort1], 3>;
defm : HWWriteResPair<WriteIMul64, [HWPort1,HWPort6], 4, [1,1], 2>;
+defm : HWWriteResPair<WriteMULX64, [HWPort1,HWPort6], 3, [1,1], 2>;
defm : HWWriteResPair<WriteIMul64Imm, [HWPort1], 3>;
defm : HWWriteResPair<WriteIMul64Reg, [HWPort1], 3>;
-def : WriteRes<WriteIMulH, []> { let Latency = 3; }
+def HWWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 4; }
+def : WriteRes<WriteIMulHLd, []> {
+ let Latency = !add(HWWriteIMulH.Latency, HaswellModel.LoadLatency);
+}
defm : X86WriteRes<WriteBSWAP32, [HWPort15], 1, [1], 1>;
defm : X86WriteRes<WriteBSWAP64, [HWPort06, HWPort15], 2, [1,1], 2>;
@@ -165,11 +174,15 @@ defm : X86WriteRes<WriteSHDrrcl,[HWPort1, HWPort06, HWPort0156], 6, [1, 1, 2], 4
defm : X86WriteRes<WriteSHDmri, [HWPort1, HWPort23, HWPort237, HWPort0156], 10, [1, 1, 1, 1], 4>;
defm : X86WriteRes<WriteSHDmrcl,[HWPort1, HWPort23, HWPort237, HWPort06, HWPort0156], 12, [1, 1, 1, 1, 2], 6>;
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
defm : HWWriteResPair<WriteJump, [HWPort06], 1>;
+
defm : HWWriteResPair<WriteCRC32, [HWPort1], 3>;
defm : HWWriteResPair<WriteCMOV, [HWPort06,HWPort0156], 2, [1,1], 2>; // Conditional move.
defm : X86WriteRes<WriteFCMOV, [HWPort1], 3, [1], 1>; // x87 conditional move.
+
def : WriteRes<WriteSETCC, [HWPort06]>; // Setcc.
def : WriteRes<WriteSETCCStore, [HWPort06,HWPort4,HWPort237]> {
let Latency = 2;
@@ -220,7 +233,7 @@ defm : X86WriteRes<WriteIDiv16Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>
defm : X86WriteRes<WriteIDiv32Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>;
defm : X86WriteRes<WriteIDiv64Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>;
-// Scalar and vector floating point.
+// Floating point. This covers both scalar and vector operations.
defm : X86WriteRes<WriteFLD0, [HWPort01], 1, [1], 1>;
defm : X86WriteRes<WriteFLD1, [HWPort01], 1, [2], 2>;
defm : X86WriteRes<WriteFLDC, [HWPort01], 1, [2], 2>;
@@ -305,14 +318,14 @@ defm : HWWriteResPair<WriteFSqrt64Y, [HWPort0,HWPort15,HWFPDivider], 35, [2,1,28
defm : HWWriteResPair<WriteFSqrt64Z, [HWPort0,HWPort15,HWFPDivider], 35, [2,1,28], 3, 7>; // Unsupported = 1
defm : HWWriteResPair<WriteFSqrt80, [HWPort0,HWFPDivider], 23, [1,17]>;
-defm : HWWriteResPair<WriteFMA, [HWPort01], 5, [1], 1, 5>;
-defm : HWWriteResPair<WriteFMAX, [HWPort01], 5, [1], 1, 6>;
-defm : HWWriteResPair<WriteFMAY, [HWPort01], 5, [1], 1, 7>;
-defm : HWWriteResPair<WriteFMAZ, [HWPort01], 5, [1], 1, 7>; // Unsupported = 1
-defm : HWWriteResPair<WriteDPPD, [HWPort0,HWPort1,HWPort5], 9, [1,1,1], 3, 6>;
-defm : HWWriteResPair<WriteDPPS, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 6>;
-defm : HWWriteResPair<WriteDPPSY, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 7>;
-defm : HWWriteResPair<WriteDPPSZ, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteFMA, [HWPort01], 5, [1], 1, 5>;
+defm : HWWriteResPair<WriteFMAX, [HWPort01], 5, [1], 1, 6>;
+defm : HWWriteResPair<WriteFMAY, [HWPort01], 5, [1], 1, 7>;
+defm : HWWriteResPair<WriteFMAZ, [HWPort01], 5, [1], 1, 7>; // Unsupported = 1
+defm : HWWriteResPair<WriteDPPD, [HWPort0,HWPort1,HWPort5], 9, [1,1,1], 3, 6>;
+defm : HWWriteResPair<WriteDPPS, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 6>;
+defm : HWWriteResPair<WriteDPPSY, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 7>;
+defm : HWWriteResPair<WriteDPPSZ, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 7>; // Unsupported = 1
defm : HWWriteResPair<WriteFSign, [HWPort0], 1>;
defm : X86WriteRes<WriteFRnd, [HWPort23], 6, [1], 1>;
defm : X86WriteRes<WriteFRndY, [HWPort23], 6, [1], 1>;
@@ -593,11 +606,28 @@ def : WriteRes<WriteCLMulLd, [HWPort0, HWPort5, HWPort23]> {
def : WriteRes<WriteLDMXCSR, [HWPort0,HWPort23,HWPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
def : WriteRes<WriteSTMXCSR, [HWPort4,HWPort5,HWPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+// Catch-all for expensive system instructions.
def : WriteRes<WriteSystem, [HWPort0156]> { let Latency = 100; }
+
+// Old microcoded instructions that nobody uses.
def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; }
+
+// Fence instructions.
def : WriteRes<WriteFence, [HWPort23, HWPort4]>;
+
+// Nop, not very useful except that it provides a model for nops!
def : WriteRes<WriteNop, []>;
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : HWWriteResPair<WriteFHAdd, [HWPort1, HWPort5], 5, [1,2], 3, 6>;
+defm : HWWriteResPair<WriteFHAddY, [HWPort1, HWPort5], 5, [1,2], 3, 7>;
+defm : HWWriteResPair<WritePHAdd, [HWPort5, HWPort15], 3, [2,1], 3, 5>;
+defm : HWWriteResPair<WritePHAddX, [HWPort5, HWPort15], 3, [2,1], 3, 6>;
+defm : HWWriteResPair<WritePHAddY, [HWPort5, HWPort15], 3, [2,1], 3, 7>;
+
//================ Exceptions ================//
//-- Specific Scheduling Models --//
@@ -680,7 +710,7 @@ def HWWriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> {
let NumMicroOps = 4;
let ResourceCycles = [1, 2, 1];
}
-def : InstRW<[HWWriteRETI], (instregex "RETI(L|Q|W)", "LRETI(L|Q|W)")>;
+def : InstRW<[HWWriteRETI], (instregex "RETI(16|32|64)", "LRETI(16|32|64)")>;
// BOUND.
// r,m.
@@ -821,16 +851,6 @@ def HWWriteFXTRACT : SchedWriteRes<[]> {
}
def : InstRW<[HWWriteFXTRACT], (instrs FXTRACT)>;
-////////////////////////////////////////////////////////////////////////////////
-// Horizontal add/sub instructions.
-////////////////////////////////////////////////////////////////////////////////
-
-defm : HWWriteResPair<WriteFHAdd, [HWPort1, HWPort5], 5, [1,2], 3, 6>;
-defm : HWWriteResPair<WriteFHAddY, [HWPort1, HWPort5], 5, [1,2], 3, 7>;
-defm : HWWriteResPair<WritePHAdd, [HWPort5, HWPort15], 3, [2,1], 3, 5>;
-defm : HWWriteResPair<WritePHAddX, [HWPort5, HWPort15], 3, [2,1], 3, 6>;
-defm : HWWriteResPair<WritePHAddY, [HWPort5, HWPort15], 3, [2,1], 3, 7>;
-
//=== Floating Point XMM and YMM Instructions ===//
// Remaining instrs.
@@ -1168,7 +1188,7 @@ def HWWriteResGroup41 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> {
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[HWWriteResGroup41], (instrs LRETQ, RETL, RETQ)>;
+def: InstRW<[HWWriteResGroup41], (instrs LRET64, RET32, RET64)>;
def HWWriteResGroup44 : SchedWriteRes<[HWPort4,HWPort6,HWPort237,HWPort0156]> {
let Latency = 3;
diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td
new file mode 100644
index 000000000000..889b9b7fa666
--- /dev/null
+++ b/llvm/lib/Target/X86/X86SchedIceLake.td
@@ -0,0 +1,2636 @@
+//=- X86SchedIceLake.td - X86 Ice Lake Scheduling ------------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Ice Lake to support
+// instruction scheduling and other instruction cost heuristics.
+//
+// TODO: This is mainly a copy of X86SchedSkylakeServer.td, but it allows us to
+// iteratively improve the scheduling model toward better modelling of the
+// Ice Lake (Sunny/Cypress Cove) microarchitecture.
+//
+//===----------------------------------------------------------------------===//
+
+def IceLakeModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and Ice Lake can
+ // decode 6 instructions per cycle.
+ let IssueWidth = 6;
+ let MicroOpBufferSize = 224; // Based on the reorder buffer.
+ let LoadLatency = 5;
+ let MispredictPenalty = 14;
+
+ // Based on the LSD (loop-stream detector) queue size and benchmarking data.
+ let LoopMicroOpBufferSize = 50;
+
+ // This flag is set to allow the scheduler to assign a default model to
+ // unrecognized opcodes.
+ let CompleteModel = 0;
+}
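The fields of this SchedMachineModel surface at runtime through MCSchedModel. Assuming the icelake CPUs are rebound to IceLakeModel in X86.td as part of this change, a client could read the values back roughly as follows (TM and TT as in emitAsanMemaccessSymbols earlier in this diff):

// Hedged sketch: query the generated scheduling model for an Ice Lake CPU.
std::unique_ptr<MCSubtargetInfo> STI(
    TM.getTarget().createMCSubtargetInfo(TT.str(), "icelake-server", ""));
const MCSchedModel &SM = STI->getSchedModel();
// Expected to mirror the definition above:
//   SM.IssueWidth == 6, SM.LoadLatency == 5, SM.MicroOpBufferSize == 224.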
+
+let SchedModel = IceLakeModel in {
+
+// Ice Lake can issue micro-ops to 8 different ports in one cycle.
+
+// Ports 0, 1, 5, and 6 handle all computation.
+// Ports 4 and 9 get the data half of stores. Store data can be available later
+// than the store address, but since we don't model the latency of stores, we
+// can ignore that.
+// Ports 2 and 3 are identical. They handle loads and address calculations.
+// Ports 7 and 8 are identical. They handle store address calculations.
+def ICXPort0 : ProcResource<1>;
+def ICXPort1 : ProcResource<1>;
+def ICXPort2 : ProcResource<1>;
+def ICXPort3 : ProcResource<1>;
+def ICXPort4 : ProcResource<1>;
+def ICXPort5 : ProcResource<1>;
+def ICXPort6 : ProcResource<1>;
+def ICXPort7 : ProcResource<1>;
+def ICXPort8 : ProcResource<1>;
+def ICXPort9 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def ICXPort01 : ProcResGroup<[ICXPort0, ICXPort1]>;
+def ICXPort23 : ProcResGroup<[ICXPort2, ICXPort3]>;
+def ICXPort237 : ProcResGroup<[ICXPort2, ICXPort3, ICXPort7]>;
+def ICXPort04 : ProcResGroup<[ICXPort0, ICXPort4]>;
+def ICXPort05 : ProcResGroup<[ICXPort0, ICXPort5]>;
+def ICXPort06 : ProcResGroup<[ICXPort0, ICXPort6]>;
+def ICXPort15 : ProcResGroup<[ICXPort1, ICXPort5]>;
+def ICXPort16 : ProcResGroup<[ICXPort1, ICXPort6]>;
+def ICXPort49 : ProcResGroup<[ICXPort4, ICXPort9]>;
+def ICXPort56 : ProcResGroup<[ICXPort5, ICXPort6]>;
+def ICXPort78 : ProcResGroup<[ICXPort7, ICXPort8]>;
+def ICXPort015 : ProcResGroup<[ICXPort0, ICXPort1, ICXPort5]>;
+def ICXPort056 : ProcResGroup<[ICXPort0, ICXPort5, ICXPort6]>;
+def ICXPort0156: ProcResGroup<[ICXPort0, ICXPort1, ICXPort5, ICXPort6]>;
+
+def ICXDivider : ProcResource<1>; // Integer division issued on port 0.
+// FP division and sqrt on port 0.
+def ICXFPDivider : ProcResource<1>;
+
+// 60 Entry Unified Scheduler
+def ICXPortAny : ProcResGroup<[ICXPort0, ICXPort1, ICXPort2, ICXPort3, ICXPort4,
+ ICXPort5, ICXPort6, ICXPort7, ICXPort8, ICXPort9]> {
+ let BufferSize=60;
+}
+
+// Integer loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 5>;
+
+// Vector loads are 5/6/7 cycles, so ReadAfterVec*Ld registers needn't be available
+// until 5/6/7 cycles after the memory operand.
+def : ReadAdvance<ReadAfterVecLd, 5>;
+def : ReadAdvance<ReadAfterVecXLd, 6>;
+def : ReadAdvance<ReadAfterVecYLd, 7>;
+
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass ICXWriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1,
+ int LoadLat = 5> {
+ // Register variant is using a single cycle on ExePort.
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+
+ // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to
+ // the latency (default = 5).
+ def : WriteRes<SchedRW.Folded, !listconcat([ICXPort23], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = !add(UOps, 1);
+ }
+}
+
+// A folded store needs a cycle on port 4 for the store data, and an extra port
+// 2/3/7 cycle to recompute the address.
+def : WriteRes<WriteRMW, [ICXPort237,ICXPort4]>;
+
+// Arithmetic.
+defm : ICXWriteResPair<WriteALU, [ICXPort0156], 1>; // Simple integer ALU op.
+defm : ICXWriteResPair<WriteADC, [ICXPort06], 1>; // Integer ALU + flags op.
+
+// Integer multiplication.
+defm : ICXWriteResPair<WriteIMul8, [ICXPort1], 3>;
+defm : ICXWriteResPair<WriteIMul16, [ICXPort1,ICXPort06,ICXPort0156], 4, [1,1,2], 4>;
+defm : X86WriteRes<WriteIMul16Imm, [ICXPort1,ICXPort0156], 4, [1,1], 2>;
+defm : X86WriteRes<WriteIMul16ImmLd, [ICXPort1,ICXPort0156,ICXPort23], 8, [1,1,1], 3>;
+defm : X86WriteRes<WriteIMul16Reg, [ICXPort1], 3, [1], 1>;
+defm : X86WriteRes<WriteIMul16RegLd, [ICXPort1,ICXPort0156,ICXPort23], 8, [1,1,1], 3>;
+defm : ICXWriteResPair<WriteIMul32, [ICXPort1,ICXPort06,ICXPort0156], 4, [1,1,1], 3>;
+defm : ICXWriteResPair<WriteMULX32, [ICXPort1,ICXPort06,ICXPort0156], 3, [1,1,1], 3>;
+defm : ICXWriteResPair<WriteIMul32Imm, [ICXPort1], 3>;
+defm : ICXWriteResPair<WriteIMul32Reg, [ICXPort1], 3>;
+defm : ICXWriteResPair<WriteIMul64, [ICXPort1,ICXPort5], 4, [1,1], 2>;
+defm : ICXWriteResPair<WriteMULX64, [ICXPort1,ICXPort5], 3, [1,1], 2>;
+defm : ICXWriteResPair<WriteIMul64Imm, [ICXPort1], 3>;
+defm : ICXWriteResPair<WriteIMul64Reg, [ICXPort1], 3>;
+def ICXWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 4; }
+def : WriteRes<WriteIMulHLd, []> {
+ let Latency = !add(ICXWriteIMulH.Latency, SkylakeServerModel.LoadLatency);
+}
+
+defm : X86WriteRes<WriteBSWAP32, [ICXPort15], 1, [1], 1>;
+defm : X86WriteRes<WriteBSWAP64, [ICXPort06, ICXPort15], 2, [1,1], 2>;
+defm : X86WriteRes<WriteCMPXCHG,[ICXPort06, ICXPort0156], 5, [2,3], 5>;
+defm : X86WriteRes<WriteCMPXCHGRMW,[ICXPort23,ICXPort06,ICXPort0156,ICXPort237,ICXPort4], 8, [1,2,1,1,1], 6>;
+defm : X86WriteRes<WriteXCHG, [ICXPort0156], 2, [3], 3>;
+
+// TODO: Why isn't the ICXDivider used?
+defm : ICXWriteResPair<WriteDiv8, [ICXPort0, ICXDivider], 25, [1,10], 1, 4>;
+defm : X86WriteRes<WriteDiv16, [ICXPort0,ICXPort1,ICXPort5,ICXPort6,ICXPort05,ICXPort0156], 76, [7,2,8,3,1,11], 32>;
+defm : X86WriteRes<WriteDiv32, [ICXPort0,ICXPort1,ICXPort5,ICXPort6,ICXPort05,ICXPort0156], 76, [7,2,8,3,1,11], 32>;
+defm : X86WriteRes<WriteDiv64, [ICXPort0,ICXPort1,ICXPort5,ICXPort6,ICXPort05,ICXPort0156], 76, [7,2,8,3,1,11], 32>;
+defm : X86WriteRes<WriteDiv16Ld, [ICXPort0,ICXPort23,ICXDivider], 29, [1,1,10], 2>;
+defm : X86WriteRes<WriteDiv32Ld, [ICXPort0,ICXPort23,ICXDivider], 29, [1,1,10], 2>;
+defm : X86WriteRes<WriteDiv64Ld, [ICXPort0,ICXPort23,ICXDivider], 29, [1,1,10], 2>;
+
+defm : X86WriteRes<WriteIDiv8, [ICXPort0, ICXDivider], 25, [1,10], 1>;
+defm : X86WriteRes<WriteIDiv16, [ICXPort0,ICXPort1,ICXPort5,ICXPort6,ICXPort06,ICXPort0156], 102, [4,2,4,8,14,34], 66>;
+defm : X86WriteRes<WriteIDiv32, [ICXPort0,ICXPort1,ICXPort5,ICXPort6,ICXPort06,ICXPort0156], 102, [4,2,4,8,14,34], 66>;
+defm : X86WriteRes<WriteIDiv64, [ICXPort0,ICXPort1,ICXPort5,ICXPort6,ICXPort06,ICXPort0156], 102, [4,2,4,8,14,34], 66>;
+defm : X86WriteRes<WriteIDiv8Ld, [ICXPort0,ICXPort5,ICXPort23,ICXPort0156], 28, [2,4,1,1], 8>;
+defm : X86WriteRes<WriteIDiv16Ld, [ICXPort0,ICXPort5,ICXPort23,ICXPort0156], 28, [2,4,1,1], 8>;
+defm : X86WriteRes<WriteIDiv32Ld, [ICXPort0,ICXPort5,ICXPort23,ICXPort0156], 28, [2,4,1,1], 8>;
+defm : X86WriteRes<WriteIDiv64Ld, [ICXPort0,ICXPort5,ICXPort23,ICXPort0156], 28, [2,4,1,1], 8>;
+
+defm : ICXWriteResPair<WriteCRC32, [ICXPort1], 3>;
+
+def : WriteRes<WriteLEA, [ICXPort15]>; // LEA instructions can't fold loads.
+
+defm : ICXWriteResPair<WriteCMOV, [ICXPort06], 1, [1], 1>; // Conditional move.
+defm : X86WriteRes<WriteFCMOV, [ICXPort1], 3, [1], 1>; // x87 conditional move.
+def : WriteRes<WriteSETCC, [ICXPort06]>; // Setcc.
+def : WriteRes<WriteSETCCStore, [ICXPort06,ICXPort4,ICXPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+defm : X86WriteRes<WriteLAHFSAHF, [ICXPort06], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTest, [ICXPort06], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTestImmLd, [ICXPort06,ICXPort23], 6, [1,1], 2>;
+defm : X86WriteRes<WriteBitTestRegLd, [ICXPort0156,ICXPort23], 6, [1,1], 2>;
+defm : X86WriteRes<WriteBitTestSet, [ICXPort06], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTestSetImmLd, [ICXPort06,ICXPort23], 5, [1,1], 3>;
+defm : X86WriteRes<WriteBitTestSetRegLd, [ICXPort0156,ICXPort23], 5, [1,1], 2>;
+
+// Integer shifts and rotates.
+defm : ICXWriteResPair<WriteShift, [ICXPort06], 1>;
+defm : ICXWriteResPair<WriteShiftCL, [ICXPort06], 3, [3], 3>;
+defm : ICXWriteResPair<WriteRotate, [ICXPort06], 1, [1], 1>;
+defm : ICXWriteResPair<WriteRotateCL, [ICXPort06], 3, [3], 3>;
+
+// SHLD/SHRD.
+defm : X86WriteRes<WriteSHDrri, [ICXPort1], 3, [1], 1>;
+defm : X86WriteRes<WriteSHDrrcl,[ICXPort1,ICXPort06,ICXPort0156], 6, [1, 2, 1], 4>;
+defm : X86WriteRes<WriteSHDmri, [ICXPort1,ICXPort23,ICXPort237,ICXPort0156], 9, [1, 1, 1, 1], 4>;
+defm : X86WriteRes<WriteSHDmrcl,[ICXPort1,ICXPort23,ICXPort237,ICXPort06,ICXPort0156], 11, [1, 1, 1, 2, 1], 6>;
+
+// Bit counts.
+defm : ICXWriteResPair<WriteBSF, [ICXPort1], 3>;
+defm : ICXWriteResPair<WriteBSR, [ICXPort1], 3>;
+defm : ICXWriteResPair<WriteLZCNT, [ICXPort1], 3>;
+defm : ICXWriteResPair<WriteTZCNT, [ICXPort1], 3>;
+defm : ICXWriteResPair<WritePOPCNT, [ICXPort1], 3>;
+
+// BMI1 BEXTR/BLS, BMI2 BZHI
+defm : ICXWriteResPair<WriteBEXTR, [ICXPort06,ICXPort15], 2, [1,1], 2>;
+defm : ICXWriteResPair<WriteBLS, [ICXPort15], 1>;
+defm : ICXWriteResPair<WriteBZHI, [ICXPort15], 1>;
+
+// Loads, stores, and moves, not folded with other operations.
+defm : X86WriteRes<WriteLoad, [ICXPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteStore, [ICXPort237, ICXPort4], 1, [1,1], 1>;
+defm : X86WriteRes<WriteStoreNT, [ICXPort237, ICXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteMove, [ICXPort0156], 1, [1], 1>;
+
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// It does not cost anything by itself; it only has latency, matching that of WriteLoad.
+defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+def : WriteRes<WriteZero, []>;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm : ICXWriteResPair<WriteJump, [ICXPort06], 1>;
+
+// Floating point. This covers both scalar and vector operations.
+defm : X86WriteRes<WriteFLD0, [ICXPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteFLD1, [ICXPort05], 1, [2], 2>;
+defm : X86WriteRes<WriteFLDC, [ICXPort05], 1, [2], 2>;
+defm : X86WriteRes<WriteFLoad, [ICXPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [ICXPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [ICXPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [ICXPort23,ICXPort015], 7, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedLoadY, [ICXPort23,ICXPort015], 8, [1,1], 2>;
+defm : X86WriteRes<WriteFStore, [ICXPort237,ICXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreX, [ICXPort237,ICXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreY, [ICXPort237,ICXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNT, [ICXPort237,ICXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTX, [ICXPort237,ICXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteFStoreNTY, [ICXPort237,ICXPort4], 1, [1,1], 2>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [ICXPort237,ICXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [ICXPort237,ICXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore64, [ICXPort237,ICXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [ICXPort237,ICXPort0], 2, [1,1], 2>;
+
+defm : X86WriteRes<WriteFMove, [ICXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [ICXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [ICXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [ICXPort05,ICXPort0156], 10, [9,1], 10>;
+
+defm : ICXWriteResPair<WriteFAdd, [ICXPort01], 4, [1], 1, 5>; // Floating point add/sub.
+defm : ICXWriteResPair<WriteFAddX, [ICXPort01], 4, [1], 1, 6>;
+defm : ICXWriteResPair<WriteFAddY, [ICXPort01], 4, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFAddZ, [ICXPort05], 4, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFAdd64, [ICXPort01], 4, [1], 1, 5>; // Floating point double add/sub.
+defm : ICXWriteResPair<WriteFAdd64X, [ICXPort01], 4, [1], 1, 6>;
+defm : ICXWriteResPair<WriteFAdd64Y, [ICXPort01], 4, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFAdd64Z, [ICXPort05], 4, [1], 1, 7>;
+
+defm : ICXWriteResPair<WriteFCmp, [ICXPort01], 4, [1], 1, 5>; // Floating point compare.
+defm : ICXWriteResPair<WriteFCmpX, [ICXPort01], 4, [1], 1, 6>;
+defm : ICXWriteResPair<WriteFCmpY, [ICXPort01], 4, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFCmpZ, [ICXPort05], 4, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFCmp64, [ICXPort01], 4, [1], 1, 5>; // Floating point double compare.
+defm : ICXWriteResPair<WriteFCmp64X, [ICXPort01], 4, [1], 1, 6>;
+defm : ICXWriteResPair<WriteFCmp64Y, [ICXPort01], 4, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFCmp64Z, [ICXPort05], 4, [1], 1, 7>;
+
+defm : ICXWriteResPair<WriteFCom, [ICXPort0], 2>; // Floating point compare to flags (X87).
+defm : ICXWriteResPair<WriteFComX, [ICXPort0], 2>; // Floating point compare to flags (SSE).
+
+defm : ICXWriteResPair<WriteFMul, [ICXPort01], 4, [1], 1, 5>; // Floating point multiplication.
+defm : ICXWriteResPair<WriteFMulX, [ICXPort01], 4, [1], 1, 6>;
+defm : ICXWriteResPair<WriteFMulY, [ICXPort01], 4, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFMulZ, [ICXPort05], 4, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFMul64, [ICXPort01], 4, [1], 1, 5>; // Floating point double multiplication.
+defm : ICXWriteResPair<WriteFMul64X, [ICXPort01], 4, [1], 1, 6>;
+defm : ICXWriteResPair<WriteFMul64Y, [ICXPort01], 4, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFMul64Z, [ICXPort05], 4, [1], 1, 7>;
+
+defm : ICXWriteResPair<WriteFDiv, [ICXPort0,ICXFPDivider], 11, [1,3], 1, 5>; // 10-14 cycles. // Floating point division.
+//defm : ICXWriteResPair<WriteFDivX, [ICXPort0,ICXFPDivider], 11, [1,3], 1, 6>; // 10-14 cycles.
+defm : ICXWriteResPair<WriteFDivY, [ICXPort0,ICXFPDivider], 11, [1,5], 1, 7>; // 10-14 cycles.
+defm : ICXWriteResPair<WriteFDivZ, [ICXPort0,ICXPort5,ICXFPDivider], 18, [2,1,10], 3, 7>; // 10-14 cycles.
+//defm : ICXWriteResPair<WriteFDiv64, [ICXPort0,ICXFPDivider], 14, [1,3], 1, 5>; // 10-14 cycles. // Floating point division.
+//defm : ICXWriteResPair<WriteFDiv64X, [ICXPort0,ICXFPDivider], 14, [1,3], 1, 6>; // 10-14 cycles.
+//defm : ICXWriteResPair<WriteFDiv64Y, [ICXPort0,ICXFPDivider], 14, [1,5], 1, 7>; // 10-14 cycles.
+defm : ICXWriteResPair<WriteFDiv64Z, [ICXPort0,ICXPort5,ICXFPDivider], 23, [2,1,16], 3, 7>; // 10-14 cycles.
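+// The commented-out FDiv pairs above appear to be provided via SchedAlias
+// instead (e.g. WriteFDivX is aliased to ICXWriteResGroup159 further down);
+// presumably the WriteFDiv64* variants are handled the same way later in the
+// file.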
+
+defm : ICXWriteResPair<WriteFSqrt, [ICXPort0,ICXFPDivider], 12, [1,3], 1, 5>; // Floating point square root.
+defm : ICXWriteResPair<WriteFSqrtX, [ICXPort0,ICXFPDivider], 12, [1,3], 1, 6>;
+defm : ICXWriteResPair<WriteFSqrtY, [ICXPort0,ICXFPDivider], 12, [1,6], 1, 7>;
+defm : ICXWriteResPair<WriteFSqrtZ, [ICXPort0,ICXPort5,ICXFPDivider], 20, [2,1,12], 3, 7>;
+defm : ICXWriteResPair<WriteFSqrt64, [ICXPort0,ICXFPDivider], 18, [1,6], 1, 5>; // Floating point double square root.
+defm : ICXWriteResPair<WriteFSqrt64X, [ICXPort0,ICXFPDivider], 18, [1,6], 1, 6>;
+defm : ICXWriteResPair<WriteFSqrt64Y, [ICXPort0,ICXFPDivider], 18, [1,12],1, 7>;
+defm : ICXWriteResPair<WriteFSqrt64Z, [ICXPort0,ICXPort5,ICXFPDivider], 32, [2,1,24], 3, 7>;
+defm : ICXWriteResPair<WriteFSqrt80, [ICXPort0,ICXFPDivider], 21, [1,7]>; // Floating point long double square root.
+
+defm : ICXWriteResPair<WriteFRcp, [ICXPort0], 4, [1], 1, 5>; // Floating point reciprocal estimate.
+defm : ICXWriteResPair<WriteFRcpX, [ICXPort0], 4, [1], 1, 6>;
+defm : ICXWriteResPair<WriteFRcpY, [ICXPort0], 4, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFRcpZ, [ICXPort0,ICXPort5], 4, [2,1], 3, 7>;
+
+defm : ICXWriteResPair<WriteFRsqrt, [ICXPort0], 4, [1], 1, 5>; // Floating point reciprocal square root estimate.
+defm : ICXWriteResPair<WriteFRsqrtX,[ICXPort0], 4, [1], 1, 6>;
+defm : ICXWriteResPair<WriteFRsqrtY,[ICXPort0], 4, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFRsqrtZ,[ICXPort0,ICXPort5], 9, [2,1], 3, 7>;
+
+defm : ICXWriteResPair<WriteFMA, [ICXPort01], 4, [1], 1, 5>; // Fused Multiply Add.
+defm : ICXWriteResPair<WriteFMAX, [ICXPort01], 4, [1], 1, 6>;
+defm : ICXWriteResPair<WriteFMAY, [ICXPort01], 4, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFMAZ, [ICXPort05], 4, [1], 1, 7>;
+defm : ICXWriteResPair<WriteDPPD, [ICXPort5,ICXPort015], 9, [1,2], 3, 6>; // Floating point double dot product.
+defm : ICXWriteResPair<WriteDPPS, [ICXPort5,ICXPort015], 13, [1,3], 4, 6>;
+defm : ICXWriteResPair<WriteDPPSY,[ICXPort5,ICXPort015], 13, [1,3], 4, 7>;
+defm : ICXWriteResPair<WriteDPPSZ,[ICXPort5,ICXPort015], 13, [1,3], 4, 7>;
+defm : ICXWriteResPair<WriteFSign, [ICXPort0], 1>; // Floating point fabs/fchs.
+defm : ICXWriteResPair<WriteFRnd, [ICXPort01], 8, [2], 2, 6>; // Floating point rounding.
+defm : ICXWriteResPair<WriteFRndY, [ICXPort01], 8, [2], 2, 7>;
+defm : ICXWriteResPair<WriteFRndZ, [ICXPort05], 8, [2], 2, 7>;
+defm : ICXWriteResPair<WriteFLogic, [ICXPort015], 1, [1], 1, 6>; // Floating point and/or/xor logicals.
+defm : ICXWriteResPair<WriteFLogicY, [ICXPort015], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFLogicZ, [ICXPort05], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFTest, [ICXPort0], 2, [1], 1, 6>; // Floating point TEST instructions.
+defm : ICXWriteResPair<WriteFTestY, [ICXPort0], 2, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFTestZ, [ICXPort0], 2, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFShuffle, [ICXPort5], 1, [1], 1, 6>; // Floating point vector shuffles.
+defm : ICXWriteResPair<WriteFShuffleY, [ICXPort5], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFShuffleZ, [ICXPort5], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFVarShuffle, [ICXPort5], 1, [1], 1, 6>; // Floating point vector variable shuffles.
+defm : ICXWriteResPair<WriteFVarShuffleY, [ICXPort5], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFVarShuffleZ, [ICXPort5], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFBlend, [ICXPort015], 1, [1], 1, 6>; // Floating point vector blends.
+defm : ICXWriteResPair<WriteFBlendY,[ICXPort015], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFBlendZ,[ICXPort015], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteFVarBlend, [ICXPort015], 2, [2], 2, 6>; // Fp vector variable blends.
+defm : ICXWriteResPair<WriteFVarBlendY,[ICXPort015], 2, [2], 2, 7>;
+defm : ICXWriteResPair<WriteFVarBlendZ,[ICXPort015], 2, [2], 2, 7>;
+
+// FMA Scheduling helper class.
+// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
+
+// Vector integer operations.
+defm : X86WriteRes<WriteVecLoad, [ICXPort23], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [ICXPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [ICXPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [ICXPort23], 6, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [ICXPort23], 7, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [ICXPort23,ICXPort015], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [ICXPort23,ICXPort015], 8, [1,1], 2>;
+defm : X86WriteRes<WriteVecStore, [ICXPort237,ICXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreX, [ICXPort237,ICXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreY, [ICXPort237,ICXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNT, [ICXPort237,ICXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecStoreNTY, [ICXPort237,ICXPort4], 1, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32, [ICXPort237,ICXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [ICXPort237,ICXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64, [ICXPort237,ICXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [ICXPort237,ICXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMove, [ICXPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [ICXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [ICXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveToGpr, [ICXPort0], 2, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [ICXPort5], 1, [1], 1>;
+
+defm : ICXWriteResPair<WriteVecALU, [ICXPort05], 1, [1], 1, 5>; // Vector integer ALU op, no logicals.
+defm : ICXWriteResPair<WriteVecALUX, [ICXPort01], 1, [1], 1, 6>;
+defm : ICXWriteResPair<WriteVecALUY, [ICXPort01], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteVecALUZ, [ICXPort0], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteVecLogic, [ICXPort05], 1, [1], 1, 5>; // Vector integer and/or/xor.
+defm : ICXWriteResPair<WriteVecLogicX,[ICXPort015], 1, [1], 1, 6>;
+defm : ICXWriteResPair<WriteVecLogicY,[ICXPort015], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteVecLogicZ,[ICXPort05], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteVecTest, [ICXPort0,ICXPort5], 3, [1,1], 2, 6>; // Vector integer TEST instructions.
+defm : ICXWriteResPair<WriteVecTestY, [ICXPort0,ICXPort5], 3, [1,1], 2, 7>;
+defm : ICXWriteResPair<WriteVecTestZ, [ICXPort0,ICXPort5], 3, [1,1], 2, 7>;
+defm : ICXWriteResPair<WriteVecIMul, [ICXPort0], 5, [1], 1, 5>; // Vector integer multiply.
+defm : ICXWriteResPair<WriteVecIMulX, [ICXPort01], 5, [1], 1, 6>;
+defm : ICXWriteResPair<WriteVecIMulY, [ICXPort01], 5, [1], 1, 7>;
+defm : ICXWriteResPair<WriteVecIMulZ, [ICXPort05], 5, [1], 1, 7>;
+defm : ICXWriteResPair<WritePMULLD, [ICXPort01], 10, [2], 2, 6>; // Vector PMULLD.
+defm : ICXWriteResPair<WritePMULLDY, [ICXPort01], 10, [2], 2, 7>;
+defm : ICXWriteResPair<WritePMULLDZ, [ICXPort05], 10, [2], 2, 7>;
+defm : ICXWriteResPair<WriteShuffle, [ICXPort5], 1, [1], 1, 5>; // Vector shuffles.
+defm : ICXWriteResPair<WriteShuffleX, [ICXPort5], 1, [1], 1, 6>;
+defm : ICXWriteResPair<WriteShuffleY, [ICXPort5], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteShuffleZ, [ICXPort5], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteVarShuffle, [ICXPort5], 1, [1], 1, 5>; // Vector variable shuffles.
+defm : ICXWriteResPair<WriteVarShuffleX, [ICXPort5], 1, [1], 1, 6>;
+defm : ICXWriteResPair<WriteVarShuffleY, [ICXPort5], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteVarShuffleZ, [ICXPort5], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteBlend, [ICXPort5], 1, [1], 1, 6>; // Vector blends.
+defm : ICXWriteResPair<WriteBlendY,[ICXPort5], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteBlendZ,[ICXPort5], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteVarBlend, [ICXPort015], 2, [2], 2, 6>; // Vector variable blends.
+defm : ICXWriteResPair<WriteVarBlendY,[ICXPort015], 2, [2], 2, 6>;
+defm : ICXWriteResPair<WriteVarBlendZ,[ICXPort05], 2, [1], 1, 6>;
+defm : ICXWriteResPair<WriteMPSAD, [ICXPort5], 4, [2], 2, 6>; // Vector MPSAD.
+defm : ICXWriteResPair<WriteMPSADY, [ICXPort5], 4, [2], 2, 7>;
+defm : ICXWriteResPair<WriteMPSADZ, [ICXPort5], 4, [2], 2, 7>;
+defm : ICXWriteResPair<WritePSADBW, [ICXPort5], 3, [1], 1, 5>; // Vector PSADBW.
+defm : ICXWriteResPair<WritePSADBWX, [ICXPort5], 3, [1], 1, 6>;
+defm : ICXWriteResPair<WritePSADBWY, [ICXPort5], 3, [1], 1, 7>;
+defm : ICXWriteResPair<WritePSADBWZ, [ICXPort5], 3, [1], 1, 7>;
+defm : ICXWriteResPair<WritePHMINPOS, [ICXPort0], 4, [1], 1, 6>; // Vector PHMINPOS.
+
+// Vector integer shifts.
+defm : ICXWriteResPair<WriteVecShift, [ICXPort0], 1, [1], 1, 5>;
+defm : X86WriteRes<WriteVecShiftX, [ICXPort5,ICXPort01], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftY, [ICXPort5,ICXPort01], 4, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftZ, [ICXPort5,ICXPort0], 4, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftXLd, [ICXPort01,ICXPort23], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftYLd, [ICXPort01,ICXPort23], 8, [1,1], 2>;
+defm : X86WriteRes<WriteVecShiftZLd, [ICXPort0,ICXPort23], 8, [1,1], 2>;
+
+defm : ICXWriteResPair<WriteVecShiftImm, [ICXPort0], 1, [1], 1, 5>;
+defm : ICXWriteResPair<WriteVecShiftImmX, [ICXPort01], 1, [1], 1, 6>; // Vector integer immediate shifts.
+defm : ICXWriteResPair<WriteVecShiftImmY, [ICXPort01], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteVecShiftImmZ, [ICXPort0], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteVarVecShift, [ICXPort01], 1, [1], 1, 6>; // Variable vector shifts.
+defm : ICXWriteResPair<WriteVarVecShiftY, [ICXPort01], 1, [1], 1, 7>;
+defm : ICXWriteResPair<WriteVarVecShiftZ, [ICXPort0], 1, [1], 1, 7>;
+
+// Vector insert/extract operations.
+def : WriteRes<WriteVecInsert, [ICXPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteVecInsertLd, [ICXPort5,ICXPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>;
+
+def : WriteRes<WriteVecExtract, [ICXPort0,ICXPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def : WriteRes<WriteVecExtractSt, [ICXPort4,ICXPort5,ICXPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+// Conversion between integer and float.
+defm : ICXWriteResPair<WriteCvtSS2I, [ICXPort01], 6, [2], 2>; // Needs more work: DD vs DQ.
+defm : ICXWriteResPair<WriteCvtPS2I, [ICXPort01], 3>;
+defm : ICXWriteResPair<WriteCvtPS2IY, [ICXPort01], 3>;
+defm : ICXWriteResPair<WriteCvtPS2IZ, [ICXPort05], 3>;
+defm : ICXWriteResPair<WriteCvtSD2I, [ICXPort01], 6, [2], 2>;
+defm : ICXWriteResPair<WriteCvtPD2I, [ICXPort01], 3>;
+defm : ICXWriteResPair<WriteCvtPD2IY, [ICXPort01], 3>;
+defm : ICXWriteResPair<WriteCvtPD2IZ, [ICXPort05], 3>;
+
+defm : ICXWriteResPair<WriteCvtI2SS, [ICXPort1], 4>;
+defm : ICXWriteResPair<WriteCvtI2PS, [ICXPort01], 4>;
+defm : ICXWriteResPair<WriteCvtI2PSY, [ICXPort01], 4>;
+defm : ICXWriteResPair<WriteCvtI2PSZ, [ICXPort05], 4>; // Needs more work: DD vs DQ.
+defm : ICXWriteResPair<WriteCvtI2SD, [ICXPort1], 4>;
+defm : ICXWriteResPair<WriteCvtI2PD, [ICXPort01], 4>;
+defm : ICXWriteResPair<WriteCvtI2PDY, [ICXPort01], 4>;
+defm : ICXWriteResPair<WriteCvtI2PDZ, [ICXPort05], 4>;
+
+defm : ICXWriteResPair<WriteCvtSS2SD, [ICXPort1], 3>;
+defm : ICXWriteResPair<WriteCvtPS2PD, [ICXPort1], 3>;
+defm : ICXWriteResPair<WriteCvtPS2PDY, [ICXPort5,ICXPort01], 3, [1,1], 2>;
+defm : ICXWriteResPair<WriteCvtPS2PDZ, [ICXPort05], 3, [2], 2>;
+defm : ICXWriteResPair<WriteCvtSD2SS, [ICXPort1], 3>;
+defm : ICXWriteResPair<WriteCvtPD2PS, [ICXPort1], 3>;
+defm : ICXWriteResPair<WriteCvtPD2PSY, [ICXPort5,ICXPort01], 3, [1,1], 2>;
+defm : ICXWriteResPair<WriteCvtPD2PSZ, [ICXPort05], 3, [2], 2>;
+
+defm : X86WriteRes<WriteCvtPH2PS, [ICXPort5,ICXPort01], 5, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSY, [ICXPort5,ICXPort01], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSZ, [ICXPort5,ICXPort0], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSLd, [ICXPort23,ICXPort01], 9, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSYLd, [ICXPort23,ICXPort01], 10, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPH2PSZLd, [ICXPort23,ICXPort05], 10, [1,1], 2>;
+
+defm : X86WriteRes<WriteCvtPS2PH, [ICXPort5,ICXPort01], 5, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHY, [ICXPort5,ICXPort01], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHZ, [ICXPort5,ICXPort05], 7, [1,1], 2>;
+defm : X86WriteRes<WriteCvtPS2PHSt, [ICXPort4,ICXPort5,ICXPort237,ICXPort01], 6, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteCvtPS2PHYSt, [ICXPort4,ICXPort5,ICXPort237,ICXPort01], 8, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteCvtPS2PHZSt, [ICXPort4,ICXPort5,ICXPort237,ICXPort05], 8, [1,1,1,1], 4>;
+
+// String instructions.
+
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [ICXPort0]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [ICXPort0, ICXPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [ICXPort0, ICXPort5, ICXPort015, ICXPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
+}
+def : WriteRes<WritePCmpEStrMLd, [ICXPort0, ICXPort5, ICXPort23, ICXPort015, ICXPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4,3,1,1,1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [ICXPort0]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [ICXPort0, ICXPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [ICXPort0,ICXPort5,ICXPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [4,3,1];
+}
+def : WriteRes<WritePCmpEStrILd, [ICXPort0, ICXPort5, ICXPort23, ICXPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
+}
+
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [ICXPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSK, [ICXPort0]> { let Latency = 2; }
+def : WriteRes<WriteVecMOVMSKY, [ICXPort0]> { let Latency = 2; }
+def : WriteRes<WriteMMXMOVMSK, [ICXPort0]> { let Latency = 2; }
+
+// AES instructions.
+def : WriteRes<WriteAESDecEnc, [ICXPort0]> { // Decryption, encryption.
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESDecEncLd, [ICXPort0, ICXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+
+def : WriteRes<WriteAESIMC, [ICXPort0]> { // InvMixColumn.
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [ICXPort0, ICXPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+
+def : WriteRes<WriteAESKeyGen, [ICXPort0,ICXPort5,ICXPort015]> { // Key Generation.
+ let Latency = 20;
+ let NumMicroOps = 11;
+ let ResourceCycles = [3,6,2];
+}
+def : WriteRes<WriteAESKeyGenLd, [ICXPort0,ICXPort5,ICXPort23,ICXPort015]> {
+ let Latency = 25;
+ let NumMicroOps = 11;
+ let ResourceCycles = [3,6,1,1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [ICXPort5]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteCLMulLd, [ICXPort5, ICXPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+
+// Catch-all for expensive system instructions.
+def : WriteRes<WriteSystem, [ICXPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
+
+// AVX2.
+defm : ICXWriteResPair<WriteFShuffle256, [ICXPort5], 3, [1], 1, 7>; // Fp 256-bit width vector shuffles.
+defm : ICXWriteResPair<WriteFVarShuffle256, [ICXPort5], 3, [1], 1, 7>; // Fp 256-bit width vector variable shuffles.
+defm : ICXWriteResPair<WriteShuffle256, [ICXPort5], 3, [1], 1, 7>; // 256-bit width vector shuffles.
+defm : ICXWriteResPair<WriteVPMOV256, [ICXPort5], 3, [1], 1, 7>; // 256-bit width packed vector width-changing move.
+defm : ICXWriteResPair<WriteVarShuffle256, [ICXPort5], 3, [1], 1, 7>; // 256-bit width vector variable shuffles.
+
+// Old microcoded instructions that nobody uses.
+def : WriteRes<WriteMicrocoded, [ICXPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;
+
+// Fence instructions.
+def : WriteRes<WriteFence, [ICXPort23, ICXPort4]>;
+
+// Load/store MXCSR.
+def : WriteRes<WriteLDMXCSR, [ICXPort0,ICXPort23,ICXPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+def : WriteRes<WriteSTMXCSR, [ICXPort4,ICXPort5,ICXPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; }
+
+// Nop, not very useful except that it provides a model for nops!
+def : WriteRes<WriteNop, []>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : ICXWriteResPair<WriteFHAdd, [ICXPort5,ICXPort015], 6, [2,1], 3, 6>;
+defm : ICXWriteResPair<WriteFHAddY, [ICXPort5,ICXPort015], 6, [2,1], 3, 7>;
+defm : ICXWriteResPair<WritePHAdd, [ICXPort5,ICXPort05], 3, [2,1], 3, 5>;
+defm : ICXWriteResPair<WritePHAddX, [ICXPort5,ICXPort015], 3, [2,1], 3, 6>;
+defm : ICXWriteResPair<WritePHAddY, [ICXPort5,ICXPort015], 3, [2,1], 3, 7>;
+
+// Remaining instrs.
+
+def ICXWriteResGroup1 : SchedWriteRes<[ICXPort0]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup1], (instregex "KAND(B|D|Q|W)rr",
+ "KANDN(B|D|Q|W)rr",
+ "KMOV(B|D|Q|W)kk",
+ "KNOT(B|D|Q|W)rr",
+ "KOR(B|D|Q|W)rr",
+ "KXNOR(B|D|Q|W)rr",
+ "KXOR(B|D|Q|W)rr",
+ "KSET0(B|D|Q|W)", // Same as KXOR
+ "KSET1(B|D|Q|W)", // Same as KXNOR
+ "MMX_PADDS(B|W)irr",
+ "MMX_PADDUS(B|W)irr",
+ "MMX_PAVG(B|W)irr",
+ "MMX_PCMPEQ(B|D|W)irr",
+ "MMX_PCMPGT(B|D|W)irr",
+ "MMX_P(MAX|MIN)SWirr",
+ "MMX_P(MAX|MIN)UBirr",
+ "MMX_PSUBS(B|W)irr",
+ "MMX_PSUBUS(B|W)irr",
+ "VPMOVB2M(Z|Z128|Z256)rr",
+ "VPMOVD2M(Z|Z128|Z256)rr",
+ "VPMOVQ2M(Z|Z128|Z256)rr",
+ "VPMOVW2M(Z|Z128|Z256)rr")>;
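+// (Pattern for the groups that follow: each ICXWriteResGroupNN is a
+// SchedWriteRes giving the latency, uop count and per-port resource cycles,
+// and the accompanying InstRW lines bind instructions to it, either by name
+// via instrs or by regular expression via instregex.)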
+
+def ICXWriteResGroup3 : SchedWriteRes<[ICXPort5]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup3], (instregex "COM(P?)_FST0r",
+ "KMOV(B|D|Q|W)kr",
+ "UCOM_F(P?)r")>;
+
+def ICXWriteResGroup4 : SchedWriteRes<[ICXPort6]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup4], (instregex "JMP(16|32|64)r")>;
+
+def ICXWriteResGroup6 : SchedWriteRes<[ICXPort05]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup6], (instrs FINCSTP, FNOP)>;
+
+def ICXWriteResGroup7 : SchedWriteRes<[ICXPort06]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup7], (instrs CDQ, CQO, CLAC, STAC)>;
+
+def ICXWriteResGroup8 : SchedWriteRes<[ICXPort15]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup8], (instregex "ANDN(32|64)rr")>;
+
+def ICXWriteResGroup9 : SchedWriteRes<[ICXPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup9], (instregex "VBLENDMPD(Z128|Z256)rr",
+ "VBLENDMPS(Z128|Z256)rr",
+ "VPADD(B|D|Q|W)(Y|Z|Z128|Z256)rr",
+ "(V?)PADD(B|D|Q|W)rr",
+ "VPBLENDD(Y?)rri",
+ "VPBLENDMB(Z128|Z256)rr",
+ "VPBLENDMD(Z128|Z256)rr",
+ "VPBLENDMQ(Z128|Z256)rr",
+ "VPBLENDMW(Z128|Z256)rr",
+ "VPSUB(B|D|Q|W)(Y|Z|Z128|Z256)rrk",
+ "VPTERNLOGD(Z|Z128|Z256)rri",
+ "VPTERNLOGQ(Z|Z128|Z256)rri")>;
+
+def ICXWriteResGroup10 : SchedWriteRes<[ICXPort0156]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup10], (instrs CBW, CWDE, CDQE,
+ CMC, STC,
+ SGDT64m,
+ SIDT64m,
+ SMSW16m,
+ STRm,
+ SYSCALL)>;
+
+def ICXWriteResGroup11 : SchedWriteRes<[ICXPort4,ICXPort237]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup11], (instrs FBSTPm, VMPTRSTm)>;
+def: InstRW<[ICXWriteResGroup11], (instregex "KMOV(B|D|Q|W)mk",
+ "ST_FP(32|64|80)m")>;
+
+def ICXWriteResGroup13 : SchedWriteRes<[ICXPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[ICXWriteResGroup13], (instrs MMX_MOVQ2DQrr)>;
+
+def ICXWriteResGroup14 : SchedWriteRes<[ICXPort05]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[ICXWriteResGroup14], (instrs FDECSTP,
+ MMX_MOVDQ2Qrr)>;
+
+def ICXWriteResGroup17 : SchedWriteRes<[ICXPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[ICXWriteResGroup17], (instrs LFENCE,
+ WAIT,
+ XGETBV)>;
+
+def ICXWriteResGroup20 : SchedWriteRes<[ICXPort6,ICXPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup20], (instregex "CLFLUSH")>;
+
+def ICXWriteResGroup21 : SchedWriteRes<[ICXPort237,ICXPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup21], (instrs SFENCE)>;
+
+def ICXWriteResGroup23 : SchedWriteRes<[ICXPort06,ICXPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup23], (instrs CWD,
+ JCXZ, JECXZ, JRCXZ,
+ ADC8i8, SBB8i8,
+ ADC16i16, SBB16i16,
+ ADC32i32, SBB32i32,
+ ADC64i32, SBB64i32)>;
+
+def ICXWriteResGroup25 : SchedWriteRes<[ICXPort4,ICXPort6,ICXPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup25], (instrs FNSTCW16m)>;
+
+def ICXWriteResGroup27 : SchedWriteRes<[ICXPort4,ICXPort237,ICXPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup27], (instregex "MOVBE(16|32|64)mr")>;
+
+def ICXWriteResGroup28 : SchedWriteRes<[ICXPort4,ICXPort237,ICXPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup28], (instrs PUSH16r, PUSH32r, PUSH64r, PUSH64i8,
+ STOSB, STOSL, STOSQ, STOSW)>;
+def: InstRW<[ICXWriteResGroup28], (instregex "PUSH(16|32|64)rmr")>;
+
+def ICXWriteResGroup29 : SchedWriteRes<[ICXPort4,ICXPort237,ICXPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2,2,1];
+}
+def: InstRW<[ICXWriteResGroup29], (instregex "VMOVDQU8Zmr(b?)")>;
+
+def ICXWriteResGroup30 : SchedWriteRes<[ICXPort0]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup30], (instregex "KMOV(B|D|Q|W)rk",
+ "KORTEST(B|D|Q|W)rr",
+ "KTEST(B|D|Q|W)rr")>;
+
+def ICXWriteResGroup31 : SchedWriteRes<[ICXPort1]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup31], (instregex "PDEP(32|64)rr",
+ "PEXT(32|64)rr")>;
+
+def ICXWriteResGroup32 : SchedWriteRes<[ICXPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup32], (instrs VPSADBWZrr)>; // TODO: 512-bit ops require ports 0/1 to be joined.
+def: InstRW<[ICXWriteResGroup32], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0)",
+ "VALIGND(Z|Z128|Z256)rri",
+ "VALIGNQ(Z|Z128|Z256)rri",
+ "VDBPSADBWZrri", // TODO: 512-bit ops require ports 0/1 to be joined.
+ "VPBROADCAST(B|W)rr",
+ "VP(MAX|MIN)(S|U)Q(Z|Z128|Z256)rr")>;
+
+def ICXWriteResGroup33 : SchedWriteRes<[ICXPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup33], (instregex "KADD(B|D|Q|W)rr",
+ "KSHIFTL(B|D|Q|W)ri",
+ "KSHIFTR(B|D|Q|W)ri",
+ "KUNPCK(BW|DQ|WD)rr",
+ "VCMPPD(Z|Z128|Z256)rri",
+ "VCMPPS(Z|Z128|Z256)rri",
+ "VCMP(SD|SS)Zrr",
+ "VFPCLASS(PD|PS)(Z|Z128|Z256)rr",
+ "VFPCLASS(SD|SS)Zrr",
+ "VPCMPB(Z|Z128|Z256)rri",
+ "VPCMPD(Z|Z128|Z256)rri",
+ "VPCMPEQ(B|D|Q|W)(Z|Z128|Z256)rr",
+ "VPCMPGT(B|D|Q|W)(Z|Z128|Z256)rr",
+ "VPCMPQ(Z|Z128|Z256)rri",
+ "VPCMPU(B|D|Q|W)(Z|Z128|Z256)rri",
+ "VPCMPW(Z|Z128|Z256)rri",
+ "VPTEST(N?)M(B|D|Q|W)(Z|Z128|Z256)rr")>;
+
+def ICXWriteResGroup34 : SchedWriteRes<[ICXPort0,ICXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup34], (instrs FNSTSW16r)>;
+
+def ICXWriteResGroup37 : SchedWriteRes<[ICXPort0,ICXPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[ICXWriteResGroup37], (instregex "MMX_PH(ADD|SUB)SWrr")>;
+
+def ICXWriteResGroup38 : SchedWriteRes<[ICXPort5,ICXPort01]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[ICXWriteResGroup38], (instregex "(V?)PH(ADD|SUB)SW(Y?)rr")>;
+
+def ICXWriteResGroup41 : SchedWriteRes<[ICXPort5,ICXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[ICXWriteResGroup41], (instrs MMX_PACKSSDWirr,
+ MMX_PACKSSWBirr,
+ MMX_PACKUSWBirr)>;
+
+def ICXWriteResGroup42 : SchedWriteRes<[ICXPort6,ICXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[ICXWriteResGroup42], (instregex "CLD")>;
+
+def ICXWriteResGroup43 : SchedWriteRes<[ICXPort237,ICXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[ICXWriteResGroup43], (instrs MFENCE)>;
+
+def ICXWriteResGroup44 : SchedWriteRes<[ICXPort06,ICXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[ICXWriteResGroup44], (instregex "RCL(8|16|32|64)r(1|i)",
+ "RCR(8|16|32|64)r(1|i)")>;
+
+def ICXWriteResGroup45 : SchedWriteRes<[ICXPort0,ICXPort4,ICXPort237]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup45], (instrs FNSTSWm)>;
+
+def ICXWriteResGroup47 : SchedWriteRes<[ICXPort4,ICXPort6,ICXPort237,ICXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[ICXWriteResGroup47], (instregex "CALL(16|32|64)r")>;
+
+def ICXWriteResGroup48 : SchedWriteRes<[ICXPort4,ICXPort237,ICXPort06,ICXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[ICXWriteResGroup48], (instrs CALL64pcrel32)>;
+
+def ICXWriteResGroup49 : SchedWriteRes<[ICXPort0]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup49], (instregex "MUL_(FPrST0|FST0r|FrST0)")>;
+
+def ICXWriteResGroup50 : SchedWriteRes<[ICXPort01]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup50], (instregex "VCVTDQ2PS(Y|Z128|Z256)rr",
+ "(V?)CVTDQ2PSrr",
+ "VCVTPD2QQ(Z128|Z256)rr",
+ "VCVTPD2UQQ(Z128|Z256)rr",
+ "VCVTPS2DQ(Y|Z128|Z256)rr",
+ "(V?)CVTPS2DQrr",
+ "VCVTPS2UDQ(Z128|Z256)rr",
+ "VCVTQQ2PD(Z128|Z256)rr",
+ "VCVTTPD2QQ(Z128|Z256)rr",
+ "VCVTTPD2UQQ(Z128|Z256)rr",
+ "VCVTTPS2DQ(Z128|Z256)rr",
+ "(V?)CVTTPS2DQrr",
+ "VCVTTPS2UDQ(Z128|Z256)rr",
+ "VCVTUDQ2PS(Z128|Z256)rr",
+ "VCVTUQQ2PD(Z128|Z256)rr")>;
+
+def ICXWriteResGroup50z : SchedWriteRes<[ICXPort05]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup50z], (instrs VCVTDQ2PSZrr,
+ VCVTPD2QQZrr,
+ VCVTPD2UQQZrr,
+ VCVTPS2DQZrr,
+ VCVTPS2UDQZrr,
+ VCVTQQ2PDZrr,
+ VCVTTPD2QQZrr,
+ VCVTTPD2UQQZrr,
+ VCVTTPS2DQZrr,
+ VCVTTPS2UDQZrr,
+ VCVTUDQ2PSZrr,
+ VCVTUQQ2PDZrr)>;
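+// The 512-bit (Z) conversions above use ICXPort05 rather than ICXPort01,
+// consistent with the notes elsewhere in this file that 512-bit ops require
+// ports 0 and 1 to be joined.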
+
+def ICXWriteResGroup51 : SchedWriteRes<[ICXPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[ICXWriteResGroup51], (instregex "VEXPANDPD(Z|Z128|Z256)rr",
+ "VEXPANDPS(Z|Z128|Z256)rr",
+ "VPEXPANDD(Z|Z128|Z256)rr",
+ "VPEXPANDQ(Z|Z128|Z256)rr",
+ "VPMOVDB(Z|Z128|Z256)rr",
+ "VPMOVDW(Z|Z128|Z256)rr",
+ "VPMOVQB(Z|Z128|Z256)rr",
+ "VPMOVQW(Z|Z128|Z256)rr",
+ "VPMOVSDB(Z|Z128|Z256)rr",
+ "VPMOVSDW(Z|Z128|Z256)rr",
+ "VPMOVSQB(Z|Z128|Z256)rr",
+ "VPMOVSQD(Z|Z128|Z256)rr",
+ "VPMOVSQW(Z|Z128|Z256)rr",
+ "VPMOVSWB(Z|Z128|Z256)rr",
+ "VPMOVUSDB(Z|Z128|Z256)rr",
+ "VPMOVUSDW(Z|Z128|Z256)rr",
+ "VPMOVUSQB(Z|Z128|Z256)rr",
+ "VPMOVUSQD(Z|Z128|Z256)rr",
+ "VPMOVUSWB(Z|Z128|Z256)rr",
+ "VPMOVWB(Z|Z128|Z256)rr")>;
+
+def ICXWriteResGroup54 : SchedWriteRes<[ICXPort4,ICXPort5,ICXPort237]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup54], (instregex "IST(T?)_FP(16|32|64)m",
+ "IST_F(16|32)m",
+ "VPMOVQD(Z|Z128|Z256)mr(b?)")>;
+
+def ICXWriteResGroup55 : SchedWriteRes<[ICXPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [4];
+}
+def: InstRW<[ICXWriteResGroup55], (instrs FNCLEX)>;
+
+def ICXWriteResGroup56 : SchedWriteRes<[]> {
+ let Latency = 0;
+ let NumMicroOps = 4;
+ let ResourceCycles = [];
+}
+def: InstRW<[ICXWriteResGroup56], (instrs VZEROUPPER)>;
+
+def ICXWriteResGroup57 : SchedWriteRes<[ICXPort1,ICXPort6,ICXPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[ICXWriteResGroup57], (instregex "LAR(16|32|64)rr")>;
+
+def ICXWriteResGroup58 : SchedWriteRes<[ICXPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup58], (instregex "MOVSX(16|32|64)rm(8|16|32)",
+ "MOVZX(16|32|64)rm(8|16)",
+ "(V?)MOVDDUPrm")>; // TODO: Should this be ICXWriteResGroup71?
+
+def ICXWriteResGroup61 : SchedWriteRes<[ICXPort5,ICXPort015]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup61], (instregex "MMX_CVT(T?)PD2PIirr",
+ "MMX_CVT(T?)PS2PIirr",
+ "VCVTDQ2PDZ128rr",
+ "VCVTPD2DQZ128rr",
+ "(V?)CVT(T?)PD2DQrr",
+ "VCVTPD2PSZ128rr",
+ "(V?)CVTPD2PSrr",
+ "VCVTPD2UDQZ128rr",
+ "VCVTPS2PDZ128rr",
+ "(V?)CVTPS2PDrr",
+ "VCVTPS2QQZ128rr",
+ "VCVTPS2UQQZ128rr",
+ "VCVTQQ2PSZ128rr",
+ "(V?)CVTSD2SS(Z?)rr",
+ "(V?)CVTSI(64)?2SDrr",
+ "VCVTSI2SSZrr",
+ "(V?)CVTSI2SSrr",
+ "VCVTSI(64)?2SDZrr",
+ "VCVTSS2SDZrr",
+ "(V?)CVTSS2SDrr",
+ "VCVTTPD2DQZ128rr",
+ "VCVTTPD2UDQZ128rr",
+ "VCVTTPS2QQZ128rr",
+ "VCVTTPS2UQQZ128rr",
+ "VCVTUDQ2PDZ128rr",
+ "VCVTUQQ2PSZ128rr",
+ "VCVTUSI2SSZrr",
+ "VCVTUSI(64)?2SDZrr")>;
+
+def ICXWriteResGroup62 : SchedWriteRes<[ICXPort5,ICXPort015]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[ICXWriteResGroup62], (instregex "VPCONFLICTQZ128rr")>;
+
+def ICXWriteResGroup63 : SchedWriteRes<[ICXPort1,ICXPort6,ICXPort06]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup63], (instregex "STR(16|32|64)r")>;
+
+def ICXWriteResGroup65 : SchedWriteRes<[ICXPort4,ICXPort237,ICXPort015]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup65], (instregex "VCVTPS2PHZ128mr(b?)",
+ "VCVTPS2PHZ256mr(b?)",
+ "VCVTPS2PHZmr(b?)")>;
+
+def ICXWriteResGroup66 : SchedWriteRes<[ICXPort4,ICXPort5,ICXPort237]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[ICXWriteResGroup66], (instregex "VPMOVDB(Z|Z128|Z256)mr(b?)",
+ "VPMOVDW(Z|Z128|Z256)mr(b?)",
+ "VPMOVQB(Z|Z128|Z256)mr(b?)",
+ "VPMOVQW(Z|Z128|Z256)mr(b?)",
+ "VPMOVSDB(Z|Z128|Z256)mr(b?)",
+ "VPMOVSDW(Z|Z128|Z256)mr(b?)",
+ "VPMOVSQB(Z|Z128|Z256)mr(b?)",
+ "VPMOVSQD(Z|Z128|Z256)mr(b?)",
+ "VPMOVSQW(Z|Z128|Z256)mr(b?)",
+ "VPMOVSWB(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSDB(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSDW(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSQB(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSQD(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSQW(Z|Z128|Z256)mr(b?)",
+ "VPMOVUSWB(Z|Z128|Z256)mr(b?)",
+ "VPMOVWB(Z|Z128|Z256)mr(b?)")>;
+
+def ICXWriteResGroup67 : SchedWriteRes<[ICXPort06,ICXPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,4];
+}
+def: InstRW<[ICXWriteResGroup67], (instrs XSETBV)>;
+
+def ICXWriteResGroup69 : SchedWriteRes<[ICXPort4,ICXPort237,ICXPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,4];
+}
+def: InstRW<[ICXWriteResGroup69], (instregex "PUSHF(16|64)")>;
+
+def ICXWriteResGroup71 : SchedWriteRes<[ICXPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup71], (instrs VBROADCASTSSrm,
+ VPBROADCASTDrm,
+ VPBROADCASTQrm,
+ VMOVSHDUPrm,
+ VMOVSLDUPrm,
+ MOVSHDUPrm,
+ MOVSLDUPrm)>;
+
+def ICXWriteResGroup72 : SchedWriteRes<[ICXPort5]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[ICXWriteResGroup72], (instrs MMX_CVTPI2PSirr)>;
+def: InstRW<[ICXWriteResGroup72], (instregex "VCOMPRESSPD(Z|Z128|Z256)rr",
+ "VCOMPRESSPS(Z|Z128|Z256)rr",
+ "VPCOMPRESSD(Z|Z128|Z256)rr",
+ "VPCOMPRESSQ(Z|Z128|Z256)rr",
+ "VPERMW(Z|Z128|Z256)rr")>;
+
+def ICXWriteResGroup73 : SchedWriteRes<[ICXPort0,ICXPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup73], (instrs MMX_PADDSBirm,
+ MMX_PADDSWirm,
+ MMX_PADDUSBirm,
+ MMX_PADDUSWirm,
+ MMX_PAVGBirm,
+ MMX_PAVGWirm,
+ MMX_PCMPEQBirm,
+ MMX_PCMPEQDirm,
+ MMX_PCMPEQWirm,
+ MMX_PCMPGTBirm,
+ MMX_PCMPGTDirm,
+ MMX_PCMPGTWirm,
+ MMX_PMAXSWirm,
+ MMX_PMAXUBirm,
+ MMX_PMINSWirm,
+ MMX_PMINUBirm,
+ MMX_PSUBSBirm,
+ MMX_PSUBSWirm,
+ MMX_PSUBUSBirm,
+ MMX_PSUBUSWirm)>;
+
+def ICXWriteResGroup76 : SchedWriteRes<[ICXPort6,ICXPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup76], (instrs FARJMP64m)>;
+def: InstRW<[ICXWriteResGroup76], (instregex "JMP(16|32|64)m")>;
+
+def ICXWriteResGroup79 : SchedWriteRes<[ICXPort23,ICXPort15]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup79], (instregex "ANDN(32|64)rm",
+ "MOVBE(16|32|64)rm")>;
+
+def ICXWriteResGroup80 : SchedWriteRes<[ICXPort23,ICXPort015]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup80], (instregex "VMOV(64to|QI2)PQIZrm(b?)")>;
+def: InstRW<[ICXWriteResGroup80], (instrs VMOVDI2PDIZrm)>;
+
+def ICXWriteResGroup81 : SchedWriteRes<[ICXPort23,ICXPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup81], (instrs POP16r, POP32r, POP64r)>;
+def: InstRW<[ICXWriteResGroup81], (instregex "POP(16|32|64)rmr")>;
+
+def ICXWriteResGroup82 : SchedWriteRes<[ICXPort5,ICXPort015]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[ICXWriteResGroup82], (instregex "(V?)CVTSI642SSrr",
+ "VCVTSI642SSZrr",
+ "VCVTUSI642SSZrr")>;
+
+def ICXWriteResGroup84 : SchedWriteRes<[ICXPort1,ICXPort6,ICXPort06,ICXPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[ICXWriteResGroup84], (instregex "SLDT(16|32|64)r")>;
+
+def ICXWriteResGroup86 : SchedWriteRes<[ICXPort4,ICXPort23,ICXPort237,ICXPort06]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[ICXWriteResGroup86], (instregex "SAR(8|16|32|64)m(1|i)",
+ "SHL(8|16|32|64)m(1|i)",
+ "SHR(8|16|32|64)m(1|i)")>;
+
+def ICXWriteResGroup87 : SchedWriteRes<[ICXPort4,ICXPort23,ICXPort237,ICXPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[ICXWriteResGroup87], (instregex "POP(16|32|64)rmm",
+ "PUSH(16|32|64)rmm")>;
+
+def ICXWriteResGroup88 : SchedWriteRes<[ICXPort6,ICXPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,5];
+}
+def: InstRW<[ICXWriteResGroup88], (instrs STD)>;
+
+def ICXWriteResGroup89 : SchedWriteRes<[ICXPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup89], (instregex "LD_F(32|64|80)m")>;
+def: InstRW<[ICXWriteResGroup89], (instrs VBROADCASTF128,
+ VBROADCASTI128,
+ VBROADCASTSDYrm,
+ VBROADCASTSSYrm,
+ VMOVDDUPYrm,
+ VMOVSHDUPYrm,
+ VMOVSLDUPYrm,
+ VPBROADCASTDYrm,
+ VPBROADCASTQYrm)>;
+
+def ICXWriteResGroup90 : SchedWriteRes<[ICXPort01,ICXPort5]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup90], (instrs VCVTDQ2PDYrr)>;
+
+def ICXWriteResGroup92 : SchedWriteRes<[ICXPort5,ICXPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup92], (instregex "VMOVSDZrm(b?)",
+ "VMOVSSZrm(b?)")>;
+
+def ICXWriteResGroup92a : SchedWriteRes<[ICXPort5,ICXPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup92a], (instregex "(V?)PMOV(SX|ZX)BDrm",
+ "(V?)PMOV(SX|ZX)BQrm",
+ "(V?)PMOV(SX|ZX)BWrm",
+ "(V?)PMOV(SX|ZX)DQrm",
+ "(V?)PMOV(SX|ZX)WDrm",
+ "(V?)PMOV(SX|ZX)WQrm")>;
+
+def ICXWriteResGroup93 : SchedWriteRes<[ICXPort5,ICXPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup93], (instregex "VCVTDQ2PDZ256rr",
+ "VCVTPD2DQ(Y|Z256)rr",
+ "VCVTPD2PS(Y|Z256)rr",
+ "VCVTPD2UDQZ256rr",
+ "VCVTPS2PD(Y|Z256)rr",
+ "VCVTPS2QQZ256rr",
+ "VCVTPS2UQQZ256rr",
+ "VCVTQQ2PSZ256rr",
+ "VCVTTPD2DQ(Y|Z256)rr",
+ "VCVTTPD2UDQZ256rr",
+ "VCVTTPS2QQZ256rr",
+ "VCVTTPS2UQQZ256rr",
+ "VCVTUDQ2PDZ256rr",
+ "VCVTUQQ2PSZ256rr")>;
+
+def ICXWriteResGroup93z : SchedWriteRes<[ICXPort5,ICXPort05]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup93z], (instrs VCVTDQ2PDZrr,
+ VCVTPD2DQZrr,
+ VCVTPD2PSZrr,
+ VCVTPD2UDQZrr,
+ VCVTPS2PDZrr,
+ VCVTPS2QQZrr,
+ VCVTPS2UQQZrr,
+ VCVTQQ2PSZrr,
+ VCVTTPD2DQZrr,
+ VCVTTPD2UDQZrr,
+ VCVTTPS2QQZrr,
+ VCVTTPS2UQQZrr,
+ VCVTUDQ2PDZrr,
+ VCVTUQQ2PSZrr)>;
+
+def ICXWriteResGroup95 : SchedWriteRes<[ICXPort23,ICXPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup95], (instrs VMOVNTDQAZ128rm,
+ VPBLENDDrmi)>;
+def: InstRW<[ICXWriteResGroup95, ReadAfterVecXLd],
+ (instregex "VBLENDMPDZ128rm(b?)",
+ "VBLENDMPSZ128rm(b?)",
+ "VBROADCASTI32X2Z128rm(b?)",
+ "VBROADCASTSSZ128rm(b?)",
+ "VINSERT(F|I)128rm",
+ "VMOVAPDZ128rm(b?)",
+ "VMOVAPSZ128rm(b?)",
+ "VMOVDDUPZ128rm(b?)",
+ "VMOVDQA32Z128rm(b?)",
+ "VMOVDQA64Z128rm(b?)",
+ "VMOVDQU16Z128rm(b?)",
+ "VMOVDQU32Z128rm(b?)",
+ "VMOVDQU64Z128rm(b?)",
+ "VMOVDQU8Z128rm(b?)",
+ "VMOVSHDUPZ128rm(b?)",
+ "VMOVSLDUPZ128rm(b?)",
+ "VMOVUPDZ128rm(b?)",
+ "VMOVUPSZ128rm(b?)",
+ "VPADD(B|D|Q|W)Z128rm(b?)",
+ "(V?)PADD(B|D|Q|W)rm",
+ "VPBLENDM(B|D|Q|W)Z128rm(b?)",
+ "VPBROADCASTDZ128rm(b?)",
+ "VPBROADCASTQZ128rm(b?)",
+ "VPSUB(B|D|Q|W)Z128rm(b?)",
+ "(V?)PSUB(B|D|Q|W)rm",
+ "VPTERNLOGDZ128rm(b?)i",
+ "VPTERNLOGQZ128rm(b?)i")>;
+
+def ICXWriteResGroup96 : SchedWriteRes<[ICXPort5,ICXPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[ICXWriteResGroup96], (instrs MMX_PACKSSDWirm,
+ MMX_PACKSSWBirm,
+ MMX_PACKUSWBirm)>;
+
+def ICXWriteResGroup97 : SchedWriteRes<[ICXPort5,ICXPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[ICXWriteResGroup97], (instregex "VPERMI2W128rr",
+ "VPERMI2W256rr",
+ "VPERMI2Wrr",
+ "VPERMT2W128rr",
+ "VPERMT2W256rr",
+ "VPERMT2Wrr")>;
+
+def ICXWriteResGroup99 : SchedWriteRes<[ICXPort23,ICXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[ICXWriteResGroup99], (instrs LEAVE, LEAVE64,
+ SCASB, SCASL, SCASQ, SCASW)>;
+
+def ICXWriteResGroup100 : SchedWriteRes<[ICXPort0,ICXPort5,ICXPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup100], (instregex "VCVTSS2USI64Zrr",
+ "(V?)CVTSS2SI64(Z?)rr",
+ "(V?)CVTTSS2SI64(Z?)rr",
+ "VCVTTSS2USI64Zrr")>;
+
+def ICXWriteResGroup101 : SchedWriteRes<[ICXPort0,ICXPort23,ICXPort05]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup101], (instrs FLDCW16m)>;
+
+def ICXWriteResGroup103 : SchedWriteRes<[ICXPort5,ICXPort23,ICXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup103], (instregex "KMOV(B|D|Q|W)km")>;
+
+def ICXWriteResGroup104 : SchedWriteRes<[ICXPort6,ICXPort23,ICXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup104], (instrs LRET64, RET64)>;
+
+def ICXWriteResGroup106 : SchedWriteRes<[ICXPort4,ICXPort5,ICXPort237]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[ICXWriteResGroup106], (instregex "VCOMPRESSPD(Z|Z128|Z256)mr(b?)",
+ "VCOMPRESSPS(Z|Z128|Z256)mr(b?)",
+ "VPCOMPRESSD(Z|Z128|Z256)mr(b?)",
+ "VPCOMPRESSQ(Z|Z128|Z256)mr(b?)")>;
+
+def ICXWriteResGroup107 : SchedWriteRes<[ICXPort4,ICXPort23,ICXPort237,ICXPort06]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[ICXWriteResGroup107], (instregex "ROL(8|16|32|64)m(1|i)",
+ "ROR(8|16|32|64)m(1|i)")>;
+
+def ICXWriteResGroup107_1 : SchedWriteRes<[ICXPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[ICXWriteResGroup107_1], (instrs ROL8r1, ROL16r1, ROL32r1, ROL64r1,
+ ROR8r1, ROR16r1, ROR32r1, ROR64r1)>;
+
+def ICXWriteResGroup108 : SchedWriteRes<[ICXPort4,ICXPort23,ICXPort237,ICXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[ICXWriteResGroup108], (instregex "XADD(8|16|32|64)rm")>;
+
+def ICXWriteResGroup109 : SchedWriteRes<[ICXPort4,ICXPort6,ICXPort23,ICXPort237,ICXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,1,1];
+}
+def: InstRW<[ICXWriteResGroup109], (instregex "CALL(16|32|64)m")>;
+def: InstRW<[ICXWriteResGroup109], (instrs FARCALL64m)>;
+
+def ICXWriteResGroup110 : SchedWriteRes<[ICXPort0,ICXPort4,ICXPort237,ICXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 7;
+ let ResourceCycles = [1,2,2,2];
+}
+def: InstRW<[ICXWriteResGroup110], (instrs VPSCATTERDQZ128mr,
+ VPSCATTERQQZ128mr,
+ VSCATTERDPDZ128mr,
+ VSCATTERQPDZ128mr)>;
+
+def ICXWriteResGroup111 : SchedWriteRes<[ICXPort6,ICXPort06,ICXPort15,ICXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 7;
+ let ResourceCycles = [1,3,1,2];
+}
+def: InstRW<[ICXWriteResGroup111], (instrs LOOP)>;
+
+def ICXWriteResGroup112 : SchedWriteRes<[ICXPort0,ICXPort4,ICXPort237,ICXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 11;
+ let ResourceCycles = [1,4,4,2];
+}
+def: InstRW<[ICXWriteResGroup112], (instrs VPSCATTERDQZ256mr,
+ VPSCATTERQQZ256mr,
+ VSCATTERDPDZ256mr,
+ VSCATTERQPDZ256mr)>;
+
+def ICXWriteResGroup113 : SchedWriteRes<[ICXPort0,ICXPort4,ICXPort237,ICXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 19;
+ let ResourceCycles = [1,8,8,2];
+}
+def: InstRW<[ICXWriteResGroup113], (instrs VPSCATTERDQZmr,
+ VPSCATTERQQZmr,
+ VSCATTERDPDZmr,
+ VSCATTERQPDZmr)>;
+
+def ICXWriteResGroup114 : SchedWriteRes<[ICXPort0,ICXPort4,ICXPort5,ICXPort237,ICXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 36;
+ let ResourceCycles = [1,16,1,16,2];
+}
+def: InstRW<[ICXWriteResGroup114], (instrs VSCATTERDPSZmr)>;
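+// The scatter groups above scale with the number of scattered elements:
+// roughly two uops per element (store-address on port 237, store-data on
+// port 4) plus a couple of setup uops, which is where the 7/11/19/36 uop
+// counts come from.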
+
+def ICXWriteResGroup118 : SchedWriteRes<[ICXPort1,ICXPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup118], (instregex "PDEP(32|64)rm",
+ "PEXT(32|64)rm")>;
+
+def ICXWriteResGroup119 : SchedWriteRes<[ICXPort5,ICXPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup119], (instregex "FCOM(P?)(32|64)m",
+ "VPBROADCASTB(Z|Z256)rm(b?)",
+ "VPBROADCASTW(Z|Z256)rm(b?)")>;
+def: InstRW<[ICXWriteResGroup119], (instrs VPBROADCASTBYrm,
+ VPBROADCASTWYrm,
+ VPMOVSXBDYrm,
+ VPMOVSXBQYrm,
+ VPMOVSXWQYrm)>;
+
+def ICXWriteResGroup121 : SchedWriteRes<[ICXPort23,ICXPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup121], (instrs VMOVNTDQAZ256rm,
+ VPBLENDDYrmi)>;
+def: InstRW<[ICXWriteResGroup121, ReadAfterVecYLd],
+ (instregex "VBLENDMPD(Z|Z256)rm(b?)",
+ "VBLENDMPS(Z|Z256)rm(b?)",
+ "VBROADCASTF32X2Z256rm(b?)",
+ "VBROADCASTF32X2Zrm(b?)",
+ "VBROADCASTF32X4Z256rm(b?)",
+ "VBROADCASTF32X4rm(b?)",
+ "VBROADCASTF32X8rm(b?)",
+ "VBROADCASTF64X2Z128rm(b?)",
+ "VBROADCASTF64X2rm(b?)",
+ "VBROADCASTF64X4rm(b?)",
+ "VBROADCASTI32X2Z256rm(b?)",
+ "VBROADCASTI32X2Zrm(b?)",
+ "VBROADCASTI32X4Z256rm(b?)",
+ "VBROADCASTI32X4rm(b?)",
+ "VBROADCASTI32X8rm(b?)",
+ "VBROADCASTI64X2Z128rm(b?)",
+ "VBROADCASTI64X2rm(b?)",
+ "VBROADCASTI64X4rm(b?)",
+ "VBROADCASTSD(Z|Z256)rm(b?)",
+ "VBROADCASTSS(Z|Z256)rm(b?)",
+ "VINSERTF32x4(Z|Z256)rm(b?)",
+ "VINSERTF32x8Zrm(b?)",
+ "VINSERTF64x2(Z|Z256)rm(b?)",
+ "VINSERTF64x4Zrm(b?)",
+ "VINSERTI32x4(Z|Z256)rm(b?)",
+ "VINSERTI32x8Zrm(b?)",
+ "VINSERTI64x2(Z|Z256)rm(b?)",
+ "VINSERTI64x4Zrm(b?)",
+ "VMOVAPD(Z|Z256)rm(b?)",
+ "VMOVAPS(Z|Z256)rm(b?)",
+ "VMOVDDUP(Z|Z256)rm(b?)",
+ "VMOVDQA32(Z|Z256)rm(b?)",
+ "VMOVDQA64(Z|Z256)rm(b?)",
+ "VMOVDQU16(Z|Z256)rm(b?)",
+ "VMOVDQU32(Z|Z256)rm(b?)",
+ "VMOVDQU64(Z|Z256)rm(b?)",
+ "VMOVDQU8(Z|Z256)rm(b?)",
+ "VMOVSHDUP(Z|Z256)rm(b?)",
+ "VMOVSLDUP(Z|Z256)rm(b?)",
+ "VMOVUPD(Z|Z256)rm(b?)",
+ "VMOVUPS(Z|Z256)rm(b?)",
+ "VPADD(B|D|Q|W)Yrm",
+ "VPADD(B|D|Q|W)(Z|Z256)rm(b?)",
+ "VPBLENDM(B|D|Q|W)(Z|Z256)rm(b?)",
+ "VPBROADCASTD(Z|Z256)rm(b?)",
+ "VPBROADCASTQ(Z|Z256)rm(b?)",
+ "VPSUB(B|D|Q|W)Yrm",
+ "VPSUB(B|D|Q|W)(Z|Z256)rm(b?)",
+ "VPTERNLOGD(Z|Z256)rm(b?)i",
+ "VPTERNLOGQ(Z|Z256)rm(b?)i")>;
+
+def ICXWriteResGroup123 : SchedWriteRes<[ICXPort0,ICXPort5,ICXPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[ICXWriteResGroup123], (instregex "MMX_PH(ADD|SUB)SWrm")>;
+
+def ICXWriteResGroup127 : SchedWriteRes<[ICXPort23,ICXPort237,ICXPort06,ICXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[ICXWriteResGroup127], (instregex "RCL(8|16|32|64)m(1|i)",
+ "RCR(8|16|32|64)m(1|i)")>;
+
+def ICXWriteResGroup128 : SchedWriteRes<[ICXPort4,ICXPort23,ICXPort237,ICXPort06]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,3];
+}
+def: InstRW<[ICXWriteResGroup128], (instregex "ROL(8|16|32|64)mCL",
+ "ROR(8|16|32|64)mCL",
+ "SAR(8|16|32|64)mCL",
+ "SHL(8|16|32|64)mCL",
+ "SHR(8|16|32|64)mCL")>;
+
+def ICXWriteResGroup130 : SchedWriteRes<[ICXPort4,ICXPort23,ICXPort237,ICXPort06,ICXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,2,1];
+}
+def: SchedAlias<WriteADCRMW, ICXWriteResGroup130>;
+
+def ICXWriteResGroup131 : SchedWriteRes<[ICXPort0,ICXPort4,ICXPort5,ICXPort237,ICXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,2,1,2,2];
+}
+def: InstRW<[ICXWriteResGroup131], (instrs VPSCATTERQDZ128mr,
+ VPSCATTERQDZ256mr,
+ VSCATTERQPSZ128mr,
+ VSCATTERQPSZ256mr)>;
+
+def ICXWriteResGroup132 : SchedWriteRes<[ICXPort0,ICXPort4,ICXPort5,ICXPort237,ICXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 12;
+ let ResourceCycles = [1,4,1,4,2];
+}
+def: InstRW<[ICXWriteResGroup132], (instrs VPSCATTERDDZ128mr,
+ VSCATTERDPSZ128mr)>;
+
+def ICXWriteResGroup133 : SchedWriteRes<[ICXPort0,ICXPort4,ICXPort5,ICXPort237,ICXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 20;
+ let ResourceCycles = [1,8,1,8,2];
+}
+def: InstRW<[ICXWriteResGroup133], (instrs VPSCATTERDDZ256mr,
+ VSCATTERDPSZ256mr)>;
+
+def ICXWriteResGroup134 : SchedWriteRes<[ICXPort0,ICXPort4,ICXPort5,ICXPort237,ICXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 36;
+ let ResourceCycles = [1,16,1,16,2];
+}
+def: InstRW<[ICXWriteResGroup134], (instrs VPSCATTERDDZmr)>;
+
+def ICXWriteResGroup135 : SchedWriteRes<[ICXPort0,ICXPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup135], (instrs MMX_CVTPI2PSirm)>;
+
+def ICXWriteResGroup136 : SchedWriteRes<[ICXPort5,ICXPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup136], (instrs VPMOVSXBWYrm,
+ VPMOVSXDQYrm,
+ VPMOVSXWDYrm,
+ VPMOVZXWDYrm)>;
+def: InstRW<[ICXWriteResGroup136], (instregex "VALIGN(D|Q)Z128rm(b?)i",
+ "VFPCLASSSDZrm(b?)",
+ "VFPCLASSSSZrm(b?)",
+ "(V?)PCMPGTQrm",
+ "VPERMI2D128rm(b?)",
+ "VPERMI2PD128rm(b?)",
+ "VPERMI2PS128rm(b?)",
+ "VPERMI2Q128rm(b?)",
+ "VPERMT2D128rm(b?)",
+ "VPERMT2PD128rm(b?)",
+ "VPERMT2PS128rm(b?)",
+ "VPERMT2Q128rm(b?)",
+ "VPMAXSQZ128rm(b?)",
+ "VPMAXUQZ128rm(b?)",
+ "VPMINSQZ128rm(b?)",
+ "VPMINUQZ128rm(b?)",
+ "VPMOVSXBDZ128rm(b?)",
+ "VPMOVSXBQZ128rm(b?)",
+ "VPMOVSXBWZ128rm(b?)",
+ "VPMOVSXDQZ128rm(b?)",
+ "VPMOVSXWDZ128rm(b?)",
+ "VPMOVSXWQZ128rm(b?)",
+ "VPMOVZXBDZ128rm(b?)",
+ "VPMOVZXBQZ128rm(b?)",
+ "VPMOVZXBWZ128rm(b?)",
+ "VPMOVZXDQZ128rm(b?)",
+ "VPMOVZXWDZ128rm(b?)",
+ "VPMOVZXWQZ128rm(b?)")>;
+
+def ICXWriteResGroup136_2 : SchedWriteRes<[ICXPort5,ICXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup136_2], (instregex "VCMP(PD|PS)Z128rm(b?)i",
+ "VCMP(SD|SS)Zrm",
+ "VFPCLASSPDZ128rm(b?)",
+ "VFPCLASSPSZ128rm(b?)",
+ "VPCMPBZ128rmi(b?)",
+ "VPCMPDZ128rmi(b?)",
+ "VPCMPEQ(B|D|Q|W)Z128rm(b?)",
+ "VPCMPGT(B|D|Q|W)Z128rm(b?)",
+ "VPCMPQZ128rmi(b?)",
+ "VPCMPU(B|D|Q|W)Z128rmi(b?)",
+ "VPCMPWZ128rmi(b?)",
+ "VPTESTMBZ128rm(b?)",
+ "VPTESTMDZ128rm(b?)",
+ "VPTESTMQZ128rm(b?)",
+ "VPTESTMWZ128rm(b?)",
+ "VPTESTNMBZ128rm(b?)",
+ "VPTESTNMDZ128rm(b?)",
+ "VPTESTNMQZ128rm(b?)",
+ "VPTESTNMWZ128rm(b?)")>;
+
+def ICXWriteResGroup137 : SchedWriteRes<[ICXPort23,ICXPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup137], (instregex "MMX_CVT(T?)PS2PIirm",
+ "(V?)CVTPS2PDrm")>;
+
+def ICXWriteResGroup143 : SchedWriteRes<[ICXPort5,ICXPort01,ICXPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[ICXWriteResGroup143], (instregex "(V?)PHADDSWrm",
+ "(V?)PHSUBSWrm")>;
+
+def ICXWriteResGroup146 : SchedWriteRes<[ICXPort1,ICXPort6,ICXPort23,ICXPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[ICXWriteResGroup146], (instregex "LAR(16|32|64)rm",
+ "LSL(16|32|64)rm")>;
+
+def ICXWriteResGroup148 : SchedWriteRes<[ICXPort5,ICXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup148], (instrs VPCMPGTQYrm)>;
+def: InstRW<[ICXWriteResGroup148], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
+ "ILD_F(16|32|64)m",
+ "VALIGND(Z|Z256)rm(b?)i",
+ "VALIGNQ(Z|Z256)rm(b?)i",
+ "VPMAXSQ(Z|Z256)rm(b?)",
+ "VPMAXUQ(Z|Z256)rm(b?)",
+ "VPMINSQ(Z|Z256)rm(b?)",
+ "VPMINUQ(Z|Z256)rm(b?)")>;
+
+def ICXWriteResGroup148_2 : SchedWriteRes<[ICXPort5,ICXPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup148_2], (instregex "VCMPPD(Z|Z256)rm(b?)i",
+ "VCMPPS(Z|Z256)rm(b?)i",
+ "VFPCLASSPD(Z|Z256)rm(b?)",
+ "VFPCLASSPS(Z|Z256)rm(b?)",
+ "VPCMPB(Z|Z256)rmi(b?)",
+ "VPCMPD(Z|Z256)rmi(b?)",
+ "VPCMPEQB(Z|Z256)rm(b?)",
+ "VPCMPEQD(Z|Z256)rm(b?)",
+ "VPCMPEQQ(Z|Z256)rm(b?)",
+ "VPCMPEQW(Z|Z256)rm(b?)",
+ "VPCMPGTB(Z|Z256)rm(b?)",
+ "VPCMPGTD(Z|Z256)rm(b?)",
+ "VPCMPGTQ(Z|Z256)rm(b?)",
+ "VPCMPGTW(Z|Z256)rm(b?)",
+ "VPCMPQ(Z|Z256)rmi(b?)",
+ "VPCMPU(B|D|Q|W)Z256rmi(b?)",
+ "VPCMPU(B|D|Q|W)Zrmi(b?)",
+ "VPCMPW(Z|Z256)rmi(b?)",
+ "VPTESTM(B|D|Q|W)Z256rm(b?)",
+ "VPTESTM(B|D|Q|W)Zrm(b?)",
+ "VPTESTNM(B|D|Q|W)Z256rm(b?)",
+ "VPTESTNM(B|D|Q|W)Zrm(b?)")>;
+
+def ICXWriteResGroup149 : SchedWriteRes<[ICXPort23,ICXPort015]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup149], (instregex "VCVTDQ2PDZ128rm(b?)",
+ "VCVTDQ2PSZ128rm(b?)",
+ "(V?)CVTDQ2PSrm",
+ "VCVTPD2QQZ128rm(b?)",
+ "VCVTPD2UQQZ128rm(b?)",
+ "VCVTPH2PSZ128rm(b?)",
+ "VCVTPS2DQZ128rm(b?)",
+ "(V?)CVTPS2DQrm",
+ "VCVTPS2PDZ128rm(b?)",
+ "VCVTPS2QQZ128rm(b?)",
+ "VCVTPS2UDQZ128rm(b?)",
+ "VCVTPS2UQQZ128rm(b?)",
+ "VCVTQQ2PDZ128rm(b?)",
+ "VCVTQQ2PSZ128rm(b?)",
+ "VCVTSS2SDZrm",
+ "(V?)CVTSS2SDrm",
+ "VCVTTPD2QQZ128rm(b?)",
+ "VCVTTPD2UQQZ128rm(b?)",
+ "VCVTTPS2DQZ128rm(b?)",
+ "(V?)CVTTPS2DQrm",
+ "VCVTTPS2QQZ128rm(b?)",
+ "VCVTTPS2UDQZ128rm(b?)",
+ "VCVTTPS2UQQZ128rm(b?)",
+ "VCVTUDQ2PDZ128rm(b?)",
+ "VCVTUDQ2PSZ128rm(b?)",
+ "VCVTUQQ2PDZ128rm(b?)",
+ "VCVTUQQ2PSZ128rm(b?)")>;
+
+def ICXWriteResGroup151 : SchedWriteRes<[ICXPort5,ICXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[ICXWriteResGroup151], (instregex "VEXPANDPDZ128rm(b?)",
+ "VEXPANDPSZ128rm(b?)",
+ "VPEXPANDDZ128rm(b?)",
+ "VPEXPANDQZ128rm(b?)")>;
+
+def ICXWriteResGroup153 : SchedWriteRes<[ICXPort5,ICXPort23,ICXPort015]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup153], (instregex "(V?)CVTSD2SSrm")>;
+
+def ICXWriteResGroup154 : SchedWriteRes<[ICXPort5,ICXPort01,ICXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[ICXWriteResGroup154], (instrs VPHADDSWYrm,
+ VPHSUBSWYrm)>;
+
+def ICXWriteResGroup157 : SchedWriteRes<[ICXPort4,ICXPort6,ICXPort23,ICXPort237,ICXPort06,ICXPort0156]> {
+ let Latency = 10;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,1,1,3];
+}
+def: InstRW<[ICXWriteResGroup157], (instregex "XCHG(8|16|32|64)rm")>;
+
+def ICXWriteResGroup159 : SchedWriteRes<[ICXPort0,ICXFPDivider]> {
+ let Latency = 11;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1,3];
+}
+def : SchedAlias<WriteFDivX, ICXWriteResGroup159>; // TODO - convert to ZnWriteResFpuPair
+
+def ICXWriteResGroup160 : SchedWriteRes<[ICXPort0,ICXPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup160], (instregex "MUL_F(32|64)m")>;
+
+def ICXWriteResGroup161 : SchedWriteRes<[ICXPort23,ICXPort015]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup161], (instrs VCVTDQ2PSYrm,
+ VCVTPS2PDYrm)>;
+def: InstRW<[ICXWriteResGroup161], (instregex "VCVTDQ2(PD|PS)(Z|Z256)rm(b?)",
+ "VCVTPH2PS(Z|Z256)rm(b?)",
+ "VCVTPS2PD(Z|Z256)rm(b?)",
+ "VCVTQQ2PD(Z|Z256)rm(b?)",
+ "VCVTQQ2PSZ256rm(b?)",
+ "VCVT(T?)PD2QQ(Z|Z256)rm(b?)",
+ "VCVT(T?)PD2UQQ(Z|Z256)rm(b?)",
+ "VCVT(T?)PS2DQYrm",
+ "VCVT(T?)PS2DQ(Z|Z256)rm(b?)",
+ "VCVT(T?)PS2QQZ256rm(b?)",
+ "VCVT(T?)PS2UDQ(Z|Z256)rm(b?)",
+ "VCVT(T?)PS2UQQZ256rm(b?)",
+ "VCVTUDQ2(PD|PS)(Z|Z256)rm(b?)",
+ "VCVTUQQ2PD(Z|Z256)rm(b?)",
+ "VCVTUQQ2PSZ256rm(b?)")>;
+
+def ICXWriteResGroup162 : SchedWriteRes<[ICXPort5,ICXPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[ICXWriteResGroup162], (instregex "FICOM(P?)(16|32)m",
+ "VEXPANDPD(Z|Z256)rm(b?)",
+ "VEXPANDPS(Z|Z256)rm(b?)",
+ "VPEXPANDD(Z|Z256)rm(b?)",
+ "VPEXPANDQ(Z|Z256)rm(b?)")>;
+
+def ICXWriteResGroup163 : SchedWriteRes<[ICXPort23,ICXPort015]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[ICXWriteResGroup163], (instregex "VCVTSD2SSZrm")>;
+
+def ICXWriteResGroup164 : SchedWriteRes<[ICXPort0,ICXPort5,ICXPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup164], (instregex "(V?)CVTDQ2PDrm")>;
+
+def ICXWriteResGroup166 : SchedWriteRes<[ICXPort5,ICXPort23,ICXPort015]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup166], (instrs CVTPD2PSrm,
+ CVTPD2DQrm,
+ CVTTPD2DQrm,
+ MMX_CVTPD2PIirm,
+ MMX_CVTTPD2PIirm)>;
+
+def ICXWriteResGroup167 : SchedWriteRes<[ICXPort5,ICXPort23,ICXPort015]> {
+ let Latency = 11;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[ICXWriteResGroup167], (instregex "VPCONFLICTQZ128rm(b?)")>;
+
+def ICXWriteResGroup169 : SchedWriteRes<[ICXPort1,ICXPort06,ICXPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 7;
+ let ResourceCycles = [2,3,2];
+}
+def: InstRW<[ICXWriteResGroup169], (instregex "RCL(16|32|64)rCL",
+ "RCR(16|32|64)rCL")>;
+
+def ICXWriteResGroup170 : SchedWriteRes<[ICXPort1,ICXPort06,ICXPort15,ICXPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1,5,1,2];
+}
+def: InstRW<[ICXWriteResGroup170], (instrs RCL8rCL)>;
+
+def ICXWriteResGroup171 : SchedWriteRes<[ICXPort06,ICXPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,9];
+}
+def: InstRW<[ICXWriteResGroup171], (instrs LOOPE, LOOPNE)>;
+
+def ICXWriteResGroup174 : SchedWriteRes<[ICXPort01]> {
+ let Latency = 15;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def: InstRW<[ICXWriteResGroup174], (instregex "VPMULLQ(Z128|Z256)rr")>;
+
+def ICXWriteResGroup174z : SchedWriteRes<[ICXPort05]> {
+ let Latency = 15;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def: InstRW<[ICXWriteResGroup174z], (instregex "VPMULLQZrr")>;
+
+def ICXWriteResGroup175 : SchedWriteRes<[ICXPort5,ICXPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[ICXWriteResGroup175], (instregex "VPERMWZ128rm(b?)")>;
+
+def ICXWriteResGroup176 : SchedWriteRes<[ICXPort0,ICXPort23,ICXPort015]> {
+ let Latency = 12;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup176], (instregex "VCVT(T?)SD2USIZrm(b?)",
+ "VCVT(T?)SS2USI64Zrm(b?)")>;
+
+def ICXWriteResGroup177 : SchedWriteRes<[ICXPort5,ICXPort23,ICXPort015]> {
+ let Latency = 12;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup177], (instregex "VCVT(T?)PS2QQZrm(b?)",
+ "VCVT(T?)PS2UQQZrm(b?)")>;
+
+def ICXWriteResGroup179 : SchedWriteRes<[ICXPort0,ICXPort5,ICXPort23,ICXPort015]> {
+ let Latency = 12;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[ICXWriteResGroup179], (instregex "CVTTSS2SI64rm")>;
+
+def ICXWriteResGroup180 : SchedWriteRes<[ICXPort5,ICXPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[ICXWriteResGroup180], (instregex "(ADD|SUB|SUBR)_FI(16|32)m",
+ "VPERMWZ256rm(b?)",
+ "VPERMWZrm(b?)")>;
+
+def ICXWriteResGroup181 : SchedWriteRes<[ICXPort0,ICXPort5,ICXPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup181], (instrs VCVTDQ2PDYrm)>;
+
+def ICXWriteResGroup183 : SchedWriteRes<[ICXPort5,ICXPort23,ICXPort015]> {
+ let Latency = 13;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[ICXWriteResGroup183], (instregex "VPERMI2W128rm(b?)",
+ "VPERMT2W128rm(b?)")>;
+
+def ICXWriteResGroup184 : SchedWriteRes<[ICXPort0,ICXFPDivider]> {
+ let Latency = 14;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1,3];
+}
+def : SchedAlias<WriteFDiv64, ICXWriteResGroup184>; // TODO - convert to ZnWriteResFpuPair
+def : SchedAlias<WriteFDiv64X, ICXWriteResGroup184>; // TODO - convert to ZnWriteResFpuPair
+
+def ICXWriteResGroup184_1 : SchedWriteRes<[ICXPort0,ICXFPDivider]> {
+ let Latency = 14;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1,5];
+}
+def : SchedAlias<WriteFDiv64Y, ICXWriteResGroup184_1>; // TODO - convert to ZnWriteResFpuPair
+
+def ICXWriteResGroup187 : SchedWriteRes<[ICXPort0,ICXPort5,ICXPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup187], (instregex "MUL_FI(16|32)m")>;
+
+def ICXWriteResGroup188 : SchedWriteRes<[ICXPort5,ICXPort23,ICXPort015]> {
+ let Latency = 14;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup188], (instregex "VCVTPD2DQZrm(b?)",
+ "VCVTPD2PSZrm(b?)",
+ "VCVTPD2UDQZrm(b?)",
+ "VCVTQQ2PSZrm(b?)",
+ "VCVTTPD2DQZrm(b?)",
+ "VCVTTPD2UDQZrm(b?)",
+ "VCVTUQQ2PSZrm(b?)")>;
+
+def ICXWriteResGroup189 : SchedWriteRes<[ICXPort5,ICXPort23,ICXPort015]> {
+ let Latency = 14;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[ICXWriteResGroup189], (instregex "VPERMI2W256rm(b?)",
+ "VPERMI2Wrm(b?)",
+ "VPERMT2W256rm(b?)",
+ "VPERMT2Wrm(b?)")>;
+
+def ICXWriteResGroup190 : SchedWriteRes<[ICXPort1,ICXPort06,ICXPort15,ICXPort0156]> {
+ let Latency = 14;
+ let NumMicroOps = 10;
+ let ResourceCycles = [2,4,1,3];
+}
+def: InstRW<[ICXWriteResGroup190], (instrs RCR8rCL)>;
+
+def ICXWriteResGroup191 : SchedWriteRes<[ICXPort0]> {
+ let Latency = 15;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup191], (instregex "DIVR_(FPrST0|FST0r|FrST0)")>;
+
+def ICXWriteResGroup194 : SchedWriteRes<[ICXPort1,ICXPort5,ICXPort01,ICXPort23,ICXPort015]> {
+ let Latency = 15;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,2,2,1,2];
+}
+def: InstRW<[ICXWriteResGroup194], (instregex "VPCONFLICTDZ128rm(b?)")>;
+
+def ICXWriteResGroup195 : SchedWriteRes<[ICXPort1,ICXPort23,ICXPort237,ICXPort06,ICXPort15,ICXPort0156]> {
+ let Latency = 15;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,1,1,5,1,1];
+}
+def: InstRW<[ICXWriteResGroup195], (instregex "RCL(8|16|32|64)mCL")>;
+
+def ICXWriteResGroup199 : SchedWriteRes<[ICXPort4,ICXPort23,ICXPort237,ICXPort06,ICXPort15,ICXPort0156]> {
+ let Latency = 16;
+ let NumMicroOps = 14;
+ let ResourceCycles = [1,1,1,4,2,5];
+}
+def: InstRW<[ICXWriteResGroup199], (instrs CMPXCHG8B)>;
+
+def ICXWriteResGroup200 : SchedWriteRes<[ICXPort1, ICXPort05, ICXPort6]> {
+ let Latency = 12;
+ let NumMicroOps = 34;
+ let ResourceCycles = [1, 4, 5];
+}
+def: InstRW<[ICXWriteResGroup200], (instrs VZEROALL)>;
+
+def ICXWriteResGroup201 : SchedWriteRes<[ICXPort0,ICXPort23,ICXFPDivider]> {
+ let Latency = 17;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1,5];
+}
+def : SchedAlias<WriteFDivXLd, ICXWriteResGroup201>; // TODO - convert to ZnWriteResFpuPair
+
+def ICXWriteResGroup202 : SchedWriteRes<[ICXPort0,ICXPort1,ICXPort5,ICXPort6,ICXPort05,ICXPort0156]> {
+ let Latency = 17;
+ let NumMicroOps = 15;
+ let ResourceCycles = [2,1,2,4,2,4];
+}
+def: InstRW<[ICXWriteResGroup202], (instrs XCH_F)>;
+
+def ICXWriteResGroup205 : SchedWriteRes<[ICXPort23,ICXPort01]> {
+ let Latency = 21;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[ICXWriteResGroup205], (instregex "VPMULLQZ128rm(b?)")>;
+
+def ICXWriteResGroup207 : SchedWriteRes<[ICXPort5,ICXPort6,ICXPort06,ICXPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,5];
+}
+def: InstRW<[ICXWriteResGroup207], (instrs CPUID, RDTSC)>;
+
+def ICXWriteResGroup208 : SchedWriteRes<[ICXPort1,ICXPort23,ICXPort237,ICXPort06,ICXPort15,ICXPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,1,1,4,1,2];
+}
+def: InstRW<[ICXWriteResGroup208], (instregex "RCR(8|16|32|64)mCL")>;
+
+def ICXWriteResGroup209 : SchedWriteRes<[ICXPort0,ICXPort23,ICXFPDivider]> {
+ let Latency = 19;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1,4];
+}
+def : SchedAlias<WriteFDiv64Ld, ICXWriteResGroup209>; // TODO - convert to ZnWriteResFpuPair
+
+def ICXWriteResGroup211 : SchedWriteRes<[ICXPort23,ICXPort01]> {
+ let Latency = 22;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[ICXWriteResGroup211], (instregex "VPMULLQZ256rm(b?)")>;
+
+def ICXWriteResGroup211_1 : SchedWriteRes<[ICXPort23,ICXPort05]> {
+ let Latency = 22;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[ICXWriteResGroup211_1], (instregex "VPMULLQZrm(b?)")>;
+
+def ICXWriteResGroup215 : SchedWriteRes<[ICXPort0]> {
+ let Latency = 20;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[ICXWriteResGroup215], (instregex "DIV_(FPrST0|FST0r|FrST0)")>;
+
+def ICXWriteResGroup216 : SchedWriteRes<[ICXPort0,ICXPort23,ICXFPDivider]> {
+ let Latency = 20;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1,4];
+}
+def : SchedAlias<WriteFDiv64XLd, ICXWriteResGroup216>; // TODO - convert to ZnWriteResFpuPair
+
+def ICXWriteGatherEVEX2 : SchedWriteRes<[ICXPort0,ICXPort23,ICXPort015,ICXPort0156]> {
+ let Latency = 17;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[ICXWriteGatherEVEX2], (instrs VGATHERQPSZ128rm, VPGATHERQDZ128rm,
+ VGATHERDPDZ128rm, VPGATHERDQZ128rm,
+ VGATHERQPDZ128rm, VPGATHERQQZ128rm)>;
+
+def ICXWriteGatherEVEX4 : SchedWriteRes<[ICXPort0,ICXPort23,ICXPort015,ICXPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,4,1,1];
+}
+def: InstRW<[ICXWriteGatherEVEX4], (instrs VGATHERQPSZ256rm, VPGATHERQDZ256rm,
+ VGATHERQPDZ256rm, VPGATHERQQZ256rm,
+ VGATHERDPSZ128rm, VPGATHERDDZ128rm,
+ VGATHERDPDZ256rm, VPGATHERDQZ256rm)>;
+
+def ICXWriteGatherEVEX8 : SchedWriteRes<[ICXPort0,ICXPort23,ICXPort015,ICXPort0156]> {
+ let Latency = 21;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,8,1,1];
+}
+def: InstRW<[ICXWriteGatherEVEX8], (instrs VGATHERDPSZ256rm, VPGATHERDDZ256rm,
+ VGATHERDPDZrm, VPGATHERDQZrm,
+ VGATHERQPDZrm, VPGATHERQQZrm,
+ VGATHERQPSZrm, VPGATHERQDZrm)>;
+
+def ICXWriteGatherEVEX16 : SchedWriteRes<[ICXPort0,ICXPort23,ICXPort015,ICXPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,16,1,1];
+}
+def: InstRW<[ICXWriteGatherEVEX16], (instrs VGATHERDPSZrm, VPGATHERDDZrm)>;
+
+def ICXWriteResGroup219 : SchedWriteRes<[ICXPort4,ICXPort5,ICXPort6,ICXPort23,ICXPort237,ICXPort06,ICXPort0156]> {
+ let Latency = 20;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,1,1,1,2];
+}
+def: InstRW<[ICXWriteResGroup219], (instrs INSB, INSL, INSW)>;
+
+def ICXWriteResGroup220 : SchedWriteRes<[ICXPort5,ICXPort6,ICXPort0156]> {
+ let Latency = 20;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,2,7];
+}
+def: InstRW<[ICXWriteResGroup220], (instrs MWAITrr)>;
+
+def ICXWriteResGroup222 : SchedWriteRes<[ICXPort0,ICXPort23,ICXFPDivider]> {
+ let Latency = 21;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1,8];
+}
+def : SchedAlias<WriteFDiv64YLd, ICXWriteResGroup222>; // TODO - convert to ZnWriteResFpuPair
+
+def ICXWriteResGroup223 : SchedWriteRes<[ICXPort0,ICXPort23]> {
+ let Latency = 22;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup223], (instregex "DIV_F(32|64)m")>;
+
+def ICXWriteResGroupVEX2 : SchedWriteRes<[ICXPort0, ICXPort23, ICXPort5, ICXPort015]> {
+ let Latency = 18;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[ICXWriteResGroupVEX2], (instrs VGATHERDPDrm, VPGATHERDQrm,
+ VGATHERQPDrm, VPGATHERQQrm,
+ VGATHERQPSrm, VPGATHERQDrm)>;
+
+def ICXWriteResGroupVEX4 : SchedWriteRes<[ICXPort0, ICXPort23, ICXPort5, ICXPort015]> {
+ let Latency = 20;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,4,1,1];
+}
+def: InstRW<[ICXWriteResGroupVEX4], (instrs VGATHERDPDYrm, VPGATHERDQYrm,
+ VGATHERDPSrm, VPGATHERDDrm,
+ VGATHERQPDYrm, VPGATHERQQYrm,
+ VGATHERQPSYrm, VPGATHERQDYrm)>;
+
+def ICXWriteResGroupVEX8 : SchedWriteRes<[ICXPort0, ICXPort23, ICXPort5, ICXPort015]> {
+ let Latency = 22;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,8,1,1];
+}
+def: InstRW<[ICXWriteResGroupVEX8], (instrs VGATHERDPSYrm, VPGATHERDDYrm)>;
+
+def ICXWriteResGroup225 : SchedWriteRes<[ICXPort5,ICXPort01,ICXPort015]> {
+ let Latency = 22;
+ let NumMicroOps = 14;
+ let ResourceCycles = [5,5,4];
+}
+def: InstRW<[ICXWriteResGroup225], (instregex "VPCONFLICTDZ128rr",
+ "VPCONFLICTQZ256rr")>;
+
+def ICXWriteResGroup228 : SchedWriteRes<[ICXPort0,ICXPort4,ICXPort5,ICXPort23,ICXPort237,ICXPort06,ICXPort0156]> {
+ let Latency = 23;
+ let NumMicroOps = 19;
+ let ResourceCycles = [2,1,4,1,1,4,6];
+}
+def: InstRW<[ICXWriteResGroup228], (instrs CMPXCHG16B)>;
+
+def ICXWriteResGroup233 : SchedWriteRes<[ICXPort0,ICXPort5,ICXPort23]> {
+ let Latency = 25;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup233], (instregex "DIV_FI(16|32)m")>;
+
+def ICXWriteResGroup239 : SchedWriteRes<[ICXPort0,ICXPort23]> {
+ let Latency = 27;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[ICXWriteResGroup239], (instregex "DIVR_F(32|64)m")>;
+
+def ICXWriteResGroup242 : SchedWriteRes<[ICXPort5,ICXPort01,ICXPort23,ICXPort015]> {
+ let Latency = 29;
+ let NumMicroOps = 15;
+ let ResourceCycles = [5,5,1,4];
+}
+def: InstRW<[ICXWriteResGroup242], (instregex "VPCONFLICTQZ256rm(b?)")>;
+
+def ICXWriteResGroup243 : SchedWriteRes<[ICXPort0,ICXPort5,ICXPort23]> {
+ let Latency = 30;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[ICXWriteResGroup243], (instregex "DIVR_FI(16|32)m")>;
+
+def ICXWriteResGroup247 : SchedWriteRes<[ICXPort5,ICXPort6,ICXPort23,ICXPort06,ICXPort0156]> {
+ let Latency = 35;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,3,4,10];
+}
+def: InstRW<[ICXWriteResGroup247], (instregex "IN(8|16|32)ri",
+ "IN(8|16|32)rr")>;
+
+def ICXWriteResGroup248 : SchedWriteRes<[ICXPort5,ICXPort6,ICXPort23,ICXPort237,ICXPort06,ICXPort0156]> {
+ let Latency = 35;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,2,1,4,10];
+}
+def: InstRW<[ICXWriteResGroup248], (instregex "OUT(8|16|32)ir",
+ "OUT(8|16|32)rr")>;
+
+def ICXWriteResGroup249 : SchedWriteRes<[ICXPort5,ICXPort01,ICXPort015]> {
+ let Latency = 37;
+ let NumMicroOps = 21;
+ let ResourceCycles = [9,7,5];
+}
+def: InstRW<[ICXWriteResGroup249], (instregex "VPCONFLICTDZ256rr",
+ "VPCONFLICTQZrr")>;
+
+def ICXWriteResGroup250 : SchedWriteRes<[ICXPort1,ICXPort6,ICXPort23,ICXPort0156]> {
+ let Latency = 37;
+ let NumMicroOps = 31;
+ let ResourceCycles = [1,8,1,21];
+}
+def: InstRW<[ICXWriteResGroup250], (instregex "XRSTOR(64)?")>;
+
+def ICXWriteResGroup252 : SchedWriteRes<[ICXPort1,ICXPort4,ICXPort5,ICXPort6,ICXPort23,ICXPort237,ICXPort15,ICXPort0156]> {
+ let Latency = 40;
+ let NumMicroOps = 18;
+ let ResourceCycles = [1,1,2,3,1,1,1,8];
+}
+def: InstRW<[ICXWriteResGroup252], (instrs VMCLEARm)>;
+
+def ICXWriteResGroup253 : SchedWriteRes<[ICXPort4,ICXPort6,ICXPort23,ICXPort237,ICXPort0156]> {
+ let Latency = 41;
+ let NumMicroOps = 39;
+ let ResourceCycles = [1,10,1,1,26];
+}
+def: InstRW<[ICXWriteResGroup253], (instrs XSAVE64)>;
+
+def ICXWriteResGroup254 : SchedWriteRes<[ICXPort5,ICXPort0156]> {
+ let Latency = 42;
+ let NumMicroOps = 22;
+ let ResourceCycles = [2,20];
+}
+def: InstRW<[ICXWriteResGroup254], (instrs RDTSCP)>;
+
+def ICXWriteResGroup255 : SchedWriteRes<[ICXPort4,ICXPort6,ICXPort23,ICXPort237,ICXPort0156]> {
+ let Latency = 42;
+ let NumMicroOps = 40;
+ let ResourceCycles = [1,11,1,1,26];
+}
+def: InstRW<[ICXWriteResGroup255], (instrs XSAVE)>;
+def: InstRW<[ICXWriteResGroup255], (instregex "XSAVEC", "XSAVES", "XSAVEOPT")>;
+
+def ICXWriteResGroup256 : SchedWriteRes<[ICXPort5,ICXPort01,ICXPort23,ICXPort015]> {
+ let Latency = 44;
+ let NumMicroOps = 22;
+ let ResourceCycles = [9,7,1,5];
+}
+def: InstRW<[ICXWriteResGroup256], (instregex "VPCONFLICTDZ256rm(b?)",
+ "VPCONFLICTQZrm(b?)")>;
+
+def ICXWriteResGroup258 : SchedWriteRes<[ICXPort0,ICXPort23,ICXPort05,ICXPort06,ICXPort0156]> {
+ let Latency = 62;
+ let NumMicroOps = 64;
+ let ResourceCycles = [2,8,5,10,39];
+}
+def: InstRW<[ICXWriteResGroup258], (instrs FLDENVm)>;
+
+def ICXWriteResGroup259 : SchedWriteRes<[ICXPort0,ICXPort6,ICXPort23,ICXPort05,ICXPort06,ICXPort15,ICXPort0156]> {
+ let Latency = 63;
+ let NumMicroOps = 88;
+ let ResourceCycles = [4,4,31,1,2,1,45];
+}
+def: InstRW<[ICXWriteResGroup259], (instrs FXRSTOR64)>;
+
+def ICXWriteResGroup260 : SchedWriteRes<[ICXPort0,ICXPort6,ICXPort23,ICXPort05,ICXPort06,ICXPort15,ICXPort0156]> {
+ let Latency = 63;
+ let NumMicroOps = 90;
+ let ResourceCycles = [4,2,33,1,2,1,47];
+}
+def: InstRW<[ICXWriteResGroup260], (instrs FXRSTOR)>;
+
+def ICXWriteResGroup261 : SchedWriteRes<[ICXPort5,ICXPort01,ICXPort015]> {
+ let Latency = 67;
+ let NumMicroOps = 35;
+ let ResourceCycles = [17,11,7];
+}
+def: InstRW<[ICXWriteResGroup261], (instregex "VPCONFLICTDZrr")>;
+
+def ICXWriteResGroup262 : SchedWriteRes<[ICXPort5,ICXPort01,ICXPort23,ICXPort015]> {
+ let Latency = 74;
+ let NumMicroOps = 36;
+ let ResourceCycles = [17,11,1,7];
+}
+def: InstRW<[ICXWriteResGroup262], (instregex "VPCONFLICTDZrm(b?)")>;
+
+def ICXWriteResGroup263 : SchedWriteRes<[ICXPort5,ICXPort05,ICXPort0156]> {
+ let Latency = 75;
+ let NumMicroOps = 15;
+ let ResourceCycles = [6,3,6];
+}
+def: InstRW<[ICXWriteResGroup263], (instrs FNINIT)>;
+
+def ICXWriteResGroup266 : SchedWriteRes<[ICXPort0,ICXPort1,ICXPort4,ICXPort5,ICXPort6,ICXPort237,ICXPort06,ICXPort0156]> {
+ let Latency = 106;
+ let NumMicroOps = 100;
+ let ResourceCycles = [9,1,11,16,1,11,21,30];
+}
+def: InstRW<[ICXWriteResGroup266], (instrs FSTENVm)>;
+
+def ICXWriteResGroup267 : SchedWriteRes<[ICXPort6,ICXPort0156]> {
+ let Latency = 140;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[ICXWriteResGroup267], (instrs PAUSE)>;
+
+def: InstRW<[WriteZero], (instrs CLC)>;
+
+
+// Instruction variants handled by the renamer. These might not need execution
+// ports in certain conditions.
+// See Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
+// section "Skylake Pipeline" > "Register allocation and renaming".
+// These can be investigated with llvm-exegesis, e.g.
+// echo 'pxor %mm0, %mm0' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+// echo 'vxorpd %xmm0, %xmm0, %xmm1' | /tmp/llvm-exegesis -mode=uops -snippets-file=-
+
+def ICXWriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+}
+
+def ICXWriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [ICXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[ICXWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr)>;
+
+def ICXWriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [ICXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogic]>
+]>;
+def : InstRW<[ICXWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr,
+ XORPDrr, VXORPDrr,
+ VXORPSZ128rr,
+ VXORPDZ128rr)>;
+
+def ICXWriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [ICXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[ICXWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
+ VXORPSZ256rr, VXORPDZ256rr)>;
+
+def ICXWriteFZeroIdiomZ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [ICXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicZ]>
+]>;
+def : InstRW<[ICXWriteFZeroIdiomZ], (instrs VXORPSZrr, VXORPDZrr)>;
+
+def ICXWriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [ICXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicX]>
+]>;
+def : InstRW<[ICXWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
+ VPXORDZ128rr, VPXORQZ128rr)>;
+
+def ICXWriteVZeroIdiomLogicY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [ICXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicY]>
+]>;
+def : InstRW<[ICXWriteVZeroIdiomLogicY], (instrs VPXORYrr,
+ VPXORDZ256rr, VPXORQZ256rr)>;
+
+def ICXWriteVZeroIdiomLogicZ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [ICXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicZ]>
+]>;
+def : InstRW<[ICXWriteVZeroIdiomLogicZ], (instrs VPXORDZrr, VPXORQZrr)>;
+
+def ICXWriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [ICXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUX]>
+]>;
+def : InstRW<[ICXWriteVZeroIdiomALUX], (instrs PCMPGTBrr, VPCMPGTBrr,
+ PCMPGTDrr, VPCMPGTDrr,
+ PCMPGTWrr, VPCMPGTWrr)>;
+
+def ICXWriteVZeroIdiomALUY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [ICXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUY]>
+]>;
+def : InstRW<[ICXWriteVZeroIdiomALUY], (instrs VPCMPGTBYrr,
+ VPCMPGTDYrr,
+ VPCMPGTWYrr)>;
+
+def ICXWritePSUB : SchedWriteRes<[ICXPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def ICXWriteVZeroIdiomPSUB : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [ICXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [ICXWritePSUB]>
+]>;
+
+def : InstRW<[ICXWriteVZeroIdiomPSUB], (instrs PSUBBrr, VPSUBBrr, VPSUBBZ128rr,
+ PSUBDrr, VPSUBDrr, VPSUBDZ128rr,
+ PSUBQrr, VPSUBQrr, VPSUBQZ128rr,
+ PSUBWrr, VPSUBWrr, VPSUBWZ128rr,
+ VPSUBBYrr, VPSUBBZ256rr,
+ VPSUBDYrr, VPSUBDZ256rr,
+ VPSUBQYrr, VPSUBQZ256rr,
+ VPSUBWYrr, VPSUBWZ256rr,
+ VPSUBBZrr,
+ VPSUBDZrr,
+ VPSUBQZrr,
+ VPSUBWZrr)>;
+def ICXWritePCMPGTQ : SchedWriteRes<[ICXPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+
+def ICXWriteVZeroIdiomPCMPGTQ : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [ICXWriteZeroLatency]>,
+ SchedVar<NoSchedPred, [ICXWritePCMPGTQ]>
+]>;
+def : InstRW<[ICXWriteVZeroIdiomPCMPGTQ], (instrs PCMPGTQrr, VPCMPGTQrr,
+ VPCMPGTQYrr)>;
+
+
+// CMOVs that use both the Z and C flags require an extra uop.
+def ICXWriteCMOVA_CMOVBErr : SchedWriteRes<[ICXPort06]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def ICXWriteCMOVA_CMOVBErm : SchedWriteRes<[ICXPort23,ICXPort06]> {
+ let Latency = 7;
+ let ResourceCycles = [1,2];
+ let NumMicroOps = 3;
+}
+
+def ICXCMOVA_CMOVBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArr_Or_CMOVBErr>, [ICXWriteCMOVA_CMOVBErr]>,
+ SchedVar<NoSchedPred, [WriteCMOV]>
+]>;
+
+def ICXCMOVA_CMOVBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsCMOVArm_Or_CMOVBErm>, [ICXWriteCMOVA_CMOVBErm]>,
+ SchedVar<NoSchedPred, [WriteCMOV.Folded]>
+]>;
+
+def : InstRW<[ICXCMOVA_CMOVBErr], (instrs CMOV16rr, CMOV32rr, CMOV64rr)>;
+def : InstRW<[ICXCMOVA_CMOVBErm], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
+
+// SETCCs that use both the Z and C flags require an extra uop.
+def ICXWriteSETA_SETBEr : SchedWriteRes<[ICXPort06]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def ICXWriteSETA_SETBEm : SchedWriteRes<[ICXPort4,ICXPort237,ICXPort06]> {
+ let Latency = 3;
+ let ResourceCycles = [1,1,2];
+ let NumMicroOps = 4;
+}
+
+def ICXSETA_SETBErr : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAr_Or_SETBEr>, [ICXWriteSETA_SETBEr]>,
+ SchedVar<NoSchedPred, [WriteSETCC]>
+]>;
+
+def ICXSETA_SETBErm : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsSETAm_Or_SETBEm>, [ICXWriteSETA_SETBEm]>,
+ SchedVar<NoSchedPred, [WriteSETCCStore]>
+]>;
+
+def : InstRW<[ICXSETA_SETBErr], (instrs SETCCr)>;
+def : InstRW<[ICXSETA_SETBErm], (instrs SETCCm)>;
+
+} // SchedModel
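
The zero-idiom variants near the end of the Ice Lake model above all follow one shape: when the renamer can eliminate the instruction (ZeroIdiomPredicate), the write resolves to ICXWriteZeroLatency (zero latency, no execution ports); otherwise it falls back to the ordinary write class. A minimal sketch of that pattern, using a hypothetical write class WriteFooX and instruction FOOrr as placeholders (neither is defined by this patch):

def ICXWriteVZeroIdiomFooX : SchedWriteVariant<[
  // Recognized zero idiom: handled at register rename, no execution needed.
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [ICXWriteZeroLatency]>,
  // Otherwise fall back to the placeholder write class.
  SchedVar<NoSchedPred, [WriteFooX]>
]>;
def : InstRW<[ICXWriteVZeroIdiomFooX], (instrs FOOrr)>;
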
diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td
index 2f7157f43268..c8d7b0f72c1c 100644
--- a/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -124,12 +124,17 @@ defm : X86WriteRes<WriteIMul16Imm, [SBPort1,SBPort015], 4, [1,1], 2>;
defm : X86WriteRes<WriteIMul16ImmLd, [SBPort1,SBPort015,SBPort23], 8, [1,1,1], 3>;
defm : SBWriteResPair<WriteIMul16Reg, [SBPort1], 3>;
defm : SBWriteResPair<WriteIMul32, [SBPort1,SBPort05,SBPort015], 4, [1,1,1], 3>;
+defm : SBWriteResPair<WriteMULX32, [SBPort1,SBPort05,SBPort015], 3, [1,1,1], 3>;
defm : SBWriteResPair<WriteIMul32Imm, [SBPort1], 3>;
defm : SBWriteResPair<WriteIMul32Reg, [SBPort1], 3>;
defm : SBWriteResPair<WriteIMul64, [SBPort1,SBPort0], 4, [1,1], 2>;
+defm : SBWriteResPair<WriteMULX64, [SBPort1,SBPort0], 3, [1,1], 2>;
defm : SBWriteResPair<WriteIMul64Imm, [SBPort1], 3>;
defm : SBWriteResPair<WriteIMul64Reg, [SBPort1], 3>;
-def : WriteRes<WriteIMulH, []> { let Latency = 3; }
+def SBWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 4; }
+def : WriteRes<WriteIMulHLd, []> {
+ let Latency = !add(SBWriteIMulH.Latency, SandyBridgeModel.LoadLatency);
+}
defm : X86WriteRes<WriteXCHG, [SBPort015], 2, [3], 3>;
defm : X86WriteRes<WriteBSWAP32, [SBPort1], 1, [1], 1>;
@@ -601,7 +606,7 @@ def SBWriteResGroup2 : SchedWriteRes<[SBPort5]> {
def: InstRW<[SBWriteResGroup2], (instrs FDECSTP, FINCSTP, FFREE, FFREEP, FNOP,
LD_Frr, ST_Frr, ST_FPrr)>;
def: InstRW<[SBWriteResGroup2], (instrs LOOP, LOOPE, LOOPNE)>; // FIXME: This seems wrong compared to other Intel CPUs.
-def: InstRW<[SBWriteResGroup2], (instrs RETQ)>;
+def: InstRW<[SBWriteResGroup2], (instrs RET64)>;
def SBWriteResGroup4 : SchedWriteRes<[SBPort05]> {
let Latency = 1;
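
The WriteIMulH/WriteIMulHLd change above derives the folded latency instead of hard-coding it: the model's LoadLatency is added to the register-form latency with TableGen's !add operator. A hedged sketch of the same idiom for a made-up model FooModel, assuming a LoadLatency of 5 (both names and the value are placeholders):

def FooWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 4; }
def : WriteRes<WriteIMulHLd, []> {
  // 4 (register-form high-part latency) + 5 (assumed FooModel.LoadLatency) = 9.
  let Latency = !add(FooWriteIMulH.Latency, FooModel.LoadLatency);
}
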
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 8486bdda0349..7d3229c3b023 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -122,12 +122,17 @@ defm : X86WriteRes<WriteIMul16Imm, [SKLPort1,SKLPort0156], 4, [1,1], 2>;
defm : X86WriteRes<WriteIMul16ImmLd, [SKLPort1,SKLPort0156,SKLPort23], 8, [1,1,1], 3>;
defm : SKLWriteResPair<WriteIMul16Reg, [SKLPort1], 3>;
defm : SKLWriteResPair<WriteIMul32, [SKLPort1,SKLPort06,SKLPort0156], 4, [1,1,1], 3>;
+defm : SKLWriteResPair<WriteMULX32, [SKLPort1,SKLPort06,SKLPort0156], 3, [1,1,1], 3>;
defm : SKLWriteResPair<WriteIMul32Imm, [SKLPort1], 3>;
defm : SKLWriteResPair<WriteIMul32Reg, [SKLPort1], 3>;
defm : SKLWriteResPair<WriteIMul64, [SKLPort1,SKLPort5], 4, [1,1], 2>;
+defm : SKLWriteResPair<WriteMULX64, [SKLPort1,SKLPort5], 3, [1,1], 2>;
defm : SKLWriteResPair<WriteIMul64Imm, [SKLPort1], 3>;
defm : SKLWriteResPair<WriteIMul64Reg, [SKLPort1], 3>;
-def : WriteRes<WriteIMulH, []> { let Latency = 3; }
+def SKLWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 4; }
+def : WriteRes<WriteIMulHLd, []> {
+ let Latency = !add(SKLWriteIMulH.Latency, SkylakeClientModel.LoadLatency);
+}
defm : X86WriteRes<WriteBSWAP32, [SKLPort15], 1, [1], 1>;
defm : X86WriteRes<WriteBSWAP64, [SKLPort06, SKLPort15], 2, [1,1], 2>;
@@ -1170,7 +1175,7 @@ def SKLWriteResGroup98 : SchedWriteRes<[SKLPort6,SKLPort23,SKLPort0156]> {
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKLWriteResGroup98], (instrs LRETQ, RETQ)>;
+def: InstRW<[SKLWriteResGroup98], (instrs LRET64, RET64)>;
def SKLWriteResGroup100 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> {
let Latency = 7;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index ba80d47c4eb6..1d8417aef41e 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -123,12 +123,17 @@ defm : X86WriteRes<WriteIMul16ImmLd, [SKXPort1,SKXPort0156,SKXPort23], 8, [1,1
defm : X86WriteRes<WriteIMul16Reg, [SKXPort1], 3, [1], 1>;
defm : X86WriteRes<WriteIMul16RegLd, [SKXPort1,SKXPort0156,SKXPort23], 8, [1,1,1], 3>;
defm : SKXWriteResPair<WriteIMul32, [SKXPort1,SKXPort06,SKXPort0156], 4, [1,1,1], 3>;
+defm : SKXWriteResPair<WriteMULX32, [SKXPort1,SKXPort06,SKXPort0156], 3, [1,1,1], 3>;
defm : SKXWriteResPair<WriteIMul32Imm, [SKXPort1], 3>;
defm : SKXWriteResPair<WriteIMul32Reg, [SKXPort1], 3>;
defm : SKXWriteResPair<WriteIMul64, [SKXPort1,SKXPort5], 4, [1,1], 2>;
+defm : SKXWriteResPair<WriteMULX64, [SKXPort1,SKXPort5], 3, [1,1], 2>;
defm : SKXWriteResPair<WriteIMul64Imm, [SKXPort1], 3>;
defm : SKXWriteResPair<WriteIMul64Reg, [SKXPort1], 3>;
-def : WriteRes<WriteIMulH, []> { let Latency = 3; }
+def SKXWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 4; }
+def : WriteRes<WriteIMulHLd, []> {
+ let Latency = !add(SKXWriteIMulH.Latency, SkylakeServerModel.LoadLatency);
+}
defm : X86WriteRes<WriteBSWAP32, [SKXPort15], 1, [1], 1>;
defm : X86WriteRes<WriteBSWAP64, [SKXPort06, SKXPort15], 2, [1,1], 2>;
@@ -1431,7 +1436,7 @@ def SKXWriteResGroup104 : SchedWriteRes<[SKXPort6,SKXPort23,SKXPort0156]> {
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SKXWriteResGroup104], (instrs LRETQ, RETQ)>;
+def: InstRW<[SKXWriteResGroup104], (instrs LRET64, RET64)>;
def SKXWriteResGroup106 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
let Latency = 7;
diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td
index 09148fc19e57..1cb48175260a 100644
--- a/llvm/lib/Target/X86/X86Schedule.td
+++ b/llvm/lib/Target/X86/X86Schedule.td
@@ -87,8 +87,10 @@ class X86SchedWriteWidths<X86FoldableSchedWrite sScl,
}
// Multiclass that wraps X86SchedWriteWidths for each fp vector type.
-class X86SchedWriteSizes<X86SchedWriteWidths sPS,
+class X86SchedWriteSizes<X86SchedWriteWidths sPH,
+ X86SchedWriteWidths sPS,
X86SchedWriteWidths sPD> {
+ X86SchedWriteWidths PH = sPH;
X86SchedWriteWidths PS = sPS;
X86SchedWriteWidths PD = sPD;
}
@@ -146,7 +148,10 @@ defm WriteIMul32Reg : X86SchedWritePair; // Integer 32-bit multiplication by reg
defm WriteIMul64 : X86SchedWritePair; // Integer 64-bit multiplication.
defm WriteIMul64Imm : X86SchedWritePair; // Integer 64-bit multiplication by immediate.
defm WriteIMul64Reg : X86SchedWritePair; // Integer 64-bit multiplication by register.
-def WriteIMulH : SchedWrite; // Integer multiplication, high part.
+defm WriteMULX32 : X86SchedWritePair; // Integer 32-bit Multiplication without affecting flags.
+defm WriteMULX64 : X86SchedWritePair; // Integer 64-bit Multiplication without affecting flags.
+def WriteIMulH : SchedWrite; // Integer multiplication, high part (only used by the RR variant of MULX).
+def WriteIMulHLd : SchedWrite; // Integer multiplication, high part (only used by the RM variant of MULX).
def WriteBSWAP32 : SchedWrite; // Byte Order (Endianness) 32-bit Swap.
def WriteBSWAP64 : SchedWrite; // Byte Order (Endianness) 64-bit Swap.
@@ -681,20 +686,22 @@ def SchedWriteVarBlend
WriteVarBlendY, WriteVarBlendZ>;
// Vector size wrappers.
+// FIXME: Currently PH uses the same scheduling classes as PS.
+// We may refine them later.
def SchedWriteFAddSizes
- : X86SchedWriteSizes<SchedWriteFAdd, SchedWriteFAdd64>;
+ : X86SchedWriteSizes<SchedWriteFAdd, SchedWriteFAdd, SchedWriteFAdd64>;
def SchedWriteFCmpSizes
- : X86SchedWriteSizes<SchedWriteFCmp, SchedWriteFCmp64>;
+ : X86SchedWriteSizes<SchedWriteFCmp, SchedWriteFCmp, SchedWriteFCmp64>;
def SchedWriteFMulSizes
- : X86SchedWriteSizes<SchedWriteFMul, SchedWriteFMul64>;
+ : X86SchedWriteSizes<SchedWriteFMul, SchedWriteFMul, SchedWriteFMul64>;
def SchedWriteFDivSizes
- : X86SchedWriteSizes<SchedWriteFDiv, SchedWriteFDiv64>;
+ : X86SchedWriteSizes<SchedWriteFDiv, SchedWriteFDiv, SchedWriteFDiv64>;
def SchedWriteFSqrtSizes
- : X86SchedWriteSizes<SchedWriteFSqrt, SchedWriteFSqrt64>;
+ : X86SchedWriteSizes<SchedWriteFSqrt, SchedWriteFSqrt, SchedWriteFSqrt64>;
def SchedWriteFLogicSizes
- : X86SchedWriteSizes<SchedWriteFLogic, SchedWriteFLogic>;
+ : X86SchedWriteSizes<SchedWriteFLogic, SchedWriteFLogic, SchedWriteFLogic>;
def SchedWriteFShuffleSizes
- : X86SchedWriteSizes<SchedWriteFShuffle, SchedWriteFShuffle>;
+ : X86SchedWriteSizes<SchedWriteFShuffle, SchedWriteFShuffle, SchedWriteFShuffle>;
//===----------------------------------------------------------------------===//
// Generic Processor Scheduler Models.
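
With the extra sPH parameter, every X86SchedWriteSizes bundle now carries three width groups (PH, PS, PD), and for now PH simply reuses the PS schedules. Assuming X86SchedWriteWidths keeps its usual per-width fields (Scl/XMM/YMM/ZMM), a hypothetical new bundle could be sketched as:

// Placeholder names: SchedWriteFoo and SchedWriteFoo64 stand in for real
// X86SchedWriteWidths groups.
def SchedWriteFooSizes
    : X86SchedWriteSizes<SchedWriteFoo, SchedWriteFoo, SchedWriteFoo64>;
// Consumers select a type and width, e.g. SchedWriteFooSizes.PH.XMM, which
// for now names the same write as SchedWriteFooSizes.PS.XMM.
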
diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td
index d00c2e3718d3..6fd98280f560 100644
--- a/llvm/lib/Target/X86/X86ScheduleAtom.td
+++ b/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -56,17 +56,21 @@ multiclass AtomWriteResPair<X86FoldableSchedWrite SchedRW,
list<ProcResourceKind> RMPorts,
int RRLat = 1, int RMLat = 1,
list<int> RRRes = [1],
- list<int> RMRes = [1]> {
+ list<int> RMRes = [1],
+ int RRUOps = 1,
+ int RMUOps = 1> {
// Register variant.
def : WriteRes<SchedRW, RRPorts> {
let Latency = RRLat;
let ResourceCycles = RRRes;
+ let NumMicroOps = RRUOps;
}
// Memory variant.
def : WriteRes<SchedRW.Folded, RMPorts> {
let Latency = RMLat;
let ResourceCycles = RMRes;
+ let NumMicroOps = RMUOps;
}
}
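
With the new RRUOps/RMUOps parameters, each AtomWriteResPair instantiation expands into a register record and a folded (memory) record with independent uop counts. As a rough sketch, the WriteIMul32 pair further down (latencies 6/7, resource cycles [6,6]/[7,7], 3/4 uops) expands to approximately:

def : WriteRes<WriteIMul32, [AtomPort0, AtomPort1]> {
  let Latency = 6;              // RRLat
  let ResourceCycles = [6, 6];  // RRRes
  let NumMicroOps = 3;          // RRUOps
}
def : WriteRes<WriteIMul32.Folded, [AtomPort0, AtomPort1]> {
  let Latency = 7;              // RMLat
  let ResourceCycles = [7, 7];  // RMRes
  let NumMicroOps = 4;          // RMUOps
}
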
@@ -80,17 +84,20 @@ def : WriteRes<WriteRMW, [AtomPort0]>;
defm : AtomWriteResPair<WriteALU, [AtomPort01], [AtomPort0]>;
defm : AtomWriteResPair<WriteADC, [AtomPort01], [AtomPort0]>;
-defm : AtomWriteResPair<WriteIMul8, [AtomPort01], [AtomPort01], 7, 7, [7], [7]>;
-defm : AtomWriteResPair<WriteIMul16, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
-defm : AtomWriteResPair<WriteIMul16Imm, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
-defm : AtomWriteResPair<WriteIMul16Reg, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
-defm : AtomWriteResPair<WriteIMul32, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>;
+defm : AtomWriteResPair<WriteIMul8, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 7, [7,7], [7,7], 3, 3>;
+defm : AtomWriteResPair<WriteIMul16, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [7,7], [8,8], 4, 5>;
+defm : AtomWriteResPair<WriteIMul16Imm, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [6,6], [7,7], 2, 3>;
+defm : AtomWriteResPair<WriteIMul16Reg, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [6,6], [7,7], 2, 3>;
+defm : AtomWriteResPair<WriteIMul32, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [6,6], [7,7], 3, 4>;
defm : AtomWriteResPair<WriteIMul32Imm, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
defm : AtomWriteResPair<WriteIMul32Reg, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
-defm : AtomWriteResPair<WriteIMul64, [AtomPort01], [AtomPort01], 12, 12, [12], [12]>;
-defm : AtomWriteResPair<WriteIMul64Imm, [AtomPort01], [AtomPort01], 14, 14, [14], [14]>;
-defm : AtomWriteResPair<WriteIMul64Reg, [AtomPort01], [AtomPort01], 12, 12, [12], [12]>;
+defm : AtomWriteResPair<WriteIMul64, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 12, 12, [12,12], [12,12], 8, 8>;
+defm : AtomWriteResPair<WriteIMul64Imm, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 14, 14, [14,14], [14,14], 7, 7>;
+defm : AtomWriteResPair<WriteIMul64Reg, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 12, 12, [12,12], [12,12], 6, 6>;
defm : X86WriteResUnsupported<WriteIMulH>;
+defm : X86WriteResUnsupported<WriteIMulHLd>;
+defm : X86WriteResPairUnsupported<WriteMULX32>;
+defm : X86WriteResPairUnsupported<WriteMULX64>;
defm : X86WriteRes<WriteXCHG, [AtomPort01], 2, [2], 1>;
defm : X86WriteRes<WriteBSWAP32, [AtomPort0], 1, [1], 1>;
@@ -98,14 +105,14 @@ defm : X86WriteRes<WriteBSWAP64, [AtomPort0], 1, [1], 1>;
defm : AtomWriteResPair<WriteCMPXCHG, [AtomPort01], [AtomPort01], 15, 15, [15]>;
defm : X86WriteRes<WriteCMPXCHGRMW, [AtomPort01, AtomPort0], 1, [1, 1], 1>;
-defm : AtomWriteResPair<WriteDiv8, [AtomPort01], [AtomPort01], 50, 68, [50], [68]>;
-defm : AtomWriteResPair<WriteDiv16, [AtomPort01], [AtomPort01], 50, 50, [50], [50]>;
-defm : AtomWriteResPair<WriteDiv32, [AtomPort01], [AtomPort01], 50, 50, [50], [50]>;
-defm : AtomWriteResPair<WriteDiv64, [AtomPort01], [AtomPort01],130,130,[130],[130]>;
-defm : AtomWriteResPair<WriteIDiv8, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
-defm : AtomWriteResPair<WriteIDiv16, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
-defm : AtomWriteResPair<WriteIDiv32, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
-defm : AtomWriteResPair<WriteIDiv64, [AtomPort01], [AtomPort01],130,130,[130],[130]>;
+defm : AtomWriteResPair<WriteDiv8, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 50, 68, [50,50], [68,68], 9, 9>;
+defm : AtomWriteResPair<WriteDiv16, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 50, 50, [50,50], [50,50], 12, 12>;
+defm : AtomWriteResPair<WriteDiv32, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 50, 50, [50,50], [50,50], 12, 12>;
+defm : AtomWriteResPair<WriteDiv64, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1],130,130,[130,130],[130,130], 38, 38>;
+defm : AtomWriteResPair<WriteIDiv8, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 62, 62, [62,62], [62,62], 26, 26>;
+defm : AtomWriteResPair<WriteIDiv16, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 62, 62, [62,62], [62,62], 29, 29>;
+defm : AtomWriteResPair<WriteIDiv32, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 62, 62, [62,62], [62,62], 29, 29>;
+defm : AtomWriteResPair<WriteIDiv64, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1],130,130,[130,130],[130,130], 60, 60>;
defm : X86WriteResPairUnsupported<WriteCRC32>;
@@ -132,8 +139,8 @@ defm : X86WriteRes<WriteBitTestSet, [AtomPort1], 1, [1], 1>;
def : WriteRes<WriteLEA, [AtomPort1]>;
// Bit counts.
-defm : AtomWriteResPair<WriteBSF, [AtomPort01], [AtomPort01], 16, 16, [16], [16]>;
-defm : AtomWriteResPair<WriteBSR, [AtomPort01], [AtomPort01], 16, 16, [16], [16]>;
+defm : AtomWriteResPair<WriteBSF, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 16, 16, [16,16], [16,16], 10, 10>;
+defm : AtomWriteResPair<WriteBSR, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 16, 16, [16,16], [16,16], 10, 10>;
defm : X86WriteResPairUnsupported<WritePOPCNT>;
defm : X86WriteResPairUnsupported<WriteLZCNT>;
defm : X86WriteResPairUnsupported<WriteTZCNT>;
@@ -230,52 +237,52 @@ defm : AtomWriteResPair<WriteFAddX, [AtomPort1], [AtomPort0,AtomPort1],
defm : X86WriteResPairUnsupported<WriteFAddY>;
defm : X86WriteResPairUnsupported<WriteFAddZ>;
defm : AtomWriteResPair<WriteFAdd64, [AtomPort1], [AtomPort0,AtomPort1], 5, 5, [1], [1,1]>;
-defm : AtomWriteResPair<WriteFAdd64X, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>;
+defm : AtomWriteResPair<WriteFAdd64X, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6], 3, 4>;
defm : X86WriteResPairUnsupported<WriteFAdd64Y>;
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
defm : AtomWriteResPair<WriteFCmp, [AtomPort1], [AtomPort0,AtomPort1], 5, 5, [1], [1,1]>;
-defm : AtomWriteResPair<WriteFCmpX, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>;
+defm : AtomWriteResPair<WriteFCmpX, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6], 3, 4>;
defm : X86WriteResPairUnsupported<WriteFCmpY>;
defm : X86WriteResPairUnsupported<WriteFCmpZ>;
defm : AtomWriteResPair<WriteFCmp64, [AtomPort1], [AtomPort0,AtomPort1], 5, 5, [1], [1,1]>;
-defm : AtomWriteResPair<WriteFCmp64X, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>;
+defm : AtomWriteResPair<WriteFCmp64X, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6], 3, 4>;
defm : X86WriteResPairUnsupported<WriteFCmp64Y>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : AtomWriteResPair<WriteFCom, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
-defm : AtomWriteResPair<WriteFComX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFComX, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 9, 10, [9,9],[10,10], 4, 5>;
defm : AtomWriteResPair<WriteFMul, [AtomPort0], [AtomPort0], 4, 4, [2], [2]>;
defm : AtomWriteResPair<WriteFMulX, [AtomPort0], [AtomPort0], 5, 5, [2], [2]>;
defm : X86WriteResPairUnsupported<WriteFMulY>;
defm : X86WriteResPairUnsupported<WriteFMulZ>;
defm : AtomWriteResPair<WriteFMul64, [AtomPort0], [AtomPort0], 5, 5, [2], [2]>;
-defm : AtomWriteResPair<WriteFMul64X, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 9, 10, [9,9], [10,10]>;
+defm : AtomWriteResPair<WriteFMul64X, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 9, 10, [9,9],[10,10], 6, 7>;
defm : X86WriteResPairUnsupported<WriteFMul64Y>;
defm : X86WriteResPairUnsupported<WriteFMul64Z>;
defm : AtomWriteResPair<WriteFRcp, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
-defm : AtomWriteResPair<WriteFRcpX, [AtomPort01], [AtomPort01], 9, 10, [9], [10]>;
+defm : AtomWriteResPair<WriteFRcpX, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 9, 10, [9,9], [10,10], 5, 6>;
defm : X86WriteResPairUnsupported<WriteFRcpY>;
defm : X86WriteResPairUnsupported<WriteFRcpZ>;
defm : AtomWriteResPair<WriteFRsqrt, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
-defm : AtomWriteResPair<WriteFRsqrtX, [AtomPort01], [AtomPort01], 9, 10, [9], [10]>;
+defm : AtomWriteResPair<WriteFRsqrtX, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 9, 10, [9,9], [10,10], 5, 6>;
defm : X86WriteResPairUnsupported<WriteFRsqrtY>;
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
-defm : AtomWriteResPair<WriteFDiv, [AtomPort01], [AtomPort01], 34, 34, [34], [34]>;
-defm : AtomWriteResPair<WriteFDivX, [AtomPort01], [AtomPort01], 70, 70, [70], [70]>;
+defm : AtomWriteResPair<WriteFDiv, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 34, 34, [34,34], [34,34], 3, 4>;
+defm : AtomWriteResPair<WriteFDivX, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 70, 70, [70,70], [70,70], 6, 7>;
defm : X86WriteResPairUnsupported<WriteFDivY>;
defm : X86WriteResPairUnsupported<WriteFDivZ>;
-defm : AtomWriteResPair<WriteFDiv64, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
-defm : AtomWriteResPair<WriteFDiv64X, [AtomPort01], [AtomPort01],125,125,[125],[125]>;
+defm : AtomWriteResPair<WriteFDiv64, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 62, 62, [62,62], [62,62], 3, 4>;
+defm : AtomWriteResPair<WriteFDiv64X, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1],125,125,[125,125],[125,125], 6, 7>;
defm : X86WriteResPairUnsupported<WriteFDiv64Y>;
defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
-defm : AtomWriteResPair<WriteFSqrt, [AtomPort01], [AtomPort01], 34, 34, [34], [34]>;
-defm : AtomWriteResPair<WriteFSqrtX, [AtomPort01], [AtomPort01], 70, 70, [70], [70]>;
+defm : AtomWriteResPair<WriteFSqrt, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 34, 34, [34,34], [34,34], 3, 4>;
+defm : AtomWriteResPair<WriteFSqrtX, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 70, 70, [70,70], [70,70], 5, 6>;
defm : X86WriteResPairUnsupported<WriteFSqrtY>;
defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
-defm : AtomWriteResPair<WriteFSqrt64, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
-defm : AtomWriteResPair<WriteFSqrt64X, [AtomPort01], [AtomPort01],125,125,[125],[125]>;
+defm : AtomWriteResPair<WriteFSqrt64, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 62, 62, [62,62], [62,62], 3, 4>;
+defm : AtomWriteResPair<WriteFSqrt64X, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1],125,125,[125,125],[125,125], 5, 6>;
defm : X86WriteResPairUnsupported<WriteFSqrt64Y>;
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
-defm : AtomWriteResPair<WriteFSqrt80, [AtomPort01], [AtomPort01], 71, 71, [71], [71]>;
+defm : AtomWriteResPair<WriteFSqrt80, [AtomPort0], [AtomPort0], 71, 71, [71], [71]>;
defm : AtomWriteResPair<WriteFSign, [AtomPort1], [AtomPort1]>;
defm : AtomWriteResPair<WriteFRnd, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
defm : X86WriteResPairUnsupported<WriteFRndY>;
@@ -389,8 +396,8 @@ defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
defm : X86WriteResPairUnsupported<WriteVecTest>;
defm : X86WriteResPairUnsupported<WriteVecTestY>;
defm : X86WriteResPairUnsupported<WriteVecTestZ>;
-defm : AtomWriteResPair<WriteVecShift, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 2, 3, [1,1], [2,2]>;
-defm : AtomWriteResPair<WriteVecShiftX, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 2, 3, [1,1], [2,2]>;
+defm : AtomWriteResPair<WriteVecShift, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 2, 3, [1,1], [2,2], 2, 3>;
+defm : AtomWriteResPair<WriteVecShiftX, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 2, 3, [1,1], [2,2], 2, 3>;
defm : X86WriteResPairUnsupported<WriteVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
defm : AtomWriteResPair<WriteVecShiftImm, [AtomPort0], [AtomPort0], 1, 1>;
@@ -417,7 +424,7 @@ defm : AtomWriteResPair<WriteShuffleX, [AtomPort0], [AtomPort0], 1, 1>;
defm : X86WriteResPairUnsupported<WriteShuffleY>;
defm : X86WriteResPairUnsupported<WriteShuffleZ>;
defm : AtomWriteResPair<WriteVarShuffle, [AtomPort0], [AtomPort0], 1, 1>;
-defm : AtomWriteResPair<WriteVarShuffleX, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 4, 5, [3,3], [4,4]>;
+defm : AtomWriteResPair<WriteVarShuffleX, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 4, 5, [3,3], [4,4], 4, 5>;
defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
defm : X86WriteResPairUnsupported<WriteBlend>;
@@ -471,11 +478,11 @@ defm : X86WriteResPairUnsupported<WriteAESDecEnc>;
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-defm : AtomWriteResPair<WriteFHAdd, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>;
-defm : AtomWriteResPair<WriteFHAddY, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>;
-defm : AtomWriteResPair<WritePHAdd, [AtomPort01], [AtomPort01], 3, 4, [3], [4]>;
-defm : AtomWriteResPair<WritePHAddX, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
-defm : AtomWriteResPair<WritePHAddY, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>;
+defm : AtomWriteResPair<WriteFHAdd, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 8, 9, [8,8], [9,9], 5, 6>;
+defm : X86WriteResPairUnsupported<WriteFHAddY>;
+defm : AtomWriteResPair<WritePHAdd, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 3, 4, [3,3], [4,4], 3, 4>;
+defm : AtomWriteResPair<WritePHAddX, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [7,7], [8,8], 3, 4>;
+defm : X86WriteResPairUnsupported<WritePHAddY>;
////////////////////////////////////////////////////////////////////////////////
// Carry-less multiplication instructions.
@@ -487,8 +494,8 @@ defm : X86WriteResPairUnsupported<WriteCLMul>;
// Load/store MXCSR.
////////////////////////////////////////////////////////////////////////////////
-def : WriteRes<WriteLDMXCSR, [AtomPort01]> { let Latency = 5; let ResourceCycles = [5]; }
-def : WriteRes<WriteSTMXCSR, [AtomPort01]> { let Latency = 15; let ResourceCycles = [15]; }
+defm : X86WriteRes<WriteLDMXCSR, [AtomPort0,AtomPort1], 5, [5,5], 4>;
+defm : X86WriteRes<WriteSTMXCSR, [AtomPort0,AtomPort1], 15, [15,15], 4>;
////////////////////////////////////////////////////////////////////////////////
// Special Cases.
@@ -533,7 +540,7 @@ def : InstRW<[AtomWrite0_1_1], (instrs POP32r, POP64r,
PUSH16rmr, PUSH32rmr, PUSH64rmr,
PUSH16i8, PUSH32i8, PUSH64i8, PUSH64i32,
XCH_F)>;
-def : InstRW<[AtomWrite0_1_1], (instregex "RETI(L|Q|W)$",
+def : InstRW<[AtomWrite0_1_1], (instregex "RETI(16|32|64)$",
"IRET(16|32|64)?")>;
def AtomWrite0_1_5 : SchedWriteRes<[AtomPort0, AtomPort1]> {
@@ -551,10 +558,7 @@ def AtomWrite01_1 : SchedWriteRes<[AtomPort01]> {
def : InstRW<[AtomWrite01_1], (instrs FDECSTP, FFREE, FFREEP, FINCSTP, WAIT,
LFENCE,
STOSB, STOSL, STOSQ, STOSW,
- MOVSSrr, MOVSSrr_REV,
- PSLLDQri, PSRLDQri)>;
-def : InstRW<[AtomWrite01_1], (instregex "MMX_PACK(SSDW|SSWB|USWB)irr",
- "MMX_PUNPCKH(BW|DQ|WD)irr")>;
+ MOVSSrr, MOVSSrr_REV)>;
def AtomWrite01_2 : SchedWriteRes<[AtomPort01]> {
let Latency = 2;
@@ -644,7 +648,6 @@ def : InstRW<[AtomWrite01_9], (instrs POPA16, POPA32,
SHLD64rri8, SHRD64rri8,
CMPXCHG8rr)>;
def : InstRW<[AtomWrite01_9], (instregex "(U)?COM_FI", "TST_F",
- "(U)?COMIS(D|S)rr",
"CVT(T)?SS2SI64rr(_Int)?")>;
def AtomWrite01_10 : SchedWriteRes<[AtomPort01]> {
@@ -652,8 +655,7 @@ def AtomWrite01_10 : SchedWriteRes<[AtomPort01]> {
let ResourceCycles = [10];
}
def : SchedAlias<WriteFLDC, AtomWrite01_10>;
-def : InstRW<[AtomWrite01_10], (instregex "(U)?COMIS(D|S)rm",
- "CVT(T)?SS2SI64rm(_Int)?")>;
+def : InstRW<[AtomWrite01_10], (instregex "CVT(T)?SS2SI64rm(_Int)?")>;
def AtomWrite01_11 : SchedWriteRes<[AtomPort01]> {
let Latency = 11;
@@ -817,8 +819,8 @@ def AtomWrite01_79 : SchedWriteRes<[AtomPort01]> {
let Latency = 79;
let ResourceCycles = [79];
}
-def : InstRW<[AtomWrite01_79], (instregex "RET(L|Q|W)?$",
- "LRETI?(L|Q|W)")>;
+def : InstRW<[AtomWrite01_79], (instregex "RET(16|32|64)?$",
+ "LRETI?(16|32|64)")>;
def AtomWrite01_92 : SchedWriteRes<[AtomPort01]> {
let Latency = 92;
diff --git a/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
index 99d4011dae77..4c16b5b52b1d 100644
--- a/llvm/lib/Target/X86/X86ScheduleBdVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
@@ -435,7 +435,12 @@ defm : PdWriteResExPair<WriteIMul32Reg, [PdEX1, PdMul], 4, [1, 2]>;
defm : PdWriteResExPair<WriteIMul64, [PdEX1, PdMul], 6, [1, 6]>;
defm : PdWriteResExPair<WriteIMul64Imm, [PdEX1, PdMul], 6, [1, 4],1, 1>;
defm : PdWriteResExPair<WriteIMul64Reg, [PdEX1, PdMul], 6, [1, 4]>;
-defm : X86WriteResUnsupported<WriteIMulH>; // BMI2 MULX
+
+// BMI2 MULX
+defm : X86WriteResUnsupported<WriteIMulH>;
+defm : X86WriteResUnsupported<WriteIMulHLd>;
+defm : X86WriteResPairUnsupported<WriteMULX32>;
+defm : X86WriteResPairUnsupported<WriteMULX64>;
defm : PdWriteResExPair<WriteDiv8, [PdEX1, PdDiv], 12, [1, 12]>;
defm : PdWriteResExPair<WriteDiv16, [PdEX1, PdDiv], 15, [1, 15], 2>;
diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index cdd03830bcad..68ebaa244acf 100644
--- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -209,7 +209,10 @@ defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>;
defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>;
defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>;
-defm : X86WriteRes<WriteIMulH, [JALU1], 6, [4], 1>;
+defm : X86WriteResUnsupported<WriteIMulH>;
+defm : X86WriteResUnsupported<WriteIMulHLd>;
+defm : X86WriteResPairUnsupported<WriteMULX32>;
+defm : X86WriteResPairUnsupported<WriteMULX64>;
defm : JWriteResIntPair<WriteDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
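
MULX is a BMI2 instruction, so models without it mark WriteMULX32/WriteMULX64 (and the WriteIMulH/WriteIMulHLd writes they pair with) as unsupported. Assuming the X86WriteResUnsupported/X86WriteResPairUnsupported helpers in X86Schedule.td keep their usual shape, the effect is roughly equivalent to emitting empty WriteRes records with the Unsupported bit set:

// Hedged sketch of what the *Unsupported markers amount to for one pair.
let Unsupported = 1 in {
  def : WriteRes<WriteMULX32, []>;
  def : WriteRes<WriteMULX32.Folded, []>;
}
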
diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td
index 123844a73a59..5af9835f75a7 100644
--- a/llvm/lib/Target/X86/X86ScheduleSLM.td
+++ b/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -62,7 +62,7 @@ def : ReadAdvance<ReadInt2Fpu, 0>;
multiclass SLMWriteResPair<X86FoldableSchedWrite SchedRW,
list<ProcResourceKind> ExePorts,
int Lat, list<int> Res = [1], int UOps = 1,
- int LoadLat = 3> {
+ int LoadUOps = 0, int LoadLat = 3> {
// Register variant is using a single cycle on ExePort.
def : WriteRes<SchedRW, ExePorts> {
let Latency = Lat;
@@ -75,13 +75,13 @@ multiclass SLMWriteResPair<X86FoldableSchedWrite SchedRW,
def : WriteRes<SchedRW.Folded, !listconcat([SLM_MEC_RSV], ExePorts)> {
let Latency = !add(Lat, LoadLat);
let ResourceCycles = !listconcat([1], Res);
- let NumMicroOps = UOps;
+ let NumMicroOps = !add(UOps, LoadUOps);
}
}
-// A folded store needs a cycle on MEC_RSV for the store data, but it does not
-// need an extra port cycle to recompute the address.
-def : WriteRes<WriteRMW, [SLM_MEC_RSV]>;
+// A folded store needs a cycle on MEC_RSV for the store data (using the same uop),
+// but it does not need an extra port cycle to recompute the address.
+def : WriteRes<WriteRMW, [SLM_MEC_RSV]> { let NumMicroOps = 0; }
def : WriteRes<WriteStore, [SLM_IEC_RSV01, SLM_MEC_RSV]>;
def : WriteRes<WriteStoreNT, [SLM_IEC_RSV01, SLM_MEC_RSV]>;
@@ -101,17 +101,20 @@ def : InstRW<[WriteMove], (instrs COPY)>;
defm : SLMWriteResPair<WriteALU, [SLM_IEC_RSV01], 1>;
defm : SLMWriteResPair<WriteADC, [SLM_IEC_RSV01], 1>;
-defm : SLMWriteResPair<WriteIMul8, [SLM_IEC_RSV1], 3>;
-defm : SLMWriteResPair<WriteIMul16, [SLM_IEC_RSV1], 3>;
-defm : SLMWriteResPair<WriteIMul16Imm, [SLM_IEC_RSV1], 3>;
-defm : SLMWriteResPair<WriteIMul16Reg, [SLM_IEC_RSV1], 3>;
-defm : SLMWriteResPair<WriteIMul32, [SLM_IEC_RSV1], 3>;
+defm : SLMWriteResPair<WriteIMul8, [SLM_IEC_RSV1], 5, [5], 3>;
+defm : SLMWriteResPair<WriteIMul16, [SLM_IEC_RSV1], 5, [5], 4, 1>;
+defm : SLMWriteResPair<WriteIMul16Imm, [SLM_IEC_RSV1], 4, [4], 2, 1>;
+defm : SLMWriteResPair<WriteIMul16Reg, [SLM_IEC_RSV1], 4, [4], 2, 1>;
+defm : SLMWriteResPair<WriteIMul32, [SLM_IEC_RSV1], 5, [5], 3, 1>;
defm : SLMWriteResPair<WriteIMul32Imm, [SLM_IEC_RSV1], 3>;
defm : SLMWriteResPair<WriteIMul32Reg, [SLM_IEC_RSV1], 3>;
-defm : SLMWriteResPair<WriteIMul64, [SLM_IEC_RSV1], 3>;
-defm : SLMWriteResPair<WriteIMul64Imm, [SLM_IEC_RSV1], 3>;
-defm : SLMWriteResPair<WriteIMul64Reg, [SLM_IEC_RSV1], 3>;
-def : WriteRes<WriteIMulH, [SLM_FPC_RSV0]>;
+defm : SLMWriteResPair<WriteIMul64, [SLM_IEC_RSV1], 7, [7], 3>;
+defm : SLMWriteResPair<WriteIMul64Imm, [SLM_IEC_RSV1], 5, [2]>;
+defm : SLMWriteResPair<WriteIMul64Reg, [SLM_IEC_RSV1], 5, [2]>;
+defm : X86WriteResUnsupported<WriteIMulH>;
+defm : X86WriteResUnsupported<WriteIMulHLd>;
+defm : X86WriteResPairUnsupported<WriteMULX32>;
+defm : X86WriteResPairUnsupported<WriteMULX64>;
defm : X86WriteRes<WriteBSWAP32, [SLM_IEC_RSV01], 1, [1], 1>;
defm : X86WriteRes<WriteBSWAP64, [SLM_IEC_RSV01], 1, [1], 1>;
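
The added LoadUOps argument only changes the folded (memory) record; the register record keeps UOps as before. Taking the WriteIMul16 pair above (latency 5, resource cycles [5], 4 uops, 1 extra load uop, default 3-cycle load latency), the memory form expands to roughly:

def : WriteRes<WriteIMul16.Folded, [SLM_MEC_RSV, SLM_IEC_RSV1]> {
  let Latency = 8;             // Lat (5) + default LoadLat (3)
  let ResourceCycles = [1, 5]; // [1] for SLM_MEC_RSV prepended to Res
  let NumMicroOps = 5;         // UOps (4) + LoadUOps (1)
}
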
@@ -140,12 +143,12 @@ def : WriteRes<WriteSETCCStore, [SLM_IEC_RSV01, SLM_MEC_RSV]> {
let ResourceCycles = [2,1];
}
defm : X86WriteRes<WriteLAHFSAHF, [SLM_IEC_RSV01], 1, [1], 1>;
-defm : X86WriteRes<WriteBitTest, [SLM_IEC_RSV01], 1, [1], 1>;
-defm : X86WriteRes<WriteBitTestImmLd, [SLM_IEC_RSV01, SLM_MEC_RSV], 4, [1,1], 1>;
-defm : X86WriteRes<WriteBitTestRegLd, [SLM_IEC_RSV01, SLM_MEC_RSV], 4, [1,1], 1>;
-defm : X86WriteRes<WriteBitTestSet, [SLM_IEC_RSV01], 1, [1], 1>;
-defm : X86WriteRes<WriteBitTestSetImmLd, [SLM_IEC_RSV01, SLM_MEC_RSV], 3, [1,1], 1>;
-defm : X86WriteRes<WriteBitTestSetRegLd, [SLM_IEC_RSV01, SLM_MEC_RSV], 3, [1,1], 1>;
+defm : X86WriteRes<WriteBitTest, [SLM_IEC_RSV0, SLM_IEC_RSV1], 1, [1,1], 1>;
+defm : X86WriteRes<WriteBitTestImmLd, [SLM_IEC_RSV0, SLM_IEC_RSV1, SLM_MEC_RSV], 4, [1,1,1], 1>;
+defm : X86WriteRes<WriteBitTestRegLd, [SLM_IEC_RSV0, SLM_IEC_RSV1, SLM_MEC_RSV], 4, [1,1,1], 7>;
+defm : X86WriteRes<WriteBitTestSet, [SLM_IEC_RSV0, SLM_IEC_RSV1], 1, [1,1], 1>;
+defm : X86WriteRes<WriteBitTestSetImmLd, [SLM_IEC_RSV0, SLM_IEC_RSV1, SLM_MEC_RSV], 3, [1,1,1], 1>;
+defm : X86WriteRes<WriteBitTestSetRegLd, [SLM_IEC_RSV0, SLM_IEC_RSV1, SLM_MEC_RSV], 3, [1,1,1], 7>;
// This is for simple LEAs with one or two input operands.
// The complex ones can only execute on port 1, and they require two cycles on
@@ -153,8 +156,8 @@ defm : X86WriteRes<WriteBitTestSetRegLd, [SLM_IEC_RSV01, SLM_MEC_RSV], 3, [1,1],
def : WriteRes<WriteLEA, [SLM_IEC_RSV1]>;
// Bit counts.
-defm : SLMWriteResPair<WriteBSF, [SLM_IEC_RSV01], 10, [20], 10>;
-defm : SLMWriteResPair<WriteBSR, [SLM_IEC_RSV01], 10, [20], 10>;
+defm : SLMWriteResPair<WriteBSF, [SLM_IEC_RSV0, SLM_IEC_RSV1], 10, [10,10], 10>;
+defm : SLMWriteResPair<WriteBSR, [SLM_IEC_RSV0, SLM_IEC_RSV1], 10, [10,10], 10>;
defm : SLMWriteResPair<WriteLZCNT, [SLM_IEC_RSV0], 3>;
defm : SLMWriteResPair<WriteTZCNT, [SLM_IEC_RSV0], 3>;
defm : SLMWriteResPair<WritePOPCNT, [SLM_IEC_RSV0], 3>;
@@ -164,14 +167,14 @@ defm : X86WriteResPairUnsupported<WriteBEXTR>;
defm : X86WriteResPairUnsupported<WriteBLS>;
defm : X86WriteResPairUnsupported<WriteBZHI>;
-defm : SLMWriteResPair<WriteDiv8, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
-defm : SLMWriteResPair<WriteDiv16, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
-defm : SLMWriteResPair<WriteDiv32, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
-defm : SLMWriteResPair<WriteDiv64, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
-defm : SLMWriteResPair<WriteIDiv8, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
-defm : SLMWriteResPair<WriteIDiv16, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
-defm : SLMWriteResPair<WriteIDiv32, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
-defm : SLMWriteResPair<WriteIDiv64, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;
+defm : SLMWriteResPair<WriteDiv8, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 0, 4>;
+defm : SLMWriteResPair<WriteDiv16, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 0, 4>;
+defm : SLMWriteResPair<WriteDiv32, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 0, 4>;
+defm : SLMWriteResPair<WriteDiv64, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 0, 4>;
+defm : SLMWriteResPair<WriteIDiv8, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 0, 4>;
+defm : SLMWriteResPair<WriteIDiv16, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 0, 4>;
+defm : SLMWriteResPair<WriteIDiv32, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 0, 4>;
+defm : SLMWriteResPair<WriteIDiv64, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 0, 4>;
// Scalar and vector floating point.
defm : X86WriteRes<WriteFLD0, [SLM_FPC_RSV01], 1, [1], 1>;
@@ -230,33 +233,33 @@ defm : X86WriteResPairUnsupported<WriteFMAX>;
defm : X86WriteResPairUnsupported<WriteFMAY>;
defm : X86WriteResPairUnsupported<WriteFMAZ>;
defm : SLMWriteResPair<WriteFDiv, [SLM_FPC_RSV0, SLMFPDivider], 19, [1,17]>;
-defm : SLMWriteResPair<WriteFDivX, [SLM_FPC_RSV0, SLMFPDivider], 39, [1,39]>;
-defm : SLMWriteResPair<WriteFDivY, [SLM_FPC_RSV0, SLMFPDivider], 39, [1,39]>;
+defm : SLMWriteResPair<WriteFDivX, [SLM_FPC_RSV0, SLMFPDivider], 39, [1,39], 6, 1>;
+defm : X86WriteResPairUnsupported<WriteFDivY>;
defm : X86WriteResPairUnsupported<WriteFDivZ>;
defm : SLMWriteResPair<WriteFDiv64, [SLM_FPC_RSV0, SLMFPDivider], 34, [1,32]>;
-defm : SLMWriteResPair<WriteFDiv64X, [SLM_FPC_RSV0, SLMFPDivider], 69, [1,69]>;
-defm : SLMWriteResPair<WriteFDiv64Y, [SLM_FPC_RSV0, SLMFPDivider], 69, [1,69]>;
+defm : SLMWriteResPair<WriteFDiv64X, [SLM_FPC_RSV0, SLMFPDivider], 69, [1,69], 6, 1>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Y>;
defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
-defm : SLMWriteResPair<WriteFRcp, [SLM_FPC_RSV0], 5>;
-defm : SLMWriteResPair<WriteFRcpX, [SLM_FPC_RSV0], 5>;
-defm : SLMWriteResPair<WriteFRcpY, [SLM_FPC_RSV0], 5>;
+defm : SLMWriteResPair<WriteFRcp, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WriteFRcpX, [SLM_FPC_RSV0], 9, [8], 5, 1>;
+defm : X86WriteResPairUnsupported<WriteFRcpY>;
defm : X86WriteResPairUnsupported<WriteFRcpZ>;
-defm : SLMWriteResPair<WriteFRsqrt, [SLM_FPC_RSV0], 5>;
-defm : SLMWriteResPair<WriteFRsqrtX, [SLM_FPC_RSV0], 5>;
-defm : SLMWriteResPair<WriteFRsqrtY, [SLM_FPC_RSV0], 5>;
+defm : SLMWriteResPair<WriteFRsqrt, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WriteFRsqrtX, [SLM_FPC_RSV0], 9, [8], 5, 1>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtY>;
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
-defm : SLMWriteResPair<WriteFSqrt, [SLM_FPC_RSV0,SLMFPDivider], 20, [1,20], 1, 3>;
-defm : SLMWriteResPair<WriteFSqrtX, [SLM_FPC_RSV0,SLMFPDivider], 41, [1,40], 1, 3>;
-defm : SLMWriteResPair<WriteFSqrtY, [SLM_FPC_RSV0,SLMFPDivider], 41, [1,40], 1, 3>;
+defm : SLMWriteResPair<WriteFSqrt, [SLM_FPC_RSV0, SLMFPDivider], 20, [1,20]>;
+defm : SLMWriteResPair<WriteFSqrtX, [SLM_FPC_RSV0, SLMFPDivider], 41, [1,40], 5, 1>;
+defm : X86WriteResPairUnsupported<WriteFSqrtY>;
defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
-defm : SLMWriteResPair<WriteFSqrt64, [SLM_FPC_RSV0,SLMFPDivider], 35, [1,35], 1, 3>;
-defm : SLMWriteResPair<WriteFSqrt64X, [SLM_FPC_RSV0,SLMFPDivider], 71, [1,70], 1, 3>;
-defm : SLMWriteResPair<WriteFSqrt64Y, [SLM_FPC_RSV0,SLMFPDivider], 71, [1,70], 1, 3>;
+defm : SLMWriteResPair<WriteFSqrt64, [SLM_FPC_RSV0, SLMFPDivider], 35, [1,35]>;
+defm : SLMWriteResPair<WriteFSqrt64X, [SLM_FPC_RSV0, SLMFPDivider], 71, [1,70], 5, 1>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Y>;
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
defm : SLMWriteResPair<WriteFSqrt80, [SLM_FPC_RSV0,SLMFPDivider], 40, [1,40]>;
-defm : SLMWriteResPair<WriteDPPD, [SLM_FPC_RSV1], 3>;
-defm : SLMWriteResPair<WriteDPPS, [SLM_FPC_RSV1], 3>;
-defm : SLMWriteResPair<WriteDPPSY, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteDPPD, [SLM_FPC_RSV1], 12, [8], 5, 1>;
+defm : SLMWriteResPair<WriteDPPS, [SLM_FPC_RSV1], 15, [12], 9, 1>;
+defm : X86WriteResPairUnsupported<WriteDPPSY>;
defm : X86WriteResPairUnsupported<WriteDPPSZ>;
defm : SLMWriteResPair<WriteFSign, [SLM_FPC_RSV01], 1>;
defm : SLMWriteResPair<WriteFRnd, [SLM_FPC_RSV1], 3>;
@@ -277,7 +280,7 @@ defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
defm : SLMWriteResPair<WriteFBlend, [SLM_FPC_RSV0], 1>;
defm : X86WriteResPairUnsupported<WriteFBlendY>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
-defm : SLMWriteResPair<WriteFVarBlend, [SLM_FPC_RSV0], 4, [4], 3>;
+defm : SLMWriteResPair<WriteFVarBlend, [SLM_FPC_RSV0], 4, [4], 2, 1>;
defm : X86WriteResPairUnsupported<WriteFVarBlendY>;
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
defm : X86WriteResPairUnsupported<WriteFShuffle256>;
@@ -369,8 +372,8 @@ defm : SLMWriteResPair<WriteVecALUX, [SLM_FPC_RSV01], 1>;
defm : SLMWriteResPair<WriteVecALUY, [SLM_FPC_RSV01], 1>;
defm : X86WriteResPairUnsupported<WriteVecALUZ>;
defm : SLMWriteResPair<WriteVecIMul, [SLM_FPC_RSV0], 4>;
-defm : SLMWriteResPair<WriteVecIMulX, [SLM_FPC_RSV0], 5, [2], 2>;
-defm : SLMWriteResPair<WriteVecIMulY, [SLM_FPC_RSV0], 5, [2], 2>;
+defm : SLMWriteResPair<WriteVecIMulX, [SLM_FPC_RSV0], 5, [2]>;
+defm : SLMWriteResPair<WriteVecIMulY, [SLM_FPC_RSV0], 5, [2]>;
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
// FIXME: The below is closer to correct, but caused some perf regressions.
//defm : SLMWriteResPair<WritePMULLD, [SLM_FPC_RSV0], 11, [11], 7>;
@@ -382,21 +385,21 @@ defm : SLMWriteResPair<WriteShuffleY, [SLM_FPC_RSV0], 1>;
defm : X86WriteResPairUnsupported<WriteShuffleZ>;
defm : SLMWriteResPair<WriteShuffleX, [SLM_FPC_RSV0], 1>;
defm : SLMWriteResPair<WriteVarShuffle, [SLM_FPC_RSV0], 1>;
-defm : SLMWriteResPair<WriteVarShuffleX, [SLM_FPC_RSV0], 5, [5], 4>;
-defm : SLMWriteResPair<WriteVarShuffleY, [SLM_FPC_RSV0], 5, [5], 4>;
+defm : SLMWriteResPair<WriteVarShuffleX, [SLM_FPC_RSV0], 5, [5], 4, 1>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
defm : SLMWriteResPair<WriteBlend, [SLM_FPC_RSV0], 1>;
defm : SLMWriteResPair<WriteBlendY, [SLM_FPC_RSV0], 1>;
defm : X86WriteResPairUnsupported<WriteBlendZ>;
-defm : SLMWriteResPair<WriteVarBlend, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteVarBlend, [SLM_FPC_RSV0], 4, [4], 2, 1>;
defm : X86WriteResPairUnsupported<WriteVarBlendY>;
defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
-defm : SLMWriteResPair<WriteMPSAD, [SLM_FPC_RSV0], 7>;
-defm : SLMWriteResPair<WriteMPSADY, [SLM_FPC_RSV0], 7>;
+defm : SLMWriteResPair<WriteMPSAD, [SLM_FPC_RSV0], 7, [5], 3, 1>;
+defm : X86WriteResPairUnsupported<WriteMPSADY>;
defm : X86WriteResPairUnsupported<WriteMPSADZ>;
defm : SLMWriteResPair<WritePSADBW, [SLM_FPC_RSV0], 4>;
-defm : SLMWriteResPair<WritePSADBWX, [SLM_FPC_RSV0], 4>;
-defm : SLMWriteResPair<WritePSADBWY, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WritePSADBWX, [SLM_FPC_RSV0], 5, [2]>;
+defm : X86WriteResPairUnsupported<WritePSADBWY>;
defm : X86WriteResPairUnsupported<WritePSADBWZ>;
defm : SLMWriteResPair<WritePHMINPOS, [SLM_FPC_RSV0], 4>;
defm : X86WriteResPairUnsupported<WriteShuffle256>;
@@ -417,26 +420,26 @@ def : WriteRes<WriteVecExtractSt, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-defm : SLMWriteResPair<WriteFHAdd, [SLM_FPC_RSV01], 6, [6], 4>;
-defm : SLMWriteResPair<WriteFHAddY, [SLM_FPC_RSV01], 6, [6], 4>;
+defm : SLMWriteResPair<WriteFHAdd, [SLM_FPC_RSV1], 6, [6], 4, 1>;
+defm : X86WriteResPairUnsupported<WriteFHAddY>;
defm : X86WriteResPairUnsupported<WriteFHAddZ>;
-defm : SLMWriteResPair<WritePHAdd, [SLM_FPC_RSV01], 1>;
-defm : SLMWriteResPair<WritePHAddX, [SLM_FPC_RSV01], 1>;
-defm : SLMWriteResPair<WritePHAddY, [SLM_FPC_RSV01], 1>;
+defm : SLMWriteResPair<WritePHAdd, [SLM_FPC_RSV01], 6, [6], 3, 1>;
+defm : SLMWriteResPair<WritePHAddX, [SLM_FPC_RSV01], 6, [6], 3, 1>;
+defm : X86WriteResPairUnsupported<WritePHAddY>;
defm : X86WriteResPairUnsupported<WritePHAddZ>;
// String instructions.
// Packed Compare Implicit Length Strings, Return Mask
-defm : SLMWriteResPair<WritePCmpIStrM, [SLM_FPC_RSV0], 13, [13]>;
+defm : SLMWriteResPair<WritePCmpIStrM, [SLM_FPC_RSV0], 13, [13], 5, 1>;
// Packed Compare Explicit Length Strings, Return Mask
-defm : SLMWriteResPair<WritePCmpEStrM, [SLM_FPC_RSV0], 17, [17]>;
+defm : SLMWriteResPair<WritePCmpEStrM, [SLM_FPC_RSV0], 17, [17], 8, 1>;
// Packed Compare Implicit Length Strings, Return Index
-defm : SLMWriteResPair<WritePCmpIStrI, [SLM_FPC_RSV0], 17, [17]>;
+defm : SLMWriteResPair<WritePCmpIStrI, [SLM_FPC_RSV0], 17, [17], 6, 1>;
// Packed Compare Explicit Length Strings, Return Index
-defm : SLMWriteResPair<WritePCmpEStrI, [SLM_FPC_RSV0], 21, [21]>;
+defm : SLMWriteResPair<WritePCmpEStrI, [SLM_FPC_RSV0], 21, [21], 9, 1>;
// MOVMSK Instructions.
def : WriteRes<WriteFMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; }
@@ -450,7 +453,7 @@ defm : SLMWriteResPair<WriteAESIMC, [SLM_FPC_RSV0], 8, [5]>;
defm : SLMWriteResPair<WriteAESKeyGen, [SLM_FPC_RSV0], 8, [5]>;
// Carry-less multiplication instructions.
-defm : SLMWriteResPair<WriteCLMul, [SLM_FPC_RSV0], 10, [10]>;
+defm : SLMWriteResPair<WriteCLMul, [SLM_FPC_RSV0], 10, [10], 8, 1>;
def : WriteRes<WriteSystem, [SLM_FPC_RSV0]> { let Latency = 100; }
def : WriteRes<WriteMicrocoded, [SLM_FPC_RSV0]> { let Latency = 100; }
@@ -462,15 +465,19 @@ def : WriteRes<WriteNop, []>;
def SLMWriteResGroup1rr : SchedWriteRes<[SLM_FPC_RSV01]> {
let Latency = 4;
let NumMicroOps = 2;
- let ResourceCycles = [4];
+ let ResourceCycles = [8];
}
-def: InstRW<[SLMWriteResGroup1rr], (instrs PADDQrr, PSUBQrr, PCMPEQQrr)>;
+def: InstRW<[SLMWriteResGroup1rr], (instrs MMX_PADDQirr, PADDQrr,
+ MMX_PSUBQirr, PSUBQrr,
+ PCMPEQQrr)>;
def SLMWriteResGroup1rm : SchedWriteRes<[SLM_MEC_RSV,SLM_FPC_RSV01]> {
let Latency = 7;
let NumMicroOps = 3;
- let ResourceCycles = [1,4];
+ let ResourceCycles = [1,8];
}
-def: InstRW<[SLMWriteResGroup1rm], (instrs PADDQrm, PSUBQrm, PCMPEQQrm)>;
+def: InstRW<[SLMWriteResGroup1rm], (instrs MMX_PADDQirm, PADDQrm,
+ MMX_PSUBQirm, PSUBQrm,
+ PCMPEQQrm)>;
} // SchedModel
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td
index 12f8e7cc76f7..8e30e5e10ca8 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver1.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -256,8 +256,13 @@ defm : ZnWriteResPair<WriteIDiv32, [ZnALU2, ZnDivider], 25, [1,25], 2>;
defm : ZnWriteResPair<WriteIDiv64, [ZnALU2, ZnDivider], 41, [1,41], 2>;
// IMULH
-def : WriteRes<WriteIMulH, [ZnALU1, ZnMultiplier]>{
- let Latency = 4;
+def ZnWriteIMulH : WriteRes<WriteIMulH, [ZnMultiplier]>{
+ let Latency = 3;
+ let NumMicroOps = 0;
+}
+def : WriteRes<WriteIMulHLd, [ZnMultiplier]> {
+ let Latency = !add(ZnWriteIMulH.Latency, Znver1Model.LoadLatency);
+ let NumMicroOps = ZnWriteIMulH.NumMicroOps;
}
// Floating point operations
@@ -659,32 +664,10 @@ def ZnWriteMul64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
}
def : SchedAlias<WriteIMul64Ld, ZnWriteMul64Ld>;
-// MULX.
-// r32,r32,r32.
-def ZnWriteMulX32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
- let Latency = 3;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[ZnWriteMulX32], (instrs MULX32rr)>;
-
-// r32,r32,m32.
-def ZnWriteMulX32Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
- let Latency = 8;
- let ResourceCycles = [1, 2, 2];
-}
-def : InstRW<[ZnWriteMulX32Ld, ReadAfterLd], (instrs MULX32rm)>;
-
-// r64,r64,r64.
-def ZnWriteMulX64 : SchedWriteRes<[ZnALU1]> {
- let Latency = 3;
-}
-def : InstRW<[ZnWriteMulX64], (instrs MULX64rr)>;
-
-// r64,r64,m64.
-def ZnWriteMulX64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
- let Latency = 8;
-}
-def : InstRW<[ZnWriteMulX64Ld, ReadAfterLd], (instrs MULX64rm)>;
+// MULX
+// Numbers are based on the AMD SOG for Family 17h - Instruction Latencies.
+defm : ZnWriteResPair<WriteMULX32, [ZnALU1, ZnMultiplier], 3, [1, 1], 1, 5, 0>;
+defm : ZnWriteResPair<WriteMULX64, [ZnALU1, ZnMultiplier], 3, [1, 1], 1, 5, 0>;
//-- Control transfer instructions --//
@@ -714,7 +697,7 @@ def : InstRW<[WriteMicrocoded], (instregex "CALL(16|32)m")>;
def ZnWriteRET : SchedWriteRes<[ZnALU03]> {
let NumMicroOps = 2;
}
-def : InstRW<[ZnWriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)",
+def : InstRW<[ZnWriteRET], (instregex "RET(16|32|64)", "LRET(16|32|64)",
"IRET(16|32|64)")>;
//-- Logic instructions --//
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td
index 5b4b151d2938..a83c89e2f28a 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver2.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td
@@ -243,10 +243,17 @@ defm : Zn2WriteResPair<WriteIDiv32, [Zn2ALU2, Zn2Divider], 25, [1,25], 2>;
defm : Zn2WriteResPair<WriteIDiv64, [Zn2ALU2, Zn2Divider], 41, [1,41], 2>;
// IMULH
-def : WriteRes<WriteIMulH, [Zn2ALU1, Zn2Multiplier]>{
- let Latency = 4;
+def Zn2WriteIMulH : WriteRes<WriteIMulH, [Zn2Multiplier]>{
+ let Latency = 3;
+ let NumMicroOps = 0;
}
+def : WriteRes<WriteIMulHLd, [Zn2Multiplier]>{
+ let Latency = !add(Zn2WriteIMulH.Latency, Znver2Model.LoadLatency);
+ let NumMicroOps = Zn2WriteIMulH.NumMicroOps;
+}
+
+
// Floating point operations
defm : X86WriteRes<WriteFLoad, [Zn2AGU], 8, [1], 1>;
defm : X86WriteRes<WriteFLoadX, [Zn2AGU], 8, [1], 1>;
@@ -658,31 +665,9 @@ def : SchedAlias<WriteIMul64ImmLd, Zn2WriteMul64Ld>;
def : SchedAlias<WriteIMul64RegLd, Zn2WriteMul64Ld>;
// MULX.
-// r32,r32,r32.
-def Zn2WriteMulX32 : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> {
- let Latency = 3;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[Zn2WriteMulX32], (instrs MULX32rr)>;
-
-// r32,r32,m32.
-def Zn2WriteMulX32Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> {
- let Latency = 7;
- let ResourceCycles = [1, 2, 2];
-}
-def : InstRW<[Zn2WriteMulX32Ld, ReadAfterLd], (instrs MULX32rm)>;
-
-// r64,r64,r64.
-def Zn2WriteMulX64 : SchedWriteRes<[Zn2ALU1]> {
- let Latency = 3;
-}
-def : InstRW<[Zn2WriteMulX64], (instrs MULX64rr)>;
-
-// r64,r64,m64.
-def Zn2WriteMulX64Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> {
- let Latency = 7;
-}
-def : InstRW<[Zn2WriteMulX64Ld, ReadAfterLd], (instrs MULX64rm)>;
+// Numbers are based on the AMD SOG for Family 17h - Instruction Latencies.
+defm : Zn2WriteResPair<WriteMULX32, [Zn2ALU1, Zn2Multiplier], 3, [1, 1], 1, 4, 0>;
+defm : Zn2WriteResPair<WriteMULX64, [Zn2ALU1, Zn2Multiplier], 3, [1, 1], 1, 4, 0>;
//-- Control transfer instructions --//
@@ -712,7 +697,7 @@ def : InstRW<[WriteMicrocoded], (instregex "CALL(16|32)m")>;
def Zn2WriteRET : SchedWriteRes<[Zn2ALU03]> {
let NumMicroOps = 2;
}
-def : InstRW<[Zn2WriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)",
+def : InstRW<[Zn2WriteRET], (instregex "RET(16|32|64)", "LRET(16|32|64)",
"IRET(16|32|64)")>;
//-- Logic instructions --//
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td
index 4a91a91a0f0f..be07c069aae1 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver3.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td
@@ -617,42 +617,15 @@ defm : Zn3WriteResIntPair<WriteIMul16, [Zn3Multiplier], 3, [3], 3, /*LoadUOps=*/
defm : Zn3WriteResIntPair<WriteIMul16Imm, [Zn3Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul16Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul32, [Zn3Multiplier], 3, [3], 2>; // Integer 32-bit multiplication.
-
-def Zn3MULX32rr : SchedWriteRes<[Zn3Multiplier]> {
- let Latency = 4;
- let ResourceCycles = [1];
- let NumMicroOps = 2;
-}
-def : InstRW<[Zn3MULX32rr, WriteIMulH], (instrs MULX32rr)>;
-
-def Zn3MULX32rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3Multiplier]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3MULX32rr.Latency);
- let ResourceCycles = [1, 1, 2];
- let NumMicroOps = Zn3MULX32rr.NumMicroOps;
-}
-def : InstRW<[Zn3MULX32rm, WriteIMulH], (instrs MULX32rm)>;
-
+defm : Zn3WriteResIntPair<WriteMULX32, [Zn3Multiplier], 3, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags.
defm : Zn3WriteResIntPair<WriteIMul32Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul32Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul64, [Zn3Multiplier], 3, [3], 2>; // Integer 64-bit multiplication.
-
-def Zn3MULX64rr : SchedWriteRes<[Zn3Multiplier]> {
- let Latency = 4;
- let ResourceCycles = [1];
- let NumMicroOps = 2;
-}
-def : InstRW<[Zn3MULX64rr, WriteIMulH], (instrs MULX64rr)>;
-
-def Zn3MULX64rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3Multiplier]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3MULX64rr.Latency);
- let ResourceCycles = [1, 1, 2];
- let NumMicroOps = Zn3MULX64rr.NumMicroOps;
-}
-def : InstRW<[Zn3MULX64rm, WriteIMulH], (instrs MULX64rm)>;
-
+defm : Zn3WriteResIntPair<WriteMULX64, [Zn3Multiplier], 3, [1], 2>; // Integer 64-bit Unsigned Multiply Without Affecting Flags.
defm : Zn3WriteResIntPair<WriteIMul64Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul64Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
-defm : Zn3WriteResInt<WriteIMulH, [], 4, [], 0>; // Integer multiplication, high part.
+defm : Zn3WriteResInt<WriteIMulHLd, [], !add(4, Znver3Model.LoadLatency), [], 0>; // Integer multiplication, high part.
+defm : Zn3WriteResInt<WriteIMulH, [], 4, [], 0>; // Integer multiplication, high part.
defm : Zn3WriteResInt<WriteBSWAP32, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
defm : Zn3WriteResInt<WriteBSWAP64, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.
diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
index a3238e6317a0..5e59081c63b0 100644
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -71,9 +71,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
// Check to see if there is a specialized entry-point for memory zeroing.
ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
- if (const char *bzeroName = (ValC && ValC->isNullValue())
- ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO)
- : nullptr) {
+ if (const char *bzeroName =
+ (ValC && ValC->isZero())
+ ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO)
+ : nullptr) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
diff --git a/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index 14a3fea240e7..1a97904e9bc9 100644
--- a/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -100,7 +100,7 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
// Only treat the element as UNDEF if all bits are UNDEF, otherwise
// treat it as zero.
- if (EltUndef.isAllOnesValue()) {
+ if (EltUndef.isAllOnes()) {
UndefElts.setBit(i);
RawMask[i] = 0;
continue;
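
The spelling changes in the two hunks above are part of a wider APInt/ConstantSDNode predicate rename (isNullValue -> isZero, isAllOnesValue -> isAllOnes, getAllOnesValue -> getAllOnes) that also appears further down in X86TargetTransformInfo.cpp. A minimal sketch of the new spellings, assuming only the LLVM ADT headers; the function itself is illustrative and not part of the patch:

    #include "llvm/ADT/APInt.h"

    // Exercises the renamed APInt value predicates used throughout this patch.
    static bool demoRenamedPredicates() {
      llvm::APInt AllSet = llvm::APInt::getAllOnes(32); // was getAllOnesValue(32)
      llvm::APInt Zero(32, 0);
      return AllSet.isAllOnes() &&                      // was isAllOnesValue()
             Zero.isZero() &&                           // was isNullValue()
             AllSet.countPopulation() == 32;
    }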
diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index fcaf7c86128a..83a4a025f518 100644
--- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -850,11 +850,9 @@ getRegClassForUnfoldedLoad(MachineFunction &MF, const X86InstrInfo &TII,
void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF)
- for (auto MII = MBB.instr_begin(), MIE = MBB.instr_end(); MII != MIE;) {
- // Grab a reference and increment the iterator so we can remove this
- // instruction if needed without disturbing the iteration.
- MachineInstr &MI = *MII++;
-
+ // We use make_early_inc_range here so we can remove instructions if needed
+ // without disturbing the iteration.
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB.instrs())) {
// Must either be a call or a branch.
if (!MI.isCall() && !MI.isBranch())
continue;
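
For reference, llvm::make_early_inc_range (from llvm/ADT/STLExtras.h) captures the manual pre-increment idiom this hunk removes. A minimal sketch of the pattern, assuming LLVM headers; the ShouldRemove predicate is a hypothetical stand-in for the call/branch check above:

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/MachineInstr.h"

    // make_early_inc_range advances the underlying iterator before yielding each
    // element, so erasing the current instruction cannot invalidate the traversal.
    static void eraseMatching(
        llvm::MachineBasicBlock &MBB,
        llvm::function_ref<bool(const llvm::MachineInstr &)> ShouldRemove) {
      for (llvm::MachineInstr &MI : llvm::make_early_inc_range(MBB.instrs()))
        if (ShouldRemove(MI))
          MI.eraseFromParent();
    }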
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 4af0ac238f59..a3d4d04b1e0d 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -67,6 +67,13 @@ X86Subtarget::classifyGlobalReference(const GlobalValue *GV) const {
unsigned char
X86Subtarget::classifyLocalReference(const GlobalValue *GV) const {
+ // Tagged globals have non-zero upper bits, which makes direct references
+ // require a 64-bit immediate. On the small code model this causes relocation
+ // errors, so we go through the GOT instead.
+ if (AllowTaggedGlobals && TM.getCodeModel() == CodeModel::Small && GV &&
+ !isa<Function>(GV))
+ return X86II::MO_GOTPCREL_NORELAX;
+
// If we're not PIC, it's not very interesting.
if (!isPositionIndependent())
return X86II::MO_NO_FLAG;
@@ -143,6 +150,9 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
return classifyLocalReference(GV);
if (isTargetCOFF()) {
+ // ExternalSymbolSDNode like _tls_index.
+ if (!GV)
+ return X86II::MO_NO_FLAG;
if (GV->hasDLLImportStorageClass())
return X86II::MO_DLLIMPORT;
return X86II::MO_COFFSTUB;
@@ -157,6 +167,11 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
// reference for them.
if (TM.getCodeModel() == CodeModel::Large)
return isTargetELF() ? X86II::MO_GOT : X86II::MO_NO_FLAG;
+ // Tagged globals have non-zero upper bits, which makes direct references
+ // require a 64-bit immediate. So we can't let the linker relax the
+ // relocation to a 32-bit RIP-relative direct reference.
+ if (AllowTaggedGlobals && GV && !isa<Function>(GV))
+ return X86II::MO_GOTPCREL_NORELAX;
return X86II::MO_GOTPCREL;
}
@@ -184,10 +199,13 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
if (TM.shouldAssumeDSOLocal(M, GV))
return X86II::MO_NO_FLAG;
- // Functions on COFF can be non-DSO local for two reasons:
+ // Functions on COFF can be non-DSO local for three reasons:
+ // - They are intrinsic functions (!GV)
// - They are marked dllimport
// - They are extern_weak, and a stub is needed
if (isTargetCOFF()) {
+ if (!GV)
+ return X86II::MO_NO_FLAG;
if (GV->hasDLLImportStorageClass())
return X86II::MO_DLLIMPORT;
return X86II::MO_COFFSTUB;
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 935dbd882a44..9da54dc2e9b7 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -54,8 +54,7 @@ class X86Subtarget final : public X86GenSubtargetInfo {
// are not a good idea. We should be migrating away from these.
enum X86ProcFamilyEnum {
Others,
- IntelAtom,
- IntelSLM
+ IntelAtom
};
enum X86SSEEnum {
@@ -353,6 +352,9 @@ class X86Subtarget final : public X86GenSubtargetInfo {
 /// Processor has AVX-512 Vector Length eXtensions
bool HasVLX = false;
+ /// Processor has AVX-512 16-bit floating-point extensions
+ bool HasFP16 = false;
+
 /// Processor has PKU extensions
bool HasPKU = false;
@@ -425,6 +427,10 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// Processor supports User Level Interrupt instructions
bool HasUINTR = false;
+ /// Enable SSE4.2 CRC32 instruction (Used when SSE4.2 is supported but
+ /// function is GPR only)
+ bool HasCRC32 = false;
+
/// Processor has a single uop BEXTR implementation.
bool HasFastBEXTR = false;
@@ -469,6 +475,10 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// loads from being used maliciously.
bool UseLVILoadHardening = false;
+ /// Use an instruction sequence for taking the address of a global that allows
+ /// a memory tag in the upper address bits.
+ bool AllowTaggedGlobals = false;
+
/// Use software floating point for code generation.
bool UseSoftFloat = false;
@@ -495,6 +505,9 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// Indicates target prefers AVX512 mask registers.
bool PreferMaskRegisters = false;
+ /// Use Silvermont specific arithmetic costs.
+ bool UseSLMArithCosts = false;
+
/// Use Goldmont specific floating point div/sqrt costs.
bool UseGLMDivSqrtCosts = false;
@@ -742,6 +755,7 @@ public:
bool hasDQI() const { return HasDQI; }
bool hasBWI() const { return HasBWI; }
bool hasVLX() const { return HasVLX; }
+ bool hasFP16() const { return HasFP16; }
bool hasPKU() const { return HasPKU; }
bool hasVNNI() const { return HasVNNI; }
bool hasBF16() const { return HasBF16; }
@@ -763,6 +777,7 @@ public:
bool hasSERIALIZE() const { return HasSERIALIZE; }
bool hasTSXLDTRK() const { return HasTSXLDTRK; }
bool hasUINTR() const { return HasUINTR; }
+ bool hasCRC32() const { return HasCRC32; }
bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; }
bool useRetpolineIndirectBranches() const {
return UseRetpolineIndirectBranches;
@@ -784,8 +799,10 @@ public:
}
bool preferMaskRegisters() const { return PreferMaskRegisters; }
+ bool useSLMArithCosts() const { return UseSLMArithCosts; }
bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; }
bool useLVIControlFlowIntegrity() const { return UseLVIControlFlowIntegrity; }
+ bool allowTaggedGlobals() const { return AllowTaggedGlobals; }
bool useLVILoadHardening() const { return UseLVILoadHardening; }
bool useSpeculativeExecutionSideEffectSuppression() const {
return UseSpeculativeExecutionSideEffectSuppression;
@@ -819,7 +836,6 @@ public:
/// TODO: to be removed later and replaced with suitable properties
bool isAtom() const { return X86ProcFamily == IntelAtom; }
- bool isSLM() const { return X86ProcFamily == IntelSLM; }
bool useSoftFloat() const { return UseSoftFloat; }
bool useAA() const override { return UseAA; }
@@ -933,6 +949,31 @@ public:
/// Return true if the subtarget allows calls to immediate address.
bool isLegalToCallImmediateAddr() const;
+ /// Return whether FrameLowering should always set the "extended frame
+ /// present" bit in FP, or set it based on a symbol in the runtime.
+ bool swiftAsyncContextIsDynamicallySet() const {
+ // Older OS versions (particularly system unwinders) are confused by the
+ // Swift extended frame, so when building code that might be run on them we
+ // must dynamically query the concurrency library to determine whether
+ // extended frames should be flagged as present.
+ const Triple &TT = getTargetTriple();
+
+ unsigned Major, Minor, Micro;
+ TT.getOSVersion(Major, Minor, Micro);
+ switch(TT.getOS()) {
+ default:
+ return false;
+ case Triple::IOS:
+ case Triple::TvOS:
+ return Major < 15;
+ case Triple::WatchOS:
+ return Major < 8;
+ case Triple::MacOSX:
+ case Triple::Darwin:
+ return Major < 12;
+ }
+ }
+
/// If we are using indirect thunks, we need to expand indirectbr to avoid it
/// lowering to an actual indirect jump.
bool enableIndirectBrExpand() const override {
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index ee8cff3e008b..336985f3bf9d 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -39,11 +39,11 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/CFGuard.h"
@@ -503,7 +503,7 @@ void X86PassConfig::addPreRegAlloc() {
addPass(createX86SpeculativeLoadHardeningPass());
addPass(createX86FlagsCopyLoweringPass());
- addPass(createX86WinAllocaExpander());
+ addPass(createX86DynAllocaExpander());
if (getOptLevel() != CodeGenOpt::None) {
addPass(createX86PreTileConfigPass());
@@ -585,6 +585,9 @@ void X86PassConfig::addPreEmitPass2() {
addPass(createEHContGuardCatchretPass());
}
addPass(createX86LoadValueInjectionRetHardeningPass());
+
+ // Insert pseudo probe annotation for callsite profiling
+ addPass(createPseudoProbeInserter());
}
bool X86PassConfig::addPostFastRegAllocRewrite() {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 971c430d73b1..06dacb638d16 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -206,6 +206,87 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
+ LT.second.getScalarType() == MVT::i32) {
+ // Check if the operands can be represented as a smaller datatype.
+ bool Op1Signed = false, Op2Signed = false;
+ unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
+ unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
+ unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
+
+ // If both are representable as i15 and at least one is constant,
+ // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
+ // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
+ if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) {
+ bool Op1Constant =
+ isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
+ bool Op2Constant =
+ isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
+ bool Op1Sext = isa<SExtInst>(Args[0]) &&
+ (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
+ bool Op2Sext = isa<SExtInst>(Args[1]) &&
+ (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
+
+ bool IsZeroExtended = !Op1Signed || !Op2Signed;
+ bool IsConstant = Op1Constant || Op2Constant;
+ bool IsSext = Op1Sext || Op2Sext;
+ if (IsConstant || IsZeroExtended || IsSext)
+ LT.second =
+ MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
+ }
+ }
+
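
The i15 bound in the block above is what makes the narrower costing safe arithmetically: a value representable in 15 signed bits has magnitude at most 2^14, so a product of two such values is at most 2^28, and even the sum of two lane products stays below 2^31, which is what PMADDWD's signed 32-bit accumulation needs. A standalone check of that bound (illustrative only, not part of the patch):

    #include <cstdint>

    // Two operands representable as i15 multiply to at most 2^28 in magnitude,
    // so a pairwise 16-bit multiply-add cannot overflow its signed 32-bit result.
    static_assert(INT64_C(1) << 14 == 16384, "i15 magnitude bound");
    static_assert((INT64_C(16384) * 16384) * 2 < (INT64_C(1) << 31),
                  "two i15*i15 products fit in a signed 32-bit accumulator");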
+ if ((ISD == ISD::MUL || ISD == ISD::SDIV || ISD == ISD::SREM ||
+ ISD == ISD::UDIV || ISD == ISD::UREM) &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
+ Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
+ // Vector multiply by pow2 will be simplified to shifts.
+ if (ISD == ISD::MUL) {
+ InstructionCost Cost = getArithmeticInstrCost(
+ Instruction::Shl, Ty, CostKind, Op1Info, Op2Info,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
+ return Cost;
+ }
+
+ if (ISD == ISD::SDIV || ISD == ISD::SREM) {
+ // On X86, vector signed division by a constant power-of-two is
+ // normally expanded to the sequence SRA + SRL + ADD + SRA.
+ // The OperandValue properties may not be the same as that of the previous
+ // operation; conservatively assume OP_None.
+ InstructionCost Cost =
+ 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
+ Op2Info, TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
+ Op2Info, TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
+ Op2Info, TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+
+ if (ISD == ISD::SREM) {
+ // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
+ Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
+ Op2Info);
+ Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
+ Op2Info);
+ }
+
+ return Cost;
+ }
+
+ // Vector unsigned division/remainder will be simplified to shifts/masks.
+ if (ISD == ISD::UDIV)
+ return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
+ Op2Info, TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ // UREM
+ return getArithmeticInstrCost(Instruction::And, Ty, CostKind, Op1Info,
+ Op2Info, TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ }
+
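
The comment above refers to the standard expansion of a signed division by a power of two (SRA + SRL + ADD + SRA), with SREM adding the Mul + Sub pair via X - (X/C)*C. A scalar, standalone sketch of the sequence the cost composition mirrors; 32-bit ints, k in [1, 30], and the helper names are illustrative:

    #include <cstdint>

    // Signed division by 2^k, expanded as the cost model assumes:
    // SRA (sign) + SRL (bias) + ADD + SRA. Arithmetic right shift of negative
    // values is assumed (guaranteed since C++20, universal in practice).
    static int32_t sdivPow2(int32_t X, unsigned K) {
      int32_t Sign = X >> 31;                      // SRA: all-ones if X < 0
      uint32_t Bias = uint32_t(Sign) >> (32 - K);  // SRL: 2^k - 1 for negatives
      return (X + int32_t(Bias)) >> K;             // ADD + SRA
    }

    // SREM costed as X - (X / 2^k) * 2^k: the extra Mul + Sub noted above.
    static int32_t sremPow2(int32_t X, unsigned K) {
      return X - sdivPow2(X, K) * (int32_t(1) << K);  // Mul + Sub
    }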
static const CostTblEntry GLMCostTable[] = {
{ ISD::FDIV, MVT::f32, 18 }, // divss
{ ISD::FDIV, MVT::v4f32, 35 }, // divps
@@ -241,9 +322,10 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
{ ISD::SUB, MVT::v2i64, 4 },
};
- if (ST->isSLM()) {
+ if (ST->useSLMArithCosts()) {
if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
// Check if the operands can be shrunk into a smaller datatype.
+ // TODO: Merge this into generic vXi32 MUL patterns above.
bool Op1Signed = false;
unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
bool Op2Signed = false;
@@ -268,54 +350,6 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
}
}
- if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
- ISD == ISD::UREM) &&
- (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
- Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
- Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
- if (ISD == ISD::SDIV || ISD == ISD::SREM) {
- // On X86, vector signed division by constants power-of-two are
- // normally expanded to the sequence SRA + SRL + ADD + SRA.
- // The OperandValue properties may not be the same as that of the previous
- // operation; conservatively assume OP_None.
- InstructionCost Cost =
- 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
- Op2Info, TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
- Op2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
- Op2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
-
- if (ISD == ISD::SREM) {
- // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
- Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
- Op2Info);
- Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
- Op2Info);
- }
-
- return Cost;
- }
-
- // Vector unsigned division/remainder will be simplified to shifts/masks.
- if (ISD == ISD::UDIV)
- return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
- Op1Info, Op2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
-
- else // UREM
- return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
- Op1Info, Op2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
- }
-
static const CostTblEntry AVX512BWUniformConstCostTable[] = {
{ ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
{ ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
@@ -1005,6 +1039,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
static const CostTblEntry X64CostTbl[] = { // 64-bit targets
{ ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
{ ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
+ { ISD::MUL, MVT::i64, 2 }, // Nehalem from http://www.agner.org/
};
if (ST->is64Bit())
@@ -1121,6 +1156,9 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
return SubLT.first;
}
+
+ // If the insertion isn't aligned, treat it like a 2-op shuffle.
+ Kind = TTI::SK_PermuteTwoSrc;
}
// Handle some common (illegal) sub-vector types as they are often very cheap
@@ -1196,6 +1234,29 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
LT.first = NumOfDests * NumOfShufflesPerDest;
}
+ static const CostTblEntry AVX512FP16ShuffleTbl[] = {
+ {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v8f16, 1}, // vpbroadcastw
+
+ {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
+ {TTI::SK_Reverse, MVT::v16f16, 2}, // vpermw
+ {TTI::SK_Reverse, MVT::v8f16, 1}, // vpshufb
+
+ {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // vpshufb
+
+ {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v8f16, 2} // vpermt2w
+ };
+
+ if (!ST->useSoftFloat() && ST->hasFP16())
+ if (const auto *Entry =
+ CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
+
static const CostTblEntry AVX512VBMIShuffleTbl[] = {
{TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
{TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
@@ -1533,6 +1594,7 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
// Mask zero extend is a sext + shift.
@@ -1546,6 +1608,7 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
{ ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
@@ -1557,12 +1620,14 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
+ { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 },
{ ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
};
@@ -1606,17 +1671,26 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
- { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdb
+ { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
+ { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
+ { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
{ ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
{ ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb
+ { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb
+ { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
+ { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
{ ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
+ { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 },
// Sign extend is zmm vpternlogd+vptruncdb.
// Zero extend is zmm broadcast load+vptruncdw.
@@ -1889,6 +1963,8 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
@@ -1964,6 +2040,8 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
{ ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
@@ -2365,13 +2443,21 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
assert(ISD && "Invalid opcode");
unsigned ExtraCost = 0;
- if (I && (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)) {
+ if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
// Some vector comparison predicates cost extra instructions.
+ // TODO: Should we invert this and assume worst case cmp costs
+ // and reduce for particular predicates?
if (MTy.isVector() &&
!((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
(ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
ST->hasBWI())) {
- switch (cast<CmpInst>(I)->getPredicate()) {
+ // Fall back to I if a specific predicate wasn't specified.
+ CmpInst::Predicate Pred = VecPred;
+ if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
+ Pred == CmpInst::BAD_FCMP_PREDICATE))
+ Pred = cast<CmpInst>(I)->getPredicate();
+
+ switch (Pred) {
case CmpInst::Predicate::ICMP_NE:
// xor(cmpeq(x,y),-1)
ExtraCost = 1;
@@ -2399,6 +2485,11 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
ExtraCost = 3;
}
break;
+ case CmpInst::Predicate::BAD_ICMP_PREDICATE:
+ case CmpInst::Predicate::BAD_FCMP_PREDICATE:
+ // Assume worst case scenario and add the maximum extra cost.
+ ExtraCost = 3;
+ break;
default:
break;
}
@@ -2502,7 +2593,7 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
};
- if (ST->isSLM())
+ if (ST->useSLMArithCosts())
if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
return LT.first * (ExtraCost + Entry->Cost);
@@ -2556,6 +2647,22 @@ X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
// TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
// specialized in these tables yet.
+ static const CostTblEntry AVX512BITALGCostTbl[] = {
+ { ISD::CTPOP, MVT::v32i16, 1 },
+ { ISD::CTPOP, MVT::v64i8, 1 },
+ { ISD::CTPOP, MVT::v16i16, 1 },
+ { ISD::CTPOP, MVT::v32i8, 1 },
+ { ISD::CTPOP, MVT::v8i16, 1 },
+ { ISD::CTPOP, MVT::v16i8, 1 },
+ };
+ static const CostTblEntry AVX512VPOPCNTDQCostTbl[] = {
+ { ISD::CTPOP, MVT::v8i64, 1 },
+ { ISD::CTPOP, MVT::v16i32, 1 },
+ { ISD::CTPOP, MVT::v4i64, 1 },
+ { ISD::CTPOP, MVT::v8i32, 1 },
+ { ISD::CTPOP, MVT::v2i64, 1 },
+ { ISD::CTPOP, MVT::v4i32, 1 },
+ };
static const CostTblEntry AVX512CDCostTbl[] = {
{ ISD::CTLZ, MVT::v8i64, 1 },
{ ISD::CTLZ, MVT::v16i32, 1 },
@@ -2573,10 +2680,10 @@ X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
static const CostTblEntry AVX512BWCostTbl[] = {
{ ISD::ABS, MVT::v32i16, 1 },
{ ISD::ABS, MVT::v64i8, 1 },
- { ISD::BITREVERSE, MVT::v8i64, 5 },
- { ISD::BITREVERSE, MVT::v16i32, 5 },
- { ISD::BITREVERSE, MVT::v32i16, 5 },
- { ISD::BITREVERSE, MVT::v64i8, 5 },
+ { ISD::BITREVERSE, MVT::v8i64, 3 },
+ { ISD::BITREVERSE, MVT::v16i32, 3 },
+ { ISD::BITREVERSE, MVT::v32i16, 3 },
+ { ISD::BITREVERSE, MVT::v64i8, 2 },
{ ISD::BSWAP, MVT::v8i64, 1 },
{ ISD::BSWAP, MVT::v16i32, 1 },
{ ISD::BSWAP, MVT::v32i16, 1 },
@@ -2612,8 +2719,8 @@ X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
static const CostTblEntry AVX512CostTbl[] = {
{ ISD::ABS, MVT::v8i64, 1 },
{ ISD::ABS, MVT::v16i32, 1 },
- { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split
- { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::ABS, MVT::v32i16, 2 },
+ { ISD::ABS, MVT::v64i8, 2 },
{ ISD::ABS, MVT::v4i64, 1 },
{ ISD::ABS, MVT::v2i64, 1 },
{ ISD::BITREVERSE, MVT::v8i64, 36 },
@@ -2637,26 +2744,26 @@ X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::CTTZ, MVT::v64i8, 18 },
{ ISD::SMAX, MVT::v8i64, 1 },
{ ISD::SMAX, MVT::v16i32, 1 },
- { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split
- { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::SMAX, MVT::v32i16, 2 },
+ { ISD::SMAX, MVT::v64i8, 2 },
{ ISD::SMAX, MVT::v4i64, 1 },
{ ISD::SMAX, MVT::v2i64, 1 },
{ ISD::SMIN, MVT::v8i64, 1 },
{ ISD::SMIN, MVT::v16i32, 1 },
- { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split
- { ISD::SMIN, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::SMIN, MVT::v32i16, 2 },
+ { ISD::SMIN, MVT::v64i8, 2 },
{ ISD::SMIN, MVT::v4i64, 1 },
{ ISD::SMIN, MVT::v2i64, 1 },
{ ISD::UMAX, MVT::v8i64, 1 },
{ ISD::UMAX, MVT::v16i32, 1 },
- { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split
- { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::UMAX, MVT::v32i16, 2 },
+ { ISD::UMAX, MVT::v64i8, 2 },
{ ISD::UMAX, MVT::v4i64, 1 },
{ ISD::UMAX, MVT::v2i64, 1 },
{ ISD::UMIN, MVT::v8i64, 1 },
{ ISD::UMIN, MVT::v16i32, 1 },
- { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split
- { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::UMIN, MVT::v32i16, 2 },
+ { ISD::UMIN, MVT::v64i8, 2 },
{ ISD::UMIN, MVT::v4i64, 1 },
{ ISD::UMIN, MVT::v2i64, 1 },
{ ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
@@ -2667,14 +2774,14 @@ X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
{ ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
{ ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
- { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split
- { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split
- { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split
- { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split
- { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split
- { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split
- { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split
- { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::SADDSAT, MVT::v32i16, 2 },
+ { ISD::SADDSAT, MVT::v64i8, 2 },
+ { ISD::SSUBSAT, MVT::v32i16, 2 },
+ { ISD::SSUBSAT, MVT::v64i8, 2 },
+ { ISD::UADDSAT, MVT::v32i16, 2 },
+ { ISD::UADDSAT, MVT::v64i8, 2 },
+ { ISD::USUBSAT, MVT::v32i16, 2 },
+ { ISD::USUBSAT, MVT::v64i8, 2 },
{ ISD::FMAXNUM, MVT::f32, 2 },
{ ISD::FMAXNUM, MVT::v4f32, 2 },
{ ISD::FMAXNUM, MVT::v8f32, 2 },
@@ -2703,25 +2810,41 @@ X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::ABS, MVT::v8i32, 1 },
{ ISD::ABS, MVT::v16i16, 1 },
{ ISD::ABS, MVT::v32i8, 1 },
- { ISD::BITREVERSE, MVT::v4i64, 5 },
- { ISD::BITREVERSE, MVT::v8i32, 5 },
- { ISD::BITREVERSE, MVT::v16i16, 5 },
- { ISD::BITREVERSE, MVT::v32i8, 5 },
+ { ISD::BITREVERSE, MVT::v2i64, 3 },
+ { ISD::BITREVERSE, MVT::v4i64, 3 },
+ { ISD::BITREVERSE, MVT::v4i32, 3 },
+ { ISD::BITREVERSE, MVT::v8i32, 3 },
+ { ISD::BITREVERSE, MVT::v8i16, 3 },
+ { ISD::BITREVERSE, MVT::v16i16, 3 },
+ { ISD::BITREVERSE, MVT::v16i8, 3 },
+ { ISD::BITREVERSE, MVT::v32i8, 3 },
{ ISD::BSWAP, MVT::v4i64, 1 },
{ ISD::BSWAP, MVT::v8i32, 1 },
{ ISD::BSWAP, MVT::v16i16, 1 },
- { ISD::CTLZ, MVT::v4i64, 23 },
- { ISD::CTLZ, MVT::v8i32, 18 },
- { ISD::CTLZ, MVT::v16i16, 14 },
- { ISD::CTLZ, MVT::v32i8, 9 },
- { ISD::CTPOP, MVT::v4i64, 7 },
- { ISD::CTPOP, MVT::v8i32, 11 },
- { ISD::CTPOP, MVT::v16i16, 9 },
- { ISD::CTPOP, MVT::v32i8, 6 },
- { ISD::CTTZ, MVT::v4i64, 10 },
- { ISD::CTTZ, MVT::v8i32, 14 },
- { ISD::CTTZ, MVT::v16i16, 12 },
- { ISD::CTTZ, MVT::v32i8, 9 },
+ { ISD::CTLZ, MVT::v2i64, 7 },
+ { ISD::CTLZ, MVT::v4i64, 7 },
+ { ISD::CTLZ, MVT::v4i32, 5 },
+ { ISD::CTLZ, MVT::v8i32, 5 },
+ { ISD::CTLZ, MVT::v8i16, 4 },
+ { ISD::CTLZ, MVT::v16i16, 4 },
+ { ISD::CTLZ, MVT::v16i8, 3 },
+ { ISD::CTLZ, MVT::v32i8, 3 },
+ { ISD::CTPOP, MVT::v2i64, 3 },
+ { ISD::CTPOP, MVT::v4i64, 3 },
+ { ISD::CTPOP, MVT::v4i32, 7 },
+ { ISD::CTPOP, MVT::v8i32, 7 },
+ { ISD::CTPOP, MVT::v8i16, 3 },
+ { ISD::CTPOP, MVT::v16i16, 3 },
+ { ISD::CTPOP, MVT::v16i8, 2 },
+ { ISD::CTPOP, MVT::v32i8, 2 },
+ { ISD::CTTZ, MVT::v2i64, 4 },
+ { ISD::CTTZ, MVT::v4i64, 4 },
+ { ISD::CTTZ, MVT::v4i32, 7 },
+ { ISD::CTTZ, MVT::v8i32, 7 },
+ { ISD::CTTZ, MVT::v8i16, 4 },
+ { ISD::CTTZ, MVT::v16i16, 4 },
+ { ISD::CTTZ, MVT::v16i8, 3 },
+ { ISD::CTTZ, MVT::v32i8, 3 },
{ ISD::SADDSAT, MVT::v16i16, 1 },
{ ISD::SADDSAT, MVT::v32i8, 1 },
{ ISD::SMAX, MVT::v8i32, 1 },
@@ -3093,10 +3216,18 @@ X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
return adjustTableCost(*Entry, LT.first, ICA.getFlags());
- if (ST->isSLM())
+ if (ST->useSLMArithCosts())
if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ if (ST->hasBITALG())
+ if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
+ if (ST->hasVPOPCNTDQ())
+ if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
if (ST->hasCDI())
if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
return adjustTableCost(*Entry, LT.first, ICA.getFlags());
@@ -3179,8 +3310,6 @@ X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
}
- // TODO - add BMI (TZCNT) scalar handling
-
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
return adjustTableCost(*Entry, LT.first, ICA.getFlags());
@@ -3312,7 +3441,7 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
if (Index == -1U && (Opcode == Instruction::ExtractElement ||
Opcode == Instruction::InsertElement)) {
// TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
- // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
+ // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
// TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
@@ -3378,7 +3507,7 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Unexpected vector opcode");
MVT MScalarTy = LT.second.getScalarType();
- if (ST->isSLM())
+ if (ST->useSLMArithCosts())
if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
return Entry->Cost + RegisterFileMoveCost;
@@ -3505,6 +3634,112 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
return Cost;
}
+InstructionCost
+X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
+ int VF, const APInt &DemandedDstElts,
+ TTI::TargetCostKind CostKind) {
+ const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
+ // We don't differentiate element types here, only element bit width.
+ EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
+
+ auto bailout = [&]() {
+ return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
+ DemandedDstElts, CostKind);
+ };
+
+ // For now, only deal with AVX512 cases.
+ if (!ST->hasAVX512())
+ return bailout();
+
+ // Do we have a native shuffle for this element type, or should we promote?
+ unsigned PromEltTyBits = EltTyBits;
+ switch (EltTyBits) {
+ case 32:
+ case 64:
+ break; // AVX512F.
+ case 16:
+ if (!ST->hasBWI())
+ PromEltTyBits = 32; // promote to i32, AVX512F.
+ break; // AVX512BW
+ case 8:
+ if (!ST->hasVBMI())
+ PromEltTyBits = 32; // promote to i32, AVX512F.
+ break; // AVX512VBMI
+ case 1:
+ // There is no support for shuffling i1 elements. We *must* promote.
+ if (ST->hasBWI()) {
+ if (ST->hasVBMI())
+ PromEltTyBits = 8; // promote to i8, AVX512VBMI.
+ else
+ PromEltTyBits = 16; // promote to i16, AVX512BW.
+ break;
+ }
+ return bailout();
+ default:
+ return bailout();
+ }
+ auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
+
+ auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
+ auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
+
+ int NumDstElements = VF * ReplicationFactor;
+ auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
+ auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
+
+ // Legalize the types.
+ MVT LegalSrcVecTy = TLI->getTypeLegalizationCost(DL, SrcVecTy).second;
+ MVT LegalPromSrcVecTy = TLI->getTypeLegalizationCost(DL, PromSrcVecTy).second;
+ MVT LegalPromDstVecTy = TLI->getTypeLegalizationCost(DL, PromDstVecTy).second;
+ MVT LegalDstVecTy = TLI->getTypeLegalizationCost(DL, DstVecTy).second;
+ // They should have legalized into vector types.
+ if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
+ !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
+ return bailout();
+
+ if (PromEltTyBits != EltTyBits) {
+ // If we have to perform the shuffle with wider elt type than our data type,
+ // then we will first need to anyext (we don't care about the new bits)
+ // the source elements, and then truncate Dst elements.
+ InstructionCost PromotionCost;
+ PromotionCost += getCastInstrCost(
+ Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
+ TargetTransformInfo::CastContextHint::None, CostKind);
+ PromotionCost +=
+ getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
+ /*Src=*/PromDstVecTy,
+ TargetTransformInfo::CastContextHint::None, CostKind);
+ return PromotionCost + getReplicationShuffleCost(PromEltTy,
+ ReplicationFactor, VF,
+ DemandedDstElts, CostKind);
+ }
+
+ assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
+ LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
+ "We expect that the legalization doesn't affect the element width, "
+ "doesn't coalesce/split elements.");
+
+ unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
+ unsigned NumDstVectors =
+ divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
+
+ auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
+
+ // Not all the produced Dst elements may be demanded. Since each single
+ // Dst vector is formed by a single shuffle, if none of the elements that
+ // would form a given Dst vector is demanded, we won't need to do that
+ // shuffle at all, so adjust the cost accordingly.
+ APInt DemandedDstVectors = APIntOps::ScaleBitMask(
+ DemandedDstElts.zextOrSelf(NumDstVectors * NumEltsPerDstVec),
+ NumDstVectors);
+ unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation();
+
+ InstructionCost SingleShuffleCost =
+ getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy,
+ /*Mask=*/None, /*Index=*/0, /*SubTp=*/nullptr);
+ return NumDstVectorsDemanded * SingleShuffleCost;
+}
+
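A hedged sketch (not part of the patch) of the mask shape this hook prices: a replication shuffle repeats every source lane ReplicationFactor times, so ReplicationFactor = 3, VF = 4 corresponds to <0,0,0,1,1,1,2,2,2,3,3,3>. The helper name below is illustrative only and assumes llvm/ADT/SmallVector.h.

// Illustrative only; not part of the diff above.
static SmallVector<int> buildReplicationMaskForIllustration(int ReplicationFactor,
                                                            int VF) {
  SmallVector<int> Mask;
  Mask.reserve(ReplicationFactor * VF);
  for (int Elt = 0; Elt < VF; ++Elt)
    Mask.append(ReplicationFactor, Elt); // each source lane appears RF times
  return Mask; // RF=3, VF=4 -> <0,0,0,1,1,1,2,2,2,3,3,3>
}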
InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
MaybeAlign Alignment,
unsigned AddressSpace,
@@ -3677,7 +3912,7 @@ X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
(IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
// Scalarization
- APInt DemandedElts = APInt::getAllOnesValue(NumElem);
+ APInt DemandedElts = APInt::getAllOnes(NumElem);
InstructionCost MaskSplitCost =
getScalarizationOverhead(MaskTy, DemandedElts, false, true);
InstructionCost ScalarCompareCost = getCmpSelInstrCost(
@@ -3795,7 +4030,7 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
EVT VT = TLI->getValueType(DL, ValTy);
if (VT.isSimple()) {
MVT MTy = VT.getSimpleVT();
- if (ST->isSLM())
+ if (ST->useSLMArithCosts())
if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
return Entry->Cost;
@@ -3834,7 +4069,7 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
ArithmeticCost *= LT.first - 1;
}
- if (ST->isSLM())
+ if (ST->useSLMArithCosts())
if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
return ArithmeticCost + Entry->Cost;
@@ -4589,16 +4824,17 @@ InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy,
InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
bool VariableMask, Align Alignment,
unsigned AddressSpace) {
+ Type *ScalarTy = SrcVTy->getScalarType();
unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
- APInt DemandedElts = APInt::getAllOnesValue(VF);
+ APInt DemandedElts = APInt::getAllOnes(VF);
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost MaskUnpackCost = 0;
if (VariableMask) {
auto *MaskTy =
FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
- MaskUnpackCost =
- getScalarizationOverhead(MaskTy, DemandedElts, false, true);
+ MaskUnpackCost = getScalarizationOverhead(
+ MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true);
InstructionCost ScalarCompareCost = getCmpSelInstrCost(
Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
CmpInst::BAD_ICMP_PREDICATE, CostKind);
@@ -4606,24 +4842,23 @@ InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
}
+ InstructionCost AddressUnpackCost = getScalarizationOverhead(
+ FixedVectorType::get(ScalarTy->getPointerTo(), VF), DemandedElts,
+ /*Insert=*/false, /*Extract=*/true);
+
// The cost of the scalar loads/stores.
InstructionCost MemoryOpCost =
- VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
- MaybeAlign(Alignment), AddressSpace, CostKind);
-
- InstructionCost InsertExtractCost = 0;
- if (Opcode == Instruction::Load)
- for (unsigned i = 0; i < VF; ++i)
- // Add the cost of inserting each scalar load into the vector
- InsertExtractCost +=
- getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
- else
- for (unsigned i = 0; i < VF; ++i)
- // Add the cost of extracting each element out of the data vector
- InsertExtractCost +=
- getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
+ VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment),
+ AddressSpace, CostKind);
- return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
+ // The cost of forming the vector from the loaded scalars, or of
+ // scalarizing the vector to perform the scalar stores.
+ InstructionCost InsertExtractCost =
+ getScalarizationOverhead(cast<FixedVectorType>(SrcVTy), DemandedElts,
+ /*Insert=*/Opcode == Instruction::Load,
+ /*Extract=*/Opcode == Instruction::Store);
+
+ return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost;
}
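For intuition, the sum returned above corresponds to a fully scalarized gather/scatter. The following is a self-contained, hedged illustration of the assumed shape of that lowering (plain C++, not an LLVM API): each lane unpacks an address (AddressUnpackCost), tests its mask bit (MaskUnpackCost plus the compare/branch), then does a scalar memory op and an insert/extract (MemoryOpCost and InsertExtractCost).

#include <array>
#include <cstddef>

// Illustrative scalarized gather with a variable mask; VF scalar iterations.
template <std::size_t VF>
std::array<float, VF>
scalarizedGather(const std::array<const float *, VF> &Ptrs,
                 const std::array<bool, VF> &Mask,
                 std::array<float, VF> PassThru) {
  for (std::size_t Lane = 0; Lane < VF; ++Lane) // one scalar iteration per lane
    if (Mask[Lane])                             // mask unpack + compare + branch
      PassThru[Lane] = *Ptrs[Lane];             // scalar load + vector insert
  return PassThru;
}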
/// Calculate the cost of Gather / Scatter operation
@@ -4690,6 +4925,9 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
return true;
+ if (ScalarTy->isHalfTy() && ST->hasBWI() && ST->hasFP16())
+ return true;
+
if (!ScalarTy->isIntegerTy())
return false;
@@ -4732,7 +4970,7 @@ bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
// loads require AVX2).
if (DataSize == 32)
return ST->hasAVX();
- else if (DataSize == 16)
+ if (DataSize == 16)
return ST->hasSSE1();
return true;
}
@@ -4765,11 +5003,15 @@ bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
return isLegalMaskedExpandLoad(DataTy);
}
-bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
+bool X86TTIImpl::supportsGather() const {
// Some CPUs have better gather performance than others.
// TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
// enable gather with an explicit -march.
- if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())))
+ return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
+}
+
+bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
+ if (!supportsGather())
return false;
// This function is called now in two cases: from the Loop Vectorizer
@@ -4893,6 +5135,14 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
return Options;
}
+bool X86TTIImpl::prefersVectorizedAddressing() const {
+ return supportsGather();
+}
+
+bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
+ return false;
+}
+
bool X86TTIImpl::enableInterleavedAccessVectorization() {
// TODO: We expect this to be beneficial regardless of arch,
// but there are currently some unexplained performance artifacts on Atom.
@@ -4900,122 +5150,6 @@ bool X86TTIImpl::enableInterleavedAccessVectorization() {
return !(ST->isAtom());
}
-// Get estimation for interleaved load/store operations for AVX2.
-// \p Factor is the interleaved-access factor (stride) - number of
-// (interleaved) elements in the group.
-// \p Indices contains the indices for a strided load: when the
-// interleaved load has gaps they indicate which elements are used.
-// If Indices is empty (or if the number of indices is equal to the size
-// of the interleaved-access as given in \p Factor) the access has no gaps.
-//
-// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
-// computing the cost using a generic formula as a function of generic
-// shuffles. We therefore use a lookup table instead, filled according to
-// the instruction sequences that codegen currently generates.
-InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX2(
- unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
- ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
-
- if (UseMaskForCond || UseMaskForGaps)
- return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace, CostKind,
- UseMaskForCond, UseMaskForGaps);
-
- // We currently Support only fully-interleaved groups, with no gaps.
- // TODO: Support also strided loads (interleaved-groups with gaps).
- if (Indices.size() && Indices.size() != Factor)
- return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace, CostKind);
-
- // VecTy for interleave memop is <VF*Factor x Elt>.
- // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
- // VecTy = <12 x i32>.
- MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
-
- // This function can be called with VecTy=<6xi128>, Factor=3, in which case
- // the VF=2, while v2i128 is an unsupported MVT vector type
- // (see MachineValueType.h::getVectorVT()).
- if (!LegalVT.isVector())
- return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace, CostKind);
-
- unsigned VF = VecTy->getNumElements() / Factor;
- Type *ScalarTy = VecTy->getElementType();
- // Deduplicate entries, model floats/pointers as appropriately-sized integers.
- if (!ScalarTy->isIntegerTy())
- ScalarTy =
- Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
-
- // Get the cost of all the memory operations.
- InstructionCost MemOpCosts = getMemoryOpCost(
- Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
-
- auto *VT = FixedVectorType::get(ScalarTy, VF);
- EVT ETy = TLI->getValueType(DL, VT);
- if (!ETy.isSimple())
- return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace, CostKind);
-
- // TODO: Complete for other data-types and strides.
- // Each combination of Stride, element bit width and VF results in a different
- // sequence; The cost tables are therefore accessed with:
- // Factor (stride) and VectorType=VFxiN.
- // The Cost accounts only for the shuffle sequence;
- // The cost of the loads/stores is accounted for separately.
- //
- static const CostTblEntry AVX2InterleavedLoadTbl[] = {
- {2, MVT::v4i64, 6}, // (load 8i64 and) deinterleave into 2 x 4i64
-
- {3, MVT::v2i8, 10}, // (load 6i8 and) deinterleave into 3 x 2i8
- {3, MVT::v4i8, 4}, // (load 12i8 and) deinterleave into 3 x 4i8
- {3, MVT::v8i8, 9}, // (load 24i8 and) deinterleave into 3 x 8i8
- {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
- {3, MVT::v32i8, 13}, // (load 96i8 and) deinterleave into 3 x 32i8
-
- {3, MVT::v8i32, 17}, // (load 24i32 and) deinterleave into 3 x 8i32
-
- {4, MVT::v2i8, 12}, // (load 8i8 and) deinterleave into 4 x 2i8
- {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
- {4, MVT::v8i8, 20}, // (load 32i8 and) deinterleave into 4 x 8i8
- {4, MVT::v16i8, 39}, // (load 64i8 and) deinterleave into 4 x 16i8
- {4, MVT::v32i8, 80}, // (load 128i8 and) deinterleave into 4 x 32i8
-
- {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
- };
-
- static const CostTblEntry AVX2InterleavedStoreTbl[] = {
- {2, MVT::v4i64, 6}, // interleave 2 x 4i64 into 8i64 (and store)
-
- {3, MVT::v2i8, 7}, // interleave 3 x 2i8 into 6i8 (and store)
- {3, MVT::v4i8, 8}, // interleave 3 x 4i8 into 12i8 (and store)
- {3, MVT::v8i8, 11}, // interleave 3 x 8i8 into 24i8 (and store)
- {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
- {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
-
- {4, MVT::v2i8, 12}, // interleave 4 x 2i8 into 8i8 (and store)
- {4, MVT::v4i8, 9}, // interleave 4 x 4i8 into 16i8 (and store)
- {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
- {4, MVT::v16i8, 10}, // interleave 4 x 16i8 into 64i8 (and store)
- {4, MVT::v32i8, 12} // interleave 4 x 32i8 into 128i8 (and store)
- };
-
- if (Opcode == Instruction::Load) {
- if (const auto *Entry =
- CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
- return MemOpCosts + Entry->Cost;
- } else {
- assert(Opcode == Instruction::Store &&
- "Expected Store Instruction at this point");
- if (const auto *Entry =
- CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
- return MemOpCosts + Entry->Cost;
- }
-
- return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace, CostKind);
-}
-
// Get estimation for interleaved load/store operations and strided load.
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
@@ -5024,12 +5158,6 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
-
- if (UseMaskForCond || UseMaskForGaps)
- return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace, CostKind,
- UseMaskForCond, UseMaskForGaps);
-
// VecTy for interleave memop is <VF*Factor x Elt>.
// So, for VF=4, Interleave Factor = 3, Element type = i32 we have
// VecTy = <12 x i32>.
@@ -5044,12 +5172,46 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
// Get the cost of one memory operation.
auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
LegalVT.getVectorNumElements());
- InstructionCost MemOpCost = getMemoryOpCost(
- Opcode, SingleMemOpTy, MaybeAlign(Alignment), AddressSpace, CostKind);
+ InstructionCost MemOpCost;
+ if (UseMaskForCond || UseMaskForGaps)
+ MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
+ AddressSpace, CostKind);
+ else
+ MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
+ AddressSpace, CostKind);
unsigned VF = VecTy->getNumElements() / Factor;
MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
+ // FIXME: this is the most conservative estimate for the mask cost.
+ InstructionCost MaskCost;
+ if (UseMaskForCond || UseMaskForGaps) {
+ APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
+ for (unsigned Index : Indices) {
+ assert(Index < Factor && "Invalid index for interleaved memory op");
+ for (unsigned Elm = 0; Elm < VF; Elm++)
+ DemandedLoadStoreElts.setBit(Index + Elm * Factor);
+ }
+
+ Type *I8Type = Type::getInt8Ty(VecTy->getContext());
+
+ MaskCost = getReplicationShuffleCost(
+ I8Type, Factor, VF,
+ UseMaskForGaps ? DemandedLoadStoreElts
+ : APInt::getAllOnes(VecTy->getNumElements()),
+ CostKind);
+
+ // The Gaps mask is invariant and created outside the loop, therefore the
+ // cost of creating it is not accounted for here. However, if we have both
+ // a MaskForGaps and some other mask that guards the execution of the
+ // memory access, we need to account for the cost of And-ing the two masks
+ // inside the loop.
+ if (UseMaskForGaps) {
+ auto *MaskVT = FixedVectorType::get(I8Type, VecTy->getNumElements());
+ MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
+ }
+ }
+
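A hedged worked example of the demanded-elements mask built above (the Factor, VF and Indices values are picked purely for illustration): with Factor = 3, VF = 4 and Indices = {0, 2}, the loop sets bits 0,3,6,9 and 2,5,8,11 of a 12-bit mask, leaving member 1 of every group undemanded.

// Standalone illustration; assumes llvm/ADT/APInt.h.
APInt Demanded = APInt::getZero(/*numBits=*/12);  // VF * Factor
for (unsigned Index : {0u, 2u})                   // Indices = {0, 2}
  for (unsigned Elm = 0; Elm < 4; ++Elm)          // VF = 4
    Demanded.setBit(Index + Elm * 3);             // Factor = 3
// Demanded == 0b101101101101 (bit 0 is element 0).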
if (Opcode == Instruction::Load) {
// The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
// contain the cost of the optimized shuffle sequence that the
@@ -5065,7 +5227,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
if (const auto *Entry =
CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
- return NumOfMemOps * MemOpCost + Entry->Cost;
+ return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
// If an entry does not exist, fall back to the default implementation.
// The kind of shuffle depends on the number of loaded values.
@@ -5102,7 +5264,8 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
- NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
+ MaskCost + NumOfUnfoldedLoads * MemOpCost +
+ NumOfMoves;
return Cost;
}
@@ -5124,7 +5287,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
if (const auto *Entry =
CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
- return NumOfMemOps * MemOpCost + Entry->Cost;
+ return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
// If an entry does not exist, fall back to the default implementation.
// There are no strided stores at the moment. And a store can't be folded in
@@ -5138,33 +5301,321 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
// We need additional instructions to keep sources.
unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
InstructionCost Cost =
+ MaskCost +
NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
NumOfMoves;
return Cost;
}
InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
- unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond, bool UseMaskForGaps) {
- auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
+ auto *VecTy = cast<FixedVectorType>(BaseTy);
+
+ auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) {
Type *EltTy = cast<VectorType>(VecTy)->getElementType();
if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
EltTy->isIntegerTy(32) || EltTy->isPointerTy())
return true;
- if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
+ if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) ||
+ (!ST->useSoftFloat() && ST->hasFP16() && EltTy->isHalfTy()))
return HasBW;
return false;
};
if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
return getInterleavedMemoryOpCostAVX512(
- Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
- AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
- if (ST->hasAVX2())
- return getInterleavedMemoryOpCostAVX2(
- Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
+ Opcode, VecTy, Factor, Indices, Alignment,
AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
+ if (UseMaskForCond || UseMaskForGaps)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace, CostKind,
+ UseMaskForCond, UseMaskForGaps);
+
+ // Get a cost estimate for interleaved load/store operations on SSE-AVX2.
+ // As opposed to AVX-512, the SSE-AVX2 targets do not have generic shuffles
+ // that allow computing the cost using a generic formula as a function of
+ // generic shuffles. We therefore use a lookup table instead, filled
+ // according to the instruction sequences that codegen currently generates.
+
+ // VecTy for interleave memop is <VF*Factor x Elt>.
+ // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+ // VecTy = <12 x i32>.
+ MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+
+ // This function can be called with VecTy=<6xi128>, Factor=3, in which case
+ // the VF=2, while v2i128 is an unsupported MVT vector type
+ // (see MachineValueType.h::getVectorVT()).
+ if (!LegalVT.isVector())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace, CostKind);
+
+ unsigned VF = VecTy->getNumElements() / Factor;
+ Type *ScalarTy = VecTy->getElementType();
+ // Deduplicate entries, model floats/pointers as appropriately-sized integers.
+ if (!ScalarTy->isIntegerTy())
+ ScalarTy =
+ Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
+
+ // Get the cost of all the memory operations.
+ // FIXME: discount dead loads.
+ InstructionCost MemOpCosts = getMemoryOpCost(
+ Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
+
+ auto *VT = FixedVectorType::get(ScalarTy, VF);
+ EVT ETy = TLI->getValueType(DL, VT);
+ if (!ETy.isSimple())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace, CostKind);
+
+ // TODO: Complete for other data-types and strides.
+ // Each combination of Stride, element bit width and VF results in a
+ // different sequence; the cost tables are therefore accessed with
+ // Factor (stride) and VectorType=VFxiN.
+ // The Cost accounts only for the shuffle sequence;
+ // the cost of the loads/stores is accounted for separately.
+ //
+ static const CostTblEntry AVX2InterleavedLoadTbl[] = {
+ {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
+ {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
+ {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
+ {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
+ {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
+
+ {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
+ {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
+ {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
+
+ {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
+ {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
+ {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
+
+ {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
+ {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
+ {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
+ {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
+
+ {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
+ {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
+ {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
+ {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
+ {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
+
+ {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
+ {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
+ {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
+ {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
+ {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
+
+ {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
+ {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
+ {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
+ {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
+ {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
+
+ {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
+ {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
+ {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
+ {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
+
+ {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
+ {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
+ {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
+ {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
+ {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
+
+ {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
+ {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
+ {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
+ {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
+ {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
+
+ {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
+ {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
+ {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
+ {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
+ {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
+
+ {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
+ {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
+ {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
+ {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
+
+ {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
+ {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
+ {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
+ {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
+ {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
+
+ {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
+ {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
+ {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
+ {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
+ {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
+
+ {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
+ {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
+ {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
+ {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
+
+ {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
+ {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
+ {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
+
+ {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
+ };
+
+ static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
+ {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
+ };
+
+ static const CostTblEntry SSE2InterleavedLoadTbl[] = {
+ {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
+ {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
+
+ {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
+ {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
+
+ {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
+ };
+
+ static const CostTblEntry AVX2InterleavedStoreTbl[] = {
+ {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
+ {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
+
+ {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
+ {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
+ {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
+
+ {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
+ {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
+ {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
+ {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
+
+ {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
+ {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
+ {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
+ {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
+ {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
+
+ {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
+ {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
+ {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
+ {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
+ {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
+
+ {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
+ {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
+ {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
+ {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
+ {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
+
+ {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
+ {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
+ {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
+ {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
+ {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
+
+ {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
+ {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
+ {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
+ {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
+
+ {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
+ {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
+ {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
+ {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
+ {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
+
+ {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
+ {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
+ {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
+ {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
+ {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
+
+ {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
+ {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
+ {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
+ {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
+ {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
+
+ {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
+ {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
+ {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
+ {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
+
+ {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
+ {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
+ {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
+ {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
+ {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
+
+ {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
+ {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
+ {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
+ {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
+ {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
+
+ {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
+ {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
+ {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
+ {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
+
+ {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
+ {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
+ {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
+ };
+
+ static const CostTblEntry SSE2InterleavedStoreTbl[] = {
+ {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
+ {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
+ {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
+
+ {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
+ {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
+
+ {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
+ };
+
+ if (Opcode == Instruction::Load) {
+ auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
+ MemOpCosts](const CostTblEntry *Entry) {
+ // NOTE: this is just an approximation!
+ // It can over- or under-estimate the cost!
+ return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
+ };
+
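A hedged reading of the discount above, with numbers chosen purely for illustration: for a stride-4 load group that only uses 2 of its members and whose table entry costs 24 shuffles, the shuffle portion is pro-rated to divideCeil(2 * 24, 4) = 12 on top of MemOpCosts.

// Illustrative arithmetic; divideCeil comes from llvm/Support/MathExtras.h.
unsigned Factor = 4, NumMembers = 2, EntryCost = 24;
unsigned ShufflePart = divideCeil(NumMembers * EntryCost, Factor); // == 12
// Total charged: MemOpCosts + 12, instead of MemOpCosts + 24.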
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
+ ETy.getSimpleVT()))
+ return GetDiscountedCost(Entry);
+
+ if (ST->hasSSSE3())
+ if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
+ ETy.getSimpleVT()))
+ return GetDiscountedCost(Entry);
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
+ ETy.getSimpleVT()))
+ return GetDiscountedCost(Entry);
+ } else {
+ assert(Opcode == Instruction::Store &&
+ "Expected Store Instruction at this point");
+ assert((!Indices.size() || Indices.size() == Factor) &&
+ "Interleaved store only supports fully-interleaved groups.");
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
+ ETy.getSimpleVT()))
+ return MemOpCosts + Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
+ ETy.getSimpleVT()))
+ return MemOpCosts + Entry->Cost;
+ }
+
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
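For context, a hedged usage sketch of how a client would reach this hook through the TargetTransformInfo facade; the TTI object, the context Ctx, and the chosen types and cost kind are assumptions, not taken from the patch.

// Interleaved load of 3 members of <4 x i32> out of one <12 x i32> wide load.
auto *WideTy = FixedVectorType::get(Type::getInt32Ty(Ctx), /*NumElts=*/12);
InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
    Instruction::Load, WideTy, /*Factor=*/3, /*Indices=*/{0, 1, 2},
    Align(4), /*AddressSpace=*/0, TTI::TCK_RecipThroughput);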
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 69ff6584316e..c53424ec0026 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -45,52 +45,54 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::FeatureCMPXCHG16B,
X86::FeatureLAHFSAHF,
- // Codegen control options.
- X86::FeatureFast11ByteNOP,
- X86::FeatureFast15ByteNOP,
- X86::FeatureFastBEXTR,
- X86::FeatureFastHorizontalOps,
- X86::FeatureFastLZCNT,
- X86::FeatureFastScalarFSQRT,
- X86::FeatureFastSHLDRotate,
- X86::FeatureFastScalarShiftMasks,
- X86::FeatureFastVectorShiftMasks,
- X86::FeatureFastVariableCrossLaneShuffle,
- X86::FeatureFastVariablePerLaneShuffle,
- X86::FeatureFastVectorFSQRT,
- X86::FeatureLEAForSP,
- X86::FeatureLEAUsesAG,
- X86::FeatureLZCNTFalseDeps,
- X86::FeatureBranchFusion,
- X86::FeatureMacroFusion,
- X86::FeaturePadShortFunctions,
- X86::FeaturePOPCNTFalseDeps,
+ // Some older targets can be set up to fold unaligned loads.
X86::FeatureSSEUnalignedMem,
- X86::FeatureSlow3OpsLEA,
- X86::FeatureSlowDivide32,
- X86::FeatureSlowDivide64,
- X86::FeatureSlowIncDec,
- X86::FeatureSlowLEA,
- X86::FeatureSlowPMADDWD,
- X86::FeatureSlowPMULLD,
- X86::FeatureSlowSHLD,
- X86::FeatureSlowTwoMemOps,
- X86::FeatureSlowUAMem16,
- X86::FeaturePreferMaskRegisters,
- X86::FeatureInsertVZEROUPPER,
- X86::FeatureUseGLMDivSqrtCosts,
+
+ // Codegen control options.
+ X86::TuningFast11ByteNOP,
+ X86::TuningFast15ByteNOP,
+ X86::TuningFastBEXTR,
+ X86::TuningFastHorizontalOps,
+ X86::TuningFastLZCNT,
+ X86::TuningFastScalarFSQRT,
+ X86::TuningFastSHLDRotate,
+ X86::TuningFastScalarShiftMasks,
+ X86::TuningFastVectorShiftMasks,
+ X86::TuningFastVariableCrossLaneShuffle,
+ X86::TuningFastVariablePerLaneShuffle,
+ X86::TuningFastVectorFSQRT,
+ X86::TuningLEAForSP,
+ X86::TuningLEAUsesAG,
+ X86::TuningLZCNTFalseDeps,
+ X86::TuningBranchFusion,
+ X86::TuningMacroFusion,
+ X86::TuningPadShortFunctions,
+ X86::TuningPOPCNTFalseDeps,
+ X86::TuningSlow3OpsLEA,
+ X86::TuningSlowDivide32,
+ X86::TuningSlowDivide64,
+ X86::TuningSlowIncDec,
+ X86::TuningSlowLEA,
+ X86::TuningSlowPMADDWD,
+ X86::TuningSlowPMULLD,
+ X86::TuningSlowSHLD,
+ X86::TuningSlowTwoMemOps,
+ X86::TuningSlowUAMem16,
+ X86::TuningPreferMaskRegisters,
+ X86::TuningInsertVZEROUPPER,
+ X86::TuningUseSLMArithCosts,
+ X86::TuningUseGLMDivSqrtCosts,
// Perf-tuning flags.
- X86::FeatureHasFastGather,
- X86::FeatureSlowUAMem32,
+ X86::TuningFastGather,
+ X86::TuningSlowUAMem32,
// Based on whether the user set the -mprefer-vector-width command line option.
- X86::FeaturePrefer128Bit,
- X86::FeaturePrefer256Bit,
+ X86::TuningPrefer128Bit,
+ X86::TuningPrefer256Bit,
// CPU name enums. These just follow the CPU string.
- X86::ProcIntelAtom,
- X86::ProcIntelSLM,
+ X86::ProcIntelAtom
};
public:
@@ -120,8 +122,7 @@ public:
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
unsigned getMaxInterleaveFactor(unsigned VF);
InstructionCost getArithmeticInstrCost(
- unsigned Opcode, Type *Ty,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -144,14 +145,17 @@ public:
InstructionCost getScalarizationOverhead(VectorType *Ty,
const APInt &DemandedElts,
bool Insert, bool Extract);
+ InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
+ int VF,
+ const APInt &DemandedDstElts,
+ TTI::TargetCostKind CostKind);
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
MaybeAlign Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
- InstructionCost
- getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
- unsigned AddressSpace,
- TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
+ InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind);
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
const Value *Ptr, bool VariableMask,
Align Alignment,
@@ -180,9 +184,9 @@ public:
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
- InstructionCost getArithmeticReductionCost(
- unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF,
- TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
+ InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ Optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind);
InstructionCost getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned);
@@ -192,19 +196,13 @@ public:
InstructionCost getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
- Align Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
InstructionCost getInterleavedMemoryOpCostAVX512(
unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
- bool UseMaskForCond = false, bool UseMaskForGaps = false);
- InstructionCost getInterleavedMemoryOpCostAVX2(
- unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
- ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
- bool UseMaskForCond = false, bool UseMaskForGaps = false);
+ TTI::TargetCostKind CostKind, bool UseMaskForCond = false,
+ bool UseMaskForGaps = false);
InstructionCost getIntImmCost(int64_t);
@@ -241,9 +239,12 @@ public:
SmallPtrSetImpl<Argument *> &Args) const;
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
+ bool prefersVectorizedAddressing() const;
+ bool supportsEfficientVectorElementLoadStore() const;
bool enableInterleavedAccessVectorization();
private:
+ bool supportsGather() const;
InstructionCost getGSScalarCost(unsigned Opcode, Type *DataTy,
bool VariableMask, Align Alignment,
unsigned AddressSpace);
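A hedged usage sketch for the newly declared getReplicationShuffleCost hook as seen from a TargetTransformInfo client; the TTI object, the context Ctx, and the demanded-element values below are assumptions for illustration.

// Cost of replicating each of 4 x i8 source lanes 3 times (12 results),
// with every destination element demanded.
APInt DemandedDst = APInt::getAllOnes(/*numBits=*/12); // ReplicationFactor * VF
InstructionCost C = TTI.getReplicationShuffleCost(
    Type::getInt8Ty(Ctx), /*ReplicationFactor=*/3, /*VF=*/4, DemandedDst,
    TTI::TCK_RecipThroughput);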
diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp
index c3031b698552..59b5dc111ce3 100644
--- a/llvm/lib/Target/X86/X86VZeroUpper.cpp
+++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -271,10 +271,8 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
<< getBlockExitStateName(CurState) << '\n');
if (CurState == EXITS_DIRTY)
- for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
- SE = MBB.succ_end();
- SI != SE; ++SI)
- addDirtySuccessor(**SI);
+ for (MachineBasicBlock *Succ : MBB.successors())
+ addDirtySuccessor(*Succ);
BlockStates[MBB.getNumber()].ExitState = CurState;
}
diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp
index 8d8bd5e6b326..02186949960d 100644
--- a/llvm/lib/Target/X86/X86WinEHState.cpp
+++ b/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -458,7 +458,7 @@ void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) {
void WinEHStatePass::rewriteSetJmpCall(IRBuilder<> &Builder, Function &F,
CallBase &Call, Value *State) {
// Don't rewrite calls with a weird number of arguments.
- if (Call.getNumArgOperands() != 2)
+ if (Call.arg_size() != 2)
return;
SmallVector<OperandBundleDef, 1> OpBundles;
diff --git a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
index 0505686e645b..f6b97e9e84b3 100644
--- a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
+++ b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -19,7 +19,7 @@
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index b44984ff6b4c..c286b747a271 100644
--- a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -20,10 +20,10 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp b/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
index d5f66c2bd824..8916c6ca7be7 100644
--- a/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
+++ b/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "TargetInfo/XCoreTargetInfo.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
Target &llvm::getTheXCoreTarget() {
diff --git a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
index 4ea775305e12..38b613700674 100644
--- a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -38,8 +38,8 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include <algorithm>
diff --git a/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/llvm/lib/Target/XCore/XCoreISelLowering.cpp
index 51fdfe54db18..7c86262269fc 100644
--- a/llvm/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/llvm/lib/Target/XCore/XCoreISelLowering.cpp
@@ -1315,7 +1315,7 @@ SDValue XCoreTargetLowering::LowerCCCArguments(
CFRegNode.push_back(ArgIn.getValue(ArgIn->getNumValues() - 1));
}
} else {
- // sanity check
+ // Only arguments passed on the stack should make it here.
assert(VA.isMemLoc());
// Load the argument to a virtual register
unsigned ObjSize = VA.getLocVT().getSizeInBits()/8;
@@ -1643,7 +1643,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
return DAG.getNode(XCoreISD::LADD, dl, DAG.getVTList(VT, VT), N1, N0, N2);
// fold (ladd 0, 0, x) -> 0, x & 1
- if (N0C && N0C->isNullValue() && N1C && N1C->isNullValue()) {
+ if (N0C && N0C->isZero() && N1C && N1C->isZero()) {
SDValue Carry = DAG.getConstant(0, dl, VT);
SDValue Result = DAG.getNode(ISD::AND, dl, VT, N2,
DAG.getConstant(1, dl, VT));
@@ -1653,7 +1653,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
// fold (ladd x, 0, y) -> 0, add x, y iff carry is unused and y has only the
// low bit set
- if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 1)) {
+ if (N1C && N1C->isZero() && N->hasNUsesOfValue(0, 1)) {
APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
VT.getSizeInBits() - 1);
KnownBits Known = DAG.computeKnownBits(N2);
@@ -1675,7 +1675,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
EVT VT = N0.getValueType();
// fold (lsub 0, 0, x) -> x, -x iff x has only the low bit set
- if (N0C && N0C->isNullValue() && N1C && N1C->isNullValue()) {
+ if (N0C && N0C->isZero() && N1C && N1C->isZero()) {
APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
VT.getSizeInBits() - 1);
KnownBits Known = DAG.computeKnownBits(N2);
@@ -1690,7 +1690,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
// fold (lsub x, 0, y) -> 0, sub x, y iff borrow is unused and y has only the
// low bit set
- if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 1)) {
+ if (N1C && N1C->isZero() && N->hasNUsesOfValue(0, 1)) {
APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
VT.getSizeInBits() - 1);
KnownBits Known = DAG.computeKnownBits(N2);
@@ -1719,7 +1719,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
N1, N0, N2, N3);
// lmul(x, 0, a, b)
- if (N1C && N1C->isNullValue()) {
+ if (N1C && N1C->isZero()) {
// If the high result is unused fold to add(a, b)
if (N->hasNUsesOfValue(0, 0)) {
SDValue Lo = DAG.getNode(ISD::ADD, dl, VT, N2, N3);
diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
index 1b21e1ce195b..1b53d593c130 100644
--- a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -21,9 +21,9 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index 6528154ab0e2..b5a683de33ab 100644
--- a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -21,7 +21,6 @@
#include "llvm/IR/IntrinsicsXCore.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
-#include "llvm/IR/ReplaceConstant.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -90,11 +89,11 @@ static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
if (PredBB->getTerminator()->getNumSuccessors() > 1)
PredBB = SplitEdge(PredBB, PN->getParent());
Instruction *InsertPos = PredBB->getTerminator();
- Instruction *NewInst = createReplacementInstr(CE, InsertPos);
+ Instruction *NewInst = CE->getAsInstruction(InsertPos);
PN->setOperand(I, NewInst);
}
} else if (Instruction *Instr = dyn_cast<Instruction>(WU)) {
- Instruction *NewInst = createReplacementInstr(CE, Instr);
+ Instruction *NewInst = CE->getAsInstruction(Instr);
Instr->replaceUsesOfWith(CE, NewInst);
} else {
ConstantExpr *CExpr = dyn_cast<ConstantExpr>(WU);
@@ -103,7 +102,7 @@ static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
}
}
} while (CE->hasNUsesOrMore(1)); // We need to check because a recursive
- // sibling may have used 'CE' when createReplacementInstr was called.
+ // sibling may have used 'CE' when getAsInstruction was called.
CE->destroyConstant();
return true;
}
diff --git a/llvm/lib/Target/XCore/XCoreSubtarget.cpp b/llvm/lib/Target/XCore/XCoreSubtarget.cpp
index 4b29751c7d06..1be707cb488c 100644
--- a/llvm/lib/Target/XCore/XCoreSubtarget.cpp
+++ b/llvm/lib/Target/XCore/XCoreSubtarget.cpp
@@ -12,7 +12,7 @@
#include "XCoreSubtarget.h"
#include "XCore.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/MC/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
index 046cd6b5db7d..2e49627a19bf 100644
--- a/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -20,8 +20,8 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CodeGen.h"
-#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -99,7 +99,7 @@ bool XCorePassConfig::addInstSelector() {
}
void XCorePassConfig::addPreEmitPass() {
- addPass(createXCoreFrameToArgsOffsetEliminationPass(), false);
+ addPass(createXCoreFrameToArgsOffsetEliminationPass());
}
// Force static initialization.
diff --git a/llvm/lib/TextAPI/TextStub.cpp b/llvm/lib/TextAPI/TextStub.cpp
index 5d85342adb26..b64f19ab65cc 100644
--- a/llvm/lib/TextAPI/TextStub.cpp
+++ b/llvm/lib/TextAPI/TextStub.cpp
@@ -1121,9 +1121,9 @@ TextAPIReader::get(MemoryBufferRef InputBuffer) {
auto File = std::unique_ptr<InterfaceFile>(
const_cast<InterfaceFile *>(Files.front()));
- for (auto Iter = std::next(Files.begin()); Iter != Files.end(); ++Iter)
+ for (const InterfaceFile *FI : llvm::drop_begin(Files))
File->addDocument(
- std::shared_ptr<InterfaceFile>(const_cast<InterfaceFile *>(*Iter)));
+ std::shared_ptr<InterfaceFile>(const_cast<InterfaceFile *>(FI)));
if (YAMLIn.error())
return make_error<StringError>(Ctx.ErrorMessage, YAMLIn.error());
diff --git a/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp b/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
index 961577f126ba..de1634ebed3c 100644
--- a/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
+++ b/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
@@ -12,6 +12,7 @@
#include "llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/Object/COFF.h"
#include "llvm/Object/COFFImportFile.h"
#include "llvm/Object/COFFModuleDefinition.h"
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 85abbf6d86e0..7243e39c9029 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -18,6 +18,7 @@
#include "llvm-c/Transforms/AggressiveInstCombine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -205,8 +206,8 @@ struct MaskOps {
bool FoundAnd1;
MaskOps(unsigned BitWidth, bool MatchAnds)
- : Root(nullptr), Mask(APInt::getNullValue(BitWidth)),
- MatchAndChain(MatchAnds), FoundAnd1(false) {}
+ : Root(nullptr), Mask(APInt::getZero(BitWidth)), MatchAndChain(MatchAnds),
+ FoundAnd1(false) {}
};
/// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a
@@ -377,10 +378,10 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT) {
// Also, we want to avoid matching partial patterns.
// TODO: It would be more efficient if we removed dead instructions
// iteratively in this loop rather than waiting until the end.
- for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
+ for (Instruction &I : llvm::reverse(BB)) {
MadeChange |= foldAnyOrAllBitsSet(I);
MadeChange |= foldGuardedFunnelShift(I, DT);
- MadeChange |= tryToRecognizePopCount(I);
+ MadeChange |= tryToRecognizePopCount(I);
}
}
@@ -394,10 +395,11 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT) {
/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
-static bool runImpl(Function &F, TargetLibraryInfo &TLI, DominatorTree &DT) {
+static bool runImpl(Function &F, AssumptionCache &AC, TargetLibraryInfo &TLI,
+ DominatorTree &DT) {
bool MadeChange = false;
const DataLayout &DL = F.getParent()->getDataLayout();
- TruncInstCombine TIC(TLI, DL, DT);
+ TruncInstCombine TIC(AC, TLI, DL, DT);
MadeChange |= TIC.run(F);
MadeChange |= foldUnusualPatterns(F, DT);
return MadeChange;
@@ -406,6 +408,7 @@ static bool runImpl(Function &F, TargetLibraryInfo &TLI, DominatorTree &DT) {
void AggressiveInstCombinerLegacyPass::getAnalysisUsage(
AnalysisUsage &AU) const {
AU.setPreservesCFG();
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
@@ -415,16 +418,18 @@ void AggressiveInstCombinerLegacyPass::getAnalysisUsage(
}
bool AggressiveInstCombinerLegacyPass::runOnFunction(Function &F) {
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- return runImpl(F, TLI, DT);
+ return runImpl(F, AC, TLI, DT);
}
PreservedAnalyses AggressiveInstCombinePass::run(Function &F,
FunctionAnalysisManager &AM) {
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- if (!runImpl(F, TLI, DT)) {
+ if (!runImpl(F, AC, TLI, DT)) {
// No changes, all analyses are preserved.
return PreservedAnalyses::all();
}
@@ -438,6 +443,7 @@ char AggressiveInstCombinerLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(AggressiveInstCombinerLegacyPass,
"aggressive-instcombine",
"Combine pattern based expressions", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(AggressiveInstCombinerLegacyPass, "aggressive-instcombine",
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
index 42bcadfc7dcd..5d69e26d6ecc 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
@@ -17,6 +17,8 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/KnownBits.h"
using namespace llvm;
@@ -39,16 +41,18 @@ using namespace llvm;
//===----------------------------------------------------------------------===//
namespace llvm {
- class DataLayout;
- class DominatorTree;
- class Function;
- class Instruction;
- class TargetLibraryInfo;
- class TruncInst;
- class Type;
- class Value;
+class AssumptionCache;
+class DataLayout;
+class DominatorTree;
+class Function;
+class Instruction;
+class TargetLibraryInfo;
+class TruncInst;
+class Type;
+class Value;
class TruncInstCombine {
+ AssumptionCache &AC;
TargetLibraryInfo &TLI;
const DataLayout &DL;
const DominatorTree &DT;
@@ -75,9 +79,9 @@ class TruncInstCombine {
MapVector<Instruction *, Info> InstInfoMap;
public:
- TruncInstCombine(TargetLibraryInfo &TLI, const DataLayout &DL,
- const DominatorTree &DT)
- : TLI(TLI), DL(DL), DT(DT), CurrentTruncInst(nullptr) {}
+ TruncInstCombine(AssumptionCache &AC, TargetLibraryInfo &TLI,
+ const DataLayout &DL, const DominatorTree &DT)
+ : AC(AC), TLI(TLI), DL(DL), DT(DT), CurrentTruncInst(nullptr) {}
/// Perform TruncInst pattern optimization on given function.
bool run(Function &F);
@@ -104,6 +108,18 @@ private:
/// to be reduced.
Type *getBestTruncatedType();
+ KnownBits computeKnownBits(const Value *V) const {
+ return llvm::computeKnownBits(V, DL, /*Depth=*/0, &AC,
+ /*CtxI=*/cast<Instruction>(CurrentTruncInst),
+ &DT);
+ }
+
+ unsigned ComputeNumSignBits(const Value *V) const {
+ return llvm::ComputeNumSignBits(
+ V, DL, /*Depth=*/0, &AC, /*CtxI=*/cast<Instruction>(CurrentTruncInst),
+ &DT);
+ }
+
/// Given a \p V value and a \p SclTy scalar type, return the generated reduced
/// value of \p V based on the type \p SclTy.
///
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
index 16b82219e8ca..abac3f801a22 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -33,6 +33,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
+#include "llvm/Support/KnownBits.h"
using namespace llvm;
@@ -61,9 +62,18 @@ static void getRelevantOperands(Instruction *I, SmallVectorImpl<Value *> &Ops) {
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::UDiv:
+ case Instruction::URem:
+ case Instruction::InsertElement:
Ops.push_back(I->getOperand(0));
Ops.push_back(I->getOperand(1));
break;
+ case Instruction::ExtractElement:
+ Ops.push_back(I->getOperand(0));
+ break;
case Instruction::Select:
Ops.push_back(I->getOperand(1));
Ops.push_back(I->getOperand(2));
@@ -127,6 +137,13 @@ bool TruncInstCombine::buildTruncExpressionDag() {
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::UDiv:
+ case Instruction::URem:
+ case Instruction::InsertElement:
+ case Instruction::ExtractElement:
case Instruction::Select: {
SmallVector<Value *, 2> Operands;
getRelevantOperands(I, Operands);
@@ -135,10 +152,9 @@ bool TruncInstCombine::buildTruncExpressionDag() {
}
default:
// TODO: Can handle more cases here:
- // 1. shufflevector, extractelement, insertelement
- // 2. udiv, urem
- // 3. shl, lshr, ashr
- // 4. phi node(and loop handling)
+ // 1. shufflevector
+ // 2. sdiv, srem
+ // 3. phi node (and loop handling)
// ...
return false;
}
@@ -270,6 +286,50 @@ Type *TruncInstCombine::getBestTruncatedType() {
unsigned OrigBitWidth =
CurrentTruncInst->getOperand(0)->getType()->getScalarSizeInBits();
+ // Initialize MinBitWidth for shift instructions with the minimum number
+ // that is greater than the shift amount (i.e. shift amount + 1).
+ // For `lshr`, raise MinBitWidth so that all potentially truncated
+ // bits of the value-to-be-shifted are zeros.
+ // For `ashr`, raise MinBitWidth so that all potentially truncated
+ // bits of the value-to-be-shifted are sign bits (all zeros or all ones),
+ // and so that the lowest untruncated bit is also a sign bit.
+ // Exit early if MinBitWidth is not less than the original bit width.
+ for (auto &Itr : InstInfoMap) {
+ Instruction *I = Itr.first;
+ if (I->isShift()) {
+ KnownBits KnownRHS = computeKnownBits(I->getOperand(1));
+ unsigned MinBitWidth = KnownRHS.getMaxValue()
+ .uadd_sat(APInt(OrigBitWidth, 1))
+ .getLimitedValue(OrigBitWidth);
+ if (MinBitWidth == OrigBitWidth)
+ return nullptr;
+ if (I->getOpcode() == Instruction::LShr) {
+ KnownBits KnownLHS = computeKnownBits(I->getOperand(0));
+ MinBitWidth =
+ std::max(MinBitWidth, KnownLHS.getMaxValue().getActiveBits());
+ }
+ if (I->getOpcode() == Instruction::AShr) {
+ unsigned NumSignBits = ComputeNumSignBits(I->getOperand(0));
+ MinBitWidth = std::max(MinBitWidth, OrigBitWidth - NumSignBits + 1);
+ }
+ if (MinBitWidth >= OrigBitWidth)
+ return nullptr;
+ Itr.second.MinBitWidth = MinBitWidth;
+ }
+ if (I->getOpcode() == Instruction::UDiv ||
+ I->getOpcode() == Instruction::URem) {
+ unsigned MinBitWidth = 0;
+ for (const auto &Op : I->operands()) {
+ KnownBits Known = computeKnownBits(Op);
+ MinBitWidth =
+ std::max(Known.getMaxValue().getActiveBits(), MinBitWidth);
+ if (MinBitWidth >= OrigBitWidth)
+ return nullptr;
+ }
+ Itr.second.MinBitWidth = MinBitWidth;
+ }
+ }
+
// Calculate minimum allowed bit-width allowed for shrinking the currently
// visited truncate's operand.
unsigned MinBitWidth = getMinBitWidth();
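A minimal sketch, not from the patch, of the bit-width reasoning the shift handling above relies on (the helper name is illustrative): the largest shift amount consistent with the known bits bounds the narrowest type the shift can be performed in, i.e. shift amount + 1 bits, clamped to the original width.

    #include "llvm/ADT/APInt.h"
    #include "llvm/Support/KnownBits.h"
    using namespace llvm;

    // KnownAmt is assumed to already have OrigBitWidth bits.
    static unsigned minWidthForShiftAmount(const KnownBits &KnownAmt,
                                           unsigned OrigBitWidth) {
      // getMaxValue() is the largest amount consistent with the known bits;
      // the shifted value needs at least one more bit than that amount.
      return KnownAmt.getMaxValue()
          .uadd_sat(APInt(OrigBitWidth, 1))
          .getLimitedValue(OrigBitWidth);
    }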
@@ -356,10 +416,32 @@ void TruncInstCombine::ReduceExpressionDag(Type *SclTy) {
case Instruction::Mul:
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor: {
+ case Instruction::Xor:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::UDiv:
+ case Instruction::URem: {
Value *LHS = getReducedOperand(I->getOperand(0), SclTy);
Value *RHS = getReducedOperand(I->getOperand(1), SclTy);
Res = Builder.CreateBinOp((Instruction::BinaryOps)Opc, LHS, RHS);
+ // Preserve `exact` flag since truncation doesn't change exactness
+ if (auto *PEO = dyn_cast<PossiblyExactOperator>(I))
+ if (auto *ResI = dyn_cast<Instruction>(Res))
+ ResI->setIsExact(PEO->isExact());
+ break;
+ }
+ case Instruction::ExtractElement: {
+ Value *Vec = getReducedOperand(I->getOperand(0), SclTy);
+ Value *Idx = I->getOperand(1);
+ Res = Builder.CreateExtractElement(Vec, Idx);
+ break;
+ }
+ case Instruction::InsertElement: {
+ Value *Vec = getReducedOperand(I->getOperand(0), SclTy);
+ Value *NewElt = getReducedOperand(I->getOperand(1), SclTy);
+ Value *Idx = I->getOperand(2);
+ Res = Builder.CreateInsertElement(Vec, NewElt, Idx);
break;
}
case Instruction::Select: {
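A hypothetical IR illustration, not from the patch, of what the new shift handling permits once the known-bits checks succeed, for instance when bits 8..31 of %x are proven zero:

    // Before:
    //   %s = lshr i32 %x, 3
    //   %t = trunc i32 %s to i8
    // After (valid because the truncated-away bits are known zero):
    //   %x.t = trunc i32 %x to i8
    //   %t   = lshr i8 %x.t, 3
    // An `exact` flag on the original udiv/lshr/ashr is copied onto the
    // narrowed instruction, since narrowing both operands does not change
    // exactness.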
diff --git a/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp b/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
index 5b09cdb35791..67f8828e4c75 100644
--- a/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
@@ -56,8 +56,10 @@ static void lowerSubFn(IRBuilder<> &Builder, CoroSubFnInst *SubFn) {
bool Lowerer::lowerRemainingCoroIntrinsics(Function &F) {
bool Changed = false;
- for (auto IB = inst_begin(F), E = inst_end(F); IB != E;) {
- Instruction &I = *IB++;
+ bool IsPrivateAndUnprocessed =
+ F.hasFnAttribute(CORO_PRESPLIT_ATTR) && F.hasLocalLinkage();
+
+ for (Instruction &I : llvm::make_early_inc_range(instructions(F))) {
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
switch (II->getIntrinsicID()) {
default:
@@ -71,6 +73,10 @@ bool Lowerer::lowerRemainingCoroIntrinsics(Function &F) {
case Intrinsic::coro_alloc:
II->replaceAllUsesWith(ConstantInt::getTrue(Context));
break;
+ case Intrinsic::coro_async_resume:
+ II->replaceAllUsesWith(
+ ConstantPointerNull::get(cast<PointerType>(I.getType())));
+ break;
case Intrinsic::coro_id:
case Intrinsic::coro_id_retcon:
case Intrinsic::coro_id_retcon_once:
@@ -80,6 +86,13 @@ bool Lowerer::lowerRemainingCoroIntrinsics(Function &F) {
case Intrinsic::coro_subfn_addr:
lowerSubFn(Builder, cast<CoroSubFnInst>(II));
break;
+ case Intrinsic::coro_end:
+ case Intrinsic::coro_suspend_retcon:
+ if (IsPrivateAndUnprocessed) {
+ II->replaceAllUsesWith(UndefValue::get(II->getType()));
+ } else
+ continue;
+ break;
case Intrinsic::coro_async_size_replace:
auto *Target = cast<ConstantStruct>(
cast<GlobalVariable>(II->getArgOperand(0)->stripPointerCasts())
@@ -115,7 +128,8 @@ static bool declaresCoroCleanupIntrinsics(const Module &M) {
return coro::declaresIntrinsics(
M, {"llvm.coro.alloc", "llvm.coro.begin", "llvm.coro.subfn.addr",
"llvm.coro.free", "llvm.coro.id", "llvm.coro.id.retcon",
- "llvm.coro.id.retcon.once", "llvm.coro.async.size.replace"});
+ "llvm.coro.id.retcon.once", "llvm.coro.async.size.replace",
+ "llvm.coro.async.resume"});
}
PreservedAnalyses CoroCleanupPass::run(Function &F,
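A self-contained sketch, not from the patch, of the llvm::make_early_inc_range idiom adopted above (helper and callee check are illustrative); the range advances the iterator before the body runs, so the current instruction can be erased safely:

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/InstIterator.h"
    #include "llvm/IR/InstrTypes.h"
    using namespace llvm;

    static void eraseDeadCallsTo(Function &F, const Function *Callee) {
      for (Instruction &I : make_early_inc_range(instructions(F)))
        if (auto *CB = dyn_cast<CallBase>(&I))
          if (CB->getCalledFunction() == Callee && CB->use_empty())
            CB->eraseFromParent(); // Safe: the range already stepped past I.
    }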
diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
index 5e5e513cdfda..68a34bdcb1cd 100644
--- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
@@ -150,8 +150,7 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) {
CoroIdInst *CoroId = nullptr;
SmallVector<CoroFreeInst *, 4> CoroFrees;
bool HasCoroSuspend = false;
- for (auto IB = inst_begin(F), IE = inst_end(F); IB != IE;) {
- Instruction &I = *IB++;
+ for (Instruction &I : llvm::make_early_inc_range(instructions(F))) {
if (auto *CB = dyn_cast<CallBase>(&I)) {
switch (CB->getIntrinsicID()) {
default:
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index beae5fdac8ab..ac3d078714ce 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -16,6 +16,7 @@
#include "CoroInternal.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/StackLifetime.h"
@@ -435,7 +436,7 @@ private:
DenseMap<Value*, unsigned> FieldIndexByKey;
public:
- FrameTypeBuilder(LLVMContext &Context, DataLayout const &DL,
+ FrameTypeBuilder(LLVMContext &Context, const DataLayout &DL,
Optional<Align> MaxFrameAlignment)
: DL(DL), Context(Context), MaxFrameAlignment(MaxFrameAlignment) {}
@@ -576,13 +577,8 @@ void FrameTypeBuilder::addFieldForAllocas(const Function &F,
using AllocaSetType = SmallVector<AllocaInst *, 4>;
SmallVector<AllocaSetType, 4> NonOverlapedAllocas;
- // We need to add field for allocas at the end of this function. However, this
- // function has multiple exits, so we use this helper to avoid redundant code.
- struct RTTIHelper {
- std::function<void()> func;
- RTTIHelper(std::function<void()> &&func) : func(func) {}
- ~RTTIHelper() { func(); }
- } Helper([&]() {
+ // We need to add fields for the allocas at the end of this function.
+ auto AddFieldForAllocasAtExit = make_scope_exit([&]() {
for (auto AllocaList : NonOverlapedAllocas) {
auto *LargestAI = *AllocaList.begin();
FieldIDType Id = addFieldForAlloca(LargestAI);
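A minimal sketch of the llvm::make_scope_exit idiom that replaces the hand-rolled RAII helper above; the callable runs when the guard goes out of scope, covering every early return without duplicating the cleanup:

    #include "llvm/ADT/ScopeExit.h"
    #include <cstdio>

    static int process(bool Bail) {
      auto Cleanup = llvm::make_scope_exit([] { std::puts("cleanup runs"); });
      if (Bail)
        return 1; // Cleanup still runs here...
      return 0;   // ...and here.
    }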
@@ -840,8 +836,9 @@ static StringRef solveTypeName(Type *Ty) {
return "UnknownType";
}
-static DIType *solveDIType(DIBuilder &Builder, Type *Ty, DataLayout &Layout,
- DIScope *Scope, unsigned LineNum,
+static DIType *solveDIType(DIBuilder &Builder, Type *Ty,
+ const DataLayout &Layout, DIScope *Scope,
+ unsigned LineNum,
DenseMap<Type *, DIType *> &DITypeCache) {
if (DIType *DT = DITypeCache.lookup(Ty))
return DT;
@@ -1348,13 +1345,17 @@ struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
}
void visitIntrinsicInst(IntrinsicInst &II) {
- if (II.getIntrinsicID() != Intrinsic::lifetime_start)
+ // If the lifetime markers refer to a subrange of the
+ // original alloca, ignore them to avoid misleading
+ // the analysis.
+ if (II.getIntrinsicID() != Intrinsic::lifetime_start || !IsOffsetKnown ||
+ !Offset.isZero())
return Base::visitIntrinsicInst(II);
LifetimeStarts.insert(&II);
}
void visitCallBase(CallBase &CB) {
- for (unsigned Op = 0, OpCount = CB.getNumArgOperands(); Op < OpCount; ++Op)
+ for (unsigned Op = 0, OpCount = CB.arg_size(); Op < OpCount; ++Op)
if (U->get() == CB.getArgOperand(Op) && !CB.doesNotCapture(Op))
PI.setEscaped(&CB);
handleMayWrite(CB);
@@ -1868,8 +1869,7 @@ static void cleanupSinglePredPHIs(Function &F) {
}
}
while (!Worklist.empty()) {
- auto *Phi = Worklist.back();
- Worklist.pop_back();
+ auto *Phi = Worklist.pop_back_val();
auto *OriginalValue = Phi->getIncomingValue(0);
Phi->replaceAllUsesWith(OriginalValue);
}
@@ -1984,14 +1984,15 @@ static void rewriteMaterializableInstructions(IRBuilder<> &IRB,
if (CurrentBlock != U->getParent()) {
bool IsInCoroSuspendBlock = isa<AnyCoroSuspendInst>(U);
- CurrentBlock = IsInCoroSuspendBlock
- ? U->getParent()->getSinglePredecessor()
- : U->getParent();
+ CurrentBlock = U->getParent();
+ auto *InsertBlock = IsInCoroSuspendBlock
+ ? CurrentBlock->getSinglePredecessor()
+ : CurrentBlock;
CurrentMaterialization = cast<Instruction>(Def)->clone();
CurrentMaterialization->setName(Def->getName());
CurrentMaterialization->insertBefore(
- IsInCoroSuspendBlock ? CurrentBlock->getTerminator()
- : &*CurrentBlock->getFirstInsertionPt());
+ IsInCoroSuspendBlock ? InsertBlock->getTerminator()
+ : &*InsertBlock->getFirstInsertionPt());
}
if (auto *PN = dyn_cast<PHINode>(U)) {
assert(PN->getNumIncomingValues() == 1 &&
@@ -2244,12 +2245,7 @@ static Value *emitSetAndGetSwiftErrorValueAround(Instruction *Call,
/// intrinsics and attempting to MemToReg the alloca away.
static void eliminateSwiftErrorAlloca(Function &F, AllocaInst *Alloca,
coro::Shape &Shape) {
- for (auto UI = Alloca->use_begin(), UE = Alloca->use_end(); UI != UE; ) {
- // We're likely changing the use list, so use a mutation-safe
- // iteration pattern.
- auto &Use = *UI;
- ++UI;
-
+ for (Use &Use : llvm::make_early_inc_range(Alloca->uses())) {
// swifterror values can only be used in very specific ways.
// We take advantage of that here.
auto User = Use.getUser();
@@ -2510,11 +2506,11 @@ void coro::salvageDebugInfo(
DIExpression *Expr = DVI->getExpression();
// Follow the pointer arithmetic all the way to the incoming
// function argument and convert into a DIExpression.
- bool OutermostLoad = true;
+ bool SkipOutermostLoad = !isa<DbgValueInst>(DVI);
Value *Storage = DVI->getVariableLocationOp(0);
Value *OriginalStorage = Storage;
- while (Storage) {
- if (auto *LdInst = dyn_cast<LoadInst>(Storage)) {
+ while (auto *Inst = dyn_cast_or_null<Instruction>(Storage)) {
+ if (auto *LdInst = dyn_cast<LoadInst>(Inst)) {
Storage = LdInst->getOperand(0);
// FIXME: This is a heuristic that works around the fact that
// LLVM IR debug intrinsics cannot yet distinguish between
@@ -2522,26 +2518,25 @@ void coro::salvageDebugInfo(
// implicitly a memory location no DW_OP_deref operation for the
// last direct load from an alloca is necessary. This condition
// effectively drops the *last* DW_OP_deref in the expression.
- if (!OutermostLoad)
+ if (!SkipOutermostLoad)
Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore);
- OutermostLoad = false;
- } else if (auto *StInst = dyn_cast<StoreInst>(Storage)) {
+ } else if (auto *StInst = dyn_cast<StoreInst>(Inst)) {
Storage = StInst->getOperand(0);
- } else if (auto *GEPInst = dyn_cast<GetElementPtrInst>(Storage)) {
- SmallVector<Value *> AdditionalValues;
- DIExpression *SalvagedExpr = llvm::salvageDebugInfoImpl(
- *GEPInst, Expr,
- /*WithStackValue=*/false, 0, AdditionalValues);
- // Debug declares cannot currently handle additional location
- // operands.
- if (!SalvagedExpr || !AdditionalValues.empty())
+ } else {
+ SmallVector<uint64_t, 16> Ops;
+ SmallVector<Value *, 0> AdditionalValues;
+ Value *Op = llvm::salvageDebugInfoImpl(
+ *Inst, Expr ? Expr->getNumLocationOperands() : 0, Ops,
+ AdditionalValues);
+ if (!Op || !AdditionalValues.empty()) {
+ // If salvaging failed or salvaging produced more than one location
+ // operand, give up.
break;
- Expr = SalvagedExpr;
- Storage = GEPInst->getOperand(0);
- } else if (auto *BCInst = dyn_cast<llvm::BitCastInst>(Storage))
- Storage = BCInst->getOperand(0);
- else
- break;
+ }
+ Storage = Op;
+ Expr = DIExpression::appendOpsToArg(Expr, Ops, 0, /*StackValue*/ false);
+ }
+ SkipOutermostLoad = false;
}
if (!Storage)
return;
diff --git a/llvm/lib/Transforms/Coroutines/CoroInstr.h b/llvm/lib/Transforms/Coroutines/CoroInstr.h
index 5ed800d67fe9..bf3d781ba43e 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInstr.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInstr.h
@@ -638,7 +638,7 @@ public:
void checkWellFormed() const;
Function *getMustTailCallFunction() const {
- if (getNumArgOperands() < 3)
+ if (arg_size() < 3)
return nullptr;
return cast<Function>(
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index b6932dbbfc3f..fa1d92f439b8 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -520,8 +520,8 @@ void CoroCloner::replaceRetconOrAsyncSuspendUses() {
}
// Try to peephole extracts of an aggregate return.
- for (auto UI = NewS->use_begin(), UE = NewS->use_end(); UI != UE; ) {
- auto EVI = dyn_cast<ExtractValueInst>((UI++)->getUser());
+ for (Use &U : llvm::make_early_inc_range(NewS->uses())) {
+ auto *EVI = dyn_cast<ExtractValueInst>(U.getUser());
if (!EVI || EVI->getNumIndices() != 1)
continue;
@@ -622,12 +622,12 @@ static void replaceSwiftErrorOps(Function &F, coro::Shape &Shape,
// If there are no arguments, this is a 'get' operation.
Value *MappedResult;
- if (Op->getNumArgOperands() == 0) {
+ if (Op->arg_empty()) {
auto ValueTy = Op->getType();
auto Slot = getSwiftErrorSlot(ValueTy);
MappedResult = Builder.CreateLoad(ValueTy, Slot);
} else {
- assert(Op->getNumArgOperands() == 1);
+ assert(Op->arg_size() == 1);
auto Value = MappedOp->getArgOperand(0);
auto ValueTy = Value->getType();
auto Slot = getSwiftErrorSlot(ValueTy);
@@ -669,7 +669,7 @@ void CoroCloner::salvageDebugInfo() {
for (DbgVariableIntrinsic *DVI : Worklist) {
if (IsUnreachableBlock(DVI->getParent()))
DVI->eraseFromParent();
- else if (dyn_cast_or_null<AllocaInst>(DVI->getVariableLocationOp(0))) {
+ else if (isa_and_nonnull<AllocaInst>(DVI->getVariableLocationOp(0))) {
// Count all non-debuginfo uses in reachable blocks.
unsigned Uses = 0;
for (auto *User : DVI->getVariableLocationOp(0)->users())
@@ -738,8 +738,7 @@ void CoroCloner::replaceEntryBlock() {
// entry needs to be moved to the new entry.
Function *F = OldEntry->getParent();
DominatorTree DT{*F};
- for (auto IT = inst_begin(F), End = inst_end(F); IT != End;) {
- Instruction &I = *IT++;
+ for (Instruction &I : llvm::make_early_inc_range(instructions(F))) {
auto *Alloca = dyn_cast<AllocaInst>(&I);
if (!Alloca || I.use_empty())
continue;
@@ -773,9 +772,8 @@ Value *CoroCloner::deriveNewFramePointer() {
auto DbgLoc =
cast<CoroSuspendAsyncInst>(VMap[ActiveSuspend])->getDebugLoc();
// Calling i8* (i8*)
- auto *CallerContext = Builder.CreateCall(
- cast<FunctionType>(ProjectionFunc->getType()->getPointerElementType()),
- ProjectionFunc, CalleeContext);
+ auto *CallerContext = Builder.CreateCall(ProjectionFunc->getFunctionType(),
+ ProjectionFunc, CalleeContext);
CallerContext->setCallingConv(ProjectionFunc->getCallingConv());
CallerContext->setDebugLoc(DbgLoc);
// The frame is located after the async_context header.
@@ -906,8 +904,7 @@ void CoroCloner::create() {
case coro::ABI::Switch:
// Bootstrap attributes by copying function attributes from the
// original function. This should include optimization settings and so on.
- NewAttrs = NewAttrs.addAttributes(Context, AttributeList::FunctionIndex,
- OrigAttrs.getFnAttributes());
+ NewAttrs = NewAttrs.addFnAttributes(Context, OrigAttrs.getFnAttrs());
addFramePointerAttrs(NewAttrs, Context, 0,
Shape.FrameSize, Shape.FrameAlign);
@@ -929,9 +926,8 @@ void CoroCloner::create() {
}
// Transfer the original function's attributes.
- auto FnAttrs = OrigF.getAttributes().getFnAttributes();
- NewAttrs =
- NewAttrs.addAttributes(Context, AttributeList::FunctionIndex, FnAttrs);
+ auto FnAttrs = OrigF.getAttributes().getFnAttrs();
+ NewAttrs = NewAttrs.addFnAttributes(Context, FnAttrs);
break;
}
case coro::ABI::Retcon:
@@ -1144,11 +1140,13 @@ static void updateCoroFrame(coro::Shape &Shape, Function *ResumeFn,
static void postSplitCleanup(Function &F) {
removeUnreachableBlocks(F);
+#ifndef NDEBUG
// For now, we do a mandatory verification step because we don't
// entirely trust this pass. Note that we don't want to add a verifier
// pass to FPM below because it will also verify all the global data.
if (verifyFunction(F, &errs()))
report_fatal_error("Broken function");
+#endif
}
// Assuming we arrived at the block NewBlock from Prev instruction, store
@@ -1262,7 +1260,7 @@ static bool shouldBeMustTail(const CallInst &CI, const Function &F) {
Attribute::SwiftSelf, Attribute::SwiftError};
AttributeList Attrs = CI.getAttributes();
for (auto AK : ABIAttrs)
- if (Attrs.hasParamAttribute(0, AK))
+ if (Attrs.hasParamAttr(0, AK))
return false;
return true;
@@ -1357,7 +1355,7 @@ static bool hasCallsInBlocksBetween(BasicBlock *SaveBB, BasicBlock *ResDesBB) {
auto *BB = Worklist.pop_back_val();
Set.insert(BB);
for (auto *Pred : predecessors(BB))
- if (Set.count(Pred) == 0)
+ if (!Set.contains(Pred))
Worklist.push_back(Pred);
}
@@ -1547,8 +1545,7 @@ static void coerceArguments(IRBuilder<> &Builder, FunctionType *FnTy,
CallInst *coro::createMustTailCall(DebugLoc Loc, Function *MustTailCallFn,
ArrayRef<Value *> Arguments,
IRBuilder<> &Builder) {
- auto *FnTy =
- cast<FunctionType>(MustTailCallFn->getType()->getPointerElementType());
+ auto *FnTy = MustTailCallFn->getFunctionType();
// Coerce the arguments, llvm optimizations seem to ignore the types in
// vaarg functions and throws away casts in optimized mode.
SmallVector<Value *, 8> CallArgs;
@@ -1568,8 +1565,8 @@ static void splitAsyncCoroutine(Function &F, coro::Shape &Shape,
// Reset various things that the optimizer might have decided it
// "knows" about the coroutine function due to not seeing a return.
F.removeFnAttr(Attribute::NoReturn);
- F.removeAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
- F.removeAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ F.removeRetAttr(Attribute::NoAlias);
+ F.removeRetAttr(Attribute::NonNull);
auto &Context = F.getContext();
auto *Int8PtrTy = Type::getInt8PtrTy(Context);
@@ -1667,8 +1664,8 @@ static void splitRetconCoroutine(Function &F, coro::Shape &Shape,
// Reset various things that the optimizer might have decided it
// "knows" about the coroutine function due to not seeing a return.
F.removeFnAttr(Attribute::NoReturn);
- F.removeAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
- F.removeAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ F.removeRetAttr(Attribute::NoAlias);
+ F.removeRetAttr(Attribute::NonNull);
// Allocate the frame.
auto *Id = cast<AnyCoroIdRetconInst>(Shape.CoroBegin->getId());
@@ -1977,9 +1974,9 @@ static void replacePrepare(CallInst *Prepare, LazyCallGraph &CG,
// %2 = bitcast %1 to [[TYPE]]
// ==>
// %2 = @some_function
- for (auto UI = Prepare->use_begin(), UE = Prepare->use_end(); UI != UE;) {
+ for (Use &U : llvm::make_early_inc_range(Prepare->uses())) {
// Look for bitcasts back to the original function type.
- auto *Cast = dyn_cast<BitCastInst>((UI++)->getUser());
+ auto *Cast = dyn_cast<BitCastInst>(U.getUser());
if (!Cast || Cast->getType() != Fn->getType())
continue;
@@ -2019,10 +2016,9 @@ static void replacePrepare(CallInst *Prepare, CallGraph &CG) {
// %2 = bitcast %1 to [[TYPE]]
// ==>
// %2 = @some_function
- for (auto UI = Prepare->use_begin(), UE = Prepare->use_end();
- UI != UE; ) {
+ for (Use &U : llvm::make_early_inc_range(Prepare->uses())) {
// Look for bitcasts back to the original function type.
- auto *Cast = dyn_cast<BitCastInst>((UI++)->getUser());
+ auto *Cast = dyn_cast<BitCastInst>(U.getUser());
if (!Cast || Cast->getType() != Fn->getType()) continue;
// Check whether the replacement will introduce new direct calls.
@@ -2059,9 +2055,9 @@ static void replacePrepare(CallInst *Prepare, CallGraph &CG) {
static bool replaceAllPrepares(Function *PrepareFn, LazyCallGraph &CG,
LazyCallGraph::SCC &C) {
bool Changed = false;
- for (auto PI = PrepareFn->use_begin(), PE = PrepareFn->use_end(); PI != PE;) {
+ for (Use &P : llvm::make_early_inc_range(PrepareFn->uses())) {
// Intrinsics can only be used in calls.
- auto *Prepare = cast<CallInst>((PI++)->getUser());
+ auto *Prepare = cast<CallInst>(P.getUser());
replacePrepare(Prepare, CG, C);
Changed = true;
}
@@ -2077,10 +2073,9 @@ static bool replaceAllPrepares(Function *PrepareFn, LazyCallGraph &CG,
/// switch coroutines, which are lowered in multiple stages).
static bool replaceAllPrepares(Function *PrepareFn, CallGraph &CG) {
bool Changed = false;
- for (auto PI = PrepareFn->use_begin(), PE = PrepareFn->use_end();
- PI != PE; ) {
+ for (Use &P : llvm::make_early_inc_range(PrepareFn->uses())) {
// Intrinsics can only be used in calls.
- auto *Prepare = cast<CallInst>((PI++)->getUser());
+ auto *Prepare = cast<CallInst>(P.getUser());
replacePrepare(Prepare, CG);
Changed = true;
}
diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index ae2d9e192c87..e4883ef89db7 100644
--- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -126,6 +126,7 @@ static bool isCoroutineIntrinsicName(StringRef Name) {
"llvm.coro.alloc",
"llvm.coro.async.context.alloc",
"llvm.coro.async.context.dealloc",
+ "llvm.coro.async.resume",
"llvm.coro.async.size.replace",
"llvm.coro.async.store_resume",
"llvm.coro.begin",
@@ -311,10 +312,9 @@ void coro::Shape::buildFrom(Function &F) {
if (CoroBegin)
report_fatal_error(
"coroutine should have exactly one defining @llvm.coro.begin");
- CB->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
- CB->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
- CB->removeAttribute(AttributeList::FunctionIndex,
- Attribute::NoDuplicate);
+ CB->addRetAttr(Attribute::NonNull);
+ CB->addRetAttr(Attribute::NoAlias);
+ CB->removeFnAttr(Attribute::NoDuplicate);
CoroBegin = CB;
break;
}
@@ -571,8 +571,8 @@ void coro::Shape::emitDealloc(IRBuilder<> &Builder, Value *Ptr,
llvm_unreachable("Unknown coro::ABI enum");
}
-LLVM_ATTRIBUTE_NORETURN
-static void fail(const Instruction *I, const char *Reason, Value *V) {
+[[noreturn]] static void fail(const Instruction *I, const char *Reason,
+ Value *V) {
#ifndef NDEBUG
I->dump();
if (V) {
@@ -722,7 +722,7 @@ void CoroAsyncEndInst::checkWellFormed() const {
return;
auto *FnTy =
cast<FunctionType>(MustTailCallFunc->getType()->getPointerElementType());
- if (FnTy->getNumParams() != (getNumArgOperands() - 3))
+ if (FnTy->getNumParams() != (arg_size() - 3))
fail(this,
"llvm.coro.end.async must tail call function argument type must "
"match the tail arguments",
diff --git a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
index 532599b42e0d..01e724e22dcf 100644
--- a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
+++ b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
@@ -73,8 +73,8 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M,
},
ORE);
assert(OIC);
- emitInlinedInto(ORE, CB->getDebugLoc(), CB->getParent(), F, *Caller,
- *OIC, false, DEBUG_TYPE);
+ emitInlinedIntoBasedOnCost(ORE, CB->getDebugLoc(), CB->getParent(), F,
+ *Caller, *OIC, false, DEBUG_TYPE);
InlineFunctionInfo IFI(
/*cg=*/nullptr, GetAssumptionCache, &PSI,
@@ -108,8 +108,10 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M,
// Delete the non-comdat ones from the module and also from our vector.
auto NonComdatBegin = partition(
InlinedFunctions, [&](Function *F) { return F->hasComdat(); });
- for (Function *F : make_range(NonComdatBegin, InlinedFunctions.end()))
+ for (Function *F : make_range(NonComdatBegin, InlinedFunctions.end())) {
M.getFunctionList().erase(F);
+ Changed = true;
+ }
InlinedFunctions.erase(NonComdatBegin, InlinedFunctions.end());
if (!InlinedFunctions.empty()) {
@@ -117,8 +119,10 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M,
// are not actually dead.
filterDeadComdatFunctions(M, InlinedFunctions);
// The remaining functions are actually dead.
- for (Function *F : InlinedFunctions)
+ for (Function *F : InlinedFunctions) {
M.getFunctionList().erase(F);
+ Changed = true;
+ }
}
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index f670a101767e..93bb11433775 100644
--- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -148,7 +148,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
} else if (!ArgsToPromote.count(&*I)) {
// Unchanged argument
Params.push_back(I->getType());
- ArgAttrVec.push_back(PAL.getParamAttributes(ArgNo));
+ ArgAttrVec.push_back(PAL.getParamAttrs(ArgNo));
} else if (I->use_empty()) {
// Dead argument (which are always marked as promotable)
++NumArgumentsDead;
@@ -177,9 +177,8 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
// Since loads will only have a single operand, and GEPs only a single
// non-index operand, this will record direct loads without any indices,
// and gep+loads with the GEP indices.
- for (User::op_iterator II = UI->op_begin() + 1, IE = UI->op_end();
- II != IE; ++II)
- Indices.push_back(cast<ConstantInt>(*II)->getSExtValue());
+ for (const Use &I : llvm::drop_begin(UI->operands()))
+ Indices.push_back(cast<ConstantInt>(I)->getSExtValue());
// GEPs with a single 0 index can be merged with direct loads
if (Indices.size() == 1 && Indices.front() == 0)
Indices.clear();
@@ -231,8 +230,8 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
// Recompute the parameter attributes list based on the new arguments for
// the function.
- NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttributes(),
- PAL.getRetAttributes(), ArgAttrVec));
+ NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttrs(),
+ PAL.getRetAttrs(), ArgAttrVec));
ArgAttrVec.clear();
F->getParent()->getFunctionList().insert(F->getIterator(), NF);
@@ -257,7 +256,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
++I, ++AI, ++ArgNo)
if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
Args.push_back(*AI); // Unmodified argument
- ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo));
+ ArgAttrVec.push_back(CallPAL.getParamAttrs(ArgNo));
} else if (ByValArgsToTransform.count(&*I)) {
// Emit a GEP and load for each element of the struct.
Type *AgTy = I->getParamByValType();
@@ -313,9 +312,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
IRB.CreateLoad(OrigLoad->getType(), V, V->getName() + ".val");
newLoad->setAlignment(OrigLoad->getAlign());
// Transfer the AA info too.
- AAMDNodes AAInfo;
- OrigLoad->getAAMetadata(AAInfo);
- newLoad->setAAMetadata(AAInfo);
+ newLoad->setAAMetadata(OrigLoad->getAAMetadata());
Args.push_back(newLoad);
ArgAttrVec.push_back(AttributeSet());
@@ -325,7 +322,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
// Push any varargs arguments on the list.
for (; AI != CB.arg_end(); ++AI, ++ArgNo) {
Args.push_back(*AI);
- ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo));
+ ArgAttrVec.push_back(CallPAL.getParamAttrs(ArgNo));
}
SmallVector<OperandBundleDef, 1> OpBundles;
@@ -341,9 +338,9 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
NewCS = NewCall;
}
NewCS->setCallingConv(CB.getCallingConv());
- NewCS->setAttributes(
- AttributeList::get(F->getContext(), CallPAL.getFnAttributes(),
- CallPAL.getRetAttributes(), ArgAttrVec));
+ NewCS->setAttributes(AttributeList::get(F->getContext(),
+ CallPAL.getFnAttrs(),
+ CallPAL.getRetAttrs(), ArgAttrVec));
NewCS->copyMetadata(CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
Args.clear();
ArgAttrVec.clear();
@@ -1018,11 +1015,12 @@ PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C,
do {
LocalChange = false;
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
+
for (LazyCallGraph::Node &N : C) {
Function &OldF = N.getFunction();
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
// FIXME: This lambda must only be used with this function. We should
// skip the lambda and just get the AA results directly.
auto AARGetter = [&](Function &F) -> AAResults & {
@@ -1045,6 +1043,13 @@ PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C,
C.getOuterRefSCC().replaceNodeFunction(N, *NewF);
FAM.clear(OldF, OldF.getName());
OldF.eraseFromParent();
+
+ PreservedAnalyses FuncPA;
+ FuncPA.preserveSet<CFGAnalyses>();
+ for (auto *U : NewF->users()) {
+ auto *UserF = cast<CallBase>(U)->getFunction();
+ FAM.invalidate(*UserF, FuncPA);
+ }
}
Changed |= LocalChange;
@@ -1053,7 +1058,12 @@ PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C,
if (!Changed)
return PreservedAnalyses::all();
- return PreservedAnalyses::none();
+ PreservedAnalyses PA;
+ // We've cleared out analyses for deleted functions.
+ PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
+ // We've manually invalidated analyses for functions we've modified.
+ PA.preserveSet<AllAnalysesOn<Function>>();
+ return PA;
}
namespace {
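A small sketch, not from the patch, of the manual-invalidation idiom used in the run() hunk above (helper name is illustrative); preserving CFGAnalyses tells the function analysis manager that only non-CFG results need recomputing:

    #include "llvm/IR/Function.h"
    #include "llvm/IR/PassManager.h"
    using namespace llvm;

    static void invalidateNonCFGAnalyses(FunctionAnalysisManager &FAM,
                                         Function &ModifiedF) {
      PreservedAnalyses OnlyCFG;
      OnlyCFG.preserveSet<CFGAnalyses>();
      FAM.invalidate(ModifiedF, OnlyCFG);
    }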
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index 762317425026..edadc79e3a9f 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -32,6 +32,7 @@
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/ValueHandle.h"
@@ -250,10 +251,12 @@ Value *AA::getWithType(Value &V, Type &Ty) {
return Constant::getNullValue(&Ty);
if (C->getType()->isPointerTy() && Ty.isPointerTy())
return ConstantExpr::getPointerCast(C, &Ty);
- if (C->getType()->isIntegerTy() && Ty.isIntegerTy())
- return ConstantExpr::getTrunc(C, &Ty, /* OnlyIfReduced */ true);
- if (C->getType()->isFloatingPointTy() && Ty.isFloatingPointTy())
- return ConstantExpr::getFPTrunc(C, &Ty, /* OnlyIfReduced */ true);
+ if (C->getType()->getPrimitiveSizeInBits() >= Ty.getPrimitiveSizeInBits()) {
+ if (C->getType()->isIntegerTy() && Ty.isIntegerTy())
+ return ConstantExpr::getTrunc(C, &Ty, /* OnlyIfReduced */ true);
+ if (C->getType()->isFloatingPointTy() && Ty.isFloatingPointTy())
+ return ConstantExpr::getFPTrunc(C, &Ty, /* OnlyIfReduced */ true);
+ }
}
return nullptr;
}
@@ -379,30 +382,30 @@ static bool addIfNotExistent(LLVMContext &Ctx, const Attribute &Attr,
if (Attr.isEnumAttribute()) {
Attribute::AttrKind Kind = Attr.getKindAsEnum();
- if (Attrs.hasAttribute(AttrIdx, Kind))
+ if (Attrs.hasAttributeAtIndex(AttrIdx, Kind))
if (!ForceReplace &&
- isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
+ isEqualOrWorse(Attr, Attrs.getAttributeAtIndex(AttrIdx, Kind)))
return false;
- Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
+ Attrs = Attrs.addAttributeAtIndex(Ctx, AttrIdx, Attr);
return true;
}
if (Attr.isStringAttribute()) {
StringRef Kind = Attr.getKindAsString();
- if (Attrs.hasAttribute(AttrIdx, Kind))
+ if (Attrs.hasAttributeAtIndex(AttrIdx, Kind))
if (!ForceReplace &&
- isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
+ isEqualOrWorse(Attr, Attrs.getAttributeAtIndex(AttrIdx, Kind)))
return false;
- Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
+ Attrs = Attrs.addAttributeAtIndex(Ctx, AttrIdx, Attr);
return true;
}
if (Attr.isIntAttribute()) {
Attribute::AttrKind Kind = Attr.getKindAsEnum();
- if (Attrs.hasAttribute(AttrIdx, Kind))
+ if (Attrs.hasAttributeAtIndex(AttrIdx, Kind))
if (!ForceReplace &&
- isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
+ isEqualOrWorse(Attr, Attrs.getAttributeAtIndex(AttrIdx, Kind)))
return false;
- Attrs = Attrs.removeAttribute(Ctx, AttrIdx, Kind);
- Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
+ Attrs = Attrs.removeAttributeAtIndex(Ctx, AttrIdx, Kind);
+ Attrs = Attrs.addAttributeAtIndex(Ctx, AttrIdx, Attr);
return true;
}
@@ -655,9 +658,9 @@ bool IRPosition::getAttrsFromIRAttr(Attribute::AttrKind AK,
else
AttrList = getAssociatedFunction()->getAttributes();
- bool HasAttr = AttrList.hasAttribute(getAttrIdx(), AK);
+ bool HasAttr = AttrList.hasAttributeAtIndex(getAttrIdx(), AK);
if (HasAttr)
- Attrs.push_back(AttrList.getAttribute(getAttrIdx(), AK));
+ Attrs.push_back(AttrList.getAttributeAtIndex(getAttrIdx(), AK));
return HasAttr;
}
@@ -1023,7 +1026,7 @@ bool Attributor::checkForAllUses(function_ref<bool(const Use &, bool &)> Pred,
while (!Worklist.empty()) {
const Use *U = Worklist.pop_back_val();
- if (!Visited.insert(U).second)
+ if (isa<PHINode>(U->getUser()) && !Visited.insert(U).second)
continue;
LLVM_DEBUG(dbgs() << "[Attributor] Check use: " << **U << " in "
<< *U->getUser() << "\n");
@@ -1040,6 +1043,8 @@ bool Attributor::checkForAllUses(function_ref<bool(const Use &, bool &)> Pred,
if (auto *SI = dyn_cast<StoreInst>(U->getUser())) {
if (&SI->getOperandUse(0) == U) {
+ if (!Visited.insert(U).second)
+ continue;
SmallSetVector<Value *, 4> PotentialCopies;
if (AA::getPotentialCopiesOfStoredValue(*this, *SI, PotentialCopies,
QueryingAA,
@@ -1118,6 +1123,10 @@ bool Attributor::checkForAllCallSites(function_ref<bool(AbstractCallSite)> Pred,
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U.getUser())) {
if (CE->isCast() && CE->getType()->isPointerTy() &&
CE->getType()->getPointerElementType()->isFunctionTy()) {
+ LLVM_DEBUG(
+ dbgs() << "[Attributor] Use, is constant cast expression, add "
+ << CE->getNumUses()
+ << " uses of that expression instead!\n");
for (const Use &CEU : CE->uses())
Uses.push_back(&CEU);
continue;
@@ -1138,9 +1147,13 @@ bool Attributor::checkForAllCallSites(function_ref<bool(AbstractCallSite)> Pred,
const Use *EffectiveUse =
ACS.isCallbackCall() ? &ACS.getCalleeUseForCallback() : &U;
if (!ACS.isCallee(EffectiveUse)) {
- if (!RequireAllCallSites)
+ if (!RequireAllCallSites) {
+ LLVM_DEBUG(dbgs() << "[Attributor] User " << *EffectiveUse->getUser()
+ << " is not a call of " << Fn.getName()
+ << ", skip use\n");
continue;
- LLVM_DEBUG(dbgs() << "[Attributor] User " << EffectiveUse->getUser()
+ }
+ LLVM_DEBUG(dbgs() << "[Attributor] User " << *EffectiveUse->getUser()
<< " is an invalid use of " << Fn.getName() << "\n");
return false;
}
@@ -1410,6 +1423,16 @@ void Attributor::runTillFixpoint() {
} while (!Worklist.empty() && (IterationCounter++ < MaxFixedPointIterations ||
VerifyMaxFixpointIterations));
+ if (IterationCounter > MaxFixedPointIterations && !Worklist.empty()) {
+ auto Remark = [&](OptimizationRemarkMissed ORM) {
+ return ORM << "Attributor did not reach a fixpoint after "
+ << ore::NV("Iterations", MaxFixedPointIterations)
+ << " iterations.";
+ };
+ Function *F = Worklist.front()->getIRPosition().getAssociatedFunction();
+ emitRemark<OptimizationRemarkMissed>(F, "FixedPoint", Remark);
+ }
+
LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: "
<< IterationCounter << "/" << MaxFixpointIterations
<< " iterations\n");
@@ -1919,55 +1942,91 @@ void Attributor::createShallowWrapper(Function &F) {
CallInst *CI = CallInst::Create(&F, Args, "", EntryBB);
CI->setTailCall(true);
- CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoInline);
+ CI->addFnAttr(Attribute::NoInline);
ReturnInst::Create(Ctx, CI->getType()->isVoidTy() ? nullptr : CI, EntryBB);
NumFnShallowWrappersCreated++;
}
+bool Attributor::isInternalizable(Function &F) {
+ if (F.isDeclaration() || F.hasLocalLinkage() ||
+ GlobalValue::isInterposableLinkage(F.getLinkage()))
+ return false;
+ return true;
+}
+
Function *Attributor::internalizeFunction(Function &F, bool Force) {
if (!AllowDeepWrapper && !Force)
return nullptr;
- if (F.isDeclaration() || F.hasLocalLinkage() ||
- GlobalValue::isInterposableLinkage(F.getLinkage()))
+ if (!isInternalizable(F))
return nullptr;
- Module &M = *F.getParent();
- FunctionType *FnTy = F.getFunctionType();
+ SmallPtrSet<Function *, 2> FnSet = {&F};
+ DenseMap<Function *, Function *> InternalizedFns;
+ internalizeFunctions(FnSet, InternalizedFns);
- // create a copy of the current function
- Function *Copied = Function::Create(FnTy, F.getLinkage(), F.getAddressSpace(),
- F.getName() + ".internalized");
- ValueToValueMapTy VMap;
- auto *NewFArgIt = Copied->arg_begin();
- for (auto &Arg : F.args()) {
- auto ArgName = Arg.getName();
- NewFArgIt->setName(ArgName);
- VMap[&Arg] = &(*NewFArgIt++);
- }
- SmallVector<ReturnInst *, 8> Returns;
-
- // Copy the body of the original function to the new one
- CloneFunctionInto(Copied, &F, VMap, CloneFunctionChangeType::LocalChangesOnly,
- Returns);
-
- // Set the linakage and visibility late as CloneFunctionInto has some implicit
- // requirements.
- Copied->setVisibility(GlobalValue::DefaultVisibility);
- Copied->setLinkage(GlobalValue::PrivateLinkage);
+ return InternalizedFns[&F];
+}
- // Copy metadata
- SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
- F.getAllMetadata(MDs);
- for (auto MDIt : MDs)
- if (!Copied->hasMetadata())
- Copied->addMetadata(MDIt.first, *MDIt.second);
+bool Attributor::internalizeFunctions(SmallPtrSetImpl<Function *> &FnSet,
+ DenseMap<Function *, Function *> &FnMap) {
+ for (Function *F : FnSet)
+ if (!Attributor::isInternalizable(*F))
+ return false;
- M.getFunctionList().insert(F.getIterator(), Copied);
- F.replaceAllUsesWith(Copied);
- Copied->setDSOLocal(true);
+ FnMap.clear();
+ // Generate the internalized version of each function.
+ for (Function *F : FnSet) {
+ Module &M = *F->getParent();
+ FunctionType *FnTy = F->getFunctionType();
+
+ // Create a copy of the current function
+ Function *Copied =
+ Function::Create(FnTy, F->getLinkage(), F->getAddressSpace(),
+ F->getName() + ".internalized");
+ ValueToValueMapTy VMap;
+ auto *NewFArgIt = Copied->arg_begin();
+ for (auto &Arg : F->args()) {
+ auto ArgName = Arg.getName();
+ NewFArgIt->setName(ArgName);
+ VMap[&Arg] = &(*NewFArgIt++);
+ }
+ SmallVector<ReturnInst *, 8> Returns;
+
+ // Copy the body of the original function to the new one
+ CloneFunctionInto(Copied, F, VMap,
+ CloneFunctionChangeType::LocalChangesOnly, Returns);
+
+ // Set the linkage and visibility late as CloneFunctionInto has some
+ // implicit requirements.
+ Copied->setVisibility(GlobalValue::DefaultVisibility);
+ Copied->setLinkage(GlobalValue::PrivateLinkage);
+
+ // Copy metadata
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ F->getAllMetadata(MDs);
+ for (auto MDIt : MDs)
+ if (!Copied->hasMetadata())
+ Copied->addMetadata(MDIt.first, *MDIt.second);
+
+ M.getFunctionList().insert(F->getIterator(), Copied);
+ Copied->setDSOLocal(true);
+ FnMap[F] = Copied;
+ }
+
+ // Replace all uses of the old function with the new internalized function
+ // unless the caller is a function that was just internalized.
+ for (Function *F : FnSet) {
+ auto &InternalizedFn = FnMap[F];
+ auto IsNotInternalized = [&](Use &U) -> bool {
+ if (auto *CB = dyn_cast<CallBase>(U.getUser()))
+ return !FnMap.lookup(CB->getCaller());
+ return false;
+ };
+ F->replaceUsesWithIf(InternalizedFn, IsNotInternalized);
+ }
- return Copied;
+ return true;
}
bool Attributor::isValidFunctionSignatureRewrite(
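A sketch, not from the patch, of the use-rewriting step in internalizeFunctions above (helper name is illustrative); only call-site uses whose caller is not itself being internalized are redirected, so internalized copies keep calling each other:

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/InstrTypes.h"
    using namespace llvm;

    static void redirectUses(Function &OldF, Function &NewF,
                             const DenseMap<Function *, Function *> &FnMap) {
      OldF.replaceUsesWithIf(&NewF, [&](Use &U) {
        if (auto *CB = dyn_cast<CallBase>(U.getUser()))
          return !FnMap.lookup(CB->getCaller());
        return false; // Non-call uses are left untouched.
      });
    }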
@@ -1976,7 +2035,8 @@ bool Attributor::isValidFunctionSignatureRewrite(
if (!RewriteSignatures)
return false;
- auto CallSiteCanBeChanged = [](AbstractCallSite ACS) {
+ Function *Fn = Arg.getParent();
+ auto CallSiteCanBeChanged = [Fn](AbstractCallSite ACS) {
// Forbid the call site to cast the function return type. If we need to
// rewrite these functions we need to re-create a cast for the new call site
// (if the old had uses).
@@ -1984,11 +2044,12 @@ bool Attributor::isValidFunctionSignatureRewrite(
ACS.getInstruction()->getType() !=
ACS.getCalledFunction()->getReturnType())
return false;
+ if (ACS.getCalledOperand()->getType() != Fn->getType())
+ return false;
// Forbid must-tail calls for now.
return !ACS.isCallbackCall() && !ACS.getInstruction()->isMustTailCall();
};
- Function *Fn = Arg.getParent();
// Avoid var-arg functions for now.
if (Fn->isVarArg()) {
LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite var-args functions\n");
@@ -2118,7 +2179,7 @@ ChangeStatus Attributor::rewriteFunctionSignatures(
} else {
NewArgumentTypes.push_back(Arg.getType());
NewArgumentAttributes.push_back(
- OldFnAttributeList.getParamAttributes(Arg.getArgNo()));
+ OldFnAttributeList.getParamAttrs(Arg.getArgNo()));
}
}
@@ -2149,8 +2210,8 @@ ChangeStatus Attributor::rewriteFunctionSignatures(
// the function.
LLVMContext &Ctx = OldFn->getContext();
NewFn->setAttributes(AttributeList::get(
- Ctx, OldFnAttributeList.getFnAttributes(),
- OldFnAttributeList.getRetAttributes(), NewArgumentAttributes));
+ Ctx, OldFnAttributeList.getFnAttrs(), OldFnAttributeList.getRetAttrs(),
+ NewArgumentAttributes));
// Since we have now created the new function, splice the body of the old
// function right into the new function, leaving the old rotting hulk of the
@@ -2195,7 +2256,7 @@ ChangeStatus Attributor::rewriteFunctionSignatures(
} else {
NewArgOperands.push_back(ACS.getCallArgOperand(OldArgNum));
NewArgOperandAttributes.push_back(
- OldCallAttributeList.getParamAttributes(OldArgNum));
+ OldCallAttributeList.getParamAttrs(OldArgNum));
}
}
@@ -2225,8 +2286,8 @@ ChangeStatus Attributor::rewriteFunctionSignatures(
NewCB->setCallingConv(OldCB->getCallingConv());
NewCB->takeName(OldCB);
NewCB->setAttributes(AttributeList::get(
- Ctx, OldCallAttributeList.getFnAttributes(),
- OldCallAttributeList.getRetAttributes(), NewArgOperandAttributes));
+ Ctx, OldCallAttributeList.getFnAttrs(),
+ OldCallAttributeList.getRetAttrs(), NewArgOperandAttributes));
CallSitePairs.push_back({OldCB, NewCB});
return true;
@@ -2441,6 +2502,9 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) {
// Every function can be "readnone/argmemonly/inaccessiblememonly/...".
getOrCreateAAFor<AAMemoryLocation>(FPos);
+ // Every function can track active assumptions.
+ getOrCreateAAFor<AAAssumptionInfo>(FPos);
+
// Every function might be applicable for Heap-To-Stack conversion.
if (EnableHeapToStack)
getOrCreateAAFor<AAHeapToStack>(FPos);
@@ -2526,6 +2590,7 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) {
auto CallSitePred = [&](Instruction &I) -> bool {
auto &CB = cast<CallBase>(I);
IRPosition CBRetPos = IRPosition::callsite_returned(CB);
+ IRPosition CBFnPos = IRPosition::callsite_function(CB);
// Call sites might be dead if they do not have side effects and no live
// users. The return value might be dead if there are no live users.
@@ -2537,6 +2602,9 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) {
if (!Callee)
return true;
+ // Every call site can track active assumptions.
+ getOrCreateAAFor<AAAssumptionInfo>(CBFnPos);
+
// Skip declarations except if annotations on their call sites were
// explicitly requested.
if (!AnnotateDeclarationCallSites && Callee->isDeclaration() &&
@@ -2549,7 +2617,7 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) {
getOrCreateAAFor<AAValueSimplify>(CBRetPos);
}
- for (int I = 0, E = CB.getNumArgOperands(); I < E; ++I) {
+ for (int I = 0, E = CB.arg_size(); I < E; ++I) {
IRPosition CBArgPos = IRPosition::callsite_argument(CB, I);
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 98ce286d5139..ec08287393de 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -28,6 +29,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Assumptions.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
@@ -146,6 +148,7 @@ PIPE_OPERATOR(AANoUndef)
PIPE_OPERATOR(AACallEdges)
PIPE_OPERATOR(AAFunctionReachability)
PIPE_OPERATOR(AAPointerInfo)
+PIPE_OPERATOR(AAAssumptionInfo)
#undef PIPE_OPERATOR
@@ -203,46 +206,25 @@ static Value *constructPointer(Type *ResTy, Type *PtrElemTy, Value *Ptr,
<< "-bytes as " << *ResTy << "\n");
if (Offset) {
- SmallVector<Value *, 4> Indices;
- std::string GEPName = Ptr->getName().str() + ".0";
-
- // Add 0 index to look through the pointer.
- assert((uint64_t)Offset < DL.getTypeAllocSize(PtrElemTy) &&
- "Offset out of bounds");
- Indices.push_back(Constant::getNullValue(IRB.getInt32Ty()));
-
Type *Ty = PtrElemTy;
- do {
- auto *STy = dyn_cast<StructType>(Ty);
- if (!STy)
- // Non-aggregate type, we cast and make byte-wise progress now.
- break;
-
- const StructLayout *SL = DL.getStructLayout(STy);
- if (int64_t(SL->getSizeInBytes()) < Offset)
- break;
-
- uint64_t Idx = SL->getElementContainingOffset(Offset);
- assert(Idx < STy->getNumElements() && "Offset calculation error!");
- uint64_t Rem = Offset - SL->getElementOffset(Idx);
- Ty = STy->getElementType(Idx);
-
- LLVM_DEBUG(errs() << "Ty: " << *Ty << " Offset: " << Offset
- << " Idx: " << Idx << " Rem: " << Rem << "\n");
+ APInt IntOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), Offset);
+ SmallVector<APInt> IntIndices = DL.getGEPIndicesForOffset(Ty, IntOffset);
- GEPName += "." + std::to_string(Idx);
- Indices.push_back(ConstantInt::get(IRB.getInt32Ty(), Idx));
- Offset = Rem;
- } while (Offset);
+ SmallVector<Value *, 4> ValIndices;
+ std::string GEPName = Ptr->getName().str();
+ for (const APInt &Index : IntIndices) {
+ ValIndices.push_back(IRB.getInt(Index));
+ GEPName += "." + std::to_string(Index.getZExtValue());
+ }
// Create a GEP for the indices collected above.
- Ptr = IRB.CreateGEP(PtrElemTy, Ptr, Indices, GEPName);
+ Ptr = IRB.CreateGEP(PtrElemTy, Ptr, ValIndices, GEPName);
// If an offset is left we use byte-wise adjustment.
- if (Offset) {
+ if (IntOffset != 0) {
Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy());
- Ptr = IRB.CreateGEP(IRB.getInt8Ty(), Ptr, IRB.getInt32(Offset),
- GEPName + ".b" + Twine(Offset));
+ Ptr = IRB.CreateGEP(IRB.getInt8Ty(), Ptr, IRB.getInt(IntOffset),
+ GEPName + ".b" + Twine(IntOffset.getZExtValue()));
}
}
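A thin sketch, not from the patch, of DataLayout::getGEPIndicesForOffset as used in the rewritten constructPointer above; the structured indices cover as much of the byte offset as the layout allows, and any remainder left in the APInt must be applied as a raw i8 GEP:

    #include "llvm/ADT/APInt.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Type.h"
    using namespace llvm;

    static SmallVector<APInt> splitOffsetIntoIndices(const DataLayout &DL,
                                                     Type *&ElemTy,
                                                     APInt &ByteOffset) {
      // E.g. for { i32, i32, [4 x i8] } and ByteOffset == 9 this should
      // return {0, 2, 1} (element 1 of the trailing array) and leave no
      // remainder in ByteOffset; ElemTy is updated to the reached type.
      return DL.getGEPIndicesForOffset(ElemTy, ByteOffset);
    }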
@@ -431,6 +413,7 @@ const Value *stripAndAccumulateMinimalOffsets(
};
return Val->stripAndAccumulateConstantOffsets(DL, Offset, AllowNonInbounds,
+ /* AllowInvariant */ false,
AttributorAnalysis);
}
@@ -503,6 +486,7 @@ static void clampReturnedValueStates(
S ^= *T;
}
+namespace {
/// Helper class for generic deduction: return value -> returned position.
template <typename AAType, typename BaseType,
typename StateType = typename BaseType::StateType,
@@ -661,6 +645,7 @@ struct AACallSiteReturnedFromReturned : public BaseType {
return clampStateAndIndicateChange(S, AA.getState());
}
};
+} // namespace
/// Helper function to accumulate uses.
template <class AAType, typename StateType = typename AAType::StateType>
@@ -1051,6 +1036,7 @@ private:
BooleanState BS;
};
+namespace {
struct AAPointerInfoImpl
: public StateWrapper<AA::PointerInfo::State, AAPointerInfo> {
using BaseTy = StateWrapper<AA::PointerInfo::State, AAPointerInfo>;
@@ -1149,19 +1135,23 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
return true;
};
+ /// Helper struct, will support ranges eventually.
+ struct OffsetInfo {
+ int64_t Offset = AA::PointerInfo::OffsetAndSize::Unknown;
+
+ bool operator==(const OffsetInfo &OI) const { return Offset == OI.Offset; }
+ };
+
/// See AbstractAttribute::updateImpl(...).
ChangeStatus updateImpl(Attributor &A) override {
using namespace AA::PointerInfo;
State S = getState();
ChangeStatus Changed = ChangeStatus::UNCHANGED;
Value &AssociatedValue = getAssociatedValue();
- struct OffsetInfo {
- int64_t Offset = 0;
- };
const DataLayout &DL = A.getDataLayout();
DenseMap<Value *, OffsetInfo> OffsetInfoMap;
- OffsetInfoMap[&AssociatedValue] = {};
+ OffsetInfoMap[&AssociatedValue] = OffsetInfo{0};
auto HandlePassthroughUser = [&](Value *Usr, OffsetInfo &PtrOI,
bool &Follow) {
@@ -1203,7 +1193,7 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
}
SmallVector<Value *, 8> Indices;
- for (Use &Idx : llvm::make_range(GEP->idx_begin(), GEP->idx_end())) {
+ for (Use &Idx : GEP->indices()) {
if (auto *CIdx = dyn_cast<ConstantInt>(Idx)) {
Indices.push_back(CIdx);
continue;
@@ -1219,8 +1209,52 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
Follow = true;
return true;
}
- if (isa<CastInst>(Usr) || isa<PHINode>(Usr) || isa<SelectInst>(Usr))
+ if (isa<CastInst>(Usr) || isa<SelectInst>(Usr))
return HandlePassthroughUser(Usr, PtrOI, Follow);
+
+ // For PHIs we need to take care of the recurrence explicitly as the value
+ // might change while we iterate through a loop. For now, we give up if
+ // the PHI is not invariant.
+ if (isa<PHINode>(Usr)) {
+ // Check if the PHI is invariant (so far).
+ OffsetInfo &UsrOI = OffsetInfoMap[Usr];
+ if (UsrOI == PtrOI)
+ return true;
+
+ // Check if the PHI operand has already an unknown offset as we can't
+ // improve on that anymore.
+ if (PtrOI.Offset == OffsetAndSize::Unknown) {
+ UsrOI = PtrOI;
+ Follow = true;
+ return true;
+ }
+
+ // Check if the PHI operand is not dependent on the PHI itself.
+ // TODO: This is not great as we look at the pointer type. However, it
+ // is unclear where the Offset size comes from with typeless pointers.
+ APInt Offset(
+ DL.getIndexSizeInBits(CurPtr->getType()->getPointerAddressSpace()),
+ 0);
+ if (&AssociatedValue == CurPtr->stripAndAccumulateConstantOffsets(
+ DL, Offset, /* AllowNonInbounds */ true)) {
+ if (Offset != PtrOI.Offset) {
+ LLVM_DEBUG(dbgs()
+ << "[AAPointerInfo] PHI operand pointer offset mismatch "
+ << *CurPtr << " in " << *Usr << "\n");
+ return false;
+ }
+ return HandlePassthroughUser(Usr, PtrOI, Follow);
+ }
+
+ // TODO: Approximate in case we know the direction of the recurrence.
+ LLVM_DEBUG(dbgs() << "[AAPointerInfo] PHI operand is too complex "
+ << *CurPtr << " in " << *Usr << "\n");
+ UsrOI = PtrOI;
+ UsrOI.Offset = OffsetAndSize::Unknown;
+ Follow = true;
+ return true;
+ }
+
if (auto *LoadI = dyn_cast<LoadInst>(Usr))
return handleAccess(A, *LoadI, *CurPtr, /* Content */ nullptr,
AccessKind::AK_READ, PtrOI.Offset, Changed,
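Two hypothetical IR fragments, not from the patch, showing the PHI cases distinguished above:

    // Invariant PHI: every incoming pointer carries the same constant offset
    // from the underlying object, so the precise offset is kept:
    //   %p = phi i8* [ %base.plus8, %then ], [ %base.plus8, %else ]
    // Recurrence: the offset changes on every loop iteration, so it is
    // recorded as OffsetAndSize::Unknown and treated conservatively:
    //   %p = phi i8* [ %base, %entry ], [ %p.next, %loop ]
    //   %p.next = getelementptr inbounds i8, i8* %p, i64 1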
@@ -2388,6 +2422,10 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior {
const size_t NoUBPrevSize = AssumedNoUBInsts.size();
auto InspectMemAccessInstForUB = [&](Instruction &I) {
+ // The LangRef now states that volatile stores are not UB, so skip them.
+ if (I.isVolatile() && I.mayWriteToMemory())
+ return true;
+
// Skip instructions that are already saved.
if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I))
return true;
@@ -2467,7 +2505,7 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior {
Function *Callee = CB.getCalledFunction();
if (!Callee)
return true;
- for (unsigned idx = 0; idx < CB.getNumArgOperands(); idx++) {
+ for (unsigned idx = 0; idx < CB.arg_size(); idx++) {
// If current argument is known to be simplified to null pointer and the
// corresponding argument position is known to have nonnull attribute,
// the argument is poison. Furthermore, if the argument is poison and
@@ -3135,8 +3173,7 @@ struct AANoAliasCallSiteArgument final : AANoAliasImpl {
// value passed at this call site.
// TODO: AbstractCallSite
const auto &CB = cast<CallBase>(getAnchorValue());
- for (unsigned OtherArgNo = 0; OtherArgNo < CB.getNumArgOperands();
- OtherArgNo++)
+ for (unsigned OtherArgNo = 0; OtherArgNo < CB.arg_size(); OtherArgNo++)
if (mayAliasWithArgument(A, AAR, MemBehaviorAA, CB, OtherArgNo))
return false;
@@ -3354,6 +3391,10 @@ struct AAIsDeadFloating : public AAIsDeadValueImpl {
}
bool isDeadStore(Attributor &A, StoreInst &SI) {
+ // The LangRef now states that volatile stores are not UB/dead, so skip them.
+ if (SI.isVolatile())
+ return false;
+
bool UsedAssumedInformation = false;
SmallSetVector<Value *, 4> PotentialCopies;
if (!AA::getPotentialCopiesOfStoredValue(A, SI, PotentialCopies, *this,
@@ -5039,6 +5080,7 @@ struct AANoCaptureCallSiteReturned final : AANoCaptureImpl {
STATS_DECLTRACK_CSRET_ATTR(nocapture)
}
};
+} // namespace
/// ------------------ Value Simplify Attribute ----------------------------
@@ -5059,6 +5101,7 @@ bool ValueSimplifyStateType::unionAssumed(Optional<Value *> Other) {
return true;
}
+namespace {
struct AAValueSimplifyImpl : AAValueSimplify {
AAValueSimplifyImpl(const IRPosition &IRP, Attributor &A)
: AAValueSimplify(IRP, A) {}
@@ -6464,7 +6507,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl {
auto IsCompatiblePrivArgOfDirectCS = [&](AbstractCallSite ACS) {
CallBase *DC = cast<CallBase>(ACS.getInstruction());
int DCArgNo = ACS.getCallArgOperandNo(ArgNo);
- assert(DCArgNo >= 0 && unsigned(DCArgNo) < DC->getNumArgOperands() &&
+ assert(DCArgNo >= 0 && unsigned(DCArgNo) < DC->arg_size() &&
"Expected a direct call operand for callback call operand");
LLVM_DEBUG({
@@ -7287,10 +7330,12 @@ void AAMemoryBehaviorFloating::analyzeUseIn(Attributor &A, const Use &U,
case Instruction::Store:
// Stores cause the NO_WRITES property to disappear if the use is the
- // pointer operand. Note that we do assume that capturing was taken care of
- // somewhere else.
+ // pointer operand. Note that while capturing is taken care of elsewhere,
+ // we still need to handle stores of the value itself, which is not looked through.
if (cast<StoreInst>(UserI)->getPointerOperand() == U.get())
removeAssumedBits(NO_WRITES);
+ else
+ indicatePessimisticFixpoint();
return;
case Instruction::Call:
@@ -7336,6 +7381,7 @@ void AAMemoryBehaviorFloating::analyzeUseIn(Attributor &A, const Use &U,
if (UserI->mayWriteToMemory())
removeAssumedBits(NO_WRITES);
}
+} // namespace
/// -------------------- Memory Locations Attributes ---------------------------
/// Includes read-none, argmemonly, inaccessiblememonly,
@@ -7628,11 +7674,14 @@ void AAMemoryLocationImpl::categorizePtrValue(
assert(!isa<GEPOperator>(Obj) && "GEPs should have been stripped.");
if (isa<UndefValue>(Obj))
continue;
- if (auto *Arg = dyn_cast<Argument>(Obj)) {
- if (Arg->hasByValAttr())
- MLK = NO_LOCAL_MEM;
- else
- MLK = NO_ARGUMENT_MEM;
+ if (isa<Argument>(Obj)) {
+ // TODO: For now we do not treat byval arguments as local copies performed
+ // on the call edge, though we should. To make that happen we need to
+ // teach various passes, e.g., DSE, about the copy effect of a byval. That
+ // would also allow us to mark functions only accessing byval arguments as
+ // readnone again; arguably their accesses have no effect outside of the
+ // function, like accesses to allocas.
+ MLK = NO_ARGUMENT_MEM;
} else if (auto *GV = dyn_cast<GlobalValue>(Obj)) {
// Reading constant memory is not treated as a read "effect" by the
// function attr pass so we won't neither. Constants defined by TBAA are
@@ -7678,7 +7727,7 @@ void AAMemoryLocationImpl::categorizePtrValue(
void AAMemoryLocationImpl::categorizeArgumentPointerLocations(
Attributor &A, CallBase &CB, AAMemoryLocation::StateType &AccessedLocs,
bool &Changed) {
- for (unsigned ArgNo = 0, E = CB.getNumArgOperands(); ArgNo < E; ++ArgNo) {
+ for (unsigned ArgNo = 0, E = CB.arg_size(); ArgNo < E; ++ArgNo) {
// Skip non-pointer arguments.
const Value *ArgOp = CB.getArgOperand(ArgNo);
@@ -8611,31 +8660,7 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl {
static bool calculateICmpInst(const ICmpInst *ICI, const APInt &LHS,
const APInt &RHS) {
- ICmpInst::Predicate Pred = ICI->getPredicate();
- switch (Pred) {
- case ICmpInst::ICMP_UGT:
- return LHS.ugt(RHS);
- case ICmpInst::ICMP_SGT:
- return LHS.sgt(RHS);
- case ICmpInst::ICMP_EQ:
- return LHS.eq(RHS);
- case ICmpInst::ICMP_UGE:
- return LHS.uge(RHS);
- case ICmpInst::ICMP_SGE:
- return LHS.sge(RHS);
- case ICmpInst::ICMP_ULT:
- return LHS.ult(RHS);
- case ICmpInst::ICMP_SLT:
- return LHS.slt(RHS);
- case ICmpInst::ICMP_NE:
- return LHS.ne(RHS);
- case ICmpInst::ICMP_ULE:
- return LHS.ule(RHS);
- case ICmpInst::ICMP_SLE:
- return LHS.sle(RHS);
- default:
- llvm_unreachable("Invalid ICmp predicate!");
- }
+ return ICmpInst::compare(LHS, RHS, ICI->getPredicate());
}
static APInt calculateCastInst(const CastInst *CI, const APInt &Src,
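A minimal use, not from the patch, of the ICmpInst::compare helper adopted above; the predicate is evaluated directly on two APInts, replacing the hand-written switch:

    #include "llvm/ADT/APInt.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    static bool demoCompare() {
      APInt A(32, 5), B(32, 7);
      return ICmpInst::compare(A, B, ICmpInst::ICMP_ULT); // true: 5 u< 7
    }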
@@ -8675,25 +8700,25 @@ struct AAPotentialValuesFloating : AAPotentialValuesImpl {
case Instruction::Mul:
return LHS * RHS;
case Instruction::UDiv:
- if (RHS.isNullValue()) {
+ if (RHS.isZero()) {
SkipOperation = true;
return LHS;
}
return LHS.udiv(RHS);
case Instruction::SDiv:
- if (RHS.isNullValue()) {
+ if (RHS.isZero()) {
SkipOperation = true;
return LHS;
}
return LHS.sdiv(RHS);
case Instruction::URem:
- if (RHS.isNullValue()) {
+ if (RHS.isZero()) {
SkipOperation = true;
return LHS;
}
return LHS.urem(RHS);
case Instruction::SRem:
- if (RHS.isNullValue()) {
+ if (RHS.isZero()) {
SkipOperation = true;
return LHS;
}
@@ -9292,32 +9317,69 @@ struct AANoUndefCallSiteReturned final
void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(noundef) }
};
-struct AACallEdgesFunction : public AACallEdges {
- AACallEdgesFunction(const IRPosition &IRP, Attributor &A)
- : AACallEdges(IRP, A) {}
+struct AACallEdgesImpl : public AACallEdges {
+ AACallEdgesImpl(const IRPosition &IRP, Attributor &A) : AACallEdges(IRP, A) {}
+
+ virtual const SetVector<Function *> &getOptimisticEdges() const override {
+ return CalledFunctions;
+ }
+
+ virtual bool hasUnknownCallee() const override { return HasUnknownCallee; }
+
+ virtual bool hasNonAsmUnknownCallee() const override {
+ return HasUnknownCalleeNonAsm;
+ }
+
+ const std::string getAsStr() const override {
+ return "CallEdges[" + std::to_string(HasUnknownCallee) + "," +
+ std::to_string(CalledFunctions.size()) + "]";
+ }
+ void trackStatistics() const override {}
+
+protected:
+ void addCalledFunction(Function *Fn, ChangeStatus &Change) {
+ if (CalledFunctions.insert(Fn)) {
+ Change = ChangeStatus::CHANGED;
+ LLVM_DEBUG(dbgs() << "[AACallEdges] New call edge: " << Fn->getName()
+ << "\n");
+ }
+ }
+
+ void setHasUnknownCallee(bool NonAsm, ChangeStatus &Change) {
+ if (!HasUnknownCallee)
+ Change = ChangeStatus::CHANGED;
+ if (NonAsm && !HasUnknownCalleeNonAsm)
+ Change = ChangeStatus::CHANGED;
+ HasUnknownCalleeNonAsm |= NonAsm;
+ HasUnknownCallee = true;
+ }
+
+private:
+ /// Optimistic set of functions that might be called by this position.
+ SetVector<Function *> CalledFunctions;
+
+ /// Is there any call with an unknown callee.
+ bool HasUnknownCallee = false;
+
+ /// Is there any call with an unknown callee, excluding any inline asm.
+ bool HasUnknownCalleeNonAsm = false;
+};
+
+struct AACallEdgesCallSite : public AACallEdgesImpl {
+ AACallEdgesCallSite(const IRPosition &IRP, Attributor &A)
+ : AACallEdgesImpl(IRP, A) {}
/// See AbstractAttribute::updateImpl(...).
ChangeStatus updateImpl(Attributor &A) override {
ChangeStatus Change = ChangeStatus::UNCHANGED;
- bool OldHasUnknownCallee = HasUnknownCallee;
- bool OldHasUnknownCalleeNonAsm = HasUnknownCalleeNonAsm;
-
- auto AddCalledFunction = [&](Function *Fn) {
- if (CalledFunctions.insert(Fn)) {
- Change = ChangeStatus::CHANGED;
- LLVM_DEBUG(dbgs() << "[AACallEdges] New call edge: " << Fn->getName()
- << "\n");
- }
- };
auto VisitValue = [&](Value &V, const Instruction *CtxI, bool &HasUnknown,
bool Stripped) -> bool {
if (Function *Fn = dyn_cast<Function>(&V)) {
- AddCalledFunction(Fn);
+ addCalledFunction(Fn, Change);
} else {
LLVM_DEBUG(dbgs() << "[AACallEdges] Unrecognized value: " << V << "\n");
- HasUnknown = true;
- HasUnknownCalleeNonAsm = true;
+ setHasUnknownCallee(true, Change);
}
// Explore all values.
@@ -9325,44 +9387,67 @@ struct AACallEdgesFunction : public AACallEdges {
};
// Process any value that we might call.
- auto ProcessCalledOperand = [&](Value *V, Instruction *Ctx) {
+ auto ProcessCalledOperand = [&](Value *V) {
+ bool DummyValue = false;
if (!genericValueTraversal<bool>(A, IRPosition::value(*V), *this,
- HasUnknownCallee, VisitValue, nullptr,
+ DummyValue, VisitValue, nullptr,
false)) {
// If we haven't gone through all values, assume that there are unknown
// callees.
- HasUnknownCallee = true;
- HasUnknownCalleeNonAsm = true;
+ setHasUnknownCallee(true, Change);
}
};
- auto ProcessCallInst = [&](Instruction &Inst) {
- CallBase &CB = static_cast<CallBase &>(Inst);
- if (CB.isInlineAsm()) {
- HasUnknownCallee = true;
- return true;
- }
+ CallBase *CB = static_cast<CallBase *>(getCtxI());
- // Process callee metadata if available.
- if (auto *MD = Inst.getMetadata(LLVMContext::MD_callees)) {
- for (auto &Op : MD->operands()) {
- Function *Callee = mdconst::extract_or_null<Function>(Op);
- if (Callee)
- AddCalledFunction(Callee);
- }
- // Callees metadata grantees that the called function is one of its
- // operands, So we are done.
- return true;
+ if (CB->isInlineAsm()) {
+ setHasUnknownCallee(false, Change);
+ return Change;
+ }
+
+ // Process callee metadata if available.
+ if (auto *MD = getCtxI()->getMetadata(LLVMContext::MD_callees)) {
+ for (auto &Op : MD->operands()) {
+ Function *Callee = mdconst::dyn_extract_or_null<Function>(Op);
+ if (Callee)
+ addCalledFunction(Callee, Change);
}
+ return Change;
+ }
- // The most simple case.
- ProcessCalledOperand(CB.getCalledOperand(), &Inst);
+ // The most simple case.
+ ProcessCalledOperand(CB->getCalledOperand());
- // Process callback functions.
- SmallVector<const Use *, 4u> CallbackUses;
- AbstractCallSite::getCallbackUses(CB, CallbackUses);
- for (const Use *U : CallbackUses)
- ProcessCalledOperand(U->get(), &Inst);
+ // Process callback functions.
+ SmallVector<const Use *, 4u> CallbackUses;
+ AbstractCallSite::getCallbackUses(*CB, CallbackUses);
+ for (const Use *U : CallbackUses)
+ ProcessCalledOperand(U->get());
+
+ return Change;
+ }
+};
+
+struct AACallEdgesFunction : public AACallEdgesImpl {
+ AACallEdgesFunction(const IRPosition &IRP, Attributor &A)
+ : AACallEdgesImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ ChangeStatus Change = ChangeStatus::UNCHANGED;
+
+ auto ProcessCallInst = [&](Instruction &Inst) {
+ CallBase &CB = static_cast<CallBase &>(Inst);
+
+ auto &CBEdges = A.getAAFor<AACallEdges>(
+ *this, IRPosition::callsite_function(CB), DepClassTy::REQUIRED);
+ if (CBEdges.hasNonAsmUnknownCallee())
+ setHasUnknownCallee(true, Change);
+ if (CBEdges.hasUnknownCallee())
+ setHasUnknownCallee(false, Change);
+
+ for (Function *F : CBEdges.getOptimisticEdges())
+ addCalledFunction(F, Change);
return true;
};
@@ -9373,155 +9458,323 @@ struct AACallEdgesFunction : public AACallEdges {
UsedAssumedInformation)) {
// If we haven't looked at all call like instructions, assume that there
// are unknown callees.
- HasUnknownCallee = true;
- HasUnknownCalleeNonAsm = true;
+ setHasUnknownCallee(true, Change);
}
- // Track changes.
- if (OldHasUnknownCallee != HasUnknownCallee ||
- OldHasUnknownCalleeNonAsm != HasUnknownCalleeNonAsm)
- Change = ChangeStatus::CHANGED;
-
return Change;
}
+};
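The refactor above splits AACallEdges into a call-site attribute and a function attribute that merely aggregates the per-call-site results. A minimal, self-contained sketch of that aggregation shape follows; the struct names and fields are illustrative stand-ins, not the Attributor API.

#include <set>
#include <string>
#include <vector>

// Hypothetical per-call-site result, standing in for AACallEdgesCallSite.
struct CallSiteEdges {
  std::set<std::string> Callees;       // optimistic set of known callees
  bool HasUnknownCallee = false;       // e.g. inline asm or unresolved value
  bool HasNonAsmUnknownCallee = false; // unknown callee that is not inline asm
};

// Stand-in for AACallEdgesFunction: union the call-site sets, OR the flags.
struct FunctionEdges {
  std::set<std::string> Callees;
  bool HasUnknownCallee = false;
  bool HasNonAsmUnknownCallee = false;

  void aggregate(const std::vector<CallSiteEdges> &CallSites) {
    for (const CallSiteEdges &CS : CallSites) {
      Callees.insert(CS.Callees.begin(), CS.Callees.end());
      HasUnknownCallee |= CS.HasUnknownCallee;
      HasNonAsmUnknownCallee |= CS.HasNonAsmUnknownCallee;
    }
  }
};

In the real pass the aggregation is incremental and reports a ChangeStatus so the fixpoint iteration knows whether dependent attributes need to be revisited.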
- virtual const SetVector<Function *> &getOptimisticEdges() const override {
- return CalledFunctions;
- };
+struct AAFunctionReachabilityFunction : public AAFunctionReachability {
+private:
+ struct QuerySet {
+ void markReachable(Function *Fn) {
+ Reachable.insert(Fn);
+ Unreachable.erase(Fn);
+ }
+
+ ChangeStatus update(Attributor &A, const AAFunctionReachability &AA,
+ ArrayRef<const AACallEdges *> AAEdgesList) {
+ ChangeStatus Change = ChangeStatus::UNCHANGED;
+
+ for (auto *AAEdges : AAEdgesList) {
+ if (AAEdges->hasUnknownCallee()) {
+ if (!CanReachUnknownCallee)
+ Change = ChangeStatus::CHANGED;
+ CanReachUnknownCallee = true;
+ return Change;
+ }
+ }
- virtual bool hasUnknownCallee() const override { return HasUnknownCallee; }
+ for (Function *Fn : make_early_inc_range(Unreachable)) {
+ if (checkIfReachable(A, AA, AAEdgesList, Fn)) {
+ Change = ChangeStatus::CHANGED;
+ markReachable(Fn);
+ }
+ }
+ return Change;
+ }
- virtual bool hasNonAsmUnknownCallee() const override {
- return HasUnknownCalleeNonAsm;
- }
+ bool isReachable(Attributor &A, const AAFunctionReachability &AA,
+ ArrayRef<const AACallEdges *> AAEdgesList, Function *Fn) {
+ // Assume that we can reach the function.
+ // TODO: Be more specific with the unknown callee.
+ if (CanReachUnknownCallee)
+ return true;
- const std::string getAsStr() const override {
- return "CallEdges[" + std::to_string(HasUnknownCallee) + "," +
- std::to_string(CalledFunctions.size()) + "]";
- }
+ if (Reachable.count(Fn))
+ return true;
- void trackStatistics() const override {}
+ if (Unreachable.count(Fn))
+ return false;
- /// Optimistic set of functions that might be called by this function.
- SetVector<Function *> CalledFunctions;
+ // We need to assume that this function can't reach Fn to prevent
+ // an infinite loop if this function is recursive.
+ Unreachable.insert(Fn);
- /// Is there any call with a unknown callee.
- bool HasUnknownCallee = false;
+ bool Result = checkIfReachable(A, AA, AAEdgesList, Fn);
+ if (Result)
+ markReachable(Fn);
+ return Result;
+ }
- /// Is there any call with a unknown callee, excluding any inline asm.
- bool HasUnknownCalleeNonAsm = false;
-};
+ bool checkIfReachable(Attributor &A, const AAFunctionReachability &AA,
+ ArrayRef<const AACallEdges *> AAEdgesList,
+ Function *Fn) const {
-struct AAFunctionReachabilityFunction : public AAFunctionReachability {
- AAFunctionReachabilityFunction(const IRPosition &IRP, Attributor &A)
- : AAFunctionReachability(IRP, A) {}
+ // Handle the most trivial case first.
+ for (auto *AAEdges : AAEdgesList) {
+ const SetVector<Function *> &Edges = AAEdges->getOptimisticEdges();
- bool canReach(Attributor &A, Function *Fn) const override {
- // Assume that we can reach any function if we can reach a call with
- // unknown callee.
- if (CanReachUnknownCallee)
- return true;
+ if (Edges.count(Fn))
+ return true;
+ }
- if (ReachableQueries.count(Fn))
- return true;
+ SmallVector<const AAFunctionReachability *, 8> Deps;
+ for (auto &AAEdges : AAEdgesList) {
+ const SetVector<Function *> &Edges = AAEdges->getOptimisticEdges();
+
+ for (Function *Edge : Edges) {
+ // We don't need a dependency if the result is reachable.
+ const AAFunctionReachability &EdgeReachability =
+ A.getAAFor<AAFunctionReachability>(
+ AA, IRPosition::function(*Edge), DepClassTy::NONE);
+ Deps.push_back(&EdgeReachability);
+
+ if (EdgeReachability.canReach(A, Fn))
+ return true;
+ }
+ }
+
+ // The result is false for now, set dependencies and leave.
+ for (auto Dep : Deps)
+ A.recordDependence(AA, *Dep, DepClassTy::REQUIRED);
- if (UnreachableQueries.count(Fn))
return false;
+ }
+
+ /// Set of functions that we know for sure is reachable.
+ DenseSet<Function *> Reachable;
+
+ /// Set of functions that are unreachable, but might become reachable.
+ DenseSet<Function *> Unreachable;
+
+ /// If we can reach a function with a call to an unknown function we assume
+ /// that we can reach any function.
+ bool CanReachUnknownCallee = false;
+ };
+
+public:
+ AAFunctionReachabilityFunction(const IRPosition &IRP, Attributor &A)
+ : AAFunctionReachability(IRP, A) {}
+ bool canReach(Attributor &A, Function *Fn) const override {
const AACallEdges &AAEdges =
A.getAAFor<AACallEdges>(*this, getIRPosition(), DepClassTy::REQUIRED);
- const SetVector<Function *> &Edges = AAEdges.getOptimisticEdges();
- bool Result = checkIfReachable(A, Edges, Fn);
+ // Attributor returns attributes as const, so this function has to be
+ // const for users of this attribute to use it without having to do
+ // a const_cast.
+ // This is a hack for us to be able to cache queries.
+ auto *NonConstThis = const_cast<AAFunctionReachabilityFunction *>(this);
+ bool Result =
+ NonConstThis->WholeFunction.isReachable(A, *this, {&AAEdges}, Fn);
+
+ return Result;
+ }
+
+ /// Can \p CB reach \p Fn
+ bool canReach(Attributor &A, CallBase &CB, Function *Fn) const override {
+ const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
+ *this, IRPosition::callsite_function(CB), DepClassTy::REQUIRED);
// Attributor returns attributes as const, so this function has to be
// const for users of this attribute to use it without having to do
// a const_cast.
// This is a hack for us to be able to cache queries.
auto *NonConstThis = const_cast<AAFunctionReachabilityFunction *>(this);
+ QuerySet &CBQuery = NonConstThis->CBQueries[&CB];
- if (Result)
- NonConstThis->ReachableQueries.insert(Fn);
- else
- NonConstThis->UnreachableQueries.insert(Fn);
+ bool Result = CBQuery.isReachable(A, *this, {&AAEdges}, Fn);
return Result;
}
/// See AbstractAttribute::updateImpl(...).
ChangeStatus updateImpl(Attributor &A) override {
- if (CanReachUnknownCallee)
- return ChangeStatus::UNCHANGED;
-
const AACallEdges &AAEdges =
A.getAAFor<AACallEdges>(*this, getIRPosition(), DepClassTy::REQUIRED);
- const SetVector<Function *> &Edges = AAEdges.getOptimisticEdges();
ChangeStatus Change = ChangeStatus::UNCHANGED;
- if (AAEdges.hasUnknownCallee()) {
- bool OldCanReachUnknown = CanReachUnknownCallee;
- CanReachUnknownCallee = true;
- return OldCanReachUnknown ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
- }
+ Change |= WholeFunction.update(A, *this, {&AAEdges});
- // Check if any of the unreachable functions become reachable.
- for (auto Current = UnreachableQueries.begin();
- Current != UnreachableQueries.end();) {
- if (!checkIfReachable(A, Edges, *Current)) {
- Current++;
- continue;
- }
- ReachableQueries.insert(*Current);
- UnreachableQueries.erase(*Current++);
- Change = ChangeStatus::CHANGED;
+ for (auto CBPair : CBQueries) {
+ const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
+ *this, IRPosition::callsite_function(*CBPair.first),
+ DepClassTy::REQUIRED);
+
+ Change |= CBPair.second.update(A, *this, {&AAEdges});
}
return Change;
}
const std::string getAsStr() const override {
- size_t QueryCount = ReachableQueries.size() + UnreachableQueries.size();
+ size_t QueryCount =
+ WholeFunction.Reachable.size() + WholeFunction.Unreachable.size();
- return "FunctionReachability [" + std::to_string(ReachableQueries.size()) +
- "," + std::to_string(QueryCount) + "]";
+ return "FunctionReachability [" +
+ std::to_string(WholeFunction.Reachable.size()) + "," +
+ std::to_string(QueryCount) + "]";
}
void trackStatistics() const override {}
private:
- bool canReachUnknownCallee() const override { return CanReachUnknownCallee; }
+ bool canReachUnknownCallee() const override {
+ return WholeFunction.CanReachUnknownCallee;
+ }
- bool checkIfReachable(Attributor &A, const SetVector<Function *> &Edges,
- Function *Fn) const {
- if (Edges.count(Fn))
- return true;
+ /// Used to answer if the whole function can reach a specific function.
+ QuerySet WholeFunction;
- for (Function *Edge : Edges) {
- // We don't need a dependency if the result is reachable.
- const AAFunctionReachability &EdgeReachability =
- A.getAAFor<AAFunctionReachability>(*this, IRPosition::function(*Edge),
- DepClassTy::NONE);
+ /// Used to answer if a call base inside this function can reach a specific
+ /// function.
+ DenseMap<CallBase *, QuerySet> CBQueries;
+};
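The QuerySet above memoizes reachability answers and inserts the target into Unreachable before recursing so that recursive call graphs terminate; the Attributor's fixpoint iteration later migrates entries from Unreachable to Reachable as the edge sets grow. A standalone sketch of the same caching idea on a toy call graph (the names and types are hypothetical, not the Attributor interface):

#include <map>
#include <set>
#include <string>
#include <vector>

struct ToyReachability {
  // Toy call graph: caller name -> direct callees.
  std::map<std::string, std::vector<std::string>> Edges;
  // Per-root caches, mirroring QuerySet::Reachable / QuerySet::Unreachable.
  std::map<std::string, std::set<std::string>> Reachable, Unreachable;

  bool canReach(const std::string &From, const std::string &To) {
    if (Reachable[From].count(To))
      return true;
    if (Unreachable[From].count(To))
      return false;
    // Tentatively assume 'To' is unreachable so a cycle through 'From'
    // terminates instead of recursing forever.
    Unreachable[From].insert(To);
    for (const std::string &Callee : Edges[From]) {
      if (Callee == To || canReach(Callee, To)) {
        Unreachable[From].erase(To);
        Reachable[From].insert(To);
        return true;
      }
    }
    return false; // stays cached as unreachable until a later update
  }
};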
- if (EdgeReachability.canReach(A, Fn))
- return true;
- }
- for (Function *Fn : Edges)
- A.getAAFor<AAFunctionReachability>(*this, IRPosition::function(*Fn),
- DepClassTy::REQUIRED);
+/// ---------------------- Assumption Propagation ------------------------------
+struct AAAssumptionInfoImpl : public AAAssumptionInfo {
+ AAAssumptionInfoImpl(const IRPosition &IRP, Attributor &A,
+ const DenseSet<StringRef> &Known)
+ : AAAssumptionInfo(IRP, A, Known) {}
- return false;
+ bool hasAssumption(const StringRef Assumption) const override {
+ return isValidState() && setContains(Assumption);
+ }
+
+ /// See AbstractAttribute::getAsStr()
+ const std::string getAsStr() const override {
+ const SetContents &Known = getKnown();
+ const SetContents &Assumed = getAssumed();
+
+ const std::string KnownStr =
+ llvm::join(Known.getSet().begin(), Known.getSet().end(), ",");
+ const std::string AssumedStr =
+ (Assumed.isUniversal())
+ ? "Universal"
+ : llvm::join(Assumed.getSet().begin(), Assumed.getSet().end(), ",");
+
+ return "Known [" + KnownStr + "]," + " Assumed [" + AssumedStr + "]";
+ }
+};
+
+/// Propagates assumption information from parent functions to all of their
+/// successors. An assumption can be propagated if the containing function
+/// dominates the called function.
+///
+/// We start with a "known" set of assumptions already valid for the associated
+/// function and an "assumed" set that initially contains all possible
+/// assumptions. The assumed set is inter-procedurally updated by narrowing its
+/// contents as concrete values are known. The concrete values are seeded by the
+/// first nodes that are either entries into the call graph or contain no
+/// assumptions. Each node is updated as the intersection of the assumed state
+/// with all of its predecessors.
+struct AAAssumptionInfoFunction final : AAAssumptionInfoImpl {
+ AAAssumptionInfoFunction(const IRPosition &IRP, Attributor &A)
+ : AAAssumptionInfoImpl(IRP, A,
+ getAssumptions(*IRP.getAssociatedFunction())) {}
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ const auto &Assumptions = getKnown();
+
+ // Don't manifest a universal set if it somehow made it here.
+ if (Assumptions.isUniversal())
+ return ChangeStatus::UNCHANGED;
+
+ Function *AssociatedFunction = getAssociatedFunction();
+
+ bool Changed = addAssumptions(*AssociatedFunction, Assumptions.getSet());
+
+ return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ bool Changed = false;
+
+ auto CallSitePred = [&](AbstractCallSite ACS) {
+ const auto &AssumptionAA = A.getAAFor<AAAssumptionInfo>(
+ *this, IRPosition::callsite_function(*ACS.getInstruction()),
+ DepClassTy::REQUIRED);
+ // Get the set of assumptions shared by all of this function's callers.
+ Changed |= getIntersection(AssumptionAA.getAssumed());
+ return !getAssumed().empty() || !getKnown().empty();
+ };
+
+ bool AllCallSitesKnown;
+ // Get the intersection of all assumptions held by this node's predecessors.
+ // If we don't know all the call sites then this is either an entry into the
+ // call graph or an empty node. This node is known to only contain its own
+ // assumptions and can be propagated to its successors.
+ if (!A.checkForAllCallSites(CallSitePred, *this, true, AllCallSitesKnown))
+ return indicatePessimisticFixpoint();
+
+ return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+ }
+
+ void trackStatistics() const override {}
+};
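A compact sketch of the narrowing described in the comment above: the assumed set starts universal and is intersected with the assumptions of every caller. This is simplified to plain string sets; the helper name is made up for illustration.

#include <optional>
#include <set>
#include <string>
#include <vector>

using AssumptionSet = std::set<std::string>;

// nullopt plays the role of the "universal" assumed set.
AssumptionSet intersectCallerAssumptions(
    const std::vector<AssumptionSet> &Callers) {
  std::optional<AssumptionSet> Assumed;
  for (const AssumptionSet &C : Callers) {
    if (!Assumed) {
      Assumed = C; // first caller seeds the concrete set
      continue;
    }
    AssumptionSet Narrowed;
    for (const std::string &S : *Assumed)
      if (C.count(S))
        Narrowed.insert(S);
    *Assumed = std::move(Narrowed);
  }
  return Assumed.value_or(AssumptionSet{});
}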
+
+/// Assumption Info defined for call sites.
+struct AAAssumptionInfoCallSite final : AAAssumptionInfoImpl {
+
+ AAAssumptionInfoCallSite(const IRPosition &IRP, Attributor &A)
+ : AAAssumptionInfoImpl(IRP, A, getInitialAssumptions(IRP)) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ const IRPosition &FnPos = IRPosition::function(*getAnchorScope());
+ A.getAAFor<AAAssumptionInfo>(*this, FnPos, DepClassTy::REQUIRED);
}
- /// Set of functions that we know for sure is reachable.
- SmallPtrSet<Function *, 8> ReachableQueries;
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ // Don't manifest a universal set if it somehow made it here.
+ if (getKnown().isUniversal())
+ return ChangeStatus::UNCHANGED;
+
+ CallBase &AssociatedCall = cast<CallBase>(getAssociatedValue());
+ bool Changed = addAssumptions(AssociatedCall, getAssumed().getSet());
+
+ return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ const IRPosition &FnPos = IRPosition::function(*getAnchorScope());
+ auto &AssumptionAA =
+ A.getAAFor<AAAssumptionInfo>(*this, FnPos, DepClassTy::REQUIRED);
+ bool Changed = getIntersection(AssumptionAA.getAssumed());
+ return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+ }
- /// Set of functions that are unreachable, but might become reachable.
- SmallPtrSet<Function *, 8> UnreachableQueries;
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
- /// If we can reach a function with a call to a unknown function we assume
- /// that we can reach any function.
- bool CanReachUnknownCallee = false;
+private:
+ /// Helper to initialize the known set to all the assumptions this call and
+ /// the callee contain.
+ DenseSet<StringRef> getInitialAssumptions(const IRPosition &IRP) {
+ const CallBase &CB = cast<CallBase>(IRP.getAssociatedValue());
+ auto Assumptions = getAssumptions(CB);
+ if (Function *F = IRP.getAssociatedFunction())
+ set_union(Assumptions, getAssumptions(*F));
+ return Assumptions;
+ }
};
} // namespace
@@ -9559,6 +9812,7 @@ const char AANoUndef::ID = 0;
const char AACallEdges::ID = 0;
const char AAFunctionReachability::ID = 0;
const char AAPointerInfo::ID = 0;
+const char AAAssumptionInfo::ID = 0;
// Macro magic to create the static generator function for attributes that
// follow the naming scheme.
@@ -9660,6 +9914,8 @@ CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAWillReturn)
CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoReturn)
CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReturnedValues)
CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryLocation)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AACallEdges)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAssumptionInfo)
CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANonNull)
CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoAlias)
@@ -9679,7 +9935,6 @@ CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoFree)
CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAHeapToStack)
CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReachability)
CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAUndefinedBehavior)
-CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AACallEdges)
CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAFunctionReachability)
CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryBehavior)
diff --git a/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/llvm/lib/Transforms/IPO/ConstantMerge.cpp
index 8e81f4bad4af..178d3f41963e 100644
--- a/llvm/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/llvm/lib/Transforms/IPO/ConstantMerge.cpp
@@ -153,33 +153,30 @@ static bool mergeConstants(Module &M) {
// were just merged.
while (true) {
// Find the canonical constants others will be merged with.
- for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
- GVI != E; ) {
- GlobalVariable *GV = &*GVI++;
-
+ for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) {
// If this GV is dead, remove it.
- GV->removeDeadConstantUsers();
- if (GV->use_empty() && GV->hasLocalLinkage()) {
- GV->eraseFromParent();
+ GV.removeDeadConstantUsers();
+ if (GV.use_empty() && GV.hasLocalLinkage()) {
+ GV.eraseFromParent();
++ChangesMade;
continue;
}
- if (isUnmergeableGlobal(GV, UsedGlobals))
+ if (isUnmergeableGlobal(&GV, UsedGlobals))
continue;
// This transformation is legal for weak ODR globals in the sense it
// doesn't change semantics, but we really don't want to perform it
// anyway; it's likely to pessimize code generation, and some tools
// (like the Darwin linker in cases involving CFString) don't expect it.
- if (GV->isWeakForLinker())
+ if (GV.isWeakForLinker())
continue;
// Don't touch globals with metadata other than !dbg.
- if (hasMetadataOtherThanDebugLoc(GV))
+ if (hasMetadataOtherThanDebugLoc(&GV))
continue;
- Constant *Init = GV->getInitializer();
+ Constant *Init = GV.getInitializer();
// Check to see if the initializer is already known.
GlobalVariable *&Slot = CMap[Init];
@@ -188,9 +185,9 @@ static bool mergeConstants(Module &M) {
// replace with the current one. If the current is externally visible
// it cannot be replaced, but can be the canonical constant we merge with.
bool FirstConstantFound = !Slot;
- if (FirstConstantFound || IsBetterCanonical(*GV, *Slot)) {
- Slot = GV;
- LLVM_DEBUG(dbgs() << "Cmap[" << *Init << "] = " << GV->getName()
+ if (FirstConstantFound || IsBetterCanonical(GV, *Slot)) {
+ Slot = &GV;
+ LLVM_DEBUG(dbgs() << "Cmap[" << *Init << "] = " << GV.getName()
<< (FirstConstantFound ? "\n" : " (updated)\n"));
}
}
@@ -199,18 +196,15 @@ static bool mergeConstants(Module &M) {
// SameContentReplacements vector. We cannot do the replacement in this pass
// because doing so may cause initializers of other globals to be rewritten,
// invalidating the Constant* pointers in CMap.
- for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
- GVI != E; ) {
- GlobalVariable *GV = &*GVI++;
-
- if (isUnmergeableGlobal(GV, UsedGlobals))
+ for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) {
+ if (isUnmergeableGlobal(&GV, UsedGlobals))
continue;
// We can only replace constant with local linkage.
- if (!GV->hasLocalLinkage())
+ if (!GV.hasLocalLinkage())
continue;
- Constant *Init = GV->getInitializer();
+ Constant *Init = GV.getInitializer();
// Check to see if the initializer is already known.
auto Found = CMap.find(Init);
@@ -218,16 +212,16 @@ static bool mergeConstants(Module &M) {
continue;
GlobalVariable *Slot = Found->second;
- if (Slot == GV)
+ if (Slot == &GV)
continue;
- if (makeMergeable(GV, Slot) == CanMerge::No)
+ if (makeMergeable(&GV, Slot) == CanMerge::No)
continue;
// Make all uses of the duplicate constant use the canonical version.
- LLVM_DEBUG(dbgs() << "Will replace: @" << GV->getName() << " -> @"
+ LLVM_DEBUG(dbgs() << "Will replace: @" << GV.getName() << " -> @"
<< Slot->getName() << "\n");
- SameContentReplacements.push_back(std::make_pair(GV, Slot));
+ SameContentReplacements.push_back(std::make_pair(&GV, Slot));
}
// Now that we have figured out which replacements must be made, do them all
diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index d95fd55870f8..fb9ab7954e36 100644
--- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -175,8 +175,8 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
// to pass in a smaller number of arguments into the new function.
//
std::vector<Value *> Args;
- for (Value::user_iterator I = Fn.user_begin(), E = Fn.user_end(); I != E; ) {
- CallBase *CB = dyn_cast<CallBase>(*I++);
+ for (User *U : llvm::make_early_inc_range(Fn.users())) {
+ CallBase *CB = dyn_cast<CallBase>(U);
if (!CB)
continue;
@@ -188,9 +188,9 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
if (!PAL.isEmpty()) {
SmallVector<AttributeSet, 8> ArgAttrs;
for (unsigned ArgNo = 0; ArgNo < NumArgs; ++ArgNo)
- ArgAttrs.push_back(PAL.getParamAttributes(ArgNo));
- PAL = AttributeList::get(Fn.getContext(), PAL.getFnAttributes(),
- PAL.getRetAttributes(), ArgAttrs);
+ ArgAttrs.push_back(PAL.getParamAttrs(ArgNo));
+ PAL = AttributeList::get(Fn.getContext(), PAL.getFnAttrs(),
+ PAL.getRetAttrs(), ArgAttrs);
}
SmallVector<OperandBundleDef, 1> OpBundles;
@@ -762,8 +762,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
if (LiveValues.erase(Arg)) {
Params.push_back(I->getType());
ArgAlive[ArgI] = true;
- ArgAttrVec.push_back(PAL.getParamAttributes(ArgI));
- HasLiveReturnedArg |= PAL.hasParamAttribute(ArgI, Attribute::Returned);
+ ArgAttrVec.push_back(PAL.getParamAttrs(ArgI));
+ HasLiveReturnedArg |= PAL.hasParamAttr(ArgI, Attribute::Returned);
} else {
++NumArgumentsEliminated;
LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument "
@@ -838,7 +838,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
assert(NRetTy && "No new return type found?");
// The existing function return attributes.
- AttrBuilder RAttrs(PAL.getRetAttributes());
+ AttrBuilder RAttrs(PAL.getRetAttrs());
// Remove any incompatible attributes, but only if we removed all return
// values. Otherwise, ensure that we don't have any conflicting attributes
@@ -853,8 +853,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
// Strip allocsize attributes. They might refer to the deleted arguments.
- AttributeSet FnAttrs = PAL.getFnAttributes().removeAttribute(
- F->getContext(), Attribute::AllocSize);
+ AttributeSet FnAttrs =
+ PAL.getFnAttrs().removeAttribute(F->getContext(), Attribute::AllocSize);
// Reconstruct the AttributesList based on the vector we constructed.
assert(ArgAttrVec.size() == Params.size());
@@ -889,7 +889,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
// Adjust the call return attributes in case the function was changed to
// return void.
- AttrBuilder RAttrs(CallPAL.getRetAttributes());
+ AttrBuilder RAttrs(CallPAL.getRetAttrs());
RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
@@ -903,7 +903,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
if (ArgAlive[Pi]) {
Args.push_back(*I);
// Get original parameter attributes, but skip return attributes.
- AttributeSet Attrs = CallPAL.getParamAttributes(Pi);
+ AttributeSet Attrs = CallPAL.getParamAttrs(Pi);
if (NRetTy != RetTy && Attrs.hasAttribute(Attribute::Returned)) {
// If the return type has changed, then get rid of 'returned' on the
// call site. The alternative is to make all 'returned' attributes on
@@ -922,7 +922,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
// Push any varargs arguments on the list. Don't forget their attributes.
for (auto E = CB.arg_end(); I != E; ++I, ++Pi) {
Args.push_back(*I);
- ArgAttrVec.push_back(CallPAL.getParamAttributes(Pi));
+ ArgAttrVec.push_back(CallPAL.getParamAttrs(Pi));
}
// Reconstruct the AttributesList based on the vector we constructed.
@@ -930,7 +930,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
// Again, be sure to remove any allocsize attributes, since their indices
// may now be incorrect.
- AttributeSet FnAttrs = CallPAL.getFnAttributes().removeAttribute(
+ AttributeSet FnAttrs = CallPAL.getFnAttrs().removeAttribute(
F->getContext(), Attribute::AllocSize);
AttributeList NewCallPAL = AttributeList::get(
@@ -1094,11 +1094,9 @@ PreservedAnalyses DeadArgumentEliminationPass::run(Module &M,
// fused with the next loop, because deleting a function invalidates
// information computed while surveying other functions.
LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Deleting dead varargs\n");
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
- Function &F = *I++;
+ for (Function &F : llvm::make_early_inc_range(M))
if (F.getFunctionType()->isVarArg())
Changed |= DeleteDeadVarargs(F);
- }
// Second phase: loop through the module, determining which arguments are live.
// We assume all arguments are dead unless proven otherwise (allowing us to
@@ -1109,13 +1107,10 @@ PreservedAnalyses DeadArgumentEliminationPass::run(Module &M,
SurveyFunction(F);
// Now, remove all dead arguments and return values from each function in
- // turn.
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
- // Increment now, because the function will probably get removed (ie.
- // replaced by a new one).
- Function *F = &*I++;
- Changed |= RemoveDeadStuffFromFunction(F);
- }
+ // turn. We use make_early_inc_range here because functions will probably get
+ // removed (i.e. replaced by new ones).
+ for (Function &F : llvm::make_early_inc_range(M))
+ Changed |= RemoveDeadStuffFromFunction(&F);
// Finally, look for any unused parameters in functions with non-local
// linkage and replace the passed in parameters with undef.
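The loops above switch to llvm::make_early_inc_range because the current element may be erased or replaced mid-iteration. The same idiom, shown without LLVM's ADT on a plain std::list (a hypothetical helper, purely illustrative):

#include <list>
#include <string>

// Advance the iterator before possibly erasing the current element, so the
// erase never invalidates the iterator the loop continues with.
void dropEmptyNames(std::list<std::string> &Names) {
  for (auto It = Names.begin(), End = Names.end(); It != End;) {
    auto Cur = It++; // increment first ...
    if (Cur->empty())
      Names.erase(Cur); // ... so erasing *Cur is safe
  }
}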
diff --git a/llvm/lib/Transforms/IPO/ExtractGV.cpp b/llvm/lib/Transforms/IPO/ExtractGV.cpp
index ba0efd46af16..387f114f6ffa 100644
--- a/llvm/lib/Transforms/IPO/ExtractGV.cpp
+++ b/llvm/lib/Transforms/IPO/ExtractGV.cpp
@@ -121,32 +121,27 @@ namespace {
}
// Visit the Aliases.
- for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
- I != E;) {
- Module::alias_iterator CurI = I;
- ++I;
-
- bool Delete = deleteStuff == (bool)Named.count(&*CurI);
- makeVisible(*CurI, Delete);
+ for (GlobalAlias &GA : llvm::make_early_inc_range(M.aliases())) {
+ bool Delete = deleteStuff == (bool)Named.count(&GA);
+ makeVisible(GA, Delete);
if (Delete) {
- Type *Ty = CurI->getValueType();
+ Type *Ty = GA.getValueType();
- CurI->removeFromParent();
+ GA.removeFromParent();
llvm::Value *Declaration;
if (FunctionType *FTy = dyn_cast<FunctionType>(Ty)) {
- Declaration = Function::Create(FTy, GlobalValue::ExternalLinkage,
- CurI->getAddressSpace(),
- CurI->getName(), &M);
+ Declaration =
+ Function::Create(FTy, GlobalValue::ExternalLinkage,
+ GA.getAddressSpace(), GA.getName(), &M);
} else {
Declaration =
- new GlobalVariable(M, Ty, false, GlobalValue::ExternalLinkage,
- nullptr, CurI->getName());
-
+ new GlobalVariable(M, Ty, false, GlobalValue::ExternalLinkage,
+ nullptr, GA.getName());
}
- CurI->replaceAllUsesWith(Declaration);
- delete &*CurI;
+ GA.replaceAllUsesWith(Declaration);
+ delete &GA;
}
}
diff --git a/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp
index 47fdf042f9d4..16d00a0c89e1 100644
--- a/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp
@@ -50,14 +50,14 @@ static void forceAttributes(Function &F) {
return Kind;
};
- for (auto &S : ForceAttributes) {
+ for (const auto &S : ForceAttributes) {
auto Kind = ParseFunctionAndAttr(S);
if (Kind == Attribute::None || F.hasFnAttribute(Kind))
continue;
F.addFnAttr(Kind);
}
- for (auto &S : ForceRemoveAttributes) {
+ for (const auto &S : ForceRemoveAttributes) {
auto Kind = ParseFunctionAndAttr(S);
if (Kind == Attribute::None || !F.hasFnAttribute(Kind))
continue;
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index ca8660a98ded..cde78713b554 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -14,10 +14,12 @@
#include "llvm/Transforms/IPO/FunctionAttrs.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
@@ -82,6 +84,11 @@ STATISTIC(NumNoFree, "Number of functions marked as nofree");
STATISTIC(NumWillReturn, "Number of functions marked as willreturn");
STATISTIC(NumNoSync, "Number of functions marked as nosync");
+STATISTIC(NumThinLinkNoRecurse,
+ "Number of functions marked as norecurse during thinlink");
+STATISTIC(NumThinLinkNoUnwind,
+ "Number of functions marked as nounwind during thinlink");
+
static cl::opt<bool> EnableNonnullArgPropagation(
"enable-nonnull-arg-prop", cl::init(true), cl::Hidden,
cl::desc("Try to propagate nonnull argument attributes from callsites to "
@@ -95,6 +102,10 @@ static cl::opt<bool> DisableNoFreeInference(
"disable-nofree-inference", cl::Hidden,
cl::desc("Stop inferring nofree attribute during function-attrs pass"));
+static cl::opt<bool> DisableThinLTOPropagation(
+ "disable-thinlto-funcattrs", cl::init(true), cl::Hidden,
+ cl::desc("Don't propagate function-attrs in thinLTO"));
+
namespace {
using SCCNodeSet = SmallSetVector<Function *, 8>;
@@ -131,12 +142,10 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
// Scan the function body for instructions that may read or write memory.
bool ReadsMemory = false;
bool WritesMemory = false;
- for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
- Instruction *I = &*II;
-
+ for (Instruction &I : instructions(F)) {
// Some instructions can be ignored even if they read or write memory.
// Detect these now, skipping to the next instruction if one is found.
- if (auto *Call = dyn_cast<CallBase>(I)) {
+ if (auto *Call = dyn_cast<CallBase>(&I)) {
// Ignore calls to functions in the same SCC, as long as the call sites
// don't have operand bundles. Calls with operand bundles are allowed to
// have memory effects not described by the memory effects of the call
@@ -170,14 +179,13 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
// Check whether all pointer arguments point to local memory, and
// ignore calls that only access local memory.
- for (auto CI = Call->arg_begin(), CE = Call->arg_end(); CI != CE; ++CI) {
- Value *Arg = *CI;
+ for (const Use &U : Call->args()) {
+ const Value *Arg = U;
if (!Arg->getType()->isPtrOrPtrVectorTy())
continue;
- AAMDNodes AAInfo;
- I->getAAMetadata(AAInfo);
- MemoryLocation Loc = MemoryLocation::getBeforeOrAfter(Arg, AAInfo);
+ MemoryLocation Loc =
+ MemoryLocation::getBeforeOrAfter(Arg, I.getAAMetadata());
// Skip accesses to local or constant memory as they don't impact the
// externally visible mod/ref behavior.
@@ -192,21 +200,21 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
ReadsMemory = true;
}
continue;
- } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
// Ignore non-volatile loads from local memory. (Atomic is okay here.)
if (!LI->isVolatile()) {
MemoryLocation Loc = MemoryLocation::get(LI);
if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
continue;
}
- } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
// Ignore non-volatile stores to local memory. (Atomic is okay here.)
if (!SI->isVolatile()) {
MemoryLocation Loc = MemoryLocation::get(SI);
if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
continue;
}
- } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) {
+ } else if (VAArgInst *VI = dyn_cast<VAArgInst>(&I)) {
// Ignore vaargs on local memory.
MemoryLocation Loc = MemoryLocation::get(VI);
if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
@@ -217,10 +225,10 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
// read or write memory.
//
// Writes memory, remember that.
- WritesMemory |= I->mayWriteToMemory();
+ WritesMemory |= I.mayWriteToMemory();
// If this instruction may read memory, remember that.
- ReadsMemory |= I->mayReadFromMemory();
+ ReadsMemory |= I.mayReadFromMemory();
}
if (WritesMemory) {
@@ -240,7 +248,8 @@ MemoryAccessKind llvm::computeFunctionBodyMemoryAccess(Function &F,
/// Deduce readonly/readnone attributes for the SCC.
template <typename AARGetterT>
-static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) {
+static void addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter,
+ SmallSet<Function *, 8> &Changed) {
// Check if any of the functions in the SCC read or write memory. If they
// write memory then they can't be marked readnone or readonly.
bool ReadsMemory = false;
@@ -255,7 +264,7 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) {
switch (checkFunctionMemoryAccess(*F, F->hasExactDefinition(),
AAR, SCCNodes)) {
case MAK_MayWrite:
- return false;
+ return;
case MAK_ReadOnly:
ReadsMemory = true;
break;
@@ -271,11 +280,10 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) {
// If the SCC contains both functions that read and functions that write, then
// we cannot add readonly attributes.
if (ReadsMemory && WritesMemory)
- return false;
+ return;
// Success! Functions in this SCC do not access memory, or only read memory.
// Give them the appropriate attribute.
- bool MadeChange = false;
for (Function *F : SCCNodes) {
if (F->doesNotAccessMemory())
@@ -289,7 +297,7 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) {
if (F->doesNotReadMemory() && WritesMemory)
continue;
- MadeChange = true;
+ Changed.insert(F);
// Clear out any existing attributes.
AttrBuilder AttrsToRemove;
@@ -303,7 +311,7 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) {
AttrsToRemove.addAttribute(Attribute::InaccessibleMemOnly);
AttrsToRemove.addAttribute(Attribute::InaccessibleMemOrArgMemOnly);
}
- F->removeAttributes(AttributeList::FunctionIndex, AttrsToRemove);
+ F->removeFnAttrs(AttrsToRemove);
// Add in the new attribute.
if (WritesMemory && !ReadsMemory)
@@ -318,8 +326,195 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) {
else
++NumReadNone;
}
+}
+
+// Compute definitive function attributes for a function taking into account
+// prevailing definitions and linkage types
+static FunctionSummary *calculatePrevailingSummary(
+ ValueInfo VI,
+ DenseMap<ValueInfo, FunctionSummary *> &CachedPrevailingSummary,
+ function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
+ IsPrevailing) {
+
+ if (CachedPrevailingSummary.count(VI))
+ return CachedPrevailingSummary[VI];
+
+ /// At this point, prevailing symbols have been resolved. The following leads
+ /// to returning a conservative result:
+ /// - Multiple instances with local linkage. Normally local linkage would be
+ /// unique per module
+ /// as the GUID includes the module path. We could have a guid alias if
+ /// there wasn't any distinguishing path when each file was compiled, but
+ /// that should be rare so we'll punt on those.
+
+ /// These next 2 cases should not happen and will assert:
+ /// - Multiple instances with external linkage. This should be caught in
+ /// symbol resolution
+ /// - Non-existent FunctionSummary for Aliasee. This presents a hole in our
+ /// knowledge meaning we have to go conservative.
+
+ /// Otherwise, we calculate attributes for a function as:
+ /// 1. If we have a local linkage, take its attributes. If there's somehow
+ /// multiple, bail and go conservative.
+ /// 2. If we have an external/WeakODR/LinkOnceODR linkage check that it is
+ /// prevailing, take its attributes.
+ /// 3. If we have a Weak/LinkOnce linkage the copies can have semantic
+ /// differences. However, if the prevailing copy is known it will be used
+ /// so take its attributes. If the prevailing copy is in a native file
+ /// all IR copies will be dead and propagation will go conservative.
+ /// 4. AvailableExternally summaries without a prevailing copy are known to
+ /// occur in a couple of circumstances:
+ /// a. An internal function gets imported due to its caller getting
+ /// imported, it becomes AvailableExternally but no prevailing
+ /// definition exists. Because it has to get imported along with its
+ /// caller the attributes will be captured by propagating on its
+ /// caller.
+ /// b. C++11 [temp.explicit]p10 can generate AvailableExternally
+ /// definitions of explicitly instantiated template declarations
+ /// for inlining which are ultimately dropped from the TU. Since this
+ /// is localized to the TU the attributes will have already made it to
+ /// the callers.
+ /// These are edge cases and already captured by their callers so we
+ /// ignore these for now. If they become relevant to optimize in the
+ /// future this can be revisited.
+ /// 5. Otherwise, go conservative.
+
+ CachedPrevailingSummary[VI] = nullptr;
+ FunctionSummary *Local = nullptr;
+ FunctionSummary *Prevailing = nullptr;
+
+ for (const auto &GVS : VI.getSummaryList()) {
+ if (!GVS->isLive())
+ continue;
+
+ FunctionSummary *FS = dyn_cast<FunctionSummary>(GVS->getBaseObject());
+ // Virtual and Unknown (e.g. indirect) calls require going conservative
+ if (!FS || FS->fflags().HasUnknownCall)
+ return nullptr;
+
+ const auto &Linkage = GVS->linkage();
+ if (GlobalValue::isLocalLinkage(Linkage)) {
+ if (Local) {
+ LLVM_DEBUG(
+ dbgs()
+ << "ThinLTO FunctionAttrs: Multiple Local Linkage, bailing on "
+ "function "
+ << VI.name() << " from " << FS->modulePath() << ". Previous module "
+ << Local->modulePath() << "\n");
+ return nullptr;
+ }
+ Local = FS;
+ } else if (GlobalValue::isExternalLinkage(Linkage)) {
+ assert(IsPrevailing(VI.getGUID(), GVS.get()));
+ Prevailing = FS;
+ break;
+ } else if (GlobalValue::isWeakODRLinkage(Linkage) ||
+ GlobalValue::isLinkOnceODRLinkage(Linkage) ||
+ GlobalValue::isWeakAnyLinkage(Linkage) ||
+ GlobalValue::isLinkOnceAnyLinkage(Linkage)) {
+ if (IsPrevailing(VI.getGUID(), GVS.get())) {
+ Prevailing = FS;
+ break;
+ }
+ } else if (GlobalValue::isAvailableExternallyLinkage(Linkage)) {
+ // TODO: Handle these cases if they become meaningful
+ continue;
+ }
+ }
+
+ if (Local) {
+ assert(!Prevailing);
+ CachedPrevailingSummary[VI] = Local;
+ } else if (Prevailing) {
+ assert(!Local);
+ CachedPrevailingSummary[VI] = Prevailing;
+ }
- return MadeChange;
+ return CachedPrevailingSummary[VI];
+}
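The rules spelled out in the comment block above boil down to a small decision procedure. Below is a simplified restatement with made-up types (not the ThinLTO summary API), useful only to see the control flow.

#include <cstddef>
#include <optional>
#include <vector>

enum class Linkage {
  Local, External, WeakODR, LinkOnceODR, WeakAny, LinkOnceAny,
  AvailableExternally
};

struct ToyCopy {
  Linkage L;
  bool IsPrevailing;
  bool HasUnknownCall; // virtual or indirect call in the body
};

// Returns the index of the copy whose attributes may be trusted, or nullopt
// to signal "go conservative".
std::optional<std::size_t> pickCopy(const std::vector<ToyCopy> &Copies) {
  std::optional<std::size_t> Local, Prevailing;
  for (std::size_t I = 0; I < Copies.size(); ++I) {
    const ToyCopy &C = Copies[I];
    if (C.HasUnknownCall)
      return std::nullopt;              // unknown callee: bail
    if (C.L == Linkage::Local) {
      if (Local)
        return std::nullopt;            // multiple local copies: bail
      Local = I;
    } else if (C.L == Linkage::AvailableExternally) {
      continue;                         // attributes captured via the caller
    } else if (C.IsPrevailing) {
      Prevailing = I;                   // external/ODR/weak prevailing copy
      break;
    }
  }
  return Local ? Local : Prevailing;
}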
+
+bool llvm::thinLTOPropagateFunctionAttrs(
+ ModuleSummaryIndex &Index,
+ function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
+ IsPrevailing) {
+ // TODO: implement addNoAliasAttrs once
+ // there's more information about the return type in the summary
+ if (DisableThinLTOPropagation)
+ return false;
+
+ DenseMap<ValueInfo, FunctionSummary *> CachedPrevailingSummary;
+ bool Changed = false;
+
+ auto PropagateAttributes = [&](std::vector<ValueInfo> &SCCNodes) {
+ // Assume we can propagate unless we discover otherwise
+ FunctionSummary::FFlags InferredFlags;
+ InferredFlags.NoRecurse = (SCCNodes.size() == 1);
+ InferredFlags.NoUnwind = true;
+
+ for (auto &V : SCCNodes) {
+ FunctionSummary *CallerSummary =
+ calculatePrevailingSummary(V, CachedPrevailingSummary, IsPrevailing);
+
+ // Function summaries can fail to contain information such as declarations
+ if (!CallerSummary)
+ return;
+
+ if (CallerSummary->fflags().MayThrow)
+ InferredFlags.NoUnwind = false;
+
+ for (const auto &Callee : CallerSummary->calls()) {
+ FunctionSummary *CalleeSummary = calculatePrevailingSummary(
+ Callee.first, CachedPrevailingSummary, IsPrevailing);
+
+ if (!CalleeSummary)
+ return;
+
+ if (!CalleeSummary->fflags().NoRecurse)
+ InferredFlags.NoRecurse = false;
+
+ if (!CalleeSummary->fflags().NoUnwind)
+ InferredFlags.NoUnwind = false;
+
+ if (!InferredFlags.NoUnwind && !InferredFlags.NoRecurse)
+ break;
+ }
+ }
+
+ if (InferredFlags.NoUnwind || InferredFlags.NoRecurse) {
+ Changed = true;
+ for (auto &V : SCCNodes) {
+ if (InferredFlags.NoRecurse) {
+ LLVM_DEBUG(dbgs() << "ThinLTO FunctionAttrs: Propagated NoRecurse to "
+ << V.name() << "\n");
+ ++NumThinLinkNoRecurse;
+ }
+
+ if (InferredFlags.NoUnwind) {
+ LLVM_DEBUG(dbgs() << "ThinLTO FunctionAttrs: Propagated NoUnwind to "
+ << V.name() << "\n");
+ ++NumThinLinkNoUnwind;
+ }
+
+ for (auto &S : V.getSummaryList()) {
+ if (auto *FS = dyn_cast<FunctionSummary>(S.get())) {
+ if (InferredFlags.NoRecurse)
+ FS->setNoRecurse();
+
+ if (InferredFlags.NoUnwind)
+ FS->setNoUnwind();
+ }
+ }
+ }
+ }
+ };
+
+ // Call propagation functions on each SCC in the Index
+ for (scc_iterator<ModuleSummaryIndex *> I = scc_begin(&Index); !I.isAtEnd();
+ ++I) {
+ std::vector<ValueInfo> Nodes(*I);
+ PropagateAttributes(Nodes);
+ }
+ return Changed;
}
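The PropagateAttributes lambda above starts optimistic per SCC and clears a flag as soon as any member or callee refutes it. The core update, reduced to a free function over stand-in flags (not FunctionSummary::FFlags):

#include <vector>

struct ToyFlags {
  bool NoRecurse = true;
  bool NoUnwind = true;
};

ToyFlags inferSCCFlags(bool SingleNodeSCC, bool AnyMemberMayThrow,
                       const std::vector<ToyFlags> &CalleeFlags) {
  ToyFlags Inferred;
  Inferred.NoRecurse = SingleNodeSCC;     // a multi-node SCC is recursive
  Inferred.NoUnwind = !AnyMemberMayThrow; // a throwing member refutes nounwind
  for (const ToyFlags &F : CalleeFlags) {
    Inferred.NoRecurse &= F.NoRecurse;
    Inferred.NoUnwind &= F.NoUnwind;
    if (!Inferred.NoRecurse && !Inferred.NoUnwind)
      break;                              // both refuted, nothing left to learn
  }
  return Inferred;
}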
namespace {
@@ -395,7 +590,7 @@ struct ArgumentUsesTracker : public CaptureTracker {
assert(UseIndex < CB->data_operands_size() &&
"Indirect function calls should have been filtered above!");
- if (UseIndex >= CB->getNumArgOperands()) {
+ if (UseIndex >= CB->arg_size()) {
// Data operand, but not an argument operand -- must be a bundle operand
assert(CB->hasOperandBundles() && "Must be!");
@@ -530,7 +725,7 @@ determinePointerReadAttrs(Argument *A,
assert(UseIndex < CB.data_operands_size() &&
"Data operand use expected!");
- bool IsOperandBundleUse = UseIndex >= CB.getNumArgOperands();
+ bool IsOperandBundleUse = UseIndex >= CB.arg_size();
if (UseIndex >= F->arg_size() && !IsOperandBundleUse) {
assert(F->isVarArg() && "More params than args in non-varargs call");
@@ -581,9 +776,8 @@ determinePointerReadAttrs(Argument *A,
}
/// Deduce returned attributes for the SCC.
-static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) {
- bool Changed = false;
-
+static void addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes,
+ SmallSet<Function *, 8> &Changed) {
// Check each function in turn, determining if an argument is always returned.
for (Function *F : SCCNodes) {
// We can infer and propagate function attributes only when we know that the
@@ -623,11 +817,9 @@ static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) {
auto *A = cast<Argument>(RetArg);
A->addAttr(Attribute::Returned);
++NumReturned;
- Changed = true;
+ Changed.insert(F);
}
}
-
- return Changed;
}
/// If a callsite has arguments that are also arguments to the parent function,
@@ -693,9 +885,8 @@ static bool addReadAttr(Argument *A, Attribute::AttrKind R) {
}
/// Deduce nocapture attributes for the SCC.
-static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
- bool Changed = false;
-
+static void addArgumentAttrs(const SCCNodeSet &SCCNodes,
+ SmallSet<Function *, 8> &Changed) {
ArgumentGraph AG;
// Check each function in turn, determining which pointer arguments are not
@@ -707,7 +898,8 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
if (!F->hasExactDefinition())
continue;
- Changed |= addArgumentAttrsFromCallsites(*F);
+ if (addArgumentAttrsFromCallsites(*F))
+ Changed.insert(F);
// Functions that are readonly (or readnone) and nounwind and don't return
// a value can't capture arguments. Don't analyze them.
@@ -718,7 +910,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {
A->addAttr(Attribute::NoCapture);
++NumNoCapture;
- Changed = true;
+ Changed.insert(F);
}
}
continue;
@@ -737,7 +929,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
// If it's trivially not captured, mark it nocapture now.
A->addAttr(Attribute::NoCapture);
++NumNoCapture;
- Changed = true;
+ Changed.insert(F);
} else {
// If it's not trivially captured and not trivially not captured,
// then it must be calling into another function in our SCC. Save
@@ -761,7 +953,8 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
Self.insert(&*A);
Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self);
if (R != Attribute::None)
- Changed = addReadAttr(A, R);
+ if (addReadAttr(A, R))
+ Changed.insert(F);
}
}
}
@@ -785,7 +978,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
Argument *A = ArgumentSCC[0]->Definition;
A->addAttr(Attribute::NoCapture);
++NumNoCapture;
- Changed = true;
+ Changed.insert(A->getParent());
}
continue;
}
@@ -827,7 +1020,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
Argument *A = ArgumentSCC[i]->Definition;
A->addAttr(Attribute::NoCapture);
++NumNoCapture;
- Changed = true;
+ Changed.insert(A->getParent());
}
// We also want to compute readonly/readnone. With a small number of false
@@ -858,12 +1051,11 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
if (ReadAttr != Attribute::None) {
for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
Argument *A = ArgumentSCC[i]->Definition;
- Changed = addReadAttr(A, ReadAttr);
+ if (addReadAttr(A, ReadAttr))
+ Changed.insert(A->getParent());
}
}
}
-
- return Changed;
}
/// Tests whether a function is "malloc-like".
@@ -934,7 +1126,8 @@ static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) {
}
/// Deduce noalias attributes for the SCC.
-static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) {
+static void addNoAliasAttrs(const SCCNodeSet &SCCNodes,
+ SmallSet<Function *, 8> &Changed) {
// Check each function in turn, determining which functions return noalias
// pointers.
for (Function *F : SCCNodes) {
@@ -946,7 +1139,7 @@ static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) {
// definition we'll get at link time is *exactly* the definition we see now.
// For more details, see GlobalValue::mayBeDerefined.
if (!F->hasExactDefinition())
- return false;
+ return;
// We annotate noalias return values, which are only applicable to
// pointer types.
@@ -954,10 +1147,9 @@ static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) {
continue;
if (!isFunctionMallocLike(F, SCCNodes))
- return false;
+ return;
}
- bool MadeChange = false;
for (Function *F : SCCNodes) {
if (F->returnDoesNotAlias() ||
!F->getReturnType()->isPointerTy())
@@ -965,10 +1157,8 @@ static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) {
F->setReturnDoesNotAlias();
++NumNoAlias;
- MadeChange = true;
+ Changed.insert(F);
}
-
- return MadeChange;
}
/// Tests whether this function is known to not return null.
@@ -1044,26 +1234,24 @@ static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes,
}
/// Deduce nonnull attributes for the SCC.
-static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
+static void addNonNullAttrs(const SCCNodeSet &SCCNodes,
+ SmallSet<Function *, 8> &Changed) {
// Speculative that all functions in the SCC return only nonnull
// pointers. We may refute this as we analyze functions.
bool SCCReturnsNonNull = true;
- bool MadeChange = false;
-
// Check each function in turn, determining which functions return nonnull
// pointers.
for (Function *F : SCCNodes) {
// Already nonnull.
- if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
- Attribute::NonNull))
+ if (F->getAttributes().hasRetAttr(Attribute::NonNull))
continue;
// We can infer and propagate function attributes only when we know that the
// definition we'll get at link time is *exactly* the definition we see now.
// For more details, see GlobalValue::mayBeDerefined.
if (!F->hasExactDefinition())
- return false;
+ return;
// We annotate nonnull return values, which are only applicable to
// pointer types.
@@ -1077,9 +1265,9 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
// which prevents us from speculating about the entire SCC
LLVM_DEBUG(dbgs() << "Eagerly marking " << F->getName()
<< " as nonnull\n");
- F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ F->addRetAttr(Attribute::NonNull);
++NumNonNullReturn;
- MadeChange = true;
+ Changed.insert(F);
}
continue;
}
@@ -1090,19 +1278,16 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
if (SCCReturnsNonNull) {
for (Function *F : SCCNodes) {
- if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
- Attribute::NonNull) ||
+ if (F->getAttributes().hasRetAttr(Attribute::NonNull) ||
!F->getReturnType()->isPointerTy())
continue;
LLVM_DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n");
- F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ F->addRetAttr(Attribute::NonNull);
++NumNonNullReturn;
- MadeChange = true;
+ Changed.insert(F);
}
}
-
- return MadeChange;
}
namespace {
@@ -1155,12 +1340,13 @@ public:
InferenceDescriptors.push_back(AttrInference);
}
- bool run(const SCCNodeSet &SCCNodes);
+ void run(const SCCNodeSet &SCCNodes, SmallSet<Function *, 8> &Changed);
};
/// Perform all the requested attribute inference actions according to the
/// attribute predicates stored before.
-bool AttributeInferer::run(const SCCNodeSet &SCCNodes) {
+void AttributeInferer::run(const SCCNodeSet &SCCNodes,
+ SmallSet<Function *, 8> &Changed) {
SmallVector<InferenceDescriptor, 4> InferInSCC = InferenceDescriptors;
// Go through all the functions in SCC and check corresponding attribute
// assumptions for each of them. Attributes that are invalid for this SCC
@@ -1169,7 +1355,7 @@ bool AttributeInferer::run(const SCCNodeSet &SCCNodes) {
// No attributes whose assumptions are still valid - done.
if (InferInSCC.empty())
- return false;
+ return;
// Check if our attributes ever need scanning/can be scanned.
llvm::erase_if(InferInSCC, [F](const InferenceDescriptor &ID) {
@@ -1212,9 +1398,8 @@ bool AttributeInferer::run(const SCCNodeSet &SCCNodes) {
}
if (InferInSCC.empty())
- return false;
+ return;
- bool Changed = false;
for (Function *F : SCCNodes)
// At this point InferInSCC contains only functions that were either:
// - explicitly skipped from scan/inference, or
@@ -1223,10 +1408,9 @@ bool AttributeInferer::run(const SCCNodeSet &SCCNodes) {
for (auto &ID : InferInSCC) {
if (ID.SkipFunction(*F))
continue;
- Changed = true;
+ Changed.insert(F);
ID.SetAttribute(*F);
}
- return Changed;
}
struct SCCNodesResult {
@@ -1243,7 +1427,7 @@ static bool InstrBreaksNonConvergent(Instruction &I,
// Breaks non-convergent assumption if CS is a convergent call to a function
// not in the SCC.
return CB && CB->isConvergent() &&
- SCCNodes.count(CB->getCalledFunction()) == 0;
+ !SCCNodes.contains(CB->getCalledFunction());
}
/// Helper for NoUnwind inference predicate InstrBreaksAttribute.
@@ -1282,7 +1466,8 @@ static bool InstrBreaksNoFree(Instruction &I, const SCCNodeSet &SCCNodes) {
/// Attempt to remove convergent function attribute when possible.
///
/// Returns true if any changes to function attributes were made.
-static bool inferConvergent(const SCCNodeSet &SCCNodes) {
+static void inferConvergent(const SCCNodeSet &SCCNodes,
+ SmallSet<Function *, 8> &Changed) {
AttributeInferer AI;
// Request to remove the convergent attribute from all functions in the SCC
@@ -1305,7 +1490,7 @@ static bool inferConvergent(const SCCNodeSet &SCCNodes) {
},
/* RequiresExactDefinition= */ false});
// Perform all the requested attribute inference actions.
- return AI.run(SCCNodes);
+ AI.run(SCCNodes, Changed);
}
/// Infer attributes from all functions in the SCC by scanning every
@@ -1314,7 +1499,8 @@ static bool inferConvergent(const SCCNodeSet &SCCNodes) {
/// - addition of NoUnwind attribute
///
/// Returns true if any changes to function attributes were made.
-static bool inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes) {
+static void inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes,
+ SmallSet<Function *, 8> &Changed) {
AttributeInferer AI;
if (!DisableNoUnwindInference)
@@ -1363,19 +1549,20 @@ static bool inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes) {
/* RequiresExactDefinition= */ true});
// Perform all the requested attribute inference actions.
- return AI.run(SCCNodes);
+ AI.run(SCCNodes, Changed);
}
-static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) {
+static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes,
+ SmallSet<Function *, 8> &Changed) {
// Try and identify functions that do not recurse.
// If the SCC contains multiple nodes we know for sure there is recursion.
if (SCCNodes.size() != 1)
- return false;
+ return;
Function *F = *SCCNodes.begin();
if (!F || !F->hasExactDefinition() || F->doesNotRecurse())
- return false;
+ return;
// If all of the calls in F are identifiable and are to norecurse functions, F
// is norecurse. This check also detects self-recursion as F is not currently
@@ -1386,7 +1573,7 @@ static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) {
Function *Callee = CB->getCalledFunction();
if (!Callee || Callee == F || !Callee->doesNotRecurse())
// Function calls a potentially recursive function.
- return false;
+ return;
}
// Every call was to a non-recursive function other than this function, and
@@ -1394,7 +1581,7 @@ static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) {
// recurse.
F->setDoesNotRecurse();
++NumNoRecurse;
- return true;
+ Changed.insert(F);
}
static bool instructionDoesNotReturn(Instruction &I) {
@@ -1412,9 +1599,8 @@ static bool basicBlockCanReturn(BasicBlock &BB) {
}
// Set the noreturn function attribute if possible.
-static bool addNoReturnAttrs(const SCCNodeSet &SCCNodes) {
- bool Changed = false;
-
+static void addNoReturnAttrs(const SCCNodeSet &SCCNodes,
+ SmallSet<Function *, 8> &Changed) {
for (Function *F : SCCNodes) {
if (!F || !F->hasExactDefinition() || F->hasFnAttribute(Attribute::Naked) ||
F->doesNotReturn())
@@ -1424,11 +1610,9 @@ static bool addNoReturnAttrs(const SCCNodeSet &SCCNodes) {
// FIXME: this doesn't handle recursion or unreachable blocks.
if (none_of(*F, basicBlockCanReturn)) {
F->setDoesNotReturn();
- Changed = true;
+ Changed.insert(F);
}
}
-
- return Changed;
}
static bool functionWillReturn(const Function &F) {
@@ -1461,19 +1645,16 @@ static bool functionWillReturn(const Function &F) {
}
// Set the willreturn function attribute if possible.
-static bool addWillReturn(const SCCNodeSet &SCCNodes) {
- bool Changed = false;
-
+static void addWillReturn(const SCCNodeSet &SCCNodes,
+ SmallSet<Function *, 8> &Changed) {
for (Function *F : SCCNodes) {
if (!F || F->willReturn() || !functionWillReturn(*F))
continue;
F->setWillReturn();
NumWillReturn++;
- Changed = true;
+ Changed.insert(F);
}
-
- return Changed;
}
// Return true if this is an atomic which has an ordering stronger than
@@ -1532,7 +1713,8 @@ static bool InstrBreaksNoSync(Instruction &I, const SCCNodeSet &SCCNodes) {
}
// Infer the nosync attribute.
-static bool addNoSyncAttr(const SCCNodeSet &SCCNodes) {
+static void addNoSyncAttr(const SCCNodeSet &SCCNodes,
+ SmallSet<Function *, 8> &Changed) {
AttributeInferer AI;
AI.registerAttrInference(AttributeInferer::InferenceDescriptor{
Attribute::NoSync,
@@ -1549,14 +1731,15 @@ static bool addNoSyncAttr(const SCCNodeSet &SCCNodes) {
++NumNoSync;
},
/* RequiresExactDefinition= */ true});
- return AI.run(SCCNodes);
+ AI.run(SCCNodes, Changed);
}
static SCCNodesResult createSCCNodeSet(ArrayRef<Function *> Functions) {
SCCNodesResult Res;
Res.HasUnknownCall = false;
for (Function *F : Functions) {
- if (!F || F->hasOptNone() || F->hasFnAttribute(Attribute::Naked)) {
+ if (!F || F->hasOptNone() || F->hasFnAttribute(Attribute::Naked) ||
+ F->isPresplitCoroutine()) {
// Treat any function we're trying not to optimize as if it were an
// indirect call and omit it from the node set used below.
Res.HasUnknownCall = true;
@@ -1582,32 +1765,33 @@ static SCCNodesResult createSCCNodeSet(ArrayRef<Function *> Functions) {
}
template <typename AARGetterT>
-static bool deriveAttrsInPostOrder(ArrayRef<Function *> Functions,
- AARGetterT &&AARGetter) {
+static SmallSet<Function *, 8>
+deriveAttrsInPostOrder(ArrayRef<Function *> Functions, AARGetterT &&AARGetter) {
SCCNodesResult Nodes = createSCCNodeSet(Functions);
- bool Changed = false;
// Bail if the SCC only contains optnone functions.
if (Nodes.SCCNodes.empty())
- return Changed;
+ return {};
+
+ SmallSet<Function *, 8> Changed;
- Changed |= addArgumentReturnedAttrs(Nodes.SCCNodes);
- Changed |= addReadAttrs(Nodes.SCCNodes, AARGetter);
- Changed |= addArgumentAttrs(Nodes.SCCNodes);
- Changed |= inferConvergent(Nodes.SCCNodes);
- Changed |= addNoReturnAttrs(Nodes.SCCNodes);
- Changed |= addWillReturn(Nodes.SCCNodes);
+ addArgumentReturnedAttrs(Nodes.SCCNodes, Changed);
+ addReadAttrs(Nodes.SCCNodes, AARGetter, Changed);
+ addArgumentAttrs(Nodes.SCCNodes, Changed);
+ inferConvergent(Nodes.SCCNodes, Changed);
+ addNoReturnAttrs(Nodes.SCCNodes, Changed);
+ addWillReturn(Nodes.SCCNodes, Changed);
// If we have no external nodes participating in the SCC, we can deduce some
// more precise attributes as well.
if (!Nodes.HasUnknownCall) {
- Changed |= addNoAliasAttrs(Nodes.SCCNodes);
- Changed |= addNonNullAttrs(Nodes.SCCNodes);
- Changed |= inferAttrsFromFunctionBodies(Nodes.SCCNodes);
- Changed |= addNoRecurseAttrs(Nodes.SCCNodes);
+ addNoAliasAttrs(Nodes.SCCNodes, Changed);
+ addNonNullAttrs(Nodes.SCCNodes, Changed);
+ inferAttrsFromFunctionBodies(Nodes.SCCNodes, Changed);
+ addNoRecurseAttrs(Nodes.SCCNodes, Changed);
}
- Changed |= addNoSyncAttr(Nodes.SCCNodes);
+ addNoSyncAttr(Nodes.SCCNodes, Changed);
// Finally, infer the maximal set of attributes from the ones we've inferred
// above. This is handling the cases where one attribute on a signature
@@ -1615,7 +1799,8 @@ static bool deriveAttrsInPostOrder(ArrayRef<Function *> Functions,
// the latter is missing (or simply less sophisticated).
for (Function *F : Nodes.SCCNodes)
if (F)
- Changed |= inferAttributesFromOthers(*F);
+ if (inferAttributesFromOthers(*F))
+ Changed.insert(F);
return Changed;
}
@@ -1638,14 +1823,35 @@ PreservedAnalyses PostOrderFunctionAttrsPass::run(LazyCallGraph::SCC &C,
Functions.push_back(&N.getFunction());
}
- if (deriveAttrsInPostOrder(Functions, AARGetter)) {
- // We have not changed the call graph or removed/added functions.
- PreservedAnalyses PA;
- PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
- return PA;
+ auto ChangedFunctions = deriveAttrsInPostOrder(Functions, AARGetter);
+ if (ChangedFunctions.empty())
+ return PreservedAnalyses::all();
+
+ // Invalidate analyses for modified functions so that we don't have to
+ // invalidate all analyses for all functions in this SCC.
+ PreservedAnalyses FuncPA;
+ // We haven't changed the CFG for modified functions.
+ FuncPA.preserveSet<CFGAnalyses>();
+ for (Function *Changed : ChangedFunctions) {
+ FAM.invalidate(*Changed, FuncPA);
+ // Also invalidate any direct callers of changed functions since analyses
+ // may care about attributes of direct callees. For example, MemorySSA cares
+ // about whether or not a call's callee modifies memory and queries that
+ // through function attributes.
+ for (auto *U : Changed->users()) {
+ if (auto *Call = dyn_cast<CallBase>(U)) {
+ if (Call->getCalledFunction() == Changed)
+ FAM.invalidate(*Call->getFunction(), FuncPA);
+ }
+ }
}
- return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ // We have not added or removed functions.
+ PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
+ // We already invalidated all relevant function analyses above.
+ PA.preserveSet<AllAnalysesOn<Function>>();
+ return PA;
}
namespace {
@@ -1690,7 +1896,7 @@ static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) {
Functions.push_back(I->getFunction());
}
- return deriveAttrsInPostOrder(Functions, AARGetter);
+ return !deriveAttrsInPostOrder(Functions, AARGetter).empty();
}
bool PostOrderFunctionAttrsLegacyPass::runOnSCC(CallGraphSCC &SCC) {
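A minimal sketch (hypothetical code, not part of this patch) of why the run()
method above also invalidates analyses of direct callers: MemorySSA built for
a caller models a call as a clobber until the callee is known not to write
memory, so inferring readnone for the callee makes the caller's cached
MemorySSA stale.

    int callee(int x);        // assume this pass later infers readnone for it

    int caller(int *p) {
      int v = *p;
      int r = callee(v);      // treated as a clobber before the inference,
                              // not a clobber once callee is readnone
      return r + *p;          // the reload of *p may then be forwarded from v
    }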
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 2f6cf0ca7087..d9b43109f629 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -39,6 +39,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
@@ -187,23 +188,6 @@ selectCallee(const ModuleSummaryIndex &Index,
return false;
}
- // For SamplePGO, in computeImportForFunction the OriginalId
- // may have been used to locate the callee summary list (See
- // comment there).
- // The mapping from OriginalId to GUID may return a GUID
- // that corresponds to a static variable. Filter it out here.
- // This can happen when
- // 1) There is a call to a library function which is not defined
- // in the index.
- // 2) There is a static variable with the OriginalGUID identical
- // to the GUID of the library function in 1);
- // When this happens, the logic for SamplePGO kicks in and
- // the static variable in 2) will be found, which needs to be
- // filtered out.
- if (GVSummary->getSummaryKind() == GlobalValueSummary::GlobalVarKind) {
- Reason = FunctionImporter::ImportFailureReason::GlobalVar;
- return false;
- }
if (GlobalValue::isInterposableLinkage(GVSummary->linkage())) {
Reason = FunctionImporter::ImportFailureReason::InterposableLinkage;
// There is no point in importing these, we can't inline them
@@ -264,21 +248,6 @@ using EdgeInfo =
} // anonymous namespace
-static ValueInfo
-updateValueInfoForIndirectCalls(const ModuleSummaryIndex &Index, ValueInfo VI) {
- if (!VI.getSummaryList().empty())
- return VI;
- // For SamplePGO, the indirect call targets for local functions will
- // have its original name annotated in profile. We try to find the
- // corresponding PGOFuncName as the GUID.
- // FIXME: Consider updating the edges in the graph after building
- // it, rather than needing to perform this mapping on each walk.
- auto GUID = Index.getGUIDFromOriginalID(VI.getGUID());
- if (GUID == 0)
- return ValueInfo();
- return Index.getValueInfo(GUID);
-}
-
static bool shouldImportGlobal(const ValueInfo &VI,
const GVSummaryMapTy &DefinedGVSummaries) {
const auto &GVS = DefinedGVSummaries.find(VI.getGUID());
@@ -400,10 +369,6 @@ static void computeImportForFunction(
continue;
}
- VI = updateValueInfoForIndirectCalls(Index, VI);
- if (!VI)
- continue;
-
if (DefinedGVSummaries.count(VI.getGUID())) {
// FIXME: Consider not skipping import if the module contains
// a non-prevailing def with interposable linkage. The prevailing copy
@@ -496,7 +461,7 @@ static void computeImportForFunction(
VI.name().str() + " due to " +
getFailureName(Reason);
auto Error = make_error<StringError>(
- Msg, std::make_error_code(std::errc::operation_not_supported));
+ Msg, make_error_code(errc::not_supported));
logAllUnhandledErrors(std::move(Error), errs(),
"Error importing module: ");
break;
@@ -839,16 +804,61 @@ void llvm::ComputeCrossModuleImportForModuleFromIndex(
#endif
}
-void llvm::computeDeadSymbols(
+// For SamplePGO, the indirect call targets for local functions will
+// have its original name annotated in profile. We try to find the
+// corresponding PGOFuncName as the GUID, and fix up the edges
+// accordingly.
+void updateValueInfoForIndirectCalls(ModuleSummaryIndex &Index,
+ FunctionSummary *FS) {
+ for (auto &EI : FS->mutableCalls()) {
+ if (!EI.first.getSummaryList().empty())
+ continue;
+ auto GUID = Index.getGUIDFromOriginalID(EI.first.getGUID());
+ if (GUID == 0)
+ continue;
+ // Update the edge to point directly to the correct GUID.
+ auto VI = Index.getValueInfo(GUID);
+ if (llvm::any_of(
+ VI.getSummaryList(),
+ [&](const std::unique_ptr<GlobalValueSummary> &SummaryPtr) {
+ // The mapping from OriginalId to GUID may return a GUID
+ // that corresponds to a static variable. Filter it out here.
+ // This can happen when
+ // 1) There is a call to a library function which is not defined
+ // in the index.
+ // 2) There is a static variable with the OriginalGUID identical
+ // to the GUID of the library function in 1);
+ // When this happens the static variable in 2) will be found,
+ // which needs to be filtered out.
+ return SummaryPtr->getSummaryKind() ==
+ GlobalValueSummary::GlobalVarKind;
+ }))
+ continue;
+ EI.first = VI;
+ }
+}
+
+void llvm::updateIndirectCalls(ModuleSummaryIndex &Index) {
+ for (const auto &Entry : Index) {
+ for (auto &S : Entry.second.SummaryList) {
+ if (auto *FS = dyn_cast<FunctionSummary>(S.get()))
+ updateValueInfoForIndirectCalls(Index, FS);
+ }
+ }
+}
+
+void llvm::computeDeadSymbolsAndUpdateIndirectCalls(
ModuleSummaryIndex &Index,
const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing) {
assert(!Index.withGlobalValueDeadStripping());
- if (!ComputeDead)
- return;
- if (GUIDPreservedSymbols.empty())
- // Don't do anything when nothing is live, this is friendly with tests.
+ if (!ComputeDead ||
+ // Don't do anything when nothing is live, this is friendly with tests.
+ GUIDPreservedSymbols.empty()) {
+ // Still need to update indirect calls.
+ updateIndirectCalls(Index);
return;
+ }
unsigned LiveSymbols = 0;
SmallVector<ValueInfo, 128> Worklist;
Worklist.reserve(GUIDPreservedSymbols.size() * 2);
@@ -863,13 +873,16 @@ void llvm::computeDeadSymbols(
// Add values flagged in the index as live roots to the worklist.
for (const auto &Entry : Index) {
auto VI = Index.getValueInfo(Entry);
- for (auto &S : Entry.second.SummaryList)
+ for (auto &S : Entry.second.SummaryList) {
+ if (auto *FS = dyn_cast<FunctionSummary>(S.get()))
+ updateValueInfoForIndirectCalls(Index, FS);
if (S->isLive()) {
LLVM_DEBUG(dbgs() << "Live root: " << VI << "\n");
Worklist.push_back(VI);
++LiveSymbols;
break;
}
+ }
}
// Make value live and add it to the worklist if it was not live before.
@@ -882,9 +895,6 @@ void llvm::computeDeadSymbols(
// binary, which increases the binary size unnecessarily. Note that
// if this code changes, the importer needs to change so that edges
// to functions marked dead are skipped.
- VI = updateValueInfoForIndirectCalls(Index, VI);
- if (!VI)
- return;
if (llvm::any_of(VI.getSummaryList(),
[](const std::unique_ptr<llvm::GlobalValueSummary> &S) {
@@ -958,7 +968,8 @@ void llvm::computeDeadSymbolsWithConstProp(
const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing,
bool ImportEnabled) {
- computeDeadSymbols(Index, GUIDPreservedSymbols, isPrevailing);
+ computeDeadSymbolsAndUpdateIndirectCalls(Index, GUIDPreservedSymbols,
+ isPrevailing);
if (ImportEnabled)
Index.propagateAttributes(GUIDPreservedSymbols);
}
@@ -1040,13 +1051,33 @@ bool llvm::convertToDeclaration(GlobalValue &GV) {
return true;
}
-void llvm::thinLTOResolvePrevailingInModule(
- Module &TheModule, const GVSummaryMapTy &DefinedGlobals) {
- auto updateLinkage = [&](GlobalValue &GV) {
+void llvm::thinLTOFinalizeInModule(Module &TheModule,
+ const GVSummaryMapTy &DefinedGlobals,
+ bool PropagateAttrs) {
+ auto FinalizeInModule = [&](GlobalValue &GV, bool Propagate = false) {
// See if the global summary analysis computed a new resolved linkage.
const auto &GS = DefinedGlobals.find(GV.getGUID());
if (GS == DefinedGlobals.end())
return;
+
+ if (Propagate)
+ if (FunctionSummary *FS = dyn_cast<FunctionSummary>(GS->second)) {
+ if (Function *F = dyn_cast<Function>(&GV)) {
+ // TODO: propagate ReadNone and ReadOnly.
+ if (FS->fflags().ReadNone && !F->doesNotAccessMemory())
+ F->setDoesNotAccessMemory();
+
+ if (FS->fflags().ReadOnly && !F->onlyReadsMemory())
+ F->setOnlyReadsMemory();
+
+ if (FS->fflags().NoRecurse && !F->doesNotRecurse())
+ F->setDoesNotRecurse();
+
+ if (FS->fflags().NoUnwind && !F->doesNotThrow())
+ F->setDoesNotThrow();
+ }
+ }
+
auto NewLinkage = GS->second->linkage();
if (GlobalValue::isLocalLinkage(GV.getLinkage()) ||
// Don't internalize anything here, because the code below
@@ -1105,11 +1136,11 @@ void llvm::thinLTOResolvePrevailingInModule(
// Process functions and global now
for (auto &GV : TheModule)
- updateLinkage(GV);
+ FinalizeInModule(GV, PropagateAttrs);
for (auto &GV : TheModule.globals())
- updateLinkage(GV);
+ FinalizeInModule(GV);
for (auto &GV : TheModule.aliases())
- updateLinkage(GV);
+ FinalizeInModule(GV);
}
/// Run internalization on \p TheModule based on summary analysis.
@@ -1153,7 +1184,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
/// Make alias a clone of its aliasee.
static Function *replaceAliasWithAliasee(Module *SrcModule, GlobalAlias *GA) {
- Function *Fn = cast<Function>(GA->getBaseObject());
+ Function *Fn = cast<Function>(GA->getAliaseeObject());
ValueToValueMapTy VMap;
Function *NewFn = CloneFunction(Fn, VMap);
@@ -1259,12 +1290,12 @@ Expected<bool> FunctionImporter::importFunctions(
if (Error Err = GA.materialize())
return std::move(Err);
// Import alias as a copy of its aliasee.
- GlobalObject *Base = GA.getBaseObject();
- if (Error Err = Base->materialize())
+ GlobalObject *GO = GA.getAliaseeObject();
+ if (Error Err = GO->materialize())
return std::move(Err);
auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA);
- LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << Base->getGUID()
- << " " << Base->getName() << " from "
+ LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << GO->getGUID() << " "
+ << GO->getName() << " from "
<< SrcModule->getSourceFileName() << "\n");
if (EnableImportMetadata) {
// Add 'thinlto_src_module' metadata for statistics and debugging.
@@ -1303,7 +1334,7 @@ Expected<bool> FunctionImporter::importFunctions(
std::move(SrcModule), GlobalsToImport.getArrayRef(),
[](GlobalValue &, IRMover::ValueAdder) {},
/*IsPerformingImport=*/true))
- report_fatal_error("Function Import: link error: " +
+ report_fatal_error(Twine("Function Import: link error: ") +
toString(std::move(Err)));
ImportedCount += GlobalsToImport.size();
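A condensed sketch of the SamplePGO edge fix-up added earlier in this file
(same ModuleSummaryIndex APIs as the patch uses; the GlobalVarKind filter is
omitted here for brevity):

    static void remapEdge(ModuleSummaryIndex &Index, ValueInfo &Callee) {
      if (!Callee.getSummaryList().empty())
        return;                                 // edge is already resolved
      GlobalValue::GUID G = Index.getGUIDFromOriginalID(Callee.getGUID());
      if (G == 0)
        return;                                 // no matching PGOFuncName
      Callee = Index.getValueInfo(G);           // point the edge at the target
    }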
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index f61f4312b777..fbd083bb9bbf 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -11,7 +11,6 @@
// are propagated to the callee by specializing the function.
//
// Current limitations:
-// - It does not handle specialization of recursive functions,
// - It does not yet handle integer ranges.
// - Only 1 argument per function is specialised,
// - The cost-model could be further looked into,
@@ -22,6 +21,18 @@
// a direct way to steer function specialization, avoiding the cost-model,
// and thus control compile-times / code-size.
//
+// Todos:
+// - Specializing recursive functions relies on running the transformation a
+// number of times, which is controlled by option
+//   `func-specialization-max-iters`. Thus, increasing the number of
+//   iterations linearly increases the number of times recursive functions
+//   get specialized; see also the discussion in
+//   https://reviews.llvm.org/D106426 for details. Perhaps there is a more
+//   compile-time-friendly way to control/limit the number of specialisations
+//   for recursive functions.
+// - Don't transform the function if no function specialization happens.
+//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/Statistic.h"
@@ -59,20 +70,166 @@ static cl::opt<unsigned> MaxConstantsThreshold(
"specialization"),
cl::init(3));
+static cl::opt<unsigned> SmallFunctionThreshold(
+ "func-specialization-size-threshold", cl::Hidden,
+ cl::desc("Don't specialize functions that have less than this theshold "
+ "number of instructions"),
+ cl::init(100));
+
static cl::opt<unsigned>
AvgLoopIterationCount("func-specialization-avg-iters-cost", cl::Hidden,
cl::desc("Average loop iteration count cost"),
cl::init(10));
+static cl::opt<bool> SpecializeOnAddresses(
+ "func-specialization-on-address", cl::init(false), cl::Hidden,
+ cl::desc("Enable function specialization on the address of global values"));
+
+// TODO: This needs checking to see the impact on compile-times, which is why
+// this is off by default for now.
static cl::opt<bool> EnableSpecializationForLiteralConstant(
"function-specialization-for-literal-constant", cl::init(false), cl::Hidden,
- cl::desc("Make function specialization available for literal constant."));
+ cl::desc("Enable specialization of functions that take a literal constant "
+ "as an argument."));
+
+// Helper to check if \p LV is either a constant or a constant
+// range with a single element. This should cover exactly the same cases as the
+// old ValueLatticeElement::isConstant() and is intended to be used in the
+// transition to ValueLatticeElement.
+static bool isConstant(const ValueLatticeElement &LV) {
+ return LV.isConstant() ||
+ (LV.isConstantRange() && LV.getConstantRange().isSingleElement());
+}
// Helper to check if \p LV is either overdefined or a constant int.
static bool isOverdefined(const ValueLatticeElement &LV) {
- return !LV.isUnknownOrUndef() && !LV.isConstant();
+ return !LV.isUnknownOrUndef() && !isConstant(LV);
+}
+
+static Constant *getPromotableAlloca(AllocaInst *Alloca, CallInst *Call) {
+ Value *StoreValue = nullptr;
+ for (auto *User : Alloca->users()) {
+ // We can't use llvm::isAllocaPromotable() as that would fail because of
+ // the usage in the CallInst, which is what we check here.
+ if (User == Call)
+ continue;
+ if (auto *Bitcast = dyn_cast<BitCastInst>(User)) {
+ if (!Bitcast->hasOneUse() || *Bitcast->user_begin() != Call)
+ return nullptr;
+ continue;
+ }
+
+ if (auto *Store = dyn_cast<StoreInst>(User)) {
+ // This is a duplicate store, bail out.
+ if (StoreValue || Store->isVolatile())
+ return nullptr;
+ StoreValue = Store->getValueOperand();
+ continue;
+ }
+ // Bail if there is any other unknown usage.
+ return nullptr;
+ }
+ return dyn_cast_or_null<Constant>(StoreValue);
}
+// A constant stack value is an AllocaInst that has a single constant
+// value stored to it. Return this constant if such an alloca stack value
+// is a function argument.
+static Constant *getConstantStackValue(CallInst *Call, Value *Val,
+ SCCPSolver &Solver) {
+ if (!Val)
+ return nullptr;
+ Val = Val->stripPointerCasts();
+ if (auto *ConstVal = dyn_cast<ConstantInt>(Val))
+ return ConstVal;
+ auto *Alloca = dyn_cast<AllocaInst>(Val);
+ if (!Alloca || !Alloca->getAllocatedType()->isIntegerTy())
+ return nullptr;
+ return getPromotableAlloca(Alloca, Call);
+}
+
+// To support specializing recursive functions, it is important to propagate
+// constant arguments because after a first iteration of specialisation, a
+// reduced example may look like this:
+//
+// define internal void @RecursiveFn(i32* arg1) {
+// %temp = alloca i32, align 4
+// store i32 2 i32* %temp, align 4
+// call void @RecursiveFn.1(i32* nonnull %temp)
+// ret void
+// }
+//
+// Before the next iteration, we need to propagate the constant as shown
+// below, which allows further specialization in later iterations.
+//
+// @funcspec.arg = internal constant i32 2
+//
+// define internal void @someFunc(i32* arg1) {
+// call void @otherFunc(i32* nonnull @funcspec.arg)
+// ret void
+// }
+//
+static void constantArgPropagation(SmallVectorImpl<Function *> &WorkList,
+ Module &M, SCCPSolver &Solver) {
+ // Iterate over the argument-tracked functions to see if there are any new
+ // constant values for the call instruction via stack variables.
+ for (auto *F : WorkList) {
+ // TODO: Generalize for any read only arguments.
+ if (F->arg_size() != 1)
+ continue;
+
+ auto &Arg = *F->arg_begin();
+ if (!Arg.onlyReadsMemory() || !Arg.getType()->isPointerTy())
+ continue;
+
+ for (auto *User : F->users()) {
+ auto *Call = dyn_cast<CallInst>(User);
+ if (!Call)
+ break;
+ auto *ArgOp = Call->getArgOperand(0);
+ auto *ArgOpType = ArgOp->getType();
+ auto *ConstVal = getConstantStackValue(Call, ArgOp, Solver);
+ if (!ConstVal)
+ break;
+
+ Value *GV = new GlobalVariable(M, ConstVal->getType(), true,
+ GlobalValue::InternalLinkage, ConstVal,
+ "funcspec.arg");
+
+ if (ArgOpType != ConstVal->getType())
+ GV = ConstantExpr::getBitCast(cast<Constant>(GV), ArgOp->getType());
+
+ Call->setArgOperand(0, GV);
+
+ // Add the changed CallInst to Solver Worklist
+ Solver.visitCall(*Call);
+ }
+ }
+}
+
+// ssa_copy intrinsics are introduced by the SCCP solver. These intrinsics
+// interfere with the constantArgPropagation optimization.
+static void removeSSACopy(Function &F) {
+ for (BasicBlock &BB : F) {
+ for (Instruction &Inst : llvm::make_early_inc_range(BB)) {
+ auto *II = dyn_cast<IntrinsicInst>(&Inst);
+ if (!II)
+ continue;
+ if (II->getIntrinsicID() != Intrinsic::ssa_copy)
+ continue;
+ Inst.replaceAllUsesWith(II->getOperand(0));
+ Inst.eraseFromParent();
+ }
+ }
+}
+
+static void removeSSACopy(Module &M) {
+ for (Function &F : M)
+ removeSSACopy(F);
+}
+
+namespace {
class FunctionSpecializer {
/// The IPSCCP Solver.
@@ -115,9 +272,14 @@ public:
for (auto *SpecializedFunc : CurrentSpecializations) {
SpecializedFuncs.insert(SpecializedFunc);
- // TODO: If we want to support specializing specialized functions,
- // initialize here the state of the newly created functions, marking
- // them argument-tracked and executable.
+ // Initialize the state of the newly created functions, marking them
+ // argument-tracked and executable.
+ if (SpecializedFunc->hasExactDefinition() &&
+ !SpecializedFunc->hasFnAttribute(Attribute::Naked))
+ Solver.addTrackedFunction(SpecializedFunc);
+ Solver.addArgumentTrackedFunction(SpecializedFunc);
+ FuncDecls.push_back(SpecializedFunc);
+ Solver.markBlockExecutable(&SpecializedFunc->front());
// Replace the function arguments for the specialized functions.
for (Argument &Arg : SpecializedFunc->args())
@@ -138,12 +300,22 @@ public:
const ValueLatticeElement &IV = Solver.getLatticeValueFor(V);
if (isOverdefined(IV))
return false;
- auto *Const = IV.isConstant() ? Solver.getConstant(IV)
- : UndefValue::get(V->getType());
+ auto *Const =
+ isConstant(IV) ? Solver.getConstant(IV) : UndefValue::get(V->getType());
V->replaceAllUsesWith(Const);
- // TODO: Update the solver here if we want to specialize specialized
- // functions.
+ for (auto *U : Const->users())
+ if (auto *I = dyn_cast<Instruction>(U))
+ if (Solver.isBlockExecutable(I->getParent()))
+ Solver.visit(I);
+
+ // Remove the instruction from Block and Solver.
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ if (I->isSafeToRemove()) {
+ I->eraseFromParent();
+ Solver.removeLatticeValueFor(I);
+ }
+ }
return true;
}
@@ -152,6 +324,15 @@ private:
// also in the cost model.
unsigned NbFunctionsSpecialized = 0;
+ /// Clone the function \p F and remove the ssa_copy intrinsics added by
+ /// the SCCPSolver in the cloned version.
+ Function *cloneCandidateFunction(Function *F) {
+ ValueToValueMapTy EmptyMap;
+ Function *Clone = CloneFunction(F, EmptyMap);
+ removeSSACopy(*Clone);
+ return Clone;
+ }
+
/// This function decides whether to specialize function \p F based on the
/// known constant values its arguments can take on. Specialization is
/// performed on the first interesting argument. Specializations based on
@@ -162,9 +343,8 @@ private:
SmallVectorImpl<Function *> &Specializations) {
// Do not specialize the cloned function again.
- if (SpecializedFuncs.contains(F)) {
+ if (SpecializedFuncs.contains(F))
return false;
- }
// If we're optimizing the function for size, we shouldn't specialize it.
if (F->hasOptSize() ||
@@ -176,8 +356,25 @@ private:
if (!Solver.isBlockExecutable(&F->getEntryBlock()))
return false;
+ // There is no point specializing a function that will ultimately be inlined.
+ if (F->hasFnAttribute(Attribute::AlwaysInline))
+ return false;
+
LLVM_DEBUG(dbgs() << "FnSpecialization: Try function: " << F->getName()
<< "\n");
+
+ // Determine if it would be profitable to create a specialization of the
+ // function where the argument takes on the given constant value. If so,
+ // add the constant to Constants.
+ auto FnSpecCost = getSpecializationCost(F);
+ if (!FnSpecCost.isValid()) {
+ LLVM_DEBUG(dbgs() << "FnSpecialization: Invalid specialisation cost.\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "FnSpecialization: func specialisation cost: ";
+ FnSpecCost.print(dbgs()); dbgs() << "\n");
+
// Determine if we should specialize the function based on the values the
// argument can take on. If specialization is not profitable, we continue
// on to the next argument.
@@ -195,7 +392,7 @@ private:
// be set to false by isArgumentInteresting (that function only adds
// values to the Constants list that are deemed profitable).
SmallVector<Constant *, 4> Constants;
- if (!isArgumentInteresting(&A, Constants, IsPartial)) {
+ if (!isArgumentInteresting(&A, Constants, FnSpecCost, IsPartial)) {
LLVM_DEBUG(dbgs() << "FnSpecialization: Argument is not interesting\n");
continue;
}
@@ -214,8 +411,7 @@ private:
for (auto *C : Constants) {
// Clone the function. We leave the ValueToValueMap empty to allow
// IPSCCP to propagate the constant arguments.
- ValueToValueMapTy EmptyMap;
- Function *Clone = CloneFunction(F, EmptyMap);
+ Function *Clone = cloneCandidateFunction(F);
Argument *ClonedArg = Clone->arg_begin() + A.getArgNo();
// Rewrite calls to the function so that they call the clone instead.
@@ -231,9 +427,10 @@ private:
NbFunctionsSpecialized++;
}
- // TODO: if we want to support specialize specialized functions, and if
- // the function has been completely specialized, the original function is
- // no longer needed, so we would need to mark it unreachable here.
+ // If the function has been completely specialized, the original function
+ // is no longer needed. Mark it unreachable.
+ if (!IsPartial)
+ Solver.markFunctionUnreachable(F);
// FIXME: Only one argument per function.
return true;
@@ -253,7 +450,11 @@ private:
// If the code metrics reveal that we shouldn't duplicate the function, we
// shouldn't specialize it. Set the specialization cost to Invalid.
- if (Metrics.notDuplicatable) {
+ // Likewise, if the function is small enough that it is likely to be fully
+ // inlined, we shouldn't specialize it.
+ if (Metrics.notDuplicatable ||
+ (!ForceFunctionSpecialization &&
+ Metrics.NumInsts < SmallFunctionThreshold)) {
InstructionCost C{};
C.setInvalid();
return C;
@@ -379,9 +580,8 @@ private:
/// argument.
bool isArgumentInteresting(Argument *A,
SmallVectorImpl<Constant *> &Constants,
+ const InstructionCost &FnSpecCost,
bool &IsPartial) {
- Function *F = A->getParent();
-
// For now, don't attempt to specialize functions based on the values of
// composite types.
if (!A->getType()->isSingleValueType() || A->user_empty())
@@ -420,18 +620,6 @@ private:
return false;
}
- // Determine if it would be profitable to create a specialization of the
- // function where the argument takes on the given constant value. If so,
- // add the constant to Constants.
- auto FnSpecCost = getSpecializationCost(F);
- if (!FnSpecCost.isValid()) {
- LLVM_DEBUG(dbgs() << "FnSpecialization: Invalid specialisation cost.\n");
- return false;
- }
-
- LLVM_DEBUG(dbgs() << "FnSpecialization: func specialisation cost: ";
- FnSpecCost.print(dbgs()); dbgs() << "\n");
-
for (auto *C : PossibleConstants) {
LLVM_DEBUG(dbgs() << "FnSpecialization: Constant: " << *C << "\n");
if (ForceFunctionSpecialization) {
@@ -475,6 +663,12 @@ private:
if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
continue;
auto &CS = *cast<CallBase>(U);
+ // If the call site has attribute minsize set, that callsite won't be
+ // specialized.
+ if (CS.hasFnAttr(Attribute::MinSize)) {
+ AllConstant = false;
+ continue;
+ }
// If the parent of the call site will never be executed, we don't need
// to worry about the passed value.
@@ -482,11 +676,25 @@ private:
continue;
auto *V = CS.getArgOperand(A->getArgNo());
+ if (isa<PoisonValue>(V))
+ return false;
+
+ // For now, constant expressions are fine, but only if they wrap a
+ // function (e.g. a bitcast of a function pointer).
+ if (auto *CE = dyn_cast<ConstantExpr>(V))
+ if (!isa<Function>(CE->getOperand(0)))
+ return false;
+
// TrackValueOfGlobalVariable only tracks scalar global variables.
if (auto *GV = dyn_cast<GlobalVariable>(V)) {
- if (!GV->getValueType()->isSingleValueType()) {
+ // Check if we want to specialize on the address of non-constant
+ // global values.
+ if (!GV->isConstant())
+ if (!SpecializeOnAddresses)
+ return false;
+
+ if (!GV->getValueType()->isSingleValueType())
return false;
- }
}
if (isa<Constant>(V) && (Solver.getLatticeValueFor(V).isConstant() ||
@@ -506,6 +714,9 @@ private:
/// This function modifies calls to function \p F whose argument at index \p
/// ArgNo is equal to constant \p C. The calls are rewritten to call function
/// \p Clone instead.
+ ///
+ /// Callsites that have been marked with the MinSize function attribute won't
+ /// be specialized and rewritten.
void rewriteCallSites(Function *F, Function *Clone, Argument &Arg,
Constant *C) {
unsigned ArgNo = Arg.getArgNo();
@@ -527,24 +738,7 @@ private:
}
}
};
-
-/// Function to clean up the left over intrinsics from SCCP util.
-static void cleanup(Module &M) {
- for (Function &F : M) {
- for (BasicBlock &BB : F) {
- for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) {
- Instruction *Inst = &*BI++;
- if (auto *II = dyn_cast<IntrinsicInst>(Inst)) {
- if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
- Value *Op = II->getOperand(0);
- Inst->replaceAllUsesWith(Op);
- Inst->eraseFromParent();
- }
- }
- }
- }
- }
-}
+} // namespace
bool llvm::runFunctionSpecialization(
Module &M, const DataLayout &DL,
@@ -597,12 +791,27 @@ bool llvm::runFunctionSpecialization(
Solver.trackValueOfGlobalVariable(&G);
}
+ auto &TrackedFuncs = Solver.getArgumentTrackedFunctions();
+ SmallVector<Function *, 16> FuncDecls(TrackedFuncs.begin(),
+ TrackedFuncs.end());
+
+ // No tracked functions, so nothing to do: don't run the solver; just remove
+ // any ssa_copy intrinsics that may have been introduced.
+ if (TrackedFuncs.empty()) {
+ removeSSACopy(M);
+ return false;
+ }
+
// Solve for constants.
auto RunSCCPSolver = [&](auto &WorkList) {
bool ResolvedUndefs = true;
while (ResolvedUndefs) {
+ // The regression test nothing-to-do.ll checks that the solver is not run
+ // unnecessarily, so if this debug message is changed, that regression test
+ // needs updating too.
LLVM_DEBUG(dbgs() << "FnSpecialization: Running solver\n");
+
Solver.solve();
LLVM_DEBUG(dbgs() << "FnSpecialization: Resolving undefs\n");
ResolvedUndefs = false;
@@ -615,15 +824,14 @@ bool llvm::runFunctionSpecialization(
for (BasicBlock &BB : *F) {
if (!Solver.isBlockExecutable(&BB))
continue;
+ // FIXME: The solver may make changes to the function here, so set
+ // Changed, even if later function specialization does not trigger.
for (auto &I : make_early_inc_range(BB))
- FS.tryToReplaceWithConstant(&I);
+ Changed |= FS.tryToReplaceWithConstant(&I);
}
}
};
- auto &TrackedFuncs = Solver.getArgumentTrackedFunctions();
- SmallVector<Function *, 16> FuncDecls(TrackedFuncs.begin(),
- TrackedFuncs.end());
#ifndef NDEBUG
LLVM_DEBUG(dbgs() << "FnSpecialization: Worklist fn decls:\n");
for (auto *F : FuncDecls)
@@ -637,14 +845,18 @@ bool llvm::runFunctionSpecialization(
unsigned I = 0;
while (FuncSpecializationMaxIters != I++ &&
FS.specializeFunctions(FuncDecls, CurrentSpecializations)) {
- // TODO: run the solver here for the specialized functions only if we want
- // to specialize recursively.
+
+ // Run the solver for the specialized functions.
+ RunSCCPSolver(CurrentSpecializations);
+
+ // Replace some unresolved constant arguments.
+ constantArgPropagation(FuncDecls, M, Solver);
CurrentSpecializations.clear();
Changed = true;
}
// Clean up the IR by removing ssa_copy intrinsics.
- cleanup(M);
+ removeSSACopy(M);
return Changed;
}
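A source-level analogue (hypothetical C++, mirroring the IR example in the
comment above constantArgPropagation) of the alloca pattern that
getPromotableAlloca recognizes:

    void recurse(int *p);     // an argument-tracked function

    void start() {
      int arg = 2;            // a single alloca with one constant store
      recurse(&arg);          // the only other use is the call, so the stored
                              // constant can be promoted to an internal global
                              // and the call argument rewritten to point at it
    }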
diff --git a/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/llvm/lib/Transforms/IPO/GlobalDCE.cpp
index fb4cb23b837e..5e5d2086adc2 100644
--- a/llvm/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalDCE.cpp
@@ -88,7 +88,7 @@ ModulePass *llvm::createGlobalDCEPass() {
static bool isEmptyFunction(Function *F) {
BasicBlock &Entry = F->getEntryBlock();
for (auto &I : Entry) {
- if (isa<DbgInfoIntrinsic>(I))
+ if (I.isDebugOrPseudoInst())
continue;
if (auto *RI = dyn_cast<ReturnInst>(&I))
return !RI->getReturnValue();
@@ -210,7 +210,7 @@ void GlobalDCEPass::ScanVTableLoad(Function *Caller, Metadata *TypeId,
Constant *Ptr =
getPointerAtOffset(VTable->getInitializer(), VTableOffset + CallOffset,
- *Caller->getParent());
+ *Caller->getParent(), VTable);
if (!Ptr) {
LLVM_DEBUG(dbgs() << "can't find pointer in vtable!\n");
VFESafeVTables.erase(VTable);
@@ -416,6 +416,16 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) {
// virtual function pointers with null, allowing us to remove the
// function itself.
++NumVFuncs;
+
+ // Detect vfuncs that are referenced as "relative pointers" which are used
+ // in Swift vtables, i.e. entries in the form of:
+ //
+ // i32 trunc (i64 sub (i64 ptrtoint @f, i64 ptrtoint ...)) to i32)
+ //
+ // In this case, replace the whole "sub" expression with constant 0 to
+ // avoid leaving a weird sub(0, symbol) expression behind.
+ replaceRelativePointerUsersWithZero(F);
+
F->replaceNonMetadataUsesWith(ConstantPointerNull::get(F->getType()));
}
EraseUnusedGlobalValue(F);
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 8750eb9ecc4e..b2c2efed7db8 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -208,9 +208,7 @@ CleanupPointerRootUsers(GlobalVariable *GV,
SmallVector<std::pair<Instruction *, Instruction *>, 32> Dead;
// Constants can't be pointers to dynamically allocated memory.
- for (Value::user_iterator UI = GV->user_begin(), E = GV->user_end();
- UI != E;) {
- User *U = *UI++;
+ for (User *U : llvm::make_early_inc_range(GV->users())) {
if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
Value *V = SI->getValueOperand();
if (isa<Constant>(V)) {
@@ -703,8 +701,9 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V,
!ICmpInst::isSigned(cast<ICmpInst>(U)->getPredicate()) &&
isa<LoadInst>(U->getOperand(0)) &&
isa<ConstantPointerNull>(U->getOperand(1))) {
- assert(isa<GlobalValue>(
- cast<LoadInst>(U->getOperand(0))->getPointerOperand()) &&
+ assert(isa<GlobalValue>(cast<LoadInst>(U->getOperand(0))
+ ->getPointerOperand()
+ ->stripPointerCasts()) &&
"Should be GlobalVariable");
// This and only this kind of non-signed ICmpInst is to be replaced with
// the comparing of the value of the created global init bool later in
@@ -720,22 +719,55 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V,
/// Return true if all uses of any loads from GV will trap if the loaded value
/// is null. Note that this also permits comparisons of the loaded value
/// against null, as a special case.
-static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) {
- for (const User *U : GV->users())
- if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
- SmallPtrSet<const PHINode*, 8> PHIs;
- if (!AllUsesOfValueWillTrapIfNull(LI, PHIs))
+static bool allUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) {
+ SmallVector<const Value *, 4> Worklist;
+ Worklist.push_back(GV);
+ while (!Worklist.empty()) {
+ const Value *P = Worklist.pop_back_val();
+ for (auto *U : P->users()) {
+ if (auto *LI = dyn_cast<LoadInst>(U)) {
+ SmallPtrSet<const PHINode *, 8> PHIs;
+ if (!AllUsesOfValueWillTrapIfNull(LI, PHIs))
+ return false;
+ } else if (auto *SI = dyn_cast<StoreInst>(U)) {
+ // Ignore stores to the global.
+ if (SI->getPointerOperand() != P)
+ return false;
+ } else if (auto *CE = dyn_cast<ConstantExpr>(U)) {
+ if (CE->stripPointerCasts() != GV)
+ return false;
+ // Check further the ConstantExpr.
+ Worklist.push_back(CE);
+ } else {
+ // We don't know or understand this user, bail out.
return false;
- } else if (isa<StoreInst>(U)) {
- // Ignore stores to the global.
- } else {
- // We don't know or understand this user, bail out.
- //cerr << "UNKNOWN USER OF GLOBAL!: " << *U;
- return false;
+ }
}
+ }
+
return true;
}
+/// Get all the load and store uses of global variable \p GV.
+static void allUsesOfLoadAndStores(GlobalVariable *GV,
+ SmallVector<Value *, 4> &Uses) {
+ SmallVector<Value *, 4> Worklist;
+ Worklist.push_back(GV);
+ while (!Worklist.empty()) {
+ auto *P = Worklist.pop_back_val();
+ for (auto *U : P->users()) {
+ if (auto *CE = dyn_cast<ConstantExpr>(U)) {
+ Worklist.push_back(CE);
+ continue;
+ }
+
+ assert((isa<LoadInst>(U) || isa<StoreInst>(U)) &&
+ "Expect only load or store instructions");
+ Uses.push_back(U);
+ }
+ }
+}
+
static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
bool Changed = false;
for (auto UI = V->user_begin(), E = V->user_end(); UI != E; ) {
@@ -817,8 +849,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(
bool AllNonStoreUsesGone = true;
// Replace all uses of loads with uses of uses of the stored value.
- for (Value::user_iterator GUI = GV->user_begin(), E = GV->user_end(); GUI != E;){
- User *GlobalUser = *GUI++;
+ for (User *GlobalUser : llvm::make_early_inc_range(GV->users())) {
if (LoadInst *LI = dyn_cast<LoadInst>(GlobalUser)) {
Changed |= OptimizeAwayTrappingUsesOfValue(LI, LV);
// If we were able to delete all uses of the loads
@@ -934,9 +965,8 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
}
}
- Constant *RepValue = NewGV;
- if (NewGV->getType() != GV->getValueType())
- RepValue = ConstantExpr::getBitCast(RepValue, GV->getValueType());
+ SmallPtrSet<Constant *, 1> RepValues;
+ RepValues.insert(NewGV);
// If there is a comparison against null, we will insert a global bool to
// keep track of whether the global was initialized yet or not.
@@ -947,9 +977,11 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
GV->getName()+".init", GV->getThreadLocalMode());
bool InitBoolUsed = false;
- // Loop over all uses of GV, processing them in turn.
- while (!GV->use_empty()) {
- if (StoreInst *SI = dyn_cast<StoreInst>(GV->user_back())) {
+ // Loop over all instruction uses of GV, processing them in turn.
+ SmallVector<Value *, 4> Guses;
+ allUsesOfLoadAndStores(GV, Guses);
+ for (auto *U : Guses) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
// The global is initialized when the store to it occurs. If the stored
// value is null value, the global bool is set to false, otherwise true.
new StoreInst(ConstantInt::getBool(
@@ -961,12 +993,14 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
continue;
}
- LoadInst *LI = cast<LoadInst>(GV->user_back());
+ LoadInst *LI = cast<LoadInst>(U);
while (!LI->use_empty()) {
Use &LoadUse = *LI->use_begin();
ICmpInst *ICI = dyn_cast<ICmpInst>(LoadUse.getUser());
if (!ICI) {
- LoadUse = RepValue;
+ auto *CE = ConstantExpr::getBitCast(NewGV, LI->getType());
+ RepValues.insert(CE);
+ LoadUse.set(CE);
continue;
}
@@ -1012,40 +1046,53 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
// To further other optimizations, loop over all users of NewGV and try to
// constant prop them. This will promote GEP instructions with constant
// indices into GEP constant-exprs, which will allow global-opt to hack on it.
- ConstantPropUsersOf(NewGV, DL, TLI);
- if (RepValue != NewGV)
- ConstantPropUsersOf(RepValue, DL, TLI);
+ for (auto *CE : RepValues)
+ ConstantPropUsersOf(CE, DL, TLI);
return NewGV;
}
-/// Scan the use-list of V checking to make sure that there are no complex uses
-/// of V. We permit simple things like dereferencing the pointer, but not
+/// Scan the use-list of GV checking to make sure that there are no complex uses
+/// of GV. We permit simple things like dereferencing the pointer, but not
/// storing through the address, unless it is to the specified global.
static bool
-valueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V,
+valueIsOnlyUsedLocallyOrStoredToOneGlobal(const CallInst *CI,
const GlobalVariable *GV) {
- for (const User *U : V->users()) {
- const Instruction *Inst = cast<Instruction>(U);
+ SmallPtrSet<const Value *, 4> Visited;
+ SmallVector<const Value *, 4> Worklist;
+ Worklist.push_back(CI);
- if (isa<LoadInst>(Inst) || isa<CmpInst>(Inst)) {
- continue; // Fine, ignore.
- }
+ while (!Worklist.empty()) {
+ const Value *V = Worklist.pop_back_val();
+ if (!Visited.insert(V).second)
+ continue;
- if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- if (SI->getOperand(0) == V && SI->getOperand(1) != GV)
- return false; // Storing the pointer itself... bad.
- continue; // Otherwise, storing through it, or storing into GV... fine.
- }
+ for (const Use &VUse : V->uses()) {
+ const User *U = VUse.getUser();
+ if (isa<LoadInst>(U) || isa<CmpInst>(U))
+ continue; // Fine, ignore.
- if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Inst)) {
- if (!valueIsOnlyUsedLocallyOrStoredToOneGlobal(BCI, GV))
- return false;
- continue;
- }
+ if (auto *SI = dyn_cast<StoreInst>(U)) {
+ if (SI->getValueOperand() == V &&
+ SI->getPointerOperand()->stripPointerCasts() != GV)
+ return false; // Storing the pointer somewhere other than GV... bad.
+ continue; // Otherwise, storing through it, or storing into GV... fine.
+ }
- return false;
+ if (auto *BCI = dyn_cast<BitCastInst>(U)) {
+ Worklist.push_back(BCI);
+ continue;
+ }
+
+ if (auto *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+ Worklist.push_back(GEPI);
+ continue;
+ }
+
+ return false;
+ }
}
+
return true;
}
@@ -1066,12 +1113,12 @@ static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,
// been reached). To do this, we check to see if all uses of the global
// would trap if the global were null: this proves that they must all
// happen after the malloc.
- if (!AllUsesOfLoadedValueWillTrapIfNull(GV))
+ if (!allUsesOfLoadedValueWillTrapIfNull(GV))
return false;
// We can't optimize this if the malloc itself is used in a complex way,
// for example, being stored into multiple globals. This allows the
- // malloc to be stored into the specified global, loaded icmp'd.
+ // malloc to be stored into the specified global, loaded, gep, icmp'd.
// These are all things we could transform to using the global for.
if (!valueIsOnlyUsedLocallyOrStoredToOneGlobal(CI, GV))
return false;
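A source-level analogue (hypothetical C++, not from this patch) of the
single-malloc global these checks guard: every load of G is immediately
dereferenced, so each use would trap if G were still null, which proves the
uses happen after the store of the malloc result.

    #include <cstdlib>

    static int *G;

    void init() { G = static_cast<int *>(std::malloc(16 * sizeof(int))); }

    int get(unsigned i) { return G[i]; }  // the loaded pointer is always
                                          // dereferenced, so it traps if null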
@@ -1112,6 +1159,7 @@ optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
// value was null.
if (GV->getInitializer()->getType()->isPointerTy() &&
GV->getInitializer()->isNullValue() &&
+ StoredOnceVal->getType()->isPointerTy() &&
!NullPointerIsDefined(
nullptr /* F */,
GV->getInitializer()->getType()->getPointerAddressSpace())) {
@@ -1442,8 +1490,7 @@ static void makeAllConstantUsesInstructions(Constant *C) {
append_range(UUsers, U->users());
for (auto *UU : UUsers) {
Instruction *UI = cast<Instruction>(UU);
- Instruction *NewU = U->getAsInstruction();
- NewU->insertBefore(UI);
+ Instruction *NewU = U->getAsInstruction(UI);
UI->replaceUsesOfWith(U, NewU);
}
// We've replaced all the uses, so destroy the constant. (destroyConstant
@@ -1456,6 +1503,7 @@ static void makeAllConstantUsesInstructions(Constant *C) {
/// it if possible. If we make a change, return true.
static bool
processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS,
+ function_ref<TargetTransformInfo &(Function &)> GetTTI,
function_ref<TargetLibraryInfo &(Function &)> GetTLI,
function_ref<DominatorTree &(Function &)> LookupDomTree) {
auto &DL = GV->getParent()->getDataLayout();
@@ -1554,43 +1602,57 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS,
if (SRAGlobal(GV, DL))
return true;
}
- if (GS.StoredType == GlobalStatus::StoredOnce && GS.StoredOnceValue) {
+ Value *StoredOnceValue = GS.getStoredOnceValue();
+ if (GS.StoredType == GlobalStatus::StoredOnce && StoredOnceValue) {
+ // Avoid speculating constant expressions that might trap (div/rem).
+ auto *SOVConstant = dyn_cast<Constant>(StoredOnceValue);
+ if (SOVConstant && SOVConstant->canTrap())
+ return Changed;
+
+ Function &StoreFn =
+ const_cast<Function &>(*GS.StoredOnceStore->getFunction());
+ bool CanHaveNonUndefGlobalInitializer =
+ GetTTI(StoreFn).canHaveNonUndefGlobalInitializerInAddressSpace(
+ GV->getType()->getAddressSpace());
// If the initial value for the global was an undef value, and if only
// one other value was stored into it, we can just change the
// initializer to be the stored value, then delete all stores to the
// global. This allows us to mark it constant.
- if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
- if (isa<UndefValue>(GV->getInitializer())) {
- // Change the initial value here.
- GV->setInitializer(SOVConstant);
-
- // Clean up any obviously simplifiable users now.
- CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI);
-
- if (GV->use_empty()) {
- LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to "
- << "simplify all users and delete global!\n");
- GV->eraseFromParent();
- ++NumDeleted;
- }
- ++NumSubstitute;
- return true;
+ // This is restricted to address spaces that allow globals to have
+ // initializers. NVPTX, for example, does not support initializers for
+ // shared memory (AS 3).
+ if (SOVConstant && SOVConstant->getType() == GV->getValueType() &&
+ isa<UndefValue>(GV->getInitializer()) &&
+ CanHaveNonUndefGlobalInitializer) {
+ // Change the initial value here.
+ GV->setInitializer(SOVConstant);
+
+ // Clean up any obviously simplifiable users now.
+ CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI);
+
+ if (GV->use_empty()) {
+ LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to "
+ << "simplify all users and delete global!\n");
+ GV->eraseFromParent();
+ ++NumDeleted;
}
+ ++NumSubstitute;
+ return true;
+ }
// Try to optimize globals based on the knowledge that only one value
// (besides its initializer) is ever stored to the global.
- if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL,
- GetTLI))
+ if (optimizeOnceStoredGlobal(GV, StoredOnceValue, GS.Ordering, DL, GetTLI))
return true;
// Otherwise, if the global was not a boolean, we can shrink it to be a
- // boolean.
- if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) {
- if (GS.Ordering == AtomicOrdering::NotAtomic) {
- if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
- ++NumShrunkToBool;
- return true;
- }
+ // boolean. Skip this optimization for AS that doesn't allow an initializer.
+ if (SOVConstant && GS.Ordering == AtomicOrdering::NotAtomic &&
+ (!isa<UndefValue>(GV->getInitializer()) ||
+ CanHaveNonUndefGlobalInitializer)) {
+ if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
+ ++NumShrunkToBool;
+ return true;
}
}
}
@@ -1602,6 +1664,7 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS,
/// make a change, return true.
static bool
processGlobal(GlobalValue &GV,
+ function_ref<TargetTransformInfo &(Function &)> GetTTI,
function_ref<TargetLibraryInfo &(Function &)> GetTLI,
function_ref<DominatorTree &(Function &)> LookupDomTree) {
if (GV.getName().startswith("llvm."))
@@ -1634,7 +1697,8 @@ processGlobal(GlobalValue &GV,
if (GVar->isConstant() || !GVar->hasInitializer())
return Changed;
- return processInternalGlobal(GVar, GS, GetTLI, LookupDomTree) || Changed;
+ return processInternalGlobal(GVar, GS, GetTTI, GetTLI, LookupDomTree) ||
+ Changed;
}
/// Walk all of the direct calls of the specified function, changing them to
@@ -1651,7 +1715,7 @@ static AttributeList StripAttr(LLVMContext &C, AttributeList Attrs,
Attribute::AttrKind A) {
unsigned AttrIndex;
if (Attrs.hasAttrSomewhere(A, &AttrIndex))
- return Attrs.removeAttribute(C, AttrIndex, A);
+ return Attrs.removeAttributeAtIndex(C, AttrIndex, A);
return Attrs;
}
@@ -1864,10 +1928,8 @@ static void RemovePreallocated(Function *F) {
Value *AllocaReplacement = ArgAllocas[AllocArgIndex];
if (!AllocaReplacement) {
auto AddressSpace = UseCall->getType()->getPointerAddressSpace();
- auto *ArgType = UseCall
- ->getAttribute(AttributeList::FunctionIndex,
- Attribute::Preallocated)
- .getValueAsType();
+ auto *ArgType =
+ UseCall->getFnAttr(Attribute::Preallocated).getValueAsType();
auto *InsertBefore = PreallocatedSetup->getNextNonDebugInstruction();
Builder.SetInsertPoint(InsertBefore);
auto *Alloca =
@@ -1897,26 +1959,22 @@ OptimizeFunctions(Module &M,
bool Changed = false;
std::vector<Function *> AllCallsCold;
- for (Module::iterator FI = M.begin(), E = M.end(); FI != E;) {
- Function *F = &*FI++;
- if (hasOnlyColdCalls(*F, GetBFI))
- AllCallsCold.push_back(F);
- }
+ for (Function &F : llvm::make_early_inc_range(M))
+ if (hasOnlyColdCalls(F, GetBFI))
+ AllCallsCold.push_back(&F);
// Optimize functions.
- for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) {
- Function *F = &*FI++;
-
+ for (Function &F : llvm::make_early_inc_range(M)) {
// Don't perform global opt pass on naked functions; we don't want fast
// calling conventions for naked functions.
- if (F->hasFnAttribute(Attribute::Naked))
+ if (F.hasFnAttribute(Attribute::Naked))
continue;
// Functions without names cannot be referenced outside this module.
- if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage())
- F->setLinkage(GlobalValue::InternalLinkage);
+ if (!F.hasName() && !F.isDeclaration() && !F.hasLocalLinkage())
+ F.setLinkage(GlobalValue::InternalLinkage);
- if (deleteIfDead(*F, NotDiscardableComdats)) {
+ if (deleteIfDead(F, NotDiscardableComdats)) {
Changed = true;
continue;
}
@@ -1931,17 +1989,17 @@ OptimizeFunctions(Module &M,
// some more complicated logic to break these cycles.
// Removing unreachable blocks might invalidate the dominator so we
// recalculate it.
- if (!F->isDeclaration()) {
- if (removeUnreachableBlocks(*F)) {
- auto &DT = LookupDomTree(*F);
- DT.recalculate(*F);
+ if (!F.isDeclaration()) {
+ if (removeUnreachableBlocks(F)) {
+ auto &DT = LookupDomTree(F);
+ DT.recalculate(F);
Changed = true;
}
}
- Changed |= processGlobal(*F, GetTLI, LookupDomTree);
+ Changed |= processGlobal(F, GetTTI, GetTLI, LookupDomTree);
- if (!F->hasLocalLinkage())
+ if (!F.hasLocalLinkage())
continue;
// If we have an inalloca parameter that we can safely remove the
@@ -1949,56 +2007,55 @@ OptimizeFunctions(Module &M,
// wouldn't be safe in the presence of inalloca.
// FIXME: We should also hoist alloca affected by this to the entry
// block if possible.
- if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca) &&
- !F->hasAddressTaken() && !hasMustTailCallers(F)) {
- RemoveAttribute(F, Attribute::InAlloca);
+ if (F.getAttributes().hasAttrSomewhere(Attribute::InAlloca) &&
+ !F.hasAddressTaken() && !hasMustTailCallers(&F)) {
+ RemoveAttribute(&F, Attribute::InAlloca);
Changed = true;
}
// FIXME: handle invokes
// FIXME: handle musttail
- if (F->getAttributes().hasAttrSomewhere(Attribute::Preallocated)) {
- if (!F->hasAddressTaken() && !hasMustTailCallers(F) &&
- !hasInvokeCallers(F)) {
- RemovePreallocated(F);
+ if (F.getAttributes().hasAttrSomewhere(Attribute::Preallocated)) {
+ if (!F.hasAddressTaken() && !hasMustTailCallers(&F) &&
+ !hasInvokeCallers(&F)) {
+ RemovePreallocated(&F);
Changed = true;
}
continue;
}
- if (hasChangeableCC(F) && !F->isVarArg() && !F->hasAddressTaken()) {
+ if (hasChangeableCC(&F) && !F.isVarArg() && !F.hasAddressTaken()) {
NumInternalFunc++;
- TargetTransformInfo &TTI = GetTTI(*F);
+ TargetTransformInfo &TTI = GetTTI(F);
// Change the calling convention to coldcc if either stress testing is
// enabled or the target would like to use coldcc on functions which are
// cold at all call sites and the callers contain no other non coldcc
// calls.
if (EnableColdCCStressTest ||
- (TTI.useColdCCForColdCall(*F) &&
- isValidCandidateForColdCC(*F, GetBFI, AllCallsCold))) {
- F->setCallingConv(CallingConv::Cold);
- changeCallSitesToColdCC(F);
+ (TTI.useColdCCForColdCall(F) &&
+ isValidCandidateForColdCC(F, GetBFI, AllCallsCold))) {
+ F.setCallingConv(CallingConv::Cold);
+ changeCallSitesToColdCC(&F);
Changed = true;
NumColdCC++;
}
}
- if (hasChangeableCC(F) && !F->isVarArg() &&
- !F->hasAddressTaken()) {
+ if (hasChangeableCC(&F) && !F.isVarArg() && !F.hasAddressTaken()) {
// If this function has a calling convention worth changing, is not a
// varargs function, and is only called directly, promote it to use the
// Fast calling convention.
- F->setCallingConv(CallingConv::Fast);
- ChangeCalleesToFastCall(F);
+ F.setCallingConv(CallingConv::Fast);
+ ChangeCalleesToFastCall(&F);
++NumFastCallFns;
Changed = true;
}
- if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) &&
- !F->hasAddressTaken()) {
+ if (F.getAttributes().hasAttrSomewhere(Attribute::Nest) &&
+ !F.hasAddressTaken()) {
// The function is not used by a trampoline intrinsic, so it is safe
// to remove the 'nest' attribute.
- RemoveAttribute(F, Attribute::Nest);
+ RemoveAttribute(&F, Attribute::Nest);
++NumNestRemoved;
Changed = true;
}
@@ -2008,35 +2065,34 @@ OptimizeFunctions(Module &M,
static bool
OptimizeGlobalVars(Module &M,
+ function_ref<TargetTransformInfo &(Function &)> GetTTI,
function_ref<TargetLibraryInfo &(Function &)> GetTLI,
function_ref<DominatorTree &(Function &)> LookupDomTree,
SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
bool Changed = false;
- for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
- GVI != E; ) {
- GlobalVariable *GV = &*GVI++;
+ for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) {
// Global variables without names cannot be referenced outside this module.
- if (!GV->hasName() && !GV->isDeclaration() && !GV->hasLocalLinkage())
- GV->setLinkage(GlobalValue::InternalLinkage);
+ if (!GV.hasName() && !GV.isDeclaration() && !GV.hasLocalLinkage())
+ GV.setLinkage(GlobalValue::InternalLinkage);
// Simplify the initializer.
- if (GV->hasInitializer())
- if (auto *C = dyn_cast<Constant>(GV->getInitializer())) {
+ if (GV.hasInitializer())
+ if (auto *C = dyn_cast<Constant>(GV.getInitializer())) {
auto &DL = M.getDataLayout();
// TLI is not used in the case of a Constant, so use default nullptr
// for that optional parameter, since we don't have a Function to
// provide GetTLI anyway.
Constant *New = ConstantFoldConstant(C, DL, /*TLI*/ nullptr);
if (New != C)
- GV->setInitializer(New);
+ GV.setInitializer(New);
}
- if (deleteIfDead(*GV, NotDiscardableComdats)) {
+ if (deleteIfDead(GV, NotDiscardableComdats)) {
Changed = true;
continue;
}
- Changed |= processGlobal(*GV, GetTLI, LookupDomTree);
+ Changed |= processGlobal(GV, GetTTI, GetTLI, LookupDomTree);
}
return Changed;
}
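
The loops above are the core mechanical change in this file: manual `GVI != E; GVI++`-style bookkeeping is replaced with llvm::make_early_inc_range, which advances the underlying iterator before the loop body runs, so the current entry can be erased from the module's symbol-table list without invalidating the traversal. A minimal standalone sketch of the idiom (not part of the patch; the helper name and the filtering condition are made up for illustration):

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Hypothetical helper: erase unused declarations while iterating the module.
// make_early_inc_range pre-increments the iterator, so eraseFromParent() on
// the current Function does not break the loop.
static bool dropUnusedDeclarations(Module &M) {
  bool Changed = false;
  for (Function &F : make_early_inc_range(M)) {
    if (F.isDeclaration() && F.use_empty()) {
      F.eraseFromParent();
      Changed = true;
    }
  }
  return Changed;
}

The same idiom appears below for M.globals(), M.aliases(), the users of __cxa_atexit, and the block and instruction traversals in GlobalSplit.cpp and IROutliner.cpp.
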
@@ -2425,24 +2481,21 @@ OptimizeGlobalAliases(Module &M,
for (GlobalValue *GV : Used.used())
Used.compilerUsedErase(GV);
- for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
- I != E;) {
- GlobalAlias *J = &*I++;
-
+ for (GlobalAlias &J : llvm::make_early_inc_range(M.aliases())) {
// Aliases without names cannot be referenced outside this module.
- if (!J->hasName() && !J->isDeclaration() && !J->hasLocalLinkage())
- J->setLinkage(GlobalValue::InternalLinkage);
+ if (!J.hasName() && !J.isDeclaration() && !J.hasLocalLinkage())
+ J.setLinkage(GlobalValue::InternalLinkage);
- if (deleteIfDead(*J, NotDiscardableComdats)) {
+ if (deleteIfDead(J, NotDiscardableComdats)) {
Changed = true;
continue;
}
// If the alias can change at link time, nothing can be done - bail out.
- if (J->isInterposable())
+ if (J.isInterposable())
continue;
- Constant *Aliasee = J->getAliasee();
+ Constant *Aliasee = J.getAliasee();
GlobalValue *Target = dyn_cast<GlobalValue>(Aliasee->stripPointerCasts());
// We can't trivially replace the alias with the aliasee if the aliasee is
// non-trivial in some way. We also can't replace the alias with the aliasee
@@ -2455,31 +2508,31 @@ OptimizeGlobalAliases(Module &M,
// Make all users of the alias use the aliasee instead.
bool RenameTarget;
- if (!hasUsesToReplace(*J, Used, RenameTarget))
+ if (!hasUsesToReplace(J, Used, RenameTarget))
continue;
- J->replaceAllUsesWith(ConstantExpr::getBitCast(Aliasee, J->getType()));
+ J.replaceAllUsesWith(ConstantExpr::getBitCast(Aliasee, J.getType()));
++NumAliasesResolved;
Changed = true;
if (RenameTarget) {
// Give the aliasee the name, linkage and other attributes of the alias.
- Target->takeName(&*J);
- Target->setLinkage(J->getLinkage());
- Target->setDSOLocal(J->isDSOLocal());
- Target->setVisibility(J->getVisibility());
- Target->setDLLStorageClass(J->getDLLStorageClass());
+ Target->takeName(&J);
+ Target->setLinkage(J.getLinkage());
+ Target->setDSOLocal(J.isDSOLocal());
+ Target->setVisibility(J.getVisibility());
+ Target->setDLLStorageClass(J.getDLLStorageClass());
- if (Used.usedErase(&*J))
+ if (Used.usedErase(&J))
Used.usedInsert(Target);
- if (Used.compilerUsedErase(&*J))
+ if (Used.compilerUsedErase(&J))
Used.compilerUsedInsert(Target);
- } else if (mayHaveOtherReferences(*J, Used))
+ } else if (mayHaveOtherReferences(J, Used))
continue;
// Delete the alias.
- M.getAliasList().erase(J);
+ M.getAliasList().erase(&J);
++NumAliasesRemoved;
Changed = true;
}
@@ -2526,7 +2579,7 @@ static bool cxxDtorIsEmpty(const Function &Fn) {
return false;
for (auto &I : Fn.getEntryBlock()) {
- if (isa<DbgInfoIntrinsic>(I))
+ if (I.isDebugOrPseudoInst())
continue;
if (isa<ReturnInst>(I))
return true;
@@ -2552,12 +2605,11 @@ static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
// and remove them.
bool Changed = false;
- for (auto I = CXAAtExitFn->user_begin(), E = CXAAtExitFn->user_end();
- I != E;) {
+ for (User *U : llvm::make_early_inc_range(CXAAtExitFn->users())) {
// We're only interested in calls. Theoretically, we could handle invoke
// instructions as well, but neither llvm-gcc nor clang generate invokes
// to __cxa_atexit.
- CallInst *CI = dyn_cast<CallInst>(*I++);
+ CallInst *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
@@ -2614,8 +2666,8 @@ static bool optimizeGlobalsInModule(
});
// Optimize non-address-taken globals.
- LocalChange |=
- OptimizeGlobalVars(M, GetTLI, LookupDomTree, NotDiscardableComdats);
+ LocalChange |= OptimizeGlobalVars(M, GetTTI, GetTLI, LookupDomTree,
+ NotDiscardableComdats);
// Resolve aliases, when possible.
LocalChange |= OptimizeGlobalAliases(M, NotDiscardableComdats);
diff --git a/llvm/lib/Transforms/IPO/GlobalSplit.cpp b/llvm/lib/Transforms/IPO/GlobalSplit.cpp
index 365b269dc3bf..e7d698c42fcf 100644
--- a/llvm/lib/Transforms/IPO/GlobalSplit.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalSplit.cpp
@@ -154,11 +154,8 @@ static bool splitGlobals(Module &M) {
return false;
bool Changed = false;
- for (auto I = M.global_begin(); I != M.global_end();) {
- GlobalVariable &GV = *I;
- ++I;
+ for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals()))
Changed |= splitGlobal(GV);
- }
return Changed;
}
diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index adf9ffba5780..b8a314c54f18 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -18,6 +18,7 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/PassManager.h"
#include "llvm/InitializePasses.h"
@@ -33,6 +34,10 @@
using namespace llvm;
using namespace IRSimilarity;
+// A command-line flag used for debugging to exclude branches from similarity
+// matching and outlining.
+extern cl::opt<bool> DisableBranches;
+
// Set to true if the user wants the ir outliner to run on linkonceodr linkage
// functions. This is false by default because the linker can dedupe linkonceodr
// functions. Since the outliner is confined to a single module (modulo LTO),
@@ -71,8 +76,12 @@ struct OutlinableGroup {
/// for extraction.
bool IgnoreGroup = false;
- /// The return block for the overall function.
- BasicBlock *EndBB = nullptr;
+ /// The return blocks for the overall function.
+ DenseMap<Value *, BasicBlock *> EndBBs;
+
+ /// The PHI blocks for the overall function, keyed by the return value of
+ /// their corresponding return block.
+ DenseMap<Value *, BasicBlock *> PHIBlocks;
/// A set containing the different GVN store sets needed. Each array contains
/// a sorted list of the different values that need to be stored into output
@@ -87,6 +96,14 @@ struct OutlinableGroup {
/// index in ArgumentTypes is an output argument.
unsigned NumAggregateInputs = 0;
+ /// The mapping of the canonical numbering of the values in outlined sections
+ /// to specific arguments.
+ DenseMap<unsigned, unsigned> CanonicalNumberToAggArg;
+
+ /// The number of branches in the region that target a basic block outside
+ /// of the region.
+ unsigned BranchesToOutside = 0;
+
/// The number of instructions that will be outlined by extracting \ref
/// Regions.
InstructionCost Benefit = 0;
@@ -118,20 +135,67 @@ struct OutlinableGroup {
/// \param SourceBB - the BasicBlock to pull Instructions from.
/// \param TargetBB - the BasicBlock to put Instructions into.
static void moveBBContents(BasicBlock &SourceBB, BasicBlock &TargetBB) {
- BasicBlock::iterator BBCurr, BBEnd, BBNext;
- for (BBCurr = SourceBB.begin(), BBEnd = SourceBB.end(); BBCurr != BBEnd;
- BBCurr = BBNext) {
- BBNext = std::next(BBCurr);
- BBCurr->moveBefore(TargetBB, TargetBB.end());
- }
+ for (Instruction &I : llvm::make_early_inc_range(SourceBB))
+ I.moveBefore(TargetBB, TargetBB.end());
+}
+
+/// A function to sort the keys of \p Map, which must be a mapping of constant
+/// values to basic blocks, and return them in \p SortedKeys.
+///
+/// \param SortedKeys - The vector in which the sorted keys will be returned.
+/// \param Map - The DenseMap containing keys to sort.
+static void getSortedConstantKeys(std::vector<Value *> &SortedKeys,
+ DenseMap<Value *, BasicBlock *> &Map) {
+ for (auto &VtoBB : Map)
+ SortedKeys.push_back(VtoBB.first);
+
+ stable_sort(SortedKeys, [](const Value *LHS, const Value *RHS) {
+ const ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS);
+ const ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS);
+ assert(RHSC && "Not a constant integer in return value?");
+ assert(LHSC && "Not a constant integer in return value?");
+
+ return LHSC->getLimitedValue() < RHSC->getLimitedValue();
+ });
+}
+
+Value *OutlinableRegion::findCorrespondingValueIn(const OutlinableRegion &Other,
+ Value *V) {
+ Optional<unsigned> GVN = Candidate->getGVN(V);
+ assert(GVN.hasValue() && "No GVN for incoming value");
+ Optional<unsigned> CanonNum = Candidate->getCanonicalNum(*GVN);
+ Optional<unsigned> FirstGVN = Other.Candidate->fromCanonicalNum(*CanonNum);
+ Optional<Value *> FoundValueOpt = Other.Candidate->fromGVN(*FirstGVN);
+ return FoundValueOpt.getValueOr(nullptr);
}
void OutlinableRegion::splitCandidate() {
assert(!CandidateSplit && "Candidate already split!");
+ Instruction *BackInst = Candidate->backInstruction();
+
+ Instruction *EndInst = nullptr;
+ // Check whether the last instruction is a terminator; if it is, we do
+ // not split on the following instruction and leave the block as it is. We
+ // also check that the last instruction is not in the final basic block of
+ // the function, otherwise the check for whether the current following
+ // instruction matches the previously recorded instruction will be incorrect.
+ if (!BackInst->isTerminator() ||
+ BackInst->getParent() != &BackInst->getFunction()->back()) {
+ EndInst = Candidate->end()->Inst;
+ assert(EndInst && "Expected an end instruction?");
+ }
+
+ // We check if the current instruction following the last instruction in the
+ // region is the same as the recorded instruction following the last
+ // instruction. If they do not match, there could be problems in rewriting
+ // the program after outlining, so we ignore it.
+ if (!BackInst->isTerminator() &&
+ EndInst != BackInst->getNextNonDebugInstruction())
+ return;
+
Instruction *StartInst = (*Candidate->begin()).Inst;
- Instruction *EndInst = (*Candidate->end()).Inst;
- assert(StartInst && EndInst && "Expected a start and end instruction?");
+ assert(StartInst && "Expected a start instruction?");
StartBB = StartInst->getParent();
PrevBB = StartBB;
@@ -153,13 +217,20 @@ void OutlinableRegion::splitCandidate() {
std::string OriginalName = PrevBB->getName().str();
StartBB = PrevBB->splitBasicBlock(StartInst, OriginalName + "_to_outline");
-
- // This is the case for the inner block since we do not have to include
- // multiple blocks.
- EndBB = StartBB;
- FollowBB = EndBB->splitBasicBlock(EndInst, OriginalName + "_after_outline");
+ PrevBB->replaceSuccessorsPhiUsesWith(PrevBB, StartBB);
CandidateSplit = true;
+ if (!BackInst->isTerminator()) {
+ EndBB = EndInst->getParent();
+ FollowBB = EndBB->splitBasicBlock(EndInst, OriginalName + "_after_outline");
+ EndBB->replaceSuccessorsPhiUsesWith(EndBB, FollowBB);
+ FollowBB->replaceSuccessorsPhiUsesWith(PrevBB, FollowBB);
+ return;
+ }
+
+ EndBB = BackInst->getParent();
+ EndsInBranch = true;
+ FollowBB = nullptr;
}
void OutlinableRegion::reattachCandidate() {
@@ -180,7 +251,6 @@ void OutlinableRegion::reattachCandidate() {
// inst3
// inst4
assert(StartBB != nullptr && "StartBB for Candidate is not defined!");
- assert(FollowBB != nullptr && "StartBB for Candidate is not defined!");
// StartBB should only have one predecessor since we put an unconditional
// branch at the end of PrevBB when we split the BasicBlock.
@@ -189,21 +259,24 @@ void OutlinableRegion::reattachCandidate() {
"No Predecessor for the region start basic block!");
assert(PrevBB->getTerminator() && "Terminator removed from PrevBB!");
- assert(EndBB->getTerminator() && "Terminator removed from EndBB!");
PrevBB->getTerminator()->eraseFromParent();
- EndBB->getTerminator()->eraseFromParent();
moveBBContents(*StartBB, *PrevBB);
BasicBlock *PlacementBB = PrevBB;
if (StartBB != EndBB)
PlacementBB = EndBB;
- moveBBContents(*FollowBB, *PlacementBB);
+ if (!EndsInBranch && PlacementBB->getUniqueSuccessor() != nullptr) {
+ assert(FollowBB != nullptr && "FollowBB for Candidate is not defined!");
+ assert(PlacementBB->getTerminator() && "Terminator removed from EndBB!");
+ PlacementBB->getTerminator()->eraseFromParent();
+ moveBBContents(*FollowBB, *PlacementBB);
+ PlacementBB->replaceSuccessorsPhiUsesWith(FollowBB, PlacementBB);
+ FollowBB->eraseFromParent();
+ }
PrevBB->replaceSuccessorsPhiUsesWith(StartBB, PrevBB);
- PrevBB->replaceSuccessorsPhiUsesWith(FollowBB, PlacementBB);
StartBB->eraseFromParent();
- FollowBB->eraseFromParent();
// Make sure to save changes back to the StartBB.
StartBB = PrevBB;
@@ -261,8 +334,9 @@ InstructionCost OutlinableRegion::getBenefit(TargetTransformInfo &TTI) {
// division instruction for targets that have a native division instruction.
// To be overly conservative, we only add 1 to the number of instructions for
// each division instruction.
- for (Instruction &I : *StartBB) {
- switch (I.getOpcode()) {
+ for (IRInstructionData &ID : *Candidate) {
+ Instruction *I = ID.Inst;
+ switch (I->getOpcode()) {
case Instruction::FDiv:
case Instruction::FRem:
case Instruction::SDiv:
@@ -272,7 +346,7 @@ InstructionCost OutlinableRegion::getBenefit(TargetTransformInfo &TTI) {
Benefit += 1;
break;
default:
- Benefit += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
+ Benefit += TTI.getInstructionCost(I, TargetTransformInfo::TCK_CodeSize);
break;
}
}
@@ -373,8 +447,24 @@ Function *IROutliner::createFunction(Module &M, OutlinableGroup &Group,
unsigned FunctionNameSuffix) {
assert(!Group.OutlinedFunction && "Function is already defined!");
+ Type *RetTy = Type::getVoidTy(M.getContext());
+ // All extracted functions _should_ have the same return type at this point
+ // since the similarity identifier ensures that all branches outside of the
+ // region occur in the same place.
+
+ // NOTE: Should we ever move to the model that uses a switch at every point
+ // needed, meaning that we could branch within the region or out, it is
+ // possible that we will need to switch to using the most general case all of
+ // the time.
+ for (OutlinableRegion *R : Group.Regions) {
+ Type *ExtractedFuncType = R->ExtractedFunction->getReturnType();
+ if ((RetTy->isVoidTy() && !ExtractedFuncType->isVoidTy()) ||
+ (RetTy->isIntegerTy(1) && ExtractedFuncType->isIntegerTy(16)))
+ RetTy = ExtractedFuncType;
+ }
+
Group.OutlinedFunctionType = FunctionType::get(
- Type::getVoidTy(M.getContext()), Group.ArgumentTypes, false);
+ RetTy, Group.ArgumentTypes, false);
// These functions will only be called from within the same module, so
// we can set an internal linkage.
@@ -430,21 +520,23 @@ Function *IROutliner::createFunction(Module &M, OutlinableGroup &Group,
///
/// \param [in] Old - The function to move the basic blocks from.
/// \param [in] New - The function to move the basic blocks to.
-/// \returns the first return block for the function in New.
-static BasicBlock *moveFunctionData(Function &Old, Function &New) {
- Function::iterator CurrBB, NextBB, FinalBB;
- BasicBlock *NewEnd = nullptr;
- std::vector<Instruction *> DebugInsts;
- for (CurrBB = Old.begin(), FinalBB = Old.end(); CurrBB != FinalBB;
- CurrBB = NextBB) {
- NextBB = std::next(CurrBB);
- CurrBB->removeFromParent();
- CurrBB->insertInto(&New);
- Instruction *I = CurrBB->getTerminator();
- if (isa<ReturnInst>(I))
- NewEnd = &(*CurrBB);
-
- for (Instruction &Val : *CurrBB) {
+/// \param [out] NewEnds - The return blocks of the new overall function.
+static void moveFunctionData(Function &Old, Function &New,
+ DenseMap<Value *, BasicBlock *> &NewEnds) {
+ for (BasicBlock &CurrBB : llvm::make_early_inc_range(Old)) {
+ CurrBB.removeFromParent();
+ CurrBB.insertInto(&New);
+ Instruction *I = CurrBB.getTerminator();
+
+ // Each block in which we find a return instruction is a potential exit
+ // path for the function. We keep track of each of these blocks here, keyed
+ // by the return value.
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(I))
+ NewEnds.insert(std::make_pair(RI->getReturnValue(), &CurrBB));
+
+ std::vector<Instruction *> DebugInsts;
+
+ for (Instruction &Val : CurrBB) {
// We must handle the scoping of called functions differently than
// other outlined instructions.
if (!isa<CallInst>(&Val)) {
@@ -476,8 +568,7 @@ static BasicBlock *moveFunctionData(Function &Old, Function &New) {
I->eraseFromParent();
}
- assert(NewEnd && "No return instruction for new function?");
- return NewEnd;
+ assert(NewEnds.size() > 0 && "No return instruction for new function?");
}
/// Find the constants that will need to be lifted into arguments
@@ -664,11 +755,22 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region,
// function to account for the extracted constants, we have two different
// counters as we find extracted arguments, and as we come across overall
// arguments.
+
+ // Additionally, in our first pass, for the first extracted function,
+ // we find argument locations for the canonical value numbering. This
+ // numbering overrides any discovered location for the extracted code.
for (unsigned InputVal : InputGVNs) {
+ Optional<unsigned> CanonicalNumberOpt = C.getCanonicalNum(InputVal);
+ assert(CanonicalNumberOpt.hasValue() && "Canonical number not found?");
+ unsigned CanonicalNumber = CanonicalNumberOpt.getValue();
+
Optional<Value *> InputOpt = C.fromGVN(InputVal);
assert(InputOpt.hasValue() && "Global value number not found?");
Value *Input = InputOpt.getValue();
+ DenseMap<unsigned, unsigned>::iterator AggArgIt =
+ Group.CanonicalNumberToAggArg.find(CanonicalNumber);
+
if (!Group.InputTypesSet) {
Group.ArgumentTypes.push_back(Input->getType());
// If the input value has a swifterr attribute, make sure to mark the
@@ -684,17 +786,34 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region,
// Check if we have a constant. If we do add it to the overall argument
// number to Constant map for the region, and continue to the next input.
if (Constant *CST = dyn_cast<Constant>(Input)) {
- Region.AggArgToConstant.insert(std::make_pair(TypeIndex, CST));
+ if (AggArgIt != Group.CanonicalNumberToAggArg.end())
+ Region.AggArgToConstant.insert(std::make_pair(AggArgIt->second, CST));
+ else {
+ Group.CanonicalNumberToAggArg.insert(
+ std::make_pair(CanonicalNumber, TypeIndex));
+ Region.AggArgToConstant.insert(std::make_pair(TypeIndex, CST));
+ }
TypeIndex++;
continue;
}
// It is not a constant, so we create the mapping from extracted argument list
- // to the overall argument list.
+ // to the overall argument list, using the canonical location, if it exists.
assert(ArgInputs.count(Input) && "Input cannot be found!");
- Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, TypeIndex));
- Region.AggArgToExtracted.insert(std::make_pair(TypeIndex, OriginalIndex));
+ if (AggArgIt != Group.CanonicalNumberToAggArg.end()) {
+ if (OriginalIndex != AggArgIt->second)
+ Region.ChangedArgOrder = true;
+ Region.ExtractedArgToAgg.insert(
+ std::make_pair(OriginalIndex, AggArgIt->second));
+ Region.AggArgToExtracted.insert(
+ std::make_pair(AggArgIt->second, OriginalIndex));
+ } else {
+ Group.CanonicalNumberToAggArg.insert(
+ std::make_pair(CanonicalNumber, TypeIndex));
+ Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, TypeIndex));
+ Region.AggArgToExtracted.insert(std::make_pair(TypeIndex, OriginalIndex));
+ }
OriginalIndex++;
TypeIndex++;
}
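
The lookup-or-insert pattern above keeps one CanonicalNumberToAggArg map per group: the first region seeds it, and later regions whose extractor reports the same inputs in a different order hit the already-recorded slots, which is also how ChangedArgOrder gets set. A small self-contained sketch with made-up canonical numbers and indices (not taken from the patch):

#include "llvm/ADT/DenseMap.h"
#include <cassert>
using namespace llvm;

int main() {
  // Aggregate argument slots keyed by canonical value number, as seeded by
  // the first extracted region: canonical 5 -> slot 0, canonical 7 -> slot 1.
  DenseMap<unsigned, unsigned> CanonicalNumberToAggArg;
  CanonicalNumberToAggArg.insert({5, 0});
  CanonicalNumberToAggArg.insert({7, 1});

  // A later region's extractor reports the same values in the opposite order,
  // so extracted index 0 carries canonical number 7 and index 1 carries 5.
  unsigned CanonicalForExtractedIdx[2] = {7, 5};

  bool ChangedArgOrder = false;
  DenseMap<unsigned, unsigned> ExtractedArgToAgg;
  for (unsigned ExtractedIdx = 0; ExtractedIdx < 2; ++ExtractedIdx) {
    auto It = CanonicalNumberToAggArg.find(CanonicalForExtractedIdx[ExtractedIdx]);
    assert(It != CanonicalNumberToAggArg.end() && "Canonical number not seeded");
    if (It->second != ExtractedIdx)
      ChangedArgOrder = true;
    ExtractedArgToAgg.insert({ExtractedIdx, It->second});
  }

  // ChangedArgOrder is now true, so a later region cannot simply swap the
  // callee and must rebuild its argument list in aggregate order.
  assert(ChangedArgOrder);
  return 0;
}
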
@@ -718,10 +837,41 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region,
/// \param [in] Outputs - The values found by the code extractor.
static void
findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region,
- ArrayRef<Value *> Outputs) {
+ SetVector<Value *> &Outputs) {
OutlinableGroup &Group = *Region.Parent;
IRSimilarityCandidate &C = *Region.Candidate;
+ SmallVector<BasicBlock *> BE;
+ DenseSet<BasicBlock *> BBSet;
+ C.getBasicBlocks(BBSet, BE);
+
+ // Find the exits to the region.
+ SmallPtrSet<BasicBlock *, 1> Exits;
+ for (BasicBlock *Block : BE)
+ for (BasicBlock *Succ : successors(Block))
+ if (!BBSet.contains(Succ))
+ Exits.insert(Succ);
+
+ // After determining which blocks exit to PHINodes, we add these PHINodes to
+ // the set of outputs to be processed. We also check the incoming values of
+ // the PHINodes for whether they should no longer be considered outputs.
+ for (BasicBlock *ExitBB : Exits) {
+ for (PHINode &PN : ExitBB->phis()) {
+ // Find all incoming values from the outlining region.
+ SmallVector<unsigned, 2> IncomingVals;
+ for (unsigned Idx = 0; Idx < PN.getNumIncomingValues(); ++Idx)
+ if (BBSet.contains(PN.getIncomingBlock(Idx)))
+ IncomingVals.push_back(Idx);
+
+ // Do not process PHI if there is one (or fewer) predecessor from region.
+ if (IncomingVals.size() <= 1)
+ continue;
+
+ Region.IgnoreRegion = true;
+ return;
+ }
+ }
+
// This counts the argument number in the extracted function.
unsigned OriginalIndex = Region.NumExtractedInputs;
@@ -797,7 +947,7 @@ void IROutliner::findAddInputsOutputs(Module &M, OutlinableRegion &Region,
// Map the outputs found by the CodeExtractor to the arguments found for
// the overall function.
- findExtractedOutputToOverallOutputMapping(Region, Outputs.getArrayRef());
+ findExtractedOutputToOverallOutputMapping(Region, Outputs);
}
/// Replace the extracted function in the Region with a call to the overall
@@ -820,9 +970,10 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) {
assert(AggFunc && "Function to replace with is nullptr?");
// If the arguments are the same size, there are no values that need to be
- // made argument, or different output registers to handle. We can simply
- // replace the called function in this case.
- if (AggFunc->arg_size() == Call->arg_size()) {
+ // made into an argument, the argument ordering has not been changed, and
+ // there are no different output registers to handle. We can simply replace
+ // the called function in this case.
+ if (!Region.ChangedArgOrder && AggFunc->arg_size() == Call->arg_size()) {
LLVM_DEBUG(dbgs() << "Replace call to " << *Call << " with call to "
<< *AggFunc << " with same number of arguments\n");
Call->setCalledFunction(AggFunc);
@@ -895,6 +1046,9 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) {
// Transfer any debug information.
Call->setDebugLoc(Region.Call->getDebugLoc());
+ // Since our output may determine which branch we go to, we make sure to
+ // propagate this new call value through the module.
+ OldCall->replaceAllUsesWith(Call);
// Remove the old instruction.
OldCall->eraseFromParent();
@@ -913,13 +1067,23 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) {
// region with the arguments of the function for an OutlinableGroup.
//
/// \param [in] Region - The region of extracted code to be changed.
-/// \param [in,out] OutputBB - The BasicBlock for the output stores for this
+/// \param [in,out] OutputBBs - The BasicBlocks for the output stores for this
/// region.
-static void replaceArgumentUses(OutlinableRegion &Region,
- BasicBlock *OutputBB) {
+/// \param [in] FirstFunction - A flag to indicate whether we are using this
+/// function to define the overall outlined function for all the regions, or
+/// if we are operating on one of the following regions.
+static void
+replaceArgumentUses(OutlinableRegion &Region,
+ DenseMap<Value *, BasicBlock *> &OutputBBs,
+ bool FirstFunction = false) {
OutlinableGroup &Group = *Region.Parent;
assert(Region.ExtractedFunction && "Region has no extracted function?");
+ Function *DominatingFunction = Region.ExtractedFunction;
+ if (FirstFunction)
+ DominatingFunction = Group.OutlinedFunction;
+ DominatorTree DT(*DominatingFunction);
+
for (unsigned ArgIdx = 0; ArgIdx < Region.ExtractedFunction->arg_size();
ArgIdx++) {
assert(Region.ExtractedArgToAgg.find(ArgIdx) !=
@@ -946,11 +1110,53 @@ static void replaceArgumentUses(OutlinableRegion &Region,
assert(InstAsUser && "User is nullptr!");
Instruction *I = cast<Instruction>(InstAsUser);
- I->setDebugLoc(DebugLoc());
- LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to "
- << *OutputBB << "\n");
+ BasicBlock *BB = I->getParent();
+ SmallVector<BasicBlock *, 4> Descendants;
+ DT.getDescendants(BB, Descendants);
+ bool EdgeAdded = false;
+ if (Descendants.size() == 0) {
+ EdgeAdded = true;
+ DT.insertEdge(&DominatingFunction->getEntryBlock(), BB);
+ DT.getDescendants(BB, Descendants);
+ }
+
+ // Iterate over the following blocks, looking for return instructions.
+ // If we find one, find the corresponding output block for the return value
+ // and move our store instruction there.
+ for (BasicBlock *DescendBB : Descendants) {
+ ReturnInst *RI = dyn_cast<ReturnInst>(DescendBB->getTerminator());
+ if (!RI)
+ continue;
+ Value *RetVal = RI->getReturnValue();
+ auto VBBIt = OutputBBs.find(RetVal);
+ assert(VBBIt != OutputBBs.end() && "Could not find output value!");
+
+ // If this is storing a PHINode, we must make sure it is included in the
+ // overall function.
+ StoreInst *SI = cast<StoreInst>(I);
+
+ Value *ValueOperand = SI->getValueOperand();
+
+ StoreInst *NewI = cast<StoreInst>(I->clone());
+ NewI->setDebugLoc(DebugLoc());
+ BasicBlock *OutputBB = VBBIt->second;
+ OutputBB->getInstList().push_back(NewI);
+ LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to "
+ << *OutputBB << "\n");
- I->moveBefore(*OutputBB, OutputBB->end());
+ if (FirstFunction)
+ continue;
+ Value *CorrVal =
+ Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand);
+ assert(CorrVal && "Value is nullptr?");
+ NewI->setOperand(0, CorrVal);
+ }
+
+ // If we added an edge for basic blocks without a predecessor, we remove it
+ // here.
+ if (EdgeAdded)
+ DT.deleteEdge(&DominatingFunction->getEntryBlock(), BB);
+ I->eraseFromParent();
LLVM_DEBUG(dbgs() << "Replacing uses of output " << *Arg << " in function "
<< *Region.ExtractedFunction << " with " << *AggArg
@@ -990,69 +1196,53 @@ void replaceConstants(OutlinableRegion &Region) {
}
}
-/// For the given function, find all the nondebug or lifetime instructions,
-/// and return them as a vector. Exclude any blocks in \p ExludeBlocks.
-///
-/// \param [in] F - The function we collect the instructions from.
-/// \param [in] ExcludeBlocks - BasicBlocks to ignore.
-/// \returns the list of instructions extracted.
-static std::vector<Instruction *>
-collectRelevantInstructions(Function &F,
- DenseSet<BasicBlock *> &ExcludeBlocks) {
- std::vector<Instruction *> RelevantInstructions;
-
- for (BasicBlock &BB : F) {
- if (ExcludeBlocks.contains(&BB))
- continue;
-
- for (Instruction &Inst : BB) {
- if (Inst.isLifetimeStartOrEnd())
- continue;
- if (isa<DbgInfoIntrinsic>(Inst))
- continue;
-
- RelevantInstructions.push_back(&Inst);
- }
- }
-
- return RelevantInstructions;
-}
-
/// It is possible that there is a basic block that already performs the same
/// stores. This returns a duplicate block, if it exists.
///
-/// \param OutputBB [in] the block we are looking for a duplicate of.
+/// \param OutputBBs [in] the blocks we are looking for a duplicate of.
/// \param OutputStoreBBs [in] The existing output blocks.
/// \returns an optional value with the number of the output block if there is a match.
-Optional<unsigned>
-findDuplicateOutputBlock(BasicBlock *OutputBB,
- ArrayRef<BasicBlock *> OutputStoreBBs) {
+Optional<unsigned> findDuplicateOutputBlock(
+ DenseMap<Value *, BasicBlock *> &OutputBBs,
+ std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs) {
- bool WrongInst = false;
- bool WrongSize = false;
+ bool Mismatch = false;
unsigned MatchingNum = 0;
- for (BasicBlock *CompBB : OutputStoreBBs) {
- WrongInst = false;
- if (CompBB->size() - 1 != OutputBB->size()) {
- WrongSize = true;
- MatchingNum++;
- continue;
- }
-
- WrongSize = false;
- BasicBlock::iterator NIt = OutputBB->begin();
- for (Instruction &I : *CompBB) {
- if (isa<BranchInst>(&I))
- continue;
+ // We compare the new set of output blocks to the other sets of output blocks.
+ // If they are the same number, and have identical instructions, they are
+ // considered to be the same.
+ for (DenseMap<Value *, BasicBlock *> &CompBBs : OutputStoreBBs) {
+ Mismatch = false;
+ for (std::pair<Value *, BasicBlock *> &VToB : CompBBs) {
+ DenseMap<Value *, BasicBlock *>::iterator OutputBBIt =
+ OutputBBs.find(VToB.first);
+ if (OutputBBIt == OutputBBs.end()) {
+ Mismatch = true;
+ break;
+ }
- if (!I.isIdenticalTo(&(*NIt))) {
- WrongInst = true;
+ BasicBlock *CompBB = VToB.second;
+ BasicBlock *OutputBB = OutputBBIt->second;
+ if (CompBB->size() - 1 != OutputBB->size()) {
+ Mismatch = true;
break;
}
- NIt++;
+ BasicBlock::iterator NIt = OutputBB->begin();
+ for (Instruction &I : *CompBB) {
+ if (isa<BranchInst>(&I))
+ continue;
+
+ if (!I.isIdenticalTo(&(*NIt))) {
+ Mismatch = true;
+ break;
+ }
+
+ NIt++;
+ }
}
- if (!WrongInst && !WrongSize)
+
+ if (!Mismatch)
return MatchingNum;
MatchingNum++;
@@ -1061,95 +1251,130 @@ findDuplicateOutputBlock(BasicBlock *OutputBB,
return None;
}
+/// Remove empty output blocks from the outlined region.
+///
+/// \param BlocksToPrune - Mapping of return values to output blocks for the \p
+/// Region.
+/// \param Region - The OutlinableRegion we are analyzing.
+static bool
+analyzeAndPruneOutputBlocks(DenseMap<Value *, BasicBlock *> &BlocksToPrune,
+ OutlinableRegion &Region) {
+ bool AllRemoved = true;
+ Value *RetValueForBB;
+ BasicBlock *NewBB;
+ SmallVector<Value *, 4> ToRemove;
+ // Iterate over the output blocks created in the outlined section.
+ for (std::pair<Value *, BasicBlock *> &VtoBB : BlocksToPrune) {
+ RetValueForBB = VtoBB.first;
+ NewBB = VtoBB.second;
+
+ // If the block has no instructions, we remove it from the module, and also
+ // mark the value for removal from the return value to output block mapping.
+ if (NewBB->size() == 0) {
+ NewBB->eraseFromParent();
+ ToRemove.push_back(RetValueForBB);
+ continue;
+ }
+
+ // Mark that we could not remove all the blocks since they were not all
+ // empty.
+ AllRemoved = false;
+ }
+
+ // Remove the return value from the mapping.
+ for (Value *V : ToRemove)
+ BlocksToPrune.erase(V);
+
+ // Mark the region as having the no output scheme.
+ if (AllRemoved)
+ Region.OutputBlockNum = -1;
+
+ return AllRemoved;
+}
+
/// For the outlined section, move the needed StoreInsts for the output
/// registers into their own block. Then, determine if there is a duplicate
/// output block already created.
///
/// \param [in] OG - The OutlinableGroup of regions to be outlined.
/// \param [in] Region - The OutlinableRegion that is being analyzed.
-/// \param [in,out] OutputBB - the block that stores for this region will be
+/// \param [in,out] OutputBBs - the blocks that the stores for this region will be
/// placed in.
-/// \param [in] EndBB - the final block of the extracted function.
+/// \param [in] EndBBs - the final blocks of the extracted function.
/// \param [in] OutputMappings - The mapping of values that have
/// been replaced by a new output value.
/// \param [in,out] OutputStoreBBs - The existing output blocks.
-static void
-alignOutputBlockWithAggFunc(OutlinableGroup &OG, OutlinableRegion &Region,
- BasicBlock *OutputBB, BasicBlock *EndBB,
- const DenseMap<Value *, Value *> &OutputMappings,
- std::vector<BasicBlock *> &OutputStoreBBs) {
- DenseSet<unsigned> ValuesToFind(Region.GVNStores.begin(),
- Region.GVNStores.end());
-
- // We iterate over the instructions in the extracted function, and find the
- // global value number of the instructions. If we find a value that should
- // be contained in a store, we replace the uses of the value with the value
- // from the overall function, so that the store is storing the correct
- // value from the overall function.
- DenseSet<BasicBlock *> ExcludeBBs(OutputStoreBBs.begin(),
- OutputStoreBBs.end());
- ExcludeBBs.insert(OutputBB);
- std::vector<Instruction *> ExtractedFunctionInsts =
- collectRelevantInstructions(*(Region.ExtractedFunction), ExcludeBBs);
- std::vector<Instruction *> OverallFunctionInsts =
- collectRelevantInstructions(*OG.OutlinedFunction, ExcludeBBs);
-
- assert(ExtractedFunctionInsts.size() == OverallFunctionInsts.size() &&
- "Number of relevant instructions not equal!");
-
- unsigned NumInstructions = ExtractedFunctionInsts.size();
- for (unsigned Idx = 0; Idx < NumInstructions; Idx++) {
- Value *V = ExtractedFunctionInsts[Idx];
-
- if (OutputMappings.find(V) != OutputMappings.end())
- V = OutputMappings.find(V)->second;
- Optional<unsigned> GVN = Region.Candidate->getGVN(V);
-
- // If we have found one of the stored values for output, replace the value
- // with the corresponding one from the overall function.
- if (GVN.hasValue() && ValuesToFind.erase(GVN.getValue())) {
- V->replaceAllUsesWith(OverallFunctionInsts[Idx]);
- if (ValuesToFind.size() == 0)
- break;
- }
-
- if (ValuesToFind.size() == 0)
- break;
- }
-
- assert(ValuesToFind.size() == 0 && "Not all store values were handled!");
-
- // If the size of the block is 0, then there are no stores, and we do not
- // need to save this block.
- if (OutputBB->size() == 0) {
- Region.OutputBlockNum = -1;
- OutputBB->eraseFromParent();
+static void alignOutputBlockWithAggFunc(
+ OutlinableGroup &OG, OutlinableRegion &Region,
+ DenseMap<Value *, BasicBlock *> &OutputBBs,
+ DenseMap<Value *, BasicBlock *> &EndBBs,
+ const DenseMap<Value *, Value *> &OutputMappings,
+ std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs) {
+ // If none of the output blocks have any instructions, this means that we do
+ // not have to determine whether they match any of the other output schemes,
+ // and we don't have to do anything else.
+ if (analyzeAndPruneOutputBlocks(OutputBBs, Region))
return;
- }
- // Determine is there is a duplicate block.
+ // Determine if there is a duplicate set of blocks.
Optional<unsigned> MatchingBB =
- findDuplicateOutputBlock(OutputBB, OutputStoreBBs);
+ findDuplicateOutputBlock(OutputBBs, OutputStoreBBs);
- // If there is, we remove the new output block. If it does not,
- // we add it to our list of output blocks.
+ // If there is, we remove the new output blocks. If there is not,
+ // we add them to our list of sets of output blocks.
if (MatchingBB.hasValue()) {
LLVM_DEBUG(dbgs() << "Set output block for region in function"
<< Region.ExtractedFunction << " to "
<< MatchingBB.getValue());
Region.OutputBlockNum = MatchingBB.getValue();
- OutputBB->eraseFromParent();
+ for (std::pair<Value *, BasicBlock *> &VtoBB : OutputBBs)
+ VtoBB.second->eraseFromParent();
return;
}
Region.OutputBlockNum = OutputStoreBBs.size();
- LLVM_DEBUG(dbgs() << "Create output block for region in"
- << Region.ExtractedFunction << " to "
- << *OutputBB);
- OutputStoreBBs.push_back(OutputBB);
- BranchInst::Create(EndBB, OutputBB);
+ Value *RetValueForBB;
+ BasicBlock *NewBB;
+ OutputStoreBBs.push_back(DenseMap<Value *, BasicBlock *>());
+ for (std::pair<Value *, BasicBlock *> &VtoBB : OutputBBs) {
+ RetValueForBB = VtoBB.first;
+ NewBB = VtoBB.second;
+ DenseMap<Value *, BasicBlock *>::iterator VBBIt =
+ EndBBs.find(RetValueForBB);
+ LLVM_DEBUG(dbgs() << "Create output block for region in"
+ << Region.ExtractedFunction << " to "
+ << *NewBB);
+ BranchInst::Create(VBBIt->second, NewBB);
+ OutputStoreBBs.back().insert(std::make_pair(RetValueForBB, NewBB));
+ }
+}
+
+/// Takes in a mapping, \p OldMap, of ConstantValues to BasicBlocks, sorts the
+/// keys, creates a new basic block for each key, and inserts the mapping from
+/// key to new block into \p NewMap. Each BasicBlock is named with the scheme
+/// "<basename>_<key_idx>".
+///
+/// \param OldMap [in] - The mapping to base the new mapping off of.
+/// \param NewMap [out] - The output mapping using the keys of \p OldMap.
+/// \param ParentFunc [in] - The function to put the new basic block in.
+/// \param BaseName [in] - The start of the BasicBlock names to be appended to
+/// by an index value.
+static void createAndInsertBasicBlocks(DenseMap<Value *, BasicBlock *> &OldMap,
+ DenseMap<Value *, BasicBlock *> &NewMap,
+ Function *ParentFunc, Twine BaseName) {
+ unsigned Idx = 0;
+ std::vector<Value *> SortedKeys;
+
+ getSortedConstantKeys(SortedKeys, OldMap);
+
+ for (Value *RetVal : SortedKeys) {
+ BasicBlock *NewBB = BasicBlock::Create(
+ ParentFunc->getContext(),
+ Twine(BaseName) + Twine("_") + Twine(static_cast<unsigned>(Idx++)),
+ ParentFunc);
+ NewMap.insert(std::make_pair(RetVal, NewBB));
+ }
}
/// Create the switch statement for the outlined function to differentiate between
@@ -1159,50 +1384,74 @@ alignOutputBlockWithAggFunc(OutlinableGroup &OG, OutlinableRegion &Region,
/// matches the needed stores for the extracted section.
/// \param [in] M - The module we are outlining from.
/// \param [in] OG - The group of regions to be outlined.
-/// \param [in] EndBB - The final block of the extracted function.
+/// \param [in] EndBBs - The final blocks of the extracted function.
/// \param [in,out] OutputStoreBBs - The existing output blocks.
-void createSwitchStatement(Module &M, OutlinableGroup &OG, BasicBlock *EndBB,
- ArrayRef<BasicBlock *> OutputStoreBBs) {
+void createSwitchStatement(
+ Module &M, OutlinableGroup &OG, DenseMap<Value *, BasicBlock *> &EndBBs,
+ std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs) {
// We only need the switch statement if there is more than one store
// combination.
if (OG.OutputGVNCombinations.size() > 1) {
Function *AggFunc = OG.OutlinedFunction;
- // Create a final block
- BasicBlock *ReturnBlock =
- BasicBlock::Create(M.getContext(), "final_block", AggFunc);
- Instruction *Term = EndBB->getTerminator();
- Term->moveBefore(*ReturnBlock, ReturnBlock->end());
- // Put the switch statement in the old end basic block for the function with
- // a fall through to the new return block
- LLVM_DEBUG(dbgs() << "Create switch statement in " << *AggFunc << " for "
- << OutputStoreBBs.size() << "\n");
- SwitchInst *SwitchI =
- SwitchInst::Create(AggFunc->getArg(AggFunc->arg_size() - 1),
- ReturnBlock, OutputStoreBBs.size(), EndBB);
-
- unsigned Idx = 0;
- for (BasicBlock *BB : OutputStoreBBs) {
- SwitchI->addCase(ConstantInt::get(Type::getInt32Ty(M.getContext()), Idx),
- BB);
- Term = BB->getTerminator();
- Term->setSuccessor(0, ReturnBlock);
- Idx++;
+ // Create a final block for each different return block.
+ DenseMap<Value *, BasicBlock *> ReturnBBs;
+ createAndInsertBasicBlocks(OG.EndBBs, ReturnBBs, AggFunc, "final_block");
+
+ for (std::pair<Value *, BasicBlock *> &RetBlockPair : ReturnBBs) {
+ std::pair<Value *, BasicBlock *> &OutputBlock =
+ *OG.EndBBs.find(RetBlockPair.first);
+ BasicBlock *ReturnBlock = RetBlockPair.second;
+ BasicBlock *EndBB = OutputBlock.second;
+ Instruction *Term = EndBB->getTerminator();
+ // Move the return instruction to the final block instead of the original
+ // exit stub.
+ Term->moveBefore(*ReturnBlock, ReturnBlock->end());
+ // Put the switch statement in the old end basic block for the function
+ // with a fall through to the new return block.
+ LLVM_DEBUG(dbgs() << "Create switch statement in " << *AggFunc << " for "
+ << OutputStoreBBs.size() << "\n");
+ SwitchInst *SwitchI =
+ SwitchInst::Create(AggFunc->getArg(AggFunc->arg_size() - 1),
+ ReturnBlock, OutputStoreBBs.size(), EndBB);
+
+ unsigned Idx = 0;
+ for (DenseMap<Value *, BasicBlock *> &OutputStoreBB : OutputStoreBBs) {
+ DenseMap<Value *, BasicBlock *>::iterator OSBBIt =
+ OutputStoreBB.find(OutputBlock.first);
+
+ if (OSBBIt == OutputStoreBB.end())
+ continue;
+
+ BasicBlock *BB = OSBBIt->second;
+ SwitchI->addCase(
+ ConstantInt::get(Type::getInt32Ty(M.getContext()), Idx), BB);
+ Term = BB->getTerminator();
+ Term->setSuccessor(0, ReturnBlock);
+ Idx++;
+ }
}
return;
}
- // If there needs to be stores, move them from the output block to the end
- // block to save on branching instructions.
+ // If stores are needed, move them from the output blocks to their
+ // corresponding ending block.
if (OutputStoreBBs.size() == 1) {
LLVM_DEBUG(dbgs() << "Move store instructions to the end block in "
<< *OG.OutlinedFunction << "\n");
- BasicBlock *OutputBlock = OutputStoreBBs[0];
- Instruction *Term = OutputBlock->getTerminator();
- Term->eraseFromParent();
- Term = EndBB->getTerminator();
- moveBBContents(*OutputBlock, *EndBB);
- Term->moveBefore(*EndBB, EndBB->end());
- OutputBlock->eraseFromParent();
+ DenseMap<Value *, BasicBlock *> OutputBlocks = OutputStoreBBs[0];
+ for (std::pair<Value *, BasicBlock *> &VBPair : OutputBlocks) {
+ DenseMap<Value *, BasicBlock *>::iterator EndBBIt =
+ EndBBs.find(VBPair.first);
+ assert(EndBBIt != EndBBs.end() && "Could not find end block");
+ BasicBlock *EndBB = EndBBIt->second;
+ BasicBlock *OutputBB = VBPair.second;
+ Instruction *Term = OutputBB->getTerminator();
+ Term->eraseFromParent();
+ Term = EndBB->getTerminator();
+ moveBBContents(*OutputBB, *EndBB);
+ Term->moveBefore(*EndBB, EndBB->end());
+ OutputBB->eraseFromParent();
+ }
}
}
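
createAndInsertBasicBlocks and the switch construction above both go through getSortedConstantKeys so that blocks keyed by a DenseMap, whose iteration order is not deterministic, are created and named in a stable order. A standalone sketch of that sorting step, using made-up constant return values (the nullptr block pointers are placeholders, not how the pass populates the map):

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <vector>
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Type *Int32Ty = Type::getInt32Ty(Ctx);

  // Stand-ins for the return-value keys of an EndBBs/OutputBBs-style map.
  // The BasicBlock values are irrelevant for the sort, so nullptr is used.
  DenseMap<Value *, BasicBlock *> Map;
  Map.insert({ConstantInt::get(Int32Ty, 2), nullptr});
  Map.insert({ConstantInt::get(Int32Ty, 0), nullptr});
  Map.insert({ConstantInt::get(Int32Ty, 1), nullptr});

  // Same idea as getSortedConstantKeys: collect the keys and stable-sort them
  // by their constant value so block creation and naming are deterministic.
  std::vector<Value *> SortedKeys;
  for (auto &VToBB : Map)
    SortedKeys.push_back(VToBB.first);
  stable_sort(SortedKeys, [](const Value *LHS, const Value *RHS) {
    return cast<ConstantInt>(LHS)->getLimitedValue() <
           cast<ConstantInt>(RHS)->getLimitedValue();
  });

  // SortedKeys now holds the constants 0, 1, 2 in that order, regardless of
  // the DenseMap's internal iteration order.
  return 0;
}
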
@@ -1217,42 +1466,44 @@ void createSwitchStatement(Module &M, OutlinableGroup &OG, BasicBlock *EndBB,
/// set of stores needed for the different functions.
/// \param [in,out] FuncsToRemove - Extracted functions to erase from module
/// once outlining is complete.
-static void fillOverallFunction(Module &M, OutlinableGroup &CurrentGroup,
- std::vector<BasicBlock *> &OutputStoreBBs,
- std::vector<Function *> &FuncsToRemove) {
+static void fillOverallFunction(
+ Module &M, OutlinableGroup &CurrentGroup,
+ std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs,
+ std::vector<Function *> &FuncsToRemove) {
OutlinableRegion *CurrentOS = CurrentGroup.Regions[0];
// Move first extracted function's instructions into new function.
LLVM_DEBUG(dbgs() << "Move instructions from "
<< *CurrentOS->ExtractedFunction << " to instruction "
<< *CurrentGroup.OutlinedFunction << "\n");
-
- CurrentGroup.EndBB = moveFunctionData(*CurrentOS->ExtractedFunction,
- *CurrentGroup.OutlinedFunction);
+ moveFunctionData(*CurrentOS->ExtractedFunction,
+ *CurrentGroup.OutlinedFunction, CurrentGroup.EndBBs);
// Transfer the attributes from the function to the new function.
- for (Attribute A :
- CurrentOS->ExtractedFunction->getAttributes().getFnAttributes())
+ for (Attribute A : CurrentOS->ExtractedFunction->getAttributes().getFnAttrs())
CurrentGroup.OutlinedFunction->addFnAttr(A);
- // Create an output block for the first extracted function.
- BasicBlock *NewBB = BasicBlock::Create(
- M.getContext(), Twine("output_block_") + Twine(static_cast<unsigned>(0)),
- CurrentGroup.OutlinedFunction);
+ // Create a new set of output blocks for the first extracted function.
+ DenseMap<Value *, BasicBlock *> NewBBs;
+ createAndInsertBasicBlocks(CurrentGroup.EndBBs, NewBBs,
+ CurrentGroup.OutlinedFunction, "output_block_0");
CurrentOS->OutputBlockNum = 0;
- replaceArgumentUses(*CurrentOS, NewBB);
+ replaceArgumentUses(*CurrentOS, NewBBs, true);
replaceConstants(*CurrentOS);
- // If the new basic block has no new stores, we can erase it from the module.
- // It it does, we create a branch instruction to the last basic block from the
- // new one.
- if (NewBB->size() == 0) {
- CurrentOS->OutputBlockNum = -1;
- NewBB->eraseFromParent();
- } else {
- BranchInst::Create(CurrentGroup.EndBB, NewBB);
- OutputStoreBBs.push_back(NewBB);
+ // We first identify any empty output blocks and remove them. We then create
+ // a branch instruction from each non-empty output block to the corresponding
+ // return block for the function.
+ if (!analyzeAndPruneOutputBlocks(NewBBs, *CurrentOS)) {
+ OutputStoreBBs.push_back(DenseMap<Value *, BasicBlock *>());
+ for (std::pair<Value *, BasicBlock *> &VToBB : NewBBs) {
+ DenseMap<Value *, BasicBlock *>::iterator VBBIt =
+ CurrentGroup.EndBBs.find(VToBB.first);
+ BasicBlock *EndBB = VBBIt->second;
+ BranchInst::Create(EndBB, VToBB.second);
+ OutputStoreBBs.back().insert(VToBB);
+ }
}
// Replace the call to the extracted function with the outlined function.
@@ -1268,25 +1519,28 @@ void IROutliner::deduplicateExtractedSections(
std::vector<Function *> &FuncsToRemove, unsigned &OutlinedFunctionNum) {
createFunction(M, CurrentGroup, OutlinedFunctionNum);
- std::vector<BasicBlock *> OutputStoreBBs;
+ std::vector<DenseMap<Value *, BasicBlock *>> OutputStoreBBs;
OutlinableRegion *CurrentOS;
fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove);
+ std::vector<Value *> SortedKeys;
for (unsigned Idx = 1; Idx < CurrentGroup.Regions.size(); Idx++) {
CurrentOS = CurrentGroup.Regions[Idx];
AttributeFuncs::mergeAttributesForOutlining(*CurrentGroup.OutlinedFunction,
*CurrentOS->ExtractedFunction);
- // Create a new BasicBlock to hold the needed store instructions.
- BasicBlock *NewBB = BasicBlock::Create(
- M.getContext(), "output_block_" + std::to_string(Idx),
- CurrentGroup.OutlinedFunction);
- replaceArgumentUses(*CurrentOS, NewBB);
+ // Create a set of BasicBlocks, one for each return block, to hold the
+ // needed store instructions.
+ DenseMap<Value *, BasicBlock *> NewBBs;
+ createAndInsertBasicBlocks(
+ CurrentGroup.EndBBs, NewBBs, CurrentGroup.OutlinedFunction,
+ "output_block_" + Twine(static_cast<unsigned>(Idx)));
- alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBB,
- CurrentGroup.EndBB, OutputMappings,
+ replaceArgumentUses(*CurrentOS, NewBBs);
+ alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBBs,
+ CurrentGroup.EndBBs, OutputMappings,
OutputStoreBBs);
CurrentOS->Call = replaceCalledFunction(M, *CurrentOS);
@@ -1294,11 +1548,78 @@ void IROutliner::deduplicateExtractedSections(
}
// Create a switch statement to handle the different output schemes.
- createSwitchStatement(M, CurrentGroup, CurrentGroup.EndBB, OutputStoreBBs);
+ createSwitchStatement(M, CurrentGroup, CurrentGroup.EndBBs, OutputStoreBBs);
OutlinedFunctionNum++;
}
+/// Checks that the next instruction in the InstructionDataList matches the
+/// next instruction in the module. If they do not match, it is possible that
+/// extra code has been inserted, and we must ignore it.
+///
+/// \param ID - The IRInstructionData to check the next instruction of.
+/// \returns true if the InstructionDataList and actual instruction match.
+static bool nextIRInstructionDataMatchesNextInst(IRInstructionData &ID) {
+ // We check if there is a discrepancy between the InstructionDataList
+ // and the actual next instruction in the module. If there is, it means
+ // that an extra instruction was added, likely by the CodeExtractor.
+
+ // Since we do not have any similarity data about this particular
+ // instruction, we cannot confidently outline it, and must discard this
+ // candidate.
+ IRInstructionDataList::iterator NextIDIt = std::next(ID.getIterator());
+ Instruction *NextIDLInst = NextIDIt->Inst;
+ Instruction *NextModuleInst = nullptr;
+ if (!ID.Inst->isTerminator())
+ NextModuleInst = ID.Inst->getNextNonDebugInstruction();
+ else if (NextIDLInst != nullptr)
+ NextModuleInst =
+ &*NextIDIt->Inst->getParent()->instructionsWithoutDebug().begin();
+
+ if (NextIDLInst && NextIDLInst != NextModuleInst)
+ return false;
+
+ return true;
+}
+
+bool IROutliner::isCompatibleWithAlreadyOutlinedCode(
+ const OutlinableRegion &Region) {
+ IRSimilarityCandidate *IRSC = Region.Candidate;
+ unsigned StartIdx = IRSC->getStartIdx();
+ unsigned EndIdx = IRSC->getEndIdx();
+
+ // A check to make sure that we are not about to attempt to outline something
+ // that has already been outlined.
+ for (unsigned Idx = StartIdx; Idx <= EndIdx; Idx++)
+ if (Outlined.contains(Idx))
+ return false;
+
+ // We check if the recorded instruction matches the actual next instruction;
+ // if it does not, we fix it in the InstructionDataList.
+ if (!Region.Candidate->backInstruction()->isTerminator()) {
+ Instruction *NewEndInst =
+ Region.Candidate->backInstruction()->getNextNonDebugInstruction();
+ assert(NewEndInst && "Next instruction is a nullptr?");
+ if (Region.Candidate->end()->Inst != NewEndInst) {
+ IRInstructionDataList *IDL = Region.Candidate->front()->IDL;
+ IRInstructionData *NewEndIRID = new (InstDataAllocator.Allocate())
+ IRInstructionData(*NewEndInst,
+ InstructionClassifier.visit(*NewEndInst), *IDL);
+
+ // Insert the first IRInstructionData of the new region after the
+ // last IRInstructionData of the IRSimilarityCandidate.
+ IDL->insert(Region.Candidate->end(), *NewEndIRID);
+ }
+ }
+
+ return none_of(*IRSC, [this](IRInstructionData &ID) {
+ if (!nextIRInstructionDataMatchesNextInst(ID))
+ return true;
+
+ return !this->InstructionClassifier.visit(ID.Inst);
+ });
+}
+
void IROutliner::pruneIncompatibleRegions(
std::vector<IRSimilarityCandidate> &CandidateVec,
OutlinableGroup &CurrentGroup) {
@@ -1310,6 +1631,15 @@ void IROutliner::pruneIncompatibleRegions(
return LHS.getStartIdx() < RHS.getStartIdx();
});
+ IRSimilarityCandidate &FirstCandidate = CandidateVec[0];
+ // Since outlining a call and a branch instruction is the same as outlining
+ // only a call instruction, it offers no space savings, so we ignore it.
+ if (FirstCandidate.getLength() == 2) {
+ if (isa<CallInst>(FirstCandidate.front()->Inst) &&
+ isa<BranchInst>(FirstCandidate.back()->Inst))
+ return;
+ }
+
unsigned CurrentEndIdx = 0;
for (IRSimilarityCandidate &IRSC : CandidateVec) {
PreviouslyOutlined = false;
@@ -1325,9 +1655,13 @@ void IROutliner::pruneIncompatibleRegions(
if (PreviouslyOutlined)
continue;
- // TODO: If in the future we can outline across BasicBlocks, we will need to
- // check all BasicBlocks contained in the region.
- if (IRSC.getStartBB()->hasAddressTaken())
+ // Check over the instructions, and if the basic block has its address
+ // taken for use somewhere else, we do not outline that block.
+ bool BBHasAddressTaken = any_of(IRSC, [](IRInstructionData &ID){
+ return ID.Inst->getParent()->hasAddressTaken();
+ });
+
+ if (BBHasAddressTaken)
continue;
if (IRSC.front()->Inst->getFunction()->hasLinkOnceODRLinkage() &&
@@ -1340,16 +1674,9 @@ void IROutliner::pruneIncompatibleRegions(
continue;
bool BadInst = any_of(IRSC, [this](IRInstructionData &ID) {
- // We check if there is a discrepancy between the InstructionDataList
- // and the actual next instruction in the module. If there is, it means
- // that an extra instruction was added, likely by the CodeExtractor.
-
- // Since we do not have any similarity data about this particular
- // instruction, we cannot confidently outline it, and must discard this
- // candidate.
- if (std::next(ID.getIterator())->Inst !=
- ID.Inst->getNextNonDebugInstruction())
+ if (!nextIRInstructionDataMatchesNextInst(ID))
return true;
+
return !this->InstructionClassifier.visit(ID.Inst);
});
@@ -1416,10 +1743,33 @@ static InstructionCost findCostForOutputBlocks(Module &M,
OutlinableGroup &CurrentGroup,
TargetTransformInfo &TTI) {
InstructionCost OutputCost = 0;
+ unsigned NumOutputBranches = 0;
+
+ IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate;
+ DenseSet<BasicBlock *> CandidateBlocks;
+ Candidate.getBasicBlocks(CandidateBlocks);
+
+ // Count the number of different output branches that point to blocks outside
+ // of the region.
+ DenseSet<BasicBlock *> FoundBlocks;
+ for (IRInstructionData &ID : Candidate) {
+ if (!isa<BranchInst>(ID.Inst))
+ continue;
+
+ for (Value *V : ID.OperVals) {
+ BasicBlock *BB = static_cast<BasicBlock *>(V);
+ DenseSet<BasicBlock *>::iterator CBIt = CandidateBlocks.find(BB);
+ if (CBIt != CandidateBlocks.end() || FoundBlocks.contains(BB))
+ continue;
+ FoundBlocks.insert(BB);
+ NumOutputBranches++;
+ }
+ }
+
+ CurrentGroup.BranchesToOutside = NumOutputBranches;
for (const ArrayRef<unsigned> &OutputUse :
CurrentGroup.OutputGVNCombinations) {
- IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate;
for (unsigned GVN : OutputUse) {
Optional<Value *> OV = Candidate.fromGVN(GVN);
assert(OV.hasValue() && "Could not find value for GVN?");
@@ -1434,14 +1784,14 @@ static InstructionCost findCostForOutputBlocks(Module &M,
LLVM_DEBUG(dbgs() << "Adding: " << StoreCost
<< " instructions to cost for output of type "
<< *V->getType() << "\n");
- OutputCost += StoreCost;
+ OutputCost += StoreCost * NumOutputBranches;
}
InstructionCost BranchCost =
TTI.getCFInstrCost(Instruction::Br, TargetTransformInfo::TCK_CodeSize);
LLVM_DEBUG(dbgs() << "Adding " << BranchCost << " to the current cost for"
<< " a branch instruction\n");
- OutputCost += BranchCost;
+ OutputCost += BranchCost * NumOutputBranches;
}
// If there is more than one output scheme, we must have a comparison and
@@ -1460,7 +1810,7 @@ static InstructionCost findCostForOutputBlocks(Module &M,
LLVM_DEBUG(dbgs() << "Adding: " << TotalCost
<< " instructions for each switch case for each different"
<< " output path in a function\n");
- OutputCost += TotalCost;
+ OutputCost += TotalCost * NumOutputBranches;
}
return OutputCost;
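
The multiplications added above scale every per-scheme store and branch cost by the number of branches that leave the region, since each exit path now needs its own copy of the output handling. A rough worked example with made-up unit costs (real values come from TargetTransformInfo, and only the store-and-branch part of the accounting is modelled here):

#include <cassert>

int main() {
  // Hypothetical unit costs; the real values come from TargetTransformInfo.
  const int StoreCost = 1, BranchCost = 1;
  const int NumOutputBranches = 2; // distinct branches leaving the region
  const int NumSchemes = 2, StoresPerScheme = 2;

  // Rough model of the visible accounting: every store and branch in an
  // output scheme is now paid once per exit branch rather than once overall.
  int OutputCost = 0;
  for (int Scheme = 0; Scheme < NumSchemes; ++Scheme)
    OutputCost += (StoresPerScheme * StoreCost + BranchCost) * NumOutputBranches;

  assert(OutputCost == 12); // (2*1 + 1) * 2 exits * 2 schemes
  return 0;
}
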
@@ -1548,13 +1898,12 @@ void IROutliner::updateOutputMapping(OutlinableRegion &Region,
bool IROutliner::extractSection(OutlinableRegion &Region) {
SetVector<Value *> ArgInputs, Outputs, SinkCands;
- Region.CE->findInputsOutputs(ArgInputs, Outputs, SinkCands);
-
assert(Region.StartBB && "StartBB for the OutlinableRegion is nullptr!");
- assert(Region.FollowBB && "FollowBB for the OutlinableRegion is nullptr!");
+ BasicBlock *InitialStart = Region.StartBB;
Function *OrigF = Region.StartBB->getParent();
CodeExtractorAnalysisCache CEAC(*OrigF);
- Region.ExtractedFunction = Region.CE->extractCodeRegion(CEAC);
+ Region.ExtractedFunction =
+ Region.CE->extractCodeRegion(CEAC, ArgInputs, Outputs);
// If the extraction was successful, find the BasicBlock, and reassign the
// OutlinableRegion blocks
@@ -1565,7 +1914,23 @@ bool IROutliner::extractSection(OutlinableRegion &Region) {
return false;
}
- BasicBlock *RewrittenBB = Region.FollowBB->getSinglePredecessor();
+ // Get the block containing the call to the extracted function, and reassign
+ // the blocks as necessary. If the original block still exists, it is because
+ // we ended on a branch instruction, so we move its contents into the block
+ // before it and assign the previous block correctly.
+ User *InstAsUser = Region.ExtractedFunction->user_back();
+ BasicBlock *RewrittenBB = cast<Instruction>(InstAsUser)->getParent();
+ Region.PrevBB = RewrittenBB->getSinglePredecessor();
+ assert(Region.PrevBB && "PrevBB is nullptr?");
+ if (Region.PrevBB == InitialStart) {
+ BasicBlock *NewPrev = InitialStart->getSinglePredecessor();
+ Instruction *BI = NewPrev->getTerminator();
+ BI->eraseFromParent();
+ moveBBContents(*InitialStart, *NewPrev);
+ Region.PrevBB = NewPrev;
+ InitialStart->eraseFromParent();
+ }
+
Region.StartBB = RewrittenBB;
Region.EndBB = RewrittenBB;
@@ -1608,6 +1973,7 @@ bool IROutliner::extractSection(OutlinableRegion &Region) {
unsigned IROutliner::doOutline(Module &M) {
// Find the possible similarity sections.
+ InstructionClassifier.EnableBranches = !DisableBranches;
IRSimilarityIdentifier &Identifier = getIRSI(M);
SimilarityGroupList &SimilarityCandidates = *Identifier.getSimilarity();
@@ -1622,12 +1988,17 @@ unsigned IROutliner::doOutline(Module &M) {
return LHS[0].getLength() * LHS.size() >
RHS[0].getLength() * RHS.size();
});
+ // Create an OutlinableGroup for each SimilarityCandidate, to be used in
+ // each of the following for loops, avoiding the need for an allocator.
+ std::vector<OutlinableGroup> PotentialGroups(SimilarityCandidates.size());
DenseSet<unsigned> NotSame;
- std::vector<Function *> FuncsToRemove;
+ std::vector<OutlinableGroup *> NegativeCostGroups;
+ std::vector<OutlinableRegion *> OutlinedRegions;
// Iterate over the possible sets of similarity.
+ unsigned PotentialGroupIdx = 0;
for (SimilarityGroup &CandidateVec : SimilarityCandidates) {
- OutlinableGroup CurrentGroup;
+ OutlinableGroup &CurrentGroup = PotentialGroups[PotentialGroupIdx++];
// Remove entries that were previously outlined
pruneIncompatibleRegions(CandidateVec, CurrentGroup);
@@ -1649,20 +2020,31 @@ unsigned IROutliner::doOutline(Module &M) {
// Create a CodeExtractor for each outlinable region. Identify inputs and
// outputs for each section using the code extractor and create the argument
// types for the Aggregate Outlining Function.
- std::vector<OutlinableRegion *> OutlinedRegions;
+ OutlinedRegions.clear();
for (OutlinableRegion *OS : CurrentGroup.Regions) {
// Break the outlinable region out of its parent BasicBlock into its own
// BasicBlocks (see function implementation).
OS->splitCandidate();
- std::vector<BasicBlock *> BE = {OS->StartBB};
+
+    // Splitting the region can add extra instructions to it. When that
+    // happens, the region is no longer viable for outlining, so we skip it.
+ if (!OS->CandidateSplit)
+ continue;
+
+ SmallVector<BasicBlock *> BE;
+ DenseSet<BasicBlock *> BBSet;
+ OS->Candidate->getBasicBlocks(BBSet, BE);
OS->CE = new (ExtractorAllocator.Allocate())
CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false,
false, "outlined");
findAddInputsOutputs(M, *OS, NotSame);
if (!OS->IgnoreRegion)
OutlinedRegions.push_back(OS);
- else
- OS->reattachCandidate();
+
+ // We recombine the blocks together now that we have gathered all the
+ // needed information.
+ OS->reattachCandidate();
}
CurrentGroup.Regions = std::move(OutlinedRegions);
@@ -1675,12 +2057,11 @@ unsigned IROutliner::doOutline(Module &M) {
if (CostModel)
findCostBenefit(M, CurrentGroup);
- // If we are adhering to the cost model, reattach all the candidates
+ // If we are adhering to the cost model, skip those groups where the cost
+ // outweighs the benefits.
if (CurrentGroup.Cost >= CurrentGroup.Benefit && CostModel) {
- for (OutlinableRegion *OS : CurrentGroup.Regions)
- OS->reattachCandidate();
- OptimizationRemarkEmitter &ORE = getORE(
- *CurrentGroup.Regions[0]->Candidate->getFunction());
+ OptimizationRemarkEmitter &ORE =
+ getORE(*CurrentGroup.Regions[0]->Candidate->getFunction());
ORE.emit([&]() {
IRSimilarityCandidate *C = CurrentGroup.Regions[0]->Candidate;
OptimizationRemarkMissed R(DEBUG_TYPE, "WouldNotDecreaseSize",
@@ -1704,12 +2085,70 @@ unsigned IROutliner::doOutline(Module &M) {
continue;
}
+ NegativeCostGroups.push_back(&CurrentGroup);
+ }
+
+ ExtractorAllocator.DestroyAll();
+
+ if (NegativeCostGroups.size() > 1)
+ stable_sort(NegativeCostGroups,
+ [](const OutlinableGroup *LHS, const OutlinableGroup *RHS) {
+ return LHS->Benefit - LHS->Cost > RHS->Benefit - RHS->Cost;
+ });
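// Editor's illustration (not part of the patch): this ordering visits the
// groups with the largest net benefit (Benefit - Cost) first. For example,
// groups with (Benefit, Cost) of (10, 4), (20, 3) and (8, 1) have net benefits
// 6, 17 and 7, so they are outlined in the order (20, 3), (8, 1), (10, 4).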
+
+ std::vector<Function *> FuncsToRemove;
+ for (OutlinableGroup *CG : NegativeCostGroups) {
+ OutlinableGroup &CurrentGroup = *CG;
+
+ OutlinedRegions.clear();
+ for (OutlinableRegion *Region : CurrentGroup.Regions) {
+ // We check whether our region is compatible with what has already been
+ // outlined, and whether we need to ignore this item.
+ if (!isCompatibleWithAlreadyOutlinedCode(*Region))
+ continue;
+ OutlinedRegions.push_back(Region);
+ }
+
+ if (OutlinedRegions.size() < 2)
+ continue;
+
+    // Reestimate the cost and benefit of the OutlinableGroup, and skip the
+    // group if the remaining regions no longer make up for the added cost.
+ CurrentGroup.Regions = std::move(OutlinedRegions);
+ if (CostModel) {
+ CurrentGroup.Benefit = 0;
+ CurrentGroup.Cost = 0;
+ findCostBenefit(M, CurrentGroup);
+ if (CurrentGroup.Cost >= CurrentGroup.Benefit)
+ continue;
+ }
+ OutlinedRegions.clear();
+ for (OutlinableRegion *Region : CurrentGroup.Regions) {
+ Region->splitCandidate();
+ if (!Region->CandidateSplit)
+ continue;
+ OutlinedRegions.push_back(Region);
+ }
+
+ CurrentGroup.Regions = std::move(OutlinedRegions);
+ if (CurrentGroup.Regions.size() < 2) {
+ for (OutlinableRegion *R : CurrentGroup.Regions)
+ R->reattachCandidate();
+ continue;
+ }
+
LLVM_DEBUG(dbgs() << "Outlining regions with cost " << CurrentGroup.Cost
<< " and benefit " << CurrentGroup.Benefit << "\n");
// Create functions out of all the sections, and mark them as outlined.
OutlinedRegions.clear();
for (OutlinableRegion *OS : CurrentGroup.Regions) {
+ SmallVector<BasicBlock *> BE;
+ DenseSet<BasicBlock *> BBSet;
+ OS->Candidate->getBasicBlocks(BBSet, BE);
+ OS->CE = new (ExtractorAllocator.Allocate())
+ CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false,
+ false, "outlined");
bool FunctionOutlined = extractSection(*OS);
if (FunctionOutlined) {
unsigned StartIdx = OS->Candidate->getStartIdx();
@@ -1767,6 +2206,7 @@ bool IROutliner::run(Module &M) {
}
// Pass Manager Boilerplate
+namespace {
class IROutlinerLegacyPass : public ModulePass {
public:
static char ID;
@@ -1782,6 +2222,7 @@ public:
bool runOnModule(Module &M) override;
};
+} // namespace
bool IROutlinerLegacyPass::runOnModule(Module &M) {
if (skipModule(M))
diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp
index 59260af88832..992c2b292e1e 100644
--- a/llvm/lib/Transforms/IPO/Inliner.cpp
+++ b/llvm/lib/Transforms/IPO/Inliner.cpp
@@ -31,9 +31,11 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InlineAdvisor.h"
#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/InlineOrder.h"
#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/ReplayInlineAdvisor.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/Utils/ImportedFunctionsInliningStatistics.h"
@@ -96,9 +98,53 @@ static cl::opt<std::string> CGSCCInlineReplayFile(
"cgscc-inline-replay", cl::init(""), cl::value_desc("filename"),
cl::desc(
"Optimization remarks file containing inline remarks to be replayed "
- "by inlining from cgscc inline remarks."),
+ "by cgscc inlining."),
cl::Hidden);
+static cl::opt<ReplayInlinerSettings::Scope> CGSCCInlineReplayScope(
+ "cgscc-inline-replay-scope",
+ cl::init(ReplayInlinerSettings::Scope::Function),
+ cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function",
+ "Replay on functions that have remarks associated "
+ "with them (default)"),
+ clEnumValN(ReplayInlinerSettings::Scope::Module, "Module",
+ "Replay on the entire module")),
+ cl::desc("Whether inline replay should be applied to the entire "
+ "Module or just the Functions (default) that are present as "
+ "callers in remarks during cgscc inlining."),
+ cl::Hidden);
+
+static cl::opt<ReplayInlinerSettings::Fallback> CGSCCInlineReplayFallback(
+ "cgscc-inline-replay-fallback",
+ cl::init(ReplayInlinerSettings::Fallback::Original),
+ cl::values(
+ clEnumValN(
+ ReplayInlinerSettings::Fallback::Original, "Original",
+ "All decisions not in replay send to original advisor (default)"),
+ clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline,
+ "AlwaysInline", "All decisions not in replay are inlined"),
+ clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline",
+ "All decisions not in replay are not inlined")),
+ cl::desc(
+ "How cgscc inline replay treats sites that don't come from the replay. "
+ "Original: defers to original advisor, AlwaysInline: inline all sites "
+ "not in replay, NeverInline: inline no sites not in replay"),
+ cl::Hidden);
+
+static cl::opt<CallSiteFormat::Format> CGSCCInlineReplayFormat(
+ "cgscc-inline-replay-format",
+ cl::init(CallSiteFormat::Format::LineColumnDiscriminator),
+ cl::values(
+ clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"),
+ clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn",
+ "<Line Number>:<Column Number>"),
+ clEnumValN(CallSiteFormat::Format::LineDiscriminator,
+ "LineDiscriminator", "<Line Number>.<Discriminator>"),
+ clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator,
+ "LineColumnDiscriminator",
+ "<Line Number>:<Column Number>.<Discriminator> (default)")),
+ cl::desc("How cgscc inline replay file is formatted"), cl::Hidden);
+
static cl::opt<bool> InlineEnablePriorityOrder(
"inline-enable-priority-order", cl::Hidden, cl::init(false),
cl::desc("Enable the priority inline order for the inliner"));
@@ -463,7 +509,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
}
++NumInlined;
- emitInlinedInto(ORE, DLoc, Block, *Callee, *Caller, *OIC);
+ emitInlinedIntoBasedOnCost(ORE, DLoc, Block, *Callee, *Caller, *OIC);
// If inlining this function gave us any new call sites, throw them
// onto our worklist to process. They are useful inline candidates.
@@ -661,9 +707,12 @@ InlinerPass::getAdvisor(const ModuleAnalysisManagerCGSCCProxy::Result &MAM,
std::make_unique<DefaultInlineAdvisor>(M, FAM, getInlineParams());
if (!CGSCCInlineReplayFile.empty())
- OwnedAdvisor = std::make_unique<ReplayInlineAdvisor>(
+ OwnedAdvisor = getReplayInlineAdvisor(
M, FAM, M.getContext(), std::move(OwnedAdvisor),
- CGSCCInlineReplayFile,
+ ReplayInlinerSettings{CGSCCInlineReplayFile,
+ CGSCCInlineReplayScope,
+ CGSCCInlineReplayFallback,
+ {CGSCCInlineReplayFormat}},
/*EmitRemarks=*/true);
return *OwnedAdvisor;
@@ -674,153 +723,6 @@ InlinerPass::getAdvisor(const ModuleAnalysisManagerCGSCCProxy::Result &MAM,
return *IAA->getAdvisor();
}
-template <typename T> class InlineOrder {
-public:
- using reference = T &;
- using const_reference = const T &;
-
- virtual ~InlineOrder() {}
-
- virtual size_t size() = 0;
-
- virtual void push(const T &Elt) = 0;
-
- virtual T pop() = 0;
-
- virtual const_reference front() = 0;
-
- virtual void erase_if(function_ref<bool(T)> Pred) = 0;
-
- bool empty() { return !size(); }
-};
-
-template <typename T, typename Container = SmallVector<T, 16>>
-class DefaultInlineOrder : public InlineOrder<T> {
- using reference = T &;
- using const_reference = const T &;
-
-public:
- size_t size() override { return Calls.size() - FirstIndex; }
-
- void push(const T &Elt) override { Calls.push_back(Elt); }
-
- T pop() override {
- assert(size() > 0);
- return Calls[FirstIndex++];
- }
-
- const_reference front() override {
- assert(size() > 0);
- return Calls[FirstIndex];
- }
-
- void erase_if(function_ref<bool(T)> Pred) override {
- Calls.erase(std::remove_if(Calls.begin() + FirstIndex, Calls.end(), Pred),
- Calls.end());
- }
-
-private:
- Container Calls;
- size_t FirstIndex = 0;
-};
-
-class Priority {
-public:
- Priority(int Size) : Size(Size) {}
-
- static bool isMoreDesirable(const Priority &S1, const Priority &S2) {
- return S1.Size < S2.Size;
- }
-
- static Priority evaluate(CallBase *CB) {
- Function *Callee = CB->getCalledFunction();
- return Priority(Callee->getInstructionCount());
- }
-
- int Size;
-};
-
-template <typename PriorityT>
-class PriorityInlineOrder : public InlineOrder<std::pair<CallBase *, int>> {
- using T = std::pair<CallBase *, int>;
- using HeapT = std::pair<CallBase *, PriorityT>;
- using reference = T &;
- using const_reference = const T &;
-
- static bool cmp(const HeapT &P1, const HeapT &P2) {
- return PriorityT::isMoreDesirable(P2.second, P1.second);
- }
-
- // A call site could become less desirable for inlining because of the size
- // growth from prior inlining into the callee. This method is used to lazily
- // update the desirability of a call site if it's decreasing. It is only
- // called on pop() or front(), not every time the desirability changes. When
- // the desirability of the front call site decreases, an updated one would be
- // pushed right back into the heap. For simplicity, those cases where
- // the desirability of a call site increases are ignored here.
- void adjust() {
- bool Changed = false;
- do {
- CallBase *CB = Heap.front().first;
- const PriorityT PreviousGoodness = Heap.front().second;
- const PriorityT CurrentGoodness = PriorityT::evaluate(CB);
- Changed = PriorityT::isMoreDesirable(PreviousGoodness, CurrentGoodness);
- if (Changed) {
- std::pop_heap(Heap.begin(), Heap.end(), cmp);
- Heap.pop_back();
- Heap.push_back({CB, CurrentGoodness});
- std::push_heap(Heap.begin(), Heap.end(), cmp);
- }
- } while (Changed);
- }
-
-public:
- size_t size() override { return Heap.size(); }
-
- void push(const T &Elt) override {
- CallBase *CB = Elt.first;
- const int InlineHistoryID = Elt.second;
- const PriorityT Goodness = PriorityT::evaluate(CB);
-
- Heap.push_back({CB, Goodness});
- std::push_heap(Heap.begin(), Heap.end(), cmp);
- InlineHistoryMap[CB] = InlineHistoryID;
- }
-
- T pop() override {
- assert(size() > 0);
- adjust();
-
- CallBase *CB = Heap.front().first;
- T Result = std::make_pair(CB, InlineHistoryMap[CB]);
- InlineHistoryMap.erase(CB);
- std::pop_heap(Heap.begin(), Heap.end(), cmp);
- Heap.pop_back();
- return Result;
- }
-
- const_reference front() override {
- assert(size() > 0);
- adjust();
-
- CallBase *CB = Heap.front().first;
- return *InlineHistoryMap.find(CB);
- }
-
- void erase_if(function_ref<bool(T)> Pred) override {
- auto PredWrapper = [=](HeapT P) -> bool {
- return Pred(std::make_pair(P.first, 0));
- };
- Heap.erase(std::remove_if(Heap.begin(), Heap.end(), PredWrapper),
- Heap.end());
- std::make_heap(Heap.begin(), Heap.end(), cmp);
- }
-
-private:
- SmallVector<HeapT, 16> Heap;
- DenseMap<CallBase *, int> InlineHistoryMap;
-};
-
PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
CGSCCAnalysisManager &AM, LazyCallGraph &CG,
CGSCCUpdateResult &UR) {
@@ -868,7 +770,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// incrementally making a single function grow in a super linear fashion.
std::unique_ptr<InlineOrder<std::pair<CallBase *, int>>> Calls;
if (InlineEnablePriorityOrder)
- Calls = std::make_unique<PriorityInlineOrder<Priority>>();
+ Calls = std::make_unique<PriorityInlineOrder<InlineSizePriority>>();
else
Calls = std::make_unique<DefaultInlineOrder<std::pair<CallBase *, int>>>();
assert(Calls != nullptr && "Expected an initialized InlineOrder");
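// Editor's sketch (not part of the patch): InlineSizePriority is assumed to
// keep the behaviour of the Priority class removed above, i.e. call sites
// whose callees have fewer instructions are popped first. A stand-alone
// analogue of that ordering:
#include <queue>
#include <vector>

struct CallSiteSketch {
  int CalleeSize; // stand-in for Callee->getInstructionCount()
  int Id;
};

int main() {
  auto Cmp = [](const CallSiteSketch &A, const CallSiteSketch &B) {
    return A.CalleeSize > B.CalleeSize; // smallest callee on top
  };
  std::priority_queue<CallSiteSketch, std::vector<CallSiteSketch>,
                      decltype(Cmp)>
      Worklist(Cmp);
  Worklist.push({120, 0});
  Worklist.push({15, 1});
  Worklist.push({60, 2});
  // Pops in the order Id 1 (size 15), Id 2 (size 60), Id 0 (size 120).
  return 0;
}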
@@ -972,8 +874,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
continue;
}
- auto Advice = Advisor.getAdvice(*CB, OnlyMandatory);
+ std::unique_ptr<InlineAdvice> Advice =
+ Advisor.getAdvice(*CB, OnlyMandatory);
+
// Check whether we want to inline this callsite.
+ if (!Advice)
+ continue;
+
if (!Advice->isInliningRecommended()) {
Advice->recordUnattemptedInlining();
continue;
@@ -1104,6 +1011,10 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
UR.InlinedInternalEdges.insert({&N, OldC});
}
InlinedCallees.clear();
+
+ // Invalidate analyses for this function now so that we don't have to
+ // invalidate analyses for all functions in this SCC later.
+ FAM.invalidate(F, PreservedAnalyses::none());
}
// Now that we've finished inlining all of the calls across this SCC, delete
@@ -1147,10 +1058,12 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
if (!Changed)
return PreservedAnalyses::all();
+ PreservedAnalyses PA;
// Even if we change the IR, we update the core CGSCC data structures and so
// can preserve the proxy to the function analysis manager.
- PreservedAnalyses PA;
PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
+ // We have already invalidated all analyses on modified functions.
+ PA.preserveSet<AllAnalysesOn<Function>>();
return PA;
}
@@ -1173,7 +1086,11 @@ ModuleInlinerWrapperPass::ModuleInlinerWrapperPass(InlineParams Params,
PreservedAnalyses ModuleInlinerWrapperPass::run(Module &M,
ModuleAnalysisManager &MAM) {
auto &IAA = MAM.getResult<InlineAdvisorAnalysis>(M);
- if (!IAA.tryCreate(Params, Mode, CGSCCInlineReplayFile)) {
+ if (!IAA.tryCreate(Params, Mode,
+ {CGSCCInlineReplayFile,
+ CGSCCInlineReplayScope,
+ CGSCCInlineReplayFallback,
+ {CGSCCInlineReplayFormat}})) {
M.getContext().emitError(
"Could not setup Inlining Advisor for the requested "
"mode and/or options");
@@ -1192,10 +1109,39 @@ PreservedAnalyses ModuleInlinerWrapperPass::run(Module &M,
else
MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
createDevirtSCCRepeatedPass(std::move(PM), MaxDevirtIterations)));
+
+ MPM.addPass(std::move(AfterCGMPM));
MPM.run(M, MAM);
- IAA.clear();
+ // Discard the InlineAdvisor, a subsequent inlining session should construct
+ // its own.
+ auto PA = PreservedAnalyses::all();
+ PA.abandon<InlineAdvisorAnalysis>();
+ return PA;
+}
- // The ModulePassManager has already taken care of invalidating analyses.
- return PreservedAnalyses::all();
+void InlinerPass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<InlinerPass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+ if (OnlyMandatory)
+ OS << "<only-mandatory>";
+}
+
+void ModuleInlinerWrapperPass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+  // Print some info about passes added to the wrapper. This is, however,
+  // incomplete, as the InlineAdvisorAnalysis part isn't included (it also
+  // depends on Params and Mode).
+ if (!MPM.isEmpty()) {
+ MPM.printPipeline(OS, MapClassName2PassName);
+ OS << ",";
+ }
+ OS << "cgscc(";
+ if (MaxDevirtIterations != 0)
+ OS << "devirt<" << MaxDevirtIterations << ">(";
+ PM.printPipeline(OS, MapClassName2PassName);
+ if (MaxDevirtIterations != 0)
+ OS << ")";
+ OS << ")";
}
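// Editor's note (not part of the patch): assuming the inliner pass prints as
// "inline" and nothing was added to the wrapper's late module pass manager, a
// wrapper built with MaxDevirtIterations = 4 prints its pipeline as
// "cgscc(devirt<4>(inline))", and as "cgscc(inline)" when MaxDevirtIterations
// is 0.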
diff --git a/llvm/lib/Transforms/IPO/Internalize.cpp b/llvm/lib/Transforms/IPO/Internalize.cpp
index db3b4384ce67..692e445cb7cb 100644
--- a/llvm/lib/Transforms/IPO/Internalize.cpp
+++ b/llvm/lib/Transforms/IPO/Internalize.cpp
@@ -201,21 +201,6 @@ bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) {
AlwaysPreserved.insert(V->getName());
}
- // Mark all functions not in the api as internal.
- IsWasm = Triple(M.getTargetTriple()).isOSBinFormatWasm();
- for (Function &I : M) {
- if (!maybeInternalize(I, ComdatMap))
- continue;
- Changed = true;
-
- if (ExternalNode)
- // Remove a callgraph edge from the external node to this function.
- ExternalNode->removeOneAbstractEdgeTo((*CG)[&I]);
-
- ++NumFunctions;
- LLVM_DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n");
- }
-
// Never internalize the llvm.used symbol. It is used to implement
// attribute((used)).
// FIXME: Shouldn't this just filter on llvm.metadata section??
@@ -237,6 +222,21 @@ bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) {
else
AlwaysPreserved.insert("__stack_chk_guard");
+ // Mark all functions not in the api as internal.
+ IsWasm = Triple(M.getTargetTriple()).isOSBinFormatWasm();
+ for (Function &I : M) {
+ if (!maybeInternalize(I, ComdatMap))
+ continue;
+ Changed = true;
+
+ if (ExternalNode)
+ // Remove a callgraph edge from the external node to this function.
+ ExternalNode->removeOneAbstractEdgeTo((*CG)[&I]);
+
+ ++NumFunctions;
+ LLVM_DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n");
+ }
+
// Mark all global variables with initializers that are not in the api as
// internal as well.
for (auto &GV : M.globals()) {
diff --git a/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/llvm/lib/Transforms/IPO/LoopExtractor.cpp
index a497c0390bce..d9a59dd35fde 100644
--- a/llvm/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/llvm/lib/Transforms/IPO/LoopExtractor.cpp
@@ -283,3 +283,13 @@ PreservedAnalyses LoopExtractorPass::run(Module &M, ModuleAnalysisManager &AM) {
PA.preserve<LoopAnalysis>();
return PA;
}
+
+void LoopExtractorPass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<LoopExtractorPass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+ OS << "<";
+ if (NumLoops == 1)
+ OS << "single";
+ OS << ">";
+}
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index b492b200c6d5..f78971f0e586 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -342,7 +342,8 @@ private:
struct ScopedSaveAliaseesAndUsed {
Module &M;
SmallVector<GlobalValue *, 4> Used, CompilerUsed;
- std::vector<std::pair<GlobalIndirectSymbol *, Function *>> FunctionAliases;
+ std::vector<std::pair<GlobalAlias *, Function *>> FunctionAliases;
+ std::vector<std::pair<GlobalIFunc *, Function *>> ResolverIFuncs;
ScopedSaveAliaseesAndUsed(Module &M) : M(M) {
// The users of this class want to replace all function references except
@@ -362,13 +363,16 @@ struct ScopedSaveAliaseesAndUsed {
if (GlobalVariable *GV = collectUsedGlobalVariables(M, CompilerUsed, true))
GV->eraseFromParent();
- for (auto &GIS : concat<GlobalIndirectSymbol>(M.aliases(), M.ifuncs())) {
+ for (auto &GA : M.aliases()) {
// FIXME: This should look past all aliases not just interposable ones,
// see discussion on D65118.
- if (auto *F =
- dyn_cast<Function>(GIS.getIndirectSymbol()->stripPointerCasts()))
- FunctionAliases.push_back({&GIS, F});
+ if (auto *F = dyn_cast<Function>(GA.getAliasee()->stripPointerCasts()))
+ FunctionAliases.push_back({&GA, F});
}
+
+ for (auto &GI : M.ifuncs())
+ if (auto *F = dyn_cast<Function>(GI.getResolver()->stripPointerCasts()))
+ ResolverIFuncs.push_back({&GI, F});
}
~ScopedSaveAliaseesAndUsed() {
@@ -376,8 +380,15 @@ struct ScopedSaveAliaseesAndUsed {
appendToCompilerUsed(M, CompilerUsed);
for (auto P : FunctionAliases)
- P.first->setIndirectSymbol(
+ P.first->setAliasee(
ConstantExpr::getBitCast(P.second, P.first->getType()));
+
+ for (auto P : ResolverIFuncs) {
+ // This does not preserve pointer casts that may have been stripped by the
+ // constructor, but the resolver's type is different from that of the
+ // ifunc anyway.
+ P.first->setResolver(P.second);
+ }
}
};
@@ -1550,17 +1561,28 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0),
ConstantInt::get(IntPtrTy, I)}),
F->getType());
- if (Functions[I]->isExported()) {
- if (IsJumpTableCanonical) {
- ExportSummary->cfiFunctionDefs().insert(std::string(F->getName()));
- } else {
- GlobalAlias *JtAlias = GlobalAlias::create(
- F->getValueType(), 0, GlobalValue::ExternalLinkage,
- F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M);
+
+ const bool IsExported = Functions[I]->isExported();
+ if (!IsJumpTableCanonical) {
+ GlobalValue::LinkageTypes LT = IsExported
+ ? GlobalValue::ExternalLinkage
+ : GlobalValue::InternalLinkage;
+ GlobalAlias *JtAlias = GlobalAlias::create(F->getValueType(), 0, LT,
+ F->getName() + ".cfi_jt",
+ CombinedGlobalElemPtr, &M);
+ if (IsExported)
JtAlias->setVisibility(GlobalValue::HiddenVisibility);
+ else
+ appendToUsed(M, {JtAlias});
+ }
+
+ if (IsExported) {
+ if (IsJumpTableCanonical)
+ ExportSummary->cfiFunctionDefs().insert(std::string(F->getName()));
+ else
ExportSummary->cfiFunctionDecls().insert(std::string(F->getName()));
- }
}
+
if (!IsJumpTableCanonical) {
if (F->hasExternalWeakLinkage())
replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr,
@@ -1751,11 +1773,7 @@ static bool isDirectCall(Use& U) {
void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New,
bool IsJumpTableCanonical) {
SmallSetVector<Constant *, 4> Constants;
- auto UI = Old->use_begin(), E = Old->use_end();
- for (; UI != E;) {
- Use &U = *UI;
- ++UI;
-
+ for (Use &U : llvm::make_early_inc_range(Old->uses())) {
// Skip block addresses
if (isa<BlockAddress>(U.getUser()))
continue;
@@ -1792,12 +1810,11 @@ bool LowerTypeTestsModule::lower() {
M.getFunction(Intrinsic::getName(Intrinsic::type_test));
if (DropTypeTests && TypeTestFunc) {
- for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
- UI != UE;) {
- auto *CI = cast<CallInst>((*UI++).getUser());
+ for (Use &U : llvm::make_early_inc_range(TypeTestFunc->uses())) {
+ auto *CI = cast<CallInst>(U.getUser());
// Find and erase llvm.assume intrinsics for this llvm.type.test call.
- for (auto CIU = CI->use_begin(), CIUE = CI->use_end(); CIU != CIUE;)
- if (auto *Assume = dyn_cast<AssumeInst>((*CIU++).getUser()))
+ for (Use &CIU : llvm::make_early_inc_range(CI->uses()))
+ if (auto *Assume = dyn_cast<AssumeInst>(CIU.getUser()))
Assume->eraseFromParent();
// If the assume was merged with another assume, we might have a use on a
// phi (which will feed the assume). Simply replace the use on the phi
@@ -1835,13 +1852,9 @@ bool LowerTypeTestsModule::lower() {
return false;
if (ImportSummary) {
- if (TypeTestFunc) {
- for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
- UI != UE;) {
- auto *CI = cast<CallInst>((*UI++).getUser());
- importTypeTest(CI);
- }
- }
+ if (TypeTestFunc)
+ for (Use &U : llvm::make_early_inc_range(TypeTestFunc->uses()))
+ importTypeTest(cast<CallInst>(U.getUser()));
if (ICallBranchFunnelFunc && !ICallBranchFunnelFunc->use_empty())
report_fatal_error(
@@ -2100,11 +2113,11 @@ bool LowerTypeTestsModule::lower() {
auto CI = cast<CallInst>(U.getUser());
std::vector<GlobalTypeMember *> Targets;
- if (CI->getNumArgOperands() % 2 != 1)
+ if (CI->arg_size() % 2 != 1)
report_fatal_error("number of arguments should be odd");
GlobalClassesTy::member_iterator CurSet;
- for (unsigned I = 1; I != CI->getNumArgOperands(); I += 2) {
+ for (unsigned I = 1; I != CI->arg_size(); I += 2) {
int64_t Offset;
auto *Base = dyn_cast<GlobalObject>(GetPointerBaseWithConstantOffset(
CI->getOperand(I), Offset, M.getDataLayout()));
diff --git a/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/llvm/lib/Transforms/IPO/MergeFunctions.cpp
index 9e6dd879ac01..97ef872c5499 100644
--- a/llvm/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/llvm/lib/Transforms/IPO/MergeFunctions.cpp
@@ -463,17 +463,15 @@ bool MergeFunctions::runOnModule(Module &M) {
// Replace direct callers of Old with New.
void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) {
Constant *BitcastNew = ConstantExpr::getBitCast(New, Old->getType());
- for (auto UI = Old->use_begin(), UE = Old->use_end(); UI != UE;) {
- Use *U = &*UI;
- ++UI;
- CallBase *CB = dyn_cast<CallBase>(U->getUser());
- if (CB && CB->isCallee(U)) {
+ for (Use &U : llvm::make_early_inc_range(Old->uses())) {
+ CallBase *CB = dyn_cast<CallBase>(U.getUser());
+ if (CB && CB->isCallee(&U)) {
// Do not copy attributes from the called function to the call-site.
// Function comparison ensures that the attributes are the same up to
// type congruences in byval(), in which case we need to keep the byval
// type of the call-site, not the callee function.
remove(CB->getFunction());
- U->set(BitcastNew);
+ U.set(BitcastNew);
}
}
}
diff --git a/llvm/lib/Transforms/IPO/ModuleInliner.cpp b/llvm/lib/Transforms/IPO/ModuleInliner.cpp
new file mode 100644
index 000000000000..ebf080e87c3b
--- /dev/null
+++ b/llvm/lib/Transforms/IPO/ModuleInliner.cpp
@@ -0,0 +1,354 @@
+//===- ModuleInliner.cpp - Code related to module inliner -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the mechanics required to implement inlining without
+// missing any calls at the module level. It doesn't need any information about
+// SCC or call graph, which is different from the SCC inliner. The decisions of
+// which calls are profitable to inline are implemented elsewhere.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ModuleInliner.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InlineAdvisor.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/InlineOrder.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <cassert>
+#include <functional>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "module-inline"
+
+STATISTIC(NumInlined, "Number of functions inlined");
+STATISTIC(NumDeleted, "Number of functions deleted because all callers found");
+
+static cl::opt<bool> InlineEnablePriorityOrder(
+ "module-inline-enable-priority-order", cl::Hidden, cl::init(true),
+ cl::desc("Enable the priority inline order for the module inliner"));
+
+/// Return true if the specified inline history ID
+/// indicates an inline history that includes the specified function.
+static bool inlineHistoryIncludes(
+ Function *F, int InlineHistoryID,
+ const SmallVectorImpl<std::pair<Function *, int>> &InlineHistory) {
+ while (InlineHistoryID != -1) {
+ assert(unsigned(InlineHistoryID) < InlineHistory.size() &&
+ "Invalid inline history ID");
+ if (InlineHistory[InlineHistoryID].first == F)
+ return true;
+ InlineHistoryID = InlineHistory[InlineHistoryID].second;
+ }
+ return false;
+}
+
+InlineAdvisor &ModuleInlinerPass::getAdvisor(const ModuleAnalysisManager &MAM,
+ FunctionAnalysisManager &FAM,
+ Module &M) {
+ if (OwnedAdvisor)
+ return *OwnedAdvisor;
+
+ auto *IAA = MAM.getCachedResult<InlineAdvisorAnalysis>(M);
+ if (!IAA) {
+ // It should still be possible to run the inliner as a stand-alone module
+ // pass, for test scenarios. In that case, we default to the
+ // DefaultInlineAdvisor, which doesn't need to keep state between module
+ // pass runs. It also uses just the default InlineParams. In this case, we
+ // need to use the provided FAM, which is valid for the duration of the
+ // inliner pass, and thus the lifetime of the owned advisor. The one we
+ // would get from the MAM can be invalidated as a result of the inliner's
+ // activity.
+ OwnedAdvisor = std::make_unique<DefaultInlineAdvisor>(M, FAM, Params);
+
+ return *OwnedAdvisor;
+ }
+ assert(IAA->getAdvisor() &&
+         "Expected a present InlineAdvisorAnalysis to also have an "
+ "InlineAdvisor initialized");
+ return *IAA->getAdvisor();
+}
+
+static bool isKnownLibFunction(Function &F, TargetLibraryInfo &TLI) {
+ LibFunc LF;
+
+ // Either this is a normal library function or a "vectorizable"
+ // function. Not using the VFDatabase here because this query
+ // is related only to libraries handled via the TLI.
+ return TLI.getLibFunc(F, LF) ||
+ TLI.isKnownVectorFunctionInLibrary(F.getName());
+}
+
+PreservedAnalyses ModuleInlinerPass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ LLVM_DEBUG(dbgs() << "---- Module Inliner is Running ---- \n");
+
+ auto &IAA = MAM.getResult<InlineAdvisorAnalysis>(M);
+ if (!IAA.tryCreate(Params, Mode, {})) {
+ M.getContext().emitError(
+ "Could not setup Inlining Advisor for the requested "
+ "mode and/or options");
+ return PreservedAnalyses::all();
+ }
+
+ bool Changed = false;
+
+ ProfileSummaryInfo *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(M);
+
+ FunctionAnalysisManager &FAM =
+ MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+ auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+
+ InlineAdvisor &Advisor = getAdvisor(MAM, FAM, M);
+ Advisor.onPassEntry();
+
+ auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(); });
+
+ // In the module inliner, a priority-based worklist is used for calls across
+ // the entire Module. With this module inliner, the inline order is not
+  // limited to bottom-up order. A more globally scoped inline order is
+  // enabled. Also, the inline deferral logic becomes unnecessary in this
+  // module inliner.
+ // It is possible to use other priority heuristics, e.g. profile-based
+ // heuristic.
+ //
+  // TODO: There is a huge amount of duplicate code between the module inliner
+  // and the SCC inliner, which needs refactoring.
+ std::unique_ptr<InlineOrder<std::pair<CallBase *, int>>> Calls;
+ if (InlineEnablePriorityOrder)
+ Calls = std::make_unique<PriorityInlineOrder<InlineSizePriority>>();
+ else
+ Calls = std::make_unique<DefaultInlineOrder<std::pair<CallBase *, int>>>();
+ assert(Calls != nullptr && "Expected an initialized InlineOrder");
+
+ // Populate the initial list of calls in this module.
+ for (Function &F : M) {
+ auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ // We want to generally process call sites top-down in order for
+ // simplifications stemming from replacing the call with the returned value
+ // after inlining to be visible to subsequent inlining decisions.
+ // FIXME: Using instructions sequence is a really bad way to do this.
+ // Instead we should do an actual RPO walk of the function body.
+ for (Instruction &I : instructions(F))
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (Function *Callee = CB->getCalledFunction()) {
+ if (!Callee->isDeclaration())
+ Calls->push({CB, -1});
+ else if (!isa<IntrinsicInst>(I)) {
+ using namespace ore;
+ setInlineRemark(*CB, "unavailable definition");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
+ << NV("Callee", Callee) << " will not be inlined into "
+ << NV("Caller", CB->getCaller())
+ << " because its definition is unavailable"
+ << setIsVerbose();
+ });
+ }
+ }
+ }
+ if (Calls->empty())
+ return PreservedAnalyses::all();
+
+ // When inlining a callee produces new call sites, we want to keep track of
+ // the fact that they were inlined from the callee. This allows us to avoid
+ // infinite inlining in some obscure cases. To represent this, we use an
+ // index into the InlineHistory vector.
+ SmallVector<std::pair<Function *, int>, 16> InlineHistory;
+
+ // Track a set vector of inlined callees so that we can augment the caller
+ // with all of their edges in the call graph before pruning out the ones that
+ // got simplified away.
+ SmallSetVector<Function *, 4> InlinedCallees;
+
+ // Track the dead functions to delete once finished with inlining calls. We
+ // defer deleting these to make it easier to handle the call graph updates.
+ SmallVector<Function *, 4> DeadFunctions;
+
+ // Loop forward over all of the calls.
+ while (!Calls->empty()) {
+ // We expect the calls to typically be batched with sequences of calls that
+ // have the same caller, so we first set up some shared infrastructure for
+ // this caller. We also do any pruning we can at this layer on the caller
+ // alone.
+ Function &F = *Calls->front().first->getCaller();
+
+ LLVM_DEBUG(dbgs() << "Inlining calls in: " << F.getName() << "\n"
+ << " Function size: " << F.getInstructionCount()
+ << "\n");
+
+ auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
+
+ // Now process as many calls as we have within this caller in the sequence.
+ // We bail out as soon as the caller has to change so we can
+ // prepare the context of that new caller.
+ bool DidInline = false;
+ while (!Calls->empty() && Calls->front().first->getCaller() == &F) {
+ auto P = Calls->pop();
+ CallBase *CB = P.first;
+ const int InlineHistoryID = P.second;
+ Function &Callee = *CB->getCalledFunction();
+
+ if (InlineHistoryID != -1 &&
+ inlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory)) {
+ setInlineRemark(*CB, "recursive");
+ continue;
+ }
+
+ auto Advice = Advisor.getAdvice(*CB, /*OnlyMandatory*/ false);
+ // Check whether we want to inline this callsite.
+ if (!Advice->isInliningRecommended()) {
+ Advice->recordUnattemptedInlining();
+ continue;
+ }
+
+ // Setup the data structure used to plumb customization into the
+ // `InlineFunction` routine.
+ InlineFunctionInfo IFI(
+ /*cg=*/nullptr, GetAssumptionCache, PSI,
+ &FAM.getResult<BlockFrequencyAnalysis>(*(CB->getCaller())),
+ &FAM.getResult<BlockFrequencyAnalysis>(Callee));
+
+ InlineResult IR =
+ InlineFunction(*CB, IFI, &FAM.getResult<AAManager>(*CB->getCaller()));
+ if (!IR.isSuccess()) {
+ Advice->recordUnsuccessfulInlining(IR);
+ continue;
+ }
+
+ DidInline = true;
+ InlinedCallees.insert(&Callee);
+ ++NumInlined;
+
+ LLVM_DEBUG(dbgs() << " Size after inlining: "
+ << F.getInstructionCount() << "\n");
+
+ // Add any new callsites to defined functions to the worklist.
+ if (!IFI.InlinedCallSites.empty()) {
+ int NewHistoryID = InlineHistory.size();
+ InlineHistory.push_back({&Callee, InlineHistoryID});
+
+ for (CallBase *ICB : reverse(IFI.InlinedCallSites)) {
+ Function *NewCallee = ICB->getCalledFunction();
+ if (!NewCallee) {
+ // Try to promote an indirect (virtual) call without waiting for
+ // the post-inline cleanup and the next DevirtSCCRepeatedPass
+ // iteration because the next iteration may not happen and we may
+ // miss inlining it.
+ if (tryPromoteCall(*ICB))
+ NewCallee = ICB->getCalledFunction();
+ }
+ if (NewCallee)
+ if (!NewCallee->isDeclaration())
+ Calls->push({ICB, NewHistoryID});
+ }
+ }
+
+ // Merge the attributes based on the inlining.
+ AttributeFuncs::mergeAttributesForInlining(F, Callee);
+
+ // For local functions, check whether this makes the callee trivially
+ // dead. In that case, we can drop the body of the function eagerly
+ // which may reduce the number of callers of other functions to one,
+ // changing inline cost thresholds.
+ bool CalleeWasDeleted = false;
+ if (Callee.hasLocalLinkage()) {
+ // To check this we also need to nuke any dead constant uses (perhaps
+ // made dead by this operation on other functions).
+ Callee.removeDeadConstantUsers();
+ // if (Callee.use_empty() && !CG.isLibFunction(Callee)) {
+ if (Callee.use_empty() && !isKnownLibFunction(Callee, GetTLI(Callee))) {
+ Calls->erase_if([&](const std::pair<CallBase *, int> &Call) {
+ return Call.first->getCaller() == &Callee;
+ });
+ // Clear the body and queue the function itself for deletion when we
+ // finish inlining.
+ // Note that after this point, it is an error to do anything other
+ // than use the callee's address or delete it.
+ Callee.dropAllReferences();
+ assert(!is_contained(DeadFunctions, &Callee) &&
+               "Cannot cause a function to become dead twice!");
+ DeadFunctions.push_back(&Callee);
+ CalleeWasDeleted = true;
+ }
+ }
+ if (CalleeWasDeleted)
+ Advice->recordInliningWithCalleeDeleted();
+ else
+ Advice->recordInlining();
+ }
+
+ if (!DidInline)
+ continue;
+ Changed = true;
+
+ InlinedCallees.clear();
+ }
+
+ // Now that we've finished inlining all of the calls across this module,
+ // delete all of the trivially dead functions.
+ //
+ // Note that this walks a pointer set which has non-deterministic order but
+ // that is OK as all we do is delete things and add pointers to unordered
+ // sets.
+ for (Function *DeadF : DeadFunctions) {
+ // Clear out any cached analyses.
+ FAM.clear(*DeadF, DeadF->getName());
+
+ // And delete the actual function from the module.
+ // The Advisor may use Function pointers to efficiently index various
+ // internal maps, e.g. for memoization. Function cleanup passes like
+ // argument promotion create new functions. It is possible for a new
+ // function to be allocated at the address of a deleted function. We could
+ // index using names, but that's inefficient. Alternatively, we let the
+ // Advisor free the functions when it sees fit.
+ DeadF->getBasicBlockList().clear();
+ M.getFunctionList().remove(DeadF);
+
+ ++NumDeleted;
+ }
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
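// Editor's illustration (not part of the patch): a stand-alone analogue of the
// inline-history chain walked by inlineHistoryIncludes above, using ints in
// place of Function pointers.
#include <cassert>
#include <utility>
#include <vector>

static bool historyIncludes(int Callee, int HistoryID,
                            const std::vector<std::pair<int, int>> &History) {
  while (HistoryID != -1) {
    if (History[HistoryID].first == Callee)
      return true;
    HistoryID = History[HistoryID].second; // walk to the parent entry
  }
  return false;
}

int main() {
  // Entry 0: callee 7 inlined at the top level; entry 1: callee 9 inlined from
  // the body produced by entry 0.
  std::vector<std::pair<int, int>> History = {{7, -1}, {9, 0}};
  assert(historyIncludes(7, 1, History));  // 7 is an ancestor of entry 1
  assert(!historyIncludes(3, 1, History)); // 3 never appears in the chain
  return 0;
}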
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index b80349352719..f342c35fa283 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
@@ -33,6 +34,8 @@
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/IPO.h"
@@ -41,6 +44,8 @@
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
+#include <algorithm>
+
using namespace llvm;
using namespace omp;
@@ -72,6 +77,46 @@ static cl::opt<bool> HideMemoryTransferLatency(
" transfers"),
cl::Hidden, cl::init(false));
+static cl::opt<bool> DisableOpenMPOptDeglobalization(
+ "openmp-opt-disable-deglobalization", cl::ZeroOrMore,
+ cl::desc("Disable OpenMP optimizations involving deglobalization."),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> DisableOpenMPOptSPMDization(
+ "openmp-opt-disable-spmdization", cl::ZeroOrMore,
+ cl::desc("Disable OpenMP optimizations involving SPMD-ization."),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> DisableOpenMPOptFolding(
+ "openmp-opt-disable-folding", cl::ZeroOrMore,
+ cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> DisableOpenMPOptStateMachineRewrite(
+ "openmp-opt-disable-state-machine-rewrite", cl::ZeroOrMore,
+ cl::desc("Disable OpenMP optimizations that replace the state machine."),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> PrintModuleAfterOptimizations(
+ "openmp-opt-print-module", cl::ZeroOrMore,
+ cl::desc("Print the current module after OpenMP optimizations."),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> AlwaysInlineDeviceFunctions(
+ "openmp-opt-inline-device", cl::ZeroOrMore,
+    cl::desc("Inline all applicable functions on the device."), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool>
+ EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::ZeroOrMore,
+ cl::desc("Enables more verbose remarks."), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<unsigned>
+ SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden,
+ cl::desc("Maximal number of attributor iterations."),
+ cl::init(256));
+
STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
"Number of OpenMP runtime calls deduplicated");
STATISTIC(NumOpenMPParallelRegionsDeleted,
@@ -328,7 +373,7 @@ struct OMPInformationCache : public InformationCache {
if (F->arg_size() != RTFArgTypes.size())
return false;
- auto RTFTyIt = RTFArgTypes.begin();
+ auto *RTFTyIt = RTFArgTypes.begin();
for (Argument &Arg : F->args()) {
if (Arg.getType() != *RTFTyIt)
return false;
@@ -503,7 +548,7 @@ struct KernelInfoState : AbstractState {
/// State to track if we are in SPMD-mode, assumed or known, and why we decided
/// we cannot be. If it is assumed, then RequiresFullRuntime should also be
/// false.
- BooleanStateWithPtrSetVector<Instruction> SPMDCompatibilityTracker;
+ BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;
/// The __kmpc_target_init call in this kernel, if any. If we find more than
/// one we abort as the kernel is malformed.
@@ -542,7 +587,9 @@ struct KernelInfoState : AbstractState {
/// See AbstractState::indicatePessimisticFixpoint(...)
ChangeStatus indicatePessimisticFixpoint() override {
IsAtFixpoint = true;
+ ReachingKernelEntries.indicatePessimisticFixpoint();
SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+ ReachedKnownParallelRegions.indicatePessimisticFixpoint();
ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
return ChangeStatus::CHANGED;
}
@@ -550,6 +597,10 @@ struct KernelInfoState : AbstractState {
/// See AbstractState::indicateOptimisticFixpoint(...)
ChangeStatus indicateOptimisticFixpoint() override {
IsAtFixpoint = true;
+ ReachingKernelEntries.indicateOptimisticFixpoint();
+ SPMDCompatibilityTracker.indicateOptimisticFixpoint();
+ ReachedKnownParallelRegions.indicateOptimisticFixpoint();
+ ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
return ChangeStatus::UNCHANGED;
}
@@ -569,6 +620,12 @@ struct KernelInfoState : AbstractState {
return true;
}
+ /// Returns true if this kernel contains any OpenMP parallel regions.
+ bool mayContainParallelRegion() {
+ return !ReachedKnownParallelRegions.empty() ||
+ !ReachedUnknownParallelRegions.empty();
+ }
+
/// Return empty set as the best state of potential values.
static KernelInfoState getBestState() { return KernelInfoState(true); }
@@ -584,12 +641,14 @@ struct KernelInfoState : AbstractState {
// Do not merge two different _init and _deinit call sites.
if (KIS.KernelInitCB) {
if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
- indicatePessimisticFixpoint();
+ llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
+ "assumptions.");
KernelInitCB = KIS.KernelInitCB;
}
if (KIS.KernelDeinitCB) {
if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
- indicatePessimisticFixpoint();
+ llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
+ "assumptions.");
KernelDeinitCB = KIS.KernelDeinitCB;
}
SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
@@ -1032,8 +1091,8 @@ private:
Args.clear();
Args.push_back(OutlinedFn->getArg(0));
Args.push_back(OutlinedFn->getArg(1));
- for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
- U < E; ++U)
+ for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
+ ++U)
Args.push_back(CI->getArgOperand(U));
CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI);
@@ -1041,9 +1100,9 @@ private:
NewCI->setDebugLoc(CI->getDebugLoc());
// Forward parameter attributes from the callback to the callee.
- for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
- U < E; ++U)
- for (const Attribute &A : CI->getAttributes().getParamAttributes(U))
+ for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
+ ++U)
+ for (const Attribute &A : CI->getAttributes().getParamAttrs(U))
NewCI->addParamAttr(
U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
@@ -1563,13 +1622,13 @@ private:
// TODO: Use dominance to find a good position instead.
auto CanBeMoved = [this](CallBase &CB) {
- unsigned NumArgs = CB.getNumArgOperands();
+ unsigned NumArgs = CB.arg_size();
if (NumArgs == 0)
return true;
if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
return false;
- for (unsigned u = 1; u < NumArgs; ++u)
- if (isa<Instruction>(CB.getArgOperand(u)))
+ for (unsigned U = 1; U < NumArgs; ++U)
+ if (isa<Instruction>(CB.getArgOperand(U)))
return false;
return true;
};
@@ -1612,7 +1671,7 @@ private:
// valid at the new location. For now we just pick a global one, either
// existing and used by one of the calls, or created from scratch.
if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
- if (CI->getNumArgOperands() > 0 &&
+ if (!CI->arg_empty() &&
CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
/* GlobalOnly */ true);
@@ -1695,8 +1754,8 @@ private:
// Transitively search for more arguments by looking at the users of the
// ones we know already. During the search the GTIdArgs vector is extended
// so we cannot cache the size nor can we use a range based for.
- for (unsigned u = 0; u < GTIdArgs.size(); ++u)
- AddUserArgs(*GTIdArgs[u]);
+ for (unsigned U = 0; U < GTIdArgs.size(); ++U)
+ AddUserArgs(*GTIdArgs[U]);
}
/// Kernel (=GPU) optimizations and utility functions
@@ -1822,6 +1881,10 @@ private:
OMPRTL___kmpc_kernel_end_parallel);
ExternalizationRAII BarrierSPMD(OMPInfoCache,
OMPRTL___kmpc_barrier_simple_spmd);
+ ExternalizationRAII BarrierGeneric(OMPInfoCache,
+ OMPRTL___kmpc_barrier_simple_generic);
+ ExternalizationRAII ThreadId(OMPInfoCache,
+ OMPRTL___kmpc_get_hardware_thread_id_in_block);
registerAAs(IsModulePass);
@@ -1918,6 +1981,10 @@ bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
if (!KernelParallelRFI)
return Changed;
+ // If we have disabled state machine changes, exit
+ if (DisableOpenMPOptStateMachineRewrite)
+ return Changed;
+
for (Function *F : SCC) {
// Check if the function is a use in a __kmpc_parallel_51 call at
@@ -1996,7 +2063,8 @@ bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
UndefValue::get(Int8Ty), F->getName() + ".ID");
for (Use *U : ToBeReplacedStateMachineUses)
- U->set(ConstantExpr::getBitCast(ID, U->get()->getType()));
+ U->set(ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+ ID, U->get()->getType()));
++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
@@ -2508,9 +2576,8 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
- // Check if the edge into the successor block compares the __kmpc_target_init
- // result with -1. If we are in non-SPMD-mode that signals only the main
- // thread will execute the edge.
+ // Check if the edge into the successor block contains a condition that only
+ // lets the main thread execute it.
auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) {
if (!Edge || !Edge->isConditional())
return false;
@@ -2525,16 +2592,27 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
if (!C)
return false;
- // Match: -1 == __kmpc_target_init (for non-SPMD kernels only!)
+ // Match: -1 == __kmpc_target_init (for non-SPMD kernels only!)
if (C->isAllOnesValue()) {
auto *CB = dyn_cast<CallBase>(Cmp->getOperand(0));
CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;
if (!CB)
return false;
- const int InitIsSPMDArgNo = 1;
- auto *IsSPMDModeCI =
- dyn_cast<ConstantInt>(CB->getOperand(InitIsSPMDArgNo));
- return IsSPMDModeCI && IsSPMDModeCI->isZero();
+ const int InitModeArgNo = 1;
+ auto *ModeCI = dyn_cast<ConstantInt>(CB->getOperand(InitModeArgNo));
+ return ModeCI && (ModeCI->getSExtValue() & OMP_TGT_EXEC_MODE_GENERIC);
+ }
+
+ if (C->isZero()) {
+ // Match: 0 == llvm.nvvm.read.ptx.sreg.tid.x()
+ if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0)))
+ if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)
+ return true;
+
+ // Match: 0 == llvm.amdgcn.workitem.id.x()
+ if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0)))
+ if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)
+ return true;
}
return false;
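    // Editor's note (not part of the patch): in IR, the thread-id guard
    // recognized above looks roughly like this hand-written example:
    //   %tid  = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
    //   %only = icmp eq i32 %tid, 0
    //   br i1 %only, label %initial.thread, label %worker.threads
    // The generic-mode variant instead compares the result of the
    // __kmpc_target_init call against -1.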
@@ -2543,15 +2621,14 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
// Merge all the predecessor states into the current basic block. A basic
// block is executed by a single thread if all of its predecessors are.
auto MergePredecessorStates = [&](BasicBlock *BB) {
- if (pred_begin(BB) == pred_end(BB))
+ if (pred_empty(BB))
return SingleThreadedBBs.contains(BB);
bool IsInitialThread = true;
- for (auto PredBB = pred_begin(BB), PredEndBB = pred_end(BB);
- PredBB != PredEndBB; ++PredBB) {
- if (!IsInitialThreadOnly(dyn_cast<BranchInst>((*PredBB)->getTerminator()),
+ for (BasicBlock *PredBB : predecessors(BB)) {
+ if (!IsInitialThreadOnly(dyn_cast<BranchInst>(PredBB->getTerminator()),
BB))
- IsInitialThread &= SingleThreadedBBs.contains(*PredBB);
+ IsInitialThread &= SingleThreadedBBs.contains(PredBB);
}
return IsInitialThread;
@@ -2683,9 +2760,8 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
ConstantInt *AllocSize = dyn_cast<ConstantInt>(CB->getArgOperand(0));
- LLVM_DEBUG(dbgs() << TAG << "Replace globalization call in "
- << CB->getCaller()->getName() << " with "
- << AllocSize->getZExtValue()
+ LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB
+ << " with " << AllocSize->getZExtValue()
<< " bytes of shared memory\n");
// Create a new shared memory buffer of the same size as the allocation
@@ -2734,7 +2810,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
const auto &ED = A.getAAFor<AAExecutionDomain>(
*this, IRPosition::function(*F), DepClassTy::REQUIRED);
if (CallBase *CB = dyn_cast<CallBase>(U))
- if (!dyn_cast<ConstantInt>(CB->getArgOperand(0)) ||
+ if (!isa<ConstantInt>(CB->getArgOperand(0)) ||
!ED.isExecutedByInitialThreadOnly(*CB))
MallocCalls.erase(CB);
}
@@ -2769,9 +2845,17 @@ struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
std::string(SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]"
: "") +
std::string(" #PRs: ") +
- std::to_string(ReachedKnownParallelRegions.size()) +
+ (ReachedKnownParallelRegions.isValidState()
+ ? std::to_string(ReachedKnownParallelRegions.size())
+ : "<invalid>") +
", #Unknown PRs: " +
- std::to_string(ReachedUnknownParallelRegions.size());
+ (ReachedUnknownParallelRegions.isValidState()
+ ? std::to_string(ReachedUnknownParallelRegions.size())
+ : "<invalid>") +
+ ", #Reaching Kernels: " +
+ (ReachingKernelEntries.isValidState()
+ ? std::to_string(ReachingKernelEntries.size())
+ : "<invalid>");
}
/// Create an abstract attribute view for the position \p IRP.
@@ -2797,6 +2881,12 @@ struct AAKernelInfoFunction : AAKernelInfo {
AAKernelInfoFunction(const IRPosition &IRP, Attributor &A)
: AAKernelInfo(IRP, A) {}
+ SmallPtrSet<Instruction *, 4> GuardedInstructions;
+
+ SmallPtrSetImpl<Instruction *> &getGuardedInstructions() {
+ return GuardedInstructions;
+ }
+
/// See AbstractAttribute::initialize(...).
void initialize(Attributor &A) override {
// This is a high-level transform that might change the constant arguments
@@ -2843,8 +2933,11 @@ struct AAKernelInfoFunction : AAKernelInfo {
},
Fn);
- assert((KernelInitCB && KernelDeinitCB) &&
- "Kernel without __kmpc_target_init or __kmpc_target_deinit!");
+ // Ignore kernels without initializers such as global constructors.
+ if (!KernelInitCB || !KernelDeinitCB) {
+ indicateOptimisticFixpoint();
+ return;
+ }
// For kernels we might need to initialize/finalize the IsSPMD state and
// we need to register a simplification callback so that the Attributor
@@ -2859,7 +2952,10 @@ struct AAKernelInfoFunction : AAKernelInfo {
// state. As long as we are not in an invalid state, we will create a
// custom state machine so the value should be a `i1 false`. If we are
// in an invalid state, we won't change the value that is in the IR.
- if (!isValidState())
+ if (!ReachedKnownParallelRegions.isValidState())
+ return nullptr;
+ // If we have disabled state machine rewrites, don't make a custom one.
+ if (DisableOpenMPOptStateMachineRewrite)
return nullptr;
if (AA)
A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
@@ -2869,7 +2965,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
return FalseVal;
};
- Attributor::SimplifictionCallbackTy IsSPMDModeSimplifyCB =
+ Attributor::SimplifictionCallbackTy ModeSimplifyCB =
[&](const IRPosition &IRP, const AbstractAttribute *AA,
bool &UsedAssumedInformation) -> Optional<Value *> {
// IRP represents the "SPMDCompatibilityTracker" argument of an
@@ -2885,8 +2981,10 @@ struct AAKernelInfoFunction : AAKernelInfo {
} else {
UsedAssumedInformation = false;
}
- auto *Val = ConstantInt::getBool(IRP.getAnchorValue().getContext(),
- SPMDCompatibilityTracker.isAssumed());
+ auto *Val = ConstantInt::getSigned(
+ IntegerType::getInt8Ty(IRP.getAnchorValue().getContext()),
+ SPMDCompatibilityTracker.isAssumed() ? OMP_TGT_EXEC_MODE_SPMD
+ : OMP_TGT_EXEC_MODE_GENERIC);
return Val;
};
@@ -2911,8 +3009,8 @@ struct AAKernelInfoFunction : AAKernelInfo {
return Val;
};
- constexpr const int InitIsSPMDArgNo = 1;
- constexpr const int DeinitIsSPMDArgNo = 1;
+ constexpr const int InitModeArgNo = 1;
+ constexpr const int DeinitModeArgNo = 1;
constexpr const int InitUseStateMachineArgNo = 2;
constexpr const int InitRequiresFullRuntimeArgNo = 3;
constexpr const int DeinitRequiresFullRuntimeArgNo = 2;
@@ -2920,11 +3018,11 @@ struct AAKernelInfoFunction : AAKernelInfo {
IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo),
StateMachineSimplifyCB);
A.registerSimplificationCallback(
- IRPosition::callsite_argument(*KernelInitCB, InitIsSPMDArgNo),
- IsSPMDModeSimplifyCB);
+ IRPosition::callsite_argument(*KernelInitCB, InitModeArgNo),
+ ModeSimplifyCB);
A.registerSimplificationCallback(
- IRPosition::callsite_argument(*KernelDeinitCB, DeinitIsSPMDArgNo),
- IsSPMDModeSimplifyCB);
+ IRPosition::callsite_argument(*KernelDeinitCB, DeinitModeArgNo),
+ ModeSimplifyCB);
A.registerSimplificationCallback(
IRPosition::callsite_argument(*KernelInitCB,
InitRequiresFullRuntimeArgNo),
@@ -2935,10 +3033,25 @@ struct AAKernelInfoFunction : AAKernelInfo {
IsGenericModeSimplifyCB);
// Check if we know we are in SPMD-mode already.
- ConstantInt *IsSPMDArg =
- dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitIsSPMDArgNo));
- if (IsSPMDArg && !IsSPMDArg->isZero())
+ ConstantInt *ModeArg =
+ dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo));
+ if (ModeArg && (ModeArg->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD))
SPMDCompatibilityTracker.indicateOptimisticFixpoint();
+ // This is a generic region but SPMDization is disabled so stop tracking.
+ else if (DisableOpenMPOptSPMDization)
+ SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+ }
+
+ /// Sanitize the string \p S such that it is a suitable global symbol name.
+ static std::string sanitizeForGlobalName(std::string S) {
+ std::replace_if(
+ S.begin(), S.end(),
+ [](const char C) {
+ return !((C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||
+ (C >= '0' && C <= '9') || C == '_');
+ },
+ '.');
+ return S;
}
/// Modify the IR based on the KernelInfoState as the fixpoint iteration is
@@ -2949,19 +3062,16 @@ struct AAKernelInfoFunction : AAKernelInfo {
if (!KernelInitCB || !KernelDeinitCB)
return ChangeStatus::UNCHANGED;
- // Known SPMD-mode kernels need no manifest changes.
- if (SPMDCompatibilityTracker.isKnown())
- return ChangeStatus::UNCHANGED;
-
// If we can we change the execution mode to SPMD-mode otherwise we build a
// custom state machine.
- if (!changeToSPMDMode(A))
- buildCustomStateMachine(A);
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+ if (!changeToSPMDMode(A, Changed))
+ return buildCustomStateMachine(A);
- return ChangeStatus::CHANGED;
+ return Changed;
}
- bool changeToSPMDMode(Attributor &A) {
+ bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) {
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
if (!SPMDCompatibilityTracker.isAssumed()) {
@@ -2993,38 +3103,259 @@ struct AAKernelInfoFunction : AAKernelInfo {
return false;
}
- // Adjust the global exec mode flag that tells the runtime what mode this
- // kernel is executed in.
+ // Check if the kernel is already in SPMD mode; if so, return success.
Function *Kernel = getAnchorScope();
GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
(Kernel->getName() + "_exec_mode").str());
assert(ExecMode && "Kernel without exec mode?");
- assert(ExecMode->getInitializer() &&
- ExecMode->getInitializer()->isOneValue() &&
- "Initially non-SPMD kernel has SPMD exec mode!");
+ assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!");
// Set the global exec mode flag to indicate SPMD-Generic mode.
- constexpr int SPMDGeneric = 2;
- if (!ExecMode->getInitializer()->isZeroValue())
- ExecMode->setInitializer(
- ConstantInt::get(ExecMode->getInitializer()->getType(), SPMDGeneric));
+ assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
+ "ExecMode is not an integer!");
+ const int8_t ExecModeVal =
+ cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
+ if (ExecModeVal != OMP_TGT_EXEC_MODE_GENERIC)
+ return true;
+
+ // We will now unconditionally modify the IR, indicate a change.
+ Changed = ChangeStatus::CHANGED;
+
+ auto CreateGuardedRegion = [&](Instruction *RegionStartI,
+ Instruction *RegionEndI) {
+ LoopInfo *LI = nullptr;
+ DominatorTree *DT = nullptr;
+ MemorySSAUpdater *MSU = nullptr;
+ using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+
+ BasicBlock *ParentBB = RegionStartI->getParent();
+ Function *Fn = ParentBB->getParent();
+ Module &M = *Fn->getParent();
+
+ // Create all the blocks and logic.
+ // ParentBB:
+ // goto RegionCheckTidBB
+ // RegionCheckTidBB:
+ // Tid = __kmpc_hardware_thread_id()
+ // if (Tid != 0)
+ // goto RegionBarrierBB
+ // RegionStartBB:
+ // <execute instructions guarded>
+ // goto RegionEndBB
+ // RegionEndBB:
+ // <store escaping values to shared mem>
+ // goto RegionBarrierBB
+ // RegionBarrierBB:
+ // __kmpc_simple_barrier_spmd()
+ // // second barrier is omitted if there are no escaping values.
+ // <load escaping values from shared mem>
+ // __kmpc_simple_barrier_spmd()
+ // goto RegionExitBB
+ // RegionExitBB:
+ // <execute rest of instructions>
+
+ BasicBlock *RegionEndBB = SplitBlock(ParentBB, RegionEndI->getNextNode(),
+ DT, LI, MSU, "region.guarded.end");
+ BasicBlock *RegionBarrierBB =
+ SplitBlock(RegionEndBB, &*RegionEndBB->getFirstInsertionPt(), DT, LI,
+ MSU, "region.barrier");
+ BasicBlock *RegionExitBB =
+ SplitBlock(RegionBarrierBB, &*RegionBarrierBB->getFirstInsertionPt(),
+ DT, LI, MSU, "region.exit");
+ BasicBlock *RegionStartBB =
+ SplitBlock(ParentBB, RegionStartI, DT, LI, MSU, "region.guarded");
+
+ assert(ParentBB->getUniqueSuccessor() == RegionStartBB &&
+ "Expected a different CFG");
+
+ BasicBlock *RegionCheckTidBB = SplitBlock(
+ ParentBB, ParentBB->getTerminator(), DT, LI, MSU, "region.check.tid");
+
+ // Register basic blocks with the Attributor.
+ A.registerManifestAddedBasicBlock(*RegionEndBB);
+ A.registerManifestAddedBasicBlock(*RegionBarrierBB);
+ A.registerManifestAddedBasicBlock(*RegionExitBB);
+ A.registerManifestAddedBasicBlock(*RegionStartBB);
+ A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
+
+ bool HasBroadcastValues = false;
+ // Find escaping outputs from the guarded region to outside users and
+ // broadcast their values to them.
+ for (Instruction &I : *RegionStartBB) {
+ SmallPtrSet<Instruction *, 4> OutsideUsers;
+ for (User *Usr : I.users()) {
+ Instruction &UsrI = *cast<Instruction>(Usr);
+ if (UsrI.getParent() != RegionStartBB)
+ OutsideUsers.insert(&UsrI);
+ }
+
+ if (OutsideUsers.empty())
+ continue;
+
+ HasBroadcastValues = true;
+
+ // Emit a global variable in shared memory to store the broadcasted
+ // value.
+ auto *SharedMem = new GlobalVariable(
+ M, I.getType(), /* IsConstant */ false,
+ GlobalValue::InternalLinkage, UndefValue::get(I.getType()),
+ sanitizeForGlobalName(
+ (I.getName() + ".guarded.output.alloc").str()),
+ nullptr, GlobalValue::NotThreadLocal,
+ static_cast<unsigned>(AddressSpace::Shared));
+
+ // Emit a store instruction to update the value.
+ new StoreInst(&I, SharedMem, RegionEndBB->getTerminator());
+
+ LoadInst *LoadI = new LoadInst(I.getType(), SharedMem,
+ I.getName() + ".guarded.output.load",
+ RegionBarrierBB->getTerminator());
+
+ // Emit a load instruction and replace uses of the output value.
+ for (Instruction *UsrI : OutsideUsers)
+ UsrI->replaceUsesOfWith(&I, LoadI);
+ }
+
+ auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+
+ // Go to tid check BB in ParentBB.
+ const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
+ ParentBB->getTerminator()->eraseFromParent();
+ OpenMPIRBuilder::LocationDescription Loc(
+ InsertPointTy(ParentBB, ParentBB->end()), DL);
+ OMPInfoCache.OMPBuilder.updateToLocation(Loc);
+ auto *SrcLocStr = OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc);
+ Value *Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr);
+ BranchInst::Create(RegionCheckTidBB, ParentBB)->setDebugLoc(DL);
+
+ // Add check for Tid in RegionCheckTidBB
+ RegionCheckTidBB->getTerminator()->eraseFromParent();
+ OpenMPIRBuilder::LocationDescription LocRegionCheckTid(
+ InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->end()), DL);
+ OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid);
+ FunctionCallee HardwareTidFn =
+ OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
+ M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
+ Value *Tid =
+ OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});
+ Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);
+ OMPInfoCache.OMPBuilder.Builder
+ .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
+ ->setDebugLoc(DL);
+
+ // First barrier for synchronization; it ensures the main thread has
+ // updated the values.
+ FunctionCallee BarrierFn =
+ OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
+ M, OMPRTL___kmpc_barrier_simple_spmd);
+ OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(
+ RegionBarrierBB, RegionBarrierBB->getFirstInsertionPt()));
+ OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid})
+ ->setDebugLoc(DL);
+
+ // Second barrier ensures workers have read broadcast values.
+ if (HasBroadcastValues)
+ CallInst::Create(BarrierFn, {Ident, Tid}, "",
+ RegionBarrierBB->getTerminator())
+ ->setDebugLoc(DL);
+ };
+
+ auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
+ SmallPtrSet<BasicBlock *, 8> Visited;
+ for (Instruction *GuardedI : SPMDCompatibilityTracker) {
+ BasicBlock *BB = GuardedI->getParent();
+ if (!Visited.insert(BB).second)
+ continue;
+
+ SmallVector<std::pair<Instruction *, Instruction *>> Reorders;
+ Instruction *LastEffect = nullptr;
+ BasicBlock::reverse_iterator IP = BB->rbegin(), IPEnd = BB->rend();
+ while (++IP != IPEnd) {
+ if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())
+ continue;
+ Instruction *I = &*IP;
+ if (OpenMPOpt::getCallIfRegularCall(*I, &AllocSharedRFI))
+ continue;
+ if (!I->user_empty() || !SPMDCompatibilityTracker.contains(I)) {
+ LastEffect = nullptr;
+ continue;
+ }
+ if (LastEffect)
+ Reorders.push_back({I, LastEffect});
+ LastEffect = &*IP;
+ }
+ for (auto &Reorder : Reorders)
+ Reorder.first->moveBefore(Reorder.second);
+ }
+
+ SmallVector<std::pair<Instruction *, Instruction *>, 4> GuardedRegions;
+
+ for (Instruction *GuardedI : SPMDCompatibilityTracker) {
+ BasicBlock *BB = GuardedI->getParent();
+ auto *CalleeAA = A.lookupAAFor<AAKernelInfo>(
+ IRPosition::function(*GuardedI->getFunction()), nullptr,
+ DepClassTy::NONE);
+ assert(CalleeAA != nullptr && "Expected Callee AAKernelInfo");
+ auto &CalleeAAFunction = *cast<AAKernelInfoFunction>(CalleeAA);
+ // Continue if instruction is already guarded.
+ if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
+ continue;
+
+ Instruction *GuardedRegionStart = nullptr, *GuardedRegionEnd = nullptr;
+ for (Instruction &I : *BB) {
+ // If instruction I needs to be guarded, update the guarded region
+ // bounds.
+ if (SPMDCompatibilityTracker.contains(&I)) {
+ CalleeAAFunction.getGuardedInstructions().insert(&I);
+ if (GuardedRegionStart)
+ GuardedRegionEnd = &I;
+ else
+ GuardedRegionStart = GuardedRegionEnd = &I;
+
+ continue;
+ }
+
+ // Instruction I does not need guarding; store
+ // any region found and reset bounds.
+ if (GuardedRegionStart) {
+ GuardedRegions.push_back(
+ std::make_pair(GuardedRegionStart, GuardedRegionEnd));
+ GuardedRegionStart = nullptr;
+ GuardedRegionEnd = nullptr;
+ }
+ }
+ }
+
+ for (auto &GR : GuardedRegions)
+ CreateGuardedRegion(GR.first, GR.second);
+
+ // Adjust the global exec mode flag that tells the runtime what mode this
+ // kernel is executed in.
+ assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC &&
+ "Initially non-SPMD kernel has SPMD exec mode!");
+ ExecMode->setInitializer(
+ ConstantInt::get(ExecMode->getInitializer()->getType(),
+ ExecModeVal | OMP_TGT_EXEC_MODE_GENERIC_SPMD));
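  // Editor's note (not part of the patch): this assumes the exec-mode flags
  // from llvm/Frontend/OpenMP are single bits that can be OR'ed, roughly:
  //   OMP_TGT_EXEC_MODE_GENERIC      = 1 << 0
  //   OMP_TGT_EXEC_MODE_SPMD         = 1 << 1
  //   OMP_TGT_EXEC_MODE_GENERIC_SPMD = GENERIC | SPMD
  // so OR-ing GENERIC_SPMD into the original GENERIC value both records that
  // the kernel was compiled as generic and tells the runtime it may now be
  // launched in SPMD fashion.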
// Next rewrite the init and deinit calls to indicate we use SPMD-mode now.
- const int InitIsSPMDArgNo = 1;
- const int DeinitIsSPMDArgNo = 1;
+ const int InitModeArgNo = 1;
+ const int DeinitModeArgNo = 1;
const int InitUseStateMachineArgNo = 2;
const int InitRequiresFullRuntimeArgNo = 3;
const int DeinitRequiresFullRuntimeArgNo = 2;
auto &Ctx = getAnchorValue().getContext();
- A.changeUseAfterManifest(KernelInitCB->getArgOperandUse(InitIsSPMDArgNo),
- *ConstantInt::getBool(Ctx, 1));
+ A.changeUseAfterManifest(
+ KernelInitCB->getArgOperandUse(InitModeArgNo),
+ *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
+ OMP_TGT_EXEC_MODE_SPMD));
A.changeUseAfterManifest(
KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo),
*ConstantInt::getBool(Ctx, 0));
A.changeUseAfterManifest(
- KernelDeinitCB->getArgOperandUse(DeinitIsSPMDArgNo),
- *ConstantInt::getBool(Ctx, 1));
+ KernelDeinitCB->getArgOperandUse(DeinitModeArgNo),
+ *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
+ OMP_TGT_EXEC_MODE_SPMD));
A.changeUseAfterManifest(
KernelInitCB->getArgOperandUse(InitRequiresFullRuntimeArgNo),
*ConstantInt::getBool(Ctx, 0));
@@ -3042,10 +3373,15 @@ struct AAKernelInfoFunction : AAKernelInfo {
};
ChangeStatus buildCustomStateMachine(Attributor &A) {
- assert(ReachedKnownParallelRegions.isValidState() &&
- "Custom state machine with invalid parallel region states?");
+ // If we have disabled state machine rewrites, don't make a custom one.
+ if (DisableOpenMPOptStateMachineRewrite)
+ return ChangeStatus::UNCHANGED;
- const int InitIsSPMDArgNo = 1;
+ // Don't rewrite the state machine if we are not in a valid state.
+ if (!ReachedKnownParallelRegions.isValidState())
+ return ChangeStatus::UNCHANGED;
+
+ const int InitModeArgNo = 1;
const int InitUseStateMachineArgNo = 2;
// Check if the current configuration is non-SPMD and generic state machine.
@@ -3054,14 +3390,14 @@ struct AAKernelInfoFunction : AAKernelInfo {
// we give up.
ConstantInt *UseStateMachine = dyn_cast<ConstantInt>(
KernelInitCB->getArgOperand(InitUseStateMachineArgNo));
- ConstantInt *IsSPMD =
- dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitIsSPMDArgNo));
+ ConstantInt *Mode =
+ dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo));
// If we are stuck with generic mode, try to create a custom device (=GPU)
// state machine which is specialized for the parallel regions that are
// reachable by the kernel.
- if (!UseStateMachine || UseStateMachine->isZero() || !IsSPMD ||
- !IsSPMD->isZero())
+ if (!UseStateMachine || UseStateMachine->isZero() || !Mode ||
+ (Mode->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD))
return ChangeStatus::UNCHANGED;
// If not SPMD mode, indicate we use a custom state machine now.
@@ -3074,8 +3410,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
// happen if there simply are no parallel regions. In the resulting kernel
// all worker threads will simply exit right away, leaving the main thread
// to do the work alone.
- if (ReachedKnownParallelRegions.empty() &&
- ReachedUnknownParallelRegions.empty()) {
+ if (!mayContainParallelRegion()) {
++NumOpenMPTargetRegionKernelsWithoutStateMachine;
auto Remark = [&](OptimizationRemark OR) {
@@ -3121,9 +3456,14 @@ struct AAKernelInfoFunction : AAKernelInfo {
// Create all the blocks:
//
// InitCB = __kmpc_target_init(...)
- // bool IsWorker = InitCB >= 0;
+ // BlockHwSize =
+ // __kmpc_get_hardware_num_threads_in_block();
+ // WarpSize = __kmpc_get_warp_size();
+ // BlockSize = BlockHwSize - WarpSize;
+ // if (InitCB >= BlockSize) return;
+ // IsWorkerCheckBB: bool IsWorker = InitCB >= 0;
// if (IsWorker) {
- // SMBeginBB: __kmpc_barrier_simple_spmd(...);
+ // SMBeginBB: __kmpc_barrier_simple_generic(...);
// void *WorkFn;
// bool Active = __kmpc_kernel_parallel(&WorkFn);
// if (!WorkFn) return;
@@ -3137,7 +3477,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
// ((WorkFnTy*)WorkFn)(...);
// SMEndParallelBB: __kmpc_kernel_end_parallel(...);
// }
- // SMDoneBB: __kmpc_barrier_simple_spmd(...);
+ // SMDoneBB: __kmpc_barrier_simple_generic(...);
// goto SMBeginBB;
// }
// UserCodeEntryBB: // user code
@@ -3149,6 +3489,8 @@ struct AAKernelInfoFunction : AAKernelInfo {
BasicBlock *InitBB = KernelInitCB->getParent();
BasicBlock *UserCodeEntryBB = InitBB->splitBasicBlock(
KernelInitCB->getNextNode(), "thread.user_code.check");
+ BasicBlock *IsWorkerCheckBB =
+ BasicBlock::Create(Ctx, "is_worker_check", Kernel, UserCodeEntryBB);
BasicBlock *StateMachineBeginBB = BasicBlock::Create(
Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB);
BasicBlock *StateMachineFinishedBB = BasicBlock::Create(
@@ -3165,6 +3507,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB);
A.registerManifestAddedBasicBlock(*InitBB);
A.registerManifestAddedBasicBlock(*UserCodeEntryBB);
+ A.registerManifestAddedBasicBlock(*IsWorkerCheckBB);
A.registerManifestAddedBasicBlock(*StateMachineBeginBB);
A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);
A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);
@@ -3174,22 +3517,47 @@ struct AAKernelInfoFunction : AAKernelInfo {
const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc);
-
InitBB->getTerminator()->eraseFromParent();
+
+ Module &M = *Kernel->getParent();
+ auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+ FunctionCallee BlockHwSizeFn =
+ OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
+ M, OMPRTL___kmpc_get_hardware_num_threads_in_block);
+ FunctionCallee WarpSizeFn =
+ OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
+ M, OMPRTL___kmpc_get_warp_size);
+ Instruction *BlockHwSize =
+ CallInst::Create(BlockHwSizeFn, "block.hw_size", InitBB);
+ BlockHwSize->setDebugLoc(DLoc);
+ Instruction *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB);
+ WarpSize->setDebugLoc(DLoc);
+ Instruction *BlockSize =
+ BinaryOperator::CreateSub(BlockHwSize, WarpSize, "block.size", InitBB);
+ BlockSize->setDebugLoc(DLoc);
+ Instruction *IsMainOrWorker =
+ ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_SLT, KernelInitCB,
+ BlockSize, "thread.is_main_or_worker", InitBB);
+ IsMainOrWorker->setDebugLoc(DLoc);
+ BranchInst::Create(IsWorkerCheckBB, StateMachineFinishedBB, IsMainOrWorker,
+ InitBB);
+
Instruction *IsWorker =
ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB,
ConstantInt::get(KernelInitCB->getType(), -1),
- "thread.is_worker", InitBB);
+ "thread.is_worker", IsWorkerCheckBB);
IsWorker->setDebugLoc(DLoc);
- BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, InitBB);
+ BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker,
+ IsWorkerCheckBB);
// Create local storage for the work function pointer.
+ const DataLayout &DL = M.getDataLayout();
Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
- AllocaInst *WorkFnAI = new AllocaInst(VoidPtrTy, 0, "worker.work_fn.addr",
- &Kernel->getEntryBlock().front());
+ Instruction *WorkFnAI =
+ new AllocaInst(VoidPtrTy, DL.getAllocaAddrSpace(), nullptr,
+ "worker.work_fn.addr", &Kernel->getEntryBlock().front());
WorkFnAI->setDebugLoc(DLoc);
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
OMPInfoCache.OMPBuilder.updateToLocation(
OpenMPIRBuilder::LocationDescription(
IRBuilder<>::InsertPoint(StateMachineBeginBB,
@@ -3199,13 +3567,23 @@ struct AAKernelInfoFunction : AAKernelInfo {
Value *Ident = KernelInitCB->getArgOperand(0);
Value *GTid = KernelInitCB;
- Module &M = *Kernel->getParent();
FunctionCallee BarrierFn =
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
- M, OMPRTL___kmpc_barrier_simple_spmd);
+ M, OMPRTL___kmpc_barrier_simple_generic);
CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB)
->setDebugLoc(DLoc);
+ if (WorkFnAI->getType()->getPointerAddressSpace() !=
+ (unsigned int)AddressSpace::Generic) {
+ WorkFnAI = new AddrSpaceCastInst(
+ WorkFnAI,
+ PointerType::getWithSamePointeeType(
+ cast<PointerType>(WorkFnAI->getType()),
+ (unsigned int)AddressSpace::Generic),
+ WorkFnAI->getName() + ".generic", StateMachineBeginBB);
+ WorkFnAI->setDebugLoc(DLoc);
+ }
+
FunctionCallee KernelParallelFn =
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
M, OMPRTL___kmpc_kernel_parallel);
@@ -3243,8 +3621,8 @@ struct AAKernelInfoFunction : AAKernelInfo {
// Now that we have most of the CFG skeleton it is time for the if-cascade
// that checks the function pointer we got from the runtime against the
// parallel regions we expect, if there are any.
- for (int i = 0, e = ReachedKnownParallelRegions.size(); i < e; ++i) {
- auto *ParallelRegion = ReachedKnownParallelRegions[i];
+ for (int I = 0, E = ReachedKnownParallelRegions.size(); I < E; ++I) {
+ auto *ParallelRegion = ReachedKnownParallelRegions[I];
BasicBlock *PRExecuteBB = BasicBlock::Create(
Ctx, "worker_state_machine.parallel_region.execute", Kernel,
StateMachineEndParallelBB);
@@ -3260,7 +3638,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
// Check if we need to compare the pointer at all or if we can just
// call the parallel region function.
Value *IsPR;
- if (i + 1 < e || !ReachedUnknownParallelRegions.empty()) {
+ if (I + 1 < E || !ReachedUnknownParallelRegions.empty()) {
Instruction *CmpI = ICmpInst::Create(
ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFnCast, ParallelRegion,
"worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
@@ -3324,8 +3702,21 @@ struct AAKernelInfoFunction : AAKernelInfo {
if (llvm::all_of(Objects,
[](const Value *Obj) { return isa<AllocaInst>(Obj); }))
return true;
+ // Check for AAHeapToStack moved objects which must not be guarded.
+ auto &HS = A.getAAFor<AAHeapToStack>(
+ *this, IRPosition::function(*I.getFunction()),
+ DepClassTy::OPTIONAL);
+ if (llvm::all_of(Objects, [&HS](const Value *Obj) {
+ auto *CB = dyn_cast<CallBase>(Obj);
+ if (!CB)
+ return false;
+ return HS.isAssumedHeapToStack(*CB);
+ })) {
+ return true;
+ }
}
- // For now we give up on everything but stores.
+
+ // Insert instruction that needs guarding.
SPMDCompatibilityTracker.insert(&I);
return true;
};
@@ -3339,9 +3730,13 @@ struct AAKernelInfoFunction : AAKernelInfo {
if (!IsKernelEntry) {
updateReachingKernelEntries(A);
updateParallelLevels(A);
+
+ if (!ParallelLevels.isValidState())
+ SPMDCompatibilityTracker.indicatePessimisticFixpoint();
}
// Callback to check a call instruction.
+ bool AllParallelRegionStatesWereFixed = true;
bool AllSPMDStatesWereFixed = true;
auto CheckCallInst = [&](Instruction &I) {
auto &CB = cast<CallBase>(I);
@@ -3349,13 +3744,37 @@ struct AAKernelInfoFunction : AAKernelInfo {
*this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
getState() ^= CBAA.getState();
AllSPMDStatesWereFixed &= CBAA.SPMDCompatibilityTracker.isAtFixpoint();
+ AllParallelRegionStatesWereFixed &=
+ CBAA.ReachedKnownParallelRegions.isAtFixpoint();
+ AllParallelRegionStatesWereFixed &=
+ CBAA.ReachedUnknownParallelRegions.isAtFixpoint();
return true;
};
bool UsedAssumedInformationInCheckCallInst = false;
if (!A.checkForAllCallLikeInstructions(
- CheckCallInst, *this, UsedAssumedInformationInCheckCallInst))
+ CheckCallInst, *this, UsedAssumedInformationInCheckCallInst)) {
+ LLVM_DEBUG(dbgs() << TAG
+ << "Failed to visit all call-like instructions!\n";);
return indicatePessimisticFixpoint();
+ }
+
+ // If we haven't used any assumed information for the reached parallel
+ // region states we can fix it.
+ if (!UsedAssumedInformationInCheckCallInst &&
+ AllParallelRegionStatesWereFixed) {
+ ReachedKnownParallelRegions.indicateOptimisticFixpoint();
+ ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
+ }
+
+ // If we are sure there are no parallel regions in the kernel we do not
+ // want SPMD mode.
+ if (IsKernelEntry && ReachedUnknownParallelRegions.isAtFixpoint() &&
+ ReachedKnownParallelRegions.isAtFixpoint() &&
+ ReachedUnknownParallelRegions.isValidState() &&
+ ReachedKnownParallelRegions.isValidState() &&
+ !mayContainParallelRegion())
+ SPMDCompatibilityTracker.indicatePessimisticFixpoint();
// If we haven't used any assumed information for the SPMD state we can fix
// it.
@@ -3454,14 +3873,14 @@ struct AAKernelInfoCallSite : AAKernelInfo {
CallBase &CB = cast<CallBase>(getAssociatedValue());
Function *Callee = getAssociatedFunction();
- // Helper to lookup an assumption string.
- auto HasAssumption = [](Function *Fn, StringRef AssumptionStr) {
- return Fn && hasAssumption(*Fn, AssumptionStr);
- };
+ auto &AssumptionAA = A.getAAFor<AAAssumptionInfo>(
+ *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
// Check for SPMD-mode assumptions.
- if (HasAssumption(Callee, "ompx_spmd_amenable"))
+ if (AssumptionAA.hasAssumption("ompx_spmd_amenable")) {
SPMDCompatibilityTracker.indicateOptimisticFixpoint();
+ indicateOptimisticFixpoint();
+ }
// First weed out calls we do not care about, that is readonly/readnone
// calls, intrinsics, and "no_openmp" calls. None of these can reach a
@@ -3483,14 +3902,16 @@ struct AAKernelInfoCallSite : AAKernelInfo {
// Unknown callees might contain parallel regions, except if they have
// an appropriate assumption attached.
- if (!(HasAssumption(Callee, "omp_no_openmp") ||
- HasAssumption(Callee, "omp_no_parallelism")))
+ if (!(AssumptionAA.hasAssumption("omp_no_openmp") ||
+ AssumptionAA.hasAssumption("omp_no_parallelism")))
ReachedUnknownParallelRegions.insert(&CB);
// If SPMDCompatibilityTracker is not fixed, we need to give up on the
// idea we can run something unknown in SPMD-mode.
- if (!SPMDCompatibilityTracker.isAtFixpoint())
+ if (!SPMDCompatibilityTracker.isAtFixpoint()) {
+ SPMDCompatibilityTracker.indicatePessimisticFixpoint();
SPMDCompatibilityTracker.insert(&CB);
+ }
// We have updated the state for this unknown call properly, there won't
// be any change so we indicate a fixpoint.
@@ -3506,6 +3927,7 @@ struct AAKernelInfoCallSite : AAKernelInfo {
switch (RF) {
// All the functions we know are compatible with SPMD mode.
case OMPRTL___kmpc_is_spmd_exec_mode:
+ case OMPRTL___kmpc_distribute_static_fini:
case OMPRTL___kmpc_for_static_fini:
case OMPRTL___kmpc_global_thread_num:
case OMPRTL___kmpc_get_hardware_num_threads_in_block:
@@ -3516,6 +3938,10 @@ struct AAKernelInfoCallSite : AAKernelInfo {
case OMPRTL___kmpc_end_master:
case OMPRTL___kmpc_barrier:
break;
+ case OMPRTL___kmpc_distribute_static_init_4:
+ case OMPRTL___kmpc_distribute_static_init_4u:
+ case OMPRTL___kmpc_distribute_static_init_8:
+ case OMPRTL___kmpc_distribute_static_init_8u:
case OMPRTL___kmpc_for_static_init_4:
case OMPRTL___kmpc_for_static_init_4u:
case OMPRTL___kmpc_for_static_init_8:
@@ -3533,6 +3959,7 @@ struct AAKernelInfoCallSite : AAKernelInfo {
case OMPScheduleType::DistributeChunked:
break;
default:
+ SPMDCompatibilityTracker.indicatePessimisticFixpoint();
SPMDCompatibilityTracker.insert(&CB);
break;
};
@@ -3565,7 +3992,7 @@ struct AAKernelInfoCallSite : AAKernelInfo {
return;
default:
// Unknown OpenMP runtime calls cannot be executed in SPMD-mode,
- // generally.
+ // generally. However, they do not hide parallel regions.
SPMDCompatibilityTracker.insert(&CB);
break;
}
@@ -3685,6 +4112,9 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
}
void initialize(Attributor &A) override {
+ if (DisableOpenMPOptFolding)
+ indicatePessimisticFixpoint();
+
Function *Callee = getAssociatedFunction();
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
@@ -3741,11 +4171,24 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
ChangeStatus Changed = ChangeStatus::UNCHANGED;
if (SimplifiedValue.hasValue() && SimplifiedValue.getValue()) {
- Instruction &CB = *getCtxI();
- A.changeValueAfterManifest(CB, **SimplifiedValue);
- A.deleteAfterManifest(CB);
+ Instruction &I = *getCtxI();
+ A.changeValueAfterManifest(I, **SimplifiedValue);
+ A.deleteAfterManifest(I);
+
+ CallBase *CB = dyn_cast<CallBase>(&I);
+ auto Remark = [&](OptimizationRemark OR) {
+ if (auto *C = dyn_cast<ConstantInt>(*SimplifiedValue))
+ return OR << "Replacing OpenMP runtime call "
+ << CB->getCalledFunction()->getName() << " with "
+ << ore::NV("FoldedValue", C->getZExtValue()) << ".";
+ return OR << "Replacing OpenMP runtime call "
+ << CB->getCalledFunction()->getName() << ".";
+ };
- LLVM_DEBUG(dbgs() << TAG << "Folding runtime call: " << CB << " with "
+ if (CB && EnableVerboseRemarks)
+ A.emitRemark<OptimizationRemark>(CB, "OMP180", Remark);
+
+ LLVM_DEBUG(dbgs() << TAG << "Replacing runtime call: " << I << " with "
<< **SimplifiedValue << "\n");
Changed = ChangeStatus::CHANGED;
@@ -3979,7 +4422,6 @@ void OpenMPOpt::registerAAs(bool IsModulePass) {
DepClassTy::NONE, /* ForceUpdate */ false,
/* UpdateAfterInit */ false);
-
registerFoldRuntimeCall(OMPRTL___kmpc_is_generic_main_thread_id);
registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
@@ -4012,7 +4454,8 @@ void OpenMPOpt::registerAAs(bool IsModulePass) {
A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
return false;
};
- GlobalizationRFI.foreachUse(SCC, CreateAA);
+ if (!DisableOpenMPOptDeglobalization)
+ GlobalizationRFI.foreachUse(SCC, CreateAA);
// Create an ExecutionDomain AA for every function and a HeapToStack AA for
// every function if there is a device kernel.
@@ -4024,7 +4467,8 @@ void OpenMPOpt::registerAAs(bool IsModulePass) {
continue;
A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
- A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F));
+ if (!DisableOpenMPOptDeglobalization)
+ A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F));
for (auto &I : instructions(*F)) {
if (auto *LI = dyn_cast<LoadInst>(&I)) {
@@ -4176,28 +4620,32 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
ORE.emit([&]() {
OptimizationRemarkAnalysis ORA(DEBUG_TYPE, "OMP140", &F);
return ORA << "Could not internalize function. "
- << "Some optimizations may not be possible.";
+ << "Some optimizations may not be possible. [OMP140]";
});
};
// Create internal copies of each function if this is a kernel Module. This
// allows interprocedural passes to see every call edge.
- DenseSet<const Function *> InternalizedFuncs;
- if (isOpenMPDevice(M))
+ DenseMap<Function *, Function *> InternalizedMap;
+ if (isOpenMPDevice(M)) {
+ SmallPtrSet<Function *, 16> InternalizeFns;
for (Function &F : M)
if (!F.isDeclaration() && !Kernels.contains(&F) && IsCalled(F) &&
!DisableInternalization) {
- if (Attributor::internalizeFunction(F, /* Force */ true)) {
- InternalizedFuncs.insert(&F);
+ if (Attributor::isInternalizable(F)) {
+ InternalizeFns.insert(&F);
} else if (!F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::Cold)) {
EmitRemark(F);
}
}
+ Attributor::internalizeFunctions(InternalizeFns, InternalizedMap);
+ }
+
// Look at every function in the Module unless it was internalized.
SmallVector<Function *, 16> SCC;
for (Function &F : M)
- if (!F.isDeclaration() && !InternalizedFuncs.contains(&F))
+ if (!F.isDeclaration() && !InternalizedMap.lookup(&F))
SCC.push_back(&F);
if (SCC.empty())
@@ -4215,12 +4663,24 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
SetVector<Function *> Functions(SCC.begin(), SCC.end());
OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions, Kernels);
- unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 128 : 32;
+ unsigned MaxFixpointIterations =
+ (isOpenMPDevice(M)) ? SetFixpointIterations : 32;
Attributor A(Functions, InfoCache, CGUpdater, nullptr, true, false,
MaxFixpointIterations, OREGetter, DEBUG_TYPE);
OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
bool Changed = OMPOpt.run(true);
+
+ // Optionally inline device functions for potentially better performance.
+ if (AlwaysInlineDeviceFunctions && isOpenMPDevice(M))
+ for (Function &F : M)
+ if (!F.isDeclaration() && !Kernels.contains(&F) &&
+ !F.hasFnAttribute(Attribute::NoInline))
+ F.addFnAttr(Attribute::AlwaysInline);
+
+ if (PrintModuleAfterOptimizations)
+ LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt Module Pass:\n" << M);
+
if (Changed)
return PreservedAnalyses::none();
@@ -4267,12 +4727,17 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
/*CGSCC*/ Functions, Kernels);
- unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 128 : 32;
+ unsigned MaxFixpointIterations =
+ (isOpenMPDevice(M)) ? SetFixpointIterations : 32;
Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true,
MaxFixpointIterations, OREGetter, DEBUG_TYPE);
OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
bool Changed = OMPOpt.run(false);
+
+ if (PrintModuleAfterOptimizations)
+ LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M);
+
if (Changed)
return PreservedAnalyses::none();
@@ -4333,12 +4798,18 @@ struct OpenMPOptCGSCCLegacyPass : public CallGraphSCCPass {
Allocator,
/*CGSCC*/ Functions, Kernels);
- unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 128 : 32;
+ unsigned MaxFixpointIterations =
+ (isOpenMPDevice(M)) ? SetFixpointIterations : 32;
Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true,
MaxFixpointIterations, OREGetter, DEBUG_TYPE);
OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
- return OMPOpt.run(false);
+ bool Result = OMPOpt.run(false);
+
+ if (PrintModuleAfterOptimizations)
+ LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M);
+
+ return Result;
}
bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp
index d517de38ace3..7402e399a88a 100644
--- a/llvm/lib/Transforms/IPO/PartialInlining.cpp
+++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -441,9 +441,7 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(
};
auto BBProfileCount = [BFI](BasicBlock *BB) {
- return BFI->getBlockProfileCount(BB)
- ? BFI->getBlockProfileCount(BB).getValue()
- : 0;
+ return BFI->getBlockProfileCount(BB).getValueOr(0);
};
// Use the same computeBBInlineCost function to compute the cost savings of
@@ -1413,7 +1411,7 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
uint64_t CalleeEntryCountV =
- (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);
+ (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);
bool AnyInline = false;
for (User *User : Users) {
@@ -1461,8 +1459,8 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
if (AnyInline) {
Cloner.IsFunctionInlined = true;
if (CalleeEntryCount)
- Cloner.OrigFunc->setEntryCount(
- CalleeEntryCount.setCount(CalleeEntryCountV));
+ Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
+ CalleeEntryCountV, CalleeEntryCount->getType()));
OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
OrigFuncORE.emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index aa916345954d..74f68531b89a 100644
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -437,6 +437,11 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
MPM.add(createReassociatePass()); // Reassociate expressions
+ // The matrix extension can introduce large vector operations early, which can
+ // benefit from running vector-combine early on.
+ if (EnableMatrix)
+ MPM.add(createVectorCombinePass());
+
// Begin the loop pass pipeline.
if (EnableSimpleLoopUnswitch) {
// The simple loop unswitch pass relies on separate cleanup passes. Schedule
@@ -1012,7 +1017,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
createPGOIndirectCallPromotionLegacyPass(true, !PGOSampleUse.empty()));
// Propagate constant function arguments by specializing the functions.
- if (EnableFunctionSpecialization)
+ if (EnableFunctionSpecialization && OptLevel > 2)
PM.add(createFunctionSpecializationPass());
// Propagate constants at call sites into the functions they call. This
diff --git a/llvm/lib/Transforms/IPO/SCCP.cpp b/llvm/lib/Transforms/IPO/SCCP.cpp
index 081398a390fa..5779553ee732 100644
--- a/llvm/lib/Transforms/IPO/SCCP.cpp
+++ b/llvm/lib/Transforms/IPO/SCCP.cpp
@@ -135,6 +135,7 @@ PreservedAnalyses FunctionSpecializationPass::run(Module &M,
return PA;
}
+namespace {
struct FunctionSpecializationLegacyPass : public ModulePass {
static char ID; // Pass identification, replacement for typeid
FunctionSpecializationLegacyPass() : ModulePass(ID) {}
@@ -175,6 +176,7 @@ struct FunctionSpecializationLegacyPass : public ModulePass {
return runFunctionSpecialization(M, DL, GetTLI, GetTTI, GetAC, GetAnalysis);
}
};
+} // namespace
char FunctionSpecializationLegacyPass::ID = 0;
diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
index 55b88ac14da5..bae9a1e27e75 100644
--- a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
+++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
@@ -32,7 +32,7 @@ ContextTrieNode *ContextTrieNode::getChildContext(const LineLocation &CallSite,
if (CalleeName.empty())
return getHottestChildContext(CallSite);
- uint32_t Hash = nodeHash(CalleeName, CallSite);
+ uint64_t Hash = nodeHash(CalleeName, CallSite);
auto It = AllChildContext.find(Hash);
if (It != AllChildContext.end())
return &It->second;
@@ -64,8 +64,8 @@ ContextTrieNode::getHottestChildContext(const LineLocation &CallSite) {
ContextTrieNode &ContextTrieNode::moveToChildContext(
const LineLocation &CallSite, ContextTrieNode &&NodeToMove,
- StringRef ContextStrToRemove, bool DeleteNode) {
- uint32_t Hash = nodeHash(NodeToMove.getFuncName(), CallSite);
+ uint32_t ContextFramesToRemove, bool DeleteNode) {
+ uint64_t Hash = nodeHash(NodeToMove.getFuncName(), CallSite);
assert(!AllChildContext.count(Hash) && "Node to remove must exist");
LineLocation OldCallSite = NodeToMove.CallSiteLoc;
ContextTrieNode &OldParentContext = *NodeToMove.getParentContext();
@@ -86,10 +86,10 @@ ContextTrieNode &ContextTrieNode::moveToChildContext(
FunctionSamples *FSamples = Node->getFunctionSamples();
if (FSamples) {
- FSamples->getContext().promoteOnPath(ContextStrToRemove);
+ FSamples->getContext().promoteOnPath(ContextFramesToRemove);
FSamples->getContext().setState(SyntheticContext);
- LLVM_DEBUG(dbgs() << " Context promoted to: " << FSamples->getContext()
- << "\n");
+ LLVM_DEBUG(dbgs() << " Context promoted to: "
+ << FSamples->getContext().toString() << "\n");
}
for (auto &It : Node->getAllChildContext()) {
@@ -108,12 +108,12 @@ ContextTrieNode &ContextTrieNode::moveToChildContext(
void ContextTrieNode::removeChildContext(const LineLocation &CallSite,
StringRef CalleeName) {
- uint32_t Hash = nodeHash(CalleeName, CallSite);
+ uint64_t Hash = nodeHash(CalleeName, CallSite);
// Note this essentially calls dtor and destroys that child context
AllChildContext.erase(Hash);
}
-std::map<uint32_t, ContextTrieNode> &ContextTrieNode::getAllChildContext() {
+std::map<uint64_t, ContextTrieNode> &ContextTrieNode::getAllChildContext() {
return AllChildContext;
}
@@ -127,6 +127,15 @@ void ContextTrieNode::setFunctionSamples(FunctionSamples *FSamples) {
FuncSamples = FSamples;
}
+Optional<uint32_t> ContextTrieNode::getFunctionSize() const { return FuncSize; }
+
+void ContextTrieNode::addFunctionSize(uint32_t FSize) {
+ if (!FuncSize.hasValue())
+ FuncSize = 0;
+
+ FuncSize = FuncSize.getValue() + FSize;
+}
+
LineLocation ContextTrieNode::getCallSiteLoc() const { return CallSiteLoc; }
ContextTrieNode *ContextTrieNode::getParentContext() const {
@@ -137,9 +146,10 @@ void ContextTrieNode::setParentContext(ContextTrieNode *Parent) {
ParentContext = Parent;
}
-void ContextTrieNode::dump() {
+void ContextTrieNode::dumpNode() {
dbgs() << "Node: " << FuncName << "\n"
<< " Callsite: " << CallSiteLoc << "\n"
+ << " Size: " << FuncSize << "\n"
<< " Children:\n";
for (auto &It : AllChildContext) {
@@ -147,20 +157,38 @@ void ContextTrieNode::dump() {
}
}
-uint32_t ContextTrieNode::nodeHash(StringRef ChildName,
+void ContextTrieNode::dumpTree() {
+ dbgs() << "Context Profile Tree:\n";
+ std::queue<ContextTrieNode *> NodeQueue;
+ NodeQueue.push(this);
+
+ while (!NodeQueue.empty()) {
+ ContextTrieNode *Node = NodeQueue.front();
+ NodeQueue.pop();
+ Node->dumpNode();
+
+ for (auto &It : Node->getAllChildContext()) {
+ ContextTrieNode *ChildNode = &It.second;
+ NodeQueue.push(ChildNode);
+ }
+ }
+}
+
+uint64_t ContextTrieNode::nodeHash(StringRef ChildName,
const LineLocation &Callsite) {
// We still use the child's name for the child hash; this is
// because for children of root node, we don't have
// different line/discriminator, and we'll rely on name
// to differentiate children.
- uint32_t NameHash = std::hash<std::string>{}(ChildName.str());
- uint32_t LocId = (Callsite.LineOffset << 16) | Callsite.Discriminator;
+ uint64_t NameHash = std::hash<std::string>{}(ChildName.str());
+ uint64_t LocId =
+ (((uint64_t)Callsite.LineOffset) << 32) | Callsite.Discriminator;
return NameHash + (LocId << 5) + LocId;
}
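// Editor's sketch (not part of the patch): widening the hash to 64 bits keeps
// the line offset and the discriminator in separate halves. Under the old
// 32-bit packing (LineOffset << 16) | Discriminator, two hypothetical call
// sites such as
//   LineLocation A(/*LineOffset=*/0, /*Discriminator=*/0x10000);
//   LineLocation B(/*LineOffset=*/1, /*Discriminator=*/0);
// produced the same LocId (0x10000); with (LineOffset << 32) | Discriminator
// they remain distinct.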
ContextTrieNode *ContextTrieNode::getOrCreateChildContext(
const LineLocation &CallSite, StringRef CalleeName, bool AllowCreate) {
- uint32_t Hash = nodeHash(CalleeName, CallSite);
+ uint64_t Hash = nodeHash(CalleeName, CallSite);
auto It = AllChildContext.find(Hash);
if (It != AllChildContext.end()) {
assert(It->second.getFuncName() == CalleeName &&
@@ -177,13 +205,16 @@ ContextTrieNode *ContextTrieNode::getOrCreateChildContext(
// Profile tracker that manages profiles and their associated contexts
SampleContextTracker::SampleContextTracker(
- StringMap<FunctionSamples> &Profiles) {
+ SampleProfileMap &Profiles,
+ const DenseMap<uint64_t, StringRef> *GUIDToFuncNameMap)
+ : GUIDToFuncNameMap(GUIDToFuncNameMap) {
for (auto &FuncSample : Profiles) {
FunctionSamples *FSamples = &FuncSample.second;
- SampleContext Context(FuncSample.first(), RawContext);
- LLVM_DEBUG(dbgs() << "Tracking Context for function: " << Context << "\n");
+ SampleContext Context = FuncSample.first;
+ LLVM_DEBUG(dbgs() << "Tracking Context for function: " << Context.toString()
+ << "\n");
if (!Context.isBaseContext())
- FuncToCtxtProfiles[Context.getNameWithoutContext()].push_back(FSamples);
+ FuncToCtxtProfiles[Context.getName()].insert(FSamples);
ContextTrieNode *NewNode = getOrCreateContextPath(Context, true);
assert(!NewNode->getFunctionSamples() &&
"New node can't have sample profile");
@@ -200,6 +231,10 @@ SampleContextTracker::getCalleeContextSamplesFor(const CallBase &Inst,
return nullptr;
CalleeName = FunctionSamples::getCanonicalFnName(CalleeName);
+ // Convert real function names to MD5 names, if the input profile is
+ // MD5-based.
+ std::string FGUID;
+ CalleeName = getRepInFormat(CalleeName, FunctionSamples::UseMD5, FGUID);
// For indirect call, CalleeName will be empty, in which case the context
// profile for callee with largest total samples will be returned.
@@ -207,7 +242,8 @@ SampleContextTracker::getCalleeContextSamplesFor(const CallBase &Inst,
if (CalleeContext) {
FunctionSamples *FSamples = CalleeContext->getFunctionSamples();
LLVM_DEBUG(if (FSamples) {
- dbgs() << " Callee context found: " << FSamples->getContext() << "\n";
+ dbgs() << " Callee context found: " << FSamples->getContext().toString()
+ << "\n";
});
return FSamples;
}
@@ -285,6 +321,11 @@ FunctionSamples *SampleContextTracker::getBaseSamplesFor(const Function &Func,
FunctionSamples *SampleContextTracker::getBaseSamplesFor(StringRef Name,
bool MergeContext) {
LLVM_DEBUG(dbgs() << "Getting base profile for function: " << Name << "\n");
+ // Convert real function names to MD5 names, if the input profile is
+ // MD5-based.
+ std::string FGUID;
+ Name = getRepInFormat(Name, FunctionSamples::UseMD5, FGUID);
+
// Base profile is top-level node (child of root node), so try to retrieve
// existing top-level node for given function first. If it exists, it could be
// that we've merged base profile before, or there's actually context-less
@@ -299,14 +340,14 @@ FunctionSamples *SampleContextTracker::getBaseSamplesFor(StringRef Name,
// into base profile.
for (auto *CSamples : FuncToCtxtProfiles[Name]) {
SampleContext &Context = CSamples->getContext();
- ContextTrieNode *FromNode = getContextFor(Context);
- if (FromNode == Node)
- continue;
-
// Skip inlined context profile and also don't re-merge any context
if (Context.hasState(InlinedContext) || Context.hasState(MergedContext))
continue;
+ ContextTrieNode *FromNode = getContextFor(Context);
+ if (FromNode == Node)
+ continue;
+
ContextTrieNode &ToNode = promoteMergeContextSamplesTree(*FromNode);
assert((!Node || Node == &ToNode) && "Expect only one base profile");
Node = &ToNode;
@@ -324,7 +365,7 @@ void SampleContextTracker::markContextSamplesInlined(
const FunctionSamples *InlinedSamples) {
assert(InlinedSamples && "Expect non-null inlined samples");
LLVM_DEBUG(dbgs() << "Marking context profile as inlined: "
- << InlinedSamples->getContext() << "\n");
+ << InlinedSamples->getContext().toString() << "\n");
InlinedSamples->getContext().setState(InlinedContext);
}
@@ -376,30 +417,23 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree(
FunctionSamples *FromSamples = NodeToPromo.getFunctionSamples();
assert(FromSamples && "Shouldn't promote a context without profile");
LLVM_DEBUG(dbgs() << " Found context tree root to promote: "
- << FromSamples->getContext() << "\n");
+ << FromSamples->getContext().toString() << "\n");
assert(!FromSamples->getContext().hasState(InlinedContext) &&
"Shouldn't promote inlined context profile");
- StringRef ContextStrToRemove = FromSamples->getContext().getCallingContext();
+ uint32_t ContextFramesToRemove =
+ FromSamples->getContext().getContextFrames().size() - 1;
return promoteMergeContextSamplesTree(NodeToPromo, RootContext,
- ContextStrToRemove);
+ ContextFramesToRemove);
}
-void SampleContextTracker::dump() {
- dbgs() << "Context Profile Tree:\n";
- std::queue<ContextTrieNode *> NodeQueue;
- NodeQueue.push(&RootContext);
-
- while (!NodeQueue.empty()) {
- ContextTrieNode *Node = NodeQueue.front();
- NodeQueue.pop();
- Node->dump();
+void SampleContextTracker::dump() { RootContext.dumpTree(); }
- for (auto &It : Node->getAllChildContext()) {
- ContextTrieNode *ChildNode = &It.second;
- NodeQueue.push(ChildNode);
- }
- }
+StringRef SampleContextTracker::getFuncNameFor(ContextTrieNode *Node) const {
+ if (!FunctionSamples::UseMD5)
+ return Node->getFuncName();
+ assert(GUIDToFuncNameMap && "GUIDToFuncNameMap needs to be populated first");
+ return GUIDToFuncNameMap->lookup(std::stoull(Node->getFuncName().data()));
}
ContextTrieNode *
@@ -444,11 +478,22 @@ ContextTrieNode *SampleContextTracker::getContextFor(const DILocation *DIL) {
RootName = PrevDIL->getScope()->getSubprogram()->getName();
S.push_back(std::make_pair(LineLocation(0, 0), RootName));
+ // Convert real function names to MD5 names, if the input profile is
+ // MD5-based.
+ std::list<std::string> MD5Names;
+ if (FunctionSamples::UseMD5) {
+ for (auto &Location : S) {
+ MD5Names.emplace_back();
+ getRepInFormat(Location.second, FunctionSamples::UseMD5, MD5Names.back());
+ Location.second = MD5Names.back();
+ }
+ }
+
ContextTrieNode *ContextNode = &RootContext;
int I = S.size();
while (--I >= 0 && ContextNode) {
LineLocation &CallSite = S[I].first;
- StringRef &CalleeName = S[I].second;
+ StringRef CalleeName = S[I].second;
ContextNode = ContextNode->getChildContext(CallSite, CalleeName);
}
@@ -462,27 +507,18 @@ ContextTrieNode *
SampleContextTracker::getOrCreateContextPath(const SampleContext &Context,
bool AllowCreate) {
ContextTrieNode *ContextNode = &RootContext;
- StringRef ContextRemain = Context;
- StringRef ChildContext;
- StringRef CalleeName;
LineLocation CallSiteLoc(0, 0);
- while (ContextNode && !ContextRemain.empty()) {
- auto ContextSplit = SampleContext::splitContextString(ContextRemain);
- ChildContext = ContextSplit.first;
- ContextRemain = ContextSplit.second;
- LineLocation NextCallSiteLoc(0, 0);
- SampleContext::decodeContextString(ChildContext, CalleeName,
- NextCallSiteLoc);
-
+ for (auto &Callsite : Context.getContextFrames()) {
// Create child node at parent line/disc location
if (AllowCreate) {
ContextNode =
- ContextNode->getOrCreateChildContext(CallSiteLoc, CalleeName);
+ ContextNode->getOrCreateChildContext(CallSiteLoc, Callsite.FuncName);
} else {
- ContextNode = ContextNode->getChildContext(CallSiteLoc, CalleeName);
+ ContextNode =
+ ContextNode->getChildContext(CallSiteLoc, Callsite.FuncName);
}
- CallSiteLoc = NextCallSiteLoc;
+ CallSiteLoc = Callsite.Location;
}
assert((!AllowCreate || ContextNode) &&
@@ -502,7 +538,7 @@ ContextTrieNode &SampleContextTracker::addTopLevelContextNode(StringRef FName) {
void SampleContextTracker::mergeContextNode(ContextTrieNode &FromNode,
ContextTrieNode &ToNode,
- StringRef ContextStrToRemove) {
+ uint32_t ContextFramesToRemove) {
FunctionSamples *FromSamples = FromNode.getFunctionSamples();
FunctionSamples *ToSamples = ToNode.getFunctionSamples();
if (FromSamples && ToSamples) {
@@ -510,19 +546,21 @@ void SampleContextTracker::mergeContextNode(ContextTrieNode &FromNode,
ToSamples->merge(*FromSamples);
ToSamples->getContext().setState(SyntheticContext);
FromSamples->getContext().setState(MergedContext);
+ if (FromSamples->getContext().hasAttribute(ContextShouldBeInlined))
+ ToSamples->getContext().setAttribute(ContextShouldBeInlined);
} else if (FromSamples) {
// Transfer FromSamples from FromNode to ToNode
ToNode.setFunctionSamples(FromSamples);
FromSamples->getContext().setState(SyntheticContext);
- FromSamples->getContext().promoteOnPath(ContextStrToRemove);
+ FromSamples->getContext().promoteOnPath(ContextFramesToRemove);
FromNode.setFunctionSamples(nullptr);
}
}
ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree(
ContextTrieNode &FromNode, ContextTrieNode &ToNodeParent,
- StringRef ContextStrToRemove) {
- assert(!ContextStrToRemove.empty() && "Context to remove can't be empty");
+ uint32_t ContextFramesToRemove) {
+ assert(ContextFramesToRemove && "Context to remove can't be empty");
// Ignore call site location if destination is top level under root
LineLocation NewCallSiteLoc = LineLocation(0, 0);
@@ -540,21 +578,21 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree(
// Do not delete node to move from its parent here because
// caller is iterating over children of that parent node.
ToNode = &ToNodeParent.moveToChildContext(
- NewCallSiteLoc, std::move(FromNode), ContextStrToRemove, false);
+ NewCallSiteLoc, std::move(FromNode), ContextFramesToRemove, false);
} else {
// Destination node exists, merge samples for the context tree
- mergeContextNode(FromNode, *ToNode, ContextStrToRemove);
+ mergeContextNode(FromNode, *ToNode, ContextFramesToRemove);
LLVM_DEBUG({
if (ToNode->getFunctionSamples())
dbgs() << " Context promoted and merged to: "
- << ToNode->getFunctionSamples()->getContext() << "\n";
+ << ToNode->getFunctionSamples()->getContext().toString() << "\n";
});
// Recursively promote and merge children
for (auto &It : FromNode.getAllChildContext()) {
ContextTrieNode &FromChildNode = It.second;
promoteMergeContextSamplesTree(FromChildNode, *ToNode,
- ContextStrToRemove);
+ ContextFramesToRemove);
}
// Remove children once they're all merged
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 8e9c79fc7bbb..a961c47a7501 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -143,6 +143,12 @@ static cl::opt<bool> ProfileSampleAccurate(
"callsite and function as having 0 samples. Otherwise, treat "
"un-sampled callsites and functions conservatively as unknown. "));
+static cl::opt<bool> ProfileSampleBlockAccurate(
+ "profile-sample-block-accurate", cl::Hidden, cl::init(false),
+ cl::desc("If the sample profile is accurate, we will mark all un-sampled "
+ "branches and calls as having 0 samples. Otherwise, treat "
+ "them conservatively as unknown. "));
+
static cl::opt<bool> ProfileAccurateForSymsInList(
"profile-accurate-for-symsinlist", cl::Hidden, cl::ZeroOrMore,
cl::init(true),
@@ -214,6 +220,16 @@ static cl::opt<bool> CallsitePrioritizedInline(
cl::desc("Use call site prioritized inlining for sample profile loader."
"Currently only CSSPGO is supported."));
+static cl::opt<bool> UsePreInlinerDecision(
+ "sample-profile-use-preinliner", cl::Hidden, cl::ZeroOrMore,
+ cl::init(false),
+ cl::desc("Use the preinliner decisions stored in profile context."));
+
+static cl::opt<bool> AllowRecursiveInline(
+ "sample-profile-recursive-inline", cl::Hidden, cl::ZeroOrMore,
+ cl::init(false),
+ cl::desc("Allow sample loader inliner to inline recursive calls."));
+
static cl::opt<std::string> ProfileInlineReplayFile(
"sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
cl::desc(
@@ -221,6 +237,50 @@ static cl::opt<std::string> ProfileInlineReplayFile(
"by inlining from sample profile loader."),
cl::Hidden);
+static cl::opt<ReplayInlinerSettings::Scope> ProfileInlineReplayScope(
+ "sample-profile-inline-replay-scope",
+ cl::init(ReplayInlinerSettings::Scope::Function),
+ cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function",
+ "Replay on functions that have remarks associated "
+ "with them (default)"),
+ clEnumValN(ReplayInlinerSettings::Scope::Module, "Module",
+ "Replay on the entire module")),
+ cl::desc("Whether inline replay should be applied to the entire "
+ "Module or just the Functions (default) that are present as "
+ "callers in remarks during sample profile inlining."),
+ cl::Hidden);
+
+static cl::opt<ReplayInlinerSettings::Fallback> ProfileInlineReplayFallback(
+ "sample-profile-inline-replay-fallback",
+ cl::init(ReplayInlinerSettings::Fallback::Original),
+ cl::values(
+ clEnumValN(
+ ReplayInlinerSettings::Fallback::Original, "Original",
+ "All decisions not in replay send to original advisor (default)"),
+ clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline,
+ "AlwaysInline", "All decisions not in replay are inlined"),
+ clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline",
+ "All decisions not in replay are not inlined")),
+ cl::desc("How sample profile inline replay treats sites that don't come "
+ "from the replay. Original: defers to original advisor, "
+ "AlwaysInline: inline all sites not in replay, NeverInline: "
+ "inline no sites not in replay"),
+ cl::Hidden);
+
+static cl::opt<CallSiteFormat::Format> ProfileInlineReplayFormat(
+ "sample-profile-inline-replay-format",
+ cl::init(CallSiteFormat::Format::LineColumnDiscriminator),
+ cl::values(
+ clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"),
+ clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn",
+ "<Line Number>:<Column Number>"),
+ clEnumValN(CallSiteFormat::Format::LineDiscriminator,
+ "LineDiscriminator", "<Line Number>.<Discriminator>"),
+ clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator,
+ "LineColumnDiscriminator",
+ "<Line Number>:<Column Number>.<Discriminator> (default)")),
+ cl::desc("How sample profile inline replay file is formatted"), cl::Hidden);
+
static cl::opt<unsigned>
MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden,
cl::ZeroOrMore,
@@ -358,10 +418,10 @@ public:
std::function<AssumptionCache &(Function &)> GetAssumptionCache,
std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo,
std::function<const TargetLibraryInfo &(Function &)> GetTLI)
- : SampleProfileLoaderBaseImpl(std::string(Name)),
+ : SampleProfileLoaderBaseImpl(std::string(Name), std::string(RemapName)),
GetAC(std::move(GetAssumptionCache)),
GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)),
- RemappingFilename(std::string(RemapName)), LTOPhase(LTOPhase) {}
+ LTOPhase(LTOPhase) {}
bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr);
bool runOnModule(Module &M, ModuleAnalysisManager *AM,
@@ -377,7 +437,7 @@ protected:
findFunctionSamples(const Instruction &I) const override;
std::vector<const FunctionSamples *>
findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
- void findExternalInlineCandidate(const FunctionSamples *Samples,
+ void findExternalInlineCandidate(CallBase *CB, const FunctionSamples *Samples,
DenseSet<GlobalValue::GUID> &InlinedGUIDs,
const StringMap<Function *> &SymbolMap,
uint64_t Threshold);
@@ -385,8 +445,11 @@ protected:
bool tryPromoteAndInlineCandidate(
Function &F, InlineCandidate &Candidate, uint64_t SumOrigin,
uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
+
bool inlineHotFunctions(Function &F,
DenseSet<GlobalValue::GUID> &InlinedGUIDs);
+ Optional<InlineCost> getExternalInlineAdvisorCost(CallBase &CB);
+ bool getExternalInlineAdvisorShouldInline(CallBase &CB);
InlineCost shouldInlineCandidate(InlineCandidate &Candidate);
bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);
bool
@@ -417,9 +480,6 @@ protected:
/// Profile tracker for different contexts.
std::unique_ptr<SampleContextTracker> ContextTracker;
- /// Name of the profile remapping file to load.
- std::string RemappingFilename;
-
/// Flag indicating whether input profile is context-sensitive
bool ProfileIsCS = false;
@@ -464,7 +524,7 @@ protected:
bool ProfAccForSymsInList;
// External inline advisor used to replay inline decision from remarks.
- std::unique_ptr<ReplayInlineAdvisor> ExternalInlineAdvisor;
+ std::unique_ptr<InlineAdvisor> ExternalInlineAdvisor;
// A pseudo probe helper to correlate the imported sample counts.
std::unique_ptr<PseudoProbeManager> ProbeManager;
@@ -953,8 +1013,24 @@ void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(
}
void SampleProfileLoader::findExternalInlineCandidate(
- const FunctionSamples *Samples, DenseSet<GlobalValue::GUID> &InlinedGUIDs,
+ CallBase *CB, const FunctionSamples *Samples,
+ DenseSet<GlobalValue::GUID> &InlinedGUIDs,
const StringMap<Function *> &SymbolMap, uint64_t Threshold) {
+
+ // If the ExternalInlineAdvisor wants to inline an external function,
+ // make sure it's imported.
+ if (CB && getExternalInlineAdvisorShouldInline(*CB)) {
+ // Samples may not exist for the replayed function; if so,
+ // just add the direct GUID and move on.
+ if (!Samples) {
+ InlinedGUIDs.insert(
+ FunctionSamples::getGUID(CB->getCalledFunction()->getName()));
+ return;
+ }
+ // Otherwise, drop the threshold to import everything that we can
+ Threshold = 0;
+ }
+
assert(Samples && "expect non-null caller profile");
// For AutoFDO profile, retrieve candidate profiles by walking over
@@ -975,14 +1051,21 @@ void SampleProfileLoader::findExternalInlineCandidate(
// For CSSPGO profile, retrieve candidate profile by walking over the
// trie built for context profile. Note that also take call targets
// even if callee doesn't have a corresponding context profile.
- if (!CalleeSample || CalleeSample->getEntrySamples() < Threshold)
+ if (!CalleeSample)
+ continue;
+
+ // If pre-inliner decision is used, honor that for importing as well.
+ bool PreInline =
+ UsePreInlinerDecision &&
+ CalleeSample->getContext().hasAttribute(ContextShouldBeInlined);
+ if (!PreInline && CalleeSample->getEntrySamples() < Threshold)
continue;
StringRef Name = CalleeSample->getFuncName();
Function *Func = SymbolMap.lookup(Name);
// Add to the import list only when it's defined out of module.
if (!Func || Func->isDeclaration())
- InlinedGUIDs.insert(FunctionSamples::getGUID(Name));
+ InlinedGUIDs.insert(FunctionSamples::getGUID(CalleeSample->getName()));
// Import hot CallTargets, which may not be available in IR because full
// profile annotation cannot be done until backend compilation in ThinLTO.
@@ -992,7 +1075,7 @@ void SampleProfileLoader::findExternalInlineCandidate(
StringRef CalleeName = CalleeSample->getFuncName(TS.getKey());
const Function *Callee = SymbolMap.lookup(CalleeName);
if (!Callee || Callee->isDeclaration())
- InlinedGUIDs.insert(FunctionSamples::getGUID(CalleeName));
+ InlinedGUIDs.insert(FunctionSamples::getGUID(TS.getKey()));
}
// Import hot child context profile associated with callees. Note that this
@@ -1042,16 +1125,20 @@ bool SampleProfileLoader::inlineHotFunctions(
for (auto &I : BB.getInstList()) {
const FunctionSamples *FS = nullptr;
if (auto *CB = dyn_cast<CallBase>(&I)) {
- if (!isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(*CB))) {
- assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
- "GUIDToFuncNameMap has to be populated");
- AllCandidates.push_back(CB);
- if (FS->getEntrySamples() > 0 || ProfileIsCS)
- LocalNotInlinedCallSites.try_emplace(CB, FS);
- if (callsiteIsHot(FS, PSI, ProfAccForSymsInList))
- Hot = true;
- else if (shouldInlineColdCallee(*CB))
- ColdCandidates.push_back(CB);
+ if (!isa<IntrinsicInst>(I)) {
+ if ((FS = findCalleeFunctionSamples(*CB))) {
+ assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
+ "GUIDToFuncNameMap has to be populated");
+ AllCandidates.push_back(CB);
+ if (FS->getEntrySamples() > 0 || ProfileIsCS)
+ LocalNotInlinedCallSites.try_emplace(CB, FS);
+ if (callsiteIsHot(FS, PSI, ProfAccForSymsInList))
+ Hot = true;
+ else if (shouldInlineColdCallee(*CB))
+ ColdCandidates.push_back(CB);
+ } else if (getExternalInlineAdvisorShouldInline(*CB)) {
+ AllCandidates.push_back(CB);
+ }
}
}
}
@@ -1078,7 +1165,7 @@ bool SampleProfileLoader::inlineHotFunctions(
for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
uint64_t SumOrigin = Sum;
if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
- findExternalInlineCandidate(FS, InlinedGUIDs, SymbolMap,
+ findExternalInlineCandidate(I, FS, InlinedGUIDs, SymbolMap,
PSI->getOrCompHotCountThreshold());
continue;
}
@@ -1098,8 +1185,8 @@ bool SampleProfileLoader::inlineHotFunctions(
LocalChanged = true;
}
} else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
- findExternalInlineCandidate(findCalleeFunctionSamples(*I), InlinedGUIDs,
- SymbolMap,
+ findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
+ InlinedGUIDs, SymbolMap,
PSI->getOrCompHotCountThreshold());
}
}
@@ -1184,8 +1271,8 @@ bool SampleProfileLoader::tryInlineCandidate(
*CalledFunction);
// The call to InlineFunction erases I, so we can't pass it here.
- emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost,
- true, CSINLINE_DEBUG);
+ emitInlinedIntoBasedOnCost(*ORE, DLoc, BB, *CalledFunction,
+ *BB->getParent(), Cost, true, CSINLINE_DEBUG);
// Now populate the list of newly exposed call sites.
if (InlinedCallSites) {
@@ -1228,7 +1315,9 @@ bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
// Find the callee's profile. For indirect call, find hottest target profile.
const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB);
- if (!CalleeSamples)
+ // If ExternalInlineAdvisor wants to inline this site, do so even
+ // if Samples are not present.
+ if (!CalleeSamples && !getExternalInlineAdvisorShouldInline(*CB))
return false;
float Factor = 1.0;
@@ -1247,19 +1336,34 @@ bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
return true;
}
-InlineCost
-SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
+Optional<InlineCost>
+SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) {
std::unique_ptr<InlineAdvice> Advice = nullptr;
if (ExternalInlineAdvisor) {
- Advice = ExternalInlineAdvisor->getAdvice(*Candidate.CallInstr);
- if (!Advice->isInliningRecommended()) {
- Advice->recordUnattemptedInlining();
- return InlineCost::getNever("not previously inlined");
+ Advice = ExternalInlineAdvisor->getAdvice(CB);
+ if (Advice) {
+ if (!Advice->isInliningRecommended()) {
+ Advice->recordUnattemptedInlining();
+ return InlineCost::getNever("not previously inlined");
+ }
+ Advice->recordInlining();
+ return InlineCost::getAlways("previously inlined");
}
- Advice->recordInlining();
- return InlineCost::getAlways("previously inlined");
}
+ return {};
+}
+
+bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) {
+ Optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB);
+ return Cost ? !!Cost.getValue() : false;
+}
+
+InlineCost
+SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
+ if (Optional<InlineCost> ReplayCost =
+ getExternalInlineAdvisorCost(*Candidate.CallInstr))
+ return ReplayCost.getValue();
// Adjust threshold based on call site hotness, only do this for callsite
// prioritized inliner because otherwise cost-benefit check is done earlier.
int SampleThreshold = SampleColdCallSiteThreshold;
@@ -1274,7 +1378,9 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
assert(Callee && "Expect a definition for inline candidate of direct call");
InlineParams Params = getInlineParams();
+ // We will ignore the threshold from inline cost, so always get full cost.
Params.ComputeFullInlineCost = true;
+ Params.AllowRecursiveCall = AllowRecursiveInline;
// Checks if there is anything in the reachable portion of the callee at
// this callsite that makes this inlining potentially illegal. Need to
// set ComputeFullInlineCost, otherwise getInlineCost may return early
@@ -1288,6 +1394,25 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
if (Cost.isNever() || Cost.isAlways())
return Cost;
+ // With CSSPGO, the preinliner in llvm-profgen can estimate global inline
+ // decisions based on hotness as well as accurate function byte sizes for a
+ // given context, using function/inlinee sizes from a previous build. It
+ // stores the decision in the profile, and also adjusts/merges context
+ // profiles aiming at better context-sensitive post-inline profile quality,
+ // assuming all inline decision estimates are going to be honored by the
+ // compiler. Here we replay those inline decisions under
+ // `sample-profile-use-preinliner`. Note that we don't need to handle
+ // negative decisions from the preinliner, because context profiles for
+ // not-inlined calls are already merged by the preinliner.
+ if (UsePreInlinerDecision && Candidate.CalleeSamples) {
+ // Once two nodes are merged due to promotion, we lose some context, so the
+ // original context-sensitive preinliner decision should be ignored for
+ // SyntheticContext.
+ SampleContext &Context = Candidate.CalleeSamples->getContext();
+ if (!Context.hasState(SyntheticContext) &&
+ Context.hasAttribute(ContextShouldBeInlined))
+ return InlineCost::getAlways("preinliner");
+ }
+
// For old FDO inliner, we inline the call site as long as cost is not
// "Never". The cost-benefit check is done earlier.
if (!CallsitePrioritizedInline) {
@@ -1357,7 +1482,7 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority(
for (const auto *FS : CalleeSamples) {
// TODO: Consider disabling pre-LTO ICP for MonoLTO as well
if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
- findExternalInlineCandidate(FS, InlinedGUIDs, SymbolMap,
+ findExternalInlineCandidate(I, FS, InlinedGUIDs, SymbolMap,
PSI->getOrCompHotCountThreshold());
continue;
}
@@ -1405,8 +1530,9 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority(
Changed = true;
}
} else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
- findExternalInlineCandidate(Candidate.CalleeSamples, InlinedGUIDs,
- SymbolMap, PSI->getOrCompHotCountThreshold());
+ findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
+ InlinedGUIDs, SymbolMap,
+ PSI->getOrCompHotCountThreshold());
}
}
@@ -1494,7 +1620,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) {
{static_cast<uint32_t>(BlockWeights[BB])}));
}
}
- } else if (OverwriteExistingWeights) {
+ } else if (OverwriteExistingWeights || ProfileSampleBlockAccurate) {
// Set profile metadata (possibly annotated by LTO prelink) to zero or
// clear it for cold code.
for (auto &I : BB->getInstList()) {
@@ -1792,11 +1918,13 @@ bool SampleProfileLoader::doInitialization(Module &M,
}
if (FAM && !ProfileInlineReplayFile.empty()) {
- ExternalInlineAdvisor = std::make_unique<ReplayInlineAdvisor>(
- M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr, ProfileInlineReplayFile,
+ ExternalInlineAdvisor = getReplayInlineAdvisor(
+ M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr,
+ ReplayInlinerSettings{ProfileInlineReplayFile,
+ ProfileInlineReplayScope,
+ ProfileInlineReplayFallback,
+ {ProfileInlineReplayFormat}},
/*EmitRemarks=*/false);
- if (!ExternalInlineAdvisor->areReplayRemarksLoaded())
- ExternalInlineAdvisor.reset();
}
// Apply tweaks if context-sensitive profile is available.
@@ -1810,13 +1938,21 @@ bool SampleProfileLoader::doInitialization(Module &M,
if (!CallsitePrioritizedInline.getNumOccurrences())
CallsitePrioritizedInline = true;
+ // For CSSPGO, use preinliner decision by default when available.
+ if (!UsePreInlinerDecision.getNumOccurrences())
+ UsePreInlinerDecision = true;
+
+ // For CSSPGO, we also allow recursive inlining to best use context profiles.
+ if (!AllowRecursiveInline.getNumOccurrences())
+ AllowRecursiveInline = true;
+
// Enable iterative-BFI by default for CSSPGO.
if (!UseIterativeBFIInference.getNumOccurrences())
UseIterativeBFIInference = true;
// Tracker for profiles under different context
- ContextTracker =
- std::make_unique<SampleContextTracker>(Reader->getProfiles());
+ ContextTracker = std::make_unique<SampleContextTracker>(
+ Reader->getProfiles(), &GUIDToFuncNameMap);
}
// Load pseudo probe descriptors for probe-based function samples.
diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
index 08d316337ef5..21395460bccb 100644
--- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
@@ -415,9 +415,7 @@ void PseudoProbeUpdatePass::runOnFunction(Function &F,
FunctionAnalysisManager &FAM) {
BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
auto BBProfileCount = [&BFI](BasicBlock *BB) {
- return BFI.getBlockProfileCount(BB)
- ? BFI.getBlockProfileCount(BB).getValue()
- : 0;
+ return BFI.getBlockProfileCount(BB).getValueOr(0);
};
// Collect the sum of execution weight for each probe.
diff --git a/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp b/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp
index 655a7a404951..0f2412dce1c9 100644
--- a/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp
+++ b/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp
@@ -30,23 +30,20 @@ static bool stripDeadPrototypes(Module &M) {
bool MadeChange = false;
// Erase dead function prototypes.
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
- Function *F = &*I++;
+ for (Function &F : llvm::make_early_inc_range(M)) {
// Function must be a prototype and unused.
- if (F->isDeclaration() && F->use_empty()) {
- F->eraseFromParent();
+ if (F.isDeclaration() && F.use_empty()) {
+ F.eraseFromParent();
++NumDeadPrototypes;
MadeChange = true;
}
}
// Erase dead global var prototypes.
- for (Module::global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ) {
- GlobalVariable *GV = &*I++;
+ for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) {
// Global must be a prototype and unused.
- if (GV->isDeclaration() && GV->use_empty())
- GV->eraseFromParent();
+ if (GV.isDeclaration() && GV.use_empty())
+ GV.eraseFromParent();
}
// Return an indication of whether we changed anything or not.
diff --git a/llvm/lib/Transforms/IPO/StripSymbols.cpp b/llvm/lib/Transforms/IPO/StripSymbols.cpp
index 168740a1158e..9d4e9464f361 100644
--- a/llvm/lib/Transforms/IPO/StripSymbols.cpp
+++ b/llvm/lib/Transforms/IPO/StripSymbols.cpp
@@ -214,13 +214,13 @@ static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) {
findUsedValues(M.getGlobalVariable("llvm.compiler.used"), llvmUsedValues);
for (GlobalVariable &GV : M.globals()) {
- if (GV.hasLocalLinkage() && llvmUsedValues.count(&GV) == 0)
+ if (GV.hasLocalLinkage() && !llvmUsedValues.contains(&GV))
if (!PreserveDbgInfo || !GV.getName().startswith("llvm.dbg"))
GV.setName(""); // Internal symbols can't participate in linkage
}
for (Function &I : M) {
- if (I.hasLocalLinkage() && llvmUsedValues.count(&I) == 0)
+ if (I.hasLocalLinkage() && !llvmUsedValues.contains(&I))
if (!PreserveDbgInfo || !I.getName().startswith("llvm.dbg"))
I.setName(""); // Internal symbols can't participate in linkage
if (auto *Symtab = I.getValueSymbolTable())
diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index 37329b489555..0cc1b37844f6 100644
--- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -33,6 +33,19 @@ using namespace llvm;
namespace {
+// Determine if a promotion alias should be created for a symbol name.
+static bool allowPromotionAlias(const std::string &Name) {
+ // Promotion aliases are used only in inline assembly. It's safe to
+ // simply skip unusual names. This check is a subset of
+ // MCAsmInfo::isAcceptableChar() and MCAsmInfoXCOFF::isAcceptableChar().
+ for (const char &C : Name) {
+ if (isAlnum(C) || C == '_' || C == '.')
+ continue;
+ return false;
+ }
+ return true;
+}
+
// Promote each local-linkage entity defined by ExportM and used by ImportM by
// changing visibility and appending the given ModuleId.
void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId,
@@ -55,6 +68,7 @@ void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId,
}
}
+ std::string OldName = Name.str();
std::string NewName = (Name + ModuleId).str();
if (const auto *C = ExportGV.getComdat())
@@ -69,6 +83,13 @@ void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId,
ImportGV->setName(NewName);
ImportGV->setVisibility(GlobalValue::HiddenVisibility);
}
+
+ if (isa<Function>(&ExportGV) && allowPromotionAlias(OldName)) {
+ // Create a local alias with the original name to avoid breaking
+ // references from inline assembly.
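+ // For example, with a hypothetical local function "foo" promoted to
+ // "foo<ModuleId>", the emitted module asm is ".set foo,foo<ModuleId>", so
+ // inline asm that still refers to "foo" keeps resolving.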
+ std::string Alias = ".set " + OldName + "," + NewName + "\n";
+ ExportM.appendModuleInlineAsm(Alias);
+ }
}
if (!RenamedComdats.empty())
@@ -143,8 +164,7 @@ void simplifyExternals(Module &M) {
FunctionType *EmptyFT =
FunctionType::get(Type::getVoidTy(M.getContext()), false);
- for (auto I = M.begin(), E = M.end(); I != E;) {
- Function &F = *I++;
+ for (Function &F : llvm::make_early_inc_range(M)) {
if (F.isDeclaration() && F.use_empty()) {
F.eraseFromParent();
continue;
@@ -160,16 +180,15 @@ void simplifyExternals(Module &M) {
F.getAddressSpace(), "", &M);
NewF->copyAttributesFrom(&F);
// Only copy function attributes.
- NewF->setAttributes(
- AttributeList::get(M.getContext(), AttributeList::FunctionIndex,
- F.getAttributes().getFnAttributes()));
+ NewF->setAttributes(AttributeList::get(M.getContext(),
+ AttributeList::FunctionIndex,
+ F.getAttributes().getFnAttrs()));
NewF->takeName(&F);
F.replaceAllUsesWith(ConstantExpr::getBitCast(NewF, F.getType()));
F.eraseFromParent();
}
- for (auto I = M.global_begin(), E = M.global_end(); I != E;) {
- GlobalVariable &GV = *I++;
+ for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) {
if (GV.isDeclaration() && GV.use_empty()) {
GV.eraseFromParent();
continue;
@@ -304,7 +323,8 @@ void splitAndWriteThinLTOBitcode(
return true;
if (auto *F = dyn_cast<Function>(GV))
return EligibleVirtualFns.count(F);
- if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject()))
+ if (auto *GVar =
+ dyn_cast_or_null<GlobalVariable>(GV->getAliaseeObject()))
return HasTypeMetadata(GVar);
return false;
}));
@@ -333,7 +353,7 @@ void splitAndWriteThinLTOBitcode(
// Remove all globals with type metadata, globals with comdats that live in
// MergedM, and aliases pointing to such globals from the thin LTO module.
filterModule(&M, [&](const GlobalValue *GV) {
- if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject()))
+ if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getAliaseeObject()))
if (HasTypeMetadata(GVar))
return false;
if (const auto *C = GV->getComdat())
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 7a8946110785..61054e7ae46f 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -1288,7 +1288,7 @@ void DevirtModule::tryICallBranchFunnel(
M.getDataLayout().getProgramAddressSpace(),
"branch_funnel", &M);
}
- JT->addAttribute(1, Attribute::Nest);
+ JT->addParamAttr(0, Attribute::Nest);
std::vector<Value *> JTArgs;
JTArgs.push_back(JT->arg_begin());
@@ -1361,10 +1361,10 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo,
M.getContext(), ArrayRef<Attribute>{Attribute::get(
M.getContext(), Attribute::Nest)}));
for (unsigned I = 0; I + 2 < Attrs.getNumAttrSets(); ++I)
- NewArgAttrs.push_back(Attrs.getParamAttributes(I));
+ NewArgAttrs.push_back(Attrs.getParamAttrs(I));
NewCS->setAttributes(
- AttributeList::get(M.getContext(), Attrs.getFnAttributes(),
- Attrs.getRetAttributes(), NewArgAttrs));
+ AttributeList::get(M.getContext(), Attrs.getFnAttrs(),
+ Attrs.getRetAttrs(), NewArgAttrs));
CB.replaceAllUsesWith(NewCS);
CB.eraseFromParent();
@@ -1786,10 +1786,8 @@ void DevirtModule::scanTypeTestUsers(
// points to a member of the type identifier %md. Group calls by (type ID,
// offset) pair (effectively the identity of the virtual function) and store
// to CallSlots.
- for (auto I = TypeTestFunc->use_begin(), E = TypeTestFunc->use_end();
- I != E;) {
- auto CI = dyn_cast<CallInst>(I->getUser());
- ++I;
+ for (Use &U : llvm::make_early_inc_range(TypeTestFunc->uses())) {
+ auto *CI = dyn_cast<CallInst>(U.getUser());
if (!CI)
continue;
@@ -1858,11 +1856,8 @@ void DevirtModule::scanTypeTestUsers(
void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) {
Function *TypeTestFunc = Intrinsic::getDeclaration(&M, Intrinsic::type_test);
- for (auto I = TypeCheckedLoadFunc->use_begin(),
- E = TypeCheckedLoadFunc->use_end();
- I != E;) {
- auto CI = dyn_cast<CallInst>(I->getUser());
- ++I;
+ for (Use &U : llvm::make_early_inc_range(TypeCheckedLoadFunc->uses())) {
+ auto *CI = dyn_cast<CallInst>(U.getUser());
if (!CI)
continue;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index d01a021bf3f4..eb1b8a29cfc5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -939,7 +939,7 @@ Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) {
// add (xor X, LowMaskC), C --> sub (LowMaskC + C), X
if (C2->isMask()) {
KnownBits LHSKnown = computeKnownBits(X, 0, &Add);
- if ((*C2 | LHSKnown.Zero).isAllOnesValue())
+ if ((*C2 | LHSKnown.Zero).isAllOnes())
return BinaryOperator::CreateSub(ConstantInt::get(Ty, *C2 + *C), X);
}
@@ -963,7 +963,7 @@ Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) {
}
}
- if (C->isOneValue() && Op0->hasOneUse()) {
+ if (C->isOne() && Op0->hasOneUse()) {
// add (sext i1 X), 1 --> zext (not X)
// TODO: The smallest IR representation is (select X, 0, 1), and that would
// not require the one-use check. But we need to remove a transform in
@@ -1355,6 +1355,17 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
if (match(RHS, m_OneUse(m_c_Add(m_Value(A), m_Specific(LHS)))))
return BinaryOperator::CreateAdd(A, Builder.CreateShl(LHS, 1, "reass.add"));
+ {
+ // (A + C1) + (C2 - B) --> (A - B) + (C1 + C2)
+ Constant *C1, *C2;
+ if (match(&I, m_c_Add(m_Add(m_Value(A), m_ImmConstant(C1)),
+ m_Sub(m_ImmConstant(C2), m_Value(B)))) &&
+ (LHS->hasOneUse() || RHS->hasOneUse())) {
+ Value *Sub = Builder.CreateSub(A, B);
+ return BinaryOperator::CreateAdd(Sub, ConstantExpr::getAdd(C1, C2));
+ }
+ }
+
// X % C0 + (( X / C0 ) % C1) * C0 => X % (C0 * C1)
if (Value *V = SimplifyAddWithRemainder(I)) return replaceInstUsesWith(I, V);
@@ -1817,12 +1828,8 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
if (match(Op0, m_AllOnes()))
return BinaryOperator::CreateNot(Op1);
- // (~X) - (~Y) --> Y - X
- Value *X, *Y;
- if (match(Op0, m_Not(m_Value(X))) && match(Op1, m_Not(m_Value(Y))))
- return BinaryOperator::CreateSub(Y, X);
-
// (X + -1) - Y --> ~Y + X
+ Value *X, *Y;
if (match(Op0, m_OneUse(m_Add(m_Value(X), m_AllOnes()))))
return BinaryOperator::CreateAdd(Builder.CreateNot(Op1), X);
@@ -1843,6 +1850,17 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
return BinaryOperator::CreateSub(X, Add);
}
+ // (~X) - (~Y) --> Y - X
+ // This is placed after the other reassociations and explicitly excludes a
+ // sub-of-sub pattern to avoid infinite looping.
+ if (isFreeToInvert(Op0, Op0->hasOneUse()) &&
+ isFreeToInvert(Op1, Op1->hasOneUse()) &&
+ !match(Op0, m_Sub(m_ImmConstant(), m_Value()))) {
+ Value *NotOp0 = Builder.CreateNot(Op0);
+ Value *NotOp1 = Builder.CreateNot(Op1);
+ return BinaryOperator::CreateSub(NotOp1, NotOp0);
+ }
+
auto m_AddRdx = [](Value *&Vec) {
return m_OneUse(m_Intrinsic<Intrinsic::vector_reduce_add>(m_Value(Vec)));
};
@@ -1892,7 +1910,7 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
// Turn this into a xor if LHS is 2^n-1 and the remaining bits are known
// zero.
KnownBits RHSKnown = computeKnownBits(Op1, 0, &I);
- if ((*Op0C | RHSKnown.Zero).isAllOnesValue())
+ if ((*Op0C | RHSKnown.Zero).isAllOnes())
return BinaryOperator::CreateXor(Op1, Op0);
}
@@ -2039,12 +2057,31 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
return BinaryOperator::CreateAnd(
Op0, Builder.CreateNot(Y, Y->getName() + ".not"));
+ // ~X - Min/Max(~X, Y) -> ~Min/Max(X, ~Y) - X
+ // ~X - Min/Max(Y, ~X) -> ~Min/Max(X, ~Y) - X
+ // Min/Max(~X, Y) - ~X -> X - ~Min/Max(X, ~Y)
+ // Min/Max(Y, ~X) - ~X -> X - ~Min/Max(X, ~Y)
+ // As long as Y is freely invertible, this will be neutral or a win.
+ // Note: We don't generate the inverse max/min, just create the 'not' of
+ // it and let other folds do the rest.
+ if (match(Op0, m_Not(m_Value(X))) &&
+ match(Op1, m_c_MaxOrMin(m_Specific(Op0), m_Value(Y))) &&
+ !Op0->hasNUsesOrMore(3) && isFreeToInvert(Y, Y->hasOneUse())) {
+ Value *Not = Builder.CreateNot(Op1);
+ return BinaryOperator::CreateSub(Not, X);
+ }
+ if (match(Op1, m_Not(m_Value(X))) &&
+ match(Op0, m_c_MaxOrMin(m_Specific(Op1), m_Value(Y))) &&
+ !Op1->hasNUsesOrMore(3) && isFreeToInvert(Y, Y->hasOneUse())) {
+ Value *Not = Builder.CreateNot(Op0);
+ return BinaryOperator::CreateSub(X, Not);
+ }
+
+ // TODO: This is the same logic as above but handles the cmp-select idioms
+ // for min/max, so the use checks are increased to account for the
+ // extra instructions. If we canonicalize to intrinsics, this block
+ // can likely be removed.
{
- // ~A - Min/Max(~A, O) -> Max/Min(A, ~O) - A
- // ~A - Min/Max(O, ~A) -> Max/Min(A, ~O) - A
- // Min/Max(~A, O) - ~A -> A - Max/Min(A, ~O)
- // Min/Max(O, ~A) - ~A -> A - Max/Min(A, ~O)
- // So long as O here is freely invertible, this will be neutral or a win.
Value *LHS, *RHS, *A;
Value *NotA = Op0, *MinMax = Op1;
SelectPatternFlavor SPF = matchSelectPattern(MinMax, LHS, RHS).Flavor;
@@ -2057,12 +2094,10 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
match(NotA, m_Not(m_Value(A))) && (NotA == LHS || NotA == RHS)) {
if (NotA == LHS)
std::swap(LHS, RHS);
- // LHS is now O above and expected to have at least 2 uses (the min/max)
- // NotA is epected to have 2 uses from the min/max and 1 from the sub.
+ // LHS is now Y above and expected to have at least 2 uses (the min/max)
+ // NotA is expected to have 2 uses from the min/max and 1 from the sub.
if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) &&
!NotA->hasNUsesOrMore(4)) {
- // Note: We don't generate the inverse max/min, just create the not of
- // it and let other folds do the rest.
Value *Not = Builder.CreateNot(MinMax);
if (NotA == Op0)
return BinaryOperator::CreateSub(Not, A);
@@ -2119,7 +2154,7 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
unsigned BitWidth = Ty->getScalarSizeInBits();
unsigned Cttz = AddC->countTrailingZeros();
APInt HighMask(APInt::getHighBitsSet(BitWidth, BitWidth - Cttz));
- if ((HighMask & *AndC).isNullValue())
+ if ((HighMask & *AndC).isZero())
return BinaryOperator::CreateAnd(Op0, ConstantInt::get(Ty, ~(*AndC)));
}
@@ -2133,6 +2168,19 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
return replaceInstUsesWith(
I, Builder.CreateIntrinsic(Intrinsic::umin, {I.getType()}, {Op0, Y}));
+ // umax(X, Op1) - Op1 --> usub.sat(X, Op1)
+ // TODO: The one-use restriction is not strictly necessary, but it may
+ // require improving other pattern matching and/or codegen.
+ if (match(Op0, m_OneUse(m_c_UMax(m_Value(X), m_Specific(Op1)))))
+ return replaceInstUsesWith(
+ I, Builder.CreateIntrinsic(Intrinsic::usub_sat, {Ty}, {X, Op1}));
+
+ // Op0 - umax(X, Op0) --> 0 - usub.sat(X, Op0)
+ if (match(Op1, m_OneUse(m_c_UMax(m_Value(X), m_Specific(Op0))))) {
+ Value *USub = Builder.CreateIntrinsic(Intrinsic::usub_sat, {Ty}, {X, Op0});
+ return BinaryOperator::CreateNeg(USub);
+ }
+
// C - ctpop(X) => ctpop(~X) if C is bitwidth
if (match(Op0, m_SpecificInt(Ty->getScalarSizeInBits())) &&
match(Op1, m_OneUse(m_Intrinsic<Intrinsic::ctpop>(m_Value(X)))))
@@ -2173,8 +2221,8 @@ static Instruction *foldFNegIntoConstant(Instruction &I) {
// TODO: We could propagate nsz/ninf from fdiv alone?
FastMathFlags FMF = I.getFastMathFlags();
FastMathFlags OpFMF = FNegOp->getFastMathFlags();
- FDiv->setHasNoSignedZeros(FMF.noSignedZeros() & OpFMF.noSignedZeros());
- FDiv->setHasNoInfs(FMF.noInfs() & OpFMF.noInfs());
+ FDiv->setHasNoSignedZeros(FMF.noSignedZeros() && OpFMF.noSignedZeros());
+ FDiv->setHasNoInfs(FMF.noInfs() && OpFMF.noInfs());
return FDiv;
}
// With NSZ [ counter-example with -0.0: -(-0.0 + 0.0) != 0.0 + -0.0 ]:
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 120852c44474..06c9bf650f37 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -185,14 +185,15 @@ enum MaskedICmpType {
/// satisfies.
static unsigned getMaskedICmpType(Value *A, Value *B, Value *C,
ICmpInst::Predicate Pred) {
- ConstantInt *ACst = dyn_cast<ConstantInt>(A);
- ConstantInt *BCst = dyn_cast<ConstantInt>(B);
- ConstantInt *CCst = dyn_cast<ConstantInt>(C);
+ const APInt *ConstA = nullptr, *ConstB = nullptr, *ConstC = nullptr;
+ match(A, m_APInt(ConstA));
+ match(B, m_APInt(ConstB));
+ match(C, m_APInt(ConstC));
bool IsEq = (Pred == ICmpInst::ICMP_EQ);
- bool IsAPow2 = (ACst && !ACst->isZero() && ACst->getValue().isPowerOf2());
- bool IsBPow2 = (BCst && !BCst->isZero() && BCst->getValue().isPowerOf2());
+ bool IsAPow2 = ConstA && ConstA->isPowerOf2();
+ bool IsBPow2 = ConstB && ConstB->isPowerOf2();
unsigned MaskVal = 0;
- if (CCst && CCst->isZero()) {
+ if (ConstC && ConstC->isZero()) {
// if C is zero, then both A and B qualify as mask
MaskVal |= (IsEq ? (Mask_AllZeros | AMask_Mixed | BMask_Mixed)
: (Mask_NotAllZeros | AMask_NotMixed | BMask_NotMixed));
@@ -211,7 +212,7 @@ static unsigned getMaskedICmpType(Value *A, Value *B, Value *C,
if (IsAPow2)
MaskVal |= (IsEq ? (Mask_NotAllZeros | AMask_NotMixed)
: (Mask_AllZeros | AMask_Mixed));
- } else if (ACst && CCst && ConstantExpr::getAnd(ACst, CCst) == CCst) {
+ } else if (ConstA && ConstC && ConstC->isSubsetOf(*ConstA)) {
MaskVal |= (IsEq ? AMask_Mixed : AMask_NotMixed);
}
@@ -221,7 +222,7 @@ static unsigned getMaskedICmpType(Value *A, Value *B, Value *C,
if (IsBPow2)
MaskVal |= (IsEq ? (Mask_NotAllZeros | BMask_NotMixed)
: (Mask_AllZeros | BMask_Mixed));
- } else if (BCst && CCst && ConstantExpr::getAnd(BCst, CCst) == CCst) {
+ } else if (ConstB && ConstC && ConstC->isSubsetOf(*ConstB)) {
MaskVal |= (IsEq ? BMask_Mixed : BMask_NotMixed);
}
@@ -269,9 +270,9 @@ getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
ICmpInst *RHS,
ICmpInst::Predicate &PredL,
ICmpInst::Predicate &PredR) {
- // vectors are not (yet?) supported. Don't support pointers either.
- if (!LHS->getOperand(0)->getType()->isIntegerTy() ||
- !RHS->getOperand(0)->getType()->isIntegerTy())
+ // Don't allow pointers. Splat vectors are fine.
+ if (!LHS->getOperand(0)->getType()->isIntOrIntVectorTy() ||
+ !RHS->getOperand(0)->getType()->isIntOrIntVectorTy())
return None;
// Here comes the tricky part:
@@ -367,9 +368,9 @@ getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
} else {
return None;
}
+
+ assert(Ok && "Failed to find AND on the right side of the RHS icmp.");
}
- if (!Ok)
- return None;
if (L11 == A) {
B = L12;
@@ -619,8 +620,8 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
// Remaining cases assume at least that B and D are constant, and depend on
// their actual values. This isn't strictly necessary, just a "handle the
// easy cases for now" decision.
- ConstantInt *BCst, *DCst;
- if (!match(B, m_ConstantInt(BCst)) || !match(D, m_ConstantInt(DCst)))
+ const APInt *ConstB, *ConstD;
+ if (!match(B, m_APInt(ConstB)) || !match(D, m_APInt(ConstD)))
return nullptr;
if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) {
@@ -629,11 +630,10 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
// -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0)
// Only valid if one of the masks is a superset of the other (check "B&D" is
// the same as either B or D).
- APInt NewMask = BCst->getValue() & DCst->getValue();
-
- if (NewMask == BCst->getValue())
+ APInt NewMask = *ConstB & *ConstD;
+ if (NewMask == *ConstB)
return LHS;
- else if (NewMask == DCst->getValue())
+ else if (NewMask == *ConstD)
return RHS;
}
@@ -642,11 +642,10 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
// -> (icmp ne (A & B), A) or (icmp ne (A & D), A)
// Only valid if one of the masks is a superset of the other (check "B|D" is
// the same as either B or D).
- APInt NewMask = BCst->getValue() | DCst->getValue();
-
- if (NewMask == BCst->getValue())
+ APInt NewMask = *ConstB | *ConstD;
+ if (NewMask == *ConstB)
return LHS;
- else if (NewMask == DCst->getValue())
+ else if (NewMask == *ConstD)
return RHS;
}
@@ -661,23 +660,21 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
// We can't simply use C and E because we might actually handle
// (icmp ne (A & B), B) & (icmp eq (A & D), D)
// with B and D, having a single bit set.
- ConstantInt *CCst, *ECst;
- if (!match(C, m_ConstantInt(CCst)) || !match(E, m_ConstantInt(ECst)))
+ const APInt *OldConstC, *OldConstE;
+ if (!match(C, m_APInt(OldConstC)) || !match(E, m_APInt(OldConstE)))
return nullptr;
- if (PredL != NewCC)
- CCst = cast<ConstantInt>(ConstantExpr::getXor(BCst, CCst));
- if (PredR != NewCC)
- ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst));
+
+ const APInt ConstC = PredL != NewCC ? *ConstB ^ *OldConstC : *OldConstC;
+ const APInt ConstE = PredR != NewCC ? *ConstD ^ *OldConstE : *OldConstE;
// If there is a conflict, we should actually return a false for the
// whole construct.
- if (((BCst->getValue() & DCst->getValue()) &
- (CCst->getValue() ^ ECst->getValue())).getBoolValue())
+ if (((*ConstB & *ConstD) & (ConstC ^ ConstE)).getBoolValue())
return ConstantInt::get(LHS->getType(), !IsAnd);
Value *NewOr1 = Builder.CreateOr(B, D);
- Value *NewOr2 = ConstantExpr::getOr(CCst, ECst);
Value *NewAnd = Builder.CreateAnd(A, NewOr1);
+ Constant *NewOr2 = ConstantInt::get(A->getType(), ConstC | ConstE);
return Builder.CreateICmp(NewCC, NewAnd, NewOr2);
}
@@ -777,20 +774,6 @@ foldAndOrOfEqualityCmpsWithConstants(ICmpInst *LHS, ICmpInst *RHS,
return Builder.CreateICmp(Pred, Or, ConstantInt::get(X->getType(), *C2));
}
- // Special case: get the ordering right when the values wrap around zero.
- // Ie, we assumed the constants were unsigned when swapping earlier.
- if (C1->isNullValue() && C2->isAllOnesValue())
- std::swap(C1, C2);
-
- if (*C1 == *C2 - 1) {
- // (X == 13 || X == 14) --> X - 13 <=u 1
- // (X != 13 && X != 14) --> X - 13 >u 1
- // An 'add' is the canonical IR form, so favor that over a 'sub'.
- Value *Add = Builder.CreateAdd(X, ConstantInt::get(X->getType(), -(*C1)));
- auto NewPred = JoinedByAnd ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_ULE;
- return Builder.CreateICmp(NewPred, Add, ConstantInt::get(X->getType(), 1));
- }
-
return nullptr;
}
@@ -923,7 +906,7 @@ static Value *foldSignedTruncationCheck(ICmpInst *ICmp0, ICmpInst *ICmp1,
if (!tryToDecompose(OtherICmp, X0, UnsetBitsMask))
return nullptr;
- assert(!UnsetBitsMask.isNullValue() && "empty mask makes no sense.");
+ assert(!UnsetBitsMask.isZero() && "empty mask makes no sense.");
// Are they working on the same value?
Value *X;
@@ -1113,8 +1096,8 @@ static Value *extractIntPart(const IntPart &P, IRBuilderBase &Builder) {
/// (icmp eq X0, Y0) & (icmp eq X1, Y1) -> icmp eq X01, Y01
/// (icmp ne X0, Y0) | (icmp ne X1, Y1) -> icmp ne X01, Y01
/// where X0, X1 and Y0, Y1 are adjacent parts extracted from an integer.
-static Value *foldEqOfParts(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd,
- InstCombiner::BuilderTy &Builder) {
+Value *InstCombinerImpl::foldEqOfParts(ICmpInst *Cmp0, ICmpInst *Cmp1,
+ bool IsAnd) {
if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
return nullptr;
@@ -1202,6 +1185,51 @@ static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1,
return Builder.CreateBinOp(Logic.getOpcode(), Cmp0, SubstituteCmp);
}
+/// Fold (icmp Pred1 V1, C1) & (icmp Pred2 V2, C2)
+/// or (icmp Pred1 V1, C1) | (icmp Pred2 V2, C2)
+/// into a single comparison using range-based reasoning.
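+/// For instance (an illustrative sketch, not an exhaustive description):
+///   (icmp ult X, 10) & (icmp ugt X, 3)
+/// describes the range [4, 10) and is emitted as
+///   icmp ult (X + -4), 6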
+static Value *foldAndOrOfICmpsUsingRanges(
+ ICmpInst::Predicate Pred1, Value *V1, const APInt &C1,
+ ICmpInst::Predicate Pred2, Value *V2, const APInt &C2,
+ IRBuilderBase &Builder, bool IsAnd) {
+ // Look through add of a constant offset on V1, V2, or both operands. This
+ // allows us to interpret the V + C' < C'' range idiom into a proper range.
+ const APInt *Offset1 = nullptr, *Offset2 = nullptr;
+ if (V1 != V2) {
+ Value *X;
+ if (match(V1, m_Add(m_Value(X), m_APInt(Offset1))))
+ V1 = X;
+ if (match(V2, m_Add(m_Value(X), m_APInt(Offset2))))
+ V2 = X;
+ }
+
+ if (V1 != V2)
+ return nullptr;
+
+ ConstantRange CR1 = ConstantRange::makeExactICmpRegion(Pred1, C1);
+ if (Offset1)
+ CR1 = CR1.subtract(*Offset1);
+
+ ConstantRange CR2 = ConstantRange::makeExactICmpRegion(Pred2, C2);
+ if (Offset2)
+ CR2 = CR2.subtract(*Offset2);
+
+ Optional<ConstantRange> CR =
+ IsAnd ? CR1.exactIntersectWith(CR2) : CR1.exactUnionWith(CR2);
+ if (!CR)
+ return nullptr;
+
+ CmpInst::Predicate NewPred;
+ APInt NewC, Offset;
+ CR->getEquivalentICmp(NewPred, NewC, Offset);
+
+ Type *Ty = V1->getType();
+ Value *NewV = V1;
+ if (Offset != 0)
+ NewV = Builder.CreateAdd(NewV, ConstantInt::get(Ty, Offset));
+ return Builder.CreateICmp(NewPred, NewV, ConstantInt::get(Ty, NewC));
+}
+
/// Fold (icmp)&(icmp) if possible.
Value *InstCombinerImpl::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS,
BinaryOperator &And) {
@@ -1262,170 +1290,64 @@ Value *InstCombinerImpl::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS,
foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/true, Q, Builder))
return X;
- if (Value *X = foldEqOfParts(LHS, RHS, /*IsAnd=*/true, Builder))
+ if (Value *X = foldEqOfParts(LHS, RHS, /*IsAnd=*/true))
return X;
// This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0);
- ConstantInt *LHSC, *RHSC;
- if (!match(LHS->getOperand(1), m_ConstantInt(LHSC)) ||
- !match(RHS->getOperand(1), m_ConstantInt(RHSC)))
- return nullptr;
-
- if (LHSC == RHSC && PredL == PredR) {
- // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C)
- // where C is a power of 2 or
- // (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0)
- if ((PredL == ICmpInst::ICMP_ULT && LHSC->getValue().isPowerOf2()) ||
- (PredL == ICmpInst::ICMP_EQ && LHSC->isZero())) {
- Value *NewOr = Builder.CreateOr(LHS0, RHS0);
- return Builder.CreateICmp(PredL, NewOr, LHSC);
- }
+ // (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0)
+ // TODO: Remove this when foldLogOpOfMaskedICmps can handle undefs.
+ if (PredL == ICmpInst::ICMP_EQ && match(LHS->getOperand(1), m_ZeroInt()) &&
+ PredR == ICmpInst::ICMP_EQ && match(RHS->getOperand(1), m_ZeroInt()) &&
+ LHS0->getType() == RHS0->getType()) {
+ Value *NewOr = Builder.CreateOr(LHS0, RHS0);
+ return Builder.CreateICmp(PredL, NewOr,
+ Constant::getNullValue(NewOr->getType()));
}
+ const APInt *LHSC, *RHSC;
+ if (!match(LHS->getOperand(1), m_APInt(LHSC)) ||
+ !match(RHS->getOperand(1), m_APInt(RHSC)))
+ return nullptr;
+
// (trunc x) == C1 & (and x, CA) == C2 -> (and x, CA|CMAX) == C1|C2
// where CMAX is the all ones value for the truncated type,
// iff the lower bits of C2 and CA are zero.
if (PredL == ICmpInst::ICMP_EQ && PredL == PredR && LHS->hasOneUse() &&
RHS->hasOneUse()) {
Value *V;
- ConstantInt *AndC, *SmallC = nullptr, *BigC = nullptr;
+ const APInt *AndC, *SmallC = nullptr, *BigC = nullptr;
// (trunc x) == C1 & (and x, CA) == C2
// (and x, CA) == C2 & (trunc x) == C1
if (match(RHS0, m_Trunc(m_Value(V))) &&
- match(LHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) {
+ match(LHS0, m_And(m_Specific(V), m_APInt(AndC)))) {
SmallC = RHSC;
BigC = LHSC;
} else if (match(LHS0, m_Trunc(m_Value(V))) &&
- match(RHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) {
+ match(RHS0, m_And(m_Specific(V), m_APInt(AndC)))) {
SmallC = LHSC;
BigC = RHSC;
}
if (SmallC && BigC) {
- unsigned BigBitSize = BigC->getType()->getBitWidth();
- unsigned SmallBitSize = SmallC->getType()->getBitWidth();
+ unsigned BigBitSize = BigC->getBitWidth();
+ unsigned SmallBitSize = SmallC->getBitWidth();
// Check that the low bits are zero.
APInt Low = APInt::getLowBitsSet(BigBitSize, SmallBitSize);
- if ((Low & AndC->getValue()).isNullValue() &&
- (Low & BigC->getValue()).isNullValue()) {
- Value *NewAnd = Builder.CreateAnd(V, Low | AndC->getValue());
- APInt N = SmallC->getValue().zext(BigBitSize) | BigC->getValue();
- Value *NewVal = ConstantInt::get(AndC->getType()->getContext(), N);
+ if ((Low & *AndC).isZero() && (Low & *BigC).isZero()) {
+ Value *NewAnd = Builder.CreateAnd(V, Low | *AndC);
+ APInt N = SmallC->zext(BigBitSize) | *BigC;
+ Value *NewVal = ConstantInt::get(NewAnd->getType(), N);
return Builder.CreateICmp(PredL, NewAnd, NewVal);
}
}
}
- // From here on, we only handle:
- // (icmp1 A, C1) & (icmp2 A, C2) --> something simpler.
- if (LHS0 != RHS0)
- return nullptr;
-
- // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere.
- if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE ||
- PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE ||
- PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE ||
- PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE)
- return nullptr;
-
- // We can't fold (ugt x, C) & (sgt x, C2).
- if (!predicatesFoldable(PredL, PredR))
- return nullptr;
-
- // Ensure that the larger constant is on the RHS.
- bool ShouldSwap;
- if (CmpInst::isSigned(PredL) ||
- (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR)))
- ShouldSwap = LHSC->getValue().sgt(RHSC->getValue());
- else
- ShouldSwap = LHSC->getValue().ugt(RHSC->getValue());
-
- if (ShouldSwap) {
- std::swap(LHS, RHS);
- std::swap(LHSC, RHSC);
- std::swap(PredL, PredR);
- }
-
- // At this point, we know we have two icmp instructions
- // comparing a value against two constants and and'ing the result
- // together. Because of the above check, we know that we only have
- // icmp eq, icmp ne, icmp [su]lt, and icmp [SU]gt here. We also know
- // (from the icmp folding check above), that the two constants
- // are not equal and that the larger constant is on the RHS
- assert(LHSC != RHSC && "Compares not folded above?");
-
- switch (PredL) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_NE:
- switch (PredR) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_ULT:
- // (X != 13 & X u< 14) -> X < 13
- if (LHSC->getValue() == (RHSC->getValue() - 1))
- return Builder.CreateICmpULT(LHS0, LHSC);
- if (LHSC->isZero()) // (X != 0 & X u< C) -> X-1 u< C-1
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
- false, true);
- break; // (X != 13 & X u< 15) -> no change
- case ICmpInst::ICMP_SLT:
- // (X != 13 & X s< 14) -> X < 13
- if (LHSC->getValue() == (RHSC->getValue() - 1))
- return Builder.CreateICmpSLT(LHS0, LHSC);
- // (X != INT_MIN & X s< C) -> X-(INT_MIN+1) u< (C-(INT_MIN+1))
- if (LHSC->isMinValue(true))
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
- true, true);
- break; // (X != 13 & X s< 15) -> no change
- case ICmpInst::ICMP_NE:
- // Potential folds for this case should already be handled.
- break;
- }
- break;
- case ICmpInst::ICMP_UGT:
- switch (PredR) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_NE:
- // (X u> 13 & X != 14) -> X u> 14
- if (RHSC->getValue() == (LHSC->getValue() + 1))
- return Builder.CreateICmp(PredL, LHS0, RHSC);
- // X u> C & X != UINT_MAX -> (X-(C+1)) u< UINT_MAX-(C+1)
- if (RHSC->isMaxValue(false))
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
- false, true);
- break; // (X u> 13 & X != 15) -> no change
- case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) u< 1
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
- false, true);
- }
- break;
- case ICmpInst::ICMP_SGT:
- switch (PredR) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_NE:
- // (X s> 13 & X != 14) -> X s> 14
- if (RHSC->getValue() == (LHSC->getValue() + 1))
- return Builder.CreateICmp(PredL, LHS0, RHSC);
- // X s> C & X != INT_MAX -> (X-(C+1)) u< INT_MAX-(C+1)
- if (RHSC->isMaxValue(true))
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
- true, true);
- break; // (X s> 13 & X != 15) -> no change
- case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) u< 1
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), true,
- true);
- }
- break;
- }
-
- return nullptr;
+ return foldAndOrOfICmpsUsingRanges(PredL, LHS0, *LHSC, PredR, RHS0, *RHSC,
+ Builder, /* IsAnd */ true);
}
Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS,
@@ -1496,15 +1418,15 @@ static Instruction *reassociateFCmps(BinaryOperator &BO,
std::swap(Op0, Op1);
// Match inner binop and the predicate for combining 2 NAN checks into 1.
- BinaryOperator *BO1;
+ Value *BO10, *BO11;
FCmpInst::Predicate NanPred = Opcode == Instruction::And ? FCmpInst::FCMP_ORD
: FCmpInst::FCMP_UNO;
if (!match(Op0, m_FCmp(Pred, m_Value(X), m_AnyZeroFP())) || Pred != NanPred ||
- !match(Op1, m_BinOp(BO1)) || BO1->getOpcode() != Opcode)
+ !match(Op1, m_BinOp(Opcode, m_Value(BO10), m_Value(BO11))))
return nullptr;
// The inner logic op must have a matching fcmp operand.
- Value *BO10 = BO1->getOperand(0), *BO11 = BO1->getOperand(1), *Y;
+ Value *Y;
if (!match(BO10, m_FCmp(Pred, m_Value(Y), m_AnyZeroFP())) ||
Pred != NanPred || X->getType() != Y->getType())
std::swap(BO10, BO11);
@@ -1524,27 +1446,42 @@ static Instruction *reassociateFCmps(BinaryOperator &BO,
return BinaryOperator::Create(Opcode, NewFCmp, BO11);
}
-/// Match De Morgan's Laws:
+/// Match variations of De Morgan's Laws:
/// (~A & ~B) == (~(A | B))
/// (~A | ~B) == (~(A & B))
static Instruction *matchDeMorgansLaws(BinaryOperator &I,
InstCombiner::BuilderTy &Builder) {
- auto Opcode = I.getOpcode();
+ const Instruction::BinaryOps Opcode = I.getOpcode();
assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&
"Trying to match De Morgan's Laws with something other than and/or");
// Flip the logic operation.
- Opcode = (Opcode == Instruction::And) ? Instruction::Or : Instruction::And;
+ const Instruction::BinaryOps FlippedOpcode =
+ (Opcode == Instruction::And) ? Instruction::Or : Instruction::And;
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
Value *A, *B;
- if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) &&
- match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) &&
+ if (match(Op0, m_OneUse(m_Not(m_Value(A)))) &&
+ match(Op1, m_OneUse(m_Not(m_Value(B)))) &&
!InstCombiner::isFreeToInvert(A, A->hasOneUse()) &&
!InstCombiner::isFreeToInvert(B, B->hasOneUse())) {
- Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan");
+ Value *AndOr =
+ Builder.CreateBinOp(FlippedOpcode, A, B, I.getName() + ".demorgan");
return BinaryOperator::CreateNot(AndOr);
}
+ // The 'not' ops may require reassociation.
+ // (A & ~B) & ~C --> A & ~(B | C)
+ // (~B & A) & ~C --> A & ~(B | C)
+ // (A | ~B) | ~C --> A | ~(B & C)
+ // (~B | A) | ~C --> A | ~(B & C)
+ Value *C;
+ if (match(Op0, m_OneUse(m_c_BinOp(Opcode, m_Value(A), m_Not(m_Value(B))))) &&
+ match(Op1, m_Not(m_Value(C)))) {
+ Value *FlippedBO = Builder.CreateBinOp(FlippedOpcode, B, C);
+ return BinaryOperator::Create(Opcode, A, Builder.CreateNot(FlippedBO));
+ }
+
return nullptr;
}
@@ -1778,6 +1715,72 @@ Instruction *InstCombinerImpl::narrowMaskedBinOp(BinaryOperator &And) {
return new ZExtInst(Builder.CreateAnd(NewBO, X), Ty);
}
+/// Try folding relatively complex patterns for both And and Or operations
+/// with all And and Or swapped.
+static Instruction *foldComplexAndOrPatterns(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ const Instruction::BinaryOps Opcode = I.getOpcode();
+ assert(Opcode == Instruction::And || Opcode == Instruction::Or);
+
+ // Flip the logic operation.
+ const Instruction::BinaryOps FlippedOpcode =
+ (Opcode == Instruction::And) ? Instruction::Or : Instruction::And;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Value *A, *B, *C;
+
+ // (~(A | B) & C) | ... --> ...
+ // (~(A & B) | C) & ... --> ...
+ // TODO: The one-use checks are conservative. We only need to check that the
+ // total number of multiply-used values does not exceed the reduction in the
+ // number of operations.
+ if (match(Op0, m_c_BinOp(FlippedOpcode,
+ m_Not(m_BinOp(Opcode, m_Value(A), m_Value(B))),
+ m_Value(C)))) {
+ // (~(A | B) & C) | (~(A | C) & B) --> (B ^ C) & ~A
+ // (~(A & B) | C) & (~(A & C) | B) --> ~((B ^ C) & A)
+ if (match(Op1,
+ m_OneUse(m_c_BinOp(FlippedOpcode,
+ m_OneUse(m_Not(m_c_BinOp(Opcode, m_Specific(A),
+ m_Specific(C)))),
+ m_Specific(B))))) {
+ Value *Xor = Builder.CreateXor(B, C);
+ return (Opcode == Instruction::Or)
+ ? BinaryOperator::CreateAnd(Xor, Builder.CreateNot(A))
+ : BinaryOperator::CreateNot(Builder.CreateAnd(Xor, A));
+ }
+
+ // (~(A | B) & C) | (~(B | C) & A) --> (A ^ C) & ~B
+ // (~(A & B) | C) & (~(B & C) | A) --> ~((A ^ C) & B)
+ if (match(Op1,
+ m_OneUse(m_c_BinOp(FlippedOpcode,
+ m_OneUse(m_Not(m_c_BinOp(Opcode, m_Specific(B),
+ m_Specific(C)))),
+ m_Specific(A))))) {
+ Value *Xor = Builder.CreateXor(A, C);
+ return (Opcode == Instruction::Or)
+ ? BinaryOperator::CreateAnd(Xor, Builder.CreateNot(B))
+ : BinaryOperator::CreateNot(Builder.CreateAnd(Xor, B));
+ }
+
+ // (~(A | B) & C) | ~(A | C) --> ~((B & C) | A)
+ // (~(A & B) | C) & ~(A & C) --> ~((B | C) & A)
+ if (match(Op1, m_OneUse(m_Not(m_OneUse(
+ m_c_BinOp(Opcode, m_Specific(A), m_Specific(C)))))))
+ return BinaryOperator::CreateNot(Builder.CreateBinOp(
+ Opcode, Builder.CreateBinOp(FlippedOpcode, B, C), A));
+
+ // (~(A | B) & C) | ~(B | C) --> ~((A & C) | B)
+ // (~(A & B) | C) & ~(B & C) --> ~((A | C) & B)
+ if (match(Op1, m_OneUse(m_Not(m_OneUse(
+ m_c_BinOp(Opcode, m_Specific(B), m_Specific(C)))))))
+ return BinaryOperator::CreateNot(Builder.CreateBinOp(
+ Opcode, Builder.CreateBinOp(FlippedOpcode, A, C), B));
+ }
+
+ return nullptr;
+}
+
// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
// here. We should standardize that construct where it is needed or choose some
// other way to ensure that commutated variants of patterns are not missed.
@@ -1803,6 +1806,9 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
if (Instruction *Xor = foldAndToXor(I, Builder))
return Xor;
+ if (Instruction *X = foldComplexAndOrPatterns(I, Builder))
+ return X;
+
// (A|B)&(A|C) -> A|(B&C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
@@ -1883,7 +1889,7 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
// (X + AddC) & LowMaskC --> X & LowMaskC
unsigned Ctlz = C->countLeadingZeros();
APInt LowMask(APInt::getLowBitsSet(Width, Width - Ctlz));
- if ((*AddC & LowMask).isNullValue())
+ if ((*AddC & LowMask).isZero())
return BinaryOperator::CreateAnd(X, Op1);
// If we are masking the result of the add down to exactly one bit and
@@ -1896,44 +1902,37 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
return BinaryOperator::CreateXor(NewAnd, Op1);
}
}
- }
- ConstantInt *AndRHS;
- if (match(Op1, m_ConstantInt(AndRHS))) {
- const APInt &AndRHSMask = AndRHS->getValue();
-
- // Optimize a variety of ((val OP C1) & C2) combinations...
- if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
- // ((C1 OP zext(X)) & C2) -> zext((C1-X) & C2) if C2 fits in the bitwidth
- // of X and OP behaves well when given trunc(C1) and X.
- // TODO: Do this for vectors by using m_APInt instead of m_ConstantInt.
- switch (Op0I->getOpcode()) {
- default:
- break;
+ // ((C1 OP zext(X)) & C2) -> zext((C1 OP X) & C2) if C2 fits in the
+ // bitwidth of X and OP behaves well when given trunc(C1) and X.
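+ // For instance (illustrative), ((zext i8 X to i32) + 300) & 15 becomes
+ // zext ((X + 44) & 15) to i32, since 300 truncates to 44 in i8 and 15 fits
+ // in 8 bits.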
+ auto isSuitableBinOpcode = [](BinaryOperator *B) {
+ switch (B->getOpcode()) {
case Instruction::Xor:
case Instruction::Or:
case Instruction::Mul:
case Instruction::Add:
case Instruction::Sub:
- Value *X;
- ConstantInt *C1;
- // TODO: The one use restrictions could be relaxed a little if the AND
- // is going to be removed.
- if (match(Op0I, m_OneUse(m_c_BinOp(m_OneUse(m_ZExt(m_Value(X))),
- m_ConstantInt(C1))))) {
- if (AndRHSMask.isIntN(X->getType()->getScalarSizeInBits())) {
- auto *TruncC1 = ConstantExpr::getTrunc(C1, X->getType());
- Value *BinOp;
- Value *Op0LHS = Op0I->getOperand(0);
- if (isa<ZExtInst>(Op0LHS))
- BinOp = Builder.CreateBinOp(Op0I->getOpcode(), X, TruncC1);
- else
- BinOp = Builder.CreateBinOp(Op0I->getOpcode(), TruncC1, X);
- auto *TruncC2 = ConstantExpr::getTrunc(AndRHS, X->getType());
- auto *And = Builder.CreateAnd(BinOp, TruncC2);
- return new ZExtInst(And, Ty);
- }
- }
+ return true;
+ default:
+ return false;
+ }
+ };
+ BinaryOperator *BO;
+ if (match(Op0, m_OneUse(m_BinOp(BO))) && isSuitableBinOpcode(BO)) {
+ Value *X;
+ const APInt *C1;
+ // TODO: The one-use restrictions could be relaxed a little if the AND
+ // is going to be removed.
+ if (match(BO, m_c_BinOp(m_OneUse(m_ZExt(m_Value(X))), m_APInt(C1))) &&
+ C->isIntN(X->getType()->getScalarSizeInBits())) {
+ unsigned XWidth = X->getType()->getScalarSizeInBits();
+ Constant *TruncC1 = ConstantInt::get(X->getType(), C1->trunc(XWidth));
+ Value *BinOp = isa<ZExtInst>(BO->getOperand(0))
+ ? Builder.CreateBinOp(BO->getOpcode(), X, TruncC1)
+ : Builder.CreateBinOp(BO->getOpcode(), TruncC1, X);
+ Constant *TruncC = ConstantInt::get(X->getType(), C->trunc(XWidth));
+ Value *And = Builder.CreateAnd(BinOp, TruncC);
+ return new ZExtInst(And, Ty);
}
}
}
@@ -2071,13 +2070,13 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
A->getType()->isIntOrIntVectorTy(1))
return SelectInst::Create(A, Op0, Constant::getNullValue(Ty));
- // and(ashr(subNSW(Y, X), ScalarSizeInBits(Y)-1), X) --> X s> Y ? X : 0.
- if (match(&I, m_c_And(m_OneUse(m_AShr(
- m_NSWSub(m_Value(Y), m_Value(X)),
- m_SpecificInt(Ty->getScalarSizeInBits() - 1))),
- m_Deferred(X)))) {
- Value *NewICmpInst = Builder.CreateICmpSGT(X, Y);
- return SelectInst::Create(NewICmpInst, X, ConstantInt::getNullValue(Ty));
+ // (iN X s>> (N-1)) & Y --> (X s< 0) ? Y : 0
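+ // e.g. for i32: (X s>> 31) & Y --> (X s< 0) ? Y : 0, because an arithmetic
+ // shift by the bit width minus one yields either all-ones or zero.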
+ unsigned FullShift = Ty->getScalarSizeInBits() - 1;
+ if (match(&I, m_c_And(m_OneUse(m_AShr(m_Value(X), m_SpecificInt(FullShift))),
+ m_Value(Y)))) {
+ Constant *Zero = ConstantInt::getNullValue(Ty);
+ Value *Cmp = Builder.CreateICmpSLT(X, Zero, "isneg");
+ return SelectInst::Create(Cmp, Y, Zero);
}
// (~x) & y --> ~(x | (~y)) iff that gets rid of inversions
@@ -2284,28 +2283,38 @@ static bool areInverseVectorBitmasks(Constant *C1, Constant *C2) {
/// vector composed of all-zeros or all-ones values and is the bitwise 'not' of
/// B, it can be used as the condition operand of a select instruction.
Value *InstCombinerImpl::getSelectCondition(Value *A, Value *B) {
- // Step 1: We may have peeked through bitcasts in the caller.
+ // We may have peeked through bitcasts in the caller.
// Exit immediately if we don't have (vector) integer types.
Type *Ty = A->getType();
if (!Ty->isIntOrIntVectorTy() || !B->getType()->isIntOrIntVectorTy())
return nullptr;
- // Step 2: We need 0 or all-1's bitmasks.
- if (ComputeNumSignBits(A) != Ty->getScalarSizeInBits())
- return nullptr;
-
- // Step 3: If B is the 'not' value of A, we have our answer.
- if (match(A, m_Not(m_Specific(B)))) {
+ // If A is the 'not' operand of B and has enough sign bits, we have our answer.
+ if (match(B, m_Not(m_Specific(A)))) {
// If these are scalars or vectors of i1, A can be used directly.
if (Ty->isIntOrIntVectorTy(1))
return A;
- return Builder.CreateTrunc(A, CmpInst::makeCmpResultType(Ty));
+
+ // If we look through a vector bitcast, the caller will bitcast the operands
+ // to match the condition's number of bits (N x i1).
+ // To make this poison-safe, disallow a bitcast from wide elements to narrow
+ // elements, since that could introduce poison in lanes where it was not
+ // present in the original code.
+ A = peekThroughBitcast(A);
+ if (A->getType()->isIntOrIntVectorTy()) {
+ unsigned NumSignBits = ComputeNumSignBits(A);
+ if (NumSignBits == A->getType()->getScalarSizeInBits() &&
+ NumSignBits <= Ty->getScalarSizeInBits())
+ return Builder.CreateTrunc(A, CmpInst::makeCmpResultType(A->getType()));
+ }
+ return nullptr;
}
// If both operands are constants, see if the constants are inverse bitmasks.
Constant *AConst, *BConst;
if (match(A, m_Constant(AConst)) && match(B, m_Constant(BConst)))
- if (AConst == ConstantExpr::getNot(BConst))
+ if (AConst == ConstantExpr::getNot(BConst) &&
+ ComputeNumSignBits(A) == Ty->getScalarSizeInBits())
return Builder.CreateZExtOrTrunc(A, CmpInst::makeCmpResultType(Ty));
// Look for more complex patterns. The 'not' op may be hidden behind various
@@ -2349,10 +2358,17 @@ Value *InstCombinerImpl::matchSelectFromAndOr(Value *A, Value *C, Value *B,
B = peekThroughBitcast(B, true);
if (Value *Cond = getSelectCondition(A, B)) {
// ((bc Cond) & C) | ((bc ~Cond) & D) --> bc (select Cond, (bc C), (bc D))
+ // If this is a vector, we may need to cast to match the condition's length.
// The bitcasts will either all exist or all not exist. The builder will
// not create unnecessary casts if the types already match.
- Value *BitcastC = Builder.CreateBitCast(C, A->getType());
- Value *BitcastD = Builder.CreateBitCast(D, A->getType());
+ Type *SelTy = A->getType();
+ if (auto *VecTy = dyn_cast<VectorType>(Cond->getType())) {
+ unsigned Elts = VecTy->getElementCount().getKnownMinValue();
+ Type *EltTy = Builder.getIntNTy(SelTy->getPrimitiveSizeInBits() / Elts);
+ SelTy = VectorType::get(EltTy, VecTy->getElementCount());
+ }
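+    // Editor's illustration of the adjustment above: if A is an i32 that was
+    // bitcast from a <4 x i8> sign-mask, Cond comes back as <4 x i1>, so
+    // SelTy becomes <4 x i8>; C and D are bitcast to that type for the
+    // select, and the result is bitcast back to OrigType afterwards.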
+ Value *BitcastC = Builder.CreateBitCast(C, SelTy);
+ Value *BitcastD = Builder.CreateBitCast(D, SelTy);
Value *Select = Builder.CreateSelect(Cond, BitcastC, BitcastD);
return Builder.CreateBitCast(Select, OrigType);
}
@@ -2374,8 +2390,9 @@ Value *InstCombinerImpl::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0);
Value *LHS1 = LHS->getOperand(1), *RHS1 = RHS->getOperand(1);
- auto *LHSC = dyn_cast<ConstantInt>(LHS1);
- auto *RHSC = dyn_cast<ConstantInt>(RHS1);
+ const APInt *LHSC = nullptr, *RHSC = nullptr;
+ match(LHS1, m_APInt(LHSC));
+ match(RHS1, m_APInt(RHSC));
// Fold (icmp ult/ule (A + C1), C3) | (icmp ult/ule (A + C2), C3)
// --> (icmp ult/ule ((A & ~(C1 ^ C2)) + max(C1, C2)), C3)
@@ -2389,40 +2406,41 @@ Value *InstCombinerImpl::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
// This implies all values in the two ranges differ by exactly one bit.
if ((PredL == ICmpInst::ICMP_ULT || PredL == ICmpInst::ICMP_ULE) &&
PredL == PredR && LHSC && RHSC && LHS->hasOneUse() && RHS->hasOneUse() &&
- LHSC->getType() == RHSC->getType() &&
- LHSC->getValue() == (RHSC->getValue())) {
+ LHSC->getBitWidth() == RHSC->getBitWidth() && *LHSC == *RHSC) {
Value *AddOpnd;
- ConstantInt *LAddC, *RAddC;
- if (match(LHS0, m_Add(m_Value(AddOpnd), m_ConstantInt(LAddC))) &&
- match(RHS0, m_Add(m_Specific(AddOpnd), m_ConstantInt(RAddC))) &&
- LAddC->getValue().ugt(LHSC->getValue()) &&
- RAddC->getValue().ugt(LHSC->getValue())) {
+ const APInt *LAddC, *RAddC;
+ if (match(LHS0, m_Add(m_Value(AddOpnd), m_APInt(LAddC))) &&
+ match(RHS0, m_Add(m_Specific(AddOpnd), m_APInt(RAddC))) &&
+ LAddC->ugt(*LHSC) && RAddC->ugt(*LHSC)) {
- APInt DiffC = LAddC->getValue() ^ RAddC->getValue();
+ APInt DiffC = *LAddC ^ *RAddC;
if (DiffC.isPowerOf2()) {
- ConstantInt *MaxAddC = nullptr;
- if (LAddC->getValue().ult(RAddC->getValue()))
+ const APInt *MaxAddC = nullptr;
+ if (LAddC->ult(*RAddC))
MaxAddC = RAddC;
else
MaxAddC = LAddC;
- APInt RRangeLow = -RAddC->getValue();
- APInt RRangeHigh = RRangeLow + LHSC->getValue();
- APInt LRangeLow = -LAddC->getValue();
- APInt LRangeHigh = LRangeLow + LHSC->getValue();
+ APInt RRangeLow = -*RAddC;
+ APInt RRangeHigh = RRangeLow + *LHSC;
+ APInt LRangeLow = -*LAddC;
+ APInt LRangeHigh = LRangeLow + *LHSC;
APInt LowRangeDiff = RRangeLow ^ LRangeLow;
APInt HighRangeDiff = RRangeHigh ^ LRangeHigh;
APInt RangeDiff = LRangeLow.sgt(RRangeLow) ? LRangeLow - RRangeLow
: RRangeLow - LRangeLow;
if (LowRangeDiff.isPowerOf2() && LowRangeDiff == HighRangeDiff &&
- RangeDiff.ugt(LHSC->getValue())) {
- Value *MaskC = ConstantInt::get(LAddC->getType(), ~DiffC);
+ RangeDiff.ugt(*LHSC)) {
+ Type *Ty = AddOpnd->getType();
+ Value *MaskC = ConstantInt::get(Ty, ~DiffC);
Value *NewAnd = Builder.CreateAnd(AddOpnd, MaskC);
- Value *NewAdd = Builder.CreateAdd(NewAnd, MaxAddC);
- return Builder.CreateICmp(LHS->getPredicate(), NewAdd, LHSC);
+ Value *NewAdd = Builder.CreateAdd(NewAnd,
+ ConstantInt::get(Ty, *MaxAddC));
+ return Builder.CreateICmp(LHS->getPredicate(), NewAdd,
+ ConstantInt::get(Ty, *LHSC));
}
}
}
@@ -2496,14 +2514,13 @@ Value *InstCombinerImpl::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/false, Q, Builder))
return X;
- if (Value *X = foldEqOfParts(LHS, RHS, /*IsAnd=*/false, Builder))
+ if (Value *X = foldEqOfParts(LHS, RHS, /*IsAnd=*/false))
return X;
// (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0)
- // TODO: Remove this when foldLogOpOfMaskedICmps can handle vectors.
- if (PredL == ICmpInst::ICMP_NE && match(LHS1, m_Zero()) &&
- PredR == ICmpInst::ICMP_NE && match(RHS1, m_Zero()) &&
- LHS0->getType()->isIntOrIntVectorTy() &&
+ // TODO: Remove this when foldLogOpOfMaskedICmps can handle undefs.
+ if (PredL == ICmpInst::ICMP_NE && match(LHS1, m_ZeroInt()) &&
+ PredR == ICmpInst::ICMP_NE && match(RHS1, m_ZeroInt()) &&
LHS0->getType() == RHS0->getType()) {
Value *NewOr = Builder.CreateOr(LHS0, RHS0);
return Builder.CreateICmp(PredL, NewOr,
@@ -2514,114 +2531,8 @@ Value *InstCombinerImpl::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
if (!LHSC || !RHSC)
return nullptr;
- // (icmp ult (X + CA), C1) | (icmp eq X, C2) -> (icmp ule (X + CA), C1)
- // iff C2 + CA == C1.
- if (PredL == ICmpInst::ICMP_ULT && PredR == ICmpInst::ICMP_EQ) {
- ConstantInt *AddC;
- if (match(LHS0, m_Add(m_Specific(RHS0), m_ConstantInt(AddC))))
- if (RHSC->getValue() + AddC->getValue() == LHSC->getValue())
- return Builder.CreateICmpULE(LHS0, LHSC);
- }
-
- // From here on, we only handle:
- // (icmp1 A, C1) | (icmp2 A, C2) --> something simpler.
- if (LHS0 != RHS0)
- return nullptr;
-
- // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere.
- if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE ||
- PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE ||
- PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE ||
- PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE)
- return nullptr;
-
- // We can't fold (ugt x, C) | (sgt x, C2).
- if (!predicatesFoldable(PredL, PredR))
- return nullptr;
-
- // Ensure that the larger constant is on the RHS.
- bool ShouldSwap;
- if (CmpInst::isSigned(PredL) ||
- (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR)))
- ShouldSwap = LHSC->getValue().sgt(RHSC->getValue());
- else
- ShouldSwap = LHSC->getValue().ugt(RHSC->getValue());
-
- if (ShouldSwap) {
- std::swap(LHS, RHS);
- std::swap(LHSC, RHSC);
- std::swap(PredL, PredR);
- }
-
- // At this point, we know we have two icmp instructions
- // comparing a value against two constants and or'ing the result
- // together. Because of the above check, we know that we only have
- // ICMP_EQ, ICMP_NE, ICMP_LT, and ICMP_GT here. We also know (from the
- // icmp folding check above), that the two constants are not
- // equal.
- assert(LHSC != RHSC && "Compares not folded above?");
-
- switch (PredL) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ:
- switch (PredR) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ:
- // Potential folds for this case should already be handled.
- break;
- case ICmpInst::ICMP_UGT:
- // (X == 0 || X u> C) -> (X-1) u>= C
- if (LHSC->isMinValue(false))
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue() + 1,
- false, false);
- // (X == 13 | X u> 14) -> no change
- break;
- case ICmpInst::ICMP_SGT:
- // (X == INT_MIN || X s> C) -> (X-(INT_MIN+1)) u>= C-INT_MIN
- if (LHSC->isMinValue(true))
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue() + 1,
- true, false);
- // (X == 13 | X s> 14) -> no change
- break;
- }
- break;
- case ICmpInst::ICMP_ULT:
- switch (PredR) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change
- // (X u< C || X == UINT_MAX) => (X-C) u>= UINT_MAX-C
- if (RHSC->isMaxValue(false))
- return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue(),
- false, false);
- break;
- case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2
- assert(!RHSC->isMaxValue(false) && "Missed icmp simplification");
- return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1,
- false, false);
- }
- break;
- case ICmpInst::ICMP_SLT:
- switch (PredR) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ:
- // (X s< C || X == INT_MAX) => (X-C) u>= INT_MAX-C
- if (RHSC->isMaxValue(true))
- return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue(),
- true, false);
- // (X s< 13 | X == 14) -> no change
- break;
- case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) u> 2
- assert(!RHSC->isMaxValue(true) && "Missed icmp simplification");
- return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1, true,
- false);
- }
- break;
- }
- return nullptr;
+ return foldAndOrOfICmpsUsingRanges(PredL, LHS0, *LHSC, PredR, RHS0, *RHSC,
+ Builder, /* IsAnd */ false);
}
// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
@@ -2647,6 +2558,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
if (Instruction *Xor = foldOrToXor(I, Builder))
return Xor;
+ if (Instruction *X = foldComplexAndOrPatterns(I, Builder))
+ return X;
+
// (A&B)|(A&C) -> A&(B|C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
@@ -2684,69 +2598,63 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
Value *X, *Y;
const APInt *CV;
if (match(&I, m_c_Or(m_OneUse(m_Xor(m_Value(X), m_APInt(CV))), m_Value(Y))) &&
- !CV->isAllOnesValue() && MaskedValueIsZero(Y, *CV, 0, &I)) {
+ !CV->isAllOnes() && MaskedValueIsZero(Y, *CV, 0, &I)) {
// (X ^ C) | Y -> (X | Y) ^ C iff Y & C == 0
// The check for a 'not' op is for efficiency (if Y is known zero --> ~X).
Value *Or = Builder.CreateOr(X, Y);
return BinaryOperator::CreateXor(Or, ConstantInt::get(I.getType(), *CV));
}
- // (A & C)|(B & D)
+ // (A & C) | (B & D)
Value *A, *B, *C, *D;
if (match(Op0, m_And(m_Value(A), m_Value(C))) &&
match(Op1, m_And(m_Value(B), m_Value(D)))) {
- // (A & C1)|(B & C2)
- ConstantInt *C1, *C2;
- if (match(C, m_ConstantInt(C1)) && match(D, m_ConstantInt(C2))) {
- Value *V1 = nullptr, *V2 = nullptr;
- if ((C1->getValue() & C2->getValue()).isNullValue()) {
- // ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2)
- // iff (C1&C2) == 0 and (N&~C1) == 0
- if (match(A, m_Or(m_Value(V1), m_Value(V2))) &&
- ((V1 == B &&
- MaskedValueIsZero(V2, ~C1->getValue(), 0, &I)) || // (V|N)
- (V2 == B &&
- MaskedValueIsZero(V1, ~C1->getValue(), 0, &I)))) // (N|V)
- return BinaryOperator::CreateAnd(A,
- Builder.getInt(C1->getValue()|C2->getValue()));
- // Or commutes, try both ways.
- if (match(B, m_Or(m_Value(V1), m_Value(V2))) &&
- ((V1 == A &&
- MaskedValueIsZero(V2, ~C2->getValue(), 0, &I)) || // (V|N)
- (V2 == A &&
- MaskedValueIsZero(V1, ~C2->getValue(), 0, &I)))) // (N|V)
- return BinaryOperator::CreateAnd(B,
- Builder.getInt(C1->getValue()|C2->getValue()));
-
- // ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2)
- // iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0.
- ConstantInt *C3 = nullptr, *C4 = nullptr;
- if (match(A, m_Or(m_Value(V1), m_ConstantInt(C3))) &&
- (C3->getValue() & ~C1->getValue()).isNullValue() &&
- match(B, m_Or(m_Specific(V1), m_ConstantInt(C4))) &&
- (C4->getValue() & ~C2->getValue()).isNullValue()) {
- V2 = Builder.CreateOr(V1, ConstantExpr::getOr(C3, C4), "bitfield");
- return BinaryOperator::CreateAnd(V2,
- Builder.getInt(C1->getValue()|C2->getValue()));
- }
- }
- if (C1->getValue() == ~C2->getValue()) {
- Value *X;
-
- // ((X|B)&C1)|(B&C2) -> (X&C1) | B iff C1 == ~C2
+ // (A & C0) | (B & C1)
+ const APInt *C0, *C1;
+ if (match(C, m_APInt(C0)) && match(D, m_APInt(C1))) {
+ Value *X;
+ if (*C0 == ~*C1) {
+ // ((X | B) & MaskC) | (B & ~MaskC) -> (X & MaskC) | B
if (match(A, m_c_Or(m_Value(X), m_Specific(B))))
- return BinaryOperator::CreateOr(Builder.CreateAnd(X, C1), B);
- // (A&C2)|((X|A)&C1) -> (X&C2) | A iff C1 == ~C2
+ return BinaryOperator::CreateOr(Builder.CreateAnd(X, *C0), B);
+ // (A & MaskC) | ((X | A) & ~MaskC) -> (X & ~MaskC) | A
if (match(B, m_c_Or(m_Specific(A), m_Value(X))))
- return BinaryOperator::CreateOr(Builder.CreateAnd(X, C2), A);
+ return BinaryOperator::CreateOr(Builder.CreateAnd(X, *C1), A);
- // ((X^B)&C1)|(B&C2) -> (X&C1) ^ B iff C1 == ~C2
+ // ((X ^ B) & MaskC) | (B & ~MaskC) -> (X & MaskC) ^ B
if (match(A, m_c_Xor(m_Value(X), m_Specific(B))))
- return BinaryOperator::CreateXor(Builder.CreateAnd(X, C1), B);
- // (A&C2)|((X^A)&C1) -> (X&C2) ^ A iff C1 == ~C2
+ return BinaryOperator::CreateXor(Builder.CreateAnd(X, *C0), B);
+ // (A & MaskC) | ((X ^ A) & ~MaskC) -> (X & ~MaskC) ^ A
if (match(B, m_c_Xor(m_Specific(A), m_Value(X))))
- return BinaryOperator::CreateXor(Builder.CreateAnd(X, C2), A);
+ return BinaryOperator::CreateXor(Builder.CreateAnd(X, *C1), A);
+ }
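+      // Editor's worked example for the masks above (i8, C0 = 0x0F,
+      // C1 = 0xF0): ((X | B) & 0x0F) | (B & 0xF0) --> (X & 0x0F) | B,
+      // since the bits of B outside the mask are re-supplied by the bare B.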
+
+ if ((*C0 & *C1).isZero()) {
+ // ((X | B) & C0) | (B & C1) --> (X | B) & (C0 | C1)
+ // iff (C0 & C1) == 0 and (X & ~C0) == 0
+ if (match(A, m_c_Or(m_Value(X), m_Specific(B))) &&
+ MaskedValueIsZero(X, ~*C0, 0, &I)) {
+ Constant *C01 = ConstantInt::get(I.getType(), *C0 | *C1);
+ return BinaryOperator::CreateAnd(A, C01);
+ }
+ // (A & C0) | ((X | A) & C1) --> (X | A) & (C0 | C1)
+ // iff (C0 & C1) == 0 and (X & ~C1) == 0
+ if (match(B, m_c_Or(m_Value(X), m_Specific(A))) &&
+ MaskedValueIsZero(X, ~*C1, 0, &I)) {
+ Constant *C01 = ConstantInt::get(I.getType(), *C0 | *C1);
+ return BinaryOperator::CreateAnd(B, C01);
+ }
+ // ((X | C2) & C0) | ((X | C3) & C1) --> (X | C2 | C3) & (C0 | C1)
+ // iff (C0 & C1) == 0 and (C2 & ~C0) == 0 and (C3 & ~C1) == 0.
+ const APInt *C2, *C3;
+ if (match(A, m_Or(m_Value(X), m_APInt(C2))) &&
+ match(B, m_Or(m_Specific(X), m_APInt(C3))) &&
+ (*C2 & ~*C0).isZero() && (*C3 & ~*C1).isZero()) {
+ Value *Or = Builder.CreateOr(X, *C2 | *C3, "bitfield");
+ Constant *C01 = ConstantInt::get(I.getType(), *C0 | *C1);
+ return BinaryOperator::CreateAnd(Or, C01);
+ }
}
}
@@ -2801,6 +2709,8 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
// A | ( A ^ B) -> A | B
// A | (~A ^ B) -> A | ~B
// (A & B) | (A ^ B)
+ // ~A | (A ^ B) -> ~(A & B)
+ // The swap above should always make Op0 the 'not' for the last case.
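+  // (Editor's note on the last fold: ~A | (A ^ B) is false only when A and B
+  // are both true, which is exactly ~(A & B).)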
if (match(Op1, m_Xor(m_Value(A), m_Value(B)))) {
if (Op0 == A || Op0 == B)
return BinaryOperator::CreateOr(A, B);
@@ -2809,6 +2719,10 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
match(Op0, m_And(m_Specific(B), m_Specific(A))))
return BinaryOperator::CreateOr(A, B);
+ if ((Op0->hasOneUse() || Op1->hasOneUse()) &&
+ (match(Op0, m_Not(m_Specific(A))) || match(Op0, m_Not(m_Specific(B)))))
+ return BinaryOperator::CreateNot(Builder.CreateAnd(A, B));
+
if (Op1->hasOneUse() && match(A, m_Not(m_Specific(Op0)))) {
Value *Not = Builder.CreateNot(B, B->getName() + ".not");
return BinaryOperator::CreateOr(Not, Op0);
@@ -3275,71 +3189,45 @@ bool InstCombinerImpl::sinkNotIntoOtherHandOfAndOrOr(BinaryOperator &I) {
return true;
}
-// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
-// here. We should standardize that construct where it is needed or choose some
-// other way to ensure that commutated variants of patterns are not missed.
-Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
- if (Value *V = SimplifyXorInst(I.getOperand(0), I.getOperand(1),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (SimplifyAssociativeOrCommutative(I))
- return &I;
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Instruction *NewXor = foldXorToXor(I, Builder))
- return NewXor;
-
- // (A&B)^(A&C) -> A&(B^C) etc
- if (Value *V = SimplifyUsingDistributiveLaws(I))
- return replaceInstUsesWith(I, V);
-
- // See if we can simplify any instructions used by the instruction whose sole
- // purpose is to compute bits we don't care about.
- if (SimplifyDemandedInstructionBits(I))
- return &I;
-
- if (Value *V = SimplifyBSwap(I, Builder))
- return replaceInstUsesWith(I, V);
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Type *Ty = I.getType();
-
- // Fold (X & M) ^ (Y & ~M) -> (X & M) | (Y & ~M)
- // This it a special case in haveNoCommonBitsSet, but the computeKnownBits
- // calls in there are unnecessary as SimplifyDemandedInstructionBits should
- // have already taken care of those cases.
- Value *M;
- if (match(&I, m_c_Xor(m_c_And(m_Not(m_Value(M)), m_Value()),
- m_c_And(m_Deferred(M), m_Value()))))
- return BinaryOperator::CreateOr(Op0, Op1);
+Instruction *InstCombinerImpl::foldNot(BinaryOperator &I) {
+ Value *NotOp;
+ if (!match(&I, m_Not(m_Value(NotOp))))
+ return nullptr;
// Apply DeMorgan's Law for 'nand' / 'nor' logic with an inverted operand.
- Value *X, *Y;
-
// We must eliminate the and/or (one-use) for these transforms to not increase
// the instruction count.
+ //
// ~(~X & Y) --> (X | ~Y)
// ~(Y & ~X) --> (X | ~Y)
- if (match(&I, m_Not(m_OneUse(m_c_And(m_Not(m_Value(X)), m_Value(Y)))))) {
+ //
+ // Note: The logical matches do not check for the commuted patterns because
+ // those are handled via SimplifySelectsFeedingBinaryOp().
+ Type *Ty = I.getType();
+ Value *X, *Y;
+ if (match(NotOp, m_OneUse(m_c_And(m_Not(m_Value(X)), m_Value(Y))))) {
Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
return BinaryOperator::CreateOr(X, NotY);
}
+ if (match(NotOp, m_OneUse(m_LogicalAnd(m_Not(m_Value(X)), m_Value(Y))))) {
+ Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
+ return SelectInst::Create(X, ConstantInt::getTrue(Ty), NotY);
+ }
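+  // Editor's illustrative IR for the logical form above (i1 scalars assumed):
+  //   %na = xor i1 %x, true
+  //   %la = select i1 %na, i1 %y, i1 false    ; logical and: !x && y
+  //   %r  = xor i1 %la, true
+  // -->
+  //   %ny = xor i1 %y, true
+  //   %r  = select i1 %x, i1 true, i1 %ny     ; logical or:  x || !y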
+
// ~(~X | Y) --> (X & ~Y)
// ~(Y | ~X) --> (X & ~Y)
- if (match(&I, m_Not(m_OneUse(m_c_Or(m_Not(m_Value(X)), m_Value(Y)))))) {
+ if (match(NotOp, m_OneUse(m_c_Or(m_Not(m_Value(X)), m_Value(Y))))) {
Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
return BinaryOperator::CreateAnd(X, NotY);
}
-
- if (Instruction *Xor = visitMaskedMerge(I, Builder))
- return Xor;
+ if (match(NotOp, m_OneUse(m_LogicalOr(m_Not(m_Value(X)), m_Value(Y))))) {
+ Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
+ return SelectInst::Create(X, NotY, ConstantInt::getFalse(Ty));
+ }
// Is this a 'not' (~) fed by a binary operator?
BinaryOperator *NotVal;
- if (match(&I, m_Not(m_BinOp(NotVal)))) {
+ if (match(NotOp, m_BinOp(NotVal))) {
if (NotVal->getOpcode() == Instruction::And ||
NotVal->getOpcode() == Instruction::Or) {
// Apply DeMorgan's Law when inverts are free:
@@ -3411,9 +3299,164 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
NotVal);
}
- // Use DeMorgan and reassociation to eliminate a 'not' op.
+ // not (cmp A, B) = !cmp A, B
+ CmpInst::Predicate Pred;
+ if (match(NotOp, m_OneUse(m_Cmp(Pred, m_Value(), m_Value())))) {
+ cast<CmpInst>(NotOp)->setPredicate(CmpInst::getInversePredicate(Pred));
+ return replaceInstUsesWith(I, NotOp);
+ }
+
+ // Eliminate a bitwise 'not' op of 'not' min/max by inverting the min/max:
+ // ~min(~X, ~Y) --> max(X, Y)
+ // ~max(~X, Y) --> min(X, ~Y)
+ auto *II = dyn_cast<IntrinsicInst>(NotOp);
+ if (II && II->hasOneUse()) {
+ if (match(NotOp, m_MaxOrMin(m_Value(X), m_Value(Y))) &&
+ isFreeToInvert(X, X->hasOneUse()) &&
+ isFreeToInvert(Y, Y->hasOneUse())) {
+ Intrinsic::ID InvID = getInverseMinMaxIntrinsic(II->getIntrinsicID());
+ Value *NotX = Builder.CreateNot(X);
+ Value *NotY = Builder.CreateNot(Y);
+ Value *InvMaxMin = Builder.CreateBinaryIntrinsic(InvID, NotX, NotY);
+ return replaceInstUsesWith(I, InvMaxMin);
+ }
+ if (match(NotOp, m_c_MaxOrMin(m_Not(m_Value(X)), m_Value(Y)))) {
+ Intrinsic::ID InvID = getInverseMinMaxIntrinsic(II->getIntrinsicID());
+ Value *NotY = Builder.CreateNot(Y);
+ Value *InvMaxMin = Builder.CreateBinaryIntrinsic(InvID, X, NotY);
+ return replaceInstUsesWith(I, InvMaxMin);
+ }
+ }
+
+ // TODO: Remove folds if we canonicalize to intrinsics (see above).
+ // Eliminate a bitwise 'not' op of 'not' min/max by inverting the min/max:
+ //
+ // %notx = xor i32 %x, -1
+ // %cmp1 = icmp sgt i32 %notx, %y
+ // %smax = select i1 %cmp1, i32 %notx, i32 %y
+ // %res = xor i32 %smax, -1
+ // =>
+ // %noty = xor i32 %y, -1
+ // %cmp2 = icmp slt %x, %noty
+ // %res = select i1 %cmp2, i32 %x, i32 %noty
+ //
+ // Same is applicable for smin/umax/umin.
+ if (NotOp->hasOneUse()) {
+ Value *LHS, *RHS;
+ SelectPatternFlavor SPF = matchSelectPattern(NotOp, LHS, RHS).Flavor;
+ if (SelectPatternResult::isMinOrMax(SPF)) {
+ // It's possible we get here before the not has been simplified, so make
+ // sure the input to the not isn't freely invertible.
+ if (match(LHS, m_Not(m_Value(X))) && !isFreeToInvert(X, X->hasOneUse())) {
+ Value *NotY = Builder.CreateNot(RHS);
+ return SelectInst::Create(
+ Builder.CreateICmp(getInverseMinMaxPred(SPF), X, NotY), X, NotY);
+ }
+
+ // It's possible we get here before the not has been simplified, so make
+ // sure the input to the not isn't freely invertible.
+ if (match(RHS, m_Not(m_Value(Y))) && !isFreeToInvert(Y, Y->hasOneUse())) {
+ Value *NotX = Builder.CreateNot(LHS);
+ return SelectInst::Create(
+ Builder.CreateICmp(getInverseMinMaxPred(SPF), NotX, Y), NotX, Y);
+ }
+
+ // If both sides are freely invertible, then we can get rid of the xor
+ // completely.
+ if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) &&
+ isFreeToInvert(RHS, !RHS->hasNUsesOrMore(3))) {
+ Value *NotLHS = Builder.CreateNot(LHS);
+ Value *NotRHS = Builder.CreateNot(RHS);
+ return SelectInst::Create(
+ Builder.CreateICmp(getInverseMinMaxPred(SPF), NotLHS, NotRHS),
+ NotLHS, NotRHS);
+ }
+ }
+
+ // Pull 'not' into operands of select if both operands are one-use compares
+ // or one is one-use compare and the other one is a constant.
+ // Inverting the predicates eliminates the 'not' operation.
+ // Example:
+  //   not (select ?, (cmp TPred, ?, ?), (cmp FPred, ?, ?)) -->
+  //   select ?, (cmp InvTPred, ?, ?), (cmp InvFPred, ?, ?)
+  //   not (select ?, (cmp TPred, ?, ?), true) -->
+  //   select ?, (cmp InvTPred, ?, ?), false
+ if (auto *Sel = dyn_cast<SelectInst>(NotOp)) {
+ Value *TV = Sel->getTrueValue();
+ Value *FV = Sel->getFalseValue();
+ auto *CmpT = dyn_cast<CmpInst>(TV);
+ auto *CmpF = dyn_cast<CmpInst>(FV);
+ bool InvertibleT = (CmpT && CmpT->hasOneUse()) || isa<Constant>(TV);
+ bool InvertibleF = (CmpF && CmpF->hasOneUse()) || isa<Constant>(FV);
+ if (InvertibleT && InvertibleF) {
+ if (CmpT)
+ CmpT->setPredicate(CmpT->getInversePredicate());
+ else
+ Sel->setTrueValue(ConstantExpr::getNot(cast<Constant>(TV)));
+ if (CmpF)
+ CmpF->setPredicate(CmpF->getInversePredicate());
+ else
+ Sel->setFalseValue(ConstantExpr::getNot(cast<Constant>(FV)));
+ return replaceInstUsesWith(I, Sel);
+ }
+ }
+ }
+
+ if (Instruction *NewXor = sinkNotIntoXor(I, Builder))
+ return NewXor;
+
+ return nullptr;
+}
+
+// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
+// here. We should standardize that construct where it is needed or choose some
+// other way to ensure that commutated variants of patterns are not missed.
+Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
+ if (Value *V = SimplifyXorInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Instruction *NewXor = foldXorToXor(I, Builder))
+ return NewXor;
+
+ // (A&B)^(A&C) -> A&(B^C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+
+ if (Value *V = SimplifyBSwap(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *R = foldNot(I))
+ return R;
+
+ // Fold (X & M) ^ (Y & ~M) -> (X & M) | (Y & ~M)
+  // This is a special case in haveNoCommonBitsSet, but the computeKnownBits
+ // calls in there are unnecessary as SimplifyDemandedInstructionBits should
+ // have already taken care of those cases.
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Value *M;
+ if (match(&I, m_c_Xor(m_c_And(m_Not(m_Value(M)), m_Value()),
+ m_c_And(m_Deferred(M), m_Value()))))
+ return BinaryOperator::CreateOr(Op0, Op1);
+
+ if (Instruction *Xor = visitMaskedMerge(I, Builder))
+ return Xor;
+
+ Value *X, *Y;
Constant *C1;
if (match(Op1, m_Constant(C1))) {
+ // Use DeMorgan and reassociation to eliminate a 'not' op.
Constant *C2;
if (match(Op0, m_OneUse(m_Or(m_Not(m_Value(X)), m_Constant(C2))))) {
// (~X | C2) ^ C1 --> ((X & ~C2) ^ -1) ^ C1 --> (X & ~C2) ^ ~C1
@@ -3425,15 +3468,24 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
Value *Or = Builder.CreateOr(X, ConstantExpr::getNot(C2));
return BinaryOperator::CreateXor(Or, ConstantExpr::getNot(C1));
}
- }
- // not (cmp A, B) = !cmp A, B
- CmpInst::Predicate Pred;
- if (match(&I, m_Not(m_OneUse(m_Cmp(Pred, m_Value(), m_Value()))))) {
- cast<CmpInst>(Op0)->setPredicate(CmpInst::getInversePredicate(Pred));
- return replaceInstUsesWith(I, Op0);
+ // Convert xor ([trunc] (ashr X, BW-1)), C =>
+ // select(X >s -1, C, ~C)
+    // The ashr produces an all-zeros or all-ones value, which optionally
+    // inverts the constant depending on whether the input is negative.
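+    // Editor's illustrative IR (i32, C1 == 7):
+    //   %s = ashr i32 %x, 31
+    //   %r = xor i32 %s, 7
+    // -->
+    //   %cmp = icmp sgt i32 %x, -1
+    //   %r   = select i1 %cmp, i32 7, i32 -8    ; -8 == ~7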
+ const APInt *CA;
+ if (match(Op0, m_OneUse(m_TruncOrSelf(
+ m_AShr(m_Value(X), m_APIntAllowUndef(CA))))) &&
+ *CA == X->getType()->getScalarSizeInBits() - 1 &&
+ !match(C1, m_AllOnes())) {
+ assert(!C1->isZeroValue() && "Unexpected xor with 0");
+ Value *ICmp =
+ Builder.CreateICmpSGT(X, Constant::getAllOnesValue(X->getType()));
+ return SelectInst::Create(ICmp, Op1, Builder.CreateNot(Op1));
+ }
}
+ Type *Ty = I.getType();
{
const APInt *RHSC;
if (match(Op1, m_APInt(RHSC))) {
@@ -3456,13 +3508,13 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
// canonicalize to a 'not' before the shift to help SCEV and codegen:
// (X << C) ^ RHSC --> ~X << C
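      // e.g. for i8 with C == 3 (editor's illustration):
      //   (X << 3) ^ -8 --> (~X) << 3, since -8 == 0xFF << 3.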
if (match(Op0, m_OneUse(m_Shl(m_Value(X), m_APInt(C)))) &&
- *RHSC == APInt::getAllOnesValue(Ty->getScalarSizeInBits()).shl(*C)) {
+ *RHSC == APInt::getAllOnes(Ty->getScalarSizeInBits()).shl(*C)) {
Value *NotX = Builder.CreateNot(X);
return BinaryOperator::CreateShl(NotX, ConstantInt::get(Ty, *C));
}
// (X >>u C) ^ RHSC --> ~X >>u C
if (match(Op0, m_OneUse(m_LShr(m_Value(X), m_APInt(C)))) &&
- *RHSC == APInt::getAllOnesValue(Ty->getScalarSizeInBits()).lshr(*C)) {
+ *RHSC == APInt::getAllOnes(Ty->getScalarSizeInBits()).lshr(*C)) {
Value *NotX = Builder.CreateNot(X);
return BinaryOperator::CreateLShr(NotX, ConstantInt::get(Ty, *C));
}
@@ -3572,101 +3624,6 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
if (Instruction *CastedXor = foldCastedBitwiseLogic(I))
return CastedXor;
- // Eliminate a bitwise 'not' op of 'not' min/max by inverting the min/max:
- // ~min(~X, ~Y) --> max(X, Y)
- // ~max(~X, Y) --> min(X, ~Y)
- auto *II = dyn_cast<IntrinsicInst>(Op0);
- if (II && match(Op1, m_AllOnes())) {
- if (match(Op0, m_MaxOrMin(m_Not(m_Value(X)), m_Not(m_Value(Y))))) {
- Intrinsic::ID InvID = getInverseMinMaxIntrinsic(II->getIntrinsicID());
- Value *InvMaxMin = Builder.CreateBinaryIntrinsic(InvID, X, Y);
- return replaceInstUsesWith(I, InvMaxMin);
- }
- if (match(Op0, m_OneUse(m_c_MaxOrMin(m_Not(m_Value(X)), m_Value(Y))))) {
- Intrinsic::ID InvID = getInverseMinMaxIntrinsic(II->getIntrinsicID());
- Value *NotY = Builder.CreateNot(Y);
- Value *InvMaxMin = Builder.CreateBinaryIntrinsic(InvID, X, NotY);
- return replaceInstUsesWith(I, InvMaxMin);
- }
- }
-
- // TODO: Remove folds if we canonicalize to intrinsics (see above).
- // Eliminate a bitwise 'not' op of 'not' min/max by inverting the min/max:
- //
- // %notx = xor i32 %x, -1
- // %cmp1 = icmp sgt i32 %notx, %y
- // %smax = select i1 %cmp1, i32 %notx, i32 %y
- // %res = xor i32 %smax, -1
- // =>
- // %noty = xor i32 %y, -1
- // %cmp2 = icmp slt %x, %noty
- // %res = select i1 %cmp2, i32 %x, i32 %noty
- //
- // Same is applicable for smin/umax/umin.
- if (match(Op1, m_AllOnes()) && Op0->hasOneUse()) {
- Value *LHS, *RHS;
- SelectPatternFlavor SPF = matchSelectPattern(Op0, LHS, RHS).Flavor;
- if (SelectPatternResult::isMinOrMax(SPF)) {
- // It's possible we get here before the not has been simplified, so make
- // sure the input to the not isn't freely invertible.
- if (match(LHS, m_Not(m_Value(X))) && !isFreeToInvert(X, X->hasOneUse())) {
- Value *NotY = Builder.CreateNot(RHS);
- return SelectInst::Create(
- Builder.CreateICmp(getInverseMinMaxPred(SPF), X, NotY), X, NotY);
- }
-
- // It's possible we get here before the not has been simplified, so make
- // sure the input to the not isn't freely invertible.
- if (match(RHS, m_Not(m_Value(Y))) && !isFreeToInvert(Y, Y->hasOneUse())) {
- Value *NotX = Builder.CreateNot(LHS);
- return SelectInst::Create(
- Builder.CreateICmp(getInverseMinMaxPred(SPF), NotX, Y), NotX, Y);
- }
-
- // If both sides are freely invertible, then we can get rid of the xor
- // completely.
- if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) &&
- isFreeToInvert(RHS, !RHS->hasNUsesOrMore(3))) {
- Value *NotLHS = Builder.CreateNot(LHS);
- Value *NotRHS = Builder.CreateNot(RHS);
- return SelectInst::Create(
- Builder.CreateICmp(getInverseMinMaxPred(SPF), NotLHS, NotRHS),
- NotLHS, NotRHS);
- }
- }
-
- // Pull 'not' into operands of select if both operands are one-use compares
- // or one is one-use compare and the other one is a constant.
- // Inverting the predicates eliminates the 'not' operation.
- // Example:
- // not (select ?, (cmp TPred, ?, ?), (cmp FPred, ?, ?) -->
- // select ?, (cmp InvTPred, ?, ?), (cmp InvFPred, ?, ?)
- // not (select ?, (cmp TPred, ?, ?), true -->
- // select ?, (cmp InvTPred, ?, ?), false
- if (auto *Sel = dyn_cast<SelectInst>(Op0)) {
- Value *TV = Sel->getTrueValue();
- Value *FV = Sel->getFalseValue();
- auto *CmpT = dyn_cast<CmpInst>(TV);
- auto *CmpF = dyn_cast<CmpInst>(FV);
- bool InvertibleT = (CmpT && CmpT->hasOneUse()) || isa<Constant>(TV);
- bool InvertibleF = (CmpF && CmpF->hasOneUse()) || isa<Constant>(FV);
- if (InvertibleT && InvertibleF) {
- if (CmpT)
- CmpT->setPredicate(CmpT->getInversePredicate());
- else
- Sel->setTrueValue(ConstantExpr::getNot(cast<Constant>(TV)));
- if (CmpF)
- CmpF->setPredicate(CmpF->getInversePredicate());
- else
- Sel->setFalseValue(ConstantExpr::getNot(cast<Constant>(FV)));
- return replaceInstUsesWith(I, Sel);
- }
- }
- }
-
- if (Instruction *NewXor = sinkNotIntoXor(I, Builder))
- return NewXor;
-
if (Instruction *Abs = canonicalizeAbs(I, Builder))
return Abs;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 726bb545be12..bfa7bfa2290a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -67,7 +67,6 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -79,11 +78,12 @@
#include <utility>
#include <vector>
+#define DEBUG_TYPE "instcombine"
+#include "llvm/Transforms/Utils/InstructionWorklist.h"
+
using namespace llvm;
using namespace PatternMatch;
-#define DEBUG_TYPE "instcombine"
-
STATISTIC(NumSimplified, "Number of library calls simplified");
static cl::opt<unsigned> GuardWideningWindow(
@@ -513,7 +513,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {
// If the input to cttz/ctlz is known to be non-zero,
// then change the 'ZeroIsUndef' parameter to 'true'
// because we know the zero behavior can't affect the result.
- if (!Known.One.isNullValue() ||
+ if (!Known.One.isZero() ||
isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II,
&IC.getDominatorTree())) {
if (!match(II.getArgOperand(1), m_One()))
@@ -656,8 +656,8 @@ static Value *simplifyNeonTbl1(const IntrinsicInst &II,
// comparison to the first NumOperands.
static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
unsigned NumOperands) {
- assert(I.getNumArgOperands() >= NumOperands && "Not enough operands");
- assert(E.getNumArgOperands() >= NumOperands && "Not enough operands");
+ assert(I.arg_size() >= NumOperands && "Not enough operands");
+ assert(E.arg_size() >= NumOperands && "Not enough operands");
for (unsigned i = 0; i < NumOperands; i++)
if (I.getArgOperand(i) != E.getArgOperand(i))
return false;
@@ -682,11 +682,11 @@ removeTriviallyEmptyRange(IntrinsicInst &EndI, InstCombinerImpl &IC,
BasicBlock::reverse_iterator BI(EndI), BE(EndI.getParent()->rend());
for (; BI != BE; ++BI) {
if (auto *I = dyn_cast<IntrinsicInst>(&*BI)) {
- if (isa<DbgInfoIntrinsic>(I) ||
+ if (I->isDebugOrPseudoInst() ||
I->getIntrinsicID() == EndI.getIntrinsicID())
continue;
if (IsStart(*I)) {
- if (haveSameOperands(EndI, *I, EndI.getNumArgOperands())) {
+ if (haveSameOperands(EndI, *I, EndI.arg_size())) {
IC.eraseInstFromFunction(*I);
IC.eraseInstFromFunction(EndI);
return true;
@@ -710,7 +710,7 @@ Instruction *InstCombinerImpl::visitVAEndInst(VAEndInst &I) {
}
static CallInst *canonicalizeConstantArg0ToArg1(CallInst &Call) {
- assert(Call.getNumArgOperands() > 1 && "Need at least 2 args to swap");
+ assert(Call.arg_size() > 1 && "Need at least 2 args to swap");
Value *Arg0 = Call.getArgOperand(0), *Arg1 = Call.getArgOperand(1);
if (isa<Constant>(Arg0) && !isa<Constant>(Arg1)) {
Call.setArgOperand(0, Arg1);
@@ -754,6 +754,45 @@ static Optional<bool> getKnownSign(Value *Op, Instruction *CxtI,
ICmpInst::ICMP_SLT, Op, Constant::getNullValue(Op->getType()), CxtI, DL);
}
+/// Try to canonicalize min/max(X + C0, C1) as min/max(X, C1 - C0) + C0. This
+/// can trigger other combines.
+static Instruction *moveAddAfterMinMax(IntrinsicInst *II,
+ InstCombiner::BuilderTy &Builder) {
+ Intrinsic::ID MinMaxID = II->getIntrinsicID();
+ assert((MinMaxID == Intrinsic::smax || MinMaxID == Intrinsic::smin ||
+ MinMaxID == Intrinsic::umax || MinMaxID == Intrinsic::umin) &&
+ "Expected a min or max intrinsic");
+
+ // TODO: Match vectors with undef elements, but undef may not propagate.
+ Value *Op0 = II->getArgOperand(0), *Op1 = II->getArgOperand(1);
+ Value *X;
+ const APInt *C0, *C1;
+ if (!match(Op0, m_OneUse(m_Add(m_Value(X), m_APInt(C0)))) ||
+ !match(Op1, m_APInt(C1)))
+ return nullptr;
+
+ // Check for necessary no-wrap and overflow constraints.
+ bool IsSigned = MinMaxID == Intrinsic::smax || MinMaxID == Intrinsic::smin;
+ auto *Add = cast<BinaryOperator>(Op0);
+ if ((IsSigned && !Add->hasNoSignedWrap()) ||
+ (!IsSigned && !Add->hasNoUnsignedWrap()))
+ return nullptr;
+
+ // If the constant difference overflows, then instsimplify should reduce the
+ // min/max to the add or C1.
+ bool Overflow;
+ APInt CDiff =
+ IsSigned ? C1->ssub_ov(*C0, Overflow) : C1->usub_ov(*C0, Overflow);
+ assert(!Overflow && "Expected simplify of min/max");
+
+ // min/max (add X, C0), C1 --> add (min/max X, C1 - C0), C0
+ // Note: the "mismatched" no-overflow setting does not propagate.
+ Constant *NewMinMaxC = ConstantInt::get(II->getType(), CDiff);
+ Value *NewMinMax = Builder.CreateBinaryIntrinsic(MinMaxID, X, NewMinMaxC);
+ return IsSigned ? BinaryOperator::CreateNSWAdd(NewMinMax, Add->getOperand(1))
+ : BinaryOperator::CreateNUWAdd(NewMinMax, Add->getOperand(1));
+}
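+// Editor's worked example of the fold above:
+//   umin (add nuw %x, 5), 20  -->  add nuw (umin %x, 15), 5
+// since 20 - 5 = 15 does not underflow and the nuw add cannot wrap.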
+
/// If we have a clamp pattern like max (min X, 42), 41 -- where the output
/// can only be one of two possible constant values -- turn that into a select
/// of constants.
@@ -795,6 +834,63 @@ static Instruction *foldClampRangeOfTwo(IntrinsicInst *II,
return SelectInst::Create(Cmp, ConstantInt::get(II->getType(), *C0), I1);
}
+/// Reduce a sequence of min/max intrinsics with a common operand.
+static Instruction *factorizeMinMaxTree(IntrinsicInst *II) {
+ // Match 3 of the same min/max ops. Example: umin(umin(), umin()).
+ auto *LHS = dyn_cast<IntrinsicInst>(II->getArgOperand(0));
+ auto *RHS = dyn_cast<IntrinsicInst>(II->getArgOperand(1));
+ Intrinsic::ID MinMaxID = II->getIntrinsicID();
+ if (!LHS || !RHS || LHS->getIntrinsicID() != MinMaxID ||
+ RHS->getIntrinsicID() != MinMaxID ||
+ (!LHS->hasOneUse() && !RHS->hasOneUse()))
+ return nullptr;
+
+ Value *A = LHS->getArgOperand(0);
+ Value *B = LHS->getArgOperand(1);
+ Value *C = RHS->getArgOperand(0);
+ Value *D = RHS->getArgOperand(1);
+
+ // Look for a common operand.
+ Value *MinMaxOp = nullptr;
+ Value *ThirdOp = nullptr;
+ if (LHS->hasOneUse()) {
+ // If the LHS is only used in this chain and the RHS is used outside of it,
+ // reuse the RHS min/max because that will eliminate the LHS.
+ if (D == A || C == A) {
+ // min(min(a, b), min(c, a)) --> min(min(c, a), b)
+ // min(min(a, b), min(a, d)) --> min(min(a, d), b)
+ MinMaxOp = RHS;
+ ThirdOp = B;
+ } else if (D == B || C == B) {
+ // min(min(a, b), min(c, b)) --> min(min(c, b), a)
+ // min(min(a, b), min(b, d)) --> min(min(b, d), a)
+ MinMaxOp = RHS;
+ ThirdOp = A;
+ }
+ } else {
+ assert(RHS->hasOneUse() && "Expected one-use operand");
+ // Reuse the LHS. This will eliminate the RHS.
+ if (D == A || D == B) {
+ // min(min(a, b), min(c, a)) --> min(min(a, b), c)
+ // min(min(a, b), min(c, b)) --> min(min(a, b), c)
+ MinMaxOp = LHS;
+ ThirdOp = C;
+ } else if (C == A || C == B) {
+ // min(min(a, b), min(b, d)) --> min(min(a, b), d)
+      // min(min(a, b), min(a, d)) --> min(min(a, b), d)
+ MinMaxOp = LHS;
+ ThirdOp = D;
+ }
+ }
+
+ if (!MinMaxOp || !ThirdOp)
+ return nullptr;
+
+ Module *Mod = II->getModule();
+ Function *MinMax = Intrinsic::getDeclaration(Mod, MinMaxID, II->getType());
+ return CallInst::Create(MinMax, { MinMaxOp, ThirdOp });
+}
+
/// CallInst simplification. This mostly only handles folding of intrinsic
/// instructions. For normal calls, it allows visitCallBase to do the heavy
/// lifting.
@@ -896,7 +992,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
if (auto *IIFVTy = dyn_cast<FixedVectorType>(II->getType())) {
auto VWidth = IIFVTy->getNumElements();
APInt UndefElts(VWidth, 0);
- APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ APInt AllOnesEltMask(APInt::getAllOnes(VWidth));
if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) {
if (V != II)
return replaceInstUsesWith(*II, V);
@@ -1007,21 +1103,45 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
}
}
- if (match(I0, m_Not(m_Value(X)))) {
- // max (not X), (not Y) --> not (min X, Y)
- Intrinsic::ID InvID = getInverseMinMaxIntrinsic(IID);
- if (match(I1, m_Not(m_Value(Y))) &&
+ if (IID == Intrinsic::smax || IID == Intrinsic::smin) {
+ // smax (neg nsw X), (neg nsw Y) --> neg nsw (smin X, Y)
+ // smin (neg nsw X), (neg nsw Y) --> neg nsw (smax X, Y)
+ // TODO: Canonicalize neg after min/max if I1 is constant.
+ if (match(I0, m_NSWNeg(m_Value(X))) && match(I1, m_NSWNeg(m_Value(Y))) &&
(I0->hasOneUse() || I1->hasOneUse())) {
+ Intrinsic::ID InvID = getInverseMinMaxIntrinsic(IID);
Value *InvMaxMin = Builder.CreateBinaryIntrinsic(InvID, X, Y);
- return BinaryOperator::CreateNot(InvMaxMin);
+ return BinaryOperator::CreateNSWNeg(InvMaxMin);
}
- // max (not X), C --> not(min X, ~C)
- if (match(I1, m_Constant(C)) && I0->hasOneUse()) {
- Constant *NotC = ConstantExpr::getNot(C);
- Value *InvMaxMin = Builder.CreateBinaryIntrinsic(InvID, X, NotC);
+ }
+
+ // If we can eliminate ~A and Y is free to invert:
+ // max ~A, Y --> ~(min A, ~Y)
+ //
+ // Examples:
+ // max ~A, ~Y --> ~(min A, Y)
+ // max ~A, C --> ~(min A, ~C)
+  // max ~A, (max ~Y, ~Z) --> ~(min A, (min Y, Z))
+ auto moveNotAfterMinMax = [&](Value *X, Value *Y) -> Instruction * {
+ Value *A;
+ if (match(X, m_OneUse(m_Not(m_Value(A)))) &&
+ !isFreeToInvert(A, A->hasOneUse()) &&
+ isFreeToInvert(Y, Y->hasOneUse())) {
+ Value *NotY = Builder.CreateNot(Y);
+ Intrinsic::ID InvID = getInverseMinMaxIntrinsic(IID);
+ Value *InvMaxMin = Builder.CreateBinaryIntrinsic(InvID, A, NotY);
return BinaryOperator::CreateNot(InvMaxMin);
}
- }
+ return nullptr;
+ };
+
+ if (Instruction *I = moveNotAfterMinMax(I0, I1))
+ return I;
+ if (Instruction *I = moveNotAfterMinMax(I1, I0))
+ return I;
+
+ if (Instruction *I = moveAddAfterMinMax(II, Builder))
+ return I;
// smax(X, -X) --> abs(X)
// smin(X, -X) --> -abs(X)
@@ -1051,11 +1171,17 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
if (Instruction *Sel = foldClampRangeOfTwo(II, Builder))
return Sel;
+ if (Instruction *SAdd = matchSAddSubSat(*II))
+ return SAdd;
+
if (match(I1, m_ImmConstant()))
if (auto *Sel = dyn_cast<SelectInst>(I0))
if (Instruction *R = FoldOpIntoSelect(*II, Sel))
return R;
+ if (Instruction *NewMinMax = factorizeMinMaxTree(II))
+ return NewMinMax;
+
break;
}
case Intrinsic::bswap: {
@@ -1098,6 +1224,19 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
if (Power->equalsInt(2))
return BinaryOperator::CreateFMulFMF(II->getArgOperand(0),
II->getArgOperand(0), II);
+
+ if (!Power->getValue()[0]) {
+ Value *X;
+ // If power is even:
+ // powi(-x, p) -> powi(x, p)
+ // powi(fabs(x), p) -> powi(x, p)
+ // powi(copysign(x, y), p) -> powi(x, p)
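+      // Worked example (editor's note): powi(-2.0, 4) == powi(2.0, 4) == 16.0,
+      // so the sign of the base is irrelevant whenever the exponent's low bit
+      // is clear.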
+ if (match(II->getArgOperand(0), m_FNeg(m_Value(X))) ||
+ match(II->getArgOperand(0), m_FAbs(m_Value(X))) ||
+ match(II->getArgOperand(0),
+ m_Intrinsic<Intrinsic::copysign>(m_Value(X), m_Value())))
+ return replaceOperand(*II, 0, X);
+ }
}
break;
@@ -1637,14 +1776,66 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
break;
}
case Intrinsic::stackrestore: {
- // If the save is right next to the restore, remove the restore. This can
- // happen when variable allocas are DCE'd.
+ enum class ClassifyResult {
+ None,
+ Alloca,
+ StackRestore,
+ CallWithSideEffects,
+ };
+ auto Classify = [](const Instruction *I) {
+ if (isa<AllocaInst>(I))
+ return ClassifyResult::Alloca;
+
+ if (auto *CI = dyn_cast<CallInst>(I)) {
+ if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
+ if (II->getIntrinsicID() == Intrinsic::stackrestore)
+ return ClassifyResult::StackRestore;
+
+ if (II->mayHaveSideEffects())
+ return ClassifyResult::CallWithSideEffects;
+ } else {
+        // Consider all non-intrinsic calls to have side effects
+ return ClassifyResult::CallWithSideEffects;
+ }
+ }
+
+ return ClassifyResult::None;
+ };
+
+ // If the stacksave and the stackrestore are in the same BB, and there is
+ // no intervening call, alloca, or stackrestore of a different stacksave,
+ // remove the restore. This can happen when variable allocas are DCE'd.
if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
- if (SS->getIntrinsicID() == Intrinsic::stacksave) {
- // Skip over debug info.
- if (SS->getNextNonDebugInstruction() == II) {
- return eraseInstFromFunction(CI);
+ if (SS->getIntrinsicID() == Intrinsic::stacksave &&
+ SS->getParent() == II->getParent()) {
+ BasicBlock::iterator BI(SS);
+ bool CannotRemove = false;
+ for (++BI; &*BI != II; ++BI) {
+ switch (Classify(&*BI)) {
+ case ClassifyResult::None:
+ // So far so good, look at next instructions.
+ break;
+
+ case ClassifyResult::StackRestore:
+ // If we found an intervening stackrestore for a different
+ // stacksave, we can't remove the stackrestore. Otherwise, continue.
+ if (cast<IntrinsicInst>(*BI).getArgOperand(0) != SS)
+ CannotRemove = true;
+ break;
+
+ case ClassifyResult::Alloca:
+ case ClassifyResult::CallWithSideEffects:
+ // If we found an alloca, a non-intrinsic call, or an intrinsic
+ // call with side effects, we can't remove the stackrestore.
+ CannotRemove = true;
+ break;
+ }
+ if (CannotRemove)
+ break;
}
+
+ if (!CannotRemove)
+ return eraseInstFromFunction(CI);
}
}
@@ -1654,29 +1845,25 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
Instruction *TI = II->getParent()->getTerminator();
bool CannotRemove = false;
for (++BI; &*BI != TI; ++BI) {
- if (isa<AllocaInst>(BI)) {
+ switch (Classify(&*BI)) {
+ case ClassifyResult::None:
+ // So far so good, look at next instructions.
+ break;
+
+ case ClassifyResult::StackRestore:
+ // If there is a stackrestore below this one, remove this one.
+ return eraseInstFromFunction(CI);
+
+ case ClassifyResult::Alloca:
+ case ClassifyResult::CallWithSideEffects:
+ // If we found an alloca, a non-intrinsic call, or an intrinsic call
+ // with side effects (such as llvm.stacksave and llvm.read_register),
+ // we can't remove the stack restore.
CannotRemove = true;
break;
}
- if (CallInst *BCI = dyn_cast<CallInst>(BI)) {
- if (auto *II2 = dyn_cast<IntrinsicInst>(BCI)) {
- // If there is a stackrestore below this one, remove this one.
- if (II2->getIntrinsicID() == Intrinsic::stackrestore)
- return eraseInstFromFunction(CI);
-
- // Bail if we cross over an intrinsic with side effects, such as
- // llvm.stacksave, or llvm.read_register.
- if (II2->mayHaveSideEffects()) {
- CannotRemove = true;
- break;
- }
- } else {
- // If we found a non-intrinsic call, we can't remove the stack
- // restore.
- CannotRemove = true;
- break;
- }
- }
+ if (CannotRemove)
+ break;
}
// If the stack restore is in a return, resume, or unwind block and if there
@@ -1963,6 +2150,46 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
}
break;
}
+ case Intrinsic::experimental_vector_reverse: {
+ Value *BO0, *BO1, *X, *Y;
+ Value *Vec = II->getArgOperand(0);
+ if (match(Vec, m_OneUse(m_BinOp(m_Value(BO0), m_Value(BO1))))) {
+ auto *OldBinOp = cast<BinaryOperator>(Vec);
+ if (match(BO0, m_Intrinsic<Intrinsic::experimental_vector_reverse>(
+ m_Value(X)))) {
+ // rev(binop rev(X), rev(Y)) --> binop X, Y
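+        // Editor's illustrative IR for this fold (v4i32 assumed):
+        //   %xr = call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> %x)
+        //   %yr = call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> %y)
+        //   %b  = add <4 x i32> %xr, %yr
+        //   %r  = call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> %b)
+        // --> %r = add <4 x i32> %x, %y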
+ if (match(BO1, m_Intrinsic<Intrinsic::experimental_vector_reverse>(
+ m_Value(Y))))
+ return replaceInstUsesWith(CI,
+ BinaryOperator::CreateWithCopiedFlags(
+ OldBinOp->getOpcode(), X, Y, OldBinOp,
+ OldBinOp->getName(), II));
+ // rev(binop rev(X), BO1Splat) --> binop X, BO1Splat
+ if (isSplatValue(BO1))
+ return replaceInstUsesWith(CI,
+ BinaryOperator::CreateWithCopiedFlags(
+ OldBinOp->getOpcode(), X, BO1,
+ OldBinOp, OldBinOp->getName(), II));
+ }
+ // rev(binop BO0Splat, rev(Y)) --> binop BO0Splat, Y
+ if (match(BO1, m_Intrinsic<Intrinsic::experimental_vector_reverse>(
+ m_Value(Y))) &&
+ isSplatValue(BO0))
+ return replaceInstUsesWith(CI, BinaryOperator::CreateWithCopiedFlags(
+ OldBinOp->getOpcode(), BO0, Y,
+ OldBinOp, OldBinOp->getName(), II));
+ }
+ // rev(unop rev(X)) --> unop X
+ if (match(Vec, m_OneUse(m_UnOp(
+ m_Intrinsic<Intrinsic::experimental_vector_reverse>(
+ m_Value(X)))))) {
+ auto *OldUnOp = cast<UnaryOperator>(Vec);
+ auto *NewUnOp = UnaryOperator::CreateWithCopiedFlags(
+ OldUnOp->getOpcode(), X, OldUnOp, OldUnOp->getName(), II);
+ return replaceInstUsesWith(CI, NewUnOp);
+ }
+ break;
+ }
case Intrinsic::vector_reduce_or:
case Intrinsic::vector_reduce_and: {
// Canonicalize logical or/and reductions:
@@ -1973,21 +2200,26 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
// %val = bitcast <ReduxWidth x i1> to iReduxWidth
// %res = cmp eq iReduxWidth %val, 11111
Value *Arg = II->getArgOperand(0);
- Type *RetTy = II->getType();
- if (RetTy == Builder.getInt1Ty())
- if (auto *FVTy = dyn_cast<FixedVectorType>(Arg->getType())) {
- Value *Res = Builder.CreateBitCast(
- Arg, Builder.getIntNTy(FVTy->getNumElements()));
- if (IID == Intrinsic::vector_reduce_and) {
- Res = Builder.CreateICmpEQ(
- Res, ConstantInt::getAllOnesValue(Res->getType()));
- } else {
- assert(IID == Intrinsic::vector_reduce_or &&
- "Expected or reduction.");
- Res = Builder.CreateIsNotNull(Res);
+ Value *Vect;
+ if (match(Arg, m_ZExtOrSExtOrSelf(m_Value(Vect)))) {
+ if (auto *FTy = dyn_cast<FixedVectorType>(Vect->getType()))
+ if (FTy->getElementType() == Builder.getInt1Ty()) {
+ Value *Res = Builder.CreateBitCast(
+ Vect, Builder.getIntNTy(FTy->getNumElements()));
+ if (IID == Intrinsic::vector_reduce_and) {
+ Res = Builder.CreateICmpEQ(
+ Res, ConstantInt::getAllOnesValue(Res->getType()));
+ } else {
+ assert(IID == Intrinsic::vector_reduce_or &&
+ "Expected or reduction.");
+ Res = Builder.CreateIsNotNull(Res);
+ }
+ if (Arg != Vect)
+ Res = Builder.CreateCast(cast<CastInst>(Arg)->getOpcode(), Res,
+ II->getType());
+ return replaceInstUsesWith(CI, Res);
}
- return replaceInstUsesWith(CI, Res);
- }
+ }
LLVM_FALLTHROUGH;
}
case Intrinsic::vector_reduce_add: {
@@ -2017,12 +2249,117 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
}
LLVM_FALLTHROUGH;
}
- case Intrinsic::vector_reduce_mul:
- case Intrinsic::vector_reduce_xor:
- case Intrinsic::vector_reduce_umax:
+ case Intrinsic::vector_reduce_xor: {
+ if (IID == Intrinsic::vector_reduce_xor) {
+ // Exclusive disjunction reduction over the vector with
+ // (potentially-extended) i1 element type is actually a
+ // (potentially-extended) arithmetic `add` reduction over the original
+ // non-extended value:
+ // vector_reduce_xor(?ext(<n x i1>))
+ // -->
+ // ?ext(vector_reduce_add(<n x i1>))
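+      // Editor's worked example: for <4 x i1> <1, 1, 1, 0> the xor reduction
+      // is 1^1^1^0 == 1, and the i1 add reduction 1+1+1+0 also wraps to 1.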
+ Value *Arg = II->getArgOperand(0);
+ Value *Vect;
+ if (match(Arg, m_ZExtOrSExtOrSelf(m_Value(Vect)))) {
+ if (auto *FTy = dyn_cast<FixedVectorType>(Vect->getType()))
+ if (FTy->getElementType() == Builder.getInt1Ty()) {
+ Value *Res = Builder.CreateAddReduce(Vect);
+ if (Arg != Vect)
+ Res = Builder.CreateCast(cast<CastInst>(Arg)->getOpcode(), Res,
+ II->getType());
+ return replaceInstUsesWith(CI, Res);
+ }
+ }
+ }
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::vector_reduce_mul: {
+ if (IID == Intrinsic::vector_reduce_mul) {
+ // Multiplicative reduction over the vector with (potentially-extended)
+ // i1 element type is actually a (potentially zero-extended)
+ // logical `and` reduction over the original non-extended value:
+ // vector_reduce_mul(?ext(<n x i1>))
+ // -->
+ // zext(vector_reduce_and(<n x i1>))
+ Value *Arg = II->getArgOperand(0);
+ Value *Vect;
+ if (match(Arg, m_ZExtOrSExtOrSelf(m_Value(Vect)))) {
+ if (auto *FTy = dyn_cast<FixedVectorType>(Vect->getType()))
+ if (FTy->getElementType() == Builder.getInt1Ty()) {
+ Value *Res = Builder.CreateAndReduce(Vect);
+ if (Res->getType() != II->getType())
+ Res = Builder.CreateZExt(Res, II->getType());
+ return replaceInstUsesWith(CI, Res);
+ }
+ }
+ }
+ LLVM_FALLTHROUGH;
+ }
case Intrinsic::vector_reduce_umin:
- case Intrinsic::vector_reduce_smax:
+ case Intrinsic::vector_reduce_umax: {
+ if (IID == Intrinsic::vector_reduce_umin ||
+ IID == Intrinsic::vector_reduce_umax) {
+ // UMin/UMax reduction over the vector with (potentially-extended)
+ // i1 element type is actually a (potentially-extended)
+ // logical `and`/`or` reduction over the original non-extended value:
+ // vector_reduce_u{min,max}(?ext(<n x i1>))
+ // -->
+ // ?ext(vector_reduce_{and,or}(<n x i1>))
+ Value *Arg = II->getArgOperand(0);
+ Value *Vect;
+ if (match(Arg, m_ZExtOrSExtOrSelf(m_Value(Vect)))) {
+ if (auto *FTy = dyn_cast<FixedVectorType>(Vect->getType()))
+ if (FTy->getElementType() == Builder.getInt1Ty()) {
+ Value *Res = IID == Intrinsic::vector_reduce_umin
+ ? Builder.CreateAndReduce(Vect)
+ : Builder.CreateOrReduce(Vect);
+ if (Arg != Vect)
+ Res = Builder.CreateCast(cast<CastInst>(Arg)->getOpcode(), Res,
+ II->getType());
+ return replaceInstUsesWith(CI, Res);
+ }
+ }
+ }
+ LLVM_FALLTHROUGH;
+ }
case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_smax: {
+ if (IID == Intrinsic::vector_reduce_smin ||
+ IID == Intrinsic::vector_reduce_smax) {
+ // SMin/SMax reduction over the vector with (potentially-extended)
+ // i1 element type is actually a (potentially-extended)
+ // logical `and`/`or` reduction over the original non-extended value:
+ // vector_reduce_s{min,max}(<n x i1>)
+ // -->
+ // vector_reduce_{or,and}(<n x i1>)
+ // and
+ // vector_reduce_s{min,max}(sext(<n x i1>))
+ // -->
+ // sext(vector_reduce_{or,and}(<n x i1>))
+ // and
+ // vector_reduce_s{min,max}(zext(<n x i1>))
+ // -->
+ // zext(vector_reduce_{and,or}(<n x i1>))
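+      // Editor's worked example: for <2 x i1> <1, 0>, smin of the sext'd
+      // values is min(-1, 0) == -1 (the OR, sign-extended), while smin of the
+      // zext'd values is min(1, 0) == 0 (the AND, zero-extended).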
+ Value *Arg = II->getArgOperand(0);
+ Value *Vect;
+ if (match(Arg, m_ZExtOrSExtOrSelf(m_Value(Vect)))) {
+ if (auto *FTy = dyn_cast<FixedVectorType>(Vect->getType()))
+ if (FTy->getElementType() == Builder.getInt1Ty()) {
+ Instruction::CastOps ExtOpc = Instruction::CastOps::CastOpsEnd;
+ if (Arg != Vect)
+ ExtOpc = cast<CastInst>(Arg)->getOpcode();
+ Value *Res = ((IID == Intrinsic::vector_reduce_smin) ==
+ (ExtOpc == Instruction::CastOps::ZExt))
+ ? Builder.CreateAndReduce(Vect)
+ : Builder.CreateOrReduce(Vect);
+ if (Arg != Vect)
+ Res = Builder.CreateCast(ExtOpc, Res, II->getType());
+ return replaceInstUsesWith(CI, Res);
+ }
+ }
+ }
+ LLVM_FALLTHROUGH;
+ }
case Intrinsic::vector_reduce_fmax:
case Intrinsic::vector_reduce_fmin:
case Intrinsic::vector_reduce_fadd:
@@ -2228,7 +2565,7 @@ static IntrinsicInst *findInitTrampoline(Value *Callee) {
}
void InstCombinerImpl::annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) {
- unsigned NumArgs = Call.getNumArgOperands();
+ unsigned NumArgs = Call.arg_size();
ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0));
ConstantInt *Op1C =
(NumArgs == 1) ? nullptr : dyn_cast<ConstantInt>(Call.getOperand(1));
@@ -2239,55 +2576,46 @@ void InstCombinerImpl::annotateAnyAllocSite(CallBase &Call, const TargetLibraryI
if (isMallocLikeFn(&Call, TLI) && Op0C) {
if (isOpNewLikeFn(&Call, TLI))
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableBytes(
- Call.getContext(), Op0C->getZExtValue()));
+ Call.addRetAttr(Attribute::getWithDereferenceableBytes(
+ Call.getContext(), Op0C->getZExtValue()));
else
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Op0C->getZExtValue()));
+ Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), Op0C->getZExtValue()));
} else if (isAlignedAllocLikeFn(&Call, TLI)) {
if (Op1C)
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Op1C->getZExtValue()));
+ Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), Op1C->getZExtValue()));
// Add alignment attribute if alignment is a power of two constant.
if (Op0C && Op0C->getValue().ult(llvm::Value::MaximumAlignment) &&
isKnownNonZero(Call.getOperand(1), DL, 0, &AC, &Call, &DT)) {
uint64_t AlignmentVal = Op0C->getZExtValue();
if (llvm::isPowerOf2_64(AlignmentVal)) {
- Call.removeAttribute(AttributeList::ReturnIndex, Attribute::Alignment);
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithAlignment(Call.getContext(),
- Align(AlignmentVal)));
+ Call.removeRetAttr(Attribute::Alignment);
+ Call.addRetAttr(Attribute::getWithAlignment(Call.getContext(),
+ Align(AlignmentVal)));
}
}
} else if (isReallocLikeFn(&Call, TLI) && Op1C) {
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Op1C->getZExtValue()));
+ Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), Op1C->getZExtValue()));
} else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) {
bool Overflow;
const APInt &N = Op0C->getValue();
APInt Size = N.umul_ov(Op1C->getValue(), Overflow);
if (!Overflow)
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Size.getZExtValue()));
+ Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), Size.getZExtValue()));
} else if (isStrdupLikeFn(&Call, TLI)) {
uint64_t Len = GetStringLength(Call.getOperand(0));
if (Len) {
// strdup
if (NumArgs == 1)
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Len));
+ Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), Len));
// strndup
else if (NumArgs == 2 && Op1C)
- Call.addAttribute(
- AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1)));
+ Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1)));
}
}
}
@@ -2489,7 +2817,7 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) {
// isKnownNonNull -> nonnull attribute
if (!GCR.hasRetAttr(Attribute::NonNull) &&
isKnownNonZero(DerivedPtr, DL, 0, &AC, &Call, &DT)) {
- GCR.addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ GCR.addRetAttr(Attribute::NonNull);
// We discovered new fact, re-check users.
Worklist.pushUsersToWorkList(GCR);
}
@@ -2646,19 +2974,19 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
return false; // Cannot transform this parameter value.
- if (AttrBuilder(CallerPAL.getParamAttributes(i))
+ if (AttrBuilder(CallerPAL.getParamAttrs(i))
.overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
return false; // Attribute not compatible with transformed value.
if (Call.isInAllocaArgument(i))
return false; // Cannot transform to and from inalloca.
- if (CallerPAL.hasParamAttribute(i, Attribute::SwiftError))
+ if (CallerPAL.hasParamAttr(i, Attribute::SwiftError))
return false;
// If the parameter is passed as a byval argument, then we have to have a
// sized type and the sized type has to have the same size as the old type.
- if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
+ if (ParamTy != ActTy && CallerPAL.hasParamAttr(i, Attribute::ByVal)) {
PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
if (!ParamPTy || !ParamPTy->getElementType()->isSized())
return false;
@@ -2699,7 +3027,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
// that are compatible with being a vararg call argument.
unsigned SRetIdx;
if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) &&
- SRetIdx > FT->getNumParams())
+ SRetIdx - AttributeList::FirstArgIndex >= FT->getNumParams())
return false;
}
@@ -2728,12 +3056,12 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
Args.push_back(NewArg);
// Add any parameter attributes.
- if (CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
- AttrBuilder AB(CallerPAL.getParamAttributes(i));
+ if (CallerPAL.hasParamAttr(i, Attribute::ByVal)) {
+ AttrBuilder AB(CallerPAL.getParamAttrs(i));
AB.addByValAttr(NewArg->getType()->getPointerElementType());
ArgAttrs.push_back(AttributeSet::get(Ctx, AB));
} else
- ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
+ ArgAttrs.push_back(CallerPAL.getParamAttrs(i));
}
// If the function takes more arguments than the call was taking, add them
@@ -2760,12 +3088,12 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
Args.push_back(NewArg);
// Add any parameter attributes.
- ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
+ ArgAttrs.push_back(CallerPAL.getParamAttrs(i));
}
}
}
- AttributeSet FnAttrs = CallerPAL.getFnAttributes();
+ AttributeSet FnAttrs = CallerPAL.getFnAttrs();
if (NewRetTy->isVoidTy())
Caller->setName(""); // Void type should not have a name.
@@ -2866,7 +3194,7 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call,
for (FunctionType::param_iterator I = NestFTy->param_begin(),
E = NestFTy->param_end();
I != E; ++NestArgNo, ++I) {
- AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo);
+ AttributeSet AS = NestAttrs.getParamAttrs(NestArgNo);
if (AS.hasAttribute(Attribute::Nest)) {
// Record the parameter type and any other attributes.
NestTy = *I;
@@ -2902,7 +3230,7 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call,
// Add the original argument and attributes.
NewArgs.push_back(*I);
- NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));
+ NewArgAttrs.push_back(Attrs.getParamAttrs(ArgNo));
++ArgNo;
++I;
@@ -2948,8 +3276,8 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call,
NestF : ConstantExpr::getBitCast(NestF,
PointerType::getUnqual(NewFTy));
AttributeList NewPAL =
- AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(),
- Attrs.getRetAttributes(), NewArgAttrs);
+ AttributeList::get(FTy->getContext(), Attrs.getFnAttrs(),
+ Attrs.getRetAttrs(), NewArgAttrs);
SmallVector<OperandBundleDef, 1> OpBundles;
Call.getOperandBundlesAsDefs(OpBundles);
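Aside (not part of the patch): the calloc-like fold earlier in this file only attaches a dereferenceable_or_null(N * Size) return attribute when the APInt multiplication does not overflow (umul_ov). A minimal stand-alone C++ sketch of that guard, using the GCC/Clang builtin __builtin_mul_overflow as a stand-in for APInt::umul_ov; the helper name is illustrative only:

#include <cstdint>
#include <cstdio>

// Sketch of the calloc-size guard: refuse to report a byte count (and hence
// refuse to add the attribute) when NMemb * Size wraps in 64 bits.
static bool computeCallocBytes(uint64_t NMemb, uint64_t Size, uint64_t &Bytes) {
  return !__builtin_mul_overflow(NMemb, Size, &Bytes); // true if no overflow
}

int main() {
  uint64_t Bytes;
  if (computeCallocBytes(16, 32, Bytes))
    std::printf("dereferenceable_or_null(%llu)\n", (unsigned long long)Bytes);
  if (!computeCallocBytes(UINT64_MAX, 2, Bytes))
    std::printf("overflow: no attribute added\n");
  return 0;
}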
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 04877bec94ec..ca87477c5d81 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -333,7 +333,7 @@ Instruction *InstCombinerImpl::commonCastTransforms(CastInst &CI) {
SrcTy->getNumElements() == DestTy->getNumElements() &&
SrcTy->getPrimitiveSizeInBits() == DestTy->getPrimitiveSizeInBits()) {
Value *CastX = Builder.CreateCast(CI.getOpcode(), X, DestTy);
- return new ShuffleVectorInst(CastX, UndefValue::get(DestTy), Mask);
+ return new ShuffleVectorInst(CastX, Mask);
}
}
@@ -701,10 +701,10 @@ static Instruction *shrinkSplatShuffle(TruncInst &Trunc,
if (Shuf && Shuf->hasOneUse() && match(Shuf->getOperand(1), m_Undef()) &&
is_splat(Shuf->getShuffleMask()) &&
Shuf->getType() == Shuf->getOperand(0)->getType()) {
- // trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Undef, SplatMask
- Constant *NarrowUndef = UndefValue::get(Trunc.getType());
+ // trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Poison, SplatMask
+ // trunc (shuf X, Poison, SplatMask) --> shuf (trunc X), Poison, SplatMask
Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), Trunc.getType());
- return new ShuffleVectorInst(NarrowOp, NarrowUndef, Shuf->getShuffleMask());
+ return new ShuffleVectorInst(NarrowOp, Shuf->getShuffleMask());
}
return nullptr;
@@ -961,14 +961,25 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
return BinaryOperator::CreateAdd(NarrowCtlz, WidthDiff);
}
}
+
+ if (match(Src, m_VScale(DL))) {
+ if (Trunc.getFunction() &&
+ Trunc.getFunction()->hasFnAttribute(Attribute::VScaleRange)) {
+ unsigned MaxVScale = Trunc.getFunction()
+ ->getFnAttribute(Attribute::VScaleRange)
+ .getVScaleRangeArgs()
+ .second;
+ if (MaxVScale > 0 && Log2_32(MaxVScale) < DestWidth) {
+ Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1));
+ return replaceInstUsesWith(Trunc, VScale);
+ }
+ }
+ }
+
return nullptr;
}
-/// Transform (zext icmp) to bitwise / integer operations in order to
-/// eliminate it. If DoTransform is false, just test whether the given
-/// (zext icmp) can be transformed.
-Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext,
- bool DoTransform) {
+Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext) {
// If we are just checking for a icmp eq of a single bit and zext'ing it
// to an integer, then shift the bit to the appropriate place and then
// cast to integer to avoid the comparison.
@@ -977,10 +988,8 @@ Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext,
// zext (x <s 0) to i32 --> x>>u31 true if signbit set.
// zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear.
- if ((Cmp->getPredicate() == ICmpInst::ICMP_SLT && Op1CV->isNullValue()) ||
- (Cmp->getPredicate() == ICmpInst::ICMP_SGT && Op1CV->isAllOnesValue())) {
- if (!DoTransform) return Cmp;
-
+ if ((Cmp->getPredicate() == ICmpInst::ICMP_SLT && Op1CV->isZero()) ||
+ (Cmp->getPredicate() == ICmpInst::ICMP_SGT && Op1CV->isAllOnes())) {
Value *In = Cmp->getOperand(0);
Value *Sh = ConstantInt::get(In->getType(),
In->getType()->getScalarSizeInBits() - 1);
@@ -1004,7 +1013,7 @@ Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext,
// zext (X != 0) to i32 --> X>>1 iff X has only the 2nd bit set.
// zext (X != 1) to i32 --> X^1 iff X has only the low bit set.
// zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
- if ((Op1CV->isNullValue() || Op1CV->isPowerOf2()) &&
+ if ((Op1CV->isZero() || Op1CV->isPowerOf2()) &&
// This only works for EQ and NE
Cmp->isEquality()) {
// If Op1C some other power of two, convert:
@@ -1012,10 +1021,8 @@ Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext,
APInt KnownZeroMask(~Known.Zero);
if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1?
- if (!DoTransform) return Cmp;
-
bool isNE = Cmp->getPredicate() == ICmpInst::ICMP_NE;
- if (!Op1CV->isNullValue() && (*Op1CV != KnownZeroMask)) {
+ if (!Op1CV->isZero() && (*Op1CV != KnownZeroMask)) {
// (X&4) == 2 --> false
// (X&4) != 2 --> true
Constant *Res = ConstantInt::get(Zext.getType(), isNE);
@@ -1031,7 +1038,7 @@ Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext,
In->getName() + ".lobit");
}
- if (!Op1CV->isNullValue() == isNE) { // Toggle the low bit.
+ if (!Op1CV->isZero() == isNE) { // Toggle the low bit.
Constant *One = ConstantInt::get(In->getType(), 1);
In = Builder.CreateXor(In, One);
}
@@ -1053,9 +1060,6 @@ Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext,
if (Cmp->hasOneUse() && match(Cmp->getOperand(1), m_ZeroInt()) &&
match(Cmp->getOperand(0),
m_OneUse(m_c_And(m_Shl(m_One(), m_Value(ShAmt)), m_Value(X))))) {
- if (!DoTransform)
- return Cmp;
-
if (Cmp->getPredicate() == ICmpInst::ICMP_EQ)
X = Builder.CreateNot(X);
Value *Lshr = Builder.CreateLShr(X, ShAmt);
@@ -1077,8 +1081,6 @@ Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext,
APInt KnownBits = KnownLHS.Zero | KnownLHS.One;
APInt UnknownBit = ~KnownBits;
if (UnknownBit.countPopulation() == 1) {
- if (!DoTransform) return Cmp;
-
Value *Result = Builder.CreateXor(LHS, RHS);
// Mask off any bits that are set and won't be shifted away.
@@ -1316,51 +1318,37 @@ Instruction *InstCombinerImpl::visitZExt(ZExtInst &CI) {
if (ICmpInst *Cmp = dyn_cast<ICmpInst>(Src))
return transformZExtICmp(Cmp, CI);
- BinaryOperator *SrcI = dyn_cast<BinaryOperator>(Src);
- if (SrcI && SrcI->getOpcode() == Instruction::Or) {
- // zext (or icmp, icmp) -> or (zext icmp), (zext icmp) if at least one
- // of the (zext icmp) can be eliminated. If so, immediately perform the
- // according elimination.
- ICmpInst *LHS = dyn_cast<ICmpInst>(SrcI->getOperand(0));
- ICmpInst *RHS = dyn_cast<ICmpInst>(SrcI->getOperand(1));
- if (LHS && RHS && LHS->hasOneUse() && RHS->hasOneUse() &&
- LHS->getOperand(0)->getType() == RHS->getOperand(0)->getType() &&
- (transformZExtICmp(LHS, CI, false) ||
- transformZExtICmp(RHS, CI, false))) {
- // zext (or icmp, icmp) -> or (zext icmp), (zext icmp)
- Value *LCast = Builder.CreateZExt(LHS, CI.getType(), LHS->getName());
- Value *RCast = Builder.CreateZExt(RHS, CI.getType(), RHS->getName());
- Value *Or = Builder.CreateOr(LCast, RCast, CI.getName());
- if (auto *OrInst = dyn_cast<Instruction>(Or))
- Builder.SetInsertPoint(OrInst);
-
- // Perform the elimination.
- if (auto *LZExt = dyn_cast<ZExtInst>(LCast))
- transformZExtICmp(LHS, *LZExt);
- if (auto *RZExt = dyn_cast<ZExtInst>(RCast))
- transformZExtICmp(RHS, *RZExt);
-
- return replaceInstUsesWith(CI, Or);
- }
- }
-
// zext(trunc(X) & C) -> (X & zext(C)).
Constant *C;
Value *X;
- if (SrcI &&
- match(SrcI, m_OneUse(m_And(m_Trunc(m_Value(X)), m_Constant(C)))) &&
+ if (match(Src, m_OneUse(m_And(m_Trunc(m_Value(X)), m_Constant(C)))) &&
X->getType() == CI.getType())
return BinaryOperator::CreateAnd(X, ConstantExpr::getZExt(C, CI.getType()));
// zext((trunc(X) & C) ^ C) -> ((X & zext(C)) ^ zext(C)).
Value *And;
- if (SrcI && match(SrcI, m_OneUse(m_Xor(m_Value(And), m_Constant(C)))) &&
+ if (match(Src, m_OneUse(m_Xor(m_Value(And), m_Constant(C)))) &&
match(And, m_OneUse(m_And(m_Trunc(m_Value(X)), m_Specific(C)))) &&
X->getType() == CI.getType()) {
Constant *ZC = ConstantExpr::getZExt(C, CI.getType());
return BinaryOperator::CreateXor(Builder.CreateAnd(X, ZC), ZC);
}
+ if (match(Src, m_VScale(DL))) {
+ if (CI.getFunction() &&
+ CI.getFunction()->hasFnAttribute(Attribute::VScaleRange)) {
+ unsigned MaxVScale = CI.getFunction()
+ ->getFnAttribute(Attribute::VScaleRange)
+ .getVScaleRangeArgs()
+ .second;
+ unsigned TypeWidth = Src->getType()->getScalarSizeInBits();
+ if (MaxVScale > 0 && Log2_32(MaxVScale) < TypeWidth) {
+ Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1));
+ return replaceInstUsesWith(CI, VScale);
+ }
+ }
+ }
+
return nullptr;
}
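Aside (not part of the patch): a stand-alone check of the zext(trunc(X) & C) --> X & zext(C) fold kept in the hunk above, using i32 X and an i8 truncation as a concrete instance, with plain C++ standing in for the IR:

#include <cassert>
#include <cstdint>

// zext(trunc(X) & C) == X & zext(C): both sides ignore the high bits of X
// and keep only the low byte masked by C.
int main() {
  const uint8_t TestC[] = {0x00, 0x01, 0x3C, 0x80, 0xFF};
  for (uint32_t Low = 0; Low <= 0xFFFF; ++Low) {
    // Exercise high bits of X too; they must be ignored by both forms.
    uint32_t Xs[2] = {Low, Low | 0xABCD0000u};
    for (uint32_t X : Xs) {
      for (uint8_t C : TestC) {
        uint32_t Narrow = static_cast<uint32_t>(static_cast<uint8_t>(X) & C);
        uint32_t Wide = X & static_cast<uint32_t>(C);
        assert(Narrow == Wide);
      }
    }
  }
  return 0;
}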
@@ -1605,6 +1593,32 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &CI) {
return BinaryOperator::CreateAShr(A, NewShAmt);
}
+ // Splatting a bit of constant-index across a value:
+ // sext (ashr (trunc iN X to iM), M-1) to iN --> ashr (shl X, N-M), N-1
+ // TODO: If the dest type is different, use a cast (adjust use check).
+ if (match(Src, m_OneUse(m_AShr(m_Trunc(m_Value(X)),
+ m_SpecificInt(SrcBitSize - 1)))) &&
+ X->getType() == DestTy) {
+ Constant *ShlAmtC = ConstantInt::get(DestTy, DestBitSize - SrcBitSize);
+ Constant *AshrAmtC = ConstantInt::get(DestTy, DestBitSize - 1);
+ Value *Shl = Builder.CreateShl(X, ShlAmtC);
+ return BinaryOperator::CreateAShr(Shl, AshrAmtC);
+ }
+
+ if (match(Src, m_VScale(DL))) {
+ if (CI.getFunction() &&
+ CI.getFunction()->hasFnAttribute(Attribute::VScaleRange)) {
+ unsigned MaxVScale = CI.getFunction()
+ ->getFnAttribute(Attribute::VScaleRange)
+ .getVScaleRangeArgs()
+ .second;
+ if (MaxVScale > 0 && Log2_32(MaxVScale) < (SrcBitSize - 1)) {
+ Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1));
+ return replaceInstUsesWith(CI, VScale);
+ }
+ }
+ }
+
return nullptr;
}
@@ -2060,6 +2074,19 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false);
}
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(SrcOp)) {
+ // Fold ptrtoint(gep null, x) to multiply + constant if the GEP has one use.
+ // While this can increase the number of instructions it doesn't actually
+ // increase the overall complexity since the arithmetic is just part of
+ // the GEP otherwise.
+ if (GEP->hasOneUse() &&
+ isa<ConstantPointerNull>(GEP->getPointerOperand())) {
+ return replaceInstUsesWith(CI,
+ Builder.CreateIntCast(EmitGEPOffset(GEP), Ty,
+ /*isSigned=*/false));
+ }
+ }
+
Value *Vec, *Scalar, *Index;
if (match(SrcOp, m_OneUse(m_InsertElt(m_IntToPtr(m_Value(Vec)),
m_Value(Scalar), m_Value(Index)))) &&
@@ -2133,9 +2160,9 @@ optimizeVectorResizeWithIntegerBitCasts(Value *InVal, VectorType *DestTy,
if (SrcElts > DestElts) {
// If we're shrinking the number of elements (rewriting an integer
// truncate), just shuffle in the elements corresponding to the least
- // significant bits from the input and use undef as the second shuffle
+ // significant bits from the input and use poison as the second shuffle
// input.
- V2 = UndefValue::get(SrcTy);
+ V2 = PoisonValue::get(SrcTy);
// Make sure the shuffle mask selects the "least significant bits" by
// keeping elements from back of the src vector for big endian, and from the
// front for little endian.
@@ -2528,7 +2555,7 @@ Instruction *InstCombinerImpl::optimizeBitCastFromPhi(CastInst &CI,
// As long as the user is another old PHI node, then even if we don't
// rewrite it, the PHI web we're considering won't have any users
// outside itself, so it'll be dead.
- if (OldPhiNodes.count(PHI) == 0)
+ if (!OldPhiNodes.contains(PHI))
return nullptr;
} else {
return nullptr;
@@ -2736,6 +2763,30 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
if (auto *InsElt = dyn_cast<InsertElementInst>(Src))
return new BitCastInst(InsElt->getOperand(1), DestTy);
}
+
+ // Convert an artificial vector insert into more analyzable bitwise logic.
+ unsigned BitWidth = DestTy->getScalarSizeInBits();
+ Value *X, *Y;
+ uint64_t IndexC;
+ if (match(Src, m_OneUse(m_InsertElt(m_OneUse(m_BitCast(m_Value(X))),
+ m_Value(Y), m_ConstantInt(IndexC)))) &&
+ DestTy->isIntegerTy() && X->getType() == DestTy &&
+ isDesirableIntType(BitWidth)) {
+ // Adjust for big endian - the LSBs are at the high index.
+ if (DL.isBigEndian())
+ IndexC = SrcVTy->getNumElements() - 1 - IndexC;
+
+ // We only handle (endian-normalized) insert to index 0. Any other insert
+ // would require a left-shift, so that is an extra instruction.
+ if (IndexC == 0) {
+ // bitcast (inselt (bitcast X), Y, 0) --> or (and X, MaskC), (zext Y)
+ unsigned EltWidth = Y->getType()->getScalarSizeInBits();
+ APInt MaskC = APInt::getHighBitsSet(BitWidth, BitWidth - EltWidth);
+ Value *AndX = Builder.CreateAnd(X, MaskC);
+ Value *ZextY = Builder.CreateZExt(Y, DestTy);
+ return BinaryOperator::CreateOr(AndX, ZextY);
+ }
+ }
}
if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Src)) {
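Aside (not part of the patch): the new bitcast(inselt(bitcast X), Y, 0) --> or(and X, MaskC), (zext Y) fold above can be checked on a host by viewing an i32 as a <4 x i8>. This sketch assumes a little-endian host, matching the index adjustment done via DL.isBigEndian() in the code:

#include <cassert>
#include <cstdint>
#include <cstring>

// Writing element 0 of a <4 x i8> viewed as an i32 equals the bitwise form
// produced by the combine: (X & 0xFFFFFF00) | zext(Y).
int main() {
  uint32_t X = 0xDEADBEEF;
  uint8_t Y = 0x42;

  // "insertelement" through memory: overwrite byte 0 of the i32.
  uint8_t Bytes[4];
  std::memcpy(Bytes, &X, 4);
  Bytes[0] = Y;                               // inselt (bitcast X), Y, 0
  uint32_t ViaVector;
  std::memcpy(&ViaVector, Bytes, 4);

  // Bitwise form produced by the combine.
  uint32_t ViaMask = (X & 0xFFFFFF00u) | static_cast<uint32_t>(Y);

  assert(ViaVector == ViaMask);
  return 0;
}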
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 2b0ef0c5f2cc..7a9e177f19da 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -78,15 +78,15 @@ static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) {
if (!ICmpInst::isSigned(Pred))
return false;
- if (C.isNullValue())
+ if (C.isZero())
return ICmpInst::isRelational(Pred);
- if (C.isOneValue()) {
+ if (C.isOne()) {
if (Pred == ICmpInst::ICMP_SLT) {
Pred = ICmpInst::ICMP_SLE;
return true;
}
- } else if (C.isAllOnesValue()) {
+ } else if (C.isAllOnes()) {
if (Pred == ICmpInst::ICMP_SGT) {
Pred = ICmpInst::ICMP_SGE;
return true;
@@ -541,7 +541,7 @@ static bool canRewriteGEPAsOffset(Value *Start, Value *Base,
if (!CI->isNoopCast(DL))
return false;
- if (Explored.count(CI->getOperand(0)) == 0)
+ if (!Explored.contains(CI->getOperand(0)))
WorkList.push_back(CI->getOperand(0));
}
@@ -553,7 +553,7 @@ static bool canRewriteGEPAsOffset(Value *Start, Value *Base,
GEP->getType() != Start->getType())
return false;
- if (Explored.count(GEP->getOperand(0)) == 0)
+ if (!Explored.contains(GEP->getOperand(0)))
WorkList.push_back(GEP->getOperand(0));
}
@@ -575,7 +575,7 @@ static bool canRewriteGEPAsOffset(Value *Start, Value *Base,
// Explore the PHI nodes further.
for (auto *PN : PHIs)
for (Value *Op : PN->incoming_values())
- if (Explored.count(Op) == 0)
+ if (!Explored.contains(Op))
WorkList.push_back(Op);
}
@@ -589,7 +589,7 @@ static bool canRewriteGEPAsOffset(Value *Start, Value *Base,
auto *Inst = dyn_cast<Instruction>(Val);
if (Inst == Base || Inst == PHI || !Inst || !PHI ||
- Explored.count(PHI) == 0)
+ !Explored.contains(PHI))
continue;
if (PHI->getParent() == Inst->getParent())
@@ -1147,12 +1147,12 @@ Instruction *InstCombinerImpl::foldICmpShrConstConst(ICmpInst &I, Value *A,
};
// Don't bother doing any work for cases which InstSimplify handles.
- if (AP2.isNullValue())
+ if (AP2.isZero())
return nullptr;
bool IsAShr = isa<AShrOperator>(I.getOperand(0));
if (IsAShr) {
- if (AP2.isAllOnesValue())
+ if (AP2.isAllOnes())
return nullptr;
if (AP2.isNegative() != AP1.isNegative())
return nullptr;
@@ -1178,7 +1178,7 @@ Instruction *InstCombinerImpl::foldICmpShrConstConst(ICmpInst &I, Value *A,
if (IsAShr && AP1 == AP2.ashr(Shift)) {
// There are multiple solutions if we are comparing against -1 and the LHS
// of the ashr is not a power of two.
- if (AP1.isAllOnesValue() && !AP2.isPowerOf2())
+ if (AP1.isAllOnes() && !AP2.isPowerOf2())
return getICmp(I.ICMP_UGE, A, ConstantInt::get(A->getType(), Shift));
return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
} else if (AP1 == AP2.lshr(Shift)) {
@@ -1206,7 +1206,7 @@ Instruction *InstCombinerImpl::foldICmpShlConstConst(ICmpInst &I, Value *A,
};
// Don't bother doing any work for cases which InstSimplify handles.
- if (AP2.isNullValue())
+ if (AP2.isZero())
return nullptr;
unsigned AP2TrailingZeros = AP2.countTrailingZeros();
@@ -1270,9 +1270,8 @@ static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
// This is only really a signed overflow check if the inputs have been
// sign-extended; check for that condition. For example, if CI2 is 2^31 and
// the operands of the add are 64 bits wide, we need at least 33 sign bits.
- unsigned NeededSignBits = CI1->getBitWidth() - NewWidth + 1;
- if (IC.ComputeNumSignBits(A, 0, &I) < NeededSignBits ||
- IC.ComputeNumSignBits(B, 0, &I) < NeededSignBits)
+ if (IC.ComputeMinSignedBits(A, 0, &I) > NewWidth ||
+ IC.ComputeMinSignedBits(B, 0, &I) > NewWidth)
return nullptr;
// In order to replace the original add with a narrower
@@ -1544,7 +1543,7 @@ Instruction *InstCombinerImpl::foldICmpTruncConstant(ICmpInst &Cmp,
const APInt &C) {
ICmpInst::Predicate Pred = Cmp.getPredicate();
Value *X = Trunc->getOperand(0);
- if (C.isOneValue() && C.getBitWidth() > 1) {
+ if (C.isOne() && C.getBitWidth() > 1) {
// icmp slt trunc(signum(V)) 1 --> icmp slt V, 1
Value *V = nullptr;
if (Pred == ICmpInst::ICMP_SLT && match(X, m_Signum(m_Value(V))))
@@ -1725,7 +1724,7 @@ Instruction *InstCombinerImpl::foldICmpAndShift(ICmpInst &Cmp,
// Turn ((X >> Y) & C2) == 0 into (X & (C2 << Y)) == 0. The latter is
// preferable because it allows the C2 << Y expression to be hoisted out of a
// loop if Y is invariant and X is not.
- if (Shift->hasOneUse() && C1.isNullValue() && Cmp.isEquality() &&
+ if (Shift->hasOneUse() && C1.isZero() && Cmp.isEquality() &&
!Shift->isArithmeticShift() && !isa<Constant>(Shift->getOperand(0))) {
// Compute C2 << Y.
Value *NewShift =
@@ -1749,7 +1748,7 @@ Instruction *InstCombinerImpl::foldICmpAndConstConst(ICmpInst &Cmp,
// For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1
// TODO: We canonicalize to the longer form for scalars because we have
// better analysis/folds for icmp, and codegen may be better with icmp.
- if (isICMP_NE && Cmp.getType()->isVectorTy() && C1.isNullValue() &&
+ if (isICMP_NE && Cmp.getType()->isVectorTy() && C1.isZero() &&
match(And->getOperand(1), m_One()))
return new TruncInst(And->getOperand(0), Cmp.getType());
@@ -1762,7 +1761,7 @@ Instruction *InstCombinerImpl::foldICmpAndConstConst(ICmpInst &Cmp,
if (!And->hasOneUse())
return nullptr;
- if (Cmp.isEquality() && C1.isNullValue()) {
+ if (Cmp.isEquality() && C1.isZero()) {
// Restrict this fold to single-use 'and' (PR10267).
// Replace (and X, (1 << size(X)-1) != 0) with X s< 0
if (C2->isSignMask()) {
@@ -1812,7 +1811,7 @@ Instruction *InstCombinerImpl::foldICmpAndConstConst(ICmpInst &Cmp,
// (icmp pred (and A, (or (shl 1, B), 1), 0))
//
// iff pred isn't signed
- if (!Cmp.isSigned() && C1.isNullValue() && And->getOperand(0)->hasOneUse() &&
+ if (!Cmp.isSigned() && C1.isZero() && And->getOperand(0)->hasOneUse() &&
match(And->getOperand(1), m_One())) {
Constant *One = cast<Constant>(And->getOperand(1));
Value *Or = And->getOperand(0);
@@ -1889,7 +1888,7 @@ Instruction *InstCombinerImpl::foldICmpAndConstant(ICmpInst &Cmp,
// X & -C == -C -> X > u ~C
// X & -C != -C -> X <= u ~C
// iff C is a power of 2
- if (Cmp.getOperand(1) == Y && (-C).isPowerOf2()) {
+ if (Cmp.getOperand(1) == Y && C.isNegatedPowerOf2()) {
auto NewPred =
Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGT : CmpInst::ICMP_ULE;
return new ICmpInst(NewPred, X, SubOne(cast<Constant>(Cmp.getOperand(1))));
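Aside (not part of the patch): the isNegatedPowerOf2() condition above covers the comment's "X & -C == -C -> X >u ~C" pattern. A stand-alone exhaustive check on i8 with C = 8 (so the mask -C is 0xF8 and ~C == mask - 1, which is what SubOne produces):

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t MaskC = 0xF8;              // -C with C = 8, a power of 2
  const uint8_t NotC = 0xF7;               // ~C == MaskC - 1
  for (unsigned V = 0; V <= 0xFF; ++V) {
    uint8_t X = static_cast<uint8_t>(V);
    bool AndForm = (X & MaskC) == MaskC;   // X & -C == -C
    bool CmpForm = X > NotC;               // X >u ~C
    assert(AndForm == CmpForm);
  }
  return 0;
}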
@@ -1899,7 +1898,7 @@ Instruction *InstCombinerImpl::foldICmpAndConstant(ICmpInst &Cmp,
// (X & C2) != 0 -> (trunc X) < 0
// iff C2 is a power of 2 and it masks the sign bit of a legal integer type.
const APInt *C2;
- if (And->hasOneUse() && C.isNullValue() && match(Y, m_APInt(C2))) {
+ if (And->hasOneUse() && C.isZero() && match(Y, m_APInt(C2))) {
int32_t ExactLogBase2 = C2->exactLogBase2();
if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) {
Type *NTy = IntegerType::get(Cmp.getContext(), ExactLogBase2 + 1);
@@ -1920,7 +1919,7 @@ Instruction *InstCombinerImpl::foldICmpOrConstant(ICmpInst &Cmp,
BinaryOperator *Or,
const APInt &C) {
ICmpInst::Predicate Pred = Cmp.getPredicate();
- if (C.isOneValue()) {
+ if (C.isOne()) {
// icmp slt signum(V) 1 --> icmp slt V, 1
Value *V = nullptr;
if (Pred == ICmpInst::ICMP_SLT && match(Or, m_Signum(m_Value(V))))
@@ -1950,7 +1949,18 @@ Instruction *InstCombinerImpl::foldICmpOrConstant(ICmpInst &Cmp,
}
}
- if (!Cmp.isEquality() || !C.isNullValue() || !Or->hasOneUse())
+ // (X | (X-1)) s< 0 --> X s< 1
+ // (X | (X-1)) s> -1 --> X s> 0
+ Value *X;
+ bool TrueIfSigned;
+ if (isSignBitCheck(Pred, C, TrueIfSigned) &&
+ match(Or, m_c_Or(m_Add(m_Value(X), m_AllOnes()), m_Deferred(X)))) {
+ auto NewPred = TrueIfSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGT;
+ Constant *NewC = ConstantInt::get(X->getType(), TrueIfSigned ? 1 : 0);
+ return new ICmpInst(NewPred, X, NewC);
+ }
+
+ if (!Cmp.isEquality() || !C.isZero() || !Or->hasOneUse())
return nullptr;
Value *P, *Q;
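Aside (not part of the patch): the (X | (X-1)) sign-bit fold added in this hunk collapses to a compare on X itself. An exhaustive i8 check, assuming the usual two's-complement narrowing (guaranteed since C++20) so that X - 1 wraps like the IR "add X, -1":

#include <cassert>
#include <cstdint>

int main() {
  for (int V = -128; V <= 127; ++V) {
    int8_t X = static_cast<int8_t>(V);
    int8_t Dec = static_cast<int8_t>(X - 1);           // X + (-1), wrapping
    int8_t Or = static_cast<int8_t>(X | Dec);
    assert((Or < 0) == (X < 1));     // (X | (X-1)) s< 0  --> X s< 1
    assert((Or > -1) == (X > 0));    // (X | (X-1)) s> -1 --> X s> 0
  }
  return 0;
}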
@@ -2001,14 +2011,14 @@ Instruction *InstCombinerImpl::foldICmpMulConstant(ICmpInst &Cmp,
// If the multiply does not wrap, try to divide the compare constant by the
// multiplication factor.
- if (Cmp.isEquality() && !MulC->isNullValue()) {
+ if (Cmp.isEquality() && !MulC->isZero()) {
// (mul nsw X, MulC) == C --> X == C /s MulC
- if (Mul->hasNoSignedWrap() && C.srem(*MulC).isNullValue()) {
+ if (Mul->hasNoSignedWrap() && C.srem(*MulC).isZero()) {
Constant *NewC = ConstantInt::get(Mul->getType(), C.sdiv(*MulC));
return new ICmpInst(Pred, Mul->getOperand(0), NewC);
}
// (mul nuw X, MulC) == C --> X == C /u MulC
- if (Mul->hasNoUnsignedWrap() && C.urem(*MulC).isNullValue()) {
+ if (Mul->hasNoUnsignedWrap() && C.urem(*MulC).isZero()) {
Constant *NewC = ConstantInt::get(Mul->getType(), C.udiv(*MulC));
return new ICmpInst(Pred, Mul->getOperand(0), NewC);
}
@@ -2053,7 +2063,7 @@ static Instruction *foldICmpShlOne(ICmpInst &Cmp, Instruction *Shl,
return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, CLog2));
} else if (Cmp.isSigned()) {
Constant *BitWidthMinusOne = ConstantInt::get(ShiftType, TypeBits - 1);
- if (C.isAllOnesValue()) {
+ if (C.isAllOnes()) {
// (1 << Y) <= -1 -> Y == 31
if (Pred == ICmpInst::ICMP_SLE)
return new ICmpInst(ICmpInst::ICMP_EQ, Y, BitWidthMinusOne);
@@ -2227,8 +2237,7 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp,
// icmp eq/ne (shr X, Y), 0 --> icmp eq/ne X, 0
Value *X = Shr->getOperand(0);
CmpInst::Predicate Pred = Cmp.getPredicate();
- if (Cmp.isEquality() && Shr->isExact() && Shr->hasOneUse() &&
- C.isNullValue())
+ if (Cmp.isEquality() && Shr->isExact() && Shr->hasOneUse() && C.isZero())
return new ICmpInst(Pred, X, Cmp.getOperand(1));
const APInt *ShiftVal;
@@ -2316,7 +2325,7 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp,
if (Shr->isExact())
return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, C << ShAmtVal));
- if (C.isNullValue()) {
+ if (C.isZero()) {
// == 0 is u< 1.
if (Pred == CmpInst::ICMP_EQ)
return new ICmpInst(CmpInst::ICMP_ULT, X,
@@ -2355,7 +2364,7 @@ Instruction *InstCombinerImpl::foldICmpSRemConstant(ICmpInst &Cmp,
return nullptr;
const APInt *DivisorC;
- if (!C.isNullValue() || !match(SRem->getOperand(1), m_Power2(DivisorC)))
+ if (!C.isZero() || !match(SRem->getOperand(1), m_Power2(DivisorC)))
return nullptr;
// Mask off the sign bit and the modulo bits (low-bits).
@@ -2435,8 +2444,7 @@ Instruction *InstCombinerImpl::foldICmpDivConstant(ICmpInst &Cmp,
// INT_MIN will also fail if the divisor is 1. Although folds of all these
// division-by-constant cases should be present, we can not assert that they
// have happened before we reach this icmp instruction.
- if (C2->isNullValue() || C2->isOneValue() ||
- (DivIsSigned && C2->isAllOnesValue()))
+ if (C2->isZero() || C2->isOne() || (DivIsSigned && C2->isAllOnes()))
return nullptr;
// Compute Prod = C * C2. We are essentially solving an equation of
@@ -2476,16 +2484,16 @@ Instruction *InstCombinerImpl::foldICmpDivConstant(ICmpInst &Cmp,
HiOverflow = addWithOverflow(HiBound, LoBound, RangeSize, false);
}
} else if (C2->isStrictlyPositive()) { // Divisor is > 0.
- if (C.isNullValue()) { // (X / pos) op 0
+ if (C.isZero()) { // (X / pos) op 0
// Can't overflow. e.g. X/2 op 0 --> [-1, 2)
LoBound = -(RangeSize - 1);
HiBound = RangeSize;
- } else if (C.isStrictlyPositive()) { // (X / pos) op pos
+ } else if (C.isStrictlyPositive()) { // (X / pos) op pos
LoBound = Prod; // e.g. X/5 op 3 --> [15, 20)
HiOverflow = LoOverflow = ProdOV;
if (!HiOverflow)
HiOverflow = addWithOverflow(HiBound, Prod, RangeSize, true);
- } else { // (X / pos) op neg
+ } else { // (X / pos) op neg
// e.g. X/5 op -3 --> [-15-4, -15+1) --> [-19, -14)
HiBound = Prod + 1;
LoOverflow = HiOverflow = ProdOV ? -1 : 0;
@@ -2497,7 +2505,7 @@ Instruction *InstCombinerImpl::foldICmpDivConstant(ICmpInst &Cmp,
} else if (C2->isNegative()) { // Divisor is < 0.
if (Div->isExact())
RangeSize.negate();
- if (C.isNullValue()) { // (X / neg) op 0
+ if (C.isZero()) { // (X / neg) op 0
// e.g. X/-5 op 0 --> [-4, 5)
LoBound = RangeSize + 1;
HiBound = -RangeSize;
@@ -2505,13 +2513,13 @@ Instruction *InstCombinerImpl::foldICmpDivConstant(ICmpInst &Cmp,
HiOverflow = 1; // [INTMIN+1, overflow)
HiBound = APInt(); // e.g. X/INTMIN = 0 --> X > INTMIN
}
- } else if (C.isStrictlyPositive()) { // (X / neg) op pos
+ } else if (C.isStrictlyPositive()) { // (X / neg) op pos
// e.g. X/-5 op 3 --> [-19, -14)
HiBound = Prod + 1;
HiOverflow = LoOverflow = ProdOV ? -1 : 0;
if (!LoOverflow)
LoOverflow = addWithOverflow(LoBound, HiBound, RangeSize, true) ? -1:0;
- } else { // (X / neg) op neg
+ } else { // (X / neg) op neg
LoBound = Prod; // e.g. X/-5 op -3 --> [15, 20)
LoOverflow = HiOverflow = ProdOV;
if (!HiOverflow)
@@ -2581,42 +2589,54 @@ Instruction *InstCombinerImpl::foldICmpSubConstant(ICmpInst &Cmp,
const APInt &C) {
Value *X = Sub->getOperand(0), *Y = Sub->getOperand(1);
ICmpInst::Predicate Pred = Cmp.getPredicate();
- const APInt *C2;
- APInt SubResult;
+ Type *Ty = Sub->getType();
- // icmp eq/ne (sub C, Y), C -> icmp eq/ne Y, 0
- if (match(X, m_APInt(C2)) && *C2 == C && Cmp.isEquality())
- return new ICmpInst(Cmp.getPredicate(), Y,
- ConstantInt::get(Y->getType(), 0));
+ // (SubC - Y) == C) --> Y == (SubC - C)
+ // (SubC - Y) != C) --> Y != (SubC - C)
+ Constant *SubC;
+ if (Cmp.isEquality() && match(X, m_ImmConstant(SubC))) {
+ return new ICmpInst(Pred, Y,
+ ConstantExpr::getSub(SubC, ConstantInt::get(Ty, C)));
+ }
// (icmp P (sub nuw|nsw C2, Y), C) -> (icmp swap(P) Y, C2-C)
+ const APInt *C2;
+ APInt SubResult;
+ ICmpInst::Predicate SwappedPred = Cmp.getSwappedPredicate();
+ bool HasNSW = Sub->hasNoSignedWrap();
+ bool HasNUW = Sub->hasNoUnsignedWrap();
if (match(X, m_APInt(C2)) &&
- ((Cmp.isUnsigned() && Sub->hasNoUnsignedWrap()) ||
- (Cmp.isSigned() && Sub->hasNoSignedWrap())) &&
+ ((Cmp.isUnsigned() && HasNUW) || (Cmp.isSigned() && HasNSW)) &&
!subWithOverflow(SubResult, *C2, C, Cmp.isSigned()))
- return new ICmpInst(Cmp.getSwappedPredicate(), Y,
- ConstantInt::get(Y->getType(), SubResult));
+ return new ICmpInst(SwappedPred, Y, ConstantInt::get(Ty, SubResult));
// The following transforms are only worth it if the only user of the subtract
// is the icmp.
+ // TODO: This is an artificial restriction for all of the transforms below
+ // that only need a single replacement icmp.
if (!Sub->hasOneUse())
return nullptr;
+ // X - Y == 0 --> X == Y.
+ // X - Y != 0 --> X != Y.
+ if (Cmp.isEquality() && C.isZero())
+ return new ICmpInst(Pred, X, Y);
+
if (Sub->hasNoSignedWrap()) {
// (icmp sgt (sub nsw X, Y), -1) -> (icmp sge X, Y)
- if (Pred == ICmpInst::ICMP_SGT && C.isAllOnesValue())
+ if (Pred == ICmpInst::ICMP_SGT && C.isAllOnes())
return new ICmpInst(ICmpInst::ICMP_SGE, X, Y);
// (icmp sgt (sub nsw X, Y), 0) -> (icmp sgt X, Y)
- if (Pred == ICmpInst::ICMP_SGT && C.isNullValue())
+ if (Pred == ICmpInst::ICMP_SGT && C.isZero())
return new ICmpInst(ICmpInst::ICMP_SGT, X, Y);
// (icmp slt (sub nsw X, Y), 0) -> (icmp slt X, Y)
- if (Pred == ICmpInst::ICMP_SLT && C.isNullValue())
+ if (Pred == ICmpInst::ICMP_SLT && C.isZero())
return new ICmpInst(ICmpInst::ICMP_SLT, X, Y);
// (icmp slt (sub nsw X, Y), 1) -> (icmp sle X, Y)
- if (Pred == ICmpInst::ICMP_SLT && C.isOneValue())
+ if (Pred == ICmpInst::ICMP_SLT && C.isOne())
return new ICmpInst(ICmpInst::ICMP_SLE, X, Y);
}
@@ -2634,7 +2654,12 @@ Instruction *InstCombinerImpl::foldICmpSubConstant(ICmpInst &Cmp,
if (Pred == ICmpInst::ICMP_UGT && (C + 1).isPowerOf2() && (*C2 & C) == C)
return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateOr(Y, C), X);
- return nullptr;
+ // We have handled special cases that reduce.
+ // Canonicalize any remaining sub to add as:
+ // (C2 - Y) > C --> (Y + ~C2) < ~C
+ Value *Add = Builder.CreateAdd(Y, ConstantInt::get(Ty, ~(*C2)), "notsub",
+ HasNUW, HasNSW);
+ return new ICmpInst(SwappedPred, Add, ConstantInt::get(Ty, ~C));
}
/// Fold icmp (add X, Y), C.
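Aside (not part of the patch): the "(C2 - Y) > C --> (Y + ~C2) < ~C" canonicalization above rests on two facts for the unsigned case: Y + ~C2 is exactly ~(C2 - Y) in two's complement, and bitwise NOT reverses unsigned order (so the predicate is swapped). A stand-alone i8 check:

#include <cassert>
#include <cstdint>

int main() {
  // NOT reverses unsigned order: A >u C  <->  ~A <u ~C.
  for (unsigned A = 0; A <= 0xFF; ++A)
    for (unsigned C = 0; C <= 0xFF; ++C)
      assert((A > C) == (static_cast<uint8_t>(~A) < static_cast<uint8_t>(~C)));

  // Spot-check the rewritten form itself on i8.
  uint8_t C2 = 100, Y = 37, C = 50;
  bool SubForm = static_cast<uint8_t>(C2 - Y) > C;                         // (C2 - Y) >u C
  bool AddForm = static_cast<uint8_t>(Y + static_cast<uint8_t>(~C2)) <
                 static_cast<uint8_t>(~C);                                 // (Y + ~C2) <u ~C
  assert(SubForm == AddForm);
  return 0;
}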
@@ -2723,6 +2748,14 @@ Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp,
return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateAnd(X, ~C),
ConstantExpr::getNeg(cast<Constant>(Y)));
+ // The range test idiom can use either ult or ugt. Arbitrarily canonicalize
+ // to the ult form.
+ // X+C2 >u C -> X+(C2-C-1) <u ~C
+ if (Pred == ICmpInst::ICMP_UGT)
+ return new ICmpInst(ICmpInst::ICMP_ULT,
+ Builder.CreateAdd(X, ConstantInt::get(Ty, *C2 - C - 1)),
+ ConstantInt::get(Ty, ~C));
+
return nullptr;
}
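Aside (not part of the patch): the range-test canonicalization added above, "X+C2 >u C -> X+(C2-C-1) <u ~C", checked exhaustively on i8 for one (C2, C) pair chosen arbitrarily:

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t C2 = 0x30, C = 0x9C;
  const uint8_t NewC2 = static_cast<uint8_t>(C2 - C - 1);
  const uint8_t NotC = static_cast<uint8_t>(~C);
  for (unsigned V = 0; V <= 0xFF; ++V) {
    uint8_t X = static_cast<uint8_t>(V);
    bool Ugt = static_cast<uint8_t>(X + C2) > C;       // X+C2 >u C
    bool Ult = static_cast<uint8_t>(X + NewC2) < NotC; // X+(C2-C-1) <u ~C
    assert(Ugt == Ult);
  }
  return 0;
}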
@@ -2830,8 +2863,7 @@ Instruction *InstCombinerImpl::foldICmpSelectConstant(ICmpInst &Cmp,
return nullptr;
}
-static Instruction *foldICmpBitCast(ICmpInst &Cmp,
- InstCombiner::BuilderTy &Builder) {
+Instruction *InstCombinerImpl::foldICmpBitCast(ICmpInst &Cmp) {
auto *Bitcast = dyn_cast<BitCastInst>(Cmp.getOperand(0));
if (!Bitcast)
return nullptr;
@@ -2917,6 +2949,39 @@ static Instruction *foldICmpBitCast(ICmpInst &Cmp,
return new ICmpInst(Pred, BCSrcOp, Op1);
}
+ const APInt *C;
+ if (!match(Cmp.getOperand(1), m_APInt(C)) ||
+ !Bitcast->getType()->isIntegerTy() ||
+ !Bitcast->getSrcTy()->isIntOrIntVectorTy())
+ return nullptr;
+
+ // If this is checking if all elements of a vector compare are set or not,
+ // invert the casted vector equality compare and test if all compare
+ // elements are clear or not. Compare against zero is generally easier for
+ // analysis and codegen.
+ // icmp eq/ne (bitcast (not X) to iN), -1 --> icmp eq/ne (bitcast X to iN), 0
+ // Example: are all elements equal? --> are zero elements not equal?
+ // TODO: Try harder to reduce compare of 2 freely invertible operands?
+ if (Cmp.isEquality() && C->isAllOnes() && Bitcast->hasOneUse() &&
+ isFreeToInvert(BCSrcOp, BCSrcOp->hasOneUse())) {
+ Type *ScalarTy = Bitcast->getType();
+ Value *Cast = Builder.CreateBitCast(Builder.CreateNot(BCSrcOp), ScalarTy);
+ return new ICmpInst(Pred, Cast, ConstantInt::getNullValue(ScalarTy));
+ }
+
+ // If this is checking if all elements of an extended vector are clear or not,
+ // compare in a narrow type to eliminate the extend:
+ // icmp eq/ne (bitcast (ext X) to iN), 0 --> icmp eq/ne (bitcast X to iM), 0
+ Value *X;
+ if (Cmp.isEquality() && C->isZero() && Bitcast->hasOneUse() &&
+ match(BCSrcOp, m_ZExtOrSExt(m_Value(X)))) {
+ if (auto *VecTy = dyn_cast<FixedVectorType>(X->getType())) {
+ Type *NewType = Builder.getIntNTy(VecTy->getPrimitiveSizeInBits());
+ Value *NewCast = Builder.CreateBitCast(X, NewType);
+ return new ICmpInst(Pred, NewCast, ConstantInt::getNullValue(NewType));
+ }
+ }
+
// Folding: icmp <pred> iN X, C
// where X = bitcast <M x iK> (shufflevector <M x iK> %vec, undef, SC)) to iN
// and C is a splat of a K-bit pattern
@@ -2924,12 +2989,6 @@ static Instruction *foldICmpBitCast(ICmpInst &Cmp,
// Into:
// %E = extractelement <M x iK> %vec, i32 C'
// icmp <pred> iK %E, trunc(C)
- const APInt *C;
- if (!match(Cmp.getOperand(1), m_APInt(C)) ||
- !Bitcast->getType()->isIntegerTy() ||
- !Bitcast->getSrcTy()->isIntOrIntVectorTy())
- return nullptr;
-
Value *Vec;
ArrayRef<int> Mask;
if (match(BCSrcOp, m_Shuffle(m_Value(Vec), m_Undef(), m_Mask(Mask)))) {
@@ -3055,7 +3114,7 @@ Instruction *InstCombinerImpl::foldICmpBinOpEqualityWithConstant(
switch (BO->getOpcode()) {
case Instruction::SRem:
// If we have a signed (X % (2^c)) == 0, turn it into an unsigned one.
- if (C.isNullValue() && BO->hasOneUse()) {
+ if (C.isZero() && BO->hasOneUse()) {
const APInt *BOC;
if (match(BOp1, m_APInt(BOC)) && BOC->sgt(1) && BOC->isPowerOf2()) {
Value *NewRem = Builder.CreateURem(BOp0, BOp1, BO->getName());
@@ -3069,7 +3128,7 @@ Instruction *InstCombinerImpl::foldICmpBinOpEqualityWithConstant(
if (Constant *BOC = dyn_cast<Constant>(BOp1)) {
if (BO->hasOneUse())
return new ICmpInst(Pred, BOp0, ConstantExpr::getSub(RHS, BOC));
- } else if (C.isNullValue()) {
+ } else if (C.isZero()) {
// Replace ((add A, B) != 0) with (A != -B) if A or B is
// efficiently invertible, or if the add has just this one use.
if (Value *NegVal = dyn_castNegVal(BOp1))
@@ -3090,25 +3149,12 @@ Instruction *InstCombinerImpl::foldICmpBinOpEqualityWithConstant(
// For the xor case, we can xor two constants together, eliminating
// the explicit xor.
return new ICmpInst(Pred, BOp0, ConstantExpr::getXor(RHS, BOC));
- } else if (C.isNullValue()) {
+ } else if (C.isZero()) {
// Replace ((xor A, B) != 0) with (A != B)
return new ICmpInst(Pred, BOp0, BOp1);
}
}
break;
- case Instruction::Sub:
- if (BO->hasOneUse()) {
- // Only check for constant LHS here, as constant RHS will be canonicalized
- // to add and use the fold above.
- if (Constant *BOC = dyn_cast<Constant>(BOp0)) {
- // Replace ((sub BOC, B) != C) with (B != BOC-C).
- return new ICmpInst(Pred, BOp1, ConstantExpr::getSub(BOC, RHS));
- } else if (C.isNullValue()) {
- // Replace ((sub A, B) != 0) with (A != B).
- return new ICmpInst(Pred, BOp0, BOp1);
- }
- }
- break;
case Instruction::Or: {
const APInt *BOC;
if (match(BOp1, m_APInt(BOC)) && BO->hasOneUse() && RHS->isAllOnesValue()) {
@@ -3132,7 +3178,7 @@ Instruction *InstCombinerImpl::foldICmpBinOpEqualityWithConstant(
break;
}
case Instruction::UDiv:
- if (C.isNullValue()) {
+ if (C.isZero()) {
// (icmp eq/ne (udiv A, B), 0) -> (icmp ugt/ule i32 B, A)
auto NewPred = isICMP_NE ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT;
return new ICmpInst(NewPred, BOp1, BOp0);
@@ -3149,25 +3195,26 @@ Instruction *InstCombinerImpl::foldICmpEqIntrinsicWithConstant(
ICmpInst &Cmp, IntrinsicInst *II, const APInt &C) {
Type *Ty = II->getType();
unsigned BitWidth = C.getBitWidth();
+ const ICmpInst::Predicate Pred = Cmp.getPredicate();
+
switch (II->getIntrinsicID()) {
case Intrinsic::abs:
// abs(A) == 0 -> A == 0
// abs(A) == INT_MIN -> A == INT_MIN
- if (C.isNullValue() || C.isMinSignedValue())
- return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
- ConstantInt::get(Ty, C));
+ if (C.isZero() || C.isMinSignedValue())
+ return new ICmpInst(Pred, II->getArgOperand(0), ConstantInt::get(Ty, C));
break;
case Intrinsic::bswap:
// bswap(A) == C -> A == bswap(C)
- return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
+ return new ICmpInst(Pred, II->getArgOperand(0),
ConstantInt::get(Ty, C.byteSwap()));
case Intrinsic::ctlz:
case Intrinsic::cttz: {
// ctz(A) == bitwidth(A) -> A == 0 and likewise for !=
if (C == BitWidth)
- return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
+ return new ICmpInst(Pred, II->getArgOperand(0),
ConstantInt::getNullValue(Ty));
// ctz(A) == C -> A & Mask1 == Mask2, where Mask2 only has bit C set
@@ -3181,9 +3228,8 @@ Instruction *InstCombinerImpl::foldICmpEqIntrinsicWithConstant(
APInt Mask2 = IsTrailing
? APInt::getOneBitSet(BitWidth, Num)
: APInt::getOneBitSet(BitWidth, BitWidth - Num - 1);
- return new ICmpInst(Cmp.getPredicate(),
- Builder.CreateAnd(II->getArgOperand(0), Mask1),
- ConstantInt::get(Ty, Mask2));
+ return new ICmpInst(Pred, Builder.CreateAnd(II->getArgOperand(0), Mask1),
+ ConstantInt::get(Ty, Mask2));
}
break;
}
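Aside (not part of the patch): for the cttz case of the fold above, "ctz(A) == C -> A & Mask1 == Mask2". Mask1 is defined just outside the shown context; this sketch assumes it is the low Num+1-bit mask implied by that comment, and uses the GCC/Clang builtin __builtin_ctz as a stand-in for the cttz intrinsic:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned Num = 3;
  const uint8_t Mask1 = 0x0F;              // assumed: low Num+1 bits
  const uint8_t Mask2 = 1u << Num;         // only bit Num set
  for (unsigned V = 1; V <= 0xFF; ++V) {   // skip 0: __builtin_ctz(0) is UB
    uint8_t A = static_cast<uint8_t>(V);
    bool CttzForm = static_cast<unsigned>(__builtin_ctz(A)) == Num;
    bool MaskForm = (A & Mask1) == Mask2;
    assert(CttzForm == MaskForm);
  }
  return 0;
}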
@@ -3191,28 +3237,49 @@ Instruction *InstCombinerImpl::foldICmpEqIntrinsicWithConstant(
case Intrinsic::ctpop: {
// popcount(A) == 0 -> A == 0 and likewise for !=
// popcount(A) == bitwidth(A) -> A == -1 and likewise for !=
- bool IsZero = C.isNullValue();
+ bool IsZero = C.isZero();
if (IsZero || C == BitWidth)
- return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
- IsZero ? Constant::getNullValue(Ty) : Constant::getAllOnesValue(Ty));
+ return new ICmpInst(Pred, II->getArgOperand(0),
+ IsZero ? Constant::getNullValue(Ty)
+ : Constant::getAllOnesValue(Ty));
break;
}
+ case Intrinsic::fshl:
+ case Intrinsic::fshr:
+ if (II->getArgOperand(0) == II->getArgOperand(1)) {
+ // (rot X, ?) == 0/-1 --> X == 0/-1
+ // TODO: This transform is safe to re-use undef elts in a vector, but
+ // the constant value passed in by the caller doesn't allow that.
+ if (C.isZero() || C.isAllOnes())
+ return new ICmpInst(Pred, II->getArgOperand(0), Cmp.getOperand(1));
+
+ const APInt *RotAmtC;
+ // ror(X, RotAmtC) == C --> X == rol(C, RotAmtC)
+ // rol(X, RotAmtC) == C --> X == ror(C, RotAmtC)
+ if (match(II->getArgOperand(2), m_APInt(RotAmtC)))
+ return new ICmpInst(Pred, II->getArgOperand(0),
+ II->getIntrinsicID() == Intrinsic::fshl
+ ? ConstantInt::get(Ty, C.rotr(*RotAmtC))
+ : ConstantInt::get(Ty, C.rotl(*RotAmtC)));
+ }
+ break;
+
case Intrinsic::uadd_sat: {
// uadd.sat(a, b) == 0 -> (a | b) == 0
- if (C.isNullValue()) {
+ if (C.isZero()) {
Value *Or = Builder.CreateOr(II->getArgOperand(0), II->getArgOperand(1));
- return new ICmpInst(Cmp.getPredicate(), Or, Constant::getNullValue(Ty));
+ return new ICmpInst(Pred, Or, Constant::getNullValue(Ty));
}
break;
}
case Intrinsic::usub_sat: {
// usub.sat(a, b) == 0 -> a <= b
- if (C.isNullValue()) {
- ICmpInst::Predicate NewPred = Cmp.getPredicate() == ICmpInst::ICMP_EQ
- ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT;
+ if (C.isZero()) {
+ ICmpInst::Predicate NewPred =
+ Pred == ICmpInst::ICMP_EQ ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT;
return new ICmpInst(NewPred, II->getArgOperand(0), II->getArgOperand(1));
}
break;
@@ -3224,6 +3291,42 @@ Instruction *InstCombinerImpl::foldICmpEqIntrinsicWithConstant(
return nullptr;
}
+/// Fold an icmp with LLVM intrinsics
+static Instruction *foldICmpIntrinsicWithIntrinsic(ICmpInst &Cmp) {
+ assert(Cmp.isEquality());
+
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *Op0 = Cmp.getOperand(0);
+ Value *Op1 = Cmp.getOperand(1);
+ const auto *IIOp0 = dyn_cast<IntrinsicInst>(Op0);
+ const auto *IIOp1 = dyn_cast<IntrinsicInst>(Op1);
+ if (!IIOp0 || !IIOp1 || IIOp0->getIntrinsicID() != IIOp1->getIntrinsicID())
+ return nullptr;
+
+ switch (IIOp0->getIntrinsicID()) {
+ case Intrinsic::bswap:
+ case Intrinsic::bitreverse:
+ // If both operands are byte-swapped or bit-reversed, just compare the
+ // original values.
+ return new ICmpInst(Pred, IIOp0->getOperand(0), IIOp1->getOperand(0));
+ case Intrinsic::fshl:
+ case Intrinsic::fshr:
+ // If both operands are rotated by same amount, just compare the
+ // original values.
+ if (IIOp0->getOperand(0) != IIOp0->getOperand(1))
+ break;
+ if (IIOp1->getOperand(0) != IIOp1->getOperand(1))
+ break;
+ if (IIOp0->getOperand(2) != IIOp1->getOperand(2))
+ break;
+ return new ICmpInst(Pred, IIOp0->getOperand(0), IIOp1->getOperand(0));
+ default:
+ break;
+ }
+
+ return nullptr;
+}
+
/// Fold an icmp with LLVM intrinsic and constant operand: icmp Pred II, C.
Instruction *InstCombinerImpl::foldICmpIntrinsicWithConstant(ICmpInst &Cmp,
IntrinsicInst *II,
@@ -3663,7 +3766,7 @@ foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ,
(WidestTy->getScalarSizeInBits() - 1) +
(NarrowestTy->getScalarSizeInBits() - 1);
APInt MaximalRepresentableShiftAmount =
- APInt::getAllOnesValue(XShAmt->getType()->getScalarSizeInBits());
+ APInt::getAllOnes(XShAmt->getType()->getScalarSizeInBits());
if (MaximalRepresentableShiftAmount.ult(MaximalPossibleTotalShiftAmount))
return nullptr;
@@ -3746,19 +3849,22 @@ foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ,
/// Fold
/// (-1 u/ x) u< y
-/// ((x * y) u/ x) != y
+/// ((x * y) ?/ x) != y
/// to
-/// @llvm.umul.with.overflow(x, y) plus extraction of overflow bit
+/// @llvm.?mul.with.overflow(x, y) plus extraction of overflow bit
/// Note that the comparison is commutative, while inverted (u>=, ==) predicate
/// will mean that we are looking for the opposite answer.
-Value *InstCombinerImpl::foldUnsignedMultiplicationOverflowCheck(ICmpInst &I) {
+Value *InstCombinerImpl::foldMultiplicationOverflowCheck(ICmpInst &I) {
ICmpInst::Predicate Pred;
Value *X, *Y;
Instruction *Mul;
+ Instruction *Div;
bool NeedNegation;
// Look for: (-1 u/ x) u</u>= y
if (!I.isEquality() &&
- match(&I, m_c_ICmp(Pred, m_OneUse(m_UDiv(m_AllOnes(), m_Value(X))),
+ match(&I, m_c_ICmp(Pred,
+ m_CombineAnd(m_OneUse(m_UDiv(m_AllOnes(), m_Value(X))),
+ m_Instruction(Div)),
m_Value(Y)))) {
Mul = nullptr;
@@ -3773,13 +3879,16 @@ Value *InstCombinerImpl::foldUnsignedMultiplicationOverflowCheck(ICmpInst &I) {
default:
return nullptr; // Wrong predicate.
}
- } else // Look for: ((x * y) u/ x) !=/== y
+ } else // Look for: ((x * y) / x) !=/== y
if (I.isEquality() &&
- match(&I, m_c_ICmp(Pred, m_Value(Y),
- m_OneUse(m_UDiv(m_CombineAnd(m_c_Mul(m_Deferred(Y),
+ match(&I,
+ m_c_ICmp(Pred, m_Value(Y),
+ m_CombineAnd(
+ m_OneUse(m_IDiv(m_CombineAnd(m_c_Mul(m_Deferred(Y),
m_Value(X)),
m_Instruction(Mul)),
- m_Deferred(X)))))) {
+ m_Deferred(X))),
+ m_Instruction(Div))))) {
NeedNegation = Pred == ICmpInst::Predicate::ICMP_EQ;
} else
return nullptr;
@@ -3791,19 +3900,22 @@ Value *InstCombinerImpl::foldUnsignedMultiplicationOverflowCheck(ICmpInst &I) {
if (MulHadOtherUses)
Builder.SetInsertPoint(Mul);
- Function *F = Intrinsic::getDeclaration(
- I.getModule(), Intrinsic::umul_with_overflow, X->getType());
- CallInst *Call = Builder.CreateCall(F, {X, Y}, "umul");
+ Function *F = Intrinsic::getDeclaration(I.getModule(),
+ Div->getOpcode() == Instruction::UDiv
+ ? Intrinsic::umul_with_overflow
+ : Intrinsic::smul_with_overflow,
+ X->getType());
+ CallInst *Call = Builder.CreateCall(F, {X, Y}, "mul");
// If the multiplication was used elsewhere, to ensure that we don't leave
// "duplicate" instructions, replace uses of that original multiplication
// with the multiplication result from the with.overflow intrinsic.
if (MulHadOtherUses)
- replaceInstUsesWith(*Mul, Builder.CreateExtractValue(Call, 0, "umul.val"));
+ replaceInstUsesWith(*Mul, Builder.CreateExtractValue(Call, 0, "mul.val"));
- Value *Res = Builder.CreateExtractValue(Call, 1, "umul.ov");
+ Value *Res = Builder.CreateExtractValue(Call, 1, "mul.ov");
if (NeedNegation) // This technically increases instruction count.
- Res = Builder.CreateNot(Res, "umul.not.ov");
+ Res = Builder.CreateNot(Res, "mul.not.ov");
// If we replaced the mul, erase it. Do this after all uses of Builder,
// as the mul is used as insertion point.
@@ -4079,8 +4191,8 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
if (match(Op0, m_Mul(m_Value(X), m_APInt(C))) && *C != 0 &&
match(Op1, m_Mul(m_Value(Y), m_SpecificInt(*C))) && I.isEquality())
if (!C->countTrailingZeros() ||
- (BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap()) ||
- (BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap()))
+ (BO0 && BO1 && BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap()) ||
+ (BO0 && BO1 && BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap()))
return new ICmpInst(Pred, X, Y);
}
@@ -4146,8 +4258,8 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
break;
const APInt *C;
- if (match(BO0->getOperand(1), m_APInt(C)) && !C->isNullValue() &&
- !C->isOneValue()) {
+ if (match(BO0->getOperand(1), m_APInt(C)) && !C->isZero() &&
+ !C->isOne()) {
// icmp eq/ne (X * C), (Y * C) --> icmp (X & Mask), (Y & Mask)
// Mask = -1 >> count-trailing-zeros(C).
if (unsigned TZs = C->countTrailingZeros()) {
@@ -4200,7 +4312,7 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
}
}
- if (Value *V = foldUnsignedMultiplicationOverflowCheck(I))
+ if (Value *V = foldMultiplicationOverflowCheck(I))
return replaceInstUsesWith(I, V);
if (Value *V = foldICmpWithLowBitMaskedVal(I, Builder))
@@ -4373,6 +4485,19 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) {
}
}
+ {
+ // Similar to above, but specialized for constant because invert is needed:
+ // (X | C) == (Y | C) --> (X ^ Y) & ~C == 0
+ Value *X, *Y;
+ Constant *C;
+ if (match(Op0, m_OneUse(m_Or(m_Value(X), m_Constant(C)))) &&
+ match(Op1, m_OneUse(m_Or(m_Value(Y), m_Specific(C))))) {
+ Value *Xor = Builder.CreateXor(X, Y);
+ Value *And = Builder.CreateAnd(Xor, ConstantExpr::getNot(C));
+ return new ICmpInst(Pred, And, Constant::getNullValue(And->getType()));
+ }
+ }
+
// Transform (zext A) == (B & (1<<X)-1) --> A == (trunc B)
// and (B & (1<<X)-1) == (zext A) --> A == (trunc B)
ConstantInt *Cst1;
@@ -4441,14 +4566,8 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) {
}
}
- // If both operands are byte-swapped or bit-reversed, just compare the
- // original values.
- // TODO: Move this to a function similar to foldICmpIntrinsicWithConstant()
- // and handle more intrinsics.
- if ((match(Op0, m_BSwap(m_Value(A))) && match(Op1, m_BSwap(m_Value(B)))) ||
- (match(Op0, m_BitReverse(m_Value(A))) &&
- match(Op1, m_BitReverse(m_Value(B)))))
- return new ICmpInst(Pred, A, B);
+ if (Instruction *ICmp = foldICmpIntrinsicWithIntrinsic(I))
+ return ICmp;
// Canonicalize checking for a power-of-2-or-zero value:
// (A & (A-1)) == 0 --> ctpop(A) < 2 (two commuted variants)
@@ -4474,6 +4593,74 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) {
: new ICmpInst(ICmpInst::ICMP_UGT, CtPop, ConstantInt::get(Ty, 1));
}
+ // Match icmp eq (trunc (lshr A, BW), (ashr (trunc A), BW-1)), which checks the
+ // top BW/2 + 1 bits are all the same. Create "A >=s INT_MIN && A <=s INT_MAX",
+ // which we generate as "icmp ult (add A, 2^(BW-1)), 2^BW" to skip a few steps
+ // of instcombine.
+ unsigned BitWidth = Op0->getType()->getScalarSizeInBits();
+ if (match(Op0, m_AShr(m_Trunc(m_Value(A)), m_SpecificInt(BitWidth - 1))) &&
+ match(Op1, m_Trunc(m_LShr(m_Specific(A), m_SpecificInt(BitWidth)))) &&
+ A->getType()->getScalarSizeInBits() == BitWidth * 2 &&
+ (I.getOperand(0)->hasOneUse() || I.getOperand(1)->hasOneUse())) {
+ APInt C = APInt::getOneBitSet(BitWidth * 2, BitWidth - 1);
+ Value *Add = Builder.CreateAdd(A, ConstantInt::get(A->getType(), C));
+ return new ICmpInst(Pred == ICmpInst::ICMP_EQ ? ICmpInst::ICMP_ULT
+ : ICmpInst::ICMP_UGE,
+ Add, ConstantInt::get(A->getType(), C.shl(1)));
+ }
+
+ return nullptr;
+}
+
+static Instruction *foldICmpWithTrunc(ICmpInst &ICmp,
+ InstCombiner::BuilderTy &Builder) {
+ const ICmpInst::Predicate Pred = ICmp.getPredicate();
+ Value *Op0 = ICmp.getOperand(0), *Op1 = ICmp.getOperand(1);
+
+ // Try to canonicalize trunc + compare-to-constant into a mask + cmp.
+ // The trunc masks high bits while the compare may effectively mask low bits.
+ Value *X;
+ const APInt *C;
+ if (!match(Op0, m_OneUse(m_Trunc(m_Value(X)))) || !match(Op1, m_APInt(C)))
+ return nullptr;
+
+ unsigned SrcBits = X->getType()->getScalarSizeInBits();
+ if (Pred == ICmpInst::ICMP_ULT) {
+ if (C->isPowerOf2()) {
+ // If C is a power-of-2 (one set bit):
+ // (trunc X) u< C --> (X & -C) == 0 (are all masked-high-bits clear?)
+ Constant *MaskC = ConstantInt::get(X->getType(), (-*C).zext(SrcBits));
+ Value *And = Builder.CreateAnd(X, MaskC);
+ Constant *Zero = ConstantInt::getNullValue(X->getType());
+ return new ICmpInst(ICmpInst::ICMP_EQ, And, Zero);
+ }
+ // If C is a negative power-of-2 (high-bit mask):
+ // (trunc X) u< C --> (X & C) != C (are any masked-high-bits clear?)
+ if (C->isNegatedPowerOf2()) {
+ Constant *MaskC = ConstantInt::get(X->getType(), C->zext(SrcBits));
+ Value *And = Builder.CreateAnd(X, MaskC);
+ return new ICmpInst(ICmpInst::ICMP_NE, And, MaskC);
+ }
+ }
+
+ if (Pred == ICmpInst::ICMP_UGT) {
+ // If C is a low-bit-mask (C+1 is a power-of-2):
+ // (trunc X) u> C --> (X & ~C) != 0 (are any masked-high-bits set?)
+ if (C->isMask()) {
+ Constant *MaskC = ConstantInt::get(X->getType(), (~*C).zext(SrcBits));
+ Value *And = Builder.CreateAnd(X, MaskC);
+ Constant *Zero = ConstantInt::getNullValue(X->getType());
+ return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+ }
+ // If C is not-of-power-of-2 (one clear bit):
+ // (trunc X) u> C --> (X & (C+1)) == C+1 (are all masked-high-bits set?)
+ if ((~*C).isPowerOf2()) {
+ Constant *MaskC = ConstantInt::get(X->getType(), (*C + 1).zext(SrcBits));
+ Value *And = Builder.CreateAnd(X, MaskC);
+ return new ICmpInst(ICmpInst::ICMP_EQ, And, MaskC);
+ }
+ }
+
return nullptr;
}
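Aside (not part of the patch): two of the foldICmpWithTrunc rewrites introduced above, checked exhaustively for an i16 -> i8 truncation. C = 16 is a power of 2 for the u< case and C = 15 is a low-bit mask for the u> case; the mask is -C / ~C zero-extended back to the source type, as in the code, which gives 0x00F0 for both:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned V = 0; V <= 0xFFFF; ++V) {
    uint16_t X = static_cast<uint16_t>(V);
    uint8_t T = static_cast<uint8_t>(X);

    // (trunc X) u< 16  -->  (X & 0x00F0) == 0
    assert((T < 16) == ((X & 0x00F0) == 0));

    // (trunc X) u> 15  -->  (X & 0x00F0) != 0
    assert((T > 15) == ((X & 0x00F0) != 0));
  }
  return 0;
}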
@@ -4620,6 +4807,9 @@ Instruction *InstCombinerImpl::foldICmpWithCastOp(ICmpInst &ICmp) {
return new ICmpInst(ICmp.getPredicate(), Op0Src, NewOp1);
}
+ if (Instruction *R = foldICmpWithTrunc(ICmp, Builder))
+ return R;
+
return foldICmpWithZextOrSext(ICmp, Builder);
}
@@ -4943,7 +5133,7 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
static APInt getDemandedBitsLHSMask(ICmpInst &I, unsigned BitWidth) {
const APInt *RHS;
if (!match(I.getOperand(1), m_APInt(RHS)))
- return APInt::getAllOnesValue(BitWidth);
+ return APInt::getAllOnes(BitWidth);
// If this is a normal comparison, it demands all bits. If it is a sign bit
// comparison, it only demands the sign bit.
@@ -4965,7 +5155,7 @@ static APInt getDemandedBitsLHSMask(ICmpInst &I, unsigned BitWidth) {
return APInt::getBitsSetFrom(BitWidth, RHS->countTrailingZeros());
default:
- return APInt::getAllOnesValue(BitWidth);
+ return APInt::getAllOnes(BitWidth);
}
}
@@ -5129,8 +5319,7 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
Op0Known, 0))
return &I;
- if (SimplifyDemandedBits(&I, 1, APInt::getAllOnesValue(BitWidth),
- Op1Known, 0))
+ if (SimplifyDemandedBits(&I, 1, APInt::getAllOnes(BitWidth), Op1Known, 0))
return &I;
// Given the known and unknown bits, compute a range that the LHS could be
@@ -5158,6 +5347,83 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
if (!isa<Constant>(Op1) && Op1Min == Op1Max)
return new ICmpInst(Pred, Op0, ConstantExpr::getIntegerValue(Ty, Op1Min));
+ // Don't break up a clamp pattern -- (min(max X, Y), Z) -- by replacing a
+ // min/max canonical compare with some other compare. That could lead to
+ // conflict with select canonicalization and infinite looping.
+ // FIXME: This constraint may go away if min/max intrinsics are canonical.
+ auto isMinMaxCmp = [&](Instruction &Cmp) {
+ if (!Cmp.hasOneUse())
+ return false;
+ Value *A, *B;
+ SelectPatternFlavor SPF = matchSelectPattern(Cmp.user_back(), A, B).Flavor;
+ if (!SelectPatternResult::isMinOrMax(SPF))
+ return false;
+ return match(Op0, m_MaxOrMin(m_Value(), m_Value())) ||
+ match(Op1, m_MaxOrMin(m_Value(), m_Value()));
+ };
+ if (!isMinMaxCmp(I)) {
+ switch (Pred) {
+ default:
+ break;
+ case ICmpInst::ICMP_ULT: {
+ if (Op1Min == Op0Max) // A <u B -> A != B if max(A) == min(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+ const APInt *CmpC;
+ if (match(Op1, m_APInt(CmpC))) {
+ // A <u C -> A == C-1 if min(A)+1 == C
+ if (*CmpC == Op0Min + 1)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ ConstantInt::get(Op1->getType(), *CmpC - 1));
+ // X <u C --> X == 0, if the number of zero bits in the bottom of X
+ // exceeds the log2 of C.
+ if (Op0Known.countMinTrailingZeros() >= CmpC->ceilLogBase2())
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ Constant::getNullValue(Op1->getType()));
+ }
+ break;
+ }
+ case ICmpInst::ICMP_UGT: {
+ if (Op1Max == Op0Min) // A >u B -> A != B if min(A) == max(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+ const APInt *CmpC;
+ if (match(Op1, m_APInt(CmpC))) {
+ // A >u C -> A == C+1 if max(a)-1 == C
+ if (*CmpC == Op0Max - 1)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ ConstantInt::get(Op1->getType(), *CmpC + 1));
+ // X >u C --> X != 0, if the number of zero bits in the bottom of X
+ // exceeds the log2 of C.
+ if (Op0Known.countMinTrailingZeros() >= CmpC->getActiveBits())
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0,
+ Constant::getNullValue(Op1->getType()));
+ }
+ break;
+ }
+ case ICmpInst::ICMP_SLT: {
+ if (Op1Min == Op0Max) // A <s B -> A != B if max(A) == min(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+ const APInt *CmpC;
+ if (match(Op1, m_APInt(CmpC))) {
+ if (*CmpC == Op0Min + 1) // A <s C -> A == C-1 if min(A)+1 == C
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ ConstantInt::get(Op1->getType(), *CmpC - 1));
+ }
+ break;
+ }
+ case ICmpInst::ICMP_SGT: {
+ if (Op1Max == Op0Min) // A >s B -> A != B if min(A) == max(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+ const APInt *CmpC;
+ if (match(Op1, m_APInt(CmpC))) {
+ if (*CmpC == Op0Max - 1) // A >s C -> A == C+1 if max(A)-1 == C
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ ConstantInt::get(Op1->getType(), *CmpC + 1));
+ }
+ break;
+ }
+ }
+ }
+
// Based on the range information we know about the LHS, see if we can
// simplify this comparison. For example, (x&4) < 8 is always true.
switch (Pred) {
@@ -5203,7 +5469,7 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
// Check if the LHS is 8 >>u x and the result is a power of 2 like 1.
const APInt *CI;
- if (Op0KnownZeroInverted.isOneValue() &&
+ if (Op0KnownZeroInverted.isOne() &&
match(LHS, m_LShr(m_Power2(CI), m_Value(X)))) {
// ((8 >>u X) & 1) == 0 -> X != 3
// ((8 >>u X) & 1) != 0 -> X == 3
@@ -5219,21 +5485,6 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Min.uge(Op1Max)) // A <u B -> false if min(A) >= max(B)
return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Min == Op0Max) // A <u B -> A != B if max(A) == min(B)
- return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
-
- const APInt *CmpC;
- if (match(Op1, m_APInt(CmpC))) {
- // A <u C -> A == C-1 if min(A)+1 == C
- if (*CmpC == Op0Min + 1)
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- ConstantInt::get(Op1->getType(), *CmpC - 1));
- // X <u C --> X == 0, if the number of zero bits in the bottom of X
- // exceeds the log2 of C.
- if (Op0Known.countMinTrailingZeros() >= CmpC->ceilLogBase2())
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- Constant::getNullValue(Op1->getType()));
- }
break;
}
case ICmpInst::ICMP_UGT: {
@@ -5241,21 +5492,6 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Max.ule(Op1Min)) // A >u B -> false if max(A) <= min(B)
return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Max == Op0Min) // A >u B -> A != B if min(A) == max(B)
- return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
-
- const APInt *CmpC;
- if (match(Op1, m_APInt(CmpC))) {
- // A >u C -> A == C+1 if max(a)-1 == C
- if (*CmpC == Op0Max - 1)
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- ConstantInt::get(Op1->getType(), *CmpC + 1));
- // X >u C --> X != 0, if the number of zero bits in the bottom of X
- // exceeds the log2 of C.
- if (Op0Known.countMinTrailingZeros() >= CmpC->getActiveBits())
- return new ICmpInst(ICmpInst::ICMP_NE, Op0,
- Constant::getNullValue(Op1->getType()));
- }
break;
}
case ICmpInst::ICMP_SLT: {
@@ -5263,14 +5499,6 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Min.sge(Op1Max)) // A <s B -> false if min(A) >= max(B)
return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Min == Op0Max) // A <s B -> A != B if max(A) == min(B)
- return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
- const APInt *CmpC;
- if (match(Op1, m_APInt(CmpC))) {
- if (*CmpC == Op0Min + 1) // A <s C -> A == C-1 if min(A)+1 == C
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- ConstantInt::get(Op1->getType(), *CmpC - 1));
- }
break;
}
case ICmpInst::ICMP_SGT: {
@@ -5278,14 +5506,6 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Max.sle(Op1Min)) // A >s B -> false if max(A) <= min(B)
return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Max == Op0Min) // A >s B -> A != B if min(A) == max(B)
- return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
- const APInt *CmpC;
- if (match(Op1, m_APInt(CmpC))) {
- if (*CmpC == Op0Max - 1) // A >s C -> A == C+1 if max(A)-1 == C
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- ConstantInt::get(Op1->getType(), *CmpC + 1));
- }
break;
}
case ICmpInst::ICMP_SGE:
@@ -5587,7 +5807,7 @@ static Instruction *foldVectorCmp(CmpInst &Cmp,
if (match(RHS, m_Shuffle(m_Value(V2), m_Undef(), m_SpecificMask(M))) &&
V1Ty == V2->getType() && (LHS->hasOneUse() || RHS->hasOneUse())) {
Value *NewCmp = Builder.CreateCmp(Pred, V1, V2);
- return new ShuffleVectorInst(NewCmp, UndefValue::get(NewCmp->getType()), M);
+ return new ShuffleVectorInst(NewCmp, M);
}
// Try to canonicalize compare with splatted operand and splat constant.
@@ -5608,8 +5828,7 @@ static Instruction *foldVectorCmp(CmpInst &Cmp,
ScalarC);
SmallVector<int, 8> NewM(M.size(), MaskSplatIndex);
Value *NewCmp = Builder.CreateCmp(Pred, V1, C);
- return new ShuffleVectorInst(NewCmp, UndefValue::get(NewCmp->getType()),
- NewM);
+ return new ShuffleVectorInst(NewCmp, NewM);
}
return nullptr;
@@ -5645,6 +5864,23 @@ static Instruction *foldICmpOfUAddOv(ICmpInst &I) {
return ExtractValueInst::Create(UAddOv, 1);
}
+static Instruction *foldICmpInvariantGroup(ICmpInst &I) {
+ if (!I.getOperand(0)->getType()->isPointerTy() ||
+ NullPointerIsDefined(
+ I.getParent()->getParent(),
+ I.getOperand(0)->getType()->getPointerAddressSpace())) {
+ return nullptr;
+ }
+ Instruction *Op;
+ if (match(I.getOperand(0), m_Instruction(Op)) &&
+ match(I.getOperand(1), m_Zero()) &&
+ Op->isLaunderOrStripInvariantGroup()) {
+ return ICmpInst::Create(Instruction::ICmp, I.getPredicate(),
+ Op->getOperand(0), I.getOperand(1));
+ }
+ return nullptr;
+}
+
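A hedged sketch of what the new foldICmpInvariantGroup helper does (illustrative only; 'barrier' is an invented stand-in for the llvm.launder/strip.invariant.group intrinsics, which behave as identity functions at run time): a null check may look through the barrier, provided null is not a defined address in the pointer's address space.

    // Run-time model of the invariant.group barrier; the real intrinsic also
    // carries metadata semantics that this sketch ignores.
    static void *barrier(void *p) { return p; }
    bool null_check_before(void *p) { return barrier(p) == nullptr; }
    bool null_check_after(void *p) { return p == nullptr; }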
Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
bool Changed = false;
const SimplifyQuery Q = SQ.getWithInstruction(&I);
@@ -5698,9 +5934,6 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
if (Instruction *Res = foldICmpWithDominatingICmp(I))
return Res;
- if (Instruction *Res = foldICmpBinOp(I, Q))
- return Res;
-
if (Instruction *Res = foldICmpUsingKnownBits(I))
return Res;
@@ -5746,6 +5979,15 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
}
}
+ // The folds in here may rely on wrapping flags and special constants, so
+ // they can break up min/max idioms in some cases but not seemingly similar
+ // patterns.
+ // FIXME: It may be possible to enhance select folding to make this
+ // unnecessary. It may also be moot if we canonicalize to min/max
+ // intrinsics.
+ if (Instruction *Res = foldICmpBinOp(I, Q))
+ return Res;
+
if (Instruction *Res = foldICmpInstWithConstant(I))
return Res;
@@ -5757,13 +5999,12 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
if (Instruction *Res = foldICmpInstWithConstantNotInt(I))
return Res;
- // If we can optimize a 'icmp GEP, P' or 'icmp P, GEP', do so now.
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op0))
+ // Try to optimize 'icmp GEP, P' or 'icmp P, GEP'.
+ if (auto *GEP = dyn_cast<GEPOperator>(Op0))
if (Instruction *NI = foldGEPICmp(GEP, Op1, I.getPredicate(), I))
return NI;
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op1))
- if (Instruction *NI = foldGEPICmp(GEP, Op0,
- ICmpInst::getSwappedPredicate(I.getPredicate()), I))
+ if (auto *GEP = dyn_cast<GEPOperator>(Op1))
+ if (Instruction *NI = foldGEPICmp(GEP, Op0, I.getSwappedPredicate(), I))
return NI;
// Try to optimize equality comparisons against alloca-based pointers.
@@ -5777,7 +6018,7 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
return New;
}
- if (Instruction *Res = foldICmpBitCast(I, Builder))
+ if (Instruction *Res = foldICmpBitCast(I))
return Res;
// TODO: Hoist this above the min/max bailout.
@@ -5879,6 +6120,9 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
if (Instruction *Res = foldVectorCmp(I, Builder))
return Res;
+ if (Instruction *Res = foldICmpInvariantGroup(I))
+ return Res;
+
return Changed ? &I : nullptr;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index eaa53348028d..72e1b21e8d49 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -22,14 +22,15 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Value.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#define DEBUG_TYPE "instcombine"
+#include "llvm/Transforms/Utils/InstructionWorklist.h"
using namespace llvm::PatternMatch;
@@ -61,7 +62,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
: public InstCombiner,
public InstVisitor<InstCombinerImpl, Instruction *> {
public:
- InstCombinerImpl(InstCombineWorklist &Worklist, BuilderTy &Builder,
+ InstCombinerImpl(InstructionWorklist &Worklist, BuilderTy &Builder,
bool MinimizeSize, AAResults *AA, AssumptionCache &AC,
TargetLibraryInfo &TLI, TargetTransformInfo &TTI,
DominatorTree &DT, OptimizationRemarkEmitter &ORE,
@@ -190,6 +191,7 @@ public:
private:
void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI);
+ bool isDesirableIntType(unsigned BitWidth) const;
bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
bool shouldChangeType(Type *From, Type *To) const;
Value *dyn_castNegVal(Value *V) const;
@@ -240,15 +242,11 @@ private:
///
/// \param ICI The icmp of the (zext icmp) pair we are interested in.
/// \param CI The zext of the (zext icmp) pair we are interested in.
- /// \param DoTransform Pass false to just test whether the given (zext icmp)
- /// would be transformed. Pass true to actually perform the transformation.
///
/// \return null if the transformation cannot be performed. If the
/// transformation can be performed the new instruction that replaces the
- /// (zext icmp) pair will be returned (if \p DoTransform is false the
- /// unmodified \p ICI will be returned in this case).
- Instruction *transformZExtICmp(ICmpInst *ICI, ZExtInst &CI,
- bool DoTransform = true);
+ /// (zext icmp) pair will be returned.
+ Instruction *transformZExtICmp(ICmpInst *ICI, ZExtInst &CI);
Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
@@ -319,13 +317,15 @@ private:
Value *EmitGEPOffset(User *GEP);
Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
+ Instruction *foldBitcastExtElt(ExtractElementInst &ExtElt);
Instruction *foldCastedBitwiseLogic(BinaryOperator &I);
Instruction *narrowBinOp(TruncInst &Trunc);
Instruction *narrowMaskedBinOp(BinaryOperator &And);
Instruction *narrowMathIfNoOverflow(BinaryOperator &I);
Instruction *narrowFunnelShift(TruncInst &Trunc);
Instruction *optimizeBitCastFromPhi(CastInst &CI, PHINode *PN);
- Instruction *matchSAddSubSat(SelectInst &MinMax1);
+ Instruction *matchSAddSubSat(Instruction &MinMax1);
+ Instruction *foldNot(BinaryOperator &I);
void freelyInvertAllUsersOf(Value *V);
@@ -347,6 +347,8 @@ private:
Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &Or);
Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &Xor);
+ Value *foldEqOfParts(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd);
+
/// Optimize (fcmp)&(fcmp) or (fcmp)|(fcmp).
/// NOTE: Unlike most of instcombine, this returns a Value which should
/// already be inserted into the function.
@@ -623,6 +625,7 @@ public:
Instruction *foldPHIArgGEPIntoPHI(PHINode &PN);
Instruction *foldPHIArgLoadIntoPHI(PHINode &PN);
Instruction *foldPHIArgZextsIntoPHI(PHINode &PN);
+ Instruction *foldPHIArgIntToPtrToPHI(PHINode &PN);
/// If an integer typed PHI has only one use which is an IntToPtr operation,
/// replace the PHI with an existing pointer typed PHI if it exists. Otherwise
@@ -657,7 +660,7 @@ public:
Instruction *foldSignBitTest(ICmpInst &I);
Instruction *foldICmpWithZero(ICmpInst &Cmp);
- Value *foldUnsignedMultiplicationOverflowCheck(ICmpInst &Cmp);
+ Value *foldMultiplicationOverflowCheck(ICmpInst &Cmp);
Instruction *foldICmpSelectConstant(ICmpInst &Cmp, SelectInst *Select,
ConstantInt *C);
@@ -701,6 +704,7 @@ public:
const APInt &C);
Instruction *foldICmpEqIntrinsicWithConstant(ICmpInst &ICI, IntrinsicInst *II,
const APInt &C);
+ Instruction *foldICmpBitCast(ICmpInst &Cmp);
// Helpers of visitSelectInst().
Instruction *foldSelectExtConst(SelectInst &Sel);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index a8474e27383d..79a8a065d02a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -261,8 +261,8 @@ private:
bool PointerReplacer::collectUsers(Instruction &I) {
for (auto U : I.users()) {
- Instruction *Inst = cast<Instruction>(&*U);
- if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
+ auto *Inst = cast<Instruction>(&*U);
+ if (auto *Load = dyn_cast<LoadInst>(Inst)) {
if (Load->isVolatile())
return false;
Worklist.insert(Load);
@@ -270,7 +270,9 @@ bool PointerReplacer::collectUsers(Instruction &I) {
Worklist.insert(Inst);
if (!collectUsers(*Inst))
return false;
- } else if (isa<MemTransferInst>(Inst)) {
+ } else if (auto *MI = dyn_cast<MemTransferInst>(Inst)) {
+ if (MI->isVolatile())
+ return false;
Worklist.insert(Inst);
} else if (Inst->isLifetimeStartOrEnd()) {
continue;
@@ -335,8 +337,7 @@ void PointerReplacer::replace(Instruction *I) {
MemCpy->getIntrinsicID(), MemCpy->getRawDest(), MemCpy->getDestAlign(),
SrcV, MemCpy->getSourceAlign(), MemCpy->getLength(),
MemCpy->isVolatile());
- AAMDNodes AAMD;
- MemCpy->getAAMetadata(AAMD);
+ AAMDNodes AAMD = MemCpy->getAAMetadata();
if (AAMD)
NewI->setAAMetadata(AAMD);
@@ -647,9 +648,7 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
if (NumElements == 1) {
LoadInst *NewLoad = IC.combineLoadToNewType(LI, ST->getTypeAtIndex(0U),
".unpack");
- AAMDNodes AAMD;
- LI.getAAMetadata(AAMD);
- NewLoad->setAAMetadata(AAMD);
+ NewLoad->setAAMetadata(LI.getAAMetadata());
return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
UndefValue::get(T), NewLoad, 0, Name));
}
@@ -678,9 +677,7 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
ST->getElementType(i), Ptr,
commonAlignment(Align, SL->getElementOffset(i)), Name + ".unpack");
// Propagate AA metadata. It'll still be valid on the narrowed load.
- AAMDNodes AAMD;
- LI.getAAMetadata(AAMD);
- L->setAAMetadata(AAMD);
+ L->setAAMetadata(LI.getAAMetadata());
V = IC.Builder.CreateInsertValue(V, L, i);
}
@@ -693,9 +690,7 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
auto NumElements = AT->getNumElements();
if (NumElements == 1) {
LoadInst *NewLoad = IC.combineLoadToNewType(LI, ET, ".unpack");
- AAMDNodes AAMD;
- LI.getAAMetadata(AAMD);
- NewLoad->setAAMetadata(AAMD);
+ NewLoad->setAAMetadata(LI.getAAMetadata());
return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
UndefValue::get(T), NewLoad, 0, Name));
}
@@ -727,9 +722,7 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
auto *L = IC.Builder.CreateAlignedLoad(AT->getElementType(), Ptr,
commonAlignment(Align, Offset),
Name + ".unpack");
- AAMDNodes AAMD;
- LI.getAAMetadata(AAMD);
- L->setAAMetadata(AAMD);
+ L->setAAMetadata(LI.getAAMetadata());
V = IC.Builder.CreateInsertValue(V, L, i);
Offset += EltSize;
}
@@ -1206,9 +1199,7 @@ static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
auto EltAlign = commonAlignment(Align, SL->getElementOffset(i));
llvm::Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
- AAMDNodes AAMD;
- SI.getAAMetadata(AAMD);
- NS->setAAMetadata(AAMD);
+ NS->setAAMetadata(SI.getAAMetadata());
}
return true;
@@ -1254,9 +1245,7 @@ static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
auto EltAlign = commonAlignment(Align, Offset);
Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
- AAMDNodes AAMD;
- SI.getAAMetadata(AAMD);
- NS->setAAMetadata(AAMD);
+ NS->setAAMetadata(SI.getAAMetadata());
Offset += EltSize;
}
@@ -1498,8 +1487,8 @@ bool InstCombinerImpl::mergeStoreIntoSuccessor(StoreInst &SI) {
StoreInst *OtherStore = nullptr;
if (OtherBr->isUnconditional()) {
--BBI;
- // Skip over debugging info.
- while (isa<DbgInfoIntrinsic>(BBI) ||
+ // Skip over debugging info and pseudo probes.
+ while (BBI->isDebugOrPseudoInst() ||
(isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
if (BBI==OtherBB->begin())
return false;
@@ -1567,12 +1556,9 @@ bool InstCombinerImpl::mergeStoreIntoSuccessor(StoreInst &SI) {
NewSI->setDebugLoc(MergedLoc);
// If the two stores had AA tags, merge them.
- AAMDNodes AATags;
- SI.getAAMetadata(AATags);
- if (AATags) {
- OtherStore->getAAMetadata(AATags, /* Merge = */ true);
- NewSI->setAAMetadata(AATags);
- }
+ AAMDNodes AATags = SI.getAAMetadata();
+ if (AATags)
+ NewSI->setAAMetadata(AATags.merge(OtherStore->getAAMetadata()));
// Nuke the old stores.
eraseInstFromFunction(SI);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 6f2a8ebf839a..779d298da7a4 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -31,7 +31,6 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include <cassert>
@@ -39,11 +38,12 @@
#include <cstdint>
#include <utility>
+#define DEBUG_TYPE "instcombine"
+#include "llvm/Transforms/Utils/InstructionWorklist.h"
+
using namespace llvm;
using namespace PatternMatch;
-#define DEBUG_TYPE "instcombine"
-
/// The specific integer value is used in a context where it is known to be
/// non-zero. If this allows us to simplify the computation, do so and return
/// the new operand, otherwise return null.
@@ -107,14 +107,19 @@ static Value *foldMulSelectToNegate(BinaryOperator &I,
// mul (select Cond, 1, -1), OtherOp --> select Cond, OtherOp, -OtherOp
// mul OtherOp, (select Cond, 1, -1) --> select Cond, OtherOp, -OtherOp
if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_One(), m_AllOnes())),
- m_Value(OtherOp))))
- return Builder.CreateSelect(Cond, OtherOp, Builder.CreateNeg(OtherOp));
-
+ m_Value(OtherOp)))) {
+ bool HasAnyNoWrap = I.hasNoSignedWrap() || I.hasNoUnsignedWrap();
+ Value *Neg = Builder.CreateNeg(OtherOp, "", false, HasAnyNoWrap);
+ return Builder.CreateSelect(Cond, OtherOp, Neg);
+ }
// mul (select Cond, -1, 1), OtherOp --> select Cond, -OtherOp, OtherOp
// mul OtherOp, (select Cond, -1, 1) --> select Cond, -OtherOp, OtherOp
if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_AllOnes(), m_One())),
- m_Value(OtherOp))))
- return Builder.CreateSelect(Cond, Builder.CreateNeg(OtherOp), OtherOp);
+ m_Value(OtherOp)))) {
+ bool HasAnyNoWrap = I.hasNoSignedWrap() || I.hasNoUnsignedWrap();
+ Value *Neg = Builder.CreateNeg(OtherOp, "", false, HasAnyNoWrap);
+ return Builder.CreateSelect(Cond, Neg, OtherOp);
+ }
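A C-level sketch of the select-to-negate fold above (illustrative only; function names invented): multiplying by a select of 1 and -1 is a conditional negation, and the added lines also carry the mul's no-wrap information over to the negation they create.

    int mul_before(bool c, int x) { return (c ? 1 : -1) * x; }
    int mul_after(bool c, int x) { return c ? x : -x; }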
// fmul (select Cond, 1.0, -1.0), OtherOp --> select Cond, OtherOp, -OtherOp
// fmul OtherOp, (select Cond, 1.0, -1.0) --> select Cond, OtherOp, -OtherOp
@@ -564,6 +569,16 @@ Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) {
return replaceInstUsesWith(I, NewPow);
}
+ // powi(x, y) * powi(x, z) -> powi(x, y + z)
+ if (match(Op0, m_Intrinsic<Intrinsic::powi>(m_Value(X), m_Value(Y))) &&
+ match(Op1, m_Intrinsic<Intrinsic::powi>(m_Specific(X), m_Value(Z))) &&
+ Y->getType() == Z->getType()) {
+ auto *YZ = Builder.CreateAdd(Y, Z);
+ auto *NewPow = Builder.CreateIntrinsic(
+ Intrinsic::powi, {X->getType(), YZ->getType()}, {X, YZ}, &I);
+ return replaceInstUsesWith(I, NewPow);
+ }
+
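A sketch of the new powi combine (illustrative only, written with std::pow since C++ has no powi; the IR combine sits alongside other fast-math-only folds, and this sketch ignores floating-point rounding differences):

    #include <cmath>
    // Two powers of the same base multiply into one power with the exponents added.
    double powi_before(double x, int y, int z) { return std::pow(x, y) * std::pow(x, z); }
    double powi_after(double x, int y, int z) { return std::pow(x, y + z); }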
// exp(X) * exp(Y) -> exp(X + Y)
if (match(Op0, m_Intrinsic<Intrinsic::exp>(m_Value(X))) &&
match(Op1, m_Intrinsic<Intrinsic::exp>(m_Value(Y)))) {
@@ -706,11 +721,11 @@ static bool isMultiple(const APInt &C1, const APInt &C2, APInt &Quotient,
assert(C1.getBitWidth() == C2.getBitWidth() && "Constant widths not equal");
// Bail if we will divide by zero.
- if (C2.isNullValue())
+ if (C2.isZero())
return false;
// Bail if we would divide INT_MIN by -1.
- if (IsSigned && C1.isMinSignedValue() && C2.isAllOnesValue())
+ if (IsSigned && C1.isMinSignedValue() && C2.isAllOnes())
return false;
APInt Remainder(C1.getBitWidth(), /*val=*/0ULL, IsSigned);
@@ -778,11 +793,12 @@ Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) {
}
if ((IsSigned && match(Op0, m_NSWShl(m_Value(X), m_APInt(C1))) &&
- *C1 != C1->getBitWidth() - 1) ||
- (!IsSigned && match(Op0, m_NUWShl(m_Value(X), m_APInt(C1))))) {
+ C1->ult(C1->getBitWidth() - 1)) ||
+ (!IsSigned && match(Op0, m_NUWShl(m_Value(X), m_APInt(C1))) &&
+ C1->ult(C1->getBitWidth()))) {
APInt Quotient(C1->getBitWidth(), /*val=*/0ULL, IsSigned);
APInt C1Shifted = APInt::getOneBitSet(
- C1->getBitWidth(), static_cast<unsigned>(C1->getLimitedValue()));
+ C1->getBitWidth(), static_cast<unsigned>(C1->getZExtValue()));
// (X << C1) / C2 -> X / (C2 >> C1) if C2 is a multiple of 1 << C1.
if (isMultiple(*C2, C1Shifted, Quotient, IsSigned)) {
@@ -803,7 +819,7 @@ Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) {
}
}
- if (!C2->isNullValue()) // avoid X udiv 0
+ if (!C2->isZero()) // avoid X udiv 0
if (Instruction *FoldedDiv = foldBinOpIntoSelectOrPhi(I))
return FoldedDiv;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp b/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp
index 37c7e6135501..7dc516c6fdc3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp
@@ -215,6 +215,20 @@ LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) {
: Builder.CreateSExt(I->getOperand(0), I->getType(),
I->getName() + ".neg");
break;
+ case Instruction::Select: {
+ // If both arms of the select are constants, we don't need to recurse.
+ // Therefore, this transform is not limited by uses.
+ auto *Sel = cast<SelectInst>(I);
+ Constant *TrueC, *FalseC;
+ if (match(Sel->getTrueValue(), m_ImmConstant(TrueC)) &&
+ match(Sel->getFalseValue(), m_ImmConstant(FalseC))) {
+ Constant *NegTrueC = ConstantExpr::getNeg(TrueC);
+ Constant *NegFalseC = ConstantExpr::getNeg(FalseC);
+ return Builder.CreateSelect(Sel->getCondition(), NegTrueC, NegFalseC,
+ I->getName() + ".neg", /*MDFrom=*/I);
+ }
+ break;
+ }
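A sketch of the new Negator case above (illustrative only): when both arms of a select are immediate constants, the negation folds directly into the constants, so no recursion is needed.

    int neg_before(bool c) { return -(c ? 5 : -3); }
    int neg_after(bool c) { return c ? -5 : 3; }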
default:
break; // Other instructions require recursive reasoning.
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 6c6351c70e3a..35739c3b9a21 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -299,6 +299,29 @@ Instruction *InstCombinerImpl::foldIntegerTypedPHI(PHINode &PN) {
IntToPtr->getOperand(0)->getType());
}
+// Remove a round-trip IntToPtr/PtrToInt cast on a PHI operand and
+// fold the PHI operand to a bitcast.
+Instruction *InstCombinerImpl::foldPHIArgIntToPtrToPHI(PHINode &PN) {
+ // convert ptr2int ( phi[ int2ptr(ptr2int(x))] ) --> ptr2int ( phi [ x ] )
+ // Make sure all uses of phi are ptr2int.
+ if (!all_of(PN.users(), [](User *U) { return isa<PtrToIntInst>(U); }))
+ return nullptr;
+
+ // Iterate over all operands to check for round-trip casts that can be
+ // simplified.
+ bool OperandWithRoundTripCast = false;
+ for (unsigned OpNum = 0; OpNum != PN.getNumIncomingValues(); ++OpNum) {
+ if (auto *NewOp =
+ simplifyIntToPtrRoundTripCast(PN.getIncomingValue(OpNum))) {
+ PN.setIncomingValue(OpNum, NewOp);
+ OperandWithRoundTripCast = true;
+ }
+ }
+ if (!OperandWithRoundTripCast)
+ return nullptr;
+ return &PN;
+}
+
/// If we have something like phi [insertvalue(a,b,0), insertvalue(c,d,0)],
/// turn this into a phi[a,c] and phi[b,d] and a single insertvalue.
Instruction *
@@ -1306,6 +1329,9 @@ Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) {
if (Instruction *Result = foldPHIArgZextsIntoPHI(PN))
return Result;
+ if (Instruction *Result = foldPHIArgIntToPtrToPHI(PN))
+ return Result;
+
// If all PHI operands are the same operation, pull them through the PHI,
// reducing code size.
if (isa<Instruction>(PN.getIncomingValue(0)) &&
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index ce2b913dba61..4a1e82ae9c1d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -38,15 +38,16 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <cassert>
#include <utility>
+#define DEBUG_TYPE "instcombine"
+#include "llvm/Transforms/Utils/InstructionWorklist.h"
+
using namespace llvm;
using namespace PatternMatch;
-#define DEBUG_TYPE "instcombine"
static Value *createMinMax(InstCombiner::BuilderTy &Builder,
SelectPatternFlavor SPF, Value *A, Value *B) {
@@ -165,7 +166,7 @@ static Value *foldSelectICmpAnd(SelectInst &Sel, ICmpInst *Cmp,
// simplify/reduce the instructions.
APInt TC = *SelTC;
APInt FC = *SelFC;
- if (!TC.isNullValue() && !FC.isNullValue()) {
+ if (!TC.isZero() && !FC.isZero()) {
// If the select constants differ by exactly one bit and that's the same
// bit that is masked and checked by the select condition, the select can
// be replaced by bitwise logic to set/clear one bit of the constant result.
@@ -202,7 +203,7 @@ static Value *foldSelectICmpAnd(SelectInst &Sel, ICmpInst *Cmp,
// Determine which shift is needed to transform result of the 'and' into the
// desired result.
- const APInt &ValC = !TC.isNullValue() ? TC : FC;
+ const APInt &ValC = !TC.isZero() ? TC : FC;
unsigned ValZeros = ValC.logBase2();
unsigned AndZeros = AndMask.logBase2();
@@ -224,7 +225,7 @@ static Value *foldSelectICmpAnd(SelectInst &Sel, ICmpInst *Cmp,
// Okay, now we know that everything is set up, we just don't know whether we
// have a icmp_ne or icmp_eq and whether the true or false val is the zero.
- bool ShouldNotVal = !TC.isNullValue();
+ bool ShouldNotVal = !TC.isZero();
ShouldNotVal ^= Pred == ICmpInst::ICMP_NE;
if (ShouldNotVal)
V = Builder.CreateXor(V, ValC);
@@ -319,8 +320,16 @@ Instruction *InstCombinerImpl::foldSelectOpOp(SelectInst &SI, Instruction *TI,
Value *X, *Y;
if (match(TI, m_FNeg(m_Value(X))) && match(FI, m_FNeg(m_Value(Y))) &&
(TI->hasOneUse() || FI->hasOneUse())) {
+ // Intersect FMF from the fneg instructions and union those with the select.
+ FastMathFlags FMF = TI->getFastMathFlags();
+ FMF &= FI->getFastMathFlags();
+ FMF |= SI.getFastMathFlags();
Value *NewSel = Builder.CreateSelect(Cond, X, Y, SI.getName() + ".v", &SI);
- return UnaryOperator::CreateFNegFMF(NewSel, TI);
+ if (auto *NewSelI = dyn_cast<Instruction>(NewSel))
+ NewSelI->setFastMathFlags(FMF);
+ Instruction *NewFNeg = UnaryOperator::CreateFNeg(NewSel);
+ NewFNeg->setFastMathFlags(FMF);
+ return NewFNeg;
}
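A sketch of the fneg hoisting handled above (illustrative only): a select of two negations becomes one negation of a select, and the changed lines intersect the fast-math flags of the two fneg instructions and union them with the select's flags for the new instructions.

    // IEEE negation only flips the sign bit, so both forms agree bit for bit.
    double fneg_before(bool c, double x, double y) { return c ? -x : -y; }
    double fneg_after(bool c, double x, double y) { return -(c ? x : y); }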
// Min/max intrinsic with a common operand can have the common operand pulled
@@ -420,10 +429,9 @@ Instruction *InstCombinerImpl::foldSelectOpOp(SelectInst &SI, Instruction *TI,
}
static bool isSelect01(const APInt &C1I, const APInt &C2I) {
- if (!C1I.isNullValue() && !C2I.isNullValue()) // One side must be zero.
+ if (!C1I.isZero() && !C2I.isZero()) // One side must be zero.
return false;
- return C1I.isOneValue() || C1I.isAllOnesValue() ||
- C2I.isOneValue() || C2I.isAllOnesValue();
+ return C1I.isOne() || C1I.isAllOnes() || C2I.isOne() || C2I.isAllOnes();
}
/// Try to fold the select into one of the operands to allow further
@@ -715,6 +723,58 @@ static Instruction *foldSetClearBits(SelectInst &Sel,
return nullptr;
}
+// select (x == 0), 0, x * y --> freeze(y) * x
+// select (y == 0), 0, x * y --> freeze(x) * y
+// select (x == 0), undef, x * y --> freeze(y) * x
+// select (x == undef), 0, x * y --> freeze(y) * x
+// Using the mul result instead of 0 makes the result more poisonous,
+// so the operand that was not checked in the condition should be frozen.
+// The latter folding is applied only when the constant compared with x is
+// a vector consisting of 0 and undefs. If the constant compared with x is
+// a scalar undefined value or an undefined vector, the expression should
+// already have been folded into a constant.
+static Instruction *foldSelectZeroOrMul(SelectInst &SI, InstCombinerImpl &IC) {
+ auto *CondVal = SI.getCondition();
+ auto *TrueVal = SI.getTrueValue();
+ auto *FalseVal = SI.getFalseValue();
+ Value *X, *Y;
+ ICmpInst::Predicate Predicate;
+
+ // Assume that the constant compared with x is not undef (but it may be
+ // a vector with some undef elements). Otherwise (when that constant is
+ // undef) the select expression should already have been simplified.
+ if (!match(CondVal, m_ICmp(Predicate, m_Value(X), m_Zero())) ||
+ !ICmpInst::isEquality(Predicate))
+ return nullptr;
+
+ if (Predicate == ICmpInst::ICMP_NE)
+ std::swap(TrueVal, FalseVal);
+
+ // Check that TrueVal is a constant instead of matching it with m_Zero()
+ // to handle the case when it is a scalar undef value or a vector containing
+ // non-zero elements that are masked by undef elements in the compare
+ // constant.
+ auto *TrueValC = dyn_cast<Constant>(TrueVal);
+ if (TrueValC == nullptr ||
+ !match(FalseVal, m_c_Mul(m_Specific(X), m_Value(Y))) ||
+ !isa<Instruction>(FalseVal))
+ return nullptr;
+
+ auto *ZeroC = cast<Constant>(cast<Instruction>(CondVal)->getOperand(1));
+ auto *MergedC = Constant::mergeUndefsWith(TrueValC, ZeroC);
+ // If X is compared with 0 then TrueVal could be either zero or undef.
+ // m_Zero matches vectors containing some undef elements, but for scalars
+ // m_Undef should be used explicitly.
+ if (!match(MergedC, m_Zero()) && !match(MergedC, m_Undef()))
+ return nullptr;
+
+ auto *FalseValI = cast<Instruction>(FalseVal);
+ auto *FrY = IC.InsertNewInstBefore(new FreezeInst(Y, Y->getName() + ".fr"),
+ *FalseValI);
+ IC.replaceOperand(*FalseValI, FalseValI->getOperand(0) == Y ? 0 : 1, FrY);
+ return IC.replaceInstUsesWith(SI, FalseValI);
+}
+
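A C-level sketch of foldSelectZeroOrMul (illustrative only): when x == 0 the multiply already produces 0, so the select is redundant; the freeze of the unchecked operand in the real transform has no C counterpart and is omitted here.

    // For x == 0 both sides yield 0; for any other x both compute x * y.
    long long sel_before(long long x, long long y) { return x == 0 ? 0 : x * y; }
    long long sel_after(long long x, long long y) { return x * y; }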
/// Transform patterns such as (a > b) ? a - b : 0 into usub.sat(a, b).
/// There are 8 commuted/swapped variants of this pattern.
/// TODO: Also support a - UMIN(a,b) patterns.
@@ -1229,8 +1289,8 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
// Iff -C1 s<= C2 s<= C0-C1
// Also ULT predicate can also be UGT iff C0 != -1 (+invert result)
// SLT predicate can also be SGT iff C2 != INT_MAX (+invert res.)
-static Instruction *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
- InstCombiner::BuilderTy &Builder) {
+static Value *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
+ InstCombiner::BuilderTy &Builder) {
Value *X = Sel0.getTrueValue();
Value *Sel1 = Sel0.getFalseValue();
@@ -1238,36 +1298,42 @@ static Instruction *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
// Said condition must be one-use.
if (!Cmp0.hasOneUse())
return nullptr;
+ ICmpInst::Predicate Pred0 = Cmp0.getPredicate();
Value *Cmp00 = Cmp0.getOperand(0);
Constant *C0;
if (!match(Cmp0.getOperand(1),
m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C0))))
return nullptr;
- // Canonicalize Cmp0 into the form we expect.
+
+ if (!isa<SelectInst>(Sel1)) {
+ Pred0 = ICmpInst::getInversePredicate(Pred0);
+ std::swap(X, Sel1);
+ }
+
+ // Canonicalize Cmp0 into ult or uge.
// FIXME: we shouldn't care about lanes that are 'undef' in the end?
- switch (Cmp0.getPredicate()) {
+ switch (Pred0) {
case ICmpInst::Predicate::ICMP_ULT:
+ case ICmpInst::Predicate::ICMP_UGE:
+ // Although icmp ult %x, 0 is an unusual thing to try and should generally
+ // have been simplified, it does not verify with undef inputs so ensure we
+ // are not in a strange state.
+ if (!match(C0, m_SpecificInt_ICMP(
+ ICmpInst::Predicate::ICMP_NE,
+ APInt::getZero(C0->getType()->getScalarSizeInBits()))))
+ return nullptr;
break; // Great!
case ICmpInst::Predicate::ICMP_ULE:
- // We'd have to increment C0 by one, and for that it must not have all-ones
- // element, but then it would have been canonicalized to 'ult' before
- // we get here. So we can't do anything useful with 'ule'.
- return nullptr;
case ICmpInst::Predicate::ICMP_UGT:
- // We want to canonicalize it to 'ult', so we'll need to increment C0,
- // which again means it must not have any all-ones elements.
+ // We want to canonicalize it to 'ult' or 'uge', so we'll need to increment
+ // C0, which again means it must not have any all-ones elements.
if (!match(C0,
- m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE,
- APInt::getAllOnesValue(
- C0->getType()->getScalarSizeInBits()))))
+ m_SpecificInt_ICMP(
+ ICmpInst::Predicate::ICMP_NE,
+ APInt::getAllOnes(C0->getType()->getScalarSizeInBits()))))
return nullptr; // Can't do, have all-ones element[s].
C0 = InstCombiner::AddOne(C0);
- std::swap(X, Sel1);
break;
- case ICmpInst::Predicate::ICMP_UGE:
- // The only way we'd get this predicate if this `icmp` has extra uses,
- // but then we won't be able to do this fold.
- return nullptr;
default:
return nullptr; // Unknown predicate.
}
@@ -1277,11 +1343,16 @@ static Instruction *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
if (!Sel1->hasOneUse())
return nullptr;
+ // If the types do not match, look through any truncs to the underlying
+ // instruction.
+ if (Cmp00->getType() != X->getType() && X->hasOneUse())
+ match(X, m_TruncOrSelf(m_Value(X)));
+
// We now can finish matching the condition of the outermost select:
// it should either be the X itself, or an addition of some constant to X.
Constant *C1;
if (Cmp00 == X)
- C1 = ConstantInt::getNullValue(Sel0.getType());
+ C1 = ConstantInt::getNullValue(X->getType());
else if (!match(Cmp00,
m_Add(m_Specific(X),
m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C1)))))
@@ -1335,6 +1406,8 @@ static Instruction *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
// The thresholds of this clamp-like pattern.
auto *ThresholdLowIncl = ConstantExpr::getNeg(C1);
auto *ThresholdHighExcl = ConstantExpr::getSub(C0, C1);
+ if (Pred0 == ICmpInst::Predicate::ICMP_UGE)
+ std::swap(ThresholdLowIncl, ThresholdHighExcl);
// The fold has a precondition 1: C2 s>= ThresholdLow
auto *Precond1 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SGE, C2,
@@ -1347,15 +1420,29 @@ static Instruction *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
if (!match(Precond2, m_One()))
return nullptr;
+ // If we are matching from a truncated input, we need to sext the
+ // ReplacementLow and ReplacementHigh values. Only do the transform if they
+ // are free to extend due to being constants.
+ if (X->getType() != Sel0.getType()) {
+ Constant *LowC, *HighC;
+ if (!match(ReplacementLow, m_ImmConstant(LowC)) ||
+ !match(ReplacementHigh, m_ImmConstant(HighC)))
+ return nullptr;
+ ReplacementLow = ConstantExpr::getSExt(LowC, X->getType());
+ ReplacementHigh = ConstantExpr::getSExt(HighC, X->getType());
+ }
+
// All good, finally emit the new pattern.
Value *ShouldReplaceLow = Builder.CreateICmpSLT(X, ThresholdLowIncl);
Value *ShouldReplaceHigh = Builder.CreateICmpSGE(X, ThresholdHighExcl);
Value *MaybeReplacedLow =
Builder.CreateSelect(ShouldReplaceLow, ReplacementLow, X);
- Instruction *MaybeReplacedHigh =
- SelectInst::Create(ShouldReplaceHigh, ReplacementHigh, MaybeReplacedLow);
- return MaybeReplacedHigh;
+ // Create the final select. If we looked through a truncate above, we will
+ // need to retruncate the result.
+ Value *MaybeReplacedHigh = Builder.CreateSelect(
+ ShouldReplaceHigh, ReplacementHigh, MaybeReplacedLow);
+ return Builder.CreateTrunc(MaybeReplacedHigh, Sel0.getType());
}
// If we have
@@ -1446,8 +1533,8 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
if (Instruction *NewAbs = canonicalizeAbsNabs(SI, *ICI, *this))
return NewAbs;
- if (Instruction *NewAbs = canonicalizeClampLike(SI, *ICI, Builder))
- return NewAbs;
+ if (Value *V = canonicalizeClampLike(SI, *ICI, Builder))
+ return replaceInstUsesWith(SI, V);
if (Instruction *NewSel =
tryToReuseConstantFromSelectInComparison(SI, *ICI, *this))
@@ -1816,9 +1903,7 @@ foldOverflowingAddSubSelect(SelectInst &SI, InstCombiner::BuilderTy &Builder) {
m_Value(TrueVal), m_Value(FalseVal))))
return false;
- auto IsZeroOrOne = [](const APInt &C) {
- return C.isNullValue() || C.isOneValue();
- };
+ auto IsZeroOrOne = [](const APInt &C) { return C.isZero() || C.isOne(); };
auto IsMinMax = [&](Value *Min, Value *Max) {
APInt MinVal = APInt::getSignedMinValue(Ty->getScalarSizeInBits());
APInt MaxVal = APInt::getSignedMaxValue(Ty->getScalarSizeInBits());
@@ -2182,7 +2267,7 @@ static Instruction *moveAddAfterMinMax(SelectPatternFlavor SPF, Value *X,
}
/// Match a sadd_sat or ssub_sat which is using min/max to clamp the value.
-Instruction *InstCombinerImpl::matchSAddSubSat(SelectInst &MinMax1) {
+Instruction *InstCombinerImpl::matchSAddSubSat(Instruction &MinMax1) {
Type *Ty = MinMax1.getType();
// We are looking for a tree of:
@@ -2212,23 +2297,14 @@ Instruction *InstCombinerImpl::matchSAddSubSat(SelectInst &MinMax1) {
if (!shouldChangeType(Ty->getScalarType()->getIntegerBitWidth(), NewBitWidth))
return nullptr;
- // Also make sure that the number of uses is as expected. The "3"s are for the
- // the two items of min/max (the compare and the select).
- if (MinMax2->hasNUsesOrMore(3) || AddSub->hasNUsesOrMore(3))
+ // Also make sure that the number of uses is as expected. The 3 is for the
+ // two items of the compare and the select, or 2 for a min/max intrinsic.
+ unsigned ExpUses = isa<IntrinsicInst>(MinMax1) ? 2 : 3;
+ if (MinMax2->hasNUsesOrMore(ExpUses) || AddSub->hasNUsesOrMore(ExpUses))
return nullptr;
// Create the new type (which can be a vector type)
Type *NewTy = Ty->getWithNewBitWidth(NewBitWidth);
- // Match the two extends from the add/sub
- Value *A, *B;
- if(!match(AddSub, m_BinOp(m_SExt(m_Value(A)), m_SExt(m_Value(B)))))
- return nullptr;
- // And check the incoming values are of a type smaller than or equal to the
- // size of the saturation. Otherwise the higher bits can cause different
- // results.
- if (A->getType()->getScalarSizeInBits() > NewBitWidth ||
- B->getType()->getScalarSizeInBits() > NewBitWidth)
- return nullptr;
Intrinsic::ID IntrinsicID;
if (AddSub->getOpcode() == Instruction::Add)
@@ -2238,10 +2314,16 @@ Instruction *InstCombinerImpl::matchSAddSubSat(SelectInst &MinMax1) {
else
return nullptr;
+ // The two operands of the add/sub must be nsw-truncatable to the NewTy. This
+ // is usually achieved via a sext from a smaller type.
+ if (ComputeMinSignedBits(AddSub->getOperand(0), 0, AddSub) > NewBitWidth ||
+ ComputeMinSignedBits(AddSub->getOperand(1), 0, AddSub) > NewBitWidth)
+ return nullptr;
+
// Finally create and return the sat intrinsic, truncated to the new type
Function *F = Intrinsic::getDeclaration(MinMax1.getModule(), IntrinsicID, NewTy);
- Value *AT = Builder.CreateSExt(A, NewTy);
- Value *BT = Builder.CreateSExt(B, NewTy);
+ Value *AT = Builder.CreateTrunc(AddSub->getOperand(0), NewTy);
+ Value *BT = Builder.CreateTrunc(AddSub->getOperand(1), NewTy);
Value *Sat = Builder.CreateCall(F, {AT, BT});
return CastInst::Create(Instruction::SExt, Sat, Ty);
}
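A sketch of the pattern matchSAddSubSat now recognizes (illustrative only; the 'after' version spells out the saturation that llvm.sadd.sat would perform, since C++ has no portable saturating add): a widened add that is clamped back to the narrow type's signed range is a saturating add in that type, and after this change the operands only need to fit in the narrow width by known-bits reasoning instead of being literal sext instructions.

    #include <algorithm>
    #include <cstdint>
    int32_t sat_before(int8_t a, int8_t b) {
      return std::clamp<int32_t>(int32_t(a) + int32_t(b), -128, 127);
    }
    int32_t sat_after(int8_t a, int8_t b) {
      int32_t s = int32_t(a) + int32_t(b);            // exact sum of the narrow values
      return s < -128 ? -128 : (s > 127 ? 127 : s);   // i8 saturation, widened back
    }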
@@ -2432,7 +2514,7 @@ Instruction *InstCombinerImpl::foldVectorSelect(SelectInst &Sel) {
unsigned NumElts = VecTy->getNumElements();
APInt UndefElts(NumElts, 0);
- APInt AllOnesEltMask(APInt::getAllOnesValue(NumElts));
+ APInt AllOnesEltMask(APInt::getAllOnes(NumElts));
if (Value *V = SimplifyDemandedVectorElts(&Sel, AllOnesEltMask, UndefElts)) {
if (V != &Sel)
return replaceInstUsesWith(Sel, V);
@@ -2754,11 +2836,16 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
/* IsAnd */ IsAnd))
return I;
- if (auto *ICmp0 = dyn_cast<ICmpInst>(CondVal))
- if (auto *ICmp1 = dyn_cast<ICmpInst>(Op1))
+ if (auto *ICmp0 = dyn_cast<ICmpInst>(CondVal)) {
+ if (auto *ICmp1 = dyn_cast<ICmpInst>(Op1)) {
if (auto *V = foldAndOrOfICmpsOfAndWithPow2(ICmp0, ICmp1, &SI, IsAnd,
/* IsLogical */ true))
return replaceInstUsesWith(SI, V);
+
+ if (auto *V = foldEqOfParts(ICmp0, ICmp1, IsAnd))
+ return replaceInstUsesWith(SI, V);
+ }
+ }
}
// select (select a, true, b), c, false -> select a, c, false
@@ -2863,14 +2950,10 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
}
// Canonicalize select with fcmp to fabs(). -0.0 makes this tricky. We need
- // fast-math-flags (nsz) or fsub with +0.0 (not fneg) for this to work. We
- // also require nnan because we do not want to unintentionally change the
- // sign of a NaN value.
+ // fast-math-flags (nsz) or fsub with +0.0 (not fneg) for this to work.
// (X <= +/-0.0) ? (0.0 - X) : X --> fabs(X)
- Instruction *FSub;
if (match(CondVal, m_FCmp(Pred, m_Specific(FalseVal), m_AnyZeroFP())) &&
match(TrueVal, m_FSub(m_PosZeroFP(), m_Specific(FalseVal))) &&
- match(TrueVal, m_Instruction(FSub)) && FSub->hasNoNaNs() &&
(Pred == FCmpInst::FCMP_OLE || Pred == FCmpInst::FCMP_ULE)) {
Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FalseVal, &SI);
return replaceInstUsesWith(SI, Fabs);
@@ -2878,7 +2961,6 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
// (X > +/-0.0) ? X : (0.0 - X) --> fabs(X)
if (match(CondVal, m_FCmp(Pred, m_Specific(TrueVal), m_AnyZeroFP())) &&
match(FalseVal, m_FSub(m_PosZeroFP(), m_Specific(TrueVal))) &&
- match(FalseVal, m_Instruction(FSub)) && FSub->hasNoNaNs() &&
(Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_UGT)) {
Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, TrueVal, &SI);
return replaceInstUsesWith(SI, Fabs);
@@ -2886,11 +2968,8 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
// With nnan and nsz:
// (X < +/-0.0) ? -X : X --> fabs(X)
// (X <= +/-0.0) ? -X : X --> fabs(X)
- Instruction *FNeg;
if (match(CondVal, m_FCmp(Pred, m_Specific(FalseVal), m_AnyZeroFP())) &&
- match(TrueVal, m_FNeg(m_Specific(FalseVal))) &&
- match(TrueVal, m_Instruction(FNeg)) && FNeg->hasNoNaNs() &&
- FNeg->hasNoSignedZeros() && SI.hasNoSignedZeros() &&
+ match(TrueVal, m_FNeg(m_Specific(FalseVal))) && SI.hasNoSignedZeros() &&
(Pred == FCmpInst::FCMP_OLT || Pred == FCmpInst::FCMP_OLE ||
Pred == FCmpInst::FCMP_ULT || Pred == FCmpInst::FCMP_ULE)) {
Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FalseVal, &SI);
@@ -2900,9 +2979,7 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
// (X > +/-0.0) ? X : -X --> fabs(X)
// (X >= +/-0.0) ? X : -X --> fabs(X)
if (match(CondVal, m_FCmp(Pred, m_Specific(TrueVal), m_AnyZeroFP())) &&
- match(FalseVal, m_FNeg(m_Specific(TrueVal))) &&
- match(FalseVal, m_Instruction(FNeg)) && FNeg->hasNoNaNs() &&
- FNeg->hasNoSignedZeros() && SI.hasNoSignedZeros() &&
+ match(FalseVal, m_FNeg(m_Specific(TrueVal))) && SI.hasNoSignedZeros() &&
(Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_OGE ||
Pred == FCmpInst::FCMP_UGT || Pred == FCmpInst::FCMP_UGE)) {
Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, TrueVal, &SI);
@@ -2920,6 +2997,8 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
return Add;
if (Instruction *Or = foldSetClearBits(SI, Builder))
return Or;
+ if (Instruction *Mul = foldSelectZeroOrMul(SI, *this))
+ return Mul;
// Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
auto *TI = dyn_cast<Instruction>(TrueVal);
@@ -2939,8 +3018,10 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
if (Gep->getNumOperands() != 2 || Gep->getPointerOperand() != Base ||
!Gep->hasOneUse())
return nullptr;
- Type *ElementType = Gep->getResultElementType();
Value *Idx = Gep->getOperand(1);
+ if (isa<VectorType>(CondVal->getType()) && !isa<VectorType>(Idx->getType()))
+ return nullptr;
+ Type *ElementType = Gep->getResultElementType();
Value *NewT = Idx;
Value *NewF = Constant::getNullValue(Idx->getType());
if (Swap)
@@ -3188,9 +3269,9 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
if (!CondVal->getType()->isVectorTy() && !AC.assumptions().empty()) {
KnownBits Known(1);
computeKnownBits(CondVal, Known, 0, &SI);
- if (Known.One.isOneValue())
+ if (Known.One.isOne())
return replaceInstUsesWith(SI, TrueVal);
- if (Known.Zero.isOneValue())
+ if (Known.Zero.isOne())
return replaceInstUsesWith(SI, FalseVal);
}
@@ -3230,7 +3311,8 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
Value *Mask;
if (match(TrueVal, m_Zero()) &&
match(FalseVal, m_MaskedLoad(m_Value(), m_Value(), m_Value(Mask),
- m_CombineOr(m_Undef(), m_Zero())))) {
+ m_CombineOr(m_Undef(), m_Zero()))) &&
+ (CondVal->getType() == Mask->getType())) {
// We can remove the select by ensuring the load zeros all lanes the
// select would have. We determine this by proving there is no overlap
// between the load and select masks.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index ca5e473fdecb..06421d553915 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -41,7 +41,7 @@ bool canTryToConstantAddTwoShiftAmounts(Value *Sh0, Value *ShAmt0, Value *Sh1,
(Sh0->getType()->getScalarSizeInBits() - 1) +
(Sh1->getType()->getScalarSizeInBits() - 1);
APInt MaximalRepresentableShiftAmount =
- APInt::getAllOnesValue(ShAmt0->getType()->getScalarSizeInBits());
+ APInt::getAllOnes(ShAmt0->getType()->getScalarSizeInBits());
return MaximalRepresentableShiftAmount.uge(MaximalPossibleTotalShiftAmount);
}
@@ -172,8 +172,8 @@ Value *InstCombinerImpl::reassociateShiftAmtsOfTwoSameDirectionShifts(
// There are many variants to this pattern:
// a) (x & ((1 << MaskShAmt) - 1)) << ShiftShAmt
// b) (x & (~(-1 << MaskShAmt))) << ShiftShAmt
-// c) (x & (-1 >> MaskShAmt)) << ShiftShAmt
-// d) (x & ((-1 << MaskShAmt) >> MaskShAmt)) << ShiftShAmt
+// c) (x & (-1 l>> MaskShAmt)) << ShiftShAmt
+// d) (x & ((-1 << MaskShAmt) l>> MaskShAmt)) << ShiftShAmt
// e) ((x << MaskShAmt) l>> MaskShAmt) << ShiftShAmt
// f) ((x << MaskShAmt) a>> MaskShAmt) << ShiftShAmt
// All these patterns can be simplified to just:
@@ -213,11 +213,11 @@ dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
auto MaskA = m_Add(m_Shl(m_One(), m_Value(MaskShAmt)), m_AllOnes());
// (~(-1 << maskNbits))
auto MaskB = m_Xor(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_AllOnes());
- // (-1 >> MaskShAmt)
- auto MaskC = m_Shr(m_AllOnes(), m_Value(MaskShAmt));
- // ((-1 << MaskShAmt) >> MaskShAmt)
+ // (-1 l>> MaskShAmt)
+ auto MaskC = m_LShr(m_AllOnes(), m_Value(MaskShAmt));
+ // ((-1 << MaskShAmt) l>> MaskShAmt)
auto MaskD =
- m_Shr(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_Deferred(MaskShAmt));
+ m_LShr(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_Deferred(MaskShAmt));
Value *X;
Constant *NewMask;
@@ -240,7 +240,7 @@ dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
// that shall remain in the root value (OuterShift).
// An extend of an undef value becomes zero because the high bits are never
- // completely unknown. Replace the the `undef` shift amounts with final
+ // completely unknown. Replace the `undef` shift amounts with final
// shift bitwidth to ensure that the value remains undef when creating the
// subsequent shift op.
SumOfShAmts = Constant::replaceUndefsWith(
@@ -272,7 +272,7 @@ dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
// shall be unset in the root value (OuterShift).
// An extend of an undef value becomes zero because the high bits are never
- // completely unknown. Replace the the `undef` shift amounts with negated
+ // completely unknown. Replace the `undef` shift amounts with negated
// bitwidth of innermost shift to ensure that the value remains undef when
// creating the subsequent shift op.
unsigned WidestTyBitWidth = WidestTy->getScalarSizeInBits();
@@ -346,9 +346,8 @@ static Instruction *foldShiftOfShiftedLogic(BinaryOperator &I,
// TODO: Remove the one-use check if the other logic operand (Y) is constant.
Value *X, *Y;
auto matchFirstShift = [&](Value *V) {
- BinaryOperator *BO;
APInt Threshold(Ty->getScalarSizeInBits(), Ty->getScalarSizeInBits());
- return match(V, m_BinOp(BO)) && BO->getOpcode() == ShiftOpcode &&
+ return match(V, m_BinOp(ShiftOpcode, m_Value(), m_Value())) &&
match(V, m_OneUse(m_Shift(m_Value(X), m_Constant(C0)))) &&
match(ConstantExpr::getAdd(C0, C1),
m_SpecificInt_ICMP(ICmpInst::ICMP_ULT, Threshold));
@@ -661,23 +660,22 @@ static bool canShiftBinOpWithConstantRHS(BinaryOperator &Shift,
Instruction *InstCombinerImpl::FoldShiftByConstant(Value *Op0, Constant *Op1,
BinaryOperator &I) {
- bool isLeftShift = I.getOpcode() == Instruction::Shl;
-
const APInt *Op1C;
if (!match(Op1, m_APInt(Op1C)))
return nullptr;
// See if we can propagate this shift into the input, this covers the trivial
// cast of lshr(shl(x,c1),c2) as well as other more complex cases.
+ bool IsLeftShift = I.getOpcode() == Instruction::Shl;
if (I.getOpcode() != Instruction::AShr &&
- canEvaluateShifted(Op0, Op1C->getZExtValue(), isLeftShift, *this, &I)) {
+ canEvaluateShifted(Op0, Op1C->getZExtValue(), IsLeftShift, *this, &I)) {
LLVM_DEBUG(
dbgs() << "ICE: GetShiftedValue propagating shift through expression"
" to eliminate shift:\n IN: "
<< *Op0 << "\n SH: " << I << "\n");
return replaceInstUsesWith(
- I, getShiftedValue(Op0, Op1C->getZExtValue(), isLeftShift, *this, DL));
+ I, getShiftedValue(Op0, Op1C->getZExtValue(), IsLeftShift, *this, DL));
}
// See if we can simplify any instructions used by the instruction whose sole
@@ -686,202 +684,72 @@ Instruction *InstCombinerImpl::FoldShiftByConstant(Value *Op0, Constant *Op1,
unsigned TypeBits = Ty->getScalarSizeInBits();
assert(!Op1C->uge(TypeBits) &&
"Shift over the type width should have been removed already");
+ (void)TypeBits;
if (Instruction *FoldedShift = foldBinOpIntoSelectOrPhi(I))
return FoldedShift;
- // Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2))
- if (auto *TI = dyn_cast<TruncInst>(Op0)) {
- // If 'shift2' is an ashr, we would have to get the sign bit into a funny
- // place. Don't try to do this transformation in this case. Also, we
- // require that the input operand is a shift-by-constant so that we have
- // confidence that the shifts will get folded together. We could do this
- // xform in more cases, but it is unlikely to be profitable.
- const APInt *TrShiftAmt;
- if (I.isLogicalShift() &&
- match(TI->getOperand(0), m_Shift(m_Value(), m_APInt(TrShiftAmt)))) {
- auto *TrOp = cast<Instruction>(TI->getOperand(0));
- Type *SrcTy = TrOp->getType();
-
- // Okay, we'll do this xform. Make the shift of shift.
- Constant *ShAmt = ConstantExpr::getZExt(Op1, SrcTy);
- // (shift2 (shift1 & 0x00FF), c2)
- Value *NSh = Builder.CreateBinOp(I.getOpcode(), TrOp, ShAmt, I.getName());
-
- // For logical shifts, the truncation has the effect of making the high
- // part of the register be zeros. Emulate this by inserting an AND to
- // clear the top bits as needed. This 'and' will usually be zapped by
- // other xforms later if dead.
- unsigned SrcSize = SrcTy->getScalarSizeInBits();
- Constant *MaskV =
- ConstantInt::get(SrcTy, APInt::getLowBitsSet(SrcSize, TypeBits));
-
- // The mask we constructed says what the trunc would do if occurring
- // between the shifts. We want to know the effect *after* the second
- // shift. We know that it is a logical shift by a constant, so adjust the
- // mask as appropriate.
- MaskV = ConstantExpr::get(I.getOpcode(), MaskV, ShAmt);
- // shift1 & 0x00FF
- Value *And = Builder.CreateAnd(NSh, MaskV, TI->getName());
- // Return the value truncated to the interesting size.
- return new TruncInst(And, Ty);
- }
- }
-
- if (Op0->hasOneUse()) {
- if (BinaryOperator *Op0BO = dyn_cast<BinaryOperator>(Op0)) {
- // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
- Value *V1;
- const APInt *CC;
- switch (Op0BO->getOpcode()) {
- default: break;
- case Instruction::Add:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // These operators commute.
- // Turn (Y + (X >> C)) << C -> (X + (Y << C)) & (~0 << C)
- if (isLeftShift && Op0BO->getOperand(1)->hasOneUse() &&
- match(Op0BO->getOperand(1), m_Shr(m_Value(V1),
- m_Specific(Op1)))) {
- Value *YS = // (Y << C)
- Builder.CreateShl(Op0BO->getOperand(0), Op1, Op0BO->getName());
- // (X + (Y << C))
- Value *X = Builder.CreateBinOp(Op0BO->getOpcode(), YS, V1,
- Op0BO->getOperand(1)->getName());
- unsigned Op1Val = Op1C->getLimitedValue(TypeBits);
- APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val);
- Constant *Mask = ConstantInt::get(Ty, Bits);
- return BinaryOperator::CreateAnd(X, Mask);
- }
-
- // Turn (Y + ((X >> C) & CC)) << C -> ((X & (CC << C)) + (Y << C))
- Value *Op0BOOp1 = Op0BO->getOperand(1);
- if (isLeftShift && Op0BOOp1->hasOneUse() &&
- match(Op0BOOp1, m_And(m_OneUse(m_Shr(m_Value(V1), m_Specific(Op1))),
- m_APInt(CC)))) {
- Value *YS = // (Y << C)
- Builder.CreateShl(Op0BO->getOperand(0), Op1, Op0BO->getName());
- // X & (CC << C)
- Value *XM = Builder.CreateAnd(
- V1, ConstantExpr::getShl(ConstantInt::get(Ty, *CC), Op1),
- V1->getName() + ".mask");
- return BinaryOperator::Create(Op0BO->getOpcode(), YS, XM);
- }
- LLVM_FALLTHROUGH;
- }
-
- case Instruction::Sub: {
- // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
- if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
- match(Op0BO->getOperand(0), m_Shr(m_Value(V1),
- m_Specific(Op1)))) {
- Value *YS = // (Y << C)
- Builder.CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName());
- // (X + (Y << C))
- Value *X = Builder.CreateBinOp(Op0BO->getOpcode(), V1, YS,
- Op0BO->getOperand(0)->getName());
- unsigned Op1Val = Op1C->getLimitedValue(TypeBits);
- APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val);
- Constant *Mask = ConstantInt::get(Ty, Bits);
- return BinaryOperator::CreateAnd(X, Mask);
- }
-
- // Turn (((X >> C)&CC) + Y) << C -> (X + (Y << C)) & (CC << C)
- if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
- match(Op0BO->getOperand(0),
- m_And(m_OneUse(m_Shr(m_Value(V1), m_Specific(Op1))),
- m_APInt(CC)))) {
- Value *YS = // (Y << C)
- Builder.CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName());
- // X & (CC << C)
- Value *XM = Builder.CreateAnd(
- V1, ConstantExpr::getShl(ConstantInt::get(Ty, *CC), Op1),
- V1->getName() + ".mask");
- return BinaryOperator::Create(Op0BO->getOpcode(), XM, YS);
- }
-
- break;
- }
- }
+ if (!Op0->hasOneUse())
+ return nullptr;
- // If the operand is a bitwise operator with a constant RHS, and the
- // shift is the only use, we can pull it out of the shift.
- const APInt *Op0C;
- if (match(Op0BO->getOperand(1), m_APInt(Op0C))) {
- if (canShiftBinOpWithConstantRHS(I, Op0BO)) {
- Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
- cast<Constant>(Op0BO->getOperand(1)), Op1);
+ if (auto *Op0BO = dyn_cast<BinaryOperator>(Op0)) {
+ // If the operand is a bitwise operator with a constant RHS, and the
+ // shift is the only use, we can pull it out of the shift.
+ const APInt *Op0C;
+ if (match(Op0BO->getOperand(1), m_APInt(Op0C))) {
+ if (canShiftBinOpWithConstantRHS(I, Op0BO)) {
+ Constant *NewRHS = ConstantExpr::get(
+ I.getOpcode(), cast<Constant>(Op0BO->getOperand(1)), Op1);
- Value *NewShift =
+ Value *NewShift =
Builder.CreateBinOp(I.getOpcode(), Op0BO->getOperand(0), Op1);
- NewShift->takeName(Op0BO);
-
- return BinaryOperator::Create(Op0BO->getOpcode(), NewShift,
- NewRHS);
- }
- }
-
- // If the operand is a subtract with a constant LHS, and the shift
- // is the only use, we can pull it out of the shift.
- // This folds (shl (sub C1, X), C2) -> (sub (C1 << C2), (shl X, C2))
- if (isLeftShift && Op0BO->getOpcode() == Instruction::Sub &&
- match(Op0BO->getOperand(0), m_APInt(Op0C))) {
- Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
- cast<Constant>(Op0BO->getOperand(0)), Op1);
-
- Value *NewShift = Builder.CreateShl(Op0BO->getOperand(1), Op1);
NewShift->takeName(Op0BO);
- return BinaryOperator::CreateSub(NewRHS, NewShift);
+ return BinaryOperator::Create(Op0BO->getOpcode(), NewShift, NewRHS);
}
}
+ }
- // If we have a select that conditionally executes some binary operator,
- // see if we can pull it the select and operator through the shift.
- //
- // For example, turning:
- // shl (select C, (add X, C1), X), C2
- // Into:
- // Y = shl X, C2
- // select C, (add Y, C1 << C2), Y
- Value *Cond;
- BinaryOperator *TBO;
- Value *FalseVal;
- if (match(Op0, m_Select(m_Value(Cond), m_OneUse(m_BinOp(TBO)),
- m_Value(FalseVal)))) {
- const APInt *C;
- if (!isa<Constant>(FalseVal) && TBO->getOperand(0) == FalseVal &&
- match(TBO->getOperand(1), m_APInt(C)) &&
- canShiftBinOpWithConstantRHS(I, TBO)) {
- Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
- cast<Constant>(TBO->getOperand(1)), Op1);
-
- Value *NewShift =
- Builder.CreateBinOp(I.getOpcode(), FalseVal, Op1);
- Value *NewOp = Builder.CreateBinOp(TBO->getOpcode(), NewShift,
- NewRHS);
- return SelectInst::Create(Cond, NewOp, NewShift);
- }
+ // If we have a select that conditionally executes some binary operator,
+ // see if we can pull the select and the operator through the shift.
+ //
+ // For example, turning:
+ // shl (select C, (add X, C1), X), C2
+ // Into:
+ // Y = shl X, C2
+ // select C, (add Y, C1 << C2), Y
+ Value *Cond;
+ BinaryOperator *TBO;
+ Value *FalseVal;
+ if (match(Op0, m_Select(m_Value(Cond), m_OneUse(m_BinOp(TBO)),
+ m_Value(FalseVal)))) {
+ const APInt *C;
+ if (!isa<Constant>(FalseVal) && TBO->getOperand(0) == FalseVal &&
+ match(TBO->getOperand(1), m_APInt(C)) &&
+ canShiftBinOpWithConstantRHS(I, TBO)) {
+ Constant *NewRHS = ConstantExpr::get(
+ I.getOpcode(), cast<Constant>(TBO->getOperand(1)), Op1);
+
+ Value *NewShift = Builder.CreateBinOp(I.getOpcode(), FalseVal, Op1);
+ Value *NewOp = Builder.CreateBinOp(TBO->getOpcode(), NewShift, NewRHS);
+ return SelectInst::Create(Cond, NewOp, NewShift);
}
+ }
- BinaryOperator *FBO;
- Value *TrueVal;
- if (match(Op0, m_Select(m_Value(Cond), m_Value(TrueVal),
- m_OneUse(m_BinOp(FBO))))) {
- const APInt *C;
- if (!isa<Constant>(TrueVal) && FBO->getOperand(0) == TrueVal &&
- match(FBO->getOperand(1), m_APInt(C)) &&
- canShiftBinOpWithConstantRHS(I, FBO)) {
- Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
- cast<Constant>(FBO->getOperand(1)), Op1);
-
- Value *NewShift =
- Builder.CreateBinOp(I.getOpcode(), TrueVal, Op1);
- Value *NewOp = Builder.CreateBinOp(FBO->getOpcode(), NewShift,
- NewRHS);
- return SelectInst::Create(Cond, NewShift, NewOp);
- }
+ BinaryOperator *FBO;
+ Value *TrueVal;
+ if (match(Op0, m_Select(m_Value(Cond), m_Value(TrueVal),
+ m_OneUse(m_BinOp(FBO))))) {
+ const APInt *C;
+ if (!isa<Constant>(TrueVal) && FBO->getOperand(0) == TrueVal &&
+ match(FBO->getOperand(1), m_APInt(C)) &&
+ canShiftBinOpWithConstantRHS(I, FBO)) {
+ Constant *NewRHS = ConstantExpr::get(
+ I.getOpcode(), cast<Constant>(FBO->getOperand(1)), Op1);
+
+ Value *NewShift = Builder.CreateBinOp(I.getOpcode(), TrueVal, Op1);
+ Value *NewOp = Builder.CreateBinOp(FBO->getOpcode(), NewShift, NewRHS);
+ return SelectInst::Create(Cond, NewShift, NewOp);
}
}
@@ -908,41 +776,41 @@ Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) {
Type *Ty = I.getType();
unsigned BitWidth = Ty->getScalarSizeInBits();
- const APInt *ShAmtAPInt;
- if (match(Op1, m_APInt(ShAmtAPInt))) {
- unsigned ShAmt = ShAmtAPInt->getZExtValue();
+ const APInt *C;
+ if (match(Op1, m_APInt(C))) {
+ unsigned ShAmtC = C->getZExtValue();
- // shl (zext X), ShAmt --> zext (shl X, ShAmt)
+ // shl (zext X), C --> zext (shl X, C)
// This is only valid if X would have zeros shifted out.
Value *X;
if (match(Op0, m_OneUse(m_ZExt(m_Value(X))))) {
unsigned SrcWidth = X->getType()->getScalarSizeInBits();
- if (ShAmt < SrcWidth &&
- MaskedValueIsZero(X, APInt::getHighBitsSet(SrcWidth, ShAmt), 0, &I))
- return new ZExtInst(Builder.CreateShl(X, ShAmt), Ty);
+ if (ShAmtC < SrcWidth &&
+ MaskedValueIsZero(X, APInt::getHighBitsSet(SrcWidth, ShAmtC), 0, &I))
+ return new ZExtInst(Builder.CreateShl(X, ShAmtC), Ty);
}
// (X >> C) << C --> X & (-1 << C)
if (match(Op0, m_Shr(m_Value(X), m_Specific(Op1)))) {
- APInt Mask(APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt));
+ APInt Mask(APInt::getHighBitsSet(BitWidth, BitWidth - ShAmtC));
return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
}
- const APInt *ShOp1;
- if (match(Op0, m_Exact(m_Shr(m_Value(X), m_APInt(ShOp1)))) &&
- ShOp1->ult(BitWidth)) {
- unsigned ShrAmt = ShOp1->getZExtValue();
- if (ShrAmt < ShAmt) {
- // If C1 < C2: (X >>?,exact C1) << C2 --> X << (C2 - C1)
- Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShrAmt);
+ const APInt *C1;
+ if (match(Op0, m_Exact(m_Shr(m_Value(X), m_APInt(C1)))) &&
+ C1->ult(BitWidth)) {
+ unsigned ShrAmt = C1->getZExtValue();
+ if (ShrAmt < ShAmtC) {
+ // If C1 < C: (X >>?,exact C1) << C --> X << (C - C1)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShAmtC - ShrAmt);
auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff);
NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
NewShl->setHasNoSignedWrap(I.hasNoSignedWrap());
return NewShl;
}
- if (ShrAmt > ShAmt) {
- // If C1 > C2: (X >>?exact C1) << C2 --> X >>?exact (C1 - C2)
- Constant *ShiftDiff = ConstantInt::get(Ty, ShrAmt - ShAmt);
+ if (ShrAmt > ShAmtC) {
+ // If C1 > C: (X >>?exact C1) << C --> X >>?exact (C1 - C)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShrAmt - ShAmtC);
auto *NewShr = BinaryOperator::Create(
cast<BinaryOperator>(Op0)->getOpcode(), X, ShiftDiff);
NewShr->setIsExact(true);
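
As a small worked example of the (X >> C) << C --> X & (-1 << C) case above (hypothetical function name):

  define i32 @shr_then_shl(i32 %x) {
    %s = lshr i32 %x, 4
    %r = shl i32 %s, 4
    ret i32 %r
  }

should reduce to roughly and i32 %x, -16, i.e. the mask -1 << 4.
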
@@ -950,49 +818,135 @@ Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) {
}
}
- if (match(Op0, m_OneUse(m_Shr(m_Value(X), m_APInt(ShOp1)))) &&
- ShOp1->ult(BitWidth)) {
- unsigned ShrAmt = ShOp1->getZExtValue();
- if (ShrAmt < ShAmt) {
- // If C1 < C2: (X >>? C1) << C2 --> X << (C2 - C1) & (-1 << C2)
- Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShrAmt);
+ if (match(Op0, m_OneUse(m_Shr(m_Value(X), m_APInt(C1)))) &&
+ C1->ult(BitWidth)) {
+ unsigned ShrAmt = C1->getZExtValue();
+ if (ShrAmt < ShAmtC) {
+ // If C1 < C: (X >>? C1) << C --> (X << (C - C1)) & (-1 << C)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShAmtC - ShrAmt);
auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff);
NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
NewShl->setHasNoSignedWrap(I.hasNoSignedWrap());
Builder.Insert(NewShl);
- APInt Mask(APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt));
+ APInt Mask(APInt::getHighBitsSet(BitWidth, BitWidth - ShAmtC));
return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask));
}
- if (ShrAmt > ShAmt) {
- // If C1 > C2: (X >>? C1) << C2 --> X >>? (C1 - C2) & (-1 << C2)
- Constant *ShiftDiff = ConstantInt::get(Ty, ShrAmt - ShAmt);
+ if (ShrAmt > ShAmtC) {
+ // If C1 > C: (X >>? C1) << C --> (X >>? (C1 - C)) & (-1 << C)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShrAmt - ShAmtC);
auto *OldShr = cast<BinaryOperator>(Op0);
auto *NewShr =
BinaryOperator::Create(OldShr->getOpcode(), X, ShiftDiff);
NewShr->setIsExact(OldShr->isExact());
Builder.Insert(NewShr);
- APInt Mask(APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt));
+ APInt Mask(APInt::getHighBitsSet(BitWidth, BitWidth - ShAmtC));
return BinaryOperator::CreateAnd(NewShr, ConstantInt::get(Ty, Mask));
}
}
- if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1))) && ShOp1->ult(BitWidth)) {
- unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+ // Similar to above, but look through an intermediate trunc instruction.
+ BinaryOperator *Shr;
+ if (match(Op0, m_OneUse(m_Trunc(m_OneUse(m_BinOp(Shr))))) &&
+ match(Shr, m_Shr(m_Value(X), m_APInt(C1)))) {
+ // The larger shift direction survives through the transform.
+ unsigned ShrAmtC = C1->getZExtValue();
+ unsigned ShDiff = ShrAmtC > ShAmtC ? ShrAmtC - ShAmtC : ShAmtC - ShrAmtC;
+ Constant *ShiftDiffC = ConstantInt::get(X->getType(), ShDiff);
+ auto ShiftOpc = ShrAmtC > ShAmtC ? Shr->getOpcode() : Instruction::Shl;
+
+ // If C1 > C:
+ // (trunc (X >> C1)) << C --> (trunc (X >> (C1 - C))) & (-1 << C)
+ // If C > C1:
+ // (trunc (X >> C1)) << C --> (trunc (X << (C - C1))) & (-1 << C)
+ Value *NewShift = Builder.CreateBinOp(ShiftOpc, X, ShiftDiffC, "sh.diff");
+ Value *Trunc = Builder.CreateTrunc(NewShift, Ty, "tr.sh.diff");
+ APInt Mask(APInt::getHighBitsSet(BitWidth, BitWidth - ShAmtC));
+ return BinaryOperator::CreateAnd(Trunc, ConstantInt::get(Ty, Mask));
+ }
+
+ if (match(Op0, m_Shl(m_Value(X), m_APInt(C1))) && C1->ult(BitWidth)) {
+ unsigned AmtSum = ShAmtC + C1->getZExtValue();
// Oversized shifts are simplified to zero in InstSimplify.
if (AmtSum < BitWidth)
// (X << C1) << C2 --> X << (C1 + C2)
return BinaryOperator::CreateShl(X, ConstantInt::get(Ty, AmtSum));
}
+ // If we have an opposite shift by the same amount, we may be able to
+ // reorder binops and shifts to eliminate math/logic.
+ auto isSuitableBinOpcode = [](Instruction::BinaryOps BinOpcode) {
+ switch (BinOpcode) {
+ default:
+ return false;
+ case Instruction::Add:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Sub:
+ // NOTE: Sub is not commutative and the transforms below may not be valid
+ // when the shift-right is operand 1 (RHS) of the sub.
+ return true;
+ }
+ };
+ BinaryOperator *Op0BO;
+ if (match(Op0, m_OneUse(m_BinOp(Op0BO))) &&
+ isSuitableBinOpcode(Op0BO->getOpcode())) {
+ // Commute so shift-right is on LHS of the binop.
+ // (Y bop (X >> C)) << C -> ((X >> C) bop Y) << C
+ // (Y bop ((X >> C) & CC)) << C -> (((X >> C) & CC) bop Y) << C
+ Value *Shr = Op0BO->getOperand(0);
+ Value *Y = Op0BO->getOperand(1);
+ Value *X;
+ const APInt *CC;
+ if (Op0BO->isCommutative() && Y->hasOneUse() &&
+ (match(Y, m_Shr(m_Value(), m_Specific(Op1))) ||
+ match(Y, m_And(m_OneUse(m_Shr(m_Value(), m_Specific(Op1))),
+ m_APInt(CC)))))
+ std::swap(Shr, Y);
+
+ // ((X >> C) bop Y) << C -> (X bop (Y << C)) & (~0 << C)
+ if (match(Shr, m_OneUse(m_Shr(m_Value(X), m_Specific(Op1))))) {
+ // Y << C
+ Value *YS = Builder.CreateShl(Y, Op1, Op0BO->getName());
+ // (X bop (Y << C))
+ Value *B =
+ Builder.CreateBinOp(Op0BO->getOpcode(), X, YS, Shr->getName());
+ unsigned Op1Val = C->getLimitedValue(BitWidth);
+ APInt Bits = APInt::getHighBitsSet(BitWidth, BitWidth - Op1Val);
+ Constant *Mask = ConstantInt::get(Ty, Bits);
+ return BinaryOperator::CreateAnd(B, Mask);
+ }
+
+ // (((X >> C) & CC) bop Y) << C -> (X & (CC << C)) bop (Y << C)
+ if (match(Shr,
+ m_OneUse(m_And(m_OneUse(m_Shr(m_Value(X), m_Specific(Op1))),
+ m_APInt(CC))))) {
+ // Y << C
+ Value *YS = Builder.CreateShl(Y, Op1, Op0BO->getName());
+ // X & (CC << C)
+ Value *M = Builder.CreateAnd(X, ConstantInt::get(Ty, CC->shl(*C)),
+ X->getName() + ".mask");
+ return BinaryOperator::Create(Op0BO->getOpcode(), M, YS);
+ }
+ }
+
+ // (C1 - X) << C --> (C1 << C) - (X << C)
+ if (match(Op0, m_OneUse(m_Sub(m_APInt(C1), m_Value(X))))) {
+ Constant *NewLHS = ConstantInt::get(Ty, C1->shl(*C));
+ Value *NewShift = Builder.CreateShl(X, Op1);
+ return BinaryOperator::CreateSub(NewLHS, NewShift);
+ }
+
// If the shifted-out value is known-zero, then this is a NUW shift.
if (!I.hasNoUnsignedWrap() &&
- MaskedValueIsZero(Op0, APInt::getHighBitsSet(BitWidth, ShAmt), 0, &I)) {
+ MaskedValueIsZero(Op0, APInt::getHighBitsSet(BitWidth, ShAmtC), 0,
+ &I)) {
I.setHasNoUnsignedWrap();
return &I;
}
// If the shifted-out value is all signbits, then this is a NSW shift.
- if (!I.hasNoSignedWrap() && ComputeNumSignBits(Op0, 0, &I) > ShAmt) {
+ if (!I.hasNoSignedWrap() && ComputeNumSignBits(Op0, 0, &I) > ShAmtC) {
I.setHasNoSignedWrap();
return &I;
}
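
For the new (C1 - X) << C fold, a minimal sketch (illustrative names only):

  define i8 @sub_from_const_shl(i8 %x) {
    %s = sub i8 10, %x
    %r = shl i8 %s, 3
    ret i8 %r
  }

is expected to become roughly %sh = shl i8 %x, 3 followed by sub i8 80, %sh, with the constant pre-shifted (10 << 3 = 80).
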
@@ -1048,12 +1002,12 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
Type *Ty = I.getType();
- const APInt *ShAmtAPInt;
- if (match(Op1, m_APInt(ShAmtAPInt))) {
- unsigned ShAmt = ShAmtAPInt->getZExtValue();
+ const APInt *C;
+ if (match(Op1, m_APInt(C))) {
+ unsigned ShAmtC = C->getZExtValue();
unsigned BitWidth = Ty->getScalarSizeInBits();
auto *II = dyn_cast<IntrinsicInst>(Op0);
- if (II && isPowerOf2_32(BitWidth) && Log2_32(BitWidth) == ShAmt &&
+ if (II && isPowerOf2_32(BitWidth) && Log2_32(BitWidth) == ShAmtC &&
(II->getIntrinsicID() == Intrinsic::ctlz ||
II->getIntrinsicID() == Intrinsic::cttz ||
II->getIntrinsicID() == Intrinsic::ctpop)) {
@@ -1067,78 +1021,81 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
}
Value *X;
- const APInt *ShOp1;
- if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1))) && ShOp1->ult(BitWidth)) {
- if (ShOp1->ult(ShAmt)) {
- unsigned ShlAmt = ShOp1->getZExtValue();
- Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
+ const APInt *C1;
+ if (match(Op0, m_Shl(m_Value(X), m_APInt(C1))) && C1->ult(BitWidth)) {
+ if (C1->ult(ShAmtC)) {
+ unsigned ShlAmtC = C1->getZExtValue();
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShAmtC - ShlAmtC);
if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
- // (X <<nuw C1) >>u C2 --> X >>u (C2 - C1)
+ // (X <<nuw C1) >>u C --> X >>u (C - C1)
auto *NewLShr = BinaryOperator::CreateLShr(X, ShiftDiff);
NewLShr->setIsExact(I.isExact());
return NewLShr;
}
- // (X << C1) >>u C2 --> (X >>u (C2 - C1)) & (-1 >> C2)
+ // (X << C1) >>u C --> (X >>u (C - C1)) & (-1 >> C)
Value *NewLShr = Builder.CreateLShr(X, ShiftDiff, "", I.isExact());
- APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC));
return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask));
}
- if (ShOp1->ugt(ShAmt)) {
- unsigned ShlAmt = ShOp1->getZExtValue();
- Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt);
+ if (C1->ugt(ShAmtC)) {
+ unsigned ShlAmtC = C1->getZExtValue();
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmtC - ShAmtC);
if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
- // (X <<nuw C1) >>u C2 --> X <<nuw (C1 - C2)
+ // (X <<nuw C1) >>u C --> X <<nuw (C1 - C)
auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff);
NewShl->setHasNoUnsignedWrap(true);
return NewShl;
}
- // (X << C1) >>u C2 --> X << (C1 - C2) & (-1 >> C2)
+ // (X << C1) >>u C --> X << (C1 - C) & (-1 >> C)
Value *NewShl = Builder.CreateShl(X, ShiftDiff);
- APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC));
return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask));
}
- assert(*ShOp1 == ShAmt);
+ assert(*C1 == ShAmtC);
// (X << C) >>u C --> X & (-1 >>u C)
- APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC));
return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
}
if (match(Op0, m_OneUse(m_ZExt(m_Value(X)))) &&
(!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) {
- assert(ShAmt < X->getType()->getScalarSizeInBits() &&
+ assert(ShAmtC < X->getType()->getScalarSizeInBits() &&
"Big shift not simplified to zero?");
// lshr (zext iM X to iN), C --> zext (lshr X, C) to iN
- Value *NewLShr = Builder.CreateLShr(X, ShAmt);
+ Value *NewLShr = Builder.CreateLShr(X, ShAmtC);
return new ZExtInst(NewLShr, Ty);
}
- if (match(Op0, m_SExt(m_Value(X))) &&
- (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) {
- // Are we moving the sign bit to the low bit and widening with high zeros?
+ if (match(Op0, m_SExt(m_Value(X)))) {
unsigned SrcTyBitWidth = X->getType()->getScalarSizeInBits();
- if (ShAmt == BitWidth - 1) {
- // lshr (sext i1 X to iN), N-1 --> zext X to iN
- if (SrcTyBitWidth == 1)
- return new ZExtInst(X, Ty);
+ // lshr (sext i1 X to iN), C --> select (X, -1 >> C, 0)
+ if (SrcTyBitWidth == 1) {
+ auto *NewC = ConstantInt::get(
+ Ty, APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC));
+ return SelectInst::Create(X, NewC, ConstantInt::getNullValue(Ty));
+ }
- // lshr (sext iM X to iN), N-1 --> zext (lshr X, M-1) to iN
- if (Op0->hasOneUse()) {
+ if ((!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType())) &&
+ Op0->hasOneUse()) {
+ // Are we moving the sign bit to the low bit and widening with high
+ // zeros? lshr (sext iM X to iN), N-1 --> zext (lshr X, M-1) to iN
+ if (ShAmtC == BitWidth - 1) {
Value *NewLShr = Builder.CreateLShr(X, SrcTyBitWidth - 1);
return new ZExtInst(NewLShr, Ty);
}
- }
- // lshr (sext iM X to iN), N-M --> zext (ashr X, min(N-M, M-1)) to iN
- if (ShAmt == BitWidth - SrcTyBitWidth && Op0->hasOneUse()) {
- // The new shift amount can't be more than the narrow source type.
- unsigned NewShAmt = std::min(ShAmt, SrcTyBitWidth - 1);
- Value *AShr = Builder.CreateAShr(X, NewShAmt);
- return new ZExtInst(AShr, Ty);
+ // lshr (sext iM X to iN), N-M --> zext (ashr X, min(N-M, M-1)) to iN
+ if (ShAmtC == BitWidth - SrcTyBitWidth) {
+ // The new shift amount can't be more than the narrow source type.
+ unsigned NewShAmt = std::min(ShAmtC, SrcTyBitWidth - 1);
+ Value *AShr = Builder.CreateAShr(X, NewShAmt);
+ return new ZExtInst(AShr, Ty);
+ }
}
}
Value *Y;
- if (ShAmt == BitWidth - 1) {
+ if (ShAmtC == BitWidth - 1) {
// lshr i32 or(X,-X), 31 --> zext (X != 0)
if (match(Op0, m_OneUse(m_c_Or(m_Neg(m_Value(X)), m_Deferred(X)))))
return new ZExtInst(Builder.CreateIsNotNull(X), Ty);
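
The new sext-i1 handling can be illustrated with (hypothetical function):

  define i32 @lshr_of_sext_i1(i1 %b) {
    %s = sext i1 %b to i32
    %r = lshr i32 %s, 8
    ret i32 %r
  }

which is expected to become roughly select i1 %b, i32 16777215, i32 0, where 16777215 is -1 >>u 8.
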
@@ -1150,32 +1107,55 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
// Check if a number is negative and odd:
// lshr i32 (srem X, 2), 31 --> and (X >> 31), X
if (match(Op0, m_OneUse(m_SRem(m_Value(X), m_SpecificInt(2))))) {
- Value *Signbit = Builder.CreateLShr(X, ShAmt);
+ Value *Signbit = Builder.CreateLShr(X, ShAmtC);
return BinaryOperator::CreateAnd(Signbit, X);
}
}
- if (match(Op0, m_LShr(m_Value(X), m_APInt(ShOp1)))) {
- unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+ // (X >>u C1) >>u C --> X >>u (C1 + C)
+ if (match(Op0, m_LShr(m_Value(X), m_APInt(C1)))) {
// Oversized shifts are simplified to zero in InstSimplify.
+ unsigned AmtSum = ShAmtC + C1->getZExtValue();
if (AmtSum < BitWidth)
- // (X >>u C1) >>u C2 --> X >>u (C1 + C2)
return BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, AmtSum));
}
+ Instruction *TruncSrc;
+ if (match(Op0, m_OneUse(m_Trunc(m_Instruction(TruncSrc)))) &&
+ match(TruncSrc, m_LShr(m_Value(X), m_APInt(C1)))) {
+ unsigned SrcWidth = X->getType()->getScalarSizeInBits();
+ unsigned AmtSum = ShAmtC + C1->getZExtValue();
+
+ // If the combined shift fits in the source width:
+ // (trunc (X >>u C1)) >>u C --> and (trunc (X >>u (C1 + C))), MaskC
+ //
+ // If the first shift covers the number of bits truncated, then the
+ // mask instruction is eliminated (and so the use check is relaxed).
+ if (AmtSum < SrcWidth &&
+ (TruncSrc->hasOneUse() || C1->uge(SrcWidth - BitWidth))) {
+ Value *SumShift = Builder.CreateLShr(X, AmtSum, "sum.shift");
+ Value *Trunc = Builder.CreateTrunc(SumShift, Ty, I.getName());
+
+ // If the first shift does not cover the number of bits truncated, then
+ // we require a mask to get rid of high bits in the result.
+ APInt MaskC = APInt::getAllOnes(BitWidth).lshr(ShAmtC);
+ return BinaryOperator::CreateAnd(Trunc, ConstantInt::get(Ty, MaskC));
+ }
+ }
+
// Look for a "splat" mul pattern - it replicates bits across each half of
// a value, so a right shift is just a mask of the low bits:
// lshr i32 (mul nuw X, Pow2+1), 16 --> and X, Pow2-1
// TODO: Generalize to allow more than just half-width shifts?
const APInt *MulC;
if (match(Op0, m_NUWMul(m_Value(X), m_APInt(MulC))) &&
- ShAmt * 2 == BitWidth && (*MulC - 1).isPowerOf2() &&
- MulC->logBase2() == ShAmt)
+ ShAmtC * 2 == BitWidth && (*MulC - 1).isPowerOf2() &&
+ MulC->logBase2() == ShAmtC)
return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, *MulC - 2));
// If the shifted-out value is known-zero, then this is an exact shift.
if (!I.isExact() &&
- MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) {
+ MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmtC), 0, &I)) {
I.setIsExact();
return &I;
}
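
A sketch of the splat-mul pattern mentioned above (illustrative only): with Pow2 = 65536,

  define i32 @splat_mul_high_half(i32 %x) {
    %m = mul nuw i32 %x, 65537
    %r = lshr i32 %m, 16
    ret i32 %r
  }

is expected to fold to roughly and i32 %x, 65535, matching the MulC - 2 constant built by the code.
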
@@ -1346,6 +1326,22 @@ Instruction *InstCombinerImpl::visitAShr(BinaryOperator &I) {
}
}
+ // Prefer `-(x & 1)` over `(x << (bitwidth(x)-1)) a>> (bitwidth(x)-1)`
+ // as the pattern to splat the lowest bit.
+ // FIXME: iff X is already masked, we don't need the one-use check.
+ Value *X;
+ if (match(Op1, m_SpecificIntAllowUndef(BitWidth - 1)) &&
+ match(Op0, m_OneUse(m_Shl(m_Value(X),
+ m_SpecificIntAllowUndef(BitWidth - 1))))) {
+ Constant *Mask = ConstantInt::get(Ty, 1);
+ // Retain the knowledge about the ignored lanes.
+ Mask = Constant::mergeUndefsWith(
+ Constant::mergeUndefsWith(Mask, cast<Constant>(Op1)),
+ cast<Constant>(cast<Instruction>(Op0)->getOperand(1)));
+ X = Builder.CreateAnd(X, Mask);
+ return BinaryOperator::CreateNeg(X);
+ }
+
if (Instruction *R = foldVariableSignZeroExtensionOfVariableHighBitExtract(I))
return R;
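
The preferred -(x & 1) form added above corresponds to IR like (hand-written example):

  define i32 @splat_lowest_bit(i32 %x) {
    %s = shl i32 %x, 31
    %r = ashr i32 %s, 31
    ret i32 %r
  }

which is expected to become roughly %m = and i32 %x, 1 followed by sub i32 0, %m.
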
@@ -1354,7 +1350,6 @@ Instruction *InstCombinerImpl::visitAShr(BinaryOperator &I) {
return BinaryOperator::CreateLShr(Op0, Op1);
// ashr (xor %x, -1), %y --> xor (ashr %x, %y), -1
- Value *X;
if (match(Op0, m_OneUse(m_Not(m_Value(X))))) {
// Note that we must drop 'exact'-ness of the shift!
// Note that we can't keep undef's in -1 vector constant!
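
For the ashr-of-not fold referenced in this hunk, a minimal example (names are illustrative):

  define i32 @ashr_of_not(i32 %x, i32 %y) {
    %n = xor i32 %x, -1
    %r = ashr i32 %n, %y
    ret i32 %r
  }

is expected to become roughly xor (ashr i32 %x, %y), -1, with any exact flag dropped as the comment warns.
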
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 15b51ae8a5ee..e357a9da8b12 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -55,7 +55,7 @@ static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
bool InstCombinerImpl::SimplifyDemandedInstructionBits(Instruction &Inst) {
unsigned BitWidth = Inst.getType()->getScalarSizeInBits();
KnownBits Known(BitWidth);
- APInt DemandedMask(APInt::getAllOnesValue(BitWidth));
+ APInt DemandedMask(APInt::getAllOnes(BitWidth));
Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, Known,
0, &Inst);
@@ -124,7 +124,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
}
Known.resetAll();
- if (DemandedMask.isNullValue()) // Not demanding any bits from V.
+ if (DemandedMask.isZero()) // Not demanding any bits from V.
return UndefValue::get(VTy);
if (Depth == MaxAnalysisRecursionDepth)
@@ -274,8 +274,8 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// constant because that's a canonical 'not' op, and that is better for
// combining, SCEV, and codegen.
const APInt *C;
- if (match(I->getOperand(1), m_APInt(C)) && !C->isAllOnesValue()) {
- if ((*C | ~DemandedMask).isAllOnesValue()) {
+ if (match(I->getOperand(1), m_APInt(C)) && !C->isAllOnes()) {
+ if ((*C | ~DemandedMask).isAllOnes()) {
// Force bits to 1 to create a 'not' op.
I->setOperand(1, ConstantInt::getAllOnesValue(VTy));
return I;
@@ -385,8 +385,26 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
Known = KnownBits::commonBits(LHSKnown, RHSKnown);
break;
}
- case Instruction::ZExt:
case Instruction::Trunc: {
+ // If we do not demand the high bits of a right-shifted and truncated value,
+ // then we may be able to truncate it before the shift.
+ Value *X;
+ const APInt *C;
+ if (match(I->getOperand(0), m_OneUse(m_LShr(m_Value(X), m_APInt(C))))) {
+ // The shift amount must be valid (not poison) in the narrow type, and
+ // it must not be greater than the number of high bits that are not
+ // demanded in the result.
+ if (C->ult(I->getType()->getScalarSizeInBits()) &&
+ C->ule(DemandedMask.countLeadingZeros())) {
+ // trunc (lshr X, C) --> lshr (trunc X), C
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(I);
+ Value *Trunc = Builder.CreateTrunc(X, I->getType());
+ return Builder.CreateLShr(Trunc, C->getZExtValue());
+ }
+ }
+ }
+ LLVM_FALLTHROUGH;
+ case Instruction::ZExt: {
unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
APInt InputDemandedMask = DemandedMask.zextOrTrunc(SrcBitWidth);
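
A sketch of when the new Trunc handling fires; the trailing and supplies the demanded-bits mask, and the names are made up:

  define i8 @trunc_of_lshr(i32 %x) {
    %s = lshr i32 %x, 2
    %t = trunc i32 %s to i8
    %r = and i8 %t, 15
    ret i8 %r
  }

Only the low 4 bits of the trunc are demanded and the shift amount (2) does not exceed the 4 undemanded high bits, so this is expected to become roughly trunc i32 %x to i8, then lshr i8 by 2, then the same and i8 15.
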
@@ -516,8 +534,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
return I->getOperand(0);
// We can't do this with the LHS for subtraction, unless we are only
// demanding the LSB.
- if ((I->getOpcode() == Instruction::Add ||
- DemandedFromOps.isOneValue()) &&
+ if ((I->getOpcode() == Instruction::Add || DemandedFromOps.isOne()) &&
DemandedFromOps.isSubsetOf(LHSKnown.Zero))
return I->getOperand(1);
@@ -615,7 +632,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// always convert this into a logical shr, even if the shift amount is
// variable. The low bit of the shift cannot be an input sign bit unless
// the shift amount is >= the size of the datatype, which is undefined.
- if (DemandedMask.isOneValue()) {
+ if (DemandedMask.isOne()) {
// Perform the logical shift right.
Instruction *NewVal = BinaryOperator::CreateLShr(
I->getOperand(0), I->getOperand(1), I->getName());
@@ -743,7 +760,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
}
case Instruction::URem: {
KnownBits Known2(BitWidth);
- APInt AllOnes = APInt::getAllOnesValue(BitWidth);
+ APInt AllOnes = APInt::getAllOnes(BitWidth);
if (SimplifyDemandedBits(I, 0, AllOnes, Known2, Depth + 1) ||
SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1))
return I;
@@ -829,6 +846,29 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
KnownBitsComputed = true;
break;
}
+ case Intrinsic::umax: {
+ // UMax(A, C) == A if ...
+ // The lowest non-zero bit of DemandedMask is higher than the highest
+ // non-zero bit of C.
+ const APInt *C;
+ unsigned CTZ = DemandedMask.countTrailingZeros();
+ if (match(II->getArgOperand(1), m_APInt(C)) &&
+ CTZ >= C->getActiveBits())
+ return II->getArgOperand(0);
+ break;
+ }
+ case Intrinsic::umin: {
+ // UMin(A, C) == A if ...
+ // The lowest non-zero bit of DemandedMask is higher than the highest
+ // non-one bit of C.
+ // This comes from applying De Morgan's laws to the umax case above.
+ const APInt *C;
+ unsigned CTZ = DemandedMask.countTrailingZeros();
+ if (match(II->getArgOperand(1), m_APInt(C)) &&
+ CTZ >= C->getBitWidth() - C->countLeadingOnes())
+ return II->getArgOperand(0);
+ break;
+ }
default: {
// Handle target specific intrinsics
Optional<Value *> V = targetSimplifyDemandedUseBitsIntrinsic(
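
For the new umax case, a hand-written illustration (assuming the usual llvm.umax overload):

  declare i32 @llvm.umax.i32(i32, i32)

  define i32 @umax_low_bits_unused(i32 %a) {
    %m = call i32 @llvm.umax.i32(i32 %a, i32 7)
    %r = and i32 %m, -8
    ret i32 %r
  }

Only bits 3 and up are demanded, and the constant 7 has no bits at or above bit 3, so the umax is expected to be replaced by %a, leaving roughly and i32 %a, -8.
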
@@ -1021,8 +1061,8 @@ Value *InstCombinerImpl::simplifyShrShlDemandedBits(
Known.Zero.setLowBits(ShlAmt - 1);
Known.Zero &= DemandedMask;
- APInt BitMask1(APInt::getAllOnesValue(BitWidth));
- APInt BitMask2(APInt::getAllOnesValue(BitWidth));
+ APInt BitMask1(APInt::getAllOnes(BitWidth));
+ APInt BitMask2(APInt::getAllOnes(BitWidth));
bool isLshr = (Shr->getOpcode() == Instruction::LShr);
BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) :
@@ -1088,7 +1128,7 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
return nullptr;
unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();
- APInt EltMask(APInt::getAllOnesValue(VWidth));
+ APInt EltMask(APInt::getAllOnes(VWidth));
assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!");
if (match(V, m_Undef())) {
@@ -1097,7 +1137,7 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
return nullptr;
}
- if (DemandedElts.isNullValue()) { // If nothing is demanded, provide poison.
+ if (DemandedElts.isZero()) { // If nothing is demanded, provide poison.
UndefElts = EltMask;
return PoisonValue::get(V->getType());
}
@@ -1107,7 +1147,7 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
if (auto *C = dyn_cast<Constant>(V)) {
// Check if this is identity. If so, return 0 since we are not simplifying
// anything.
- if (DemandedElts.isAllOnesValue())
+ if (DemandedElts.isAllOnes())
return nullptr;
Type *EltTy = cast<VectorType>(V->getType())->getElementType();
@@ -1260,7 +1300,7 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
// Handle trivial case of a splat. Only check the first element of LHS
// operand.
if (all_of(Shuffle->getShuffleMask(), [](int Elt) { return Elt == 0; }) &&
- DemandedElts.isAllOnesValue()) {
+ DemandedElts.isAllOnes()) {
if (!match(I->getOperand(1), m_Undef())) {
I->setOperand(1, PoisonValue::get(I->getOperand(1)->getType()));
MadeChange = true;
@@ -1515,8 +1555,8 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
// Subtlety: If we load from a pointer, the pointer must be valid
// regardless of whether the element is demanded. Doing otherwise risks
// segfaults which didn't exist in the original program.
- APInt DemandedPtrs(APInt::getAllOnesValue(VWidth)),
- DemandedPassThrough(DemandedElts);
+ APInt DemandedPtrs(APInt::getAllOnes(VWidth)),
+ DemandedPassThrough(DemandedElts);
if (auto *CV = dyn_cast<ConstantVector>(II->getOperand(2)))
for (unsigned i = 0; i < VWidth; i++) {
Constant *CElt = CV->getAggregateElement(i);
@@ -1568,7 +1608,7 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
// If we've proven all of the lanes undef, return an undef value.
// TODO: Intersect w/demanded lanes
- if (UndefElts.isAllOnesValue())
+ if (UndefElts.isAllOnes())
      return UndefValue::get(I->getType());
return MadeChange ? I : nullptr;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 32b15376f898..32e537897140 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -35,37 +35,46 @@
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>
+#define DEBUG_TYPE "instcombine"
+#include "llvm/Transforms/Utils/InstructionWorklist.h"
+
using namespace llvm;
using namespace PatternMatch;
-#define DEBUG_TYPE "instcombine"
-
STATISTIC(NumAggregateReconstructionsSimplified,
"Number of aggregate reconstructions turned into reuse of the "
"original aggregate");
/// Return true if the value is cheaper to scalarize than it is to leave as a
-/// vector operation. IsConstantExtractIndex indicates whether we are extracting
-/// one known element from a vector constant.
+/// vector operation. If the extract index \p EI is a constant integer then
+/// some operations may be cheap to scalarize.
///
/// FIXME: It's possible to create more instructions than previously existed.
-static bool cheapToScalarize(Value *V, bool IsConstantExtractIndex) {
+static bool cheapToScalarize(Value *V, Value *EI) {
+ ConstantInt *CEI = dyn_cast<ConstantInt>(EI);
+
// If we can pick a scalar constant value out of a vector, that is free.
if (auto *C = dyn_cast<Constant>(V))
- return IsConstantExtractIndex || C->getSplatValue();
+ return CEI || C->getSplatValue();
+
+ if (CEI && match(V, m_Intrinsic<Intrinsic::experimental_stepvector>())) {
+ ElementCount EC = cast<VectorType>(V->getType())->getElementCount();
+ // The index needs to be lower than the minimum size of the vector,
+ // because for scalable vectors the actual size is only known at run time.
+ return CEI->getValue().ult(EC.getKnownMinValue());
+ }
// An insertelement to the same constant index as our extract will simplify
// to the scalar inserted element. An insertelement to a different constant
// index is irrelevant to our extract.
if (match(V, m_InsertElt(m_Value(), m_Value(), m_ConstantInt())))
- return IsConstantExtractIndex;
+ return CEI;
if (match(V, m_OneUse(m_Load(m_Value()))))
return true;
@@ -75,14 +84,12 @@ static bool cheapToScalarize(Value *V, bool IsConstantExtractIndex) {
Value *V0, *V1;
if (match(V, m_OneUse(m_BinOp(m_Value(V0), m_Value(V1)))))
- if (cheapToScalarize(V0, IsConstantExtractIndex) ||
- cheapToScalarize(V1, IsConstantExtractIndex))
+ if (cheapToScalarize(V0, EI) || cheapToScalarize(V1, EI))
return true;
CmpInst::Predicate UnusedPred;
if (match(V, m_OneUse(m_Cmp(UnusedPred, m_Value(V0), m_Value(V1)))))
- if (cheapToScalarize(V0, IsConstantExtractIndex) ||
- cheapToScalarize(V1, IsConstantExtractIndex))
+ if (cheapToScalarize(V0, EI) || cheapToScalarize(V1, EI))
return true;
return false;
@@ -119,7 +126,8 @@ Instruction *InstCombinerImpl::scalarizePHI(ExtractElementInst &EI,
// and that it is a binary operation which is cheap to scalarize.
// otherwise return nullptr.
if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) ||
- !(isa<BinaryOperator>(PHIUser)) || !cheapToScalarize(PHIUser, true))
+ !(isa<BinaryOperator>(PHIUser)) ||
+ !cheapToScalarize(PHIUser, EI.getIndexOperand()))
return nullptr;
// Create a scalar PHI node that will replace the vector PHI node
@@ -170,24 +178,46 @@ Instruction *InstCombinerImpl::scalarizePHI(ExtractElementInst &EI,
return &EI;
}
-static Instruction *foldBitcastExtElt(ExtractElementInst &Ext,
- InstCombiner::BuilderTy &Builder,
- bool IsBigEndian) {
+Instruction *InstCombinerImpl::foldBitcastExtElt(ExtractElementInst &Ext) {
Value *X;
uint64_t ExtIndexC;
if (!match(Ext.getVectorOperand(), m_BitCast(m_Value(X))) ||
- !X->getType()->isVectorTy() ||
!match(Ext.getIndexOperand(), m_ConstantInt(ExtIndexC)))
return nullptr;
+ ElementCount NumElts =
+ cast<VectorType>(Ext.getVectorOperandType())->getElementCount();
+ Type *DestTy = Ext.getType();
+ bool IsBigEndian = DL.isBigEndian();
+
+ // If we are casting an integer to a vector and extracting a portion, that is
+ // a shift-right and truncate.
+ // TODO: Allow FP dest type by casting the trunc to FP?
+ if (X->getType()->isIntegerTy() && DestTy->isIntegerTy() &&
+ isDesirableIntType(X->getType()->getPrimitiveSizeInBits())) {
+ assert(isa<FixedVectorType>(Ext.getVectorOperand()->getType()) &&
+ "Expected fixed vector type for bitcast from scalar integer");
+
+ // Big endian requires adjusting the extract index since MSB is at index 0.
+ // LittleEndian: extelt (bitcast i32 X to v4i8), 0 -> trunc i32 X to i8
+ // BigEndian: extelt (bitcast i32 X to v4i8), 0 -> trunc i32 (X >> 24) to i8
+ if (IsBigEndian)
+ ExtIndexC = NumElts.getKnownMinValue() - 1 - ExtIndexC;
+ unsigned ShiftAmountC = ExtIndexC * DestTy->getPrimitiveSizeInBits();
+ if (!ShiftAmountC || Ext.getVectorOperand()->hasOneUse()) {
+ Value *Lshr = Builder.CreateLShr(X, ShiftAmountC, "extelt.offset");
+ return new TruncInst(Lshr, DestTy);
+ }
+ }
+
+ if (!X->getType()->isVectorTy())
+ return nullptr;
+
// If this extractelement is using a bitcast from a vector of the same number
// of elements, see if we can find the source element from the source vector:
// extelt (bitcast VecX), IndexC --> bitcast X[IndexC]
auto *SrcTy = cast<VectorType>(X->getType());
- Type *DestTy = Ext.getType();
ElementCount NumSrcElts = SrcTy->getElementCount();
- ElementCount NumElts =
- cast<VectorType>(Ext.getVectorOperandType())->getElementCount();
if (NumSrcElts == NumElts)
if (Value *Elt = findScalarElement(X, ExtIndexC))
return new BitCastInst(Elt, DestTy);
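
The scalar-integer bitcast case added above can be pictured as (hypothetical function; result depends on target endianness):

  define i8 @extract_second_byte(i32 %x) {
    %v = bitcast i32 %x to <4 x i8>
    %e = extractelement <4 x i8> %v, i64 1
    ret i8 %e
  }

On a little-endian target this is expected to become roughly lshr i32 %x, 8 followed by trunc to i8; on big-endian the index is mirrored, so the shift amount would be 16.
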
@@ -274,7 +304,7 @@ static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) {
unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();
// Conservatively assume that all elements are needed.
- APInt UsedElts(APInt::getAllOnesValue(VWidth));
+ APInt UsedElts(APInt::getAllOnes(VWidth));
switch (UserInstr->getOpcode()) {
case Instruction::ExtractElement: {
@@ -322,11 +352,11 @@ static APInt findDemandedEltsByAllUsers(Value *V) {
if (Instruction *I = dyn_cast<Instruction>(U.getUser())) {
UnionUsedElts |= findDemandedEltsBySingleUser(V, I);
} else {
- UnionUsedElts = APInt::getAllOnesValue(VWidth);
+ UnionUsedElts = APInt::getAllOnes(VWidth);
break;
}
- if (UnionUsedElts.isAllOnesValue())
+ if (UnionUsedElts.isAllOnes())
break;
}
@@ -388,7 +418,7 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
// If the input vector has multiple uses, simplify it based on a union
// of all elements used.
APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec);
- if (!DemandedElts.isAllOnesValue()) {
+ if (!DemandedElts.isAllOnes()) {
APInt UndefElts(NumElts, 0);
if (Value *V = SimplifyDemandedVectorElts(
SrcVec, DemandedElts, UndefElts, 0 /* Depth */,
@@ -402,7 +432,7 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
}
}
- if (Instruction *I = foldBitcastExtElt(EI, Builder, DL.isBigEndian()))
+ if (Instruction *I = foldBitcastExtElt(EI))
return I;
// If there's a vector PHI feeding a scalar use through this extractelement
@@ -415,7 +445,7 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
// TODO come up with a n-ary matcher that subsumes both unary and
// binary matchers.
UnaryOperator *UO;
- if (match(SrcVec, m_UnOp(UO)) && cheapToScalarize(SrcVec, IndexC)) {
+ if (match(SrcVec, m_UnOp(UO)) && cheapToScalarize(SrcVec, Index)) {
// extelt (unop X), Index --> unop (extelt X, Index)
Value *X = UO->getOperand(0);
Value *E = Builder.CreateExtractElement(X, Index);
@@ -423,7 +453,7 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
}
BinaryOperator *BO;
- if (match(SrcVec, m_BinOp(BO)) && cheapToScalarize(SrcVec, IndexC)) {
+ if (match(SrcVec, m_BinOp(BO)) && cheapToScalarize(SrcVec, Index)) {
// extelt (binop X, Y), Index --> binop (extelt X, Index), (extelt Y, Index)
Value *X = BO->getOperand(0), *Y = BO->getOperand(1);
Value *E0 = Builder.CreateExtractElement(X, Index);
@@ -434,7 +464,7 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
Value *X, *Y;
CmpInst::Predicate Pred;
if (match(SrcVec, m_Cmp(Pred, m_Value(X), m_Value(Y))) &&
- cheapToScalarize(SrcVec, IndexC)) {
+ cheapToScalarize(SrcVec, Index)) {
// extelt (cmp X, Y), Index --> cmp (extelt X, Index), (extelt Y, Index)
Value *E0 = Builder.CreateExtractElement(X, Index);
Value *E1 = Builder.CreateExtractElement(Y, Index);
@@ -651,8 +681,7 @@ static void replaceExtractElements(InsertElementInst *InsElt,
if (InsElt->hasOneUse() && isa<InsertElementInst>(InsElt->user_back()))
return;
- auto *WideVec =
- new ShuffleVectorInst(ExtVecOp, PoisonValue::get(ExtVecType), ExtendMask);
+ auto *WideVec = new ShuffleVectorInst(ExtVecOp, ExtendMask);
// Insert the new shuffle after the vector operand of the extract is defined
// (as long as it's not a PHI) or at the start of the basic block of the
@@ -913,7 +942,7 @@ Instruction *InstCombinerImpl::foldAggregateConstructionIntoAggregateReuse(
"We don't store nullptr in SourceAggregate!");
assert((Describe(SourceAggregate) == AggregateDescription::Found) ==
(I.index() != 0) &&
- "SourceAggregate should be valid after the the first element,");
+ "SourceAggregate should be valid after the first element,");
// For this element, is there a plausible source aggregate?
// FIXME: we could special-case undef element, IFF we know that in the
@@ -1179,7 +1208,7 @@ static Instruction *foldInsSequenceIntoSplat(InsertElementInst &InsElt) {
if (!ElementPresent[i])
Mask[i] = -1;
- return new ShuffleVectorInst(FirstIE, PoisonVec, Mask);
+ return new ShuffleVectorInst(FirstIE, Mask);
}
/// Try to fold an insert element into an existing splat shuffle by changing
@@ -1208,15 +1237,15 @@ static Instruction *foldInsEltIntoSplat(InsertElementInst &InsElt) {
// Replace the shuffle mask element at the index of this insert with a zero.
// For example:
- // inselt (shuf (inselt undef, X, 0), undef, <0,undef,0,undef>), X, 1
- // --> shuf (inselt undef, X, 0), undef, <0,0,0,undef>
+ // inselt (shuf (inselt undef, X, 0), _, <0,undef,0,undef>), X, 1
+ // --> shuf (inselt undef, X, 0), poison, <0,0,0,undef>
unsigned NumMaskElts =
cast<FixedVectorType>(Shuf->getType())->getNumElements();
SmallVector<int, 16> NewMask(NumMaskElts);
for (unsigned i = 0; i != NumMaskElts; ++i)
NewMask[i] = i == IdxC ? 0 : Shuf->getMaskValue(i);
- return new ShuffleVectorInst(Op0, UndefValue::get(Op0->getType()), NewMask);
+ return new ShuffleVectorInst(Op0, NewMask);
}
/// Try to fold an extract+insert element into an existing identity shuffle by
@@ -1348,6 +1377,10 @@ static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) {
NewShufElts[I] = ShufConstVec->getAggregateElement(I);
NewMaskElts[I] = Mask[I];
}
+
+ // Bail if we failed to find an element.
+ if (!NewShufElts[I])
+ return nullptr;
}
// Create new operands for a shuffle that includes the constant of the
@@ -1399,6 +1432,41 @@ static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) {
return nullptr;
}
+/// If both the base vector and the inserted element are extended from the same
+/// type, do the insert element in the narrow source type followed by extend.
+/// TODO: This can be extended to include other cast opcodes, but particularly
+/// if we create a wider insertelement, make sure codegen is not harmed.
+static Instruction *narrowInsElt(InsertElementInst &InsElt,
+ InstCombiner::BuilderTy &Builder) {
+ // We are creating a vector extend. If the original vector extend has another
+ // use, that would mean we end up with 2 vector extends, so avoid that.
+ // TODO: We could ease the use-clause to "if at least one op has one use"
+ // (assuming that the source types match - see next TODO comment).
+ Value *Vec = InsElt.getOperand(0);
+ if (!Vec->hasOneUse())
+ return nullptr;
+
+ Value *Scalar = InsElt.getOperand(1);
+ Value *X, *Y;
+ CastInst::CastOps CastOpcode;
+ if (match(Vec, m_FPExt(m_Value(X))) && match(Scalar, m_FPExt(m_Value(Y))))
+ CastOpcode = Instruction::FPExt;
+ else if (match(Vec, m_SExt(m_Value(X))) && match(Scalar, m_SExt(m_Value(Y))))
+ CastOpcode = Instruction::SExt;
+ else if (match(Vec, m_ZExt(m_Value(X))) && match(Scalar, m_ZExt(m_Value(Y))))
+ CastOpcode = Instruction::ZExt;
+ else
+ return nullptr;
+
+ // TODO: We can allow mismatched types by creating an intermediate cast.
+ if (X->getType()->getScalarType() != Y->getType())
+ return nullptr;
+
+ // inselt (ext X), (ext Y), Index --> ext (inselt X, Y, Index)
+ Value *NewInsElt = Builder.CreateInsertElement(X, Y, InsElt.getOperand(2));
+ return CastInst::Create(CastOpcode, NewInsElt, InsElt.getType());
+}
+
Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) {
Value *VecOp = IE.getOperand(0);
Value *ScalarOp = IE.getOperand(1);
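
A minimal sketch of the inselt-of-extends fold introduced by narrowInsElt (illustrative names; zext chosen arbitrarily among the three handled casts):

  define <4 x i32> @narrow_insert(<4 x i16> %v, i16 %s) {
    %ve = zext <4 x i16> %v to <4 x i32>
    %se = zext i16 %s to i32
    %r = insertelement <4 x i32> %ve, i32 %se, i32 2
    ret <4 x i32> %r
  }

is expected to become roughly an insertelement of %s into %v at index 2 followed by a single zext of the narrow vector.
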
@@ -1495,7 +1563,7 @@ Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) {
if (auto VecTy = dyn_cast<FixedVectorType>(VecOp->getType())) {
unsigned VWidth = VecTy->getNumElements();
APInt UndefElts(VWidth, 0);
- APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ APInt AllOnesEltMask(APInt::getAllOnes(VWidth));
if (Value *V = SimplifyDemandedVectorElts(&IE, AllOnesEltMask, UndefElts)) {
if (V != &IE)
return replaceInstUsesWith(IE, V);
@@ -1518,6 +1586,9 @@ Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) {
if (Instruction *IdentityShuf = foldInsEltIntoIdentityShuffle(IE))
return IdentityShuf;
+ if (Instruction *Ext = narrowInsElt(IE, Builder))
+ return Ext;
+
return nullptr;
}
@@ -1924,8 +1995,8 @@ static Instruction *canonicalizeInsertSplat(ShuffleVectorInst &Shuf,
// Splat from element 0. Any mask element that is undefined remains undefined.
// For example:
- // shuf (inselt undef, X, 2), undef, <2,2,undef>
- // --> shuf (inselt undef, X, 0), undef, <0,0,undef>
+ // shuf (inselt undef, X, 2), _, <2,2,undef>
+ // --> shuf (inselt undef, X, 0), poison, <0,0,undef>
unsigned NumMaskElts =
cast<FixedVectorType>(Shuf.getType())->getNumElements();
SmallVector<int, 16> NewMask(NumMaskElts, 0);
@@ -1933,7 +2004,7 @@ static Instruction *canonicalizeInsertSplat(ShuffleVectorInst &Shuf,
if (Mask[i] == UndefMaskElem)
NewMask[i] = Mask[i];
- return new ShuffleVectorInst(NewIns, UndefVec, NewMask);
+ return new ShuffleVectorInst(NewIns, NewMask);
}
/// Try to fold shuffles that are the equivalent of a vector select.
@@ -2197,12 +2268,8 @@ static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf,
SmallVector<int, 16> Mask;
Shuf.getShuffleMask(Mask);
- // The shuffle must not change vector sizes.
- // TODO: This restriction could be removed if the insert has only one use
- // (because the transform would require a new length-changing shuffle).
int NumElts = Mask.size();
- if (NumElts != (int)(cast<FixedVectorType>(V0->getType())->getNumElements()))
- return nullptr;
+ int InpNumElts = cast<FixedVectorType>(V0->getType())->getNumElements();
// This is a specialization of a fold in SimplifyDemandedVectorElts. We may
// not be able to handle it there if the insertelement has >1 use.
@@ -2219,11 +2286,16 @@ static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf,
if (match(V1, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
// Offset the index constant by the vector width because we are checking for
// accesses to the 2nd vector input of the shuffle.
- IdxC += NumElts;
+ IdxC += InpNumElts;
// shuf ?, (inselt X, ?, IdxC), Mask --> shuf ?, X, Mask
if (!is_contained(Mask, (int)IdxC))
return IC.replaceOperand(Shuf, 1, X);
}
+ // For the rest of the transform, the shuffle must not change vector sizes.
+ // TODO: This restriction could be removed if the insert has only one use
+ // (because the transform would require a new length-changing shuffle).
+ if (NumElts != InpNumElts)
+ return nullptr;
// shuffle (insert ?, Scalar, IndexC), V1, Mask --> insert V1, Scalar, IndexC'
auto isShufflingScalarIntoOp1 = [&](Value *&Scalar, ConstantInt *&IndexC) {
@@ -2413,16 +2485,7 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
if (LHS == RHS) {
assert(!match(RHS, m_Undef()) &&
"Shuffle with 2 undef ops not simplified?");
- // Remap any references to RHS to use LHS.
- SmallVector<int, 16> Elts;
- for (unsigned i = 0; i != VWidth; ++i) {
- // Propagate undef elements or force mask to LHS.
- if (Mask[i] < 0)
- Elts.push_back(UndefMaskElem);
- else
- Elts.push_back(Mask[i] % LHSWidth);
- }
- return new ShuffleVectorInst(LHS, UndefValue::get(RHS->getType()), Elts);
+ return new ShuffleVectorInst(LHS, createUnaryMask(Mask, LHSWidth));
}
// shuffle undef, x, mask --> shuffle x, undef, mask'
@@ -2444,7 +2507,7 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
return I;
APInt UndefElts(VWidth, 0);
- APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ APInt AllOnesEltMask(APInt::getAllOnes(VWidth));
if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) {
if (V != &SVI)
return replaceInstUsesWith(SVI, V);
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 4e3b18e805ee..47b6dcb67a78 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -100,7 +100,6 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
@@ -109,11 +108,12 @@
#include <string>
#include <utility>
+#define DEBUG_TYPE "instcombine"
+#include "llvm/Transforms/Utils/InstructionWorklist.h"
+
using namespace llvm;
using namespace llvm::PatternMatch;
-#define DEBUG_TYPE "instcombine"
-
STATISTIC(NumWorklistIterations,
"Number of instruction combining iterations performed");
@@ -202,23 +202,37 @@ Value *InstCombinerImpl::EmitGEPOffset(User *GEP) {
return llvm::EmitGEPOffset(&Builder, DL, GEP);
}
+/// Legal integers and common types are considered desirable. This is used to
+/// avoid creating instructions with types that may not be supported well by the
+/// the backend.
+/// NOTE: This treats i8, i16 and i32 specially because they are common
+/// types in frontend languages.
+bool InstCombinerImpl::isDesirableIntType(unsigned BitWidth) const {
+ switch (BitWidth) {
+ case 8:
+ case 16:
+ case 32:
+ return true;
+ default:
+ return DL.isLegalInteger(BitWidth);
+ }
+}
+
/// Return true if it is desirable to convert an integer computation from a
/// given bit width to a new bit width.
/// We don't want to convert from a legal to an illegal type or from a smaller
-/// to a larger illegal type. A width of '1' is always treated as a legal type
-/// because i1 is a fundamental type in IR, and there are many specialized
-/// optimizations for i1 types. Widths of 8, 16 or 32 are equally treated as
+/// to a larger illegal type. A width of '1' is always treated as a desirable
+/// type because i1 is a fundamental type in IR, and there are many specialized
+/// optimizations for i1 types. Common/desirable widths are equally treated as
/// legal to convert to, in order to open up more combining opportunities.
-/// NOTE: this treats i8, i16 and i32 specially, due to them being so common
-/// from frontend languages.
bool InstCombinerImpl::shouldChangeType(unsigned FromWidth,
unsigned ToWidth) const {
bool FromLegal = FromWidth == 1 || DL.isLegalInteger(FromWidth);
bool ToLegal = ToWidth == 1 || DL.isLegalInteger(ToWidth);
- // Convert to widths of 8, 16 or 32 even if they are not legal types. Only
- // shrink types, to prevent infinite loops.
- if (ToWidth < FromWidth && (ToWidth == 8 || ToWidth == 16 || ToWidth == 32))
+ // Convert to desirable widths even if they are not legal types.
+ // Only shrink types, to prevent infinite loops.
+ if (ToWidth < FromWidth && isDesirableIntType(ToWidth))
return true;
// If this is a legal integer from type, and the result would be an illegal
@@ -359,7 +373,8 @@ Value *InstCombinerImpl::simplifyIntToPtrRoundTripCast(Value *Val) {
PtrToInt->getSrcTy()->getPointerAddressSpace() &&
DL.getPointerTypeSizeInBits(PtrToInt->getSrcTy()) ==
DL.getTypeSizeInBits(PtrToInt->getDestTy())) {
- return Builder.CreateBitCast(PtrToInt->getOperand(0), CastTy);
+ return CastInst::CreateBitOrPointerCast(PtrToInt->getOperand(0), CastTy,
+ "", PtrToInt);
}
}
return nullptr;
@@ -961,14 +976,14 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
assert(canConstantFoldCallTo(II, cast<Function>(II->getCalledOperand())) &&
"Expected constant-foldable intrinsic");
Intrinsic::ID IID = II->getIntrinsicID();
- if (II->getNumArgOperands() == 1)
+ if (II->arg_size() == 1)
return Builder.CreateUnaryIntrinsic(IID, SO);
// This works for real binary ops like min/max (where we always expect the
// constant operand to be canonicalized as op1) and unary ops with a bonus
// constant argument like ctlz/cttz.
// TODO: Handle non-commutative binary intrinsics as below for binops.
- assert(II->getNumArgOperands() == 2 && "Expected binary intrinsic");
+ assert(II->arg_size() == 2 && "Expected binary intrinsic");
assert(isa<Constant>(II->getArgOperand(1)) && "Expected constant operand");
return Builder.CreateBinaryIntrinsic(IID, SO, II->getArgOperand(1));
}
@@ -1058,7 +1073,7 @@ Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op,
// Compare for equality including undefs as equal.
auto *Cmp = ConstantExpr::getCompare(ICmpInst::ICMP_EQ, ConstA, ConstB);
const APInt *C;
- return match(Cmp, m_APIntAllowUndef(C)) && C->isOneValue();
+ return match(Cmp, m_APIntAllowUndef(C)) && C->isOne();
};
if ((areLooselyEqual(TV, Op0) && areLooselyEqual(FV, Op1)) ||
@@ -1120,9 +1135,11 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
BasicBlock *NonConstBB = nullptr;
for (unsigned i = 0; i != NumPHIValues; ++i) {
Value *InVal = PN->getIncomingValue(i);
- // If I is a freeze instruction, count undef as a non-constant.
- if (match(InVal, m_ImmConstant()) &&
- (!isa<FreezeInst>(I) || isGuaranteedNotToBeUndefOrPoison(InVal)))
+ // For a non-freeze instruction, require a constant operand.
+ // For freeze, require a non-undef, non-poison operand.
+ if (!isa<FreezeInst>(I) && match(InVal, m_ImmConstant()))
+ continue;
+ if (isa<FreezeInst>(I) && isGuaranteedNotToBeUndefOrPoison(InVal))
continue;
if (isa<PHINode>(InVal)) return nullptr; // Itself a phi.
@@ -1268,61 +1285,19 @@ Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) {
/// specified offset. If so, fill them into NewIndices and return the resultant
/// element type, otherwise return null.
Type *
-InstCombinerImpl::FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
+InstCombinerImpl::FindElementAtOffset(PointerType *PtrTy, int64_t IntOffset,
SmallVectorImpl<Value *> &NewIndices) {
Type *Ty = PtrTy->getElementType();
if (!Ty->isSized())
return nullptr;
- // Start with the index over the outer type. Note that the type size
- // might be zero (even if the offset isn't zero) if the indexed type
- // is something like [0 x {int, int}]
- Type *IndexTy = DL.getIndexType(PtrTy);
- int64_t FirstIdx = 0;
- if (int64_t TySize = DL.getTypeAllocSize(Ty)) {
- FirstIdx = Offset/TySize;
- Offset -= FirstIdx*TySize;
-
- // Handle hosts where % returns negative instead of values [0..TySize).
- if (Offset < 0) {
- --FirstIdx;
- Offset += TySize;
- assert(Offset >= 0);
- }
- assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset");
- }
-
- NewIndices.push_back(ConstantInt::get(IndexTy, FirstIdx));
-
- // Index into the types. If we fail, set OrigBase to null.
- while (Offset) {
- // Indexing into tail padding between struct/array elements.
- if (uint64_t(Offset * 8) >= DL.getTypeSizeInBits(Ty))
- return nullptr;
-
- if (StructType *STy = dyn_cast<StructType>(Ty)) {
- const StructLayout *SL = DL.getStructLayout(STy);
- assert(Offset < (int64_t)SL->getSizeInBytes() &&
- "Offset must stay within the indexed type");
-
- unsigned Elt = SL->getElementContainingOffset(Offset);
- NewIndices.push_back(ConstantInt::get(Type::getInt32Ty(Ty->getContext()),
- Elt));
-
- Offset -= SL->getElementOffset(Elt);
- Ty = STy->getElementType(Elt);
- } else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
- uint64_t EltSize = DL.getTypeAllocSize(AT->getElementType());
- assert(EltSize && "Cannot index into a zero-sized array");
- NewIndices.push_back(ConstantInt::get(IndexTy,Offset/EltSize));
- Offset %= EltSize;
- Ty = AT->getElementType();
- } else {
- // Otherwise, we can't index into the middle of this atomic type, bail.
- return nullptr;
- }
- }
+ APInt Offset(DL.getIndexTypeSizeInBits(PtrTy), IntOffset);
+ SmallVector<APInt> Indices = DL.getGEPIndicesForOffset(Ty, Offset);
+ if (!Offset.isZero())
+ return nullptr;
+ for (const APInt &Index : Indices)
+ NewIndices.push_back(Builder.getInt(Index));
return Ty;
}
@@ -1623,7 +1598,7 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {
Value *XY = Builder.CreateBinOp(Opcode, X, Y);
if (auto *BO = dyn_cast<BinaryOperator>(XY))
BO->copyIRFlags(&Inst);
- return new ShuffleVectorInst(XY, UndefValue::get(XY->getType()), M);
+ return new ShuffleVectorInst(XY, M);
};
// If both arguments of the binary operation are shuffles that use the same
@@ -1754,25 +1729,20 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {
Value *X;
ArrayRef<int> MaskC;
int SplatIndex;
- BinaryOperator *BO;
+ Value *Y, *OtherOp;
if (!match(LHS,
m_OneUse(m_Shuffle(m_Value(X), m_Undef(), m_Mask(MaskC)))) ||
!match(MaskC, m_SplatOrUndefMask(SplatIndex)) ||
- X->getType() != Inst.getType() || !match(RHS, m_OneUse(m_BinOp(BO))) ||
- BO->getOpcode() != Opcode)
+ X->getType() != Inst.getType() ||
+ !match(RHS, m_OneUse(m_BinOp(Opcode, m_Value(Y), m_Value(OtherOp)))))
return nullptr;
// FIXME: This may not be safe if the analysis allows undef elements. By
// moving 'Y' before the splat shuffle, we are implicitly assuming
// that it is not undef/poison at the splat index.
- Value *Y, *OtherOp;
- if (isSplatValue(BO->getOperand(0), SplatIndex)) {
- Y = BO->getOperand(0);
- OtherOp = BO->getOperand(1);
- } else if (isSplatValue(BO->getOperand(1), SplatIndex)) {
- Y = BO->getOperand(1);
- OtherOp = BO->getOperand(0);
- } else {
+ if (isSplatValue(OtherOp, SplatIndex)) {
+ std::swap(Y, OtherOp);
+ } else if (!isSplatValue(Y, SplatIndex)) {
return nullptr;
}
@@ -1788,7 +1758,7 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {
// dropped to be safe.
if (isa<FPMathOperator>(R)) {
R->copyFastMathFlags(&Inst);
- R->andIRFlags(BO);
+ R->andIRFlags(RHS);
}
if (auto *NewInstBO = dyn_cast<BinaryOperator>(NewBO))
NewInstBO->copyIRFlags(R);
@@ -1896,7 +1866,8 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
Type *GEPType = GEP.getType();
Type *GEPEltType = GEP.getSourceElementType();
bool IsGEPSrcEleScalable = isa<ScalableVectorType>(GEPEltType);
- if (Value *V = SimplifyGEPInst(GEPEltType, Ops, SQ.getWithInstruction(&GEP)))
+ if (Value *V = SimplifyGEPInst(GEPEltType, Ops, GEP.isInBounds(),
+ SQ.getWithInstruction(&GEP)))
return replaceInstUsesWith(GEP, V);
// For vector geps, use the generic demanded vector support.
@@ -1905,7 +1876,7 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (auto *GEPFVTy = dyn_cast<FixedVectorType>(GEPType)) {
auto VWidth = GEPFVTy->getNumElements();
APInt UndefElts(VWidth, 0);
- APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ APInt AllOnesEltMask(APInt::getAllOnes(VWidth));
if (Value *V = SimplifyDemandedVectorElts(&GEP, AllOnesEltMask,
UndefElts)) {
if (V != &GEP)
@@ -2117,10 +2088,12 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// -- have to recreate %src & %gep
// put NewSrc at same location as %src
Builder.SetInsertPoint(cast<Instruction>(PtrOp));
- auto *NewSrc = cast<GetElementPtrInst>(
- Builder.CreateGEP(GEPEltType, SO0, GO1, Src->getName()));
- NewSrc->setIsInBounds(Src->isInBounds());
- auto *NewGEP =
+ Value *NewSrc =
+ Builder.CreateGEP(GEPEltType, SO0, GO1, Src->getName());
+ // Propagate 'inbounds' if the new source was not constant-folded.
+ if (auto *NewSrcGEPI = dyn_cast<GetElementPtrInst>(NewSrc))
+ NewSrcGEPI->setIsInBounds(Src->isInBounds());
+ GetElementPtrInst *NewGEP =
GetElementPtrInst::Create(GEPEltType, NewSrc, {SO1});
NewGEP->setIsInBounds(GEP.isInBounds());
return NewGEP;
@@ -2128,18 +2101,6 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
}
}
}
-
- // Fold (gep(gep(Ptr,Idx0),Idx1) -> gep(Ptr,add(Idx0,Idx1))
- if (GO1->getType() == SO1->getType()) {
- bool NewInBounds = GEP.isInBounds() && Src->isInBounds();
- auto *NewIdx =
- Builder.CreateAdd(GO1, SO1, GEP.getName() + ".idx",
- /*HasNUW*/ false, /*HasNSW*/ NewInBounds);
- auto *NewGEP = GetElementPtrInst::Create(
- GEPEltType, Src->getPointerOperand(), {NewIdx});
- NewGEP->setIsInBounds(NewInBounds);
- return NewGEP;
- }
}
// Note that if our source is a gep chain itself then we wait for that
@@ -2647,6 +2608,13 @@ static bool isAllocSiteRemovable(Instruction *AI,
Users.emplace_back(I);
continue;
}
+
+ if (isReallocLikeFn(I, TLI, true)) {
+ Users.emplace_back(I);
+ Worklist.push_back(I);
+ continue;
+ }
+
return false;
case Instruction::Store: {
@@ -2834,15 +2802,33 @@ static Instruction *tryToMoveFreeBeforeNullTest(CallInst &FI,
// At this point, we know that everything in FreeInstrBB can be moved
// before TI.
- for (BasicBlock::iterator It = FreeInstrBB->begin(), End = FreeInstrBB->end();
- It != End;) {
- Instruction &Instr = *It++;
+ for (Instruction &Instr : llvm::make_early_inc_range(*FreeInstrBB)) {
if (&Instr == FreeInstrBBTerminator)
break;
Instr.moveBefore(TI);
}
assert(FreeInstrBB->size() == 1 &&
"Only the branch instruction should remain");
+
+ // Now that we've moved the call to free before the NULL check, we have to
+ // remove any attributes on its parameter that imply it's non-null, because
+ // those attributes might have only been valid because of the NULL check, and
+ // we can get miscompiles if we keep them. This is conservative if non-null is
+ // also implied by something other than the NULL check, but it's guaranteed to
+ // be correct, and the conservativeness won't matter in practice, since the
+ // attributes are irrelevant for the call to free itself and the pointer
+ // shouldn't be used after the call.
+ AttributeList Attrs = FI.getAttributes();
+ Attrs = Attrs.removeParamAttribute(FI.getContext(), 0, Attribute::NonNull);
+ Attribute Dereferenceable = Attrs.getParamAttr(0, Attribute::Dereferenceable);
+ if (Dereferenceable.isValid()) {
+ uint64_t Bytes = Dereferenceable.getDereferenceableBytes();
+ Attrs = Attrs.removeParamAttribute(FI.getContext(), 0,
+ Attribute::Dereferenceable);
+ Attrs = Attrs.addDereferenceableOrNullParamAttr(FI.getContext(), 0, Bytes);
+ }
+ FI.setAttributes(Attrs);
+
return &FI;
}
@@ -2861,6 +2847,15 @@ Instruction *InstCombinerImpl::visitFree(CallInst &FI) {
if (isa<ConstantPointerNull>(Op))
return eraseInstFromFunction(FI);
+ // If we had free(realloc(...)) with no intervening uses, then eliminate the
+ // realloc() entirely.
+ if (CallInst *CI = dyn_cast<CallInst>(Op)) {
+ if (CI->hasOneUse() && isReallocLikeFn(CI, &TLI, true)) {
+ return eraseInstFromFunction(
+ *replaceInstUsesWith(*CI, CI->getOperand(0)));
+ }
+ }
+
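A source-level illustration of the new fold (hypothetical function, not from the patch): when the realloc result is only ever passed to free, the pair reduces to freeing the original pointer.

#include <cstdlib>

void free_after_realloc(void *P, std::size_t N) {
  void *Tmp = std::realloc(P, N); // single use of the realloc result...
  std::free(Tmp);                 // ...so the pair folds to std::free(P)
}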
// If we optimize for code size, try to move the call to free before the null
// test so that simplify cfg can remove the empty block and dead code
// elimination the branch. I.e., helps to turn something like:
@@ -2947,7 +2942,7 @@ Instruction *InstCombinerImpl::visitUnconditionalBranchInst(BranchInst &BI) {
auto GetLastSinkableStore = [](BasicBlock::iterator BBI) {
auto IsNoopInstrForStoreMerging = [](BasicBlock::iterator BBI) {
- return isa<DbgInfoIntrinsic>(BBI) ||
+ return BBI->isDebugOrPseudoInst() ||
(isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy());
};
@@ -3138,26 +3133,21 @@ Instruction *InstCombinerImpl::visitExtractValueInst(ExtractValueInst &EV) {
// checking for overflow.
const APInt *C;
if (match(WO->getRHS(), m_APInt(C))) {
- // Compute the no-wrap range [X,Y) for LHS given RHS=C, then
- // check for the inverted range using range offset trick (i.e.
- // use a subtract to shift the range to bottom of either the
- // signed or unsigned domain and then use a single compare to
- // check range membership).
+ // Compute the no-wrap range for LHS given RHS=C, then construct an
+ // equivalent icmp, potentially using an offset.
ConstantRange NWR =
ConstantRange::makeExactNoWrapRegion(WO->getBinaryOp(), *C,
WO->getNoWrapKind());
- APInt Min = WO->isSigned() ? NWR.getSignedMin() : NWR.getUnsignedMin();
- NWR = NWR.subtract(Min);
CmpInst::Predicate Pred;
- APInt NewRHSC;
- if (NWR.getEquivalentICmp(Pred, NewRHSC)) {
- auto *OpTy = WO->getRHS()->getType();
- auto *NewLHS = Builder.CreateSub(WO->getLHS(),
- ConstantInt::get(OpTy, Min));
- return new ICmpInst(ICmpInst::getInversePredicate(Pred), NewLHS,
- ConstantInt::get(OpTy, NewRHSC));
- }
+ APInt NewRHSC, Offset;
+ NWR.getEquivalentICmp(Pred, NewRHSC, Offset);
+ auto *OpTy = WO->getRHS()->getType();
+ auto *NewLHS = WO->getLHS();
+ if (Offset != 0)
+ NewLHS = Builder.CreateAdd(NewLHS, ConstantInt::get(OpTy, Offset));
+ return new ICmpInst(ICmpInst::getInversePredicate(Pred), NewLHS,
+ ConstantInt::get(OpTy, NewRHSC));
}
}
}
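A standalone sanity check of the equivalence used above (illustrative only; it assumes the Clang/GCC __builtin_add_overflow builtin): for an unsigned i8 add with the constant 200, the no-wrap range of the variable operand is [0, 56), so the extracted overflow bit is simply the inverted compare X uge 56, with no offset needed.

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned X = 0; X < 256; ++X) {
    uint8_t Sum;
    bool Overflow = __builtin_add_overflow(uint8_t(X), uint8_t(200), &Sum);
    assert(Overflow == (X >= 56)); // overflow <=> X lies outside [0, 56)
  }
  return 0;
}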
@@ -3183,9 +3173,7 @@ Instruction *InstCombinerImpl::visitExtractValueInst(ExtractValueInst &EV) {
Instruction *NL = Builder.CreateLoad(EV.getType(), GEP);
// Whatever aliasing information we had for the original load must also
// hold for the smaller load, so propagate the annotations.
- AAMDNodes Nodes;
- L->getAAMetadata(Nodes);
- NL->setAAMetadata(Nodes);
+ NL->setAAMetadata(L->getAAMetadata());
// Returning the load directly will cause the main loop to insert it in
// the wrong spot, so use replaceInstUsesWith().
return replaceInstUsesWith(EV, NL);
@@ -3568,8 +3556,14 @@ InstCombinerImpl::pushFreezeToPreventPoisonFromPropagating(FreezeInst &OrigFI) {
// While we could change the other users of OrigOp to use freeze(OrigOp), that
// potentially reduces their optimization potential, so let's only do this iff
// the OrigOp is only used by the freeze.
- if (!OrigOpInst || !OrigOpInst->hasOneUse() || isa<PHINode>(OrigOp) ||
- canCreateUndefOrPoison(dyn_cast<Operator>(OrigOp)))
+ if (!OrigOpInst || !OrigOpInst->hasOneUse() || isa<PHINode>(OrigOp))
+ return nullptr;
+
+ // We can't push the freeze through an instruction which can itself create
+ // poison. If the only source of new poison is flags, we can simply
+ // strip them (since we know the only use is the freeze and nothing can
+ // benefit from them).
+ if (canCreateUndefOrPoison(cast<Operator>(OrigOp), /*ConsiderFlags*/ false))
return nullptr;
// If operand is guaranteed not to be poison, there is no need to add freeze
@@ -3585,6 +3579,8 @@ InstCombinerImpl::pushFreezeToPreventPoisonFromPropagating(FreezeInst &OrigFI) {
return nullptr;
}
+ OrigOpInst->dropPoisonGeneratingFlags();
+
// If all operands are guaranteed to be non-poison, we can drop freeze.
if (!MaybePoisonOperand)
return OrigOp;
@@ -3668,7 +3664,7 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) {
/// instruction past all of the instructions between it and the end of its
/// block.
static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
- assert(I->getSingleUndroppableUse() && "Invariants didn't hold!");
+ assert(I->getUniqueUndroppableUser() && "Invariants didn't hold!");
BasicBlock *SrcBlock = I->getParent();
// Cannot move control-flow-involving, volatile loads, vaarg, etc.
@@ -3822,51 +3818,71 @@ bool InstCombinerImpl::run() {
// See if we can trivially sink this instruction to its user if we can
// prove that the successor is not executed more frequently than our block.
- if (EnableCodeSinking)
- if (Use *SingleUse = I->getSingleUndroppableUse()) {
- BasicBlock *BB = I->getParent();
- Instruction *UserInst = cast<Instruction>(SingleUse->getUser());
- BasicBlock *UserParent;
-
- // Get the block the use occurs in.
- if (PHINode *PN = dyn_cast<PHINode>(UserInst))
- UserParent = PN->getIncomingBlock(*SingleUse);
- else
- UserParent = UserInst->getParent();
-
- // Try sinking to another block. If that block is unreachable, then do
- // not bother. SimplifyCFG should handle it.
- if (UserParent != BB && DT.isReachableFromEntry(UserParent)) {
- // See if the user is one of our successors that has only one
- // predecessor, so that we don't have to split the critical edge.
- bool ShouldSink = UserParent->getUniquePredecessor() == BB;
- // Another option where we can sink is a block that ends with a
- // terminator that does not pass control to other block (such as
- // return or unreachable). In this case:
- // - I dominates the User (by SSA form);
- // - the User will be executed at most once.
- // So sinking I down to User is always profitable or neutral.
- if (!ShouldSink) {
- auto *Term = UserParent->getTerminator();
- ShouldSink = isa<ReturnInst>(Term) || isa<UnreachableInst>(Term);
- }
- if (ShouldSink) {
- assert(DT.dominates(BB, UserParent) &&
- "Dominance relation broken?");
- // Okay, the CFG is simple enough, try to sink this instruction.
- if (TryToSinkInstruction(I, UserParent)) {
- LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
- MadeIRChange = true;
- // We'll add uses of the sunk instruction below, but since sinking
- // can expose opportunities for it's *operands* add them to the
- // worklist
- for (Use &U : I->operands())
- if (Instruction *OpI = dyn_cast<Instruction>(U.get()))
- Worklist.push(OpI);
- }
+ // Return the UserBlock if successful.
+ auto getOptionalSinkBlockForInst =
+ [this](Instruction *I) -> Optional<BasicBlock *> {
+ if (!EnableCodeSinking)
+ return None;
+ auto *UserInst = cast_or_null<Instruction>(I->getUniqueUndroppableUser());
+ if (!UserInst)
+ return None;
+
+ BasicBlock *BB = I->getParent();
+ BasicBlock *UserParent = nullptr;
+
+ // Special handling for Phi nodes - get the block the use occurs in.
+ if (PHINode *PN = dyn_cast<PHINode>(UserInst)) {
+ for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
+ if (PN->getIncomingValue(i) == I) {
+ // Bail out if we have uses in different blocks. We don't do any
+ // sophisticated analysis (i.e., finding NearestCommonDominator of these
+ // use blocks).
+ if (UserParent && UserParent != PN->getIncomingBlock(i))
+ return None;
+ UserParent = PN->getIncomingBlock(i);
}
}
+ assert(UserParent && "expected to find user block!");
+ } else
+ UserParent = UserInst->getParent();
+
+ // Try sinking to another block. If that block is unreachable, then do
+ // not bother. SimplifyCFG should handle it.
+ if (UserParent == BB || !DT.isReachableFromEntry(UserParent))
+ return None;
+
+ auto *Term = UserParent->getTerminator();
+ // See if the user is one of our successors that has only one
+ // predecessor, so that we don't have to split the critical edge.
+ // Another option where we can sink is a block that ends with a
+ // terminator that does not pass control to other block (such as
+ // return or unreachable). In this case:
+ // - I dominates the User (by SSA form);
+ // - the User will be executed at most once.
+ // So sinking I down to User is always profitable or neutral.
+ if (UserParent->getUniquePredecessor() == BB ||
+ (isa<ReturnInst>(Term) || isa<UnreachableInst>(Term))) {
+ assert(DT.dominates(BB, UserParent) && "Dominance relation broken?");
+ return UserParent;
}
+ return None;
+ };
+
+ auto OptBB = getOptionalSinkBlockForInst(I);
+ if (OptBB) {
+ auto *UserParent = *OptBB;
+ // Okay, the CFG is simple enough, try to sink this instruction.
+ if (TryToSinkInstruction(I, UserParent)) {
+ LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
+ MadeIRChange = true;
+ // We'll add uses of the sunk instruction below, but since
+ // sinking can expose opportunities for its *operands*, add
+ // them to the worklist
+ for (Use &U : I->operands())
+ if (Instruction *OpI = dyn_cast<Instruction>(U.get()))
+ Worklist.push(OpI);
+ }
+ }
// Now that we have an instruction, try combining it to simplify it.
Builder.SetInsertPoint(I);
@@ -3994,13 +4010,13 @@ public:
/// whose condition is a known constant, we only visit the reachable successors.
static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
const TargetLibraryInfo *TLI,
- InstCombineWorklist &ICWorklist) {
+ InstructionWorklist &ICWorklist) {
bool MadeIRChange = false;
SmallPtrSet<BasicBlock *, 32> Visited;
SmallVector<BasicBlock*, 256> Worklist;
Worklist.push_back(&F.front());
- SmallVector<Instruction*, 128> InstrsForInstCombineWorklist;
+ SmallVector<Instruction *, 128> InstrsForInstructionWorklist;
DenseMap<Constant *, Constant *> FoldedConstants;
AliasScopeTracker SeenAliasScopes;
@@ -4011,25 +4027,23 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
if (!Visited.insert(BB).second)
continue;
- for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
- Instruction *Inst = &*BBI++;
-
+ for (Instruction &Inst : llvm::make_early_inc_range(*BB)) {
// ConstantProp instruction if trivially constant.
- if (!Inst->use_empty() &&
- (Inst->getNumOperands() == 0 || isa<Constant>(Inst->getOperand(0))))
- if (Constant *C = ConstantFoldInstruction(Inst, DL, TLI)) {
- LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *Inst
+ if (!Inst.use_empty() &&
+ (Inst.getNumOperands() == 0 || isa<Constant>(Inst.getOperand(0))))
+ if (Constant *C = ConstantFoldInstruction(&Inst, DL, TLI)) {
+ LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << Inst
<< '\n');
- Inst->replaceAllUsesWith(C);
+ Inst.replaceAllUsesWith(C);
++NumConstProp;
- if (isInstructionTriviallyDead(Inst, TLI))
- Inst->eraseFromParent();
+ if (isInstructionTriviallyDead(&Inst, TLI))
+ Inst.eraseFromParent();
MadeIRChange = true;
continue;
}
// See if we can constant fold its operands.
- for (Use &U : Inst->operands()) {
+ for (Use &U : Inst.operands()) {
if (!isa<ConstantVector>(U) && !isa<ConstantExpr>(U))
continue;
@@ -4039,7 +4053,7 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
FoldRes = ConstantFoldConstant(C, DL, TLI);
if (FoldRes != C) {
- LLVM_DEBUG(dbgs() << "IC: ConstFold operand of: " << *Inst
+ LLVM_DEBUG(dbgs() << "IC: ConstFold operand of: " << Inst
<< "\n Old = " << *C
<< "\n New = " << *FoldRes << '\n');
U = FoldRes;
@@ -4050,9 +4064,9 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
// Skip processing debug and pseudo intrinsics in InstCombine. Processing
// these call instructions consumes non-trivial amount of time and
// provides no value for the optimization.
- if (!Inst->isDebugOrPseudoInst()) {
- InstrsForInstCombineWorklist.push_back(Inst);
- SeenAliasScopes.analyse(Inst);
+ if (!Inst.isDebugOrPseudoInst()) {
+ InstrsForInstructionWorklist.push_back(&Inst);
+ SeenAliasScopes.analyse(&Inst);
}
}
@@ -4097,8 +4111,8 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
// of the function down. This jives well with the way that it adds all uses
// of instructions to the worklist after doing a transformation, thus avoiding
// some N^2 behavior in pathological cases.
- ICWorklist.reserve(InstrsForInstCombineWorklist.size());
- for (Instruction *Inst : reverse(InstrsForInstCombineWorklist)) {
+ ICWorklist.reserve(InstrsForInstructionWorklist.size());
+ for (Instruction *Inst : reverse(InstrsForInstructionWorklist)) {
// DCE instruction if trivially dead. As we iterate in reverse program
// order here, we will clean up whole chains of dead instructions.
if (isInstructionTriviallyDead(Inst, TLI) ||
@@ -4118,7 +4132,7 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
}
static bool combineInstructionsOverFunction(
- Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA,
+ Function &F, InstructionWorklist &Worklist, AliasAnalysis *AA,
AssumptionCache &AC, TargetLibraryInfo &TLI, TargetTransformInfo &TTI,
DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, unsigned MaxIterations, LoopInfo *LI) {
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 0d4ca0bcecfb..b56329ad76ae 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -26,6 +26,7 @@
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/StackSafetyAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/BinaryFormat/MachO.h"
@@ -47,6 +48,7 @@
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
@@ -176,7 +178,15 @@ const char kAMDGPUAddressPrivateName[] = "llvm.amdgcn.is.private";
// Accesses sizes are powers of two: 1, 2, 4, 8, 16.
static const size_t kNumberOfAccessSizes = 5;
-static const unsigned kAllocaRzSize = 32;
+static const uint64_t kAllocaRzSize = 32;
+
+// ASanAccessInfo implementation constants.
+constexpr size_t kCompileKernelShift = 0;
+constexpr size_t kCompileKernelMask = 0x1;
+constexpr size_t kAccessSizeIndexShift = 1;
+constexpr size_t kAccessSizeIndexMask = 0xf;
+constexpr size_t kIsWriteShift = 5;
+constexpr size_t kIsWriteMask = 0x1;
// Command-line flags.
@@ -203,6 +213,11 @@ static cl::opt<bool> ClInstrumentWrites(
"asan-instrument-writes", cl::desc("instrument write instructions"),
cl::Hidden, cl::init(true));
+static cl::opt<bool>
+ ClUseStackSafety("asan-use-stack-safety", cl::Hidden, cl::init(false),
+ cl::desc("Use Stack Safety analysis results"),
+ cl::Optional);
+
static cl::opt<bool> ClInstrumentAtomics(
"asan-instrument-atomics",
cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
@@ -348,6 +363,10 @@ static cl::opt<uint64_t>
static cl::opt<bool> ClOpt("asan-opt", cl::desc("Optimize instrumentation"),
cl::Hidden, cl::init(true));
+static cl::opt<bool> ClOptimizeCallbacks("asan-optimize-callbacks",
+ cl::desc("Optimize callbacks"),
+ cl::Hidden, cl::init(false));
+
static cl::opt<bool> ClOptSameTemp(
"asan-opt-same-temp", cl::desc("Instrument the same temp just once"),
cl::Hidden, cl::init(true));
@@ -442,7 +461,7 @@ struct ShadowMapping {
} // end anonymous namespace
-static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
+static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize,
bool IsKasan) {
bool IsAndroid = TargetTriple.isAndroid();
bool IsIOS = TargetTriple.isiOS() || TargetTriple.isWatchOS();
@@ -559,6 +578,32 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
return Mapping;
}
+namespace llvm {
+void getAddressSanitizerParams(const Triple &TargetTriple, int LongSize,
+ bool IsKasan, uint64_t *ShadowBase,
+ int *MappingScale, bool *OrShadowOffset) {
+ auto Mapping = getShadowMapping(TargetTriple, LongSize, IsKasan);
+ *ShadowBase = Mapping.Offset;
+ *MappingScale = Mapping.Scale;
+ *OrShadowOffset = Mapping.OrShadowOffset;
+}
+
+ASanAccessInfo::ASanAccessInfo(int32_t Packed)
+ : Packed(Packed),
+ AccessSizeIndex((Packed >> kAccessSizeIndexShift) & kAccessSizeIndexMask),
+ IsWrite((Packed >> kIsWriteShift) & kIsWriteMask),
+ CompileKernel((Packed >> kCompileKernelShift) & kCompileKernelMask) {}
+
+ASanAccessInfo::ASanAccessInfo(bool IsWrite, bool CompileKernel,
+ uint8_t AccessSizeIndex)
+ : Packed((IsWrite << kIsWriteShift) +
+ (CompileKernel << kCompileKernelShift) +
+ (AccessSizeIndex << kAccessSizeIndexShift)),
+ AccessSizeIndex(AccessSizeIndex), IsWrite(IsWrite),
+ CompileKernel(CompileKernel) {}
+
+} // namespace llvm
+
static uint64_t getRedzoneSizeForScale(int MappingScale) {
// Redzone used for stack and globals is at least 32 bytes.
// For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively.
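A standalone sketch of the packing scheme defined by the shift/mask constants introduced above (the constants are copied from this patch; the surrounding example is hypothetical): a 4-byte write in userspace code has AccessSizeIndex 2 and packs to 36.

#include <cassert>
#include <cstddef>

int main() {
  constexpr std::size_t kCompileKernelShift = 0;
  constexpr std::size_t kAccessSizeIndexShift = 1;
  constexpr std::size_t kIsWriteShift = 5;

  const bool IsWrite = true;          // a store
  const bool CompileKernel = false;   // userspace ASan, not KASan
  const unsigned AccessSizeIndex = 2; // log2 of the 4-byte access size

  const int Packed = (IsWrite << kIsWriteShift) +
                     (CompileKernel << kCompileKernelShift) +
                     (AccessSizeIndex << kAccessSizeIndexShift);
  assert(Packed == 36); // (1 << 5) + (0 << 0) + (2 << 1)
  return 0;
}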
@@ -609,6 +654,7 @@ char ASanGlobalsMetadataWrapperPass::ID = 0;
/// AddressSanitizer: instrument the code in module to find memory bugs.
struct AddressSanitizer {
AddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD,
+ const StackSafetyGlobalInfo *SSGI,
bool CompileKernel = false, bool Recover = false,
bool UseAfterScope = false,
AsanDetectStackUseAfterReturnMode UseAfterReturn =
@@ -619,10 +665,12 @@ struct AddressSanitizer {
UseAfterScope(UseAfterScope || ClUseAfterScope),
UseAfterReturn(ClUseAfterReturn.getNumOccurrences() ? ClUseAfterReturn
: UseAfterReturn),
- GlobalsMD(*GlobalsMD) {
+ GlobalsMD(*GlobalsMD), SSGI(SSGI) {
C = &(M.getContext());
LongSize = M.getDataLayout().getPointerSizeInBits();
IntptrTy = Type::getIntNTy(*C, LongSize);
+ Int8PtrTy = Type::getInt8PtrTy(*C);
+ Int32Ty = Type::getInt32Ty(*C);
TargetTriple = Triple(M.getTargetTriple());
Mapping = getShadowMapping(TargetTriple, LongSize, this->CompileKernel);
@@ -646,7 +694,7 @@ struct AddressSanitizer {
/// Check if we want (and can) handle this alloca.
bool isInterestingAlloca(const AllocaInst &AI);
- bool ignoreAccess(Value *Ptr);
+ bool ignoreAccess(Instruction *Inst, Value *Ptr);
void getInterestingMemoryOperands(
Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting);
@@ -713,6 +761,8 @@ private:
bool UseAfterScope;
AsanDetectStackUseAfterReturnMode UseAfterReturn;
Type *IntptrTy;
+ Type *Int8PtrTy;
+ Type *Int32Ty;
ShadowMapping Mapping;
FunctionCallee AsanHandleNoReturnFunc;
FunctionCallee AsanPtrCmpFunction, AsanPtrSubFunction;
@@ -729,6 +779,7 @@ private:
FunctionCallee AsanMemmove, AsanMemcpy, AsanMemset;
Value *LocalDynamicShadow = nullptr;
const GlobalsMetadata &GlobalsMD;
+ const StackSafetyGlobalInfo *SSGI;
DenseMap<const AllocaInst *, bool> ProcessedAllocas;
FunctionCallee AMDGPUAddressShared;
@@ -755,16 +806,22 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<ASanGlobalsMetadataWrapperPass>();
+ if (ClUseStackSafety)
+ AU.addRequired<StackSafetyGlobalInfoWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
}
bool runOnFunction(Function &F) override {
GlobalsMetadata &GlobalsMD =
getAnalysis<ASanGlobalsMetadataWrapperPass>().getGlobalsMD();
+ const StackSafetyGlobalInfo *const SSGI =
+ ClUseStackSafety
+ ? &getAnalysis<StackSafetyGlobalInfoWrapperPass>().getResult()
+ : nullptr;
const TargetLibraryInfo *TLI =
&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- AddressSanitizer ASan(*F.getParent(), &GlobalsMD, CompileKernel, Recover,
- UseAfterScope, UseAfterReturn);
+ AddressSanitizer ASan(*F.getParent(), &GlobalsMD, SSGI, CompileKernel,
+ Recover, UseAfterScope, UseAfterReturn);
return ASan.instrumentFunction(F, TLI);
}
@@ -1212,20 +1269,15 @@ GlobalsMetadata ASanGlobalsMetadataAnalysis::run(Module &M,
return GlobalsMetadata(M);
}
-AddressSanitizerPass::AddressSanitizerPass(
- bool CompileKernel, bool Recover, bool UseAfterScope,
- AsanDetectStackUseAfterReturnMode UseAfterReturn)
- : CompileKernel(CompileKernel), Recover(Recover),
- UseAfterScope(UseAfterScope), UseAfterReturn(UseAfterReturn) {}
-
PreservedAnalyses AddressSanitizerPass::run(Function &F,
AnalysisManager<Function> &AM) {
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
Module &M = *F.getParent();
if (auto *R = MAMProxy.getCachedResult<ASanGlobalsMetadataAnalysis>(M)) {
const TargetLibraryInfo *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
- AddressSanitizer Sanitizer(M, R, CompileKernel, Recover, UseAfterScope,
- UseAfterReturn);
+ AddressSanitizer Sanitizer(M, R, nullptr, Options.CompileKernel,
+ Options.Recover, Options.UseAfterScope,
+ Options.UseAfterReturn);
if (Sanitizer.instrumentFunction(F, TLI))
return PreservedAnalyses::none();
return PreservedAnalyses::all();
@@ -1237,21 +1289,51 @@ PreservedAnalyses AddressSanitizerPass::run(Function &F,
return PreservedAnalyses::all();
}
+void AddressSanitizerPass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<AddressSanitizerPass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+ OS << "<";
+ if (Options.CompileKernel)
+ OS << "kernel";
+ OS << ">";
+}
+
+void ModuleAddressSanitizerPass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<ModuleAddressSanitizerPass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+ OS << "<";
+ if (Options.CompileKernel)
+ OS << "kernel";
+ OS << ">";
+}
+
ModuleAddressSanitizerPass::ModuleAddressSanitizerPass(
- bool CompileKernel, bool Recover, bool UseGlobalGC, bool UseOdrIndicator,
- AsanDtorKind DestructorKind)
- : CompileKernel(CompileKernel), Recover(Recover), UseGlobalGC(UseGlobalGC),
+ const AddressSanitizerOptions &Options, bool UseGlobalGC,
+ bool UseOdrIndicator, AsanDtorKind DestructorKind)
+ : Options(Options), UseGlobalGC(UseGlobalGC),
UseOdrIndicator(UseOdrIndicator), DestructorKind(DestructorKind) {}
PreservedAnalyses ModuleAddressSanitizerPass::run(Module &M,
- AnalysisManager<Module> &AM) {
- GlobalsMetadata &GlobalsMD = AM.getResult<ASanGlobalsMetadataAnalysis>(M);
- ModuleAddressSanitizer Sanitizer(M, &GlobalsMD, CompileKernel, Recover,
- UseGlobalGC, UseOdrIndicator,
- DestructorKind);
- if (Sanitizer.instrumentModule(M))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
+ ModuleAnalysisManager &MAM) {
+ GlobalsMetadata &GlobalsMD = MAM.getResult<ASanGlobalsMetadataAnalysis>(M);
+ ModuleAddressSanitizer ModuleSanitizer(M, &GlobalsMD, Options.CompileKernel,
+ Options.Recover, UseGlobalGC,
+ UseOdrIndicator, DestructorKind);
+ bool Modified = false;
+ auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ const StackSafetyGlobalInfo *const SSGI =
+ ClUseStackSafety ? &MAM.getResult<StackSafetyGlobalAnalysis>(M) : nullptr;
+ for (Function &F : M) {
+ AddressSanitizer FunctionSanitizer(
+ M, &GlobalsMD, SSGI, Options.CompileKernel, Options.Recover,
+ Options.UseAfterScope, Options.UseAfterReturn);
+ const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+ Modified |= FunctionSanitizer.instrumentFunction(F, &TLI);
+ }
+ Modified |= ModuleSanitizer.instrumentModule(M);
+ return Modified ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
INITIALIZE_PASS(ASanGlobalsMetadataWrapperPass, "asan-globals-md",
@@ -1266,6 +1348,7 @@ INITIALIZE_PASS_BEGIN(
"AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
false)
INITIALIZE_PASS_DEPENDENCY(ASanGlobalsMetadataWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(StackSafetyGlobalInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(
AddressSanitizerLegacyPass, "asan",
@@ -1404,7 +1487,7 @@ bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
return IsInteresting;
}
-bool AddressSanitizer::ignoreAccess(Value *Ptr) {
+bool AddressSanitizer::ignoreAccess(Instruction *Inst, Value *Ptr) {
// Instrument accesses from different address spaces only for AMDGPU.
Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType());
if (PtrTy->getPointerAddressSpace() != 0 &&
@@ -1425,6 +1508,10 @@ bool AddressSanitizer::ignoreAccess(Value *Ptr) {
if (ClSkipPromotableAllocas && !isInterestingAlloca(*AI))
return true;
+ if (SSGI != nullptr && SSGI->stackAccessIsSafe(*Inst) &&
+ findAllocaForValue(Ptr))
+ return true;
+
return false;
}
@@ -1439,22 +1526,22 @@ void AddressSanitizer::getInterestingMemoryOperands(
return;
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- if (!ClInstrumentReads || ignoreAccess(LI->getPointerOperand()))
+ if (!ClInstrumentReads || ignoreAccess(LI, LI->getPointerOperand()))
return;
Interesting.emplace_back(I, LI->getPointerOperandIndex(), false,
LI->getType(), LI->getAlign());
} else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- if (!ClInstrumentWrites || ignoreAccess(SI->getPointerOperand()))
+ if (!ClInstrumentWrites || ignoreAccess(SI, SI->getPointerOperand()))
return;
Interesting.emplace_back(I, SI->getPointerOperandIndex(), true,
SI->getValueOperand()->getType(), SI->getAlign());
} else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
- if (!ClInstrumentAtomics || ignoreAccess(RMW->getPointerOperand()))
+ if (!ClInstrumentAtomics || ignoreAccess(RMW, RMW->getPointerOperand()))
return;
Interesting.emplace_back(I, RMW->getPointerOperandIndex(), true,
RMW->getValOperand()->getType(), None);
} else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
- if (!ClInstrumentAtomics || ignoreAccess(XCHG->getPointerOperand()))
+ if (!ClInstrumentAtomics || ignoreAccess(XCHG, XCHG->getPointerOperand()))
return;
Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true,
XCHG->getCompareOperand()->getType(), None);
@@ -1469,7 +1556,7 @@ void AddressSanitizer::getInterestingMemoryOperands(
return;
auto BasePtr = CI->getOperand(OpOffset);
- if (ignoreAccess(BasePtr))
+ if (ignoreAccess(CI, BasePtr))
return;
auto Ty = cast<PointerType>(BasePtr->getType())->getElementType();
MaybeAlign Alignment = Align(1);
@@ -1479,9 +1566,9 @@ void AddressSanitizer::getInterestingMemoryOperands(
Value *Mask = CI->getOperand(2 + OpOffset);
Interesting.emplace_back(I, OpOffset, IsWrite, Ty, Alignment, Mask);
} else {
- for (unsigned ArgNo = 0; ArgNo < CI->getNumArgOperands(); ArgNo++) {
+ for (unsigned ArgNo = 0; ArgNo < CI->arg_size(); ArgNo++) {
if (!ClInstrumentByval || !CI->isByValArgument(ArgNo) ||
- ignoreAccess(CI->getArgOperand(ArgNo)))
+ ignoreAccess(CI, CI->getArgOperand(ArgNo)))
continue;
Type *Ty = CI->getParamByValType(ArgNo);
Interesting.emplace_back(I, ArgNo, false, Ty, Align(1));
@@ -1738,9 +1825,20 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
}
IRBuilder<> IRB(InsertBefore);
- Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
+ const ASanAccessInfo AccessInfo(IsWrite, CompileKernel, AccessSizeIndex);
+
+ if (UseCalls && ClOptimizeCallbacks) {
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ IRB.CreateCall(
+ Intrinsic::getDeclaration(M, Intrinsic::asan_check_memaccess),
+ {IRB.CreatePointerCast(Addr, Int8PtrTy),
+ ConstantInt::get(Int32Ty, AccessInfo.Packed)});
+ return;
+ }
+ Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
if (UseCalls) {
if (Exp == 0)
IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][0][AccessSizeIndex],
@@ -1936,7 +2034,8 @@ bool ModuleAddressSanitizer::shouldInstrumentGlobal(GlobalVariable *G) const {
// Globals from llvm.metadata aren't emitted, do not instrument them.
if (Section == "llvm.metadata") return false;
// Do not instrument globals from special LLVM sections.
- if (Section.find("__llvm") != StringRef::npos || Section.find("__LLVM") != StringRef::npos) return false;
+ if (Section.contains("__llvm") || Section.contains("__LLVM"))
+ return false;
// Do not instrument function pointers to initialization and termination
// routines: dynamic linker will not properly handle redzones.
@@ -2133,8 +2232,7 @@ Instruction *ModuleAddressSanitizer::CreateAsanModuleDtor(Module &M) {
AsanDtorFunction = Function::createWithDefaultAttr(
FunctionType::get(Type::getVoidTy(*C), false),
GlobalValue::InternalLinkage, 0, kAsanModuleDtorName, &M);
- AsanDtorFunction->addAttribute(AttributeList::FunctionIndex,
- Attribute::NoUnwind);
+ AsanDtorFunction->addFnAttr(Attribute::NoUnwind);
// Ensure Dtor cannot be discarded, even if in a comdat.
appendToUsed(M, {AsanDtorFunction});
BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction);
@@ -2753,7 +2851,7 @@ void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
if (II && II->getIntrinsicID() == Intrinsic::localescape) {
// We found a call. Mark all the allocas passed in as uninteresting.
- for (Value *Arg : II->arg_operands()) {
+ for (Value *Arg : II->args()) {
AllocaInst *AI = dyn_cast<AllocaInst>(Arg->stripPointerCasts());
assert(AI && AI->isStaticAlloca() &&
"non-static alloca arg to localescape");
@@ -2774,6 +2872,8 @@ bool AddressSanitizer::suppressInstrumentationSiteForDebug(int &Instrumented) {
bool AddressSanitizer::instrumentFunction(Function &F,
const TargetLibraryInfo *TLI) {
+ if (F.empty())
+ return false;
if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
if (!ClDebugFunc.empty() && ClDebugFunc == F.getName()) return false;
if (F.getName().startswith("__asan_")) return false;
@@ -2916,7 +3016,8 @@ bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) {
if (LongSize != 32) return false;
CallInst *CI = dyn_cast<CallInst>(I);
if (!CI || !CI->isInlineAsm()) return false;
- if (CI->getNumArgOperands() <= 5) return false;
+ if (CI->arg_size() <= 5)
+ return false;
// We have inline assembly with quite a few arguments.
return true;
}
@@ -3112,7 +3213,7 @@ Value *FunctionStackPoisoner::createAllocaForLayout(
assert(Alloca->isStaticAlloca());
}
assert((ClRealignStack & (ClRealignStack - 1)) == 0);
- size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack);
+ uint64_t FrameAlignment =
+     std::max(L.FrameAlignment, uint64_t(ClRealignStack));
Alloca->setAlignment(Align(FrameAlignment));
return IRB.CreatePointerCast(Alloca, IntptrTy);
}
@@ -3256,8 +3357,8 @@ void FunctionStackPoisoner::processStaticAllocas() {
// Minimal header size (left redzone) is 4 pointers,
// i.e. 32 bytes on 64-bit platforms and 16 bytes in 32-bit platforms.
- size_t Granularity = 1ULL << Mapping.Scale;
- size_t MinHeaderSize = std::max((size_t)ASan.LongSize / 2, Granularity);
+ uint64_t Granularity = 1ULL << Mapping.Scale;
+ uint64_t MinHeaderSize = std::max((uint64_t)ASan.LongSize / 2, Granularity);
const ASanStackFrameLayout &L =
ComputeASanStackFrameLayout(SVD, Granularity, MinHeaderSize);
@@ -3511,7 +3612,7 @@ void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) {
IRBuilder<> IRB(AI);
- const unsigned Alignment = std::max(kAllocaRzSize, AI->getAlignment());
+ const uint64_t Alignment = std::max(kAllocaRzSize, AI->getAlignment());
const uint64_t AllocaRedzoneMask = kAllocaRzSize - 1;
Value *Zero = Constant::getNullValue(IntptrTy);
diff --git a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp
index 9acd82c005e6..1a7f7a365ce4 100644
--- a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp
+++ b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp
@@ -53,6 +53,8 @@ static bool runCGProfilePass(
InstrProfSymtab Symtab;
auto UpdateCounts = [&](TargetTransformInfo &TTI, Function *F,
Function *CalledF, uint64_t NewCount) {
+ if (NewCount == 0)
+ return;
if (!CalledF || !TTI.isLoweredToCall(CalledF) ||
CalledF->hasDLLImportStorageClass())
return;
diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
index 3b4d80dc8023..497aac30c3f6 100644
--- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -1553,11 +1553,11 @@ static bool negateICmpIfUsedByBranchOrSelectOnly(ICmpInst *ICmp,
SI->swapValues();
SI->swapProfMetadata();
if (Scope->TrueBiasedSelects.count(SI)) {
- assert(Scope->FalseBiasedSelects.count(SI) == 0 &&
+ assert(!Scope->FalseBiasedSelects.contains(SI) &&
"Must not be already in");
Scope->FalseBiasedSelects.insert(SI);
} else if (Scope->FalseBiasedSelects.count(SI)) {
- assert(Scope->TrueBiasedSelects.count(SI) == 0 &&
+ assert(!Scope->TrueBiasedSelects.contains(SI) &&
"Must not be already in");
Scope->TrueBiasedSelects.insert(SI);
}
@@ -1592,7 +1592,7 @@ static void insertTrivialPHIs(CHRScope *Scope,
SmallVector<Instruction *, 8> Users;
for (User *U : I.users()) {
if (auto *UI = dyn_cast<Instruction>(U)) {
- if (BlocksInScope.count(UI->getParent()) == 0 &&
+ if (!BlocksInScope.contains(UI->getParent()) &&
// Unless there's already a phi for I at the exit block.
!(isa<PHINode>(UI) && UI->getParent() == ExitBlock)) {
CHR_DEBUG(dbgs() << "V " << I << "\n");
@@ -1752,7 +1752,7 @@ void CHR::transformScopes(CHRScope *Scope, DenseSet<PHINode *> &TrivialPHIs) {
// Create the combined branch condition and constant-fold the branches/selects
// in the hot path.
fixupBranchesAndSelects(Scope, PreEntryBlock, MergedBr,
- ProfileCount ? ProfileCount.getValue() : 0);
+ ProfileCount.getValueOr(0));
}
// A helper for transformScopes. Clone the blocks in the scope (excluding the
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 63aa84e4a77c..38c219ce3465 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -18,6 +18,9 @@
/// The analysis is based on automatic propagation of data flow labels (also
/// known as taint labels) through a program as it performs computation.
///
+/// Argument and return value labels are passed through TLS variables
+/// __dfsan_arg_tls and __dfsan_retval_tls.
+///
/// Each byte of application memory is backed by a shadow memory byte. The
/// shadow byte can represent up to 8 labels. On Linux/x86_64, memory is then
/// laid out as follows:
@@ -144,20 +147,22 @@ static cl::opt<bool> ClPreserveAlignment(
// to the "native" (i.e. unsanitized) ABI. Unless the ABI list contains
// additional annotations for those functions, a call to one of those functions
// will produce a warning message, as the labelling behaviour of the function is
-// unknown. The other supported annotations are "functional" and "discard",
-// which are described below under DataFlowSanitizer::WrapperKind.
+// unknown. The other supported annotations for uninstrumented functions are
+// "functional" and "discard", which are described below under
+// DataFlowSanitizer::WrapperKind.
+// Functions will often be labelled with both "uninstrumented" and one of
+// "functional" or "discard". This will leave the function unchanged by this
+// pass, and create a wrapper function that will call the original.
+//
+// Instrumented functions can also be annotated as "force_zero_labels", which
+// makes the pass use zero labels for all of their shadow and return values.
+// Functions should never be labelled with both "force_zero_labels" and
+// "uninstrumented" or any of the uninstrumented wrapper kinds.
static cl::list<std::string> ClABIListFiles(
"dfsan-abilist",
cl::desc("File listing native ABI functions and how the pass treats them"),
cl::Hidden);
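A hypothetical ABI list file for -dfsan-abilist, illustrating the annotations described in the comment above (the function names are made up; the category keywords are the ones this pass recognizes):

# getenv is uninstrumented and its result labels are discarded
fun:getenv=uninstrumented
fun:getenv=discard
# crc32 computes its result purely from its arguments, so unioning the
# argument labels ("functional") is an accurate model
fun:crc32=uninstrumented
fun:crc32=functional
# instrument my_hash, but force its shadow and return labels to zero
fun:my_hash=force_zero_labels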
-// Controls whether the pass uses IA_Args or IA_TLS as the ABI for instrumented
-// functions (see DataFlowSanitizer::InstrumentedABI below).
-static cl::opt<bool>
- ClArgsABI("dfsan-args-abi",
- cl::desc("Use the argument ABI rather than the TLS ABI"),
- cl::Hidden);
-
// Controls whether the pass includes or ignores the labels of pointers in load
// instructions.
static cl::opt<bool> ClCombinePointerLabelsOnLoad(
@@ -349,18 +354,18 @@ transformFunctionAttributes(const TransformedFunction &TransformedFunction,
for (unsigned I = 0, IE = TransformedFunction.ArgumentIndexMapping.size();
I < IE; ++I) {
unsigned TransformedIndex = TransformedFunction.ArgumentIndexMapping[I];
- ArgumentAttributes[TransformedIndex] = CallSiteAttrs.getParamAttributes(I);
+ ArgumentAttributes[TransformedIndex] = CallSiteAttrs.getParamAttrs(I);
}
// Copy annotations on varargs arguments.
for (unsigned I = TransformedFunction.OriginalType->getNumParams(),
IE = CallSiteAttrs.getNumAttrSets();
I < IE; ++I) {
- ArgumentAttributes.push_back(CallSiteAttrs.getParamAttributes(I));
+ ArgumentAttributes.push_back(CallSiteAttrs.getParamAttrs(I));
}
- return AttributeList::get(Ctx, CallSiteAttrs.getFnAttributes(),
- CallSiteAttrs.getRetAttributes(),
+ return AttributeList::get(Ctx, CallSiteAttrs.getFnAttrs(),
+ CallSiteAttrs.getRetAttrs(),
llvm::makeArrayRef(ArgumentAttributes));
}
@@ -372,17 +377,6 @@ class DataFlowSanitizer {
enum { OriginWidthBits = 32, OriginWidthBytes = OriginWidthBits / 8 };
- /// Which ABI should be used for instrumented functions?
- enum InstrumentedABI {
- /// Argument and return value labels are passed through additional
- /// arguments and by modifying the return type.
- IA_Args,
-
- /// Argument and return value labels are passed through TLS variables
- /// __dfsan_arg_tls and __dfsan_retval_tls.
- IA_TLS
- };
-
/// How should calls to uninstrumented functions be handled?
enum WrapperKind {
/// This function is present in an uninstrumented form but we don't know
@@ -400,9 +394,7 @@ class DataFlowSanitizer {
/// Instead of calling the function, a custom wrapper __dfsw_F is called,
/// where F is the name of the function. This function may wrap the
- /// original function or provide its own implementation. This is similar to
- /// the IA_Args ABI, except that IA_Args uses a struct return type to
- /// pass the return value shadow in a register, while WK_Custom uses an
+ /// original function or provide its own implementation. WK_Custom uses an
/// extra pointer argument to return the shadow. This allows the wrapped
/// form of the function type to be expressed in C.
WK_Custom
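For reference, a hypothetical WK_Custom wrapper declaration in the shape this comment describes (assuming the dfsan_label type from compiler-rt's <sanitizer/dfsan_interface.h>; the function name is made up):

#include <sanitizer/dfsan_interface.h>

// Original function:  int parse(const char *Buf);
// Its custom wrapper takes one label per argument plus an extra pointer
// through which it returns the shadow of the return value.
extern "C" int __dfsw_parse(const char *Buf, dfsan_label BufLabel,
                            dfsan_label *RetLabel);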
@@ -469,10 +461,9 @@ class DataFlowSanitizer {
getShadowOriginAddress(Value *Addr, Align InstAlignment, Instruction *Pos);
bool isInstrumented(const Function *F);
bool isInstrumented(const GlobalAlias *GA);
- FunctionType *getArgsFunctionType(FunctionType *T);
+ bool isForceZeroLabels(const Function *F);
FunctionType *getTrampolineFunctionType(FunctionType *T);
TransformedFunction getCustomFunctionType(FunctionType *T);
- InstrumentedABI getInstrumentedABI();
WrapperKind getWrapperKind(Function *F);
void addGlobalNameSuffix(GlobalValue *GV);
Function *buildWrapperFunction(Function *F, StringRef NewFName,
@@ -496,18 +487,11 @@ class DataFlowSanitizer {
/// Returns whether the pass tracks origins. Supports only TLS ABI mode.
bool shouldTrackOrigins();
- /// Returns whether the pass tracks labels for struct fields and array
- /// indices. Supports only TLS ABI mode.
- bool shouldTrackFieldsAndIndices();
-
/// Returns a zero constant with the shadow type of OrigTy.
///
/// getZeroShadow({T1,T2,...}) = {getZeroShadow(T1),getZeroShadow(T2,...}
/// getZeroShadow([n x T]) = [n x getZeroShadow(T)]
/// getZeroShadow(other type) = i16(0)
- ///
- /// Note that a zero shadow is always i16(0) when shouldTrackFieldsAndIndices
- /// returns false.
Constant *getZeroShadow(Type *OrigTy);
/// Returns a zero constant with the shadow type of V's type.
Constant *getZeroShadow(Value *V);
@@ -520,9 +504,6 @@ class DataFlowSanitizer {
/// getShadowTy({T1,T2,...}) = {getShadowTy(T1),getShadowTy(T2),...}
/// getShadowTy([n x T]) = [n x getShadowTy(T)]
/// getShadowTy(other type) = i16
- ///
- /// Note that a shadow type is always i16 when shouldTrackFieldsAndIndices
- /// returns false.
Type *getShadowTy(Type *OrigTy);
/// Returns the shadow type of of V's type.
Type *getShadowTy(Value *V);
@@ -539,8 +520,8 @@ struct DFSanFunction {
DataFlowSanitizer &DFS;
Function *F;
DominatorTree DT;
- DataFlowSanitizer::InstrumentedABI IA;
bool IsNativeABI;
+ bool IsForceZeroLabels;
AllocaInst *LabelReturnAlloca = nullptr;
AllocaInst *OriginReturnAlloca = nullptr;
DenseMap<Value *, Value *> ValShadowMap;
@@ -571,8 +552,10 @@ struct DFSanFunction {
DenseMap<Value *, Value *> CachedCollapsedShadows;
DenseMap<Value *, std::set<Value *>> ShadowElements;
- DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI)
- : DFS(DFS), F(F), IA(DFS.getInstrumentedABI()), IsNativeABI(IsNativeABI) {
+ DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI,
+ bool IsForceZeroLabels)
+ : DFS(DFS), F(F), IsNativeABI(IsNativeABI),
+ IsForceZeroLabels(IsForceZeroLabels) {
DT.recalculate(*F);
}
@@ -787,17 +770,6 @@ DataFlowSanitizer::DataFlowSanitizer(
SpecialCaseList::createOrDie(AllABIListFiles, *vfs::getRealFileSystem()));
}
-FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) {
- SmallVector<Type *, 4> ArgTypes(T->param_begin(), T->param_end());
- ArgTypes.append(T->getNumParams(), PrimitiveShadowTy);
- if (T->isVarArg())
- ArgTypes.push_back(PrimitiveShadowPtrTy);
- Type *RetType = T->getReturnType();
- if (!RetType->isVoidTy())
- RetType = StructType::get(RetType, PrimitiveShadowTy);
- return FunctionType::get(RetType, ArgTypes, T->isVarArg());
-}
-
FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) {
assert(!T->isVarArg());
SmallVector<Type *, 4> ArgTypes;
@@ -861,9 +833,6 @@ TransformedFunction DataFlowSanitizer::getCustomFunctionType(FunctionType *T) {
}
bool DataFlowSanitizer::isZeroShadow(Value *V) {
- if (!shouldTrackFieldsAndIndices())
- return ZeroPrimitiveShadow == V;
-
Type *T = V->getType();
if (!isa<ArrayType>(T) && !isa<StructType>(T)) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
@@ -880,19 +849,11 @@ bool DataFlowSanitizer::hasLoadSizeForFastPath(uint64_t Size) {
}
bool DataFlowSanitizer::shouldTrackOrigins() {
- static const bool ShouldTrackOrigins =
- ClTrackOrigins && getInstrumentedABI() == DataFlowSanitizer::IA_TLS;
+ static const bool ShouldTrackOrigins = ClTrackOrigins;
return ShouldTrackOrigins;
}
-bool DataFlowSanitizer::shouldTrackFieldsAndIndices() {
- return getInstrumentedABI() == DataFlowSanitizer::IA_TLS;
-}
-
Constant *DataFlowSanitizer::getZeroShadow(Type *OrigTy) {
- if (!shouldTrackFieldsAndIndices())
- return ZeroPrimitiveShadow;
-
if (!isa<ArrayType>(OrigTy) && !isa<StructType>(OrigTy))
return ZeroPrimitiveShadow;
Type *ShadowTy = getShadowTy(OrigTy);
@@ -992,8 +953,6 @@ Value *DFSanFunction::collapseToPrimitiveShadow(Value *Shadow,
if (!isa<ArrayType>(ShadowTy) && !isa<StructType>(ShadowTy))
return Shadow;
- assert(DFS.shouldTrackFieldsAndIndices());
-
// Checks if the cached collapsed shadow value dominates Pos.
Value *&CS = CachedCollapsedShadows[Shadow];
if (CS && DT.dominates(CS, Pos))
@@ -1007,9 +966,6 @@ Value *DFSanFunction::collapseToPrimitiveShadow(Value *Shadow,
}
Type *DataFlowSanitizer::getShadowTy(Type *OrigTy) {
- if (!shouldTrackFieldsAndIndices())
- return PrimitiveShadowTy;
-
if (!OrigTy->isSized())
return PrimitiveShadowTy;
if (isa<IntegerType>(OrigTy))
@@ -1107,8 +1063,8 @@ bool DataFlowSanitizer::isInstrumented(const GlobalAlias *GA) {
return !ABIList.isIn(*GA, "uninstrumented");
}
-DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() {
- return ClArgsABI ? IA_Args : IA_TLS;
+bool DataFlowSanitizer::isForceZeroLabels(const Function *F) {
+ return ABIList.isIn(*F, "force_zero_labels");
}
DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) {
@@ -1139,7 +1095,7 @@ void DataFlowSanitizer::addGlobalNameSuffix(GlobalValue *GV) {
Pos = Asm.find("@");
if (Pos == std::string::npos)
- report_fatal_error("unsupported .symver: " + Asm);
+ report_fatal_error(Twine("unsupported .symver: ", Asm));
Asm.replace(Pos, 1, Suffix + "@");
GV->getParent()->setModuleInlineAsm(Asm);
@@ -1154,14 +1110,12 @@ DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName,
Function *NewF = Function::Create(NewFT, NewFLink, F->getAddressSpace(),
NewFName, F->getParent());
NewF->copyAttributesFrom(F);
- NewF->removeAttributes(
- AttributeList::ReturnIndex,
+ NewF->removeRetAttrs(
AttributeFuncs::typeIncompatible(NewFT->getReturnType()));
BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF);
if (F->isVarArg()) {
- NewF->removeAttributes(AttributeList::FunctionIndex,
- AttrBuilder().addAttribute("split-stack"));
+ NewF->removeFnAttrs(AttrBuilder().addAttribute("split-stack"));
CallInst::Create(DFSanVarargWrapperFn,
IRBuilder<>(BB).CreateGlobalStringPtr(F->getName()), "",
BB);
@@ -1199,7 +1153,8 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
// F is called by a wrapped custom function with primitive shadows. So
// its arguments and return value need conversion.
- DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true);
+ DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true,
+ /*ForceZeroLabels=*/false);
Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI;
++ValAI;
for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N) {
@@ -1238,23 +1193,17 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) {
{
AttributeList AL;
- AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::NoUnwind);
- AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::ReadOnly);
- AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
- Attribute::ZExt);
+ AL = AL.addFnAttribute(M.getContext(), Attribute::NoUnwind);
+ AL = AL.addFnAttribute(M.getContext(), Attribute::ReadOnly);
+ AL = AL.addRetAttribute(M.getContext(), Attribute::ZExt);
DFSanUnionLoadFn =
Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy, AL);
}
{
AttributeList AL;
- AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::NoUnwind);
- AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::ReadOnly);
- AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
- Attribute::ZExt);
+ AL = AL.addFnAttribute(M.getContext(), Attribute::NoUnwind);
+ AL = AL.addFnAttribute(M.getContext(), Attribute::ReadOnly);
+ AL = AL.addRetAttribute(M.getContext(), Attribute::ZExt);
DFSanLoadLabelAndOriginFn = Mod->getOrInsertFunction(
"__dfsan_load_label_and_origin", DFSanLoadLabelAndOriginFnTy, AL);
}
@@ -1274,8 +1223,7 @@ void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) {
{
AttributeList AL;
AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt);
- AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
- Attribute::ZExt);
+ AL = AL.addRetAttribute(M.getContext(), Attribute::ZExt);
DFSanChainOriginFn = Mod->getOrInsertFunction("__dfsan_chain_origin",
DFSanChainOriginFnTy, AL);
}
@@ -1283,8 +1231,7 @@ void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) {
AttributeList AL;
AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt);
AL = AL.addParamAttribute(M.getContext(), 1, Attribute::ZExt);
- AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
- Attribute::ZExt);
+ AL = AL.addRetAttribute(M.getContext(), Attribute::ZExt);
DFSanChainOriginIfTaintedFn = Mod->getOrInsertFunction(
"__dfsan_chain_origin_if_tainted", DFSanChainOriginIfTaintedFnTy, AL);
}
@@ -1409,34 +1356,32 @@ bool DataFlowSanitizer::runImpl(Module &M) {
std::vector<Function *> FnsToInstrument;
SmallPtrSet<Function *, 2> FnsWithNativeABI;
+ SmallPtrSet<Function *, 2> FnsWithForceZeroLabel;
for (Function &F : M)
if (!F.isIntrinsic() && !DFSanRuntimeFunctions.contains(&F))
FnsToInstrument.push_back(&F);
// Give function aliases prefixes when necessary, and build wrappers where the
// instrumentedness is inconsistent.
- for (Module::alias_iterator AI = M.alias_begin(), AE = M.alias_end();
- AI != AE;) {
- GlobalAlias *GA = &*AI;
- ++AI;
+ for (GlobalAlias &GA : llvm::make_early_inc_range(M.aliases())) {
// Don't stop on weak. We assume people aren't playing games with the
// instrumentedness of overridden weak aliases.
- auto *F = dyn_cast<Function>(GA->getBaseObject());
+ auto *F = dyn_cast<Function>(GA.getAliaseeObject());
if (!F)
continue;
- bool GAInst = isInstrumented(GA), FInst = isInstrumented(F);
+ bool GAInst = isInstrumented(&GA), FInst = isInstrumented(F);
if (GAInst && FInst) {
- addGlobalNameSuffix(GA);
+ addGlobalNameSuffix(&GA);
} else if (GAInst != FInst) {
// Non-instrumented alias of an instrumented function, or vice versa.
// Replace the alias with a native-ABI wrapper of the aliasee. The pass
// below will take care of instrumenting it.
Function *NewF =
- buildWrapperFunction(F, "", GA->getLinkage(), F->getFunctionType());
- GA->replaceAllUsesWith(ConstantExpr::getBitCast(NewF, GA->getType()));
- NewF->takeName(GA);
- GA->eraseFromParent();
+ buildWrapperFunction(F, "", GA.getLinkage(), F->getFunctionType());
+ GA.replaceAllUsesWith(ConstantExpr::getBitCast(NewF, GA.getType()));
+ NewF->takeName(&GA);
+ GA.eraseFromParent();
FnsToInstrument.push_back(NewF);
}
}
@@ -1456,50 +1401,17 @@ bool DataFlowSanitizer::runImpl(Module &M) {
FT->getReturnType()->isVoidTy());
if (isInstrumented(&F)) {
+ if (isForceZeroLabels(&F))
+ FnsWithForceZeroLabel.insert(&F);
+
// Instrumented functions get a '.dfsan' suffix. This allows us to more
// easily identify cases of mismatching ABIs. This naming scheme is
// mangling-compatible (see Itanium ABI), using a vendor-specific suffix.
- if (getInstrumentedABI() == IA_Args && !IsZeroArgsVoidRet) {
- FunctionType *NewFT = getArgsFunctionType(FT);
- Function *NewF = Function::Create(NewFT, F.getLinkage(),
- F.getAddressSpace(), "", &M);
- NewF->copyAttributesFrom(&F);
- NewF->removeAttributes(
- AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewFT->getReturnType()));
- for (Function::arg_iterator FArg = F.arg_begin(),
- NewFArg = NewF->arg_begin(),
- FArgEnd = F.arg_end();
- FArg != FArgEnd; ++FArg, ++NewFArg) {
- FArg->replaceAllUsesWith(&*NewFArg);
- }
- NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList());
-
- for (Function::user_iterator UI = F.user_begin(), UE = F.user_end();
- UI != UE;) {
- BlockAddress *BA = dyn_cast<BlockAddress>(*UI);
- ++UI;
- if (BA) {
- BA->replaceAllUsesWith(
- BlockAddress::get(NewF, BA->getBasicBlock()));
- delete BA;
- }
- }
- F.replaceAllUsesWith(
- ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT)));
- NewF->takeName(&F);
- F.eraseFromParent();
- *FI = NewF;
- addGlobalNameSuffix(NewF);
- } else {
- addGlobalNameSuffix(&F);
- }
+ addGlobalNameSuffix(&F);
} else if (!IsZeroArgsVoidRet || getWrapperKind(&F) == WK_Custom) {
// Build a wrapper function for F. The wrapper simply calls F, and is
// added to FnsToInstrument so that any instrumentation according to its
// WrapperKind is done in the second pass below.
- FunctionType *NewFT =
- getInstrumentedABI() == IA_Args ? getArgsFunctionType(FT) : FT;
// If the function being wrapped has local linkage, then preserve the
// function's linkage in the wrapper function.
@@ -1511,9 +1423,8 @@ bool DataFlowSanitizer::runImpl(Module &M) {
&F,
(shouldTrackOrigins() ? std::string("dfso$") : std::string("dfsw$")) +
std::string(F.getName()),
- WrapperLinkage, NewFT);
- if (getInstrumentedABI() == IA_TLS)
- NewF->removeAttributes(AttributeList::FunctionIndex, ReadOnlyNoneAttrs);
+ WrapperLinkage, FT);
+ NewF->removeFnAttrs(ReadOnlyNoneAttrs);
Value *WrappedFnCst =
ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT));
@@ -1552,7 +1463,8 @@ bool DataFlowSanitizer::runImpl(Module &M) {
removeUnreachableBlocks(*F);
- DFSanFunction DFSF(*this, F, FnsWithNativeABI.count(F));
+ DFSanFunction DFSF(*this, F, FnsWithNativeABI.count(F),
+ FnsWithForceZeroLabel.count(F));
// DFSanVisitor may create new basic blocks, which confuses df_iterator.
// Build a copy of the list before iterating over it.
@@ -1649,23 +1561,14 @@ Value *DFSanFunction::getOrigin(Value *V) {
if (Argument *A = dyn_cast<Argument>(V)) {
if (IsNativeABI)
return DFS.ZeroOrigin;
- switch (IA) {
- case DataFlowSanitizer::IA_TLS: {
- if (A->getArgNo() < DFS.NumOfElementsInArgOrgTLS) {
- Instruction *ArgOriginTLSPos = &*F->getEntryBlock().begin();
- IRBuilder<> IRB(ArgOriginTLSPos);
- Value *ArgOriginPtr = getArgOriginTLS(A->getArgNo(), IRB);
- Origin = IRB.CreateLoad(DFS.OriginTy, ArgOriginPtr);
- } else {
- // Overflow
- Origin = DFS.ZeroOrigin;
- }
- break;
- }
- case DataFlowSanitizer::IA_Args: {
+ if (A->getArgNo() < DFS.NumOfElementsInArgOrgTLS) {
+ Instruction *ArgOriginTLSPos = &*F->getEntryBlock().begin();
+ IRBuilder<> IRB(ArgOriginTLSPos);
+ Value *ArgOriginPtr = getArgOriginTLS(A->getArgNo(), IRB);
+ Origin = IRB.CreateLoad(DFS.OriginTy, ArgOriginPtr);
+ } else {
+ // Overflow
Origin = DFS.ZeroOrigin;
- break;
- }
}
} else {
Origin = DFS.ZeroOrigin;
@@ -1716,25 +1619,14 @@ Value *DFSanFunction::getShadowForTLSArgument(Argument *A) {
Value *DFSanFunction::getShadow(Value *V) {
if (!isa<Argument>(V) && !isa<Instruction>(V))
return DFS.getZeroShadow(V);
+ if (IsForceZeroLabels)
+ return DFS.getZeroShadow(V);
Value *&Shadow = ValShadowMap[V];
if (!Shadow) {
if (Argument *A = dyn_cast<Argument>(V)) {
if (IsNativeABI)
return DFS.getZeroShadow(V);
- switch (IA) {
- case DataFlowSanitizer::IA_TLS: {
- Shadow = getShadowForTLSArgument(A);
- break;
- }
- case DataFlowSanitizer::IA_Args: {
- unsigned ArgIdx = A->getArgNo() + F->arg_size() / 2;
- Function::arg_iterator Arg = F->arg_begin();
- std::advance(Arg, ArgIdx);
- Shadow = &*Arg;
- assert(Shadow->getType() == DFS.PrimitiveShadowTy);
- break;
- }
- }
+ Shadow = getShadowForTLSArgument(A);
NonZeroChecks.push_back(Shadow);
} else {
Shadow = DFS.getZeroShadow(V);
@@ -1745,8 +1637,6 @@ Value *DFSanFunction::getShadow(Value *V) {
void DFSanFunction::setShadow(Instruction *I, Value *Shadow) {
assert(!ValShadowMap.count(I));
- assert(DFS.shouldTrackFieldsAndIndices() ||
- Shadow->getType() == DFS.PrimitiveShadowTy);
ValShadowMap[I] = Shadow;
}
@@ -2124,7 +2014,7 @@ std::pair<Value *, Value *> DFSanFunction::loadShadowOriginSansLoadTracking(
IRB.CreateCall(DFS.DFSanLoadLabelAndOriginFn,
{IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()),
ConstantInt::get(DFS.IntptrTy, Size)});
- Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+ Call->addRetAttr(Attribute::ZExt);
return {IRB.CreateTrunc(IRB.CreateLShr(Call, DFS.OriginWidthBits),
DFS.PrimitiveShadowTy),
IRB.CreateTrunc(Call, DFS.OriginTy)};
@@ -2171,7 +2061,7 @@ std::pair<Value *, Value *> DFSanFunction::loadShadowOriginSansLoadTracking(
IRBuilder<> IRB(Pos);
CallInst *FallbackCall = IRB.CreateCall(
DFS.DFSanUnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
- FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+ FallbackCall->addRetAttr(Attribute::ZExt);
return {FallbackCall, Origin};
}
@@ -2563,15 +2453,12 @@ void DFSanVisitor::visitBinaryOperator(BinaryOperator &BO) {
}
void DFSanVisitor::visitBitCastInst(BitCastInst &BCI) {
- if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
- // Special case: if this is the bitcast (there is exactly 1 allowed) between
- // a musttail call and a ret, don't instrument. New instructions are not
- // allowed after a musttail call.
- if (auto *CI = dyn_cast<CallInst>(BCI.getOperand(0)))
- if (CI->isMustTailCall())
- return;
- }
- // TODO: handle musttail call returns for IA_Args.
+ // Special case: if this is the bitcast (there is exactly 1 allowed) between
+ // a musttail call and a ret, don't instrument. New instructions are not
+ // allowed after a musttail call.
+ if (auto *CI = dyn_cast<CallInst>(BCI.getOperand(0)))
+ if (CI->isMustTailCall())
+ return;
visitInstOperands(BCI);
}
@@ -2629,11 +2516,6 @@ void DFSanVisitor::visitShuffleVectorInst(ShuffleVectorInst &I) {
}
void DFSanVisitor::visitExtractValueInst(ExtractValueInst &I) {
- if (!DFSF.DFS.shouldTrackFieldsAndIndices()) {
- visitInstOperands(I);
- return;
- }
-
IRBuilder<> IRB(&I);
Value *Agg = I.getAggregateOperand();
Value *AggShadow = DFSF.getShadow(Agg);
@@ -2643,11 +2525,6 @@ void DFSanVisitor::visitExtractValueInst(ExtractValueInst &I) {
}
void DFSanVisitor::visitInsertValueInst(InsertValueInst &I) {
- if (!DFSF.DFS.shouldTrackFieldsAndIndices()) {
- visitInstOperands(I);
- return;
- }
-
IRBuilder<> IRB(&I);
Value *AggShadow = DFSF.getShadow(I.getAggregateOperand());
Value *InsShadow = DFSF.getShadow(I.getInsertedValueOperand());
@@ -2798,41 +2675,22 @@ static bool isAMustTailRetVal(Value *RetVal) {
void DFSanVisitor::visitReturnInst(ReturnInst &RI) {
if (!DFSF.IsNativeABI && RI.getReturnValue()) {
- switch (DFSF.IA) {
- case DataFlowSanitizer::IA_TLS: {
- // Don't emit the instrumentation for musttail call returns.
- if (isAMustTailRetVal(RI.getReturnValue()))
- return;
-
- Value *S = DFSF.getShadow(RI.getReturnValue());
- IRBuilder<> IRB(&RI);
- Type *RT = DFSF.F->getFunctionType()->getReturnType();
- unsigned Size =
- getDataLayout().getTypeAllocSize(DFSF.DFS.getShadowTy(RT));
- if (Size <= RetvalTLSSize) {
- // If the size overflows, stores nothing. At callsite, oversized return
- // shadows are set to zero.
- IRB.CreateAlignedStore(S, DFSF.getRetvalTLS(RT, IRB),
- ShadowTLSAlignment);
- }
- if (DFSF.DFS.shouldTrackOrigins()) {
- Value *O = DFSF.getOrigin(RI.getReturnValue());
- IRB.CreateStore(O, DFSF.getRetvalOriginTLS());
- }
- break;
- }
- case DataFlowSanitizer::IA_Args: {
- // TODO: handle musttail call returns for IA_Args.
-
- IRBuilder<> IRB(&RI);
- Type *RT = DFSF.F->getFunctionType()->getReturnType();
- Value *InsVal =
- IRB.CreateInsertValue(UndefValue::get(RT), RI.getReturnValue(), 0);
- Value *InsShadow =
- IRB.CreateInsertValue(InsVal, DFSF.getShadow(RI.getReturnValue()), 1);
- RI.setOperand(0, InsShadow);
- break;
+ // Don't emit the instrumentation for musttail call returns.
+ if (isAMustTailRetVal(RI.getReturnValue()))
+ return;
+
+ Value *S = DFSF.getShadow(RI.getReturnValue());
+ IRBuilder<> IRB(&RI);
+ Type *RT = DFSF.F->getFunctionType()->getReturnType();
+ unsigned Size = getDataLayout().getTypeAllocSize(DFSF.DFS.getShadowTy(RT));
+ if (Size <= RetvalTLSSize) {
+ // If the size overflows, store nothing. At the call site, oversized return
+ // shadows are set to zero.
+ IRB.CreateAlignedStore(S, DFSF.getRetvalTLS(RT, IRB), ShadowTLSAlignment);
}
+ if (DFSF.DFS.shouldTrackOrigins()) {
+ Value *O = DFSF.getOrigin(RI.getReturnValue());
+ IRB.CreateStore(O, DFSF.getRetvalOriginTLS());
}
}
}
@@ -2953,8 +2811,7 @@ bool DFSanVisitor::visitWrappedCallBase(Function &F, CallBase &CB) {
// Custom functions returning non-void will write to the return label.
if (!FT->getReturnType()->isVoidTy()) {
- CustomFn->removeAttributes(AttributeList::FunctionIndex,
- DFSF.DFS.ReadOnlyNoneAttrs);
+ CustomFn->removeFnAttrs(DFSF.DFS.ReadOnlyNoneAttrs);
}
}
@@ -3056,32 +2913,30 @@ void DFSanVisitor::visitCallBase(CallBase &CB) {
const bool ShouldTrackOrigins = DFSF.DFS.shouldTrackOrigins();
FunctionType *FT = CB.getFunctionType();
- if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
- // Stores argument shadows.
- unsigned ArgOffset = 0;
- const DataLayout &DL = getDataLayout();
- for (unsigned I = 0, N = FT->getNumParams(); I != N; ++I) {
- if (ShouldTrackOrigins) {
- // Ignore overflowed origins
- Value *ArgShadow = DFSF.getShadow(CB.getArgOperand(I));
- if (I < DFSF.DFS.NumOfElementsInArgOrgTLS &&
- !DFSF.DFS.isZeroShadow(ArgShadow))
- IRB.CreateStore(DFSF.getOrigin(CB.getArgOperand(I)),
- DFSF.getArgOriginTLS(I, IRB));
- }
+ const DataLayout &DL = getDataLayout();
- unsigned Size =
- DL.getTypeAllocSize(DFSF.DFS.getShadowTy(FT->getParamType(I)));
- // Stop storing if arguments' size overflows. Inside a function, arguments
- // after overflow have zero shadow values.
- if (ArgOffset + Size > ArgTLSSize)
- break;
- IRB.CreateAlignedStore(
- DFSF.getShadow(CB.getArgOperand(I)),
- DFSF.getArgTLS(FT->getParamType(I), ArgOffset, IRB),
- ShadowTLSAlignment);
- ArgOffset += alignTo(Size, ShadowTLSAlignment);
+ // Stores argument shadows.
+ unsigned ArgOffset = 0;
+ for (unsigned I = 0, N = FT->getNumParams(); I != N; ++I) {
+ if (ShouldTrackOrigins) {
+ // Ignore overflowed origins
+ Value *ArgShadow = DFSF.getShadow(CB.getArgOperand(I));
+ if (I < DFSF.DFS.NumOfElementsInArgOrgTLS &&
+ !DFSF.DFS.isZeroShadow(ArgShadow))
+ IRB.CreateStore(DFSF.getOrigin(CB.getArgOperand(I)),
+ DFSF.getArgOriginTLS(I, IRB));
}
+
+ unsigned Size =
+ DL.getTypeAllocSize(DFSF.DFS.getShadowTy(FT->getParamType(I)));
+ // Stop storing if arguments' size overflows. Inside a function, arguments
+ // after overflow have zero shadow values.
+ if (ArgOffset + Size > ArgTLSSize)
+ break;
+ IRB.CreateAlignedStore(DFSF.getShadow(CB.getArgOperand(I)),
+ DFSF.getArgTLS(FT->getParamType(I), ArgOffset, IRB),
+ ShadowTLSAlignment);
+ ArgOffset += alignTo(Size, ShadowTLSAlignment);
}
Instruction *Next = nullptr;
@@ -3099,99 +2954,31 @@ void DFSanVisitor::visitCallBase(CallBase &CB) {
Next = CB.getNextNode();
}
- if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
- // Don't emit the epilogue for musttail call returns.
- if (isa<CallInst>(CB) && cast<CallInst>(CB).isMustTailCall())
- return;
-
- // Loads the return value shadow.
- IRBuilder<> NextIRB(Next);
- const DataLayout &DL = getDataLayout();
- unsigned Size = DL.getTypeAllocSize(DFSF.DFS.getShadowTy(&CB));
- if (Size > RetvalTLSSize) {
- // Set overflowed return shadow to be zero.
- DFSF.setShadow(&CB, DFSF.DFS.getZeroShadow(&CB));
- } else {
- LoadInst *LI = NextIRB.CreateAlignedLoad(
- DFSF.DFS.getShadowTy(&CB), DFSF.getRetvalTLS(CB.getType(), NextIRB),
- ShadowTLSAlignment, "_dfsret");
- DFSF.SkipInsts.insert(LI);
- DFSF.setShadow(&CB, LI);
- DFSF.NonZeroChecks.push_back(LI);
- }
-
- if (ShouldTrackOrigins) {
- LoadInst *LI = NextIRB.CreateLoad(
- DFSF.DFS.OriginTy, DFSF.getRetvalOriginTLS(), "_dfsret_o");
- DFSF.SkipInsts.insert(LI);
- DFSF.setOrigin(&CB, LI);
- }
- }
- }
-
- // Do all instrumentation for IA_Args down here to defer tampering with the
- // CFG in a way that SplitEdge may be able to detect.
- if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_Args) {
- // TODO: handle musttail call returns for IA_Args.
-
- FunctionType *NewFT = DFSF.DFS.getArgsFunctionType(FT);
- Value *Func =
- IRB.CreateBitCast(CB.getCalledOperand(), PointerType::getUnqual(NewFT));
-
- const unsigned NumParams = FT->getNumParams();
-
- // Copy original arguments.
- auto *ArgIt = CB.arg_begin(), *ArgEnd = CB.arg_end();
- std::vector<Value *> Args(NumParams);
- std::copy_n(ArgIt, NumParams, Args.begin());
-
- // Add shadow arguments by transforming original arguments.
- std::generate_n(std::back_inserter(Args), NumParams,
- [&]() { return DFSF.getShadow(*ArgIt++); });
-
- if (FT->isVarArg()) {
- unsigned VarArgSize = CB.arg_size() - NumParams;
- ArrayType *VarArgArrayTy =
- ArrayType::get(DFSF.DFS.PrimitiveShadowTy, VarArgSize);
- AllocaInst *VarArgShadow =
- new AllocaInst(VarArgArrayTy, getDataLayout().getAllocaAddrSpace(),
- "", &DFSF.F->getEntryBlock().front());
- Args.push_back(IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, 0));
-
- // Copy remaining var args.
- unsigned GepIndex = 0;
- std::for_each(ArgIt, ArgEnd, [&](Value *Arg) {
- IRB.CreateStore(
- DFSF.getShadow(Arg),
- IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, GepIndex++));
- Args.push_back(Arg);
- });
- }
+ // Don't emit the epilogue for musttail call returns.
+ if (isa<CallInst>(CB) && cast<CallInst>(CB).isMustTailCall())
+ return;
- CallBase *NewCB;
- if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
- NewCB = IRB.CreateInvoke(NewFT, Func, II->getNormalDest(),
- II->getUnwindDest(), Args);
+ // Loads the return value shadow.
+ IRBuilder<> NextIRB(Next);
+ unsigned Size = DL.getTypeAllocSize(DFSF.DFS.getShadowTy(&CB));
+ if (Size > RetvalTLSSize) {
+ // Set overflowed return shadow to be zero.
+ DFSF.setShadow(&CB, DFSF.DFS.getZeroShadow(&CB));
} else {
- NewCB = IRB.CreateCall(NewFT, Func, Args);
- }
- NewCB->setCallingConv(CB.getCallingConv());
- NewCB->setAttributes(CB.getAttributes().removeAttributes(
- *DFSF.DFS.Ctx, AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewCB->getType())));
-
- if (Next) {
- ExtractValueInst *ExVal = ExtractValueInst::Create(NewCB, 0, "", Next);
- DFSF.SkipInsts.insert(ExVal);
- ExtractValueInst *ExShadow = ExtractValueInst::Create(NewCB, 1, "", Next);
- DFSF.SkipInsts.insert(ExShadow);
- DFSF.setShadow(ExVal, ExShadow);
- DFSF.NonZeroChecks.push_back(ExShadow);
-
- CB.replaceAllUsesWith(ExVal);
+ LoadInst *LI = NextIRB.CreateAlignedLoad(
+ DFSF.DFS.getShadowTy(&CB), DFSF.getRetvalTLS(CB.getType(), NextIRB),
+ ShadowTLSAlignment, "_dfsret");
+ DFSF.SkipInsts.insert(LI);
+ DFSF.setShadow(&CB, LI);
+ DFSF.NonZeroChecks.push_back(LI);
}
- CB.eraseFromParent();
+ if (ShouldTrackOrigins) {
+ LoadInst *LI = NextIRB.CreateLoad(DFSF.DFS.OriginTy,
+ DFSF.getRetvalOriginTLS(), "_dfsret_o");
+ DFSF.SkipInsts.insert(LI);
+ DFSF.setOrigin(&CB, LI);
+ }
}
}
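With the IA_Args ABI removed above, argument and return shadows always travel through the TLS buffers, and the store loop in visitCallBase simply walks a running offset until the buffer would overflow. A toy model of that bookkeeping; the 800-byte buffer size and 2-byte alignment are assumptions standing in for the pass's ArgTLSSize and ShadowTLSAlignment constants:

#include <cstdio>
#include <vector>

constexpr unsigned kArgTLSSize = 800;       // assumed; see ArgTLSSize
constexpr unsigned kShadowTLSAlignment = 2; // assumed; see ShadowTLSAlignment

unsigned alignTo(unsigned X, unsigned A) { return (X + A - 1) / A * A; }

int main() {
  std::vector<unsigned> ShadowSizes = {8, 8, 32, 8}; // per-argument shadow bytes
  unsigned ArgOffset = 0;
  for (unsigned Size : ShadowSizes) {
    if (ArgOffset + Size > kArgTLSSize)
      break; // overflowed arguments keep a zero shadow in the callee
    std::printf("shadow stored at arg TLS offset %u\n", ArgOffset);
    ArgOffset += alignTo(Size, kShadowTLSAlignment);
  }
}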
diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index c99f2e66b1cc..325089fc4402 100644
--- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -86,7 +86,7 @@ GCOVOptions GCOVOptions::getDefault() {
Options.Atomic = AtomicCounter;
if (DefaultGCOVVersion.size() != 4) {
- llvm::report_fatal_error(std::string("Invalid -default-gcov-version: ") +
+ llvm::report_fatal_error(Twine("Invalid -default-gcov-version: ") +
DefaultGCOVVersion);
}
memcpy(Options.Version, DefaultGCOVVersion.c_str(), 4);
@@ -1373,12 +1373,16 @@ Function *GCOVProfiler::insertReset(
BasicBlock *Entry = BasicBlock::Create(*Ctx, "entry", ResetF);
IRBuilder<> Builder(Entry);
+ LLVMContext &C = Entry->getContext();
// Zero out the counters.
for (const auto &I : CountersBySP) {
GlobalVariable *GV = I.first;
- Constant *Null = Constant::getNullValue(GV->getValueType());
- Builder.CreateStore(Null, GV);
+ auto *GVTy = cast<ArrayType>(GV->getValueType());
+ Builder.CreateMemSet(GV, Constant::getNullValue(Type::getInt8Ty(C)),
+ GVTy->getNumElements() *
+ GVTy->getElementType()->getScalarSizeInBits() / 8,
+ GV->getAlign());
}
Type *RetTy = ResetF->getReturnType();
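The reset function above now clears each counter array with a single memset instead of storing a zero aggregate, and the length handed to CreateMemSet comes from the array type. Since gcov counters are 64-bit, the size computation reduces to the small model below (it assumes, as the pass does, that the element width is a multiple of 8 bits):

#include <cstdio>

unsigned counterBytes(unsigned NumElements, unsigned ElementBits = 64) {
  return NumElements * ElementBits / 8;
}

int main() {
  std::printf("%u\n", counterBytes(5)); // a function with 5 counters -> 40 bytes
}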
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 60a4ee8811fb..62c265e40dab 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -17,7 +17,10 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/StackSafetyAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -26,6 +29,7 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
@@ -41,6 +45,7 @@
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -115,6 +120,17 @@ static cl::opt<bool>
cl::Hidden, cl::desc("Use Stack Safety analysis results"),
cl::Optional);
+static cl::opt<size_t> ClMaxLifetimes(
+ "hwasan-max-lifetimes-for-alloca", cl::Hidden, cl::init(3),
+ cl::ReallyHidden,
+ cl::desc("How many lifetime ends to handle for a single alloca."),
+ cl::Optional);
+
+static cl::opt<bool>
+ ClUseAfterScope("hwasan-use-after-scope",
+ cl::desc("detect use after scope within function"),
+ cl::Hidden, cl::init(false));
+
static cl::opt<bool> ClUARRetagToZero(
"hwasan-uar-retag-to-zero",
cl::desc("Clear alloca tags before returning from the function to allow "
@@ -220,9 +236,21 @@ bool shouldUseStackSafetyAnalysis(const Triple &TargetTriple,
return shouldInstrumentStack(TargetTriple) &&
mightUseStackSafetyAnalysis(DisableOptimization);
}
+
+bool shouldDetectUseAfterScope(const Triple &TargetTriple) {
+ return ClUseAfterScope && shouldInstrumentStack(TargetTriple);
+}
+
/// An instrumentation pass implementing detection of addressability bugs
/// using tagged pointers.
class HWAddressSanitizer {
+private:
+ struct AllocaInfo {
+ AllocaInst *AI;
+ SmallVector<IntrinsicInst *, 2> LifetimeStart;
+ SmallVector<IntrinsicInst *, 2> LifetimeEnd;
+ };
+
public:
HWAddressSanitizer(Module &M, bool CompileKernel, bool Recover,
const StackSafetyGlobalInfo *SSI)
@@ -237,7 +265,11 @@ public:
void setSSI(const StackSafetyGlobalInfo *S) { SSI = S; }
- bool sanitizeFunction(Function &F);
+ DenseMap<AllocaInst *, AllocaInst *> padInterestingAllocas(
+ const MapVector<AllocaInst *, AllocaInfo> &AllocasToInstrument);
+ bool sanitizeFunction(Function &F,
+ llvm::function_ref<const DominatorTree &()> GetDT,
+ llvm::function_ref<const PostDominatorTree &()> GetPDT);
void initializeModule();
void createHwasanCtorComdat();
@@ -250,23 +282,34 @@ public:
void untagPointerOperand(Instruction *I, Value *Addr);
Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
+
+ int64_t getAccessInfo(bool IsWrite, unsigned AccessSizeIndex);
+ void instrumentMemAccessOutline(Value *Ptr, bool IsWrite,
+ unsigned AccessSizeIndex,
+ Instruction *InsertBefore);
void instrumentMemAccessInline(Value *Ptr, bool IsWrite,
unsigned AccessSizeIndex,
Instruction *InsertBefore);
+ bool ignoreMemIntrinsic(MemIntrinsic *MI);
void instrumentMemIntrinsic(MemIntrinsic *MI);
bool instrumentMemAccess(InterestingMemoryOperand &O);
- bool ignoreAccess(Value *Ptr);
+ bool ignoreAccess(Instruction *Inst, Value *Ptr);
void getInterestingMemoryOperands(
Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting);
bool isInterestingAlloca(const AllocaInst &AI);
- bool tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size);
+ void tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size);
Value *tagPointer(IRBuilder<> &IRB, Type *Ty, Value *PtrLong, Value *Tag);
Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong);
+ static bool isStandardLifetime(const AllocaInfo &AllocaInfo,
+ const DominatorTree &DT);
bool instrumentStack(
- SmallVectorImpl<AllocaInst *> &Allocas,
+ MapVector<AllocaInst *, AllocaInfo> &AllocasToInstrument,
+ SmallVector<Instruction *, 4> &UnrecognizedLifetimes,
DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> &AllocaDbgMap,
- SmallVectorImpl<Instruction *> &RetVec, Value *StackTag);
+ SmallVectorImpl<Instruction *> &RetVec, Value *StackTag,
+ llvm::function_ref<const DominatorTree &()> GetDT,
+ llvm::function_ref<const PostDominatorTree &()> GetPDT);
Value *readRegister(IRBuilder<> &IRB, StringRef Name);
bool instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec);
Value *getNextTagWithCall(IRBuilder<> &IRB);
@@ -313,8 +356,9 @@ private:
bool WithFrameRecord;
void init(Triple &TargetTriple, bool InstrumentWithCalls);
- unsigned getObjectAlignment() const { return 1U << Scale; }
+ uint64_t getObjectAlignment() const { return 1ULL << Scale; }
};
+
ShadowMapping Mapping;
Type *VoidTy = Type::getVoidTy(M.getContext());
@@ -331,6 +375,7 @@ private:
bool InstrumentLandingPads;
bool InstrumentWithCalls;
bool InstrumentStack;
+ bool DetectUseAfterScope;
bool UsePageAliases;
bool HasMatchAllTag = false;
@@ -377,14 +422,21 @@ public:
}
bool runOnFunction(Function &F) override {
- if (shouldUseStackSafetyAnalysis(Triple(F.getParent()->getTargetTriple()),
- DisableOptimization)) {
+ auto TargetTriple = Triple(F.getParent()->getTargetTriple());
+ if (shouldUseStackSafetyAnalysis(TargetTriple, DisableOptimization)) {
// We cannot call getAnalysis in doInitialization, that would cause a
// crash as the required analyses are not initialized yet.
HWASan->setSSI(
&getAnalysis<StackSafetyGlobalInfoWrapperPass>().getResult());
}
- return HWASan->sanitizeFunction(F);
+ return HWASan->sanitizeFunction(
+ F,
+ [&]() -> const DominatorTree & {
+ return getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ },
+ [&]() -> const PostDominatorTree & {
+ return getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ });
}
bool doFinalization(Module &M) override {
@@ -399,6 +451,8 @@ public:
// This is so we don't need to plumb TargetTriple all the way to here.
if (mightUseStackSafetyAnalysis(DisableOptimization))
AU.addRequired<StackSafetyGlobalInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
}
private:
@@ -417,6 +471,8 @@ INITIALIZE_PASS_BEGIN(
"HWAddressSanitizer: detect memory bugs using tagged addressing.", false,
false)
INITIALIZE_PASS_DEPENDENCY(StackSafetyGlobalInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(
HWAddressSanitizerLegacyPass, "hwasan",
"HWAddressSanitizer: detect memory bugs using tagged addressing.", false,
@@ -430,25 +486,41 @@ llvm::createHWAddressSanitizerLegacyPassPass(bool CompileKernel, bool Recover,
DisableOptimization);
}
-HWAddressSanitizerPass::HWAddressSanitizerPass(bool CompileKernel, bool Recover,
- bool DisableOptimization)
- : CompileKernel(CompileKernel), Recover(Recover),
- DisableOptimization(DisableOptimization) {}
-
PreservedAnalyses HWAddressSanitizerPass::run(Module &M,
ModuleAnalysisManager &MAM) {
const StackSafetyGlobalInfo *SSI = nullptr;
- if (shouldUseStackSafetyAnalysis(llvm::Triple(M.getTargetTriple()),
- DisableOptimization))
+ auto TargetTriple = llvm::Triple(M.getTargetTriple());
+ if (shouldUseStackSafetyAnalysis(TargetTriple, Options.DisableOptimization))
SSI = &MAM.getResult<StackSafetyGlobalAnalysis>(M);
- HWAddressSanitizer HWASan(M, CompileKernel, Recover, SSI);
+
+ HWAddressSanitizer HWASan(M, Options.CompileKernel, Options.Recover, SSI);
bool Modified = false;
- for (Function &F : M)
- Modified |= HWASan.sanitizeFunction(F);
+ auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ for (Function &F : M) {
+ Modified |= HWASan.sanitizeFunction(
+ F,
+ [&]() -> const DominatorTree & {
+ return FAM.getResult<DominatorTreeAnalysis>(F);
+ },
+ [&]() -> const PostDominatorTree & {
+ return FAM.getResult<PostDominatorTreeAnalysis>(F);
+ });
+ }
if (Modified)
return PreservedAnalyses::none();
return PreservedAnalyses::all();
}
+void HWAddressSanitizerPass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<HWAddressSanitizerPass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+ OS << "<";
+ if (Options.CompileKernel)
+ OS << "kernel;";
+ if (Options.Recover)
+ OS << "recover";
+ OS << ">";
+}
void HWAddressSanitizer::createHwasanCtorComdat() {
std::tie(HwasanCtorFunction, std::ignore) =
@@ -566,6 +638,7 @@ void HWAddressSanitizer::initializeModule() {
UsePageAliases = shouldUsePageAliases(TargetTriple);
InstrumentWithCalls = shouldInstrumentWithCalls(TargetTriple);
InstrumentStack = shouldInstrumentStack(TargetTriple);
+ DetectUseAfterScope = shouldDetectUseAfterScope(TargetTriple);
PointerTagShift = IsX86_64 ? 57 : 56;
TagMaskByte = IsX86_64 ? 0x3F : 0xFF;
@@ -712,7 +785,7 @@ Value *HWAddressSanitizer::getShadowNonTls(IRBuilder<> &IRB) {
}
}
-bool HWAddressSanitizer::ignoreAccess(Value *Ptr) {
+bool HWAddressSanitizer::ignoreAccess(Instruction *Inst, Value *Ptr) {
// Do not instrument accesses from different address spaces; we cannot deal
// with them.
Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType());
@@ -726,6 +799,12 @@ bool HWAddressSanitizer::ignoreAccess(Value *Ptr) {
if (Ptr->isSwiftError())
return true;
+ if (findAllocaForValue(Ptr)) {
+ if (!InstrumentStack)
+ return true;
+ if (SSI && SSI->stackAccessIsSafe(*Inst))
+ return true;
+ }
return false;
}
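The extended ignoreAccess() above skips any access whose pointer is rooted at an alloca when stack instrumentation is disabled, or when stack-safety results (SSI) prove the access safe. A sketch of the kind of source pattern this covers; it illustrates the intent only, not the analysis itself:

int provablySafe() {
  int buf[4] = {1, 2, 3, 4};
  return buf[2]; // constant, in-bounds index into a non-escaping local:
                 // no HWASan tag check is needed for this load
}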
@@ -740,29 +819,29 @@ void HWAddressSanitizer::getInterestingMemoryOperands(
return;
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- if (!ClInstrumentReads || ignoreAccess(LI->getPointerOperand()))
+ if (!ClInstrumentReads || ignoreAccess(I, LI->getPointerOperand()))
return;
Interesting.emplace_back(I, LI->getPointerOperandIndex(), false,
LI->getType(), LI->getAlign());
} else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- if (!ClInstrumentWrites || ignoreAccess(SI->getPointerOperand()))
+ if (!ClInstrumentWrites || ignoreAccess(I, SI->getPointerOperand()))
return;
Interesting.emplace_back(I, SI->getPointerOperandIndex(), true,
SI->getValueOperand()->getType(), SI->getAlign());
} else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
- if (!ClInstrumentAtomics || ignoreAccess(RMW->getPointerOperand()))
+ if (!ClInstrumentAtomics || ignoreAccess(I, RMW->getPointerOperand()))
return;
Interesting.emplace_back(I, RMW->getPointerOperandIndex(), true,
RMW->getValOperand()->getType(), None);
} else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
- if (!ClInstrumentAtomics || ignoreAccess(XCHG->getPointerOperand()))
+ if (!ClInstrumentAtomics || ignoreAccess(I, XCHG->getPointerOperand()))
return;
Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true,
XCHG->getCompareOperand()->getType(), None);
} else if (auto CI = dyn_cast<CallInst>(I)) {
- for (unsigned ArgNo = 0; ArgNo < CI->getNumArgOperands(); ArgNo++) {
+ for (unsigned ArgNo = 0; ArgNo < CI->arg_size(); ArgNo++) {
if (!ClInstrumentByval || !CI->isByValArgument(ArgNo) ||
- ignoreAccess(CI->getArgOperand(ArgNo)))
+ ignoreAccess(I, CI->getArgOperand(ArgNo)))
continue;
Type *Ty = CI->getParamByValType(ArgNo);
Interesting.emplace_back(I, ArgNo, false, Ty, Align(1));
@@ -809,30 +888,38 @@ Value *HWAddressSanitizer::memToShadow(Value *Mem, IRBuilder<> &IRB) {
return IRB.CreateGEP(Int8Ty, ShadowBase, Shadow);
}
+int64_t HWAddressSanitizer::getAccessInfo(bool IsWrite,
+ unsigned AccessSizeIndex) {
+ return (CompileKernel << HWASanAccessInfo::CompileKernelShift) +
+ (HasMatchAllTag << HWASanAccessInfo::HasMatchAllShift) +
+ (MatchAllTag << HWASanAccessInfo::MatchAllShift) +
+ (Recover << HWASanAccessInfo::RecoverShift) +
+ (IsWrite << HWASanAccessInfo::IsWriteShift) +
+ (AccessSizeIndex << HWASanAccessInfo::AccessSizeShift);
+}
+
+void HWAddressSanitizer::instrumentMemAccessOutline(Value *Ptr, bool IsWrite,
+ unsigned AccessSizeIndex,
+ Instruction *InsertBefore) {
+ assert(!UsePageAliases);
+ const int64_t AccessInfo = getAccessInfo(IsWrite, AccessSizeIndex);
+ IRBuilder<> IRB(InsertBefore);
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ Ptr = IRB.CreateBitCast(Ptr, Int8PtrTy);
+ IRB.CreateCall(Intrinsic::getDeclaration(
+ M, UseShortGranules
+ ? Intrinsic::hwasan_check_memaccess_shortgranules
+ : Intrinsic::hwasan_check_memaccess),
+ {ShadowBase, Ptr, ConstantInt::get(Int32Ty, AccessInfo)});
+}
+
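getAccessInfo() packs the per-check flags and the access-size index into one constant that either parameterizes the check intrinsic (outlined path) or ends up in the trap encoding (inline path). A standalone model of the packing; the shift values below are placeholders for illustration and do not claim to match the real HWASanAccessInfo constants:

#include <cstdint>
#include <cstdio>

// Placeholder shifts; the real layout lives in the HWASanAccessInfo namespace.
enum : unsigned {
  kAccessSizeShift = 0, // log2 of the access size
  kIsWriteShift = 4,
  kRecoverShift = 5,
};

int64_t packAccessInfo(bool IsWrite, unsigned AccessSizeIndex, bool Recover) {
  return (int64_t(Recover) << kRecoverShift) +
         (int64_t(IsWrite) << kIsWriteShift) +
         (int64_t(AccessSizeIndex) << kAccessSizeShift);
}

int main() {
  // A recoverable 8-byte write: AccessSizeIndex == 3 (log2(8)).
  std::printf("0x%llx\n", (unsigned long long)packAccessInfo(true, 3, true));
}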
void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite,
unsigned AccessSizeIndex,
Instruction *InsertBefore) {
assert(!UsePageAliases);
- const int64_t AccessInfo =
- (CompileKernel << HWASanAccessInfo::CompileKernelShift) +
- (HasMatchAllTag << HWASanAccessInfo::HasMatchAllShift) +
- (MatchAllTag << HWASanAccessInfo::MatchAllShift) +
- (Recover << HWASanAccessInfo::RecoverShift) +
- (IsWrite << HWASanAccessInfo::IsWriteShift) +
- (AccessSizeIndex << HWASanAccessInfo::AccessSizeShift);
+ const int64_t AccessInfo = getAccessInfo(IsWrite, AccessSizeIndex);
IRBuilder<> IRB(InsertBefore);
- if (OutlinedChecks) {
- Module *M = IRB.GetInsertBlock()->getParent()->getParent();
- Ptr = IRB.CreateBitCast(Ptr, Int8PtrTy);
- IRB.CreateCall(Intrinsic::getDeclaration(
- M, UseShortGranules
- ? Intrinsic::hwasan_check_memaccess_shortgranules
- : Intrinsic::hwasan_check_memaccess),
- {ShadowBase, Ptr, ConstantInt::get(Int32Ty, AccessInfo)});
- return;
- }
-
Value *PtrLong = IRB.CreatePointerCast(Ptr, IntptrTy);
Value *PtrTag = IRB.CreateTrunc(IRB.CreateLShr(PtrLong, PointerTagShift),
IRB.getInt8Ty());
@@ -908,6 +995,16 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite,
cast<BranchInst>(CheckFailTerm)->setSuccessor(0, CheckTerm->getParent());
}
+bool HWAddressSanitizer::ignoreMemIntrinsic(MemIntrinsic *MI) {
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+ return (!ClInstrumentWrites || ignoreAccess(MTI, MTI->getDest())) &&
+ (!ClInstrumentReads || ignoreAccess(MTI, MTI->getSource()));
+ }
+ if (isa<MemSetInst>(MI))
+ return !ClInstrumentWrites || ignoreAccess(MI, MI->getDest());
+ return false;
+}
+
void HWAddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
IRBuilder<> IRB(MI);
if (isa<MemTransferInst>(MI)) {
@@ -943,6 +1040,8 @@ bool HWAddressSanitizer::instrumentMemAccess(InterestingMemoryOperand &O) {
if (InstrumentWithCalls) {
IRB.CreateCall(HwasanMemoryAccessCallback[O.IsWrite][AccessSizeIndex],
IRB.CreatePointerCast(Addr, IntptrTy));
+ } else if (OutlinedChecks) {
+ instrumentMemAccessOutline(Addr, O.IsWrite, AccessSizeIndex, O.getInsn());
} else {
instrumentMemAccessInline(Addr, O.IsWrite, AccessSizeIndex, O.getInsn());
}
@@ -968,7 +1067,7 @@ static uint64_t getAllocaSizeInBytes(const AllocaInst &AI) {
return SizeInBytes * ArraySize;
}
-bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag,
+void HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag,
size_t Size) {
size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
if (!UseShortGranules)
@@ -999,7 +1098,6 @@ bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag,
AlignedSize - 1));
}
}
- return true;
}
unsigned HWAddressSanitizer::retagMask(unsigned AllocaNo) {
@@ -1231,17 +1329,53 @@ bool HWAddressSanitizer::instrumentLandingPads(
return true;
}
+static bool
+maybeReachableFromEachOther(const SmallVectorImpl<IntrinsicInst *> &Insts,
+ const DominatorTree &DT) {
+ // If we have too many lifetime ends, give up, as the algorithm below is N^2.
+ if (Insts.size() > ClMaxLifetimes)
+ return true;
+ for (size_t I = 0; I < Insts.size(); ++I) {
+ for (size_t J = 0; J < Insts.size(); ++J) {
+ if (I == J)
+ continue;
+ if (isPotentiallyReachable(Insts[I], Insts[J], nullptr, &DT))
+ return true;
+ }
+ }
+ return false;
+}
+
+// static
+bool HWAddressSanitizer::isStandardLifetime(const AllocaInfo &AllocaInfo,
+ const DominatorTree &DT) {
+ // An alloca that has exactly one start and end in every possible execution.
+ // If it has multiple ends, they have to be unreachable from each other, so
+ // at most one of them is actually used for each execution of the function.
+ return AllocaInfo.LifetimeStart.size() == 1 &&
+ (AllocaInfo.LifetimeEnd.size() == 1 ||
+ (AllocaInfo.LifetimeEnd.size() > 0 &&
+ !maybeReachableFromEachOther(AllocaInfo.LifetimeEnd, DT)));
+}
+
bool HWAddressSanitizer::instrumentStack(
- SmallVectorImpl<AllocaInst *> &Allocas,
+ MapVector<AllocaInst *, AllocaInfo> &AllocasToInstrument,
+ SmallVector<Instruction *, 4> &UnrecognizedLifetimes,
DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> &AllocaDbgMap,
- SmallVectorImpl<Instruction *> &RetVec, Value *StackTag) {
+ SmallVectorImpl<Instruction *> &RetVec, Value *StackTag,
+ llvm::function_ref<const DominatorTree &()> GetDT,
+ llvm::function_ref<const PostDominatorTree &()> GetPDT) {
// Ideally, we want to calculate tagged stack base pointer, and rewrite all
// alloca addresses using that. Unfortunately, offsets are not known yet
// (unless we use ASan-style mega-alloca). Instead we keep the base tag in a
// temp, shift-OR it into each alloca address and xor with the retag mask.
// This generates one extra instruction per alloca use.
- for (unsigned N = 0; N < Allocas.size(); ++N) {
- auto *AI = Allocas[N];
+ unsigned int I = 0;
+
+ for (auto &KV : AllocasToInstrument) {
+ auto N = I++;
+ auto *AI = KV.first;
+ AllocaInfo &Info = KV.second;
IRBuilder<> IRB(AI->getNextNode());
// Replace uses of the alloca with tagged address.
@@ -1268,17 +1402,40 @@ bool HWAddressSanitizer::instrumentStack(
}
size_t Size = getAllocaSizeInBytes(*AI);
- tagAlloca(IRB, AI, Tag, Size);
-
- for (auto RI : RetVec) {
- IRB.SetInsertPoint(RI);
-
- // Re-tag alloca memory with the special UAR tag.
- Value *Tag = getUARTag(IRB, StackTag);
- tagAlloca(IRB, AI, Tag, alignTo(Size, Mapping.getObjectAlignment()));
+ size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
+ bool StandardLifetime =
+ UnrecognizedLifetimes.empty() && isStandardLifetime(Info, GetDT());
+ if (DetectUseAfterScope && StandardLifetime) {
+ IntrinsicInst *Start = Info.LifetimeStart[0];
+ IRB.SetInsertPoint(Start->getNextNode());
+ auto TagEnd = [&](Instruction *Node) {
+ IRB.SetInsertPoint(Node);
+ Value *UARTag = getUARTag(IRB, StackTag);
+ tagAlloca(IRB, AI, UARTag, AlignedSize);
+ };
+ tagAlloca(IRB, AI, Tag, Size);
+ if (!forAllReachableExits(GetDT(), GetPDT(), Start, Info.LifetimeEnd,
+ RetVec, TagEnd)) {
+ for (auto *End : Info.LifetimeEnd)
+ End->eraseFromParent();
+ }
+ } else {
+ tagAlloca(IRB, AI, Tag, Size);
+ for (auto *RI : RetVec) {
+ IRB.SetInsertPoint(RI);
+ Value *UARTag = getUARTag(IRB, StackTag);
+ tagAlloca(IRB, AI, UARTag, AlignedSize);
+ }
+ if (!StandardLifetime) {
+ for (auto &II : Info.LifetimeStart)
+ II->eraseFromParent();
+ for (auto &II : Info.LifetimeEnd)
+ II->eraseFromParent();
+ }
}
}
-
+ for (auto &I : UnrecognizedLifetimes)
+ I->eraseFromParent();
return true;
}
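With -hwasan-use-after-scope (off by default per ClUseAfterScope above), an alloca with a recognized lifetime gets its tag at llvm.lifetime.start and is retagged with the UAR tag at every reachable lifetime end or function exit, as instrumentStack() now does. A sketch of the bug class this is meant to catch:

// Deliberately buggy: the pointer outlives the scope of the object it points to.
int useAfterScope() {
  int *p = nullptr;
  {
    int x = 42; // llvm.lifetime.start of x: memory gets a fresh tag
    p = &x;
  }             // llvm.lifetime.end of x: memory is retagged with the UAR tag
  return *p;    // tag mismatch -> HWASan reports instead of silently reading
                // stale stack memory
}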
@@ -1300,7 +1457,42 @@ bool HWAddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
!(SSI && SSI->isSafe(AI));
}
-bool HWAddressSanitizer::sanitizeFunction(Function &F) {
+DenseMap<AllocaInst *, AllocaInst *> HWAddressSanitizer::padInterestingAllocas(
+ const MapVector<AllocaInst *, AllocaInfo> &AllocasToInstrument) {
+ DenseMap<AllocaInst *, AllocaInst *> AllocaToPaddedAllocaMap;
+ for (auto &KV : AllocasToInstrument) {
+ AllocaInst *AI = KV.first;
+ uint64_t Size = getAllocaSizeInBytes(*AI);
+ uint64_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
+ AI->setAlignment(
+ Align(std::max(AI->getAlignment(), Mapping.getObjectAlignment())));
+ if (Size != AlignedSize) {
+ Type *AllocatedType = AI->getAllocatedType();
+ if (AI->isArrayAllocation()) {
+ uint64_t ArraySize =
+ cast<ConstantInt>(AI->getArraySize())->getZExtValue();
+ AllocatedType = ArrayType::get(AllocatedType, ArraySize);
+ }
+ Type *TypeWithPadding = StructType::get(
+ AllocatedType, ArrayType::get(Int8Ty, AlignedSize - Size));
+ auto *NewAI = new AllocaInst(
+ TypeWithPadding, AI->getType()->getAddressSpace(), nullptr, "", AI);
+ NewAI->takeName(AI);
+ NewAI->setAlignment(AI->getAlign());
+ NewAI->setUsedWithInAlloca(AI->isUsedWithInAlloca());
+ NewAI->setSwiftError(AI->isSwiftError());
+ NewAI->copyMetadata(*AI);
+ auto *Bitcast = new BitCastInst(NewAI, AI->getType(), "", AI);
+ AI->replaceAllUsesWith(Bitcast);
+ AllocaToPaddedAllocaMap[AI] = NewAI;
+ }
+ }
+ return AllocaToPaddedAllocaMap;
+}
+
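padInterestingAllocas() (factored out above) rounds every instrumented alloca up to the tagging granule so that small, uninstrumented objects cannot hide in the padding and short granules have room for a real tag byte. A toy model of the rounding, assuming the usual shadow scale of 4 (16-byte granule); the real value is Mapping.getObjectAlignment():

#include <cstdint>
#include <cstdio>

constexpr uint64_t kGranule = 16; // 1ULL << Scale, assuming Scale == 4

uint64_t alignToGranule(uint64_t Size) {
  return (Size + kGranule - 1) / kGranule * kGranule;
}

int main() {
  for (uint64_t Size : {1, 12, 16, 24})
    std::printf("alloca of %llu bytes -> padded to %llu\n",
                (unsigned long long)Size,
                (unsigned long long)alignToGranule(Size));
}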
+bool HWAddressSanitizer::sanitizeFunction(
+ Function &F, llvm::function_ref<const DominatorTree &()> GetDT,
+ llvm::function_ref<const PostDominatorTree &()> GetPDT) {
if (&F == HwasanCtorFunction)
return false;
@@ -1311,18 +1503,36 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) {
SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
SmallVector<MemIntrinsic *, 16> IntrinToInstrument;
- SmallVector<AllocaInst *, 8> AllocasToInstrument;
+ MapVector<AllocaInst *, AllocaInfo> AllocasToInstrument;
SmallVector<Instruction *, 8> RetVec;
SmallVector<Instruction *, 8> LandingPadVec;
+ SmallVector<Instruction *, 4> UnrecognizedLifetimes;
DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> AllocaDbgMap;
for (auto &BB : F) {
for (auto &Inst : BB) {
- if (InstrumentStack)
+ if (InstrumentStack) {
if (AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
if (isInterestingAlloca(*AI))
- AllocasToInstrument.push_back(AI);
+ AllocasToInstrument.insert({AI, {}});
+ continue;
+ }
+ auto *II = dyn_cast<IntrinsicInst>(&Inst);
+ if (II && (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end)) {
+ AllocaInst *AI = findAllocaForValue(II->getArgOperand(1));
+ if (!AI) {
+ UnrecognizedLifetimes.push_back(&Inst);
+ continue;
+ }
+ if (!isInterestingAlloca(*AI))
+ continue;
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+ AllocasToInstrument[AI].LifetimeStart.push_back(II);
+ else
+ AllocasToInstrument[AI].LifetimeEnd.push_back(II);
continue;
}
+ }
if (isa<ReturnInst>(Inst) || isa<ResumeInst>(Inst) ||
isa<CleanupReturnInst>(Inst))
@@ -1343,7 +1553,8 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) {
getInterestingMemoryOperands(&Inst, OperandsToInstrument);
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&Inst))
- IntrinToInstrument.push_back(MI);
+ if (!ignoreMemIntrinsic(MI))
+ IntrinToInstrument.push_back(MI);
}
}
@@ -1377,38 +1588,14 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) {
if (!AllocasToInstrument.empty()) {
Value *StackTag =
ClGenerateTagsWithCalls ? nullptr : getStackBaseTag(EntryIRB);
- instrumentStack(AllocasToInstrument, AllocaDbgMap, RetVec, StackTag);
+ instrumentStack(AllocasToInstrument, UnrecognizedLifetimes, AllocaDbgMap,
+ RetVec, StackTag, GetDT, GetPDT);
}
// Pad and align each of the allocas that we instrumented to stop small
// uninteresting allocas from hiding in instrumented alloca's padding and so
// that we have enough space to store real tags for short granules.
- DenseMap<AllocaInst *, AllocaInst *> AllocaToPaddedAllocaMap;
- for (AllocaInst *AI : AllocasToInstrument) {
- uint64_t Size = getAllocaSizeInBytes(*AI);
- uint64_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
- AI->setAlignment(
- Align(std::max(AI->getAlignment(), Mapping.getObjectAlignment())));
- if (Size != AlignedSize) {
- Type *AllocatedType = AI->getAllocatedType();
- if (AI->isArrayAllocation()) {
- uint64_t ArraySize =
- cast<ConstantInt>(AI->getArraySize())->getZExtValue();
- AllocatedType = ArrayType::get(AllocatedType, ArraySize);
- }
- Type *TypeWithPadding = StructType::get(
- AllocatedType, ArrayType::get(Int8Ty, AlignedSize - Size));
- auto *NewAI = new AllocaInst(
- TypeWithPadding, AI->getType()->getAddressSpace(), nullptr, "", AI);
- NewAI->takeName(AI);
- NewAI->setAlignment(AI->getAlign());
- NewAI->setUsedWithInAlloca(AI->isUsedWithInAlloca());
- NewAI->setSwiftError(AI->isSwiftError());
- NewAI->copyMetadata(*AI);
- auto *Bitcast = new BitCastInst(NewAI, AI->getType(), "", AI);
- AI->replaceAllUsesWith(Bitcast);
- AllocaToPaddedAllocaMap[AI] = NewAI;
- }
- }
+ DenseMap<AllocaInst *, AllocaInst *> AllocaToPaddedAllocaMap =
+ padInterestingAllocas(AllocasToInstrument);
if (!AllocaToPaddedAllocaMap.empty()) {
for (auto &BB : F) {
@@ -1434,13 +1621,11 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) {
// dynamic allocas.
if (EntryIRB.GetInsertBlock() != &F.getEntryBlock()) {
InsertPt = &*F.getEntryBlock().begin();
- for (auto II = EntryIRB.GetInsertBlock()->begin(),
- IE = EntryIRB.GetInsertBlock()->end();
- II != IE;) {
- Instruction *I = &*II++;
- if (auto *AI = dyn_cast<AllocaInst>(I))
+ for (Instruction &I :
+ llvm::make_early_inc_range(*EntryIRB.GetInsertBlock())) {
+ if (auto *AI = dyn_cast<AllocaInst>(&I))
if (isa<ConstantInt>(AI->getArraySize()))
- I->moveBefore(InsertPt);
+ I.moveBefore(InsertPt);
}
}
@@ -1586,9 +1771,10 @@ void HWAddressSanitizer::instrumentGlobals() {
Hasher.update(M.getSourceFileName());
MD5::MD5Result Hash;
Hasher.final(Hash);
- uint8_t Tag = Hash[0] & TagMaskByte;
+ uint8_t Tag = Hash[0];
for (GlobalVariable *GV : Globals) {
+ Tag &= TagMaskByte;
// Skip tag 0 in order to avoid collisions with untagged memory.
if (Tag == 0)
Tag = 1;
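The instrumentGlobals() change above moves the TagMaskByte masking inside the per-global loop, so however the tag evolves from one global to the next (the update itself is outside the lines shown here), it always stays within the mask and tag 0 remains reserved for untagged memory. A toy model of the per-global selection; 0x3F is the x86_64 mask mentioned earlier in this file's diff, 0xFF the default:

#include <cstdint>
#include <cstdio>

uint8_t nextTag(uint8_t Tag, uint8_t TagMaskByte) {
  Tag &= TagMaskByte;
  if (Tag == 0)
    Tag = 1; // skip 0 to avoid colliding with untagged memory
  return Tag;
}

int main() {
  std::printf("%d\n", nextTag(0x40, 0x3F)); // 0x40 & 0x3F == 0 -> remapped to 1
}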
diff --git a/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp b/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp
index 071feb876540..3ea314329079 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp
@@ -1,9 +1,8 @@
//===- InstrOrderFile.cpp ---- Late IR instrumentation for order file ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 0d257bb6bd52..ad21fec269ec 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -446,13 +446,12 @@ bool InstrProfiling::lowerIntrinsics(Function *F) {
bool MadeChange = false;
PromotionCandidates.clear();
for (BasicBlock &BB : *F) {
- for (auto I = BB.begin(), E = BB.end(); I != E;) {
- auto Instr = I++;
- InstrProfIncrementInst *Inc = castToIncrementInst(&*Instr);
+ for (Instruction &Instr : llvm::make_early_inc_range(BB)) {
+ InstrProfIncrementInst *Inc = castToIncrementInst(&Instr);
if (Inc) {
lowerIncrement(Inc);
MadeChange = true;
- } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) {
+ } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(&Instr)) {
lowerValueProfileInst(Ind);
MadeChange = true;
}
@@ -520,6 +519,14 @@ void InstrProfiling::promoteCounterLoadStores(Function *F) {
}
}
+static bool needsRuntimeHookUnconditionally(const Triple &TT) {
+ // On Fuchsia, we only need the runtime hook if any counters are present.
+ if (TT.isOSFuchsia())
+ return false;
+
+ return true;
+}
+
/// Check if the module contains uses of any profiling intrinsics.
static bool containsProfilingIntrinsics(Module &M) {
if (auto *F = M.getFunction(
@@ -548,8 +555,11 @@ bool InstrProfiling::run(
UsedVars.clear();
TT = Triple(M.getTargetTriple());
+ bool MadeChange = false;
+
// Emit the runtime hook even if no counters are present.
- bool MadeChange = emitRuntimeHook();
+ if (needsRuntimeHookUnconditionally(TT))
+ MadeChange = emitRuntimeHook();
// Improve compile time by avoiding linear scans when there is no work.
GlobalVariable *CoverageNamesVar =
@@ -588,6 +598,7 @@ bool InstrProfiling::run(
emitVNodes();
emitNameData();
+ emitRuntimeHook();
emitRegistration();
emitUses();
emitInitialization();
@@ -692,7 +703,6 @@ void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) {
LoadInst *LI = dyn_cast<LoadInst>(&I);
if (!LI) {
IRBuilder<> Builder(&I);
- Type *Int64Ty = Type::getInt64Ty(M->getContext());
GlobalVariable *Bias = M->getGlobalVariable(getInstrProfCounterBiasVarName());
if (!Bias) {
// Compiler must define this variable when runtime counter relocation
@@ -747,14 +757,18 @@ void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
}
/// Get the name of a profiling variable for a particular function.
-static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix) {
+static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix,
+ bool &Renamed) {
StringRef NamePrefix = getInstrProfNameVarPrefix();
StringRef Name = Inc->getName()->getName().substr(NamePrefix.size());
Function *F = Inc->getParent()->getParent();
Module *M = F->getParent();
if (!DoHashBasedCounterSplit || !isIRPGOFlagSet(M) ||
- !canRenameComdatFunc(*F))
+ !canRenameComdatFunc(*F)) {
+ Renamed = false;
return (Prefix + Name).str();
+ }
+ Renamed = true;
uint64_t FuncHash = Inc->getHash()->getZExtValue();
SmallVector<char, 24> HashPostfix;
if (Name.endswith((Twine(".") + Twine(FuncHash)).toStringRef(HashPostfix)))
@@ -848,6 +862,15 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
GlobalValue::LinkageTypes Linkage = NamePtr->getLinkage();
GlobalValue::VisibilityTypes Visibility = NamePtr->getVisibility();
+ // Due to a limitation of the binder as of 2021/09/28, duplicate weak symbols
+ // in the same csect are not discarded. When there are duplicate weak symbols,
+ // we cannot guarantee that the relocations resolve to the intended weak
+ // symbol, so we cannot ensure the correctness of the relative CounterPtr and
+ // therefore have to use private linkage for counter and data symbols.
+ if (TT.isOSBinFormatXCOFF()) {
+ Linkage = GlobalValue::PrivateLinkage;
+ Visibility = GlobalValue::DefaultVisibility;
+ }
// Move the name variable to the right section. Place them in a COMDAT group
// if the associated function is a COMDAT. This will make sure that only one
// copy of counters of the COMDAT function will be emitted after linking. Keep
@@ -867,8 +890,11 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
// discarded.
bool DataReferencedByCode = profDataReferencedByCode(*M);
bool NeedComdat = needsComdatForCounter(*Fn, *M);
- std::string CntsVarName = getVarName(Inc, getInstrProfCountersVarPrefix());
- std::string DataVarName = getVarName(Inc, getInstrProfDataVarPrefix());
+ bool Renamed;
+ std::string CntsVarName =
+ getVarName(Inc, getInstrProfCountersVarPrefix(), Renamed);
+ std::string DataVarName =
+ getVarName(Inc, getInstrProfDataVarPrefix(), Renamed);
auto MaybeSetComdat = [&](GlobalVariable *GV) {
bool UseComdat = (NeedComdat || TT.isOSBinFormatELF());
if (UseComdat) {
@@ -909,7 +935,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
ArrayType *ValuesTy = ArrayType::get(Type::getInt64Ty(Ctx), NS);
auto *ValuesVar = new GlobalVariable(
*M, ValuesTy, false, Linkage, Constant::getNullValue(ValuesTy),
- getVarName(Inc, getInstrProfValuesVarPrefix()));
+ getVarName(Inc, getInstrProfValuesVarPrefix(), Renamed));
ValuesVar->setVisibility(Visibility);
ValuesVar->setSection(
getInstrProfSectionName(IPSK_vals, TT.getObjectFormat()));
@@ -920,6 +946,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
}
// Create data variable.
+ auto *IntPtrTy = M->getDataLayout().getIntPtrType(M->getContext());
auto *Int16Ty = Type::getInt16Ty(Ctx);
auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1);
Type *DataTypes[] = {
@@ -936,10 +963,6 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]);
- Constant *DataVals[] = {
-#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Init,
-#include "llvm/ProfileData/InstrProfData.inc"
- };
// If the data variable is not referenced by code (if we don't emit
// @llvm.instrprof.value.profile, NS will be 0), and the counter keeps the
// data variable live under linker GC, the data variable can be private. This
@@ -947,14 +970,30 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
//
// On COFF, a comdat leader cannot be local so we require DataReferencedByCode
// to be false.
- if (NS == 0 && (TT.isOSBinFormatELF() ||
- (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) {
+ //
+ // If profd is in a deduplicate comdat, NS==0 with a hash suffix guarantees
+ // that other copies must have the same CFG and cannot have value profiling.
+ // If no hash suffix, other profd copies may be referenced by code.
+ if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) &&
+ (TT.isOSBinFormatELF() ||
+ (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) {
Linkage = GlobalValue::PrivateLinkage;
Visibility = GlobalValue::DefaultVisibility;
}
auto *Data =
- new GlobalVariable(*M, DataTy, false, Linkage,
- ConstantStruct::get(DataTy, DataVals), DataVarName);
+ new GlobalVariable(*M, DataTy, false, Linkage, nullptr, DataVarName);
+ // Reference the counter variable with a label difference (link-time
+ // constant).
+ auto *RelativeCounterPtr =
+ ConstantExpr::getSub(ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy),
+ ConstantExpr::getPtrToInt(Data, IntPtrTy));
+
+ Constant *DataVals[] = {
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Init,
+#include "llvm/ProfileData/InstrProfData.inc"
+ };
+ Data->setInitializer(ConstantStruct::get(DataTy, DataVals));
+
Data->setVisibility(Visibility);
Data->setSection(getInstrProfSectionName(IPSK_data, TT.getObjectFormat()));
Data->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT));
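The data record above now refers to its counters through a label difference (ptrtoint(CounterPtr) - ptrtoint(Data)), a link-time constant, instead of an absolute pointer; a reader recovers the counter address by adding that offset back to the record's own address. A toy model of the round trip; the struct and field names here are made up, the real __profd_* layout comes from InstrProfData.inc:

#include <cstdio>

struct ProfDataStub {
  long long CounterOffset; // ptrtoint(CounterPtr) - ptrtoint(Data)
};

unsigned long long Counters[4];
ProfDataStub Data = {reinterpret_cast<long long>(&Counters[0]) -
                     reinterpret_cast<long long>(&Data)};

int main() {
  auto *C = reinterpret_cast<unsigned long long *>(
      reinterpret_cast<long long>(&Data) + Data.CounterOffset);
  std::printf("%d\n", C == &Counters[0]); // prints 1: the offset round-trips
}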
@@ -1035,7 +1074,7 @@ void InstrProfiling::emitNameData() {
std::string CompressedNameStr;
if (Error E = collectPGOFuncNameStrings(ReferencedNames, CompressedNameStr,
DoInstrProfNameCompression)) {
- report_fatal_error(toString(std::move(E)), false);
+ report_fatal_error(Twine(toString(std::move(E))), false);
}
auto &Ctx = M->getContext();
@@ -1102,9 +1141,9 @@ void InstrProfiling::emitRegistration() {
}
bool InstrProfiling::emitRuntimeHook() {
- // We expect the linker to be invoked with -u<hook_var> flag for Linux or
- // Fuchsia, in which case there is no need to emit the user function.
- if (TT.isOSLinux() || TT.isOSFuchsia())
+ // We expect the linker to be invoked with the -u<hook_var> flag on Linux,
+ // in which case there is no need to emit the external variable.
+ if (TT.isOSLinux())
return false;
// If the module's provided its own runtime, we don't need to do anything.
@@ -1117,23 +1156,28 @@ bool InstrProfiling::emitRuntimeHook() {
new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage,
nullptr, getInstrProfRuntimeHookVarName());
- // Make a function that uses it.
- auto *User = Function::Create(FunctionType::get(Int32Ty, false),
- GlobalValue::LinkOnceODRLinkage,
- getInstrProfRuntimeHookVarUseFuncName(), M);
- User->addFnAttr(Attribute::NoInline);
- if (Options.NoRedZone)
- User->addFnAttr(Attribute::NoRedZone);
- User->setVisibility(GlobalValue::HiddenVisibility);
- if (TT.supportsCOMDAT())
- User->setComdat(M->getOrInsertComdat(User->getName()));
-
- IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", User));
- auto *Load = IRB.CreateLoad(Int32Ty, Var);
- IRB.CreateRet(Load);
-
- // Mark the user variable as used so that it isn't stripped out.
- CompilerUsedVars.push_back(User);
+ if (TT.isOSBinFormatELF()) {
+ // Mark the user variable as used so that it isn't stripped out.
+ CompilerUsedVars.push_back(Var);
+ } else {
+ // Make a function that uses it.
+ auto *User = Function::Create(FunctionType::get(Int32Ty, false),
+ GlobalValue::LinkOnceODRLinkage,
+ getInstrProfRuntimeHookVarUseFuncName(), M);
+ User->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ User->addFnAttr(Attribute::NoRedZone);
+ User->setVisibility(GlobalValue::HiddenVisibility);
+ if (TT.supportsCOMDAT())
+ User->setComdat(M->getOrInsertComdat(User->getName()));
+
+ IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", User));
+ auto *Load = IRB.CreateLoad(Int32Ty, Var);
+ IRB.CreateRet(Load);
+
+ // Mark the function as used so that it isn't stripped out.
+ CompilerUsedVars.push_back(User);
+ }
return true;
}
@@ -1142,12 +1186,12 @@ void InstrProfiling::emitUses() {
// GlobalOpt/ConstantMerge) may not discard associated sections as a unit, so
// we conservatively retain all unconditionally in the compiler.
//
- // On ELF, the linker can guarantee the associated sections will be retained
- // or discarded as a unit, so llvm.compiler.used is sufficient. Similarly on
- // COFF, if prof data is not referenced by code we use one comdat and ensure
- // this GC property as well. Otherwise, we have to conservatively make all of
- // the sections retained by the linker.
- if (TT.isOSBinFormatELF() ||
+ // On ELF and Mach-O, the linker can guarantee the associated sections will be
+ // retained or discarded as a unit, so llvm.compiler.used is sufficient.
+ // Similarly on COFF, if prof data is not referenced by code we use one comdat
+ // and ensure this GC property as well. Otherwise, we have to conservatively
+ // make all of the sections retained by the linker.
+ if (TT.isOSBinFormatELF() || TT.isOSBinFormatMachO() ||
(TT.isOSBinFormatCOFF() && !profDataReferencedByCode(*M)))
appendToCompilerUsed(*M, CompilerUsedVars);
else
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
index 0e6a404a9e0b..727672fa0605 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -19,6 +19,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
@@ -107,6 +108,10 @@ static cl::opt<int>
cl::desc("granularity of memprof shadow mapping"),
cl::Hidden, cl::init(DefaultShadowGranularity));
+static cl::opt<bool> ClStack("memprof-instrument-stack",
+ cl::desc("Instrument scalar stack variables"),
+ cl::Hidden, cl::init(false));
+
// Debug flags.
static cl::opt<int> ClDebug("memprof-debug", cl::desc("debug"), cl::Hidden,
@@ -123,6 +128,8 @@ static cl::opt<int> ClDebugMax("memprof-debug-max", cl::desc("Debug max inst"),
STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
+STATISTIC(NumSkippedStackReads, "Number of non-instrumented stack reads");
+STATISTIC(NumSkippedStackWrites, "Number of non-instrumented stack writes");
namespace {
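The new statistics above count accesses that the -memprof-instrument-stack flag (default off) leaves uninstrumented: in the instrumentMop hunk further down, any access whose address is traced back to an alloca by getUnderlyingObject() is skipped and only tallied. A sketch of the source pattern affected:

int stackOnly() {
  int local[8] = {0};
  local[3] = 7;    // counted in NumSkippedStackWrites, not instrumented
  return local[3]; // counted in NumSkippedStackReads
}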
@@ -255,8 +262,6 @@ PreservedAnalyses MemProfilerPass::run(Function &F,
if (Profiler.instrumentFunction(F))
return PreservedAnalyses::none();
return PreservedAnalyses::all();
-
- return PreservedAnalyses::all();
}
ModuleMemProfilerPass::ModuleMemProfilerPass() {}
@@ -448,6 +453,15 @@ void MemProfiler::instrumentMaskedLoadOrStore(const DataLayout &DL, Value *Mask,
void MemProfiler::instrumentMop(Instruction *I, const DataLayout &DL,
InterestingMemoryAccess &Access) {
+ // Skip instrumentation of stack accesses unless requested.
+ if (!ClStack && isa<AllocaInst>(getUnderlyingObject(Access.Addr))) {
+ if (Access.IsWrite)
+ ++NumSkippedStackWrites;
+ else
+ ++NumSkippedStackReads;
+ return;
+ }
+
if (Access.IsWrite)
NumInstrumentedWrites++;
else
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 4e755bab15f3..4d15b784f486 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -673,14 +673,27 @@ PreservedAnalyses MemorySanitizerPass::run(Function &F,
return PreservedAnalyses::all();
}
-PreservedAnalyses MemorySanitizerPass::run(Module &M,
- ModuleAnalysisManager &AM) {
+PreservedAnalyses
+ModuleMemorySanitizerPass::run(Module &M, ModuleAnalysisManager &AM) {
if (Options.Kernel)
return PreservedAnalyses::all();
insertModuleCtor(M);
return PreservedAnalyses::none();
}
+void MemorySanitizerPass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<MemorySanitizerPass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+ OS << "<";
+ if (Options.Recover)
+ OS << "recover;";
+ if (Options.Kernel)
+ OS << "kernel;";
+ OS << "track-origins=" << Options.TrackOrigins;
+ OS << ">";
+}
+
char MemorySanitizerLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(MemorySanitizerLegacyPass, "msan",
@@ -1695,7 +1708,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (FArgEagerCheck) {
*ShadowPtr = getCleanShadow(V);
setOrigin(A, getCleanOrigin());
- continue;
+ break;
} else if (FArgByVal) {
Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
// ByVal pointer itself has clean shadow. We copy the actual
@@ -1745,8 +1758,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
break;
}
- if (!FArgEagerCheck)
- ArgOffset += alignTo(Size, kShadowTLSAlignment);
+ ArgOffset += alignTo(Size, kShadowTLSAlignment);
}
assert(*ShadowPtr && "Could not find shadow for an argument");
return *ShadowPtr;
@@ -2661,7 +2673,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
RetTy->isX86_MMXTy()))
return false;
- unsigned NumArgOperands = I.getNumArgOperands();
+ unsigned NumArgOperands = I.arg_size();
for (unsigned i = 0; i < NumArgOperands; ++i) {
Type *Ty = I.getArgOperand(i)->getType();
if (Ty != RetTy)
@@ -2688,7 +2700,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/// We special-case intrinsics where this approach fails. See llvm.bswap
/// handling as an example of that.
bool handleUnknownIntrinsic(IntrinsicInst &I) {
- unsigned NumArgOperands = I.getNumArgOperands();
+ unsigned NumArgOperands = I.arg_size();
if (NumArgOperands == 0)
return false;
@@ -2762,10 +2774,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *CopyOp, *ConvertOp;
assert((!HasRoundingMode ||
- isa<ConstantInt>(I.getArgOperand(I.getNumArgOperands() - 1))) &&
+ isa<ConstantInt>(I.getArgOperand(I.arg_size() - 1))) &&
"Invalid rounding mode");
- switch (I.getNumArgOperands() - HasRoundingMode) {
+ switch (I.arg_size() - HasRoundingMode) {
case 2:
CopyOp = I.getArgOperand(0);
ConvertOp = I.getArgOperand(1);
@@ -2854,7 +2866,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// size, and the rest is ignored. Behavior is defined even if shift size is
// greater than register (or field) width.
void handleVectorShiftIntrinsic(IntrinsicInst &I, bool Variable) {
- assert(I.getNumArgOperands() == 2);
+ assert(I.arg_size() == 2);
IRBuilder<> IRB(&I);
// If any of the S2 bits are poisoned, the whole thing is poisoned.
// Otherwise perform the same shift on S1.
@@ -2919,7 +2931,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer).
// EltSizeInBits is used only for x86mmx arguments.
void handleVectorPackIntrinsic(IntrinsicInst &I, unsigned EltSizeInBits = 0) {
- assert(I.getNumArgOperands() == 2);
+ assert(I.arg_size() == 2);
bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
IRBuilder<> IRB(&I);
Value *S1 = getShadow(&I, 0);
@@ -3653,9 +3665,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
.addAttribute(Attribute::ArgMemOnly)
.addAttribute(Attribute::Speculatable);
- Call->removeAttributes(AttributeList::FunctionIndex, B);
+ Call->removeFnAttrs(B);
if (Function *Func = Call->getCalledFunction()) {
- Func->removeAttributes(AttributeList::FunctionIndex, B);
+ Func->removeFnAttrs(B);
}
maybeMarkSanitizerLibraryCallNoBuiltin(Call, TLI);
@@ -3696,42 +3708,48 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (EagerCheck) {
insertShadowCheck(A, &CB);
- continue;
- }
- if (ByVal) {
- // ByVal requires some special handling as it's too big for a single
- // load
- assert(A->getType()->isPointerTy() &&
- "ByVal argument is not a pointer!");
- Size = DL.getTypeAllocSize(CB.getParamByValType(i));
- if (ArgOffset + Size > kParamTLSSize) break;
- const MaybeAlign ParamAlignment(CB.getParamAlign(i));
- MaybeAlign Alignment = llvm::None;
- if (ParamAlignment)
- Alignment = std::min(*ParamAlignment, kShadowTLSAlignment);
- Value *AShadowPtr =
- getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), Alignment,
- /*isStore*/ false)
- .first;
-
- Store = IRB.CreateMemCpy(ArgShadowBase, Alignment, AShadowPtr,
- Alignment, Size);
- // TODO(glider): need to copy origins.
- } else {
- // Any other parameters mean we need bit-grained tracking of uninit data
Size = DL.getTypeAllocSize(A->getType());
- if (ArgOffset + Size > kParamTLSSize) break;
- Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase,
- kShadowTLSAlignment);
- Constant *Cst = dyn_cast<Constant>(ArgShadow);
- if (Cst && Cst->isNullValue()) ArgIsInitialized = true;
+ } else {
+ if (ByVal) {
+ // ByVal requires some special handling as it's too big for a single
+ // load
+ assert(A->getType()->isPointerTy() &&
+ "ByVal argument is not a pointer!");
+ Size = DL.getTypeAllocSize(CB.getParamByValType(i));
+ if (ArgOffset + Size > kParamTLSSize)
+ break;
+ const MaybeAlign ParamAlignment(CB.getParamAlign(i));
+ MaybeAlign Alignment = llvm::None;
+ if (ParamAlignment)
+ Alignment = std::min(*ParamAlignment, kShadowTLSAlignment);
+ Value *AShadowPtr =
+ getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), Alignment,
+ /*isStore*/ false)
+ .first;
+
+ Store = IRB.CreateMemCpy(ArgShadowBase, Alignment, AShadowPtr,
+ Alignment, Size);
+ // TODO(glider): need to copy origins.
+ } else {
+ // Any other parameters mean we need bit-grained tracking of uninit
+ // data
+ Size = DL.getTypeAllocSize(A->getType());
+ if (ArgOffset + Size > kParamTLSSize)
+ break;
+ Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase,
+ kShadowTLSAlignment);
+ Constant *Cst = dyn_cast<Constant>(ArgShadow);
+ if (Cst && Cst->isNullValue())
+ ArgIsInitialized = true;
+ }
+ if (MS.TrackOrigins && !ArgIsInitialized)
+ IRB.CreateStore(getOrigin(A),
+ getOriginPtrForArgument(A, IRB, ArgOffset));
+ (void)Store;
+ assert(Store != nullptr);
+ LLVM_DEBUG(dbgs() << " Param:" << *Store << "\n");
}
- if (MS.TrackOrigins && !ArgIsInitialized)
- IRB.CreateStore(getOrigin(A),
- getOriginPtrForArgument(A, IRB, ArgOffset));
- (void)Store;
- assert(Size != 0 && Store != nullptr);
- LLVM_DEBUG(dbgs() << " Param:" << *Store << "\n");
+ assert(Size != 0);
ArgOffset += alignTo(Size, kShadowTLSAlignment);
}
LLVM_DEBUG(dbgs() << " done with call args\n");
@@ -3807,7 +3825,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (isAMustTailRetVal(RetVal)) return;
Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
bool HasNoUndef =
- F.hasAttribute(AttributeList::ReturnIndex, Attribute::NoUndef);
+ F.hasRetAttribute(Attribute::NoUndef);
bool StoreShadow = !(ClEagerChecks && HasNoUndef);
// FIXME: Consider using SpecialCaseList to specify a list of functions that
// must always return fully initialized values. For now, we hardcode "main".
@@ -4176,7 +4194,7 @@ struct VarArgAMD64Helper : public VarArgHelper {
MemorySanitizerVisitor &MSV)
: F(F), MS(MS), MSV(MSV) {
AMD64FpEndOffset = AMD64FpEndOffsetSSE;
- for (const auto &Attr : F.getAttributes().getFnAttributes()) {
+ for (const auto &Attr : F.getAttributes().getFnAttrs()) {
if (Attr.isStringAttribute() &&
(Attr.getKindAsString() == "target-features")) {
if (Attr.getValueAsString().contains("-sse"))
@@ -5330,6 +5348,9 @@ bool MemorySanitizer::sanitizeFunction(Function &F, TargetLibraryInfo &TLI) {
if (!CompileKernel && F.getName() == kMsanModuleCtorName)
return false;
+ if (F.hasFnAttribute(Attribute::DisableSanitizerInstrumentation))
+ return false;
+
MemorySanitizerVisitor Visitor(F, *this, TLI);
// Clear out readonly/readnone attributes.
@@ -5339,7 +5360,7 @@ bool MemorySanitizer::sanitizeFunction(Function &F, TargetLibraryInfo &TLI) {
.addAttribute(Attribute::WriteOnly)
.addAttribute(Attribute::ArgMemOnly)
.addAttribute(Attribute::Speculatable);
- F.removeAttributes(AttributeList::FunctionIndex, B);
+ F.removeFnAttrs(B);
return Visitor.runOnFunction();
}
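
The MemorySanitizer hunks above are mostly a mechanical migration to the newer attribute helpers, plus a new early exit for functions that opt out of sanitizer instrumentation. A minimal sketch of those three calls against the API as of this patch (the helper name and the attributes placed in the AttrBuilder are illustrative, not the pass's exact list):

#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"

// Sketch only: the renamed attribute helpers used in the hunks above.
static bool prepareFunctionForMsan(llvm::Function &F) {
  // New bail-out: respect the per-function sanitizer opt-out attribute.
  if (F.hasFnAttribute(llvm::Attribute::DisableSanitizerInstrumentation))
    return false;

  // removeAttributes(AttributeList::FunctionIndex, B) -> removeFnAttrs(B).
  llvm::AttrBuilder B;
  B.addAttribute(llvm::Attribute::ReadOnly);
  B.addAttribute(llvm::Attribute::ReadNone);
  F.removeFnAttrs(B);

  // hasAttribute(AttributeList::ReturnIndex, ...) -> hasRetAttribute(...).
  bool HasNoUndefRet = F.hasRetAttribute(llvm::Attribute::NoUndef);
  (void)HasNoUndefRet; // Queried here only to show the new accessor.
  return true;
}
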
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 3d9261eb99ba..af5946325bbb 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -110,6 +110,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -198,12 +199,14 @@ static cl::opt<bool>
"warnings about missing profile data for "
"functions."));
+namespace llvm {
// Command line option to enable/disable the warning about a hash mismatch in
// the profile data.
-static cl::opt<bool>
+cl::opt<bool>
NoPGOWarnMismatch("no-pgo-warn-mismatch", cl::init(false), cl::Hidden,
cl::desc("Use this option to turn off/on "
"warnings about profile cfg mismatch."));
+} // namespace llvm
// Command line option to enable/disable the warning about a hash mismatch in
// the profile data for Comdat functions, which often turns out to be false
@@ -462,7 +465,10 @@ public:
private:
bool runOnModule(Module &M) override {
createProfileFileNameVar(M, InstrProfileOutput);
- createIRLevelProfileFlagVar(M, /* IsCS */ true, PGOInstrumentEntry);
+ // The variable in a comdat may be discarded by LTO. Ensure the
+ // declaration will be retained.
+ appendToCompilerUsed(
+ M, createIRLevelProfileFlagVar(M, /*IsCS=*/true, PGOInstrumentEntry));
return false;
}
std::string InstrProfileOutput;
@@ -1610,7 +1616,7 @@ static bool InstrumentAllFunctions(
// For the context-sensitve instrumentation, we should have a separated pass
// (before LTO/ThinLTO linking) to create these variables.
if (!IsCS)
- createIRLevelProfileFlagVar(M, /* IsCS */ false, PGOInstrumentEntry);
+ createIRLevelProfileFlagVar(M, /*IsCS=*/false, PGOInstrumentEntry);
std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
collectComdatMembers(M, ComdatMembers);
@@ -1630,7 +1636,10 @@ static bool InstrumentAllFunctions(
PreservedAnalyses
PGOInstrumentationGenCreateVar::run(Module &M, ModuleAnalysisManager &AM) {
createProfileFileNameVar(M, CSInstrName);
- createIRLevelProfileFlagVar(M, /* IsCS */ true, PGOInstrumentEntry);
+ // The variable in a comdat may be discarded by LTO. Ensure the declaration
+ // will be retained.
+ appendToCompilerUsed(
+ M, createIRLevelProfileFlagVar(M, /*IsCS=*/true, PGOInstrumentEntry));
return PreservedAnalyses::all();
}
@@ -1677,7 +1686,7 @@ static void fixFuncEntryCount(PGOUseFunc &Func, LoopInfo &LI,
BlockFrequencyInfo NBFI(F, NBPI, LI);
#ifndef NDEBUG
auto BFIEntryCount = F.getEntryCount();
- assert(BFIEntryCount.hasValue() && (BFIEntryCount.getCount() > 0) &&
+ assert(BFIEntryCount.hasValue() && (BFIEntryCount->getCount() > 0) &&
"Invalid BFI Entrycount");
#endif
auto SumCount = APFloat::getZero(APFloat::IEEEdouble());
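
Because the profile-flag variable lives in a comdat, LTO can discard it together with its group; adding it to llvm.compiler.used via appendToCompilerUsed keeps the symbol alive, which is what the two hunks above do. A rough sketch of that pattern with a hypothetical flag variable (name, type, and linkage are illustrative):

#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"

// Sketch: create a comdat'ed flag variable and pin it via @llvm.compiler.used
// so LTO symbol GC cannot discard it. The name below is hypothetical.
static llvm::GlobalVariable *createPinnedFlag(llvm::Module &M) {
  auto *Int64Ty = llvm::Type::getInt64Ty(M.getContext());
  auto *GV = new llvm::GlobalVariable(
      M, Int64Ty, /*isConstant=*/true, llvm::GlobalValue::WeakAnyLinkage,
      llvm::ConstantInt::get(Int64Ty, 0), "__example_profile_flag");
  GV->setComdat(M.getOrInsertComdat(GV->getName()));
  llvm::appendToCompilerUsed(M, GV); // Adds GV to @llvm.compiler.used.
  return GV;
}
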
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 7607464cc0b9..da8ee1f15bf8 100644
--- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -55,6 +55,16 @@ const char SanCovTraceConstCmp1[] = "__sanitizer_cov_trace_const_cmp1";
const char SanCovTraceConstCmp2[] = "__sanitizer_cov_trace_const_cmp2";
const char SanCovTraceConstCmp4[] = "__sanitizer_cov_trace_const_cmp4";
const char SanCovTraceConstCmp8[] = "__sanitizer_cov_trace_const_cmp8";
+const char SanCovLoad1[] = "__sanitizer_cov_load1";
+const char SanCovLoad2[] = "__sanitizer_cov_load2";
+const char SanCovLoad4[] = "__sanitizer_cov_load4";
+const char SanCovLoad8[] = "__sanitizer_cov_load8";
+const char SanCovLoad16[] = "__sanitizer_cov_load16";
+const char SanCovStore1[] = "__sanitizer_cov_store1";
+const char SanCovStore2[] = "__sanitizer_cov_store2";
+const char SanCovStore4[] = "__sanitizer_cov_store4";
+const char SanCovStore8[] = "__sanitizer_cov_store8";
+const char SanCovStore16[] = "__sanitizer_cov_store16";
const char SanCovTraceDiv4[] = "__sanitizer_cov_trace_div4";
const char SanCovTraceDiv8[] = "__sanitizer_cov_trace_div8";
const char SanCovTraceGep[] = "__sanitizer_cov_trace_gep";
@@ -122,6 +132,14 @@ static cl::opt<bool> ClDIVTracing("sanitizer-coverage-trace-divs",
cl::desc("Tracing of DIV instructions"),
cl::Hidden, cl::init(false));
+static cl::opt<bool> ClLoadTracing("sanitizer-coverage-trace-loads",
+ cl::desc("Tracing of load instructions"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClStoreTracing("sanitizer-coverage-trace-stores",
+ cl::desc("Tracing of store instructions"),
+ cl::Hidden, cl::init(false));
+
static cl::opt<bool> ClGEPTracing("sanitizer-coverage-trace-geps",
cl::desc("Tracing of GEP instructions"),
cl::Hidden, cl::init(false));
@@ -175,9 +193,11 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) {
Options.PCTable |= ClCreatePCTable;
Options.NoPrune |= !ClPruneBlocks;
Options.StackDepth |= ClStackDepth;
+ Options.TraceLoads |= ClLoadTracing;
+ Options.TraceStores |= ClStoreTracing;
if (!Options.TracePCGuard && !Options.TracePC &&
!Options.Inline8bitCounters && !Options.StackDepth &&
- !Options.InlineBoolFlag)
+ !Options.InlineBoolFlag && !Options.TraceLoads && !Options.TraceStores)
Options.TracePCGuard = true; // TracePCGuard is default.
return Options;
}
@@ -207,6 +227,8 @@ private:
ArrayRef<BinaryOperator *> DivTraceTargets);
void InjectTraceForGep(Function &F,
ArrayRef<GetElementPtrInst *> GepTraceTargets);
+ void InjectTraceForLoadsAndStores(Function &F, ArrayRef<LoadInst *> Loads,
+ ArrayRef<StoreInst *> Stores);
void InjectTraceForSwitch(Function &F,
ArrayRef<Instruction *> SwitchTraceTargets);
bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks,
@@ -234,14 +256,17 @@ private:
std::string getSectionEnd(const std::string &Section) const;
FunctionCallee SanCovTracePCIndir;
FunctionCallee SanCovTracePC, SanCovTracePCGuard;
- FunctionCallee SanCovTraceCmpFunction[4];
- FunctionCallee SanCovTraceConstCmpFunction[4];
- FunctionCallee SanCovTraceDivFunction[2];
+ std::array<FunctionCallee, 4> SanCovTraceCmpFunction;
+ std::array<FunctionCallee, 4> SanCovTraceConstCmpFunction;
+ std::array<FunctionCallee, 5> SanCovLoadFunction;
+ std::array<FunctionCallee, 5> SanCovStoreFunction;
+ std::array<FunctionCallee, 2> SanCovTraceDivFunction;
FunctionCallee SanCovTraceGepFunction;
FunctionCallee SanCovTraceSwitchFunction;
GlobalVariable *SanCovLowestStack;
- Type *IntptrTy, *IntptrPtrTy, *Int64Ty, *Int64PtrTy, *Int32Ty, *Int32PtrTy,
- *Int16Ty, *Int8Ty, *Int8PtrTy, *Int1Ty, *Int1PtrTy;
+ Type *Int128PtrTy, *IntptrTy, *IntptrPtrTy, *Int64Ty, *Int64PtrTy, *Int32Ty,
+ *Int32PtrTy, *Int16PtrTy, *Int16Ty, *Int8Ty, *Int8PtrTy, *Int1Ty,
+ *Int1PtrTy;
Module *CurModule;
std::string CurModuleUniqueId;
Triple TargetTriple;
@@ -411,7 +436,9 @@ bool ModuleSanitizerCoverage::instrumentModule(
IntptrPtrTy = PointerType::getUnqual(IntptrTy);
Type *VoidTy = Type::getVoidTy(*C);
IRBuilder<> IRB(*C);
+ Int128PtrTy = PointerType::getUnqual(IRB.getInt128Ty());
Int64PtrTy = PointerType::getUnqual(IRB.getInt64Ty());
+ Int16PtrTy = PointerType::getUnqual(IRB.getInt16Ty());
Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty());
Int1PtrTy = PointerType::getUnqual(IRB.getInt1Ty());
@@ -452,6 +479,28 @@ bool ModuleSanitizerCoverage::instrumentModule(
SanCovTraceConstCmpFunction[3] =
M.getOrInsertFunction(SanCovTraceConstCmp8, VoidTy, Int64Ty, Int64Ty);
+ // Loads.
+ SanCovLoadFunction[0] = M.getOrInsertFunction(SanCovLoad1, VoidTy, Int8PtrTy);
+ SanCovLoadFunction[1] =
+ M.getOrInsertFunction(SanCovLoad2, VoidTy, Int16PtrTy);
+ SanCovLoadFunction[2] =
+ M.getOrInsertFunction(SanCovLoad4, VoidTy, Int32PtrTy);
+ SanCovLoadFunction[3] =
+ M.getOrInsertFunction(SanCovLoad8, VoidTy, Int64PtrTy);
+ SanCovLoadFunction[4] =
+ M.getOrInsertFunction(SanCovLoad16, VoidTy, Int128PtrTy);
+ // Stores.
+ SanCovStoreFunction[0] =
+ M.getOrInsertFunction(SanCovStore1, VoidTy, Int8PtrTy);
+ SanCovStoreFunction[1] =
+ M.getOrInsertFunction(SanCovStore2, VoidTy, Int16PtrTy);
+ SanCovStoreFunction[2] =
+ M.getOrInsertFunction(SanCovStore4, VoidTy, Int32PtrTy);
+ SanCovStoreFunction[3] =
+ M.getOrInsertFunction(SanCovStore8, VoidTy, Int64PtrTy);
+ SanCovStoreFunction[4] =
+ M.getOrInsertFunction(SanCovStore16, VoidTy, Int128PtrTy);
+
{
AttributeList AL;
AL = AL.addParamAttribute(*C, 0, Attribute::ZExt);
@@ -632,6 +681,8 @@ void ModuleSanitizerCoverage::instrumentFunction(
SmallVector<Instruction *, 8> SwitchTraceTargets;
SmallVector<BinaryOperator *, 8> DivTraceTargets;
SmallVector<GetElementPtrInst *, 8> GepTraceTargets;
+ SmallVector<LoadInst *, 8> Loads;
+ SmallVector<StoreInst *, 8> Stores;
const DominatorTree *DT = DTCallback(F);
const PostDominatorTree *PDT = PDTCallback(F);
@@ -661,6 +712,12 @@ void ModuleSanitizerCoverage::instrumentFunction(
if (Options.TraceGep)
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Inst))
GepTraceTargets.push_back(GEP);
+ if (Options.TraceLoads)
+ if (LoadInst *LI = dyn_cast<LoadInst>(&Inst))
+ Loads.push_back(LI);
+ if (Options.TraceStores)
+ if (StoreInst *SI = dyn_cast<StoreInst>(&Inst))
+ Stores.push_back(SI);
if (Options.StackDepth)
if (isa<InvokeInst>(Inst) ||
(isa<CallInst>(Inst) && !isa<IntrinsicInst>(Inst)))
@@ -674,6 +731,7 @@ void ModuleSanitizerCoverage::instrumentFunction(
InjectTraceForSwitch(F, SwitchTraceTargets);
InjectTraceForDiv(F, DivTraceTargets);
InjectTraceForGep(F, GepTraceTargets);
+ InjectTraceForLoadsAndStores(F, Loads, Stores);
}
GlobalVariable *ModuleSanitizerCoverage::CreateFunctionLocalArrayInSection(
@@ -857,6 +915,40 @@ void ModuleSanitizerCoverage::InjectTraceForGep(
}
}
+void ModuleSanitizerCoverage::InjectTraceForLoadsAndStores(
+ Function &, ArrayRef<LoadInst *> Loads, ArrayRef<StoreInst *> Stores) {
+ auto CallbackIdx = [&](const Value *Ptr) -> int {
+ auto ElementTy = cast<PointerType>(Ptr->getType())->getElementType();
+ uint64_t TypeSize = DL->getTypeStoreSizeInBits(ElementTy);
+ return TypeSize == 8 ? 0
+ : TypeSize == 16 ? 1
+ : TypeSize == 32 ? 2
+ : TypeSize == 64 ? 3
+ : TypeSize == 128 ? 4
+ : -1;
+ };
+ Type *PointerType[5] = {Int8PtrTy, Int16PtrTy, Int32PtrTy, Int64PtrTy,
+ Int128PtrTy};
+ for (auto LI : Loads) {
+ IRBuilder<> IRB(LI);
+ auto Ptr = LI->getPointerOperand();
+ int Idx = CallbackIdx(Ptr);
+ if (Idx < 0)
+ continue;
+ IRB.CreateCall(SanCovLoadFunction[Idx],
+ IRB.CreatePointerCast(Ptr, PointerType[Idx]));
+ }
+ for (auto SI : Stores) {
+ IRBuilder<> IRB(SI);
+ auto Ptr = SI->getPointerOperand();
+ int Idx = CallbackIdx(Ptr);
+ if (Idx < 0)
+ continue;
+ IRB.CreateCall(SanCovStoreFunction[Idx],
+ IRB.CreatePointerCast(Ptr, PointerType[Idx]));
+ }
+}
+
void ModuleSanitizerCoverage::InjectTraceForCmp(
Function &, ArrayRef<Instruction *> CmpTraceTargets) {
for (auto I : CmpTraceTargets) {
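
With -sanitizer-coverage-trace-loads / -sanitizer-coverage-trace-stores, each instrumented access now calls one of the __sanitizer_cov_loadN / __sanitizer_cov_storeN functions declared above, passing the accessed pointer cast to the matching width. The runtime linked into the target is expected to define these symbols; a minimal stub sketch, with signatures inferred from the getOrInsertFunction declarations above (the printf bodies are placeholders):

#include <cstdint>
#include <cstdio>

// Placeholder runtime stubs for the new load/store tracing hooks. Each hook
// receives the address being accessed; the access width is in the name.
extern "C" void __sanitizer_cov_load1(uint8_t *Addr)  { std::printf("load1  %p\n", (void *)Addr); }
extern "C" void __sanitizer_cov_load2(uint16_t *Addr) { std::printf("load2  %p\n", (void *)Addr); }
extern "C" void __sanitizer_cov_load4(uint32_t *Addr) { std::printf("load4  %p\n", (void *)Addr); }
extern "C" void __sanitizer_cov_load8(uint64_t *Addr) { std::printf("load8  %p\n", (void *)Addr); }
extern "C" void __sanitizer_cov_load16(__int128 *Addr) { std::printf("load16 %p\n", (void *)Addr); }
extern "C" void __sanitizer_cov_store1(uint8_t *Addr) { std::printf("store1 %p\n", (void *)Addr); }
// __sanitizer_cov_store2/4/8/16 follow the same pattern with wider types.
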
diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 063999a68236..f98e39d751f4 100644
--- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -206,8 +206,8 @@ PreservedAnalyses ThreadSanitizerPass::run(Function &F,
return PreservedAnalyses::all();
}
-PreservedAnalyses ThreadSanitizerPass::run(Module &M,
- ModuleAnalysisManager &MAM) {
+PreservedAnalyses ModuleThreadSanitizerPass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
insertModuleCtor(M);
return PreservedAnalyses::none();
}
@@ -249,8 +249,7 @@ void ThreadSanitizer::initialize(Module &M) {
IRBuilder<> IRB(M.getContext());
AttributeList Attr;
- Attr = Attr.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::NoUnwind);
+ Attr = Attr.addFnAttribute(M.getContext(), Attribute::NoUnwind);
// Initialize the callbacks.
TsanFuncEntry = M.getOrInsertFunction("__tsan_func_entry", Attr,
IRB.getVoidTy(), IRB.getInt8PtrTy());
@@ -563,6 +562,12 @@ bool ThreadSanitizer::sanitizeFunction(Function &F,
// all.
if (F.hasFnAttribute(Attribute::Naked))
return false;
+
+ // __attribute__((disable_sanitizer_instrumentation)) prevents all kinds of
+ // instrumentation.
+ if (F.hasFnAttribute(Attribute::DisableSanitizerInstrumentation))
+ return false;
+
initialize(*F.getParent());
SmallVector<InstructionInfo, 8> AllLoadsAndStores;
SmallVector<Instruction*, 8> LocalLoadsAndStores;
@@ -580,7 +585,8 @@ bool ThreadSanitizer::sanitizeFunction(Function &F,
AtomicAccesses.push_back(&Inst);
else if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
LocalLoadsAndStores.push_back(&Inst);
- else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) {
+ else if ((isa<CallInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst)) ||
+ isa<InvokeInst>(Inst)) {
if (CallInst *CI = dyn_cast<CallInst>(&Inst))
maybeMarkSanitizerLibraryCallNoBuiltin(CI, &TLI);
if (isa<MemIntrinsic>(Inst))
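
Like the MemorySanitizer change earlier in this patch, ThreadSanitizer now bails out on functions carrying the DisableSanitizerInstrumentation attribute, which Clang attaches for the source-level attribute of the same name. A hypothetical usage sketch (the function below is not from this patch):

// Hypothetical example: the body below stays free of __tsan_* / __msan_*
// calls in a -fsanitize=thread or -fsanitize=memory build, because Clang
// lowers the attribute to DisableSanitizerInstrumentation on the IR function.
__attribute__((disable_sanitizer_instrumentation))
void raw_runtime_hook(volatile long *Counter) {
  *Counter += 1; // Left uninstrumented by TSan and MSan.
}
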
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp
index 06b12149f597..1ca6ddabac5b 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp
@@ -103,9 +103,8 @@ CallInst *BundledRetainClaimRVs::insertRVCallWithColors(
Instruction *InsertPt, CallBase *AnnotatedCall,
const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
IRBuilder<> Builder(InsertPt);
- bool IsRetainRV = objcarc::hasAttachedCallOpBundle(AnnotatedCall, true);
- Function *Func = EP.get(IsRetainRV ? ARCRuntimeEntryPointKind::RetainRV
- : ARCRuntimeEntryPointKind::ClaimRV);
+ Function *Func = *objcarc::getAttachedARCFunction(AnnotatedCall);
+ assert(Func && "operand isn't a Function");
Type *ParamTy = Func->getArg(0)->getType();
Value *CallArg = Builder.CreateBitCast(AnnotatedCall, ParamTy);
auto *Call =
@@ -115,16 +114,28 @@ CallInst *BundledRetainClaimRVs::insertRVCallWithColors(
}
BundledRetainClaimRVs::~BundledRetainClaimRVs() {
- if (ContractPass) {
- // At this point, we know that the annotated calls can't be tail calls as
- // they are followed by marker instructions and retainRV/claimRV calls. Mark
- // them as notail, so that the backend knows these calls can't be tail
- // calls.
- for (auto P : RVCalls)
- if (auto *CI = dyn_cast<CallInst>(P.second))
+ for (auto P : RVCalls) {
+ if (ContractPass) {
+ CallBase *CB = P.second;
+ // At this point, we know that the annotated calls can't be tail calls
+ // as they are followed by marker instructions and retainRV/claimRV
+ // calls. Mark them as notail so that the backend knows these calls
+ // can't be tail calls.
+ if (auto *CI = dyn_cast<CallInst>(CB))
CI->setTailCallKind(CallInst::TCK_NoTail);
- } else {
- for (auto P : RVCalls)
+
+ if (UseMarker) {
+ // Remove the retainRV/claimRV function operand from the operand bundle
+ // to reflect the fact that the backend is responsible for emitting only
+ // the marker instruction, but not the retainRV/claimRV call.
+ OperandBundleDef OB("clang.arc.attachedcall", None);
+ auto *NewCB = CallBase::Create(CB, OB, CB);
+ CB->replaceAllUsesWith(NewCB);
+ CB->eraseFromParent();
+ }
+ }
+
+ if (!ContractPass || !UseMarker)
EraseInstruction(P.first);
}
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARC.h b/llvm/lib/Transforms/ObjCARC/ObjCARC.h
index 1f9d76969bfd..2b47bec7ffe8 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARC.h
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARC.h
@@ -105,8 +105,8 @@ CallInst *createCallInstWithColors(
class BundledRetainClaimRVs {
public:
- BundledRetainClaimRVs(ARCRuntimeEntryPoints &P, bool ContractPass)
- : EP(P), ContractPass(ContractPass) {}
+ BundledRetainClaimRVs(bool ContractPass, bool UseMarker)
+ : ContractPass(ContractPass), UseMarker(UseMarker) {}
~BundledRetainClaimRVs();
/// Insert a retainRV/claimRV call to the normal destination blocks of invokes
@@ -155,8 +155,10 @@ private:
/// A map of inserted retainRV/claimRV calls to annotated calls/invokes.
DenseMap<CallInst *, CallBase *> RVCalls;
- ARCRuntimeEntryPoints &EP;
bool ContractPass;
+
+ /// Indicates whether the target uses a special inline-asm marker.
+ bool UseMarker;
};
} // end namespace objcarc
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
index 6a928f2c7ffb..210ec60f2f87 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
@@ -64,30 +64,29 @@ bool OptimizeBB(BasicBlock *BB) {
bool Changed = false;
Instruction *Push = nullptr;
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
- Instruction *Inst = &*I++;
- switch (GetBasicARCInstKind(Inst)) {
+ for (Instruction &Inst : llvm::make_early_inc_range(*BB)) {
+ switch (GetBasicARCInstKind(&Inst)) {
case ARCInstKind::AutoreleasepoolPush:
- Push = Inst;
+ Push = &Inst;
break;
case ARCInstKind::AutoreleasepoolPop:
// If this pop matches a push and nothing in between can autorelease,
// zap the pair.
- if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) {
+ if (Push && cast<CallInst>(&Inst)->getArgOperand(0) == Push) {
Changed = true;
LLVM_DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop "
"autorelease pair:\n"
" Pop: "
- << *Inst << "\n"
+ << Inst << "\n"
<< " Push: " << *Push
<< "\n");
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
Push->eraseFromParent();
}
Push = nullptr;
break;
case ARCInstKind::CallOrUser:
- if (MayAutorelease(cast<CallBase>(*Inst)))
+ if (MayAutorelease(cast<CallBase>(Inst)))
Push = nullptr;
break;
default:
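
Several loops in this patch switch to llvm::make_early_inc_range, which advances the iterator before the body runs so the current instruction can be erased without invalidating the traversal. An illustrative helper showing the idiom (not part of the patch):

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

// Illustrative helper: erase trivially dead calls. The early-inc adaptor
// increments the iterator before the body runs, so eraseFromParent() on the
// current instruction is safe inside the loop.
static void zapUnusedCalls(llvm::BasicBlock &BB) {
  for (llvm::Instruction &I : llvm::make_early_inc_range(BB))
    if (auto *CI = llvm::dyn_cast<llvm::CallInst>(&I))
      if (CI->use_empty() && !CI->mayHaveSideEffects())
        CI->eraseFromParent();
}
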
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index 62161b5b6b40..c2ed94e8e1f6 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -226,13 +226,6 @@ static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load,
// of Inst.
ARCInstKind Class = GetBasicARCInstKind(Inst);
- // If Inst is an unrelated retain, we don't care about it.
- //
- // TODO: This is one area where the optimization could be made more
- // aggressive.
- if (IsRetain(Class))
- continue;
-
// If we have seen the store, but not the release...
if (Store) {
// We need to make sure that it is safe to move the release from its
@@ -248,8 +241,18 @@ static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load,
return nullptr;
}
- // Ok, now we know we have not seen a store yet. See if Inst can write to
- // our load location, if it can not, just ignore the instruction.
+ // Ok, now we know we have not seen a store yet.
+
+ // If Inst is a retain, we don't care about it as it doesn't prevent moving
+ // the load to the store.
+ //
+ // TODO: This is one area where the optimization could be made more
+ // aggressive.
+ if (IsRetain(Class))
+ continue;
+
+ // See if Inst can write to our load location; if it cannot, just ignore
+ // the instruction.
if (!isModSet(AA->getModRefInfo(Inst, Loc)))
continue;
@@ -431,13 +434,21 @@ bool ObjCARCContract::tryToPeepholeInstruction(
LLVM_FALLTHROUGH;
case ARCInstKind::RetainRV:
case ARCInstKind::ClaimRV: {
- // If we're compiling for a target which needs a special inline-asm
- // marker to do the return value optimization and the retainRV/claimRV call
- // wasn't bundled with a call, insert the marker now.
+ bool IsInstContainedInBundle = BundledInsts->contains(Inst);
+
+ // Return now if the target doesn't need a special inline-asm marker. Return
+ // true if this is a bundled retainRV/claimRV call, which is going to be
+ // erased at the end of this pass, to avoid undoing objc-arc-expand and
+ // replacing uses of the retainRV/claimRV call's argument with its result.
if (!RVInstMarker)
- return false;
+ return IsInstContainedInBundle;
+
+ // The target needs a special inline-asm marker.
- if (BundledInsts->contains(Inst))
+ // We don't have to emit the marker if this is a bundled call since the
+ // backend is responsible for emitting it. Return false to undo
+ // objc-arc-expand.
+ if (IsInstContainedInBundle)
return false;
BasicBlock::iterator BBI = Inst->getIterator();
@@ -537,7 +548,7 @@ bool ObjCARCContract::run(Function &F, AAResults *A, DominatorTree *D) {
AA = A;
DT = D;
PA.setAA(A);
- BundledRetainClaimRVs BRV(EP, true);
+ BundledRetainClaimRVs BRV(true, RVInstMarker);
BundledInsts = &BRV;
std::pair<bool, bool> R = BundledInsts->insertAfterInvokes(F, DT);
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
index d2121dcebe91..6b074ac5adab 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
@@ -56,12 +56,10 @@ static bool runImpl(Function &F) {
LLVM_DEBUG(dbgs() << "ObjCARCExpand: Visiting Function: " << F.getName()
<< "\n");
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
- Instruction *Inst = &*I;
+ for (Instruction &Inst : instructions(&F)) {
+ LLVM_DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << Inst << "\n");
- LLVM_DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n");
-
- switch (GetBasicARCInstKind(Inst)) {
+ switch (GetBasicARCInstKind(&Inst)) {
case ARCInstKind::Retain:
case ARCInstKind::RetainRV:
case ARCInstKind::Autorelease:
@@ -73,12 +71,12 @@ static bool runImpl(Function &F) {
// harder. Undo any uses of this optimization that the front-end
// emitted here. We'll redo them in the contract pass.
Changed = true;
- Value *Value = cast<CallInst>(Inst)->getArgOperand(0);
- LLVM_DEBUG(dbgs() << "ObjCARCExpand: Old = " << *Inst
+ Value *Value = cast<CallInst>(&Inst)->getArgOperand(0);
+ LLVM_DEBUG(dbgs() << "ObjCARCExpand: Old = " << Inst
<< "\n"
" New = "
<< *Value << "\n");
- Inst->replaceAllUsesWith(Value);
+ Inst.replaceAllUsesWith(Value);
break;
}
default:
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index ada6aa8d9b6d..0fa4904456cd 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -2229,13 +2229,12 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
// Then, for each destroyWeak with an alloca operand, check to see if
// the alloca and all its users can be zapped.
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
- Instruction *Inst = &*I++;
- ARCInstKind Class = GetBasicARCInstKind(Inst);
+ for (Instruction &Inst : llvm::make_early_inc_range(instructions(F))) {
+ ARCInstKind Class = GetBasicARCInstKind(&Inst);
if (Class != ARCInstKind::DestroyWeak)
continue;
- CallInst *Call = cast<CallInst>(Inst);
+ CallInst *Call = cast<CallInst>(&Inst);
Value *Arg = Call->getArgOperand(0);
if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) {
for (User *U : Alloca->users()) {
@@ -2250,8 +2249,8 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
}
}
Changed = true;
- for (auto UI = Alloca->user_begin(), UE = Alloca->user_end(); UI != UE;) {
- CallInst *UserInst = cast<CallInst>(*UI++);
+ for (User *U : llvm::make_early_inc_range(Alloca->users())) {
+ CallInst *UserInst = cast<CallInst>(U);
switch (GetBasicARCInstKind(UserInst)) {
case ARCInstKind::InitWeak:
case ARCInstKind::StoreWeak:
@@ -2462,7 +2461,7 @@ bool ObjCARCOpt::run(Function &F, AAResults &AA) {
return false;
Changed = CFGChanged = false;
- BundledRetainClaimRVs BRV(EP, false);
+ BundledRetainClaimRVs BRV(false, objcarc::getRVInstMarker(*F.getParent()));
BundledInsts = &BRV;
LLVM_DEBUG(dbgs() << "<<< ObjCARCOpt: Visiting Function: " << F.getName()
diff --git a/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
index a63e356ce1fc..6d0a67c91cfa 100644
--- a/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
+++ b/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
@@ -56,7 +56,8 @@ class ProvenanceAnalysis {
CachedResultsTy CachedResults;
- DenseMap<const Value *, WeakTrackingVH> UnderlyingObjCPtrCache;
+ DenseMap<const Value *, std::pair<WeakVH, WeakTrackingVH>>
+ UnderlyingObjCPtrCache;
bool relatedCheck(const Value *A, const Value *B);
bool relatedSelect(const SelectInst *A, const Value *B);
diff --git a/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp b/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
index 6fdfe787d438..fe637ee066a4 100644
--- a/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
@@ -58,11 +58,11 @@ bool PAEval::runOnFunction(Function &F) {
for (auto &Arg : F.args())
insertIfNamed(Values, &Arg);
- for (auto I = inst_begin(F), E = inst_end(F); I != E; ++I) {
- insertIfNamed(Values, &*I);
+ for (Instruction &I : instructions(F)) {
+ insertIfNamed(Values, &I);
- for (auto &Op : I->operands())
- insertIfNamed(Values, Op);
+ for (auto &Op : I.operands())
+ insertIfNamed(Values, Op);
}
ProvenanceAnalysis PA;
diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp
index 6f3fdb88eda5..b693acceb3f6 100644
--- a/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -538,7 +538,7 @@ bool AggressiveDeadCodeElimination::removeDeadInstructions() {
// that have no side effects and do not influence the control flow or return
// value of the function, and may therefore be deleted safely.
// NOTE: We reuse the Worklist vector here for memory efficiency.
- for (Instruction &I : instructions(F)) {
+ for (Instruction &I : llvm::reverse(instructions(F))) {
// Check if the instruction is alive.
if (isLive(&I))
continue;
@@ -554,9 +554,11 @@ bool AggressiveDeadCodeElimination::removeDeadInstructions() {
// Prepare to delete.
Worklist.push_back(&I);
salvageDebugInfo(I);
- I.dropAllReferences();
}
+ for (Instruction *&I : Worklist)
+ I->dropAllReferences();
+
for (Instruction *&I : Worklist) {
++NumRemoved;
I->eraseFromParent();
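
The ADCE change above (and the similar BDCE change further down) defers dropAllReferences() to a separate pass over the worklist, so every dead instruction is still fully formed while debug info is salvaged, and only afterwards are operand references broken and the instructions erased. A condensed sketch of the final two phases, assuming Worklist already holds only dead instructions (the helper name is illustrative):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instruction.h"

// Sketch of the two-phase deletion used above; Worklist is assumed to hold
// only instructions already proven dead.
static void deleteDeadInstructions(
    llvm::SmallVectorImpl<llvm::Instruction *> &Worklist) {
  // Phase 1: break every operand reference while all instructions still
  // exist, so nothing is erased while another dead instruction points at it.
  for (llvm::Instruction *I : Worklist)
    I->dropAllReferences();
  // Phase 2: the instructions can now be erased in any order.
  for (llvm::Instruction *I : Worklist)
    I->eraseFromParent();
}
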
diff --git a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index be21db9087d2..e4ec5f266eb8 100644
--- a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -221,6 +221,10 @@ bool AlignmentFromAssumptionsPass::extractAlignmentInfo(CallInst *I,
AAPtr = AAPtr->stripPointerCastsSameRepresentation();
AlignSCEV = SE->getSCEV(AlignOB.Inputs[1].get());
AlignSCEV = SE->getTruncateOrZeroExtend(AlignSCEV, Int64Ty);
+ if (!isa<SCEVConstant>(AlignSCEV))
+ // Added to suppress a crash because the consumer doesn't expect non-constant
+ // alignments in the assume bundle. TODO: Consider generalizing the caller.
+ return false;
if (AlignOB.Inputs.size() == 3)
OffSCEV = SE->getSCEV(AlignOB.Inputs[2].get());
else
diff --git a/llvm/lib/Transforms/Scalar/BDCE.cpp b/llvm/lib/Transforms/Scalar/BDCE.cpp
index c06125788f37..6c2467db79f7 100644
--- a/llvm/lib/Transforms/Scalar/BDCE.cpp
+++ b/llvm/lib/Transforms/Scalar/BDCE.cpp
@@ -53,7 +53,7 @@ static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) {
// in the def-use chain needs to be changed.
auto *J = dyn_cast<Instruction>(JU);
if (J && J->getType()->isIntOrIntVectorTy() &&
- !DB.getDemandedBits(J).isAllOnesValue()) {
+ !DB.getDemandedBits(J).isAllOnes()) {
Visited.insert(J);
WorkList.push_back(J);
}
@@ -84,7 +84,7 @@ static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) {
// that in the def-use chain needs to be changed.
auto *K = dyn_cast<Instruction>(KU);
if (K && Visited.insert(K).second && K->getType()->isIntOrIntVectorTy() &&
- !DB.getDemandedBits(K).isAllOnesValue())
+ !DB.getDemandedBits(K).isAllOnes())
WorkList.push_back(K);
}
}
@@ -103,12 +103,9 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
// Remove instructions that are dead, either because they were not reached
// during analysis or have no demanded bits.
if (DB.isInstructionDead(&I) ||
- (I.getType()->isIntOrIntVectorTy() &&
- DB.getDemandedBits(&I).isNullValue() &&
+ (I.getType()->isIntOrIntVectorTy() && DB.getDemandedBits(&I).isZero() &&
wouldInstructionBeTriviallyDead(&I))) {
- salvageDebugInfo(I);
Worklist.push_back(&I);
- I.dropAllReferences();
Changed = true;
continue;
}
@@ -155,6 +152,11 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
}
}
+ for (Instruction *&I : llvm::reverse(Worklist)) {
+ salvageDebugInfo(*I);
+ I->dropAllReferences();
+ }
+
for (Instruction *&I : Worklist) {
++NumRemoved;
I->eraseFromParent();
diff --git a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
index 2eb94b721d96..95de59fa8262 100644
--- a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -467,7 +467,7 @@ static PredsWithCondsTy shouldSplitOnPredicatedArgument(CallBase &CB,
BasicBlock *StopAt = CSDTNode ? CSDTNode->getIDom()->getBlock() : nullptr;
SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2> PredsCS;
- for (auto *Pred : make_range(Preds.rbegin(), Preds.rend())) {
+ for (auto *Pred : llvm::reverse(Preds)) {
ConditionsTy Conditions;
// Record condition on edge BB(CS) <- Pred
recordCondition(CB, Pred, CB.getParent(), Conditions);
@@ -505,8 +505,7 @@ static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI,
DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Lazy);
bool Changed = false;
- for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
- BasicBlock &BB = *BI++;
+ for (BasicBlock &BB : llvm::make_early_inc_range(F)) {
auto II = BB.getFirstNonPHIOrDbg()->getIterator();
auto IE = BB.getTerminator()->getIterator();
// Iterate until we reach the terminator instruction. tryToSplitCallSite
diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index 535f50d4f904..27f54f8026e1 100644
--- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -762,7 +762,7 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
PointerType *Int8PtrTy = Type::getInt8PtrTy(*Ctx,
cast<PointerType>(Ty)->getAddressSpace());
Base = new BitCastInst(Base, Int8PtrTy, "base_bitcast", InsertionPt);
- Mat = GetElementPtrInst::Create(Int8PtrTy->getElementType(), Base,
+ Mat = GetElementPtrInst::Create(Type::getInt8Ty(*Ctx), Base,
Offset, "mat_gep", InsertionPt);
Mat = new BitCastInst(Mat, Ty, "mat_bitcast", InsertionPt);
} else
@@ -819,10 +819,9 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
// Aside from constant GEPs, only constant cast expressions are collected.
assert(ConstExpr->isCast() && "ConstExpr should be a cast");
- Instruction *ConstExprInst = ConstExpr->getAsInstruction();
+ Instruction *ConstExprInst = ConstExpr->getAsInstruction(
+ findMatInsertPt(ConstUser.Inst, ConstUser.OpndIdx));
ConstExprInst->setOperand(0, Mat);
- ConstExprInst->insertBefore(findMatInsertPt(ConstUser.Inst,
- ConstUser.OpndIdx));
// Use the same debug location as the instruction we are about to update.
ConstExprInst->setDebugLoc(ConstUser.Inst->getDebugLoc());
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index efd1c025d0cd..7f2d5d7d9987 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstraintSystem.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
@@ -268,6 +269,31 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) {
continue;
WorkList.emplace_back(DT.getNode(&BB));
+ // True as long as the current instruction is guaranteed to execute.
+ bool GuaranteedToExecute = true;
+ // Scan BB for assume calls.
+ // TODO: also use this scan to queue conditions to simplify, so we can
+ // interleave facts from assumes and conditions to simplify in a single
+ // basic block, and to skip another traversal of each basic block when
+ // simplifying.
+ for (Instruction &I : BB) {
+ Value *Cond;
+ // For now, just handle assumes with a single compare as condition.
+ if (match(&I, m_Intrinsic<Intrinsic::assume>(m_Value(Cond))) &&
+ isa<CmpInst>(Cond)) {
+ if (GuaranteedToExecute) {
+ // The assume is guaranteed to execute when BB is entered, hence Cond
+ // holds on entry to BB.
+ WorkList.emplace_back(DT.getNode(&BB), cast<CmpInst>(Cond), false);
+ } else {
+ // Otherwise the condition only holds in the successors.
+ for (BasicBlock *Succ : successors(&BB))
+ WorkList.emplace_back(DT.getNode(Succ), cast<CmpInst>(Cond), false);
+ }
+ }
+ GuaranteedToExecute &= isGuaranteedToTransferExecutionToSuccessor(&I);
+ }
+
auto *Br = dyn_cast<BranchInst>(BB.getTerminator());
if (!Br || !Br->isConditional())
continue;
@@ -395,8 +421,13 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) {
for (auto &E : reverse(DFSInStack))
dbgs() << " C " << *E.Condition << " " << E.IsNot << "\n";
});
- Cmp->replaceAllUsesWith(
- ConstantInt::getTrue(F.getParent()->getContext()));
+ Cmp->replaceUsesWithIf(
+ ConstantInt::getTrue(F.getParent()->getContext()), [](Use &U) {
+ // Conditions in an assume trivially simplify to true. Skip uses
+ // in assume calls to not destroy the available information.
+ auto *II = dyn_cast<IntrinsicInst>(U.getUser());
+ return !II || II->getIntrinsicID() != Intrinsic::assume;
+ });
NumCondsRemoved++;
Changed = true;
}
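
Rather than replaceAllUsesWith, the proven comparison is now folded with Value::replaceUsesWithIf and a predicate that skips uses inside llvm.assume, so the assumed condition survives for later queries. A standalone sketch of the same call (helper name illustrative):

#include "llvm/IR/Constants.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/IntrinsicInst.h"

// Sketch: fold a proven comparison to 'true' everywhere except inside
// llvm.assume, which must keep the original condition to stay informative.
static void foldProvenCondition(llvm::CmpInst *Cmp) {
  auto *True = llvm::ConstantInt::getTrue(Cmp->getContext());
  Cmp->replaceUsesWithIf(True, [](llvm::Use &U) {
    auto *II = llvm::dyn_cast<llvm::IntrinsicInst>(U.getUser());
    return !II || II->getIntrinsicID() != llvm::Intrinsic::assume;
  });
}
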
diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 36cbd42a5fdd..ca9567dc7ac8 100644
--- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -67,6 +67,7 @@ STATISTIC(NumUDivURemsNarrowed,
STATISTIC(NumAShrs, "Number of ashr converted to lshr");
STATISTIC(NumSRems, "Number of srem converted to urem");
STATISTIC(NumSExt, "Number of sext converted to zext");
+STATISTIC(NumSICmps, "Number of signed icmp preds simplified to unsigned");
STATISTIC(NumAnd, "Number of ands removed");
STATISTIC(NumNW, "Number of no-wrap deductions");
STATISTIC(NumNSW, "Number of no-signed-wrap deductions");
@@ -295,11 +296,34 @@ static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) {
return true;
}
+static bool processICmp(ICmpInst *Cmp, LazyValueInfo *LVI) {
+ // Only for signed relational comparisons of scalar integers.
+ if (Cmp->getType()->isVectorTy() ||
+ !Cmp->getOperand(0)->getType()->isIntegerTy())
+ return false;
+
+ if (!Cmp->isSigned())
+ return false;
+
+ ICmpInst::Predicate UnsignedPred =
+ ConstantRange::getEquivalentPredWithFlippedSignedness(
+ Cmp->getPredicate(), LVI->getConstantRange(Cmp->getOperand(0), Cmp),
+ LVI->getConstantRange(Cmp->getOperand(1), Cmp));
+
+ if (UnsignedPred == ICmpInst::Predicate::BAD_ICMP_PREDICATE)
+ return false;
+
+ ++NumSICmps;
+ Cmp->setPredicate(UnsignedPred);
+
+ return true;
+}
+
/// See if LazyValueInfo's ability to exploit edge conditions or range
/// information is sufficient to prove this comparison. Even for local
/// conditions, this can sometimes prove conditions instcombine can't by
/// exploiting range information.
-static bool processCmp(CmpInst *Cmp, LazyValueInfo *LVI) {
+static bool constantFoldCmp(CmpInst *Cmp, LazyValueInfo *LVI) {
Value *Op0 = Cmp->getOperand(0);
auto *C = dyn_cast<Constant>(Cmp->getOperand(1));
if (!C)
@@ -318,6 +342,17 @@ static bool processCmp(CmpInst *Cmp, LazyValueInfo *LVI) {
return true;
}
+static bool processCmp(CmpInst *Cmp, LazyValueInfo *LVI) {
+ if (constantFoldCmp(Cmp, LVI))
+ return true;
+
+ if (auto *ICmp = dyn_cast<ICmpInst>(Cmp))
+ if (processICmp(ICmp, LVI))
+ return true;
+
+ return false;
+}
+
/// Simplify a switch instruction by removing cases which can never fire. If the
/// uselessness of a case could be determined locally then constant propagation
/// would already have figured it out. Instead, walk the predecessors and
@@ -341,7 +376,13 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI,
// ConstantFoldTerminator() as the underlying SwitchInst can be changed.
SwitchInstProfUpdateWrapper SI(*I);
- for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
+ APInt Low =
+ APInt::getSignedMaxValue(Cond->getType()->getScalarSizeInBits());
+ APInt High =
+ APInt::getSignedMinValue(Cond->getType()->getScalarSizeInBits());
+
+ SwitchInst::CaseIt CI = SI->case_begin();
+ for (auto CE = SI->case_end(); CI != CE;) {
ConstantInt *Case = CI->getCaseValue();
LazyValueInfo::Tristate State =
LVI->getPredicateAt(CmpInst::ICMP_EQ, Cond, Case, I,
@@ -374,9 +415,28 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI,
break;
}
+ // Get Lower/Upper bound from switch cases.
+ Low = APIntOps::smin(Case->getValue(), Low);
+ High = APIntOps::smax(Case->getValue(), High);
+
// Increment the case iterator since we didn't delete it.
++CI;
}
+
+ // Try to simplify default case as unreachable
+ if (CI == SI->case_end() && SI->getNumCases() != 0 &&
+ !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg())) {
+ const ConstantRange SIRange =
+ LVI->getConstantRange(SI->getCondition(), SI);
+
+ // If the numbered switch cases cover the entire range of the condition,
+ // then the default case is not reachable.
+ if (SIRange.getSignedMin() == Low && SIRange.getSignedMax() == High &&
+ SI->getNumCases() == High - Low + 1) {
+ createUnreachableSwitchDefault(SI, &DTU);
+ Changed = true;
+ }
+ }
}
if (Changed)
@@ -690,7 +750,7 @@ static bool narrowSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) {
// sdiv/srem is UB if divisor is -1 and dividend is INT_MIN, so unless we can
// prove that such a combination is impossible, we need to bump the bitwidth.
- if (CRs[1]->contains(APInt::getAllOnesValue(OrigWidth)) &&
+ if (CRs[1]->contains(APInt::getAllOnes(OrigWidth)) &&
CRs[0]->contains(
APInt::getSignedMinValue(MinSignedBits).sextOrSelf(OrigWidth)))
++MinSignedBits;
@@ -1023,49 +1083,48 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT,
// blocks.
for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
bool BBChanged = false;
- for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
- Instruction *II = &*BI++;
- switch (II->getOpcode()) {
+ for (Instruction &II : llvm::make_early_inc_range(*BB)) {
+ switch (II.getOpcode()) {
case Instruction::Select:
- BBChanged |= processSelect(cast<SelectInst>(II), LVI);
+ BBChanged |= processSelect(cast<SelectInst>(&II), LVI);
break;
case Instruction::PHI:
- BBChanged |= processPHI(cast<PHINode>(II), LVI, DT, SQ);
+ BBChanged |= processPHI(cast<PHINode>(&II), LVI, DT, SQ);
break;
case Instruction::ICmp:
case Instruction::FCmp:
- BBChanged |= processCmp(cast<CmpInst>(II), LVI);
+ BBChanged |= processCmp(cast<CmpInst>(&II), LVI);
break;
case Instruction::Load:
case Instruction::Store:
- BBChanged |= processMemAccess(II, LVI);
+ BBChanged |= processMemAccess(&II, LVI);
break;
case Instruction::Call:
case Instruction::Invoke:
- BBChanged |= processCallSite(cast<CallBase>(*II), LVI);
+ BBChanged |= processCallSite(cast<CallBase>(II), LVI);
break;
case Instruction::SRem:
case Instruction::SDiv:
- BBChanged |= processSDivOrSRem(cast<BinaryOperator>(II), LVI);
+ BBChanged |= processSDivOrSRem(cast<BinaryOperator>(&II), LVI);
break;
case Instruction::UDiv:
case Instruction::URem:
- BBChanged |= processUDivOrURem(cast<BinaryOperator>(II), LVI);
+ BBChanged |= processUDivOrURem(cast<BinaryOperator>(&II), LVI);
break;
case Instruction::AShr:
- BBChanged |= processAShr(cast<BinaryOperator>(II), LVI);
+ BBChanged |= processAShr(cast<BinaryOperator>(&II), LVI);
break;
case Instruction::SExt:
- BBChanged |= processSExt(cast<SExtInst>(II), LVI);
+ BBChanged |= processSExt(cast<SExtInst>(&II), LVI);
break;
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
case Instruction::Shl:
- BBChanged |= processBinOp(cast<BinaryOperator>(II), LVI);
+ BBChanged |= processBinOp(cast<BinaryOperator>(&II), LVI);
break;
case Instruction::And:
- BBChanged |= processAnd(cast<BinaryOperator>(II), LVI);
+ BBChanged |= processAnd(cast<BinaryOperator>(&II), LVI);
break;
}
}
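
The new processICmp relies on ConstantRange::getEquivalentPredWithFlippedSignedness: if the operand ranges reported by LVI make a signed predicate and its unsigned counterpart agree, the predicate is rewritten in place. A small, self-contained example of that helper with fixed ranges (the values are chosen for illustration; the pass feeds in LVI results instead):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/InstrTypes.h"

// Worked example: when both operands are known to lie in [0, 100), a signed
// "slt" compare behaves exactly like an unsigned "ult" compare, and the
// helper returns the flipped predicate (BAD_ICMP_PREDICATE otherwise).
static llvm::CmpInst::Predicate flippedPredicateExample() {
  llvm::ConstantRange NonNeg(llvm::APInt(32, 0), llvm::APInt(32, 100));
  return llvm::ConstantRange::getEquivalentPredWithFlippedSignedness(
      llvm::CmpInst::ICMP_SLT, NonNeg, NonNeg); // Expected: ICMP_ULT.
}
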
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 90679bcac4b7..8c4523206070 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -1,9 +1,8 @@
//===- DFAJumpThreading.cpp - Threads a switch statement inside a loop ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -84,8 +83,6 @@
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <deque>
-#include <unordered_map>
-#include <unordered_set>
using namespace llvm;
@@ -147,8 +144,7 @@ private:
Stack.push_back(SIToUnfold);
while (!Stack.empty()) {
- SelectInstToUnfold SIToUnfold = Stack.back();
- Stack.pop_back();
+ SelectInstToUnfold SIToUnfold = Stack.pop_back_val();
std::vector<SelectInstToUnfold> NewSIsToUnfold;
std::vector<BasicBlock *> NewBBs;
@@ -174,6 +170,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
@@ -350,7 +347,7 @@ struct ClonedBlock {
typedef std::deque<BasicBlock *> PathType;
typedef std::vector<PathType> PathsType;
-typedef std::set<const BasicBlock *> VisitedBlocks;
+typedef SmallPtrSet<const BasicBlock *, 8> VisitedBlocks;
typedef std::vector<ClonedBlock> CloneList;
// This data structure keeps track of all blocks that have been cloned. If two
@@ -493,7 +490,7 @@ private:
}
bool isPredictableValue(Value *InpVal, SmallSet<Value *, 16> &SeenValues) {
- if (SeenValues.find(InpVal) != SeenValues.end())
+ if (SeenValues.contains(InpVal))
return true;
if (isa<ConstantInt>(InpVal))
@@ -508,7 +505,7 @@ private:
void addInstToQueue(Value *Val, std::deque<Instruction *> &Q,
SmallSet<Value *, 16> &SeenValues) {
- if (SeenValues.find(Val) != SeenValues.end())
+ if (SeenValues.contains(Val))
return;
if (Instruction *I = dyn_cast<Instruction>(Val))
Q.push_back(I);
@@ -533,7 +530,7 @@ private:
return false;
if (isa<PHINode>(SIUse) &&
- SIBB->getSingleSuccessor() != dyn_cast<Instruction>(SIUse)->getParent())
+ SIBB->getSingleSuccessor() != cast<Instruction>(SIUse)->getParent())
return false;
// If select will not be sunk during unfolding, and it is in the same basic
@@ -621,13 +618,9 @@ private:
// Some blocks have multiple edges to the same successor, and this set
// is used to prevent a duplicate path from being generated
SmallSet<BasicBlock *, 4> Successors;
-
- for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) {
- BasicBlock *Succ = *SI;
-
- if (Successors.find(Succ) != Successors.end())
+ for (BasicBlock *Succ : successors(BB)) {
+ if (!Successors.insert(Succ).second)
continue;
- Successors.insert(Succ);
// Found a cycle through the SwitchBlock
if (Succ == SwitchBlock) {
@@ -636,7 +629,7 @@ private:
}
// We have encountered a cycle, do not get caught in it
- if (Visited.find(Succ) != Visited.end())
+ if (Visited.contains(Succ))
continue;
PathsType SuccPaths = paths(Succ, Visited, PathDepth + 1);
@@ -668,15 +661,14 @@ private:
SmallSet<Value *, 16> SeenValues;
while (!Stack.empty()) {
- PHINode *CurPhi = Stack.back();
- Stack.pop_back();
+ PHINode *CurPhi = Stack.pop_back_val();
Res[CurPhi->getParent()] = CurPhi;
SeenValues.insert(CurPhi);
for (Value *Incoming : CurPhi->incoming_values()) {
if (Incoming == FirstDef || isa<ConstantInt>(Incoming) ||
- SeenValues.find(Incoming) != SeenValues.end()) {
+ SeenValues.contains(Incoming)) {
continue;
}
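
The DFAJumpThreading cleanups swap std::set for SmallPtrSet, find() != end() for contains()/insert().second, and back()+pop_back() for pop_back_val(). A compact sketch of a worklist written with these idioms (the helper is illustrative, not from the patch):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"

// Illustrative DFS worklist written with the idioms adopted above.
static void visitOnce(llvm::BasicBlock *Entry) {
  llvm::SmallPtrSet<const llvm::BasicBlock *, 8> Visited;
  llvm::SmallVector<llvm::BasicBlock *, 16> Stack{Entry};
  while (!Stack.empty()) {
    llvm::BasicBlock *BB = Stack.pop_back_val(); // back() + pop_back() in one.
    if (!Visited.insert(BB).second) // insert().second instead of find()!=end().
      continue;
    for (llvm::BasicBlock *Succ : llvm::successors(BB))
      Stack.push_back(Succ);
  }
}
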
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index d22b3f409585..a8ec8bb97970 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -13,10 +13,10 @@
// in between both MemoryDefs. A bit more concretely:
//
// For all MemoryDefs StartDef:
-// 1. Get the next dominating clobbering MemoryDef (EarlierAccess) by walking
+// 1. Get the next dominating clobbering MemoryDef (MaybeDeadAccess) by walking
// upwards.
-// 2. Check that there are no reads between EarlierAccess and the StartDef by
-// checking all uses starting at EarlierAccess and walking until we see
+// 2. Check that there are no reads between MaybeDeadAccess and the StartDef by
+// checking all uses starting at MaybeDeadAccess and walking until we see
// StartDef.
// 3. For each found CurrentDef, check that:
// 1. There are no barrier instructions between CurrentDef and StartDef (like
@@ -56,6 +56,7 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
@@ -78,6 +79,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
@@ -122,7 +124,7 @@ EnablePartialStoreMerging("enable-dse-partial-store-merging",
static cl::opt<unsigned>
MemorySSAScanLimit("dse-memoryssa-scanlimit", cl::init(150), cl::Hidden,
cl::desc("The number of memory instructions to scan for "
- "dead store elimination (default = 100)"));
+ "dead store elimination (default = 150)"));
static cl::opt<unsigned> MemorySSAUpwardsStepLimit(
"dse-memoryssa-walklimit", cl::init(90), cl::Hidden,
cl::desc("The maximum number of steps while walking upwards to find "
@@ -203,39 +205,6 @@ static bool hasAnalyzableMemoryWrite(Instruction *I,
return false;
}
-/// Return a Location stored to by the specified instruction. If isRemovable
-/// returns true, this function and getLocForRead completely describe the memory
-/// operations for this instruction.
-static MemoryLocation getLocForWrite(Instruction *Inst,
- const TargetLibraryInfo &TLI) {
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
- return MemoryLocation::get(SI);
-
- // memcpy/memmove/memset.
- if (auto *MI = dyn_cast<AnyMemIntrinsic>(Inst))
- return MemoryLocation::getForDest(MI);
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
- switch (II->getIntrinsicID()) {
- default:
- return MemoryLocation(); // Unhandled intrinsic.
- case Intrinsic::init_trampoline:
- return MemoryLocation::getAfter(II->getArgOperand(0));
- case Intrinsic::masked_store:
- return MemoryLocation::getForArgument(II, 1, TLI);
- case Intrinsic::lifetime_end: {
- uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
- return MemoryLocation(II->getArgOperand(1), Len);
- }
- }
- }
- if (auto *CB = dyn_cast<CallBase>(Inst))
- // All the supported TLI functions so far happen to have dest as their
- // first argument.
- return MemoryLocation::getAfter(CB->getArgOperand(0));
- return MemoryLocation();
-}
-
/// If the value of this instruction and the memory it writes to is unused, may
/// we delete this instruction?
static bool isRemovable(Instruction *I) {
@@ -333,147 +302,146 @@ enum OverwriteResult {
} // end anonymous namespace
/// Check if two instruction are masked stores that completely
-/// overwrite one another. More specifically, \p Later has to
-/// overwrite \p Earlier.
-static OverwriteResult isMaskedStoreOverwrite(const Instruction *Later,
- const Instruction *Earlier,
+/// overwrite one another. More specifically, \p KillingI has to
+/// overwrite \p DeadI.
+static OverwriteResult isMaskedStoreOverwrite(const Instruction *KillingI,
+ const Instruction *DeadI,
BatchAAResults &AA) {
- const auto *IIL = dyn_cast<IntrinsicInst>(Later);
- const auto *IIE = dyn_cast<IntrinsicInst>(Earlier);
- if (IIL == nullptr || IIE == nullptr)
+ const auto *KillingII = dyn_cast<IntrinsicInst>(KillingI);
+ const auto *DeadII = dyn_cast<IntrinsicInst>(DeadI);
+ if (KillingII == nullptr || DeadII == nullptr)
return OW_Unknown;
- if (IIL->getIntrinsicID() != Intrinsic::masked_store ||
- IIE->getIntrinsicID() != Intrinsic::masked_store)
+ if (KillingII->getIntrinsicID() != Intrinsic::masked_store ||
+ DeadII->getIntrinsicID() != Intrinsic::masked_store)
return OW_Unknown;
// Pointers.
- Value *LP = IIL->getArgOperand(1)->stripPointerCasts();
- Value *EP = IIE->getArgOperand(1)->stripPointerCasts();
- if (LP != EP && !AA.isMustAlias(LP, EP))
+ Value *KillingPtr = KillingII->getArgOperand(1)->stripPointerCasts();
+ Value *DeadPtr = DeadII->getArgOperand(1)->stripPointerCasts();
+ if (KillingPtr != DeadPtr && !AA.isMustAlias(KillingPtr, DeadPtr))
return OW_Unknown;
// Masks.
- // TODO: check that Later's mask is a superset of the Earlier's mask.
- if (IIL->getArgOperand(3) != IIE->getArgOperand(3))
+ // TODO: check that KillingII's mask is a superset of the DeadII's mask.
+ if (KillingII->getArgOperand(3) != DeadII->getArgOperand(3))
return OW_Unknown;
return OW_Complete;
}
-/// Return 'OW_Complete' if a store to the 'Later' location completely
-/// overwrites a store to the 'Earlier' location, 'OW_End' if the end of the
-/// 'Earlier' location is completely overwritten by 'Later', 'OW_Begin' if the
-/// beginning of the 'Earlier' location is overwritten by 'Later'.
-/// 'OW_PartialEarlierWithFullLater' means that an earlier (big) store was
-/// overwritten by a latter (smaller) store which doesn't write outside the big
+/// Return 'OW_Complete' if a store to the 'KillingLoc' location completely
+/// overwrites a store to the 'DeadLoc' location, 'OW_End' if the end of the
+/// 'DeadLoc' location is completely overwritten by 'KillingLoc', 'OW_Begin'
+/// if the beginning of the 'DeadLoc' location is overwritten by 'KillingLoc'.
+/// 'OW_PartialEarlierWithFullLater' means that a dead (big) store was
+/// overwritten by a killing (smaller) store which doesn't write outside the big
/// store's memory locations. Returns 'OW_Unknown' if nothing can be determined.
-/// NOTE: This function must only be called if both \p Later and \p Earlier
-/// write to the same underlying object with valid \p EarlierOff and \p
-/// LaterOff.
-static OverwriteResult isPartialOverwrite(const MemoryLocation &Later,
- const MemoryLocation &Earlier,
- int64_t EarlierOff, int64_t LaterOff,
- Instruction *DepWrite,
+/// NOTE: This function must only be called if both \p KillingLoc and \p
+/// DeadLoc belong to the same underlying object with valid \p KillingOff and
+/// \p DeadOff.
+static OverwriteResult isPartialOverwrite(const MemoryLocation &KillingLoc,
+ const MemoryLocation &DeadLoc,
+ int64_t KillingOff, int64_t DeadOff,
+ Instruction *DeadI,
InstOverlapIntervalsTy &IOL) {
- const uint64_t LaterSize = Later.Size.getValue();
- const uint64_t EarlierSize = Earlier.Size.getValue();
+ const uint64_t KillingSize = KillingLoc.Size.getValue();
+ const uint64_t DeadSize = DeadLoc.Size.getValue();
// We may now overlap, although the overlap is not complete. There might also
// be other incomplete overlaps, and together, they might cover the complete
- // earlier write.
+ // dead store.
// Note: The correctness of this logic depends on the fact that this function
// is not even called providing DepWrite when there are any intervening reads.
if (EnablePartialOverwriteTracking &&
- LaterOff < int64_t(EarlierOff + EarlierSize) &&
- int64_t(LaterOff + LaterSize) >= EarlierOff) {
+ KillingOff < int64_t(DeadOff + DeadSize) &&
+ int64_t(KillingOff + KillingSize) >= DeadOff) {
// Insert our part of the overlap into the map.
- auto &IM = IOL[DepWrite];
- LLVM_DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff
- << ", " << int64_t(EarlierOff + EarlierSize)
- << ") Later [" << LaterOff << ", "
- << int64_t(LaterOff + LaterSize) << ")\n");
+ auto &IM = IOL[DeadI];
+ LLVM_DEBUG(dbgs() << "DSE: Partial overwrite: DeadLoc [" << DeadOff << ", "
+ << int64_t(DeadOff + DeadSize) << ") KillingLoc ["
+ << KillingOff << ", " << int64_t(KillingOff + KillingSize)
+ << ")\n");
// Make sure that we only insert non-overlapping intervals and combine
// adjacent intervals. The intervals are stored in the map with the ending
// offset as the key (in the half-open sense) and the starting offset as
// the value.
- int64_t LaterIntStart = LaterOff, LaterIntEnd = LaterOff + LaterSize;
+ int64_t KillingIntStart = KillingOff;
+ int64_t KillingIntEnd = KillingOff + KillingSize;
- // Find any intervals ending at, or after, LaterIntStart which start
- // before LaterIntEnd.
- auto ILI = IM.lower_bound(LaterIntStart);
- if (ILI != IM.end() && ILI->second <= LaterIntEnd) {
+ // Find any intervals ending at, or after, KillingIntStart which start
+ // before KillingIntEnd.
+ auto ILI = IM.lower_bound(KillingIntStart);
+ if (ILI != IM.end() && ILI->second <= KillingIntEnd) {
// This existing interval is overlapped with the current store somewhere
- // in [LaterIntStart, LaterIntEnd]. Merge them by erasing the existing
+ // in [KillingIntStart, KillingIntEnd]. Merge them by erasing the existing
// intervals and adjusting our start and end.
- LaterIntStart = std::min(LaterIntStart, ILI->second);
- LaterIntEnd = std::max(LaterIntEnd, ILI->first);
+ KillingIntStart = std::min(KillingIntStart, ILI->second);
+ KillingIntEnd = std::max(KillingIntEnd, ILI->first);
ILI = IM.erase(ILI);
// Continue erasing and adjusting our end in case other previous
// intervals are also overlapped with the current store.
//
- // |--- ealier 1 ---| |--- ealier 2 ---|
- // |------- later---------|
+ // |--- dead 1 ---| |--- dead 2 ---|
+ // |------- killing---------|
//
- while (ILI != IM.end() && ILI->second <= LaterIntEnd) {
- assert(ILI->second > LaterIntStart && "Unexpected interval");
- LaterIntEnd = std::max(LaterIntEnd, ILI->first);
+ while (ILI != IM.end() && ILI->second <= KillingIntEnd) {
+ assert(ILI->second > KillingIntStart && "Unexpected interval");
+ KillingIntEnd = std::max(KillingIntEnd, ILI->first);
ILI = IM.erase(ILI);
}
}
- IM[LaterIntEnd] = LaterIntStart;
+ IM[KillingIntEnd] = KillingIntStart;
ILI = IM.begin();
- if (ILI->second <= EarlierOff &&
- ILI->first >= int64_t(EarlierOff + EarlierSize)) {
- LLVM_DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier ["
- << EarlierOff << ", "
- << int64_t(EarlierOff + EarlierSize)
- << ") Composite Later [" << ILI->second << ", "
+ if (ILI->second <= DeadOff && ILI->first >= int64_t(DeadOff + DeadSize)) {
+ LLVM_DEBUG(dbgs() << "DSE: Full overwrite from partials: DeadLoc ["
+ << DeadOff << ", " << int64_t(DeadOff + DeadSize)
+ << ") Composite KillingLoc [" << ILI->second << ", "
<< ILI->first << ")\n");
++NumCompletePartials;
return OW_Complete;
}
}
- // Check for an earlier store which writes to all the memory locations that
- // the later store writes to.
- if (EnablePartialStoreMerging && LaterOff >= EarlierOff &&
- int64_t(EarlierOff + EarlierSize) > LaterOff &&
- uint64_t(LaterOff - EarlierOff) + LaterSize <= EarlierSize) {
- LLVM_DEBUG(dbgs() << "DSE: Partial overwrite an earlier load ["
- << EarlierOff << ", "
- << int64_t(EarlierOff + EarlierSize)
- << ") by a later store [" << LaterOff << ", "
- << int64_t(LaterOff + LaterSize) << ")\n");
+ // Check for a dead store which writes to all the memory locations that
+ // the killing store writes to.
+ if (EnablePartialStoreMerging && KillingOff >= DeadOff &&
+ int64_t(DeadOff + DeadSize) > KillingOff &&
+ uint64_t(KillingOff - DeadOff) + KillingSize <= DeadSize) {
+ LLVM_DEBUG(dbgs() << "DSE: Partial overwrite a dead load [" << DeadOff
+ << ", " << int64_t(DeadOff + DeadSize)
+ << ") by a killing store [" << KillingOff << ", "
+ << int64_t(KillingOff + KillingSize) << ")\n");
// TODO: Maybe come up with a better name?
return OW_PartialEarlierWithFullLater;
}
- // Another interesting case is if the later store overwrites the end of the
- // earlier store.
+ // Another interesting case is if the killing store overwrites the end of the
+ // dead store.
//
- // |--earlier--|
- // |-- later --|
+  //    |--dead--|
+  //         |-- killing --|
//
- // In this case we may want to trim the size of earlier to avoid generating
- // writes to addresses which will definitely be overwritten later
+  // In this case we may want to trim the size of the dead store to avoid
+  // generating stores to addresses which will definitely be overwritten by
+  // the killing store.
if (!EnablePartialOverwriteTracking &&
- (LaterOff > EarlierOff && LaterOff < int64_t(EarlierOff + EarlierSize) &&
- int64_t(LaterOff + LaterSize) >= int64_t(EarlierOff + EarlierSize)))
+ (KillingOff > DeadOff && KillingOff < int64_t(DeadOff + DeadSize) &&
+ int64_t(KillingOff + KillingSize) >= int64_t(DeadOff + DeadSize)))
return OW_End;
- // Finally, we also need to check if the later store overwrites the beginning
- // of the earlier store.
+ // Finally, we also need to check if the killing store overwrites the
+ // beginning of the dead store.
//
- // |--earlier--|
- // |-- later --|
+  //          |--dead--|
+  //  |-- killing --|
//
// In this case we may want to move the destination address and trim the size
- // of earlier to avoid generating writes to addresses which will definitely
- // be overwritten later.
+  // of the dead store to avoid generating stores to addresses which will
+  // definitely be overwritten by the killing store.
if (!EnablePartialOverwriteTracking &&
- (LaterOff <= EarlierOff && int64_t(LaterOff + LaterSize) > EarlierOff)) {
- assert(int64_t(LaterOff + LaterSize) < int64_t(EarlierOff + EarlierSize) &&
+ (KillingOff <= DeadOff && int64_t(KillingOff + KillingSize) > DeadOff)) {
+ assert(int64_t(KillingOff + KillingSize) < int64_t(DeadOff + DeadSize) &&
"Expect to be handled as OW_Complete");
return OW_Begin;
}
@@ -505,7 +473,12 @@ memoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI,
BasicBlock::iterator SecondBBI(SecondI);
BasicBlock *FirstBB = FirstI->getParent();
BasicBlock *SecondBB = SecondI->getParent();
- MemoryLocation MemLoc = MemoryLocation::get(SecondI);
+ MemoryLocation MemLoc;
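+  // MemoryLocation::get is not applicable to memset calls, so use the memset
+  // destination as the location to check.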
+ if (auto *MemSet = dyn_cast<MemSetInst>(SecondI))
+ MemLoc = MemoryLocation::getForDest(MemSet);
+ else
+ MemLoc = MemoryLocation::get(SecondI);
+
auto *MemLocPtr = const_cast<Value *>(MemLoc.Ptr);
// Start checking the SecondBB.
@@ -568,11 +541,11 @@ memoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI,
return true;
}
-static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierStart,
- uint64_t &EarlierSize, int64_t LaterStart,
- uint64_t LaterSize, bool IsOverwriteEnd) {
- auto *EarlierIntrinsic = cast<AnyMemIntrinsic>(EarlierWrite);
- Align PrefAlign = EarlierIntrinsic->getDestAlign().valueOrOne();
+static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
+ uint64_t &DeadSize, int64_t KillingStart,
+ uint64_t KillingSize, bool IsOverwriteEnd) {
+ auto *DeadIntrinsic = cast<AnyMemIntrinsic>(DeadI);
+ Align PrefAlign = DeadIntrinsic->getDestAlign().valueOrOne();
  // We assume that memset/memcpy operate in chunks of the "largest" native
// type size and aligned on the same value. That means optimal start and size
@@ -593,19 +566,19 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierStart,
// Compute start and size of the region to remove. Make sure 'PrefAlign' is
// maintained on the remaining store.
if (IsOverwriteEnd) {
- // Calculate required adjustment for 'LaterStart'in order to keep remaining
- // store size aligned on 'PerfAlign'.
+ // Calculate required adjustment for 'KillingStart' in order to keep
+    // remaining store size aligned on 'PrefAlign'.
uint64_t Off =
- offsetToAlignment(uint64_t(LaterStart - EarlierStart), PrefAlign);
- ToRemoveStart = LaterStart + Off;
- if (EarlierSize <= uint64_t(ToRemoveStart - EarlierStart))
+ offsetToAlignment(uint64_t(KillingStart - DeadStart), PrefAlign);
+ ToRemoveStart = KillingStart + Off;
+ if (DeadSize <= uint64_t(ToRemoveStart - DeadStart))
return false;
- ToRemoveSize = EarlierSize - uint64_t(ToRemoveStart - EarlierStart);
+ ToRemoveSize = DeadSize - uint64_t(ToRemoveStart - DeadStart);
} else {
- ToRemoveStart = EarlierStart;
- assert(LaterSize >= uint64_t(EarlierStart - LaterStart) &&
+ ToRemoveStart = DeadStart;
+ assert(KillingSize >= uint64_t(DeadStart - KillingStart) &&
"Not overlapping accesses?");
- ToRemoveSize = LaterSize - uint64_t(EarlierStart - LaterStart);
+ ToRemoveSize = KillingSize - uint64_t(DeadStart - KillingStart);
    // Calculate required adjustment for 'ToRemoveSize' in order to keep
    // start of the remaining store aligned on 'PrefAlign'.
uint64_t Off = offsetToAlignment(ToRemoveSize, PrefAlign);
@@ -619,10 +592,10 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierStart,
}
assert(ToRemoveSize > 0 && "Shouldn't reach here if nothing to remove");
- assert(EarlierSize > ToRemoveSize && "Can't remove more than original size");
+ assert(DeadSize > ToRemoveSize && "Can't remove more than original size");
- uint64_t NewSize = EarlierSize - ToRemoveSize;
- if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(EarlierWrite)) {
+ uint64_t NewSize = DeadSize - ToRemoveSize;
+ if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(DeadI)) {
// When shortening an atomic memory intrinsic, the newly shortened
// length must remain an integer multiple of the element size.
const uint32_t ElementSize = AMI->getElementSizeInBytes();
@@ -631,65 +604,62 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierStart,
}
LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
- << (IsOverwriteEnd ? "END" : "BEGIN") << ": "
- << *EarlierWrite << "\n KILLER [" << ToRemoveStart << ", "
+ << (IsOverwriteEnd ? "END" : "BEGIN") << ": " << *DeadI
+ << "\n KILLER [" << ToRemoveStart << ", "
<< int64_t(ToRemoveStart + ToRemoveSize) << ")\n");
- Value *EarlierWriteLength = EarlierIntrinsic->getLength();
- Value *TrimmedLength =
- ConstantInt::get(EarlierWriteLength->getType(), NewSize);
- EarlierIntrinsic->setLength(TrimmedLength);
- EarlierIntrinsic->setDestAlignment(PrefAlign);
+ Value *DeadWriteLength = DeadIntrinsic->getLength();
+ Value *TrimmedLength = ConstantInt::get(DeadWriteLength->getType(), NewSize);
+ DeadIntrinsic->setLength(TrimmedLength);
+ DeadIntrinsic->setDestAlignment(PrefAlign);
if (!IsOverwriteEnd) {
- Value *OrigDest = EarlierIntrinsic->getRawDest();
+ Value *OrigDest = DeadIntrinsic->getRawDest();
Type *Int8PtrTy =
- Type::getInt8PtrTy(EarlierIntrinsic->getContext(),
+ Type::getInt8PtrTy(DeadIntrinsic->getContext(),
OrigDest->getType()->getPointerAddressSpace());
Value *Dest = OrigDest;
if (OrigDest->getType() != Int8PtrTy)
- Dest = CastInst::CreatePointerCast(OrigDest, Int8PtrTy, "", EarlierWrite);
+ Dest = CastInst::CreatePointerCast(OrigDest, Int8PtrTy, "", DeadI);
Value *Indices[1] = {
- ConstantInt::get(EarlierWriteLength->getType(), ToRemoveSize)};
+ ConstantInt::get(DeadWriteLength->getType(), ToRemoveSize)};
Instruction *NewDestGEP = GetElementPtrInst::CreateInBounds(
- Type::getInt8Ty(EarlierIntrinsic->getContext()),
- Dest, Indices, "", EarlierWrite);
- NewDestGEP->setDebugLoc(EarlierIntrinsic->getDebugLoc());
+ Type::getInt8Ty(DeadIntrinsic->getContext()), Dest, Indices, "", DeadI);
+ NewDestGEP->setDebugLoc(DeadIntrinsic->getDebugLoc());
if (NewDestGEP->getType() != OrigDest->getType())
NewDestGEP = CastInst::CreatePointerCast(NewDestGEP, OrigDest->getType(),
- "", EarlierWrite);
- EarlierIntrinsic->setDest(NewDestGEP);
+ "", DeadI);
+ DeadIntrinsic->setDest(NewDestGEP);
}
- // Finally update start and size of earlier access.
+ // Finally update start and size of dead access.
if (!IsOverwriteEnd)
- EarlierStart += ToRemoveSize;
- EarlierSize = NewSize;
+ DeadStart += ToRemoveSize;
+ DeadSize = NewSize;
return true;
}
-static bool tryToShortenEnd(Instruction *EarlierWrite,
- OverlapIntervalsTy &IntervalMap,
- int64_t &EarlierStart, uint64_t &EarlierSize) {
- if (IntervalMap.empty() || !isShortenableAtTheEnd(EarlierWrite))
+static bool tryToShortenEnd(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
+ int64_t &DeadStart, uint64_t &DeadSize) {
+ if (IntervalMap.empty() || !isShortenableAtTheEnd(DeadI))
return false;
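+  // Intervals are keyed by their end offset, so the last map entry is the
+  // killing interval that reaches furthest into the end of the dead store.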
OverlapIntervalsTy::iterator OII = --IntervalMap.end();
- int64_t LaterStart = OII->second;
- uint64_t LaterSize = OII->first - LaterStart;
+ int64_t KillingStart = OII->second;
+ uint64_t KillingSize = OII->first - KillingStart;
- assert(OII->first - LaterStart >= 0 && "Size expected to be positive");
+ assert(OII->first - KillingStart >= 0 && "Size expected to be positive");
- if (LaterStart > EarlierStart &&
- // Note: "LaterStart - EarlierStart" is known to be positive due to
+ if (KillingStart > DeadStart &&
+      // Note: "KillingStart - DeadStart" is known to be positive due to
// preceding check.
- (uint64_t)(LaterStart - EarlierStart) < EarlierSize &&
- // Note: "EarlierSize - (uint64_t)(LaterStart - EarlierStart)" is known to
+ (uint64_t)(KillingStart - DeadStart) < DeadSize &&
+ // Note: "DeadSize - (uint64_t)(KillingStart - DeadStart)" is known to
// be non negative due to preceding checks.
- LaterSize >= EarlierSize - (uint64_t)(LaterStart - EarlierStart)) {
- if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
- LaterSize, true)) {
+ KillingSize >= DeadSize - (uint64_t)(KillingStart - DeadStart)) {
+ if (tryToShorten(DeadI, DeadStart, DeadSize, KillingStart, KillingSize,
+ true)) {
IntervalMap.erase(OII);
return true;
}
@@ -697,28 +667,28 @@ static bool tryToShortenEnd(Instruction *EarlierWrite,
return false;
}
-static bool tryToShortenBegin(Instruction *EarlierWrite,
+static bool tryToShortenBegin(Instruction *DeadI,
OverlapIntervalsTy &IntervalMap,
- int64_t &EarlierStart, uint64_t &EarlierSize) {
- if (IntervalMap.empty() || !isShortenableAtTheBeginning(EarlierWrite))
+ int64_t &DeadStart, uint64_t &DeadSize) {
+ if (IntervalMap.empty() || !isShortenableAtTheBeginning(DeadI))
return false;
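+  // The first map entry has the smallest end offset and therefore covers the
+  // lowest offsets of the dead store, making it the candidate for trimming
+  // the beginning.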
OverlapIntervalsTy::iterator OII = IntervalMap.begin();
- int64_t LaterStart = OII->second;
- uint64_t LaterSize = OII->first - LaterStart;
+ int64_t KillingStart = OII->second;
+ uint64_t KillingSize = OII->first - KillingStart;
- assert(OII->first - LaterStart >= 0 && "Size expected to be positive");
+ assert(OII->first - KillingStart >= 0 && "Size expected to be positive");
- if (LaterStart <= EarlierStart &&
- // Note: "EarlierStart - LaterStart" is known to be non negative due to
+ if (KillingStart <= DeadStart &&
+ // Note: "DeadStart - KillingStart" is known to be non negative due to
// preceding check.
- LaterSize > (uint64_t)(EarlierStart - LaterStart)) {
- // Note: "LaterSize - (uint64_t)(EarlierStart - LaterStart)" is known to be
- // positive due to preceding checks.
- assert(LaterSize - (uint64_t)(EarlierStart - LaterStart) < EarlierSize &&
+ KillingSize > (uint64_t)(DeadStart - KillingStart)) {
+    // Note: "KillingSize - (uint64_t)(DeadStart - KillingStart)" is known to
+ // be positive due to preceding checks.
+ assert(KillingSize - (uint64_t)(DeadStart - KillingStart) < DeadSize &&
"Should have been handled as OW_Complete");
- if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
- LaterSize, false)) {
+ if (tryToShorten(DeadI, DeadStart, DeadSize, KillingStart, KillingSize,
+ false)) {
IntervalMap.erase(OII);
return true;
}
@@ -726,71 +696,48 @@ static bool tryToShortenBegin(Instruction *EarlierWrite,
return false;
}
-static bool removePartiallyOverlappedStores(const DataLayout &DL,
- InstOverlapIntervalsTy &IOL,
- const TargetLibraryInfo &TLI) {
- bool Changed = false;
- for (auto OI : IOL) {
- Instruction *EarlierWrite = OI.first;
- MemoryLocation Loc = getLocForWrite(EarlierWrite, TLI);
- assert(isRemovable(EarlierWrite) && "Expect only removable instruction");
-
- const Value *Ptr = Loc.Ptr->stripPointerCasts();
- int64_t EarlierStart = 0;
- uint64_t EarlierSize = Loc.Size.getValue();
- GetPointerBaseWithConstantOffset(Ptr, EarlierStart, DL);
- OverlapIntervalsTy &IntervalMap = OI.second;
- Changed |=
- tryToShortenEnd(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
- if (IntervalMap.empty())
- continue;
- Changed |=
- tryToShortenBegin(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
- }
- return Changed;
-}
-
-static Constant *tryToMergePartialOverlappingStores(
- StoreInst *Earlier, StoreInst *Later, int64_t InstWriteOffset,
- int64_t DepWriteOffset, const DataLayout &DL, BatchAAResults &AA,
- DominatorTree *DT) {
-
- if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) &&
- DL.typeSizeEqualsStoreSize(Earlier->getValueOperand()->getType()) &&
- Later && isa<ConstantInt>(Later->getValueOperand()) &&
- DL.typeSizeEqualsStoreSize(Later->getValueOperand()->getType()) &&
- memoryIsNotModifiedBetween(Earlier, Later, AA, DL, DT)) {
+static Constant *
+tryToMergePartialOverlappingStores(StoreInst *KillingI, StoreInst *DeadI,
+ int64_t KillingOffset, int64_t DeadOffset,
+ const DataLayout &DL, BatchAAResults &AA,
+ DominatorTree *DT) {
+
+ if (DeadI && isa<ConstantInt>(DeadI->getValueOperand()) &&
+ DL.typeSizeEqualsStoreSize(DeadI->getValueOperand()->getType()) &&
+ KillingI && isa<ConstantInt>(KillingI->getValueOperand()) &&
+ DL.typeSizeEqualsStoreSize(KillingI->getValueOperand()->getType()) &&
+ memoryIsNotModifiedBetween(DeadI, KillingI, AA, DL, DT)) {
// If the store we find is:
// a) partially overwritten by the store to 'Loc'
- // b) the later store is fully contained in the earlier one and
+ // b) the killing store is fully contained in the dead one and
// c) they both have a constant value
// d) none of the two stores need padding
- // Merge the two stores, replacing the earlier store's value with a
+ // Merge the two stores, replacing the dead store's value with a
// merge of both values.
// TODO: Deal with other constant types (vectors, etc), and probably
// some mem intrinsics (if needed)
- APInt EarlierValue =
- cast<ConstantInt>(Earlier->getValueOperand())->getValue();
- APInt LaterValue = cast<ConstantInt>(Later->getValueOperand())->getValue();
- unsigned LaterBits = LaterValue.getBitWidth();
- assert(EarlierValue.getBitWidth() > LaterValue.getBitWidth());
- LaterValue = LaterValue.zext(EarlierValue.getBitWidth());
+ APInt DeadValue = cast<ConstantInt>(DeadI->getValueOperand())->getValue();
+ APInt KillingValue =
+ cast<ConstantInt>(KillingI->getValueOperand())->getValue();
+ unsigned KillingBits = KillingValue.getBitWidth();
+ assert(DeadValue.getBitWidth() > KillingValue.getBitWidth());
+ KillingValue = KillingValue.zext(DeadValue.getBitWidth());
// Offset of the smaller store inside the larger store
- unsigned BitOffsetDiff = (InstWriteOffset - DepWriteOffset) * 8;
- unsigned LShiftAmount = DL.isBigEndian() ? EarlierValue.getBitWidth() -
- BitOffsetDiff - LaterBits
- : BitOffsetDiff;
- APInt Mask = APInt::getBitsSet(EarlierValue.getBitWidth(), LShiftAmount,
- LShiftAmount + LaterBits);
+ unsigned BitOffsetDiff = (KillingOffset - DeadOffset) * 8;
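+    // On big-endian targets the byte at the lowest address holds the most
+    // significant bits, so the shift amount is measured from the top of the
+    // wider value rather than from bit zero.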
+ unsigned LShiftAmount =
+ DL.isBigEndian() ? DeadValue.getBitWidth() - BitOffsetDiff - KillingBits
+ : BitOffsetDiff;
+ APInt Mask = APInt::getBitsSet(DeadValue.getBitWidth(), LShiftAmount,
+ LShiftAmount + KillingBits);
// Clear the bits we'll be replacing, then OR with the smaller
// store, shifted appropriately.
- APInt Merged = (EarlierValue & ~Mask) | (LaterValue << LShiftAmount);
- LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *Earlier
- << "\n Later: " << *Later
+ APInt Merged = (DeadValue & ~Mask) | (KillingValue << LShiftAmount);
+ LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n Dead: " << *DeadI
+ << "\n Killing: " << *KillingI
<< "\n Merged Value: " << Merged << '\n');
- return ConstantInt::get(Earlier->getValueOperand()->getType(), Merged);
+ return ConstantInt::get(DeadI->getValueOperand()->getType(), Merged);
}
return nullptr;
}
@@ -819,14 +766,17 @@ bool isNoopIntrinsic(Instruction *I) {
}
// Check if we can ignore \p D for DSE.
-bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
+bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller,
+ const TargetLibraryInfo &TLI) {
Instruction *DI = D->getMemoryInst();
// Calls that only access inaccessible memory cannot read or write any memory
// locations we consider for elimination.
if (auto *CB = dyn_cast<CallBase>(DI))
- if (CB->onlyAccessesInaccessibleMemory())
+ if (CB->onlyAccessesInaccessibleMemory()) {
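+      // Allocation-like calls only access inaccessible memory themselves, but
+      // the object they return is still relevant for DSE, so do not skip them.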
+ if (isAllocLikeFn(DI, &TLI))
+ return false;
return true;
-
+ }
// We can eliminate stores to locations not visible to the caller across
// throwing instructions.
if (DI->mayThrow() && !DefVisibleToCaller)
@@ -841,7 +791,7 @@ bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
return true;
// Skip intrinsics that do not really read or modify memory.
- if (isNoopIntrinsic(D->getMemoryInst()))
+ if (isNoopIntrinsic(DI))
return true;
return false;
@@ -850,6 +800,7 @@ bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
struct DSEState {
Function &F;
AliasAnalysis &AA;
+ EarliestEscapeInfo EI;
/// The single BatchAA instance that is used to cache AA queries. It will
/// not be invalidated over the whole run. This is safe, because:
@@ -892,30 +843,29 @@ struct DSEState {
/// basic block.
DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;
+ // Class contains self-reference, make sure it's not copied/moved.
+ DSEState(const DSEState &) = delete;
+ DSEState &operator=(const DSEState &) = delete;
+
DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
const LoopInfo &LI)
- : F(F), AA(AA), BatchAA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI),
- DL(F.getParent()->getDataLayout()), LI(LI) {}
-
- static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
- DominatorTree &DT, PostDominatorTree &PDT,
- const TargetLibraryInfo &TLI, const LoopInfo &LI) {
- DSEState State(F, AA, MSSA, DT, PDT, TLI, LI);
+ : F(F), AA(AA), EI(DT, LI), BatchAA(AA, &EI), MSSA(MSSA), DT(DT),
+ PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI) {
// Collect blocks with throwing instructions not modeled in MemorySSA and
// alloc-like objects.
unsigned PO = 0;
for (BasicBlock *BB : post_order(&F)) {
- State.PostOrderNumbers[BB] = PO++;
+ PostOrderNumbers[BB] = PO++;
for (Instruction &I : *BB) {
MemoryAccess *MA = MSSA.getMemoryAccess(&I);
if (I.mayThrow() && !MA)
- State.ThrowingBlocks.insert(I.getParent());
+ ThrowingBlocks.insert(I.getParent());
auto *MD = dyn_cast_or_null<MemoryDef>(MA);
- if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit &&
- (State.getLocForWriteEx(&I) || State.isMemTerminatorInst(&I)))
- State.MemDefs.push_back(MD);
+ if (MD && MemDefs.size() < MemorySSADefsPerBlockLimit &&
+ (getLocForWriteEx(&I) || isMemTerminatorInst(&I)))
+ MemDefs.push_back(MD);
}
}
@@ -925,131 +875,134 @@ struct DSEState {
if (AI.hasPassPointeeByValueCopyAttr()) {
// For byval, the caller doesn't know the address of the allocation.
if (AI.hasByValAttr())
- State.InvisibleToCallerBeforeRet.insert({&AI, true});
- State.InvisibleToCallerAfterRet.insert({&AI, true});
+ InvisibleToCallerBeforeRet.insert({&AI, true});
+ InvisibleToCallerAfterRet.insert({&AI, true});
}
// Collect whether there is any irreducible control flow in the function.
- State.ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI);
-
- return State;
+ ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI);
}
- /// Return 'OW_Complete' if a store to the 'Later' location (by \p LaterI
- /// instruction) completely overwrites a store to the 'Earlier' location.
- /// (by \p EarlierI instruction).
- /// Return OW_MaybePartial if \p Later does not completely overwrite
- /// \p Earlier, but they both write to the same underlying object. In that
- /// case, use isPartialOverwrite to check if \p Later partially overwrites
- /// \p Earlier. Returns 'OW_Unknown' if nothing can be determined.
- OverwriteResult
- isOverwrite(const Instruction *LaterI, const Instruction *EarlierI,
- const MemoryLocation &Later, const MemoryLocation &Earlier,
- int64_t &EarlierOff, int64_t &LaterOff) {
+ /// Return 'OW_Complete' if a store to the 'KillingLoc' location (by \p
+ /// KillingI instruction) completely overwrites a store to the 'DeadLoc'
+ /// location (by \p DeadI instruction).
+ /// Return OW_MaybePartial if \p KillingI does not completely overwrite
+ /// \p DeadI, but they both write to the same underlying object. In that
+ /// case, use isPartialOverwrite to check if \p KillingI partially overwrites
+ /// \p DeadI. Returns 'OW_Unknown' if nothing can be determined.
+ OverwriteResult isOverwrite(const Instruction *KillingI,
+ const Instruction *DeadI,
+ const MemoryLocation &KillingLoc,
+ const MemoryLocation &DeadLoc,
+ int64_t &KillingOff, int64_t &DeadOff) {
// AliasAnalysis does not always account for loops. Limit overwrite checks
- // to dependencies for which we can guarantee they are independant of any
+ // to dependencies for which we can guarantee they are independent of any
// loops they are in.
- if (!isGuaranteedLoopIndependent(EarlierI, LaterI, Earlier))
+ if (!isGuaranteedLoopIndependent(DeadI, KillingI, DeadLoc))
return OW_Unknown;
// FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll
// get imprecise values here, though (except for unknown sizes).
- if (!Later.Size.isPrecise() || !Earlier.Size.isPrecise()) {
+ if (!KillingLoc.Size.isPrecise() || !DeadLoc.Size.isPrecise()) {
      // In case no constant size is known, try to use the IR values for the number
// of bytes written and check if they match.
- const auto *LaterMemI = dyn_cast<MemIntrinsic>(LaterI);
- const auto *EarlierMemI = dyn_cast<MemIntrinsic>(EarlierI);
- if (LaterMemI && EarlierMemI) {
- const Value *LaterV = LaterMemI->getLength();
- const Value *EarlierV = EarlierMemI->getLength();
- if (LaterV == EarlierV && BatchAA.isMustAlias(Earlier, Later))
+ const auto *KillingMemI = dyn_cast<MemIntrinsic>(KillingI);
+ const auto *DeadMemI = dyn_cast<MemIntrinsic>(DeadI);
+ if (KillingMemI && DeadMemI) {
+ const Value *KillingV = KillingMemI->getLength();
+ const Value *DeadV = DeadMemI->getLength();
+ if (KillingV == DeadV && BatchAA.isMustAlias(DeadLoc, KillingLoc))
return OW_Complete;
}
// Masked stores have imprecise locations, but we can reason about them
// to some extent.
- return isMaskedStoreOverwrite(LaterI, EarlierI, BatchAA);
+ return isMaskedStoreOverwrite(KillingI, DeadI, BatchAA);
}
- const uint64_t LaterSize = Later.Size.getValue();
- const uint64_t EarlierSize = Earlier.Size.getValue();
+ const uint64_t KillingSize = KillingLoc.Size.getValue();
+ const uint64_t DeadSize = DeadLoc.Size.getValue();
// Query the alias information
- AliasResult AAR = BatchAA.alias(Later, Earlier);
+ AliasResult AAR = BatchAA.alias(KillingLoc, DeadLoc);
// If the start pointers are the same, we just have to compare sizes to see if
- // the later store was larger than the earlier store.
+ // the killing store was larger than the dead store.
if (AAR == AliasResult::MustAlias) {
- // Make sure that the Later size is >= the Earlier size.
- if (LaterSize >= EarlierSize)
+      // Make sure that KillingSize is >= DeadSize.
+ if (KillingSize >= DeadSize)
return OW_Complete;
}
// If we hit a partial alias we may have a full overwrite
if (AAR == AliasResult::PartialAlias && AAR.hasOffset()) {
int32_t Off = AAR.getOffset();
- if (Off >= 0 && (uint64_t)Off + EarlierSize <= LaterSize)
+ if (Off >= 0 && (uint64_t)Off + DeadSize <= KillingSize)
return OW_Complete;
}
- // Check to see if the later store is to the entire object (either a global,
- // an alloca, or a byval/inalloca argument). If so, then it clearly
+ // Check to see if the killing store is to the entire object (either a
+ // global, an alloca, or a byval/inalloca argument). If so, then it clearly
// overwrites any other store to the same object.
- const Value *P1 = Earlier.Ptr->stripPointerCasts();
- const Value *P2 = Later.Ptr->stripPointerCasts();
- const Value *UO1 = getUnderlyingObject(P1), *UO2 = getUnderlyingObject(P2);
+ const Value *DeadPtr = DeadLoc.Ptr->stripPointerCasts();
+ const Value *KillingPtr = KillingLoc.Ptr->stripPointerCasts();
+ const Value *DeadUndObj = getUnderlyingObject(DeadPtr);
+ const Value *KillingUndObj = getUnderlyingObject(KillingPtr);
// If we can't resolve the same pointers to the same object, then we can't
// analyze them at all.
- if (UO1 != UO2)
+ if (DeadUndObj != KillingUndObj)
return OW_Unknown;
- // If the "Later" store is to a recognizable object, get its size.
- uint64_t ObjectSize = getPointerSize(UO2, DL, TLI, &F);
- if (ObjectSize != MemoryLocation::UnknownSize)
- if (ObjectSize == LaterSize && ObjectSize >= EarlierSize)
+ // If the KillingI store is to a recognizable object, get its size.
+ uint64_t KillingUndObjSize = getPointerSize(KillingUndObj, DL, TLI, &F);
+ if (KillingUndObjSize != MemoryLocation::UnknownSize)
+ if (KillingUndObjSize == KillingSize && KillingUndObjSize >= DeadSize)
return OW_Complete;
// Okay, we have stores to two completely different pointers. Try to
// decompose the pointer into a "base + constant_offset" form. If the base
// pointers are equal, then we can reason about the two stores.
- EarlierOff = 0;
- LaterOff = 0;
- const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, DL);
- const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, DL);
-
- // If the base pointers still differ, we have two completely different stores.
- if (BP1 != BP2)
+ DeadOff = 0;
+ KillingOff = 0;
+ const Value *DeadBasePtr =
+ GetPointerBaseWithConstantOffset(DeadPtr, DeadOff, DL);
+ const Value *KillingBasePtr =
+ GetPointerBaseWithConstantOffset(KillingPtr, KillingOff, DL);
+
+ // If the base pointers still differ, we have two completely different
+ // stores.
+ if (DeadBasePtr != KillingBasePtr)
return OW_Unknown;
- // The later access completely overlaps the earlier store if and only if
- // both start and end of the earlier one is "inside" the later one:
- // |<->|--earlier--|<->|
- // |-------later-------|
+ // The killing access completely overlaps the dead store if and only if
+    // both start and end of the dead one are "inside" the killing one:
+ // |<->|--dead--|<->|
+ // |-----killing------|
// Accesses may overlap if and only if start of one of them is "inside"
// another one:
- // |<->|--earlier--|<----->|
- // |-------later-------|
+ // |<->|--dead--|<-------->|
+ // |-------killing--------|
// OR
- // |----- earlier -----|
- // |<->|---later---|<----->|
+ // |-------dead-------|
+ // |<->|---killing---|<----->|
//
// We have to be careful here as *Off is signed while *.Size is unsigned.
- // Check if the earlier access starts "not before" the later one.
- if (EarlierOff >= LaterOff) {
- // If the earlier access ends "not after" the later access then the earlier
- // one is completely overwritten by the later one.
- if (uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize)
+ // Check if the dead access starts "not before" the killing one.
+ if (DeadOff >= KillingOff) {
+ // If the dead access ends "not after" the killing access then the
+ // dead one is completely overwritten by the killing one.
+ if (uint64_t(DeadOff - KillingOff) + DeadSize <= KillingSize)
return OW_Complete;
- // If start of the earlier access is "before" end of the later access then
- // accesses overlap.
- else if ((uint64_t)(EarlierOff - LaterOff) < LaterSize)
+ // If start of the dead access is "before" end of the killing access
+ // then accesses overlap.
+ else if ((uint64_t)(DeadOff - KillingOff) < KillingSize)
return OW_MaybePartial;
}
- // If start of the later access is "before" end of the earlier access then
+ // If start of the killing access is "before" end of the dead access then
// accesses overlap.
- else if ((uint64_t)(LaterOff - EarlierOff) < EarlierSize) {
+ else if ((uint64_t)(KillingOff - DeadOff) < DeadSize) {
return OW_MaybePartial;
}
@@ -1106,8 +1059,13 @@ struct DSEState {
LibFunc LF;
if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) {
switch (LF) {
- case LibFunc_strcpy:
case LibFunc_strncpy:
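+      // strncpy with a constant length writes exactly that many bytes, so a
+      // precise location can be used; otherwise fall through and treat the
+      // write as covering an unknown number of bytes after the destination.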
+ if (const auto *Len = dyn_cast<ConstantInt>(CB->getArgOperand(2)))
+ return MemoryLocation(CB->getArgOperand(0),
+ LocationSize::precise(Len->getZExtValue()),
+ CB->getAAMetadata());
+ LLVM_FALLTHROUGH;
+ case LibFunc_strcpy:
case LibFunc_strcat:
case LibFunc_strncat:
return {MemoryLocation::getAfter(CB->getArgOperand(0))};
@@ -1145,8 +1103,8 @@ struct DSEState {
int64_t InstWriteOffset, DepWriteOffset;
if (auto CC = getLocForWriteEx(UseInst))
- return isOverwrite(UseInst, DefInst, *CC, DefLoc, DepWriteOffset,
- InstWriteOffset) == OW_Complete;
+ return isOverwrite(UseInst, DefInst, *CC, DefLoc, InstWriteOffset,
+ DepWriteOffset) == OW_Complete;
return false;
}
@@ -1248,9 +1206,10 @@ struct DSEState {
const Value *LocUO = getUnderlyingObject(Loc.Ptr);
return BatchAA.isMustAlias(TermLoc.Ptr, LocUO);
}
- int64_t InstWriteOffset, DepWriteOffset;
- return isOverwrite(MaybeTerm, AccessI, TermLoc, Loc, DepWriteOffset,
- InstWriteOffset) == OW_Complete;
+ int64_t InstWriteOffset = 0;
+ int64_t DepWriteOffset = 0;
+ return isOverwrite(MaybeTerm, AccessI, TermLoc, Loc, InstWriteOffset,
+ DepWriteOffset) == OW_Complete;
}
// Returns true if \p Use may read from \p DefLoc.
@@ -1270,10 +1229,6 @@ struct DSEState {
if (CB->onlyAccessesInaccessibleMemory())
return false;
- // NOTE: For calls, the number of stores removed could be slightly improved
- // by using AA.callCapturesBefore(UseInst, DefLoc, &DT), but that showed to
- // be expensive compared to the benefits in practice. For now, avoid more
- // expensive analysis to limit compile-time.
return isRefSet(BatchAA.getModRefInfo(UseInst, DefLoc));
}
@@ -1329,15 +1284,15 @@ struct DSEState {
return IsGuaranteedLoopInvariantBase(Ptr);
}
- // Find a MemoryDef writing to \p DefLoc and dominating \p StartAccess, with
- // no read access between them or on any other path to a function exit block
- // if \p DefLoc is not accessible after the function returns. If there is no
- // such MemoryDef, return None. The returned value may not (completely)
- // overwrite \p DefLoc. Currently we bail out when we encounter an aliasing
- // MemoryUse (read).
+ // Find a MemoryDef writing to \p KillingLoc and dominating \p StartAccess,
+ // with no read access between them or on any other path to a function exit
+ // block if \p KillingLoc is not accessible after the function returns. If
+ // there is no such MemoryDef, return None. The returned value may not
+ // (completely) overwrite \p KillingLoc. Currently we bail out when we
+ // encounter an aliasing MemoryUse (read).
Optional<MemoryAccess *>
getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *StartAccess,
- const MemoryLocation &DefLoc, const Value *DefUO,
+ const MemoryLocation &KillingLoc, const Value *KillingUndObj,
unsigned &ScanLimit, unsigned &WalkerStepLimit,
bool IsMemTerm, unsigned &PartialLimit) {
if (ScanLimit == 0 || WalkerStepLimit == 0) {
@@ -1389,19 +1344,20 @@ struct DSEState {
MemoryDef *CurrentDef = cast<MemoryDef>(Current);
Instruction *CurrentI = CurrentDef->getMemoryInst();
- if (canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(DefUO)))
+ if (canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(KillingUndObj),
+ TLI))
continue;
// Before we try to remove anything, check for any extra throwing
// instructions that block us from DSEing
- if (mayThrowBetween(KillingI, CurrentI, DefUO)) {
+ if (mayThrowBetween(KillingI, CurrentI, KillingUndObj)) {
LLVM_DEBUG(dbgs() << " ... skip, may throw!\n");
return None;
}
// Check for anything that looks like it will be a barrier to further
// removal
- if (isDSEBarrier(DefUO, CurrentI)) {
+ if (isDSEBarrier(KillingUndObj, CurrentI)) {
LLVM_DEBUG(dbgs() << " ... skip, barrier\n");
return None;
}
@@ -1410,14 +1366,14 @@ struct DSEState {
// clobber, bail out, as the path is not profitable. We skip this check
// for intrinsic calls, because the code knows how to handle memcpy
// intrinsics.
- if (!isa<IntrinsicInst>(CurrentI) && isReadClobber(DefLoc, CurrentI))
+ if (!isa<IntrinsicInst>(CurrentI) && isReadClobber(KillingLoc, CurrentI))
return None;
// Quick check if there are direct uses that are read-clobbers.
- if (any_of(Current->uses(), [this, &DefLoc, StartAccess](Use &U) {
+ if (any_of(Current->uses(), [this, &KillingLoc, StartAccess](Use &U) {
if (auto *UseOrDef = dyn_cast<MemoryUseOrDef>(U.getUser()))
return !MSSA.dominates(StartAccess, UseOrDef) &&
- isReadClobber(DefLoc, UseOrDef->getMemoryInst());
+ isReadClobber(KillingLoc, UseOrDef->getMemoryInst());
return false;
})) {
LLVM_DEBUG(dbgs() << " ... found a read clobber\n");
@@ -1450,9 +1406,10 @@ struct DSEState {
if (!isMemTerminator(*CurrentLoc, CurrentI, KillingI))
continue;
} else {
- int64_t InstWriteOffset, DepWriteOffset;
- auto OR = isOverwrite(KillingI, CurrentI, DefLoc, *CurrentLoc,
- DepWriteOffset, InstWriteOffset);
+ int64_t KillingOffset = 0;
+ int64_t DeadOffset = 0;
+ auto OR = isOverwrite(KillingI, CurrentI, KillingLoc, *CurrentLoc,
+ KillingOffset, DeadOffset);
// If Current does not write to the same object as KillingDef, check
// the next candidate.
if (OR == OW_Unknown)
@@ -1473,30 +1430,25 @@ struct DSEState {
};
// Accesses to objects accessible after the function returns can only be
- // eliminated if the access is killed along all paths to the exit. Collect
+ // eliminated if the access is dead along all paths to the exit. Collect
// the blocks with killing (=completely overwriting MemoryDefs) and check if
- // they cover all paths from EarlierAccess to any function exit.
+ // they cover all paths from MaybeDeadAccess to any function exit.
SmallPtrSet<Instruction *, 16> KillingDefs;
KillingDefs.insert(KillingDef->getMemoryInst());
- MemoryAccess *EarlierAccess = Current;
- Instruction *EarlierMemInst =
- cast<MemoryDef>(EarlierAccess)->getMemoryInst();
- LLVM_DEBUG(dbgs() << " Checking for reads of " << *EarlierAccess << " ("
- << *EarlierMemInst << ")\n");
+ MemoryAccess *MaybeDeadAccess = Current;
+ MemoryLocation MaybeDeadLoc = *CurrentLoc;
+ Instruction *MaybeDeadI = cast<MemoryDef>(MaybeDeadAccess)->getMemoryInst();
+ LLVM_DEBUG(dbgs() << " Checking for reads of " << *MaybeDeadAccess << " ("
+ << *MaybeDeadI << ")\n");
SmallSetVector<MemoryAccess *, 32> WorkList;
auto PushMemUses = [&WorkList](MemoryAccess *Acc) {
for (Use &U : Acc->uses())
WorkList.insert(cast<MemoryAccess>(U.getUser()));
};
- PushMemUses(EarlierAccess);
-
- // Optimistically collect all accesses for reads. If we do not find any
- // read clobbers, add them to the cache.
- SmallPtrSet<MemoryAccess *, 16> KnownNoReads;
- if (!EarlierMemInst->mayReadFromMemory())
- KnownNoReads.insert(EarlierAccess);
- // Check if EarlierDef may be read.
+ PushMemUses(MaybeDeadAccess);
+
+ // Check if DeadDef may be read.
for (unsigned I = 0; I < WorkList.size(); I++) {
MemoryAccess *UseAccess = WorkList[I];
@@ -1508,7 +1460,6 @@ struct DSEState {
}
--ScanLimit;
NumDomMemDefChecks++;
- KnownNoReads.insert(UseAccess);
if (isa<MemoryPhi>(UseAccess)) {
if (any_of(KillingDefs, [this, UseAccess](Instruction *KI) {
@@ -1535,7 +1486,7 @@ struct DSEState {
      // A memory terminator kills all preceding MemoryDefs and all succeeding
      // MemoryAccesses. We do not have to check its users.
- if (isMemTerminator(*CurrentLoc, EarlierMemInst, UseInst)) {
+ if (isMemTerminator(MaybeDeadLoc, MaybeDeadI, UseInst)) {
LLVM_DEBUG(
dbgs()
<< " ... skipping, memterminator invalidates following accesses\n");
@@ -1548,14 +1499,14 @@ struct DSEState {
continue;
}
- if (UseInst->mayThrow() && !isInvisibleToCallerBeforeRet(DefUO)) {
+ if (UseInst->mayThrow() && !isInvisibleToCallerBeforeRet(KillingUndObj)) {
LLVM_DEBUG(dbgs() << " ... found throwing instruction\n");
return None;
}
// Uses which may read the original MemoryDef mean we cannot eliminate the
// original MD. Stop walk.
- if (isReadClobber(*CurrentLoc, UseInst)) {
+ if (isReadClobber(MaybeDeadLoc, UseInst)) {
LLVM_DEBUG(dbgs() << " ... found read clobber\n");
return None;
}
@@ -1563,16 +1514,16 @@ struct DSEState {
// If this worklist walks back to the original memory access (and the
      // pointer is not guaranteed loop invariant) then we cannot assume that a
// store kills itself.
- if (EarlierAccess == UseAccess &&
- !isGuaranteedLoopInvariant(CurrentLoc->Ptr)) {
+ if (MaybeDeadAccess == UseAccess &&
+ !isGuaranteedLoopInvariant(MaybeDeadLoc.Ptr)) {
LLVM_DEBUG(dbgs() << " ... found not loop invariant self access\n");
return None;
}
- // Otherwise, for the KillingDef and EarlierAccess we only have to check
+ // Otherwise, for the KillingDef and MaybeDeadAccess we only have to check
// if it reads the memory location.
// TODO: It would probably be better to check for self-reads before
// calling the function.
- if (KillingDef == UseAccess || EarlierAccess == UseAccess) {
+ if (KillingDef == UseAccess || MaybeDeadAccess == UseAccess) {
LLVM_DEBUG(dbgs() << " ... skipping killing def/dom access\n");
continue;
}
@@ -1581,18 +1532,18 @@ struct DSEState {
// the original location. Otherwise we have to check uses of *all*
// MemoryDefs we discover, including non-aliasing ones. Otherwise we might
// miss cases like the following
- // 1 = Def(LoE) ; <----- EarlierDef stores [0,1]
+ // 1 = Def(LoE) ; <----- DeadDef stores [0,1]
// 2 = Def(1) ; (2, 1) = NoAlias, stores [2,3]
// Use(2) ; MayAlias 2 *and* 1, loads [0, 3].
// (The Use points to the *first* Def it may alias)
// 3 = Def(1) ; <---- Current (3, 2) = NoAlias, (3,1) = MayAlias,
// stores [0,1]
if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess)) {
- if (isCompleteOverwrite(*CurrentLoc, EarlierMemInst, UseInst)) {
+ if (isCompleteOverwrite(MaybeDeadLoc, MaybeDeadI, UseInst)) {
BasicBlock *MaybeKillingBlock = UseInst->getParent();
if (PostOrderNumbers.find(MaybeKillingBlock)->second <
- PostOrderNumbers.find(EarlierAccess->getBlock())->second) {
- if (!isInvisibleToCallerAfterRet(DefUO)) {
+ PostOrderNumbers.find(MaybeDeadAccess->getBlock())->second) {
+ if (!isInvisibleToCallerAfterRet(KillingUndObj)) {
LLVM_DEBUG(dbgs()
<< " ... found killing def " << *UseInst << "\n");
KillingDefs.insert(UseInst);
@@ -1608,9 +1559,9 @@ struct DSEState {
}
// For accesses to locations visible after the function returns, make sure
- // that the location is killed (=overwritten) along all paths from
- // EarlierAccess to the exit.
- if (!isInvisibleToCallerAfterRet(DefUO)) {
+ // that the location is dead (=overwritten) along all paths from
+ // MaybeDeadAccess to the exit.
+ if (!isInvisibleToCallerAfterRet(KillingUndObj)) {
SmallPtrSet<BasicBlock *, 16> KillingBlocks;
for (Instruction *KD : KillingDefs)
KillingBlocks.insert(KD->getParent());
@@ -1619,25 +1570,24 @@ struct DSEState {
// Find the common post-dominator of all killing blocks.
BasicBlock *CommonPred = *KillingBlocks.begin();
- for (auto I = std::next(KillingBlocks.begin()), E = KillingBlocks.end();
- I != E; I++) {
+ for (BasicBlock *BB : llvm::drop_begin(KillingBlocks)) {
if (!CommonPred)
break;
- CommonPred = PDT.findNearestCommonDominator(CommonPred, *I);
+ CommonPred = PDT.findNearestCommonDominator(CommonPred, BB);
}
// If CommonPred is in the set of killing blocks, just check if it
- // post-dominates EarlierAccess.
+ // post-dominates MaybeDeadAccess.
if (KillingBlocks.count(CommonPred)) {
- if (PDT.dominates(CommonPred, EarlierAccess->getBlock()))
- return {EarlierAccess};
+ if (PDT.dominates(CommonPred, MaybeDeadAccess->getBlock()))
+ return {MaybeDeadAccess};
return None;
}
- // If the common post-dominator does not post-dominate EarlierAccess,
- // there is a path from EarlierAccess to an exit not going through a
+ // If the common post-dominator does not post-dominate MaybeDeadAccess,
+ // there is a path from MaybeDeadAccess to an exit not going through a
// killing block.
- if (PDT.dominates(CommonPred, EarlierAccess->getBlock())) {
+ if (PDT.dominates(CommonPred, MaybeDeadAccess->getBlock())) {
SetVector<BasicBlock *> WorkList;
// If CommonPred is null, there are multiple exits from the function.
@@ -1650,16 +1600,16 @@ struct DSEState {
NumCFGTries++;
// Check if all paths starting from an exit node go through one of the
- // killing blocks before reaching EarlierAccess.
+ // killing blocks before reaching MaybeDeadAccess.
for (unsigned I = 0; I < WorkList.size(); I++) {
NumCFGChecks++;
BasicBlock *Current = WorkList[I];
if (KillingBlocks.count(Current))
continue;
- if (Current == EarlierAccess->getBlock())
+ if (Current == MaybeDeadAccess->getBlock())
return None;
- // EarlierAccess is reachable from the entry, so we don't have to
+ // MaybeDeadAccess is reachable from the entry, so we don't have to
// explore unreachable blocks further.
if (!DT.isReachableFromEntry(Current))
continue;
@@ -1671,14 +1621,14 @@ struct DSEState {
return None;
}
NumCFGSuccess++;
- return {EarlierAccess};
+ return {MaybeDeadAccess};
}
return None;
}
- // No aliasing MemoryUses of EarlierAccess found, EarlierAccess is
+ // No aliasing MemoryUses of MaybeDeadAccess found, MaybeDeadAccess is
// potentially dead.
- return {EarlierAccess};
+ return {MaybeDeadAccess};
}
// Delete dead memory defs
@@ -1701,6 +1651,7 @@ struct DSEState {
if (MemoryDef *MD = dyn_cast<MemoryDef>(MA)) {
SkipStores.insert(MD);
}
+
Updater.removeMemoryAccess(MA);
}
@@ -1715,47 +1666,49 @@ struct DSEState {
NowDeadInsts.push_back(OpI);
}
+ EI.removeInstruction(DeadInst);
DeadInst->eraseFromParent();
}
}
- // Check for any extra throws between SI and NI that block DSE. This only
- // checks extra maythrows (those that aren't MemoryDef's). MemoryDef that may
- // throw are handled during the walk from one def to the next.
- bool mayThrowBetween(Instruction *SI, Instruction *NI,
- const Value *SILocUnd) {
- // First see if we can ignore it by using the fact that SI is an
+ // Check for any extra throws between \p KillingI and \p DeadI that block
+ // DSE. This only checks extra maythrows (those that aren't MemoryDef's).
+ // MemoryDef that may throw are handled during the walk from one def to the
+ // next.
+ bool mayThrowBetween(Instruction *KillingI, Instruction *DeadI,
+ const Value *KillingUndObj) {
+ // First see if we can ignore it by using the fact that KillingI is an
// alloca/alloca like object that is not visible to the caller during
// execution of the function.
- if (SILocUnd && isInvisibleToCallerBeforeRet(SILocUnd))
+ if (KillingUndObj && isInvisibleToCallerBeforeRet(KillingUndObj))
return false;
- if (SI->getParent() == NI->getParent())
- return ThrowingBlocks.count(SI->getParent());
+ if (KillingI->getParent() == DeadI->getParent())
+ return ThrowingBlocks.count(KillingI->getParent());
return !ThrowingBlocks.empty();
}
- // Check if \p NI acts as a DSE barrier for \p SI. The following instructions
- // act as barriers:
- // * A memory instruction that may throw and \p SI accesses a non-stack
+ // Check if \p DeadI acts as a DSE barrier for \p KillingI. The following
+ // instructions act as barriers:
+ // * A memory instruction that may throw and \p KillingI accesses a non-stack
// object.
  // * Atomic stores stronger than monotonic.
- bool isDSEBarrier(const Value *SILocUnd, Instruction *NI) {
- // If NI may throw it acts as a barrier, unless we are to an alloca/alloca
- // like object that does not escape.
- if (NI->mayThrow() && !isInvisibleToCallerBeforeRet(SILocUnd))
+ bool isDSEBarrier(const Value *KillingUndObj, Instruction *DeadI) {
+    // If DeadI may throw it acts as a barrier, unless the access is to an
+    // alloca or alloca-like object that does not escape.
+ if (DeadI->mayThrow() && !isInvisibleToCallerBeforeRet(KillingUndObj))
return true;
- // If NI is an atomic load/store stronger than monotonic, do not try to
+ // If DeadI is an atomic load/store stronger than monotonic, do not try to
// eliminate/reorder it.
- if (NI->isAtomic()) {
- if (auto *LI = dyn_cast<LoadInst>(NI))
+ if (DeadI->isAtomic()) {
+ if (auto *LI = dyn_cast<LoadInst>(DeadI))
return isStrongerThanMonotonic(LI->getOrdering());
- if (auto *SI = dyn_cast<StoreInst>(NI))
+ if (auto *SI = dyn_cast<StoreInst>(DeadI))
return isStrongerThanMonotonic(SI->getOrdering());
- if (auto *ARMW = dyn_cast<AtomicRMWInst>(NI))
+ if (auto *ARMW = dyn_cast<AtomicRMWInst>(DeadI))
return isStrongerThanMonotonic(ARMW->getOrdering());
- if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(NI))
+ if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(DeadI))
return isStrongerThanMonotonic(CmpXchg->getSuccessOrdering()) ||
isStrongerThanMonotonic(CmpXchg->getFailureOrdering());
llvm_unreachable("other instructions should be skipped in MemorySSA");
@@ -1776,7 +1729,6 @@ struct DSEState {
continue;
Instruction *DefI = Def->getMemoryInst();
- SmallVector<const Value *, 4> Pointers;
auto DefLoc = getLocForWriteEx(DefI);
if (!DefLoc)
continue;
@@ -1787,7 +1739,7 @@ struct DSEState {
// uncommon. If it turns out to be important, we can use
// getUnderlyingObjects here instead.
const Value *UO = getUnderlyingObject(DefLoc->Ptr);
- if (!UO || !isInvisibleToCallerAfterRet(UO))
+ if (!isInvisibleToCallerAfterRet(UO))
continue;
if (isWriteAtEndOfFunction(Def)) {
@@ -1804,8 +1756,7 @@ struct DSEState {
/// \returns true if \p Def is a no-op store, either because it
/// directly stores back a loaded value or stores zero to a calloced object.
- bool storeIsNoop(MemoryDef *Def, const MemoryLocation &DefLoc,
- const Value *DefUO) {
+ bool storeIsNoop(MemoryDef *Def, const Value *DefUO) {
StoreInst *Store = dyn_cast<StoreInst>(Def->getMemoryInst());
MemSetInst *MemSet = dyn_cast<MemSetInst>(Def->getMemoryInst());
Constant *StoredConstant = nullptr;
@@ -1816,13 +1767,78 @@ struct DSEState {
if (StoredConstant && StoredConstant->isNullValue()) {
auto *DefUOInst = dyn_cast<Instruction>(DefUO);
- if (DefUOInst && isCallocLikeFn(DefUOInst, &TLI)) {
- auto *UnderlyingDef = cast<MemoryDef>(MSSA.getMemoryAccess(DefUOInst));
- // If UnderlyingDef is the clobbering access of Def, no instructions
- // between them can modify the memory location.
- auto *ClobberDef =
- MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def);
- return UnderlyingDef == ClobberDef;
+ if (DefUOInst) {
+ if (isCallocLikeFn(DefUOInst, &TLI)) {
+ auto *UnderlyingDef =
+ cast<MemoryDef>(MSSA.getMemoryAccess(DefUOInst));
+ // If UnderlyingDef is the clobbering access of Def, no instructions
+ // between them can modify the memory location.
+ auto *ClobberDef =
+ MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def);
+ return UnderlyingDef == ClobberDef;
+ }
+
+ if (MemSet) {
+ if (F.hasFnAttribute(Attribute::SanitizeMemory) ||
+ F.hasFnAttribute(Attribute::SanitizeAddress) ||
+ F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
+ F.getName() == "calloc")
+ return false;
+ auto *Malloc = const_cast<CallInst *>(dyn_cast<CallInst>(DefUOInst));
+ if (!Malloc)
+ return false;
+ auto *InnerCallee = Malloc->getCalledFunction();
+ if (!InnerCallee)
+ return false;
+ LibFunc Func;
+ if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) ||
+ Func != LibFunc_malloc)
+ return false;
+
+ auto shouldCreateCalloc = [](CallInst *Malloc, CallInst *Memset) {
+            // Check for a br(icmp(ptr, null), truebb, falsebb) pattern at
+            // the end of the malloc block.
+ auto *MallocBB = Malloc->getParent(),
+ *MemsetBB = Memset->getParent();
+ if (MallocBB == MemsetBB)
+ return true;
+ auto *Ptr = Memset->getArgOperand(0);
+ auto *TI = MallocBB->getTerminator();
+ ICmpInst::Predicate Pred;
+ BasicBlock *TrueBB, *FalseBB;
+ if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Ptr), m_Zero()), TrueBB,
+ FalseBB)))
+ return false;
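+            // The memset must sit on the branch taken when the malloc result
+            // is non-null, i.e. the false successor of the 'ptr == null'
+            // compare; otherwise it is not guaranteed to execute.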
+ if (Pred != ICmpInst::ICMP_EQ || MemsetBB != FalseBB)
+ return false;
+ return true;
+ };
+
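+          // Fold malloc followed by a zero memset into calloc only when the
+          // memset length matches the malloc size, the malloc dominates the
+          // memset, and nothing may modify the memory in between.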
+ if (Malloc->getOperand(0) == MemSet->getLength()) {
+ if (shouldCreateCalloc(Malloc, MemSet) &&
+ DT.dominates(Malloc, MemSet) &&
+ memoryIsNotModifiedBetween(Malloc, MemSet, BatchAA, DL, &DT)) {
+ IRBuilder<> IRB(Malloc);
+ const auto &DL = Malloc->getModule()->getDataLayout();
+ if (auto *Calloc =
+ emitCalloc(ConstantInt::get(IRB.getIntPtrTy(DL), 1),
+ Malloc->getArgOperand(0), IRB, TLI)) {
+ MemorySSAUpdater Updater(&MSSA);
+ auto *LastDef = cast<MemoryDef>(
+ Updater.getMemorySSA()->getMemoryAccess(Malloc));
+ auto *NewAccess = Updater.createMemoryAccessAfter(
+ cast<Instruction>(Calloc), LastDef, LastDef);
+ auto *NewAccessMD = cast<MemoryDef>(NewAccess);
+ Updater.insertDef(NewAccessMD, /*RenameUses=*/true);
+ Updater.removeMemoryAccess(Malloc);
+ Malloc->replaceAllUsesWith(Calloc);
+ Malloc->eraseFromParent();
+ return true;
+ }
+ return false;
+ }
+ }
+ }
}
}
@@ -1875,6 +1891,76 @@ struct DSEState {
return false;
}
+
+ bool removePartiallyOverlappedStores(InstOverlapIntervalsTy &IOL) {
+ bool Changed = false;
+ for (auto OI : IOL) {
+ Instruction *DeadI = OI.first;
+ MemoryLocation Loc = *getLocForWriteEx(DeadI);
+ assert(isRemovable(DeadI) && "Expect only removable instruction");
+
+ const Value *Ptr = Loc.Ptr->stripPointerCasts();
+ int64_t DeadStart = 0;
+ uint64_t DeadSize = Loc.Size.getValue();
+ GetPointerBaseWithConstantOffset(Ptr, DeadStart, DL);
+ OverlapIntervalsTy &IntervalMap = OI.second;
+ Changed |= tryToShortenEnd(DeadI, IntervalMap, DeadStart, DeadSize);
+ if (IntervalMap.empty())
+ continue;
+ Changed |= tryToShortenBegin(DeadI, IntervalMap, DeadStart, DeadSize);
+ }
+ return Changed;
+ }
+
+ /// Eliminates writes to locations where the value that is being written
+ /// is already stored at the same location.
+ bool eliminateRedundantStoresOfExistingValues() {
+ bool MadeChange = false;
+ LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs that write the "
+ "already existing value\n");
+ for (auto *Def : MemDefs) {
+ if (SkipStores.contains(Def) || MSSA.isLiveOnEntryDef(Def) ||
+ !isRemovable(Def->getMemoryInst()))
+ continue;
+ auto *UpperDef = dyn_cast<MemoryDef>(Def->getDefiningAccess());
+ if (!UpperDef || MSSA.isLiveOnEntryDef(UpperDef))
+ continue;
+
+ Instruction *DefInst = Def->getMemoryInst();
+ Instruction *UpperInst = UpperDef->getMemoryInst();
+ auto IsRedundantStore = [this, DefInst,
+ UpperInst](MemoryLocation UpperLoc) {
+ if (DefInst->isIdenticalTo(UpperInst))
+ return true;
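+        // A store is also redundant if a preceding memset already filled the
+        // whole location with the very byte value the store writes.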
+ if (auto *MemSetI = dyn_cast<MemSetInst>(UpperInst)) {
+ if (auto *SI = dyn_cast<StoreInst>(DefInst)) {
+ auto MaybeDefLoc = getLocForWriteEx(DefInst);
+ if (!MaybeDefLoc)
+ return false;
+ int64_t InstWriteOffset = 0;
+ int64_t DepWriteOffset = 0;
+ auto OR = isOverwrite(UpperInst, DefInst, UpperLoc, *MaybeDefLoc,
+ InstWriteOffset, DepWriteOffset);
+ Value *StoredByte = isBytewiseValue(SI->getValueOperand(), DL);
+ return StoredByte && StoredByte == MemSetI->getOperand(1) &&
+ OR == OW_Complete;
+ }
+ }
+ return false;
+ };
+
+ auto MaybeUpperLoc = getLocForWriteEx(UpperInst);
+ if (!MaybeUpperLoc || !IsRedundantStore(*MaybeUpperLoc) ||
+ isReadClobber(*MaybeUpperLoc, DefInst))
+ continue;
+ LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *DefInst
+ << '\n');
+ deleteDeadInstruction(DefInst);
+ NumRedundantStores++;
+ MadeChange = true;
+ }
+ return MadeChange;
+ }
};
static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
@@ -1883,68 +1969,64 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
const LoopInfo &LI) {
bool MadeChange = false;
- DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI, LI);
+ DSEState State(F, AA, MSSA, DT, PDT, TLI, LI);
// For each store:
for (unsigned I = 0; I < State.MemDefs.size(); I++) {
MemoryDef *KillingDef = State.MemDefs[I];
if (State.SkipStores.count(KillingDef))
continue;
- Instruction *SI = KillingDef->getMemoryInst();
+ Instruction *KillingI = KillingDef->getMemoryInst();
- Optional<MemoryLocation> MaybeSILoc;
- if (State.isMemTerminatorInst(SI))
- MaybeSILoc = State.getLocForTerminator(SI).map(
+ Optional<MemoryLocation> MaybeKillingLoc;
+ if (State.isMemTerminatorInst(KillingI))
+ MaybeKillingLoc = State.getLocForTerminator(KillingI).map(
[](const std::pair<MemoryLocation, bool> &P) { return P.first; });
else
- MaybeSILoc = State.getLocForWriteEx(SI);
+ MaybeKillingLoc = State.getLocForWriteEx(KillingI);
- if (!MaybeSILoc) {
+ if (!MaybeKillingLoc) {
LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for "
- << *SI << "\n");
+ << *KillingI << "\n");
continue;
}
- MemoryLocation SILoc = *MaybeSILoc;
- assert(SILoc.Ptr && "SILoc should not be null");
- const Value *SILocUnd = getUnderlyingObject(SILoc.Ptr);
-
- MemoryAccess *Current = KillingDef;
+ MemoryLocation KillingLoc = *MaybeKillingLoc;
+ assert(KillingLoc.Ptr && "KillingLoc should not be null");
+ const Value *KillingUndObj = getUnderlyingObject(KillingLoc.Ptr);
LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by "
- << *Current << " (" << *SI << ")\n");
+ << *KillingDef << " (" << *KillingI << ")\n");
unsigned ScanLimit = MemorySSAScanLimit;
unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit;
unsigned PartialLimit = MemorySSAPartialStoreLimit;
// Worklist of MemoryAccesses that may be killed by KillingDef.
SetVector<MemoryAccess *> ToCheck;
-
- if (SILocUnd)
- ToCheck.insert(KillingDef->getDefiningAccess());
+ ToCheck.insert(KillingDef->getDefiningAccess());
bool Shortend = false;
- bool IsMemTerm = State.isMemTerminatorInst(SI);
+ bool IsMemTerm = State.isMemTerminatorInst(KillingI);
// Check if MemoryAccesses in the worklist are killed by KillingDef.
for (unsigned I = 0; I < ToCheck.size(); I++) {
- Current = ToCheck[I];
+ MemoryAccess *Current = ToCheck[I];
if (State.SkipStores.count(Current))
continue;
- Optional<MemoryAccess *> Next = State.getDomMemoryDef(
- KillingDef, Current, SILoc, SILocUnd, ScanLimit, WalkerStepLimit,
- IsMemTerm, PartialLimit);
+ Optional<MemoryAccess *> MaybeDeadAccess = State.getDomMemoryDef(
+ KillingDef, Current, KillingLoc, KillingUndObj, ScanLimit,
+ WalkerStepLimit, IsMemTerm, PartialLimit);
- if (!Next) {
+ if (!MaybeDeadAccess) {
LLVM_DEBUG(dbgs() << " finished walk\n");
continue;
}
- MemoryAccess *EarlierAccess = *Next;
- LLVM_DEBUG(dbgs() << " Checking if we can kill " << *EarlierAccess);
- if (isa<MemoryPhi>(EarlierAccess)) {
+ MemoryAccess *DeadAccess = *MaybeDeadAccess;
+ LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DeadAccess);
+ if (isa<MemoryPhi>(DeadAccess)) {
LLVM_DEBUG(dbgs() << "\n ... adding incoming values to worklist\n");
- for (Value *V : cast<MemoryPhi>(EarlierAccess)->incoming_values()) {
+ for (Value *V : cast<MemoryPhi>(DeadAccess)->incoming_values()) {
MemoryAccess *IncomingAccess = cast<MemoryAccess>(V);
BasicBlock *IncomingBlock = IncomingAccess->getBlock();
- BasicBlock *PhiBlock = EarlierAccess->getBlock();
+ BasicBlock *PhiBlock = DeadAccess->getBlock();
// We only consider incoming MemoryAccesses that come before the
// MemoryPhi. Otherwise we could discover candidates that do not
@@ -1955,72 +2037,73 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
}
continue;
}
- auto *NextDef = cast<MemoryDef>(EarlierAccess);
- Instruction *NI = NextDef->getMemoryInst();
- LLVM_DEBUG(dbgs() << " (" << *NI << ")\n");
- ToCheck.insert(NextDef->getDefiningAccess());
+ auto *DeadDefAccess = cast<MemoryDef>(DeadAccess);
+ Instruction *DeadI = DeadDefAccess->getMemoryInst();
+ LLVM_DEBUG(dbgs() << " (" << *DeadI << ")\n");
+ ToCheck.insert(DeadDefAccess->getDefiningAccess());
NumGetDomMemoryDefPassed++;
if (!DebugCounter::shouldExecute(MemorySSACounter))
continue;
- MemoryLocation NILoc = *State.getLocForWriteEx(NI);
+ MemoryLocation DeadLoc = *State.getLocForWriteEx(DeadI);
if (IsMemTerm) {
- const Value *NIUnd = getUnderlyingObject(NILoc.Ptr);
- if (SILocUnd != NIUnd)
+ const Value *DeadUndObj = getUnderlyingObject(DeadLoc.Ptr);
+ if (KillingUndObj != DeadUndObj)
continue;
- LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI
- << "\n KILLER: " << *SI << '\n');
- State.deleteDeadInstruction(NI);
+ LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DeadI
+ << "\n KILLER: " << *KillingI << '\n');
+ State.deleteDeadInstruction(DeadI);
++NumFastStores;
MadeChange = true;
} else {
- // Check if NI overwrites SI.
- int64_t InstWriteOffset, DepWriteOffset;
- OverwriteResult OR = State.isOverwrite(SI, NI, SILoc, NILoc,
- DepWriteOffset, InstWriteOffset);
+ // Check if DeadI overwrites KillingI.
+ int64_t KillingOffset = 0;
+ int64_t DeadOffset = 0;
+ OverwriteResult OR = State.isOverwrite(
+ KillingI, DeadI, KillingLoc, DeadLoc, KillingOffset, DeadOffset);
if (OR == OW_MaybePartial) {
auto Iter = State.IOLs.insert(
std::make_pair<BasicBlock *, InstOverlapIntervalsTy>(
- NI->getParent(), InstOverlapIntervalsTy()));
+ DeadI->getParent(), InstOverlapIntervalsTy()));
auto &IOL = Iter.first->second;
- OR = isPartialOverwrite(SILoc, NILoc, DepWriteOffset, InstWriteOffset,
- NI, IOL);
+ OR = isPartialOverwrite(KillingLoc, DeadLoc, KillingOffset,
+ DeadOffset, DeadI, IOL);
}
if (EnablePartialStoreMerging && OR == OW_PartialEarlierWithFullLater) {
- auto *Earlier = dyn_cast<StoreInst>(NI);
- auto *Later = dyn_cast<StoreInst>(SI);
+ auto *DeadSI = dyn_cast<StoreInst>(DeadI);
+ auto *KillingSI = dyn_cast<StoreInst>(KillingI);
// We are re-using tryToMergePartialOverlappingStores, which requires
- // Earlier to domiante Later.
+ // DeadSI to dominate KillingSI.
// TODO: implement tryToMergeParialOverlappingStores using MemorySSA.
- if (Earlier && Later && DT.dominates(Earlier, Later)) {
+ if (DeadSI && KillingSI && DT.dominates(DeadSI, KillingSI)) {
if (Constant *Merged = tryToMergePartialOverlappingStores(
- Earlier, Later, InstWriteOffset, DepWriteOffset, State.DL,
+ KillingSI, DeadSI, KillingOffset, DeadOffset, State.DL,
State.BatchAA, &DT)) {
// Update stored value of earlier store to merged constant.
- Earlier->setOperand(0, Merged);
+ DeadSI->setOperand(0, Merged);
++NumModifiedStores;
MadeChange = true;
Shortend = true;
- // Remove later store and remove any outstanding overlap intervals
- // for the updated store.
- State.deleteDeadInstruction(Later);
- auto I = State.IOLs.find(Earlier->getParent());
+ // Remove killing store and remove any outstanding overlap
+ // intervals for the updated store.
+ State.deleteDeadInstruction(KillingSI);
+ auto I = State.IOLs.find(DeadSI->getParent());
if (I != State.IOLs.end())
- I->second.erase(Earlier);
+ I->second.erase(DeadSI);
break;
}
}
}
if (OR == OW_Complete) {
- LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI
- << "\n KILLER: " << *SI << '\n');
- State.deleteDeadInstruction(NI);
+ LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DeadI
+ << "\n KILLER: " << *KillingI << '\n');
+ State.deleteDeadInstruction(DeadI);
++NumFastStores;
MadeChange = true;
}
@@ -2028,10 +2111,11 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
}
// Check if the store is a no-op.
- if (!Shortend && isRemovable(SI) &&
- State.storeIsNoop(KillingDef, SILoc, SILocUnd)) {
- LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *SI << '\n');
- State.deleteDeadInstruction(SI);
+ if (!Shortend && isRemovable(KillingI) &&
+ State.storeIsNoop(KillingDef, KillingUndObj)) {
+ LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *KillingI
+ << '\n');
+ State.deleteDeadInstruction(KillingI);
NumRedundantStores++;
MadeChange = true;
continue;
@@ -2040,8 +2124,9 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
if (EnablePartialOverwriteTracking)
for (auto &KV : State.IOLs)
- MadeChange |= removePartiallyOverlappedStores(State.DL, KV.second, TLI);
+ MadeChange |= State.removePartiallyOverlappedStores(KV.second);
+ MadeChange |= State.eliminateRedundantStoresOfExistingValues();
MadeChange |= State.eliminateDeadWritesAtEndOfFunction();
return MadeChange;
}
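For readers following the renamed variables above, the OW_PartialEarlierWithFullLater path is easiest to see at the source level. The following is a hedged, standalone C++ sketch rather than part of the patch; the struct, field, and function names are illustrative.

#include <cstdint>

struct Pair { uint16_t Lo, Hi; };

void partialOverwrite(Pair *P) {
  *P = {0x1111, 0x2222}; // DeadSI: wide constant store dominating the next one
  P->Lo = 0x3333;        // KillingSI: narrower constant store fully inside DeadSI
  // With both stored values constant and DeadSI dominating KillingSI,
  // tryToMergePartialOverlappingStores folds the killing constant into the
  // dominating store (which becomes {0x3333, 0x2222}); KillingSI is then
  // deleted and Shortend is set so the no-op-store check is skipped.
}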
diff --git a/llvm/lib/Transforms/Scalar/DivRemPairs.cpp b/llvm/lib/Transforms/Scalar/DivRemPairs.cpp
index c77769368ede..66c9d9f0902a 100644
--- a/llvm/lib/Transforms/Scalar/DivRemPairs.cpp
+++ b/llvm/lib/Transforms/Scalar/DivRemPairs.cpp
@@ -272,9 +272,10 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
if (PredBB && IsSafeToHoist(RemInst, RemBB) &&
IsSafeToHoist(DivInst, DivBB) &&
- llvm::all_of(successors(PredBB), [&](BasicBlock *BB) {
- return BB == DivBB || BB == RemBB;
- })) {
+ all_of(successors(PredBB),
+ [&](BasicBlock *BB) { return BB == DivBB || BB == RemBB; }) &&
+ all_of(predecessors(DivBB),
+ [&](BasicBlock *BB) { return BB == RemBB || BB == PredBB; })) {
DivDominates = true;
DivInst->moveBefore(PredBB->getTerminator());
Changed = true;
diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index 978c6a77b8dc..90f71f7729a7 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -293,7 +293,7 @@ static unsigned getHashValueImpl(SimpleValue Val) {
// TODO: Extend this to handle intrinsics with >2 operands where the 1st
// 2 operands are commutative.
auto *II = dyn_cast<IntrinsicInst>(Inst);
- if (II && II->isCommutative() && II->getNumArgOperands() == 2) {
+ if (II && II->isCommutative() && II->arg_size() == 2) {
Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
if (LHS > RHS)
std::swap(LHS, RHS);
@@ -363,7 +363,7 @@ static bool isEqualImpl(SimpleValue LHS, SimpleValue RHS) {
auto *LII = dyn_cast<IntrinsicInst>(LHSI);
auto *RII = dyn_cast<IntrinsicInst>(RHSI);
if (LII && RII && LII->getIntrinsicID() == RII->getIntrinsicID() &&
- LII->isCommutative() && LII->getNumArgOperands() == 2) {
+ LII->isCommutative() && LII->arg_size() == 2) {
return LII->getArgOperand(0) == RII->getArgOperand(1) &&
LII->getArgOperand(1) == RII->getArgOperand(0);
}
@@ -1265,6 +1265,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
continue;
}
+ // Skip pseudoprobe intrinsics, for the same reason as assume intrinsics.
+ if (match(&Inst, m_Intrinsic<Intrinsic::pseudoprobe>())) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE skipping pseudoprobe: " << Inst << '\n');
+ continue;
+ }
+
// We can skip all invariant.start intrinsics since they only read memory,
// and we can forward values across it. For invariant starts without
// invariant ends, we can use the fact that the invariantness never ends to
@@ -1642,6 +1648,16 @@ PreservedAnalyses EarlyCSEPass::run(Function &F,
return PA;
}
+void EarlyCSEPass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<EarlyCSEPass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+ OS << "<";
+ if (UseMemorySSA)
+ OS << "memssa";
+ OS << ">";
+}
+
namespace {
/// A simple and fast domtree-based CSE pass.
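As a quick illustration of the new printPipeline hook above, here is a hedged, standalone sketch (not part of the commit) that constructs an EarlyCSE pass with MemorySSA enabled and prints its pipeline form. With the identity name mapping used here the expected output is "EarlyCSEPass<memssa>"; PassBuilder supplies a mapping that renders the class name as "early-cse".

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar/EarlyCSE.h"
using namespace llvm;

void printEarlyCSEConfig() {
  EarlyCSEPass CSE(/*UseMemorySSA=*/true);
  // Emits the pass name followed by "<memssa>" when MemorySSA is in use.
  CSE.printPipeline(outs(), [](StringRef Name) { return Name; });
}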
diff --git a/llvm/lib/Transforms/Scalar/Float2Int.cpp b/llvm/lib/Transforms/Scalar/Float2Int.cpp
index 8a5d4f568774..a98bb8358aef 100644
--- a/llvm/lib/Transforms/Scalar/Float2Int.cpp
+++ b/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -256,7 +256,7 @@ void Float2IntPass::walkForwards() {
Op = [](ArrayRef<ConstantRange> Ops) {
assert(Ops.size() == 1 && "FNeg is a unary operator!");
unsigned Size = Ops[0].getBitWidth();
- auto Zero = ConstantRange(APInt::getNullValue(Size));
+ auto Zero = ConstantRange(APInt::getZero(Size));
return Zero.sub(Ops[0]);
};
break;
@@ -372,7 +372,7 @@ bool Float2IntPass::validateAndTransform() {
// If it does, transformation would be illegal.
//
// Don't count the roots, as they terminate the graphs.
- if (Roots.count(I) == 0) {
+ if (!Roots.contains(I)) {
// Set the type of the conversion while we're here.
if (!ConvertedToTy)
ConvertedToTy = I->getType();
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 16368aec7c3f..00506fb86006 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -126,7 +126,7 @@ static cl::opt<uint32_t> MaxBBSpeculations(
"into) when deducing if a value is fully available or not in GVN "
"(default = 600)"));
-struct llvm::GVN::Expression {
+struct llvm::GVNPass::Expression {
uint32_t opcode;
bool commutative = false;
Type *type = nullptr;
@@ -155,17 +155,18 @@ struct llvm::GVN::Expression {
namespace llvm {
-template <> struct DenseMapInfo<GVN::Expression> {
- static inline GVN::Expression getEmptyKey() { return ~0U; }
- static inline GVN::Expression getTombstoneKey() { return ~1U; }
+template <> struct DenseMapInfo<GVNPass::Expression> {
+ static inline GVNPass::Expression getEmptyKey() { return ~0U; }
+ static inline GVNPass::Expression getTombstoneKey() { return ~1U; }
- static unsigned getHashValue(const GVN::Expression &e) {
+ static unsigned getHashValue(const GVNPass::Expression &e) {
using llvm::hash_value;
return static_cast<unsigned>(hash_value(e));
}
- static bool isEqual(const GVN::Expression &LHS, const GVN::Expression &RHS) {
+ static bool isEqual(const GVNPass::Expression &LHS,
+ const GVNPass::Expression &RHS) {
return LHS == RHS;
}
};
@@ -246,7 +247,7 @@ struct llvm::gvn::AvailableValue {
/// Emit code at the specified insertion point to adjust the value defined
/// here to the specified type. This handles various coercion cases.
Value *MaterializeAdjustedValue(LoadInst *Load, Instruction *InsertPt,
- GVN &gvn) const;
+ GVNPass &gvn) const;
};
/// Represents an AvailableValue which can be rematerialized at the end of
@@ -276,7 +277,7 @@ struct llvm::gvn::AvailableValueInBlock {
/// Emit code at the end of this block to adjust the value defined here to
/// the specified type. This handles various coercion cases.
- Value *MaterializeAdjustedValue(LoadInst *Load, GVN &gvn) const {
+ Value *MaterializeAdjustedValue(LoadInst *Load, GVNPass &gvn) const {
return AV.MaterializeAdjustedValue(Load, BB->getTerminator(), gvn);
}
};
@@ -285,7 +286,7 @@ struct llvm::gvn::AvailableValueInBlock {
// ValueTable Internal Functions
//===----------------------------------------------------------------------===//
-GVN::Expression GVN::ValueTable::createExpr(Instruction *I) {
+GVNPass::Expression GVNPass::ValueTable::createExpr(Instruction *I) {
Expression e;
e.type = I->getType();
e.opcode = I->getOpcode();
@@ -330,9 +331,8 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) {
return e;
}
-GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode,
- CmpInst::Predicate Predicate,
- Value *LHS, Value *RHS) {
+GVNPass::Expression GVNPass::ValueTable::createCmpExpr(
+ unsigned Opcode, CmpInst::Predicate Predicate, Value *LHS, Value *RHS) {
assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
"Not a comparison!");
Expression e;
@@ -350,7 +350,8 @@ GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode,
return e;
}
-GVN::Expression GVN::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) {
+GVNPass::Expression
+GVNPass::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) {
assert(EI && "Not an ExtractValueInst?");
Expression e;
e.type = EI->getType();
@@ -382,20 +383,21 @@ GVN::Expression GVN::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) {
// ValueTable External Functions
//===----------------------------------------------------------------------===//
-GVN::ValueTable::ValueTable() = default;
-GVN::ValueTable::ValueTable(const ValueTable &) = default;
-GVN::ValueTable::ValueTable(ValueTable &&) = default;
-GVN::ValueTable::~ValueTable() = default;
-GVN::ValueTable &GVN::ValueTable::operator=(const GVN::ValueTable &Arg) = default;
+GVNPass::ValueTable::ValueTable() = default;
+GVNPass::ValueTable::ValueTable(const ValueTable &) = default;
+GVNPass::ValueTable::ValueTable(ValueTable &&) = default;
+GVNPass::ValueTable::~ValueTable() = default;
+GVNPass::ValueTable &
+GVNPass::ValueTable::operator=(const GVNPass::ValueTable &Arg) = default;
/// add - Insert a value into the table with a specified value number.
-void GVN::ValueTable::add(Value *V, uint32_t num) {
+void GVNPass::ValueTable::add(Value *V, uint32_t num) {
valueNumbering.insert(std::make_pair(V, num));
if (PHINode *PN = dyn_cast<PHINode>(V))
NumberingPhi[num] = PN;
}
-uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) {
+uint32_t GVNPass::ValueTable::lookupOrAddCall(CallInst *C) {
if (AA->doesNotAccessMemory(C)) {
Expression exp = createExpr(C);
uint32_t e = assignExpNewValueNum(exp).first;
@@ -421,13 +423,12 @@ uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) {
// a normal load or store instruction.
CallInst *local_cdep = dyn_cast<CallInst>(local_dep.getInst());
- if (!local_cdep ||
- local_cdep->getNumArgOperands() != C->getNumArgOperands()) {
+ if (!local_cdep || local_cdep->arg_size() != C->arg_size()) {
valueNumbering[C] = nextValueNumber;
return nextValueNumber++;
}
- for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
+ for (unsigned i = 0, e = C->arg_size(); i < e; ++i) {
uint32_t c_vn = lookupOrAdd(C->getArgOperand(i));
uint32_t cd_vn = lookupOrAdd(local_cdep->getArgOperand(i));
if (c_vn != cd_vn) {
@@ -477,11 +478,11 @@ uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) {
return nextValueNumber++;
}
- if (cdep->getNumArgOperands() != C->getNumArgOperands()) {
+ if (cdep->arg_size() != C->arg_size()) {
valueNumbering[C] = nextValueNumber;
return nextValueNumber++;
}
- for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
+ for (unsigned i = 0, e = C->arg_size(); i < e; ++i) {
uint32_t c_vn = lookupOrAdd(C->getArgOperand(i));
uint32_t cd_vn = lookupOrAdd(cdep->getArgOperand(i));
if (c_vn != cd_vn) {
@@ -500,11 +501,13 @@ uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) {
}
/// Returns true if a value number exists for the specified value.
-bool GVN::ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; }
+bool GVNPass::ValueTable::exists(Value *V) const {
+ return valueNumbering.count(V) != 0;
+}
/// lookup_or_add - Returns the value number for the specified value, assigning
/// it a new number if it did not have one before.
-uint32_t GVN::ValueTable::lookupOrAdd(Value *V) {
+uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) {
DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
if (VI != valueNumbering.end())
return VI->second;
@@ -581,7 +584,7 @@ uint32_t GVN::ValueTable::lookupOrAdd(Value *V) {
/// Returns the value number of the specified value. Fails if
/// the value has not yet been numbered.
-uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const {
+uint32_t GVNPass::ValueTable::lookup(Value *V, bool Verify) const {
DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V);
if (Verify) {
assert(VI != valueNumbering.end() && "Value not numbered?");
@@ -594,15 +597,15 @@ uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const {
/// assigning it a new number if it did not have one before. Useful when
/// we deduced the result of a comparison, but don't immediately have an
/// instruction realizing that comparison to hand.
-uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode,
- CmpInst::Predicate Predicate,
- Value *LHS, Value *RHS) {
+uint32_t GVNPass::ValueTable::lookupOrAddCmp(unsigned Opcode,
+ CmpInst::Predicate Predicate,
+ Value *LHS, Value *RHS) {
Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS);
return assignExpNewValueNum(exp).first;
}
/// Remove all entries from the ValueTable.
-void GVN::ValueTable::clear() {
+void GVNPass::ValueTable::clear() {
valueNumbering.clear();
expressionNumbering.clear();
NumberingPhi.clear();
@@ -614,7 +617,7 @@ void GVN::ValueTable::clear() {
}
/// Remove a value from the value numbering.
-void GVN::ValueTable::erase(Value *V) {
+void GVNPass::ValueTable::erase(Value *V) {
uint32_t Num = valueNumbering.lookup(V);
valueNumbering.erase(V);
// If V is PHINode, V <--> value number is an one-to-one mapping.
@@ -624,7 +627,7 @@ void GVN::ValueTable::erase(Value *V) {
/// verifyRemoved - Verify that the value is removed from all internal data
/// structures.
-void GVN::ValueTable::verifyRemoved(const Value *V) const {
+void GVNPass::ValueTable::verifyRemoved(const Value *V) const {
for (DenseMap<Value*, uint32_t>::const_iterator
I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) {
assert(I->first != V && "Inst still occurs in value numbering map!");
@@ -635,28 +638,28 @@ void GVN::ValueTable::verifyRemoved(const Value *V) const {
// GVN Pass
//===----------------------------------------------------------------------===//
-bool GVN::isPREEnabled() const {
+bool GVNPass::isPREEnabled() const {
return Options.AllowPRE.getValueOr(GVNEnablePRE);
}
-bool GVN::isLoadPREEnabled() const {
+bool GVNPass::isLoadPREEnabled() const {
return Options.AllowLoadPRE.getValueOr(GVNEnableLoadPRE);
}
-bool GVN::isLoadInLoopPREEnabled() const {
+bool GVNPass::isLoadInLoopPREEnabled() const {
return Options.AllowLoadInLoopPRE.getValueOr(GVNEnableLoadInLoopPRE);
}
-bool GVN::isLoadPRESplitBackedgeEnabled() const {
+bool GVNPass::isLoadPRESplitBackedgeEnabled() const {
return Options.AllowLoadPRESplitBackedge.getValueOr(
GVNEnableSplitBackedgeInLoadPRE);
}
-bool GVN::isMemDepEnabled() const {
+bool GVNPass::isMemDepEnabled() const {
return Options.AllowMemDep.getValueOr(GVNEnableMemDep);
}
-PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {
+PreservedAnalyses GVNPass::run(Function &F, FunctionAnalysisManager &AM) {
// FIXME: The order of evaluation of these 'getResult' calls is very
// significant! Re-ordering these variables will cause GVN when run alone to
// be less effective! We should fix memdep and basic-aa to not exhibit this
@@ -684,8 +687,26 @@ PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {
return PA;
}
+void GVNPass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<GVNPass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+
+ OS << "<";
+ if (Options.AllowPRE != None)
+ OS << (Options.AllowPRE.getValue() ? "" : "no-") << "pre;";
+ if (Options.AllowLoadPRE != None)
+ OS << (Options.AllowLoadPRE.getValue() ? "" : "no-") << "load-pre;";
+ if (Options.AllowLoadPRESplitBackedge != None)
+ OS << (Options.AllowLoadPRESplitBackedge.getValue() ? "" : "no-")
+ << "split-backedge-load-pre;";
+ if (Options.AllowMemDep != None)
+ OS << (Options.AllowMemDep.getValue() ? "" : "no-") << "memdep";
+ OS << ">";
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) const {
+LLVM_DUMP_METHOD void GVNPass::dump(DenseMap<uint32_t, Value *> &d) const {
errs() << "{\n";
for (auto &I : d) {
errs() << I.first << "\n";
@@ -835,7 +856,7 @@ static bool IsValueFullyAvailableInBlock(
static Value *
ConstructSSAForLoadSet(LoadInst *Load,
SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock,
- GVN &gvn) {
+ GVNPass &gvn) {
// Check for the fully redundant, dominating load case. In this case, we can
// just use the dominating value directly.
if (ValuesPerBlock.size() == 1 &&
@@ -878,7 +899,7 @@ ConstructSSAForLoadSet(LoadInst *Load,
Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load,
Instruction *InsertPt,
- GVN &gvn) const {
+ GVNPass &gvn) const {
Value *Res;
Type *LoadTy = Load->getType();
const DataLayout &DL = Load->getModule()->getDataLayout();
@@ -1002,8 +1023,8 @@ static void reportMayClobberedLoad(LoadInst *Load, MemDepResult DepInfo,
ORE->emit(R);
}
-bool GVN::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
- Value *Address, AvailableValue &Res) {
+bool GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
+ Value *Address, AvailableValue &Res) {
assert((DepInfo.isDef() || DepInfo.isClobber()) &&
"expected a local dependence");
assert(Load->isUnordered() && "rules below are incorrect for ordered access");
@@ -1137,9 +1158,9 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
return false;
}
-void GVN::AnalyzeLoadAvailability(LoadInst *Load, LoadDepVect &Deps,
- AvailValInBlkVect &ValuesPerBlock,
- UnavailBlkVect &UnavailableBlocks) {
+void GVNPass::AnalyzeLoadAvailability(LoadInst *Load, LoadDepVect &Deps,
+ AvailValInBlkVect &ValuesPerBlock,
+ UnavailBlkVect &UnavailableBlocks) {
// Filter out useless results (non-locals, etc). Keep track of the blocks
// where we have a value available in repl, also keep track of whether we see
// dependencies that produce an unknown value for the load (such as a call
@@ -1182,7 +1203,7 @@ void GVN::AnalyzeLoadAvailability(LoadInst *Load, LoadDepVect &Deps,
"post condition violation");
}
-void GVN::eliminatePartiallyRedundantLoad(
+void GVNPass::eliminatePartiallyRedundantLoad(
LoadInst *Load, AvailValInBlkVect &ValuesPerBlock,
MapVector<BasicBlock *, Value *> &AvailableLoads) {
for (const auto &AvailableLoad : AvailableLoads) {
@@ -1212,8 +1233,7 @@ void GVN::eliminatePartiallyRedundantLoad(
}
// Transfer the old load's AA tags to the new load.
- AAMDNodes Tags;
- Load->getAAMetadata(Tags);
+ AAMDNodes Tags = Load->getAAMetadata();
if (Tags)
NewLoad->setAAMetadata(Tags);
@@ -1257,8 +1277,8 @@ void GVN::eliminatePartiallyRedundantLoad(
});
}
-bool GVN::PerformLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock,
- UnavailBlkVect &UnavailableBlocks) {
+bool GVNPass::PerformLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock,
+ UnavailBlkVect &UnavailableBlocks) {
// Okay, we have *some* definitions of the value. This means that the value
// is available in some of our (transitive) predecessors. Lets think about
// doing PRE of this load. This will involve inserting a new load into the
@@ -1498,8 +1518,9 @@ bool GVN::PerformLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock,
return true;
}
-bool GVN::performLoopLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock,
- UnavailBlkVect &UnavailableBlocks) {
+bool GVNPass::performLoopLoadPRE(LoadInst *Load,
+ AvailValInBlkVect &ValuesPerBlock,
+ UnavailBlkVect &UnavailableBlocks) {
if (!LI)
return false;
@@ -1590,7 +1611,7 @@ static void reportLoadElim(LoadInst *Load, Value *AvailableValue,
/// Attempt to eliminate a load whose dependencies are
/// non-local by performing PHI construction.
-bool GVN::processNonLocalLoad(LoadInst *Load) {
+bool GVNPass::processNonLocalLoad(LoadInst *Load) {
// non-local speculations are not allowed under asan.
if (Load->getParent()->getParent()->hasFnAttribute(
Attribute::SanitizeAddress) ||
@@ -1622,10 +1643,8 @@ bool GVN::processNonLocalLoad(LoadInst *Load) {
// If this load follows a GEP, see if we can PRE the indices before analyzing.
if (GetElementPtrInst *GEP =
dyn_cast<GetElementPtrInst>(Load->getOperand(0))) {
- for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(),
- OE = GEP->idx_end();
- OI != OE; ++OI)
- if (Instruction *I = dyn_cast<Instruction>(OI->get()))
+ for (Use &U : GEP->indices())
+ if (Instruction *I = dyn_cast<Instruction>(U.get()))
Changed |= performScalarPRE(I);
}
@@ -1673,8 +1692,11 @@ bool GVN::processNonLocalLoad(LoadInst *Load) {
if (!isLoadInLoopPREEnabled() && LI && LI->getLoopFor(Load->getParent()))
return Changed;
- return Changed || PerformLoadPRE(Load, ValuesPerBlock, UnavailableBlocks) ||
- performLoopLoadPRE(Load, ValuesPerBlock, UnavailableBlocks);
+ if (performLoopLoadPRE(Load, ValuesPerBlock, UnavailableBlocks) ||
+ PerformLoadPRE(Load, ValuesPerBlock, UnavailableBlocks))
+ return true;
+
+ return Changed;
}
static bool impliesEquivalanceIfTrue(CmpInst* Cmp) {
@@ -1738,7 +1760,7 @@ static bool hasUsersIn(Value *V, BasicBlock *BB) {
return false;
}
-bool GVN::processAssumeIntrinsic(AssumeInst *IntrinsicI) {
+bool GVNPass::processAssumeIntrinsic(AssumeInst *IntrinsicI) {
Value *V = IntrinsicI->getArgOperand(0);
if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) {
@@ -1882,7 +1904,7 @@ static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
/// Attempt to eliminate a load, first by eliminating it
/// locally, and then attempting non-local elimination if that fails.
-bool GVN::processLoad(LoadInst *L) {
+bool GVNPass::processLoad(LoadInst *L) {
if (!MD)
return false;
@@ -1936,7 +1958,7 @@ bool GVN::processLoad(LoadInst *L) {
/// Return a pair the first field showing the value number of \p Exp and the
/// second field showing whether it is a value number newly created.
std::pair<uint32_t, bool>
-GVN::ValueTable::assignExpNewValueNum(Expression &Exp) {
+GVNPass::ValueTable::assignExpNewValueNum(Expression &Exp) {
uint32_t &e = expressionNumbering[Exp];
bool CreateNewValNum = !e;
if (CreateNewValNum) {
@@ -1951,8 +1973,8 @@ GVN::ValueTable::assignExpNewValueNum(Expression &Exp) {
/// Return whether all the values related with the same \p num are
/// defined in \p BB.
-bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB,
- GVN &Gvn) {
+bool GVNPass::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB,
+ GVNPass &Gvn) {
LeaderTableEntry *Vals = &Gvn.LeaderTable[Num];
while (Vals && Vals->BB == BB)
Vals = Vals->Next;
@@ -1960,9 +1982,9 @@ bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB,
}
/// Wrap phiTranslateImpl to provide caching functionality.
-uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred,
- const BasicBlock *PhiBlock, uint32_t Num,
- GVN &Gvn) {
+uint32_t GVNPass::ValueTable::phiTranslate(const BasicBlock *Pred,
+ const BasicBlock *PhiBlock,
+ uint32_t Num, GVNPass &Gvn) {
auto FindRes = PhiTranslateTable.find({Num, Pred});
if (FindRes != PhiTranslateTable.end())
return FindRes->second;
@@ -1973,9 +1995,10 @@ uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred,
// Return true if the value number \p Num and NewNum have equal value.
// Return false if the result is unknown.
-bool GVN::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum,
- const BasicBlock *Pred,
- const BasicBlock *PhiBlock, GVN &Gvn) {
+bool GVNPass::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum,
+ const BasicBlock *Pred,
+ const BasicBlock *PhiBlock,
+ GVNPass &Gvn) {
CallInst *Call = nullptr;
LeaderTableEntry *Vals = &Gvn.LeaderTable[Num];
while (Vals) {
@@ -2008,9 +2031,9 @@ bool GVN::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum,
/// Translate value number \p Num using phis, so that it has the values of
/// the phis in BB.
-uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred,
- const BasicBlock *PhiBlock,
- uint32_t Num, GVN &Gvn) {
+uint32_t GVNPass::ValueTable::phiTranslateImpl(const BasicBlock *Pred,
+ const BasicBlock *PhiBlock,
+ uint32_t Num, GVNPass &Gvn) {
if (PHINode *PN = NumberingPhi[Num]) {
for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
if (PN->getParent() == PhiBlock && PN->getIncomingBlock(i) == Pred)
@@ -2063,8 +2086,8 @@ uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred,
/// Erase stale entry from phiTranslate cache so phiTranslate can be computed
/// again.
-void GVN::ValueTable::eraseTranslateCacheEntry(uint32_t Num,
- const BasicBlock &CurrBlock) {
+void GVNPass::ValueTable::eraseTranslateCacheEntry(
+ uint32_t Num, const BasicBlock &CurrBlock) {
for (const BasicBlock *Pred : predecessors(&CurrBlock))
PhiTranslateTable.erase({Num, Pred});
}
@@ -2074,7 +2097,7 @@ void GVN::ValueTable::eraseTranslateCacheEntry(uint32_t Num,
// and then scan the list to find one whose block dominates the block in
// question. This is fast because dominator tree queries consist of only
// a few comparisons of DFS numbers.
-Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) {
+Value *GVNPass::findLeader(const BasicBlock *BB, uint32_t num) {
LeaderTableEntry Vals = LeaderTable[num];
if (!Vals.Val) return nullptr;
@@ -2113,7 +2136,7 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E,
return Pred != nullptr;
}
-void GVN::assignBlockRPONumber(Function &F) {
+void GVNPass::assignBlockRPONumber(Function &F) {
BlockRPONumber.clear();
uint32_t NextBlockNumber = 1;
ReversePostOrderTraversal<Function *> RPOT(&F);
@@ -2122,7 +2145,7 @@ void GVN::assignBlockRPONumber(Function &F) {
InvalidBlockRPONumbers = false;
}
-bool GVN::replaceOperandsForInBlockEquality(Instruction *Instr) const {
+bool GVNPass::replaceOperandsForInBlockEquality(Instruction *Instr) const {
bool Changed = false;
for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) {
Value *Operand = Instr->getOperand(OpNum);
@@ -2142,8 +2165,9 @@ bool GVN::replaceOperandsForInBlockEquality(Instruction *Instr) const {
/// 'RHS' everywhere in the scope. Returns whether a change was made.
/// If DominatesByEdge is false, then it means that we will propagate the RHS
/// value starting from the end of Root.Start.
-bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
- bool DominatesByEdge) {
+bool GVNPass::propagateEquality(Value *LHS, Value *RHS,
+ const BasicBlockEdge &Root,
+ bool DominatesByEdge) {
SmallVector<std::pair<Value*, Value*>, 4> Worklist;
Worklist.push_back(std::make_pair(LHS, RHS));
bool Changed = false;
@@ -2291,7 +2315,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
/// When calculating availability, handle an instruction
/// by inserting it into the appropriate sets
-bool GVN::processInstruction(Instruction *I) {
+bool GVNPass::processInstruction(Instruction *I) {
// Ignore dbg info intrinsics.
if (isa<DbgInfoIntrinsic>(I))
return false;
@@ -2432,10 +2456,10 @@ bool GVN::processInstruction(Instruction *I) {
}
/// runOnFunction - This is the main transformation entry point for a function.
-bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
- const TargetLibraryInfo &RunTLI, AAResults &RunAA,
- MemoryDependenceResults *RunMD, LoopInfo *LI,
- OptimizationRemarkEmitter *RunORE, MemorySSA *MSSA) {
+bool GVNPass::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
+ const TargetLibraryInfo &RunTLI, AAResults &RunAA,
+ MemoryDependenceResults *RunMD, LoopInfo *LI,
+ OptimizationRemarkEmitter *RunORE, MemorySSA *MSSA) {
AC = &RunAC;
DT = &RunDT;
VN.setDomTree(DT);
@@ -2457,10 +2481,8 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
// Merge unconditional branches, allowing PRE to catch more
// optimization opportunities.
- for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) {
- BasicBlock *BB = &*FI++;
-
- bool removedBlock = MergeBlockIntoPredecessor(BB, &DTU, LI, MSSAU, MD);
+ for (BasicBlock &BB : llvm::make_early_inc_range(F)) {
+ bool removedBlock = MergeBlockIntoPredecessor(&BB, &DTU, LI, MSSAU, MD);
if (removedBlock)
++NumGVNBlocks;
@@ -2502,7 +2524,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
return Changed;
}
-bool GVN::processBlock(BasicBlock *BB) {
+bool GVNPass::processBlock(BasicBlock *BB) {
// FIXME: Kill off InstrsToErase by doing erasing eagerly in a helper function
// (and incrementing BI before processing an instruction).
assert(InstrsToErase.empty() &&
@@ -2563,8 +2585,8 @@ bool GVN::processBlock(BasicBlock *BB) {
}
// Instantiate an expression in a predecessor that lacked it.
-bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
- BasicBlock *Curr, unsigned int ValNo) {
+bool GVNPass::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
+ BasicBlock *Curr, unsigned int ValNo) {
// Because we are going top-down through the block, all value numbers
// will be available in the predecessor by the time we need them. Any
// that weren't originally present will have been instantiated earlier
@@ -2612,7 +2634,7 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
return true;
}
-bool GVN::performScalarPRE(Instruction *CurInst) {
+bool GVNPass::performScalarPRE(Instruction *CurInst) {
if (isa<AllocaInst>(CurInst) || CurInst->isTerminator() ||
isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() ||
CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
@@ -2797,7 +2819,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
/// Perform a purely local form of PRE that looks for diamond
/// control flow patterns and attempts to perform simple PRE at the join point.
-bool GVN::performPRE(Function &F) {
+bool GVNPass::performPRE(Function &F) {
bool Changed = false;
for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) {
// Nothing to PRE in the entry block.
@@ -2824,7 +2846,7 @@ bool GVN::performPRE(Function &F) {
/// Split the critical edge connecting the given two blocks, and return
/// the block inserted to the critical edge.
-BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
+BasicBlock *GVNPass::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
// GVN does not require loop-simplify, do not try to preserve it if it is not
// possible.
BasicBlock *BB = SplitCriticalEdge(
@@ -2840,7 +2862,7 @@ BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
/// Split critical edges found during the previous
/// iteration that may enable further optimization.
-bool GVN::splitCriticalEdges() {
+bool GVNPass::splitCriticalEdges() {
if (toSplit.empty())
return false;
@@ -2860,7 +2882,7 @@ bool GVN::splitCriticalEdges() {
}
/// Executes one iteration of GVN
-bool GVN::iterateOnFunction(Function &F) {
+bool GVNPass::iterateOnFunction(Function &F) {
cleanupGlobalSets();
// Top-down walk of the dominator tree
@@ -2876,7 +2898,7 @@ bool GVN::iterateOnFunction(Function &F) {
return Changed;
}
-void GVN::cleanupGlobalSets() {
+void GVNPass::cleanupGlobalSets() {
VN.clear();
LeaderTable.clear();
BlockRPONumber.clear();
@@ -2887,7 +2909,7 @@ void GVN::cleanupGlobalSets() {
/// Verify that the specified instruction does not occur in our
/// internal data structures.
-void GVN::verifyRemoved(const Instruction *Inst) const {
+void GVNPass::verifyRemoved(const Instruction *Inst) const {
VN.verifyRemoved(Inst);
// Walk through the value number scope to make sure the instruction isn't
@@ -2907,7 +2929,7 @@ void GVN::verifyRemoved(const Instruction *Inst) const {
/// function is to add all these blocks to "DeadBlocks". For the dead blocks'
/// live successors, update their phi nodes by replacing the operands
/// corresponding to dead blocks with UndefVal.
-void GVN::addDeadBlock(BasicBlock *BB) {
+void GVNPass::addDeadBlock(BasicBlock *BB) {
SmallVector<BasicBlock *, 4> NewDead;
SmallSetVector<BasicBlock *, 4> DF;
@@ -2995,7 +3017,7 @@ void GVN::addDeadBlock(BasicBlock *BB) {
// dead blocks with "UndefVal" in an hope these PHIs will optimized away.
//
// Return true iff *NEW* dead code are found.
-bool GVN::processFoldableCondBr(BranchInst *BI) {
+bool GVNPass::processFoldableCondBr(BranchInst *BI) {
if (!BI || BI->isUnconditional())
return false;
@@ -3023,7 +3045,7 @@ bool GVN::processFoldableCondBr(BranchInst *BI) {
// associated val-num. As it normally has far more live instructions than dead
// instructions, it makes more sense just to "fabricate" a val-number for the
// dead code than checking if instruction involved is dead or not.
-void GVN::assignValNumForDeadCode() {
+void GVNPass::assignValNumForDeadCode() {
for (BasicBlock *BB : DeadBlocks) {
for (Instruction &Inst : *BB) {
unsigned ValNum = VN.lookupOrAdd(&Inst);
@@ -3078,7 +3100,7 @@ public:
}
private:
- GVN Impl;
+ GVNPass Impl;
};
char GVNLegacyPass::ID = 0;
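The GVN changes both rename the pass class to GVNPass and add the same kind of printPipeline hook. A hedged, standalone sketch (not part of the commit): GVNOptions and its setters come from llvm/Transforms/Scalar/GVN.h, and the expected output follows from the printing code in the hunk above, where only explicitly-set options are emitted and disabled ones get a "no-" prefix.

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar/GVN.h"
using namespace llvm;

void printGVNConfig() {
  GVNOptions Opts;
  Opts.setPRE(false).setMemDep(true); // leave the load-PRE options unset
  // With the identity name mapping this is expected to print
  // "GVNPass<no-pre;memdep>"; PassBuilder maps the class name to "gvn".
  GVNPass(Opts).printPipeline(outs(), [](StringRef Name) { return Name; });
}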
diff --git a/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/llvm/lib/Transforms/Scalar/GVNHoist.cpp
index 790d71992da4..fdc3afd9348a 100644
--- a/llvm/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/llvm/lib/Transforms/Scalar/GVNHoist.cpp
@@ -169,7 +169,7 @@ class InsnInfo {
public:
// Inserts I and its value number in VNtoScalars.
- void insert(Instruction *I, GVN::ValueTable &VN) {
+ void insert(Instruction *I, GVNPass::ValueTable &VN) {
// Scalar instruction.
unsigned V = VN.lookupOrAdd(I);
VNtoScalars[{V, InvalidVN}].push_back(I);
@@ -184,7 +184,7 @@ class LoadInfo {
public:
// Insert Load and the value number of its memory address in VNtoLoads.
- void insert(LoadInst *Load, GVN::ValueTable &VN) {
+ void insert(LoadInst *Load, GVNPass::ValueTable &VN) {
if (Load->isSimple()) {
unsigned V = VN.lookupOrAdd(Load->getPointerOperand());
VNtoLoads[{V, InvalidVN}].push_back(Load);
@@ -201,7 +201,7 @@ class StoreInfo {
public:
// Insert the Store and a hash number of the store address and the stored
// value in VNtoStores.
- void insert(StoreInst *Store, GVN::ValueTable &VN) {
+ void insert(StoreInst *Store, GVNPass::ValueTable &VN) {
if (!Store->isSimple())
return;
// Hash the store address and the stored value.
@@ -221,7 +221,7 @@ class CallInfo {
public:
// Insert Call and its value numbering in one of the VNtoCalls* containers.
- void insert(CallInst *Call, GVN::ValueTable &VN) {
+ void insert(CallInst *Call, GVNPass::ValueTable &VN) {
// A call that doesNotAccessMemory is handled as a Scalar,
// onlyReadsMemory will be handled as a Load instruction,
// all other calls will be handled as stores.
@@ -274,7 +274,7 @@ public:
unsigned int rank(const Value *V) const;
private:
- GVN::ValueTable VN;
+ GVNPass::ValueTable VN;
DominatorTree *DT;
PostDominatorTree *PDT;
AliasAnalysis *AA;
@@ -377,12 +377,12 @@ private:
if (!Root)
return;
// Depth first walk on PDom tree to fill the CHIargs at each PDF.
- RenameStackType RenameStack;
for (auto Node : depth_first(Root)) {
BasicBlock *BB = Node->getBlock();
if (!BB)
continue;
+ RenameStackType RenameStack;
// Collect all values in BB and push to stack.
fillRenameStack(BB, ValueBBs, RenameStack);
@@ -827,6 +827,8 @@ void GVNHoist::fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs,
auto it1 = ValueBBs.find(BB);
if (it1 != ValueBBs.end()) {
// Iterate in reverse order to keep lower ranked values on the top.
+ LLVM_DEBUG(dbgs() << "\nVisiting: " << BB->getName()
+ << " for pushing instructions on stack";);
for (std::pair<VNType, Instruction *> &VI : reverse(it1->second)) {
// Get the value of instruction I
LLVM_DEBUG(dbgs() << "\nPushing on stack: " << *VI.second);
diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
index 61eb4ce0ed46..82b81003ef21 100644
--- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -46,6 +46,7 @@
#include "llvm/Analysis/GuardUtils.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/ConstantRange.h"
@@ -105,8 +106,10 @@ static void setCondition(Instruction *I, Value *NewCond) {
}
// Eliminates the guard instruction properly.
-static void eliminateGuard(Instruction *GuardInst) {
+static void eliminateGuard(Instruction *GuardInst, MemorySSAUpdater *MSSAU) {
GuardInst->eraseFromParent();
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(GuardInst);
++GuardsEliminated;
}
@@ -114,6 +117,7 @@ class GuardWideningImpl {
DominatorTree &DT;
PostDominatorTree *PDT;
LoopInfo &LI;
+ MemorySSAUpdater *MSSAU;
/// Together, these describe the region of interest. This might be all of
/// the blocks within a function, or only a given loop's blocks and preheader.
@@ -269,12 +273,12 @@ class GuardWideningImpl {
}
public:
-
explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree *PDT,
- LoopInfo &LI, DomTreeNode *Root,
+ LoopInfo &LI, MemorySSAUpdater *MSSAU,
+ DomTreeNode *Root,
std::function<bool(BasicBlock*)> BlockFilter)
- : DT(DT), PDT(PDT), LI(LI), Root(Root), BlockFilter(BlockFilter)
- {}
+ : DT(DT), PDT(PDT), LI(LI), MSSAU(MSSAU), Root(Root),
+ BlockFilter(BlockFilter) {}
/// The entry point for this pass.
bool run();
@@ -313,7 +317,7 @@ bool GuardWideningImpl::run() {
if (!WidenedGuards.count(I)) {
assert(isa<ConstantInt>(getCondition(I)) && "Should be!");
if (isSupportedGuardInstruction(I))
- eliminateGuard(I);
+ eliminateGuard(I, MSSAU);
else {
assert(isa<BranchInst>(I) &&
"Eliminated something other than guard or branch?");
@@ -514,27 +518,20 @@ bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1,
ConstantRange CR1 =
ConstantRange::makeExactICmpRegion(Pred1, RHS1->getValue());
- // SubsetIntersect is a subset of the actual mathematical intersection of
- // CR0 and CR1, while SupersetIntersect is a superset of the actual
- // mathematical intersection. If these two ConstantRanges are equal, then
- // we know we were able to represent the actual mathematical intersection
- // of CR0 and CR1, and can use the same to generate an icmp instruction.
- //
// Given what we're doing here and the semantics of guards, it would
- // actually be correct to just use SubsetIntersect, but that may be too
+ // be correct to use a subset intersection, but that may be too
// aggressive in cases we care about.
- auto SubsetIntersect = CR0.inverse().unionWith(CR1.inverse()).inverse();
- auto SupersetIntersect = CR0.intersectWith(CR1);
-
- APInt NewRHSAP;
- CmpInst::Predicate Pred;
- if (SubsetIntersect == SupersetIntersect &&
- SubsetIntersect.getEquivalentICmp(Pred, NewRHSAP)) {
- if (InsertPt) {
- ConstantInt *NewRHS = ConstantInt::get(Cond0->getContext(), NewRHSAP);
- Result = new ICmpInst(InsertPt, Pred, LHS, NewRHS, "wide.chk");
+ if (Optional<ConstantRange> Intersect = CR0.exactIntersectWith(CR1)) {
+ APInt NewRHSAP;
+ CmpInst::Predicate Pred;
+ if (Intersect->getEquivalentICmp(Pred, NewRHSAP)) {
+ if (InsertPt) {
+ ConstantInt *NewRHS =
+ ConstantInt::get(Cond0->getContext(), NewRHSAP);
+ Result = new ICmpInst(InsertPt, Pred, LHS, NewRHS, "wide.chk");
+ }
+ return true;
}
- return true;
}
}
}
@@ -766,12 +763,18 @@ PreservedAnalyses GuardWideningPass::run(Function &F,
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &LI = AM.getResult<LoopAnalysis>(F);
auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
- if (!GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(),
- [](BasicBlock*) { return true; } ).run())
+ auto *MSSAA = AM.getCachedResult<MemorySSAAnalysis>(F);
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ if (MSSAA)
+ MSSAU = std::make_unique<MemorySSAUpdater>(&MSSAA->getMSSA());
+ if (!GuardWideningImpl(DT, &PDT, LI, MSSAU ? MSSAU.get() : nullptr,
+ DT.getRootNode(), [](BasicBlock *) { return true; })
+ .run())
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
+ PA.preserve<MemorySSAAnalysis>();
return PA;
}
@@ -784,11 +787,17 @@ PreservedAnalyses GuardWideningPass::run(Loop &L, LoopAnalysisManager &AM,
auto BlockFilter = [&](BasicBlock *BB) {
return BB == RootBB || L.contains(BB);
};
- if (!GuardWideningImpl(AR.DT, nullptr, AR.LI, AR.DT.getNode(RootBB),
- BlockFilter).run())
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ if (AR.MSSA)
+ MSSAU = std::make_unique<MemorySSAUpdater>(AR.MSSA);
+ if (!GuardWideningImpl(AR.DT, nullptr, AR.LI, MSSAU ? MSSAU.get() : nullptr,
+ AR.DT.getNode(RootBB), BlockFilter).run())
return PreservedAnalyses::all();
- return getLoopPassPreservedAnalyses();
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
}
namespace {
@@ -805,8 +814,14 @@ struct GuardWideningLegacyPass : public FunctionPass {
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
- return GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(),
- [](BasicBlock*) { return true; } ).run();
+ auto *MSSAWP = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ if (MSSAWP)
+ MSSAU = std::make_unique<MemorySSAUpdater>(&MSSAWP->getMSSA());
+ return GuardWideningImpl(DT, &PDT, LI, MSSAU ? MSSAU.get() : nullptr,
+ DT.getRootNode(),
+ [](BasicBlock *) { return true; })
+ .run();
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -814,6 +829,7 @@ struct GuardWideningLegacyPass : public FunctionPass {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<PostDominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
}
};
@@ -833,13 +849,18 @@ struct LoopGuardWideningLegacyPass : public LoopPass {
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
+ auto *MSSAWP = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ if (MSSAWP)
+ MSSAU = std::make_unique<MemorySSAUpdater>(&MSSAWP->getMSSA());
+
BasicBlock *RootBB = L->getLoopPredecessor();
if (!RootBB)
RootBB = L->getHeader();
auto BlockFilter = [&](BasicBlock *BB) {
return BB == RootBB || L->contains(BB);
};
- return GuardWideningImpl(DT, PDT, LI,
+ return GuardWideningImpl(DT, PDT, LI, MSSAU ? MSSAU.get() : nullptr,
DT.getNode(RootBB), BlockFilter).run();
}
@@ -847,6 +868,7 @@ struct LoopGuardWideningLegacyPass : public LoopPass {
AU.setPreservesCFG();
getLoopAnalysisUsage(AU);
AU.addPreserved<PostDominatorTreeWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
}
};
}
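The rewritten widenCondCommon block above now leans on ConstantRange::exactIntersectWith. Here is a hedged, standalone sketch of that helper in isolation (not part of the patch; the wrapper function combineICmpRegions is illustrative): two icmp regions over the same bit width combine into one widened check only when their mathematical intersection is exactly representable as a single ConstantRange.

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/Optional.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Returns true and fills NewPred/NewRHS when (x Pred0 RHS0) && (x Pred1 RHS1)
// collapses to a single (x NewPred NewRHS). RHS0 and RHS1 must have the same
// bit width.
bool combineICmpRegions(CmpInst::Predicate Pred0, const APInt &RHS0,
                        CmpInst::Predicate Pred1, const APInt &RHS1,
                        CmpInst::Predicate &NewPred, APInt &NewRHS) {
  ConstantRange CR0 = ConstantRange::makeExactICmpRegion(Pred0, RHS0);
  ConstantRange CR1 = ConstantRange::makeExactICmpRegion(Pred1, RHS1);
  // None means the exact intersection is not representable as one range.
  if (Optional<ConstantRange> Exact = CR0.exactIntersectWith(CR1))
    return Exact->getEquivalentICmp(NewPred, NewRHS);
  return false;
}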
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 9ee2a2d0bf08..ae2fe2767074 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -89,6 +89,7 @@
#include <utility>
using namespace llvm;
+using namespace PatternMatch;
#define DEBUG_TYPE "indvars"
@@ -155,6 +156,10 @@ class IndVarSimplify {
bool rewriteNonIntegerIVs(Loop *L);
bool simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI);
+ /// Try to improve our exit conditions by converting condition from signed
+ /// to unsigned or rotating computation out of the loop.
+ /// (See inline comment about why this is duplicated from simplifyAndExtend)
+ bool canonicalizeExitCondition(Loop *L);
/// Try to eliminate loop exits based on analyzeable exit counts
bool optimizeLoopExits(Loop *L, SCEVExpander &Rewriter);
/// Try to form loop invariant tests for loop exits by changing how many
@@ -494,6 +499,7 @@ bool IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) {
MadeAnyChanges = true;
PN.setIncomingValue(IncomingValIdx,
ExitVal->getIncomingValue(PreheaderIdx));
+ SE->forgetValue(&PN);
}
}
}
@@ -541,18 +547,18 @@ static void visitIVCast(CastInst *Cast, WideIVInfo &WI,
return;
}
- if (!WI.WidestNativeType) {
+ if (!WI.WidestNativeType ||
+ Width > SE->getTypeSizeInBits(WI.WidestNativeType)) {
WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
WI.IsSigned = IsSigned;
return;
}
- // We extend the IV to satisfy the sign of its first user, arbitrarily.
- if (WI.IsSigned != IsSigned)
- return;
-
- if (Width > SE->getTypeSizeInBits(WI.WidestNativeType))
- WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
+ // We extend the IV to satisfy the sign of its user(s), choosing 'signed'
+ // if there are users of both sign- and zero-extensions, so that we do not
+ // introduce nondeterministic behaviour based on the unspecified order of a
+ // PHI node's users-iterator.
+ WI.IsSigned |= IsSigned;
}
//===----------------------------------------------------------------------===//
@@ -1274,9 +1280,9 @@ bool IndVarSimplify::sinkUnusedInvariants(Loop *L) {
// Skip debug info intrinsics.
do {
--I;
- } while (isa<DbgInfoIntrinsic>(I) && I != Preheader->begin());
+ } while (I->isDebugOrPseudoInst() && I != Preheader->begin());
- if (isa<DbgInfoIntrinsic>(I) && I == Preheader->begin())
+ if (I->isDebugOrPseudoInst() && I == Preheader->begin())
Done = true;
} else {
Done = true;
@@ -1309,6 +1315,18 @@ static void foldExit(const Loop *L, BasicBlock *ExitingBB, bool IsTaken,
replaceExitCond(BI, NewCond, DeadInsts);
}
+static void replaceLoopPHINodesWithPreheaderValues(
+ Loop *L, SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
+ assert(L->isLoopSimplifyForm() && "Should only do it in simplify form!");
+ auto *LoopPreheader = L->getLoopPreheader();
+ auto *LoopHeader = L->getHeader();
+ for (auto &PN : LoopHeader->phis()) {
+ auto *PreheaderIncoming = PN.getIncomingValueForBlock(LoopPreheader);
+ PN.replaceAllUsesWith(PreheaderIncoming);
+ DeadInsts.emplace_back(&PN);
+ }
+}
+
static void replaceWithInvariantCond(
const Loop *L, BasicBlock *ExitingBB, ICmpInst::Predicate InvariantPred,
const SCEV *InvariantLHS, const SCEV *InvariantRHS, SCEVExpander &Rewriter,
@@ -1333,7 +1351,6 @@ static bool optimizeLoopExitWithUnknownExitCount(
SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
ICmpInst::Predicate Pred;
Value *LHS, *RHS;
- using namespace PatternMatch;
BasicBlock *TrueSucc, *FalseSucc;
if (!match(BI, m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)),
m_BasicBlock(TrueSucc), m_BasicBlock(FalseSucc))))
@@ -1394,6 +1411,140 @@ static bool optimizeLoopExitWithUnknownExitCount(
return true;
}
+bool IndVarSimplify::canonicalizeExitCondition(Loop *L) {
+ // Note: This duplicates a particular part of SimplifyIndVars' reasoning.
+ // We need to duplicate it because, given an icmp of zext(small-iv) and C,
+ // IVUsers never reaches the icmp, since the zext doesn't fold to an AddRec
+ // unless it already has flags. The alternative would be to extend the set
+ // of "interesting" IV users to include the icmp, but doing that regresses
+ // results in practice: it queries SCEVs before the trip counts that rely
+ // on them, which makes SCEV cache sub-optimal answers. That concern about
+ // caching sub-optimal results is why we only query SCEVs of the loop
+ // invariant RHS here.
+ SmallVector<BasicBlock*, 16> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ bool Changed = false;
+ for (auto *ExitingBB : ExitingBlocks) {
+ auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ if (!BI)
+ continue;
+ assert(BI->isConditional() && "exit branch must be conditional");
+
+ auto *ICmp = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!ICmp || !ICmp->hasOneUse())
+ continue;
+
+ auto *LHS = ICmp->getOperand(0);
+ auto *RHS = ICmp->getOperand(1);
+ // For the range reasoning, avoid computing SCEVs in the loop to avoid
+ // poisoning the cache with sub-optimal results. For the must-execute case,
+ // this is a necessary precondition for correctness.
+ if (!L->isLoopInvariant(RHS)) {
+ if (!L->isLoopInvariant(LHS))
+ continue;
+ // Same logic applies for the inverse case
+ std::swap(LHS, RHS);
+ }
+
+ // Match (icmp signed-cond zext, RHS)
+ Value *LHSOp = nullptr;
+ if (!match(LHS, m_ZExt(m_Value(LHSOp))) || !ICmp->isSigned())
+ continue;
+
+ const DataLayout &DL = ExitingBB->getModule()->getDataLayout();
+ const unsigned InnerBitWidth = DL.getTypeSizeInBits(LHSOp->getType());
+ const unsigned OuterBitWidth = DL.getTypeSizeInBits(RHS->getType());
+ auto FullCR = ConstantRange::getFull(InnerBitWidth);
+ FullCR = FullCR.zeroExtend(OuterBitWidth);
+ auto RHSCR = SE->getUnsignedRange(SE->applyLoopGuards(SE->getSCEV(RHS), L));
+ if (FullCR.contains(RHSCR)) {
+ // We have now matched icmp signed-cond zext(X), zext(Y'), and can thus
+ // replace the signed condition with the unsigned version.
+ ICmp->setPredicate(ICmp->getUnsignedPredicate());
+ Changed = true;
+ // Note: No SCEV invalidation needed. We've changed the predicate, but
+ // have not changed exit counts, or the values produced by the compare.
+ continue;
+ }
+ }
+
+ // Now that we've canonicalized the condition to match the extend,
+ // see if we can rotate the extend out of the loop.
+ for (auto *ExitingBB : ExitingBlocks) {
+ auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ if (!BI)
+ continue;
+ assert(BI->isConditional() && "exit branch must be conditional");
+
+ auto *ICmp = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!ICmp || !ICmp->hasOneUse() || !ICmp->isUnsigned())
+ continue;
+
+ bool Swapped = false;
+ auto *LHS = ICmp->getOperand(0);
+ auto *RHS = ICmp->getOperand(1);
+ if (L->isLoopInvariant(LHS) == L->isLoopInvariant(RHS))
+ // Nothing to rotate
+ continue;
+ if (L->isLoopInvariant(LHS)) {
+ // Same logic applies for the inverse case until we actually pick
+ // which operand of the compare to update.
+ Swapped = true;
+ std::swap(LHS, RHS);
+ }
+ assert(!L->isLoopInvariant(LHS) && L->isLoopInvariant(RHS));
+
+ // Match (icmp unsigned-cond zext, RHS)
+ // TODO: Extend to handle corresponding sext/signed-cmp case
+ // TODO: Extend to other invertible functions
+ Value *LHSOp = nullptr;
+ if (!match(LHS, m_ZExt(m_Value(LHSOp))))
+ continue;
+
+ // In general, we only rotate if we can do so without increasing the number
+ // of instructions. The exception is when we have a zext(add-rec). The
+ // reason for allowing this exception is that we know we need to get rid
+ // of the zext for SCEV to be able to compute a trip count for said loops;
+ // we consider the new trip count valuable enough to increase instruction
+ // count by one.
+ if (!LHS->hasOneUse() && !isa<SCEVAddRecExpr>(SE->getSCEV(LHSOp)))
+ continue;
+
+ // Given an icmp unsigned-cond zext(Op) where zext(trunc(RHS)) == RHS
+ // replace with an icmp of the form icmp unsigned-cond Op, trunc(RHS)
+ // when zext is loop varying and RHS is loop invariant. This converts
+ // loop varying work to loop-invariant work.
+ auto doRotateTransform = [&]() {
+ assert(ICmp->isUnsigned() && "must have proven unsigned already");
+ auto *NewRHS =
+ CastInst::Create(Instruction::Trunc, RHS, LHSOp->getType(), "",
+ L->getLoopPreheader()->getTerminator());
+ ICmp->setOperand(Swapped ? 1 : 0, LHSOp);
+ ICmp->setOperand(Swapped ? 0 : 1, NewRHS);
+ if (LHS->use_empty())
+ DeadInsts.push_back(LHS);
+ };
+
+ const DataLayout &DL = ExitingBB->getModule()->getDataLayout();
+ const unsigned InnerBitWidth = DL.getTypeSizeInBits(LHSOp->getType());
+ const unsigned OuterBitWidth = DL.getTypeSizeInBits(RHS->getType());
+ auto FullCR = ConstantRange::getFull(InnerBitWidth);
+ FullCR = FullCR.zeroExtend(OuterBitWidth);
+ auto RHSCR = SE->getUnsignedRange(SE->applyLoopGuards(SE->getSCEV(RHS), L));
+ if (FullCR.contains(RHSCR)) {
+ doRotateTransform();
+ Changed = true;
+ // Note: we are leaving SCEV in an unfortunately imprecise state here, as
+ // rotation tends to reveal trip-count information that was not previously
+ // visible.
+ continue;
+ }
+ }
+
+ return Changed;
+}
+
bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
SmallVector<BasicBlock*, 16> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
@@ -1499,20 +1650,18 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
// If we know we'd exit on the first iteration, rewrite the exit to
// reflect this. This does not imply the loop must exit through this
// exit; there may be an earlier one taken on the first iteration.
- // TODO: Given we know the backedge can't be taken, we should go ahead
- // and break it. Or at least, kill all the header phis and simplify.
+ // We know that the backedge can't be taken, so we replace all
+ // the header PHIs with values coming from the preheader.
if (ExitCount->isZero()) {
foldExit(L, ExitingBB, true, DeadInsts);
+ replaceLoopPHINodesWithPreheaderValues(L, DeadInsts);
Changed = true;
continue;
}
- // If we end up with a pointer exit count, bail. Note that we can end up
- // with a pointer exit count for one exiting block, and not for another in
- // the same loop.
- if (!ExitCount->getType()->isIntegerTy() ||
- !MaxExitCount->getType()->isIntegerTy())
- continue;
+ assert(ExitCount->getType()->isIntegerTy() &&
+ MaxExitCount->getType()->isIntegerTy() &&
+ "Exit counts must be integers");
Type *WiderType =
SE->getWiderType(MaxExitCount->getType(), ExitCount->getType());
@@ -1569,14 +1718,11 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
// through *explicit* control flow. We have to eliminate the possibility of
// implicit exits (see below) before we know it's truly exact.
const SCEV *ExactBTC = SE->getBackedgeTakenCount(L);
- if (isa<SCEVCouldNotCompute>(ExactBTC) ||
- !SE->isLoopInvariant(ExactBTC, L) ||
- !isSafeToExpand(ExactBTC, *SE))
+ if (isa<SCEVCouldNotCompute>(ExactBTC) || !isSafeToExpand(ExactBTC, *SE))
return false;
- // If we end up with a pointer exit count, bail. It may be unsized.
- if (!ExactBTC->getType()->isIntegerTy())
- return false;
+ assert(SE->isLoopInvariant(ExactBTC, L) && "BTC must be loop invariant");
+ assert(ExactBTC->getType()->isIntegerTy() && "BTC must be integer");
auto BadExit = [&](BasicBlock *ExitingBB) {
// If our exiting block exits multiple loops, we can only rewrite the
@@ -1603,15 +1749,12 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
return true;
const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
- if (isa<SCEVCouldNotCompute>(ExitCount) ||
- !SE->isLoopInvariant(ExitCount, L) ||
- !isSafeToExpand(ExitCount, *SE))
- return true;
-
- // If we end up with a pointer exit count, bail. It may be unsized.
- if (!ExitCount->getType()->isIntegerTy())
+ if (isa<SCEVCouldNotCompute>(ExitCount) || !isSafeToExpand(ExitCount, *SE))
return true;
+ assert(SE->isLoopInvariant(ExitCount, L) &&
+ "Exit count must be loop invariant");
+ assert(ExitCount->getType()->isIntegerTy() && "Exit count must be integer");
return false;
};
@@ -1781,7 +1924,11 @@ bool IndVarSimplify::run(Loop *L) {
}
// Eliminate redundant IV cycles.
- NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts);
+ NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts, TTI);
+
+ // Try to convert exit conditions to unsigned and rotate the computation
+ // out of the loop. Note: this handles invalidation internally if needed.
+ Changed |= canonicalizeExitCondition(L);
// Try to eliminate loop exits based on analyzeable exit counts
if (optimizeLoopExits(L, Rewriter)) {
diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
index f7d631f5e785..883d4afff3bd 100644
--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -96,10 +96,13 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
@@ -115,6 +118,7 @@
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
@@ -146,6 +150,14 @@ static const unsigned UninitializedAddressSpace =
namespace {
using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
+// Unlike ValueToAddrSpaceMapTy, where a new addrspace is inferred on the
+// *def* of a value, PredicatedAddrSpaceMapTy is a map where a new addrspace
+// is inferred on the *use* of a pointer. This map is introduced to infer
+// addrspace from the addrspace predicate assumption built from the assume
+// intrinsic. In that scenario, only specific uses (under a valid assumption
+// context) can be inferred with a new addrspace.
+using PredicatedAddrSpaceMapTy =
+ DenseMap<std::pair<const Value *, const Value *>, unsigned>;
using PostorderStackTy = llvm::SmallVector<PointerIntPair<Value *, 1, bool>, 4>;
class InferAddressSpaces : public FunctionPass {
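A minimal illustration of the def-vs-use distinction drawn in the PredicatedAddrSpaceMapTy comment above, using std::map and stand-in Value/Instruction types rather than LLVM's DenseMap; the address space number is made up:

  #include <map>
  #include <utility>

  struct Value {};                    // stand-in for llvm::Value
  struct Instruction : Value {};      // stand-in for llvm::Instruction

  // Per-def: one inferred address space per value.
  using DefMap = std::map<const Value *, unsigned>;
  // Per-use: an address space valid only for one (user, pointer operand) pair,
  // mirroring PredicatedAddrSpaceMapTy.
  using UseMap = std::map<std::pair<const Value *, const Value *>, unsigned>;

  int main() {
    Value Ptr;
    Instruction LoadA, LoadB;
    UseMap Predicated;
    // An assume dominating LoadA refines Ptr's address space at that use only;
    // LoadB, outside the assumption's context, stays unrefined.
    Predicated[{&LoadA, &Ptr}] = 3;
    return Predicated.count({&LoadB, &Ptr}); // 0: no predicate for this use
  }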
@@ -160,6 +172,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
}
@@ -167,6 +181,8 @@ public:
};
class InferAddressSpacesImpl {
+ AssumptionCache &AC;
+ DominatorTree *DT = nullptr;
const TargetTransformInfo *TTI = nullptr;
const DataLayout *DL = nullptr;
@@ -174,21 +190,24 @@ class InferAddressSpacesImpl {
/// possible.
unsigned FlatAddrSpace = 0;
- // Returns the new address space of V if updated; otherwise, returns None.
- Optional<unsigned>
- updateAddressSpace(const Value &V,
- const ValueToAddrSpaceMapTy &InferredAddrSpace) const;
+ // Try to update the address space of V. Returns true if V is updated and
+ // false otherwise.
+ bool updateAddressSpace(const Value &V,
+ ValueToAddrSpaceMapTy &InferredAddrSpace,
+ PredicatedAddrSpaceMapTy &PredicatedAS) const;
// Tries to infer the specific address space of each address expression in
// Postorder.
void inferAddressSpaces(ArrayRef<WeakTrackingVH> Postorder,
- ValueToAddrSpaceMapTy *InferredAddrSpace) const;
+ ValueToAddrSpaceMapTy &InferredAddrSpace,
+ PredicatedAddrSpaceMapTy &PredicatedAS) const;
bool isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const;
Value *cloneInstructionWithNewAddressSpace(
Instruction *I, unsigned NewAddrSpace,
const ValueToValueMapTy &ValueWithNewAddrSpace,
+ const PredicatedAddrSpaceMapTy &PredicatedAS,
SmallVectorImpl<const Use *> *UndefUsesToFix) const;
// Changes the flat address expressions in function F to point to specific
@@ -196,7 +215,8 @@ class InferAddressSpacesImpl {
// all flat expressions in the use-def graph of function F.
bool rewriteWithNewAddressSpaces(
const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder,
- const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const;
+ const ValueToAddrSpaceMapTy &InferredAddrSpace,
+ const PredicatedAddrSpaceMapTy &PredicatedAS, Function *F) const;
void appendsFlatAddressExpressionToPostorderStack(
Value *V, PostorderStackTy &PostorderStack,
@@ -211,14 +231,18 @@ class InferAddressSpacesImpl {
std::vector<WeakTrackingVH> collectFlatAddressExpressions(Function &F) const;
Value *cloneValueWithNewAddressSpace(
- Value *V, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace,
- SmallVectorImpl<const Use *> *UndefUsesToFix) const;
+ Value *V, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ const PredicatedAddrSpaceMapTy &PredicatedAS,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const;
unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) const;
+ unsigned getPredicatedAddrSpace(const Value &V, Value *Opnd) const;
+
public:
- InferAddressSpacesImpl(const TargetTransformInfo *TTI, unsigned FlatAddrSpace)
- : TTI(TTI), FlatAddrSpace(FlatAddrSpace) {}
+ InferAddressSpacesImpl(AssumptionCache &AC, DominatorTree *DT,
+ const TargetTransformInfo *TTI, unsigned FlatAddrSpace)
+ : AC(AC), DT(DT), TTI(TTI), FlatAddrSpace(FlatAddrSpace) {}
bool run(Function &F);
};
@@ -232,8 +256,12 @@ void initializeInferAddressSpacesPass(PassRegistry &);
} // end namespace llvm
-INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
- false, false)
+INITIALIZE_PASS_BEGIN(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
+ false, false)
// Check whether that's a no-op pointer bitcast using a pair of
// `ptrtoint`/`inttoptr` due to the missing no-op pointer bitcast over
@@ -505,6 +533,7 @@ InferAddressSpacesImpl::collectFlatAddressExpressions(Function &F) const {
static Value *operandWithNewAddressSpaceOrCreateUndef(
const Use &OperandUse, unsigned NewAddrSpace,
const ValueToValueMapTy &ValueWithNewAddrSpace,
+ const PredicatedAddrSpaceMapTy &PredicatedAS,
SmallVectorImpl<const Use *> *UndefUsesToFix) {
Value *Operand = OperandUse.get();
@@ -517,6 +546,18 @@ static Value *operandWithNewAddressSpaceOrCreateUndef(
if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand))
return NewOperand;
+ Instruction *Inst = cast<Instruction>(OperandUse.getUser());
+ auto I = PredicatedAS.find(std::make_pair(Inst, Operand));
+ if (I != PredicatedAS.end()) {
+ // Insert an addrspacecast on that operand before the user.
+ unsigned NewAS = I->second;
+ Type *NewPtrTy = PointerType::getWithSamePointeeType(
+ cast<PointerType>(Operand->getType()), NewAS);
+ auto *NewI = new AddrSpaceCastInst(Operand, NewPtrTy);
+ NewI->insertBefore(Inst);
+ return NewI;
+ }
+
UndefUsesToFix->push_back(&OperandUse);
return UndefValue::get(NewPtrTy);
}
@@ -536,6 +577,7 @@ static Value *operandWithNewAddressSpaceOrCreateUndef(
Value *InferAddressSpacesImpl::cloneInstructionWithNewAddressSpace(
Instruction *I, unsigned NewAddrSpace,
const ValueToValueMapTy &ValueWithNewAddrSpace,
+ const PredicatedAddrSpaceMapTy &PredicatedAS,
SmallVectorImpl<const Use *> *UndefUsesToFix) const {
Type *NewPtrType = PointerType::getWithSamePointeeType(
cast<PointerType>(I->getType()), NewAddrSpace);
@@ -557,7 +599,7 @@ Value *InferAddressSpacesImpl::cloneInstructionWithNewAddressSpace(
assert(II->getIntrinsicID() == Intrinsic::ptrmask);
Value *NewPtr = operandWithNewAddressSpaceOrCreateUndef(
II->getArgOperandUse(0), NewAddrSpace, ValueWithNewAddrSpace,
- UndefUsesToFix);
+ PredicatedAS, UndefUsesToFix);
Value *Rewrite =
TTI->rewriteIntrinsicWithAddressSpace(II, II->getArgOperand(0), NewPtr);
if (Rewrite) {
@@ -586,7 +628,8 @@ Value *InferAddressSpacesImpl::cloneInstructionWithNewAddressSpace(
NewPointerOperands.push_back(nullptr);
else
NewPointerOperands.push_back(operandWithNewAddressSpaceOrCreateUndef(
- OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix));
+ OperandUse, NewAddrSpace, ValueWithNewAddrSpace, PredicatedAS,
+ UndefUsesToFix));
}
switch (I->getOpcode()) {
@@ -708,9 +751,8 @@ static Value *cloneConstantExprWithNewAddressSpace(
if (CE->getOpcode() == Instruction::GetElementPtr) {
// Needs to specify the source type while constructing a getelementptr
// constant expression.
- return CE->getWithOperands(
- NewOperands, TargetType, /*OnlyIfReduced=*/false,
- NewOperands[0]->getType()->getPointerElementType());
+ return CE->getWithOperands(NewOperands, TargetType, /*OnlyIfReduced=*/false,
+ cast<GEPOperator>(CE)->getSourceElementType());
}
return CE->getWithOperands(NewOperands, TargetType);
@@ -724,6 +766,7 @@ static Value *cloneConstantExprWithNewAddressSpace(
Value *InferAddressSpacesImpl::cloneValueWithNewAddressSpace(
Value *V, unsigned NewAddrSpace,
const ValueToValueMapTy &ValueWithNewAddrSpace,
+ const PredicatedAddrSpaceMapTy &PredicatedAS,
SmallVectorImpl<const Use *> *UndefUsesToFix) const {
// All values in Postorder are flat address expressions.
assert(V->getType()->getPointerAddressSpace() == FlatAddrSpace &&
@@ -731,7 +774,7 @@ Value *InferAddressSpacesImpl::cloneValueWithNewAddressSpace(
if (Instruction *I = dyn_cast<Instruction>(V)) {
Value *NewV = cloneInstructionWithNewAddressSpace(
- I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
+ I, NewAddrSpace, ValueWithNewAddrSpace, PredicatedAS, UndefUsesToFix);
if (Instruction *NewI = dyn_cast_or_null<Instruction>(NewV)) {
if (NewI->getParent() == nullptr) {
NewI->insertBefore(I);
@@ -779,46 +822,43 @@ bool InferAddressSpacesImpl::run(Function &F) {
// Runs a data-flow analysis to refine the address spaces of every expression
// in Postorder.
ValueToAddrSpaceMapTy InferredAddrSpace;
- inferAddressSpaces(Postorder, &InferredAddrSpace);
+ PredicatedAddrSpaceMapTy PredicatedAS;
+ inferAddressSpaces(Postorder, InferredAddrSpace, PredicatedAS);
// Changes the address spaces of the flat address expressions who are inferred
// to point to a specific address space.
- return rewriteWithNewAddressSpaces(*TTI, Postorder, InferredAddrSpace, &F);
+ return rewriteWithNewAddressSpaces(*TTI, Postorder, InferredAddrSpace,
+ PredicatedAS, &F);
}
// Constants need to be tracked through RAUW to handle cases with nested
// constant expressions, so wrap values in WeakTrackingVH.
void InferAddressSpacesImpl::inferAddressSpaces(
ArrayRef<WeakTrackingVH> Postorder,
- ValueToAddrSpaceMapTy *InferredAddrSpace) const {
+ ValueToAddrSpaceMapTy &InferredAddrSpace,
+ PredicatedAddrSpaceMapTy &PredicatedAS) const {
SetVector<Value *> Worklist(Postorder.begin(), Postorder.end());
// Initially, all expressions are in the uninitialized address space.
for (Value *V : Postorder)
- (*InferredAddrSpace)[V] = UninitializedAddressSpace;
+ InferredAddrSpace[V] = UninitializedAddressSpace;
while (!Worklist.empty()) {
Value *V = Worklist.pop_back_val();
- // Tries to update the address space of the stack top according to the
+ // Try to update the address space of the stack top according to the
// address spaces of its operands.
- LLVM_DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n');
- Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
- if (!NewAS.hasValue())
+ if (!updateAddressSpace(*V, InferredAddrSpace, PredicatedAS))
continue;
- // If any updates are made, grabs its users to the worklist because
- // their address spaces can also be possibly updated.
- LLVM_DEBUG(dbgs() << " to " << NewAS.getValue() << '\n');
- (*InferredAddrSpace)[V] = NewAS.getValue();
for (Value *User : V->users()) {
// Skip if User is already in the worklist.
if (Worklist.count(User))
continue;
- auto Pos = InferredAddrSpace->find(User);
+ auto Pos = InferredAddrSpace.find(User);
// Our algorithm only updates the address spaces of flat address
// expressions, which are those in InferredAddrSpace.
- if (Pos == InferredAddrSpace->end())
+ if (Pos == InferredAddrSpace.end())
continue;
// Function updateAddressSpace moves the address space down a lattice
@@ -832,10 +872,37 @@ void InferAddressSpacesImpl::inferAddressSpaces(
}
}
-Optional<unsigned> InferAddressSpacesImpl::updateAddressSpace(
- const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) const {
+unsigned InferAddressSpacesImpl::getPredicatedAddrSpace(const Value &V,
+ Value *Opnd) const {
+ const Instruction *I = dyn_cast<Instruction>(&V);
+ if (!I)
+ return UninitializedAddressSpace;
+
+ Opnd = Opnd->stripInBoundsOffsets();
+ for (auto &AssumeVH : AC.assumptionsFor(Opnd)) {
+ if (!AssumeVH)
+ continue;
+ CallInst *CI = cast<CallInst>(AssumeVH);
+ if (!isValidAssumeForContext(CI, I, DT))
+ continue;
+
+ const Value *Ptr;
+ unsigned AS;
+ std::tie(Ptr, AS) = TTI->getPredicatedAddrSpace(CI->getArgOperand(0));
+ if (Ptr)
+ return AS;
+ }
+
+ return UninitializedAddressSpace;
+}
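getPredicatedAddrSpace above scans the assumptions registered for the pointer and accepts the first one whose context is valid at (dominates) the use. A toy C++ model of that lookup, where dominance over a straight-line program is reduced to an integer comparison and the predicate records are hypothetical:

  #include <cassert>
  #include <vector>

  constexpr unsigned UninitializedAS = ~0u;   // assumed sentinel value

  // A stand-in for one addrspace predicate discovered from an assume: it only
  // holds at program points dominated by DomPoint.
  struct AddrSpacePredicate {
    unsigned DomPoint;
    unsigned AS;
  };

  // Return the address space predicated for a use at UsePoint, or the
  // uninitialized sentinel if no assumption is valid there.
  unsigned predicatedAddrSpaceAt(const std::vector<AddrSpacePredicate> &Preds,
                                 unsigned UsePoint) {
    for (const AddrSpacePredicate &P : Preds)
      if (P.DomPoint <= UsePoint)             // "isValidAssumeForContext" stand-in
        return P.AS;
    return UninitializedAS;
  }

  int main() {
    std::vector<AddrSpacePredicate> Preds = {{/*DomPoint=*/5, /*AS=*/3}};
    assert(predicatedAddrSpaceAt(Preds, 10) == 3);              // use after the assume
    assert(predicatedAddrSpaceAt(Preds, 2) == UninitializedAS); // use before it
    return 0;
  }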
+
+bool InferAddressSpacesImpl::updateAddressSpace(
+ const Value &V, ValueToAddrSpaceMapTy &InferredAddrSpace,
+ PredicatedAddrSpaceMapTy &PredicatedAS) const {
assert(InferredAddrSpace.count(&V));
+ LLVM_DEBUG(dbgs() << "Updating the address space of\n " << V << '\n');
+
// The new inferred address space equals the join of the address spaces
// of all its pointer operands.
unsigned NewAS = UninitializedAddressSpace;
@@ -861,7 +928,7 @@ Optional<unsigned> InferAddressSpacesImpl::updateAddressSpace(
// address space is known.
if ((C1 && Src0AS == UninitializedAddressSpace) ||
(C0 && Src1AS == UninitializedAddressSpace))
- return None;
+ return false;
if (C0 && isSafeToCastConstAddrSpace(C0, Src1AS))
NewAS = Src1AS;
@@ -878,10 +945,23 @@ Optional<unsigned> InferAddressSpacesImpl::updateAddressSpace(
// Otherwise, infer the address space from its pointer operands.
for (Value *PtrOperand : getPointerOperands(V, *DL, TTI)) {
auto I = InferredAddrSpace.find(PtrOperand);
- unsigned OperandAS =
- I != InferredAddrSpace.end()
- ? I->second
- : PtrOperand->getType()->getPointerAddressSpace();
+ unsigned OperandAS;
+ if (I == InferredAddrSpace.end()) {
+ OperandAS = PtrOperand->getType()->getPointerAddressSpace();
+ if (OperandAS == FlatAddrSpace) {
+ // Check AC for assumption dominating V.
+ unsigned AS = getPredicatedAddrSpace(V, PtrOperand);
+ if (AS != UninitializedAddressSpace) {
+ LLVM_DEBUG(dbgs()
+ << " deduce operand AS from the predicate addrspace "
+ << AS << '\n');
+ OperandAS = AS;
+ // Record this use with the predicated AS.
+ PredicatedAS[std::make_pair(&V, PtrOperand)] = OperandAS;
+ }
+ }
+ } else
+ OperandAS = I->second;
// join(flat, *) = flat. So we can break if NewAS is already flat.
NewAS = joinAddressSpaces(NewAS, OperandAS);
@@ -894,8 +974,13 @@ Optional<unsigned> InferAddressSpacesImpl::updateAddressSpace(
unsigned OldAS = InferredAddrSpace.lookup(&V);
assert(OldAS != FlatAddrSpace);
if (OldAS == NewAS)
- return None;
- return NewAS;
+ return false;
+
+ // If any update is made, its users are added to the worklist because
+ // their address spaces may also need to be updated.
+ LLVM_DEBUG(dbgs() << " to " << NewAS << '\n');
+ InferredAddrSpace[&V] = NewAS;
+ return true;
}
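The "join(flat, *) = flat" remark above describes a small lattice: the uninitialized sentinel is the identity of the join, the flat address space absorbs everything, and two distinct specific spaces collapse to flat. A standalone sketch of that join, with assumed numeric values for the sentinel and the flat space:

  #include <cassert>

  constexpr unsigned Uninitialized = ~0u;  // assumed sentinel, mirrors UninitializedAddressSpace
  constexpr unsigned Flat = 0;             // assumed numbering: 0 is the flat address space

  unsigned joinAS(unsigned A, unsigned B) {
    if (A == Flat || B == Flat)
      return Flat;                         // join(flat, *) = flat
    if (A == Uninitialized)
      return B;                            // uninitialized is the identity
    if (B == Uninitialized)
      return A;
    return A == B ? A : Flat;              // e.g. join(global, shared) = flat
  }

  int main() {
    assert(joinAS(Uninitialized, 3) == 3);
    assert(joinAS(3, 3) == 3);
    assert(joinAS(3, 5) == Flat);
    assert(joinAS(Flat, 3) == Flat);
    return 0;
  }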
/// \p returns true if \p U is the pointer operand of a memory instruction with
@@ -1026,7 +1111,8 @@ static Value::use_iterator skipToNextUser(Value::use_iterator I,
bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces(
const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder,
- const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const {
+ const ValueToAddrSpaceMapTy &InferredAddrSpace,
+ const PredicatedAddrSpaceMapTy &PredicatedAS, Function *F) const {
// For each address expression to be modified, creates a clone of it with its
// pointer operands converted to the new address space. Since the pointer
// operands are converted, the clone is naturally in the new address space by
@@ -1042,8 +1128,9 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces(
continue;
if (V->getType()->getPointerAddressSpace() != NewAddrSpace) {
- Value *New = cloneValueWithNewAddressSpace(
- V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
+ Value *New =
+ cloneValueWithNewAddressSpace(V, NewAddrSpace, ValueWithNewAddrSpace,
+ PredicatedAS, &UndefUsesToFix);
if (New)
ValueWithNewAddrSpace[V] = New;
}
@@ -1155,8 +1242,9 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces(
if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(CurUser)) {
unsigned NewAS = NewV->getType()->getPointerAddressSpace();
if (ASC->getDestAddressSpace() == NewAS) {
- if (ASC->getType()->getPointerElementType() !=
- NewV->getType()->getPointerElementType()) {
+ if (!cast<PointerType>(ASC->getType())
+ ->hasSameElementTypeAs(
+ cast<PointerType>(NewV->getType()))) {
NewV = CastInst::Create(Instruction::BitCast, NewV,
ASC->getType(), "", ASC);
}
@@ -1199,7 +1287,10 @@ bool InferAddressSpaces::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
return InferAddressSpacesImpl(
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), DT,
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
FlatAddrSpace)
.run(F);
@@ -1217,11 +1308,14 @@ InferAddressSpacesPass::InferAddressSpacesPass(unsigned AddressSpace)
PreservedAnalyses InferAddressSpacesPass::run(Function &F,
FunctionAnalysisManager &AM) {
bool Changed =
- InferAddressSpacesImpl(&AM.getResult<TargetIRAnalysis>(F), FlatAddrSpace)
+ InferAddressSpacesImpl(AM.getResult<AssumptionAnalysis>(F),
+ AM.getCachedResult<DominatorTreeAnalysis>(F),
+ &AM.getResult<TargetIRAnalysis>(F), FlatAddrSpace)
.run(F);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
+ PA.preserve<DominatorTreeAnalysis>();
return PA;
}
return PreservedAnalyses::all();
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 9dc3b0351346..fe9a7211967c 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -331,7 +331,7 @@ bool JumpThreading::runOnFunction(Function &F) {
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
}
- bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DTU, F.hasProfileData(),
+ bool Changed = Impl.runImpl(F, TLI, TTI, LVI, AA, &DTU, F.hasProfileData(),
std::move(BFI), std::move(BPI));
if (PrintLVIAfterJumpThreading) {
dbgs() << "LVI for function '" << F.getName() << "':\n";
@@ -360,7 +360,7 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
}
- bool Changed = runImpl(F, &TLI, &LVI, &AA, &DTU, F.hasProfileData(),
+ bool Changed = runImpl(F, &TLI, &TTI, &LVI, &AA, &DTU, F.hasProfileData(),
std::move(BFI), std::move(BPI));
if (PrintLVIAfterJumpThreading) {
@@ -377,12 +377,14 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
}
bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
- LazyValueInfo *LVI_, AliasAnalysis *AA_,
- DomTreeUpdater *DTU_, bool HasProfileData_,
+ TargetTransformInfo *TTI_, LazyValueInfo *LVI_,
+ AliasAnalysis *AA_, DomTreeUpdater *DTU_,
+ bool HasProfileData_,
std::unique_ptr<BlockFrequencyInfo> BFI_,
std::unique_ptr<BranchProbabilityInfo> BPI_) {
LLVM_DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
TLI = TLI_;
+ TTI = TTI_;
LVI = LVI_;
AA = AA_;
DTU = DTU_;
@@ -514,7 +516,8 @@ static void replaceFoldableUses(Instruction *Cond, Value *ToVal) {
/// Return the cost of duplicating a piece of this block from first non-phi
/// and before StopAt instruction to thread across it. Stop scanning the block
/// when exceeding the threshold. If duplication is impossible, returns ~0U.
-static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
+static unsigned getJumpThreadDuplicationCost(const TargetTransformInfo *TTI,
+ BasicBlock *BB,
Instruction *StopAt,
unsigned Threshold) {
assert(StopAt->getParent() == BB && "Not an instruction from proper BB?");
@@ -550,26 +553,21 @@ static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
if (Size > Threshold)
return Size;
- // Debugger intrinsics don't incur code size.
- if (isa<DbgInfoIntrinsic>(I)) continue;
-
- // Pseudo-probes don't incur code size.
- if (isa<PseudoProbeInst>(I))
- continue;
-
- // If this is a pointer->pointer bitcast, it is free.
- if (isa<BitCastInst>(I) && I->getType()->isPointerTy())
- continue;
-
- // Freeze instruction is free, too.
- if (isa<FreezeInst>(I))
- continue;
-
// Bail out if this instruction gives back a token type, it is not possible
// to duplicate it if it is used outside this BB.
if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB))
return ~0U;
+ // Blocks with NoDuplicate are modelled as having infinite cost, so they
+ // are never duplicated.
+ if (const CallInst *CI = dyn_cast<CallInst>(I))
+ if (CI->cannotDuplicate() || CI->isConvergent())
+ return ~0U;
+
+ if (TTI->getUserCost(&*I, TargetTransformInfo::TCK_SizeAndLatency)
+ == TargetTransformInfo::TCC_Free)
+ continue;
+
// All other instructions count for at least one unit.
++Size;
@@ -578,11 +576,7 @@ static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
// as having cost of 2 total, and if they are a vector intrinsic, we model
// them as having cost 1.
if (const CallInst *CI = dyn_cast<CallInst>(I)) {
- if (CI->cannotDuplicate() || CI->isConvergent())
- // Blocks with NoDuplicate are modelled as having infinite cost, so they
- // are never duplicated.
- return ~0U;
- else if (!isa<IntrinsicInst>(CI))
+ if (!isa<IntrinsicInst>(CI))
Size += 3;
else if (!CI->getType()->isVectorTy())
Size += 1;
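The rewritten cost loop above replaces the hand-written list of "free" instructions with a single query to the target's cost model. A generic sketch of the resulting shape, with a placeholder instruction type and a caller-supplied is-free callback standing in for TTI::getUserCost:

  #include <cassert>
  #include <functional>
  #include <vector>

  // Placeholder instruction type; NonDuplicable models noduplicate/convergent
  // calls, which are treated as infinitely expensive.
  struct Inst {
    bool NonDuplicable;
    bool Free;
  };

  unsigned duplicationCost(const std::vector<Inst> &Block,
                           const std::function<bool(const Inst &)> &IsFree,
                           unsigned Threshold) {
    unsigned Size = 0;
    for (const Inst &I : Block) {
      if (Size > Threshold)
        return Size;          // already too expensive, stop scanning
      if (I.NonDuplicable)
        return ~0u;           // "never duplicate" modeled as infinite cost
      if (IsFree(I))
        continue;             // free per the cost model, does not count
      ++Size;
    }
    return Size;
  }

  int main() {
    std::vector<Inst> Block = {{false, true}, {false, false}, {false, true}};
    unsigned Cost = duplicationCost(
        Block, [](const Inst &I) { return I.Free; }, /*Threshold=*/10);
    assert(Cost == 1);        // only the single non-free instruction counts
    return 0;
  }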
@@ -1363,8 +1357,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
// If all of the loads and stores that feed the value have the same AA tags,
// then we can propagate them onto any newly inserted loads.
- AAMDNodes AATags;
- LoadI->getAAMetadata(AATags);
+ AAMDNodes AATags = LoadI->getAAMetadata();
SmallPtrSet<BasicBlock*, 8> PredsScanned;
@@ -2235,10 +2228,10 @@ bool JumpThreadingPass::maybethreadThroughTwoBasicBlocks(BasicBlock *BB,
}
// Compute the cost of duplicating BB and PredBB.
- unsigned BBCost =
- getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
+ unsigned BBCost = getJumpThreadDuplicationCost(
+ TTI, BB, BB->getTerminator(), BBDupThreshold);
unsigned PredBBCost = getJumpThreadDuplicationCost(
- PredBB, PredBB->getTerminator(), BBDupThreshold);
+ TTI, PredBB, PredBB->getTerminator(), BBDupThreshold);
// Give up if costs are too high. We need to check BBCost and PredBBCost
// individually before checking their sum because getJumpThreadDuplicationCost
@@ -2346,8 +2339,8 @@ bool JumpThreadingPass::tryThreadEdge(
return false;
}
- unsigned JumpThreadCost =
- getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
+ unsigned JumpThreadCost = getJumpThreadDuplicationCost(
+ TTI, BB, BB->getTerminator(), BBDupThreshold);
if (JumpThreadCost > BBDupThreshold) {
LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName()
<< "' - Cost is too high: " << JumpThreadCost << "\n");
@@ -2615,8 +2608,8 @@ bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred(
return false;
}
- unsigned DuplicationCost =
- getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
+ unsigned DuplicationCost = getJumpThreadDuplicationCost(
+ TTI, BB, BB->getTerminator(), BBDupThreshold);
if (DuplicationCost > BBDupThreshold) {
LLVM_DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
<< "' - Cost is too high: " << DuplicationCost << "\n");
@@ -3032,7 +3025,8 @@ bool JumpThreadingPass::threadGuard(BasicBlock *BB, IntrinsicInst *Guard,
ValueToValueMapTy UnguardedMapping, GuardedMapping;
Instruction *AfterGuard = Guard->getNextNode();
- unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold);
+ unsigned Cost =
+ getJumpThreadDuplicationCost(TTI, BB, AfterGuard, BBDupThreshold);
if (Cost > BBDupThreshold)
return false;
// Duplicate all instructions before the guard and the guard itself to the
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 30058df3ded5..bf714d167670 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -117,13 +117,6 @@ static cl::opt<uint32_t> MaxNumUsesTraversed(
cl::desc("Max num uses visited for identifying load "
"invariance in loop using invariant start (default = 8)"));
-// Default value of zero implies we use the regular alias set tracker mechanism
-// instead of the cross product using AA to identify aliasing of the memory
-// location we are interested in.
-static cl::opt<int>
-LICMN2Theshold("licm-n2-threshold", cl::Hidden, cl::init(0),
- cl::desc("How many instruction to cross product using AA"));
-
// Experimental option to allow imprecision in LICM in pathological cases, in
// exchange for faster compile. This is to be removed if MemorySSA starts to
// address the same issue. This flag applies only when LICM uses MemorySSA
@@ -151,7 +144,8 @@ cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap(
static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
const LoopSafetyInfo *SafetyInfo,
- TargetTransformInfo *TTI, bool &FreeInLoop);
+ TargetTransformInfo *TTI, bool &FreeInLoop,
+ bool LoopNestMode);
static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo,
MemorySSAUpdater *MSSAU, ScalarEvolution *SE,
@@ -180,7 +174,7 @@ static Instruction *cloneInstructionInExitBlock(
const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU);
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
- AliasSetTracker *AST, MemorySSAUpdater *MSSAU);
+ MemorySSAUpdater *MSSAU);
static void moveInstructionBefore(Instruction &I, Instruction &Dest,
ICFLoopSafetyInfo &SafetyInfo,
@@ -206,9 +200,6 @@ struct LoopInvariantCodeMotion {
private:
unsigned LicmMssaOptCap;
unsigned LicmMssaNoAccForPromotionCap;
-
- std::unique_ptr<AliasSetTracker>
- collectAliasInfoForLoop(Loop *L, LoopInfo *LI, AAResults *AA);
};
struct LegacyLICMPass : public LoopPass {
@@ -228,9 +219,7 @@ struct LegacyLICMPass : public LoopPass {
<< L->getHeader()->getNameOrAsOperand() << "\n");
auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
- MemorySSA *MSSA = EnableMSSALoopDependency
- ? (&getAnalysis<MemorySSAWrapperPass>().getMSSA())
- : nullptr;
+ MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
bool hasProfileData = L->getHeader()->getParent()->hasProfileData();
BlockFrequencyInfo *BFI =
hasProfileData ? &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI()
@@ -257,10 +246,8 @@ struct LegacyLICMPass : public LoopPass {
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
- if (EnableMSSALoopDependency) {
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
getLoopAnalysisUsage(AU);
LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
@@ -275,6 +262,9 @@ private:
PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR, LPMUpdater &) {
+ if (!AR.MSSA)
+ report_fatal_error("LICM requires MemorySSA (loop-mssa)");
+
// For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
// pass. Function analyses need to be preserved across loop transformations
// but ORE cannot be preserved (see comment before the pass definition).
@@ -289,8 +279,7 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<LoopAnalysis>();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
+ PA.preserve<MemorySSAAnalysis>();
return PA;
}
@@ -298,6 +287,9 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
PreservedAnalyses LNICMPass::run(LoopNest &LN, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &) {
+ if (!AR.MSSA)
+ report_fatal_error("LNICM requires MemorySSA (loop-mssa)");
+
// For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
// pass. Function analyses need to be preserved across loop transformations
// but ORE cannot be preserved (see comment before the pass definition).
@@ -316,8 +308,7 @@ PreservedAnalyses LNICMPass::run(LoopNest &LN, LoopAnalysisManager &AM,
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<LoopAnalysis>();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
+ PA.preserve<MemorySSAAnalysis>();
return PA;
}
@@ -386,10 +377,6 @@ bool LoopInvariantCodeMotion::runOnLoop(
return false;
}
- std::unique_ptr<AliasSetTracker> CurAST;
- std::unique_ptr<MemorySSAUpdater> MSSAU;
- std::unique_ptr<SinkAndHoistLICMFlags> Flags;
-
// Don't sink stores from loops with coroutine suspend instructions.
// LICM would sink instructions into the default destination of
// the coroutine switch. The default destination of the switch is to
@@ -406,17 +393,9 @@ bool LoopInvariantCodeMotion::runOnLoop(
});
});
- if (!MSSA) {
- LLVM_DEBUG(dbgs() << "LICM: Using Alias Set Tracker.\n");
- CurAST = collectAliasInfoForLoop(L, LI, AA);
- Flags = std::make_unique<SinkAndHoistLICMFlags>(
- LicmMssaOptCap, LicmMssaNoAccForPromotionCap, /*IsSink=*/true);
- } else {
- LLVM_DEBUG(dbgs() << "LICM: Using MemorySSA.\n");
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
- Flags = std::make_unique<SinkAndHoistLICMFlags>(
- LicmMssaOptCap, LicmMssaNoAccForPromotionCap, /*IsSink=*/true, L, MSSA);
- }
+ MemorySSAUpdater MSSAU(MSSA);
+ SinkAndHoistLICMFlags Flags(LicmMssaOptCap, LicmMssaNoAccForPromotionCap,
+ /*IsSink=*/true, L, MSSA);
// Get the preheader block to move instructions into...
BasicBlock *Preheader = L->getLoopPreheader();
@@ -435,14 +414,16 @@ bool LoopInvariantCodeMotion::runOnLoop(
// us to sink instructions in one pass, without iteration. After sinking
// instructions, we perform another pass to hoist them out of the loop.
if (L->hasDedicatedExits())
- Changed |=
- sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, TTI, L,
- CurAST.get(), MSSAU.get(), &SafetyInfo, *Flags.get(), ORE);
- Flags->setIsSink(false);
+ Changed |= LoopNestMode
+ ? sinkRegionForLoopNest(DT->getNode(L->getHeader()), AA, LI,
+ DT, BFI, TLI, TTI, L, &MSSAU,
+ &SafetyInfo, Flags, ORE)
+ : sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI,
+ TLI, TTI, L, &MSSAU, &SafetyInfo, Flags, ORE);
+ Flags.setIsSink(false);
if (Preheader)
Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, L,
- CurAST.get(), MSSAU.get(), SE, &SafetyInfo,
- *Flags.get(), ORE, LoopNestMode);
+ &MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode);
// Now that all loop invariants have been removed from the loop, promote any
// memory references to scalars that we can.
@@ -452,7 +433,7 @@ bool LoopInvariantCodeMotion::runOnLoop(
// preheader for SSA updater, so also avoid sinking when no preheader
// is available.
if (!DisablePromotion && Preheader && L->hasDedicatedExits() &&
- !Flags->tooManyMemoryAccesses() && !HasCoroSuspendInst) {
+ !Flags.tooManyMemoryAccesses() && !HasCoroSuspendInst) {
// Figure out the loop exits and their insertion points
SmallVector<BasicBlock *, 8> ExitBlocks;
L->getUniqueExitBlocks(ExitBlocks);
@@ -466,55 +447,29 @@ bool LoopInvariantCodeMotion::runOnLoop(
SmallVector<Instruction *, 8> InsertPts;
SmallVector<MemoryAccess *, 8> MSSAInsertPts;
InsertPts.reserve(ExitBlocks.size());
- if (MSSAU)
- MSSAInsertPts.reserve(ExitBlocks.size());
+ MSSAInsertPts.reserve(ExitBlocks.size());
for (BasicBlock *ExitBlock : ExitBlocks) {
InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
- if (MSSAU)
- MSSAInsertPts.push_back(nullptr);
+ MSSAInsertPts.push_back(nullptr);
}
PredIteratorCache PIC;
+ // Promoting one set of accesses may make the pointers for another set
+ // loop invariant, so run this in a loop (with the MaybePromotable set
+ // decreasing in size over time).
bool Promoted = false;
- if (CurAST.get()) {
- // Loop over all of the alias sets in the tracker object.
- for (AliasSet &AS : *CurAST) {
- // We can promote this alias set if it has a store, if it is a "Must"
- // alias set, if the pointer is loop invariant, and if we are not
- // eliminating any volatile loads or stores.
- if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
- !L->isLoopInvariant(AS.begin()->getValue()))
- continue;
-
- assert(
- !AS.empty() &&
- "Must alias set should have at least one pointer element in it!");
-
- SmallSetVector<Value *, 8> PointerMustAliases;
- for (const auto &ASI : AS)
- PointerMustAliases.insert(ASI.getValue());
-
- Promoted |= promoteLoopAccessesToScalars(
- PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, LI,
- DT, TLI, L, CurAST.get(), MSSAU.get(), &SafetyInfo, ORE);
+ bool LocalPromoted;
+ do {
+ LocalPromoted = false;
+ for (const SmallSetVector<Value *, 8> &PointerMustAliases :
+ collectPromotionCandidates(MSSA, AA, L)) {
+ LocalPromoted |= promoteLoopAccessesToScalars(
+ PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC,
+ LI, DT, TLI, L, &MSSAU, &SafetyInfo, ORE);
}
- } else {
- // Promoting one set of accesses may make the pointers for another set
- // loop invariant, so run this in a loop (with the MaybePromotable set
- // decreasing in size over time).
- bool LocalPromoted;
- do {
- LocalPromoted = false;
- for (const SmallSetVector<Value *, 8> &PointerMustAliases :
- collectPromotionCandidates(MSSA, AA, L)) {
- LocalPromoted |= promoteLoopAccessesToScalars(
- PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC,
- LI, DT, TLI, L, /*AST*/nullptr, MSSAU.get(), &SafetyInfo, ORE);
- }
- Promoted |= LocalPromoted;
- } while (LocalPromoted);
- }
+ Promoted |= LocalPromoted;
+ } while (LocalPromoted);
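The promotion loop above is a run-to-fixed-point pattern: each successful promotion may expose new candidates, so collection and promotion repeat until a whole round makes no local change. A generic sketch of that control flow with hypothetical collectCandidates/tryPromote stand-ins:

  #include <cassert>
  #include <vector>

  // Hypothetical stand-ins for the promotion machinery; only the control flow
  // mirrors the loop above.
  struct Candidate {};
  std::vector<Candidate> collectCandidates() { return {}; }  // stand-in
  bool tryPromote(const Candidate &) { return false; }       // stand-in

  bool promoteToFixedPoint() {
    bool Promoted = false;
    bool LocalPromoted;
    do {
      LocalPromoted = false;
      // Re-collect each round: earlier promotions may have made more pointers
      // loop invariant and therefore created new candidates.
      for (const Candidate &C : collectCandidates())
        LocalPromoted |= tryPromote(C);
      Promoted |= LocalPromoted;
    } while (LocalPromoted);
    return Promoted;
  }

  int main() {
    assert(!promoteToFixedPoint());  // trivial stand-ins promote nothing
    return 0;
  }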
// Once we have promoted values across the loop body we have to
// recursively reform LCSSA as any nested loop may now have values defined
@@ -536,8 +491,8 @@ bool LoopInvariantCodeMotion::runOnLoop(
assert((L->isOutermost() || L->getParentLoop()->isLCSSAForm(*DT)) &&
"Parent loop not left in LCSSA form after LICM!");
- if (MSSAU.get() && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
+ if (VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
if (Changed && SE)
SE->forgetLoopDispositions(L);
@@ -552,17 +507,15 @@ bool LoopInvariantCodeMotion::runOnLoop(
bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
DominatorTree *DT, BlockFrequencyInfo *BFI,
TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
- Loop *CurLoop, AliasSetTracker *CurAST,
- MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo,
+ Loop *CurLoop, MemorySSAUpdater *MSSAU,
+ ICFLoopSafetyInfo *SafetyInfo,
SinkAndHoistLICMFlags &Flags,
- OptimizationRemarkEmitter *ORE) {
+ OptimizationRemarkEmitter *ORE, Loop *OutermostLoop) {
// Verify inputs.
assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
- CurLoop != nullptr && SafetyInfo != nullptr &&
+ CurLoop != nullptr && MSSAU != nullptr && SafetyInfo != nullptr &&
"Unexpected input to sinkRegion.");
- assert(((CurAST != nullptr) ^ (MSSAU != nullptr)) &&
- "Either AliasSetTracker or MemorySSA should be initialized.");
// We want to visit children before parents. We will enque all the parents
// before their children in the worklist and process the worklist in reverse
@@ -587,7 +540,7 @@ bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
salvageKnowledge(&I);
salvageDebugInfo(I);
++II;
- eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
+ eraseInstruction(I, *SafetyInfo, MSSAU);
Changed = true;
continue;
}
@@ -598,26 +551,46 @@ bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
// operands of the instruction are loop invariant.
//
bool FreeInLoop = false;
+ bool LoopNestMode = OutermostLoop != nullptr;
if (!I.mayHaveSideEffects() &&
- isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) &&
- canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags,
- ORE)) {
+ isNotUsedOrFreeInLoop(I, LoopNestMode ? OutermostLoop : CurLoop,
+ SafetyInfo, TTI, FreeInLoop, LoopNestMode) &&
+ canSinkOrHoistInst(I, AA, DT, CurLoop, /*CurAST*/nullptr, MSSAU, true,
+ &Flags, ORE)) {
if (sink(I, LI, DT, BFI, CurLoop, SafetyInfo, MSSAU, ORE)) {
if (!FreeInLoop) {
++II;
salvageDebugInfo(I);
- eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
+ eraseInstruction(I, *SafetyInfo, MSSAU);
}
Changed = true;
}
}
}
}
- if (MSSAU && VerifyMemorySSA)
+ if (VerifyMemorySSA)
MSSAU->getMemorySSA()->verifyMemorySSA();
return Changed;
}
+bool llvm::sinkRegionForLoopNest(
+ DomTreeNode *N, AAResults *AA, LoopInfo *LI, DominatorTree *DT,
+ BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
+ Loop *CurLoop, MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo,
+ SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE) {
+
+ bool Changed = false;
+ SmallPriorityWorklist<Loop *, 4> Worklist;
+ Worklist.insert(CurLoop);
+ appendLoopsToWorklist(*CurLoop, Worklist);
+ while (!Worklist.empty()) {
+ Loop *L = Worklist.pop_back_val();
+ Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI,
+ TTI, L, MSSAU, SafetyInfo, Flags, ORE, CurLoop);
+ }
+ return Changed;
+}
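sinkRegionForLoopNest visits every loop of the nest, not just the outermost one. The real code drives this with a priority worklist; a plain recursion over a stand-in loop tree expresses the same "sink once in each member of the nest" idea:

  #include <vector>

  // Stand-in loop nest: each loop owns its immediate subloops.
  struct Loop {
    std::vector<Loop *> SubLoops;
  };

  bool sinkRegion(Loop *) { return false; }  // stand-in for per-loop sinking

  // Visit the loop and every subloop of the nest exactly once.
  bool sinkLoopNest(Loop *Outermost) {
    bool Changed = sinkRegion(Outermost);
    for (Loop *Sub : Outermost->SubLoops)
      Changed |= sinkLoopNest(Sub);
    return Changed;
  }

  int main() {
    Loop Inner, Outer;
    Outer.SubLoops.push_back(&Inner);
    return sinkLoopNest(&Outer) ? 1 : 0;
  }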
+
namespace {
// This is a helper class for hoistRegion to make it able to hoist control flow
// in order to be able to hoist phis. The way this works is that we initially
@@ -820,9 +793,8 @@ public:
if (HoistTarget == InitialPreheader) {
// Phis in the loop header now need to use the new preheader.
InitialPreheader->replaceSuccessorsPhiUsesWith(HoistCommonSucc);
- if (MSSAU)
- MSSAU->wireOldPredecessorsToNewImmediatePredecessor(
- HoistTarget->getSingleSuccessor(), HoistCommonSucc, {HoistTarget});
+ MSSAU->wireOldPredecessorsToNewImmediatePredecessor(
+ HoistTarget->getSingleSuccessor(), HoistCommonSucc, {HoistTarget});
// The new preheader dominates the loop header.
DomTreeNode *PreheaderNode = DT->getNode(HoistCommonSucc);
DomTreeNode *HeaderNode = DT->getNode(CurLoop->getHeader());
@@ -884,16 +856,14 @@ static bool worthSinkOrHoistInst(Instruction &I, BasicBlock *DstBlock,
bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
DominatorTree *DT, BlockFrequencyInfo *BFI,
TargetLibraryInfo *TLI, Loop *CurLoop,
- AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU,
- ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo,
+ MemorySSAUpdater *MSSAU, ScalarEvolution *SE,
+ ICFLoopSafetyInfo *SafetyInfo,
SinkAndHoistLICMFlags &Flags,
OptimizationRemarkEmitter *ORE, bool LoopNestMode) {
// Verify inputs.
assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
- CurLoop != nullptr && SafetyInfo != nullptr &&
+ CurLoop != nullptr && MSSAU != nullptr && SafetyInfo != nullptr &&
"Unexpected input to hoistRegion.");
- assert(((CurAST != nullptr) ^ (MSSAU != nullptr)) &&
- "Either AliasSetTracker or MemorySSA should be initialized.");
ControlFlowHoister CFH(LI, DT, CurLoop, MSSAU);
@@ -913,8 +883,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
if (!LoopNestMode && inSubLoop(BB, CurLoop, LI))
continue;
- for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
- Instruction &I = *II++;
+ for (Instruction &I : llvm::make_early_inc_range(*BB)) {
// Try constant folding this instruction. If all the operands are
// constants, it is technically hoistable, but it would be better to
// just fold it.
@@ -922,12 +891,10 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
&I, I.getModule()->getDataLayout(), TLI)) {
LLVM_DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C
<< '\n');
- if (CurAST)
- CurAST->copyValue(&I, C);
// FIXME MSSA: Such replacements may make accesses unoptimized (D51960).
I.replaceAllUsesWith(C);
if (isInstructionTriviallyDead(&I, TLI))
- eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
+ eraseInstruction(I, *SafetyInfo, MSSAU);
Changed = true;
continue;
}
@@ -940,8 +907,8 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
// and we have accurately duplicated the control flow from the loop header
// to that block.
if (CurLoop->hasLoopInvariantOperands(&I) &&
- canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags,
- ORE) &&
+ canSinkOrHoistInst(I, AA, DT, CurLoop, /*CurAST*/ nullptr, MSSAU,
+ true, &Flags, ORE) &&
worthSinkOrHoistInst(I, CurLoop->getLoopPreheader(), ORE, BFI) &&
isSafeToExecuteUnconditionally(
I, DT, TLI, CurLoop, SafetyInfo, ORE,
@@ -970,7 +937,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
SafetyInfo->insertInstructionTo(Product, I.getParent());
Product->insertAfter(&I);
I.replaceAllUsesWith(Product);
- eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
+ eraseInstruction(I, *SafetyInfo, MSSAU);
hoist(*ReciprocalDivisor, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB),
SafetyInfo, MSSAU, SE, ORE);
@@ -1049,7 +1016,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
}
}
}
- if (MSSAU && VerifyMemorySSA)
+ if (VerifyMemorySSA)
MSSAU->getMemorySSA()->verifyMemorySSA();
// Now that we've finished hoisting make sure that LI and DT are still
@@ -1101,6 +1068,10 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
return false;
Addr = BC->getOperand(0);
}
+ // If we've ended up at a global/constant, bail. We shouldn't be looking at
+ // uselists for non-local Values in a loop pass.
+ if (isa<Constant>(Addr))
+ return false;
unsigned UsesVisited = 0;
// Traverse all uses of the load operand value, to see if invariant.start is
@@ -1273,7 +1244,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
// writes to this memory in the loop, we can hoist or sink.
if (AAResults::onlyAccessesArgPointees(Behavior)) {
// TODO: expand to writeable arguments
- for (Value *Op : CI->arg_operands())
+ for (Value *Op : CI->args())
if (Op->getType()->isPointerTy()) {
bool Invalidated;
if (CurAST)
@@ -1443,7 +1414,8 @@ static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
/// (e.g., a GEP can be folded into a load as an addressing mode in the loop).
static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
const LoopSafetyInfo *SafetyInfo,
- TargetTransformInfo *TTI, bool &FreeInLoop) {
+ TargetTransformInfo *TTI, bool &FreeInLoop,
+ bool LoopNestMode) {
const auto &BlockColors = SafetyInfo->getBlockColors();
bool IsFree = isFreeInLoop(I, CurLoop, TTI);
for (const User *U : I.users()) {
@@ -1460,6 +1432,15 @@ static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
if (!BlockColors.empty() &&
BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1)
return false;
+
+ if (LoopNestMode) {
+ while (isa<PHINode>(UI) && UI->hasOneUser() &&
+ UI->getNumOperands() == 1) {
+ if (!CurLoop->contains(UI))
+ break;
+ UI = cast<Instruction>(UI->user_back());
+ }
+ }
}
if (CurLoop->contains(UI)) {
@@ -1546,9 +1527,7 @@ static Instruction *cloneInstructionInExitBlock(
}
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
- AliasSetTracker *AST, MemorySSAUpdater *MSSAU) {
- if (AST)
- AST->deleteValue(&I);
+ MemorySSAUpdater *MSSAU) {
if (MSSAU)
MSSAU->removeMemoryAccess(&I);
SafetyInfo.removeInstruction(&I);
@@ -1599,8 +1578,7 @@ static bool canSplitPredecessors(PHINode *PN, LoopSafetyInfo *SafetyInfo) {
// predecessor fairly simple.
if (!SafetyInfo->getBlockColors().empty() && BB->getFirstNonPHI()->isEHPad())
return false;
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
- BasicBlock *BBPred = *PI;
+ for (BasicBlock *BBPred : predecessors(BB)) {
if (isa<IndirectBrInst>(BBPred->getTerminator()) ||
isa<CallBrInst>(BBPred->getTerminator()))
return false;
@@ -1786,7 +1764,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
Instruction *New = sinkThroughTriviallyReplaceablePHI(
PN, &I, LI, SunkCopies, SafetyInfo, CurLoop, MSSAU);
PN->replaceAllUsesWith(New);
- eraseInstruction(*PN, *SafetyInfo, nullptr, nullptr);
+ eraseInstruction(*PN, *SafetyInfo, nullptr);
Changed = true;
}
return Changed;
@@ -1875,11 +1853,10 @@ class LoopPromoter : public LoadAndStorePromoter {
SmallVectorImpl<Instruction *> &LoopInsertPts;
SmallVectorImpl<MemoryAccess *> &MSSAInsertPts;
PredIteratorCache &PredCache;
- AliasSetTracker *AST;
MemorySSAUpdater *MSSAU;
LoopInfo &LI;
DebugLoc DL;
- int Alignment;
+ Align Alignment;
bool UnorderedAtomic;
AAMDNodes AATags;
ICFLoopSafetyInfo &SafetyInfo;
@@ -1907,13 +1884,13 @@ public:
SmallVectorImpl<BasicBlock *> &LEB,
SmallVectorImpl<Instruction *> &LIP,
SmallVectorImpl<MemoryAccess *> &MSSAIP, PredIteratorCache &PIC,
- AliasSetTracker *ast, MemorySSAUpdater *MSSAU, LoopInfo &li,
- DebugLoc dl, int alignment, bool UnorderedAtomic,
- const AAMDNodes &AATags, ICFLoopSafetyInfo &SafetyInfo)
+ MemorySSAUpdater *MSSAU, LoopInfo &li, DebugLoc dl,
+ Align Alignment, bool UnorderedAtomic, const AAMDNodes &AATags,
+ ICFLoopSafetyInfo &SafetyInfo)
: LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
LoopExitBlocks(LEB), LoopInsertPts(LIP), MSSAInsertPts(MSSAIP),
- PredCache(PIC), AST(ast), MSSAU(MSSAU), LI(li), DL(std::move(dl)),
- Alignment(alignment), UnorderedAtomic(UnorderedAtomic), AATags(AATags),
+ PredCache(PIC), MSSAU(MSSAU), LI(li), DL(std::move(dl)),
+ Alignment(Alignment), UnorderedAtomic(UnorderedAtomic), AATags(AATags),
SafetyInfo(SafetyInfo) {}
bool isInstInList(Instruction *I,
@@ -1940,39 +1917,29 @@ public:
StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
if (UnorderedAtomic)
NewSI->setOrdering(AtomicOrdering::Unordered);
- NewSI->setAlignment(Align(Alignment));
+ NewSI->setAlignment(Alignment);
NewSI->setDebugLoc(DL);
if (AATags)
NewSI->setAAMetadata(AATags);
- if (MSSAU) {
- MemoryAccess *MSSAInsertPoint = MSSAInsertPts[i];
- MemoryAccess *NewMemAcc;
- if (!MSSAInsertPoint) {
- NewMemAcc = MSSAU->createMemoryAccessInBB(
- NewSI, nullptr, NewSI->getParent(), MemorySSA::Beginning);
- } else {
- NewMemAcc =
- MSSAU->createMemoryAccessAfter(NewSI, nullptr, MSSAInsertPoint);
- }
- MSSAInsertPts[i] = NewMemAcc;
- MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
- // FIXME: true for safety, false may still be correct.
+ MemoryAccess *MSSAInsertPoint = MSSAInsertPts[i];
+ MemoryAccess *NewMemAcc;
+ if (!MSSAInsertPoint) {
+ NewMemAcc = MSSAU->createMemoryAccessInBB(
+ NewSI, nullptr, NewSI->getParent(), MemorySSA::Beginning);
+ } else {
+ NewMemAcc =
+ MSSAU->createMemoryAccessAfter(NewSI, nullptr, MSSAInsertPoint);
}
+ MSSAInsertPts[i] = NewMemAcc;
+ MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
+ // FIXME: true for safety, false may still be correct.
}
}
- void replaceLoadWithValue(LoadInst *LI, Value *V) const override {
- // Update alias analysis.
- if (AST)
- AST->copyValue(LI, V);
- }
void instructionDeleted(Instruction *I) const override {
SafetyInfo.removeInstruction(I);
- if (AST)
- AST->deleteValue(I);
- if (MSSAU)
- MSSAU->removeMemoryAccess(I);
+ MSSAU->removeMemoryAccess(I);
}
};
@@ -2023,8 +1990,8 @@ bool llvm::promoteLoopAccessesToScalars(
SmallVectorImpl<Instruction *> &InsertPts,
SmallVectorImpl<MemoryAccess *> &MSSAInsertPts, PredIteratorCache &PIC,
LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
- Loop *CurLoop, AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU,
- ICFLoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE) {
+ Loop *CurLoop, MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE) {
// Verify inputs.
assert(LI != nullptr && DT != nullptr && CurLoop != nullptr &&
SafetyInfo != nullptr &&
@@ -2189,9 +2156,9 @@ bool llvm::promoteLoopAccessesToScalars(
// Merge the AA tags.
if (LoopUses.empty()) {
// On the first load/store, just take its AA tags.
- UI->getAAMetadata(AATags);
+ AATags = UI->getAAMetadata();
} else if (AATags) {
- UI->getAAMetadata(AATags, /* Merge = */ true);
+ AATags = AATags.merge(UI->getAAMetadata());
}
LoopUses.push_back(UI);
@@ -2256,9 +2223,8 @@ bool llvm::promoteLoopAccessesToScalars(
SmallVector<PHINode *, 16> NewPHIs;
SSAUpdater SSA(&NewPHIs);
LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
- InsertPts, MSSAInsertPts, PIC, CurAST, MSSAU, *LI, DL,
- Alignment.value(), SawUnorderedAtomic, AATags,
- *SafetyInfo);
+ InsertPts, MSSAInsertPts, PIC, MSSAU, *LI, DL,
+ Alignment, SawUnorderedAtomic, AATags, *SafetyInfo);
// Set up the preheader to have a definition of the value. It is the live-out
// value from the preheader that uses in the loop will use.
@@ -2273,24 +2239,22 @@ bool llvm::promoteLoopAccessesToScalars(
PreheaderLoad->setAAMetadata(AATags);
SSA.AddAvailableValue(Preheader, PreheaderLoad);
- if (MSSAU) {
- MemoryAccess *PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB(
- PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End);
- MemoryUse *NewMemUse = cast<MemoryUse>(PreheaderLoadMemoryAccess);
- MSSAU->insertUse(NewMemUse, /*RenameUses=*/true);
- }
+ MemoryAccess *PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB(
+ PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End);
+ MemoryUse *NewMemUse = cast<MemoryUse>(PreheaderLoadMemoryAccess);
+ MSSAU->insertUse(NewMemUse, /*RenameUses=*/true);
- if (MSSAU && VerifyMemorySSA)
+ if (VerifyMemorySSA)
MSSAU->getMemorySSA()->verifyMemorySSA();
// Rewrite all the loads in the loop and remember all the definitions from
// stores in the loop.
Promoter.run(LoopUses);
- if (MSSAU && VerifyMemorySSA)
+ if (VerifyMemorySSA)
MSSAU->getMemorySSA()->verifyMemorySSA();
// If the SSAUpdater didn't use the load in the preheader, just zap it now.
if (PreheaderLoad->use_empty())
- eraseInstruction(*PreheaderLoad, *SafetyInfo, CurAST, MSSAU);
+ eraseInstruction(*PreheaderLoad, *SafetyInfo, MSSAU);
return true;
}
@@ -2356,71 +2320,10 @@ collectPromotionCandidates(MemorySSA *MSSA, AliasAnalysis *AA, Loop *L) {
return Result;
}
-/// Returns an owning pointer to an alias set which incorporates aliasing info
-/// from L and all subloops of L.
-std::unique_ptr<AliasSetTracker>
-LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI,
- AAResults *AA) {
- auto CurAST = std::make_unique<AliasSetTracker>(*AA);
-
- // Add everything from all the sub loops.
- for (Loop *InnerL : L->getSubLoops())
- for (BasicBlock *BB : InnerL->blocks())
- CurAST->add(*BB);
-
- // And merge in this loop (without anything from inner loops).
- for (BasicBlock *BB : L->blocks())
- if (LI->getLoopFor(BB) == L)
- CurAST->add(*BB);
-
- return CurAST;
-}
-
static bool pointerInvalidatedByLoop(MemoryLocation MemLoc,
AliasSetTracker *CurAST, Loop *CurLoop,
AAResults *AA) {
- // First check to see if any of the basic blocks in CurLoop invalidate *V.
- bool isInvalidatedAccordingToAST = CurAST->getAliasSetFor(MemLoc).isMod();
-
- if (!isInvalidatedAccordingToAST || !LICMN2Theshold)
- return isInvalidatedAccordingToAST;
-
- // Check with a diagnostic analysis if we can refine the information above.
- // This is to identify the limitations of using the AST.
- // The alias set mechanism used by LICM has a major weakness in that it
- // combines all things which may alias into a single set *before* asking
- // modref questions. As a result, a single readonly call within a loop will
- // collapse all loads and stores into a single alias set and report
- // invalidation if the loop contains any store. For example, readonly calls
- // with deopt states have this form and create a general alias set with all
- // loads and stores. In order to get any LICM in loops containing possible
- // deopt states we need a more precise invalidation of checking the mod ref
- // info of each instruction within the loop and LI. This has a complexity of
- // O(N^2), so currently, it is used only as a diagnostic tool since the
- // default value of LICMN2Threshold is zero.
-
- // Don't look at nested loops.
- if (CurLoop->begin() != CurLoop->end())
- return true;
-
- int N = 0;
- for (BasicBlock *BB : CurLoop->getBlocks())
- for (Instruction &I : *BB) {
- if (N >= LICMN2Theshold) {
- LLVM_DEBUG(dbgs() << "Alasing N2 threshold exhausted for "
- << *(MemLoc.Ptr) << "\n");
- return true;
- }
- N++;
- auto Res = AA->getModRefInfo(&I, MemLoc);
- if (isModSet(Res)) {
- LLVM_DEBUG(dbgs() << "Aliasing failed on " << I << " for "
- << *(MemLoc.Ptr) << "\n");
- return true;
- }
- }
- LLVM_DEBUG(dbgs() << "Aliasing okay for " << *(MemLoc.Ptr) << "\n");
- return false;
+ return CurAST->getAliasSetFor(MemLoc).isMod();
}
bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU,
diff --git a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
index 993b154dc9a8..d438d56e38ca 100644
--- a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/LoopBoundSplit.h"
+#include "llvm/ADT/Sequence.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -39,10 +40,12 @@ struct ConditionInfo {
ICmpInst::Predicate Pred;
/// AddRec llvm value
Value *AddRecValue;
+ /// Non-PHI AddRec llvm value
+ Value *NonPHIAddRecValue;
/// Bound llvm value
Value *BoundValue;
/// AddRec SCEV
- const SCEV *AddRecSCEV;
+ const SCEVAddRecExpr *AddRecSCEV;
/// Bound SCEV
const SCEV *BoundSCEV;
@@ -54,19 +57,31 @@ struct ConditionInfo {
} // namespace
static void analyzeICmp(ScalarEvolution &SE, ICmpInst *ICmp,
- ConditionInfo &Cond) {
+ ConditionInfo &Cond, const Loop &L) {
Cond.ICmp = ICmp;
if (match(ICmp, m_ICmp(Cond.Pred, m_Value(Cond.AddRecValue),
m_Value(Cond.BoundValue)))) {
- Cond.AddRecSCEV = SE.getSCEV(Cond.AddRecValue);
- Cond.BoundSCEV = SE.getSCEV(Cond.BoundValue);
+ const SCEV *AddRecSCEV = SE.getSCEV(Cond.AddRecValue);
+ const SCEV *BoundSCEV = SE.getSCEV(Cond.BoundValue);
+ const SCEVAddRecExpr *LHSAddRecSCEV = dyn_cast<SCEVAddRecExpr>(AddRecSCEV);
+ const SCEVAddRecExpr *RHSAddRecSCEV = dyn_cast<SCEVAddRecExpr>(BoundSCEV);
// Locate AddRec in LHSSCEV and Bound in RHSSCEV.
- if (isa<SCEVAddRecExpr>(Cond.BoundSCEV) &&
- !isa<SCEVAddRecExpr>(Cond.AddRecSCEV)) {
+ if (!LHSAddRecSCEV && RHSAddRecSCEV) {
std::swap(Cond.AddRecValue, Cond.BoundValue);
- std::swap(Cond.AddRecSCEV, Cond.BoundSCEV);
+ std::swap(AddRecSCEV, BoundSCEV);
Cond.Pred = ICmpInst::getSwappedPredicate(Cond.Pred);
}
+
+ Cond.AddRecSCEV = dyn_cast<SCEVAddRecExpr>(AddRecSCEV);
+ Cond.BoundSCEV = BoundSCEV;
+ Cond.NonPHIAddRecValue = Cond.AddRecValue;
+
+ // If Cond.AddRecValue is a PHI node, update Cond.NonPHIAddRecValue with the
+ // value coming from the backedge.
+ if (Cond.AddRecSCEV && isa<PHINode>(Cond.AddRecValue)) {
+ PHINode *PN = cast<PHINode>(Cond.AddRecValue);
+ Cond.NonPHIAddRecValue = PN->getIncomingValueForBlock(L.getLoopLatch());
+ }
}
}
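When the AddRec appears on the right-hand side of the compare, the code above swaps the operands and replaces the predicate with its swapped form so later logic only sees the AddRec on the left. A tiny self-contained check that operand swapping plus predicate swapping preserves the comparison's meaning (signed predicates only, as an illustration):

  #include <cassert>

  enum Pred { SLT, SGT, SLE, SGE };

  Pred swappedPredicate(Pred P) {
    switch (P) {
    case SLT: return SGT;
    case SGT: return SLT;
    case SLE: return SGE;
    case SGE: return SLE;
    }
    return P;
  }

  bool eval(Pred P, int L, int R) {
    switch (P) {
    case SLT: return L < R;
    case SGT: return L > R;
    case SLE: return L <= R;
    case SGE: return L >= R;
    }
    return false;
  }

  int main() {
    // (L pred R) is equivalent to (R swappedPred L) for every pair of values.
    for (int L = -2; L <= 2; ++L)
      for (int R = -2; R <= 2; ++R)
        for (Pred P : {SLT, SGT, SLE, SGE})
          assert(eval(P, L, R) == eval(swappedPredicate(P), R, L));
    return 0;
  }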
@@ -118,21 +133,20 @@ static bool calculateUpperBound(const Loop &L, ScalarEvolution &SE,
static bool hasProcessableCondition(const Loop &L, ScalarEvolution &SE,
ICmpInst *ICmp, ConditionInfo &Cond,
bool IsExitCond) {
- analyzeICmp(SE, ICmp, Cond);
+ analyzeICmp(SE, ICmp, Cond, L);
// The BoundSCEV should be evaluated at loop entry.
if (!SE.isAvailableAtLoopEntry(Cond.BoundSCEV, &L))
return false;
- const SCEVAddRecExpr *AddRecSCEV = dyn_cast<SCEVAddRecExpr>(Cond.AddRecSCEV);
// Allowed AddRec as induction variable.
- if (!AddRecSCEV)
+ if (!Cond.AddRecSCEV)
return false;
- if (!AddRecSCEV->isAffine())
+ if (!Cond.AddRecSCEV->isAffine())
return false;
- const SCEV *StepRecSCEV = AddRecSCEV->getStepRecurrence(SE);
+ const SCEV *StepRecSCEV = Cond.AddRecSCEV->getStepRecurrence(SE);
// Allowed constant step.
if (!isa<SCEVConstant>(StepRecSCEV))
return false;
@@ -264,6 +278,14 @@ static BranchInst *findSplitCandidate(const Loop &L, ScalarEvolution &SE,
SplitCandidateCond.BoundSCEV->getType())
continue;
+ // After the transformation, we assume the split condition of the pre-loop is
+ // always true. To guarantee this, check that the start value of the split
+ // condition's AddRec satisfies the split condition.
+ if (!SE.isLoopEntryGuardedByCond(&L, SplitCandidateCond.Pred,
+ SplitCandidateCond.AddRecSCEV->getStart(),
+ SplitCandidateCond.BoundSCEV))
+ continue;
+
SplitCandidateCond.BI = BI;
return BI;
}
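A hypothetical source-level picture of the transform this guard protects: the loop is split at the bound of the in-loop condition, so the pre-loop runs with the split condition always true and the post-loop with it always false. The array, bounds, and min/max clamping below are illustrative only, not the pass's exact legality conditions:

  #include <algorithm>
  #include <cassert>

  void before(int *A, int N, int M) {
    for (int i = 0; i < N; ++i)
      A[i] = (i < M) ? 1 : 2;
  }

  void after(int *A, int N, int M) {
    int Split = std::max(0, std::min(M, N)); // split bound, clamped to [0, N]
    for (int i = 0; i < Split; ++i)          // pre-loop: i < M is always true
      A[i] = 1;
    for (int i = Split; i < N; ++i)          // post-loop: i < M is always false
      A[i] = 2;
  }

  int main() {
    int X[8] = {}, Y[8] = {};
    before(X, 8, 3);
    after(Y, 8, 3);
    for (int i = 0; i < 8; ++i)
      assert(X[i] == Y[i]);
    return 0;
  }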
@@ -341,13 +363,45 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI,
".split", &LI, &DT, PostLoopBlocks);
remapInstructionsInBlocks(PostLoopBlocks, VMap);
- // Add conditional branch to check we can skip post-loop in its preheader.
BasicBlock *PostLoopPreHeader = PostLoop->getLoopPreheader();
- IRBuilder<> Builder(PostLoopPreHeader);
+ IRBuilder<> Builder(&PostLoopPreHeader->front());
+
+ // Update phi nodes in header of post-loop.
+ bool isExitingLatch = (L.getExitingBlock() == L.getLoopLatch());
+ Value *ExitingCondLCSSAPhi = nullptr;
+ for (PHINode &PN : L.getHeader()->phis()) {
+ // Create LCSSA phi node in preheader of post-loop.
+ PHINode *LCSSAPhi =
+ Builder.CreatePHI(PN.getType(), 1, PN.getName() + ".lcssa");
+ LCSSAPhi->setDebugLoc(PN.getDebugLoc());
+ // If the exiting block is the loop latch, the phi does not have the update
+ // from the last iteration; in that case, use the value from the backedge for the LCSSA phi.
+ LCSSAPhi->addIncoming(
+ isExitingLatch ? PN.getIncomingValueForBlock(L.getLoopLatch()) : &PN,
+ L.getExitingBlock());
+
+ // Update the start value of phi node in post-loop with the LCSSA phi node.
+ PHINode *PostLoopPN = cast<PHINode>(VMap[&PN]);
+ PostLoopPN->setIncomingValueForBlock(PostLoopPreHeader, LCSSAPhi);
+
+ // Find the PHI carrying the exiting condition from the pre-loop. Its SCEV
+ // should be a SCEVAddRecExpr, and it should have the same incoming value
+ // from the backedge as ExitingCond.
+ if (!SE.isSCEVable(PN.getType()))
+ continue;
+
+ const SCEVAddRecExpr *PhiSCEV = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(&PN));
+ if (PhiSCEV && ExitingCond.NonPHIAddRecValue ==
+ PN.getIncomingValueForBlock(L.getLoopLatch()))
+ ExitingCondLCSSAPhi = LCSSAPhi;
+ }
+
+ // Add conditional branch to check we can skip post-loop in its preheader.
Instruction *OrigBI = PostLoopPreHeader->getTerminator();
ICmpInst::Predicate Pred = ICmpInst::ICMP_NE;
Value *Cond =
- Builder.CreateICmp(Pred, ExitingCond.AddRecValue, ExitingCond.BoundValue);
+ Builder.CreateICmp(Pred, ExitingCondLCSSAPhi, ExitingCond.BoundValue);
Builder.CreateCondBr(Cond, PostLoop->getHeader(), PostLoop->getExitBlock());
OrigBI->eraseFromParent();
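Roughly, and only as a sketch of the intended shape (not the exact IR the pass builds), the transformed code looks like this: the pre-loop runs to the new bound, the LCSSA phis created above carry the final IV value into the post-loop preheader, and the compare just built skips the post-loop when the pre-loop already reached the original bound.

void splitShape(int *A, int *B, int N, int Split) {
  int Bound = Split < N ? Split : N; // NewBound for the pre-loop
  int I = 0;
  for (; I < Bound; ++I)             // pre-loop: split condition folded to true
    A[I] = 0;
  if (I != N)                        // guard emitted in PostLoopPreHeader
    for (; I < N; ++I)               // post-loop: starts from the LCSSA value
      B[I] = 1;
}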
@@ -368,21 +422,6 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI,
// Replace exiting bound value of pre-loop NewBound.
ExitingCond.ICmp->setOperand(1, NewBoundValue);
- // Replace IV's start value of post-loop by NewBound.
- for (PHINode &PN : L.getHeader()->phis()) {
- // Find PHI with exiting condition from pre-loop.
- if (SE.isSCEVable(PN.getType()) && isa<SCEVAddRecExpr>(SE.getSCEV(&PN))) {
- for (Value *Op : PN.incoming_values()) {
- if (Op == ExitingCond.AddRecValue) {
- // Find cloned PHI for post-loop.
- PHINode *PostLoopPN = cast<PHINode>(VMap[&PN]);
- PostLoopPN->setIncomingValueForBlock(PostLoopPreHeader,
- NewBoundValue);
- }
- }
- }
- }
-
// Replace SplitCandidateCond.BI's condition of pre-loop by True.
LLVMContext &Context = PreHeader->getContext();
SplitCandidateCond.BI->setCondition(ConstantInt::getTrue(Context));
@@ -398,6 +437,30 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI,
else
ExitingCond.BI->setSuccessor(1, PostLoopPreHeader);
+ // Update phi node in exit block of post-loop.
+ Builder.SetInsertPoint(&PostLoopPreHeader->front());
+ for (PHINode &PN : PostLoop->getExitBlock()->phis()) {
+ for (auto i : seq<int>(0, PN.getNumOperands())) {
+ // Check incoming block is pre-loop's exiting block.
+ if (PN.getIncomingBlock(i) == L.getExitingBlock()) {
+ Value *IncomingValue = PN.getIncomingValue(i);
+
+ // Create LCSSA phi node for incoming value.
+ PHINode *LCSSAPhi =
+ Builder.CreatePHI(PN.getType(), 1, PN.getName() + ".lcssa");
+ LCSSAPhi->setDebugLoc(PN.getDebugLoc());
+ LCSSAPhi->addIncoming(IncomingValue, PN.getIncomingBlock(i));
+
+ // Replace pre-loop's exiting block by post-loop's preheader.
+ PN.setIncomingBlock(i, PostLoopPreHeader);
+ // Replace incoming value by LCSSAPhi.
+ PN.setIncomingValue(i, LCSSAPhi);
+ // Add a new incoming value with post-loop's exiting block.
+ PN.addIncoming(VMap[IncomingValue], PostLoop->getExitingBlock());
+ }
+ }
+ }
+
// Update dominator tree.
DT.changeImmediateDominator(PostLoopPreHeader, L.getExitingBlock());
DT.changeImmediateDominator(PostLoop->getExitBlock(), PostLoopPreHeader);
@@ -406,10 +469,7 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI,
SE.forgetLoop(&L);
// Canonicalize loops.
- // TODO: Try to update LCSSA information according to above change.
- formLCSSA(L, DT, &LI, &SE);
simplifyLoop(&L, &DT, &LI, &SE, nullptr, nullptr, true);
- formLCSSA(*PostLoop, DT, &LI, &SE);
simplifyLoop(PostLoop, &DT, &LI, &SE, nullptr, nullptr, true);
// Add new post-loop to loop pass manager.
diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
index a5d7835bd094..77d76609c926 100644
--- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -29,6 +29,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
@@ -127,6 +128,8 @@ public:
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addPreservedID(LoopSimplifyID);
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addPreserved<ScalarEvolutionWrapperPass>();
@@ -143,6 +146,7 @@ INITIALIZE_PASS_BEGIN(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
diff --git a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
index f7e8442fae81..5814e2f043d5 100644
--- a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -36,6 +36,8 @@ using namespace llvm;
#define DEBUG_TYPE "loop-delete"
STATISTIC(NumDeleted, "Number of loops deleted");
+STATISTIC(NumBackedgesBroken,
+ "Number of loops for which we managed to break the backedge");
static cl::opt<bool> EnableSymbolicExecution(
"loop-deletion-enable-symbolic-execution", cl::Hidden, cl::init(true),
@@ -191,6 +193,20 @@ getValueOnFirstIteration(Value *V, DenseMap<Value *, Value *> &FirstIterValue,
Value *RHS =
getValueOnFirstIteration(BO->getOperand(1), FirstIterValue, SQ);
FirstIterV = SimplifyBinOp(BO->getOpcode(), LHS, RHS, SQ);
+ } else if (auto *Cmp = dyn_cast<ICmpInst>(V)) {
+ Value *LHS =
+ getValueOnFirstIteration(Cmp->getOperand(0), FirstIterValue, SQ);
+ Value *RHS =
+ getValueOnFirstIteration(Cmp->getOperand(1), FirstIterValue, SQ);
+ FirstIterV = SimplifyICmpInst(Cmp->getPredicate(), LHS, RHS, SQ);
+ } else if (auto *Select = dyn_cast<SelectInst>(V)) {
+ Value *Cond =
+ getValueOnFirstIteration(Select->getCondition(), FirstIterValue, SQ);
+ if (auto *C = dyn_cast<ConstantInt>(Cond)) {
+ auto *Selected = C->isAllOnesValue() ? Select->getTrueValue()
+ : Select->getFalseValue();
+ FirstIterV = getValueOnFirstIteration(Selected, FirstIterValue, SQ);
+ }
}
if (!FirstIterV)
FirstIterV = V;
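A hedged example of what the added ICmp/Select handling enables (my code, not from the patch): on the first iteration every value below folds to a constant, so canProveExitOnFirstIteration can now prove the exit is taken and the backedge is dead.

int exitsOnFirstIteration(const int *A, int N) {
  int Acc = 0;
  for (int I = 0;; ++I) {
    bool Done = (I == 0);           // ICmpInst: folds to true when I == 0
    Acc += Done ? A[0] : A[I] + N;  // SelectInst: folds to its true operand
    if (Done)                       // branch condition known true on iteration 0
      break;
  }
  return Acc;
}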
@@ -314,22 +330,20 @@ static bool canProveExitOnFirstIteration(Loop *L, DominatorTree &DT,
}
using namespace PatternMatch;
- ICmpInst::Predicate Pred;
- Value *LHS, *RHS;
+ Value *Cond;
BasicBlock *IfTrue, *IfFalse;
auto *Term = BB->getTerminator();
- if (match(Term, m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)),
+ if (match(Term, m_Br(m_Value(Cond),
m_BasicBlock(IfTrue), m_BasicBlock(IfFalse)))) {
- if (!LHS->getType()->isIntegerTy()) {
+ auto *ICmp = dyn_cast<ICmpInst>(Cond);
+ if (!ICmp || !ICmp->getType()->isIntegerTy()) {
MarkAllSuccessorsLive(BB);
continue;
}
// Can we prove constant true or false for this condition?
- LHS = getValueOnFirstIteration(LHS, FirstIterValue, SQ);
- RHS = getValueOnFirstIteration(RHS, FirstIterValue, SQ);
- auto *KnownCondition = SimplifyICmpInst(Pred, LHS, RHS, SQ);
- if (!KnownCondition) {
+ auto *KnownCondition = getValueOnFirstIteration(ICmp, FirstIterValue, SQ);
+ if (KnownCondition == ICmp) {
// Failed to simplify.
MarkAllSuccessorsLive(BB);
continue;
@@ -393,14 +407,25 @@ breakBackedgeIfNotTaken(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
if (!L->getLoopLatch())
return LoopDeletionResult::Unmodified;
- auto *BTC = SE.getBackedgeTakenCount(L);
- if (!isa<SCEVCouldNotCompute>(BTC) && SE.isKnownNonZero(BTC))
- return LoopDeletionResult::Unmodified;
- if (!BTC->isZero() && !canProveExitOnFirstIteration(L, DT, LI))
- return LoopDeletionResult::Unmodified;
+ auto *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
+ if (BTC->isZero()) {
+ // SCEV knows this backedge isn't taken!
+ breakLoopBackedge(L, DT, SE, LI, MSSA);
+ ++NumBackedgesBroken;
+ return LoopDeletionResult::Deleted;
+ }
- breakLoopBackedge(L, DT, SE, LI, MSSA);
- return LoopDeletionResult::Deleted;
+ // If SCEV leaves open the possibility of a zero trip count, see if
+ // symbolically evaluating the first iteration lets us prove the backedge
+ // unreachable.
+ if (isa<SCEVCouldNotCompute>(BTC) || !SE.isKnownNonZero(BTC))
+ if (canProveExitOnFirstIteration(L, DT, LI)) {
+ breakLoopBackedge(L, DT, SE, LI, MSSA);
+ ++NumBackedgesBroken;
+ return LoopDeletionResult::Deleted;
+ }
+
+ return LoopDeletionResult::Unmodified;
}
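As a hedged example of the first, cheaper branch (the loop is mine, not from the patch): SCEV's symbolic maximum backedge-taken count should already be zero here, so the backedge is removed without running the per-instruction first-iteration evaluation.

void neverTakesBackedge(int *A, unsigned N) {
  for (unsigned I = 4; I < (N % 4); ++I) // N % 4 is at most 3, I starts at 4
    A[I] = 0;                            // symbolic max backedge-taken count: 0
}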
/// Remove a loop if it is dead.
diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
index bac3dc0f3fb9..0f4c767c1e4c 100644
--- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -1057,8 +1057,8 @@ PreservedAnalyses LoopDistributePass::run(Function &F,
auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
std::function<const LoopAccessInfo &(Loop &)> GetLAA =
[&](Loop &L) -> const LoopAccessInfo & {
- LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
- TLI, TTI, nullptr, nullptr};
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
+ TLI, TTI, nullptr, nullptr, nullptr};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
};
diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index f54289f85ef5..965d1575518e 100644
--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -27,6 +27,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/LoopFlatten.h"
+
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
@@ -49,11 +51,13 @@
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
-#define DEBUG_TYPE "loop-flatten"
-
using namespace llvm;
using namespace llvm::PatternMatch;
+#define DEBUG_TYPE "loop-flatten"
+
+STATISTIC(NumFlattened, "Number of loops flattened");
+
static cl::opt<unsigned> RepeatedInstructionThreshold(
"loop-flatten-cost-threshold", cl::Hidden, cl::init(2),
cl::desc("Limit on the cost of instructions that can be repeated due to "
@@ -90,9 +94,33 @@ struct FlattenInfo {
// Whether this holds the flatten info before or after widening.
bool Widened = false;
+ // Holds the old/narrow induction phis, i.e. the Phis before IV widening has
+ // been applied. This bookkeeping is used so we can skip some checks on these
+ // phi nodes.
+ PHINode *NarrowInnerInductionPHI = nullptr;
+ PHINode *NarrowOuterInductionPHI = nullptr;
+
FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL) {};
+
+ bool isNarrowInductionPhi(PHINode *Phi) {
+ // This can't be the narrow phi if we haven't widened the IV first.
+ if (!Widened)
+ return false;
+ return NarrowInnerInductionPHI == Phi || NarrowOuterInductionPHI == Phi;
+ }
};
+static bool
+setLoopComponents(Value *&TC, Value *&TripCount, BinaryOperator *&Increment,
+ SmallPtrSetImpl<Instruction *> &IterationInstructions) {
+ TripCount = TC;
+ IterationInstructions.insert(Increment);
+ LLVM_DEBUG(dbgs() << "Found Increment: "; Increment->dump());
+ LLVM_DEBUG(dbgs() << "Found trip count: "; TripCount->dump());
+ LLVM_DEBUG(dbgs() << "Successfully found all loop components\n");
+ return true;
+}
+
// Finds the induction variable, increment and trip count for a simple loop that
// we can flatten.
static bool findLoopComponents(
@@ -164,36 +192,68 @@ static bool findLoopComponents(
return false;
}
// The trip count is the RHS of the compare. If this doesn't match the trip
- // count computed by SCEV then this is either because the trip count variable
- // has been widened (then leave the trip count as it is), or because it is a
- // constant and another transformation has changed the compare, e.g.
- // icmp ult %inc, tripcount -> icmp ult %j, tripcount-1, then we don't flatten
- // the loop (yet).
- TripCount = Compare->getOperand(1);
+ // count computed by SCEV then this is because the trip count variable
+ // has been widened so the types don't match, or because it is a constant and
+ // another transformation has changed the compare (e.g. icmp ult %inc,
+ // tripcount -> icmp ult %j, tripcount-1), or both.
+ Value *RHS = Compare->getOperand(1);
+ const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
+ LLVM_DEBUG(dbgs() << "Backedge-taken count is not predictable\n");
+ return false;
+ }
+ // The use of the Extend=false flag on getTripCountFromExitCount was added
+ // during a refactoring to preserve existing behavior. However, there's
+ // nothing obvious in the surrounding code that handles the overflow case.
+ // FIXME: audit code to establish whether there's a latent bug here.
const SCEV *SCEVTripCount =
- SE->getTripCountFromExitCount(SE->getBackedgeTakenCount(L));
- if (SE->getSCEV(TripCount) != SCEVTripCount) {
- if (!IsWidened) {
- LLVM_DEBUG(dbgs() << "Could not find valid trip count\n");
- return false;
- }
- auto TripCountInst = dyn_cast<Instruction>(TripCount);
- if (!TripCountInst) {
- LLVM_DEBUG(dbgs() << "Could not find valid extended trip count\n");
- return false;
+ SE->getTripCountFromExitCount(BackedgeTakenCount, false);
+ const SCEV *SCEVRHS = SE->getSCEV(RHS);
+ if (SCEVRHS == SCEVTripCount)
+ return setLoopComponents(RHS, TripCount, Increment, IterationInstructions);
+ ConstantInt *ConstantRHS = dyn_cast<ConstantInt>(RHS);
+ if (ConstantRHS) {
+ const SCEV *BackedgeTCExt = nullptr;
+ if (IsWidened) {
+ const SCEV *SCEVTripCountExt;
+ // Find the extended backedge taken count and extended trip count using
+ // SCEV. One of these should now match the RHS of the compare.
+ BackedgeTCExt = SE->getZeroExtendExpr(BackedgeTakenCount, RHS->getType());
+ SCEVTripCountExt = SE->getTripCountFromExitCount(BackedgeTCExt, false);
+ if (SCEVRHS != BackedgeTCExt && SCEVRHS != SCEVTripCountExt) {
+ LLVM_DEBUG(dbgs() << "Could not find valid trip count\n");
+ return false;
+ }
}
- if ((!isa<ZExtInst>(TripCountInst) && !isa<SExtInst>(TripCountInst)) ||
- SE->getSCEV(TripCountInst->getOperand(0)) != SCEVTripCount) {
- LLVM_DEBUG(dbgs() << "Could not find valid extended trip count\n");
- return false;
+ // If the RHS of the compare is equal to the backedge taken count we need
+ // to add one to get the trip count.
+ if (SCEVRHS == BackedgeTCExt || SCEVRHS == BackedgeTakenCount) {
+ ConstantInt *One = ConstantInt::get(ConstantRHS->getType(), 1);
+ Value *NewRHS = ConstantInt::get(
+ ConstantRHS->getContext(), ConstantRHS->getValue() + One->getValue());
+ return setLoopComponents(NewRHS, TripCount, Increment,
+ IterationInstructions);
}
+ return setLoopComponents(RHS, TripCount, Increment, IterationInstructions);
}
- IterationInstructions.insert(Increment);
- LLVM_DEBUG(dbgs() << "Found increment: "; Increment->dump());
- LLVM_DEBUG(dbgs() << "Found trip count: "; TripCount->dump());
-
- LLVM_DEBUG(dbgs() << "Successfully found all loop components\n");
- return true;
+ // If the RHS isn't a constant, then check that the reason it doesn't match
+ // the SCEV trip count is that the RHS is a ZExt or SExt instruction
+ // (and take the trip count to be the RHS).
+ if (!IsWidened) {
+ LLVM_DEBUG(dbgs() << "Could not find valid trip count\n");
+ return false;
+ }
+ auto *TripCountInst = dyn_cast<Instruction>(RHS);
+ if (!TripCountInst) {
+ LLVM_DEBUG(dbgs() << "Could not find valid trip count\n");
+ return false;
+ }
+ if ((!isa<ZExtInst>(TripCountInst) && !isa<SExtInst>(TripCountInst)) ||
+ SE->getSCEV(TripCountInst->getOperand(0)) != SCEVTripCount) {
+ LLVM_DEBUG(dbgs() << "Could not find valid extended trip count\n");
+ return false;
+ }
+ return setLoopComponents(RHS, TripCount, Increment, IterationInstructions);
}
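A hedged worked example of the constant-RHS path (numbers only, not from the patch): a 10-iteration inner loop has BackedgeTakenCount = 9, so if another pass rewrote the latch compare to test against 9, the RHS matches the backedge-taken count and the trip count handed to setLoopComponents has to be rebuilt as RHS + 1.

constexpr unsigned BackedgeTakenCount = 9; // inner loop runs 10 times
constexpr unsigned CompareRHS = 9;         // compare rewritten to use the BTC
constexpr unsigned TripCount = CompareRHS + 1;
static_assert(TripCount == BackedgeTakenCount + 1, "trip count = BTC + 1");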
static bool checkPHIs(FlattenInfo &FI, const TargetTransformInfo *TTI) {
@@ -221,6 +281,8 @@ static bool checkPHIs(FlattenInfo &FI, const TargetTransformInfo *TTI) {
// them specially when doing the transformation.
if (&InnerPHI == FI.InnerInductionPHI)
continue;
+ if (FI.isNarrowInductionPhi(&InnerPHI))
+ continue;
// Each inner loop PHI node must have two incoming values/blocks - one
// from the pre-header, and one from the latch.
@@ -266,6 +328,8 @@ static bool checkPHIs(FlattenInfo &FI, const TargetTransformInfo *TTI) {
}
for (PHINode &OuterPHI : FI.OuterLoop->getHeader()->phis()) {
+ if (FI.isNarrowInductionPhi(&OuterPHI))
+ continue;
if (!SafeOuterPHIs.count(&OuterPHI)) {
LLVM_DEBUG(dbgs() << "found unsafe PHI in outer loop: "; OuterPHI.dump());
return false;
@@ -356,18 +420,25 @@ static bool checkIVUsers(FlattenInfo &FI) {
if (U == FI.InnerIncrement)
continue;
- // After widening the IVs, a trunc instruction might have been introduced, so
- // look through truncs.
+ // After widening the IVs, a trunc instruction might have been introduced,
+ // so look through truncs.
if (isa<TruncInst>(U)) {
if (!U->hasOneUse())
return false;
U = *U->user_begin();
}
+ // If the use is in the compare (which is also the condition of the inner
+ // branch) then the compare has been altered by another transformation, e.g.
+ // icmp ult %inc, tripcount -> icmp ult %j, tripcount-1, where tripcount is
+ // a constant. Ignore this use as the compare gets removed later anyway.
+ if (U == FI.InnerBranch->getCondition())
+ continue;
+
LLVM_DEBUG(dbgs() << "Found use of inner induction variable: "; U->dump());
- Value *MatchedMul;
- Value *MatchedItCount;
+ Value *MatchedMul = nullptr;
+ Value *MatchedItCount = nullptr;
bool IsAdd = match(U, m_c_Add(m_Specific(FI.InnerInductionPHI),
m_Value(MatchedMul))) &&
match(MatchedMul, m_c_Mul(m_Specific(FI.OuterInductionPHI),
@@ -375,11 +446,23 @@ static bool checkIVUsers(FlattenInfo &FI) {
// Matches the same pattern as above, except it also looks for truncs
// on the phi, which can be the result of widening the induction variables.
- bool IsAddTrunc = match(U, m_c_Add(m_Trunc(m_Specific(FI.InnerInductionPHI)),
- m_Value(MatchedMul))) &&
- match(MatchedMul,
- m_c_Mul(m_Trunc(m_Specific(FI.OuterInductionPHI)),
- m_Value(MatchedItCount)));
+ bool IsAddTrunc =
+ match(U, m_c_Add(m_Trunc(m_Specific(FI.InnerInductionPHI)),
+ m_Value(MatchedMul))) &&
+ match(MatchedMul, m_c_Mul(m_Trunc(m_Specific(FI.OuterInductionPHI)),
+ m_Value(MatchedItCount)));
+
+ if (!MatchedItCount)
+ return false;
+ // Look through extends if the IV has been widened.
+ if (FI.Widened &&
+ (isa<SExtInst>(MatchedItCount) || isa<ZExtInst>(MatchedItCount))) {
+ assert(MatchedItCount->getType() == FI.InnerInductionPHI->getType() &&
+ "Unexpected type mismatch in types after widening");
+ MatchedItCount = isa<SExtInst>(MatchedItCount)
+ ? dyn_cast<SExtInst>(MatchedItCount)->getOperand(0)
+ : dyn_cast<ZExtInst>(MatchedItCount)->getOperand(0);
+ }
if ((IsAdd || IsAddTrunc) && MatchedItCount == InnerTripCount) {
LLVM_DEBUG(dbgs() << "Use is optimisable\n");
@@ -451,17 +534,27 @@ static OverflowResult checkOverflow(FlattenInfo &FI, DominatorTree *DT,
for (Value *V : FI.LinearIVUses) {
for (Value *U : V->users()) {
if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
- // The IV is used as the operand of a GEP, and the IV is at least as
- // wide as the address space of the GEP. In this case, the GEP would
- // wrap around the address space before the IV increment wraps, which
- // would be UB.
- if (GEP->isInBounds() &&
- V->getType()->getIntegerBitWidth() >=
- DL.getPointerTypeSizeInBits(GEP->getType())) {
- LLVM_DEBUG(
- dbgs() << "use of linear IV would be UB if overflow occurred: ";
- GEP->dump());
- return OverflowResult::NeverOverflows;
+ for (Value *GEPUser : U->users()) {
+ Instruction *GEPUserInst = dyn_cast<Instruction>(GEPUser);
+ if (!isa<LoadInst>(GEPUserInst) &&
+ !(isa<StoreInst>(GEPUserInst) &&
+ GEP == GEPUserInst->getOperand(1)))
+ continue;
+ if (!isGuaranteedToExecuteForEveryIteration(GEPUserInst,
+ FI.InnerLoop))
+ continue;
+ // The IV is used as the operand of a GEP which dominates the loop
+ // latch, and the IV is at least as wide as the address space of the
+ // GEP. In this case, the GEP would wrap around the address space
+ // before the IV increment wraps, which would be UB.
+ if (GEP->isInBounds() &&
+ V->getType()->getIntegerBitWidth() >=
+ DL.getPointerTypeSizeInBits(GEP->getType())) {
+ LLVM_DEBUG(
+ dbgs() << "use of linear IV would be UB if overflow occurred: ";
+ GEP->dump());
+ return OverflowResult::NeverOverflows;
+ }
}
}
}
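A hedged illustration of the tightened reasoning (my code, not from the patch): only GEP users that are loads, or stores through the GEP, and that are guaranteed to execute on every iteration may justify the no-overflow conclusion; a conditional access no longer does.

void writeRows(char *P, int Rows, int Cols, bool Flag) {
  for (int I = 0; I < Rows; ++I)
    for (int J = 0; J < Cols; ++J) {
      P[I * Cols + J] = 1;   // unconditional store: counts for the check
      if (Flag)
        P[I * Cols + J] = 2; // conditional store: ignored by the new check
    }
}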
@@ -518,7 +611,7 @@ static bool CanFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
ScalarEvolution *SE, AssumptionCache *AC,
- const TargetTransformInfo *TTI) {
+ const TargetTransformInfo *TTI, LPMUpdater *U) {
Function *F = FI.OuterLoop->getHeader()->getParent();
LLVM_DEBUG(dbgs() << "Checks all passed, doing the transformation\n");
{
@@ -574,7 +667,13 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
// deleted, and any information that we have about the outer loop invalidated.
SE->forgetLoop(FI.OuterLoop);
SE->forgetLoop(FI.InnerLoop);
+ if (U)
+ U->markLoopAsDeleted(*FI.InnerLoop, FI.InnerLoop->getName());
LI->erase(FI.InnerLoop);
+
+ // Increment statistic value.
+ NumFlattened++;
+
return true;
}
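A minimal, hedged sketch of the ordering this change relies on (markAndErase is a hypothetical helper, not part of the patch): under the new pass manager the LPMUpdater must learn about the deleted loop before LoopInfo forgets it, while the legacy pass simply passes a null updater.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
static void markAndErase(llvm::Loop &L, llvm::LoopInfo &LI, llvm::LPMUpdater *U) {
  if (U)
    U->markLoopAsDeleted(L, L.getName()); // tell the loop pass manager first
  LI.erase(&L);                           // then drop the loop from LoopInfo
}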
@@ -605,14 +704,11 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
}
SCEVExpander Rewriter(*SE, DL, "loopflatten");
- SmallVector<WideIVInfo, 2> WideIVs;
SmallVector<WeakTrackingVH, 4> DeadInsts;
- WideIVs.push_back( {FI.InnerInductionPHI, MaxLegalType, false });
- WideIVs.push_back( {FI.OuterInductionPHI, MaxLegalType, false });
unsigned ElimExt = 0;
unsigned Widened = 0;
- for (const auto &WideIV : WideIVs) {
+ auto CreateWideIV = [&] (WideIVInfo WideIV, bool &Deleted) -> bool {
PHINode *WidePhi = createWideIV(WideIV, LI, SE, Rewriter, DT, DeadInsts,
ElimExt, Widened, true /* HasGuards */,
true /* UsePostIncrementRanges */);
@@ -620,17 +716,35 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
return false;
LLVM_DEBUG(dbgs() << "Created wide phi: "; WidePhi->dump());
LLVM_DEBUG(dbgs() << "Deleting old phi: "; WideIV.NarrowIV->dump());
- RecursivelyDeleteDeadPHINode(WideIV.NarrowIV);
- }
- // After widening, rediscover all the loop components.
+ Deleted = RecursivelyDeleteDeadPHINode(WideIV.NarrowIV);
+ return true;
+ };
+
+ bool Deleted;
+ if (!CreateWideIV({FI.InnerInductionPHI, MaxLegalType, false }, Deleted))
+ return false;
+ // Add the narrow phi to the list, so that it will be adjusted later when
+ // the transformation is performed.
+ if (!Deleted)
+ FI.InnerPHIsToTransform.insert(FI.InnerInductionPHI);
+
+ if (!CreateWideIV({FI.OuterInductionPHI, MaxLegalType, false }, Deleted))
+ return false;
+
assert(Widened && "Widened IV expected");
FI.Widened = true;
+
+ // Save the old/narrow induction phis, which we need to ignore in CheckPHIs.
+ FI.NarrowInnerInductionPHI = FI.InnerInductionPHI;
+ FI.NarrowOuterInductionPHI = FI.OuterInductionPHI;
+
+ // After widening, rediscover all the loop components.
return CanFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
}
static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
ScalarEvolution *SE, AssumptionCache *AC,
- const TargetTransformInfo *TTI) {
+ const TargetTransformInfo *TTI, LPMUpdater *U) {
LLVM_DEBUG(
dbgs() << "Loop flattening running on outer loop "
<< FI.OuterLoop->getHeader()->getName() << " and inner loop "
@@ -641,12 +755,30 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
return false;
// Check if we can widen the induction variables to avoid overflow checks.
- if (CanWidenIV(FI, DT, LI, SE, AC, TTI))
- return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
-
- // Check if the new iteration variable might overflow. In this case, we
- // need to version the loop, and select the original version at runtime if
- // the iteration space is too large.
+ bool CanFlatten = CanWidenIV(FI, DT, LI, SE, AC, TTI);
+
+ // It can happen that, after widening the IV, flattening no longer takes
+ // place, e.g. because it is deemed unprofitable. So bail out here if that
+ // is the case.
+ // TODO: IV widening without performing the actual flattening transformation
+ // is not ideal. While this codegen change should not matter much, it is an
+ // unnecessary change which is better to avoid. It's unlikely this happens
+ // often, because if it's unprofitable after widening, it should be
+ // unprofitable before widening as checked in the first round of checks. But
+ // 'RepeatedInstructionThreshold' is set to only 2, which can probably be
+ // relaxed. Because this is making a code change (the IV widening, but not
+ // the flattening), we return true here.
+ if (FI.Widened && !CanFlatten)
+ return true;
+
+ // If we have widened and can perform the transformation, do that here.
+ if (CanFlatten)
+ return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U);
+
+ // Otherwise, if we haven't widened the IV, check if the new iteration
+ // variable might overflow. In this case, we need to version the loop, and
+ // select the original version at runtime if the iteration space is too
+ // large.
// TODO: We currently don't version the loop.
OverflowResult OR = checkOverflow(FI, DT, AC);
if (OR == OverflowResult::AlwaysOverflowsHigh ||
@@ -659,18 +791,18 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
}
LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n");
- return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
+ return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U);
}
bool Flatten(LoopNest &LN, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE,
- AssumptionCache *AC, TargetTransformInfo *TTI) {
+ AssumptionCache *AC, TargetTransformInfo *TTI, LPMUpdater *U) {
bool Changed = false;
for (Loop *InnerLoop : LN.getLoops()) {
auto *OuterLoop = InnerLoop->getParentLoop();
if (!OuterLoop)
continue;
FlattenInfo FI(OuterLoop, InnerLoop);
- Changed |= FlattenLoopPair(FI, DT, LI, SE, AC, TTI);
+ Changed |= FlattenLoopPair(FI, DT, LI, SE, AC, TTI, U);
}
return Changed;
}
@@ -685,12 +817,12 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM,
// in simplified form, and also needs LCSSA. Running
// this pass will simplify all loops that contain inner loops,
// regardless of whether anything ends up being flattened.
- Changed |= Flatten(LN, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI);
+ Changed |= Flatten(LN, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, &U);
if (!Changed)
return PreservedAnalyses::all();
- return PreservedAnalyses::none();
+ return getLoopPassPreservedAnalyses();
}
namespace {
@@ -735,7 +867,7 @@ bool LoopFlattenLegacyPass::runOnFunction(Function &F) {
bool Changed = false;
for (Loop *L : *LI) {
auto LN = LoopNest::getLoopNest(*L, *SE);
- Changed |= Flatten(*LN, DT, LI, SE, AC, TTI);
+ Changed |= Flatten(*LN, DT, LI, SE, AC, TTI, nullptr);
}
return Changed;
}
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 3d60e205b002..42da86a9ecf5 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -217,15 +217,15 @@ private:
bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
- bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
+ bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
MaybeAlign StoreAlignment, Value *StoredVal,
Instruction *TheStore,
SmallPtrSetImpl<Instruction *> &Stores,
const SCEVAddRecExpr *Ev, const SCEV *BECount,
- bool NegStride, bool IsLoopMemset = false);
+ bool IsNegStride, bool IsLoopMemset = false);
bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
- unsigned StoreSize, MaybeAlign StoreAlign,
+ const SCEV *StoreSize, MaybeAlign StoreAlign,
MaybeAlign LoadAlign, Instruction *TheStore,
Instruction *TheLoad,
const SCEVAddRecExpr *StoreEv,
@@ -625,8 +625,8 @@ bool LoopIdiomRecognize::runOnLoopBlock(
// We can only promote stores in this block if they are unconditionally
// executed in the loop. For a block to be unconditionally executed, it has
// to dominate all the exit blocks of the loop. Verify this now.
- for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
- if (!DT->dominates(BB, ExitBlocks[i]))
+ for (BasicBlock *ExitBlock : ExitBlocks)
+ if (!DT->dominates(BB, ExitBlock))
return false;
bool MadeChange = false;
@@ -750,16 +750,13 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
bool Changed = false;
// For stores that start but don't end a link in the chain:
- for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
- it != e; ++it) {
- if (Tails.count(*it))
+ for (StoreInst *I : Heads) {
+ if (Tails.count(I))
continue;
// We found a store instr that starts a chain. Now follow the chain and try
// to transform it.
SmallPtrSet<Instruction *, 8> AdjacentStores;
- StoreInst *I = *it;
-
StoreInst *HeadStore = I;
unsigned StoreSize = 0;
@@ -784,12 +781,14 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
if (StoreSize != Stride && StoreSize != -Stride)
continue;
- bool NegStride = StoreSize == -Stride;
+ bool IsNegStride = StoreSize == -Stride;
- if (processLoopStridedStore(StorePtr, StoreSize,
+ Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
+ const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
+ if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
MaybeAlign(HeadStore->getAlignment()),
StoredVal, HeadStore, AdjacentStores, StoreEv,
- BECount, NegStride)) {
+ BECount, IsNegStride)) {
TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
Changed = true;
}
@@ -857,15 +856,15 @@ bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
// Check if the stride matches the size of the memcpy. If so, then we know
// that every byte is touched in the loop.
- const SCEVConstant *StoreStride =
+ const SCEVConstant *ConstStoreStride =
dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
- const SCEVConstant *LoadStride =
+ const SCEVConstant *ConstLoadStride =
dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
- if (!StoreStride || !LoadStride)
+ if (!ConstStoreStride || !ConstLoadStride)
return false;
- APInt StoreStrideValue = StoreStride->getAPInt();
- APInt LoadStrideValue = LoadStride->getAPInt();
+ APInt StoreStrideValue = ConstStoreStride->getAPInt();
+ APInt LoadStrideValue = ConstLoadStride->getAPInt();
// Huge stride value - give up
if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
return false;
@@ -875,7 +874,7 @@ bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
<< ore::NV("Inst", "memcpy") << " in "
<< ore::NV("Function", MCI->getFunction())
- << " function will not be hoised: "
+ << " function will not be hoisted: "
<< ore::NV("Reason", "memcpy size is not equal to stride");
});
return false;
@@ -887,16 +886,17 @@ bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
if (StoreStrideInt != LoadStrideInt)
return false;
- return processLoopStoreOfLoopLoad(Dest, Source, (unsigned)SizeInBytes,
- MCI->getDestAlign(), MCI->getSourceAlign(),
- MCI, MCI, StoreEv, LoadEv, BECount);
+ return processLoopStoreOfLoopLoad(
+ Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
+ MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv,
+ BECount);
}
/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
const SCEV *BECount) {
- // We can only handle non-volatile memsets with a constant size.
- if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
+ // We can only handle non-volatile memsets.
+ if (MSI->isVolatile())
return false;
// If we're not allowed to hack on memset, we fail.
@@ -909,23 +909,72 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
// loop, which indicates a strided store. If we have something else, it's a
// random store we can't handle.
const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
- if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
+ if (!Ev || Ev->getLoop() != CurLoop)
return false;
-
- // Reject memsets that are so large that they overflow an unsigned.
- uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
- if ((SizeInBytes >> 32) != 0)
+ if (!Ev->isAffine()) {
+ LLVM_DEBUG(dbgs() << " Pointer is not affine, abort\n");
return false;
+ }
- // Check to see if the stride matches the size of the memset. If so, then we
- // know that every byte is touched in the loop.
- const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
- if (!ConstStride)
+ const SCEV *PointerStrideSCEV = Ev->getOperand(1);
+ const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
+ if (!PointerStrideSCEV || !MemsetSizeSCEV)
return false;
- APInt Stride = ConstStride->getAPInt();
- if (SizeInBytes != Stride && SizeInBytes != -Stride)
- return false;
+ bool IsNegStride = false;
+ const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());
+
+ if (IsConstantSize) {
+ // Memset size is constant.
+ // Check if the pointer stride matches the memset size. If so, then
+ // we know that every byte is touched in the loop.
+ LLVM_DEBUG(dbgs() << " memset size is constant\n");
+ uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+ const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
+ if (!ConstStride)
+ return false;
+
+ APInt Stride = ConstStride->getAPInt();
+ if (SizeInBytes != Stride && SizeInBytes != -Stride)
+ return false;
+
+ IsNegStride = SizeInBytes == -Stride;
+ } else {
+ // Memset size is non-constant.
+ // Check if the pointer stride matches the memset size.
+ // To be conservative, the pass does not promote pointers that aren't in
+ // address space zero. Also, the pass only handles memset lengths and strides
+ // that are invariant for the top-level loop.
+ LLVM_DEBUG(dbgs() << " memset size is non-constant\n");
+ if (Pointer->getType()->getPointerAddressSpace() != 0) {
+ LLVM_DEBUG(dbgs() << " pointer is not in address space zero, "
+ << "abort\n");
+ return false;
+ }
+ if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
+ LLVM_DEBUG(dbgs() << " memset size is not a loop-invariant, "
+ << "abort\n");
+ return false;
+ }
+
+ // Compare the positive-direction PointerStrideSCEV with MemsetSizeSCEV.
+ IsNegStride = PointerStrideSCEV->isNonConstantNegative();
+ const SCEV *PositiveStrideSCEV =
+ IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
+ : PointerStrideSCEV;
+ LLVM_DEBUG(dbgs() << " MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
+ << " PositiveStrideSCEV: " << *PositiveStrideSCEV
+ << "\n");
+
+ if (PositiveStrideSCEV != MemsetSizeSCEV) {
+ // TODO: folding can be applied to the SCEVs: fold expressions that are
+ // covered by the loop guard at loop entry, then compare again and proceed
+ // with the optimization if they are equal.
+ LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n");
+ return false;
+ }
+ }
// Verify that the memset value is loop invariant. If not, we can't promote
// the memset.
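A hedged example of the newly handled case (my code, not from the patch): the memset length is not a constant, but it is loop-invariant and equal to the pointer stride, so every byte of the region is written and the whole loop can still become a single memset.

#include <cstring>
void zeroMatrix(char *P, long N, long M) {
  for (long I = 0; I < N; ++I)
    std::memset(P + I * M, 0, M); // stride == length == M, a runtime value
}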
@@ -935,10 +984,10 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
SmallPtrSet<Instruction *, 1> MSIs;
MSIs.insert(MSI);
- bool NegStride = SizeInBytes == -Stride;
- return processLoopStridedStore(
- Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()),
- SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true);
+ return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
+ MaybeAlign(MSI->getDestAlignment()),
+ SplatValue, MSI, MSIs, Ev, BECount,
+ IsNegStride, /*IsLoopMemset=*/true);
}
/// mayLoopAccessLocation - Return true if the specified loop might access the
@@ -946,9 +995,9 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
/// argument specifies what the verboten forms of access are (read or write).
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
- const SCEV *BECount, unsigned StoreSize,
+ const SCEV *BECount, const SCEV *StoreSizeSCEV,
AliasAnalysis &AA,
- SmallPtrSetImpl<Instruction *> &IgnoredStores) {
+ SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
// Get the location that may be stored across the loop. Since the access is
// strided positively through memory, we say that the modified location starts
// at the pointer and has infinite size.
@@ -956,9 +1005,11 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
// If the loop iterates a fixed number of times, we can refine the access size
// to be exactly the size of the memset, which is (BECount+1)*StoreSize
- if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
+ const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
+ const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
+ if (BECst && ConstSize)
AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
- StoreSize);
+ ConstSize->getValue()->getZExtValue());
// TODO: For this to be really effective, we have to dive into the pointer
// operand in the store. Store to &A[i] of 100 will always return may alias
@@ -966,14 +1017,12 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
// which will then no-alias a store to &A[100].
MemoryLocation StoreLoc(Ptr, AccessSize);
- for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
- ++BI)
- for (Instruction &I : **BI)
- if (IgnoredStores.count(&I) == 0 &&
+ for (BasicBlock *B : L->blocks())
+ for (Instruction &I : *B)
+ if (!IgnoredInsts.contains(&I) &&
isModOrRefSet(
intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
return true;
-
return false;
}
@@ -981,57 +1030,67 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
// we're trying to memset. Therefore, we need to recompute the base pointer,
// which is just Start - BECount*Size.
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
- Type *IntPtr, unsigned StoreSize,
+ Type *IntPtr, const SCEV *StoreSizeSCEV,
ScalarEvolution *SE) {
const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
- if (StoreSize != 1)
- Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
+ if (!StoreSizeSCEV->isOne()) {
+ // index = back edge count * store size
+ Index = SE->getMulExpr(Index,
+ SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
SCEV::FlagNUW);
+ }
+ // base pointer = start - index (the index already includes the store size)
return SE->getMinusSCEV(Start, Index);
}
-/// Compute the number of bytes as a SCEV from the backedge taken count.
-///
-/// This also maps the SCEV into the provided type and tries to handle the
-/// computation in a way that will fold cleanly.
-static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
- unsigned StoreSize, Loop *CurLoop,
- const DataLayout *DL, ScalarEvolution *SE) {
- const SCEV *NumBytesS;
- // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
+/// Compute trip count from the backedge taken count.
+static const SCEV *getTripCount(const SCEV *BECount, Type *IntPtr,
+ Loop *CurLoop, const DataLayout *DL,
+ ScalarEvolution *SE) {
+ const SCEV *TripCountS = nullptr;
+ // The trip count is (BECount + 1). Expand it out to
// pointer size if it isn't already.
//
// If we're going to need to zero extend the BE count, check if we can add
// one to it prior to zero extending without overflow. Provided this is safe,
// it allows better simplification of the +1.
- if (DL->getTypeSizeInBits(BECount->getType()).getFixedSize() <
- DL->getTypeSizeInBits(IntPtr).getFixedSize() &&
+ if (DL->getTypeSizeInBits(BECount->getType()) <
+ DL->getTypeSizeInBits(IntPtr) &&
SE->isLoopEntryGuardedByCond(
CurLoop, ICmpInst::ICMP_NE, BECount,
SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
- NumBytesS = SE->getZeroExtendExpr(
+ TripCountS = SE->getZeroExtendExpr(
SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
IntPtr);
} else {
- NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
- SE->getOne(IntPtr), SCEV::FlagNUW);
+ TripCountS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
+ SE->getOne(IntPtr), SCEV::FlagNUW);
}
- // And scale it based on the store size.
- if (StoreSize != 1) {
- NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
- SCEV::FlagNUW);
- }
- return NumBytesS;
+ return TripCountS;
+}
+
+/// Compute the number of bytes as a SCEV from the backedge taken count.
+///
+/// This also maps the SCEV into the provided type and tries to handle the
+/// computation in a way that will fold cleanly.
+static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
+ const SCEV *StoreSizeSCEV, Loop *CurLoop,
+ const DataLayout *DL, ScalarEvolution *SE) {
+ const SCEV *TripCountSCEV = getTripCount(BECount, IntPtr, CurLoop, DL, SE);
+
+ return SE->getMulExpr(TripCountSCEV,
+ SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
+ SCEV::FlagNUW);
}
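A hedged worked example of the refactored computation (numbers only): the byte count is now assembled as trip count times store size, both as SCEVs, so for BECount = 99 and StoreSize = 4 the expanded value is (99 + 1) * 4 = 400.

constexpr long BECount = 99, StoreSize = 4;
constexpr long NumBytes = (BECount + 1) * StoreSize; // trip count * store size
static_assert(NumBytes == 400, "NumBytes = (BECount + 1) * StoreSize");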
/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do so.
bool LoopIdiomRecognize::processLoopStridedStore(
- Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment,
+ Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
Value *StoredVal, Instruction *TheStore,
SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
- const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
+ const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
Value *SplatValue = isBytewiseValue(StoredVal, *DL);
Constant *PatternValue = nullptr;
@@ -1056,8 +1115,8 @@ bool LoopIdiomRecognize::processLoopStridedStore(
bool Changed = false;
const SCEV *Start = Ev->getStart();
// Handle negative strided loops.
- if (NegStride)
- Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE);
+ if (IsNegStride)
+ Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);
// TODO: ideally we should still be able to generate memset if SCEV expander
// is taught to generate the dependencies at the latest point.
@@ -1082,7 +1141,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
Changed = true;
if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
- StoreSize, *AA, Stores))
+ StoreSizeSCEV, *AA, Stores))
return Changed;
if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
@@ -1091,7 +1150,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// Okay, everything looks good, insert the memset.
const SCEV *NumBytesS =
- getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
+ getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
// TODO: ideally we should still be able to generate memset if SCEV expander
// is taught to generate the dependencies at the latest point.
@@ -1138,13 +1197,20 @@ bool LoopIdiomRecognize::processLoopStridedStore(
<< "\n");
ORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
- NewCall->getDebugLoc(), Preheader)
- << "Transformed loop-strided store in "
- << ore::NV("Function", TheStore->getFunction())
- << " function into a call to "
- << ore::NV("NewFunction", NewCall->getCalledFunction())
- << "() intrinsic";
+ OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
+ NewCall->getDebugLoc(), Preheader);
+ R << "Transformed loop-strided store in "
+ << ore::NV("Function", TheStore->getFunction())
+ << " function into a call to "
+ << ore::NV("NewFunction", NewCall->getCalledFunction())
+ << "() intrinsic";
+ if (!Stores.empty())
+ R << ore::setExtraArgs();
+ for (auto *I : Stores) {
+ R << ore::NV("FromBlock", I->getParent()->getName())
+ << ore::NV("ToBlock", Preheader->getName());
+ }
+ return R;
});
// Okay, the memset has been formed. Zap the original store and anything that
@@ -1181,16 +1247,63 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
// random load we can't handle.
Value *LoadPtr = LI->getPointerOperand();
const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
- return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSize,
+
+ const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
+ return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
SI->getAlign(), LI->getAlign(), SI, LI,
StoreEv, LoadEv, BECount);
}
+class MemmoveVerifier {
+public:
+ explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
+ const DataLayout &DL)
+ : DL(DL), LoadOff(0), StoreOff(0),
+ BP1(llvm::GetPointerBaseWithConstantOffset(
+ LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
+ BP2(llvm::GetPointerBaseWithConstantOffset(
+ StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
+ IsSameObject(BP1 == BP2) {}
+
+ bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
+ const Instruction &TheLoad,
+ bool IsMemCpy) const {
+ if (IsMemCpy) {
+ // Ensure that LoadBasePtr is after StoreBasePtr, or before it when the
+ // stride is negative.
+ if ((!IsNegStride && LoadOff <= StoreOff) ||
+ (IsNegStride && LoadOff >= StoreOff))
+ return false;
+ } else {
+ // Ensure that LoadBasePtr is after StoreBasePtr, or before it when the
+ // stride is negative, and that LoadBasePtr does not overlap StoreBasePtr.
+ int64_t LoadSize =
+ DL.getTypeSizeInBits(TheLoad.getType()).getFixedSize() / 8;
+ if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
+ return false;
+ if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
+ (IsNegStride && LoadOff + LoadSize > StoreOff))
+ return false;
+ }
+ return true;
+ }
+
+private:
+ const DataLayout &DL;
+ int64_t LoadOff;
+ int64_t StoreOff;
+ const Value *BP1;
+ const Value *BP2;
+
+public:
+ const bool IsSameObject;
+};
+
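For intuition, a hedged example of the kind of loop the verifier exists for (my code, not from the patch): source and destination share one underlying buffer, so the copy may only become a memmove, and only because the per-iteration load sits ahead of the store in the direction of the (positive) stride.

#include <cstddef>
void shiftLeft(int *Buf, std::size_t N) {
  for (std::size_t I = 0; I + 1 < N; ++I)
    Buf[I] = Buf[I + 1]; // load offset > store offset: memmove-safe forward copy
}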
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
- Value *DestPtr, Value *SourcePtr, unsigned StoreSize, MaybeAlign StoreAlign,
- MaybeAlign LoadAlign, Instruction *TheStore, Instruction *TheLoad,
- const SCEVAddRecExpr *StoreEv, const SCEVAddRecExpr *LoadEv,
- const SCEV *BECount) {
+ Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
+ MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
+ Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
+ const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {
// FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
// conservatively bail here, since otherwise we may have to transform
@@ -1213,11 +1326,18 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
APInt Stride = getStoreStride(StoreEv);
- bool NegStride = StoreSize == -Stride;
+ const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
+
+ // TODO: Deal with non-constant sizes; currently we expect a constant store size.
+ assert(ConstStoreSize && "store size is expected to be a constant");
+
+ int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
+ bool IsNegStride = StoreSize == -Stride;
// Handle negative strided loops.
- if (NegStride)
- StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);
+ if (IsNegStride)
+ StrStart =
+ getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
// Okay, we have a strided store "p[i]" of a loaded value. We can turn
// this into a memcpy in the loop preheader now if we want. However, this
@@ -1237,19 +1357,24 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
// the return value will read this comment, and leave them alone.
Changed = true;
- SmallPtrSet<Instruction *, 2> Stores;
- Stores.insert(TheStore);
+ SmallPtrSet<Instruction *, 2> IgnoredInsts;
+ IgnoredInsts.insert(TheStore);
bool IsMemCpy = isa<MemCpyInst>(TheStore);
const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";
- bool UseMemMove =
+ bool LoopAccessStore =
mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
- StoreSize, *AA, Stores);
- if (UseMemMove) {
- Stores.insert(TheLoad);
+ StoreSizeSCEV, *AA, IgnoredInsts);
+ if (LoopAccessStore) {
+ // For the memmove case it's not enough to guarantee that the loop doesn't
+ // access TheStore and TheLoad. Additionally, we need to make sure TheStore is
+ // the only user of TheLoad.
+ if (!TheLoad->hasOneUse())
+ return Changed;
+ IgnoredInsts.insert(TheLoad);
if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
- BECount, StoreSize, *AA, Stores)) {
+ BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
TheStore)
@@ -1260,15 +1385,16 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
});
return Changed;
}
- Stores.erase(TheLoad);
+ IgnoredInsts.erase(TheLoad);
}
const SCEV *LdStart = LoadEv->getStart();
unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
// Handle negative strided loops.
- if (NegStride)
- LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);
+ if (IsNegStride)
+ LdStart =
+ getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
// For a memcpy, we have to make sure that the input array is not being
// mutated by the loop.
@@ -1278,42 +1404,40 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
// If the store is a memcpy instruction, we must check if it will write to
// the load memory locations. So remove it from the ignored stores.
if (IsMemCpy)
- Stores.erase(TheStore);
+ IgnoredInsts.erase(TheStore);
+ MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
- StoreSize, *AA, Stores)) {
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
- << ore::NV("Inst", InstRemark) << " in "
- << ore::NV("Function", TheStore->getFunction())
- << " function will not be hoisted: "
- << ore::NV("Reason", "The loop may access load location");
- });
- return Changed;
- }
- if (UseMemMove) {
- // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr for
- // negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
- int64_t LoadOff = 0, StoreOff = 0;
- const Value *BP1 = llvm::GetPointerBaseWithConstantOffset(
- LoadBasePtr->stripPointerCasts(), LoadOff, *DL);
- const Value *BP2 = llvm::GetPointerBaseWithConstantOffset(
- StoreBasePtr->stripPointerCasts(), StoreOff, *DL);
- int64_t LoadSize =
- DL->getTypeSizeInBits(TheLoad->getType()).getFixedSize() / 8;
- if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
+ StoreSizeSCEV, *AA, IgnoredInsts)) {
+ if (!IsMemCpy) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad",
+ TheLoad)
+ << ore::NV("Inst", InstRemark) << " in "
+ << ore::NV("Function", TheStore->getFunction())
+ << " function will not be hoisted: "
+ << ore::NV("Reason", "The loop may access load location");
+ });
return Changed;
- if ((!NegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
- (NegStride && LoadOff + LoadSize > StoreOff))
+ }
+ // At this point the loop may access the load location only if this is a
+ // memcpy within the same underlying object. If that's not the case, bail out.
+ if (!Verifier.IsSameObject)
return Changed;
}
+ bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
+ if (UseMemMove)
+ if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
+ IsMemCpy))
+ return Changed;
+
if (avoidLIRForMultiBlockLoop())
return Changed;
// Okay, everything is safe, we can transform this!
const SCEV *NumBytesS =
- getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
+ getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
@@ -1375,11 +1499,14 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
<< ore::NV("NewFunction", NewCall->getCalledFunction())
<< "() intrinsic from " << ore::NV("Inst", InstRemark)
<< " instruction in " << ore::NV("Function", TheStore->getFunction())
- << " function";
+ << " function"
+ << ore::setExtraArgs()
+ << ore::NV("FromBlock", TheStore->getParent()->getName())
+ << ore::NV("ToBlock", Preheader->getName());
});
- // Okay, the memcpy has been formed. Zap the original store and anything that
- // feeds into it.
+ // Okay, a new call to memcpy/memmove has been formed. Zap the original store
+ // and anything that feeds into it.
if (MSSAU)
MSSAU->removeMemoryAccess(TheStore, true);
deleteDeadInstruction(TheStore);
@@ -1544,24 +1671,22 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
// step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
{
CountInst = nullptr;
- for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
- IterE = LoopEntry->end();
- Iter != IterE; Iter++) {
- Instruction *Inst = &*Iter;
- if (Inst->getOpcode() != Instruction::Add)
+ for (Instruction &Inst : llvm::make_range(
+ LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
+ if (Inst.getOpcode() != Instruction::Add)
continue;
- ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+ ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
if (!Inc || !Inc->isOne())
continue;
- PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
+ PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
if (!Phi)
continue;
// Check if the result of the instruction is live of the loop.
bool LiveOutLoop = false;
- for (User *U : Inst->users()) {
+ for (User *U : Inst.users()) {
if ((cast<Instruction>(U))->getParent() != LoopEntry) {
LiveOutLoop = true;
break;
@@ -1569,7 +1694,7 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
}
if (LiveOutLoop) {
- CountInst = Inst;
+ CountInst = &Inst;
CountPhi = Phi;
break;
}
@@ -1670,22 +1795,20 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
// plus "cnt0". Currently it is not optimized.
// This step could be used to detect POPCNT instruction:
// cnt.next = cnt + (x.next & 1)
- for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
- IterE = LoopEntry->end();
- Iter != IterE; Iter++) {
- Instruction *Inst = &*Iter;
- if (Inst->getOpcode() != Instruction::Add)
+ for (Instruction &Inst : llvm::make_range(
+ LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
+ if (Inst.getOpcode() != Instruction::Add)
continue;
- ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+ ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
continue;
- PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
+ PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
if (!Phi)
continue;
- CntInst = Inst;
+ CntInst = &Inst;
CntPhi = Phi;
break;
}
diff --git a/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 3153a8721193..b9e63a4bc06f 100644
--- a/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -105,9 +105,7 @@ static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI,
if (!V || !LI.replacementPreservesLCSSAForm(&I, V))
continue;
- for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
- UI != UE;) {
- Use &U = *UI++;
+ for (Use &U : llvm::make_early_inc_range(I.uses())) {
auto *UserI = cast<Instruction>(U.getUser());
U.set(V);
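A hedged standalone sketch of the idiom adopted above (the helper is mine, not from the patch): make_early_inc_range advances past the current element before the body runs, so rewriting the use, which unlinks it from I's use list, cannot invalidate the traversal.

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instruction.h"
static void replaceUsesEarlyInc(llvm::Instruction &I, llvm::Value *V) {
  for (llvm::Use &U : llvm::make_early_inc_range(I.uses()))
    U.set(V); // removes U from I's use list; the iterator has already advanced
}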
@@ -195,15 +193,10 @@ public:
const TargetLibraryInfo &TLI =
getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
*L->getHeader()->getParent());
- MemorySSA *MSSA = nullptr;
- Optional<MemorySSAUpdater> MSSAU;
- if (EnableMSSALoopDependency) {
- MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
- MSSAU = MemorySSAUpdater(MSSA);
- }
+ MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ MemorySSAUpdater MSSAU(MSSA);
- return simplifyLoopInst(*L, DT, LI, AC, TLI,
- MSSAU.hasValue() ? MSSAU.getPointer() : nullptr);
+ return simplifyLoopInst(*L, DT, LI, AC, TLI, &MSSAU);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -211,10 +204,8 @@ public:
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.setPreservesCFG();
- if (EnableMSSALoopDependency) {
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
getLoopAnalysisUsage(AU);
}
};
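
The rewrite above uses llvm::make_early_inc_range, which advances the iterator before the loop body runs, so U.set(V) (which unlinks U from I's use list) cannot invalidate it. A minimal sketch of the pattern; replaceUsesSafely is an illustrative name:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
using namespace llvm;

// Sketch: redirect every use of I to V while iterating the use list safely.
static void replaceUsesSafely(Instruction &I, Value *V) {
  for (Use &U : make_early_inc_range(I.uses())) {
    // The iterator already points past U, so mutating U is safe here.
    U.set(V);
  }
}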
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 34545f35b3c3..9f605b4ac4ad 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -1710,16 +1710,12 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
auto &OuterInnerReductions = LIL.getOuterInnerReductions();
// Now update the reduction PHIs in the inner and outer loop headers.
SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
- for (PHINode &PHI : InnerLoopHeader->phis()) {
- if (OuterInnerReductions.find(&PHI) == OuterInnerReductions.end())
- continue;
- InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
- }
- for (PHINode &PHI : OuterLoopHeader->phis()) {
- if (OuterInnerReductions.find(&PHI) == OuterInnerReductions.end())
- continue;
- OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
- }
+ for (PHINode &PHI : InnerLoopHeader->phis())
+ if (OuterInnerReductions.contains(&PHI))
+ InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
+ for (PHINode &PHI : OuterLoopHeader->phis())
+ if (OuterInnerReductions.contains(&PHI))
+ OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
// Now move the remaining reduction PHIs from outer to inner loop header and
// vice versa. The PHI nodes must be part of a reduction across the inner and
@@ -1767,6 +1763,7 @@ bool LoopInterchangeTransform::adjustLoopLinks() {
return Changed;
}
+namespace {
/// Main LoopInterchange Pass.
struct LoopInterchangeLegacyPass : public LoopPass {
static char ID;
@@ -1795,6 +1792,7 @@ struct LoopInterchangeLegacyPass : public LoopPass {
return LoopInterchange(SE, LI, DI, DT, ORE).run(L);
}
};
+} // namespace
char LoopInterchangeLegacyPass::ID = 0;
diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
index aaf586173e44..21d59936616b 100644
--- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -34,7 +34,6 @@
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -109,8 +108,8 @@ struct StoreToLoadForwardingCandidate {
// Currently we only support accesses with unit stride. FIXME: we should be
// able to handle non-unit stride as well, as long as the stride is equal to
// the dependence distance.
- if (getPtrStride(PSE, LoadPtr, L) != 1 ||
- getPtrStride(PSE, StorePtr, L) != 1)
+ if (getPtrStride(PSE, LoadType, LoadPtr, L) != 1 ||
+ getPtrStride(PSE, LoadType, StorePtr, L) != 1)
return false;
auto &DL = Load->getParent()->getModule()->getDataLayout();
@@ -718,15 +717,12 @@ PreservedAnalyses LoopLoadEliminationPass::run(Function &F,
auto *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
auto *BFI = (PSI && PSI->hasProfileSummary()) ?
&AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
- MemorySSA *MSSA = EnableMSSALoopDependency
- ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
- : nullptr;
auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
bool Changed = eliminateLoadsAcrossLoops(
F, LI, DT, BFI, PSI, &SE, &AC, [&](Loop &L) -> const LoopAccessInfo & {
- LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
- TLI, TTI, nullptr, MSSA};
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
+ TLI, TTI, nullptr, nullptr, nullptr};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
});
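
The unit-stride requirement in the forwarding-candidate check corresponds to loops of the shape below, where the store in iteration i produces exactly the value the load reads one iteration later. Illustrative C++ only, not taken from the patch or its tests:

// A store-to-load forwarding candidate: both accesses have unit stride and the
// dependence distance is one iteration.
void forwardingCandidate(int *A, const int *B, int *C, int N) {
  for (int I = 0; I + 1 < N; ++I) {
    A[I + 1] = B[I] + 1; // store, stride 1
    C[I] = A[I];         // load of the value stored one iteration earlier
  }
}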
diff --git a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
index f4fce4871331..3df4cfe8e4c1 100644
--- a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -10,6 +10,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
@@ -44,6 +45,18 @@ PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
return PA;
}
+void PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
+ LPMUpdater &>::printPipeline(raw_ostream &OS,
+ function_ref<StringRef(StringRef)>
+ MapClassName2PassName) {
+ for (unsigned Idx = 0, Size = LoopPasses.size(); Idx != Size; ++Idx) {
+ auto *P = LoopPasses[Idx].get();
+ P->printPipeline(OS, MapClassName2PassName);
+ if (Idx + 1 < Size)
+ OS << ",";
+ }
+}
+
// Run both loop passes and loop-nest passes on top-level loop \p L.
PreservedAnalyses
LoopPassManager::runWithLoopNestPasses(Loop &L, LoopAnalysisManager &AM,
@@ -112,12 +125,6 @@ LoopPassManager::runWithLoopNestPasses(Loop &L, LoopAnalysisManager &AM,
// notify the updater; otherwise U.ParentL might get outdated and trigger
// assertion failures in addSiblingLoops and addChildLoops.
U.setParentLoop(L.getParentLoop());
-
- // FIXME: Historically, the pass managers all called the LLVM context's
- // yield function here. We don't have a generic way to acquire the
- // context and it isn't yet clear what the right pattern is for yielding
- // in the new pass manager so it is currently omitted.
- // ...getContext().yield();
}
return PA;
}
@@ -161,17 +168,17 @@ LoopPassManager::runWithoutLoopNestPasses(Loop &L, LoopAnalysisManager &AM,
// notify the updater; otherwise U.ParentL might get outdated and trigger
// assertion failures in addSiblingLoops and addChildLoops.
U.setParentLoop(L.getParentLoop());
-
- // FIXME: Historically, the pass managers all called the LLVM context's
- // yield function here. We don't have a generic way to acquire the
- // context and it isn't yet clear what the right pattern is for yielding
- // in the new pass manager so it is currently omitted.
- // ...getContext().yield();
}
return PA;
}
} // namespace llvm
+void FunctionToLoopPassAdaptor::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ OS << (UseMemorySSA ? "loop-mssa(" : "loop(");
+ Pass->printPipeline(OS, MapClassName2PassName);
+ OS << ")";
+}
PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
FunctionAnalysisManager &AM) {
// Before we even compute any loop analyses, first run a miniature function
@@ -201,6 +208,10 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
BlockFrequencyInfo *BFI = UseBlockFrequencyInfo && F.hasProfileData()
? (&AM.getResult<BlockFrequencyAnalysis>(F))
: nullptr;
+ BranchProbabilityInfo *BPI =
+ UseBranchProbabilityInfo && F.hasProfileData()
+ ? (&AM.getResult<BranchProbabilityAnalysis>(F))
+ : nullptr;
LoopStandardAnalysisResults LAR = {AM.getResult<AAManager>(F),
AM.getResult<AssumptionAnalysis>(F),
AM.getResult<DominatorTreeAnalysis>(F),
@@ -209,6 +220,7 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
AM.getResult<TargetLibraryAnalysis>(F),
AM.getResult<TargetIRAnalysis>(F),
BFI,
+ BPI,
MSSA};
// Setup the loop analysis manager from its proxy. It is important that
@@ -285,6 +297,10 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
else
PI.runAfterPass<Loop>(*Pass, *L, PassPA);
+ if (LAR.MSSA && !PassPA.getChecker<MemorySSAAnalysis>().preserved())
+ report_fatal_error("Loop pass manager using MemorySSA contains a pass "
+ "that does not preserve MemorySSA");
+
#ifndef NDEBUG
// LoopAnalysisResults should always be valid.
// Note that we don't LAR.SE.verify() because that can change observed SE
@@ -325,6 +341,8 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
PA.preserve<ScalarEvolutionAnalysis>();
if (UseBlockFrequencyInfo && F.hasProfileData())
PA.preserve<BlockFrequencyAnalysis>();
+ if (UseBranchProbabilityInfo && F.hasProfileData())
+ PA.preserve<BranchProbabilityAnalysis>();
if (UseMemorySSA)
PA.preserve<MemorySSAAnalysis>();
return PA;
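
The new printPipeline overloads emit the contained passes as a comma-separated list and wrap the loop pipeline in either loop(...) or loop-mssa(...). A small standalone sketch of that textual format; the function and parameter names are illustrative:

#include "llvm/Support/raw_ostream.h"
#include <string>
#include <vector>
using namespace llvm;

// Sketch: print {"licm", "loop-rotate"} with MemorySSA enabled as
// "loop-mssa(licm,loop-rotate)".
static void printLoopPipeline(raw_ostream &OS, bool UseMemorySSA,
                              const std::vector<std::string> &PassNames) {
  OS << (UseMemorySSA ? "loop-mssa(" : "loop(");
  for (size_t I = 0, E = PassNames.size(); I != E; ++I) {
    OS << PassNames[I];
    if (I + 1 < E)
      OS << ",";
  }
  OS << ")";
}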
diff --git a/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
index 4f97641e2027..aa7e79a589f2 100644
--- a/llvm/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
@@ -183,6 +183,8 @@
#include "llvm/Analysis/GuardUtils.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/Function.h"
@@ -254,7 +256,7 @@ class LoopPredication {
DominatorTree *DT;
ScalarEvolution *SE;
LoopInfo *LI;
- BranchProbabilityInfo *BPI;
+ MemorySSAUpdater *MSSAU;
Loop *L;
const DataLayout *DL;
@@ -302,16 +304,15 @@ class LoopPredication {
// If the loop always exits through another block in the loop, we should not
// predicate based on the latch check. For example, the latch check can be a
// very coarse grained check and there can be more fine grained exit checks
- // within the loop. We identify such unprofitable loops through BPI.
+ // within the loop.
bool isLoopProfitableToPredicate();
bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter);
public:
- LoopPredication(AliasAnalysis *AA, DominatorTree *DT,
- ScalarEvolution *SE, LoopInfo *LI,
- BranchProbabilityInfo *BPI)
- : AA(AA), DT(DT), SE(SE), LI(LI), BPI(BPI) {};
+ LoopPredication(AliasAnalysis *AA, DominatorTree *DT, ScalarEvolution *SE,
+ LoopInfo *LI, MemorySSAUpdater *MSSAU)
+ : AA(AA), DT(DT), SE(SE), LI(LI), MSSAU(MSSAU){};
bool runOnLoop(Loop *L);
};
@@ -325,6 +326,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<BranchProbabilityInfoWrapperPass>();
getLoopAnalysisUsage(AU);
+ AU.addPreserved<MemorySSAWrapperPass>();
}
bool runOnLoop(Loop *L, LPPassManager &LPM) override {
@@ -333,10 +335,12 @@ public:
auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- BranchProbabilityInfo &BPI =
- getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ auto *MSSAWP = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ if (MSSAWP)
+ MSSAU = std::make_unique<MemorySSAUpdater>(&MSSAWP->getMSSA());
auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- LoopPredication LP(AA, DT, SE, LI, &BPI);
+ LoopPredication LP(AA, DT, SE, LI, MSSAU ? MSSAU.get() : nullptr);
return LP.runOnLoop(L);
}
};
@@ -358,16 +362,18 @@ Pass *llvm::createLoopPredicationPass() {
PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &U) {
- Function *F = L.getHeader()->getParent();
- // For the new PM, we also can't use BranchProbabilityInfo as an analysis
- // pass. Function analyses need to be preserved across loop transformations
- // but BPI is not preserved, hence a newly built one is needed.
- BranchProbabilityInfo BPI(*F, AR.LI, &AR.TLI, &AR.DT, nullptr);
- LoopPredication LP(&AR.AA, &AR.DT, &AR.SE, &AR.LI, &BPI);
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ if (AR.MSSA)
+ MSSAU = std::make_unique<MemorySSAUpdater>(AR.MSSA);
+ LoopPredication LP(&AR.AA, &AR.DT, &AR.SE, &AR.LI,
+ MSSAU ? MSSAU.get() : nullptr);
if (!LP.runOnLoop(&L))
return PreservedAnalyses::all();
- return getLoopPassPreservedAnalyses();
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
}
Optional<LoopICmp>
@@ -809,7 +815,7 @@ bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
Value *AllChecks = Builder.CreateAnd(Checks);
auto *OldCond = Guard->getOperand(0);
Guard->setOperand(0, AllChecks);
- RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+ RecursivelyDeleteTriviallyDeadInstructions(OldCond, nullptr /* TLI */, MSSAU);
LLVM_DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
return true;
@@ -835,7 +841,7 @@ bool LoopPredication::widenWidenableBranchGuardConditions(
Value *AllChecks = Builder.CreateAnd(Checks);
auto *OldCond = BI->getCondition();
BI->setCondition(AllChecks);
- RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+ RecursivelyDeleteTriviallyDeadInstructions(OldCond, nullptr /* TLI */, MSSAU);
assert(isGuardAsWidenableBranch(BI) &&
"Stopped being a guard after transform?");
@@ -912,7 +918,7 @@ Optional<LoopICmp> LoopPredication::parseLoopLatchICmp() {
bool LoopPredication::isLoopProfitableToPredicate() {
- if (SkipProfitabilityChecks || !BPI)
+ if (SkipProfitabilityChecks)
return true;
SmallVector<std::pair<BasicBlock *, BasicBlock *>, 8> ExitEdges;
@@ -934,8 +940,61 @@ bool LoopPredication::isLoopProfitableToPredicate() {
"expected to be an exiting block with 2 succs!");
unsigned LatchBrExitIdx =
LatchTerm->getSuccessor(0) == L->getHeader() ? 1 : 0;
+ // We compute branch probabilities without BPI. We do not rely on BPI since
+ // Loop predication is usually run in an LPM and BPI is only preserved
+ // lossily within loop pass managers, while BPI has an inherent notion of
+ // being complete for an entire function.
+
+ // If the latch exits into a deoptimize or an unreachable block, do not
+ // predicate on that latch check.
+ auto *LatchExitBlock = LatchTerm->getSuccessor(LatchBrExitIdx);
+ if (isa<UnreachableInst>(LatchTerm) ||
+ LatchExitBlock->getTerminatingDeoptimizeCall())
+ return false;
+
+ auto IsValidProfileData = [](MDNode *ProfileData, const Instruction *Term) {
+ if (!ProfileData || !ProfileData->getOperand(0))
+ return false;
+ if (MDString *MDS = dyn_cast<MDString>(ProfileData->getOperand(0)))
+ if (!MDS->getString().equals("branch_weights"))
+ return false;
+ if (ProfileData->getNumOperands() != 1 + Term->getNumSuccessors())
+ return false;
+ return true;
+ };
+ MDNode *LatchProfileData = LatchTerm->getMetadata(LLVMContext::MD_prof);
+ // Latch terminator has no valid profile data, so nothing to check
+ // profitability on.
+ if (!IsValidProfileData(LatchProfileData, LatchTerm))
+ return true;
+
+ auto ComputeBranchProbability =
+ [&](const BasicBlock *ExitingBlock,
+ const BasicBlock *ExitBlock) -> BranchProbability {
+ auto *Term = ExitingBlock->getTerminator();
+ MDNode *ProfileData = Term->getMetadata(LLVMContext::MD_prof);
+ unsigned NumSucc = Term->getNumSuccessors();
+ if (IsValidProfileData(ProfileData, Term)) {
+ uint64_t Numerator = 0, Denominator = 0, ProfVal = 0;
+ for (unsigned i = 0; i < NumSucc; i++) {
+ ConstantInt *CI =
+ mdconst::extract<ConstantInt>(ProfileData->getOperand(i + 1));
+ ProfVal = CI->getValue().getZExtValue();
+ if (Term->getSuccessor(i) == ExitBlock)
+ Numerator += ProfVal;
+ Denominator += ProfVal;
+ }
+ return BranchProbability::getBranchProbability(Numerator, Denominator);
+ } else {
+ assert(LatchBlock != ExitingBlock &&
+ "Latch term should always have profile data!");
+ // No profile data, so we choose the weight as 1/num_of_succ(Src)
+ return BranchProbability::getBranchProbability(1, NumSucc);
+ }
+ };
+
BranchProbability LatchExitProbability =
- BPI->getEdgeProbability(LatchBlock, LatchBrExitIdx);
+ ComputeBranchProbability(LatchBlock, LatchExitBlock);
// Protect against degenerate inputs provided by the user. Providing a value
// less than one, can invert the definition of profitable loop predication.
@@ -948,18 +1007,18 @@ bool LoopPredication::isLoopProfitableToPredicate() {
LLVM_DEBUG(dbgs() << "The value is set to 1.0\n");
ScaleFactor = 1.0;
}
- const auto LatchProbabilityThreshold =
- LatchExitProbability * ScaleFactor;
+ const auto LatchProbabilityThreshold = LatchExitProbability * ScaleFactor;
for (const auto &ExitEdge : ExitEdges) {
BranchProbability ExitingBlockProbability =
- BPI->getEdgeProbability(ExitEdge.first, ExitEdge.second);
+ ComputeBranchProbability(ExitEdge.first, ExitEdge.second);
// Some exiting edge has higher probability than the latch exiting edge.
// No longer profitable to predicate.
if (ExitingBlockProbability > LatchProbabilityThreshold)
return false;
}
- // Using BPI, we have concluded that the most probable way to exit from the
+
+ // We have concluded that the most probable way to exit from the
// loop is through the latch (or there's no profile information and all
// exits are equally likely).
return true;
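
The ComputeBranchProbability lambda above replaces BPI by reading !prof branch_weights metadata directly. A standalone sketch of the same computation for a single successor edge, omitting the "branch_weights" name check for brevity; successorProbability is an illustrative helper, not an LLVM API:

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/BranchProbability.h"
using namespace llvm;

// Sketch: probability that Term branches to its Idx-th successor, taken from
// profile metadata when present, else 1/NumSucc.
static BranchProbability successorProbability(const Instruction *Term,
                                              unsigned Idx) {
  unsigned NumSucc = Term->getNumSuccessors();
  MDNode *Prof = Term->getMetadata(LLVMContext::MD_prof);
  if (!Prof || Prof->getNumOperands() != 1 + NumSucc)
    return BranchProbability::getBranchProbability(1, NumSucc);
  uint64_t Num = 0, Denom = 0;
  for (unsigned I = 0; I < NumSucc; ++I) {
    uint64_t W = mdconst::extract<ConstantInt>(Prof->getOperand(I + 1))
                     ->getValue()
                     .getZExtValue();
    if (I == Idx)
      Num += W;
    Denom += W;
  }
  if (Denom == 0)
    return BranchProbability::getBranchProbability(1, NumSucc);
  return BranchProbability::getBranchProbability(Num, Denom);
}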
@@ -1071,28 +1130,26 @@ bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
// widen so that we gain the ability to analyze its exit count and perform this
// transform. TODO: It'd be nice to know for sure the exit became
// analyzeable after dropping widenability.
- {
- bool Invalidate = false;
+ bool ChangedLoop = false;
- for (auto *ExitingBB : ExitingBlocks) {
- if (LI->getLoopFor(ExitingBB) != L)
- continue;
+ for (auto *ExitingBB : ExitingBlocks) {
+ if (LI->getLoopFor(ExitingBB) != L)
+ continue;
- auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
- if (!BI)
- continue;
+ auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ if (!BI)
+ continue;
- Use *Cond, *WC;
- BasicBlock *IfTrueBB, *IfFalseBB;
- if (parseWidenableBranch(BI, Cond, WC, IfTrueBB, IfFalseBB) &&
- L->contains(IfTrueBB)) {
- WC->set(ConstantInt::getTrue(IfTrueBB->getContext()));
- Invalidate = true;
- }
+ Use *Cond, *WC;
+ BasicBlock *IfTrueBB, *IfFalseBB;
+ if (parseWidenableBranch(BI, Cond, WC, IfTrueBB, IfFalseBB) &&
+ L->contains(IfTrueBB)) {
+ WC->set(ConstantInt::getTrue(IfTrueBB->getContext()));
+ ChangedLoop = true;
}
- if (Invalidate)
- SE->forgetLoop(L);
}
+ if (ChangedLoop)
+ SE->forgetLoop(L);
// The use of umin(all analyzeable exits) instead of latch is subtle, but
// important for profitability. We may have a loop which hasn't been fully
@@ -1104,18 +1161,24 @@ bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
if (isa<SCEVCouldNotCompute>(MinEC) || MinEC->getType()->isPointerTy() ||
!SE->isLoopInvariant(MinEC, L) ||
!isSafeToExpandAt(MinEC, WidenableBR, *SE))
- return false;
+ return ChangedLoop;
// Subtlety: We need to avoid inserting additional uses of the WC. We know
// that it can only have one transitive use at the moment, and thus moving
// that use to just before the branch and inserting code before it and then
// modifying the operand is legal.
auto *IP = cast<Instruction>(WidenableBR->getCondition());
+ // Here we unconditionally modify the IR, so after this point we should return
+ // only `true`!
IP->moveBefore(WidenableBR);
+ if (MSSAU)
+ if (auto *MUD = MSSAU->getMemorySSA()->getMemoryAccess(IP))
+ MSSAU->moveToPlace(MUD, WidenableBR->getParent(),
+ MemorySSA::BeforeTerminator);
Rewriter.setInsertPoint(IP);
IRBuilder<> B(IP);
- bool Changed = false;
+ bool InvalidateLoop = false;
Value *MinECV = nullptr; // lazily generated if needed
for (BasicBlock *ExitingBB : ExitingBlocks) {
// If our exiting block exits multiple loops, we can only rewrite the
@@ -1172,16 +1235,18 @@ bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
Value *OldCond = BI->getCondition();
BI->setCondition(ConstantInt::get(OldCond->getType(), !ExitIfTrue));
- Changed = true;
+ InvalidateLoop = true;
}
- if (Changed)
+ if (InvalidateLoop)
// We just mutated a bunch of loop exits changing their exit counts
// widely. We need to force recomputation of the exit counts given these
// changes. Note that all of the inserted exits are never taken, and
// should be removed next time the CFG is modified.
SE->forgetLoop(L);
- return Changed;
+
+ // Always return `true` since we have moved the WidenableBR's condition.
+ return true;
}
bool LoopPredication::runOnLoop(Loop *Loop) {
@@ -1242,5 +1307,8 @@ bool LoopPredication::runOnLoop(Loop *Loop) {
for (auto *Guard : GuardsAsWidenableBranches)
Changed |= widenWidenableBranchGuardConditions(Guard, Expander);
Changed |= predicateLoopExits(L, Expander);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
return Changed;
}
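
Both guard-widening paths now hand the MemorySSAUpdater to RecursivelyDeleteTriviallyDeadInstructions so the matching MemorySSA accesses are removed together with the dead condition. A minimal sketch of that call pattern; dropOldCondition is an illustrative wrapper, not an LLVM API:

#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

// Sketch: delete a no-longer-used condition and anything that only fed it,
// keeping MemorySSA (if present) in sync, then optionally verify it.
static void dropOldCondition(Value *OldCond, MemorySSAUpdater *MSSAU) {
  RecursivelyDeleteTriviallyDeadInstructions(OldCond, /*TLI=*/nullptr, MSSAU);
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
}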
diff --git a/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/llvm/lib/Transforms/Scalar/LoopRotation.cpp
index 6d5b19443c76..5ba137b1c85f 100644
--- a/llvm/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopRotation.cpp
@@ -99,8 +99,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
- if (EnableMSSALoopDependency)
- AU.addPreserved<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
getLoopAnalysisUsage(AU);
// Lazy BFI and BPI are marked as preserved here so LoopRotate
@@ -121,13 +120,11 @@ public:
auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
const SimplifyQuery SQ = getBestSimplifyQuery(*this, F);
Optional<MemorySSAUpdater> MSSAU;
- if (EnableMSSALoopDependency) {
- // Not requiring MemorySSA and getting it only if available will split
- // the loop pass pipeline when LoopRotate is being run first.
- auto *MSSAA = getAnalysisIfAvailable<MemorySSAWrapperPass>();
- if (MSSAA)
- MSSAU = MemorySSAUpdater(&MSSAA->getMSSA());
- }
+ // Not requiring MemorySSA and getting it only if available will split
+ // the loop pass pipeline when LoopRotate is being run first.
+ auto *MSSAA = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ if (MSSAA)
+ MSSAU = MemorySSAUpdater(&MSSAA->getMSSA());
// Vectorization requires loop-rotation. Use default threshold for loops the
// user explicitly marked for vectorization, even when header duplication is
// disabled.
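
The legacy pass above only builds a MemorySSAUpdater when the analysis is already available, so MemorySSA is used opportunistically without forcing a pipeline split. A standalone sketch of that optional-updater plumbing; doRotate stands in for the actual transform and is not a real API:

#include "llvm/ADT/Optional.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
using namespace llvm;

// Placeholder for the real transform; it only reports whether an updater was
// supplied.
static bool doRotate(Loop &, MemorySSAUpdater *MSSAU) { return MSSAU != nullptr; }

// Sketch: wrap MemorySSA in an updater only when the analysis was computed.
static bool runWithOptionalMSSA(Loop &L, MemorySSA *MSSAOrNull) {
  Optional<MemorySSAUpdater> MSSAU;
  if (MSSAOrNull)
    MSSAU = MemorySSAUpdater(MSSAOrNull);
  return doRotate(L, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr);
}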
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index cc6d11220807..a87843d658a9 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -733,13 +733,12 @@ public:
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *MSSAA = getAnalysisIfAvailable<MemorySSAWrapperPass>();
Optional<MemorySSAUpdater> MSSAU;
- if (EnableMSSALoopDependency) {
- MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
- MSSAU = MemorySSAUpdater(MSSA);
- if (VerifyMemorySSA)
- MSSA->verifyMemorySSA();
- }
+ if (MSSAA)
+ MSSAU = MemorySSAUpdater(&MSSAA->getMSSA());
+ if (MSSAA && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
bool DeleteCurrentLoop = false;
bool Changed = simplifyLoopCFG(
*L, DT, LI, SE, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr,
@@ -750,10 +749,7 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- if (EnableMSSALoopDependency) {
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
+ AU.addPreserved<MemorySSAWrapperPass>();
AU.addPreserved<DependenceAnalysisWrapperPass>();
getLoopAnalysisUsage(AU);
}
diff --git a/llvm/lib/Transforms/Scalar/LoopSink.cpp b/llvm/lib/Transforms/Scalar/LoopSink.cpp
index a01287f587d7..c9c9e60d0921 100644
--- a/llvm/lib/Transforms/Scalar/LoopSink.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSink.cpp
@@ -323,15 +323,14 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
// Traverse the preheader's instructions in reverse order because if A depends
// on B (A appears after B), A needs to be sunk first before B can be
// sunk.
- for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) {
- Instruction *I = &*II++;
+ for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) {
// No need to check whether the instruction's operands are loop invariant.
- assert(L.hasLoopInvariantOperands(I) &&
+ assert(L.hasLoopInvariantOperands(&I) &&
"Insts in a loop's preheader should have loop invariant operands!");
- if (!canSinkOrHoistInst(*I, &AA, &DT, &L, CurAST, MSSAU.get(), false,
+ if (!canSinkOrHoistInst(I, &AA, &DT, &L, CurAST, MSSAU.get(), false,
LICMFlags.get()))
continue;
- if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI,
+ if (sinkInstruction(L, I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI,
MSSAU.get()))
Changed = true;
}
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 5f210380ae5a..a9a2266e1196 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -136,6 +136,12 @@ using namespace llvm;
/// worst cases before LSR burns too much compile time and stack space.
static const unsigned MaxIVUsers = 200;
+/// Limit the size of expression that SCEV-based salvaging will attempt to
+/// translate into a DIExpression.
+/// Choose a maximum size such that debuginfo is not excessively increased and
+/// the salvaging is not too expensive for the compiler.
+static const unsigned MaxSCEVSalvageExpressionSize = 64;
+
// Temporary flag to cleanup congruent phis after LSR phi expansion.
// It's currently disabled until we can determine whether it's truly useful or
// not. The flag should be removed after the v3.0 release.
@@ -689,7 +695,7 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
const APInt &RA = RC->getAPInt();
// Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
// some folding.
- if (RA.isAllOnesValue()) {
+ if (RA.isAllOnes()) {
if (LHS->getType()->isPointerTy())
return nullptr;
return SE.getMulExpr(LHS, RC);
@@ -2816,9 +2822,7 @@ static const SCEV *getExprBase(const SCEV *S) {
// there's nothing more complex.
// FIXME: not sure if we want to recognize negation.
const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
- for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
- E(Add->op_begin()); I != E; ++I) {
- const SCEV *SubExpr = *I;
+ for (const SCEV *SubExpr : reverse(Add->operands())) {
if (SubExpr->getSCEVType() == scAddExpr)
return getExprBase(SubExpr);
@@ -3150,7 +3154,7 @@ void LSRInstance::CollectChains() {
void LSRInstance::FinalizeChain(IVChain &Chain) {
assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
-
+
for (const IVInc &Inc : Chain) {
LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
@@ -3385,7 +3389,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
void
LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
// Mark uses whose expressions cannot be expanded.
- if (!isSafeToExpand(S, SE))
+ if (!isSafeToExpand(S, SE, /*CanonicalMode*/ false))
LU.RigidFormula = true;
Formula F;
@@ -3934,6 +3938,9 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
// Check each interesting stride.
for (int64_t Factor : Factors) {
+ // Check that Factor can be represented by IntTy
+ if (!ConstantInt::isValueValidForType(IntTy, Factor))
+ continue;
// Check that the multiplication doesn't overflow.
if (Base.BaseOffset == std::numeric_limits<int64_t>::min() && Factor == -1)
continue;
@@ -4082,6 +4089,14 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
if (DstTy->isPointerTy())
return;
+ // It is invalid to extend a pointer type so exit early if ScaledReg or
+ // any of the BaseRegs are pointers.
+ if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
+ return;
+ if (any_of(Base.BaseRegs,
+ [](const SCEV *S) { return S->getType()->isPointerTy(); }))
+ return;
+
for (Type *SrcTy : Types) {
if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
Formula F = Base;
@@ -5689,23 +5704,6 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
}
}
-#ifndef NDEBUG
- // All dominating loops must have preheaders, or SCEVExpander may not be able
- // to materialize an AddRecExpr whose Start is an outer AddRecExpr.
- //
- // IVUsers analysis should only create users that are dominated by simple loop
- // headers. Since this loop should dominate all of its users, its user list
- // should be empty if this loop itself is not within a simple loop nest.
- for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader());
- Rung; Rung = Rung->getIDom()) {
- BasicBlock *BB = Rung->getBlock();
- const Loop *DomLoop = LI.getLoopFor(BB);
- if (DomLoop && DomLoop->getHeader() == BB) {
- assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest");
- }
- }
-#endif // DEBUG
-
LLVM_DEBUG(dbgs() << "\nLSR on loop ";
L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
dbgs() << ":\n");
@@ -5870,6 +5868,7 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<MemorySSAWrapperPass>();
}
+namespace {
struct SCEVDbgValueBuilder {
SCEVDbgValueBuilder() = default;
SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) {
@@ -5906,9 +5905,12 @@ struct SCEVDbgValueBuilder {
pushValue(V);
}
- void pushConst(const SCEVConstant *C) {
+ bool pushConst(const SCEVConstant *C) {
+ if (C->getAPInt().getMinSignedBits() > 64)
+ return false;
Expr.push_back(llvm::dwarf::DW_OP_consts);
Expr.push_back(C->getAPInt().getSExtValue());
+ return true;
}
/// Several SCEV types are sequences of the same arithmetic operator applied
@@ -5947,10 +5949,10 @@ struct SCEVDbgValueBuilder {
bool pushSCEV(const llvm::SCEV *S) {
bool Success = true;
if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
- pushConst(StartInt);
+ Success &= pushConst(StartInt);
} else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
- if(!U->getValue())
+ if (!U->getValue())
return false;
pushValue(U->getValue());
@@ -6033,6 +6035,8 @@ struct SCEVDbgValueBuilder {
/// SCEV constant value is an identity function.
bool isIdentityFunction(uint64_t Op, const SCEV *S) {
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
+ if (C->getAPInt().getMinSignedBits() > 64)
+ return false;
int64_t I = C->getAPInt().getSExtValue();
switch (Op) {
case llvm::dwarf::DW_OP_plus:
@@ -6112,14 +6116,15 @@ struct DVIRecoveryRec {
Metadata *LocationOp;
const llvm::SCEV *SCEV;
};
+} // namespace
-static bool RewriteDVIUsingIterCount(DVIRecoveryRec CachedDVI,
+static void RewriteDVIUsingIterCount(DVIRecoveryRec CachedDVI,
const SCEVDbgValueBuilder &IterationCount,
ScalarEvolution &SE) {
// LSR may add locations to previously single location-op DVIs which
// are currently not supported.
if (CachedDVI.DVI->getNumVariableLocationOps() != 1)
- return false;
+ return;
// SCEVs for SSA values are most frequently of the form
// {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
@@ -6127,45 +6132,70 @@ static bool RewriteDVIUsingIterCount(DVIRecoveryRec CachedDVI,
// SCEVs have not been observed to result in debuginfo-lossy optimisations,
// so it's not expected that this point will be reached.
if (!isa<SCEVAddRecExpr>(CachedDVI.SCEV))
- return false;
+ return;
LLVM_DEBUG(dbgs() << "scev-salvage: Value to salvage SCEV: "
<< *CachedDVI.SCEV << '\n');
const auto *Rec = cast<SCEVAddRecExpr>(CachedDVI.SCEV);
if (!Rec->isAffine())
- return false;
+ return;
+
+ if (CachedDVI.SCEV->getExpressionSize() > MaxSCEVSalvageExpressionSize)
+ return;
// Initialise a new builder with the iteration count expression. In
// combination with the value's SCEV this enables recovery.
SCEVDbgValueBuilder RecoverValue(IterationCount);
if (!RecoverValue.SCEVToValueExpr(*Rec, SE))
- return false;
+ return;
LLVM_DEBUG(dbgs() << "scev-salvage: Updating: " << *CachedDVI.DVI << '\n');
RecoverValue.applyExprToDbgValue(*CachedDVI.DVI, CachedDVI.Expr);
LLVM_DEBUG(dbgs() << "scev-salvage: to: " << *CachedDVI.DVI << '\n');
- return true;
}
-static bool
+static void RewriteDVIUsingOffset(DVIRecoveryRec &DVIRec, llvm::PHINode &IV,
+ int64_t Offset) {
+ assert(!DVIRec.DVI->hasArgList() && "Expected single location-op dbg.value.");
+ DbgValueInst *DVI = DVIRec.DVI;
+ SmallVector<uint64_t, 8> Ops;
+ DIExpression::appendOffset(Ops, Offset);
+ DIExpression *Expr = DIExpression::prependOpcodes(DVIRec.Expr, Ops, true);
+ LLVM_DEBUG(dbgs() << "scev-salvage: Updating: " << *DVIRec.DVI << '\n');
+ DVI->setExpression(Expr);
+ llvm::Value *ValIV = dyn_cast<llvm::Value>(&IV);
+ DVI->replaceVariableLocationOp(
+ 0u, llvm::MetadataAsValue::get(DVI->getContext(),
+ llvm::ValueAsMetadata::get(ValIV)));
+ LLVM_DEBUG(dbgs() << "scev-salvage: updated with offset to IV: "
+ << *DVIRec.DVI << '\n');
+}
+
+static void
DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE,
llvm::PHINode *LSRInductionVar,
SmallVector<DVIRecoveryRec, 2> &DVIToUpdate) {
if (DVIToUpdate.empty())
- return false;
+ return;
const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
assert(SCEVInductionVar &&
"Anticipated a SCEV for the post-LSR induction variable");
- bool Changed = false;
if (const SCEVAddRecExpr *IVAddRec =
dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
+ if (!IVAddRec->isAffine())
+ return;
+
+ if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
+ return;
+
+ // The iteration count is required to recover location values.
SCEVDbgValueBuilder IterCountExpr;
IterCountExpr.pushValue(LSRInductionVar);
if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
- return false;
+ return;
LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
<< '\n');
@@ -6180,20 +6210,34 @@ DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE,
// supported by SCEV salvaging. But, we can attempt a salvage by restoring
// the pre-LSR single-op expression.
if (DVIRec.DVI->hasArgList()) {
+ if (!DVIRec.DVI->getVariableLocationOp(0))
+ continue;
llvm::Type *Ty = DVIRec.DVI->getVariableLocationOp(0)->getType();
DVIRec.DVI->setRawLocation(
llvm::ValueAsMetadata::get(UndefValue::get(Ty)));
DVIRec.DVI->setExpression(DVIRec.Expr);
}
- Changed |= RewriteDVIUsingIterCount(DVIRec, IterCountExpr, SE);
+ LLVM_DEBUG(dbgs() << "scev-salvage: value to recover SCEV: "
+ << *DVIRec.SCEV << '\n');
+
+ // Create a simple expression if the IV and value to salvage SCEVs
+ // start values differ by only a constant value.
+ if (Optional<APInt> Offset =
+ SE.computeConstantDifference(DVIRec.SCEV, SCEVInductionVar)) {
+ if (Offset.getValue().getMinSignedBits() <= 64)
+ RewriteDVIUsingOffset(DVIRec, *LSRInductionVar,
+ Offset.getValue().getSExtValue());
+ } else {
+ RewriteDVIUsingIterCount(DVIRec, IterCountExpr, SE);
+ }
}
}
- return Changed;
}
/// Identify and cache salvageable DVI locations and expressions along with the
-/// corresponding SCEV(s). Also ensure that the DVI is not deleted before
+/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
+/// caching and salvaging.
static void
DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE,
SmallVector<DVIRecoveryRec, 2> &SalvageableDVISCEVs,
@@ -6204,10 +6248,24 @@ DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE,
if (!DVI)
continue;
+ if (DVI->isUndef())
+ continue;
+
if (DVI->hasArgList())
continue;
- if (!SE.isSCEVable(DVI->getVariableLocationOp(0)->getType()))
+ if (!DVI->getVariableLocationOp(0) ||
+ !SE.isSCEVable(DVI->getVariableLocationOp(0)->getType()))
+ continue;
+
+ // SCEVUnknown wraps an llvm::Value; it does not have a start and stride.
+ // Therefore no translation to DIExpression is performed.
+ const SCEV *S = SE.getSCEV(DVI->getVariableLocationOp(0));
+ if (isa<SCEVUnknown>(S))
+ continue;
+
+ // Avoid wasting resources generating an expression containing undef.
+ if (SE.containsUndefs(S))
continue;
SalvageableDVISCEVs.push_back(
@@ -6223,34 +6281,32 @@ DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE,
/// surviving subsequent transforms.
static llvm::PHINode *GetInductionVariable(const Loop &L, ScalarEvolution &SE,
const LSRInstance &LSR) {
- // For now, just pick the first IV generated and inserted. Ideally pick an IV
- // that is unlikely to be optimised away by subsequent transforms.
+
+ auto IsSuitableIV = [&](PHINode *P) {
+ if (!SE.isSCEVable(P->getType()))
+ return false;
+ if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
+ return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
+ return false;
+ };
+
+ // For now, just pick the first IV that was generated and inserted by
+ // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
+ // by subsequent transforms.
for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
if (!IV)
continue;
- assert(isa<PHINode>(&*IV) && "Expected PhI node.");
- if (SE.isSCEVable((*IV).getType())) {
- PHINode *Phi = dyn_cast<PHINode>(&*IV);
- LLVM_DEBUG(const llvm::SCEV *S = SE.getSCEV(Phi);
- dbgs() << "scev-salvage: IV : " << *IV << "with SCEV: " << *S
- << "\n");
- return Phi;
- }
- }
+ // There should only be PHI node IVs.
+ PHINode *P = cast<PHINode>(&*IV);
- for (PHINode &Phi : L.getHeader()->phis()) {
- if (!SE.isSCEVable(Phi.getType()))
- continue;
-
- const llvm::SCEV *PhiSCEV = SE.getSCEV(&Phi);
- if (const llvm::SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(PhiSCEV))
- if (!Rec->isAffine())
- continue;
+ if (IsSuitableIV(P))
+ return P;
+ }
- LLVM_DEBUG(dbgs() << "scev-salvage: Selected IV from loop header: " << Phi
- << " with SCEV: " << *PhiSCEV << "\n");
- return &Phi;
+ for (PHINode &P : L.getHeader()->phis()) {
+ if (IsSuitableIV(&P))
+ return &P;
}
return nullptr;
}
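
Several changes above guard APInt::getSExtValue() behind getMinSignedBits() <= 64, since getSExtValue asserts on wider constants. A small sketch of that guard packaged as a reusable check; asSignedInt64 is an illustrative helper:

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/Optional.h"
using namespace llvm;

// Sketch: only produce an int64_t when the APInt actually fits in 64 signed
// bits; APInt::getSExtValue() would assert otherwise.
static Optional<int64_t> asSignedInt64(const APInt &V) {
  if (V.getMinSignedBits() > 64)
    return None;
  return V.getSExtValue();
}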
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
index 71eb393fcdd7..1ecbb86724e1 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -286,8 +286,8 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
AssumptionCache &AC, DependenceInfo &DI,
OptimizationRemarkEmitter &ORE, int OptLevel) {
TargetTransformInfo::UnrollingPreferences UP =
- gatherUnrollingPreferences(L, SE, TTI, nullptr, nullptr, OptLevel, None,
- None, None, None, None, None);
+ gatherUnrollingPreferences(L, SE, TTI, nullptr, nullptr, ORE, OptLevel,
+ None, None, None, None, None, None);
TargetTransformInfo::PeelingPreferences PP =
gatherPeelingPreferences(L, SE, TTI, None, None);
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 49501f324a49..67702520511b 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -184,7 +184,8 @@ static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
/// flags, TTI overrides and user specified parameters.
TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
+ OptimizationRemarkEmitter &ORE, int OptLevel,
Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
Optional<bool> UserUpperBound, Optional<unsigned> UserFullUnrollMaxCount) {
@@ -214,7 +215,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
// Override with any target specific settings
- TTI.getUnrollingPreferences(L, SE, UP);
+ TTI.getUnrollingPreferences(L, SE, UP, &ORE);
// Apply size attributes
bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
@@ -318,6 +319,16 @@ struct EstimatedUnrollCost {
unsigned RolledDynamicCost;
};
+struct PragmaInfo {
+ PragmaInfo(bool UUC, bool PFU, unsigned PC, bool PEU)
+ : UserUnrollCount(UUC), PragmaFullUnroll(PFU), PragmaCount(PC),
+ PragmaEnableUnroll(PEU) {}
+ const bool UserUnrollCount;
+ const bool PragmaFullUnroll;
+ const unsigned PragmaCount;
+ const bool PragmaEnableUnroll;
+};
+
} // end anonymous namespace
/// Figure out if the loop is worth full unrolling.
@@ -746,13 +757,132 @@ public:
// Returns loop size estimation for unrolled loop, given the unrolling
// configuration specified by UP.
- uint64_t getUnrolledLoopSize(TargetTransformInfo::UnrollingPreferences &UP) {
+ uint64_t
+ getUnrolledLoopSize(const TargetTransformInfo::UnrollingPreferences &UP,
+ const unsigned CountOverwrite = 0) const {
assert(LoopSize >= UP.BEInsns &&
"LoopSize should not be less than BEInsns!");
- return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
+ if (CountOverwrite)
+ return static_cast<uint64_t>(LoopSize - UP.BEInsns) * CountOverwrite +
+ UP.BEInsns;
+ else
+ return static_cast<uint64_t>(LoopSize - UP.BEInsns) * UP.Count +
+ UP.BEInsns;
}
};
+static Optional<unsigned>
+shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo,
+ const unsigned TripMultiple, const unsigned TripCount,
+ const UnrollCostEstimator UCE,
+ const TargetTransformInfo::UnrollingPreferences &UP) {
+
+ // Using unroll pragma
+ // 1st priority is unroll count set by "unroll-count" option.
+
+ if (PInfo.UserUnrollCount) {
+ if (UP.AllowRemainder &&
+ UCE.getUnrolledLoopSize(UP, (unsigned)UnrollCount) < UP.Threshold)
+ return (unsigned)UnrollCount;
+ }
+
+ // 2nd priority is unroll count set by pragma.
+ if (PInfo.PragmaCount > 0) {
+ if ((UP.AllowRemainder || (TripMultiple % PInfo.PragmaCount == 0)) &&
+ UCE.getUnrolledLoopSize(UP, PInfo.PragmaCount) < PragmaUnrollThreshold)
+ return PInfo.PragmaCount;
+ }
+
+ if (PInfo.PragmaFullUnroll && TripCount != 0) {
+ if (UCE.getUnrolledLoopSize(UP, TripCount) < PragmaUnrollThreshold)
+ return TripCount;
+ }
+ // If we did not return by this point, continue with the remaining priorities.
+ return None;
+}
+
+static Optional<unsigned> shouldFullUnroll(
+ Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT,
+ ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
+ const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE,
+ const TargetTransformInfo::UnrollingPreferences &UP) {
+
+ if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
+ // When computing the unrolled size, note that BEInsns are not replicated
+ // like the rest of the loop body.
+ if (UCE.getUnrolledLoopSize(UP) < UP.Threshold) {
+ return FullUnrollTripCount;
+
+ } else {
+ // The loop isn't that small, but we still can fully unroll it if that
+ // helps to remove a significant number of instructions.
+ // To check that, run additional analysis on the loop.
+ if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
+ L, FullUnrollTripCount, DT, SE, EphValues, TTI,
+ UP.Threshold * UP.MaxPercentThresholdBoost / 100,
+ UP.MaxIterationsCountToAnalyze)) {
+ unsigned Boost =
+ getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
+ if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
+ return FullUnrollTripCount;
+ }
+ }
+ }
+ }
+ return None;
+}
+
+static Optional<unsigned>
+shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,
+ const UnrollCostEstimator UCE,
+ const TargetTransformInfo::UnrollingPreferences &UP) {
+
+ unsigned count = UP.Count;
+ if (TripCount) {
+ if (!UP.Partial) {
+ LLVM_DEBUG(dbgs() << " will not try to unroll partially because "
+ << "-unroll-allow-partial not given\n");
+ count = 0;
+ return count;
+ }
+ if (count == 0)
+ count = TripCount;
+ if (UP.PartialThreshold != NoThreshold) {
+ // Reduce unroll count to be modulo of TripCount for partial unrolling.
+ if (UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold)
+ count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
+ (LoopSize - UP.BEInsns);
+ if (count > UP.MaxCount)
+ count = UP.MaxCount;
+ while (count != 0 && TripCount % count != 0)
+ count--;
+ if (UP.AllowRemainder && count <= 1) {
+ // If there is no Count that is modulo of TripCount, set Count to
+ // largest power-of-two factor that satisfies the threshold limit.
+ // As we'll create a fixup loop, do this type of unrolling only if the
+ // remainder loop is allowed.
+ count = UP.DefaultUnrollRuntimeCount;
+ while (count != 0 &&
+ UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold)
+ count >>= 1;
+ }
+ if (count < 2) {
+ count = 0;
+ }
+ } else {
+ count = TripCount;
+ }
+ if (count > UP.MaxCount)
+ count = UP.MaxCount;
+
+ LLVM_DEBUG(dbgs() << " partially unrolling with count: " << count << "\n");
+
+ return count;
+ }
+
+ // If we did not return by this point, continue with the remaining priorities.
+ return None;
+}
// Returns true if unroll count was set explicitly.
// Calculates unroll count and writes it to UP.Count.
// Unless IgnoreUser is true, will also use metadata and command-line options
@@ -770,7 +900,18 @@ bool llvm::computeUnrollCount(
TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) {
UnrollCostEstimator UCE(*L, LoopSize);
+ Optional<unsigned> UnrollFactor;
+
+ const bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
+ const bool PragmaFullUnroll = hasUnrollFullPragma(L);
+ const unsigned PragmaCount = unrollCountPragmaValue(L);
+ const bool PragmaEnableUnroll = hasUnrollEnablePragma(L);
+ const bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
+ PragmaEnableUnroll || UserUnrollCount;
+
+ PragmaInfo PInfo(UserUnrollCount, PragmaFullUnroll, PragmaCount,
+ PragmaEnableUnroll);
// Use an explicit peel count that has been specified for testing. In this
// case it's not permitted to also specify an explicit unroll count.
if (PP.PeelCount) {
@@ -782,47 +923,29 @@ bool llvm::computeUnrollCount(
UP.Runtime = false;
return true;
}
-
// Check for explicit Count.
// 1st priority is unroll count set by "unroll-count" option.
- bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
- if (UserUnrollCount) {
- UP.Count = UnrollCount;
- UP.AllowExpensiveTripCount = true;
- UP.Force = true;
- if (UP.AllowRemainder && UCE.getUnrolledLoopSize(UP) < UP.Threshold)
- return true;
- }
-
// 2nd priority is unroll count set by pragma.
- unsigned PragmaCount = unrollCountPragmaValue(L);
- if (PragmaCount > 0) {
- UP.Count = PragmaCount;
- UP.Runtime = true;
- UP.AllowExpensiveTripCount = true;
- UP.Force = true;
- if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
- UCE.getUnrolledLoopSize(UP) < PragmaUnrollThreshold)
- return true;
- }
- bool PragmaFullUnroll = hasUnrollFullPragma(L);
- if (PragmaFullUnroll && TripCount != 0) {
- UP.Count = TripCount;
- if (UCE.getUnrolledLoopSize(UP) < PragmaUnrollThreshold)
- return false;
- }
+ UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount, UCE, UP);
+
+ if (UnrollFactor) {
+ UP.Count = *UnrollFactor;
- bool PragmaEnableUnroll = hasUnrollEnablePragma(L);
- bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
- PragmaEnableUnroll || UserUnrollCount;
-
- if (ExplicitUnroll && TripCount != 0) {
- // If the loop has an unrolling pragma, we want to be more aggressive with
- // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
- // value which is larger than the default limits.
- UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
- UP.PartialThreshold =
- std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
+ if (UserUnrollCount || (PragmaCount > 0)) {
+ UP.AllowExpensiveTripCount = true;
+ UP.Force = true;
+ }
+ UP.Runtime |= (PragmaCount > 0);
+ return ExplicitUnroll;
+ } else {
+ if (ExplicitUnroll && TripCount != 0) {
+ // If the loop has an unrolling pragma, we want to be more aggressive with
+ // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
+ // value which is larger than the default limits.
+ UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
+ UP.PartialThreshold =
+ std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
+ }
}
// 3rd priority is full unroll count.
@@ -852,71 +975,55 @@ bool llvm::computeUnrollCount(
unsigned FullUnrollTripCount =
ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount;
UP.Count = FullUnrollTripCount;
- if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
- // When computing the unrolled size, note that BEInsns are not replicated
- // like the rest of the loop body.
- if (UCE.getUnrolledLoopSize(UP) < UP.Threshold) {
- UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
- return ExplicitUnroll;
- } else {
- // The loop isn't that small, but we still can fully unroll it if that
- // helps to remove a significant number of instructions.
- // To check that, run additional analysis on the loop.
- if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
- L, FullUnrollTripCount, DT, SE, EphValues, TTI,
- UP.Threshold * UP.MaxPercentThresholdBoost / 100,
- UP.MaxIterationsCountToAnalyze)) {
- unsigned Boost =
- getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
- if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
- UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
- return ExplicitUnroll;
- }
- }
- }
+
+ UnrollFactor =
+ shouldFullUnroll(L, TTI, DT, SE, EphValues, FullUnrollTripCount, UCE, UP);
+
+ // If shouldFullUnroll can do the unrolling, some side parameters should be
+ // set.
+ if (UnrollFactor) {
+ UP.Count = *UnrollFactor;
+ UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
+ TripCount = FullUnrollTripCount;
+ TripMultiple = UP.UpperBound ? 1 : TripMultiple;
+ return ExplicitUnroll;
+ } else {
+ UP.Count = FullUnrollTripCount;
}
// 4th priority is loop peeling.
- computePeelCount(L, LoopSize, PP, TripCount, SE, UP.Threshold);
+ computePeelCount(L, LoopSize, PP, TripCount, DT, SE, UP.Threshold);
if (PP.PeelCount) {
UP.Runtime = false;
UP.Count = 1;
return ExplicitUnroll;
}
+ // Before starting partial unrolling, set UP.Partial to true if the user
+ // explicitly asked for unrolling.
+ if (TripCount)
+ UP.Partial |= ExplicitUnroll;
+
// 5th priority is partial unrolling.
// Try partial unroll only when TripCount could be statically calculated.
- if (TripCount) {
- UP.Partial |= ExplicitUnroll;
- if (!UP.Partial) {
- LLVM_DEBUG(dbgs() << " will not try to unroll partially because "
- << "-unroll-allow-partial not given\n");
- UP.Count = 0;
- return false;
- }
- if (UP.Count == 0)
- UP.Count = TripCount;
+ UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP);
+
+ if (UnrollFactor) {
+ UP.Count = *UnrollFactor;
+
+ if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
+ UP.Count != TripCount)
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "FullUnrollAsDirectedTooLarge",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to fully unroll loop as directed by unroll pragma "
+ "because "
+ "unrolled size is too large.";
+ });
+
if (UP.PartialThreshold != NoThreshold) {
- // Reduce unroll count to be modulo of TripCount for partial unrolling.
- if (UCE.getUnrolledLoopSize(UP) > UP.PartialThreshold)
- UP.Count =
- (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
- (LoopSize - UP.BEInsns);
- if (UP.Count > UP.MaxCount)
- UP.Count = UP.MaxCount;
- while (UP.Count != 0 && TripCount % UP.Count != 0)
- UP.Count--;
- if (UP.AllowRemainder && UP.Count <= 1) {
- // If there is no Count that is modulo of TripCount, set Count to
- // largest power-of-two factor that satisfies the threshold limit.
- // As we'll create fixup loop, do the type of unrolling only if
- // remainder loop is allowed.
- UP.Count = UP.DefaultUnrollRuntimeCount;
- while (UP.Count != 0 &&
- UCE.getUnrolledLoopSize(UP) > UP.PartialThreshold)
- UP.Count >>= 1;
- }
- if (UP.Count < 2) {
+ if (UP.Count == 0) {
if (PragmaEnableUnroll)
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE,
@@ -926,25 +1033,8 @@ bool llvm::computeUnrollCount(
"pragma "
"because unrolled size is too large.";
});
- UP.Count = 0;
}
- } else {
- UP.Count = TripCount;
}
- if (UP.Count > UP.MaxCount)
- UP.Count = UP.MaxCount;
- if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
- UP.Count != TripCount)
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE,
- "FullUnrollAsDirectedTooLarge",
- L->getStartLoc(), L->getHeader())
- << "Unable to fully unroll loop as directed by unroll pragma "
- "because "
- "unrolled size is too large.";
- });
- LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count
- << "\n");
return ExplicitUnroll;
}
assert(TripCount == 0 &&
@@ -981,8 +1071,6 @@ bool llvm::computeUnrollCount(
UP.AllowExpensiveTripCount = true;
}
}
-
- // Reduce count based on the type of unrolling and the threshold values.
UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
if (!UP.Runtime) {
LLVM_DEBUG(
@@ -1017,7 +1105,7 @@ bool llvm::computeUnrollCount(
using namespace ore;
- if (PragmaCount > 0 && !UP.AllowRemainder)
+ if (unrollCountPragmaValue(L) > 0 && !UP.AllowRemainder)
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE,
"DifferentUnrollCountFromDirected",
@@ -1079,7 +1167,7 @@ static LoopUnrollResult tryToUnrollLoop(
bool NotDuplicatable;
bool Convergent;
TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
- L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount,
+ L, SE, TTI, BFI, PSI, ORE, OptLevel, ProvidedThreshold, ProvidedCount,
ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
ProvidedFullUnrollMaxCount);
TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
@@ -1529,3 +1617,25 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
return getLoopPassPreservedAnalyses();
}
+
+void LoopUnrollPass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<LoopUnrollPass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+ OS << "<";
+ if (UnrollOpts.AllowPartial != None)
+ OS << (UnrollOpts.AllowPartial.getValue() ? "" : "no-") << "partial;";
+ if (UnrollOpts.AllowPeeling != None)
+ OS << (UnrollOpts.AllowPeeling.getValue() ? "" : "no-") << "peeling;";
+ if (UnrollOpts.AllowRuntime != None)
+ OS << (UnrollOpts.AllowRuntime.getValue() ? "" : "no-") << "runtime;";
+ if (UnrollOpts.AllowUpperBound != None)
+ OS << (UnrollOpts.AllowUpperBound.getValue() ? "" : "no-") << "upperbound;";
+ if (UnrollOpts.AllowProfileBasedPeeling != None)
+ OS << (UnrollOpts.AllowProfileBasedPeeling.getValue() ? "" : "no-")
+ << "profile-peeling;";
+ if (UnrollOpts.FullUnrollMaxCount != None)
+ OS << "full-unroll-max=" << UnrollOpts.FullUnrollMaxCount << ";";
+ OS << "O" << UnrollOpts.OptLevel;
+ OS << ">";
+}
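
computeUnrollCount now delegates to shouldPragmaUnroll, shouldFullUnroll and shouldPartialUnroll, each returning an Optional<unsigned>, and takes the first count produced (peeling is handled separately between the full and partial steps). A standalone sketch of that priority chaining; chooseUnrollCount and its parameters are illustrative, not the real LLVM interfaces:

#include "llvm/ADT/Optional.h"
using namespace llvm;

// Sketch: the first heuristic that yields a count wins; 0 means "do not unroll".
static unsigned chooseUnrollCount(Optional<unsigned> PragmaCount,
                                  Optional<unsigned> FullCount,
                                  Optional<unsigned> PartialCount) {
  if (PragmaCount)
    return *PragmaCount;  // explicit -unroll-count option or pragma
  if (FullCount)
    return *FullCount;    // full unroll fits the size threshold
  if (PartialCount)
    return *PartialCount; // partial unroll derived from the trip count
  return 0;
}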
diff --git a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
index 9a854ff80246..76bb5497c2c2 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -232,10 +232,8 @@ namespace {
AU.addPreserved<LazyBranchProbabilityInfoPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
- if (EnableMSSALoopDependency) {
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
if (HasBranchDivergence)
AU.addRequired<LegacyDivergenceAnalysis>();
getLoopAnalysisUsage(AU);
@@ -539,11 +537,8 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPMRef) {
LPM = &LPMRef;
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- if (EnableMSSALoopDependency) {
- MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
- assert(DT && "Cannot update MemorySSA without a valid DomTree.");
- }
+ MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
CurrentLoop = L;
Function *F = CurrentLoop->getHeader()->getParent();
@@ -551,19 +546,19 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPMRef) {
if (SanitizeMemory)
SafetyInfo.computeLoopSafetyInfo(L);
- if (MSSA && VerifyMemorySSA)
+ if (VerifyMemorySSA)
MSSA->verifyMemorySSA();
bool Changed = false;
do {
assert(CurrentLoop->isLCSSAForm(*DT));
- if (MSSA && VerifyMemorySSA)
+ if (VerifyMemorySSA)
MSSA->verifyMemorySSA();
RedoLoop = false;
Changed |= processCurrentLoop();
} while (RedoLoop);
- if (MSSA && VerifyMemorySSA)
+ if (VerifyMemorySSA)
MSSA->verifyMemorySSA();
return Changed;
@@ -1312,8 +1307,7 @@ void LoopUnswitch::splitExitEdges(
for (unsigned I = 0, E = ExitBlocks.size(); I != E; ++I) {
BasicBlock *ExitBlock = ExitBlocks[I];
- SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock),
- pred_end(ExitBlock));
+ SmallVector<BasicBlock *, 4> Preds(predecessors(ExitBlock));
// Although SplitBlockPredecessors doesn't preserve loop-simplify in
// general, if we call it on all predecessors of all exits then it does.
diff --git a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
index bd3001988369..186065db327e 100644
--- a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
@@ -55,11 +55,17 @@ static bool replaceConditionalBranchesOnConstant(Instruction *II,
Value *NewValue,
DomTreeUpdater *DTU) {
bool HasDeadBlocks = false;
- SmallSetVector<Instruction *, 8> Worklist;
+ SmallSetVector<Instruction *, 8> UnsimplifiedUsers;
replaceAndRecursivelySimplify(II, NewValue, nullptr, nullptr, nullptr,
- &Worklist);
- for (auto I : Worklist) {
- BranchInst *BI = dyn_cast<BranchInst>(I);
+ &UnsimplifiedUsers);
+ // UnsimplifiedUsers can contain PHI nodes that may be removed when
+ // replacing the branch instructions, so use a value handle worklist
+ // to handle those possibly removed instructions.
+ SmallVector<WeakVH, 8> Worklist(UnsimplifiedUsers.begin(),
+ UnsimplifiedUsers.end());
+
+ for (auto &VH : Worklist) {
+ BranchInst *BI = dyn_cast_or_null<BranchInst>(VH);
if (!BI)
continue;
if (BI->isUnconditional())
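
The change above snapshots the unsimplified users into WeakVH handles before walking them: if replacing one branch deletes another entry (for example a PHI node), the handle reads back as null and dyn_cast_or_null skips it. A rough standalone analogy using std::weak_ptr rather than LLVM's value handles, only to show the same discipline:

  #include <iostream>
  #include <memory>
  #include <vector>

  struct Node { int Id; };

  int main() {
    auto A = std::make_shared<Node>(Node{1});
    auto B = std::make_shared<Node>(Node{2});

    // Snapshot weak references first, as the pass snapshots WeakVHs.
    std::vector<std::weak_ptr<Node>> Worklist{A, B};

    B.reset(); // B dies while still on the worklist.

    for (auto &WH : Worklist) {
      auto N = WH.lock();   // analogous to dyn_cast_or_null on a WeakVH
      if (!N)
        continue;           // skip entries deleted in the meantime
      std::cout << "visiting node " << N->Id << "\n";
    }
  }
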
diff --git a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index ead8082f3036..1c186e9a0488 100644
--- a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -357,11 +357,10 @@ static bool lowerExpectIntrinsic(Function &F) {
// Remove llvm.expect intrinsics. Iterate backwards in order
// to process select instructions before the intrinsic gets
// removed.
- for (auto BI = BB.rbegin(), BE = BB.rend(); BI != BE;) {
- Instruction *Inst = &*BI++;
- CallInst *CI = dyn_cast<CallInst>(Inst);
+ for (Instruction &Inst : llvm::make_early_inc_range(llvm::reverse(BB))) {
+ CallInst *CI = dyn_cast<CallInst>(&Inst);
if (!CI) {
- if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
+ if (SelectInst *SI = dyn_cast<SelectInst>(&Inst)) {
if (handleBrSelExpect(*SI))
ExpectIntrinsicsHandled++;
}
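
The new loop relies on llvm::make_early_inc_range over the reversed block so the current instruction can be erased without invalidating the traversal. The underlying early-increment idiom, shown standalone with a std::list standing in for a basic block's instruction list:

  #include <iostream>
  #include <list>

  int main() {
    std::list<int> Insts{1, 2, 3, 4, 5};

    for (auto It = Insts.begin(), End = Insts.end(); It != End;) {
      auto Cur = It++;      // advance first; this is what early_inc_range packages up
      if (*Cur % 2 == 0)
        Insts.erase(Cur);   // erasing Cur cannot invalidate It
    }

    for (int V : Insts)
      std::cout << V << ' '; // prints: 1 3 5
    std::cout << "\n";
  }
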
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 42c183a6408e..4e4097e13271 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -900,8 +900,7 @@ public:
// UndefedInsts and then check that we in fact remove them.
SmallSet<Instruction *, 16> UndefedInsts;
for (auto *Inst : reverse(ToRemove)) {
- for (auto I = Inst->use_begin(), E = Inst->use_end(); I != E;) {
- Use &U = *I++;
+ for (Use &U : llvm::make_early_inc_range(Inst->uses())) {
if (auto *Undefed = dyn_cast<Instruction>(U.getUser()))
UndefedInsts.insert(Undefed);
U.set(UndefValue::get(Inst->getType()));
@@ -981,8 +980,9 @@ public:
Value *EltPtr = createElementPtr(Ptr, EltTy, Builder);
MatrixTy Result;
for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) {
- Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(I), Stride,
- Shape.getStride(), EltTy, Builder);
+ Value *GEP = computeVectorAddr(
+ EltPtr, Builder.getIntN(Stride->getType()->getScalarSizeInBits(), I),
+ Stride, Shape.getStride(), EltTy, Builder);
Value *Vector = Builder.CreateAlignedLoad(
VecTy, GEP, getAlignForIndex(I, Stride, EltTy, MAlign),
IsVolatile, "col.load");
@@ -1071,9 +1071,11 @@ public:
auto VType = cast<VectorType>(Ty);
Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
for (auto Vec : enumerate(StoreVal.vectors())) {
- Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(Vec.index()),
- Stride, StoreVal.getStride(),
- VType->getElementType(), Builder);
+ Value *GEP = computeVectorAddr(
+ EltPtr,
+ Builder.getIntN(Stride->getType()->getScalarSizeInBits(),
+ Vec.index()),
+ Stride, StoreVal.getStride(), VType->getElementType(), Builder);
Builder.CreateAlignedStore(Vec.value(), GEP,
getAlignForIndex(Vec.index(), Stride,
VType->getElementType(),
@@ -2261,6 +2263,16 @@ PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
return PreservedAnalyses::all();
}
+void LowerMatrixIntrinsicsPass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<LowerMatrixIntrinsicsPass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+ OS << "<";
+ if (Minimal)
+ OS << "minimal";
+ OS << ">";
+}
+
namespace {
class LowerMatrixIntrinsicsLegacyPass : public FunctionPass {
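
Both the load and store hunks above stop hard-coding an i64 vector index and instead build it with Builder.getIntN at the stride operand's bit width, so the index constant matches the stride's type inside computeVectorAddr. The addressing scheme itself, sketched in plain C++ for a column-major matrix (names are illustrative, not the pass's API):

  #include <cstddef>
  #include <iostream>
  #include <vector>

  // Column I of a column-major matrix starts at Base + I * Stride elements.
  const double *columnAddr(const double *Base, std::size_t I, std::size_t Stride) {
    return Base + I * Stride;
  }

  int main() {
    // 3 rows x 2 columns, column-major, stride == number of rows.
    std::vector<double> M{1, 2, 3, 4, 5, 6};
    for (std::size_t Col = 0; Col < 2; ++Col)
      std::cout << "col " << Col << " starts at "
                << *columnAddr(M.data(), Col, 3) << "\n"; // 1, then 4
  }
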
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 2e36c50b75fc..67335a45fb58 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -22,7 +22,6 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
@@ -67,9 +66,10 @@ using namespace llvm;
#define DEBUG_TYPE "memcpyopt"
-static cl::opt<bool>
- EnableMemorySSA("enable-memcpyopt-memoryssa", cl::init(true), cl::Hidden,
- cl::desc("Use MemorySSA-backed MemCpyOpt."));
+static cl::opt<bool> EnableMemCpyOptWithoutLibcalls(
+ "enable-memcpyopt-without-libcalls", cl::init(false), cl::Hidden,
+ cl::ZeroOrMore,
+ cl::desc("Enable memcpyopt even when libcalls are disabled"));
STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
STATISTIC(NumMemSetInfer, "Number of memsets inferred");
@@ -178,9 +178,9 @@ public:
}
void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
- int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
-
- addRange(OffsetFromFirst, StoreSize, SI->getPointerOperand(),
+ TypeSize StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
+ assert(!StoreSize.isScalable() && "Can't track scalable-typed stores");
+ addRange(OffsetFromFirst, StoreSize.getFixedSize(), SI->getPointerOperand(),
SI->getAlign().value(), SI);
}
@@ -282,13 +282,9 @@ private:
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
- if (!EnableMemorySSA)
- AU.addRequired<MemoryDependenceWrapperPass>();
- AU.addPreserved<MemoryDependenceWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
- if (EnableMemorySSA)
- AU.addRequired<MemorySSAWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
AU.addPreserved<MemorySSAWrapperPass>();
}
};
@@ -304,7 +300,6 @@ INITIALIZE_PASS_BEGIN(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
@@ -329,10 +324,7 @@ static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start,
}
void MemCpyOptPass::eraseInstruction(Instruction *I) {
- if (MSSAU)
- MSSAU->removeMemoryAccess(I);
- if (MD)
- MD->removeInstruction(I);
+ MSSAU->removeMemoryAccess(I);
I->eraseFromParent();
}
@@ -371,6 +363,11 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
Value *ByteVal) {
const DataLayout &DL = StartInst->getModule()->getDataLayout();
+ // We can't track scalable types
+ if (StoreInst *SI = dyn_cast<StoreInst>(StartInst))
+ if (DL.getTypeStoreSize(SI->getOperand(0)->getType()).isScalable())
+ return nullptr;
+
// Okay, so we now have a single store that can be splatable. Scan to find
// all subsequent stores of the same value to offset from the same pointer.
// Join these together into ranges, so we can decide whether contiguous blocks
@@ -389,14 +386,12 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
// memsets.
MemoryDef *LastMemDef = nullptr;
for (++BI; !BI->isTerminator(); ++BI) {
- if (MSSAU) {
- auto *CurrentAcc = cast_or_null<MemoryUseOrDef>(
- MSSAU->getMemorySSA()->getMemoryAccess(&*BI));
- if (CurrentAcc) {
- MemInsertPoint = CurrentAcc;
- if (auto *CurrentDef = dyn_cast<MemoryDef>(CurrentAcc))
- LastMemDef = CurrentDef;
- }
+ auto *CurrentAcc = cast_or_null<MemoryUseOrDef>(
+ MSSAU->getMemorySSA()->getMemoryAccess(&*BI));
+ if (CurrentAcc) {
+ MemInsertPoint = CurrentAcc;
+ if (auto *CurrentDef = dyn_cast<MemoryDef>(CurrentAcc))
+ LastMemDef = CurrentDef;
}
// Calls that only access inaccessible memory do not block merging
@@ -426,6 +421,10 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
break;
+ // We can't track ranges involving scalable types.
+ if (DL.getTypeStoreSize(StoredVal->getType()).isScalable())
+ break;
+
// Check to see if this stored value is of the same byte-splattable value.
Value *StoredByte = isBytewiseValue(StoredVal, DL);
if (isa<UndefValue>(ByteVal) && StoredByte)
@@ -494,19 +493,17 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
if (!Range.TheStores.empty())
AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
- if (MSSAU) {
- assert(LastMemDef && MemInsertPoint &&
- "Both LastMemDef and MemInsertPoint need to be set");
- auto *NewDef =
- cast<MemoryDef>(MemInsertPoint->getMemoryInst() == &*BI
- ? MSSAU->createMemoryAccessBefore(
- AMemSet, LastMemDef, MemInsertPoint)
- : MSSAU->createMemoryAccessAfter(
- AMemSet, LastMemDef, MemInsertPoint));
- MSSAU->insertDef(NewDef, /*RenameUses=*/true);
- LastMemDef = NewDef;
- MemInsertPoint = NewDef;
- }
+ assert(LastMemDef && MemInsertPoint &&
+ "Both LastMemDef and MemInsertPoint need to be set");
+ auto *NewDef =
+ cast<MemoryDef>(MemInsertPoint->getMemoryInst() == &*BI
+ ? MSSAU->createMemoryAccessBefore(
+ AMemSet, LastMemDef, MemInsertPoint)
+ : MSSAU->createMemoryAccessAfter(
+ AMemSet, LastMemDef, MemInsertPoint));
+ MSSAU->insertDef(NewDef, /*RenameUses=*/true);
+ LastMemDef = NewDef;
+ MemInsertPoint = NewDef;
// Zap all the stores.
for (Instruction *SI : Range.TheStores)
@@ -615,17 +612,15 @@ bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) {
// TODO: Simplify this once P will be determined by MSSA, in which case the
// discrepancy can no longer occur.
MemoryUseOrDef *MemInsertPoint = nullptr;
- if (MSSAU) {
- if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(P)) {
- MemInsertPoint = cast<MemoryUseOrDef>(--MA->getIterator());
- } else {
- const Instruction *ConstP = P;
- for (const Instruction &I : make_range(++ConstP->getReverseIterator(),
- ++LI->getReverseIterator())) {
- if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(&I)) {
- MemInsertPoint = MA;
- break;
- }
+ if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(P)) {
+ MemInsertPoint = cast<MemoryUseOrDef>(--MA->getIterator());
+ } else {
+ const Instruction *ConstP = P;
+ for (const Instruction &I : make_range(++ConstP->getReverseIterator(),
+ ++LI->getReverseIterator())) {
+ if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(&I)) {
+ MemInsertPoint = MA;
+ break;
}
}
}
@@ -634,12 +629,10 @@ bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) {
for (auto *I : llvm::reverse(ToLift)) {
LLVM_DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
I->moveBefore(P);
- if (MSSAU) {
- assert(MemInsertPoint && "Must have found insert point");
- if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(I)) {
- MSSAU->moveAfter(MA, MemInsertPoint);
- MemInsertPoint = MA;
- }
+ assert(MemInsertPoint && "Must have found insert point");
+ if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(I)) {
+ MSSAU->moveAfter(MA, MemInsertPoint);
+ MemInsertPoint = MA;
}
}
@@ -673,7 +666,13 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
LI->getParent() == SI->getParent()) {
auto *T = LI->getType();
- if (T->isAggregateType()) {
+ // Don't introduce calls to memcpy/memmove intrinsics out of thin air if
+ // the corresponding libcalls are not available.
+ // TODO: We should really distinguish between libcall availability and
+ // our ability to introduce intrinsics.
+ if (T->isAggregateType() &&
+ (EnableMemCpyOptWithoutLibcalls ||
+ (TLI->has(LibFunc_memcpy) && TLI->has(LibFunc_memmove)))) {
MemoryLocation LoadLoc = MemoryLocation::get(LI);
// We use alias analysis to check if an instruction may store to
@@ -703,9 +702,10 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
if (P) {
// If we load from memory that may alias the memory we store to,
// memmove must be used to preserve semantic. If not, memcpy can
- // be used.
+ // be used. Also, if we load from constant memory, memcpy can be used
+ // as the constant memory won't be modified.
bool UseMemMove = false;
- if (!AA->isNoAlias(MemoryLocation::get(SI), LoadLoc))
+ if (isModSet(AA->getModRefInfo(SI, LoadLoc)))
UseMemMove = true;
uint64_t Size = DL.getTypeStoreSize(T);
@@ -724,13 +724,10 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => "
<< *M << "\n");
- if (MSSAU) {
- auto *LastDef =
- cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI));
- auto *NewAccess =
- MSSAU->createMemoryAccessAfter(M, LastDef, LastDef);
- MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
- }
+ auto *LastDef =
+ cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI));
+ auto *NewAccess = MSSAU->createMemoryAccessAfter(M, LastDef, LastDef);
+ MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
eraseInstruction(SI);
eraseInstruction(LI);
@@ -746,38 +743,21 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
// happen to be using a load-store pair to implement it, rather than
// a memcpy.
CallInst *C = nullptr;
- if (EnableMemorySSA) {
- if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>(
- MSSA->getWalker()->getClobberingMemoryAccess(LI))) {
- // The load most post-dom the call. Limit to the same block for now.
- // TODO: Support non-local call-slot optimization?
- if (LoadClobber->getBlock() == SI->getParent())
- C = dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst());
- }
- } else {
- MemDepResult ldep = MD->getDependency(LI);
- if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
- C = dyn_cast<CallInst>(ldep.getInst());
+ if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>(
+ MSSA->getWalker()->getClobberingMemoryAccess(LI))) {
+ // The load must post-dominate the call. Limit to the same block for now.
+ // TODO: Support non-local call-slot optimization?
+ if (LoadClobber->getBlock() == SI->getParent())
+ C = dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst());
}
if (C) {
// Check that nothing touches the dest of the "copy" between
// the call and the store.
MemoryLocation StoreLoc = MemoryLocation::get(SI);
- if (EnableMemorySSA) {
- if (accessedBetween(*AA, StoreLoc, MSSA->getMemoryAccess(C),
- MSSA->getMemoryAccess(SI)))
- C = nullptr;
- } else {
- for (BasicBlock::iterator I = --SI->getIterator(),
- E = C->getIterator();
- I != E; --I) {
- if (isModOrRefSet(AA->getModRefInfo(&*I, StoreLoc))) {
- C = nullptr;
- break;
- }
- }
- }
+ if (accessedBetween(*AA, StoreLoc, MSSA->getMemoryAccess(C),
+ MSSA->getMemoryAccess(SI)))
+ C = nullptr;
}
if (C) {
@@ -796,6 +776,13 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
}
}
+ // The following code creates memset intrinsics out of thin air. Don't do
+ // this if the corresponding libfunc is not available.
+ // TODO: We should really distinguish between libcall availability and
+ // our ability to introduce intrinsics.
+ if (!(TLI->has(LibFunc_memset) || EnableMemCpyOptWithoutLibcalls))
+ return false;
+
// There are two cases that are interesting for this code to handle: memcpy
// and memset. Right now we only handle memset.
@@ -822,13 +809,12 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
- if (MSSAU) {
- assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI)));
- auto *LastDef =
- cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI));
- auto *NewAccess = MSSAU->createMemoryAccessAfter(M, LastDef, LastDef);
- MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
- }
+ // The newly inserted memset is immediately overwritten by the original
+ // store, so we do not need to rename uses.
+ auto *StoreDef = cast<MemoryDef>(MSSA->getMemoryAccess(SI));
+ auto *NewAccess = MSSAU->createMemoryAccessBefore(
+ M, StoreDef->getDefiningAccess(), StoreDef);
+ MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/false);
eraseInstruction(SI);
NumMemSetInfer++;
@@ -859,7 +845,7 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
/// the call write its result directly into the destination of the memcpy.
bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
Instruction *cpyStore, Value *cpyDest,
- Value *cpySrc, uint64_t cpyLen,
+ Value *cpySrc, TypeSize cpySize,
Align cpyAlign, CallInst *C) {
// The general transformation to keep in mind is
//
@@ -875,6 +861,10 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
// src only holds uninitialized values at the moment of the call, meaning that
// the memcpy can be discarded rather than moved.
+ // We can't optimize scalable types.
+ if (cpySize.isScalable())
+ return false;
+
// Lifetime marks shouldn't be operated on.
if (Function *F = C->getCalledFunction())
if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
@@ -893,13 +883,13 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
uint64_t srcSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()) *
srcArraySize->getZExtValue();
- if (cpyLen < srcSize)
+ if (cpySize < srcSize)
return false;
// Check that accessing the first srcSize bytes of dest will not cause a
// trap. Otherwise the transform is invalid since it might cause a trap
// to occur earlier than it otherwise would.
- if (!isDereferenceableAndAlignedPointer(cpyDest, Align(1), APInt(64, cpyLen),
+ if (!isDereferenceableAndAlignedPointer(cpyDest, Align(1), APInt(64, cpySize),
DL, C, DT))
return false;
@@ -1020,11 +1010,6 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
}
- // Drop any cached information about the call, because we may have changed
- // its dependence information by changing its parameter.
- if (MD)
- MD->removeInstruction(C);
-
// Update AA metadata
// FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
// handled here, but combineMetadata doesn't support them yet
@@ -1073,28 +1058,19 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
//
// TODO: If the code between M and MDep is transparent to the destination "c",
// then we could still perform the xform by moving M up to the first memcpy.
- if (EnableMemorySSA) {
- // TODO: It would be sufficient to check the MDep source up to the memcpy
- // size of M, rather than MDep.
- if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep),
- MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(M)))
- return false;
- } else {
- // NOTE: This is conservative, it will stop on any read from the source loc,
- // not just the defining memcpy.
- MemDepResult SourceDep =
- MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,
- M->getIterator(), M->getParent());
- if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
- return false;
- }
+ // TODO: It would be sufficient to check the MDep source up to the memcpy
+ // size of M, rather than MDep.
+ if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep),
+ MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(M)))
+ return false;
// If the dest of the second might alias the source of the first, then the
- // source and dest might overlap. We still want to eliminate the intermediate
- // value, but we have to generate a memmove instead of memcpy.
+ // source and dest might overlap. However, if the source of the first memcpy
+ // is constant memory, it cannot be modified through the second one, so a
+ // plain memcpy remains correct. Otherwise, we still want to eliminate the
+ // intermediate value, but we have to generate a memmove instead of memcpy.
bool UseMemMove = false;
- if (!AA->isNoAlias(MemoryLocation::getForDest(M),
- MemoryLocation::getForSource(MDep)))
+ if (isModSet(AA->getModRefInfo(M, MemoryLocation::getForSource(MDep))))
UseMemMove = true;
// If all checks passed, then we can transform M.
@@ -1121,12 +1097,10 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
MDep->getRawSource(), MDep->getSourceAlign(),
M->getLength(), M->isVolatile());
- if (MSSAU) {
- assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M)));
- auto *LastDef = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M));
- auto *NewAccess = MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef);
- MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
- }
+ assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M)));
+ auto *LastDef = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M));
+ auto *NewAccess = MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef);
+ MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
// Remove the instruction we're replacing.
eraseInstruction(M);
@@ -1156,30 +1130,16 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
// Check that src and dst of the memcpy aren't the same. While memcpy
// operands cannot partially overlap, exact equality is allowed.
- if (!AA->isNoAlias(MemoryLocation(MemCpy->getSource(),
- LocationSize::precise(1)),
- MemoryLocation(MemCpy->getDest(),
- LocationSize::precise(1))))
+ if (isModSet(AA->getModRefInfo(MemCpy, MemoryLocation::getForSource(MemCpy))))
return false;
- if (EnableMemorySSA) {
- // We know that dst up to src_size is not written. We now need to make sure
- // that dst up to dst_size is not accessed. (If we did not move the memset,
- // checking for reads would be sufficient.)
- if (accessedBetween(*AA, MemoryLocation::getForDest(MemSet),
- MSSA->getMemoryAccess(MemSet),
- MSSA->getMemoryAccess(MemCpy))) {
- return false;
- }
- } else {
- // We have already checked that dst up to src_size is not accessed. We
- // need to make sure that there are no accesses up to dst_size either.
- MemDepResult DstDepInfo = MD->getPointerDependencyFrom(
- MemoryLocation::getForDest(MemSet), false, MemCpy->getIterator(),
- MemCpy->getParent());
- if (DstDepInfo.getInst() != MemSet)
- return false;
- }
+ // We know that dst up to src_size is not written. We now need to make sure
+ // that dst up to dst_size is not accessed. (If we did not move the memset,
+ // checking for reads would be sufficient.)
+ if (accessedBetween(*AA, MemoryLocation::getForDest(MemSet),
+ MSSA->getMemoryAccess(MemSet),
+ MSSA->getMemoryAccess(MemCpy)))
+ return false;
// Use the same i8* dest as the memcpy, killing the memset dest if different.
Value *Dest = MemCpy->getRawDest();
@@ -1229,18 +1189,16 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
SrcSize),
MemSet->getOperand(1), MemsetLen, MaybeAlign(Align));
- if (MSSAU) {
- assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)) &&
- "MemCpy must be a MemoryDef");
- // The new memset is inserted after the memcpy, but it is known that its
- // defining access is the memset about to be removed which immediately
- // precedes the memcpy.
- auto *LastDef =
- cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy));
- auto *NewAccess = MSSAU->createMemoryAccessBefore(
- NewMemSet, LastDef->getDefiningAccess(), LastDef);
- MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
- }
+ assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)) &&
+ "MemCpy must be a MemoryDef");
+ // The new memset is inserted after the memcpy, but it is known that its
+ // defining access is the memset about to be removed which immediately
+ // precedes the memcpy.
+ auto *LastDef =
+ cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy));
+ auto *NewAccess = MSSAU->createMemoryAccessBefore(
+ NewMemSet, LastDef->getDefiningAccess(), LastDef);
+ MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
eraseInstruction(MemSet);
return true;
@@ -1248,23 +1206,8 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
/// Determine whether the instruction has undefined content for the given Size,
/// either because it was freshly alloca'd or started its lifetime.
-static bool hasUndefContents(Instruction *I, Value *Size) {
- if (isa<AllocaInst>(I))
- return true;
-
- if (ConstantInt *CSize = dyn_cast<ConstantInt>(Size)) {
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
- if (II->getIntrinsicID() == Intrinsic::lifetime_start)
- if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
- if (LTSize->getZExtValue() >= CSize->getZExtValue())
- return true;
- }
-
- return false;
-}
-
-static bool hasUndefContentsMSSA(MemorySSA *MSSA, AliasAnalysis *AA, Value *V,
- MemoryDef *Def, Value *Size) {
+static bool hasUndefContents(MemorySSA *MSSA, AliasAnalysis *AA, Value *V,
+ MemoryDef *Def, Value *Size) {
if (MSSA->isLiveOnEntryDef(Def))
return isa<AllocaInst>(getUnderlyingObject(V));
@@ -1338,19 +1281,12 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
// easily represent this location, we use the full 0..CopySize range.
MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
bool CanReduceSize = false;
- if (EnableMemorySSA) {
- MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(MemSet);
- MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
- MemSetAccess->getDefiningAccess(), MemCpyLoc);
- if (auto *MD = dyn_cast<MemoryDef>(Clobber))
- if (hasUndefContentsMSSA(MSSA, AA, MemCpy->getSource(), MD, CopySize))
- CanReduceSize = true;
- } else {
- MemDepResult DepInfo = MD->getPointerDependencyFrom(
- MemCpyLoc, true, MemSet->getIterator(), MemSet->getParent());
- if (DepInfo.isDef() && hasUndefContents(DepInfo.getInst(), CopySize))
+ MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(MemSet);
+ MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
+ MemSetAccess->getDefiningAccess(), MemCpyLoc);
+ if (auto *MD = dyn_cast<MemoryDef>(Clobber))
+ if (hasUndefContents(MSSA, AA, MemCpy->getSource(), MD, CopySize))
CanReduceSize = true;
- }
if (!CanReduceSize)
return false;
@@ -1362,12 +1298,10 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
Instruction *NewM =
Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
CopySize, MaybeAlign(MemCpy->getDestAlignment()));
- if (MSSAU) {
- auto *LastDef =
- cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy));
- auto *NewAccess = MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef);
- MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
- }
+ auto *LastDef =
+ cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy));
+ auto *NewAccess = MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef);
+ MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
return true;
}
@@ -1397,149 +1331,90 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
Instruction *NewM =
Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
MaybeAlign(M->getDestAlignment()), false);
- if (MSSAU) {
- auto *LastDef =
- cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M));
- auto *NewAccess =
- MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef);
- MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
- }
+ auto *LastDef =
+ cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M));
+ auto *NewAccess =
+ MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef);
+ MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
eraseInstruction(M);
++NumCpyToSet;
return true;
}
- if (EnableMemorySSA) {
- MemoryUseOrDef *MA = MSSA->getMemoryAccess(M);
- MemoryAccess *AnyClobber = MSSA->getWalker()->getClobberingMemoryAccess(MA);
- MemoryLocation DestLoc = MemoryLocation::getForDest(M);
- const MemoryAccess *DestClobber =
- MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, DestLoc);
-
- // Try to turn a partially redundant memset + memcpy into
- // memcpy + smaller memset. We don't need the memcpy size for this.
- // The memcpy most post-dom the memset, so limit this to the same basic
- // block. A non-local generalization is likely not worthwhile.
- if (auto *MD = dyn_cast<MemoryDef>(DestClobber))
- if (auto *MDep = dyn_cast_or_null<MemSetInst>(MD->getMemoryInst()))
- if (DestClobber->getBlock() == M->getParent())
- if (processMemSetMemCpyDependence(M, MDep))
- return true;
-
- MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
- AnyClobber, MemoryLocation::getForSource(M));
-
- // There are four possible optimizations we can do for memcpy:
- // a) memcpy-memcpy xform which exposes redundance for DSE.
- // b) call-memcpy xform for return slot optimization.
- // c) memcpy from freshly alloca'd space or space that has just started
- // its lifetime copies undefined data, and we can therefore eliminate
- // the memcpy in favor of the data that was already at the destination.
- // d) memcpy from a just-memset'd source can be turned into memset.
- if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) {
- if (Instruction *MI = MD->getMemoryInst()) {
- if (ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength())) {
- if (auto *C = dyn_cast<CallInst>(MI)) {
- // The memcpy must post-dom the call. Limit to the same block for
- // now. Additionally, we need to ensure that there are no accesses
- // to dest between the call and the memcpy. Accesses to src will be
- // checked by performCallSlotOptzn().
- // TODO: Support non-local call-slot optimization?
- if (C->getParent() == M->getParent() &&
- !accessedBetween(*AA, DestLoc, MD, MA)) {
- // FIXME: Can we pass in either of dest/src alignment here instead
- // of conservatively taking the minimum?
- Align Alignment = std::min(M->getDestAlign().valueOrOne(),
- M->getSourceAlign().valueOrOne());
- if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(),
- CopySize->getZExtValue(), Alignment,
- C)) {
- LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n"
- << " call: " << *C << "\n"
- << " memcpy: " << *M << "\n");
- eraseInstruction(M);
- ++NumMemCpyInstr;
- return true;
- }
- }
- }
- }
- if (auto *MDep = dyn_cast<MemCpyInst>(MI))
- return processMemCpyMemCpyDependence(M, MDep);
- if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
- if (performMemCpyToMemSetOptzn(M, MDep)) {
- LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n");
- eraseInstruction(M);
- ++NumCpyToSet;
- return true;
- }
- }
- }
-
- if (hasUndefContentsMSSA(MSSA, AA, M->getSource(), MD, M->getLength())) {
- LLVM_DEBUG(dbgs() << "Removed memcpy from undef\n");
- eraseInstruction(M);
- ++NumMemCpyInstr;
- return true;
- }
- }
- } else {
- MemDepResult DepInfo = MD->getDependency(M);
-
- // Try to turn a partially redundant memset + memcpy into
- // memcpy + smaller memset. We don't need the memcpy size for this.
- if (DepInfo.isClobber())
- if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst()))
+ MemoryUseOrDef *MA = MSSA->getMemoryAccess(M);
+ MemoryAccess *AnyClobber = MSSA->getWalker()->getClobberingMemoryAccess(MA);
+ MemoryLocation DestLoc = MemoryLocation::getForDest(M);
+ const MemoryAccess *DestClobber =
+ MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, DestLoc);
+
+ // Try to turn a partially redundant memset + memcpy into
+ // memcpy + smaller memset. We don't need the memcpy size for this.
+ // The memcpy must post-dominate the memset, so limit this to the same basic
+ // block. A non-local generalization is likely not worthwhile.
+ if (auto *MD = dyn_cast<MemoryDef>(DestClobber))
+ if (auto *MDep = dyn_cast_or_null<MemSetInst>(MD->getMemoryInst()))
+ if (DestClobber->getBlock() == M->getParent())
if (processMemSetMemCpyDependence(M, MDep))
return true;
- // There are four possible optimizations we can do for memcpy:
- // a) memcpy-memcpy xform which exposes redundance for DSE.
- // b) call-memcpy xform for return slot optimization.
- // c) memcpy from freshly alloca'd space or space that has just started
- // its lifetime copies undefined data, and we can therefore eliminate
- // the memcpy in favor of the data that was already at the destination.
- // d) memcpy from a just-memset'd source can be turned into memset.
- if (ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength())) {
- if (DepInfo.isClobber()) {
- if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
- // FIXME: Can we pass in either of dest/src alignment here instead
- // of conservatively taking the minimum?
- Align Alignment = std::min(M->getDestAlign().valueOrOne(),
- M->getSourceAlign().valueOrOne());
- if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(),
- CopySize->getZExtValue(), Alignment, C)) {
- eraseInstruction(M);
- ++NumMemCpyInstr;
- return true;
+ MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
+ AnyClobber, MemoryLocation::getForSource(M));
+
+ // There are four possible optimizations we can do for memcpy:
+ // a) memcpy-memcpy xform which exposes redundance for DSE.
+ // b) call-memcpy xform for return slot optimization.
+ // c) memcpy from freshly alloca'd space or space that has just started
+ // its lifetime copies undefined data, and we can therefore eliminate
+ // the memcpy in favor of the data that was already at the destination.
+ // d) memcpy from a just-memset'd source can be turned into memset.
+ if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) {
+ if (Instruction *MI = MD->getMemoryInst()) {
+ if (ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength())) {
+ if (auto *C = dyn_cast<CallInst>(MI)) {
+ // The memcpy must post-dominate the call. Limit to the same block for
+ // now. Additionally, we need to ensure that there are no accesses
+ // to dest between the call and the memcpy. Accesses to src will be
+ // checked by performCallSlotOptzn().
+ // TODO: Support non-local call-slot optimization?
+ if (C->getParent() == M->getParent() &&
+ !accessedBetween(*AA, DestLoc, MD, MA)) {
+ // FIXME: Can we pass in either of dest/src alignment here instead
+ // of conservatively taking the minimum?
+ Align Alignment = std::min(M->getDestAlign().valueOrOne(),
+ M->getSourceAlign().valueOrOne());
+ if (performCallSlotOptzn(
+ M, M, M->getDest(), M->getSource(),
+ TypeSize::getFixed(CopySize->getZExtValue()), Alignment,
+ C)) {
+ LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n"
+ << " call: " << *C << "\n"
+ << " memcpy: " << *M << "\n");
+ eraseInstruction(M);
+ ++NumMemCpyInstr;
+ return true;
+ }
}
}
}
- }
-
- MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
- MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
- SrcLoc, true, M->getIterator(), M->getParent());
-
- if (SrcDepInfo.isClobber()) {
- if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
+ if (auto *MDep = dyn_cast<MemCpyInst>(MI))
return processMemCpyMemCpyDependence(M, MDep);
- } else if (SrcDepInfo.isDef()) {
- if (hasUndefContents(SrcDepInfo.getInst(), M->getLength())) {
- eraseInstruction(M);
- ++NumMemCpyInstr;
- return true;
- }
- }
-
- if (SrcDepInfo.isClobber())
- if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
+ if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
if (performMemCpyToMemSetOptzn(M, MDep)) {
+ LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n");
eraseInstruction(M);
++NumCpyToSet;
return true;
}
+ }
+ }
+
+ if (hasUndefContents(MSSA, AA, M->getSource(), MD, M->getLength())) {
+ LLVM_DEBUG(dbgs() << "Removed memcpy from undef\n");
+ eraseInstruction(M);
+ ++NumMemCpyInstr;
+ return true;
+ }
}
return false;
@@ -1548,12 +1423,8 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
/// not to alias.
bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
- if (!TLI->has(LibFunc_memmove))
- return false;
-
- // See if the pointers alias.
- if (!AA->isNoAlias(MemoryLocation::getForDest(M),
- MemoryLocation::getForSource(M)))
+ // See if the memmove could potentially modify its own source.
+ if (isModSet(AA->getModRefInfo(M, MemoryLocation::getForSource(M))))
return false;
LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
@@ -1569,11 +1440,6 @@ bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
// For MemorySSA nothing really changes (except that memcpy may imply stricter
// aliasing guarantees).
- // MemDep may have over conservative information about this instruction, just
- // conservatively flush it from the cache.
- if (MD)
- MD->removeInstruction(M);
-
++NumMoveToCpy;
return true;
}
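
processMemMove now checks whether the memmove could modify its own source via getModRefInfo instead of requiring source and dest to be no-alias, and it no longer insists on the memmove libcall being available. One consequence, sketched as hypothetical source code: a memmove whose source is constant memory can be turned into a memcpy even when aliasing cannot be ruled out, because nothing can be modified through the constant source.

  #include <cstring>

  static const int Table[4] = {1, 2, 3, 4};

  void copyTable(int *Dst) {
    // The source is a constant global, so this memmove cannot modify it;
    // under the new check the pass may rewrite the call as a memcpy.
    std::memmove(Dst, Table, sizeof(Table));
  }

  int main() {
    int Buf[4] = {};
    copyTable(Buf);
    return Buf[0] == 1 ? 0 : 1;
  }
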
@@ -1584,24 +1450,16 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
// Find out what feeds this byval argument.
Value *ByValArg = CB.getArgOperand(ArgNo);
Type *ByValTy = CB.getParamByValType(ArgNo);
- uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
+ TypeSize ByValSize = DL.getTypeAllocSize(ByValTy);
MemoryLocation Loc(ByValArg, LocationSize::precise(ByValSize));
+ MemoryUseOrDef *CallAccess = MSSA->getMemoryAccess(&CB);
+ if (!CallAccess)
+ return false;
MemCpyInst *MDep = nullptr;
- if (EnableMemorySSA) {
- MemoryUseOrDef *CallAccess = MSSA->getMemoryAccess(&CB);
- if (!CallAccess)
- return false;
- MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
- CallAccess->getDefiningAccess(), Loc);
- if (auto *MD = dyn_cast<MemoryDef>(Clobber))
- MDep = dyn_cast_or_null<MemCpyInst>(MD->getMemoryInst());
- } else {
- MemDepResult DepInfo = MD->getPointerDependencyFrom(
- Loc, true, CB.getIterator(), CB.getParent());
- if (!DepInfo.isClobber())
- return false;
- MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
- }
+ MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
+ CallAccess->getDefiningAccess(), Loc);
+ if (auto *MD = dyn_cast<MemoryDef>(Clobber))
+ MDep = dyn_cast_or_null<MemCpyInst>(MD->getMemoryInst());
// If the byval argument isn't fed by a memcpy, ignore it. If it is fed by
// a memcpy, see if we can byval from the source of the memcpy instead of the
@@ -1612,7 +1470,8 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
// The length of the memcpy must be larger or equal to the size of the byval.
ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
- if (!C1 || C1->getValue().getZExtValue() < ByValSize)
+ if (!C1 || !TypeSize::isKnownGE(
+ TypeSize::getFixed(C1->getValue().getZExtValue()), ByValSize))
return false;
// Get the alignment of the byval. If the call doesn't specify the alignment,
@@ -1639,19 +1498,9 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
// *b = 42;
// foo(*a)
// It would be invalid to transform the second memcpy into foo(*b).
- if (EnableMemorySSA) {
- if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep),
- MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(&CB)))
- return false;
- } else {
- // NOTE: This is conservative, it will stop on any read from the source loc,
- // not just the defining memcpy.
- MemDepResult SourceDep = MD->getPointerDependencyFrom(
- MemoryLocation::getForSource(MDep), false,
- CB.getIterator(), MDep->getParent());
- if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
- return false;
- }
+ if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep),
+ MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(&CB)))
+ return false;
Value *TmpCast = MDep->getSource();
if (MDep->getSource()->getType() != ByValArg->getType()) {
@@ -1718,47 +1567,33 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
}
PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
- auto *MD = !EnableMemorySSA ? &AM.getResult<MemoryDependenceAnalysis>(F)
- : AM.getCachedResult<MemoryDependenceAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto *AA = &AM.getResult<AAManager>(F);
auto *AC = &AM.getResult<AssumptionAnalysis>(F);
auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
- auto *MSSA = EnableMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F)
- : AM.getCachedResult<MemorySSAAnalysis>(F);
+ auto *MSSA = &AM.getResult<MemorySSAAnalysis>(F);
- bool MadeChange =
- runImpl(F, MD, &TLI, AA, AC, DT, MSSA ? &MSSA->getMSSA() : nullptr);
+ bool MadeChange = runImpl(F, &TLI, AA, AC, DT, &MSSA->getMSSA());
if (!MadeChange)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
- if (MD)
- PA.preserve<MemoryDependenceAnalysis>();
- if (MSSA)
- PA.preserve<MemorySSAAnalysis>();
+ PA.preserve<MemorySSAAnalysis>();
return PA;
}
-bool MemCpyOptPass::runImpl(Function &F, MemoryDependenceResults *MD_,
- TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
- AssumptionCache *AC_, DominatorTree *DT_,
- MemorySSA *MSSA_) {
+bool MemCpyOptPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
+ AliasAnalysis *AA_, AssumptionCache *AC_,
+ DominatorTree *DT_, MemorySSA *MSSA_) {
bool MadeChange = false;
- MD = MD_;
TLI = TLI_;
AA = AA_;
AC = AC_;
DT = DT_;
MSSA = MSSA_;
MemorySSAUpdater MSSAU_(MSSA_);
- MSSAU = MSSA_ ? &MSSAU_ : nullptr;
- // If we don't have at least memset and memcpy, there is little point of doing
- // anything here. These are required by a freestanding implementation, so if
- // even they are disabled, there is no point in trying hard.
- if (!TLI->has(LibFunc_memset) || !TLI->has(LibFunc_memcpy))
- return false;
+ MSSAU = &MSSAU_;
while (true) {
if (!iterateOnFunction(F))
@@ -1766,10 +1601,9 @@ bool MemCpyOptPass::runImpl(Function &F, MemoryDependenceResults *MD_,
MadeChange = true;
}
- if (MSSA_ && VerifyMemorySSA)
+ if (VerifyMemorySSA)
MSSA_->verifyMemorySSA();
- MD = nullptr;
return MadeChange;
}
@@ -1778,17 +1612,11 @@ bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
- auto *MDWP = !EnableMemorySSA
- ? &getAnalysis<MemoryDependenceWrapperPass>()
- : getAnalysisIfAvailable<MemoryDependenceWrapperPass>();
auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *MSSAWP = EnableMemorySSA
- ? &getAnalysis<MemorySSAWrapperPass>()
- : getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ auto *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
- return Impl.runImpl(F, MDWP ? & MDWP->getMemDep() : nullptr, TLI, AA, AC, DT,
- MSSAWP ? &MSSAWP->getMSSA() : nullptr);
+ return Impl.runImpl(F, TLI, AA, AC, DT, MSSA);
}
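
With the MemoryDependenceAnalysis path removed, MemCpyOpt is now MemorySSA-only, bails out on scalable vector types, and only introduces memset/memcpy intrinsics when the corresponding libcalls are available or the new -enable-memcpyopt-without-libcalls flag is set. For orientation, the core store-merging transformation the pass performs, written as hypothetical source rather than IR:

  // Three adjacent stores of the same splat byte value; tryMergingIntoMemset
  // can replace them with a single memset(P, 0, 3 * sizeof(unsigned)) when
  // the memset libcall is available (or the flag above is set).
  void clearThree(unsigned *P) {
    P[0] = 0;
    P[1] = 0;
    P[2] = 0;
  }

  int main() {
    unsigned Buf[3] = {7, 7, 7};
    clearThree(Buf);
    return Buf[0] + Buf[1] + Buf[2]; // 0
  }
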
diff --git a/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/llvm/lib/Transforms/Scalar/MergeICmps.cpp
index f13f24ad2027..aac0deea5be3 100644
--- a/llvm/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/llvm/lib/Transforms/Scalar/MergeICmps.cpp
@@ -66,15 +66,6 @@ namespace {
#define DEBUG_TYPE "mergeicmps"
-// Returns true if the instruction is a simple load or a simple store
-static bool isSimpleLoadOrStore(const Instruction *I) {
- if (const LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->isSimple();
- if (const StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->isSimple();
- return false;
-}
-
// A BCE atom "Binary Compare Expression Atom" represents an integer load
// that is a constant offset from a base value, e.g. `a` or `o.c` in the example
// at the top.
@@ -154,6 +145,10 @@ BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) {
return {};
}
Value *const Addr = LoadI->getOperand(0);
+ if (Addr->getType()->getPointerAddressSpace() != 0) {
+ LLVM_DEBUG(dbgs() << "from non-zero AddressSpace\n");
+ return {};
+ }
auto *const GEP = dyn_cast<GetElementPtrInst>(Addr);
if (!GEP)
return {};
@@ -234,6 +229,8 @@ class BCECmpBlock {
InstructionSet BlockInsts;
// The block requires splitting.
bool RequireSplit = false;
+ // Original order of this block in the chain.
+ unsigned OrigOrder = 0;
private:
BCECmp Cmp;
@@ -244,14 +241,13 @@ bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst,
// If this instruction may clobber the loads and is in middle of the BCE cmp
// block instructions, then bail for now.
if (Inst->mayWriteToMemory()) {
- // Bail if this is not a simple load or store
- if (!isSimpleLoadOrStore(Inst))
- return false;
- // Disallow stores that might alias the BCE operands
- MemoryLocation LLoc = MemoryLocation::get(Cmp.Lhs.LoadI);
- MemoryLocation RLoc = MemoryLocation::get(Cmp.Rhs.LoadI);
- if (isModSet(AA.getModRefInfo(Inst, LLoc)) ||
- isModSet(AA.getModRefInfo(Inst, RLoc)))
+ auto MayClobber = [&](LoadInst *LI) {
+ // If a potentially clobbering instruction comes before the load,
+ // we can still safely sink the load.
+ return !Inst->comesBefore(LI) &&
+ isModSet(AA.getModRefInfo(Inst, MemoryLocation::get(LI)));
+ };
+ if (MayClobber(Cmp.Lhs.LoadI) || MayClobber(Cmp.Rhs.LoadI))
return false;
}
// Make sure this instruction does not use any of the BCE cmp block
@@ -386,39 +382,83 @@ static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons,
<< Comparison.Rhs().BaseId << " + "
<< Comparison.Rhs().Offset << "\n");
LLVM_DEBUG(dbgs() << "\n");
+ Comparison.OrigOrder = Comparisons.size();
Comparisons.push_back(std::move(Comparison));
}
// A chain of comparisons.
class BCECmpChain {
- public:
- BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
- AliasAnalysis &AA);
-
- int size() const { return Comparisons_.size(); }
+public:
+ using ContiguousBlocks = std::vector<BCECmpBlock>;
-#ifdef MERGEICMPS_DOT_ON
- void dump() const;
-#endif // MERGEICMPS_DOT_ON
+ BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
+ AliasAnalysis &AA);
bool simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA,
DomTreeUpdater &DTU);
-private:
- static bool IsContiguous(const BCECmpBlock &First,
- const BCECmpBlock &Second) {
- return First.Lhs().BaseId == Second.Lhs().BaseId &&
- First.Rhs().BaseId == Second.Rhs().BaseId &&
- First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset &&
- First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset;
+ bool atLeastOneMerged() const {
+ return any_of(MergedBlocks_,
+ [](const auto &Blocks) { return Blocks.size() > 1; });
}
+private:
PHINode &Phi_;
- std::vector<BCECmpBlock> Comparisons_;
+ // The list of all blocks in the chain, grouped by contiguity.
+ std::vector<ContiguousBlocks> MergedBlocks_;
// The original entry block (before sorting);
BasicBlock *EntryBlock_;
};
+static bool areContiguous(const BCECmpBlock &First, const BCECmpBlock &Second) {
+ return First.Lhs().BaseId == Second.Lhs().BaseId &&
+ First.Rhs().BaseId == Second.Rhs().BaseId &&
+ First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset &&
+ First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset;
+}
+
+static unsigned getMinOrigOrder(const BCECmpChain::ContiguousBlocks &Blocks) {
+ unsigned MinOrigOrder = std::numeric_limits<unsigned>::max();
+ for (const BCECmpBlock &Block : Blocks)
+ MinOrigOrder = std::min(MinOrigOrder, Block.OrigOrder);
+ return MinOrigOrder;
+}
+
+/// Given a chain of comparison blocks, groups the blocks into contiguous
+/// ranges that can be merged together into a single comparison.
+static std::vector<BCECmpChain::ContiguousBlocks>
+mergeBlocks(std::vector<BCECmpBlock> &&Blocks) {
+ std::vector<BCECmpChain::ContiguousBlocks> MergedBlocks;
+
+ // Sort to detect continuous offsets.
+ llvm::sort(Blocks,
+ [](const BCECmpBlock &LhsBlock, const BCECmpBlock &RhsBlock) {
+ return std::tie(LhsBlock.Lhs(), LhsBlock.Rhs()) <
+ std::tie(RhsBlock.Lhs(), RhsBlock.Rhs());
+ });
+
+ BCECmpChain::ContiguousBlocks *LastMergedBlock = nullptr;
+ for (BCECmpBlock &Block : Blocks) {
+ if (!LastMergedBlock || !areContiguous(LastMergedBlock->back(), Block)) {
+ MergedBlocks.emplace_back();
+ LastMergedBlock = &MergedBlocks.back();
+ } else {
+ LLVM_DEBUG(dbgs() << "Merging block " << Block.BB->getName() << " into "
+ << LastMergedBlock->back().BB->getName() << "\n");
+ }
+ LastMergedBlock->push_back(std::move(Block));
+ }
+
+ // While we allow reordering for merging, do not reorder unmerged comparisons.
+ // Doing so may introduce a branch on poison.
+ llvm::sort(MergedBlocks, [](const BCECmpChain::ContiguousBlocks &LhsBlocks,
+ const BCECmpChain::ContiguousBlocks &RhsBlocks) {
+ return getMinOrigOrder(LhsBlocks) < getMinOrigOrder(RhsBlocks);
+ });
+
+ return MergedBlocks;
+}
+
BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
AliasAnalysis &AA)
: Phi_(Phi) {
@@ -498,47 +538,9 @@ BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
return;
}
EntryBlock_ = Comparisons[0].BB;
- Comparisons_ = std::move(Comparisons);
-#ifdef MERGEICMPS_DOT_ON
- errs() << "BEFORE REORDERING:\n\n";
- dump();
-#endif // MERGEICMPS_DOT_ON
- // Reorder blocks by LHS. We can do that without changing the
- // semantics because we are only accessing dereferencable memory.
- llvm::sort(Comparisons_,
- [](const BCECmpBlock &LhsBlock, const BCECmpBlock &RhsBlock) {
- return std::tie(LhsBlock.Lhs(), LhsBlock.Rhs()) <
- std::tie(RhsBlock.Lhs(), RhsBlock.Rhs());
- });
-#ifdef MERGEICMPS_DOT_ON
- errs() << "AFTER REORDERING:\n\n";
- dump();
-#endif // MERGEICMPS_DOT_ON
+ MergedBlocks_ = mergeBlocks(std::move(Comparisons));
}
-#ifdef MERGEICMPS_DOT_ON
-void BCECmpChain::dump() const {
- errs() << "digraph dag {\n";
- errs() << " graph [bgcolor=transparent];\n";
- errs() << " node [color=black,style=filled,fillcolor=lightyellow];\n";
- errs() << " edge [color=black];\n";
- for (size_t I = 0; I < Comparisons_.size(); ++I) {
- const auto &Comparison = Comparisons_[I];
- errs() << " \"" << I << "\" [label=\"%"
- << Comparison.Lhs().Base()->getName() << " + "
- << Comparison.Lhs().Offset << " == %"
- << Comparison.Rhs().Base()->getName() << " + "
- << Comparison.Rhs().Offset << " (" << (Comparison.SizeBits() / 8)
- << " bytes)\"];\n";
- const Value *const Val = Phi_.getIncomingValueForBlock(Comparison.BB);
- if (I > 0) errs() << " \"" << (I - 1) << "\" -> \"" << I << "\";\n";
- errs() << " \"" << I << "\" -> \"Phi\" [label=\"" << *Val << "\"];\n";
- }
- errs() << " \"Phi\" [label=\"Phi\"];\n";
- errs() << "}\n\n";
-}
-#endif // MERGEICMPS_DOT_ON
-
namespace {
// A class to compute the name of a set of merged basic blocks.
@@ -661,47 +663,18 @@ static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
bool BCECmpChain::simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA,
DomTreeUpdater &DTU) {
- assert(Comparisons_.size() >= 2 && "simplifying trivial BCECmpChain");
- // First pass to check if there is at least one merge. If not, we don't do
- // anything and we keep analysis passes intact.
- const auto AtLeastOneMerged = [this]() {
- for (size_t I = 1; I < Comparisons_.size(); ++I) {
- if (IsContiguous(Comparisons_[I - 1], Comparisons_[I]))
- return true;
- }
- return false;
- };
- if (!AtLeastOneMerged())
- return false;
-
+ assert(atLeastOneMerged() && "simplifying trivial BCECmpChain");
LLVM_DEBUG(dbgs() << "Simplifying comparison chain starting at block "
<< EntryBlock_->getName() << "\n");
// Effectively merge blocks. We go in the reverse direction from the phi block
// so that the next block is always available to branch to.
- const auto mergeRange = [this, &TLI, &AA, &DTU](int I, int Num,
- BasicBlock *InsertBefore,
- BasicBlock *Next) {
- return mergeComparisons(makeArrayRef(Comparisons_).slice(I, Num),
- InsertBefore, Next, Phi_, TLI, AA, DTU);
- };
- int NumMerged = 1;
+ BasicBlock *InsertBefore = EntryBlock_;
BasicBlock *NextCmpBlock = Phi_.getParent();
- for (int I = static_cast<int>(Comparisons_.size()) - 2; I >= 0; --I) {
- if (IsContiguous(Comparisons_[I], Comparisons_[I + 1])) {
- LLVM_DEBUG(dbgs() << "Merging block " << Comparisons_[I].BB->getName()
- << " into " << Comparisons_[I + 1].BB->getName()
- << "\n");
- ++NumMerged;
- } else {
- NextCmpBlock = mergeRange(I + 1, NumMerged, NextCmpBlock, NextCmpBlock);
- NumMerged = 1;
- }
+ for (const auto &Blocks : reverse(MergedBlocks_)) {
+ InsertBefore = NextCmpBlock = mergeComparisons(
+ Blocks, InsertBefore, NextCmpBlock, Phi_, TLI, AA, DTU);
}
- // Insert the entry block for the new chain before the old entry block.
- // If the old entry block was the function entry, this ensures that the new
- // entry can become the function entry.
- NextCmpBlock = mergeRange(0, NumMerged, EntryBlock_, NextCmpBlock);
// Replace the original cmp chain with the new cmp chain by pointing all
// predecessors of EntryBlock_ to NextCmpBlock instead. This makes all cmp
@@ -729,13 +702,16 @@ bool BCECmpChain::simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA,
// Delete merged blocks. This also removes incoming values in phi.
SmallVector<BasicBlock *, 16> DeadBlocks;
- for (auto &Cmp : Comparisons_) {
- LLVM_DEBUG(dbgs() << "Deleting merged block " << Cmp.BB->getName() << "\n");
- DeadBlocks.push_back(Cmp.BB);
+ for (const auto &Blocks : MergedBlocks_) {
+ for (const BCECmpBlock &Block : Blocks) {
+ LLVM_DEBUG(dbgs() << "Deleting merged block " << Block.BB->getName()
+ << "\n");
+ DeadBlocks.push_back(Block.BB);
+ }
}
DeleteDeadBlocks(DeadBlocks, &DTU);
- Comparisons_.clear();
+ MergedBlocks_.clear();
return true;
}
@@ -835,8 +811,8 @@ bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI, AliasAnalysis &AA,
if (Blocks.empty()) return false;
BCECmpChain CmpChain(Blocks, Phi, AA);
- if (CmpChain.size() < 2) {
- LLVM_DEBUG(dbgs() << "skip: only one compare block\n");
+ if (!CmpChain.atLeastOneMerged()) {
+ LLVM_DEBUG(dbgs() << "skip: nothing merged\n");
return false;
}
@@ -862,9 +838,9 @@ static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
bool MadeChange = false;
- for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {
+ for (BasicBlock &BB : llvm::drop_begin(F)) {
// A Phi operation is always first in a basic block.
- if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin()))
+ if (auto *const Phi = dyn_cast<PHINode>(&*BB.begin()))
MadeChange |= processPhi(*Phi, TLI, AA, DTU);
}
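
The MergeICmps rewrite groups comparison blocks into contiguous runs (ContiguousBlocks), records each block's original position in OrigOrder so that unmerged comparisons keep their original order and no branch on poison is introduced, and then merges each run separately. The kind of chain the pass targets, as illustrative C++ that is not taken from the patch:

  #include <iostream>

  struct Point { int X; int Y; int Z; };

  // Three field-by-field integer compares at contiguous offsets; mergeicmps
  // can collapse the whole chain into a single memcmp(&A, &B, sizeof(Point)).
  bool samePoint(const Point &A, const Point &B) {
    return A.X == B.X && A.Y == B.Y && A.Z == B.Z;
  }

  int main() {
    Point A{1, 2, 3}, B{1, 2, 3};
    std::cout << (samePoint(A, B) ? "equal" : "different") << "\n";
  }
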
diff --git a/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 033fc168a67f..734532a6670c 100644
--- a/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -420,3 +420,12 @@ MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) {
PA.preserveSet<CFGAnalyses>();
return PA;
}
+
+void MergedLoadStoreMotionPass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<MergedLoadStoreMotionPass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+ OS << "<";
+ OS << (Options.SplitFooterBB ? "" : "no-") << "split-footer-bb";
+ OS << ">";
+}
diff --git a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index ded5caf53b5a..6dca30d9876e 100644
--- a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -282,8 +282,12 @@ NaryReassociatePass::matchAndReassociateMinOrMax(Instruction *I,
m_Value(LHS), m_Value(RHS));
if (match(I, MinMaxMatcher)) {
OrigSCEV = SE->getSCEV(I);
- return dyn_cast_or_null<Instruction>(
- tryReassociateMinOrMax(I, MinMaxMatcher, LHS, RHS));
+ if (auto *NewMinMax = dyn_cast_or_null<Instruction>(
+ tryReassociateMinOrMax(I, MinMaxMatcher, LHS, RHS)))
+ return NewMinMax;
+ if (auto *NewMinMax = dyn_cast_or_null<Instruction>(
+ tryReassociateMinOrMax(I, MinMaxMatcher, RHS, LHS)))
+ return NewMinMax;
}
return nullptr;
}
@@ -596,58 +600,60 @@ Value *NaryReassociatePass::tryReassociateMinOrMax(Instruction *I,
Value *LHS, Value *RHS) {
Value *A = nullptr, *B = nullptr;
MaxMinT m_MaxMin(m_Value(A), m_Value(B));
- for (unsigned int i = 0; i < 2; ++i) {
- if (!LHS->hasNUsesOrMore(3) && match(LHS, m_MaxMin)) {
- const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B);
- const SCEV *RHSExpr = SE->getSCEV(RHS);
- for (unsigned int j = 0; j < 2; ++j) {
- if (j == 0) {
- if (BExpr == RHSExpr)
- continue;
- // Transform 'I = (A op B) op RHS' to 'I = (A op RHS) op B' on the
- // first iteration.
- std::swap(BExpr, RHSExpr);
- } else {
- if (AExpr == RHSExpr)
- continue;
- // Transform 'I = (A op RHS) op B' 'I = (B op RHS) op A' on the second
- // iteration.
- std::swap(AExpr, RHSExpr);
- }
-
- // The optimization is profitable only if LHS can be removed in the end.
- // In other words LHS should be used (directly or indirectly) by I only.
- if (llvm::any_of(LHS->users(), [&](auto *U) {
- return U != I && !(U->hasOneUser() && *U->users().begin() == I);
- }))
- continue;
-
- SCEVExpander Expander(*SE, *DL, "nary-reassociate");
- SmallVector<const SCEV *, 2> Ops1{ BExpr, AExpr };
- const SCEVTypes SCEVType = convertToSCEVype(m_MaxMin);
- const SCEV *R1Expr = SE->getMinMaxExpr(SCEVType, Ops1);
-
- Instruction *R1MinMax = findClosestMatchingDominator(R1Expr, I);
-
- if (!R1MinMax)
- continue;
-
- LLVM_DEBUG(dbgs() << "NARY: Found common sub-expr: " << *R1MinMax
- << "\n");
-
- R1Expr = SE->getUnknown(R1MinMax);
- SmallVector<const SCEV *, 2> Ops2{ RHSExpr, R1Expr };
- const SCEV *R2Expr = SE->getMinMaxExpr(SCEVType, Ops2);
-
- Value *NewMinMax = Expander.expandCodeFor(R2Expr, I->getType(), I);
- NewMinMax->setName(Twine(I->getName()).concat(".nary"));
-
- LLVM_DEBUG(dbgs() << "NARY: Deleting: " << *I << "\n"
- << "NARY: Inserting: " << *NewMinMax << "\n");
- return NewMinMax;
- }
- }
- std::swap(LHS, RHS);
+
+ if (LHS->hasNUsesOrMore(3) ||
+ // The optimization is profitable only if LHS can be removed in the end.
+ // In other words, LHS should be used (directly or indirectly) by I only.
+ llvm::any_of(LHS->users(),
+ [&](auto *U) {
+ return U != I &&
+ !(U->hasOneUser() && *U->users().begin() == I);
+ }) ||
+ !match(LHS, m_MaxMin))
+ return nullptr;
+
+ auto tryCombination = [&](Value *A, const SCEV *AExpr, Value *B,
+ const SCEV *BExpr, Value *C,
+ const SCEV *CExpr) -> Value * {
+ SmallVector<const SCEV *, 2> Ops1{BExpr, AExpr};
+ const SCEVTypes SCEVType = convertToSCEVype(m_MaxMin);
+ const SCEV *R1Expr = SE->getMinMaxExpr(SCEVType, Ops1);
+
+ Instruction *R1MinMax = findClosestMatchingDominator(R1Expr, I);
+
+ if (!R1MinMax)
+ return nullptr;
+
+ LLVM_DEBUG(dbgs() << "NARY: Found common sub-expr: " << *R1MinMax << "\n");
+
+ SmallVector<const SCEV *, 2> Ops2{SE->getUnknown(C),
+ SE->getUnknown(R1MinMax)};
+ const SCEV *R2Expr = SE->getMinMaxExpr(SCEVType, Ops2);
+
+ SCEVExpander Expander(*SE, *DL, "nary-reassociate");
+ Value *NewMinMax = Expander.expandCodeFor(R2Expr, I->getType(), I);
+ NewMinMax->setName(Twine(I->getName()).concat(".nary"));
+
+ LLVM_DEBUG(dbgs() << "NARY: Deleting: " << *I << "\n"
+ << "NARY: Inserting: " << *NewMinMax << "\n");
+ return NewMinMax;
+ };
+
+ const SCEV *AExpr = SE->getSCEV(A);
+ const SCEV *BExpr = SE->getSCEV(B);
+ const SCEV *RHSExpr = SE->getSCEV(RHS);
+
+ if (BExpr != RHSExpr) {
+ // Try (A op RHS) op B
+ if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
+ return NewMinMax;
+ }
+
+ if (AExpr != RHSExpr) {
+ // Try (RHS op B) op A
+ if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
+ return NewMinMax;
}
+
return nullptr;
}
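
The rewrite above now tries both (A op RHS) op B and (RHS op B) op A, looking for an already-existing dominating min/max to reuse. As a hypothetical source-level sketch of why the reassociation pays off (plain C++ for intuition only, not the SCEV-based implementation):

    #include <algorithm>

    // If a value equivalent to max(a, c) already dominates the expression,
    // max(max(a, b), c) can be reassociated to max(max(a, c), b) and the
    // existing value reused; both forms compute max(a, b, c).
    int reuse(int a, int b, int c) {
      int m1 = std::max(a, c);               // pre-existing dominating max
      int m2 = std::max(std::max(a, b), c);  // reassociates to std::max(m1, b)
      return m1 + m2;
    }
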
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index a137d13c6ea0..91215cd19e2b 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -1194,9 +1194,10 @@ NewGVN::ExprResult NewGVN::createExpression(Instruction *I) const {
SimplifyCastInst(CI->getOpcode(), E->getOperand(0), CI->getType(), SQ);
if (auto Simplified = checkExprResults(E, I, V))
return Simplified;
- } else if (isa<GetElementPtrInst>(I)) {
- Value *V = SimplifyGEPInst(
- E->getType(), ArrayRef<Value *>(E->op_begin(), E->op_end()), SQ);
+ } else if (auto *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+ Value *V = SimplifyGEPInst(GEPI->getSourceElementType(),
+ ArrayRef<Value *>(E->op_begin(), E->op_end()),
+ GEPI->isInBounds(), SQ);
if (auto Simplified = checkExprResults(E, I, V))
return Simplified;
} else if (AllConstant) {
@@ -1818,7 +1819,7 @@ NewGVN::ExprResult NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
// See if we know something about the comparison itself, like it is the target
// of an assume.
auto *CmpPI = PredInfo->getPredicateInfoFor(I);
- if (dyn_cast_or_null<PredicateAssume>(CmpPI))
+ if (isa_and_nonnull<PredicateAssume>(CmpPI))
return ExprResult::some(
createConstantExpression(ConstantInt::getTrue(CI->getType())));
@@ -3606,7 +3607,7 @@ void NewGVN::convertClassToDFSOrdered(
// Skip uses in unreachable blocks, as we're going
// to delete them.
- if (ReachableBlocks.count(IBlock) == 0)
+ if (!ReachableBlocks.contains(IBlock))
continue;
DomTreeNode *DomNode = DT->getNode(IBlock);
diff --git a/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 7872c553b412..44027ccd92ca 100644
--- a/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -82,7 +82,7 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
// Add attribute "readnone" so that backend can use a native sqrt instruction
// for this call.
- Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
+ Call->addFnAttr(Attribute::ReadNone);
// Insert a FP compare instruction and use it as the CurrBB branch condition.
Builder.SetInsertPoint(CurrBBTerm);
diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp
index 888edc4d69a8..b0fb8daaba8f 100644
--- a/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -140,7 +140,7 @@ XorOpnd::XorOpnd(Value *V) {
// view the operand as "V | 0"
SymbolicPart = V;
- ConstPart = APInt::getNullValue(V->getType()->getScalarSizeInBits());
+ ConstPart = APInt::getZero(V->getType()->getScalarSizeInBits());
isOr = true;
}
@@ -1279,10 +1279,10 @@ static Value *OptimizeAndOrXor(unsigned Opcode,
/// be returned.
static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
const APInt &ConstOpnd) {
- if (ConstOpnd.isNullValue())
+ if (ConstOpnd.isZero())
return nullptr;
- if (ConstOpnd.isAllOnesValue())
+ if (ConstOpnd.isAllOnes())
return Opnd;
Instruction *I = BinaryOperator::CreateAnd(
@@ -1304,7 +1304,7 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
// = ((x | c1) ^ c1) ^ (c1 ^ c2)
// = (x & ~c1) ^ (c1 ^ c2)
// It is useful only when c1 == c2.
- if (!Opnd1->isOrExpr() || Opnd1->getConstPart().isNullValue())
+ if (!Opnd1->isOrExpr() || Opnd1->getConstPart().isZero())
return false;
if (!Opnd1->getValue()->hasOneUse())
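
A quick numeric check of the Xor rule quoted in the comment above, taking c1 == c2 (the profitable case, where the (c1 ^ c2) term vanishes); a standalone C++ sketch:

    // (x | c1) ^ c2 == (x & ~c1) ^ (c1 ^ c2); with c1 == c2 this is just x & ~c1.
    constexpr unsigned x = 0b1010, c1 = 0b0110, c2 = 0b0110;
    static_assert(((x | c1) ^ c2) == ((x & ~c1) ^ (c1 ^ c2)), "general identity");
    static_assert(((x | c1) ^ c2) == (x & ~c1), "simplified form when c1 == c2");
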
@@ -1361,7 +1361,7 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
APInt C3((~C1) ^ C2);
// Do not increase code size!
- if (!C3.isNullValue() && !C3.isAllOnesValue()) {
+ if (!C3.isZero() && !C3.isAllOnes()) {
int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;
if (NewInstNum > DeadInstNum)
return false;
@@ -1377,7 +1377,7 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
APInt C3 = C1 ^ C2;
// Do not increase code size
- if (!C3.isNullValue() && !C3.isAllOnesValue()) {
+ if (!C3.isZero() && !C3.isAllOnes()) {
int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;
if (NewInstNum > DeadInstNum)
return false;
@@ -1468,8 +1468,7 @@ Value *ReassociatePass::OptimizeXor(Instruction *I,
Value *CV;
// Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd"
- if (!ConstOpnd.isNullValue() &&
- CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
+ if (!ConstOpnd.isZero() && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
Changed = true;
if (CV)
*CurrOpnd = XorOpnd(CV);
@@ -1510,7 +1509,7 @@ Value *ReassociatePass::OptimizeXor(Instruction *I,
ValueEntry VE(getRank(O.getValue()), O.getValue());
Ops.push_back(VE);
}
- if (!ConstOpnd.isNullValue()) {
+ if (!ConstOpnd.isZero()) {
Value *C = ConstantInt::get(Ty, ConstOpnd);
ValueEntry VE(getRank(C), C);
Ops.push_back(VE);
@@ -1519,7 +1518,7 @@ Value *ReassociatePass::OptimizeXor(Instruction *I,
if (Sz == 1)
return Ops.back().Op;
if (Sz == 0) {
- assert(ConstOpnd.isNullValue());
+ assert(ConstOpnd.isZero());
return ConstantInt::get(Ty, ConstOpnd);
}
}
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index bc0fecc972fc..2d3490b2d29e 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -755,7 +755,7 @@ public:
}
bool operator==(const BDVState &Other) const {
- return OriginalValue == OriginalValue && BaseValue == Other.BaseValue &&
+ return OriginalValue == Other.OriginalValue && BaseValue == Other.BaseValue &&
Status == Other.Status;
}
@@ -910,7 +910,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
#ifndef NDEBUG
VerifyStates();
LLVM_DEBUG(dbgs() << "States after initialization:\n");
- for (auto Pair : States) {
+ for (const auto &Pair : States) {
LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
}
#endif
@@ -1002,7 +1002,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
#ifndef NDEBUG
VerifyStates();
LLVM_DEBUG(dbgs() << "States after meet iteration:\n");
- for (auto Pair : States) {
+ for (const auto &Pair : States) {
LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
}
#endif
@@ -1163,7 +1163,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
// llvm::Value of the correct type (and still remain pure).
// This will remove the need to add bitcasts.
assert(Base->stripPointerCasts() == OldBase->stripPointerCasts() &&
- "Sanity -- findBaseOrBDV should be pure!");
+ "findBaseOrBDV should be pure!");
#endif
}
Value *Base = BlockToValue[InBB];
@@ -1377,11 +1377,11 @@ static AttributeList legalizeCallAttributes(LLVMContext &Ctx,
return AL;
// Remove the readonly, readnone, and statepoint function attributes.
- AttrBuilder FnAttrs = AL.getFnAttributes();
+ AttrBuilder FnAttrs = AL.getFnAttrs();
for (auto Attr : FnAttrsToStrip)
FnAttrs.removeAttribute(Attr);
- for (Attribute A : AL.getFnAttributes()) {
+ for (Attribute A : AL.getFnAttrs()) {
if (isStatepointDirectiveAttr(A))
FnAttrs.remove(A);
}
@@ -1533,9 +1533,8 @@ static StringRef getDeoptLowering(CallBase *Call) {
// FIXME: Calls have a *really* confusing interface around attributes
// with values.
const AttributeList &CSAS = Call->getAttributes();
- if (CSAS.hasAttribute(AttributeList::FunctionIndex, DeoptLowering))
- return CSAS.getAttribute(AttributeList::FunctionIndex, DeoptLowering)
- .getValueAsString();
+ if (CSAS.hasFnAttr(DeoptLowering))
+ return CSAS.getFnAttr(DeoptLowering).getValueAsString();
Function *F = Call->getCalledFunction();
assert(F && F->hasFnAttribute(DeoptLowering));
return F->getFnAttribute(DeoptLowering).getValueAsString();
@@ -1801,7 +1800,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
CallInst *GCResult = Builder.CreateGCResult(Token, Call->getType(), Name);
GCResult->setAttributes(
AttributeList::get(GCResult->getContext(), AttributeList::ReturnIndex,
- Call->getAttributes().getRetAttributes()));
+ Call->getAttributes().getRetAttrs()));
// We cannot RAUW or delete CS.getInstruction() because it could be in the
// live set of some other safepoint, in which case that safepoint's
@@ -1855,7 +1854,7 @@ makeStatepointExplicit(DominatorTree &DT, CallBase *Call,
// It receives an iterator over the statepoint gc relocates and emits a store to the
// assigned location (via allocaMap) for each one of them. It adds the
// visited values into the visitedLiveValues set, which we will later use
-// for sanity checking.
+// for validation checking.
static void
insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,
DenseMap<Value *, AllocaInst *> &AllocaMap,
@@ -2454,7 +2453,7 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
SmallVectorImpl<CallBase *> &ToUpdate,
DefiningValueMapTy &DVCache) {
#ifndef NDEBUG
- // sanity check the input
+ // Validate the input
std::set<CallBase *> Uniqued;
Uniqued.insert(ToUpdate.begin(), ToUpdate.end());
assert(Uniqued.size() == ToUpdate.size() && "no duplicates please!");
@@ -2620,9 +2619,9 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
// we just grab that.
llvm::append_range(Live, Info.StatepointToken->gc_args());
#ifndef NDEBUG
- // Do some basic sanity checks on our liveness results before performing
- // relocation. Relocation can and will turn mistakes in liveness results
- // into non-sensical code which is must harder to debug.
+ // Do some basic validation checking on our liveness results before
+ // performing relocation. Relocation can and will turn mistakes in liveness
+ // results into nonsensical code which is much harder to debug.
// TODO: It would be nice to test consistency as well
assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) &&
"statepoint must be reachable or liveness is meaningless");
@@ -2641,7 +2640,7 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
unique_unsorted(Live);
#ifndef NDEBUG
- // sanity check
+ // Validation check
for (auto *Ptr : Live)
assert(isHandledGCPointerType(Ptr->getType()) &&
"must be a gc pointer type");
@@ -2656,18 +2655,19 @@ template <typename AttrHolder>
static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
unsigned Index) {
AttrBuilder R;
- if (AH.getDereferenceableBytes(Index))
+ AttributeSet AS = AH.getAttributes().getAttributes(Index);
+ if (AS.getDereferenceableBytes())
R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable,
- AH.getDereferenceableBytes(Index)));
- if (AH.getDereferenceableOrNullBytes(Index))
+ AS.getDereferenceableBytes()));
+ if (AS.getDereferenceableOrNullBytes())
R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull,
- AH.getDereferenceableOrNullBytes(Index)));
+ AS.getDereferenceableOrNullBytes()));
for (auto Attr : ParamAttrsToStrip)
- if (AH.getAttributes().hasAttribute(Index, Attr))
+ if (AS.hasAttribute(Attr))
R.addAttribute(Attr);
if (!R.empty())
- AH.setAttributes(AH.getAttributes().removeAttributes(Ctx, Index, R));
+ AH.setAttributes(AH.getAttributes().removeAttributesAtIndex(Ctx, Index, R));
}
static void stripNonValidAttributesFromPrototype(Function &F) {
@@ -3016,7 +3016,7 @@ static SetVector<Value *> computeKillSet(BasicBlock *BB) {
#ifndef NDEBUG
/// Check that the items in 'Live' dominate 'TI'. This is used as a basic
-/// sanity check for the liveness computation.
+/// validation check for the liveness computation.
static void checkBasicSSA(DominatorTree &DT, SetVector<Value *> &Live,
Instruction *TI, bool TermOkay = false) {
for (Value *V : Live) {
@@ -3103,7 +3103,7 @@ static void computeLiveInValues(DominatorTree &DT, Function &F,
} // while (!Worklist.empty())
#ifndef NDEBUG
- // Sanity check our output against SSA properties. This helps catch any
+ // Verify our output against SSA properties. This helps catch any
// missing kills during the above iteration.
for (BasicBlock &BB : F)
checkBasicSSA(DT, Data, BB);
diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp
index b09f896d0157..28e00c873361 100644
--- a/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -490,17 +490,17 @@ bool llvm::runIPSCCP(
AttrBuilder AttributesToRemove;
AttributesToRemove.addAttribute(Attribute::ArgMemOnly);
AttributesToRemove.addAttribute(Attribute::InaccessibleMemOrArgMemOnly);
- F.removeAttributes(AttributeList::FunctionIndex, AttributesToRemove);
+ F.removeFnAttrs(AttributesToRemove);
for (User *U : F.users()) {
auto *CB = dyn_cast<CallBase>(U);
if (!CB || CB->getCalledFunction() != &F)
continue;
- CB->removeAttributes(AttributeList::FunctionIndex,
- AttributesToRemove);
+ CB->removeFnAttrs(AttributesToRemove);
}
}
+ MadeChanges |= ReplacedPointerArg;
}
SmallPtrSet<Value *, 32> InsertedValues;
@@ -540,14 +540,13 @@ bool llvm::runIPSCCP(
DTU.deleteBB(DeadBB);
for (BasicBlock &BB : F) {
- for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) {
- Instruction *Inst = &*BI++;
- if (Solver.getPredicateInfoFor(Inst)) {
- if (auto *II = dyn_cast<IntrinsicInst>(Inst)) {
+ for (Instruction &Inst : llvm::make_early_inc_range(BB)) {
+ if (Solver.getPredicateInfoFor(&Inst)) {
+ if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) {
if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
Value *Op = II->getOperand(0);
- Inst->replaceAllUsesWith(Op);
- Inst->eraseFromParent();
+ Inst.replaceAllUsesWith(Op);
+ Inst.eraseFromParent();
}
}
}
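
This and several later hunks switch manual pre-increment iterator loops to llvm::make_early_inc_range. A minimal standalone sketch of the idiom (the dead-instruction predicate here is only illustrative):

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instruction.h"
    using namespace llvm;

    // make_early_inc_range advances its internal iterator before yielding each
    // element, so the loop body may erase the current instruction safely.
    static void eraseTriviallyDead(BasicBlock &BB) {
      for (Instruction &I : make_early_inc_range(BB))
        if (I.use_empty() && !I.isTerminator() && !I.mayHaveSideEffects())
          I.eraseFromParent();
    }
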
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 5ec01454e5b2..31c8999c3724 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -122,7 +122,7 @@ namespace {
class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
std::string Prefix;
- const Twine getNameWithPrefix(const Twine &Name) const {
+ Twine getNameWithPrefix(const Twine &Name) const {
return Name.isTriviallyEmpty() ? Name : Prefix + Name;
}
@@ -1275,8 +1275,7 @@ static void speculatePHINodeLoads(PHINode &PN) {
// Get the AA tags and alignment to use from one of the loads. It does not
// matter which one we get and if any differ.
- AAMDNodes AATags;
- SomeLoad->getAAMetadata(AATags);
+ AAMDNodes AATags = SomeLoad->getAAMetadata();
Align Alignment = SomeLoad->getAlign();
// Rewrite all loads of the PN to use the new PHI.
@@ -1330,14 +1329,21 @@ static void speculatePHINodeLoads(PHINode &PN) {
/// %V = select i1 %cond, i32 %V1, i32 %V2
///
/// We can do this to a select if its only uses are loads and if the operand
-/// to the select can be loaded unconditionally.
+/// to the select can be loaded unconditionally. If an intervening bitcast with
+/// a single use of the load is found, allow the promotion.
static bool isSafeSelectToSpeculate(SelectInst &SI) {
Value *TValue = SI.getTrueValue();
Value *FValue = SI.getFalseValue();
const DataLayout &DL = SI.getModule()->getDataLayout();
for (User *U : SI.users()) {
- LoadInst *LI = dyn_cast<LoadInst>(U);
+ LoadInst *LI;
+ BitCastInst *BC = dyn_cast<BitCastInst>(U);
+ if (BC && BC->hasOneUse())
+ LI = dyn_cast<LoadInst>(*BC->user_begin());
+ else
+ LI = dyn_cast<LoadInst>(U);
+
if (!LI || !LI->isSimple())
return false;
@@ -1363,13 +1369,27 @@ static void speculateSelectInstLoads(SelectInst &SI) {
Value *FV = SI.getFalseValue();
// Replace the loads of the select with a select of two loads.
while (!SI.use_empty()) {
- LoadInst *LI = cast<LoadInst>(SI.user_back());
+ LoadInst *LI;
+ BitCastInst *BC = dyn_cast<BitCastInst>(SI.user_back());
+ if (BC) {
+ assert(BC->hasOneUse() && "Bitcast should have a single use.");
+ LI = cast<LoadInst>(BC->user_back());
+ } else {
+ LI = cast<LoadInst>(SI.user_back());
+ }
+
assert(LI->isSimple() && "We only speculate simple loads");
IRB.SetInsertPoint(LI);
- LoadInst *TL = IRB.CreateLoad(LI->getType(), TV,
+ Value *NewTV =
+ BC ? IRB.CreateBitCast(TV, BC->getType(), TV->getName() + ".sroa.cast")
+ : TV;
+ Value *NewFV =
+ BC ? IRB.CreateBitCast(FV, BC->getType(), FV->getName() + ".sroa.cast")
+ : FV;
+ LoadInst *TL = IRB.CreateLoad(LI->getType(), NewTV,
LI->getName() + ".sroa.speculate.load.true");
- LoadInst *FL = IRB.CreateLoad(LI->getType(), FV,
+ LoadInst *FL = IRB.CreateLoad(LI->getType(), NewFV,
LI->getName() + ".sroa.speculate.load.false");
NumLoadsSpeculated += 2;
@@ -1377,8 +1397,7 @@ static void speculateSelectInstLoads(SelectInst &SI) {
TL->setAlignment(LI->getAlign());
FL->setAlignment(LI->getAlign());
- AAMDNodes Tags;
- LI->getAAMetadata(Tags);
+ AAMDNodes Tags = LI->getAAMetadata();
if (Tags) {
TL->setAAMetadata(Tags);
FL->setAAMetadata(Tags);
@@ -1390,6 +1409,8 @@ static void speculateSelectInstLoads(SelectInst &SI) {
LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
LI->replaceAllUsesWith(V);
LI->eraseFromParent();
+ if (BC)
+ BC->eraseFromParent();
}
SI.eraseFromParent();
}
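
A hypothetical source-level picture of the speculation performed above (plain C++ for intuition only; the pass rewrites IR, and the new code additionally looks through a single-use bitcast between the select and the load):

    // Before: one load through a selected pointer.
    int before(bool c, int *p, int *q) { return *(c ? p : q); }

    // After: both loads are speculated and the loaded values are selected.
    // Legal only when both pointers are safely dereferenceable, which is
    // what isSafeSelectToSpeculate checks.
    int after(bool c, int *p, int *q) { return c ? *p : *q; }
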
@@ -1462,76 +1483,6 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
return buildGEP(IRB, BasePtr, Indices, NamePrefix);
}
-/// Recursively compute indices for a natural GEP.
-///
-/// This is the recursive step for getNaturalGEPWithOffset that walks down the
-/// element types adding appropriate indices for the GEP.
-static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
- Value *Ptr, Type *Ty, APInt &Offset,
- Type *TargetTy,
- SmallVectorImpl<Value *> &Indices,
- const Twine &NamePrefix) {
- if (Offset == 0)
- return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices,
- NamePrefix);
-
- // We can't recurse through pointer types.
- if (Ty->isPointerTy())
- return nullptr;
-
- // We try to analyze GEPs over vectors here, but note that these GEPs are
- // extremely poorly defined currently. The long-term goal is to remove GEPing
- // over a vector from the IR completely.
- if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
- unsigned ElementSizeInBits =
- DL.getTypeSizeInBits(VecTy->getScalarType()).getFixedSize();
- if (ElementSizeInBits % 8 != 0) {
- // GEPs over non-multiple of 8 size vector elements are invalid.
- return nullptr;
- }
- APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
- APInt NumSkippedElements = Offset.sdiv(ElementSize);
- if (NumSkippedElements.ugt(cast<FixedVectorType>(VecTy)->getNumElements()))
- return nullptr;
- Offset -= NumSkippedElements * ElementSize;
- Indices.push_back(IRB.getInt(NumSkippedElements));
- return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(),
- Offset, TargetTy, Indices, NamePrefix);
- }
-
- if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
- Type *ElementTy = ArrTy->getElementType();
- APInt ElementSize(Offset.getBitWidth(),
- DL.getTypeAllocSize(ElementTy).getFixedSize());
- APInt NumSkippedElements = Offset.sdiv(ElementSize);
- if (NumSkippedElements.ugt(ArrTy->getNumElements()))
- return nullptr;
-
- Offset -= NumSkippedElements * ElementSize;
- Indices.push_back(IRB.getInt(NumSkippedElements));
- return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
- Indices, NamePrefix);
- }
-
- StructType *STy = dyn_cast<StructType>(Ty);
- if (!STy)
- return nullptr;
-
- const StructLayout *SL = DL.getStructLayout(STy);
- uint64_t StructOffset = Offset.getZExtValue();
- if (StructOffset >= SL->getSizeInBytes())
- return nullptr;
- unsigned Index = SL->getElementContainingOffset(StructOffset);
- Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
- Type *ElementTy = STy->getElementType(Index);
- if (Offset.uge(DL.getTypeAllocSize(ElementTy).getFixedSize()))
- return nullptr; // The offset points into alignment padding.
-
- Indices.push_back(IRB.getInt32(Index));
- return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
- Indices, NamePrefix);
-}
-
/// Get a natural GEP from a base pointer to a particular offset and
/// resulting in a particular type.
///
@@ -1556,18 +1507,15 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
Type *ElementTy = Ty->getElementType();
if (!ElementTy->isSized())
return nullptr; // We can't GEP through an unsized element.
- if (isa<ScalableVectorType>(ElementTy))
+
+ SmallVector<APInt> IntIndices = DL.getGEPIndicesForOffset(ElementTy, Offset);
+ if (Offset != 0)
return nullptr;
- APInt ElementSize(Offset.getBitWidth(),
- DL.getTypeAllocSize(ElementTy).getFixedSize());
- if (ElementSize == 0)
- return nullptr; // Zero-length arrays can't help us build a natural GEP.
- APInt NumSkippedElements = Offset.sdiv(ElementSize);
-
- Offset -= NumSkippedElements * ElementSize;
- Indices.push_back(IRB.getInt(NumSkippedElements));
- return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
- Indices, NamePrefix);
+
+ for (const APInt &Index : IntIndices)
+ Indices.push_back(IRB.getInt(Index));
+ return getNaturalGEPWithType(IRB, DL, Ptr, ElementTy, TargetTy, Indices,
+ NamePrefix);
}
/// Compute an adjusted pointer from Ptr by Offset bytes where the
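
Note on the replacement above: DataLayout::getGEPIndicesForOffset peels off as many structured indices as the element type allows and leaves any unconsumed remainder in Offset, which is why a non-zero Offset afterwards means no natural GEP exists and the function bails out.
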
@@ -1588,6 +1536,15 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
APInt Offset, Type *PointerTy,
const Twine &NamePrefix) {
+ // Create i8 GEP for opaque pointers.
+ if (Ptr->getType()->isOpaquePointerTy()) {
+ if (Offset != 0)
+ Ptr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Ptr, IRB.getInt(Offset),
+ NamePrefix + "sroa_idx");
+ return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
+ NamePrefix + "sroa_cast");
+ }
+
// Even though we don't look through PHI nodes, we could be called on an
// instruction in an unreachable block, which may be on a cycle.
SmallPtrSet<Value *, 4> Visited;
@@ -1851,13 +1808,13 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
return false;
- } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
- // Disable vector promotion when there are loads or stores of an FCA.
- return false;
} else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
if (LI->isVolatile())
return false;
Type *LTy = LI->getType();
+ // Disable vector promotion when there are loads or stores of an FCA.
+ if (LTy->isStructTy())
+ return false;
if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
assert(LTy->isIntegerTy());
LTy = SplitIntTy;
@@ -1868,6 +1825,9 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
if (SI->isVolatile())
return false;
Type *STy = SI->getValueOperand()->getType();
+ // Disable vector promotion when there are loads or stores of an FCA.
+ if (STy->isStructTy())
+ return false;
if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
assert(STy->isIntegerTy());
STy = SplitIntTy;
@@ -2282,7 +2242,7 @@ class llvm::sroa::AllocaSliceRewriter
const DataLayout &DL;
AllocaSlices &AS;
- SROA &Pass;
+ SROAPass &Pass;
AllocaInst &OldAI, &NewAI;
const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
Type *NewAllocaTy;
@@ -2330,7 +2290,7 @@ class llvm::sroa::AllocaSliceRewriter
IRBuilderTy IRB;
public:
- AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
+ AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROAPass &Pass,
AllocaInst &OldAI, AllocaInst &NewAI,
uint64_t NewAllocaBeginOffset,
uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
@@ -2510,8 +2470,7 @@ private:
Value *OldOp = LI.getOperand(0);
assert(OldOp == OldPtr);
- AAMDNodes AATags;
- LI.getAAMetadata(AATags);
+ AAMDNodes AATags = LI.getAAMetadata();
unsigned AS = LI.getPointerAddressSpace();
@@ -2675,9 +2634,7 @@ private:
Value *OldOp = SI.getOperand(1);
assert(OldOp == OldPtr);
- AAMDNodes AATags;
- SI.getAAMetadata(AATags);
-
+ AAMDNodes AATags = SI.getAAMetadata();
Value *V = SI.getValueOperand();
// Strip all inbounds GEPs and pointer casts to try to dig out any root
@@ -2743,7 +2700,9 @@ private:
deleteIfTriviallyDead(OldOp);
LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
- return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
+ return NewSI->getPointerOperand() == &NewAI &&
+ NewSI->getValueOperand()->getType() == NewAllocaTy &&
+ !SI.isVolatile();
}
/// Compute an integer value from splatting an i8 across the given
@@ -2784,8 +2743,7 @@ private:
LLVM_DEBUG(dbgs() << " original: " << II << "\n");
assert(II.getRawDest() == OldPtr);
- AAMDNodes AATags;
- II.getAAMetadata(AATags);
+ AAMDNodes AATags = II.getAAMetadata();
// If the memset has a variable size, it cannot be split, just adjust the
// pointer to the new alloca.
@@ -2811,10 +2769,11 @@ private:
if (BeginOffset > NewAllocaBeginOffset ||
EndOffset < NewAllocaEndOffset)
return false;
+ // Length must be in range for FixedVectorType.
auto *C = cast<ConstantInt>(II.getLength());
- if (C->getBitWidth() > 64)
+ const uint64_t Len = C->getLimitedValue();
+ if (Len > std::numeric_limits<unsigned>::max())
return false;
- const auto Len = C->getZExtValue();
auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
return canConvertValue(DL, SrcTy, AllocaTy) &&
@@ -2912,8 +2871,7 @@ private:
LLVM_DEBUG(dbgs() << " original: " << II << "\n");
- AAMDNodes AATags;
- II.getAAMetadata(AATags);
+ AAMDNodes AATags = II.getAAMetadata();
bool IsDest = &II.getRawDestUse() == OldUse;
assert((IsDest && II.getRawDest() == OldPtr) ||
@@ -3420,9 +3378,7 @@ private:
// We have an aggregate being loaded, split it apart.
LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
- AAMDNodes AATags;
- LI.getAAMetadata(AATags);
- LoadOpSplitter Splitter(&LI, *U, LI.getType(), AATags,
+ LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
getAdjustedAlignment(&LI, 0), DL);
Value *V = UndefValue::get(LI.getType());
Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
@@ -3473,9 +3429,7 @@ private:
// We have an aggregate being stored, split it apart.
LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
- AAMDNodes AATags;
- SI.getAAMetadata(AATags);
- StoreOpSplitter Splitter(&SI, *U, V->getType(), AATags,
+ StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(),
getAdjustedAlignment(&SI, 0), DL);
Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
Visited.erase(&SI);
@@ -3801,7 +3755,7 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
/// there all along.
///
/// \returns true if any changes are made.
-bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
+bool SROAPass::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
// Track the loads and stores which are candidates for pre-splitting here, in
@@ -4281,8 +4235,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
/// appropriate new offsets. It also evaluates how successful the rewrite was
/// at enabling promotion and if it was successful queues the alloca to be
/// promoted.
-AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
- Partition &P) {
+AllocaInst *SROAPass::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
+ Partition &P) {
// Try to compute a friendly type for this partition of the alloca. This
// won't always succeed, in which case we fall back to a legal integer type
// or an i8 array of an appropriate size.
@@ -4433,7 +4387,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
/// Walks the slices of an alloca and form partitions based on them,
/// rewriting each of their uses.
-bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
+bool SROAPass::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
if (AS.begin() == AS.end())
return false;
@@ -4604,7 +4558,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
}
/// Clobber a use with undef, deleting the used value if it becomes dead.
-void SROA::clobberUse(Use &U) {
+void SROAPass::clobberUse(Use &U) {
Value *OldV = U;
// Replace the use with an undef value.
U = UndefValue::get(OldV->getType());
@@ -4623,7 +4577,7 @@ void SROA::clobberUse(Use &U) {
/// This analyzes the alloca to ensure we can reason about it, builds
/// the slices of the alloca, and then hands it off to be split and
/// rewritten as needed.
-bool SROA::runOnAlloca(AllocaInst &AI) {
+bool SROAPass::runOnAlloca(AllocaInst &AI) {
LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
++NumAllocasAnalyzed;
@@ -4697,7 +4651,7 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
///
/// We also record the alloca instructions deleted here so that they aren't
/// subsequently handed to mem2reg to promote.
-bool SROA::deleteDeadInstructions(
+bool SROAPass::deleteDeadInstructions(
SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
bool Changed = false;
while (!DeadInsts.empty()) {
@@ -4736,7 +4690,7 @@ bool SROA::deleteDeadInstructions(
/// This attempts to promote whatever allocas have been identified as viable in
/// the PromotableAllocas list. If that list is empty, there is nothing to do.
/// This function returns whether any promotion occurred.
-bool SROA::promoteAllocas(Function &F) {
+bool SROAPass::promoteAllocas(Function &F) {
if (PromotableAllocas.empty())
return false;
@@ -4748,8 +4702,8 @@ bool SROA::promoteAllocas(Function &F) {
return true;
}
-PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
- AssumptionCache &RunAC) {
+PreservedAnalyses SROAPass::runImpl(Function &F, DominatorTree &RunDT,
+ AssumptionCache &RunAC) {
LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
C = &F.getContext();
DT = &RunDT;
@@ -4803,7 +4757,7 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
return PA;
}
-PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) {
+PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
AM.getResult<AssumptionAnalysis>(F));
}
@@ -4814,7 +4768,7 @@ PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) {
/// SROA pass.
class llvm::sroa::SROALegacyPass : public FunctionPass {
/// The SROA implementation.
- SROA Impl;
+ SROAPass Impl;
public:
static char ID;
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index ca288a533f46..1284bae820a4 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -873,13 +873,11 @@ static bool runImpl(Function &F, const TargetTransformInfo &TTI,
auto &DL = F.getParent()->getDataLayout();
while (MadeChange) {
MadeChange = false;
- for (Function::iterator I = F.begin(); I != F.end();) {
- BasicBlock *BB = &*I++;
+ for (BasicBlock &BB : llvm::make_early_inc_range(F)) {
bool ModifiedDTOnIteration = false;
- MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration, TTI, DL,
+ MadeChange |= optimizeBlock(BB, ModifiedDTOnIteration, TTI, DL,
DTU.hasValue() ? DTU.getPointer() : nullptr);
-
// Restart BB iteration if the dominator tree of the Function was changed
if (ModifiedDTOnIteration)
break;
@@ -933,7 +931,7 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
if (II) {
// The scalarization code below does not work for scalable vectors.
if (isa<ScalableVectorType>(II->getType()) ||
- any_of(II->arg_operands(),
+ any_of(II->args(),
[](Value *V) { return isa<ScalableVectorType>(V->getType()); }))
return false;
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index 8ef6b69673be..6b7419abe1d1 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -66,6 +66,15 @@ static cl::opt<bool>
namespace {
+BasicBlock::iterator skipPastPhiNodesAndDbg(BasicBlock::iterator Itr) {
+ BasicBlock *BB = Itr->getParent();
+ if (isa<PHINode>(Itr))
+ Itr = BB->getFirstInsertionPt();
+ if (Itr != BB->end())
+ Itr = skipDebugIntrinsics(Itr);
+ return Itr;
+}
+
// Used to store the scattered form of a vector.
using ValueVector = SmallVector<Value *, 8>;
@@ -371,10 +380,11 @@ Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V) {
return Scatterer(Point->getParent(), Point->getIterator(),
UndefValue::get(V->getType()));
// Put the scattered form of an instruction directly after the
- // instruction.
+ // instruction, skipping over PHI nodes and debug intrinsics.
BasicBlock *BB = VOp->getParent();
- return Scatterer(BB, std::next(BasicBlock::iterator(VOp)),
- V, &Scattered[V]);
+ return Scatterer(
+ BB, skipPastPhiNodesAndDbg(std::next(BasicBlock::iterator(VOp))), V,
+ &Scattered[V]);
}
// In the fallback case, just put the scattered before Point and
// keep the result local to Point.
@@ -530,7 +540,7 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) {
return false;
unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- unsigned NumArgs = CI.getNumArgOperands();
+ unsigned NumArgs = CI.arg_size();
ValueVector ScalarOperands(NumArgs);
SmallVector<Scatterer, 8> Scattered(NumArgs);
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index f216956406b6..ffa2f9adb978 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -1164,8 +1164,11 @@ bool SeparateConstOffsetFromGEP::run(Function &F) {
DL = &F.getParent()->getDataLayout();
bool Changed = false;
for (BasicBlock &B : F) {
- for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;)
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++))
+ if (!DT->isReachableFromEntry(&B))
+ continue;
+
+ for (Instruction &I : llvm::make_early_inc_range(B))
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I))
Changed |= splitGEP(GEP);
// No need to split GEP ConstantExprs because all its indices are constant
// already.
@@ -1258,10 +1261,8 @@ bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
DominatingSubs.clear();
for (const auto Node : depth_first(DT)) {
BasicBlock *BB = Node->getBlock();
- for (auto I = BB->begin(); I != BB->end(); ) {
- Instruction *Cur = &*I++;
- Changed |= reuniteExts(Cur);
- }
+ for (Instruction &I : llvm::make_early_inc_range(*BB))
+ Changed |= reuniteExts(&I);
}
return Changed;
}
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index b9cccc2af309..a27da047bfd3 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -28,6 +28,7 @@
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -49,7 +50,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GenericDomTree.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -81,6 +81,7 @@ static cl::opt<bool> EnableNonTrivialUnswitch(
static cl::opt<int>
UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
+ cl::ZeroOrMore,
cl::desc("The cost threshold for unswitching a loop."));
static cl::opt<bool> EnableUnswitchCostMultiplier(
@@ -108,6 +109,10 @@ static cl::opt<unsigned>
cl::desc("Max number of memory uses to explore during "
"partial unswitching analysis"),
cl::init(100), cl::Hidden);
+static cl::opt<bool> FreezeLoopUnswitchCond(
+ "freeze-loop-unswitch-cond", cl::init(false), cl::Hidden,
+ cl::desc("If enabled, the freeze instruction will be added to condition "
+ "of loop unswitch to prevent miscompilation."));
/// Collect all of the loop invariant input values transitively used by the
/// homogeneous instruction graph from a given root.
@@ -195,15 +200,15 @@ static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
/// Copy a set of loop invariant values \p ToDuplicate and insert them at the
/// end of \p BB and conditionally branch on the copied condition. We only
/// branch on a single value.
-static void buildPartialUnswitchConditionalBranch(BasicBlock &BB,
- ArrayRef<Value *> Invariants,
- bool Direction,
- BasicBlock &UnswitchedSucc,
- BasicBlock &NormalSucc) {
+static void buildPartialUnswitchConditionalBranch(
+ BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction,
+ BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze) {
IRBuilder<> IRB(&BB);
Value *Cond = Direction ? IRB.CreateOr(Invariants) :
IRB.CreateAnd(Invariants);
+ if (InsertFreeze)
+ Cond = IRB.CreateFreeze(Cond, Cond->getName() + ".fr");
IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
Direction ? &NormalSucc : &UnswitchedSucc);
}
@@ -564,7 +569,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
"Must have an `and` of `i1`s or `select i1 X, Y, false`s for the"
" condition!");
buildPartialUnswitchConditionalBranch(*OldPH, Invariants, ExitDirection,
- *UnswitchedBB, *NewPH);
+ *UnswitchedBB, *NewPH, false);
}
// Update the dominator tree with the added edge.
@@ -1587,10 +1592,12 @@ deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
BB->eraseFromParent();
}
-static void deleteDeadBlocksFromLoop(Loop &L,
- SmallVectorImpl<BasicBlock *> &ExitBlocks,
- DominatorTree &DT, LoopInfo &LI,
- MemorySSAUpdater *MSSAU) {
+static void
+deleteDeadBlocksFromLoop(Loop &L,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks,
+ DominatorTree &DT, LoopInfo &LI,
+ MemorySSAUpdater *MSSAU,
+ function_ref<void(Loop &, StringRef)> DestroyLoopCB) {
// Find all the dead blocks tied to this loop, and remove them from their
// successors.
SmallSetVector<BasicBlock *, 8> DeadBlockSet;
@@ -1640,6 +1647,7 @@ static void deleteDeadBlocksFromLoop(Loop &L,
}) &&
"If the child loop header is dead all blocks in the child loop must "
"be dead as well!");
+ DestroyLoopCB(*ChildL, ChildL->getName());
LI.destroy(ChildL);
return true;
});
@@ -1980,6 +1988,8 @@ static bool rebuildLoopAfterUnswitch(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
ParentL->removeChildLoop(llvm::find(*ParentL, &L));
else
LI.removeLoop(llvm::find(LI, &L));
+ // markLoopAsDeleted for L should be triggered by the caller (it is typically
+ // done by using the UnswitchCB callback).
LI.destroy(&L);
return false;
}
@@ -2019,7 +2029,8 @@ static void unswitchNontrivialInvariants(
SmallVectorImpl<BasicBlock *> &ExitBlocks, IVConditionInfo &PartialIVInfo,
DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
function_ref<void(bool, bool, ArrayRef<Loop *>)> UnswitchCB,
- ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
+ ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
+ function_ref<void(Loop &, StringRef)> DestroyLoopCB) {
auto *ParentBB = TI.getParent();
BranchInst *BI = dyn_cast<BranchInst>(&TI);
SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(&TI);
@@ -2117,6 +2128,13 @@ static void unswitchNontrivialInvariants(
SE->forgetTopmostLoop(&L);
}
+ bool InsertFreeze = false;
+ if (FreezeLoopUnswitchCond) {
+ ICFLoopSafetyInfo SafetyInfo;
+ SafetyInfo.computeLoopSafetyInfo(&L);
+ InsertFreeze = !SafetyInfo.isGuaranteedToExecute(TI, &DT, &L);
+ }
+
// If the edge from this terminator to a successor dominates that successor,
// store a map from each block in its dominator subtree to it. This lets us
// tell when cloning for a particular successor if a block is dominated by
@@ -2191,6 +2209,11 @@ static void unswitchNontrivialInvariants(
BasicBlock *ClonedPH = ClonedPHs.begin()->second;
BI->setSuccessor(ClonedSucc, ClonedPH);
BI->setSuccessor(1 - ClonedSucc, LoopPH);
+ if (InsertFreeze) {
+ auto Cond = BI->getCondition();
+ if (!isGuaranteedNotToBeUndefOrPoison(Cond, &AC, BI, &DT))
+ BI->setCondition(new FreezeInst(Cond, Cond->getName() + ".fr", BI));
+ }
DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
} else {
assert(SI && "Must either be a branch or switch!");
@@ -2205,6 +2228,11 @@ static void unswitchNontrivialInvariants(
else
Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second);
+ if (InsertFreeze) {
+ auto Cond = SI->getCondition();
+ if (!isGuaranteedNotToBeUndefOrPoison(Cond, &AC, SI, &DT))
+ SI->setCondition(new FreezeInst(Cond, Cond->getName() + ".fr", SI));
+ }
// We need to use the set to populate domtree updates as even when there
// are multiple cases pointing at the same successor we only want to
// remove and insert one edge in the domtree.
@@ -2285,7 +2313,7 @@ static void unswitchNontrivialInvariants(
*SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU);
else
buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction,
- *ClonedPH, *LoopPH);
+ *ClonedPH, *LoopPH, InsertFreeze);
DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
if (MSSAU) {
@@ -2319,7 +2347,7 @@ static void unswitchNontrivialInvariants(
// Now that our cloned loops have been built, we can update the original loop.
// First we delete the dead blocks from it and then we rebuild the loop
// structure taking these deletions into account.
- deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI, MSSAU);
+ deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI, MSSAU, DestroyLoopCB);
if (MSSAU && VerifyMemorySSA)
MSSAU->getMemorySSA()->verifyMemorySSA();
@@ -2364,7 +2392,9 @@ static void unswitchNontrivialInvariants(
ConstantInt *ContinueReplacement =
Direction ? ConstantInt::getFalse(BI->getContext())
: ConstantInt::getTrue(BI->getContext());
- for (Value *Invariant : Invariants)
+ for (Value *Invariant : Invariants) {
+ assert(!isa<Constant>(Invariant) &&
+ "Should not be replacing constant values!");
// Use make_early_inc_range here as set invalidates the iterator.
for (Use &U : llvm::make_early_inc_range(Invariant->uses())) {
Instruction *UserI = dyn_cast<Instruction>(U.getUser());
@@ -2379,6 +2409,7 @@ static void unswitchNontrivialInvariants(
DT.dominates(ClonedPH, UserI->getParent()))
U.set(UnswitchedReplacement);
}
+ }
}
// We can change which blocks are exit blocks of all the cloned sibling
@@ -2670,7 +2701,8 @@ static bool unswitchBestCondition(
Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
AAResults &AA, TargetTransformInfo &TTI,
function_ref<void(bool, bool, ArrayRef<Loop *>)> UnswitchCB,
- ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
+ ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
+ function_ref<void(Loop &, StringRef)> DestroyLoopCB) {
// Collect all invariant conditions within this loop (as opposed to an inner
// loop which would be handled when visiting that inner loop).
SmallVector<std::pair<Instruction *, TinyPtrVector<Value *>>, 4>
@@ -2720,6 +2752,9 @@ static bool unswitchBestCondition(
Cond = CondNext;
BI->setCondition(Cond);
+ if (isa<Constant>(Cond))
+ continue;
+
if (L.isLoopInvariant(BI->getCondition())) {
UnswitchCandidates.push_back({BI, {BI->getCondition()}});
continue;
@@ -2958,7 +2993,7 @@ static bool unswitchBestCondition(
<< "\n");
unswitchNontrivialInvariants(L, *BestUnswitchTI, BestUnswitchInvariants,
ExitBlocks, PartialIVInfo, DT, LI, AC,
- UnswitchCB, SE, MSSAU);
+ UnswitchCB, SE, MSSAU, DestroyLoopCB);
return true;
}
@@ -2988,7 +3023,8 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
AAResults &AA, TargetTransformInfo &TTI, bool Trivial,
bool NonTrivial,
function_ref<void(bool, bool, ArrayRef<Loop *>)> UnswitchCB,
- ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
+ ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
+ function_ref<void(Loop &, StringRef)> DestroyLoopCB) {
assert(L.isRecursivelyLCSSAForm(DT, LI) &&
"Loops must be in LCSSA form before unswitching.");
@@ -3036,7 +3072,8 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
// Try to unswitch the best invariant condition. We prefer this full unswitch to
// a partial unswitch when possible below the threshold.
- if (unswitchBestCondition(L, DT, LI, AC, AA, TTI, UnswitchCB, SE, MSSAU))
+ if (unswitchBestCondition(L, DT, LI, AC, AA, TTI, UnswitchCB, SE, MSSAU,
+ DestroyLoopCB))
return true;
// No other opportunities to unswitch.
@@ -3083,6 +3120,10 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
U.markLoopAsDeleted(L, LoopName);
};
+ auto DestroyLoopCB = [&U](Loop &L, StringRef Name) {
+ U.markLoopAsDeleted(L, Name);
+ };
+
Optional<MemorySSAUpdater> MSSAU;
if (AR.MSSA) {
MSSAU = MemorySSAUpdater(AR.MSSA);
@@ -3091,7 +3132,8 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
}
if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.AA, AR.TTI, Trivial, NonTrivial,
UnswitchCB, &AR.SE,
- MSSAU.hasValue() ? MSSAU.getPointer() : nullptr))
+ MSSAU.hasValue() ? MSSAU.getPointer() : nullptr,
+ DestroyLoopCB))
return PreservedAnalyses::all();
if (AR.MSSA && VerifyMemorySSA)
@@ -3107,6 +3149,17 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
return PA;
}
+void SimpleLoopUnswitchPass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<SimpleLoopUnswitchPass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+
+ OS << "<";
+ OS << (NonTrivial ? "" : "no-") << "nontrivial;";
+ OS << (Trivial ? "" : "no-") << "trivial";
+ OS << ">";
+}
+
namespace {
class SimpleLoopUnswitchLegacyPass : public LoopPass {
@@ -3126,10 +3179,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
- if (EnableMSSALoopDependency) {
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
getLoopAnalysisUsage(AU);
}
};
@@ -3150,12 +3201,8 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- MemorySSA *MSSA = nullptr;
- Optional<MemorySSAUpdater> MSSAU;
- if (EnableMSSALoopDependency) {
- MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
- MSSAU = MemorySSAUpdater(MSSA);
- }
+ MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ MemorySSAUpdater MSSAU(MSSA);
auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
auto *SE = SEWP ? &SEWP->getSE() : nullptr;
@@ -3179,14 +3226,17 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
LPM.markLoopAsDeleted(*L);
};
- if (MSSA && VerifyMemorySSA)
+ auto DestroyLoopCB = [&LPM](Loop &L, StringRef /* Name */) {
+ LPM.markLoopAsDeleted(L);
+ };
+
+ if (VerifyMemorySSA)
MSSA->verifyMemorySSA();
- bool Changed =
- unswitchLoop(*L, DT, LI, AC, AA, TTI, true, NonTrivial, UnswitchCB, SE,
- MSSAU.hasValue() ? MSSAU.getPointer() : nullptr);
+ bool Changed = unswitchLoop(*L, DT, LI, AC, AA, TTI, true, NonTrivial,
+ UnswitchCB, SE, &MSSAU, DestroyLoopCB);
- if (MSSA && VerifyMemorySSA)
+ if (VerifyMemorySSA)
MSSA->verifyMemorySSA();
// Historically this pass has had issues with the dominator tree so verify it
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 09d59b0e884a..86d3620c312e 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -224,7 +224,11 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
SmallVector<WeakVH, 16> LoopHeaders(UniqueLoopHeaders.begin(),
UniqueLoopHeaders.end());
+ unsigned IterCnt = 0;
+ (void)IterCnt;
while (LocalChange) {
+ assert(IterCnt++ < 1000 &&
+ "Sanity: iterative simplification didn't converge!");
LocalChange = false;
// Loop over all of the basic blocks and remove them if they are unneeded.
@@ -319,6 +323,21 @@ SimplifyCFGPass::SimplifyCFGPass(const SimplifyCFGOptions &Opts)
applyCommandLineOverridesToOptions(Options);
}
+void SimplifyCFGPass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<SimplifyCFGPass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+ OS << "<";
+ OS << "bonus-inst-threshold=" << Options.BonusInstThreshold << ";";
+ OS << (Options.ForwardSwitchCondToPhi ? "" : "no-") << "forward-switch-cond;";
+ OS << (Options.ConvertSwitchToLookupTable ? "" : "no-")
+ << "switch-to-lookup;";
+ OS << (Options.NeedCanonicalLoop ? "" : "no-") << "keep-loops;";
+ OS << (Options.HoistCommonInsts ? "" : "no-") << "hoist-common-insts;";
+ OS << (Options.SinkCommonInsts ? "" : "no-") << "sink-common-insts";
+ OS << ">";
+}
+
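
For reference, the resulting textual form (assuming the registered pass name simplifycfg) looks like simplifycfg<bonus-inst-threshold=N;no-forward-switch-cond;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts>, with N being the configured bonus-instruction threshold.
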
PreservedAnalyses SimplifyCFGPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index dfa30418ea01..06169a7834f6 100644
--- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -268,7 +268,7 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
if (const auto *DVI = dyn_cast<DbgVariableIntrinsic>(U)) {
return all_of(DVI->location_ops(), [&NotHoisted](Value *V) {
if (const auto *I = dyn_cast_or_null<Instruction>(V)) {
- if (NotHoisted.count(I) == 0)
+ if (!NotHoisted.contains(I))
return true;
}
return false;
diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 20b8b982e14b..b47378808216 100644
--- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -607,7 +607,7 @@ Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
if (IndexOffset == 1)
return C.Stride;
// Common case 2: if (i' - i) is -1, Bump = -S.
- if (IndexOffset.isAllOnesValue())
+ if (IndexOffset.isAllOnes())
return Builder.CreateNeg(C.Stride);
// Otherwise, Bump = (i' - i) * sext/trunc(S). Note that (i' - i) and S may
@@ -620,7 +620,7 @@ Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
ConstantInt *Exponent = ConstantInt::get(DeltaType, IndexOffset.logBase2());
return Builder.CreateShl(ExtendedStride, Exponent);
}
- if ((-IndexOffset).isPowerOf2()) {
+ if (IndexOffset.isNegatedPowerOf2()) {
// If (i - i') is a power of 2, Bump = -sext/trunc(S) << log(i' - i).
ConstantInt *Exponent =
ConstantInt::get(DeltaType, (-IndexOffset).logBase2());
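
A small sketch of the new APInt predicate as used above (standalone helper with illustrative names):

    #include "llvm/ADT/APInt.h"
    using llvm::APInt;

    // If the index delta is a negated power of two, the bump is a negated shift:
    // for IndexOffset == -4, Bump = -(S << 2).
    static bool bumpIsNegatedShl(const APInt &IndexOffset, unsigned &ShiftAmt) {
      if (!IndexOffset.isNegatedPowerOf2())
        return false;
      ShiftAmt = (-IndexOffset).logBase2(); // e.g. (-(-4)).logBase2() == 2
      return true;
    }
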
diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 846a9321f53e..3bcf92e28a21 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -262,7 +262,7 @@ static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) {
// Note that this runs whether we know an alloca has escaped or not. If
// it has, then we can't trust Tracker.AllocaUsers to be accurate.
bool SafeToTail = true;
- for (auto &Arg : CI->arg_operands()) {
+ for (auto &Arg : CI->args()) {
if (isa<Constant>(Arg.getUser()))
continue;
if (Argument *A = dyn_cast<Argument>(Arg.getUser()))
@@ -584,8 +584,8 @@ void TailRecursionEliminator::insertAccumulator(Instruction *AccRecInstr) {
// call instruction into the newly created temporary variable.
void TailRecursionEliminator::copyByValueOperandIntoLocalTemp(CallInst *CI,
int OpndIdx) {
- PointerType *ArgTy = cast<PointerType>(CI->getArgOperand(OpndIdx)->getType());
- Type *AggTy = ArgTy->getElementType();
+ Type *AggTy = CI->getParamByValType(OpndIdx);
+ assert(AggTy);
const DataLayout &DL = F.getParent()->getDataLayout();
// Get alignment of byVal operand.
@@ -611,8 +611,8 @@ void TailRecursionEliminator::copyByValueOperandIntoLocalTemp(CallInst *CI,
// into the corresponding function argument location.
void TailRecursionEliminator::copyLocalTempOfByValueOperandIntoArguments(
CallInst *CI, int OpndIdx) {
- PointerType *ArgTy = cast<PointerType>(CI->getArgOperand(OpndIdx)->getType());
- Type *AggTy = ArgTy->getElementType();
+ Type *AggTy = CI->getParamByValType(OpndIdx);
+ assert(AggTy);
const DataLayout &DL = F.getParent()->getDataLayout();
// Get alignment of byVal operand.
@@ -667,7 +667,7 @@ bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
createTailRecurseLoopHeader(CI);
// Copy values of ByVal operands into local temporary variables.
- for (unsigned I = 0, E = CI->getNumArgOperands(); I != E; ++I) {
+ for (unsigned I = 0, E = CI->arg_size(); I != E; ++I) {
if (CI->isByValArgument(I))
copyByValueOperandIntoLocalTemp(CI, I);
}
@@ -675,7 +675,7 @@ bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
// Ok, now that we know we have a pseudo-entry block WITH all of the
// required PHI nodes, add entries into the PHI node for the actual
// parameters passed into the tail-recursive call.
- for (unsigned I = 0, E = CI->getNumArgOperands(); I != E; ++I) {
+ for (unsigned I = 0, E = CI->arg_size(); I != E; ++I) {
if (CI->isByValArgument(I)) {
copyLocalTempOfByValueOperandIntoArguments(CI, I);
ArgumentPHIs[I]->addIncoming(F.getArg(I), BB);
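
These two hunks read the aggregate type from the byval attribute (getParamByValType) instead of the pointer's pointee type, presumably in preparation for opaque pointers, where the pointee type can no longer be recovered from the pointer type.
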
diff --git a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
index 8cd16ca3906f..fdc914a72bfd 100644
--- a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
+++ b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
@@ -63,6 +63,9 @@ static Value *callPrintfBegin(IRBuilder<> &Builder, Value *Version) {
auto Int64Ty = Builder.getInt64Ty();
auto M = Builder.GetInsertBlock()->getModule();
auto Fn = M->getOrInsertFunction("__ockl_printf_begin", Int64Ty, Int64Ty);
+ if (!M->getModuleFlag("amdgpu_hostcall")) {
+ M->addModuleFlag(llvm::Module::Override, "amdgpu_hostcall", 1);
+ }
return Builder.CreateCall(Fn, Version);
}
diff --git a/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp b/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp
index 01912297324a..cbc508bb863a 100644
--- a/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp
+++ b/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp
@@ -33,14 +33,14 @@ static inline bool CompareVars(const ASanStackVariableDescription &a,
// We also force minimal alignment for all vars to kMinAlignment so that vars
// with e.g. alignment 1 and alignment 16 do not get reordered by CompareVars.
-static const size_t kMinAlignment = 16;
+static const uint64_t kMinAlignment = 16;
// We want to add a full redzone after every variable.
// The larger the variable Size the larger is the redzone.
// The resulting frame size is a multiple of Alignment.
-static size_t VarAndRedzoneSize(size_t Size, size_t Granularity,
- size_t Alignment) {
- size_t Res = 0;
+static uint64_t VarAndRedzoneSize(uint64_t Size, uint64_t Granularity,
+ uint64_t Alignment) {
+ uint64_t Res = 0;
if (Size <= 4) Res = 16;
else if (Size <= 16) Res = 32;
else if (Size <= 128) Res = Size + 32;
@@ -52,7 +52,7 @@ static size_t VarAndRedzoneSize(size_t Size, size_t Granularity,
ASanStackFrameLayout
ComputeASanStackFrameLayout(SmallVectorImpl<ASanStackVariableDescription> &Vars,
- size_t Granularity, size_t MinHeaderSize) {
+ uint64_t Granularity, uint64_t MinHeaderSize) {
assert(Granularity >= 8 && Granularity <= 64 &&
(Granularity & (Granularity - 1)) == 0);
assert(MinHeaderSize >= 16 && (MinHeaderSize & (MinHeaderSize - 1)) == 0 &&
@@ -67,22 +67,22 @@ ComputeASanStackFrameLayout(SmallVectorImpl<ASanStackVariableDescription> &Vars,
ASanStackFrameLayout Layout;
Layout.Granularity = Granularity;
Layout.FrameAlignment = std::max(Granularity, Vars[0].Alignment);
- size_t Offset = std::max(std::max(MinHeaderSize, Granularity),
- Vars[0].Alignment);
+ uint64_t Offset =
+ std::max(std::max(MinHeaderSize, Granularity), Vars[0].Alignment);
assert((Offset % Granularity) == 0);
for (size_t i = 0; i < NumVars; i++) {
bool IsLast = i == NumVars - 1;
- size_t Alignment = std::max(Granularity, Vars[i].Alignment);
+ uint64_t Alignment = std::max(Granularity, Vars[i].Alignment);
(void)Alignment; // Used only in asserts.
- size_t Size = Vars[i].Size;
+ uint64_t Size = Vars[i].Size;
assert((Alignment & (Alignment - 1)) == 0);
assert(Layout.FrameAlignment >= Alignment);
assert((Offset % Alignment) == 0);
assert(Size > 0);
- size_t NextAlignment = IsLast ? Granularity
- : std::max(Granularity, Vars[i + 1].Alignment);
- size_t SizeWithRedzone = VarAndRedzoneSize(Size, Granularity,
- NextAlignment);
+ uint64_t NextAlignment =
+ IsLast ? Granularity : std::max(Granularity, Vars[i + 1].Alignment);
+ uint64_t SizeWithRedzone =
+ VarAndRedzoneSize(Size, Granularity, NextAlignment);
Vars[i].Offset = Offset;
Offset += SizeWithRedzone;
}
@@ -118,7 +118,7 @@ GetShadowBytes(const SmallVectorImpl<ASanStackVariableDescription> &Vars,
assert(Vars.size() > 0);
SmallVector<uint8_t, 64> SB;
SB.clear();
- const size_t Granularity = Layout.Granularity;
+ const uint64_t Granularity = Layout.Granularity;
SB.resize(Vars[0].Offset / Granularity, kAsanStackLeftRedzoneMagic);
for (const auto &Var : Vars) {
SB.resize(Var.Offset / Granularity, kAsanStackMidRedzoneMagic);
@@ -135,13 +135,13 @@ SmallVector<uint8_t, 64> GetShadowBytesAfterScope(
const SmallVectorImpl<ASanStackVariableDescription> &Vars,
const ASanStackFrameLayout &Layout) {
SmallVector<uint8_t, 64> SB = GetShadowBytes(Vars, Layout);
- const size_t Granularity = Layout.Granularity;
+ const uint64_t Granularity = Layout.Granularity;
for (const auto &Var : Vars) {
assert(Var.LifetimeSize <= Var.Size);
- const size_t LifetimeShadowSize =
+ const uint64_t LifetimeShadowSize =
(Var.LifetimeSize + Granularity - 1) / Granularity;
- const size_t Offset = Var.Offset / Granularity;
+ const uint64_t Offset = Var.Offset / Granularity;
std::fill(SB.begin() + Offset, SB.begin() + Offset + LifetimeShadowSize,
kAsanStackUseAfterScopeMagic);
}
diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp
index d689e04da36f..f910f7c3c31f 100644
--- a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp
+++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp
@@ -67,7 +67,8 @@ bool isUsefullToPreserve(Attribute::AttrKind Kind) {
/// This function will try to transform the given knowledge into a more
/// canonical one. The canonical knowledge may be the given one.
-RetainedKnowledge canonicalizedKnowledge(RetainedKnowledge RK, DataLayout DL) {
+RetainedKnowledge canonicalizedKnowledge(RetainedKnowledge RK,
+ const DataLayout &DL) {
switch (RK.AttrKind) {
default:
return RK;
@@ -103,7 +104,7 @@ struct AssumeBuilderState {
Module *M;
using MapKey = std::pair<Value *, Attribute::AttrKind>;
- SmallMapVector<MapKey, unsigned, 8> AssumedKnowledgeMap;
+ SmallMapVector<MapKey, uint64_t, 8> AssumedKnowledgeMap;
Instruction *InstBeingModified = nullptr;
AssumptionCache* AC = nullptr;
DominatorTree* DT = nullptr;
@@ -196,28 +197,27 @@ struct AssumeBuilderState {
(!ShouldPreserveAllAttributes &&
!isUsefullToPreserve(Attr.getKindAsEnum())))
return;
- unsigned AttrArg = 0;
+ uint64_t AttrArg = 0;
if (Attr.isIntAttribute())
AttrArg = Attr.getValueAsInt();
addKnowledge({Attr.getKindAsEnum(), AttrArg, WasOn});
}
void addCall(const CallBase *Call) {
- auto addAttrList = [&](AttributeList AttrList) {
- for (unsigned Idx = AttributeList::FirstArgIndex;
- Idx < AttrList.getNumAttrSets(); Idx++)
- for (Attribute Attr : AttrList.getAttributes(Idx)) {
+ auto addAttrList = [&](AttributeList AttrList, unsigned NumArgs) {
+ for (unsigned Idx = 0; Idx < NumArgs; Idx++)
+ for (Attribute Attr : AttrList.getParamAttrs(Idx)) {
bool IsPoisonAttr = Attr.hasAttribute(Attribute::NonNull) ||
Attr.hasAttribute(Attribute::Alignment);
- if (!IsPoisonAttr || Call->isPassingUndefUB(Idx - 1))
- addAttribute(Attr, Call->getArgOperand(Idx - 1));
+ if (!IsPoisonAttr || Call->isPassingUndefUB(Idx))
+ addAttribute(Attr, Call->getArgOperand(Idx));
}
- for (Attribute Attr : AttrList.getFnAttributes())
+ for (Attribute Attr : AttrList.getFnAttrs())
addAttribute(Attr, nullptr);
};
- addAttrList(Call->getAttributes());
+ addAttrList(Call->getAttributes(), Call->arg_size());
if (Function *Fn = Call->getCalledFunction())
- addAttrList(Fn->getAttributes());
+ addAttrList(Fn->getAttributes(), Fn->arg_size());
}
AssumeInst *build() {
@@ -261,8 +261,7 @@ struct AssumeBuilderState {
addKnowledge({Attribute::NonNull, 0u, Pointer});
}
if (MA.valueOrOne() > 1)
- addKnowledge(
- {Attribute::Alignment, unsigned(MA.valueOrOne().value()), Pointer});
+ addKnowledge({Attribute::Alignment, MA.valueOrOne().value(), Pointer});
}
void addInstruction(Instruction *I) {
@@ -392,7 +391,7 @@ struct AssumeSimplify {
void dropRedundantKnowledge() {
struct MapValue {
IntrinsicInst *Assume;
- unsigned ArgValue;
+ uint64_t ArgValue;
CallInst::BundleOpInfo *BOI;
};
buildMapping(false);
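The attribute accessors adopted above index parameters directly by argument number; a minimal sketch of the pattern (walkCallAttributes is a hypothetical helper, not from the patch):

#include "llvm/IR/Attributes.h"
#include "llvm/IR/InstrTypes.h"

static void walkCallAttributes(const llvm::CallBase &Call) {
  llvm::AttributeList AL = Call.getAttributes();
  // Parameter attributes: 0-based argument index, no FirstArgIndex offset.
  for (unsigned Idx = 0, E = Call.arg_size(); Idx != E; ++Idx)
    for (llvm::Attribute Attr : AL.getParamAttrs(Idx))
      (void)Attr;
  // Function-level attributes come from getFnAttrs() rather than the old
  // index-based accessors.
  for (llvm::Attribute Attr : AL.getFnAttrs())
    (void)Attr;
}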
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index ee933b638a23..6469c899feea 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -39,6 +39,7 @@
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -52,6 +53,12 @@ using namespace llvm;
#define DEBUG_TYPE "basicblock-utils"
+static cl::opt<unsigned> MaxDeoptOrUnreachableSuccessorCheckDepth(
+ "max-deopt-or-unreachable-succ-check-depth", cl::init(8), cl::Hidden,
+ cl::desc("Set the maximum path length when checking whether a basic block "
+ "is followed by a block that either has a terminating "
+ "deoptimizing call or is terminated with an unreachable"));
+
void llvm::DetatchDeadBlocks(
ArrayRef<BasicBlock *> BBs,
SmallVectorImpl<DominatorTree::UpdateType> *Updates,
@@ -230,7 +237,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
if (DTU) {
SmallPtrSet<BasicBlock *, 2> SuccsOfBB(succ_begin(BB), succ_end(BB));
SmallPtrSet<BasicBlock *, 2> SuccsOfPredBB(succ_begin(PredBB),
- succ_begin(PredBB));
+ succ_end(PredBB));
Updates.reserve(Updates.size() + 2 * SuccsOfBB.size() + 1);
// Add insert edges first. Experimentally, for the particular case of two
// blocks that can be merged, with a single successor and single predecessor
@@ -485,6 +492,20 @@ void llvm::ReplaceInstWithInst(BasicBlock::InstListType &BIL,
BI = New;
}
+bool llvm::IsBlockFollowedByDeoptOrUnreachable(const BasicBlock *BB) {
+ // Remember visited blocks to avoid an infinite loop.
+ SmallPtrSet<const BasicBlock *, 8> VisitedBlocks;
+ unsigned Depth = 0;
+ while (BB && Depth++ < MaxDeoptOrUnreachableSuccessorCheckDepth &&
+ VisitedBlocks.insert(BB).second) {
+ if (BB->getTerminatingDeoptimizeCall() ||
+ isa<UnreachableInst>(BB->getTerminator()))
+ return true;
+ BB = BB->getUniqueSuccessor();
+ }
+ return false;
+}
+
void llvm::ReplaceInstWithInst(Instruction *From, Instruction *To) {
BasicBlock::iterator BI(From);
ReplaceInstWithInst(From->getParent()->getInstList(), BI, To);
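A caller of the new helper would use it as a cheap test that a block only leads to a deoptimizing or unreachable exit (usage sketch only; it assumes the declaration lives in BasicBlockUtils.h next to the definition above, and isLikelyColdExit is hypothetical):

#include "llvm/IR/BasicBlock.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

static bool isLikelyColdExit(const llvm::BasicBlock *BB) {
  // Walks the unique-successor chain from BB, bounded by the new
  // max-deopt-or-unreachable-succ-check-depth option, and reports whether it
  // reaches a terminating deoptimize call or an unreachable terminator.
  return llvm::IsBlockFollowedByDeoptOrUnreachable(BB);
}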
diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index 35e22f7a57e2..957935398972 100644
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -96,9 +96,9 @@ static bool setDoesNotThrow(Function &F) {
}
static bool setRetDoesNotAlias(Function &F) {
- if (F.hasAttribute(AttributeList::ReturnIndex, Attribute::NoAlias))
+ if (F.hasRetAttribute(Attribute::NoAlias))
return false;
- F.addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
+ F.addRetAttr(Attribute::NoAlias);
++NumNoAlias;
return true;
}
@@ -145,8 +145,8 @@ static bool setSignExtendedArg(Function &F, unsigned ArgNo) {
static bool setRetNoUndef(Function &F) {
if (!F.getReturnType()->isVoidTy() &&
- !F.hasAttribute(AttributeList::ReturnIndex, Attribute::NoUndef)) {
- F.addAttribute(AttributeList::ReturnIndex, Attribute::NoUndef);
+ !F.hasRetAttribute(Attribute::NoUndef)) {
+ F.addRetAttr(Attribute::NoUndef);
++NumNoUndef;
return true;
}
@@ -174,7 +174,10 @@ static bool setArgNoUndef(Function &F, unsigned ArgNo) {
}
static bool setRetAndArgsNoUndef(Function &F) {
- return setRetNoUndef(F) | setArgsNoUndef(F);
+ bool UndefAdded = false;
+ UndefAdded |= setRetNoUndef(F);
+ UndefAdded |= setArgsNoUndef(F);
+ return UndefAdded;
}
static bool setReturnedArg(Function &F, unsigned ArgNo) {
@@ -1268,7 +1271,7 @@ Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
- Type *I8Ptr = B.getInt8PtrTy();
+ Type *I8Ptr = Dst->getType();
return emitLibCall(LibFunc_strcpy, I8Ptr, {I8Ptr, I8Ptr},
{castToCStr(Dst, B), castToCStr(Src, B)}, B, TLI);
}
@@ -1453,9 +1456,8 @@ static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name,
// The incoming attribute set may have come from a speculatable intrinsic, but
// is being replaced with a library call which is not allowed to be
// speculatable.
- CI->setAttributes(Attrs.removeAttribute(B.getContext(),
- AttributeList::FunctionIndex,
- Attribute::Speculatable));
+ CI->setAttributes(
+ Attrs.removeFnAttribute(B.getContext(), Attribute::Speculatable));
if (const Function *F =
dyn_cast<Function>(Callee.getCallee()->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
@@ -1498,9 +1500,8 @@ static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2,
// The incoming attribute set may have come from a speculatable intrinsic, but
// is being replaced with a library call which is not allowed to be
// speculatable.
- CI->setAttributes(Attrs.removeAttribute(B.getContext(),
- AttributeList::FunctionIndex,
- Attribute::Speculatable));
+ CI->setAttributes(
+ Attrs.removeFnAttribute(B.getContext(), Attribute::Speculatable));
if (const Function *F =
dyn_cast<Function>(Callee.getCallee()->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
@@ -1655,8 +1656,8 @@ Value *llvm::emitMalloc(Value *Num, IRBuilderBase &B, const DataLayout &DL,
return CI;
}
-Value *llvm::emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
- IRBuilderBase &B, const TargetLibraryInfo &TLI) {
+Value *llvm::emitCalloc(Value *Num, Value *Size, IRBuilderBase &B,
+ const TargetLibraryInfo &TLI) {
if (!TLI.has(LibFunc_calloc))
return nullptr;
@@ -1664,8 +1665,8 @@ Value *llvm::emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
StringRef CallocName = TLI.getName(LibFunc_calloc);
const DataLayout &DL = M->getDataLayout();
IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext()));
- FunctionCallee Calloc = M->getOrInsertFunction(
- CallocName, Attrs, B.getInt8PtrTy(), PtrType, PtrType);
+ FunctionCallee Calloc =
+ M->getOrInsertFunction(CallocName, B.getInt8PtrTy(), PtrType, PtrType);
inferLibFuncAttributes(M, CallocName, TLI);
CallInst *CI = B.CreateCall(Calloc, {Num, Size}, CallocName);
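The return-attribute helpers used above replace the old ReturnIndex-based calls; in isolation the pattern is (sketch; markNoAliasReturn is a hypothetical helper):

#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"

static bool markNoAliasReturn(llvm::Function &F) {
  // Query and add return-slot attributes directly, without going through
  // AttributeList::ReturnIndex.
  if (F.hasRetAttribute(llvm::Attribute::NoAlias))
    return false;
  F.addRetAttr(llvm::Attribute::NoAlias);
  return true;
}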
diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
index 87868251036c..ebe19f1751e5 100644
--- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -424,6 +424,21 @@ bool llvm::isLegalToPromote(const CallBase &CB, Function *Callee,
*FailureReason = "Argument type mismatch";
return false;
}
+ // Make sure that the callee and call agree on byval/inalloca. The types do
+ // not have to match.
+
+ if (Callee->hasParamAttribute(I, Attribute::ByVal) !=
+ CB.getAttributes().hasParamAttr(I, Attribute::ByVal)) {
+ if (FailureReason)
+ *FailureReason = "byval mismatch";
+ return false;
+ }
+ if (Callee->hasParamAttribute(I, Attribute::InAlloca) !=
+ CB.getAttributes().hasParamAttr(I, Attribute::InAlloca)) {
+ if (FailureReason)
+ *FailureReason = "inalloca mismatch";
+ return false;
+ }
}
for (; I < NumArgs; I++) {
// Vararg functions can have more arguments than parameters.
@@ -485,18 +500,19 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee,
CB.setArgOperand(ArgNo, Cast);
// Remove any incompatible attributes for the argument.
- AttrBuilder ArgAttrs(CallerPAL.getParamAttributes(ArgNo));
+ AttrBuilder ArgAttrs(CallerPAL.getParamAttrs(ArgNo));
ArgAttrs.remove(AttributeFuncs::typeIncompatible(FormalTy));
- // If byval is used, this must be a pointer type, and the byval type must
- // match the element type. Update it if present.
+ // We may have a different byval/inalloca type.
if (ArgAttrs.getByValType())
ArgAttrs.addByValAttr(Callee->getParamByValType(ArgNo));
+ if (ArgAttrs.getInAllocaType())
+ ArgAttrs.addInAllocaAttr(Callee->getParamInAllocaType(ArgNo));
NewArgAttrs.push_back(AttributeSet::get(Ctx, ArgAttrs));
AttributeChanged = true;
} else
- NewArgAttrs.push_back(CallerPAL.getParamAttributes(ArgNo));
+ NewArgAttrs.push_back(CallerPAL.getParamAttrs(ArgNo));
}
// If the return type of the call site doesn't match that of the callee, cast
@@ -511,7 +527,7 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee,
// Set the new callsite attribute.
if (AttributeChanged)
- CB.setAttributes(AttributeList::get(Ctx, CallerPAL.getFnAttributes(),
+ CB.setAttributes(AttributeList::get(Ctx, CallerPAL.getFnAttrs(),
AttributeSet::get(Ctx, RAttrs),
NewArgAttrs));
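The promotion legality check added above only requires the call site and the callee to agree on whether an argument is byval/inalloca, not on the pointee type; roughly (sketch; paramABIAgrees is a hypothetical helper):

#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"

static bool paramABIAgrees(const llvm::CallBase &CB,
                           const llvm::Function &Callee, unsigned I) {
  llvm::AttributeList CallAttrs = CB.getAttributes();
  // byval/inalloca presence must match on both sides; the attribute's type
  // operand may differ and is rewritten later during promotion.
  return Callee.hasParamAttribute(I, llvm::Attribute::ByVal) ==
             CallAttrs.hasParamAttr(I, llvm::Attribute::ByVal) &&
         Callee.hasParamAttribute(I, llvm::Attribute::InAlloca) ==
             CallAttrs.hasParamAttr(I, llvm::Attribute::InAlloca);
}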
diff --git a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
index 1f649fe6c748..049c7d113521 100644
--- a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
+++ b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
@@ -33,7 +33,6 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/IVDescriptors.h"
-#include "llvm/Analysis/IVUsers.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 0ac9a5aaa425..048e691e33cf 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -62,7 +62,7 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
NewBB->getInstList().push_back(NewInst);
VMap[&I] = NewInst; // Add instruction map to value.
- hasCalls |= (isa<CallInst>(I) && !isa<DbgInfoIntrinsic>(I));
+ hasCalls |= (isa<CallInst>(I) && !I.isDebugOrPseudoInst());
if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
if (!AI->isStaticAlloca()) {
hasDynamicAllocas = true;
@@ -116,13 +116,13 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
for (const Argument &OldArg : OldFunc->args()) {
if (Argument *NewArg = dyn_cast<Argument>(VMap[&OldArg])) {
NewArgAttrs[NewArg->getArgNo()] =
- OldAttrs.getParamAttributes(OldArg.getArgNo());
+ OldAttrs.getParamAttrs(OldArg.getArgNo());
}
}
NewFunc->setAttributes(
- AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttributes(),
- OldAttrs.getRetAttributes(), NewArgAttrs));
+ AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttrs(),
+ OldAttrs.getRetAttrs(), NewArgAttrs));
// Everything else beyond this point deals with function instructions,
// so if we are dealing with a function declaration, we're done.
@@ -410,7 +410,7 @@ void PruningFunctionCloner::CloneBlock(
NewInst->setName(II->getName() + NameSuffix);
VMap[&*II] = NewInst; // Add instruction map to value.
NewBB->getInstList().push_back(NewInst);
- hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II));
+ hasCalls |= (isa<CallInst>(II) && !II->isDebugOrPseudoInst());
if (CodeInfo) {
CodeInfo->OrigVMap[&*II] = NewInst;
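The hasCalls update above now also ignores pseudo-probe instructions, not just debug intrinsics; the test reduces to something like this (sketch; blockHasRealCall is a hypothetical helper):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

static bool blockHasRealCall(const llvm::BasicBlock &BB) {
  // Count only calls that are neither debug intrinsics nor pseudo
  // instructions, matching the cloned-block bookkeeping above.
  for (const llvm::Instruction &I : BB)
    if (llvm::isa<llvm::CallInst>(I) && !I.isDebugOrPseudoInst())
      return true;
  return false;
}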
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 9edc52b53550..96aff563aa9b 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -434,6 +434,7 @@ CodeExtractor::findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock) {
}
// Now add the old exit block to the outline region.
Blocks.insert(CommonExitBlock);
+ OldTargets.push_back(NewExitBlock);
return CommonExitBlock;
}
@@ -885,7 +886,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
// "target-features" attribute allowing it to be lowered.
// FIXME: This should be changed to check to see if a specific
// attribute can not be inherited.
- for (const auto &Attr : oldFunction->getAttributes().getFnAttributes()) {
+ for (const auto &Attr : oldFunction->getAttributes().getFnAttrs()) {
if (Attr.isStringAttribute()) {
if (Attr.getKindAsString() == "thunk")
continue;
@@ -943,6 +944,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
// Those attributes should be safe to propagate to the extracted function.
case Attribute::AlwaysInline:
case Attribute::Cold:
+ case Attribute::DisableSanitizerInstrumentation:
case Attribute::Hot:
case Attribute::NoRecurse:
case Attribute::InlineHint:
@@ -1044,9 +1046,8 @@ static void eraseLifetimeMarkersOnInputs(const SetVector<BasicBlock *> &Blocks,
const SetVector<Value *> &SunkAllocas,
SetVector<Value *> &LifetimesStart) {
for (BasicBlock *BB : Blocks) {
- for (auto It = BB->begin(), End = BB->end(); It != End;) {
- auto *II = dyn_cast<IntrinsicInst>(&*It);
- ++It;
+ for (Instruction &I : llvm::make_early_inc_range(*BB)) {
+ auto *II = dyn_cast<IntrinsicInst>(&I);
if (!II || !II->isLifetimeStartOrEnd())
continue;
@@ -1247,45 +1248,57 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
// not in the region to be extracted.
std::map<BasicBlock *, BasicBlock *> ExitBlockMap;
+ // Iterate over the previously collected targets, and create new blocks inside
+ // the function to branch to.
unsigned switchVal = 0;
+ for (BasicBlock *OldTarget : OldTargets) {
+ if (Blocks.count(OldTarget))
+ continue;
+ BasicBlock *&NewTarget = ExitBlockMap[OldTarget];
+ if (NewTarget)
+ continue;
+
+ // If we don't already have an exit stub for this non-extracted
+ // destination, create one now!
+ NewTarget = BasicBlock::Create(Context,
+ OldTarget->getName() + ".exitStub",
+ newFunction);
+ unsigned SuccNum = switchVal++;
+
+ Value *brVal = nullptr;
+ assert(NumExitBlocks < 0xffff && "too many exit blocks for switch");
+ switch (NumExitBlocks) {
+ case 0:
+ case 1: break; // No value needed.
+ case 2: // Conditional branch, return a bool
+ brVal = ConstantInt::get(Type::getInt1Ty(Context), !SuccNum);
+ break;
+ default:
+ brVal = ConstantInt::get(Type::getInt16Ty(Context), SuccNum);
+ break;
+ }
+
+ ReturnInst::Create(Context, brVal, NewTarget);
+
+ // Update the switch instruction.
+ TheSwitch->addCase(ConstantInt::get(Type::getInt16Ty(Context),
+ SuccNum),
+ OldTarget);
+ }
+
for (BasicBlock *Block : Blocks) {
Instruction *TI = Block->getTerminator();
- for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
- if (!Blocks.count(TI->getSuccessor(i))) {
- BasicBlock *OldTarget = TI->getSuccessor(i);
- // add a new basic block which returns the appropriate value
- BasicBlock *&NewTarget = ExitBlockMap[OldTarget];
- if (!NewTarget) {
- // If we don't already have an exit stub for this non-extracted
- // destination, create one now!
- NewTarget = BasicBlock::Create(Context,
- OldTarget->getName() + ".exitStub",
- newFunction);
- unsigned SuccNum = switchVal++;
-
- Value *brVal = nullptr;
- switch (NumExitBlocks) {
- case 0:
- case 1: break; // No value needed.
- case 2: // Conditional branch, return a bool
- brVal = ConstantInt::get(Type::getInt1Ty(Context), !SuccNum);
- break;
- default:
- brVal = ConstantInt::get(Type::getInt16Ty(Context), SuccNum);
- break;
- }
-
- ReturnInst::Create(Context, brVal, NewTarget);
-
- // Update the switch instruction.
- TheSwitch->addCase(ConstantInt::get(Type::getInt16Ty(Context),
- SuccNum),
- OldTarget);
- }
-
- // rewrite the original branch instruction with this new target
- TI->setSuccessor(i, NewTarget);
- }
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
+ if (Blocks.count(TI->getSuccessor(i)))
+ continue;
+ BasicBlock *OldTarget = TI->getSuccessor(i);
+ // add a new basic block which returns the appropriate value
+ BasicBlock *NewTarget = ExitBlockMap[OldTarget];
+ assert(NewTarget && "Unknown target block!");
+
+ // rewrite the original branch instruction with this new target
+ TI->setSuccessor(i, NewTarget);
+ }
}
// Store the arguments right after the definition of output value.
@@ -1388,12 +1401,17 @@ void CodeExtractor::moveCodeToFunction(Function *newFunction) {
Function::BasicBlockListType &oldBlocks = oldFunc->getBasicBlockList();
Function::BasicBlockListType &newBlocks = newFunction->getBasicBlockList();
+ auto newFuncIt = newFunction->front().getIterator();
for (BasicBlock *Block : Blocks) {
// Delete the basic block from the old function, and the list of blocks
oldBlocks.remove(Block);
// Insert this basic block into the new function
- newBlocks.push_back(Block);
+ // Insert the original blocks after the entry block created
+ // for the new function. The entry block may be followed
+ // by a set of exit blocks at this point, but these exit
+ // blocks better be placed at the end of the new function.
+ newFuncIt = newBlocks.insertAfter(newFuncIt, Block);
}
}
@@ -1569,6 +1587,13 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
Function *
CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) {
+ ValueSet Inputs, Outputs;
+ return extractCodeRegion(CEAC, Inputs, Outputs);
+}
+
+Function *
+CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC,
+ ValueSet &inputs, ValueSet &outputs) {
if (!isEligible())
return nullptr;
@@ -1593,11 +1618,8 @@ CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) {
// Remove @llvm.assume calls that will be moved to the new function from the
// old function's assumption cache.
for (BasicBlock *Block : Blocks) {
- for (auto It = Block->begin(), End = Block->end(); It != End;) {
- Instruction *I = &*It;
- ++It;
-
- if (auto *AI = dyn_cast<AssumeInst>(I)) {
+ for (Instruction &I : llvm::make_early_inc_range(*Block)) {
+ if (auto *AI = dyn_cast<AssumeInst>(&I)) {
if (AC)
AC->unregisterAssumption(AI);
AI->eraseFromParent();
@@ -1627,6 +1649,16 @@ CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) {
}
NumExitBlocks = ExitBlocks.size();
+ for (BasicBlock *Block : Blocks) {
+ Instruction *TI = Block->getTerminator();
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
+ if (Blocks.count(TI->getSuccessor(i)))
+ continue;
+ BasicBlock *OldTarget = TI->getSuccessor(i);
+ OldTargets.push_back(OldTarget);
+ }
+ }
+
// If we have to split PHI nodes of the entry or exit blocks, do so now.
severSplitPHINodesOfEntry(header);
severSplitPHINodesOfExits(ExitBlocks);
@@ -1657,7 +1689,7 @@ CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) {
}
newFuncRoot->getInstList().push_back(BranchI);
- ValueSet inputs, outputs, SinkingCands, HoistingCands;
+ ValueSet SinkingCands, HoistingCands;
BasicBlock *CommonExit = nullptr;
findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
assert(HoistingCands.empty() || CommonExit);
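The new extractCodeRegion overload exposes the computed input and output values to the caller; a hedged usage sketch (assuming CodeExtractor.h declares the overload and the public ValueSet typedef; extractAndInspect is hypothetical):

#include "llvm/Transforms/Utils/CodeExtractor.h"

static llvm::Function *
extractAndInspect(llvm::CodeExtractor &CE,
                  const llvm::CodeExtractorAnalysisCache &CEAC) {
  llvm::CodeExtractor::ValueSet Inputs, Outputs;
  // Inputs/Outputs are populated during extraction; the original
  // single-argument overload above simply forwards here and discards them.
  llvm::Function *Outlined = CE.extractCodeRegion(CEAC, Inputs, Outputs);
  return Outlined;
}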
diff --git a/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp b/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp
index ce982c7403aa..648f4e64a4d2 100644
--- a/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp
+++ b/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp
@@ -309,7 +309,7 @@ collectInstructionsInBetween(Instruction &StartInst, const Instruction &EndInst,
bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint,
DominatorTree &DT, const PostDominatorTree *PDT,
- DependenceInfo *DI) {
+ DependenceInfo *DI, bool CheckForEntireBlock) {
// Skip tests when we don't have PDT or DI
if (!PDT || !DI)
return false;
@@ -332,16 +332,24 @@ bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint,
if (!isControlFlowEquivalent(I, InsertPoint, DT, *PDT))
return reportInvalidCandidate(I, NotControlFlowEquivalent);
- if (!DT.dominates(&InsertPoint, &I))
+ if (isReachedBefore(&I, &InsertPoint, &DT, PDT))
for (const Use &U : I.uses())
if (auto *UserInst = dyn_cast<Instruction>(U.getUser()))
if (UserInst != &InsertPoint && !DT.dominates(&InsertPoint, U))
return false;
- if (!DT.dominates(&I, &InsertPoint))
+ if (isReachedBefore(&InsertPoint, &I, &DT, PDT))
for (const Value *Op : I.operands())
- if (auto *OpInst = dyn_cast<Instruction>(Op))
- if (&InsertPoint == OpInst || !DT.dominates(OpInst, &InsertPoint))
+ if (auto *OpInst = dyn_cast<Instruction>(Op)) {
+ if (&InsertPoint == OpInst)
+ return false;
+ // If OpInst is an instruction that appears earlier in the same BB as
+ // I, then it is okay to move since OpInst will still be available.
+ if (CheckForEntireBlock && I.getParent() == OpInst->getParent() &&
+ DT.dominates(OpInst, &I))
+ continue;
+ if (!DT.dominates(OpInst, &InsertPoint))
return false;
+ }
DT.updateDFSNumbers();
const bool MoveForward = domTreeLevelBefore(&DT, &I, &InsertPoint);
@@ -393,7 +401,8 @@ bool llvm::isSafeToMoveBefore(BasicBlock &BB, Instruction &InsertPoint,
if (BB.getTerminator() == &I)
return true;
- return isSafeToMoveBefore(I, InsertPoint, DT, PDT, DI);
+ return isSafeToMoveBefore(I, InsertPoint, DT, PDT, DI,
+ /*CheckForEntireBlock=*/true);
});
}
@@ -401,11 +410,9 @@ void llvm::moveInstructionsToTheBeginning(BasicBlock &FromBB, BasicBlock &ToBB,
DominatorTree &DT,
const PostDominatorTree &PDT,
DependenceInfo &DI) {
- for (auto It = ++FromBB.rbegin(); It != FromBB.rend();) {
+ for (Instruction &I :
+ llvm::make_early_inc_range(llvm::drop_begin(llvm::reverse(FromBB)))) {
Instruction *MovePos = ToBB.getFirstNonPHIOrDbg();
- Instruction &I = *It;
- // Increment the iterator before modifying FromBB.
- ++It;
if (isSafeToMoveBefore(I, *MovePos, DT, &PDT, &DI))
I.moveBefore(MovePos);
@@ -423,3 +430,47 @@ void llvm::moveInstructionsToTheEnd(BasicBlock &FromBB, BasicBlock &ToBB,
I.moveBefore(MovePos);
}
}
+
+bool llvm::nonStrictlyPostDominate(const BasicBlock *ThisBlock,
+ const BasicBlock *OtherBlock,
+ const DominatorTree *DT,
+ const PostDominatorTree *PDT) {
+ assert(isControlFlowEquivalent(*ThisBlock, *OtherBlock, *DT, *PDT) &&
+ "ThisBlock and OtherBlock must be CFG equivalent!");
+ const BasicBlock *CommonDominator =
+ DT->findNearestCommonDominator(ThisBlock, OtherBlock);
+ if (CommonDominator == nullptr)
+ return false;
+
+ /// Recursively check the predecessors of \p ThisBlock up to
+ /// their common dominator, and see if any of them post-dominates
+ /// \p OtherBlock.
+ SmallVector<const BasicBlock *, 8> WorkList;
+ SmallPtrSet<const BasicBlock *, 8> Visited;
+ WorkList.push_back(ThisBlock);
+ while (!WorkList.empty()) {
+ const BasicBlock *CurBlock = WorkList.back();
+ WorkList.pop_back();
+ Visited.insert(CurBlock);
+ if (PDT->dominates(CurBlock, OtherBlock))
+ return true;
+
+ for (auto *Pred : predecessors(CurBlock)) {
+ if (Pred == CommonDominator || Visited.count(Pred))
+ continue;
+ WorkList.push_back(Pred);
+ }
+ }
+ return false;
+}
+
+bool llvm::isReachedBefore(const Instruction *I0, const Instruction *I1,
+ const DominatorTree *DT,
+ const PostDominatorTree *PDT) {
+ const BasicBlock *BB0 = I0->getParent();
+ const BasicBlock *BB1 = I1->getParent();
+ if (BB0 == BB1)
+ return DT->dominates(I0, I1);
+
+ return nonStrictlyPostDominate(BB1, BB0, DT, PDT);
+}
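The two helpers added above factor the ordering question out of isSafeToMoveBefore: within one block the dominator tree decides, across control-flow-equivalent blocks the predecessor walk does. A minimal wrapper sketch (executesBefore is hypothetical; the two blocks are expected to be control-flow equivalent, as the assertion above requires):

#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/CodeMoverUtils.h"

static bool executesBefore(const llvm::Instruction *I0,
                           const llvm::Instruction *I1,
                           const llvm::DominatorTree &DT,
                           const llvm::PostDominatorTree &PDT) {
  // Same block: instruction order via the dominator tree. Different blocks:
  // check whether I1's block non-strictly post-dominates I0's block along
  // some predecessor path up to their common dominator.
  return llvm::isReachedBefore(I0, I1, &DT, &PDT);
}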
diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp
index 30c3fa521d52..fc7083b0c30d 100644
--- a/llvm/lib/Transforms/Utils/Debugify.cpp
+++ b/llvm/lib/Transforms/Utils/Debugify.cpp
@@ -457,14 +457,14 @@ static bool checkInstructions(const DebugInstMap &DILocsBefore,
}
// This checks the preservation of original debug variable intrinsics.
-static bool checkVars(const DebugVarMap &DIFunctionsBefore,
- const DebugVarMap &DIFunctionsAfter,
+static bool checkVars(const DebugVarMap &DIVarsBefore,
+ const DebugVarMap &DIVarsAfter,
StringRef NameOfWrappedPass, StringRef FileNameFromCU,
bool ShouldWriteIntoJSON, llvm::json::Array &Bugs) {
bool Preserved = true;
- for (const auto &V : DIFunctionsBefore) {
- auto VarIt = DIFunctionsAfter.find(V.first);
- if (VarIt == DIFunctionsAfter.end())
+ for (const auto &V : DIVarsBefore) {
+ auto VarIt = DIVarsAfter.find(V.first);
+ if (VarIt == DIVarsAfter.end())
continue;
unsigned NumOfDbgValsAfter = VarIt->second;
diff --git a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
index 31d03e1e86af..e3e8f63383df 100644
--- a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
+++ b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
@@ -89,7 +89,7 @@ static bool runOnFunction(Function &F, bool PostInlining) {
insertCall(F, EntryFunc, &*F.begin()->getFirstInsertionPt(), DL);
Changed = true;
- F.removeAttribute(AttributeList::FunctionIndex, EntryAttr);
+ F.removeFnAttr(EntryAttr);
}
if (!ExitFunc.empty()) {
@@ -111,7 +111,7 @@ static bool runOnFunction(Function &F, bool PostInlining) {
insertCall(F, ExitFunc, T, DL);
Changed = true;
}
- F.removeAttribute(AttributeList::FunctionIndex, ExitAttr);
+ F.removeFnAttr(ExitAttr);
}
return Changed;
@@ -183,3 +183,13 @@ llvm::EntryExitInstrumenterPass::run(Function &F, FunctionAnalysisManager &AM) {
PA.preserveSet<CFGAnalyses>();
return PA;
}
+
+void llvm::EntryExitInstrumenterPass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<llvm::EntryExitInstrumenterPass> *>(this)
+ ->printPipeline(OS, MapClassName2PassName);
+ OS << "<";
+ if (PostInlining)
+ OS << "post-inline";
+ OS << ">";
+}
diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp
index 463c223d9e8f..9c8aed94708e 100644
--- a/llvm/lib/Transforms/Utils/Evaluator.cpp
+++ b/llvm/lib/Transforms/Utils/Evaluator.cpp
@@ -128,11 +128,6 @@ isSimpleEnoughValueToCommit(Constant *C,
/// globals and GEP's of globals. This should be kept up to date with
/// CommitValueTo.
static bool isSimpleEnoughPointerToCommit(Constant *C, const DataLayout &DL) {
- // Conservatively, avoid aggregate types. This is because we don't
- // want to worry about them partially overlapping other stores.
- if (!cast<PointerType>(C->getType())->getElementType()->isSingleValueType())
- return false;
-
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
// Do not allow weak/*_odr/linkonce linkage or external globals.
return GV->hasUniqueInitializer();
@@ -284,7 +279,7 @@ bool Evaluator::getFormalParams(CallBase &CB, Function *F,
return false;
auto *FTy = F->getFunctionType();
- if (FTy->getNumParams() > CB.getNumArgOperands()) {
+ if (FTy->getNumParams() > CB.arg_size()) {
LLVM_DEBUG(dbgs() << "Too few arguments for function.\n");
return false;
}
@@ -343,7 +338,10 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB,
Ptr = FoldedPtr;
LLVM_DEBUG(dbgs() << "; To: " << *Ptr << "\n");
}
- if (!isSimpleEnoughPointerToCommit(Ptr, DL)) {
+ // Conservatively, avoid aggregate types. This is because we don't
+ // want to worry about them partially overlapping other stores.
+ if (!SI->getValueOperand()->getType()->isSingleValueType() ||
+ !isSimpleEnoughPointerToCommit(Ptr, DL)) {
// If this is too complex for us to commit, reject it.
LLVM_DEBUG(
dbgs() << "Pointer is too complex for us to evaluate store.");
diff --git a/llvm/lib/Transforms/Utils/FixIrreducible.cpp b/llvm/lib/Transforms/Utils/FixIrreducible.cpp
index 10f48fe827f4..8de3ce876bab 100644
--- a/llvm/lib/Transforms/Utils/FixIrreducible.cpp
+++ b/llvm/lib/Transforms/Utils/FixIrreducible.cpp
@@ -124,7 +124,7 @@ static void reconnectChildLoops(LoopInfo &LI, Loop *ParentLoop, Loop *NewLoop,
// children to a new vector.
auto FirstChild = std::partition(
CandidateLoops.begin(), CandidateLoops.end(), [&](Loop *L) {
- return L == NewLoop || Blocks.count(L->getHeader()) == 0;
+ return L == NewLoop || !Blocks.contains(L->getHeader());
});
SmallVector<Loop *, 8> ChildLoops(FirstChild, CandidateLoops.end());
CandidateLoops.erase(FirstChild, CandidateLoops.end());
diff --git a/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/llvm/lib/Transforms/Utils/FlattenCFG.cpp
index dbcacc20b589..ddd3f597ae01 100644
--- a/llvm/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/llvm/lib/Transforms/Utils/FlattenCFG.cpp
@@ -162,7 +162,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
// of \param BB (BB4) and should not have address-taken.
// There should exist only one such unconditional
// branch among the predecessors.
- if (UnCondBlock || !PP || (Preds.count(PP) == 0) ||
+ if (UnCondBlock || !PP || !Preds.contains(PP) ||
Pred->hasAddressTaken())
return false;
@@ -215,7 +215,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
// PS is the successor which is not BB. Check successors to identify
// the last conditional branch.
- if (Preds.count(PS) == 0) {
+ if (!Preds.contains(PS)) {
// Case 2.
LastCondBlock = Pred;
} else {
diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
index 2696557a719f..326864803d7c 100644
--- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
@@ -110,7 +110,7 @@ int FunctionComparator::cmpAttrs(const AttributeList L,
if (int Res = cmpNumbers(L.getNumAttrSets(), R.getNumAttrSets()))
return Res;
- for (unsigned i = L.index_begin(), e = L.index_end(); i != e; ++i) {
+ for (unsigned i : L.indexes()) {
AttributeSet LAS = L.getAttributes(i);
AttributeSet RAS = R.getAttributes(i);
AttributeSet::iterator LI = LAS.begin(), LE = LAS.end();
diff --git a/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/llvm/lib/Transforms/Utils/GlobalStatus.cpp
index f782396be7b6..9bfc73e4ba6c 100644
--- a/llvm/lib/Transforms/Utils/GlobalStatus.cpp
+++ b/llvm/lib/Transforms/Utils/GlobalStatus.cpp
@@ -105,8 +105,10 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
// value, not an aggregate), keep more specific information about
// stores.
if (GS.StoredType != GlobalStatus::Stored) {
- if (const GlobalVariable *GV =
- dyn_cast<GlobalVariable>(SI->getOperand(1))) {
+ const Value *Ptr = SI->getPointerOperand();
+ if (isa<ConstantExpr>(Ptr))
+ Ptr = Ptr->stripPointerCasts();
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
Value *StoredVal = SI->getOperand(0);
if (Constant *C = dyn_cast<Constant>(StoredVal)) {
@@ -125,9 +127,9 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
GS.StoredType = GlobalStatus::InitializerStored;
} else if (GS.StoredType < GlobalStatus::StoredOnce) {
GS.StoredType = GlobalStatus::StoredOnce;
- GS.StoredOnceValue = StoredVal;
+ GS.StoredOnceStore = SI;
} else if (GS.StoredType == GlobalStatus::StoredOnce &&
- GS.StoredOnceValue == StoredVal) {
+ GS.getStoredOnceValue() == StoredVal) {
// noop.
} else {
GS.StoredType = GlobalStatus::Stored;
diff --git a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
index a1e160d144dc..047bf5569ded 100644
--- a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
+++ b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
@@ -47,7 +47,7 @@ static void addVariantDeclaration(CallInst &CI, const ElementCount &VF,
// Add function declaration.
Type *RetTy = ToVectorTy(CI.getType(), VF);
SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI.arg_operands())
+ for (Value *ArgOperand : CI.args())
Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
assert(!CI.getFunctionType()->isVarArg() &&
"VarArg functions are not supported.");
@@ -94,8 +94,8 @@ static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) {
const std::string TLIName =
std::string(TLI.getVectorizedFunction(ScalarName, VF));
if (!TLIName.empty()) {
- std::string MangledName = VFABI::mangleTLIVectorName(
- TLIName, ScalarName, CI.getNumArgOperands(), VF);
+ std::string MangledName =
+ VFABI::mangleTLIVectorName(TLIName, ScalarName, CI.arg_size(), VF);
if (!OriginalSetOfMappings.count(MangledName)) {
Mappings.push_back(MangledName);
++NumCallInjected;
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 792aa8208f27..f4776589910f 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -539,12 +539,10 @@ static Value *getUnwindDestToken(Instruction *EHPad,
static BasicBlock *HandleCallsInBlockInlinedThroughInvoke(
BasicBlock *BB, BasicBlock *UnwindEdge,
UnwindDestMemoTy *FuncletUnwindMap = nullptr) {
- for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
- Instruction *I = &*BBI++;
-
+ for (Instruction &I : llvm::make_early_inc_range(*BB)) {
// We only need to check for function calls: inlined invoke
// instructions require no special handling.
- CallInst *CI = dyn_cast<CallInst>(I);
+ CallInst *CI = dyn_cast<CallInst>(&I);
if (!CI || CI->doesNotThrow())
continue;
@@ -830,6 +828,7 @@ static void PropagateCallSiteMetadata(CallBase &CB, Function::iterator FStart,
}
}
+namespace {
/// Utility for cloning !noalias and !alias.scope metadata. When a code region
/// using scoped alias metadata is inlined, the aliasing relationships may not
/// hold between the two versions. It is necessary to create a deep clone of the
@@ -851,6 +850,7 @@ public:
/// metadata.
void remap(Function::iterator FStart, Function::iterator FEnd);
};
+} // namespace
ScopedAliasMetadataDeepCloner::ScopedAliasMetadataDeepCloner(
const Function *F) {
@@ -1179,14 +1179,8 @@ static bool MayContainThrowingOrExitingCall(Instruction *Begin,
assert(Begin->getParent() == End->getParent() &&
"Expected to be in same basic block!");
- unsigned NumInstChecked = 0;
- // Check that all instructions in the range [Begin, End) are guaranteed to
- // transfer execution to successor.
- for (auto &I : make_range(Begin->getIterator(), End->getIterator()))
- if (NumInstChecked++ > InlinerAttributeWindow ||
- !isGuaranteedToTransferExecutionToSuccessor(&I))
- return true;
- return false;
+ return !llvm::isGuaranteedToTransferExecutionToSuccessor(
+ Begin->getIterator(), End->getIterator(), InlinerAttributeWindow + 1);
}
static AttrBuilder IdentifyValidAttributes(CallBase &CB) {
@@ -1259,8 +1253,7 @@ static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) {
// existing attribute value (i.e. attributes such as dereferenceable,
// dereferenceable_or_null etc). See AttrBuilder::merge for more details.
AttributeList AL = NewRetVal->getAttributes();
- AttributeList NewAL =
- AL.addAttributes(Context, AttributeList::ReturnIndex, Valid);
+ AttributeList NewAL = AL.addRetAttributes(Context, Valid);
NewRetVal->setAttributes(NewAL);
}
}
@@ -1376,13 +1369,13 @@ static void UpdateCallGraphAfterInlining(CallBase &CB,
CallerNode->removeCallEdgeFor(*cast<CallBase>(&CB));
}
-static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M,
- BasicBlock *InsertBlock,
+static void HandleByValArgumentInit(Type *ByValType, Value *Dst, Value *Src,
+ Module *M, BasicBlock *InsertBlock,
InlineFunctionInfo &IFI) {
- Type *AggTy = cast<PointerType>(Src->getType())->getElementType();
IRBuilder<> Builder(InsertBlock, InsertBlock->begin());
- Value *Size = Builder.getInt64(M->getDataLayout().getTypeStoreSize(AggTy));
+ Value *Size =
+ Builder.getInt64(M->getDataLayout().getTypeStoreSize(ByValType));
// Always generate a memcpy of alignment 1 here because we don't know
// the alignment of the src pointer. Other optimizations can infer
@@ -1393,13 +1386,13 @@ static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M,
/// When inlining a call site that has a byval argument,
/// we have to make the implicit memcpy explicit by adding it.
-static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
+static Value *HandleByValArgument(Type *ByValType, Value *Arg,
+ Instruction *TheCall,
const Function *CalledFunc,
InlineFunctionInfo &IFI,
unsigned ByValAlignment) {
- PointerType *ArgTy = cast<PointerType>(Arg->getType());
- Type *AggTy = ArgTy->getElementType();
-
+ assert(cast<PointerType>(Arg->getType())
+ ->isOpaqueOrPointeeTypeMatches(ByValType));
Function *Caller = TheCall->getFunction();
const DataLayout &DL = Caller->getParent()->getDataLayout();
@@ -1427,7 +1420,7 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
}
// Create the alloca. If we have DataLayout, use nice alignment.
- Align Alignment(DL.getPrefTypeAlignment(AggTy));
+ Align Alignment(DL.getPrefTypeAlignment(ByValType));
// If the byval had an alignment specified, we *must* use at least that
// alignment, as it is required by the byval argument (and uses of the
@@ -1435,7 +1428,7 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
Alignment = max(Alignment, MaybeAlign(ByValAlignment));
Value *NewAlloca =
- new AllocaInst(AggTy, DL.getAllocaAddrSpace(), nullptr, Alignment,
+ new AllocaInst(ByValType, DL.getAllocaAddrSpace(), nullptr, Alignment,
Arg->getName(), &*Caller->begin()->begin());
IFI.StaticAllocas.push_back(cast<AllocaInst>(NewAlloca));
@@ -1607,8 +1600,7 @@ static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
const ProfileCount &CalleeEntryCount,
const CallBase &TheCall, ProfileSummaryInfo *PSI,
BlockFrequencyInfo *CallerBFI) {
- if (!CalleeEntryCount.hasValue() || CalleeEntryCount.isSynthetic() ||
- CalleeEntryCount.getCount() < 1)
+ if (CalleeEntryCount.isSynthetic() || CalleeEntryCount.getCount() < 1)
return;
auto CallSiteCount = PSI ? PSI->getProfileCount(TheCall, CallerBFI) : None;
int64_t CallCount =
@@ -1617,40 +1609,39 @@ static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
}
void llvm::updateProfileCallee(
- Function *Callee, int64_t entryDelta,
+ Function *Callee, int64_t EntryDelta,
const ValueMap<const Value *, WeakTrackingVH> *VMap) {
auto CalleeCount = Callee->getEntryCount();
if (!CalleeCount.hasValue())
return;
- uint64_t priorEntryCount = CalleeCount.getCount();
- uint64_t newEntryCount;
+ const uint64_t PriorEntryCount = CalleeCount->getCount();
// Since CallSiteCount is an estimate, it could exceed the original callee
// count and has to be set to 0 so guard against underflow.
- if (entryDelta < 0 && static_cast<uint64_t>(-entryDelta) > priorEntryCount)
- newEntryCount = 0;
- else
- newEntryCount = priorEntryCount + entryDelta;
+ const uint64_t NewEntryCount =
+ (EntryDelta < 0 && static_cast<uint64_t>(-EntryDelta) > PriorEntryCount)
+ ? 0
+ : PriorEntryCount + EntryDelta;
// During inlining ?
if (VMap) {
- uint64_t cloneEntryCount = priorEntryCount - newEntryCount;
+ uint64_t CloneEntryCount = PriorEntryCount - NewEntryCount;
for (auto Entry : *VMap)
if (isa<CallInst>(Entry.first))
if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second))
- CI->updateProfWeight(cloneEntryCount, priorEntryCount);
+ CI->updateProfWeight(CloneEntryCount, PriorEntryCount);
}
- if (entryDelta) {
- Callee->setEntryCount(newEntryCount);
+ if (EntryDelta) {
+ Callee->setEntryCount(NewEntryCount);
for (BasicBlock &BB : *Callee)
// No need to update the callsite if it is pruned during inlining.
if (!VMap || VMap->count(&BB))
for (Instruction &I : BB)
if (CallInst *CI = dyn_cast<CallInst>(&I))
- CI->updateProfWeight(newEntryCount, priorEntryCount);
+ CI->updateProfWeight(NewEntryCount, PriorEntryCount);
}
}
@@ -1672,66 +1663,69 @@ void llvm::updateProfileCallee(
/// 3. Otherwise, a call to objc_retain is inserted if the call in the caller is
/// a retainRV call.
static void
-inlineRetainOrClaimRVCalls(CallBase &CB,
+inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind,
const SmallVectorImpl<ReturnInst *> &Returns) {
Module *Mod = CB.getModule();
- bool IsRetainRV = objcarc::hasAttachedCallOpBundle(&CB, true),
+ assert(objcarc::isRetainOrClaimRV(RVCallKind) && "unexpected ARC function");
+ bool IsRetainRV = RVCallKind == objcarc::ARCInstKind::RetainRV,
IsClaimRV = !IsRetainRV;
for (auto *RI : Returns) {
Value *RetOpnd = objcarc::GetRCIdentityRoot(RI->getOperand(0));
- BasicBlock::reverse_iterator I = ++(RI->getIterator().getReverse());
- BasicBlock::reverse_iterator EI = RI->getParent()->rend();
bool InsertRetainCall = IsRetainRV;
IRBuilder<> Builder(RI->getContext());
// Walk backwards through the basic block looking for either a matching
// autoreleaseRV call or an unannotated call.
- for (; I != EI;) {
- auto CurI = I++;
-
+ auto InstRange = llvm::make_range(++(RI->getIterator().getReverse()),
+ RI->getParent()->rend());
+ for (Instruction &I : llvm::make_early_inc_range(InstRange)) {
// Ignore casts.
- if (isa<CastInst>(*CurI))
+ if (isa<CastInst>(I))
continue;
- if (auto *II = dyn_cast<IntrinsicInst>(&*CurI)) {
- if (II->getIntrinsicID() == Intrinsic::objc_autoreleaseReturnValue &&
- II->hasNUses(0) &&
- objcarc::GetRCIdentityRoot(II->getOperand(0)) == RetOpnd) {
- // If we've found a matching authoreleaseRV call:
- // - If claimRV is attached to the call, insert a call to objc_release
- // and erase the autoreleaseRV call.
- // - If retainRV is attached to the call, just erase the autoreleaseRV
- // call.
- if (IsClaimRV) {
- Builder.SetInsertPoint(II);
- Function *IFn =
- Intrinsic::getDeclaration(Mod, Intrinsic::objc_release);
- Value *BC =
- Builder.CreateBitCast(RetOpnd, IFn->getArg(0)->getType());
- Builder.CreateCall(IFn, BC, "");
- }
- II->eraseFromParent();
- InsertRetainCall = false;
- }
- } else if (auto *CI = dyn_cast<CallInst>(&*CurI)) {
- if (objcarc::GetRCIdentityRoot(CI) == RetOpnd &&
- !objcarc::hasAttachedCallOpBundle(CI)) {
- // If we've found an unannotated call that defines RetOpnd, add a
- // "clang.arc.attachedcall" operand bundle.
- Value *BundleArgs[] = {ConstantInt::get(
- Builder.getInt64Ty(),
- objcarc::getAttachedCallOperandBundleEnum(IsRetainRV))};
- OperandBundleDef OB("clang.arc.attachedcall", BundleArgs);
- auto *NewCall = CallBase::addOperandBundle(
- CI, LLVMContext::OB_clang_arc_attachedcall, OB, CI);
- NewCall->copyMetadata(*CI);
- CI->replaceAllUsesWith(NewCall);
- CI->eraseFromParent();
- InsertRetainCall = false;
+ if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ if (II->getIntrinsicID() != Intrinsic::objc_autoreleaseReturnValue ||
+ !II->hasNUses(0) ||
+ objcarc::GetRCIdentityRoot(II->getOperand(0)) != RetOpnd)
+ break;
+
+ // If we've found a matching autoreleaseRV call:
+ // - If claimRV is attached to the call, insert a call to objc_release
+ // and erase the autoreleaseRV call.
+ // - If retainRV is attached to the call, just erase the autoreleaseRV
+ // call.
+ if (IsClaimRV) {
+ Builder.SetInsertPoint(II);
+ Function *IFn =
+ Intrinsic::getDeclaration(Mod, Intrinsic::objc_release);
+ Value *BC = Builder.CreateBitCast(RetOpnd, IFn->getArg(0)->getType());
+ Builder.CreateCall(IFn, BC, "");
}
+ II->eraseFromParent();
+ InsertRetainCall = false;
+ break;
}
+ auto *CI = dyn_cast<CallInst>(&I);
+
+ if (!CI)
+ break;
+
+ if (objcarc::GetRCIdentityRoot(CI) != RetOpnd ||
+ objcarc::hasAttachedCallOpBundle(CI))
+ break;
+
+ // If we've found an unannotated call that defines RetOpnd, add a
+ // "clang.arc.attachedcall" operand bundle.
+ Value *BundleArgs[] = {*objcarc::getAttachedARCFunction(&CB)};
+ OperandBundleDef OB("clang.arc.attachedcall", BundleArgs);
+ auto *NewCall = CallBase::addOperandBundle(
+ CI, LLVMContext::OB_clang_arc_attachedcall, OB, CI);
+ NewCall->copyMetadata(*CI);
+ CI->replaceAllUsesWith(NewCall);
+ CI->eraseFromParent();
+ InsertRetainCall = false;
break;
}
@@ -1895,8 +1889,13 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
{ // Scope to destroy VMap after cloning.
ValueToValueMapTy VMap;
+ struct ByValInit {
+ Value *Dst;
+ Value *Src;
+ Type *Ty;
+ };
// Keep a list of pair (dst, src) to emit byval initializations.
- SmallVector<std::pair<Value*, Value*>, 4> ByValInit;
+ SmallVector<ByValInit, 4> ByValInits;
// When inlining a function that contains noalias scope metadata,
// this metadata needs to be cloned so that the inlined blocks
@@ -1921,10 +1920,12 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
// or readnone, because the copy would be unneeded: the callee doesn't
// modify the struct.
if (CB.isByValArgument(ArgNo)) {
- ActualArg = HandleByValArgument(ActualArg, &CB, CalledFunc, IFI,
+ ActualArg = HandleByValArgument(CB.getParamByValType(ArgNo), ActualArg,
+ &CB, CalledFunc, IFI,
CalledFunc->getParamAlignment(ArgNo));
if (ActualArg != *AI)
- ByValInit.push_back(std::make_pair(ActualArg, (Value*) *AI));
+ ByValInits.push_back(
+ {ActualArg, (Value *)*AI, CB.getParamByValType(ArgNo)});
}
VMap[&*I] = ActualArg;
@@ -1953,8 +1954,9 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
FirstNewBlock = LastBlock; ++FirstNewBlock;
// Insert retainRV/claimRV runtime calls.
- if (objcarc::hasAttachedCallOpBundle(&CB))
- inlineRetainOrClaimRVCalls(CB, Returns);
+ objcarc::ARCInstKind RVCallKind = objcarc::getAttachedARCFunctionKind(&CB);
+ if (RVCallKind != objcarc::ARCInstKind::None)
+ inlineRetainOrClaimRVCalls(CB, RVCallKind, Returns);
// Updated caller/callee profiles only when requested. For sample loader
// inlining, the context-sensitive inlinee profile doesn't need to be
@@ -1966,13 +1968,14 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI,
CalledFunc->front());
- updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), CB,
- IFI.PSI, IFI.CallerBFI);
+ if (auto Profile = CalledFunc->getEntryCount())
+ updateCallProfile(CalledFunc, VMap, *Profile, CB, IFI.PSI,
+ IFI.CallerBFI);
}
// Inject byval arguments initialization.
- for (std::pair<Value*, Value*> &Init : ByValInit)
- HandleByValArgumentInit(Init.first, Init.second, Caller->getParent(),
+ for (ByValInit &Init : ByValInits)
+ HandleByValArgumentInit(Init.Ty, Init.Dst, Init.Src, Caller->getParent(),
&*FirstNewBlock, IFI);
Optional<OperandBundleUse> ParentDeopt =
@@ -2100,9 +2103,9 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
SmallVector<Value*,4> VarArgsToForward;
SmallVector<AttributeSet, 4> VarArgsAttrs;
for (unsigned i = CalledFunc->getFunctionType()->getNumParams();
- i < CB.getNumArgOperands(); i++) {
+ i < CB.arg_size(); i++) {
VarArgsToForward.push_back(CB.getArgOperand(i));
- VarArgsAttrs.push_back(CB.getAttributes().getParamAttributes(i));
+ VarArgsAttrs.push_back(CB.getAttributes().getParamAttrs(i));
}
bool InlinedMustTailCalls = false, InlinedDeoptimizeCalls = false;
@@ -2117,8 +2120,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E;
++BB) {
- for (auto II = BB->begin(); II != BB->end();) {
- Instruction &I = *II++;
+ for (Instruction &I : llvm::make_early_inc_range(*BB)) {
CallInst *CI = dyn_cast<CallInst>(&I);
if (!CI)
continue;
@@ -2135,15 +2137,15 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
if (!Attrs.isEmpty() || !VarArgsAttrs.empty()) {
for (unsigned ArgNo = 0;
ArgNo < CI->getFunctionType()->getNumParams(); ++ArgNo)
- ArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));
+ ArgAttrs.push_back(Attrs.getParamAttrs(ArgNo));
}
// Add VarArg attributes.
ArgAttrs.append(VarArgsAttrs.begin(), VarArgsAttrs.end());
- Attrs = AttributeList::get(CI->getContext(), Attrs.getFnAttributes(),
- Attrs.getRetAttributes(), ArgAttrs);
+ Attrs = AttributeList::get(CI->getContext(), Attrs.getFnAttrs(),
+ Attrs.getRetAttrs(), ArgAttrs);
// Add VarArgs to existing parameters.
- SmallVector<Value *, 6> Params(CI->arg_operands());
+ SmallVector<Value *, 6> Params(CI->args());
Params.append(VarArgsToForward.begin(), VarArgsToForward.end());
CallInst *NewCI = CallInst::Create(
CI->getFunctionType(), CI->getCalledOperand(), Params, "", CI);
@@ -2295,8 +2297,8 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
BB != E; ++BB) {
// Add bundle operands to any top-level call sites.
SmallVector<OperandBundleDef, 1> OpBundles;
- for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;) {
- CallBase *I = dyn_cast<CallBase>(&*BBI++);
+ for (Instruction &II : llvm::make_early_inc_range(*BB)) {
+ CallBase *I = dyn_cast<CallBase>(&II);
if (!I)
continue;
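Most of the iterator churn in this file follows one idiom: llvm::make_early_inc_range advances past the current instruction before the loop body runs, so the body may erase or replace it. A minimal sketch (removeDebugIntrinsics is a hypothetical helper, not part of the patch):

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IntrinsicInst.h"

static void removeDebugIntrinsics(llvm::BasicBlock &BB) {
  for (llvm::Instruction &I : llvm::make_early_inc_range(BB))
    // Erasing I is safe here because the underlying iterator has already
    // moved on to the next instruction.
    if (llvm::isa<llvm::DbgInfoIntrinsic>(I))
      I.eraseFromParent();
}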
diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp
index 277fd903e9aa..668626fef933 100644
--- a/llvm/lib/Transforms/Utils/LCSSA.cpp
+++ b/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -309,7 +309,7 @@ static void computeBlocksDominatingExits(
// worklist, unless we visited it already.
BasicBlock *IDomBB = DT.getNode(BB)->getIDom()->getBlock();
- // Exit blocks can have an immediate dominator not beloinging to the
+ // Exit blocks can have an immediate dominator not belonging to the
// loop. For an exit block to be immediately dominated by another block
// outside the loop, it implies not all paths from that dominator, to the
// exit block, go through the loop.
diff --git a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
index 7e5832148bc0..6958a89f5be6 100644
--- a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
+++ b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
@@ -304,7 +304,7 @@ void LibCallsShrinkWrap::checkCandidate(CallInst &CI) {
if (!TLI.getLibFunc(*Callee, Func) || !TLI.has(Func))
return;
- if (CI.getNumArgOperands() == 0)
+ if (CI.arg_empty())
return;
// TODO: Handle long double in other formats.
Type *ArgType = CI.getArgOperand(0)->getType();
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index d03d76f57ca1..74ab37fadf36 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -1413,8 +1413,6 @@ static bool valueCoversEntireFragment(Type *ValTy, DbgVariableIntrinsic *DII) {
if (auto *AI =
dyn_cast_or_null<AllocaInst>(DII->getVariableLocationOp(0))) {
if (Optional<TypeSize> FragmentSize = AI->getAllocationSizeInBits(DL)) {
- assert(ValueSize.isScalable() == FragmentSize->isScalable() &&
- "Both sizes should agree on the scalable flag.");
return TypeSize::isKnownGE(ValueSize, *FragmentSize);
}
}
@@ -1733,9 +1731,11 @@ void llvm::salvageDebugInfo(Instruction &I) {
void llvm::salvageDebugInfoForDbgValues(
Instruction &I, ArrayRef<DbgVariableIntrinsic *> DbgUsers) {
- // This is an arbitrary chosen limit on the maximum number of values we can
- // salvage up to in a DIArgList, used for performance reasons.
+ // These are arbitrary chosen limits on the maximum number of values and the
+ // maximum size of a debug expression we can salvage up to, used for
+ // performance reasons.
const unsigned MaxDebugArgs = 16;
+ const unsigned MaxExpressionSize = 128;
bool Salvaged = false;
for (auto *DII : DbgUsers) {
@@ -1752,23 +1752,30 @@ void llvm::salvageDebugInfoForDbgValues(
// must be updated in the DIExpression and potentially have additional
// values added; thus we call salvageDebugInfoImpl for each `I` instance in
// DIILocation.
+ Value *Op0 = nullptr;
DIExpression *SalvagedExpr = DII->getExpression();
auto LocItr = find(DIILocation, &I);
while (SalvagedExpr && LocItr != DIILocation.end()) {
+ SmallVector<uint64_t, 16> Ops;
unsigned LocNo = std::distance(DIILocation.begin(), LocItr);
- SalvagedExpr = salvageDebugInfoImpl(I, SalvagedExpr, StackValue, LocNo,
- AdditionalValues);
+ uint64_t CurrentLocOps = SalvagedExpr->getNumLocationOperands();
+ Op0 = salvageDebugInfoImpl(I, CurrentLocOps, Ops, AdditionalValues);
+ if (!Op0)
+ break;
+ SalvagedExpr =
+ DIExpression::appendOpsToArg(SalvagedExpr, Ops, LocNo, StackValue);
LocItr = std::find(++LocItr, DIILocation.end(), &I);
}
// salvageDebugInfoImpl should fail on examining the first element of
// DbgUsers, or none of them.
- if (!SalvagedExpr)
+ if (!Op0)
break;
- DII->replaceVariableLocationOp(&I, I.getOperand(0));
- if (AdditionalValues.empty()) {
+ DII->replaceVariableLocationOp(&I, Op0);
+ bool IsValidSalvageExpr = SalvagedExpr->getNumElements() <= MaxExpressionSize;
+ if (AdditionalValues.empty() && IsValidSalvageExpr) {
DII->setExpression(SalvagedExpr);
- } else if (isa<DbgValueInst>(DII) &&
+ } else if (isa<DbgValueInst>(DII) && IsValidSalvageExpr &&
DII->getNumVariableLocationOps() + AdditionalValues.size() <=
MaxDebugArgs) {
DII->addVariableLocationOps(AdditionalValues, SalvagedExpr);
@@ -1793,16 +1800,16 @@ void llvm::salvageDebugInfoForDbgValues(
}
}
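A standalone sketch (plain C++, helper name and parameters are illustrative, not the LLVM API) of the acceptance checks added above: a salvaged expression is only committed if it stays under the new size cap, and extra location operands may only be attached to dbg.value and only up to the operand cap.

#include <cstddef>

constexpr size_t MaxDebugArgs = 16;       // cap on location operands
constexpr size_t MaxExpressionSize = 128; // new cap on DIExpression elements

// Decide whether a salvaged expression may be committed to the intrinsic.
bool canCommitSalvagedExpr(size_t ExprElements, size_t CurrentLocationOps,
                           size_t AdditionalValues, bool IsDbgValue) {
  if (ExprElements > MaxExpressionSize)
    return false;                          // expression grew too large
  if (AdditionalValues == 0)
    return true;                           // in-place update is always fine
  // Extra operands can only be attached to dbg.value, and only up to the cap.
  return IsDbgValue &&
         CurrentLocationOps + AdditionalValues <= MaxDebugArgs;
}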
-bool getSalvageOpsForGEP(GetElementPtrInst *GEP, const DataLayout &DL,
- uint64_t CurrentLocOps,
- SmallVectorImpl<uint64_t> &Opcodes,
- SmallVectorImpl<Value *> &AdditionalValues) {
+Value *getSalvageOpsForGEP(GetElementPtrInst *GEP, const DataLayout &DL,
+ uint64_t CurrentLocOps,
+ SmallVectorImpl<uint64_t> &Opcodes,
+ SmallVectorImpl<Value *> &AdditionalValues) {
unsigned BitWidth = DL.getIndexSizeInBits(GEP->getPointerAddressSpace());
// Rewrite a GEP into a DIExpression.
MapVector<Value *, APInt> VariableOffsets;
APInt ConstantOffset(BitWidth, 0);
if (!GEP->collectOffset(DL, BitWidth, VariableOffsets, ConstantOffset))
- return false;
+ return nullptr;
if (!VariableOffsets.empty() && !CurrentLocOps) {
Opcodes.insert(Opcodes.begin(), {dwarf::DW_OP_LLVM_arg, 0});
CurrentLocOps = 1;
@@ -1816,7 +1823,7 @@ bool getSalvageOpsForGEP(GetElementPtrInst *GEP, const DataLayout &DL,
dwarf::DW_OP_plus});
}
DIExpression::appendOffset(Opcodes, ConstantOffset.getSExtValue());
- return true;
+ return GEP->getOperand(0);
}
uint64_t getDwarfOpForBinOp(Instruction::BinaryOps Opcode) {
@@ -1849,14 +1856,14 @@ uint64_t getDwarfOpForBinOp(Instruction::BinaryOps Opcode) {
}
}
-bool getSalvageOpsForBinOp(BinaryOperator *BI, uint64_t CurrentLocOps,
- SmallVectorImpl<uint64_t> &Opcodes,
- SmallVectorImpl<Value *> &AdditionalValues) {
+Value *getSalvageOpsForBinOp(BinaryOperator *BI, uint64_t CurrentLocOps,
+ SmallVectorImpl<uint64_t> &Opcodes,
+ SmallVectorImpl<Value *> &AdditionalValues) {
// Handle binary operations with constant integer operands as a special case.
auto *ConstInt = dyn_cast<ConstantInt>(BI->getOperand(1));
// Values wider than 64 bits cannot be represented within a DIExpression.
if (ConstInt && ConstInt->getBitWidth() > 64)
- return false;
+ return nullptr;
Instruction::BinaryOps BinOpcode = BI->getOpcode();
// Push any Constant Int operand onto the expression stack.
@@ -1867,7 +1874,7 @@ bool getSalvageOpsForBinOp(BinaryOperator *BI, uint64_t CurrentLocOps,
if (BinOpcode == Instruction::Add || BinOpcode == Instruction::Sub) {
uint64_t Offset = BinOpcode == Instruction::Add ? Val : -int64_t(Val);
DIExpression::appendOffset(Opcodes, Offset);
- return true;
+ return BI->getOperand(0);
}
Opcodes.append({dwarf::DW_OP_constu, Val});
} else {
@@ -1883,62 +1890,51 @@ bool getSalvageOpsForBinOp(BinaryOperator *BI, uint64_t CurrentLocOps,
// representation in a DIExpression.
uint64_t DwarfBinOp = getDwarfOpForBinOp(BinOpcode);
if (!DwarfBinOp)
- return false;
+ return nullptr;
Opcodes.push_back(DwarfBinOp);
-
- return true;
+ return BI->getOperand(0);
}
-DIExpression *
-llvm::salvageDebugInfoImpl(Instruction &I, DIExpression *SrcDIExpr,
- bool WithStackValue, unsigned LocNo,
- SmallVectorImpl<Value *> &AdditionalValues) {
- uint64_t CurrentLocOps = SrcDIExpr->getNumLocationOperands();
+Value *llvm::salvageDebugInfoImpl(Instruction &I, uint64_t CurrentLocOps,
+ SmallVectorImpl<uint64_t> &Ops,
+ SmallVectorImpl<Value *> &AdditionalValues) {
auto &M = *I.getModule();
auto &DL = M.getDataLayout();
- // Apply a vector of opcodes to the source DIExpression.
- auto doSalvage = [&](SmallVectorImpl<uint64_t> &Ops) -> DIExpression * {
- DIExpression *DIExpr = SrcDIExpr;
- if (!Ops.empty()) {
- DIExpr = DIExpression::appendOpsToArg(DIExpr, Ops, LocNo, WithStackValue);
- }
- return DIExpr;
- };
-
- // initializer-list helper for applying operators to the source DIExpression.
- auto applyOps = [&](ArrayRef<uint64_t> Opcodes) {
- SmallVector<uint64_t, 8> Ops(Opcodes.begin(), Opcodes.end());
- return doSalvage(Ops);
- };
-
if (auto *CI = dyn_cast<CastInst>(&I)) {
+ Value *FromValue = CI->getOperand(0);
// No-op casts are irrelevant for debug info.
- if (CI->isNoopCast(DL))
- return SrcDIExpr;
+ if (CI->isNoopCast(DL)) {
+ return FromValue;
+ }
Type *Type = CI->getType();
+ if (Type->isPointerTy())
+ Type = DL.getIntPtrType(Type);
// Casts other than Trunc, SExt, or ZExt to scalar types cannot be salvaged.
if (Type->isVectorTy() ||
- !(isa<TruncInst>(&I) || isa<SExtInst>(&I) || isa<ZExtInst>(&I)))
+ !(isa<TruncInst>(&I) || isa<SExtInst>(&I) || isa<ZExtInst>(&I) ||
+ isa<IntToPtrInst>(&I) || isa<PtrToIntInst>(&I)))
return nullptr;
- Value *FromValue = CI->getOperand(0);
- unsigned FromTypeBitSize = FromValue->getType()->getScalarSizeInBits();
+ llvm::Type *FromType = FromValue->getType();
+ if (FromType->isPointerTy())
+ FromType = DL.getIntPtrType(FromType);
+
+ unsigned FromTypeBitSize = FromType->getScalarSizeInBits();
unsigned ToTypeBitSize = Type->getScalarSizeInBits();
- return applyOps(DIExpression::getExtOps(FromTypeBitSize, ToTypeBitSize,
- isa<SExtInst>(&I)));
+ auto ExtOps = DIExpression::getExtOps(FromTypeBitSize, ToTypeBitSize,
+ isa<SExtInst>(&I));
+ Ops.append(ExtOps.begin(), ExtOps.end());
+ return FromValue;
}
- SmallVector<uint64_t, 8> Ops;
- if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
- if (getSalvageOpsForGEP(GEP, DL, CurrentLocOps, Ops, AdditionalValues))
- return doSalvage(Ops);
- } else if (auto *BI = dyn_cast<BinaryOperator>(&I)) {
- if (getSalvageOpsForBinOp(BI, CurrentLocOps, Ops, AdditionalValues))
- return doSalvage(Ops);
- }
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(&I))
+ return getSalvageOpsForGEP(GEP, DL, CurrentLocOps, Ops, AdditionalValues);
+ if (auto *BI = dyn_cast<BinaryOperator>(&I))
+ return getSalvageOpsForBinOp(BI, CurrentLocOps, Ops, AdditionalValues);
+
// *Not* to do: we should not attempt to salvage load instructions,
// because the validity and lifetime of a dbg.value containing
// DW_OP_deref becomes difficult to analyze. See PR40628 for examples.
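A standalone sketch (plain C++) of how a binary operation with a constant operand is lowered into DWARF expression ops by the rewritten helpers above; the real helpers additionally return the remaining operand (GEP pointer / binop LHS) that the debug location is rewritten to. Only the constant-RHS path is shown, and the numeric DW_OP_* values are placeholders assumed to match the standard DWARF encoding.

#include <cassert>
#include <cstdint>
#include <vector>

enum class BinOp { Add, Sub, Mul };

// Placeholder DWARF opcodes (assumed to match dwarf::DW_OP_* constants).
constexpr uint64_t DW_OP_constu = 0x10, DW_OP_minus = 0x1c, DW_OP_mul = 0x1e,
                   DW_OP_plus_uconst = 0x23;

// Append ops describing "LHS <op> ConstRHS"; returns false if unsupported.
// (The patch folds a constant add/sub into a single offset; a variable RHS is
// handled separately via DW_OP_LLVM_arg and is not sketched here.)
bool salvageBinOpWithConst(BinOp Op, uint64_t ConstRHS,
                           std::vector<uint64_t> &Ops) {
  if (Op == BinOp::Add) {
    Ops.insert(Ops.end(), {DW_OP_plus_uconst, ConstRHS});
    return true;
  }
  Ops.insert(Ops.end(), {DW_OP_constu, ConstRHS});
  switch (Op) {
  case BinOp::Sub: Ops.push_back(DW_OP_minus); return true;
  case BinOp::Mul: Ops.push_back(DW_OP_mul);   return true;
  default:         return false;
  }
}

int main() {
  std::vector<uint64_t> Ops;
  assert(salvageBinOpWithConst(BinOp::Add, 8, Ops));
  assert(Ops.size() == 2 && Ops[0] == DW_OP_plus_uconst && Ops[1] == 8);
  return 0;
}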
@@ -2194,6 +2190,26 @@ void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) {
DTU->applyUpdates({{DominatorTree::Delete, BB, UnwindDestBB}});
}
+void llvm::createUnreachableSwitchDefault(SwitchInst *Switch,
+ DomTreeUpdater *DTU) {
+ LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
+ auto *BB = Switch->getParent();
+ auto *OrigDefaultBlock = Switch->getDefaultDest();
+ OrigDefaultBlock->removePredecessor(BB);
+ BasicBlock *NewDefaultBlock = BasicBlock::Create(
+ BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(),
+ OrigDefaultBlock);
+ new UnreachableInst(Switch->getContext(), NewDefaultBlock);
+ Switch->setDefaultDest(&*NewDefaultBlock);
+ if (DTU) {
+ SmallVector<DominatorTree::UpdateType, 2> Updates;
+ Updates.push_back({DominatorTree::Insert, BB, &*NewDefaultBlock});
+ if (!is_contained(successors(BB), OrigDefaultBlock))
+ Updates.push_back({DominatorTree::Delete, BB, &*OrigDefaultBlock});
+ DTU->applyUpdates(Updates);
+ }
+}
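A minimal standalone sketch (plain C++, toy block names, not LLVM code) of why the Delete update above is guarded by is_contained: another case label of the switch may still target the original default block, in which case the CFG edge from the switch block survives and the dominator tree edge must be kept.

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

int main() {
  // Successors of the switch block after the default was retargeted to the
  // new unreachable block; here a case label still jumps to the old default.
  std::vector<std::string> Succs = {"case.1", "old.default",
                                    "sw.unreachabledefault"};
  bool OldDefaultStillSucc =
      std::find(Succs.begin(), Succs.end(), "old.default") != Succs.end();
  // Emit {Delete, BB, old.default} only when the old default is gone.
  assert(OldDefaultStillSucc && "keep the dominator tree edge in this case");
  return 0;
}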
+
BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,
BasicBlock *UnwindEdge,
DomTreeUpdater *DTU) {
@@ -2669,9 +2685,7 @@ static unsigned replaceDominatedUsesWith(Value *From, Value *To,
assert(From->getType() == To->getType());
unsigned Count = 0;
- for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
- UI != UE;) {
- Use &U = *UI++;
+ for (Use &U : llvm::make_early_inc_range(From->uses())) {
if (!Dominates(Root, U))
continue;
U.set(To);
@@ -2687,9 +2701,7 @@ unsigned llvm::replaceNonLocalUsesWith(Instruction *From, Value *To) {
auto *BB = From->getParent();
unsigned Count = 0;
- for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
- UI != UE;) {
- Use &U = *UI++;
+ for (Use &U : llvm::make_early_inc_range(From->uses())) {
auto *I = cast<Instruction>(U.getUser());
if (I->getParent() == BB)
continue;
@@ -3171,7 +3183,7 @@ bool llvm::recognizeBSwapOrBitReverseIdiom(
// Now, is the bit permutation correct for a bswap or a bitreverse? We can
// only byteswap values with an even number of bytes.
- APInt DemandedMask = APInt::getAllOnesValue(DemandedBW);
+ APInt DemandedMask = APInt::getAllOnes(DemandedBW);
bool OKForBSwap = MatchBSwaps && (DemandedBW % 16) == 0;
bool OKForBitReverse = MatchBitReversals;
for (unsigned BitIdx = 0;
@@ -3208,7 +3220,7 @@ bool llvm::recognizeBSwapOrBitReverseIdiom(
Instruction *Result = CallInst::Create(F, Provider, "rev", I);
InsertedInsts.push_back(Result);
- if (!DemandedMask.isAllOnesValue()) {
+ if (!DemandedMask.isAllOnes()) {
auto *Mask = ConstantInt::get(DemandedTy, DemandedMask);
Result = BinaryOperator::Create(Instruction::And, Result, Mask, "mask", I);
InsertedInsts.push_back(Result);
@@ -3235,7 +3247,7 @@ void llvm::maybeMarkSanitizerLibraryCallNoBuiltin(
if (F && !F->hasLocalLinkage() && F->hasName() &&
TLI->getLibFunc(F->getName(), Func) && TLI->hasOptimizedCodeGen(Func) &&
!F->doesNotAccessMemory())
- CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoBuiltin);
+ CI->addFnAttr(Attribute::NoBuiltin);
}
bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
@@ -3263,7 +3275,7 @@ bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
if (CB.isBundleOperand(OpIdx))
return false;
- if (OpIdx < CB.getNumArgOperands()) {
+ if (OpIdx < CB.arg_size()) {
// Some variadic intrinsics require constants in the variadic arguments,
// which currently aren't markable as immarg.
if (isa<IntrinsicInst>(CB) &&
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index cd1f6f0c78a5..f3cf42be8ba1 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -14,6 +14,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -73,57 +74,39 @@ static cl::opt<unsigned> UnrollForcePeelCount(
"unroll-force-peel-count", cl::init(0), cl::Hidden,
cl::desc("Force a peel count regardless of profiling information."));
-static cl::opt<bool> UnrollPeelMultiDeoptExit(
- "unroll-peel-multi-deopt-exit", cl::init(true), cl::Hidden,
- cl::desc("Allow peeling of loops with multiple deopt exits."));
-
static const char *PeeledCountMetaData = "llvm.loop.peeled.count";
-// Designates that a Phi is estimated to become invariant after an "infinite"
-// number of loop iterations (i.e. only may become an invariant if the loop is
-// fully unrolled).
-static const unsigned InfiniteIterationsToInvariance =
- std::numeric_limits<unsigned>::max();
-
// Check whether we are capable of peeling this loop.
bool llvm::canPeel(Loop *L) {
// Make sure the loop is in simplified form
if (!L->isLoopSimplifyForm())
return false;
- if (UnrollPeelMultiDeoptExit) {
- SmallVector<BasicBlock *, 4> Exits;
- L->getUniqueNonLatchExitBlocks(Exits);
-
- if (!Exits.empty()) {
- // Latch's terminator is a conditional branch, Latch is exiting and
- // all non Latch exits ends up with deoptimize.
- const BasicBlock *Latch = L->getLoopLatch();
- const BranchInst *T = dyn_cast<BranchInst>(Latch->getTerminator());
- return T && T->isConditional() && L->isLoopExiting(Latch) &&
- all_of(Exits, [](const BasicBlock *BB) {
- return BB->getTerminatingDeoptimizeCall();
- });
- }
- }
-
- // Only peel loops that contain a single exit
- if (!L->getExitingBlock() || !L->getUniqueExitBlock())
- return false;
-
// Don't try to peel loops where the latch is not the exiting block.
// This can be an indication of two different things:
// 1) The loop is not rotated.
// 2) The loop contains irreducible control flow that involves the latch.
const BasicBlock *Latch = L->getLoopLatch();
- if (Latch != L->getExitingBlock())
+ if (!L->isLoopExiting(Latch))
return false;
// Peeling is only supported if the latch is a branch.
if (!isa<BranchInst>(Latch->getTerminator()))
return false;
- return true;
+ SmallVector<BasicBlock *, 4> Exits;
+ L->getUniqueNonLatchExitBlocks(Exits);
+ // The latch must either be the only exiting block, or every non-latch exit
+ // block must end in a deopt or unreachable terminator, or lead through a
+ // chain of blocks to one that does. Both
+ // deopt and unreachable terminators are a strong indication they are not
+ // taken. Note that this is a profitability check, not a legality check. Also
+ // note that LoopPeeling currently can only update the branch weights of latch
+ // blocks and branch weights to blocks with deopt or unreachable do not need
+ // updating.
+ return all_of(Exits, [](const BasicBlock *BB) {
+ return IsBlockFollowedByDeoptOrUnreachable(BB);
+ });
}
// This function calculates the number of iterations after which the given Phi
@@ -139,9 +122,9 @@ bool llvm::canPeel(Loop *L) {
// %x = phi(0, %a), <-- becomes invariant starting from 3rd iteration.
// %y = phi(0, 5),
// %a = %y + 1.
-static unsigned calculateIterationsToInvariance(
+static Optional<unsigned> calculateIterationsToInvariance(
PHINode *Phi, Loop *L, BasicBlock *BackEdge,
- SmallDenseMap<PHINode *, unsigned> &IterationsToInvariance) {
+ SmallDenseMap<PHINode *, Optional<unsigned> > &IterationsToInvariance) {
assert(Phi->getParent() == L->getHeader() &&
"Non-loop Phi should not be checked for turning into invariant.");
assert(BackEdge == L->getLoopLatch() && "Wrong latch?");
@@ -154,29 +137,90 @@ static unsigned calculateIterationsToInvariance(
Value *Input = Phi->getIncomingValueForBlock(BackEdge);
// Place infinity to map to avoid infinite recursion for cycled Phis. Such
// cycles can never stop on an invariant.
- IterationsToInvariance[Phi] = InfiniteIterationsToInvariance;
- unsigned ToInvariance = InfiniteIterationsToInvariance;
+ IterationsToInvariance[Phi] = None;
+ Optional<unsigned> ToInvariance = None;
if (L->isLoopInvariant(Input))
ToInvariance = 1u;
else if (PHINode *IncPhi = dyn_cast<PHINode>(Input)) {
// Only consider Phis in header block.
if (IncPhi->getParent() != L->getHeader())
- return InfiniteIterationsToInvariance;
+ return None;
// If the input becomes an invariant after X iterations, then our Phi
// becomes an invariant after X + 1 iterations.
- unsigned InputToInvariance = calculateIterationsToInvariance(
+ auto InputToInvariance = calculateIterationsToInvariance(
IncPhi, L, BackEdge, IterationsToInvariance);
- if (InputToInvariance != InfiniteIterationsToInvariance)
- ToInvariance = InputToInvariance + 1u;
+ if (InputToInvariance)
+ ToInvariance = *InputToInvariance + 1u;
}
// If we found that this Phi lies in an invariant chain, update the map.
- if (ToInvariance != InfiniteIterationsToInvariance)
+ if (ToInvariance)
IterationsToInvariance[Phi] = ToInvariance;
return ToInvariance;
}
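A standalone sketch (plain C++, toy phi graph instead of IR) of the Optional-based memoization just introduced: nullopt stands for "never becomes invariant" and also doubles as the cycle breaker that InfiniteIterationsToInvariance used to provide.

#include <cassert>
#include <map>
#include <optional>

struct Phi { Phi *Incoming; bool IncomingIsInvariant; };

std::optional<unsigned>
iterationsToInvariance(Phi *P, std::map<Phi *, std::optional<unsigned>> &Memo) {
  auto It = Memo.find(P);
  if (It != Memo.end())
    return It->second;
  Memo[P] = std::nullopt;            // break cycles: assume "never" first
  std::optional<unsigned> Result;
  if (P->IncomingIsInvariant)
    Result = 1u;                     // invariant input: one peeled iteration
  else if (P->Incoming) {
    if (auto Inner = iterationsToInvariance(P->Incoming, Memo))
      Result = *Inner + 1u;          // one more iteration than the feeding phi
  }
  if (Result)
    Memo[P] = Result;
  return Result;
}

int main() {
  Phi A{nullptr, /*invariant*/ true}; // invariant after 1 iteration
  Phi B{&A, false};                   // invariant after 2 iterations
  Phi C{&C, false};                   // self-cycle: never invariant
  std::map<Phi *, std::optional<unsigned>> Memo;
  assert(*iterationsToInvariance(&B, Memo) == 2);
  assert(!iterationsToInvariance(&C, Memo));
  return 0;
}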
+// Try to find any invariant memory reads that will become dereferenceable in
+// the remainder loop after peeling. The load must also be used (transitively)
+// by an exit condition. Returns the number of iterations to peel off (at the
+// moment either 0 or 1).
+static unsigned peelToTurnInvariantLoadsDerefencebale(Loop &L,
+ DominatorTree &DT) {
+ // Skip loops with a single exiting block, because there should be no benefit
+ // for the heuristic below.
+ if (L.getExitingBlock())
+ return 0;
+
+ // All non-latch exit blocks must have an UnreachableInst terminator.
+ // Otherwise the heuristic below may not be profitable.
+ SmallVector<BasicBlock *, 4> Exits;
+ L.getUniqueNonLatchExitBlocks(Exits);
+ if (any_of(Exits, [](const BasicBlock *BB) {
+ return !isa<UnreachableInst>(BB->getTerminator());
+ }))
+ return 0;
+
+ // Now look for invariant loads that dominate the latch and are not known to
+ // be dereferenceable. If there are such loads and no writes, they will become
+ // dereferenceable in the loop if the first iteration is peeled off. Also
+ // collect the set of instructions controlled by such loads. Only peel if an
+ // exit condition uses (transitively) such a load.
+ BasicBlock *Header = L.getHeader();
+ BasicBlock *Latch = L.getLoopLatch();
+ SmallPtrSet<Value *, 8> LoadUsers;
+ const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
+ for (BasicBlock *BB : L.blocks()) {
+ for (Instruction &I : *BB) {
+ if (I.mayWriteToMemory())
+ return 0;
+
+ auto Iter = LoadUsers.find(&I);
+ if (Iter != LoadUsers.end()) {
+ for (Value *U : I.users())
+ LoadUsers.insert(U);
+ }
+ // Do not look for reads in the header; they can already be hoisted
+ // without peeling.
+ if (BB == Header)
+ continue;
+ if (auto *LI = dyn_cast<LoadInst>(&I)) {
+ Value *Ptr = LI->getPointerOperand();
+ if (DT.dominates(BB, Latch) && L.isLoopInvariant(Ptr) &&
+ !isDereferenceablePointer(Ptr, LI->getType(), DL, LI, &DT))
+ for (Value *U : I.users())
+ LoadUsers.insert(U);
+ }
+ }
+ }
+ SmallVector<BasicBlock *> ExitingBlocks;
+ L.getExitingBlocks(ExitingBlocks);
+ if (any_of(ExitingBlocks, [&LoadUsers](BasicBlock *Exiting) {
+ return LoadUsers.contains(Exiting->getTerminator());
+ }))
+ return 1;
+ return 0;
+}
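A standalone sketch (plain C++, toy def-use edges instead of IR) of the transitive user propagation in the heuristic above: seed the set with the invariant, not-provably-dereferenceable load and sweep forward in program order, so that an exit terminator depending on it yields a peel count of 1.

#include <cassert>
#include <set>
#include <vector>

int main() {
  // Users[i] = instructions that use instruction i (toy def-use edges).
  std::vector<std::vector<int>> Users = {
      /*0: load */ {1},
      /*1: add  */ {2},
      /*2: icmp */ {3},
      /*3: br   */ {},
  };
  std::set<int> LoadUsers = {0};            // seed: the invariant load
  for (int I = 0; I < (int)Users.size(); ++I)
    if (LoadUsers.count(I))
      for (int U : Users[I])
        LoadUsers.insert(U);                // propagate to transitive users
  int ExitCondTerminator = 3;
  assert(LoadUsers.count(ExitCondTerminator) && "exit depends on load: peel 1");
  return 0;
}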
+
// Return the number of iterations to peel off that make conditions in the
// body true/false. For example, if we peel 2 iterations off the loop below,
// the condition i < 2 can be evaluated at compile time.
@@ -292,8 +336,8 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
// Return the number of iterations we want to peel off.
void llvm::computePeelCount(Loop *L, unsigned LoopSize,
TargetTransformInfo::PeelingPreferences &PP,
- unsigned &TripCount, ScalarEvolution &SE,
- unsigned Threshold) {
+ unsigned &TripCount, DominatorTree &DT,
+ ScalarEvolution &SE, unsigned Threshold) {
assert(LoopSize > 0 && "Zero loop size is not allowed!");
// Save the PP.PeelCount value set by the target in
// TTI.getPeelingPreferences or by the flag -unroll-peel-count.
@@ -337,7 +381,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
// First, check that we can peel at least one iteration.
if (2 * LoopSize <= Threshold && UnrollPeelMaxCount > 0) {
// Store the pre-calculated values here.
- SmallDenseMap<PHINode *, unsigned> IterationsToInvariance;
+ SmallDenseMap<PHINode *, Optional<unsigned> > IterationsToInvariance;
// Now go through all Phis to calculate their the number of iterations they
// need to become invariants.
// Start the max computation with the UP.PeelCount value set by the target
@@ -347,10 +391,10 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
assert(BackEdge && "Loop is not in simplified form?");
for (auto BI = L->getHeader()->begin(); isa<PHINode>(&*BI); ++BI) {
PHINode *Phi = cast<PHINode>(&*BI);
- unsigned ToInvariance = calculateIterationsToInvariance(
+ auto ToInvariance = calculateIterationsToInvariance(
Phi, L, BackEdge, IterationsToInvariance);
- if (ToInvariance != InfiniteIterationsToInvariance)
- DesiredPeelCount = std::max(DesiredPeelCount, ToInvariance);
+ if (ToInvariance)
+ DesiredPeelCount = std::max(DesiredPeelCount, *ToInvariance);
}
// Pay respect to limitations implied by loop size and the max peel count.
@@ -360,6 +404,9 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
DesiredPeelCount = std::max(DesiredPeelCount,
countToEliminateCompares(*L, MaxPeelCount, SE));
+ if (DesiredPeelCount == 0)
+ DesiredPeelCount = peelToTurnInvariantLoadsDerefencebale(*L, DT);
+
if (DesiredPeelCount > 0) {
DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount);
// Consider max peel count limitation.
@@ -679,34 +726,27 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitEdges;
L->getExitEdges(ExitEdges);
- DenseMap<BasicBlock *, BasicBlock *> ExitIDom;
+ // Remember dominators of blocks we might reach through exits to change them
+ // later. The immediate dominator of such a block might change, because we
+ // add more routes that can lead to the exit: we can reach it from the peeled
+ // iterations too.
+ DenseMap<BasicBlock *, BasicBlock *> NonLoopBlocksIDom;
if (DT) {
- // We'd like to determine the idom of exit block after peeling one
- // iteration.
- // Let Exit is exit block.
- // Let ExitingSet - is a set of predecessors of Exit block. They are exiting
- // blocks.
- // Let Latch' and ExitingSet' are copies after a peeling.
- // We'd like to find an idom'(Exit) - idom of Exit after peeling.
- // It is an evident that idom'(Exit) will be the nearest common dominator
- // of ExitingSet and ExitingSet'.
- // idom(Exit) is a nearest common dominator of ExitingSet.
- // idom(Exit)' is a nearest common dominator of ExitingSet'.
- // Taking into account that we have a single Latch, Latch' will dominate
- // Header and idom(Exit).
- // So the idom'(Exit) is nearest common dominator of idom(Exit)' and Latch'.
- // All these basic blocks are in the same loop, so what we find is
- // (nearest common dominator of idom(Exit) and Latch)'.
- // In the loop below we remember nearest common dominator of idom(Exit) and
- // Latch to update idom of Exit later.
- assert(L->hasDedicatedExits() && "No dedicated exits?");
- for (auto Edge : ExitEdges) {
- if (ExitIDom.count(Edge.second))
- continue;
- BasicBlock *BB = DT->findNearestCommonDominator(
- DT->getNode(Edge.second)->getIDom()->getBlock(), Latch);
- assert(L->contains(BB) && "IDom is not in a loop");
- ExitIDom[Edge.second] = BB;
+ for (auto *BB : L->blocks()) {
+ auto *BBDomNode = DT->getNode(BB);
+ SmallVector<BasicBlock *, 16> ChildrenToUpdate;
+ for (auto *ChildDomNode : BBDomNode->children()) {
+ auto *ChildBB = ChildDomNode->getBlock();
+ if (!L->contains(ChildBB))
+ ChildrenToUpdate.push_back(ChildBB);
+ }
+ // The new idom of the block will be the nearest common dominator
+ // of all copies of the previous idom. This is equivalent to the
+ // nearest common dominator of the previous idom and the first latch,
+ // which dominates all copies of the previous idom.
+ BasicBlock *NewIDom = DT->findNearestCommonDominator(BB, Latch);
+ for (auto *ChildBB : ChildrenToUpdate)
+ NonLoopBlocksIDom[ChildBB] = NewIDom;
}
}
@@ -795,13 +835,11 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
remapInstructionsInBlocks(NewBlocks, VMap);
if (DT) {
- // Latches of the cloned loops dominate over the loop exit, so idom of the
- // latter is the first cloned loop body, as original PreHeader dominates
- // the original loop body.
+ // Update IDoms of the blocks reachable through exits.
if (Iter == 0)
- for (auto Exit : ExitIDom)
- DT->changeImmediateDominator(Exit.first,
- cast<BasicBlock>(LVMap[Exit.second]));
+ for (auto BBIDom : NonLoopBlocksIDom)
+ DT->changeImmediateDominator(BBIDom.first,
+ cast<BasicBlock>(LVMap[BBIDom.second]));
#ifdef EXPENSIVE_CHECKS
assert(DT->verify(DominatorTree::VerificationLevel::Fast));
#endif
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index ff7905bed91d..c66fd7bb0588 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -103,6 +103,7 @@ static void InsertNewValueIntoMap(ValueToValueMapTy &VM, Value *K, Value *V) {
static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
BasicBlock *OrigPreheader,
ValueToValueMapTy &ValueMap,
+ ScalarEvolution *SE,
SmallVectorImpl<PHINode*> *InsertedPHIs) {
// Remove PHI node entries that are no longer live.
BasicBlock::iterator I, E = OrigHeader->end();
@@ -125,19 +126,15 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
// The value now exits in two versions: the initial value in the preheader
// and the loop "next" value in the original header.
SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
+ // Force re-computation of OrigHeaderVal, as some users now need to use the
+ // new PHI node.
+ if (SE)
+ SE->forgetValue(OrigHeaderVal);
SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
// Visit each use of the OrigHeader instruction.
- for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
- UE = OrigHeaderVal->use_end();
- UI != UE;) {
- // Grab the use before incrementing the iterator.
- Use &U = *UI;
-
- // Increment the iterator before removing the use from the list.
- ++UI;
-
+ for (Use &U : llvm::make_early_inc_range(OrigHeaderVal->uses())) {
// SSAUpdater can't handle a non-PHI use in the same block as an
// earlier def. We can easily handle those cases manually.
Instruction *UserInst = cast<Instruction>(U.getUser());
@@ -399,9 +396,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
D->getExpression()};
};
SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;
- for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend();
- I != E; ++I) {
- if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&*I))
+ for (Instruction &I : llvm::drop_begin(llvm::reverse(*OrigPreheader))) {
+ if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I))
DbgIntrinsics.insert(makeHash(DII));
else
break;
@@ -563,7 +559,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
SmallVector<PHINode*, 2> InsertedPHIs;
// If there were any uses of instructions in the duplicated block outside the
// loop, update them, inserting PHI nodes as required
- RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
+ RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE,
&InsertedPHIs);
// Attach dbg.value intrinsics to the new phis if that phi uses a value that
@@ -621,7 +617,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// one predecessor. Note that Exit could be an exit block for multiple
// nested loops, causing both of the edges to now be critical and need to
// be split.
- SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
+ SmallVector<BasicBlock *, 4> ExitPreds(predecessors(Exit));
bool SplitLatchEdge = false;
for (BasicBlock *ExitPred : ExitPreds) {
// We only need to split loop exit edges.
diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
index d2fd32c98d73..d14c006c8032 100644
--- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -779,8 +779,7 @@ namespace {
AU.addPreserved<DependenceAnalysisWrapperPass>();
AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added.
AU.addPreserved<BranchProbabilityInfoWrapperPass>();
- if (EnableMSSALoopDependency)
- AU.addPreserved<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
}
/// verifyAnalysis() - Verify LoopSimplifyForm's guarantees.
@@ -814,12 +813,10 @@ bool LoopSimplify::runOnFunction(Function &F) {
&getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
MemorySSA *MSSA = nullptr;
std::unique_ptr<MemorySSAUpdater> MSSAU;
- if (EnableMSSALoopDependency) {
- auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
- if (MSSAAnalysis) {
- MSSA = &MSSAAnalysis->getMSSA();
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
- }
+ auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ if (MSSAAnalysis) {
+ MSSA = &MSSAAnalysis->getMSSA();
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
}
bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index a91bf7b7af13..b0c622b98d5e 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -224,13 +224,12 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
SmallVector<WeakTrackingVH, 16> DeadInsts;
for (BasicBlock *BB : L->getBlocks()) {
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
- Instruction *Inst = &*I++;
- if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC}))
- if (LI->replacementPreservesLCSSAForm(Inst, V))
- Inst->replaceAllUsesWith(V);
- if (isInstructionTriviallyDead(Inst))
- DeadInsts.emplace_back(Inst);
+ for (Instruction &Inst : llvm::make_early_inc_range(*BB)) {
+ if (Value *V = SimplifyInstruction(&Inst, {DL, nullptr, DT, AC}))
+ if (LI->replacementPreservesLCSSAForm(&Inst, V))
+ Inst.replaceAllUsesWith(V);
+ if (isInstructionTriviallyDead(&Inst))
+ DeadInsts.emplace_back(&Inst);
}
// We can't do recursive deletion until we're done iterating, as we might
// have a phi which (potentially indirectly) uses instructions later in
@@ -515,6 +514,10 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes;
identifyNoAliasScopesToClone(L->getBlocks(), LoopLocalNoAliasDeclScopes);
+ // We place the unrolled iterations immediately after the original loop
+ // latch. This is a reasonable default placement if we don't have block
+ // frequencies; if we do, the layout will be adjusted later.
+ auto BlockInsertPt = std::next(LatchBlock->getIterator());
for (unsigned It = 1; It != ULO.Count; ++It) {
SmallVector<BasicBlock *, 8> NewBlocks;
SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
@@ -523,7 +526,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
ValueToValueMapTy VMap;
BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
- Header->getParent()->getBasicBlockList().push_back(New);
+ Header->getParent()->getBasicBlockList().insert(BlockInsertPt, New);
assert((*BB != Header || LI->getLoopFor(*BB) == L) &&
"Header should not be in a sub-loop");
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 6749d3db743c..a92cb6a313d3 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/BasicBlock.h"
@@ -35,6 +36,7 @@
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
@@ -167,8 +169,11 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
// Add the branch to the exit block (around the unrolled loop)
B.CreateCondBr(BrLoopExit, OriginalLoopLatchExit, NewPreHeader);
InsertPt->eraseFromParent();
- if (DT)
- DT->changeImmediateDominator(OriginalLoopLatchExit, PrologExit);
+ if (DT) {
+ auto *NewDom = DT->findNearestCommonDominator(OriginalLoopLatchExit,
+ PrologExit);
+ DT->changeImmediateDominator(OriginalLoopLatchExit, NewDom);
+ }
}
/// Connect the unrolling epilog code to the original loop.
@@ -215,7 +220,10 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
// PN = PHI [I, Latch]
// ...
// Exit:
- // EpilogPN = PHI [PN, EpilogPreHeader]
+ // EpilogPN = PHI [PN, EpilogPreHeader], [X, Exit2], [Y, Exit2.epil]
+ //
+ // Exits from non-latch blocks point to the original exit block and the
+ // epilogue edges have already been added.
//
// There is EpilogPreHeader incoming block instead of NewExit as
// NewExit was spilt 1 more time to get EpilogPreHeader.
@@ -282,8 +290,10 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
// Add the branch to the exit block (around the unrolling loop)
B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit);
InsertPt->eraseFromParent();
- if (DT)
- DT->changeImmediateDominator(Exit, NewExit);
+ if (DT) {
+ auto *NewDom = DT->findNearestCommonDominator(Exit, NewExit);
+ DT->changeImmediateDominator(Exit, NewDom);
+ }
// Split the main loop exit to maintain canonicalization guarantees.
SmallVector<BasicBlock*, 4> NewExitPreds{Latch};
@@ -291,17 +301,15 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
PreserveLCSSA);
}
-/// Create a clone of the blocks in a loop and connect them together.
-/// If CreateRemainderLoop is false, loop structure will not be cloned,
-/// otherwise a new loop will be created including all cloned blocks, and the
-/// iterator of it switches to count NewIter down to 0.
+/// Create a clone of the blocks in a loop and connect them together. A new
+/// loop will be created including all cloned blocks, and the iterator of the
+/// new loop switched to count NewIter down to 0.
/// The cloned blocks should be inserted between InsertTop and InsertBot.
-/// If loop structure is cloned InsertTop should be new preheader, InsertBot
-/// new loop exit.
-/// Return the new cloned loop that is created when CreateRemainderLoop is true.
+/// InsertTop should be new preheader, InsertBot new loop exit.
+/// Returns the new cloned loop that is created.
static Loop *
-CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
- const bool UseEpilogRemainder, const bool UnrollRemainder,
+CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,
+ const bool UnrollRemainder,
BasicBlock *InsertTop,
BasicBlock *InsertBot, BasicBlock *Preheader,
std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
@@ -315,8 +323,6 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
Loop *ParentLoop = L->getParentLoop();
NewLoopsMap NewLoops;
NewLoops[ParentLoop] = ParentLoop;
- if (!CreateRemainderLoop)
- NewLoops[L] = ParentLoop;
// For each block in the original loop, create a new copy,
// and update the value map with the newly created values.
@@ -324,11 +330,7 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
NewBlocks.push_back(NewBB);
- // If we're unrolling the outermost loop, there's no remainder loop,
- // and this block isn't in a nested loop, then the new block is not
- // in any loop. Otherwise, add it to loopinfo.
- if (CreateRemainderLoop || LI->getLoopFor(*BB) != L || ParentLoop)
- addClonedBlockToLoopInfo(*BB, NewBB, LI, NewLoops);
+ addClonedBlockToLoopInfo(*BB, NewBB, LI, NewLoops);
VMap[*BB] = NewBB;
if (Header == *BB) {
@@ -349,27 +351,24 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
}
if (Latch == *BB) {
- // For the last block, if CreateRemainderLoop is false, create a direct
- // jump to InsertBot. If not, create a loop back to cloned head.
+ // For the last block, create a loop back to cloned head.
VMap.erase((*BB)->getTerminator());
+ // Use an incrementing IV. Pre-incr/post-incr is backedge/trip count.
+ // Subtle: NewIter can be 0 if we wrapped when computing the trip count,
+ // thus we must compare the post-increment (wrapping) value.
BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
IRBuilder<> Builder(LatchBR);
- if (!CreateRemainderLoop) {
- Builder.CreateBr(InsertBot);
- } else {
- PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2,
- suffix + ".iter",
- FirstLoopBB->getFirstNonPHI());
- Value *IdxSub =
- Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
- NewIdx->getName() + ".sub");
- Value *IdxCmp =
- Builder.CreateIsNotNull(IdxSub, NewIdx->getName() + ".cmp");
- Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot);
- NewIdx->addIncoming(NewIter, InsertTop);
- NewIdx->addIncoming(IdxSub, NewBB);
- }
+ PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2,
+ suffix + ".iter",
+ FirstLoopBB->getFirstNonPHI());
+ auto *Zero = ConstantInt::get(NewIdx->getType(), 0);
+ auto *One = ConstantInt::get(NewIdx->getType(), 1);
+ Value *IdxNext = Builder.CreateAdd(NewIdx, One, NewIdx->getName() + ".next");
+ Value *IdxCmp = Builder.CreateICmpNE(IdxNext, NewIter, NewIdx->getName() + ".cmp");
+ Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot);
+ NewIdx->addIncoming(Zero, InsertTop);
+ NewIdx->addIncoming(IdxNext, NewBB);
LatchBR->eraseFromParent();
}
}
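A standalone sketch (plain C++, 8-bit counters for brevity) of the IV change above: an incrementing IV compared post-increment runs the same number of iterations as the old decrementing IV, including the wrapped case the comment warns about, where NewIter is 0 and the real trip count is 2^W. The same pattern is used again for the main loop counter later in UnrollRuntimeLoopRemainder.

#include <cassert>
#include <cstdint>

unsigned runCountDown(uint8_t NewIter) {
  unsigned Body = 0;
  uint8_t Idx = NewIter;
  do { ++Body; } while (uint8_t(--Idx) != 0);        // old form: count down
  return Body;
}

unsigned runCountUp(uint8_t NewIter) {
  unsigned Body = 0;
  uint8_t Idx = 0;
  do { ++Body; } while (uint8_t(++Idx) != NewIter);  // new form: compare the
  return Body;                                       // post-increment value
}

int main() {
  assert(runCountDown(3) == 3 && runCountUp(3) == 3);
  // Trip count 256 wraps to 0 in 8 bits; both forms still run 256 iterations.
  assert(runCountDown(0) == 256 && runCountUp(0) == 256);
  return 0;
}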
@@ -378,99 +377,45 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
// cloned loop.
for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
- if (!CreateRemainderLoop) {
- if (UseEpilogRemainder) {
- unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
- NewPHI->setIncomingBlock(idx, InsertTop);
- NewPHI->removeIncomingValue(Latch, false);
- } else {
- VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader);
- cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
- }
- } else {
- unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
- NewPHI->setIncomingBlock(idx, InsertTop);
- BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
- idx = NewPHI->getBasicBlockIndex(Latch);
- Value *InVal = NewPHI->getIncomingValue(idx);
- NewPHI->setIncomingBlock(idx, NewLatch);
- if (Value *V = VMap.lookup(InVal))
- NewPHI->setIncomingValue(idx, V);
- }
- }
- if (CreateRemainderLoop) {
- Loop *NewLoop = NewLoops[L];
- assert(NewLoop && "L should have been cloned");
- MDNode *LoopID = NewLoop->getLoopID();
-
- // Only add loop metadata if the loop is not going to be completely
- // unrolled.
- if (UnrollRemainder)
- return NewLoop;
-
- Optional<MDNode *> NewLoopID = makeFollowupLoopID(
- LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder});
- if (NewLoopID.hasValue()) {
- NewLoop->setLoopID(NewLoopID.getValue());
-
- // Do not setLoopAlreadyUnrolled if loop attributes have been defined
- // explicitly.
- return NewLoop;
- }
-
- // Add unroll disable metadata to disable future unrolling for this loop.
- NewLoop->setLoopAlreadyUnrolled();
- return NewLoop;
+ unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
+ NewPHI->setIncomingBlock(idx, InsertTop);
+ BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
+ idx = NewPHI->getBasicBlockIndex(Latch);
+ Value *InVal = NewPHI->getIncomingValue(idx);
+ NewPHI->setIncomingBlock(idx, NewLatch);
+ if (Value *V = VMap.lookup(InVal))
+ NewPHI->setIncomingValue(idx, V);
}
- else
- return nullptr;
-}
-/// Returns true if we can safely unroll a multi-exit/exiting loop. OtherExits
-/// is populated with all the loop exit blocks other than the LatchExit block.
-static bool canSafelyUnrollMultiExitLoop(Loop *L, BasicBlock *LatchExit,
- bool PreserveLCSSA,
- bool UseEpilogRemainder) {
+ Loop *NewLoop = NewLoops[L];
+ assert(NewLoop && "L should have been cloned");
+ MDNode *LoopID = NewLoop->getLoopID();
- // We currently have some correctness constrains in unrolling a multi-exit
- // loop. Check for these below.
+ // Only add loop metadata if the loop is not going to be completely
+ // unrolled.
+ if (UnrollRemainder)
+ return NewLoop;
- // We rely on LCSSA form being preserved when the exit blocks are transformed.
- if (!PreserveLCSSA)
- return false;
+ Optional<MDNode *> NewLoopID = makeFollowupLoopID(
+ LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder});
+ if (NewLoopID.hasValue()) {
+ NewLoop->setLoopID(NewLoopID.getValue());
- // TODO: Support multiple exiting blocks jumping to the `LatchExit` when
- // UnrollRuntimeMultiExit is true. This will need updating the logic in
- // connectEpilog/connectProlog.
- if (!LatchExit->getSinglePredecessor()) {
- LLVM_DEBUG(
- dbgs() << "Bailout for multi-exit handling when latch exit has >1 "
- "predecessor.\n");
- return false;
+ // Do not setLoopAlreadyUnrolled if loop attributes have been defined
+ // explicitly.
+ return NewLoop;
}
- // FIXME: We bail out of multi-exit unrolling when epilog loop is generated
- // and L is an inner loop. This is because in presence of multiple exits, the
- // outer loop is incorrect: we do not add the EpilogPreheader and exit to the
- // outer loop. This is automatically handled in the prolog case, so we do not
- // have that bug in prolog generation.
- if (UseEpilogRemainder && L->getParentLoop())
- return false;
- // All constraints have been satisfied.
- return true;
+ // Add unroll disable metadata to disable future unrolling for this loop.
+ NewLoop->setLoopAlreadyUnrolled();
+ return NewLoop;
}
/// Returns true if we can profitably unroll the multi-exit loop L. Currently,
/// we return true only if UnrollRuntimeMultiExit is set to true.
static bool canProfitablyUnrollMultiExitLoop(
Loop *L, SmallVectorImpl<BasicBlock *> &OtherExits, BasicBlock *LatchExit,
- bool PreserveLCSSA, bool UseEpilogRemainder) {
-
-#if !defined(NDEBUG)
- assert(canSafelyUnrollMultiExitLoop(L, LatchExit, PreserveLCSSA,
- UseEpilogRemainder) &&
- "Should be safe to unroll before checking profitability!");
-#endif
+ bool UseEpilogRemainder) {
// Priority goes to UnrollRuntimeMultiExit if it's supplied.
if (UnrollRuntimeMultiExit.getNumOccurrences())
@@ -523,24 +468,56 @@ static void updateLatchBranchWeightsForRemainderLoop(Loop *OrigLoop,
uint64_t TrueWeight, FalseWeight;
BranchInst *LatchBR =
cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
- if (LatchBR->extractProfMetadata(TrueWeight, FalseWeight)) {
- uint64_t ExitWeight = LatchBR->getSuccessor(0) == OrigLoop->getHeader()
- ? FalseWeight
- : TrueWeight;
- assert(UnrollFactor > 1);
- uint64_t BackEdgeWeight = (UnrollFactor - 1) * ExitWeight;
- BasicBlock *Header = RemainderLoop->getHeader();
- BasicBlock *Latch = RemainderLoop->getLoopLatch();
- auto *RemainderLatchBR = cast<BranchInst>(Latch->getTerminator());
- unsigned HeaderIdx = (RemainderLatchBR->getSuccessor(0) == Header ? 0 : 1);
- MDBuilder MDB(RemainderLatchBR->getContext());
- MDNode *WeightNode =
- HeaderIdx ? MDB.createBranchWeights(ExitWeight, BackEdgeWeight)
- : MDB.createBranchWeights(BackEdgeWeight, ExitWeight);
- RemainderLatchBR->setMetadata(LLVMContext::MD_prof, WeightNode);
- }
+ if (!LatchBR->extractProfMetadata(TrueWeight, FalseWeight))
+ return;
+ uint64_t ExitWeight = LatchBR->getSuccessor(0) == OrigLoop->getHeader()
+ ? FalseWeight
+ : TrueWeight;
+ assert(UnrollFactor > 1);
+ uint64_t BackEdgeWeight = (UnrollFactor - 1) * ExitWeight;
+ BasicBlock *Header = RemainderLoop->getHeader();
+ BasicBlock *Latch = RemainderLoop->getLoopLatch();
+ auto *RemainderLatchBR = cast<BranchInst>(Latch->getTerminator());
+ unsigned HeaderIdx = (RemainderLatchBR->getSuccessor(0) == Header ? 0 : 1);
+ MDBuilder MDB(RemainderLatchBR->getContext());
+ MDNode *WeightNode =
+ HeaderIdx ? MDB.createBranchWeights(ExitWeight, BackEdgeWeight)
+ : MDB.createBranchWeights(BackEdgeWeight, ExitWeight);
+ RemainderLatchBR->setMetadata(LLVMContext::MD_prof, WeightNode);
}
+/// Calculate ModVal = (BECount + 1) % Count on the abstract integer domain
+/// accounting for the possibility of unsigned overflow in the 2s complement
+/// domain. Preconditions:
+/// 1) TripCount = BECount + 1 (allowing overflow)
+/// 2) Log2(Count) <= BitWidth(BECount)
+static Value *CreateTripRemainder(IRBuilder<> &B, Value *BECount,
+ Value *TripCount, unsigned Count) {
+ // Note that TripCount is BECount + 1.
+ if (isPowerOf2_32(Count))
+ // If the expression is zero, then either:
+ // 1. There are no iterations to be run in the prolog/epilog loop.
+ // OR
+ // 2. The addition computing TripCount overflowed.
+ //
+ // If (2) is true, we know that TripCount really is (1 << BEWidth) and so
+ // the number of iterations that remain to be run in the original loop is a
+ // multiple of Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (a
+ // precondition of this method).
+ return B.CreateAnd(TripCount, Count - 1, "xtraiter");
+
+ // As (BECount + 1) can potentially overflow in unsigned arithmetic, we
+ // compute (BECount % Count) + 1, which is overflow safe as BECount % Count < Count.
+ Constant *CountC = ConstantInt::get(BECount->getType(), Count);
+ Value *ModValTmp = B.CreateURem(BECount, CountC);
+ Value *ModValAdd = B.CreateAdd(ModValTmp,
+ ConstantInt::get(ModValTmp->getType(), 1));
+ // At that point (BECount % Count) + 1 could be equal to Count.
+ // To handle this case we need to take mod by Count one more time.
+ return B.CreateURem(ModValAdd, CountC, "xtraiter");
+}
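A standalone sketch (plain C++) of the arithmetic CreateTripRemainder now centralizes, using 32-bit unsigned wrap to model the IR behaviour: the power-of-two path masks the (possibly wrapped) trip count, and the general path works on BECount and reduces once more because (BECount % Count) + 1 can equal Count.

#include <cassert>
#include <cstdint>

uint32_t tripRemainder(uint32_t BECount, uint32_t Count) {
  uint32_t TripCount = BECount + 1;              // may wrap to 0
  if ((Count & (Count - 1)) == 0)                // isPowerOf2_32(Count)
    return TripCount & (Count - 1);              // xtraiter
  uint32_t ModValAdd = BECount % Count + 1;      // overflow safe: <= Count
  return ModValAdd % Count;                      // handle ModValAdd == Count
}

int main() {
  assert(tripRemainder(9, 4) == 2);              // 10 iterations, unroll by 4
  assert(tripRemainder(10, 3) == 2);             // 11 iterations, unroll by 3
  assert(tripRemainder(0xffffffffu, 4) == 0);    // TripCount wrapped to 0
  assert(tripRemainder(0xffffffffu, 3) == 1);    // (2^32) % 3 == 1
  return 0;
}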
+
+
/// Insert code in the prolog/epilog code when unrolling a loop with a
/// run-time trip-count.
///
@@ -624,19 +601,22 @@ bool llvm::UnrollRuntimeLoopRemainder(
// These are exit blocks other than the target of the latch exiting block.
SmallVector<BasicBlock *, 4> OtherExits;
L->getUniqueNonLatchExitBlocks(OtherExits);
- bool isMultiExitUnrollingEnabled =
- canSafelyUnrollMultiExitLoop(L, LatchExit, PreserveLCSSA,
- UseEpilogRemainder) &&
- canProfitablyUnrollMultiExitLoop(L, OtherExits, LatchExit, PreserveLCSSA,
- UseEpilogRemainder);
- // Support only single exit and exiting block unless multi-exit loop unrolling is enabled.
- if (!isMultiExitUnrollingEnabled &&
- (!L->getExitingBlock() || OtherExits.size())) {
- LLVM_DEBUG(
- dbgs()
- << "Multiple exit/exiting blocks in loop and multi-exit unrolling not "
- "enabled!\n");
- return false;
+ // Support only single exit and exiting block unless multi-exit loop
+ // unrolling is enabled.
+ if (!L->getExitingBlock() || OtherExits.size()) {
+ // We rely on LCSSA form being preserved when the exit blocks are transformed.
+ // (Note that only an off-by-default mode of the old PM disables PreserveLCSSA.)
+ if (!PreserveLCSSA)
+ return false;
+
+ if (!canProfitablyUnrollMultiExitLoop(L, OtherExits, LatchExit,
+ UseEpilogRemainder)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Multiple exit/exiting blocks in loop and multi-exit unrolling not "
+ "enabled!\n");
+ return false;
+ }
}
// Use Scalar Evolution to compute the trip count. This allows more loops to
// be unrolled than relying on induction var simplification.
@@ -659,6 +639,7 @@ bool llvm::UnrollRuntimeLoopRemainder(
unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();
// Add 1 since the backedge count doesn't include the first loop iteration.
+ // (Note that overflow can occur, this is handled explicitly below)
const SCEV *TripCountSC =
SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
if (isa<SCEVCouldNotCompute>(TripCountSC)) {
@@ -706,8 +687,7 @@ bool llvm::UnrollRuntimeLoopRemainder(
NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI);
NewPreHeader->setName(PreHeader->getName() + ".new");
// Split LatchExit to create phi nodes from branch above.
- SmallVector<BasicBlock*, 4> Preds(predecessors(LatchExit));
- NewExit = SplitBlockPredecessors(LatchExit, Preds, ".unr-lcssa", DT, LI,
+ NewExit = SplitBlockPredecessors(LatchExit, {Latch}, ".unr-lcssa", DT, LI,
nullptr, PreserveLCSSA);
// NewExit gets its DebugLoc from LatchExit, which is not part of the
// original Loop.
@@ -717,6 +697,21 @@ bool llvm::UnrollRuntimeLoopRemainder(
// Split NewExit to insert epilog remainder loop.
EpilogPreHeader = SplitBlock(NewExit, NewExitTerminator, DT, LI);
EpilogPreHeader->setName(Header->getName() + ".epil.preheader");
+
+ // If the latch exits from multiple levels of nested loops, then
+ // by assumption there must be another loop exit which branches to the
+ // outer loop and we must adjust the loop for the newly inserted blocks
+ // to account for the fact that our epilogue is still in the same outer
+ // loop. Note that this leaves loopinfo temporarily out of sync with the
+ // CFG until the actual epilogue loop is inserted.
+ if (auto *ParentL = L->getParentLoop())
+ if (LI->getLoopFor(LatchExit) != ParentL) {
+ LI->removeBlock(NewExit);
+ ParentL->addBasicBlockToLoop(NewExit, *LI);
+ LI->removeBlock(EpilogPreHeader);
+ ParentL->addBasicBlockToLoop(EpilogPreHeader, *LI);
+ }
+
} else {
// If prolog remainder
// Split the original preheader twice to insert prolog remainder loop
@@ -751,35 +746,8 @@ bool llvm::UnrollRuntimeLoopRemainder(
Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
PreHeaderBR);
IRBuilder<> B(PreHeaderBR);
- Value *ModVal;
- // Calculate ModVal = (BECount + 1) % Count.
- // Note that TripCount is BECount + 1.
- if (isPowerOf2_32(Count)) {
- // When Count is power of 2 we don't BECount for epilog case, however we'll
- // need it for a branch around unrolling loop for prolog case.
- ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
- // 1. There are no iterations to be run in the prolog/epilog loop.
- // OR
- // 2. The addition computing TripCount overflowed.
- //
- // If (2) is true, we know that TripCount really is (1 << BEWidth) and so
- // the number of iterations that remain to be run in the original loop is a
- // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we
- // explicitly check this above).
- } else {
- // As (BECount + 1) can potentially unsigned overflow we count
- // (BECount % Count) + 1 which is overflow safe as BECount % Count < Count.
- Value *ModValTmp = B.CreateURem(BECount,
- ConstantInt::get(BECount->getType(),
- Count));
- Value *ModValAdd = B.CreateAdd(ModValTmp,
- ConstantInt::get(ModValTmp->getType(), 1));
- // At that point (BECount % Count) + 1 could be equal to Count.
- // To handle this case we need to take mod by Count one more time.
- ModVal = B.CreateURem(ModValAdd,
- ConstantInt::get(BECount->getType(), Count),
- "xtraiter");
- }
+ Value * const ModVal = CreateTripRemainder(B, BECount, TripCount, Count);
+
Value *BranchVal =
UseEpilogRemainder ? B.CreateICmpULT(BECount,
ConstantInt::get(BECount->getType(),
@@ -810,18 +778,13 @@ bool llvm::UnrollRuntimeLoopRemainder(
std::vector<BasicBlock *> NewBlocks;
ValueToValueMapTy VMap;
- // For unroll factor 2 remainder loop will have 1 iterations.
- // Do not create 1 iteration loop.
- bool CreateRemainderLoop = (Count != 2);
-
// Clone all the basic blocks in the loop. If Count is 2, we don't clone
// the loop, otherwise we create a cloned loop to execute the extra
// iterations. This function adds the appropriate CFG connections.
BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
Loop *remainderLoop = CloneLoopBlocks(
- L, ModVal, CreateRemainderLoop, UseEpilogRemainder, UnrollRemainder,
- InsertTop, InsertBot,
+ L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, InsertBot,
NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI);
// Assign the maximum possible trip count as the back edge weight for the
@@ -840,36 +803,33 @@ bool llvm::UnrollRuntimeLoopRemainder(
// work is to update the phi nodes in the original loop, and take in the
// values from the cloned region.
for (auto *BB : OtherExits) {
- for (auto &II : *BB) {
-
- // Given we preserve LCSSA form, we know that the values used outside the
- // loop will be used through these phi nodes at the exit blocks that are
- // transformed below.
- if (!isa<PHINode>(II))
- break;
- PHINode *Phi = cast<PHINode>(&II);
- unsigned oldNumOperands = Phi->getNumIncomingValues();
+ // Given we preserve LCSSA form, we know that the values used outside the
+ // loop will be used through these phi nodes at the exit blocks that are
+ // transformed below.
+ for (PHINode &PN : BB->phis()) {
+ unsigned oldNumOperands = PN.getNumIncomingValues();
// Add the incoming values from the remainder code to the end of the phi
// node.
- for (unsigned i =0; i < oldNumOperands; i++){
- Value *newVal = VMap.lookup(Phi->getIncomingValue(i));
- // newVal can be a constant or derived from values outside the loop, and
- // hence need not have a VMap value. Also, since lookup already generated
- // a default "null" VMap entry for this value, we need to populate that
- // VMap entry correctly, with the mapped entry being itself.
- if (!newVal) {
- newVal = Phi->getIncomingValue(i);
- VMap[Phi->getIncomingValue(i)] = Phi->getIncomingValue(i);
- }
- Phi->addIncoming(newVal,
- cast<BasicBlock>(VMap[Phi->getIncomingBlock(i)]));
+ for (unsigned i = 0; i < oldNumOperands; i++) {
+ auto *PredBB = PN.getIncomingBlock(i);
+ if (PredBB == Latch)
+ // The latch exit is handled separately, see connectX
+ continue;
+ if (!L->contains(PredBB))
+ // Even if we had dedicated exits, the code above inserted an
+ // extra branch which can reach the latch exit.
+ continue;
+
+ auto *V = PN.getIncomingValue(i);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ if (L->contains(I))
+ V = VMap.lookup(I);
+ PN.addIncoming(V, cast<BasicBlock>(VMap[PredBB]));
}
}
#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
for (BasicBlock *SuccBB : successors(BB)) {
- assert(!(any_of(OtherExits,
- [SuccBB](BasicBlock *EB) { return EB == SuccBB; }) ||
- SuccBB == LatchExit) &&
+ assert(!(llvm::is_contained(OtherExits, SuccBB) || SuccBB == LatchExit) &&
"Breaks the definition of dedicated exits!");
}
#endif
@@ -931,23 +891,22 @@ bool llvm::UnrollRuntimeLoopRemainder(
PreserveLCSSA);
// Update counter in loop for unrolling.
- // I should be multiply of Count.
+ // Use an incrementing IV. Pre-incr/post-incr is backedge/trip count.
+ // Subtle: TestVal can be 0 if we wrapped when computing the trip count,
+ // thus we must compare the post-increment (wrapping) value.
IRBuilder<> B2(NewPreHeader->getTerminator());
Value *TestVal = B2.CreateSub(TripCount, ModVal, "unroll_iter");
BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
- B2.SetInsertPoint(LatchBR);
PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter",
Header->getFirstNonPHI());
- Value *IdxSub =
- B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
- NewIdx->getName() + ".nsub");
- Value *IdxCmp;
- if (LatchBR->getSuccessor(0) == Header)
- IdxCmp = B2.CreateIsNotNull(IdxSub, NewIdx->getName() + ".ncmp");
- else
- IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp");
- NewIdx->addIncoming(TestVal, NewPreHeader);
- NewIdx->addIncoming(IdxSub, Latch);
+ B2.SetInsertPoint(LatchBR);
+ auto *Zero = ConstantInt::get(NewIdx->getType(), 0);
+ auto *One = ConstantInt::get(NewIdx->getType(), 1);
+ Value *IdxNext = B2.CreateAdd(NewIdx, One, NewIdx->getName() + ".next");
+ auto Pred = LatchBR->getSuccessor(0) == Header ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;
+ Value *IdxCmp = B2.CreateICmp(Pred, IdxNext, TestVal, NewIdx->getName() + ".ncmp");
+ NewIdx->addIncoming(Zero, NewPreHeader);
+ NewIdx->addIncoming(IdxNext, Latch);
LatchBR->setCondition(IdxCmp);
} else {
// Connect the prolog code to the original loop and update the
@@ -960,12 +919,49 @@ bool llvm::UnrollRuntimeLoopRemainder(
// of its parent loops, so the Scalar Evolution pass needs to be run again.
SE->forgetTopmostLoop(L);
- // Verify that the Dom Tree is correct.
+ // Verify that the Dom Tree and Loop Info are correct.
#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
- if (DT)
+ if (DT) {
assert(DT->verify(DominatorTree::VerificationLevel::Full));
+ LI->verify(*DT);
+ }
#endif
+ // For unroll factor 2 remainder loop will have 1 iteration.
+ if (Count == 2 && DT && LI && SE) {
+ // TODO: This code could probably be pulled out into a helper function
+ // (e.g. breakLoopBackedgeAndSimplify) and reused in loop-deletion.
+ BasicBlock *RemainderLatch = remainderLoop->getLoopLatch();
+ assert(RemainderLatch);
+ SmallVector<BasicBlock*> RemainderBlocks(remainderLoop->getBlocks().begin(),
+ remainderLoop->getBlocks().end());
+ breakLoopBackedge(remainderLoop, *DT, *SE, *LI, nullptr);
+ remainderLoop = nullptr;
+
+ // Simplify loop values after breaking the backedge
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+ for (BasicBlock *BB : RemainderBlocks) {
+ for (Instruction &Inst : llvm::make_early_inc_range(*BB)) {
+ if (Value *V = SimplifyInstruction(&Inst, {DL, nullptr, DT, AC}))
+ if (LI->replacementPreservesLCSSAForm(&Inst, V))
+ Inst.replaceAllUsesWith(V);
+ if (isInstructionTriviallyDead(&Inst))
+ DeadInsts.emplace_back(&Inst);
+ }
+ // We can't do recursive deletion until we're done iterating, as we might
+ // have a phi which (potentially indirectly) uses instructions later in
+ // the block we're iterating through.
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
+ }
+
+ // Merge latch into exit block.
+ auto *ExitBB = RemainderLatch->getSingleSuccessor();
+ assert(ExitBB && "required after breaking cond br backedge");
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ MergeBlockIntoPredecessor(ExitBB, &DTU, LI);
+ }
+
// Canonicalize to LoopSimplifyForm both original and remainder loops. We
// cannot rely on the LoopUnrollPass to do this because it only does
// canonicalization for parent/subloops and not the sibling loops.
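
The comment about comparing the post-increment IV against a possibly wrapped TestVal is easiest to see with a small standalone model. This is only a sketch with made-up names, not LLVM code: it mimics the %niter / %niter.next / unroll_iter pattern emitted above and shows why an exit test on the post-increment value still terminates correctly when the computed iteration count wraps to 0.

#include <cassert>
#include <cstdint>

// Models the remainder-loop exit test: %niter starts at 0, %niter.next is the
// post-increment value, and the loop exits when %niter.next == unroll_iter.
uint32_t iterationsExecuted(uint32_t UnrollIter) {
  uint32_t NIter = 0;
  uint32_t Executed = 0;
  do {
    ++Executed;
    uint32_t Next = NIter + 1;  // may wrap, just like the IR add
    if (Next == UnrollIter)     // post-increment compare ("niter.ncmp")
      break;
    NIter = Next;
  } while (true);
  return Executed;
}

int main() {
  assert(iterationsExecuted(3) == 3);
  // UnrollIter == 0 models the wrapped trip count: the compare only fires once
  // the counter itself wraps back to 0, i.e. after the full 2^32 iterations,
  // rather than mis-exiting on the first iteration.
  return 0;
}
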
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index e4d78f9ada08..f0f079335683 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -612,10 +612,7 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
for (auto *Block : L->blocks())
for (Instruction &I : *Block) {
auto *Undef = UndefValue::get(I.getType());
- for (Value::use_iterator UI = I.use_begin(), E = I.use_end();
- UI != E;) {
- Use &U = *UI;
- ++UI;
+ for (Use &U : llvm::make_early_inc_range(I.uses())) {
if (auto *Usr = dyn_cast<Instruction>(U.getUser()))
if (L->contains(Usr->getParent()))
continue;
@@ -710,21 +707,58 @@ void llvm::breakLoopBackedge(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
SE.forgetLoop(L);
- // Note: By splitting the backedge, and then explicitly making it unreachable
- // we gracefully handle corner cases such as non-bottom tested loops and the
- // like. We also have the benefit of being able to reuse existing well tested
- // code. It might be worth special casing the common bottom tested case at
- // some point to avoid code churn.
-
std::unique_ptr<MemorySSAUpdater> MSSAU;
if (MSSA)
MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
- auto *BackedgeBB = SplitEdge(Latch, Header, &DT, &LI, MSSAU.get());
+ // Update the CFG and domtree. We chose to special-case a couple of
+ // common cases for code quality and test readability reasons.
+ [&]() -> void {
+ if (auto *BI = dyn_cast<BranchInst>(Latch->getTerminator())) {
+ if (!BI->isConditional()) {
+ DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Eager);
+ (void)changeToUnreachable(BI, /*PreserveLCSSA*/ true, &DTU,
+ MSSAU.get());
+ return;
+ }
+
+ // Conditional latch/exit - note that the latch can be shared by an inner
+ // and an outer loop, so the other target doesn't need to be an exit.
+ if (L->isLoopExiting(Latch)) {
+ // TODO: Generalize ConstantFoldTerminator so that it can be used
+ // here without invalidating LCSSA or MemorySSA. (Tricky case for
+ // LCSSA: header is an exit block of a preceding sibling loop w/o
+ // dedicated exits.)
+ const unsigned ExitIdx = L->contains(BI->getSuccessor(0)) ? 1 : 0;
+ BasicBlock *ExitBB = BI->getSuccessor(ExitIdx);
+
+ DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Eager);
+ Header->removePredecessor(Latch, true);
+
+ IRBuilder<> Builder(BI);
+ auto *NewBI = Builder.CreateBr(ExitBB);
+ // Transfer the metadata to the new branch instruction (minus the
+ // loop info since this is no longer a loop)
+ NewBI->copyMetadata(*BI, {LLVMContext::MD_dbg,
+ LLVMContext::MD_annotation});
+
+ BI->eraseFromParent();
+ DTU.applyUpdates({{DominatorTree::Delete, Latch, Header}});
+ if (MSSA)
+ MSSAU->applyUpdates({{DominatorTree::Delete, Latch, Header}}, DT);
+ return;
+ }
+ }
- DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Eager);
- (void)changeToUnreachable(BackedgeBB->getTerminator(),
- /*PreserveLCSSA*/ true, &DTU, MSSAU.get());
+ // General case. By splitting the backedge and then explicitly making it
+ // unreachable, we gracefully handle corner cases such as switch and invoke
+ // terminators.
+ auto *BackedgeBB = SplitEdge(Latch, Header, &DT, &LI, MSSAU.get());
+
+ DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Eager);
+ (void)changeToUnreachable(BackedgeBB->getTerminator(),
+ /*PreserveLCSSA*/ true, &DTU, MSSAU.get());
+ }();
// Erase (and destroy) this loop instance. Handles relinking sub-loops
// and blocks within the loop as needed.
@@ -852,32 +886,37 @@ bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop,
return true;
}
-Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
- Value *Right) {
- CmpInst::Predicate Pred;
+CmpInst::Predicate llvm::getMinMaxReductionPredicate(RecurKind RK) {
switch (RK) {
default:
llvm_unreachable("Unknown min/max recurrence kind");
case RecurKind::UMin:
- Pred = CmpInst::ICMP_ULT;
- break;
+ return CmpInst::ICMP_ULT;
case RecurKind::UMax:
- Pred = CmpInst::ICMP_UGT;
- break;
+ return CmpInst::ICMP_UGT;
case RecurKind::SMin:
- Pred = CmpInst::ICMP_SLT;
- break;
+ return CmpInst::ICMP_SLT;
case RecurKind::SMax:
- Pred = CmpInst::ICMP_SGT;
- break;
+ return CmpInst::ICMP_SGT;
case RecurKind::FMin:
- Pred = CmpInst::FCMP_OLT;
- break;
+ return CmpInst::FCMP_OLT;
case RecurKind::FMax:
- Pred = CmpInst::FCMP_OGT;
- break;
+ return CmpInst::FCMP_OGT;
}
+}
+Value *llvm::createSelectCmpOp(IRBuilderBase &Builder, Value *StartVal,
+ RecurKind RK, Value *Left, Value *Right) {
+ if (auto VTy = dyn_cast<VectorType>(Left->getType()))
+ StartVal = Builder.CreateVectorSplat(VTy->getElementCount(), StartVal);
+ Value *Cmp =
+ Builder.CreateCmp(CmpInst::ICMP_NE, Left, StartVal, "rdx.select.cmp");
+ return Builder.CreateSelect(Cmp, Left, Right, "rdx.select");
+}
+
+Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
+ Value *Right) {
+ CmpInst::Predicate Pred = getMinMaxReductionPredicate(RK);
Value *Cmp = Builder.CreateCmp(Pred, Left, Right, "rdx.minmax.cmp");
Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
return Select;
@@ -955,15 +994,50 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src,
return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
}
+Value *llvm::createSelectCmpTargetReduction(IRBuilderBase &Builder,
+ const TargetTransformInfo *TTI,
+ Value *Src,
+ const RecurrenceDescriptor &Desc,
+ PHINode *OrigPhi) {
+ assert(RecurrenceDescriptor::isSelectCmpRecurrenceKind(
+ Desc.getRecurrenceKind()) &&
+ "Unexpected reduction kind");
+ Value *InitVal = Desc.getRecurrenceStartValue();
+ Value *NewVal = nullptr;
+
+ // First use the original phi to determine the new value we're trying to
+ // select from in the loop.
+ SelectInst *SI = nullptr;
+ for (auto *U : OrigPhi->users()) {
+ if ((SI = dyn_cast<SelectInst>(U)))
+ break;
+ }
+ assert(SI && "One user of the original phi should be a select");
+
+ if (SI->getTrueValue() == OrigPhi)
+ NewVal = SI->getFalseValue();
+ else {
+ assert(SI->getFalseValue() == OrigPhi &&
+ "At least one input to the select should be the original Phi");
+ NewVal = SI->getTrueValue();
+ }
+
+ // Create a splat vector with the new value and compare this to the vector
+ // we want to reduce.
+ ElementCount EC = cast<VectorType>(Src->getType())->getElementCount();
+ Value *Right = Builder.CreateVectorSplat(EC, InitVal);
+ Value *Cmp =
+ Builder.CreateCmp(CmpInst::ICMP_NE, Src, Right, "rdx.select.cmp");
+
+ // If any predicate is true it means that we want to select the new value.
+ Cmp = Builder.CreateOrReduce(Cmp);
+ return Builder.CreateSelect(Cmp, NewVal, InitVal, "rdx.select");
+}
+
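
The reduction just added above compares the reduced vector against a splat of the start value, OR-reduces the lane-wise result, and picks the new value if any lane differed. A minimal scalar model of that decision, using hypothetical names rather than the LLVM API:

#include <algorithm>
#include <array>

// Scalar model of the select/cmp reduction: lanes still equal to InitVal mean
// the select in the loop never fired for them; if any lane differs, NewVal is
// the reduction result, otherwise the start value is kept.
int reduceSelectCmp(const std::array<int, 4> &Lanes, int InitVal, int NewVal) {
  bool AnyChanged =
      std::any_of(Lanes.begin(), Lanes.end(),
                  [&](int L) { return L != InitVal; }); // rdx.select.cmp + or-reduce
  return AnyChanged ? NewVal : InitVal;                 // rdx.select
}
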
Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder,
const TargetTransformInfo *TTI,
Value *Src, RecurKind RdxKind,
ArrayRef<Value *> RedOps) {
- TargetTransformInfo::ReductionFlags RdxFlags;
- RdxFlags.IsMaxOp = RdxKind == RecurKind::SMax || RdxKind == RecurKind::UMax ||
- RdxKind == RecurKind::FMax;
- RdxFlags.IsSigned = RdxKind == RecurKind::SMax || RdxKind == RecurKind::SMin;
-
auto *SrcVecEltTy = cast<VectorType>(Src->getType())->getElementType();
switch (RdxKind) {
case RecurKind::Add:
@@ -1000,14 +1074,19 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder,
Value *llvm::createTargetReduction(IRBuilderBase &B,
const TargetTransformInfo *TTI,
- const RecurrenceDescriptor &Desc,
- Value *Src) {
+ const RecurrenceDescriptor &Desc, Value *Src,
+ PHINode *OrigPhi) {
// TODO: Support in-order reductions based on the recurrence descriptor.
// All ops in the reduction inherit fast-math-flags from the recurrence
// descriptor.
IRBuilderBase::FastMathFlagGuard FMFGuard(B);
B.setFastMathFlags(Desc.getFastMathFlags());
- return createSimpleTargetReduction(B, TTI, Src, Desc.getRecurrenceKind());
+
+ RecurKind RK = Desc.getRecurrenceKind();
+ if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
+ return createSelectCmpTargetReduction(B, TTI, Src, Desc, OrigPhi);
+
+ return createSimpleTargetReduction(B, TTI, Src, RK);
}
Value *llvm::createOrderedReduction(IRBuilderBase &B,
@@ -1081,58 +1160,6 @@ bool llvm::cannotBeMaxInLoop(const SCEV *S, const Loop *L, ScalarEvolution &SE,
// As a side effect, reduces the amount of IV processing within the loop.
//===----------------------------------------------------------------------===//
-// Return true if the SCEV expansion generated by the rewriter can replace the
-// original value. SCEV guarantees that it produces the same value, but the way
-// it is produced may be illegal IR. Ideally, this function will only be
-// called for verification.
-static bool isValidRewrite(ScalarEvolution *SE, Value *FromVal, Value *ToVal) {
- // If an SCEV expression subsumed multiple pointers, its expansion could
- // reassociate the GEP changing the base pointer. This is illegal because the
- // final address produced by a GEP chain must be inbounds relative to its
- // underlying object. Otherwise basic alias analysis, among other things,
- // could fail in a dangerous way. Ultimately, SCEV will be improved to avoid
- // producing an expression involving multiple pointers. Until then, we must
- // bail out here.
- //
- // Retrieve the pointer operand of the GEP. Don't use getUnderlyingObject
- // because it understands lcssa phis while SCEV does not.
- Value *FromPtr = FromVal;
- Value *ToPtr = ToVal;
- if (auto *GEP = dyn_cast<GEPOperator>(FromVal))
- FromPtr = GEP->getPointerOperand();
-
- if (auto *GEP = dyn_cast<GEPOperator>(ToVal))
- ToPtr = GEP->getPointerOperand();
-
- if (FromPtr != FromVal || ToPtr != ToVal) {
- // Quickly check the common case
- if (FromPtr == ToPtr)
- return true;
-
- // SCEV may have rewritten an expression that produces the GEP's pointer
- // operand. That's ok as long as the pointer operand has the same base
- // pointer. Unlike getUnderlyingObject(), getPointerBase() will find the
- // base of a recurrence. This handles the case in which SCEV expansion
- // converts a pointer type recurrence into a nonrecurrent pointer base
- // indexed by an integer recurrence.
-
- // If the GEP base pointer is a vector of pointers, abort.
- if (!FromPtr->getType()->isPointerTy() || !ToPtr->getType()->isPointerTy())
- return false;
-
- const SCEV *FromBase = SE->getPointerBase(SE->getSCEV(FromPtr));
- const SCEV *ToBase = SE->getPointerBase(SE->getSCEV(ToPtr));
- if (FromBase == ToBase)
- return true;
-
- LLVM_DEBUG(dbgs() << "rewriteLoopExitValues: GEP rewrite bail out "
- << *FromBase << " != " << *ToBase << "\n");
-
- return false;
- }
- return true;
-}
-
static bool hasHardUserWithinLoop(const Loop *L, const Instruction *I) {
SmallPtrSet<const Instruction *, 8> Visited;
SmallVector<const Instruction *, 8> WorkList;
@@ -1165,9 +1192,6 @@ struct RewritePhi {
Instruction *ExpansionPoint; // Where we'd like to expand that SCEV?
bool HighCost; // Is this expansion a high-cost?
- Value *Expansion = nullptr;
- bool ValidRewrite = false;
-
RewritePhi(PHINode *P, unsigned I, const SCEV *Val, Instruction *ExpansionPt,
bool H)
: PN(P), Ith(I), ExpansionSCEV(Val), ExpansionPoint(ExpansionPt),
@@ -1204,8 +1228,6 @@ static bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet)
// phase later. Skip it in the loop invariant check below.
bool found = false;
for (const RewritePhi &Phi : RewritePhiSet) {
- if (!Phi.ValidRewrite)
- continue;
unsigned i = Phi.Ith;
if (Phi.PN == P && (Phi.PN)->getIncomingValue(i) == Incoming) {
found = true;
@@ -1264,13 +1286,6 @@ int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI,
if (!SE->isSCEVable(PN->getType()))
continue;
- // It's necessary to tell ScalarEvolution about this explicitly so that
- // it can walk the def-use list and forget all SCEVs, as it may not be
- // watching the PHI itself. Once the new exit value is in place, there
- // may not be a def-use connection between the loop and every instruction
- // which got a SCEVAddRecExpr for that loop.
- SE->forgetValue(PN);
-
// Iterate over all of the values in all the PHI nodes.
for (unsigned i = 0; i != NumPreds; ++i) {
// If the value being merged in is not integer or is not defined
@@ -1339,61 +1354,49 @@ int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI,
}
}
- // Now that we've done preliminary filtering and billed all the SCEV's,
- // we can perform the last sanity check - the expansion must be valid.
- for (RewritePhi &Phi : RewritePhiSet) {
- Phi.Expansion = Rewriter.expandCodeFor(Phi.ExpansionSCEV, Phi.PN->getType(),
- Phi.ExpansionPoint);
+ // TODO: evaluate whether it is beneficial to change how we calculate
+ // high-cost: if we have SCEV 'A' which we know we will expand, should we
+ // calculate the cost of other SCEVs after expanding SCEV 'A', thus
+ // potentially giving a cost bonus to those other SCEVs?
- LLVM_DEBUG(dbgs() << "rewriteLoopExitValues: AfterLoopVal = "
- << *(Phi.Expansion) << '\n'
- << " LoopVal = " << *(Phi.ExpansionPoint) << "\n");
+ bool LoopCanBeDel = canLoopBeDeleted(L, RewritePhiSet);
+ int NumReplaced = 0;
+
+ // Transformation.
+ for (const RewritePhi &Phi : RewritePhiSet) {
+ PHINode *PN = Phi.PN;
- // FIXME: isValidRewrite() is a hack. it should be an assert, eventually.
- Phi.ValidRewrite = isValidRewrite(SE, Phi.ExpansionPoint, Phi.Expansion);
- if (!Phi.ValidRewrite) {
- DeadInsts.push_back(Phi.Expansion);
+ // Only do the rewrite when the ExitValue can be expanded cheaply.
+ // If LoopCanBeDel is true, rewrite exit value aggressively.
+ if (ReplaceExitValue == OnlyCheapRepl && !LoopCanBeDel && Phi.HighCost)
continue;
- }
+
+ Value *ExitVal = Rewriter.expandCodeFor(
+ Phi.ExpansionSCEV, Phi.PN->getType(), Phi.ExpansionPoint);
+
+ LLVM_DEBUG(dbgs() << "rewriteLoopExitValues: AfterLoopVal = " << *ExitVal
+ << '\n'
+ << " LoopVal = " << *(Phi.ExpansionPoint) << "\n");
#ifndef NDEBUG
// If we reuse an instruction from a loop which is neither L nor one of
// its containing loops, we end up breaking LCSSA form for this loop by
// creating a new use of its instruction.
- if (auto *ExitInsn = dyn_cast<Instruction>(Phi.Expansion))
+ if (auto *ExitInsn = dyn_cast<Instruction>(ExitVal))
if (auto *EVL = LI->getLoopFor(ExitInsn->getParent()))
if (EVL != L)
assert(EVL->contains(L) && "LCSSA breach detected!");
#endif
- }
-
- // TODO: after isValidRewrite() is an assertion, evaluate whether
- // it is beneficial to change how we calculate high-cost:
- // if we have SCEV 'A' which we know we will expand, should we calculate
- // the cost of other SCEV's after expanding SCEV 'A',
- // thus potentially giving cost bonus to those other SCEV's?
-
- bool LoopCanBeDel = canLoopBeDeleted(L, RewritePhiSet);
- int NumReplaced = 0;
-
- // Transformation.
- for (const RewritePhi &Phi : RewritePhiSet) {
- if (!Phi.ValidRewrite)
- continue;
-
- PHINode *PN = Phi.PN;
- Value *ExitVal = Phi.Expansion;
-
- // Only do the rewrite when the ExitValue can be expanded cheaply.
- // If LoopCanBeDel is true, rewrite exit value aggressively.
- if (ReplaceExitValue == OnlyCheapRepl && !LoopCanBeDel && Phi.HighCost) {
- DeadInsts.push_back(ExitVal);
- continue;
- }
NumReplaced++;
Instruction *Inst = cast<Instruction>(PN->getIncomingValue(Phi.Ith));
PN->setIncomingValue(Phi.Ith, ExitVal);
+ // It's necessary to tell ScalarEvolution about this explicitly so that
+ // it can walk the def-use list and forget all SCEVs, as it may not be
+ // watching the PHI itself. Once the new exit value is in place, there
+ // may not be a def-use connection between the loop and every instruction
+ // which got a SCEVAddRecExpr for that loop.
+ SE->forgetValue(PN);
// If this instruction is dead now, delete it. Don't do it now to avoid
// invalidating iterators.
@@ -1554,7 +1557,7 @@ expandBounds(const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, Loop *L,
return ChecksWithBounds;
}
-std::pair<Instruction *, Instruction *> llvm::addRuntimeChecks(
+Value *llvm::addRuntimeChecks(
Instruction *Loc, Loop *TheLoop,
const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
SCEVExpander &Exp) {
@@ -1563,22 +1566,10 @@ std::pair<Instruction *, Instruction *> llvm::addRuntimeChecks(
auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, Exp);
LLVMContext &Ctx = Loc->getContext();
- Instruction *FirstInst = nullptr;
IRBuilder<> ChkBuilder(Loc);
// Our instructions might fold to a constant.
Value *MemoryRuntimeCheck = nullptr;
- // FIXME: this helper is currently a duplicate of the one in
- // LoopVectorize.cpp.
- auto GetFirstInst = [](Instruction *FirstInst, Value *V,
- Instruction *Loc) -> Instruction * {
- if (FirstInst)
- return FirstInst;
- if (Instruction *I = dyn_cast<Instruction>(V))
- return I->getParent() == Loc->getParent() ? I : nullptr;
- return nullptr;
- };
-
for (const auto &Check : ExpandedChecks) {
const PointerBounds &A = Check.first, &B = Check.second;
// Check if two pointers (A and B) conflict where conflict is computed as:
@@ -1607,30 +1598,16 @@ std::pair<Instruction *, Instruction *> llvm::addRuntimeChecks(
// bound1 = (A.Start < B.End)
// IsConflict = bound0 & bound1
Value *Cmp0 = ChkBuilder.CreateICmpULT(Start0, End1, "bound0");
- FirstInst = GetFirstInst(FirstInst, Cmp0, Loc);
Value *Cmp1 = ChkBuilder.CreateICmpULT(Start1, End0, "bound1");
- FirstInst = GetFirstInst(FirstInst, Cmp1, Loc);
Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
- FirstInst = GetFirstInst(FirstInst, IsConflict, Loc);
if (MemoryRuntimeCheck) {
IsConflict =
ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
- FirstInst = GetFirstInst(FirstInst, IsConflict, Loc);
}
MemoryRuntimeCheck = IsConflict;
}
- if (!MemoryRuntimeCheck)
- return std::make_pair(nullptr, nullptr);
-
- // We have to do this trickery because the IRBuilder might fold the check to a
- // constant expression in which case there is no Instruction anchored in a
- // the block.
- Instruction *Check =
- BinaryOperator::CreateAnd(MemoryRuntimeCheck, ConstantInt::getTrue(Ctx));
- ChkBuilder.Insert(Check, "memcheck.conflict");
- FirstInst = GetFirstInst(FirstInst, Check, Loc);
- return std::make_pair(FirstInst, Check);
+ return MemoryRuntimeCheck;
}
Optional<IVConditionInfo> llvm::hasPartialIVCondition(Loop &L,
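
The runtime check emitted per pointer pair above reduces to a half-open interval overlap test. As a plain sketch (hypothetical helper, not part of the patch):

#include <cstdint>

// Two accessed ranges [Start0, End0) and [Start1, End1) may alias exactly when
// each one begins before the other ends; this is the "bound0 & bound1" test,
// and the per-pair results are OR-ed together into "conflict.rdx".
bool rangesConflict(uint64_t Start0, uint64_t End0,
                    uint64_t Start1, uint64_t End1) {
  return Start0 < End1 && Start1 < End0;
}
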
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index 8a89158788cf..771b7d25b0f2 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -14,9 +14,9 @@
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Dominators.h"
@@ -52,8 +52,7 @@ void LoopVersioning::versionLoop(
assert(VersionedLoop->isLoopSimplifyForm() &&
"Loop is not in loop-simplify form");
- Instruction *FirstCheckInst;
- Instruction *MemRuntimeCheck;
+ Value *MemRuntimeCheck;
Value *SCEVRuntimeCheck;
Value *RuntimeCheck = nullptr;
@@ -64,8 +63,8 @@ void LoopVersioning::versionLoop(
SCEVExpander Exp2(*RtPtrChecking.getSE(),
VersionedLoop->getHeader()->getModule()->getDataLayout(),
"induction");
- std::tie(FirstCheckInst, MemRuntimeCheck) = addRuntimeChecks(
- RuntimeCheckBB->getTerminator(), VersionedLoop, AliasChecks, Exp2);
+ MemRuntimeCheck = addRuntimeChecks(RuntimeCheckBB->getTerminator(),
+ VersionedLoop, AliasChecks, Exp2);
SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(),
"scev.check");
@@ -354,14 +353,11 @@ PreservedAnalyses LoopVersioningPass::run(Function &F,
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
- MemorySSA *MSSA = EnableMSSALoopDependency
- ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
- : nullptr;
auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
auto GetLAA = [&](Loop &L) -> const LoopAccessInfo & {
- LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
- TLI, TTI, nullptr, MSSA};
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
+ TLI, TTI, nullptr, nullptr, nullptr};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
};
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 616b4e8eb01c..8dc4702993c3 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -442,7 +442,7 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
/* DestAlign */ Memcpy->getDestAlign().valueOrOne(),
/* SrcIsVolatile */ Memcpy->isVolatile(),
/* DstIsVolatile */ Memcpy->isVolatile(),
- /* TargetTransfomrInfo */ TTI);
+ /* TargetTransformInfo */ TTI);
}
}
diff --git a/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/llvm/lib/Transforms/Utils/LowerSwitch.cpp
index ec8d7a7074cd..aff9d1311688 100644
--- a/llvm/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/llvm/lib/Transforms/Utils/LowerSwitch.cpp
@@ -524,16 +524,14 @@ bool LowerSwitch(Function &F, LazyValueInfo *LVI, AssumptionCache *AC) {
bool Changed = false;
SmallPtrSet<BasicBlock *, 8> DeleteList;
- for (Function::iterator I = F.begin(), E = F.end(); I != E;) {
- BasicBlock *Cur =
- &*I++; // Advance over block so we don't traverse new blocks
-
+ // We use make_early_inc_range here so that we don't traverse new blocks.
+ for (BasicBlock &Cur : llvm::make_early_inc_range(F)) {
// If the block is a dead Default block that will be deleted later, don't
// waste time processing it.
- if (DeleteList.count(Cur))
+ if (DeleteList.count(&Cur))
continue;
- if (SwitchInst *SI = dyn_cast<SwitchInst>(Cur->getTerminator())) {
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(Cur.getTerminator())) {
Changed = true;
ProcessSwitchInst(SI, DeleteList, AC, LVI);
}
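
The early-increment range used here (and in several hunks above) advances its iterator before handing out the current element, so the body may erase that element, or append new elements after it, without breaking the traversal. A small sketch of the idiom on instructions, with an assumed deadness predicate:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"

// Erase trivially dead instructions while walking the block. Only the current
// instruction is removed, which is exactly the case make_early_inc_range makes
// safe; deleting *other* instructions would still require extra care.
void dropUnusedSideEffectFreeInsts(llvm::BasicBlock &BB) {
  for (llvm::Instruction &I : llvm::make_early_inc_range(BB))
    if (I.use_empty() && !I.isTerminator() && !I.mayHaveSideEffects())
      I.eraseFromParent();
}
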
diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp
index 2aef37205c53..bb5ff59cba4b 100644
--- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp
@@ -125,7 +125,7 @@ Function *llvm::createSanitizerCtor(Module &M, StringRef CtorName) {
Function *Ctor = Function::createWithDefaultAttr(
FunctionType::get(Type::getVoidTy(M.getContext()), false),
GlobalValue::InternalLinkage, 0, CtorName, &M);
- Ctor->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
+ Ctor->addFnAttr(Attribute::NoUnwind);
BasicBlock *CtorBB = BasicBlock::Create(M.getContext(), "", Ctor);
ReturnInst::Create(M.getContext(), CtorBB);
// Ensure Ctor cannot be discarded, even if in a comdat.
@@ -165,7 +165,7 @@ llvm::getOrCreateSanitizerCtorAndInitFunctions(
if (Function *Ctor = M.getFunction(CtorName))
// FIXME: Sink this logic into the module, similar to the handling of
// globals. This will make moving to a concurrent model much easier.
- if (Ctor->arg_size() == 0 ||
+ if (Ctor->arg_empty() ||
Ctor->getReturnType() == Type::getVoidTy(M.getContext()))
return {Ctor, declareSanitizerInitFunction(M, InitName, InitArgTypes)};
@@ -297,7 +297,6 @@ void VFABI::setVectorVariantNames(
"vector function declaration is missing.");
}
#endif
- CI->addAttribute(
- AttributeList::FunctionIndex,
+ CI->addFnAttr(
Attribute::get(M->getContext(), MappingsAttrName, Buffer.str()));
}
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index 91280762aaa7..bd2b6fafdf2e 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
@@ -23,6 +24,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
@@ -566,10 +568,18 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter,
// to ensure we dominate all of our uses. Always insert right before the
// relevant instruction (terminator, assume), so that we insert in proper
// order in the case of multiple predicateinfo in the same block.
+ // The number of named values is used to detect if a new declaration was
+ // added. If so, that declaration is tracked so that it can be removed when
+ // the analysis is done. The corner case where a new declaration results in
+ // a name clash and the old name being renamed is not considered, as that
+ // represents an invalid module.
if (isa<PredicateWithEdge>(ValInfo)) {
IRBuilder<> B(getBranchTerminator(ValInfo));
+ auto NumDecls = F.getParent()->getNumNamedValues();
Function *IF = Intrinsic::getDeclaration(
F.getParent(), Intrinsic::ssa_copy, Op->getType());
+ if (NumDecls != F.getParent()->getNumNamedValues())
+ PI.CreatedDeclarations.insert(IF);
CallInst *PIC =
B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++));
PI.PredicateMap.insert({PIC, ValInfo});
@@ -581,8 +591,11 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter,
// Insert the predicate directly after the assume. While it also holds
// directly before it, assume(i1 true) is not a useful fact.
IRBuilder<> B(PAssume->AssumeInst->getNextNode());
+ auto NumDecls = F.getParent()->getNumNamedValues();
Function *IF = Intrinsic::getDeclaration(
F.getParent(), Intrinsic::ssa_copy, Op->getType());
+ if (NumDecls != F.getParent()->getNumNamedValues())
+ PI.CreatedDeclarations.insert(IF);
CallInst *PIC = B.CreateCall(IF, Op);
PI.PredicateMap.insert({PIC, ValInfo});
Result.Def = PIC;
@@ -761,6 +774,23 @@ PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT,
Builder.buildPredicateInfo();
}
+// Remove all declarations we created. The PredicateInfo consumers are
+// responsible for removing the ssa_copy calls created.
+PredicateInfo::~PredicateInfo() {
+ // Collect function pointers in set first, as SmallSet uses a SmallVector
+ // internally and we have to remove the asserting value handles first.
+ SmallPtrSet<Function *, 20> FunctionPtrs;
+ for (auto &F : CreatedDeclarations)
+ FunctionPtrs.insert(&*F);
+ CreatedDeclarations.clear();
+
+ for (Function *F : FunctionPtrs) {
+ assert(F->user_begin() == F->user_end() &&
+ "PredicateInfo consumer did not remove all SSA copies.");
+ F->eraseFromParent();
+ }
+}
+
Optional<PredicateConstraint> PredicateBase::getConstraint() const {
switch (Type) {
case PT_Assume:
@@ -827,6 +857,19 @@ void PredicateInfoPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<AssumptionCacheTracker>();
}
+// Replace ssa_copy calls created by PredicateInfo with their operand.
+static void replaceCreatedSSACopys(PredicateInfo &PredInfo, Function &F) {
+ for (Instruction &Inst : llvm::make_early_inc_range(instructions(F))) {
+ const auto *PI = PredInfo.getPredicateInfoFor(&Inst);
+ auto *II = dyn_cast<IntrinsicInst>(&Inst);
+ if (!PI || !II || II->getIntrinsicID() != Intrinsic::ssa_copy)
+ continue;
+
+ Inst.replaceAllUsesWith(II->getOperand(0));
+ Inst.eraseFromParent();
+ }
+}
+
bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -834,6 +877,8 @@ bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {
PredInfo->print(dbgs());
if (VerifyPredicateInfo)
PredInfo->verifyPredicateInfo();
+
+ replaceCreatedSSACopys(*PredInfo, F);
return false;
}
@@ -845,6 +890,7 @@ PreservedAnalyses PredicateInfoPrinterPass::run(Function &F,
auto PredInfo = std::make_unique<PredicateInfo>(F, DT, AC);
PredInfo->print(OS);
+ replaceCreatedSSACopys(*PredInfo, F);
return PreservedAnalyses::all();
}
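
The declaration-tracking pattern added above could be factored into a small helper; this is only a sketch of the idea with a hypothetical function name, not something the patch introduces:

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"

// Sample Module::getNumNamedValues() around Intrinsic::getDeclaration; if the
// count grew, the ssa_copy declaration is new and is recorded so it can be
// erased once all inserted calls have been removed.
llvm::Function *
getTrackedSSACopyDecl(llvm::Module &M, llvm::Type *Ty,
                      llvm::SmallPtrSetImpl<llvm::Function *> &Created) {
  auto NumDecls = M.getNumNamedValues();
  llvm::Function *IF =
      llvm::Intrinsic::getDeclaration(&M, llvm::Intrinsic::ssa_copy, {Ty});
  if (M.getNumNamedValues() != NumDecls)
    Created.insert(IF);
  return IF;
}
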
diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 427028066026..b35ab57e0d87 100644
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -70,7 +70,8 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
if (LI->isVolatile())
return false;
} else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
- if (SI->getOperand(0) == AI)
+ if (SI->getValueOperand() == AI ||
+ SI->getValueOperand()->getType() != AI->getAllocatedType())
return false; // Don't allow a store OF the AI, only INTO the AI.
// Note that atomic stores can be transformed; atomic semantics do
// not have any meaning for a local alloca.
diff --git a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp
index 85e5adaeaf5e..3ebc89158173 100644
--- a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp
+++ b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp
@@ -177,9 +177,7 @@ static bool convertToRelativeLookupTables(
bool Changed = false;
- for (auto GVI = M.global_begin(), E = M.global_end(); GVI != E;) {
- GlobalVariable &GV = *GVI++;
-
+ for (GlobalVariable &GV : llvm::make_early_inc_range(M.globals())) {
if (!shouldConvertToRelLookupTable(M, GV))
continue;
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index 4cf99abcc10f..d7e8eaf677c6 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -540,8 +540,14 @@ void SCCPInstVisitor::markArgInFuncSpecialization(Function *F, Argument *A,
E = F->arg_end();
I != E; ++I, ++J)
if (J != A && ValueState.count(I)) {
- ValueState[J] = ValueState[I];
- pushToWorkList(ValueState[J], J);
+ // Note: This previously looked like this:
+ // ValueState[J] = ValueState[I];
+ // This is incorrect because the DenseMap class may resize the underlying
+ // memory when inserting `J`, which will invalidate the reference to `I`.
+ // Instead, we make sure `J` exists, then set it to `I` afterwards.
+ auto &NewValue = ValueState[J];
+ NewValue = ValueState[I];
+ pushToWorkList(NewValue, J);
}
}
@@ -802,6 +808,9 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) {
return;
ValueLatticeElement OpSt = getValueState(I.getOperand(0));
+ if (OpSt.isUnknownOrUndef())
+ return;
+
if (Constant *OpC = getConstant(OpSt)) {
// Fold the constant as we build.
Constant *C = ConstantFoldCastOperand(I.getOpcode(), OpC, I.getType(), DL);
@@ -809,9 +818,14 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) {
return;
// Propagate constant value
markConstant(&I, C);
- } else if (OpSt.isConstantRange() && I.getDestTy()->isIntegerTy()) {
+ } else if (I.getDestTy()->isIntegerTy()) {
auto &LV = getValueState(&I);
- ConstantRange OpRange = OpSt.getConstantRange();
+ ConstantRange OpRange =
+ OpSt.isConstantRange()
+ ? OpSt.getConstantRange()
+ : ConstantRange::getFull(
+ I.getOperand(0)->getType()->getScalarSizeInBits());
+
Type *DestTy = I.getDestTy();
// Vectors where all elements have the same known constant range are treated
// as a single constant range in the lattice. When bitcasting such vectors,
@@ -826,7 +840,7 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) {
ConstantRange Res =
OpRange.castOp(I.getOpcode(), DL.getTypeSizeInBits(DestTy));
mergeInValue(LV, &I, ValueLatticeElement::getRange(Res));
- } else if (!OpSt.isUnknownOrUndef())
+ } else
markOverdefined(&I);
}
@@ -1183,10 +1197,10 @@ void SCCPInstVisitor::handleCallOverdefined(CallBase &CB) {
// a declaration, maybe we can constant fold it.
if (F && F->isDeclaration() && canConstantFoldCallTo(&CB, F)) {
SmallVector<Constant *, 8> Operands;
- for (auto AI = CB.arg_begin(), E = CB.arg_end(); AI != E; ++AI) {
- if (AI->get()->getType()->isStructTy())
+ for (const Use &A : CB.args()) {
+ if (A.get()->getType()->isStructTy())
return markOverdefined(&CB); // Can't handle struct args.
- ValueLatticeElement State = getValueState(*AI);
+ ValueLatticeElement State = getValueState(A);
if (State.isUnknownOrUndef())
return; // Operands are not resolved yet.
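
The evaluation-order hazard described in markArgInFuncSpecialization above generalizes to any DenseMap copy between two keys. A minimal sketch with a toy map (not the SCCP ValueState):

#include "llvm/ADT/DenseMap.h"

// Copy the mapping for SrcKey into DstKey. Writing `M[DstKey] = M[SrcKey];`
// is unsafe: whichever operator[] call runs second may grow the table and move
// every entry, leaving the reference produced by the first call dangling by
// the time the assignment reads or writes through it.
void copyEntry(llvm::DenseMap<int, int> &M, int SrcKey, int DstKey) {
  auto &Dst = M[DstKey];   // do the potentially-growing insertion first
  Dst = M.lookup(SrcKey);  // lookup() never inserts, so Dst stays valid
}
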
diff --git a/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp b/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp
index 917d5e0a1ef0..7de76b86817b 100644
--- a/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp
+++ b/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp
@@ -65,12 +65,6 @@ void SSAUpdaterBulk::AddUse(unsigned Var, Use *U) {
Rewrites[Var].Uses.push_back(U);
}
-/// Return true if the SSAUpdater already has a value for the specified variable
-/// in the specified block.
-bool SSAUpdaterBulk::HasValueForBlock(unsigned Var, BasicBlock *BB) {
- return (Var < Rewrites.size()) ? Rewrites[Var].Defines.count(BB) : false;
-}
-
// Compute value at the given block BB. We either should already know it, or we
// should be able to recursively reach it going up dominator tree.
Value *SSAUpdaterBulk::computeValueAt(BasicBlock *BB, RewriteInfo &R,
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 3978e1e29825..a042146d7ace 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -747,9 +747,8 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
// so that pointer operands are inserted first, which the code below relies on
// to form more involved GEPs.
SmallVector<std::pair<const Loop *, const SCEV *>, 8> OpsAndLoops;
- for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(S->op_end()),
- E(S->op_begin()); I != E; ++I)
- OpsAndLoops.push_back(std::make_pair(getRelevantLoop(*I), *I));
+ for (const SCEV *Op : reverse(S->operands()))
+ OpsAndLoops.push_back(std::make_pair(getRelevantLoop(Op), Op));
// Sort by loop. Use a stable sort so that constants follow non-constants and
// pointer operands precede non-pointer operands.
@@ -765,7 +764,11 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
// This is the first operand. Just expand it.
Sum = expand(Op);
++I;
- } else if (PointerType *PTy = dyn_cast<PointerType>(Sum->getType())) {
+ continue;
+ }
+
+ assert(!Op->getType()->isPointerTy() && "Only first op can be pointer");
+ if (PointerType *PTy = dyn_cast<PointerType>(Sum->getType())) {
// The running sum expression is a pointer. Try to form a getelementptr
// at this level with that as the base.
SmallVector<const SCEV *, 4> NewOps;
@@ -779,16 +782,6 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
NewOps.push_back(X);
}
Sum = expandAddToGEP(NewOps.begin(), NewOps.end(), PTy, Ty, Sum);
- } else if (PointerType *PTy = dyn_cast<PointerType>(Op->getType())) {
- // The running sum is an integer, and there's a pointer at this level.
- // Try to form a getelementptr. If the running sum is instructions,
- // use a SCEVUnknown to avoid re-analyzing them.
- SmallVector<const SCEV *, 4> NewOps;
- NewOps.push_back(isa<Instruction>(Sum) ? SE.getUnknown(Sum) :
- SE.getSCEV(Sum));
- for (++I; I != E && I->first == CurLoop; ++I)
- NewOps.push_back(I->second);
- Sum = expandAddToGEP(NewOps.begin(), NewOps.end(), PTy, Ty, expand(Op));
} else if (Op->isNonConstantNegative()) {
// Instead of doing a negate and add, just do a subtract.
Value *W = expandCodeForImpl(SE.getNegativeSCEV(Op), Ty, false);
@@ -817,9 +810,8 @@ Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) {
// Collect all the mul operands in a loop, along with their associated loops.
// Iterate in reverse so that constants are emitted last, all else equal.
SmallVector<std::pair<const Loop *, const SCEV *>, 8> OpsAndLoops;
- for (std::reverse_iterator<SCEVMulExpr::op_iterator> I(S->op_end()),
- E(S->op_begin()); I != E; ++I)
- OpsAndLoops.push_back(std::make_pair(getRelevantLoop(*I), *I));
+ for (const SCEV *Op : reverse(S->operands()))
+ OpsAndLoops.push_back(std::make_pair(getRelevantLoop(Op), Op));
// Sort by loop. Use a stable sort so that constants follow non-constants.
llvm::stable_sort(OpsAndLoops, LoopCompare(SE.DT));
@@ -923,28 +915,6 @@ Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) {
/*IsSafeToHoist*/ SE.isKnownNonZero(S->getRHS()));
}
-/// Move parts of Base into Rest to leave Base with the minimal
-/// expression that provides a pointer operand suitable for a
-/// GEP expansion.
-static void ExposePointerBase(const SCEV *&Base, const SCEV *&Rest,
- ScalarEvolution &SE) {
- while (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Base)) {
- Base = A->getStart();
- Rest = SE.getAddExpr(Rest,
- SE.getAddRecExpr(SE.getConstant(A->getType(), 0),
- A->getStepRecurrence(SE),
- A->getLoop(),
- A->getNoWrapFlags(SCEV::FlagNW)));
- }
- if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(Base)) {
- Base = A->getOperand(A->getNumOperands()-1);
- SmallVector<const SCEV *, 8> NewAddOps(A->operands());
- NewAddOps.back() = Rest;
- Rest = SE.getAddExpr(NewAddOps);
- ExposePointerBase(Base, Rest, SE);
- }
-}
-
/// Determine if this is a well-behaved chain of instructions leading back to
/// the PHI. If so, it may be reused by expanded expressions.
bool SCEVExpander::isNormalAddRecExprPHI(PHINode *PN, Instruction *IncV,
@@ -1125,22 +1095,6 @@ Value *SCEVExpander::expandIVInc(PHINode *PN, Value *StepV, const Loop *L,
return IncV;
}
-/// Hoist the addrec instruction chain rooted in the loop phi above the
-/// position. This routine assumes that this is possible (has been checked).
-void SCEVExpander::hoistBeforePos(DominatorTree *DT, Instruction *InstToHoist,
- Instruction *Pos, PHINode *LoopPhi) {
- do {
- if (DT->dominates(InstToHoist, Pos))
- break;
- // Make sure the increment is where we want it. But don't move it
- // down past a potential existing post-inc user.
- fixupInsertPoints(InstToHoist);
- InstToHoist->moveBefore(Pos);
- Pos = InstToHoist;
- InstToHoist = cast<Instruction>(InstToHoist->getOperand(0));
- } while (InstToHoist != LoopPhi);
-}
-
/// Check whether we can cheaply express the requested SCEV in terms of
/// the available PHI SCEV by truncation and/or inversion of the step.
static bool canBeCheaplyTransformed(ScalarEvolution &SE,
@@ -1264,8 +1218,6 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
if (LSRMode) {
if (!isExpandedAddRecExprPHI(&PN, TempIncV, L))
continue;
- if (L == IVIncInsertLoop && !hoistIVInc(TempIncV, IVIncInsertPos))
- continue;
} else {
if (!isNormalAddRecExprPHI(&PN, TempIncV, L))
continue;
@@ -1293,11 +1245,6 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
}
if (AddRecPhiMatch) {
- // Potentially, move the increment. We have made sure in
- // isExpandedAddRecExprPHI or hoistIVInc that this is possible.
- if (L == IVIncInsertLoop)
- hoistBeforePos(&SE.DT, IncV, IVIncInsertPos, AddRecPhiMatch);
-
// Ok, the add recurrence looks usable.
// Remember this PHI, even in post-inc mode.
InsertedValues.insert(AddRecPhiMatch);
@@ -1597,29 +1544,17 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
// {X,+,F} --> X + {0,+,F}
if (!S->getStart()->isZero()) {
+ if (PointerType *PTy = dyn_cast<PointerType>(S->getType())) {
+ Value *StartV = expand(SE.getPointerBase(S));
+ assert(StartV->getType() == PTy && "Pointer type mismatch for GEP!");
+ return expandAddToGEP(SE.removePointerBase(S), PTy, Ty, StartV);
+ }
+
SmallVector<const SCEV *, 4> NewOps(S->operands());
NewOps[0] = SE.getConstant(Ty, 0);
const SCEV *Rest = SE.getAddRecExpr(NewOps, L,
S->getNoWrapFlags(SCEV::FlagNW));
- // Turn things like ptrtoint+arithmetic+inttoptr into GEP. See the
- // comments on expandAddToGEP for details.
- const SCEV *Base = S->getStart();
- // Dig into the expression to find the pointer base for a GEP.
- const SCEV *ExposedRest = Rest;
- ExposePointerBase(Base, ExposedRest, SE);
- // If we found a pointer, expand the AddRec with a GEP.
- if (PointerType *PTy = dyn_cast<PointerType>(Base->getType())) {
- // Make sure the Base isn't something exotic, such as a multiplied
- // or divided pointer value. In those cases, the result type isn't
- // actually a pointer type.
- if (!isa<SCEVMulExpr>(Base) && !isa<SCEVUDivExpr>(Base)) {
- Value *StartV = expand(Base);
- assert(StartV->getType() == PTy && "Pointer type mismatch for GEP!");
- return expandAddToGEP(ExposedRest, PTy, Ty, StartV);
- }
- }
-
// Just do a normal add. Pre-expand the operands to suppress folding.
//
// The LHS and RHS values are factored out of the expand call to make the
@@ -1898,6 +1833,22 @@ Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, bool Root) {
return V;
}
+/// Check whether value has nuw/nsw/exact set but SCEV does not.
+/// TODO: In reality it is better to check the poison recursively
+/// but this is better than nothing.
+static bool SCEVLostPoisonFlags(const SCEV *S, const Instruction *I) {
+ if (isa<OverflowingBinaryOperator>(I)) {
+ if (auto *NS = dyn_cast<SCEVNAryExpr>(S)) {
+ if (I->hasNoSignedWrap() && !NS->hasNoSignedWrap())
+ return true;
+ if (I->hasNoUnsignedWrap() && !NS->hasNoUnsignedWrap())
+ return true;
+ }
+ } else if (isa<PossiblyExactOperator>(I) && I->isExact())
+ return true;
+ return false;
+}
+
ScalarEvolution::ValueOffsetPair
SCEVExpander::FindValueInExprValueMap(const SCEV *S,
const Instruction *InsertPt) {
@@ -1907,19 +1858,22 @@ SCEVExpander::FindValueInExprValueMap(const SCEV *S,
if (CanonicalMode || !SE.containsAddRecurrence(S)) {
// If S is scConstant, it may be worse to reuse an existing Value.
if (S->getSCEVType() != scConstant && Set) {
- // Choose a Value from the set which dominates the insertPt.
- // insertPt should be inside the Value's parent loop so as not to break
+ // Choose a Value from the set which dominates the InsertPt.
+ // InsertPt should be inside the Value's parent loop so as not to break
// the LCSSA form.
for (auto const &VOPair : *Set) {
Value *V = VOPair.first;
ConstantInt *Offset = VOPair.second;
- Instruction *EntInst = nullptr;
- if (V && isa<Instruction>(V) && (EntInst = cast<Instruction>(V)) &&
- S->getType() == V->getType() &&
- EntInst->getFunction() == InsertPt->getFunction() &&
+ Instruction *EntInst = dyn_cast_or_null<Instruction>(V);
+ if (!EntInst)
+ continue;
+
+ assert(EntInst->getFunction() == InsertPt->getFunction());
+ if (S->getType() == V->getType() &&
SE.DT.dominates(EntInst, InsertPt) &&
(SE.LI.getLoopFor(EntInst->getParent()) == nullptr ||
- SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt)))
+ SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt)) &&
+ !SCEVLostPoisonFlags(S, EntInst))
return {V, Offset};
}
}
@@ -2068,7 +2022,9 @@ SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
Phis.push_back(&PN);
if (TTI)
- llvm::sort(Phis, [](Value *LHS, Value *RHS) {
+ // Use stable_sort to preserve the order of equivalent PHIs, so the order
+ // of the sorted PHIs is the same from run to run on the same loop.
+ llvm::stable_sort(Phis, [](Value *LHS, Value *RHS) {
// Put pointers at the back and make sure pointer < pointer = false.
if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
return RHS->getType()->isIntegerTy() && !LHS->getType()->isIntegerTy();
@@ -2524,18 +2480,14 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
IntegerType *Ty =
IntegerType::get(Loc->getContext(), SE.getTypeSizeInBits(ARTy));
- Type *ARExpandTy = DL.isNonIntegralPointerType(ARTy) ? ARTy : Ty;
Value *StepValue = expandCodeForImpl(Step, Ty, Loc, false);
Value *NegStepValue =
expandCodeForImpl(SE.getNegativeSCEV(Step), Ty, Loc, false);
- Value *StartValue = expandCodeForImpl(
- isa<PointerType>(ARExpandTy) ? Start
- : SE.getPtrToIntExpr(Start, ARExpandTy),
- ARExpandTy, Loc, false);
+ Value *StartValue = expandCodeForImpl(Start, ARTy, Loc, false);
ConstantInt *Zero =
- ConstantInt::get(Loc->getContext(), APInt::getNullValue(DstBits));
+ ConstantInt::get(Loc->getContext(), APInt::getZero(DstBits));
Builder.SetInsertPoint(Loc);
// Compute |Step|
@@ -2544,25 +2496,33 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
// Get the backedge taken count and truncate or extended to the AR type.
Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty);
- auto *MulF = Intrinsic::getDeclaration(Loc->getModule(),
- Intrinsic::umul_with_overflow, Ty);
// Compute |Step| * Backedge
- CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul");
- Value *MulV = Builder.CreateExtractValue(Mul, 0, "mul.result");
- Value *OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow");
+ Value *MulV, *OfMul;
+ if (Step->isOne()) {
+ // Special-case a Step of one. The potentially costly `umul_with_overflow`
+ // isn't needed: the multiply can never overflow here, so to avoid
+ // artificially inflating the cost of the check, directly emit the
+ // optimized IR.
+ MulV = TruncTripCount;
+ OfMul = ConstantInt::getFalse(MulV->getContext());
+ } else {
+ auto *MulF = Intrinsic::getDeclaration(Loc->getModule(),
+ Intrinsic::umul_with_overflow, Ty);
+ CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul");
+ MulV = Builder.CreateExtractValue(Mul, 0, "mul.result");
+ OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow");
+ }
// Compute:
// Start + |Step| * Backedge < Start
// Start - |Step| * Backedge > Start
Value *Add = nullptr, *Sub = nullptr;
- if (PointerType *ARPtrTy = dyn_cast<PointerType>(ARExpandTy)) {
- const SCEV *MulS = SE.getSCEV(MulV);
- const SCEV *NegMulS = SE.getNegativeSCEV(MulS);
- Add = Builder.CreateBitCast(expandAddToGEP(MulS, ARPtrTy, Ty, StartValue),
- ARPtrTy);
- Sub = Builder.CreateBitCast(
- expandAddToGEP(NegMulS, ARPtrTy, Ty, StartValue), ARPtrTy);
+ if (PointerType *ARPtrTy = dyn_cast<PointerType>(ARTy)) {
+ StartValue = InsertNoopCastOfTo(
+ StartValue, Builder.getInt8PtrTy(ARPtrTy->getAddressSpace()));
+ Value *NegMulV = Builder.CreateNeg(MulV);
+ Add = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, MulV);
+ Sub = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, NegMulV);
} else {
Add = Builder.CreateAdd(StartValue, MulV);
Sub = Builder.CreateSub(StartValue, MulV);
@@ -2686,9 +2646,11 @@ namespace {
// perfectly reduced form, which can't be guaranteed.
struct SCEVFindUnsafe {
ScalarEvolution &SE;
+ bool CanonicalMode;
bool IsUnsafe;
- SCEVFindUnsafe(ScalarEvolution &se): SE(se), IsUnsafe(false) {}
+ SCEVFindUnsafe(ScalarEvolution &SE, bool CanonicalMode)
+ : SE(SE), CanonicalMode(CanonicalMode), IsUnsafe(false) {}
bool follow(const SCEV *S) {
if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
@@ -2704,6 +2666,14 @@ struct SCEVFindUnsafe {
IsUnsafe = true;
return false;
}
+
+ // For non-affine addrecs or in non-canonical mode we need a preheader
+ // to insert into.
+ if (!AR->getLoop()->getLoopPreheader() &&
+ (!CanonicalMode || !AR->isAffine())) {
+ IsUnsafe = true;
+ return false;
+ }
}
return true;
}
@@ -2712,8 +2682,8 @@ struct SCEVFindUnsafe {
}
namespace llvm {
-bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE) {
- SCEVFindUnsafe Search(SE);
+bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE, bool CanonicalMode) {
+ SCEVFindUnsafe Search(SE, CanonicalMode);
visitAll(S, Search);
return !Search.IsUnsafe;
}
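
The new Step == 1 fast path in generateOverflowCheck rests on a simple identity: |Step| * BackedgeTakenCount is just the backedge-taken count and the multiply cannot overflow. A sketch of that decision in plain arithmetic (hypothetical helper, 64-bit for illustration):

#include <cstdint>
#include <utility>

// Returns the (possibly wrapped) product and an overflow flag, mirroring what
// the expander feeds into the "Start + |Step| * Backedge" checks: for a unit
// step the umul.with.overflow call is skipped entirely.
std::pair<uint64_t, bool> mulStepByBackedgeCount(uint64_t AbsStep,
                                                 uint64_t BECount) {
  if (AbsStep == 1)
    return {BECount, false};            // MulV = TruncTripCount, OfMul = false
  bool Overflow = AbsStep != 0 && BECount > UINT64_MAX / AbsStep;
  return {AbsStep * BECount, Overflow}; // what umul_with_overflow computes
}
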
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 583bb379488e..f467de5f924e 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -25,6 +25,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/GuardUtils.h"
@@ -159,6 +160,13 @@ static cl::opt<unsigned>
cl::desc("Maximum cost of combining conditions when "
"folding branches"));
+static cl::opt<unsigned> BranchFoldToCommonDestVectorMultiplier(
+ "simplifycfg-branch-fold-common-dest-vector-multiplier", cl::Hidden,
+ cl::init(2),
+ cl::desc("Multiplier to apply to threshold when determining whether or not "
+ "to fold branch to common destination when vector operations are "
+ "present"));
+
STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
STATISTIC(NumLinearMaps,
"Number of switch instructions turned into linear mapping");
@@ -272,7 +280,6 @@ public:
}
bool simplifyOnce(BasicBlock *BB);
- bool simplifyOnceImpl(BasicBlock *BB);
bool run(BasicBlock *BB);
// Helper to set Resimplify and return change indication.
@@ -1094,17 +1101,24 @@ static void CloneInstructionsIntoPredecessorBlockAndUpdateSSAUses(
// Update (liveout) uses of bonus instructions,
// now that the bonus instruction has been cloned into predecessor.
- SSAUpdater SSAUpdate;
- SSAUpdate.Initialize(BonusInst.getType(),
- (NewBonusInst->getName() + ".merge").str());
- SSAUpdate.AddAvailableValue(BB, &BonusInst);
- SSAUpdate.AddAvailableValue(PredBlock, NewBonusInst);
+ // Note that we expect to be in a block-closed SSA form for this to work!
for (Use &U : make_early_inc_range(BonusInst.uses())) {
auto *UI = cast<Instruction>(U.getUser());
- if (UI->getParent() != PredBlock)
- SSAUpdate.RewriteUseAfterInsertions(U);
- else // Use is in the same block as, and comes before, NewBonusInst.
- SSAUpdate.RewriteUse(U);
+ auto *PN = dyn_cast<PHINode>(UI);
+ if (!PN) {
+ assert(UI->getParent() == BB && BonusInst.comesBefore(UI) &&
+ "If the user is not a PHI node, then it should be in the same "
+ "block as, and come after, the original bonus instruction.");
+ continue; // Keep using the original bonus instruction.
+ }
+ // Is this the block-closed SSA form PHI node?
+ if (PN->getIncomingBlock(U) == BB)
+ continue; // Great, keep using the original bonus instruction.
+ // The only other alternative is a "use" coming from
+ // the predecessor block - here we should refer to the cloned bonus instr.
+ assert(PN->getIncomingBlock(U) == PredBlock &&
+ "Not in block-closed SSA form?");
+ U.set(NewBonusInst);
}
}
}
@@ -2044,7 +2058,7 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB,
unsigned NumPHIdValues = 0;
for (auto *I : *LRI)
for (auto *V : PHIOperands[I]) {
- if (InstructionsToSink.count(V) == 0)
+ if (!InstructionsToSink.contains(V))
++NumPHIdValues;
// FIXME: this check is overly optimistic. We may end up not sinking
// said instruction, due to the very same profitability check.
@@ -2250,6 +2264,23 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
return SI->getValueOperand();
return nullptr; // Unknown store.
}
+
+ if (auto *LI = dyn_cast<LoadInst>(&CurI)) {
+ if (LI->getPointerOperand() == StorePtr && LI->getType() == StoreTy &&
+ LI->isSimple()) {
+ // Local objects (created by an `alloca` instruction) are always
+ // writable, so once we are past a read from a location it is valid to
+ // also write to that same location.
+ // If the address of the local object never escapes the function, that
+ // means it's never concurrently read or written, hence moving the store
+ // from under the condition will not introduce a data race.
+ auto *AI = dyn_cast<AllocaInst>(getUnderlyingObject(StorePtr));
+ if (AI && !PointerMayBeCaptured(AI, false, true))
+ // Found a previous load, return it.
+ return LI;
+ }
+ // The load didn't work out, but we may still find a store.
+ }
}
return nullptr;
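
The new load case in isSafeToSpeculateStore is easier to picture at the source level. A hypothetical before/after sketch (plain C++, not IR) of what the added check enables:

void use(int);

// Before: a conditional store into a local whose address never escapes, with a
// prior simple load of the same location reaching the branch.
void beforeTransform(bool Cond, int X) {
  int Local = 0;   // models the non-escaping alloca: always writable
  int V = Local;   // models the earlier load of the same address and type
  if (Cond)
    Local = X;     // the store isSafeToSpeculateStore is asked about
  use(Local);
  use(V);
}

// After: Local cannot be read or written concurrently and is known writable,
// so the store may run unconditionally, merging the two values with a select.
void afterTransform(bool Cond, int X) {
  int Local = 0;
  int V = Local;
  Local = Cond ? X : V;
  use(Local);
  use(V);
}
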
@@ -2545,17 +2576,17 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
int Size = 0;
SmallPtrSet<const Value *, 32> EphValues;
- auto IsEphemeral = [&](const Value *V) {
- if (isa<AssumeInst>(V))
+ auto IsEphemeral = [&](const Instruction *I) {
+ if (isa<AssumeInst>(I))
return true;
- return isSafeToSpeculativelyExecute(V) &&
- all_of(V->users(),
+ return !I->mayHaveSideEffects() && !I->isTerminator() &&
+ all_of(I->users(),
[&](const User *U) { return EphValues.count(U); });
};
// Walk the loop in reverse so that we can identify ephemeral values properly
// (values only feeding assumes).
- for (Instruction &I : reverse(BB->instructionsWithoutDebug())) {
+ for (Instruction &I : reverse(BB->instructionsWithoutDebug(false))) {
// Can't fold blocks that contain noduplicate or convergent calls.
if (CallInst *CI = dyn_cast<CallInst>(&I))
if (CI->cannotDuplicate() || CI->isConvergent())
@@ -2588,8 +2619,10 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
/// If we have a conditional branch on a PHI node value that is defined in the
/// same block as the branch and if any PHI entries are constants, thread edges
/// corresponding to that entry to be branches to their ultimate destination.
-static bool FoldCondBranchOnPHI(BranchInst *BI, DomTreeUpdater *DTU,
- const DataLayout &DL, AssumptionCache *AC) {
+static Optional<bool> FoldCondBranchOnPHIImpl(BranchInst *BI,
+ DomTreeUpdater *DTU,
+ const DataLayout &DL,
+ AssumptionCache *AC) {
BasicBlock *BB = BI->getParent();
PHINode *PN = dyn_cast<PHINode>(BI->getCondition());
// NOTE: we currently cannot transform this case if the PHI node is used
@@ -2703,13 +2736,25 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, DomTreeUpdater *DTU,
DTU->applyUpdates(Updates);
}
- // Recurse, simplifying any other constants.
- return FoldCondBranchOnPHI(BI, DTU, DL, AC) || true;
+ // Signal repeat, simplifying any other constants.
+ return None;
}
return false;
}
+static bool FoldCondBranchOnPHI(BranchInst *BI, DomTreeUpdater *DTU,
+ const DataLayout &DL, AssumptionCache *AC) {
+ Optional<bool> Result;
+ bool EverChanged = false;
+ do {
+ // Note that None means "we changed things, so repeat the simplification".
+ Result = FoldCondBranchOnPHIImpl(BI, DTU, DL, AC);
+ EverChanged |= Result == None || *Result;
+ } while (Result == None);
+ return EverChanged;
+}
+
/// Given a BB that starts with the specified two-entry PHI node,
/// see if we can eliminate it.
static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
@@ -2845,8 +2890,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
// instructions.
for (BasicBlock *IfBlock : IfBlocks)
for (BasicBlock::iterator I = IfBlock->begin(); !I->isTerminator(); ++I)
- if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I) &&
- !isa<PseudoProbeInst>(I)) {
+ if (!AggressiveInsts.count(&*I) && !I->isDebugOrPseudoInst()) {
// This is not an aggressive instruction that we can promote.
// Because of this, we won't be able to get rid of the control flow, so
// the xform is not worth it.
@@ -3105,6 +3149,14 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI,
return true;
}
+/// Return true if an instruction's type or any of its operands' types is a
+/// vector type.
+static bool isVectorOp(Instruction &I) {
+ return I.getType()->isVectorTy() || any_of(I.operands(), [](Use &U) {
+ return U->getType()->isVectorTy();
+ });
+}
+
/// If this basic block is simple enough, and if a predecessor branches to us
/// and one of our successors, fold the block into the predecessor and use
/// logical operations to pick the right destination.
@@ -3189,6 +3241,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
// number of the bonus instructions we'll need to create when cloning into
// each predecessor does not exceed a certain threshold.
unsigned NumBonusInsts = 0;
+ bool SawVectorOp = false;
const unsigned PredCount = Preds.size();
for (Instruction &I : *BB) {
// Don't check the branch condition comparison itself.
@@ -3200,14 +3253,35 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
// I must be safe to execute unconditionally.
if (!isSafeToSpeculativelyExecute(&I))
return false;
+ SawVectorOp |= isVectorOp(I);
// Account for the cost of duplicating this instruction into each
- // predecessor.
- NumBonusInsts += PredCount;
- // Early exits once we reach the limit.
- if (NumBonusInsts > BonusInstThreshold)
+ // predecessor. Ignore free instructions.
+ if (!TTI ||
+ TTI->getUserCost(&I, CostKind) != TargetTransformInfo::TCC_Free) {
+ NumBonusInsts += PredCount;
+
+ // Early exits once we reach the limit.
+ if (NumBonusInsts >
+ BonusInstThreshold * BranchFoldToCommonDestVectorMultiplier)
+ return false;
+ }
+
+ auto IsBCSSAUse = [BB, &I](Use &U) {
+ auto *UI = cast<Instruction>(U.getUser());
+ if (auto *PN = dyn_cast<PHINode>(UI))
+ return PN->getIncomingBlock(U) == BB;
+ return UI->getParent() == BB && I.comesBefore(UI);
+ };
+
+ // Does this instruction require rewriting of uses?
+ if (!all_of(I.uses(), IsBCSSAUse))
return false;
}
+ if (NumBonusInsts >
+ BonusInstThreshold *
+ (SawVectorOp ? BranchFoldToCommonDestVectorMultiplier : 1))
+ return false;
// Ok, we have the budget. Perform the transformation.
for (BasicBlock *PredBlock : Preds) {
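Read in isolation, the accounting above is: instructions the cost model reports as free add nothing, every other bonus instruction costs one clone per predecessor, the in-loop check exits early against the most permissive limit, and the precise limit (relaxed only if a vector operation was seen) is applied once after the loop. A hedged sketch of that final check; the parameter names are illustrative, not the LLVM options.

// Final budget test: the threshold is scaled by the vector multiplier only
// when the block actually contained a vector operation.
static bool withinBonusBudget(unsigned NumBonusInsts, bool SawVectorOp,
                              unsigned Threshold, unsigned VectorMultiplier) {
  unsigned Limit = Threshold * (SawVectorOp ? VectorMultiplier : 1u);
  return NumBonusInsts <= Limit;
}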
@@ -3340,7 +3414,7 @@ static bool mergeConditionalStoreToAddress(
InstructionCost Cost = 0;
InstructionCost Budget =
PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
- for (auto &I : BB->instructionsWithoutDebug()) {
+ for (auto &I : BB->instructionsWithoutDebug(false)) {
// Consider terminator instruction to be free.
if (I.isTerminator())
continue;
@@ -3413,10 +3487,7 @@ static bool mergeConditionalStoreToAddress(
/*BranchWeights=*/nullptr, DTU);
QB.SetInsertPoint(T);
StoreInst *SI = cast<StoreInst>(QB.CreateStore(QPHI, Address));
- AAMDNodes AAMD;
- PStore->getAAMetadata(AAMD, /*Merge=*/false);
- PStore->getAAMetadata(AAMD, /*Merge=*/true);
- SI->setAAMetadata(AAMD);
+ SI->setAAMetadata(PStore->getAAMetadata().merge(QStore->getAAMetadata()));
// Choose the minimum alignment. If we could prove both stores execute, we
// could use biggest one. In this case, though, we only know that one of the
// stores executes. And we don't know it's safe to take the alignment from a
@@ -3666,7 +3737,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
// fold the conditions into logical ops and one cond br.
// Ignore dbg intrinsics.
- if (&*BB->instructionsWithoutDebug().begin() != BI)
+ if (&*BB->instructionsWithoutDebug(false).begin() != BI)
return false;
int PBIOp, BIOp;
@@ -4711,29 +4782,6 @@ static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) {
return true;
}
-static void createUnreachableSwitchDefault(SwitchInst *Switch,
- DomTreeUpdater *DTU) {
- LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
- auto *BB = Switch->getParent();
- BasicBlock *NewDefaultBlock = SplitBlockPredecessors(
- Switch->getDefaultDest(), Switch->getParent(), "", DTU);
- auto *OrigDefaultBlock = Switch->getDefaultDest();
- Switch->setDefaultDest(&*NewDefaultBlock);
- if (DTU)
- DTU->applyUpdates({{DominatorTree::Insert, BB, &*NewDefaultBlock},
- {DominatorTree::Delete, BB, OrigDefaultBlock}});
- SplitBlock(&*NewDefaultBlock, &NewDefaultBlock->front(), DTU);
- SmallVector<DominatorTree::UpdateType, 2> Updates;
- if (DTU)
- for (auto *Successor : successors(NewDefaultBlock))
- Updates.push_back({DominatorTree::Delete, NewDefaultBlock, Successor});
- auto *NewTerminator = NewDefaultBlock->getTerminator();
- new UnreachableInst(Switch->getContext(), NewTerminator);
- EraseTerminatorAndDCECond(NewTerminator);
- if (DTU)
- DTU->applyUpdates(Updates);
-}
-
/// Turn a switch with two reachable destinations into an integer range
/// comparison and branch.
bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI,
@@ -5039,9 +5087,10 @@ static bool ValidLookupTableConstant(Constant *C, const TargetTransformInfo &TTI
return false;
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
- if (!CE->isGEPWithNoNotionalOverIndexing())
- return false;
- if (!ValidLookupTableConstant(CE->getOperand(0), TTI))
+ // Pointer casts and in-bounds GEPs will not prohibit the backend from
+ // materializing the array of constants.
+ Constant *StrippedC = cast<Constant>(CE->stripInBoundsConstantOffsets());
+ if (StrippedC == C || !ValidLookupTableConstant(StrippedC, TTI))
return false;
}
@@ -5111,7 +5160,7 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
// which we can constant-propagate the CaseVal, continue to its successor.
SmallDenseMap<Value *, Constant *> ConstantPool;
ConstantPool.insert(std::make_pair(SI->getCondition(), CaseVal));
- for (Instruction &I :CaseDest->instructionsWithoutDebug()) {
+ for (Instruction &I : CaseDest->instructionsWithoutDebug(false)) {
if (I.isTerminator()) {
// If the terminator is a simple branch, continue to the next block.
if (I.getNumSuccessors() != 1 || I.isExceptionalTerminator())
@@ -5604,8 +5653,32 @@ bool SwitchLookupTable::WouldFitInRegister(const DataLayout &DL,
return DL.fitsInLegalInteger(TableSize * IT->getBitWidth());
}
+static bool isTypeLegalForLookupTable(Type *Ty, const TargetTransformInfo &TTI,
+ const DataLayout &DL) {
+ // Allow any legal type.
+ if (TTI.isTypeLegal(Ty))
+ return true;
+
+ auto *IT = dyn_cast<IntegerType>(Ty);
+ if (!IT)
+ return false;
+
+ // Also allow power of 2 integer types that have at least 8 bits and fit in
+ // a register. These types are common in frontend languages and targets
+ // usually support loads of these types.
+ // TODO: We could relax this to any integer that fits in a register and rely
+ // on ABI alignment and padding in the table to allow the load to be widened.
+ // Or we could widen the constants and truncate the load.
+ unsigned BitWidth = IT->getBitWidth();
+ return BitWidth >= 8 && isPowerOf2_32(BitWidth) &&
+ DL.fitsInLegalInteger(IT->getBitWidth());
+}
+
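The width test added above can be stated as a standalone predicate. In this sketch LargestLegalIntBits stands in for what DataLayout::fitsInLegalInteger consults (the widest native integer the target supports); the function name is illustrative.

// A width qualifies for a lookup-table entry if it is at least one byte,
// a power of two, and no wider than the largest legal integer.
static bool isLookupFriendlyIntWidth(unsigned BitWidth,
                                     unsigned LargestLegalIntBits) {
  bool IsPowerOfTwo = BitWidth != 0 && (BitWidth & (BitWidth - 1)) == 0;
  return BitWidth >= 8 && IsPowerOfTwo && BitWidth <= LargestLegalIntBits;
}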
/// Determine whether a lookup table should be built for this switch, based on
/// the number of cases, size of the table, and the types of the results.
+// TODO: We could support larger than legal types by limiting based on the
+// number of loads required and/or table size. If the constants are small we
+// could use smaller table entries and extend after the load.
static bool
ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize,
const TargetTransformInfo &TTI, const DataLayout &DL,
@@ -5619,7 +5692,7 @@ ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize,
Type *Ty = I.second;
// Saturate this flag to true.
- HasIllegalType = HasIllegalType || !TTI.isTypeLegal(Ty);
+ HasIllegalType = HasIllegalType || !isTypeLegalForLookupTable(Ty, TTI, DL);
// Saturate this flag to false.
AllTablesFitInRegister =
@@ -6102,7 +6175,7 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
// If the block only contains the switch, see if we can fold the block
// away into any preds.
- if (SI == &*BB->instructionsWithoutDebug().begin())
+ if (SI == &*BB->instructionsWithoutDebug(false).begin())
if (FoldValueComparisonIntoPredecessors(SI, Builder))
return requestResimplify();
}
@@ -6246,12 +6319,9 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
// The debug info in OtherPred doesn't cover the merged control flow that
// used to go through BB. We need to delete it or update it.
- for (auto I = OtherPred->begin(), E = OtherPred->end(); I != E;) {
- Instruction &Inst = *I;
- I++;
+ for (Instruction &Inst : llvm::make_early_inc_range(*OtherPred))
if (isa<DbgInfoIntrinsic>(Inst))
Inst.eraseFromParent();
- }
SmallPtrSet<BasicBlock *, 16> Succs(succ_begin(BB), succ_end(BB));
for (BasicBlock *Succ : Succs) {
@@ -6338,6 +6408,11 @@ static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) {
}
bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
+ assert(
+ !isa<ConstantInt>(BI->getCondition()) &&
+ BI->getSuccessor(0) != BI->getSuccessor(1) &&
+ "Tautological conditional branch should have been eliminated already.");
+
BasicBlock *BB = BI->getParent();
if (!Options.SimplifyCondBranch)
return false;
@@ -6452,19 +6527,21 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValu
if (C->isNullValue() || isa<UndefValue>(C)) {
// Only look at the first use, avoid hurting compile time with long uselists
- User *Use = *I->user_begin();
+ auto *Use = cast<Instruction>(*I->user_begin());
+ // Bail out if Use is not in the same BB as I or Use == I or Use comes
+ // before I in the block. The latter two can be the case if Use is a PHI
+ // node.
+ if (Use->getParent() != I->getParent() || Use == I || Use->comesBefore(I))
+ return false;
// Now make sure that there are no instructions in between that can alter
// control flow (eg. calls)
- for (BasicBlock::iterator
- i = ++BasicBlock::iterator(I),
- UI = BasicBlock::iterator(dyn_cast<Instruction>(Use));
- i != UI; ++i) {
- if (i == I->getParent()->end())
- return false;
- if (!isGuaranteedToTransferExecutionToSuccessor(&*i))
- return false;
- }
+ auto InstrRange =
+ make_range(std::next(I->getIterator()), Use->getIterator());
+ if (any_of(InstrRange, [](Instruction &I) {
+ return !isGuaranteedToTransferExecutionToSuccessor(&I);
+ }))
+ return false;
// Look through GEPs. A load from a GEP derived from NULL is still undefined
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Use))
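The rewrite above is a common modernization: an explicit iterator walk with a manual end check becomes a half-open range plus any_of. The same shape over a std::list, assuming From precedes To just as the comesBefore guard above guarantees for the IR version; std::any_of stands in for llvm::any_of, and the "blocking" predicate is illustrative.

#include <algorithm>
#include <iterator>
#include <list>

// True if any element strictly between From and To is "blocking", mirroring
// "is there an instruction between I and Use that blocks the fold".
static bool anyBlockerBetween(std::list<int>::const_iterator From,
                              std::list<int>::const_iterator To) {
  return std::any_of(std::next(From), To, [](int V) { return V < 0; });
}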
@@ -6540,21 +6617,51 @@ static bool removeUndefIntroducingPredecessor(BasicBlock *BB,
// destination from conditional branches.
if (BI->isUnconditional())
Builder.CreateUnreachable();
- else
+ else {
+      // Preserve the guarding condition in an assume, because it might not be
+      // inferable from any dominating condition.
+ Value *Cond = BI->getCondition();
+ if (BI->getSuccessor(0) == BB)
+ Builder.CreateAssumption(Builder.CreateNot(Cond));
+ else
+ Builder.CreateAssumption(Cond);
Builder.CreateBr(BI->getSuccessor(0) == BB ? BI->getSuccessor(1)
: BI->getSuccessor(0));
+ }
BI->eraseFromParent();
if (DTU)
DTU->applyUpdates({{DominatorTree::Delete, Predecessor, BB}});
return true;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(T)) {
+ // Redirect all branches leading to UB into
+ // a newly created unreachable block.
+ BasicBlock *Unreachable = BasicBlock::Create(
+ Predecessor->getContext(), "unreachable", BB->getParent(), BB);
+ Builder.SetInsertPoint(Unreachable);
+ // The new block contains only one instruction: Unreachable
+ Builder.CreateUnreachable();
+ for (auto &Case : SI->cases())
+ if (Case.getCaseSuccessor() == BB) {
+ BB->removePredecessor(Predecessor);
+ Case.setSuccessor(Unreachable);
+ }
+ if (SI->getDefaultDest() == BB) {
+ BB->removePredecessor(Predecessor);
+ SI->setDefaultDest(Unreachable);
+ }
+
+ if (DTU)
+      DTU->applyUpdates({{DominatorTree::Insert, Predecessor, Unreachable},
+                         {DominatorTree::Delete, Predecessor, BB}});
+ return true;
}
- // TODO: SwitchInst.
}
return false;
}
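At source level the branch case above amounts to deleting the edge that would immediately execute undefined behaviour while keeping the fact the branch encoded. A hedged C++ analogy, using Clang's __builtin_assume as a stand-in for the llvm.assume intrinsic; keepGuard and p are illustrative names.

// Before: one successor of the branch stores through a null pointer, so the
// edge into it can be removed:
//   if (p == nullptr) { *p = 1; } else { *p = 2; }
//
// After: the surviving edge is taken unconditionally, and the guard
// (p != nullptr) is preserved so later passes can still rely on it.
void keepGuard(int *p) {
  __builtin_assume(p != nullptr); // stand-in for the generated llvm.assume
  *p = 2;
}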
-bool SimplifyCFGOpt::simplifyOnceImpl(BasicBlock *BB) {
+bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) {
bool Changed = false;
assert(BB && BB->getParent() && "Block not embedded in function!");
@@ -6578,7 +6685,8 @@ bool SimplifyCFGOpt::simplifyOnceImpl(BasicBlock *BB) {
Changed |= EliminateDuplicatePHINodes(BB);
// Check for and remove branches that will always cause undefined behavior.
- Changed |= removeUndefIntroducingPredecessor(BB, DTU);
+ if (removeUndefIntroducingPredecessor(BB, DTU))
+ return requestResimplify();
// Merge basic blocks into their predecessor if there is only one distinct
// pred, and if there is only one distinct successor of the predecessor, and
@@ -6603,7 +6711,8 @@ bool SimplifyCFGOpt::simplifyOnceImpl(BasicBlock *BB) {
// eliminate it, do so now.
if (auto *PN = dyn_cast<PHINode>(BB->begin()))
if (PN->getNumIncomingValues() == 2)
- Changed |= FoldTwoEntryPHINode(PN, TTI, DTU, DL);
+ if (FoldTwoEntryPHINode(PN, TTI, DTU, DL))
+ return true;
}
Instruction *Terminator = BB->getTerminator();
@@ -6632,12 +6741,6 @@ bool SimplifyCFGOpt::simplifyOnceImpl(BasicBlock *BB) {
return Changed;
}
-bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) {
- bool Changed = simplifyOnceImpl(BB);
-
- return Changed;
-}
-
bool SimplifyCFGOpt::run(BasicBlock *BB) {
bool Changed = false;
diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index bd30be011472..5b7fd4349c6c 100644
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -942,6 +942,7 @@ bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
} // namespace llvm
+namespace {
//===----------------------------------------------------------------------===//
// Widen Induction Variables - Extend the width of an IV to cover its
// widest uses.
@@ -1072,7 +1073,7 @@ protected:
private:
SmallVector<NarrowIVDefUse, 8> NarrowIVUsers;
};
-
+} // namespace
/// Determine the insertion point for this user. By default, insert immediately
/// before the user. SCEVExpander or LICM will hoist loop invariants out of the
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index b8e0f63c481d..e190a1294eb3 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -142,12 +142,10 @@ static void annotateDereferenceableBytes(CallInst *CI,
unsigned AS = CI->getArgOperand(ArgNo)->getType()->getPointerAddressSpace();
if (!llvm::NullPointerIsDefined(F, AS) ||
CI->paramHasAttr(ArgNo, Attribute::NonNull))
- DerefBytes = std::max(CI->getDereferenceableOrNullBytes(
- ArgNo + AttributeList::FirstArgIndex),
+ DerefBytes = std::max(CI->getParamDereferenceableOrNullBytes(ArgNo),
DereferenceableBytes);
-
- if (CI->getDereferenceableBytes(ArgNo + AttributeList::FirstArgIndex) <
- DerefBytes) {
+
+ if (CI->getParamDereferenceableBytes(ArgNo) < DerefBytes) {
CI->removeParamAttr(ArgNo, Attribute::Dereferenceable);
if (!llvm::NullPointerIsDefined(F, AS) ||
CI->paramHasAttr(ArgNo, Attribute::NonNull))
@@ -512,14 +510,18 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilderBase &B) {
B.CreateMemCpy(Dst, Align(1), Src, Align(1),
ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len));
NewCI->setAttributes(CI->getAttributes());
- NewCI->removeAttributes(AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewCI->getType()));
+ NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
return Dst;
}
Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) {
Function *Callee = CI->getCalledFunction();
Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+
+ // stpcpy(d,s) -> strcpy(d,s) if the result is not used.
+ if (CI->use_empty())
+ return emitStrCpy(Dst, Src, B, TLI);
+
if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x)
Value *StrLen = emitStrLen(Src, B, DL, TLI);
return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr;
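The new early-out relies on a property of the C library interface: stpcpy differs from strcpy only in its return value (a pointer to the written terminating NUL), so when that value is unused the calls are interchangeable. A small illustration; stpcpy is POSIX rather than ISO C++, so it appears only in a comment.

#include <cstring>

void copyGreeting(char *dst) {
  // char *end = stpcpy(dst, "hi");  // 'end' would point at the written NUL
  // With 'end' unused, the call above writes exactly the same bytes as:
  std::strcpy(dst, "hi");
}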
@@ -541,8 +543,7 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) {
// copy for us. Make a memcpy to copy the nul byte with align = 1.
CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1), LenV);
NewCI->setAttributes(CI->getAttributes());
- NewCI->removeAttributes(AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewCI->getType()));
+ NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
return DstEnd;
}
@@ -577,9 +578,9 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) {
if (SrcLen == 0) {
// strncpy(x, "", y) -> memset(x, '\0', y)
Align MemSetAlign =
- CI->getAttributes().getParamAttributes(0).getAlignment().valueOrOne();
+ CI->getAttributes().getParamAttrs(0).getAlignment().valueOrOne();
CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, MemSetAlign);
- AttrBuilder ArgAttrs(CI->getAttributes().getParamAttributes(0));
+ AttrBuilder ArgAttrs(CI->getAttributes().getParamAttrs(0));
NewCI->setAttributes(NewCI->getAttributes().addParamAttributes(
CI->getContext(), 0, ArgAttrs));
return Dst;
@@ -604,8 +605,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) {
CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1),
ConstantInt::get(DL.getIntPtrType(PT), Len));
NewCI->setAttributes(CI->getAttributes());
- NewCI->removeAttributes(AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewCI->getType()));
+ NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
return Dst;
}
@@ -1082,8 +1082,7 @@ Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilderBase &B) {
CallInst *NewCI = B.CreateMemCpy(CI->getArgOperand(0), Align(1),
CI->getArgOperand(1), Align(1), Size);
NewCI->setAttributes(CI->getAttributes());
- NewCI->removeAttributes(AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewCI->getType()));
+ NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
return CI->getArgOperand(0);
}
@@ -1136,8 +1135,7 @@ Value *LibCallSimplifier::optimizeMemPCpy(CallInst *CI, IRBuilderBase &B) {
// any return attributes are compliant.
// TODO: Attach return value attributes to the 1st operand to preserve them?
NewCI->setAttributes(CI->getAttributes());
- NewCI->removeAttributes(AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewCI->getType()));
+ NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
return B.CreateInBoundsGEP(B.getInt8Ty(), Dst, N);
}
@@ -1151,70 +1149,21 @@ Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilderBase &B) {
CallInst *NewCI = B.CreateMemMove(CI->getArgOperand(0), Align(1),
CI->getArgOperand(1), Align(1), Size);
NewCI->setAttributes(CI->getAttributes());
- NewCI->removeAttributes(AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewCI->getType()));
+ NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
return CI->getArgOperand(0);
}
-/// Fold memset[_chk](malloc(n), 0, n) --> calloc(1, n).
-Value *LibCallSimplifier::foldMallocMemset(CallInst *Memset, IRBuilderBase &B) {
- // This has to be a memset of zeros (bzero).
- auto *FillValue = dyn_cast<ConstantInt>(Memset->getArgOperand(1));
- if (!FillValue || FillValue->getZExtValue() != 0)
- return nullptr;
-
- // TODO: We should handle the case where the malloc has more than one use.
- // This is necessary to optimize common patterns such as when the result of
- // the malloc is checked against null or when a memset intrinsic is used in
- // place of a memset library call.
- auto *Malloc = dyn_cast<CallInst>(Memset->getArgOperand(0));
- if (!Malloc || !Malloc->hasOneUse())
- return nullptr;
-
- // Is the inner call really malloc()?
- Function *InnerCallee = Malloc->getCalledFunction();
- if (!InnerCallee)
- return nullptr;
-
- LibFunc Func;
- if (!TLI->getLibFunc(*InnerCallee, Func) || !TLI->has(Func) ||
- Func != LibFunc_malloc)
- return nullptr;
-
- // The memset must cover the same number of bytes that are malloc'd.
- if (Memset->getArgOperand(2) != Malloc->getArgOperand(0))
- return nullptr;
-
- // Replace the malloc with a calloc. We need the data layout to know what the
- // actual size of a 'size_t' parameter is.
- B.SetInsertPoint(Malloc->getParent(), ++Malloc->getIterator());
- const DataLayout &DL = Malloc->getModule()->getDataLayout();
- IntegerType *SizeType = DL.getIntPtrType(B.GetInsertBlock()->getContext());
- if (Value *Calloc = emitCalloc(ConstantInt::get(SizeType, 1),
- Malloc->getArgOperand(0),
- Malloc->getAttributes(), B, *TLI)) {
- substituteInParent(Malloc, Calloc);
- return Calloc;
- }
-
- return nullptr;
-}
-
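For reference, the helper removed above implemented the classic equivalence sketched below (a zeroing memset of a freshly malloc'd buffer is a calloc); the sketch only restates that equivalence and says nothing about where, if anywhere, the fold now lives.

#include <cstdlib>
#include <cstring>

void *zeroedBufferA(std::size_t n) {
  void *p = std::malloc(n);
  if (p)
    std::memset(p, 0, n); // memset(malloc(n), 0, n) ...
  return p;
}

void *zeroedBufferB(std::size_t n) {
  return std::calloc(1, n); // ... is equivalent to calloc(1, n)
}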
Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilderBase &B) {
Value *Size = CI->getArgOperand(2);
annotateNonNullAndDereferenceable(CI, 0, Size, DL);
if (isa<IntrinsicInst>(CI))
return nullptr;
- if (auto *Calloc = foldMallocMemset(CI, B))
- return Calloc;
-
// memset(p, v, n) -> llvm.memset(align 1 p, v, n)
Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val, Size, Align(1));
NewCI->setAttributes(CI->getAttributes());
- NewCI->removeAttributes(AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewCI->getType()));
+ NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
return CI->getArgOperand(0);
}
@@ -1346,13 +1295,13 @@ Value *LibCallSimplifier::optimizeCAbs(CallInst *CI, IRBuilderBase &B) {
B.setFastMathFlags(CI->getFastMathFlags());
Value *Real, *Imag;
- if (CI->getNumArgOperands() == 1) {
+ if (CI->arg_size() == 1) {
Value *Op = CI->getArgOperand(0);
assert(Op->getType()->isArrayTy() && "Unexpected signature for cabs!");
Real = B.CreateExtractValue(Op, 0, "real");
Imag = B.CreateExtractValue(Op, 1, "imag");
} else {
- assert(CI->getNumArgOperands() == 2 && "Unexpected signature for cabs!");
+ assert(CI->arg_size() == 2 && "Unexpected signature for cabs!");
Real = CI->getArgOperand(0);
Imag = CI->getArgOperand(1);
}
@@ -2333,7 +2282,7 @@ Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilderBase &B,
// Proceedings of PACT'98, Oct. 1998, IEEE
if (!CI->hasFnAttr(Attribute::Cold) &&
isReportingError(Callee, CI, StreamArg)) {
- CI->addAttribute(AttributeList::FunctionIndex, Attribute::Cold);
+ CI->addFnAttr(Attribute::Cold);
}
return nullptr;
@@ -2349,7 +2298,7 @@ static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg) {
// These functions might be considered cold, but only if their stream
// argument is stderr.
- if (StreamArg >= (int)CI->getNumArgOperands())
+ if (StreamArg >= (int)CI->arg_size())
return false;
LoadInst *LI = dyn_cast<LoadInst>(CI->getArgOperand(StreamArg));
if (!LI)
@@ -2381,7 +2330,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) {
return emitPutChar(B.getInt32(FormatStr[0]), B, TLI);
// Try to remove call or emit putchar/puts.
- if (FormatStr == "%s" && CI->getNumArgOperands() > 1) {
+ if (FormatStr == "%s" && CI->arg_size() > 1) {
StringRef OperandStr;
if (!getConstantStringInfo(CI->getOperand(1), OperandStr))
return nullptr;
@@ -2402,7 +2351,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) {
// printf("foo\n") --> puts("foo")
if (FormatStr.back() == '\n' &&
- FormatStr.find('%') == StringRef::npos) { // No format characters.
+ !FormatStr.contains('%')) { // No format characters.
// Create a string literal with no \n on it. We expect the constant merge
// pass to be run after this pass, to merge duplicate strings.
FormatStr = FormatStr.drop_back();
@@ -2412,12 +2361,12 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) {
// Optimize specific format strings.
// printf("%c", chr) --> putchar(chr)
- if (FormatStr == "%c" && CI->getNumArgOperands() > 1 &&
+ if (FormatStr == "%c" && CI->arg_size() > 1 &&
CI->getArgOperand(1)->getType()->isIntegerTy())
return emitPutChar(CI->getArgOperand(1), B, TLI);
// printf("%s\n", str) --> puts(str)
- if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 &&
+ if (FormatStr == "%s\n" && CI->arg_size() > 1 &&
CI->getArgOperand(1)->getType()->isPointerTy())
return emitPutS(CI->getArgOperand(1), B, TLI);
return nullptr;
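The folds in this function rely only on ISO C printf semantics. A compact illustration of the rewrites named in the comments, assuming the call result is unused (the return values of printf, putchar and puts differ):

#include <cstdio>

void printfFolds(int c, const char *s) {
  std::printf("%c", c);   // printf("%c", chr)   -> putchar(chr)
  std::putchar(c);

  std::printf("%s\n", s); // printf("%s\n", str) -> puts(str)
  std::puts(s);

  std::printf("foo\n");   // printf("foo\n")     -> puts("foo"); the trailing
  std::puts("foo");       // '\n' is dropped because puts appends one
}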
@@ -2469,10 +2418,10 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI,
// If we just have a format string (nothing else crazy) transform it.
Value *Dest = CI->getArgOperand(0);
- if (CI->getNumArgOperands() == 2) {
+ if (CI->arg_size() == 2) {
// Make sure there's no % in the constant array. We could try to handle
// %% -> % in the future if we cared.
- if (FormatStr.find('%') != StringRef::npos)
+ if (FormatStr.contains('%'))
return nullptr; // we found a format specifier, bail out.
// sprintf(str, fmt) -> llvm.memcpy(align 1 str, align 1 fmt, strlen(fmt)+1)
@@ -2485,8 +2434,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI,
// The remaining optimizations require the format string to be "%s" or "%c"
// and have an extra operand.
- if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
- CI->getNumArgOperands() < 3)
+ if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->arg_size() < 3)
return nullptr;
// Decode the second character of the format string.
@@ -2597,10 +2545,10 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI,
return nullptr;
// If we just have a format string (nothing else crazy) transform it.
- if (CI->getNumArgOperands() == 3) {
+ if (CI->arg_size() == 3) {
// Make sure there's no % in the constant array. We could try to handle
// %% -> % in the future if we cared.
- if (FormatStr.find('%') != StringRef::npos)
+ if (FormatStr.contains('%'))
return nullptr; // we found a format specifier, bail out.
if (N == 0)
@@ -2619,8 +2567,7 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI,
// The remaining optimizations require the format string to be "%s" or "%c"
// and have an extra operand.
- if (FormatStr.size() == 2 && FormatStr[0] == '%' &&
- CI->getNumArgOperands() == 4) {
+ if (FormatStr.size() == 2 && FormatStr[0] == '%' && CI->arg_size() == 4) {
// Decode the second character of the format string.
if (FormatStr[1] == 'c') {
@@ -2688,9 +2635,9 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI,
return nullptr;
// fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
- if (CI->getNumArgOperands() == 2) {
+ if (CI->arg_size() == 2) {
// Could handle %% -> % if we cared.
- if (FormatStr.find('%') != StringRef::npos)
+ if (FormatStr.contains('%'))
return nullptr; // We found a format specifier.
return emitFWrite(
@@ -2701,8 +2648,7 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI,
// The remaining optimizations require the format string to be "%s" or "%c"
// and have an extra operand.
- if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
- CI->getNumArgOperands() < 3)
+ if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->arg_size() < 3)
return nullptr;
// Decode the second character of the format string.
@@ -3066,7 +3012,6 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) {
return optimizeLog(CI, Builder);
case Intrinsic::sqrt:
return optimizeSqrt(CI, Builder);
- // TODO: Use foldMallocMemset() with memset intrinsic.
case Intrinsic::memset:
return optimizeMemSet(CI, Builder);
case Intrinsic::memcpy:
@@ -3266,8 +3211,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI,
B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(1),
Align(1), CI->getArgOperand(2));
NewCI->setAttributes(CI->getAttributes());
- NewCI->removeAttributes(AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewCI->getType()));
+ NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
return CI->getArgOperand(0);
}
return nullptr;
@@ -3280,8 +3224,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI,
B.CreateMemMove(CI->getArgOperand(0), Align(1), CI->getArgOperand(1),
Align(1), CI->getArgOperand(2));
NewCI->setAttributes(CI->getAttributes());
- NewCI->removeAttributes(AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewCI->getType()));
+ NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
return CI->getArgOperand(0);
}
return nullptr;
@@ -3289,15 +3232,12 @@ Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI,
Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI,
IRBuilderBase &B) {
- // TODO: Try foldMallocMemset() here.
-
if (isFortifiedCallFoldable(CI, 3, 2)) {
Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val,
CI->getArgOperand(2), Align(1));
NewCI->setAttributes(CI->getAttributes());
- NewCI->removeAttributes(AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewCI->getType()));
+ NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
return CI->getArgOperand(0);
}
return nullptr;
@@ -3311,9 +3251,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemPCpyChk(CallInst *CI,
CI->getArgOperand(2), B, DL, TLI)) {
CallInst *NewCI = cast<CallInst>(Call);
NewCI->setAttributes(CI->getAttributes());
- NewCI->removeAttributes(
- AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewCI->getType()));
+ NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType()));
return NewCI;
}
return nullptr;
@@ -3354,7 +3292,11 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
else
return nullptr;
- Type *SizeTTy = DL.getIntPtrType(CI->getContext());
+ // FIXME: There is really no guarantee that sizeof(size_t) is equal to
+ // sizeof(int*) for every target. So the assumption used here to derive the
+ // SizeTBits based on the size of an integer pointer in address space zero
+ // isn't always valid.
+ Type *SizeTTy = DL.getIntPtrType(CI->getContext(), /*AddressSpace=*/0);
Value *LenV = ConstantInt::get(SizeTTy, Len);
Value *Ret = emitMemCpyChk(Dst, Src, LenV, ObjSize, B, DL, TLI);
// If the function was an __stpcpy_chk, and we were able to fold it into
diff --git a/llvm/lib/Transforms/Utils/SplitModule.cpp b/llvm/lib/Transforms/Utils/SplitModule.cpp
index 32f2f4e233b2..7e12bbd2851c 100644
--- a/llvm/lib/Transforms/Utils/SplitModule.cpp
+++ b/llvm/lib/Transforms/Utils/SplitModule.cpp
@@ -24,7 +24,6 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalObject.h"
-#include "llvm/IR/GlobalIndirectSymbol.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instruction.h"
@@ -65,9 +64,8 @@ static void addNonConstUser(ClusterMapType &GVtoClusterMap,
if (const Instruction *I = dyn_cast<Instruction>(U)) {
const GlobalValue *F = I->getParent()->getParent();
GVtoClusterMap.unionSets(GV, F);
- } else if (isa<GlobalIndirectSymbol>(U) || isa<Function>(U) ||
- isa<GlobalVariable>(U)) {
- GVtoClusterMap.unionSets(GV, cast<GlobalValue>(U));
+ } else if (const GlobalValue *GVU = dyn_cast<GlobalValue>(U)) {
+ GVtoClusterMap.unionSets(GV, GVU);
} else {
llvm_unreachable("Underimplemented use case");
}
@@ -91,6 +89,13 @@ static void addAllGlobalValueUsers(ClusterMapType &GVtoClusterMap,
}
}
+static const GlobalObject *getGVPartitioningRoot(const GlobalValue *GV) {
+ const GlobalObject *GO = GV->getAliaseeObject();
+ if (const auto *GI = dyn_cast_or_null<GlobalIFunc>(GO))
+ GO = GI->getResolverFunction();
+ return GO;
+}
+
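A toy model of the new root lookup: follow the aliasee link if there is one, and if that lands on an ifunc continue to its resolver. ToyGlobal, Aliasee and Resolver are illustrative stand-ins for GlobalValue, getAliaseeObject() and getResolverFunction(), not the real types.

struct ToyGlobal {
  const ToyGlobal *Aliasee = nullptr;  // non-null for aliases
  const ToyGlobal *Resolver = nullptr; // non-null for ifuncs
};

static const ToyGlobal *partitioningRoot(const ToyGlobal *GV) {
  const ToyGlobal *GO = GV->Aliasee ? GV->Aliasee : GV; // alias -> aliasee
  if (GO->Resolver)                                     // ifunc -> resolver
    GO = GO->Resolver;
  return GO;
}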
// Find partitions for module in the way that no locals need to be
// globalized.
// Try to balance pack those partitions into N files since this roughly equals
@@ -123,12 +128,11 @@ static void findPartitions(Module &M, ClusterIDMapType &ClusterIDMap,
Member = &GV;
}
- // For aliases we should not separate them from their aliasees regardless
- // of linkage.
- if (auto *GIS = dyn_cast<GlobalIndirectSymbol>(&GV)) {
- if (const GlobalObject *Base = GIS->getBaseObject())
- GVtoClusterMap.unionSets(&GV, Base);
- }
+ // Aliases should not be separated from their aliasees and ifuncs should
+ // not be separated from their resolvers regardless of linkage.
+ if (const GlobalObject *Root = getGVPartitioningRoot(&GV))
+ if (&GV != Root)
+ GVtoClusterMap.unionSets(&GV, Root);
if (const Function *F = dyn_cast<Function>(&GV)) {
for (const BasicBlock &BB : *F) {
@@ -225,9 +229,8 @@ static void externalize(GlobalValue *GV) {
// Returns whether GV should be in partition (0-based) I of N.
static bool isInPartition(const GlobalValue *GV, unsigned I, unsigned N) {
- if (auto *GIS = dyn_cast<GlobalIndirectSymbol>(GV))
- if (const GlobalObject *Base = GIS->getBaseObject())
- GV = Base;
+ if (const GlobalObject *Root = getGVPartitioningRoot(GV))
+ GV = Root;
StringRef Name;
if (const Comdat *C = GV->getComdat())
diff --git a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
index ec4ea848a5d4..6a0eb34a7999 100644
--- a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
+++ b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
@@ -184,7 +184,7 @@ performOnModule(Module &M) {
std::string Name = Regex(Pattern).sub(Transform, C.getName(), &Error);
if (!Error.empty())
- report_fatal_error("unable to transforn " + C.getName() + " in " +
+      report_fatal_error(Twine("unable to transform ") + C.getName() + " in " +
M.getModuleIdentifier() + ": " + Error);
if (C.getName() == Name)
@@ -256,11 +256,11 @@ bool RewriteMapParser::parse(const std::string &MapFile,
MemoryBuffer::getFile(MapFile);
if (!Mapping)
- report_fatal_error("unable to read rewrite map '" + MapFile + "': " +
- Mapping.getError().message());
+ report_fatal_error(Twine("unable to read rewrite map '") + MapFile +
+ "': " + Mapping.getError().message());
if (!parse(*Mapping, DL))
- report_fatal_error("unable to parse rewrite map '" + MapFile + "'");
+ report_fatal_error(Twine("unable to parse rewrite map '") + MapFile + "'");
return true;
}
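The explicit Twine at the head of each chain makes every subsequent '+' produce another Twine, which report_fatal_error accepts; without a string-like object first, the chain is built through whatever operator+ the leftmost operands provide, and two plain string literals cannot be concatenated at all. A plain-C++ illustration of how the head of a '+' chain decides, with std::string standing in for llvm::Twine and the names chosen for illustration:

#include <string>

std::string buildMessage(const std::string &file, const std::string &err) {
  // return "unable to read rewrite map '" + "': ";  // ill-formed: adding two
  //                                                 // const char* pointers
  // Putting a string-like object first makes each '+' use its concatenation
  // operator instead of pointer arithmetic.
  return std::string("unable to read rewrite map '") + file + "': " + err;
}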
diff --git a/llvm/lib/Transforms/Utils/VNCoercion.cpp b/llvm/lib/Transforms/Utils/VNCoercion.cpp
index 6336af25ef98..dbe3cc93e72b 100644
--- a/llvm/lib/Transforms/Utils/VNCoercion.cpp
+++ b/llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -403,19 +403,10 @@ int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
if (Offset == -1)
return Offset;
- unsigned AS = Src->getType()->getPointerAddressSpace();
// Otherwise, see if we can constant fold a load from the constant with the
// offset applied as appropriate.
- if (Offset) {
- Src = ConstantExpr::getBitCast(Src,
- Type::getInt8PtrTy(Src->getContext(), AS));
- Constant *OffsetCst =
- ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
- Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()),
- Src, OffsetCst);
- }
- Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
- if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL))
+ unsigned IndexSize = DL.getIndexTypeSizeInBits(Src->getType());
+ if (ConstantFoldLoadFromConstPtr(Src, LoadTy, APInt(IndexSize, Offset), DL))
return Offset;
return -1;
}
@@ -584,19 +575,11 @@ T *getMemInstValueForLoadHelper(MemIntrinsic *SrcInst, unsigned Offset,
MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
Constant *Src = cast<Constant>(MTI->getSource());
- unsigned AS = Src->getType()->getPointerAddressSpace();
// Otherwise, see if we can constant fold a load from the constant with the
// offset applied as appropriate.
- if (Offset) {
- Src = ConstantExpr::getBitCast(Src,
- Type::getInt8PtrTy(Src->getContext(), AS));
- Constant *OffsetCst =
- ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
- Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()),
- Src, OffsetCst);
- }
- Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
- return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL);
+ unsigned IndexSize = DL.getIndexTypeSizeInBits(Src->getType());
+ return ConstantFoldLoadFromConstPtr(
+ Src, LoadTy, APInt(IndexSize, Offset), DL);
}
/// This function is called when we have a
diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp
index f3afd42e6163..c3eafd6b2492 100644
--- a/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -26,7 +26,8 @@
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalIndirectSymbol.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalIFunc.h"
#include "llvm/IR/GlobalObject.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InlineAsm.h"
@@ -68,7 +69,7 @@ struct WorklistEntry {
enum EntryKind {
MapGlobalInit,
MapAppendingVar,
- MapGlobalIndirectSymbol,
+ MapAliasOrIFunc,
RemapFunction
};
struct GVInitTy {
@@ -79,8 +80,8 @@ struct WorklistEntry {
GlobalVariable *GV;
Constant *InitPrefix;
};
- struct GlobalIndirectSymbolTy {
- GlobalIndirectSymbol *GIS;
+ struct AliasOrIFuncTy {
+ GlobalValue *GV;
Constant *Target;
};
@@ -91,7 +92,7 @@ struct WorklistEntry {
union {
GVInitTy GVInit;
AppendingGVTy AppendingGV;
- GlobalIndirectSymbolTy GlobalIndirectSymbol;
+ AliasOrIFuncTy AliasOrIFunc;
Function *RemapF;
} Data;
};
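The worklist entry above is a hand-rolled tagged union: an EntryKind discriminant plus a raw union whose active member is implied by the kind. For contrast, a hedged sketch of the same shape with std::variant, where the variant itself tracks the active alternative; the member types are illustrative placeholders, not the LLVM ones.

#include <variant>

struct GVInitSketch       { void *GV; void *Init; };
struct AppendingGVSketch  { void *GV; void *InitPrefix; };
struct AliasOrIFuncSketch { void *GV; void *Target; }; // aliasee or resolver
struct RemapFnSketch      { void *F; };

using WorklistEntrySketch =
    std::variant<GVInitSketch, AppendingGVSketch, AliasOrIFuncSketch,
                 RemapFnSketch>;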
@@ -163,8 +164,8 @@ public:
bool IsOldCtorDtor,
ArrayRef<Constant *> NewMembers,
unsigned MCID);
- void scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS, Constant &Target,
- unsigned MCID);
+ void scheduleMapAliasOrIFunc(GlobalValue &GV, Constant &Target,
+ unsigned MCID);
void scheduleRemapFunction(Function &F, unsigned MCID);
void flush();
@@ -873,10 +874,17 @@ void Mapper::flush() {
E.AppendingGVIsOldCtorDtor, makeArrayRef(NewInits));
break;
}
- case WorklistEntry::MapGlobalIndirectSymbol:
- E.Data.GlobalIndirectSymbol.GIS->setIndirectSymbol(
- mapConstant(E.Data.GlobalIndirectSymbol.Target));
+ case WorklistEntry::MapAliasOrIFunc: {
+ GlobalValue *GV = E.Data.AliasOrIFunc.GV;
+ Constant *Target = mapConstant(E.Data.AliasOrIFunc.Target);
+ if (auto *GA = dyn_cast<GlobalAlias>(GV))
+ GA->setAliasee(Target);
+ else if (auto *GI = dyn_cast<GlobalIFunc>(GV))
+ GI->setResolver(Target);
+ else
+ llvm_unreachable("Not alias or ifunc");
break;
+ }
case WorklistEntry::RemapFunction:
remapFunction(*E.Data.RemapF);
break;
@@ -944,12 +952,13 @@ void Mapper::remapInstruction(Instruction *I) {
LLVMContext &C = CB->getContext();
AttributeList Attrs = CB->getAttributes();
for (unsigned i = 0; i < Attrs.getNumAttrSets(); ++i) {
- for (Attribute::AttrKind TypedAttr :
- {Attribute::ByVal, Attribute::StructRet, Attribute::ByRef,
- Attribute::InAlloca}) {
- if (Type *Ty = Attrs.getAttribute(i, TypedAttr).getValueAsType()) {
- Attrs = Attrs.replaceAttributeType(C, i, TypedAttr,
- TypeMapper->remapType(Ty));
+ for (int AttrIdx = Attribute::FirstTypeAttr;
+ AttrIdx <= Attribute::LastTypeAttr; AttrIdx++) {
+ Attribute::AttrKind TypedAttr = (Attribute::AttrKind)AttrIdx;
+ if (Type *Ty =
+ Attrs.getAttributeAtIndex(i, TypedAttr).getValueAsType()) {
+ Attrs = Attrs.replaceAttributeTypeAtIndex(C, i, TypedAttr,
+ TypeMapper->remapType(Ty));
break;
}
}
@@ -1068,16 +1077,18 @@ void Mapper::scheduleMapAppendingVariable(GlobalVariable &GV,
AppendingInits.append(NewMembers.begin(), NewMembers.end());
}
-void Mapper::scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS,
- Constant &Target, unsigned MCID) {
- assert(AlreadyScheduled.insert(&GIS).second && "Should not reschedule");
+void Mapper::scheduleMapAliasOrIFunc(GlobalValue &GV, Constant &Target,
+ unsigned MCID) {
+ assert(AlreadyScheduled.insert(&GV).second && "Should not reschedule");
+ assert((isa<GlobalAlias>(GV) || isa<GlobalIFunc>(GV)) &&
+ "Should be alias or ifunc");
assert(MCID < MCs.size() && "Invalid mapping context");
WorklistEntry WE;
- WE.Kind = WorklistEntry::MapGlobalIndirectSymbol;
+ WE.Kind = WorklistEntry::MapAliasOrIFunc;
WE.MCID = MCID;
- WE.Data.GlobalIndirectSymbol.GIS = &GIS;
- WE.Data.GlobalIndirectSymbol.Target = &Target;
+ WE.Data.AliasOrIFunc.GV = &GV;
+ WE.Data.AliasOrIFunc.Target = &Target;
Worklist.push_back(WE);
}
@@ -1174,10 +1185,14 @@ void ValueMapper::scheduleMapAppendingVariable(GlobalVariable &GV,
GV, InitPrefix, IsOldCtorDtor, NewMembers, MCID);
}
-void ValueMapper::scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS,
- Constant &Target,
- unsigned MCID) {
- getAsMapper(pImpl)->scheduleMapGlobalIndirectSymbol(GIS, Target, MCID);
+void ValueMapper::scheduleMapGlobalAlias(GlobalAlias &GA, Constant &Aliasee,
+ unsigned MCID) {
+ getAsMapper(pImpl)->scheduleMapAliasOrIFunc(GA, Aliasee, MCID);
+}
+
+void ValueMapper::scheduleMapGlobalIFunc(GlobalIFunc &GI, Constant &Resolver,
+ unsigned MCID) {
+ getAsMapper(pImpl)->scheduleMapAliasOrIFunc(GI, Resolver, MCID);
}
void ValueMapper::scheduleRemapFunction(Function &F, unsigned MCID) {
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 3b90997100f1..5a4a2f0924f6 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -694,31 +694,16 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
});
for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
- if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
- if (!is_contained(Chain, &I))
- MemoryInstrs.push_back(&I);
- else
- ChainInstrs.push_back(&I);
- } else if (isa<IntrinsicInst>(&I) &&
- cast<IntrinsicInst>(&I)->getIntrinsicID() ==
- Intrinsic::sideeffect) {
- // Ignore llvm.sideeffect calls.
- } else if (isa<IntrinsicInst>(&I) &&
- cast<IntrinsicInst>(&I)->getIntrinsicID() ==
- Intrinsic::pseudoprobe) {
- // Ignore llvm.pseudoprobe calls.
- } else if (isa<IntrinsicInst>(&I) &&
- cast<IntrinsicInst>(&I)->getIntrinsicID() == Intrinsic::assume) {
- // Ignore llvm.assume calls.
- } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
- LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
- << '\n');
- break;
- } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
- LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
- << '\n');
+ if ((isa<LoadInst>(I) || isa<StoreInst>(I)) && is_contained(Chain, &I)) {
+ ChainInstrs.push_back(&I);
+ continue;
+ }
+ if (I.mayThrow()) {
+ LLVM_DEBUG(dbgs() << "LSV: Found may-throw operation: " << I << '\n');
break;
}
+ if (I.mayReadOrWriteMemory())
+ MemoryInstrs.push_back(&I);
}
// Loop until we find an instruction in ChainInstrs that we can't vectorize.
@@ -751,26 +736,28 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
return LI->hasMetadata(LLVMContext::MD_invariant_load);
};
- // We can ignore the alias as long as the load comes before the store,
- // because that means we won't be moving the load past the store to
- // vectorize it (the vectorized load is inserted at the location of the
- // first load in the chain).
- if (isa<StoreInst>(MemInstr) && ChainLoad &&
- (IsInvariantLoad(ChainLoad) || ChainLoad->comesBefore(MemInstr)))
- continue;
-
- // Same case, but in reverse.
- if (MemLoad && isa<StoreInst>(ChainInstr) &&
- (IsInvariantLoad(MemLoad) || MemLoad->comesBefore(ChainInstr)))
- continue;
+ if (IsLoadChain) {
+ // We can ignore the alias as long as the load comes before the store,
+ // because that means we won't be moving the load past the store to
+ // vectorize it (the vectorized load is inserted at the location of the
+ // first load in the chain).
+ if (ChainInstr->comesBefore(MemInstr) ||
+ (ChainLoad && IsInvariantLoad(ChainLoad)))
+ continue;
+ } else {
+ // Same case, but in reverse.
+ if (MemInstr->comesBefore(ChainInstr) ||
+ (MemLoad && IsInvariantLoad(MemLoad)))
+ continue;
+ }
- if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
- MemoryLocation::get(ChainInstr))) {
+ ModRefInfo MR =
+ AA.getModRefInfo(MemInstr, MemoryLocation::get(ChainInstr));
+ if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
LLVM_DEBUG({
dbgs() << "LSV: Found alias:\n"
- " Aliasing instruction and pointer:\n"
+ " Aliasing instruction:\n"
<< " " << *MemInstr << '\n'
- << " " << *getLoadStorePointerOperand(MemInstr) << '\n'
<< " Aliased instruction and pointer:\n"
<< " " << *ChainInstr << '\n'
<< " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
@@ -1085,9 +1072,12 @@ bool Vectorizer::vectorizeStoreChain(
if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
" Creating two separate arrays.\n");
- return vectorizeStoreChain(Chain.slice(0, TargetVF),
- InstructionsProcessed) |
- vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
+ bool Vectorized = false;
+ Vectorized |=
+ vectorizeStoreChain(Chain.slice(0, TargetVF), InstructionsProcessed);
+ Vectorized |=
+ vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
+ return Vectorized;
}
LLVM_DEBUG({
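Replacing the single '|' expression with two '|=' statements makes the evaluation order of the two recursive calls explicit. Both operands of '|' are always evaluated (it does not short-circuit like '||'), but their order is unspecified, which matters when each call updates shared state such as InstructionsProcessed. A minimal self-contained illustration; consume and Work are illustrative names.

#include <vector>

static bool consume(std::vector<int> &Work) {
  if (Work.empty())
    return false;
  Work.pop_back(); // shared state mutated by every call
  return true;
}

static bool processBoth(std::vector<int> &Work) {
  // return consume(Work) | consume(Work);  // both calls run, but in an
  //                                        // unspecified order
  bool Changed = false;
  Changed |= consume(Work); // explicit, deterministic order
  Changed |= consume(Work);
  return Changed;
}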
@@ -1104,8 +1094,10 @@ bool Vectorizer::vectorizeStoreChain(
if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
auto Chains = splitOddVectorElts(Chain, Sz);
- return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
- vectorizeStoreChain(Chains.second, InstructionsProcessed);
+ bool Vectorized = false;
+ Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
+ Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
+ return Vectorized;
}
Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
@@ -1119,15 +1111,17 @@ bool Vectorizer::vectorizeStoreChain(
if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
auto Chains = splitOddVectorElts(Chain, Sz);
- return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
- vectorizeStoreChain(Chains.second, InstructionsProcessed);
+ bool Vectorized = false;
+ Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
+ Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
+ return Vectorized;
}
BasicBlock::iterator First, Last;
std::tie(First, Last) = getBoundaryInstrs(Chain);
Builder.SetInsertPoint(&*Last);
- Value *Vec = UndefValue::get(VecTy);
+ Value *Vec = PoisonValue::get(VecTy);
if (VecStoreTy) {
unsigned VecWidth = VecStoreTy->getNumElements();
@@ -1237,8 +1231,12 @@ bool Vectorizer::vectorizeLoadChain(
if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
" Creating two separate arrays.\n");
- return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
- vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
+ bool Vectorized = false;
+ Vectorized |=
+ vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed);
+ Vectorized |=
+ vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
+ return Vectorized;
}
// We won't try again to vectorize the elements of the chain, regardless of
@@ -1249,8 +1247,10 @@ bool Vectorizer::vectorizeLoadChain(
if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
auto Chains = splitOddVectorElts(Chain, Sz);
- return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
- vectorizeLoadChain(Chains.second, InstructionsProcessed);
+ bool Vectorized = false;
+ Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
+ Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
+ return Vectorized;
}
Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
@@ -1264,8 +1264,10 @@ bool Vectorizer::vectorizeLoadChain(
if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
auto Chains = splitOddVectorElts(Chain, Sz);
- return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
- vectorizeLoadChain(Chains.second, InstructionsProcessed);
+ bool Vectorized = false;
+ Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
+ Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
+ return Vectorized;
}
LLVM_DEBUG({
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 3c484fb0d28a..805011191da0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -419,7 +419,8 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
return false;
}
-int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) const {
+int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
+ Value *Ptr) const {
const ValueToValueMap &Strides =
getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
@@ -428,7 +429,8 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) const {
llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
PGSOQueryType::IRPass);
bool CanAddPredicate = !OptForSize;
- int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false);
+ int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides,
+ CanAddPredicate, false);
if (Stride == 1 || Stride == -1)
return Stride;
return 0;
@@ -747,7 +749,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (CI) {
auto *SE = PSE.getSE();
Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
- for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
+ for (unsigned i = 0, e = CI->arg_size(); i != e; ++i)
if (hasVectorInstrinsicScalarOpd(IntrinID, i)) {
if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) {
reportVectorizationFailure("Found unvectorizable intrinsic",
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 5c4c4fdfa3f7..a7d6609f8c56 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -268,12 +268,6 @@ class LoopVectorizationPlanner {
/// A builder used to construct the current plan.
VPBuilder Builder;
- /// The best number of elements of the vector types used in the
- /// transformed loop. BestVF = None means that vectorization is
- /// disabled.
- Optional<ElementCount> BestVF = None;
- unsigned BestUF = 0;
-
public:
LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI,
@@ -295,12 +289,13 @@ public:
/// VF and its cost.
VectorizationFactor planInVPlanNativePath(ElementCount UserVF);
- /// Finalize the best decision and dispose of all other VPlans.
- void setBestPlan(ElementCount VF, unsigned UF);
+ /// Return the best VPlan for \p VF.
+ VPlan &getBestPlanFor(ElementCount VF) const;
/// Generate the IR code for the body of the vectorized loop according to the
- /// best selected VPlan.
- void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
+ /// best selected \p VF, \p UF and VPlan \p BestPlan.
+ void executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
+ InnerLoopVectorizer &LB, DominatorTree *DT);
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void printPlans(raw_ostream &O);
@@ -308,12 +303,9 @@ public:
/// Look through the existing plans and return true if we have one with all
/// the vectorization factors in question.
- bool hasPlanWithVFs(const ArrayRef<ElementCount> VFs) const {
- return any_of(VPlans, [&](const VPlanPtr &Plan) {
- return all_of(VFs, [&](const ElementCount &VF) {
- return Plan->hasVF(VF);
- });
- });
+ bool hasPlanWithVF(ElementCount VF) const {
+ return any_of(VPlans,
+ [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); });
}
/// Test a \p Predicate on a \p Range of VF's. Return the value of applying
@@ -351,13 +343,14 @@ private:
/// legal to vectorize the loop. This method creates VPlans using VPRecipes.
void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF);
- /// Adjust the recipes for any inloop reductions. The chain of instructions
- /// leading from the loop exit instr to the phi need to be converted to
- /// reductions, with one operand being vector and the other being the scalar
- /// reduction chain.
- void adjustRecipesForInLoopReductions(VPlanPtr &Plan,
- VPRecipeBuilder &RecipeBuilder,
- ElementCount MinVF);
+ // Adjust the recipes for reductions. For in-loop reductions the chain of
+ // instructions leading from the loop exit instr to the phi need to be
+ // converted to reductions, with one operand being vector and the other being
+ // the scalar reduction chain. For other reductions, a select is introduced
+ // between the phi and live-out recipes when folding the tail.
+ void adjustRecipesForReductions(VPBasicBlock *LatchVPBB, VPlanPtr &Plan,
+ VPRecipeBuilder &RecipeBuilder,
+ ElementCount MinVF);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f24ae6b100d5..23bb6f0860c9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -87,7 +87,6 @@
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -332,8 +331,8 @@ static cl::opt<bool>
cl::desc("Prefer in-loop vector reductions, "
"overriding the targets preference."));
-cl::opt<bool> EnableStrictReductions(
- "enable-strict-reductions", cl::init(false), cl::Hidden,
+static cl::opt<bool> ForceOrderedReductions(
+ "force-ordered-reductions", cl::init(false), cl::Hidden,
cl::desc("Enable the vectorisation of loops with in-order (strict) "
"FP reductions"));
@@ -545,7 +544,8 @@ public:
/// vectorized loop.
void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
VPValue *Def, VPValue *Addr,
- VPValue *StoredValue, VPValue *BlockInMask);
+ VPValue *StoredValue, VPValue *BlockInMask,
+ bool ConsecutiveStride, bool Reverse);
/// Set the debug location in the builder \p Ptr using the debug location in
/// \p V. If \p Ptr is None then it uses the class member's Builder.
@@ -590,12 +590,11 @@ protected:
/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs(VPTransformState &State);
- /// Fix a first-order recurrence. This is the second phase of vectorizing
- /// this phi node.
+ /// Create the exit value of first order recurrences in the middle block and
+ /// update their users.
void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);
- /// Fix a reduction cross-iteration phi. This is the second phase of
- /// vectorizing this phi node.
+ /// Create code for the loop exit value of the reduction.
void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
/// Clear NSW/NUW flags from reduction instructions if necessary.
@@ -621,9 +620,9 @@ protected:
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIndex.
/// \p Opcode is relevant for FP induction variable.
- virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
- Instruction::BinaryOps Opcode =
- Instruction::BinaryOpsEnd);
+ virtual Value *
+ getStepVector(Value *Val, Value *StartIdx, Value *Step,
+ Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);
/// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps, \p Step is the size of the step, and
@@ -890,9 +889,9 @@ public:
private:
Value *getBroadcastInstrs(Value *V) override;
- Value *getStepVector(Value *Val, int StartIdx, Value *Step,
- Instruction::BinaryOps Opcode =
- Instruction::BinaryOpsEnd) override;
+ Value *getStepVector(
+ Value *Val, Value *StartIdx, Value *Step,
+ Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
Value *reverseVector(Value *Vec) override;
};
@@ -911,10 +910,9 @@ struct EpilogueLoopVectorizationInfo {
Value *TripCount = nullptr;
Value *VectorTripCount = nullptr;
- EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
- unsigned EUF)
- : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
- EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
+ EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
+ ElementCount EVF, unsigned EUF)
+ : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
assert(EUF == 1 &&
"A high UF for the epilogue loop is likely not beneficial.");
}
@@ -1105,11 +1103,10 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
}
/// Return a value for Step multiplied by VF.
-static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
- assert(isa<ConstantInt>(Step) && "Expected an integer step");
- Constant *StepVal = ConstantInt::get(
- Step->getType(),
- cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
+static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
+ int64_t Step) {
+ assert(Ty->isIntegerTy() && "Expected an integer step");
+ Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
@@ -1121,6 +1118,13 @@ Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
return VF.isScalable() ? B.CreateVScale(EC) : EC;
}
+static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
+ assert(FTy->isFloatingPointTy() && "Expected floating point type!");
+ Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
+ Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
+ return B.CreateUIToFP(RuntimeVF, FTy);
+}
+
void reportVectorizationFailure(const StringRef DebugMsg,
const StringRef OREMsg, const StringRef ORETag,
OptimizationRemarkEmitter *ORE, Loop *TheLoop,
@@ -1319,8 +1323,7 @@ public:
/// the IsOrdered flag of RdxDesc is set and we do not allow reordering
/// of FP operations.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
- return EnableStrictReductions && !Hints->allowReordering() &&
- RdxDesc.isOrdered();
+ return !Hints->allowReordering() && RdxDesc.isOrdered();
}
/// \returns The smallest bitwidth each instruction can be represented with.
@@ -1495,14 +1498,14 @@ public:
/// Returns true if the target machine supports masked store operation
/// for the given \p DataType and kind of access to \p Ptr.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
- return Legal->isConsecutivePtr(Ptr) &&
+ return Legal->isConsecutivePtr(DataType, Ptr) &&
TTI.isLegalMaskedStore(DataType, Alignment);
}
/// Returns true if the target machine supports masked load operation
/// for the given \p DataType and kind of access to \p Ptr.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
- return Legal->isConsecutivePtr(Ptr) &&
+ return Legal->isConsecutivePtr(DataType, Ptr) &&
TTI.isLegalMaskedLoad(DataType, Alignment);
}
@@ -1539,7 +1542,7 @@ public:
// through scalar predication or masked load/store or masked gather/scatter.
// Superset of instructions that return true for isScalarWithPredication.
bool isPredicatedInst(Instruction *I) {
- if (!blockNeedsPredication(I->getParent()))
+ if (!blockNeedsPredicationForAnyReason(I->getParent()))
return false;
// Loads and stores that need some form of masked operation are predicated
// instructions.
@@ -1593,7 +1596,10 @@ public:
/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const { return FoldTailByMasking; }
- bool blockNeedsPredication(BasicBlock *BB) const {
+  /// Returns true if the instructions in this block require predication
+ /// for any reason, e.g. because tail folding now requires a predicate
+ /// or because the block in the original loop was predicated.
+ bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
return foldTailByMasking() || Legal->blockNeedsPredication(BB);
}
@@ -1928,7 +1934,7 @@ class GeneratedRTChecks {
/// The value representing the result of the generated memory runtime checks.
/// If it is nullptr, either no memory runtime checks have been generated or
/// they have been used.
- Instruction *MemRuntimeCheckCond = nullptr;
+ Value *MemRuntimeCheckCond = nullptr;
DominatorTree *DT;
LoopInfo *LI;
@@ -1971,7 +1977,7 @@ public:
MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
"vector.memcheck");
- std::tie(std::ignore, MemRuntimeCheckCond) =
+ MemRuntimeCheckCond =
addRuntimeChecks(MemCheckBlock->getTerminator(), L,
RtPtrChecking.getChecks(), MemCheckExp);
assert(MemRuntimeCheckCond &&
@@ -2030,7 +2036,6 @@ public:
if (MemCheckExp.isInsertedInstruction(&I))
continue;
SE.forgetValue(&I);
- SE.eraseValueFromMap(&I);
I.eraseFromParent();
}
}
@@ -2289,9 +2294,11 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
Step = Builder.CreateTrunc(Step, TruncType);
Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
}
+
+ Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
Value *SteppedStart =
- getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
+ getStepVector(SplatStart, Zero, Step, II.getInductionOpcode());
// We create vector phi nodes for both integer and floating-point induction
// variables. Here, we determine the kind of arithmetic we will perform.
@@ -2308,12 +2315,11 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
// Multiply the vectorization factor by the step using integer or
// floating-point arithmetic as appropriate.
Type *StepType = Step->getType();
+ Value *RuntimeVF;
if (Step->getType()->isFloatingPointTy())
- StepType = IntegerType::get(StepType->getContext(),
- StepType->getScalarSizeInBits());
- Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
- if (Step->getType()->isFloatingPointTy())
- RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
+ RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF);
+ else
+ RuntimeVF = getRuntimeVF(Builder, StepType, VF);
Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
// Create a vector splat to use in the induction update.
@@ -2388,9 +2394,13 @@ void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
if (isa<TruncInst>(EntryVal))
return;
- const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
- if (Casts.empty())
+ if (!CastDef) {
+ assert(ID.getCastInsts().empty() &&
+ "there are casts for ID, but no CastDef");
return;
+ }
+ assert(!ID.getCastInsts().empty() &&
+ "there is a CastDef, but no casts for ID");
// Only the first Cast instruction in the Casts vector is of interest.
// The rest of the Casts (if exist) have no uses outside the
// induction update chain itself.
@@ -2462,9 +2472,14 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
for (unsigned Part = 0; Part < UF; ++Part) {
assert(!VF.isScalable() && "scalable vectors not yet supported.");
+ Value *StartIdx;
+ if (Step->getType()->isFloatingPointTy())
+ StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part);
+ else
+ StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part);
+
Value *EntryPart =
- getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
- ID.getInductionOpcode());
+ getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode());
State.set(Def, EntryPart, Part);
if (Trunc)
addMetadata(EntryPart, Trunc);
@@ -2520,7 +2535,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
}
-Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
+Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
+ Value *Step,
Instruction::BinaryOps BinOp) {
// Create and check the types.
auto *ValVTy = cast<VectorType>(Val->getType());
@@ -2543,12 +2559,11 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
}
Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
- // Add on StartIdx
- Value *StartIdxSplat = Builder.CreateVectorSplat(
- VLen, ConstantInt::get(InitVecValSTy, StartIdx));
- InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
+ // Splat the StartIdx
+ Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
if (STy->isIntegerTy()) {
+ InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
Step = Builder.CreateVectorSplat(VLen, Step);
assert(Step->getType() == Val->getType() && "Invalid step vec");
// FIXME: The newly created binary instructions should contain nsw/nuw flags,
@@ -2561,6 +2576,8 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
"Binary Opcode should be specified for FP induction");
InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
+ InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
+
Step = Builder.CreateVectorSplat(VLen, Step);
Value *MulOp = Builder.CreateFMul(InitVec, Step);
return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
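Lane by lane, the vector built here can be modelled as follows (a standalone sketch under the assumption that plain scalars stand in for vector lanes): lane i of the result is Val[i] combined with (StartIdx + i) * Step, which is why the change only has to splat the now-runtime StartIdx instead of embedding a compile-time constant.

#include <cstddef>
#include <cstdint>
#include <vector>

// Lane-wise model of getStepVector for an integer induction: the step vector
// <0,1,2,...> is offset by the splatted StartIdx, scaled by Step and added to
// the splatted start value, i.e. Out[i] = Val[i] + (StartIdx + i) * Step.
std::vector<int64_t> stepVectorModel(const std::vector<int64_t> &Val,
                                     int64_t StartIdx, int64_t Step) {
  std::vector<int64_t> Out(Val.size());
  for (size_t I = 0; I < Val.size(); ++I)
    Out[I] = Val[I] + (StartIdx + static_cast<int64_t>(I)) * Step;
  return Out;
}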
@@ -2609,8 +2626,7 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
}
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *StartIdx0 =
- createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
+ Value *StartIdx0 = createStepForVF(Builder, IntStepTy, VF, Part);
if (!IsUniform && VF.isScalable()) {
auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
@@ -2838,12 +2854,25 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
auto *SubVT = VectorType::get(ScalarTy, VF);
// Vectorize the interleaved store group.
+ MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
+ assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
+ "masked interleaved groups are not allowed.");
+ assert((!MaskForGaps || !VF.isScalable()) &&
+ "masking gaps for scalable vectors is not yet supported.");
for (unsigned Part = 0; Part < UF; Part++) {
// Collect the stored vector from each member.
SmallVector<Value *, 4> StoredVecs;
for (unsigned i = 0; i < InterleaveFactor; i++) {
- // Interleaved store group doesn't allow a gap, so each index has a member
- assert(Group->getMember(i) && "Fail to get a member from an interleaved store group");
+      assert((Group->getMember(i) || MaskForGaps) &&
+             "Failed to get a member from an interleaved store group");
+ Instruction *Member = Group->getMember(i);
+
+ // Skip the gaps in the group.
+ if (!Member) {
+ Value *Undef = PoisonValue::get(SubVT);
+ StoredVecs.push_back(Undef);
+ continue;
+ }
Value *StoredVec = State.get(StoredValues[i], Part);
@@ -2867,16 +2896,21 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
"interleaved.vec");
Instruction *NewStoreInstr;
- if (BlockInMask) {
- Value *BlockInMaskPart = State.get(BlockInMask, Part);
- Value *ShuffledMask = Builder.CreateShuffleVector(
- BlockInMaskPart,
- createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
- "interleaved.mask");
- NewStoreInstr = Builder.CreateMaskedStore(
- IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
- }
- else
+ if (BlockInMask || MaskForGaps) {
+ Value *GroupMask = MaskForGaps;
+ if (BlockInMask) {
+ Value *BlockInMaskPart = State.get(BlockInMask, Part);
+ Value *ShuffledMask = Builder.CreateShuffleVector(
+ BlockInMaskPart,
+ createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
+ "interleaved.mask");
+ GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
+ ShuffledMask, MaskForGaps)
+ : ShuffledMask;
+ }
+ NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
+ Group->getAlign(), GroupMask);
+ } else
NewStoreInstr =
Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
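For illustration, the gap mask combined with the block mask above has a simple repeating shape (a standalone sketch, with bool standing in for the i1 mask lanes that createBitMaskForGaps produces): one lane per interleave slot for each of the VF scalar iterations, cleared wherever the group has no member, so the masked wide store skips exactly the lanes filled with poison for the gaps.

#include <vector>

// Present[J] is true iff slot J of the interleave group has a member.
// For factor 3 with slot 1 missing and VF 4 this yields
// <1,0,1, 1,0,1, 1,0,1, 1,0,1>.
std::vector<bool> gapMaskModel(unsigned VF, const std::vector<bool> &Present) {
  std::vector<bool> Mask;
  for (unsigned I = 0; I < VF; ++I)
    Mask.insert(Mask.end(), Present.begin(), Present.end());
  return Mask;
}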
@@ -2886,7 +2920,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
void InnerLoopVectorizer::vectorizeMemoryInstruction(
Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
- VPValue *StoredValue, VPValue *BlockInMask) {
+ VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride,
+ bool Reverse) {
// Attempt to issue a wide load.
LoadInst *LI = dyn_cast<LoadInst>(Instr);
StoreInst *SI = dyn_cast<StoreInst>(Instr);
@@ -2895,31 +2930,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(
assert((!SI || StoredValue) && "No stored value provided for widened store");
assert((!LI || !StoredValue) && "Stored value provided for widened load");
- LoopVectorizationCostModel::InstWidening Decision =
- Cost->getWideningDecision(Instr, VF);
- assert((Decision == LoopVectorizationCostModel::CM_Widen ||
- Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
- Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
- "CM decision is not to widen the memory instruction");
-
Type *ScalarDataTy = getLoadStoreType(Instr);
auto *DataTy = VectorType::get(ScalarDataTy, VF);
const Align Alignment = getLoadStoreAlignment(Instr);
-
- // Determine if the pointer operand of the access is either consecutive or
- // reverse consecutive.
- bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
- bool ConsecutiveStride =
- Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
- bool CreateGatherScatter =
- (Decision == LoopVectorizationCostModel::CM_GatherScatter);
-
- // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
- // gather/scatter. Otherwise Decision should have been to Scalarize.
- assert((ConsecutiveStride || CreateGatherScatter) &&
- "The instruction should be scalarized");
- (void)ConsecutiveStride;
+ bool CreateGatherScatter = !ConsecutiveStride;
VectorParts BlockInMaskParts(UF);
bool isMaskRequired = BlockInMask;
@@ -2953,7 +2968,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(
if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
} else {
- Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
+ Value *Increment =
+ createStepForVF(Builder, Builder.getInt32Ty(), VF, Part);
PartPtr = cast<GetElementPtrInst>(
Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
PartPtr->setIsInBounds(InBounds);
@@ -3172,7 +3188,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
Type *Ty = TC->getType();
// This is where we can make the step a runtime constant.
- Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
+ Value *Step = createStepForVF(Builder, Ty, VF, UF);
// If the tail is to be folded by masking, round the number of iterations N
// up to a multiple of Step instead of rounding down. This is done by first
@@ -3262,8 +3278,7 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
// If tail is to be folded, vector loop takes care of all iterations.
Value *CheckMinIters = Builder.getFalse();
if (!Cost->foldTailByMasking()) {
- Value *Step =
- createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
+ Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
}
// Create new preheader for vector loop.
@@ -3433,7 +3448,7 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
assert(isa<SCEVConstant>(Step) &&
"Expected constant step for pointer induction");
return B.CreateGEP(
- StartValue->getType()->getPointerElementType(), StartValue,
+ ID.getElementType(), StartValue,
CreateMul(Index,
Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
GetInsertPoint())));
@@ -3739,7 +3754,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// The loop step is equal to the vectorization factor (num of SIMD elements)
// times the unroll factor (num of SIMD instructions).
Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
- Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
+ Value *Step = createStepForVF(Builder, IdxTy, VF, UF);
Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
Induction =
createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
@@ -3857,21 +3872,19 @@ struct CSEDenseMapInfo {
static void cse(BasicBlock *BB) {
// Perform simple cse.
SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
- Instruction *In = &*I++;
-
- if (!CSEDenseMapInfo::canHandle(In))
+ for (Instruction &In : llvm::make_early_inc_range(*BB)) {
+ if (!CSEDenseMapInfo::canHandle(&In))
continue;
// Check if we can replace this instruction with any of the
// visited instructions.
- if (Instruction *V = CSEMap.lookup(In)) {
- In->replaceAllUsesWith(V);
- In->eraseFromParent();
+ if (Instruction *V = CSEMap.lookup(&In)) {
+ In.replaceAllUsesWith(V);
+ In.eraseFromParent();
continue;
}
- CSEMap[In] = In;
+ CSEMap[&In] = &In;
}
}
@@ -3881,7 +3894,7 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
Function *F = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
- for (auto &ArgOp : CI->arg_operands())
+ for (auto &ArgOp : CI->args())
ScalarTys.push_back(ArgOp->getType());
// Estimate cost of scalarized vector call. The source operands are assumed
@@ -3940,7 +3953,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
FMF = FPMO->getFastMathFlags();
- SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
+ SmallVector<const Value *> Arguments(CI->args());
FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
SmallVector<Type *> ParamTys;
std::transform(FTy->param_begin(), FTy->param_end(),
@@ -3974,7 +3987,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
// If the value wasn't vectorized, we must maintain the original scalar
// type. The absence of the value from State indicates that it
// wasn't vectorized.
- VPValue *Def = State.Plan->getVPValue(KV.first);
+ // FIXME: Should not rely on getVPValue at this point.
+ VPValue *Def = State.Plan->getVPValue(KV.first, true);
if (!State.hasAnyVectorValue(Def))
continue;
for (unsigned Part = 0; Part < UF; ++Part) {
@@ -4081,7 +4095,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
// If the value wasn't vectorized, we must maintain the original scalar
// type. The absence of the value from State indicates that it
// wasn't vectorized.
- VPValue *Def = State.Plan->getVPValue(KV.first);
+ // FIXME: Should not rely on getVPValue at this point.
+ VPValue *Def = State.Plan->getVPValue(KV.first, true);
if (!State.hasAnyVectorValue(Def))
continue;
for (unsigned Part = 0; Part < UF; ++Part) {
@@ -4222,17 +4237,12 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
// After execution completes the vector loop, we extract the next value of
// the recurrence (x) to use as the initial value in the scalar loop.
- auto *IdxTy = Builder.getInt32Ty();
- auto *VecPhi = cast<PHINode>(State.get(PhiR, 0));
-
- // Fix the latch value of the new recurrence in the vector loop.
- VPValue *PreviousDef = PhiR->getBackedgeValue();
- Value *Incoming = State.get(PreviousDef, UF - 1);
- VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
-
// Extract the last vector element in the middle block. This will be the
// initial value for the recurrence when jumping to the scalar loop.
+ VPValue *PreviousDef = PhiR->getBackedgeValue();
+ Value *Incoming = State.get(PreviousDef, UF - 1);
auto *ExtractForScalar = Incoming;
+ auto *IdxTy = Builder.getInt32Ty();
if (VF.isVector()) {
auto *One = ConstantInt::get(IdxTy, 1);
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
@@ -4283,8 +4293,7 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
  // and thus no phis which needed updating.
if (!Cost->requiresScalarEpilogue(VF))
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
- if (any_of(LCSSAPhi.incoming_values(),
- [Phi](Value *V) { return V == Phi; }))
+ if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
}
@@ -4301,29 +4310,13 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
setDebugLocFromInst(ReductionStartValue);
- VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
+ VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
// This is the vector-clone of the value that leaves the loop.
Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
// Wrap flags are in general invalid after vectorization, clear them.
clearReductionWrapFlags(RdxDesc, State);
- // Fix the vector-loop phi.
-
- // Reductions do not have to start at zero. They can start with
- // any loop invariant values.
- BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
-
- unsigned LastPartForNewPhi = PhiR->isOrdered() ? 1 : UF;
- for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
- Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part);
- Value *Val = State.get(PhiR->getBackedgeValue(), Part);
- if (PhiR->isOrdered())
- Val = State.get(PhiR->getBackedgeValue(), UF - 1);
-
- cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch);
- }
-
// Before each round, move the insertion point right between
// the PHIs and the values we are going to write.
// This allows us to write both PHINodes and the extractelement
@@ -4361,7 +4354,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
RdxDesc.getOpcode(), PhiTy,
TargetTransformInfo::ReductionFlags())) {
auto *VecRdxPhi =
- cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part));
+ cast<PHINode>(State.get(PhiR, Part));
VecRdxPhi->setIncomingValueForBlock(
LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
}
@@ -4382,13 +4375,10 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
: Builder.CreateZExt(Trunc, VecTy);
- for (Value::user_iterator UI = RdxParts[Part]->user_begin();
- UI != RdxParts[Part]->user_end();)
- if (*UI != Trunc) {
- (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
+ for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
+ if (U != Trunc) {
+ U->replaceUsesOfWith(RdxParts[Part], Extnd);
RdxParts[Part] = Extnd;
- } else {
- ++UI;
}
}
Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
@@ -4421,9 +4411,11 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
ReducedPartRdx = Builder.CreateBinOp(
(Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
- } else {
+ } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
+ ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
+ ReducedPartRdx, RdxPart);
+ else
ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
- }
}
}
@@ -4431,7 +4423,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// target reduction in the loop using a Reduction recipe.
if (VF.isVector() && !PhiR->isInLoop()) {
ReducedPartRdx =
- createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
+ createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
// If the reduction can be performed in a smaller type, we need to extend
// the reduction to the wider type before we branch to the original loop.
if (PhiTy != RdxDesc.getRecurrenceType())
@@ -4456,8 +4448,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
if (!Cost->requiresScalarEpilogue(VF))
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
- if (any_of(LCSSAPhi.incoming_values(),
- [LoopExitInst](Value *V) { return V == LoopExitInst; }))
+ if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
// Fix the scalar loop reduction variable with the incoming reduction sum
@@ -4488,7 +4479,8 @@ void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &Rd
Instruction *Cur = Worklist.pop_back_val();
if (isa<OverflowingBinaryOperator>(Cur))
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *V = State.get(State.Plan->getVPValue(Cur), Part);
+ // FIXME: Should not rely on getVPValue at this point.
+ Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
cast<Instruction>(V)->dropPoisonGeneratingFlags();
}
@@ -4519,11 +4511,12 @@ void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
// Can be a loop invariant incoming value or the last scalar value to be
// extracted from the vectorized loop.
+ // FIXME: Should not rely on getVPValue at this point.
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
Value *lastIncomingValue =
OrigLoop->isLoopInvariant(IncomingValue)
? IncomingValue
- : State.get(State.Plan->getVPValue(IncomingValue),
+ : State.get(State.Plan->getVPValue(IncomingValue, true),
VPIteration(UF - 1, Lane));
LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
}
@@ -4763,10 +4756,18 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
}
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *PartStart = createStepForVF(
- Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
+ Value *PartStart =
+ createStepForVF(Builder, PtrInd->getType(), VF, Part);
if (NeedsVectorIndex) {
+ // Here we cache the whole vector, which means we can support the
+ // extraction of any lane. However, in some cases the extractelement
+ // instruction that is generated for scalar uses of this vector (e.g.
+ // a load instruction) is not folded away. Therefore we still
+ // calculate values for the first n lanes to avoid redundant moves
+ // (when extracting the 0th element) and to produce scalar code (i.e.
+ // additional add/gep instructions instead of expensive extractelement
+ // instructions) when extracting higher-order elements.
Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
@@ -4774,9 +4775,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
SclrGep->setName("next.gep");
State.set(PhiR, SclrGep, Part);
- // We've cached the whole vector, which means we can support the
- // extraction of any lane.
- continue;
}
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
@@ -4813,7 +4811,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
Value *NumUnrolledElems =
Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
Value *InductionGEP = GetElementPtrInst::Create(
- ScStValueType->getPointerElementType(), NewPointerPhi,
+ II.getElementType(), NewPointerPhi,
Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
InductionLoc);
NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
@@ -4832,7 +4830,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
Value *GEP = Builder.CreateGEP(
- ScStValueType->getPointerElementType(), NewPointerPhi,
+ II.getElementType(), NewPointerPhi,
Builder.CreateMul(
StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
"vector.gep"));
@@ -4979,7 +4977,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
auto *CI = cast<CallInst>(&I);
SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI->arg_operands())
+ for (Value *ArgOperand : CI->args())
Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
@@ -5128,8 +5126,14 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
Instruction *Update = cast<Instruction>(
cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
- ScalarPtrs.insert(Update);
- return;
+
+ // If there is more than one user of Update (Ptr), we shouldn't assume it
+    // will be scalar after vectorization as other users of the instruction
+ // may require widening. Otherwise, add it to ScalarPtrs.
+ if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) {
+ ScalarPtrs.insert(Update);
+ return;
+ }
}
// We only care about bitcast and getelementptr instructions contained in
// the loop.
@@ -5142,12 +5146,11 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
if (Worklist.count(I))
return;
- // If all users of the pointer will be memory accesses and scalar, place the
- // pointer in ScalarPtrs. Otherwise, place the pointer in
- // PossibleNonScalarPtrs.
- if (llvm::all_of(I->users(), [&](User *U) {
- return (isa<LoadInst>(U) || isa<StoreInst>(U)) &&
- isScalarUse(cast<Instruction>(U), Ptr);
+ // If the use of the pointer will be a scalar use, and all users of the
+ // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
+ // place the pointer in PossibleNonScalarPtrs.
+ if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
+ return isa<LoadInst>(U) || isa<StoreInst>(U);
}))
ScalarPtrs.insert(I);
else
@@ -5254,7 +5257,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
}
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
- if (!blockNeedsPredication(I->getParent()))
+ if (!blockNeedsPredicationForAnyReason(I->getParent()))
return false;
switch(I->getOpcode()) {
default:
@@ -5297,12 +5300,20 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
// Check if masking is required.
// A Group may need masking for one of two reasons: it resides in a block that
- // needs predication, or it was decided to use masking to deal with gaps.
+ // needs predication, or it was decided to use masking to deal with gaps
+ // (either a gap at the end of a load-access that may result in a speculative
+ // load, or any gaps in a store-access).
bool PredicatedAccessRequiresMasking =
- Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
- bool AccessWithGapsRequiresMasking =
- Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
- if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
+ blockNeedsPredicationForAnyReason(I->getParent()) &&
+ Legal->isMaskRequired(I);
+ bool LoadAccessWithGapsRequiresEpilogMasking =
+ isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
+ !isScalarEpilogueAllowed();
+ bool StoreAccessWithGapsRequiresMasking =
+ isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
+ if (!PredicatedAccessRequiresMasking &&
+ !LoadAccessWithGapsRequiresEpilogMasking &&
+ !StoreAccessWithGapsRequiresMasking)
return true;
// If masked interleaving is required, we expect that the user/target had
@@ -5311,6 +5322,9 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
assert(useMaskedInterleavedAccesses(TTI) &&
"Masked interleave-groups for predicated accesses are not enabled.");
+ if (Group->isReverse())
+ return false;
+
auto *Ty = getLoadStoreType(I);
const Align Alignment = getLoadStoreAlignment(I);
return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
@@ -5320,14 +5334,13 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
Instruction *I, ElementCount VF) {
// Get and ensure we have a valid memory instruction.
- LoadInst *LI = dyn_cast<LoadInst>(I);
- StoreInst *SI = dyn_cast<StoreInst>(I);
- assert((LI || SI) && "Invalid memory instruction");
+ assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
auto *Ptr = getLoadStorePointerOperand(I);
+ auto *ScalarTy = getLoadStoreType(I);
// In order to be widened, the pointer should be consecutive, first of all.
- if (!Legal->isConsecutivePtr(Ptr))
+ if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
return false;
// If the instruction is a store located in a predicated block, it will be
@@ -5338,7 +5351,6 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
  // If the instruction's allocated size doesn't equal its type size, it
// requires padding and will be scalarized.
auto &DL = I->getModule()->getDataLayout();
- auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
if (hasIrregularType(ScalarTy, DL))
return false;
@@ -5369,12 +5381,14 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
return (!I || !TheLoop->contains(I));
};
+ // Worklist containing uniform instructions demanding lane 0.
SetVector<Instruction *> Worklist;
BasicBlock *Latch = TheLoop->getLoopLatch();
- // Instructions that are scalar with predication must not be considered
- // uniform after vectorization, because that would create an erroneous
- // replicating region where only a single instance out of VF should be formed.
+ // Add uniform instructions demanding lane 0 to the worklist. Instructions
+ // that are scalar with predication must not be considered uniform after
+ // vectorization, because that would create an erroneous replicating region
+ // where only a single instance out of VF should be formed.
// TODO: optimize such seldom cases if found important, see PR40816.
auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
if (isOutOfScope(I)) {
@@ -5433,6 +5447,30 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// lane 0 demanded or b) are uses which demand only lane 0 of their operand.
for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::sideeffect:
+ case Intrinsic::experimental_noalias_scope_decl:
+ case Intrinsic::assume:
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ if (TheLoop->hasLoopInvariantOperands(&I))
+ addToWorklistIfAllowed(&I);
+ break;
+ default:
+ break;
+ }
+ }
+
+ // ExtractValue instructions must be uniform, because the operands are
+ // known to be loop-invariant.
+ if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
+ assert(isOutOfScope(EVI->getAggregateOperand()) &&
+ "Expected aggregate value to be loop invariant");
+ addToWorklistIfAllowed(EVI);
+ continue;
+ }
+
// If there's no pointer operand, there's nothing to do.
auto *Ptr = getLoadStorePointerOperand(&I);
if (!Ptr)
@@ -5565,13 +5603,8 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
ElementCount
LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
- if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
- reportVectorizationInfo(
- "Disabling scalable vectorization, because target does not "
- "support scalable vectors.",
- "ScalableVectorsUnsupported", ORE, TheLoop);
+ if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
return ElementCount::getScalable(0);
- }
if (Hints->isScalableVectorizationDisabled()) {
reportVectorizationInfo("Scalable vectorization is explicitly disabled",
@@ -5579,6 +5612,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
return ElementCount::getScalable(0);
}
+ LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
+
auto MaxScalableVF = ElementCount::getScalable(
std::numeric_limits<ElementCount::ScalarTy>::max());
@@ -5614,6 +5649,13 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
// Limit MaxScalableVF by the maximum safe dependence distance.
Optional<unsigned> MaxVScale = TTI.getMaxVScale();
+ if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
+ unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange)
+ .getVScaleRangeArgs()
+ .second;
+ if (VScaleMax > 0)
+ MaxVScale = VScaleMax;
+ }
MaxScalableVF = ElementCount::getScalable(
MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
if (!MaxScalableVF)
@@ -5681,17 +5723,32 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
return MaxSafeFixedVF;
}
- LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
- << " is unsafe. Ignoring scalable UserVF.\n");
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
- TheLoop->getStartLoc(),
- TheLoop->getHeader())
- << "User-specified vectorization factor "
- << ore::NV("UserVectorizationFactor", UserVF)
- << " is unsafe. Ignoring the hint to let the compiler pick a "
- "suitable VF.";
- });
+ if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
+ LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+ << " is ignored because scalable vectors are not "
+ "available.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "User-specified vectorization factor "
+ << ore::NV("UserVectorizationFactor", UserVF)
+ << " is ignored because the target does not support scalable "
+ "vectors. The compiler will pick a more suitable value.";
+ });
+ } else {
+ LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+ << " is unsafe. Ignoring scalable UserVF.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "User-specified vectorization factor "
+ << ore::NV("UserVectorizationFactor", UserVF)
+ << " is unsafe. Ignoring the hint to let the compiler pick a "
+ "more suitable value.";
+ });
+ }
}
LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
@@ -5972,19 +6029,27 @@ bool LoopVectorizationCostModel::isMoreProfitable(
return RTCostA < RTCostB;
}
- // When set to preferred, for now assume vscale may be larger than 1, so
- // that scalable vectorization is slightly favorable over fixed-width
- // vectorization.
+ // Improve estimate for the vector width if it is scalable.
+ unsigned EstimatedWidthA = A.Width.getKnownMinValue();
+ unsigned EstimatedWidthB = B.Width.getKnownMinValue();
+ if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
+ if (A.Width.isScalable())
+ EstimatedWidthA *= VScale.getValue();
+ if (B.Width.isScalable())
+ EstimatedWidthB *= VScale.getValue();
+ }
+
+ // When set to preferred, for now assume vscale may be larger than 1 (or the
+ // one being tuned for), so that scalable vectorization is slightly favorable
+ // over fixed-width vectorization.
if (Hints->isScalableVectorizationPreferred())
if (A.Width.isScalable() && !B.Width.isScalable())
- return (CostA * B.Width.getKnownMinValue()) <=
- (CostB * A.Width.getKnownMinValue());
+ return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
// To avoid the need for FP division:
// (CostA / A.Width) < (CostB / B.Width)
// <=> (CostA * B.Width) < (CostB * A.Width)
- return (CostA * B.Width.getKnownMinValue()) <
- (CostB * A.Width.getKnownMinValue());
+ return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
}
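The comparison can be sanity-checked with ordinary integers (a minimal standalone sketch; the widths are the estimated lane counts, i.e. the known minimum width multiplied by getVScaleForTuning for scalable VFs): cheaper cost per lane without an FP division is just a cross-multiplication.

#include <cassert>
#include <cstdint>

// CostA/WidthA < CostB/WidthB  <=>  CostA*WidthB < CostB*WidthA,
// evaluated entirely in integer arithmetic.
bool cheaperPerLane(uint64_t CostA, uint64_t WidthA, uint64_t CostB,
                    uint64_t WidthB) {
  return CostA * WidthB < CostB * WidthA;
}

int main() {
  // A <vscale x 4> plan tuned for vscale=2 is treated as 8 lanes: cost 10
  // over 8 lanes (1.25 per lane) beats cost 6 over 4 lanes (1.5 per lane).
  assert(cheaperPerLane(10, 8, 6, 4));
  return 0;
}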
VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
@@ -6014,11 +6079,22 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
VectorizationCostTy C = expectedCost(i, &InvalidCosts);
VectorizationFactor Candidate(i, C.first);
- LLVM_DEBUG(
- dbgs() << "LV: Vector loop of width " << i << " costs: "
- << (Candidate.Cost / Candidate.Width.getKnownMinValue())
- << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
- << ".\n");
+
+#ifndef NDEBUG
+ unsigned AssumedMinimumVscale = 1;
+ if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
+ AssumedMinimumVscale = VScale.getValue();
+ unsigned Width =
+ Candidate.Width.isScalable()
+ ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
+ : Candidate.Width.getFixedValue();
+ LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
+ << " costs: " << (Candidate.Cost / Width));
+ if (i.isScalable())
+ LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
+ << AssumedMinimumVscale << ")");
+ LLVM_DEBUG(dbgs() << ".\n");
+#endif
if (!C.second && !ForceVectorization) {
LLVM_DEBUG(
@@ -6182,15 +6258,6 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
return Result;
}
- // FIXME: This can be fixed for scalable vectors later, because at this stage
- // the LoopVectorizer will only consider vectorizing a loop with scalable
- // vectors when the loop has a hint to enable vectorization for a given VF.
- if (MainLoopVF.isScalable()) {
- LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
- "yet supported.\n");
- return Result;
- }
-
// Not really a cost consideration, but check for unsupported cases here to
// simplify the logic.
if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
@@ -6202,9 +6269,9 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
if (EpilogueVectorizationForceVF > 1) {
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
- if (LVP.hasPlanWithVFs(
- {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
- return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
+ ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
+ if (LVP.hasPlanWithVF(ForcedEC))
+ return {ForcedEC, 0};
else {
LLVM_DEBUG(
dbgs()
@@ -6221,14 +6288,24 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
return Result;
}
- if (!isEpilogueVectorizationProfitable(MainLoopVF))
+ auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
+ if (MainLoopVF.isScalable())
+ LLVM_DEBUG(
+ dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
+ "yet supported. Converting to fixed-width (VF="
+ << FixedMainLoopVF << ") instead\n");
+
+ if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) {
+ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
+ "this loop\n");
return Result;
+ }
for (auto &NextVF : ProfitableVFs)
- if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
+ if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) &&
(Result.Width.getFixedValue() == 1 ||
isMoreProfitable(NextVF, Result)) &&
- LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
+ LVP.hasPlanWithVF(NextVF.Width))
Result = NextVF;
if (Result != VectorizationFactor::Disabled())
@@ -6471,6 +6548,22 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
unsigned StoresIC = IC / (NumStores ? NumStores : 1);
unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
+ // There is little point in interleaving for reductions containing selects
+ // and compares when VF=1 since it may just create more overhead than it's
+ // worth for loops with small trip counts. This is because we still have to
+ // do the final reduction after the loop.
+ bool HasSelectCmpReductions =
+ HasReductions &&
+ any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
+ RdxDesc.getRecurrenceKind());
+ });
+ if (HasSelectCmpReductions) {
+ LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
+ return 1;
+ }
+
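For context, a select-cmp recurrence of the kind being excluded here looks roughly like the loop below (a hypothetical source-level example, not taken from the patch): the accumulator only ever switches from the start value to a loop-invariant value, so interleaving the scalar loop multiplies the compare/select work while the final cross-part reduction after the loop remains.

// The result is either Start or the invariant 42, depending on whether any
// element satisfied the compare; see isSelectCmpRecurrenceKind above.
int selectCmpReduction(const int *A, int N, int Start) {
  int R = Start;
  for (int I = 0; I < N; ++I)
    R = (A[I] > 3) ? 42 : R;
  return R;
}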
// If we have a scalar reduction (vector reductions are already dealt with
// by this point), we can increase the critical path length if the loop
// we're interleaving is inside another loop. For tree-wise reductions
@@ -6756,7 +6849,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
// determine if it would be better to not if-convert the blocks they are in.
// If so, we also record the instructions to scalarize.
for (BasicBlock *BB : TheLoop->blocks()) {
- if (!blockNeedsPredication(BB))
+ if (!blockNeedsPredicationForAnyReason(BB))
continue;
for (Instruction &I : *BB)
if (isScalarWithPredication(&I)) {
@@ -6851,7 +6944,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(I->getType(), VF)),
- APInt::getAllOnesValue(VF.getFixedValue()), true, false);
+ APInt::getAllOnes(VF.getFixedValue()), true, false);
ScalarCost +=
VF.getFixedValue() *
TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
@@ -6870,7 +6963,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
else if (needsExtract(J, VF)) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(J->getType(), VF)),
- APInt::getAllOnesValue(VF.getFixedValue()), false, true);
+ APInt::getAllOnes(VF.getFixedValue()), false, true);
}
}
@@ -7016,7 +7109,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
auto *Vec_i1Ty =
VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
Cost += TTI.getScalarizationOverhead(
- Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
+ Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
/*Insert=*/false, /*Extract=*/true);
Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
@@ -7036,7 +7129,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
Value *Ptr = getLoadStorePointerOperand(I);
unsigned AS = getLoadStoreAddressSpace(I);
- int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+ int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
@@ -7117,18 +7210,16 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
unsigned InterleaveFactor = Group->getFactor();
auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
- // Holds the indices of existing members in an interleaved load group.
- // An interleaved store group doesn't need this as it doesn't allow gaps.
+ // Holds the indices of existing members in the interleaved group.
SmallVector<unsigned, 4> Indices;
- if (isa<LoadInst>(I)) {
- for (unsigned i = 0; i < InterleaveFactor; i++)
- if (Group->getMember(i))
- Indices.push_back(i);
- }
+ for (unsigned IF = 0; IF < InterleaveFactor; IF++)
+ if (Group->getMember(IF))
+ Indices.push_back(IF);
// Calculate the cost of the whole interleaved group.
bool UseMaskForGaps =
- Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
+ (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
+ (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
@@ -7210,8 +7301,41 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
Instruction *Op0, *Op1;
- if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
- !TheLoop->isLoopInvariant(RedOp)) {
+ if (RedOp &&
+ match(RedOp,
+ m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
+ match(Op0, m_ZExtOrSExt(m_Value())) &&
+ Op0->getOpcode() == Op1->getOpcode() &&
+ Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
+ !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
+ (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
+
+    // Matched reduce(ext(mul(ext(A), ext(B))))
+ // Note that the extend opcodes need to all match, or if A==B they will have
+ // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
+ // which is equally fine.
+ bool IsUnsigned = isa<ZExtInst>(Op0);
+ auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
+ auto *MulType = VectorType::get(Op0->getType(), VectorTy);
+
+ InstructionCost ExtCost =
+ TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
+ TTI::CastContextHint::None, CostKind, Op0);
+ InstructionCost MulCost =
+ TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
+ InstructionCost Ext2Cost =
+ TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
+ TTI::CastContextHint::None, CostKind, RedOp);
+
+ InstructionCost RedCost = TTI.getExtendedAddReductionCost(
+ /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
+ CostKind);
+
+ if (RedCost.isValid() &&
+ RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
+ return I == RetI ? RedCost : 0;
+ } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
+ !TheLoop->isLoopInvariant(RedOp)) {
// Matched reduce(ext(A))
bool IsUnsigned = isa<ZExtInst>(RedOp);
auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
@@ -7245,7 +7369,7 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
return I == RetI ? RedCost : 0;
- } else {
+ } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
// Matched reduce(mul())
InstructionCost MulCost =
TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
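A source-level loop that lowers to the newly matched reduce(ext(mul(ext(A), ext(B)))) shape might look like the following (a hypothetical example for illustration only): both narrow operands are extended, multiplied, widened again and accumulated, which is what the getExtendedAddReductionCost(/*IsMLA=*/true, ...) query above prices as a single extended multiply-add reduction where the target supports one.

#include <cstddef>
#include <cstdint>

// Sum of widened products of two narrow arrays: ext -> mul -> ext -> add,
// reduced across the loop.
int64_t widenedDotProduct(const int8_t *A, const int8_t *B, size_t N) {
  int64_t Sum = 0;
  for (size_t I = 0; I < N; ++I)
    Sum += static_cast<int64_t>(static_cast<int16_t>(A[I]) *
                                static_cast<int16_t>(B[I]));
  return Sum;
}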
@@ -7304,9 +7428,14 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
Type *VectorTy;
InstructionCost C = getInstructionCost(I, VF, VectorTy);
- bool TypeNotScalarized =
- VF.isVector() && VectorTy->isVectorTy() &&
- TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
+ bool TypeNotScalarized = false;
+ if (VF.isVector() && VectorTy->isVectorTy()) {
+ unsigned NumParts = TTI.getNumberOfParts(VectorTy);
+ if (NumParts)
+ TypeNotScalarized = NumParts < VF.getKnownMinValue();
+ else
+ C = InstructionCost::getInvalid();
+ }
return VectorizationCostTy(C, TypeNotScalarized);
}
@@ -7327,8 +7456,8 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
if (!RetTy->isVoidTy() &&
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
Cost += TTI.getScalarizationOverhead(
- cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
- true, false);
+ cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
+ false);
// Some targets keep addresses scalar.
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -7340,7 +7469,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
// Collect operands to consider.
CallInst *CI = dyn_cast<CallInst>(I);
- Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
+ Instruction::op_range Ops = CI ? CI->args() : I->operands();
// Skip operands that do not require extraction/scalarization and do not incur
// any overhead.
@@ -7391,8 +7520,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
// We assume that widening is the best solution when possible.
if (memoryInstructionCanBeWidened(&I, VF)) {
InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
- int ConsecutiveStride =
- Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
+ int ConsecutiveStride = Legal->isConsecutivePtr(
+ getLoadStoreType(&I), getLoadStorePointerOperand(&I));
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Expected consecutive stride.");
InstWidening Decision =
@@ -7579,8 +7708,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
return (
TTI.getScalarizationOverhead(
- Vec_i1Ty, APInt::getAllOnesValue(VF.getFixedValue()), false,
- true) +
+ Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
(TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
} else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
// The back-edge branch will remain, as will all scalar branches.
@@ -7893,7 +8021,7 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
// Check if the pointer operand of a load or store instruction is
// consecutive.
if (auto *Ptr = getLoadStorePointerOperand(Inst))
- return Legal->isConsecutivePtr(Ptr);
+ return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
return false;
}
@@ -8019,7 +8147,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
return None;
// Invalidate interleave groups if all blocks of loop will be predicated.
- if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
+ if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
!useMaskedInterleavedAccesses(*TTI)) {
LLVM_DEBUG(
dbgs()
@@ -8105,28 +8233,30 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
return SelectedVF;
}
-void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
- LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
- << '\n');
- BestVF = VF;
- BestUF = UF;
+VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
+ assert(count_if(VPlans,
+ [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
+ 1 &&
+         "Best VF does not have a single VPlan.");
- erase_if(VPlans, [VF](const VPlanPtr &Plan) {
- return !Plan->hasVF(VF);
- });
- assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
+ for (const VPlanPtr &Plan : VPlans) {
+ if (Plan->hasVF(VF))
+ return *Plan.get();
+ }
+ llvm_unreachable("No plan found!");
}
-void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
+void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
+ VPlan &BestVPlan,
+ InnerLoopVectorizer &ILV,
DominatorTree *DT) {
+ LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
+ << '\n');
+
// Perform the actual loop transformation.
// 1. Create a new empty loop. Unlink the old loop and connect the new one.
- assert(BestVF.hasValue() && "Vectorization Factor is missing");
- assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
-
- VPTransformState State{
- *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
+ VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
State.TripCount = ILV.getOrCreateTripCount(nullptr);
State.CanonicalIV = ILV.Induction;
@@ -8142,7 +8272,7 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
//===------------------------------------------------===//
// 2. Copy and widen instructions from the old loop into the new loop.
- VPlans.front()->execute(&State);
+ BestVPlan.execute(&State);
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
@@ -8222,21 +8352,19 @@ Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
-Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
+Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx,
+ Value *Step,
Instruction::BinaryOps BinOp) {
// When unrolling and the VF is 1, we only need to add a simple scalar.
Type *Ty = Val->getType();
assert(!Ty->isVectorTy() && "Val must be a scalar");
if (Ty->isFloatingPointTy()) {
- Constant *C = ConstantFP::get(Ty, (double)StartIdx);
-
// Floating-point operations inherit FMF via the builder's flags.
- Value *MulOp = Builder.CreateFMul(C, Step);
+ Value *MulOp = Builder.CreateFMul(StartIdx, Step);
return Builder.CreateBinOp(BinOp, Val, MulOp);
}
- Constant *C = ConstantInt::get(Ty, StartIdx);
- return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
+ return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction");
}
static void AddRuntimeUnrollDisableMetaData(Loop *L) {
@@ -8311,7 +8439,9 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
OldInduction = Legal->getPrimaryInduction();
Type *IdxTy = Legal->getWidestInductionType();
Value *StartIdx = ConstantInt::get(IdxTy, 0);
- Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
+
+ IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt());
+ Value *Step = getRuntimeVF(B, IdxTy, VF * UF);
Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
EPI.VectorTripCount = CountRoundDown;
Induction =
@@ -8329,9 +8459,9 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
LLVM_DEBUG({
dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
- << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
+ << "Main Loop VF:" << EPI.MainLoopVF
<< ", Main Loop UF:" << EPI.MainLoopUF
- << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
+ << ", Epilogue Loop VF:" << EPI.EpilogueVF
<< ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
});
}
@@ -8346,8 +8476,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
assert(L && "Expected valid Loop.");
assert(Bypass && "Expected valid bypass basic block.");
- unsigned VFactor =
- ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
+ ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
Value *Count = getOrCreateTripCount(L);
// Reuse existing vector loop preheader for TC checks.
@@ -8361,7 +8490,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
Value *CheckMinIters = Builder.CreateICmp(
- P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
+ P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
"min.iters.check");
if (!ForEpilogue)
@@ -8513,11 +8642,11 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
- Value *CheckMinIters = Builder.CreateICmp(
- P, Count,
- ConstantInt::get(Count->getType(),
- EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
- "min.epilog.iters.check");
+ Value *CheckMinIters =
+ Builder.CreateICmp(P, Count,
+ createStepForVF(Builder, Count->getType(),
+ EPI.EpilogueVF, EPI.EpilogueUF),
+ "min.epilog.iters.check");
ReplaceInstWithInst(
Insert->getTerminator(),
@@ -8530,7 +8659,7 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
LLVM_DEBUG({
dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
- << "Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
+ << "Epilogue Loop VF:" << EPI.EpilogueVF
<< ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
});
}
@@ -8628,7 +8757,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
VPValue *BlockMask = nullptr;
if (OrigLoop->getHeader() == BB) {
- if (!CM.blockNeedsPredication(BB))
+ if (!CM.blockNeedsPredicationForAnyReason(BB))
return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
// Create the block in mask as the first non-phi instruction in the block.
@@ -8643,9 +8772,9 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
if (Legal->getPrimaryInduction())
IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
else {
- auto IVRecipe = new VPWidenCanonicalIVRecipe();
+ auto *IVRecipe = new VPWidenCanonicalIVRecipe();
Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
- IV = IVRecipe->getVPSingleValue();
+ IV = IVRecipe;
}
VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
bool TailFolded = !CM.isScalarEpilogueAllowed();
@@ -8708,12 +8837,21 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
if (Legal->isMaskRequired(I))
Mask = createBlockInMask(I->getParent(), Plan);
+ // Determine if the pointer operand of the access is either consecutive or
+ // reverse consecutive.
+ LoopVectorizationCostModel::InstWidening Decision =
+ CM.getWideningDecision(I, Range.Start);
+ bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
+ bool Consecutive =
+ Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+
if (LoadInst *Load = dyn_cast<LoadInst>(I))
- return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask);
+ return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
+ Consecutive, Reverse);
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
- Mask);
+ Mask, Consecutive, Reverse);
}
VPWidenIntOrFpInductionRecipe *
@@ -8829,7 +8967,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
return nullptr;
- ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands());
+ ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
}
@@ -8916,6 +9054,37 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);
+ // Even if the instruction is not marked as uniform, there are certain
+ // intrinsic calls that can be effectively treated as such, so we check for
+ // them here. Conservatively, we only do this for scalable vectors, since
+ // for fixed-width VFs we can always fall back on full scalarization.
+ if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
+ switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
+ case Intrinsic::assume:
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ // For scalable vectors if one of the operands is variant then we still
+ // want to mark as uniform, which will generate one instruction for just
+ // the first lane of the vector. We can't scalarize the call in the same
+ // way as for fixed-width vectors because we don't know how many lanes
+ // there are.
+ //
+ // The reasons for doing it this way for scalable vectors are:
+ // 1. For the assume intrinsic generating the instruction for the first
+ // lane is still better than not generating any at all. For
+ // example, the input may be a splat across all lanes.
+ // 2. For the lifetime start/end intrinsics the pointer operand only
+ // does anything useful when the input comes from a stack object,
+ // which suggests it should always be uniform. For non-stack objects
+ // the effect is to poison the object, which still allows us to
+ // remove the call.
+ IsUniform = true;
+ break;
+ default:
+ break;
+ }
+ }
+
auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
IsUniform, IsPredicated);
setRecipe(I, Recipe);
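// A hypothetical extraction of the switch above (not part of the patch): it
// names the intrinsics that, for scalable VFs, may be generated for the first
// lane only instead of being fully scalarized.
static bool mayTreatCallAsUniformForScalableVF(const IntrinsicInst &II) {
  switch (II.getIntrinsicID()) {
  case Intrinsic::assume:
  case Intrinsic::lifetime_start:
  case Intrinsic::lifetime_end:
    return true;  // Emit the call for lane 0 only.
  default:
    return false; // Everything else keeps the default replication handling.
  }
}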
@@ -9137,6 +9306,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
RecipeBuilder.recordRecipeOf(R);
// For min/max reductions, where we have a pair of icmp/select, we also
// need to record the ICmp recipe, so it can be removed later.
+ assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
+ "Only min/max recurrences allowed for inloop reductions");
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
}
@@ -9165,22 +9336,27 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// visit each basic block after having visited its predecessor basic blocks.
// ---------------------------------------------------------------------------
- // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
auto Plan = std::make_unique<VPlan>();
- VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
- Plan->setEntry(VPBB);
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
LoopBlocksDFS DFS(OrigLoop);
DFS.perform(LI);
+ VPBasicBlock *VPBB = nullptr;
+ VPBasicBlock *HeaderVPBB = nullptr;
+ SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
// Relevant instructions from basic block BB will be grouped into VPRecipe
// ingredients and fill a new VPBasicBlock.
unsigned VPBBsForBB = 0;
auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
- VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
+ if (VPBB)
+ VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
+ else {
+ Plan->setEntry(FirstVPBBForBB);
+ HeaderVPBB = FirstVPBBForBB;
+ }
VPBB = FirstVPBBForBB;
Builder.setInsertPoint(VPBB);
@@ -9222,6 +9398,17 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
Plan->addVPValue(UV, Def);
}
+ if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
+ HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
+ // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
+ // of the header block. That can happen for truncates of induction
+ // variables. Those recipes are moved to the phi section of the header
+ // block after applying SinkAfter, which relies on the original
+ // position of the trunc.
+ assert(isa<TruncInst>(Instr));
+ InductionsToMove.push_back(
+ cast<VPWidenIntOrFpInductionRecipe>(Recipe));
+ }
RecipeBuilder.setRecipe(Instr, Recipe);
VPBB->appendRecipe(Recipe);
continue;
@@ -9239,17 +9426,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
+ assert(isa<VPBasicBlock>(Plan->getEntry()) &&
+ !Plan->getEntry()->getEntryBasicBlock()->empty() &&
+ "entry block must be set to a non-empty VPBasicBlock");
RecipeBuilder.fixHeaderPhis();
- // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
- // may also be empty, such as the last one VPBB, reflecting original
- // basic-blocks with no recipes.
- VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
- assert(PreEntry->empty() && "Expecting empty pre-entry block.");
- VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
- VPBlockUtils::disconnectBlocks(PreEntry, Entry);
- delete PreEntry;
-
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
// bring the VPlan to its final state.
@@ -9318,6 +9499,14 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
+ // Now that sink-after is done, move induction recipes for optimized truncates
+ // to the phi section of the header block.
+ for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
+ Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
+
+ // Adjust the recipes for any inloop reductions.
+ adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start);
+
// Introduce a recipe to combine the incoming and previous values of a
// first-order recurrence.
for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
@@ -9325,16 +9514,20 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
if (!RecurPhi)
continue;
+ VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
+ VPBasicBlock *InsertBlock = PrevRecipe->getParent();
+ auto *Region = GetReplicateRegion(PrevRecipe);
+ if (Region)
+ InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
+ if (Region || PrevRecipe->isPhi())
+ Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
+ else
+ Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
+
auto *RecurSplice = cast<VPInstruction>(
Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
{RecurPhi, RecurPhi->getBackedgeValue()}));
- VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
- if (auto *Region = GetReplicateRegion(PrevRecipe)) {
- VPBasicBlock *Succ = cast<VPBasicBlock>(Region->getSingleSuccessor());
- RecurSplice->moveBefore(*Succ, Succ->getFirstNonPhi());
- } else
- RecurSplice->moveAfter(PrevRecipe);
RecurPhi->replaceAllUsesWith(RecurSplice);
// Set the first operand of RecurSplice to RecurPhi again, after replacing
// all users.
@@ -9372,22 +9565,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
- // Adjust the recipes for any inloop reductions.
- adjustRecipesForInLoopReductions(Plan, RecipeBuilder, Range.Start);
-
- // Finally, if tail is folded by masking, introduce selects between the phi
- // and the live-out instruction of each reduction, at the end of the latch.
- if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
- Builder.setInsertPoint(VPBB);
- auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
- for (auto &Reduction : Legal->getReductionVars()) {
- if (CM.isInLoopReduction(Reduction.first))
- continue;
- VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
- VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
- Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
- }
- }
+ // From this point onwards, VPlan-to-VPlan transformations may change the plan
+ // in ways that make accessing values using original IR values incorrect.
+ Plan->disableValue2VPValue();
VPlanTransforms::sinkScalarOperands(*Plan);
VPlanTransforms::mergeReplicateRegions(*Plan);
@@ -9405,6 +9585,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
RSO.flush();
Plan->setName(PlanName);
+ assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
}
@@ -9443,12 +9624,14 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
return Plan;
}
-// Adjust the recipes for any inloop reductions. The chain of instructions
-// leading from the loop exit instr to the phi need to be converted to
-// reductions, with one operand being vector and the other being the scalar
-// reduction chain.
-void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
- VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
+// Adjust the recipes for reductions. For in-loop reductions the chain of
+// instructions leading from the loop exit instr to the phi needs to be converted
+// to reductions, with one operand being a vector and the other being the scalar
+// reduction chain. For other reductions, a select is introduced between the phi
+// and live-out recipes when folding the tail.
+void LoopVectorizationPlanner::adjustRecipesForReductions(
+ VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
+ ElementCount MinVF) {
for (auto &Reduction : CM.getInLoopReductionChains()) {
PHINode *Phi = Reduction.first;
RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
@@ -9468,6 +9651,8 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
VPValue *ChainOp = Plan->getVPValue(Chain);
unsigned FirstOpId;
+ assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
+ "Only min/max recurrences allowed for inloop reductions");
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
"Expected to replace a VPWidenSelectSC");
@@ -9505,6 +9690,21 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
Chain = R;
}
}
+
+ // If tail is folded by masking, introduce selects between the phi
+ // and the live-out instruction of each reduction, at the end of the latch.
+ if (CM.foldTailByMasking()) {
+ for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
+ VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
+ if (!PhiR || PhiR->isInLoop())
+ continue;
+ Builder.setInsertPoint(LatchVPBB);
+ VPValue *Cond =
+ RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
+ VPValue *Red = PhiR->getBackedgeValue();
+ Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
+ }
+ }
}
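// For the tail-folded case handled just above, each select recipe conceptually
// lowers (per unrolled part) to the pattern sketched below; a minimal IRBuilder
// sketch under that assumption, with a hypothetical function name, not the
// recipe's actual code-generation path:
static Value *tailFoldedReductionSelectSketch(IRBuilderBase &B, Value *Mask,
                                              Value *LoopExitVal,
                                              Value *RdxPhi) {
  // Masked-off (tail) lanes keep the phi value, so they do not perturb the
  // final reduction result.
  return B.CreateSelect(Mask, LoopExitVal, RdxPhi, "rdx.select");
}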
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -9519,9 +9719,22 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
O << ", ";
Mask->printAsOperand(O, SlotTracker);
}
- for (unsigned i = 0; i < IG->getFactor(); ++i)
- if (Instruction *I = IG->getMember(i))
- O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i;
+
+ unsigned OpIdx = 0;
+ for (unsigned i = 0; i < IG->getFactor(); ++i) {
+ if (!IG->getMember(i))
+ continue;
+ if (getNumStoreOperands() > 0) {
+ O << "\n" << Indent << " store ";
+ getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
+ O << " to index " << i;
+ } else {
+ O << "\n" << Indent << " ";
+ getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
+ O << " = load from index " << i;
+ }
+ ++OpIdx;
+ }
}
#endif
@@ -9605,17 +9818,20 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
void VPReductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Reduction being replicated.");
Value *PrevInChain = State.get(getChainOp(), 0);
+ RecurKind Kind = RdxDesc->getRecurrenceKind();
+ bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
+ // Propagate the fast-math flags carried by the underlying instruction.
+ IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
+ State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
for (unsigned Part = 0; Part < State.UF; ++Part) {
- RecurKind Kind = RdxDesc->getRecurrenceKind();
- bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
Value *NewVecOp = State.get(getVecOp(), Part);
if (VPValue *Cond = getCondOp()) {
Value *NewCond = State.get(Cond, Part);
VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
- Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
+ Value *Iden = RdxDesc->getRecurrenceIdentity(
Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
- Constant *IdenVec =
- ConstantVector::getSplat(VecTy->getElementCount(), Iden);
+ Value *IdenVec =
+ State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
NewVecOp = Select;
}
@@ -9627,8 +9843,8 @@ void VPReductionRecipe::execute(VPTransformState &State) {
PrevInChain);
else
NewRed = State.Builder.CreateBinOp(
- (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(),
- PrevInChain, NewVecOp);
+ (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
+ NewVecOp);
PrevInChain = NewRed;
} else {
PrevInChain = State.get(getChainOp(), Part);
@@ -9640,11 +9856,10 @@ void VPReductionRecipe::execute(VPTransformState &State) {
NewRed, PrevInChain);
} else if (IsOrdered)
NextInChain = NewRed;
- else {
+ else
NextInChain = State.Builder.CreateBinOp(
- (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
+ (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
PrevInChain);
- }
State.set(this, NextInChain, Part);
}
}
@@ -9757,7 +9972,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
State.ILV->vectorizeMemoryInstruction(
&Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(),
- StoredValue, getMask());
+ StoredValue, getMask(), Consecutive, Reverse);
}
// Determine how to lower the scalar epilogue, which depends on 1) optimising
@@ -9923,7 +10138,7 @@ static bool processLoopInVPlanNativePath(
VectorizationFactor::Disabled() == VF)
return false;
- LVP.setBestPlan(VF.Width, 1);
+ VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
{
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
@@ -9932,7 +10147,7 @@ static bool processLoopInVPlanNativePath(
&CM, BFI, PSI, Checks);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
- LVP.executePlan(LB, DT);
+ LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
}
// Mark the loop as already vectorized to avoid vectorizing again.
@@ -10103,7 +10318,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- if (!LVL.canVectorizeFPMath(EnableStrictReductions)) {
+ bool AllowOrderedReductions;
+ // If the flag is set, use that instead and override the TTI behaviour.
+ if (ForceOrderedReductions.getNumOccurrences() > 0)
+ AllowOrderedReductions = ForceOrderedReductions;
+ else
+ AllowOrderedReductions = TTI->enableOrderedReductions();
+ if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
ORE->emit([&]() {
auto *ExactFPMathInst = Requirements.getExactFPInst();
return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
@@ -10248,7 +10469,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
F->getParent()->getDataLayout());
if (!VF.Width.isScalar() || IC > 1)
Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
- LVP.setBestPlan(VF.Width, IC);
using namespace ore;
if (!VectorizeLoop) {
@@ -10257,7 +10477,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// interleave it.
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
&CM, BFI, PSI, Checks);
- LVP.executePlan(Unroller, DT);
+
+ VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
+ LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
ORE->emit([&]() {
return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
@@ -10276,14 +10498,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// The first pass vectorizes the main loop and creates a scalar epilogue
// to be vectorized by executing the plan (potentially with a different
// factor) again shortly afterwards.
- EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
- EpilogueVF.Width.getKnownMinValue(),
- 1);
+ EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
EPI, &LVL, &CM, BFI, PSI, Checks);
- LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
- LVP.executePlan(MainILV, DT);
+ VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
+ LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
+ DT);
++LoopsVectorized;
simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
@@ -10291,13 +10512,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Second pass vectorizes the epilogue and adjusts the control flow
// edges from the first pass.
- LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
EPI.MainLoopVF = EPI.EpilogueVF;
EPI.MainLoopUF = EPI.EpilogueUF;
EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
ORE, EPI, &LVL, &CM, BFI, PSI,
Checks);
- LVP.executePlan(EpilogILV, DT);
+
+ VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
+ LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
+ DT);
++LoopsEpilogueVectorized;
if (!MainILV.areSafetyChecksAdded())
@@ -10305,7 +10528,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
} else {
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
&LVL, &CM, BFI, PSI, Checks);
- LVP.executePlan(LB, DT);
+
+ VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
+ LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there
@@ -10423,15 +10648,12 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
auto &AC = AM.getResult<AssumptionAnalysis>(F);
auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- MemorySSA *MSSA = EnableMSSALoopDependency
- ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
- : nullptr;
auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
std::function<const LoopAccessInfo &(Loop &)> GetLAA =
[&](Loop &L) -> const LoopAccessInfo & {
- LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
- TLI, TTI, nullptr, MSSA};
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
+ TLI, TTI, nullptr, nullptr, nullptr};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
};
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
@@ -10455,3 +10677,14 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
PA.preserveSet<CFGAnalyses>();
return PA;
}
+
+void LoopVectorizePass::printPipeline(
+ raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+ static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
+ OS, MapClassName2PassName);
+
+ OS << "<";
+ OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
+ OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
+ OS << ">";
+}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cc3f5c7d4b48..e3ef0b794f68 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21,6 +21,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
@@ -200,12 +201,39 @@ static bool isValidElementType(Type *Ty) {
!Ty->isPPC_FP128Ty();
}
+/// \returns True if the value is a constant (but not globals/constant
+/// expressions).
+static bool isConstant(Value *V) {
+ return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V);
+}
+
+/// Checks if \p V is one of the vector-like instructions, i.e. undef, an
+/// insertelement/extractelement with constant indices for a fixed vector type,
+/// or an extractvalue instruction.
+static bool isVectorLikeInstWithConstOps(Value *V) {
+ if (!isa<InsertElementInst, ExtractElementInst>(V) &&
+ !isa<ExtractValueInst, UndefValue>(V))
+ return false;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I || isa<ExtractValueInst>(I))
+ return true;
+ if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
+ return false;
+ if (isa<ExtractElementInst>(I))
+ return isConstant(I->getOperand(1));
+ assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
+ return isConstant(I->getOperand(2));
+}
+
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
if (!I0)
return false;
+ if (all_of(VL, isVectorLikeInstWithConstOps))
+ return true;
+
BasicBlock *BB = I0->getParent();
for (int I = 1, E = VL.size(); I < E; I++) {
auto *II = dyn_cast<Instruction>(VL[I]);
@@ -218,12 +246,6 @@ static bool allSameBlock(ArrayRef<Value *> VL) {
return true;
}
-/// \returns True if the value is a constant (but not globals/constant
-/// expressions).
-static bool isConstant(Value *V) {
- return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V);
-}
-
/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
@@ -232,12 +254,21 @@ static bool allConstant(ArrayRef<Value *> VL) {
return all_of(VL, isConstant);
}
-/// \returns True if all of the values in \p VL are identical.
+/// \returns True if all of the values in \p VL are identical or some of them
+/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
- for (unsigned i = 1, e = VL.size(); i < e; ++i)
- if (VL[i] != VL[0])
+ Value *FirstNonUndef = nullptr;
+ for (Value *V : VL) {
+ if (isa<UndefValue>(V))
+ continue;
+ if (!FirstNonUndef) {
+ FirstNonUndef = V;
+ continue;
+ }
+ if (V != FirstNonUndef)
return false;
- return true;
+ }
+ return FirstNonUndef != nullptr;
}
/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
@@ -295,8 +326,10 @@ static bool isCommutative(Instruction *I) {
/// TODO: Can we split off and reuse the shuffle mask detection from
/// TargetTransformInfo::getInstructionThroughput?
static Optional<TargetTransformInfo::ShuffleKind>
-isShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
+isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
auto *EI0 = cast<ExtractElementInst>(VL[0]);
+ if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
+ return None;
unsigned Size =
cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
Value *Vec1 = nullptr;
@@ -504,7 +537,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
case Instruction::Call: {
CallInst *CI = cast<CallInst>(UserInst);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
+ for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
if (hasVectorInstrinsicScalarOpd(ID, i))
return (CI->getArgOperand(i) == Scalar);
}
@@ -535,13 +568,67 @@ static bool isSimple(Instruction *I) {
return true;
}
+/// Shuffles \p Mask in accordance with the given \p SubMask.
+static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
+ if (SubMask.empty())
+ return;
+ if (Mask.empty()) {
+ Mask.append(SubMask.begin(), SubMask.end());
+ return;
+ }
+ SmallVector<int> NewMask(SubMask.size(), UndefMaskElem);
+ int TermValue = std::min(Mask.size(), SubMask.size());
+ for (int I = 0, E = SubMask.size(); I < E; ++I) {
+ if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
+ Mask[SubMask[I]] >= TermValue)
+ continue;
+ NewMask[I] = Mask[SubMask[I]];
+ }
+ Mask.swap(NewMask);
+}
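// A standalone model of the mask composition above (illustrative only),
// written against std::vector with -1 standing in for UndefMaskElem:
#include <algorithm>
#include <cassert>
#include <vector>

static void addMaskModel(std::vector<int> &Mask, const std::vector<int> &Sub) {
  if (Sub.empty())
    return;
  if (Mask.empty()) {
    Mask = Sub;
    return;
  }
  std::vector<int> NewMask(Sub.size(), -1);
  int TermValue = static_cast<int>(std::min(Mask.size(), Sub.size()));
  for (int I = 0, E = static_cast<int>(Sub.size()); I < E; ++I) {
    // Keep undef wherever either mask runs out of defined elements.
    if (Sub[I] == -1 || Sub[I] >= TermValue || Mask[Sub[I]] >= TermValue)
      continue;
    NewMask[I] = Mask[Sub[I]];
  }
  Mask.swap(NewMask);
}

int main() {
  std::vector<int> Mask = {1, 0, 2};
  addMaskModel(Mask, {2, -1, 0}); // Shuffle the already-shuffled result again.
  assert((Mask == std::vector<int>{2, -1, 1}));
}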
+
+/// \p Order may have elements assigned the special value (size), which is out
+/// of bounds. Such indices only appear in places which correspond to undef
+/// values (see canReuseExtract for details) and are used to keep undef values
+/// from affecting the ordering of the operands.
+/// The first loop below finds all unused indices and the following loop nest
+/// assigns those indices to the undef-value positions.
+/// In the example below, Order has two undef positions, which are assigned the
+/// values 3 and 7 respectively:
+/// before: 6 9 5 4 9 2 1 0
+/// after: 6 3 5 4 7 2 1 0
+static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) {
+ const unsigned Sz = Order.size();
+ SmallBitVector UsedIndices(Sz);
+ SmallVector<int> MaskedIndices;
+ for (unsigned I = 0; I < Sz; ++I) {
+ if (Order[I] < Sz)
+ UsedIndices.set(Order[I]);
+ else
+ MaskedIndices.push_back(I);
+ }
+ if (MaskedIndices.empty())
+ return;
+ SmallVector<int> AvailableIndices(MaskedIndices.size());
+ unsigned Cnt = 0;
+ int Idx = UsedIndices.find_first_unset();
+ do {
+ AvailableIndices[Cnt] = Idx;
+ Idx = UsedIndices.find_next_unset(Idx);
+ ++Cnt;
+ } while (Idx > 0);
+ assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices.");
+ for (int I = 0, E = MaskedIndices.size(); I < E; ++I)
+ Order[MaskedIndices[I]] = AvailableIndices[I];
+}
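// A standalone model of the fixup above (illustrative only). It assumes, as
// the real code does, that the number of masked positions equals the number of
// unused indices:
#include <cassert>
#include <vector>

static void fixupOrderingModel(std::vector<unsigned> &Order) {
  const unsigned Sz = Order.size();
  std::vector<bool> Used(Sz, false);
  std::vector<unsigned> MaskedPositions;
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      Used[Order[I]] = true;        // In-bounds index: mark as taken.
    else
      MaskedPositions.push_back(I); // Out-of-bounds marker: fix up later.
  }
  unsigned NextUnused = 0;
  for (unsigned P : MaskedPositions) {
    while (Used[NextUnused])
      ++NextUnused;                 // Find the next index nobody uses.
    Order[P] = NextUnused++;
  }
}

int main() {
  // The example from the comment above: positions 1 and 4 hold the marker.
  std::vector<unsigned> Order = {6, 9, 5, 4, 9, 2, 1, 0};
  fixupOrderingModel(Order);
  assert((Order == std::vector<unsigned>{6, 3, 5, 4, 7, 2, 1, 0}));
}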
+
namespace llvm {
static void inversePermutation(ArrayRef<unsigned> Indices,
SmallVectorImpl<int> &Mask) {
Mask.clear();
const unsigned E = Indices.size();
- Mask.resize(E, E + 1);
+ Mask.resize(E, UndefMaskElem);
for (unsigned I = 0; I < E; ++I)
Mask[Indices[I]] = I;
}
@@ -581,6 +668,22 @@ static Optional<int> getInsertIndex(Value *InsertInst, unsigned Offset) {
return Index;
}
+/// Reorders the list of scalars \p Scalars in accordance with the given
+/// \p Mask: the scalar at position I is moved to position Mask[I], and
+/// positions that are not targeted by any mask element are filled with undef
+/// values.
+static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
+ ArrayRef<int> Mask) {
+ assert(!Mask.empty() && "Expected non-empty mask.");
+ SmallVector<Value *> Prev(Scalars.size(),
+ UndefValue::get(Scalars.front()->getType()));
+ Prev.swap(Scalars);
+ for (unsigned I = 0, E = Prev.size(); I < E; ++I)
+ if (Mask[I] != UndefMaskElem)
+ Scalars[Mask[I]] = Prev[I];
+}
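// A standalone model of the scatter above (illustrative only): element I of
// the old list moves to position Mask[I]; untouched positions stay "undef"
// (modelled here as -1):
#include <cassert>
#include <vector>

int main() {
  std::vector<int> Scalars = {10, 20, 30, 40};
  const std::vector<int> Mask = {2, 0, 3, 1};
  std::vector<int> Prev(Scalars.size(), -1); // "undef" placeholders.
  Prev.swap(Scalars);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != -1)
      Scalars[Mask[I]] = Prev[I];
  assert((Scalars == std::vector<int>{20, 40, 10, 30}));
}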
+
namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
@@ -645,13 +748,12 @@ public:
void buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst = None);
- /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
- /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
- /// into account (and updating it, if required) list of externally used
- /// values stored in \p ExternallyUsedValues.
- void buildTree(ArrayRef<Value *> Roots,
- ExtraValueToDebugLocsMap &ExternallyUsedValues,
- ArrayRef<Value *> UserIgnoreLst = None);
+ /// Builds external uses of the vectorized scalars, i.e. the list of
+ /// vectorized scalars to be extracted, their lanes and their scalar users. \p
+ /// ExternallyUsedValues contains an additional list of external uses to handle
+ /// vectorization of reductions.
+ void
+ buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
/// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() {
@@ -659,8 +761,6 @@ public:
ScalarToTreeEntry.clear();
MustGather.clear();
ExternalUses.clear();
- NumOpsWantToKeepOrder.clear();
- NumOpsWantToKeepOriginalOrder = 0;
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
@@ -674,103 +774,28 @@ public:
/// Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
- /// \returns The best order of instructions for vectorization.
- Optional<ArrayRef<unsigned>> bestOrder() const {
- assert(llvm::all_of(
- NumOpsWantToKeepOrder,
- [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) {
- return D.getFirst().size() ==
- VectorizableTree[0]->Scalars.size();
- }) &&
- "All orders must have the same size as number of instructions in "
- "tree node.");
- auto I = std::max_element(
- NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
- [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
- const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
- return D1.second < D2.second;
- });
- if (I == NumOpsWantToKeepOrder.end() ||
- I->getSecond() <= NumOpsWantToKeepOriginalOrder)
- return None;
-
- return makeArrayRef(I->getFirst());
- }
-
- /// Builds the correct order for root instructions.
- /// If some leaves have the same instructions to be vectorized, we may
- /// incorrectly evaluate the best order for the root node (it is built for the
- /// vector of instructions without repeated instructions and, thus, has less
- /// elements than the root node). This function builds the correct order for
- /// the root node.
- /// For example, if the root node is \<a+b, a+c, a+d, f+e\>, then the leaves
- /// are \<a, a, a, f\> and \<b, c, d, e\>. When we try to vectorize the first
- /// leaf, it will be shrink to \<a, b\>. If instructions in this leaf should
- /// be reordered, the best order will be \<1, 0\>. We need to extend this
- /// order for the root node. For the root node this order should look like
- /// \<3, 0, 1, 2\>. This function extends the order for the reused
- /// instructions.
- void findRootOrder(OrdersType &Order) {
- // If the leaf has the same number of instructions to vectorize as the root
- // - order must be set already.
- unsigned RootSize = VectorizableTree[0]->Scalars.size();
- if (Order.size() == RootSize)
- return;
- SmallVector<unsigned, 4> RealOrder(Order.size());
- std::swap(Order, RealOrder);
- SmallVector<int, 4> Mask;
- inversePermutation(RealOrder, Mask);
- Order.assign(Mask.begin(), Mask.end());
- // The leaf has less number of instructions - need to find the true order of
- // the root.
- // Scan the nodes starting from the leaf back to the root.
- const TreeEntry *PNode = VectorizableTree.back().get();
- SmallVector<const TreeEntry *, 4> Nodes(1, PNode);
- SmallPtrSet<const TreeEntry *, 4> Visited;
- while (!Nodes.empty() && Order.size() != RootSize) {
- const TreeEntry *PNode = Nodes.pop_back_val();
- if (!Visited.insert(PNode).second)
- continue;
- const TreeEntry &Node = *PNode;
- for (const EdgeInfo &EI : Node.UserTreeIndices)
- if (EI.UserTE)
- Nodes.push_back(EI.UserTE);
- if (Node.ReuseShuffleIndices.empty())
- continue;
- // Build the order for the parent node.
- OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize);
- SmallVector<unsigned, 4> OrderCounter(Order.size(), 0);
- // The algorithm of the order extension is:
- // 1. Calculate the number of the same instructions for the order.
- // 2. Calculate the index of the new order: total number of instructions
- // with order less than the order of the current instruction + reuse
- // number of the current instruction.
- // 3. The new order is just the index of the instruction in the original
- // vector of the instructions.
- for (unsigned I : Node.ReuseShuffleIndices)
- ++OrderCounter[Order[I]];
- SmallVector<unsigned, 4> CurrentCounter(Order.size(), 0);
- for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) {
- unsigned ReusedIdx = Node.ReuseShuffleIndices[I];
- unsigned OrderIdx = Order[ReusedIdx];
- unsigned NewIdx = 0;
- for (unsigned J = 0; J < OrderIdx; ++J)
- NewIdx += OrderCounter[J];
- NewIdx += CurrentCounter[OrderIdx];
- ++CurrentCounter[OrderIdx];
- assert(NewOrder[NewIdx] == RootSize &&
- "The order index should not be written already.");
- NewOrder[NewIdx] = I;
- }
- std::swap(Order, NewOrder);
- }
- assert(Order.size() == RootSize &&
- "Root node is expected or the size of the order must be the same as "
- "the number of elements in the root node.");
- assert(llvm::all_of(Order,
- [RootSize](unsigned Val) { return Val != RootSize; }) &&
- "All indices must be initialized");
- }
+ /// Checks if the specified gather tree entry \p TE can be represented as a
+ /// shuffled vector entry + (possibly) permutation with other gathers. It
+ /// implements the checks only for possibly ordered scalars (Loads,
+ /// ExtractElement, ExtractValue), which can be part of the graph.
+ Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
+
+ /// Reorders the current graph to the most profitable order starting from the
+ /// root node to the leaf nodes. The best order is chosen only from the nodes
+ /// of the same size (vectorization factor). Smaller nodes are considered
+ /// parts of a subgraph with a smaller VF and they are reordered independently.
+ /// We can do this because we still need to extend smaller nodes to the wider VF
+ /// and we can merge reordering shuffles with the widening shuffles.
+ void reorderTopToBottom();
+
+ /// Reorders the current graph to the most profitable order starting from
+ /// leaves to the root. It allows us to rotate small subgraphs and reduce the
+ /// number of reshuffles if the leaf nodes use the same order. In this case we
+ /// can merge the orders and just shuffle the user node instead of shuffling its
+ /// operands. Moreover, even if the leaf nodes have different orders, it allows
+ /// us to sink the reordering in the graph closer to the root node and merge it
+ /// later during analysis.
+ void reorderBottomToTop(bool IgnoreReorder = false);
/// \return The vector element size in bits to use when vectorizing the
/// expression tree ending at \p V. If V is a store, the size is the width of
@@ -793,6 +818,10 @@ public:
return MinVecRegSize;
}
+ unsigned getMinVF(unsigned Sz) const {
+ return std::max(2U, getMinVecRegSize() / Sz);
+ }
+
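// Worked example for getMinVF above (hypothetical numbers): with a minimum
// vector register width of 128 bits and 32-bit scalars,
// getMinVF(32) == std::max(2U, 128U / 32U) == 4, i.e. SLP starts from vectors
// of at least four such scalars.
static_assert((128U / 32U > 2U ? 128U / 32U : 2U) == 4,
              "illustrative arithmetic only");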
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
@@ -809,7 +838,7 @@ public:
/// \returns True if the VectorizableTree is both tiny and not fully
/// vectorizable. We do not vectorize such trees.
- bool isTreeTinyAndNotFullyVectorizable() const;
+ bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
/// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
/// can be load combined in the backend. Load combining may not be allowed in
@@ -1578,10 +1607,12 @@ private:
Value *vectorizeTree(ArrayRef<Value *> VL);
/// \returns the scalarization cost for this type. Scalarization in this
- /// context means the creation of vectors from a group of scalars.
- InstructionCost
- getGatherCost(FixedVectorType *Ty,
- const DenseSet<unsigned> &ShuffledIndices) const;
+ /// context means the creation of vectors from a group of scalars. If \p
+ /// NeedToShuffle is true, the cost of reshuffling some of the vector elements
+ /// is added as well.
+ InstructionCost getGatherCost(FixedVectorType *Ty,
+ const DenseSet<unsigned> &ShuffledIndices,
+ bool NeedToShuffle) const;
/// Checks if the gathered \p VL can be represented as shuffle(s) of previous
/// tree entries.
@@ -1605,7 +1636,7 @@ private:
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even the tree height is tiny.
- bool isFullyVectorizableTinyTree() const;
+ bool isFullyVectorizableTinyTree(bool ForReduction) const;
/// Reorder commutative or alt operands to get better probability of
/// generating vectorized code.
@@ -1621,14 +1652,43 @@ private:
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
- if (VL.size() == Scalars.size())
- return std::equal(VL.begin(), VL.end(), Scalars.begin());
- return VL.size() == ReuseShuffleIndices.size() &&
- std::equal(
- VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
- [this](Value *V, int Idx) { return V == Scalars[Idx]; });
+ auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
+ if (Mask.size() != VL.size() && VL.size() == Scalars.size())
+ return std::equal(VL.begin(), VL.end(), Scalars.begin());
+ return VL.size() == Mask.size() &&
+ std::equal(VL.begin(), VL.end(), Mask.begin(),
+ [Scalars](Value *V, int Idx) {
+ return (isa<UndefValue>(V) &&
+ Idx == UndefMaskElem) ||
+ (Idx != UndefMaskElem && V == Scalars[Idx]);
+ });
+ };
+ if (!ReorderIndices.empty()) {
+ // TODO: implement matching if the nodes are just reordered, still can
+ // treat the vector as the same if the list of scalars matches VL
+ // directly, without reordering.
+ SmallVector<int> Mask;
+ inversePermutation(ReorderIndices, Mask);
+ if (VL.size() == Scalars.size())
+ return IsSame(Scalars, Mask);
+ if (VL.size() == ReuseShuffleIndices.size()) {
+ ::addMask(Mask, ReuseShuffleIndices);
+ return IsSame(Scalars, Mask);
+ }
+ return false;
+ }
+ return IsSame(Scalars, ReuseShuffleIndices);
}
+ /// \return Final vectorization factor for the node. Defined by the total
+ /// number of vectorized scalars, including those used several times in the
+ /// entry and counted in \a ReuseShuffleIndices, if any.
+ unsigned getVectorFactor() const {
+ if (!ReuseShuffleIndices.empty())
+ return ReuseShuffleIndices.size();
+ return Scalars.size();
+ };
+
/// A vector of scalars.
ValueList Scalars;
@@ -1701,6 +1761,12 @@ private:
}
}
+ /// Reorders operands of the node to the given mask \p Mask.
+ void reorderOperands(ArrayRef<int> Mask) {
+ for (ValueList &Operand : Operands)
+ reorderScalars(Operand, Mask);
+ }
+
/// \returns the \p OpIdx operand of this TreeEntry.
ValueList &getOperand(unsigned OpIdx) {
assert(OpIdx < Operands.size() && "Off bounds");
@@ -1760,19 +1826,14 @@ private:
return AltOp ? AltOp->getOpcode() : 0;
}
- /// Update operations state of this entry if reorder occurred.
- bool updateStateIfReorder() {
- if (ReorderIndices.empty())
- return false;
- InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front());
- setOperations(S);
- return true;
- }
- /// When ReuseShuffleIndices is empty it just returns position of \p V
- /// within vector of Scalars. Otherwise, try to remap on its reuse index.
+ /// When ReuseShuffleIndices and ReorderIndices are empty it just returns the
+ /// position of \p V within the vector of Scalars. Otherwise, tries to remap
+ /// \p V through the reorder and reuse indices.
int findLaneForValue(Value *V) const {
unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
+ if (!ReorderIndices.empty())
+ FoundLane = ReorderIndices[FoundLane];
+ assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
if (!ReuseShuffleIndices.empty()) {
FoundLane = std::distance(ReuseShuffleIndices.begin(),
find(ReuseShuffleIndices, FoundLane));
@@ -1856,7 +1917,7 @@ private:
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
const InstructionsState &S,
const EdgeInfo &UserTreeIdx,
- ArrayRef<unsigned> ReuseShuffleIndices = None,
+ ArrayRef<int> ReuseShuffleIndices = None,
ArrayRef<unsigned> ReorderIndices = None) {
TreeEntry::EntryState EntryState =
Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
@@ -1869,7 +1930,7 @@ private:
Optional<ScheduleData *> Bundle,
const InstructionsState &S,
const EdgeInfo &UserTreeIdx,
- ArrayRef<unsigned> ReuseShuffleIndices = None,
+ ArrayRef<int> ReuseShuffleIndices = None,
ArrayRef<unsigned> ReorderIndices = None) {
assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
(Bundle && EntryState != TreeEntry::NeedToGather)) &&
@@ -1877,12 +1938,25 @@ private:
VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
TreeEntry *Last = VectorizableTree.back().get();
Last->Idx = VectorizableTree.size() - 1;
- Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->State = EntryState;
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
ReuseShuffleIndices.end());
- Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
- Last->setOperations(S);
+ if (ReorderIndices.empty()) {
+ Last->Scalars.assign(VL.begin(), VL.end());
+ Last->setOperations(S);
+ } else {
+ // Reorder scalars and build final mask.
+ Last->Scalars.assign(VL.size(), nullptr);
+ transform(ReorderIndices, Last->Scalars.begin(),
+ [VL](unsigned Idx) -> Value * {
+ if (Idx >= VL.size())
+ return UndefValue::get(VL.front()->getType());
+ return VL[Idx];
+ });
+ InstructionsState S = getSameOpcode(Last->Scalars);
+ Last->setOperations(S);
+ Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
+ }
if (Last->State != TreeEntry::NeedToGather) {
for (Value *V : VL) {
assert(!getTreeEntry(V) && "Scalar already in tree!");
@@ -1965,12 +2039,9 @@ private:
if (result.hasValue()) {
return result.getValue();
}
- MemoryLocation Loc2 = getLocation(Inst2, AA);
bool aliased = true;
- if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
- // Do the alias check.
- aliased = !AA->isNoAlias(Loc1, Loc2);
- }
+ if (Loc1.Ptr && isSimple(Inst1))
+ aliased = isModOrRefSet(AA->getModRefInfo(Inst2, Loc1));
// Store the result in the cache.
result = aliased;
return aliased;
@@ -2434,14 +2505,6 @@ private:
}
};
- /// Contains orders of operations along with the number of bundles that have
- /// operations in this order. It stores only those orders that require
- /// reordering, if reordering is not required it is counted using \a
- /// NumOpsWantToKeepOriginalOrder.
- DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
- /// Number of bundles that do not require reordering.
- unsigned NumOpsWantToKeepOriginalOrder = 0;
-
// Analysis and block reference.
Function *F;
ScalarEvolution *SE;
@@ -2540,10 +2603,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
std::string Str;
raw_string_ostream OS(Str);
- if (isSplat(Entry->Scalars)) {
- OS << "<splat> " << *Entry->Scalars[0];
- return Str;
- }
+ if (isSplat(Entry->Scalars))
+ OS << "<splat> ";
for (auto V : Entry->Scalars) {
OS << *V;
if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
@@ -2594,21 +2655,539 @@ void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
};
}
-void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
- ArrayRef<Value *> UserIgnoreLst) {
- ExtraValueToDebugLocsMap ExternallyUsedValues;
- buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
+/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
+/// contains the original mask for the scalars reused in the node. The
+/// procedure transforms this mask in accordance with the given \p Mask.
+static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
+ assert(!Mask.empty() && Reuses.size() == Mask.size() &&
+ "Expected non-empty mask.");
+ SmallVector<int> Prev(Reuses.begin(), Reuses.end());
+ Prev.swap(Reuses);
+ for (unsigned I = 0, E = Prev.size(); I < E; ++I)
+ if (Mask[I] != UndefMaskElem)
+ Reuses[Mask[I]] = Prev[I];
}
-void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
- ExtraValueToDebugLocsMap &ExternallyUsedValues,
- ArrayRef<Value *> UserIgnoreLst) {
- deleteTree();
- UserIgnoreList = UserIgnoreLst;
- if (!allSameType(Roots))
+/// Reorders the given \p Order according to the given \p Mask. \p Order is
+/// the original order of the scalars. The procedure transforms the provided order
+/// in accordance with the given \p Mask. If the resulting \p Order is just an
+/// identity order, \p Order is cleared.
+static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
+ assert(!Mask.empty() && "Expected non-empty mask.");
+ SmallVector<int> MaskOrder;
+ if (Order.empty()) {
+ MaskOrder.resize(Mask.size());
+ std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
+ } else {
+ inversePermutation(Order, MaskOrder);
+ }
+ reorderReuses(MaskOrder, Mask);
+ if (ShuffleVectorInst::isIdentityMask(MaskOrder)) {
+ Order.clear();
return;
- buildTree_rec(Roots, 0, EdgeInfo());
+ }
+ Order.assign(Mask.size(), Mask.size());
+ for (unsigned I = 0, E = Mask.size(); I < E; ++I)
+ if (MaskOrder[I] != UndefMaskElem)
+ Order[MaskOrder[I]] = I;
+ fixupOrderingIndices(Order);
+}
+
+Optional<BoUpSLP::OrdersType>
+BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
+ assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
+ unsigned NumScalars = TE.Scalars.size();
+ OrdersType CurrentOrder(NumScalars, NumScalars);
+ SmallVector<int> Positions;
+ SmallBitVector UsedPositions(NumScalars);
+ const TreeEntry *STE = nullptr;
+ // Try to find all gathered scalars that are vectorized in another vectorized
+ // node. Only a single tree vector node is allowed here, so that the order of
+ // the gathered scalars can be identified correctly.
+ for (unsigned I = 0; I < NumScalars; ++I) {
+ Value *V = TE.Scalars[I];
+ if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
+ continue;
+ if (const auto *LocalSTE = getTreeEntry(V)) {
+ if (!STE)
+ STE = LocalSTE;
+ else if (STE != LocalSTE)
+ // Take the order only from the single vector node.
+ return None;
+ unsigned Lane =
+ std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
+ if (Lane >= NumScalars)
+ return None;
+ if (CurrentOrder[Lane] != NumScalars) {
+ if (Lane != I)
+ continue;
+ UsedPositions.reset(CurrentOrder[Lane]);
+ }
+ // The partial identity (where only some elements of the gather node are
+ // in the identity order) is good.
+ CurrentOrder[Lane] = I;
+ UsedPositions.set(I);
+ }
+ }
+ // Need to keep the order if we have a vector entry and at least 2 scalars or
+ // the vectorized entry has just 2 scalars.
+ if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
+ auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
+ for (unsigned I = 0; I < NumScalars; ++I)
+ if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
+ return false;
+ return true;
+ };
+ if (IsIdentityOrder(CurrentOrder)) {
+ CurrentOrder.clear();
+ return CurrentOrder;
+ }
+ auto *It = CurrentOrder.begin();
+ for (unsigned I = 0; I < NumScalars;) {
+ if (UsedPositions.test(I)) {
+ ++I;
+ continue;
+ }
+ if (*It == NumScalars) {
+ *It = I;
+ ++I;
+ }
+ ++It;
+ }
+ return CurrentOrder;
+ }
+ return None;
+}
+void BoUpSLP::reorderTopToBottom() {
+ // Maps VF to the graph nodes.
+ DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries;
+ // ExtractElement gather nodes which can be vectorized and need to handle
+ // their ordering.
+ DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
+ // Find all reorderable nodes with the given VF.
+ // Currently these are vectorized loads, extracts + some gathering of extracts.
+ for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders](
+ const std::unique_ptr<TreeEntry> &TE) {
+ // No need to reorder if we need to shuffle reuses; the node still has to be
+ // shuffled anyway.
+ if (!TE->ReuseShuffleIndices.empty())
+ return;
+ if (TE->State == TreeEntry::Vectorize &&
+ isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
+ InsertElementInst>(TE->getMainOp()) &&
+ !TE->isAltShuffle()) {
+ VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+ return;
+ }
+ if (TE->State == TreeEntry::NeedToGather) {
+ if (TE->getOpcode() == Instruction::ExtractElement &&
+ !TE->isAltShuffle() &&
+ isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
+ ->getVectorOperandType()) &&
+ allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
+ // Check that gather of extractelements can be represented as
+ // just a shuffle of a single vector.
+ OrdersType CurrentOrder;
+ bool Reuse =
+ canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
+ if (Reuse || !CurrentOrder.empty()) {
+ VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+ GathersToOrders.try_emplace(TE.get(), CurrentOrder);
+ return;
+ }
+ }
+ if (Optional<OrdersType> CurrentOrder =
+ findReusedOrderedScalars(*TE.get())) {
+ VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+ GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
+ }
+ }
+ });
+
+ // Reorder the graph nodes according to their vectorization factor.
+ for (unsigned VF = VectorizableTree.front()->Scalars.size(); VF > 1;
+ VF /= 2) {
+ auto It = VFToOrderedEntries.find(VF);
+ if (It == VFToOrderedEntries.end())
+ continue;
+ // Try to find the most profitable order. We are just looking for the most
+ // frequently used order and reorder the scalar elements in the nodes
+ // according to it.
+ const SmallPtrSetImpl<TreeEntry *> &OrderedEntries = It->getSecond();
+ // All operands are reordered and used only in this node - propagate the
+ // most used order to the user node.
+ MapVector<OrdersType, unsigned,
+ DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
+ OrdersUses;
+ SmallPtrSet<const TreeEntry *, 4> VisitedOps;
+ for (const TreeEntry *OpTE : OrderedEntries) {
+ // No need to reorder these nodes; we still need to extend and to use a
+ // shuffle, just merge the reordering shuffle and the reuse shuffle.
+ if (!OpTE->ReuseShuffleIndices.empty())
+ continue;
+ // Count the number of uses of each order.
+ const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
+ if (OpTE->State == TreeEntry::NeedToGather)
+ return GathersToOrders.find(OpTE)->second;
+ return OpTE->ReorderIndices;
+ }();
+ // Stores actually store the mask, not the order; we need to invert it.
+ if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
+ OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
+ SmallVector<int> Mask;
+ inversePermutation(Order, Mask);
+ unsigned E = Order.size();
+ OrdersType CurrentOrder(E, E);
+ transform(Mask, CurrentOrder.begin(), [E](int Idx) {
+ return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
+ });
+ fixupOrderingIndices(CurrentOrder);
+ ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
+ } else {
+ ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
+ }
+ }
+ // Set order of the user node.
+ if (OrdersUses.empty())
+ continue;
+ // Choose the most used order.
+ ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
+ unsigned Cnt = OrdersUses.front().second;
+ for (const auto &Pair : drop_begin(OrdersUses)) {
+ if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
+ BestOrder = Pair.first;
+ Cnt = Pair.second;
+ }
+ }
+ // Set order of the user node.
+ if (BestOrder.empty())
+ continue;
+ SmallVector<int> Mask;
+ inversePermutation(BestOrder, Mask);
+ SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
+ unsigned E = BestOrder.size();
+ transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
+ return I < E ? static_cast<int>(I) : UndefMaskElem;
+ });
+ // Do an actual reordering, if profitable.
+ for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ // Just do the reordering for the nodes with the given VF.
+ if (TE->Scalars.size() != VF) {
+ if (TE->ReuseShuffleIndices.size() == VF) {
+ // Need to reorder the reuses masks of the operands with smaller VF to
+ // be able to find the match between the graph nodes and scalar
+ // operands of the given node during vectorization/cost estimation.
+ assert(all_of(TE->UserTreeIndices,
+ [VF, &TE](const EdgeInfo &EI) {
+ return EI.UserTE->Scalars.size() == VF ||
+ EI.UserTE->Scalars.size() ==
+ TE->Scalars.size();
+ }) &&
+ "All users must be of VF size.");
+ // Update ordering of the operands with the smaller VF than the given
+ // one.
+ reorderReuses(TE->ReuseShuffleIndices, Mask);
+ }
+ continue;
+ }
+ if (TE->State == TreeEntry::Vectorize &&
+ isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
+ InsertElementInst>(TE->getMainOp()) &&
+ !TE->isAltShuffle()) {
+ // Build correct orders for extract{element,value}, loads and
+ // stores.
+ reorderOrder(TE->ReorderIndices, Mask);
+ if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
+ TE->reorderOperands(Mask);
+ } else {
+ // Reorder the node and its operands.
+ TE->reorderOperands(Mask);
+ assert(TE->ReorderIndices.empty() &&
+ "Expected empty reorder sequence.");
+ reorderScalars(TE->Scalars, Mask);
+ }
+ if (!TE->ReuseShuffleIndices.empty()) {
+ // Apply reversed order to keep the original ordering of the reused
+ // elements to avoid extra reorder indices shuffling.
+ OrdersType CurrentOrder;
+ reorderOrder(CurrentOrder, MaskOrder);
+ SmallVector<int> NewReuses;
+ inversePermutation(CurrentOrder, NewReuses);
+ addMask(NewReuses, TE->ReuseShuffleIndices);
+ TE->ReuseShuffleIndices.swap(NewReuses);
+ }
+ }
+ }
+}
+
+void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
+ SetVector<TreeEntry *> OrderedEntries;
+ DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
+ // Find all reorderable leaf nodes with the given VF.
+ // Currently these are vectorized loads, extracts without alternate operands +
+ // some gathering of extracts.
+ SmallVector<TreeEntry *> NonVectorized;
+ for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders,
+ &NonVectorized](
+ const std::unique_ptr<TreeEntry> &TE) {
+ if (TE->State != TreeEntry::Vectorize)
+ NonVectorized.push_back(TE.get());
+ // No need to reorder if we need to shuffle reuses; the node still has to be
+ // shuffled anyway.
+ if (!TE->ReuseShuffleIndices.empty())
+ return;
+ if (TE->State == TreeEntry::Vectorize &&
+ isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE->getMainOp()) &&
+ !TE->isAltShuffle()) {
+ OrderedEntries.insert(TE.get());
+ return;
+ }
+ if (TE->State == TreeEntry::NeedToGather) {
+ if (TE->getOpcode() == Instruction::ExtractElement &&
+ !TE->isAltShuffle() &&
+ isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
+ ->getVectorOperandType()) &&
+ allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
+ // Check that gather of extractelements can be represented as
+ // just a shuffle of a single vector with a single user only.
+ OrdersType CurrentOrder;
+ bool Reuse =
+ canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
+ if ((Reuse || !CurrentOrder.empty()) &&
+ !any_of(VectorizableTree,
+ [&TE](const std::unique_ptr<TreeEntry> &Entry) {
+ return Entry->State == TreeEntry::NeedToGather &&
+ Entry.get() != TE.get() &&
+ Entry->isSame(TE->Scalars);
+ })) {
+ OrderedEntries.insert(TE.get());
+ GathersToOrders.try_emplace(TE.get(), CurrentOrder);
+ return;
+ }
+ }
+ if (Optional<OrdersType> CurrentOrder =
+ findReusedOrderedScalars(*TE.get())) {
+ OrderedEntries.insert(TE.get());
+ GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
+ }
+ }
+ });
+
+ // Checks if the operands of the users are reorderable and have only a
+ // single use.
+ auto &&CheckOperands =
+ [this, &NonVectorized](const auto &Data,
+ SmallVectorImpl<TreeEntry *> &GatherOps) {
+ for (unsigned I = 0, E = Data.first->getNumOperands(); I < E; ++I) {
+ if (any_of(Data.second,
+ [I](const std::pair<unsigned, TreeEntry *> &OpData) {
+ return OpData.first == I &&
+ OpData.second->State == TreeEntry::Vectorize;
+ }))
+ continue;
+ ArrayRef<Value *> VL = Data.first->getOperand(I);
+ const TreeEntry *TE = nullptr;
+ const auto *It = find_if(VL, [this, &TE](Value *V) {
+ TE = getTreeEntry(V);
+ return TE;
+ });
+ if (It != VL.end() && TE->isSame(VL))
+ return false;
+ TreeEntry *Gather = nullptr;
+ if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) {
+ assert(TE->State != TreeEntry::Vectorize &&
+ "Only non-vectorized nodes are expected.");
+ if (TE->isSame(VL)) {
+ Gather = TE;
+ return true;
+ }
+ return false;
+ }) > 1)
+ return false;
+ if (Gather)
+ GatherOps.push_back(Gather);
+ }
+ return true;
+ };
+ // 1. Propagate the order to the graph nodes that use only reordered nodes.
+ // I.e., if the node has operands that are reordered, try to put at least one
+ // operand in its natural order and reorder the others + reorder the user
+ // node itself.
+ SmallPtrSet<const TreeEntry *, 4> Visited;
+ while (!OrderedEntries.empty()) {
+ // 1. Filter out only reordered nodes.
+ // 2. If the entry has multiple uses - skip it and jump to the next node.
+ MapVector<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
+ SmallVector<TreeEntry *> Filtered;
+ for (TreeEntry *TE : OrderedEntries) {
+ if (!(TE->State == TreeEntry::Vectorize ||
+ (TE->State == TreeEntry::NeedToGather &&
+ GathersToOrders.count(TE))) ||
+ TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
+ !all_of(drop_begin(TE->UserTreeIndices),
+ [TE](const EdgeInfo &EI) {
+ return EI.UserTE == TE->UserTreeIndices.front().UserTE;
+ }) ||
+ !Visited.insert(TE).second) {
+ Filtered.push_back(TE);
+ continue;
+ }
+      // Build a map between user nodes and their operand positions to speed up
+      // the search. The graph currently does not provide this dependency
+      // directly.
+ for (EdgeInfo &EI : TE->UserTreeIndices) {
+ TreeEntry *UserTE = EI.UserTE;
+ auto It = Users.find(UserTE);
+ if (It == Users.end())
+ It = Users.insert({UserTE, {}}).first;
+ It->second.emplace_back(EI.EdgeIdx, TE);
+ }
+ }
+ // Erase filtered entries.
+ for_each(Filtered,
+ [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
+ for (const auto &Data : Users) {
+ // Check that operands are used only in the User node.
+ SmallVector<TreeEntry *> GatherOps;
+ if (!CheckOperands(Data, GatherOps)) {
+ for_each(Data.second,
+ [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
+ OrderedEntries.remove(Op.second);
+ });
+ continue;
+ }
+ // All operands are reordered and used only in this node - propagate the
+ // most used order to the user node.
+ MapVector<OrdersType, unsigned,
+ DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
+ OrdersUses;
+ SmallPtrSet<const TreeEntry *, 4> VisitedOps;
+ for (const auto &Op : Data.second) {
+ TreeEntry *OpTE = Op.second;
+ if (!OpTE->ReuseShuffleIndices.empty() ||
+ (IgnoreReorder && OpTE == VectorizableTree.front().get()))
+ continue;
+ const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
+ if (OpTE->State == TreeEntry::NeedToGather)
+ return GathersToOrders.find(OpTE)->second;
+ return OpTE->ReorderIndices;
+ }();
+        // Stores actually keep the mask rather than the order - it needs to be
+        // inverted here.
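+        // E.g. a stored mask of {1, 2, 0} is converted back to the order
+        // {2, 0, 1} below (illustrative).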
+ if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
+ OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
+ SmallVector<int> Mask;
+ inversePermutation(Order, Mask);
+ unsigned E = Order.size();
+ OrdersType CurrentOrder(E, E);
+ transform(Mask, CurrentOrder.begin(), [E](int Idx) {
+ return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
+ });
+ fixupOrderingIndices(CurrentOrder);
+ ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
+ } else {
+ ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
+ }
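+        // Count the remaining uses of this operand outside the current user
+        // node as votes for keeping the natural (empty) order.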
+ if (VisitedOps.insert(OpTE).second)
+ OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
+ OpTE->UserTreeIndices.size();
+ assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0.");
+ --OrdersUses[{}];
+ }
+      // If there are no orders - skip the current nodes and jump to the next
+      // ones, if any.
+ if (OrdersUses.empty()) {
+ for_each(Data.second,
+ [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
+ OrderedEntries.remove(Op.second);
+ });
+ continue;
+ }
+ // Choose the best order.
+ ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
+ unsigned Cnt = OrdersUses.front().second;
+ for (const auto &Pair : drop_begin(OrdersUses)) {
+ if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
+ BestOrder = Pair.first;
+ Cnt = Pair.second;
+ }
+ }
+ // Set order of the user node (reordering of operands and user nodes).
+ if (BestOrder.empty()) {
+ for_each(Data.second,
+ [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
+ OrderedEntries.remove(Op.second);
+ });
+ continue;
+ }
+ // Erase operands from OrderedEntries list and adjust their orders.
+ VisitedOps.clear();
+ SmallVector<int> Mask;
+ inversePermutation(BestOrder, Mask);
+ SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
+ unsigned E = BestOrder.size();
+ transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
+ return I < E ? static_cast<int>(I) : UndefMaskElem;
+ });
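+      // E.g. with BestOrder {2, 0, 1}, Mask is {1, 2, 0} (applied to scalars
+      // and reuses) and MaskOrder is {2, 0, 1} (applied to orders) -
+      // illustrative.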
+ for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
+ TreeEntry *TE = Op.second;
+ OrderedEntries.remove(TE);
+ if (!VisitedOps.insert(TE).second)
+ continue;
+ if (!TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) {
+ // Just reorder reuses indices.
+ reorderReuses(TE->ReuseShuffleIndices, Mask);
+ continue;
+ }
+ // Gathers are processed separately.
+ if (TE->State != TreeEntry::Vectorize)
+ continue;
+ assert((BestOrder.size() == TE->ReorderIndices.size() ||
+ TE->ReorderIndices.empty()) &&
+ "Non-matching sizes of user/operand entries.");
+ reorderOrder(TE->ReorderIndices, Mask);
+ }
+      // For gathers we just need to reorder their scalars.
+ for (TreeEntry *Gather : GatherOps) {
+ assert(Gather->ReorderIndices.empty() &&
+ "Unexpected reordering of gathers.");
+ if (!Gather->ReuseShuffleIndices.empty()) {
+ // Just reorder reuses indices.
+ reorderReuses(Gather->ReuseShuffleIndices, Mask);
+ continue;
+ }
+ reorderScalars(Gather->Scalars, Mask);
+ OrderedEntries.remove(Gather);
+ }
+ // Reorder operands of the user node and set the ordering for the user
+ // node itself.
+ if (Data.first->State != TreeEntry::Vectorize ||
+ !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
+ Data.first->getMainOp()) ||
+ Data.first->isAltShuffle())
+ Data.first->reorderOperands(Mask);
+ if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
+ Data.first->isAltShuffle()) {
+ reorderScalars(Data.first->Scalars, Mask);
+ reorderOrder(Data.first->ReorderIndices, MaskOrder);
+ if (Data.first->ReuseShuffleIndices.empty() &&
+ !Data.first->ReorderIndices.empty() &&
+ !Data.first->isAltShuffle()) {
+          // Insert the user node into the list to try to sink the reordering
+          // deeper in the graph.
+ OrderedEntries.insert(Data.first);
+ }
+ } else {
+ reorderOrder(Data.first->ReorderIndices, Mask);
+ }
+ }
+ }
+ // If the reordering is unnecessary, just remove the reorder.
+ if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
+ VectorizableTree.front()->ReuseShuffleIndices.empty())
+ VectorizableTree.front()->ReorderIndices.clear();
+}
+
+void BoUpSLP::buildExternalUses(
+ const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
// Collect the values that we need to extract from the tree.
for (auto &TEPtr : VectorizableTree) {
TreeEntry *Entry = TEPtr.get();
@@ -2636,6 +3215,9 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
if (!UserInst)
continue;
+ if (isDeleted(UserInst))
+ continue;
+
// Skip in-tree scalars that become vectors
if (TreeEntry *UseEntry = getTreeEntry(U)) {
Value *UseScalar = UseEntry->Scalars[0];
@@ -2664,14 +3246,120 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
}
}
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+ ArrayRef<Value *> UserIgnoreLst) {
+ deleteTree();
+ UserIgnoreList = UserIgnoreLst;
+ if (!allSameType(Roots))
+ return;
+ buildTree_rec(Roots, 0, EdgeInfo());
+}
+
+namespace {
+/// Describes how the loads in the given sequence can be represented.
+enum class LoadsState { Gather, Vectorize, ScatterVectorize };
+} // anonymous namespace
+
+/// Checks whether the given array of loads can be represented as a vectorized
+/// load, a scatter-vectorized load or just a simple gather.
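+/// E.g. consecutive loads of a[0..3] -> Vectorize; loads with computable but
+/// non-consecutive pointer offsets and legal masked gathers -> ScatterVectorize;
+/// otherwise Gather (illustrative).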
+static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
+ const TargetTransformInfo &TTI,
+ const DataLayout &DL, ScalarEvolution &SE,
+ SmallVectorImpl<unsigned> &Order,
+ SmallVectorImpl<Value *> &PointerOps) {
+ // Check that a vectorized load would load the same memory as a scalar
+ // load. For example, we don't want to vectorize loads that are smaller
+ // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
+ // treats loading/storing it as an i8 struct. If we vectorize loads/stores
+ // from such a struct, we read/write packed bits disagreeing with the
+ // unvectorized version.
+ Type *ScalarTy = VL0->getType();
+
+ if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
+ return LoadsState::Gather;
+
+ // Make sure all loads in the bundle are simple - we can't vectorize
+ // atomic or volatile loads.
+ PointerOps.clear();
+ PointerOps.resize(VL.size());
+ auto *POIter = PointerOps.begin();
+ for (Value *V : VL) {
+ auto *L = cast<LoadInst>(V);
+ if (!L->isSimple())
+ return LoadsState::Gather;
+ *POIter = L->getPointerOperand();
+ ++POIter;
+ }
+
+ Order.clear();
+ // Check the order of pointer operands.
+ if (llvm::sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order)) {
+ Value *Ptr0;
+ Value *PtrN;
+ if (Order.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
+ } else {
+ Ptr0 = PointerOps[Order.front()];
+ PtrN = PointerOps[Order.back()];
+ }
+ Optional<int> Diff =
+ getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
+ // Check that the sorted loads are consecutive.
+ if (static_cast<unsigned>(*Diff) == VL.size() - 1)
+ return LoadsState::Vectorize;
+ Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
+ for (Value *V : VL)
+ CommonAlignment =
+ commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
+ if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()),
+ CommonAlignment))
+ return LoadsState::ScatterVectorize;
+ }
+
+ return LoadsState::Gather;
+}
+
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
const EdgeInfo &UserTreeIdx) {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
+ SmallVector<int> ReuseShuffleIndicies;
+ SmallVector<Value *> UniqueValues;
+ auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
+ &UserTreeIdx,
+ this](const InstructionsState &S) {
+ // Check that every instruction appears once in this bundle.
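+    // E.g. VL = {a, b, a, b} yields UniqueValues = {a, b} and
+    // ReuseShuffleIndicies = {0, 1, 0, 1} (illustrative).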
+ DenseMap<Value *, unsigned> UniquePositions;
+ for (Value *V : VL) {
+ auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+ ReuseShuffleIndicies.emplace_back(isa<UndefValue>(V) ? -1
+ : Res.first->second);
+ if (Res.second)
+ UniqueValues.emplace_back(V);
+ }
+ size_t NumUniqueScalarValues = UniqueValues.size();
+ if (NumUniqueScalarValues == VL.size()) {
+ ReuseShuffleIndicies.clear();
+ } else {
+ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
+ if (NumUniqueScalarValues <= 1 ||
+ !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
+ LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return false;
+ }
+ VL = UniqueValues;
+ }
+ return true;
+ };
+
InstructionsState S = getSameOpcode(VL);
if (Depth == RecursionMaxDepth) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
@@ -2680,7 +3368,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
isa<ScalableVectorType>(
cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
@@ -2700,9 +3390,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
// If all of the operands are identical or constant we have a simple solution.
- if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
+ // If we deal with insert/extract instructions, they all must have constant
+ // indices, otherwise we should gather them, not try to vectorize.
+ if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode() ||
+ (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(S.MainOp) &&
+ !all_of(VL, isVectorLikeInstWithConstOps))) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
@@ -2724,7 +3420,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
if (!E->isSame(VL)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
// Record the reuse of the tree node. FIXME, currently this is only used to
@@ -2743,7 +3441,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (getTreeEntry(I)) {
LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
<< ") is already in tree.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
}
@@ -2754,7 +3454,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *V : VL) {
if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ if (TryToFindDuplicates(S))
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
return;
}
}
@@ -2773,28 +3475,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
// Check that every instruction appears once in this bundle.
- SmallVector<unsigned, 4> ReuseShuffleIndicies;
- SmallVector<Value *, 4> UniqueValues;
- DenseMap<Value *, unsigned> UniquePositions;
- for (Value *V : VL) {
- auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
- ReuseShuffleIndicies.emplace_back(Res.first->second);
- if (Res.second)
- UniqueValues.emplace_back(V);
- }
- size_t NumUniqueScalarValues = UniqueValues.size();
- if (NumUniqueScalarValues == VL.size()) {
- ReuseShuffleIndicies.clear();
- } else {
- LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
- if (NumUniqueScalarValues <= 1 ||
- !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
- LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
- VL = UniqueValues;
- }
+ if (!TryToFindDuplicates(S))
+ return;
auto &BSRef = BlocksSchedules[BB];
if (!BSRef)
@@ -2867,7 +3549,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
if (Reuse) {
LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
- ++NumOpsWantToKeepOriginalOrder;
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
// This is a special case, as it does not gather, but at the same time
@@ -2885,12 +3566,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
dbgs() << " " << Idx;
dbgs() << "\n";
});
+ fixupOrderingIndices(CurrentOrder);
// Insert new order with initial value 0, if it does not exist,
// otherwise return the iterator to the existing one.
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies, CurrentOrder);
- findRootOrder(CurrentOrder);
- ++NumOpsWantToKeepOrder[CurrentOrder];
// This is a special case, as it does not gather, but at the same time
// we are not extending buildTree_rec() towards the operands.
ValueList Op0;
@@ -2910,8 +3590,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Check that we have a buildvector and not a shuffle of 2 or more
// different vectors.
ValueSet SourceVectors;
- for (Value *V : VL)
+ int MinIdx = std::numeric_limits<int>::max();
+ for (Value *V : VL) {
SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
+ Optional<int> Idx = *getInsertIndex(V, 0);
+ if (!Idx || *Idx == UndefMaskElem)
+ continue;
+ MinIdx = std::min(MinIdx, *Idx);
+ }
if (count_if(VL, [&SourceVectors](Value *V) {
return !SourceVectors.contains(V);
@@ -2919,13 +3605,35 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Found 2nd source vector - cancel.
LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
"different source vectors.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
BS.cancelScheduling(VL, VL0);
return;
}
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx);
+ auto OrdCompare = [](const std::pair<int, int> &P1,
+ const std::pair<int, int> &P2) {
+ return P1.first > P2.first;
+ };
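+      // The min-heap pops the smallest insert lane first, so CurrentOrder maps
+      // each scalar to the rank of its insert index below.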
+ PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
+ decltype(OrdCompare)>
+ Indices(OrdCompare);
+ for (int I = 0, E = VL.size(); I < E; ++I) {
+ Optional<int> Idx = *getInsertIndex(VL[I], 0);
+ if (!Idx || *Idx == UndefMaskElem)
+ continue;
+ Indices.emplace(*Idx, I);
+ }
+ OrdersType CurrentOrder(VL.size(), VL.size());
+ bool IsIdentity = true;
+ for (int I = 0, E = VL.size(); I < E; ++I) {
+ CurrentOrder[Indices.top().second] = I;
+ IsIdentity &= Indices.top().second == I;
+ Indices.pop();
+ }
+ if (IsIdentity)
+ CurrentOrder.clear();
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ None, CurrentOrder);
LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
constexpr int NumOps = 2;
@@ -2936,7 +3644,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TE->setOperand(I, VectorOperands[I]);
}
- buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, 0});
+ buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1});
return;
}
case Instruction::Load: {
@@ -2946,90 +3654,52 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// treats loading/storing it as an i8 struct. If we vectorize loads/stores
// from such a struct, we read/write packed bits disagreeing with the
// unvectorized version.
- Type *ScalarTy = VL0->getType();
-
- if (DL->getTypeSizeInBits(ScalarTy) !=
- DL->getTypeAllocSizeInBits(ScalarTy)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
- return;
- }
-
- // Make sure all loads in the bundle are simple - we can't vectorize
- // atomic or volatile loads.
- SmallVector<Value *, 4> PointerOps(VL.size());
- auto POIter = PointerOps.begin();
- for (Value *V : VL) {
- auto *L = cast<LoadInst>(V);
- if (!L->isSimple()) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
- return;
- }
- *POIter = L->getPointerOperand();
- ++POIter;
- }
-
+ SmallVector<Value *> PointerOps;
OrdersType CurrentOrder;
- // Check the order of pointer operands.
- if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
- Value *Ptr0;
- Value *PtrN;
+ TreeEntry *TE = nullptr;
+ switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, CurrentOrder,
+ PointerOps)) {
+ case LoadsState::Vectorize:
if (CurrentOrder.empty()) {
- Ptr0 = PointerOps.front();
- PtrN = PointerOps.back();
+          // Original loads are consecutive and do not require reordering.
+ TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
} else {
- Ptr0 = PointerOps[CurrentOrder.front()];
- PtrN = PointerOps[CurrentOrder.back()];
- }
- Optional<int> Diff = getPointersDiff(
- ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
- // Check that the sorted loads are consecutive.
- if (static_cast<unsigned>(*Diff) == VL.size() - 1) {
- if (CurrentOrder.empty()) {
- // Original loads are consecutive and does not require reordering.
- ++NumOpsWantToKeepOriginalOrder;
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
- UserTreeIdx, ReuseShuffleIndicies);
- TE->setOperandsInOrder();
- LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
- } else {
- // Need to reorder.
- TreeEntry *TE =
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies, CurrentOrder);
- TE->setOperandsInOrder();
- LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
- findRootOrder(CurrentOrder);
- ++NumOpsWantToKeepOrder[CurrentOrder];
- }
- return;
- }
- Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
- for (Value *V : VL)
- CommonAlignment =
- commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
- if (TTI->isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()),
- CommonAlignment)) {
- // Vectorizing non-consecutive loads with `llvm.masked.gather`.
- TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle,
- S, UserTreeIdx, ReuseShuffleIndicies);
- TE->setOperandsInOrder();
- buildTree_rec(PointerOps, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs()
- << "SLP: added a vector of non-consecutive loads.\n");
- return;
+ fixupOrderingIndices(CurrentOrder);
+ // Need to reorder.
+ TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies, CurrentOrder);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
}
+ TE->setOperandsInOrder();
+ break;
+ case LoadsState::ScatterVectorize:
+ // Vectorizing non-consecutive loads with `llvm.masked.gather`.
+ TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
+ UserTreeIdx, ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
+ buildTree_rec(PointerOps, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
+ break;
+ case LoadsState::Gather:
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+#ifndef NDEBUG
+ Type *ScalarTy = VL0->getType();
+ if (DL->getTypeSizeInBits(ScalarTy) !=
+ DL->getTypeAllocSizeInBits(ScalarTy))
+ LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+ else if (any_of(VL, [](Value *V) {
+ return !cast<LoadInst>(V)->isSimple();
+ }))
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
+ else
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+#endif // NDEBUG
+ break;
}
-
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
return;
}
case Instruction::ZExt:
@@ -3213,15 +3883,40 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
- TE->setOperandsInOrder();
- for (unsigned i = 0, e = 2; i < e; ++i) {
- ValueList Operands;
- // Prepare the operand vector.
- for (Value *V : VL)
- Operands.push_back(cast<Instruction>(V)->getOperand(i));
-
- buildTree_rec(Operands, Depth + 1, {TE, i});
+ SmallVector<ValueList, 2> Operands(2);
+ // Prepare the operand vector for pointer operands.
+ for (Value *V : VL)
+ Operands.front().push_back(
+ cast<GetElementPtrInst>(V)->getPointerOperand());
+ TE->setOperand(0, Operands.front());
+      // Need to cast all indices to the same type before vectorization to
+      // avoid a crash.
+      // This is also required to be able to find correct matches between
+      // different gather nodes and to reuse the vectorized values rather than
+      // trying to gather them again.
+ int IndexIdx = 1;
+ Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
+ Type *Ty = all_of(VL,
+ [VL0Ty, IndexIdx](Value *V) {
+ return VL0Ty == cast<GetElementPtrInst>(V)
+ ->getOperand(IndexIdx)
+ ->getType();
+ })
+ ? VL0Ty
+ : DL->getIndexType(cast<GetElementPtrInst>(VL0)
+ ->getPointerOperandType()
+ ->getScalarType());
+ // Prepare the operand vector.
+ for (Value *V : VL) {
+ auto *Op = cast<Instruction>(V)->getOperand(IndexIdx);
+ auto *CI = cast<ConstantInt>(Op);
+ Operands.back().push_back(ConstantExpr::getIntegerCast(
+ CI, Ty, CI->getValue().isSignBitSet()));
}
+ TE->setOperand(IndexIdx, Operands.back());
+
+ for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
+ buildTree_rec(Operands[I], Depth + 1, {TE, I});
return;
}
case Instruction::Store: {
@@ -3276,21 +3971,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (static_cast<unsigned>(*Dist) == VL.size() - 1) {
if (CurrentOrder.empty()) {
// Original stores are consecutive and does not require reordering.
- ++NumOpsWantToKeepOriginalOrder;
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
UserTreeIdx, ReuseShuffleIndicies);
TE->setOperandsInOrder();
buildTree_rec(Operands, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
} else {
+ fixupOrderingIndices(CurrentOrder);
TreeEntry *TE =
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies, CurrentOrder);
TE->setOperandsInOrder();
buildTree_rec(Operands, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
- findRootOrder(CurrentOrder);
- ++NumOpsWantToKeepOrder[CurrentOrder];
}
return;
}
@@ -3321,7 +4014,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
Function *F = CI->getCalledFunction();
- unsigned NumArgs = CI->getNumArgOperands();
+ unsigned NumArgs = CI->arg_size();
SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
for (unsigned j = 0; j != NumArgs; ++j)
if (hasVectorInstrinsicScalarOpd(ID, j))
@@ -3373,7 +4066,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
TE->setOperandsInOrder();
- for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
+ for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
+        // For scalar operands there is no need to create an entry since they
+        // do not need to be vectorized.
+ if (hasVectorInstrinsicScalarOpd(ID, i))
+ continue;
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL) {
@@ -3548,7 +4245,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
FastMathFlags FMF;
if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
FMF = FPCI->getFastMathFlags();
- SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
+ SmallVector<const Value *> Arguments(CI->args());
IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF,
dyn_cast<IntrinsicInst>(CI));
auto IntrinsicCost =
@@ -3621,25 +4318,42 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
return Cost;
}
-/// Shuffles \p Mask in accordance with the given \p SubMask.
-static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
- if (SubMask.empty())
- return;
- if (Mask.empty()) {
- Mask.append(SubMask.begin(), SubMask.end());
- return;
- }
- SmallVector<int, 4> NewMask(SubMask.size(), SubMask.size());
- int TermValue = std::min(Mask.size(), SubMask.size());
- for (int I = 0, E = SubMask.size(); I < E; ++I) {
- if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
- Mask[SubMask[I]] >= TermValue) {
- NewMask[I] = UndefMaskElem;
- continue;
+/// Builds a shuffle mask for a shuffle graph entry and the lists of main and
+/// alternate operation operands.
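+/// E.g. for VL = {add, sub, add, sub} with the alternate opcode sub and no
+/// reordering/reuses, the resulting mask is {0, 5, 2, 7} (illustrative).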
+static void
+buildSuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,
+ ArrayRef<int> ReusesIndices,
+ const function_ref<bool(Instruction *)> IsAltOp,
+ SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<Value *> *OpScalars = nullptr,
+ SmallVectorImpl<Value *> *AltScalars = nullptr) {
+ unsigned Sz = VL.size();
+ Mask.assign(Sz, UndefMaskElem);
+ SmallVector<int> OrderMask;
+ if (!ReorderIndices.empty())
+ inversePermutation(ReorderIndices, OrderMask);
+ for (unsigned I = 0; I < Sz; ++I) {
+ unsigned Idx = I;
+ if (!ReorderIndices.empty())
+ Idx = OrderMask[I];
+ auto *OpInst = cast<Instruction>(VL[Idx]);
+ if (IsAltOp(OpInst)) {
+ Mask[I] = Sz + Idx;
+ if (AltScalars)
+ AltScalars->push_back(OpInst);
+ } else {
+ Mask[I] = Idx;
+ if (OpScalars)
+ OpScalars->push_back(OpInst);
}
- NewMask[I] = Mask[SubMask[I]];
}
- Mask.swap(NewMask);
+ if (!ReusesIndices.empty()) {
+ SmallVector<int> NewMask(ReusesIndices.size(), UndefMaskElem);
+ transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) {
+ return Idx != UndefMaskElem ? Mask[Idx] : UndefMaskElem;
+ });
+ Mask.swap(NewMask);
+ }
}
InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
@@ -3661,13 +4375,10 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
if (MinBWs.count(VL[0]))
VecTy = FixedVectorType::get(
IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
- auto *FinalVecTy = VecTy;
+ unsigned EntryVF = E->getVectorFactor();
+ auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF);
- unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
- if (NeedToShuffleReuses)
- FinalVecTy =
- FixedVectorType::get(VecTy->getElementType(), ReuseShuffleNumbers);
// FIXME: it tries to fix a problem with MSVC buildbots.
TargetTransformInfo &TTIRef = *TTI;
auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,
@@ -3785,7 +4496,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// shuffle of a single/two vectors the scalars are extracted from.
SmallVector<int> Mask;
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
- isShuffle(VL, Mask);
+ isFixedVectorShuffle(VL, Mask);
if (ShuffleKind.hasValue()) {
// Found the bunch of extractelement instructions that must be gathered
// into a vector and can be represented as a permutation elements in a
@@ -3803,6 +4514,92 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
if (NeedToShuffleReuses)
ReuseShuffleCost = TTI->getShuffleCost(
TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
+      // Improve the gather cost for a gather of loads if we can group some of
+      // the loads into vector loads.
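+      // E.g. for a gather of 8 loads whose first half is consecutive, cost
+      // that half as a single vector load (plus possible subvector-insert
+      // shuffles) and gather only the remaining loads (illustrative).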
+ if (VL.size() > 2 && E->getOpcode() == Instruction::Load &&
+ !E->isAltShuffle()) {
+ BoUpSLP::ValueSet VectorizedLoads;
+ unsigned StartIdx = 0;
+ unsigned VF = VL.size() / 2;
+ unsigned VectorizedCnt = 0;
+ unsigned ScatterVectorizeCnt = 0;
+ const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType());
+ for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
+ for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
+ Cnt += VF) {
+ ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+ if (!VectorizedLoads.count(Slice.front()) &&
+ !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
+ SmallVector<Value *> PointerOps;
+ OrdersType CurrentOrder;
+ LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL,
+ *SE, CurrentOrder, PointerOps);
+ switch (LS) {
+ case LoadsState::Vectorize:
+ case LoadsState::ScatterVectorize:
+ // Mark the vectorized loads so that we don't vectorize them
+ // again.
+ if (LS == LoadsState::Vectorize)
+ ++VectorizedCnt;
+ else
+ ++ScatterVectorizeCnt;
+ VectorizedLoads.insert(Slice.begin(), Slice.end());
+            // If we vectorized the initial block, there is no need to try to
+            // vectorize it again.
+ if (Cnt == StartIdx)
+ StartIdx += VF;
+ break;
+ case LoadsState::Gather:
+ break;
+ }
+ }
+ }
+ // Check if the whole array was vectorized already - exit.
+ if (StartIdx >= VL.size())
+ break;
+ // Found vectorizable parts - exit.
+ if (!VectorizedLoads.empty())
+ break;
+ }
+ if (!VectorizedLoads.empty()) {
+ InstructionCost GatherCost = 0;
+ unsigned NumParts = TTI->getNumberOfParts(VecTy);
+ bool NeedInsertSubvectorAnalysis =
+ !NumParts || (VL.size() / VF) > NumParts;
+ // Get the cost for gathered loads.
+ for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
+ if (VectorizedLoads.contains(VL[I]))
+ continue;
+ GatherCost += getGatherCost(VL.slice(I, VF));
+ }
+ // The cost for vectorized loads.
+ InstructionCost ScalarsCost = 0;
+ for (Value *V : VectorizedLoads) {
+ auto *LI = cast<LoadInst>(V);
+ ScalarsCost += TTI->getMemoryOpCost(
+ Instruction::Load, LI->getType(), LI->getAlign(),
+ LI->getPointerAddressSpace(), CostKind, LI);
+ }
+ auto *LI = cast<LoadInst>(E->getMainOp());
+ auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
+ Align Alignment = LI->getAlign();
+ GatherCost +=
+ VectorizedCnt *
+ TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+ LI->getPointerAddressSpace(), CostKind, LI);
+ GatherCost += ScatterVectorizeCnt *
+ TTI->getGatherScatterOpCost(
+ Instruction::Load, LoadTy, LI->getPointerOperand(),
+ /*VariableMask=*/false, Alignment, CostKind, LI);
+ if (NeedInsertSubvectorAnalysis) {
+          // Add the cost for the subvector inserts.
+ for (int I = VF, E = VL.size(); I < E; I += VF)
+ GatherCost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy,
+ None, I, LoadTy);
+ }
+ return ReuseShuffleCost + GatherCost - ScalarsCost;
+ }
+ }
return ReuseShuffleCost + getGatherCost(VL);
}
InstructionCost CommonCost = 0;
@@ -3852,7 +4649,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
++Idx;
}
}
- Idx = ReuseShuffleNumbers;
+ Idx = EntryVF;
for (Value *V : VL) {
if (ShuffleOrOp == Instruction::ExtractElement) {
auto *EE = cast<ExtractElementInst>(V);
@@ -3895,29 +4692,33 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
return CommonCost;
}
case Instruction::InsertElement: {
+ assert(E->ReuseShuffleIndices.empty() &&
+ "Unique insertelements only are expected.");
auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
unsigned const NumElts = SrcVecTy->getNumElements();
unsigned const NumScalars = VL.size();
- APInt DemandedElts = APInt::getNullValue(NumElts);
+ APInt DemandedElts = APInt::getZero(NumElts);
// TODO: Add support for Instruction::InsertValue.
- unsigned Offset = UINT_MAX;
+ SmallVector<int> Mask;
+ if (!E->ReorderIndices.empty()) {
+ inversePermutation(E->ReorderIndices, Mask);
+ Mask.append(NumElts - NumScalars, UndefMaskElem);
+ } else {
+ Mask.assign(NumElts, UndefMaskElem);
+ std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
+ }
+ unsigned Offset = *getInsertIndex(VL0, 0);
bool IsIdentity = true;
- SmallVector<int> ShuffleMask(NumElts, UndefMaskElem);
+ SmallVector<int> PrevMask(NumElts, UndefMaskElem);
+ Mask.swap(PrevMask);
for (unsigned I = 0; I < NumScalars; ++I) {
- Optional<int> InsertIdx = getInsertIndex(VL[I], 0);
+ Optional<int> InsertIdx = getInsertIndex(VL[PrevMask[I]], 0);
if (!InsertIdx || *InsertIdx == UndefMaskElem)
continue;
- unsigned Idx = *InsertIdx;
- DemandedElts.setBit(Idx);
- if (Idx < Offset) {
- Offset = Idx;
- IsIdentity &= I == 0;
- } else {
- assert(Idx >= Offset && "Failed to find vector index offset");
- IsIdentity &= Idx - Offset == I;
- }
- ShuffleMask[Idx] = I;
+ DemandedElts.setBit(*InsertIdx);
+ IsIdentity &= *InsertIdx - Offset == I;
+ Mask[*InsertIdx - Offset] = I;
}
assert(Offset < NumElts && "Failed to find vector index offset");
@@ -3932,8 +4733,23 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
TargetTransformInfo::SK_PermuteSingleSrc,
FixedVectorType::get(SrcVecTy->getElementType(), Sz));
} else if (!IsIdentity) {
- Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy,
- ShuffleMask);
+ auto *FirstInsert =
+ cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
+ return !is_contained(E->Scalars,
+ cast<Instruction>(V)->getOperand(0));
+ }));
+ if (isa<UndefValue>(FirstInsert->getOperand(0))) {
+ Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask);
+ } else {
+ SmallVector<int> InsertMask(NumElts);
+ std::iota(InsertMask.begin(), InsertMask.end(), 0);
+ for (unsigned I = 0; I < NumElts; I++) {
+ if (Mask[I] != UndefMaskElem)
+ InsertMask[Offset + I] = NumElts + I;
+ }
+ Cost +=
+ TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask);
+ }
}
return Cost;
@@ -3955,7 +4771,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
TTI::getCastContextHint(VL0), CostKind, VL0);
if (NeedToShuffleReuses) {
- CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
}
// Calculate the cost of this instruction.
@@ -3980,7 +4796,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
if (NeedToShuffleReuses) {
- CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
}
auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
@@ -4085,7 +4901,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
Op2VK, Op1VP, Op2VP, Operands, VL0);
if (NeedToShuffleReuses) {
- CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
}
InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
InstructionCost VecCost =
@@ -4103,7 +4919,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
if (NeedToShuffleReuses) {
- CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
}
InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
InstructionCost VecCost = TTI->getArithmeticInstrCost(
@@ -4117,7 +4933,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0);
if (NeedToShuffleReuses) {
- CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
}
InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
InstructionCost VecLdCost;
@@ -4160,7 +4976,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
InstructionCost ScalarEltCost =
TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
if (NeedToShuffleReuses) {
- CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
}
InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
@@ -4215,14 +5031,16 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
TTI::CastContextHint::None, CostKind);
}
- SmallVector<int> Mask(E->Scalars.size());
- for (unsigned I = 0, End = E->Scalars.size(); I < End; ++I) {
- auto *OpInst = cast<Instruction>(E->Scalars[I]);
- assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
- Mask[I] = I + (OpInst->getOpcode() == E->getAltOpcode() ? End : 0);
- }
- VecCost +=
- TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, 0);
+ SmallVector<int> Mask;
+ buildSuffleEntryMask(
+ E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
+ [E](Instruction *I) {
+ assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ return I->getOpcode() == E->getAltOpcode();
+ },
+ Mask);
+ CommonCost =
+ TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy, Mask);
LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
return CommonCost + VecCost - ScalarCost;
}
@@ -4231,13 +5049,30 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
}
}
-bool BoUpSLP::isFullyVectorizableTinyTree() const {
+bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
<< VectorizableTree.size() << " is fully vectorizable .\n");
+ auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
+ SmallVector<int> Mask;
+ return TE->State == TreeEntry::NeedToGather &&
+ !any_of(TE->Scalars,
+ [this](Value *V) { return EphValues.contains(V); }) &&
+ (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
+ TE->Scalars.size() < Limit ||
+ (TE->getOpcode() == Instruction::ExtractElement &&
+ isFixedVectorShuffle(TE->Scalars, Mask)) ||
+ (TE->State == TreeEntry::NeedToGather &&
+ TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
+ };
+
// We only handle trees of heights 1 and 2.
if (VectorizableTree.size() == 1 &&
- VectorizableTree[0]->State == TreeEntry::Vectorize)
+ (VectorizableTree[0]->State == TreeEntry::Vectorize ||
+ (ForReduction &&
+ AreVectorizableGathers(VectorizableTree[0].get(),
+ VectorizableTree[0]->Scalars.size()) &&
+ VectorizableTree[0]->getVectorFactor() > 2)))
return true;
if (VectorizableTree.size() != 2)
@@ -4249,19 +5084,14 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const {
// or they are extractelements, which form shuffle.
SmallVector<int> Mask;
if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
- (allConstant(VectorizableTree[1]->Scalars) ||
- isSplat(VectorizableTree[1]->Scalars) ||
- (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
- VectorizableTree[1]->Scalars.size() <
- VectorizableTree[0]->Scalars.size()) ||
- (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
- VectorizableTree[1]->getOpcode() == Instruction::ExtractElement &&
- isShuffle(VectorizableTree[1]->Scalars, Mask))))
+ AreVectorizableGathers(VectorizableTree[1].get(),
+ VectorizableTree[0]->Scalars.size()))
return true;
// Gathering cost would be too much for tiny trees.
if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
- VectorizableTree[1]->State == TreeEntry::NeedToGather)
+ (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
+ VectorizableTree[0]->State != TreeEntry::ScatterVectorize))
return false;
return true;
@@ -4330,7 +5160,7 @@ bool BoUpSLP::isLoadCombineCandidate() const {
return true;
}
-bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
+bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
// No need to vectorize inserts of gathered values.
if (VectorizableTree.size() == 2 &&
isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
@@ -4344,7 +5174,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
// If we have a tiny tree (a tree whose size is less than MinTreeSize), we
// can vectorize it if we can prove it fully vectorizable.
- if (isFullyVectorizableTinyTree())
+ if (isFullyVectorizableTinyTree(ForReduction))
return false;
assert(VectorizableTree.empty()
@@ -4496,7 +5326,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// If found user is an insertelement, do not calculate extract cost but try
// to detect it as a final shuffled/identity match.
- if (EU.User && isa<InsertElementInst>(EU.User)) {
+ if (isa_and_nonnull<InsertElementInst>(EU.User)) {
if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) {
Optional<int> InsertIdx = getInsertIndex(EU.User, 0);
if (!InsertIdx || *InsertIdx == UndefMaskElem)
@@ -4508,8 +5338,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
return false;
auto *IE1 = cast<InsertElementInst>(VU);
auto *IE2 = cast<InsertElementInst>(V);
- // Go though of insertelement instructions trying to find either VU as
- // the original vector for IE2 or V as the original vector for IE1.
+ // Go through of insertelement instructions trying to find either VU
+ // as the original vector for IE2 or V as the original vector for IE1.
do {
if (IE1 == VU || IE2 == V)
return true;
@@ -4525,7 +5355,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
VF.push_back(FTy->getNumElements());
ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
FirstUsers.push_back(EU.User);
- DemandedElts.push_back(APInt::getNullValue(VF.back()));
+ DemandedElts.push_back(APInt::getZero(VF.back()));
VecId = FirstUsers.size() - 1;
} else {
VecId = std::distance(FirstUsers.begin(), It);
@@ -4705,18 +5535,11 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
} else {
// Try to find nodes with the same vector factor.
assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
- // FIXME: Shall be replaced by GetVF function once non-power-2 patch is
- // landed.
- auto &&GetVF = [](const TreeEntry *TE) {
- if (!TE->ReuseShuffleIndices.empty())
- return TE->ReuseShuffleIndices.size();
- return TE->Scalars.size();
- };
DenseMap<int, const TreeEntry *> VFToTE;
for (const TreeEntry *TE : UsedTEs.front())
- VFToTE.try_emplace(GetVF(TE), TE);
+ VFToTE.try_emplace(TE->getVectorFactor(), TE);
for (const TreeEntry *TE : UsedTEs.back()) {
- auto It = VFToTE.find(GetVF(TE));
+ auto It = VFToTE.find(TE->getVectorFactor());
if (It != VFToTE.end()) {
VF = It->first;
Entries.push_back(It->second);
@@ -4757,16 +5580,17 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
InstructionCost
BoUpSLP::getGatherCost(FixedVectorType *Ty,
- const DenseSet<unsigned> &ShuffledIndices) const {
+ const DenseSet<unsigned> &ShuffledIndices,
+ bool NeedToShuffle) const {
unsigned NumElts = Ty->getNumElements();
- APInt DemandedElts = APInt::getNullValue(NumElts);
+ APInt DemandedElts = APInt::getZero(NumElts);
for (unsigned I = 0; I < NumElts; ++I)
if (!ShuffledIndices.count(I))
DemandedElts.setBit(I);
InstructionCost Cost =
TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
/*Extract*/ false);
- if (!ShuffledIndices.empty())
+ if (NeedToShuffle)
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
}
@@ -4777,6 +5601,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ bool DuplicateNonConst = false;
// Find the cost of inserting/extracting values from the vector.
// Check if the same elements are inserted several times and count them as
// shuffle candidates.
@@ -4785,12 +5610,17 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
// Iterate in reverse order to consider insert elements with the high cost.
for (unsigned I = VL.size(); I > 0; --I) {
unsigned Idx = I - 1;
- if (isConstant(VL[Idx]))
+ // No need to shuffle duplicates for constants.
+ if (isConstant(VL[Idx])) {
+ ShuffledElements.insert(Idx);
continue;
- if (!UniqueElements.insert(VL[Idx]).second)
+ }
+ if (!UniqueElements.insert(VL[Idx]).second) {
+ DuplicateNonConst = true;
ShuffledElements.insert(Idx);
+ }
}
- return getGatherCost(VecTy, ShuffledElements);
+ return getGatherCost(VecTy, ShuffledElements, DuplicateNonConst);
}
// Perform operand reordering on the instructions in VL and return the reordered
@@ -5006,17 +5836,18 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
// block:
// %phi = phi <2 x > { .., %entry} {%shuffle, %block}
- // %2 = shuffle <2 x > %phi, %poison, <4 x > <0, 0, 1, 1>
+ // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
// ... (use %2)
- // %shuffle = shuffle <2 x> %2, poison, <2 x> {0, 2}
+ // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
// br %block
- SmallVector<int> UniqueIdxs;
+ SmallVector<int> UniqueIdxs(VF, UndefMaskElem);
SmallSet<int, 4> UsedIdxs;
int Pos = 0;
int Sz = VL.size();
for (int Idx : E->ReuseShuffleIndices) {
- if (Idx != Sz && UsedIdxs.insert(Idx).second)
- UniqueIdxs.emplace_back(Pos);
+ if (Idx != Sz && Idx != UndefMaskElem &&
+ UsedIdxs.insert(Idx).second)
+ UniqueIdxs[Idx] = Pos;
++Pos;
}
assert(VF >= UsedIdxs.size() && "Expected vectorization factor "
@@ -5047,11 +5878,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
}).base());
VF = std::max<unsigned>(VF, PowerOf2Ceil(NumValues));
int UniqueVals = 0;
- bool HasUndefs = false;
for (Value *V : VL.drop_back(VL.size() - VF)) {
if (isa<UndefValue>(V)) {
ReuseShuffleIndicies.emplace_back(UndefMaskElem);
- HasUndefs = true;
continue;
}
if (isConstant(V)) {
@@ -5066,15 +5895,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
++UniqueVals;
}
}
- if (HasUndefs && UniqueVals == 1 && UniqueValues.size() == 1) {
+ if (UniqueVals == 1 && UniqueValues.size() == 1) {
// Emit pure splat vector.
- // FIXME: why it is not identified as an identity.
- unsigned NumUndefs = count(ReuseShuffleIndicies, UndefMaskElem);
- if (NumUndefs == ReuseShuffleIndicies.size() - 1)
- ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(),
- UndefMaskElem);
- else
- ReuseShuffleIndicies.assign(VF, 0);
+ ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(),
+ UndefMaskElem);
} else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) {
ReuseShuffleIndicies.clear();
UniqueValues.clear();
@@ -5107,12 +5931,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
- unsigned VF = E->Scalars.size();
- if (NeedToShuffleReuses)
- VF = E->ReuseShuffleIndices.size();
+ unsigned VF = E->getVectorFactor();
ShuffleInstructionBuilder ShuffleBuilder(Builder, VF);
if (E->State == TreeEntry::NeedToGather) {
- setInsertPointAfterBundle(E);
+ if (E->getMainOp())
+ setInsertPointAfterBundle(E);
Value *Vec;
SmallVector<int> Mask;
SmallVector<const TreeEntry *> Entries;
@@ -5152,13 +5975,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
switch (ShuffleOrOp) {
case Instruction::PHI: {
+ assert(
+ (E->ReorderIndices.empty() || E != VectorizableTree.front().get()) &&
+ "PHI reordering is free.");
auto *PH = cast<PHINode>(VL0);
Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
Value *V = NewPhi;
- if (NeedToShuffleReuses)
- V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
+ ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+ V = ShuffleBuilder.finalize(V);
E->VectorizedValue = V;
@@ -5209,53 +6036,48 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return NewV;
}
case Instruction::InsertElement: {
- Builder.SetInsertPoint(VL0);
+ assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
+ Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
Value *V = vectorizeTree(E->getOperand(1));
+ // Create InsertVector shuffle if necessary
+ auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
+ return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
+ }));
const unsigned NumElts =
- cast<FixedVectorType>(VL0->getType())->getNumElements();
+ cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
const unsigned NumScalars = E->Scalars.size();
+ unsigned Offset = *getInsertIndex(VL0, 0);
+ assert(Offset < NumElts && "Failed to find vector index offset");
+
+ // Create shuffle to resize vector
+ SmallVector<int> Mask;
+ if (!E->ReorderIndices.empty()) {
+ inversePermutation(E->ReorderIndices, Mask);
+ Mask.append(NumElts - NumScalars, UndefMaskElem);
+ } else {
+ Mask.assign(NumElts, UndefMaskElem);
+ std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
+ }
// Create InsertVector shuffle if necessary
- Instruction *FirstInsert = nullptr;
bool IsIdentity = true;
- unsigned Offset = UINT_MAX;
+ SmallVector<int> PrevMask(NumElts, UndefMaskElem);
+ Mask.swap(PrevMask);
for (unsigned I = 0; I < NumScalars; ++I) {
- Value *Scalar = E->Scalars[I];
- if (!FirstInsert &&
- !is_contained(E->Scalars, cast<Instruction>(Scalar)->getOperand(0)))
- FirstInsert = cast<Instruction>(Scalar);
+ Value *Scalar = E->Scalars[PrevMask[I]];
Optional<int> InsertIdx = getInsertIndex(Scalar, 0);
if (!InsertIdx || *InsertIdx == UndefMaskElem)
continue;
- unsigned Idx = *InsertIdx;
- if (Idx < Offset) {
- Offset = Idx;
- IsIdentity &= I == 0;
- } else {
- assert(Idx >= Offset && "Failed to find vector index offset");
- IsIdentity &= Idx - Offset == I;
- }
- }
- assert(Offset < NumElts && "Failed to find vector index offset");
-
- // Create shuffle to resize vector
- SmallVector<int> Mask(NumElts, UndefMaskElem);
- if (!IsIdentity) {
- for (unsigned I = 0; I < NumScalars; ++I) {
- Value *Scalar = E->Scalars[I];
- Optional<int> InsertIdx = getInsertIndex(Scalar, 0);
- if (!InsertIdx || *InsertIdx == UndefMaskElem)
- continue;
- Mask[*InsertIdx - Offset] = I;
- }
- } else {
- std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
+ IsIdentity &= *InsertIdx - Offset == I;
+ Mask[*InsertIdx - Offset] = I;
}
if (!IsIdentity || NumElts != NumScalars)
V = Builder.CreateShuffleVector(V, Mask);
- if (NumElts != NumScalars) {
+ if ((!IsIdentity || Offset != 0 ||
+ !isa<UndefValue>(FirstInsert->getOperand(0))) &&
+ NumElts != NumScalars) {
SmallVector<int> InsertMask(NumElts);
std::iota(InsertMask.begin(), InsertMask.end(), 0);
for (unsigned I = 0; I < NumElts; I++) {
@@ -5295,6 +6117,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
auto *CI = cast<CastInst>(VL0);
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
V = ShuffleBuilder.finalize(V);
@@ -5317,6 +6140,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V = Builder.CreateCmp(P0, L, R);
propagateIRFlags(V, E->Scalars, VL0);
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
V = ShuffleBuilder.finalize(V);
@@ -5337,6 +6161,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *V = Builder.CreateSelect(Cond, True, False);
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
V = ShuffleBuilder.finalize(V);
@@ -5360,6 +6185,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
V = ShuffleBuilder.finalize(V);
@@ -5403,6 +6229,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
V = ShuffleBuilder.finalize(V);
@@ -5414,9 +6241,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Load: {
// Loads are inserted at the head of the tree because we don't want to
// sink them all the way down past store instructions.
- bool IsReorder = E->updateStateIfReorder();
- if (IsReorder)
- VL0 = E->getMainOp();
setInsertPointAfterBundle(E);
LoadInst *LI = cast<LoadInst>(VL0);
@@ -5430,8 +6254,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// The pointer operand uses an in-tree scalar so we add the new BitCast
// to ExternalUses list to make sure that an extract will be generated
// in the future.
- if (getTreeEntry(PO))
- ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0);
+ if (TreeEntry *Entry = getTreeEntry(PO)) {
+ // Find which lane we need to extract.
+ unsigned FoundLane = Entry->findLaneForValue(PO);
+ ExternalUses.emplace_back(PO, cast<User>(VecPtr), FoundLane);
+ }
NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
} else {
@@ -5454,9 +6281,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
case Instruction::Store: {
- bool IsReorder = !E->ReorderIndices.empty();
- auto *SI = cast<StoreInst>(
- IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
+ auto *SI = cast<StoreInst>(VL0);
unsigned AS = SI->getPointerAddressSpace();
setInsertPointAfterBundle(E);
@@ -5474,8 +6299,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// The pointer operand uses an in-tree scalar, so add the new BitCast to
// ExternalUses to make sure that an extract will be generated in the
// future.
- if (getTreeEntry(ScalarPtr))
- ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
+ if (TreeEntry *Entry = getTreeEntry(ScalarPtr)) {
+ // Find which lane we need to extract.
+ unsigned FoundLane = Entry->findLaneForValue(ScalarPtr);
+ ExternalUses.push_back(
+ ExternalUser(ScalarPtr, cast<User>(VecPtr), FoundLane));
+ }
Value *V = propagateMetadata(ST, E->Scalars);
@@ -5484,37 +6313,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
case Instruction::GetElementPtr: {
+ auto *GEP0 = cast<GetElementPtrInst>(VL0);
setInsertPointAfterBundle(E);
Value *Op0 = vectorizeTree(E->getOperand(0));
- std::vector<Value *> OpVecs;
- for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
- ++j) {
- ValueList &VL = E->getOperand(j);
- // Need to cast all elements to the same type before vectorization to
- // avoid crash.
- Type *VL0Ty = VL0->getOperand(j)->getType();
- Type *Ty = llvm::all_of(
- VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); })
- ? VL0Ty
- : DL->getIndexType(cast<GetElementPtrInst>(VL0)
- ->getPointerOperandType()
- ->getScalarType());
- for (Value *&V : VL) {
- auto *CI = cast<ConstantInt>(V);
- V = ConstantExpr::getIntegerCast(CI, Ty,
- CI->getValue().isSignBitSet());
- }
- Value *OpVec = vectorizeTree(VL);
+ SmallVector<Value *> OpVecs;
+ for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
+ Value *OpVec = vectorizeTree(E->getOperand(J));
OpVecs.push_back(OpVec);
}
- Value *V = Builder.CreateGEP(
- cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
+ Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
V = ShuffleBuilder.finalize(V);
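The addInversedMask calls added in these hunks undo the node's reordering before the reuse shuffle is applied; the mask amounts to the inverse permutation of ReorderIndices. A hedged, standalone illustration (plain C++, not the ShuffleInstructionBuilder API):

#include <vector>

std::vector<int> inverseMask(const std::vector<unsigned> &ReorderIndices) {
  std::vector<int> Mask(ReorderIndices.size());
  for (unsigned I = 0, E = ReorderIndices.size(); I < E; ++I)
    Mask[ReorderIndices[I]] = I; // send the reordered lane back to position I
  return Mask;
}
// Example: ReorderIndices = {2, 0, 1} yields Mask = {1, 2, 0}.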
@@ -5541,7 +6355,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
std::vector<Value *> OpVecs;
SmallVector<Type *, 2> TysForDecl =
{FixedVectorType::get(CI->getType(), E->Scalars.size())};
- for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
+ for (int j = 0, e = CI->arg_size(); j < e; ++j) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
// vectorized.
@@ -5577,10 +6391,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// The scalar argument uses an in-tree scalar so we add the new vectorized
// call to ExternalUses list to make sure that an extract will be
// generated in the future.
- if (ScalarArg && getTreeEntry(ScalarArg))
- ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+ if (ScalarArg) {
+ if (TreeEntry *Entry = getTreeEntry(ScalarArg)) {
+ // Find which lane we need to extract.
+ unsigned FoundLane = Entry->findLaneForValue(ScalarArg);
+ ExternalUses.push_back(
+ ExternalUser(ScalarArg, cast<User>(V), FoundLane));
+ }
+ }
propagateIRFlags(V, E->Scalars, VL0);
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
V = ShuffleBuilder.finalize(V);
@@ -5628,19 +6449,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// Also, gather up main and alt scalar ops to propagate IR flags to
// each vector operation.
ValueList OpScalars, AltScalars;
- unsigned Sz = E->Scalars.size();
- SmallVector<int> Mask(Sz);
- for (unsigned I = 0; I < Sz; ++I) {
- auto *OpInst = cast<Instruction>(E->Scalars[I]);
- assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
- if (OpInst->getOpcode() == E->getAltOpcode()) {
- Mask[I] = Sz + I;
- AltScalars.push_back(E->Scalars[I]);
- } else {
- Mask[I] = I;
- OpScalars.push_back(E->Scalars[I]);
- }
- }
+ SmallVector<int> Mask;
+ buildShuffleEntryMask(
+ E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
+ [E](Instruction *I) {
+ assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ return I->getOpcode() == E->getAltOpcode();
+ },
+ Mask, &OpScalars, &AltScalars);
propagateIRFlags(V0, OpScalars);
propagateIRFlags(V1, AltScalars);
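For an alternate-opcode node, the mask handed to CreateShuffleVector below picks each lane either from the main-opcode vector (lanes 0..Sz-1) or from the alternate-opcode vector (lanes Sz..2*Sz-1). A simplified sketch of that mask construction, ignoring the reorder/reuse indices the real helper also folds in:

#include <functional>
#include <vector>

std::vector<int> buildAltOpMask(unsigned Sz,
                                const std::function<bool(unsigned)> &IsAltLane) {
  std::vector<int> Mask(Sz);
  for (unsigned I = 0; I < Sz; ++I)
    Mask[I] = IsAltLane(I) ? int(Sz + I) : int(I);
  return Mask;
}
// E.g. for scalars <a+b, c-d, e+f, g-h> (main opcode add, alternate sub) the
// mask is {0, 5, 2, 7}.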
@@ -5648,7 +6464,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- ShuffleBuilder.addMask(E->ReuseShuffleIndices);
V = ShuffleBuilder.finalize(V);
E->VectorizedValue = V;
@@ -5823,7 +6638,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
// It is legal to delete users in the ignorelist.
- assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
+ assert((getTreeEntry(U) || is_contained(UserIgnoreList, U) ||
+ (isa_and_nonnull<Instruction>(U) &&
+ isDeleted(cast<Instruction>(U)))) &&
"Deleting out-of-tree value");
}
}
@@ -5898,27 +6715,28 @@ void BoUpSLP::optimizeGatherSequence() {
"Worklist not sorted properly!");
BasicBlock *BB = (*I)->getBlock();
// For all instructions in blocks containing gather sequences:
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
- Instruction *In = &*it++;
- if (isDeleted(In))
+ for (Instruction &In : llvm::make_early_inc_range(*BB)) {
+ if (isDeleted(&In))
continue;
- if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
+ if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) &&
+ !isa<ShuffleVectorInst>(&In))
continue;
// Check if we can replace this instruction with any of the
// visited instructions.
+ bool Replaced = false;
for (Instruction *v : Visited) {
- if (In->isIdenticalTo(v) &&
- DT->dominates(v->getParent(), In->getParent())) {
- In->replaceAllUsesWith(v);
- eraseInstruction(In);
- In = nullptr;
+ if (In.isIdenticalTo(v) &&
+ DT->dominates(v->getParent(), In.getParent())) {
+ In.replaceAllUsesWith(v);
+ eraseInstruction(&In);
+ Replaced = true;
break;
}
}
- if (In) {
- assert(!is_contained(Visited, In));
- Visited.push_back(In);
+ if (!Replaced) {
+ assert(!is_contained(Visited, &In));
+ Visited.push_back(&In);
}
}
}
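The switch to llvm::make_early_inc_range above is what keeps erasing the current instruction safe: the range advances its iterator before the loop body runs. The same idiom in standard C++, shown for illustration only:

#include <list>

void eraseMultiplesOfThree(std::list<int> &L) {
  for (auto It = L.begin(); It != L.end();) {
    auto Cur = It++;  // advance first, exactly like make_early_inc_range
    if (*Cur % 3 == 0)
      L.erase(Cur);   // erasing Cur leaves It valid
  }
}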
@@ -5931,7 +6749,9 @@ void BoUpSLP::optimizeGatherSequence() {
Optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S) {
- if (isa<PHINode>(S.OpValue) || isa<InsertElementInst>(S.OpValue))
+ // No need to schedule PHIs, insertelement, extractelement and extractvalue
+ // instructions.
+ if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue))
return nullptr;
// Initialize the instruction bundle.
@@ -6027,7 +6847,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
Value *OpValue) {
- if (isa<PHINode>(OpValue) || isa<InsertElementInst>(OpValue))
+ if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue))
return;
ScheduleData *Bundle = getScheduleData(OpValue);
@@ -6067,8 +6887,9 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
return true;
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
- assert(!isa<PHINode>(I) && !isa<InsertElementInst>(I) &&
- "phi nodes/insertelements don't need to be scheduled");
+ assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
+ "phi nodes/insertelements/extractelements/extractvalues don't need to "
+ "be scheduled");
auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
ScheduleData *ISD = getScheduleData(I);
if (!ISD)
@@ -6338,7 +7159,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
I = I->getNextNode()) {
BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
- assert((isa<InsertElementInst>(SD->Inst) ||
+ assert((isVectorLikeInstWithConstOps(SD->Inst) ||
SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) &&
"scheduler and vectorizer bundle mismatch");
SD->FirstInBundle->SchedulingPriority = Idx++;
@@ -6681,9 +7502,7 @@ struct SLPVectorizer : public FunctionPass {
initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
}
- bool doInitialization(Module &M) override {
- return false;
- }
+ bool doInitialization(Module &M) override { return false; }
bool runOnFunction(Function &F) override {
if (skipFunction(F))
@@ -6818,44 +7637,6 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
return Changed;
}
-/// Order may have elements assigned special value (size) which is out of
-/// bounds. Such indices only appear on places which correspond to undef values
-/// (see canReuseExtract for details) and used in order to avoid undef values
-/// have effect on operands ordering.
-/// The first loop below simply finds all unused indices and then the next loop
-/// nest assigns these indices for undef values positions.
-/// As an example below Order has two undef positions and they have assigned
-/// values 3 and 7 respectively:
-/// before: 6 9 5 4 9 2 1 0
-/// after: 6 3 5 4 7 2 1 0
-/// \returns Fixed ordering.
-static BoUpSLP::OrdersType fixupOrderingIndices(ArrayRef<unsigned> Order) {
- BoUpSLP::OrdersType NewOrder(Order.begin(), Order.end());
- const unsigned Sz = NewOrder.size();
- SmallBitVector UsedIndices(Sz);
- SmallVector<int> MaskedIndices;
- for (int I = 0, E = NewOrder.size(); I < E; ++I) {
- if (NewOrder[I] < Sz)
- UsedIndices.set(NewOrder[I]);
- else
- MaskedIndices.push_back(I);
- }
- if (MaskedIndices.empty())
- return NewOrder;
- SmallVector<int> AvailableIndices(MaskedIndices.size());
- unsigned Cnt = 0;
- int Idx = UsedIndices.find_first();
- do {
- AvailableIndices[Cnt] = Idx;
- Idx = UsedIndices.find_next(Idx);
- ++Cnt;
- } while (Idx > 0);
- assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices.");
- for (int I = 0, E = MaskedIndices.size(); I < E; ++I)
- NewOrder[MaskedIndices[I]] = AvailableIndices[I];
- return NewOrder;
-}
-
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
unsigned Idx) {
LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
@@ -6871,19 +7652,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
<< "\n");
R.buildTree(Chain);
- Optional<ArrayRef<unsigned>> Order = R.bestOrder();
- // TODO: Handle orders of size less than number of elements in the vector.
- if (Order && Order->size() == Chain.size()) {
- // TODO: reorder tree nodes without tree rebuilding.
- SmallVector<Value *, 4> ReorderedOps(Chain.size());
- transform(fixupOrderingIndices(*Order), ReorderedOps.begin(),
- [Chain](const unsigned Idx) { return Chain[Idx]; });
- R.buildTree(ReorderedOps);
- }
if (R.isTreeTinyAndNotFullyVectorizable())
return false;
if (R.isLoadCombineCandidate())
return false;
+ R.reorderTopToBottom();
+ R.reorderBottomToTop();
+ R.buildExternalUses();
R.computeMinimumValueSizes();
@@ -7006,7 +7781,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
unsigned EltSize = R.getVectorElementSize(Operands[0]);
unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize);
- unsigned MinVF = std::max(2U, R.getMinVecRegSize() / EltSize);
+ unsigned MinVF = R.getMinVF(EltSize);
unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store),
MaxElts);
@@ -7079,11 +7854,11 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
if (!A || !B)
return false;
Value *VL[] = {A, B};
- return tryToVectorizeList(VL, R, /*AllowReorder=*/true);
+ return tryToVectorizeList(VL, R);
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- bool AllowReorder) {
+ bool LimitForRegisterSize) {
if (VL.size() < 2)
return false;
@@ -7117,7 +7892,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
}
unsigned Sz = R.getVectorElementSize(I0);
- unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
+ unsigned MinVF = R.getMinVF(Sz);
unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
if (MaxVF < 2) {
@@ -7155,7 +7930,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (!isPowerOf2_32(OpsWidth))
continue;
- if ((VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2))
+ if ((LimitForRegisterSize && OpsWidth < MaxVF) ||
+ (VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2))
break;
ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
@@ -7170,18 +7946,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
<< "\n");
R.buildTree(Ops);
- if (AllowReorder) {
- Optional<ArrayRef<unsigned>> Order = R.bestOrder();
- if (Order) {
- // TODO: reorder tree nodes without tree rebuilding.
- SmallVector<Value *, 4> ReorderedOps(Ops.size());
- transform(fixupOrderingIndices(*Order), ReorderedOps.begin(),
- [Ops](const unsigned Idx) { return Ops[Idx]; });
- R.buildTree(ReorderedOps);
- }
- }
if (R.isTreeTinyAndNotFullyVectorizable())
continue;
+ R.reorderTopToBottom();
+ R.reorderBottomToTop();
+ R.buildExternalUses();
R.computeMinimumValueSizes();
InstructionCost Cost = R.getTreeCost();
@@ -7374,10 +8143,20 @@ class HorizontalReduction {
Value *RHS, const Twine &Name, bool UseSelect) {
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
switch (Kind) {
- case RecurKind::Add:
- case RecurKind::Mul:
case RecurKind::Or:
+ if (UseSelect &&
+ LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
+ return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
+ return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
+ Name);
case RecurKind::And:
+ if (UseSelect &&
+ LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
+ return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
+ return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
+ Name);
+ case RecurKind::Add:
+ case RecurKind::Mul:
case RecurKind::Xor:
case RecurKind::FAdd:
case RecurKind::FMul:
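The new Or/And cases emit the poison-safe select spellings of the boolean operations when the operands are i1. Restated with the same IRBuilder calls used above (LHS, RHS and Name as in the surrounding function):

// select i1 LHS, true, RHS   behaves like  or  LHS, RHS
// select i1 LHS, RHS, false  behaves like  and LHS, RHS
// but neither propagates poison from RHS when LHS already decides the result.
Value *SafeOr  = Builder.CreateSelect(LHS, Builder.getTrue(),  RHS, Name);
Value *SafeAnd = Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);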
@@ -7421,8 +8200,12 @@ class HorizontalReduction {
static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
Value *RHS, const Twine &Name,
const ReductionOpsListType &ReductionOps) {
- bool UseSelect = ReductionOps.size() == 2;
- assert((!UseSelect || isa<SelectInst>(ReductionOps[1][0])) &&
+ bool UseSelect = ReductionOps.size() == 2 ||
+ // Logical or/and.
+ (ReductionOps.size() == 1 &&
+ isa<SelectInst>(ReductionOps.front().front()));
+ assert((!UseSelect || ReductionOps.size() != 2 ||
+ isa<SelectInst>(ReductionOps[1][0])) &&
"Expected cmp + select pairs for reduction");
Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
@@ -7560,10 +8343,10 @@ class HorizontalReduction {
/// Checks if the instruction is in basic block \p BB.
/// For a cmp+sel min/max reduction check that both ops are in \p BB.
static bool hasSameParent(Instruction *I, BasicBlock *BB) {
- if (isCmpSelMinMax(I)) {
+ if (isCmpSelMinMax(I) || (isBoolLogicOp(I) && isa<SelectInst>(I))) {
auto *Sel = cast<SelectInst>(I);
- auto *Cmp = cast<Instruction>(Sel->getCondition());
- return Sel->getParent() == BB && Cmp->getParent() == BB;
+ auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
+ return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
}
return I->getParent() == BB;
}
@@ -7745,13 +8528,13 @@ public:
}
/// Attempt to vectorize the tree found by matchAssociativeReduction.
- bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+ Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
// If there are a sufficient number of reduction values, reduce
// to a nearby power-of-2. We can safely generate oversized
// vectors and rely on the backend to split them to legal sizes.
unsigned NumReducedVals = ReducedVals.size();
if (NumReducedVals < 4)
- return false;
+ return nullptr;
// Intersect the fast-math-flags from all reduction operations.
FastMathFlags RdxFMF;
@@ -7825,22 +8608,14 @@ public:
unsigned i = 0;
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth);
- V.buildTree(VL, ExternallyUsedValues, IgnoreList);
- Optional<ArrayRef<unsigned>> Order = V.bestOrder();
- if (Order) {
- assert(Order->size() == VL.size() &&
- "Order size must be the same as number of vectorized "
- "instructions.");
- // TODO: reorder tree nodes without tree rebuilding.
- SmallVector<Value *, 4> ReorderedOps(VL.size());
- transform(fixupOrderingIndices(*Order), ReorderedOps.begin(),
- [VL](const unsigned Idx) { return VL[Idx]; });
- V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
- }
- if (V.isTreeTinyAndNotFullyVectorizable())
+ V.buildTree(VL, IgnoreList);
+ if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true))
break;
if (V.isLoadCombineReductionCandidate(RdxKind))
break;
+ V.reorderTopToBottom();
+ V.reorderBottomToTop(/*IgnoreReorder=*/true);
+ V.buildExternalUses(ExternallyUsedValues);
// For a poison-safe boolean logic reduction, do not replace select
// instructions with logic ops. All reduced values will be frozen (see
@@ -7860,7 +8635,7 @@ public:
InstructionCost Cost = TreeCost + ReductionCost;
if (!Cost.isValid()) {
LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
- return false;
+ return nullptr;
}
if (Cost >= -SLPCostThreshold) {
V.getORE()->emit([&]() {
@@ -7940,7 +8715,7 @@ public:
// vector reductions.
V.eraseInstructions(IgnoreList);
}
- return VectorizedTree != nullptr;
+ return VectorizedTree;
}
unsigned numReductionValues() const { return ReducedVals.size(); }
@@ -7950,6 +8725,7 @@ private:
InstructionCost getReductionCost(TargetTransformInfo *TTI,
Value *FirstReducedVal, unsigned ReduxWidth,
FastMathFlags FMF) {
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Type *ScalarTy = FirstReducedVal->getType();
FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
InstructionCost VectorCost, ScalarCost;
@@ -7962,33 +8738,39 @@ private:
case RecurKind::FAdd:
case RecurKind::FMul: {
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
- VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF);
- ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy);
+ VectorCost =
+ TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
+ ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
break;
}
case RecurKind::FMax:
case RecurKind::FMin: {
+ auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
- /*unsigned=*/false);
- ScalarCost =
- TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) +
- TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
- CmpInst::makeCmpResultType(ScalarTy));
+ /*unsigned=*/false, CostKind);
+ CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
+ ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
+ SclCondTy, RdxPred, CostKind) +
+ TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+ SclCondTy, RdxPred, CostKind);
break;
}
case RecurKind::SMax:
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin: {
+ auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
bool IsUnsigned =
RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
- VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned);
- ScalarCost =
- TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) +
- TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
- CmpInst::makeCmpResultType(ScalarTy));
+ VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned,
+ CostKind);
+ CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
+ ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
+ SclCondTy, RdxPred, CostKind) +
+ TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+ SclCondTy, RdxPred, CostKind);
break;
}
default:
@@ -8010,6 +8792,7 @@ private:
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
+ ++NumVectorInstructions;
return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind,
ReductionOps.back());
}
@@ -8219,32 +9002,45 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// Skip the analysis of CmpInsts. Compiler implements postanalysis of the
// CmpInsts so we can skip extra attempts in
// tryToVectorizeHorReductionOrInstOperands and save compile time.
- SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0});
+ std::queue<std::pair<Instruction *, unsigned>> Stack;
+ Stack.emplace(Root, 0);
SmallPtrSet<Value *, 8> VisitedInstrs;
+ SmallVector<WeakTrackingVH> PostponedInsts;
bool Res = false;
+ auto &&TryToReduce = [TTI, &P, &R](Instruction *Inst, Value *&B0,
+ Value *&B1) -> Value * {
+ bool IsBinop = matchRdxBop(Inst, B0, B1);
+ bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
+ if (IsBinop || IsSelect) {
+ HorizontalReduction HorRdx;
+ if (HorRdx.matchAssociativeReduction(P, Inst))
+ return HorRdx.tryToReduce(R, TTI);
+ }
+ return nullptr;
+ };
while (!Stack.empty()) {
Instruction *Inst;
unsigned Level;
- std::tie(Inst, Level) = Stack.pop_back_val();
+ std::tie(Inst, Level) = Stack.front();
+ Stack.pop();
// Do not try to analyze instruction that has already been vectorized.
// This may happen when we vectorize instruction operands on a previous
// iteration while stack was populated before that happened.
if (R.isDeleted(Inst))
continue;
- Value *B0, *B1;
- bool IsBinop = matchRdxBop(Inst, B0, B1);
- bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
- if (IsBinop || IsSelect) {
- HorizontalReduction HorRdx;
- if (HorRdx.matchAssociativeReduction(P, Inst)) {
- if (HorRdx.tryToReduce(R, TTI)) {
- Res = true;
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- continue;
- }
+ Value *B0 = nullptr, *B1 = nullptr;
+ if (Value *V = TryToReduce(Inst, B0, B1)) {
+ Res = true;
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ // Try to find another reduction.
+ Stack.emplace(I, Level);
+ continue;
}
+ } else {
+ bool IsBinop = B0 && B1;
if (P && IsBinop) {
Inst = dyn_cast<Instruction>(B0);
if (Inst == P)
@@ -8256,14 +9052,14 @@ static bool tryToVectorizeHorReductionOrInstOperands(
continue;
}
}
- }
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- // Do not try to vectorize CmpInst operands, this is done separately.
- if (!isa<CmpInst>(Inst) && Vectorize(Inst, R)) {
- Res = true;
- continue;
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ // Do not try to vectorize CmpInst operands, this is done separately.
+ // The final attempt to vectorize binop arguments happens after the loop,
+ // so that reductions are tried first.
+ if (!isa<CmpInst>(Inst))
+ PostponedInsts.push_back(Inst);
}
// Try to vectorize operands.
@@ -8277,8 +9073,13 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// separately.
if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) &&
I->getParent() == BB)
- Stack.emplace_back(I, Level);
+ Stack.emplace(I, Level);
}
+ // Try to vectorize binops for which no reductions were found.
+ for (Value *V : PostponedInsts)
+ if (auto *Inst = dyn_cast<Instruction>(V))
+ if (!R.isDeleted(Inst))
+ Res |= Vectorize(Inst, R);
return Res;
}
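The traversal above now uses a FIFO worklist (breadth-first over operands) and defers instructions whose reduction match failed until after the walk, so reductions get the first chance. The control flow, reduced to standard C++ with placeholder callbacks:

#include <functional>
#include <queue>
#include <vector>

struct Node { std::vector<Node *> Operands; };

bool walk(Node *Root, const std::function<bool(Node *)> &TryReduce,
          const std::function<bool(Node *)> &Vectorize) {
  bool Res = false;
  std::queue<Node *> Work;
  std::vector<Node *> Postponed;
  Work.push(Root);
  while (!Work.empty()) {
    Node *N = Work.front();
    Work.pop();
    if (TryReduce(N))
      Res = true;             // a reduction rooted at N was vectorized
    else
      Postponed.push_back(N); // retry as a plain bundle after the walk
    for (Node *Op : N->Operands)
      Work.push(Op);
  }
  for (Node *N : Postponed)
    Res |= Vectorize(N);
  return Res;
}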
@@ -8313,7 +9114,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
// Aggregate value is unlikely to be processed in vector register, we need to
// extract scalars into scalar registers, so NeedExtraction is set true.
- return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false);
+ return tryToVectorizeList(BuildVectorOpds, R);
}
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
@@ -8324,11 +9125,11 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
(llvm::all_of(BuildVectorOpds,
[](Value *V) { return isa<ExtractElementInst>(V); }) &&
- isShuffle(BuildVectorOpds, Mask)))
+ isFixedVectorShuffle(BuildVectorOpds, Mask)))
return false;
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
- return tryToVectorizeList(BuildVectorInsts, R, /*AllowReorder=*/true);
+ return tryToVectorizeList(BuildVectorInsts, R);
}
bool SLPVectorizerPass::vectorizeSimpleInstructions(
@@ -8369,6 +9170,78 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
return OpsChanged;
}
+template <typename T>
+static bool
+tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
+ function_ref<unsigned(T *)> Limit,
+ function_ref<bool(T *, T *)> Comparator,
+ function_ref<bool(T *, T *)> AreCompatible,
+ function_ref<bool(ArrayRef<T *>, bool)> TryToVectorize,
+ bool LimitForRegisterSize) {
+ bool Changed = false;
+ // Sort by type, parent, operands.
+ stable_sort(Incoming, Comparator);
+
+ // Try to vectorize elements based on their type.
+ SmallVector<T *> Candidates;
+ for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
+ // Look for the next elements with the same type, parent and operand
+ // kinds.
+ auto *SameTypeIt = IncIt;
+ while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
+ ++SameTypeIt;
+
+ // Try to vectorize them.
+ unsigned NumElts = (SameTypeIt - IncIt);
+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
+ << NumElts << ")\n");
+ // The vectorization is a 3-stage attempt:
+ // 1. Try to vectorize instructions with the same/alternate opcodes at the
+ // width of the maximal register first.
+ // 2. Try to vectorize the remaining instructions with the same type, if
+ // possible. This may give better results than vectorizing only
+ // instructions with the same/alternate opcodes.
+ // 3. Finally, try to vectorize all instructions with the same/alternate
+ // ops only; this may yield some extra final vectorization.
+ if (NumElts > 1 &&
+ TryToVectorize(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) {
+ // Success; start over because instructions might have been changed.
+ Changed = true;
+ } else if (NumElts < Limit(*IncIt) &&
+ (Candidates.empty() ||
+ Candidates.front()->getType() == (*IncIt)->getType())) {
+ Candidates.append(IncIt, std::next(IncIt, NumElts));
+ }
+ // Final attempt to vectorize instructions with the same types.
+ if (Candidates.size() > 1 &&
+ (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
+ if (TryToVectorize(Candidates, /*LimitForRegisterSize=*/false)) {
+ // Success; start over because instructions might have been changed.
+ Changed = true;
+ } else if (LimitForRegisterSize) {
+ // Try to vectorize using small vectors.
+ for (auto *It = Candidates.begin(), *End = Candidates.end();
+ It != End;) {
+ auto *SameTypeIt = It;
+ while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
+ ++SameTypeIt;
+ unsigned NumElts = (SameTypeIt - It);
+ if (NumElts > 1 && TryToVectorize(makeArrayRef(It, NumElts),
+ /*LimitForRegisterSize=*/false))
+ Changed = true;
+ It = SameTypeIt;
+ }
+ }
+ Candidates.clear();
+ }
+
+ // Start over at the next instruction of a different type (or the end).
+ IncIt = SameTypeIt;
+ }
+ return Changed;
+}
+
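tryToVectorizeSequence generalizes the pattern previously duplicated for PHIs and stores: sort, then walk maximal runs of mutually compatible elements and hand each run to a callback. The core grouping loop in isolation (standard C++; the register-size fallback and candidate accumulation are left out):

#include <vector>

template <typename T, typename CompatibleFn, typename TryFn>
bool forEachCompatibleRun(std::vector<T *> &Items, CompatibleFn AreCompatible,
                          TryFn TryToVectorize) {
  bool Changed = false;
  for (auto It = Items.begin(), End = Items.end(); It != End;) {
    auto RunEnd = It;
    while (RunEnd != End && AreCompatible(*RunEnd, *It))
      ++RunEnd;
    unsigned NumElts = RunEnd - It;
    if (NumElts > 1)
      Changed |= TryToVectorize(&*It, NumElts); // pointer + length of the run
    It = RunEnd; // continue at the first incompatible element
  }
  return Changed;
}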
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
@@ -8377,11 +9250,89 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// node. This makes it easier to identify the chains that can be vectorized
// in a better way.
DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
+ auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
+ assert(isValidElementType(V1->getType()) &&
+ isValidElementType(V2->getType()) &&
+ "Expected vectorizable types only.");
+ // It is fine to compare type IDs here, since we expect only vectorizable
+ // types, like ints, floats and pointers; we don't care about other types.
+ if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
+ return true;
+ if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
+ return false;
+ ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
+ ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
+ if (Opcodes1.size() < Opcodes2.size())
+ return true;
+ if (Opcodes1.size() > Opcodes2.size())
+ return false;
+ for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
+ // Undefs are compatible with any other value.
+ if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+ continue;
+ if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
+ if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
+ DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
+ DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
+ if (!NodeI1)
+ return NodeI2 != nullptr;
+ if (!NodeI2)
+ return false;
+ assert((NodeI1 == NodeI2) ==
+ (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
+ "Different nodes should have different DFS numbers");
+ if (NodeI1 != NodeI2)
+ return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
+ InstructionsState S = getSameOpcode({I1, I2});
+ if (S.getOpcode())
+ continue;
+ return I1->getOpcode() < I2->getOpcode();
+ }
+ if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+ continue;
+ if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
+ return true;
+ if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
+ return false;
+ }
+ return false;
+ };
+ auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {
+ if (V1 == V2)
+ return true;
+ if (V1->getType() != V2->getType())
+ return false;
+ ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
+ ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
+ if (Opcodes1.size() != Opcodes2.size())
+ return false;
+ for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
+ // Undefs are compatible with any other value.
+ if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+ continue;
+ if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
+ if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
+ if (I1->getParent() != I2->getParent())
+ return false;
+ InstructionsState S = getSameOpcode({I1, I2});
+ if (S.getOpcode())
+ continue;
+ return false;
+ }
+ if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+ continue;
+ if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
+ return false;
+ }
+ return true;
+ };
+ auto Limit = [&R](Value *V) {
+ unsigned EltSize = R.getVectorElementSize(V);
+ return std::max(2U, R.getMaxVecRegSize() / EltSize);
+ };
- bool HaveVectorizedPhiNodes = true;
- while (HaveVectorizedPhiNodes) {
- HaveVectorizedPhiNodes = false;
-
+ bool HaveVectorizedPhiNodes = false;
+ do {
// Collect the incoming values from the PHIs.
Incoming.clear();
for (Instruction &I : *BB) {
@@ -8419,132 +9370,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
}
- // Sort by type, parent, operands.
- stable_sort(Incoming, [this, &PHIToOpcodes](Value *V1, Value *V2) {
- assert(isValidElementType(V1->getType()) &&
- isValidElementType(V2->getType()) &&
- "Expected vectorizable types only.");
- // It is fine to compare type IDs here, since we expect only vectorizable
- // types, like ints, floats and pointers, we don't care about other type.
- if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
- return true;
- if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
- return false;
- ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
- ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
- if (Opcodes1.size() < Opcodes2.size())
- return true;
- if (Opcodes1.size() > Opcodes2.size())
- return false;
- for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
- // Undefs are compatible with any other value.
- if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
- continue;
- if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
- if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
- DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
- DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
- if (!NodeI1)
- return NodeI2 != nullptr;
- if (!NodeI2)
- return false;
- assert((NodeI1 == NodeI2) ==
- (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
- "Different nodes should have different DFS numbers");
- if (NodeI1 != NodeI2)
- return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
- InstructionsState S = getSameOpcode({I1, I2});
- if (S.getOpcode())
- continue;
- return I1->getOpcode() < I2->getOpcode();
- }
- if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
- continue;
- if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
- return true;
- if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
- return false;
- }
- return false;
- });
-
- auto &&AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {
- if (V1 == V2)
- return true;
- if (V1->getType() != V2->getType())
- return false;
- ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
- ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
- if (Opcodes1.size() != Opcodes2.size())
- return false;
- for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
- // Undefs are compatible with any other value.
- if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
- continue;
- if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
- if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
- if (I1->getParent() != I2->getParent())
- return false;
- InstructionsState S = getSameOpcode({I1, I2});
- if (S.getOpcode())
- continue;
- return false;
- }
- if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
- continue;
- if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
- return false;
- }
- return true;
- };
-
- // Try to vectorize elements base on their type.
- SmallVector<Value *, 4> Candidates;
- for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
- E = Incoming.end();
- IncIt != E;) {
-
- // Look for the next elements with the same type, parent and operand
- // kinds.
- SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
- while (SameTypeIt != E && AreCompatiblePHIs(*SameTypeIt, *IncIt)) {
- VisitedInstrs.insert(*SameTypeIt);
- ++SameTypeIt;
- }
-
- // Try to vectorize them.
- unsigned NumElts = (SameTypeIt - IncIt);
- LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
- << NumElts << ")\n");
- // The order in which the phi nodes appear in the program does not matter.
- // So allow tryToVectorizeList to reorder them if it is beneficial. This
- // is done when there are exactly two elements since tryToVectorizeList
- // asserts that there are only two values when AllowReorder is true.
- if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
- /*AllowReorder=*/true)) {
- // Success start over because instructions might have been changed.
- HaveVectorizedPhiNodes = true;
- Changed = true;
- } else if (NumElts < 4 &&
- (Candidates.empty() ||
- Candidates.front()->getType() == (*IncIt)->getType())) {
- Candidates.append(IncIt, std::next(IncIt, NumElts));
- }
- // Final attempt to vectorize phis with the same types.
- if (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType()) {
- if (Candidates.size() > 1 &&
- tryToVectorizeList(Candidates, R, /*AllowReorder=*/true)) {
- // Success start over because instructions might have been changed.
- HaveVectorizedPhiNodes = true;
- Changed = true;
- }
- Candidates.clear();
- }
-
- // Start over at the next instruction of a different type (or the end).
- IncIt = SameTypeIt;
- }
- }
+ HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
+ Incoming, Limit, PHICompare, AreCompatiblePHIs,
+ [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) {
+ return tryToVectorizeList(Candidates, R, LimitForRegisterSize);
+ },
+ /*LimitForRegisterSize=*/true);
+ Changed |= HaveVectorizedPhiNodes;
+ VisitedInstrs.insert(Incoming.begin(), Incoming.end());
+ } while (HaveVectorizedPhiNodes);
VisitedInstrs.clear();
@@ -8797,6 +9631,10 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
return V1->getValueOperand()->getValueID() ==
V2->getValueOperand()->getValueID();
};
+ auto Limit = [&R, this](StoreInst *SI) {
+ unsigned EltSize = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
+ return R.getMinVF(EltSize);
+ };
// Attempt to sort and vectorize each of the store-groups.
for (auto &Pair : Stores) {
@@ -8806,33 +9644,15 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
<< Pair.second.size() << ".\n");
- stable_sort(Pair.second, StoreSorter);
-
- // Try to vectorize elements based on their compatibility.
- for (ArrayRef<StoreInst *>::iterator IncIt = Pair.second.begin(),
- E = Pair.second.end();
- IncIt != E;) {
-
- // Look for the next elements with the same type.
- ArrayRef<StoreInst *>::iterator SameTypeIt = IncIt;
- Type *EltTy = (*IncIt)->getPointerOperand()->getType();
-
- while (SameTypeIt != E && AreCompatibleStores(*SameTypeIt, *IncIt))
- ++SameTypeIt;
-
- // Try to vectorize them.
- unsigned NumElts = (SameTypeIt - IncIt);
- LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at stores ("
- << NumElts << ")\n");
- if (NumElts > 1 && !EltTy->getPointerElementType()->isVectorTy() &&
- vectorizeStores(makeArrayRef(IncIt, NumElts), R)) {
- // Success start over because instructions might have been changed.
- Changed = true;
- }
+ if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
+ continue;
- // Start over at the next instruction of a different type (or the end).
- IncIt = SameTypeIt;
- }
+ Changed |= tryToVectorizeSequence<StoreInst>(
+ Pair.second, Limit, StoreSorter, AreCompatibleStores,
+ [this, &R](ArrayRef<StoreInst *> Candidates, bool) {
+ return vectorizeStores(Candidates, R);
+ },
+ /*LimitForRegisterSize=*/false);
}
return Changed;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 5f39fe1c17a3..638467f94e1c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -815,6 +815,28 @@ void VPlan::execute(VPTransformState *State) {
for (VPBlockBase *Block : depth_first(Entry))
Block->execute(State);
+ // Fix the latch value of reduction and first-order recurrences phis in the
+ // vector loop.
+ VPBasicBlock *Header = Entry->getEntryBasicBlock();
+ for (VPRecipeBase &R : Header->phis()) {
+ auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R);
+ if (!PhiR || !(isa<VPFirstOrderRecurrencePHIRecipe>(&R) ||
+ isa<VPReductionPHIRecipe>(&R)))
+ continue;
+ // For first-order recurrences and in-order reduction phis, only a single
+ // part is generated, which provides the last part from the previous
+ // iteration. Otherwise all UF parts are generated.
+ bool SinglePartNeeded = isa<VPFirstOrderRecurrencePHIRecipe>(&R) ||
+ cast<VPReductionPHIRecipe>(&R)->isOrdered();
+ unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
+ for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
+ Value *VecPhi = State->get(PhiR, Part);
+ Value *Val = State->get(PhiR->getBackedgeValue(),
+ SinglePartNeeded ? State->UF - 1 : Part);
+ cast<PHINode>(VecPhi)->addIncoming(Val, VectorLatchBB);
+ }
+ }
+
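For the latch fixup above, ordered reductions and first-order recurrences keep a single phi fed from the last unrolled part, while all other widened phis get one phi per part fed by the matching part. The part selection, spelled out as a small helper (UF stands for the unroll factor, as in State->UF):

// Which part of the backedge value feeds the phi created for part `Part`.
unsigned backedgeSourcePart(bool SinglePartNeeded, unsigned Part, unsigned UF) {
  return SinglePartNeeded ? UF - 1 : Part;
}
// With UF = 4: an ordered reduction creates one phi fed by part 3; a normal
// reduction creates four phis fed by parts 0, 1, 2 and 3 respectively.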
// Setup branch terminator successors for VPBBs in VPBBsToFix based on
// VPBB's successors.
for (auto VPBB : State->CFG.VPBBsToFix) {
@@ -862,6 +884,13 @@ void VPlan::print(raw_ostream &O) const {
VPSlotTracker SlotTracker(this);
O << "VPlan '" << Name << "' {";
+
+ if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
+ O << "\nLive-in ";
+ BackedgeTakenCount->printAsOperand(O, SlotTracker);
+ O << " = backedge-taken count\n";
+ }
+
for (const VPBlockBase *Block : depth_first(getEntry())) {
O << '\n';
Block->print(O, "", SlotTracker);
@@ -920,12 +949,12 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB,
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-const Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
+Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") +
Twine(getOrCreateBID(Block));
}
-const Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) {
+Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) {
const std::string &Name = Block->getName();
if (!Name.empty())
return Name;
@@ -1235,7 +1264,7 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
VF.isScalar() ? Indices.back() : ConstantVector::get(Indices);
// Add the consecutive indices to the vector value.
Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
- State.set(getVPSingleValue(), CanonicalVectorIV, Part);
+ State.set(this, CanonicalVectorIV, Part);
}
}
@@ -1243,7 +1272,7 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "EMIT ";
- getVPSingleValue()->printAsOperand(O, SlotTracker);
+ printAsOperand(O, SlotTracker);
O << " = WIDEN-CANONICAL-INDUCTION";
}
#endif
@@ -1306,12 +1335,16 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt());
State.set(this, EntryPart, Part);
}
+
+ // Reductions do not have to start at zero. They can start with
+ // any loop invariant values.
VPValue *StartVPV = getStartValue();
Value *StartV = StartVPV->getLiveInIRValue();
Value *Iden = nullptr;
RecurKind RK = RdxDesc.getRecurrenceKind();
- if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
+ RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) {
// MinMax reductions have the start value as their identity.
if (ScalarPHI) {
Iden = StartV;
@@ -1322,12 +1355,11 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
}
} else {
- Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
- RK, VecTy->getScalarType(), RdxDesc.getFastMathFlags());
- Iden = IdenC;
+ Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
+ RdxDesc.getFastMathFlags());
if (!ScalarPHI) {
- Iden = ConstantVector::getSplat(State.VF, IdenC);
+ Iden = Builder.CreateVectorSplat(State.VF, Iden);
IRBuilderBase::InsertPointGuard IPBuilder(Builder);
Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator());
Constant *Zero = Builder.getInt32(0);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index bdf09d15c27f..00ee31007cb7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1312,7 +1312,7 @@ public:
// The first operand is the address, followed by the stored values, followed
// by an optional mask.
return ArrayRef<VPValue *>(op_begin(), getNumOperands())
- .slice(1, getNumOperands() - (HasMask ? 2 : 1));
+ .slice(1, getNumStoreOperands());
}
/// Generate the wide load or store, and shuffles.
@@ -1325,6 +1325,12 @@ public:
#endif
const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
+
+ /// Returns the number of stored operands of this interleave group. Returns 0
+ /// for load interleave groups.
+ unsigned getNumStoreOperands() const {
+ return getNumOperands() - (HasMask ? 2 : 1);
+ }
};
/// A recipe to represent inloop reduction operations, performing a reduction on
@@ -1508,6 +1514,12 @@ public:
class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
Instruction &Ingredient;
+ // Whether the loaded-from / stored-to addresses are consecutive.
+ bool Consecutive;
+
+ // Whether the consecutive loaded/stored addresses are in reverse order.
+ bool Reverse;
+
void setMask(VPValue *Mask) {
if (!Mask)
return;
@@ -1519,16 +1531,21 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
}
public:
- VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask)
- : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load) {
+ VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
+ bool Consecutive, bool Reverse)
+ : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load),
+ Consecutive(Consecutive), Reverse(Reverse) {
+ assert((Consecutive || !Reverse) && "Reverse implies consecutive");
new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this);
setMask(Mask);
}
VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
- VPValue *StoredValue, VPValue *Mask)
+ VPValue *StoredValue, VPValue *Mask,
+ bool Consecutive, bool Reverse)
: VPRecipeBase(VPWidenMemoryInstructionSC, {Addr, StoredValue}),
- Ingredient(Store) {
+ Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) {
+ assert((Consecutive || !Reverse) && "Reverse implies consecutive");
setMask(Mask);
}
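A hedged usage sketch of the extended constructors (AddrVPV and MaskVPV are placeholder names): a consecutive, reversed widened load versus a gather. The new assert rejects Reverse without Consecutive, since "reverse" only makes sense for a contiguous access.

VPRecipeBase *ReverseLoad = new VPWidenMemoryInstructionRecipe(
    *Load, AddrVPV, MaskVPV, /*Consecutive=*/true, /*Reverse=*/true);
VPRecipeBase *Gather = new VPWidenMemoryInstructionRecipe(
    *Load, AddrVPV, MaskVPV, /*Consecutive=*/false, /*Reverse=*/false);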
@@ -1558,6 +1575,13 @@ public:
return getOperand(1); // Stored value is the 2nd, mandatory operand.
}
+ // Return whether the loaded-from / stored-to addresses are consecutive.
+ bool isConsecutive() const { return Consecutive; }
+
+ // Return whether the consecutive loaded/stored addresses are in reverse
+ // order.
+ bool isReverse() const { return Reverse; }
+
/// Generate the wide load/store.
void execute(VPTransformState &State) override;
@@ -1569,11 +1593,11 @@ public:
};
/// A Recipe for widening the canonical induction variable of the vector loop.
-class VPWidenCanonicalIVRecipe : public VPRecipeBase {
+class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue {
public:
- VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC, {}) {
- new VPValue(nullptr, this);
- }
+ VPWidenCanonicalIVRecipe()
+ : VPRecipeBase(VPWidenCanonicalIVSC, {}),
+ VPValue(VPValue::VPVWidenCanonicalIVSC, nullptr, this) {}
~VPWidenCanonicalIVRecipe() override = default;
@@ -2094,6 +2118,10 @@ class VPlan {
/// Holds the VPLoopInfo analysis for this VPlan.
VPLoopInfo VPLInfo;
+ /// Indicates whether it is safe to use the Value2VPValue mapping or if the
+ /// mapping cannot be used any longer, because it is stale.
+ bool Value2VPValueEnabled = true;
+
public:
VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {
if (Entry)
@@ -2135,6 +2163,10 @@ public:
return BackedgeTakenCount;
}
+ /// Mark the plan to indicate that using Value2VPValue is not safe any
+ /// longer, because it may be stale.
+ void disableValue2VPValue() { Value2VPValueEnabled = false; }
+
void addVF(ElementCount VF) { VFs.insert(VF); }
bool hasVF(ElementCount VF) { return VFs.count(VF); }
@@ -2148,6 +2180,8 @@ public:
void addExternalDef(VPValue *VPVal) { VPExternalDefs.insert(VPVal); }
void addVPValue(Value *V) {
+ assert(Value2VPValueEnabled &&
+ "IR value to VPValue mapping may be out of date!");
assert(V && "Trying to add a null Value to VPlan");
assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
VPValue *VPV = new VPValue(V);
@@ -2156,25 +2190,39 @@ public:
}
void addVPValue(Value *V, VPValue *VPV) {
+ assert(Value2VPValueEnabled && "Value2VPValue mapping may be out of date!");
assert(V && "Trying to add a null Value to VPlan");
assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
Value2VPValue[V] = VPV;
}
- VPValue *getVPValue(Value *V) {
+ /// Returns the VPValue for \p V. \p OverrideAllowed can be used to disable
+ /// checking whether it is safe to query VPValues using IR Values.
+ VPValue *getVPValue(Value *V, bool OverrideAllowed = false) {
+ assert((OverrideAllowed || isa<Constant>(V) || Value2VPValueEnabled) &&
+ "Value2VPValue mapping may be out of date!");
assert(V && "Trying to get the VPValue of a null Value");
assert(Value2VPValue.count(V) && "Value does not exist in VPlan");
return Value2VPValue[V];
}
- VPValue *getOrAddVPValue(Value *V) {
+ /// Gets the VPValue or adds a new one (if none exists yet) for \p V. \p
+ /// OverrideAllowed can be used to disable checking whether it is safe to
+ /// query VPValues using IR Values.
+ VPValue *getOrAddVPValue(Value *V, bool OverrideAllowed = false) {
+ assert((OverrideAllowed || isa<Constant>(V) || Value2VPValueEnabled) &&
+ "Value2VPValue mapping may be out of date!");
assert(V && "Trying to get or add the VPValue of a null Value");
if (!Value2VPValue.count(V))
addVPValue(V);
return getVPValue(V);
}
- void removeVPValueFor(Value *V) { Value2VPValue.erase(V); }
+ void removeVPValueFor(Value *V) {
+ assert(Value2VPValueEnabled &&
+ "IR value to VPValue mapping may be out of date!");
+ Value2VPValue.erase(V);
+ }
/// Return the VPLoopInfo analysis for this VPlan.
VPLoopInfo &getVPLoopInfo() { return VPLInfo; }
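A short sketch of the intended protocol for the new flag (Plan and IRValue are placeholders): once a transform makes the IR-to-VPValue mapping stale, it disables it, and any later lookup must either stop going through IR values or opt in explicitly.

Plan.disableValue2VPValue();   // the IR -> VPValue map may no longer be trusted
// ...
VPValue *VPV = Plan.getVPValue(IRValue, /*OverrideAllowed=*/true);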
@@ -2244,9 +2292,9 @@ class VPlanPrinter {
return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
}
- const Twine getOrCreateName(const VPBlockBase *Block);
+ Twine getOrCreateName(const VPBlockBase *Block);
- const Twine getUID(const VPBlockBase *Block);
+ Twine getUID(const VPBlockBase *Block);
/// Print the information related to a CFG edge between two VPBlockBases.
void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 52b5ae083d0e..ded5bc04beb5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -31,19 +31,18 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
VPBasicBlock *VPBB = Base->getEntryBasicBlock();
// Introduce each ingredient into VPlan.
- for (auto I = VPBB->begin(), E = VPBB->end(); I != E;) {
- VPRecipeBase *Ingredient = &*I++;
- VPValue *VPV = Ingredient->getVPSingleValue();
+ for (VPRecipeBase &Ingredient : llvm::make_early_inc_range(*VPBB)) {
+ VPValue *VPV = Ingredient.getVPSingleValue();
Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
if (DeadInstructions.count(Inst)) {
VPValue DummyValue;
VPV->replaceAllUsesWith(&DummyValue);
- Ingredient->eraseFromParent();
+ Ingredient.eraseFromParent();
continue;
}
VPRecipeBase *NewRecipe = nullptr;
- if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(Ingredient)) {
+ if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) {
auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
InductionDescriptor II = Inductions.lookup(Phi);
if (II.getKind() == InductionDescriptor::IK_IntInduction ||
@@ -55,25 +54,25 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
continue;
}
} else {
- assert(isa<VPInstruction>(Ingredient) &&
+ assert(isa<VPInstruction>(&Ingredient) &&
"only VPInstructions expected here");
assert(!isa<PHINode>(Inst) && "phis should be handled above");
// Create VPWidenMemoryInstructionRecipe for loads and stores.
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
NewRecipe = new VPWidenMemoryInstructionRecipe(
*Load, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
- nullptr /*Mask*/);
+ nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/);
} else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
NewRecipe = new VPWidenMemoryInstructionRecipe(
*Store, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
- Plan->getOrAddVPValue(Store->getValueOperand()),
- nullptr /*Mask*/);
+ Plan->getOrAddVPValue(Store->getValueOperand()), nullptr /*Mask*/,
+ false /*Consecutive*/, false /*Reverse*/);
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
NewRecipe = new VPWidenGEPRecipe(
GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop);
} else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
- NewRecipe = new VPWidenCallRecipe(
- *CI, Plan->mapToVPValues(CI->arg_operands()));
+ NewRecipe =
+ new VPWidenCallRecipe(*CI, Plan->mapToVPValues(CI->args()));
} else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
bool InvariantCond =
SE.isLoopInvariant(SE.getSCEV(SI->getOperand(0)), OrigLoop);
@@ -85,13 +84,13 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
}
}
- NewRecipe->insertBefore(Ingredient);
+ NewRecipe->insertBefore(&Ingredient);
if (NewRecipe->getNumDefinedValues() == 1)
VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
else
assert(NewRecipe->getNumDefinedValues() == 0 &&
"Only recpies with zero or one defined values expected");
- Ingredient->eraseFromParent();
+ Ingredient.eraseFromParent();
Plan->removeVPValueFor(Inst);
for (auto *Def : NewRecipe->definedValues()) {
Plan->addVPValue(Inst, Def);
@@ -106,44 +105,76 @@ bool VPlanTransforms::sinkScalarOperands(VPlan &Plan) {
bool Changed = false;
// First, collect the operands of all predicated replicate recipes as seeds
// for sinking.
- SetVector<VPValue *> WorkList;
+ SetVector<std::pair<VPBasicBlock *, VPValue *>> WorkList;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
for (auto &Recipe : *VPBB) {
auto *RepR = dyn_cast<VPReplicateRecipe>(&Recipe);
if (!RepR || !RepR->isPredicated())
continue;
- WorkList.insert(RepR->op_begin(), RepR->op_end());
+ for (VPValue *Op : RepR->operands())
+ WorkList.insert(std::make_pair(RepR->getParent(), Op));
}
}
// Try to sink each replicate recipe in the worklist.
while (!WorkList.empty()) {
- auto *C = WorkList.pop_back_val();
+ VPBasicBlock *SinkTo;
+ VPValue *C;
+ std::tie(SinkTo, C) = WorkList.pop_back_val();
auto *SinkCandidate = dyn_cast_or_null<VPReplicateRecipe>(C->Def);
- if (!SinkCandidate || SinkCandidate->isUniform())
- continue;
-
- // All users of SinkCandidate must be in the same block in order to perform
- // sinking. Therefore the destination block for sinking must match the block
- // containing the first user.
- auto *FirstUser = dyn_cast<VPRecipeBase>(*SinkCandidate->user_begin());
- if (!FirstUser)
- continue;
- VPBasicBlock *SinkTo = FirstUser->getParent();
- if (SinkCandidate->getParent() == SinkTo ||
+ if (!SinkCandidate || SinkCandidate->isUniform() ||
+ SinkCandidate->getParent() == SinkTo ||
SinkCandidate->mayHaveSideEffects() ||
SinkCandidate->mayReadOrWriteMemory())
continue;
- // All recipe users of the sink candidate must be in the same block SinkTo.
- if (any_of(SinkCandidate->users(), [SinkTo](VPUser *U) {
- auto *UI = dyn_cast<VPRecipeBase>(U);
- return !UI || UI->getParent() != SinkTo;
- }))
+ bool NeedsDuplicating = false;
+ // All recipe users of the sink candidate must be in the same block SinkTo
+ // or all users outside of SinkTo must be uniform-after-vectorization
+ // (i.e., only the first lane is used). In the latter case, we need to
+ // duplicate SinkCandidate. At the moment, we identify such UAVs by looking
+ // for the address operands of widened memory recipes.
+ auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
+ SinkCandidate](VPUser *U) {
+ auto *UI = dyn_cast<VPRecipeBase>(U);
+ if (!UI)
+ return false;
+ if (UI->getParent() == SinkTo)
+ return true;
+ auto *WidenI = dyn_cast<VPWidenMemoryInstructionRecipe>(UI);
+ if (WidenI && WidenI->getAddr() == SinkCandidate) {
+ NeedsDuplicating = true;
+ return true;
+ }
+ return false;
+ };
+ if (!all_of(SinkCandidate->users(), CanSinkWithUser))
continue;
+ if (NeedsDuplicating) {
+ Instruction *I = cast<Instruction>(SinkCandidate->getUnderlyingValue());
+ auto *Clone =
+ new VPReplicateRecipe(I, SinkCandidate->operands(), true, false);
+ // TODO: add ".cloned" suffix to name of Clone's VPValue.
+
+ Clone->insertBefore(SinkCandidate);
+ SmallVector<VPUser *, 4> Users(SinkCandidate->user_begin(),
+ SinkCandidate->user_end());
+ for (auto *U : Users) {
+ auto *UI = cast<VPRecipeBase>(U);
+ if (UI->getParent() == SinkTo)
+ continue;
+
+ for (unsigned Idx = 0; Idx != UI->getNumOperands(); Idx++) {
+ if (UI->getOperand(Idx) != SinkCandidate)
+ continue;
+ UI->setOperand(Idx, Clone);
+ }
+ }
+ }
SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
- WorkList.insert(SinkCandidate->op_begin(), SinkCandidate->op_end());
+ for (VPValue *Op : SinkCandidate->operands())
+ WorkList.insert(std::make_pair(SinkTo, Op));
Changed = true;
}
return Changed;
@@ -234,12 +265,15 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) {
for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
VPValue *PredInst1 =
cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
- for (VPUser *U : Phi1ToMove.getVPSingleValue()->users()) {
+ VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
+ SmallVector<VPUser *> Users(Phi1ToMoveV->user_begin(),
+ Phi1ToMoveV->user_end());
+ for (VPUser *U : Users) {
auto *UI = dyn_cast<VPRecipeBase>(U);
if (!UI || UI->getParent() != Then2)
continue;
for (unsigned I = 0, E = U->getNumOperands(); I != E; ++I) {
- if (Phi1ToMove.getVPSingleValue() != U->getOperand(I))
+ if (Phi1ToMoveV != U->getOperand(I))
continue;
U->setOperand(I, PredInst1);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 6eec8d14de4a..6d6ea4eb30f1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -128,3 +128,33 @@ void VPlanVerifier::verifyHierarchicalCFG(
assert(!TopRegion->getParent() && "VPlan Top Region should have no parent.");
verifyRegionRec(TopRegion);
}
+
+bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) {
+ auto Iter = depth_first(
+ VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(Plan.getEntry()));
+ for (const VPBasicBlock *VPBB :
+ VPBlockUtils::blocksOnly<const VPBasicBlock>(Iter)) {
+ // Verify that phi-like recipes are at the beginning of the block, with no
+ // other recipes in between.
+ auto RecipeI = VPBB->begin();
+ auto End = VPBB->end();
+ while (RecipeI != End && RecipeI->isPhi())
+ RecipeI++;
+
+ while (RecipeI != End) {
+ if (RecipeI->isPhi() && !isa<VPBlendRecipe>(&*RecipeI)) {
+ errs() << "Found phi-like recipe after non-phi recipe";
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ errs() << ": ";
+ RecipeI->dump();
+ errs() << "after\n";
+ std::prev(RecipeI)->dump();
+#endif
+ return false;
+ }
+ RecipeI++;
+ }
+ }
+ return true;
+}
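A plausible call site for the new checker, guarded the same way other VPlan invariants are asserted (the message is illustrative):

assert(VPlanVerifier::verifyPlanIsValid(*Plan) &&
       "phi-like recipes must come first in each VPBasicBlock");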
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.h b/llvm/lib/Transforms/Vectorize/VPlanVerifier.h
index 8e8de441648a..839c24e2c9f4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.h
@@ -26,6 +26,7 @@
namespace llvm {
class VPRegionBlock;
+class VPlan;
/// Struct with utility functions that can be used to check the consistency and
/// invariants of a VPlan, including the components of its H-CFG.
@@ -35,6 +36,12 @@ struct VPlanVerifier {
/// 1. Region/Block verification: Check the Region/Block verification
/// invariants for every region in the H-CFG.
void verifyHierarchicalCFG(const VPRegionBlock *TopRegion) const;
+
+ /// Verify invariants for general VPlans. Currently it checks the following:
+ /// 1. All phi-like recipes must be at the beginning of a block, with no other
+ /// recipes in between. Note that currently there is still an exception for
+ /// VPBlendRecipes.
+ static bool verifyPlanIsValid(const VPlan &Plan);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index d18bcd34620c..57b11e9414ba 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -31,10 +31,12 @@
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
+#define DEBUG_TYPE "vector-combine"
+#include "llvm/Transforms/Utils/InstructionWorklist.h"
+
using namespace llvm;
using namespace llvm::PatternMatch;
-#define DEBUG_TYPE "vector-combine"
STATISTIC(NumVecLoad, "Number of vector loads formed");
STATISTIC(NumVecCmp, "Number of vector compares formed");
STATISTIC(NumVecBO, "Number of vector binops formed");
@@ -61,8 +63,10 @@ namespace {
class VectorCombine {
public:
VectorCombine(Function &F, const TargetTransformInfo &TTI,
- const DominatorTree &DT, AAResults &AA, AssumptionCache &AC)
- : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC) {}
+ const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
+ bool ScalarizationOnly)
+ : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC),
+ ScalarizationOnly(ScalarizationOnly) {}
bool run();
@@ -74,12 +78,18 @@ private:
AAResults &AA;
AssumptionCache &AC;
+ /// If true, only perform scalarization combines and do not introduce new
+ /// vector operations.
+ bool ScalarizationOnly;
+
+ InstructionWorklist Worklist;
+
bool vectorizeLoadInsert(Instruction &I);
ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
ExtractElementInst *Ext1,
unsigned PreferredExtractIndex) const;
bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
- unsigned Opcode,
+ const Instruction &I,
ExtractElementInst *&ConvertToShuffle,
unsigned PreferredExtractIndex);
void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
@@ -92,14 +102,27 @@ private:
bool foldExtractedCmps(Instruction &I);
bool foldSingleElementStore(Instruction &I);
bool scalarizeLoadExtract(Instruction &I);
+ bool foldShuffleOfBinops(Instruction &I);
+
+ void replaceValue(Value &Old, Value &New) {
+ Old.replaceAllUsesWith(&New);
+ New.takeName(&Old);
+ if (auto *NewI = dyn_cast<Instruction>(&New)) {
+ Worklist.pushUsersToWorkList(*NewI);
+ Worklist.pushValue(NewI);
+ }
+ Worklist.pushValue(&Old);
+ }
+
+ void eraseInstruction(Instruction &I) {
+ for (Value *Op : I.operands())
+ Worklist.pushValue(Op);
+ Worklist.remove(&I);
+ I.eraseFromParent();
+ }
};
} // namespace
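Editor's note, not part of the patch: the file-local replaceValue helper removed just below is superseded by the member helpers above, which also re-seed the worklist. A hedged sketch of how a fold uses them, where I and NewV stand for the instruction being folded and its replacement:

  replaceValue(I, *NewV);  // RAUW, then push NewV, its users, and I for revisiting
  eraseInstruction(I);     // push I's operands, drop I from the worklist, erase I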
-static void replaceValue(Value &Old, Value &New) {
- Old.replaceAllUsesWith(&New);
- New.takeName(&Old);
-}
-
bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// Match insert into fixed vector of scalar value.
// TODO: Handle non-zero insert index.
@@ -284,12 +307,13 @@ ExtractElementInst *VectorCombine::getShuffleExtract(
/// \p ConvertToShuffle to that extract instruction.
bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
ExtractElementInst *Ext1,
- unsigned Opcode,
+ const Instruction &I,
ExtractElementInst *&ConvertToShuffle,
unsigned PreferredExtractIndex) {
assert(isa<ConstantInt>(Ext0->getOperand(1)) &&
isa<ConstantInt>(Ext1->getOperand(1)) &&
"Expected constant extract indexes");
+ unsigned Opcode = I.getOpcode();
Type *ScalarTy = Ext0->getType();
auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
InstructionCost ScalarOpCost, VectorOpCost;
@@ -302,10 +326,11 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
} else {
assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
"Expected a compare");
- ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy,
- CmpInst::makeCmpResultType(ScalarTy));
- VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy,
- CmpInst::makeCmpResultType(VecTy));
+ CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
+ ScalarOpCost = TTI.getCmpSelInstrCost(
+ Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred);
+ VectorOpCost = TTI.getCmpSelInstrCost(
+ Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred);
}
// Get cost estimates for the extract elements. These costs will factor into
@@ -480,8 +505,7 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
ExtractElementInst *ExtractToChange;
- if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), ExtractToChange,
- InsertIndex))
+ if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
return false;
if (ExtractToChange) {
@@ -501,6 +525,8 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
else
foldExtExtBinop(Ext0, Ext1, I);
+ Worklist.push(Ext0);
+ Worklist.push(Ext1);
return true;
}
@@ -623,8 +649,11 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
unsigned Opcode = I.getOpcode();
InstructionCost ScalarOpCost, VectorOpCost;
if (IsCmp) {
- ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy);
- VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy);
+ CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
+ ScalarOpCost = TTI.getCmpSelInstrCost(
+ Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred);
+ VectorOpCost = TTI.getCmpSelInstrCost(
+ Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred);
} else {
ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
@@ -724,7 +753,10 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
InstructionCost OldCost =
TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
- OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2;
+ OldCost +=
+ TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(),
+ CmpInst::makeCmpResultType(I0->getType()), Pred) *
+ 2;
OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());
// The proposed vector pattern is:
@@ -733,7 +765,8 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
- InstructionCost NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType());
+ InstructionCost NewCost = TTI.getCmpSelInstrCost(
+ CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred);
SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
ShufMask[CheapIndex] = ExpensiveIndex;
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
@@ -774,18 +807,98 @@ static bool isMemModifiedBetween(BasicBlock::iterator Begin,
});
}
+/// Helper class to indicate whether a vector index can be safely scalarized and
+/// if a freeze needs to be inserted.
+class ScalarizationResult {
+ enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
+
+ StatusTy Status;
+ Value *ToFreeze;
+
+ ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
+ : Status(Status), ToFreeze(ToFreeze) {}
+
+public:
+ ScalarizationResult(const ScalarizationResult &Other) = default;
+ ~ScalarizationResult() {
+ assert(!ToFreeze && "freeze() not called with ToFreeze being set");
+ }
+
+ static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
+ static ScalarizationResult safe() { return {StatusTy::Safe}; }
+ static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
+ return {StatusTy::SafeWithFreeze, ToFreeze};
+ }
+
+ /// Returns true if the index can be scalarized without requiring a freeze.
+ bool isSafe() const { return Status == StatusTy::Safe; }
+ /// Returns true if the index cannot be scalarized.
+ bool isUnsafe() const { return Status == StatusTy::Unsafe; }
+ /// Returns true if the index can be scalarized, but requires inserting a
+ /// freeze.
+ bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
+
+ /// Reset the state to Unsafe and clear ToFreeze if set.
+ void discard() {
+ ToFreeze = nullptr;
+ Status = StatusTy::Unsafe;
+ }
+
+ /// Freeze ToFreeze and update the use in \p UserI to use it.
+ void freeze(IRBuilder<> &Builder, Instruction &UserI) {
+ assert(isSafeWithFreeze() &&
+ "should only be used when freezing is required");
+ assert(is_contained(ToFreeze->users(), &UserI) &&
+ "UserI must be a user of ToFreeze");
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(cast<Instruction>(&UserI));
+ Value *Frozen =
+ Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
+ for (Use &U : make_early_inc_range((UserI.operands())))
+ if (U.get() == ToFreeze)
+ U.set(Frozen);
+
+ ToFreeze = nullptr;
+ }
+};
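Editor's sketch (not part of the patch) of the intended consumption pattern inside a bool-returning fold; it mirrors the foldSingleElementStore changes further down and assumes VecTy, Idx, Load, AC, DT and Builder are in scope as they are there.

  auto SR = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
  if (SR.isUnsafe())
    return false;                                // index may be out of bounds
  if (SR.isSafeWithFreeze())
    SR.freeze(Builder, *cast<Instruction>(Idx)); // freeze the possibly-poison base
  // Either safe outright or frozen: the vector access can now be scalarized.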
+
/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
/// Idx. \p Idx must access a valid vector element.
-static bool canScalarizeAccess(FixedVectorType *VecTy, Value *Idx,
- Instruction *CtxI, AssumptionCache &AC) {
- if (auto *C = dyn_cast<ConstantInt>(Idx))
- return C->getValue().ult(VecTy->getNumElements());
+static ScalarizationResult canScalarizeAccess(FixedVectorType *VecTy,
+ Value *Idx, Instruction *CtxI,
+ AssumptionCache &AC,
+ const DominatorTree &DT) {
+ if (auto *C = dyn_cast<ConstantInt>(Idx)) {
+ if (C->getValue().ult(VecTy->getNumElements()))
+ return ScalarizationResult::safe();
+ return ScalarizationResult::unsafe();
+ }
- APInt Zero(Idx->getType()->getScalarSizeInBits(), 0);
- APInt MaxElts(Idx->getType()->getScalarSizeInBits(), VecTy->getNumElements());
+ unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
+ APInt Zero(IntWidth, 0);
+ APInt MaxElts(IntWidth, VecTy->getNumElements());
ConstantRange ValidIndices(Zero, MaxElts);
- ConstantRange IdxRange = computeConstantRange(Idx, true, &AC, CtxI, 0);
- return ValidIndices.contains(IdxRange);
+ ConstantRange IdxRange(IntWidth, true);
+
+ if (isGuaranteedNotToBePoison(Idx, &AC)) {
+ if (ValidIndices.contains(computeConstantRange(Idx, true, &AC, CtxI, &DT)))
+ return ScalarizationResult::safe();
+ return ScalarizationResult::unsafe();
+ }
+
+ // If the index may be poison, check if we can insert a freeze before the
+ // range of the index is restricted.
+ Value *IdxBase;
+ ConstantInt *CI;
+ if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
+ IdxRange = IdxRange.binaryAnd(CI->getValue());
+ } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
+ IdxRange = IdxRange.urem(CI->getValue());
+ }
+
+ if (ValidIndices.contains(IdxRange))
+ return ScalarizationResult::safeWithFreeze(IdxBase);
+ return ScalarizationResult::unsafe();
}
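Editor's sketch (not part of the patch) of the range reasoning above, for an index of the form IdxBase & 3 over a four-element vector; the 64-bit width and the constants are illustrative assumptions.

  ConstantRange Valid(APInt(64, 0), APInt(64, 4));   // indices 0..3 are valid
  ConstantRange MaskedIdx =
      ConstantRange(64, /*isFullSet=*/true).binaryAnd(ConstantRange(APInt(64, 3)));
  assert(Valid.contains(MaskedIdx) &&
         "masking with 3 keeps the index in range; only a freeze of IdxBase is "
         "needed to rule out poison");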
/// The memory operation on a vector of \p ScalarType had alignment of
@@ -833,12 +946,17 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
// modified between, vector type matches store size, and index is inbounds.
if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
!DL.typeSizeEqualsStoreSize(Load->getType()) ||
- !canScalarizeAccess(VecTy, Idx, Load, AC) ||
- SrcAddr != SI->getPointerOperand()->stripPointerCasts() ||
+ SrcAddr != SI->getPointerOperand()->stripPointerCasts())
+ return false;
+
+ auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
+ if (ScalarizableIdx.isUnsafe() ||
isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
MemoryLocation::get(SI), AA))
return false;
+ if (ScalarizableIdx.isSafeWithFreeze())
+ ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
Value *GEP = Builder.CreateInBoundsGEP(
SI->getValueOperand()->getType(), SI->getPointerOperand(),
{ConstantInt::get(Idx->getType(), 0), Idx});
@@ -849,8 +967,7 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
DL);
NSI->setAlignment(ScalarOpAlignment);
replaceValue(I, *NSI);
- // Need erasing the store manually.
- I.eraseFromParent();
+ eraseInstruction(I);
return true;
}
@@ -860,11 +977,10 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
/// Try to scalarize vector loads feeding extractelement instructions.
bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
Value *Ptr;
- Value *Idx;
- if (!match(&I, m_ExtractElt(m_Load(m_Value(Ptr)), m_Value(Idx))))
+ if (!match(&I, m_Load(m_Value(Ptr))))
return false;
- auto *LI = cast<LoadInst>(I.getOperand(0));
+ auto *LI = cast<LoadInst>(&I);
const DataLayout &DL = I.getModule()->getDataLayout();
if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(LI->getType()))
return false;
@@ -909,8 +1025,12 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
else if (LastCheckedInst->comesBefore(UI))
LastCheckedInst = UI;
- if (!canScalarizeAccess(FixedVT, UI->getOperand(1), &I, AC))
+ auto ScalarIdx = canScalarizeAccess(FixedVT, UI->getOperand(1), &I, AC, DT);
+ if (!ScalarIdx.isSafe()) {
+ // TODO: Freeze index if it is safe to do so.
+ ScalarIdx.discard();
return false;
+ }
auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
OriginalCost +=
@@ -946,6 +1066,60 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
return true;
}
+/// Try to convert "shuffle (binop), (binop)" with a shared binop operand into
+/// "binop (shuffle), (shuffle)".
+bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
+ auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
+ if (!VecTy)
+ return false;
+
+ BinaryOperator *B0, *B1;
+ ArrayRef<int> Mask;
+ if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)),
+ m_Mask(Mask))) ||
+ B0->getOpcode() != B1->getOpcode() || B0->getType() != VecTy)
+ return false;
+
+ // Try to replace a binop with a shuffle if the shuffle is not costly.
+ // The new shuffle will choose from a single, common operand, so it may be
+ // cheaper than the existing two-operand shuffle.
+ SmallVector<int> UnaryMask = createUnaryMask(Mask, Mask.size());
+ Instruction::BinaryOps Opcode = B0->getOpcode();
+ InstructionCost BinopCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
+ InstructionCost ShufCost = TTI.getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy, UnaryMask);
+ if (ShufCost > BinopCost)
+ return false;
+
+ // If we have something like "add X, Y" and "add Z, X", swap ops to match.
+ Value *X = B0->getOperand(0), *Y = B0->getOperand(1);
+ Value *Z = B1->getOperand(0), *W = B1->getOperand(1);
+ if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W)
+ std::swap(X, Y);
+
+ Value *Shuf0, *Shuf1;
+ if (X == Z) {
+ // shuf (bo X, Y), (bo X, W) --> bo (shuf X), (shuf Y, W)
+ Shuf0 = Builder.CreateShuffleVector(X, UnaryMask);
+ Shuf1 = Builder.CreateShuffleVector(Y, W, Mask);
+ } else if (Y == W) {
+ // shuf (bo X, Y), (bo Z, Y) --> bo (shuf X, Z), (shuf Y)
+ Shuf0 = Builder.CreateShuffleVector(X, Z, Mask);
+ Shuf1 = Builder.CreateShuffleVector(Y, UnaryMask);
+ } else {
+ return false;
+ }
+
+ Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1);
+ // Intersect flags from the old binops.
+ if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
+ NewInst->copyIRFlags(B0);
+ NewInst->andIRFlags(B1);
+ }
+ replaceValue(I, *NewBO);
+ return true;
+}
+
/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
bool VectorCombine::run() {
@@ -957,29 +1131,43 @@ bool VectorCombine::run() {
return false;
bool MadeChange = false;
+ auto FoldInst = [this, &MadeChange](Instruction &I) {
+ Builder.SetInsertPoint(&I);
+ if (!ScalarizationOnly) {
+ MadeChange |= vectorizeLoadInsert(I);
+ MadeChange |= foldExtractExtract(I);
+ MadeChange |= foldBitcastShuf(I);
+ MadeChange |= foldExtractedCmps(I);
+ MadeChange |= foldShuffleOfBinops(I);
+ }
+ MadeChange |= scalarizeBinopOrCmp(I);
+ MadeChange |= scalarizeLoadExtract(I);
+ MadeChange |= foldSingleElementStore(I);
+ };
for (BasicBlock &BB : F) {
// Ignore unreachable basic blocks.
if (!DT.isReachableFromEntry(&BB))
continue;
// Use early increment range so that we can erase instructions in loop.
for (Instruction &I : make_early_inc_range(BB)) {
- if (isa<DbgInfoIntrinsic>(I))
+ if (I.isDebugOrPseudoInst())
continue;
- Builder.SetInsertPoint(&I);
- MadeChange |= vectorizeLoadInsert(I);
- MadeChange |= foldExtractExtract(I);
- MadeChange |= foldBitcastShuf(I);
- MadeChange |= scalarizeBinopOrCmp(I);
- MadeChange |= foldExtractedCmps(I);
- MadeChange |= scalarizeLoadExtract(I);
- MadeChange |= foldSingleElementStore(I);
+ FoldInst(I);
}
}
- // We're done with transforms, so remove dead instructions.
- if (MadeChange)
- for (BasicBlock &BB : F)
- SimplifyInstructionsInBlock(&BB);
+ while (!Worklist.isEmpty()) {
+ Instruction *I = Worklist.removeOne();
+ if (!I)
+ continue;
+
+ if (isInstructionTriviallyDead(I)) {
+ eraseInstruction(*I);
+ continue;
+ }
+
+ FoldInst(*I);
+ }
return MadeChange;
}
@@ -1014,7 +1202,7 @@ public:
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
- VectorCombine Combiner(F, TTI, DT, AA, AC);
+ VectorCombine Combiner(F, TTI, DT, AA, AC, false);
return Combiner.run();
}
};
@@ -1038,7 +1226,7 @@ PreservedAnalyses VectorCombinePass::run(Function &F,
TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
AAResults &AA = FAM.getResult<AAManager>(F);
- VectorCombine Combiner(F, TTI, DT, AA, AC);
+ VectorCombine Combiner(F, TTI, DT, AA, AC, ScalarizationOnly);
if (!Combiner.run())
return PreservedAnalyses::all();
PreservedAnalyses PA;
diff --git a/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp b/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp
index 6af7bc699d05..1be1d34417eb 100644
--- a/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp
+++ b/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp
@@ -35,7 +35,7 @@ void WindowsManifestError::log(raw_ostream &OS) const { OS << Msg; }
class WindowsManifestMerger::WindowsManifestMergerImpl {
public:
~WindowsManifestMergerImpl();
- Error merge(const MemoryBuffer &Manifest);
+ Error merge(MemoryBufferRef Manifest);
std::unique_ptr<MemoryBuffer> getMergedManifest();
private:
@@ -620,7 +620,7 @@ WindowsManifestMerger::WindowsManifestMergerImpl::~WindowsManifestMergerImpl() {
}
Error WindowsManifestMerger::WindowsManifestMergerImpl::merge(
- const MemoryBuffer &Manifest) {
+ MemoryBufferRef Manifest) {
if (Merged)
return make_error<WindowsManifestError>(
"merge after getMergedManifest is not supported");
@@ -690,7 +690,7 @@ WindowsManifestMerger::WindowsManifestMergerImpl::~WindowsManifestMergerImpl() {
}
Error WindowsManifestMerger::WindowsManifestMergerImpl::merge(
- const MemoryBuffer &Manifest) {
+ MemoryBufferRef Manifest) {
return make_error<WindowsManifestError>("no libxml2");
}
@@ -708,7 +708,7 @@ WindowsManifestMerger::WindowsManifestMerger()
WindowsManifestMerger::~WindowsManifestMerger() {}
-Error WindowsManifestMerger::merge(const MemoryBuffer &Manifest) {
+Error WindowsManifestMerger::merge(MemoryBufferRef Manifest) {
return Impl->merge(Manifest);
}
diff --git a/llvm/lib/XRay/InstrumentationMap.cpp b/llvm/lib/XRay/InstrumentationMap.cpp
index e6534e5a7be7..c60efa465bb6 100644
--- a/llvm/lib/XRay/InstrumentationMap.cpp
+++ b/llvm/lib/XRay/InstrumentationMap.cpp
@@ -86,10 +86,8 @@ loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
"Failed to find XRay instrumentation map.",
std::make_error_code(std::errc::executable_format_error));
- if (Expected<StringRef> E = I->getContents())
- Contents = *E;
- else
- return E.takeError();
+ if (Error E = I->getContents().moveInto(Contents))
+ return E;
RelocMap Relocs;
if (ObjFile.getBinary()->isELF()) {
@@ -190,7 +188,7 @@ loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
SledEntry::FunctionKinds::TAIL,
SledEntry::FunctionKinds::LOG_ARGS_ENTER,
SledEntry::FunctionKinds::CUSTOM_EVENT};
- if (Kind >= sizeof(Kinds))
+ if (Kind >= sizeof(Kinds) / sizeof(Kinds[0]))
return errorCodeToError(
std::make_error_code(std::errc::executable_format_error));
Entry.Kind = Kinds[Kind];
diff --git a/llvm/tools/bugpoint/CrashDebugger.cpp b/llvm/tools/bugpoint/CrashDebugger.cpp
index 2601ee318f7d..451e1cd98ee8 100644
--- a/llvm/tools/bugpoint/CrashDebugger.cpp
+++ b/llvm/tools/bugpoint/CrashDebugger.cpp
@@ -269,7 +269,7 @@ bool ReduceCrashingFunctions::TestFuncs(std::vector<Function *> &Funcs) {
std::vector<GlobalValue *> ToRemove;
// First, remove aliases to functions we're about to purge.
for (GlobalAlias &Alias : M->aliases()) {
- GlobalObject *Root = Alias.getBaseObject();
+ GlobalObject *Root = Alias.getAliaseeObject();
Function *F = dyn_cast_or_null<Function>(Root);
if (F) {
if (Functions.count(F))
@@ -358,8 +358,7 @@ bool ReduceCrashingFunctionAttributes::TestFuncAttrs(
for (auto A : Attrs)
AB.addAttribute(A);
AttributeList NewAttrs;
- NewAttrs =
- NewAttrs.addAttributes(BD.getContext(), AttributeList::FunctionIndex, AB);
+ NewAttrs = NewAttrs.addFnAttributes(BD.getContext(), AB);
// Set this new list of attributes on the function.
F->setAttributes(NewAttrs);
@@ -375,7 +374,7 @@ bool ReduceCrashingFunctionAttributes::TestFuncAttrs(
// Pass along the set of attributes that caused the crash.
Attrs.clear();
- for (Attribute A : NewAttrs.getFnAttributes()) {
+ for (Attribute A : NewAttrs.getFnAttrs()) {
Attrs.push_back(A);
}
return true;
@@ -787,14 +786,13 @@ bool ReduceCrashingInstructions::TestInsts(
for (Module::iterator MI = M->begin(), ME = M->end(); MI != ME; ++MI)
for (Function::iterator FI = MI->begin(), FE = MI->end(); FI != FE; ++FI)
- for (BasicBlock::iterator I = FI->begin(), E = FI->end(); I != E;) {
- Instruction *Inst = &*I++;
- if (!Instructions.count(Inst) && !Inst->isTerminator() &&
- !Inst->isEHPad() && !Inst->getType()->isTokenTy() &&
- !Inst->isSwiftError()) {
- if (!Inst->getType()->isVoidTy())
- Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
- Inst->eraseFromParent();
+ for (Instruction &Inst : llvm::make_early_inc_range(*FI)) {
+ if (!Instructions.count(&Inst) && !Inst.isTerminator() &&
+ !Inst.isEHPad() && !Inst.getType()->isTokenTy() &&
+ !Inst.isSwiftError()) {
+ if (!Inst.getType()->isVoidTy())
+ Inst.replaceAllUsesWith(UndefValue::get(Inst.getType()));
+ Inst.eraseFromParent();
}
}
@@ -1232,7 +1230,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
assert(Fn && "Could not find function?");
std::vector<Attribute> Attrs;
- for (Attribute A : Fn->getAttributes().getFnAttributes())
+ for (Attribute A : Fn->getAttributes().getFnAttrs())
Attrs.push_back(A);
OldSize += Attrs.size();
diff --git a/llvm/tools/bugpoint/OptimizerDriver.cpp b/llvm/tools/bugpoint/OptimizerDriver.cpp
index ca78735202fc..848baf90965b 100644
--- a/llvm/tools/bugpoint/OptimizerDriver.cpp
+++ b/llvm/tools/bugpoint/OptimizerDriver.cpp
@@ -223,8 +223,8 @@ bool BugDriver::runPasses(Module &Program,
for (std::vector<std::string>::const_iterator I = pass_args.begin(),
E = pass_args.end();
I != E; ++I)
- Args.push_back(I->c_str());
- Args.push_back(Temp->TmpName.c_str());
+ Args.push_back(*I);
+ Args.push_back(Temp->TmpName);
Args.append(ExtraArgs.begin(), ExtraArgs.end());
LLVM_DEBUG(errs() << "\nAbout to run:\t";
diff --git a/llvm/tools/bugpoint/ToolRunner.cpp b/llvm/tools/bugpoint/ToolRunner.cpp
index b81ab07980dd..d3111e574e7c 100644
--- a/llvm/tools/bugpoint/ToolRunner.cpp
+++ b/llvm/tools/bugpoint/ToolRunner.cpp
@@ -192,7 +192,7 @@ Expected<int> LLI::ExecuteProgram(const std::string &Bitcode,
outs() << "<lli>";
outs().flush();
LLVM_DEBUG(errs() << "\nAbout to run:\t";
- for (unsigned i = 0, e = LLIArgs.size() - 1; i != e; ++i) errs()
+ for (unsigned i = 0, e = LLIArgs.size(); i != e; ++i) errs()
<< " " << LLIArgs[i];
errs() << "\n";);
return RunProgramWithTimeout(LLIPath, LLIArgs, InputFile, OutputFile,
@@ -460,7 +460,7 @@ Expected<CC::FileType> LLC::OutputCode(const std::string &Bitcode,
outs() << (UseIntegratedAssembler ? "<llc-ia>" : "<llc>");
outs().flush();
LLVM_DEBUG(errs() << "\nAbout to run:\t";
- for (unsigned i = 0, e = LLCArgs.size() - 1; i != e; ++i) errs()
+ for (unsigned i = 0, e = LLCArgs.size(); i != e; ++i) errs()
<< " " << LLCArgs[i];
errs() << "\n";);
if (RunProgramWithTimeout(LLCPath, LLCArgs, "", "", "", Timeout, MemoryLimit))
@@ -578,7 +578,7 @@ Expected<int> JIT::ExecuteProgram(const std::string &Bitcode,
outs() << "<jit>";
outs().flush();
LLVM_DEBUG(errs() << "\nAbout to run:\t";
- for (unsigned i = 0, e = JITArgs.size() - 1; i != e; ++i) errs()
+ for (unsigned i = 0, e = JITArgs.size(); i != e; ++i) errs()
<< " " << JITArgs[i];
errs() << "\n";);
LLVM_DEBUG(errs() << "\nSending output to " << OutputFile << "\n");
@@ -685,7 +685,7 @@ Expected<int> CC::ExecuteProgram(const std::string &ProgramFile,
outs() << "<CC>";
outs().flush();
LLVM_DEBUG(errs() << "\nAbout to run:\t";
- for (unsigned i = 0, e = CCArgs.size() - 1; i != e; ++i) errs()
+ for (unsigned i = 0, e = CCArgs.size(); i != e; ++i) errs()
<< " " << CCArgs[i];
errs() << "\n";);
if (RunProgramWithTimeout(CCPath, CCArgs, "", "", ""))
@@ -733,7 +733,7 @@ Expected<int> CC::ExecuteProgram(const std::string &ProgramFile,
outs().flush();
LLVM_DEBUG(
errs() << "\nAbout to run:\t";
- for (unsigned i = 0, e = ProgramArgs.size() - 1; i != e; ++i) errs()
+ for (unsigned i = 0, e = ProgramArgs.size(); i != e; ++i) errs()
<< " " << ProgramArgs[i];
errs() << "\n";);
@@ -829,7 +829,7 @@ Error CC::MakeSharedObject(const std::string &InputFile, FileType fileType,
outs() << "<CC>";
outs().flush();
LLVM_DEBUG(errs() << "\nAbout to run:\t";
- for (unsigned i = 0, e = CCArgs.size() - 1; i != e; ++i) errs()
+ for (unsigned i = 0, e = CCArgs.size(); i != e; ++i) errs()
<< " " << CCArgs[i];
errs() << "\n";);
if (RunProgramWithTimeout(CCPath, CCArgs, "", "", ""))
diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp
index 6a1e2bae2096..9d80f062c8f9 100644
--- a/llvm/tools/llc/llc.cpp
+++ b/llvm/tools/llc/llc.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/CodeGen/CommandFlags.h"
@@ -36,6 +37,7 @@
#include "llvm/IRReader/IRReader.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
#include "llvm/Remarks/HotnessThresholdParser.h"
#include "llvm/Support/CommandLine.h"
@@ -47,8 +49,8 @@
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/PluginLoader.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/TimeProfiler.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Support/WithColor.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -82,6 +84,19 @@ TimeCompilations("time-compilations", cl::Hidden, cl::init(1u),
cl::value_desc("N"),
cl::desc("Repeat compilation N times for timing"));
+static cl::opt<bool> TimeTrace("time-trace", cl::desc("Record time trace"));
+
+static cl::opt<unsigned> TimeTraceGranularity(
+ "time-trace-granularity",
+ cl::desc(
+ "Minimum time granularity (in microseconds) traced by time profiler"),
+ cl::init(500), cl::Hidden);
+
+static cl::opt<std::string>
+ TimeTraceFile("time-trace-file",
+ cl::desc("Specify time trace file destination"),
+ cl::value_desc("filename"));
+
static cl::opt<std::string>
BinutilsVersion("binutils-version", cl::Hidden,
cl::desc("Produced object files can use all ELF features "
@@ -201,8 +216,7 @@ static cl::opt<RunPassOption, true, cl::parser<std::string>> RunPass(
static int compileModule(char **, LLVMContext &);
-LLVM_ATTRIBUTE_NORETURN static void reportError(Twine Msg,
- StringRef Filename = "") {
+[[noreturn]] static void reportError(Twine Msg, StringRef Filename = "") {
SmallString<256> Prefix;
if (!Filename.empty()) {
if (Filename == "-")
@@ -213,7 +227,7 @@ LLVM_ATTRIBUTE_NORETURN static void reportError(Twine Msg,
exit(1);
}
-LLVM_ATTRIBUTE_NORETURN static void reportError(Error Err, StringRef Filename) {
+[[noreturn]] static void reportError(Error Err, StringRef Filename) {
assert(Err);
handleAllErrors(createFileError(Filename, std::move(Err)),
[&](const ErrorInfoBase &EI) { reportError(EI.message()); });
@@ -330,8 +344,6 @@ int main(int argc, char **argv) {
// Enable debug stream buffering.
EnableDebugBuffering = true;
- LLVMContext Context;
-
// Initialize targets first, so that --version shows registered targets.
InitializeAllTargets();
InitializeAllTargetMCs();
@@ -366,6 +378,21 @@ int main(int argc, char **argv) {
cl::ParseCommandLineOptions(argc, argv, "llvm system compiler\n");
+ if (TimeTrace)
+ timeTraceProfilerInitialize(TimeTraceGranularity, argv[0]);
+ auto TimeTraceScopeExit = make_scope_exit([]() {
+ if (TimeTrace) {
+ if (auto E = timeTraceProfilerWrite(TimeTraceFile, OutputFilename)) {
+ handleAllErrors(std::move(E), [&](const StringError &SE) {
+ errs() << SE.getMessage() << "\n";
+ });
+ return;
+ }
+ timeTraceProfilerCleanup();
+ }
+ });
+
+ LLVMContext Context;
Context.setDiscardValueNames(DiscardValueNames);
// Set a diagnostic handler that doesn't exit on the first error
diff --git a/llvm/tools/lli/ChildTarget/ChildTarget.cpp b/llvm/tools/lli/ChildTarget/ChildTarget.cpp
index 5772baca1d09..cf1b03a141c5 100644
--- a/llvm/tools/lli/ChildTarget/ChildTarget.cpp
+++ b/llvm/tools/lli/ChildTarget/ChildTarget.cpp
@@ -1,69 +1,76 @@
-#include "llvm/ExecutionEngine/Orc/OrcABISupport.h"
-#include "llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h"
-#include "llvm/ExecutionEngine/Orc/Shared/FDRawByteChannel.h"
-#include "llvm/Support/Debug.h"
+//===----------- ChildTarget.cpp - Out-of-proc executor for lli -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Simple out-of-process executor for lli.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.h"
#include "llvm/Support/DynamicLibrary.h"
-#include "llvm/Support/Process.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstring>
#include <sstream>
-#include "../RemoteJITUtils.h"
-
using namespace llvm;
using namespace llvm::orc;
-using namespace llvm::sys;
-
-#ifdef __x86_64__
-typedef OrcX86_64_SysV HostOrcArch;
-#else
-typedef OrcGenericABI HostOrcArch;
-#endif
ExitOnError ExitOnErr;
int main(int argc, char *argv[]) {
+#if LLVM_ENABLE_THREADS
if (argc != 3) {
errs() << "Usage: " << argv[0] << " <input fd> <output fd>\n";
return 1;
}
- ExitOnErr.setBanner(std::string(argv[0]) + ":");
-
- int InFD;
- int OutFD;
- {
- std::istringstream InFDStream(argv[1]), OutFDStream(argv[2]);
- InFDStream >> InFD;
- OutFDStream >> OutFD;
- }
-
if (sys::DynamicLibrary::LoadLibraryPermanently(nullptr)) {
errs() << "Error loading program symbols.\n";
return 1;
}
- auto SymbolLookup = [](const std::string &Name) {
- return RTDyldMemoryManager::getSymbolAddressInProcess(Name);
- };
+ ExitOnErr.setBanner(std::string(argv[0]) + ": ");
- auto RegisterEHFrames = [](uint8_t *Addr, uint32_t Size) {
- RTDyldMemoryManager::registerEHFramesInProcess(Addr, Size);
- };
-
- auto DeregisterEHFrames = [](uint8_t *Addr, uint32_t Size) {
- RTDyldMemoryManager::deregisterEHFramesInProcess(Addr, Size);
- };
-
- shared::FDRawByteChannel Channel(InFD, OutFD);
- typedef remote::OrcRemoteTargetServer<shared::FDRawByteChannel, HostOrcArch>
- JITServer;
- JITServer Server(Channel, SymbolLookup, RegisterEHFrames, DeregisterEHFrames);
-
- while (!Server.receivedTerminate())
- ExitOnErr(Server.handleOne());
+ int InFD = 0;
+ int OutFD = 0;
+ {
+ std::istringstream InFDStream(argv[1]), OutFDStream(argv[2]);
+ InFDStream >> InFD;
+ OutFDStream >> OutFD;
+ }
- close(InFD);
- close(OutFD);
+ auto Server =
+ ExitOnErr(SimpleRemoteEPCServer::Create<FDSimpleRemoteEPCTransport>(
+ [](SimpleRemoteEPCServer::Setup &S) -> Error {
+ S.setDispatcher(
+ std::make_unique<SimpleRemoteEPCServer::ThreadDispatcher>());
+ S.bootstrapSymbols() =
+ SimpleRemoteEPCServer::defaultBootstrapSymbols();
+ S.services().push_back(
+ std::make_unique<rt_bootstrap::SimpleExecutorMemoryManager>());
+ return Error::success();
+ },
+ InFD, OutFD));
+
+ ExitOnErr(Server->waitForDisconnect());
return 0;
+
+#else
+ errs() << argv[0]
+ << " error: this tool requires threads, but LLVM was "
+ "built with LLVM_ENABLE_THREADS=Off\n";
+ return 1;
+#endif
}
diff --git a/llvm/tools/lli/RemoteJITUtils.h b/llvm/tools/lli/ForwardingMemoryManager.h
index cc8d034f62a5..99a545e60de4 100644
--- a/llvm/tools/lli/RemoteJITUtils.h
+++ b/llvm/tools/lli/ForwardingMemoryManager.h
@@ -10,21 +10,11 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TOOLS_LLI_REMOTEJITUTILS_H
-#define LLVM_TOOLS_LLI_REMOTEJITUTILS_H
+#ifndef LLVM_TOOLS_LLI_FORWARDINGMEMORYMANAGER_H
+#define LLVM_TOOLS_LLI_FORWARDINGMEMORYMANAGER_H
-#include "llvm/ExecutionEngine/Orc/Shared/FDRawByteChannel.h"
+#include "llvm/ExecutionEngine/Orc/EPCGenericDylibManager.h"
#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
-#include <mutex>
-
-#if !defined(_MSC_VER) && !defined(__MINGW32__)
-#include <unistd.h>
-#else
-#include <io.h>
-#endif
-
-// launch the remote process (see lli.cpp) and return a channel to it.
-std::unique_ptr<llvm::orc::shared::FDRawByteChannel> launchRemote();
namespace llvm {
@@ -70,9 +60,7 @@ public:
MemMgr->registerEHFrames(Addr, LoadAddr, Size);
}
- void deregisterEHFrames() override {
- MemMgr->deregisterEHFrames();
- }
+ void deregisterEHFrames() override { MemMgr->deregisterEHFrames(); }
bool finalizeMemory(std::string *ErrMsg = nullptr) override {
return MemMgr->finalizeMemory(ErrMsg);
@@ -90,8 +78,7 @@ public:
return Resolver->findSymbol(Name);
}
- JITSymbol
- findSymbolInLogicalDylib(const std::string &Name) override {
+ JITSymbol findSymbolInLogicalDylib(const std::string &Name) override {
return Resolver->findSymbolInLogicalDylib(Name);
}
@@ -100,17 +87,31 @@ private:
std::shared_ptr<LegacyJITSymbolResolver> Resolver;
};
-template <typename RemoteT>
class RemoteResolver : public LegacyJITSymbolResolver {
public:
-
- RemoteResolver(RemoteT &R) : R(R) {}
+ static Expected<std::unique_ptr<RemoteResolver>>
+ Create(orc::ExecutorProcessControl &EPC) {
+ auto DylibMgr =
+ orc::EPCGenericDylibManager::CreateWithDefaultBootstrapSymbols(EPC);
+ if (!DylibMgr)
+ return DylibMgr.takeError();
+ auto H = DylibMgr->open("", 0);
+ if (!H)
+ return H.takeError();
+ return std::unique_ptr<RemoteResolver>(
+ new RemoteResolver(std::move(*DylibMgr), std::move(*H)));
+ }
JITSymbol findSymbol(const std::string &Name) override {
- if (auto Addr = R.getSymbolAddress(Name))
- return JITSymbol(*Addr, JITSymbolFlags::Exported);
- else
- return Addr.takeError();
+ orc::RemoteSymbolLookupSet R;
+ R.push_back({std::move(Name), false});
+ if (auto Addrs = DylibMgr.lookup(H, R)) {
+ if (Addrs->size() != 1)
+ return make_error<StringError>("Unexpected remote lookup result",
+ inconvertibleErrorCode());
+ return JITSymbol(Addrs->front().getValue(), JITSymbolFlags::Exported);
+ } else
+ return Addrs.takeError();
}
JITSymbol findSymbolInLogicalDylib(const std::string &Name) override {
@@ -118,8 +119,13 @@ public:
}
public:
- RemoteT &R;
+ RemoteResolver(orc::EPCGenericDylibManager DylibMgr,
+ orc::tpctypes::DylibHandle H)
+ : DylibMgr(std::move(DylibMgr)), H(std::move(H)) {}
+
+ orc::EPCGenericDylibManager DylibMgr;
+ orc::tpctypes::DylibHandle H;
};
-}
+} // namespace llvm
-#endif
+#endif // LLVM_TOOLS_LLI_FORWARDINGMEMORYMANAGER_H
diff --git a/llvm/tools/lli/lli.cpp b/llvm/tools/lli/lli.cpp
index af614c01b9a8..d20daa07196b 100644
--- a/llvm/tools/lli/lli.cpp
+++ b/llvm/tools/lli/lli.cpp
@@ -13,7 +13,7 @@
//===----------------------------------------------------------------------===//
#include "ExecutionUtils.h"
-#include "RemoteJITUtils.h"
+#include "ForwardingMemoryManager.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Bitcode/BitcodeReader.h"
@@ -30,11 +30,12 @@
#include "llvm/ExecutionEngine/Orc/DebugUtils.h"
#include "llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h"
#include "llvm/ExecutionEngine/Orc/EPCEHFrameRegistrar.h"
+#include "llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h"
#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
#include "llvm/ExecutionEngine/Orc/LLJIT.h"
-#include "llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h"
#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h"
#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
#include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h"
#include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h"
@@ -68,6 +69,12 @@
#include "llvm/Transforms/Instrumentation.h"
#include <cerrno>
+#if !defined(_MSC_VER) && !defined(__MINGW32__)
+#include <unistd.h>
+#else
+#include <io.h>
+#endif
+
#ifdef __CYGWIN__
#include <cygwin/version.h>
#if defined(CYGWIN_VERSION_DLL_MAJOR) && CYGWIN_VERSION_DLL_MAJOR<1007
@@ -348,13 +355,12 @@ private:
return false;
std::string CacheSubdir = ModID.substr(PrefixLength);
-#if defined(_WIN32)
- // Transform "X:\foo" => "/X\foo" for convenience.
- if (isalpha(CacheSubdir[0]) && CacheSubdir[1] == ':') {
+ // Transform "X:\foo" => "/X\foo" for convenience on Windows.
+ if (is_style_windows(llvm::sys::path::Style::native) &&
+ isalpha(CacheSubdir[0]) && CacheSubdir[1] == ':') {
CacheSubdir[1] = CacheSubdir[0];
CacheSubdir[0] = '/';
}
-#endif
CacheName = CacheDir + CacheSubdir;
size_t pos = CacheName.rfind('.');
@@ -410,8 +416,7 @@ CodeGenOpt::Level getOptLevel() {
llvm_unreachable("Unrecognized opt level.");
}
-LLVM_ATTRIBUTE_NORETURN
-static void reportError(SMDiagnostic Err, const char *ProgName) {
+[[noreturn]] static void reportError(SMDiagnostic Err, const char *ProgName) {
Err.print(ProgName, errs());
exit(1);
}
@@ -419,6 +424,7 @@ static void reportError(SMDiagnostic Err, const char *ProgName) {
Error loadDylibs();
int runOrcJIT(const char *ProgName);
void disallowOrcOptions();
+Expected<std::unique_ptr<orc::ExecutorProcessControl>> launchRemote();
//===----------------------------------------------------------------------===//
// main Driver function
@@ -659,6 +665,10 @@ int main(int argc, char **argv, char * const *envp) {
#endif
}
+ std::unique_ptr<orc::ExecutorProcessControl> EPC =
+ RemoteMCJIT ? ExitOnErr(launchRemote())
+ : ExitOnErr(orc::SelfExecutorProcessControl::Create());
+
if (!RemoteMCJIT) {
// If the program doesn't explicitly call exit, we will need the Exit
// function later on to make an explicit call, so get the function now.
@@ -709,22 +719,10 @@ int main(int argc, char **argv, char * const *envp) {
// it couldn't. This is a limitation of the LLI implementation, not the
// MCJIT itself. FIXME.
- // Lanch the remote process and get a channel to it.
- std::unique_ptr<orc::shared::FDRawByteChannel> C = launchRemote();
- if (!C) {
- WithColor::error(errs(), argv[0]) << "failed to launch remote JIT.\n";
- exit(1);
- }
-
- // Create a remote target client running over the channel.
- llvm::orc::ExecutionSession ES(
- std::make_unique<orc::UnsupportedExecutorProcessControl>());
- ES.setErrorReporter([&](Error Err) { ExitOnErr(std::move(Err)); });
- typedef orc::remote::OrcRemoteTargetClient MyRemote;
- auto R = ExitOnErr(MyRemote::Create(*C, ES));
-
// Create a remote memory manager.
- auto RemoteMM = ExitOnErr(R->createRemoteMemoryManager());
+ auto RemoteMM = ExitOnErr(
+ orc::EPCGenericRTDyldMemoryManager::CreateWithDefaultBootstrapSymbols(
+ *EPC));
// Forward MCJIT's memory manager calls to the remote memory manager.
static_cast<ForwardingMemoryManager*>(RTDyldMM)->setMemMgr(
@@ -732,16 +730,16 @@ int main(int argc, char **argv, char * const *envp) {
// Forward MCJIT's symbol resolution calls to the remote.
static_cast<ForwardingMemoryManager *>(RTDyldMM)->setResolver(
- std::make_unique<RemoteResolver<MyRemote>>(*R));
-
+ ExitOnErr(RemoteResolver::Create(*EPC)));
// Grab the target address of the JIT'd main function on the remote and call
// it.
// FIXME: argv and envp handling.
- JITTargetAddress Entry = EE->getFunctionAddress(EntryFn->getName().str());
+ auto Entry =
+ orc::ExecutorAddr(EE->getFunctionAddress(EntryFn->getName().str()));
EE->finalizeObject();
LLVM_DEBUG(dbgs() << "Executing '" << EntryFn->getName() << "' at 0x"
- << format("%llx", Entry) << "\n");
- Result = ExitOnErr(R->callIntVoid(Entry));
+ << format("%llx", Entry.getValue()) << "\n");
+ Result = ExitOnErr(EPC->runAsMain(Entry, {}));
// Like static constructors, the remote target MCJIT support doesn't handle
// this yet. It could. FIXME.
@@ -752,7 +750,7 @@ int main(int argc, char **argv, char * const *envp) {
EE.reset();
// Signal the remote target that we're done JITing.
- ExitOnErr(R->terminateSession());
+ ExitOnErr(EPC->disconnect());
}
return Result;
@@ -1062,7 +1060,8 @@ int runOrcJIT(const char *ProgName) {
if (EPC) {
// ExecutorProcessControl-based execution with JITLink.
- Result = ExitOnErr(EPC->runAsMain(MainSym.getAddress(), InputArgv));
+ Result = ExitOnErr(
+ EPC->runAsMain(orc::ExecutorAddr(MainSym.getAddress()), InputArgv));
} else {
// Manual in-process execution with RuntimeDyld.
using MainFnTy = int(int, char *[]);
@@ -1099,7 +1098,7 @@ void disallowOrcOptions() {
}
}
-std::unique_ptr<orc::shared::FDRawByteChannel> launchRemote() {
+Expected<std::unique_ptr<orc::ExecutorProcessControl>> launchRemote() {
#ifndef LLVM_ON_UNIX
llvm_unreachable("launchRemote not supported on non-Unix platforms");
#else
@@ -1148,8 +1147,9 @@ std::unique_ptr<orc::shared::FDRawByteChannel> launchRemote() {
close(PipeFD[0][0]);
close(PipeFD[1][1]);
- // Return an RPC channel connected to our end of the pipes.
- return std::make_unique<orc::shared::FDRawByteChannel>(PipeFD[1][0],
- PipeFD[0][1]);
+ // Return a SimpleRemoteEPC instance connected to our end of the pipes.
+ return orc::SimpleRemoteEPC::Create<orc::FDSimpleRemoteEPCTransport>(
+ std::make_unique<llvm::orc::InPlaceTaskDispatcher>(),
+ llvm::orc::SimpleRemoteEPC::Setup(), PipeFD[1][0], PipeFD[0][1]);
#endif
}
diff --git a/llvm/tools/llvm-ar/llvm-ar.cpp b/llvm/tools/llvm-ar/llvm-ar.cpp
index 0e1dce6bc2e8..175ec8d022c2 100644
--- a/llvm/tools/llvm-ar/llvm-ar.cpp
+++ b/llvm/tools/llvm-ar/llvm-ar.cpp
@@ -96,11 +96,11 @@ OPTIONS:
OPERATIONS:
d - delete [files] from the archive
m - move [files] in the archive
- p - print [files] found in the archive
+ p - print contents of [files] found in the archive
q - quick append [files] to the archive
r - replace or insert [files] into the archive
s - act as ranlib
- t - display contents of archive
+ t - display list of files in archive
x - extract [files] from the archive
MODIFIERS:
@@ -136,14 +136,14 @@ static unsigned MRILineNumber;
static bool ParsingMRIScript;
// Show the error plus the usage message, and exit.
-LLVM_ATTRIBUTE_NORETURN static void badUsage(Twine Error) {
+[[noreturn]] static void badUsage(Twine Error) {
WithColor::error(errs(), ToolName) << Error << "\n";
printHelpMessage();
exit(1);
}
// Show the error message and exit.
-LLVM_ATTRIBUTE_NORETURN static void fail(Twine Error) {
+[[noreturn]] static void fail(Twine Error) {
if (ParsingMRIScript) {
WithColor::error(errs(), ToolName)
<< "script line " << MRILineNumber << ": " << Error << "\n";
diff --git a/llvm/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp b/llvm/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
index f4851bfb2a9c..a238b0cf5922 100644
--- a/llvm/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
+++ b/llvm/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
@@ -11,8 +11,9 @@
// llvm-bcanalyzer [options] x.bc - Read LLVM bitcode from the x.bc file
//
// Options:
-// --help - Output information about command line switches
-// --dump - Dump low-level bitcode structure in readable format
+// --help - Output information about command line switches
+// --dump - Dump low-level bitcode structure in readable format
+// --dump-blockinfo - Dump the BLOCKINFO_BLOCK, when used with --dump
//
// This tool provides analytical information about a bitcode file. It is
// intended as an aid to developers of bitcode reading and writing software. It
@@ -47,6 +48,11 @@ static cl::opt<std::string> InputFilename(cl::Positional,
static cl::opt<bool> Dump("dump", cl::desc("Dump low level bitcode trace"),
cl::cat(BCAnalyzerCategory));
+static cl::opt<bool> DumpBlockinfo("dump-blockinfo",
+ cl::desc("Include BLOCKINFO details in low"
+ " level dump"),
+ cl::cat(BCAnalyzerCategory));
+
//===----------------------------------------------------------------------===//
// Bitcode specific analysis.
//===----------------------------------------------------------------------===//
@@ -114,6 +120,7 @@ int main(int argc, char **argv) {
O.Histogram = !NoHistogram;
O.Symbolic = !NonSymbolic;
O.ShowBinaryBlobs = ShowBinaryBlobs;
+ O.DumpBlockinfo = DumpBlockinfo;
ExitOnErr(BA.analyze(
Dump ? Optional<BCDumpOptions>(O) : Optional<BCDumpOptions>(None),
diff --git a/llvm/tools/llvm-cov/CodeCoverage.cpp b/llvm/tools/llvm-cov/CodeCoverage.cpp
index 02c0106cbc29..5c9ff41a2d5d 100644
--- a/llvm/tools/llvm-cov/CodeCoverage.cpp
+++ b/llvm/tools/llvm-cov/CodeCoverage.cpp
@@ -784,10 +784,18 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
  // If path-equivalence was given and is a comma-separated pair then set
// PathRemapping.
- auto EquivPair = StringRef(PathRemap).split(',');
- if (!(EquivPair.first.empty() && EquivPair.second.empty()))
+ if (!PathRemap.empty()) {
+ auto EquivPair = StringRef(PathRemap).split(',');
+ if (EquivPair.first.empty() || EquivPair.second.empty()) {
+ error("invalid argument '" + PathRemap +
+ "', must be in format 'from,to'",
+ "-path-equivalence");
+ return 1;
+ }
+
PathRemapping = {std::string(EquivPair.first),
std::string(EquivPair.second)};
+ }
// If a demangler is supplied, check if it exists and register it.
if (!DemanglerOpts.empty()) {
diff --git a/llvm/tools/llvm-cov/CoverageExporterLcov.cpp b/llvm/tools/llvm-cov/CoverageExporterLcov.cpp
index 6cf5d9285b90..0096a3d44d85 100644
--- a/llvm/tools/llvm-cov/CoverageExporterLcov.cpp
+++ b/llvm/tools/llvm-cov/CoverageExporterLcov.cpp
@@ -167,7 +167,7 @@ void renderLineSummary(raw_ostream &OS, const FileCoverageSummary &Summary) {
void renderBranchSummary(raw_ostream &OS, const FileCoverageSummary &Summary) {
OS << "BRF:" << Summary.BranchCoverage.getNumBranches() << '\n'
- << "BFH:" << Summary.BranchCoverage.getCovered() << '\n';
+ << "BRH:" << Summary.BranchCoverage.getCovered() << '\n';
}
void renderFile(raw_ostream &OS, const coverage::CoverageMapping &Coverage,
diff --git a/llvm/tools/llvm-cov/CoverageFilters.cpp b/llvm/tools/llvm-cov/CoverageFilters.cpp
index da3b5214eec4..fac7518d7da2 100644
--- a/llvm/tools/llvm-cov/CoverageFilters.cpp
+++ b/llvm/tools/llvm-cov/CoverageFilters.cpp
@@ -21,7 +21,7 @@ bool NameCoverageFilter::matches(
const coverage::CoverageMapping &,
const coverage::FunctionRecord &Function) const {
StringRef FuncName = Function.Name;
- return FuncName.find(Name) != StringRef::npos;
+ return FuncName.contains(Name);
}
bool NameRegexCoverageFilter::matches(
diff --git a/llvm/tools/llvm-cxxdump/Error.cpp b/llvm/tools/llvm-cxxdump/Error.cpp
index 25317820409c..053d0e0764bc 100644
--- a/llvm/tools/llvm-cxxdump/Error.cpp
+++ b/llvm/tools/llvm-cxxdump/Error.cpp
@@ -12,6 +12,7 @@
#include "Error.h"
#include "llvm/Support/ErrorHandling.h"
+#include <string>
using namespace llvm;
diff --git a/llvm/tools/llvm-cxxdump/llvm-cxxdump.cpp b/llvm/tools/llvm-cxxdump/llvm-cxxdump.cpp
index f214288e951b..1430674dbadc 100644
--- a/llvm/tools/llvm-cxxdump/llvm-cxxdump.cpp
+++ b/llvm/tools/llvm-cxxdump/llvm-cxxdump.cpp
@@ -13,6 +13,7 @@
#include "llvm-cxxdump.h"
#include "Error.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Object/SymbolSize.h"
@@ -20,7 +21,6 @@
#include "llvm/Support/Endian.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/InitLLVM.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
@@ -49,7 +49,7 @@ static void error(std::error_code EC) {
exit(1);
}
-LLVM_ATTRIBUTE_NORETURN static void error(Error Err) {
+[[noreturn]] static void error(Error Err) {
logAllUnhandledErrors(std::move(Err), WithColor::error(outs()),
"reading file: ");
outs().flush();
diff --git a/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp b/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp
index d8bf8dbccce0..ccfaaa96deb2 100644
--- a/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp
+++ b/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp
@@ -65,34 +65,27 @@ static void error(const Twine &Message) {
}
static std::string demangle(const std::string &Mangled) {
- int Status;
- std::string Prefix;
-
const char *DecoratedStr = Mangled.c_str();
if (StripUnderscore)
if (DecoratedStr[0] == '_')
++DecoratedStr;
- size_t DecoratedLength = strlen(DecoratedStr);
+ std::string Result;
+ if (nonMicrosoftDemangle(DecoratedStr, Result))
+ return Result;
+
+ std::string Prefix;
char *Undecorated = nullptr;
- if (Types ||
- ((DecoratedLength >= 2 && strncmp(DecoratedStr, "_Z", 2) == 0) ||
- (DecoratedLength >= 4 && strncmp(DecoratedStr, "___Z", 4) == 0)))
- Undecorated = itaniumDemangle(DecoratedStr, nullptr, nullptr, &Status);
+ if (Types)
+ Undecorated = itaniumDemangle(DecoratedStr, nullptr, nullptr, nullptr);
- if (!Undecorated &&
- (DecoratedLength > 6 && strncmp(DecoratedStr, "__imp_", 6) == 0)) {
+ if (!Undecorated && strncmp(DecoratedStr, "__imp_", 6) == 0) {
Prefix = "import thunk for ";
- Undecorated = itaniumDemangle(DecoratedStr + 6, nullptr, nullptr, &Status);
- }
-
- if (!Undecorated &&
- (DecoratedLength >= 2 && strncmp(DecoratedStr, "_R", 2) == 0)) {
- Undecorated = rustDemangle(DecoratedStr, nullptr, nullptr, &Status);
+ Undecorated = itaniumDemangle(DecoratedStr + 6, nullptr, nullptr, nullptr);
}
- std::string Result(Undecorated ? Prefix + Undecorated : Mangled);
+ Result = Undecorated ? Prefix + Undecorated : Mangled;
free(Undecorated);
return Result;
}
@@ -128,7 +121,7 @@ static void SplitStringDelims(
static bool IsLegalItaniumChar(char C) {
// Itanium CXX ABI [External Names]p5.1.1:
// '$' and '.' in mangled names are reserved for private implementations.
- return isalnum(C) || C == '.' || C == '$' || C == '_';
+ return isAlnum(C) || C == '.' || C == '$' || C == '_';
}
// If 'Split' is true, then 'Mangled' is broken into individual words and each
diff --git a/llvm/tools/llvm-diff/DiffConsumer.cpp b/llvm/tools/llvm-diff/lib/DiffConsumer.cpp
index a703f42f14c3..b6eb71916acf 100644
--- a/llvm/tools/llvm-diff/DiffConsumer.cpp
+++ b/llvm/tools/llvm-diff/lib/DiffConsumer.cpp
@@ -134,6 +134,12 @@ void DiffConsumer::indent() {
while (N--) out << ' ';
}
+void DiffConsumer::reset() {
+ contexts.clear();
+ Differences = false;
+ Indent = 0;
+}
+
bool DiffConsumer::hadDifferences() const {
return Differences;
}
diff --git a/llvm/tools/llvm-diff/DiffConsumer.h b/llvm/tools/llvm-diff/lib/DiffConsumer.h
index f7b2f2450eec..08c3afcbe111 100644
--- a/llvm/tools/llvm-diff/DiffConsumer.h
+++ b/llvm/tools/llvm-diff/lib/DiffConsumer.h
@@ -78,6 +78,7 @@ class StringRef;
DiffConsumer()
: out(errs()), Differences(false), Indent(0) {}
+ void reset();
bool hadDifferences() const;
void enterContext(const Value *L, const Value *R) override;
void exitContext() override;
diff --git a/llvm/tools/llvm-diff/DiffLog.cpp b/llvm/tools/llvm-diff/lib/DiffLog.cpp
index d31a345d255c..d31a345d255c 100644
--- a/llvm/tools/llvm-diff/DiffLog.cpp
+++ b/llvm/tools/llvm-diff/lib/DiffLog.cpp
diff --git a/llvm/tools/llvm-diff/DiffLog.h b/llvm/tools/llvm-diff/lib/DiffLog.h
index d8b07b971198..d8b07b971198 100644
--- a/llvm/tools/llvm-diff/DiffLog.h
+++ b/llvm/tools/llvm-diff/lib/DiffLog.h
diff --git a/llvm/tools/llvm-diff/DifferenceEngine.cpp b/llvm/tools/llvm-diff/lib/DifferenceEngine.cpp
index eb746cd2a865..eb746cd2a865 100644
--- a/llvm/tools/llvm-diff/DifferenceEngine.cpp
+++ b/llvm/tools/llvm-diff/lib/DifferenceEngine.cpp
diff --git a/llvm/tools/llvm-diff/DifferenceEngine.h b/llvm/tools/llvm-diff/lib/DifferenceEngine.h
index 436a35566360..436a35566360 100644
--- a/llvm/tools/llvm-diff/DifferenceEngine.h
+++ b/llvm/tools/llvm-diff/lib/DifferenceEngine.h
diff --git a/llvm/tools/llvm-diff/llvm-diff.cpp b/llvm/tools/llvm-diff/llvm-diff.cpp
index 8a11179e741e..d9d19f35ffee 100644
--- a/llvm/tools/llvm-diff/llvm-diff.cpp
+++ b/llvm/tools/llvm-diff/llvm-diff.cpp
@@ -10,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#include "DiffLog.h"
-#include "DifferenceEngine.h"
+#include "lib/DiffLog.h"
+#include "lib/DifferenceEngine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
diff --git a/llvm/tools/llvm-dwarfdump/Statistics.cpp b/llvm/tools/llvm-dwarfdump/Statistics.cpp
index 19a971afa311..b237e014038d 100644
--- a/llvm/tools/llvm-dwarfdump/Statistics.cpp
+++ b/llvm/tools/llvm-dwarfdump/Statistics.cpp
@@ -29,6 +29,9 @@ constexpr int NumOfCoverageCategories = 12;
/// This is used for zero location coverage bucket.
constexpr unsigned ZeroCoverageBucket = 0;
+/// UINT64_MAX is used as an indication of overflow.
+constexpr uint64_t OverflowValue = std::numeric_limits<uint64_t>::max();
+
/// This represents variables DIE offsets.
using AbstractOriginVarsTy = llvm::SmallVector<uint64_t>;
/// This maps function DIE offset to its variables.
@@ -36,22 +39,43 @@ using AbstractOriginVarsTyMap = llvm::DenseMap<uint64_t, AbstractOriginVarsTy>;
/// This represents function DIE offsets containing an abstract_origin.
using FunctionsWithAbstractOriginTy = llvm::SmallVector<uint64_t>;
+/// This represents a data type for the stats and it helps us to
+/// detect an overflow.
+/// NOTE: This can be implemented as a template if there is another type
+/// needing this.
+struct SaturatingUINT64 {
+ /// Number that represents the stats.
+ uint64_t Value;
+
+ SaturatingUINT64(uint64_t Value_) : Value(Value_) {}
+
+ void operator++(int) { return *this += 1; }
+ void operator+=(uint64_t Value_) {
+ if (Value != OverflowValue) {
+ if (Value < OverflowValue - Value_)
+ Value += Value_;
+ else
+ Value = OverflowValue;
+ }
+ }
+};
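Editor's sketch (not part of the patch) showing the saturation the helper above provides; plain uint64_t arithmetic would wrap around here.

  SaturatingUINT64 Bytes(OverflowValue - 2);
  Bytes += 10;                          // clamps at UINT64_MAX instead of wrapping
  assert(Bytes.Value == OverflowValue);
  Bytes++;                              // further increments stay clamped
  assert(Bytes.Value == OverflowValue);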
+
/// Holds statistics for one function (or other entity that has a PC range and
/// contains variables, such as a compile unit).
struct PerFunctionStats {
/// Number of inlined instances of this function.
- unsigned NumFnInlined = 0;
+ uint64_t NumFnInlined = 0;
/// Number of out-of-line instances of this function.
- unsigned NumFnOutOfLine = 0;
+ uint64_t NumFnOutOfLine = 0;
/// Number of inlined instances that have abstract origins.
- unsigned NumAbstractOrigins = 0;
+ uint64_t NumAbstractOrigins = 0;
/// Number of variables and parameters with location across all inlined
/// instances.
- unsigned TotalVarWithLoc = 0;
+ uint64_t TotalVarWithLoc = 0;
/// Number of constants with location across all inlined instances.
- unsigned ConstantMembers = 0;
+ uint64_t ConstantMembers = 0;
+ /// Number of artificial variables, parameters or members across all instances.
- unsigned NumArtificial = 0;
+ uint64_t NumArtificial = 0;
/// List of all Variables and parameters in this function.
StringSet<> VarsInFunction;
/// Compile units also cover a PC range, but have this flag set to false.
@@ -59,63 +83,63 @@ struct PerFunctionStats {
/// Function has source location information.
bool HasSourceLocation = false;
/// Number of function parameters.
- unsigned NumParams = 0;
+ uint64_t NumParams = 0;
/// Number of function parameters with source location.
- unsigned NumParamSourceLocations = 0;
+ uint64_t NumParamSourceLocations = 0;
/// Number of function parameters with type.
- unsigned NumParamTypes = 0;
+ uint64_t NumParamTypes = 0;
/// Number of function parameters with a DW_AT_location.
- unsigned NumParamLocations = 0;
+ uint64_t NumParamLocations = 0;
/// Number of local variables.
- unsigned NumLocalVars = 0;
+ uint64_t NumLocalVars = 0;
/// Number of local variables with source location.
- unsigned NumLocalVarSourceLocations = 0;
+ uint64_t NumLocalVarSourceLocations = 0;
/// Number of local variables with type.
- unsigned NumLocalVarTypes = 0;
+ uint64_t NumLocalVarTypes = 0;
/// Number of local variables with DW_AT_location.
- unsigned NumLocalVarLocations = 0;
+ uint64_t NumLocalVarLocations = 0;
};
/// Holds accumulated global statistics about DIEs.
struct GlobalStats {
/// Total number of PC range bytes covered by DW_AT_locations.
- unsigned TotalBytesCovered = 0;
+ SaturatingUINT64 TotalBytesCovered = 0;
/// Total number of parent DIE PC range bytes covered by DW_AT_Locations.
- unsigned ScopeBytesCovered = 0;
+ SaturatingUINT64 ScopeBytesCovered = 0;
/// Total number of PC range bytes in each variable's enclosing scope.
- unsigned ScopeBytes = 0;
+ SaturatingUINT64 ScopeBytes = 0;
/// Total number of PC range bytes covered by DW_AT_locations with
/// the debug entry values (DW_OP_entry_value).
- unsigned ScopeEntryValueBytesCovered = 0;
+ SaturatingUINT64 ScopeEntryValueBytesCovered = 0;
/// Total number of PC range bytes covered by DW_AT_locations of
/// formal parameters.
- unsigned ParamScopeBytesCovered = 0;
+ SaturatingUINT64 ParamScopeBytesCovered = 0;
/// Total number of PC range bytes in each parameter's enclosing scope.
- unsigned ParamScopeBytes = 0;
+ SaturatingUINT64 ParamScopeBytes = 0;
/// Total number of PC range bytes covered by DW_AT_locations with
/// the debug entry values (DW_OP_entry_value) (only for parameters).
- unsigned ParamScopeEntryValueBytesCovered = 0;
+ SaturatingUINT64 ParamScopeEntryValueBytesCovered = 0;
/// Total number of PC range bytes covered by DW_AT_locations (only for local
/// variables).
- unsigned LocalVarScopeBytesCovered = 0;
+ SaturatingUINT64 LocalVarScopeBytesCovered = 0;
/// Total number of PC range bytes in each local variable's enclosing scope.
- unsigned LocalVarScopeBytes = 0;
+ SaturatingUINT64 LocalVarScopeBytes = 0;
/// Total number of PC range bytes covered by DW_AT_locations with
/// the debug entry values (DW_OP_entry_value) (only for local variables).
- unsigned LocalVarScopeEntryValueBytesCovered = 0;
+ SaturatingUINT64 LocalVarScopeEntryValueBytesCovered = 0;
/// Total number of call site entries (DW_AT_call_file & DW_AT_call_line).
- unsigned CallSiteEntries = 0;
+ SaturatingUINT64 CallSiteEntries = 0;
/// Total number of call site DIEs (DW_TAG_call_site).
- unsigned CallSiteDIEs = 0;
+ SaturatingUINT64 CallSiteDIEs = 0;
/// Total number of call site parameter DIEs (DW_TAG_call_site_parameter).
- unsigned CallSiteParamDIEs = 0;
+ SaturatingUINT64 CallSiteParamDIEs = 0;
/// Total byte size of concrete functions. This byte size includes
/// inline functions contained in the concrete functions.
- unsigned FunctionSize = 0;
+ SaturatingUINT64 FunctionSize = 0;
/// Total byte size of inlined functions. This is the total number of bytes
/// for the top inline functions within concrete functions. This can help
/// tune the inline settings when compiling to match user expectations.
- unsigned InlineFunctionSize = 0;
+ SaturatingUINT64 InlineFunctionSize = 0;
};
/// Holds accumulated debug location statistics about local variables and
@@ -126,37 +150,37 @@ struct LocationStats {
   /// of variables with no debug location at all, but the last element
/// in the vector represents the number of fully covered variables within
/// its scope.
- std::vector<unsigned> VarParamLocStats{
- std::vector<unsigned>(NumOfCoverageCategories, 0)};
+ std::vector<SaturatingUINT64> VarParamLocStats{
+ std::vector<SaturatingUINT64>(NumOfCoverageCategories, 0)};
/// Map non debug entry values coverage.
- std::vector<unsigned> VarParamNonEntryValLocStats{
- std::vector<unsigned>(NumOfCoverageCategories, 0)};
+ std::vector<SaturatingUINT64> VarParamNonEntryValLocStats{
+ std::vector<SaturatingUINT64>(NumOfCoverageCategories, 0)};
/// The debug location statistics for formal parameters.
- std::vector<unsigned> ParamLocStats{
- std::vector<unsigned>(NumOfCoverageCategories, 0)};
+ std::vector<SaturatingUINT64> ParamLocStats{
+ std::vector<SaturatingUINT64>(NumOfCoverageCategories, 0)};
/// Map non debug entry values coverage for formal parameters.
- std::vector<unsigned> ParamNonEntryValLocStats{
- std::vector<unsigned>(NumOfCoverageCategories, 0)};
+ std::vector<SaturatingUINT64> ParamNonEntryValLocStats{
+ std::vector<SaturatingUINT64>(NumOfCoverageCategories, 0)};
/// The debug location statistics for local variables.
- std::vector<unsigned> LocalVarLocStats{
- std::vector<unsigned>(NumOfCoverageCategories, 0)};
+ std::vector<SaturatingUINT64> LocalVarLocStats{
+ std::vector<SaturatingUINT64>(NumOfCoverageCategories, 0)};
/// Map non debug entry values coverage for local variables.
- std::vector<unsigned> LocalVarNonEntryValLocStats{
- std::vector<unsigned>(NumOfCoverageCategories, 0)};
+ std::vector<SaturatingUINT64> LocalVarNonEntryValLocStats{
+ std::vector<SaturatingUINT64>(NumOfCoverageCategories, 0)};
/// Total number of local variables and function parameters processed.
- unsigned NumVarParam = 0;
+ SaturatingUINT64 NumVarParam = 0;
/// Total number of formal parameters processed.
- unsigned NumParam = 0;
+ SaturatingUINT64 NumParam = 0;
/// Total number of local variables processed.
- unsigned NumVar = 0;
+ SaturatingUINT64 NumVar = 0;
};
} // namespace
/// Collect debug location statistics for one DIE.
static void collectLocStats(uint64_t ScopeBytesCovered, uint64_t BytesInScope,
- std::vector<unsigned> &VarParamLocStats,
- std::vector<unsigned> &ParamLocStats,
- std::vector<unsigned> &LocalVarLocStats,
+ std::vector<SaturatingUINT64> &VarParamLocStats,
+ std::vector<SaturatingUINT64> &ParamLocStats,
+ std::vector<SaturatingUINT64> &LocalVarLocStats,
bool IsParam, bool IsLocalVar) {
auto getCoverageBucket = [ScopeBytesCovered, BytesInScope]() -> unsigned {
// No debug location at all for the variable.
@@ -173,11 +197,11 @@ static void collectLocStats(uint64_t ScopeBytesCovered, uint64_t BytesInScope,
unsigned CoverageBucket = getCoverageBucket();
- VarParamLocStats[CoverageBucket]++;
+ VarParamLocStats[CoverageBucket].Value++;
if (IsParam)
- ParamLocStats[CoverageBucket]++;
+ ParamLocStats[CoverageBucket].Value++;
else if (IsLocalVar)
- LocalVarLocStats[CoverageBucket]++;
+ LocalVarLocStats[CoverageBucket].Value++;
}
/// Construct an identifier for a given DIE from its Prefix, Name, DeclFileName
@@ -298,7 +322,7 @@ static void collectStatsForDie(DWARFDie Die, const std::string &FnPrefix,
U->getFormParams().Format);
// Consider the expression containing the DW_OP_entry_value as
// an entry value.
- return llvm::any_of(Expression, [](DWARFExpression::Operation &Op) {
+ return llvm::any_of(Expression, [](const DWARFExpression::Operation &Op) {
return Op.getCode() == dwarf::DW_OP_entry_value ||
Op.getCode() == dwarf::DW_OP_GNU_entry_value;
});
@@ -350,11 +374,11 @@ static void collectStatsForDie(DWARFDie Die, const std::string &FnPrefix,
// Calculate the debug location statistics.
if (BytesInScope && !DeferLocStats) {
- LocStats.NumVarParam++;
+ LocStats.NumVarParam.Value++;
if (IsParam)
- LocStats.NumParam++;
+ LocStats.NumParam.Value++;
else if (IsLocalVar)
- LocStats.NumVar++;
+ LocStats.NumVar.Value++;
collectLocStats(ScopeBytesCovered, BytesInScope, LocStats.VarParamLocStats,
LocStats.ParamLocStats, LocStats.LocalVarLocStats, IsParam,
@@ -389,7 +413,7 @@ static void collectStatsForDie(DWARFDie Die, const std::string &FnPrefix,
GlobalStats.LocalVarScopeEntryValueBytesCovered +=
BytesEntryValuesCovered;
}
- assert(GlobalStats.ScopeBytesCovered <= GlobalStats.ScopeBytes);
+ assert(GlobalStats.ScopeBytesCovered.Value <= GlobalStats.ScopeBytes.Value);
}
if (IsConstantMember) {
@@ -603,45 +627,78 @@ static void collectStatsRecursive(
/// Print human-readable output.
/// \{
static void printDatum(json::OStream &J, const char *Key, json::Value Value) {
- J.attribute(Key, Value);
+ if (Value == OverflowValue)
+ J.attribute(Key, "overflowed");
+ else
+ J.attribute(Key, Value);
+
LLVM_DEBUG(llvm::dbgs() << Key << ": " << Value << '\n');
}
static void printLocationStats(json::OStream &J, const char *Key,
- std::vector<unsigned> &LocationStats) {
- J.attribute(
- (Twine(Key) + " with 0% of parent scope covered by DW_AT_location").str(),
- LocationStats[0]);
+ std::vector<SaturatingUINT64> &LocationStats) {
+ if (LocationStats[0].Value == OverflowValue)
+ J.attribute((Twine(Key) +
+                 " with 0% of parent scope covered by DW_AT_location")
+ .str(),
+ "overflowed");
+ else
+ J.attribute(
+ (Twine(Key) + " with 0% of parent scope covered by DW_AT_location")
+ .str(),
+ LocationStats[0].Value);
LLVM_DEBUG(
llvm::dbgs() << Key
<< " with 0% of parent scope covered by DW_AT_location: \\"
- << LocationStats[0] << '\n');
- J.attribute(
- (Twine(Key) + " with (0%,10%) of parent scope covered by DW_AT_location")
- .str(),
- LocationStats[1]);
+ << LocationStats[0].Value << '\n');
+
+ if (LocationStats[1].Value == OverflowValue)
+ J.attribute((Twine(Key) +
+ " with (0%,10%) of parent scope covered by DW_AT_location")
+ .str(),
+ "overflowed");
+ else
+ J.attribute((Twine(Key) +
+ " with (0%,10%) of parent scope covered by DW_AT_location")
+ .str(),
+ LocationStats[1].Value);
LLVM_DEBUG(llvm::dbgs()
<< Key
<< " with (0%,10%) of parent scope covered by DW_AT_location: "
- << LocationStats[1] << '\n');
+ << LocationStats[1].Value << '\n');
+
for (unsigned i = 2; i < NumOfCoverageCategories - 1; ++i) {
- J.attribute((Twine(Key) + " with [" + Twine((i - 1) * 10) + "%," +
- Twine(i * 10) + "%) of parent scope covered by DW_AT_location")
- .str(),
- LocationStats[i]);
+ if (LocationStats[i].Value == OverflowValue)
+ J.attribute((Twine(Key) + " with [" + Twine((i - 1) * 10) + "%," +
+ Twine(i * 10) +
+ "%) of parent scope covered by DW_AT_location")
+ .str(),
+ "overflowed");
+ else
+ J.attribute((Twine(Key) + " with [" + Twine((i - 1) * 10) + "%," +
+ Twine(i * 10) +
+ "%) of parent scope covered by DW_AT_location")
+ .str(),
+ LocationStats[i].Value);
LLVM_DEBUG(llvm::dbgs()
<< Key << " with [" << (i - 1) * 10 << "%," << i * 10
<< "%) of parent scope covered by DW_AT_location: "
- << LocationStats[i]);
+ << LocationStats[i].Value);
}
- J.attribute(
- (Twine(Key) + " with 100% of parent scope covered by DW_AT_location")
- .str(),
- LocationStats[NumOfCoverageCategories - 1]);
+ if (LocationStats[NumOfCoverageCategories - 1].Value == OverflowValue)
+ J.attribute(
+ (Twine(Key) + " with 100% of parent scope covered by DW_AT_location")
+ .str(),
+ "overflowed");
+ else
+ J.attribute(
+ (Twine(Key) + " with 100% of parent scope covered by DW_AT_location")
+ .str(),
+ LocationStats[NumOfCoverageCategories - 1].Value);
LLVM_DEBUG(
llvm::dbgs() << Key
<< " with 100% of parent scope covered by DW_AT_location: "
- << LocationStats[NumOfCoverageCategories - 1]);
+ << LocationStats[NumOfCoverageCategories - 1].Value);
}
static void printSectionSizes(json::OStream &J, const SectionSizes &Sizes) {
@@ -750,31 +807,31 @@ bool dwarfdump::collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx,
/// The version number should be increased every time the algorithm is changed
/// (including bug fixes). New metrics may be added without increasing the
/// version.
- unsigned Version = 8;
- unsigned VarParamTotal = 0;
- unsigned VarParamUnique = 0;
- unsigned VarParamWithLoc = 0;
- unsigned NumFunctions = 0;
- unsigned NumInlinedFunctions = 0;
- unsigned NumFuncsWithSrcLoc = 0;
- unsigned NumAbstractOrigins = 0;
- unsigned ParamTotal = 0;
- unsigned ParamWithType = 0;
- unsigned ParamWithLoc = 0;
- unsigned ParamWithSrcLoc = 0;
- unsigned LocalVarTotal = 0;
- unsigned LocalVarWithType = 0;
- unsigned LocalVarWithSrcLoc = 0;
- unsigned LocalVarWithLoc = 0;
+ unsigned Version = 9;
+ SaturatingUINT64 VarParamTotal = 0;
+ SaturatingUINT64 VarParamUnique = 0;
+ SaturatingUINT64 VarParamWithLoc = 0;
+ SaturatingUINT64 NumFunctions = 0;
+ SaturatingUINT64 NumInlinedFunctions = 0;
+ SaturatingUINT64 NumFuncsWithSrcLoc = 0;
+ SaturatingUINT64 NumAbstractOrigins = 0;
+ SaturatingUINT64 ParamTotal = 0;
+ SaturatingUINT64 ParamWithType = 0;
+ SaturatingUINT64 ParamWithLoc = 0;
+ SaturatingUINT64 ParamWithSrcLoc = 0;
+ SaturatingUINT64 LocalVarTotal = 0;
+ SaturatingUINT64 LocalVarWithType = 0;
+ SaturatingUINT64 LocalVarWithSrcLoc = 0;
+ SaturatingUINT64 LocalVarWithLoc = 0;
for (auto &Entry : Statistics) {
PerFunctionStats &Stats = Entry.getValue();
- unsigned TotalVars = Stats.VarsInFunction.size() *
+ uint64_t TotalVars = Stats.VarsInFunction.size() *
(Stats.NumFnInlined + Stats.NumFnOutOfLine);
// Count variables in global scope.
if (!Stats.IsFunction)
TotalVars =
Stats.NumLocalVars + Stats.ConstantMembers + Stats.NumArtificial;
- unsigned Constants = Stats.ConstantMembers;
+ uint64_t Constants = Stats.ConstantMembers;
VarParamWithLoc += Stats.TotalVarWithLoc + Constants;
VarParamTotal += TotalVars;
VarParamUnique += Stats.VarsInFunction.size();
@@ -806,70 +863,72 @@ bool dwarfdump::collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx,
printDatum(J, "file", Filename.str());
printDatum(J, "format", FormatName);
- printDatum(J, "#functions", NumFunctions);
- printDatum(J, "#functions with location", NumFuncsWithSrcLoc);
- printDatum(J, "#inlined functions", NumInlinedFunctions);
- printDatum(J, "#inlined functions with abstract origins", NumAbstractOrigins);
+ printDatum(J, "#functions", NumFunctions.Value);
+ printDatum(J, "#functions with location", NumFuncsWithSrcLoc.Value);
+ printDatum(J, "#inlined functions", NumInlinedFunctions.Value);
+ printDatum(J, "#inlined functions with abstract origins",
+ NumAbstractOrigins.Value);
// This includes local variables and formal parameters.
- printDatum(J, "#unique source variables", VarParamUnique);
- printDatum(J, "#source variables", VarParamTotal);
- printDatum(J, "#source variables with location", VarParamWithLoc);
+ printDatum(J, "#unique source variables", VarParamUnique.Value);
+ printDatum(J, "#source variables", VarParamTotal.Value);
+ printDatum(J, "#source variables with location", VarParamWithLoc.Value);
- printDatum(J, "#call site entries", GlobalStats.CallSiteEntries);
- printDatum(J, "#call site DIEs", GlobalStats.CallSiteDIEs);
- printDatum(J, "#call site parameter DIEs", GlobalStats.CallSiteParamDIEs);
+ printDatum(J, "#call site entries", GlobalStats.CallSiteEntries.Value);
+ printDatum(J, "#call site DIEs", GlobalStats.CallSiteDIEs.Value);
+ printDatum(J, "#call site parameter DIEs",
+ GlobalStats.CallSiteParamDIEs.Value);
printDatum(J, "sum_all_variables(#bytes in parent scope)",
- GlobalStats.ScopeBytes);
+ GlobalStats.ScopeBytes.Value);
printDatum(J,
"sum_all_variables(#bytes in any scope covered by DW_AT_location)",
- GlobalStats.TotalBytesCovered);
+ GlobalStats.TotalBytesCovered.Value);
printDatum(J,
"sum_all_variables(#bytes in parent scope covered by "
"DW_AT_location)",
- GlobalStats.ScopeBytesCovered);
+ GlobalStats.ScopeBytesCovered.Value);
printDatum(J,
"sum_all_variables(#bytes in parent scope covered by "
"DW_OP_entry_value)",
- GlobalStats.ScopeEntryValueBytesCovered);
+ GlobalStats.ScopeEntryValueBytesCovered.Value);
printDatum(J, "sum_all_params(#bytes in parent scope)",
- GlobalStats.ParamScopeBytes);
+ GlobalStats.ParamScopeBytes.Value);
printDatum(J,
"sum_all_params(#bytes in parent scope covered by DW_AT_location)",
- GlobalStats.ParamScopeBytesCovered);
+ GlobalStats.ParamScopeBytesCovered.Value);
printDatum(J,
"sum_all_params(#bytes in parent scope covered by "
"DW_OP_entry_value)",
- GlobalStats.ParamScopeEntryValueBytesCovered);
+ GlobalStats.ParamScopeEntryValueBytesCovered.Value);
printDatum(J, "sum_all_local_vars(#bytes in parent scope)",
- GlobalStats.LocalVarScopeBytes);
+ GlobalStats.LocalVarScopeBytes.Value);
printDatum(J,
"sum_all_local_vars(#bytes in parent scope covered by "
"DW_AT_location)",
- GlobalStats.LocalVarScopeBytesCovered);
+ GlobalStats.LocalVarScopeBytesCovered.Value);
printDatum(J,
"sum_all_local_vars(#bytes in parent scope covered by "
"DW_OP_entry_value)",
- GlobalStats.LocalVarScopeEntryValueBytesCovered);
+ GlobalStats.LocalVarScopeEntryValueBytesCovered.Value);
- printDatum(J, "#bytes within functions", GlobalStats.FunctionSize);
+ printDatum(J, "#bytes within functions", GlobalStats.FunctionSize.Value);
printDatum(J, "#bytes within inlined functions",
- GlobalStats.InlineFunctionSize);
+ GlobalStats.InlineFunctionSize.Value);
// Print the summary for formal parameters.
- printDatum(J, "#params", ParamTotal);
- printDatum(J, "#params with source location", ParamWithSrcLoc);
- printDatum(J, "#params with type", ParamWithType);
- printDatum(J, "#params with binary location", ParamWithLoc);
+ printDatum(J, "#params", ParamTotal.Value);
+ printDatum(J, "#params with source location", ParamWithSrcLoc.Value);
+ printDatum(J, "#params with type", ParamWithType.Value);
+ printDatum(J, "#params with binary location", ParamWithLoc.Value);
// Print the summary for local variables.
- printDatum(J, "#local vars", LocalVarTotal);
- printDatum(J, "#local vars with source location", LocalVarWithSrcLoc);
- printDatum(J, "#local vars with type", LocalVarWithType);
- printDatum(J, "#local vars with binary location", LocalVarWithLoc);
+ printDatum(J, "#local vars", LocalVarTotal.Value);
+ printDatum(J, "#local vars with source location", LocalVarWithSrcLoc.Value);
+ printDatum(J, "#local vars with type", LocalVarWithType.Value);
+ printDatum(J, "#local vars with binary location", LocalVarWithLoc.Value);
// Print the debug section sizes.
printSectionSizes(J, Sizes);
@@ -877,32 +936,34 @@ bool dwarfdump::collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx,
// Print the location statistics for variables (includes local variables
// and formal parameters).
printDatum(J, "#variables processed by location statistics",
- LocStats.NumVarParam);
+ LocStats.NumVarParam.Value);
printLocationStats(J, "#variables", LocStats.VarParamLocStats);
printLocationStats(J, "#variables - entry values",
LocStats.VarParamNonEntryValLocStats);
// Print the location statistics for formal parameters.
- printDatum(J, "#params processed by location statistics", LocStats.NumParam);
+ printDatum(J, "#params processed by location statistics",
+ LocStats.NumParam.Value);
printLocationStats(J, "#params", LocStats.ParamLocStats);
printLocationStats(J, "#params - entry values",
LocStats.ParamNonEntryValLocStats);
// Print the location statistics for local variables.
printDatum(J, "#local vars processed by location statistics",
- LocStats.NumVar);
+ LocStats.NumVar.Value);
printLocationStats(J, "#local vars", LocStats.LocalVarLocStats);
printLocationStats(J, "#local vars - entry values",
LocStats.LocalVarNonEntryValLocStats);
J.objectEnd();
OS << '\n';
- LLVM_DEBUG(
- llvm::dbgs() << "Total Availability: "
- << (int)std::round((VarParamWithLoc * 100.0) / VarParamTotal)
- << "%\n";
- llvm::dbgs() << "PC Ranges covered: "
- << (int)std::round((GlobalStats.ScopeBytesCovered * 100.0) /
- GlobalStats.ScopeBytes)
- << "%\n");
+ LLVM_DEBUG(llvm::dbgs() << "Total Availability: "
+ << (int)std::round((VarParamWithLoc.Value * 100.0) /
+ VarParamTotal.Value)
+ << "%\n";
+ llvm::dbgs() << "PC Ranges covered: "
+ << (int)std::round(
+ (GlobalStats.ScopeBytesCovered.Value * 100.0) /
+ GlobalStats.ScopeBytes.Value)
+ << "%\n");
return true;
}
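
For readers skimming the Statistics.cpp hunks above: the overflow handling boils down to a saturating counter plus a sentinel check at print time (printDatum emits "overflowed" when the sentinel is seen). A minimal self-contained sketch of that pattern, with illustrative names rather than the tool's actual types:

    #include <cstdint>
    #include <iostream>
    #include <limits>

    // Illustrative saturating counter: once the value hits the sentinel
    // (UINT64_MAX), further additions are ignored, so the sentinel can be
    // reported as "overflowed" instead of a wrapped-around number.
    struct SaturatingCounter {
      static constexpr uint64_t Sentinel = std::numeric_limits<uint64_t>::max();
      uint64_t Value = 0;

      void add(uint64_t N) {
        if (Value == Sentinel)
          return;                 // already saturated
        if (Value > Sentinel - N)
          Value = Sentinel;       // would wrap; clamp instead
        else
          Value += N;
      }
    };

    int main() {
      SaturatingCounter C;
      C.add(SaturatingCounter::Sentinel - 1);
      C.add(5); // saturates rather than wrapping
      if (C.Value == SaturatingCounter::Sentinel)
        std::cout << "overflowed\n";
      else
        std::cout << C.Value << "\n";
    }
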
diff --git a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index a324ff710af5..9eeaddf14928 100644
--- a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -169,7 +169,7 @@ static list<std::string>
static alias FindAlias("f", desc("Alias for --find."), aliasopt(Find),
cl::NotHidden);
static opt<bool> IgnoreCase("ignore-case",
- desc("Ignore case distinctions when searching."),
+ desc("Ignore case distinctions when using --name."),
value_desc("i"), cat(DwarfDumpCategory));
static alias IgnoreCaseAlias("i", desc("Alias for --ignore-case."),
aliasopt(IgnoreCase), cl::NotHidden);
@@ -192,11 +192,12 @@ static opt<std::string>
cl::value_desc("filename"), cat(DwarfDumpCategory));
static alias OutputFilenameAlias("out-file", desc("Alias for -o."),
aliasopt(OutputFilename));
-static opt<bool>
- UseRegex("regex",
- desc("Treat any <pattern> strings as regular expressions when "
- "searching instead of just as an exact string match."),
- cat(DwarfDumpCategory));
+static opt<bool> UseRegex(
+ "regex",
+ desc("Treat any <pattern> strings as regular "
+ "expressions when searching with --name. If --ignore-case is also "
+ "specified, the regular expression becomes case-insensitive."),
+ cat(DwarfDumpCategory));
static alias RegexAlias("x", desc("Alias for --regex"), aliasopt(UseRegex),
cl::NotHidden);
static opt<bool>
@@ -536,8 +537,9 @@ static bool handleBuffer(StringRef Filename, MemoryBufferRef Buffer,
};
if (auto *Obj = dyn_cast<ObjectFile>(BinOrErr->get())) {
if (filterArch(*Obj)) {
- std::unique_ptr<DWARFContext> DICtx =
- DWARFContext::create(*Obj, nullptr, "", RecoverableErrorHandler);
+ std::unique_ptr<DWARFContext> DICtx = DWARFContext::create(
+ *Obj, DWARFContext::ProcessDebugRelocations::Process, nullptr, "",
+ RecoverableErrorHandler);
if (!HandleObj(*Obj, *DICtx, Filename, OS))
Result = false;
}
@@ -548,8 +550,9 @@ static bool handleBuffer(StringRef Filename, MemoryBufferRef Buffer,
if (auto MachOOrErr = ObjForArch.getAsObjectFile()) {
auto &Obj = **MachOOrErr;
if (filterArch(Obj)) {
- std::unique_ptr<DWARFContext> DICtx =
- DWARFContext::create(Obj, nullptr, "", RecoverableErrorHandler);
+ std::unique_ptr<DWARFContext> DICtx = DWARFContext::create(
+ Obj, DWARFContext::ProcessDebugRelocations::Process, nullptr, "",
+ RecoverableErrorHandler);
if (!HandleObj(Obj, *DICtx, ObjName, OS))
Result = false;
}
diff --git a/llvm/tools/llvm-dwp/llvm-dwp.cpp b/llvm/tools/llvm-dwp/llvm-dwp.cpp
index 1f583728c141..4b6f7bc8dd34 100644
--- a/llvm/tools/llvm-dwp/llvm-dwp.cpp
+++ b/llvm/tools/llvm-dwp/llvm-dwp.cpp
@@ -20,10 +20,10 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCTargetOptionsCommandFlags.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/InitLLVM.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
diff --git a/llvm/tools/llvm-lto/llvm-lto.cpp b/llvm/tools/llvm-lto/llvm-lto.cpp
index 45bfa84fb826..995ebacacb87 100644
--- a/llvm/tools/llvm-lto/llvm-lto.cpp
+++ b/llvm/tools/llvm-lto/llvm-lto.cpp
@@ -227,6 +227,10 @@ static cl::opt<bool> ListDependentLibrariesOnly(
"Instead of running LTO, list the dependent libraries in each IR file"),
cl::cat(LTOCategory));
+static cl::opt<bool> QueryHasCtorDtor(
+ "query-hasCtorDtor", cl::init(false),
+ cl::desc("Queries LTOModule::hasCtorDtor() on each IR file"));
+
static cl::opt<bool>
SetMergedModule("set-merged-module", cl::init(false),
cl::desc("Use the first input module as the merged module"),
@@ -371,7 +375,7 @@ static void printIndexStats() {
ExitOnErr(getModuleSummaryIndexForFile(Filename));
// Skip files without a module summary.
if (!Index)
- report_fatal_error(Filename + " does not contain an index");
+ report_fatal_error(Twine(Filename) + " does not contain an index");
unsigned Calls = 0, Refs = 0, Functions = 0, Alias = 0, Globals = 0;
for (auto &Summaries : *Index) {
@@ -394,22 +398,27 @@ static void printIndexStats() {
}
}
-/// List symbols in each IR file.
+/// Load each IR file and dump certain information based on active flags.
///
/// The main point here is to provide lit-testable coverage for the LTOModule
-/// functionality that's exposed by the C API to list symbols. Moreover, this
-/// provides testing coverage for modules that have been created in their own
-/// contexts.
-static void listSymbols(const TargetOptions &Options) {
+/// functionality that's exposed by the C API. Moreover, this provides testing
+/// coverage for modules that have been created in their own contexts.
+static void testLTOModule(const TargetOptions &Options) {
for (auto &Filename : InputFilenames) {
std::unique_ptr<MemoryBuffer> Buffer;
std::unique_ptr<LTOModule> Module =
getLocalLTOModule(Filename, Buffer, Options);
- // List the symbols.
- outs() << Filename << ":\n";
- for (int I = 0, E = Module->getSymbolCount(); I != E; ++I)
- outs() << Module->getSymbolName(I) << "\n";
+ if (ListSymbolsOnly) {
+ // List the symbols.
+ outs() << Filename << ":\n";
+ for (int I = 0, E = Module->getSymbolCount(); I != E; ++I)
+ outs() << Module->getSymbolName(I) << "\n";
+ }
+ if (QueryHasCtorDtor)
+ outs() << Filename
+ << ": hasCtorDtor = " << (Module->hasCtorDtor() ? "true" : "false")
+ << "\n";
}
}
@@ -478,6 +487,10 @@ static void createCombinedModuleSummaryIndex() {
ExitOnErr(errorOrToExpected(MemoryBuffer::getFileOrSTDIN(Filename)));
ExitOnErr(readModuleSummaryIndex(*MB, CombinedIndex, NextModuleId++));
}
+ // In order to use this index for testing, specifically import testing, we
+ // need to update any indirect call edges created from SamplePGO, so that they
+ // point to the correct GUIDs.
+ updateIndirectCalls(CombinedIndex);
std::error_code EC;
assert(!OutputFilename.empty());
raw_fd_ostream OS(OutputFilename + ".thinlto.bc", EC,
@@ -939,8 +952,8 @@ int main(int argc, char **argv) {
// set up the TargetOptions for the machine
TargetOptions Options = codegen::InitTargetOptionsFromCodeGenFlags(Triple());
- if (ListSymbolsOnly) {
- listSymbols(Options);
+ if (ListSymbolsOnly || QueryHasCtorDtor) {
+ testLTOModule(Options);
return 0;
}
@@ -1050,7 +1063,7 @@ int main(int argc, char **argv) {
CodeGen.addMustPreserveSymbol(KeptDSOSyms[i]);
// Set cpu and attrs strings for the default target/subtarget.
- CodeGen.setCpu(codegen::getMCPU().c_str());
+ CodeGen.setCpu(codegen::getMCPU());
CodeGen.setOptLevel(OptLevel - '0');
CodeGen.setAttrs(codegen::getMAttrs());
@@ -1084,8 +1097,7 @@ int main(int argc, char **argv) {
error("writing merged module failed.");
}
- auto AddStream =
- [&](size_t Task) -> std::unique_ptr<lto::NativeObjectStream> {
+ auto AddStream = [&](size_t Task) -> std::unique_ptr<CachedFileStream> {
std::string PartFilename = OutputFilename;
if (Parallelism != 1)
PartFilename += "." + utostr(Task);
@@ -1095,7 +1107,7 @@ int main(int argc, char **argv) {
std::make_unique<raw_fd_ostream>(PartFilename, EC, sys::fs::OF_None);
if (EC)
error("error opening the file '" + PartFilename + "': " + EC.message());
- return std::make_unique<lto::NativeObjectStream>(std::move(S));
+ return std::make_unique<CachedFileStream>(std::move(S));
};
if (!CodeGen.compileOptimized(AddStream, Parallelism))
diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp
index c0bff1eabee2..6f6f6c1ed90f 100644
--- a/llvm/tools/llvm-lto2/llvm-lto2.cpp
+++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp
@@ -19,10 +19,10 @@
#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DiagnosticPrinter.h"
-#include "llvm/LTO/Caching.h"
#include "llvm/LTO/LTO.h"
#include "llvm/Passes/PassPlugin.h"
#include "llvm/Remarks/HotnessThresholdParser.h"
+#include "llvm/Support/Caching.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/InitLLVM.h"
@@ -362,23 +362,23 @@ static int run(int argc, char **argv) {
if (HasErrors)
return 1;
- auto AddStream =
- [&](size_t Task) -> std::unique_ptr<lto::NativeObjectStream> {
+ auto AddStream = [&](size_t Task) -> std::unique_ptr<CachedFileStream> {
std::string Path = OutputFilename + "." + utostr(Task);
std::error_code EC;
auto S = std::make_unique<raw_fd_ostream>(Path, EC, sys::fs::OF_None);
check(EC, Path);
- return std::make_unique<lto::NativeObjectStream>(std::move(S));
+ return std::make_unique<CachedFileStream>(std::move(S));
};
auto AddBuffer = [&](size_t Task, std::unique_ptr<MemoryBuffer> MB) {
*AddStream(Task)->OS << MB->getBuffer();
};
- NativeObjectCache Cache;
+ FileCache Cache;
if (!CacheDir.empty())
- Cache = check(localCache(CacheDir, AddBuffer), "failed to create cache");
+ Cache = check(localCache("ThinLTO", "Thin", CacheDir, AddBuffer),
+ "failed to create cache");
check(Lto.run(AddStream, Cache), "LTO::run failed");
return 0;
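
The llvm-lto2 hunk above rewires caching through an AddBuffer callback that replays cached output. A toy sketch of that callback shape, purely illustrative and not the real llvm::FileCache / localCache API:

    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <map>
    #include <string>

    // On a cache hit, the cached buffer is handed to AddBuffer instead of
    // re-running code generation for that task.
    using AddBufferFn = std::function<void(std::size_t Task, const std::string &Buf)>;

    struct ToyCache {
      std::map<std::string, std::string> Entries;
      AddBufferFn AddBuffer;

      // Returns true (and replays the buffer) on a hit.
      bool lookup(std::size_t Task, const std::string &Key) {
        auto It = Entries.find(Key);
        if (It == Entries.end())
          return false;
        AddBuffer(Task, It->second);
        return true;
      }
    };

    int main() {
      ToyCache Cache;
      Cache.AddBuffer = [](std::size_t Task, const std::string &Buf) {
        std::cout << "task " << Task << ": reused " << Buf.size() << " bytes\n";
      };
      Cache.Entries["module0"] = "cached object bytes";
      if (!Cache.lookup(0, "module0"))
        std::cout << "cache miss, compiling\n";
    }
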
diff --git a/llvm/tools/llvm-mc/Disassembler.cpp b/llvm/tools/llvm-mc/Disassembler.cpp
index 16ab99548adf..ac55d05db192 100644
--- a/llvm/tools/llvm-mc/Disassembler.cpp
+++ b/llvm/tools/llvm-mc/Disassembler.cpp
@@ -21,9 +21,9 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -156,7 +156,7 @@ int Disassembler::disassemble(const Target &T, const std::string &Triple,
}
// Set up initial section manually here
- Streamer.InitSections(false);
+ Streamer.initSections(false, STI);
bool ErrorOccurred = false;
diff --git a/llvm/tools/llvm-mc/llvm-mc.cpp b/llvm/tools/llvm-mc/llvm-mc.cpp
index 24c601b7033f..4e5a12e53a6b 100644
--- a/llvm/tools/llvm-mc/llvm-mc.cpp
+++ b/llvm/tools/llvm-mc/llvm-mc.cpp
@@ -26,6 +26,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCTargetOptionsCommandFlags.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compression.h"
#include "llvm/Support/FileUtilities.h"
@@ -34,7 +35,6 @@
#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Support/WithColor.h"
@@ -571,7 +571,7 @@ int main(int argc, char **argv) {
MCOptions.MCIncrementalLinkerCompatible,
/*DWARFMustBeAtTheEnd*/ false));
if (NoExecStack)
- Str->InitSections(true);
+ Str->initSections(true, *STI);
}
// Use Assembler information for parsing.
diff --git a/llvm/tools/llvm-mca/CodeRegionGenerator.cpp b/llvm/tools/llvm-mca/CodeRegionGenerator.cpp
index 6ad2a65592b9..6cdd0ba797aa 100644
--- a/llvm/tools/llvm-mca/CodeRegionGenerator.cpp
+++ b/llvm/tools/llvm-mca/CodeRegionGenerator.cpp
@@ -114,7 +114,7 @@ Expected<const CodeRegions &> AsmCodeRegionGenerator::parseCodeRegions(
// Need to initialize an MCTargetStreamer otherwise
// certain asm directives will cause a segfault.
- // Using nulls() so that anything emitted by the MCTagetStreamer
+ // Using nulls() so that anything emitted by the MCTargetStreamer
// doesn't show up in the llvm-mca output.
raw_ostream &OSRef = nulls();
formatted_raw_ostream FOSRef(OSRef);
diff --git a/llvm/tools/llvm-mca/CodeRegionGenerator.h b/llvm/tools/llvm-mca/CodeRegionGenerator.h
index 1c11784ca3fb..ac02131b2f39 100644
--- a/llvm/tools/llvm-mca/CodeRegionGenerator.h
+++ b/llvm/tools/llvm-mca/CodeRegionGenerator.h
@@ -20,9 +20,9 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
#include <memory>
namespace llvm {
@@ -37,7 +37,7 @@ protected:
CodeRegionGenerator &operator=(const CodeRegionGenerator &) = delete;
public:
- CodeRegionGenerator(SourceMgr &SM) : Regions(SM) {}
+ CodeRegionGenerator(llvm::SourceMgr &SM) : Regions(SM) {}
virtual ~CodeRegionGenerator();
virtual Expected<const CodeRegions &>
parseCodeRegions(const std::unique_ptr<MCInstPrinter> &IP) = 0;
@@ -54,7 +54,7 @@ class AsmCodeRegionGenerator final : public CodeRegionGenerator {
unsigned AssemblerDialect; // This is set during parsing.
public:
- AsmCodeRegionGenerator(const Target &T, SourceMgr &SM, MCContext &C,
+ AsmCodeRegionGenerator(const Target &T, llvm::SourceMgr &SM, MCContext &C,
const MCAsmInfo &A, const MCSubtargetInfo &S,
const MCInstrInfo &I)
: CodeRegionGenerator(SM), TheTarget(T), Ctx(C), MAI(A), STI(S), MCII(I),
diff --git a/llvm/tools/llvm-mca/PipelinePrinter.cpp b/llvm/tools/llvm-mca/PipelinePrinter.cpp
index 955b825891fa..9d06c6a19395 100644
--- a/llvm/tools/llvm-mca/PipelinePrinter.cpp
+++ b/llvm/tools/llvm-mca/PipelinePrinter.cpp
@@ -14,7 +14,6 @@
#include "PipelinePrinter.h"
#include "CodeRegion.h"
#include "Views/InstructionView.h"
-#include "Views/View.h"
namespace llvm {
namespace mca {
diff --git a/llvm/tools/llvm-mca/PipelinePrinter.h b/llvm/tools/llvm-mca/PipelinePrinter.h
index 1365f75be0f5..fd262f0a8a5d 100644
--- a/llvm/tools/llvm-mca/PipelinePrinter.h
+++ b/llvm/tools/llvm-mca/PipelinePrinter.h
@@ -16,11 +16,11 @@
#ifndef LLVM_TOOLS_LLVM_MCA_PIPELINEPRINTER_H
#define LLVM_TOOLS_LLVM_MCA_PIPELINEPRINTER_H
-#include "Views/View.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MCA/Context.h"
#include "llvm/MCA/Pipeline.h"
+#include "llvm/MCA/View.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "llvm-mca"
diff --git a/llvm/tools/llvm-mca/Views/DispatchStatistics.h b/llvm/tools/llvm-mca/Views/DispatchStatistics.h
index 81b582f74a6b..cfd12691c03f 100644
--- a/llvm/tools/llvm-mca/Views/DispatchStatistics.h
+++ b/llvm/tools/llvm-mca/Views/DispatchStatistics.h
@@ -33,9 +33,9 @@
#ifndef LLVM_TOOLS_LLVM_MCA_DISPATCHVIEW_H
#define LLVM_TOOLS_LLVM_MCA_DISPATCHVIEW_H
-#include "Views/View.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MCA/View.h"
#include <map>
namespace llvm {
diff --git a/llvm/tools/llvm-mca/Views/InstructionView.h b/llvm/tools/llvm-mca/Views/InstructionView.h
index 1843b0513dfc..cec07eef6a80 100644
--- a/llvm/tools/llvm-mca/Views/InstructionView.h
+++ b/llvm/tools/llvm-mca/Views/InstructionView.h
@@ -15,7 +15,7 @@
#ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTIONVIEW_H
#define LLVM_TOOLS_LLVM_MCA_INSTRUCTIONVIEW_H
-#include "Views/View.h"
+#include "llvm/MCA/View.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/llvm/tools/llvm-mca/Views/RegisterFileStatistics.h b/llvm/tools/llvm-mca/Views/RegisterFileStatistics.h
index ec5c5f431e12..3de2a22ac32d 100644
--- a/llvm/tools/llvm-mca/Views/RegisterFileStatistics.h
+++ b/llvm/tools/llvm-mca/Views/RegisterFileStatistics.h
@@ -35,9 +35,9 @@
#ifndef LLVM_TOOLS_LLVM_MCA_REGISTERFILESTATISTICS_H
#define LLVM_TOOLS_LLVM_MCA_REGISTERFILESTATISTICS_H
-#include "Views/View.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MCA/View.h"
namespace llvm {
namespace mca {
diff --git a/llvm/tools/llvm-mca/Views/RetireControlUnitStatistics.h b/llvm/tools/llvm-mca/Views/RetireControlUnitStatistics.h
index 86b46e93aa7c..ed3736c64515 100644
--- a/llvm/tools/llvm-mca/Views/RetireControlUnitStatistics.h
+++ b/llvm/tools/llvm-mca/Views/RetireControlUnitStatistics.h
@@ -28,8 +28,8 @@
#ifndef LLVM_TOOLS_LLVM_MCA_RETIRECONTROLUNITSTATISTICS_H
#define LLVM_TOOLS_LLVM_MCA_RETIRECONTROLUNITSTATISTICS_H
-#include "Views/View.h"
#include "llvm/MC/MCSchedule.h"
+#include "llvm/MCA/View.h"
#include <map>
namespace llvm {
diff --git a/llvm/tools/llvm-mca/Views/SchedulerStatistics.h b/llvm/tools/llvm-mca/Views/SchedulerStatistics.h
index 66f4b0011866..9d2f71c13e5a 100644
--- a/llvm/tools/llvm-mca/Views/SchedulerStatistics.h
+++ b/llvm/tools/llvm-mca/Views/SchedulerStatistics.h
@@ -36,9 +36,9 @@
#ifndef LLVM_TOOLS_LLVM_MCA_SCHEDULERSTATISTICS_H
#define LLVM_TOOLS_LLVM_MCA_SCHEDULERSTATISTICS_H
-#include "Views/View.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MCA/View.h"
#include <map>
namespace llvm {
diff --git a/llvm/tools/llvm-mca/Views/SummaryView.h b/llvm/tools/llvm-mca/Views/SummaryView.h
index e2c7cfd19e94..21f3fad23ca0 100644
--- a/llvm/tools/llvm-mca/Views/SummaryView.h
+++ b/llvm/tools/llvm-mca/Views/SummaryView.h
@@ -28,9 +28,9 @@
#ifndef LLVM_TOOLS_LLVM_MCA_SUMMARYVIEW_H
#define LLVM_TOOLS_LLVM_MCA_SUMMARYVIEW_H
-#include "Views/View.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/MC/MCSchedule.h"
+#include "llvm/MCA/View.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
@@ -71,12 +71,6 @@ class SummaryView : public View {
// Used to map resource indices to actual processor resource IDs.
llvm::SmallVector<unsigned, 8> ResIdx2ProcResID;
- // Compute the reciprocal throughput for the analyzed code block.
- // The reciprocal block throughput is computed as the MAX between:
- // - NumMicroOps / DispatchWidth
- // - Total Resource Cycles / #Units (for every resource consumed).
- double getBlockRThroughput() const;
-
/// Compute the data we want to print out in the object DV.
void collectData(DisplayValues &DV) const;
diff --git a/llvm/tools/llvm-mca/Views/TimelineView.cpp b/llvm/tools/llvm-mca/Views/TimelineView.cpp
index 9a949761bb75..5c05edbdea68 100644
--- a/llvm/tools/llvm-mca/Views/TimelineView.cpp
+++ b/llvm/tools/llvm-mca/Views/TimelineView.cpp
@@ -145,10 +145,11 @@ void TimelineView::printWaitTimeEntry(formatted_raw_ostream &OS,
double AverageTime1, AverageTime2, AverageTime3;
AverageTime1 =
- (double)Entry.CyclesSpentInSchedulerQueue / CumulativeExecutions;
- AverageTime2 = (double)Entry.CyclesSpentInSQWhileReady / CumulativeExecutions;
- AverageTime3 =
- (double)Entry.CyclesSpentAfterWBAndBeforeRetire / CumulativeExecutions;
+ (double)(Entry.CyclesSpentInSchedulerQueue * 10) / CumulativeExecutions;
+ AverageTime2 =
+ (double)(Entry.CyclesSpentInSQWhileReady * 10) / CumulativeExecutions;
+ AverageTime3 = (double)(Entry.CyclesSpentAfterWBAndBeforeRetire * 10) /
+ CumulativeExecutions;
OS << Executions;
OS.PadToColumn(13);
@@ -157,18 +158,18 @@ void TimelineView::printWaitTimeEntry(formatted_raw_ostream &OS,
if (!PrintingTotals)
tryChangeColor(OS, Entry.CyclesSpentInSchedulerQueue, CumulativeExecutions,
BufferSize);
- OS << format("%.1f", floor((AverageTime1 * 10) + 0.5) / 10);
+ OS << format("%.1f", floor(AverageTime1 + 0.5) / 10);
OS.PadToColumn(20);
if (!PrintingTotals)
tryChangeColor(OS, Entry.CyclesSpentInSQWhileReady, CumulativeExecutions,
BufferSize);
- OS << format("%.1f", floor((AverageTime2 * 10) + 0.5) / 10);
+ OS << format("%.1f", floor(AverageTime2 + 0.5) / 10);
OS.PadToColumn(27);
if (!PrintingTotals)
tryChangeColor(OS, Entry.CyclesSpentAfterWBAndBeforeRetire,
CumulativeExecutions,
getSubTargetInfo().getSchedModel().MicroOpBufferSize);
- OS << format("%.1f", floor((AverageTime3 * 10) + 0.5) / 10);
+ OS << format("%.1f", floor(AverageTime3 + 0.5) / 10);
if (OS.has_colors())
OS.resetColor();
@@ -295,8 +296,10 @@ void TimelineView::printTimeline(raw_ostream &OS) const {
// attribute is set correctly whether or not it is greater
// than timeline-max-cycles so we can use that to ensure
// we don't early exit because of a 0 latency instruction.
- if (Entry.CycleRetired == 0 && Entry.CycleExecuted != 0)
+ if (Entry.CycleRetired == 0 && Entry.CycleExecuted != 0) {
+ FOS << "Truncated display due to cycle limit\n";
return;
+ }
unsigned SourceIndex = IID % Source.size();
printTimelineViewEntry(FOS, Entry, Iteration, SourceIndex);
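
The TimelineView arithmetic change above only moves the x10 scaling from the print site into the averages themselves; the printed value is still the average rounded to one decimal place via floor(x + 0.5) / 10. A tiny standalone illustration with made-up numbers:

    #include <cmath>
    #include <cstdio>

    // Rounding to one decimal place, as in the hunk above: scale by 10 before
    // the division, then apply floor(x + 0.5) / 10 at the print site.
    int main() {
      unsigned Cycles = 7, Executions = 3;                    // true average ~2.333
      double Average = (double)(Cycles * 10) / Executions;    // 23.33...
      std::printf("%.1f\n", std::floor(Average + 0.5) / 10);  // prints 2.3
    }
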
diff --git a/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.cpp b/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.cpp
deleted file mode 100644
index a655f3faf1bf..000000000000
--- a/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-//===------------------ AMDGPUCustomBehaviour.cpp ---------------*-C++ -* -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-///
-/// This file implements methods from the AMDGPUCustomBehaviour class.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPUCustomBehaviour.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
-#include "llvm/Support/WithColor.h"
-
-namespace llvm {
-namespace mca {
-
-AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
- const SourceMgr &SrcMgr,
- const MCInstrInfo &MCII)
- : CustomBehaviour(STI, SrcMgr, MCII) {}
-
-unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
- const InstRef &IR) {
- return 0;
-}
-
-} // namespace mca
-} // namespace llvm
diff --git a/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.h b/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.h
deleted file mode 100644
index 0dd21c7b4c44..000000000000
--- a/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.h
+++ /dev/null
@@ -1,57 +0,0 @@
-//===------------------- AMDGPUCustomBehaviour.h ----------------*-C++ -* -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-///
-/// This file defines the AMDGPUCustomBehaviour class which inherits from
-/// CustomBehaviour.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TOOLS_LLVM_MCA_LIB_AMDGPU_AMDGPUCUSTOMBEHAVIOUR_H
-#define LLVM_TOOLS_LLVM_MCA_LIB_AMDGPU_AMDGPUCUSTOMBEHAVIOUR_H
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/MCA/CustomBehaviour.h"
-#include "llvm/Support/TargetParser.h"
-
-namespace llvm {
-namespace mca {
-
-class AMDGPUInstrPostProcess : public InstrPostProcess {
-public:
- AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
- : InstrPostProcess(STI, MCII) {}
-
- ~AMDGPUInstrPostProcess() {}
-
- void postProcessInstruction(std::unique_ptr<Instruction> &Inst,
- const MCInst &MCI) override {}
-};
-
-class AMDGPUCustomBehaviour : public CustomBehaviour {
-public:
- AMDGPUCustomBehaviour(const MCSubtargetInfo &STI, const SourceMgr &SrcMgr,
- const MCInstrInfo &MCII);
-
- ~AMDGPUCustomBehaviour() {}
-
- /// This method is used to determine if an instruction
- /// should be allowed to be dispatched. The return value is
- /// how many cycles until the instruction can be dispatched.
- /// This method is called after MCA has already checked for
- /// register and hardware dependencies so this method should only
- /// implement custom behaviour and dependencies that are not picked up
- /// by MCA naturally.
- unsigned checkCustomHazard(ArrayRef<InstRef> IssuedInst,
- const InstRef &IR) override;
-};
-
-} // namespace mca
-} // namespace llvm
-
-#endif /* LLVM_TOOLS_LLVM_MCA_LIB_AMDGPU_AMDGPUCUSTOMBEHAVIOUR_H */
diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
index a473cd8f1719..0b58ca377ce1 100644
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -32,9 +32,6 @@
#include "Views/SchedulerStatistics.h"
#include "Views/SummaryView.h"
#include "Views/TimelineView.h"
-#ifdef HAS_AMDGPU
-#include "lib/AMDGPU/AMDGPUCustomBehaviour.h"
-#endif
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
@@ -43,6 +40,7 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCTargetOptionsCommandFlags.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/MCA/CodeEmitter.h"
#include "llvm/MCA/Context.h"
#include "llvm/MCA/CustomBehaviour.h"
@@ -59,7 +57,6 @@
#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Support/WithColor.h"
@@ -293,39 +290,6 @@ static void processViewOptions(bool IsOutOfOrder) {
processOptionImpl(PrintRetireStats, Default);
}
-std::unique_ptr<mca::InstrPostProcess>
-createInstrPostProcess(const Triple &TheTriple, const MCSubtargetInfo &STI,
- const MCInstrInfo &MCII) {
- // Might be a good idea to have a separate flag so that InstrPostProcess
- // can be used with or without CustomBehaviour
- if (DisableCustomBehaviour)
- return std::make_unique<mca::InstrPostProcess>(STI, MCII);
-#ifdef HAS_AMDGPU
- if (TheTriple.isAMDGPU())
- return std::make_unique<mca::AMDGPUInstrPostProcess>(STI, MCII);
-#endif
- return std::make_unique<mca::InstrPostProcess>(STI, MCII);
-}
-
-std::unique_ptr<mca::CustomBehaviour>
-createCustomBehaviour(const Triple &TheTriple, const MCSubtargetInfo &STI,
- const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII) {
- // Build the appropriate CustomBehaviour object for the current target.
- // The CustomBehaviour class should never depend on the source code,
- // but it can depend on the list of mca::Instruction and any classes
- // that can be built using just the target info. If you need extra
- // information from the source code or the list of MCInst, consider
- // adding that information to the mca::Instruction class and setting
- // it during InstrBuilder::createInstruction().
- if (DisableCustomBehaviour)
- return std::make_unique<mca::CustomBehaviour>(STI, SrcMgr, MCII);
-#ifdef HAS_AMDGPU
- if (TheTriple.isAMDGPU())
- return std::make_unique<mca::AMDGPUCustomBehaviour>(STI, SrcMgr, MCII);
-#endif
- return std::make_unique<mca::CustomBehaviour>(STI, SrcMgr, MCII);
-}
-
// Returns true on success.
static bool runPipeline(mca::Pipeline &P) {
// Handle pipeline errors here.
@@ -344,6 +308,7 @@ int main(int argc, char **argv) {
InitializeAllTargetInfos();
InitializeAllTargetMCs();
InitializeAllAsmParsers();
+ InitializeAllTargetMCAs();
// Enable printing of available targets when flag --version is specified.
cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
@@ -532,8 +497,18 @@ int main(int argc, char **argv) {
// Lower the MCInst sequence into an mca::Instruction sequence.
ArrayRef<MCInst> Insts = Region->getInstructions();
mca::CodeEmitter CE(*STI, *MAB, *MCE, Insts);
- std::unique_ptr<mca::InstrPostProcess> IPP =
- createInstrPostProcess(TheTriple, *STI, *MCII);
+
+ std::unique_ptr<mca::InstrPostProcess> IPP;
+ if (!DisableCustomBehaviour) {
+ IPP = std::unique_ptr<mca::InstrPostProcess>(
+ TheTarget->createInstrPostProcess(*STI, *MCII));
+ }
+ if (!IPP)
+ // If the target doesn't have its own IPP implemented (or the
+ // -disable-cb flag is set) then we use the base class
+ // (which does nothing).
+ IPP = std::make_unique<mca::InstrPostProcess>(*STI, *MCII);
+
std::vector<std::unique_ptr<mca::Instruction>> LoweredSequence;
for (const MCInst &MCI : Insts) {
Expected<std::unique_ptr<mca::Instruction>> Inst =
@@ -602,14 +577,35 @@ int main(int argc, char **argv) {
// the source code (but it can depend on the list of
// mca::Instruction or any objects that can be reconstructed
// from the target information).
- std::unique_ptr<mca::CustomBehaviour> CB =
- createCustomBehaviour(TheTriple, *STI, S, *MCII);
+ std::unique_ptr<mca::CustomBehaviour> CB;
+ if (!DisableCustomBehaviour)
+ CB = std::unique_ptr<mca::CustomBehaviour>(
+ TheTarget->createCustomBehaviour(*STI, S, *MCII));
+ if (!CB)
+ // If the target doesn't have its own CB implemented (or the -disable-cb
+ // flag is set) then we use the base class (which does nothing).
+ CB = std::make_unique<mca::CustomBehaviour>(*STI, S, *MCII);
// Create a basic pipeline simulating an out-of-order backend.
auto P = MCA.createDefaultPipeline(PO, S, *CB);
mca::PipelinePrinter Printer(*P, *Region, RegionIdx, *STI, PO);
+ // Targets can define their own custom Views that exist within their
+ // /lib/Target/ directory so that the View can utilize their CustomBehaviour
+ // or other backend symbols / functionality that are not already exposed
+ // through one of the MC-layer classes. These Views will be initialized
+ // using the CustomBehaviour::getViews() variants.
+ // If a target makes a custom View that does not depend on their target
+ // CB or their backend, they should put the View within
+ // /tools/llvm-mca/Views/ instead.
+ if (!DisableCustomBehaviour) {
+ std::vector<std::unique_ptr<mca::View>> CBViews =
+ CB->getStartViews(*IP, Insts);
+ for (auto &CBView : CBViews)
+ Printer.addView(std::move(CBView));
+ }
+
// When we output JSON, we add a view that contains the instructions
// and CPU resource information.
if (PrintJson) {
@@ -635,6 +631,16 @@ int main(int argc, char **argv) {
Printer.addView(std::make_unique<mca::InstructionInfoView>(
*STI, *MCII, CE, ShowEncoding, Insts, *IP));
+ // Fetch custom Views that are to be placed after the InstructionInfoView.
+ // Refer to the comment paired with the CB->getStartViews(*IP, Insts); line
+ // for more info.
+ if (!DisableCustomBehaviour) {
+ std::vector<std::unique_ptr<mca::View>> CBViews =
+ CB->getPostInstrInfoViews(*IP, Insts);
+ for (auto &CBView : CBViews)
+ Printer.addView(std::move(CBView));
+ }
+
if (PrintDispatchStats)
Printer.addView(std::make_unique<mca::DispatchStatistics>());
@@ -659,6 +665,16 @@ int main(int argc, char **argv) {
TimelineMaxCycles));
}
+ // Fetch custom Views that are to be placed after all other Views.
+ // Refer to the comment paired with the CB->getStartViews(*IP, Insts); line
+ // for more info.
+ if (!DisableCustomBehaviour) {
+ std::vector<std::unique_ptr<mca::View>> CBViews =
+ CB->getEndViews(*IP, Insts);
+ for (auto &CBView : CBViews)
+ Printer.addView(std::move(CBView));
+ }
+
if (!runPipeline(*P))
return 1;
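
The llvm-mca changes above replace the tool-local factory helpers with a lookup through the target (TheTarget->createCustomBehaviour / createInstrPostProcess), falling back to the do-nothing base classes when the target provides nothing or -disable-cb is set. A generic sketch of that factory-or-default pattern, with illustrative types rather than llvm-mca's real ones:

    #include <iostream>
    #include <memory>

    struct Behaviour {
      virtual ~Behaviour() = default;
      virtual unsigned checkHazard() const { return 0; } // base class does nothing
    };

    struct TargetBehaviour : Behaviour {
      unsigned checkHazard() const override { return 2; } // target-specific stalls
    };

    // Stand-in for a target registry hook: may legitimately return null.
    std::unique_ptr<Behaviour> createForTarget(bool TargetHasOne) {
      if (TargetHasOne)
        return std::make_unique<TargetBehaviour>();
      return nullptr;
    }

    int main() {
      bool Disable = false, TargetHasOne = false;
      std::unique_ptr<Behaviour> CB;
      if (!Disable)
        CB = createForTarget(TargetHasOne);
      if (!CB)
        CB = std::make_unique<Behaviour>(); // default: no custom behaviour
      std::cout << CB->checkHazard() << "\n";
    }
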
diff --git a/llvm/tools/llvm-nm/llvm-nm.cpp b/llvm/tools/llvm-nm/llvm-nm.cpp
index ffb427a3f2bd..0864985377ce 100644
--- a/llvm/tools/llvm-nm/llvm-nm.cpp
+++ b/llvm/tools/llvm-nm/llvm-nm.cpp
@@ -64,7 +64,7 @@ enum ID {
#include "Opts.inc"
#undef PREFIX
-static const opt::OptTable::Info InfoTable[] = {
+const opt::OptTable::Info InfoTable[] = {
#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \
HELPTEXT, METAVAR, VALUES) \
{ \
@@ -530,7 +530,7 @@ struct DarwinStabName {
uint8_t NType;
const char *Name;
};
-static const struct DarwinStabName DarwinStabNames[] = {
+const struct DarwinStabName DarwinStabNames[] = {
{MachO::N_GSYM, "GSYM"},
{MachO::N_FNAME, "FNAME"},
{MachO::N_FUN, "FUN"},
@@ -599,22 +599,16 @@ static void darwinPrintStab(MachOObjectFile *MachO, const NMSymbol &S) {
outs() << format(" %02x", NType);
}
-static Optional<std::string> demangle(StringRef Name, bool StripUnderscore) {
- if (StripUnderscore && !Name.empty() && Name[0] == '_')
- Name = Name.substr(1);
+static Optional<std::string> demangle(const std::string &Name,
+ bool StripUnderscore) {
+ const char *Mangled = Name.c_str();
+ if (StripUnderscore && Mangled[0] == '_')
+ Mangled = Mangled + 1;
- if (!Name.startswith("_Z"))
- return None;
-
- int Status;
- char *Undecorated =
- itaniumDemangle(Name.str().c_str(), nullptr, nullptr, &Status);
- if (Status != 0)
- return None;
-
- std::string S(Undecorated);
- free(Undecorated);
- return S;
+ std::string Demangled;
+ if (nonMicrosoftDemangle(Mangled, Demangled))
+ return Demangled;
+ return None;
}
static bool symbolIsDefined(const NMSymbol &Sym) {
@@ -1575,90 +1569,11 @@ static void dumpSymbolsFromDLInfoMachO(MachOObjectFile &MachO) {
}
}
-namespace {
-struct SymbolVersion {
- std::string Name;
- bool IsDefault;
-};
-} // namespace
-
-template <class ELFT>
-static Expected<std::vector<SymbolVersion>>
-readSymbolVersionsELF(const ELFFile<ELFT> &Obj, StringRef FileName,
- ELFObjectFileBase::elf_symbol_iterator_range Symbols) {
- using Elf_Shdr = typename ELFT::Shdr;
-
- // We called sections() earlier, so can't fail here.
- typename ELFT::ShdrRange SectionsOrErr = cantFail(Obj.sections());
- const Elf_Shdr *SymVerSec = nullptr;
- const Elf_Shdr *SymVerNeedSec = nullptr;
- const Elf_Shdr *SymVerDefSec = nullptr;
- for (const Elf_Shdr &Sec : SectionsOrErr) {
- if (Sec.sh_type == ELF::SHT_GNU_versym)
- SymVerSec = &Sec;
- else if (Sec.sh_type == ELF::SHT_GNU_verdef)
- SymVerDefSec = &Sec;
- else if (Sec.sh_type == ELF::SHT_GNU_verneed)
- SymVerNeedSec = &Sec;
- }
-
- if (!SymVerSec)
- return std::vector<SymbolVersion>{};
-
- Expected<SmallVector<Optional<VersionEntry>, 0>> MapOrErr =
- Obj.loadVersionMap(SymVerNeedSec, SymVerDefSec);
- if (!MapOrErr)
- return MapOrErr.takeError();
-
- std::vector<SymbolVersion> Ret;
- size_t I = 0;
- for (auto It = Symbols.begin(), E = Symbols.end(); It != E; ++It) {
- ++I;
- Expected<const typename ELFT::Versym *> VerEntryOrErr =
- Obj.template getEntry<typename ELFT::Versym>(*SymVerSec, I);
- if (!VerEntryOrErr)
- return createError("unable to read an entry with index " + Twine(I) +
- " from " + describe(Obj, *SymVerSec) + ": " +
- toString(VerEntryOrErr.takeError()));
-
- Expected<uint32_t> FlagsOrErr = It->getFlags();
- if (!FlagsOrErr)
- return createError("unable to read flags for symbol with index " +
- Twine(I) + ": " + toString(FlagsOrErr.takeError()));
-
- bool IsDefault;
- Expected<StringRef> VerOrErr = Obj.getSymbolVersionByIndex(
- (*VerEntryOrErr)->vs_index, IsDefault, *MapOrErr,
- (*FlagsOrErr) & SymbolRef::SF_Undefined);
- if (!VerOrErr)
- return createError("unable to get a version for entry " + Twine(I) +
- " of " + describe(Obj, *SymVerSec) + ": " +
- toString(VerOrErr.takeError()));
-
- Ret.push_back({(*VerOrErr).str(), IsDefault});
- }
-
- return Ret;
-}
-
-static Expected<std::vector<SymbolVersion>>
-readSymbolVersionsELF(const ELFObjectFileBase &Obj,
- ELFObjectFileBase::elf_symbol_iterator_range Symbols) {
- if (const auto *ELF = dyn_cast<ELF32LEObjectFile>(&Obj))
- return readSymbolVersionsELF(ELF->getELFFile(), Obj.getFileName(), Symbols);
- else if (const auto *ELF = dyn_cast<ELF32BEObjectFile>(&Obj))
- return readSymbolVersionsELF(ELF->getELFFile(), Obj.getFileName(), Symbols);
- else if (const auto *ELF = dyn_cast<ELF64LEObjectFile>(&Obj))
- return readSymbolVersionsELF(ELF->getELFFile(), Obj.getFileName(), Symbols);
- return readSymbolVersionsELF(cast<ELF64BEObjectFile>(&Obj)->getELFFile(),
- Obj.getFileName(), Symbols);
-}
-
static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
StringRef ArchiveName = {},
StringRef ArchitectureName = {}) {
auto Symbols = Obj.symbols();
- std::vector<SymbolVersion> SymbolVersions;
+ std::vector<VersionEntry> SymbolVersions;
if (DynamicSyms) {
const auto *E = dyn_cast<ELFObjectFileBase>(&Obj);
if (!E) {
@@ -1667,8 +1582,8 @@ static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
}
Symbols = E->getDynamicSymbolIterators();
- if (Expected<std::vector<SymbolVersion>> VersionsOrErr =
- readSymbolVersionsELF(*E, Symbols))
+ if (Expected<std::vector<VersionEntry>> VersionsOrErr =
+ E->readDynsymVersions())
SymbolVersions = std::move(*VersionsOrErr);
else
WithColor::warning(errs(), ToolName)
@@ -1738,7 +1653,7 @@ static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
}
if (!SymbolVersions.empty() && !SymbolVersions[I].Name.empty())
S.Name +=
- (SymbolVersions[I].IsDefault ? "@@" : "@") + SymbolVersions[I].Name;
+ (SymbolVersions[I].IsVerDef ? "@@" : "@") + SymbolVersions[I].Name;
S.Sym = Sym;
SymbolList.push_back(S);
diff --git a/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp b/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp
index e50ac2e12e2f..38c9cd09433b 100644
--- a/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp
+++ b/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp
@@ -94,7 +94,7 @@ static Error addGnuDebugLink(Object &Obj, StringRef DebugLinkFile) {
return Error::success();
}
-static void setSectionFlags(Section &Sec, SectionFlag AllFlags) {
+static uint32_t flagsToCharacteristics(SectionFlag AllFlags, uint32_t OldChar) {
// Need to preserve alignment flags.
const uint32_t PreserveMask =
IMAGE_SCN_ALIGN_1BYTES | IMAGE_SCN_ALIGN_2BYTES | IMAGE_SCN_ALIGN_4BYTES |
@@ -107,8 +107,7 @@ static void setSectionFlags(Section &Sec, SectionFlag AllFlags) {
// Setup new section characteristics based on the flags provided in command
// line.
- uint32_t NewCharacteristics =
- (Sec.Header.Characteristics & PreserveMask) | IMAGE_SCN_MEM_READ;
+ uint32_t NewCharacteristics = (OldChar & PreserveMask) | IMAGE_SCN_MEM_READ;
if ((AllFlags & SectionFlag::SecAlloc) && !(AllFlags & SectionFlag::SecLoad))
NewCharacteristics |= IMAGE_SCN_CNT_UNINITIALIZED_DATA;
@@ -128,7 +127,7 @@ static void setSectionFlags(Section &Sec, SectionFlag AllFlags) {
if (AllFlags & SectionFlag::SecExclude)
NewCharacteristics |= IMAGE_SCN_LNK_REMOVE;
- Sec.Header.Characteristics = NewCharacteristics;
+ return NewCharacteristics;
}
static Error handleArgs(const CommonConfig &Config, Object &Obj) {
@@ -226,7 +225,8 @@ static Error handleArgs(const CommonConfig &Config, Object &Obj) {
for (Section &Sec : Obj.getMutableSections()) {
const auto It = Config.SetSectionFlags.find(Sec.Name);
if (It != Config.SetSectionFlags.end())
- setSectionFlags(Sec, It->second.NewFlags);
+ Sec.Header.Characteristics = flagsToCharacteristics(
+ It->second.NewFlags, Sec.Header.Characteristics);
}
for (const auto &Flag : Config.AddSection) {
@@ -238,11 +238,18 @@ static Error handleArgs(const CommonConfig &Config, Object &Obj) {
return createFileError(FileName, errorCodeToError(BufOrErr.getError()));
auto Buf = std::move(*BufOrErr);
+ uint32_t Characteristics;
+ const auto It = Config.SetSectionFlags.find(SecName);
+ if (It != Config.SetSectionFlags.end())
+ Characteristics = flagsToCharacteristics(It->second.NewFlags, 0);
+ else
+ Characteristics = IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_ALIGN_1BYTES;
+
addSection(
Obj, SecName,
makeArrayRef(reinterpret_cast<const uint8_t *>(Buf->getBufferStart()),
Buf->getBufferSize()),
- IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_ALIGN_1BYTES);
+ Characteristics);
}
if (!Config.AddGnuDebugLink.empty())
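Turning setSectionFlags() into the pure helper flagsToCharacteristics(AllFlags, OldChar) lets one flag translation serve both call sites in the hunks above: rewriting an existing section, where the IMAGE_SCN_ALIGN_* bits of the old characteristics must survive, and creating a section with --add-section, where there is nothing to preserve. A condensed sketch of the two uses (It, Sec and Config refer to the surrounding code in this file):

    // Existing section: keep its alignment bits, replace everything else.
    Sec.Header.Characteristics =
        flagsToCharacteristics(It->second.NewFlags, Sec.Header.Characteristics);

    // Section created by --add-section: start from zero, falling back to the
    // initialized-data defaults when no --set-section-flags entry matches.
    uint32_t NewChar =
        It != Config.SetSectionFlags.end()
            ? flagsToCharacteristics(It->second.NewFlags, /*OldChar=*/0)
            : IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_ALIGN_1BYTES;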
diff --git a/llvm/tools/llvm-objcopy/COFF/Object.cpp b/llvm/tools/llvm-objcopy/COFF/Object.cpp
index 1c17b8408ee7..ec2628c7eca9 100644
--- a/llvm/tools/llvm-objcopy/COFF/Object.cpp
+++ b/llvm/tools/llvm-objcopy/COFF/Object.cpp
@@ -107,7 +107,7 @@ void Object::removeSections(function_ref<bool(const Section &)> ToRemove) {
// section,
// remove those as well as nothing will include them (and we can't
// leave them dangling).
- if (RemovedSections.count(Sym.AssociativeComdatTargetSectionId) == 1)
+ if (RemovedSections.contains(Sym.AssociativeComdatTargetSectionId))
AssociatedSections.insert(Sym.TargetSectionId);
return RemovedSections.contains(Sym.TargetSectionId);
});
diff --git a/llvm/tools/llvm-objcopy/COFF/Writer.cpp b/llvm/tools/llvm-objcopy/COFF/Writer.cpp
index e7be64faab65..cbd0e4261238 100644
--- a/llvm/tools/llvm-objcopy/COFF/Writer.cpp
+++ b/llvm/tools/llvm-objcopy/COFF/Writer.cpp
@@ -406,7 +406,7 @@ Expected<uint32_t> COFFWriter::virtualAddressToFileAddress(uint32_t RVA) {
// the debug_directory structs in there, and set the PointerToRawData field
// in all of them, according to their new physical location in the file.
Error COFFWriter::patchDebugDirectory() {
- if (Obj.DataDirectories.size() < DEBUG_DIRECTORY)
+ if (Obj.DataDirectories.size() <= DEBUG_DIRECTORY)
return Error::success();
const data_directory *Dir = &Obj.DataDirectories[DEBUG_DIRECTORY];
if (Dir->Size <= 0)
@@ -426,15 +426,13 @@ Error COFFWriter::patchDebugDirectory() {
uint8_t *End = Ptr + Dir->Size;
while (Ptr < End) {
debug_directory *Debug = reinterpret_cast<debug_directory *>(Ptr);
- if (!Debug->AddressOfRawData)
- return createStringError(object_error::parse_failed,
- "debug directory payload outside of "
- "mapped sections not supported");
- if (Expected<uint32_t> FilePosOrErr =
- virtualAddressToFileAddress(Debug->AddressOfRawData))
- Debug->PointerToRawData = *FilePosOrErr;
- else
- return FilePosOrErr.takeError();
+ if (Debug->PointerToRawData) {
+ if (Expected<uint32_t> FilePosOrErr =
+ virtualAddressToFileAddress(Debug->AddressOfRawData))
+ Debug->PointerToRawData = *FilePosOrErr;
+ else
+ return FilePosOrErr.takeError();
+ }
Ptr += sizeof(debug_directory);
Offset += sizeof(debug_directory);
}
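Two details in the patchDebugDirectory() hunks above are easy to miss: the bounds check needs '<=' because indexing DataDirectories[DEBUG_DIRECTORY] requires at least DEBUG_DIRECTORY + 1 entries, and entries whose payload is not part of the mapped image are now skipped rather than rejected. A brief restatement of that reasoning as a sketch:

    // Index DEBUG_DIRECTORY is only valid when size() >= DEBUG_DIRECTORY + 1,
    // so bail out (successfully) when size() <= DEBUG_DIRECTORY.
    if (Obj.DataDirectories.size() <= DEBUG_DIRECTORY)
      return Error::success();
    // Later, a debug_directory entry with PointerToRawData == 0 (payload kept
    // outside the mapped sections) is left untouched instead of raising an error.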
diff --git a/llvm/tools/llvm-objcopy/CommonConfig.h b/llvm/tools/llvm-objcopy/CommonConfig.h
index 131ce5c59114..ea39a6da2ba5 100644
--- a/llvm/tools/llvm-objcopy/CommonConfig.h
+++ b/llvm/tools/llvm-objcopy/CommonConfig.h
@@ -210,14 +210,7 @@ struct CommonConfig {
// Repeated options
std::vector<StringRef> AddSection;
std::vector<StringRef> DumpSection;
- std::vector<StringRef> RPathToAdd;
- std::vector<StringRef> RPathToPrepend;
- DenseMap<StringRef, StringRef> RPathsToUpdate;
- DenseMap<StringRef, StringRef> InstallNamesToUpdate;
- DenseSet<StringRef> RPathsToRemove;
-
- // install-name-tool's id option
- Optional<StringRef> SharedLibId;
+ std::vector<StringRef> UpdateSection;
// Section matchers
NameMatcher KeepSection;
@@ -239,23 +232,13 @@ struct CommonConfig {
StringMap<SectionFlagsUpdate> SetSectionFlags;
StringMap<StringRef> SymbolsToRename;
- // ELF entry point address expression. The input parameter is an entry point
- // address in the input ELF file. The entry address in the output file is
- // calculated with EntryExpr(input_address), when either --set-start or
- // --change-start is used.
- std::function<uint64_t(uint64_t)> EntryExpr;
-
// Symbol info specified by --add-symbol option.
std::vector<NewSymbolInfo> SymbolsToAdd;
// Boolean options
- bool AllowBrokenLinks = false;
bool DeterministicArchives = true;
bool ExtractDWO = false;
bool ExtractMainPartition = false;
- bool KeepFileSymbols = false;
- bool KeepUndefined = false;
- bool LocalizeHidden = false;
bool OnlyKeepDebug = false;
bool PreserveDates = false;
bool StripAll = false;
@@ -264,12 +247,9 @@ struct CommonConfig {
bool StripDebug = false;
bool StripNonAlloc = false;
bool StripSections = false;
- bool StripSwiftSymbols = false;
bool StripUnneeded = false;
bool Weaken = false;
bool DecompressDebugSections = false;
- // install-name-tool's --delete_all_rpaths
- bool RemoveAllRpaths = false;
DebugCompressionType CompressionType = DebugCompressionType::None;
};
diff --git a/llvm/tools/llvm-objcopy/ConfigManager.cpp b/llvm/tools/llvm-objcopy/ConfigManager.cpp
index 9f7d06b99418..2e5cf9357a52 100644
--- a/llvm/tools/llvm-objcopy/ConfigManager.cpp
+++ b/llvm/tools/llvm-objcopy/ConfigManager.cpp
@@ -39,7 +39,7 @@ enum ObjcopyID {
#include "ObjcopyOpts.inc"
#undef PREFIX
-static const opt::OptTable::Info ObjcopyInfoTable[] = {
+const opt::OptTable::Info ObjcopyInfoTable[] = {
#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \
HELPTEXT, METAVAR, VALUES) \
{OBJCOPY_##PREFIX, \
@@ -79,7 +79,7 @@ enum InstallNameToolID {
#include "InstallNameToolOpts.inc"
#undef PREFIX
-static const opt::OptTable::Info InstallNameToolInfoTable[] = {
+const opt::OptTable::Info InstallNameToolInfoTable[] = {
#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \
HELPTEXT, METAVAR, VALUES) \
{INSTALL_NAME_TOOL_##PREFIX, \
@@ -116,7 +116,7 @@ enum BitcodeStripID {
#include "BitcodeStripOpts.inc"
#undef PREFIX
-static const opt::OptTable::Info BitcodeStripInfoTable[] = {
+const opt::OptTable::Info BitcodeStripInfoTable[] = {
#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \
HELPTEXT, METAVAR, VALUES) \
{BITCODE_STRIP_##PREFIX, \
@@ -153,7 +153,7 @@ enum StripID {
#include "StripOpts.inc"
#undef PREFIX
-static const opt::OptTable::Info StripInfoTable[] = {
+const opt::OptTable::Info StripInfoTable[] = {
#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \
HELPTEXT, METAVAR, VALUES) \
{STRIP_##PREFIX, NAME, HELPTEXT, \
@@ -559,27 +559,21 @@ static Expected<NewSymbolInfo> parseNewSymbolInfo(StringRef FlagValue) {
}
Expected<const ELFConfig &> ConfigManager::getELFConfig() const {
- if (Common.StripSwiftSymbols || Common.KeepUndefined)
- return createStringError(llvm::errc::invalid_argument,
- "option not supported by llvm-objcopy for ELF");
-
return ELF;
}
Expected<const COFFConfig &> ConfigManager::getCOFFConfig() const {
- if (Common.AllowBrokenLinks || !Common.SplitDWO.empty() ||
- !Common.SymbolsPrefix.empty() || !Common.AllocSectionsPrefix.empty() ||
- !Common.DumpSection.empty() || !Common.KeepSection.empty() ||
- ELF.NewSymbolVisibility || !Common.SymbolsToGlobalize.empty() ||
+ if (!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() ||
+ !Common.AllocSectionsPrefix.empty() || !Common.DumpSection.empty() ||
+ !Common.KeepSection.empty() || !Common.SymbolsToGlobalize.empty() ||
!Common.SymbolsToKeep.empty() || !Common.SymbolsToLocalize.empty() ||
!Common.SymbolsToWeaken.empty() || !Common.SymbolsToKeepGlobal.empty() ||
!Common.SectionsToRename.empty() || !Common.SetSectionAlignment.empty() ||
- Common.ExtractDWO || Common.LocalizeHidden || Common.PreserveDates ||
- Common.StripDWO || Common.StripNonAlloc || Common.StripSections ||
- Common.StripSwiftSymbols || Common.KeepUndefined || Common.Weaken ||
+ Common.ExtractDWO || Common.PreserveDates || Common.StripDWO ||
+ Common.StripNonAlloc || Common.StripSections || Common.Weaken ||
Common.DecompressDebugSections ||
Common.DiscardMode == DiscardType::Locals ||
- !Common.SymbolsToAdd.empty() || Common.EntryExpr) {
+ !Common.SymbolsToAdd.empty()) {
return createStringError(llvm::errc::invalid_argument,
"option not supported by llvm-objcopy for COFF");
}
@@ -588,19 +582,18 @@ Expected<const COFFConfig &> ConfigManager::getCOFFConfig() const {
}
Expected<const MachOConfig &> ConfigManager::getMachOConfig() const {
- if (Common.AllowBrokenLinks || !Common.SplitDWO.empty() ||
- !Common.SymbolsPrefix.empty() || !Common.AllocSectionsPrefix.empty() ||
- !Common.KeepSection.empty() || ELF.NewSymbolVisibility ||
+ if (!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() ||
+ !Common.AllocSectionsPrefix.empty() || !Common.KeepSection.empty() ||
!Common.SymbolsToGlobalize.empty() || !Common.SymbolsToKeep.empty() ||
!Common.SymbolsToLocalize.empty() || !Common.SymbolsToWeaken.empty() ||
!Common.SymbolsToKeepGlobal.empty() || !Common.SectionsToRename.empty() ||
!Common.UnneededSymbolsToRemove.empty() ||
!Common.SetSectionAlignment.empty() || !Common.SetSectionFlags.empty() ||
- Common.ExtractDWO || Common.LocalizeHidden || Common.PreserveDates ||
- Common.StripAllGNU || Common.StripDWO || Common.StripNonAlloc ||
- Common.StripSections || Common.Weaken || Common.DecompressDebugSections ||
- Common.StripUnneeded || Common.DiscardMode == DiscardType::Locals ||
- !Common.SymbolsToAdd.empty() || Common.EntryExpr) {
+ Common.ExtractDWO || Common.PreserveDates || Common.StripAllGNU ||
+ Common.StripDWO || Common.StripNonAlloc || Common.StripSections ||
+ Common.Weaken || Common.DecompressDebugSections || Common.StripUnneeded ||
+ Common.DiscardMode == DiscardType::Locals ||
+ !Common.SymbolsToAdd.empty()) {
return createStringError(llvm::errc::invalid_argument,
"option not supported by llvm-objcopy for MachO");
}
@@ -612,8 +605,7 @@ Expected<const WasmConfig &> ConfigManager::getWasmConfig() const {
if (!Common.AddGnuDebugLink.empty() || Common.ExtractPartition ||
!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() ||
!Common.AllocSectionsPrefix.empty() ||
- Common.DiscardMode != DiscardType::None || ELF.NewSymbolVisibility ||
- !Common.SymbolsToAdd.empty() || !Common.RPathToAdd.empty() ||
+ Common.DiscardMode != DiscardType::None || !Common.SymbolsToAdd.empty() ||
!Common.SymbolsToGlobalize.empty() || !Common.SymbolsToLocalize.empty() ||
!Common.SymbolsToKeep.empty() || !Common.SymbolsToRemove.empty() ||
!Common.UnneededSymbolsToRemove.empty() ||
@@ -684,6 +676,7 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
ConfigManager ConfigMgr;
CommonConfig &Config = ConfigMgr.Common;
ELFConfig &ELFConfig = ConfigMgr.ELF;
+ MachOConfig &MachOConfig = ConfigMgr.MachO;
Config.InputFilename = Positional[0];
Config.OutputFilename = Positional[Positional.size() == 1 ? 0 : 1];
if (InputArgs.hasArg(OBJCOPY_target) &&
@@ -887,6 +880,17 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
"bad format for --add-section: missing file name");
Config.AddSection.push_back(ArgValue);
}
+ for (auto Arg : InputArgs.filtered(OBJCOPY_update_section)) {
+ StringRef ArgValue(Arg->getValue());
+ if (!ArgValue.contains('='))
+ return createStringError(errc::invalid_argument,
+ "bad format for --update-section: missing '='");
+ if (ArgValue.split("=").second.empty())
+ return createStringError(
+ errc::invalid_argument,
+ "bad format for --update-section: missing file name");
+ Config.UpdateSection.push_back(ArgValue);
+ }
for (auto *Arg : InputArgs.filtered(OBJCOPY_dump_section)) {
StringRef Value(Arg->getValue());
if (Value.split('=').second.empty())
@@ -905,7 +909,7 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
Config.ExtractDWO = InputArgs.hasArg(OBJCOPY_extract_dwo);
Config.ExtractMainPartition =
InputArgs.hasArg(OBJCOPY_extract_main_partition);
- Config.LocalizeHidden = InputArgs.hasArg(OBJCOPY_localize_hidden);
+ ELFConfig.LocalizeHidden = InputArgs.hasArg(OBJCOPY_localize_hidden);
Config.Weaken = InputArgs.hasArg(OBJCOPY_weaken);
if (InputArgs.hasArg(OBJCOPY_discard_all, OBJCOPY_discard_locals))
Config.DiscardMode =
@@ -913,13 +917,13 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
? DiscardType::All
: DiscardType::Locals;
Config.OnlyKeepDebug = InputArgs.hasArg(OBJCOPY_only_keep_debug);
- Config.KeepFileSymbols = InputArgs.hasArg(OBJCOPY_keep_file_symbols);
- Config.KeepUndefined = InputArgs.hasArg(OBJCOPY_keep_undefined);
+ ELFConfig.KeepFileSymbols = InputArgs.hasArg(OBJCOPY_keep_file_symbols);
+ MachOConfig.KeepUndefined = InputArgs.hasArg(OBJCOPY_keep_undefined);
Config.DecompressDebugSections =
InputArgs.hasArg(OBJCOPY_decompress_debug_sections);
if (Config.DiscardMode == DiscardType::All) {
Config.StripDebug = true;
- Config.KeepFileSymbols = true;
+ ELFConfig.KeepFileSymbols = true;
}
for (auto Arg : InputArgs.filtered(OBJCOPY_localize_symbol))
if (Error E = Config.SymbolsToLocalize.addMatcher(NameOrPattern::create(
@@ -993,7 +997,7 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
Config.SymbolsToAdd.push_back(*SymInfo);
}
- Config.AllowBrokenLinks = InputArgs.hasArg(OBJCOPY_allow_broken_links);
+ ELFConfig.AllowBrokenLinks = InputArgs.hasArg(OBJCOPY_allow_broken_links);
Config.DeterministicArchives = InputArgs.hasFlag(
OBJCOPY_enable_deterministic_archives,
@@ -1013,16 +1017,16 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
return createStringError(
EAddr.getError(), "bad entry point address: '%s'", Arg->getValue());
- Config.EntryExpr = [EAddr](uint64_t) { return *EAddr; };
+ ELFConfig.EntryExpr = [EAddr](uint64_t) { return *EAddr; };
} else if (Arg->getOption().matches(OBJCOPY_change_start)) {
auto EIncr = getAsInteger<int64_t>(Arg->getValue());
if (!EIncr)
return createStringError(EIncr.getError(),
"bad entry point increment: '%s'",
Arg->getValue());
- auto Expr = Config.EntryExpr ? std::move(Config.EntryExpr)
- : [](uint64_t A) { return A; };
- Config.EntryExpr = [Expr, EIncr](uint64_t EAddr) {
+ auto Expr = ELFConfig.EntryExpr ? std::move(ELFConfig.EntryExpr)
+ : [](uint64_t A) { return A; };
+ ELFConfig.EntryExpr = [Expr, EIncr](uint64_t EAddr) {
return Expr(EAddr) + *EIncr;
};
}
@@ -1057,6 +1061,7 @@ objcopy::parseInstallNameToolOptions(ArrayRef<const char *> ArgsArr) {
DriverConfig DC;
ConfigManager ConfigMgr;
CommonConfig &Config = ConfigMgr.Common;
+ MachOConfig &MachOConfig = ConfigMgr.MachO;
InstallNameToolOptTable T;
unsigned MissingArgumentIndex, MissingArgumentCount;
llvm::opt::InputArgList InputArgs =
@@ -1087,27 +1092,27 @@ objcopy::parseInstallNameToolOptions(ArrayRef<const char *> ArgsArr) {
}
for (auto Arg : InputArgs.filtered(INSTALL_NAME_TOOL_add_rpath))
- Config.RPathToAdd.push_back(Arg->getValue());
+ MachOConfig.RPathToAdd.push_back(Arg->getValue());
for (auto *Arg : InputArgs.filtered(INSTALL_NAME_TOOL_prepend_rpath))
- Config.RPathToPrepend.push_back(Arg->getValue());
+ MachOConfig.RPathToPrepend.push_back(Arg->getValue());
for (auto Arg : InputArgs.filtered(INSTALL_NAME_TOOL_delete_rpath)) {
StringRef RPath = Arg->getValue();
// Cannot add and delete the same rpath at the same time.
- if (is_contained(Config.RPathToAdd, RPath))
+ if (is_contained(MachOConfig.RPathToAdd, RPath))
return createStringError(
errc::invalid_argument,
"cannot specify both -add_rpath '%s' and -delete_rpath '%s'",
RPath.str().c_str(), RPath.str().c_str());
- if (is_contained(Config.RPathToPrepend, RPath))
+ if (is_contained(MachOConfig.RPathToPrepend, RPath))
return createStringError(
errc::invalid_argument,
"cannot specify both -prepend_rpath '%s' and -delete_rpath '%s'",
RPath.str().c_str(), RPath.str().c_str());
- Config.RPathsToRemove.insert(RPath);
+ MachOConfig.RPathsToRemove.insert(RPath);
}
for (auto *Arg : InputArgs.filtered(INSTALL_NAME_TOOL_rpath)) {
@@ -1118,51 +1123,52 @@ objcopy::parseInstallNameToolOptions(ArrayRef<const char *> ArgsArr) {
// Cannot specify duplicate -rpath entries
auto It1 = find_if(
- Config.RPathsToUpdate,
+ MachOConfig.RPathsToUpdate,
[&Match](const DenseMap<StringRef, StringRef>::value_type &OldNew) {
return Match(OldNew.getFirst()) || Match(OldNew.getSecond());
});
- if (It1 != Config.RPathsToUpdate.end())
+ if (It1 != MachOConfig.RPathsToUpdate.end())
return createStringError(errc::invalid_argument,
"cannot specify both -rpath '" +
It1->getFirst() + "' '" + It1->getSecond() +
"' and -rpath '" + Old + "' '" + New + "'");
// Cannot specify the same rpath under both -delete_rpath and -rpath
- auto It2 = find_if(Config.RPathsToRemove, Match);
- if (It2 != Config.RPathsToRemove.end())
+ auto It2 = find_if(MachOConfig.RPathsToRemove, Match);
+ if (It2 != MachOConfig.RPathsToRemove.end())
return createStringError(errc::invalid_argument,
"cannot specify both -delete_rpath '" + *It2 +
"' and -rpath '" + Old + "' '" + New + "'");
// Cannot specify the same rpath under both -add_rpath and -rpath
- auto It3 = find_if(Config.RPathToAdd, Match);
- if (It3 != Config.RPathToAdd.end())
+ auto It3 = find_if(MachOConfig.RPathToAdd, Match);
+ if (It3 != MachOConfig.RPathToAdd.end())
return createStringError(errc::invalid_argument,
"cannot specify both -add_rpath '" + *It3 +
"' and -rpath '" + Old + "' '" + New + "'");
// Cannot specify the same rpath under both -prepend_rpath and -rpath.
- auto It4 = find_if(Config.RPathToPrepend, Match);
- if (It4 != Config.RPathToPrepend.end())
+ auto It4 = find_if(MachOConfig.RPathToPrepend, Match);
+ if (It4 != MachOConfig.RPathToPrepend.end())
return createStringError(errc::invalid_argument,
"cannot specify both -prepend_rpath '" + *It4 +
"' and -rpath '" + Old + "' '" + New + "'");
- Config.RPathsToUpdate.insert({Old, New});
+ MachOConfig.RPathsToUpdate.insert({Old, New});
}
if (auto *Arg = InputArgs.getLastArg(INSTALL_NAME_TOOL_id)) {
- Config.SharedLibId = Arg->getValue();
- if (Config.SharedLibId->empty())
+ MachOConfig.SharedLibId = Arg->getValue();
+ if (MachOConfig.SharedLibId->empty())
return createStringError(errc::invalid_argument,
"cannot specify an empty id");
}
for (auto *Arg : InputArgs.filtered(INSTALL_NAME_TOOL_change))
- Config.InstallNamesToUpdate.insert({Arg->getValue(0), Arg->getValue(1)});
+ MachOConfig.InstallNamesToUpdate.insert(
+ {Arg->getValue(0), Arg->getValue(1)});
- Config.RemoveAllRpaths =
+ MachOConfig.RemoveAllRpaths =
InputArgs.hasArg(INSTALL_NAME_TOOL_delete_all_rpaths);
SmallVector<StringRef, 2> Positional;
@@ -1281,6 +1287,8 @@ objcopy::parseStripOptions(ArrayRef<const char *> RawArgsArr,
ConfigManager ConfigMgr;
CommonConfig &Config = ConfigMgr.Common;
+ ELFConfig &ELFConfig = ConfigMgr.ELF;
+ MachOConfig &MachOConfig = ConfigMgr.MachO;
if (InputArgs.hasArg(STRIP_regex) && InputArgs.hasArg(STRIP_wildcard))
return createStringError(errc::invalid_argument,
@@ -1292,7 +1300,7 @@ objcopy::parseStripOptions(ArrayRef<const char *> RawArgsArr,
: InputArgs.hasArg(STRIP_wildcard)
? MatchStyle::Wildcard
: MatchStyle::Literal;
- Config.AllowBrokenLinks = InputArgs.hasArg(STRIP_allow_broken_links);
+ ELFConfig.AllowBrokenLinks = InputArgs.hasArg(STRIP_allow_broken_links);
Config.StripDebug = InputArgs.hasArg(STRIP_strip_debug);
if (InputArgs.hasArg(STRIP_discard_all, STRIP_discard_locals))
@@ -1305,10 +1313,10 @@ objcopy::parseStripOptions(ArrayRef<const char *> RawArgsArr,
if (auto Arg = InputArgs.getLastArg(STRIP_strip_all, STRIP_no_strip_all))
Config.StripAll = Arg->getOption().getID() == STRIP_strip_all;
Config.StripAllGNU = InputArgs.hasArg(STRIP_strip_all_gnu);
- Config.StripSwiftSymbols = InputArgs.hasArg(STRIP_strip_swift_symbols);
+ MachOConfig.StripSwiftSymbols = InputArgs.hasArg(STRIP_strip_swift_symbols);
Config.OnlyKeepDebug = InputArgs.hasArg(STRIP_only_keep_debug);
- Config.KeepFileSymbols = InputArgs.hasArg(STRIP_keep_file_symbols);
- Config.KeepUndefined = InputArgs.hasArg(STRIP_keep_undefined);
+ ELFConfig.KeepFileSymbols = InputArgs.hasArg(STRIP_keep_file_symbols);
+ MachOConfig.KeepUndefined = InputArgs.hasArg(STRIP_keep_undefined);
for (auto Arg : InputArgs.filtered(STRIP_keep_section))
if (Error E = Config.KeepSection.addMatcher(NameOrPattern::create(
@@ -1337,7 +1345,7 @@ objcopy::parseStripOptions(ArrayRef<const char *> RawArgsArr,
if (Config.DiscardMode == DiscardType::All) {
Config.StripDebug = true;
- Config.KeepFileSymbols = true;
+ ELFConfig.KeepFileSymbols = true;
}
Config.DeterministicArchives =
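With the format-specific options moved into ELFConfig and MachOConfig, the getters above act as compatibility gates: each returns an error if any option the target format cannot honor was set. A hedged sketch of how a driver might consume them; MachOFile and OutStream are illustrative names, while executeObjcopyOnBinary matches the signature that appears later in this diff:

    // Sketch: fail early when options incompatible with the format were given.
    Expected<const MachOConfig &> MOCOrErr = ConfigMgr.getMachOConfig();
    if (!MOCOrErr)
      return MOCOrErr.takeError(); // "option not supported by llvm-objcopy for MachO"
    return objcopy::macho::executeObjcopyOnBinary(ConfigMgr.Common, *MOCOrErr,
                                                  MachOFile, OutStream);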
diff --git a/llvm/tools/llvm-objcopy/ELF/ELFConfig.h b/llvm/tools/llvm-objcopy/ELF/ELFConfig.h
index 42d407da17ff..229a8d61fb83 100644
--- a/llvm/tools/llvm-objcopy/ELF/ELFConfig.h
+++ b/llvm/tools/llvm-objcopy/ELF/ELFConfig.h
@@ -20,6 +20,16 @@ namespace objcopy {
// ELF specific configuration for copying/stripping a single file.
struct ELFConfig {
uint8_t NewSymbolVisibility = (uint8_t)ELF::STV_DEFAULT;
+
+ // ELF entry point address expression. The input parameter is an entry point
+ // address in the input ELF file. The entry address in the output file is
+ // calculated with EntryExpr(input_address), when either --set-start or
+ // --change-start is used.
+ std::function<uint64_t(uint64_t)> EntryExpr;
+
+ bool AllowBrokenLinks = false;
+ bool KeepFileSymbols = false;
+ bool LocalizeHidden = false;
};
} // namespace objcopy
diff --git a/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp b/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
index 986eeca6256c..16de84a961b5 100644
--- a/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
+++ b/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
@@ -204,8 +204,7 @@ static bool isCompressable(const SectionBase &Sec) {
}
static Error replaceDebugSections(
- Object &Obj, SectionPred &RemovePred,
- function_ref<bool(const SectionBase &)> ShouldReplace,
+ Object &Obj, function_ref<bool(const SectionBase &)> ShouldReplace,
function_ref<Expected<SectionBase *>(const SectionBase *)> AddSection) {
// Build a list of the debug sections we are going to replace.
// We can't call `AddSection` while iterating over sections,
@@ -225,17 +224,7 @@ static Error replaceDebugSections(
FromTo[S] = *NewSection;
}
- // Now we want to update the target sections of relocation
- // sections. Also we will update the relocations themselves
- // to update the symbol references.
- for (auto &Sec : Obj.sections())
- Sec.replaceSectionReferences(FromTo);
-
- RemovePred = [ShouldReplace, RemovePred](const SectionBase &Sec) {
- return ShouldReplace(Sec) || RemovePred(Sec);
- };
-
- return Error::success();
+ return Obj.replaceSections(FromTo);
}
static bool isUnneededSymbol(const Symbol &Sym) {
@@ -244,7 +233,8 @@ static bool isUnneededSymbol(const Symbol &Sym) {
Sym.Type != STT_SECTION;
}
-static Error updateAndRemoveSymbols(const CommonConfig &Config, Object &Obj) {
+static Error updateAndRemoveSymbols(const CommonConfig &Config,
+ const ELFConfig &ELFConfig, Object &Obj) {
// TODO: update or remove symbols only if there is an option that affects
// them.
if (!Obj.SymbolTable)
@@ -254,7 +244,7 @@ static Error updateAndRemoveSymbols(const CommonConfig &Config, Object &Obj) {
// Common and undefined symbols don't make sense as local symbols, and can
// even cause crashes if we localize those, so skip them.
if (!Sym.isCommon() && Sym.getShndx() != SHN_UNDEF &&
- ((Config.LocalizeHidden &&
+ ((ELFConfig.LocalizeHidden &&
(Sym.Visibility == STV_HIDDEN || Sym.Visibility == STV_INTERNAL)) ||
Config.SymbolsToLocalize.matches(Sym.Name)))
Sym.Binding = STB_LOCAL;
@@ -304,7 +294,7 @@ static Error updateAndRemoveSymbols(const CommonConfig &Config, Object &Obj) {
auto RemoveSymbolsPred = [&](const Symbol &Sym) {
if (Config.SymbolsToKeep.matches(Sym.Name) ||
- (Config.KeepFileSymbols && Sym.Type == STT_FILE))
+ (ELFConfig.KeepFileSymbols && Sym.Type == STT_FILE))
return false;
if ((Config.DiscardMode == DiscardType::All ||
@@ -339,7 +329,8 @@ static Error updateAndRemoveSymbols(const CommonConfig &Config, Object &Obj) {
return Obj.removeSymbols(RemoveSymbolsPred);
}
-static Error replaceAndRemoveSections(const CommonConfig &Config, Object &Obj) {
+static Error replaceAndRemoveSections(const CommonConfig &Config,
+ const ELFConfig &ELFConfig, Object &Obj) {
SectionPred RemovePred = [](const SectionBase &) { return false; };
// Removes:
@@ -465,7 +456,7 @@ static Error replaceAndRemoveSections(const CommonConfig &Config, Object &Obj) {
// and at least one of those symbols is present
// (equivalently, the updated symbol table is not empty)
// the symbol table and the string table should not be removed.
- if ((!Config.SymbolsToKeep.empty() || Config.KeepFileSymbols) &&
+ if ((!Config.SymbolsToKeep.empty() || ELFConfig.KeepFileSymbols) &&
Obj.SymbolTable && !Obj.SymbolTable->empty()) {
RemovePred = [&Obj, RemovePred](const SectionBase &Sec) {
if (&Sec == Obj.SymbolTable || &Sec == Obj.SymbolTable->getStrTab())
@@ -474,9 +465,12 @@ static Error replaceAndRemoveSections(const CommonConfig &Config, Object &Obj) {
};
}
+ if (Error E = Obj.removeSections(ELFConfig.AllowBrokenLinks, RemovePred))
+ return E;
+
if (Config.CompressionType != DebugCompressionType::None) {
if (Error Err = replaceDebugSections(
- Obj, RemovePred, isCompressable,
+ Obj, isCompressable,
[&Config, &Obj](const SectionBase *S) -> Expected<SectionBase *> {
Expected<CompressedSection> NewSection =
CompressedSection::create(*S, Config.CompressionType);
@@ -488,7 +482,7 @@ static Error replaceAndRemoveSections(const CommonConfig &Config, Object &Obj) {
return Err;
} else if (Config.DecompressDebugSections) {
if (Error Err = replaceDebugSections(
- Obj, RemovePred,
+ Obj,
[](const SectionBase &S) { return isa<CompressedSection>(&S); },
[&Obj](const SectionBase *S) {
const CompressedSection *CS = cast<CompressedSection>(S);
@@ -497,7 +491,7 @@ static Error replaceAndRemoveSections(const CommonConfig &Config, Object &Obj) {
return Err;
}
- return Obj.removeSections(Config.AllowBrokenLinks, RemovePred);
+ return Error::success();
}
// Add symbol to the Object symbol table with the specified properties.
@@ -554,6 +548,22 @@ static void addSymbol(Object &Obj, const NewSymbolInfo &SymInfo,
Sec ? (uint16_t)SYMBOL_SIMPLE_INDEX : (uint16_t)SHN_ABS, 0);
}
+static Error
+handleUserSection(StringRef Flag,
+ function_ref<Error(StringRef, ArrayRef<uint8_t>)> F) {
+ std::pair<StringRef, StringRef> SecPair = Flag.split("=");
+ StringRef SecName = SecPair.first;
+ StringRef File = SecPair.second;
+ ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr = MemoryBuffer::getFile(File);
+ if (!BufOrErr)
+ return createFileError(File, errorCodeToError(BufOrErr.getError()));
+ std::unique_ptr<MemoryBuffer> Buf = std::move(*BufOrErr);
+ ArrayRef<uint8_t> Data(
+ reinterpret_cast<const uint8_t *>(Buf->getBufferStart()),
+ Buf->getBufferSize());
+ return F(SecName, Data);
+}
+
// This function handles the high level operations of GNU objcopy including
// handling command line options. It's important to outline certain properties
// we expect to hold of the command line operations. Any operation that "keeps"
@@ -570,7 +580,7 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
if (!Config.SplitDWO.empty() && Config.ExtractDWO) {
return Obj.removeSections(
- Config.AllowBrokenLinks,
+ ELFConfig.AllowBrokenLinks,
[&Obj](const SectionBase &Sec) { return onlyKeepDWOPred(Obj, Sec); });
}
@@ -587,21 +597,39 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
// remove the relocation sections before removing the symbols. That allows
// us to avoid reporting the inappropriate errors about removing symbols
// named in relocations.
- if (Error E = replaceAndRemoveSections(Config, Obj))
+ if (Error E = replaceAndRemoveSections(Config, ELFConfig, Obj))
return E;
- if (Error E = updateAndRemoveSymbols(Config, Obj))
+ if (Error E = updateAndRemoveSymbols(Config, ELFConfig, Obj))
return E;
if (!Config.SectionsToRename.empty()) {
+ std::vector<RelocationSectionBase *> RelocSections;
+ DenseSet<SectionBase *> RenamedSections;
for (SectionBase &Sec : Obj.sections()) {
+ auto *RelocSec = dyn_cast<RelocationSectionBase>(&Sec);
const auto Iter = Config.SectionsToRename.find(Sec.Name);
if (Iter != Config.SectionsToRename.end()) {
const SectionRename &SR = Iter->second;
Sec.Name = std::string(SR.NewName);
if (SR.NewFlags.hasValue())
setSectionFlagsAndType(Sec, SR.NewFlags.getValue());
- }
+ RenamedSections.insert(&Sec);
+ } else if (RelocSec && !(Sec.Flags & SHF_ALLOC))
+ // Postpone processing of relocation sections that are not explicitly named
+ // by a '--rename-section' option until after their target sections have
+ // been renamed.
+ // Dynamic relocation sections (i.e. those with SHF_ALLOC) should be renamed
+ // only when named explicitly; otherwise renaming, for example, '.got.plt'
+ // would also affect '.rela.plt', which is not desirable.
+ RelocSections.push_back(RelocSec);
+ }
+
+ // Rename relocation sections according to their target sections.
+ for (RelocationSectionBase *RelocSec : RelocSections) {
+ auto Iter = RenamedSections.find(RelocSec->getSection());
+ if (Iter != RenamedSections.end())
+ RelocSec->Name = (RelocSec->getNamePrefix() + (*Iter)->Name).str();
}
}
@@ -624,27 +652,16 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
// .rela.prefix.plt since GNU objcopy does so.
const SectionBase *TargetSec = RelocSec->getSection();
if (TargetSec && (TargetSec->Flags & SHF_ALLOC)) {
- StringRef prefix;
- switch (Sec.Type) {
- case SHT_REL:
- prefix = ".rel";
- break;
- case SHT_RELA:
- prefix = ".rela";
- break;
- default:
- llvm_unreachable("not a relocation section");
- }
-
// If the relocation section comes *after* the target section, we
// don't add Config.AllocSectionsPrefix because we've already added
// the prefix to TargetSec->Name. Otherwise, if the relocation
// section comes *before* the target section, we add the prefix.
if (PrefixedSections.count(TargetSec))
- Sec.Name = (prefix + TargetSec->Name).str();
+ Sec.Name = (RelocSec->getNamePrefix() + TargetSec->Name).str();
else
- Sec.Name =
- (prefix + Config.AllocSectionsPrefix + TargetSec->Name).str();
+ Sec.Name = (RelocSec->getNamePrefix() + Config.AllocSectionsPrefix +
+ TargetSec->Name)
+ .str();
}
}
}
@@ -664,21 +681,23 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
Sec.Type = SHT_NOBITS;
for (const auto &Flag : Config.AddSection) {
- std::pair<StringRef, StringRef> SecPair = Flag.split("=");
- StringRef SecName = SecPair.first;
- StringRef File = SecPair.second;
- ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
- MemoryBuffer::getFile(File);
- if (!BufOrErr)
- return createFileError(File, errorCodeToError(BufOrErr.getError()));
- std::unique_ptr<MemoryBuffer> Buf = std::move(*BufOrErr);
- ArrayRef<uint8_t> Data(
- reinterpret_cast<const uint8_t *>(Buf->getBufferStart()),
- Buf->getBufferSize());
- OwnedDataSection &NewSection =
- Obj.addSection<OwnedDataSection>(SecName, Data);
- if (SecName.startswith(".note") && SecName != ".note.GNU-stack")
- NewSection.Type = SHT_NOTE;
+ auto AddSection = [&](StringRef Name, ArrayRef<uint8_t> Data) {
+ OwnedDataSection &NewSection =
+ Obj.addSection<OwnedDataSection>(Name, Data);
+ if (Name.startswith(".note") && Name != ".note.GNU-stack")
+ NewSection.Type = SHT_NOTE;
+ return Error::success();
+ };
+ if (Error E = handleUserSection(Flag, AddSection))
+ return E;
+ }
+
+ for (StringRef Flag : Config.UpdateSection) {
+ auto UpdateSection = [&](StringRef Name, ArrayRef<uint8_t> Data) {
+ return Obj.updateSection(Name, Data);
+ };
+ if (Error E = handleUserSection(Flag, UpdateSection))
+ return E;
}
if (!Config.AddGnuDebugLink.empty())
@@ -705,8 +724,8 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
}
}
- if (Config.EntryExpr)
- Obj.Entry = Config.EntryExpr(Obj.Entry);
+ if (ELFConfig.EntryExpr)
+ Obj.Entry = ELFConfig.EntryExpr(Obj.Entry);
return Error::success();
}
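The new handleUserSection() helper factors out the 'NAME=FILE' splitting and file reading that --add-section already performed, so --update-section (for example --update-section .rodata.cfg=new.bin, an illustrative invocation) only has to supply a callback. A minimal sketch of the pattern used in the hunk above:

    // Each NAME=FILE flag is split, the file is read into memory, and the
    // callback receives the section name together with the raw bytes.
    for (StringRef Flag : Config.UpdateSection)
      if (Error E = handleUserSection(
              Flag, [&](StringRef Name, ArrayRef<uint8_t> Data) {
                return Obj.updateSection(Name, Data);
              }))
        return E;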
diff --git a/llvm/tools/llvm-objcopy/ELF/Object.cpp b/llvm/tools/llvm-objcopy/ELF/Object.cpp
index ba91d08e5540..3db5028e85f7 100644
--- a/llvm/tools/llvm-objcopy/ELF/Object.cpp
+++ b/llvm/tools/llvm-objcopy/ELF/Object.cpp
@@ -893,6 +893,17 @@ Error SymbolTableSection::accept(MutableSectionVisitor &Visitor) {
return Visitor.visit(*this);
}
+StringRef RelocationSectionBase::getNamePrefix() const {
+ switch (Type) {
+ case SHT_REL:
+ return ".rel";
+ case SHT_RELA:
+ return ".rela";
+ default:
+ llvm_unreachable("not a relocation section");
+ }
+}
+
Error RelocationSection::removeSectionReferences(
bool AllowBrokenLinks, function_ref<bool(const SectionBase *)> ToRemove) {
if (ToRemove(Symbols)) {
@@ -1342,13 +1353,16 @@ void IHexELFBuilder::addDataSections() {
if (R.HexData.empty())
continue;
RecAddr = R.Addr + SegmentAddr + BaseAddr;
- if (!Section || Section->Addr + Section->Size != RecAddr)
- // OriginalOffset field is only used to sort section properly, so
- // instead of keeping track of real offset in IHEX file, we use
- // section number.
+ if (!Section || Section->Addr + Section->Size != RecAddr) {
+ // The OriginalOffset field is only used to sort sections before layout, and
+ // since layoutSections() and layoutSectionsForOnlyKeepDebug() use
+ // llvm::stable_sort(), there is no need to track the real offsets in the
+ // IHEX file; we can simply set it to a constant (zero).
Section = &Obj->addSection<OwnedDataSection>(
- ".sec" + std::to_string(SecNo++), RecAddr,
- ELF::SHF_ALLOC | ELF::SHF_WRITE, SecNo);
+ ".sec" + std::to_string(SecNo), RecAddr,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE, 0);
+ SecNo++;
+ }
Section->appendHexData(R.HexData);
break;
case IHexRecord::EndOfFile:
@@ -2093,6 +2107,17 @@ template <class ELFT> void ELFWriter<ELFT>::writeSegmentData() {
Size);
}
+ for (auto it : Obj.getUpdatedSections()) {
+ SectionBase *Sec = it.first;
+ ArrayRef<uint8_t> Data = it.second;
+
+ auto *Parent = Sec->ParentSegment;
+ assert(Parent && "This section should've been part of a segment.");
+ uint64_t Offset =
+ Sec->OriginalOffset - Parent->OriginalOffset + Parent->Offset;
+ llvm::copy(Data, Buf->getBufferStart() + Offset);
+ }
+
// Iterate over removed sections and overwrite their old data with zeroes.
for (auto &Sec : Obj.removedSections()) {
Segment *Parent = Sec.ParentSegment;
@@ -2110,6 +2135,37 @@ ELFWriter<ELFT>::ELFWriter(Object &Obj, raw_ostream &Buf, bool WSH,
: Writer(Obj, Buf), WriteSectionHeaders(WSH && Obj.HadShdrs),
OnlyKeepDebug(OnlyKeepDebug) {}
+Error Object::updateSection(StringRef Name, ArrayRef<uint8_t> Data) {
+ auto It = llvm::find_if(Sections,
+ [&](const SecPtr &Sec) { return Sec->Name == Name; });
+ if (It == Sections.end())
+ return createStringError(errc::invalid_argument, "section '%s' not found",
+ Name.str().c_str());
+
+ auto *OldSec = It->get();
+ if (!OldSec->hasContents())
+ return createStringError(
+ errc::invalid_argument,
+ "section '%s' can't be updated because it does not have contents",
+ Name.str().c_str());
+
+ if (Data.size() > OldSec->Size && OldSec->ParentSegment)
+ return createStringError(errc::invalid_argument,
+ "cannot fit data of size %zu into section '%s' "
+ "with size %zu that is part of a segment",
+ Data.size(), Name.str().c_str(), OldSec->Size);
+
+ if (!OldSec->ParentSegment) {
+ *It = std::make_unique<OwnedDataSection>(*OldSec, Data);
+ } else {
+ // The segment writer will be in charge of updating these contents.
+ OldSec->Size = Data.size();
+ UpdatedSections[OldSec] = Data;
+ }
+
+ return Error::success();
+}
+
Error Object::removeSections(
bool AllowBrokenLinks, std::function<bool(const SectionBase &)> ToRemove) {
@@ -2162,6 +2218,30 @@ Error Object::removeSections(
return Error::success();
}
+Error Object::replaceSections(
+ const DenseMap<SectionBase *, SectionBase *> &FromTo) {
+ auto SectionIndexLess = [](const SecPtr &Lhs, const SecPtr &Rhs) {
+ return Lhs->Index < Rhs->Index;
+ };
+ assert(llvm::is_sorted(Sections, SectionIndexLess) &&
+ "Sections are expected to be sorted by Index");
+ // Set the indices of the new sections so that they can later be sorted into
+ // the positions of the removed ones.
+ for (auto &I : FromTo)
+ I.second->Index = I.first->Index;
+
+ // Notify all sections about the replacement.
+ for (auto &Sec : Sections)
+ Sec->replaceSectionReferences(FromTo);
+
+ if (Error E = removeSections(
+ /*AllowBrokenLinks=*/false,
+ [=](const SectionBase &Sec) { return FromTo.count(&Sec) > 0; }))
+ return E;
+ llvm::sort(Sections, SectionIndexLess);
+ return Error::success();
+}
+
Error Object::removeSymbols(function_ref<bool(const Symbol &)> ToRemove) {
if (SymbolTable)
for (const SecPtr &Sec : Sections)
@@ -2200,20 +2280,6 @@ Error Object::addNewSymbolTable() {
return Error::success();
}
-void Object::sortSections() {
- // Use stable_sort to maintain the original ordering as closely as possible.
- llvm::stable_sort(Sections, [](const SecPtr &A, const SecPtr &B) {
- // Put SHT_GROUP sections first, since group section headers must come
- // before the sections they contain. This also matches what GNU objcopy
- // does.
- if (A->Type != B->Type &&
- (A->Type == ELF::SHT_GROUP || B->Type == ELF::SHT_GROUP))
- return A->Type == ELF::SHT_GROUP;
- // For all other sections, sort by offset order.
- return A->OriginalOffset < B->OriginalOffset;
- });
-}
-
// Orders segments such that if x = y->ParentSegment then y comes before x.
static void orderSegments(std::vector<Segment *> &Segments) {
llvm::stable_sort(Segments, compareSegmentsByOffset);
@@ -2262,6 +2328,9 @@ static uint64_t layoutSections(Range Sections, uint64_t Offset) {
// the offset from the start of the segment. Using the offset from the start
// of the segment we can assign a new offset to the section. For sections not
// covered by segments we can just bump Offset to the next valid location.
+ // While it is not strictly necessary, lay out the sections in order of their
+ // original offsets so that the output resembles the input file as closely as
+ // possible.
+ std::vector<SectionBase *> OutOfSegmentSections;
uint32_t Index = 1;
for (auto &Sec : Sections) {
Sec.Index = Index++;
@@ -2269,12 +2338,19 @@ static uint64_t layoutSections(Range Sections, uint64_t Offset) {
auto Segment = *Sec.ParentSegment;
Sec.Offset =
Segment.Offset + (Sec.OriginalOffset - Segment.OriginalOffset);
- } else {
- Offset = alignTo(Offset, Sec.Align == 0 ? 1 : Sec.Align);
- Sec.Offset = Offset;
- if (Sec.Type != SHT_NOBITS)
- Offset += Sec.Size;
- }
+ } else
+ OutOfSegmentSections.push_back(&Sec);
+ }
+
+ llvm::stable_sort(OutOfSegmentSections,
+ [](const SectionBase *Lhs, const SectionBase *Rhs) {
+ return Lhs->OriginalOffset < Rhs->OriginalOffset;
+ });
+ for (auto *Sec : OutOfSegmentSections) {
+ Offset = alignTo(Offset, Sec->Align == 0 ? 1 : Sec->Align);
+ Sec->Offset = Offset;
+ if (Sec->Type != SHT_NOBITS)
+ Offset += Sec->Size;
}
return Offset;
}
@@ -2282,38 +2358,49 @@ static uint64_t layoutSections(Range Sections, uint64_t Offset) {
// Rewrite sh_offset after some sections are changed to SHT_NOBITS and thus
// occupy no space in the file.
static uint64_t layoutSectionsForOnlyKeepDebug(Object &Obj, uint64_t Off) {
+ // The layout algorithm requires the sections to be handled in the order of
+ // their offsets in the input file, at least inside segments.
+ std::vector<SectionBase *> Sections;
+ Sections.reserve(Obj.sections().size());
uint32_t Index = 1;
for (auto &Sec : Obj.sections()) {
Sec.Index = Index++;
-
- auto *FirstSec = Sec.ParentSegment && Sec.ParentSegment->Type == PT_LOAD
- ? Sec.ParentSegment->firstSection()
+ Sections.push_back(&Sec);
+ }
+ llvm::stable_sort(Sections,
+ [](const SectionBase *Lhs, const SectionBase *Rhs) {
+ return Lhs->OriginalOffset < Rhs->OriginalOffset;
+ });
+
+ for (auto *Sec : Sections) {
+ auto *FirstSec = Sec->ParentSegment && Sec->ParentSegment->Type == PT_LOAD
+ ? Sec->ParentSegment->firstSection()
: nullptr;
// The first section in a PT_LOAD has to have congruent offset and address
// modulo the alignment, which usually equals the maximum page size.
- if (FirstSec && FirstSec == &Sec)
- Off = alignTo(Off, Sec.ParentSegment->Align, Sec.Addr);
+ if (FirstSec && FirstSec == Sec)
+ Off = alignTo(Off, Sec->ParentSegment->Align, Sec->Addr);
// sh_offset is not significant for SHT_NOBITS sections, but the congruence
// rule must be followed if it is the first section in a PT_LOAD. Do not
// advance Off.
- if (Sec.Type == SHT_NOBITS) {
- Sec.Offset = Off;
+ if (Sec->Type == SHT_NOBITS) {
+ Sec->Offset = Off;
continue;
}
if (!FirstSec) {
// FirstSec being nullptr generally means that Sec does not have the
// SHF_ALLOC flag.
- Off = Sec.Align ? alignTo(Off, Sec.Align) : Off;
- } else if (FirstSec != &Sec) {
+ Off = Sec->Align ? alignTo(Off, Sec->Align) : Off;
+ } else if (FirstSec != Sec) {
// The offset is relative to the first section in the PT_LOAD segment. Use
// sh_offset for non-SHF_ALLOC sections.
- Off = Sec.OriginalOffset - FirstSec->OriginalOffset + FirstSec->Offset;
+ Off = Sec->OriginalOffset - FirstSec->OriginalOffset + FirstSec->Offset;
}
- Sec.Offset = Off;
- Off += Sec.Size;
+ Sec->Offset = Off;
+ Off += Sec->Size;
}
return Off;
}
@@ -2460,7 +2547,6 @@ template <class ELFT> Error ELFWriter<ELFT>::finalize() {
if (Error E = removeUnneededSections(Obj))
return E;
- Obj.sortSections();
// We need to assign indexes before we perform layout because we need to know
// if we need large indexes or not. We can assign indexes first and check as
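Object::updateSection() in the hunks above distinguishes two cases: a section outside any segment is simply replaced by an OwnedDataSection holding the new bytes, while a section covered by a segment keeps its file slot and is recorded in UpdatedSections so that writeSegmentData() later overwrites the old bytes in place, which is why growing a segment-backed section is rejected. A small usage sketch; the section name and bytes are illustrative:

    // Sketch: replace the contents of an existing section of an ELF Object.
    std::vector<uint8_t> NewBytes = {0xDE, 0xAD, 0xBE, 0xEF};
    if (Error E = Obj.updateSection(".rodata.cfg", NewBytes))
      // Fails if the section is missing, has no contents (SHT_NOBITS/SHT_NULL),
      // or the new data does not fit into a segment-backed section.
      return E;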
diff --git a/llvm/tools/llvm-objcopy/ELF/Object.h b/llvm/tools/llvm-objcopy/ELF/Object.h
index 6fd26afa3ca1..811af4b51310 100644
--- a/llvm/tools/llvm-objcopy/ELF/Object.h
+++ b/llvm/tools/llvm-objcopy/ELF/Object.h
@@ -48,12 +48,12 @@ class Object;
struct Symbol;
class SectionTableRef {
- MutableArrayRef<std::unique_ptr<SectionBase>> Sections;
+ ArrayRef<std::unique_ptr<SectionBase>> Sections;
public:
- using iterator = pointee_iterator<std::unique_ptr<SectionBase> *>;
+ using iterator = pointee_iterator<const std::unique_ptr<SectionBase> *>;
- explicit SectionTableRef(MutableArrayRef<std::unique_ptr<SectionBase>> Secs)
+ explicit SectionTableRef(ArrayRef<std::unique_ptr<SectionBase>> Secs)
: Sections(Secs) {}
SectionTableRef(const SectionTableRef &) = default;
@@ -429,6 +429,7 @@ public:
virtual void markSymbols();
virtual void
replaceSectionReferences(const DenseMap<SectionBase *, SectionBase *> &);
+ virtual bool hasContents() const { return false; }
// Notify the section that it is subject to removal.
virtual void onRemove();
};
@@ -493,6 +494,9 @@ public:
function_ref<bool(const SectionBase *)> ToRemove) override;
Error initialize(SectionTableRef SecTable) override;
void finalize() override;
+ bool hasContents() const override {
+ return Type != ELF::SHT_NOBITS && Type != ELF::SHT_NULL;
+ }
};
class OwnedDataSection : public SectionBase {
@@ -518,9 +522,15 @@ public:
OriginalOffset = SecOff;
}
+ OwnedDataSection(SectionBase &S, ArrayRef<uint8_t> Data)
+ : SectionBase(S), Data(std::begin(Data), std::end(Data)) {
+ Size = Data.size();
+ }
+
void appendHexData(StringRef HexData);
Error accept(SectionVisitor &Sec) const override;
Error accept(MutableSectionVisitor &Visitor) override;
+ bool hasContents() const override { return true; }
};
class CompressedSection : public SectionBase {
@@ -745,6 +755,8 @@ public:
const SectionBase *getSection() const { return SecToApplyRel; }
void setSection(SectionBase *Sec) { SecToApplyRel = Sec; }
+ StringRef getNamePrefix() const;
+
static bool classof(const SectionBase *S) {
return S->OriginalType == ELF::SHT_REL || S->OriginalType == ELF::SHT_RELA;
}
@@ -1016,6 +1028,7 @@ private:
std::vector<SecPtr> Sections;
std::vector<SegPtr> Segments;
std::vector<SecPtr> RemovedSections;
+ DenseMap<SectionBase *, std::vector<uint8_t>> UpdatedSections;
static bool sectionIsAlloc(const SectionBase &Sec) {
return Sec.Flags & ELF::SHF_ALLOC;
@@ -1023,10 +1036,6 @@ private:
public:
template <class T>
- using Range = iterator_range<
- pointee_iterator<typename std::vector<std::unique_ptr<T>>::iterator>>;
-
- template <class T>
using ConstRange = iterator_range<pointee_iterator<
typename std::vector<std::unique_ptr<T>>::const_iterator>>;
@@ -1054,11 +1063,7 @@ public:
SymbolTableSection *SymbolTable = nullptr;
SectionIndexSection *SectionIndexTable = nullptr;
- void sortSections();
- SectionTableRef sections() { return SectionTableRef(Sections); }
- ConstRange<SectionBase> sections() const {
- return make_pointee_range(Sections);
- }
+ SectionTableRef sections() const { return SectionTableRef(Sections); }
iterator_range<
filter_iterator<pointee_iterator<std::vector<SecPtr>::const_iterator>,
decltype(&sectionIsAlloc)>>
@@ -1066,6 +1071,9 @@ public:
return make_filter_range(make_pointee_range(Sections), sectionIsAlloc);
}
+ const auto &getUpdatedSections() const { return UpdatedSections; }
+ Error updateSection(StringRef Name, ArrayRef<uint8_t> Data);
+
SectionBase *findSection(StringRef Name) {
auto SecIt =
find_if(Sections, [&](const SecPtr &Sec) { return Sec->Name == Name; });
@@ -1073,11 +1081,11 @@ public:
}
SectionTableRef removedSections() { return SectionTableRef(RemovedSections); }
- Range<Segment> segments() { return make_pointee_range(Segments); }
ConstRange<Segment> segments() const { return make_pointee_range(Segments); }
Error removeSections(bool AllowBrokenLinks,
std::function<bool(const SectionBase &)> ToRemove);
+ Error replaceSections(const DenseMap<SectionBase *, SectionBase *> &FromTo);
Error removeSymbols(function_ref<bool(const Symbol &)> ToRemove);
template <class T, class... Ts> T &addSection(Ts &&... Args) {
auto Sec = std::make_unique<T>(std::forward<Ts>(Args)...);
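The replaceDebugSections() rewrite earlier in this diff relies on the new Object::replaceSections() primitive declared above: a caller builds a map from old sections to their replacements, and the Object updates references, removes the originals, and restores the index order. A condensed sketch of such a caller, where ShouldReplace and MakeReplacement are hypothetical stand-ins for the compression or decompression helpers used in this patch:

    // Sketch: swap a set of sections for freshly created replacements.
    DenseMap<SectionBase *, SectionBase *> FromTo;
    for (SectionBase &Sec : Obj.sections())
      if (ShouldReplace(Sec)) {
        Expected<SectionBase *> NewSec = MakeReplacement(&Sec); // hypothetical
        if (!NewSec)
          return NewSec.takeError();
        FromTo[&Sec] = *NewSec;
      }
    return Obj.replaceSections(FromTo);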
diff --git a/llvm/tools/llvm-objcopy/MachO/MachOConfig.h b/llvm/tools/llvm-objcopy/MachO/MachOConfig.h
index 7c5dbfde19a0..93f9facfcf0b 100644
--- a/llvm/tools/llvm-objcopy/MachO/MachOConfig.h
+++ b/llvm/tools/llvm-objcopy/MachO/MachOConfig.h
@@ -9,11 +9,33 @@
#ifndef LLVM_TOOLS_LLVM_OBJCOPY_MACHO_MACHOCONFIG_H
#define LLVM_TOOLS_LLVM_OBJCOPY_MACHO_MACHOCONFIG_H
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringRef.h"
+#include <vector>
+
namespace llvm {
namespace objcopy {
// Mach-O specific configuration for copying/stripping a single file.
-struct MachOConfig {};
+struct MachOConfig {
+ // Repeated options
+ std::vector<StringRef> RPathToAdd;
+ std::vector<StringRef> RPathToPrepend;
+ DenseMap<StringRef, StringRef> RPathsToUpdate;
+ DenseMap<StringRef, StringRef> InstallNamesToUpdate;
+ DenseSet<StringRef> RPathsToRemove;
+
+ // install-name-tool's id option
+ Optional<StringRef> SharedLibId;
+
+ // Boolean options
+ bool StripSwiftSymbols = false;
+ bool KeepUndefined = false;
+
+ // install-name-tool's --delete_all_rpaths
+ bool RemoveAllRpaths = false;
+};
} // namespace objcopy
} // namespace llvm
diff --git a/llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.cpp b/llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.cpp
index 6ed21806fe5e..3cac77411845 100644
--- a/llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.cpp
+++ b/llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.cpp
@@ -249,8 +249,12 @@ Error MachOLayoutBuilder::layoutTail(uint64_t Offset) {
uint64_t StartOfExportTrie =
StartOfLazyBindingInfo + O.LazyBinds.Opcodes.size();
uint64_t StartOfFunctionStarts = StartOfExportTrie + O.Exports.Trie.size();
- uint64_t StartOfDataInCode =
+ uint64_t StartOfDyldExportsTrie =
StartOfFunctionStarts + O.FunctionStarts.Data.size();
+ uint64_t StartOfChainedFixups =
+ StartOfDyldExportsTrie + O.ExportsTrie.Data.size();
+ uint64_t StartOfDataInCode =
+ StartOfChainedFixups + O.ChainedFixups.Data.size();
uint64_t StartOfLinkerOptimizationHint =
StartOfDataInCode + O.DataInCode.Data.size();
uint64_t StartOfSymbols =
@@ -262,10 +266,31 @@ Error MachOLayoutBuilder::layoutTail(uint64_t Offset) {
sizeof(uint32_t) * O.IndirectSymTable.Symbols.size();
uint64_t StartOfCodeSignature =
StartOfSymbolStrings + StrTableBuilder.getSize();
- if (O.CodeSignatureCommandIndex)
+ uint32_t CodeSignatureSize = 0;
+ if (O.CodeSignatureCommandIndex) {
StartOfCodeSignature = alignTo(StartOfCodeSignature, 16);
+
+ // Note: These calculations are to be kept in sync with the same
+ // calculations performed in LLD's CodeSignatureSection.
+ const uint32_t AllHeadersSize =
+ alignTo(CodeSignature.FixedHeadersSize + OutputFileName.size() + 1,
+ CodeSignature.Align);
+ const uint32_t BlockCount =
+ (StartOfCodeSignature + CodeSignature.BlockSize - 1) /
+ CodeSignature.BlockSize;
+ const uint32_t Size =
+ alignTo(AllHeadersSize + BlockCount * CodeSignature.HashSize,
+ CodeSignature.Align);
+
+ CodeSignature.StartOffset = StartOfCodeSignature;
+ CodeSignature.AllHeadersSize = AllHeadersSize;
+ CodeSignature.BlockCount = BlockCount;
+ CodeSignature.OutputFileName = OutputFileName;
+ CodeSignature.Size = Size;
+ CodeSignatureSize = Size;
+ }
uint64_t LinkEditSize =
- (StartOfCodeSignature + O.CodeSignature.Data.size()) - StartOfLinkEdit;
+ StartOfCodeSignature + CodeSignatureSize - StartOfLinkEdit;
// Now we have determined the layout of the contents of the __LINKEDIT
// segment. Update its load command.
@@ -293,7 +318,7 @@ Error MachOLayoutBuilder::layoutTail(uint64_t Offset) {
switch (cmd) {
case MachO::LC_CODE_SIGNATURE:
MLC.linkedit_data_command_data.dataoff = StartOfCodeSignature;
- MLC.linkedit_data_command_data.datasize = O.CodeSignature.Data.size();
+ MLC.linkedit_data_command_data.datasize = CodeSignatureSize;
break;
case MachO::LC_SYMTAB:
MLC.symtab_command_data.symoff = StartOfSymbols;
@@ -332,6 +357,14 @@ Error MachOLayoutBuilder::layoutTail(uint64_t Offset) {
MLC.linkedit_data_command_data.dataoff = StartOfFunctionStarts;
MLC.linkedit_data_command_data.datasize = O.FunctionStarts.Data.size();
break;
+ case MachO::LC_DYLD_CHAINED_FIXUPS:
+ MLC.linkedit_data_command_data.dataoff = StartOfChainedFixups;
+ MLC.linkedit_data_command_data.datasize = O.ChainedFixups.Data.size();
+ break;
+ case MachO::LC_DYLD_EXPORTS_TRIE:
+ MLC.linkedit_data_command_data.dataoff = StartOfDyldExportsTrie;
+ MLC.linkedit_data_command_data.datasize = O.ExportsTrie.Data.size();
+ break;
case MachO::LC_DYLD_INFO:
case MachO::LC_DYLD_INFO_ONLY:
MLC.dyld_info_command_data.rebase_off =
@@ -380,6 +413,10 @@ Error MachOLayoutBuilder::layoutTail(uint64_t Offset) {
case MachO::LC_SOURCE_VERSION:
case MachO::LC_THREAD:
case MachO::LC_UNIXTHREAD:
+ case MachO::LC_SUB_FRAMEWORK:
+ case MachO::LC_SUB_UMBRELLA:
+ case MachO::LC_SUB_CLIENT:
+ case MachO::LC_SUB_LIBRARY:
// Nothing to update.
break;
default:
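Because the code signature is now regenerated rather than copied from the input, layoutTail() must know its size before anything is written, and the hunk above derives it from the constants in CodeSignatureInfo (declared in the next file) plus the output file name. A worked example with those constants (BlockSize = 4 KiB, HashSize = 32, Align = 16); the concrete offset and file name are illustrative:

    // Suppose StartOfCodeSignature = 0x123000, i.e. 1,191,936 bytes precede
    // (and are covered by) the signature, and the output name is "a.out"
    // (5 characters + NUL = 6 bytes):
    //   AllHeadersSize = alignTo(FixedHeadersSize + 6, 16)
    //   BlockCount     = (1,191,936 + 4,095) / 4,096 = 291   // ceiling division
    //   Size           = alignTo(AllHeadersSize + 291 * 32, 16)
    // __LINKEDIT's size then becomes StartOfCodeSignature + Size - StartOfLinkEdit.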
diff --git a/llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.h b/llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.h
index 5fe6683e27f3..44d03b4af7e8 100644
--- a/llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.h
+++ b/llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.h
@@ -16,10 +16,49 @@ namespace llvm {
namespace objcopy {
namespace macho {
+/// When MachO binaries include an LC_CODE_SIGNATURE load command,
+/// the __LINKEDIT data segment will include a section corresponding
+/// to the LC_CODE_SIGNATURE load command. This section serves as a signature
+/// for the binary. Included in the CodeSignature section is a header followed
+/// by a hash of the binary. If present, the CodeSignature section is the
+/// last component of the binary.
+struct CodeSignatureInfo {
+ // NOTE: These values are to be kept in sync with those in
+ // LLD's CodeSignatureSection class.
+
+ static constexpr uint32_t Align = 16;
+ static constexpr uint8_t BlockSizeShift = 12;
+ // The binary is read in blocks of the following size.
+ static constexpr size_t BlockSize = (1 << BlockSizeShift); // 4 KiB
+ // For each block, a SHA256 hash (256 bits, 32 bytes) is written to
+ // the CodeSignature section.
+ static constexpr size_t HashSize = 256 / 8;
+ static constexpr size_t BlobHeadersSize = llvm::alignTo<8>(
+ sizeof(llvm::MachO::CS_SuperBlob) + sizeof(llvm::MachO::CS_BlobIndex));
+ // The size of the entire header depends upon the filename the binary is being
+ // written to, but the rest of the header is fixed in size.
+ static constexpr uint32_t FixedHeadersSize =
+ BlobHeadersSize + sizeof(llvm::MachO::CS_CodeDirectory);
+
+ // The offset relative to the start of the binary where
+ // the CodeSignature section should begin.
+ uint32_t StartOffset;
+ // The size of the entire header, output file name size included.
+ uint32_t AllHeadersSize;
+ // The number of blocks required to hash the binary.
+ uint32_t BlockCount;
+ StringRef OutputFileName;
+ // The size of the entire CodeSignature section, including both the header and
+ // hashes.
+ uint32_t Size;
+};
+
class MachOLayoutBuilder {
Object &O;
bool Is64Bit;
+ StringRef OutputFileName;
uint64_t PageSize;
+ CodeSignatureInfo CodeSignature;
// Points to the __LINKEDIT segment if it exists.
MachO::macho_load_command *LinkEditLoadCommand = nullptr;
@@ -37,14 +76,18 @@ class MachOLayoutBuilder {
bool Is64Bit);
public:
- MachOLayoutBuilder(Object &O, bool Is64Bit, uint64_t PageSize)
- : O(O), Is64Bit(Is64Bit), PageSize(PageSize),
+ MachOLayoutBuilder(Object &O, bool Is64Bit, StringRef OutputFileName,
+ uint64_t PageSize)
+ : O(O), Is64Bit(Is64Bit), OutputFileName(OutputFileName),
+ PageSize(PageSize),
StrTableBuilder(getStringTableBuilderKind(O, Is64Bit)) {}
// Recomputes and updates fields in the given object such as file offsets.
Error layout();
StringTableBuilder &getStringTableBuilder() { return StrTableBuilder; }
+
+ const CodeSignatureInfo &getCodeSignature() { return CodeSignature; }
};
} // end namespace macho
diff --git a/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp b/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp
index 823306916bbe..9e7b91d73057 100644
--- a/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp
+++ b/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp
@@ -9,6 +9,7 @@
#include "MachOObjcopy.h"
#include "../llvm-objcopy.h"
#include "CommonConfig.h"
+#include "MachO/MachOConfig.h"
#include "MachOReader.h"
#include "MachOWriter.h"
#include "MultiFormatConfig.h"
@@ -19,6 +20,7 @@
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/Support/Path.h"
#include "llvm/Support/SmallVectorMemoryBuffer.h"
using namespace llvm;
@@ -87,17 +89,20 @@ static void markSymbols(const CommonConfig &, Object &Obj) {
(*ISE.Symbol)->Referenced = true;
}
-static void updateAndRemoveSymbols(const CommonConfig &Config, Object &Obj) {
+static void updateAndRemoveSymbols(const CommonConfig &Config,
+ const MachOConfig &MachOConfig,
+ Object &Obj) {
for (SymbolEntry &Sym : Obj.SymTable) {
auto I = Config.SymbolsToRename.find(Sym.Name);
if (I != Config.SymbolsToRename.end())
Sym.Name = std::string(I->getValue());
}
- auto RemovePred = [Config, &Obj](const std::unique_ptr<SymbolEntry> &N) {
+ auto RemovePred = [Config, MachOConfig,
+ &Obj](const std::unique_ptr<SymbolEntry> &N) {
if (N->Referenced)
return false;
- if (Config.KeepUndefined && N->isUndefinedSymbol())
+ if (MachOConfig.KeepUndefined && N->isUndefinedSymbol())
return false;
if (N->n_desc & MachO::REFERENCED_DYNAMICALLY)
return false;
@@ -106,8 +111,9 @@ static void updateAndRemoveSymbols(const CommonConfig &Config, Object &Obj) {
if (Config.DiscardMode == DiscardType::All && !(N->n_type & MachO::N_EXT))
return true;
// This behavior is consistent with cctools' strip.
- if (Config.StripSwiftSymbols && (Obj.Header.Flags & MachO::MH_DYLDLINK) &&
- Obj.SwiftVersion && *Obj.SwiftVersion && N->isSwiftSymbol())
+ if (MachOConfig.StripSwiftSymbols &&
+ (Obj.Header.Flags & MachO::MH_DYLDLINK) && Obj.SwiftVersion &&
+ *Obj.SwiftVersion && N->isSwiftSymbol())
return true;
return false;
};
@@ -139,17 +145,17 @@ static LoadCommand buildRPathLoadCommand(StringRef Path) {
return LC;
}
-static Error processLoadCommands(const CommonConfig &Config, Object &Obj) {
+static Error processLoadCommands(const MachOConfig &MachOConfig, Object &Obj) {
// Remove RPaths.
- DenseSet<StringRef> RPathsToRemove(Config.RPathsToRemove.begin(),
- Config.RPathsToRemove.end());
+ DenseSet<StringRef> RPathsToRemove(MachOConfig.RPathsToRemove.begin(),
+ MachOConfig.RPathsToRemove.end());
LoadCommandPred RemovePred = [&RPathsToRemove,
- &Config](const LoadCommand &LC) {
+ &MachOConfig](const LoadCommand &LC) {
if (LC.MachOLoadCommand.load_command_data.cmd == MachO::LC_RPATH) {
// When removing all RPaths we don't need to care
// about what it contains
- if (Config.RemoveAllRpaths)
+ if (MachOConfig.RemoveAllRpaths)
return true;
StringRef RPath = getPayloadString(LC);
@@ -166,7 +172,7 @@ static Error processLoadCommands(const CommonConfig &Config, Object &Obj) {
// Emit an error if the Mach-O binary does not contain an rpath path name
// specified in -delete_rpath.
- for (StringRef RPath : Config.RPathsToRemove) {
+ for (StringRef RPath : MachOConfig.RPathsToRemove) {
if (RPathsToRemove.count(RPath))
return createStringError(errc::invalid_argument,
"no LC_RPATH load command with path: %s",
@@ -182,7 +188,7 @@ static Error processLoadCommands(const CommonConfig &Config, Object &Obj) {
}
// Throw errors for invalid RPaths.
- for (const auto &OldNew : Config.RPathsToUpdate) {
+ for (const auto &OldNew : MachOConfig.RPathsToUpdate) {
StringRef Old = OldNew.getFirst();
StringRef New = OldNew.getSecond();
if (!RPaths.contains(Old))
@@ -198,14 +204,14 @@ static Error processLoadCommands(const CommonConfig &Config, Object &Obj) {
for (LoadCommand &LC : Obj.LoadCommands) {
switch (LC.MachOLoadCommand.load_command_data.cmd) {
case MachO::LC_ID_DYLIB:
- if (Config.SharedLibId)
+ if (MachOConfig.SharedLibId)
updateLoadCommandPayloadString<MachO::dylib_command>(
- LC, *Config.SharedLibId);
+ LC, *MachOConfig.SharedLibId);
break;
case MachO::LC_RPATH: {
StringRef RPath = getPayloadString(LC);
- StringRef NewRPath = Config.RPathsToUpdate.lookup(RPath);
+ StringRef NewRPath = MachOConfig.RPathsToUpdate.lookup(RPath);
if (!NewRPath.empty())
updateLoadCommandPayloadString<MachO::rpath_command>(LC, NewRPath);
break;
@@ -217,7 +223,7 @@ static Error processLoadCommands(const CommonConfig &Config, Object &Obj) {
case MachO::LC_LOAD_WEAK_DYLIB:
StringRef InstallName = getPayloadString(LC);
StringRef NewInstallName =
- Config.InstallNamesToUpdate.lookup(InstallName);
+ MachOConfig.InstallNamesToUpdate.lookup(InstallName);
if (!NewInstallName.empty())
updateLoadCommandPayloadString<MachO::dylib_command>(LC,
NewInstallName);
@@ -226,7 +232,7 @@ static Error processLoadCommands(const CommonConfig &Config, Object &Obj) {
}
// Add new RPaths.
- for (StringRef RPath : Config.RPathToAdd) {
+ for (StringRef RPath : MachOConfig.RPathToAdd) {
if (RPaths.contains(RPath))
return createStringError(errc::invalid_argument,
"rpath '" + RPath +
@@ -235,7 +241,7 @@ static Error processLoadCommands(const CommonConfig &Config, Object &Obj) {
Obj.LoadCommands.push_back(buildRPathLoadCommand(RPath));
}
- for (StringRef RPath : Config.RPathToPrepend) {
+ for (StringRef RPath : MachOConfig.RPathToPrepend) {
if (RPaths.contains(RPath))
return createStringError(errc::invalid_argument,
"rpath '" + RPath +
@@ -248,7 +254,7 @@ static Error processLoadCommands(const CommonConfig &Config, Object &Obj) {
// Unlike appending rpaths, the indexes of subsequent load commands must
// be recalculated after prepending one.
- if (!Config.RPathToPrepend.empty())
+ if (!MachOConfig.RPathToPrepend.empty())
Obj.updateLoadCommandIndexes();
return Error::success();
@@ -333,7 +339,8 @@ static Error isValidMachOCannonicalName(StringRef Name) {
return Error::success();
}
-static Error handleArgs(const CommonConfig &Config, Object &Obj) {
+static Error handleArgs(const CommonConfig &Config,
+ const MachOConfig &MachOConfig, Object &Obj) {
// Dump sections before add/remove for compatibility with GNU objcopy.
for (StringRef Flag : Config.DumpSection) {
StringRef SectionName;
@@ -350,7 +357,7 @@ static Error handleArgs(const CommonConfig &Config, Object &Obj) {
if (Config.StripAll)
markSymbols(Config, Obj);
- updateAndRemoveSymbols(Config, Obj);
+ updateAndRemoveSymbols(Config, MachOConfig, Obj);
if (Config.StripAll)
for (LoadCommand &LC : Obj.LoadCommands)
@@ -367,14 +374,14 @@ static Error handleArgs(const CommonConfig &Config, Object &Obj) {
return E;
}
- if (Error E = processLoadCommands(Config, Obj))
+ if (Error E = processLoadCommands(MachOConfig, Obj))
return E;
return Error::success();
}
Error objcopy::macho::executeObjcopyOnBinary(const CommonConfig &Config,
- const MachOConfig &,
+ const MachOConfig &MachOConfig,
object::MachOObjectFile &In,
raw_ostream &Out) {
MachOReader Reader(In);
@@ -382,7 +389,12 @@ Error objcopy::macho::executeObjcopyOnBinary(const CommonConfig &Config,
if (!O)
return createFileError(Config.InputFilename, O.takeError());
- if (Error E = handleArgs(Config, **O))
+ if (O->get()->Header.FileType == MachO::HeaderFileType::MH_PRELOAD)
+ return createStringError(std::errc::not_supported,
+ "%s: MH_PRELOAD files are not supported",
+ Config.InputFilename.str().c_str());
+
+ if (Error E = handleArgs(Config, MachOConfig, **O))
return createFileError(Config.InputFilename, std::move(E));
// Page size used for alignment of segment sizes in Mach-O executables and
@@ -398,7 +410,8 @@ Error objcopy::macho::executeObjcopyOnBinary(const CommonConfig &Config,
PageSize = 4096;
}
- MachOWriter Writer(**O, In.is64Bit(), In.isLittleEndian(), PageSize, Out);
+ MachOWriter Writer(**O, In.is64Bit(), In.isLittleEndian(),
+ sys::path::filename(Config.OutputFilename), PageSize, Out);
if (auto E = Writer.finalize())
return E;
return Writer.write();
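
A note on the new MH_PRELOAD check above: it composes the two generic error helpers from llvm/Support/Error.h that this file already uses elsewhere. The sketch below only illustrates that pattern; the function name is hypothetical and not part of the patch, and the MH_PRELOAD path in the hunk returns the string error directly rather than wrapping it.

// Minimal sketch of the error-construction pattern used above: a typed
// std::errc code plus a printf-style message, optionally wrapped with the
// input file name so diagnostics identify the offending input.
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"

static llvm::Error rejectUnsupportedFileType(llvm::StringRef InputFilename) {
  llvm::Error E = llvm::createStringError(
      std::errc::not_supported, "%s: MH_PRELOAD files are not supported",
      InputFilename.str().c_str());
  // Wrapping is how executeObjcopyOnBinary reports handleArgs failures.
  return llvm::createFileError(InputFilename, std::move(E));
}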
diff --git a/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.h b/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.h
index e30940a8d6eb..d03eee9d5fdb 100644
--- a/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.h
+++ b/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.h
@@ -24,7 +24,8 @@ struct MachOConfig;
class MultiFormatConfig;
namespace macho {
-Error executeObjcopyOnBinary(const CommonConfig &Config, const MachOConfig &,
+Error executeObjcopyOnBinary(const CommonConfig &Config,
+ const MachOConfig &MachOConfig,
object::MachOObjectFile &In, raw_ostream &Out);
Error executeObjcopyOnMachOUniversalBinary(
diff --git a/llvm/tools/llvm-objcopy/MachO/MachOReader.cpp b/llvm/tools/llvm-objcopy/MachO/MachOReader.cpp
index 7d1c29b42c2e..d68d1692997a 100644
--- a/llvm/tools/llvm-objcopy/MachO/MachOReader.cpp
+++ b/llvm/tools/llvm-objcopy/MachO/MachOReader.cpp
@@ -116,6 +116,7 @@ Expected<std::vector<std::unique_ptr<Section>>> static extractSections(
Error MachOReader::readLoadCommands(Object &O) const {
// For MachO sections indices start from 1.
uint32_t NextSectionIndex = 1;
+ static constexpr char TextSegmentName[] = "__TEXT";
for (auto LoadCmd : MachOObj.load_commands()) {
LoadCommand LC;
switch (LoadCmd.C.cmd) {
@@ -123,6 +124,14 @@ Error MachOReader::readLoadCommands(Object &O) const {
O.CodeSignatureCommandIndex = O.LoadCommands.size();
break;
case MachO::LC_SEGMENT:
+ // LoadCmd.Ptr might not be aligned temporarily as
+ // MachO::segment_command requires, but the segname char pointer does
+ // not have alignment restrictions.
+ if (StringRef(reinterpret_cast<const char *>(
+ LoadCmd.Ptr + offsetof(MachO::segment_command, segname))) ==
+ TextSegmentName)
+ O.TextSegmentCommandIndex = O.LoadCommands.size();
+
if (Expected<std::vector<std::unique_ptr<Section>>> Sections =
extractSections<MachO::section, MachO::segment_command>(
LoadCmd, MachOObj, NextSectionIndex))
@@ -131,6 +140,14 @@ Error MachOReader::readLoadCommands(Object &O) const {
return Sections.takeError();
break;
case MachO::LC_SEGMENT_64:
+ // LoadCmd.Ptr might not be aligned temporarily as
+ // MachO::segment_command_64 requires, but the segname char pointer does
+ // not have alignment restrictions.
+ if (StringRef(reinterpret_cast<const char *>(
+ LoadCmd.Ptr + offsetof(MachO::segment_command_64, segname))) ==
+ TextSegmentName)
+ O.TextSegmentCommandIndex = O.LoadCommands.size();
+
if (Expected<std::vector<std::unique_ptr<Section>>> Sections =
extractSections<MachO::section_64, MachO::segment_command_64>(
LoadCmd, MachOObj, NextSectionIndex))
@@ -157,6 +174,12 @@ Error MachOReader::readLoadCommands(Object &O) const {
case MachO::LC_FUNCTION_STARTS:
O.FunctionStartsCommandIndex = O.LoadCommands.size();
break;
+ case MachO::LC_DYLD_EXPORTS_TRIE:
+ O.ExportsTrieCommandIndex = O.LoadCommands.size();
+ break;
+ case MachO::LC_DYLD_CHAINED_FIXUPS:
+ O.ChainedFixupsCommandIndex = O.LoadCommands.size();
+ break;
}
#define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct) \
case MachO::LCName: \
@@ -271,10 +294,6 @@ void MachOReader::readLinkData(Object &O, Optional<size_t> LCIndex,
arrayRefFromStringRef(MachOObj.getData().substr(LC.dataoff, LC.datasize));
}
-void MachOReader::readCodeSignature(Object &O) const {
- return readLinkData(O, O.CodeSignatureCommandIndex, O.CodeSignature);
-}
-
void MachOReader::readDataInCodeData(Object &O) const {
return readLinkData(O, O.DataInCodeCommandIndex, O.DataInCode);
}
@@ -288,6 +307,14 @@ void MachOReader::readFunctionStartsData(Object &O) const {
return readLinkData(O, O.FunctionStartsCommandIndex, O.FunctionStarts);
}
+void MachOReader::readExportsTrie(Object &O) const {
+ return readLinkData(O, O.ExportsTrieCommandIndex, O.ExportsTrie);
+}
+
+void MachOReader::readChainedFixups(Object &O) const {
+ return readLinkData(O, O.ChainedFixupsCommandIndex, O.ChainedFixups);
+}
+
void MachOReader::readIndirectSymbolTable(Object &O) const {
MachO::dysymtab_command DySymTab = MachOObj.getDysymtabLoadCommand();
constexpr uint32_t AbsOrLocalMask =
@@ -336,10 +363,11 @@ Expected<std::unique_ptr<Object>> MachOReader::create() const {
readWeakBindInfo(*Obj);
readLazyBindInfo(*Obj);
readExportInfo(*Obj);
- readCodeSignature(*Obj);
readDataInCodeData(*Obj);
readLinkerOptimizationHint(*Obj);
readFunctionStartsData(*Obj);
+ readExportsTrie(*Obj);
+ readChainedFixups(*Obj);
readIndirectSymbolTable(*Obj);
readSwiftVersion(*Obj);
return std::move(Obj);
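
The reader above deliberately dereferences only the segname field through a raw char pointer, because the load-command pointer may be under-aligned for the full segment_command struct. Below is a standalone sketch of that pattern with a hypothetical stand-in struct; it is not the reader's API, and unlike the hunk it explicitly bounds the read to the 16-byte field rather than relying on null termination.

// Minimal sketch: fetch a segment name from a raw, possibly under-aligned
// load-command pointer. Only the char array is dereferenced, so no
// struct-wide alignment is assumed.
#include <cstddef>
#include <string>

namespace {
struct SegmentCommandLike { // hypothetical stand-in for MachO::segment_command
  unsigned Cmd;
  unsigned CmdSize;
  char SegName[16];
};
} // namespace

static std::string readSegmentName(const char *LoadCmdPtr) {
  const char *Name = LoadCmdPtr + offsetof(SegmentCommandLike, SegName);
  std::size_t Len = 0;
  while (Len < sizeof(SegmentCommandLike::SegName) && Name[Len] != '\0')
    ++Len; // stop at the terminator or the end of the fixed-size field
  return std::string(Name, Len);
}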
diff --git a/llvm/tools/llvm-objcopy/MachO/MachOReader.h b/llvm/tools/llvm-objcopy/MachO/MachOReader.h
index ca3a0214cb6d..b29e86ca642e 100644
--- a/llvm/tools/llvm-objcopy/MachO/MachOReader.h
+++ b/llvm/tools/llvm-objcopy/MachO/MachOReader.h
@@ -41,6 +41,8 @@ class MachOReader : public Reader {
void readDataInCodeData(Object &O) const;
void readLinkerOptimizationHint(Object &O) const;
void readFunctionStartsData(Object &O) const;
+ void readExportsTrie(Object &O) const;
+ void readChainedFixups(Object &O) const;
void readIndirectSymbolTable(Object &O) const;
void readSwiftVersion(Object &O) const;
diff --git a/llvm/tools/llvm-objcopy/MachO/MachOWriter.cpp b/llvm/tools/llvm-objcopy/MachO/MachOWriter.cpp
index 295098ed4118..688945afe944 100644
--- a/llvm/tools/llvm-objcopy/MachO/MachOWriter.cpp
+++ b/llvm/tools/llvm-objcopy/MachO/MachOWriter.cpp
@@ -14,10 +14,16 @@
#include "llvm/Object/MachO.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SHA256.h"
#include <memory>
+#if defined(__APPLE__)
+#include <sys/mman.h>
+#endif
+
using namespace llvm;
using namespace llvm::objcopy::macho;
+using namespace llvm::support::endian;
size_t MachOWriter::headerSize() const {
return Is64Bit ? sizeof(MachO::mach_header_64) : sizeof(MachO::mach_header);
@@ -127,6 +133,26 @@ size_t MachOWriter::totalSize() const {
LinkEditDataCommand.datasize);
}
+ if (O.ChainedFixupsCommandIndex) {
+ const MachO::linkedit_data_command &LinkEditDataCommand =
+ O.LoadCommands[*O.ChainedFixupsCommandIndex]
+ .MachOLoadCommand.linkedit_data_command_data;
+
+ if (LinkEditDataCommand.dataoff)
+ Ends.push_back(LinkEditDataCommand.dataoff +
+ LinkEditDataCommand.datasize);
+ }
+
+ if (O.ExportsTrieCommandIndex) {
+ const MachO::linkedit_data_command &LinkEditDataCommand =
+ O.LoadCommands[*O.ExportsTrieCommandIndex]
+ .MachOLoadCommand.linkedit_data_command_data;
+
+ if (LinkEditDataCommand.dataoff)
+ Ends.push_back(LinkEditDataCommand.dataoff +
+ LinkEditDataCommand.datasize);
+ }
+
// Otherwise, use the last section / relocation.
for (const LoadCommand &LC : O.LoadCommands)
for (const std::unique_ptr<Section> &S : LC.Sections) {
@@ -423,8 +449,147 @@ void MachOWriter::writeLinkData(Optional<size_t> LCIndex, const LinkData &LD) {
memcpy(Out, LD.Data.data(), LD.Data.size());
}
+static uint64_t
+getSegmentFileOffset(const LoadCommand &TextSegmentLoadCommand) {
+ const MachO::macho_load_command &MLC =
+ TextSegmentLoadCommand.MachOLoadCommand;
+ switch (MLC.load_command_data.cmd) {
+ case MachO::LC_SEGMENT:
+ return MLC.segment_command_data.fileoff;
+ case MachO::LC_SEGMENT_64:
+ return MLC.segment_command_64_data.fileoff;
+ default:
+ return 0;
+ }
+}
+
+static uint64_t getSegmentFileSize(const LoadCommand &TextSegmentLoadCommand) {
+ const MachO::macho_load_command &MLC =
+ TextSegmentLoadCommand.MachOLoadCommand;
+ switch (MLC.load_command_data.cmd) {
+ case MachO::LC_SEGMENT:
+ return MLC.segment_command_data.filesize;
+ case MachO::LC_SEGMENT_64:
+ return MLC.segment_command_64_data.filesize;
+ default:
+ return 0;
+ }
+}
+
void MachOWriter::writeCodeSignatureData() {
- return writeLinkData(O.CodeSignatureCommandIndex, O.CodeSignature);
+ // NOTE: This CodeSignature section behaviour must be kept in sync with that
+ // performed in LLD's CodeSignatureSection::write /
+ // CodeSignatureSection::writeHashes. Furthermore, this call must occur only
+ // after the rest of the binary has already been written to the buffer. This
+ // is because the buffer is read from to perform the necessary hashing.
+
+ // The CodeSignature section is the last section in the MachO binary and
+ // contains a hash of all content in the binary before it. Since llvm-objcopy
+ // has likely modified the target binary, the hash must be regenerated
+ // entirely. To generate this hash, we must read from the start of the binary
+ // (HashReadStart) to just before the start of the CodeSignature section
+ // (HashReadEnd).
+
+ const CodeSignatureInfo &CodeSignature = LayoutBuilder.getCodeSignature();
+
+ uint8_t *BufferStart = reinterpret_cast<uint8_t *>(Buf->getBufferStart());
+ uint8_t *HashReadStart = BufferStart;
+ uint8_t *HashReadEnd = BufferStart + CodeSignature.StartOffset;
+
+ // The CodeSignature section begins with a header, after which the hashes
+ // of each page of the binary are written.
+ uint8_t *HashWriteStart = HashReadEnd + CodeSignature.AllHeadersSize;
+
+ uint32_t TextSegmentFileOff = 0;
+ uint32_t TextSegmentFileSize = 0;
+ if (O.TextSegmentCommandIndex) {
+ const LoadCommand &TextSegmentLoadCommand =
+ O.LoadCommands[*O.TextSegmentCommandIndex];
+ assert(TextSegmentLoadCommand.MachOLoadCommand.load_command_data.cmd ==
+ MachO::LC_SEGMENT ||
+ TextSegmentLoadCommand.MachOLoadCommand.load_command_data.cmd ==
+ MachO::LC_SEGMENT_64);
+ assert(StringRef(TextSegmentLoadCommand.MachOLoadCommand
+ .segment_command_data.segname) == "__TEXT");
+ TextSegmentFileOff = getSegmentFileOffset(TextSegmentLoadCommand);
+ TextSegmentFileSize = getSegmentFileSize(TextSegmentLoadCommand);
+ }
+
+ const uint32_t FileNamePad = CodeSignature.AllHeadersSize -
+ CodeSignature.FixedHeadersSize -
+ CodeSignature.OutputFileName.size();
+
+ // Write code section header.
+ auto *SuperBlob = reinterpret_cast<MachO::CS_SuperBlob *>(HashReadEnd);
+ write32be(&SuperBlob->magic, MachO::CSMAGIC_EMBEDDED_SIGNATURE);
+ write32be(&SuperBlob->length, CodeSignature.Size);
+ write32be(&SuperBlob->count, 1);
+ auto *BlobIndex = reinterpret_cast<MachO::CS_BlobIndex *>(&SuperBlob[1]);
+ write32be(&BlobIndex->type, MachO::CSSLOT_CODEDIRECTORY);
+ write32be(&BlobIndex->offset, CodeSignature.BlobHeadersSize);
+ auto *CodeDirectory = reinterpret_cast<MachO::CS_CodeDirectory *>(
+ HashReadEnd + CodeSignature.BlobHeadersSize);
+ write32be(&CodeDirectory->magic, MachO::CSMAGIC_CODEDIRECTORY);
+ write32be(&CodeDirectory->length,
+ CodeSignature.Size - CodeSignature.BlobHeadersSize);
+ write32be(&CodeDirectory->version, MachO::CS_SUPPORTSEXECSEG);
+ write32be(&CodeDirectory->flags, MachO::CS_ADHOC | MachO::CS_LINKER_SIGNED);
+ write32be(&CodeDirectory->hashOffset,
+ sizeof(MachO::CS_CodeDirectory) +
+ CodeSignature.OutputFileName.size() + FileNamePad);
+ write32be(&CodeDirectory->identOffset, sizeof(MachO::CS_CodeDirectory));
+ CodeDirectory->nSpecialSlots = 0;
+ write32be(&CodeDirectory->nCodeSlots, CodeSignature.BlockCount);
+ write32be(&CodeDirectory->codeLimit, CodeSignature.StartOffset);
+ CodeDirectory->hashSize = static_cast<uint8_t>(CodeSignature.HashSize);
+ CodeDirectory->hashType = MachO::kSecCodeSignatureHashSHA256;
+ CodeDirectory->platform = 0;
+ CodeDirectory->pageSize = CodeSignature.BlockSizeShift;
+ CodeDirectory->spare2 = 0;
+ CodeDirectory->scatterOffset = 0;
+ CodeDirectory->teamOffset = 0;
+ CodeDirectory->spare3 = 0;
+ CodeDirectory->codeLimit64 = 0;
+ write64be(&CodeDirectory->execSegBase, TextSegmentFileOff);
+ write64be(&CodeDirectory->execSegLimit, TextSegmentFileSize);
+ write64be(&CodeDirectory->execSegFlags, O.Header.FileType == MachO::MH_EXECUTE
+ ? MachO::CS_EXECSEG_MAIN_BINARY
+ : 0);
+
+ auto *Id = reinterpret_cast<char *>(&CodeDirectory[1]);
+ memcpy(Id, CodeSignature.OutputFileName.begin(),
+ CodeSignature.OutputFileName.size());
+ memset(Id + CodeSignature.OutputFileName.size(), 0, FileNamePad);
+
+ // Write the hashes.
+ uint8_t *CurrHashReadPosition = HashReadStart;
+ uint8_t *CurrHashWritePosition = HashWriteStart;
+ while (CurrHashReadPosition < HashReadEnd) {
+ StringRef Block(reinterpret_cast<char *>(CurrHashReadPosition),
+ std::min(HashReadEnd - CurrHashReadPosition,
+ static_cast<ssize_t>(CodeSignature.BlockSize)));
+ SHA256 Hasher;
+ Hasher.update(Block);
+ StringRef Hash = Hasher.final();
+ assert(Hash.size() == CodeSignature.HashSize);
+ memcpy(CurrHashWritePosition, Hash.data(), CodeSignature.HashSize);
+ CurrHashReadPosition += CodeSignature.BlockSize;
+ CurrHashWritePosition += CodeSignature.HashSize;
+ }
+#if defined(__APPLE__)
+ // This is a macOS-specific work-around and makes no sense for any
+ // other host OS. See https://openradar.appspot.com/FB8914231
+ //
+ // The macOS kernel maintains a signature-verification cache to
+ // quickly validate applications at time of execve(2). The trouble
+ // is that the kernel creates the cache entry at the time of the
+ // mmap(2) call, before we have a chance to write either the code to
+ // sign or the signature header+hashes. The fix is to invalidate
+ // all cached data associated with the output file, thus discarding
+ // the bogus prematurely-cached signature.
+ msync(BufferStart, CodeSignature.StartOffset + CodeSignature.Size,
+ MS_INVALIDATE);
+#endif
}
void MachOWriter::writeDataInCodeData() {
@@ -440,6 +605,14 @@ void MachOWriter::writeFunctionStartsData() {
return writeLinkData(O.FunctionStartsCommandIndex, O.FunctionStarts);
}
+void MachOWriter::writeChainedFixupsData() {
+ return writeLinkData(O.ChainedFixupsCommandIndex, O.ChainedFixups);
+}
+
+void MachOWriter::writeExportsTrieData() {
+ return writeLinkData(O.ExportsTrieCommandIndex, O.ExportsTrie);
+}
+
void MachOWriter::writeTail() {
typedef void (MachOWriter::*WriteHandlerType)(void);
typedef std::pair<uint64_t, WriteHandlerType> WriteOperation;
@@ -525,6 +698,26 @@ void MachOWriter::writeTail() {
&MachOWriter::writeFunctionStartsData);
}
+ if (O.ChainedFixupsCommandIndex) {
+ const MachO::linkedit_data_command &LinkEditDataCommand =
+ O.LoadCommands[*O.ChainedFixupsCommandIndex]
+ .MachOLoadCommand.linkedit_data_command_data;
+
+ if (LinkEditDataCommand.dataoff)
+ Queue.emplace_back(LinkEditDataCommand.dataoff,
+ &MachOWriter::writeChainedFixupsData);
+ }
+
+ if (O.ExportsTrieCommandIndex) {
+ const MachO::linkedit_data_command &LinkEditDataCommand =
+ O.LoadCommands[*O.ExportsTrieCommandIndex]
+ .MachOLoadCommand.linkedit_data_command_data;
+
+ if (LinkEditDataCommand.dataoff)
+ Queue.emplace_back(LinkEditDataCommand.dataoff,
+ &MachOWriter::writeExportsTrieData);
+ }
+
llvm::sort(Queue, [](const WriteOperation &LHS, const WriteOperation &RHS) {
return LHS.first < RHS.first;
});
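
writeCodeSignatureData above splits the signed region into BlockSize-sized pages and stores one SHA-256 digest per page after the CS_CodeDirectory header, matching LLD's linker-signed ad-hoc signatures. The following is only a sketch of that hashing loop, assuming the same llvm::SHA256 interface the hunk relies on (update() over a block, final() returning the raw digest bytes); the helper name and return type are illustrative.

// Minimal sketch: hash a buffer page by page and concatenate the digests,
// as the CodeSignature writer does for each BlockSize chunk.
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/SHA256.h"
#include <cstdint>

static llvm::SmallVector<uint8_t, 0> hashPages(llvm::StringRef Data,
                                               size_t BlockSize) {
  llvm::SmallVector<uint8_t, 0> Hashes;
  for (size_t Off = 0; Off < Data.size(); Off += BlockSize) {
    llvm::SHA256 Hasher;
    Hasher.update(Data.substr(Off, BlockSize)); // substr clamps the length
    llvm::StringRef Digest = Hasher.final();    // 32 bytes for SHA-256
    Hashes.append(Digest.bytes_begin(), Digest.bytes_end());
  }
  return Hashes;
}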
diff --git a/llvm/tools/llvm-objcopy/MachO/MachOWriter.h b/llvm/tools/llvm-objcopy/MachO/MachOWriter.h
index c8c06d644e9f..a172534dac8a 100644
--- a/llvm/tools/llvm-objcopy/MachO/MachOWriter.h
+++ b/llvm/tools/llvm-objcopy/MachO/MachOWriter.h
@@ -50,13 +50,16 @@ class MachOWriter {
void writeDataInCodeData();
void writeLinkerOptimizationHint();
void writeFunctionStartsData();
+ void writeChainedFixupsData();
+ void writeExportsTrieData();
void writeTail();
public:
- MachOWriter(Object &O, bool Is64Bit, bool IsLittleEndian, uint64_t PageSize,
- raw_ostream &Out)
+ MachOWriter(Object &O, bool Is64Bit, bool IsLittleEndian,
+ StringRef OutputFileName, uint64_t PageSize, raw_ostream &Out)
: O(O), Is64Bit(Is64Bit), IsLittleEndian(IsLittleEndian),
- PageSize(PageSize), Out(Out), LayoutBuilder(O, Is64Bit, PageSize) {}
+ PageSize(PageSize), Out(Out),
+ LayoutBuilder(O, Is64Bit, OutputFileName, PageSize) {}
size_t totalSize() const;
Error finalize();
diff --git a/llvm/tools/llvm-objcopy/MachO/Object.cpp b/llvm/tools/llvm-objcopy/MachO/Object.cpp
index b4f98fa84cb5..6312adbbc9f7 100644
--- a/llvm/tools/llvm-objcopy/MachO/Object.cpp
+++ b/llvm/tools/llvm-objcopy/MachO/Object.cpp
@@ -29,10 +29,24 @@ void SymbolTable::removeSymbols(
}
void Object::updateLoadCommandIndexes() {
+ static constexpr char TextSegmentName[] = "__TEXT";
// Update indices of special load commands
for (size_t Index = 0, Size = LoadCommands.size(); Index < Size; ++Index) {
LoadCommand &LC = LoadCommands[Index];
switch (LC.MachOLoadCommand.load_command_data.cmd) {
+ case MachO::LC_CODE_SIGNATURE:
+ CodeSignatureCommandIndex = Index;
+ break;
+ case MachO::LC_SEGMENT:
+ if (StringRef(LC.MachOLoadCommand.segment_command_data.segname) ==
+ TextSegmentName)
+ TextSegmentCommandIndex = Index;
+ break;
+ case MachO::LC_SEGMENT_64:
+ if (StringRef(LC.MachOLoadCommand.segment_command_64_data.segname) ==
+ TextSegmentName)
+ TextSegmentCommandIndex = Index;
+ break;
case MachO::LC_SYMTAB:
SymTabCommandIndex = Index;
break;
@@ -52,6 +66,12 @@ void Object::updateLoadCommandIndexes() {
case MachO::LC_FUNCTION_STARTS:
FunctionStartsCommandIndex = Index;
break;
+ case MachO::LC_DYLD_CHAINED_FIXUPS:
+ ChainedFixupsCommandIndex = Index;
+ break;
+ case MachO::LC_DYLD_EXPORTS_TRIE:
+ ExportsTrieCommandIndex = Index;
+ break;
}
}
}
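
updateLoadCommandIndexes re-derives the cached indices because editing the load-command list (for example prepending an LC_RPATH, as noted earlier in MachOObjcopy.cpp) shifts every later command. A standalone toy illustration of that invalidation; the command kinds are hypothetical stand-ins, not the MachO constants.

// Toy example: a cached index goes stale after a prepend and must be
// re-derived by scanning the list again.
#include <cstddef>
#include <vector>

struct Cmd {
  unsigned Kind;
};

int main() {
  std::vector<Cmd> Cmds = {{/*segment*/ 0x19}, {/*symtab*/ 0x02}};
  std::size_t SymTabIndex = 1; // index cached before the edit

  Cmds.insert(Cmds.begin(), Cmd{/*rpath*/ 0x1c}); // prepend a load command

  // The cached index is stale; re-scan, as updateLoadCommandIndexes does.
  for (std::size_t I = 0; I < Cmds.size(); ++I)
    if (Cmds[I].Kind == 0x02)
      SymTabIndex = I; // now 2, not 1

  return SymTabIndex == 2 ? 0 : 1;
}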
diff --git a/llvm/tools/llvm-objcopy/MachO/Object.h b/llvm/tools/llvm-objcopy/MachO/Object.h
index 207502e2241b..13aaf42634b0 100644
--- a/llvm/tools/llvm-objcopy/MachO/Object.h
+++ b/llvm/tools/llvm-objcopy/MachO/Object.h
@@ -315,7 +315,8 @@ struct Object {
LinkData DataInCode;
LinkData LinkerOptimizationHint;
LinkData FunctionStarts;
- LinkData CodeSignature;
+ LinkData ExportsTrie;
+ LinkData ChainedFixups;
Optional<uint32_t> SwiftVersion;
@@ -325,14 +326,21 @@ struct Object {
Optional<size_t> SymTabCommandIndex;
/// The index of LC_DYLD_INFO or LC_DYLD_INFO_ONLY load command if present.
Optional<size_t> DyLdInfoCommandIndex;
- /// The index LC_DYSYMTAB load comamnd if present.
+ /// The index LC_DYSYMTAB load command if present.
Optional<size_t> DySymTabCommandIndex;
- /// The index LC_DATA_IN_CODE load comamnd if present.
+ /// The index LC_DATA_IN_CODE load command if present.
Optional<size_t> DataInCodeCommandIndex;
- /// The index of LC_LINKER_OPTIMIZATIN_HINT load comamnd if present.
+ /// The index of LC_LINKER_OPTIMIZATIN_HINT load command if present.
Optional<size_t> LinkerOptimizationHintCommandIndex;
- /// The index LC_FUNCTION_STARTS load comamnd if present.
+ /// The index LC_FUNCTION_STARTS load command if present.
Optional<size_t> FunctionStartsCommandIndex;
+ /// The index LC_DYLD_CHAINED_FIXUPS load command if present.
+ Optional<size_t> ChainedFixupsCommandIndex;
+ /// The index LC_DYLD_EXPORTS_TRIE load command if present.
+ Optional<size_t> ExportsTrieCommandIndex;
+ /// The index of the LC_SEGMENT or LC_SEGMENT_64 load command
+ /// corresponding to the __TEXT segment.
+ Optional<size_t> TextSegmentCommandIndex;
BumpPtrAllocator Alloc;
StringSaver NewSectionsContents;
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOpts.td b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
index 63abbe4c2020..bc624442aa51 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOpts.td
+++ b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
@@ -50,7 +50,8 @@ defm rename_section
: Eq<"rename-section",
"Renames a section from old to new, optionally with specified flags. "
"Flags supported for GNU compatibility: alloc, load, noload, "
- "readonly, debug, code, data, rom, share, contents, merge, strings.">,
+ "readonly, exclude, debug, code, data, rom, share, contents, merge, "
+ "strings.">,
MetaVarName<"old=new[,flag1,...]">;
defm redefine_symbol
: Eq<"redefine-sym", "Change the name of a symbol old to new">,
@@ -82,8 +83,8 @@ defm set_section_alignment
defm set_section_flags
: Eq<"set-section-flags",
"Set section flags for a given section. Flags supported for GNU "
- "compatibility: alloc, load, noload, readonly, debug, code, data, "
- "rom, share, contents, merge, strings.">,
+ "compatibility: alloc, load, noload, readonly, exclude, debug, code, "
+ "data, rom, share, contents, merge, strings.">,
MetaVarName<"section=flag1[,flag2,...]">;
def S : Flag<["-"], "S">,
@@ -214,3 +215,7 @@ defm add_symbol
"compatibility: debug, constructor, warning, indirect, synthetic, "
"unique-object, before.">,
MetaVarName<"name=[section:]value[,flags]">;
+
+defm update_section
+ : Eq<"update-section", "Add section <name> with contents from a file <file>.">,
+ MetaVarName<"name=file">;
diff --git a/llvm/tools/llvm-objdump/COFFDump.cpp b/llvm/tools/llvm-objdump/COFFDump.cpp
index 09a900182d24..32fdd1a4d5c3 100644
--- a/llvm/tools/llvm-objdump/COFFDump.cpp
+++ b/llvm/tools/llvm-objdump/COFFDump.cpp
@@ -31,6 +31,159 @@ using namespace llvm::objdump;
using namespace llvm::object;
using namespace llvm::Win64EH;
+namespace {
+template <typename T> struct EnumEntry {
+ T Value;
+ StringRef Name;
+};
+
+class COFFDumper {
+public:
+ explicit COFFDumper(const llvm::object::COFFObjectFile &Obj) : Obj(Obj) {
+ Is64 = !Obj.getPE32Header();
+ }
+
+ template <class PEHeader> void printPEHeader(const PEHeader &Hdr) const;
+
+private:
+ template <typename T> FormattedNumber formatAddr(T V) const {
+ return format_hex_no_prefix(V, Is64 ? 16 : 8);
+ }
+
+ uint32_t getBaseOfData(const void *Hdr) const {
+ return Is64 ? 0 : static_cast<const pe32_header *>(Hdr)->BaseOfData;
+ }
+
+ const llvm::object::COFFObjectFile &Obj;
+ bool Is64;
+};
+} // namespace
+
+constexpr EnumEntry<uint16_t> PEHeaderMagic[] = {
+ {uint16_t(COFF::PE32Header::PE32), "PE32"},
+ {uint16_t(COFF::PE32Header::PE32_PLUS), "PE32+"},
+};
+
+constexpr EnumEntry<COFF::WindowsSubsystem> PEWindowsSubsystem[] = {
+ {COFF::IMAGE_SUBSYSTEM_UNKNOWN, "unspecified"},
+ {COFF::IMAGE_SUBSYSTEM_NATIVE, "NT native"},
+ {COFF::IMAGE_SUBSYSTEM_WINDOWS_GUI, "Windows GUI"},
+ {COFF::IMAGE_SUBSYSTEM_WINDOWS_CUI, "Windows CUI"},
+ {COFF::IMAGE_SUBSYSTEM_POSIX_CUI, "POSIX CUI"},
+ {COFF::IMAGE_SUBSYSTEM_WINDOWS_CE_GUI, "Wince CUI"},
+ {COFF::IMAGE_SUBSYSTEM_EFI_APPLICATION, "EFI application"},
+ {COFF::IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER, "EFI boot service driver"},
+ {COFF::IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER, "EFI runtime driver"},
+ {COFF::IMAGE_SUBSYSTEM_EFI_ROM, "SAL runtime driver"},
+ {COFF::IMAGE_SUBSYSTEM_XBOX, "XBOX"},
+};
+
+template <typename T, typename TEnum>
+static void printOptionalEnumName(T Value,
+ ArrayRef<EnumEntry<TEnum>> EnumValues) {
+ for (const EnumEntry<TEnum> &I : EnumValues)
+ if (I.Value == Value) {
+ outs() << "\t(" << I.Name << ')';
+ return;
+ }
+}
+
+template <class PEHeader>
+void COFFDumper::printPEHeader(const PEHeader &Hdr) const {
+ auto print = [](const char *K, auto V, const char *Fmt = "%d\n") {
+ outs() << format("%-23s ", K) << format(Fmt, V);
+ };
+ auto printU16 = [&](const char *K, support::ulittle16_t V,
+ const char *Fmt = "%d\n") { print(K, uint16_t(V), Fmt); };
+ auto printU32 = [&](const char *K, support::ulittle32_t V,
+ const char *Fmt = "%d\n") { print(K, uint32_t(V), Fmt); };
+ auto printAddr = [=](const char *K, uint64_t V) {
+ outs() << format("%-23s ", K) << formatAddr(V) << '\n';
+ };
+
+ printU16("Magic", Hdr.Magic, "%04x");
+ printOptionalEnumName(Hdr.Magic, makeArrayRef(PEHeaderMagic));
+ outs() << '\n';
+ print("MajorLinkerVersion", Hdr.MajorLinkerVersion);
+ print("MinorLinkerVersion", Hdr.MinorLinkerVersion);
+ printAddr("SizeOfCode", Hdr.SizeOfCode);
+ printAddr("SizeOfInitializedData", Hdr.SizeOfInitializedData);
+ printAddr("SizeOfUninitializedData", Hdr.SizeOfUninitializedData);
+ printAddr("AddressOfEntryPoint", Hdr.AddressOfEntryPoint);
+ printAddr("BaseOfCode", Hdr.BaseOfCode);
+ if (!Is64)
+ printAddr("BaseOfData", getBaseOfData(&Hdr));
+ printAddr("ImageBase", Hdr.ImageBase);
+ printU32("SectionAlignment", Hdr.SectionAlignment, "%08x\n");
+ printU32("FileAlignment", Hdr.FileAlignment, "%08x\n");
+ printU16("MajorOSystemVersion", Hdr.MajorOperatingSystemVersion);
+ printU16("MinorOSystemVersion", Hdr.MinorOperatingSystemVersion);
+ printU16("MajorImageVersion", Hdr.MajorImageVersion);
+ printU16("MinorImageVersion", Hdr.MinorImageVersion);
+ printU16("MajorSubsystemVersion", Hdr.MajorSubsystemVersion);
+ printU16("MinorSubsystemVersion", Hdr.MinorSubsystemVersion);
+ printU32("Win32Version", Hdr.Win32VersionValue, "%08x\n");
+ printU32("SizeOfImage", Hdr.SizeOfImage, "%08x\n");
+ printU32("SizeOfHeaders", Hdr.SizeOfHeaders, "%08x\n");
+ printU32("CheckSum", Hdr.CheckSum, "%08x\n");
+ printU16("Subsystem", Hdr.Subsystem, "%08x");
+ printOptionalEnumName(Hdr.Subsystem, makeArrayRef(PEWindowsSubsystem));
+ outs() << '\n';
+
+ printU16("DllCharacteristics", Hdr.DLLCharacteristics, "%08x\n");
+#define FLAG(Name) \
+ if (Hdr.DLLCharacteristics & COFF::IMAGE_DLL_CHARACTERISTICS_##Name) \
+ outs() << "\t\t\t\t\t" << #Name << '\n';
+ FLAG(HIGH_ENTROPY_VA);
+ FLAG(DYNAMIC_BASE);
+ FLAG(FORCE_INTEGRITY);
+ FLAG(NX_COMPAT);
+ FLAG(NO_ISOLATION);
+ FLAG(NO_SEH);
+ FLAG(NO_BIND);
+ FLAG(APPCONTAINER);
+ FLAG(WDM_DRIVER);
+ FLAG(GUARD_CF);
+ FLAG(TERMINAL_SERVER_AWARE);
+#undef FLAG
+
+ printAddr("SizeOfStackReserve", Hdr.SizeOfStackReserve);
+ printAddr("SizeOfStackCommit", Hdr.SizeOfStackCommit);
+ printAddr("SizeOfHeapReserve", Hdr.SizeOfHeapReserve);
+ printAddr("SizeOfHeapCommit", Hdr.SizeOfHeapCommit);
+ printU32("LoaderFlags", Hdr.LoaderFlags, "%08x\n");
+ printU32("NumberOfRvaAndSizes", Hdr.NumberOfRvaAndSize, "%08x\n");
+
+ static const char *DirName[COFF::NUM_DATA_DIRECTORIES + 1] = {
+ "Export Directory [.edata (or where ever we found it)]",
+ "Import Directory [parts of .idata]",
+ "Resource Directory [.rsrc]",
+ "Exception Directory [.pdata]",
+ "Security Directory",
+ "Base Relocation Directory [.reloc]",
+ "Debug Directory",
+ "Description Directory",
+ "Special Directory",
+ "Thread Storage Directory [.tls]",
+ "Load Configuration Directory",
+ "Bound Import Directory",
+ "Import Address Table Directory",
+ "Delay Import Directory",
+ "CLR Runtime Header",
+ "Reserved",
+ };
+ outs() << "\nThe Data Directory\n";
+ for (uint32_t I = 0; I != array_lengthof(DirName); ++I) {
+ uint32_t Addr = 0, Size = 0;
+ if (const data_directory *Data = Obj.getDataDirectory(I)) {
+ Addr = Data->RelativeVirtualAddress;
+ Size = Data->Size;
+ }
+ outs() << format("Entry %x ", I) << formatAddr(Addr)
+ << format(" %08x %s\n", uint32_t(Size), DirName[I]);
+ }
+}
+
// Returns the name of the unwind code.
static StringRef getUnwindCodeTypeName(uint8_t Code) {
switch(Code) {
@@ -278,10 +431,7 @@ static void printTLSDirectory(const COFFObjectFile *Obj) {
return;
const data_directory *DataDir = Obj->getDataDirectory(COFF::TLS_TABLE);
- if (!DataDir)
- reportError("missing data dir for TLS table", Obj->getFileName());
-
- if (DataDir->RelativeVirtualAddress == 0)
+ if (!DataDir || DataDir->RelativeVirtualAddress == 0)
return;
uintptr_t IntPtr = 0;
@@ -625,12 +775,47 @@ void objdump::printCOFFUnwindInfo(const COFFObjectFile *Obj) {
}
}
-void objdump::printCOFFFileHeader(const object::ObjectFile *Obj) {
- const COFFObjectFile *file = dyn_cast<const COFFObjectFile>(Obj);
- printTLSDirectory(file);
- printLoadConfiguration(file);
- printImportTables(file);
- printExportTable(file);
+void objdump::printCOFFFileHeader(const COFFObjectFile &Obj) {
+ COFFDumper CD(Obj);
+ const uint16_t Cha = Obj.getCharacteristics();
+ outs() << "Characteristics 0x" << Twine::utohexstr(Cha) << '\n';
+#define FLAG(F, Name) \
+ if (Cha & F) \
+ outs() << '\t' << Name << '\n';
+ FLAG(COFF::IMAGE_FILE_RELOCS_STRIPPED, "relocations stripped");
+ FLAG(COFF::IMAGE_FILE_EXECUTABLE_IMAGE, "executable");
+ FLAG(COFF::IMAGE_FILE_LINE_NUMS_STRIPPED, "line numbers stripped");
+ FLAG(COFF::IMAGE_FILE_LOCAL_SYMS_STRIPPED, "symbols stripped");
+ FLAG(COFF::IMAGE_FILE_LARGE_ADDRESS_AWARE, "large address aware");
+ FLAG(COFF::IMAGE_FILE_BYTES_REVERSED_LO, "little endian");
+ FLAG(COFF::IMAGE_FILE_32BIT_MACHINE, "32 bit words");
+ FLAG(COFF::IMAGE_FILE_DEBUG_STRIPPED, "debugging information removed");
+ FLAG(COFF::IMAGE_FILE_REMOVABLE_RUN_FROM_SWAP,
+ "copy to swap file if on removable media");
+ FLAG(COFF::IMAGE_FILE_NET_RUN_FROM_SWAP,
+ "copy to swap file if on network media");
+ FLAG(COFF::IMAGE_FILE_SYSTEM, "system file");
+ FLAG(COFF::IMAGE_FILE_DLL, "DLL");
+ FLAG(COFF::IMAGE_FILE_UP_SYSTEM_ONLY, "run only on uniprocessor machine");
+ FLAG(COFF::IMAGE_FILE_BYTES_REVERSED_HI, "big endian");
+#undef FLAG
+
+ // TODO Support PE_IMAGE_DEBUG_TYPE_REPRO.
+ // Since ctime(3) returns a 26 character string of the form:
+ // "Sun Sep 16 01:03:52 1973\n\0"
+ // just print 24 characters.
+ const time_t Timestamp = Obj.getTimeDateStamp();
+ outs() << format("\nTime/Date %.24s\n", ctime(&Timestamp));
+
+ if (const pe32_header *Hdr = Obj.getPE32Header())
+ CD.printPEHeader<pe32_header>(*Hdr);
+ else if (const pe32plus_header *Hdr = Obj.getPE32PlusHeader())
+ CD.printPEHeader<pe32plus_header>(*Hdr);
+
+ printTLSDirectory(&Obj);
+ printLoadConfiguration(&Obj);
+ printImportTables(&Obj);
+ printExportTable(&Obj);
}
void objdump::printCOFFSymbolTable(const object::COFFImportFile *i) {
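
The COFFDumper added above drives its header printing off small EnumEntry tables plus printOptionalEnumName, which prints a parenthesized symbolic name only when the raw value is recognized. The sketch below is a standalone illustration of that lookup pattern; the table contents are illustrative, not the dumper's actual data.

// Minimal sketch of the EnumEntry lookup pattern: map a raw header field to
// its symbolic name, printing nothing for unknown values.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>

namespace {
template <typename T> struct EnumEntry {
  T Value;
  llvm::StringRef Name;
};
} // namespace

static void printMagicName(uint16_t Magic) {
  static const EnumEntry<uint16_t> Names[] = {{0x10b, "PE32"},
                                              {0x20b, "PE32+"}};
  for (const EnumEntry<uint16_t> &E : llvm::makeArrayRef(Names))
    if (E.Value == Magic) {
      llvm::outs() << "\t(" << E.Name << ")\n";
      return;
    }
}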
diff --git a/llvm/tools/llvm-objdump/COFFDump.h b/llvm/tools/llvm-objdump/COFFDump.h
index 21f97bdeb83c..f933f79523a0 100644
--- a/llvm/tools/llvm-objdump/COFFDump.h
+++ b/llvm/tools/llvm-objdump/COFFDump.h
@@ -28,7 +28,7 @@ Error getCOFFRelocationValueString(const object::COFFObjectFile *Obj,
llvm::SmallVectorImpl<char> &Result);
void printCOFFUnwindInfo(const object::COFFObjectFile *O);
-void printCOFFFileHeader(const object::ObjectFile *O);
+void printCOFFFileHeader(const object::COFFObjectFile &Obj);
void printCOFFSymbolTable(const object::COFFImportFile *I);
void printCOFFSymbolTable(const object::COFFObjectFile *O);
} // namespace objdump
diff --git a/llvm/tools/llvm-objdump/ELFDump.cpp b/llvm/tools/llvm-objdump/ELFDump.cpp
index da7415834c63..98e71497d022 100644
--- a/llvm/tools/llvm-objdump/ELFDump.cpp
+++ b/llvm/tools/llvm-objdump/ELFDump.cpp
@@ -145,7 +145,7 @@ static uint64_t getSectionLMA(const ELFFile<ELFT> &Obj,
const object::ELFSectionRef &Sec) {
auto PhdrRangeOrErr = Obj.program_headers();
if (!PhdrRangeOrErr)
- report_fatal_error(toString(PhdrRangeOrErr.takeError()));
+ report_fatal_error(Twine(toString(PhdrRangeOrErr.takeError())));
// Search for a PT_LOAD segment containing the requested section. Use this
// segment's p_addr to calculate the section's LMA.
diff --git a/llvm/tools/llvm-objdump/MachODump.cpp b/llvm/tools/llvm-objdump/MachODump.cpp
index 7c1fdf03542f..b0cf1f775ced 100644
--- a/llvm/tools/llvm-objdump/MachODump.cpp
+++ b/llvm/tools/llvm-objdump/MachODump.cpp
@@ -33,6 +33,7 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Object/MachO.h"
#include "llvm/Object/MachOUniversal.h"
#include "llvm/Option/ArgList.h"
@@ -44,7 +45,6 @@
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Support/WithColor.h"
@@ -10053,6 +10053,10 @@ static void PrintLinkEditDataCommand(MachO::linkedit_data_command ld,
outs() << " cmd LC_DYLIB_CODE_SIGN_DRS\n";
else if (ld.cmd == MachO::LC_LINKER_OPTIMIZATION_HINT)
outs() << " cmd LC_LINKER_OPTIMIZATION_HINT\n";
+ else if (ld.cmd == MachO::LC_DYLD_EXPORTS_TRIE)
+ outs() << " cmd LC_DYLD_EXPORTS_TRIE\n";
+ else if (ld.cmd == MachO::LC_DYLD_CHAINED_FIXUPS)
+ outs() << " cmd LC_DYLD_CHAINED_FIXUPS\n";
else
outs() << " cmd " << ld.cmd << " (?)\n";
outs() << " cmdsize " << ld.cmdsize;
@@ -10196,7 +10200,9 @@ static void PrintLoadCommands(const MachOObjectFile *Obj, uint32_t filetype,
Command.C.cmd == MachO::LC_FUNCTION_STARTS ||
Command.C.cmd == MachO::LC_DATA_IN_CODE ||
Command.C.cmd == MachO::LC_DYLIB_CODE_SIGN_DRS ||
- Command.C.cmd == MachO::LC_LINKER_OPTIMIZATION_HINT) {
+ Command.C.cmd == MachO::LC_LINKER_OPTIMIZATION_HINT ||
+ Command.C.cmd == MachO::LC_DYLD_EXPORTS_TRIE ||
+ Command.C.cmd == MachO::LC_DYLD_CHAINED_FIXUPS) {
MachO::linkedit_data_command Ld =
Obj->getLinkeditDataLoadCommand(Command);
PrintLinkEditDataCommand(Ld, Buf.size());
diff --git a/llvm/tools/llvm-objdump/ObjdumpOpts.td b/llvm/tools/llvm-objdump/ObjdumpOpts.td
index 1b19733c65d0..9f27a6cdf163 100644
--- a/llvm/tools/llvm-objdump/ObjdumpOpts.td
+++ b/llvm/tools/llvm-objdump/ObjdumpOpts.td
@@ -1,5 +1,12 @@
include "llvm/Option/OptParser.td"
+multiclass Eq<string name, string help> {
+ def NAME : Separate<["--"], name>;
+ def NAME #_eq : Joined<["--"], name #"=">,
+ Alias<!cast<Separate>(NAME)>,
+ HelpText<help>;
+}
+
def help : Flag<["--"], "help">,
HelpText<"Display available options (--help-hidden for more)">;
@@ -16,7 +23,8 @@ def adjust_vma_EQ : Joined<["--"], "adjust-vma=">,
HelpText<"Increase the displayed address by the specified offset">;
def all_headers : Flag<["--"], "all-headers">,
- HelpText<"Display all available header information">;
+ HelpText<"Display all available header information, "
+ "relocation entries and the symbol table">;
def : Flag<["-"], "x">, Alias<all_headers>, HelpText<"Alias for --all-headers">;
def arch_name_EQ : Joined<["--"], "arch-name=">,
@@ -32,11 +40,11 @@ def demangle : Flag<["--"], "demangle">, HelpText<"Demangle symbol names">;
def : Flag<["-"], "C">, Alias<demangle>, HelpText<"Alias for --demangle">;
def disassemble : Flag<["--"], "disassemble">,
- HelpText<"Display assembler mnemonics for the machine instructions">;
+ HelpText<"Disassemble all executable sections found in the input files">;
def : Flag<["-"], "d">, Alias<disassemble>, HelpText<"Alias for --disassemble">;
def disassemble_all : Flag<["--"], "disassemble-all">,
- HelpText<"Display assembler mnemonics for the machine instructions">;
+ HelpText<"Disassemble all sections found in the input files">;
def : Flag<["-"], "D">, Alias<disassemble_all>,
HelpText<"Alias for --disassemble-all">;
@@ -66,10 +74,12 @@ def : Flag<["-"], "R">, Alias<dynamic_reloc>,
HelpText<"Alias for --dynamic-reloc">;
def dwarf_EQ : Joined<["--"], "dwarf=">,
- HelpText<"Dump of dwarf debug sections">, Values<"frames">;
+ HelpText<"Dump the specified DWARF debug sections. The "
+ "only supported value is 'frames'">,
+ Values<"frames">;
def fault_map_section : Flag<["--"], "fault-map-section">,
- HelpText<"Display contents of faultmap section">;
+ HelpText<"Display the content of the fault map section">;
def file_headers : Flag<["--"], "file-headers">,
HelpText<"Display the contents of the overall file header">;
@@ -82,9 +92,10 @@ def : Flag<["-"], "s">, Alias<full_contents>,
HelpText<"Alias for --full-contents">;
def line_numbers : Flag<["--"], "line-numbers">,
- HelpText<"Display source line numbers with "
- "disassembly. Implies disassemble object">;
-def : Flag<["-"], "l">, Alias<line_numbers>,
+ HelpText<"When disassembling, display source line numbers. "
+ "Implies --disassemble">;
+def : Flag<["-"], "l">,
+ Alias<line_numbers>,
HelpText<"Alias for --line-numbers">;
def macho : Flag<["--"], "macho">,
@@ -104,7 +115,7 @@ def no_show_raw_insn : Flag<["--"], "no-show-raw-insn">,
"do not print the instruction bytes.">;
def no_leading_addr : Flag<["--"], "no-leading-addr">,
- HelpText<"Print no leading address">;
+ HelpText<"When disassembling, do not print leading addresses">;
def raw_clang_ast : Flag<["--"], "raw-clang-ast">,
HelpText<"Dump the raw binary contents of the clang AST section">;
@@ -143,15 +154,18 @@ def show_lma : Flag<["--"], "show-lma">,
HelpText<"Display LMA column when dumping ELF section headers">;
def source : Flag<["--"], "source">,
- HelpText<"Display source inlined with disassembly. Implies disassemble object">;
+ HelpText<"When disassembling, display source interleaved with the "
+ "disassembly. Implies --disassemble">;
def : Flag<["-"], "S">, Alias<source>, HelpText<"Alias for --source">;
def start_address_EQ : Joined<["--"], "start-address=">,
MetaVarName<"address">,
- HelpText<"Disassemble beginning at address">;
+ HelpText<"Set the start address for disassembling, "
+ "printing relocations and printing symbols">;
def stop_address_EQ : Joined<["--"], "stop-address=">,
MetaVarName<"address">,
- HelpText<"Stop disassembly at address">;
+ HelpText<"Set the stop address for disassembling, "
+ "printing relocations and printing symbols">;
def syms : Flag<["--"], "syms">,
HelpText<"Display the symbol table">;
@@ -180,19 +194,19 @@ def wide : Flag<["--"], "wide">,
HelpText<"Ignored for compatibility with GNU objdump">;
def : Flag<["-"], "w">, Alias<wide>;
-def prefix : Separate<["--"], "prefix">,
- HelpText<"Add prefix to absolute paths">;
-
-def prefix_strip : Separate<["--"], "prefix-strip">,
- HelpText<"Strip out initial directories from absolute "
- "paths. No effect without --prefix">;
+defm prefix : Eq<"prefix", "Add prefix to absolute paths">,
+ MetaVarName<"prefix">;
+defm prefix_strip
+ : Eq<"prefix-strip", "Strip out initial directories from absolute "
+ "paths. No effect without --prefix">,
+ MetaVarName<"prefix">;
def debug_vars_EQ : Joined<["--"], "debug-vars=">,
- Values<"unicode,ascii">;
-def : Flag<["--"], "debug-vars">,
HelpText<"Print the locations (in registers or memory) of "
- "source-level variables alongside disassembly">,
- Alias<debug_vars_EQ>, AliasArgs<["unicode"]>;
+ "source-level variables alongside disassembly. "
+ "Supported formats: ascii, unicode (default)">,
+ Values<"unicode,ascii">;
+def : Flag<["--"], "debug-vars">, Alias<debug_vars_EQ>, AliasArgs<["unicode"]>;
def debug_vars_indent_EQ : Joined<["--"], "debug-vars-indent=">,
HelpText<"Distance to indent the source-level variable display, "
diff --git a/llvm/tools/llvm-objdump/XCOFFDump.cpp b/llvm/tools/llvm-objdump/XCOFFDump.cpp
index c4cc5fe7e21c..b8fb2ed3d063 100644
--- a/llvm/tools/llvm-objdump/XCOFFDump.cpp
+++ b/llvm/tools/llvm-objdump/XCOFFDump.cpp
@@ -58,6 +58,24 @@ objdump::getXCOFFSymbolCsectSMC(const XCOFFObjectFile *Obj,
return CsectAuxEntOrErr.get().getStorageMappingClass();
}
+Optional<object::SymbolRef>
+objdump::getXCOFFSymbolContainingSymbolRef(const XCOFFObjectFile *Obj,
+ const SymbolRef &Sym) {
+
+ const XCOFFSymbolRef SymRef = Obj->toSymbolRef(Sym.getRawDataRefImpl());
+ if (!SymRef.isCsectSymbol())
+ return None;
+
+ Expected<XCOFFCsectAuxRef> CsectAuxEntOrErr = SymRef.getXCOFFCsectAuxRef();
+ if (!CsectAuxEntOrErr || !CsectAuxEntOrErr.get().isLabel())
+ return None;
+ uint32_t Idx =
+ static_cast<uint32_t>(CsectAuxEntOrErr.get().getSectionOrLength());
+ DataRefImpl DRI;
+ DRI.p = Obj->getSymbolByIndex(Idx);
+ return SymbolRef(DRI, Obj);
+}
+
bool objdump::isLabel(const XCOFFObjectFile *Obj, const SymbolRef &Sym) {
const XCOFFSymbolRef SymRef = Obj->toSymbolRef(Sym.getRawDataRefImpl());
diff --git a/llvm/tools/llvm-objdump/XCOFFDump.h b/llvm/tools/llvm-objdump/XCOFFDump.h
index dbf520021594..6796f00aef6f 100644
--- a/llvm/tools/llvm-objdump/XCOFFDump.h
+++ b/llvm/tools/llvm-objdump/XCOFFDump.h
@@ -20,6 +20,10 @@ Optional<XCOFF::StorageMappingClass>
getXCOFFSymbolCsectSMC(const object::XCOFFObjectFile *Obj,
const object::SymbolRef &Sym);
+Optional<object::SymbolRef>
+getXCOFFSymbolContainingSymbolRef(const object::XCOFFObjectFile *Obj,
+ const object::SymbolRef &Sym);
+
bool isLabel(const object::XCOFFObjectFile *Obj, const object::SymbolRef &Sym);
std::string getXCOFFSymbolDescription(const SymbolInfoTy &SymbolInfo,
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index 48ae92f734c7..6f6f543f2f47 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -47,6 +47,7 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/COFF.h"
#include "llvm/Object/COFFImportFile.h"
@@ -71,7 +72,6 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/StringSaver.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
@@ -297,16 +297,15 @@ void objdump::reportWarning(const Twine &Message, StringRef File) {
<< "'" << File << "': " << Message << "\n";
}
-LLVM_ATTRIBUTE_NORETURN void objdump::reportError(StringRef File,
- const Twine &Message) {
+[[noreturn]] void objdump::reportError(StringRef File, const Twine &Message) {
outs().flush();
WithColor::error(errs(), ToolName) << "'" << File << "': " << Message << "\n";
exit(1);
}
-LLVM_ATTRIBUTE_NORETURN void objdump::reportError(Error E, StringRef FileName,
- StringRef ArchiveName,
- StringRef ArchitectureName) {
+[[noreturn]] void objdump::reportError(Error E, StringRef FileName,
+ StringRef ArchiveName,
+ StringRef ArchitectureName) {
assert(E);
outs().flush();
WithColor::error(errs(), ToolName);
@@ -325,7 +324,7 @@ static void reportCmdLineWarning(const Twine &Message) {
WithColor::warning(errs(), ToolName) << Message << "\n";
}
-LLVM_ATTRIBUTE_NORETURN static void reportCmdLineError(const Twine &Message) {
+[[noreturn]] static void reportCmdLineError(const Twine &Message) {
WithColor::error(errs(), ToolName) << Message << "\n";
exit(1);
}
@@ -1286,6 +1285,10 @@ static void disassembleObject(const Target *TheTarget, const ObjectFile *Obj,
if (shouldAdjustVA(Section))
VMAAdjustment = AdjustVMA;
+ // In executable and shared objects, r_offset holds a virtual address.
+ // Subtract SectionAddr from the r_offset field of a relocation to get
+ // the section offset.
+ uint64_t RelAdjustment = Obj->isRelocatableObject() ? 0 : SectionAddr;
uint64_t Size;
uint64_t Index;
bool PrintedSection = false;
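
A tiny worked example of the RelAdjustment introduced above, using hypothetical addresses: in ET_EXEC/ET_DYN objects r_offset is a virtual address, so the section's load address must be subtracted before the offset can be compared with the byte index into the section.

// Hypothetical values only; demonstrates the virtual-address-to-section-offset
// adjustment applied to relocation offsets for non-relocatable objects.
#include <cstdint>

int main() {
  const uint64_t SectionAddr = 0x401000;  // assumed sh_addr of .text
  const uint64_t ROffset = 0x401234;      // r_offset of some relocation
  const bool IsRelocatableObject = false; // executable or shared object
  const uint64_t RelAdjustment = IsRelocatableObject ? 0 : SectionAddr;
  const uint64_t SectionOffset = ROffset - RelAdjustment; // 0x234
  return SectionOffset == 0x234 ? 0 : 1;
}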
@@ -1432,7 +1435,8 @@ static void disassembleObject(const Target *TheTarget, const ObjectFile *Obj,
// For --reloc: print zero blocks patched by relocations, so that
// relocations can be shown in the dump.
if (RelCur != RelEnd)
- MaxOffset = RelCur->getOffset() - Index;
+ MaxOffset = std::min(RelCur->getOffset() - RelAdjustment - Index,
+ MaxOffset);
if (size_t N =
countSkippableZeroBytes(Bytes.slice(Index, MaxOffset))) {
@@ -1481,7 +1485,7 @@ static void disassembleObject(const Target *TheTarget, const ObjectFile *Obj,
if (!PrintTarget)
if (Optional<uint64_t> MaybeTarget =
MIA->evaluateMemoryOperandAddress(
- Inst, SectionAddr + Index, Size)) {
+ Inst, STI, SectionAddr + Index, Size)) {
Target = *MaybeTarget;
PrintTarget = true;
// Do not print real address when symbolizing.
@@ -1581,7 +1585,7 @@ static void disassembleObject(const Target *TheTarget, const ObjectFile *Obj,
if (Obj->getArch() != Triple::hexagon) {
// Print relocation for instruction and data.
while (RelCur != RelEnd) {
- uint64_t Offset = RelCur->getOffset();
+ uint64_t Offset = RelCur->getOffset() - RelAdjustment;
// If this relocation is hidden, skip it.
if (getHidden(*RelCur) || SectionAddr + Offset < StartAddress) {
++RelCur;
@@ -1770,7 +1774,9 @@ void objdump::printDynamicRelocations(const ObjectFile *Obj) {
return;
const auto *Elf = dyn_cast<ELFObjectFileBase>(Obj);
- if (!Elf || Elf->getEType() != ELF::ET_DYN) {
+ if (!Elf || !any_of(Elf->sections(), [](const ELFSectionRef Sec) {
+ return Sec.getType() == ELF::SHT_DYNAMIC;
+ })) {
reportError(Obj->getFileName(), "not a dynamic object");
return;
}
@@ -1779,7 +1785,12 @@ void objdump::printDynamicRelocations(const ObjectFile *Obj) {
if (DynRelSec.empty())
return;
- outs() << "DYNAMIC RELOCATION RECORDS\n";
+ outs() << "\nDYNAMIC RELOCATION RECORDS\n";
+ const uint32_t OffsetPadding = (Obj->getBytesInAddress() > 4 ? 16 : 8);
+ const uint32_t TypePadding = 24;
+ outs() << left_justify("OFFSET", OffsetPadding) << ' '
+ << left_justify("TYPE", TypePadding) << " VALUE\n";
+
StringRef Fmt = Obj->getBytesInAddress() > 4 ? "%016" PRIx64 : "%08" PRIx64;
for (const SectionRef &Section : DynRelSec)
for (const RelocationRef &Reloc : Section.relocations()) {
@@ -1789,8 +1800,8 @@ void objdump::printDynamicRelocations(const ObjectFile *Obj) {
Reloc.getTypeName(RelocName);
if (Error E = getRelocationValueString(Reloc, ValueStr))
reportError(std::move(E), Obj->getFileName());
- outs() << format(Fmt.data(), Address) << " " << RelocName << " "
- << ValueStr << "\n";
+ outs() << format(Fmt.data(), Address) << ' '
+ << left_justify(RelocName, TypePadding) << ' ' << ValueStr << '\n';
}
}
@@ -1922,7 +1933,8 @@ void objdump::printSymbolTable(const ObjectFile *O, StringRef ArchiveName,
if (!DumpDynamic) {
outs() << "\nSYMBOL TABLE:\n";
for (auto I = O->symbol_begin(); I != O->symbol_end(); ++I)
- printSymbol(O, *I, FileName, ArchiveName, ArchitectureName, DumpDynamic);
+ printSymbol(O, *I, {}, FileName, ArchiveName, ArchitectureName,
+ DumpDynamic);
return;
}
@@ -1935,12 +1947,21 @@ void objdump::printSymbolTable(const ObjectFile *O, StringRef ArchiveName,
}
const ELFObjectFileBase *ELF = cast<const ELFObjectFileBase>(O);
- for (auto I = ELF->getDynamicSymbolIterators().begin();
- I != ELF->getDynamicSymbolIterators().end(); ++I)
- printSymbol(O, *I, FileName, ArchiveName, ArchitectureName, DumpDynamic);
+ auto Symbols = ELF->getDynamicSymbolIterators();
+ Expected<std::vector<VersionEntry>> SymbolVersionsOrErr =
+ ELF->readDynsymVersions();
+ if (!SymbolVersionsOrErr) {
+ reportWarning(toString(SymbolVersionsOrErr.takeError()), FileName);
+ SymbolVersionsOrErr = std::vector<VersionEntry>();
+ (void)!SymbolVersionsOrErr;
+ }
+ for (auto &Sym : Symbols)
+ printSymbol(O, Sym, *SymbolVersionsOrErr, FileName, ArchiveName,
+ ArchitectureName, DumpDynamic);
}
void objdump::printSymbol(const ObjectFile *O, const SymbolRef &Symbol,
+ ArrayRef<VersionEntry> SymbolVersions,
StringRef FileName, StringRef ArchiveName,
StringRef ArchitectureName, bool DumpDynamic) {
const MachOObjectFile *MachO = dyn_cast<const MachOObjectFile>(O);
@@ -2029,22 +2050,66 @@ void objdump::printSymbol(const ObjectFile *O, const SymbolRef &Symbol,
} else if (Common) {
outs() << "*COM*";
} else if (Section == O->section_end()) {
- outs() << "*UND*";
+ if (O->isXCOFF()) {
+ XCOFFSymbolRef XCOFFSym = dyn_cast<const XCOFFObjectFile>(O)->toSymbolRef(
+ Symbol.getRawDataRefImpl());
+ if (XCOFF::N_DEBUG == XCOFFSym.getSectionNumber())
+ outs() << "*DEBUG*";
+ else
+ outs() << "*UND*";
+ } else
+ outs() << "*UND*";
} else {
StringRef SegmentName = getSegmentName(MachO, *Section);
if (!SegmentName.empty())
outs() << SegmentName << ",";
StringRef SectionName = unwrapOrError(Section->getName(), FileName);
outs() << SectionName;
- }
+ if (O->isXCOFF()) {
+ Optional<SymbolRef> SymRef = getXCOFFSymbolContainingSymbolRef(
+ dyn_cast<const XCOFFObjectFile>(O), Symbol);
+ if (SymRef) {
- if (Common || O->isELF()) {
- uint64_t Val =
- Common ? Symbol.getAlignment() : ELFSymbolRef(Symbol).getSize();
- outs() << '\t' << format(Fmt, Val);
+ Expected<StringRef> NameOrErr = SymRef.getValue().getName();
+
+ if (NameOrErr) {
+ outs() << " (csect:";
+ std::string SymName(NameOrErr.get());
+
+ if (Demangle)
+ SymName = demangle(SymName);
+
+ if (SymbolDescription)
+ SymName = getXCOFFSymbolDescription(
+ createSymbolInfo(O, SymRef.getValue()), SymName);
+
+ outs() << ' ' << SymName;
+ outs() << ") ";
+ } else
+ reportWarning(toString(NameOrErr.takeError()), FileName);
+ }
+ }
}
+ if (Common)
+ outs() << '\t' << format(Fmt, static_cast<uint64_t>(Symbol.getAlignment()));
+ else if (O->isXCOFF())
+ outs() << '\t'
+ << format(Fmt, dyn_cast<const XCOFFObjectFile>(O)->getSymbolSize(
+ Symbol.getRawDataRefImpl()));
+ else if (O->isELF())
+ outs() << '\t' << format(Fmt, ELFSymbolRef(Symbol).getSize());
+
if (O->isELF()) {
+ if (!SymbolVersions.empty()) {
+ const VersionEntry &Ver =
+ SymbolVersions[Symbol.getRawDataRefImpl().d.b - 1];
+ std::string Str;
+ if (!Ver.Name.empty())
+ Str = Ver.IsVerDef ? ' ' + Ver.Name : '(' + Ver.Name + ')';
+ outs() << ' ' << left_justify(Str, 12);
+ }
+
uint8_t Other = ELFSymbolRef(Symbol).getOther();
switch (Other) {
case ELF::STV_DEFAULT:
@@ -2066,10 +2131,14 @@ void objdump::printSymbol(const ObjectFile *O, const SymbolRef &Symbol,
outs() << " .hidden";
}
+ std::string SymName(Name);
if (Demangle)
- outs() << ' ' << demangle(std::string(Name)) << '\n';
- else
- outs() << ' ' << Name << '\n';
+ SymName = demangle(SymName);
+
+ if (O->isXCOFF() && SymbolDescription)
+ SymName = getXCOFFSymbolDescription(createSymbolInfo(O, Symbol), SymName);
+
+ outs() << ' ' << SymName << '\n';
}
static void printUnwindInfo(const ObjectFile *O) {
@@ -2176,7 +2245,7 @@ static void printPrivateFileHeaders(const ObjectFile *O, bool OnlyFirst) {
return;
}
if (O->isCOFF())
- return printCOFFFileHeader(O);
+ return printCOFFFileHeader(cast<object::COFFObjectFile>(*O));
if (O->isWasm())
return printWasmFileHeader(O);
if (O->isMachO()) {
@@ -2431,6 +2500,11 @@ static void parseIntArg(const llvm::opt::InputArgList &InputArgs, int ID,
}
}
+static void invalidArgValue(const opt::Arg *A) {
+ reportCmdLineError("'" + StringRef(A->getValue()) +
+ "' is not a valid value for '" + A->getSpelling() + "'");
+}
+
static std::vector<std::string>
commaSeparatedValues(const llvm::opt::InputArgList &InputArgs, int ID) {
std::vector<std::string> Values;
@@ -2504,8 +2578,11 @@ static void parseObjdumpOptions(const llvm::opt::InputArgList &InputArgs) {
commaSeparatedValues(InputArgs, OBJDUMP_disassemble_symbols_EQ);
DisassembleZeroes = InputArgs.hasArg(OBJDUMP_disassemble_zeroes);
if (const opt::Arg *A = InputArgs.getLastArg(OBJDUMP_dwarf_EQ)) {
- DwarfDumpType =
- StringSwitch<DIDumpType>(A->getValue()).Case("frames", DIDT_DebugFrame);
+ DwarfDumpType = StringSwitch<DIDumpType>(A->getValue())
+ .Case("frames", DIDT_DebugFrame)
+ .Default(DIDT_Null);
+ if (DwarfDumpType == DIDT_Null)
+ invalidArgValue(A);
}
DynamicRelocations = InputArgs.hasArg(OBJDUMP_dynamic_reloc);
FaultMapSection = InputArgs.hasArg(OBJDUMP_fault_map_section);
@@ -2542,7 +2619,10 @@ static void parseObjdumpOptions(const llvm::opt::InputArgList &InputArgs) {
if (const opt::Arg *A = InputArgs.getLastArg(OBJDUMP_debug_vars_EQ)) {
DbgVariables = StringSwitch<DebugVarsFormat>(A->getValue())
.Case("ascii", DVASCII)
- .Case("unicode", DVUnicode);
+ .Case("unicode", DVUnicode)
+ .Default(DVInvalid);
+ if (DbgVariables == DVInvalid)
+ invalidArgValue(A);
}
parseIntArg(InputArgs, OBJDUMP_debug_vars_indent_EQ, DbgIndent);
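
The symbol-version printing added to printSymbol above formats defined versions and required versions differently. The helper below is only a sketch of that formatting rule as it appears in the hunk (defined versions as a bare suffix, required versions in parentheses); the function itself is hypothetical.

// Sketch of the version-suffix formatting used in printSymbol.
#include <string>

static std::string formatVersionSuffix(const std::string &Name,
                                       bool IsVerDef) {
  if (Name.empty())
    return std::string();
  return IsVerDef ? ' ' + Name : '(' + Name + ')';
}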
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.h b/llvm/tools/llvm-objdump/llvm-objdump.h
index 33fb3f207f8e..864a9920efbe 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.h
+++ b/llvm/tools/llvm-objdump/llvm-objdump.h
@@ -26,15 +26,12 @@ class ELFSectionRef;
class MachOObjectFile;
class MachOUniversalBinary;
class RelocationRef;
+struct VersionEntry;
} // namespace object
namespace objdump {
-enum DebugVarsFormat {
- DVDisabled,
- DVUnicode,
- DVASCII,
-};
+enum DebugVarsFormat { DVDisabled, DVUnicode, DVASCII, DVInvalid };
extern bool ArchiveHeaders;
extern int DbgIndent;
@@ -137,12 +134,13 @@ void printSymbolTable(const object::ObjectFile *O, StringRef ArchiveName,
StringRef ArchitectureName = StringRef(),
bool DumpDynamic = false);
void printSymbol(const object::ObjectFile *O, const object::SymbolRef &Symbol,
+ ArrayRef<object::VersionEntry> SymbolVersions,
StringRef FileName, StringRef ArchiveName,
StringRef ArchitectureName, bool DumpDynamic);
-LLVM_ATTRIBUTE_NORETURN void reportError(StringRef File, const Twine &Message);
-LLVM_ATTRIBUTE_NORETURN void reportError(Error E, StringRef FileName,
- StringRef ArchiveName = "",
- StringRef ArchitectureName = "");
+[[noreturn]] void reportError(StringRef File, const Twine &Message);
+[[noreturn]] void reportError(Error E, StringRef FileName,
+ StringRef ArchiveName = "",
+ StringRef ArchitectureName = "");
void reportWarning(const Twine &Message, StringRef File);
template <typename T, typename... Ts>
diff --git a/llvm/tools/llvm-pdbutil/ExplainOutputStyle.cpp b/llvm/tools/llvm-pdbutil/ExplainOutputStyle.cpp
index 3d2490509c03..b631bdf8f2b1 100644
--- a/llvm/tools/llvm-pdbutil/ExplainOutputStyle.cpp
+++ b/llvm/tools/llvm-pdbutil/ExplainOutputStyle.cpp
@@ -373,7 +373,7 @@ static void explainDbiModiSubstreamOffset(LinePrinter &P, DbiStream &Dbi,
++Index;
}
- DbiModuleDescriptor &Descriptor = *Prev;
+ const DbiModuleDescriptor &Descriptor = *Prev;
P.formatLine("which contains the descriptor for module {0} ({1}).", Index,
Descriptor.getModuleName());
}
diff --git a/llvm/tools/llvm-pdbutil/LinePrinter.cpp b/llvm/tools/llvm-pdbutil/LinePrinter.cpp
index 280c000bd65f..dd6ca5bf41b1 100644
--- a/llvm/tools/llvm-pdbutil/LinePrinter.cpp
+++ b/llvm/tools/llvm-pdbutil/LinePrinter.cpp
@@ -100,7 +100,7 @@ bool LinePrinter::IsClassExcluded(const ClassLayout &Class) {
}
void LinePrinter::formatBinary(StringRef Label, ArrayRef<uint8_t> Data,
- uint32_t StartOffset) {
+ uint64_t StartOffset) {
NewLine();
OS << Label << " (";
if (!Data.empty()) {
@@ -113,7 +113,7 @@ void LinePrinter::formatBinary(StringRef Label, ArrayRef<uint8_t> Data,
}
void LinePrinter::formatBinary(StringRef Label, ArrayRef<uint8_t> Data,
- uint64_t Base, uint32_t StartOffset) {
+ uint64_t Base, uint64_t StartOffset) {
NewLine();
OS << Label << " (";
if (!Data.empty()) {
@@ -131,7 +131,7 @@ struct Run {
Run() = default;
explicit Run(uint32_t Block) : Block(Block) {}
uint32_t Block = 0;
- uint32_t ByteLen = 0;
+ uint64_t ByteLen = 0;
};
} // namespace
@@ -143,7 +143,7 @@ static std::vector<Run> computeBlockRuns(uint32_t BlockSize,
ArrayRef<support::ulittle32_t> Blocks = Layout.Blocks;
assert(!Blocks.empty());
- uint32_t StreamBytesRemaining = Layout.Length;
+ uint64_t StreamBytesRemaining = Layout.Length;
uint32_t CurrentBlock = Blocks[0];
Runs.emplace_back(CurrentBlock);
while (!Blocks.empty()) {
@@ -153,7 +153,8 @@ static std::vector<Run> computeBlockRuns(uint32_t BlockSize,
Runs.emplace_back(NextBlock);
CurrentRun = &Runs.back();
}
- uint32_t Used = std::min(BlockSize, StreamBytesRemaining);
+ uint64_t Used =
+ std::min(static_cast<uint64_t>(BlockSize), StreamBytesRemaining);
CurrentRun->ByteLen += Used;
StreamBytesRemaining -= Used;
CurrentBlock = NextBlock;
@@ -162,7 +163,7 @@ static std::vector<Run> computeBlockRuns(uint32_t BlockSize,
return Runs;
}
-static std::pair<Run, uint32_t> findRun(uint32_t Offset, ArrayRef<Run> Runs) {
+static std::pair<Run, uint64_t> findRun(uint64_t Offset, ArrayRef<Run> Runs) {
for (const auto &R : Runs) {
if (Offset < R.ByteLen)
return std::make_pair(R, Offset);
@@ -173,8 +174,8 @@ static std::pair<Run, uint32_t> findRun(uint32_t Offset, ArrayRef<Run> Runs) {
void LinePrinter::formatMsfStreamData(StringRef Label, PDBFile &File,
uint32_t StreamIdx,
- StringRef StreamPurpose, uint32_t Offset,
- uint32_t Size) {
+ StringRef StreamPurpose, uint64_t Offset,
+ uint64_t Size) {
if (StreamIdx >= File.getNumStreams()) {
formatLine("Stream {0}: Not present", StreamIdx);
return;
@@ -193,7 +194,7 @@ void LinePrinter::formatMsfStreamData(StringRef Label, PDBFile &File,
return;
}
- uint32_t End =
+ uint64_t End =
(Size == 0) ? S->getLength() : std::min(Offset + Size, S->getLength());
Size = End - Offset;
@@ -222,10 +223,10 @@ void LinePrinter::formatMsfStreamData(StringRef Label, PDBFile &File,
OS << "\n";
Run FoundRun;
- uint32_t RunOffset;
+ uint64_t RunOffset;
std::tie(FoundRun, RunOffset) = findRun(Substream.Offset, Runs);
assert(FoundRun.ByteLen >= RunOffset);
- uint32_t Len = FoundRun.ByteLen - RunOffset;
+ uint64_t Len = FoundRun.ByteLen - RunOffset;
Len = std::min(Len, Reader.bytesRemaining());
uint64_t Base = FoundRun.Block * File.getBlockSize() + RunOffset;
ArrayRef<uint8_t> Data;
@@ -246,13 +247,14 @@ void LinePrinter::formatMsfStreamData(StringRef Label, PDBFile &File,
void LinePrinter::formatMsfStreamBlocks(
PDBFile &File, const msf::MSFStreamLayout &StreamLayout) {
auto Blocks = makeArrayRef(StreamLayout.Blocks);
- uint32_t L = StreamLayout.Length;
+ uint64_t L = StreamLayout.Length;
while (L > 0) {
NewLine();
assert(!Blocks.empty());
OS << formatv("Block {0} (\n", uint32_t(Blocks.front()));
- uint32_t UsedBytes = std::min(L, File.getBlockSize());
+ uint64_t UsedBytes =
+ std::min(L, static_cast<uint64_t>(File.getBlockSize()));
ArrayRef<uint8_t> BlockData =
cantFail(File.getBlockData(Blocks.front(), File.getBlockSize()));
uint64_t BaseOffset = Blocks.front();
@@ -267,7 +269,7 @@ void LinePrinter::formatMsfStreamBlocks(
}
}
-bool LinePrinter::IsTypeExcluded(llvm::StringRef TypeName, uint32_t Size) {
+bool LinePrinter::IsTypeExcluded(llvm::StringRef TypeName, uint64_t Size) {
if (IsItemExcluded(TypeName, IncludeTypeFilters, ExcludeTypeFilters))
return true;
if (Size < opts::pretty::SizeThreshold)
diff --git a/llvm/tools/llvm-pdbutil/LinePrinter.h b/llvm/tools/llvm-pdbutil/LinePrinter.h
index 7ecfae17354f..aa8159c0e094 100644
--- a/llvm/tools/llvm-pdbutil/LinePrinter.h
+++ b/llvm/tools/llvm-pdbutil/LinePrinter.h
@@ -49,13 +49,13 @@ public:
}
void formatBinary(StringRef Label, ArrayRef<uint8_t> Data,
- uint32_t StartOffset);
+ uint64_t StartOffset);
void formatBinary(StringRef Label, ArrayRef<uint8_t> Data, uint64_t BaseAddr,
- uint32_t StartOffset);
+ uint64_t StartOffset);
void formatMsfStreamData(StringRef Label, PDBFile &File, uint32_t StreamIdx,
- StringRef StreamPurpose, uint32_t Offset,
- uint32_t Size);
+ StringRef StreamPurpose, uint64_t Offset,
+ uint64_t Size);
void formatMsfStreamData(StringRef Label, PDBFile &File,
const msf::MSFStreamLayout &Stream,
BinarySubstreamRef Substream);
@@ -66,7 +66,7 @@ public:
int getIndentLevel() const { return CurrentIndent; }
bool IsClassExcluded(const ClassLayout &Class);
- bool IsTypeExcluded(llvm::StringRef TypeName, uint32_t Size);
+ bool IsTypeExcluded(llvm::StringRef TypeName, uint64_t Size);
bool IsSymbolExcluded(llvm::StringRef SymbolName);
bool IsCompilandExcluded(llvm::StringRef CompilandName);
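The LinePrinter changes above widen MSF stream offsets, sizes, and byte counts from uint32_t to uint64_t, matching the 64-bit stream lengths they are compared against. Two details the hunks highlight, shown here as a small standalone sketch (the function names are illustrative): std::min needs both operands in the common 64-bit type, and range arithmetic such as Offset + Size should be done in 64 bits so it cannot wrap.

// Mixed 32/64-bit byte accounting without wrap-around.
#include <algorithm>
#include <cstdint>

uint64_t bytesUsedInBlock(uint32_t BlockSize, uint64_t StreamBytesRemaining) {
  // std::min deduces a single type; cast the 32-bit block size up.
  return std::min(static_cast<uint64_t>(BlockSize), StreamBytesRemaining);
}

uint64_t clampRange(uint64_t Offset, uint64_t Size, uint64_t StreamLength) {
  // With 64-bit operands, Offset + Size cannot overflow for realistic inputs.
  return (Size == 0) ? StreamLength : std::min(Offset + Size, StreamLength);
}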
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 66d70120ac9b..fd67cac3cdd2 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -521,9 +521,9 @@ adjustInstrProfile(std::unique_ptr<WriterContext> &WC,
// Find hot/warm functions in sample profile which is cold in instr profile
// and adjust the profiles of those functions in the instr profile.
for (const auto &PD : Reader->getProfiles()) {
- StringRef FName = PD.getKey();
- const sampleprof::FunctionSamples &FS = PD.getValue();
- auto It = InstrProfileMap.find(FName);
+ auto &FContext = PD.first;
+ const sampleprof::FunctionSamples &FS = PD.second;
+ auto It = InstrProfileMap.find(FContext.toString());
if (FS.getHeadSamples() > ColdSampleThreshold &&
It != InstrProfileMap.end() &&
It->second.MaxCount <= ColdInstrThreshold &&
@@ -690,7 +690,7 @@ mergeSampleProfile(const WeightedFileVector &Inputs, SymbolRemapper *Remapper,
bool SampleMergeColdContext, bool SampleTrimColdContext,
bool SampleColdContextFrameDepth, FailureMode FailMode) {
using namespace sampleprof;
- StringMap<FunctionSamples> ProfileMap;
+ SampleProfileMap ProfileMap;
SmallVector<std::unique_ptr<sampleprof::SampleProfileReader>, 5> Readers;
LLVMContext Context;
sampleprof::ProfileSymbolList WriterList;
@@ -716,7 +716,7 @@ mergeSampleProfile(const WeightedFileVector &Inputs, SymbolRemapper *Remapper,
continue;
}
- StringMap<FunctionSamples> &Profiles = Reader->getProfiles();
+ SampleProfileMap &Profiles = Reader->getProfiles();
if (ProfileIsProbeBased.hasValue() &&
ProfileIsProbeBased != FunctionSamples::ProfileIsProbeBased)
exitWithError(
@@ -725,19 +725,19 @@ mergeSampleProfile(const WeightedFileVector &Inputs, SymbolRemapper *Remapper,
if (ProfileIsCS.hasValue() && ProfileIsCS != FunctionSamples::ProfileIsCS)
exitWithError("cannot merge CS profile with non-CS profile");
ProfileIsCS = FunctionSamples::ProfileIsCS;
- for (StringMap<FunctionSamples>::iterator I = Profiles.begin(),
- E = Profiles.end();
+ for (SampleProfileMap::iterator I = Profiles.begin(), E = Profiles.end();
I != E; ++I) {
sampleprof_error Result = sampleprof_error::success;
FunctionSamples Remapped =
Remapper ? remapSamples(I->second, *Remapper, Result)
: FunctionSamples();
FunctionSamples &Samples = Remapper ? Remapped : I->second;
- StringRef FName = Samples.getNameWithContext();
- MergeResult(Result, ProfileMap[FName].merge(Samples, Input.Weight));
+ SampleContext FContext = Samples.getContext();
+ MergeResult(Result, ProfileMap[FContext].merge(Samples, Input.Weight));
if (Result != sampleprof_error::success) {
std::error_code EC = make_error_code(Result);
- handleMergeWriterError(errorCodeToError(EC), Input.Filename, FName);
+ handleMergeWriterError(errorCodeToError(EC), Input.Filename,
+ FContext.toString());
}
}
@@ -759,7 +759,7 @@ mergeSampleProfile(const WeightedFileVector &Inputs, SymbolRemapper *Remapper,
SampleContextTrimmer(ProfileMap)
.trimAndMergeColdContextProfiles(
SampleProfColdThreshold, SampleTrimColdContext,
- SampleMergeColdContext, SampleColdContextFrameDepth);
+ SampleMergeColdContext, SampleColdContextFrameDepth, false);
}
auto WriterOrErr =
@@ -836,7 +836,7 @@ static void parseInputFilenamesFile(MemoryBuffer *Buffer,
if (SanitizedEntry.startswith("#"))
continue;
// If there's no comma, it's an unweighted profile.
- else if (SanitizedEntry.find(',') == StringRef::npos)
+ else if (!SanitizedEntry.contains(','))
addWeightedInput(WFV, {std::string(SanitizedEntry), 1});
else
addWeightedInput(WFV, parseWeightedFile(SanitizedEntry));
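A readability-only change: StringRef::contains subsumes the find() != npos idiom for both characters and substrings. A one-line sketch (isWeighted is a hypothetical helper):

#include "llvm/ADT/StringRef.h"

static bool isWeighted(llvm::StringRef Entry) {
  return Entry.contains(','); // same as Entry.find(',') != StringRef::npos
}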
@@ -1022,8 +1022,8 @@ static void overlapInstrProfile(const std::string &BaseFilename,
namespace {
struct SampleOverlapStats {
- StringRef BaseName;
- StringRef TestName;
+ SampleContext BaseName;
+ SampleContext TestName;
// Number of overlap units
uint64_t OverlapCount;
// Total samples of overlap units
@@ -1226,6 +1226,9 @@ public:
/// Load profiles specified by BaseFilename and TestFilename.
std::error_code loadProfiles();
+ using FuncSampleStatsMap =
+ std::unordered_map<SampleContext, FuncSampleStats, SampleContext::Hash>;
+
private:
SampleOverlapStats ProfOverlap;
SampleOverlapStats HotFuncOverlap;
@@ -1236,8 +1239,8 @@ private:
std::unique_ptr<sampleprof::SampleProfileReader> TestReader;
// BaseStats and TestStats hold FuncSampleStats for each function, with
// function name as the key.
- StringMap<FuncSampleStats> BaseStats;
- StringMap<FuncSampleStats> TestStats;
+ FuncSampleStatsMap BaseStats;
+ FuncSampleStatsMap TestStats;
// Low similarity threshold in floating point number
double LowSimilarityThreshold;
// Block samples above BaseHotThreshold or TestHotThreshold are considered hot
@@ -1276,8 +1279,8 @@ private:
void updateHotBlockOverlap(uint64_t BaseSample, uint64_t TestSample,
uint64_t HotBlockCount);
- void getHotFunctions(const StringMap<FuncSampleStats> &ProfStats,
- StringMap<FuncSampleStats> &HotFunc,
+ void getHotFunctions(const FuncSampleStatsMap &ProfStats,
+ FuncSampleStatsMap &HotFunc,
uint64_t HotThreshold) const;
void computeHotFuncOverlap();
@@ -1381,26 +1384,26 @@ void SampleOverlapAggregator::updateHotBlockOverlap(uint64_t BaseSample,
}
void SampleOverlapAggregator::getHotFunctions(
- const StringMap<FuncSampleStats> &ProfStats,
- StringMap<FuncSampleStats> &HotFunc, uint64_t HotThreshold) const {
+ const FuncSampleStatsMap &ProfStats, FuncSampleStatsMap &HotFunc,
+ uint64_t HotThreshold) const {
for (const auto &F : ProfStats) {
if (isFunctionHot(F.second, HotThreshold))
- HotFunc.try_emplace(F.first(), F.second);
+ HotFunc.emplace(F.first, F.second);
}
}
void SampleOverlapAggregator::computeHotFuncOverlap() {
- StringMap<FuncSampleStats> BaseHotFunc;
+ FuncSampleStatsMap BaseHotFunc;
getHotFunctions(BaseStats, BaseHotFunc, BaseHotThreshold);
HotFuncOverlap.BaseCount = BaseHotFunc.size();
- StringMap<FuncSampleStats> TestHotFunc;
+ FuncSampleStatsMap TestHotFunc;
getHotFunctions(TestStats, TestHotFunc, TestHotThreshold);
HotFuncOverlap.TestCount = TestHotFunc.size();
HotFuncOverlap.UnionCount = HotFuncOverlap.TestCount;
for (const auto &F : BaseHotFunc) {
- if (TestHotFunc.count(F.first()))
+ if (TestHotFunc.count(F.first))
++HotFuncOverlap.OverlapCount;
else
++HotFuncOverlap.UnionCount;
@@ -1612,23 +1615,25 @@ double SampleOverlapAggregator::computeSampleFunctionOverlap(
void SampleOverlapAggregator::computeSampleProfileOverlap(raw_fd_ostream &OS) {
using namespace sampleprof;
- StringMap<const FunctionSamples *> BaseFuncProf;
+ std::unordered_map<SampleContext, const FunctionSamples *,
+ SampleContext::Hash>
+ BaseFuncProf;
const auto &BaseProfiles = BaseReader->getProfiles();
for (const auto &BaseFunc : BaseProfiles) {
- BaseFuncProf.try_emplace(BaseFunc.second.getNameWithContext(),
- &(BaseFunc.second));
+ BaseFuncProf.emplace(BaseFunc.second.getContext(), &(BaseFunc.second));
}
ProfOverlap.UnionCount = BaseFuncProf.size();
const auto &TestProfiles = TestReader->getProfiles();
for (const auto &TestFunc : TestProfiles) {
SampleOverlapStats FuncOverlap;
- FuncOverlap.TestName = TestFunc.second.getNameWithContext();
+ FuncOverlap.TestName = TestFunc.second.getContext();
assert(TestStats.count(FuncOverlap.TestName) &&
"TestStats should have records for all functions in test profile "
"except inlinees");
FuncOverlap.TestSample = TestStats[FuncOverlap.TestName].SampleSum;
+ bool Matched = false;
const auto Match = BaseFuncProf.find(FuncOverlap.TestName);
if (Match == BaseFuncProf.end()) {
const FuncSampleStats &FuncStats = TestStats[FuncOverlap.TestName];
@@ -1650,7 +1655,7 @@ void SampleOverlapAggregator::computeSampleProfileOverlap(raw_fd_ostream &OS) {
// Two functions match with each other. Compute function-level overlap and
// aggregate them into profile-level overlap.
- FuncOverlap.BaseName = Match->second->getNameWithContext();
+ FuncOverlap.BaseName = Match->second->getContext();
assert(BaseStats.count(FuncOverlap.BaseName) &&
"BaseStats should have records for all functions in base profile "
"except inlinees");
@@ -1673,6 +1678,7 @@ void SampleOverlapAggregator::computeSampleProfileOverlap(raw_fd_ostream &OS) {
// Remove matched base functions for later reporting functions not found
// in test profile.
BaseFuncProf.erase(Match);
+ Matched = true;
}
// Print function-level similarity information if specified by options.
@@ -1680,11 +1686,10 @@ void SampleOverlapAggregator::computeSampleProfileOverlap(raw_fd_ostream &OS) {
"TestStats should have records for all functions in test profile "
"except inlinees");
if (TestStats[FuncOverlap.TestName].MaxSample >= FuncFilter.ValueCutoff ||
- (Match != BaseFuncProf.end() &&
- FuncOverlap.Similarity < LowSimilarityThreshold) ||
- (Match != BaseFuncProf.end() && !FuncFilter.NameFilter.empty() &&
- FuncOverlap.BaseName.find(FuncFilter.NameFilter) !=
- FuncOverlap.BaseName.npos)) {
+ (Matched && FuncOverlap.Similarity < LowSimilarityThreshold) ||
+ (Matched && !FuncFilter.NameFilter.empty() &&
+ FuncOverlap.BaseName.toString().find(FuncFilter.NameFilter) !=
+ std::string::npos)) {
assert(ProfOverlap.BaseSample > 0 &&
"Total samples in base profile should be greater than 0");
FuncOverlap.BaseWeight =
@@ -1699,11 +1704,10 @@ void SampleOverlapAggregator::computeSampleProfileOverlap(raw_fd_ostream &OS) {
// Traverse through functions in base profile but not in test profile.
for (const auto &F : BaseFuncProf) {
- assert(BaseStats.count(F.second->getNameWithContext()) &&
+ assert(BaseStats.count(F.second->getContext()) &&
"BaseStats should have records for all functions in base profile "
"except inlinees");
- const FuncSampleStats &FuncStats =
- BaseStats[F.second->getNameWithContext()];
+ const FuncSampleStats &FuncStats = BaseStats[F.second->getContext()];
++ProfOverlap.BaseUniqueCount;
ProfOverlap.BaseUniqueSample += FuncStats.SampleSum;
@@ -1734,7 +1738,7 @@ void SampleOverlapAggregator::initializeSampleProfileOverlap() {
FuncSampleStats FuncStats;
getFuncSampleStats(I.second, FuncStats, BaseHotThreshold);
ProfOverlap.BaseSample += FuncStats.SampleSum;
- BaseStats.try_emplace(I.second.getNameWithContext(), FuncStats);
+ BaseStats.emplace(I.second.getContext(), FuncStats);
}
const auto &TestProf = TestReader->getProfiles();
@@ -1743,7 +1747,7 @@ void SampleOverlapAggregator::initializeSampleProfileOverlap() {
FuncSampleStats FuncStats;
getFuncSampleStats(I.second, FuncStats, TestHotThreshold);
ProfOverlap.TestSample += FuncStats.SampleSum;
- TestStats.try_emplace(I.second.getNameWithContext(), FuncStats);
+ TestStats.emplace(I.second.getContext(), FuncStats);
}
ProfOverlap.BaseName = StringRef(BaseFilename);
@@ -1807,13 +1811,15 @@ void SampleOverlapAggregator::dumpFuncSimilarity(raw_fd_ostream &OS) const {
FOS.PadToColumn(TestSampleCol);
FOS << F.second.TestSample;
FOS.PadToColumn(FuncNameCol);
- FOS << F.second.TestName << "\n";
+ FOS << F.second.TestName.toString() << "\n";
}
}
void SampleOverlapAggregator::dumpProgramSummary(raw_fd_ostream &OS) const {
- OS << "Profile overlap infomation for base_profile: " << ProfOverlap.BaseName
- << " and test_profile: " << ProfOverlap.TestName << "\nProgram level:\n";
+ OS << "Profile overlap infomation for base_profile: "
+ << ProfOverlap.BaseName.toString()
+ << " and test_profile: " << ProfOverlap.TestName.toString()
+ << "\nProgram level:\n";
OS << " Whole program profile similarity: "
<< format("%.3f%%", ProfOverlap.Similarity * 100) << "\n";
@@ -1909,22 +1915,13 @@ std::error_code SampleOverlapAggregator::loadProfiles() {
// Load BaseHotThreshold and TestHotThreshold as 99-percentile threshold in
// profile summary.
- const uint64_t HotCutoff = 990000;
ProfileSummary &BasePS = BaseReader->getSummary();
- for (const auto &SummaryEntry : BasePS.getDetailedSummary()) {
- if (SummaryEntry.Cutoff == HotCutoff) {
- BaseHotThreshold = SummaryEntry.MinCount;
- break;
- }
- }
-
ProfileSummary &TestPS = TestReader->getSummary();
- for (const auto &SummaryEntry : TestPS.getDetailedSummary()) {
- if (SummaryEntry.Cutoff == HotCutoff) {
- TestHotThreshold = SummaryEntry.MinCount;
- break;
- }
- }
+ BaseHotThreshold =
+ ProfileSummaryBuilder::getHotCountThreshold(BasePS.getDetailedSummary());
+ TestHotThreshold =
+ ProfileSummaryBuilder::getHotCountThreshold(TestPS.getDetailedSummary());
+
return std::error_code();
}
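The hunk above replaces two hand-written scans of the detailed summary for the 990000 cutoff with ProfileSummaryBuilder::getHotCountThreshold, which centralizes that lookup (and, presumably, respects the configurable hot-cutoff option rather than a hard-coded constant). For reference, a sketch of what the removed loops computed:

// The MinCount of the detailed-summary entry at the hot cutoff
// (990000 out of 1000000, i.e. the 99th percentile).
#include "llvm/IR/ProfileSummary.h"
#include <cstdint>

static uint64_t hotThresholdFromSummary(const llvm::SummaryEntryVector &DS) {
  for (const llvm::ProfileSummaryEntry &E : DS)
    if (E.Cutoff == 990000)
      return E.MinCount;
  return 0; // no 99% entry present
}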
@@ -2111,9 +2108,8 @@ static int showInstrProfile(const std::string &Filename, bool ShowCounts,
if (FuncIsCS != ShowCS)
continue;
}
- bool Show =
- ShowAllFunctions || (!ShowFunction.empty() &&
- Func.Name.find(ShowFunction) != Func.Name.npos);
+ bool Show = ShowAllFunctions ||
+ (!ShowFunction.empty() && Func.Name.contains(ShowFunction));
bool doTextFormatDump = (Show && TextFormat);
@@ -2271,7 +2267,7 @@ static void showSectionInfo(sampleprof::SampleProfileReader *Reader,
namespace {
struct HotFuncInfo {
- StringRef FuncName;
+ std::string FuncName;
uint64_t TotalCount;
double TotalCountPercent;
uint64_t MaxCount;
@@ -2282,8 +2278,8 @@ struct HotFuncInfo {
EntryCount(0) {}
HotFuncInfo(StringRef FN, uint64_t TS, double TSP, uint64_t MS, uint64_t ES)
- : FuncName(FN), TotalCount(TS), TotalCountPercent(TSP), MaxCount(MS),
- EntryCount(ES) {}
+ : FuncName(FN.begin(), FN.end()), TotalCount(TS), TotalCountPercent(TSP),
+ MaxCount(MS), EntryCount(ES) {}
};
} // namespace
@@ -2298,7 +2294,7 @@ static void dumpHotFunctionList(const std::vector<std::string> &ColumnTitle,
uint64_t HotFuncCount, uint64_t TotalFuncCount,
uint64_t HotProfCount, uint64_t TotalProfCount,
const std::string &HotFuncMetric,
- raw_fd_ostream &OS) {
+ uint32_t TopNFunctions, raw_fd_ostream &OS) {
assert(ColumnOffset.size() == ColumnTitle.size() &&
"ColumnOffset and ColumnTitle should have the same size");
assert(ColumnTitle.size() >= 4 &&
@@ -2327,7 +2323,10 @@ static void dumpHotFunctionList(const std::vector<std::string> &ColumnTitle,
}
FOS << "\n";
- for (const HotFuncInfo &R : PrintValues) {
+ uint32_t Count = 0;
+ for (const auto &R : PrintValues) {
+ if (TopNFunctions && (Count++ == TopNFunctions))
+ break;
FOS.PadToColumn(ColumnOffset[0]);
FOS << R.TotalCount << " (" << format("%.2f%%", R.TotalCountPercent) << ")";
FOS.PadToColumn(ColumnOffset[1]);
@@ -2339,9 +2338,9 @@ static void dumpHotFunctionList(const std::vector<std::string> &ColumnTitle,
}
}
-static int
-showHotFunctionList(const StringMap<sampleprof::FunctionSamples> &Profiles,
- ProfileSummary &PS, raw_fd_ostream &OS) {
+static int showHotFunctionList(const sampleprof::SampleProfileMap &Profiles,
+ ProfileSummary &PS, uint32_t TopN,
+ raw_fd_ostream &OS) {
using namespace sampleprof;
const uint32_t HotFuncCutoff = 990000;
@@ -2391,18 +2390,19 @@ showHotFunctionList(const StringMap<sampleprof::FunctionSamples> &Profiles,
? (Func.getTotalSamples() * 100.0) / ProfileTotalSample
: 0;
PrintValues.emplace_back(HotFuncInfo(
- Func.getNameWithContext(), Func.getTotalSamples(), TotalSamplePercent,
- FuncPair.second.second, Func.getEntrySamples()));
+ Func.getContext().toString(), Func.getTotalSamples(),
+ TotalSamplePercent, FuncPair.second.second, Func.getEntrySamples()));
}
dumpHotFunctionList(ColumnTitle, ColumnOffset, PrintValues, HotFuncCount,
Profiles.size(), HotFuncSample, ProfileTotalSample,
- Metric, OS);
+ Metric, TopN, OS);
return 0;
}
static int showSampleProfile(const std::string &Filename, bool ShowCounts,
- bool ShowAllFunctions, bool ShowDetailedSummary,
+ uint32_t TopN, bool ShowAllFunctions,
+ bool ShowDetailedSummary,
const std::string &ShowFunction,
bool ShowProfileSymbolList,
bool ShowSectionInfoOnly, bool ShowHotFuncList,
@@ -2426,7 +2426,8 @@ static int showSampleProfile(const std::string &Filename, bool ShowCounts,
if (ShowAllFunctions || ShowFunction.empty())
Reader->dump(OS);
else
- Reader->dumpFunctionProfile(ShowFunction, OS);
+ // TODO: parse context string to support filtering by contexts.
+ Reader->dumpFunctionProfile(StringRef(ShowFunction), OS);
if (ShowProfileSymbolList) {
std::unique_ptr<sampleprof::ProfileSymbolList> ReaderList =
@@ -2440,8 +2441,8 @@ static int showSampleProfile(const std::string &Filename, bool ShowCounts,
PS.printDetailedSummary(OS);
}
- if (ShowHotFuncList)
- showHotFunctionList(Reader->getProfiles(), Reader->getSummary(), OS);
+ if (ShowHotFuncList || TopN)
+ showHotFunctionList(Reader->getProfiles(), Reader->getSummary(), TopN, OS);
return 0;
}
@@ -2532,10 +2533,10 @@ static int show_main(int argc, const char *argv[]) {
ShowAllFunctions, ShowCS, ValueCutoff, OnlyListBelow, ShowFunction,
TextFormat, ShowBinaryIds, OS);
else
- return showSampleProfile(Filename, ShowCounts, ShowAllFunctions,
- ShowDetailedSummary, ShowFunction,
- ShowProfileSymbolList, ShowSectionInfoOnly,
- ShowHotFuncList, OS);
+ return showSampleProfile(Filename, ShowCounts, TopNFunctions,
+ ShowAllFunctions, ShowDetailedSummary,
+ ShowFunction, ShowProfileSymbolList,
+ ShowSectionInfoOnly, ShowHotFuncList, OS);
}
int main(int argc, const char *argv[]) {
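The recurring change in this file is the switch from StringMap<FunctionSamples>, keyed by a flat name, to SampleProfileMap and other maps keyed by SampleContext with its nested Hash functor, plus toString() wherever a printable name is needed. A reduced sketch of that shape (Context and Stats are illustrative stand-ins, not the sampleprof types):

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>

struct Context {
  std::string Name; // the real SampleContext also carries frames and state
  bool operator==(const Context &O) const { return Name == O.Name; }
  std::string toString() const { return Name; }
  struct Hash {
    std::size_t operator()(const Context &C) const {
      return std::hash<std::string>()(C.Name);
    }
  };
};

struct Stats { uint64_t SampleSum = 0; };

using StatsMap = std::unordered_map<Context, Stats, Context::Hash>;

// Usage mirrors the hunks: M.emplace(Key, S) instead of StringMap::try_emplace,
// It->first instead of It->first(), and Key.toString() when printing.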
diff --git a/llvm/tools/llvm-readobj/ARMEHABIPrinter.h b/llvm/tools/llvm-readobj/ARMEHABIPrinter.h
index 3d8acbf48fa9..d97cea4b6d6a 100644
--- a/llvm/tools/llvm-readobj/ARMEHABIPrinter.h
+++ b/llvm/tools/llvm-readobj/ARMEHABIPrinter.h
@@ -426,7 +426,7 @@ PrinterContext<ET>::FindExceptionTable(unsigned IndexSectionIndex,
auto Ret = ELF.getSection(*Symbol, SymTab, ShndxTable);
if (!Ret)
- report_fatal_error(errorToErrorCode(Ret.takeError()).message());
+ report_fatal_error(Twine(errorToErrorCode(Ret.takeError()).message()));
return *Ret;
}
}
diff --git a/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp b/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp
index 99ee639fc45d..78be632f2153 100644
--- a/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp
+++ b/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp
@@ -238,22 +238,27 @@ ErrorOr<SymbolRef> Decoder::getRelocatedSymbol(const COFFObjectFile &,
return inconvertibleErrorCode();
}
-SymbolRef Decoder::getPreferredSymbol(const COFFObjectFile &COFF,
- SymbolRef Sym) {
+SymbolRef Decoder::getPreferredSymbol(const COFFObjectFile &COFF, SymbolRef Sym,
+ uint64_t &SymbolOffset) {
// The symbol resolved by getRelocatedSymbol can be any internal
// nondescriptive symbol; try to resolve a more descriptive one.
COFFSymbolRef CoffSym = COFF.getCOFFSymbol(Sym);
- if (CoffSym.getStorageClass() != COFF::IMAGE_SYM_CLASS_LABEL)
+ if (CoffSym.getStorageClass() != COFF::IMAGE_SYM_CLASS_LABEL &&
+ CoffSym.getSectionDefinition() == nullptr)
return Sym;
for (const auto &S : COFF.symbols()) {
COFFSymbolRef CS = COFF.getCOFFSymbol(S);
if (CS.getSectionNumber() == CoffSym.getSectionNumber() &&
- CS.getValue() == CoffSym.getValue()) {
- if (CS.isExternal())
- return S;
- if (CS.getStorageClass() != COFF::IMAGE_SYM_CLASS_LABEL) {
+ CS.getValue() <= CoffSym.getValue() + SymbolOffset &&
+ CS.getStorageClass() != COFF::IMAGE_SYM_CLASS_LABEL &&
+ CS.getSectionDefinition() == nullptr) {
+ uint32_t Offset = CoffSym.getValue() + SymbolOffset - CS.getValue();
+ if (Offset <= SymbolOffset) {
+ SymbolOffset = Offset;
Sym = S;
CoffSym = CS;
+ if (CS.isExternal() && SymbolOffset == 0)
+ return Sym;
}
}
}
@@ -277,12 +282,14 @@ ErrorOr<SymbolRef> Decoder::getSymbolForLocation(
std::string Buf;
llvm::raw_string_ostream OS(Buf);
logAllUnhandledErrors(AddressOrErr.takeError(), OS);
- OS.flush();
- report_fatal_error(Buf);
+ report_fatal_error(Twine(OS.str()));
}
// We apply SymbolOffset here directly. We return it separately to allow
// the caller to print it as an offset on the symbol name.
SymbolAddress = *AddressOrErr + SymbolOffset;
+
+ if (FunctionOnly) // Resolve label/section symbols into function names.
+ SymOrErr = getPreferredSymbol(COFF, *SymOrErr, SymbolOffset);
} else {
// No matching relocation found; operating on a linked image. Try to
// find a descriptive symbol if possible. The immediate offset contains
@@ -292,8 +299,6 @@ ErrorOr<SymbolRef> Decoder::getSymbolForLocation(
SymbolOffset = 0;
SymOrErr = getSymbol(COFF, SymbolAddress, FunctionOnly);
}
- if (SymOrErr && FunctionOnly) // Resolve label symbols into function names
- SymOrErr = getPreferredSymbol(COFF, *SymOrErr);
return SymOrErr;
}
@@ -1000,8 +1005,7 @@ bool Decoder::dumpXDataRecord(const COFFObjectFile &COFF,
std::string Buf;
llvm::raw_string_ostream OS(Buf);
logAllUnhandledErrors(Name.takeError(), OS);
- OS.flush();
- report_fatal_error(Buf);
+ report_fatal_error(Twine(OS.str()));
}
ListScope EHS(SW, "ExceptionHandler");
@@ -1040,8 +1044,7 @@ bool Decoder::dumpUnpackedEntry(const COFFObjectFile &COFF,
std::string Buf;
llvm::raw_string_ostream OS(Buf);
logAllUnhandledErrors(FunctionNameOrErr.takeError(), OS);
- OS.flush();
- report_fatal_error(Buf);
+ report_fatal_error(Twine(OS.str()));
}
FunctionName = *FunctionNameOrErr;
}
@@ -1055,8 +1058,7 @@ bool Decoder::dumpUnpackedEntry(const COFFObjectFile &COFF,
std::string Buf;
llvm::raw_string_ostream OS(Buf);
logAllUnhandledErrors(Name.takeError(), OS);
- OS.flush();
- report_fatal_error(Buf);
+ report_fatal_error(Twine(OS.str()));
}
SW.printString("ExceptionRecord",
@@ -1101,8 +1103,7 @@ bool Decoder::dumpPackedEntry(const object::COFFObjectFile &COFF,
std::string Buf;
llvm::raw_string_ostream OS(Buf);
logAllUnhandledErrors(FunctionNameOrErr.takeError(), OS);
- OS.flush();
- report_fatal_error(Buf);
+ report_fatal_error(Twine(OS.str()));
}
FunctionName = *FunctionNameOrErr;
}
@@ -1143,8 +1144,7 @@ bool Decoder::dumpPackedARM64Entry(const object::COFFObjectFile &COFF,
std::string Buf;
llvm::raw_string_ostream OS(Buf);
logAllUnhandledErrors(FunctionNameOrErr.takeError(), OS);
- OS.flush();
- report_fatal_error(Buf);
+ report_fatal_error(Twine(OS.str()));
}
FunctionName = *FunctionNameOrErr;
}
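Several hunks in this file drop the explicit OS.flush() and hand the buffered error text to report_fatal_error wrapped in a Twine; raw_string_ostream::str() already flushes before returning the string. A self-contained sketch of the pattern (fatal is a hypothetical helper name):

#include "llvm/ADT/Twine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <string>

[[noreturn]] static void fatal(llvm::Error E) {
  std::string Buf;
  llvm::raw_string_ostream OS(Buf);
  llvm::logAllUnhandledErrors(std::move(E), OS);   // render the Error into Buf
  llvm::report_fatal_error(llvm::Twine(OS.str())); // str() flushes before use
}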
diff --git a/llvm/tools/llvm-readobj/ARMWinEHPrinter.h b/llvm/tools/llvm-readobj/ARMWinEHPrinter.h
index efe16850c7fa..920d4e5f7332 100644
--- a/llvm/tools/llvm-readobj/ARMWinEHPrinter.h
+++ b/llvm/tools/llvm-readobj/ARMWinEHPrinter.h
@@ -154,7 +154,8 @@ class Decoder {
bool FunctionOnly = false);
object::SymbolRef getPreferredSymbol(const object::COFFObjectFile &COFF,
- object::SymbolRef Sym);
+ object::SymbolRef Sym,
+ uint64_t &SymbolOffset);
bool dumpXDataRecord(const object::COFFObjectFile &COFF,
const object::SectionRef &Section,
diff --git a/llvm/tools/llvm-readobj/COFFDumper.cpp b/llvm/tools/llvm-readobj/COFFDumper.cpp
index 96124cc03484..b235398e7a45 100644
--- a/llvm/tools/llvm-readobj/COFFDumper.cpp
+++ b/llvm/tools/llvm-readobj/COFFDumper.cpp
@@ -337,7 +337,7 @@ void COFFDumper::printBinaryBlockWithRelocs(StringRef Label,
}
}
-static const EnumEntry<COFF::MachineTypes> ImageFileMachineType[] = {
+const EnumEntry<COFF::MachineTypes> ImageFileMachineType[] = {
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_FILE_MACHINE_UNKNOWN ),
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_FILE_MACHINE_AM33 ),
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_FILE_MACHINE_AMD64 ),
@@ -362,7 +362,7 @@ static const EnumEntry<COFF::MachineTypes> ImageFileMachineType[] = {
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_FILE_MACHINE_WCEMIPSV2)
};
-static const EnumEntry<COFF::Characteristics> ImageFileCharacteristics[] = {
+const EnumEntry<COFF::Characteristics> ImageFileCharacteristics[] = {
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_FILE_RELOCS_STRIPPED ),
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_FILE_EXECUTABLE_IMAGE ),
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_FILE_LINE_NUMS_STRIPPED ),
@@ -380,7 +380,7 @@ static const EnumEntry<COFF::Characteristics> ImageFileCharacteristics[] = {
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_FILE_BYTES_REVERSED_HI )
};
-static const EnumEntry<COFF::WindowsSubsystem> PEWindowsSubsystem[] = {
+const EnumEntry<COFF::WindowsSubsystem> PEWindowsSubsystem[] = {
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_SUBSYSTEM_UNKNOWN ),
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_SUBSYSTEM_NATIVE ),
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_SUBSYSTEM_WINDOWS_GUI ),
@@ -394,7 +394,7 @@ static const EnumEntry<COFF::WindowsSubsystem> PEWindowsSubsystem[] = {
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_SUBSYSTEM_XBOX ),
};
-static const EnumEntry<COFF::DLLCharacteristics> PEDLLCharacteristics[] = {
+const EnumEntry<COFF::DLLCharacteristics> PEDLLCharacteristics[] = {
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_DLL_CHARACTERISTICS_HIGH_ENTROPY_VA ),
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE ),
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_DLL_CHARACTERISTICS_FORCE_INTEGRITY ),
@@ -453,7 +453,7 @@ ImageSectionCharacteristics[] = {
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_SCN_MEM_WRITE )
};
-static const EnumEntry<COFF::SymbolBaseType> ImageSymType[] = {
+const EnumEntry<COFF::SymbolBaseType> ImageSymType[] = {
{ "Null" , COFF::IMAGE_SYM_TYPE_NULL },
{ "Void" , COFF::IMAGE_SYM_TYPE_VOID },
{ "Char" , COFF::IMAGE_SYM_TYPE_CHAR },
@@ -472,14 +472,14 @@ static const EnumEntry<COFF::SymbolBaseType> ImageSymType[] = {
{ "DWord" , COFF::IMAGE_SYM_TYPE_DWORD }
};
-static const EnumEntry<COFF::SymbolComplexType> ImageSymDType[] = {
+const EnumEntry<COFF::SymbolComplexType> ImageSymDType[] = {
{ "Null" , COFF::IMAGE_SYM_DTYPE_NULL },
{ "Pointer" , COFF::IMAGE_SYM_DTYPE_POINTER },
{ "Function", COFF::IMAGE_SYM_DTYPE_FUNCTION },
{ "Array" , COFF::IMAGE_SYM_DTYPE_ARRAY }
};
-static const EnumEntry<COFF::SymbolStorageClass> ImageSymClass[] = {
+const EnumEntry<COFF::SymbolStorageClass> ImageSymClass[] = {
{ "EndOfFunction" , COFF::IMAGE_SYM_CLASS_END_OF_FUNCTION },
{ "Null" , COFF::IMAGE_SYM_CLASS_NULL },
{ "Automatic" , COFF::IMAGE_SYM_CLASS_AUTOMATIC },
@@ -509,7 +509,7 @@ static const EnumEntry<COFF::SymbolStorageClass> ImageSymClass[] = {
{ "CLRToken" , COFF::IMAGE_SYM_CLASS_CLR_TOKEN }
};
-static const EnumEntry<COFF::COMDATType> ImageCOMDATSelect[] = {
+const EnumEntry<COFF::COMDATType> ImageCOMDATSelect[] = {
{ "NoDuplicates", COFF::IMAGE_COMDAT_SELECT_NODUPLICATES },
{ "Any" , COFF::IMAGE_COMDAT_SELECT_ANY },
{ "SameSize" , COFF::IMAGE_COMDAT_SELECT_SAME_SIZE },
@@ -519,7 +519,7 @@ static const EnumEntry<COFF::COMDATType> ImageCOMDATSelect[] = {
{ "Newest" , COFF::IMAGE_COMDAT_SELECT_NEWEST }
};
-static const EnumEntry<COFF::DebugType> ImageDebugType[] = {
+const EnumEntry<COFF::DebugType> ImageDebugType[] = {
{"Unknown", COFF::IMAGE_DEBUG_TYPE_UNKNOWN},
{"COFF", COFF::IMAGE_DEBUG_TYPE_COFF},
{"CodeView", COFF::IMAGE_DEBUG_TYPE_CODEVIEW},
@@ -548,7 +548,7 @@ WeakExternalCharacteristics[] = {
{ "Alias" , COFF::IMAGE_WEAK_EXTERN_SEARCH_ALIAS }
};
-static const EnumEntry<uint32_t> SubSectionTypes[] = {
+const EnumEntry<uint32_t> SubSectionTypes[] = {
LLVM_READOBJ_ENUM_CLASS_ENT(DebugSubsectionKind, Symbols),
LLVM_READOBJ_ENUM_CLASS_ENT(DebugSubsectionKind, Lines),
LLVM_READOBJ_ENUM_CLASS_ENT(DebugSubsectionKind, StringTable),
@@ -564,13 +564,13 @@ static const EnumEntry<uint32_t> SubSectionTypes[] = {
LLVM_READOBJ_ENUM_CLASS_ENT(DebugSubsectionKind, CoffSymbolRVA),
};
-static const EnumEntry<uint32_t> FrameDataFlags[] = {
+const EnumEntry<uint32_t> FrameDataFlags[] = {
LLVM_READOBJ_ENUM_ENT(FrameData, HasSEH),
LLVM_READOBJ_ENUM_ENT(FrameData, HasEH),
LLVM_READOBJ_ENUM_ENT(FrameData, IsFunctionStart),
};
-static const EnumEntry<uint8_t> FileChecksumKindNames[] = {
+const EnumEntry<uint8_t> FileChecksumKindNames[] = {
LLVM_READOBJ_ENUM_CLASS_ENT(FileChecksumKind, None),
LLVM_READOBJ_ENUM_CLASS_ENT(FileChecksumKind, MD5),
LLVM_READOBJ_ENUM_CLASS_ENT(FileChecksumKind, SHA1),
@@ -709,7 +709,10 @@ void COFFDumper::printPEHeader(const PEHeader *Hdr) {
};
for (uint32_t i = 0; i < Hdr->NumberOfRvaAndSize; ++i)
- printDataDirectory(i, directory[i]);
+ if (i < sizeof(directory) / sizeof(char *))
+ printDataDirectory(i, directory[i]);
+ else
+ printDataDirectory(i, "Unknown");
}
}
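The printPEHeader hunk above guards the lookup into the fixed table of data-directory names: indices implied by NumberOfRvaAndSize beyond the known set now print as "Unknown" instead of reading past the array. A minimal sketch of that guard (Names and printDirectory are illustrative):

#include <cstdint>
#include <cstdio>

static const char *const Names[] = {"ExportTable", "ImportTable" /* ... */};

static void printDirectory(uint32_t I) {
  const char *Label =
      I < sizeof(Names) / sizeof(Names[0]) ? Names[I] : "Unknown";
  std::printf("DataDirectory %u: %s\n", I, Label);
}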
diff --git a/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h b/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h
index 2dfe21684a62..5dc947e024b9 100644
--- a/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h
+++ b/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h
@@ -185,7 +185,8 @@ void PrinterContext<ELFT>::printEHFrame(const Elf_Shdr *EHFrameShdr) const {
reportError(DataOrErr.takeError(), ObjF.getFileName());
// Construct DWARFDataExtractor to handle relocations ("PC Begin" fields).
- std::unique_ptr<DWARFContext> DICtx = DWARFContext::create(ObjF, nullptr);
+ std::unique_ptr<DWARFContext> DICtx = DWARFContext::create(
+ ObjF, DWARFContext::ProcessDebugRelocations::Process, nullptr);
DWARFDataExtractor DE(DICtx->getDWARFObj(),
DICtx->getDWARFObj().getEHFrameSection(),
ELFT::TargetEndianness == support::endianness::little,
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index f221acba979a..4abea0b1d23d 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -49,6 +49,8 @@
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/LEB128.h"
+#include "llvm/Support/MSP430AttributeParser.h"
+#include "llvm/Support/MSP430Attributes.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MipsABIFlags.h"
#include "llvm/Support/RISCVAttributeParser.h"
@@ -339,7 +341,8 @@ protected:
return DynRegionInfo(ObjF, *this, Obj.base() + Offset, Size, EntSize);
}
- void printAttributes();
+ void printAttributes(unsigned, std::unique_ptr<ELFAttributeParser>,
+ support::endianness);
void printMipsReginfo();
void printMipsOptions();
@@ -963,19 +966,19 @@ findNotEmptySectionByAddress(const ELFO &Obj, StringRef FileName,
return nullptr;
}
-static const EnumEntry<unsigned> ElfClass[] = {
+const EnumEntry<unsigned> ElfClass[] = {
{"None", "none", ELF::ELFCLASSNONE},
{"32-bit", "ELF32", ELF::ELFCLASS32},
{"64-bit", "ELF64", ELF::ELFCLASS64},
};
-static const EnumEntry<unsigned> ElfDataEncoding[] = {
+const EnumEntry<unsigned> ElfDataEncoding[] = {
{"None", "none", ELF::ELFDATANONE},
{"LittleEndian", "2's complement, little endian", ELF::ELFDATA2LSB},
{"BigEndian", "2's complement, big endian", ELF::ELFDATA2MSB},
};
-static const EnumEntry<unsigned> ElfObjectFileType[] = {
+const EnumEntry<unsigned> ElfObjectFileType[] = {
{"None", "NONE (none)", ELF::ET_NONE},
{"Relocatable", "REL (Relocatable file)", ELF::ET_REL},
{"Executable", "EXEC (Executable file)", ELF::ET_EXEC},
@@ -983,7 +986,7 @@ static const EnumEntry<unsigned> ElfObjectFileType[] = {
{"Core", "CORE (Core file)", ELF::ET_CORE},
};
-static const EnumEntry<unsigned> ElfOSABI[] = {
+const EnumEntry<unsigned> ElfOSABI[] = {
{"SystemV", "UNIX - System V", ELF::ELFOSABI_NONE},
{"HPUX", "UNIX - HP-UX", ELF::ELFOSABI_HPUX},
{"NetBSD", "UNIX - NetBSD", ELF::ELFOSABI_NETBSD},
@@ -1004,22 +1007,22 @@ static const EnumEntry<unsigned> ElfOSABI[] = {
{"Standalone", "Standalone App", ELF::ELFOSABI_STANDALONE}
};
-static const EnumEntry<unsigned> AMDGPUElfOSABI[] = {
+const EnumEntry<unsigned> AMDGPUElfOSABI[] = {
{"AMDGPU_HSA", "AMDGPU - HSA", ELF::ELFOSABI_AMDGPU_HSA},
{"AMDGPU_PAL", "AMDGPU - PAL", ELF::ELFOSABI_AMDGPU_PAL},
{"AMDGPU_MESA3D", "AMDGPU - MESA3D", ELF::ELFOSABI_AMDGPU_MESA3D}
};
-static const EnumEntry<unsigned> ARMElfOSABI[] = {
+const EnumEntry<unsigned> ARMElfOSABI[] = {
{"ARM", "ARM", ELF::ELFOSABI_ARM}
};
-static const EnumEntry<unsigned> C6000ElfOSABI[] = {
+const EnumEntry<unsigned> C6000ElfOSABI[] = {
{"C6000_ELFABI", "Bare-metal C6000", ELF::ELFOSABI_C6000_ELFABI},
{"C6000_LINUX", "Linux C6000", ELF::ELFOSABI_C6000_LINUX}
};
-static const EnumEntry<unsigned> ElfMachineType[] = {
+const EnumEntry<unsigned> ElfMachineType[] = {
ENUM_ENT(EM_NONE, "None"),
ENUM_ENT(EM_M32, "WE32100"),
ENUM_ENT(EM_SPARC, "Sparc"),
@@ -1185,19 +1188,19 @@ static const EnumEntry<unsigned> ElfMachineType[] = {
ENUM_ENT(EM_VE, "NEC SX-Aurora Vector Engine"),
};
-static const EnumEntry<unsigned> ElfSymbolBindings[] = {
+const EnumEntry<unsigned> ElfSymbolBindings[] = {
{"Local", "LOCAL", ELF::STB_LOCAL},
{"Global", "GLOBAL", ELF::STB_GLOBAL},
{"Weak", "WEAK", ELF::STB_WEAK},
{"Unique", "UNIQUE", ELF::STB_GNU_UNIQUE}};
-static const EnumEntry<unsigned> ElfSymbolVisibilities[] = {
+const EnumEntry<unsigned> ElfSymbolVisibilities[] = {
{"DEFAULT", "DEFAULT", ELF::STV_DEFAULT},
{"INTERNAL", "INTERNAL", ELF::STV_INTERNAL},
{"HIDDEN", "HIDDEN", ELF::STV_HIDDEN},
{"PROTECTED", "PROTECTED", ELF::STV_PROTECTED}};
-static const EnumEntry<unsigned> AMDGPUSymbolTypes[] = {
+const EnumEntry<unsigned> AMDGPUSymbolTypes[] = {
{ "AMDGPU_HSA_KERNEL", ELF::STT_AMDGPU_HSA_KERNEL }
};
@@ -1208,7 +1211,7 @@ static const char *getGroupType(uint32_t Flag) {
return "(unknown)";
}
-static const EnumEntry<unsigned> ElfSectionFlags[] = {
+const EnumEntry<unsigned> ElfSectionFlags[] = {
ENUM_ENT(SHF_WRITE, "W"),
ENUM_ENT(SHF_ALLOC, "A"),
ENUM_ENT(SHF_EXECINSTR, "X"),
@@ -1224,20 +1227,20 @@ static const EnumEntry<unsigned> ElfSectionFlags[] = {
ENUM_ENT(SHF_EXCLUDE, "E"),
};
-static const EnumEntry<unsigned> ElfXCoreSectionFlags[] = {
+const EnumEntry<unsigned> ElfXCoreSectionFlags[] = {
ENUM_ENT(XCORE_SHF_CP_SECTION, ""),
ENUM_ENT(XCORE_SHF_DP_SECTION, "")
};
-static const EnumEntry<unsigned> ElfARMSectionFlags[] = {
+const EnumEntry<unsigned> ElfARMSectionFlags[] = {
ENUM_ENT(SHF_ARM_PURECODE, "y")
};
-static const EnumEntry<unsigned> ElfHexagonSectionFlags[] = {
+const EnumEntry<unsigned> ElfHexagonSectionFlags[] = {
ENUM_ENT(SHF_HEX_GPREL, "")
};
-static const EnumEntry<unsigned> ElfMipsSectionFlags[] = {
+const EnumEntry<unsigned> ElfMipsSectionFlags[] = {
ENUM_ENT(SHF_MIPS_NODUPES, ""),
ENUM_ENT(SHF_MIPS_NAMES, ""),
ENUM_ENT(SHF_MIPS_LOCAL, ""),
@@ -1248,7 +1251,7 @@ static const EnumEntry<unsigned> ElfMipsSectionFlags[] = {
ENUM_ENT(SHF_MIPS_STRING, "")
};
-static const EnumEntry<unsigned> ElfX86_64SectionFlags[] = {
+const EnumEntry<unsigned> ElfX86_64SectionFlags[] = {
ENUM_ENT(SHF_X86_64_LARGE, "l")
};
@@ -1395,13 +1398,13 @@ static std::string getGNUPtType(unsigned Arch, unsigned Type) {
return Seg.drop_front(3).str();
}
-static const EnumEntry<unsigned> ElfSegmentFlags[] = {
+const EnumEntry<unsigned> ElfSegmentFlags[] = {
LLVM_READOBJ_ENUM_ENT(ELF, PF_X),
LLVM_READOBJ_ENUM_ENT(ELF, PF_W),
LLVM_READOBJ_ENUM_ENT(ELF, PF_R)
};
-static const EnumEntry<unsigned> ElfHeaderMipsFlags[] = {
+const EnumEntry<unsigned> ElfHeaderMipsFlags[] = {
ENUM_ENT(EF_MIPS_NOREORDER, "noreorder"),
ENUM_ENT(EF_MIPS_PIC, "pic"),
ENUM_ENT(EF_MIPS_CPIC, "cpic"),
@@ -1447,7 +1450,7 @@ static const EnumEntry<unsigned> ElfHeaderMipsFlags[] = {
ENUM_ENT(EF_MIPS_ARCH_64R6, "mips64r6")
};
-static const EnumEntry<unsigned> ElfHeaderAMDGPUFlagsABIVersion3[] = {
+const EnumEntry<unsigned> ElfHeaderAMDGPUFlagsABIVersion3[] = {
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_NONE),
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_R600),
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_R630),
@@ -1501,7 +1504,7 @@ static const EnumEntry<unsigned> ElfHeaderAMDGPUFlagsABIVersion3[] = {
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_FEATURE_SRAMECC_V3)
};
-static const EnumEntry<unsigned> ElfHeaderAMDGPUFlagsABIVersion4[] = {
+const EnumEntry<unsigned> ElfHeaderAMDGPUFlagsABIVersion4[] = {
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_NONE),
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_R600),
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_R600_R630),
@@ -1559,7 +1562,7 @@ static const EnumEntry<unsigned> ElfHeaderAMDGPUFlagsABIVersion4[] = {
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_FEATURE_SRAMECC_ON_V4)
};
-static const EnumEntry<unsigned> ElfHeaderRISCVFlags[] = {
+const EnumEntry<unsigned> ElfHeaderRISCVFlags[] = {
ENUM_ENT(EF_RISCV_RVC, "RVC"),
ENUM_ENT(EF_RISCV_FLOAT_ABI_SINGLE, "single-float ABI"),
ENUM_ENT(EF_RISCV_FLOAT_ABI_DOUBLE, "double-float ABI"),
@@ -1567,7 +1570,7 @@ static const EnumEntry<unsigned> ElfHeaderRISCVFlags[] = {
ENUM_ENT(EF_RISCV_RVE, "RVE")
};
-static const EnumEntry<unsigned> ElfHeaderAVRFlags[] = {
+const EnumEntry<unsigned> ElfHeaderAVRFlags[] = {
LLVM_READOBJ_ENUM_ENT(ELF, EF_AVR_ARCH_AVR1),
LLVM_READOBJ_ENUM_ENT(ELF, EF_AVR_ARCH_AVR2),
LLVM_READOBJ_ENUM_ENT(ELF, EF_AVR_ARCH_AVR25),
@@ -1590,29 +1593,32 @@ static const EnumEntry<unsigned> ElfHeaderAVRFlags[] = {
};
-static const EnumEntry<unsigned> ElfSymOtherFlags[] = {
+const EnumEntry<unsigned> ElfSymOtherFlags[] = {
LLVM_READOBJ_ENUM_ENT(ELF, STV_INTERNAL),
LLVM_READOBJ_ENUM_ENT(ELF, STV_HIDDEN),
LLVM_READOBJ_ENUM_ENT(ELF, STV_PROTECTED)
};
-static const EnumEntry<unsigned> ElfMipsSymOtherFlags[] = {
+const EnumEntry<unsigned> ElfMipsSymOtherFlags[] = {
LLVM_READOBJ_ENUM_ENT(ELF, STO_MIPS_OPTIONAL),
LLVM_READOBJ_ENUM_ENT(ELF, STO_MIPS_PLT),
LLVM_READOBJ_ENUM_ENT(ELF, STO_MIPS_PIC),
LLVM_READOBJ_ENUM_ENT(ELF, STO_MIPS_MICROMIPS)
};
-static const EnumEntry<unsigned> ElfAArch64SymOtherFlags[] = {
+const EnumEntry<unsigned> ElfAArch64SymOtherFlags[] = {
LLVM_READOBJ_ENUM_ENT(ELF, STO_AARCH64_VARIANT_PCS)
};
-static const EnumEntry<unsigned> ElfMips16SymOtherFlags[] = {
+const EnumEntry<unsigned> ElfMips16SymOtherFlags[] = {
LLVM_READOBJ_ENUM_ENT(ELF, STO_MIPS_OPTIONAL),
LLVM_READOBJ_ENUM_ENT(ELF, STO_MIPS_PLT),
LLVM_READOBJ_ENUM_ENT(ELF, STO_MIPS_MIPS16)
};
+const EnumEntry<unsigned> ElfRISCVSymOtherFlags[] = {
+ LLVM_READOBJ_ENUM_ENT(ELF, STO_RISCV_VARIANT_CC)};
+
static const char *getElfMipsOptionsOdkType(unsigned Odk) {
switch (Odk) {
LLVM_READOBJ_ENUM_CASE(ELF, ODK_NULL);
@@ -2065,7 +2071,7 @@ template <typename ELFT> void ELFDumper<ELFT>::printVersionInfo() {
#define LLVM_READOBJ_DT_FLAG_ENT(prefix, enum) \
{ #enum, prefix##_##enum }
-static const EnumEntry<unsigned> ElfDynamicDTFlags[] = {
+const EnumEntry<unsigned> ElfDynamicDTFlags[] = {
LLVM_READOBJ_DT_FLAG_ENT(DF, ORIGIN),
LLVM_READOBJ_DT_FLAG_ENT(DF, SYMBOLIC),
LLVM_READOBJ_DT_FLAG_ENT(DF, TEXTREL),
@@ -2073,7 +2079,7 @@ static const EnumEntry<unsigned> ElfDynamicDTFlags[] = {
LLVM_READOBJ_DT_FLAG_ENT(DF, STATIC_TLS)
};
-static const EnumEntry<unsigned> ElfDynamicDTFlags1[] = {
+const EnumEntry<unsigned> ElfDynamicDTFlags1[] = {
LLVM_READOBJ_DT_FLAG_ENT(DF_1, NOW),
LLVM_READOBJ_DT_FLAG_ENT(DF_1, GLOBAL),
LLVM_READOBJ_DT_FLAG_ENT(DF_1, GROUP),
@@ -2103,7 +2109,7 @@ static const EnumEntry<unsigned> ElfDynamicDTFlags1[] = {
LLVM_READOBJ_DT_FLAG_ENT(DF_1, PIE),
};
-static const EnumEntry<unsigned> ElfDynamicDTMipsFlags[] = {
+const EnumEntry<unsigned> ElfDynamicDTMipsFlags[] = {
LLVM_READOBJ_DT_FLAG_ENT(RHF, NONE),
LLVM_READOBJ_DT_FLAG_ENT(RHF, QUICKSTART),
LLVM_READOBJ_DT_FLAG_ENT(RHF, NOTPOT),
@@ -2292,6 +2298,8 @@ std::string ELFDumper<ELFT>::getDynamicEntry(uint64_t Type,
case DT_INIT_ARRAYSZ:
case DT_FINI_ARRAYSZ:
case DT_PREINIT_ARRAYSZ:
+ case DT_RELRSZ:
+ case DT_RELRENT:
case DT_ANDROID_RELSZ:
case DT_ANDROID_RELASZ:
return std::to_string(Value) + " (bytes)";
@@ -2557,8 +2565,27 @@ template <typename ELFT> void ELFDumper<ELFT>::printLoadName() {
template <class ELFT> void ELFDumper<ELFT>::printArchSpecificInfo() {
switch (Obj.getHeader().e_machine) {
case EM_ARM:
+ if (Obj.isLE())
+ printAttributes(ELF::SHT_ARM_ATTRIBUTES,
+ std::make_unique<ARMAttributeParser>(&W),
+ support::little);
+ else
+ reportUniqueWarning("attribute printing not implemented for big-endian "
+ "ARM objects");
+ break;
case EM_RISCV:
- printAttributes();
+ if (Obj.isLE())
+ printAttributes(ELF::SHT_RISCV_ATTRIBUTES,
+ std::make_unique<RISCVAttributeParser>(&W),
+ support::little);
+ else
+ reportUniqueWarning("attribute printing not implemented for big-endian "
+ "RISC-V objects");
+ break;
+ case EM_MSP430:
+ printAttributes(ELF::SHT_MSP430_ATTRIBUTES,
+ std::make_unique<MSP430AttributeParser>(&W),
+ support::little);
break;
case EM_MIPS: {
printMipsABIFlags();
@@ -2581,20 +2608,15 @@ template <class ELFT> void ELFDumper<ELFT>::printArchSpecificInfo() {
}
}
-template <class ELFT> void ELFDumper<ELFT>::printAttributes() {
- if (!Obj.isLE()) {
- W.startLine() << "Attributes not implemented.\n";
- return;
- }
-
- const unsigned Machine = Obj.getHeader().e_machine;
- assert((Machine == EM_ARM || Machine == EM_RISCV) &&
- "Attributes not implemented.");
-
+template <class ELFT>
+void ELFDumper<ELFT>::printAttributes(
+ unsigned AttrShType, std::unique_ptr<ELFAttributeParser> AttrParser,
+ support::endianness Endianness) {
+ assert((AttrShType != ELF::SHT_NULL) && AttrParser &&
+ "Incomplete ELF attribute implementation");
DictScope BA(W, "BuildAttributes");
for (const Elf_Shdr &Sec : cantFail(Obj.sections())) {
- if (Sec.sh_type != ELF::SHT_ARM_ATTRIBUTES &&
- Sec.sh_type != ELF::SHT_RISCV_ATTRIBUTES)
+ if (Sec.sh_type != AttrShType)
continue;
ArrayRef<uint8_t> Contents;
@@ -2613,13 +2635,7 @@ template <class ELFT> void ELFDumper<ELFT>::printAttributes() {
W.printHex("FormatVersion", Contents[0]);
- auto ParseAttrubutes = [&]() {
- if (Machine == EM_ARM)
- return ARMAttributeParser(&W).parse(Contents, support::little);
- return RISCVAttributeParser(&W).parse(Contents, support::little);
- };
-
- if (Error E = ParseAttrubutes())
+ if (Error E = AttrParser->parse(Contents, Endianness))
reportUniqueWarning("unable to dump attributes from the " +
describe(Sec) + ": " + toString(std::move(E)));
}
@@ -2934,7 +2950,7 @@ MipsGOTParser<ELFT>::getPltSym(const Entry *E) const {
}
}
-static const EnumEntry<unsigned> ElfMipsISAExtType[] = {
+const EnumEntry<unsigned> ElfMipsISAExtType[] = {
{"None", Mips::AFL_EXT_NONE},
{"Broadcom SB-1", Mips::AFL_EXT_SB1},
{"Cavium Networks Octeon", Mips::AFL_EXT_OCTEON},
@@ -2957,7 +2973,7 @@ static const EnumEntry<unsigned> ElfMipsISAExtType[] = {
{"Toshiba R3900", Mips::AFL_EXT_3900}
};
-static const EnumEntry<unsigned> ElfMipsASEFlags[] = {
+const EnumEntry<unsigned> ElfMipsASEFlags[] = {
{"DSP", Mips::AFL_ASE_DSP},
{"DSPR2", Mips::AFL_ASE_DSPR2},
{"Enhanced VA Scheme", Mips::AFL_ASE_EVA},
@@ -2975,7 +2991,7 @@ static const EnumEntry<unsigned> ElfMipsASEFlags[] = {
{"GINV", Mips::AFL_ASE_GINV},
};
-static const EnumEntry<unsigned> ElfMipsFpABIType[] = {
+const EnumEntry<unsigned> ElfMipsFpABIType[] = {
{"Hard or soft float", Mips::Val_GNU_MIPS_ABI_FP_ANY},
{"Hard float (double precision)", Mips::Val_GNU_MIPS_ABI_FP_DOUBLE},
{"Hard float (single precision)", Mips::Val_GNU_MIPS_ABI_FP_SINGLE},
@@ -3762,6 +3778,15 @@ void GNUELFDumper<ELFT>::printSymbol(const Elf_Sym &Symbol, unsigned SymIndex,
Fields[5].Str.append(" | " + to_hexString(Other, false));
Fields[5].Str.append("]");
}
+ } else if (this->Obj.getHeader().e_machine == ELF::EM_RISCV) {
+ uint8_t Other = Symbol.st_other & ~0x3;
+ if (Other & STO_RISCV_VARIANT_CC) {
+ Other &= ~STO_RISCV_VARIANT_CC;
+ Fields[5].Str += " [VARIANT_CC";
+ if (Other != 0)
+ Fields[5].Str.append(" | " + to_hexString(Other, false));
+ Fields[5].Str.append("]");
+ }
} else {
Fields[5].Str +=
" [<other: " + to_string(format_hex(Symbol.st_other, 2)) + ">]";
@@ -4358,7 +4383,7 @@ template <class ELFT> void GNUELFDumper<ELFT>::printDynamicTable() {
for (auto Entry : Table) {
uintX_t Tag = Entry.getTag();
std::string Type =
- std::string("(") + this->Obj.getDynamicTagAsString(Tag).c_str() + ")";
+ std::string("(") + this->Obj.getDynamicTagAsString(Tag) + ")";
std::string Value = this->getDynamicEntry(Tag, Entry.getVal());
OS << " " << format_hex(Tag, ELFT::Is64Bits ? 18 : 10)
<< format(ValueFmt.c_str(), Type.c_str()) << Value << "\n";
@@ -4951,7 +4976,7 @@ static std::string getGNUBuildId(ArrayRef<uint8_t> Desc) {
return OS.str();
}
-static StringRef getGNUGoldVersion(ArrayRef<uint8_t> Desc) {
+static StringRef getDescAsStringRef(ArrayRef<uint8_t> Desc) {
return StringRef(reinterpret_cast<const char *>(Desc.data()), Desc.size());
}
@@ -4975,7 +5000,7 @@ static bool printGNUNote(raw_ostream &OS, uint32_t NoteType,
break;
}
case ELF::NT_GNU_GOLD_VERSION:
- OS << " Version: " << getGNUGoldVersion(Desc);
+ OS << " Version: " << getDescAsStringRef(Desc);
break;
case ELF::NT_GNU_PROPERTY_TYPE_0:
OS << " Properties:";
@@ -4987,7 +5012,27 @@ static bool printGNUNote(raw_ostream &OS, uint32_t NoteType,
return true;
}
-static const EnumEntry<unsigned> FreeBSDFeatureCtlFlags[] = {
+template <typename ELFT>
+static bool printLLVMOMPOFFLOADNote(raw_ostream &OS, uint32_t NoteType,
+ ArrayRef<uint8_t> Desc) {
+ switch (NoteType) {
+ default:
+ return false;
+ case ELF::NT_LLVM_OPENMP_OFFLOAD_VERSION:
+ OS << " Version: " << getDescAsStringRef(Desc);
+ break;
+ case ELF::NT_LLVM_OPENMP_OFFLOAD_PRODUCER:
+ OS << " Producer: " << getDescAsStringRef(Desc);
+ break;
+ case ELF::NT_LLVM_OPENMP_OFFLOAD_PRODUCER_VERSION:
+ OS << " Producer version: " << getDescAsStringRef(Desc);
+ break;
+ }
+ OS << '\n';
+ return true;
+}
+
+const EnumEntry<unsigned> FreeBSDFeatureCtlFlags[] = {
{"ASLR_DISABLE", NT_FREEBSD_FCTL_ASLR_DISABLE},
{"PROTMAX_DISABLE", NT_FREEBSD_FCTL_PROTMAX_DISABLE},
{"STKGAP_DISABLE", NT_FREEBSD_FCTL_STKGAP_DISABLE},
@@ -5251,14 +5296,14 @@ static void printCoreNote(raw_ostream &OS, const CoreNote &Note) {
}
}
-static const NoteType GenericNoteTypes[] = {
+const NoteType GenericNoteTypes[] = {
{ELF::NT_VERSION, "NT_VERSION (version)"},
{ELF::NT_ARCH, "NT_ARCH (architecture)"},
{ELF::NT_GNU_BUILD_ATTRIBUTE_OPEN, "OPEN"},
{ELF::NT_GNU_BUILD_ATTRIBUTE_FUNC, "func"},
};
-static const NoteType GNUNoteTypes[] = {
+const NoteType GNUNoteTypes[] = {
{ELF::NT_GNU_ABI_TAG, "NT_GNU_ABI_TAG (ABI version tag)"},
{ELF::NT_GNU_HWCAP, "NT_GNU_HWCAP (DSO-supplied software HWCAP info)"},
{ELF::NT_GNU_BUILD_ID, "NT_GNU_BUILD_ID (unique build ID bitstring)"},
@@ -5266,7 +5311,7 @@ static const NoteType GNUNoteTypes[] = {
{ELF::NT_GNU_PROPERTY_TYPE_0, "NT_GNU_PROPERTY_TYPE_0 (property note)"},
};
-static const NoteType FreeBSDCoreNoteTypes[] = {
+const NoteType FreeBSDCoreNoteTypes[] = {
{ELF::NT_FREEBSD_THRMISC, "NT_THRMISC (thrmisc structure)"},
{ELF::NT_FREEBSD_PROCSTAT_PROC, "NT_PROCSTAT_PROC (proc data)"},
{ELF::NT_FREEBSD_PROCSTAT_FILES, "NT_PROCSTAT_FILES (files data)"},
@@ -5280,7 +5325,7 @@ static const NoteType FreeBSDCoreNoteTypes[] = {
{ELF::NT_FREEBSD_PROCSTAT_AUXV, "NT_PROCSTAT_AUXV (auxv data)"},
};
-static const NoteType FreeBSDNoteTypes[] = {
+const NoteType FreeBSDNoteTypes[] = {
{ELF::NT_FREEBSD_ABI_TAG, "NT_FREEBSD_ABI_TAG (ABI version tag)"},
{ELF::NT_FREEBSD_NOINIT_TAG, "NT_FREEBSD_NOINIT_TAG (no .init tag)"},
{ELF::NT_FREEBSD_ARCH_TAG, "NT_FREEBSD_ARCH_TAG (architecture tag)"},
@@ -5288,7 +5333,15 @@ static const NoteType FreeBSDNoteTypes[] = {
"NT_FREEBSD_FEATURE_CTL (FreeBSD feature control)"},
};
-static const NoteType AMDNoteTypes[] = {
+const NoteType OpenBSDCoreNoteTypes[] = {
+ {ELF::NT_OPENBSD_PROCINFO, "NT_OPENBSD_PROCINFO (procinfo structure)"},
+ {ELF::NT_OPENBSD_AUXV, "NT_OPENBSD_AUXV (ELF auxiliary vector data)"},
+ {ELF::NT_OPENBSD_REGS, "NT_OPENBSD_REGS (regular registers)"},
+ {ELF::NT_OPENBSD_FPREGS, "NT_OPENBSD_FPREGS (floating point registers)"},
+ {ELF::NT_OPENBSD_WCOOKIE, "NT_OPENBSD_WCOOKIE (window cookie)"},
+};
+
+const NoteType AMDNoteTypes[] = {
{ELF::NT_AMD_HSA_CODE_OBJECT_VERSION,
"NT_AMD_HSA_CODE_OBJECT_VERSION (AMD HSA Code Object Version)"},
{ELF::NT_AMD_HSA_HSAIL, "NT_AMD_HSA_HSAIL (AMD HSA HSAIL Properties)"},
@@ -5298,11 +5351,20 @@ static const NoteType AMDNoteTypes[] = {
{ELF::NT_AMD_PAL_METADATA, "NT_AMD_PAL_METADATA (AMD PAL Metadata)"},
};
-static const NoteType AMDGPUNoteTypes[] = {
+const NoteType AMDGPUNoteTypes[] = {
{ELF::NT_AMDGPU_METADATA, "NT_AMDGPU_METADATA (AMDGPU Metadata)"},
};
-static const NoteType CoreNoteTypes[] = {
+const NoteType LLVMOMPOFFLOADNoteTypes[] = {
+ {ELF::NT_LLVM_OPENMP_OFFLOAD_VERSION,
+ "NT_LLVM_OPENMP_OFFLOAD_VERSION (image format version)"},
+ {ELF::NT_LLVM_OPENMP_OFFLOAD_PRODUCER,
+ "NT_LLVM_OPENMP_OFFLOAD_PRODUCER (producing toolchain)"},
+ {ELF::NT_LLVM_OPENMP_OFFLOAD_PRODUCER_VERSION,
+ "NT_LLVM_OPENMP_OFFLOAD_PRODUCER_VERSION (producing toolchain version)"},
+};
+
+const NoteType CoreNoteTypes[] = {
{ELF::NT_PRSTATUS, "NT_PRSTATUS (prstatus structure)"},
{ELF::NT_FPREGSET, "NT_FPREGSET (floating point registers)"},
{ELF::NT_PRPSINFO, "NT_PRPSINFO (prpsinfo structure)"},
@@ -5391,10 +5453,19 @@ StringRef getNoteTypeName(const typename ELFT::Note &Note, unsigned ELFType) {
return FindNote(FreeBSDNoteTypes);
}
}
+ if (Name.startswith("OpenBSD") && ELFType == ELF::ET_CORE) {
+ // OpenBSD also places the generic core notes in the OpenBSD namespace.
+ StringRef Result = FindNote(OpenBSDCoreNoteTypes);
+ if (!Result.empty())
+ return Result;
+ return FindNote(CoreNoteTypes);
+ }
if (Name == "AMD")
return FindNote(AMDNoteTypes);
if (Name == "AMDGPU")
return FindNote(AMDGPUNoteTypes);
+ if (Name == "LLVMOMPOFFLOAD")
+ return FindNote(LLVMOMPOFFLOADNoteTypes);
if (ELFType == ELF::ET_CORE)
return FindNote(CoreNoteTypes);
@@ -5530,6 +5601,9 @@ template <class ELFT> void GNUELFDumper<ELFT>::printNotes() {
OS << " " << N.Type << ":\n " << N.Value << '\n';
return Error::success();
}
+ } else if (Name == "LLVMOMPOFFLOAD") {
+ if (printLLVMOMPOFFLOADNote<ELFT>(OS, Type, Descriptor))
+ return Error::success();
} else if (Name == "CORE") {
if (Type == ELF::NT_FILE) {
DataExtractor DescExtractor(Descriptor,
@@ -6532,6 +6606,10 @@ void LLVMELFDumper<ELFT>::printSymbol(const Elf_Sym &Symbol, unsigned SymIndex,
SymOtherFlags.insert(SymOtherFlags.end(),
std::begin(ElfAArch64SymOtherFlags),
std::end(ElfAArch64SymOtherFlags));
+ } else if (this->Obj.getHeader().e_machine == EM_RISCV) {
+ SymOtherFlags.insert(SymOtherFlags.end(),
+ std::begin(ElfRISCVSymOtherFlags),
+ std::end(ElfRISCVSymOtherFlags));
}
W.printFlags("Other", Symbol.st_other, makeArrayRef(SymOtherFlags), 0x3u);
}
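Both symbol printers treat st_other as two fields: the low two bits (mask 0x3) are the STV_* visibility, and the remaining bits hold processor-specific STO_* flags. That is why the RISC-V hunks mask with ~0x3 before testing STO_RISCV_VARIANT_CC, and why printFlags is passed 0x3u as the enum mask. A short sketch of that split (the decode helper is illustrative):

#include <cstdint>

// ELF st_other layout: bits 0-1 = visibility (STV_*), upper bits = STO_* flags.
struct OtherBits {
  uint8_t Visibility; // STV_DEFAULT/INTERNAL/HIDDEN/PROTECTED
  uint8_t ProcFlags;  // e.g. STO_RISCV_VARIANT_CC on RISC-V
};

static OtherBits decodeStOther(uint8_t StOther) {
  return {static_cast<uint8_t>(StOther & 0x3),
          static_cast<uint8_t>(StOther & ~0x3)};
}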
@@ -6650,7 +6728,7 @@ void LLVMELFDumper<ELFT>::printVersionSymbolSection(const Elf_Shdr *Sec) {
}
}
-static const EnumEntry<unsigned> SymVersionFlags[] = {
+const EnumEntry<unsigned> SymVersionFlags[] = {
{"Base", "BASE", VER_FLG_BASE},
{"Weak", "WEAK", VER_FLG_WEAK},
{"Info", "INFO", VER_FLG_INFO}};
@@ -6818,14 +6896,14 @@ template <class ELFT> void LLVMELFDumper<ELFT>::printBBAddrMaps() {
FunctionSec =
unwrapOrError(this->FileName, this->Obj.getSection(Sec.sh_link));
ListScope L(W, "BBAddrMap");
- Expected<std::vector<Elf_BBAddrMap>> BBAddrMapOrErr =
+ Expected<std::vector<BBAddrMap>> BBAddrMapOrErr =
this->Obj.decodeBBAddrMap(Sec);
if (!BBAddrMapOrErr) {
this->reportUniqueWarning("unable to dump " + this->describe(Sec) + ": " +
toString(BBAddrMapOrErr.takeError()));
continue;
}
- for (const Elf_BBAddrMap &AM : *BBAddrMapOrErr) {
+ for (const BBAddrMap &AM : *BBAddrMapOrErr) {
DictScope D(W, "Function");
W.printHex("At", AM.Addr);
SmallVector<uint32_t> FuncSymIndex =
@@ -6840,7 +6918,7 @@ template <class ELFT> void LLVMELFDumper<ELFT>::printBBAddrMaps() {
W.printString("Name", FuncName);
ListScope L(W, "BB entries");
- for (const typename Elf_BBAddrMap::BBEntry &BBE : AM.BBEntries) {
+ for (const BBAddrMap::BBEntry &BBE : AM.BBEntries) {
DictScope L(W);
W.printHex("Offset", BBE.Offset);
W.printHex("Size", BBE.Size);
@@ -6892,7 +6970,7 @@ static bool printGNUNoteLLVMStyle(uint32_t NoteType, ArrayRef<uint8_t> Desc,
break;
}
case ELF::NT_GNU_GOLD_VERSION:
- W.printString("Version", getGNUGoldVersion(Desc));
+ W.printString("Version", getDescAsStringRef(Desc));
break;
case ELF::NT_GNU_PROPERTY_TYPE_0:
ListScope D(W, "Property");
@@ -6903,6 +6981,26 @@ static bool printGNUNoteLLVMStyle(uint32_t NoteType, ArrayRef<uint8_t> Desc,
return true;
}
+template <typename ELFT>
+static bool printLLVMOMPOFFLOADNoteLLVMStyle(uint32_t NoteType,
+ ArrayRef<uint8_t> Desc,
+ ScopedPrinter &W) {
+ switch (NoteType) {
+ default:
+ return false;
+ case ELF::NT_LLVM_OPENMP_OFFLOAD_VERSION:
+ W.printString("Version", getDescAsStringRef(Desc));
+ break;
+ case ELF::NT_LLVM_OPENMP_OFFLOAD_PRODUCER:
+ W.printString("Producer", getDescAsStringRef(Desc));
+ break;
+ case ELF::NT_LLVM_OPENMP_OFFLOAD_PRODUCER_VERSION:
+ W.printString("Producer version", getDescAsStringRef(Desc));
+ break;
+ }
+ return true;
+}
+
static void printCoreNoteLLVMStyle(const CoreNote &Note, ScopedPrinter &W) {
W.printNumber("Page Size", Note.PageSize);
for (const CoreFileMapping &Mapping : Note.Mappings) {
@@ -6970,6 +7068,9 @@ template <class ELFT> void LLVMELFDumper<ELFT>::printNotes() {
W.printString(N.Type, N.Value);
return Error::success();
}
+ } else if (Name == "LLVMOMPOFFLOAD") {
+ if (printLLVMOMPOFFLOADNoteLLVMStyle<ELFT>(Type, Descriptor, W))
+ return Error::success();
} else if (Name == "CORE") {
if (Type == ELF::NT_FILE) {
DataExtractor DescExtractor(Descriptor,
diff --git a/llvm/tools/llvm-readobj/MachODumper.cpp b/llvm/tools/llvm-readobj/MachODumper.cpp
index 433ca9335324..945b16b8db86 100644
--- a/llvm/tools/llvm-readobj/MachODumper.cpp
+++ b/llvm/tools/llvm-readobj/MachODumper.cpp
@@ -74,7 +74,7 @@ std::unique_ptr<ObjDumper> createMachODumper(const object::MachOObjectFile &Obj,
} // namespace llvm
-static const EnumEntry<uint32_t> MachOMagics[] = {
+const EnumEntry<uint32_t> MachOMagics[] = {
{ "Magic", MachO::MH_MAGIC },
{ "Cigam", MachO::MH_CIGAM },
{ "Magic64", MachO::MH_MAGIC_64 },
@@ -83,7 +83,7 @@ static const EnumEntry<uint32_t> MachOMagics[] = {
{ "FatCigam", MachO::FAT_CIGAM },
};
-static const EnumEntry<uint32_t> MachOHeaderFileTypes[] = {
+const EnumEntry<uint32_t> MachOHeaderFileTypes[] = {
{ "Relocatable", MachO::MH_OBJECT },
{ "Executable", MachO::MH_EXECUTE },
{ "FixedVMLibrary", MachO::MH_FVMLIB },
@@ -97,7 +97,7 @@ static const EnumEntry<uint32_t> MachOHeaderFileTypes[] = {
{ "KextBundle", MachO::MH_KEXT_BUNDLE },
};
-static const EnumEntry<uint32_t> MachOHeaderCpuTypes[] = {
+const EnumEntry<uint32_t> MachOHeaderCpuTypes[] = {
{ "Any" , static_cast<uint32_t>(MachO::CPU_TYPE_ANY) },
{ "X86" , MachO::CPU_TYPE_X86 },
{ "X86-64" , MachO::CPU_TYPE_X86_64 },
@@ -109,7 +109,7 @@ static const EnumEntry<uint32_t> MachOHeaderCpuTypes[] = {
{ "PowerPC64" , MachO::CPU_TYPE_POWERPC64 },
};
-static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesX86[] = {
+const EnumEntry<uint32_t> MachOHeaderCpuSubtypesX86[] = {
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_I386_ALL),
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_386),
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_486),
@@ -132,13 +132,13 @@ static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesX86[] = {
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_XEON_MP),
};
-static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesX64[] = {
+const EnumEntry<uint32_t> MachOHeaderCpuSubtypesX64[] = {
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_X86_64_ALL),
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_X86_ARCH1),
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_X86_64_H),
};
-static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesARM[] = {
+const EnumEntry<uint32_t> MachOHeaderCpuSubtypesARM[] = {
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_ALL),
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V4T),
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V6),
@@ -153,17 +153,17 @@ static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesARM[] = {
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V7EM),
};
-static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesARM64[] = {
+const EnumEntry<uint32_t> MachOHeaderCpuSubtypesARM64[] = {
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM64_ALL),
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM64_V8),
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM64E),
};
-static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesSPARC[] = {
+const EnumEntry<uint32_t> MachOHeaderCpuSubtypesSPARC[] = {
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_SPARC_ALL),
};
-static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesPPC[] = {
+const EnumEntry<uint32_t> MachOHeaderCpuSubtypesPPC[] = {
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_ALL),
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_601),
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_602),
@@ -179,7 +179,7 @@ static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesPPC[] = {
LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_970),
};
-static const EnumEntry<uint32_t> MachOHeaderFlags[] = {
+const EnumEntry<uint32_t> MachOHeaderFlags[] = {
LLVM_READOBJ_ENUM_ENT(MachO, MH_NOUNDEFS),
LLVM_READOBJ_ENUM_ENT(MachO, MH_INCRLINK),
LLVM_READOBJ_ENUM_ENT(MachO, MH_DYLDLINK),
@@ -208,7 +208,7 @@ static const EnumEntry<uint32_t> MachOHeaderFlags[] = {
LLVM_READOBJ_ENUM_ENT(MachO, MH_APP_EXTENSION_SAFE),
};
-static const EnumEntry<unsigned> MachOSectionTypes[] = {
+const EnumEntry<unsigned> MachOSectionTypes[] = {
{ "Regular" , MachO::S_REGULAR },
{ "ZeroFill" , MachO::S_ZEROFILL },
{ "CStringLiterals" , MachO::S_CSTRING_LITERALS },
@@ -233,7 +233,7 @@ static const EnumEntry<unsigned> MachOSectionTypes[] = {
{ "ThreadLocalInitFunctionPointers", MachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS }
};
-static const EnumEntry<unsigned> MachOSectionAttributes[] = {
+const EnumEntry<unsigned> MachOSectionAttributes[] = {
{ "LocReloc" , 1 << 0 /*S_ATTR_LOC_RELOC */ },
{ "ExtReloc" , 1 << 1 /*S_ATTR_EXT_RELOC */ },
{ "SomeInstructions" , 1 << 2 /*S_ATTR_SOME_INSTRUCTIONS */ },
@@ -246,7 +246,7 @@ static const EnumEntry<unsigned> MachOSectionAttributes[] = {
{ "PureInstructions" , 1 << 23 /*S_ATTR_PURE_INSTRUCTIONS */ },
};
-static const EnumEntry<unsigned> MachOSymbolRefTypes[] = {
+const EnumEntry<unsigned> MachOSymbolRefTypes[] = {
{ "UndefinedNonLazy", 0 },
{ "ReferenceFlagUndefinedLazy", 1 },
{ "ReferenceFlagDefined", 2 },
@@ -255,7 +255,7 @@ static const EnumEntry<unsigned> MachOSymbolRefTypes[] = {
{ "ReferenceFlagPrivateUndefinedLazy", 5 }
};
-static const EnumEntry<unsigned> MachOSymbolFlags[] = {
+const EnumEntry<unsigned> MachOSymbolFlags[] = {
{ "ThumbDef", 0x8 },
{ "ReferencedDynamically", 0x10 },
{ "NoDeadStrip", 0x20 },
@@ -266,7 +266,7 @@ static const EnumEntry<unsigned> MachOSymbolFlags[] = {
{ "ColdFunc", 0x400 },
};
-static const EnumEntry<unsigned> MachOSymbolTypes[] = {
+const EnumEntry<unsigned> MachOSymbolTypes[] = {
{ "Undef", 0x0 },
{ "Abs", 0x2 },
{ "Indirect", 0xA },
diff --git a/llvm/tools/llvm-readobj/ObjDumper.cpp b/llvm/tools/llvm-readobj/ObjDumper.cpp
index 87c229356e20..dc4a3031f914 100644
--- a/llvm/tools/llvm-readobj/ObjDumper.cpp
+++ b/llvm/tools/llvm-readobj/ObjDumper.cpp
@@ -52,9 +52,23 @@ static void printAsPrintable(raw_ostream &W, const uint8_t *Start, size_t Len) {
W << (isPrint(Start[i]) ? static_cast<char>(Start[i]) : '.');
}
-void ObjDumper::printAsStringList(StringRef StringContent) {
+void ObjDumper::printAsStringList(StringRef StringContent,
+ size_t StringDataOffset) {
+ size_t StrSize = StringContent.size();
+ if (StrSize == 0)
+ return;
+ if (StrSize < StringDataOffset) {
+ reportUniqueWarning("offset (0x" + Twine::utohexstr(StringDataOffset) +
+ ") is past the end of the contents (size 0x" +
+ Twine::utohexstr(StrSize) + ")");
+ return;
+ }
+
const uint8_t *StrContent = StringContent.bytes_begin();
- const uint8_t *CurrentWord = StrContent;
+ // Some formats contain additional metadata at the start which should not be
+ // interpreted as strings. Skip these bytes, but account for them in the
+ // string offsets.
+ const uint8_t *CurrentWord = StrContent + StringDataOffset;
const uint8_t *StrEnd = StringContent.bytes_end();
while (CurrentWord <= StrEnd) {
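
The printAsStringList change above skips a metadata prefix but still reports offsets relative to the start of the contents. A standalone sketch of that walk, with illustrative names (dumpStringList, Blob, DataOffset) rather than the dumper's own API:

#include <cstdio>
#include <cstring>
#include <string>

// Print each NUL-terminated string in Blob, skipping a metadata prefix of
// DataOffset bytes while keeping offsets relative to the start of Blob.
static void dumpStringList(const std::string &Blob, size_t DataOffset) {
  if (Blob.empty() || Blob.size() < DataOffset)
    return; // the real dumper emits a warning when the offset is out of range
  const char *Begin = Blob.data();
  const char *End = Begin + Blob.size();
  const char *Cur = Begin + DataOffset;
  while (Cur < End) {
    const char *Nul =
        static_cast<const char *>(std::memchr(Cur, '\0', End - Cur));
    size_t Len = Nul ? size_t(Nul - Cur) : size_t(End - Cur);
    if (Len != 0)
      std::printf("[%6zx] %.*s\n", size_t(Cur - Begin), int(Len), Cur);
    Cur += Len + 1;
  }
}
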
diff --git a/llvm/tools/llvm-readobj/ObjDumper.h b/llvm/tools/llvm-readobj/ObjDumper.h
index 7e1c0ca35127..b395a95f3cb4 100644
--- a/llvm/tools/llvm-readobj/ObjDumper.h
+++ b/llvm/tools/llvm-readobj/ObjDumper.h
@@ -97,6 +97,9 @@ public:
llvm::codeview::GlobalTypeTableBuilder &GlobalCVTypes,
bool GHash) {}
+ // Only implemented for XCOFF.
+ virtual void printAuxiliaryHeader() {}
+
// Only implemented for MachO.
virtual void printMachODataInCode() { }
virtual void printMachOVersionMin() { }
@@ -110,7 +113,7 @@ public:
virtual void printStackMap() const = 0;
- void printAsStringList(StringRef StringContent);
+ void printAsStringList(StringRef StringContent, size_t StringDataOffset = 0);
void printSectionsAsString(const object::ObjectFile &Obj,
ArrayRef<std::string> Sections);
diff --git a/llvm/tools/llvm-readobj/Opts.td b/llvm/tools/llvm-readobj/Opts.td
index 493b93769eb4..7723691e8225 100644
--- a/llvm/tools/llvm-readobj/Opts.td
+++ b/llvm/tools/llvm-readobj/Opts.td
@@ -29,14 +29,14 @@ def file_header : FF<"file-header", "Display file header">;
def headers : FF<"headers", "Equivalent to setting: --file-header, --program-headers, --section-headers">;
defm hex_dump : Eq<"hex-dump", "Display the specified section(s) as hexadecimal bytes">, MetaVarName<"<name or index>">;
def relocs : FF<"relocs", "Display the relocation entries in the file">;
-def section_data : FF<"section-data", "Display section data for each section shown">;
+def section_data : FF<"section-data", "Display section data for each section shown. This option has no effect for GNU style output">;
def section_details : FF<"section-details", "Display the section details">;
def section_headers : FF<"section-headers", "Display section headers">;
def section_mapping : FF<"section-mapping", "Display the section to segment mapping">;
def section_mapping_EQ_false : FF<"section-mapping=false", "Don't display the section to segment mapping">, Flags<[HelpHidden]>;
-def section_relocations : FF<"section-relocations", "Display relocations for each section shown">;
-def section_symbols : FF<"section-symbols", "Display symbols for each section shown">;
-def stack_sizes : FF<"stack-sizes", "Display contents of all stack sizes sections">;
+def section_relocations : FF<"section-relocations", "Display relocations for each section shown. This option has no effect for GNU style output">;
+def section_symbols : FF<"section-symbols", "Display symbols for each section shown. This option has no effect for GNU style output">;
+def stack_sizes : FF<"stack-sizes", "Display contents of all stack sizes sections. This option has no effect for GNU style output">;
def stackmap : FF<"stackmap", "Display contents of stackmap section">;
defm string_dump : Eq<"string-dump", "Display the specified section(s) as a list of strings">, MetaVarName<"<name or index>">;
def string_table : FF<"string-table", "Display the string table (only for XCOFF now)">;
@@ -47,10 +47,10 @@ def unwind : FF<"unwind", "Display unwind information">;
def grp_elf : OptionGroup<"kind">, HelpText<"OPTIONS (ELF specific)">;
def dynamic_table : FF<"dynamic-table", "Display the dynamic section table">, Group<grp_elf>;
def elf_linker_options : FF<"elf-linker-options", "Display the .linker-options section">, Group<grp_elf>;
-defm elf_output_style : Eq<"elf-output-style", "Specify ELF dump style">, Group<grp_elf>;
+defm elf_output_style : Eq<"elf-output-style", "Specify ELF dump style: LLVM or GNU">, Group<grp_elf>;
def histogram : FF<"histogram", "Display bucket list histogram for hash sections">, Group<grp_elf>;
def section_groups : FF<"section-groups", "Display section groups">, Group<grp_elf>;
-def gnu_hash_table : FF<"gnu-hash-table", "Display .gnu.hash section">, Group<grp_elf>;
+def gnu_hash_table : FF<"gnu-hash-table", "Display the GNU hash table for dynamic symbols">, Group<grp_elf>;
def hash_symbols : FF<"hash-symbols", "Display the dynamic symbols derived from the hash section">, Group<grp_elf>;
def hash_table : FF<"hash-table", "Display .hash section">, Group<grp_elf>;
def needed_libs : FF<"needed-libs", "Display the needed libraries">, Group<grp_elf>;
@@ -83,12 +83,16 @@ def coff_load_config : FF<"coff-load-config", "Display load config">, Group<grp_
def coff_resources : FF<"coff-resources", "Display .rsrc section">, Group<grp_coff>;
def coff_tls_directory : FF<"coff-tls-directory", "Display TLS directory">, Group<grp_coff>;
+// XCOFF specific options.
+def grp_xcoff : OptionGroup<"kind">, HelpText<"OPTIONS (XCOFF specific)">;
+def auxiliary_header : FF<"auxiliary-header", "Display the auxiliary header">, Group<grp_xcoff>;
+
def help : FF<"help", "Display this help">;
def version : FF<"version", "Display the version">;
// Ignored for GNU readelf compatibility.
-def : F<"W", "Ignored for GNU readelf compatibility">;
-def : FF<"wide", "Ignored for GNU readelf compatibility">;
+def wide : FF<"wide", "Ignored for GNU readelf compatibility">;
+def : F<"W", "Ignored for GNU readelf compatibility">, Alias<wide>;
// Traditional llvm-readobj Aliases.
def : Flag<["--"], "dt">, Alias<dyn_syms>, HelpText<"Alias for --dyn-syms">;
diff --git a/llvm/tools/llvm-readobj/WasmDumper.cpp b/llvm/tools/llvm-readobj/WasmDumper.cpp
index f7dcaa35656f..d76332d1ba36 100644
--- a/llvm/tools/llvm-readobj/WasmDumper.cpp
+++ b/llvm/tools/llvm-readobj/WasmDumper.cpp
@@ -20,7 +20,7 @@ using namespace object;
namespace {
-static const EnumEntry<unsigned> WasmSymbolTypes[] = {
+const EnumEntry<unsigned> WasmSymbolTypes[] = {
#define ENUM_ENTRY(X) \
{ #X, wasm::WASM_SYMBOL_TYPE_##X }
ENUM_ENTRY(FUNCTION), ENUM_ENTRY(DATA), ENUM_ENTRY(GLOBAL),
@@ -28,7 +28,7 @@ static const EnumEntry<unsigned> WasmSymbolTypes[] = {
#undef ENUM_ENTRY
};
-static const EnumEntry<uint32_t> WasmSectionTypes[] = {
+const EnumEntry<uint32_t> WasmSectionTypes[] = {
#define ENUM_ENTRY(X) \
{ #X, wasm::WASM_SEC_##X }
ENUM_ENTRY(CUSTOM), ENUM_ENTRY(TYPE), ENUM_ENTRY(IMPORT),
@@ -39,7 +39,7 @@ static const EnumEntry<uint32_t> WasmSectionTypes[] = {
#undef ENUM_ENTRY
};
-static const EnumEntry<unsigned> WasmSymbolFlags[] = {
+const EnumEntry<unsigned> WasmSymbolFlags[] = {
#define ENUM_ENTRY(X) \
{ #X, wasm::WASM_SYMBOL_##X }
ENUM_ENTRY(BINDING_GLOBAL),
diff --git a/llvm/tools/llvm-readobj/Win64EHDumper.cpp b/llvm/tools/llvm-readobj/Win64EHDumper.cpp
index 7e84c1bca35d..da964d3132e7 100644
--- a/llvm/tools/llvm-readobj/Win64EHDumper.cpp
+++ b/llvm/tools/llvm-readobj/Win64EHDumper.cpp
@@ -16,13 +16,13 @@ using namespace llvm;
using namespace llvm::object;
using namespace llvm::Win64EH;
-static const EnumEntry<unsigned> UnwindFlags[] = {
+const EnumEntry<unsigned> UnwindFlags[] = {
{ "ExceptionHandler", UNW_ExceptionHandler },
{ "TerminateHandler", UNW_TerminateHandler },
{ "ChainInfo" , UNW_ChainInfo }
};
-static const EnumEntry<unsigned> UnwindOpInfo[] = {
+const EnumEntry<unsigned> UnwindOpInfo[] = {
{ "RAX", 0 },
{ "RCX", 1 },
{ "RDX", 2 },
@@ -125,14 +125,52 @@ static std::error_code getSymbol(const COFFObjectFile &COFF, uint64_t VA,
return inconvertibleErrorCode();
}
+static object::SymbolRef getPreferredSymbol(const COFFObjectFile &COFF,
+ object::SymbolRef Sym,
+ uint32_t &SymbolOffset,
+ bool IsRangeEnd) {
+ // The symbol resolved by ResolveSymbol can be any internal
+ // nondescriptive symbol; try to resolve a more descriptive one.
+ COFFSymbolRef CoffSym = COFF.getCOFFSymbol(Sym);
+ if (CoffSym.getStorageClass() != COFF::IMAGE_SYM_CLASS_LABEL &&
+ CoffSym.getSectionDefinition() == nullptr)
+ return Sym;
+ for (const auto &S : COFF.symbols()) {
+ COFFSymbolRef CS = COFF.getCOFFSymbol(S);
+ if (CS.getSectionNumber() == CoffSym.getSectionNumber() &&
+ CS.getValue() <= CoffSym.getValue() + SymbolOffset &&
+ CS.getStorageClass() != COFF::IMAGE_SYM_CLASS_LABEL &&
+ CS.getSectionDefinition() == nullptr) {
+ uint32_t Offset = CoffSym.getValue() + SymbolOffset - CS.getValue();
+ // For the end of a range, don't pick a symbol with a zero offset;
+ // prefer a symbol with a small positive offset.
+ if (Offset <= SymbolOffset && (!IsRangeEnd || Offset > 0)) {
+ SymbolOffset = Offset;
+ Sym = S;
+ CoffSym = CS;
+ if (CS.isExternal() && SymbolOffset == 0)
+ return Sym;
+ }
+ }
+ }
+ return Sym;
+}
+
static std::string formatSymbol(const Dumper::Context &Ctx,
const coff_section *Section, uint64_t Offset,
- uint32_t Displacement) {
+ uint32_t Displacement,
+ bool IsRangeEnd = false) {
std::string Buffer;
raw_string_ostream OS(Buffer);
SymbolRef Symbol;
if (!Ctx.ResolveSymbol(Section, Offset, Symbol, Ctx.UserData)) {
+ // We found a relocation at the given offset in the section, pointing
+ // at a symbol.
+
+ // Try to resolve label/section symbols into function names.
+ Symbol = getPreferredSymbol(Ctx.COFF, Symbol, Displacement, IsRangeEnd);
+
Expected<StringRef> Name = Symbol.getName();
if (Name) {
OS << *Name;
@@ -207,7 +245,8 @@ void Dumper::printRuntimeFunctionEntry(const Context &Ctx,
SW.printString("StartAddress",
formatSymbol(Ctx, Section, Offset + 0, RF.StartAddress));
SW.printString("EndAddress",
- formatSymbol(Ctx, Section, Offset + 4, RF.EndAddress));
+ formatSymbol(Ctx, Section, Offset + 4, RF.EndAddress,
+ /*IsRangeEnd=*/true));
SW.printString("UnwindInfoAddress",
formatSymbol(Ctx, Section, Offset + 8, RF.UnwindInfoOffset));
}
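
The getPreferredSymbol helper above encodes a small selection policy: among candidates in the same section at or below the target address, take the one with the smallest offset, but for a range-end address skip exact matches so the previous function's symbol wins. A self-contained sketch of just that policy over plain (value, name) pairs, assuming all candidates already belong to the right section (names are illustrative):

#include <cstdint>
#include <string>
#include <vector>

struct Candidate {
  uint64_t Value;   // symbol value within the section
  std::string Name;
};

// Pick the candidate closest at-or-below Target. For a range-end address,
// a zero offset is rejected so the label of the preceding function is used
// rather than the symbol that starts exactly at the end address.
static const Candidate *pickPreferred(const std::vector<Candidate> &Syms,
                                      uint64_t Target, bool IsRangeEnd) {
  const Candidate *Best = nullptr;
  uint64_t BestOffset = UINT64_MAX;
  for (const Candidate &C : Syms) {
    if (C.Value > Target)
      continue;
    uint64_t Offset = Target - C.Value;
    if (IsRangeEnd && Offset == 0)
      continue;
    if (Offset < BestOffset) {
      BestOffset = Offset;
      Best = &C;
    }
  }
  return Best;
}
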
diff --git a/llvm/tools/llvm-readobj/XCOFFDumper.cpp b/llvm/tools/llvm-readobj/XCOFFDumper.cpp
index 94ef96e447ce..38e459cd5425 100644
--- a/llvm/tools/llvm-readobj/XCOFFDumper.cpp
+++ b/llvm/tools/llvm-readobj/XCOFFDumper.cpp
@@ -13,8 +13,11 @@
#include "ObjDumper.h"
#include "llvm-readobj.h"
#include "llvm/Object/XCOFFObjectFile.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/ScopedPrinter.h"
+#include <stddef.h>
+
using namespace llvm;
using namespace object;
@@ -27,6 +30,7 @@ public:
: ObjDumper(Writer, Obj.getFileName()), Obj(Obj) {}
void printFileHeaders() override;
+ void printAuxiliaryHeader() override;
void printSectionHeaders() override;
void printRelocations() override;
void printSymbols() override;
@@ -44,7 +48,11 @@ private:
void printCsectAuxEnt(XCOFFCsectAuxRef AuxEntRef);
void printSectAuxEntForStat(const XCOFFSectAuxEntForStat *AuxEntPtr);
void printSymbol(const SymbolRef &);
- void printRelocations(ArrayRef<XCOFFSectionHeader32> Sections);
+ template <typename RelTy> void printRelocation(RelTy Reloc);
+ template <typename Shdr, typename RelTy>
+ void printRelocations(ArrayRef<Shdr> Sections);
+ void printAuxiliaryHeader(const XCOFFAuxiliaryHeader32 *AuxHeader);
+ void printAuxiliaryHeader(const XCOFFAuxiliaryHeader64 *AuxHeader);
const XCOFFObjectFile &Obj;
};
} // anonymous namespace
@@ -96,6 +104,13 @@ void XCOFFDumper::printFileHeaders() {
// XCOFFObjectFile has the necessary support.
}
+void XCOFFDumper::printAuxiliaryHeader() {
+ if (Obj.is64Bit())
+ printAuxiliaryHeader(Obj.auxiliaryHeader64());
+ else
+ printAuxiliaryHeader(Obj.auxiliaryHeader32());
+}
+
void XCOFFDumper::printSectionHeaders() {
if (Obj.is64Bit())
printSectionHeaders(Obj.sections64());
@@ -105,12 +120,12 @@ void XCOFFDumper::printSectionHeaders() {
void XCOFFDumper::printRelocations() {
if (Obj.is64Bit())
- llvm_unreachable("64-bit relocation output not implemented!");
+ printRelocations<XCOFFSectionHeader64, XCOFFRelocation64>(Obj.sections64());
else
- printRelocations(Obj.sections32());
+ printRelocations<XCOFFSectionHeader32, XCOFFRelocation32>(Obj.sections32());
}
-static const EnumEntry<XCOFF::RelocationType> RelocationTypeNameclass[] = {
+const EnumEntry<XCOFF::RelocationType> RelocationTypeNameclass[] = {
#define ECase(X) \
{ #X, XCOFF::X }
ECase(R_POS), ECase(R_RL), ECase(R_RLA), ECase(R_NEG),
@@ -122,50 +137,71 @@ static const EnumEntry<XCOFF::RelocationType> RelocationTypeNameclass[] = {
#undef ECase
};
-void XCOFFDumper::printRelocations(ArrayRef<XCOFFSectionHeader32> Sections) {
- if (!opts::ExpandRelocs)
- report_fatal_error("Unexpanded relocation output not implemented.");
+template <typename RelTy> void XCOFFDumper::printRelocation(RelTy Reloc) {
+ Expected<StringRef> ErrOrSymbolName =
+ Obj.getSymbolNameByIndex(Reloc.SymbolIndex);
+ if (Error E = ErrOrSymbolName.takeError()) {
+ reportUniqueWarning(std::move(E));
+ return;
+ }
+ StringRef SymbolName = *ErrOrSymbolName;
+ StringRef RelocName = XCOFF::getRelocationTypeString(Reloc.Type);
+ if (opts::ExpandRelocs) {
+ DictScope Group(W, "Relocation");
+ W.printHex("Virtual Address", Reloc.VirtualAddress);
+ W.printNumber("Symbol", SymbolName, Reloc.SymbolIndex);
+ W.printString("IsSigned", Reloc.isRelocationSigned() ? "Yes" : "No");
+ W.printNumber("FixupBitValue", Reloc.isFixupIndicated() ? 1 : 0);
+ W.printNumber("Length", Reloc.getRelocatedLength());
+ W.printEnum("Type", (uint8_t)Reloc.Type,
+ makeArrayRef(RelocationTypeNameclass));
+ } else {
+ raw_ostream &OS = W.startLine();
+ OS << W.hex(Reloc.VirtualAddress) << " " << RelocName << " " << SymbolName
+ << "(" << Reloc.SymbolIndex << ") " << W.hex(Reloc.Info) << "\n";
+ }
+}
+template <typename Shdr, typename RelTy>
+void XCOFFDumper::printRelocations(ArrayRef<Shdr> Sections) {
ListScope LS(W, "Relocations");
uint16_t Index = 0;
- for (const auto &Sec : Sections) {
+ for (const Shdr &Sec : Sections) {
++Index;
// Only the .text, .data, .tdata, and STYP_DWARF sections have relocations.
if (Sec.Flags != XCOFF::STYP_TEXT && Sec.Flags != XCOFF::STYP_DATA &&
Sec.Flags != XCOFF::STYP_TDATA && Sec.Flags != XCOFF::STYP_DWARF)
continue;
- auto Relocations = unwrapOrError(Obj.getFileName(), Obj.relocations(Sec));
+ Expected<ArrayRef<RelTy>> ErrOrRelocations = Obj.relocations<Shdr, RelTy>(Sec);
+ if (Error E = ErrOrRelocations.takeError()) {
+ reportUniqueWarning(std::move(E));
+ continue;
+ }
+
+ const ArrayRef<RelTy> Relocations = *ErrOrRelocations;
if (Relocations.empty())
continue;
W.startLine() << "Section (index: " << Index << ") " << Sec.getName()
<< " {\n";
- for (auto Reloc : Relocations) {
- StringRef SymbolName = unwrapOrError(
- Obj.getFileName(), Obj.getSymbolNameByIndex(Reloc.SymbolIndex));
-
- DictScope RelocScope(W, "Relocation");
- W.printHex("Virtual Address", Reloc.VirtualAddress);
- W.printNumber("Symbol", SymbolName, Reloc.SymbolIndex);
- W.printString("IsSigned", Reloc.isRelocationSigned() ? "Yes" : "No");
- W.printNumber("FixupBitValue", Reloc.isFixupIndicated() ? 1 : 0);
- W.printNumber("Length", Reloc.getRelocatedLength());
- W.printEnum("Type", (uint8_t)Reloc.Type,
- makeArrayRef(RelocationTypeNameclass));
- }
+ W.indent();
+
+ for (const RelTy Reloc : Relocations)
+ printRelocation(Reloc);
+
W.unindent();
W.startLine() << "}\n";
}
}
-static const EnumEntry<XCOFF::CFileStringType> FileStringType[] = {
+const EnumEntry<XCOFF::CFileStringType> FileStringType[] = {
#define ECase(X) \
{ #X, XCOFF::X }
ECase(XFT_FN), ECase(XFT_CT), ECase(XFT_CV), ECase(XFT_CD)
#undef ECase
};
-static const EnumEntry<XCOFF::SymbolAuxType> SymAuxType[] = {
+const EnumEntry<XCOFF::SymbolAuxType> SymAuxType[] = {
#define ECase(X) \
{ #X, XCOFF::X }
ECase(AUX_EXCEPT), ECase(AUX_FCN), ECase(AUX_SYM), ECase(AUX_FILE),
@@ -203,7 +239,7 @@ static const EnumEntry<XCOFF::StorageMappingClass> CsectStorageMappingClass[] =
#undef ECase
};
-static const EnumEntry<XCOFF::SymbolType> CsectSymbolTypeClass[] = {
+const EnumEntry<XCOFF::SymbolType> CsectSymbolTypeClass[] = {
#define ECase(X) \
{ #X, XCOFF::X }
ECase(XTY_ER), ECase(XTY_SD), ECase(XTY_LD), ECase(XTY_CM)
@@ -253,7 +289,7 @@ void XCOFFDumper::printSectAuxEntForStat(
W.printNumber("NumberOfLineNum", AuxEntPtr->NumberOfLineNum);
}
-static const EnumEntry<XCOFF::StorageClass> SymStorageClass[] = {
+const EnumEntry<XCOFF::StorageClass> SymStorageClass[] = {
#define ECase(X) \
{ #X, XCOFF::X }
ECase(C_NULL), ECase(C_AUTO), ECase(C_EXT), ECase(C_STAT),
@@ -302,14 +338,14 @@ static StringRef GetSymbolValueName(XCOFF::StorageClass SC) {
}
}
-static const EnumEntry<XCOFF::CFileLangId> CFileLangIdClass[] = {
+const EnumEntry<XCOFF::CFileLangId> CFileLangIdClass[] = {
#define ECase(X) \
{ #X, XCOFF::X }
ECase(TB_C), ECase(TB_CPLUSPLUS)
#undef ECase
};
-static const EnumEntry<XCOFF::CFileCpuId> CFileCpuIdClass[] = {
+const EnumEntry<XCOFF::CFileCpuId> CFileCpuIdClass[] = {
#define ECase(X) \
{ #X, XCOFF::X }
ECase(TCPU_PPC64), ECase(TCPU_COM), ECase(TCPU_970)
@@ -460,7 +496,12 @@ void XCOFFDumper::printSymbols() {
void XCOFFDumper::printStringTable() {
DictScope DS(W, "StringTable");
StringRef StrTable = Obj.getStringTable();
- printAsStringList(StrTable);
+ uint32_t StrTabSize = StrTable.size();
+ W.printNumber("Length", StrTabSize);
+ // Print strings from the fifth byte, since the first four bytes contain the
+ // length (in bytes) of the string table (including the length field).
+ if (StrTabSize > 4)
+ printAsStringList(StrTable, 4);
}
void XCOFFDumper::printDynamicSymbols() {
@@ -476,10 +517,46 @@ void XCOFFDumper::printStackMap() const {
}
void XCOFFDumper::printNeededLibraries() {
- llvm_unreachable("Unimplemented functionality for XCOFFDumper");
+ ListScope D(W, "NeededLibraries");
+ auto ImportFilesOrError = Obj.getImportFileTable();
+ if (!ImportFilesOrError) {
+ reportUniqueWarning(ImportFilesOrError.takeError());
+ return;
+ }
+
+ StringRef ImportFileTable = ImportFilesOrError.get();
+ const char *CurrentStr = ImportFileTable.data();
+ const char *TableEnd = ImportFileTable.end();
+ // Default column width for names is 13 even if no names are that long.
+ size_t BaseWidth = 13;
+
+ // Get the max width of BASE columns.
+ for (size_t StrIndex = 0; CurrentStr < TableEnd; ++StrIndex) {
+ size_t CurrentLen = strlen(CurrentStr);
+ CurrentStr += CurrentLen + 1;
+ if (StrIndex % 3 == 1)
+ BaseWidth = std::max(BaseWidth, CurrentLen);
+ }
+
+ auto &OS = static_cast<formatted_raw_ostream &>(W.startLine());
+ // Each entry consists of 3 strings: the path_name, base_name and
+ // archive_member_name. The first entry is a default LIBPATH value and other
+ // entries have no path_name. We just dump the base_name and
+ // archive_member_name here.
+ OS << left_justify("BASE", BaseWidth) << " MEMBER\n";
+ CurrentStr = ImportFileTable.data();
+ for (size_t StrIndex = 0; CurrentStr < TableEnd;
+ ++StrIndex, CurrentStr += strlen(CurrentStr) + 1) {
+ if (StrIndex >= 3 && StrIndex % 3 != 0) {
+ if (StrIndex % 3 == 1)
+ OS << " " << left_justify(CurrentStr, BaseWidth) << " ";
+ else
+ OS << CurrentStr << "\n";
+ }
+ }
}
-static const EnumEntry<XCOFF::SectionTypeFlags> SectionTypeFlagsNames[] = {
+const EnumEntry<XCOFF::SectionTypeFlags> SectionTypeFlagsNames[] = {
#define ECase(X) \
{ #X, XCOFF::X }
ECase(STYP_PAD), ECase(STYP_DWARF), ECase(STYP_TEXT),
@@ -523,6 +600,176 @@ void XCOFFDumper::printGenericSectionHeader(T &Sec) const {
W.printNumber("NumberOfLineNumbers", Sec.NumberOfLineNumbers);
}
+void XCOFFDumper::printAuxiliaryHeader(
+ const XCOFFAuxiliaryHeader32 *AuxHeader) {
+ if (AuxHeader == nullptr)
+ return;
+ uint16_t AuxSize = Obj.getOptionalHeaderSize();
+ uint16_t PartialFieldOffset = AuxSize;
+ const char *PartialFieldName = nullptr;
+
+ DictScope DS(W, "AuxiliaryHeader");
+
+#define PrintAuxMember32(H, S, T) \
+ if (offsetof(XCOFFAuxiliaryHeader32, T) + \
+ sizeof(XCOFFAuxiliaryHeader32::T) <= \
+ AuxSize) \
+ W.print##H(S, AuxHeader->T); \
+ else if (offsetof(XCOFFAuxiliaryHeader32, T) < AuxSize) { \
+ PartialFieldOffset = offsetof(XCOFFAuxiliaryHeader32, T); \
+ PartialFieldName = S; \
+ }
+
+ PrintAuxMember32(Hex, "Magic", AuxMagic);
+ PrintAuxMember32(Hex, "Version", Version);
+ PrintAuxMember32(Hex, "Size of .text section", TextSize);
+ PrintAuxMember32(Hex, "Size of .data section", InitDataSize);
+ PrintAuxMember32(Hex, "Size of .bss section", BssDataSize);
+ PrintAuxMember32(Hex, "Entry point address", EntryPointAddr);
+ PrintAuxMember32(Hex, ".text section start address", TextStartAddr);
+ PrintAuxMember32(Hex, ".data section start address", DataStartAddr);
+ PrintAuxMember32(Hex, "TOC anchor address", TOCAnchorAddr);
+ PrintAuxMember32(Number, "Section number of entryPoint", SecNumOfEntryPoint);
+ PrintAuxMember32(Number, "Section number of .text", SecNumOfText);
+ PrintAuxMember32(Number, "Section number of .data", SecNumOfData);
+ PrintAuxMember32(Number, "Section number of TOC", SecNumOfTOC);
+ PrintAuxMember32(Number, "Section number of loader data", SecNumOfLoader);
+ PrintAuxMember32(Number, "Section number of .bss", SecNumOfBSS);
+ PrintAuxMember32(Hex, "Maxium alignment of .text", MaxAlignOfText);
+ PrintAuxMember32(Hex, "Maxium alignment of .data", MaxAlignOfData);
+ PrintAuxMember32(Hex, "Module type", ModuleType);
+ PrintAuxMember32(Hex, "CPU type of objects", CpuFlag);
+ PrintAuxMember32(Hex, "(Reserved)", CpuType);
+ PrintAuxMember32(Hex, "Maximum stack size", MaxStackSize);
+ PrintAuxMember32(Hex, "Maximum data size", MaxDataSize);
+ PrintAuxMember32(Hex, "Reserved for debugger", ReservedForDebugger);
+ PrintAuxMember32(Hex, "Text page size", TextPageSize);
+ PrintAuxMember32(Hex, "Data page size", DataPageSize);
+ PrintAuxMember32(Hex, "Stack page size", StackPageSize);
+ if (offsetof(XCOFFAuxiliaryHeader32, FlagAndTDataAlignment) +
+ sizeof(XCOFFAuxiliaryHeader32::FlagAndTDataAlignment) <=
+ AuxSize) {
+ W.printHex("Flag", AuxHeader->getFlag());
+ W.printHex("Alignment of thread-local storage",
+ AuxHeader->getTDataAlignment());
+ }
+
+ PrintAuxMember32(Number, "Section number for .tdata", SecNumOfTData);
+ PrintAuxMember32(Number, "Section number for .tbss", SecNumOfTBSS);
+
+ // Report fields that are only partially present, plus any trailing raw data.
+ if (PartialFieldOffset < AuxSize) {
+ std::string ErrInfo;
+ llvm::raw_string_ostream StringOS(ErrInfo);
+ StringOS << "Only partial field for " << PartialFieldName << " at offset ("
+ << PartialFieldOffset << ").";
+ StringOS.flush();
+ reportWarning(
+ make_error<GenericBinaryError>(ErrInfo, object_error::parse_failed),
+ "-");
+ W.printBinary(
+ "Raw data", "",
+ ArrayRef<uint8_t>((const uint8_t *)(AuxHeader) + PartialFieldOffset,
+ AuxSize - PartialFieldOffset));
+ } else if (sizeof(XCOFFAuxiliaryHeader32) < AuxSize) {
+ reportWarning(make_error<GenericBinaryError>(
+ "There are extra data beyond auxiliary header",
+ object_error::parse_failed),
+ "-");
+ W.printBinary("Extra raw data", "",
+ ArrayRef<uint8_t>((const uint8_t *)(AuxHeader) +
+ sizeof(XCOFFAuxiliaryHeader32),
+ AuxSize - sizeof(XCOFFAuxiliaryHeader32)));
+ }
+
+#undef PrintAuxMember32
+}
+
+void XCOFFDumper::printAuxiliaryHeader(
+ const XCOFFAuxiliaryHeader64 *AuxHeader) {
+ if (AuxHeader == nullptr)
+ return;
+ uint16_t AuxSize = Obj.getOptionalHeaderSize();
+ uint16_t PartialFieldOffset = AuxSize;
+ const char *PartialFieldName = nullptr;
+
+ DictScope DS(W, "AuxiliaryHeader");
+
+#define PrintAuxMember64(H, S, T) \
+ if (offsetof(XCOFFAuxiliaryHeader64, T) + \
+ sizeof(XCOFFAuxiliaryHeader64::T) <= \
+ AuxSize) \
+ W.print##H(S, AuxHeader->T); \
+ else if (offsetof(XCOFFAuxiliaryHeader64, T) < AuxSize) { \
+ PartialFieldOffset = offsetof(XCOFFAuxiliaryHeader64, T); \
+ PartialFieldName = S; \
+ }
+
+ PrintAuxMember64(Hex, "Magic", AuxMagic);
+ PrintAuxMember64(Hex, "Version", Version);
+ PrintAuxMember64(Hex, "Reserved for debugger", ReservedForDebugger);
+ PrintAuxMember64(Hex, ".text section start address", TextStartAddr);
+ PrintAuxMember64(Hex, ".data section start address", DataStartAddr);
+ PrintAuxMember64(Hex, "TOC anchor address", TOCAnchorAddr);
+ PrintAuxMember64(Number, "Section number of entryPoint", SecNumOfEntryPoint);
+ PrintAuxMember64(Number, "Section number of .text", SecNumOfText);
+ PrintAuxMember64(Number, "Section number of .data", SecNumOfData);
+ PrintAuxMember64(Number, "Section number of TOC", SecNumOfTOC);
+ PrintAuxMember64(Number, "Section number of loader data", SecNumOfLoader);
+ PrintAuxMember64(Number, "Section number of .bss", SecNumOfBSS);
+ PrintAuxMember64(Hex, "Maxium alignment of .text", MaxAlignOfText);
+ PrintAuxMember64(Hex, "Maxium alignment of .data", MaxAlignOfData);
+ PrintAuxMember64(Hex, "Module type", ModuleType);
+ PrintAuxMember64(Hex, "CPU type of objects", CpuFlag);
+ PrintAuxMember64(Hex, "(Reserved)", CpuType);
+ PrintAuxMember64(Hex, "Text page size", TextPageSize);
+ PrintAuxMember64(Hex, "Data page size", DataPageSize);
+ PrintAuxMember64(Hex, "Stack page size", StackPageSize);
+ if (offsetof(XCOFFAuxiliaryHeader64, FlagAndTDataAlignment) +
+ sizeof(XCOFFAuxiliaryHeader64::FlagAndTDataAlignment) <=
+ AuxSize) {
+ W.printHex("Flag", AuxHeader->getFlag());
+ W.printHex("Alignment of thread-local storage",
+ AuxHeader->getTDataAlignment());
+ }
+ PrintAuxMember64(Hex, "Size of .text section", TextSize);
+ PrintAuxMember64(Hex, "Size of .data section", InitDataSize);
+ PrintAuxMember64(Hex, "Size of .bss section", BssDataSize);
+ PrintAuxMember64(Hex, "Entry point address", EntryPointAddr);
+ PrintAuxMember64(Hex, "Maximum stack size", MaxStackSize);
+ PrintAuxMember64(Hex, "Maximum data size", MaxDataSize);
+ PrintAuxMember64(Number, "Section number for .tdata", SecNumOfTData);
+ PrintAuxMember64(Number, "Section number for .tbss", SecNumOfTBSS);
+ PrintAuxMember64(Hex, "Additional flags 64-bit XCOFF", XCOFF64Flag);
+
+ if (PartialFieldOffset < AuxSize) {
+ std::string ErrInfo;
+ llvm::raw_string_ostream StringOS(ErrInfo);
+ StringOS << "Only partial field for " << PartialFieldName << " at offset ("
+ << PartialFieldOffset << ").";
+ StringOS.flush();
+ reportWarning(
+ make_error<GenericBinaryError>(ErrInfo, object_error::parse_failed),
+ "-");
+ W.printBinary(
+ "Raw data", "",
+ ArrayRef<uint8_t>((const uint8_t *)(AuxHeader) + PartialFieldOffset,
+ AuxSize - PartialFieldOffset));
+ } else if (sizeof(XCOFFAuxiliaryHeader64) < AuxSize) {
+ reportWarning(make_error<GenericBinaryError>(
+ "There are extra data beyond auxiliary header",
+ object_error::parse_failed),
+ "-");
+ W.printBinary("Extra raw data", "",
+ ArrayRef<uint8_t>((const uint8_t *)(AuxHeader) +
+ sizeof(XCOFFAuxiliaryHeader64),
+ AuxSize - sizeof(XCOFFAuxiliaryHeader64)));
+ }
+
+#undef PrintAuxMember64
+}
+
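
The PrintAuxMember32/64 macros above hinge on one check: a field is printed only when offsetof plus sizeof fits inside the optional-header size recorded in the file header; a field that merely starts inside that size is the "partial field" case. A macro-free sketch of that check for a single field, using a stand-in struct rather than the real XCOFF headers:

#include <cstddef>
#include <cstdint>
#include <cstdio>

struct AuxHeader32Sketch { // stand-in for XCOFFAuxiliaryHeader32
  uint16_t AuxMagic;
  uint16_t Version;
  uint32_t TextSize;
  // ... more fields ...
};

// Print TextSize only if it lies entirely within the declared header size;
// warn if it is cut off partway through.
static void printTextSizeIfPresent(const AuxHeader32Sketch *H,
                                   uint16_t AuxSize) {
  constexpr size_t Off = offsetof(AuxHeader32Sketch, TextSize);
  if (Off + sizeof(H->TextSize) <= AuxSize)
    std::printf("Size of .text section: 0x%x\n", (unsigned)H->TextSize);
  else if (Off < AuxSize)
    std::printf("warning: only a partial TextSize field is present\n");
}
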
template <typename T>
void XCOFFDumper::printSectionHeaders(ArrayRef<T> Sections) {
ListScope Group(W, "Sections");
diff --git a/llvm/tools/llvm-readobj/llvm-readobj.cpp b/llvm/tools/llvm-readobj/llvm-readobj.cpp
index 0b49f03f4275..a598e2c28832 100644
--- a/llvm/tools/llvm-readobj/llvm-readobj.cpp
+++ b/llvm/tools/llvm-readobj/llvm-readobj.cpp
@@ -23,6 +23,7 @@
#include "WindowsResourceDumper.h"
#include "llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h"
#include "llvm/DebugInfo/CodeView/MergingTypeTableBuilder.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/COFFImportFile.h"
#include "llvm/Object/ELFObjectFile.h"
@@ -44,7 +45,6 @@
#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/ScopedPrinter.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/WithColor.h"
using namespace llvm;
@@ -65,7 +65,7 @@ enum ID {
#include "Opts.inc"
#undef PREFIX
-static const opt::OptTable::Info InfoTable[] = {
+const opt::OptTable::Info InfoTable[] = {
#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \
HELPTEXT, METAVAR, VALUES) \
{ \
@@ -149,6 +149,9 @@ static bool COFFLoadConfig;
static bool COFFResources;
static bool COFFTLSDirectory;
+// XCOFF specific options.
+static bool XCOFFAuxiliaryHeader;
+
OutputStyleTy Output = OutputStyleTy::LLVM;
static std::vector<std::string> InputFilenames;
} // namespace opts
@@ -157,7 +160,7 @@ static StringRef ToolName;
namespace llvm {
-LLVM_ATTRIBUTE_NORETURN static void error(Twine Msg) {
+[[noreturn]] static void error(Twine Msg) {
// Flush the standard output to print the error at a
// proper place.
fouts().flush();
@@ -165,7 +168,7 @@ LLVM_ATTRIBUTE_NORETURN static void error(Twine Msg) {
exit(1);
}
-LLVM_ATTRIBUTE_NORETURN void reportError(Error Err, StringRef Input) {
+[[noreturn]] void reportError(Error Err, StringRef Input) {
assert(Err);
if (Input == "-")
Input = "<stdin>";
@@ -268,6 +271,9 @@ static void parseOptions(const opt::InputArgList &Args) {
opts::COFFResources = Args.hasArg(OPT_coff_resources);
opts::COFFTLSDirectory = Args.hasArg(OPT_coff_tls_directory);
+ // XCOFF specific options.
+ opts::XCOFFAuxiliaryHeader = Args.hasArg(OPT_auxiliary_header);
+
opts::InputFilenames = Args.getAllArgValues(OPT_INPUT);
}
@@ -343,6 +349,9 @@ static void dumpObject(ObjectFile &Obj, ScopedPrinter &Writer,
if (opts::FileHeaders)
Dumper->printFileHeaders();
+ if (Obj.isXCOFF() && opts::XCOFFAuxiliaryHeader)
+ Dumper->printAuxiliaryHeader();
+
// This is only used for ELF currently. In some cases, when an object is
// corrupt (e.g. truncated), we can't dump anything except the file header.
if (!ContentErrString.empty())
@@ -577,6 +586,7 @@ int main(int argc, char *argv[]) {
if (opts::All) {
opts::FileHeaders = true;
+ opts::XCOFFAuxiliaryHeader = true;
opts::ProgramHeaders = true;
opts::SectionHeaders = true;
opts::Symbols = true;
@@ -595,6 +605,7 @@ int main(int argc, char *argv[]) {
if (opts::Headers) {
opts::FileHeaders = true;
+ opts::XCOFFAuxiliaryHeader = true;
opts::ProgramHeaders = true;
opts::SectionHeaders = true;
}
diff --git a/llvm/tools/llvm-readobj/llvm-readobj.h b/llvm/tools/llvm-readobj/llvm-readobj.h
index 43d19b4d3f5c..7672da5c0aae 100644
--- a/llvm/tools/llvm-readobj/llvm-readobj.h
+++ b/llvm/tools/llvm-readobj/llvm-readobj.h
@@ -21,7 +21,7 @@ namespace llvm {
}
// Various helper functions.
- LLVM_ATTRIBUTE_NORETURN void reportError(Error Err, StringRef Input);
+ [[noreturn]] void reportError(Error Err, StringRef Input);
void reportWarning(Error Err, StringRef Input);
template <class T> T unwrapOrError(StringRef Input, Expected<T> EO) {
diff --git a/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp b/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp
index f02d8981b30e..21339a3f8f3d 100644
--- a/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp
+++ b/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp
@@ -24,6 +24,7 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Object/SymbolSize.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/DynamicLibrary.h"
@@ -33,7 +34,6 @@
#include "llvm/Support/Memory.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
@@ -206,6 +206,9 @@ public:
uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
unsigned SectionID, StringRef SectionName,
bool IsReadOnly) override;
+ TrivialMemoryManager::TLSSection
+ allocateTLSSection(uintptr_t Size, unsigned Alignment, unsigned SectionID,
+ StringRef SectionName) override;
/// If non null, records subsequent Name -> SectionID mappings.
void setSectionIDsMap(SectionIDMap *SecIDMap) {
@@ -252,7 +255,8 @@ public:
sys::Memory::MF_WRITE,
EC);
if (!MB.base())
- report_fatal_error("Can't allocate enough memory: " + EC.message());
+ report_fatal_error(Twine("Can't allocate enough memory: ") +
+ EC.message());
PreallocSlab = MB;
UsePreallocation = true;
@@ -282,6 +286,9 @@ private:
uintptr_t SlabSize = 0;
uintptr_t CurrentSlabOffset = 0;
SectionIDMap *SecIDMap = nullptr;
+#if defined(__x86_64__) && defined(__ELF__)
+ unsigned UsedTLSStorage = 0;
+#endif
};
uint8_t *TrivialMemoryManager::allocateCodeSection(uintptr_t Size,
@@ -306,7 +313,8 @@ uint8_t *TrivialMemoryManager::allocateCodeSection(uintptr_t Size,
sys::Memory::MF_WRITE,
EC);
if (!MB.base())
- report_fatal_error("MemoryManager allocation failed: " + EC.message());
+ report_fatal_error(Twine("MemoryManager allocation failed: ") +
+ EC.message());
FunctionMemory.push_back(SectionInfo(SectionName, MB, SectionID));
return (uint8_t*)MB.base();
}
@@ -334,11 +342,52 @@ uint8_t *TrivialMemoryManager::allocateDataSection(uintptr_t Size,
sys::Memory::MF_WRITE,
EC);
if (!MB.base())
- report_fatal_error("MemoryManager allocation failed: " + EC.message());
+ report_fatal_error(Twine("MemoryManager allocation failed: ") +
+ EC.message());
DataMemory.push_back(SectionInfo(SectionName, MB, SectionID));
return (uint8_t*)MB.base();
}
+// In case the execution needs TLS storage, we define a very small TLS memory
+// area here that will be used in allocateTLSSection().
+#if defined(__x86_64__) && defined(__ELF__)
+extern "C" {
+alignas(16) __attribute__((visibility("hidden"), tls_model("initial-exec"),
+ used)) thread_local char LLVMRTDyldTLSSpace[16];
+}
+#endif
+
+TrivialMemoryManager::TLSSection
+TrivialMemoryManager::allocateTLSSection(uintptr_t Size, unsigned Alignment,
+ unsigned SectionID,
+ StringRef SectionName) {
+#if defined(__x86_64__) && defined(__ELF__)
+ if (Size + UsedTLSStorage > sizeof(LLVMRTDyldTLSSpace)) {
+ return {};
+ }
+
+ // Get the offset of the TLSSpace in the TLS block by using a tpoff
+ // relocation here.
+ int64_t TLSOffset;
+ asm("leaq LLVMRTDyldTLSSpace@tpoff, %0" : "=r"(TLSOffset));
+
+ TLSSection Section;
+ // We use the storage directly as the initialization image, so any thread
+ // spawned after this allocation will not be initialized correctly. As a
+ // result, llvm-rtdyld only supports TLS in a single thread.
+ Section.InitializationImage =
+ reinterpret_cast<uint8_t *>(LLVMRTDyldTLSSpace + UsedTLSStorage);
+ Section.Offset = TLSOffset + UsedTLSStorage;
+
+ UsedTLSStorage += Size;
+
+ return Section;
+#else
+ return {};
+#endif
+}
+
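
The allocateTLSSection implementation above bump-allocates out of a fixed thread_local buffer and obtains the buffer's TP-relative offset through a tpoff relocation in inline assembly. The sketch below shows only the bump-allocation bookkeeping, with illustrative names and without the architecture-specific asm:

#include <cstdint>

// Fixed thread-local scratch area, mirroring LLVMRTDyldTLSSpace above but
// with an illustrative name and size.
thread_local char TLSScratch[16];
static unsigned UsedTLS = 0; // bytes already handed out

struct TLSSlot {
  uint8_t *InitImage = nullptr; // where the section's init bytes are copied
  unsigned Offset = 0;          // offset within the scratch buffer
};

// Hand out Size bytes from the scratch area; the real manager additionally
// adds the buffer's TP-relative offset so JIT'd code can address the slot.
static TLSSlot allocateTLS(unsigned Size) {
  if (UsedTLS + Size > sizeof(TLSScratch))
    return {}; // out of space: signal failure with an empty slot
  TLSSlot Slot;
  Slot.InitImage = reinterpret_cast<uint8_t *>(TLSScratch + UsedTLS);
  Slot.Offset = UsedTLS;
  UsedTLS += Size;
  return Slot;
}
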
static const char *ProgramName;
static void ErrorAndExit(const Twine &Msg) {
@@ -349,10 +398,10 @@ static void ErrorAndExit(const Twine &Msg) {
static void loadDylibs() {
for (const std::string &Dylib : Dylibs) {
if (!sys::fs::is_regular_file(Dylib))
- report_fatal_error("Dylib not found: '" + Dylib + "'.");
+ report_fatal_error(Twine("Dylib not found: '") + Dylib + "'.");
std::string ErrMsg;
if (sys::DynamicLibrary::LoadLibraryPermanently(Dylib.c_str(), &ErrMsg))
- report_fatal_error("Error loading '" + Dylib + "': " + ErrMsg);
+ report_fatal_error(Twine("Error loading '") + Dylib + "': " + ErrMsg);
}
}
@@ -413,8 +462,9 @@ static int printLineInfoForInput(bool LoadObjects, bool UseDebugObj) {
}
}
- std::unique_ptr<DIContext> Context =
- DWARFContext::create(*SymbolObj, LoadedObjInfo.get());
+ std::unique_ptr<DIContext> Context = DWARFContext::create(
+ *SymbolObj, DWARFContext::ProcessDebugRelocations::Process,
+ LoadedObjInfo.get());
std::vector<std::pair<SymbolRef, uint64_t>> SymAddr =
object::computeSymbolSizes(*SymbolObj);
@@ -710,15 +760,15 @@ static void remapSectionsAndSymbols(const llvm::Triple &TargetTriple,
size_t EqualsIdx = Mapping.find_first_of('=');
if (EqualsIdx == StringRef::npos)
- report_fatal_error("Invalid dummy symbol specification '" + Mapping +
- "'. Should be '<symbol name>=<addr>'");
+ report_fatal_error(Twine("Invalid dummy symbol specification '") +
+ Mapping + "'. Should be '<symbol name>=<addr>'");
std::string Symbol = Mapping.substr(0, EqualsIdx);
std::string AddrStr = Mapping.substr(EqualsIdx + 1);
uint64_t Addr;
if (StringRef(AddrStr).getAsInteger(0, Addr))
- report_fatal_error("Invalid symbol mapping '" + Mapping + "'.");
+ report_fatal_error(Twine("Invalid symbol mapping '") + Mapping + "'.");
MemMgr.addDummySymbol(Symbol, Addr);
}
@@ -974,7 +1024,7 @@ int main(int argc, char **argv) {
Timers = ShowTimes ? std::make_unique<RTDyldTimers>() : nullptr;
- int Result;
+ int Result = 0;
switch (Action) {
case AC_Execute:
Result = executeInput();
diff --git a/llvm/tools/llvm-stress/llvm-stress.cpp b/llvm/tools/llvm-stress/llvm-stress.cpp
index ece322999107..f2be4e7d0712 100644
--- a/llvm/tools/llvm-stress/llvm-stress.cpp
+++ b/llvm/tools/llvm-stress/llvm-stress.cpp
@@ -452,10 +452,10 @@ struct ConstModifier: public Modifier {
switch (getRandom() % 7) {
case 0:
return PT->push_back(ConstantInt::get(
- Ty, APInt::getAllOnesValue(Ty->getPrimitiveSizeInBits())));
+ Ty, APInt::getAllOnes(Ty->getPrimitiveSizeInBits())));
case 1:
- return PT->push_back(ConstantInt::get(
- Ty, APInt::getNullValue(Ty->getPrimitiveSizeInBits())));
+ return PT->push_back(
+ ConstantInt::get(Ty, APInt::getZero(Ty->getPrimitiveSizeInBits())));
case 2:
case 3:
case 4:
diff --git a/llvm/tools/llvm-strings/llvm-strings.cpp b/llvm/tools/llvm-strings/llvm-strings.cpp
index 0b068749917b..26be3914fb92 100644
--- a/llvm/tools/llvm-strings/llvm-strings.cpp
+++ b/llvm/tools/llvm-strings/llvm-strings.cpp
@@ -43,7 +43,7 @@ enum ID {
#include "Opts.inc"
#undef PREFIX
-static const opt::OptTable::Info InfoTable[] = {
+const opt::OptTable::Info InfoTable[] = {
#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \
HELPTEXT, METAVAR, VALUES) \
{ \
@@ -73,7 +73,7 @@ static bool PrintFileName;
enum radix { none, octal, hexadecimal, decimal };
static radix Radix;
-LLVM_ATTRIBUTE_NORETURN static void reportCmdLineError(const Twine &Message) {
+[[noreturn]] static void reportCmdLineError(const Twine &Message) {
WithColor::error(errs(), ToolName) << Message << "\n";
exit(1);
}
diff --git a/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp b/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp
index 227ce12a6d9a..2adbf1f1731d 100644
--- a/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp
+++ b/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp
@@ -52,7 +52,7 @@ enum ID {
#include "Opts.inc"
#undef PREFIX
-static const opt::OptTable::Info InfoTable[] = {
+const opt::OptTable::Info InfoTable[] = {
#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \
HELPTEXT, METAVAR, VALUES) \
{ \
diff --git a/llvm/tools/llvm-tli-checker/Opts.td b/llvm/tools/llvm-tli-checker/Opts.td
new file mode 100644
index 000000000000..b1acef4093c4
--- /dev/null
+++ b/llvm/tools/llvm-tli-checker/Opts.td
@@ -0,0 +1,16 @@
+include "llvm/Option/OptParser.td"
+
+class F<string name, string help> : Flag<["--"], name>, HelpText<help>;
+multiclass Eq<string name, string metavar, string help> {
+ def NAME #_EQ : Joined<["--"], name #"=">,
+ HelpText<help>, MetaVarName<metavar>;
+ def : Separate<["--"], name>, Alias<!cast<Joined>(NAME #_EQ)>;
+}
+
+def help : F<"help", "Display available options">;
+def : Flag<["-"], "h">, HelpText<"Alias for --help">, Alias<help>;
+def dump_tli : F<"dump-tli", "Dump TLI's list of functions and whether they are available">;
+defm triple : Eq<"triple", "<triple>", "Target triple">;
+defm libdir : Eq<"libdir", "<directory>", "Root directory for finding library files">;
+def separate : F<"separate", "Report on each library file separately">;
+def report_EQ : Joined<["--"], "report=">, HelpText<"Level of detail to report">, Values<"summary,discrepancy,full">;
diff --git a/llvm/tools/llvm-tli-checker/llvm-tli-checker.cpp b/llvm/tools/llvm-tli-checker/llvm-tli-checker.cpp
new file mode 100644
index 000000000000..bf25efc0b0bd
--- /dev/null
+++ b/llvm/tools/llvm-tli-checker/llvm-tli-checker.cpp
@@ -0,0 +1,357 @@
+//===-- llvm-tli-checker.cpp - Compare TargetLibraryInfo to SDK libraries -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Demangle/Demangle.h"
+#include "llvm/Object/Archive.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Option/Option.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/WithColor.h"
+
+using namespace llvm;
+using namespace llvm::object;
+
+// Command-line option boilerplate.
+namespace {
+enum ID {
+ OPT_INVALID = 0, // This is not an option ID.
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \
+ HELPTEXT, METAVAR, VALUES) \
+ OPT_##ID,
+#include "Opts.inc"
+#undef OPTION
+};
+
+#define PREFIX(NAME, VALUE) const char *const NAME[] = VALUE;
+#include "Opts.inc"
+#undef PREFIX
+
+const opt::OptTable::Info InfoTable[] = {
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \
+ HELPTEXT, METAVAR, VALUES) \
+ { \
+ PREFIX, NAME, HELPTEXT, \
+ METAVAR, OPT_##ID, opt::Option::KIND##Class, \
+ PARAM, FLAGS, OPT_##GROUP, \
+ OPT_##ALIAS, ALIASARGS, VALUES},
+#include "Opts.inc"
+#undef OPTION
+};
+
+class TLICheckerOptTable : public opt::OptTable {
+public:
+ TLICheckerOptTable() : OptTable(InfoTable) {}
+};
+} // namespace
+
+// We have three levels of reporting.
+enum class ReportKind {
+ Error, // For argument parsing errors.
+ Summary, // Report counts but not details.
+ Discrepancy, // Report where TLI and the library differ.
+ Full // Report for every known-to-TLI function.
+};
+
+// Most of the ObjectFile interfaces return an Expected<T>, so make it easy
+// to ignore those.
+template <typename T> T unwrapIgnoreError(Expected<T> E) {
+ if (E)
+ return std::move(*E);
+ // Sink the error and return a nothing value.
+ consumeError(E.takeError());
+ return T();
+}
+
+static void fail(const Twine &Message) {
+ WithColor::error() << Message << '\n';
+ exit(EXIT_FAILURE);
+}
+
+// Some problem occurred with an archive member; complain and continue.
+static void reportArchiveChildIssue(const object::Archive::Child &C, int Index,
+ StringRef ArchiveFilename) {
+ // First get the member name.
+ std::string ChildName;
+ Expected<StringRef> NameOrErr = C.getName();
+ if (NameOrErr)
+ ChildName = std::string(NameOrErr.get());
+ else {
+ // Ignore the name-fetch error, just report the index.
+ consumeError(NameOrErr.takeError());
+ ChildName = "<file index: " + std::to_string(Index) + ">";
+ }
+
+ WithColor::warning() << ArchiveFilename << "(" << ChildName
+ << "): member is not usable\n";
+}
+
+// Return Name, and if Name is mangled, append "aka" and the demangled name.
+static std::string PrintableName(StringRef Name) {
+ std::string OutputName = "'";
+ OutputName += Name;
+ OutputName += "'";
+ if (Name.startswith("_Z") || Name.startswith("??")) {
+ OutputName += " aka ";
+ OutputName += demangle(Name.str());
+ }
+ return OutputName;
+}
+
+// Store all the names that TargetLibraryInfo knows about; the bool indicates
+// whether TLI has it marked as "available" for the target of interest.
+// This is a vector to preserve the sorted order for better reporting.
+struct TLINameList : std::vector<std::pair<StringRef, bool>> {
+ // Record all the TLI info in the vector.
+ void initialize(StringRef TargetTriple);
+ // Print out what we found.
+ void dump();
+};
+TLINameList TLINames;
+
+void TLINameList::initialize(StringRef TargetTriple) {
+ Triple T(TargetTriple);
+ TargetLibraryInfoImpl TLII(T);
+ TargetLibraryInfo TLI(TLII);
+
+ reserve(LibFunc::NumLibFuncs);
+ size_t NumAvailable = 0;
+ for (unsigned FI = 0; FI != LibFunc::NumLibFuncs; ++FI) {
+ LibFunc LF = (LibFunc)FI;
+ bool Available = TLI.has(LF);
+ // getName returns names only for available funcs.
+ TLII.setAvailable(LF);
+ emplace_back(TLI.getName(LF), Available);
+ if (Available)
+ ++NumAvailable;
+ }
+ outs() << "TLI knows " << LibFunc::NumLibFuncs << " symbols, " << NumAvailable
+ << " available for '" << TargetTriple << "'\n";
+}
+
+void TLINameList::dump() {
+ // Assume this gets called after initialize(), so we have the above line of
+ // output as a header. So, for example, no need to repeat the triple.
+ for (auto &TLIName : TLINames) {
+ outs() << (TLIName.second ? " " : "not ")
+ << "available: " << PrintableName(TLIName.first) << '\n';
+ }
+}
+
+// Store all the exported symbol names we found in the input libraries.
+// We use a map to get hashed lookup speed; the bool is meaningless.
+class SDKNameMap : public StringMap<bool> {
+ void populateFromObject(ObjectFile *O);
+ void populateFromArchive(Archive *A);
+
+public:
+ void populateFromFile(StringRef LibDir, StringRef LibName);
+};
+SDKNameMap SDKNames;
+
+// Given an ObjectFile, extract the global function symbols.
+void SDKNameMap::populateFromObject(ObjectFile *O) {
+ // FIXME: Support COFF.
+ if (!O->isELF()) {
+ WithColor::warning() << "Only ELF-format files are supported\n";
+ return;
+ }
+ auto *ELF = cast<const ELFObjectFileBase>(O);
+
+ for (auto I = ELF->getDynamicSymbolIterators().begin();
+ I != ELF->getDynamicSymbolIterators().end(); ++I) {
+ // We want only global function symbols.
+ SymbolRef::Type Type = unwrapIgnoreError(I->getType());
+ uint32_t Flags = unwrapIgnoreError(I->getFlags());
+ StringRef Name = unwrapIgnoreError(I->getName());
+ if (Type == SymbolRef::ST_Function && (Flags & SymbolRef::SF_Global))
+ insert({Name, true});
+ }
+}
+
+// Unpack an archive and populate from the component object files.
+// This roughly imitates dumpArchive() from llvm-objdump.cpp.
+void SDKNameMap::populateFromArchive(Archive *A) {
+ Error Err = Error::success();
+ int Index = -1;
+ for (auto &C : A->children(Err)) {
+ ++Index;
+ Expected<std::unique_ptr<object::Binary>> ChildOrErr = C.getAsBinary();
+ if (!ChildOrErr) {
+ if (auto E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) {
+ // Issue a generic warning.
+ consumeError(std::move(E));
+ reportArchiveChildIssue(C, Index, A->getFileName());
+ }
+ continue;
+ }
+ if (ObjectFile *O = dyn_cast<ObjectFile>(&*ChildOrErr.get()))
+ populateFromObject(O);
+ // Ignore non-object archive members.
+ }
+ if (Err)
+ WithColor::defaultErrorHandler(std::move(Err));
+}
+
+// Unpack a library file and extract the global function names.
+void SDKNameMap::populateFromFile(StringRef LibDir, StringRef LibName) {
+ // Pick an arbitrary but reasonable default size.
+ SmallString<255> Filepath(LibDir);
+ sys::path::append(Filepath, LibName);
+ if (!sys::fs::exists(Filepath)) {
+ WithColor::warning() << "Could not find '" << StringRef(Filepath) << "'\n";
+ return;
+ }
+ outs() << "\nLooking for symbols in '" << StringRef(Filepath) << "'\n";
+ auto ExpectedBinary = createBinary(Filepath);
+ if (!ExpectedBinary) {
+ // FIXME: Report this better.
+ WithColor::defaultWarningHandler(ExpectedBinary.takeError());
+ return;
+ }
+ OwningBinary<Binary> OBinary = std::move(*ExpectedBinary);
+ Binary &Binary = *OBinary.getBinary();
+ size_t Precount = size();
+ if (Archive *A = dyn_cast<Archive>(&Binary))
+ populateFromArchive(A);
+ else if (ObjectFile *O = dyn_cast<ObjectFile>(&Binary))
+ populateFromObject(O);
+ else {
+ WithColor::warning() << "Not an Archive or ObjectFile: '"
+ << StringRef(Filepath) << "'\n";
+ return;
+ }
+ if (Precount == size())
+ WithColor::warning() << "No symbols found in '" << StringRef(Filepath)
+ << "'\n";
+ else
+ outs() << "Found " << size() - Precount << " global function symbols in '"
+ << StringRef(Filepath) << "'\n";
+}
+
+int main(int argc, char *argv[]) {
+ InitLLVM X(argc, argv);
+ BumpPtrAllocator A;
+ StringSaver Saver(A);
+ TLICheckerOptTable Tbl;
+ opt::InputArgList Args = Tbl.parseArgs(argc, argv, OPT_UNKNOWN, Saver,
+ [&](StringRef Msg) { fail(Msg); });
+
+ if (Args.hasArg(OPT_help)) {
+ std::string Usage(argv[0]);
+ Usage += " [options] library-file [library-file...]";
+ Tbl.printHelp(outs(), Usage.c_str(),
+ "LLVM TargetLibraryInfo versus SDK checker");
+ outs() << "\nPass @FILE as argument to read options or library names from "
+ "FILE.\n";
+ return 0;
+ }
+
+ TLINames.initialize(Args.getLastArgValue(OPT_triple_EQ));
+
+ // --dump-tli doesn't require any input files.
+ if (Args.hasArg(OPT_dump_tli)) {
+ TLINames.dump();
+ return 0;
+ }
+
+ std::vector<std::string> LibList = Args.getAllArgValues(OPT_INPUT);
+ if (LibList.empty()) {
+ WithColor::error() << "No input files\n";
+ exit(EXIT_FAILURE);
+ }
+ StringRef LibDir = Args.getLastArgValue(OPT_libdir_EQ);
+ bool SeparateMode = Args.hasArg(OPT_separate);
+
+ ReportKind ReportLevel =
+ SeparateMode ? ReportKind::Summary : ReportKind::Discrepancy;
+ if (const opt::Arg *A = Args.getLastArg(OPT_report_EQ)) {
+ ReportLevel = StringSwitch<ReportKind>(A->getValue())
+ .Case("summary", ReportKind::Summary)
+ .Case("discrepancy", ReportKind::Discrepancy)
+ .Case("full", ReportKind::Full)
+ .Default(ReportKind::Error);
+ if (ReportLevel == ReportKind::Error) {
+ WithColor::error() << "invalid option for --report: " << A->getValue();
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ for (size_t I = 0; I < LibList.size(); ++I) {
+ // In SeparateMode we report on input libraries individually; otherwise
+ // we do one big combined search. Reading to the end of LibList here
+ // will cause the outer for loop to terminate cleanly.
+ if (SeparateMode) {
+ SDKNames.clear();
+ SDKNames.populateFromFile(LibDir, LibList[I]);
+ if (SDKNames.empty())
+ continue;
+ } else {
+ do
+ SDKNames.populateFromFile(LibDir, LibList[I]);
+ while (++I < LibList.size());
+ if (SDKNames.empty()) {
+ WithColor::error() << "NO symbols found!\n";
+ break;
+ }
+ outs() << "Found a grand total of " << SDKNames.size()
+ << " library symbols\n";
+ }
+ unsigned TLIdoesSDKdoesnt = 0;
+ unsigned TLIdoesntSDKdoes = 0;
+ unsigned TLIandSDKboth = 0;
+ unsigned TLIandSDKneither = 0;
+ for (auto &TLIName : TLINames) {
+ bool TLIHas = TLIName.second;
+ bool SDKHas = SDKNames.count(TLIName.first) == 1;
+ int Which = int(TLIHas) * 2 + int(SDKHas);
+ switch (Which) {
+ case 0: ++TLIandSDKneither; break;
+ case 1: ++TLIdoesntSDKdoes; break;
+ case 2: ++TLIdoesSDKdoesnt; break;
+ case 3: ++TLIandSDKboth; break;
+ }
+ // If the results match, report only if user requested a full report.
+ ReportKind Threshold =
+ TLIHas == SDKHas ? ReportKind::Full : ReportKind::Discrepancy;
+ if (Threshold <= ReportLevel) {
+ constexpr char YesNo[2][4] = {"no ", "yes"};
+ constexpr char Indicator[4][3] = {"!!", ">>", "<<", "=="};
+ outs() << Indicator[Which] << " TLI " << YesNo[TLIHas] << " SDK "
+ << YesNo[SDKHas] << ": " << PrintableName(TLIName.first) << '\n';
+ }
+ }
+
+ assert(TLIandSDKboth + TLIandSDKneither + TLIdoesSDKdoesnt +
+ TLIdoesntSDKdoes ==
+ LibFunc::NumLibFuncs);
+ outs() << "<< Total TLI yes SDK no: " << TLIdoesSDKdoesnt
+ << "\n>> Total TLI no SDK yes: " << TLIdoesntSDKdoes
+ << "\n== Total TLI yes SDK yes: " << TLIandSDKboth;
+ if (TLIandSDKboth == 0) {
+ outs() << " *** NO TLI SYMBOLS FOUND";
+ if (SeparateMode)
+ outs() << " in '" << LibList[I] << "'";
+ }
+ outs() << '\n';
+
+ if (!SeparateMode) {
+ if (TLIdoesSDKdoesnt == 0 && TLIdoesntSDKdoes == 0)
+ outs() << "PASS: LLVM TLI matched SDK libraries successfully.\n";
+ else
+ outs() << "FAIL: LLVM TLI doesn't match SDK libraries.\n";
+ }
+ }
+}
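
The comparison loop above packs the TLI/SDK availability pair into a two-bit index that selects both the counter and the output marker. A tiny worked example of that encoding on its own, outside the tool:

#include <cstdio>

int main() {
  // Index = TLIHas * 2 + SDKHas:
  // 0 -> neither ("!!"), 1 -> SDK only (">>"),
  // 2 -> TLI only ("<<"), 3 -> both ("==").
  const char *Indicator[4] = {"!!", ">>", "<<", "=="};
  for (int TLIHas = 0; TLIHas <= 1; ++TLIHas)
    for (int SDKHas = 0; SDKHas <= 1; ++SDKHas)
      std::printf("TLI %s SDK %s -> %s\n", TLIHas ? "yes" : "no ",
                  SDKHas ? "yes" : "no ", Indicator[TLIHas * 2 + SDKHas]);
  return 0;
}
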
diff --git a/llvm/tools/llvm-xray/xray-color-helper.cpp b/llvm/tools/llvm-xray/xray-color-helper.cpp
index e2cae21e162b..b2ed63881bdc 100644
--- a/llvm/tools/llvm-xray/xray-color-helper.cpp
+++ b/llvm/tools/llvm-xray/xray-color-helper.cpp
@@ -21,7 +21,7 @@ using namespace xray;
// Sequential ColorMaps, which are used to represent information
// from some minimum to some maximum.
-static const std::tuple<uint8_t, uint8_t, uint8_t> SequentialMaps[][9] = {
+const std::tuple<uint8_t, uint8_t, uint8_t> SequentialMaps[][9] = {
{// The greys color scheme from http://colorbrewer2.org/
std::make_tuple(255, 255, 255), std::make_tuple(240, 240, 240),
std::make_tuple(217, 217, 217), std::make_tuple(189, 189, 189),
@@ -42,7 +42,7 @@ static const std::tuple<uint8_t, uint8_t, uint8_t> SequentialMaps[][9] = {
std::make_tuple(2, 56, 88)}};
// Sequential Maps extend the last colors given out of range inputs.
-static const std::tuple<uint8_t, uint8_t, uint8_t> SequentialBounds[][2] = {
+const std::tuple<uint8_t, uint8_t, uint8_t> SequentialBounds[][2] = {
{// The Bounds for the greys color scheme
std::make_tuple(255, 255, 255), std::make_tuple(0, 0, 0)},
{// The Bounds for the OrRd color Scheme
@@ -58,7 +58,7 @@ ColorHelper::ColorHelper(ColorHelper::SequentialScheme S)
// representing differenes, or a range that goes from negative to positive.
// These take an input in the range [-1,1].
-static const std::tuple<uint8_t, uint8_t, uint8_t> DivergingCoeffs[][11] = {
+const std::tuple<uint8_t, uint8_t, uint8_t> DivergingCoeffs[][11] = {
{// The PiYG color scheme from http://colorbrewer2.org/
std::make_tuple(142, 1, 82), std::make_tuple(197, 27, 125),
std::make_tuple(222, 119, 174), std::make_tuple(241, 182, 218),
@@ -69,7 +69,7 @@ static const std::tuple<uint8_t, uint8_t, uint8_t> DivergingCoeffs[][11] = {
// Diverging maps use out of bounds ranges to show missing data. Missing Right
// Being below min, and missing left being above max.
-static const std::tuple<uint8_t, uint8_t, uint8_t> DivergingBounds[][2] = {
+const std::tuple<uint8_t, uint8_t, uint8_t> DivergingBounds[][2] = {
{// The PiYG color scheme has green and red for missing right and left
// respectively.
std::make_tuple(255, 0, 0), std::make_tuple(0, 255, 0)}};
diff --git a/llvm/tools/llvm-xray/xray-converter.cpp b/llvm/tools/llvm-xray/xray-converter.cpp
index 47cb645a5408..82d0261ec4da 100644
--- a/llvm/tools/llvm-xray/xray-converter.cpp
+++ b/llvm/tools/llvm-xray/xray-converter.cpp
@@ -57,6 +57,15 @@ static cl::opt<bool>
cl::init(false), cl::sub(Convert));
static cl::alias ConvertSymbolize2("y", cl::aliasopt(ConvertSymbolize),
cl::desc("Alias for -symbolize"));
+static cl::opt<bool>
+ NoDemangle("no-demangle",
+ cl::desc("determines whether to demangle function name "
+ "when symbolizing function ids from the input log"),
+ cl::init(false), cl::sub(Convert));
+
+static cl::opt<bool> Demangle("demangle",
+ cl::desc("demangle symbols (default)"),
+ cl::sub(Convert));
static cl::opt<std::string>
ConvertInstrMap("instr_map",
@@ -373,7 +382,10 @@ static CommandRegistration Unused(&Convert, []() -> Error {
}
const auto &FunctionAddresses = Map.getFunctionAddresses();
- symbolize::LLVMSymbolizer Symbolizer;
+ symbolize::LLVMSymbolizer::Options SymbolizerOpts;
+ if (Demangle.getPosition() < NoDemangle.getPosition())
+ SymbolizerOpts.Demangle = false;
+ symbolize::LLVMSymbolizer Symbolizer(SymbolizerOpts);
llvm::xray::FuncIdConversionHelper FuncIdHelper(ConvertInstrMap, Symbolizer,
FunctionAddresses);
llvm::xray::TraceConverter TC(FuncIdHelper, ConvertSymbolize);
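The -demangle/-no-demangle pair added above relies on cl::Option::getPosition(), which is 0 for a flag that never appeared and otherwise records where the flag occurred on the command line; comparing the two positions therefore lets the later of the two flags win, with demangling enabled by default. A minimal sketch of that idiom in isolation (the option names mirror the patch; main() and the printed output are illustration-only scaffolding):

    #include "llvm/Support/CommandLine.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static cl::opt<bool> Demangle("demangle", cl::desc("demangle symbols (default)"));
    static cl::opt<bool> NoDemangle("no-demangle", cl::desc("don't demangle symbols"));

    int main(int argc, char **argv) {
      cl::ParseCommandLineOptions(argc, argv);
      // getPosition() is 0 for an unused flag, so this is true only when
      // -no-demangle was given and -demangle did not appear after it.
      bool DisableDemangling = Demangle.getPosition() < NoDemangle.getPosition();
      outs() << (DisableDemangling ? "demangling off\n" : "demangling on\n");
      return 0;
    }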
diff --git a/llvm/tools/llvm-xray/xray-extract.cpp b/llvm/tools/llvm-xray/xray-extract.cpp
index a6ffacc6ab92..52767a00f615 100644
--- a/llvm/tools/llvm-xray/xray-extract.cpp
+++ b/llvm/tools/llvm-xray/xray-extract.cpp
@@ -45,11 +45,12 @@ static cl::opt<bool> ExtractSymbolize("symbolize", cl::value_desc("symbolize"),
cl::sub(Extract));
static cl::alias ExtractSymbolize2("s", cl::aliasopt(ExtractSymbolize),
cl::desc("alias for -symbolize"));
-static cl::opt<bool> ExtractNoDemangle("no-demangle",
- cl::value_desc("no-demangle"),
- cl::init(false),
- cl::desc("don't demangle symbols"),
- cl::sub(Extract));
+static cl::opt<bool> Demangle("demangle",
+ cl::desc("demangle symbols (default)"),
+ cl::sub(Extract));
+static cl::opt<bool> NoDemangle("no-demangle",
+ cl::desc("don't demangle symbols"),
+ cl::sub(Extract));
namespace {
@@ -90,7 +91,7 @@ static CommandRegistration Unused(&Extract, []() -> Error {
const auto &FunctionAddresses =
InstrumentationMapOrError->getFunctionAddresses();
symbolize::LLVMSymbolizer::Options opts;
- if (ExtractNoDemangle)
+ if (Demangle.getPosition() < NoDemangle.getPosition())
opts.Demangle = false;
symbolize::LLVMSymbolizer Symbolizer(opts);
llvm::xray::FuncIdConversionHelper FuncIdHelper(ExtractInput, Symbolizer,
diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp
index 8b1fbd09e40b..631d8eed5d7a 100644
--- a/llvm/tools/opt/NewPMDriver.cpp
+++ b/llvm/tools/opt/NewPMDriver.cpp
@@ -137,6 +137,7 @@ extern cl::opt<std::string> ProfileFile;
extern cl::opt<CSPGOKind> CSPGOKindFlag;
extern cl::opt<std::string> CSProfileGenFile;
extern cl::opt<bool> DisableBasicAA;
+extern cl::opt<bool> PrintPipelinePasses;
} // namespace llvm
static cl::opt<std::string>
@@ -173,58 +174,58 @@ bool tryParsePipelineText(PassBuilder &PB,
static void registerEPCallbacks(PassBuilder &PB) {
if (tryParsePipelineText<FunctionPassManager>(PB, PeepholeEPPipeline))
PB.registerPeepholeEPCallback(
- [&PB](FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
+ [&PB](FunctionPassManager &PM, OptimizationLevel Level) {
ExitOnError Err("Unable to parse PeepholeEP pipeline: ");
Err(PB.parsePassPipeline(PM, PeepholeEPPipeline));
});
if (tryParsePipelineText<LoopPassManager>(PB,
LateLoopOptimizationsEPPipeline))
PB.registerLateLoopOptimizationsEPCallback(
- [&PB](LoopPassManager &PM, PassBuilder::OptimizationLevel Level) {
+ [&PB](LoopPassManager &PM, OptimizationLevel Level) {
ExitOnError Err("Unable to parse LateLoopOptimizationsEP pipeline: ");
Err(PB.parsePassPipeline(PM, LateLoopOptimizationsEPPipeline));
});
if (tryParsePipelineText<LoopPassManager>(PB, LoopOptimizerEndEPPipeline))
PB.registerLoopOptimizerEndEPCallback(
- [&PB](LoopPassManager &PM, PassBuilder::OptimizationLevel Level) {
+ [&PB](LoopPassManager &PM, OptimizationLevel Level) {
ExitOnError Err("Unable to parse LoopOptimizerEndEP pipeline: ");
Err(PB.parsePassPipeline(PM, LoopOptimizerEndEPPipeline));
});
if (tryParsePipelineText<FunctionPassManager>(PB,
ScalarOptimizerLateEPPipeline))
PB.registerScalarOptimizerLateEPCallback(
- [&PB](FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
+ [&PB](FunctionPassManager &PM, OptimizationLevel Level) {
ExitOnError Err("Unable to parse ScalarOptimizerLateEP pipeline: ");
Err(PB.parsePassPipeline(PM, ScalarOptimizerLateEPPipeline));
});
if (tryParsePipelineText<CGSCCPassManager>(PB, CGSCCOptimizerLateEPPipeline))
PB.registerCGSCCOptimizerLateEPCallback(
- [&PB](CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) {
+ [&PB](CGSCCPassManager &PM, OptimizationLevel Level) {
ExitOnError Err("Unable to parse CGSCCOptimizerLateEP pipeline: ");
Err(PB.parsePassPipeline(PM, CGSCCOptimizerLateEPPipeline));
});
if (tryParsePipelineText<FunctionPassManager>(PB, VectorizerStartEPPipeline))
PB.registerVectorizerStartEPCallback(
- [&PB](FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
+ [&PB](FunctionPassManager &PM, OptimizationLevel Level) {
ExitOnError Err("Unable to parse VectorizerStartEP pipeline: ");
Err(PB.parsePassPipeline(PM, VectorizerStartEPPipeline));
});
if (tryParsePipelineText<ModulePassManager>(PB, PipelineStartEPPipeline))
PB.registerPipelineStartEPCallback(
- [&PB](ModulePassManager &PM, PassBuilder::OptimizationLevel) {
+ [&PB](ModulePassManager &PM, OptimizationLevel) {
ExitOnError Err("Unable to parse PipelineStartEP pipeline: ");
Err(PB.parsePassPipeline(PM, PipelineStartEPPipeline));
});
if (tryParsePipelineText<ModulePassManager>(
PB, PipelineEarlySimplificationEPPipeline))
PB.registerPipelineEarlySimplificationEPCallback(
- [&PB](ModulePassManager &PM, PassBuilder::OptimizationLevel) {
+ [&PB](ModulePassManager &PM, OptimizationLevel) {
ExitOnError Err("Unable to parse EarlySimplification pipeline: ");
Err(PB.parsePassPipeline(PM, PipelineEarlySimplificationEPPipeline));
});
if (tryParsePipelineText<FunctionPassManager>(PB, OptimizerLastEPPipeline))
PB.registerOptimizerLastEPCallback(
- [&PB](ModulePassManager &PM, PassBuilder::OptimizationLevel) {
+ [&PB](ModulePassManager &PM, OptimizationLevel) {
ExitOnError Err("Unable to parse OptimizerLastEP pipeline: ");
Err(PB.parsePassPipeline(PM, OptimizerLastEPPipeline));
});
@@ -259,12 +260,9 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
PGOOptions::SampleUse);
break;
case NoPGO:
- if (DebugInfoForProfiling)
+ if (DebugInfoForProfiling || PseudoProbeForProfiling)
P = PGOOptions("", "", "", PGOOptions::NoAction, PGOOptions::NoCSAction,
- true);
- else if (PseudoProbeForProfiling)
- P = PGOOptions("", "", "", PGOOptions::NoAction, PGOOptions::NoCSAction,
- false, true);
+ DebugInfoForProfiling, PseudoProbeForProfiling);
else
P = None;
}
@@ -287,6 +285,9 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
P->CSAction = PGOOptions::CSIRUse;
}
}
+ if (TM)
+ TM->setPGOOption(P);
+
LoopAnalysisManager LAM;
FunctionAnalysisManager FAM;
CGSCCAnalysisManager CGAM;
@@ -339,18 +340,17 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
PB.registerPipelineParsingCallback(
[](StringRef Name, ModulePassManager &MPM,
ArrayRef<PassBuilder::PipelineElement>) {
+ AddressSanitizerOptions Opts;
if (Name == "asan-pipeline") {
MPM.addPass(
RequireAnalysisPass<ASanGlobalsMetadataAnalysis, Module>());
- MPM.addPass(
- createModuleToFunctionPassAdaptor(AddressSanitizerPass()));
- MPM.addPass(ModuleAddressSanitizerPass());
+ MPM.addPass(ModuleAddressSanitizerPass(Opts));
return true;
} else if (Name == "asan-function-pipeline") {
MPM.addPass(
RequireAnalysisPass<ASanGlobalsMetadataAnalysis, Module>());
MPM.addPass(
- createModuleToFunctionPassAdaptor(AddressSanitizerPass()));
+ createModuleToFunctionPassAdaptor(AddressSanitizerPass(Opts)));
return true;
}
return false;
@@ -411,6 +411,7 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
if (EnableDebugify)
MPM.addPass(NewPMDebugifyPass());
+ // Add passes according to the -passes options.
if (!PassPipeline.empty()) {
assert(Passes.empty() &&
"PassPipeline and Passes should not both contain passes");
@@ -419,10 +420,26 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
return false;
}
}
+ // Add passes specified using the legacy PM syntax (i.e. not using
+ // -passes). This should be removed later when such support has been
+ // deprecated, i.e. when all lit tests running opt (and not using
+ // -enable-new-pm=0) have been updated to use -passes.
for (auto PassName : Passes) {
std::string ModifiedPassName(PassName.begin(), PassName.end());
if (PB.isAnalysisPassName(PassName))
ModifiedPassName = "require<" + ModifiedPassName + ">";
+ // FIXME: These translations are supposed to be removed when lit tests that
+ // use these names have been updated to use the -passes syntax (and when the
+ // support for using the old syntax to specify passes is considered
+ // deprecated for the new PM).
+ if (ModifiedPassName == "early-cse-memssa")
+ ModifiedPassName = "early-cse<memssa>";
+ else if (ModifiedPassName == "post-inline-ee-instrument")
+ ModifiedPassName = "ee-instrument<post-inline>";
+ else if (ModifiedPassName == "loop-extract-single")
+ ModifiedPassName = "loop-extract<single>";
+ else if (ModifiedPassName == "lower-matrix-intrinsics-minimal")
+ ModifiedPassName = "lower-matrix-intrinsics<minimal>";
if (auto Err = PB.parsePassPipeline(MPM, ModifiedPassName)) {
errs() << Arg0 << ": " << toString(std::move(Err)) << "\n";
return false;
@@ -455,6 +472,17 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
// Before executing passes, print the final values of the LLVM options.
cl::PrintOptionValues();
+ // Print a textual, '-passes=' compatible, representation of pipeline if
+ // requested.
+ if (PrintPipelinePasses) {
+ MPM.printPipeline(outs(), [&PIC](StringRef ClassName) {
+ auto PassName = PIC.getPassNameForClassName(ClassName);
+ return PassName.empty() ? ClassName : PassName;
+ });
+ outs() << "\n";
+ return true;
+ }
+
// Now that we have all of the passes ready, run them.
MPM.run(M, MAM);
diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
index 094f517fb703..7793a5471793 100644
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -38,6 +38,7 @@
#include "llvm/LinkAllIR.h"
#include "llvm/LinkAllPasses.h"
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/MC/TargetRegistry.h"
#include "llvm/Remarks/HotnessThresholdParser.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
@@ -46,7 +47,6 @@
#include "llvm/Support/PluginLoader.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/SystemUtils.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Support/YAMLTraits.h"
@@ -102,9 +102,6 @@ static cl::opt<bool>
Force("f", cl::desc("Enable binary output on terminals"));
static cl::opt<bool>
-PrintEachXForm("p", cl::desc("Print module after each transformation"));
-
-static cl::opt<bool>
NoOutput("disable-output",
cl::desc("Do not write result bitcode file"), cl::Hidden);
@@ -146,17 +143,7 @@ static cl::opt<bool>
StripNamedMetadata("strip-named-metadata",
cl::desc("Strip module-level named metadata"));
-static cl::opt<bool>
- DisableInline("disable-inlining",
- cl::desc("Do not run the inliner pass (legacy PM only)"));
-
-static cl::opt<bool>
-DisableOptimizations("disable-opt",
- cl::desc("Do not run any optimization passes"));
-static cl::opt<bool> StandardLinkOpts(
- "std-link-opts",
- cl::desc("Include the standard link time optimizations (legacy PM only)"));
static cl::opt<bool>
OptLevelO0("O0", cl::desc("Optimization level 0. Similar to clang -O0. "
@@ -368,9 +355,7 @@ static void AddOptimizationPasses(legacy::PassManagerBase &MPM,
Builder.OptLevel = OptLevel;
Builder.SizeLevel = SizeLevel;
- if (DisableInline) {
- // No inlining pass
- } else if (OptLevel > 1) {
+ if (OptLevel > 1) {
Builder.Inliner = createFunctionInliningPass(OptLevel, SizeLevel, false);
} else {
Builder.Inliner = createAlwaysInlinerLegacyPass();
@@ -418,17 +403,6 @@ static void AddOptimizationPasses(legacy::PassManagerBase &MPM,
Builder.populateModulePassManager(MPM);
}
-static void AddStandardLinkPasses(legacy::PassManagerBase &PM) {
- PassManagerBuilder Builder;
- Builder.VerifyInput = true;
- if (DisableOptimizations)
- Builder.OptLevel = 0;
-
- if (!DisableInline)
- Builder.Inliner = createFunctionInliningPass();
- Builder.populateLTOPassManager(PM);
-}
-
//===----------------------------------------------------------------------===//
// CodeGen-related helper functions.
//
@@ -507,9 +481,10 @@ static bool shouldPinPassToLegacyPM(StringRef Pass) {
return false;
std::vector<StringRef> PassNamePrefix = {
- "x86-", "xcore-", "wasm-", "systemz-", "ppc-", "nvvm-", "nvptx-",
- "mips-", "lanai-", "hexagon-", "bpf-", "avr-", "thumb2-", "arm-",
- "si-", "gcn-", "amdgpu-", "aarch64-", "amdgcn-", "polly-"};
+ "x86-", "xcore-", "wasm-", "systemz-", "ppc-", "nvvm-",
+ "nvptx-", "mips-", "lanai-", "hexagon-", "bpf-", "avr-",
+ "thumb2-", "arm-", "si-", "gcn-", "amdgpu-", "aarch64-",
+ "amdgcn-", "polly-", "riscv-"};
std::vector<StringRef> PassNameContain = {"ehprepare"};
std::vector<StringRef> PassNameExact = {
"safe-stack", "cost-model",
@@ -797,19 +772,32 @@ int main(int argc, char **argv) {
<< "Cannot specify passes via both -foo-pass and --passes=foo-pass\n";
return 1;
}
+ auto NumOLevel = OptLevelO0 + OptLevelO1 + OptLevelO2 + OptLevelO3 +
+ OptLevelOs + OptLevelOz;
+ if (NumOLevel > 1) {
+ errs() << "Cannot specify multiple -O#\n";
+ return 1;
+ }
+ if (NumOLevel > 0 && PassPipeline.getNumOccurrences() > 0) {
+ errs() << "Cannot specify -O# and --passes=, use "
+ "-passes='default<O#>,other-pass'\n";
+ return 1;
+ }
+ std::string Pipeline = PassPipeline;
+
SmallVector<StringRef, 4> Passes;
if (OptLevelO0)
- Passes.push_back("default<O0>");
+ Pipeline = "default<O0>";
if (OptLevelO1)
- Passes.push_back("default<O1>");
+ Pipeline = "default<O1>";
if (OptLevelO2)
- Passes.push_back("default<O2>");
+ Pipeline = "default<O2>";
if (OptLevelO3)
- Passes.push_back("default<O3>");
+ Pipeline = "default<O3>";
if (OptLevelOs)
- Passes.push_back("default<Os>");
+ Pipeline = "default<Os>";
if (OptLevelOz)
- Passes.push_back("default<Oz>");
+ Pipeline = "default<Oz>";
for (const auto &P : PassList)
Passes.push_back(P->getPassArgument());
OutputKind OK = OK_NoOutput;
@@ -828,7 +816,7 @@ int main(int argc, char **argv) {
// string. Hand off the rest of the functionality to the new code for that
// layer.
return runPassPipeline(argv[0], *M, TM.get(), &TLII, Out.get(),
- ThinLinkOut.get(), RemarksFile.get(), PassPipeline,
+ ThinLinkOut.get(), RemarksFile.get(), Pipeline,
Passes, OK, VK, PreserveAssemblyUseListOrder,
PreserveBitcodeUseListOrder, EmitSummaryIndex,
EmitModuleHash, EnableDebugify)
@@ -909,12 +897,6 @@ int main(int argc, char **argv) {
// Create a new optimization pass for each one specified on the command line
for (unsigned i = 0; i < PassList.size(); ++i) {
- if (StandardLinkOpts &&
- StandardLinkOpts.getPosition() < PassList.getPosition(i)) {
- AddStandardLinkPasses(Passes);
- StandardLinkOpts = false;
- }
-
if (OptLevelO0 && OptLevelO0.getPosition() < PassList.getPosition(i)) {
AddOptimizationPasses(Passes, *FPasses, TM.get(), 0, 0);
OptLevelO0 = false;
@@ -976,15 +958,6 @@ int main(int argc, char **argv) {
}
}
}
-
- if (PrintEachXForm)
- Passes.add(
- createPrintModulePass(errs(), "", PreserveAssemblyUseListOrder));
- }
-
- if (StandardLinkOpts) {
- AddStandardLinkPasses(Passes);
- StandardLinkOpts = false;
}
if (OptLevelO0)
diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
index 00bdd127e3c2..be17d5c718c2 100644
--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
@@ -636,6 +636,15 @@ struct MatchableInfo {
if (RequiredFeatures.size() != RHS.RequiredFeatures.size())
return RequiredFeatures.size() > RHS.RequiredFeatures.size();
+ // For X86 AVX/AVX512 instructions, we prefer the VEX encoding because its
+ // encoding size is smaller. Since X86InstrSSE.td is included ahead of
+ // X86InstrAVX512.td, the AVX instruction IDs are less than the AVX512 IDs.
+ // We use the ID to sort AVX instructions before AVX512 instructions in the
+ // matching table.
+ if (TheDef->isSubClassOf("Instruction") &&
+ TheDef->getValueAsBit("HasPositionOrder"))
+ return TheDef->getID() < RHS.TheDef->getID();
+
return false;
}
@@ -1062,7 +1071,7 @@ bool MatchableInfo::validate(StringRef CommentDelimiter, bool IsAlias) const {
// Remove comments from the asm string. We know that the asmstring only
// has one line.
if (!CommentDelimiter.empty() &&
- StringRef(AsmString).find(CommentDelimiter) != StringRef::npos)
+ StringRef(AsmString).contains(CommentDelimiter))
PrintFatalError(TheDef->getLoc(),
"asmstring for instruction has comment character in it, "
"mark it isCodeGenOnly");
@@ -1077,7 +1086,7 @@ bool MatchableInfo::validate(StringRef CommentDelimiter, bool IsAlias) const {
std::set<std::string> OperandNames;
for (const AsmOperand &Op : AsmOperands) {
StringRef Tok = Op.Token;
- if (Tok[0] == '$' && Tok.find(':') != StringRef::npos)
+ if (Tok[0] == '$' && Tok.contains(':'))
PrintFatalError(TheDef->getLoc(),
"matchable with operand modifier '" + Tok +
"' not supported by asm matcher. Mark isCodeGenOnly!");
@@ -3915,8 +3924,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
if (HasDeprecation) {
OS << " std::string Info;\n";
- OS << " if (!getParser().getTargetParser().\n";
- OS << " getTargetOptions().MCNoDeprecatedWarn &&\n";
+ OS << " if (!getParser().getTargetParser().getTargetOptions().MCNoDeprecatedWarn &&\n";
OS << " MII.getDeprecatedInfo(Inst, getSTI(), Info)) {\n";
OS << " SMLoc Loc = ((" << Target.getName()
<< "Operand &)*Operands[0]).getStartLoc();\n";
diff --git a/llvm/utils/TableGen/AsmWriterEmitter.cpp b/llvm/utils/TableGen/AsmWriterEmitter.cpp
index 94fd8f7e92b4..bb13c4033db7 100644
--- a/llvm/utils/TableGen/AsmWriterEmitter.cpp
+++ b/llvm/utils/TableGen/AsmWriterEmitter.cpp
@@ -457,9 +457,14 @@ void AsmWriterEmitter::EmitPrintInstruction(
StringRef ClassName = AsmWriter->getValueAsString("AsmWriterClassName");
bool PassSubtarget = AsmWriter->getValueAsInt("PassSubtarget");
+ // This function has some huge switch statements that cause excessive
+ // compile time in an LLVM profile-instrumented build. This print function
+ // is usually not called frequently during compilation. Here we disable
+ // profile instrumentation for this function.
O << "/// printInstruction - This method is automatically generated by "
"tablegen\n"
"/// from the instruction set description.\n"
+ "LLVM_NO_PROFILE_INSTRUMENT_FUNCTION\n"
"void "
<< Target.getName() << ClassName
<< "::printInstruction(const MCInst *MI, uint64_t Address, "
diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp
index ffc2878d3508..ee77ef5eda5f 100644
--- a/llvm/utils/TableGen/CodeEmitterGen.cpp
+++ b/llvm/utils/TableGen/CodeEmitterGen.cpp
@@ -515,7 +515,7 @@ void CodeEmitterGen::run(raw_ostream &o) {
<< " std::string msg;\n"
<< " raw_string_ostream Msg(msg);\n"
<< " Msg << \"Not supported instr: \" << MI;\n"
- << " report_fatal_error(Msg.str());\n"
+ << " report_fatal_error(Msg.str().c_str());\n"
<< " }\n";
if (UseAPInt)
o << " Inst = Value;\n";
@@ -638,7 +638,7 @@ void CodeEmitterGen::run(raw_ostream &o) {
<< " if (MissingFeatures.test(i))\n"
<< " Msg << SubtargetFeatureNames[i] << \" \";\n"
<< " Msg << \"predicate(s) are not met\";\n"
- << " report_fatal_error(Msg.str());\n"
+ << " report_fatal_error(Msg.str().c_str());\n"
<< " }\n"
<< "#else\n"
<< " // Silence unused variable warning on targets that don't use MCII for "
diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp
index c1a3a34d928b..4a247050ceeb 100644
--- a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -451,13 +451,16 @@ static Iter max_if(Iter B, Iter E, Pred P, Less L) {
}
/// Make sure that for each type in Small, there exists a larger type in Big.
-bool TypeInfer::EnforceSmallerThan(TypeSetByHwMode &Small,
- TypeSetByHwMode &Big) {
+bool TypeInfer::EnforceSmallerThan(TypeSetByHwMode &Small, TypeSetByHwMode &Big,
+ bool SmallIsVT) {
ValidateOnExit _1(Small, *this), _2(Big, *this);
if (TP.hasError())
return false;
bool Changed = false;
+ assert((!SmallIsVT || !Small.empty()) &&
+ "Small should not be empty for SDTCisVTSmallerThanOp");
+
if (Small.empty())
Changed |= EnforceAny(Small);
if (Big.empty())
@@ -476,7 +479,9 @@ bool TypeInfer::EnforceSmallerThan(TypeSetByHwMode &Small,
TypeSetByHwMode::SetType &S = Small.get(M);
TypeSetByHwMode::SetType &B = Big.get(M);
- if (any_of(S, isIntegerOrPtr) && any_of(S, isIntegerOrPtr)) {
+ assert((!SmallIsVT || !S.empty()) && "Expected non-empty type");
+
+ if (any_of(S, isIntegerOrPtr) && any_of(B, isIntegerOrPtr)) {
auto NotInt = [](MVT VT) { return !isIntegerOrPtr(VT); };
Changed |= berase_if(S, NotInt);
Changed |= berase_if(B, NotInt);
@@ -484,6 +489,11 @@ bool TypeInfer::EnforceSmallerThan(TypeSetByHwMode &Small,
auto NotFP = [](MVT VT) { return !isFloatingPoint(VT); };
Changed |= berase_if(S, NotFP);
Changed |= berase_if(B, NotFP);
+ } else if (SmallIsVT && B.empty()) {
+ // B is empty and since S is a specific VT, it will never be empty. Don't
+ // report this as a change, just clear S and continue. This prevents an
+ // infinite loop.
+ S.clear();
} else if (S.empty() || B.empty()) {
Changed = !S.empty() || !B.empty();
S.clear();
@@ -1612,20 +1622,22 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N,
unsigned OResNo = 0;
TreePatternNode *OtherNode =
getOperandNum(x.SDTCisSameAs_Info.OtherOperandNum, N, NodeInfo, OResNo);
- return NodeToApply->UpdateNodeType(ResNo, OtherNode->getExtType(OResNo),TP)|
- OtherNode->UpdateNodeType(OResNo,NodeToApply->getExtType(ResNo),TP);
+ return (int)NodeToApply->UpdateNodeType(ResNo,
+ OtherNode->getExtType(OResNo), TP) |
+ (int)OtherNode->UpdateNodeType(OResNo,
+ NodeToApply->getExtType(ResNo), TP);
}
case SDTCisVTSmallerThanOp: {
// The NodeToApply must be a leaf node that is a VT. OtherOperandNum must
// have an integer type that is smaller than the VT.
if (!NodeToApply->isLeaf() ||
!isa<DefInit>(NodeToApply->getLeafValue()) ||
- !static_cast<DefInit*>(NodeToApply->getLeafValue())->getDef()
+ !cast<DefInit>(NodeToApply->getLeafValue())->getDef()
->isSubClassOf("ValueType")) {
TP.error(N->getOperator()->getName() + " expects a VT operand!");
return false;
}
- DefInit *DI = static_cast<DefInit*>(NodeToApply->getLeafValue());
+ DefInit *DI = cast<DefInit>(NodeToApply->getLeafValue());
const CodeGenTarget &T = TP.getDAGPatterns().getTargetInfo();
auto VVT = getValueTypeByHwMode(DI->getDef(), T.getHwModes());
TypeSetByHwMode TypeListTmp(VVT);
@@ -1635,7 +1647,8 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N,
getOperandNum(x.SDTCisVTSmallerThanOp_Info.OtherOperandNum, N, NodeInfo,
OResNo);
- return TI.EnforceSmallerThan(TypeListTmp, OtherNode->getExtType(OResNo));
+ return TI.EnforceSmallerThan(TypeListTmp, OtherNode->getExtType(OResNo),
+ /*SmallIsVT*/ true);
}
case SDTCisOpSmallerThanOp: {
unsigned BResNo = 0;
@@ -3819,7 +3832,7 @@ void CodeGenDAGPatterns::parseInstructionPattern(
InstInputs.erase(OpName); // It occurred, remove from map.
if (InVal->isLeaf() && isa<DefInit>(InVal->getLeafValue())) {
- Record *InRec = static_cast<DefInit*>(InVal->getLeafValue())->getDef();
+ Record *InRec = cast<DefInit>(InVal->getLeafValue())->getDef();
if (!checkOperandClass(Op, InRec))
I.error("Operand $" + OpName + "'s register class disagrees"
" between the operand and pattern");
diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.h b/llvm/utils/TableGen/CodeGenDAGPatterns.h
index a69f1e2e3030..39d81230a4f2 100644
--- a/llvm/utils/TableGen/CodeGenDAGPatterns.h
+++ b/llvm/utils/TableGen/CodeGenDAGPatterns.h
@@ -298,8 +298,11 @@ struct TypeInfer {
/// unchanged.
bool EnforceAny(TypeSetByHwMode &Out);
/// Make sure that for each type in \p Small, there exists a larger type
- /// in \p Big.
- bool EnforceSmallerThan(TypeSetByHwMode &Small, TypeSetByHwMode &Big);
+ /// in \p Big. \p SmallIsVT indicates that this is being called for
+ /// SDTCisVTSmallerThanOp. In that case the TypeSetByHwMode is re-created for
+ /// each call and needs special consideration in how we detect changes.
+ bool EnforceSmallerThan(TypeSetByHwMode &Small, TypeSetByHwMode &Big,
+ bool SmallIsVT = false);
/// 1. Ensure that for each type T in \p Vec, T is a vector type, and that
/// for each type U in \p Elem, U is a scalar type.
/// 2. Ensure that for each (scalar) type U in \p Elem, there exists a
diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp
index 6f718acbac3e..38871eb8cf3c 100644
--- a/llvm/utils/TableGen/CodeGenMapTable.cpp
+++ b/llvm/utils/TableGen/CodeGenMapTable.cpp
@@ -5,7 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// CodeGenMapTable provides functionality for the TabelGen to create
+// CodeGenMapTable provides functionality for the TableGen to create
// relation mapping between instructions. Relation models are defined using
// InstrMapping as a base class. This file implements the functionality which
// parses these definitions and generates relation maps using the information
@@ -443,14 +443,16 @@ void MapTableEmitter::emitMapFuncBody(raw_ostream &OS,
if (ValueCols.size() > 1) {
for (unsigned i = 0, e = ValueCols.size(); i < e; i++) {
ListInit *ColumnI = ValueCols[i];
+ OS << " if (";
for (unsigned j = 0, ColSize = ColumnI->size(); j < ColSize; ++j) {
std::string ColName = ColFields->getElement(j)->getAsUnquotedString();
- OS << " if (in" << ColName;
+ OS << "in" << ColName;
OS << " == ";
OS << ColName << "_" << ColumnI->getElement(j)->getAsUnquotedString();
- if (j < ColumnI->size() - 1) OS << " && ";
- else OS << ")\n";
+ if (j < ColumnI->size() - 1)
+ OS << " && ";
}
+ OS << ")\n";
OS << " return " << InstrMapDesc.getName();
OS << "Table[mid]["<<i+1<<"];\n";
}
@@ -480,9 +482,10 @@ void MapTableEmitter::emitTablesWithFunc(raw_ostream &OS) {
if (ValueCols.size() > 1) {
for (Init *CF : ColFields->getValues()) {
std::string ColName = CF->getAsUnquotedString();
- OS << ", enum " << ColName << " in" << ColName << ") {\n";
+ OS << ", enum " << ColName << " in" << ColName;
}
- } else { OS << ") {\n"; }
+ }
+ OS << ") {\n";
// Emit map table.
unsigned TableSize = emitBinSearchTable(OS);
diff --git a/llvm/utils/TableGen/CodeGenRegisters.cpp b/llvm/utils/TableGen/CodeGenRegisters.cpp
index 930b7742103e..afaeb73ffab1 100644
--- a/llvm/utils/TableGen/CodeGenRegisters.cpp
+++ b/llvm/utils/TableGen/CodeGenRegisters.cpp
@@ -734,7 +734,7 @@ static void sortAndUniqueRegisters(CodeGenRegister::Vec &M) {
CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
: TheDef(R), Name(std::string(R->getName())),
- TopoSigs(RegBank.getNumTopoSigs()), EnumValue(-1) {
+ TopoSigs(RegBank.getNumTopoSigs()), EnumValue(-1), TSFlags(0) {
GeneratePressureSet = R->getValueAsBit("GeneratePressureSet");
std::vector<Record*> TypeList = R->getValueAsListOfDefs("RegTypes");
if (TypeList.empty())
@@ -802,6 +802,12 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
if (AllocationPriority < 0 || AllocationPriority > 63)
PrintFatalError(R->getLoc(), "AllocationPriority out of range [0,63]");
this->AllocationPriority = AllocationPriority;
+
+ BitsInit *TSF = R->getValueAsBitsInit("TSFlags");
+ for (unsigned I = 0, E = TSF->getNumBits(); I != E; ++I) {
+ BitInit *Bit = cast<BitInit>(TSF->getBit(I));
+ TSFlags |= uint8_t(Bit->getValue()) << I;
+ }
}
// Create an inferred register class that was missing from the .td files.
@@ -811,7 +817,7 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank,
StringRef Name, Key Props)
: Members(*Props.Members), TheDef(nullptr), Name(std::string(Name)),
TopoSigs(RegBank.getNumTopoSigs()), EnumValue(-1), RSI(Props.RSI),
- CopyCost(0), Allocatable(true), AllocationPriority(0) {
+ CopyCost(0), Allocatable(true), AllocationPriority(0), TSFlags(0) {
Artificial = true;
GeneratePressureSet = false;
for (const auto R : Members) {
@@ -839,6 +845,7 @@ void CodeGenRegisterClass::inheritProperties(CodeGenRegBank &RegBank) {
});
AltOrderSelect = Super.AltOrderSelect;
AllocationPriority = Super.AllocationPriority;
+ TSFlags = Super.TSFlags;
GeneratePressureSet |= Super.GeneratePressureSet;
// Copy all allocation orders, filter out foreign registers from the larger
@@ -1617,9 +1624,9 @@ static void computeUberSets(std::vector<UberRegSet> &UberSets,
assert(USetID && "register number 0 is invalid");
AllocatableRegs.insert((*Regs.begin())->EnumValue);
- for (auto I = std::next(Regs.begin()), E = Regs.end(); I != E; ++I) {
- AllocatableRegs.insert((*I)->EnumValue);
- UberSetIDs.join(USetID, (*I)->EnumValue);
+ for (const CodeGenRegister *CGR : llvm::drop_begin(Regs)) {
+ AllocatableRegs.insert(CGR->EnumValue);
+ UberSetIDs.join(USetID, CGR->EnumValue);
}
}
// Combine non-allocatable regs.
@@ -1908,6 +1915,9 @@ void CodeGenRegBank::computeRegUnitSets() {
RegUnitSets.pop_back();
}
+ if (RegUnitSets.empty())
+ PrintFatalError("RegUnitSets cannot be empty!");
+
LLVM_DEBUG(dbgs() << "\nBefore pruning:\n"; for (unsigned USIdx = 0,
USEnd = RegUnitSets.size();
USIdx < USEnd; ++USIdx) {
@@ -2018,7 +2028,8 @@ void CodeGenRegBank::computeRegUnitSets() {
}
}
LLVM_DEBUG(dbgs() << "\n");
- assert(!RegClassUnitSets[RCIdx].empty() && "missing unit set for regclass");
+ assert((!RegClassUnitSets[RCIdx].empty() || !RC.GeneratePressureSet) &&
+ "missing unit set for regclass");
}
// For each register unit, ensure that we have the list of UnitSets that
diff --git a/llvm/utils/TableGen/CodeGenRegisters.h b/llvm/utils/TableGen/CodeGenRegisters.h
index 6a0696011a40..c9fcf83b0a8a 100644
--- a/llvm/utils/TableGen/CodeGenRegisters.h
+++ b/llvm/utils/TableGen/CodeGenRegisters.h
@@ -332,6 +332,7 @@ namespace llvm {
bool Allocatable;
StringRef AltOrderSelect;
uint8_t AllocationPriority;
+ uint8_t TSFlags;
/// Contains the combination of the lane masks of all subregisters.
LaneBitmask LaneMask;
/// True if there are at least 2 subregisters which do not interfere.
diff --git a/llvm/utils/TableGen/CodeGenTarget.cpp b/llvm/utils/TableGen/CodeGenTarget.cpp
index 7311819f77ff..137f99078faf 100644
--- a/llvm/utils/TableGen/CodeGenTarget.cpp
+++ b/llvm/utils/TableGen/CodeGenTarget.cpp
@@ -77,6 +77,7 @@ StringRef llvm::getEnumName(MVT::SimpleValueType T) {
case MVT::ppcf128: return "MVT::ppcf128";
case MVT::x86mmx: return "MVT::x86mmx";
case MVT::x86amx: return "MVT::x86amx";
+ case MVT::i64x8: return "MVT::i64x8";
case MVT::Glue: return "MVT::Glue";
case MVT::isVoid: return "MVT::isVoid";
case MVT::v1i1: return "MVT::v1i1";
diff --git a/llvm/utils/TableGen/RISCVCompressInstEmitter.cpp b/llvm/utils/TableGen/CompressInstEmitter.cpp
index e931801f82a4..94ad6ee285d4 100644
--- a/llvm/utils/TableGen/RISCVCompressInstEmitter.cpp
+++ b/llvm/utils/TableGen/CompressInstEmitter.cpp
@@ -1,17 +1,17 @@
-//===- RISCVCompressInstEmitter.cpp - Generator for RISCV Compression -===//
+//===-------- CompressInstEmitter.cpp - Generator for Compression ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
-// RISCVCompressInstEmitter implements a tablegen-driven CompressPat based
-// RISCV Instruction Compression mechanism.
+// CompressInstEmitter implements a tablegen-driven CompressPat based
+// Instruction Compression mechanism.
//
-//===--------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
//
-// RISCVCompressInstEmitter implements a tablegen-driven CompressPat Instruction
-// Compression mechanism for generating RISCV compressed instructions
-// (C ISA Extension) from the expanded instruction form.
+// CompressInstEmitter implements a tablegen-driven CompressPat Instruction
+// Compression mechanism for generating compressed instructions from the
+// expanded instruction form.
// This tablegen backend processes CompressPat declarations in a
// td file and generates all the required checks to validate the pattern
@@ -21,10 +21,18 @@
// immediate inputs.
//
// Example:
-// class CompressPat<dag input, dag output> {
+// /// Defines a Pat match between compressed and uncompressed instruction.
+// /// The relationship and helper function generation are handled by
+// /// CompressInstEmitter backend.
+// class CompressPat<dag input, dag output, list<Predicate> predicates = []> {
+// /// Uncompressed instruction description.
// dag Input = input;
-// dag Output = output;
-// list<Predicate> Predicates = [];
+// /// Compressed instruction description.
+// dag Output = output;
+// /// Predicates that must be true for this to match.
+// list<Predicate> Predicates = predicates;
+// /// Duplicate match when tied operand is just different.
+// bit isCompressOnly = false;
// }
//
// let Predicates = [HasStdExtC] in {
@@ -32,10 +40,9 @@
// (C_ADD GPRNoX0:$rs1, GPRNoX0:$rs2)>;
// }
//
-// The result is an auto-generated header file
-// 'RISCVGenCompressInstEmitter.inc' which exports two functions for
-// compressing/uncompressing MCInst instructions, plus
-// some helper functions:
+// The <TargetName>GenCompressInstEmitter.inc is an auto-generated header
+// file which exports two functions for compressing/uncompressing MCInst
+// instructions, plus some helper functions:
//
// bool compressInst(MCInst &OutInst, const MCInst &MI,
// const MCSubtargetInfo &STI,
@@ -49,9 +56,9 @@
// an instruction is compressable:
//
// bool isCompressibleInst(const MachineInstr& MI,
-// const RISCVSubtarget *Subtarget,
-// const MCRegisterInfo &MRI,
-// const MCSubtargetInfo &STI);
+// const <TargetName>Subtarget *Subtarget,
+// const MCRegisterInfo &MRI,
+// const MCSubtargetInfo &STI);
//
// The clients that include this auto-generated header file and
// invoke these functions can compress an instruction before emitting
@@ -79,28 +86,35 @@ using namespace llvm;
#define DEBUG_TYPE "compress-inst-emitter"
namespace {
-class RISCVCompressInstEmitter {
+class CompressInstEmitter {
struct OpData {
enum MapKind { Operand, Imm, Reg };
MapKind Kind;
union {
- unsigned Operand; // Operand number mapped to.
- int64_t Imm; // Integer immediate value.
- Record *Reg; // Physical register.
+ // Operand number mapped to.
+ unsigned Operand;
+ // Integer immediate value.
+ int64_t Imm;
+ // Physical register.
+ Record *Reg;
} Data;
- int TiedOpIdx = -1; // Tied operand index within the instruction.
+ // Tied operand index within the instruction.
+ int TiedOpIdx = -1;
};
struct CompressPat {
- CodeGenInstruction Source; // The source instruction definition.
- CodeGenInstruction Dest; // The destination instruction to transform to.
- std::vector<Record *>
- PatReqFeatures; // Required target features to enable pattern.
- IndexedMap<OpData>
- SourceOperandMap; // Maps operands in the Source Instruction to
- // the corresponding Dest instruction operand.
- IndexedMap<OpData>
- DestOperandMap; // Maps operands in the Dest Instruction
- // to the corresponding Source instruction operand.
+ // The source instruction definition.
+ CodeGenInstruction Source;
+ // The destination instruction to transform to.
+ CodeGenInstruction Dest;
+ // Required target features to enable pattern.
+ std::vector<Record *> PatReqFeatures;
+ // Maps operands in the Source Instruction to
+ // the corresponding Dest instruction operand.
+ IndexedMap<OpData> SourceOperandMap;
+ // Maps operands in the Dest Instruction
+ // to the corresponding Source instruction operand.
+ IndexedMap<OpData> DestOperandMap;
+
bool IsCompressOnly;
CompressPat(CodeGenInstruction &S, CodeGenInstruction &D,
std::vector<Record *> RF, IndexedMap<OpData> &SourceMap,
@@ -132,13 +146,13 @@ class RISCVCompressInstEmitter {
CodeGenInstruction &DestInst);
public:
- RISCVCompressInstEmitter(RecordKeeper &R) : Records(R), Target(R) {}
+ CompressInstEmitter(RecordKeeper &R) : Records(R), Target(R) {}
void run(raw_ostream &o);
};
} // End anonymous namespace.
-bool RISCVCompressInstEmitter::validateRegister(Record *Reg, Record *RegClass) {
+bool CompressInstEmitter::validateRegister(Record *Reg, Record *RegClass) {
assert(Reg->isSubClassOf("Register") && "Reg record should be a Register");
assert(RegClass->isSubClassOf("RegisterClass") &&
"RegClass record should be a RegisterClass");
@@ -148,9 +162,8 @@ bool RISCVCompressInstEmitter::validateRegister(Record *Reg, Record *RegClass) {
return RC.contains(R);
}
-bool RISCVCompressInstEmitter::validateTypes(Record *DagOpType,
- Record *InstOpType,
- bool IsSourceInst) {
+bool CompressInstEmitter::validateTypes(Record *DagOpType, Record *InstOpType,
+ bool IsSourceInst) {
if (DagOpType == InstOpType)
return true;
// Only source instruction operands are allowed to not match Input Dag
@@ -187,9 +200,10 @@ bool RISCVCompressInstEmitter::validateTypes(Record *DagOpType,
/// operands and fixed registers it expects the Dag operand type to be contained
/// in the instantiated instruction operand type. For immediate operands and
/// immediates no validation checks are enforced at pattern validation time.
-void RISCVCompressInstEmitter::addDagOperandMapping(
- Record *Rec, DagInit *Dag, CodeGenInstruction &Inst,
- IndexedMap<OpData> &OperandMap, bool IsSourceInst) {
+void CompressInstEmitter::addDagOperandMapping(Record *Rec, DagInit *Dag,
+ CodeGenInstruction &Inst,
+ IndexedMap<OpData> &OperandMap,
+ bool IsSourceInst) {
// TiedCount keeps track of the number of operands skipped in Inst
// operands list to get to the corresponding Dag operand. This is
// necessary because the number of operands in Inst might be greater
@@ -293,7 +307,7 @@ static bool validateArgsTypes(Init *Arg1, Init *Arg2) {
// name have the same types. For example in 'C_ADD $rs1, $rs2' we generate the
// mapping $rs1 --> 0, $rs2 ---> 1. If the operand appears twice in the (tied)
// same Dag we use the last occurrence for indexing.
-void RISCVCompressInstEmitter::createDagOperandMapping(
+void CompressInstEmitter::createDagOperandMapping(
Record *Rec, StringMap<unsigned> &SourceOperands,
StringMap<unsigned> &DestOperands, DagInit *SourceDag, DagInit *DestDag,
IndexedMap<OpData> &SourceOperandMap) {
@@ -340,7 +354,7 @@ void RISCVCompressInstEmitter::createDagOperandMapping(
/// Map operand names in the Dag to their index in both corresponding input and
/// output instructions. Validate that operands defined in the input are
/// used in the output pattern while populating the maps.
-void RISCVCompressInstEmitter::createInstOperandMapping(
+void CompressInstEmitter::createInstOperandMapping(
Record *Rec, DagInit *SourceDag, DagInit *DestDag,
IndexedMap<OpData> &SourceOperandMap, IndexedMap<OpData> &DestOperandMap,
StringMap<unsigned> &SourceOperands, CodeGenInstruction &DestInst) {
@@ -409,7 +423,7 @@ void RISCVCompressInstEmitter::createInstOperandMapping(
/// and generate warning.
/// - Immediate operand type in Dag Input differs from the corresponding Source
/// Instruction type and generate a warning.
-void RISCVCompressInstEmitter::evaluateCompressPat(Record *Rec) {
+void CompressInstEmitter::evaluateCompressPat(Record *Rec) {
// Validate input Dag operands.
DagInit *SourceDag = Rec->getValueAsDag("Input");
assert(SourceDag && "Missing 'Input' in compress pattern!");
@@ -417,9 +431,6 @@ void RISCVCompressInstEmitter::evaluateCompressPat(Record *Rec) {
// Checking we are transforming from compressed to uncompressed instructions.
Record *Operator = SourceDag->getOperatorAsDef(Rec->getLoc());
- if (!Operator->isSubClassOf("RVInst"))
- PrintFatalError(Rec->getLoc(), "Input instruction '" + Operator->getName() +
- "' is not a 32 bit wide instruction!");
CodeGenInstruction SourceInst(Operator);
verifyDagOpCount(SourceInst, SourceDag, true);
@@ -429,13 +440,16 @@ void RISCVCompressInstEmitter::evaluateCompressPat(Record *Rec) {
LLVM_DEBUG(dbgs() << "Output: " << *DestDag << "\n");
Record *DestOperator = DestDag->getOperatorAsDef(Rec->getLoc());
- if (!DestOperator->isSubClassOf("RVInst16"))
- PrintFatalError(Rec->getLoc(), "Output instruction '" +
- DestOperator->getName() +
- "' is not a 16 bit wide instruction!");
CodeGenInstruction DestInst(DestOperator);
verifyDagOpCount(DestInst, DestDag, false);
+ if (Operator->getValueAsInt("Size") <= DestOperator->getValueAsInt("Size"))
+ PrintFatalError(
+ Rec->getLoc(),
+ "Compressed instruction '" + DestOperator->getName() +
+ "'is not strictly smaller than the uncompressed instruction '" +
+ Operator->getName() + "' !");
+
// Fill the mapping from the source to destination instructions.
IndexedMap<OpData> SourceOperandMap;
@@ -548,15 +562,15 @@ static void mergeCondAndCode(raw_ostream &CombinedStream, StringRef CondStr,
CombinedStream.indent(4) << "} // if\n";
}
-void RISCVCompressInstEmitter::emitCompressInstEmitter(raw_ostream &o,
- EmitterType EType) {
+void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &o,
+ EmitterType EType) {
Record *AsmWriter = Target.getAsmWriter();
if (!AsmWriter->getValueAsInt("PassSubtarget"))
PrintFatalError(AsmWriter->getLoc(),
"'PassSubtarget' is false. SubTargetInfo object is needed "
"for target features.\n");
- StringRef Namespace = Target.getName();
+ StringRef TargetName = Target.getName();
// Sort entries in CompressPatterns to handle instructions that can have more
// than one candidate for compression\uncompression, e.g ADD can be
@@ -599,14 +613,14 @@ void RISCVCompressInstEmitter::emitCompressInstEmitter(raw_ostream &o,
FuncH.indent(25) << "const MCInst &MI,\n";
FuncH.indent(25) << "const MCSubtargetInfo &STI,\n";
FuncH.indent(25) << "MCContext &Context) {\n";
- } else if (EType == EmitterType::Uncompress){
+ } else if (EType == EmitterType::Uncompress) {
FuncH << "static bool uncompressInst(MCInst &OutInst,\n";
FuncH.indent(27) << "const MCInst &MI,\n";
FuncH.indent(27) << "const MCRegisterInfo &MRI,\n";
FuncH.indent(27) << "const MCSubtargetInfo &STI) {\n";
} else if (EType == EmitterType::CheckCompress) {
FuncH << "static bool isCompressibleInst(const MachineInstr &MI,\n";
- FuncH.indent(27) << "const RISCVSubtarget *Subtarget,\n";
+ FuncH.indent(27) << "const " << TargetName << "Subtarget *Subtarget,\n";
FuncH.indent(27) << "const MCRegisterInfo &MRI,\n";
FuncH.indent(27) << "const MCSubtargetInfo &STI) {\n";
}
@@ -631,9 +645,9 @@ void RISCVCompressInstEmitter::emitCompressInstEmitter(raw_ostream &o,
CaseStream << " default: return false;\n";
bool CompressOrCheck =
- EType == EmitterType::Compress || EType == EmitterType::CheckCompress;
+ EType == EmitterType::Compress || EType == EmitterType::CheckCompress;
bool CompressOrUncompress =
- EType == EmitterType::Compress || EType == EmitterType::Uncompress;
+ EType == EmitterType::Compress || EType == EmitterType::Uncompress;
for (auto &CompressPat : CompressPatterns) {
if (EType == EmitterType::Uncompress && CompressPat.IsCompressOnly)
@@ -644,20 +658,22 @@ void RISCVCompressInstEmitter::emitCompressInstEmitter(raw_ostream &o,
raw_string_ostream CondStream(CondString);
raw_string_ostream CodeStream(CodeString);
CodeGenInstruction &Source =
- CompressOrCheck ? CompressPat.Source : CompressPat.Dest;
+ CompressOrCheck ? CompressPat.Source : CompressPat.Dest;
CodeGenInstruction &Dest =
CompressOrCheck ? CompressPat.Dest : CompressPat.Source;
- IndexedMap<OpData> SourceOperandMap = CompressOrCheck ?
- CompressPat.SourceOperandMap : CompressPat.DestOperandMap;
- IndexedMap<OpData> &DestOperandMap = CompressOrCheck ?
- CompressPat.DestOperandMap : CompressPat.SourceOperandMap;
+ IndexedMap<OpData> SourceOperandMap = CompressOrCheck
+ ? CompressPat.SourceOperandMap
+ : CompressPat.DestOperandMap;
+ IndexedMap<OpData> &DestOperandMap = CompressOrCheck
+ ? CompressPat.DestOperandMap
+ : CompressPat.SourceOperandMap;
CurOp = Source.TheDef->getName();
// Check current and previous opcode to decide to continue or end a case.
if (CurOp != PrevOp) {
if (!PrevOp.empty())
CaseStream.indent(6) << "break;\n } // case " + PrevOp + "\n";
- CaseStream.indent(4) << "case " + Namespace + "::" + CurOp + ": {\n";
+ CaseStream.indent(4) << "case " + TargetName + "::" + CurOp + ": {\n";
}
std::set<std::pair<bool, StringRef>> FeaturesSet;
@@ -676,7 +692,7 @@ void RISCVCompressInstEmitter::emitCompressInstEmitter(raw_ostream &o,
// Emit checks for all required features.
for (auto &Op : FeaturesSet) {
StringRef Not = Op.first ? "!" : "";
- CondStream.indent(6) << Not << "STI.getFeatureBits()[" << Namespace
+ CondStream.indent(6) << Not << "STI.getFeatureBits()[" << TargetName
<< "::" << Op.second << "]"
<< " &&\n";
}
@@ -687,7 +703,7 @@ void RISCVCompressInstEmitter::emitCompressInstEmitter(raw_ostream &o,
for (auto &Op : Set) {
bool isLast = &Op == &*Set.rbegin();
StringRef Not = Op.first ? "!" : "";
- CondStream << Not << "STI.getFeatureBits()[" << Namespace
+ CondStream << Not << "STI.getFeatureBits()[" << TargetName
<< "::" << Op.second << "]";
if (!isLast)
CondStream << " || ";
@@ -720,7 +736,7 @@ void RISCVCompressInstEmitter::emitCompressInstEmitter(raw_ostream &o,
case OpData::Reg: {
Record *Reg = SourceOperandMap[OpNo].Data.Reg;
CondStream.indent(6)
- << "(MI.getOperand(" << OpNo << ").getReg() == " << Namespace
+ << "(MI.getOperand(" << OpNo << ").getReg() == " << TargetName
<< "::" << Reg->getName() << ") &&\n";
break;
}
@@ -728,7 +744,7 @@ void RISCVCompressInstEmitter::emitCompressInstEmitter(raw_ostream &o,
}
CodeStream.indent(6) << "// " << Dest.AsmString << "\n";
if (CompressOrUncompress)
- CodeStream.indent(6) << "OutInst.setOpcode(" << Namespace
+ CodeStream.indent(6) << "OutInst.setOpcode(" << TargetName
<< "::" << Dest.TheDef->getName() << ");\n";
OpNo = 0;
for (const auto &DestOperand : Dest.Operands) {
@@ -744,7 +760,7 @@ void RISCVCompressInstEmitter::emitCompressInstEmitter(raw_ostream &o,
// Don't check register class if this is a tied operand, it was done
// for the operand its tied to.
if (DestOperand.getTiedRegister() == -1)
- CondStream.indent(6) << "(MRI.getRegClass(" << Namespace
+ CondStream.indent(6) << "(MRI.getRegClass(" << TargetName
<< "::" << DestOperand.Rec->getName()
<< "RegClassID).contains(MI.getOperand("
<< OpIdx << ").getReg())) &&\n";
@@ -759,7 +775,7 @@ void RISCVCompressInstEmitter::emitCompressInstEmitter(raw_ostream &o,
getPredicates(MCOpPredicateMap, MCOpPredicates, DestOperand.Rec,
"MCOperandPredicate");
CondStream.indent(6)
- << Namespace << "ValidateMCOperand("
+ << TargetName << "ValidateMCOperand("
<< "MI.getOperand(" << OpIdx << "), STI, " << Entry << ") &&\n";
} else {
unsigned Entry =
@@ -767,7 +783,7 @@ void RISCVCompressInstEmitter::emitCompressInstEmitter(raw_ostream &o,
DestOperand.Rec, "ImmediateCode");
CondStream.indent(6)
<< "MI.getOperand(" << OpIdx << ").isImm() &&\n";
- CondStream.indent(6) << Namespace << "ValidateMachineOperand("
+ CondStream.indent(6) << TargetName << "ValidateMachineOperand("
<< "MI.getOperand(" << OpIdx
<< "), Subtarget, " << Entry << ") &&\n";
}
@@ -782,14 +798,14 @@ void RISCVCompressInstEmitter::emitCompressInstEmitter(raw_ostream &o,
unsigned Entry = getPredicates(MCOpPredicateMap, MCOpPredicates,
DestOperand.Rec, "MCOperandPredicate");
CondStream.indent(6)
- << Namespace << "ValidateMCOperand("
+ << TargetName << "ValidateMCOperand("
<< "MCOperand::createImm(" << DestOperandMap[OpNo].Data.Imm
<< "), STI, " << Entry << ") &&\n";
} else {
unsigned Entry = getPredicates(ImmLeafPredicateMap, ImmLeafPredicates,
DestOperand.Rec, "ImmediateCode");
CondStream.indent(6)
- << Namespace
+ << TargetName
<< "ValidateMachineOperand(MachineOperand::CreateImm("
<< DestOperandMap[OpNo].Data.Imm << "), SubTarget, " << Entry
<< ") &&\n";
@@ -803,7 +819,7 @@ void RISCVCompressInstEmitter::emitCompressInstEmitter(raw_ostream &o,
// Fixed register has been validated at pattern validation time.
Record *Reg = DestOperandMap[OpNo].Data.Reg;
CodeStream.indent(6)
- << "OutInst.addOperand(MCOperand::createReg(" << Namespace
+ << "OutInst.addOperand(MCOperand::createReg(" << TargetName
<< "::" << Reg->getName() << "));\n";
}
} break;
@@ -822,7 +838,7 @@ void RISCVCompressInstEmitter::emitCompressInstEmitter(raw_ostream &o,
Func.indent(2) << "return false;\n}\n";
if (!MCOpPredicates.empty()) {
- o << "static bool " << Namespace
+ o << "static bool " << TargetName
<< "ValidateMCOperand(const MCOperand &MCOp,\n"
<< " const MCSubtargetInfo &STI,\n"
<< " unsigned PredicateIndex) {\n"
@@ -838,9 +854,9 @@ void RISCVCompressInstEmitter::emitCompressInstEmitter(raw_ostream &o,
}
if (!ImmLeafPredicates.empty()) {
- o << "static bool " << Namespace
+ o << "static bool " << TargetName
<< "ValidateMachineOperand(const MachineOperand &MO,\n"
- << " const RISCVSubtarget *Subtarget,\n"
+ << " const " << TargetName << "Subtarget *Subtarget,\n"
<< " unsigned PredicateIndex) {\n"
<< " int64_t Imm = MO.getImm();\n"
<< " switch (PredicateIndex) {\n"
@@ -867,7 +883,7 @@ void RISCVCompressInstEmitter::emitCompressInstEmitter(raw_ostream &o,
o << "\n#endif //GEN_CHECK_COMPRESS_INSTR\n\n";
}
-void RISCVCompressInstEmitter::run(raw_ostream &o) {
+void CompressInstEmitter::run(raw_ostream &o) {
std::vector<Record *> Insts = Records.getAllDerivedDefinitions("CompressPat");
// Process the CompressPat definitions, validating them as we do so.
@@ -887,7 +903,7 @@ void RISCVCompressInstEmitter::run(raw_ostream &o) {
namespace llvm {
void EmitCompressInst(RecordKeeper &RK, raw_ostream &OS) {
- RISCVCompressInstEmitter(RK).run(OS);
+ CompressInstEmitter(RK).run(OS);
}
} // namespace llvm
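The header comment earlier in this file documents the entry points the renamed emitter still generates (compressInst, uncompressInst, isCompressibleInst) and the GEN_CHECK_COMPRESS_INSTR guard. A minimal sketch of how a target might consume the check-compress variant; the "Foo" target, the .inc file name, the helper, and the 4-byte/2-byte sizes are assumptions for illustration, while the isCompressibleInst signature and the guard macro come from the generated header described above.

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/MC/MCRegisterInfo.h"
    #include "llvm/MC/MCSubtargetInfo.h"
    using namespace llvm;

    class FooSubtarget; // hypothetical target subtarget, defined in the target's own headers

    // Pull in the generated isCompressibleInst() for the hypothetical Foo target.
    #define GEN_CHECK_COMPRESS_INSTR
    #include "FooGenCompressInstEmitter.inc"

    // Guess the encoded size of MI: 2 bytes if some CompressPat matches
    // (RISC-V style 4-to-2 byte compression assumed), else 4 bytes.
    static unsigned guessEncodedSize(const MachineInstr &MI,
                                     const FooSubtarget *Subtarget,
                                     const MCRegisterInfo &MRI,
                                     const MCSubtargetInfo &STI) {
      return isCompressibleInst(MI, Subtarget, MRI, STI) ? 2 : 4;
    }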
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
index 693073672fc1..d08186b7094b 100644
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -1212,11 +1212,13 @@ PredicateListMatcher<OperandPredicateMatcher>::getNoPredicateComment() const {
/// one as another.
class SameOperandMatcher : public OperandPredicateMatcher {
std::string MatchingName;
+ unsigned OrigOpIdx;
public:
- SameOperandMatcher(unsigned InsnVarID, unsigned OpIdx, StringRef MatchingName)
+ SameOperandMatcher(unsigned InsnVarID, unsigned OpIdx, StringRef MatchingName,
+ unsigned OrigOpIdx)
: OperandPredicateMatcher(OPM_SameOperand, InsnVarID, OpIdx),
- MatchingName(MatchingName) {}
+ MatchingName(MatchingName), OrigOpIdx(OrigOpIdx) {}
static bool classof(const PredicateMatcher *P) {
return P->getKind() == OPM_SameOperand;
@@ -1227,6 +1229,7 @@ public:
bool isIdentical(const PredicateMatcher &B) const override {
return OperandPredicateMatcher::isIdentical(B) &&
+ OrigOpIdx == cast<SameOperandMatcher>(&B)->OrigOpIdx &&
MatchingName == cast<SameOperandMatcher>(&B)->MatchingName;
}
};
@@ -3291,7 +3294,8 @@ void RuleMatcher::defineOperand(StringRef SymbolicName, OperandMatcher &OM) {
// If the operand is already defined, then we must ensure both references in
// the matcher have the exact same node.
- OM.addPredicate<SameOperandMatcher>(OM.getSymbolicName());
+ OM.addPredicate<SameOperandMatcher>(
+ OM.getSymbolicName(), getOperandMatcher(OM.getSymbolicName()).getOpIdx());
}
void RuleMatcher::definePhysRegOperand(Record *Reg, OperandMatcher &OM) {
diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp
index 3d1d258e342e..437b5f002027 100644
--- a/llvm/utils/TableGen/IntrinsicEmitter.cpp
+++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp
@@ -249,7 +249,8 @@ enum IIT_Info {
IIT_BF16 = 48,
IIT_STRUCT9 = 49,
IIT_V256 = 50,
- IIT_AMX = 51
+ IIT_AMX = 51,
+ IIT_PPCF128 = 52
};
static void EncodeFixedValueType(MVT::SimpleValueType VT,
@@ -274,6 +275,7 @@ static void EncodeFixedValueType(MVT::SimpleValueType VT,
case MVT::f32: return Sig.push_back(IIT_F32);
case MVT::f64: return Sig.push_back(IIT_F64);
case MVT::f128: return Sig.push_back(IIT_F128);
+ case MVT::ppcf128: return Sig.push_back(IIT_PPCF128);
case MVT::token: return Sig.push_back(IIT_TOKEN);
case MVT::Metadata: return Sig.push_back(IIT_METADATA);
case MVT::x86mmx: return Sig.push_back(IIT_MMX);
diff --git a/llvm/utils/TableGen/PredicateExpander.cpp b/llvm/utils/TableGen/PredicateExpander.cpp
index a76640f6d11f..a7256499d566 100644
--- a/llvm/utils/TableGen/PredicateExpander.cpp
+++ b/llvm/utils/TableGen/PredicateExpander.cpp
@@ -470,7 +470,7 @@ void STIPredicateExpander::expandOpcodeGroup(raw_ostream &OS, const OpcodeGroup
increaseIndentLevel();
OS.indent(getIndentLevel() * 2);
if (ShouldUpdateOpcodeMask) {
- if (PI.OperandMask.isNullValue())
+ if (PI.OperandMask.isZero())
OS << "Mask.clearAllBits();\n";
else
OS << "Mask = " << PI.OperandMask << ";\n";
diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
index 037fad207ac7..1ed7bc103f9c 100644
--- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
@@ -1411,6 +1411,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
<< SuperRegIdxSeqs.get(SuperRegIdxLists[RC.EnumValue]) << ",\n ";
printMask(OS, RC.LaneMask);
OS << ",\n " << (unsigned)RC.AllocationPriority << ",\n "
+ << format("0x%02x", RC.TSFlags) << ", /* TSFlags */\n "
<< (RC.HasDisjunctSubRegs?"true":"false")
<< ", /* HasDisjunctSubRegs */\n "
<< (RC.CoveredBySubRegs?"true":"false")
diff --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp
index 89069ec3e4ff..d1a9ecb06a2b 100644
--- a/llvm/utils/TableGen/X86DisassemblerTables.cpp
+++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp
@@ -994,6 +994,8 @@ void DisassemblerTables::emitContextDecisions(raw_ostream &o1, raw_ostream &o2,
emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[5], XOP9_MAP_STR);
emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[6], XOPA_MAP_STR);
emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[7], THREEDNOW_MAP_STR);
+ emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[8], MAP5_STR);
+ emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[9], MAP6_STR);
}
void DisassemblerTables::emit(raw_ostream &o) const {
diff --git a/llvm/utils/TableGen/X86DisassemblerTables.h b/llvm/utils/TableGen/X86DisassemblerTables.h
index 63af68b6fbfa..2e4ff1e2ce08 100644
--- a/llvm/utils/TableGen/X86DisassemblerTables.h
+++ b/llvm/utils/TableGen/X86DisassemblerTables.h
@@ -41,7 +41,9 @@ private:
/// [5] XOP9 map opcode
/// [6] XOPA map opcode
/// [7] 3dnow map opcode
- std::unique_ptr<ContextDecision> Tables[8];
+ /// [8] fixed length MAP5 opcode
+ /// [9] fixed length MAP6 opcode
+ std::unique_ptr<ContextDecision> Tables[10];
// Table of ModRM encodings.
typedef std::map<std::vector<unsigned>, unsigned> ModRMMapTy;
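Growing Tables from 8 to 10 slots has to stay in lock-step with the emitContextDecision calls above and with the opcodeType cases added in X86RecognizableInstr.cpp below. A small sketch of that invariant written down explicitly; the named constant is illustrative, the header simply hard-codes 10:

    #include <memory>

    struct ContextDecision {};  // placeholder for the real table type

    constexpr unsigned NumOpcodeMapTables = 10; // 8 existing maps + MAP5 + MAP6
    std::unique_ptr<ContextDecision> Tables[NumOpcodeMapTables];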
diff --git a/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp b/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp
index 009dc036cf97..36c71843d70e 100644
--- a/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp
+++ b/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp
@@ -65,7 +65,7 @@ void X86EVEX2VEXTablesEmitter::printTable(const std::vector<Entry> &Table,
<< " // EVEX scalar with corresponding VEX.\n";
// Print all entries added to the table
- for (auto Pair : Table) {
+ for (const auto &Pair : Table) {
OS << " { X86::" << Pair.first->TheDef->getName()
<< ", X86::" << Pair.second->TheDef->getName() << " },\n";
}
@@ -80,7 +80,7 @@ void X86EVEX2VEXTablesEmitter::printCheckPredicate(
<< " unsigned Opc = MI.getOpcode();\n"
<< " switch (Opc) {\n"
<< " default: return true;\n";
- for (auto Pair : Predicates)
+ for (const auto &Pair : Predicates)
OS << " case X86::" << Pair.first << ": return " << Pair.second << ";\n";
OS << " }\n"
<< "}\n\n";
diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
index 85d926215113..0a8d0750cf13 100644
--- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
+++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
@@ -79,13 +79,13 @@ const ManualMapEntry ManualMapSet[] = {
static bool isExplicitAlign(const CodeGenInstruction *Inst) {
return any_of(ExplicitAlign, [Inst](const char *InstStr) {
- return Inst->TheDef->getName().find(InstStr) != StringRef::npos;
+ return Inst->TheDef->getName().contains(InstStr);
});
}
static bool isExplicitUnalign(const CodeGenInstruction *Inst) {
return any_of(ExplicitUnalign, [Inst](const char *InstStr) {
- return Inst->TheDef->getName().find(InstStr) != StringRef::npos;
+ return Inst->TheDef->getName().contains(InstStr);
});
}
@@ -278,7 +278,7 @@ static inline bool hasMemoryFormat(const Record *Inst) {
}
static inline bool isNOREXRegClass(const Record *Op) {
- return Op->getName().find("_NOREX") != StringRef::npos;
+ return Op->getName().contains("_NOREX");
}
static inline bool isRegisterOperand(const Record *Rec) {
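The X86FoldTablesEmitter hunks switch from the find() != npos idiom to StringRef::contains(), which performs the same substring test but reads as the predicate it is. Side by side:

    #include "llvm/ADT/StringRef.h"

    bool isNoREXClass(llvm::StringRef Name) {
      return Name.contains("_NOREX");                         // new spelling
      // return Name.find("_NOREX") != llvm::StringRef::npos; // old spelling
    }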
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp
index c2ca3791ac36..a9b384155965 100644
--- a/llvm/utils/TableGen/X86RecognizableInstr.cpp
+++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp
@@ -109,12 +109,12 @@ RecognizableInstr::RecognizableInstr(DisassemblerTables &tables,
// FIXME: Is there some better way to check for In64BitMode?
std::vector<Record*> Predicates = Rec->getValueAsListOfDefs("Predicates");
for (unsigned i = 0, e = Predicates.size(); i != e; ++i) {
- if (Predicates[i]->getName().find("Not64Bit") != Name.npos ||
- Predicates[i]->getName().find("In32Bit") != Name.npos) {
+ if (Predicates[i]->getName().contains("Not64Bit") ||
+ Predicates[i]->getName().contains("In32Bit")) {
Is32Bit = true;
break;
}
- if (Predicates[i]->getName().find("In64Bit") != Name.npos) {
+ if (Predicates[i]->getName().contains("In64Bit")) {
Is64Bit = true;
break;
}
@@ -752,6 +752,8 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
case X86Local::XOP9: opcodeType = XOP9_MAP; break;
case X86Local::XOPA: opcodeType = XOPA_MAP; break;
case X86Local::ThreeDNow: opcodeType = THREEDNOW_MAP; break;
+ case X86Local::T_MAP5: opcodeType = MAP5; break;
+ case X86Local::T_MAP6: opcodeType = MAP6; break;
}
std::unique_ptr<ModRMFilter> filter;
@@ -901,10 +903,13 @@ OperandType RecognizableInstr::typeFromString(const std::string &s,
TYPE("FR64X", TYPE_XMM)
TYPE("f64mem", TYPE_M)
TYPE("sdmem", TYPE_M)
+ TYPE("FR16X", TYPE_XMM)
TYPE("FR32", TYPE_XMM)
TYPE("FR32X", TYPE_XMM)
TYPE("f32mem", TYPE_M)
+ TYPE("f16mem", TYPE_M)
TYPE("ssmem", TYPE_M)
+ TYPE("shmem", TYPE_M)
TYPE("RST", TYPE_ST)
TYPE("RSTi", TYPE_ST)
TYPE("i128mem", TYPE_M)
@@ -1019,6 +1024,7 @@ RecognizableInstr::immediateEncodingFromString(const std::string &s,
ENCODING("FR128", ENCODING_IB)
ENCODING("VR128", ENCODING_IB)
ENCODING("VR256", ENCODING_IB)
+ ENCODING("FR16X", ENCODING_IB)
ENCODING("FR32X", ENCODING_IB)
ENCODING("FR64X", ENCODING_IB)
ENCODING("VR128X", ENCODING_IB)
@@ -1047,6 +1053,7 @@ RecognizableInstr::rmRegisterEncodingFromString(const std::string &s,
ENCODING("FR32", ENCODING_RM)
ENCODING("FR64X", ENCODING_RM)
ENCODING("FR32X", ENCODING_RM)
+ ENCODING("FR16X", ENCODING_RM)
ENCODING("VR64", ENCODING_RM)
ENCODING("VR256", ENCODING_RM)
ENCODING("VR256X", ENCODING_RM)
@@ -1058,11 +1065,6 @@ RecognizableInstr::rmRegisterEncodingFromString(const std::string &s,
ENCODING("VK16", ENCODING_RM)
ENCODING("VK32", ENCODING_RM)
ENCODING("VK64", ENCODING_RM)
- ENCODING("VK1PAIR", ENCODING_RM)
- ENCODING("VK2PAIR", ENCODING_RM)
- ENCODING("VK4PAIR", ENCODING_RM)
- ENCODING("VK8PAIR", ENCODING_RM)
- ENCODING("VK16PAIR", ENCODING_RM)
ENCODING("BNDR", ENCODING_RM)
ENCODING("TILE", ENCODING_RM)
errs() << "Unhandled R/M register encoding " << s << "\n";
@@ -1091,6 +1093,7 @@ RecognizableInstr::roRegisterEncodingFromString(const std::string &s,
ENCODING("VR128X", ENCODING_REG)
ENCODING("FR64X", ENCODING_REG)
ENCODING("FR32X", ENCODING_REG)
+ ENCODING("FR16X", ENCODING_REG)
ENCODING("VR512", ENCODING_REG)
ENCODING("VK1", ENCODING_REG)
ENCODING("VK2", ENCODING_REG)
@@ -1127,6 +1130,7 @@ RecognizableInstr::vvvvRegisterEncodingFromString(const std::string &s,
ENCODING("FR64", ENCODING_VVVV)
ENCODING("VR128", ENCODING_VVVV)
ENCODING("VR256", ENCODING_VVVV)
+ ENCODING("FR16X", ENCODING_VVVV)
ENCODING("FR32X", ENCODING_VVVV)
ENCODING("FR64X", ENCODING_VVVV)
ENCODING("VR128X", ENCODING_VVVV)
@@ -1139,11 +1143,6 @@ RecognizableInstr::vvvvRegisterEncodingFromString(const std::string &s,
ENCODING("VK16", ENCODING_VVVV)
ENCODING("VK32", ENCODING_VVVV)
ENCODING("VK64", ENCODING_VVVV)
- ENCODING("VK1PAIR", ENCODING_VVVV)
- ENCODING("VK2PAIR", ENCODING_VVVV)
- ENCODING("VK4PAIR", ENCODING_VVVV)
- ENCODING("VK8PAIR", ENCODING_VVVV)
- ENCODING("VK16PAIR", ENCODING_VVVV)
ENCODING("TILE", ENCODING_VVVV)
errs() << "Unhandled VEX.vvvv register encoding " << s << "\n";
llvm_unreachable("Unhandled VEX.vvvv register encoding");
@@ -1170,6 +1169,7 @@ RecognizableInstr::memoryEncodingFromString(const std::string &s,
ENCODING("i32mem", ENCODING_RM)
ENCODING("i64mem", ENCODING_RM)
ENCODING("i8mem", ENCODING_RM)
+ ENCODING("shmem", ENCODING_RM)
ENCODING("ssmem", ENCODING_RM)
ENCODING("sdmem", ENCODING_RM)
ENCODING("f128mem", ENCODING_RM)
@@ -1177,6 +1177,7 @@ RecognizableInstr::memoryEncodingFromString(const std::string &s,
ENCODING("f512mem", ENCODING_RM)
ENCODING("f64mem", ENCODING_RM)
ENCODING("f32mem", ENCODING_RM)
+ ENCODING("f16mem", ENCODING_RM)
ENCODING("i128mem", ENCODING_RM)
ENCODING("i256mem", ENCODING_RM)
ENCODING("i512mem", ENCODING_RM)
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.h b/llvm/utils/TableGen/X86RecognizableInstr.h
index a7b88b4d12ed..d4fad2cc3f0f 100644
--- a/llvm/utils/TableGen/X86RecognizableInstr.h
+++ b/llvm/utils/TableGen/X86RecognizableInstr.h
@@ -130,7 +130,8 @@ namespace X86Local {
};
enum {
- OB = 0, TB = 1, T8 = 2, TA = 3, XOP8 = 4, XOP9 = 5, XOPA = 6, ThreeDNow = 7
+ OB = 0, TB = 1, T8 = 2, TA = 3, XOP8 = 4, XOP9 = 5, XOPA = 6, ThreeDNow = 7,
+ T_MAP5 = 8, T_MAP6 = 9
};
enum {